diff --git a/dateutil/parser/__init__.py b/dateutil/parser/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d174b0e4dcc472999b75e55ebb88af320ae38081
--- /dev/null
+++ b/dateutil/parser/__init__.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+from ._parser import parse, parser, parserinfo, ParserError
+from ._parser import DEFAULTPARSER, DEFAULTTZPARSER
+from ._parser import UnknownTimezoneWarning
+
+from ._parser import __doc__
+
+from .isoparser import isoparser, isoparse
+
+__all__ = ['parse', 'parser', 'parserinfo',
+           'isoparse', 'isoparser',
+           'ParserError',
+           'UnknownTimezoneWarning']
+
+
+###
+# Deprecate portions of the private interface so that downstream code that
+# is improperly relying on it is given *some* notice.
+
+
+def __deprecated_private_func(f):
+    from functools import wraps
+    import warnings
+
+    msg = ('{name} is a private function and may break without warning, '
+           'it will be moved and or renamed in future versions.')
+    msg = msg.format(name=f.__name__)
+
+    @wraps(f)
+    def deprecated_func(*args, **kwargs):
+        warnings.warn(msg, DeprecationWarning)
+        return f(*args, **kwargs)
+
+    return deprecated_func
+
+def __deprecate_private_class(c):
+    import warnings
+
+    msg = ('{name} is a private class and may break without warning, '
+           'it will be moved and or renamed in future versions.')
+    msg = msg.format(name=c.__name__)
+
+    class private_class(c):
+        __doc__ = c.__doc__
+
+        def __init__(self, *args, **kwargs):
+            warnings.warn(msg, DeprecationWarning)
+            super(private_class, self).__init__(*args, **kwargs)
+
+    private_class.__name__ = c.__name__
+
+    return private_class
+
+
+from ._parser import _timelex, _resultbase
+from ._parser import _tzparser, _parsetz
+
+_timelex = __deprecate_private_class(_timelex)
+_tzparser = __deprecate_private_class(_tzparser)
+_resultbase = __deprecate_private_class(_resultbase)
+_parsetz = __deprecated_private_func(_parsetz)
diff --git a/dateutil/parser/_parser.py b/dateutil/parser/_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..37d1663b2f72447800d9a553929e3de932244289
--- /dev/null
+++ b/dateutil/parser/_parser.py
@@ -0,0 +1,1613 @@
+# -*- coding: utf-8 -*-
+"""
+This module offers a generic date/time string parser which is able to parse
+most known formats to represent a date and/or time.
+
+This module attempts to be forgiving with regards to unlikely input formats,
+returning a datetime object even for dates which are ambiguous. If an element
+of a date/time stamp is omitted, the following rules are applied:
+
+- If AM or PM is left unspecified, a 24-hour clock is assumed, however, an hour
+  on a 12-hour clock (``0 <= hour <= 12``) *must* be specified if AM or PM is
+  specified.
+- If a time zone is omitted, a timezone-naive datetime is returned.
+
+If any other elements are missing, they are taken from the
+:class:`datetime.datetime` object passed to the parameter ``default``. If this
+results in a day number exceeding the valid number of days per month, the
+value falls back to the end of the month.
+
+Additional resources about date/time string formats can be found below:
+
+- `A summary of the international standard date and time notation
+  <https://www.cl.cam.ac.uk/~mgk25/iso-time.html>`_
+- `W3C Date and Time Formats <https://www.w3.org/TR/NOTE-datetime>`_
+- `Time Formats (Planetary Rings Node) <https://pds-rings.seti.org:443/tools/time_formats.html>`_
+- `CPAN ParseDate module
+  <https://metacpan.org/pod/release/MUIR/Time-modules-2013.0912/lib/Time/ParseDate.pm>`_
+- `Java SimpleDateFormat Class
+  <https://docs.oracle.com/javase/6/docs/api/java/text/SimpleDateFormat.html>`_
+"""
+from __future__ import unicode_literals
+
+import datetime
+import re
+import string
+import time
+import warnings
+
+from calendar import monthrange
+from io import StringIO
+
+import six
+from six import integer_types, text_type
+
+from decimal import Decimal
+
+from warnings import warn
+
+from .. import relativedelta
+from .. import tz
+
+__all__ = ["parse", "parserinfo", "ParserError"]
+
+
+# TODO: pandas.core.tools.datetimes imports this explicitly.  Might be worth
+# making public and/or figuring out if there is something we can
+# take off their plate.
+class _timelex(object):
+    # Fractional seconds are sometimes split by a comma
+    _split_decimal = re.compile("([.,])")
+
+    def __init__(self, instream):
+        if isinstance(instream, (bytes, bytearray)):
+            instream = instream.decode()
+
+        if isinstance(instream, text_type):
+            instream = StringIO(instream)
+        elif getattr(instream, 'read', None) is None:
+            raise TypeError('Parser must be a string or character stream, not '
+                            '{itype}'.format(itype=instream.__class__.__name__))
+
+        self.instream = instream
+        self.charstack = []
+        self.tokenstack = []
+        self.eof = False
+
+    def get_token(self):
+        """
+        This function breaks the time string into lexical units (tokens), which
+        can be parsed by the parser. Lexical units are demarcated by changes in
+        the character set, so any continuous string of letters is considered
+        one unit, any continuous string of numbers is considered one unit.
+
+        The main complication arises from the fact that dots ('.') can be used
+        both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
+        "4:30:21.447"). As such, it is necessary to read the full context of
+        any dot-separated strings before breaking it into tokens; as such, this
+        function maintains a "token stack", for when the ambiguous context
+        demands that multiple tokens be parsed at once.
+        """
+        if self.tokenstack:
+            return self.tokenstack.pop(0)
+
+        seenletters = False
+        token = None
+        state = None
+
+        while not self.eof:
+            # We only realize that we've reached the end of a token when we
+            # find a character that's not part of the current token - since
+            # that character may be part of the next token, it's stored in the
+            # charstack.
+            if self.charstack:
+                nextchar = self.charstack.pop(0)
+            else:
+                nextchar = self.instream.read(1)
+                while nextchar == '\x00':
+                    nextchar = self.instream.read(1)
+
+            if not nextchar:
+                self.eof = True
+                break
+            elif not state:
+                # First character of the token - determines if we're starting
+                # to parse a word, a number or something else.
+                token = nextchar
+                if self.isword(nextchar):
+                    state = 'a'
+                elif self.isnum(nextchar):
+                    state = '0'
+                elif self.isspace(nextchar):
+                    token = ' '
+                    break  # emit token
+                else:
+                    break  # emit token
+            elif state == 'a':
+                # If we've already started reading a word, we keep reading
+                # letters until we find something that's not part of a word.
+                seenletters = True
+                if self.isword(nextchar):
+                    token += nextchar
+                elif nextchar == '.':
+                    token += nextchar
+                    state = 'a.'
+                else:
+                    self.charstack.append(nextchar)
+                    break  # emit token
+            elif state == '0':
+                # If we've already started reading a number, we keep reading
+                # numbers until we find something that doesn't fit.
+                if self.isnum(nextchar):
+                    token += nextchar
+                elif nextchar == '.' or (nextchar == ',' and len(token) >= 2):
+                    token += nextchar
+                    state = '0.'
+                else:
+                    self.charstack.append(nextchar)
+                    break  # emit token
+            elif state == 'a.':
+                # If we've seen some letters and a dot separator, continue
+                # parsing, and the tokens will be broken up later.
+                seenletters = True
+                if nextchar == '.' or self.isword(nextchar):
+                    token += nextchar
+                elif self.isnum(nextchar) and token[-1] == '.':
+                    token += nextchar
+                    state = '0.'
+                else:
+                    self.charstack.append(nextchar)
+                    break  # emit token
+            elif state == '0.':
+                # If we've seen at least one dot separator, keep going, we'll
+                # break up the tokens later.
+                if nextchar == '.' or self.isnum(nextchar):
+                    token += nextchar
+                elif self.isword(nextchar) and token[-1] == '.':
+                    token += nextchar
+                    state = 'a.'
+                else:
+                    self.charstack.append(nextchar)
+                    break  # emit token
+
+        if (state in ('a.', '0.') and (seenletters or token.count('.') > 1 or
+                                       token[-1] in '.,')):
+            l = self._split_decimal.split(token)
+            token = l[0]
+            for tok in l[1:]:
+                if tok:
+                    self.tokenstack.append(tok)
+
+        if state == '0.' and token.count('.') == 0:
+            token = token.replace(',', '.')
+
+        return token
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        token = self.get_token()
+        if token is None:
+            raise StopIteration
+
+        return token
+
+    def next(self):
+        return self.__next__()  # Python 2.x support
+
+    @classmethod
+    def split(cls, s):
+        return list(cls(s))
+
+    @classmethod
+    def isword(cls, nextchar):
+        """ Whether or not the next character is part of a word """
+        return nextchar.isalpha()
+
+    @classmethod
+    def isnum(cls, nextchar):
+        """ Whether the next character is part of a number """
+        return nextchar.isdigit()
+
+    @classmethod
+    def isspace(cls, nextchar):
+        """ Whether the next character is whitespace """
+        return nextchar.isspace()
+
+
+class _resultbase(object):
+
+    def __init__(self):
+        for attr in self.__slots__:
+            setattr(self, attr, None)
+
+    def _repr(self, classname):
+        l = []
+        for attr in self.__slots__:
+            value = getattr(self, attr)
+            if value is not None:
+                l.append("%s=%s" % (attr, repr(value)))
+        return "%s(%s)" % (classname, ", ".join(l))
+
+    def __len__(self):
+        return (sum(getattr(self, attr) is not None
+                    for attr in self.__slots__))
+
+    def __repr__(self):
+        return self._repr(self.__class__.__name__)
+
+
+class parserinfo(object):
+    """
+    Class which handles what inputs are accepted. Subclass this to customize
+    the language and acceptable values for each parameter.
+
+    :param dayfirst:
+        Whether to interpret the first value in an ambiguous 3-integer date
+        (e.g. 01/05/09) as the day (``True``) or month (``False``). If
+        ``yearfirst`` is set to ``True``, this distinguishes between YDM
+        and YMD. Default is ``False``.
+
+    :param yearfirst:
+        Whether to interpret the first value in an ambiguous 3-integer date
+        (e.g. 01/05/09) as the year. If ``True``, the first number is taken
+        to be the year, otherwise the last number is taken to be the year.
+        Default is ``False``.
+    """
+
+    # m from a.m/p.m, t from ISO T separator
+    JUMP = [" ", ".", ",", ";", "-", "/", "'",
+            "at", "on", "and", "ad", "m", "t", "of",
+            "st", "nd", "rd", "th"]
+
+    WEEKDAYS = [("Mon", "Monday"),
+                ("Tue", "Tuesday"),     # TODO: "Tues"
+                ("Wed", "Wednesday"),
+                ("Thu", "Thursday"),    # TODO: "Thurs"
+                ("Fri", "Friday"),
+                ("Sat", "Saturday"),
+                ("Sun", "Sunday")]
+    MONTHS = [("Jan", "January"),
+              ("Feb", "February"),      # TODO: "Febr"
+              ("Mar", "March"),
+              ("Apr", "April"),
+              ("May", "May"),
+              ("Jun", "June"),
+              ("Jul", "July"),
+              ("Aug", "August"),
+              ("Sep", "Sept", "September"),
+              ("Oct", "October"),
+              ("Nov", "November"),
+              ("Dec", "December")]
+    HMS = [("h", "hour", "hours"),
+           ("m", "minute", "minutes"),
+           ("s", "second", "seconds")]
+    AMPM = [("am", "a"),
+            ("pm", "p")]
+    UTCZONE = ["UTC", "GMT", "Z", "z"]
+    PERTAIN = ["of"]
+    TZOFFSET = {}
+    # TODO: ERA = ["AD", "BC", "CE", "BCE", "Stardate",
+    #              "Anno Domini", "Year of Our Lord"]
+
+    def __init__(self, dayfirst=False, yearfirst=False):
+        self._jump = self._convert(self.JUMP)
+        self._weekdays = self._convert(self.WEEKDAYS)
+        self._months = self._convert(self.MONTHS)
+        self._hms = self._convert(self.HMS)
+        self._ampm = self._convert(self.AMPM)
+        self._utczone = self._convert(self.UTCZONE)
+        self._pertain = self._convert(self.PERTAIN)
+
+        self.dayfirst = dayfirst
+        self.yearfirst = yearfirst
+
+        self._year = time.localtime().tm_year
+        self._century = self._year // 100 * 100
+
+    def _convert(self, lst):
+        dct = {}
+        for i, v in enumerate(lst):
+            if isinstance(v, tuple):
+                for v in v:
+                    dct[v.lower()] = i
+            else:
+                dct[v.lower()] = i
+        return dct
+
+    def jump(self, name):
+        return name.lower() in self._jump
+
+    def weekday(self, name):
+        try:
+            return self._weekdays[name.lower()]
+        except KeyError:
+            pass
+        return None
+
+    def month(self, name):
+        try:
+            return self._months[name.lower()] + 1
+        except KeyError:
+            pass
+        return None
+
+    def hms(self, name):
+        try:
+            return self._hms[name.lower()]
+        except KeyError:
+            return None
+
+    def ampm(self, name):
+        try:
+            return self._ampm[name.lower()]
+        except KeyError:
+            return None
+
+    def pertain(self, name):
+        return name.lower() in self._pertain
+
+    def utczone(self, name):
+        return name.lower() in self._utczone
+
+    def tzoffset(self, name):
+        if name in self._utczone:
+            return 0
+
+        return self.TZOFFSET.get(name)
+
+    def convertyear(self, year, century_specified=False):
+        """
+        Converts two-digit years to year within [-50, 49]
+        range of self._year (current local time)
+        """
+
+        # Function contract is that the year is always positive
+        assert year >= 0
+
+        if year < 100 and not century_specified:
+            # assume current century to start
+            year += self._century
+
+            if year >= self._year + 50:  # if too far in future
+                year -= 100
+            elif year < self._year - 50:  # if too far in past
+                year += 100
+
+        return year
+
+    def validate(self, res):
+        # move to info
+        if res.year is not None:
+            res.year = self.convertyear(res.year, res.century_specified)
+
+        if ((res.tzoffset == 0 and not res.tzname) or
+             (res.tzname == 'Z' or res.tzname == 'z')):
+            res.tzname = "UTC"
+            res.tzoffset = 0
+        elif res.tzoffset != 0 and res.tzname and self.utczone(res.tzname):
+            res.tzoffset = 0
+        return True
+
+
+class _ymd(list):
+    def __init__(self, *args, **kwargs):
+        super(self.__class__, self).__init__(*args, **kwargs)
+        self.century_specified = False
+        self.dstridx = None
+        self.mstridx = None
+        self.ystridx = None
+
+    @property
+    def has_year(self):
+        return self.ystridx is not None
+
+    @property
+    def has_month(self):
+        return self.mstridx is not None
+
+    @property
+    def has_day(self):
+        return self.dstridx is not None
+
+    def could_be_day(self, value):
+        if self.has_day:
+            return False
+        elif not self.has_month:
+            return 1 <= value <= 31
+        elif not self.has_year:
+            # Be permissive, assume leap year
+            month = self[self.mstridx]
+            return 1 <= value <= monthrange(2000, month)[1]
+        else:
+            month = self[self.mstridx]
+            year = self[self.ystridx]
+            return 1 <= value <= monthrange(year, month)[1]
+
+    def append(self, val, label=None):
+        if hasattr(val, '__len__'):
+            if val.isdigit() and len(val) > 2:
+                self.century_specified = True
+                if label not in [None, 'Y']:  # pragma: no cover
+                    raise ValueError(label)
+                label = 'Y'
+        elif val > 100:
+            self.century_specified = True
+            if label not in [None, 'Y']:  # pragma: no cover
+                raise ValueError(label)
+            label = 'Y'
+
+        super(self.__class__, self).append(int(val))
+
+        if label == 'M':
+            if self.has_month:
+                raise ValueError('Month is already set')
+            self.mstridx = len(self) - 1
+        elif label == 'D':
+            if self.has_day:
+                raise ValueError('Day is already set')
+            self.dstridx = len(self) - 1
+        elif label == 'Y':
+            if self.has_year:
+                raise ValueError('Year is already set')
+            self.ystridx = len(self) - 1
+
+    def _resolve_from_stridxs(self, strids):
+        """
+        Try to resolve the identities of year/month/day elements using
+        ystridx, mstridx, and dstridx, if enough of these are specified.
+        """
+        if len(self) == 3 and len(strids) == 2:
+            # we can back out the remaining stridx value
+            missing = [x for x in range(3) if x not in strids.values()]
+            key = [x for x in ['y', 'm', 'd'] if x not in strids]
+            assert len(missing) == len(key) == 1
+            key = key[0]
+            val = missing[0]
+            strids[key] = val
+
+        assert len(self) == len(strids)  # otherwise this should not be called
+        out = {key: self[strids[key]] for key in strids}
+        return (out.get('y'), out.get('m'), out.get('d'))
+
+    def resolve_ymd(self, yearfirst, dayfirst):
+        len_ymd = len(self)
+        year, month, day = (None, None, None)
+
+        strids = (('y', self.ystridx),
+                  ('m', self.mstridx),
+                  ('d', self.dstridx))
+
+        strids = {key: val for key, val in strids if val is not None}
+        if (len(self) == len(strids) > 0 or
+                (len(self) == 3 and len(strids) == 2)):
+            return self._resolve_from_stridxs(strids)
+
+        mstridx = self.mstridx
+
+        if len_ymd > 3:
+            raise ValueError("More than three YMD values")
+        elif len_ymd == 1 or (mstridx is not None and len_ymd == 2):
+            # One member, or two members with a month string
+            if mstridx is not None:
+                month = self[mstridx]
+                # since mstridx is 0 or 1, self[mstridx-1] always
+                # looks up the other element
+                other = self[mstridx - 1]
+            else:
+                other = self[0]
+
+            if len_ymd > 1 or mstridx is None:
+                if other > 31:
+                    year = other
+                else:
+                    day = other
+
+        elif len_ymd == 2:
+            # Two members with numbers
+            if self[0] > 31:
+                # 99-01
+                year, month = self
+            elif self[1] > 31:
+                # 01-99
+                month, year = self
+            elif dayfirst and self[1] <= 12:
+                # 13-01
+                day, month = self
+            else:
+                # 01-13
+                month, day = self
+
+        elif len_ymd == 3:
+            # Three members
+            if mstridx == 0:
+                if self[1] > 31:
+                    # Apr-2003-25
+                    month, year, day = self
+                else:
+                    month, day, year = self
+            elif mstridx == 1:
+                if self[0] > 31 or (yearfirst and self[2] <= 31):
+                    # 99-Jan-01
+                    year, month, day = self
+                else:
+                    # 01-Jan-01
+                    # Give precedence to day-first, since
+                    # two-digit years is usually hand-written.
+                    day, month, year = self
+
+            elif mstridx == 2:
+                # WTF!?
+                if self[1] > 31:
+                    # 01-99-Jan
+                    day, year, month = self
+                else:
+                    # 99-01-Jan
+                    year, day, month = self
+
+            else:
+                if (self[0] > 31 or
+                    self.ystridx == 0 or
+                        (yearfirst and self[1] <= 12 and self[2] <= 31)):
+                    # 99-01-01
+                    if dayfirst and self[2] <= 12:
+                        year, day, month = self
+                    else:
+                        year, month, day = self
+                elif self[0] > 12 or (dayfirst and self[1] <= 12):
+                    # 13-01-01
+                    day, month, year = self
+                else:
+                    # 01-13-01
+                    month, day, year = self
+
+        return year, month, day
+
+
+class parser(object):
+    def __init__(self, info=None):
+        self.info = info or parserinfo()
+
+    def parse(self, timestr, default=None,
+              ignoretz=False, tzinfos=None, **kwargs):
+        """
+        Parse the date/time string into a :class:`datetime.datetime` object.
+
+        :param timestr:
+            Any date/time string using the supported formats.
+
+        :param default:
+            The default datetime object, if this is a datetime object and not
+            ``None``, elements specified in ``timestr`` replace elements in the
+            default object.
+
+        :param ignoretz:
+            If set ``True``, time zones in parsed strings are ignored and a
+            naive :class:`datetime.datetime` object is returned.
+
+        :param tzinfos:
+            Additional time zone names / aliases which may be present in the
+            string. This argument maps time zone names (and optionally offsets
+            from those time zones) to time zones. This parameter can be a
+            dictionary with timezone aliases mapping time zone names to time
+            zones or a function taking two parameters (``tzname`` and
+            ``tzoffset``) and returning a time zone.
+
+            The timezones to which the names are mapped can be an integer
+            offset from UTC in seconds or a :class:`tzinfo` object.
+
+            .. doctest::
+               :options: +NORMALIZE_WHITESPACE
+
+                >>> from dateutil.parser import parse
+                >>> from dateutil.tz import gettz
+                >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")}
+                >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
+                datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200))
+                >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
+                datetime.datetime(2012, 1, 19, 17, 21,
+                                  tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
+
+            This parameter is ignored if ``ignoretz`` is set.
+
+        :param \\*\\*kwargs:
+            Keyword arguments as passed to ``_parse()``.
+
+        :return:
+            Returns a :class:`datetime.datetime` object or, if the
+            ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
+            first element being a :class:`datetime.datetime` object, the second
+            a tuple containing the fuzzy tokens.
+
+        :raises ParserError:
+            Raised for invalid or unknown string format, if the provided
+            :class:`tzinfo` is not in a valid format, or if an invalid date
+            would be created.
+
+        :raises TypeError:
+            Raised for non-string or character stream input.
+
+        :raises OverflowError:
+            Raised if the parsed date exceeds the largest valid C integer on
+            your system.
+        """
+
+        if default is None:
+            default = datetime.datetime.now().replace(hour=0, minute=0,
+                                                      second=0, microsecond=0)
+
+        res, skipped_tokens = self._parse(timestr, **kwargs)
+
+        if res is None:
+            raise ParserError("Unknown string format: %s", timestr)
+
+        if len(res) == 0:
+            raise ParserError("String does not contain a date: %s", timestr)
+
+        try:
+            ret = self._build_naive(res, default)
+        except ValueError as e:
+            six.raise_from(ParserError(str(e) + ": %s", timestr), e)
+
+        if not ignoretz:
+            ret = self._build_tzaware(ret, res, tzinfos)
+
+        if kwargs.get('fuzzy_with_tokens', False):
+            return ret, skipped_tokens
+        else:
+            return ret
+
+    class _result(_resultbase):
+        __slots__ = ["year", "month", "day", "weekday",
+                     "hour", "minute", "second", "microsecond",
+                     "tzname", "tzoffset", "ampm","any_unused_tokens"]
+
+    def _parse(self, timestr, dayfirst=None, yearfirst=None, fuzzy=False,
+               fuzzy_with_tokens=False):
+        """
+        Private method which performs the heavy lifting of parsing, called from
+        ``parse()``, which passes on its ``kwargs`` to this function.
+
+        :param timestr:
+            The string to parse.
+
+        :param dayfirst:
+            Whether to interpret the first value in an ambiguous 3-integer date
+            (e.g. 01/05/09) as the day (``True``) or month (``False``). If
+            ``yearfirst`` is set to ``True``, this distinguishes between YDM
+            and YMD. If set to ``None``, this value is retrieved from the
+            current :class:`parserinfo` object (which itself defaults to
+            ``False``).
+
+        :param yearfirst:
+            Whether to interpret the first value in an ambiguous 3-integer date
+            (e.g. 01/05/09) as the year. If ``True``, the first number is taken
+            to be the year, otherwise the last number is taken to be the year.
+            If this is set to ``None``, the value is retrieved from the current
+            :class:`parserinfo` object (which itself defaults to ``False``).
+
+        :param fuzzy:
+            Whether to allow fuzzy parsing, allowing for string like "Today is
+            January 1, 2047 at 8:21:00AM".
+
+        :param fuzzy_with_tokens:
+            If ``True``, ``fuzzy`` is automatically set to True, and the parser
+            will return a tuple where the first element is the parsed
+            :class:`datetime.datetime` datetimestamp and the second element is
+            a tuple containing the portions of the string which were ignored:
+
+            .. doctest::
+
+                >>> from dateutil.parser import parse
+                >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
+                (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
+
+        """
+        if fuzzy_with_tokens:
+            fuzzy = True
+
+        info = self.info
+
+        if dayfirst is None:
+            dayfirst = info.dayfirst
+
+        if yearfirst is None:
+            yearfirst = info.yearfirst
+
+        res = self._result()
+        l = _timelex.split(timestr)         # Splits the timestr into tokens
+
+        skipped_idxs = []
+
+        # year/month/day list
+        ymd = _ymd()
+
+        len_l = len(l)
+        i = 0
+        try:
+            while i < len_l:
+
+                # Check if it's a number
+                value_repr = l[i]
+                try:
+                    value = float(value_repr)
+                except ValueError:
+                    value = None
+
+                if value is not None:
+                    # Numeric token
+                    i = self._parse_numeric_token(l, i, info, ymd, res, fuzzy)
+
+                # Check weekday
+                elif info.weekday(l[i]) is not None:
+                    value = info.weekday(l[i])
+                    res.weekday = value
+
+                # Check month name
+                elif info.month(l[i]) is not None:
+                    value = info.month(l[i])
+                    ymd.append(value, 'M')
+
+                    if i + 1 < len_l:
+                        if l[i + 1] in ('-', '/'):
+                            # Jan-01[-99]
+                            sep = l[i + 1]
+                            ymd.append(l[i + 2])
+
+                            if i + 3 < len_l and l[i + 3] == sep:
+                                # Jan-01-99
+                                ymd.append(l[i + 4])
+                                i += 2
+
+                            i += 2
+
+                        elif (i + 4 < len_l and l[i + 1] == l[i + 3] == ' ' and
+                              info.pertain(l[i + 2])):
+                            # Jan of 01
+                            # In this case, 01 is clearly year
+                            if l[i + 4].isdigit():
+                                # Convert it here to become unambiguous
+                                value = int(l[i + 4])
+                                year = str(info.convertyear(value))
+                                ymd.append(year, 'Y')
+                            else:
+                                # Wrong guess
+                                pass
+                                # TODO: not hit in tests
+                            i += 4
+
+                # Check am/pm
+                elif info.ampm(l[i]) is not None:
+                    value = info.ampm(l[i])
+                    val_is_ampm = self._ampm_valid(res.hour, res.ampm, fuzzy)
+
+                    if val_is_ampm:
+                        res.hour = self._adjust_ampm(res.hour, value)
+                        res.ampm = value
+
+                    elif fuzzy:
+                        skipped_idxs.append(i)
+
+                # Check for a timezone name
+                elif self._could_be_tzname(res.hour, res.tzname, res.tzoffset, l[i]):
+                    res.tzname = l[i]
+                    res.tzoffset = info.tzoffset(res.tzname)
+
+                    # Check for something like GMT+3, or BRST+3. Notice
+                    # that it doesn't mean "I am 3 hours after GMT", but
+                    # "my time +3 is GMT". If found, we reverse the
+                    # logic so that timezone parsing code will get it
+                    # right.
+                    if i + 1 < len_l and l[i + 1] in ('+', '-'):
+                        l[i + 1] = ('+', '-')[l[i + 1] == '+']
+                        res.tzoffset = None
+                        if info.utczone(res.tzname):
+                            # With something like GMT+3, the timezone
+                            # is *not* GMT.
+                            res.tzname = None
+
+                # Check for a numbered timezone
+                elif res.hour is not None and l[i] in ('+', '-'):
+                    signal = (-1, 1)[l[i] == '+']
+                    len_li = len(l[i + 1])
+
+                    # TODO: check that l[i + 1] is integer?
+                    if len_li == 4:
+                        # -0300
+                        hour_offset = int(l[i + 1][:2])
+                        min_offset = int(l[i + 1][2:])
+                    elif i + 2 < len_l and l[i + 2] == ':':
+                        # -03:00
+                        hour_offset = int(l[i + 1])
+                        min_offset = int(l[i + 3])  # TODO: Check that l[i+3] is minute-like?
+                        i += 2
+                    elif len_li <= 2:
+                        # -[0]3
+                        hour_offset = int(l[i + 1][:2])
+                        min_offset = 0
+                    else:
+                        raise ValueError(timestr)
+
+                    res.tzoffset = signal * (hour_offset * 3600 + min_offset * 60)
+
+                    # Look for a timezone name between parenthesis
+                    if (i + 5 < len_l and
+                            info.jump(l[i + 2]) and l[i + 3] == '(' and
+                            l[i + 5] == ')' and
+                            3 <= len(l[i + 4]) and
+                            self._could_be_tzname(res.hour, res.tzname,
+                                                  None, l[i + 4])):
+                        # -0300 (BRST)
+                        res.tzname = l[i + 4]
+                        i += 4
+
+                    i += 1
+
+                # Check jumps
+                elif not (info.jump(l[i]) or fuzzy):
+                    raise ValueError(timestr)
+
+                else:
+                    skipped_idxs.append(i)
+                i += 1
+
+            # Process year/month/day
+            year, month, day = ymd.resolve_ymd(yearfirst, dayfirst)
+
+            res.century_specified = ymd.century_specified
+            res.year = year
+            res.month = month
+            res.day = day
+
+        except (IndexError, ValueError):
+            return None, None
+
+        if not info.validate(res):
+            return None, None
+
+        if fuzzy_with_tokens:
+            skipped_tokens = self._recombine_skipped(l, skipped_idxs)
+            return res, tuple(skipped_tokens)
+        else:
+            return res, None
+
+    def _parse_numeric_token(self, tokens, idx, info, ymd, res, fuzzy):
+        # Token is a number
+        value_repr = tokens[idx]
+        try:
+            value = self._to_decimal(value_repr)
+        except Exception as e:
+            six.raise_from(ValueError('Unknown numeric token'), e)
+
+        len_li = len(value_repr)
+
+        len_l = len(tokens)
+
+        if (len(ymd) == 3 and len_li in (2, 4) and
+            res.hour is None and
+            (idx + 1 >= len_l or
+             (tokens[idx + 1] != ':' and
+              info.hms(tokens[idx + 1]) is None))):
+            # 19990101T23[59]
+            s = tokens[idx]
+            res.hour = int(s[:2])
+
+            if len_li == 4:
+                res.minute = int(s[2:])
+
+        elif len_li == 6 or (len_li > 6 and tokens[idx].find('.') == 6):
+            # YYMMDD or HHMMSS[.ss]
+            s = tokens[idx]
+
+            if not ymd and '.' not in tokens[idx]:
+                ymd.append(s[:2])
+                ymd.append(s[2:4])
+                ymd.append(s[4:])
+            else:
+                # 19990101T235959[.59]
+
+                # TODO: Check if res attributes already set.
+                res.hour = int(s[:2])
+                res.minute = int(s[2:4])
+                res.second, res.microsecond = self._parsems(s[4:])
+
+        elif len_li in (8, 12, 14):
+            # YYYYMMDD
+            s = tokens[idx]
+            ymd.append(s[:4], 'Y')
+            ymd.append(s[4:6])
+            ymd.append(s[6:8])
+
+            if len_li > 8:
+                res.hour = int(s[8:10])
+                res.minute = int(s[10:12])
+
+                if len_li > 12:
+                    res.second = int(s[12:])
+
+        elif self._find_hms_idx(idx, tokens, info, allow_jump=True) is not None:
+            # HH[ ]h or MM[ ]m or SS[.ss][ ]s
+            hms_idx = self._find_hms_idx(idx, tokens, info, allow_jump=True)
+            (idx, hms) = self._parse_hms(idx, tokens, info, hms_idx)
+            if hms is not None:
+                # TODO: checking that hour/minute/second are not
+                # already set?
+                self._assign_hms(res, value_repr, hms)
+
+        elif idx + 2 < len_l and tokens[idx + 1] == ':':
+            # HH:MM[:SS[.ss]]
+            res.hour = int(value)
+            value = self._to_decimal(tokens[idx + 2])  # TODO: try/except for this?
+            (res.minute, res.second) = self._parse_min_sec(value)
+
+            if idx + 4 < len_l and tokens[idx + 3] == ':':
+                res.second, res.microsecond = self._parsems(tokens[idx + 4])
+
+                idx += 2
+
+            idx += 2
+
+        elif idx + 1 < len_l and tokens[idx + 1] in ('-', '/', '.'):
+            sep = tokens[idx + 1]
+            ymd.append(value_repr)
+
+            if idx + 2 < len_l and not info.jump(tokens[idx + 2]):
+                if tokens[idx + 2].isdigit():
+                    # 01-01[-01]
+                    ymd.append(tokens[idx + 2])
+                else:
+                    # 01-Jan[-01]
+                    value = info.month(tokens[idx + 2])
+
+                    if value is not None:
+                        ymd.append(value, 'M')
+                    else:
+                        raise ValueError()
+
+                if idx + 3 < len_l and tokens[idx + 3] == sep:
+                    # We have three members
+                    value = info.month(tokens[idx + 4])
+
+                    if value is not None:
+                        ymd.append(value, 'M')
+                    else:
+                        ymd.append(tokens[idx + 4])
+                    idx += 2
+
+                idx += 1
+            idx += 1
+
+        elif idx + 1 >= len_l or info.jump(tokens[idx + 1]):
+            if idx + 2 < len_l and info.ampm(tokens[idx + 2]) is not None:
+                # 12 am
+                hour = int(value)
+                res.hour = self._adjust_ampm(hour, info.ampm(tokens[idx + 2]))
+                idx += 1
+            else:
+                # Year, month or day
+                ymd.append(value)
+            idx += 1
+
+        elif info.ampm(tokens[idx + 1]) is not None and (0 <= value < 24):
+            # 12am
+            hour = int(value)
+            res.hour = self._adjust_ampm(hour, info.ampm(tokens[idx + 1]))
+            idx += 1
+
+        elif ymd.could_be_day(value):
+            ymd.append(value)
+
+        elif not fuzzy:
+            raise ValueError()
+
+        return idx
+
+    def _find_hms_idx(self, idx, tokens, info, allow_jump):
+        len_l = len(tokens)
+
+        if idx+1 < len_l and info.hms(tokens[idx+1]) is not None:
+            # There is an "h", "m", or "s" label following this token.  We take
+            # assign the upcoming label to the current token.
+            # e.g. the "12" in 12h"
+            hms_idx = idx + 1
+
+        elif (allow_jump and idx+2 < len_l and tokens[idx+1] == ' ' and
+              info.hms(tokens[idx+2]) is not None):
+            # There is a space and then an "h", "m", or "s" label.
+            # e.g. the "12" in "12 h"
+            hms_idx = idx + 2
+
+        elif idx > 0 and info.hms(tokens[idx-1]) is not None:
+            # There is a "h", "m", or "s" preceding this token.  Since neither
+            # of the previous cases was hit, there is no label following this
+            # token, so we use the previous label.
+            # e.g. the "04" in "12h04"
+            hms_idx = idx-1
+
+        elif (1 < idx == len_l-1 and tokens[idx-1] == ' ' and
+              info.hms(tokens[idx-2]) is not None):
+            # If we are looking at the final token, we allow for a
+            # backward-looking check to skip over a space.
+            # TODO: Are we sure this is the right condition here?
+            hms_idx = idx - 2
+
+        else:
+            hms_idx = None
+
+        return hms_idx
+
+    def _assign_hms(self, res, value_repr, hms):
+        # See GH issue #427, fixing float rounding
+        value = self._to_decimal(value_repr)
+
+        if hms == 0:
+            # Hour
+            res.hour = int(value)
+            if value % 1:
+                res.minute = int(60*(value % 1))
+
+        elif hms == 1:
+            (res.minute, res.second) = self._parse_min_sec(value)
+
+        elif hms == 2:
+            (res.second, res.microsecond) = self._parsems(value_repr)
+
+    def _could_be_tzname(self, hour, tzname, tzoffset, token):
+        return (hour is not None and
+                tzname is None and
+                tzoffset is None and
+                len(token) <= 5 and
+                (all(x in string.ascii_uppercase for x in token)
+                 or token in self.info.UTCZONE))
+
+    def _ampm_valid(self, hour, ampm, fuzzy):
+        """
+        For fuzzy parsing, 'a' or 'am' (both valid English words)
+        may erroneously trigger the AM/PM flag. Deal with that
+        here.
+        """
+        val_is_ampm = True
+
+        # If there's already an AM/PM flag, this one isn't one.
+        if fuzzy and ampm is not None:
+            val_is_ampm = False
+
+        # If AM/PM is found and hour is not, raise a ValueError
+        if hour is None:
+            if fuzzy:
+                val_is_ampm = False
+            else:
+                raise ValueError('No hour specified with AM or PM flag.')
+        elif not 0 <= hour <= 12:
+            # If AM/PM is found, it's a 12 hour clock, so raise
+            # an error for invalid range
+            if fuzzy:
+                val_is_ampm = False
+            else:
+                raise ValueError('Invalid hour specified for 12-hour clock.')
+
+        return val_is_ampm
+
+    def _adjust_ampm(self, hour, ampm):
+        if hour < 12 and ampm == 1:
+            hour += 12
+        elif hour == 12 and ampm == 0:
+            hour = 0
+        return hour
+
+    def _parse_min_sec(self, value):
+        # TODO: Every usage of this function sets res.second to the return
+        # value. Are there any cases where second will be returned as None and
+        # we *don't* want to set res.second = None?
+        minute = int(value)
+        second = None
+
+        sec_remainder = value % 1
+        if sec_remainder:
+            second = int(60 * sec_remainder)
+        return (minute, second)
+
+    def _parse_hms(self, idx, tokens, info, hms_idx):
+        # TODO: Is this going to admit a lot of false-positives for when we
+        # just happen to have digits and "h", "m" or "s" characters in non-date
+        # text?  I guess hex hashes won't have that problem, but there's plenty
+        # of random junk out there.
+        if hms_idx is None:
+            hms = None
+            new_idx = idx
+        elif hms_idx > idx:
+            hms = info.hms(tokens[hms_idx])
+            new_idx = hms_idx
+        else:
+            # Looking backwards, increment one.
+            hms = info.hms(tokens[hms_idx]) + 1
+            new_idx = idx
+
+        return (new_idx, hms)
+
+    # ------------------------------------------------------------------
+    # Handling for individual tokens.  These are kept as methods instead
+    #  of functions for the sake of customizability via subclassing.
+
+    def _parsems(self, value):
+        """Parse a I[.F] seconds value into (seconds, microseconds)."""
+        if "." not in value:
+            return int(value), 0
+        else:
+            i, f = value.split(".")
+            return int(i), int(f.ljust(6, "0")[:6])
+
+    def _to_decimal(self, val):
+        try:
+            decimal_value = Decimal(val)
+            # See GH 662, edge case, infinite value should not be converted
+            #  via `_to_decimal`
+            if not decimal_value.is_finite():
+                raise ValueError("Converted decimal value is infinite or NaN")
+        except Exception as e:
+            msg = "Could not convert %s to decimal" % val
+            six.raise_from(ValueError(msg), e)
+        else:
+            return decimal_value
+
+    # ------------------------------------------------------------------
+    # Post-Parsing construction of datetime output.  These are kept as
+    #  methods instead of functions for the sake of customizability via
+    #  subclassing.
+
+    def _build_tzinfo(self, tzinfos, tzname, tzoffset):
+        if callable(tzinfos):
+            tzdata = tzinfos(tzname, tzoffset)
+        else:
+            tzdata = tzinfos.get(tzname)
+        # handle case where tzinfo is paased an options that returns None
+        # eg tzinfos = {'BRST' : None}
+        if isinstance(tzdata, datetime.tzinfo) or tzdata is None:
+            tzinfo = tzdata
+        elif isinstance(tzdata, text_type):
+            tzinfo = tz.tzstr(tzdata)
+        elif isinstance(tzdata, integer_types):
+            tzinfo = tz.tzoffset(tzname, tzdata)
+        else:
+            raise TypeError("Offset must be tzinfo subclass, tz string, "
+                            "or int offset.")
+        return tzinfo
+
+    def _build_tzaware(self, naive, res, tzinfos):
+        if (callable(tzinfos) or (tzinfos and res.tzname in tzinfos)):
+            tzinfo = self._build_tzinfo(tzinfos, res.tzname, res.tzoffset)
+            aware = naive.replace(tzinfo=tzinfo)
+            aware = self._assign_tzname(aware, res.tzname)
+
+        elif res.tzname and res.tzname in time.tzname:
+            aware = naive.replace(tzinfo=tz.tzlocal())
+
+            # Handle ambiguous local datetime
+            aware = self._assign_tzname(aware, res.tzname)
+
+            # This is mostly relevant for winter GMT zones parsed in the UK
+            if (aware.tzname() != res.tzname and
+                    res.tzname in self.info.UTCZONE):
+                aware = aware.replace(tzinfo=tz.UTC)
+
+        elif res.tzoffset == 0:
+            aware = naive.replace(tzinfo=tz.UTC)
+
+        elif res.tzoffset:
+            aware = naive.replace(tzinfo=tz.tzoffset(res.tzname, res.tzoffset))
+
+        elif not res.tzname and not res.tzoffset:
+            # i.e. no timezone information was found.
+            aware = naive
+
+        elif res.tzname:
+            # tz-like string was parsed but we don't know what to do
+            # with it
+            warnings.warn("tzname {tzname} identified but not understood.  "
+                          "Pass `tzinfos` argument in order to correctly "
+                          "return a timezone-aware datetime.  In a future "
+                          "version, this will raise an "
+                          "exception.".format(tzname=res.tzname),
+                          category=UnknownTimezoneWarning)
+            aware = naive
+
+        return aware
+
+    def _build_naive(self, res, default):
+        repl = {}
+        for attr in ("year", "month", "day", "hour",
+                     "minute", "second", "microsecond"):
+            value = getattr(res, attr)
+            if value is not None:
+                repl[attr] = value
+
+        if 'day' not in repl:
+            # If the default day exceeds the last day of the month, fall back
+            # to the end of the month.
+            cyear = default.year if res.year is None else res.year
+            cmonth = default.month if res.month is None else res.month
+            cday = default.day if res.day is None else res.day
+
+            if cday > monthrange(cyear, cmonth)[1]:
+                repl['day'] = monthrange(cyear, cmonth)[1]
+
+        naive = default.replace(**repl)
+
+        if res.weekday is not None and not res.day:
+            naive = naive + relativedelta.relativedelta(weekday=res.weekday)
+
+        return naive
+
+    def _assign_tzname(self, dt, tzname):
+        if dt.tzname() != tzname:
+            new_dt = tz.enfold(dt, fold=1)
+            if new_dt.tzname() == tzname:
+                return new_dt
+
+        return dt
+
+    def _recombine_skipped(self, tokens, skipped_idxs):
+        """
+        >>> tokens = ["foo", " ", "bar", " ", "19June2000", "baz"]
+        >>> skipped_idxs = [0, 1, 2, 5]
+        >>> _recombine_skipped(tokens, skipped_idxs)
+        ["foo bar", "baz"]
+        """
+        skipped_tokens = []
+        for i, idx in enumerate(sorted(skipped_idxs)):
+            if i > 0 and idx - 1 == skipped_idxs[i - 1]:
+                skipped_tokens[-1] = skipped_tokens[-1] + tokens[idx]
+            else:
+                skipped_tokens.append(tokens[idx])
+
+        return skipped_tokens
+
+
+DEFAULTPARSER = parser()
+
+
+def parse(timestr, parserinfo=None, **kwargs):
+    """
+
+    Parse a string in one of the supported formats, using the
+    ``parserinfo`` parameters.
+
+    :param timestr:
+        A string containing a date/time stamp.
+
+    :param parserinfo:
+        A :class:`parserinfo` object containing parameters for the parser.
+        If ``None``, the default arguments to the :class:`parserinfo`
+        constructor are used.
+
+    The ``**kwargs`` parameter takes the following keyword arguments:
+
+    :param default:
+        The default datetime object, if this is a datetime object and not
+        ``None``, elements specified in ``timestr`` replace elements in the
+        default object.
+
+    :param ignoretz:
+        If set ``True``, time zones in parsed strings are ignored and a naive
+        :class:`datetime` object is returned.
+
+    :param tzinfos:
+        Additional time zone names / aliases which may be present in the
+        string. This argument maps time zone names (and optionally offsets
+        from those time zones) to time zones. This parameter can be a
+        dictionary with timezone aliases mapping time zone names to time
+        zones or a function taking two parameters (``tzname`` and
+        ``tzoffset``) and returning a time zone.
+
+        The timezones to which the names are mapped can be an integer
+        offset from UTC in seconds or a :class:`tzinfo` object.
+
+        .. doctest::
+           :options: +NORMALIZE_WHITESPACE
+
+            >>> from dateutil.parser import parse
+            >>> from dateutil.tz import gettz
+            >>> tzinfos = {"BRST": -7200, "CST": gettz("America/Chicago")}
+            >>> parse("2012-01-19 17:21:00 BRST", tzinfos=tzinfos)
+            datetime.datetime(2012, 1, 19, 17, 21, tzinfo=tzoffset(u'BRST', -7200))
+            >>> parse("2012-01-19 17:21:00 CST", tzinfos=tzinfos)
+            datetime.datetime(2012, 1, 19, 17, 21,
+                              tzinfo=tzfile('/usr/share/zoneinfo/America/Chicago'))
+
+        This parameter is ignored if ``ignoretz`` is set.
+
+    :param dayfirst:
+        Whether to interpret the first value in an ambiguous 3-integer date
+        (e.g. 01/05/09) as the day (``True``) or month (``False``). If
+        ``yearfirst`` is set to ``True``, this distinguishes between YDM and
+        YMD. If set to ``None``, this value is retrieved from the current
+        :class:`parserinfo` object (which itself defaults to ``False``).
+
+    :param yearfirst:
+        Whether to interpret the first value in an ambiguous 3-integer date
+        (e.g. 01/05/09) as the year. If ``True``, the first number is taken to
+        be the year, otherwise the last number is taken to be the year. If
+        this is set to ``None``, the value is retrieved from the current
+        :class:`parserinfo` object (which itself defaults to ``False``).
+
+    :param fuzzy:
+        Whether to allow fuzzy parsing, allowing for string like "Today is
+        January 1, 2047 at 8:21:00AM".
+
+    :param fuzzy_with_tokens:
+        If ``True``, ``fuzzy`` is automatically set to True, and the parser
+        will return a tuple where the first element is the parsed
+        :class:`datetime.datetime` datetimestamp and the second element is
+        a tuple containing the portions of the string which were ignored:
+
+        .. doctest::
+
+            >>> from dateutil.parser import parse
+            >>> parse("Today is January 1, 2047 at 8:21:00AM", fuzzy_with_tokens=True)
+            (datetime.datetime(2047, 1, 1, 8, 21), (u'Today is ', u' ', u'at '))
+
+    :return:
+        Returns a :class:`datetime.datetime` object or, if the
+        ``fuzzy_with_tokens`` option is ``True``, returns a tuple, the
+        first element being a :class:`datetime.datetime` object, the second
+        a tuple containing the fuzzy tokens.
+
+    :raises ParserError:
+        Raised for invalid or unknown string formats, if the provided
+        :class:`tzinfo` is not in a valid format, or if an invalid date would
+        be created.
+
+    :raises OverflowError:
+        Raised if the parsed date exceeds the largest valid C integer on
+        your system.
+    """
+    if parserinfo:
+        return parser(parserinfo).parse(timestr, **kwargs)
+    else:
+        return DEFAULTPARSER.parse(timestr, **kwargs)
+
+
+class _tzparser(object):
+
+    class _result(_resultbase):
+
+        __slots__ = ["stdabbr", "stdoffset", "dstabbr", "dstoffset",
+                     "start", "end"]
+
+        class _attr(_resultbase):
+            __slots__ = ["month", "week", "weekday",
+                         "yday", "jyday", "day", "time"]
+
+        def __repr__(self):
+            return self._repr("")
+
+        def __init__(self):
+            _resultbase.__init__(self)
+            self.start = self._attr()
+            self.end = self._attr()
+
+    def parse(self, tzstr):
+        res = self._result()
+        l = [x for x in re.split(r'([,:.]|[a-zA-Z]+|[0-9]+)',tzstr) if x]
+        used_idxs = list()
+        try:
+
+            len_l = len(l)
+
+            i = 0
+            while i < len_l:
+                # BRST+3[BRDT[+2]]
+                j = i
+                while j < len_l and not [x for x in l[j]
+                                         if x in "0123456789:,-+"]:
+                    j += 1
+                if j != i:
+                    if not res.stdabbr:
+                        offattr = "stdoffset"
+                        res.stdabbr = "".join(l[i:j])
+                    else:
+                        offattr = "dstoffset"
+                        res.dstabbr = "".join(l[i:j])
+
+                    for ii in range(j):
+                        used_idxs.append(ii)
+                    i = j
+                    if (i < len_l and (l[i] in ('+', '-') or l[i][0] in
+                                       "0123456789")):
+                        if l[i] in ('+', '-'):
+                            # Yes, that's right.  See the TZ variable
+                            # documentation.
+                            signal = (1, -1)[l[i] == '+']
+                            used_idxs.append(i)
+                            i += 1
+                        else:
+                            signal = -1
+                        len_li = len(l[i])
+                        if len_li == 4:
+                            # -0300
+                            setattr(res, offattr, (int(l[i][:2]) * 3600 +
+                                                   int(l[i][2:]) * 60) * signal)
+                        elif i + 1 < len_l and l[i + 1] == ':':
+                            # -03:00
+                            setattr(res, offattr,
+                                    (int(l[i]) * 3600 +
+                                     int(l[i + 2]) * 60) * signal)
+                            used_idxs.append(i)
+                            i += 2
+                        elif len_li <= 2:
+                            # -[0]3
+                            setattr(res, offattr,
+                                    int(l[i][:2]) * 3600 * signal)
+                        else:
+                            return None
+                        used_idxs.append(i)
+                        i += 1
+                    if res.dstabbr:
+                        break
+                else:
+                    break
+
+
+            if i < len_l:
+                for j in range(i, len_l):
+                    if l[j] == ';':
+                        l[j] = ','
+
+                assert l[i] == ','
+
+                i += 1
+
+            if i >= len_l:
+                pass
+            elif (8 <= l.count(',') <= 9 and
+                  not [y for x in l[i:] if x != ','
+                       for y in x if y not in "0123456789+-"]):
+                # GMT0BST,3,0,30,3600,10,0,26,7200[,3600]
+                for x in (res.start, res.end):
+                    x.month = int(l[i])
+                    used_idxs.append(i)
+                    i += 2
+                    if l[i] == '-':
+                        value = int(l[i + 1]) * -1
+                        used_idxs.append(i)
+                        i += 1
+                    else:
+                        value = int(l[i])
+                    used_idxs.append(i)
+                    i += 2
+                    if value:
+                        x.week = value
+                        x.weekday = (int(l[i]) - 1) % 7
+                    else:
+                        x.day = int(l[i])
+                    used_idxs.append(i)
+                    i += 2
+                    x.time = int(l[i])
+                    used_idxs.append(i)
+                    i += 2
+                if i < len_l:
+                    if l[i] in ('-', '+'):
+                        signal = (-1, 1)[l[i] == "+"]
+                        used_idxs.append(i)
+                        i += 1
+                    else:
+                        signal = 1
+                    used_idxs.append(i)
+                    res.dstoffset = (res.stdoffset + int(l[i]) * signal)
+
+                # This was a made-up format that is not in normal use
+                warn(('Parsed time zone "%s"' % tzstr) +
+                     'is in a non-standard dateutil-specific format, which ' +
+                     'is now deprecated; support for parsing this format ' +
+                     'will be removed in future versions. It is recommended ' +
+                     'that you switch to a standard format like the GNU ' +
+                     'TZ variable format.', tz.DeprecatedTzFormatWarning)
+            elif (l.count(',') == 2 and l[i:].count('/') <= 2 and
+                  not [y for x in l[i:] if x not in (',', '/', 'J', 'M',
+                                                     '.', '-', ':')
+                       for y in x if y not in "0123456789"]):
+                for x in (res.start, res.end):
+                    if l[i] == 'J':
+                        # non-leap year day (1 based)
+                        used_idxs.append(i)
+                        i += 1
+                        x.jyday = int(l[i])
+                    elif l[i] == 'M':
+                        # month[-.]week[-.]weekday
+                        used_idxs.append(i)
+                        i += 1
+                        x.month = int(l[i])
+                        used_idxs.append(i)
+                        i += 1
+                        assert l[i] in ('-', '.')
+                        used_idxs.append(i)
+                        i += 1
+                        x.week = int(l[i])
+                        if x.week == 5:
+                            x.week = -1
+                        used_idxs.append(i)
+                        i += 1
+                        assert l[i] in ('-', '.')
+                        used_idxs.append(i)
+                        i += 1
+                        x.weekday = (int(l[i]) - 1) % 7
+                    else:
+                        # year day (zero based)
+                        x.yday = int(l[i]) + 1
+
+                    used_idxs.append(i)
+                    i += 1
+
+                    if i < len_l and l[i] == '/':
+                        used_idxs.append(i)
+                        i += 1
+                        # start time
+                        len_li = len(l[i])
+                        if len_li == 4:
+                            # -0300
+                            x.time = (int(l[i][:2]) * 3600 +
+                                      int(l[i][2:]) * 60)
+                        elif i + 1 < len_l and l[i + 1] == ':':
+                            # -03:00
+                            x.time = int(l[i]) * 3600 + int(l[i + 2]) * 60
+                            used_idxs.append(i)
+                            i += 2
+                            if i + 1 < len_l and l[i + 1] == ':':
+                                used_idxs.append(i)
+                                i += 2
+                                x.time += int(l[i])
+                        elif len_li <= 2:
+                            # -[0]3
+                            x.time = (int(l[i][:2]) * 3600)
+                        else:
+                            return None
+                        used_idxs.append(i)
+                        i += 1
+
+                    assert i == len_l or l[i] == ','
+
+                    i += 1
+
+                assert i >= len_l
+
+        except (IndexError, ValueError, AssertionError):
+            return None
+
+        unused_idxs = set(range(len_l)).difference(used_idxs)
+        res.any_unused_tokens = not {l[n] for n in unused_idxs}.issubset({",",":"})
+        return res
+
+
+DEFAULTTZPARSER = _tzparser()
+
+
+def _parsetz(tzstr):
+    return DEFAULTTZPARSER.parse(tzstr)
+
+
+class ParserError(ValueError):
+    """Exception subclass used for any failure to parse a datetime string.
+
+    This is a subclass of :py:exc:`ValueError`, and should be raised any time
+    earlier versions of ``dateutil`` would have raised ``ValueError``.
+
+    .. versionadded:: 2.8.1
+    """
+    def __str__(self):
+        try:
+            return self.args[0] % self.args[1:]
+        except (TypeError, IndexError):
+            return super(ParserError, self).__str__()
+
+    def __repr__(self):
+        args = ", ".join("'%s'" % arg for arg in self.args)
+        return "%s(%s)" % (self.__class__.__name__, args)
+
+
+class UnknownTimezoneWarning(RuntimeWarning):
+    """Raised when the parser finds a timezone it cannot parse into a tzinfo.
+
+    .. versionadded:: 2.7.0
+    """
+# vim:ts=4:sw=4:et
diff --git a/dateutil/parser/isoparser.py b/dateutil/parser/isoparser.py
new file mode 100644
index 0000000000000000000000000000000000000000..7060087df4776a07347cbb60127a70db393e3a65
--- /dev/null
+++ b/dateutil/parser/isoparser.py
@@ -0,0 +1,416 @@
+# -*- coding: utf-8 -*-
+"""
+This module offers a parser for ISO-8601 strings
+
+It is intended to support all valid date, time and datetime formats per the
+ISO-8601 specification.
+
+..versionadded:: 2.7.0
+"""
+from datetime import datetime, timedelta, time, date
+import calendar
+from dateutil import tz
+
+from functools import wraps
+
+import re
+import six
+
+__all__ = ["isoparse", "isoparser"]
+
+
+def _takes_ascii(f):
+    @wraps(f)
+    def func(self, str_in, *args, **kwargs):
+        # If it's a stream, read the whole thing
+        str_in = getattr(str_in, 'read', lambda: str_in)()
+
+        # If it's unicode, turn it into bytes, since ISO-8601 only covers ASCII
+        if isinstance(str_in, six.text_type):
+            # ASCII is the same in UTF-8
+            try:
+                str_in = str_in.encode('ascii')
+            except UnicodeEncodeError as e:
+                msg = 'ISO-8601 strings should contain only ASCII characters'
+                six.raise_from(ValueError(msg), e)
+
+        return f(self, str_in, *args, **kwargs)
+
+    return func
+
+
+class isoparser(object):
+    def __init__(self, sep=None):
+        """
+        :param sep:
+            A single character that separates date and time portions. If
+            ``None``, the parser will accept any single character.
+            For strict ISO-8601 adherence, pass ``'T'``.
+        """
+        if sep is not None:
+            if (len(sep) != 1 or ord(sep) >= 128 or sep in '0123456789'):
+                raise ValueError('Separator must be a single, non-numeric ' +
+                                 'ASCII character')
+
+            sep = sep.encode('ascii')
+
+        self._sep = sep
+
+    @_takes_ascii
+    def isoparse(self, dt_str):
+        """
+        Parse an ISO-8601 datetime string into a :class:`datetime.datetime`.
+
+        An ISO-8601 datetime string consists of a date portion, followed
+        optionally by a time portion - the date and time portions are separated
+        by a single character separator, which is ``T`` in the official
+        standard. Incomplete date formats (such as ``YYYY-MM``) may *not* be
+        combined with a time portion.
+
+        Supported date formats are:
+
+        Common:
+
+        - ``YYYY``
+        - ``YYYY-MM``
+        - ``YYYY-MM-DD`` or ``YYYYMMDD``
+
+        Uncommon:
+
+        - ``YYYY-Www`` or ``YYYYWww`` - ISO week (day defaults to 0)
+        - ``YYYY-Www-D`` or ``YYYYWwwD`` - ISO week and day
+
+        The ISO week and day numbering follows the same logic as
+        :func:`datetime.date.isocalendar`.
+
+        Supported time formats are:
+
+        - ``hh``
+        - ``hh:mm`` or ``hhmm``
+        - ``hh:mm:ss`` or ``hhmmss``
+        - ``hh:mm:ss.ssssss`` (Up to 6 sub-second digits)
+
+        Midnight is a special case for `hh`, as the standard supports both
+        00:00 and 24:00 as a representation. The decimal separator can be
+        either a dot or a comma.
+
+
+        .. caution::
+
+            Support for fractional components other than seconds is part of the
+            ISO-8601 standard, but is not currently implemented in this parser.
+
+        Supported time zone offset formats are:
+
+        - `Z` (UTC)
+        - `±HH:MM`
+        - `±HHMM`
+        - `±HH`
+
+        Offsets will be represented as :class:`dateutil.tz.tzoffset` objects,
+        with the exception of UTC, which will be represented as
+        :class:`dateutil.tz.tzutc`. Time zone offsets equivalent to UTC (such
+        as `+00:00`) will also be represented as :class:`dateutil.tz.tzutc`.
+
+        :param dt_str:
+            A string or stream containing only an ISO-8601 datetime string
+
+        :return:
+            Returns a :class:`datetime.datetime` representing the string.
+            Unspecified components default to their lowest value.
+
+        .. warning::
+
+            As of version 2.7.0, the strictness of the parser should not be
+            considered a stable part of the contract. Any valid ISO-8601 string
+            that parses correctly with the default settings will continue to
+            parse correctly in future versions, but invalid strings that
+            currently fail (e.g. ``2017-01-01T00:00+00:00:00``) are not
+            guaranteed to continue failing in future versions if they encode
+            a valid date.
+
+        .. versionadded:: 2.7.0
+        """
+        components, pos = self._parse_isodate(dt_str)
+
+        if len(dt_str) > pos:
+            if self._sep is None or dt_str[pos:pos + 1] == self._sep:
+                components += self._parse_isotime(dt_str[pos + 1:])
+            else:
+                raise ValueError('String contains unknown ISO components')
+
+        if len(components) > 3 and components[3] == 24:
+            components[3] = 0
+            return datetime(*components) + timedelta(days=1)
+
+        return datetime(*components)
+
+    @_takes_ascii
+    def parse_isodate(self, datestr):
+        """
+        Parse the date portion of an ISO string.
+
+        :param datestr:
+            The string portion of an ISO string, without a separator
+
+        :return:
+            Returns a :class:`datetime.date` object
+        """
+        components, pos = self._parse_isodate(datestr)
+        if pos < len(datestr):
+            raise ValueError('String contains unknown ISO ' +
+                             'components: {!r}'.format(datestr.decode('ascii')))
+        return date(*components)
+
+    @_takes_ascii
+    def parse_isotime(self, timestr):
+        """
+        Parse the time portion of an ISO string.
+
+        :param timestr:
+            The time portion of an ISO string, without a separator
+
+        :return:
+            Returns a :class:`datetime.time` object
+        """
+        components = self._parse_isotime(timestr)
+        if components[0] == 24:
+            components[0] = 0
+        return time(*components)
+
+    @_takes_ascii
+    def parse_tzstr(self, tzstr, zero_as_utc=True):
+        """
+        Parse a valid ISO time zone string.
+
+        See :func:`isoparser.isoparse` for details on supported formats.
+
+        :param tzstr:
+            A string representing an ISO time zone offset
+
+        :param zero_as_utc:
+            Whether to return :class:`dateutil.tz.tzutc` for zero-offset zones
+
+        :return:
+            Returns :class:`dateutil.tz.tzoffset` for offsets and
+            :class:`dateutil.tz.tzutc` for ``Z`` and (if ``zero_as_utc`` is
+            specified) offsets equivalent to UTC.
+        """
+        return self._parse_tzstr(tzstr, zero_as_utc=zero_as_utc)
+
+    # Constants
+    _DATE_SEP = b'-'
+    _TIME_SEP = b':'
+    _FRACTION_REGEX = re.compile(b'[\\.,]([0-9]+)')
+
+    def _parse_isodate(self, dt_str):
+        try:
+            return self._parse_isodate_common(dt_str)
+        except ValueError:
+            return self._parse_isodate_uncommon(dt_str)
+
+    def _parse_isodate_common(self, dt_str):
+        len_str = len(dt_str)
+        components = [1, 1, 1]
+
+        if len_str < 4:
+            raise ValueError('ISO string too short')
+
+        # Year
+        components[0] = int(dt_str[0:4])
+        pos = 4
+        if pos >= len_str:
+            return components, pos
+
+        has_sep = dt_str[pos:pos + 1] == self._DATE_SEP
+        if has_sep:
+            pos += 1
+
+        # Month
+        if len_str - pos < 2:
+            raise ValueError('Invalid common month')
+
+        components[1] = int(dt_str[pos:pos + 2])
+        pos += 2
+
+        if pos >= len_str:
+            if has_sep:
+                return components, pos
+            else:
+                raise ValueError('Invalid ISO format')
+
+        if has_sep:
+            if dt_str[pos:pos + 1] != self._DATE_SEP:
+                raise ValueError('Invalid separator in ISO string')
+            pos += 1
+
+        # Day
+        if len_str - pos < 2:
+            raise ValueError('Invalid common day')
+        components[2] = int(dt_str[pos:pos + 2])
+        return components, pos + 2
+
+    def _parse_isodate_uncommon(self, dt_str):
+        if len(dt_str) < 4:
+            raise ValueError('ISO string too short')
+
+        # All ISO formats start with the year
+        year = int(dt_str[0:4])
+
+        has_sep = dt_str[4:5] == self._DATE_SEP
+
+        pos = 4 + has_sep       # Skip '-' if it's there
+        if dt_str[pos:pos + 1] == b'W':
+            # YYYY-?Www-?D?
+            pos += 1
+            weekno = int(dt_str[pos:pos + 2])
+            pos += 2
+
+            dayno = 1
+            if len(dt_str) > pos:
+                if (dt_str[pos:pos + 1] == self._DATE_SEP) != has_sep:
+                    raise ValueError('Inconsistent use of dash separator')
+
+                pos += has_sep
+
+                dayno = int(dt_str[pos:pos + 1])
+                pos += 1
+
+            base_date = self._calculate_weekdate(year, weekno, dayno)
+        else:
+            # YYYYDDD or YYYY-DDD
+            if len(dt_str) - pos < 3:
+                raise ValueError('Invalid ordinal day')
+
+            ordinal_day = int(dt_str[pos:pos + 3])
+            pos += 3
+
+            if ordinal_day < 1 or ordinal_day > (365 + calendar.isleap(year)):
+                raise ValueError('Invalid ordinal day' +
+                                 ' {} for year {}'.format(ordinal_day, year))
+
+            base_date = date(year, 1, 1) + timedelta(days=ordinal_day - 1)
+
+        components = [base_date.year, base_date.month, base_date.day]
+        return components, pos
+
+    def _calculate_weekdate(self, year, week, day):
+        """
+        Calculate the day of corresponding to the ISO year-week-day calendar.
+
+        This function is effectively the inverse of
+        :func:`datetime.date.isocalendar`.
+
+        :param year:
+            The year in the ISO calendar
+
+        :param week:
+            The week in the ISO calendar - range is [1, 53]
+
+        :param day:
+            The day in the ISO calendar - range is [1 (MON), 7 (SUN)]
+
+        :return:
+            Returns a :class:`datetime.date`
+        """
+        if not 0 < week < 54:
+            raise ValueError('Invalid week: {}'.format(week))
+
+        if not 0 < day < 8:     # Range is 1-7
+            raise ValueError('Invalid weekday: {}'.format(day))
+
+        # Get week 1 for the specific year:
+        jan_4 = date(year, 1, 4)   # Week 1 always has January 4th in it
+        week_1 = jan_4 - timedelta(days=jan_4.isocalendar()[2] - 1)
+
+        # Now add the specific number of weeks and days to get what we want
+        week_offset = (week - 1) * 7 + (day - 1)
+        return week_1 + timedelta(days=week_offset)
+
+    def _parse_isotime(self, timestr):
+        len_str = len(timestr)
+        components = [0, 0, 0, 0, None]
+        pos = 0
+        comp = -1
+
+        if len_str < 2:
+            raise ValueError('ISO time too short')
+
+        has_sep = False
+
+        while pos < len_str and comp < 5:
+            comp += 1
+
+            if timestr[pos:pos + 1] in b'-+Zz':
+                # Detect time zone boundary
+                components[-1] = self._parse_tzstr(timestr[pos:])
+                pos = len_str
+                break
+
+            if comp == 1 and timestr[pos:pos+1] == self._TIME_SEP:
+                has_sep = True
+                pos += 1
+            elif comp == 2 and has_sep:
+                if timestr[pos:pos+1] != self._TIME_SEP:
+                    raise ValueError('Inconsistent use of colon separator')
+                pos += 1
+
+            if comp < 3:
+                # Hour, minute, second
+                components[comp] = int(timestr[pos:pos + 2])
+                pos += 2
+
+            if comp == 3:
+                # Fraction of a second
+                frac = self._FRACTION_REGEX.match(timestr[pos:])
+                if not frac:
+                    continue
+
+                us_str = frac.group(1)[:6]  # Truncate to microseconds
+                components[comp] = int(us_str) * 10**(6 - len(us_str))
+                pos += len(frac.group())
+
+        if pos < len_str:
+            raise ValueError('Unused components in ISO string')
+
+        if components[0] == 24:
+            # Standard supports 00:00 and 24:00 as representations of midnight
+            if any(component != 0 for component in components[1:4]):
+                raise ValueError('Hour may only be 24 at 24:00:00.000')
+
+        return components
+
+    def _parse_tzstr(self, tzstr, zero_as_utc=True):
+        if tzstr == b'Z' or tzstr == b'z':
+            return tz.UTC
+
+        if len(tzstr) not in {3, 5, 6}:
+            raise ValueError('Time zone offset must be 1, 3, 5 or 6 characters')
+
+        if tzstr[0:1] == b'-':
+            mult = -1
+        elif tzstr[0:1] == b'+':
+            mult = 1
+        else:
+            raise ValueError('Time zone offset requires sign')
+
+        hours = int(tzstr[1:3])
+        if len(tzstr) == 3:
+            minutes = 0
+        else:
+            minutes = int(tzstr[(4 if tzstr[3:4] == self._TIME_SEP else 3):])
+
+        if zero_as_utc and hours == 0 and minutes == 0:
+            return tz.UTC
+        else:
+            if minutes > 59:
+                raise ValueError('Invalid minutes in time zone offset')
+
+            if hours > 23:
+                raise ValueError('Invalid hours in time zone offset')
+
+            return tz.tzoffset(None, mult * (hours * 60 + minutes) * 60)
+
+
+DEFAULT_ISOPARSER = isoparser()
+isoparse = DEFAULT_ISOPARSER.isoparse
diff --git a/dateutil/tz/__init__.py b/dateutil/tz/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..af1352c47292f4eebc5cae8da45641b5544558e3
--- /dev/null
+++ b/dateutil/tz/__init__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+from .tz import *
+from .tz import __doc__
+
+__all__ = ["tzutc", "tzoffset", "tzlocal", "tzfile", "tzrange",
+           "tzstr", "tzical", "tzwin", "tzwinlocal", "gettz",
+           "enfold", "datetime_ambiguous", "datetime_exists",
+           "resolve_imaginary", "UTC", "DeprecatedTzFormatWarning"]
+
+
+class DeprecatedTzFormatWarning(Warning):
+    """Warning raised when time zones are parsed from deprecated formats."""
diff --git a/dateutil/tz/_common.py b/dateutil/tz/_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..e6ac11831522b266114d5b68ee1da298e3aeb14a
--- /dev/null
+++ b/dateutil/tz/_common.py
@@ -0,0 +1,419 @@
+from six import PY2
+
+from functools import wraps
+
+from datetime import datetime, timedelta, tzinfo
+
+
+ZERO = timedelta(0)
+
+__all__ = ['tzname_in_python2', 'enfold']
+
+
+def tzname_in_python2(namefunc):
+    """Change unicode output into bytestrings in Python 2
+
+    tzname() API changed in Python 3. It used to return bytes, but was changed
+    to unicode strings
+    """
+    if PY2:
+        @wraps(namefunc)
+        def adjust_encoding(*args, **kwargs):
+            name = namefunc(*args, **kwargs)
+            if name is not None:
+                name = name.encode()
+
+            return name
+
+        return adjust_encoding
+    else:
+        return namefunc
+
+
+# The following is adapted from Alexander Belopolsky's tz library
+# https://github.com/abalkin/tz
+if hasattr(datetime, 'fold'):
+    # This is the pre-python 3.6 fold situation
+    def enfold(dt, fold=1):
+        """
+        Provides a unified interface for assigning the ``fold`` attribute to
+        datetimes both before and after the implementation of PEP-495.
+
+        :param fold:
+            The value for the ``fold`` attribute in the returned datetime. This
+            should be either 0 or 1.
+
+        :return:
+            Returns an object for which ``getattr(dt, 'fold', 0)`` returns
+            ``fold`` for all versions of Python. In versions prior to
+            Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
+            subclass of :py:class:`datetime.datetime` with the ``fold``
+            attribute added, if ``fold`` is 1.
+
+        .. versionadded:: 2.6.0
+        """
+        return dt.replace(fold=fold)
+
+else:
+    class _DatetimeWithFold(datetime):
+        """
+        This is a class designed to provide a PEP 495-compliant interface for
+        Python versions before 3.6. It is used only for dates in a fold, so
+        the ``fold`` attribute is fixed at ``1``.
+
+        .. versionadded:: 2.6.0
+        """
+        __slots__ = ()
+
+        def replace(self, *args, **kwargs):
+            """
+            Return a datetime with the same attributes, except for those
+            attributes given new values by whichever keyword arguments are
+            specified. Note that tzinfo=None can be specified to create a naive
+            datetime from an aware datetime with no conversion of date and time
+            data.
+
+            This is reimplemented in ``_DatetimeWithFold`` because pypy3 will
+            return a ``datetime.datetime`` even if ``fold`` is unchanged.
+            """
+            argnames = (
+                'year', 'month', 'day', 'hour', 'minute', 'second',
+                'microsecond', 'tzinfo'
+            )
+
+            for arg, argname in zip(args, argnames):
+                if argname in kwargs:
+                    raise TypeError('Duplicate argument: {}'.format(argname))
+
+                kwargs[argname] = arg
+
+            for argname in argnames:
+                if argname not in kwargs:
+                    kwargs[argname] = getattr(self, argname)
+
+            dt_class = self.__class__ if kwargs.get('fold', 1) else datetime
+
+            return dt_class(**kwargs)
+
+        @property
+        def fold(self):
+            return 1
+
+    def enfold(dt, fold=1):
+        """
+        Provides a unified interface for assigning the ``fold`` attribute to
+        datetimes both before and after the implementation of PEP-495.
+
+        :param fold:
+            The value for the ``fold`` attribute in the returned datetime. This
+            should be either 0 or 1.
+
+        :return:
+            Returns an object for which ``getattr(dt, 'fold', 0)`` returns
+            ``fold`` for all versions of Python. In versions prior to
+            Python 3.6, this is a ``_DatetimeWithFold`` object, which is a
+            subclass of :py:class:`datetime.datetime` with the ``fold``
+            attribute added, if ``fold`` is 1.
+
+        .. versionadded:: 2.6.0
+        """
+        if getattr(dt, 'fold', 0) == fold:
+            return dt
+
+        args = dt.timetuple()[:6]
+        args += (dt.microsecond, dt.tzinfo)
+
+        if fold:
+            return _DatetimeWithFold(*args)
+        else:
+            return datetime(*args)
+
+
+def _validate_fromutc_inputs(f):
+    """
+    The CPython version of ``fromutc`` checks that the input is a ``datetime``
+    object and that ``self`` is attached as its ``tzinfo``.
+    """
+    @wraps(f)
+    def fromutc(self, dt):
+        if not isinstance(dt, datetime):
+            raise TypeError("fromutc() requires a datetime argument")
+        if dt.tzinfo is not self:
+            raise ValueError("dt.tzinfo is not self")
+
+        return f(self, dt)
+
+    return fromutc
+
+
+class _tzinfo(tzinfo):
+    """
+    Base class for all ``dateutil`` ``tzinfo`` objects.
+    """
+
+    def is_ambiguous(self, dt):
+        """
+        Whether or not the "wall time" of a given datetime is ambiguous in this
+        zone.
+
+        :param dt:
+            A :py:class:`datetime.datetime`, naive or time zone aware.
+
+
+        :return:
+            Returns ``True`` if ambiguous, ``False`` otherwise.
+
+        .. versionadded:: 2.6.0
+        """
+
+        dt = dt.replace(tzinfo=self)
+
+        wall_0 = enfold(dt, fold=0)
+        wall_1 = enfold(dt, fold=1)
+
+        same_offset = wall_0.utcoffset() == wall_1.utcoffset()
+        same_dt = wall_0.replace(tzinfo=None) == wall_1.replace(tzinfo=None)
+
+        return same_dt and not same_offset
+
+    def _fold_status(self, dt_utc, dt_wall):
+        """
+        Determine the fold status of a "wall" datetime, given a representation
+        of the same datetime as a (naive) UTC datetime. This is calculated based
+        on the assumption that ``dt.utcoffset() - dt.dst()`` is constant for all
+        datetimes, and that this offset is the actual number of hours separating
+        ``dt_utc`` and ``dt_wall``.
+
+        :param dt_utc:
+            Representation of the datetime as UTC
+
+        :param dt_wall:
+            Representation of the datetime as "wall time". This parameter must
+            either have a `fold` attribute or have a fold-naive
+            :class:`datetime.tzinfo` attached, otherwise the calculation may
+            fail.
+        """
+        if self.is_ambiguous(dt_wall):
+            delta_wall = dt_wall - dt_utc
+            _fold = int(delta_wall == (dt_utc.utcoffset() - dt_utc.dst()))
+        else:
+            _fold = 0
+
+        return _fold
+
+    def _fold(self, dt):
+        return getattr(dt, 'fold', 0)
+
+    def _fromutc(self, dt):
+        """
+        Given a timezone-aware datetime in a given timezone, calculates a
+        timezone-aware datetime in a new timezone.
+
+        Since this is the one time that we *know* we have an unambiguous
+        datetime object, we take this opportunity to determine whether the
+        datetime is ambiguous and in a "fold" state (e.g. if it's the first
+        occurrence, chronologically, of the ambiguous datetime).
+
+        :param dt:
+            A timezone-aware :class:`datetime.datetime` object.
+        """
+
+        # Re-implement the algorithm from Python's datetime.py
+        dtoff = dt.utcoffset()
+        if dtoff is None:
+            raise ValueError("fromutc() requires a non-None utcoffset() "
+                             "result")
+
+        # The original datetime.py code assumes that `dst()` defaults to
+        # zero during ambiguous times. PEP 495 inverts this presumption, so
+        # for pre-PEP 495 versions of python, we need to tweak the algorithm.
+        dtdst = dt.dst()
+        if dtdst is None:
+            raise ValueError("fromutc() requires a non-None dst() result")
+        delta = dtoff - dtdst
+
+        dt += delta
+        # Set fold=1 so we can default to being in the fold for
+        # ambiguous dates.
+        dtdst = enfold(dt, fold=1).dst()
+        if dtdst is None:
+            raise ValueError("fromutc(): dt.dst gave inconsistent "
+                             "results; cannot convert")
+        return dt + dtdst
+
+    @_validate_fromutc_inputs
+    def fromutc(self, dt):
+        """
+        Given a timezone-aware datetime in a given timezone, calculates a
+        timezone-aware datetime in a new timezone.
+
+        Since this is the one time that we *know* we have an unambiguous
+        datetime object, we take this opportunity to determine whether the
+        datetime is ambiguous and in a "fold" state (e.g. if it's the first
+        occurrence, chronologically, of the ambiguous datetime).
+
+        :param dt:
+            A timezone-aware :class:`datetime.datetime` object.
+        """
+        dt_wall = self._fromutc(dt)
+
+        # Calculate the fold status given the two datetimes.
+        _fold = self._fold_status(dt, dt_wall)
+
+        # Set the default fold value for ambiguous dates
+        return enfold(dt_wall, fold=_fold)
+
+
+class tzrangebase(_tzinfo):
+    """
+    This is an abstract base class for time zones represented by an annual
+    transition into and out of DST. Child classes should implement the following
+    methods:
+
+        * ``__init__(self, *args, **kwargs)``
+        * ``transitions(self, year)`` - this is expected to return a tuple of
+          datetimes representing the DST on and off transitions in standard
+          time.
+
+    A fully initialized ``tzrangebase`` subclass should also provide the
+    following attributes:
+        * ``hasdst``: Boolean whether or not the zone uses DST.
+        * ``_dst_offset`` / ``_std_offset``: :class:`datetime.timedelta` objects
+          representing the respective UTC offsets.
+        * ``_dst_abbr`` / ``_std_abbr``: Strings representing the timezone short
+          abbreviations in DST and STD, respectively.
+        * ``_hasdst``: Whether or not the zone has DST.
+
+    .. versionadded:: 2.6.0
+    """
+    def __init__(self):
+        raise NotImplementedError('tzrangebase is an abstract base class')
+
+    def utcoffset(self, dt):
+        isdst = self._isdst(dt)
+
+        if isdst is None:
+            return None
+        elif isdst:
+            return self._dst_offset
+        else:
+            return self._std_offset
+
+    def dst(self, dt):
+        isdst = self._isdst(dt)
+
+        if isdst is None:
+            return None
+        elif isdst:
+            return self._dst_base_offset
+        else:
+            return ZERO
+
+    @tzname_in_python2
+    def tzname(self, dt):
+        if self._isdst(dt):
+            return self._dst_abbr
+        else:
+            return self._std_abbr
+
+    def fromutc(self, dt):
+        """ Given a datetime in UTC, return local time """
+        if not isinstance(dt, datetime):
+            raise TypeError("fromutc() requires a datetime argument")
+
+        if dt.tzinfo is not self:
+            raise ValueError("dt.tzinfo is not self")
+
+        # Get transitions - if there are none, fixed offset
+        transitions = self.transitions(dt.year)
+        if transitions is None:
+            return dt + self.utcoffset(dt)
+
+        # Get the transition times in UTC
+        dston, dstoff = transitions
+
+        dston -= self._std_offset
+        dstoff -= self._std_offset
+
+        utc_transitions = (dston, dstoff)
+        dt_utc = dt.replace(tzinfo=None)
+
+        isdst = self._naive_isdst(dt_utc, utc_transitions)
+
+        if isdst:
+            dt_wall = dt + self._dst_offset
+        else:
+            dt_wall = dt + self._std_offset
+
+        _fold = int(not isdst and self.is_ambiguous(dt_wall))
+
+        return enfold(dt_wall, fold=_fold)
+
+    def is_ambiguous(self, dt):
+        """
+        Whether or not the "wall time" of a given datetime is ambiguous in this
+        zone.
+
+        :param dt:
+            A :py:class:`datetime.datetime`, naive or time zone aware.
+
+
+        :return:
+            Returns ``True`` if ambiguous, ``False`` otherwise.
+
+        .. versionadded:: 2.6.0
+        """
+        if not self.hasdst:
+            return False
+
+        start, end = self.transitions(dt.year)
+
+        dt = dt.replace(tzinfo=None)
+        return (end <= dt < end + self._dst_base_offset)
+
+    def _isdst(self, dt):
+        if not self.hasdst:
+            return False
+        elif dt is None:
+            return None
+
+        transitions = self.transitions(dt.year)
+
+        if transitions is None:
+            return False
+
+        dt = dt.replace(tzinfo=None)
+
+        isdst = self._naive_isdst(dt, transitions)
+
+        # Handle ambiguous dates
+        if not isdst and self.is_ambiguous(dt):
+            return not self._fold(dt)
+        else:
+            return isdst
+
+    def _naive_isdst(self, dt, transitions):
+        dston, dstoff = transitions
+
+        dt = dt.replace(tzinfo=None)
+
+        if dston < dstoff:
+            isdst = dston <= dt < dstoff
+        else:
+            isdst = not dstoff <= dt < dston
+
+        return isdst
+
+    @property
+    def _dst_base_offset(self):
+        return self._dst_offset - self._std_offset
+
+    __hash__ = None
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __repr__(self):
+        return "%s(...)" % self.__class__.__name__
+
+    __reduce__ = object.__reduce__
diff --git a/dateutil/tz/_factories.py b/dateutil/tz/_factories.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8a65891a023ebf9eb0c24d391ba67541b7133f1
--- /dev/null
+++ b/dateutil/tz/_factories.py
@@ -0,0 +1,80 @@
+from datetime import timedelta
+import weakref
+from collections import OrderedDict
+
+from six.moves import _thread
+
+
+class _TzSingleton(type):
+    def __init__(cls, *args, **kwargs):
+        cls.__instance = None
+        super(_TzSingleton, cls).__init__(*args, **kwargs)
+
+    def __call__(cls):
+        if cls.__instance is None:
+            cls.__instance = super(_TzSingleton, cls).__call__()
+        return cls.__instance
+
+
+class _TzFactory(type):
+    def instance(cls, *args, **kwargs):
+        """Alternate constructor that returns a fresh instance"""
+        return type.__call__(cls, *args, **kwargs)
+
+
+class _TzOffsetFactory(_TzFactory):
+    def __init__(cls, *args, **kwargs):
+        cls.__instances = weakref.WeakValueDictionary()
+        cls.__strong_cache = OrderedDict()
+        cls.__strong_cache_size = 8
+
+        cls._cache_lock = _thread.allocate_lock()
+
+    def __call__(cls, name, offset):
+        if isinstance(offset, timedelta):
+            key = (name, offset.total_seconds())
+        else:
+            key = (name, offset)
+
+        instance = cls.__instances.get(key, None)
+        if instance is None:
+            instance = cls.__instances.setdefault(key,
+                                                  cls.instance(name, offset))
+
+        # This lock may not be necessary in Python 3. See GH issue #901
+        with cls._cache_lock:
+            cls.__strong_cache[key] = cls.__strong_cache.pop(key, instance)
+
+            # Remove an item if the strong cache is overpopulated
+            if len(cls.__strong_cache) > cls.__strong_cache_size:
+                cls.__strong_cache.popitem(last=False)
+
+        return instance
+
+
+class _TzStrFactory(_TzFactory):
+    def __init__(cls, *args, **kwargs):
+        cls.__instances = weakref.WeakValueDictionary()
+        cls.__strong_cache = OrderedDict()
+        cls.__strong_cache_size = 8
+
+        cls.__cache_lock = _thread.allocate_lock()
+
+    def __call__(cls, s, posix_offset=False):
+        key = (s, posix_offset)
+        instance = cls.__instances.get(key, None)
+
+        if instance is None:
+            instance = cls.__instances.setdefault(key,
+                cls.instance(s, posix_offset))
+
+        # This lock may not be necessary in Python 3. See GH issue #901
+        with cls.__cache_lock:
+            cls.__strong_cache[key] = cls.__strong_cache.pop(key, instance)
+
+            # Remove an item if the strong cache is overpopulated
+            if len(cls.__strong_cache) > cls.__strong_cache_size:
+                cls.__strong_cache.popitem(last=False)
+
+        return instance
+
diff --git a/dateutil/tz/tz.py b/dateutil/tz/tz.py
new file mode 100644
index 0000000000000000000000000000000000000000..617591446bd92eb1cc7b7d67fa3f17435e691cdd
--- /dev/null
+++ b/dateutil/tz/tz.py
@@ -0,0 +1,1849 @@
+# -*- coding: utf-8 -*-
+"""
+This module offers timezone implementations subclassing the abstract
+:py:class:`datetime.tzinfo` type. There are classes to handle tzfile format
+files (usually are in :file:`/etc/localtime`, :file:`/usr/share/zoneinfo`,
+etc), TZ environment string (in all known formats), given ranges (with help
+from relative deltas), local machine timezone, fixed offset timezone, and UTC
+timezone.
+"""
+import datetime
+import struct
+import time
+import sys
+import os
+import bisect
+import weakref
+from collections import OrderedDict
+
+import six
+from six import string_types
+from six.moves import _thread
+from ._common import tzname_in_python2, _tzinfo
+from ._common import tzrangebase, enfold
+from ._common import _validate_fromutc_inputs
+
+from ._factories import _TzSingleton, _TzOffsetFactory
+from ._factories import _TzStrFactory
+try:
+    from .win import tzwin, tzwinlocal
+except ImportError:
+    tzwin = tzwinlocal = None
+
+# For warning about rounding tzinfo
+from warnings import warn
+
+ZERO = datetime.timedelta(0)
+EPOCH = datetime.datetime(1970, 1, 1, 0, 0)
+EPOCHORDINAL = EPOCH.toordinal()
+
+
+@six.add_metaclass(_TzSingleton)
+class tzutc(datetime.tzinfo):
+    """
+    This is a tzinfo object that represents the UTC time zone.
+
+    **Examples:**
+
+    .. doctest::
+
+        >>> from datetime import *
+        >>> from dateutil.tz import *
+
+        >>> datetime.now()
+        datetime.datetime(2003, 9, 27, 9, 40, 1, 521290)
+
+        >>> datetime.now(tzutc())
+        datetime.datetime(2003, 9, 27, 12, 40, 12, 156379, tzinfo=tzutc())
+
+        >>> datetime.now(tzutc()).tzname()
+        'UTC'
+
+    .. versionchanged:: 2.7.0
+        ``tzutc()`` is now a singleton, so the result of ``tzutc()`` will
+        always return the same object.
+
+        .. doctest::
+
+            >>> from dateutil.tz import tzutc, UTC
+            >>> tzutc() is tzutc()
+            True
+            >>> tzutc() is UTC
+            True
+    """
+    def utcoffset(self, dt):
+        return ZERO
+
+    def dst(self, dt):
+        return ZERO
+
+    @tzname_in_python2
+    def tzname(self, dt):
+        return "UTC"
+
+    def is_ambiguous(self, dt):
+        """
+        Whether or not the "wall time" of a given datetime is ambiguous in this
+        zone.
+
+        :param dt:
+            A :py:class:`datetime.datetime`, naive or time zone aware.
+
+
+        :return:
+            Returns ``True`` if ambiguous, ``False`` otherwise.
+
+        .. versionadded:: 2.6.0
+        """
+        return False
+
+    @_validate_fromutc_inputs
+    def fromutc(self, dt):
+        """
+        Fast track version of fromutc() returns the original ``dt`` object for
+        any valid :py:class:`datetime.datetime` object.
+        """
+        return dt
+
+    def __eq__(self, other):
+        if not isinstance(other, (tzutc, tzoffset)):
+            return NotImplemented
+
+        return (isinstance(other, tzutc) or
+                (isinstance(other, tzoffset) and other._offset == ZERO))
+
+    __hash__ = None
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __repr__(self):
+        return "%s()" % self.__class__.__name__
+
+    __reduce__ = object.__reduce__
+
+
+#: Convenience constant providing a :class:`tzutc()` instance
+#:
+#: .. versionadded:: 2.7.0
+UTC = tzutc()
+
+
+@six.add_metaclass(_TzOffsetFactory)
+class tzoffset(datetime.tzinfo):
+    """
+    A simple class for representing a fixed offset from UTC.
+
+    :param name:
+        The timezone name, to be returned when ``tzname()`` is called.
+    :param offset:
+        The time zone offset in seconds, or (since version 2.6.0, represented
+        as a :py:class:`datetime.timedelta` object).
+    """
+    def __init__(self, name, offset):
+        self._name = name
+
+        try:
+            # Allow a timedelta
+            offset = offset.total_seconds()
+        except (TypeError, AttributeError):
+            pass
+
+        self._offset = datetime.timedelta(seconds=_get_supported_offset(offset))
+
+    def utcoffset(self, dt):
+        return self._offset
+
+    def dst(self, dt):
+        return ZERO
+
+    @tzname_in_python2
+    def tzname(self, dt):
+        return self._name
+
+    @_validate_fromutc_inputs
+    def fromutc(self, dt):
+        return dt + self._offset
+
+    def is_ambiguous(self, dt):
+        """
+        Whether or not the "wall time" of a given datetime is ambiguous in this
+        zone.
+
+        :param dt:
+            A :py:class:`datetime.datetime`, naive or time zone aware.
+        :return:
+            Returns ``True`` if ambiguous, ``False`` otherwise.
+
+        .. versionadded:: 2.6.0
+        """
+        return False
+
+    def __eq__(self, other):
+        if not isinstance(other, tzoffset):
+            return NotImplemented
+
+        return self._offset == other._offset
+
+    __hash__ = None
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __repr__(self):
+        return "%s(%s, %s)" % (self.__class__.__name__,
+                               repr(self._name),
+                               int(self._offset.total_seconds()))
+
+    __reduce__ = object.__reduce__
+
+
+class tzlocal(_tzinfo):
+    """
+    A :class:`tzinfo` subclass built around the ``time`` timezone functions.
+    """
+    def __init__(self):
+        super(tzlocal, self).__init__()
+
+        self._std_offset = datetime.timedelta(seconds=-time.timezone)
+        if time.daylight:
+            self._dst_offset = datetime.timedelta(seconds=-time.altzone)
+        else:
+            self._dst_offset = self._std_offset
+
+        self._dst_saved = self._dst_offset - self._std_offset
+        self._hasdst = bool(self._dst_saved)
+        self._tznames = tuple(time.tzname)
+
+    def utcoffset(self, dt):
+        if dt is None and self._hasdst:
+            return None
+
+        if self._isdst(dt):
+            return self._dst_offset
+        else:
+            return self._std_offset
+
+    def dst(self, dt):
+        if dt is None and self._hasdst:
+            return None
+
+        if self._isdst(dt):
+            return self._dst_offset - self._std_offset
+        else:
+            return ZERO
+
+    @tzname_in_python2
+    def tzname(self, dt):
+        return self._tznames[self._isdst(dt)]
+
+    def is_ambiguous(self, dt):
+        """
+        Whether or not the "wall time" of a given datetime is ambiguous in this
+        zone.
+
+        :param dt:
+            A :py:class:`datetime.datetime`, naive or time zone aware.
+
+
+        :return:
+            Returns ``True`` if ambiguous, ``False`` otherwise.
+
+        .. versionadded:: 2.6.0
+        """
+        naive_dst = self._naive_is_dst(dt)
+        return (not naive_dst and
+                (naive_dst != self._naive_is_dst(dt - self._dst_saved)))
+
+    def _naive_is_dst(self, dt):
+        timestamp = _datetime_to_timestamp(dt)
+        return time.localtime(timestamp + time.timezone).tm_isdst
+
+    def _isdst(self, dt, fold_naive=True):
+        # We can't use mktime here. It is unstable when deciding if
+        # the hour near to a change is DST or not.
+        #
+        # timestamp = time.mktime((dt.year, dt.month, dt.day, dt.hour,
+        #                         dt.minute, dt.second, dt.weekday(), 0, -1))
+        # return time.localtime(timestamp).tm_isdst
+        #
+        # The code above yields the following result:
+        #
+        # >>> import tz, datetime
+        # >>> t = tz.tzlocal()
+        # >>> datetime.datetime(2003,2,15,23,tzinfo=t).tzname()
+        # 'BRDT'
+        # >>> datetime.datetime(2003,2,16,0,tzinfo=t).tzname()
+        # 'BRST'
+        # >>> datetime.datetime(2003,2,15,23,tzinfo=t).tzname()
+        # 'BRST'
+        # >>> datetime.datetime(2003,2,15,22,tzinfo=t).tzname()
+        # 'BRDT'
+        # >>> datetime.datetime(2003,2,15,23,tzinfo=t).tzname()
+        # 'BRDT'
+        #
+        # Here is a more stable implementation:
+        #
+        if not self._hasdst:
+            return False
+
+        # Check for ambiguous times:
+        dstval = self._naive_is_dst(dt)
+        fold = getattr(dt, 'fold', None)
+
+        if self.is_ambiguous(dt):
+            if fold is not None:
+                return not self._fold(dt)
+            else:
+                return True
+
+        return dstval
+
+    def __eq__(self, other):
+        if isinstance(other, tzlocal):
+            return (self._std_offset == other._std_offset and
+                    self._dst_offset == other._dst_offset)
+        elif isinstance(other, tzutc):
+            return (not self._hasdst and
+                    self._tznames[0] in {'UTC', 'GMT'} and
+                    self._std_offset == ZERO)
+        elif isinstance(other, tzoffset):
+            return (not self._hasdst and
+                    self._tznames[0] == other._name and
+                    self._std_offset == other._offset)
+        else:
+            return NotImplemented
+
+    __hash__ = None
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __repr__(self):
+        return "%s()" % self.__class__.__name__
+
+    __reduce__ = object.__reduce__
+
+
+class _ttinfo(object):
+    __slots__ = ["offset", "delta", "isdst", "abbr",
+                 "isstd", "isgmt", "dstoffset"]
+
+    def __init__(self):
+        for attr in self.__slots__:
+            setattr(self, attr, None)
+
+    def __repr__(self):
+        l = []
+        for attr in self.__slots__:
+            value = getattr(self, attr)
+            if value is not None:
+                l.append("%s=%s" % (attr, repr(value)))
+        return "%s(%s)" % (self.__class__.__name__, ", ".join(l))
+
+    def __eq__(self, other):
+        if not isinstance(other, _ttinfo):
+            return NotImplemented
+
+        return (self.offset == other.offset and
+                self.delta == other.delta and
+                self.isdst == other.isdst and
+                self.abbr == other.abbr and
+                self.isstd == other.isstd and
+                self.isgmt == other.isgmt and
+                self.dstoffset == other.dstoffset)
+
+    __hash__ = None
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __getstate__(self):
+        state = {}
+        for name in self.__slots__:
+            state[name] = getattr(self, name, None)
+        return state
+
+    def __setstate__(self, state):
+        for name in self.__slots__:
+            if name in state:
+                setattr(self, name, state[name])
+
+
+class _tzfile(object):
+    """
+    Lightweight class for holding the relevant transition and time zone
+    information read from binary tzfiles.
+    """
+    attrs = ['trans_list', 'trans_list_utc', 'trans_idx', 'ttinfo_list',
+             'ttinfo_std', 'ttinfo_dst', 'ttinfo_before', 'ttinfo_first']
+
+    def __init__(self, **kwargs):
+        for attr in self.attrs:
+            setattr(self, attr, kwargs.get(attr, None))
+
+
+class tzfile(_tzinfo):
+    """
+    This is a ``tzinfo`` subclass that allows one to use the ``tzfile(5)``
+    format timezone files to extract current and historical zone information.
+
+    :param fileobj:
+        This can be an opened file stream or a file name that the time zone
+        information can be read from.
+
+    :param filename:
+        This is an optional parameter specifying the source of the time zone
+        information in the event that ``fileobj`` is a file object. If omitted
+        and ``fileobj`` is a file stream, this parameter will be set either to
+        ``fileobj``'s ``name`` attribute or to ``repr(fileobj)``.
+
+    See `Sources for Time Zone and Daylight Saving Time Data
+    <https://data.iana.org/time-zones/tz-link.html>`_ for more information.
+    Time zone files can be compiled from the `IANA Time Zone database files
+    <https://www.iana.org/time-zones>`_ with the `zic time zone compiler
+    <https://www.freebsd.org/cgi/man.cgi?query=zic&sektion=8>`_
+
+    .. note::
+
+        Only construct a ``tzfile`` directly if you have a specific timezone
+        file on disk that you want to read into a Python ``tzinfo`` object.
+        If you want to get a ``tzfile`` representing a specific IANA zone,
+        (e.g. ``'America/New_York'``), you should call
+        :func:`dateutil.tz.gettz` with the zone identifier.
+
+
+    **Examples:**
+
+    Using the US Eastern time zone as an example, we can see that a ``tzfile``
+    provides time zone information for the standard Daylight Saving offsets:
+
+    .. testsetup:: tzfile
+
+        from dateutil.tz import gettz
+        from datetime import datetime
+
+    .. doctest:: tzfile
+
+        >>> NYC = gettz('America/New_York')
+        >>> NYC
+        tzfile('/usr/share/zoneinfo/America/New_York')
+
+        >>> print(datetime(2016, 1, 3, tzinfo=NYC))     # EST
+        2016-01-03 00:00:00-05:00
+
+        >>> print(datetime(2016, 7, 7, tzinfo=NYC))     # EDT
+        2016-07-07 00:00:00-04:00
+
+
+    The ``tzfile`` structure contains a fully history of the time zone,
+    so historical dates will also have the right offsets. For example, before
+    the adoption of the UTC standards, New York used local solar  mean time:
+
+    .. doctest:: tzfile
+
+       >>> print(datetime(1901, 4, 12, tzinfo=NYC))    # LMT
+       1901-04-12 00:00:00-04:56
+
+    And during World War II, New York was on "Eastern War Time", which was a
+    state of permanent daylight saving time:
+
+    .. doctest:: tzfile
+
+        >>> print(datetime(1944, 2, 7, tzinfo=NYC))    # EWT
+        1944-02-07 00:00:00-04:00
+
+    """
+
+    def __init__(self, fileobj, filename=None):
+        super(tzfile, self).__init__()
+
+        file_opened_here = False
+        if isinstance(fileobj, string_types):
+            self._filename = fileobj
+            fileobj = open(fileobj, 'rb')
+            file_opened_here = True
+        elif filename is not None:
+            self._filename = filename
+        elif hasattr(fileobj, "name"):
+            self._filename = fileobj.name
+        else:
+            self._filename = repr(fileobj)
+
+        if fileobj is not None:
+            if not file_opened_here:
+                fileobj = _nullcontext(fileobj)
+
+            with fileobj as file_stream:
+                tzobj = self._read_tzfile(file_stream)
+
+            self._set_tzdata(tzobj)
+
+    def _set_tzdata(self, tzobj):
+        """ Set the time zone data of this object from a _tzfile object """
+        # Copy the relevant attributes over as private attributes
+        for attr in _tzfile.attrs:
+            setattr(self, '_' + attr, getattr(tzobj, attr))
+
+    def _read_tzfile(self, fileobj):
+        out = _tzfile()
+
+        # From tzfile(5):
+        #
+        # The time zone information files used by tzset(3)
+        # begin with the magic characters "TZif" to identify
+        # them as time zone information files, followed by
+        # sixteen bytes reserved for future use, followed by
+        # six four-byte values of type long, written in a
+        # ``standard'' byte order (the high-order  byte
+        # of the value is written first).
+        if fileobj.read(4).decode() != "TZif":
+            raise ValueError("magic not found")
+
+        fileobj.read(16)
+
+        (
+            # The number of UTC/local indicators stored in the file.
+            ttisgmtcnt,
+
+            # The number of standard/wall indicators stored in the file.
+            ttisstdcnt,
+
+            # The number of leap seconds for which data is
+            # stored in the file.
+            leapcnt,
+
+            # The number of "transition times" for which data
+            # is stored in the file.
+            timecnt,
+
+            # The number of "local time types" for which data
+            # is stored in the file (must not be zero).
+            typecnt,
+
+            # The  number  of  characters  of "time zone
+            # abbreviation strings" stored in the file.
+            charcnt,
+
+        ) = struct.unpack(">6l", fileobj.read(24))
+
+        # The above header is followed by tzh_timecnt four-byte
+        # values  of  type long,  sorted  in ascending order.
+        # These values are written in ``standard'' byte order.
+        # Each is used as a transition time (as  returned  by
+        # time(2)) at which the rules for computing local time
+        # change.
+
+        if timecnt:
+            out.trans_list_utc = list(struct.unpack(">%dl" % timecnt,
+                                                    fileobj.read(timecnt*4)))
+        else:
+            out.trans_list_utc = []
+
+        # Next come tzh_timecnt one-byte values of type unsigned
+        # char; each one tells which of the different types of
+        # ``local time'' types described in the file is associated
+        # with the same-indexed transition time. These values
+        # serve as indices into an array of ttinfo structures that
+        # appears next in the file.
+
+        if timecnt:
+            out.trans_idx = struct.unpack(">%dB" % timecnt,
+                                          fileobj.read(timecnt))
+        else:
+            out.trans_idx = []
+
+        # Each ttinfo structure is written as a four-byte value
+        # for tt_gmtoff  of  type long,  in  a  standard  byte
+        # order, followed  by a one-byte value for tt_isdst
+        # and a one-byte  value  for  tt_abbrind.   In  each
+        # structure, tt_gmtoff  gives  the  number  of
+        # seconds to be added to UTC, tt_isdst tells whether
+        # tm_isdst should be set by  localtime(3),  and
+        # tt_abbrind serves  as an index into the array of
+        # time zone abbreviation characters that follow the
+        # ttinfo structure(s) in the file.
+
+        ttinfo = []
+
+        for i in range(typecnt):
+            ttinfo.append(struct.unpack(">lbb", fileobj.read(6)))
+
+        abbr = fileobj.read(charcnt).decode()
+
+        # Then there are tzh_leapcnt pairs of four-byte
+        # values, written in  standard byte  order;  the
+        # first  value  of  each pair gives the time (as
+        # returned by time(2)) at which a leap second
+        # occurs;  the  second  gives the  total  number of
+        # leap seconds to be applied after the given time.
+        # The pairs of values are sorted in ascending order
+        # by time.
+
+        # Not used, for now (but seek for correct file position)
+        if leapcnt:
+            fileobj.seek(leapcnt * 8, os.SEEK_CUR)
+
+        # Then there are tzh_ttisstdcnt standard/wall
+        # indicators, each stored as a one-byte value;
+        # they tell whether the transition times associated
+        # with local time types were specified as standard
+        # time or wall clock time, and are used when
+        # a time zone file is used in handling POSIX-style
+        # time zone environment variables.
+
+        if ttisstdcnt:
+            isstd = struct.unpack(">%db" % ttisstdcnt,
+                                  fileobj.read(ttisstdcnt))
+
+        # Finally, there are tzh_ttisgmtcnt UTC/local
+        # indicators, each stored as a one-byte value;
+        # they tell whether the transition times associated
+        # with local time types were specified as UTC or
+        # local time, and are used when a time zone file
+        # is used in handling POSIX-style time zone envi-
+        # ronment variables.
+
+        if ttisgmtcnt:
+            isgmt = struct.unpack(">%db" % ttisgmtcnt,
+                                  fileobj.read(ttisgmtcnt))
+
+        # Build ttinfo list
+        out.ttinfo_list = []
+        for i in range(typecnt):
+            gmtoff, isdst, abbrind = ttinfo[i]
+            gmtoff = _get_supported_offset(gmtoff)
+            tti = _ttinfo()
+            tti.offset = gmtoff
+            tti.dstoffset = datetime.timedelta(0)
+            tti.delta = datetime.timedelta(seconds=gmtoff)
+            tti.isdst = isdst
+            tti.abbr = abbr[abbrind:abbr.find('\x00', abbrind)]
+            tti.isstd = (ttisstdcnt > i and isstd[i] != 0)
+            tti.isgmt = (ttisgmtcnt > i and isgmt[i] != 0)
+            out.ttinfo_list.append(tti)
+
+        # Replace ttinfo indexes for ttinfo objects.
+        out.trans_idx = [out.ttinfo_list[idx] for idx in out.trans_idx]
+
+        # Set standard, dst, and before ttinfos. before will be
+        # used when a given time is before any transitions,
+        # and will be set to the first non-dst ttinfo, or to
+        # the first dst, if all of them are dst.
+        out.ttinfo_std = None
+        out.ttinfo_dst = None
+        out.ttinfo_before = None
+        if out.ttinfo_list:
+            if not out.trans_list_utc:
+                out.ttinfo_std = out.ttinfo_first = out.ttinfo_list[0]
+            else:
+                for i in range(timecnt-1, -1, -1):
+                    tti = out.trans_idx[i]
+                    if not out.ttinfo_std and not tti.isdst:
+                        out.ttinfo_std = tti
+                    elif not out.ttinfo_dst and tti.isdst:
+                        out.ttinfo_dst = tti
+
+                    if out.ttinfo_std and out.ttinfo_dst:
+                        break
+                else:
+                    if out.ttinfo_dst and not out.ttinfo_std:
+                        out.ttinfo_std = out.ttinfo_dst
+
+                for tti in out.ttinfo_list:
+                    if not tti.isdst:
+                        out.ttinfo_before = tti
+                        break
+                else:
+                    out.ttinfo_before = out.ttinfo_list[0]
+
+        # Now fix transition times to become relative to wall time.
+        #
+        # I'm not sure about this. In my tests, the tz source file
+        # is setup to wall time, and in the binary file isstd and
+        # isgmt are off, so it should be in wall time. OTOH, it's
+        # always in gmt time. Let me know if you have comments
+        # about this.
+        lastdst = None
+        lastoffset = None
+        lastdstoffset = None
+        lastbaseoffset = None
+        out.trans_list = []
+
+        for i, tti in enumerate(out.trans_idx):
+            offset = tti.offset
+            dstoffset = 0
+
+            if lastdst is not None:
+                if tti.isdst:
+                    if not lastdst:
+                        dstoffset = offset - lastoffset
+
+                    if not dstoffset and lastdstoffset:
+                        dstoffset = lastdstoffset
+
+                    tti.dstoffset = datetime.timedelta(seconds=dstoffset)
+                    lastdstoffset = dstoffset
+
+            # If a time zone changes its base offset during a DST transition,
+            # then you need to adjust by the previous base offset to get the
+            # transition time in local time. Otherwise you use the current
+            # base offset. Ideally, I would have some mathematical proof of
+            # why this is true, but I haven't really thought about it enough.
+            baseoffset = offset - dstoffset
+            adjustment = baseoffset
+            if (lastbaseoffset is not None and baseoffset != lastbaseoffset
+                    and tti.isdst != lastdst):
+                # The base DST has changed
+                adjustment = lastbaseoffset
+
+            lastdst = tti.isdst
+            lastoffset = offset
+            lastbaseoffset = baseoffset
+
+            out.trans_list.append(out.trans_list_utc[i] + adjustment)
+
+        out.trans_idx = tuple(out.trans_idx)
+        out.trans_list = tuple(out.trans_list)
+        out.trans_list_utc = tuple(out.trans_list_utc)
+
+        return out
+
+    def _find_last_transition(self, dt, in_utc=False):
+        # If there's no list, there are no transitions to find
+        if not self._trans_list:
+            return None
+
+        timestamp = _datetime_to_timestamp(dt)
+
+        # Find where the timestamp fits in the transition list - if the
+        # timestamp is a transition time, it's part of the "after" period.
+        trans_list = self._trans_list_utc if in_utc else self._trans_list
+        idx = bisect.bisect_right(trans_list, timestamp)
+
+        # We want to know when the previous transition was, so subtract off 1
+        return idx - 1
+
+    def _get_ttinfo(self, idx):
+        # For no list or after the last transition, default to _ttinfo_std
+        if idx is None or (idx + 1) >= len(self._trans_list):
+            return self._ttinfo_std
+
+        # If there is a list and the time is before it, return _ttinfo_before
+        if idx < 0:
+            return self._ttinfo_before
+
+        return self._trans_idx[idx]
+
+    def _find_ttinfo(self, dt):
+        idx = self._resolve_ambiguous_time(dt)
+
+        return self._get_ttinfo(idx)
+
+    def fromutc(self, dt):
+        """
+        The ``tzfile`` implementation of :py:func:`datetime.tzinfo.fromutc`.
+
+        :param dt:
+            A :py:class:`datetime.datetime` object.
+
+        :raises TypeError:
+            Raised if ``dt`` is not a :py:class:`datetime.datetime` object.
+
+        :raises ValueError:
+            Raised if this is called with a ``dt`` which does not have this
+            ``tzinfo`` attached.
+
+        :return:
+            Returns a :py:class:`datetime.datetime` object representing the
+            wall time in ``self``'s time zone.
+        """
+        # These isinstance checks are in datetime.tzinfo, so we'll preserve
+        # them, even if we don't care about duck typing.
+        if not isinstance(dt, datetime.datetime):
+            raise TypeError("fromutc() requires a datetime argument")
+
+        if dt.tzinfo is not self:
+            raise ValueError("dt.tzinfo is not self")
+
+        # First treat UTC as wall time and get the transition we're in.
+        idx = self._find_last_transition(dt, in_utc=True)
+        tti = self._get_ttinfo(idx)
+
+        dt_out = dt + datetime.timedelta(seconds=tti.offset)
+
+        fold = self.is_ambiguous(dt_out, idx=idx)
+
+        return enfold(dt_out, fold=int(fold))
+
+    def is_ambiguous(self, dt, idx=None):
+        """
+        Whether or not the "wall time" of a given datetime is ambiguous in this
+        zone.
+
+        :param dt:
+            A :py:class:`datetime.datetime`, naive or time zone aware.
+
+
+        :return:
+            Returns ``True`` if ambiguous, ``False`` otherwise.
+
+        .. versionadded:: 2.6.0
+        """
+        if idx is None:
+            idx = self._find_last_transition(dt)
+
+        # Calculate the difference in offsets from current to previous
+        timestamp = _datetime_to_timestamp(dt)
+        tti = self._get_ttinfo(idx)
+
+        if idx is None or idx <= 0:
+            return False
+
+        od = self._get_ttinfo(idx - 1).offset - tti.offset
+        tt = self._trans_list[idx]          # Transition time
+
+        return timestamp < tt + od
+
+    def _resolve_ambiguous_time(self, dt):
+        idx = self._find_last_transition(dt)
+
+        # If we have no transitions, return the index
+        _fold = self._fold(dt)
+        if idx is None or idx == 0:
+            return idx
+
+        # If it's ambiguous and we're in a fold, shift to a different index.
+        idx_offset = int(not _fold and self.is_ambiguous(dt, idx))
+
+        return idx - idx_offset
+
+    def utcoffset(self, dt):
+        if dt is None:
+            return None
+
+        if not self._ttinfo_std:
+            return ZERO
+
+        return self._find_ttinfo(dt).delta
+
+    def dst(self, dt):
+        if dt is None:
+            return None
+
+        if not self._ttinfo_dst:
+            return ZERO
+
+        tti = self._find_ttinfo(dt)
+
+        if not tti.isdst:
+            return ZERO
+
+        # The documentation says that utcoffset()-dst() must
+        # be constant for every dt.
+        return tti.dstoffset
+
+    @tzname_in_python2
+    def tzname(self, dt):
+        if not self._ttinfo_std or dt is None:
+            return None
+        return self._find_ttinfo(dt).abbr
+
+    def __eq__(self, other):
+        if not isinstance(other, tzfile):
+            return NotImplemented
+        return (self._trans_list == other._trans_list and
+                self._trans_idx == other._trans_idx and
+                self._ttinfo_list == other._ttinfo_list)
+
+    __hash__ = None
+
+    def __ne__(self, other):
+        return not (self == other)
+
+    def __repr__(self):
+        return "%s(%s)" % (self.__class__.__name__, repr(self._filename))
+
+    def __reduce__(self):
+        return self.__reduce_ex__(None)
+
+    def __reduce_ex__(self, protocol):
+        return (self.__class__, (None, self._filename), self.__dict__)
+
+
+class tzrange(tzrangebase):
+    """
+    The ``tzrange`` object is a time zone specified by a set of offsets and
+    abbreviations, equivalent to the way the ``TZ`` variable can be specified
+    in POSIX-like systems, but using Python delta objects to specify DST
+    start, end and offsets.
+
+    :param stdabbr:
+        The abbreviation for standard time (e.g. ``'EST'``).
+
+    :param stdoffset:
+        An integer or :class:`datetime.timedelta` object or equivalent
+        specifying the base offset from UTC.
+
+        If unspecified, +00:00 is used.
+
+    :param dstabbr:
+        The abbreviation for DST / "Summer" time (e.g. ``'EDT'``).
+
+        If specified, with no other DST information, DST is assumed to occur
+        and the default behavior or ``dstoffset``, ``start`` and ``end`` is
+        used. If unspecified and no other DST information is specified, it
+        is assumed that this zone has no DST.
+
+        If this is unspecified and other DST information is *is* specified,
+        DST occurs in the zone but the time zone abbreviation is left
+        unchanged.
+
+    :param dstoffset:
+        A an integer or :class:`datetime.timedelta` object or equivalent
+        specifying the UTC offset during DST. If unspecified and any other DST
+        information is specified, it is assumed to be the STD offset +1 hour.
+
+    :param start:
+        A :class:`relativedelta.relativedelta` object or equivalent specifying
+        the time and time of year that daylight savings time starts. To
+        specify, for example, that DST starts at 2AM on the 2nd Sunday in
+        March, pass:
+
+            ``relativedelta(hours=2, month=3, day=1, weekday=SU(+2))``
+
+        If unspecified and any other DST information is specified, the default
+        value is 2 AM on the first Sunday in April.
+
+    :param end:
+        A :class:`relativedelta.relativedelta` object or equivalent
+        representing the time and time of year that daylight savings time
+        ends, with the same specification method as in ``start``. One note is
+        that this should point to the first time in the *standard* zone, so if
+        a transition occurs at 2AM in the DST zone and the clocks are set back
+        1 hour to 1AM, set the ``hours`` parameter to +1.
+
+
+    **Examples:**
+
+    .. testsetup:: tzrange
+
+        from dateutil.tz import tzrange, tzstr
+
+    .. doctest:: tzrange
+
+        >>> tzstr('EST5EDT') == tzrange("EST", -18000, "EDT")
+        True
+
+        >>> from dateutil.relativedelta import *
+        >>> range1 = tzrange("EST", -18000, "EDT")
+        >>> range2 = tzrange("EST", -18000, "EDT", -14400,
+        ...                  relativedelta(hours=+2, month=4, day=1,
+        ...                                weekday=SU(+1)),
+        ...                  relativedelta(hours=+1, month=10, day=31,
+        ...                                weekday=SU(-1)))
+        >>> tzstr('EST5EDT') == range1 == range2
+        True
+
+    """
+    def __init__(self, stdabbr, stdoffset=None,
+                 dstabbr=None, dstoffset=None,
+                 start=None, end=None):
+
+        global relativedelta
+        from dateutil import relativedelta
+
+        self._std_abbr = stdabbr
+        self._dst_abbr = dstabbr
+
+        try:
+            stdoffset = stdoffset.total_seconds()
+        except (TypeError, AttributeError):
+            pass
+
+        try:
+            dstoffset = dstoffset.total_seconds()
+        except (TypeError, AttributeError):
+            pass
+
+        if stdoffset is not None:
+            self._std_offset = datetime.timedelta(seconds=stdoffset)
+        else:
+            self._std_offset = ZERO
+
+        if dstoffset is not None:
+            self._dst_offset = datetime.timedelta(seconds=dstoffset)
+        elif dstabbr and stdoffset is not None:
+            self._dst_offset = self._std_offset + datetime.timedelta(hours=+1)
+        else:
+            self._dst_offset = ZERO
+
+        if dstabbr and start is None:
+            self._start_delta = relativedelta.relativedelta(
+                hours=+2, month=4, day=1, weekday=relativedelta.SU(+1))
+        else:
+            self._start_delta = start
+
+        if dstabbr and end is None:
+            self._end_delta = relativedelta.relativedelta(
+                hours=+1, month=10, day=31, weekday=relativedelta.SU(-1))
+        else:
+            self._end_delta = end
+
+        self._dst_base_offset_ = self._dst_offset - self._std_offset
+        self.hasdst = bool(self._start_delta)
+
+    def transitions(self, year):
+        """
+        For a given year, get the DST on and off transition times, expressed
+        always on the standard time side. For zones with no transitions, this
+        function returns ``None``.
+
+        :param year:
+            The year whose transitions you would like to query.
+
+        :return:
+            Returns a :class:`tuple` of :class:`datetime.datetime` objects,
+            ``(dston, dstoff)`` for zones with an annual DST transition, or
+            ``None`` for fixed offset zones.
+        """
+        if not self.hasdst:
+            return None
+
+        base_year = datetime.datetime(year, 1, 1)
+
+        start = base_year + self._start_delta
+        end = base_year + self._end_delta
+
+        return (start, end)
+
+    def __eq__(self, other):
+        if not isinstance(other, tzrange):
+            return NotImplemented
+
+        return (self._std_abbr == other._std_abbr and
+                self._dst_abbr == other._dst_abbr and
+                self._std_offset == other._std_offset and
+                self._dst_offset == other._dst_offset and
+                self._start_delta == other._start_delta and
+                self._end_delta == other._end_delta)
+
+    @property
+    def _dst_base_offset(self):
+        return self._dst_base_offset_
+
+
+@six.add_metaclass(_TzStrFactory)
+class tzstr(tzrange):
+    """
+    ``tzstr`` objects are time zone objects specified by a time-zone string as
+    it would be passed to a ``TZ`` variable on POSIX-style systems (see
+    the `GNU C Library: TZ Variable`_ for more details).
+
+    There is one notable exception, which is that POSIX-style time zones use an
+    inverted offset format, so normally ``GMT+3`` would be parsed as an offset
+    3 hours *behind* GMT. The ``tzstr`` time zone object will parse this as an
+    offset 3 hours *ahead* of GMT. If you would like to maintain the POSIX
+    behavior, pass a ``True`` value to ``posix_offset``.
+
+    The :class:`tzrange` object provides the same functionality, but is
+    specified using :class:`relativedelta.relativedelta` objects. rather than
+    strings.
+
+    :param s:
+        A time zone string in ``TZ`` variable format. This can be a
+        :class:`bytes` (2.x: :class:`str`), :class:`str` (2.x:
+        :class:`unicode`) or a stream emitting unicode characters
+        (e.g. :class:`StringIO`).
+
+    :param posix_offset:
+        Optional. If set to ``True``, interpret strings such as ``GMT+3`` or
+        ``UTC+3`` as being 3 hours *behind* UTC rather than ahead, per the
+        POSIX standard.
+
+    .. caution::
+
+        Prior to version 2.7.0, this function also supported time zones
+        in the format:
+
+            * ``EST5EDT,4,0,6,7200,10,0,26,7200,3600``
+            * ``EST5EDT,4,1,0,7200,10,-1,0,7200,3600``
+
+        This format is non-standard and has been deprecated; this function
+        will raise a :class:`DeprecatedTZFormatWarning` until
+        support is removed in a future version.
+
+    .. _`GNU C Library: TZ Variable`:
+        https://www.gnu.org/software/libc/manual/html_node/TZ-Variable.html
+    """
+    def __init__(self, s, posix_offset=False):
+        global parser
+        from dateutil.parser import _parser as parser
+
+        self._s = s
+
+        res = parser._parsetz(s)
+        if res is None or res.any_unused_tokens:
+            raise ValueError("unknown string format")
+
+        # Here we break the compatibility with the TZ variable handling.
+        # GMT-3 actually *means* the timezone -3.
+        if res.stdabbr in ("GMT", "UTC") and not posix_offset:
+            res.stdoffset *= -1
+
+        # We must initialize it first, since _delta() needs
+        # _std_offset and _dst_offset set. Use False in start/end
+        # to avoid building it two times.
+        tzrange.__init__(self, res.stdabbr, res.stdoffset,
+                         res.dstabbr, res.dstoffset,
+                         start=False, end=False)
+
+        if not res.dstabbr:
+            self._start_delta = None
+            self._end_delta = None
+        else:
+            self._start_delta = self._delta(res.start)
+            if self._start_delta:
+                self._end_delta = self._delta(res.end, isend=1)
+
+        self.hasdst = bool(self._start_delta)
+
+    def _delta(self, x, isend=0):
+        from dateutil import relativedelta
+        kwargs = {}
+        if x.month is not None:
+            kwargs["month"] = x.month
+            if x.weekday is not None:
+                kwargs["weekday"] = relativedelta.weekday(x.weekday, x.week)
+                if x.week > 0:
+                    kwargs["day"] = 1
+                else:
+                    kwargs["day"] = 31
+            elif x.day:
+                kwargs["day"] = x.day
+        elif x.yday is not None:
+            kwargs["yearday"] = x.yday
+        elif x.jyday is not None:
+            kwargs["nlyearday"] = x.jyday
+        if not kwargs:
+            # Default is to start on first sunday of april, and end
+            # on last sunday of october.
+            if not isend:
+                kwargs["month"] = 4
+                kwargs["day"] = 1
+                kwargs["weekday"] = relativedelta.SU(+1)
+            else:
+                kwargs["month"] = 10
+                kwargs["day"] = 31
+                kwargs["weekday"] = relativedelta.SU(-1)
+        if x.time is not None:
+            kwargs["seconds"] = x.time
+        else:
+            # Default is 2AM.
+            kwargs["seconds"] = 7200
+        if isend:
+            # Convert to standard time, to follow the documented way
+            # of working with the extra hour. See the documentation
+            # of the tzinfo class.
+            delta = self._dst_offset - self._std_offset
+            kwargs["seconds"] -= delta.seconds + delta.days * 86400
+        return relativedelta.relativedelta(**kwargs)
+
+    def __repr__(self):
+        return "%s(%s)" % (self.__class__.__name__, repr(self._s))
+
+
+class _tzicalvtzcomp(object):
+    def __init__(self, tzoffsetfrom, tzoffsetto, isdst,
+                 tzname=None, rrule=None):
+        self.tzoffsetfrom = datetime.timedelta(seconds=tzoffsetfrom)
+        self.tzoffsetto = datetime.timedelta(seconds=tzoffsetto)
+        self.tzoffsetdiff = self.tzoffsetto - self.tzoffsetfrom
+        self.isdst = isdst
+        self.tzname = tzname
+        self.rrule = rrule
+
+
+class _tzicalvtz(_tzinfo):
+    def __init__(self, tzid, comps=[]):
+        super(_tzicalvtz, self).__init__()
+
+        self._tzid = tzid
+        self._comps = comps
+        self._cachedate = []
+        self._cachecomp = []
+        self._cache_lock = _thread.allocate_lock()
+
+    def _find_comp(self, dt):
+        if len(self._comps) == 1:
+            return self._comps[0]
+
+        dt = dt.replace(tzinfo=None)
+
+        try:
+            with self._cache_lock:
+                return self._cachecomp[self._cachedate.index(
+                    (dt, self._fold(dt)))]
+        except ValueError:
+            pass
+
+        lastcompdt = None
+        lastcomp = None
+
+        for comp in self._comps:
+            compdt = self._find_compdt(comp, dt)
+
+            if compdt and (not lastcompdt or lastcompdt < compdt):
+                lastcompdt = compdt
+                lastcomp = comp
+
+        if not lastcomp:
+            # RFC says nothing about what to do when a given
+            # time is before the first onset date. We'll look for the
+            # first standard component, or the first component, if
+            # none is found.
+            for comp in self._comps:
+                if not comp.isdst:
+                    lastcomp = comp
+                    break
+            else:
+                lastcomp = comp[0]
+
+        with self._cache_lock:
+            self._cachedate.insert(0, (dt, self._fold(dt)))
+            self._cachecomp.insert(0, lastcomp)
+
+            if len(self._cachedate) > 10:
+                self._cachedate.pop()
+                self._cachecomp.pop()
+
+        return lastcomp
+
+    def _find_compdt(self, comp, dt):
+        if comp.tzoffsetdiff < ZERO and self._fold(dt):
+            dt -= comp.tzoffsetdiff
+
+        compdt = comp.rrule.before(dt, inc=True)
+
+        return compdt
+
+    def utcoffset(self, dt):
+        if dt is None:
+            return None
+
+        return self._find_comp(dt).tzoffsetto
+
+    def dst(self, dt):
+        comp = self._find_comp(dt)
+        if comp.isdst:
+            return comp.tzoffsetdiff
+        else:
+            return ZERO
+
+    @tzname_in_python2
+    def tzname(self, dt):
+        return self._find_comp(dt).tzname
+
+    def __repr__(self):
+        return "<tzicalvtz %s>" % repr(self._tzid)
+
+    __reduce__ = object.__reduce__
+
+
+class tzical(object):
+    """
+    This object is designed to parse an iCalendar-style ``VTIMEZONE`` structure
+    as set out in `RFC 5545`_ Section 4.6.5 into one or more `tzinfo` objects.
+
+    :param `fileobj`:
+        A file or stream in iCalendar format, which should be UTF-8 encoded
+        with CRLF endings.
+
+    .. _`RFC 5545`: https://tools.ietf.org/html/rfc5545
+    """
+    def __init__(self, fileobj):
+        global rrule
+        from dateutil import rrule
+
+        if isinstance(fileobj, string_types):
+            self._s = fileobj
+            # ical should be encoded in UTF-8 with CRLF
+            fileobj = open(fileobj, 'r')
+        else:
+            self._s = getattr(fileobj, 'name', repr(fileobj))
+            fileobj = _nullcontext(fileobj)
+
+        self._vtz = {}
+
+        with fileobj as fobj:
+            self._parse_rfc(fobj.read())
+
+    def keys(self):
+        """
+        Retrieves the available time zones as a list.
+        """
+        return list(self._vtz.keys())
+
+    def get(self, tzid=None):
+        """
+        Retrieve a :py:class:`datetime.tzinfo` object by its ``tzid``.
+
+        :param tzid:
+            If there is exactly one time zone available, omitting ``tzid``
+            or passing :py:const:`None` value returns it. Otherwise a valid
+            key (which can be retrieved from :func:`keys`) is required.
+
+        :raises ValueError:
+            Raised if ``tzid`` is not specified but there are either more
+            or fewer than 1 zone defined.
+
+        :returns:
+            Returns either a :py:class:`datetime.tzinfo` object representing
+            the relevant time zone or :py:const:`None` if the ``tzid`` was
+            not found.
+        """
+        if tzid is None:
+            if len(self._vtz) == 0:
+                raise ValueError("no timezones defined")
+            elif len(self._vtz) > 1:
+                raise ValueError("more than one timezone available")
+            tzid = next(iter(self._vtz))
+
+        return self._vtz.get(tzid)
+
+    def _parse_offset(self, s):
+        s = s.strip()
+        if not s:
+            raise ValueError("empty offset")
+        if s[0] in ('+', '-'):
+            signal = (-1, +1)[s[0] == '+']
+            s = s[1:]
+        else:
+            signal = +1
+        if len(s) == 4:
+            return (int(s[:2]) * 3600 + int(s[2:]) * 60) * signal
+        elif len(s) == 6:
+            return (int(s[:2]) * 3600 + int(s[2:4]) * 60 + int(s[4:])) * signal
+        else:
+            raise ValueError("invalid offset: " + s)
+
+    def _parse_rfc(self, s):
+        lines = s.splitlines()
+        if not lines:
+            raise ValueError("empty string")
+
+        # Unfold
+        i = 0
+        while i < len(lines):
+            line = lines[i].rstrip()
+            if not line:
+                del lines[i]
+            elif i > 0 and line[0] == " ":
+                lines[i-1] += line[1:]
+                del lines[i]
+            else:
+                i += 1
+
+        tzid = None
+        comps = []
+        invtz = False
+        comptype = None
+        for line in lines:
+            if not line:
+                continue
+            name, value = line.split(':', 1)
+            parms = name.split(';')
+            if not parms:
+                raise ValueError("empty property name")
+            name = parms[0].upper()
+            parms = parms[1:]
+            if invtz:
+                if name == "BEGIN":
+                    if value in ("STANDARD", "DAYLIGHT"):
+                        # Process component
+                        pass
+                    else:
+                        raise ValueError("unknown component: "+value)
+                    comptype = value
+                    founddtstart = False
+                    tzoffsetfrom = None
+                    tzoffsetto = None
+                    rrulelines = []
+                    tzname = None
+                elif name == "END":
+                    if value == "VTIMEZONE":
+                        if comptype:
+                            raise ValueError("component not closed: "+comptype)
+                        if not tzid:
+                            raise ValueError("mandatory TZID not found")
+                        if not comps:
+                            raise ValueError(
+                                "at least one component is needed")
+                        # Process vtimezone
+                        self._vtz[tzid] = _tzicalvtz(tzid, comps)
+                        invtz = False
+                    elif value == comptype:
+                        if not founddtstart:
+                            raise ValueError("mandatory DTSTART not found")
+                        if tzoffsetfrom is None:
+                            raise ValueError(
+                                "mandatory TZOFFSETFROM not found")
+                        if tzoffsetto is None:
+                            raise ValueError(
+                                "mandatory TZOFFSETFROM not found")
+                        # Process component
+                        rr = None
+                        if rrulelines:
+                            rr = rrule.rrulestr("\n".join(rrulelines),
+                                                compatible=True,
+                                                ignoretz=True,
+                                                cache=True)
+                        comp = _tzicalvtzcomp(tzoffsetfrom, tzoffsetto,
+                                              (comptype == "DAYLIGHT"),
+                                              tzname, rr)
+                        comps.append(comp)
+                        comptype = None
+                    else:
+                        raise ValueError("invalid component end: "+value)
+                elif comptype:
+                    if name == "DTSTART":
+                        # DTSTART in VTIMEZONE takes a subset of valid RRULE
+                        # values under RFC 5545.
+                        for parm in parms:
+                            if parm != 'VALUE=DATE-TIME':
+                                msg = ('Unsupported DTSTART param in ' +
+                                       'VTIMEZONE: ' + parm)
+                                raise ValueError(msg)
+                        rrulelines.append(line)
+                        founddtstart = True
+                    elif name in ("RRULE", "RDATE", "EXRULE", "EXDATE"):
+                        rrulelines.append(line)
+                    elif name == "TZOFFSETFROM":
+                        if parms:
+                            raise ValueError(
+                                "unsupported %s parm: %s " % (name, parms[0]))
+                        tzoffsetfrom = self._parse_offset(value)
+                    elif name == "TZOFFSETTO":
+                        if parms:
+                            raise ValueError(
+                                "unsupported TZOFFSETTO parm: "+parms[0])
+                        tzoffsetto = self._parse_offset(value)
+                    elif name == "TZNAME":
+                        if parms:
+                            raise ValueError(
+                                "unsupported TZNAME parm: "+parms[0])
+                        tzname = value
+                    elif name == "COMMENT":
+                        pass
+                    else:
+                        raise ValueError("unsupported property: "+name)
+                else:
+                    if name == "TZID":
+                        if parms:
+                            raise ValueError(
+                                "unsupported TZID parm: "+parms[0])
+                        tzid = value
+                    elif name in ("TZURL", "LAST-MODIFIED", "COMMENT"):
+                        pass
+                    else:
+                        raise ValueError("unsupported property: "+name)
+            elif name == "BEGIN" and value == "VTIMEZONE":
+                tzid = None
+                comps = []
+                invtz = True
+
+    def __repr__(self):
+        return "%s(%s)" % (self.__class__.__name__, repr(self._s))
+
+
+if sys.platform != "win32":
+    TZFILES = ["/etc/localtime", "localtime"]
+    TZPATHS = ["/usr/share/zoneinfo",
+               "/usr/lib/zoneinfo",
+               "/usr/share/lib/zoneinfo",
+               "/etc/zoneinfo"]
+else:
+    TZFILES = []
+    TZPATHS = []
+
+
+def __get_gettz():
+    tzlocal_classes = (tzlocal,)
+    if tzwinlocal is not None:
+        tzlocal_classes += (tzwinlocal,)
+
+    class GettzFunc(object):
+        """
+        Retrieve a time zone object from a string representation
+
+        This function is intended to retrieve the :py:class:`tzinfo` subclass
+        that best represents the time zone that would be used if a POSIX
+        `TZ variable`_ were set to the same value.
+
+        If no argument or an empty string is passed to ``gettz``, local time
+        is returned:
+
+        .. code-block:: python3
+
+            >>> gettz()
+            tzfile('/etc/localtime')
+
+        This function is also the preferred way to map IANA tz database keys
+        to :class:`tzfile` objects:
+
+        .. code-block:: python3
+
+            >>> gettz('Pacific/Kiritimati')
+            tzfile('/usr/share/zoneinfo/Pacific/Kiritimati')
+
+        On Windows, the standard is extended to include the Windows-specific
+        zone names provided by the operating system:
+
+        .. code-block:: python3
+
+            >>> gettz('Egypt Standard Time')
+            tzwin('Egypt Standard Time')
+
+        Passing a GNU ``TZ`` style string time zone specification returns a
+        :class:`tzstr` object:
+
+        .. code-block:: python3
+
+            >>> gettz('AEST-10AEDT-11,M10.1.0/2,M4.1.0/3')
+            tzstr('AEST-10AEDT-11,M10.1.0/2,M4.1.0/3')
+
+        :param name:
+            A time zone name (IANA, or, on Windows, Windows keys), location of
+            a ``tzfile(5)`` zoneinfo file or ``TZ`` variable style time zone
+            specifier. An empty string, no argument or ``None`` is interpreted
+            as local time.
+
+        :return:
+            Returns an instance of one of ``dateutil``'s :py:class:`tzinfo`
+            subclasses.
+
+        .. versionchanged:: 2.7.0
+
+            After version 2.7.0, any two calls to ``gettz`` using the same
+            input strings will return the same object:
+
+            .. code-block:: python3
+
+                >>> tz.gettz('America/Chicago') is tz.gettz('America/Chicago')
+                True
+
+            In addition to improving performance, this ensures that
+            `"same zone" semantics`_ are used for datetimes in the same zone.
+
+
+        .. _`TZ variable`:
+            https://www.gnu.org/software/libc/manual/html_node/TZ-Variable.html
+
+        .. _`"same zone" semantics`:
+            https://blog.ganssle.io/articles/2018/02/aware-datetime-arithmetic.html
+        """
+        def __init__(self):
+
+            self.__instances = weakref.WeakValueDictionary()
+            self.__strong_cache_size = 8
+            self.__strong_cache = OrderedDict()
+            self._cache_lock = _thread.allocate_lock()
+
+        def __call__(self, name=None):
+            with self._cache_lock:
+                rv = self.__instances.get(name, None)
+
+                if rv is None:
+                    rv = self.nocache(name=name)
+                    if not (name is None
+                            or isinstance(rv, tzlocal_classes)
+                            or rv is None):
+                        # tzlocal is slightly more complicated than the other
+                        # time zone providers because it depends on environment
+                        # at construction time, so don't cache that.
+                        #
+                        # We also cannot store weak references to None, so we
+                        # will also not store that.
+                        self.__instances[name] = rv
+                    else:
+                        # No need for strong caching, return immediately
+                        return rv
+
+                self.__strong_cache[name] = self.__strong_cache.pop(name, rv)
+
+                if len(self.__strong_cache) > self.__strong_cache_size:
+                    self.__strong_cache.popitem(last=False)
+
+            return rv
+
+        def set_cache_size(self, size):
+            with self._cache_lock:
+                self.__strong_cache_size = size
+                while len(self.__strong_cache) > size:
+                    self.__strong_cache.popitem(last=False)
+
+        def cache_clear(self):
+            with self._cache_lock:
+                self.__instances = weakref.WeakValueDictionary()
+                self.__strong_cache.clear()
+
+        @staticmethod
+        def nocache(name=None):
+            """A non-cached version of gettz"""
+            tz = None
+            if not name:
+                try:
+                    name = os.environ["TZ"]
+                except KeyError:
+                    pass
+            if name is None or name in ("", ":"):
+                for filepath in TZFILES:
+                    if not os.path.isabs(filepath):
+                        filename = filepath
+                        for path in TZPATHS:
+                            filepath = os.path.join(path, filename)
+                            if os.path.isfile(filepath):
+                                break
+                        else:
+                            continue
+                    if os.path.isfile(filepath):
+                        try:
+                            tz = tzfile(filepath)
+                            break
+                        except (IOError, OSError, ValueError):
+                            pass
+                else:
+                    tz = tzlocal()
+            else:
+                try:
+                    if name.startswith(":"):
+                        name = name[1:]
+                except TypeError as e:
+                    if isinstance(name, bytes):
+                        new_msg = "gettz argument should be str, not bytes"
+                        six.raise_from(TypeError(new_msg), e)
+                    else:
+                        raise
+                if os.path.isabs(name):
+                    if os.path.isfile(name):
+                        tz = tzfile(name)
+                    else:
+                        tz = None
+                else:
+                    for path in TZPATHS:
+                        filepath = os.path.join(path, name)
+                        if not os.path.isfile(filepath):
+                            filepath = filepath.replace(' ', '_')
+                            if not os.path.isfile(filepath):
+                                continue
+                        try:
+                            tz = tzfile(filepath)
+                            break
+                        except (IOError, OSError, ValueError):
+                            pass
+                    else:
+                        tz = None
+                        if tzwin is not None:
+                            try:
+                                tz = tzwin(name)
+                            except (WindowsError, UnicodeEncodeError):
+                                # UnicodeEncodeError is for Python 2.7 compat
+                                tz = None
+
+                        if not tz:
+                            from dateutil.zoneinfo import get_zonefile_instance
+                            tz = get_zonefile_instance().get(name)
+
+                        if not tz:
+                            for c in name:
+                                # name is not a tzstr unless it has at least
+                                # one offset. For short values of "name", an
+                                # explicit for loop seems to be the fastest way
+                                # To determine if a string contains a digit
+                                if c in "0123456789":
+                                    try:
+                                        tz = tzstr(name)
+                                    except ValueError:
+                                        pass
+                                    break
+                            else:
+                                if name in ("GMT", "UTC"):
+                                    tz = UTC
+                                elif name in time.tzname:
+                                    tz = tzlocal()
+            return tz
+
+    return GettzFunc()
+
+
+gettz = __get_gettz()
+del __get_gettz
+
+
+def datetime_exists(dt, tz=None):
+    """
+    Given a datetime and a time zone, determine whether or not a given datetime
+    would fall in a gap.
+
+    :param dt:
+        A :class:`datetime.datetime` (whose time zone will be ignored if ``tz``
+        is provided.)
+
+    :param tz:
+        A :class:`datetime.tzinfo` with support for the ``fold`` attribute. If
+        ``None`` or not provided, the datetime's own time zone will be used.
+
+    :return:
+        Returns a boolean value whether or not the "wall time" exists in
+        ``tz``.
+
+    .. versionadded:: 2.7.0
+    """
+    if tz is None:
+        if dt.tzinfo is None:
+            raise ValueError('Datetime is naive and no time zone provided.')
+        tz = dt.tzinfo
+
+    dt = dt.replace(tzinfo=None)
+
+    # This is essentially a test of whether or not the datetime can survive
+    # a round trip to UTC.
+    dt_rt = dt.replace(tzinfo=tz).astimezone(UTC).astimezone(tz)
+    dt_rt = dt_rt.replace(tzinfo=None)
+
+    return dt == dt_rt
+
+
+def datetime_ambiguous(dt, tz=None):
+    """
+    Given a datetime and a time zone, determine whether or not a given datetime
+    is ambiguous (i.e if there are two times differentiated only by their DST
+    status).
+
+    :param dt:
+        A :class:`datetime.datetime` (whose time zone will be ignored if ``tz``
+        is provided.)
+
+    :param tz:
+        A :class:`datetime.tzinfo` with support for the ``fold`` attribute. If
+        ``None`` or not provided, the datetime's own time zone will be used.
+
+    :return:
+        Returns a boolean value whether or not the "wall time" is ambiguous in
+        ``tz``.
+
+    .. versionadded:: 2.6.0
+    """
+    if tz is None:
+        if dt.tzinfo is None:
+            raise ValueError('Datetime is naive and no time zone provided.')
+
+        tz = dt.tzinfo
+
+    # If a time zone defines its own "is_ambiguous" function, we'll use that.
+    is_ambiguous_fn = getattr(tz, 'is_ambiguous', None)
+    if is_ambiguous_fn is not None:
+        try:
+            return tz.is_ambiguous(dt)
+        except Exception:
+            pass
+
+    # If it doesn't come out and tell us it's ambiguous, we'll just check if
+    # the fold attribute has any effect on this particular date and time.
+    dt = dt.replace(tzinfo=tz)
+    wall_0 = enfold(dt, fold=0)
+    wall_1 = enfold(dt, fold=1)
+
+    same_offset = wall_0.utcoffset() == wall_1.utcoffset()
+    same_dst = wall_0.dst() == wall_1.dst()
+
+    return not (same_offset and same_dst)
+
+
+def resolve_imaginary(dt):
+    """
+    Given a datetime that may be imaginary, return an existing datetime.
+
+    This function assumes that an imaginary datetime represents what the
+    wall time would be in a zone had the offset transition not occurred, so
+    it will always fall forward by the transition's change in offset.
+
+    .. doctest::
+
+        >>> from dateutil import tz
+        >>> from datetime import datetime
+        >>> NYC = tz.gettz('America/New_York')
+        >>> print(tz.resolve_imaginary(datetime(2017, 3, 12, 2, 30, tzinfo=NYC)))
+        2017-03-12 03:30:00-04:00
+
+        >>> KIR = tz.gettz('Pacific/Kiritimati')
+        >>> print(tz.resolve_imaginary(datetime(1995, 1, 1, 12, 30, tzinfo=KIR)))
+        1995-01-02 12:30:00+14:00
+
+    As a note, :func:`datetime.astimezone` is guaranteed to produce a valid,
+    existing datetime, so a round-trip to and from UTC is sufficient to get
+    an extant datetime, however, this generally "falls back" to an earlier time
+    rather than falling forward to the STD side (though no guarantees are made
+    about this behavior).
+
+    :param dt:
+        A :class:`datetime.datetime` which may or may not exist.
+
+    :return:
+        Returns an existing :class:`datetime.datetime`. If ``dt`` was not
+        imaginary, the datetime returned is guaranteed to be the same object
+        passed to the function.
+
+    .. versionadded:: 2.7.0
+    """
+    if dt.tzinfo is not None and not datetime_exists(dt):
+
+        curr_offset = (dt + datetime.timedelta(hours=24)).utcoffset()
+        old_offset = (dt - datetime.timedelta(hours=24)).utcoffset()
+
+        dt += curr_offset - old_offset
+
+    return dt
+
+
+def _datetime_to_timestamp(dt):
+    """
+    Convert a :class:`datetime.datetime` object to an epoch timestamp in
+    seconds since January 1, 1970, ignoring the time zone.
+    """
+    return (dt.replace(tzinfo=None) - EPOCH).total_seconds()
+
+
+if sys.version_info >= (3, 6):
+    def _get_supported_offset(second_offset):
+        return second_offset
+else:
+    def _get_supported_offset(second_offset):
+        # For python pre-3.6, round to full-minutes if that's not the case.
+        # Python's datetime doesn't accept sub-minute timezones. Check
+        # http://python.org/sf/1447945 or https://bugs.python.org/issue5288
+        # for some information.
+        old_offset = second_offset
+        calculated_offset = 60 * ((second_offset + 30) // 60)
+        return calculated_offset
+
+
+try:
+    # Python 3.7 feature
+    from contextlib import nullcontext as _nullcontext
+except ImportError:
+    class _nullcontext(object):
+        """
+        Class for wrapping contexts so that they are passed through in a
+        with statement.
+        """
+        def __init__(self, context):
+            self.context = context
+
+        def __enter__(self):
+            return self.context
+
+        def __exit__(*args, **kwargs):
+            pass
+
+# vim:ts=4:sw=4:et
diff --git a/dateutil/tz/win.py b/dateutil/tz/win.py
new file mode 100644
index 0000000000000000000000000000000000000000..cde07ba792c40903f0c334839140173b39fd8124
--- /dev/null
+++ b/dateutil/tz/win.py
@@ -0,0 +1,370 @@
+# -*- coding: utf-8 -*-
+"""
+This module provides an interface to the native time zone data on Windows,
+including :py:class:`datetime.tzinfo` implementations.
+
+Attempting to import this module on a non-Windows platform will raise an
+:py:obj:`ImportError`.
+"""
+# This code was originally contributed by Jeffrey Harris.
+import datetime
+import struct
+
+from six.moves import winreg
+from six import text_type
+
+try:
+    import ctypes
+    from ctypes import wintypes
+except ValueError:
+    # ValueError is raised on non-Windows systems for some horrible reason.
+    raise ImportError("Running tzwin on non-Windows system")
+
+from ._common import tzrangebase
+
+__all__ = ["tzwin", "tzwinlocal", "tzres"]
+
+ONEWEEK = datetime.timedelta(7)
+
+TZKEYNAMENT = r"SOFTWARE\Microsoft\Windows NT\CurrentVersion\Time Zones"
+TZKEYNAME9X = r"SOFTWARE\Microsoft\Windows\CurrentVersion\Time Zones"
+TZLOCALKEYNAME = r"SYSTEM\CurrentControlSet\Control\TimeZoneInformation"
+
+
+def _settzkeyname():
+    handle = winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE)
+    try:
+        winreg.OpenKey(handle, TZKEYNAMENT).Close()
+        TZKEYNAME = TZKEYNAMENT
+    except WindowsError:
+        TZKEYNAME = TZKEYNAME9X
+    handle.Close()
+    return TZKEYNAME
+
+
+TZKEYNAME = _settzkeyname()
+
+
+class tzres(object):
+    """
+    Class for accessing ``tzres.dll``, which contains timezone name related
+    resources.
+
+    .. versionadded:: 2.5.0
+    """
+    p_wchar = ctypes.POINTER(wintypes.WCHAR)        # Pointer to a wide char
+
+    def __init__(self, tzres_loc='tzres.dll'):
+        # Load the user32 DLL so we can load strings from tzres
+        user32 = ctypes.WinDLL('user32')
+
+        # Specify the LoadStringW function
+        user32.LoadStringW.argtypes = (wintypes.HINSTANCE,
+                                       wintypes.UINT,
+                                       wintypes.LPWSTR,
+                                       ctypes.c_int)
+
+        self.LoadStringW = user32.LoadStringW
+        self._tzres = ctypes.WinDLL(tzres_loc)
+        self.tzres_loc = tzres_loc
+
+    def load_name(self, offset):
+        """
+        Load a timezone name from a DLL offset (integer).
+
+        >>> from dateutil.tzwin import tzres
+        >>> tzr = tzres()
+        >>> print(tzr.load_name(112))
+        'Eastern Standard Time'
+
+        :param offset:
+            A positive integer value referring to a string from the tzres dll.
+
+        .. note::
+
+            Offsets found in the registry are generally of the form
+            ``@tzres.dll,-114``. The offset in this case is 114, not -114.
+
+        """
+        resource = self.p_wchar()
+        lpBuffer = ctypes.cast(ctypes.byref(resource), wintypes.LPWSTR)
+        nchar = self.LoadStringW(self._tzres._handle, offset, lpBuffer, 0)
+        return resource[:nchar]
+
+    def name_from_string(self, tzname_str):
+        """
+        Parse strings as returned from the Windows registry into the time zone
+        name as defined in the registry.
+
+        >>> from dateutil.tzwin import tzres
+        >>> tzr = tzres()
+        >>> print(tzr.name_from_string('@tzres.dll,-251'))
+        'Dateline Daylight Time'
+        >>> print(tzr.name_from_string('Eastern Standard Time'))
+        'Eastern Standard Time'
+
+        :param tzname_str:
+            A timezone name string as returned from a Windows registry key.
+
+        :return:
+            Returns the localized timezone string from tzres.dll if the string
+            is of the form `@tzres.dll,-offset`, else returns the input string.
+        """
+        if not tzname_str.startswith('@'):
+            return tzname_str
+
+        name_splt = tzname_str.split(',-')
+        try:
+            offset = int(name_splt[1])
+        except:
+            raise ValueError("Malformed timezone string.")
+
+        return self.load_name(offset)
+
+
+class tzwinbase(tzrangebase):
+    """tzinfo class based on win32's timezones available in the registry."""
+    def __init__(self):
+        raise NotImplementedError('tzwinbase is an abstract base class')
+
+    def __eq__(self, other):
+        # Compare on all relevant dimensions, including name.
+        if not isinstance(other, tzwinbase):
+            return NotImplemented
+
+        return  (self._std_offset == other._std_offset and
+                 self._dst_offset == other._dst_offset and
+                 self._stddayofweek == other._stddayofweek and
+                 self._dstdayofweek == other._dstdayofweek and
+                 self._stdweeknumber == other._stdweeknumber and
+                 self._dstweeknumber == other._dstweeknumber and
+                 self._stdhour == other._stdhour and
+                 self._dsthour == other._dsthour and
+                 self._stdminute == other._stdminute and
+                 self._dstminute == other._dstminute and
+                 self._std_abbr == other._std_abbr and
+                 self._dst_abbr == other._dst_abbr)
+
+    @staticmethod
+    def list():
+        """Return a list of all time zones known to the system."""
+        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
+            with winreg.OpenKey(handle, TZKEYNAME) as tzkey:
+                result = [winreg.EnumKey(tzkey, i)
+                          for i in range(winreg.QueryInfoKey(tzkey)[0])]
+        return result
+
+    def display(self):
+        """
+        Return the display name of the time zone.
+        """
+        return self._display
+
+    def transitions(self, year):
+        """
+        For a given year, get the DST on and off transition times, expressed
+        always on the standard time side. For zones with no transitions, this
+        function returns ``None``.
+
+        :param year:
+            The year whose transitions you would like to query.
+
+        :return:
+            Returns a :class:`tuple` of :class:`datetime.datetime` objects,
+            ``(dston, dstoff)`` for zones with an annual DST transition, or
+            ``None`` for fixed offset zones.
+        """
+
+        if not self.hasdst:
+            return None
+
+        dston = picknthweekday(year, self._dstmonth, self._dstdayofweek,
+                               self._dsthour, self._dstminute,
+                               self._dstweeknumber)
+
+        dstoff = picknthweekday(year, self._stdmonth, self._stddayofweek,
+                                self._stdhour, self._stdminute,
+                                self._stdweeknumber)
+
+        # Ambiguous dates default to the STD side
+        dstoff -= self._dst_base_offset
+
+        return dston, dstoff
+
+    def _get_hasdst(self):
+        return self._dstmonth != 0
+
+    @property
+    def _dst_base_offset(self):
+        return self._dst_base_offset_
+
+
+class tzwin(tzwinbase):
+    """
+    Time zone object created from the zone info in the Windows registry
+
+    These are similar to :py:class:`dateutil.tz.tzrange` objects in that
+    the time zone data is provided in the format of a single offset rule
+    for either 0 or 2 time zone transitions per year.
+
+    :param: name
+        The name of a Windows time zone key, e.g. "Eastern Standard Time".
+        The full list of keys can be retrieved with :func:`tzwin.list`.
+    """
+
+    def __init__(self, name):
+        self._name = name
+
+        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
+            tzkeyname = text_type("{kn}\\{name}").format(kn=TZKEYNAME, name=name)
+            with winreg.OpenKey(handle, tzkeyname) as tzkey:
+                keydict = valuestodict(tzkey)
+
+        self._std_abbr = keydict["Std"]
+        self._dst_abbr = keydict["Dlt"]
+
+        self._display = keydict["Display"]
+
+        # See http://ww_winreg.jsiinc.com/SUBA/tip0300/rh0398.htm
+        tup = struct.unpack("=3l16h", keydict["TZI"])
+        stdoffset = -tup[0]-tup[1]          # Bias + StandardBias * -1
+        dstoffset = stdoffset-tup[2]        # + DaylightBias * -1
+        self._std_offset = datetime.timedelta(minutes=stdoffset)
+        self._dst_offset = datetime.timedelta(minutes=dstoffset)
+
+        # for the meaning see the win32 TIME_ZONE_INFORMATION structure docs
+        # http://msdn.microsoft.com/en-us/library/windows/desktop/ms725481(v=vs.85).aspx
+        (self._stdmonth,
+         self._stddayofweek,   # Sunday = 0
+         self._stdweeknumber,  # Last = 5
+         self._stdhour,
+         self._stdminute) = tup[4:9]
+
+        (self._dstmonth,
+         self._dstdayofweek,   # Sunday = 0
+         self._dstweeknumber,  # Last = 5
+         self._dsthour,
+         self._dstminute) = tup[12:17]
+
+        self._dst_base_offset_ = self._dst_offset - self._std_offset
+        self.hasdst = self._get_hasdst()
+
+    def __repr__(self):
+        return "tzwin(%s)" % repr(self._name)
+
+    def __reduce__(self):
+        return (self.__class__, (self._name,))
+
+
+class tzwinlocal(tzwinbase):
+    """
+    Class representing the local time zone information in the Windows registry
+
+    While :class:`dateutil.tz.tzlocal` makes system calls (via the :mod:`time`
+    module) to retrieve time zone information, ``tzwinlocal`` retrieves the
+    rules directly from the Windows registry and creates an object like
+    :class:`dateutil.tz.tzwin`.
+
+    Because Windows does not have an equivalent of :func:`time.tzset`, on
+    Windows, :class:`dateutil.tz.tzlocal` instances will always reflect the
+    time zone settings *at the time that the process was started*, meaning
+    changes to the machine's time zone settings during the run of a program
+    on Windows will **not** be reflected by :class:`dateutil.tz.tzlocal`.
+    Because ``tzwinlocal`` reads the registry directly, it is unaffected by
+    this issue.
+    """
+    def __init__(self):
+        with winreg.ConnectRegistry(None, winreg.HKEY_LOCAL_MACHINE) as handle:
+            with winreg.OpenKey(handle, TZLOCALKEYNAME) as tzlocalkey:
+                keydict = valuestodict(tzlocalkey)
+
+            self._std_abbr = keydict["StandardName"]
+            self._dst_abbr = keydict["DaylightName"]
+
+            try:
+                tzkeyname = text_type('{kn}\\{sn}').format(kn=TZKEYNAME,
+                                                          sn=self._std_abbr)
+                with winreg.OpenKey(handle, tzkeyname) as tzkey:
+                    _keydict = valuestodict(tzkey)
+                    self._display = _keydict["Display"]
+            except OSError:
+                self._display = None
+
+        stdoffset = -keydict["Bias"]-keydict["StandardBias"]
+        dstoffset = stdoffset-keydict["DaylightBias"]
+
+        self._std_offset = datetime.timedelta(minutes=stdoffset)
+        self._dst_offset = datetime.timedelta(minutes=dstoffset)
+
+        # For reasons unclear, in this particular key, the day of week has been
+        # moved to the END of the SYSTEMTIME structure.
+        tup = struct.unpack("=8h", keydict["StandardStart"])
+
+        (self._stdmonth,
+         self._stdweeknumber,  # Last = 5
+         self._stdhour,
+         self._stdminute) = tup[1:5]
+
+        self._stddayofweek = tup[7]
+
+        tup = struct.unpack("=8h", keydict["DaylightStart"])
+
+        (self._dstmonth,
+         self._dstweeknumber,  # Last = 5
+         self._dsthour,
+         self._dstminute) = tup[1:5]
+
+        self._dstdayofweek = tup[7]
+
+        self._dst_base_offset_ = self._dst_offset - self._std_offset
+        self.hasdst = self._get_hasdst()
+
+    def __repr__(self):
+        return "tzwinlocal()"
+
+    def __str__(self):
+        # str will return the standard name, not the daylight name.
+        return "tzwinlocal(%s)" % repr(self._std_abbr)
+
+    def __reduce__(self):
+        return (self.__class__, ())
+
+
+def picknthweekday(year, month, dayofweek, hour, minute, whichweek):
+    """ dayofweek == 0 means Sunday, whichweek 5 means last instance """
+    first = datetime.datetime(year, month, 1, hour, minute)
+
+    # This will work if dayofweek is ISO weekday (1-7) or Microsoft-style (0-6),
+    # Because 7 % 7 = 0
+    weekdayone = first.replace(day=((dayofweek - first.isoweekday()) % 7) + 1)
+    wd = weekdayone + ((whichweek - 1) * ONEWEEK)
+    if (wd.month != month):
+        wd -= ONEWEEK
+
+    return wd
+
+
+def valuestodict(key):
+    """Convert a registry key's values to a dictionary."""
+    dout = {}
+    size = winreg.QueryInfoKey(key)[1]
+    tz_res = None
+
+    for i in range(size):
+        key_name, value, dtype = winreg.EnumValue(key, i)
+        if dtype == winreg.REG_DWORD or dtype == winreg.REG_DWORD_LITTLE_ENDIAN:
+            # If it's a DWORD (32-bit integer), it's stored as unsigned - convert
+            # that to a proper signed integer
+            if value & (1 << 31):
+                value = value - (1 << 32)
+        elif dtype == winreg.REG_SZ:
+            # If it's a reference to the tzres DLL, load the actual string
+            if value.startswith('@tzres'):
+                tz_res = tz_res or tzres()
+                value = tz_res.name_from_string(value)
+
+            value = value.rstrip('\x00')    # Remove trailing nulls
+
+        dout[key_name] = value
+
+    return dout
diff --git a/dateutil/zoneinfo/__init__.py b/dateutil/zoneinfo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..34f11ad66c88047f2c049a4cdcc937b4b78ea6d6
--- /dev/null
+++ b/dateutil/zoneinfo/__init__.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+import warnings
+import json
+
+from tarfile import TarFile
+from pkgutil import get_data
+from io import BytesIO
+
+from dateutil.tz import tzfile as _tzfile
+
+__all__ = ["get_zonefile_instance", "gettz", "gettz_db_metadata"]
+
+ZONEFILENAME = "dateutil-zoneinfo.tar.gz"
+METADATA_FN = 'METADATA'
+
+
+class tzfile(_tzfile):
+    def __reduce__(self):
+        return (gettz, (self._filename,))
+
+
+def getzoneinfofile_stream():
+    try:
+        return BytesIO(get_data(__name__, ZONEFILENAME))
+    except IOError as e:  # TODO  switch to FileNotFoundError?
+        warnings.warn("I/O error({0}): {1}".format(e.errno, e.strerror))
+        return None
+
+
+class ZoneInfoFile(object):
+    def __init__(self, zonefile_stream=None):
+        if zonefile_stream is not None:
+            with TarFile.open(fileobj=zonefile_stream) as tf:
+                self.zones = {zf.name: tzfile(tf.extractfile(zf), filename=zf.name)
+                              for zf in tf.getmembers()
+                              if zf.isfile() and zf.name != METADATA_FN}
+                # deal with links: They'll point to their parent object. Less
+                # waste of memory
+                links = {zl.name: self.zones[zl.linkname]
+                         for zl in tf.getmembers() if
+                         zl.islnk() or zl.issym()}
+                self.zones.update(links)
+                try:
+                    metadata_json = tf.extractfile(tf.getmember(METADATA_FN))
+                    metadata_str = metadata_json.read().decode('UTF-8')
+                    self.metadata = json.loads(metadata_str)
+                except KeyError:
+                    # no metadata in tar file
+                    self.metadata = None
+        else:
+            self.zones = {}
+            self.metadata = None
+
+    def get(self, name, default=None):
+        """
+        Wrapper for :func:`ZoneInfoFile.zones.get`. This is a convenience method
+        for retrieving zones from the zone dictionary.
+
+        :param name:
+            The name of the zone to retrieve. (Generally IANA zone names)
+
+        :param default:
+            The value to return in the event of a missing key.
+
+        .. versionadded:: 2.6.0
+
+        """
+        return self.zones.get(name, default)
+
+
+# The current API has gettz as a module function, although in fact it taps into
+# a stateful class. So as a workaround for now, without changing the API, we
+# will create a new "global" class instance the first time a user requests a
+# timezone. Ugly, but adheres to the api.
+#
+# TODO: Remove after deprecation period.
+_CLASS_ZONE_INSTANCE = []
+
+
+def get_zonefile_instance(new_instance=False):
+    """
+    This is a convenience function which provides a :class:`ZoneInfoFile`
+    instance using the data provided by the ``dateutil`` package. By default, it
+    caches a single instance of the ZoneInfoFile object and returns that.
+
+    :param new_instance:
+        If ``True``, a new instance of :class:`ZoneInfoFile` is instantiated and
+        used as the cached instance for the next call. Otherwise, new instances
+        are created only as necessary.
+
+    :return:
+        Returns a :class:`ZoneInfoFile` object.
+
+    .. versionadded:: 2.6
+    """
+    if new_instance:
+        zif = None
+    else:
+        zif = getattr(get_zonefile_instance, '_cached_instance', None)
+
+    if zif is None:
+        zif = ZoneInfoFile(getzoneinfofile_stream())
+
+        get_zonefile_instance._cached_instance = zif
+
+    return zif
+
+
+def gettz(name):
+    """
+    This retrieves a time zone from the local zoneinfo tarball that is packaged
+    with dateutil.
+
+    :param name:
+        An IANA-style time zone name, as found in the zoneinfo file.
+
+    :return:
+        Returns a :class:`dateutil.tz.tzfile` time zone object.
+
+    .. warning::
+        It is generally inadvisable to use this function, and it is only
+        provided for API compatibility with earlier versions. This is *not*
+        equivalent to ``dateutil.tz.gettz()``, which selects an appropriate
+        time zone based on the inputs, favoring system zoneinfo. This is ONLY
+        for accessing the dateutil-specific zoneinfo (which may be out of
+        date compared to the system zoneinfo).
+
+    .. deprecated:: 2.6
+        If you need to use a specific zoneinfofile over the system zoneinfo,
+        instantiate a :class:`dateutil.zoneinfo.ZoneInfoFile` object and call
+        :func:`dateutil.zoneinfo.ZoneInfoFile.get(name)` instead.
+
+        Use :func:`get_zonefile_instance` to retrieve an instance of the
+        dateutil-provided zoneinfo.
+    """
+    warnings.warn("zoneinfo.gettz() will be removed in future versions, "
+                  "to use the dateutil-provided zoneinfo files, instantiate a "
+                  "ZoneInfoFile object and use ZoneInfoFile.zones.get() "
+                  "instead. See the documentation for details.",
+                  DeprecationWarning)
+
+    if len(_CLASS_ZONE_INSTANCE) == 0:
+        _CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
+    return _CLASS_ZONE_INSTANCE[0].zones.get(name)
+
+
+def gettz_db_metadata():
+    """ Get the zonefile metadata
+
+    See `zonefile_metadata`_
+
+    :returns:
+        A dictionary with the database metadata
+
+    .. deprecated:: 2.6
+        See deprecation warning in :func:`zoneinfo.gettz`. To get metadata,
+        query the attribute ``zoneinfo.ZoneInfoFile.metadata``.
+    """
+    warnings.warn("zoneinfo.gettz_db_metadata() will be removed in future "
+                  "versions, to use the dateutil-provided zoneinfo files, "
+                  "ZoneInfoFile object and query the 'metadata' attribute "
+                  "instead. See the documentation for details.",
+                  DeprecationWarning)
+
+    if len(_CLASS_ZONE_INSTANCE) == 0:
+        _CLASS_ZONE_INSTANCE.append(ZoneInfoFile(getzoneinfofile_stream()))
+    return _CLASS_ZONE_INSTANCE[0].metadata
diff --git a/dateutil/zoneinfo/rebuild.py b/dateutil/zoneinfo/rebuild.py
new file mode 100644
index 0000000000000000000000000000000000000000..684c6586f091350c347f2b6150935f5214ffec27
--- /dev/null
+++ b/dateutil/zoneinfo/rebuild.py
@@ -0,0 +1,75 @@
+import logging
+import os
+import tempfile
+import shutil
+import json
+from subprocess import check_call, check_output
+from tarfile import TarFile
+
+from dateutil.zoneinfo import METADATA_FN, ZONEFILENAME
+
+
+def rebuild(filename, tag=None, format="gz", zonegroups=[], metadata=None):
+    """Rebuild the internal timezone info in dateutil/zoneinfo/zoneinfo*tar*
+
+    filename is the timezone tarball from ``ftp.iana.org/tz``.
+
+    """
+    tmpdir = tempfile.mkdtemp()
+    zonedir = os.path.join(tmpdir, "zoneinfo")
+    moduledir = os.path.dirname(__file__)
+    try:
+        with TarFile.open(filename) as tf:
+            for name in zonegroups:
+                tf.extract(name, tmpdir)
+            filepaths = [os.path.join(tmpdir, n) for n in zonegroups]
+
+            _run_zic(zonedir, filepaths)
+
+        # write metadata file
+        with open(os.path.join(zonedir, METADATA_FN), 'w') as f:
+            json.dump(metadata, f, indent=4, sort_keys=True)
+        target = os.path.join(moduledir, ZONEFILENAME)
+        with TarFile.open(target, "w:%s" % format) as tf:
+            for entry in os.listdir(zonedir):
+                entrypath = os.path.join(zonedir, entry)
+                tf.add(entrypath, entry)
+    finally:
+        shutil.rmtree(tmpdir)
+
+
+def _run_zic(zonedir, filepaths):
+    """Calls the ``zic`` compiler in a compatible way to get a "fat" binary.
+
+    Recent versions of ``zic`` default to ``-b slim``, while older versions
+    don't even have the ``-b`` option (but default to "fat" binaries). The
+    current version of dateutil does not support Version 2+ TZif files, which
+    causes problems when used in conjunction with "slim" binaries, so this
+    function is used to ensure that we always get a "fat" binary.
+    """
+
+    try:
+        help_text = check_output(["zic", "--help"])
+    except OSError as e:
+        _print_on_nosuchfile(e)
+        raise
+
+    if b"-b " in help_text:
+        bloat_args = ["-b", "fat"]
+    else:
+        bloat_args = []
+
+    check_call(["zic"] + bloat_args + ["-d", zonedir] + filepaths)
+
+
+def _print_on_nosuchfile(e):
+    """Print helpful troubleshooting message
+
+    e is an exception raised by subprocess.check_call()
+
+    """
+    if e.errno == 2:
+        logging.error(
+            "Could not find zic. Perhaps you need to install "
+            "libc-bin or some other package that provides it, "
+            "or it's not in your PATH?")
diff --git a/pandas/_config/__init__.py b/pandas/_config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d7fe239b8f8a8f9615ecee81caef501076e8224
--- /dev/null
+++ b/pandas/_config/__init__.py
@@ -0,0 +1,45 @@
+"""
+pandas._config is considered explicitly upstream of everything else in pandas,
+should have no intra-pandas dependencies.
+
+importing `dates` and `display` ensures that keys needed by _libs
+are initialized.
+"""
+
+__all__ = [
+    "config",
+    "describe_option",
+    "detect_console_encoding",
+    "get_option",
+    "option_context",
+    "options",
+    "reset_option",
+    "set_option",
+]
+from pandas._config import config
+from pandas._config import dates  # pyright: ignore[reportUnusedImport]  # noqa: F401
+from pandas._config.config import (
+    _global_config,
+    describe_option,
+    get_option,
+    option_context,
+    options,
+    reset_option,
+    set_option,
+)
+from pandas._config.display import detect_console_encoding
+
+
+def using_string_dtype() -> bool:
+    _mode_options = _global_config["future"]
+    return _mode_options["infer_string"]
+
+
+def using_python_scalars() -> bool:
+    _mode_options = _global_config["future"]
+    return _mode_options["python_scalars"]
+
+
+def is_nan_na() -> bool:
+    _mode_options = _global_config["future"]
+    return not _mode_options["distinguish_nan_and_na"]
diff --git a/pandas/_config/config.py b/pandas/_config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..041596cf002a3c9a0d8a58e97a25de559d41f49d
--- /dev/null
+++ b/pandas/_config/config.py
@@ -0,0 +1,954 @@
+"""
+The config module holds package-wide configurables and provides
+a uniform API for working with them.
+
+Overview
+========
+
+This module supports the following requirements:
+- options are referenced using keys in dot.notation, e.g. "x.y.option - z".
+- keys are case-insensitive.
+- functions should accept partial/regex keys, when unambiguous.
+- options can be registered by modules at import time.
+- options can be registered at init-time (via core.config_init)
+- options have a default value, and (optionally) a description and
+  validation function associated with them.
+- options can be deprecated, in which case referencing them
+  should produce a warning.
+- deprecated options can optionally be rerouted to a replacement
+  so that accessing a deprecated option reroutes to a differently
+  named option.
+- options can be reset to their default value.
+- all option can be reset to their default value at once.
+- all options in a certain sub - namespace can be reset at once.
+- the user can set / get / reset or ask for the description of an option.
+- a developer can register and mark an option as deprecated.
+- you can register a callback to be invoked when the option value
+  is set or reset. Changing the stored value is considered misuse, but
+  is not verboten.
+
+Implementation
+==============
+
+- Data is stored using nested dictionaries, and should be accessed
+  through the provided API.
+
+- "Registered options" and "Deprecated options" have metadata associated
+  with them, which are stored in auxiliary dictionaries keyed on the
+  fully-qualified key, e.g. "x.y.z.option".
+
+- the config_init module is imported by the package's __init__.py file.
+  placing any register_option() calls there will ensure those options
+  are available as soon as pandas is loaded. If you use register_option
+  in a module, it will only be available after that module is imported,
+  which you should be aware of.
+
+- `config_prefix` is a context_manager (for use with the `with` keyword)
+  which can save developers some typing, see the docstring.
+
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+import re
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    NamedTuple,
+    cast,
+)
+import warnings
+
+from pandas._typing import F
+from pandas.util._exceptions import find_stack_level
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Generator,
+        Sequence,
+    )
+
+
+class DeprecatedOption(NamedTuple):
+    key: str
+    category: type[Warning]
+    msg: str | None
+    rkey: str | None
+    removal_ver: str | None
+
+
+class RegisteredOption(NamedTuple):
+    key: str
+    defval: Any
+    doc: str
+    validator: Callable[[object], Any] | None
+    cb: Callable[[str], Any] | None
+
+
+# holds deprecated option metadata
+_deprecated_options: dict[str, DeprecatedOption] = {}
+
+# holds registered option metadata
+_registered_options: dict[str, RegisteredOption] = {}
+
+# holds the current values for registered options
+_global_config: dict[str, Any] = {}
+
+# keys which have a special meaning
+_reserved_keys: list[str] = ["all"]
+
+
+class OptionError(AttributeError, KeyError):
+    """
+    Exception raised for pandas.options.
+
+    Backwards compatible with KeyError checks.
+
+    See Also
+    --------
+    options : Access and modify global pandas settings.
+
+    Examples
+    --------
+    >>> pd.options.context
+    Traceback (most recent call last):
+    OptionError: No such option
+    """
+
+    __module__ = "pandas.errors"
+
+
+#
+# User API
+
+
+def _get_single_key(pat: str) -> str:
+    keys = _select_options(pat)
+    if len(keys) == 0:
+        _warn_if_deprecated(pat)
+        raise OptionError(f"No such keys(s): {pat!r}")
+    if len(keys) > 1:
+        raise OptionError("Pattern matched multiple keys")
+    key = keys[0]
+
+    _warn_if_deprecated(key)
+
+    key = _translate_key(key)
+
+    return key
+
+
+def get_option(pat: str) -> Any:
+    """
+    Retrieve the value of the specified option.
+
+    This method allows users to query the current value of a given option
+    in the pandas configuration system. Options control various display,
+    performance, and behavior-related settings within pandas.
+
+    Parameters
+    ----------
+    pat : str
+        Regexp which should match a single option.
+
+        .. warning::
+
+            Partial matches are supported for convenience, but unless you use the
+            full option name (e.g. x.y.z.option_name), your code may break in future
+            versions if new options with similar names are introduced.
+
+    Returns
+    -------
+    Any
+        The value of the option.
+
+    Raises
+    ------
+    OptionError : if no such option exists
+
+    See Also
+    --------
+    set_option : Set the value of the specified option or options.
+    reset_option : Reset one or more options to their default value.
+    describe_option : Print the description for one or more registered options.
+
+    Notes
+    -----
+    For all available options, please view the :ref:`User Guide <options.available>`
+    or use ``pandas.describe_option()``.
+
+    Examples
+    --------
+    >>> pd.get_option("display.max_columns")  # doctest: +SKIP
+    4
+    """
+    key = _get_single_key(pat)
+
+    # walk the nested dict
+    root, k = _get_root(key)
+    return root[k]
+
+
+def set_option(*args) -> None:
+    """
+    Set the value of the specified option or options.
+
+    This method allows fine-grained control over the behavior and display settings
+    of pandas. Options affect various functionalities such as output formatting,
+    display limits, and operational behavior. Settings can be modified at runtime
+    without requiring changes to global configurations or environment variables.
+
+    Parameters
+    ----------
+    *args : str | object | dict
+        Arguments provided in pairs, which will be interpreted as (pattern, value),
+        or as a single dictionary containing multiple option-value pairs.
+        pattern: str
+        Regexp which should match a single option
+        value: object
+        New value of option
+
+        .. warning::
+
+            Partial pattern matches are supported for convenience, but unless you
+            use the full option name (e.g. x.y.z.option_name), your code may break in
+            future versions if new options with similar names are introduced.
+
+    Returns
+    -------
+    None
+        No return value.
+
+    Raises
+    ------
+    ValueError if odd numbers of non-keyword arguments are provided
+    TypeError if keyword arguments are provided
+    OptionError if no such option exists
+
+    See Also
+    --------
+    get_option : Retrieve the value of the specified option.
+    reset_option : Reset one or more options to their default value.
+    describe_option : Print the description for one or more registered options.
+    option_context : Context manager to temporarily set options in a ``with``
+        statement.
+
+    Notes
+    -----
+    For all available options, please view the :ref:`User Guide <options.available>`
+    or use ``pandas.describe_option()``.
+
+    Examples
+    --------
+    Option-Value Pair Input:
+
+    >>> pd.set_option("display.max_columns", 4)
+    >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
+    >>> df
+    0  1  ...  3   4
+    0  1  2  ...  4   5
+    1  6  7  ...  9  10
+    [2 rows x 5 columns]
+    >>> pd.reset_option("display.max_columns")
+
+    Dictionary Input:
+
+    >>> pd.set_option({"display.max_columns": 4, "display.precision": 1})
+    >>> df = pd.DataFrame([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
+    >>> df
+    0  1  ...  3   4
+    0  1  2  ...  4   5
+    1  6  7  ...  9  10
+    [2 rows x 5 columns]
+    >>> pd.reset_option("display.max_columns")
+    >>> pd.reset_option("display.precision")
+    """
+    # Handle dictionary input
+    if len(args) == 1 and isinstance(args[0], dict):
+        args = tuple(kv for item in args[0].items() for kv in item)
+
+    nargs = len(args)
+    if not nargs or nargs % 2 != 0:
+        raise ValueError("Must provide an even number of non-keyword arguments")
+
+    for k, v in zip(args[::2], args[1::2], strict=True):
+        key = _get_single_key(k)
+
+        opt = _get_registered_option(key)
+        if opt and opt.validator:
+            opt.validator(v)
+
+        # walk the nested dict
+        root, k_root = _get_root(key)
+        root[k_root] = v
+
+        if opt.cb:
+            opt.cb(key)
+
+
+def describe_option(pat: str = "", _print_desc: bool = True) -> str | None:
+    """
+    Print the description for one or more registered options.
+
+    Call with no arguments to get a listing for all registered options.
+
+    Parameters
+    ----------
+    pat : str, default ""
+        String or string regexp pattern.
+        Empty string will return all options.
+        For regexp strings, all matching keys will have their description displayed.
+    _print_desc : bool, default True
+        If True (default) the description(s) will be printed to stdout.
+        Otherwise, the description(s) will be returned as a string
+        (for testing).
+
+    Returns
+    -------
+    None
+        If ``_print_desc=True``.
+    str
+        If the description(s) as a string if ``_print_desc=False``.
+
+    See Also
+    --------
+    get_option : Retrieve the value of the specified option.
+    set_option : Set the value of the specified option or options.
+    reset_option : Reset one or more options to their default value.
+
+    Notes
+    -----
+    For all available options, please view the
+    :ref:`User Guide <options.available>`.
+
+    Examples
+    --------
+    >>> pd.describe_option("display.max_columns")  # doctest: +SKIP
+    display.max_columns : int
+        If max_cols is exceeded, switch to truncate view...
+    """
+    keys = _select_options(pat)
+    if len(keys) == 0:
+        raise OptionError(f"No such keys(s) for {pat=}")
+
+    s = "\n".join([_build_option_description(k) for k in keys])
+
+    if _print_desc:
+        print(s)
+        return None
+    return s
+
+
+def reset_option(pat: str) -> None:
+    """
+    Reset one or more options to their default value.
+
+    This method resets the specified pandas option(s) back to their default
+    values. It allows partial string matching for convenience, but users should
+    exercise caution to avoid unintended resets due to changes in option names
+    in future versions.
+
+    Parameters
+    ----------
+    pat : str/regex
+        If specified only options matching ``pat*`` will be reset.
+        Pass ``"all"`` as argument to reset all options.
+
+        .. warning::
+
+            Partial matches are supported for convenience, but unless you
+            use the full option name (e.g. x.y.z.option_name), your code may break
+            in future versions if new options with similar names are introduced.
+
+    Returns
+    -------
+    None
+        No return value.
+
+    See Also
+    --------
+    get_option : Retrieve the value of the specified option.
+    set_option : Set the value of the specified option or options.
+    describe_option : Print the description for one or more registered options.
+
+    Notes
+    -----
+    For all available options, please view the
+    :ref:`User Guide <options.available>`.
+
+    Examples
+    --------
+    >>> pd.reset_option("display.max_columns")  # doctest: +SKIP
+    """
+    keys = _select_options(pat)
+
+    if len(keys) == 0:
+        raise OptionError(f"No such keys(s) for {pat=}")
+
+    if len(keys) > 1 and len(pat) < 4 and pat != "all":
+        raise ValueError(
+            "You must specify at least 4 characters when "
+            "resetting multiple keys, use the special keyword "
+            '"all" to reset all the options to their default value'
+        )
+
+    for k in keys:
+        set_option(k, _registered_options[k].defval)
+
+
+def get_default_val(pat: str):
+    key = _get_single_key(pat)
+    return _get_registered_option(key).defval
+
+
+class DictWrapper:
+    """provide attribute-style access to a nested dict"""
+
+    d: dict[str, Any]
+
+    def __init__(self, d: dict[str, Any], prefix: str = "") -> None:
+        object.__setattr__(self, "d", d)
+        object.__setattr__(self, "prefix", prefix)
+
+    def __setattr__(self, key: str, val: Any) -> None:
+        prefix = object.__getattribute__(self, "prefix")
+        if prefix:
+            prefix += "."
+        prefix += key
+        # you can't set new keys
+        # can you can't overwrite subtrees
+        if key in self.d and not isinstance(self.d[key], dict):
+            set_option(prefix, val)
+        else:
+            raise OptionError("You can only set the value of existing options")
+
+    def __getattr__(self, key: str):
+        prefix = object.__getattribute__(self, "prefix")
+        if prefix:
+            prefix += "."
+        prefix += key
+        try:
+            v = object.__getattribute__(self, "d")[key]
+        except KeyError as err:
+            raise OptionError("No such option") from err
+        if isinstance(v, dict):
+            return DictWrapper(v, prefix)
+        else:
+            return get_option(prefix)
+
+    def __dir__(self) -> list[str]:
+        return list(self.d.keys())
+
+
+options = DictWrapper(_global_config)
+# DictWrapper defines a custom setattr
+object.__setattr__(options, "__module__", "pandas")
+
+#
+# Functions for use by pandas developers, in addition to User - api
+
+
+@contextmanager
+def option_context(*args) -> Generator[None]:
+    """
+    Context manager to temporarily set options in a ``with`` statement.
+
+    This method allows users to set one or more pandas options temporarily
+    within a controlled block. The previous options' values are restored
+    once the block is exited. This is useful when making temporary adjustments
+    to pandas' behavior without affecting the global state.
+
+    Parameters
+    ----------
+    *args : str | object | dict
+        An even amount of arguments provided in pairs which will be
+        interpreted as (pattern, value) pairs. Alternatively, a single
+        dictionary of {pattern: value} may be provided.
+
+    Returns
+    -------
+    None
+        No return value.
+
+    Yields
+    ------
+    None
+        No yield value.
+
+    See Also
+    --------
+    get_option : Retrieve the value of the specified option.
+    set_option : Set the value of the specified option.
+    reset_option : Reset one or more options to their default value.
+    describe_option : Print the description for one or more registered options.
+
+    Notes
+    -----
+    For all available options, please view the :ref:`User Guide <options.available>`
+    or use ``pandas.describe_option()``.
+
+    Examples
+    --------
+    >>> from pandas import option_context
+    >>> with option_context("display.max_rows", 10, "display.max_columns", 5):
+    ...     pass
+    >>> with option_context({"display.max_rows": 10, "display.max_columns": 5}):
+    ...     pass
+    """
+    if len(args) == 1 and isinstance(args[0], dict):
+        args = tuple(kv for item in args[0].items() for kv in item)
+
+    if len(args) % 2 != 0 or len(args) < 2:
+        raise ValueError(
+            "Provide an even amount of arguments as "
+            "option_context(pat, val, pat, val...)."
+        )
+
+    ops = tuple(zip(args[::2], args[1::2], strict=True))
+    undo: tuple[tuple[Any, Any], ...] = ()
+    try:
+        undo = tuple((pat, get_option(pat)) for pat, val in ops)
+        for pat, val in ops:
+            set_option(pat, val)
+        yield
+    finally:
+        for pat, val in undo:
+            set_option(pat, val)
+
+
+def register_option(
+    key: str,
+    defval: object,
+    doc: str = "",
+    validator: Callable[[object], Any] | None = None,
+    cb: Callable[[str], Any] | None = None,
+) -> None:
+    """
+    Register an option in the package-wide pandas config object
+
+    Parameters
+    ----------
+    key : str
+        Fully-qualified key, e.g. "x.y.option - z".
+    defval : object
+        Default value of the option.
+    doc : str
+        Description of the option.
+    validator : Callable, optional
+        Function of a single argument, should raise `ValueError` if
+        called with a value which is not a legal value for the option.
+    cb
+        a function of a single argument "key", which is called
+        immediately after an option value is set/reset. key is
+        the full name of the option.
+
+    Raises
+    ------
+    ValueError if `validator` is specified and `defval` is not a valid value.
+
+    """
+    import keyword
+    import tokenize
+
+    key = key.lower()
+
+    if key in _registered_options:
+        raise OptionError(f"Option '{key}' has already been registered")
+    if key in _reserved_keys:
+        raise OptionError(f"Option '{key}' is a reserved key")
+
+    # the default value should be legal
+    if validator:
+        validator(defval)
+
+    # walk the nested dict, creating dicts as needed along the path
+    path = key.split(".")
+
+    for k in path:
+        if not re.match("^" + tokenize.Name + "$", k):
+            raise ValueError(f"{k} is not a valid identifier")
+        if keyword.iskeyword(k):
+            raise ValueError(f"{k} is a python keyword")
+
+    cursor = _global_config
+    msg = "Path prefix to option '{option}' is already an option"
+
+    for i, p in enumerate(path[:-1]):
+        if not isinstance(cursor, dict):
+            raise OptionError(msg.format(option=".".join(path[:i])))
+        if p not in cursor:
+            cursor[p] = {}
+        cursor = cursor[p]
+
+    if not isinstance(cursor, dict):
+        raise OptionError(msg.format(option=".".join(path[:-1])))
+
+    cursor[path[-1]] = defval  # initialize
+
+    # save the option metadata
+    _registered_options[key] = RegisteredOption(
+        key=key, defval=defval, doc=doc, validator=validator, cb=cb
+    )
+
+
+def deprecate_option(
+    key: str,
+    category: type[Warning],
+    msg: str | None = None,
+    rkey: str | None = None,
+    removal_ver: str | None = None,
+) -> None:
+    """
+    Mark option `key` as deprecated, if code attempts to access this option,
+    a warning will be produced, using `msg` if given, or a default message
+    if not.
+    if `rkey` is given, any access to the key will be re-routed to `rkey`.
+
+    Neither the existence of `key` nor that if `rkey` is checked. If they
+    do not exist, any subsequence access will fail as usual, after the
+    deprecation warning is given.
+
+    Parameters
+    ----------
+    key : str
+        Name of the option to be deprecated.
+        must be a fully-qualified option name (e.g "x.y.z.rkey").
+    category : Warning
+        Warning class for the deprecation.
+    msg : str, optional
+        Warning message to output when the key is referenced.
+        if no message is given a default message will be emitted.
+    rkey : str, optional
+        Name of an option to reroute access to.
+        If specified, any referenced `key` will be
+        re-routed to `rkey` including set/get/reset.
+        rkey must be a fully-qualified option name (e.g "x.y.z.rkey").
+        used by the default message if no `msg` is specified.
+    removal_ver : str, optional
+        Specifies the version in which this option will
+        be removed. used by the default message if no `msg` is specified.
+
+    Raises
+    ------
+    OptionError
+        If the specified key has already been deprecated.
+    """
+    key = key.lower()
+
+    if key in _deprecated_options:
+        raise OptionError(f"Option '{key}' has already been defined as deprecated.")
+
+    _deprecated_options[key] = DeprecatedOption(key, category, msg, rkey, removal_ver)
+
+
+#
+# functions internal to the module
+
+
+def _select_options(pat: str) -> list[str]:
+    """
+    returns a list of keys matching `pat`
+
+    if pat=="all", returns all registered options
+    """
+    # short-circuit for exact key
+    if pat in _registered_options:
+        return [pat]
+
+    # else look through all of them
+    keys = sorted(_registered_options.keys())
+    if pat == "all":  # reserved key
+        return keys
+
+    return [k for k in keys if re.search(pat, k, re.I)]
+
+
+def _get_root(key: str) -> tuple[dict[str, Any], str]:
+    path = key.split(".")
+    cursor = _global_config
+    for p in path[:-1]:
+        cursor = cursor[p]
+    return cursor, path[-1]
+
+
+def _get_deprecated_option(key: str):
+    """
+    Retrieves the metadata for a deprecated option, if `key` is deprecated.
+
+    Returns
+    -------
+    DeprecatedOption (namedtuple) if key is deprecated, None otherwise
+    """
+    try:
+        d = _deprecated_options[key]
+    except KeyError:
+        return None
+    else:
+        return d
+
+
+def _get_registered_option(key: str):
+    """
+    Retrieves the option metadata if `key` is a registered option.
+
+    Returns
+    -------
+    RegisteredOption (namedtuple) if key is deprecated, None otherwise
+    """
+    return _registered_options.get(key)
+
+
+def _translate_key(key: str) -> str:
+    """
+    if `key` is deprecated and a replacement key defined, will return the
+    replacement key, otherwise returns `key` as-is
+    """
+    d = _get_deprecated_option(key)
+    if d:
+        return d.rkey or key
+    else:
+        return key
+
+
+def _warn_if_deprecated(key: str) -> bool:
+    """
+    Checks if `key` is a deprecated option and if so, prints a warning.
+
+    Returns
+    -------
+    bool - True if `key` is deprecated, False otherwise.
+    """
+    d = _get_deprecated_option(key)
+    if d:
+        if d.msg:
+            warnings.warn(
+                d.msg,
+                d.category,
+                stacklevel=find_stack_level(),
+            )
+        else:
+            msg = f"'{key}' is deprecated"
+            if d.removal_ver:
+                msg += f" and will be removed in {d.removal_ver}"
+            if d.rkey:
+                msg += f", please use '{d.rkey}' instead."
+            else:
+                msg += ", please refrain from using it."
+
+            warnings.warn(
+                msg,
+                d.category,
+                stacklevel=find_stack_level(),
+            )
+        return True
+    return False
+
+
+def _build_option_description(k: str) -> str:
+    """Builds a formatted description of a registered option and prints it"""
+    o = _get_registered_option(k)
+    d = _get_deprecated_option(k)
+
+    s = f"{k} "
+
+    if o.doc:
+        s += "\n".join(o.doc.strip().split("\n"))
+    else:
+        s += "No description available."
+
+    if o:
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore", FutureWarning)
+            warnings.simplefilter("ignore", DeprecationWarning)
+            s += f"\n    [default: {o.defval}] [currently: {get_option(k)}]"
+
+    if d:
+        rkey = d.rkey or ""
+        s += "\n    (Deprecated"
+        s += f", use `{rkey}` instead."
+        s += ")"
+
+    return s
+
+
+# helpers
+
+
+@contextmanager
+def config_prefix(prefix: str) -> Generator[None]:
+    """
+    contextmanager for multiple invocations of API with a common prefix
+
+    supported API functions: (register / get / set )__option
+
+    Warning: This is not thread - safe, and won't work properly if you import
+    the API functions into your module using the "from x import y" construct.
+
+    Example
+    -------
+    import pandas._config.config as cf
+    with cf.config_prefix("display.font"):
+        cf.register_option("color", "red")
+        cf.register_option("size", " 5 pt")
+        cf.set_option(size, " 6 pt")
+        cf.get_option(size)
+        ...
+
+        etc'
+
+    will register options "display.font.color", "display.font.size", set the
+    value of "display.font.size"... and so on.
+    """
+    # Note: reset_option relies on set_option, and on key directly
+    # it does not fit in to this monkey-patching scheme
+
+    global register_option, get_option, set_option
+
+    def wrap(func: F) -> F:
+        def inner(key: str, *args, **kwds):
+            pkey = f"{prefix}.{key}"
+            return func(pkey, *args, **kwds)
+
+        return cast(F, inner)
+
+    _register_option = register_option
+    _get_option = get_option
+    _set_option = set_option
+    set_option = wrap(set_option)
+    get_option = wrap(get_option)
+    register_option = wrap(register_option)
+    try:
+        yield
+    finally:
+        set_option = _set_option
+        get_option = _get_option
+        register_option = _register_option
+
+
+# These factories and methods are handy for use as the validator
+# arg in register_option
+
+
+def is_type_factory(_type: type[Any]) -> Callable[[Any], None]:
+    """
+
+    Parameters
+    ----------
+    `_type` - a type to be compared against (e.g. type(x) == `_type`)
+
+    Returns
+    -------
+    validator - a function of a single argument x , which raises
+                ValueError if type(x) is not equal to `_type`
+
+    """
+
+    def inner(x) -> None:
+        if type(x) != _type:
+            raise ValueError(f"Value must have type '{_type}'")
+
+    return inner
+
+
+def is_instance_factory(_type: type | tuple[type, ...]) -> Callable[[Any], None]:
+    """
+
+    Parameters
+    ----------
+    `_type` - the type to be checked against
+
+    Returns
+    -------
+    validator - a function of a single argument x , which raises
+                ValueError if x is not an instance of `_type`
+
+    """
+    if isinstance(_type, tuple):
+        type_repr = "|".join(map(str, _type))
+    else:
+        type_repr = f"'{_type}'"
+
+    def inner(x) -> None:
+        if not isinstance(x, _type):
+            raise ValueError(f"Value must be an instance of {type_repr}")
+
+    return inner
+
+
+def is_one_of_factory(legal_values: Sequence) -> Callable[[Any], None]:
+    callables = [c for c in legal_values if callable(c)]
+    legal_values = [c for c in legal_values if not callable(c)]
+
+    def inner(x) -> None:
+        if x not in legal_values:
+            if not any(c(x) for c in callables):
+                uvals = [str(lval) for lval in legal_values]
+                pp_values = "|".join(uvals)
+                msg = f"Value must be one of {pp_values}"
+                if len(callables):
+                    msg += " or a callable"
+                raise ValueError(msg)
+
+    return inner
+
+
+def is_nonnegative_int(value: object) -> None:
+    """
+    Verify that value is None or a positive int.
+
+    Parameters
+    ----------
+    value : None or int
+            The `value` to be checked.
+
+    Raises
+    ------
+    ValueError
+        When the value is not None or is a negative integer
+    """
+    if value is None:
+        return
+
+    elif isinstance(value, int):
+        if value >= 0:
+            return
+
+    msg = "Value must be a nonnegative integer or None"
+    raise ValueError(msg)
+
+
+# common type validators, for convenience
+# usage: register_option(... , validator = is_int)
+is_int = is_type_factory(int)
+is_bool = is_type_factory(bool)
+is_float = is_type_factory(float)
+is_str = is_type_factory(str)
+is_text = is_instance_factory((str, bytes))
+
+
+def is_callable(obj: object) -> bool:
+    """
+
+    Parameters
+    ----------
+    `obj` - the object to be checked
+
+    Returns
+    -------
+    validator - returns True if object is callable
+        raises ValueError otherwise.
+
+    """
+    if not callable(obj):
+        raise ValueError("Value must be a callable")
+    return True
+
+
+# import set_module here would cause circular import
+get_option.__module__ = "pandas"
+set_option.__module__ = "pandas"
+describe_option.__module__ = "pandas"
+reset_option.__module__ = "pandas"
+option_context.__module__ = "pandas"
diff --git a/pandas/_config/dates.py b/pandas/_config/dates.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d9f5d390dc9c82d66c2f02c20f8d569085db177
--- /dev/null
+++ b/pandas/_config/dates.py
@@ -0,0 +1,26 @@
+"""
+config for datetime formatting
+"""
+
+from __future__ import annotations
+
+from pandas._config import config as cf
+
+pc_date_dayfirst_doc = """
+: boolean
+    When True, prints and parses dates with the day first, eg 20/01/2005
+"""
+
+pc_date_yearfirst_doc = """
+: boolean
+    When True, prints and parses dates with the year first, eg 2005/01/20
+"""
+
+with cf.config_prefix("display"):
+    # Needed upstream of `_libs` because these are used in tslibs.parsing
+    cf.register_option(
+        "date_dayfirst", False, pc_date_dayfirst_doc, validator=cf.is_bool
+    )
+    cf.register_option(
+        "date_yearfirst", False, pc_date_yearfirst_doc, validator=cf.is_bool
+    )
diff --git a/pandas/_config/display.py b/pandas/_config/display.py
new file mode 100644
index 0000000000000000000000000000000000000000..df2c3ad36c855d77c33d80c78c3d83ab3c09d5f9
--- /dev/null
+++ b/pandas/_config/display.py
@@ -0,0 +1,62 @@
+"""
+Unopinionated display configuration.
+"""
+
+from __future__ import annotations
+
+import locale
+import sys
+
+from pandas._config import config as cf
+
+# -----------------------------------------------------------------------------
+# Global formatting options
+_initial_defencoding: str | None = None
+
+
+def detect_console_encoding() -> str:
+    """
+    Try to find the most capable encoding supported by the console.
+    slightly modified from the way IPython handles the same issue.
+    """
+    global _initial_defencoding
+
+    encoding = None
+    try:
+        encoding = sys.stdout.encoding or sys.stdin.encoding
+    except (AttributeError, OSError):
+        pass
+
+    # try again for something better
+    if not encoding or "ascii" in encoding.lower():
+        try:
+            encoding = locale.getpreferredencoding()
+        except locale.Error:
+            # can be raised by locale.setlocale(), which is
+            #  called by getpreferredencoding
+            #  (on some systems, see stdlib locale docs)
+            pass
+
+    # when all else fails. this will usually be "ascii"
+    if not encoding or "ascii" in encoding.lower():
+        encoding = sys.getdefaultencoding()
+
+    # GH#3360, save the reported defencoding at import time
+    # MPL backends may change it. Make available for debugging.
+    if not _initial_defencoding:
+        _initial_defencoding = sys.getdefaultencoding()
+
+    return encoding
+
+
+pc_encoding_doc = """
+: str/unicode
+    Defaults to the detected encoding of the console.
+    Specifies the encoding to be used for strings returned by to_string,
+    these are generally strings meant to be displayed on the console.
+"""
+
+with cf.config_prefix("display"):
+    cf.register_option(
+        "encoding", detect_console_encoding(), pc_encoding_doc, validator=cf.is_text
+    )
diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e2af78f68eb21201476bcc42afaba419f8e1a8a
--- /dev/null
+++ b/pandas/_config/localization.py
@@ -0,0 +1,176 @@
+"""
+Helpers for configuring locale settings.
+
+Name `localization` is chosen to avoid overlap with builtin `locale` module.
+"""
+
+from __future__ import annotations
+
+from contextlib import contextmanager
+import locale
+import platform
+import re
+import subprocess
+from typing import (
+    TYPE_CHECKING,
+    cast,
+)
+
+from pandas._config.config import options
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+@contextmanager
+def set_locale(
+    new_locale: str | tuple[str, str], lc_var: int = locale.LC_ALL
+) -> Generator[str | tuple[str, str]]:
+    """
+    Context manager for temporarily setting a locale.
+
+    Parameters
+    ----------
+    new_locale : str or tuple
+        A string of the form <language_country>.<encoding>. For example to set
+        the current locale to US English with a UTF8 encoding, you would pass
+        "en_US.UTF-8".
+    lc_var : int, default `locale.LC_ALL`
+        The category of the locale being set.
+
+    Notes
+    -----
+    This is useful when you want to run a particular block of code under a
+    particular locale, without globally setting the locale. This probably isn't
+    thread-safe.
+    """
+    # getlocale is not always compliant with setlocale, use setlocale. GH#46595
+    current_locale = locale.setlocale(lc_var)
+
+    try:
+        locale.setlocale(lc_var, new_locale)
+        normalized_code, normalized_encoding = locale.getlocale()
+        if normalized_code is not None and normalized_encoding is not None:
+            yield f"{normalized_code}.{normalized_encoding}"
+        else:
+            yield new_locale
+    finally:
+        locale.setlocale(lc_var, current_locale)
+
+
+def can_set_locale(lc: str, lc_var: int = locale.LC_ALL) -> bool:
+    """
+    Check to see if we can set a locale, and subsequently get the locale,
+    without raising an Exception.
+
+    Parameters
+    ----------
+    lc : str
+        The locale to attempt to set.
+    lc_var : int, default `locale.LC_ALL`
+        The category of the locale being set.
+
+    Returns
+    -------
+    bool
+        Whether the passed locale can be set
+    """
+    try:
+        with set_locale(lc, lc_var=lc_var):
+            pass
+    except (ValueError, locale.Error):
+        # horrible name for an Exception subclass
+        return False
+    else:
+        return True
+
+
+def _valid_locales(locales: list[str] | str, normalize: bool) -> list[str]:
+    """
+    Return a list of normalized locales that do not throw an ``Exception``
+    when set.
+
+    Parameters
+    ----------
+    locales : str
+        A string where each locale is separated by a newline.
+    normalize : bool
+        Whether to call ``locale.normalize`` on each locale.
+
+    Returns
+    -------
+    valid_locales : list
+        A list of valid locales.
+    """
+    return [
+        loc
+        for loc in (
+            locale.normalize(loc.strip()) if normalize else loc.strip()
+            for loc in locales
+        )
+        if can_set_locale(loc)
+    ]
+
+
+def get_locales(
+    prefix: str | None = None,
+    normalize: bool = True,
+) -> list[str]:
+    """
+    Get all the locales that are available on the system.
+
+    Parameters
+    ----------
+    prefix : str
+        If not ``None`` then return only those locales with the prefix
+        provided. For example to get all English language locales (those that
+        start with ``"en"``), pass ``prefix="en"``.
+    normalize : bool
+        Call ``locale.normalize`` on the resulting list of available locales.
+        If ``True``, only locales that can be set without throwing an
+        ``Exception`` are returned.
+
+    Returns
+    -------
+    locales : list of strings
+        A list of locale strings that can be set with ``locale.setlocale()``.
+        For example::
+
+            locale.setlocale(locale.LC_ALL, locale_string)
+
+    On error will return an empty list (no locale available, e.g. Windows)
+
+    """
+    if platform.system() in ("Linux", "Darwin"):
+        raw_locales = subprocess.check_output(["locale", "-a"])
+    else:
+        # Other platforms e.g. windows platforms don't define "locale -a"
+        #  Note: is_platform_windows causes circular import here
+        return []
+
+    try:
+        # raw_locales is "\n" separated list of locales
+        # it may contain non-decodable parts, so split
+        # extract what we can and then rejoin.
+        split_raw_locales = raw_locales.split(b"\n")
+        out_locales = []
+        for x in split_raw_locales:
+            try:
+                out_locales.append(str(x, encoding=cast(str, options.display.encoding)))
+            except UnicodeError:
+                # 'locale -a' is used to populated 'raw_locales' and on
+                # Redhat 7 Linux (and maybe others) prints locale names
+                # using windows-1252 encoding.  Bug only triggered by
+                # a few special characters and when there is an
+                # extensive list of installed locales.
+                out_locales.append(str(x, encoding="windows-1252"))
+
+    except TypeError:
+        pass
+
+    if prefix is None:
+        return _valid_locales(out_locales, normalize)
+
+    pattern = re.compile(f"{prefix}.*")
+    found = pattern.findall("\n".join(out_locales))
+    return _valid_locales(found, normalize)
diff --git a/pandas/_libs/__init__.py b/pandas/_libs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d499f9a6cd75e53dc63a4a83c073449025caac94
--- /dev/null
+++ b/pandas/_libs/__init__.py
@@ -0,0 +1,27 @@
+__all__ = [
+    "Interval",
+    "NaT",
+    "NaTType",
+    "OutOfBoundsDatetime",
+    "Period",
+    "Timedelta",
+    "Timestamp",
+    "iNaT",
+]
+
+
+# Below imports needs to happen first to ensure pandas top level
+# module gets monkeypatched with the pandas_datetime_CAPI
+# see pandas_datetime_exec in pd_datetime.c
+import pandas._libs.pandas_parser  # isort: skip # type: ignore[reportUnusedImport]
+import pandas._libs.pandas_datetime  # noqa: F401 # isort: skip # type: ignore[reportUnusedImport]
+from pandas._libs.interval import Interval
+from pandas._libs.tslibs import (
+    NaT,
+    NaTType,
+    OutOfBoundsDatetime,
+    Period,
+    Timedelta,
+    Timestamp,
+    iNaT,
+)
diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..0a6be851e1efd0389eab3398462d3fa0e7e5946f
--- /dev/null
+++ b/pandas/_libs/algos.pyi
@@ -0,0 +1,443 @@
+from typing import Any
+
+import numpy as np
+
+from pandas._typing import npt
+
+class Infinity:
+    def __eq__(self, other) -> bool: ...
+    def __ne__(self, other) -> bool: ...
+    def __lt__(self, other) -> bool: ...
+    def __le__(self, other) -> bool: ...
+    def __gt__(self, other) -> bool: ...
+    def __ge__(self, other) -> bool: ...
+
+class NegInfinity:
+    def __eq__(self, other) -> bool: ...
+    def __ne__(self, other) -> bool: ...
+    def __lt__(self, other) -> bool: ...
+    def __le__(self, other) -> bool: ...
+    def __gt__(self, other) -> bool: ...
+    def __ge__(self, other) -> bool: ...
+
+def unique_deltas(
+    arr: np.ndarray,  # const int64_t[:]
+) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=1]
+def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ...
+def groupsort_indexer(
+    index: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+) -> tuple[
+    np.ndarray,  # ndarray[int64_t, ndim=1]
+    np.ndarray,  # ndarray[int64_t, ndim=1]
+]: ...
+def kth_smallest(
+    arr: np.ndarray,  # numeric[:]
+    k: int,
+) -> Any: ...  # numeric
+
+# ----------------------------------------------------------------------
+# Pairwise correlation/covariance
+
+def nancorr(
+    mat: npt.NDArray[np.float64],  # const float64_t[:, :]
+    cov: bool = ...,
+    minp: int | None = ...,
+) -> npt.NDArray[np.float64]: ...  # ndarray[float64_t, ndim=2]
+def nancorr_spearman(
+    mat: npt.NDArray[np.float64],  # ndarray[float64_t, ndim=2]
+    minp: int = ...,
+) -> npt.NDArray[np.float64]: ...  # ndarray[float64_t, ndim=2]
+
+# ----------------------------------------------------------------------
+
+def validate_limit(nobs: int | None, limit=...) -> int: ...
+def get_fill_indexer(
+    mask: npt.NDArray[np.bool_],
+    limit: int | None = None,
+) -> npt.NDArray[np.intp]: ...
+def pad(
+    old: np.ndarray,  # ndarray[numeric_object_t]
+    new: np.ndarray,  # ndarray[numeric_object_t]
+    limit=...,
+) -> npt.NDArray[np.intp]: ...  # np.ndarray[np.intp, ndim=1]
+def pad_inplace(
+    values: np.ndarray,  # numeric_object_t[:]
+    mask: np.ndarray,  # uint8_t[:]
+    limit=...,
+) -> None: ...
+def pad_2d_inplace(
+    values: np.ndarray,  # numeric_object_t[:, :]
+    mask: np.ndarray,  # const uint8_t[:, :]
+    limit=...,
+) -> None: ...
+def backfill(
+    old: np.ndarray,  # ndarray[numeric_object_t]
+    new: np.ndarray,  # ndarray[numeric_object_t]
+    limit=...,
+) -> npt.NDArray[np.intp]: ...  # np.ndarray[np.intp, ndim=1]
+def backfill_inplace(
+    values: np.ndarray,  # numeric_object_t[:]
+    mask: np.ndarray,  # uint8_t[:]
+    limit=...,
+) -> None: ...
+def backfill_2d_inplace(
+    values: np.ndarray,  # numeric_object_t[:, :]
+    mask: np.ndarray,  # const uint8_t[:, :]
+    limit=...,
+) -> None: ...
+def is_monotonic(
+    arr: np.ndarray,  # ndarray[numeric_object_t, ndim=1]
+    timelike: bool,
+) -> tuple[bool, bool, bool]: ...
+
+# ----------------------------------------------------------------------
+# rank_1d, rank_2d
+# ----------------------------------------------------------------------
+
+def rank_1d(
+    values: np.ndarray,  # ndarray[numeric_object_t, ndim=1]
+    labels: np.ndarray | None = ...,  # const int64_t[:]=None
+    is_datetimelike: bool = ...,
+    ties_method=...,
+    ascending: bool = ...,
+    pct: bool = ...,
+    na_option=...,
+    mask: npt.NDArray[np.bool_] | None = ...,
+) -> np.ndarray: ...  # np.ndarray[float64_t, ndim=1]
+def rank_2d(
+    in_arr: np.ndarray,  # ndarray[numeric_object_t, ndim=2]
+    axis: int = ...,
+    is_datetimelike: bool = ...,
+    ties_method=...,
+    ascending: bool = ...,
+    na_option=...,
+    pct: bool = ...,
+) -> np.ndarray: ...  # np.ndarray[float64_t, ndim=1]
+def diff_2d(
+    arr: np.ndarray,  # ndarray[diff_t, ndim=2]
+    out: np.ndarray,  # ndarray[out_t, ndim=2]
+    periods: int,
+    axis: int,
+    datetimelike: bool = ...,
+) -> None: ...
+def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ...
+def ensure_object(arr: object) -> npt.NDArray[np.object_]: ...
+def ensure_float64(arr: object) -> npt.NDArray[np.float64]: ...
+def ensure_int8(arr: object) -> npt.NDArray[np.int8]: ...
+def ensure_int16(arr: object) -> npt.NDArray[np.int16]: ...
+def ensure_int32(arr: object) -> npt.NDArray[np.int32]: ...
+def ensure_int64(arr: object) -> npt.NDArray[np.int64]: ...
+def ensure_uint64(arr: object) -> npt.NDArray[np.uint64]: ...
+def take_1d_int8_int8(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int8_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int8_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int8_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int16_int16(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int16_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int16_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int16_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int32_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int32_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int32_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int64_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_uint16_uint16(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_uint32_uint32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_uint64_uint64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_int64_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_float32_float32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_float32_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_float64_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_object_object(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_bool_bool(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_1d_bool_object(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int8_int8(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int8_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int8_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int8_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int16_int16(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int16_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int16_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int16_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int32_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int32_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int32_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int64_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_int64_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_uint16_uint16(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_uint32_uint32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_uint64_uint64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_float32_float32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_float32_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_float64_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_object_object(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_bool_bool(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis0_bool_object(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int8_int8(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int8_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int8_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int8_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int16_int16(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int16_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int16_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int16_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int32_int32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int32_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int32_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int64_int64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_uint16_uint16(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_uint32_uint32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_uint64_uint64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_int64_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_float32_float32(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_float32_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_float64_float64(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_object_object(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_bool_bool(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_axis1_bool_object(
+    values: np.ndarray, indexer: npt.NDArray[np.intp], out: np.ndarray, fill_value=...
+) -> None: ...
+def take_2d_multi_int8_int8(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int8_int32(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int8_int64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int8_float64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int16_int16(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int16_int32(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int16_int64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int16_float64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int32_int32(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int32_int64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int32_float64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int64_float64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_float32_float32(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_float32_float64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_float64_float64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_object_object(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_bool_bool(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_bool_object(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
+def take_2d_multi_int64_int64(
+    values: np.ndarray,
+    indexer: tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]],
+    out: np.ndarray,
+    fill_value=...,
+) -> None: ...
diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..7b373240952ca308391619c25193cc81627098b0
--- /dev/null
+++ b/pandas/_libs/arrays.pyi
@@ -0,0 +1,40 @@
+from collections.abc import Sequence
+from typing import Self
+
+import numpy as np
+
+from pandas._typing import (
+    AxisInt,
+    DtypeObj,
+    Shape,
+)
+
+class NDArrayBacked:
+    _dtype: DtypeObj
+    _ndarray: np.ndarray
+    def __init__(self, values: np.ndarray, dtype: DtypeObj) -> None: ...
+    @classmethod
+    def _simple_new(cls, values: np.ndarray, dtype: DtypeObj) -> Self: ...
+    def _from_backing_data(self, values: np.ndarray) -> Self: ...
+    def __setstate__(self, state) -> None: ...
+    def __len__(self) -> int: ...
+    @property
+    def shape(self) -> Shape: ...
+    @property
+    def ndim(self) -> int: ...
+    @property
+    def size(self) -> int: ...
+    @property
+    def nbytes(self) -> int: ...
+    def copy(self, order=...) -> Self: ...
+    def delete(self, loc, axis=...) -> Self: ...
+    def swapaxes(self, axis1, axis2) -> Self: ...
+    def repeat(self, repeats: int | Sequence[int], axis: int | None = ...) -> Self: ...
+    def reshape(self, *args, **kwargs) -> Self: ...
+    def ravel(self, order=...) -> Self: ...
+    @property
+    def T(self) -> Self: ...
+    @classmethod
+    def _concat_same_type(
+        cls, to_concat: Sequence[Self], axis: AxisInt = ...
+    ) -> Self: ...
diff --git a/pandas/_libs/byteswap.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/byteswap.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..b81736073b03355e3bb7d0c8af27c9bb0d8c9201
Binary files /dev/null and b/pandas/_libs/byteswap.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/byteswap.pyi b/pandas/_libs/byteswap.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..bb0dbfc6a50b1bb7cd509dc5b3dfeed55ad70b09
--- /dev/null
+++ b/pandas/_libs/byteswap.pyi
@@ -0,0 +1,5 @@
+def read_float_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
+def read_double_with_byteswap(data: bytes, offset: int, byteswap: bool) -> float: ...
+def read_uint16_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
+def read_uint32_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
+def read_uint64_with_byteswap(data: bytes, offset: int, byteswap: bool) -> int: ...
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..803c2cb0b0d19f53863a7e5fb8d976431f842f4a
--- /dev/null
+++ b/pandas/_libs/groupby.pyi
@@ -0,0 +1,234 @@
+from typing import Literal
+
+import numpy as np
+
+from pandas._typing import npt
+
+def group_median_float64(
+    out: np.ndarray,  # ndarray[float64_t, ndim=2]
+    counts: npt.NDArray[np.int64],
+    values: np.ndarray,  # ndarray[float64_t, ndim=2]
+    labels: npt.NDArray[np.int64],
+    min_count: int = ...,  # Py_ssize_t
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    is_datetimelike: bool = ...,  # bint
+    skipna: bool = ...,
+) -> None: ...
+def group_cumprod(
+    out: np.ndarray,  # float64_t[:, ::1]
+    values: np.ndarray,  # const float64_t[:, :]
+    labels: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+    is_datetimelike: bool,
+    skipna: bool = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+) -> None: ...
+def group_cumsum(
+    out: np.ndarray,  # int64float_t[:, ::1]
+    values: np.ndarray,  # ndarray[int64float_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+    is_datetimelike: bool,
+    skipna: bool = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+) -> None: ...
+def group_shift_indexer(
+    out: np.ndarray,  # int64_t[::1]
+    labels: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+    periods: int,
+) -> None: ...
+def group_fillna_indexer(
+    out: np.ndarray,  # ndarray[intp_t]
+    labels: np.ndarray,  # ndarray[int64_t]
+    mask: npt.NDArray[np.uint8],
+    limit: int,  # int64_t
+    compute_ffill: bool,
+    ngroups: int,
+) -> None: ...
+def group_any_all(
+    out: np.ndarray,  # uint8_t[::1]
+    values: np.ndarray,  # const uint8_t[::1]
+    labels: np.ndarray,  # const int64_t[:]
+    mask: np.ndarray,  # const uint8_t[::1]
+    val_test: Literal["any", "all"],
+    skipna: bool,
+    result_mask: np.ndarray | None,
+) -> None: ...
+def group_sum(
+    out: np.ndarray,  # complexfloatingintuint_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[complexfloatingintuint_t, ndim=2]
+    labels: np.ndarray,  # const intp_t[:]
+    mask: np.ndarray | None,
+    result_mask: np.ndarray | None = ...,
+    min_count: int = ...,
+    is_datetimelike: bool = ...,
+    initial: object = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_prod(
+    out: np.ndarray,  # int64float_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[int64float_t, ndim=2]
+    labels: np.ndarray,  # const intp_t[:]
+    mask: np.ndarray | None,
+    result_mask: np.ndarray | None = ...,
+    min_count: int = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_var(
+    out: np.ndarray,  # floating[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[floating, ndim=2]
+    labels: np.ndarray,  # const intp_t[:]
+    min_count: int = ...,  # Py_ssize_t
+    ddof: int = ...,  # int64_t
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    is_datetimelike: bool = ...,
+    name: str = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_skew(
+    out: np.ndarray,  # float64_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[float64_T, ndim=2]
+    labels: np.ndarray,  # const intp_t[::1]
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_kurt(
+    out: np.ndarray,  # float64_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[float64_T, ndim=2]
+    labels: np.ndarray,  # const intp_t[::1]
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_mean(
+    out: np.ndarray,  # floating[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[floating, ndim=2]
+    labels: np.ndarray,  # const intp_t[:]
+    min_count: int = ...,  # Py_ssize_t
+    is_datetimelike: bool = ...,  # bint
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_ohlc(
+    out: np.ndarray,  # floatingintuint_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[floatingintuint_t, ndim=2]
+    labels: np.ndarray,  # const intp_t[:]
+    min_count: int = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+) -> None: ...
+def group_quantile(
+    out: npt.NDArray[np.float64],
+    values: np.ndarray,  # ndarray[numeric, ndim=1]
+    labels: npt.NDArray[np.intp],
+    mask: npt.NDArray[np.uint8],
+    qs: npt.NDArray[np.float64],  # const
+    starts: npt.NDArray[np.int64],
+    ends: npt.NDArray[np.int64],
+    interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
+    result_mask: np.ndarray | None,
+    is_datetimelike: bool,
+) -> None: ...
+def group_last(
+    out: np.ndarray,  # rank_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[rank_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    mask: npt.NDArray[np.bool_] | None,
+    result_mask: npt.NDArray[np.bool_] | None = ...,
+    min_count: int = ...,  # Py_ssize_t
+    is_datetimelike: bool = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_nth(
+    out: np.ndarray,  # rank_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[rank_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    mask: npt.NDArray[np.bool_] | None,
+    result_mask: npt.NDArray[np.bool_] | None = ...,
+    min_count: int = ...,  # int64_t
+    rank: int = ...,  # int64_t
+    is_datetimelike: bool = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_rank(
+    out: np.ndarray,  # float64_t[:, ::1]
+    values: np.ndarray,  # ndarray[rank_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+    is_datetimelike: bool,
+    ties_method: Literal["average", "min", "max", "first", "dense"] = ...,
+    ascending: bool = ...,
+    pct: bool = ...,
+    na_option: Literal["keep", "top", "bottom"] = ...,
+    mask: npt.NDArray[np.bool_] | None = ...,
+) -> None: ...
+def group_max(
+    out: np.ndarray,  # groupby_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[groupby_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    min_count: int = ...,
+    is_datetimelike: bool = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_min(
+    out: np.ndarray,  # groupby_t[:, ::1]
+    counts: np.ndarray,  # int64_t[::1]
+    values: np.ndarray,  # ndarray[groupby_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    min_count: int = ...,
+    is_datetimelike: bool = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_idxmin_idxmax(
+    out: npt.NDArray[np.intp],
+    counts: npt.NDArray[np.int64],
+    values: np.ndarray,  # ndarray[groupby_t, ndim=2]
+    labels: npt.NDArray[np.intp],
+    min_count: int = ...,
+    is_datetimelike: bool = ...,
+    mask: np.ndarray | None = ...,
+    name: str = ...,
+    skipna: bool = ...,
+    result_mask: np.ndarray | None = ...,
+) -> None: ...
+def group_cummin(
+    out: np.ndarray,  # groupby_t[:, ::1]
+    values: np.ndarray,  # ndarray[groupby_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+    is_datetimelike: bool,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
+def group_cummax(
+    out: np.ndarray,  # groupby_t[:, ::1]
+    values: np.ndarray,  # ndarray[groupby_t, ndim=2]
+    labels: np.ndarray,  # const int64_t[:]
+    ngroups: int,
+    is_datetimelike: bool,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
+    skipna: bool = ...,
+) -> None: ...
diff --git a/pandas/_libs/hashing.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/hashing.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..28e6528ace127ef506c9d68897fefbb6562cb25a
Binary files /dev/null and b/pandas/_libs/hashing.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/hashing.pyi b/pandas/_libs/hashing.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..8361026e4a87d462e04c53f7f1f8aee8a7f6ffe0
--- /dev/null
+++ b/pandas/_libs/hashing.pyi
@@ -0,0 +1,9 @@
+import numpy as np
+
+from pandas._typing import npt
+
+def hash_object_array(
+    arr: npt.NDArray[np.object_],
+    key: str,
+    encoding: str = ...,
+) -> npt.NDArray[np.uint64]: ...
diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..5ee359d84a6ed786dea6fc3ac9659c3adf7c8f70
--- /dev/null
+++ b/pandas/_libs/hashtable.pyi
@@ -0,0 +1,274 @@
+from collections.abc import Hashable
+from typing import (
+    Any,
+    Literal,
+    overload,
+)
+
+import numpy as np
+
+from pandas._typing import npt
+
+def unique_label_indices(
+    labels: np.ndarray,  # const int64_t[:]
+) -> np.ndarray: ...
+
+class Factorizer:
+    count: int
+    uniques: Any
+    def __init__(self, size_hint: int, uses_mask: bool = False) -> None: ...
+    def get_count(self) -> int: ...
+    def factorize(
+        self,
+        values: np.ndarray,
+        na_sentinel=...,
+        na_value=...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+    def hash_inner_join(
+        self, values: np.ndarray, mask=...
+    ) -> tuple[np.ndarray, np.ndarray]: ...
+
+class ObjectFactorizer(Factorizer):
+    table: PyObjectHashTable
+    uniques: ObjectVector
+
+class Int64Factorizer(Factorizer):
+    table: Int64HashTable
+    uniques: Int64Vector
+
+class UInt64Factorizer(Factorizer):
+    table: UInt64HashTable
+    uniques: UInt64Vector
+
+class Int32Factorizer(Factorizer):
+    table: Int32HashTable
+    uniques: Int32Vector
+
+class UInt32Factorizer(Factorizer):
+    table: UInt32HashTable
+    uniques: UInt32Vector
+
+class Int16Factorizer(Factorizer):
+    table: Int16HashTable
+    uniques: Int16Vector
+
+class UInt16Factorizer(Factorizer):
+    table: UInt16HashTable
+    uniques: UInt16Vector
+
+class Int8Factorizer(Factorizer):
+    table: Int8HashTable
+    uniques: Int8Vector
+
+class UInt8Factorizer(Factorizer):
+    table: UInt8HashTable
+    uniques: UInt8Vector
+
+class Float64Factorizer(Factorizer):
+    table: Float64HashTable
+    uniques: Float64Vector
+
+class Float32Factorizer(Factorizer):
+    table: Float32HashTable
+    uniques: Float32Vector
+
+class Complex64Factorizer(Factorizer):
+    table: Complex64HashTable
+    uniques: Complex64Vector
+
+class Complex128Factorizer(Factorizer):
+    table: Complex128HashTable
+    uniques: Complex128Vector
+
+class Int64Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.int64]: ...
+
+class Int32Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.int32]: ...
+
+class Int16Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.int16]: ...
+
+class Int8Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.int8]: ...
+
+class UInt64Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.uint64]: ...
+
+class UInt32Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.uint32]: ...
+
+class UInt16Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.uint16]: ...
+
+class UInt8Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.uint8]: ...
+
+class Float64Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.float64]: ...
+
+class Float32Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.float32]: ...
+
+class Complex128Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.complex128]: ...
+
+class Complex64Vector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.complex64]: ...
+
+class StringVector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.object_]: ...
+
+class ObjectVector:
+    def __init__(self, *args) -> None: ...
+    def __len__(self) -> int: ...
+    def to_array(self) -> npt.NDArray[np.object_]: ...
+
+class HashTable:
+    # NB: The base HashTable class does _not_ actually have these methods;
+    #  we are putting them here for the sake of mypy to avoid
+    #  reproducing them in each subclass below.
+    def __init__(self, size_hint: int = ..., uses_mask: bool = ...) -> None: ...
+    def __len__(self) -> int: ...
+    def __contains__(self, key: Hashable) -> bool: ...
+    def sizeof(self, deep: bool = ...) -> int: ...
+    def get_state(self) -> dict[str, int]: ...
+    # TODO: `val/key` type is subclass-specific
+    def get_item(self, val): ...  # TODO: return type?
+    def set_item(self, key, val) -> None: ...
+    def get_na(self): ...  # TODO: return type?
+    def set_na(self, val) -> None: ...
+    def map_locations(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        mask: npt.NDArray[np.bool_] | None = ...,
+    ) -> None: ...
+    def lookup(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        mask: npt.NDArray[np.bool_] | None = ...,
+    ) -> npt.NDArray[np.intp]: ...
+    def get_labels(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        uniques,  # SubclassTypeVector
+        count_prior: int = ...,
+        na_sentinel: int = ...,
+        na_value: object = ...,
+        mask=...,
+    ) -> npt.NDArray[np.intp]: ...
+    @overload
+    def unique(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        *,
+        return_inverse: Literal[False] = ...,
+        mask: None = ...,
+    ) -> np.ndarray: ...  # np.ndarray[subclass-specific]
+    @overload
+    def unique(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        *,
+        return_inverse: Literal[True],
+        mask: None = ...,
+    ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...  # np.ndarray[subclass-specific]
+    @overload
+    def unique(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        *,
+        return_inverse: Literal[False] = ...,
+        mask: npt.NDArray[np.bool_],
+    ) -> tuple[
+        np.ndarray,
+        npt.NDArray[np.bool_],
+    ]: ...  # np.ndarray[subclass-specific]
+    def factorize(
+        self,
+        values: np.ndarray,  # np.ndarray[subclass-specific]
+        na_sentinel: int = ...,
+        na_value: object = ...,
+        mask=...,
+        ignore_na: bool = True,
+    ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ...  # np.ndarray[subclass-specific]
+    def hash_inner_join(
+        self, values: np.ndarray, mask=...
+    ) -> tuple[np.ndarray, np.ndarray]: ...
+
+class Complex128HashTable(HashTable): ...
+class Complex64HashTable(HashTable): ...
+class Float64HashTable(HashTable): ...
+class Float32HashTable(HashTable): ...
+
+class Int64HashTable(HashTable):
+    # Only Int64HashTable has get_labels_groupby, map_keys_to_values
+    def get_labels_groupby(
+        self,
+        values: npt.NDArray[np.int64],  # const int64_t[:]
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]: ...
+    def map_keys_to_values(
+        self,
+        keys: npt.NDArray[np.int64],
+        values: npt.NDArray[np.int64],  # const int64_t[:]
+    ) -> None: ...
+
+class Int32HashTable(HashTable): ...
+class Int16HashTable(HashTable): ...
+class Int8HashTable(HashTable): ...
+class UInt64HashTable(HashTable): ...
+class UInt32HashTable(HashTable): ...
+class UInt16HashTable(HashTable): ...
+class UInt8HashTable(HashTable): ...
+class StringHashTable(HashTable): ...
+class PyObjectHashTable(HashTable): ...
+class IntpHashTable(HashTable): ...
+
+def duplicated(
+    values: np.ndarray,
+    keep: Literal["last", "first", False] = ...,
+    mask: npt.NDArray[np.bool_] | None = ...,
+) -> npt.NDArray[np.bool_]: ...
+def mode(
+    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = ...
+) -> np.ndarray: ...
+def value_count(
+    values: np.ndarray,
+    dropna: bool,
+    mask: npt.NDArray[np.bool_] | None = ...,
+) -> tuple[np.ndarray, npt.NDArray[np.int64], int]: ...  # np.ndarray[same-as-values]
+
+# arr and values should have same dtype
+def ismember(
+    arr: np.ndarray,
+    values: np.ndarray,
+) -> npt.NDArray[np.bool_]: ...
+def object_hash(obj) -> int: ...
+def objects_are_equal(a, b) -> bool: ...
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..3af2856d2fbbf36c1da27bb38fd66a62bc6ac3ea
--- /dev/null
+++ b/pandas/_libs/index.pyi
@@ -0,0 +1,107 @@
+import numpy as np
+
+from pandas._typing import npt
+
+from pandas import (
+    Index,
+    MultiIndex,
+)
+from pandas.core.arrays import ExtensionArray
+
+multiindex_nulls_shift: int
+
+class IndexEngine:
+    over_size_threshold: bool
+    def __init__(self, values: np.ndarray) -> None: ...
+    def __contains__(self, val: object) -> bool: ...
+
+    # -> int | slice | np.ndarray[bool]
+    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
+    def sizeof(self, deep: bool = ...) -> int: ...
+    def __sizeof__(self) -> int: ...
+    @property
+    def is_unique(self) -> bool: ...
+    @property
+    def is_monotonic_increasing(self) -> bool: ...
+    @property
+    def is_monotonic_decreasing(self) -> bool: ...
+    @property
+    def is_mapping_populated(self) -> bool: ...
+    def clear_mapping(self): ...
+    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
+    def get_indexer_non_unique(
+        self,
+        targets: np.ndarray,
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+
+class MaskedIndexEngine(IndexEngine):
+    def __init__(self, values: object) -> None: ...
+    def get_indexer_non_unique(
+        self, targets: object
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+
+class Float64Engine(IndexEngine): ...
+class Float32Engine(IndexEngine): ...
+class Complex128Engine(IndexEngine): ...
+class Complex64Engine(IndexEngine): ...
+class Int64Engine(IndexEngine): ...
+class Int32Engine(IndexEngine): ...
+class Int16Engine(IndexEngine): ...
+class Int8Engine(IndexEngine): ...
+class UInt64Engine(IndexEngine): ...
+class UInt32Engine(IndexEngine): ...
+class UInt16Engine(IndexEngine): ...
+class UInt8Engine(IndexEngine): ...
+class ObjectEngine(IndexEngine): ...
+class StringEngine(IndexEngine): ...
+class DatetimeEngine(Int64Engine): ...
+class TimedeltaEngine(DatetimeEngine): ...
+class PeriodEngine(Int64Engine): ...
+class BoolEngine(UInt8Engine): ...
+class MaskedFloat64Engine(MaskedIndexEngine): ...
+class MaskedFloat32Engine(MaskedIndexEngine): ...
+class MaskedComplex128Engine(MaskedIndexEngine): ...
+class MaskedComplex64Engine(MaskedIndexEngine): ...
+class MaskedInt64Engine(MaskedIndexEngine): ...
+class MaskedInt32Engine(MaskedIndexEngine): ...
+class MaskedInt16Engine(MaskedIndexEngine): ...
+class MaskedInt8Engine(MaskedIndexEngine): ...
+class MaskedUInt64Engine(MaskedIndexEngine): ...
+class MaskedUInt32Engine(MaskedIndexEngine): ...
+class MaskedUInt16Engine(MaskedIndexEngine): ...
+class MaskedUInt8Engine(MaskedIndexEngine): ...
+class MaskedBoolEngine(MaskedUInt8Engine): ...
+
+class StringObjectEngine(ObjectEngine):
+    def __init__(self, values: object, na_value) -> None: ...
+
+class BaseMultiIndexCodesEngine:
+    levels: list[np.ndarray]
+    offsets: np.ndarray  # np.ndarray[..., ndim=1]
+
+    def __init__(
+        self,
+        levels: list[Index],  # all entries hashable
+        labels: list[np.ndarray],  # all entries integer-dtyped
+        offsets: np.ndarray,  # np.ndarray[..., ndim=1]
+    ) -> None: ...
+    def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ...
+    def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ...
+
+class ExtensionEngine:
+    def __init__(self, values: ExtensionArray) -> None: ...
+    def __contains__(self, val: object) -> bool: ...
+    def get_loc(self, val: object) -> int | slice | np.ndarray: ...
+    def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ...
+    def get_indexer_non_unique(
+        self,
+        targets: np.ndarray,
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+    @property
+    def is_unique(self) -> bool: ...
+    @property
+    def is_monotonic_increasing(self) -> bool: ...
+    @property
+    def is_monotonic_decreasing(self) -> bool: ...
+    def sizeof(self, deep: bool = ...) -> int: ...
+    def clear_mapping(self): ...
diff --git a/pandas/_libs/indexing.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/indexing.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..065b9280a71c109580b05568e8595eb7fba9e7de
Binary files /dev/null and b/pandas/_libs/indexing.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/indexing.pyi b/pandas/_libs/indexing.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..3ae5c5044a2f75452fa57ba578af2c7b4c78ec96
--- /dev/null
+++ b/pandas/_libs/indexing.pyi
@@ -0,0 +1,17 @@
+from typing import (
+    Generic,
+    TypeVar,
+)
+
+from pandas.core.indexing import IndexingMixin
+
+_IndexingMixinT = TypeVar("_IndexingMixinT", bound=IndexingMixin)
+
+class NDFrameIndexerBase(Generic[_IndexingMixinT]):
+    name: str
+    # in practice obj is either a DataFrame or a Series
+    obj: _IndexingMixinT
+
+    def __init__(self, name: str, obj: _IndexingMixinT) -> None: ...
+    @property
+    def ndim(self) -> int: ...
diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..11d059ec53920e5f44911d8784b09332dbb4e797
--- /dev/null
+++ b/pandas/_libs/internals.pyi
@@ -0,0 +1,96 @@
+from collections.abc import (
+    Iterator,
+    Sequence,
+)
+from typing import (
+    Self,
+    final,
+    overload,
+)
+import weakref
+
+import numpy as np
+
+from pandas._typing import (
+    ArrayLike,
+    npt,
+)
+
+from pandas import Index
+from pandas.core.internals.blocks import Block as B
+
+def slice_len(slc: slice, objlen: int = ...) -> int: ...
+def get_concat_blkno_indexers(
+    blknos_list: list[npt.NDArray[np.intp]],
+) -> list[tuple[npt.NDArray[np.intp], BlockPlacement]]: ...
+def get_blkno_indexers(
+    blknos: np.ndarray,  # int64_t[:]
+    group: bool = ...,
+) -> list[tuple[int, slice | np.ndarray]]: ...
+def get_blkno_placements(
+    blknos: np.ndarray,
+    group: bool = ...,
+) -> Iterator[tuple[int, BlockPlacement]]: ...
+def update_blklocs_and_blknos(
+    blklocs: npt.NDArray[np.intp],
+    blknos: npt.NDArray[np.intp],
+    loc: int,
+    nblocks: int,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+@final
+class BlockPlacement:
+    def __init__(self, val: int | slice | np.ndarray) -> None: ...
+    @property
+    def indexer(self) -> np.ndarray | slice: ...
+    @property
+    def as_array(self) -> np.ndarray: ...
+    @property
+    def as_slice(self) -> slice: ...
+    @property
+    def is_slice_like(self) -> bool: ...
+    @overload
+    def __getitem__(
+        self, loc: slice | Sequence[int] | npt.NDArray[np.intp]
+    ) -> BlockPlacement: ...
+    @overload
+    def __getitem__(self, loc: int) -> int: ...
+    def __iter__(self) -> Iterator[int]: ...
+    def __len__(self) -> int: ...
+    def delete(self, loc) -> BlockPlacement: ...
+    def add(self, other) -> BlockPlacement: ...
+    def append(self, others: list[BlockPlacement]) -> BlockPlacement: ...
+    def tile_for_unstack(self, factor: int) -> npt.NDArray[np.intp]: ...
+
+class Block:
+    _mgr_locs: BlockPlacement
+    ndim: int
+    values: ArrayLike
+    refs: BlockValuesRefs
+    def __init__(
+        self,
+        values: ArrayLike,
+        placement: BlockPlacement,
+        ndim: int,
+        refs: BlockValuesRefs | None = ...,
+    ) -> None: ...
+    def slice_block_rows(self, slicer: slice) -> Self: ...
+
+class BlockManager:
+    blocks: tuple[B, ...]
+    axes: list[Index]
+    _known_consolidated: bool
+    _is_consolidated: bool
+    _blknos: np.ndarray
+    _blklocs: np.ndarray
+    def __init__(
+        self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=...
+    ) -> None: ...
+    def get_slice(self, slobj: slice, axis: int = ...) -> Self: ...
+    def _rebuild_blknos_and_blklocs(self) -> None: ...
+
+class BlockValuesRefs:
+    referenced_blocks: list[weakref.ref]
+    def __init__(self, blk: Block | None = ...) -> None: ...
+    def add_reference(self, blk: Block) -> None: ...
+    def add_index_reference(self, index: Index) -> None: ...
+    def has_reference(self) -> bool: ...
diff --git a/pandas/_libs/interval.pyi b/pandas/_libs/interval.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..587fdf84f2f85520713352bbcab29804c95621e5
--- /dev/null
+++ b/pandas/_libs/interval.pyi
@@ -0,0 +1,174 @@
+from typing import (
+    Any,
+    Generic,
+    TypeVar,
+    overload,
+)
+
+import numpy as np
+import numpy.typing as npt
+
+from pandas._typing import (
+    IntervalClosedType,
+    Timedelta,
+    Timestamp,
+)
+
+VALID_CLOSED: frozenset[str]
+
+_OrderableScalarT = TypeVar("_OrderableScalarT", int, float)
+_OrderableTimesT = TypeVar("_OrderableTimesT", Timestamp, Timedelta)
+_OrderableT = TypeVar("_OrderableT", int, float, Timestamp, Timedelta)
+
+class _LengthDescriptor:
+    @overload
+    def __get__(
+        self, instance: Interval[_OrderableScalarT], owner: Any
+    ) -> _OrderableScalarT: ...
+    @overload
+    def __get__(
+        self, instance: Interval[_OrderableTimesT], owner: Any
+    ) -> Timedelta: ...
+
+class _MidDescriptor:
+    @overload
+    def __get__(self, instance: Interval[_OrderableScalarT], owner: Any) -> float: ...
+    @overload
+    def __get__(
+        self, instance: Interval[_OrderableTimesT], owner: Any
+    ) -> _OrderableTimesT: ...
+
+class IntervalMixin:
+    @property
+    def closed_left(self) -> bool: ...
+    @property
+    def closed_right(self) -> bool: ...
+    @property
+    def open_left(self) -> bool: ...
+    @property
+    def open_right(self) -> bool: ...
+    @property
+    def is_empty(self) -> bool: ...
+    def _check_closed_matches(self, other: IntervalMixin, name: str = ...) -> None: ...
+
+class Interval(IntervalMixin, Generic[_OrderableT]):
+    @property
+    def left(self: Interval[_OrderableT]) -> _OrderableT: ...
+    @property
+    def right(self: Interval[_OrderableT]) -> _OrderableT: ...
+    @property
+    def closed(self) -> IntervalClosedType: ...
+    mid: _MidDescriptor
+    length: _LengthDescriptor
+    def __init__(
+        self,
+        left: _OrderableT,
+        right: _OrderableT,
+        closed: IntervalClosedType = ...,
+    ) -> None: ...
+    def __hash__(self) -> int: ...
+    @overload
+    def __contains__(
+        self: Interval[Timedelta], key: Timedelta | Interval[Timedelta]
+    ) -> bool: ...
+    @overload
+    def __contains__(
+        self: Interval[Timestamp], key: Timestamp | Interval[Timestamp]
+    ) -> bool: ...
+    @overload
+    def __contains__(
+        self: Interval[_OrderableScalarT],
+        key: _OrderableScalarT | Interval[_OrderableScalarT],
+    ) -> bool: ...
+    @overload
+    def __add__(
+        self: Interval[_OrderableTimesT], y: Timedelta
+    ) -> Interval[_OrderableTimesT]: ...
+    @overload
+    def __add__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __add__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __radd__(
+        self: Interval[_OrderableTimesT], y: Timedelta
+    ) -> Interval[_OrderableTimesT]: ...
+    @overload
+    def __radd__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __radd__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __sub__(
+        self: Interval[_OrderableTimesT], y: Timedelta
+    ) -> Interval[_OrderableTimesT]: ...
+    @overload
+    def __sub__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __sub__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __rsub__(
+        self: Interval[_OrderableTimesT], y: Timedelta
+    ) -> Interval[_OrderableTimesT]: ...
+    @overload
+    def __rsub__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __rsub__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __mul__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __mul__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __rmul__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __rmul__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __truediv__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __truediv__(self: Interval[float], y: float) -> Interval[float]: ...
+    @overload
+    def __floordiv__(
+        self: Interval[int], y: _OrderableScalarT
+    ) -> Interval[_OrderableScalarT]: ...
+    @overload
+    def __floordiv__(self: Interval[float], y: float) -> Interval[float]: ...
+    def overlaps(self: Interval[_OrderableT], other: Interval[_OrderableT]) -> bool: ...
+
+def intervals_to_interval_bounds(
+    intervals: np.ndarray, validate_closed: bool = ...
+) -> tuple[np.ndarray, np.ndarray, IntervalClosedType]: ...
+
+class IntervalTree(IntervalMixin):
+    def __init__(
+        self,
+        left: np.ndarray,
+        right: np.ndarray,
+        closed: IntervalClosedType = ...,
+        leaf_size: int = ...,
+    ) -> None: ...
+    @property
+    def mid(self) -> np.ndarray: ...
+    @property
+    def length(self) -> np.ndarray: ...
+    def get_indexer(self, target) -> npt.NDArray[np.intp]: ...
+    def get_indexer_non_unique(
+        self, target
+    ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+    _na_count: int
+    @property
+    def is_overlapping(self) -> bool: ...
+    @property
+    def is_monotonic_increasing(self) -> bool: ...
+    def clear_mapping(self) -> None: ...
diff --git a/pandas/_libs/join.pyi b/pandas/_libs/join.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..1d4e8c90bc5593eae650319e9ca1b58cbd7eed73
--- /dev/null
+++ b/pandas/_libs/join.pyi
@@ -0,0 +1,79 @@
+import numpy as np
+
+from pandas._typing import npt
+
+def inner_join(
+    left: np.ndarray,  # const intp_t[:]
+    right: np.ndarray,  # const intp_t[:]
+    max_groups: int,
+    sort: bool = ...,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+def left_outer_join(
+    left: np.ndarray,  # const intp_t[:]
+    right: np.ndarray,  # const intp_t[:]
+    max_groups: int,
+    sort: bool = ...,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+def full_outer_join(
+    left: np.ndarray,  # const intp_t[:]
+    right: np.ndarray,  # const intp_t[:]
+    max_groups: int,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+def ffill_indexer(
+    indexer: np.ndarray,  # const intp_t[:]
+) -> npt.NDArray[np.intp]: ...
+def left_join_indexer_unique(
+    left: np.ndarray,  # ndarray[join_t]
+    right: np.ndarray,  # ndarray[join_t]
+) -> npt.NDArray[np.intp]: ...
+def left_join_indexer(
+    left: np.ndarray,  # ndarray[join_t]
+    right: np.ndarray,  # ndarray[join_t]
+) -> tuple[
+    np.ndarray,  # np.ndarray[join_t]
+    npt.NDArray[np.intp],
+    npt.NDArray[np.intp],
+]: ...
+def inner_join_indexer(
+    left: np.ndarray,  # ndarray[join_t]
+    right: np.ndarray,  # ndarray[join_t]
+) -> tuple[
+    np.ndarray,  # np.ndarray[join_t]
+    npt.NDArray[np.intp],
+    npt.NDArray[np.intp],
+]: ...
+def outer_join_indexer(
+    left: np.ndarray,  # ndarray[join_t]
+    right: np.ndarray,  # ndarray[join_t]
+) -> tuple[
+    np.ndarray,  # np.ndarray[join_t]
+    npt.NDArray[np.intp],
+    npt.NDArray[np.intp],
+]: ...
+def asof_join_backward_on_X_by_Y(
+    left_values: np.ndarray,  # ndarray[numeric_t]
+    right_values: np.ndarray,  # ndarray[numeric_t]
+    left_by_values: np.ndarray,  # const int64_t[:]
+    right_by_values: np.ndarray,  # const int64_t[:]
+    allow_exact_matches: bool = ...,
+    tolerance: np.number | float | None = ...,
+    use_hashtable: bool = ...,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+def asof_join_forward_on_X_by_Y(
+    left_values: np.ndarray,  # ndarray[numeric_t]
+    right_values: np.ndarray,  # ndarray[numeric_t]
+    left_by_values: np.ndarray,  # const int64_t[:]
+    right_by_values: np.ndarray,  # const int64_t[:]
+    allow_exact_matches: bool = ...,
+    tolerance: np.number | float | None = ...,
+    use_hashtable: bool = ...,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
+def asof_join_nearest_on_X_by_Y(
+    left_values: np.ndarray,  # ndarray[numeric_t]
+    right_values: np.ndarray,  # ndarray[numeric_t]
+    left_by_values: np.ndarray,  # const int64_t[:]
+    right_by_values: np.ndarray,  # const int64_t[:]
+    allow_exact_matches: bool = ...,
+    tolerance: np.number | float | None = ...,
+    use_hashtable: bool = ...,
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
diff --git a/pandas/_libs/json.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/json.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..cba519b304383daf0c27a5bf66da87c31760b773
Binary files /dev/null and b/pandas/_libs/json.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/json.pyi b/pandas/_libs/json.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..349320d69d707a27b5ca75a5eabfa3c867fa25c5
--- /dev/null
+++ b/pandas/_libs/json.pyi
@@ -0,0 +1,23 @@
+from collections.abc import Callable
+from typing import (
+    Any,
+)
+
+def ujson_dumps(
+    obj: Any,
+    ensure_ascii: bool = ...,
+    double_precision: int = ...,
+    indent: int = ...,
+    orient: str = ...,
+    date_unit: str = ...,
+    iso_dates: bool = ...,
+    default_handler: None
+    | Callable[[Any], str | float | bool | list | dict | None] = ...,
+) -> str: ...
+def ujson_loads(
+    s: str,
+    precise_float: bool = ...,
+    numpy: bool = ...,
+    dtype: None = ...,
+    labelled: bool = ...,
+) -> Any: ...
diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..e50b301c348688423b62a53e2b44e98e54ddc9ae
--- /dev/null
+++ b/pandas/_libs/lib.pyi
@@ -0,0 +1,238 @@
+# TODO(npdtypes): Many types specified here can be made more specific/accurate;
+#  the more specific versions are specified in comments
+from collections.abc import (
+    Callable,
+    Generator,
+    Hashable,
+)
+from decimal import Decimal
+from typing import (
+    Any,
+    Final,
+    Literal,
+    TypeAlias,
+    TypeGuard,
+    overload,
+)
+
+import numpy as np
+
+from pandas._typing import (
+    ArrayLike,
+    DtypeObj,
+    npt,
+)
+
+# placeholder until we can specify np.ndarray[object, ndim=2]
+ndarray_obj_2d = np.ndarray
+
+from enum import Enum
+
+class _NoDefault(Enum):
+    no_default = ...
+
+no_default: Final = _NoDefault.no_default
+NoDefault: TypeAlias = Literal[_NoDefault.no_default]
+
+i8max: int
+u8max: int
+
+def is_np_dtype(dtype: object, kinds: str | None = ...) -> TypeGuard[np.dtype]: ...
+def item_from_zerodim(val: object) -> object: ...
+def infer_dtype(value: object, skipna: bool = ...) -> str: ...
+def is_iterator(obj: object) -> bool: ...
+def is_scalar(val: object) -> bool: ...
+def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ...
+def is_pyarrow_array(obj: object) -> bool: ...
+def is_decimal(obj: object) -> TypeGuard[Decimal]: ...
+def is_complex(obj: object) -> TypeGuard[complex]: ...
+def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ...
+def is_integer(obj: object) -> TypeGuard[int | np.integer]: ...
+def is_int_or_none(obj) -> bool: ...
+def is_float(obj: object) -> TypeGuard[float]: ...
+def is_interval_array(values: np.ndarray) -> bool: ...
+def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ...
+def is_timedelta_or_timedelta64_array(
+    values: np.ndarray, skipna: bool = True
+) -> bool: ...
+def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ...
+def is_time_array(values: np.ndarray, skipna: bool = ...): ...
+def is_date_array(values: np.ndarray, skipna: bool = ...): ...
+def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
+def is_string_array(values: np.ndarray, skipna: bool = ...): ...
+def is_float_array(values: np.ndarray, skipna: bool = ...): ...
+def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
+def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
+def fast_multiget(
+    mapping: dict,
+    keys: np.ndarray,  # object[:]
+    default=...,
+) -> ArrayLike: ...
+def fast_unique_multiple_list_gen(gen: Generator, sort: bool = ...) -> list: ...
+@overload
+def map_infer(
+    arr: np.ndarray,
+    f: Callable[[Any], Any],
+    *,
+    convert: Literal[False],
+    ignore_na: bool = ...,
+) -> np.ndarray: ...
+@overload
+def map_infer(
+    arr: np.ndarray,
+    f: Callable[[Any], Any],
+    *,
+    convert: bool = ...,
+    ignore_na: bool = ...,
+) -> ArrayLike: ...
+@overload
+def maybe_convert_objects(
+    objects: npt.NDArray[np.object_],
+    *,
+    try_float: bool = ...,
+    safe: bool = ...,
+    convert_numeric: bool = ...,
+    convert_non_numeric: Literal[False] = ...,
+    convert_to_nullable_dtype: Literal[False] = ...,
+    dtype_if_all_nat: DtypeObj | None = ...,
+) -> npt.NDArray[np.object_ | np.number]: ...
+@overload
+def maybe_convert_objects(
+    objects: npt.NDArray[np.object_],
+    *,
+    try_float: bool = ...,
+    safe: bool = ...,
+    convert_numeric: bool = ...,
+    convert_non_numeric: bool = ...,
+    convert_to_nullable_dtype: Literal[True] = ...,
+    dtype_if_all_nat: DtypeObj | None = ...,
+) -> ArrayLike: ...
+@overload
+def maybe_convert_objects(
+    objects: npt.NDArray[np.object_],
+    *,
+    try_float: bool = ...,
+    safe: bool = ...,
+    convert_numeric: bool = ...,
+    convert_non_numeric: bool = ...,
+    convert_to_nullable_dtype: bool = ...,
+    dtype_if_all_nat: DtypeObj | None = ...,
+) -> ArrayLike: ...
+@overload
+def maybe_convert_numeric(
+    values: npt.NDArray[np.object_],
+    na_values: set,
+    convert_empty: bool = ...,
+    coerce_numeric: bool = ...,
+    convert_to_masked_nullable: Literal[False] = ...,
+) -> tuple[np.ndarray, None]: ...
+@overload
+def maybe_convert_numeric(
+    values: npt.NDArray[np.object_],
+    na_values: set,
+    convert_empty: bool = ...,
+    coerce_numeric: bool = ...,
+    *,
+    convert_to_masked_nullable: Literal[True],
+) -> tuple[np.ndarray, np.ndarray]: ...
+
+# TODO: restrict `arr`?
+def ensure_string_array(
+    arr,
+    na_value: object = ...,
+    convert_na_value: bool = ...,
+    copy: bool = ...,
+    skipna: bool = ...,
+) -> npt.NDArray[np.object_]: ...
+def convert_nans_to_NA(
+    arr: npt.NDArray[np.object_],
+) -> npt.NDArray[np.object_]: ...
+def fast_zip(ndarrays: list) -> npt.NDArray[np.object_]: ...
+
+# TODO: can we be more specific about rows?
+def to_object_array_tuples(rows: object) -> ndarray_obj_2d: ...
+def tuples_to_object_array(
+    tuples: npt.NDArray[np.object_],
+) -> ndarray_obj_2d: ...
+
+# TODO: can we be more specific about rows?
+def to_object_array(rows: object, min_width: int = ...) -> ndarray_obj_2d: ...
+def dicts_to_array(dicts: list, columns: list) -> ndarray_obj_2d: ...
+def maybe_booleans_to_slice(
+    mask: npt.NDArray[np.uint8],
+) -> slice | npt.NDArray[np.uint8]: ...
+def maybe_indices_to_slice(
+    indices: npt.NDArray[np.intp],
+    max_len: int,
+) -> slice | npt.NDArray[np.intp]: ...
+def is_all_arraylike(obj: list) -> bool: ...
+
+# -----------------------------------------------------------------
+# Functions which in reality take memoryviews
+
+def memory_usage_of_objects(arr: np.ndarray) -> int: ...  # object[:]  # np.int64
+@overload
+def map_infer_mask(
+    arr: np.ndarray,
+    f: Callable[[Any], Any],
+    mask: np.ndarray,  # const uint8_t[:]
+    *,
+    convert: Literal[False],
+    na_value: Any = ...,
+    dtype: np.dtype = ...,
+) -> np.ndarray: ...
+@overload
+def map_infer_mask(
+    arr: np.ndarray,
+    f: Callable[[Any], Any],
+    mask: np.ndarray,  # const uint8_t[:]
+    *,
+    convert: bool = ...,
+    na_value: Any = ...,
+    dtype: np.dtype = ...,
+) -> ArrayLike: ...
+def indices_fast(
+    index: npt.NDArray[np.intp],
+    labels: np.ndarray,  # const int64_t[:]
+    keys: list,
+    sorted_labels: list[npt.NDArray[np.int64]],
+) -> dict[Hashable, npt.NDArray[np.intp]]: ...
+def generate_slices(
+    labels: np.ndarray,
+    ngroups: int,  # const intp_t[:]
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
+def count_level_2d(
+    mask: np.ndarray,  # ndarray[uint8_t, ndim=2, cast=True],
+    labels: np.ndarray,  # const intp_t[:]
+    max_bin: int,
+) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=2]
+def get_level_sorter(
+    codes: np.ndarray,  # const int64_t[:]
+    starts: np.ndarray,  # const intp_t[:]
+) -> np.ndarray: ...  # np.ndarray[np.intp, ndim=1]
+def generate_bins_dt64(
+    values: npt.NDArray[np.int64],
+    binner: np.ndarray,  # const int64_t[:]
+    closed: object = ...,
+    hasnans: bool = ...,
+) -> np.ndarray: ...  # np.ndarray[np.int64, ndim=1]
+def array_equivalent_object(
+    left: npt.NDArray[np.object_],
+    right: npt.NDArray[np.object_],
+) -> bool: ...
+def has_infs(arr: np.ndarray) -> bool: ...  # const floating[:]
+def has_only_ints_or_nan(arr: np.ndarray) -> bool: ...  # const floating[:]
+def get_reverse_indexer(
+    indexer: np.ndarray,  # const intp_t[:]
+    length: int,
+) -> npt.NDArray[np.intp]: ...
+def is_bool_list(obj: list) -> bool: ...
+def dtypes_all_equal(types: list[DtypeObj]) -> bool: ...
+def is_range_indexer(
+    left: np.ndarray,
+    n: int,  # np.ndarray[np.int64, ndim=1]
+) -> bool: ...
+def is_sequence_range(
+    sequence: np.ndarray,
+    step: int,  # np.ndarray[np.int64, ndim=1]
+) -> bool: ...
diff --git a/pandas/_libs/missing.pyi b/pandas/_libs/missing.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..64256ae4b36ad2fcf0aa59024957d01aa122fedf
--- /dev/null
+++ b/pandas/_libs/missing.pyi
@@ -0,0 +1,17 @@
+import numpy as np
+from numpy import typing as npt
+
+class NAType:
+    def __new__(cls, *args, **kwargs): ...
+
+NA: NAType
+
+def is_matching_na(
+    left: object, right: object, nan_matches_none: bool = ...
+) -> bool: ...
+def isposinf_scalar(val: object) -> bool: ...
+def isneginf_scalar(val: object) -> bool: ...
+def checknull(val: object) -> bool: ...
+def isnaobj(arr: np.ndarray) -> npt.NDArray[np.bool_]: ...
+def is_numeric_na(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
+def is_pdna_or_none(values: np.ndarray) -> npt.NDArray[np.bool_]: ...
diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..81fe81930539d1dc009ba1242fff92a1afd4cdc4
--- /dev/null
+++ b/pandas/_libs/ops.pyi
@@ -0,0 +1,53 @@
+from collections.abc import (
+    Callable,
+    Iterable,
+)
+from typing import (
+    Any,
+    Literal,
+    TypeAlias,
+    overload,
+)
+
+import numpy as np
+
+from pandas._typing import npt
+
+_BinOp: TypeAlias = Callable[[Any, Any], Any]
+_BoolOp: TypeAlias = Callable[[Any, Any], bool]
+
+def scalar_compare(
+    values: np.ndarray,  # object[:]
+    val: object,
+    op: _BoolOp,  # {operator.eq, operator.ne, ...}
+) -> npt.NDArray[np.bool_]: ...
+def vec_compare(
+    left: npt.NDArray[np.object_],
+    right: npt.NDArray[np.object_],
+    op: _BoolOp,  # {operator.eq, operator.ne, ...}
+) -> npt.NDArray[np.bool_]: ...
+def scalar_binop(
+    values: np.ndarray,  # object[:]
+    val: object,
+    op: _BinOp,  # binary operator
+) -> np.ndarray: ...
+def vec_binop(
+    left: np.ndarray,  # object[:]
+    right: np.ndarray,  # object[:]
+    op: _BinOp,  # binary operator
+) -> np.ndarray: ...
+@overload
+def maybe_convert_bool(
+    arr: npt.NDArray[np.object_],
+    true_values: Iterable | None = None,
+    false_values: Iterable | None = None,
+    convert_to_masked_nullable: Literal[False] = ...,
+) -> tuple[np.ndarray, None]: ...
+@overload
+def maybe_convert_bool(
+    arr: npt.NDArray[np.object_],
+    true_values: Iterable = ...,
+    false_values: Iterable = ...,
+    *,
+    convert_to_masked_nullable: Literal[True],
+) -> tuple[np.ndarray, np.ndarray]: ...
diff --git a/pandas/_libs/ops_dispatch.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/ops_dispatch.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..07d0736acd0575c0996ad61540272976788c750a
Binary files /dev/null and b/pandas/_libs/ops_dispatch.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/ops_dispatch.pyi b/pandas/_libs/ops_dispatch.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..91b5a4dbaaebc177191d3189f12e4e20d56ca0fa
--- /dev/null
+++ b/pandas/_libs/ops_dispatch.pyi
@@ -0,0 +1,5 @@
+import numpy as np
+
+def maybe_dispatch_ufunc_to_dunder_op(
+    self, ufunc: np.ufunc, method: str, *inputs, **kwargs
+): ...
diff --git a/pandas/_libs/pandas_datetime.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/pandas_datetime.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..7f2307cd48672dafeb3fd2b8927698d446551882
Binary files /dev/null and b/pandas/_libs/pandas_datetime.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/pandas_parser.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/pandas_parser.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..18393c333cbc36ba233bb18d2025627e4669c54a
Binary files /dev/null and b/pandas/_libs/pandas_parser.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/parsers.pyi b/pandas/_libs/parsers.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..d18f54c54623236d47df31be70e8516202d4c86a
--- /dev/null
+++ b/pandas/_libs/parsers.pyi
@@ -0,0 +1,77 @@
+from collections.abc import Hashable
+from typing import (
+    Literal,
+)
+
+import numpy as np
+
+from pandas._typing import (
+    ArrayLike,
+    Dtype,
+    npt,
+)
+
+STR_NA_VALUES: set[str]
+DEFAULT_BUFFER_HEURISTIC: int
+
+def sanitize_objects(
+    values: npt.NDArray[np.object_],
+    na_values: set,
+) -> int: ...
+
+class TextReader:
+    unnamed_cols: set[str]
+    table_width: int  # int64_t
+    leading_cols: int  # int64_t
+    header: list[list[int]]  # non-negative integers
+    def __init__(
+        self,
+        source,
+        delimiter: bytes | str = ...,  # single-character only
+        header=...,
+        header_start: int = ...,  # int64_t
+        header_end: int = ...,  # uint64_t
+        index_col=...,
+        names=...,
+        tokenize_chunksize: int = ...,  # int64_t
+        delim_whitespace: bool = ...,
+        converters=...,
+        skipinitialspace: bool = ...,
+        escapechar: bytes | str | None = ...,  # single-character only
+        doublequote: bool = ...,
+        quotechar: str | bytes | None = ...,  # at most 1 character
+        quoting: int = ...,
+        lineterminator: bytes | str | None = ...,  # at most 1 character
+        comment=...,
+        decimal: bytes | str = ...,  # single-character only
+        thousands: bytes | str | None = ...,  # single-character only
+        dtype: Dtype | dict[Hashable, Dtype] = ...,
+        usecols=...,
+        error_bad_lines: bool = ...,
+        warn_bad_lines: bool = ...,
+        na_filter: bool = ...,
+        na_values=...,
+        na_fvalues=...,
+        keep_default_na: bool = ...,
+        true_values=...,
+        false_values=...,
+        allow_leading_cols: bool = ...,
+        skiprows=...,
+        skipfooter: int = ...,  # int64_t
+        verbose: bool = ...,
+        float_precision: Literal["round_trip", "legacy", "high"] | None = ...,
+        skip_blank_lines: bool = ...,
+        encoding_errors: bytes | str = ...,
+    ) -> None: ...
+    def set_noconvert(self, i: int) -> None: ...
+    def remove_noconvert(self, i: int) -> None: ...
+    def close(self) -> None: ...
+    def read(self, rows: int | None = ...) -> dict[int, ArrayLike]: ...
+    def read_low_memory(self, rows: int | None) -> list[dict[int, ArrayLike]]: ...
+
+# _maybe_upcast, na_values are only exposed for testing
+na_values: dict
+
+def _maybe_upcast(
+    arr, use_dtype_backend: bool = ..., dtype_backend: str = ...
+) -> np.ndarray: ...
diff --git a/pandas/_libs/properties.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/properties.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..2dfdcfefa6c9b1c264b74c4b1a80a6cdfbea9c71
Binary files /dev/null and b/pandas/_libs/properties.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/properties.pyi b/pandas/_libs/properties.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..bbde6ec454202fe0c2f1d8f8da40280f13d1536c
--- /dev/null
+++ b/pandas/_libs/properties.pyi
@@ -0,0 +1,27 @@
+from collections.abc import Sequence
+from typing import (
+    overload,
+)
+
+from pandas._typing import (
+    AnyArrayLike,
+    DataFrame,
+    Index,
+    Series,
+)
+
+# note: this is a lie to make type checkers happy (they special
+# case property). cache_readonly uses attribute names similar to
+# property (fget) but it does not provide fset and fdel.
+cache_readonly = property
+
+class AxisProperty:
+    axis: int
+    def __init__(self, axis: int = ..., doc: str = ...) -> None: ...
+    @overload
+    def __get__(self, obj: DataFrame | Series, type) -> Index: ...
+    @overload
+    def __get__(self, obj: None, type) -> AxisProperty: ...
+    def __set__(
+        self, obj: DataFrame | Series, value: AnyArrayLike | Sequence
+    ) -> None: ...
diff --git a/pandas/_libs/reshape.pyi b/pandas/_libs/reshape.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..110687fcd0c313c45e8b025083fa5790fb9913b1
--- /dev/null
+++ b/pandas/_libs/reshape.pyi
@@ -0,0 +1,16 @@
+import numpy as np
+
+from pandas._typing import npt
+
+def unstack(
+    values: np.ndarray,  # reshape_t[:, :]
+    mask: np.ndarray,  # const uint8_t[:]
+    stride: int,
+    length: int,
+    width: int,
+    new_values: np.ndarray,  # reshape_t[:, :]
+    new_mask: np.ndarray,  # uint8_t[:, :]
+) -> None: ...
+def explode(
+    values: npt.NDArray[np.object_],
+) -> tuple[npt.NDArray[np.object_], npt.NDArray[np.int64]]: ...
diff --git a/pandas/_libs/sas.pyi b/pandas/_libs/sas.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..5d65e2b56b5916ed1e76e1409e4f75c652ee8fc9
--- /dev/null
+++ b/pandas/_libs/sas.pyi
@@ -0,0 +1,7 @@
+from pandas.io.sas.sas7bdat import SAS7BDATReader
+
+class Parser:
+    def __init__(self, parser: SAS7BDATReader) -> None: ...
+    def read(self, nrows: int) -> None: ...
+
+def get_subheader_index(signature: bytes) -> int: ...
diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..f1f3efb4d3096a77ad9aaafe14647fb0d22cbac3
--- /dev/null
+++ b/pandas/_libs/sparse.pyi
@@ -0,0 +1,51 @@
+from typing import Self
+
+import numpy as np
+
+from pandas._typing import (
+    TakeIndexer,
+    npt,
+)
+
+class SparseIndex:
+    length: int
+    npoints: int
+    def __init__(self) -> None: ...
+    @property
+    def ngaps(self) -> int: ...
+    @property
+    def nbytes(self) -> int: ...
+    @property
+    def indices(self) -> npt.NDArray[np.int32]: ...
+    def equals(self, other) -> bool: ...
+    def lookup(self, index: int) -> np.int32: ...
+    def lookup_array(self, indexer: npt.NDArray[np.int32]) -> npt.NDArray[np.int32]: ...
+    def to_int_index(self) -> IntIndex: ...
+    def to_block_index(self) -> BlockIndex: ...
+    def intersect(self, y_: SparseIndex) -> Self: ...
+    def make_union(self, y_: SparseIndex) -> Self: ...
+
+class IntIndex(SparseIndex):
+    indices: npt.NDArray[np.int32]
+    def __init__(
+        self, length: int, indices: TakeIndexer, check_integrity: bool = ...
+    ) -> None: ...
+
+class BlockIndex(SparseIndex):
+    nblocks: int
+    blocs: np.ndarray
+    blengths: np.ndarray
+    def __init__(
+        self, length: int, blocs: np.ndarray, blengths: np.ndarray
+    ) -> None: ...
+
+    # Override to have correct parameters
+    def intersect(self, other: SparseIndex) -> Self: ...
+    def make_union(self, y: SparseIndex) -> Self: ...
+
+def make_mask_object_ndarray(
+    arr: npt.NDArray[np.object_], fill_value
+) -> npt.NDArray[np.bool_]: ...
+def get_blocks(
+    indices: npt.NDArray[np.int32],
+) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.int32]]: ...
diff --git a/pandas/_libs/testing.pyi b/pandas/_libs/testing.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..4758483b3b5e7755669d4f171cb8f662f063a344
--- /dev/null
+++ b/pandas/_libs/testing.pyi
@@ -0,0 +1,14 @@
+from collections.abc import Mapping
+
+def assert_dict_equal(a: Mapping, b: Mapping, compare_keys: bool = ...) -> bool: ...
+def assert_almost_equal(
+    a,
+    b,
+    rtol: float = ...,
+    atol: float = ...,
+    check_dtype: bool = ...,
+    obj=...,
+    lobj=...,
+    robj=...,
+    index_values=...,
+) -> bool: ...
diff --git a/pandas/_libs/tslib.pyi b/pandas/_libs/tslib.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..7e3372a80db9db10cde474ab7a1b4f7b6a7c39b5
--- /dev/null
+++ b/pandas/_libs/tslib.pyi
@@ -0,0 +1,33 @@
+from datetime import tzinfo
+
+import numpy as np
+
+from pandas._typing import npt
+
+def format_array_from_datetime(
+    values: npt.NDArray[np.int64],
+    tz: tzinfo | None = ...,
+    format: str | None = ...,
+    na_rep: str | float = ...,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.object_]: ...
+def first_non_null(values: np.ndarray) -> int: ...
+def array_to_datetime(
+    values: npt.NDArray[np.object_],
+    errors: str = ...,
+    dayfirst: bool = ...,
+    yearfirst: bool = ...,
+    utc: bool = ...,
+    creso: int = ...,
+    unit_for_numerics: str | None = ...,
+) -> tuple[np.ndarray, tzinfo | None]: ...
+
+# returned ndarray may be object dtype or datetime64[ns]
+
+def array_to_datetime_with_tz(
+    values: npt.NDArray[np.object_],
+    tz: tzinfo,
+    dayfirst: bool,
+    yearfirst: bool,
+    creso: int,
+) -> npt.NDArray[np.int64]: ...
diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c6bbb87baa2c729ed11fc6773d5f13511d16b2d
--- /dev/null
+++ b/pandas/_libs/tslibs/__init__.py
@@ -0,0 +1,89 @@
+__all__ = [
+    "BaseOffset",
+    "Day",
+    "IncompatibleFrequency",
+    "NaT",
+    "NaTType",
+    "OutOfBoundsDatetime",
+    "OutOfBoundsTimedelta",
+    "Period",
+    "Resolution",
+    "Tick",
+    "Timedelta",
+    "Timestamp",
+    "add_overflowsafe",
+    "astype_overflowsafe",
+    "delta_to_nanoseconds",
+    "dt64arr_to_periodarr",
+    "dtypes",
+    "get_resolution",
+    "get_supported_dtype",
+    "get_unit_from_dtype",
+    "guess_datetime_format",
+    "iNaT",
+    "ints_to_pydatetime",
+    "ints_to_pytimedelta",
+    "is_date_array_normalized",
+    "is_supported_dtype",
+    "is_unitless",
+    "localize_pydatetime",
+    "nat_strings",
+    "normalize_i8_timestamps",
+    "periods_per_day",
+    "periods_per_second",
+    "to_offset",
+    "tz_compare",
+    "tz_convert_from_utc",
+    "tz_convert_from_utc_single",
+]
+
+from pandas._libs.tslibs import dtypes
+from pandas._libs.tslibs.conversion import localize_pydatetime
+from pandas._libs.tslibs.dtypes import (
+    Resolution,
+    periods_per_day,
+    periods_per_second,
+)
+from pandas._libs.tslibs.nattype import (
+    NaT,
+    NaTType,
+    iNaT,
+    nat_strings,
+)
+from pandas._libs.tslibs.np_datetime import (
+    OutOfBoundsDatetime,
+    OutOfBoundsTimedelta,
+    add_overflowsafe,
+    astype_overflowsafe,
+    get_supported_dtype,
+    is_supported_dtype,
+    is_unitless,
+    py_get_unit_from_dtype as get_unit_from_dtype,
+)
+from pandas._libs.tslibs.offsets import (
+    BaseOffset,
+    Day,
+    Tick,
+    to_offset,
+)
+from pandas._libs.tslibs.parsing import guess_datetime_format
+from pandas._libs.tslibs.period import (
+    IncompatibleFrequency,
+    Period,
+)
+from pandas._libs.tslibs.timedeltas import (
+    Timedelta,
+    delta_to_nanoseconds,
+    ints_to_pytimedelta,
+)
+from pandas._libs.tslibs.timestamps import Timestamp
+from pandas._libs.tslibs.timezones import tz_compare
+from pandas._libs.tslibs.tzconversion import tz_convert_from_utc_single
+from pandas._libs.tslibs.vectorized import (
+    dt64arr_to_periodarr,
+    get_resolution,
+    ints_to_pydatetime,
+    is_date_array_normalized,
+    normalize_i8_timestamps,
+    tz_convert_from_utc,
+)
diff --git a/pandas/_libs/tslibs/base.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/tslibs/base.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..73a0440d12f1621c220e7318e8d9c77a950579ef
Binary files /dev/null and b/pandas/_libs/tslibs/base.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/tslibs/ccalendar.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/tslibs/ccalendar.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..9f79d5d210efe68161f87f488464ba0d2560b60b
Binary files /dev/null and b/pandas/_libs/tslibs/ccalendar.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/tslibs/ccalendar.pyi b/pandas/_libs/tslibs/ccalendar.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..993f18a61d74aaa643e9790df70ede618b917223
--- /dev/null
+++ b/pandas/_libs/tslibs/ccalendar.pyi
@@ -0,0 +1,12 @@
+DAYS: list[str]
+MONTH_ALIASES: dict[int, str]
+MONTH_NUMBERS: dict[str, int]
+MONTHS: list[str]
+int_to_weekday: dict[int, str]
+
+def get_firstbday(year: int, month: int) -> int: ...
+def get_lastbday(year: int, month: int) -> int: ...
+def get_day_of_year(year: int, month: int, day: int) -> int: ...
+def get_iso_calendar(year: int, month: int, day: int) -> tuple[int, int, int]: ...
+def get_week_of_year(year: int, month: int, day: int) -> int: ...
+def get_days_in_month(year: int, month: int) -> int: ...
diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..26affae577f4d3f4ecda2ac9c1bc0cb748a35d4d
--- /dev/null
+++ b/pandas/_libs/tslibs/conversion.pyi
@@ -0,0 +1,14 @@
+from datetime import (
+    datetime,
+    tzinfo,
+)
+
+import numpy as np
+
+DT64NS_DTYPE: np.dtype
+TD64NS_DTYPE: np.dtype
+
+def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ...
+def cast_from_unit_vectorized(
+    values: np.ndarray, unit: str, out_unit: str = ...
+) -> np.ndarray: ...
diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..821c46598620327616e8b152ab4123ef6f7bc6e1
--- /dev/null
+++ b/pandas/_libs/tslibs/dtypes.pyi
@@ -0,0 +1,86 @@
+from enum import Enum
+from typing import Self
+
+OFFSET_TO_PERIOD_FREQSTR: dict[str, str]
+
+def periods_per_day(reso: int = ...) -> int: ...
+def periods_per_second(reso: int) -> int: ...
+def abbrev_to_npy_unit(abbrev: str | None) -> int: ...
+
+class PeriodDtypeBase:
+    _dtype_code: int  # PeriodDtypeCode
+    _n: int
+
+    # actually __cinit__
+    def __new__(cls, code: int, n: int) -> Self: ...
+    @property
+    def _freq_group_code(self) -> int: ...
+    @property
+    def _resolution_obj(self) -> Resolution: ...
+    def _get_to_timestamp_base(self) -> int: ...
+    @property
+    def _freqstr(self) -> str: ...
+    def __hash__(self) -> int: ...
+    def _is_tick_like(self) -> bool: ...
+    @property
+    def _creso(self) -> int: ...
+    @property
+    def _td64_unit(self) -> str: ...
+
+class FreqGroup(Enum):
+    _value_: int
+    FR_ANN = ...
+    FR_QTR = ...
+    FR_MTH = ...
+    FR_WK = ...
+    FR_BUS = ...
+    FR_DAY = ...
+    FR_HR = ...
+    FR_MIN = ...
+    FR_SEC = ...
+    FR_MS = ...
+    FR_US = ...
+    FR_NS = ...
+    FR_UND = ...
+    @staticmethod
+    def from_period_dtype_code(code: int) -> FreqGroup: ...
+
+class Resolution(Enum):
+    _value_: int
+    RESO_NS = ...
+    RESO_US = ...
+    RESO_MS = ...
+    RESO_SEC = ...
+    RESO_MIN = ...
+    RESO_HR = ...
+    RESO_DAY = ...
+    RESO_MTH = ...
+    RESO_QTR = ...
+    RESO_YR = ...
+    def __lt__(self, other: Resolution) -> bool: ...
+    def __ge__(self, other: Resolution) -> bool: ...
+    @property
+    def attrname(self) -> str: ...
+    @classmethod
+    def from_attrname(cls, attrname: str) -> Resolution: ...
+    @classmethod
+    def get_reso_from_freqstr(cls, freq: str) -> Resolution: ...
+    @property
+    def attr_abbrev(self) -> str: ...
+
+class NpyDatetimeUnit(Enum):
+    _value_: int
+    NPY_FR_Y = ...
+    NPY_FR_M = ...
+    NPY_FR_W = ...
+    NPY_FR_D = ...
+    NPY_FR_h = ...
+    NPY_FR_m = ...
+    NPY_FR_s = ...
+    NPY_FR_ms = ...
+    NPY_FR_us = ...
+    NPY_FR_ns = ...
+    NPY_FR_ps = ...
+    NPY_FR_fs = ...
+    NPY_FR_as = ...
+    NPY_FR_GENERIC = ...
diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..bc55e34f3d2088c24eabf0b9b3eebf8a9a775d2c
--- /dev/null
+++ b/pandas/_libs/tslibs/fields.pyi
@@ -0,0 +1,62 @@
+import numpy as np
+
+from pandas._typing import npt
+
+def build_field_sarray(
+    dtindex: npt.NDArray[np.int64],  # const int64_t[:]
+    reso: int,  # NPY_DATETIMEUNIT
+) -> np.ndarray: ...
+def month_position_check(fields, weekdays) -> str | None: ...
+def get_date_name_field(
+    dtindex: npt.NDArray[np.int64],  # const int64_t[:]
+    field: str,
+    locale: str | None = ...,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.object_]: ...
+def get_start_end_field(
+    dtindex: npt.NDArray[np.int64],
+    field: str,
+    freq_name: str | None = ...,
+    month_kw: int = ...,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.bool_]: ...
+def get_date_field(
+    dtindex: npt.NDArray[np.int64],  # const int64_t[:]
+    field: str,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int32]: ...
+def get_timedelta_field(
+    tdindex: npt.NDArray[np.int64],  # const int64_t[:]
+    field: str,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int32]: ...
+def get_timedelta_days(
+    tdindex: npt.NDArray[np.int64],  # const int64_t[:]
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int64]: ...
+def isleapyear_arr(
+    years: np.ndarray,
+) -> npt.NDArray[np.bool_]: ...
+def build_isocalendar_sarray(
+    dtindex: npt.NDArray[np.int64],  # const int64_t[:]
+    reso: int,  # NPY_DATETIMEUNIT
+) -> np.ndarray: ...
+def _get_locale_names(name_type: str, locale: str | None = ...): ...
+
+class RoundTo:
+    @property
+    def MINUS_INFTY(self) -> int: ...
+    @property
+    def PLUS_INFTY(self) -> int: ...
+    @property
+    def NEAREST_HALF_EVEN(self) -> int: ...
+    @property
+    def NEAREST_HALF_PLUS_INFTY(self) -> int: ...
+    @property
+    def NEAREST_HALF_MINUS_INFTY(self) -> int: ...
+
+def round_nsint64(
+    values: npt.NDArray[np.int64],
+    mode: RoundTo,
+    nanos: int,
+) -> npt.NDArray[np.int64]: ...
diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..6d94fa6593b65731dcebe33d4b803b7035dd18a9
--- /dev/null
+++ b/pandas/_libs/tslibs/nattype.pyi
@@ -0,0 +1,184 @@
+from datetime import (
+    datetime,
+    timedelta,
+    tzinfo as _tzinfo,
+)
+from typing import (
+    Literal,
+    NoReturn,
+    Self,
+    TypeAlias,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs.tslibs.period import Period
+from pandas._typing import (
+    Frequency,
+    TimestampNonexistent,
+    TimeUnit,
+)
+
+NaT: NaTType
+iNaT: int
+nat_strings: set[str]
+
+_TimeLike: TypeAlias = datetime | timedelta | Period | np.datetime64 | np.timedelta64
+_TimeDelta: TypeAlias = timedelta | np.timedelta64
+
+class NaTType:
+    _value: np.int64
+    @property
+    def value(self) -> int: ...
+    @property
+    def asm8(self) -> np.datetime64: ...
+    def to_datetime64(self) -> np.datetime64: ...
+    def to_numpy(
+        self, dtype: np.dtype | str | None = ..., copy: bool = ...
+    ) -> np.datetime64 | np.timedelta64: ...
+    @property
+    def is_leap_year(self) -> bool: ...
+    @property
+    def is_month_start(self) -> bool: ...
+    @property
+    def is_quarter_start(self) -> bool: ...
+    @property
+    def is_year_start(self) -> bool: ...
+    @property
+    def is_month_end(self) -> bool: ...
+    @property
+    def is_quarter_end(self) -> bool: ...
+    @property
+    def is_year_end(self) -> bool: ...
+    @property
+    def day_of_year(self) -> float: ...
+    @property
+    def dayofyear(self) -> float: ...
+    @property
+    def days_in_month(self) -> float: ...
+    @property
+    def daysinmonth(self) -> float: ...
+    @property
+    def day_of_week(self) -> float: ...
+    @property
+    def dayofweek(self) -> float: ...
+    @property
+    def week(self) -> float: ...
+    @property
+    def weekofyear(self) -> float: ...
+    @property
+    def fold(self) -> int: ...
+    def day_name(self) -> float: ...
+    def month_name(self) -> float: ...
+    def weekday(self) -> float: ...
+    def isoweekday(self) -> float: ...
+    def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ...
+    def strftime(self, format: str) -> NoReturn: ...
+    def total_seconds(self) -> float: ...
+    def today(self, *args, **kwargs) -> NaTType: ...
+    def now(self, *args, **kwargs) -> NaTType: ...
+    def to_pydatetime(self) -> NaTType: ...
+    def date(self) -> NaTType: ...
+    def round(
+        self,
+        freq: Frequency,
+        ambiguous: bool | Literal["raise"] | NaTType = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> NaTType: ...
+    def floor(
+        self,
+        freq: Frequency,
+        ambiguous: bool | Literal["raise"] | NaTType = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> NaTType: ...
+    def ceil(
+        self,
+        freq: Frequency,
+        ambiguous: bool | Literal["raise"] | NaTType = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> NaTType: ...
+    @property
+    def tzinfo(self) -> None: ...
+    @property
+    def tz(self) -> None: ...
+    def tz_convert(self, tz: _tzinfo | str | None) -> NaTType: ...
+    def tz_localize(
+        self,
+        tz: _tzinfo | str | None,
+        ambiguous: bool | Literal["raise"] | NaTType = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> NaTType: ...
+    def replace(
+        self,
+        year: int | None = ...,
+        month: int | None = ...,
+        day: int | None = ...,
+        hour: int | None = ...,
+        minute: int | None = ...,
+        second: int | None = ...,
+        microsecond: int | None = ...,
+        nanosecond: int | None = ...,
+        tzinfo: _tzinfo | None = ...,
+        fold: int | None = ...,
+    ) -> NaTType: ...
+    @property
+    def year(self) -> float: ...
+    @property
+    def quarter(self) -> float: ...
+    @property
+    def month(self) -> float: ...
+    @property
+    def day(self) -> float: ...
+    @property
+    def hour(self) -> float: ...
+    @property
+    def minute(self) -> float: ...
+    @property
+    def second(self) -> float: ...
+    @property
+    def millisecond(self) -> float: ...
+    @property
+    def microsecond(self) -> float: ...
+    @property
+    def nanosecond(self) -> float: ...
+    # inject Timedelta properties
+    @property
+    def days(self) -> float: ...
+    @property
+    def seconds(self) -> float: ...
+    @property
+    def microseconds(self) -> float: ...
+    @property
+    def nanoseconds(self) -> float: ...
+    # inject Period properties
+    @property
+    def qyear(self) -> float: ...
+    # comparisons
+    def __eq__(self, other: object, /) -> Literal[False]: ...
+    def __ne__(self, other: object, /) -> Literal[True]: ...
+    def __lt__(self, other: Self | _TimeLike, /) -> Literal[False]: ...
+    def __le__(self, other: Self | _TimeLike, /) -> Literal[False]: ...
+    def __gt__(self, other: Self | _TimeLike, /) -> Literal[False]: ...
+    def __ge__(self, other: Self | _TimeLike, /) -> Literal[False]: ...
+    # unary operators
+    def __pos__(self) -> Self: ...
+    def __neg__(self) -> Self: ...
+    # binary operators
+    def __sub__(self, other: Self | _TimeLike, /) -> Self: ...
+    def __rsub__(self, other: Self | _TimeLike, /) -> Self: ...
+    def __add__(self, other: Self | _TimeLike, /) -> Self: ...
+    def __radd__(self, other: Self | _TimeLike, /) -> Self: ...
+    def __mul__(self, other: float, /) -> Self: ...  # analogous to timedelta
+    def __rmul__(self, other: float, /) -> Self: ...
+    @overload  # analogous to timedelta
+    def __truediv__(self, other: Self | _TimeDelta, /) -> float: ...  # Literal[NaN]
+    @overload
+    def __truediv__(self, other: float, /) -> Self: ...
+    @overload  # analogous to timedelta
+    def __floordiv__(self, other: Self | _TimeDelta, /) -> float: ...  # Literal[NaN]
+    @overload
+    def __floordiv__(self, other: float, /) -> Self: ...
+    # other
+    def __hash__(self) -> int: ...
+    def as_unit(self, unit: TimeUnit, round_ok: bool = ...) -> NaTType: ...
diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..00ef35c50e53251d5ca6f6c6d5ad28a67a695a21
--- /dev/null
+++ b/pandas/_libs/tslibs/np_datetime.pyi
@@ -0,0 +1,27 @@
+import numpy as np
+
+from pandas._typing import npt
+
+class OutOfBoundsDatetime(ValueError): ...
+class OutOfBoundsTimedelta(ValueError): ...
+
+# only exposed for testing
+def py_get_unit_from_dtype(dtype: np.dtype): ...
+def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
+def astype_overflowsafe(
+    values: np.ndarray,
+    dtype: np.dtype,
+    copy: bool = ...,
+    round_ok: bool = ...,
+    is_coerce: bool = ...,
+) -> np.ndarray: ...
+def is_unitless(dtype: np.dtype) -> bool: ...
+def compare_mismatched_resolutions(
+    left: np.ndarray, right: np.ndarray, op
+) -> npt.NDArray[np.bool_]: ...
+def add_overflowsafe(
+    left: npt.NDArray[np.int64],
+    right: npt.NDArray[np.int64],
+) -> npt.NDArray[np.int64]: ...
+def get_supported_dtype(dtype: np.dtype) -> np.dtype: ...
+def is_supported_dtype(dtype: np.dtype) -> bool: ...
diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..eaee7f54b6ea8d8dd7ecedf45edeb2868723ef38
--- /dev/null
+++ b/pandas/_libs/tslibs/offsets.pyi
@@ -0,0 +1,308 @@
+from collections.abc import Collection
+from datetime import (
+    datetime,
+    time,
+    timedelta,
+)
+from typing import (
+    Any,
+    Literal,
+    Self,
+    TypeVar,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs.tslibs.nattype import NaTType
+from pandas._typing import (
+    OffsetCalendar,
+    npt,
+)
+
+_BaseOffsetT = TypeVar("_BaseOffsetT", bound=BaseOffset)
+_DatetimeT = TypeVar("_DatetimeT", bound=datetime)
+_TimedeltaT = TypeVar("_TimedeltaT", bound=timedelta)
+
+_relativedelta_kwds: set[str]
+prefix_mapping: dict[str, type]
+
+class ApplyTypeError(TypeError): ...
+
+class BaseOffset:
+    n: int
+    normalize: bool
+    def __init__(self, n: int = ..., normalize: bool = ...) -> None: ...
+    def __eq__(self, other) -> bool: ...
+    def __ne__(self, other) -> bool: ...
+    def __hash__(self) -> int: ...
+    @property
+    def kwds(self) -> dict: ...
+    @property
+    def base(self) -> BaseOffset: ...
+    @overload
+    def __add__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ...
+    @overload
+    def __add__(self, other: BaseOffset) -> Self: ...
+    @overload
+    def __add__(self, other: _DatetimeT) -> _DatetimeT: ...
+    @overload
+    def __add__(self, other: _TimedeltaT) -> _TimedeltaT: ...
+    @overload
+    def __radd__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ...
+    @overload
+    def __radd__(self, other: BaseOffset) -> Self: ...
+    @overload
+    def __radd__(self, other: _DatetimeT) -> _DatetimeT: ...
+    @overload
+    def __radd__(self, other: _TimedeltaT) -> _TimedeltaT: ...
+    @overload
+    def __radd__(self, other: NaTType) -> NaTType: ...
+    def __sub__(self, other: BaseOffset) -> Self: ...
+    @overload
+    def __rsub__(self, other: npt.NDArray[np.object_]) -> npt.NDArray[np.object_]: ...
+    @overload
+    def __rsub__(self, other: BaseOffset) -> Self: ...
+    @overload
+    def __rsub__(self, other: _DatetimeT) -> _DatetimeT: ...
+    @overload
+    def __rsub__(self, other: _TimedeltaT) -> _TimedeltaT: ...
+    @overload
+    def __mul__(self, other: np.ndarray) -> np.ndarray: ...
+    @overload
+    def __mul__(self, other: int) -> Self: ...
+    @overload
+    def __rmul__(self, other: np.ndarray) -> np.ndarray: ...
+    @overload
+    def __rmul__(self, other: int) -> Self: ...
+    def __neg__(self) -> Self: ...
+    def copy(self) -> Self: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def rule_code(self) -> str: ...
+    @property
+    def freqstr(self) -> str: ...
+    def _apply(self, other): ...
+    def _apply_array(self, dtarr: np.ndarray) -> np.ndarray: ...
+    def rollback(self, dt: datetime) -> datetime: ...
+    def rollforward(self, dt: datetime) -> datetime: ...
+    def is_on_offset(self, dt: datetime) -> bool: ...
+    def __setstate__(self, state) -> None: ...
+    def __getstate__(self): ...
+    @property
+    def nanos(self) -> int: ...
+
+def _get_offset(name: str) -> BaseOffset: ...
+
+class SingleConstructorOffset(BaseOffset):
+    @classmethod
+    def _from_name(cls, suffix: None = ...) -> Self: ...
+    def __reduce__(self): ...
+
+@overload
+def to_offset(freq: None, is_period: bool = ...) -> None: ...
+@overload
+def to_offset(freq: _BaseOffsetT, is_period: bool = ...) -> _BaseOffsetT: ...
+@overload
+def to_offset(freq: timedelta | str, is_period: bool = ...) -> BaseOffset: ...
+
+class Tick(SingleConstructorOffset):
+    _creso: int
+    _prefix: str
+    def __init__(self, n: int = ..., normalize: bool = ...) -> None: ...
+    @property
+    def nanos(self) -> int: ...
+
+def delta_to_tick(delta: timedelta) -> Tick: ...
+
+class Day(BaseOffset): ...
+class Hour(Tick): ...
+class Minute(Tick): ...
+class Second(Tick): ...
+class Milli(Tick): ...
+class Micro(Tick): ...
+class Nano(Tick): ...
+
+class RelativeDeltaOffset(BaseOffset):
+    def __init__(self, n: int = ..., normalize: bool = ..., **kwds: Any) -> None: ...
+
+class BusinessMixin(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., offset: timedelta = ...
+    ) -> None: ...
+
+class BusinessDay(BusinessMixin): ...
+
+class BusinessHour(BusinessMixin):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        start: str | time | Collection[str | time] = ...,
+        end: str | time | Collection[str | time] = ...,
+        offset: timedelta = ...,
+    ) -> None: ...
+
+class WeekOfMonthMixin(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., weekday: int = ...
+    ) -> None: ...
+
+class YearOffset(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., month: int | None = ...
+    ) -> None: ...
+    @property
+    def month(self) -> int: ...
+
+class BYearEnd(YearOffset): ...
+class BYearBegin(YearOffset): ...
+
+class YearEnd(YearOffset):
+    def __new__(
+        cls, n: int = ..., normalize: bool = ..., month: int | None = ...
+    ) -> Self: ...
+
+class YearBegin(YearOffset): ...
+
+class QuarterOffset(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., startingMonth: int | None = ...
+    ) -> None: ...
+
+class BQuarterEnd(QuarterOffset): ...
+class BQuarterBegin(QuarterOffset): ...
+class QuarterEnd(QuarterOffset): ...
+class QuarterBegin(QuarterOffset): ...
+
+class HalfYearOffset(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., startingMonth: int | None = ...
+    ) -> None: ...
+
+class BHalfYearEnd(HalfYearOffset): ...
+class BHalfYearBegin(HalfYearOffset): ...
+class HalfYearEnd(HalfYearOffset): ...
+class HalfYearBegin(HalfYearOffset): ...
+class MonthOffset(SingleConstructorOffset): ...
+class MonthEnd(MonthOffset): ...
+class MonthBegin(MonthOffset): ...
+class BusinessMonthEnd(MonthOffset): ...
+class BusinessMonthBegin(MonthOffset): ...
+
+class SemiMonthOffset(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., day_of_month: int | None = ...
+    ) -> None: ...
+
+class SemiMonthEnd(SemiMonthOffset): ...
+class SemiMonthBegin(SemiMonthOffset): ...
+
+class Week(SingleConstructorOffset):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., weekday: int | None = ...
+    ) -> None: ...
+
+class WeekOfMonth(WeekOfMonthMixin):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., week: int = ..., weekday: int = ...
+    ) -> None: ...
+
+class LastWeekOfMonth(WeekOfMonthMixin):
+    def __init__(
+        self, n: int = ..., normalize: bool = ..., weekday: int = ...
+    ) -> None: ...
+
+class FY5253Mixin(SingleConstructorOffset):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        weekday: int = ...,
+        startingMonth: int = ...,
+        variation: Literal["nearest", "last"] = ...,
+    ) -> None: ...
+
+class FY5253(FY5253Mixin): ...
+
+class FY5253Quarter(FY5253Mixin):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        weekday: int = ...,
+        startingMonth: int = ...,
+        qtr_with_extra_week: int = ...,
+        variation: Literal["nearest", "last"] = ...,
+    ) -> None: ...
+
+class Easter(SingleConstructorOffset):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        method: int = ...,
+    ) -> None: ...
+
+class _CustomBusinessMonth(BusinessMixin):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        weekmask: str = ...,
+        holidays: list | None = ...,
+        calendar: OffsetCalendar | None = ...,
+        offset: timedelta = ...,
+    ) -> None: ...
+
+class CustomBusinessDay(BusinessDay):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        weekmask: str = ...,
+        holidays: list | None = ...,
+        calendar: OffsetCalendar | None = ...,
+        offset: timedelta = ...,
+    ) -> None: ...
+
+class CustomBusinessHour(BusinessHour):
+    def __init__(
+        self,
+        n: int = ...,
+        normalize: bool = ...,
+        weekmask: str = ...,
+        holidays: list | None = ...,
+        calendar: OffsetCalendar | None = ...,
+        start: str | time | Collection[str | time] = ...,
+        end: str | time | Collection[str | time] = ...,
+        offset: timedelta = ...,
+    ) -> None: ...
+
+class CustomBusinessMonthEnd(_CustomBusinessMonth): ...
+class CustomBusinessMonthBegin(_CustomBusinessMonth): ...
+class OffsetMeta(type): ...
+class DateOffset(RelativeDeltaOffset, metaclass=OffsetMeta): ...
+
+BDay = BusinessDay
+BMonthEnd = BusinessMonthEnd
+BMonthBegin = BusinessMonthBegin
+CBMonthEnd = CustomBusinessMonthEnd
+CBMonthBegin = CustomBusinessMonthBegin
+CDay = CustomBusinessDay
+
+def roll_qtrday(
+    other: datetime, n: int, month: int, day_opt: str, modby: int
+) -> int: ...
+
+INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"]
+
+def shift_months(
+    dtindex: npt.NDArray[np.int64],
+    months: int,
+    day_opt: str | None = ...,
+    reso: int = ...,
+) -> npt.NDArray[np.int64]: ...
+
+_offset_map: dict[str, BaseOffset]
diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..845bd9a5a5635fcf87975dd0d2d5b383c6d4ae58
--- /dev/null
+++ b/pandas/_libs/tslibs/parsing.pyi
@@ -0,0 +1,30 @@
+from datetime import datetime
+
+import numpy as np
+
+from pandas._typing import npt
+
+class DateParseError(ValueError): ...
+
+def py_parse_datetime_string(
+    date_string: str,
+    dayfirst: bool = ...,
+    yearfirst: bool = ...,
+) -> datetime: ...
+def parse_datetime_string_with_reso(
+    date_string: str,
+    freq: str | None = ...,
+    dayfirst: bool | None = ...,
+    yearfirst: bool | None = ...,
+) -> tuple[datetime, str]: ...
+def _does_string_look_like_datetime(py_string: str) -> bool: ...
+def quarter_to_myear(year: int, quarter: int, freq: str) -> tuple[int, int]: ...
+def try_parse_dates(
+    values: npt.NDArray[np.object_],  # object[:]
+    parser,
+) -> npt.NDArray[np.object_]: ...
+def guess_datetime_format(
+    dt_str: str,
+    dayfirst: bool | None = ...,
+) -> str | None: ...
+def get_rule_month(source: str) -> str: ...
diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..5cb9f891b312a566545b51690f99343867547b89
--- /dev/null
+++ b/pandas/_libs/tslibs/period.pyi
@@ -0,0 +1,135 @@
+from datetime import timedelta
+from typing import Literal
+
+import numpy as np
+
+from pandas._libs.tslibs.dtypes import PeriodDtypeBase
+from pandas._libs.tslibs.nattype import NaTType
+from pandas._libs.tslibs.offsets import BaseOffset
+from pandas._libs.tslibs.timestamps import Timestamp
+from pandas._typing import (
+    Frequency,
+    npt,
+)
+
+INVALID_FREQ_ERR_MSG: str
+DIFFERENT_FREQ: str
+
+class IncompatibleFrequency(TypeError): ...
+
+def periodarr_to_dt64arr(
+    periodarr: npt.NDArray[np.int64],  # const int64_t[:]
+    freq: int,
+) -> npt.NDArray[np.int64]: ...
+def period_asfreq_arr(
+    arr: npt.NDArray[np.int64],
+    freq1: int,
+    freq2: int,
+    end: bool,
+) -> npt.NDArray[np.int64]: ...
+def get_period_field_arr(
+    field: str,
+    arr: npt.NDArray[np.int64],  # const int64_t[:]
+    freq: int,
+) -> npt.NDArray[np.int64]: ...
+def from_ordinals(
+    values: npt.NDArray[np.int64],  # const int64_t[:]
+    freq: timedelta | BaseOffset | str,
+) -> npt.NDArray[np.int64]: ...
+def extract_ordinals(
+    values: npt.NDArray[np.object_],
+    freq: Frequency | int,
+) -> npt.NDArray[np.int64]: ...
+def extract_freq(
+    values: npt.NDArray[np.object_],
+) -> BaseOffset: ...
+def period_array_strftime(
+    values: npt.NDArray[np.int64],
+    dtype_code: int,
+    na_rep,
+    date_format: str | None,
+) -> npt.NDArray[np.object_]: ...
+
+# exposed for tests
+def period_asfreq(ordinal: int, freq1: int, freq2: int, end: bool) -> int: ...
+def period_ordinal(
+    y: int, m: int, d: int, h: int, min: int, s: int, us: int, ps: int, freq: int
+) -> int: ...
+def freq_to_dtype_code(freq: BaseOffset) -> int: ...
+def validate_end_alias(how: str) -> Literal["E", "S"]: ...
+
+class PeriodMixin:
+    @property
+    def end_time(self) -> Timestamp: ...
+    @property
+    def start_time(self) -> Timestamp: ...
+    def _require_matching_freq(self, other: BaseOffset, base: bool = ...) -> None: ...
+
+class Period(PeriodMixin):
+    ordinal: int  # int64_t
+    freq: BaseOffset
+    _dtype: PeriodDtypeBase
+
+    # error: "__new__" must return a class instance (got "Union[Period, NaTType]")
+    def __new__(  # type: ignore[misc]
+        cls,
+        value=...,
+        freq: int | str | BaseOffset | None = ...,
+        ordinal: int | None = ...,
+        year: int | None = ...,
+        month: int | None = ...,
+        quarter: int | None = ...,
+        day: int | None = ...,
+        hour: int | None = ...,
+        minute: int | None = ...,
+        second: int | None = ...,
+    ) -> Period | NaTType: ...
+    @classmethod
+    def _maybe_convert_freq(cls, freq) -> BaseOffset: ...
+    @classmethod
+    def _from_ordinal(cls, ordinal: int, freq: BaseOffset) -> Period: ...
+    @classmethod
+    def now(cls, freq: Frequency) -> Period: ...
+    def strftime(self, fmt: str | None) -> str: ...
+    def to_timestamp(
+        self,
+        freq: str | BaseOffset | None = ...,
+        how: str = ...,
+    ) -> Timestamp: ...
+    def asfreq(self, freq: str | BaseOffset, how: str = ...) -> Period: ...
+    @property
+    def freqstr(self) -> str: ...
+    @property
+    def is_leap_year(self) -> bool: ...
+    @property
+    def daysinmonth(self) -> int: ...
+    @property
+    def days_in_month(self) -> int: ...
+    @property
+    def qyear(self) -> int: ...
+    @property
+    def quarter(self) -> int: ...
+    @property
+    def day_of_year(self) -> int: ...
+    @property
+    def weekday(self) -> int: ...
+    @property
+    def day_of_week(self) -> int: ...
+    @property
+    def week(self) -> int: ...
+    @property
+    def weekofyear(self) -> int: ...
+    @property
+    def second(self) -> int: ...
+    @property
+    def minute(self) -> int: ...
+    @property
+    def hour(self) -> int: ...
+    @property
+    def day(self) -> int: ...
+    @property
+    def month(self) -> int: ...
+    @property
+    def year(self) -> int: ...
+    def __sub__(self, other) -> Period | BaseOffset: ...
+    def __add__(self, other) -> Period: ...
diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..0ec1a1e25a2b3cfe974baebfe32d686435f73e11
--- /dev/null
+++ b/pandas/_libs/tslibs/strptime.pyi
@@ -0,0 +1,14 @@
+import numpy as np
+
+from pandas._typing import npt
+
+def array_strptime(
+    values: npt.NDArray[np.object_],
+    fmt: str | None,
+    exact: bool = ...,
+    errors: str = ...,
+    utc: bool = ...,
+    creso: int = ...,  # NPY_DATETIMEUNIT
+) -> tuple[np.ndarray, np.ndarray]: ...
+
+# first ndarray is M8[ns], second is object ndarray of tzinfo | None
diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..a04387eb09d6b94d0dee888dba14141b1ea128f2
--- /dev/null
+++ b/pandas/_libs/tslibs/timedeltas.pyi
@@ -0,0 +1,168 @@
+from datetime import timedelta
+from typing import (
+    ClassVar,
+    Literal,
+    Self,
+    TypeAlias,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs.tslibs import (
+    NaTType,
+    Tick,
+)
+from pandas._typing import (
+    Frequency,
+    TimeUnit,
+    npt,
+)
+
+# This should be kept consistent with the keys in the dict timedelta_abbrevs
+# in pandas/_libs/tslibs/timedeltas.pyx
+UnitChoices: TypeAlias = Literal[
+    "Y",
+    "y",
+    "M",
+    "W",
+    "w",
+    "D",
+    "d",
+    "days",
+    "day",
+    "hours",
+    "hour",
+    "hr",
+    "h",
+    "m",
+    "minute",
+    "min",
+    "minutes",
+    "s",
+    "seconds",
+    "sec",
+    "second",
+    "ms",
+    "milliseconds",
+    "millisecond",
+    "milli",
+    "millis",
+    "us",
+    "microseconds",
+    "microsecond",
+    "µs",
+    "micro",
+    "micros",
+    "ns",
+    "nanoseconds",
+    "nano",
+    "nanos",
+    "nanosecond",
+]
+
+def get_unit_for_round(freq, creso: int) -> int: ...
+def disallow_ambiguous_unit(unit: str | None) -> None: ...
+def ints_to_pytimedelta(
+    m8values: npt.NDArray[np.timedelta64],
+    box: bool = ...,
+) -> npt.NDArray[np.object_]: ...
+def array_to_timedelta64(
+    values: npt.NDArray[np.object_],
+    unit: str | None = ...,
+    errors: str = ...,
+    creso: int = ...,
+) -> np.ndarray: ...  # np.ndarray[m8ns]
+def parse_timedelta_unit(unit: str | None) -> UnitChoices: ...
+def delta_to_nanoseconds(
+    delta: np.timedelta64 | timedelta | Tick,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+    round_ok: bool = ...,
+) -> int: ...
+def floordiv_object_array(
+    left: np.ndarray, right: npt.NDArray[np.object_]
+) -> np.ndarray: ...
+def truediv_object_array(
+    left: np.ndarray, right: npt.NDArray[np.object_]
+) -> np.ndarray: ...
+
+class Timedelta(timedelta):
+    _creso: int
+    min: ClassVar[Timedelta]
+    max: ClassVar[Timedelta]
+    resolution: ClassVar[Timedelta]
+    value: int  # np.int64
+    _value: int  # np.int64
+    # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
+    def __new__(  # type: ignore[misc]
+        cls: type[Self],
+        value=...,
+        unit: str | None = ...,
+        **kwargs: float | np.integer | np.floating,
+    ) -> Self | NaTType: ...
+    @classmethod
+    def _from_value_and_reso(cls, value: np.int64, reso: int) -> Timedelta: ...
+    @property
+    def days(self) -> int: ...
+    @property
+    def seconds(self) -> int: ...
+    @property
+    def microseconds(self) -> int: ...
+    def total_seconds(self) -> float: ...
+    def to_pytimedelta(self) -> timedelta: ...
+    def to_timedelta64(self) -> np.timedelta64: ...
+    @property
+    def asm8(self) -> np.timedelta64: ...
+    # TODO: round/floor/ceil could return NaT?
+    def round(self, freq: Frequency) -> Self: ...
+    def floor(self, freq: Frequency) -> Self: ...
+    def ceil(self, freq: Frequency) -> Self: ...
+    @property
+    def resolution_string(self) -> str: ...
+    def __add__(self, other: timedelta) -> Timedelta: ...
+    def __radd__(self, other: timedelta) -> Timedelta: ...
+    def __sub__(self, other: timedelta) -> Timedelta: ...
+    def __rsub__(self, other: timedelta) -> Timedelta: ...
+    def __neg__(self) -> Timedelta: ...
+    def __pos__(self) -> Timedelta: ...
+    def __abs__(self) -> Timedelta: ...
+    def __mul__(self, other: float) -> Timedelta: ...
+    def __rmul__(self, other: float) -> Timedelta: ...
+    # error: Signature of "__floordiv__" incompatible with supertype "timedelta"
+    @overload  # type: ignore[override]
+    def __floordiv__(self, other: timedelta) -> int: ...
+    @overload
+    def __floordiv__(self, other: float) -> Timedelta: ...
+    @overload
+    def __floordiv__(
+        self, other: npt.NDArray[np.timedelta64]
+    ) -> npt.NDArray[np.intp]: ...
+    @overload
+    def __floordiv__(
+        self, other: npt.NDArray[np.number]
+    ) -> npt.NDArray[np.timedelta64] | Timedelta: ...
+    @overload
+    def __rfloordiv__(self, other: timedelta | str) -> int: ...
+    @overload
+    def __rfloordiv__(self, other: None | NaTType) -> NaTType: ...
+    @overload
+    def __rfloordiv__(self, other: np.ndarray) -> npt.NDArray[np.timedelta64]: ...
+    @overload
+    def __truediv__(self, other: timedelta) -> float: ...
+    @overload
+    def __truediv__(self, other: float) -> Timedelta: ...
+    def __mod__(self, other: timedelta) -> Timedelta: ...
+    def __divmod__(self, other: timedelta) -> tuple[int, Timedelta]: ...
+    def __le__(self, other: timedelta) -> bool: ...
+    def __lt__(self, other: timedelta) -> bool: ...
+    def __ge__(self, other: timedelta) -> bool: ...
+    def __gt__(self, other: timedelta) -> bool: ...
+    def __hash__(self) -> int: ...
+    def isoformat(self) -> str: ...
+    def to_numpy(
+        self, dtype: npt.DTypeLike = ..., copy: bool = False
+    ) -> np.timedelta64: ...
+    def view(self, dtype: npt.DTypeLike) -> object: ...
+    @property
+    def unit(self) -> TimeUnit: ...
+    def as_unit(self, unit: TimeUnit, round_ok: bool = ...) -> Timedelta: ...
diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..d06c78b22626a325f01f4ab3466271256249d415
--- /dev/null
+++ b/pandas/_libs/tslibs/timestamps.pyi
@@ -0,0 +1,242 @@
+from datetime import (
+    date as _date,
+    datetime,
+    time as _time,
+    timedelta,
+    tzinfo as _tzinfo,
+)
+from time import struct_time
+from typing import (
+    ClassVar,
+    Literal,
+    Self,
+    TypeAlias,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs.tslibs import (
+    BaseOffset,
+    NaTType,
+    Period,
+    Tick,
+    Timedelta,
+)
+from pandas._typing import (
+    TimestampNonexistent,
+    TimeUnit,
+)
+
+_TimeZones: TypeAlias = str | _tzinfo | None | int
+
+def integer_op_not_supported(obj: object) -> TypeError: ...
+
+class Timestamp(datetime):
+    _creso: int
+    min: ClassVar[Timestamp]
+    max: ClassVar[Timestamp]
+
+    resolution: ClassVar[Timedelta]
+    _value: int  # np.int64
+    # error: "__new__" must return a class instance (got "Union[Timestamp, NaTType]")
+    def __new__(  # type: ignore[misc]
+        cls: type[Self],
+        ts_input: np.integer | float | str | _date | datetime | np.datetime64 = ...,
+        year: int | None = ...,
+        month: int | None = ...,
+        day: int | None = ...,
+        hour: int | None = ...,
+        minute: int | None = ...,
+        second: int | None = ...,
+        microsecond: int | None = ...,
+        tzinfo: _tzinfo | None = ...,
+        *,
+        nanosecond: int | None = ...,
+        tz: _TimeZones = ...,
+        unit: str | int | None = ...,
+        fold: int | None = ...,
+    ) -> Self | NaTType: ...
+    @classmethod
+    def _from_value_and_reso(
+        cls, value: int, reso: int, tz: _TimeZones
+    ) -> Timestamp: ...
+    @property
+    def value(self) -> int: ...  # np.int64
+    @property
+    def year(self) -> int: ...
+    @property
+    def month(self) -> int: ...
+    @property
+    def day(self) -> int: ...
+    @property
+    def hour(self) -> int: ...
+    @property
+    def minute(self) -> int: ...
+    @property
+    def second(self) -> int: ...
+    @property
+    def microsecond(self) -> int: ...
+    @property
+    def nanosecond(self) -> int: ...
+    @property
+    def tzinfo(self) -> _tzinfo | None: ...
+    @property
+    def tz(self) -> _tzinfo | None: ...
+    @property
+    def fold(self) -> int: ...
+    @classmethod
+    def fromtimestamp(cls, ts: float, tz: _TimeZones = ...) -> Self: ...
+    @classmethod
+    def utcfromtimestamp(cls, ts: float) -> Self: ...
+    @classmethod
+    def today(cls, tz: _TimeZones = ...) -> Self: ...
+    @classmethod
+    def fromordinal(
+        cls,
+        ordinal: int,
+        tz: _TimeZones = ...,
+    ) -> Self: ...
+    @classmethod
+    def now(cls, tz: _TimeZones = ...) -> Self: ...
+    @classmethod
+    def utcnow(cls) -> Self: ...
+    # error: Signature of "combine" incompatible with supertype "datetime"
+    @classmethod
+    def combine(  # type: ignore[override]
+        cls, date: _date, time: _time
+    ) -> datetime: ...
+    @classmethod
+    def fromisoformat(cls, date_string: str) -> Self: ...
+    def strftime(self, format: str) -> str: ...
+    def __format__(self, fmt: str) -> str: ...
+    def toordinal(self) -> int: ...
+    def timetuple(self) -> struct_time: ...
+    def timestamp(self) -> float: ...
+    def utctimetuple(self) -> struct_time: ...
+    def date(self) -> _date: ...
+    def time(self) -> _time: ...
+    def timetz(self) -> _time: ...
+    # LSP violation: nanosecond is not present in datetime.datetime.replace
+    # and has positional args following it
+    def replace(  # type: ignore[override]
+        self,
+        year: int | None = ...,
+        month: int | None = ...,
+        day: int | None = ...,
+        hour: int | None = ...,
+        minute: int | None = ...,
+        second: int | None = ...,
+        microsecond: int | None = ...,
+        nanosecond: int | None = ...,
+        tzinfo: _tzinfo | type[object] | None = ...,
+        fold: int | None = ...,
+    ) -> Self: ...
+    # LSP violation: datetime.datetime.astimezone has a default value for tz
+    def astimezone(self, tz: _TimeZones) -> Self: ...  # type: ignore[override]
+    def ctime(self) -> str: ...
+    def isoformat(self, sep: str = ..., timespec: str = ...) -> str: ...
+    @classmethod
+    def strptime(
+        # Note: strptime is actually disabled and raises NotImplementedError
+        cls,
+        date_string: str,
+        format: str,
+    ) -> Self: ...
+    def utcoffset(self) -> timedelta | None: ...
+    def tzname(self) -> str | None: ...
+    def dst(self) -> timedelta | None: ...
+    def __le__(self, other: datetime) -> bool: ...  # type: ignore[override]
+    def __lt__(self, other: datetime) -> bool: ...  # type: ignore[override]
+    def __ge__(self, other: datetime) -> bool: ...  # type: ignore[override]
+    def __gt__(self, other: datetime) -> bool: ...  # type: ignore[override]
+    # error: Signature of "__add__" incompatible with supertype "date"/"datetime"
+    @overload  # type: ignore[override]
+    def __add__(self, other: np.ndarray) -> np.ndarray: ...
+    @overload
+    def __add__(self, other: timedelta | np.timedelta64 | Tick) -> Self: ...
+    def __radd__(self, other: timedelta) -> Self: ...
+    @overload  # type: ignore[override]
+    def __sub__(self, other: datetime) -> Timedelta: ...
+    @overload
+    def __sub__(self, other: timedelta | np.timedelta64 | Tick) -> Self: ...
+    def __hash__(self) -> int: ...
+    def weekday(self) -> int: ...
+    def isoweekday(self) -> int: ...
+    # Return type "Tuple[int, int, int]" of "isocalendar" incompatible with return
+    # type "_IsoCalendarDate" in supertype "date"
+    def isocalendar(self) -> tuple[int, int, int]: ...  # type: ignore[override]
+    @property
+    def is_leap_year(self) -> bool: ...
+    @property
+    def is_month_start(self) -> bool: ...
+    @property
+    def is_quarter_start(self) -> bool: ...
+    @property
+    def is_year_start(self) -> bool: ...
+    @property
+    def is_month_end(self) -> bool: ...
+    @property
+    def is_quarter_end(self) -> bool: ...
+    @property
+    def is_year_end(self) -> bool: ...
+    def to_pydatetime(self, warn: bool = ...) -> datetime: ...
+    def to_datetime64(self) -> np.datetime64: ...
+    def to_period(self, freq: BaseOffset | str | None = None) -> Period: ...
+    def to_julian_date(self) -> np.float64: ...
+    @property
+    def asm8(self) -> np.datetime64: ...
+    def tz_convert(self, tz: _TimeZones) -> Self: ...
+    # TODO: could return NaT?
+    def tz_localize(
+        self,
+        tz: _TimeZones,
+        ambiguous: bool | Literal["raise", "NaT"] = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> Self: ...
+    def normalize(self) -> Self: ...
+    # TODO: round/floor/ceil could return NaT?
+    def round(
+        self,
+        freq: str,
+        ambiguous: bool | Literal["raise", "NaT"] = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> Self: ...
+    def floor(
+        self,
+        freq: str,
+        ambiguous: bool | Literal["raise", "NaT"] = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> Self: ...
+    def ceil(
+        self,
+        freq: str,
+        ambiguous: bool | Literal["raise", "NaT"] = ...,
+        nonexistent: TimestampNonexistent = ...,
+    ) -> Self: ...
+    def day_name(self, locale: str | None = ...) -> str: ...
+    def month_name(self, locale: str | None = ...) -> str: ...
+    @property
+    def day_of_week(self) -> int: ...
+    @property
+    def dayofweek(self) -> int: ...
+    @property
+    def day_of_year(self) -> int: ...
+    @property
+    def dayofyear(self) -> int: ...
+    @property
+    def quarter(self) -> int: ...
+    @property
+    def week(self) -> int: ...
+    def to_numpy(
+        self, dtype: np.dtype | None = ..., copy: bool = ...
+    ) -> np.datetime64: ...
+    @property
+    def _date_repr(self) -> str: ...
+    @property
+    def days_in_month(self) -> int: ...
+    @property
+    def daysinmonth(self) -> int: ...
+    @property
+    def unit(self) -> TimeUnit: ...
+    def as_unit(self, unit: TimeUnit, round_ok: bool = ...) -> Timestamp: ...
diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..26ffa568a848001b4b80bf0525d60527682a9be4
--- /dev/null
+++ b/pandas/_libs/tslibs/timezones.pyi
@@ -0,0 +1,21 @@
+from collections.abc import Callable
+from datetime import (
+    datetime,
+    tzinfo,
+)
+
+import numpy as np
+
+# imported from dateutil.tz
+dateutil_gettz: Callable[[str], tzinfo]
+
+def tz_standardize(tz: tzinfo) -> tzinfo: ...
+def tz_compare(start: tzinfo | None, end: tzinfo | None) -> bool: ...
+def infer_tzinfo(
+    start: datetime | None,
+    end: datetime | None,
+) -> tzinfo | None: ...
+def maybe_get_tz(tz: str | int | np.int64 | tzinfo | None) -> tzinfo | None: ...
+def get_timezone(tz: tzinfo) -> tzinfo | str: ...
+def is_utc(tz: tzinfo | None) -> bool: ...
+def is_fixed_offset(tz: tzinfo) -> bool: ...
diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..07ee46858577aeb3a373b30d292d0803c36c3d01
--- /dev/null
+++ b/pandas/_libs/tslibs/tzconversion.pyi
@@ -0,0 +1,21 @@
+from collections.abc import Iterable
+from datetime import (
+    timedelta,
+    tzinfo,
+)
+
+import numpy as np
+
+from pandas._typing import npt
+
+# tz_convert_from_utc_single exposed for testing
+def tz_convert_from_utc_single(
+    utc_val: np.int64, tz: tzinfo, creso: int = ...
+) -> np.int64: ...
+def tz_localize_to_utc(
+    vals: npt.NDArray[np.int64],
+    tz: tzinfo | None,
+    ambiguous: str | bool | Iterable[bool] | None = ...,
+    nonexistent: str | timedelta | np.timedelta64 | None = ...,
+    creso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int64]: ...
diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..f377c2e26ab81e4cf767f6614a430a602fcfd8d9
--- /dev/null
+++ b/pandas/_libs/tslibs/vectorized.pyi
@@ -0,0 +1,41 @@
+# For cython types that cannot be represented precisely, closest-available
+# python equivalents are used, and the precise types kept as adjacent comments.
+from datetime import tzinfo
+
+import numpy as np
+
+from pandas._libs.tslibs.dtypes import Resolution
+from pandas._typing import npt
+
+def dt64arr_to_periodarr(
+    stamps: npt.NDArray[np.int64],
+    freq: int,
+    tz: tzinfo | None,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int64]: ...
+def is_date_array_normalized(
+    stamps: npt.NDArray[np.int64],
+    tz: tzinfo | None,
+    reso: int,  # NPY_DATETIMEUNIT
+) -> bool: ...
+def normalize_i8_timestamps(
+    stamps: npt.NDArray[np.int64],
+    tz: tzinfo | None,
+    reso: int,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int64]: ...
+def get_resolution(
+    stamps: npt.NDArray[np.int64],
+    tz: tzinfo | None = ...,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> Resolution: ...
+def ints_to_pydatetime(
+    stamps: npt.NDArray[np.int64],
+    tz: tzinfo | None = ...,
+    box: str = ...,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.object_]: ...
+def tz_convert_from_utc(
+    stamps: npt.NDArray[np.int64],
+    tz: tzinfo | None,
+    reso: int = ...,  # NPY_DATETIMEUNIT
+) -> npt.NDArray[np.int64]: ...
diff --git a/pandas/_libs/window/__init__.py b/pandas/_libs/window/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..99413751cd5c2f88466556d05b12939fd8adb148
--- /dev/null
+++ b/pandas/_libs/window/aggregations.pyi
@@ -0,0 +1,145 @@
+from collections.abc import Callable
+from typing import (
+    Any,
+    Literal,
+)
+
+import numpy as np
+
+from pandas._typing import (
+    WindowingRankType,
+    npt,
+)
+
+def roll_sum(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_mean(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_var(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+    ddof: int = ...,
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_skew(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_kurt(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_median_c(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_max(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_min(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_first(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_last(
+    values: np.ndarray,  # np.ndarray[np.float64]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_quantile(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+    quantile: float,  # float64_t
+    interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"],
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_rank(
+    values: np.ndarray,
+    start: np.ndarray,
+    end: np.ndarray,
+    minp: int,
+    percentile: bool,
+    method: WindowingRankType,
+    ascending: bool,
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_nunique(
+    values: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+) -> np.ndarray: ...  # np.ndarray[float]
+def roll_apply(
+    obj: object,
+    start: np.ndarray,  # np.ndarray[np.int64]
+    end: np.ndarray,  # np.ndarray[np.int64]
+    minp: int,  # int64_t
+    function: Callable[..., Any],
+    raw: bool,
+    args: tuple[Any, ...],
+    kwargs: dict[str, Any],
+) -> npt.NDArray[np.float64]: ...
+def roll_weighted_sum(
+    values: np.ndarray,  # const float64_t[:]
+    weights: np.ndarray,  # const float64_t[:]
+    minp: int,
+) -> np.ndarray: ...  # np.ndarray[np.float64]
+def roll_weighted_mean(
+    values: np.ndarray,  # const float64_t[:]
+    weights: np.ndarray,  # const float64_t[:]
+    minp: int,
+) -> np.ndarray: ...  # np.ndarray[np.float64]
+def roll_weighted_var(
+    values: np.ndarray,  # const float64_t[:]
+    weights: np.ndarray,  # const float64_t[:]
+    minp: int,  # int64_t
+    ddof: int,  # unsigned int
+) -> np.ndarray: ...  # np.ndarray[np.float64]
+def ewm(
+    vals: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # const int64_t[:]
+    end: np.ndarray,  # const int64_t[:]
+    minp: int,
+    com: float,  # float64_t
+    adjust: bool,
+    ignore_na: bool,
+    deltas: np.ndarray | None = None,  # const float64_t[:]
+    normalize: bool = True,
+) -> np.ndarray: ...  # np.ndarray[np.float64]
+def ewmcov(
+    input_x: np.ndarray,  # const float64_t[:]
+    start: np.ndarray,  # const int64_t[:]
+    end: np.ndarray,  # const int64_t[:]
+    minp: int,
+    input_y: np.ndarray,  # const float64_t[:]
+    com: float,  # float64_t
+    adjust: bool,
+    ignore_na: bool,
+    bias: bool,
+) -> np.ndarray: ...  # np.ndarray[np.float64]
diff --git a/pandas/_libs/window/indexers.pyi b/pandas/_libs/window/indexers.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..c9bc64be34ac9a41d14fef33b0fc76bdf66527e9
--- /dev/null
+++ b/pandas/_libs/window/indexers.pyi
@@ -0,0 +1,12 @@
+import numpy as np
+
+from pandas._typing import npt
+
+def calculate_variable_window_bounds(
+    num_values: int,  # int64_t
+    window_size: int,  # int64_t
+    min_periods,
+    center: bool,
+    closed: str | None,
+    index: np.ndarray,  # const int64_t[:]
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]: ...
diff --git a/pandas/_libs/writers.cpython-312-x86_64-linux-gnu.so b/pandas/_libs/writers.cpython-312-x86_64-linux-gnu.so
new file mode 100644
index 0000000000000000000000000000000000000000..d4abc5f481e8c7c7f33af82423db15f9b1de8ee2
Binary files /dev/null and b/pandas/_libs/writers.cpython-312-x86_64-linux-gnu.so differ
diff --git a/pandas/_libs/writers.pyi b/pandas/_libs/writers.pyi
new file mode 100644
index 0000000000000000000000000000000000000000..7b41856525dadf79a2bf4b29c7ddebfedaa880db
--- /dev/null
+++ b/pandas/_libs/writers.pyi
@@ -0,0 +1,20 @@
+import numpy as np
+
+from pandas._typing import ArrayLike
+
+def write_csv_rows(
+    data: list[ArrayLike],
+    data_index: np.ndarray,
+    nlevels: int,
+    cols: np.ndarray,
+    writer: object,  # _csv.writer
+) -> None: ...
+def convert_json_to_lines(arr: str) -> str: ...
+def max_len_string_array(
+    arr: np.ndarray,  # pandas_string[:]
+) -> int: ...
+def word_len(val: object) -> int: ...
+def string_array_replace_from_nan_rep(
+    arr: np.ndarray,  # np.ndarray[object, ndim=1]
+    nan_rep: object,
+) -> None: ...
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad323c3347b85ef636a9290ef1f35fd8fb45e51f
--- /dev/null
+++ b/pandas/_testing/__init__.py
@@ -0,0 +1,645 @@
+from __future__ import annotations
+
+from concurrent.futures import ThreadPoolExecutor
+from decimal import Decimal
+import operator
+import os
+from sys import byteorder
+import threading
+from typing import (
+    TYPE_CHECKING,
+    ContextManager,
+)
+
+import numpy as np
+
+from pandas._config import using_string_dtype
+from pandas._config.localization import (
+    can_set_locale,
+    get_locales,
+    set_locale,
+)
+
+from pandas.compat import HAS_PYARROW
+
+import pandas as pd
+from pandas import (
+    ArrowDtype,
+    DataFrame,
+    Index,
+    MultiIndex,
+    RangeIndex,
+    Series,
+)
+from pandas._testing._io import (
+    round_trip_pathlib,
+    round_trip_pickle,
+    write_to_compressed,
+)
+from pandas._testing._warnings import (
+    assert_produces_warning,
+    maybe_produces_warning,
+)
+from pandas._testing.asserters import (
+    assert_almost_equal,
+    assert_attr_equal,
+    assert_categorical_equal,
+    assert_class_equal,
+    assert_contains_all,
+    assert_copy,
+    assert_datetime_array_equal,
+    assert_dict_equal,
+    assert_equal,
+    assert_extension_array_equal,
+    assert_frame_equal,
+    assert_index_equal,
+    assert_indexing_slices_equivalent,
+    assert_interval_array_equal,
+    assert_is_sorted,
+    assert_metadata_equivalent,
+    assert_numpy_array_equal,
+    assert_period_array_equal,
+    assert_series_equal,
+    assert_sp_array_equal,
+    assert_timedelta_array_equal,
+    raise_assert_detail,
+)
+from pandas._testing.compat import (
+    get_dtype,
+    get_obj,
+)
+from pandas._testing.contexts import (
+    decompress_file,
+    raises_chained_assignment_error,
+    set_timezone,
+    with_csv_dialect,
+)
+from pandas.core.arrays import (
+    ArrowExtensionArray,
+    BaseMaskedArray,
+    NumpyExtensionArray,
+)
+from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
+from pandas.core.construction import extract_array
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from pandas._typing import (
+        Dtype,
+        NpDtype,
+    )
+
+
+UNSIGNED_INT_NUMPY_DTYPES: list[NpDtype] = ["uint8", "uint16", "uint32", "uint64"]
+UNSIGNED_INT_EA_DTYPES: list[Dtype] = ["UInt8", "UInt16", "UInt32", "UInt64"]
+SIGNED_INT_NUMPY_DTYPES: list[NpDtype] = [int, "int8", "int16", "int32", "int64"]
+SIGNED_INT_EA_DTYPES: list[Dtype] = ["Int8", "Int16", "Int32", "Int64"]
+ALL_INT_NUMPY_DTYPES = UNSIGNED_INT_NUMPY_DTYPES + SIGNED_INT_NUMPY_DTYPES
+ALL_INT_EA_DTYPES = UNSIGNED_INT_EA_DTYPES + SIGNED_INT_EA_DTYPES
+ALL_INT_DTYPES: list[Dtype] = [*ALL_INT_NUMPY_DTYPES, *ALL_INT_EA_DTYPES]
+
+FLOAT_NUMPY_DTYPES: list[NpDtype] = [float, "float32", "float64"]
+FLOAT_EA_DTYPES: list[Dtype] = ["Float32", "Float64"]
+ALL_FLOAT_DTYPES: list[Dtype] = [*FLOAT_NUMPY_DTYPES, *FLOAT_EA_DTYPES]
+
+COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"]
+if using_string_dtype():
+    STRING_DTYPES: list[Dtype] = ["U"]
+else:
+    STRING_DTYPES: list[Dtype] = [str, "str", "U"]  # type: ignore[no-redef]
+COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES]
+
+DATETIME64_DTYPES: list[Dtype] = ["datetime64[ns]", "M8[ns]"]
+TIMEDELTA64_DTYPES: list[Dtype] = ["timedelta64[ns]", "m8[ns]"]
+
+BOOL_DTYPES: list[Dtype] = [bool, "bool"]
+BYTES_DTYPES: list[Dtype] = [bytes, "bytes"]
+OBJECT_DTYPES: list[Dtype] = [object, "object"]
+
+ALL_REAL_NUMPY_DTYPES = FLOAT_NUMPY_DTYPES + ALL_INT_NUMPY_DTYPES
+ALL_REAL_EXTENSION_DTYPES = FLOAT_EA_DTYPES + ALL_INT_EA_DTYPES
+ALL_REAL_DTYPES: list[Dtype] = [*ALL_REAL_NUMPY_DTYPES, *ALL_REAL_EXTENSION_DTYPES]
+ALL_NUMERIC_DTYPES: list[Dtype] = [*ALL_REAL_DTYPES, *COMPLEX_DTYPES]
+
+ALL_NUMPY_DTYPES = (
+    ALL_REAL_NUMPY_DTYPES
+    + COMPLEX_DTYPES
+    + STRING_DTYPES
+    + DATETIME64_DTYPES
+    + TIMEDELTA64_DTYPES
+    + BOOL_DTYPES
+    + OBJECT_DTYPES
+    + BYTES_DTYPES
+)
+
+NARROW_NP_DTYPES = [
+    np.float16,
+    np.float32,
+    np.int8,
+    np.int16,
+    np.int32,
+    np.uint8,
+    np.uint16,
+    np.uint32,
+]
+
+PYTHON_DATA_TYPES = [
+    str,
+    int,
+    float,
+    complex,
+    list,
+    tuple,
+    range,
+    dict,
+    set,
+    frozenset,
+    bool,
+    bytes,
+    bytearray,
+    memoryview,
+]
+
+ENDIAN = {"little": "<", "big": ">"}[byteorder]
+
+NULL_OBJECTS = [None, np.nan, pd.NaT, float("nan"), pd.NA, Decimal("NaN")]
+NP_NAT_OBJECTS = [
+    cls("NaT", unit)
+    for cls in [np.datetime64, np.timedelta64]
+    for unit in [
+        "Y",
+        "M",
+        "W",
+        "D",
+        "h",
+        "m",
+        "s",
+        "ms",
+        "us",
+        "ns",
+        "ps",
+        "fs",
+        "as",
+    ]
+]
+
+if HAS_PYARROW:
+    import pyarrow as pa
+
+    UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
+    SIGNED_INT_PYARROW_DTYPES = [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+    ALL_INT_PYARROW_DTYPES = UNSIGNED_INT_PYARROW_DTYPES + SIGNED_INT_PYARROW_DTYPES
+    ALL_INT_PYARROW_DTYPES_STR_REPR = [
+        str(ArrowDtype(typ)) for typ in ALL_INT_PYARROW_DTYPES
+    ]
+
+    # pa.float16 doesn't seem supported
+    # https://github.com/apache/arrow/blob/master/python/pyarrow/src/arrow/python/helpers.cc#L86
+    FLOAT_PYARROW_DTYPES = [pa.float32(), pa.float64()]
+    FLOAT_PYARROW_DTYPES_STR_REPR = [
+        str(ArrowDtype(typ)) for typ in FLOAT_PYARROW_DTYPES
+    ]
+    DECIMAL_PYARROW_DTYPES = [pa.decimal128(7, 3)]
+    STRING_PYARROW_DTYPES = [pa.string()]
+    BINARY_PYARROW_DTYPES = [pa.binary()]
+
+    TIME_PYARROW_DTYPES = [
+        pa.time32("s"),
+        pa.time32("ms"),
+        pa.time64("us"),
+        pa.time64("ns"),
+    ]
+    DATE_PYARROW_DTYPES = [pa.date32(), pa.date64()]
+    DATETIME_PYARROW_DTYPES = [
+        pa.timestamp(unit=unit, tz=tz)
+        for unit in ["s", "ms", "us", "ns"]
+        for tz in [None, "UTC", "US/Pacific", "US/Eastern"]
+    ]
+    TIMEDELTA_PYARROW_DTYPES = [pa.duration(unit) for unit in ["s", "ms", "us", "ns"]]
+
+    BOOL_PYARROW_DTYPES = [pa.bool_()]
+
+    # TODO: Add container like pyarrow types:
+    #  https://arrow.apache.org/docs/python/api/datatypes.html#factory-functions
+    ALL_PYARROW_DTYPES = (
+        ALL_INT_PYARROW_DTYPES
+        + FLOAT_PYARROW_DTYPES
+        + DECIMAL_PYARROW_DTYPES
+        + STRING_PYARROW_DTYPES
+        + BINARY_PYARROW_DTYPES
+        + TIME_PYARROW_DTYPES
+        + DATE_PYARROW_DTYPES
+        + DATETIME_PYARROW_DTYPES
+        + TIMEDELTA_PYARROW_DTYPES
+        + BOOL_PYARROW_DTYPES
+    )
+    ALL_REAL_PYARROW_DTYPES_STR_REPR = (
+        ALL_INT_PYARROW_DTYPES_STR_REPR + FLOAT_PYARROW_DTYPES_STR_REPR
+    )
+else:
+    FLOAT_PYARROW_DTYPES_STR_REPR = []
+    ALL_INT_PYARROW_DTYPES_STR_REPR = []
+    ALL_PYARROW_DTYPES = []
+    ALL_REAL_PYARROW_DTYPES_STR_REPR = []
+
+ALL_REAL_NULLABLE_DTYPES = (
+    FLOAT_NUMPY_DTYPES + ALL_REAL_EXTENSION_DTYPES + ALL_REAL_PYARROW_DTYPES_STR_REPR
+)
+
+arithmetic_dunder_methods = [
+    "__add__",
+    "__radd__",
+    "__sub__",
+    "__rsub__",
+    "__mul__",
+    "__rmul__",
+    "__floordiv__",
+    "__rfloordiv__",
+    "__truediv__",
+    "__rtruediv__",
+    "__pow__",
+    "__rpow__",
+    "__mod__",
+    "__rmod__",
+]
+
+comparison_dunder_methods = ["__eq__", "__ne__", "__le__", "__lt__", "__ge__", "__gt__"]
+
+
+# -----------------------------------------------------------------------------
+# Comparators
+
+
+def box_expected(expected, box_cls, transpose: bool = True):
+    """
+    Helper function to wrap the expected output of a test in a given box_class.
+
+    Parameters
+    ----------
+    expected : np.ndarray, Index, Series
+    box_cls : {Index, Series, DataFrame}
+
+    Returns
+    -------
+    subclass of box_cls
+    """
+    if box_cls is pd.array:
+        if isinstance(expected, RangeIndex):
+            # pd.array would return an IntegerArray
+            expected = NumpyExtensionArray(np.asarray(expected._values))
+        else:
+            expected = pd.array(expected, copy=False)
+    elif box_cls is Index:
+        expected = Index(expected, copy=False)
+    elif box_cls is Series:
+        expected = Series(expected)
+    elif box_cls is DataFrame:
+        expected = Series(expected).to_frame()
+        if transpose:
+            # for vector operations, we need a DataFrame to be a single-row,
+            #  not a single-column, in order to operate against non-DataFrame
+            #  vectors of the same length. But convert to two rows to avoid
+            #  single-row special cases in datetime arithmetic
+            expected = expected.T
+            expected = pd.concat([expected] * 2, ignore_index=True)
+    elif box_cls is np.ndarray or box_cls is np.array:
+        expected = np.array(expected)
+    elif box_cls is to_array:
+        expected = to_array(expected)
+    else:
+        raise NotImplementedError(box_cls)
+    return expected
+
+
+def to_array(obj):
+    """
+    Similar to pd.array, but does not cast numpy dtypes to nullable dtypes.
+    """
+    # temporary implementation until we get pd.array in place
+    dtype = getattr(obj, "dtype", None)
+
+    if dtype is None:
+        return np.asarray(obj)
+
+    return extract_array(obj, extract_numpy=True)
+
+
+class SubclassedSeries(Series):
+    _metadata = ["testattr", "name"]
+
+    @property
+    def _constructor(self):
+        # For testing, those properties return a generic callable, and not
+        # the actual class. In this case that is equivalent, but it is to
+        # ensure we don't rely on the property returning a class
+        # See https://github.com/pandas-dev/pandas/pull/46018 and
+        # https://github.com/pandas-dev/pandas/issues/32638 and linked issues
+        return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs)
+
+    @property
+    def _constructor_expanddim(self):
+        return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs)
+
+
+class SubclassedDataFrame(DataFrame):
+    _metadata = ["testattr"]
+
+    @property
+    def _constructor(self):
+        return lambda *args, **kwargs: SubclassedDataFrame(*args, **kwargs)
+
+    # error: Cannot override writeable attribute with read-only property
+    @property
+    def _constructor_sliced(self):  # type: ignore[override]
+        return lambda *args, **kwargs: SubclassedSeries(*args, **kwargs)
+
+
+def convert_rows_list_to_csv_str(rows_list: list[str]) -> str:
+    """
+    Convert list of CSV rows to single CSV-formatted string for current OS.
+
+    This method is used for creating expected value of to_csv() method.
+
+    Parameters
+    ----------
+    rows_list : List[str]
+        Each element represents the row of csv.
+
+    Returns
+    -------
+    str
+        Expected output of to_csv() in current OS.
+    """
+    sep = os.linesep
+    return sep.join(rows_list) + sep
+
+
+def external_error_raised(expected_exception: type[Exception]) -> ContextManager:
+    """
+    Helper function to mark pytest.raises that have an external error message.
+
+    Parameters
+    ----------
+    expected_exception : Exception
+        Expected error to raise.
+
+    Returns
+    -------
+    Callable
+        Regular `pytest.raises` function with `match` equal to `None`.
+    """
+    import pytest
+
+    return pytest.raises(expected_exception, match=None)
+
+
+def get_cython_table_params(ndframe, func_names_and_expected):
+    """
+    Combine frame, functions from com._cython_table
+    keys and expected result.
+
+    Parameters
+    ----------
+    ndframe : DataFrame or Series
+    func_names_and_expected : Sequence of two items
+        The first item is a name of an NDFrame method ('sum', 'prod') etc.
+        The second item is the expected return value.
+
+    Returns
+    -------
+    list
+        List of three items (DataFrame, function, expected result)
+    """
+    results = []
+    for func_name, expected in func_names_and_expected:
+        results.append((ndframe, func_name, expected))
+    return results
+
+
+def get_op_from_name(op_name: str) -> Callable:
+    """
+    The operator function for a given op name.
+
+    Parameters
+    ----------
+    op_name : str
+        The op name, in form of "add" or "__add__".
+
+    Returns
+    -------
+    function
+        A function performing the operation.
+    """
+    short_opname = op_name.strip("_")
+    try:
+        op = getattr(operator, short_opname)
+    except AttributeError:
+        # Assume it is the reverse operator
+        rop = getattr(operator, short_opname[1:])
+        op = lambda x, y: rop(y, x)
+
+    return op
+
+
+# -----------------------------------------------------------------------------
+# Indexing test helpers
+
+
+def getitem(x):
+    return x
+
+
+def setitem(x):
+    return x
+
+
+def loc(x):
+    return x.loc
+
+
+def iloc(x):
+    return x.iloc
+
+
+def at(x):
+    return x.at
+
+
+def iat(x):
+    return x.iat
+
+
+# -----------------------------------------------------------------------------
+
+_UNITS = ["s", "ms", "us", "ns"]
+
+
+def get_finest_unit(left: str, right: str) -> str:
+    """
+    Find the higher of two datetime64 units.
+    """
+    if _UNITS.index(left) >= _UNITS.index(right):
+        return left
+    return right
+
+
+def shares_memory(left, right) -> bool:
+    """
+    Pandas-compat for np.shares_memory.
+    """
+    if isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
+        return np.shares_memory(left, right)
+    elif isinstance(left, np.ndarray):
+        # Call with reversed args to get to unpacking logic below.
+        return shares_memory(right, left)
+
+    if isinstance(left, RangeIndex):
+        return False
+    if isinstance(left, MultiIndex):
+        return shares_memory(left._codes, right)
+    if isinstance(left, (Index, Series)):
+        if isinstance(right, (Index, Series)):
+            return shares_memory(left._values, right._values)
+        return shares_memory(left._values, right)
+
+    if isinstance(left, NDArrayBackedExtensionArray):
+        return shares_memory(left._ndarray, right)
+    if isinstance(left, pd.core.arrays.SparseArray):
+        return shares_memory(left.sp_values, right)
+    if isinstance(left, pd.core.arrays.IntervalArray):
+        return shares_memory(left._left, right) or shares_memory(left._right, right)
+
+    if isinstance(left, ArrowExtensionArray):
+        if isinstance(right, ArrowExtensionArray):
+            # https://github.com/pandas-dev/pandas/pull/43930#discussion_r736862669
+            left_pa_data = left._pa_array
+            right_pa_data = right._pa_array
+            left_buf1 = left_pa_data.chunk(0).buffers()[1]
+            right_buf1 = right_pa_data.chunk(0).buffers()[1]
+            return left_buf1.address == right_buf1.address
+        else:
+            # if we have one one ArrowExtensionArray and one other array, assume
+            # they can only share memory if they share the same numpy buffer
+            return np.shares_memory(left, right)
+
+    if isinstance(left, BaseMaskedArray) and isinstance(right, BaseMaskedArray):
+        # By convention, we'll say these share memory if they share *either*
+        #  the _data or the _mask
+        return np.shares_memory(left._data, right._data) or np.shares_memory(
+            left._mask, right._mask
+        )
+
+    if isinstance(left, DataFrame) and len(left._mgr.blocks) == 1:
+        arr = left._mgr.blocks[0].values
+        return shares_memory(arr, right)
+
+    raise NotImplementedError(type(left), type(right))
+
+
+def run_multithreaded(closure, max_workers, arguments=None, pass_barrier=False):
+    with ThreadPoolExecutor(max_workers=max_workers) as tpe:
+        if arguments is None:
+            arguments = []
+        else:
+            arguments = list(arguments)
+
+        if pass_barrier:
+            barrier = threading.Barrier(max_workers)
+            arguments.append(barrier)
+
+        try:
+            futures = []
+            for _ in range(max_workers):
+                futures.append(tpe.submit(closure, *arguments))  # noqa: PERF401
+        except RuntimeError as e:
+            import pytest
+
+            pytest.skip(
+                f"Spawning {max_workers} threads failed with "
+                f"error {e!r} (likely due to resource limits on the "
+                "system running the tests)"
+            )
+        finally:
+            if len(futures) < max_workers and pass_barrier:
+                barrier.abort()
+        for f in futures:
+            f.result()
+
+
+__all__ = [
+    "ALL_INT_EA_DTYPES",
+    "ALL_INT_NUMPY_DTYPES",
+    "ALL_NUMPY_DTYPES",
+    "ALL_REAL_NUMPY_DTYPES",
+    "BOOL_DTYPES",
+    "BYTES_DTYPES",
+    "COMPLEX_DTYPES",
+    "DATETIME64_DTYPES",
+    "ENDIAN",
+    "FLOAT_EA_DTYPES",
+    "FLOAT_NUMPY_DTYPES",
+    "NARROW_NP_DTYPES",
+    "NP_NAT_OBJECTS",
+    "NULL_OBJECTS",
+    "OBJECT_DTYPES",
+    "SIGNED_INT_EA_DTYPES",
+    "SIGNED_INT_NUMPY_DTYPES",
+    "STRING_DTYPES",
+    "TIMEDELTA64_DTYPES",
+    "UNSIGNED_INT_EA_DTYPES",
+    "UNSIGNED_INT_NUMPY_DTYPES",
+    "SubclassedDataFrame",
+    "SubclassedSeries",
+    "assert_almost_equal",
+    "assert_attr_equal",
+    "assert_categorical_equal",
+    "assert_class_equal",
+    "assert_contains_all",
+    "assert_copy",
+    "assert_datetime_array_equal",
+    "assert_dict_equal",
+    "assert_equal",
+    "assert_extension_array_equal",
+    "assert_frame_equal",
+    "assert_index_equal",
+    "assert_indexing_slices_equivalent",
+    "assert_interval_array_equal",
+    "assert_is_sorted",
+    "assert_metadata_equivalent",
+    "assert_numpy_array_equal",
+    "assert_period_array_equal",
+    "assert_produces_warning",
+    "assert_series_equal",
+    "assert_sp_array_equal",
+    "assert_timedelta_array_equal",
+    "at",
+    "box_expected",
+    "can_set_locale",
+    "convert_rows_list_to_csv_str",
+    "decompress_file",
+    "external_error_raised",
+    "get_cython_table_params",
+    "get_dtype",
+    "get_finest_unit",
+    "get_locales",
+    "get_obj",
+    "get_op_from_name",
+    "getitem",
+    "iat",
+    "iloc",
+    "loc",
+    "maybe_produces_warning",
+    "raise_assert_detail",
+    "raises_chained_assignment_error",
+    "round_trip_pathlib",
+    "round_trip_pickle",
+    "run_multithreaded",
+    "set_locale",
+    "set_timezone",
+    "setitem",
+    "shares_memory",
+    "to_array",
+    "with_csv_dialect",
+    "write_to_compressed",
+]
diff --git a/pandas/_testing/_hypothesis.py b/pandas/_testing/_hypothesis.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbad21d8ab8d11b1590d7904090d0b528d24c744
--- /dev/null
+++ b/pandas/_testing/_hypothesis.py
@@ -0,0 +1,89 @@
+"""
+Hypothesis data generator helpers.
+"""
+
+from datetime import datetime
+
+from hypothesis import strategies as st
+from hypothesis.extra.dateutil import timezones as dateutil_timezones
+
+from pandas.compat import is_platform_windows
+
+import pandas as pd
+
+from pandas.tseries.offsets import (
+    BMonthBegin,
+    BMonthEnd,
+    BQuarterBegin,
+    BQuarterEnd,
+    BYearBegin,
+    BYearEnd,
+    MonthBegin,
+    MonthEnd,
+    QuarterBegin,
+    QuarterEnd,
+    YearBegin,
+    YearEnd,
+)
+
+OPTIONAL_INTS = st.lists(st.one_of(st.integers(), st.none()), max_size=10, min_size=3)
+
+OPTIONAL_FLOATS = st.lists(st.one_of(st.floats(), st.none()), max_size=10, min_size=3)
+
+OPTIONAL_TEXT = st.lists(st.one_of(st.none(), st.text()), max_size=10, min_size=3)
+
+OPTIONAL_DICTS = st.lists(
+    st.one_of(st.none(), st.dictionaries(st.text(), st.integers())),
+    max_size=10,
+    min_size=3,
+)
+
+OPTIONAL_LISTS = st.lists(
+    st.one_of(st.none(), st.lists(st.text(), max_size=10, min_size=3)),
+    max_size=10,
+    min_size=3,
+)
+
+OPTIONAL_ONE_OF_ALL = st.one_of(
+    OPTIONAL_DICTS, OPTIONAL_FLOATS, OPTIONAL_INTS, OPTIONAL_LISTS, OPTIONAL_TEXT
+)
+
+if is_platform_windows():
+    DATETIME_NO_TZ = st.datetimes(min_value=datetime(1900, 1, 1))
+else:
+    DATETIME_NO_TZ = st.datetimes()
+
+DATETIME_JAN_1_1900_OPTIONAL_TZ = st.datetimes(
+    min_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),  # pyright: ignore[reportArgumentType]
+    max_value=pd.Timestamp(1900, 1, 1).to_pydatetime(),  # pyright: ignore[reportArgumentType]
+    timezones=st.one_of(st.none(), dateutil_timezones(), st.timezones()),
+)
+
+DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ = st.datetimes(
+    min_value=pd.Timestamp.min.to_pydatetime(warn=False),
+    max_value=pd.Timestamp.max.to_pydatetime(warn=False),
+)
+
+INT_NEG_999_TO_POS_999 = st.integers(-999, 999)
+
+# The strategy for each type is registered in conftest.py, as they don't carry
+# enough runtime information (e.g. type hints) to infer how to build them.
+YQM_OFFSET = st.one_of(
+    *map(
+        st.from_type,
+        [
+            MonthBegin,
+            MonthEnd,
+            BMonthBegin,
+            BMonthEnd,
+            QuarterBegin,
+            QuarterEnd,
+            BQuarterBegin,
+            BQuarterEnd,
+            YearBegin,
+            YearEnd,
+            BYearBegin,
+            BYearEnd,
+        ],
+    )
+)
diff --git a/pandas/_testing/_io.py b/pandas/_testing/_io.py
new file mode 100644
index 0000000000000000000000000000000000000000..78ed56bd59077abfa403286ee973300bb6826c66
--- /dev/null
+++ b/pandas/_testing/_io.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+import gzip
+import io
+import tarfile
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+import zipfile
+
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from pathlib import Path
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+# ------------------------------------------------------------------
+# File-IO
+
+
+def round_trip_pickle(obj: Any, tmp_path: Path) -> DataFrame | Series:
+    """
+    Pickle an object and then read it again.
+
+    Parameters
+    ----------
+    obj : any object
+        The object to pickle and then re-read.
+    path : str, path object or file-like object, default None
+        The path where the pickled object is written and then read.
+
+    Returns
+    -------
+    pandas object
+        The original object that was pickled and then re-read.
+    """
+    pd.to_pickle(obj, tmp_path)
+    return pd.read_pickle(tmp_path)
+
+
+def round_trip_pathlib(writer, reader, tmp_path: Path):
+    """
+    Write an object to file specified by a pathlib.Path and read it back
+
+    Parameters
+    ----------
+    writer : callable bound to pandas object
+        IO writing function (e.g. DataFrame.to_csv )
+    reader : callable
+        IO reading function (e.g. pd.read_csv )
+    path : str, default None
+        The path where the object is written and then read.
+
+    Returns
+    -------
+    pandas object
+        The original object that was serialized and then re-read.
+    """
+    writer(tmp_path)
+    obj = reader(tmp_path)
+    return obj
+
+
+def write_to_compressed(compression, path: str, data, dest: str = "test") -> None:
+    """
+    Write data to a compressed file.
+
+    Parameters
+    ----------
+    compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd'}
+        The compression type to use.
+    path : str
+        The file path to write the data.
+    data : str
+        The data to write.
+    dest : str, default "test"
+        The destination file (for ZIP only)
+
+    Raises
+    ------
+    ValueError : An invalid compression value was passed in.
+    """
+    args: tuple[Any, ...] = (data,)
+    mode = "wb"
+    method = "write"
+    compress_method: Callable
+
+    if compression == "zip":
+        compress_method = zipfile.ZipFile
+        mode = "w"
+        args = (dest, data)
+        method = "writestr"
+    elif compression == "tar":
+        compress_method = tarfile.TarFile
+        mode = "w"
+        file = tarfile.TarInfo(name=dest)
+        bytes = io.BytesIO(data)
+        file.size = len(data)
+        args = (file, bytes)
+        method = "addfile"
+    elif compression == "gzip":
+        compress_method = gzip.GzipFile
+    elif compression == "bz2":
+        import bz2
+
+        compress_method = bz2.BZ2File
+    elif compression == "zstd":
+        compress_method = import_optional_dependency("zstandard").open
+    elif compression == "xz":
+        import lzma
+
+        compress_method = lzma.LZMAFile
+    else:
+        raise ValueError(f"Unrecognized compression type: {compression}")
+
+    # error: No overload variant of "ZipFile" matches argument types "str", "str"
+    # error: No overload variant of "BZ2File" matches argument types "str", "str"
+    # error: Argument "mode" to "TarFile" has incompatible type "str";
+    #  expected "Literal['r', 'a', 'w', 'x']
+    with compress_method(path, mode=mode) as f:  # type: ignore[call-overload, arg-type]
+        getattr(f, method)(*args)
diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2d1f5c0c273e74bba14b9c55d44bb8f89e7edc3
--- /dev/null
+++ b/pandas/_testing/_warnings.py
@@ -0,0 +1,266 @@
+from __future__ import annotations
+
+from contextlib import (
+    AbstractContextManager,
+    contextmanager,
+    nullcontext,
+)
+import inspect
+import re
+import sys
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    Union,
+    cast,
+)
+import warnings
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Generator,
+        Sequence,
+    )
+
+
+@contextmanager
+def assert_produces_warning(
+    expected_warning: type[Warning] | bool | tuple[type[Warning], ...] | None = Warning,
+    filter_level: Literal[
+        "error", "ignore", "always", "default", "module", "once"
+    ] = "always",
+    check_stacklevel: bool = True,
+    raise_on_extra_warnings: bool = True,
+    match: str | tuple[str | None, ...] | None = None,
+    must_find_all_warnings: bool = True,
+) -> Generator[list[warnings.WarningMessage]]:
+    """
+    Context manager for running code expected to either raise a specific warning,
+    multiple specific warnings, or not raise any warnings. Verifies that the code
+    raises the expected warning(s), and that it does not raise any other unexpected
+    warnings. It is basically a wrapper around ``warnings.catch_warnings``.
+
+    Parameters
+    ----------
+    expected_warning : {Warning, False, tuple[Warning, ...], None}, default Warning
+        The type of Exception raised. ``exception.Warning`` is the base
+        class for all warnings. To raise multiple types of exceptions,
+        pass them as a tuple. To check that no warning is returned,
+        specify ``False`` or ``None``.
+    filter_level : str or None, default "always"
+        Specifies whether warnings are ignored, displayed, or turned
+        into errors.
+        Valid values are:
+
+        * "error" - turns matching warnings into exceptions
+        * "ignore" - discard the warning
+        * "always" - always emit a warning
+        * "default" - print the warning the first time it is generated
+          from each location
+        * "module" - print the warning the first time it is generated
+          from each module
+        * "once" - print the warning the first time it is generated
+
+    check_stacklevel : bool, default True
+        If True, displays the line that called the function containing
+        the warning to show were the function is called. Otherwise, the
+        line that implements the function is displayed.
+    raise_on_extra_warnings : bool, default True
+        Whether extra warnings not of the type `expected_warning` should
+        cause the test to fail.
+    match : {str, tuple[str, ...]}, optional
+        Match warning message. If it's a tuple, it has to be the size of
+        `expected_warning`. If additionally `must_find_all_warnings` is
+        True, each expected warning's message gets matched with a respective
+        match. Otherwise, multiple values get treated as an alternative.
+    must_find_all_warnings : bool, default True
+        If True and `expected_warning` is a tuple, each expected warning
+        type must get encountered. Otherwise, even one expected warning
+        results in success.
+
+    Examples
+    --------
+    >>> import warnings
+    >>> with assert_produces_warning():
+    ...     warnings.warn(UserWarning())
+    >>> with assert_produces_warning(False):
+    ...     warnings.warn(RuntimeWarning())
+    Traceback (most recent call last):
+        ...
+    AssertionError: Caused unexpected warning(s): ['RuntimeWarning'].
+    >>> with assert_produces_warning(UserWarning):
+    ...     warnings.warn(RuntimeWarning())
+    Traceback (most recent call last):
+        ...
+    AssertionError: Did not see expected warning of class 'UserWarning'.
+
+    ..warn:: This is *not* thread-safe.
+    """
+    __tracebackhide__ = True
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter(filter_level)
+        try:
+            yield w
+        finally:
+            if expected_warning:
+                if isinstance(expected_warning, tuple) and must_find_all_warnings:
+                    match = (
+                        match
+                        if isinstance(match, tuple)
+                        else (match,) * len(expected_warning)
+                    )
+                    for warning_type, warning_match in zip(
+                        expected_warning, match, strict=True
+                    ):
+                        _assert_caught_expected_warnings(
+                            caught_warnings=w,
+                            expected_warning=warning_type,
+                            match=warning_match,
+                            check_stacklevel=check_stacklevel,
+                        )
+                else:
+                    expected_warning = cast(
+                        Union[type[Warning], tuple[type[Warning], ...]],
+                        expected_warning,
+                    )
+                    match = (
+                        "|".join(m for m in match if m)
+                        if isinstance(match, tuple)
+                        else match
+                    )
+                    _assert_caught_expected_warnings(
+                        caught_warnings=w,
+                        expected_warning=expected_warning,
+                        match=match,
+                        check_stacklevel=check_stacklevel,
+                    )
+            if raise_on_extra_warnings:
+                _assert_caught_no_extra_warnings(
+                    caught_warnings=w,
+                    expected_warning=expected_warning,
+                )
+
+
+def maybe_produces_warning(
+    warning: type[Warning], condition: bool, **kwargs
+) -> AbstractContextManager:
+    """
+    Return a context manager that possibly checks a warning based on the condition
+    """
+    if condition:
+        return assert_produces_warning(warning, **kwargs)
+    else:
+        return nullcontext()
+
+
+def _assert_caught_expected_warnings(
+    *,
+    caught_warnings: Sequence[warnings.WarningMessage],
+    expected_warning: type[Warning] | tuple[type[Warning], ...],
+    match: str | None,
+    check_stacklevel: bool,
+) -> None:
+    """Assert that there was the expected warning among the caught warnings."""
+    saw_warning = False
+    matched_message = False
+    unmatched_messages = []
+    warning_name = (
+        tuple(x.__name__ for x in expected_warning)
+        if isinstance(expected_warning, tuple)
+        else expected_warning.__name__
+    )
+
+    for actual_warning in caught_warnings:
+        if issubclass(actual_warning.category, expected_warning):
+            saw_warning = True
+
+            if check_stacklevel:
+                _assert_raised_with_correct_stacklevel(actual_warning)
+
+            if match is not None:
+                if re.search(match, str(actual_warning.message)):
+                    matched_message = True
+                else:
+                    unmatched_messages.append(actual_warning.message)
+
+    if not saw_warning:
+        raise AssertionError(f"Did not see expected warning of class {warning_name!r}")
+
+    if match and not matched_message:
+        raise AssertionError(
+            f"Did not see warning {warning_name!r} "
+            f"matching '{match}'. The emitted warning messages are "
+            f"{unmatched_messages}"
+        )
+
+
+def _assert_caught_no_extra_warnings(
+    *,
+    caught_warnings: Sequence[warnings.WarningMessage],
+    expected_warning: type[Warning] | bool | tuple[type[Warning], ...] | None,
+) -> None:
+    """Assert that no extra warnings apart from the expected ones are caught."""
+    extra_warnings = []
+
+    for actual_warning in caught_warnings:
+        if _is_unexpected_warning(actual_warning, expected_warning):
+            # GH#38630 pytest.filterwarnings does not suppress these.
+            if actual_warning.category == ResourceWarning:
+                # GH 44732: Don't make the CI flaky by filtering SSL-related
+                # ResourceWarning from dependencies
+                if "unclosed <ssl.SSLSocket" in str(actual_warning.message):
+                    continue
+                # GH 44844: Matplotlib leaves font files open during the entire process
+                # upon import. Don't make CI flaky if ResourceWarning raised
+                # due to these open files.
+                if any("matplotlib" in mod for mod in sys.modules):
+                    continue
+            if actual_warning.category == EncodingWarning:
+                # EncodingWarnings are checked in the CI
+                # pyproject.toml errors on EncodingWarnings in pandas
+                # Ignore EncodingWarnings from other libraries
+                continue
+            extra_warnings.append(
+                (
+                    actual_warning.category.__name__,
+                    actual_warning.message,
+                    actual_warning.filename,
+                    actual_warning.lineno,
+                )
+            )
+
+    if extra_warnings:
+        raise AssertionError(f"Caused unexpected warning(s): {extra_warnings!r}")
+
+
+def _is_unexpected_warning(
+    actual_warning: warnings.WarningMessage,
+    expected_warning: type[Warning] | bool | tuple[type[Warning], ...] | None,
+) -> bool:
+    """Check if the actual warning issued is unexpected."""
+    if actual_warning and not expected_warning:
+        return True
+    expected_warning = cast(type[Warning], expected_warning)
+    return bool(not issubclass(actual_warning.category, expected_warning))
+
+
+def _assert_raised_with_correct_stacklevel(
+    actual_warning: warnings.WarningMessage,
+) -> None:
+    # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
+    frame = inspect.currentframe()
+    for _ in range(4):
+        frame = frame.f_back  # type: ignore[union-attr]
+    try:
+        caller_filename = inspect.getfile(frame)  # type: ignore[arg-type]
+    finally:
+        # See note in
+        # https://docs.python.org/3/library/inspect.html#inspect.Traceback
+        del frame
+    msg = (
+        "Warning not set with correct stacklevel. "
+        f"File where warning is raised: {actual_warning.filename} != "
+        f"{caller_filename}. Warning message: {actual_warning.message}"
+    )
+    assert actual_warning.filename == caller_filename, msg
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
new file mode 100644
index 0000000000000000000000000000000000000000..3732fd9d2065561d4390fd4a167a31abab7a5bce
--- /dev/null
+++ b/pandas/_testing/asserters.py
@@ -0,0 +1,1503 @@
+from __future__ import annotations
+
+import operator
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    NoReturn,
+    cast,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.missing import is_matching_na
+from pandas._libs.sparse import SparseIndex
+import pandas._libs.testing as _testing
+from pandas._libs.tslibs.np_datetime import compare_mismatched_resolutions
+from pandas.errors import Pandas4Warning
+from pandas.util._decorators import (
+    deprecate_kwarg,
+    set_module,
+)
+
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_float_dtype,
+    is_integer_dtype,
+    is_number,
+    is_numeric_dtype,
+    needs_i8_conversion,
+)
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+    ExtensionDtype,
+    NumpyEADtype,
+)
+from pandas.core.dtypes.missing import array_equivalent
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    IntervalDtype,
+    IntervalIndex,
+    MultiIndex,
+    PeriodIndex,
+    RangeIndex,
+    Series,
+    TimedeltaIndex,
+)
+from pandas.core.arrays import (
+    DatetimeArray,
+    ExtensionArray,
+    IntervalArray,
+    PeriodArray,
+    TimedeltaArray,
+)
+from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+from pandas.core.arrays.string_ import StringDtype
+from pandas.core.indexes.api import safe_sort_index
+
+from pandas.io.formats.printing import pprint_thing
+
+if TYPE_CHECKING:
+    from pandas._typing import DtypeObj
+
+
+def assert_almost_equal(
+    left,
+    right,
+    check_dtype: bool | Literal["equiv"] = "equiv",
+    rtol: float = 1.0e-5,
+    atol: float = 1.0e-8,
+    **kwargs,
+) -> None:
+    """
+    Check that the left and right objects are approximately equal.
+
+    By approximately equal, we refer to objects that are numbers or that
+    contain numbers which may be equivalent to specific levels of precision.
+
+    Parameters
+    ----------
+    left : object
+    right : object
+    check_dtype : bool or {'equiv'}, default 'equiv'
+        Check dtype if both a and b are the same type. If 'equiv' is passed in,
+        then `RangeIndex` and `Index` with int64 dtype are also considered
+        equivalent when doing type checking.
+    rtol : float, default 1e-5
+        Relative tolerance.
+    atol : float, default 1e-8
+        Absolute tolerance.
+    """
+    if isinstance(left, Index):
+        assert_index_equal(
+            left,
+            right,
+            check_exact=False,
+            exact=check_dtype,
+            rtol=rtol,
+            atol=atol,
+            **kwargs,
+        )
+
+    elif isinstance(left, Series):
+        assert_series_equal(
+            left,
+            right,
+            check_exact=False,
+            check_dtype=check_dtype,
+            rtol=rtol,
+            atol=atol,
+            **kwargs,
+        )
+
+    elif isinstance(left, DataFrame):
+        assert_frame_equal(
+            left,
+            right,
+            check_exact=False,
+            check_dtype=check_dtype,
+            rtol=rtol,
+            atol=atol,
+            **kwargs,
+        )
+
+    else:
+        # Other sequences.
+        if check_dtype:
+            if is_number(left) and is_number(right):
+                # Do not compare numeric classes, like np.float64 and float.
+                pass
+            elif is_bool(left) and is_bool(right):
+                # Do not compare bool classes, like np.bool_ and bool.
+                pass
+            else:
+                if isinstance(left, np.ndarray) or isinstance(right, np.ndarray):
+                    obj = "numpy array"
+                else:
+                    obj = "Input"
+                assert_class_equal(left, right, obj=obj)
+
+        # if we have "equiv", this becomes True
+        _testing.assert_almost_equal(
+            left, right, check_dtype=bool(check_dtype), rtol=rtol, atol=atol, **kwargs
+        )
+
+
+def _check_isinstance(left, right, cls) -> None:
+    """
+    Helper method for our assert_* methods that ensures that
+    the two objects being compared have the right type before
+    proceeding with the comparison.
+
+    Parameters
+    ----------
+    left : The first object being compared.
+    right : The second object being compared.
+    cls : The class type to check against.
+
+    Raises
+    ------
+    AssertionError : Either `left` or `right` is not an instance of `cls`.
+    """
+    cls_name = cls.__name__
+
+    if not isinstance(left, cls):
+        raise AssertionError(
+            f"{cls_name} Expected type {cls}, found {type(left)} instead"
+        )
+    if not isinstance(right, cls):
+        raise AssertionError(
+            f"{cls_name} Expected type {cls}, found {type(right)} instead"
+        )
+
+
+def assert_dict_equal(left, right, compare_keys: bool = True) -> None:
+    _check_isinstance(left, right, dict)
+    _testing.assert_dict_equal(left, right, compare_keys=compare_keys)
+
+
+@set_module("pandas.testing")
+def assert_index_equal(
+    left: Index,
+    right: Index,
+    exact: bool | str = "equiv",
+    check_names: bool = True,
+    check_exact: bool = True,
+    check_categorical: bool = True,
+    check_order: bool = True,
+    rtol: float = 1.0e-5,
+    atol: float = 1.0e-8,
+    obj: str | None = None,
+) -> None:
+    """
+    Check that left and right Index are equal.
+
+    Parameters
+    ----------
+    left : Index
+        The first index to compare.
+    right : Index
+        The second index to compare.
+    exact : bool or {'equiv'}, default 'equiv'
+        Whether to check the Index class, dtype and inferred_type
+        are identical. If 'equiv', then RangeIndex can be substituted for
+        Index with an int64 dtype as well.
+    check_names : bool, default True
+        Whether to check the names attribute.
+    check_exact : bool, default True
+        Whether to compare number exactly.
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
+    check_order : bool, default True
+        Whether to compare the order of index entries as well as their values.
+        If True, both indexes must contain the same elements, in the same order.
+        If False, both indexes must contain the same elements, but in any order.
+    rtol : float, default 1e-5
+        Relative tolerance. Only used when check_exact is False.
+    atol : float, default 1e-8
+        Absolute tolerance. Only used when check_exact is False.
+    obj : str, default 'Index' or 'MultiIndex'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+
+    See Also
+    --------
+    testing.assert_series_equal : Check that two Series are equal.
+    testing.assert_frame_equal : Check that two DataFrames are equal.
+
+    Examples
+    --------
+    >>> from pandas import testing as tm
+    >>> a = pd.Index([1, 2, 3])
+    >>> b = pd.Index([1, 2, 3])
+    >>> tm.assert_index_equal(a, b)
+    """
+    __tracebackhide__ = True
+
+    if obj is None:
+        obj = "MultiIndex" if isinstance(left, MultiIndex) else "Index"
+
+    def _check_types(left, right, obj: str = "Index") -> None:
+        if not exact:
+            return
+
+        assert_class_equal(left, right, exact=exact, obj=obj)
+        assert_attr_equal("inferred_type", left, right, obj=obj)
+
+        # Skip exact dtype checking when `check_categorical` is False
+        if isinstance(left.dtype, CategoricalDtype) and isinstance(
+            right.dtype, CategoricalDtype
+        ):
+            if check_categorical:
+                assert_attr_equal("dtype", left, right, obj=obj)
+                assert_index_equal(left.categories, right.categories, exact=exact)
+            return
+
+        assert_attr_equal("dtype", left, right, obj=obj)
+
+    # instance validation
+    _check_isinstance(left, right, Index)
+
+    # class / dtype comparison
+    _check_types(left, right, obj=obj)
+
+    # level comparison
+    if left.nlevels != right.nlevels:
+        msg1 = f"{obj} levels are different"
+        msg2 = f"{left.nlevels}, {left}"
+        msg3 = f"{right.nlevels}, {right}"
+        raise_assert_detail(obj, msg1, msg2, msg3)
+
+    # length comparison
+    if len(left) != len(right):
+        msg1 = f"{obj} length are different"
+        msg2 = f"{len(left)}, {left}"
+        msg3 = f"{len(right)}, {right}"
+        raise_assert_detail(obj, msg1, msg2, msg3)
+
+    # If order doesn't matter then sort the index entries
+    if not check_order:
+        left = safe_sort_index(left)
+        right = safe_sort_index(right)
+
+    # MultiIndex special comparison for little-friendly error messages
+    if isinstance(left, MultiIndex):
+        right = cast(MultiIndex, right)
+
+        for level in range(left.nlevels):
+            lobj = f"{obj} level [{level}]"
+            try:
+                # try comparison on levels/codes to avoid densifying MultiIndex
+                assert_index_equal(
+                    left.levels[level],
+                    right.levels[level],
+                    exact=exact,
+                    check_names=check_names,
+                    check_exact=check_exact,
+                    check_categorical=check_categorical,
+                    rtol=rtol,
+                    atol=atol,
+                    obj=lobj,
+                )
+                assert_numpy_array_equal(left.codes[level], right.codes[level])
+            except AssertionError:
+                llevel = left.get_level_values(level)
+                rlevel = right.get_level_values(level)
+
+                assert_index_equal(
+                    llevel,
+                    rlevel,
+                    exact=exact,
+                    check_names=check_names,
+                    check_exact=check_exact,
+                    check_categorical=check_categorical,
+                    rtol=rtol,
+                    atol=atol,
+                    obj=lobj,
+                )
+            # get_level_values may change dtype
+            _check_types(left.levels[level], right.levels[level], obj=lobj)
+
+    # skip exact index checking when `check_categorical` is False
+    elif check_exact and check_categorical:
+        if not left.equals(right):
+            # _values compare can raise TypeError (non-comparable
+            # categoricals (GH#61935)
+            try:
+                mismatch = left._values != right._values
+            except TypeError:
+                raise_assert_detail(
+                    obj,
+                    "types are not comparable (non-matching categorical categories)",
+                    left,
+                    right,
+                )
+
+            if not isinstance(mismatch, np.ndarray):
+                mismatch = cast("ExtensionArray", mismatch).fillna(True)
+
+            diff = np.sum(mismatch.astype(int)) * 100.0 / len(left)
+            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
+            raise_assert_detail(obj, msg, left, right)
+    else:
+        # if we have "equiv", this becomes True
+        exact_bool = bool(exact)
+        _testing.assert_almost_equal(
+            left.values,
+            right.values,
+            rtol=rtol,
+            atol=atol,
+            check_dtype=exact_bool,
+            obj=obj,
+            lobj=left,
+            robj=right,
+        )
+
+    # metadata comparison
+    if check_names:
+        assert_attr_equal("names", left, right, obj=obj)
+    if isinstance(left, PeriodIndex) or isinstance(right, PeriodIndex):
+        assert_attr_equal("dtype", left, right, obj=obj)
+    if isinstance(left, IntervalIndex) or isinstance(right, IntervalIndex):
+        assert_interval_array_equal(left._values, right._values)
+
+    if check_categorical:
+        if isinstance(left.dtype, CategoricalDtype) or isinstance(
+            right.dtype, CategoricalDtype
+        ):
+            assert_categorical_equal(left._values, right._values, obj=f"{obj} category")
+
+
+def assert_class_equal(
+    left, right, exact: bool | str = True, obj: str = "Input"
+) -> None:
+    """
+    Checks classes are equal.
+    """
+    __tracebackhide__ = True
+
+    def repr_class(x):
+        if isinstance(x, Index):
+            # return Index as it is to include values in the error message
+            return x
+
+        return type(x).__name__
+
+    def is_class_equiv(idx: Index) -> bool:
+        """Classes that are a RangeIndex (sub-)instance or exactly an `Index` .
+
+        This only checks class equivalence. There is a separate check that the
+        dtype is int64.
+        """
+        return type(idx) is Index or isinstance(idx, RangeIndex)
+
+    if type(left) == type(right):
+        return
+
+    if exact == "equiv":
+        if is_class_equiv(left) and is_class_equiv(right):
+            return
+
+    msg = f"{obj} classes are different"
+    raise_assert_detail(obj, msg, repr_class(left), repr_class(right))
+
+
+def assert_attr_equal(attr: str, left, right, obj: str = "Attributes") -> None:
+    """
+    Check attributes are equal. Both objects must have attribute.
+
+    Parameters
+    ----------
+    attr : str
+        Attribute name being compared.
+    left : object
+    right : object
+    obj : str, default 'Attributes'
+        Specify object name being compared, internally used to show appropriate
+        assertion message
+    """
+    __tracebackhide__ = True
+
+    left_attr = getattr(left, attr)
+    right_attr = getattr(right, attr)
+
+    if left_attr is right_attr or is_matching_na(left_attr, right_attr):
+        # e.g. both np.nan, both NaT, both pd.NA, ...
+        return None
+
+    try:
+        result = left_attr == right_attr
+    except TypeError:
+        # datetimetz on rhs may raise TypeError
+        result = False
+    if (left_attr is pd.NA) ^ (right_attr is pd.NA):
+        result = False
+    elif not isinstance(result, bool):
+        result = result.all()
+
+    if not result:
+        msg = f'Attribute "{attr}" are different'
+        raise_assert_detail(obj, msg, left_attr, right_attr)
+    return None
+
+
+def assert_is_sorted(seq) -> None:
+    """Assert that the sequence is sorted."""
+    if isinstance(seq, (Index, Series)):
+        seq = seq.values
+    # sorting does not change precisions
+    if isinstance(seq, np.ndarray):
+        assert_numpy_array_equal(seq, np.sort(np.array(seq)))
+    else:
+        assert_extension_array_equal(seq, seq[seq.argsort()])
+
+
+def assert_categorical_equal(
+    left,
+    right,
+    check_dtype: bool = True,
+    check_category_order: bool = True,
+    obj: str = "Categorical",
+) -> None:
+    """
+    Test that Categoricals are equivalent.
+
+    Parameters
+    ----------
+    left : Categorical
+    right : Categorical
+    check_dtype : bool, default True
+        Check that integer dtype of the codes are the same.
+    check_category_order : bool, default True
+        Whether the order of the categories should be compared, which
+        implies identical integer codes.  If False, only the resulting
+        values are compared.  The ordered attribute is
+        checked regardless.
+    obj : str, default 'Categorical'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+    """
+    _check_isinstance(left, right, Categorical)
+
+    exact: bool | str
+    if isinstance(left.categories, RangeIndex) or isinstance(
+        right.categories, RangeIndex
+    ):
+        exact = "equiv"
+    else:
+        # We still want to require exact matches for Index
+        exact = True
+
+    if check_category_order:
+        assert_index_equal(
+            left.categories, right.categories, obj=f"{obj}.categories", exact=exact
+        )
+        assert_numpy_array_equal(
+            left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes"
+        )
+    else:
+        try:
+            lc = left.categories.sort_values()
+            rc = right.categories.sort_values()
+        except TypeError:
+            # e.g. '<' not supported between instances of 'int' and 'str'
+            lc, rc = left.categories, right.categories
+        assert_index_equal(lc, rc, obj=f"{obj}.categories", exact=exact)
+        assert_index_equal(
+            left.categories.take(left.codes),
+            right.categories.take(right.codes),
+            obj=f"{obj}.values",
+            exact=exact,
+        )
+
+    assert_attr_equal("ordered", left, right, obj=obj)
+
+
+def assert_interval_array_equal(
+    left, right, exact: bool | Literal["equiv"] = "equiv", obj: str = "IntervalArray"
+) -> None:
+    """
+    Test that two IntervalArrays are equivalent.
+
+    Parameters
+    ----------
+    left, right : IntervalArray
+        The IntervalArrays to compare.
+    exact : bool or {'equiv'}, default 'equiv'
+        Whether to check the Index class, dtype and inferred_type
+        are identical. If 'equiv', then RangeIndex can be substituted for
+        Index with an int64 dtype as well.
+    obj : str, default 'IntervalArray'
+        Specify object name being compared, internally used to show appropriate
+        assertion message
+    """
+    _check_isinstance(left, right, IntervalArray)
+
+    kwargs = {}
+    if left._left.dtype.kind in "mM":
+        # We have a DatetimeArray or TimedeltaArray
+        kwargs["check_freq"] = False
+
+    assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs)
+    assert_equal(left._right, right._right, obj=f"{obj}.right", **kwargs)
+
+    assert_attr_equal("closed", left, right, obj=obj)
+
+
+def assert_period_array_equal(left, right, obj: str = "PeriodArray") -> None:
+    _check_isinstance(left, right, PeriodArray)
+
+    assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
+    assert_attr_equal("dtype", left, right, obj=obj)
+
+
+def assert_datetime_array_equal(
+    left, right, obj: str = "DatetimeArray", check_freq: bool = True
+) -> None:
+    __tracebackhide__ = True
+    _check_isinstance(left, right, DatetimeArray)
+
+    assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
+    if check_freq:
+        assert_attr_equal("freq", left, right, obj=obj)
+    assert_attr_equal("tz", left, right, obj=obj)
+
+
+def assert_timedelta_array_equal(
+    left, right, obj: str = "TimedeltaArray", check_freq: bool = True
+) -> None:
+    __tracebackhide__ = True
+    _check_isinstance(left, right, TimedeltaArray)
+    assert_numpy_array_equal(left._ndarray, right._ndarray, obj=f"{obj}._ndarray")
+    if check_freq:
+        assert_attr_equal("freq", left, right, obj=obj)
+
+
+def raise_assert_detail(
+    obj, message, left, right, diff=None, first_diff=None, index_values=None
+) -> NoReturn:
+    __tracebackhide__ = True
+
+    msg = f"""{obj} are different
+
+{message}"""
+
+    if isinstance(index_values, Index):
+        index_values = np.asarray(index_values)
+
+    if isinstance(index_values, np.ndarray):
+        msg += f"\n[index]: {pprint_thing(index_values)}"
+
+    if isinstance(left, np.ndarray):
+        left = pprint_thing(left)
+    elif isinstance(left, (CategoricalDtype, StringDtype, NumpyEADtype)):
+        left = repr(left)
+
+    if isinstance(right, np.ndarray):
+        right = pprint_thing(right)
+    elif isinstance(right, (CategoricalDtype, StringDtype, NumpyEADtype)):
+        right = repr(right)
+
+    msg += f"""
+[left]:  {left}
+[right]: {right}"""
+
+    if diff is not None:
+        msg += f"\n[diff]: {diff}"
+
+    if first_diff is not None:
+        msg += f"\n{first_diff}"
+
+    raise AssertionError(msg)
+
+
+def assert_numpy_array_equal(
+    left,
+    right,
+    strict_nan: bool = False,
+    check_dtype: bool | Literal["equiv"] = True,
+    err_msg=None,
+    check_same=None,
+    obj: str = "numpy array",
+    index_values=None,
+) -> None:
+    """
+    Check that 'np.ndarray' is equivalent.
+
+    Parameters
+    ----------
+    left, right : numpy.ndarray or iterable
+        The two arrays to be compared.
+    strict_nan : bool, default False
+        If True, consider NaN and None to be different.
+    check_dtype : bool, default True
+        Check dtype if both a and b are np.ndarray.
+    err_msg : str, default None
+        If provided, used as assertion message.
+    check_same : None|'copy'|'same', default None
+        Ensure left and right refer/do not refer to the same memory area.
+    obj : str, default 'numpy array'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+    index_values : Index | numpy.ndarray, default None
+        optional index (shared by both left and right), used in output.
+    """
+    __tracebackhide__ = True
+
+    # instance validation
+    # Show a detailed error message when classes are different
+    assert_class_equal(left, right, obj=obj)
+    # both classes must be an np.ndarray
+    _check_isinstance(left, right, np.ndarray)
+
+    def _get_base(obj):
+        return obj.base if getattr(obj, "base", None) is not None else obj
+
+    left_base = _get_base(left)
+    right_base = _get_base(right)
+
+    if check_same == "same":
+        if left_base is not right_base:
+            raise AssertionError(f"{left_base!r} is not {right_base!r}")
+    elif check_same == "copy":
+        if left_base is right_base:
+            raise AssertionError(f"{left_base!r} is {right_base!r}")
+
+    def _raise(left, right, err_msg) -> NoReturn:
+        if err_msg is None:
+            if left.shape != right.shape:
+                raise_assert_detail(
+                    obj, f"{obj} shapes are different", left.shape, right.shape
+                )
+
+            diff = 0
+            for left_arr, right_arr in zip(left, right, strict=True):
+                # count up differences
+                if not array_equivalent(left_arr, right_arr, strict_nan=strict_nan):
+                    diff += 1
+
+            diff = diff * 100.0 / left.size
+            msg = f"{obj} values are different ({np.round(diff, 5)} %)"
+            raise_assert_detail(obj, msg, left, right, index_values=index_values)
+
+        raise AssertionError(err_msg)
+
+    # compare shape and values
+    if not array_equivalent(left, right, strict_nan=strict_nan):
+        _raise(left, right, err_msg)
+
+    if check_dtype:
+        if isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
+            assert_attr_equal("dtype", left, right, obj=obj)
+
+
+@set_module("pandas.testing")
+def assert_extension_array_equal(
+    left,
+    right,
+    check_dtype: bool | Literal["equiv"] = True,
+    index_values=None,
+    check_exact: bool | lib.NoDefault = lib.no_default,
+    rtol: float | lib.NoDefault = lib.no_default,
+    atol: float | lib.NoDefault = lib.no_default,
+    obj: str = "ExtensionArray",
+) -> None:
+    """
+    Check that left and right ExtensionArrays are equal.
+
+    This method compares two ``ExtensionArray`` instances for equality,
+    including checks for missing values, the dtype of the arrays, and
+    the exactness of the comparison (or tolerance when comparing floats).
+
+    Parameters
+    ----------
+    left, right : ExtensionArray
+        The two arrays to compare.
+    check_dtype : bool, default True
+        Whether to check if the ExtensionArray dtypes are identical.
+    index_values : Index | numpy.ndarray, default None
+        Optional index (shared by both left and right), used in output.
+    check_exact : bool, default False
+        Whether to compare number exactly.
+
+        .. versionchanged:: 2.2.0
+
+            Defaults to True for integer dtypes if none of
+            ``check_exact``, ``rtol`` and ``atol`` are specified.
+    rtol : float, default 1e-5
+        Relative tolerance. Only used when check_exact is False.
+    atol : float, default 1e-8
+        Absolute tolerance. Only used when check_exact is False.
+    obj : str, default 'ExtensionArray'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+
+        .. versionadded:: 2.0.0
+
+    See Also
+    --------
+    testing.assert_series_equal : Check that left and right ``Series`` are equal.
+    testing.assert_frame_equal : Check that left and right ``DataFrame`` are equal.
+    testing.assert_index_equal : Check that left and right ``Index`` are equal.
+
+    Notes
+    -----
+    Missing values are checked separately from valid values.
+    A mask of missing values is computed for each and checked to match.
+    The remaining all-valid values are cast to object dtype and checked.
+
+    Examples
+    --------
+    >>> from pandas import testing as tm
+    >>> a = pd.Series([1, 2, 3, 4])
+    >>> b, c = a.array, a.array
+    >>> tm.assert_extension_array_equal(b, c)
+    """
+    if (
+        check_exact is lib.no_default
+        and rtol is lib.no_default
+        and atol is lib.no_default
+    ):
+        check_exact = (
+            is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)
+        ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
+    elif check_exact is lib.no_default:
+        check_exact = False
+
+    rtol = rtol if rtol is not lib.no_default else 1.0e-5
+    atol = atol if atol is not lib.no_default else 1.0e-8
+
+    assert isinstance(left, ExtensionArray), "left is not an ExtensionArray"
+    assert isinstance(right, ExtensionArray), "right is not an ExtensionArray"
+    if check_dtype:
+        assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
+
+    if (
+        isinstance(left, DatetimeLikeArrayMixin)
+        and isinstance(right, DatetimeLikeArrayMixin)
+        and type(right) == type(left)
+    ):
+        # GH 52449
+        if not check_dtype and left.dtype.kind in "mM":
+            if not isinstance(left.dtype, np.dtype):
+                l_unit = cast(DatetimeTZDtype, left.dtype).unit
+            else:
+                l_unit = np.datetime_data(left.dtype)[0]
+            if not isinstance(right.dtype, np.dtype):
+                r_unit = cast(DatetimeTZDtype, right.dtype).unit
+            else:
+                r_unit = np.datetime_data(right.dtype)[0]
+            if (
+                l_unit != r_unit
+                and compare_mismatched_resolutions(
+                    left._ndarray, right._ndarray, operator.eq
+                ).all()
+            ):
+                return
+        # Avoid slow object-dtype comparisons
+        # np.asarray for case where we have an np.MaskedArray
+        assert_numpy_array_equal(
+            np.asarray(left.asi8),
+            np.asarray(right.asi8),
+            index_values=index_values,
+            obj=obj,
+        )
+        return
+
+    left_na = np.asarray(left.isna())
+    right_na = np.asarray(right.isna())
+    assert_numpy_array_equal(
+        left_na, right_na, obj=f"{obj} NA mask", index_values=index_values
+    )
+
+    # Specifically for StringArrayNumpySemantics, validate here we have a valid array
+    if (
+        isinstance(left.dtype, StringDtype)
+        and left.dtype.storage == "python"
+        and left.dtype.na_value is np.nan
+    ):
+        assert np.all(
+            [np.isnan(val) for val in left._ndarray[left_na]]  # type: ignore[attr-defined]
+        ), "wrong missing value sentinels"
+    if (
+        isinstance(right.dtype, StringDtype)
+        and right.dtype.storage == "python"
+        and right.dtype.na_value is np.nan
+    ):
+        assert np.all(
+            [np.isnan(val) for val in right._ndarray[right_na]]  # type: ignore[attr-defined]
+        ), "wrong missing value sentinels"
+
+    left_valid = left[~left_na].to_numpy(dtype=object)
+    right_valid = right[~right_na].to_numpy(dtype=object)
+    if check_exact:
+        assert_numpy_array_equal(
+            left_valid, right_valid, obj=obj, index_values=index_values
+        )
+    else:
+        _testing.assert_almost_equal(
+            left_valid,
+            right_valid,
+            check_dtype=bool(check_dtype),
+            rtol=rtol,
+            atol=atol,
+            obj=obj,
+            index_values=index_values,
+        )
+
+
+# This could be refactored to use the NDFrame.equals method
+@set_module("pandas.testing")
+@deprecate_kwarg(Pandas4Warning, "check_datetimelike_compat", new_arg_name=None)
+def assert_series_equal(
+    left,
+    right,
+    check_dtype: bool | Literal["equiv"] = True,
+    check_index_type: bool | Literal["equiv"] = "equiv",
+    check_series_type: bool = True,
+    check_names: bool = True,
+    check_exact: bool | lib.NoDefault = lib.no_default,
+    check_datetimelike_compat: bool = False,
+    check_categorical: bool = True,
+    check_category_order: bool = True,
+    check_freq: bool = True,
+    check_flags: bool = True,
+    rtol: float | lib.NoDefault = lib.no_default,
+    atol: float | lib.NoDefault = lib.no_default,
+    obj: str = "Series",
+    *,
+    check_index: bool = True,
+    check_like: bool = False,
+) -> None:
+    """
+    Check that left and right Series are equal.
+
+    Parameters
+    ----------
+    left : Series
+        First Series to compare.
+    right : Series
+        Second Series to compare.
+    check_dtype : bool, default True
+        Whether to check the Series dtype is identical.
+    check_index_type : bool or {'equiv'}, default 'equiv'
+        Whether to check the Index class, dtype and inferred_type
+        are identical.
+    check_series_type : bool, default True
+         Whether to check the Series class is identical.
+    check_names : bool, default True
+        Whether to check the Series and Index names attribute.
+    check_exact : bool, default False
+        Whether to compare number exactly. This also applies when checking
+        Index equivalence.
+
+        .. versionchanged:: 2.2.0
+
+            Defaults to True for integer dtypes if none of
+            ``check_exact``, ``rtol`` and ``atol`` are specified.
+
+        .. versionchanged:: 3.0.0
+
+            check_exact for comparing the Indexes defaults to True by
+            checking if an Index is of integer dtypes.
+
+    check_datetimelike_compat : bool, default False
+        Compare datetime-like which is comparable ignoring dtype.
+
+        .. deprecated:: 3.0
+
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
+    check_category_order : bool, default True
+        Whether to compare category order of internal Categoricals.
+    check_freq : bool, default True
+        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.
+    check_flags : bool, default True
+        Whether to check the `flags` attribute.
+    rtol : float, default 1e-5
+        Relative tolerance. Only used when check_exact is False.
+    atol : float, default 1e-8
+        Absolute tolerance. Only used when check_exact is False.
+    obj : str, default 'Series'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+    check_index : bool, default True
+        Whether to check index equivalence. If False, then compare only values.
+    check_like : bool, default False
+        If True, ignore the order of the index. Must be False if check_index is False.
+        Note: same labels must be with the same data.
+
+    See Also
+    --------
+    testing.assert_index_equal : Check that two Indexes are equal.
+    testing.assert_frame_equal : Check that two DataFrames are equal.
+
+    Examples
+    --------
+    >>> from pandas import testing as tm
+    >>> a = pd.Series([1, 2, 3, 4])
+    >>> b = pd.Series([1, 2, 3, 4])
+    >>> tm.assert_series_equal(a, b)
+    """
+    __tracebackhide__ = True
+    if (
+        check_exact is lib.no_default
+        and rtol is lib.no_default
+        and atol is lib.no_default
+    ):
+        check_exact = (
+            is_numeric_dtype(left.dtype) and not is_float_dtype(left.dtype)
+        ) or (is_numeric_dtype(right.dtype) and not is_float_dtype(right.dtype))
+        left_index_dtypes = (
+            [left.index.dtype] if left.index.nlevels == 1 else left.index.dtypes
+        )
+        right_index_dtypes = (
+            [right.index.dtype] if right.index.nlevels == 1 else right.index.dtypes
+        )
+        check_exact_index = all(
+            dtype.kind in "iu" for dtype in left_index_dtypes
+        ) or all(dtype.kind in "iu" for dtype in right_index_dtypes)
+    elif check_exact is lib.no_default:
+        check_exact = False
+        check_exact_index = False
+    else:
+        check_exact_index = check_exact
+
+    rtol = rtol if rtol is not lib.no_default else 1.0e-5
+    atol = atol if atol is not lib.no_default else 1.0e-8
+
+    if not check_index and check_like:
+        raise ValueError("check_like must be False if check_index is False")
+
+    # instance validation
+    _check_isinstance(left, right, Series)
+
+    if check_series_type:
+        assert_class_equal(left, right, obj=obj)
+
+    # length comparison
+    if len(left) != len(right):
+        msg1 = f"{len(left)}, {left.index}"
+        msg2 = f"{len(right)}, {right.index}"
+        raise_assert_detail(obj, "Series length are different", msg1, msg2)
+
+    if check_flags:
+        assert left.flags == right.flags, f"{left.flags!r} != {right.flags!r}"
+
+    if check_index:
+        # GH #38183
+        assert_index_equal(
+            left.index,
+            right.index,
+            exact=check_index_type,
+            check_names=check_names,
+            check_exact=check_exact_index,
+            check_categorical=check_categorical,
+            check_order=not check_like,
+            rtol=rtol,
+            atol=atol,
+            obj=f"{obj}.index",
+        )
+
+    if check_like:
+        left = left.reindex_like(right)
+
+    if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
+        lidx = left.index
+        ridx = right.index
+        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)
+
+    if check_dtype:
+        # We want to skip exact dtype checking when `check_categorical`
+        # is False. We'll still raise if only one is a `Categorical`,
+        # regardless of `check_categorical`
+        if (
+            isinstance(left.dtype, CategoricalDtype)
+            and isinstance(right.dtype, CategoricalDtype)
+            and not check_categorical
+        ):
+            pass
+        else:
+            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")
+    if check_exact:
+        left_values = left._values
+        right_values = right._values
+        # Only check exact if dtype is numeric
+        if isinstance(left_values, ExtensionArray) and isinstance(
+            right_values, ExtensionArray
+        ):
+            assert_extension_array_equal(
+                left_values,
+                right_values,
+                check_dtype=check_dtype,
+                index_values=left.index,
+                obj=str(obj),
+            )
+        else:
+            # convert both to NumPy if not, check_dtype would raise earlier
+            lv, rv = left_values, right_values
+            if isinstance(left_values, ExtensionArray):
+                lv = left_values.to_numpy()
+            if isinstance(right_values, ExtensionArray):
+                rv = right_values.to_numpy()
+            assert_numpy_array_equal(
+                lv,
+                rv,
+                check_dtype=check_dtype,
+                obj=str(obj),
+                index_values=left.index,
+            )
+    elif check_datetimelike_compat and (
+        needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
+    ):
+        # we want to check only if we have compat dtypes
+        # e.g. integer and M|m are NOT compat, but we can simply check
+        # the values in that case
+
+        # datetimelike may have different objects (e.g. datetime.datetime
+        # vs Timestamp) but will compare equal
+        if not Index(left._values).equals(Index(right._values)):
+            msg = (
+                f"[datetimelike_compat=True] {left._values} "
+                f"is not equal to {right._values}."
+            )
+            raise AssertionError(msg)
+    elif isinstance(left.dtype, IntervalDtype) and isinstance(
+        right.dtype, IntervalDtype
+    ):
+        assert_interval_array_equal(left.array, right.array)
+    elif isinstance(left.dtype, CategoricalDtype) or isinstance(
+        right.dtype, CategoricalDtype
+    ):
+        _testing.assert_almost_equal(
+            left._values,
+            right._values,
+            rtol=rtol,
+            atol=atol,
+            check_dtype=bool(check_dtype),
+            obj=str(obj),
+            index_values=left.index,
+        )
+    elif isinstance(left.dtype, ExtensionDtype) and isinstance(
+        right.dtype, ExtensionDtype
+    ):
+        assert_extension_array_equal(
+            left._values,
+            right._values,
+            rtol=rtol,
+            atol=atol,
+            check_dtype=check_dtype,
+            index_values=left.index,
+            obj=str(obj),
+        )
+    elif is_extension_array_dtype_and_needs_i8_conversion(
+        left.dtype, right.dtype
+    ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype):
+        assert_extension_array_equal(
+            left._values,
+            right._values,
+            check_dtype=check_dtype,
+            index_values=left.index,
+            obj=str(obj),
+        )
+    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
+        # DatetimeArray or TimedeltaArray
+        assert_extension_array_equal(
+            left._values,
+            right._values,
+            check_dtype=check_dtype,
+            index_values=left.index,
+            obj=str(obj),
+        )
+    else:
+        _testing.assert_almost_equal(
+            left._values,
+            right._values,
+            rtol=rtol,
+            atol=atol,
+            check_dtype=bool(check_dtype),
+            obj=str(obj),
+            index_values=left.index,
+        )
+
+    # metadata comparison
+    if check_names:
+        assert_attr_equal("name", left, right, obj=obj)
+
+    if check_categorical:
+        if isinstance(left.dtype, CategoricalDtype) or isinstance(
+            right.dtype, CategoricalDtype
+        ):
+            assert_categorical_equal(
+                left._values,
+                right._values,
+                obj=f"{obj} category",
+                check_category_order=check_category_order,
+            )
+
+
+# This could be refactored to use the NDFrame.equals method
+@set_module("pandas.testing")
+@deprecate_kwarg(Pandas4Warning, "check_datetimelike_compat", new_arg_name=None)
+def assert_frame_equal(
+    left,
+    right,
+    check_dtype: bool | Literal["equiv"] = True,
+    check_index_type: bool | Literal["equiv"] = "equiv",
+    check_column_type: bool | Literal["equiv"] = "equiv",
+    check_frame_type: bool = True,
+    check_names: bool = True,
+    by_blocks: bool = False,
+    check_exact: bool | lib.NoDefault = lib.no_default,
+    check_datetimelike_compat: bool = False,
+    check_categorical: bool = True,
+    check_like: bool = False,
+    check_freq: bool = True,
+    check_flags: bool = True,
+    rtol: float | lib.NoDefault = lib.no_default,
+    atol: float | lib.NoDefault = lib.no_default,
+    obj: str = "DataFrame",
+) -> None:
+    """
+    Check that left and right DataFrame are equal.
+
+    This function is intended to compare two DataFrames and output any
+    differences. It is mostly intended for use in unit tests.
+    Additional parameters allow varying the strictness of the
+    equality checks performed.
+
+    Parameters
+    ----------
+    left : DataFrame
+        First DataFrame to compare.
+    right : DataFrame
+        Second DataFrame to compare.
+    check_dtype : bool, default True
+        Whether to check the DataFrame dtype is identical.
+    check_index_type : bool or {'equiv'}, default 'equiv'
+        Whether to check the Index class, dtype and inferred_type
+        are identical.
+    check_column_type : bool or {'equiv'}, default 'equiv'
+        Whether to check the columns class, dtype and inferred_type
+        are identical. Is passed as the ``exact`` argument of
+        :func:`assert_index_equal`.
+    check_frame_type : bool, default True
+        Whether to check the DataFrame class is identical.
+    check_names : bool, default True
+        Whether to check that the `names` attribute for both the `index`
+        and `column` attributes of the DataFrame is identical.
+    by_blocks : bool, default False
+        Specify how to compare internal data. If False, compare by columns.
+        If True, compare by blocks.
+    check_exact : bool, default False
+        Whether to compare number exactly. If False, the comparison uses the
+        relative tolerance (``rtol``) and absolute tolerance (``atol``)
+        parameters to determine if two values are considered close,
+        according to the formula: ``|a - b| <= (atol + rtol * |b|)``.
+
+        .. versionchanged:: 2.2.0
+
+            Defaults to True for integer dtypes if none of
+            ``check_exact``, ``rtol`` and ``atol`` are specified.
+    check_datetimelike_compat : bool, default False
+        Compare datetime-like which is comparable ignoring dtype.
+
+        .. deprecated:: 3.0
+
+    check_categorical : bool, default True
+        Whether to compare internal Categorical exactly.
+    check_like : bool, default False
+        If True, ignore the order of index & columns.
+        Note: index labels must match their respective rows
+        (same as in columns) - same labels must be with the same data.
+    check_freq : bool, default True
+        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.
+    check_flags : bool, default True
+        Whether to check the `flags` attribute.
+    rtol : float, default 1e-5
+        Relative tolerance. Only used when check_exact is False.
+    atol : float, default 1e-8
+        Absolute tolerance. Only used when check_exact is False.
+    obj : str, default 'DataFrame'
+        Specify object name being compared, internally used to show appropriate
+        assertion message.
+
+    See Also
+    --------
+    assert_series_equal : Equivalent method for asserting Series equality.
+    DataFrame.equals : Check DataFrame equality.
+
+    Examples
+    --------
+    This example shows comparing two DataFrames that are equal
+    but with columns of differing dtypes.
+
+    >>> from pandas.testing import assert_frame_equal
+    >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    >>> df2 = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
+
+    df1 equals itself.
+
+    >>> assert_frame_equal(df1, df1)
+
+    df1 differs from df2 as column 'b' is of a different type.
+
+    >>> assert_frame_equal(df1, df2)
+    Traceback (most recent call last):
+    ...
+    AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different
+
+    Attribute "dtype" are different
+    [left]:  int64
+    [right]: float64
+
+    Ignore differing dtypes in columns with check_dtype.
+
+    >>> assert_frame_equal(df1, df2, check_dtype=False)
+    """
+    __tracebackhide__ = True
+    _rtol = rtol if rtol is not lib.no_default else 1.0e-5
+    _atol = atol if atol is not lib.no_default else 1.0e-8
+    _check_exact = check_exact if check_exact is not lib.no_default else False
+
+    # instance validation
+    _check_isinstance(left, right, DataFrame)
+
+    if check_frame_type:
+        assert isinstance(left, type(right))
+        # assert_class_equal(left, right, obj=obj)
+
+    # shape comparison
+    if left.shape != right.shape:
+        raise_assert_detail(
+            obj, f"{obj} shape mismatch", f"{left.shape!r}", f"{right.shape!r}"
+        )
+
+    if check_flags:
+        assert left.flags == right.flags, f"{left.flags!r} != {right.flags!r}"
+
+    # index comparison
+    assert_index_equal(
+        left.index,
+        right.index,
+        exact=check_index_type,
+        check_names=check_names,
+        check_exact=_check_exact,
+        check_categorical=check_categorical,
+        check_order=not check_like,
+        rtol=_rtol,
+        atol=_atol,
+        obj=f"{obj}.index",
+    )
+
+    # column comparison
+    assert_index_equal(
+        left.columns,
+        right.columns,
+        exact=check_column_type,
+        check_names=check_names,
+        check_exact=_check_exact,
+        check_categorical=check_categorical,
+        check_order=not check_like,
+        rtol=_rtol,
+        atol=_atol,
+        obj=f"{obj}.columns",
+    )
+
+    if check_like:
+        left = left.reindex_like(right)
+
+    # compare by blocks
+    if by_blocks:
+        rblocks = right._to_dict_of_blocks()
+        lblocks = left._to_dict_of_blocks()
+        for dtype in list(set(list(lblocks.keys()) + list(rblocks.keys()))):
+            assert dtype in lblocks
+            assert dtype in rblocks
+            assert_frame_equal(
+                lblocks[dtype], rblocks[dtype], check_dtype=check_dtype, obj=obj
+            )
+
+    # compare by columns
+    else:
+        for i, col in enumerate(left.columns):
+            # We have already checked that columns match, so we can do
+            #  fast location-based lookups
+            lcol = left._ixs(i, axis=1)
+            rcol = right._ixs(i, axis=1)
+
+            # GH #38183
+            # use check_index=False, because we do not want to run
+            # assert_index_equal for each column,
+            # as we already checked it for the whole dataframe before.
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    message="the 'check_datetimelike_compat' keyword",
+                    category=Pandas4Warning,
+                )
+                assert_series_equal(
+                    lcol,
+                    rcol,
+                    check_dtype=check_dtype,
+                    check_index_type=check_index_type,
+                    check_exact=check_exact,
+                    check_names=check_names,
+                    check_datetimelike_compat=check_datetimelike_compat,
+                    check_categorical=check_categorical,
+                    check_freq=check_freq,
+                    obj=f'{obj}.iloc[:, {i}] (column name="{col}")',
+                    rtol=rtol,
+                    atol=atol,
+                    check_index=False,
+                    check_flags=False,
+                )
+
+
+def assert_equal(left, right, **kwargs) -> None:
+    """
+    Wrapper for tm.assert_*_equal to dispatch to the appropriate test function.
+
+    Parameters
+    ----------
+    left, right : Index, Series, DataFrame, ExtensionArray, or np.ndarray
+        The two items to be compared.
+    **kwargs
+        All keyword arguments are passed through to the underlying assert method.
+    """
+    __tracebackhide__ = True
+
+    if isinstance(left, Index):
+        assert_index_equal(left, right, **kwargs)
+        if isinstance(left, (DatetimeIndex, TimedeltaIndex)):
+            assert left.freq == right.freq, (left.freq, right.freq)
+    elif isinstance(left, Series):
+        assert_series_equal(left, right, **kwargs)
+    elif isinstance(left, DataFrame):
+        assert_frame_equal(left, right, **kwargs)
+    elif isinstance(left, IntervalArray):
+        assert_interval_array_equal(left, right, **kwargs)
+    elif isinstance(left, PeriodArray):
+        assert_period_array_equal(left, right, **kwargs)
+    elif isinstance(left, DatetimeArray):
+        assert_datetime_array_equal(left, right, **kwargs)
+    elif isinstance(left, TimedeltaArray):
+        assert_timedelta_array_equal(left, right, **kwargs)
+    elif isinstance(left, ExtensionArray):
+        assert_extension_array_equal(left, right, **kwargs)
+    elif isinstance(left, np.ndarray):
+        assert_numpy_array_equal(left, right, **kwargs)
+    elif isinstance(left, str):
+        assert kwargs == {}
+        assert left == right
+    else:
+        assert kwargs == {}
+        assert_almost_equal(left, right)
+
+
+def assert_sp_array_equal(left, right) -> None:
+    """
+    Check that the left and right SparseArray are equal.
+
+    Parameters
+    ----------
+    left : SparseArray
+    right : SparseArray
+    """
+    _check_isinstance(left, right, pd.arrays.SparseArray)
+
+    assert_numpy_array_equal(left.sp_values, right.sp_values)
+
+    # SparseIndex comparison
+    assert isinstance(left.sp_index, SparseIndex)
+    assert isinstance(right.sp_index, SparseIndex)
+
+    left_index = left.sp_index
+    right_index = right.sp_index
+
+    if not left_index.equals(right_index):
+        raise_assert_detail(
+            "SparseArray.index", "index are not equal", left_index, right_index
+        )
+    else:
+        # Just ensure a
+        pass
+
+    assert_attr_equal("fill_value", left, right)
+    assert_attr_equal("dtype", left, right)
+    assert_numpy_array_equal(left.to_dense(), right.to_dense())
+
+
+def assert_contains_all(iterable, dic) -> None:
+    for k in iterable:
+        assert k in dic, f"Did not contain item: {k!r}"
+
+
+def assert_copy(iter1, iter2, **eql_kwargs) -> None:
+    """
+    iter1, iter2: iterables that produce elements
+    comparable with assert_almost_equal
+
+    Checks that the elements are equal, but not
+    the same object. (Does not check that items
+    in sequences are also not the same object)
+    """
+    for elem1, elem2 in zip(iter1, iter2, strict=True):
+        assert_almost_equal(elem1, elem2, **eql_kwargs)
+        msg = (
+            f"Expected object {type(elem1)!r} and object {type(elem2)!r} to be "
+            "different objects, but they were the same object."
+        )
+        assert elem1 is not elem2, msg
+
+
+def is_extension_array_dtype_and_needs_i8_conversion(
+    left_dtype: DtypeObj, right_dtype: DtypeObj
+) -> bool:
+    """
+    Checks that we have the combination of an ExtensionArraydtype and
+    a dtype that should be converted to int64
+
+    Returns
+    -------
+    bool
+
+    Related to issue #37609
+    """
+    return isinstance(left_dtype, ExtensionDtype) and needs_i8_conversion(right_dtype)
+
+
+def assert_indexing_slices_equivalent(ser: Series, l_slc: slice, i_slc: slice) -> None:
+    """
+    Check that ser.iloc[i_slc] matches ser.loc[l_slc] and, if applicable,
+    ser[l_slc].
+    """
+    expected = ser.iloc[i_slc]
+
+    assert_series_equal(ser.loc[l_slc], expected)
+
+    if not is_integer_dtype(ser.index):
+        # For integer indices, .loc and plain getitem are position-based.
+        assert_series_equal(ser[l_slc], expected)
+
+
+def assert_metadata_equivalent(
+    left: DataFrame | Series, right: DataFrame | Series | None = None
+) -> None:
+    """
+    Check that ._metadata attributes are equivalent.
+    """
+    for attr in left._metadata:
+        val = getattr(left, attr, None)
+        if right is None:
+            assert val is None
+        else:
+            assert val == getattr(right, attr, None)
diff --git a/pandas/_testing/compat.py b/pandas/_testing/compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..722ba61a3227f88821c27e7b89bc27749cbb83fd
--- /dev/null
+++ b/pandas/_testing/compat.py
@@ -0,0 +1,30 @@
+"""
+Helpers for sharing tests between DataFrame/Series
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pandas import DataFrame
+
+if TYPE_CHECKING:
+    from pandas._typing import DtypeObj
+
+
+def get_dtype(obj) -> DtypeObj:
+    if isinstance(obj, DataFrame):
+        # Note: we are assuming only one column
+        return obj.dtypes.iat[0]
+    else:
+        return obj.dtype
+
+
+def get_obj(df: DataFrame, klass):
+    """
+    For sharing tests using frame_or_series, either return the DataFrame
+    unchanged or return it's first column as a Series.
+    """
+    if klass is DataFrame:
+        return df
+    return df._ixs(0, axis=1)
diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcb20d00904cba798ae4c9a283c7d0eed44169c4
--- /dev/null
+++ b/pandas/_testing/contexts.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+import os
+import sys
+from typing import (
+    IO,
+    TYPE_CHECKING,
+)
+
+from pandas.compat import CHAINED_WARNING_DISABLED
+from pandas.errors import ChainedAssignmentError
+
+from pandas.io.common import get_handle
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from pandas._typing import (
+        BaseBuffer,
+        CompressionOptions,
+        FilePath,
+    )
+
+
+@contextmanager
+def decompress_file(
+    path: FilePath | BaseBuffer, compression: CompressionOptions
+) -> Generator[IO[bytes]]:
+    """
+    Open a compressed file and return a file object.
+
+    Parameters
+    ----------
+    path : str
+        The path where the file is read from.
+
+    compression : {'gzip', 'bz2', 'zip', 'xz', 'zstd', None}
+        Name of the decompression to use
+
+    Returns
+    -------
+    file object
+    """
+    with get_handle(path, "rb", compression=compression, is_text=False) as handle:
+        yield handle.handle
+
+
+@contextmanager
+def set_timezone(tz: str) -> Generator[None]:
+    """
+    Context manager for temporarily setting a timezone.
+
+    Parameters
+    ----------
+    tz : str
+        A string representing a valid timezone.
+
+    Examples
+    --------
+    >>> from datetime import datetime
+    >>> from dateutil.tz import tzlocal
+    >>> tzlocal().tzname(datetime(2021, 1, 1))  # doctest: +SKIP
+    'IST'
+
+    >>> with set_timezone("US/Eastern"):
+    ...     tzlocal().tzname(datetime(2021, 1, 1))
+    'EST'
+    """
+    import time
+
+    def setTZ(tz) -> None:
+        if hasattr(time, "tzset"):
+            if tz is None:
+                try:
+                    del os.environ["TZ"]
+                except KeyError:
+                    pass
+            else:
+                os.environ["TZ"] = tz
+                # Next line allows typing checks to pass on Windows
+                if sys.platform != "win32":
+                    time.tzset()
+
+    orig_tz = os.environ.get("TZ")
+    setTZ(tz)
+    try:
+        yield
+    finally:
+        setTZ(orig_tz)
+
+
+@contextmanager
+def with_csv_dialect(name: str, **kwargs) -> Generator[None]:
+    """
+    Context manager to temporarily register a CSV dialect for parsing CSV.
+
+    Parameters
+    ----------
+    name : str
+        The name of the dialect.
+    kwargs : mapping
+        The parameters for the dialect.
+
+    Raises
+    ------
+    ValueError : the name of the dialect conflicts with a builtin one.
+
+    See Also
+    --------
+    csv : Python's CSV library.
+    """
+    import csv
+
+    _BUILTIN_DIALECTS = {"excel", "excel-tab", "unix"}
+
+    if name in _BUILTIN_DIALECTS:
+        raise ValueError("Cannot override builtin dialect.")
+
+    csv.register_dialect(name, **kwargs)
+    try:
+        yield
+    finally:
+        csv.unregister_dialect(name)
+
+
+def raises_chained_assignment_error(extra_warnings=(), extra_match=()):
+    from pandas._testing import assert_produces_warning
+
+    if CHAINED_WARNING_DISABLED:
+        if not extra_warnings:
+            from contextlib import nullcontext
+
+            return nullcontext()
+        else:
+            return assert_produces_warning(
+                extra_warnings,
+                match=extra_match,
+            )
+    else:
+        warning = ChainedAssignmentError
+        match = (
+            "A value is being set on a copy of a DataFrame or Series "
+            "through chained assignment"
+        )
+        if extra_warnings:
+            warning = (warning, *extra_warnings)  # type: ignore[assignment]
+        return assert_produces_warning(
+            warning,
+            match=(match, *extra_match),
+        )
diff --git a/pandas/api/__init__.py b/pandas/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a016e67a41360eadb5475dc961fba2c000a7f32b
--- /dev/null
+++ b/pandas/api/__init__.py
@@ -0,0 +1,19 @@
+"""public toolkit API"""
+
+from pandas.api import (
+    executors,
+    extensions,
+    indexers,
+    interchange,
+    types,
+    typing,
+)
+
+__all__ = [
+    "executors",
+    "extensions",
+    "indexers",
+    "interchange",
+    "types",
+    "typing",
+]
diff --git a/pandas/api/executors/__init__.py b/pandas/api/executors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..04c94ee688332ea042ae4b6511a91e1a2653880f
--- /dev/null
+++ b/pandas/api/executors/__init__.py
@@ -0,0 +1,7 @@
+"""
+Public API for function executor engines to be used with ``map`` and ``apply``.
+"""
+
+from pandas.core.apply import BaseExecutionEngine
+
+__all__ = ["BaseExecutionEngine"]
diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c88c0d35b4d70270e2bb7b4a1a6e77f03555592
--- /dev/null
+++ b/pandas/api/extensions/__init__.py
@@ -0,0 +1,33 @@
+"""
+Public API for extending pandas objects.
+"""
+
+from pandas._libs.lib import no_default
+
+from pandas.core.dtypes.base import (
+    ExtensionDtype,
+    register_extension_dtype,
+)
+
+from pandas.core.accessor import (
+    register_dataframe_accessor,
+    register_index_accessor,
+    register_series_accessor,
+)
+from pandas.core.algorithms import take
+from pandas.core.arrays import (
+    ExtensionArray,
+    ExtensionScalarOpsMixin,
+)
+
+__all__ = [
+    "ExtensionArray",
+    "ExtensionDtype",
+    "ExtensionScalarOpsMixin",
+    "no_default",
+    "register_dataframe_accessor",
+    "register_extension_dtype",
+    "register_index_accessor",
+    "register_series_accessor",
+    "take",
+]
diff --git a/pandas/api/indexers/__init__.py b/pandas/api/indexers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3c6546218de4ce8ddc3b044fc26b6946065b07e
--- /dev/null
+++ b/pandas/api/indexers/__init__.py
@@ -0,0 +1,17 @@
+"""
+Public API for Rolling Window Indexers.
+"""
+
+from pandas.core.indexers import check_array_indexer
+from pandas.core.indexers.objects import (
+    BaseIndexer,
+    FixedForwardWindowIndexer,
+    VariableOffsetWindowIndexer,
+)
+
+__all__ = [
+    "BaseIndexer",
+    "FixedForwardWindowIndexer",
+    "VariableOffsetWindowIndexer",
+    "check_array_indexer",
+]
diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aded37abc7224a6b24180beba2f52be60d7ae25d
--- /dev/null
+++ b/pandas/api/interchange/__init__.py
@@ -0,0 +1,8 @@
+"""
+Public API for DataFrame interchange protocol.
+"""
+
+from pandas.core.interchange.dataframe_protocol import DataFrame
+from pandas.core.interchange.from_dataframe import from_dataframe
+
+__all__ = ["DataFrame", "from_dataframe"]
diff --git a/pandas/api/internals.py b/pandas/api/internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d8992a875758d3cecad32cd20f5feab9e14cca
--- /dev/null
+++ b/pandas/api/internals.py
@@ -0,0 +1,62 @@
+import numpy as np
+
+from pandas._typing import ArrayLike
+
+from pandas import (
+    DataFrame,
+    Index,
+)
+from pandas.core.internals.api import _make_block
+from pandas.core.internals.managers import BlockManager as _BlockManager
+
+
+def create_dataframe_from_blocks(
+    blocks: list[tuple[ArrayLike, np.ndarray]], index: Index, columns: Index
+) -> DataFrame:
+    """
+    Low-level function to create a DataFrame from arrays as they are
+    representing the block structure of the resulting DataFrame.
+
+    Attention: this is an advanced, low-level function that should only be
+    used if you know that the below-mentioned assumptions are guaranteed.
+    If passing data that do not follow those assumptions, subsequent
+    subsequent operations on the resulting DataFrame might lead to strange
+    errors.
+    For almost all use cases, you should use the standard pd.DataFrame(..)
+    constructor instead. If you are planning to use this function, let us
+    know by opening an issue at https://github.com/pandas-dev/pandas/issues.
+
+    Assumptions:
+
+    - The block arrays are either a 2D numpy array or a pandas ExtensionArray
+    - In case of a numpy array, it is assumed to already be in the expected
+      shape for Blocks (2D, (cols, rows), i.e. transposed compared to the
+      DataFrame columns).
+    - All arrays are taken as is (no type inference) and expected to have the
+      correct size.
+    - The placement arrays have the correct length (equalling the number of
+      columns that its equivalent block array represents), and all placement
+      arrays together form a complete set of 0 to n_columns - 1.
+
+    Parameters
+    ----------
+    blocks : list of tuples of (block_array, block_placement)
+        This should be a list of tuples existing of (block_array, block_placement),
+        where:
+
+        - block_array is a 2D numpy array or a 1D ExtensionArray, following the
+          requirements listed above.
+        - block_placement is a 1D integer numpy array
+    index : Index
+        The Index object for the `index` of the resulting DataFrame.
+    columns : Index
+        The Index object for the `columns` of the resulting DataFrame.
+
+    Returns
+    -------
+    DataFrame
+    """
+    block_objs = [_make_block(*block) for block in blocks]
+    axes = [columns, index]
+    mgr = _BlockManager(block_objs, axes)
+    return DataFrame._from_mgr(mgr, mgr.axes)
diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a5c742b1628b797e1015247b1a2b52e0bda4470
--- /dev/null
+++ b/pandas/api/types/__init__.py
@@ -0,0 +1,23 @@
+"""
+Public toolkit API.
+"""
+
+from pandas._libs.lib import infer_dtype
+
+from pandas.core.dtypes.api import *  # noqa: F403
+from pandas.core.dtypes.concat import union_categoricals
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+    IntervalDtype,
+    PeriodDtype,
+)
+
+__all__ = [
+    "CategoricalDtype",
+    "DatetimeTZDtype",
+    "IntervalDtype",
+    "PeriodDtype",
+    "infer_dtype",
+    "union_categoricals",
+]
diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..de6657b58ee80337097bbeb08f1ed54bb47f42d7
--- /dev/null
+++ b/pandas/api/typing/__init__.py
@@ -0,0 +1,61 @@
+"""
+Public API classes that store intermediate results useful for type-hinting.
+"""
+
+from pandas._libs import NaTType
+from pandas._libs.lib import NoDefault
+from pandas._libs.missing import NAType
+
+from pandas.core.col import Expression
+from pandas.core.groupby import (
+    DataFrameGroupBy,
+    SeriesGroupBy,
+)
+from pandas.core.indexes.frozen import FrozenList
+from pandas.core.resample import (
+    DatetimeIndexResamplerGroupby,
+    PeriodIndexResamplerGroupby,
+    Resampler,
+    TimedeltaIndexResamplerGroupby,
+    TimeGrouper,
+)
+from pandas.core.window import (
+    Expanding,
+    ExpandingGroupby,
+    ExponentialMovingWindow,
+    ExponentialMovingWindowGroupby,
+    Rolling,
+    RollingGroupby,
+    Window,
+)
+
+# TODO: Can't import Styler without importing jinja2
+# from pandas.io.formats.style import Styler
+from pandas.io.json._json import JsonReader
+from pandas.io.sas.sasreader import SASReader
+from pandas.io.stata import StataReader
+
+__all__ = [
+    "DataFrameGroupBy",
+    "DatetimeIndexResamplerGroupby",
+    "Expanding",
+    "ExpandingGroupby",
+    "ExponentialMovingWindow",
+    "ExponentialMovingWindowGroupby",
+    "Expression",
+    "FrozenList",
+    "JsonReader",
+    "NAType",
+    "NaTType",
+    "NoDefault",
+    "PeriodIndexResamplerGroupby",
+    "Resampler",
+    "Rolling",
+    "RollingGroupby",
+    "SASReader",
+    "SeriesGroupBy",
+    "StataReader",
+    "TimeGrouper",
+    "TimedeltaIndexResamplerGroupby",
+    "Window",
+]
diff --git a/pandas/api/typing/aliases.py b/pandas/api/typing/aliases.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9cad814393722a45e3e3fcbadb34e93cc968f31
--- /dev/null
+++ b/pandas/api/typing/aliases.py
@@ -0,0 +1,145 @@
+from pandas._typing import (
+    AggFuncType,
+    AlignJoin,
+    AnyAll,
+    AnyArrayLike,
+    ArrayLike,
+    AstypeArg,
+    Axes,
+    Axis,
+    ColspaceArgType,
+    CompressionOptions,
+    CorrelationMethod,
+    CSVEngine,
+    DropKeep,
+    Dtype,
+    DtypeArg,
+    DtypeBackend,
+    DtypeObj,
+    ExcelWriterIfSheetExists,
+    ExcelWriterMergeCells,
+    FilePath,
+    FillnaOptions,
+    FloatFormatType,
+    FormattersType,
+    FromDictOrient,
+    HTMLFlavors,
+    IgnoreRaise,
+    IndexLabel,
+    InterpolateOptions,
+    IntervalClosedType,
+    IntervalLeftRight,
+    JoinHow,
+    JoinValidate,
+    JSONEngine,
+    JSONSerializable,
+    ListLike,
+    MergeHow,
+    MergeValidate,
+    NaPosition,
+    NsmallestNlargestKeep,
+    OpenFileErrors,
+    Ordered,
+    ParquetCompressionOptions,
+    QuantileInterpolation,
+    ReadBuffer,
+    ReadCsvBuffer,
+    ReadPickleBuffer,
+    ReindexMethod,
+    Scalar,
+    ScalarIndexer,
+    SequenceIndexer,
+    SequenceNotStr,
+    SliceType,
+    SortKind,
+    StorageOptions,
+    Suffixes,
+    TakeIndexer,
+    TimeAmbiguous,
+    TimedeltaConvertibleTypes,
+    TimeGrouperOrigin,
+    TimeNonexistent,
+    TimestampConvertibleTypes,
+    TimeUnit,
+    ToStataByteorder,
+    ToTimestampHow,
+    UpdateJoin,
+    UsecolsArgType,
+    WindowingRankType,
+    WriteBuffer,
+    WriteExcelBuffer,
+    XMLParsers,
+)
+
+__all__ = [
+    "AggFuncType",
+    "AlignJoin",
+    "AnyAll",
+    "AnyArrayLike",
+    "ArrayLike",
+    "AstypeArg",
+    "Axes",
+    "Axis",
+    "CSVEngine",
+    "ColspaceArgType",
+    "CompressionOptions",
+    "CorrelationMethod",
+    "DropKeep",
+    "Dtype",
+    "DtypeArg",
+    "DtypeBackend",
+    "DtypeObj",
+    "ExcelWriterIfSheetExists",
+    "ExcelWriterMergeCells",
+    "FilePath",
+    "FillnaOptions",
+    "FloatFormatType",
+    "FormattersType",
+    "FromDictOrient",
+    "HTMLFlavors",
+    "IgnoreRaise",
+    "IndexLabel",
+    "InterpolateOptions",
+    "IntervalClosedType",
+    "IntervalLeftRight",
+    "JSONEngine",
+    "JSONSerializable",
+    "JoinHow",
+    "JoinValidate",
+    "ListLike",
+    "MergeHow",
+    "MergeValidate",
+    "NaPosition",
+    "NsmallestNlargestKeep",
+    "OpenFileErrors",
+    "Ordered",
+    "ParquetCompressionOptions",
+    "QuantileInterpolation",
+    "ReadBuffer",
+    "ReadCsvBuffer",
+    "ReadPickleBuffer",
+    "ReindexMethod",
+    "Scalar",
+    "ScalarIndexer",
+    "SequenceIndexer",
+    "SequenceNotStr",
+    "SliceType",
+    "SortKind",
+    "StorageOptions",
+    "Suffixes",
+    "TakeIndexer",
+    "TimeAmbiguous",
+    "TimeGrouperOrigin",
+    "TimeNonexistent",
+    "TimeUnit",
+    "TimedeltaConvertibleTypes",
+    "TimestampConvertibleTypes",
+    "ToStataByteorder",
+    "ToTimestampHow",
+    "UpdateJoin",
+    "UsecolsArgType",
+    "WindowingRankType",
+    "WriteBuffer",
+    "WriteExcelBuffer",
+    "XMLParsers",
+]
diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5c1c98da1c785fdd51503ea6005c3eb014a8311
--- /dev/null
+++ b/pandas/arrays/__init__.py
@@ -0,0 +1,37 @@
+"""
+All of pandas' ExtensionArrays.
+
+See :ref:`extending.extension-types` for more.
+"""
+
+from pandas.core.arrays import (
+    ArrowExtensionArray,
+    ArrowStringArray,
+    BooleanArray,
+    Categorical,
+    DatetimeArray,
+    FloatingArray,
+    IntegerArray,
+    IntervalArray,
+    NumpyExtensionArray,
+    PeriodArray,
+    SparseArray,
+    StringArray,
+    TimedeltaArray,
+)
+
+__all__ = [
+    "ArrowExtensionArray",
+    "ArrowStringArray",
+    "BooleanArray",
+    "Categorical",
+    "DatetimeArray",
+    "FloatingArray",
+    "IntegerArray",
+    "IntervalArray",
+    "NumpyExtensionArray",
+    "PeriodArray",
+    "SparseArray",
+    "StringArray",
+    "TimedeltaArray",
+]
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b56c63961550b1d06a333324d2e70c70a5ab9c
--- /dev/null
+++ b/pandas/compat/__init__.py
@@ -0,0 +1,173 @@
+"""
+compat
+======
+
+Cross-compatible functions for different versions of Python.
+
+Other items:
+* platform checker
+"""
+
+from __future__ import annotations
+
+import os
+import platform
+import sys
+from typing import TYPE_CHECKING
+
+from pandas.compat._constants import (
+    CHAINED_WARNING_DISABLED,
+    IS64,
+    ISMUSL,
+    PY312,
+    PY314,
+    PYPY,
+    WASM,
+)
+from pandas.compat.numpy import is_numpy_dev
+from pandas.compat.pyarrow import (
+    HAS_PYARROW,
+    PYARROW_MIN_VERSION,
+    pa_version_under14p0,
+    pa_version_under14p1,
+    pa_version_under16p0,
+    pa_version_under17p0,
+    pa_version_under18p0,
+    pa_version_under19p0,
+    pa_version_under20p0,
+    pa_version_under21p0,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import F
+
+
+def set_function_name(f: F, name: str, cls: type) -> F:
+    """
+    Bind the name/qualname attributes of the function.
+    """
+    f.__name__ = name
+    f.__qualname__ = f"{cls.__name__}.{name}"
+    f.__module__ = cls.__module__
+    return f
+
+
+def is_platform_little_endian() -> bool:
+    """
+    Checking if the running platform is little endian.
+
+    Returns
+    -------
+    bool
+        True if the running platform is little endian.
+    """
+    return sys.byteorder == "little"
+
+
+def is_platform_windows() -> bool:
+    """
+    Checking if the running platform is windows.
+
+    Returns
+    -------
+    bool
+        True if the running platform is windows.
+    """
+    return sys.platform in ["win32", "cygwin"]
+
+
+def is_platform_linux() -> bool:
+    """
+    Checking if the running platform is linux.
+
+    Returns
+    -------
+    bool
+        True if the running platform is linux.
+    """
+    return sys.platform == "linux"
+
+
+def is_platform_mac() -> bool:
+    """
+    Checking if the running platform is mac.
+
+    Returns
+    -------
+    bool
+        True if the running platform is mac.
+    """
+    return sys.platform == "darwin"
+
+
+def is_platform_arm() -> bool:
+    """
+    Checking if the running platform use ARM architecture.
+
+    Returns
+    -------
+    bool
+        True if the running platform uses ARM architecture.
+    """
+    return platform.machine() in ("arm64", "aarch64") or platform.machine().startswith(
+        "armv"
+    )
+
+
+def is_platform_power() -> bool:
+    """
+    Checking if the running platform use Power architecture.
+
+    Returns
+    -------
+    bool
+        True if the running platform uses ARM architecture.
+    """
+    return platform.machine() in ("ppc64", "ppc64le")
+
+
+def is_platform_riscv64() -> bool:
+    """
+    Checking if the running platform use riscv64 architecture.
+
+    Returns
+    -------
+    bool
+        True if the running platform uses riscv64 architecture.
+    """
+    return platform.machine() == "riscv64"
+
+
+def is_ci_environment() -> bool:
+    """
+    Checking if running in a continuous integration environment by checking
+    the PANDAS_CI environment variable.
+
+    Returns
+    -------
+    bool
+        True if the running in a continuous integration environment.
+    """
+    return os.environ.get("PANDAS_CI", "0") == "1"
+
+
+__all__ = [
+    "CHAINED_WARNING_DISABLED",
+    "HAS_PYARROW",
+    "IS64",
+    "ISMUSL",
+    "PY312",
+    "PY314",
+    "PYARROW_MIN_VERSION",
+    "PYPY",
+    "WASM",
+    "is_numpy_dev",
+    "pa_version_under14p0",
+    "pa_version_under14p1",
+    "pa_version_under16p0",
+    "pa_version_under17p0",
+    "pa_version_under18p0",
+    "pa_version_under19p0",
+    "pa_version_under20p0",
+    "pa_version_under21p0",
+]
diff --git a/pandas/compat/_constants.py b/pandas/compat/_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ad31e0725bd448c9a836009581508c6d3230bd6
--- /dev/null
+++ b/pandas/compat/_constants.py
@@ -0,0 +1,35 @@
+"""
+_constants
+======
+
+Constants relevant for the Python implementation.
+"""
+
+from __future__ import annotations
+
+import platform
+import sys
+import sysconfig
+
+IS64 = sys.maxsize > 2**32
+
+PY312 = sys.version_info >= (3, 12)
+PY314 = sys.version_info >= (3, 14)
+PYPY = platform.python_implementation() == "PyPy"
+WASM = (sys.platform == "emscripten") or (platform.machine() in ["wasm32", "wasm64"])
+ISMUSL = "musl" in (sysconfig.get_config_var("HOST_GNU_TYPE") or "")
+# the refcount for self in a chained __setitem__/.(i)loc indexing/method call
+REF_COUNT = 2 if PY314 else 3
+REF_COUNT_IDX = 2
+REF_COUNT_METHOD = 1 if PY314 else 2
+CHAINED_WARNING_DISABLED = PYPY
+
+
+__all__ = [
+    "IS64",
+    "ISMUSL",
+    "PY312",
+    "PY314",
+    "PYPY",
+    "WASM",
+]
diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
new file mode 100644
index 0000000000000000000000000000000000000000..42bd965e88c86c989fd7d1f448fd8aed1bacad8e
--- /dev/null
+++ b/pandas/compat/_optional.py
@@ -0,0 +1,191 @@
+from __future__ import annotations
+
+import importlib
+import sys
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    overload,
+)
+import warnings
+
+from pandas.util._exceptions import find_stack_level
+
+from pandas.util.version import Version
+
+if TYPE_CHECKING:
+    import types
+
+# Update install.rst, actions-311-minimum_versions.yaml,
+# deps_minimum.toml & pyproject.toml when updating versions!
+
+VERSIONS = {
+    "adbc-driver-postgresql": "1.2.0",
+    "adbc-driver-sqlite": "1.2.0",
+    "bs4": "4.12.3",
+    "bottleneck": "1.4.2",
+    "fastparquet": "2024.11.0",
+    "fsspec": "2024.10.0",
+    "html5lib": "1.1",
+    "hypothesis": "6.116.0",
+    "gcsfs": "2024.10.0",
+    "jinja2": "3.1.5",
+    "lxml.etree": "5.3.0",
+    "matplotlib": "3.9.3",
+    "numba": "0.60.0",
+    "numexpr": "2.10.2",
+    "odfpy": "1.4.1",
+    "openpyxl": "3.1.5",
+    "psycopg2": "2.9.10",  # (dt dec pq3 ext lo64)
+    "pymysql": "1.1.1",
+    "pyarrow": "13.0.0",
+    "pyiceberg": "0.8.1",
+    "pyreadstat": "1.2.8",
+    "pytest": "8.3.4",
+    "python-calamine": "0.3.0",
+    "pytz": "2024.2",
+    "pyxlsb": "1.0.10",
+    "s3fs": "2024.10.0",
+    "scipy": "1.14.1",
+    "sqlalchemy": "2.0.36",
+    "tables": "3.10.1",
+    "tabulate": "0.9.0",
+    "xarray": "2024.10.0",
+    "xlrd": "2.0.1",
+    "xlsxwriter": "3.2.0",
+    "zstandard": "0.23.0",
+    "qtpy": "2.4.2",
+    "pyqt5": "5.15.9",
+}
+
+# A mapping from import name to package name (on PyPI) for packages where
+# these two names are different.
+
+INSTALL_MAPPING = {
+    "bs4": "beautifulsoup4",
+    "bottleneck": "Bottleneck",
+    "jinja2": "Jinja2",
+    "lxml.etree": "lxml",
+    "odf": "odfpy",
+    "python_calamine": "python-calamine",
+    "sqlalchemy": "SQLAlchemy",
+    "tables": "pytables",
+}
+
+
+def get_version(module: types.ModuleType) -> str:
+    version = getattr(module, "__version__", None)
+
+    if version is None:
+        raise ImportError(f"Can't determine version for {module.__name__}")
+    if module.__name__ == "psycopg2":
+        # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version
+        version = version.split()[0]
+    return version
+
+
+@overload
+def import_optional_dependency(
+    name: str,
+    extra: str = ...,
+    min_version: str | None = ...,
+    *,
+    errors: Literal["raise"] = ...,
+) -> types.ModuleType: ...
+
+
+@overload
+def import_optional_dependency(
+    name: str,
+    extra: str = ...,
+    min_version: str | None = ...,
+    *,
+    errors: Literal["warn", "ignore"],
+) -> types.ModuleType | None: ...
+
+
+def import_optional_dependency(
+    name: str,
+    extra: str = "",
+    min_version: str | None = None,
+    *,
+    errors: Literal["raise", "warn", "ignore"] = "raise",
+) -> types.ModuleType | None:
+    """
+    Import an optional dependency.
+
+    By default, if a dependency is missing an ImportError with a nice
+    message will be raised. If a dependency is present, but too old,
+    we raise.
+
+    Parameters
+    ----------
+    name : str
+        The module name.
+    extra : str
+        Additional text to include in the ImportError message.
+    errors : str {'raise', 'warn', 'ignore'}
+        What to do when a dependency is not found or its version is too old.
+
+        * raise : Raise an ImportError
+        * warn : Only applicable when a module's version is to old.
+          Warns that the version is too old and returns None
+        * ignore: If the module is not installed, return None, otherwise,
+          return the module, even if the version is too old.
+          It's expected that users validate the version locally when
+          using ``errors="ignore"`` (see. ``io/html.py``)
+    min_version : str, default None
+        Specify a minimum version that is different from the global pandas
+        minimum version required.
+    Returns
+    -------
+    maybe_module : Optional[ModuleType]
+        The imported module, when found and the version is correct.
+        None is returned when the package is not found and `errors`
+        is False, or when the package's version is too old and `errors`
+        is ``'warn'`` or ``'ignore'``.
+    """
+    assert errors in {"warn", "raise", "ignore"}
+
+    package_name = INSTALL_MAPPING.get(name)
+    install_name = package_name if package_name is not None else name
+
+    msg = (
+        f"`Import {install_name}` failed. {extra} "
+        f"Use pip or conda to install the {install_name} package."
+    )
+    try:
+        module = importlib.import_module(name)
+    except ImportError as err:
+        if errors == "raise":
+            raise ImportError(msg) from err
+        return None
+
+    # Handle submodules: if we have submodule, grab parent module from sys.modules
+    parent = name.split(".")[0]
+    if parent != name:
+        install_name = parent
+        module_to_get = sys.modules[install_name]
+    else:
+        module_to_get = module
+    minimum_version = min_version if min_version is not None else VERSIONS.get(parent)
+    if minimum_version:
+        version = get_version(module_to_get)
+        if version and Version(version) < Version(minimum_version):
+            msg = (
+                f"Pandas requires version '{minimum_version}' or newer of '{parent}' "
+                f"(version '{version}' currently installed)."
+            )
+            if errors == "warn":
+                warnings.warn(
+                    msg,
+                    UserWarning,
+                    stacklevel=find_stack_level(),
+                )
+                return None
+            elif errors == "raise":
+                raise ImportError(msg)
+            else:
+                return None
+
+    return module
diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb4a69232b277b92c85a1f995269e0ef278a43f
--- /dev/null
+++ b/pandas/compat/pickle_compat.py
@@ -0,0 +1,143 @@
+"""
+Pickle compatibility to pandas version 1.0
+"""
+
+from __future__ import annotations
+
+import contextlib
+import io
+import pickle
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+import numpy as np
+
+from pandas._libs.arrays import NDArrayBacked
+from pandas._libs.tslibs import BaseOffset
+
+from pandas.core.arrays import (
+    DatetimeArray,
+    PeriodArray,
+    TimedeltaArray,
+)
+from pandas.core.internals import BlockManager
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+# If classes are moved, provide compat here.
+_class_locations_map = {
+    # Re-routing unpickle block logic to go through _unpickle_block instead
+    # for pandas <= 1.3.5
+    ("pandas.core.internals.blocks", "new_block"): (
+        "pandas._libs.internals",
+        "_unpickle_block",
+    ),
+    # Avoid Cython's warning "contradiction to Python 'class private name' rules"
+    ("pandas._libs.tslibs.nattype", "__nat_unpickle"): (
+        "pandas._libs.tslibs.nattype",
+        "_nat_unpickle",
+    ),
+    # 50775, remove Int64Index, UInt64Index & Float64Index from codebase
+    ("pandas.core.indexes.numeric", "Int64Index"): (
+        "pandas.core.indexes.base",
+        "Index",
+    ),
+    ("pandas.core.indexes.numeric", "UInt64Index"): (
+        "pandas.core.indexes.base",
+        "Index",
+    ),
+    ("pandas.core.indexes.numeric", "Float64Index"): (
+        "pandas.core.indexes.base",
+        "Index",
+    ),
+    ("pandas.core.arrays.sparse.dtype", "SparseDtype"): (
+        "pandas.core.dtypes.dtypes",
+        "SparseDtype",
+    ),
+}
+
+
+# our Unpickler sub-class to override methods and some dispatcher
+# functions for compat and uses a non-public class of the pickle module.
+class Unpickler(pickle._Unpickler):
+    def find_class(self, module: str, name: str) -> Any:
+        key = (module, name)
+        module, name = _class_locations_map.get(key, key)
+        return super().find_class(module, name)
+
+    dispatch = pickle._Unpickler.dispatch.copy()
+
+    def load_reduce(self) -> None:
+        stack = self.stack  # type: ignore[attr-defined]
+        args = stack.pop()
+        func = stack[-1]
+
+        try:
+            stack[-1] = func(*args)
+        except TypeError:
+            # If we have a deprecated function,
+            # try to replace and try again.
+            if args and isinstance(args[0], type) and issubclass(args[0], BaseOffset):
+                # TypeError: object.__new__(Day) is not safe, use Day.__new__()
+                cls = args[0]
+                stack[-1] = cls.__new__(*args)
+                return
+            elif args and issubclass(args[0], PeriodArray):
+                cls = args[0]
+                stack[-1] = NDArrayBacked.__new__(*args)
+                return
+            raise
+
+    dispatch[pickle.REDUCE[0]] = load_reduce  # type: ignore[assignment]
+
+    def load_newobj(self) -> None:
+        args = self.stack.pop()  # type: ignore[attr-defined]
+        cls = self.stack.pop()  # type: ignore[attr-defined]
+
+        # compat
+        if issubclass(cls, DatetimeArray) and not args:
+            arr = np.array([], dtype="M8[ns]")
+            obj = cls.__new__(cls, arr, arr.dtype)
+        elif issubclass(cls, TimedeltaArray) and not args:
+            arr = np.array([], dtype="m8[ns]")
+            obj = cls.__new__(cls, arr, arr.dtype)
+        elif cls is BlockManager and not args:
+            obj = cls.__new__(cls, (), [], False)
+        else:
+            obj = cls.__new__(cls, *args)
+        self.append(obj)  # type: ignore[attr-defined]
+
+    dispatch[pickle.NEWOBJ[0]] = load_newobj  # type: ignore[assignment]
+
+
+def loads(
+    bytes_object: bytes,
+    *,
+    fix_imports: bool = True,
+    encoding: str = "ASCII",
+    errors: str = "strict",
+) -> Any:
+    """
+    Analogous to pickle._loads.
+    """
+    fd = io.BytesIO(bytes_object)
+    return Unpickler(
+        fd, fix_imports=fix_imports, encoding=encoding, errors=errors
+    ).load()
+
+
+@contextlib.contextmanager
+def patch_pickle() -> Generator[None]:
+    """
+    Temporarily patch pickle to use our unpickler.
+    """
+    orig_loads = pickle.loads
+    try:
+        setattr(pickle, "loads", loads)
+        yield
+    finally:
+        setattr(pickle, "loads", orig_loads)
diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe71e8a82cd936b683c4234e732dc8ee3097af27
--- /dev/null
+++ b/pandas/compat/pyarrow.py
@@ -0,0 +1,91 @@
+"""support pyarrow compatibility across versions"""
+
+from __future__ import annotations
+
+import sys
+from typing import Any
+
+from pandas.util.version import Version
+
+PYARROW_MIN_VERSION = "13.0.0"
+try:
+    import pyarrow as pa
+
+    _palv = Version(Version(pa.__version__).base_version)
+    pa_version_under14p0 = _palv < Version("14.0.0")
+    pa_version_under14p1 = _palv < Version("14.0.1")
+    pa_version_under15p0 = _palv < Version("15.0.0")
+    pa_version_under16p0 = _palv < Version("16.0.0")
+    pa_version_under17p0 = _palv < Version("17.0.0")
+    pa_version_under18p0 = _palv < Version("18.0.0")
+    pa_version_under19p0 = _palv < Version("19.0.0")
+    pa_version_under20p0 = _palv < Version("20.0.0")
+    pa_version_under21p0 = _palv < Version("21.0.0")
+    pa_version_under22p0 = _palv < Version("22.0.0")
+    HAS_PYARROW = _palv >= Version(PYARROW_MIN_VERSION)
+except ImportError:
+    pa_version_under14p0 = True
+    pa_version_under14p1 = True
+    pa_version_under15p0 = True
+    pa_version_under16p0 = True
+    pa_version_under17p0 = True
+    pa_version_under18p0 = True
+    pa_version_under19p0 = True
+    pa_version_under20p0 = True
+    pa_version_under21p0 = True
+    pa_version_under22p0 = True
+    HAS_PYARROW = False
+
+
+def _safe_fill_null(
+    arr: pa.Array | pa.ChunkedArray, fill_value: Any
+) -> pa.Array | pa.ChunkedArray:
+    """
+    Safe wrapper for pyarrow.compute.fill_null with fallback for Windows + pyarrow 21.
+
+    pyarrow 21.0.0 on Windows has a bug in fill_null that incorrectly fills null values.
+    This function uses a fallback implementation for that specific case, otherwise uses
+    the standard pyarrow.compute.fill_null.
+
+    Parameters
+    ----------
+    arr : pyarrow.Array | pyarrow.ChunkedArray
+        Input array with potential null values.
+    fill_value : Any
+        Value to fill nulls with.
+
+    Returns
+    -------
+    pyarrow.Array | pyarrow.ChunkedArray
+        Array with nulls filled with fill_value.
+    """
+    import pyarrow.compute as pc
+
+    is_windows = sys.platform in ["win32", "cygwin"]
+    use_fallback = (
+        HAS_PYARROW and is_windows and not pa_version_under21p0 and pa_version_under22p0
+    )
+    if not use_fallback or isinstance(fill_value, (pa.Array, pa.ChunkedArray)):
+        return pc.fill_null(arr, fill_value)
+
+    fill_scalar = pa.scalar(fill_value, type=arr.type)
+
+    if pa.types.is_duration(arr.type):
+
+        def fill_null_duration(arr: pa.Array, fill_scalar: pa.Scalar) -> pa.Array:
+            mask = pc.is_null(arr)
+            zero_duration = pa.scalar(0, type=arr.type)
+            arr_zeroed = pc.if_else(mask, zero_duration, arr)
+            return pc.if_else(mask, fill_scalar, arr_zeroed)
+
+        if isinstance(arr, pa.ChunkedArray):
+            return pa.chunked_array(
+                [fill_null_duration(chunk, fill_scalar) for chunk in arr.chunks]
+            )
+        return fill_null_duration(arr, fill_scalar)
+
+    if isinstance(arr, pa.ChunkedArray):
+        return pa.chunked_array(
+            [pc.if_else(pc.is_null(chunk), fill_scalar, chunk) for chunk in arr.chunks]
+        )
+    return pc.if_else(pc.is_null(arr), fill_scalar, arr)
diff --git a/pandas/core/__init__.py b/pandas/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..4163de0d2cf011ea5040215ae3892ba8b4541f91
--- /dev/null
+++ b/pandas/core/accessor.py
@@ -0,0 +1,588 @@
+"""
+
+accessor.py contains base classes for implementing accessor properties
+that can be mixed into or pinned onto other pandas classes.
+
+"""
+
+from __future__ import annotations
+
+import functools
+from typing import (
+    TYPE_CHECKING,
+    final,
+)
+import warnings
+
+from pandas.util._decorators import (
+    set_module,
+)
+from pandas.util._exceptions import find_stack_level
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    from pandas._typing import TypeT
+
+    from pandas import Index
+    from pandas.core.generic import NDFrame
+
+
+class DirNamesMixin:
+    _accessors: set[str] = set()
+    _hidden_attrs: frozenset[str] = frozenset()
+
+    @final
+    def _dir_deletions(self) -> set[str]:
+        """
+        Delete unwanted __dir__ for this object.
+        """
+        return self._accessors | self._hidden_attrs
+
+    def _dir_additions(self) -> set[str]:
+        """
+        Add additional __dir__ for this object.
+        """
+        return {accessor for accessor in self._accessors if hasattr(self, accessor)}
+
+    def __dir__(self) -> list[str]:
+        """
+        Provide method name lookup and completion.
+
+        Notes
+        -----
+        Only provide 'public' methods.
+        """
+        rv = set(super().__dir__())
+        rv = (rv - self._dir_deletions()) | self._dir_additions()
+        return sorted(rv)
+
+
+class PandasDelegate:
+    """
+    Abstract base class for delegating methods/properties.
+    """
+
+    def _delegate_property_get(self, name: str, *args, **kwargs):
+        raise TypeError(f"You cannot access the property {name}")
+
+    def _delegate_property_set(self, name: str, value, *args, **kwargs) -> None:
+        raise TypeError(f"The property {name} cannot be set")
+
+    def _delegate_method(self, name: str, *args, **kwargs):
+        raise TypeError(f"You cannot call method {name}")
+
+    @classmethod
+    def _add_delegate_accessors(
+        cls,
+        delegate,
+        accessors: list[str],
+        typ: str,
+        overwrite: bool = False,
+        accessor_mapping: Callable[[str], str] = lambda x: x,
+        raise_on_missing: bool = True,
+    ) -> None:
+        """
+        Add accessors to cls from the delegate class.
+
+        Parameters
+        ----------
+        cls
+            Class to add the methods/properties to.
+        delegate
+            Class to get methods/properties and docstrings.
+        accessors : list of str
+            List of accessors to add.
+        typ : {'property', 'method'}
+        overwrite : bool, default False
+            Overwrite the method/property in the target class if it exists.
+        accessor_mapping: Callable, default lambda x: x
+            Callable to map the delegate's function to the cls' function.
+        raise_on_missing: bool, default True
+            Raise if an accessor does not exist on delegate.
+            False skips the missing accessor.
+        """
+
+        def _create_delegator_property(name: str):
+            def _getter(self):
+                return self._delegate_property_get(name)
+
+            def _setter(self, new_values):
+                return self._delegate_property_set(name, new_values)
+
+            _getter.__name__ = name
+            _setter.__name__ = name
+
+            return property(
+                fget=_getter,
+                fset=_setter,
+                doc=getattr(delegate, accessor_mapping(name)).__doc__,
+            )
+
+        def _create_delegator_method(name: str):
+            method = getattr(delegate, accessor_mapping(name))
+
+            @functools.wraps(method)
+            def f(self, *args, **kwargs):
+                return self._delegate_method(name, *args, **kwargs)
+
+            return f
+
+        for name in accessors:
+            if (
+                not raise_on_missing
+                and getattr(delegate, accessor_mapping(name), None) is None
+            ):
+                continue
+
+            if typ == "property":
+                f = _create_delegator_property(name)
+            else:
+                f = _create_delegator_method(name)
+
+            # don't overwrite existing methods/properties
+            if overwrite or not hasattr(cls, name):
+                setattr(cls, name, f)
+
+
+def delegate_names(
+    delegate,
+    accessors: list[str],
+    typ: str,
+    overwrite: bool = False,
+    accessor_mapping: Callable[[str], str] = lambda x: x,
+    raise_on_missing: bool = True,
+):
+    """
+    Add delegated names to a class using a class decorator.  This provides
+    an alternative usage to directly calling `_add_delegate_accessors`
+    below a class definition.
+
+    Parameters
+    ----------
+    delegate : object
+        The class to get methods/properties & docstrings.
+    accessors : Sequence[str]
+        List of accessor to add.
+    typ : {'property', 'method'}
+    overwrite : bool, default False
+       Overwrite the method/property in the target class if it exists.
+    accessor_mapping: Callable, default lambda x: x
+        Callable to map the delegate's function to the cls' function.
+    raise_on_missing: bool, default True
+        Raise if an accessor does not exist on delegate.
+        False skips the missing accessor.
+
+    Returns
+    -------
+    callable
+        A class decorator.
+
+    Examples
+    --------
+    @delegate_names(Categorical, ["categories", "ordered"], "property")
+    class CategoricalAccessor(PandasDelegate):
+        [...]
+    """
+
+    def add_delegate_accessors(cls):
+        cls._add_delegate_accessors(
+            delegate,
+            accessors,
+            typ,
+            overwrite=overwrite,
+            accessor_mapping=accessor_mapping,
+            raise_on_missing=raise_on_missing,
+        )
+        return cls
+
+    return add_delegate_accessors
+
+
+class Accessor:
+    """
+    Custom property-like object.
+
+    A descriptor for accessors.
+
+    Parameters
+    ----------
+    name : str
+        Namespace that will be accessed under, e.g. ``df.foo``.
+    accessor : cls
+        Class with the extension methods.
+
+    Notes
+    -----
+    For accessor, The class's __init__ method assumes that one of
+    ``Series``, ``DataFrame`` or ``Index`` as the
+    single argument ``data``.
+    """
+
+    def __init__(self, name: str, accessor) -> None:
+        self._name = name
+        self._accessor = accessor
+
+    def __get__(self, obj, cls):
+        if obj is None:
+            # we're accessing the attribute of the class, i.e., Dataset.geo
+            return self._accessor
+        return self._accessor(obj)
+
+
+# Alias kept for downstream libraries
+# TODO: Deprecate as name is now misleading
+CachedAccessor = Accessor
+
+
+def _register_accessor(
+    name: str, cls: type[NDFrame | Index]
+) -> Callable[[TypeT], TypeT]:
+    """
+    Register a custom accessor on objects.
+
+    Parameters
+    ----------
+    name : str
+        Name under which the accessor should be registered. A warning is issued
+        if this name conflicts with a preexisting attribute.
+
+    Returns
+    -------
+    callable
+        A class decorator.
+
+    See Also
+    --------
+    register_dataframe_accessor : Register a custom accessor on DataFrame objects.
+    register_series_accessor : Register a custom accessor on Series objects.
+    register_index_accessor : Register a custom accessor on Index objects.
+
+    Notes
+    -----
+    This function allows you to register a custom-defined accessor class
+    for pandas objects (DataFrame, Series, or Index).
+    The requirements for the accessor class are as follows:
+
+    * Must contain an init method that:
+
+      * accepts a single object
+
+      * raises an AttributeError if the object does not have correctly
+        matching inputs for the accessor
+
+    * Must contain a method for each access pattern.
+
+      * The methods should be able to take any argument signature.
+
+      * Accessible using the @property decorator if no additional arguments are
+        needed.
+
+    """
+
+    def decorator(accessor: TypeT) -> TypeT:
+        if hasattr(cls, name):
+            warnings.warn(
+                f"registration of accessor {accessor!r} under name "
+                f"{name!r} for type {cls!r} is overriding a preexisting "
+                f"attribute with the same name.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+        setattr(cls, name, Accessor(name, accessor))
+        cls._accessors.add(name)
+        return accessor
+
+    return decorator
+
+
+_register_df_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_dataframe_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns):
+...             raise AttributeError("All columns must contain integer values only")
+...         self._obj = pandas_obj
+...
+...     def sum(self):
+...         return self._obj.sum()
+...
+>>> df = pd.DataFrame([[1, 2], ['x', 'y']])
+>>> df.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: All columns must contain integer values only.
+>>> df = pd.DataFrame([[1, 2], [3, 4]])
+>>> df.int_accessor.sum()
+0    4
+1    6
+dtype: int64"""
+
+
+@set_module("pandas.api.extensions")
+def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]:
+    """
+    Register a custom accessor on DataFrame objects.
+
+    Parameters
+    ----------
+    name : str
+        Name under which the accessor should be registered. A warning is issued
+        if this name conflicts with a preexisting attribute.
+
+    Returns
+    -------
+    callable
+        A class decorator.
+
+    See Also
+    --------
+    register_dataframe_accessor : Register a custom accessor on DataFrame objects.
+    register_series_accessor : Register a custom accessor on Series objects.
+    register_index_accessor : Register a custom accessor on Index objects.
+
+    Notes
+    -----
+    This function allows you to register a custom-defined accessor class for DataFrame.
+    The requirements for the accessor class are as follows:
+
+    * Must contain an init method that:
+
+      * accepts a single DataFrame object
+
+      * raises an AttributeError if the DataFrame object does not have correctly
+        matching inputs for the accessor
+
+    * Must contain a method for each access pattern.
+
+      * The methods should be able to take any argument signature.
+
+      * Accessible using the @property decorator if no additional arguments are
+        needed.
+
+    Examples
+    --------
+    An accessor that only accepts integers could
+    have a class defined like this:
+
+    >>> @pd.api.extensions.register_dataframe_accessor("int_accessor")
+    ... class IntAccessor:
+    ...     def __init__(self, pandas_obj):
+    ...         if not all(
+    ...             pandas_obj[col].dtype == "int64" for col in pandas_obj.columns
+    ...         ):
+    ...             raise AttributeError("All columns must contain integer values only")
+    ...         self._obj = pandas_obj
+    ...
+    ...     def sum(self):
+    ...         return self._obj.sum()
+    >>> df = pd.DataFrame([[1, 2], ["x", "y"]])
+    >>> df.int_accessor
+    Traceback (most recent call last):
+    ...
+    AttributeError: All columns must contain integer values only.
+    >>> df = pd.DataFrame([[1, 2], [3, 4]])
+    >>> df.int_accessor.sum()
+    0    4
+    1    6
+    dtype: int64
+    """
+    from pandas import DataFrame
+
+    return _register_accessor(name, DataFrame)
+
+
+_register_series_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_series_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not pandas_obj.dtype == 'int64':
+...             raise AttributeError("The series must contain integer data only")
+...         self._obj = pandas_obj
+...
+...     def sum(self):
+...         return self._obj.sum()
+...
+>>> df = pd.Series([1, 2, 'x'])
+>>> df.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: The series must contain integer data only.
+>>> df = pd.Series([1, 2, 3])
+>>> df.int_accessor.sum()
+6"""
+
+
+@set_module("pandas.api.extensions")
+def register_series_accessor(name: str) -> Callable[[TypeT], TypeT]:
+    """
+    Register a custom accessor on Series objects.
+
+    Parameters
+    ----------
+    name : str
+        Name under which the accessor should be registered. A warning is issued
+        if this name conflicts with a preexisting attribute.
+
+    Returns
+    -------
+    callable
+        A class decorator.
+
+    See Also
+    --------
+    register_dataframe_accessor : Register a custom accessor on DataFrame objects.
+    register_series_accessor : Register a custom accessor on Series objects.
+    register_index_accessor : Register a custom accessor on Index objects.
+
+    Notes
+    -----
+    This function allows you to register a custom-defined accessor class for Series.
+    The requirements for the accessor class are as follows:
+
+    * Must contain an init method that:
+
+      * accepts a single Series object
+
+      * raises an AttributeError if the Series object does not have correctly
+        matching inputs for the accessor
+
+    * Must contain a method for each access pattern.
+
+      * The methods should be able to take any argument signature.
+
+      * Accessible using the @property decorator if no additional arguments are
+        needed.
+
+    Examples
+    --------
+    An accessor that only accepts integers could
+    have a class defined like this:
+
+    >>> @pd.api.extensions.register_series_accessor("int_accessor")
+    ... class IntAccessor:
+    ...     def __init__(self, pandas_obj):
+    ...         if not pandas_obj.dtype == "int64":
+    ...             raise AttributeError("The series must contain integer data only")
+    ...         self._obj = pandas_obj
+    ...
+    ...     def sum(self):
+    ...         return self._obj.sum()
+    >>> df = pd.Series([1, 2, "x"])
+    >>> df.int_accessor
+    Traceback (most recent call last):
+    ...
+    AttributeError: The series must contain integer data only.
+    >>> df = pd.Series([1, 2, 3])
+    >>> df.int_accessor.sum()
+    6
+    """
+    from pandas import Series
+
+    return _register_accessor(name, Series)
+
+
+_register_index_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_index_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not all(isinstance(x, int) for x in pandas_obj):
+...             raise AttributeError("The index must only be an integer value")
+...         self._obj = pandas_obj
+...
+...     def even(self):
+...         return [x for x in self._obj if x % 2 == 0]
+>>> df = pd.DataFrame.from_dict(
+...     {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index"
+... )
+>>> df.index.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: The index must only be an integer value.
+>>> df = pd.DataFrame(
+...     {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8]
+... )
+>>> df.index.int_accessor.even()
+[2, 8]"""
+
+
+@set_module("pandas.api.extensions")
+def register_index_accessor(name: str) -> Callable[[TypeT], TypeT]:
+    """
+    Register a custom accessor on Index objects.
+
+    Parameters
+    ----------
+    name : str
+        Name under which the accessor should be registered. A warning is issued
+        if this name conflicts with a preexisting attribute.
+
+    Returns
+    -------
+    callable
+        A class decorator.
+
+    See Also
+    --------
+    register_dataframe_accessor : Register a custom accessor on DataFrame objects.
+    register_series_accessor : Register a custom accessor on Series objects.
+    register_index_accessor : Register a custom accessor on Index objects.
+
+    Notes
+    -----
+    This function allows you to register a custom-defined accessor class for Index.
+    The requirements for the accessor class are as follows:
+
+    * Must contain an init method that:
+
+      * accepts a single Index object
+
+      * raises an AttributeError if the Index object does not have correctly
+        matching inputs for the accessor
+
+    * Must contain a method for each access pattern.
+
+      * The methods should be able to take any argument signature.
+
+      * Accessible using the @property decorator if no additional arguments are
+        needed.
+
+    Examples
+    --------
+    An accessor that only accepts integers could
+    have a class defined like this:
+
+    >>> @pd.api.extensions.register_index_accessor("int_accessor")
+    ... class IntAccessor:
+    ...     def __init__(self, pandas_obj):
+    ...         if not all(isinstance(x, int) for x in pandas_obj):
+    ...             raise AttributeError("The index must only be an integer value")
+    ...         self._obj = pandas_obj
+    ...
+    ...     def even(self):
+    ...         return [x for x in self._obj if x % 2 == 0]
+    >>> df = pd.DataFrame.from_dict(
+    ...     {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index"
+    ... )
+    >>> df.index.int_accessor
+    Traceback (most recent call last):
+    ...
+    AttributeError: The index must only be an integer value.
+    >>> df = pd.DataFrame(
+    ...     {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8]
+    ... )
+    >>> df.index.int_accessor.even()
+    [2, 8]
+    """
+    from pandas import Index
+
+    return _register_accessor(name, Index)
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
new file mode 100644
index 0000000000000000000000000000000000000000..4683cf9fc744a910e5a5d6749e15942b37ae30af
--- /dev/null
+++ b/pandas/core/algorithms.py
@@ -0,0 +1,1712 @@
+"""
+Generic data algorithms. This module is experimental at the moment and not
+intended for public consumption
+"""
+
+from __future__ import annotations
+
+import decimal
+import operator
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    TypeVar,
+    cast,
+    overload,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs import (
+    algos,
+    hashtable as htable,
+    iNaT,
+    lib,
+)
+from pandas._libs.missing import NA
+from pandas._typing import (
+    AnyArrayLike,
+    ArrayLike,
+    ArrayLikeT,
+    AxisInt,
+    DtypeObj,
+    TakeIndexer,
+    npt,
+)
+from pandas.util._decorators import set_module
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.cast import (
+    construct_1d_object_array_from_listlike,
+    np_find_common_type,
+)
+from pandas.core.dtypes.common import (
+    ensure_float64,
+    ensure_object,
+    ensure_platform_int,
+    is_bool_dtype,
+    is_complex_dtype,
+    is_dict_like,
+    is_dtype_equal,
+    is_extension_array_dtype,
+    is_float,
+    is_float_dtype,
+    is_integer,
+    is_integer_dtype,
+    is_list_like,
+    is_object_dtype,
+    is_signed_integer_dtype,
+    needs_i8_conversion,
+)
+from pandas.core.dtypes.concat import concat_compat
+from pandas.core.dtypes.dtypes import (
+    BaseMaskedDtype,
+    CategoricalDtype,
+    ExtensionDtype,
+    NumpyEADtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCDatetimeArray,
+    ABCExtensionArray,
+    ABCIndex,
+    ABCMultiIndex,
+    ABCNumpyExtensionArray,
+    ABCSeries,
+    ABCTimedeltaArray,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    na_value_for_dtype,
+)
+
+from pandas.core.array_algos.take import take_nd
+from pandas.core.construction import (
+    array as pd_array,
+    ensure_wrapped_if_datetimelike,
+    extract_array,
+)
+from pandas.core.indexers import validate_indices
+
+if TYPE_CHECKING:
+    from pandas._typing import (
+        ListLike,
+        NumpySorter,
+        NumpyValueArrayLike,
+    )
+
+    from pandas import (
+        Categorical,
+        Index,
+        Series,
+    )
+    from pandas.core.arrays import (
+        BaseMaskedArray,
+        ExtensionArray,
+    )
+
+    T = TypeVar("T", bound=Index | Categorical | ExtensionArray)
+
+
+# --------------- #
+# dtype access    #
+# --------------- #
+def _ensure_data(values: ArrayLike) -> np.ndarray:
+    """
+    routine to ensure that our data is of the correct
+    input dtype for lower-level routines
+
+    This will coerce:
+    - ints -> int64
+    - uint -> uint64
+    - bool -> uint8
+    - datetimelike -> i8
+    - datetime64tz -> i8 (in local tz)
+    - categorical -> codes
+
+    Parameters
+    ----------
+    values : np.ndarray or ExtensionArray
+
+    Returns
+    -------
+    np.ndarray
+    """
+
+    if not isinstance(values, ABCMultiIndex):
+        # extract_array would raise
+        values = extract_array(values, extract_numpy=True)
+
+    if is_object_dtype(values.dtype):
+        return ensure_object(np.asarray(values))
+
+    elif isinstance(values.dtype, BaseMaskedDtype):
+        # i.e. BooleanArray, FloatingArray, IntegerArray
+        values = cast("BaseMaskedArray", values)
+        if not values._hasna:
+            # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
+            #  recurse to avoid re-implementing logic for eg bool->uint8
+            return _ensure_data(values._data)
+        return np.asarray(values)
+
+    elif isinstance(values.dtype, CategoricalDtype):
+        # NB: cases that go through here should NOT be using _reconstruct_data
+        #  on the back-end.
+        values = cast("Categorical", values)
+        return values.codes
+
+    elif is_bool_dtype(values.dtype):
+        if isinstance(values, np.ndarray):
+            # i.e. actually dtype == np.dtype("bool")
+            return np.asarray(values).view("uint8")
+        else:
+            # e.g. Sparse[bool, False]  # TODO: no test cases get here
+            return np.asarray(values).astype("uint8", copy=False)
+
+    elif is_integer_dtype(values.dtype):
+        return np.asarray(values)
+
+    elif is_float_dtype(values.dtype):
+        # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
+        # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
+        # has no attribute "itemsize"
+        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
+            # we dont (yet) have float128 hashtable support
+            return ensure_float64(values)
+        return np.asarray(values)
+
+    elif is_complex_dtype(values.dtype):
+        return cast(np.ndarray, values)
+
+    # datetimelike
+    elif needs_i8_conversion(values.dtype):
+        npvalues = values.view("i8")
+        npvalues = cast(np.ndarray, npvalues)
+        return npvalues
+
+    # we have failed, return object
+    values = np.asarray(values, dtype=object)
+    return ensure_object(values)
+
+
+def _reconstruct_data(
+    values: ArrayLikeT, dtype: DtypeObj, original: AnyArrayLike
+) -> ArrayLikeT:
+    """
+    reverse of _ensure_data
+
+    Parameters
+    ----------
+    values : np.ndarray or ExtensionArray
+    dtype : np.dtype or ExtensionDtype
+    original : AnyArrayLike
+
+    Returns
+    -------
+    ExtensionArray or np.ndarray
+    """
+    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
+        # Catch DatetimeArray/TimedeltaArray
+        return values
+
+    if not isinstance(dtype, np.dtype):
+        # i.e. ExtensionDtype; note we have ruled out above the possibility
+        #  that values.dtype == dtype
+        cls = dtype.construct_array_type()
+
+        # error: Incompatible return value type
+        # (got "ExtensionArray",
+        # expected "ndarray[tuple[Any, ...], dtype[Any]]")
+        return cls._from_sequence(values, dtype=dtype)  # type: ignore[return-value]
+
+    # error: Incompatible return value type
+    # (got "ndarray[tuple[Any, ...], dtype[Any]]",
+    # expected "ExtensionArray")
+    return values.astype(dtype, copy=False)  # type: ignore[return-value]
+
+
+def _ensure_arraylike(values, func_name: str) -> ArrayLike:
+    """
+    ensure that we are arraylike if not already
+    """
+    if not isinstance(
+        values,
+        (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray, ABCNumpyExtensionArray),
+    ):
+        # GH#52986
+        if func_name != "isin-targets":
+            # Make an exception for the comps argument in isin.
+            raise TypeError(
+                f"{func_name} requires a Series, Index, "
+                f"ExtensionArray, np.ndarray or NumpyExtensionArray "
+                f"got {type(values).__name__}."
+            )
+
+        inferred = lib.infer_dtype(values, skipna=False)
+        if inferred in ["mixed", "string", "mixed-integer"]:
+            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
+            if isinstance(values, tuple):
+                values = list(values)
+            values = construct_1d_object_array_from_listlike(values)
+        else:
+            values = np.asarray(values)
+    return values
+
+
+_hashtables = {
+    "complex128": htable.Complex128HashTable,
+    "complex64": htable.Complex64HashTable,
+    "float64": htable.Float64HashTable,
+    "float32": htable.Float32HashTable,
+    "uint64": htable.UInt64HashTable,
+    "uint32": htable.UInt32HashTable,
+    "uint16": htable.UInt16HashTable,
+    "uint8": htable.UInt8HashTable,
+    "int64": htable.Int64HashTable,
+    "int32": htable.Int32HashTable,
+    "int16": htable.Int16HashTable,
+    "int8": htable.Int8HashTable,
+    "string": htable.StringHashTable,
+    "object": htable.PyObjectHashTable,
+}
+
+
+def _get_hashtable_algo(
+    values: np.ndarray,
+) -> tuple[type[htable.HashTable], np.ndarray]:
+    """
+    Parameters
+    ----------
+    values : np.ndarray
+
+    Returns
+    -------
+    htable : HashTable subclass
+    values : ndarray
+    """
+    values = _ensure_data(values)
+
+    ndtype = _check_object_for_strings(values)
+    hashtable = _hashtables[ndtype]
+    return hashtable, values
+
+
+def _check_object_for_strings(values: np.ndarray) -> str:
+    """
+    Check if we can use string hashtable instead of object hashtable.
+
+    Parameters
+    ----------
+    values : ndarray
+
+    Returns
+    -------
+    str
+    """
+    ndtype = values.dtype.name
+    if ndtype == "object":
+        # it's cheaper to use a String Hash Table than Object; we infer
+        # including nulls because that is the only difference between
+        # StringHashTable and ObjectHashtable
+        if lib.is_string_array(values, skipna=False):
+            ndtype = "string"
+    return ndtype
+
+
+# --------------- #
+# top-level algos #
+# --------------- #
+
+
+@overload
+def unique(values: T) -> T: ...
+@overload
+def unique(values: np.ndarray | Series) -> np.ndarray: ...
+
+
+@set_module("pandas")
+def unique(values):
+    """
+    Return unique values based on a hash table.
+
+    Uniques are returned in order of appearance. This does NOT sort.
+
+    Significantly faster than numpy.unique for long enough sequences.
+    Includes NA values.
+
+    Parameters
+    ----------
+    values : 1d array-like
+        The input array-like object containing values from which to extract
+        unique values.
+
+    Returns
+    -------
+    numpy.ndarray, ExtensionArray or NumpyExtensionArray
+
+        The return can be:
+
+        * Index : when the input is an Index
+        * Categorical : when the input is a Categorical dtype
+        * ndarray : when the input is a Series/ndarray
+
+        Return numpy.ndarray, ExtensionArray or NumpyExtensionArray.
+
+    See Also
+    --------
+    Index.unique : Return unique values from an Index.
+    Series.unique : Return unique values of Series object.
+
+    Examples
+    --------
+    >>> pd.unique(pd.Series([2, 1, 3, 3]))
+    array([2, 1, 3])
+
+    >>> pd.unique(pd.Series([2] + [1] * 5))
+    array([2, 1])
+
+    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
+    array(['2016-01-01T00:00:00.000000'], dtype='datetime64[us]')
+
+    >>> pd.unique(
+    ...     pd.Series(
+    ...         [
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...         ],
+    ...         dtype="M8[ns, US/Eastern]",
+    ...     )
+    ... )
+    <DatetimeArray>
+    ['2016-01-01 00:00:00-05:00']
+    Length: 1, dtype: datetime64[ns, US/Eastern]
+
+    >>> pd.unique(
+    ...     pd.Index(
+    ...         [
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...         ],
+    ...         dtype="M8[ns, US/Eastern]",
+    ...     )
+    ... )
+    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
+            dtype='datetime64[ns, US/Eastern]',
+            freq=None)
+
+    >>> pd.unique(np.array(list("baabc"), dtype="O"))
+    array(['b', 'a', 'c'], dtype=object)
+
+    An unordered Categorical will return categories in the
+    order of appearance.
+
+    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
+    ['b', 'a', 'c']
+    Categories (3, str): ['a', 'b', 'c']
+
+    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
+    ['b', 'a', 'c']
+    Categories (3, str): ['a', 'b', 'c']
+
+    An ordered Categorical preserves the category ordering.
+
+    >>> pd.unique(
+    ...     pd.Series(
+    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
+    ...     )
+    ... )
+    ['b', 'a', 'c']
+    Categories (3, str): ['a' < 'b' < 'c']
+
+    An array of tuples
+
+    >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
+    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
+
+    A NumpyExtensionArray of complex
+
+    >>> pd.unique(pd.array([1 + 1j, 2, 3]))
+    <NumpyExtensionArray>
+    [(1+1j), (2+0j), (3+0j)]
+    Length: 3, dtype: complex128
+    """
+    return unique_with_mask(values)
+
+
+def nunique_ints(values: ArrayLike) -> int:
+    """
+    Return the number of unique values for integer array-likes.
+
+    Significantly faster than pandas.unique for long enough sequences.
+    No checks are done to ensure input is integral.
+
+    Parameters
+    ----------
+    values : 1d array-like
+
+    Returns
+    -------
+    int : The number of unique values in ``values``
+    """
+    if len(values) == 0:
+        return 0
+    values = _ensure_data(values)
+    # bincount requires intp
+    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
+    return result
+
+
+def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
+    """See algorithms.unique for docs. Takes a mask for masked arrays."""
+    values = _ensure_arraylike(values, func_name="unique")
+
+    if isinstance(values.dtype, ExtensionDtype):
+        # Dispatch to extension dtype's unique.
+        return values.unique()
+
+    if isinstance(values, ABCIndex):
+        # Dispatch to Index's unique.
+        return values.unique()
+
+    original = values
+    hashtable, values = _get_hashtable_algo(values)
+
+    table = hashtable(len(values))
+    if mask is None:
+        uniques = table.unique(values)
+        uniques = _reconstruct_data(uniques, original.dtype, original)
+        return uniques
+
+    else:
+        uniques, mask = table.unique(values, mask=mask)
+        uniques = _reconstruct_data(uniques, original.dtype, original)
+        assert mask is not None  # for mypy
+        return uniques, mask.astype("bool")
+
+
+unique1d = unique
+
+
+_MINIMUM_COMP_ARR_LEN = 1_000_000
+
+
+def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
+    """
+    Compute the isin boolean array.
+
+    Parameters
+    ----------
+    comps : list-like
+    values : list-like
+
+    Returns
+    -------
+    ndarray[bool]
+        Same length as `comps`.
+    """
+    if not is_list_like(comps):
+        raise TypeError(
+            "only list-like objects are allowed to be passed "
+            f"to isin(), you passed a `{type(comps).__name__}`"
+        )
+    if not is_list_like(values):
+        raise TypeError(
+            "only list-like objects are allowed to be passed "
+            f"to isin(), you passed a `{type(values).__name__}`"
+        )
+
+    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
+        orig_values = list(values)
+        values = _ensure_arraylike(orig_values, func_name="isin-targets")
+
+        if (
+            len(values) > 0
+            and values.dtype.kind in "iufcb"
+            and not is_signed_integer_dtype(comps)
+            and not is_dtype_equal(values, comps)
+        ):
+            # GH#46485 Use object to avoid upcast to float64 later
+            # TODO: Share with _find_common_type_compat
+            values = construct_1d_object_array_from_listlike(orig_values)
+
+    elif isinstance(values, ABCMultiIndex):
+        # Avoid raising in extract_array
+        values = np.array(values)
+    else:
+        values = extract_array(values, extract_numpy=True, extract_range=True)
+
+    comps_array = _ensure_arraylike(comps, func_name="isin")
+    comps_array = extract_array(comps_array, extract_numpy=True)
+    if not isinstance(comps_array, np.ndarray):
+        # i.e. Extension Array
+        return comps_array.isin(values)
+
+    elif needs_i8_conversion(comps_array.dtype):
+        # Dispatch to DatetimeLikeArrayMixin.isin
+        return pd_array(comps_array).isin(values)
+    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
+        # e.g. comps_array are integers and values are datetime64s
+        return np.zeros(comps_array.shape, dtype=bool)
+        # TODO: not quite right ... Sparse/Categorical
+    elif needs_i8_conversion(values.dtype):
+        return isin(comps_array, values.astype(object))
+
+    elif isinstance(values.dtype, ExtensionDtype):
+        return isin(np.asarray(comps_array), np.asarray(values))
+
+    # GH16012
+    # Ensure np.isin doesn't get object types or it *may* throw an exception
+    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
+    # isin is faster for small sizes
+
+    # GH60678
+    # Ensure values don't contain <NA>, otherwise it throws exception with np.in1d
+
+    if (
+        len(comps_array) > _MINIMUM_COMP_ARR_LEN
+        and len(values) <= 26
+        and comps_array.dtype != object
+        and not any(v is NA for v in values)
+    ):
+        # If the values include nan we need to check for nan explicitly
+        # since np.nan it not equal to np.nan
+        if isna(values).any():
+
+            def f(c, v):
+                return np.logical_or(np.isin(c, v).ravel(), np.isnan(c))
+
+        else:
+            f = lambda a, b: np.isin(a, b).ravel()
+
+    else:
+        common = np_find_common_type(values.dtype, comps_array.dtype)
+        values = values.astype(common, copy=False)
+        comps_array = comps_array.astype(common, copy=False)
+        f = htable.ismember
+
+    return f(comps_array, values)
+
+
+def factorize_array(
+    values: np.ndarray,
+    use_na_sentinel: bool = True,
+    size_hint: int | None = None,
+    na_value: object = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> tuple[npt.NDArray[np.intp], np.ndarray]:
+    """
+    Factorize a numpy array to codes and uniques.
+
+    This doesn't do any coercion of types or unboxing before factorization.
+
+    Parameters
+    ----------
+    values : ndarray
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NaN values. If False,
+        NaN values will be encoded as non-negative integers and will not drop the
+        NaN from the uniques of the values.
+    size_hint : int, optional
+        Passed through to the hashtable's 'get_labels' method
+    na_value : object, optional
+        A value in `values` to consider missing. Note: only use this
+        parameter when you know that you don't have any values pandas would
+        consider missing in the array (NaN for float data, iNaT for
+        datetimes, etc.).
+    mask : ndarray[bool], optional
+        If not None, the mask is used as indicator for missing values
+        (True = missing, False = valid) instead of `na_value` or
+        condition "val != val".
+
+    Returns
+    -------
+    codes : ndarray[np.intp]
+    uniques : ndarray
+    """
+    original = values
+    if values.dtype.kind in "mM":
+        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
+        #  need to do the same to na_value. We are assuming here that the passed
+        #  na_value is an appropriately-typed NaT.
+        # e.g. test_where_datetimelike_categorical
+        na_value = iNaT
+
+    hash_klass, values = _get_hashtable_algo(values)
+
+    table = hash_klass(size_hint or len(values))
+    uniques, codes = table.factorize(
+        values,
+        na_sentinel=-1,
+        na_value=na_value,
+        mask=mask,
+        ignore_na=use_na_sentinel,
+    )
+
+    # re-cast e.g. i8->dt64/td64, uint8->bool
+    uniques = _reconstruct_data(uniques, original.dtype, original)
+
+    codes = ensure_platform_int(codes)
+    return codes, uniques
+
+
+@set_module("pandas")
+def factorize(
+    values,
+    sort: bool = False,
+    use_na_sentinel: bool = True,
+    size_hint: int | None = None,
+) -> tuple[np.ndarray, np.ndarray | Index]:
+    """
+    Encode the object as an enumerated type or categorical variable.
+
+    This method is useful for obtaining a numeric representation of an
+    array when all that matters is identifying distinct values. `factorize`
+    is available as both a top-level function :func:`pandas.factorize`,
+    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
+
+    Parameters
+    ----------
+    values : sequence
+        A 1-D sequence. Sequences that aren't pandas objects are
+        coerced to ndarrays before factorization.
+    sort : bool, default False
+        Sort `uniques` and shuffle `codes` to maintain the
+        relationship.
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NaN values. If False,
+        NaN values will be encoded as non-negative integers and will not drop the
+        NaN from the uniques of the values.
+    size_hint : int, optional
+        Hint to the hashtable sizer.
+
+    Returns
+    -------
+    codes : ndarray
+        An integer ndarray that's an indexer into `uniques`.
+        ``uniques.take(codes)`` will have the same values as `values`.
+    uniques : ndarray, Index, or Categorical
+        The unique valid values. When `values` is Categorical, `uniques`
+        is a Categorical. When `values` is some other pandas object, an
+        `Index` is returned. Otherwise, a 1-D ndarray is returned.
+
+        .. note::
+
+           Even if there's a missing value in `values`, `uniques` will
+           *not* contain an entry for it.
+
+    See Also
+    --------
+    cut : Discretize continuous-valued array.
+    unique : Find the unique value in an array.
+
+    Notes
+    -----
+    Reference :ref:`the user guide <reshaping.factorize>` for more examples.
+
+    Examples
+    --------
+    These examples all show factorize as a top-level method like
+    ``pd.factorize(values)``. The results are identical for methods like
+    :meth:`Series.factorize`.
+
+    >>> codes, uniques = pd.factorize(np.array(["b", "b", "a", "c", "b"], dtype="O"))
+    >>> codes
+    array([0, 0, 1, 2, 0])
+    >>> uniques
+    array(['b', 'a', 'c'], dtype=object)
+
+    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
+    shuffled so that the relationship is the maintained.
+
+    >>> codes, uniques = pd.factorize(
+    ...     np.array(["b", "b", "a", "c", "b"], dtype="O"), sort=True
+    ... )
+    >>> codes
+    array([1, 1, 0, 2, 1])
+    >>> uniques
+    array(['a', 'b', 'c'], dtype=object)
+
+    When ``use_na_sentinel=True`` (the default), missing values are indicated in
+    the `codes` with the sentinel value ``-1`` and missing values are not
+    included in `uniques`.
+
+    >>> codes, uniques = pd.factorize(np.array(["b", None, "a", "c", "b"], dtype="O"))
+    >>> codes
+    array([ 0, -1,  1,  2,  0])
+    >>> uniques
+    array(['b', 'a', 'c'], dtype=object)
+
+    Thus far, we've only factorized lists (which are internally coerced to
+    NumPy arrays). When factorizing pandas objects, the type of `uniques`
+    will differ. For Categoricals, a `Categorical` is returned.
+
+    >>> cat = pd.Categorical(["a", "a", "c"], categories=["a", "b", "c"])
+    >>> codes, uniques = pd.factorize(cat)
+    >>> codes
+    array([0, 0, 1])
+    >>> uniques
+    ['a', 'c']
+    Categories (3, str): ['a', 'b', 'c']
+
+    Notice that ``'b'`` is in ``uniques.categories``, despite not being
+    present in ``cat.values``.
+
+    For all other pandas objects, an Index of the appropriate type is
+    returned.
+
+    >>> cat = pd.Series(["a", "a", "c"])
+    >>> codes, uniques = pd.factorize(cat)
+    >>> codes
+    array([0, 0, 1])
+    >>> uniques
+    Index(['a', 'c'], dtype='str')
+
+    If NaN is in the values, and we want to include NaN in the uniques of the
+    values, it can be achieved by setting ``use_na_sentinel=False``.
+
+    >>> values = np.array([1, 2, 1, np.nan])
+    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
+    >>> codes
+    array([ 0,  1,  0, -1])
+    >>> uniques
+    array([1., 2.])
+
+    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
+    >>> codes
+    array([0, 1, 0, 2])
+    >>> uniques
+    array([ 1.,  2., nan])
+    """
+    # Implementation notes: This method is responsible for 3 things
+    # 1.) coercing data to array-like (ndarray, Index, extension array)
+    # 2.) factorizing codes and uniques
+    # 3.) Maybe boxing the uniques in an Index
+    #
+    # Step 2 is dispatched to extension types (like Categorical). They are
+    # responsible only for factorization. All data coercion, sorting and boxing
+    # should happen here.
+    if isinstance(values, (ABCIndex, ABCSeries)):
+        return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
+
+    values = _ensure_arraylike(values, func_name="factorize")
+    original = values
+
+    if (
+        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
+        and values.freq is not None
+    ):
+        # The presence of 'freq' means we can fast-path sorting and know there
+        #  aren't NAs
+        codes, uniques = values.factorize(sort=sort)
+        return codes, uniques
+
+    elif not isinstance(values, np.ndarray):
+        # i.e. ExtensionArray
+        codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
+
+    else:
+        values = np.asarray(values)  # convert DTA/TDA/MultiIndex
+
+        if not use_na_sentinel and values.dtype == object:
+            # factorize can now handle differentiating various types of null values.
+            # These can only occur when the array has object dtype.
+            # However, for backwards compatibility we only use the null for the
+            # provided dtype. This may be revisited in the future, see GH#48476.
+            null_mask = isna(values)
+            if null_mask.any():
+                na_value = na_value_for_dtype(values.dtype, compat=False)
+                # Don't modify (potentially user-provided) array
+                values = np.where(null_mask, na_value, values)
+
+        codes, uniques = factorize_array(
+            values,
+            use_na_sentinel=use_na_sentinel,
+            size_hint=size_hint,
+        )
+
+    if sort and len(uniques) > 0:
+        uniques, codes = safe_sort(
+            uniques,
+            codes,
+            use_na_sentinel=use_na_sentinel,
+            assume_unique=True,
+            verify=False,
+        )
+
+    uniques = _reconstruct_data(uniques, original.dtype, original)
+
+    return codes, uniques
+
+
+def value_counts_internal(
+    values,
+    sort: bool = True,
+    ascending: bool = False,
+    normalize: bool = False,
+    bins=None,
+    dropna: bool = True,
+) -> Series:
+    from pandas import (
+        DatetimeIndex,
+        Index,
+        Series,
+        TimedeltaIndex,
+    )
+
+    index_name = getattr(values, "name", None)
+    name = "proportion" if normalize else "count"
+
+    if bins is not None:
+        from pandas.core.reshape.tile import cut
+
+        if isinstance(values, Series):
+            values = values._values
+
+        try:
+            ii = cut(values, bins, include_lowest=True)
+        except TypeError as err:
+            raise TypeError("bins argument only works with numeric data.") from err
+
+        # count, remove nulls (from the index), and but the bins
+        result = ii.value_counts(dropna=dropna)
+        result.name = name
+        result = result[result.index.notna()]
+        result.index = result.index.astype("interval")
+        result = result.sort_index()
+
+        # if we are dropna and we have NO values
+        if dropna and (result._values == 0).all():
+            result = result.iloc[0:0]
+
+        # normalizing is by len of all (regardless of dropna)
+        normalize_denominator = len(ii)
+
+    else:
+        normalize_denominator = None
+        if is_extension_array_dtype(values):
+            # handle Categorical and sparse,
+            result = Series(values, copy=False)._values.value_counts(dropna=dropna)
+            result.name = name
+            result.index.name = index_name
+
+        elif isinstance(values, ABCMultiIndex):
+            # GH49558
+            levels = list(range(values.nlevels))
+            result = (
+                Series(index=values, name=name)
+                .groupby(level=levels, dropna=dropna)
+                .size()
+            )
+            result.index.names = values.names
+
+        else:
+            values = _ensure_arraylike(values, func_name="value_counts")
+            keys, counts, _ = value_counts_arraylike(values, dropna)
+            if keys.dtype == np.float16:
+                keys = keys.astype(np.float32)
+
+            # Starting in 3.0, we no longer perform dtype inference on the
+            #  Index object we construct here, xref GH#56161
+            idx = Index(keys, dtype=keys.dtype, name=index_name, copy=False)
+
+            if (
+                not sort
+                and isinstance(values, (DatetimeIndex, TimedeltaIndex))
+                and idx.equals(values)
+                and values.inferred_freq is not None
+            ):
+                # Preserve freq of original index
+                idx.freq = values.inferred_freq  # type: ignore[attr-defined]
+
+            result = Series(counts, index=idx, name=name, copy=False)
+
+    if sort:
+        result = result.sort_values(ascending=ascending, kind="stable")
+
+    if normalize:
+        if normalize_denominator is not None:
+            result = result / normalize_denominator
+        else:
+            result = result / result.sum()
+
+    return result
+
+
+# Called once from SparseArray, otherwise could be private
+def value_counts_arraylike(
+    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
+) -> tuple[ArrayLike, npt.NDArray[np.int64], int]:
+    """
+    Parameters
+    ----------
+    values : np.ndarray
+    dropna : bool
+    mask : np.ndarray[bool] or None, default None
+
+    Returns
+    -------
+    uniques : np.ndarray
+    counts : np.ndarray[np.int64]
+    """
+    original = values
+    values = _ensure_data(values)
+
+    keys, counts, na_counter = htable.value_count(values, dropna, mask=mask)
+
+    if needs_i8_conversion(original.dtype):
+        # datetime, timedelta, or period
+
+        if dropna:
+            mask = keys != iNaT
+            keys, counts = keys[mask], counts[mask]
+
+    res_keys = _reconstruct_data(keys, original.dtype, original)
+    return res_keys, counts, na_counter
+
+
+def duplicated(
+    values: ArrayLike,
+    keep: Literal["first", "last", False] = "first",
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> npt.NDArray[np.bool_]:
+    """
+    Return boolean ndarray denoting duplicate values.
+
+    Parameters
+    ----------
+    values : np.ndarray or ExtensionArray
+        Array over which to check for duplicate values.
+    keep : {'first', 'last', False}, default 'first'
+        - ``first`` : Mark duplicates as ``True`` except for the first
+          occurrence.
+        - ``last`` : Mark duplicates as ``True`` except for the last
+          occurrence.
+        - False : Mark all duplicates as ``True``.
+    mask : ndarray[bool], optional
+        array indicating which elements to exclude from checking
+
+    Returns
+    -------
+    duplicated : ndarray[bool]
+    """
+    values = _ensure_data(values)
+    return htable.duplicated(values, keep=keep, mask=mask)
+
+
+def mode(
+    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]] | ExtensionArray:
+    """
+    Returns the mode(s) of an array.
+
+    Parameters
+    ----------
+    values : array-like
+        Array over which to check for duplicate values.
+    dropna : bool, default True
+        Don't consider counts of NaN/NaT.
+
+    Returns
+    -------
+    Union[Tuple[np.ndarray, npt.NDArray[np.bool_]], ExtensionArray]
+    """
+    values = _ensure_arraylike(values, func_name="mode")
+    original = values
+
+    if needs_i8_conversion(values.dtype):
+        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
+        values = ensure_wrapped_if_datetimelike(values)
+        values = cast("ExtensionArray", values)
+        return values._mode(dropna=dropna)
+
+    values = _ensure_data(values)
+
+    npresult, res_mask = htable.mode(values, dropna=dropna, mask=mask)
+    if res_mask is None:
+        res_mask = np.zeros(npresult.shape, dtype=np.bool_)
+    else:
+        return npresult, res_mask
+
+    try:
+        npresult = safe_sort(npresult)
+    except TypeError as err:
+        warnings.warn(
+            f"Unable to sort modes: {err}",
+            stacklevel=find_stack_level(),
+        )
+
+    result = _reconstruct_data(npresult, original.dtype, original)
+    return result, res_mask
+
+
+def rank(
+    values: ArrayLike,
+    axis: AxisInt = 0,
+    method: str = "average",
+    na_option: str = "keep",
+    ascending: bool = True,
+    pct: bool = False,
+) -> npt.NDArray[np.float64]:
+    """
+    Rank the values along a given axis.
+
+    Parameters
+    ----------
+    values : np.ndarray or ExtensionArray
+        Array whose values will be ranked. The number of dimensions in this
+        array must not exceed 2.
+    axis : int, default 0
+        Axis over which to perform rankings.
+    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+        The method by which tiebreaks are broken during the ranking.
+    na_option : {'keep', 'top'}, default 'keep'
+        The method by which NaNs are placed in the ranking.
+        - ``keep``: rank each NaN value with a NaN ranking
+        - ``top``: replace each NaN with either +/- inf so that they
+                   there are ranked at the top
+    ascending : bool, default True
+        Whether or not the elements should be ranked in ascending order.
+    pct : bool, default False
+        Whether or not to the display the returned rankings in integer form
+        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
+    """
+    is_datetimelike = needs_i8_conversion(values.dtype)
+    values = _ensure_data(values)
+
+    if values.ndim == 1:
+        ranks = algos.rank_1d(
+            values,
+            is_datetimelike=is_datetimelike,
+            ties_method=method,
+            ascending=ascending,
+            na_option=na_option,
+            pct=pct,
+        )
+    elif values.ndim == 2:
+        ranks = algos.rank_2d(
+            values,
+            axis=axis,
+            is_datetimelike=is_datetimelike,
+            ties_method=method,
+            ascending=ascending,
+            na_option=na_option,
+            pct=pct,
+        )
+    else:
+        raise TypeError("Array with ndim > 2 are not supported.")
+
+    return ranks
+
+
+# ---- #
+# take #
+# ---- #
+
+
+@set_module("pandas.api.extensions")
+def take(
+    arr,
+    indices: TakeIndexer,
+    axis: AxisInt = 0,
+    allow_fill: bool = False,
+    fill_value=None,
+):
+    """
+    Take elements from an array.
+
+    Parameters
+    ----------
+    arr : numpy.ndarray, ExtensionArray, Index, or Series
+        Input array.
+    indices : sequence of int or one-dimensional np.ndarray of int
+        Indices to be taken.
+    axis : int, default 0
+        The axis over which to select values.
+    allow_fill : bool, default False
+        How to handle negative values in `indices`.
+
+        * False: negative values in `indices` indicate positional indices
+          from the right (the default). This is similar to :func:`numpy.take`.
+
+        * True: negative values in `indices` indicate
+          missing values. These values are set to `fill_value`. Any other
+          negative values raise a ``ValueError``.
+
+    fill_value : any, optional
+        Fill value to use for NA-indices when `allow_fill` is True.
+        This may be ``None``, in which case the default NA value for
+        the type (``self.dtype.na_value``) is used.
+
+        For multi-dimensional `arr`, each *element* is filled with
+        `fill_value`.
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+        Same type as the input.
+
+    Raises
+    ------
+    IndexError
+        When `indices` is out of bounds for the array.
+    ValueError
+        When the indexer contains negative values other than ``-1``
+        and `allow_fill` is True.
+
+    Notes
+    -----
+    When `allow_fill` is False, `indices` may be whatever dimensionality
+    is accepted by NumPy for `arr`.
+
+    When `allow_fill` is True, `indices` should be 1-D.
+
+    See Also
+    --------
+    numpy.take : Take elements from an array along an axis.
+
+    Examples
+    --------
+    >>> import pandas as pd
+
+    With the default ``allow_fill=False``, negative numbers indicate
+    positional indices from the right.
+
+    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
+    array([10, 10, 30])
+
+    Setting ``allow_fill=True`` will place `fill_value` in those positions.
+
+    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
+    array([10., 10., nan])
+
+    >>> pd.api.extensions.take(
+    ...     np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, fill_value=-10
+    ... )
+    array([ 10,  10, -10])
+    """
+    if not isinstance(
+        arr,
+        (np.ndarray, ABCExtensionArray, ABCIndex, ABCSeries, ABCNumpyExtensionArray),
+    ):
+        # GH#52981
+        raise TypeError(
+            "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, "
+            f"Index, Series, or NumpyExtensionArray got {type(arr).__name__}."
+        )
+
+    indices = ensure_platform_int(indices)
+
+    if allow_fill:
+        # Pandas style, -1 means NA
+        validate_indices(indices, arr.shape[axis])
+        # error: Argument 1 to "take_nd" has incompatible type
+        # "ndarray[Any, Any] | ExtensionArray | Index | Series"; expected
+        # "ndarray[Any, Any]"
+        result = take_nd(
+            arr,  # type: ignore[arg-type]
+            indices,
+            axis=axis,
+            allow_fill=True,
+            fill_value=fill_value,
+        )
+    else:
+        # NumPy style
+        # error: Unexpected keyword argument "axis" for "take" of "ExtensionArray"
+        result = arr.take(indices, axis=axis)  # type: ignore[call-arg,assignment]
+    return result
+
+
+# ------------ #
+# searchsorted #
+# ------------ #
+
+
+def searchsorted(
+    arr: ArrayLike,
+    value: NumpyValueArrayLike | ExtensionArray,
+    side: Literal["left", "right"] = "left",
+    sorter: NumpySorter | None = None,
+) -> npt.NDArray[np.intp] | np.intp:
+    """
+    Find indices where elements should be inserted to maintain order.
+
+    Find the indices into a sorted array `arr` (a) such that, if the
+    corresponding elements in `value` were inserted before the indices,
+    the order of `arr` would be preserved.
+
+    Assuming that `arr` is sorted:
+
+    ======  ================================
+    `side`  returned index `i` satisfies
+    ======  ================================
+    left    ``arr[i-1] < value <= self[i]``
+    right   ``arr[i-1] <= value < self[i]``
+    ======  ================================
+
+    Parameters
+    ----------
+    arr: np.ndarray, ExtensionArray, Series
+        Input array. If `sorter` is None, then it must be sorted in
+        ascending order, otherwise `sorter` must be an array of indices
+        that sort it.
+    value : array-like or scalar
+        Values to insert into `arr`.
+    side : {'left', 'right'}, optional
+        If 'left', the index of the first suitable location found is given.
+        If 'right', return the last such index.  If there is no suitable
+        index, return either 0 or N (where N is the length of `self`).
+    sorter : 1-D array-like, optional
+        Optional array of integer indices that sort array a into ascending
+        order. They are typically the result of argsort.
+
+    Returns
+    -------
+    array of ints or int
+        If value is array-like, array of insertion points.
+        If value is scalar, a single integer.
+
+    See Also
+    --------
+    numpy.searchsorted : Similar method from NumPy.
+    """
+    if sorter is not None:
+        sorter = ensure_platform_int(sorter)
+
+    if (
+        isinstance(arr, np.ndarray)
+        and arr.dtype.kind in "iu"
+        and (is_integer(value) or is_integer_dtype(value))
+    ):
+        # if `arr` and `value` have different dtypes, `arr` would be
+        # recast by numpy, causing a slow search.
+        # Before searching below, we therefore try to give `value` the
+        # same dtype as `arr`, while guarding against integer overflows.
+        iinfo = np.iinfo(arr.dtype.type)
+        value_arr = np.array([value]) if is_integer(value) else np.array(value)
+        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
+            # value within bounds, so no overflow, so can convert value dtype
+            # to dtype of arr
+            dtype = arr.dtype
+        else:
+            dtype = value_arr.dtype
+
+        if is_integer(value):
+            # We know that value is int
+            value = cast(int, dtype.type(value))
+        else:
+            value = pd_array(cast(ArrayLike, value), dtype=dtype)
+    else:
+        # E.g. if `arr` is an array with dtype='datetime64[ns]'
+        # and `value` is a pd.Timestamp, we may need to convert value
+        arr = ensure_wrapped_if_datetimelike(arr)
+
+    # Argument 1 to "searchsorted" of "ndarray" has incompatible type
+    # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
+    return arr.searchsorted(value, side=side, sorter=sorter)  # type: ignore[arg-type]
+
+
+# ---- #
+# diff #
+# ---- #
+
+_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}
+
+
+def diff(arr, n: int | float | np.integer | np.floating, axis: AxisInt = 0):
+    """
+    difference of n between self,
+    analogous to s-s.shift(n)
+
+    Parameters
+    ----------
+    arr : ndarray or ExtensionArray
+    n : int
+        number of periods
+    axis : {0, 1}
+        axis to shift on
+    stacklevel : int, default 3
+        The stacklevel for the lost dtype warning.
+
+    Returns
+    -------
+    shifted
+    """
+
+    # added a check on the integer value of period
+    # see https://github.com/pandas-dev/pandas/issues/56607
+    if not lib.is_integer(n):
+        if not (is_float(n) and n.is_integer()):
+            raise ValueError("periods must be an integer")
+        n = int(n)
+    na = np.nan
+    dtype = arr.dtype
+
+    is_bool = is_bool_dtype(dtype)
+    if is_bool:
+        op = operator.xor
+    else:
+        op = operator.sub
+
+    if isinstance(dtype, NumpyEADtype):
+        # NumpyExtensionArray cannot necessarily hold shifted versions of itself.
+        arr = arr.to_numpy()
+        dtype = arr.dtype
+
+    if not isinstance(arr, np.ndarray):
+        # i.e ExtensionArray
+        if hasattr(arr, f"__{op.__name__}__"):
+            if axis != 0:
+                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
+            return op(arr, arr.shift(n))
+        else:
+            raise TypeError(
+                f"{type(arr).__name__} has no 'diff' method. "
+                "Convert to a suitable dtype prior to calling 'diff'."
+            )
+
+    is_timedelta = False
+    if arr.dtype.kind in "mM":
+        dtype = np.int64
+        arr = arr.view("i8")
+        na = iNaT
+        is_timedelta = True
+
+    elif is_bool:
+        # We have to cast in order to be able to hold np.nan
+        dtype = np.object_
+
+    elif dtype.kind in "iu":
+        # We have to cast in order to be able to hold np.nan
+
+        # int8, int16 are incompatible with float64,
+        # see https://github.com/cython/cython/issues/2646
+        if arr.dtype.name in ["int8", "int16"]:
+            dtype = np.float32
+        else:
+            dtype = np.float64
+
+    orig_ndim = arr.ndim
+    if orig_ndim == 1:
+        # reshape so we can always use algos.diff_2d
+        arr = arr.reshape(-1, 1)
+        # TODO: require axis == 0
+
+    dtype = np.dtype(dtype)
+    out_arr = np.empty(arr.shape, dtype=dtype)
+
+    na_indexer = [slice(None)] * 2
+    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
+    out_arr[tuple(na_indexer)] = na
+
+    if arr.dtype.name in _diff_special:
+        # TODO: can diff_2d dtype specialization troubles be fixed by defining
+        #  out_arr inside diff_2d?
+        algos.diff_2d(arr, out_arr, int(n), axis, datetimelike=is_timedelta)
+    else:
+        # To keep mypy happy, _res_indexer is a list while res_indexer is
+        #  a tuple, ditto for lag_indexer.
+        _res_indexer = [slice(None)] * 2
+        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
+        res_indexer = tuple(_res_indexer)
+
+        _lag_indexer = [slice(None)] * 2
+        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
+        lag_indexer = tuple(_lag_indexer)
+
+        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])
+
+    if is_timedelta:
+        out_arr = out_arr.view("timedelta64[ns]")
+
+    if orig_ndim == 1:
+        out_arr = out_arr[:, 0]
+    return out_arr
+
+
+# --------------------------------------------------------------------
+# Helper functions
+
+
+# Note: safe_sort is in algorithms.py instead of sorting.py because it is
+#  low-dependency, is used in this module, and used private methods from
+#  this module.
+def safe_sort(
+    values: Index | ArrayLike,
+    codes: npt.NDArray[np.intp] | None = None,
+    use_na_sentinel: bool = True,
+    assume_unique: bool = False,
+    verify: bool = True,
+) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
+    """
+    Sort ``values`` and reorder corresponding ``codes``.
+
+    ``values`` should be unique if ``codes`` is not None.
+    Safe for use with mixed types (int, str), orders ints before strs.
+
+    Parameters
+    ----------
+    values : list-like
+        Sequence; must be unique if ``codes`` is not None.
+    codes : np.ndarray[intp] or None, default None
+        Indices to ``values``. All out of bound indices are treated as
+        "not found" and will be masked with ``-1``.
+    use_na_sentinel : bool, default True
+        If True, the sentinel -1 will be used for NaN values. If False,
+        NaN values will be encoded as non-negative integers and will not drop the
+        NaN from the uniques of the values.
+    assume_unique : bool, default False
+        When True, ``values`` are assumed to be unique, which can speed up
+        the calculation. Ignored when ``codes`` is None.
+    verify : bool, default True
+        Check if codes are out of bound for the values and put out of bound
+        codes equal to ``-1``. If ``verify=False``, it is assumed there
+        are no out of bound codes. Ignored when ``codes`` is None.
+
+    Returns
+    -------
+    ordered : AnyArrayLike
+        Sorted ``values``
+    new_codes : ndarray
+        Reordered ``codes``; returned when ``codes`` is not None.
+
+    Raises
+    ------
+    TypeError
+        * If ``values`` is not list-like or if ``codes`` is neither None
+        nor list-like
+        * If ``values`` cannot be sorted
+    ValueError
+        * If ``codes`` is not None and ``values`` contain duplicates.
+    """
+    if not isinstance(values, (np.ndarray, ABCExtensionArray, ABCIndex)):
+        raise TypeError(
+            "Only np.ndarray, ExtensionArray, and Index objects are allowed to "
+            "be passed to safe_sort as values"
+        )
+
+    sorter = None
+    ordered: AnyArrayLike
+
+    if (
+        not isinstance(values.dtype, ExtensionDtype)
+        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
+    ):
+        ordered = _sort_mixed(values)
+    else:
+        try:
+            sorter = values.argsort()
+            ordered = values.take(sorter)
+        except (TypeError, decimal.InvalidOperation):
+            # Previous sorters failed or were not applicable, try `_sort_mixed`
+            # which would work, but which fails for special case of 1d arrays
+            # with tuples.
+            if values.size and isinstance(values[0], tuple):
+                # error: Argument 1 to "_sort_tuples" has incompatible type
+                # "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
+                # "ndarray[Any, Any]"
+                ordered = _sort_tuples(values)  # type: ignore[arg-type]
+            else:
+                ordered = _sort_mixed(values)
+
+    # codes:
+
+    if codes is None:
+        return ordered
+
+    if not is_list_like(codes):
+        raise TypeError(
+            "Only list-like objects or None are allowed to "
+            "be passed to safe_sort as codes"
+        )
+    codes = ensure_platform_int(np.asarray(codes))
+
+    if not assume_unique and not len(unique(values)) == len(values):
+        raise ValueError("values should be unique if codes is not None")
+
+    if sorter is None:
+        # mixed types
+        # error: Argument 1 to "_get_hashtable_algo" has incompatible type
+        # "Union[Index, ExtensionArray, ndarray[Any, Any]]"; expected
+        # "ndarray[Any, Any]"
+        hash_klass, values = _get_hashtable_algo(values)  # type: ignore[arg-type]
+        t = hash_klass(len(values))
+        t.map_locations(values)
+        # error: Argument 1 to "lookup" of "HashTable" has incompatible type
+        # "ExtensionArray | ndarray[Any, Any] | Index | Series"; expected "ndarray"
+        sorter = ensure_platform_int(t.lookup(ordered))  # type: ignore[arg-type]
+
+    if use_na_sentinel:
+        # take_nd is faster, but only works for na_sentinels of -1
+        order2 = sorter.argsort()
+        if verify:
+            mask = (codes < -len(values)) | (codes >= len(values))
+            codes[mask] = -1
+        new_codes = take_nd(order2, codes, fill_value=-1)
+    else:
+        reverse_indexer = np.empty(len(sorter), dtype=int)
+        reverse_indexer.put(sorter, np.arange(len(sorter)))
+        # Out of bound indices will be masked with `-1` next, so we
+        # may deal with them here without performance loss using `mode='wrap'`
+        new_codes = reverse_indexer.take(codes, mode="wrap")
+
+    return ordered, ensure_platform_int(new_codes)
+
+
+def _sort_mixed(values) -> AnyArrayLike:
+    """order ints before strings before nulls in 1d arrays"""
+    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
+    null_pos = np.array([isna(x) for x in values], dtype=bool)
+    num_pos = ~str_pos & ~null_pos
+    str_argsort = np.argsort(values[str_pos])
+    num_argsort = np.argsort(values[num_pos])
+    # convert boolean arrays to positional indices, then order by underlying values
+    str_locs = str_pos.nonzero()[0].take(str_argsort)
+    num_locs = num_pos.nonzero()[0].take(num_argsort)
+    null_locs = null_pos.nonzero()[0]
+    locs = np.concatenate([num_locs, str_locs, null_locs])
+    return values.take(locs)
+
+
+def _sort_tuples(values: np.ndarray) -> np.ndarray:
+    """
+    Convert array of tuples (1d) to array of arrays (2d).
+    We need to keep the columns separately as they contain different types and
+    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
+    column as types cannot be compared).
+    """
+    from pandas.core.internals.construction import to_arrays
+    from pandas.core.sorting import lexsort_indexer
+
+    arrays, _ = to_arrays(values, None)
+    indexer = lexsort_indexer(arrays, orders=True)
+    return values[indexer]
+
+
+def union_with_duplicates(
+    lvals: ArrayLike | Index, rvals: ArrayLike | Index
+) -> ArrayLike | Index:
+    """
+    Extracts the union from lvals and rvals with respect to duplicates and nans in
+    both arrays.
+
+    Parameters
+    ----------
+    lvals: np.ndarray or ExtensionArray
+        left values which is ordered in front.
+    rvals: np.ndarray or ExtensionArray
+        right values ordered after lvals.
+
+    Returns
+    -------
+    np.ndarray or ExtensionArray
+        Containing the unsorted union of both arrays.
+
+    Notes
+    -----
+    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
+    """
+    from pandas import Series
+
+    l_count = value_counts_internal(lvals, dropna=False)
+    r_count = value_counts_internal(rvals, dropna=False)
+    l_count, r_count = l_count.align(r_count, fill_value=0)
+    final_count = np.maximum(l_count.values, r_count.values)
+    final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
+    if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
+        unique_vals = lvals.append(rvals).unique()
+    else:
+        if isinstance(lvals, ABCIndex):
+            lvals = lvals._values
+        if isinstance(rvals, ABCIndex):
+            rvals = rvals._values
+        # error: List item 0 has incompatible type "Union[ExtensionArray,
+        # ndarray[Any, Any], Index]"; expected "Union[ExtensionArray,
+        # ndarray[Any, Any]]"
+        combined = concat_compat([lvals, rvals])  # type: ignore[list-item]
+        unique_vals = unique(combined)
+        unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
+    repeats = final_count.reindex(unique_vals).values
+    return np.repeat(unique_vals, repeats)
+
+
+def map_array(
+    arr: ArrayLike,
+    mapper,
+    na_action: Literal["ignore"] | None = None,
+) -> np.ndarray | ExtensionArray | Index:
+    """
+    Map values using an input mapping or function.
+
+    Parameters
+    ----------
+    mapper : function, dict, or Series
+        Mapping correspondence.
+    na_action : {None, 'ignore'}, default None
+        If 'ignore', propagate NA values, without passing them to the
+        mapping correspondence.
+
+    Returns
+    -------
+    Union[ndarray, Index, ExtensionArray]
+        The output of the mapping function applied to the array.
+        If the function returns a tuple with more than one element
+        a MultiIndex will be returned.
+    """
+    from pandas import Index
+
+    if na_action not in (None, "ignore"):
+        msg = f"na_action must either be 'ignore' or None, {na_action} was passed"
+        raise ValueError(msg)
+
+    # we can fastpath dict/Series to an efficient map
+    # as we know that we are not going to have to yield
+    # python types
+    if is_dict_like(mapper):
+        if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
+            # If a dictionary subclass defines a default value method,
+            # convert mapper to a lookup function (GH #15999).
+            dict_with_default = mapper
+            mapper = lambda x: dict_with_default[
+                np.nan if isinstance(x, float) and np.isnan(x) else x
+            ]
+        else:
+            # Dictionary does not have a default. Thus it's safe to
+            # convert to a Series for efficiency.
+            # we specify the keys here to handle the
+            # possibility that they are tuples
+
+            # The return value of mapping with an empty mapper is
+            # expected to be pd.Series(np.nan, ...). As np.nan is
+            # of dtype float64 the return value of this method should
+            # be float64 as well
+            from pandas import Series
+
+            if len(mapper) == 0:
+                mapper = Series(mapper, dtype=np.float64)
+            elif isinstance(mapper, dict):
+                mapper = Series(
+                    mapper.values(), index=Index(mapper.keys(), tupleize_cols=False)
+                )
+            else:
+                mapper = Series(mapper)
+
+    if isinstance(mapper, ABCSeries):
+        if na_action == "ignore":
+            mapper = mapper[mapper.index.notna()]
+
+        # Since values were input this means we came from either
+        # a dict or a series and mapper should be an index
+        indexer = mapper.index.get_indexer(arr)
+        new_values = take_nd(mapper._values, indexer)
+
+        return new_values
+
+    if not len(arr):
+        return arr.copy()
+
+    # we must convert to python types
+    values = arr.astype(object, copy=False)
+    if na_action is None:
+        return lib.map_infer(values, mapper)
+    else:
+        return lib.map_infer_mask(values, mapper, mask=isna(values).view(np.uint8))
diff --git a/pandas/core/api.py b/pandas/core/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec12d543d8389afa38c7c84a658dcaeee960690c
--- /dev/null
+++ b/pandas/core/api.py
@@ -0,0 +1,138 @@
+from pandas._libs import (
+    NaT,
+    Period,
+    Timedelta,
+    Timestamp,
+)
+from pandas._libs.missing import NA
+
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    CategoricalDtype,
+    DatetimeTZDtype,
+    IntervalDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    isnull,
+    notna,
+    notnull,
+)
+
+from pandas.core.algorithms import (
+    factorize,
+    unique,
+)
+from pandas.core.arrays import Categorical
+from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.floating import (
+    Float32Dtype,
+    Float64Dtype,
+)
+from pandas.core.arrays.integer import (
+    Int8Dtype,
+    Int16Dtype,
+    Int32Dtype,
+    Int64Dtype,
+    UInt8Dtype,
+    UInt16Dtype,
+    UInt32Dtype,
+    UInt64Dtype,
+)
+from pandas.core.arrays.string_ import StringDtype
+from pandas.core.construction import array  # noqa: ICN001
+from pandas.core.flags import Flags
+from pandas.core.groupby import (
+    Grouper,
+    NamedAgg,
+)
+from pandas.core.indexes.api import (
+    CategoricalIndex,
+    DatetimeIndex,
+    Index,
+    IntervalIndex,
+    MultiIndex,
+    PeriodIndex,
+    RangeIndex,
+    TimedeltaIndex,
+)
+from pandas.core.indexes.datetimes import (
+    bdate_range,
+    date_range,
+)
+from pandas.core.indexes.interval import (
+    Interval,
+    interval_range,
+)
+from pandas.core.indexes.period import period_range
+from pandas.core.indexes.timedeltas import timedelta_range
+from pandas.core.indexing import IndexSlice
+from pandas.core.series import Series
+from pandas.core.tools.datetimes import to_datetime
+from pandas.core.tools.numeric import to_numeric
+from pandas.core.tools.timedeltas import to_timedelta
+
+from pandas.io.formats.format import set_eng_float_format
+from pandas.tseries.offsets import DateOffset
+
+# DataFrame needs to be imported after NamedAgg to avoid a circular import
+from pandas.core.frame import DataFrame  # isort:skip
+
+__all__ = [
+    "NA",
+    "ArrowDtype",
+    "BooleanDtype",
+    "Categorical",
+    "CategoricalDtype",
+    "CategoricalIndex",
+    "DataFrame",
+    "DateOffset",
+    "DatetimeIndex",
+    "DatetimeTZDtype",
+    "Flags",
+    "Float32Dtype",
+    "Float64Dtype",
+    "Grouper",
+    "Index",
+    "IndexSlice",
+    "Int8Dtype",
+    "Int16Dtype",
+    "Int32Dtype",
+    "Int64Dtype",
+    "Interval",
+    "IntervalDtype",
+    "IntervalIndex",
+    "MultiIndex",
+    "NaT",
+    "NamedAgg",
+    "Period",
+    "PeriodDtype",
+    "PeriodIndex",
+    "RangeIndex",
+    "Series",
+    "StringDtype",
+    "Timedelta",
+    "TimedeltaIndex",
+    "Timestamp",
+    "UInt8Dtype",
+    "UInt16Dtype",
+    "UInt32Dtype",
+    "UInt64Dtype",
+    "array",
+    "bdate_range",
+    "date_range",
+    "factorize",
+    "interval_range",
+    "isna",
+    "isnull",
+    "notna",
+    "notnull",
+    "period_range",
+    "set_eng_float_format",
+    "timedelta_range",
+    "to_datetime",
+    "to_numeric",
+    "to_timedelta",
+    "unique",
+]
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f218b3813149a2c6584e9919d11802dd27b7ce4
--- /dev/null
+++ b/pandas/core/apply.py
@@ -0,0 +1,2132 @@
+from __future__ import annotations
+
+import abc
+from collections import defaultdict
+from collections.abc import Callable
+import functools
+from functools import partial
+import inspect
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    TypeAlias,
+    cast,
+)
+
+import numpy as np
+
+from pandas._libs.internals import BlockValuesRefs
+from pandas._typing import (
+    AggFuncType,
+    AggFuncTypeBase,
+    AggFuncTypeDict,
+    AggObjType,
+    Axis,
+    AxisInt,
+    NDFrameT,
+    npt,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import SpecificationError
+from pandas.util._decorators import (
+    cache_readonly,
+    set_module,
+)
+
+from pandas.core.dtypes.cast import is_nested_object
+from pandas.core.dtypes.common import (
+    is_dict_like,
+    is_extension_array_dtype,
+    is_list_like,
+    is_numeric_dtype,
+    is_sequence,
+)
+from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCNDFrame,
+    ABCSeries,
+)
+
+from pandas.core._numba.executor import generate_apply_looper
+import pandas.core.common as com
+from pandas.core.construction import ensure_wrapped_if_datetimelike
+from pandas.core.util.numba_ import (
+    get_jit_arguments,
+    prepare_function_arguments,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Generator,
+        Hashable,
+        Iterable,
+        MutableMapping,
+        Sequence,
+    )
+
+    from pandas import (
+        DataFrame,
+        Index,
+        Series,
+    )
+    from pandas.core.groupby import GroupBy
+    from pandas.core.resample import Resampler
+    from pandas.core.window.rolling import BaseWindow
+
+ResType: TypeAlias = dict[int, Any]
+
+
+@set_module("pandas.api.executors")
+class BaseExecutionEngine(abc.ABC):
+    """
+    Base class for execution engines for map and apply methods.
+
+    An execution engine receives all the parameters of a call to
+    ``apply`` or ``map``, such as the data container, the function,
+    etc. and takes care of running the execution.
+
+    Supporting different engines allows functions to be JIT compiled,
+    run in parallel, and others. Besides the default executor which
+    simply runs the code with the Python interpreter and pandas.
+    """
+
+    @staticmethod
+    @abc.abstractmethod
+    def map(
+        data: Series | DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable | None,
+        skip_na: bool,
+    ):
+        """
+        Executor method to run functions elementwise.
+
+        In general, pandas uses ``map`` for running functions elementwise,
+        but ``Series.apply`` with the default ``by_row='compat'`` will also
+        call this executor function.
+
+        Parameters
+        ----------
+        data : Series, DataFrame or NumPy ndarray
+            The object to use for the data. Some methods implement a ``raw``
+            parameter which will convert the original pandas object to a
+            NumPy array, which will then be passed here to the executor.
+        func : function or NumPy ufunc
+            The function to execute.
+        args : tuple
+            Positional arguments to be passed to ``func``.
+        kwargs : dict
+            Keyword arguments to be passed to ``func``.
+        decorator : function, optional
+            For JIT compilers and other engines that need to decorate the
+            function ``func``, this is the decorator to use. While the
+            executor may already know which is the decorator to use, this
+            is useful as for a single executor the user can specify for
+            example ``numba.jit`` or ``numba.njit(nogil=True)``, and this
+            decorator parameter will contain the exact decorator from the
+            executor the user wants to use.
+        skip_na : bool
+            Whether the function should be called for missing values or not.
+            This is specified by the pandas user as ``map(na_action=None)``
+            or ``map(na_action='ignore')``.
+        """
+
+    @staticmethod
+    @abc.abstractmethod
+    def apply(
+        data: Series | DataFrame | np.ndarray,
+        func: AggFuncType,
+        args: tuple,
+        kwargs: dict[str, Any],
+        decorator: Callable,
+        axis: Axis,
+    ):
+        """
+        Executor method to run functions by an axis.
+
+        While we can see ``map`` as executing the function for each cell
+        in a ``DataFrame`` (or ``Series``), ``apply`` will execute the
+        function for each column (or row).
+
+        Parameters
+        ----------
+        data : Series, DataFrame or NumPy ndarray
+            The object to use for the data. Some methods implement a ``raw``
+            parameter which will convert the original pandas object to a
+            NumPy array, which will then be passed here to the executor.
+        func : function or NumPy ufunc
+            The function to execute.
+        args : tuple
+            Positional arguments to be passed to ``func``.
+        kwargs : dict
+            Keyword arguments to be passed to ``func``.
+        decorator : function, optional
+            For JIT compilers and other engines that need to decorate the
+            function ``func``, this is the decorator to use. While the
+            executor may already know which is the decorator to use, this
+            is useful as for a single executor the user can specify for
+            example ``numba.jit`` or ``numba.njit(nogil=True)``, and this
+            decorator parameter will contain the exact decorator from the
+            executor the user wants to use.
+        axis : {0 or 'index', 1 or 'columns'}
+            0 or 'index' should execute the function passing each column as
+            parameter. 1 or 'columns' should execute the function passing
+            each row as parameter. The default executor engine passes rows
+            as pandas ``Series``. Other executor engines should probably
+            expect functions to be implemented this way for compatibility.
+            But passing rows as other data structures is technically possible
+            as far as the function ``func`` is implemented accordingly.
+        """
+
+
+def frame_apply(
+    obj: DataFrame,
+    func: AggFuncType,
+    axis: Axis = 0,
+    raw: bool = False,
+    result_type: str | None = None,
+    by_row: Literal[False, "compat"] = "compat",
+    engine: str = "python",
+    engine_kwargs: dict[str, bool] | None = None,
+    args=None,
+    kwargs=None,
+) -> FrameApply:
+    """construct and return a row or column based frame apply object"""
+    _, func, columns, _ = reconstruct_func(func, **kwargs)
+
+    axis = obj._get_axis_number(axis)
+    klass: type[FrameApply]
+    if axis == 0:
+        klass = FrameRowApply
+    elif axis == 1:
+        if columns:
+            raise NotImplementedError(
+                f"Named aggregation is not supported when {axis=}."
+            )
+        klass = FrameColumnApply
+
+    return klass(
+        obj,
+        func,
+        raw=raw,
+        result_type=result_type,
+        by_row=by_row,
+        engine=engine,
+        engine_kwargs=engine_kwargs,
+        args=args,
+        kwargs=kwargs,
+    )
+
+
+class Apply(metaclass=abc.ABCMeta):
+    axis: AxisInt
+
+    def __init__(
+        self,
+        obj: AggObjType,
+        func: AggFuncType,
+        raw: bool,
+        result_type: str | None,
+        *,
+        by_row: Literal[False, "compat", "_compat"] = "compat",
+        engine: str = "python",
+        engine_kwargs: dict[str, bool] | None = None,
+        args,
+        kwargs,
+    ) -> None:
+        self.obj = obj
+        self.raw = raw
+
+        assert by_row is False or by_row in ["compat", "_compat"]
+        self.by_row = by_row
+
+        self.args = args or ()
+        self.kwargs = kwargs or {}
+
+        self.engine = engine
+        self.engine_kwargs = {} if engine_kwargs is None else engine_kwargs
+
+        if result_type not in [None, "reduce", "broadcast", "expand"]:
+            raise ValueError(
+                "invalid value for result_type, must be one "
+                "of {None, 'reduce', 'broadcast', 'expand'}"
+            )
+
+        self.result_type = result_type
+
+        self.func = func
+
+    @abc.abstractmethod
+    def apply(self) -> DataFrame | Series:
+        pass
+
+    @abc.abstractmethod
+    def agg_or_apply_list_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
+        pass
+
+    @abc.abstractmethod
+    def agg_or_apply_dict_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
+        pass
+
+    def agg(self) -> DataFrame | Series | None:
+        """
+        Provide an implementation for the aggregators.
+
+        Returns
+        -------
+        Result of aggregation, or None if agg cannot be performed by
+        this method.
+        """
+        func = self.func
+
+        if isinstance(func, str):
+            return self.apply_str()
+
+        if is_dict_like(func):
+            return self.agg_dict_like()
+        elif is_list_like(func):
+            # we require a list, but not a 'str'
+            return self.agg_list_like()
+
+        # caller can react
+        return None
+
+    def transform(self) -> DataFrame | Series:
+        """
+        Transform a DataFrame or Series.
+
+        Returns
+        -------
+        DataFrame or Series
+            Result of applying ``func`` along the given axis of the
+            Series or DataFrame.
+
+        Raises
+        ------
+        ValueError
+            If the transform function fails or does not transform.
+        """
+        obj = self.obj
+        func = self.func
+        axis = self.axis
+        args = self.args
+        kwargs = self.kwargs
+
+        is_series = obj.ndim == 1
+
+        if obj._get_axis_number(axis) == 1:
+            assert not is_series
+            return obj.T.transform(func, 0, *args, **kwargs).T
+
+        if is_list_like(func) and not is_dict_like(func):
+            func = cast(list[AggFuncTypeBase], func)
+            # Convert func equivalent dict
+            if is_series:
+                func = {com.get_callable_name(v) or v: v for v in func}
+            else:
+                func = dict.fromkeys(obj, func)
+
+        if is_dict_like(func):
+            func = cast(AggFuncTypeDict, func)
+            return self.transform_dict_like(func)
+
+        # func is either str or callable
+        func = cast(AggFuncTypeBase, func)
+        try:
+            result = self.transform_str_or_callable(func)
+        except TypeError:
+            raise
+        except Exception as err:
+            raise ValueError("Transform function failed") from err
+
+        # Functions that transform may return empty Series/DataFrame
+        # when the dtype is not appropriate
+        if (
+            isinstance(result, (ABCSeries, ABCDataFrame))
+            and result.empty
+            and not obj.empty
+        ):
+            raise ValueError("Transform function failed")
+        if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals(
+            obj.index
+        ):
+            raise ValueError("Function did not transform")
+
+        return result
+
+    def transform_dict_like(self, func) -> DataFrame:
+        """
+        Compute transform in the case of a dict-like func
+        """
+        from pandas.core.reshape.concat import concat
+
+        obj = self.obj
+        args = self.args
+        kwargs = self.kwargs
+
+        # transform is currently only for Series/DataFrame
+        assert isinstance(obj, ABCNDFrame)
+
+        if len(func) == 0:
+            raise ValueError("No transform functions were provided")
+
+        func = self.normalize_dictlike_arg("transform", obj, func)
+
+        results: dict[Hashable, DataFrame | Series] = {}
+        for name, how in func.items():
+            colg = obj._gotitem(name, ndim=1)
+            results[name] = colg.transform(how, 0, *args, **kwargs)
+        return concat(results, axis=1)
+
+    def transform_str_or_callable(self, func) -> DataFrame | Series:
+        """
+        Compute transform in the case of a string or callable func
+        """
+        obj = self.obj
+        args = self.args
+        kwargs = self.kwargs
+
+        if isinstance(func, str):
+            return self._apply_str(obj, func, *args, **kwargs)
+
+        # Two possible ways to use a UDF - apply or call directly
+        try:
+            return obj.apply(func, args=args, **kwargs)
+        except Exception:
+            return func(obj, *args, **kwargs)
+
+    def agg_list_like(self) -> DataFrame | Series:
+        """
+        Compute aggregation in the case of a list-like argument.
+
+        Returns
+        -------
+        Result of aggregation.
+        """
+        return self.agg_or_apply_list_like(op_name="agg")
+
+    def compute_list_like(
+        self,
+        op_name: Literal["agg", "apply"],
+        selected_obj: Series | DataFrame,
+        kwargs: dict[str, Any],
+    ) -> tuple[list[Hashable] | Index, list[Any]]:
+        """
+        Compute agg/apply results for like-like input.
+
+        Parameters
+        ----------
+        op_name : {"agg", "apply"}
+            Operation being performed.
+        selected_obj : Series or DataFrame
+            Data to perform operation on.
+        kwargs : dict
+            Keyword arguments to pass to the functions.
+
+        Returns
+        -------
+        keys : list[Hashable] or Index
+            Index labels for result.
+        results : list
+            Data for result. When aggregating with a Series, this can contain any
+            Python objects.
+        """
+        func = cast(list[AggFuncTypeBase], self.func)
+        obj = self.obj
+
+        results = []
+        keys = []
+
+        # degenerate case
+        if selected_obj.ndim == 1:
+            for a in func:
+                colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
+                args = (
+                    [self.axis, *self.args]
+                    if include_axis(op_name, colg)
+                    else self.args
+                )
+                new_res = getattr(colg, op_name)(a, *args, **kwargs)
+                results.append(new_res)
+
+                # make sure we find a good name
+                name = com.get_callable_name(a) or a
+                keys.append(name)
+
+        else:
+            indices = []
+            for index, col in enumerate(selected_obj):
+                colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index])
+                args = (
+                    [self.axis, *self.args]
+                    if include_axis(op_name, colg)
+                    else self.args
+                )
+                new_res = getattr(colg, op_name)(func, *args, **kwargs)
+                results.append(new_res)
+                indices.append(index)
+            # error: Incompatible types in assignment (expression has type "Any |
+            # Index", variable has type "list[Any | Callable[..., Any] | str]")
+            keys = selected_obj.columns.take(indices)  # type: ignore[assignment]
+
+        return keys, results
+
+    def wrap_results_list_like(
+        self, keys: Iterable[Hashable], results: list[Series | DataFrame]
+    ):
+        from pandas.core.reshape.concat import concat
+
+        obj = self.obj
+
+        try:
+            return concat(results, keys=keys, axis=1, sort=False)
+        except TypeError as err:
+            # we are concatting non-NDFrame objects,
+            # e.g. a list of scalars
+            from pandas import Series
+
+            result = Series(results, index=keys, name=obj.name)
+            if is_nested_object(result):
+                raise ValueError(
+                    "cannot combine transform and aggregation operations"
+                ) from err
+            return result
+
+    def agg_dict_like(self) -> DataFrame | Series:
+        """
+        Compute aggregation in the case of a dict-like argument.
+
+        Returns
+        -------
+        Result of aggregation.
+        """
+        return self.agg_or_apply_dict_like(op_name="agg")
+
+    def compute_dict_like(
+        self,
+        op_name: Literal["agg", "apply"],
+        selected_obj: Series | DataFrame,
+        selection: Hashable | Sequence[Hashable],
+        kwargs: dict[str, Any],
+    ) -> tuple[list[Hashable], list[Any]]:
+        """
+        Compute agg/apply results for dict-like input.
+
+        Parameters
+        ----------
+        op_name : {"agg", "apply"}
+            Operation being performed.
+        selected_obj : Series or DataFrame
+            Data to perform operation on.
+        selection : hashable or sequence of hashables
+            Used by GroupBy, Window, and Resample if selection is applied to the object.
+        kwargs : dict
+            Keyword arguments to pass to the functions.
+
+        Returns
+        -------
+        keys : list[hashable]
+            Index labels for result.
+        results : list
+            Data for result. When aggregating with a Series, this can contain any
+            Python object.
+        """
+        from pandas.core.groupby.generic import (
+            DataFrameGroupBy,
+            SeriesGroupBy,
+        )
+
+        obj = self.obj
+        is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
+        func = cast(AggFuncTypeDict, self.func)
+        func = self.normalize_dictlike_arg(op_name, selected_obj, func)
+
+        is_non_unique_col = (
+            selected_obj.ndim == 2
+            and selected_obj.columns.nunique() < len(selected_obj.columns)
+        )
+
+        if selected_obj.ndim == 1:
+            # key only used for output
+            colg = obj._gotitem(selection, ndim=1)
+            results = [getattr(colg, op_name)(how, **kwargs) for _, how in func.items()]
+            keys = list(func.keys())
+        elif not is_groupby and is_non_unique_col:
+            # key used for column selection and output
+            # GH#51099
+            results = []
+            keys = []
+            for key, how in func.items():
+                indices = selected_obj.columns.get_indexer_for([key])
+                labels = selected_obj.columns.take(indices)
+                label_to_indices = defaultdict(list)
+                for index, label in zip(indices, labels, strict=True):
+                    label_to_indices[label].append(index)
+
+                key_data = [
+                    getattr(selected_obj._ixs(indice, axis=1), op_name)(how, **kwargs)
+                    for label, indices in label_to_indices.items()
+                    for indice in indices
+                ]
+
+                keys += [key] * len(key_data)
+                results += key_data
+        elif is_groupby:
+            # key used for column selection and output
+
+            df = selected_obj
+            results, keys = [], []
+            for key, how in func.items():
+                cols = df[key]
+
+                if cols.ndim == 1:
+                    series = obj._gotitem(key, ndim=1, subset=cols)
+                    results.append(getattr(series, op_name)(how, **kwargs))
+                    keys.append(key)
+                else:
+                    for _, col in cols.items():
+                        series = obj._gotitem(key, ndim=1, subset=col)
+                        results.append(getattr(series, op_name)(how, **kwargs))
+                        keys.append(key)
+        else:
+            results = [
+                getattr(obj._gotitem(key, ndim=1), op_name)(how, **kwargs)
+                for key, how in func.items()
+            ]
+            keys = list(func.keys())
+
+        return keys, results
+
+    def wrap_results_dict_like(
+        self,
+        selected_obj: Series | DataFrame,
+        result_index: list[Hashable],
+        result_data: list,
+    ):
+        from pandas import Index
+        from pandas.core.reshape.concat import concat
+
+        obj = self.obj
+
+        # Avoid making two isinstance calls in all and any below
+        is_ndframe = [isinstance(r, ABCNDFrame) for r in result_data]
+
+        if all(is_ndframe):
+            results = [result for result in result_data if not result.empty]
+            keys_to_use: Iterable[Hashable]
+            keys_to_use = [
+                k for k, v in zip(result_index, result_data, strict=True) if not v.empty
+            ]
+            # Have to check, if at least one DataFrame is not empty.
+            if keys_to_use == []:
+                keys_to_use = result_index
+                results = result_data
+
+            if selected_obj.ndim == 2:
+                # keys are columns, so we can preserve names
+                ktu = Index(keys_to_use)
+                ktu._set_names(selected_obj.columns.names)
+                keys_to_use = ktu
+
+            axis: AxisInt = 0 if isinstance(obj, ABCSeries) else 1
+            result = concat(
+                results,
+                axis=axis,
+                keys=keys_to_use,
+                sort=False,
+            )
+        elif any(is_ndframe):
+            # There is a mix of NDFrames and scalars
+            raise ValueError(
+                "cannot perform both aggregation "
+                "and transformation operations "
+                "simultaneously"
+            )
+        else:
+            from pandas import Series
+
+            # we have a list of scalars
+            # GH 36212 use name only if obj is a series
+            if obj.ndim == 1:
+                obj = cast("Series", obj)
+                name = obj.name
+            else:
+                name = None
+
+            result = Series(result_data, index=result_index, name=name)
+
+        return result
+
+    def apply_str(self) -> DataFrame | Series:
+        """
+        Compute apply in case of a string.
+
+        Returns
+        -------
+        result: Series or DataFrame
+        """
+        # Caller is responsible for checking isinstance(self.f, str)
+        func = cast(str, self.func)
+
+        obj = self.obj
+
+        from pandas.core.groupby.generic import (
+            DataFrameGroupBy,
+            SeriesGroupBy,
+        )
+
+        # Support for `frame.transform('method')`
+        # Some methods (shift, etc.) require the axis argument, others
+        # don't, so inspect and insert if necessary.
+        method = getattr(obj, func, None)
+        if callable(method):
+            sig = inspect.getfullargspec(method)
+            arg_names = (*sig.args, *sig.kwonlyargs)
+            if self.axis != 0 and (
+                "axis" not in arg_names or func in ("corrwith", "skew")
+            ):
+                raise ValueError(f"Operation {func} does not support axis=1")
+            if "axis" in arg_names and not isinstance(
+                obj, (SeriesGroupBy, DataFrameGroupBy)
+            ):
+                self.kwargs["axis"] = self.axis
+        return self._apply_str(obj, func, *self.args, **self.kwargs)
+
+    def apply_list_or_dict_like(self) -> DataFrame | Series:
+        """
+        Compute apply in case of a list-like or dict-like.
+
+        Returns
+        -------
+        result: Series, DataFrame, or None
+            Result when self.func is a list-like or dict-like, None otherwise.
+        """
+
+        if self.engine == "numba":
+            raise NotImplementedError(
+                "The 'numba' engine doesn't support list-like/"
+                "dict likes of callables yet."
+            )
+
+        if self.axis == 1 and isinstance(self.obj, ABCDataFrame):
+            return self.obj.T.apply(self.func, 0, args=self.args, **self.kwargs).T
+
+        func = self.func
+        kwargs = self.kwargs
+
+        if is_dict_like(func):
+            result = self.agg_or_apply_dict_like(op_name="apply")
+        else:
+            result = self.agg_or_apply_list_like(op_name="apply")
+
+        result = reconstruct_and_relabel_result(result, func, **kwargs)
+
+        return result
+
+    def normalize_dictlike_arg(
+        self, how: str, obj: DataFrame | Series, func: AggFuncTypeDict
+    ) -> AggFuncTypeDict:
+        """
+        Handler for dict-like argument.
+
+        Ensures that necessary columns exist if obj is a DataFrame, and
+        that a nested renamer is not passed. Also normalizes to all lists
+        when values consists of a mix of list and non-lists.
+        """
+        assert how in ("apply", "agg", "transform")
+
+        # Can't use func.values(); wouldn't work for a Series
+        if (
+            how == "agg"
+            and isinstance(obj, ABCSeries)
+            and any(is_list_like(v) for _, v in func.items())
+        ) or (any(is_dict_like(v) for _, v in func.items())):
+            # GH 15931 - deprecation of renaming keys
+            raise SpecificationError("nested renamer is not supported")
+
+        if obj.ndim != 1:
+            # Check for missing columns on a frame
+            from pandas import Index
+
+            cols = Index(list(func.keys())).difference(obj.columns, sort=True)
+            if len(cols) > 0:
+                # GH 58474
+                raise KeyError(f"Label(s) {list(cols)} do not exist")
+
+        aggregator_types = (list, tuple, dict)
+
+        # if we have a dict of any non-scalars
+        # eg. {'A' : ['mean']}, normalize all to
+        # be list-likes
+        # Cannot use func.values() because arg may be a Series
+        if any(isinstance(x, aggregator_types) for _, x in func.items()):
+            new_func: AggFuncTypeDict = {}
+            for k, v in func.items():
+                if not isinstance(v, aggregator_types):
+                    new_func[k] = [v]
+                else:
+                    new_func[k] = v
+            func = new_func
+        return func
+
+    def _apply_str(self, obj, func: str, *args, **kwargs):
+        """
+        if arg is a string, then try to operate on it:
+        - try to find a function (or attribute) on obj
+        - try to find a numpy function
+        - raise
+        """
+        assert isinstance(func, str)
+
+        if hasattr(obj, func):
+            f = getattr(obj, func)
+            if callable(f):
+                return f(*args, **kwargs)
+
+            # people may aggregate on a non-callable attribute
+            # but don't let them think they can pass args to it
+            assert len(args) == 0
+            assert not any(kwarg == "axis" for kwarg in kwargs)
+            return f
+        elif hasattr(np, func) and hasattr(obj, "__array__"):
+            # in particular exclude Window
+            f = getattr(np, func)
+            return f(obj, *args, **kwargs)
+        else:
+            msg = f"'{func}' is not a valid function for '{type(obj).__name__}' object"
+            raise AttributeError(msg)
+
+
+class NDFrameApply(Apply):
+    """
+    Methods shared by FrameApply and SeriesApply but
+    not GroupByApply or ResamplerWindowApply
+    """
+
+    obj: DataFrame | Series
+
+    @property
+    def index(self) -> Index:
+        return self.obj.index
+
+    @property
+    def agg_axis(self) -> Index:
+        return self.obj._get_agg_axis(self.axis)
+
+    def agg_or_apply_list_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
+        obj = self.obj
+        kwargs = self.kwargs
+
+        if op_name == "apply":
+            if isinstance(self, FrameApply):
+                by_row = self.by_row
+
+            elif isinstance(self, SeriesApply):
+                by_row = "_compat" if self.by_row else False
+            else:
+                by_row = False
+            kwargs = {**kwargs, "by_row": by_row}
+
+        if getattr(obj, "axis", 0) == 1:
+            raise NotImplementedError("axis other than 0 is not supported")
+
+        keys, results = self.compute_list_like(op_name, obj, kwargs)
+        result = self.wrap_results_list_like(keys, results)
+        return result
+
+    def agg_or_apply_dict_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
+        assert op_name in ["agg", "apply"]
+        obj = self.obj
+
+        kwargs = {}
+        if op_name == "apply":
+            by_row = "_compat" if self.by_row else False
+            kwargs.update({"by_row": by_row})
+
+        if getattr(obj, "axis", 0) == 1:
+            raise NotImplementedError("axis other than 0 is not supported")
+
+        selection = None
+        result_index, result_data = self.compute_dict_like(
+            op_name, obj, selection, kwargs
+        )
+        result = self.wrap_results_dict_like(obj, result_index, result_data)
+        return result
+
+
+class FrameApply(NDFrameApply):
+    obj: DataFrame
+
+    def __init__(
+        self,
+        obj: AggObjType,
+        func: AggFuncType,
+        raw: bool,
+        result_type: str | None,
+        *,
+        by_row: Literal[False, "compat"] = False,
+        engine: str = "python",
+        engine_kwargs: dict[str, bool] | None = None,
+        args,
+        kwargs,
+    ) -> None:
+        if by_row is not False and by_row != "compat":
+            raise ValueError(f"by_row={by_row} not allowed")
+        super().__init__(
+            obj,
+            func,
+            raw,
+            result_type,
+            by_row=by_row,
+            engine=engine,
+            engine_kwargs=engine_kwargs,
+            args=args,
+            kwargs=kwargs,
+        )
+
+    # ---------------------------------------------------------------
+    # Abstract Methods
+
+    @property
+    @abc.abstractmethod
+    def result_index(self) -> Index:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def result_columns(self) -> Index:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def series_generator(self) -> Generator[Series]:
+        pass
+
+    @staticmethod
+    @functools.cache
+    @abc.abstractmethod
+    def generate_numba_apply_func(
+        func, nogil=True, nopython=True, parallel=False
+    ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]:
+        pass
+
+    @abc.abstractmethod
+    def apply_with_numba(self):
+        pass
+
+    def validate_values_for_numba(self) -> None:
+        # Validate column dtyps all OK
+        for colname, dtype in self.obj.dtypes.items():
+            if not is_numeric_dtype(dtype):
+                raise ValueError(
+                    f"Column {colname} must have a numeric dtype. "
+                    f"Found '{dtype}' instead"
+                )
+            if is_extension_array_dtype(dtype):
+                raise ValueError(
+                    f"Column {colname} is backed by an extension array, "
+                    f"which is not supported by the numba engine."
+                )
+
+    @abc.abstractmethod
+    def wrap_results_for_axis(
+        self, results: ResType, res_index: Index
+    ) -> DataFrame | Series:
+        pass
+
+    # ---------------------------------------------------------------
+
+    @property
+    def res_columns(self) -> Index:
+        return self.result_columns
+
+    @property
+    def columns(self) -> Index:
+        return self.obj.columns
+
+    @cache_readonly
+    def values(self):
+        return self.obj.values
+
+    def apply(self) -> DataFrame | Series:
+        """compute the results"""
+
+        # dispatch to handle list-like or dict-like
+        if is_list_like(self.func):
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support lists of callables yet"
+                )
+            return self.apply_list_or_dict_like()
+
+        # all empty
+        if len(self.columns) == 0 and len(self.index) == 0:
+            return self.apply_empty_result()
+
+        # string dispatch
+        if isinstance(self.func, str):
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support using "
+                    "a string as the callable function"
+                )
+            return self.apply_str()
+
+        # ufunc
+        elif isinstance(self.func, np.ufunc):
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support "
+                    "using a numpy ufunc as the callable function"
+                )
+            with np.errstate(all="ignore"):
+                results = self.obj._mgr.apply("apply", func=self.func)
+            # _constructor will retain self.index and self.columns
+            return self.obj._constructor_from_mgr(results, axes=results.axes)
+
+        # broadcasting
+        if self.result_type == "broadcast":
+            if self.engine == "numba":
+                raise NotImplementedError(
+                    "the 'numba' engine doesn't support result_type='broadcast'"
+                )
+            return self.apply_broadcast(self.obj)
+
+        # one axis empty
+        elif not all(self.obj.shape):
+            return self.apply_empty_result()
+
+        # raw
+        elif self.raw:
+            return self.apply_raw(engine=self.engine, engine_kwargs=self.engine_kwargs)
+
+        return self.apply_standard()
+
+    def agg(self):
+        obj = self.obj
+        axis = self.axis
+
+        # TODO: Avoid having to change state
+        self.obj = self.obj if self.axis == 0 else self.obj.T
+        self.axis = 0
+
+        result = None
+        try:
+            result = super().agg()
+        finally:
+            self.obj = obj
+            self.axis = axis
+
+        if axis == 1:
+            result = result.T if result is not None else result
+
+        if result is None:
+            result = self.obj.apply(self.func, axis, args=self.args, **self.kwargs)
+
+        return result
+
+    def apply_empty_result(self):
+        """
+        we have an empty result; at least 1 axis is 0
+
+        we will try to apply the function to an empty
+        series in order to see if this is a reduction function
+        """
+        assert callable(self.func)
+
+        # we are not asked to reduce or infer reduction
+        # so just return a copy of the existing object
+        if self.result_type not in ["reduce", None]:
+            return self.obj.copy()
+
+        # we may need to infer
+        should_reduce = self.result_type == "reduce"
+
+        from pandas import Series
+
+        if not should_reduce:
+            try:
+                if self.axis == 0:
+                    r = self.func(
+                        Series([], dtype=np.float64), *self.args, **self.kwargs
+                    )
+                else:
+                    r = self.func(
+                        Series(index=self.columns, dtype=np.float64),
+                        *self.args,
+                        **self.kwargs,
+                    )
+            except Exception:
+                pass
+            else:
+                should_reduce = not isinstance(r, Series)
+
+        if should_reduce:
+            if len(self.agg_axis):
+                r = self.func(Series([], dtype=np.float64), *self.args, **self.kwargs)
+            else:
+                r = np.nan
+
+            return self.obj._constructor_sliced(r, index=self.agg_axis)
+        else:
+            return self.obj.copy()
+
+    def apply_raw(self, engine="python", engine_kwargs=None):
+        """apply to the values as a numpy array"""
+
+        def wrap_function(func):
+            """
+            Wrap user supplied function to work around numpy issue.
+
+            see https://github.com/numpy/numpy/issues/8352
+            """
+
+            def wrapper(*args, **kwargs):
+                result = func(*args, **kwargs)
+                if isinstance(result, str):
+                    result = np.array(result, dtype=object)
+                return result
+
+            return wrapper
+
+        if engine == "numba":
+            args, kwargs = prepare_function_arguments(
+                self.func,  # type: ignore[arg-type]
+                self.args,
+                self.kwargs,
+                num_required_args=1,
+            )
+            # error: Argument 1 to "__call__" of "_lru_cache_wrapper" has
+            # incompatible type "Callable[..., Any] | str | list[Callable
+            # [..., Any] | str] | dict[Hashable,Callable[..., Any] | str |
+            # list[Callable[..., Any] | str]]"; expected "Hashable"
+            nb_looper = generate_apply_looper(
+                self.func,  # type: ignore[arg-type]
+                **get_jit_arguments(engine_kwargs),
+            )
+            result = nb_looper(self.values, self.axis, *args)
+            # If we made the result 2-D, squeeze it back to 1-D
+            result = np.squeeze(result)
+        else:
+            result = np.apply_along_axis(
+                wrap_function(self.func),
+                self.axis,
+                self.values,
+                *self.args,
+                **self.kwargs,
+            )
+
+        # TODO: mixed type case
+        if result.ndim == 2:
+            return self.obj._constructor(result, index=self.index, columns=self.columns)
+        else:
+            return self.obj._constructor_sliced(result, index=self.agg_axis)
+
+    def apply_broadcast(self, target: DataFrame) -> DataFrame:
+        assert callable(self.func)
+
+        result_values = np.empty_like(target.values)
+
+        # axis which we want to compare compliance
+        result_compare = target.shape[0]
+
+        for i, col in enumerate(target.columns):
+            res = self.func(target[col], *self.args, **self.kwargs)
+            ares = np.asarray(res).ndim
+
+            # must be a scalar or 1d
+            if ares > 1:
+                raise ValueError("too many dims to broadcast")
+            if ares == 1:
+                # must match return dim
+                if result_compare != len(res):
+                    raise ValueError("cannot broadcast result")
+
+            result_values[:, i] = res
+
+        # we *always* preserve the original index / columns
+        result = self.obj._constructor(
+            result_values, index=target.index, columns=target.columns
+        )
+        return result
+
+    def apply_standard(self):
+        if self.engine == "python":
+            results, res_index = self.apply_series_generator()
+        else:
+            results, res_index = self.apply_series_numba()
+
+        # wrap results
+        return self.wrap_results(results, res_index)
+
+    def apply_series_generator(self) -> tuple[ResType, Index]:
+        assert callable(self.func)
+
+        series_gen = self.series_generator
+        res_index = self.result_index
+
+        results = {}
+
+        for i, v in enumerate(series_gen):
+            results[i] = self.func(v, *self.args, **self.kwargs)
+            if isinstance(results[i], ABCSeries):
+                # If we have a view on v, we need to make a copy because
+                #  series_generator will swap out the underlying data
+                results[i] = results[i].copy(deep=False)
+
+        return results, res_index
+
+    def apply_series_numba(self):
+        if self.engine_kwargs.get("parallel", False):
+            raise NotImplementedError(
+                "Parallel apply is not supported when raw=False and engine='numba'"
+            )
+        if not self.obj.index.is_unique or not self.columns.is_unique:
+            raise NotImplementedError(
+                "The index/columns must be unique when raw=False and engine='numba'"
+            )
+        self.validate_values_for_numba()
+        results = self.apply_with_numba()
+        return results, self.result_index
+
+    def wrap_results(self, results: ResType, res_index: Index) -> DataFrame | Series:
+        from pandas import Series
+
+        # see if we can infer the results
+        if len(results) > 0 and 0 in results and is_sequence(results[0]):
+            return self.wrap_results_for_axis(results, res_index)
+
+        # dict of scalars
+
+        # the default dtype of an empty Series is `object`, but this
+        # code can be hit by df.mean() where the result should have dtype
+        # float64 even if it's an empty Series.
+        constructor_sliced = self.obj._constructor_sliced
+        if len(results) == 0 and constructor_sliced is Series:
+            result = constructor_sliced(results, dtype=np.float64)
+        else:
+            result = constructor_sliced(results)
+        result.index = res_index
+
+        return result
+
+    def apply_str(self) -> DataFrame | Series:
+        # Caller is responsible for checking isinstance(self.func, str)
+        # TODO: GH#39993 - Avoid special-casing by replacing with lambda
+        if self.func == "size":
+            # Special-cased because DataFrame.size returns a single scalar
+            obj = self.obj
+            value = obj.shape[self.axis]
+            return obj._constructor_sliced(value, index=self.agg_axis)
+        return super().apply_str()
+
+
+class FrameRowApply(FrameApply):
+    axis: AxisInt = 0
+
+    @property
+    def series_generator(self) -> Generator[Series]:
+        return (self.obj._ixs(i, axis=1) for i in range(len(self.columns)))
+
+    @staticmethod
+    @functools.cache
+    def generate_numba_apply_func(
+        func, nogil=True, nopython=True, parallel=False
+    ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]:
+        numba = import_optional_dependency("numba")
+        from pandas import Series
+
+        # Import helper from extensions to cast string object -> np strings
+        # Note: This also has the side effect of loading our numba extensions
+        from pandas.core._numba.extensions import maybe_cast_str
+
+        jitted_udf = numba.extending.register_jitable(func)
+
+        # Currently the parallel argument doesn't get passed through here
+        # (it's disabled) since the dicts in numba aren't thread-safe.
+        @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
+        def numba_func(values, col_names, df_index, *args):
+            results = {}
+            for j in range(values.shape[1]):
+                # Create the series
+                ser = Series(
+                    values[:, j], index=df_index, name=maybe_cast_str(col_names[j])
+                )
+                results[j] = jitted_udf(ser, *args)
+            return results
+
+        return numba_func
+
+    def apply_with_numba(self) -> dict[int, Any]:
+        func = cast(Callable, self.func)
+        args, kwargs = prepare_function_arguments(
+            func, self.args, self.kwargs, num_required_args=1
+        )
+        nb_func = self.generate_numba_apply_func(
+            func, **get_jit_arguments(self.engine_kwargs)
+        )
+        from pandas.core._numba.extensions import set_numba_data
+
+        index = self.obj.index
+        columns = self.obj.columns
+
+        # Convert from numba dict to regular dict
+        # Our isinstance checks in the df constructor don't pass for numbas typed dict
+        with set_numba_data(index) as index, set_numba_data(columns) as columns:
+            res = dict(nb_func(self.values, columns, index, *args))
+        return res
+
+    @property
+    def result_index(self) -> Index:
+        return self.columns
+
+    @property
+    def result_columns(self) -> Index:
+        return self.index
+
+    def wrap_results_for_axis(
+        self, results: ResType, res_index: Index
+    ) -> DataFrame | Series:
+        """return the results for the rows"""
+
+        if self.result_type == "reduce":
+            # e.g. test_apply_dict GH#8735
+            res = self.obj._constructor_sliced(results)
+            res.index = res_index
+            return res
+
+        elif self.result_type is None and all(
+            isinstance(x, dict) for x in results.values()
+        ):
+            # Our operation was a to_dict op e.g.
+            #  test_apply_dict GH#8735, test_apply_reduce_to_dict GH#25196 #37544
+            res = self.obj._constructor_sliced(results)
+            res.index = res_index
+            return res
+
+        try:
+            result = self.obj._constructor(data=results)
+        except ValueError as err:
+            if "All arrays must be of the same length" in str(err):
+                # e.g. result = [[2, 3], [1.5], ['foo', 'bar']]
+                #  see test_agg_listlike_result GH#29587
+                res = self.obj._constructor_sliced(results)
+                res.index = res_index
+                return res
+            else:
+                raise
+
+        if not isinstance(results[0], ABCSeries):
+            if len(result.index) == len(self.res_columns):
+                result.index = self.res_columns
+
+        if len(result.columns) == len(res_index):
+            result.columns = res_index
+
+        return result
+
+
+class FrameColumnApply(FrameApply):
+    axis: AxisInt = 1
+
+    def apply_broadcast(self, target: DataFrame) -> DataFrame:
+        result = super().apply_broadcast(target.T)
+        return result.T
+
+    @property
+    def series_generator(self) -> Generator[Series]:
+        values = self.values
+        values = ensure_wrapped_if_datetimelike(values)
+        assert len(values) > 0
+
+        # We create one Series object, and will swap out the data inside
+        #  of it.  Kids: don't do this at home.
+        ser = self.obj._ixs(0, axis=0)
+        mgr = ser._mgr
+
+        is_view = mgr.blocks[0].refs.has_reference()
+
+        if isinstance(ser.dtype, ExtensionDtype):
+            # values will be incorrect for this block
+            # TODO(EA2D): special case would be unnecessary with 2D EAs
+            obj = self.obj
+            for i in range(len(obj)):
+                yield obj._ixs(i, axis=0)
+
+        else:
+            for arr, name in zip(values, self.index, strict=True):
+                # GH#35462 re-pin mgr in case setitem changed it
+                ser._mgr = mgr
+                mgr.set_values(arr)
+                object.__setattr__(ser, "_name", name)
+                if not is_view:
+                    # In apply_series_generator we store the a shallow copy of the
+                    # result, which potentially increases the ref count of this reused
+                    # `ser` object (depending on the result of the applied function)
+                    # -> if that happened and `ser` is already a copy, then we reset
+                    # the refs here to avoid triggering a unnecessary CoW inside the
+                    # applied function (https://github.com/pandas-dev/pandas/pull/56212)
+                    mgr.blocks[0].refs = BlockValuesRefs(mgr.blocks[0])
+                yield ser
+
+    @staticmethod
+    @functools.cache
+    def generate_numba_apply_func(
+        func, nogil=True, nopython=True, parallel=False
+    ) -> Callable[[npt.NDArray, Index, Index], dict[int, Any]]:
+        numba = import_optional_dependency("numba")
+        from pandas import Series
+        from pandas.core._numba.extensions import maybe_cast_str
+
+        jitted_udf = numba.extending.register_jitable(func)
+
+        @numba.jit(nogil=nogil, nopython=nopython, parallel=parallel)
+        def numba_func(values, col_names_index, index, *args):
+            results = {}
+            # Currently the parallel argument doesn't get passed through here
+            # (it's disabled) since the dicts in numba aren't thread-safe.
+            for i in range(values.shape[0]):
+                # Create the series
+                # TODO: values corrupted without the copy
+                ser = Series(
+                    values[i].copy(),
+                    index=col_names_index,
+                    name=maybe_cast_str(index[i]),
+                )
+                results[i] = jitted_udf(ser, *args)
+
+            return results
+
+        return numba_func
+
+    def apply_with_numba(self) -> dict[int, Any]:
+        func = cast(Callable, self.func)
+        args, kwargs = prepare_function_arguments(
+            func, self.args, self.kwargs, num_required_args=1
+        )
+        nb_func = self.generate_numba_apply_func(
+            func, **get_jit_arguments(self.engine_kwargs)
+        )
+
+        from pandas.core._numba.extensions import set_numba_data
+
+        # Convert from numba dict to regular dict
+        # Our isinstance checks in the df constructor don't pass for numbas typed dict
+        with (
+            set_numba_data(self.obj.index) as index,
+            set_numba_data(self.columns) as columns,
+        ):
+            res = dict(nb_func(self.values, columns, index, *args))
+
+        return res
+
+    @property
+    def result_index(self) -> Index:
+        return self.index
+
+    @property
+    def result_columns(self) -> Index:
+        return self.columns
+
+    def wrap_results_for_axis(
+        self, results: ResType, res_index: Index
+    ) -> DataFrame | Series:
+        """return the results for the columns"""
+        result: DataFrame | Series
+
+        # we have requested to expand
+        if self.result_type == "expand":
+            result = self.infer_to_same_shape(results, res_index)
+
+        # we have a non-series and don't want inference
+        elif not isinstance(results[0], ABCSeries):
+            result = self.obj._constructor_sliced(results)
+            result.index = res_index
+
+        # we may want to infer results
+        else:
+            result = self.infer_to_same_shape(results, res_index)
+
+        return result
+
+    def infer_to_same_shape(self, results: ResType, res_index: Index) -> DataFrame:
+        """infer the results to the same shape as the input object"""
+        result = self.obj._constructor(data=results)
+        result = result.T
+
+        # set the index
+        result.index = res_index
+
+        # infer dtypes
+        result = result.infer_objects()
+
+        return result
+
+
+class SeriesApply(NDFrameApply):
+    obj: Series
+    axis: AxisInt = 0
+    by_row: Literal[False, "compat", "_compat"]  # only relevant for apply()
+
+    def __init__(
+        self,
+        obj: Series,
+        func: AggFuncType,
+        *,
+        by_row: Literal[False, "compat", "_compat"] = "compat",
+        args,
+        kwargs,
+    ) -> None:
+        super().__init__(
+            obj,
+            func,
+            raw=False,
+            result_type=None,
+            by_row=by_row,
+            args=args,
+            kwargs=kwargs,
+        )
+
+    def apply(self) -> DataFrame | Series:
+        obj = self.obj
+
+        if len(obj) == 0:
+            return self.apply_empty_result()
+
+        # dispatch to handle list-like or dict-like
+        if is_list_like(self.func):
+            return self.apply_list_or_dict_like()
+
+        if isinstance(self.func, str):
+            # if we are a string, try to dispatch
+            return self.apply_str()
+
+        if self.by_row == "_compat":
+            return self.apply_compat()
+
+        # self.func is Callable
+        return self.apply_standard()
+
+    def agg(self):
+        result = super().agg()
+        if result is None:
+            obj = self.obj
+            func = self.func
+            # string, list-like, and dict-like are entirely handled in super
+            assert callable(func)
+            result = func(obj, *self.args, **self.kwargs)
+        return result
+
+    def apply_empty_result(self) -> Series:
+        obj = self.obj
+        return obj._constructor(dtype=obj.dtype, index=obj.index).__finalize__(
+            obj, method="apply"
+        )
+
+    def apply_compat(self):
+        """compat apply method for funcs in listlikes and dictlikes.
+
+         Used for each callable when giving listlikes and dictlikes of callables to
+         apply. Needed for compatibility with Pandas < v2.1.
+
+        .. versionadded:: 2.1.0
+        """
+        obj = self.obj
+        func = self.func
+
+        if callable(func):
+            f = com.get_cython_func(func)
+            if f and not self.args and not self.kwargs:
+                return obj.apply(func, by_row=False)
+
+        try:
+            result = obj.apply(func, by_row="compat")
+        except (ValueError, AttributeError, TypeError):
+            result = obj.apply(func, by_row=False)
+        return result
+
+    def apply_standard(self) -> DataFrame | Series:
+        # caller is responsible for ensuring that f is Callable
+        func = cast(Callable, self.func)
+        obj = self.obj
+
+        if isinstance(func, np.ufunc):
+            with np.errstate(all="ignore"):
+                return func(obj, *self.args, **self.kwargs)
+        elif not self.by_row:
+            return func(obj, *self.args, **self.kwargs)
+
+        if self.args or self.kwargs:
+            # _map_values does not support args/kwargs
+            def curried(x):
+                return func(x, *self.args, **self.kwargs)
+
+        else:
+            curried = func
+        mapped = obj._map_values(mapper=curried)
+
+        if len(mapped) and isinstance(mapped[0], ABCSeries):
+            # GH#43986 Need to do list(mapped) in order to get treated as nested
+            #  See also GH#25959 regarding EA support
+            return obj._constructor_expanddim(list(mapped), index=obj.index)
+        else:
+            return obj._constructor(mapped, index=obj.index).__finalize__(
+                obj, method="apply"
+            )
+
+
+class GroupByApply(Apply):
+    obj: GroupBy | Resampler | BaseWindow
+
+    def __init__(
+        self,
+        obj: GroupBy[NDFrameT],
+        func: AggFuncType,
+        *,
+        args,
+        kwargs,
+    ) -> None:
+        kwargs = kwargs.copy()
+        self.axis = obj.obj._get_axis_number(kwargs.get("axis", 0))
+        super().__init__(
+            obj,
+            func,
+            raw=False,
+            result_type=None,
+            args=args,
+            kwargs=kwargs,
+        )
+
+    def apply(self):
+        raise NotImplementedError
+
+    def transform(self):
+        raise NotImplementedError
+
+    def agg_or_apply_list_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
+        obj = self.obj
+        kwargs = self.kwargs
+        if op_name == "apply":
+            kwargs = {**kwargs, "by_row": False}
+
+        if getattr(obj, "axis", 0) == 1:
+            raise NotImplementedError("axis other than 0 is not supported")
+
+        if obj._selected_obj.ndim == 1:
+            # For SeriesGroupBy this matches _obj_with_exclusions
+            selected_obj = obj._selected_obj
+        else:
+            selected_obj = obj._obj_with_exclusions
+
+        # Only set as_index=True on groupby objects, not Window or Resample
+        # that inherit from this class.
+        with com.temp_setattr(
+            obj, "as_index", True, condition=hasattr(obj, "as_index")
+        ):
+            keys, results = self.compute_list_like(op_name, selected_obj, kwargs)
+        result = self.wrap_results_list_like(keys, results)
+        return result
+
+    def agg_or_apply_dict_like(
+        self, op_name: Literal["agg", "apply"]
+    ) -> DataFrame | Series:
+        from pandas.core.groupby.generic import (
+            DataFrameGroupBy,
+            SeriesGroupBy,
+        )
+
+        assert op_name in ["agg", "apply"]
+
+        obj = self.obj
+        kwargs: dict[str, Any] = {}
+        if op_name == "apply":
+            by_row = "_compat" if self.by_row else False
+            kwargs.update({"by_row": by_row})
+
+        if getattr(obj, "axis", 0) == 1:
+            raise NotImplementedError("axis other than 0 is not supported")
+
+        selected_obj = obj._selected_obj
+        selection = obj._selection
+
+        is_groupby = isinstance(obj, (DataFrameGroupBy, SeriesGroupBy))
+
+        # Numba Groupby engine/engine-kwargs passthrough
+        if is_groupby:
+            engine = self.kwargs.get("engine", None)
+            engine_kwargs = self.kwargs.get("engine_kwargs", None)
+            kwargs.update({"engine": engine, "engine_kwargs": engine_kwargs})
+
+        with com.temp_setattr(
+            obj, "as_index", True, condition=hasattr(obj, "as_index")
+        ):
+            result_index, result_data = self.compute_dict_like(
+                op_name, selected_obj, selection, kwargs
+            )
+        result = self.wrap_results_dict_like(selected_obj, result_index, result_data)
+        return result
+
+
+class ResamplerWindowApply(GroupByApply):
+    axis: AxisInt = 0
+    obj: Resampler | BaseWindow
+
+    def __init__(
+        self,
+        obj: Resampler | BaseWindow,
+        func: AggFuncType,
+        *,
+        args,
+        kwargs,
+    ) -> None:
+        super(GroupByApply, self).__init__(
+            obj,
+            func,
+            raw=False,
+            result_type=None,
+            args=args,
+            kwargs=kwargs,
+        )
+
+    def apply(self):
+        raise NotImplementedError
+
+    def transform(self):
+        raise NotImplementedError
+
+
+def reconstruct_func(
+    func: AggFuncType | None, **kwargs
+) -> tuple[bool, AggFuncType, tuple[str, ...] | None, npt.NDArray[np.intp] | None]:
+    """
+    This is the internal function to reconstruct func given if there is relabeling
+    or not and also normalize the keyword to get new order of columns.
+
+    If named aggregation is applied, `func` will be None, and kwargs contains the
+    column and aggregation function information to be parsed;
+    If named aggregation is not applied, `func` is either string (e.g. 'min') or
+    Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name
+    and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]})
+
+    If relabeling is True, will return relabeling, reconstructed func, column
+    names, and the reconstructed order of columns.
+    If relabeling is False, the columns and order will be None.
+
+    Parameters
+    ----------
+    func: agg function (e.g. 'min' or Callable) or list of agg functions
+        (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}).
+    **kwargs: dict, kwargs used in is_multi_agg_with_relabel and
+        normalize_keyword_aggregation function for relabelling
+
+    Returns
+    -------
+    relabelling: bool, if there is relabelling or not
+    func: normalized and mangled func
+    columns: tuple of column names
+    order: array of columns indices
+
+    Examples
+    --------
+    >>> reconstruct_func(None, **{"foo": ("col", "min")})
+    (True, defaultdict(<class 'list'>, {'col': ['min']}), ('foo',), array([0]))
+
+    >>> reconstruct_func("min")
+    (False, 'min', None, None)
+    """
+    from pandas.core.groupby.generic import NamedAgg
+
+    relabeling = func is None and (
+        is_multi_agg_with_relabel(**kwargs)
+        or any(isinstance(v, NamedAgg) for v in kwargs.values())
+    )
+
+    columns: tuple[str, ...] | None = None
+    order: npt.NDArray[np.intp] | None = None
+
+    if not relabeling:
+        if isinstance(func, list) and len(func) > len(set(func)):
+            # GH 28426 will raise error if duplicated function names are used and
+            # there is no reassigned name
+            raise SpecificationError(
+                "Function names must be unique if there is no new column names assigned"
+            )
+        if func is None:
+            # nicer error message
+            raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).")
+
+    if relabeling:
+        # error: Incompatible types in assignment (expression has type
+        # "MutableMapping[Hashable, list[Callable[..., Any] | str]]", variable has type
+        # "Callable[..., Any] | str | list[Callable[..., Any] | str] |
+        # MutableMapping[Hashable, Callable[..., Any] | str | list[Callable[..., Any] |
+        # str]] | None")
+        converted_kwargs = {}
+        for key, val in kwargs.items():
+            if isinstance(val, NamedAgg):
+                aggfunc = val.aggfunc
+                if val.args or val.kwargs:
+                    aggfunc = lambda x, func=aggfunc, a=val.args, kw=val.kwargs: func(
+                        x, *a, **kw
+                    )
+                converted_kwargs[key] = (val.column, aggfunc)
+            else:
+                converted_kwargs[key] = val
+
+        func, columns, order = normalize_keyword_aggregation(  # type: ignore[assignment]
+            converted_kwargs
+        )
+
+    assert func is not None
+
+    return relabeling, func, columns, order
+
+
+def is_multi_agg_with_relabel(**kwargs) -> bool:
+    """
+    Check whether kwargs passed to .agg look like multi-agg with relabeling.
+
+    Parameters
+    ----------
+    **kwargs : dict
+
+    Returns
+    -------
+    bool
+
+    Examples
+    --------
+    >>> is_multi_agg_with_relabel(a="max")
+    False
+    >>> is_multi_agg_with_relabel(a_max=("a", "max"), a_min=("a", "min"))
+    True
+    >>> is_multi_agg_with_relabel()
+    False
+    """
+    return all(isinstance(v, tuple) and len(v) == 2 for v in kwargs.values()) and (
+        len(kwargs) > 0
+    )
+
+
+def normalize_keyword_aggregation(
+    kwargs: dict,
+) -> tuple[
+    MutableMapping[Hashable, list[AggFuncTypeBase]],
+    tuple[str, ...],
+    npt.NDArray[np.intp],
+]:
+    """
+    Normalize user-provided "named aggregation" kwargs.
+    Transforms from the new ``Mapping[str, NamedAgg]`` style kwargs
+    to the old Dict[str, List[scalar]]].
+
+    Parameters
+    ----------
+    kwargs : dict
+
+    Returns
+    -------
+    aggspec : dict
+        The transformed kwargs.
+    columns : tuple[str, ...]
+        The user-provided keys.
+    col_idx_order : List[int]
+        List of columns indices.
+
+    Examples
+    --------
+    >>> normalize_keyword_aggregation({"output": ("input", "sum")})
+    (defaultdict(<class 'list'>, {'input': ['sum']}), ('output',), array([0]))
+    """
+    from pandas.core.indexes.base import Index
+
+    # Normalize the aggregation functions as Mapping[column, List[func]],
+    # process normally, then fixup the names.
+    # TODO: aggspec type: typing.Dict[str, List[AggScalar]]
+    aggspec = defaultdict(list)
+    order = []
+    columns = tuple(kwargs.keys())
+
+    for column, aggfunc in kwargs.values():
+        aggspec[column].append(aggfunc)
+        order.append((column, com.get_callable_name(aggfunc) or aggfunc))
+
+    # uniquify aggfunc name if duplicated in order list
+    uniquified_order = _make_unique_kwarg_list(order)
+
+    # GH 25719, due to aggspec will change the order of assigned columns in aggregation
+    # uniquified_aggspec will store uniquified order list and will compare it with order
+    # based on index
+    aggspec_order = [
+        (column, com.get_callable_name(aggfunc) or aggfunc)
+        for column, aggfuncs in aggspec.items()
+        for aggfunc in aggfuncs
+    ]
+    uniquified_aggspec = _make_unique_kwarg_list(aggspec_order)
+
+    # get the new index of columns by comparison
+    col_idx_order = Index(uniquified_aggspec).get_indexer(uniquified_order)
+    return aggspec, columns, col_idx_order
+
+
+def _make_unique_kwarg_list(
+    seq: Sequence[tuple[Any, Any]],
+) -> Sequence[tuple[Any, Any]]:
+    """
+    Uniquify aggfunc name of the pairs in the order list
+
+    Examples:
+    --------
+    >>> kwarg_list = [("a", "<lambda>"), ("a", "<lambda>"), ("b", "<lambda>")]
+    >>> _make_unique_kwarg_list(kwarg_list)
+    [('a', '<lambda>_0'), ('a', '<lambda>_1'), ('b', '<lambda>')]
+    """
+    return [
+        (pair[0], f"{pair[1]}_{seq[:i].count(pair)}") if seq.count(pair) > 1 else pair
+        for i, pair in enumerate(seq)
+    ]
+
+
+def relabel_result(
+    result: DataFrame | Series,
+    func: dict[str, list[Callable | str]],
+    columns: Iterable[Hashable],
+    order: Iterable[int],
+) -> dict[Hashable, Series]:
+    """
+    Internal function to reorder result if relabelling is True for
+    dataframe.agg, and return the reordered result in dict.
+
+    Parameters:
+    ----------
+    result: Result from aggregation
+    func: Dict of (column name, funcs)
+    columns: New columns name for relabelling
+    order: New order for relabelling
+
+    Examples
+    --------
+    >>> from pandas.core.apply import relabel_result
+    >>> result = pd.DataFrame(
+    ...     {"A": [np.nan, 2, np.nan], "C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]},
+    ...     index=["max", "mean", "min"],
+    ... )
+    >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]}
+    >>> columns = ("foo", "aab", "bar", "dat")
+    >>> order = [0, 1, 2, 3]
+    >>> result_in_dict = relabel_result(result, funcs, columns, order)
+    >>> pd.DataFrame(result_in_dict, index=columns)
+           A    C    B
+    foo  2.0  NaN  NaN
+    aab  NaN  6.0  NaN
+    bar  NaN  NaN  4.0
+    dat  NaN  NaN  2.5
+    """
+    from pandas.core.indexes.base import Index
+
+    reordered_indexes = [
+        pair[0] for pair in sorted(zip(columns, order, strict=True), key=lambda t: t[1])
+    ]
+    reordered_result_in_dict: dict[Hashable, Series] = {}
+    idx = 0
+
+    reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1
+    for col, fun in func.items():
+        s = result[col].dropna()
+
+        # In the `_aggregate`, the callable names are obtained and used in `result`, and
+        # these names are ordered alphabetically. e.g.
+        #           C2   C1
+        # <lambda>   1  NaN
+        # amax     NaN  4.0
+        # max      NaN  4.0
+        # sum     18.0  6.0
+        # Therefore, the order of functions for each column could be shuffled
+        # accordingly so need to get the callable name if it is not parsed names, and
+        # reorder the aggregated result for each column.
+        # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is
+        # [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to
+        # reorder so that aggregated values map to their functions regarding the order.
+
+        # However there is only one column being used for aggregation, not need to
+        # reorder since the index is not sorted, and keep as is in `funcs`, e.g.
+        #         A
+        # min   1.0
+        # mean  1.5
+        # mean  1.5
+        if reorder_mask:
+            fun = [
+                com.get_callable_name(f) if not isinstance(f, str) else f for f in fun
+            ]
+            col_idx_order = Index(s.index, copy=False).get_indexer(fun)
+            valid_idx = col_idx_order != -1
+            if valid_idx.any():
+                s = s.iloc[col_idx_order[valid_idx]]
+        # assign the new user-provided "named aggregation" as index names, and reindex
+        # it based on the whole user-provided names.
+        if not s.empty:
+            s.index = reordered_indexes[idx : idx + len(fun)]
+        reordered_result_in_dict[col] = s.reindex(columns)
+        idx = idx + len(fun)
+    return reordered_result_in_dict
+
+
+def reconstruct_and_relabel_result(result, func, **kwargs) -> DataFrame | Series:
+    from pandas import DataFrame
+
+    relabeling, func, columns, order = reconstruct_func(func, **kwargs)
+
+    if relabeling:
+        # This is to keep the order to columns occurrence unchanged, and also
+        # keep the order of new columns occurrence unchanged
+
+        # For the return values of reconstruct_func, if relabeling is
+        # False, columns and order will be None.
+        assert columns is not None
+        assert order is not None
+
+        result_in_dict = relabel_result(result, func, columns, order)
+        result = DataFrame(result_in_dict, index=columns)
+
+    return result
+
+
+# TODO: Can't use, because mypy doesn't like us setting __name__
+#   error: "partial[Any]" has no attribute "__name__"
+# the type is:
+#   typing.Sequence[Callable[..., ScalarResult]]
+#     -> typing.Sequence[Callable[..., ScalarResult]]:
+
+
+def _managle_lambda_list(aggfuncs: Sequence[Any]) -> Sequence[Any]:
+    """
+    Possibly mangle a list of aggfuncs.
+
+    Parameters
+    ----------
+    aggfuncs : Sequence
+
+    Returns
+    -------
+    mangled: list-like
+        A new AggSpec sequence, where lambdas have been converted
+        to have unique names.
+
+    Notes
+    -----
+    If just one aggfunc is passed, the name will not be mangled.
+    """
+    if len(aggfuncs) <= 1:
+        # don't mangle for .agg([lambda x: .])
+        return aggfuncs
+    i = 0
+    mangled_aggfuncs = []
+    for aggfunc in aggfuncs:
+        if com.get_callable_name(aggfunc) == "<lambda>":
+            aggfunc = partial(aggfunc)
+            # error: "partial[Any]" has no attribute "__name__"; maybe "__new__"?
+            aggfunc.__name__ = f"<lambda_{i}>"  # type: ignore[attr-defined]
+            i += 1
+        mangled_aggfuncs.append(aggfunc)
+
+    return mangled_aggfuncs
+
+
+def maybe_mangle_lambdas(agg_spec: Any) -> Any:
+    """
+    Make new lambdas with unique names.
+
+    Parameters
+    ----------
+    agg_spec : Any
+        An argument to GroupBy.agg.
+        Non-dict-like `agg_spec` are pass through as is.
+        For dict-like `agg_spec` a new spec is returned
+        with name-mangled lambdas.
+
+    Returns
+    -------
+    mangled : Any
+        Same type as the input.
+
+    Examples
+    --------
+    >>> maybe_mangle_lambdas("sum")
+    'sum'
+    >>> maybe_mangle_lambdas([lambda: 1, lambda: 2])  # doctest: +SKIP
+    [<function __main__.<lambda_0>,
+     <function pandas...._make_lambda.<locals>.f(*args, **kwargs)>]
+    """
+    is_dict = is_dict_like(agg_spec)
+    if not (is_dict or is_list_like(agg_spec)):
+        return agg_spec
+    mangled_aggspec = type(agg_spec)()  # dict or OrderedDict
+
+    if is_dict:
+        for key, aggfuncs in agg_spec.items():
+            if is_list_like(aggfuncs) and not is_dict_like(aggfuncs):
+                mangled_aggfuncs = _managle_lambda_list(aggfuncs)
+            else:
+                mangled_aggfuncs = aggfuncs
+
+            mangled_aggspec[key] = mangled_aggfuncs
+    else:
+        mangled_aggspec = _managle_lambda_list(agg_spec)
+
+    return mangled_aggspec
+
+
+def validate_func_kwargs(
+    kwargs: dict,
+) -> tuple[list[str], list[str | Callable[..., Any]]]:
+    """
+    Validates types of user-provided "named aggregation" kwargs.
+    `TypeError` is raised if aggfunc is not `str` or callable.
+
+    Parameters
+    ----------
+    kwargs : dict
+
+    Returns
+    -------
+    columns : List[str]
+        List of user-provided keys.
+    func : List[Union[str, callable[...,Any]]]
+        List of user-provided aggfuncs
+
+    Examples
+    --------
+    >>> validate_func_kwargs({"one": "min", "two": "max"})
+    (['one', 'two'], ['min', 'max'])
+    """
+    tuple_given_message = "func is expected but received {} in **kwargs."
+    columns = list(kwargs)
+    func = []
+    for col_func in kwargs.values():
+        if not (isinstance(col_func, str) or callable(col_func)):
+            raise TypeError(tuple_given_message.format(type(col_func).__name__))
+        func.append(col_func)
+    if not columns:
+        no_arg_message = "Must provide 'func' or named aggregation **kwargs."
+        raise TypeError(no_arg_message)
+    return columns, func
+
+
+def include_axis(op_name: Literal["agg", "apply"], colg: Series | DataFrame) -> bool:
+    return isinstance(colg, ABCDataFrame) or (
+        isinstance(colg, ABCSeries) and op_name == "agg"
+    )
diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py
new file mode 100644
index 0000000000000000000000000000000000000000..5244f86e47318b4e8895a9271161ee2aea50ee10
--- /dev/null
+++ b/pandas/core/arraylike.py
@@ -0,0 +1,534 @@
+"""
+Methods that can be shared by many array-like classes or subclasses:
+    Series
+    Index
+    ExtensionArray
+"""
+
+from __future__ import annotations
+
+import operator
+from typing import Any
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
+
+from pandas.core.dtypes.cast import maybe_unbox_numpy_scalar
+from pandas.core.dtypes.generic import ABCNDFrame
+
+from pandas.core import roperator
+from pandas.core.construction import extract_array
+from pandas.core.ops.common import unpack_zerodim_and_defer
+
+REDUCTION_ALIASES = {
+    "maximum": "max",
+    "minimum": "min",
+    "add": "sum",
+    "multiply": "prod",
+}
+
+
+class OpsMixin:
+    # -------------------------------------------------------------
+    # Comparisons
+
+    def _cmp_method(self, other, op):
+        return NotImplemented
+
+    @unpack_zerodim_and_defer("__eq__")
+    def __eq__(self, other):
+        return self._cmp_method(other, operator.eq)
+
+    @unpack_zerodim_and_defer("__ne__")
+    def __ne__(self, other):
+        return self._cmp_method(other, operator.ne)
+
+    @unpack_zerodim_and_defer("__lt__")
+    def __lt__(self, other):
+        return self._cmp_method(other, operator.lt)
+
+    @unpack_zerodim_and_defer("__le__")
+    def __le__(self, other):
+        return self._cmp_method(other, operator.le)
+
+    @unpack_zerodim_and_defer("__gt__")
+    def __gt__(self, other):
+        return self._cmp_method(other, operator.gt)
+
+    @unpack_zerodim_and_defer("__ge__")
+    def __ge__(self, other):
+        return self._cmp_method(other, operator.ge)
+
+    # -------------------------------------------------------------
+    # Logical Methods
+
+    def _logical_method(self, other, op):
+        return NotImplemented
+
+    @unpack_zerodim_and_defer("__and__")
+    def __and__(self, other):
+        return self._logical_method(other, operator.and_)
+
+    @unpack_zerodim_and_defer("__rand__")
+    def __rand__(self, other):
+        return self._logical_method(other, roperator.rand_)
+
+    @unpack_zerodim_and_defer("__or__")
+    def __or__(self, other):
+        return self._logical_method(other, operator.or_)
+
+    @unpack_zerodim_and_defer("__ror__")
+    def __ror__(self, other):
+        return self._logical_method(other, roperator.ror_)
+
+    @unpack_zerodim_and_defer("__xor__")
+    def __xor__(self, other):
+        return self._logical_method(other, operator.xor)
+
+    @unpack_zerodim_and_defer("__rxor__")
+    def __rxor__(self, other):
+        return self._logical_method(other, roperator.rxor)
+
+    # -------------------------------------------------------------
+    # Arithmetic Methods
+
+    def _arith_method(self, other, op):
+        return NotImplemented
+
+    @unpack_zerodim_and_defer("__add__")
+    def __add__(self, other):
+        """
+        Get Addition of DataFrame and other, column-wise.
+
+        Equivalent to ``DataFrame.add(other)``.
+
+        Parameters
+        ----------
+        other : scalar, sequence, Series, dict or DataFrame
+            Object to be added to the DataFrame.
+
+        Returns
+        -------
+        DataFrame
+            The result of adding ``other`` to DataFrame.
+
+        See Also
+        --------
+        DataFrame.add : Add a DataFrame and another object, with option for index-
+            or column-oriented addition.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"height": [1.5, 2.6], "weight": [500, 800]}, index=["elk", "moose"]
+        ... )
+        >>> df
+               height  weight
+        elk       1.5     500
+        moose     2.6     800
+
+        Adding a scalar affects all rows and columns.
+
+        >>> df[["height", "weight"]] + 1.5
+               height  weight
+        elk       3.0   501.5
+        moose     4.1   801.5
+
+        Each element of a list is added to a column of the DataFrame, in order.
+
+        >>> df[["height", "weight"]] + [0.5, 1.5]
+               height  weight
+        elk       2.0   501.5
+        moose     3.1   801.5
+
+        Keys of a dictionary are aligned to the DataFrame, based on column names;
+        each value in the dictionary is added to the corresponding column.
+
+        >>> df[["height", "weight"]] + {"height": 0.5, "weight": 1.5}
+               height  weight
+        elk       2.0   501.5
+        moose     3.1   801.5
+
+        When `other` is a :class:`Series`, the index of `other` is aligned with the
+        columns of the DataFrame.
+
+        >>> s1 = pd.Series([0.5, 1.5], index=["weight", "height"])
+        >>> df[["height", "weight"]] + s1
+               height  weight
+        elk       3.0   500.5
+        moose     4.1   800.5
+
+        Even when the index of `other` is the same as the index of the DataFrame,
+        the :class:`Series` will not be reoriented. If index-wise alignment is desired,
+        :meth:`DataFrame.add` should be used with `axis='index'`.
+
+        >>> s2 = pd.Series([0.5, 1.5], index=["elk", "moose"])
+        >>> df[["height", "weight"]] + s2
+               elk  height  moose  weight
+        elk    NaN     NaN    NaN     NaN
+        moose  NaN     NaN    NaN     NaN
+
+        >>> df[["height", "weight"]].add(s2, axis="index")
+               height  weight
+        elk       2.0   500.5
+        moose     4.1   801.5
+
+        When `other` is a :class:`DataFrame`, both columns names and the
+        index are aligned.
+
+        >>> other = pd.DataFrame(
+        ...     {"height": [0.2, 0.4, 0.6]}, index=["elk", "moose", "deer"]
+        ... )
+        >>> df[["height", "weight"]] + other
+               height  weight
+        deer      NaN     NaN
+        elk       1.7     NaN
+        moose     3.0     NaN
+        """
+        return self._arith_method(other, operator.add)
+
+    @unpack_zerodim_and_defer("__radd__")
+    def __radd__(self, other):
+        return self._arith_method(other, roperator.radd)
+
+    @unpack_zerodim_and_defer("__sub__")
+    def __sub__(self, other):
+        return self._arith_method(other, operator.sub)
+
+    @unpack_zerodim_and_defer("__rsub__")
+    def __rsub__(self, other):
+        return self._arith_method(other, roperator.rsub)
+
+    @unpack_zerodim_and_defer("__mul__")
+    def __mul__(self, other):
+        return self._arith_method(other, operator.mul)
+
+    @unpack_zerodim_and_defer("__rmul__")
+    def __rmul__(self, other):
+        return self._arith_method(other, roperator.rmul)
+
+    @unpack_zerodim_and_defer("__truediv__")
+    def __truediv__(self, other):
+        return self._arith_method(other, operator.truediv)
+
+    @unpack_zerodim_and_defer("__rtruediv__")
+    def __rtruediv__(self, other):
+        return self._arith_method(other, roperator.rtruediv)
+
+    @unpack_zerodim_and_defer("__floordiv__")
+    def __floordiv__(self, other):
+        return self._arith_method(other, operator.floordiv)
+
+    @unpack_zerodim_and_defer("__rfloordiv")
+    def __rfloordiv__(self, other):
+        return self._arith_method(other, roperator.rfloordiv)
+
+    @unpack_zerodim_and_defer("__mod__")
+    def __mod__(self, other):
+        return self._arith_method(other, operator.mod)
+
+    @unpack_zerodim_and_defer("__rmod__")
+    def __rmod__(self, other):
+        return self._arith_method(other, roperator.rmod)
+
+    @unpack_zerodim_and_defer("__divmod__")
+    def __divmod__(self, other):
+        return self._arith_method(other, divmod)
+
+    @unpack_zerodim_and_defer("__rdivmod__")
+    def __rdivmod__(self, other):
+        return self._arith_method(other, roperator.rdivmod)
+
+    @unpack_zerodim_and_defer("__pow__")
+    def __pow__(self, other):
+        return self._arith_method(other, operator.pow)
+
+    @unpack_zerodim_and_defer("__rpow__")
+    def __rpow__(self, other):
+        return self._arith_method(other, roperator.rpow)
+
+
+# -----------------------------------------------------------------------------
+# Helpers to implement __array_ufunc__
+
+
+def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any):
+    """
+    Compatibility with numpy ufuncs.
+
+    See also
+    --------
+    numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__
+    """
+    from pandas.core.frame import (
+        DataFrame,
+        Series,
+    )
+    from pandas.core.generic import NDFrame
+    from pandas.core.internals import BlockManager
+
+    cls = type(self)
+
+    kwargs = _standardize_out_kwarg(**kwargs)
+
+    # for binary ops, use our custom dunder methods
+    result = maybe_dispatch_ufunc_to_dunder_op(self, ufunc, method, *inputs, **kwargs)
+    if result is not NotImplemented:
+        return result
+
+    # Determine if we should defer.
+    no_defer = (
+        np.ndarray.__array_ufunc__,
+        cls.__array_ufunc__,
+    )
+
+    for item in inputs:
+        higher_priority = (
+            hasattr(item, "__array_priority__")
+            and item.__array_priority__ > self.__array_priority__
+        )
+        has_array_ufunc = (
+            hasattr(item, "__array_ufunc__")
+            and type(item).__array_ufunc__ not in no_defer
+            and not isinstance(item, self._HANDLED_TYPES)
+        )
+        if higher_priority or has_array_ufunc:
+            return NotImplemented
+
+    # align all the inputs.
+    types = tuple(type(x) for x in inputs)
+    alignable = [
+        x for x, t in zip(inputs, types, strict=True) if issubclass(t, NDFrame)
+    ]
+
+    if len(alignable) > 1:
+        # This triggers alignment.
+        # At the moment, there aren't any ufuncs with more than two inputs
+        # so this ends up just being x1.index | x2.index, but we write
+        # it to handle *args.
+        set_types = set(types)
+        if len(set_types) > 1 and {DataFrame, Series}.issubset(set_types):
+            # We currently don't handle ufunc(DataFrame, Series)
+            # well. Previously this raised an internal ValueError. We might
+            # support it someday, so raise a NotImplementedError.
+            raise NotImplementedError(
+                f"Cannot apply ufunc {ufunc} to mixed DataFrame and Series inputs."
+            )
+        axes = self.axes
+        for obj in alignable[1:]:
+            # this relies on the fact that we aren't handling mixed
+            # series / frame ufuncs.
+            for i, (ax1, ax2) in enumerate(zip(axes, obj.axes, strict=True)):
+                axes[i] = ax1.union(ax2)
+
+        reconstruct_axes = dict(zip(self._AXIS_ORDERS, axes, strict=True))
+        inputs = tuple(
+            x.reindex(**reconstruct_axes) if issubclass(t, NDFrame) else x
+            for x, t in zip(inputs, types, strict=True)
+        )
+    else:
+        reconstruct_axes = dict(zip(self._AXIS_ORDERS, self.axes, strict=True))
+
+    if self.ndim == 1:
+        names = {x.name for x in inputs if hasattr(x, "name")}
+        name = names.pop() if len(names) == 1 else None
+        reconstruct_kwargs = {"name": name}
+    else:
+        reconstruct_kwargs = {}
+
+    def reconstruct(result):
+        if ufunc.nout > 1:
+            # np.modf, np.frexp, np.divmod
+            return tuple(_reconstruct(x) for x in result)
+
+        return _reconstruct(result)
+
+    def _reconstruct(result):
+        if lib.is_scalar(result):
+            return result
+
+        if result.ndim != self.ndim:
+            if method == "outer":
+                raise NotImplementedError
+            return result
+        if isinstance(result, BlockManager):
+            # we went through BlockManager.apply e.g. np.sqrt
+            result = self._constructor_from_mgr(result, axes=result.axes)
+        else:
+            # we converted an array, lost our axes
+            result = self._constructor(
+                result, **reconstruct_axes, **reconstruct_kwargs, copy=False
+            )
+        # TODO: When we support multiple values in __finalize__, this
+        # should pass alignable to `__finalize__` instead of self.
+        # Then `np.add(a, b)` would consider attrs from both a and b
+        # when a and b are NDFrames.
+        if len(alignable) == 1:
+            result = result.__finalize__(self)
+        return result
+
+    if "out" in kwargs:
+        # e.g. test_multiindex_get_loc
+        result = dispatch_ufunc_with_out(self, ufunc, method, *inputs, **kwargs)
+        return reconstruct(result)
+
+    if method == "reduce":
+        # e.g. test.series.test_ufunc.test_reduce
+        result = dispatch_reduction_ufunc(self, ufunc, method, *inputs, **kwargs)
+        if result is not NotImplemented:
+            return result
+
+    # We still get here with kwargs `axis` for e.g. np.maximum.accumulate
+    #  and `dtype` and `keepdims` for np.ptp
+
+    if self.ndim > 1 and (len(inputs) > 1 or ufunc.nout > 1):
+        # Just give up on preserving types in the complex case.
+        # In theory we could preserve them for them.
+        # * nout>1 is doable if BlockManager.apply took nout and
+        #   returned a Tuple[BlockManager].
+        # * len(inputs) > 1 is doable when we know that we have
+        #   aligned blocks / dtypes.
+
+        # e.g. my_ufunc, modf, logaddexp, heaviside, subtract, add
+        inputs = tuple(np.asarray(x) for x in inputs)
+        # Note: we can't use default_array_ufunc here bc reindexing means
+        #  that `self` may not be among `inputs`
+        result = getattr(ufunc, method)(*inputs, **kwargs)
+    elif self.ndim == 1:
+        # ufunc(series, ...)
+        inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
+        result = getattr(ufunc, method)(*inputs, **kwargs)
+    # ufunc(dataframe)
+    elif method == "__call__" and not kwargs:
+        # for np.<ufunc>(..) calls
+        # kwargs cannot necessarily be handled block-by-block, so only
+        # take this path if there are no kwargs
+        mgr = inputs[0]._mgr  # pyright: ignore[reportGeneralTypeIssues]
+        result = mgr.apply(getattr(ufunc, method))
+    else:
+        # otherwise specific ufunc methods (eg np.<ufunc>.accumulate(..))
+        # Those can have an axis keyword and thus can't be called block-by-block
+        result = default_array_ufunc(inputs[0], ufunc, method, *inputs, **kwargs)  # pyright: ignore[reportGeneralTypeIssues]
+        # e.g. np.negative (only one reached), with "where" and "out" in kwargs
+
+    result = reconstruct(result)
+    return result
+
+
+def _standardize_out_kwarg(**kwargs) -> dict:
+    """
+    If kwargs contain "out1" and "out2", replace that with a tuple "out"
+
+    np.divmod, np.modf, np.frexp can have either `out=(out1, out2)` or
+    `out1=out1, out2=out2)`
+    """
+    if "out" not in kwargs and "out1" in kwargs and "out2" in kwargs:
+        out1 = kwargs.pop("out1")
+        out2 = kwargs.pop("out2")
+        out = (out1, out2)
+        kwargs["out"] = out
+    return kwargs
+
+
+def dispatch_ufunc_with_out(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
+    """
+    If we have an `out` keyword, then call the ufunc without `out` and then
+    set the result into the given `out`.
+    """
+
+    # Note: we assume _standardize_out_kwarg has already been called.
+    out = kwargs.pop("out")
+    where = kwargs.pop("where", None)
+
+    result = getattr(ufunc, method)(*inputs, **kwargs)
+
+    if result is NotImplemented:
+        return NotImplemented
+
+    if isinstance(result, tuple):
+        # i.e. np.divmod, np.modf, np.frexp
+        if not isinstance(out, tuple) or len(out) != len(result):
+            raise NotImplementedError
+
+        for arr, res in zip(out, result, strict=True):
+            _assign_where(arr, res, where)
+
+        return out
+
+    if isinstance(out, tuple):
+        if len(out) == 1:
+            out = out[0]
+        else:
+            raise NotImplementedError
+
+    _assign_where(out, result, where)
+    return out
+
+
+def _assign_where(out, result, where) -> None:
+    """
+    Set a ufunc result into 'out', masking with a 'where' argument if necessary.
+    """
+    if where is None:
+        # no 'where' arg passed to ufunc
+        out[:] = result
+    else:
+        np.putmask(out, where, result)
+
+
+def default_array_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
+    """
+    Fallback to the behavior we would get if we did not define __array_ufunc__.
+
+    Notes
+    -----
+    We are assuming that `self` is among `inputs`.
+    """
+    if not any(x is self for x in inputs):
+        raise NotImplementedError
+
+    new_inputs = [x if x is not self else np.asarray(x) for x in inputs]
+
+    return getattr(ufunc, method)(*new_inputs, **kwargs)
+
+
+def dispatch_reduction_ufunc(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
+    """
+    Dispatch ufunc reductions to self's reduction methods.
+    """
+    assert method == "reduce"
+
+    if len(inputs) != 1 or inputs[0] is not self:
+        return NotImplemented
+
+    if ufunc.__name__ not in REDUCTION_ALIASES:
+        return NotImplemented
+
+    method_name = REDUCTION_ALIASES[ufunc.__name__]
+
+    # NB: we are assuming that min/max represent minimum/maximum methods,
+    #  which would not be accurate for e.g. Timestamp.min
+    if not hasattr(self, method_name):
+        return NotImplemented
+
+    if self.ndim > 1:
+        if isinstance(self, ABCNDFrame):
+            # TODO: test cases where this doesn't hold, i.e. 2D DTA/TDA
+            kwargs["numeric_only"] = False
+
+        if "axis" not in kwargs:
+            # For DataFrame reductions we don't want the default axis=0
+            # Note: np.min is not a ufunc, but uses array_function_dispatch,
+            #  so calls DataFrame.min (without ever getting here) with the np.min
+            #  default of axis=None, which DataFrame.min catches and changes to axis=0.
+            # np.minimum.reduce(df) gets here bc axis is not in kwargs,
+            #  so we set axis=0 to match the behavior of np.minimum.reduce(df.values)
+            kwargs["axis"] = 0
+
+    # By default, numpy's reductions do not skip NaNs, so we have to
+    #  pass skipna=False
+    result = getattr(self, method_name)(skipna=False, **kwargs)
+    result = maybe_unbox_numpy_scalar(result)
+    return result
diff --git a/pandas/core/base.py b/pandas/core/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..f92558a34756fe59112ec978e53e364db2c15714
--- /dev/null
+++ b/pandas/core/base.py
@@ -0,0 +1,1653 @@
+"""
+Base and utility classes for pandas objects.
+"""
+
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Generic,
+    Literal,
+    Self,
+    cast,
+    final,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._typing import (
+    AxisInt,
+    DtypeObj,
+    IndexLabel,
+    NDFrameT,
+    Shape,
+    npt,
+)
+from pandas.compat import PYPY
+from pandas.compat.numpy import function as nv
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import cache_readonly
+
+from pandas.core.dtypes.cast import can_hold_element
+from pandas.core.dtypes.common import (
+    is_object_dtype,
+    is_scalar,
+)
+from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCIndex,
+    ABCMultiIndex,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    remove_na_arraylike,
+)
+
+from pandas.core import (
+    algorithms,
+    nanops,
+    ops,
+)
+from pandas.core.accessor import DirNamesMixin
+from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays import ExtensionArray
+from pandas.core.construction import (
+    ensure_wrapped_if_datetimelike,
+    extract_array,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Hashable,
+        Iterator,
+    )
+
+    from pandas._typing import (
+        DropKeep,
+        NumpySorter,
+        NumpyValueArrayLike,
+        ScalarLike_co,
+    )
+
+    from pandas import (
+        DataFrame,
+        Index,
+        Series,
+    )
+
+
+class PandasObject(DirNamesMixin):
+    """
+    Base class for various pandas objects.
+    """
+
+    # results from calls to methods decorated with cache_readonly get added to _cache
+    _cache: dict[str, Any]
+
+    @property
+    def _constructor(self) -> type[Self]:
+        """
+        Class constructor (for this class it's just `__class__`).
+        """
+        return type(self)
+
+    def __repr__(self) -> str:
+        """
+        Return a string representation for a particular object.
+        """
+        # Should be overwritten by base classes
+        return object.__repr__(self)
+
+    def _reset_cache(self, key: str | None = None) -> None:
+        """
+        Reset cached properties. If ``key`` is passed, only clears that key.
+        """
+        if not hasattr(self, "_cache"):
+            return
+        if key is None:
+            self._cache.clear()
+        else:
+            self._cache.pop(key, None)
+
+    def __sizeof__(self) -> int:
+        """
+        Generates the total memory usage for an object that returns
+        either a value or Series of values
+        """
+        memory_usage = getattr(self, "memory_usage", None)
+        if memory_usage:
+            mem = memory_usage(deep=True)
+            return int(mem if is_scalar(mem) else mem.sum())
+
+        # no memory_usage attribute, so fall back to object's 'sizeof'
+        return super().__sizeof__()
+
+
+class NoNewAttributesMixin:
+    """
+    Mixin which prevents adding new attributes.
+
+    Prevents additional attributes via xxx.attribute = "something" after a
+    call to `self.__freeze()`. Mainly used to prevent the user from using
+    wrong attributes on an accessor (`Series.cat/.str/.dt`).
+
+    If you really want to add a new attribute at a later time, you need to use
+    `object.__setattr__(self, key, value)`.
+    """
+
+    def _freeze(self) -> None:
+        """
+        Prevents setting additional attributes.
+        """
+        object.__setattr__(self, "__frozen", True)
+
+    # prevent adding any attribute via s.xxx.new_attribute = ...
+    def __setattr__(self, key: str, value) -> None:
+        # _cache is used by a decorator
+        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
+        # because
+        # 1.) getattr is false for attributes that raise errors
+        # 2.) cls.__dict__ doesn't traverse into base classes
+        if getattr(self, "__frozen", False) and not (
+            key == "_cache"
+            or key in type(self).__dict__
+            or getattr(self, key, None) is not None
+        ):
+            raise AttributeError(f"You cannot add any new attribute '{key}'")
+        object.__setattr__(self, key, value)
+
+
+class SelectionMixin(Generic[NDFrameT]):
+    """
+    mixin implementing the selection & aggregation interface on a group-like
+    object sub-classes need to define: obj, exclusions
+    """
+
+    obj: NDFrameT
+    _selection: IndexLabel | None = None
+    exclusions: frozenset[Hashable]
+    _internal_names = ["_cache", "__setstate__"]
+    _internal_names_set = set(_internal_names)
+
+    @final
+    @property
+    def _selection_list(self):
+        if not isinstance(
+            self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
+        ):
+            return [self._selection]
+        return self._selection
+
+    @cache_readonly
+    def _selected_obj(self):
+        if self._selection is None or isinstance(self.obj, ABCSeries):
+            return self.obj
+        else:
+            return self.obj[self._selection]
+
+    @final
+    @cache_readonly
+    def ndim(self) -> int:
+        return self._selected_obj.ndim
+
+    @final
+    @cache_readonly
+    def _obj_with_exclusions(self):
+        if isinstance(self.obj, ABCSeries):
+            return self.obj
+
+        if self._selection is not None:
+            return self.obj[self._selection_list]
+
+        if len(self.exclusions) > 0:
+            # equivalent to `self.obj.drop(self.exclusions, axis=1)
+            #  but this avoids consolidating and making a copy
+            # TODO: following GH#45287 can we now use .drop directly without
+            #  making a copy?
+            return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
+        else:
+            return self.obj
+
+    def __getitem__(self, key):
+        if self._selection is not None:
+            raise IndexError(f"Column(s) {self._selection} already selected")
+
+        if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
+            if len(self.obj.columns.intersection(key)) != len(set(key)):
+                bad_keys = list(set(key).difference(self.obj.columns))
+                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
+            return self._gotitem(list(key), ndim=2)
+
+        else:
+            if key not in self.obj:
+                raise KeyError(f"Column not found: {key}")
+            ndim = self.obj[key].ndim
+            return self._gotitem(key, ndim=ndim)
+
+    def _gotitem(self, key, ndim: int, subset=None):
+        """
+        sub-classes to define
+        return a sliced object
+
+        Parameters
+        ----------
+        key : str / list of selections
+        ndim : {1, 2}
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        raise AbstractMethodError(self)
+
+    @final
+    def _infer_selection(self, key, subset: Series | DataFrame):
+        """
+        Infer the `selection` to pass to our constructor in _gotitem.
+        """
+        # Shared by Rolling and Resample
+        selection = None
+        if subset.ndim == 2 and (
+            (lib.is_scalar(key) and key in subset) or lib.is_list_like(key)
+        ):
+            selection = key
+        elif subset.ndim == 1 and lib.is_scalar(key) and key == subset.name:
+            selection = key
+        return selection
+
+    def aggregate(self, func, *args, **kwargs):
+        raise AbstractMethodError(self)
+
+    agg = aggregate
+
+
+class IndexOpsMixin(OpsMixin):
+    """
+    Common ops mixin to support a unified interface / docs for Series / Index
+    """
+
+    # ndarray compatibility
+    __array_priority__ = 1000
+    _hidden_attrs: frozenset[str] = frozenset(
+        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
+    )
+
+    @property
+    def dtype(self) -> DtypeObj:
+        # must be defined here as a property for mypy
+        raise AbstractMethodError(self)
+
+    @property
+    def _values(self) -> ExtensionArray | np.ndarray:
+        # must be defined here as a property for mypy
+        raise AbstractMethodError(self)
+
+    @final
+    def transpose(self, *args, **kwargs) -> Self:
+        """
+        Return the transpose, which is by definition self.
+
+        Returns
+        -------
+        %(klass)s
+        """
+        nv.validate_transpose(args, kwargs)
+        return self
+
+    T = property(
+        transpose,
+        doc="""
+        Return the transpose, which is by definition self.
+
+        See Also
+        --------
+        Index : Immutable sequence used for indexing and alignment.
+
+        Examples
+        --------
+        For Series:
+
+        >>> s = pd.Series(['Ant', 'Bear', 'Cow'])
+        >>> s
+        0     Ant
+        1    Bear
+        2     Cow
+        dtype: str
+        >>> s.T
+        0     Ant
+        1    Bear
+        2     Cow
+        dtype: str
+
+        For Index:
+
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx.T
+        Index([1, 2, 3], dtype='int64')
+        """,
+    )
+
+    @property
+    def shape(self) -> Shape:
+        """
+        Return a tuple of the shape of the underlying data.
+
+        See Also
+        --------
+        Series.ndim : Number of dimensions of the underlying data.
+        Series.size : Return the number of elements in the underlying data.
+        Series.nbytes : Return the number of bytes in the underlying data.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.shape
+        (3,)
+        """
+        return self._values.shape
+
+    def __len__(self) -> int:
+        # We need this defined here for mypy
+        raise AbstractMethodError(self)
+
+    # Temporarily avoid using `-> Literal[1]:` because of an IPython (jedi) bug
+    # https://github.com/ipython/ipython/issues/14412
+    # https://github.com/davidhalter/jedi/issues/1990
+    @property
+    def ndim(self) -> int:
+        """
+        Number of dimensions of the underlying data, by definition 1.
+
+        See Also
+        --------
+        Series.size: Return the number of elements in the underlying data.
+        Series.shape: Return a tuple of the shape of the underlying data.
+        Series.dtype: Return the dtype object of the underlying data.
+        Series.values: Return Series as ndarray or ndarray-like depending on the dtype.
+
+        Examples
+        --------
+        >>> s = pd.Series(["Ant", "Bear", "Cow"])
+        >>> s
+        0     Ant
+        1    Bear
+        2     Cow
+        dtype: str
+        >>> s.ndim
+        1
+
+        For Index:
+
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Index([1, 2, 3], dtype='int64')
+        >>> idx.ndim
+        1
+        """
+        return 1
+
+    @final
+    def item(self):
+        """
+        Return the first element of the underlying data as a Python scalar.
+
+        Returns
+        -------
+        scalar
+            The first element of Series or Index.
+
+        Raises
+        ------
+        ValueError
+            If the data is not length = 1.
+
+        See Also
+        --------
+        Index.values : Returns an array representing the data in the Index.
+        Series.head : Returns the first `n` rows.
+
+        Examples
+        --------
+        >>> s = pd.Series([1])
+        >>> s.item()
+        1
+
+        For an index:
+
+        >>> s = pd.Series([1], index=["a"])
+        >>> s.index.item()
+        'a'
+        """
+        if len(self) == 1:
+            return next(iter(self))
+        raise ValueError("can only convert an array of size 1 to a Python scalar")
+
+    @property
+    def nbytes(self) -> int:
+        """
+        Return the number of bytes in the underlying data.
+
+        See Also
+        --------
+        Series.ndim : Number of dimensions of the underlying data.
+        Series.size : Return the number of elements in the underlying data.
+
+        Examples
+        --------
+        For Series:
+
+        >>> s = pd.Series(["Ant", "Bear", "Cow"])
+        >>> s
+        0     Ant
+        1    Bear
+        2     Cow
+        dtype: str
+        >>> s.nbytes
+        34
+
+        For Index:
+
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Index([1, 2, 3], dtype='int64')
+        >>> idx.nbytes
+        24
+        """
+        return self._values.nbytes
+
+    @property
+    def size(self) -> int:
+        """
+        Return the number of elements in the underlying data.
+
+        See Also
+        --------
+        Series.ndim: Number of dimensions of the underlying data, by definition 1.
+        Series.shape: Return a tuple of the shape of the underlying data.
+        Series.dtype: Return the dtype object of the underlying data.
+        Series.values: Return Series as ndarray or ndarray-like depending on the dtype.
+
+        Examples
+        --------
+        For Series:
+
+        >>> s = pd.Series(["Ant", "Bear", "Cow"])
+        >>> s
+        0     Ant
+        1    Bear
+        2     Cow
+        dtype: str
+        >>> s.size
+        3
+
+        For Index:
+
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Index([1, 2, 3], dtype='int64')
+        >>> idx.size
+        3
+        """
+        return len(self._values)
+
+    @property
+    def array(self) -> ExtensionArray:
+        """
+        The ExtensionArray of the data backing this Series or Index.
+
+        This property provides direct access to the underlying array data of a
+        Series or Index without requiring conversion to a NumPy array. It
+        returns an ExtensionArray, which is the native storage format for
+        pandas extension dtypes.
+
+        Returns
+        -------
+        ExtensionArray
+            An ExtensionArray of the values stored within. For extension
+            types, this is the actual array. For NumPy native types, this
+            is a thin (no copy) wrapper around :class:`numpy.ndarray`.
+
+            ``.array`` differs from ``.values``, which may require converting
+            the data to a different form.
+
+        See Also
+        --------
+        Index.to_numpy : Similar method that always returns a NumPy array.
+        Series.to_numpy : Similar method that always returns a NumPy array.
+
+        Notes
+        -----
+        This table lays out the different array types for each extension
+        dtype within pandas.
+
+        ================== =============================
+        dtype              array type
+        ================== =============================
+        category           Categorical
+        period             PeriodArray
+        interval           IntervalArray
+        IntegerNA          IntegerArray
+        string             StringArray
+        boolean            BooleanArray
+        datetime64[ns, tz] DatetimeArray
+        ================== =============================
+
+        For any 3rd-party extension types, the array type will be an
+        ExtensionArray.
+
+        For all remaining dtypes ``.array`` will be a
+        :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
+        stored within. If you absolutely need a NumPy array (possibly with
+        copying / coercing data), then use :meth:`Series.to_numpy` instead.
+
+        Examples
+        --------
+        For regular NumPy types like int, and float, a NumpyExtensionArray
+        is returned.
+
+        >>> pd.Series([1, 2, 3]).array
+        <NumpyExtensionArray>
+        [1, 2, 3]
+        Length: 3, dtype: int64
+
+        For extension types, like Categorical, the actual ExtensionArray
+        is returned
+
+        >>> ser = pd.Series(pd.Categorical(["a", "b", "a"]))
+        >>> ser.array
+        ['a', 'b', 'a']
+        Categories (2, str): ['a', 'b']
+        """
+        raise AbstractMethodError(self)
+
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value: object = lib.no_default,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        A NumPy ndarray representing the values in this Series or Index.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :meth:`numpy.asarray`.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
+            a copy is made, even if not strictly necessary.
+        na_value : Any, optional
+            The value to use for missing values. The default value depends
+            on `dtype` and the type of the array.
+        **kwargs
+            Additional keywords passed through to the ``to_numpy`` method
+            of the underlying array (for extension arrays).
+
+        Returns
+        -------
+        numpy.ndarray
+            The NumPy ndarray holding the values from this Series or Index.
+            The dtype of the array may differ. See Notes.
+
+        See Also
+        --------
+        Series.array : Get the actual data stored within.
+        Index.array : Get the actual data stored within.
+        DataFrame.to_numpy : Similar method for DataFrame.
+
+        Notes
+        -----
+        The returned array will be the same up to equality (values equal
+        in `self` will be equal in the returned array; likewise for values
+        that are not equal). When `self` contains an ExtensionArray, the
+        dtype may be different. For example, for a category-dtype Series,
+        ``to_numpy()`` will return a NumPy array and the categorical dtype
+        will be lost.
+
+        For NumPy dtypes, this will be a reference to the actual data stored
+        in this Series or Index (assuming ``copy=False``). Modifying the result
+        in place will modify the data stored in the Series or Index (not that
+        we recommend doing that).
+
+        For extension types, ``to_numpy()`` *may* require copying data and
+        coercing the result to a NumPy type (possibly object), which may be
+        expensive. When you need a no-copy reference to the underlying data,
+        :attr:`Series.array` should be used instead.
+
+        This table lays out the different dtypes and default return types of
+        ``to_numpy()`` for various dtypes within pandas.
+
+        ================== ================================
+        dtype              array type
+        ================== ================================
+        category[T]        ndarray[T] (same dtype as input)
+        period             ndarray[object] (Periods)
+        interval           ndarray[object] (Intervals)
+        IntegerNA          ndarray[object]
+        datetime64[ns]     datetime64[ns]
+        datetime64[ns, tz] ndarray[object] (Timestamps)
+        ================== ================================
+
+        Examples
+        --------
+        >>> ser = pd.Series(pd.Categorical(["a", "b", "a"]))
+        >>> ser.to_numpy()
+        array(['a', 'b', 'a'], dtype=object)
+
+        Specify the `dtype` to control how datetime-aware data is represented.
+        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
+        objects, each with the correct ``tz``.
+
+        >>> ser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))
+        >>> ser.to_numpy(dtype=object)
+        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
+               Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
+              dtype=object)
+
+        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
+        datetime64 values. The values are converted to UTC and the timezone
+        info is dropped.
+
+        >>> ser.to_numpy(dtype="datetime64[ns]")
+        ... # doctest: +ELLIPSIS
+        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
+              dtype='datetime64[ns]')
+        """
+        if isinstance(self.dtype, ExtensionDtype):
+            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
+        elif kwargs:
+            bad_keys = next(iter(kwargs.keys()))
+            raise TypeError(
+                f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
+            )
+
+        fillna = (
+            na_value is not lib.no_default
+            # no need to fillna with np.nan if we already have a float dtype
+            and not (na_value is np.nan and np.issubdtype(self.dtype, np.floating))
+        )
+
+        values = self._values
+        if fillna and self.hasnans:
+            if not can_hold_element(values, na_value):
+                # if we can't hold the na_value asarray either makes a copy or we
+                # error before modifying values. The asarray later on thus won't make
+                # another copy
+                values = np.asarray(values, dtype=dtype)
+            else:
+                values = values.copy()
+
+            values[np.asanyarray(isna(self))] = na_value
+
+        result = np.asarray(values, dtype=dtype)
+
+        if (copy and not fillna) or not copy:
+            if np.shares_memory(self._values[:2], result[:2]):
+                # Take slices to improve performance of check
+                if not copy:
+                    result = result.view()
+                    result.flags.writeable = False
+                else:
+                    result = result.copy()
+
+        return result
+
+    @final
+    @property
+    def empty(self) -> bool:
+        """
+        Indicator whether Index is empty.
+
+        An Index is considered empty if it has no elements. This property can be
+        useful for quickly checking the state of an Index, especially in data
+        processing and analysis workflows where handling of empty datasets might
+        be required.
+
+        Returns
+        -------
+        bool
+            If Index is empty, return True, if not return False.
+
+        See Also
+        --------
+        Index.size : Return the number of elements in the underlying data.
+
+        Examples
+        --------
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Index([1, 2, 3], dtype='int64')
+        >>> idx.empty
+        False
+
+        >>> idx_empty = pd.Index([])
+        >>> idx_empty
+        Index([], dtype='object')
+        >>> idx_empty.empty
+        True
+
+        If we only have NaNs in our DataFrame, it is not considered empty!
+
+        >>> idx = pd.Index([np.nan, np.nan])
+        >>> idx
+        Index([nan, nan], dtype='float64')
+        >>> idx.empty
+        False
+        """
+        return not self.size
+
+    def argmax(
+        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
+    ) -> int:
+        """
+        Return int position of the largest value in the Series.
+
+        If the maximum is achieved in multiple locations,
+        the first row position is returned.
+
+        Parameters
+        ----------
+        axis : None
+            Unused. Parameter needed for compatibility with DataFrame.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire Series is NA, or if ``skipna=False``
+            and there is an NA value, this method will raise a ``ValueError``.
+        *args, **kwargs
+            Additional arguments and keywords for compatibility with NumPy.
+
+        Returns
+        -------
+        int
+            Row position of the maximum value.
+
+        See Also
+        --------
+        Series.argmax : Return position of the maximum value.
+        Series.argmin : Return position of the minimum value.
+        numpy.ndarray.argmax : Equivalent method for numpy arrays.
+        Series.idxmax : Return index label of the maximum values.
+        Series.idxmin : Return index label of the minimum values.
+
+        Examples
+        --------
+        Consider dataset containing cereal calories
+
+        >>> s = pd.Series(
+        ...     [100.0, 110.0, 120.0, 110.0],
+        ...     index=[
+        ...         "Corn Flakes",
+        ...         "Almond Delight",
+        ...         "Cinnamon Toast Crunch",
+        ...         "Cocoa Puff",
+        ...     ],
+        ... )
+        >>> s
+        Corn Flakes              100.0
+        Almond Delight           110.0
+        Cinnamon Toast Crunch    120.0
+        Cocoa Puff               110.0
+        dtype: float64
+
+        >>> s.argmax()
+        np.int64(2)
+        >>> s.argmin()
+        np.int64(0)
+
+        The maximum cereal calories is the third element and
+        the minimum cereal calories is the first element,
+        since series is zero-indexed.
+        """
+        delegate = self._values
+        nv.validate_minmax_axis(axis)
+        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)
+
+        if isinstance(delegate, ExtensionArray):
+            return delegate.argmax(skipna=skipna)
+        else:
+            result = nanops.nanargmax(delegate, skipna=skipna)
+            # error: Incompatible return value type (got "Union[int, ndarray]", expected
+            # "int")
+            return result  # type: ignore[return-value]
+
+    def argmin(
+        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
+    ) -> int:
+        """
+        Return int position of the smallest value in the Series.
+
+        If the minimum is achieved in multiple locations,
+        the first row position is returned.
+
+        Parameters
+        ----------
+        axis : None
+            Unused. Parameter needed for compatibility with DataFrame.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire Series is NA, or if ``skipna=False``
+            and there is an NA value, this method will raise a ``ValueError``.
+        *args, **kwargs
+            Additional arguments and keywords for compatibility with NumPy.
+
+        Returns
+        -------
+        int
+            Row position of the minimum value.
+
+        See Also
+        --------
+        Series.argmin : Return position of the minimum value.
+        Series.argmax : Return position of the maximum value.
+        numpy.ndarray.argmin : Equivalent method for numpy arrays.
+        Series.idxmin : Return index label of the minimum values.
+        Series.idxmax : Return index label of the maximum values.
+
+        Examples
+        --------
+        Consider dataset containing cereal calories
+
+        >>> s = pd.Series(
+        ...     [100.0, 110.0, 120.0, 110.0],
+        ...     index=[
+        ...         "Corn Flakes",
+        ...         "Almond Delight",
+        ...         "Cinnamon Toast Crunch",
+        ...         "Cocoa Puff",
+        ...     ],
+        ... )
+        >>> s
+        Corn Flakes              100.0
+        Almond Delight           110.0
+        Cinnamon Toast Crunch    120.0
+        Cocoa Puff               110.0
+        dtype: float64
+
+        >>> s.argmax()
+        np.int64(2)
+        >>> s.argmin()
+        np.int64(0)
+
+        The maximum cereal calories is the third element and
+        the minimum cereal calories is the first element,
+        since series is zero-indexed.
+        """
+        delegate = self._values
+        nv.validate_minmax_axis(axis)
+        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)
+
+        if isinstance(delegate, ExtensionArray):
+            return delegate.argmin(skipna=skipna)
+        else:
+            result = nanops.nanargmin(delegate, skipna=skipna)
+            # error: Incompatible return value type (got "Union[int, ndarray]", expected
+            # "int")
+            return result  # type: ignore[return-value]
+
+    def tolist(self) -> list:
+        """
+        Return a list of the values.
+
+        These are each a scalar type, which is a Python scalar
+        (for str, int, float) or a pandas scalar
+        (for Timestamp/Timedelta/Interval/Period)
+
+        Returns
+        -------
+        list
+            List containing the values as Python or pandas scalers.
+
+        See Also
+        --------
+        numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
+            nested list of Python scalars.
+
+        Examples
+        --------
+        For Series
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.to_list()
+        [1, 2, 3]
+
+        For Index:
+
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx
+        Index([1, 2, 3], dtype='int64')
+
+        >>> idx.to_list()
+        [1, 2, 3]
+        """
+        return self._values.tolist()
+
+    to_list = tolist
+
+    def __iter__(self) -> Iterator:
+        """
+        Return an iterator of the values.
+
+        These are each a scalar type, which is a Python scalar
+        (for str, int, float) or a pandas scalar
+        (for Timestamp/Timedelta/Interval/Period)
+
+        Returns
+        -------
+        iterator
+            An iterator yielding scalar values from the Series.
+
+        See Also
+        --------
+        Series.items : Lazily iterate over (index, value) tuples.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> for x in s:
+        ...     print(x)
+        1
+        2
+        3
+        """
+        # We are explicitly making element iterators.
+        if not isinstance(self._values, np.ndarray):
+            # Check type instead of dtype to catch DTA/TDA
+            return iter(self._values)
+        else:
+            return map(self._values.item, range(self._values.size))
+
+    @cache_readonly
+    def hasnans(self) -> bool:
+        """
+        Return True if there are any NaNs.
+
+        Enables various performance speedups.
+
+        Returns
+        -------
+        bool
+
+        See Also
+        --------
+        Series.isna : Detect missing values.
+        Series.notna : Detect existing (non-missing) values.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, None])
+        >>> s
+        0    1.0
+        1    2.0
+        2    3.0
+        3    NaN
+        dtype: float64
+        >>> s.hasnans
+        True
+        """
+        # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
+        # has no attribute "any"
+        return bool(isna(self).any())  # type: ignore[union-attr]
+
+    @final
+    def _map_values(self, mapper, na_action=None):
+        """
+        An internal function that maps values using the input
+        correspondence (which can be a dict, Series, or function).
+
+        Parameters
+        ----------
+        mapper : function, dict, or Series
+            The input correspondence object
+        na_action : {None, 'ignore'}
+            If 'ignore', propagate NA values, without passing them to the
+            mapping function
+
+        Returns
+        -------
+        Union[Index, MultiIndex], inferred
+            The output of the mapping function applied to the index.
+            If the function returns a tuple with more than one element
+            a MultiIndex will be returned.
+        """
+        arr = self._values
+
+        if isinstance(arr, ExtensionArray):
+            return arr.map(mapper, na_action=na_action)
+
+        return algorithms.map_array(arr, mapper, na_action=na_action)
+
+    def value_counts(
+        self,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        bins=None,
+        dropna: bool = True,
+    ) -> Series:
+        """
+        Return a Series containing counts of unique values.
+
+        The resulting object will be in descending order so that the
+        first element is the most frequently-occurring element.
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        normalize : bool, default False
+            If True then the object returned will contain the relative
+            frequencies of the unique values.
+        sort : bool, default True
+            Stable sort by frequencies when True. Preserve the order of the data
+            when False.
+
+            .. versionchanged:: 3.0.0
+
+                Prior to 3.0.0, the sort was unstable.
+        ascending : bool, default False
+            Sort in ascending order.
+        bins : int, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for ``pd.cut``, only works with numeric data.
+        dropna : bool, default True
+            Don't include counts of NaN.
+
+        Returns
+        -------
+        Series
+            Series containing counts of unique values.
+
+        See Also
+        --------
+        Series.count: Number of non-NA elements in a Series.
+        DataFrame.count: Number of non-NA elements in a DataFrame.
+        DataFrame.value_counts: Equivalent method on DataFrames.
+
+        Examples
+        --------
+        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
+        >>> index.value_counts()
+        3.0    2
+        1.0    1
+        2.0    1
+        4.0    1
+        Name: count, dtype: int64
+
+        With `normalize` set to `True`, returns the relative frequency by
+        dividing all values by the sum of values.
+
+        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
+        >>> s.value_counts(normalize=True)
+        3.0    0.4
+        1.0    0.2
+        2.0    0.2
+        4.0    0.2
+        Name: proportion, dtype: float64
+
+        **bins**
+
+        Bins can be useful for going from a continuous variable to a
+        categorical variable; instead of counting unique
+        apparitions of values, divide the index in the specified
+        number of half-open bins.
+
+        >>> s.value_counts(bins=3)
+        (0.996, 2.0]    2
+        (2.0, 3.0]      2
+        (3.0, 4.0]      1
+        Name: count, dtype: int64
+
+        **dropna**
+
+        With `dropna` set to `False` we can also see NaN index values.
+
+        >>> s.value_counts(dropna=False)
+        3.0    2
+        1.0    1
+        2.0    1
+        4.0    1
+        NaN    1
+        Name: count, dtype: int64
+
+        **Categorical Dtypes**
+
+        Rows with categorical type will be counted as one group
+        if they have same categories and order.
+        In the example below, even though ``a``, ``c``, and ``d``
+        all have the same data types of ``category``,
+        only ``c`` and ``d`` will be counted as one group
+        since ``a`` doesn't have the same categories.
+
+        >>> df = pd.DataFrame({"a": [1], "b": ["2"], "c": [3], "d": [3]})
+        >>> df = df.astype({"a": "category", "c": "category", "d": "category"})
+        >>> df
+           a  b  c  d
+        0  1  2  3  3
+
+        >>> df.dtypes
+        a    category
+        b      str
+        c    category
+        d    category
+        dtype: object
+
+        >>> df.dtypes.value_counts()
+        category    2
+        category    1
+        str         1
+        Name: count, dtype: int64
+        """
+        return algorithms.value_counts_internal(
+            self,
+            sort=sort,
+            ascending=ascending,
+            normalize=normalize,
+            bins=bins,
+            dropna=dropna,
+        )
+
+    def unique(self):
+        values = self._values
+        if not isinstance(values, np.ndarray):
+            # i.e. ExtensionArray
+            result = values.unique()
+        else:
+            result = algorithms.unique1d(values)  # type: ignore[assignment]
+        return result
+
+    @final
+    def nunique(self, dropna: bool = True) -> int:
+        """
+        Return number of unique elements in the object.
+
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't include NaN in the count.
+
+        Returns
+        -------
+        int
+            An integer indicating the number of unique elements in the object.
+
+        See Also
+        --------
+        DataFrame.nunique: Method nunique for DataFrame.
+        Series.count: Count non-NA/null observations in the Series.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 3, 5, 7, 7])
+        >>> s
+        0    1
+        1    3
+        2    5
+        3    7
+        4    7
+        dtype: int64
+
+        >>> s.nunique()
+        4
+        """
+        uniqs = self.unique()
+        if dropna:
+            uniqs = remove_na_arraylike(uniqs)
+        return len(uniqs)
+
+    @property
+    def is_unique(self) -> bool:
+        """
+        Return True if values in the object are unique.
+
+        Returns
+        -------
+        bool
+
+        See Also
+        --------
+        Series.unique : Return unique values of Series object.
+        Series.drop_duplicates : Return Series with duplicate values removed.
+        Series.duplicated : Indicate duplicate Series values.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.is_unique
+        True
+
+        >>> s = pd.Series([1, 2, 3, 1])
+        >>> s.is_unique
+        False
+        """
+        return self.nunique(dropna=False) == len(self)
+
+    @property
+    def is_monotonic_increasing(self) -> bool:
+        """
+        Return True if values in the object are monotonically increasing.
+
+        Returns
+        -------
+        bool
+
+        See Also
+        --------
+        Series.is_monotonic_decreasing : Return boolean if values in the object are
+            monotonically decreasing.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 2])
+        >>> s.is_monotonic_increasing
+        True
+
+        >>> s = pd.Series([3, 2, 1])
+        >>> s.is_monotonic_increasing
+        False
+        """
+        from pandas import Index
+
+        return Index(self).is_monotonic_increasing
+
+    @property
+    def is_monotonic_decreasing(self) -> bool:
+        """
+        Return True if values in the object are monotonically decreasing.
+
+        Returns
+        -------
+        bool
+
+        See Also
+        --------
+        Series.is_monotonic_increasing : Return boolean if values in the object are
+            monotonically increasing.
+
+        Examples
+        --------
+        >>> s = pd.Series([3, 2, 2, 1])
+        >>> s.is_monotonic_decreasing
+        True
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.is_monotonic_decreasing
+        False
+        """
+        from pandas import Index
+
+        return Index(self).is_monotonic_decreasing
+
+    @final
+    def _memory_usage(self, deep: bool = False) -> int:
+        """
+        Memory usage of the values.
+
+        Parameters
+        ----------
+        deep : bool, default False
+            Introspect the data deeply, interrogate
+            `object` dtypes for system-level memory consumption.
+
+        Returns
+        -------
+        bytes used
+            Returns memory usage of the values in the Index in bytes.
+
+        See Also
+        --------
+        numpy.ndarray.nbytes : Total bytes consumed by the elements of the
+            array.
+
+        Notes
+        -----
+        Memory usage does not include memory consumed by elements that
+        are not components of the array if deep=False or if used on PyPy
+
+        Examples
+        --------
+        >>> idx = pd.Index([1, 2, 3])
+        >>> idx.memory_usage()
+        24
+        """
+        if hasattr(self.array, "memory_usage"):
+            return self.array.memory_usage(  # pyright: ignore[reportAttributeAccessIssue]
+                deep=deep,
+            )
+
+        v = self.array.nbytes
+        if deep and is_object_dtype(self.dtype) and not PYPY:
+            values = cast(np.ndarray, self._values)
+            v += lib.memory_usage_of_objects(values)
+        return v
+
+    def factorize(
+        self,
+        sort: bool = False,
+        use_na_sentinel: bool = True,
+    ) -> tuple[npt.NDArray[np.intp], Index]:
+        """
+        Encode the object as an enumerated type or categorical variable.
+
+        This method is useful for obtaining a numeric representation of an
+        array when all that matters is identifying distinct values. `factorize`
+        is available as both a top-level function :func:`pandas.factorize`,
+        and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
+
+        Parameters
+        ----------
+        sort : bool, default False
+            Sort `uniques` and shuffle `codes` to maintain the
+            relationship.
+        use_na_sentinel : bool, default True
+            If True, the sentinel -1 will be used for NaN values. If False,
+            NaN values will be encoded as non-negative integers and will not drop the
+            NaN from the uniques of the values.
+
+        Returns
+        -------
+        codes : ndarray
+            An integer ndarray that's an indexer into `uniques`.
+            ``uniques.take(codes)`` will have the same values as `values`.
+        uniques : ndarray, Index, or Categorical
+            The unique valid values. When `values` is Categorical, `uniques`
+            is a Categorical. When `values` is some other pandas object, an
+            `Index` is returned. Otherwise, a 1-D ndarray is returned.
+
+            .. note::
+
+                Even if there's a missing value in `values`, `uniques` will
+                *not* contain an entry for it.
+
+        See Also
+        --------
+        cut : Discretize continuous-valued array.
+        unique : Find the unique value in an array.
+
+        Notes
+        -----
+        Reference :ref:`the user guide <reshaping.factorize>` for more examples.
+
+        Examples
+        --------
+        These examples all show factorize as a top-level method like
+        ``pd.factorize(values)``. The results are identical for methods like
+        :meth:`Series.factorize`.
+
+        >>> codes, uniques = pd.factorize(
+        ...     np.array(["b", "b", "a", "c", "b"], dtype="O")
+        ... )
+        >>> codes
+        array([0, 0, 1, 2, 0])
+        >>> uniques
+        array(['b', 'a', 'c'], dtype=object)
+
+        With ``sort=True``, the `uniques` will be sorted, and `codes` will be
+        shuffled so that the relationship is the maintained.
+
+        >>> codes, uniques = pd.factorize(
+        ...     np.array(["b", "b", "a", "c", "b"], dtype="O"), sort=True
+        ... )
+        >>> codes
+        array([1, 1, 0, 2, 1])
+        >>> uniques
+        array(['a', 'b', 'c'], dtype=object)
+
+        When ``use_na_sentinel=True`` (the default), missing values are indicated in
+        the `codes` with the sentinel value ``-1`` and missing values are not
+        included in `uniques`.
+
+        >>> codes, uniques = pd.factorize(
+        ...     np.array(["b", None, "a", "c", "b"], dtype="O")
+        ... )
+        >>> codes
+        array([ 0, -1,  1,  2,  0])
+        >>> uniques
+        array(['b', 'a', 'c'], dtype=object)
+
+        Thus far, we've only factorized lists (which are internally coerced to
+        NumPy arrays). When factorizing pandas objects, the type of `uniques`
+        will differ. For Categoricals, a `Categorical` is returned.
+
+        >>> cat = pd.Categorical(["a", "a", "c"], categories=["a", "b", "c"])
+        >>> codes, uniques = pd.factorize(cat)
+        >>> codes
+        array([0, 0, 1])
+        >>> uniques
+        ['a', 'c']
+        Categories (3, str): ['a', 'b', 'c']
+
+        Notice that ``'b'`` is in ``uniques.categories``, despite not being
+        present in ``cat.values``.
+
+        For all other pandas objects, an Index of the appropriate type is
+        returned.
+
+        >>> cat = pd.Series(["a", "a", "c"])
+        >>> codes, uniques = pd.factorize(cat)
+        >>> codes
+        array([0, 0, 1])
+        >>> uniques
+        Index(['a', 'c'], dtype='str')
+
+        If NaN is in the values, and we want to include NaN in the uniques of the
+        values, it can be achieved by setting ``use_na_sentinel=False``.
+
+        >>> values = np.array([1, 2, 1, np.nan])
+        >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
+        >>> codes
+        array([ 0,  1,  0, -1])
+        >>> uniques
+        array([1., 2.])
+
+        >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
+        >>> codes
+        array([0, 1, 0, 2])
+        >>> uniques
+        array([ 1.,  2., nan])
+        """
+        codes, uniques = algorithms.factorize(
+            self._values, sort=sort, use_na_sentinel=use_na_sentinel
+        )
+        if uniques.dtype == np.float16:
+            uniques = uniques.astype(np.float32)
+
+        if isinstance(self, ABCMultiIndex):
+            # preserve MultiIndex
+            if len(self) == 0:
+                # GH#57517
+                uniques = self[:0]
+            else:
+                uniques = self._constructor(uniques)
+        else:
+            from pandas import Index
+
+            try:
+                uniques = Index(uniques, dtype=self.dtype, copy=False)
+            except NotImplementedError:
+                # not all dtypes are supported in Index that are allowed for Series
+                # e.g. float16 or bytes
+                uniques = Index(uniques, copy=False)
+        return codes, uniques
+
+    # This overload is needed so that the call to searchsorted in
+    # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result
+
+    # error: Overloaded function signatures 1 and 2 overlap with incompatible
+    # return types
+    @overload
+    def searchsorted(  # type: ignore[overload-overlap]
+        self,
+        value: ScalarLike_co,
+        side: Literal["left", "right"] = ...,
+        sorter: NumpySorter = ...,
+    ) -> np.intp: ...
+
+    @overload
+    def searchsorted(
+        self,
+        value: npt.ArrayLike | ExtensionArray,
+        side: Literal["left", "right"] = ...,
+        sorter: NumpySorter = ...,
+    ) -> npt.NDArray[np.intp]: ...
+
+    def searchsorted(
+        self,
+        value: NumpyValueArrayLike | ExtensionArray,
+        side: Literal["left", "right"] = "left",
+        sorter: NumpySorter | None = None,
+    ) -> npt.NDArray[np.intp] | np.intp:
+        """
+        Find indices where elements should be inserted to maintain order.
+
+        Find the indices into a sorted Index `self` such that, if the
+        corresponding elements in `value` were inserted before the indices,
+        the order of `self` would be preserved.
+
+        .. note::
+
+            The Index *must* be monotonically sorted, otherwise
+            wrong locations will likely be returned. Pandas does *not*
+            check this for you.
+
+        Parameters
+        ----------
+        value : array-like or scalar
+            Values to insert into `self`.
+        side : {{'left', 'right'}}, optional
+            If 'left', the index of the first suitable location found is given.
+            If 'right', return the last such index.  If there is no suitable
+            index, return either 0 or N (where N is the length of `self`).
+        sorter : 1-D array-like, optional
+            Optional array of integer indices that sort `self` into ascending
+            order. They are typically the result of ``np.argsort``.
+
+        Returns
+        -------
+        int or array of int
+            A scalar or array of insertion points with the
+            same shape as `value`.
+
+        See Also
+        --------
+        sort_values : Sort by the values along either axis.
+        numpy.searchsorted : Similar method from NumPy.
+
+        Notes
+        -----
+        Binary search is used to find the required insertion points.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1, 2, 3])
+        >>> ser
+        0    1
+        1    2
+        2    3
+        dtype: int64
+
+        >>> ser.searchsorted(4)
+        np.int64(3)
+
+        >>> ser.searchsorted([0, 4])
+        array([0, 3])
+
+        >>> ser.searchsorted([1, 3], side="left")
+        array([0, 2])
+
+        >>> ser.searchsorted([1, 3], side="right")
+        array([1, 3])
+
+        >>> ser = pd.Series(pd.to_datetime(["3/11/2000", "3/12/2000", "3/13/2000"]))
+        >>> ser
+        0   2000-03-11
+        1   2000-03-12
+        2   2000-03-13
+        dtype: datetime64[us]
+
+        >>> ser.searchsorted("3/14/2000")
+        np.int64(3)
+
+        >>> ser = pd.Categorical(
+        ...     ["apple", "bread", "bread", "cheese", "milk"], ordered=True
+        ... )
+        >>> ser
+        ['apple', 'bread', 'bread', 'cheese', 'milk']
+        Categories (4, str): ['apple' < 'bread' < 'cheese' < 'milk']
+
+        >>> ser.searchsorted("bread")
+        np.int64(1)
+
+        >>> ser.searchsorted(["bread"], side="right")
+        array([3])
+
+        If the values are not monotonically sorted, wrong locations
+        may be returned:
+
+        >>> ser = pd.Series([2, 1, 3])
+        >>> ser
+        0    2
+        1    1
+        2    3
+        dtype: int64
+
+        >>> ser.searchsorted(1)  # doctest: +SKIP
+        0  # wrong result, correct would be 1
+        """
+        if isinstance(value, ABCDataFrame):
+            msg = (
+                "Value must be 1-D array-like or scalar, "
+                f"{type(value).__name__} is not supported"
+            )
+            raise ValueError(msg)
+
+        values = self._values
+        if not isinstance(values, np.ndarray):
+            # Going through EA.searchsorted directly improves performance GH#38083
+            return values.searchsorted(value, side=side, sorter=sorter)
+
+        return algorithms.searchsorted(
+            values,
+            value,
+            side=side,
+            sorter=sorter,
+        )
+
+    def drop_duplicates(self, *, keep: DropKeep = "first") -> Self:
+        duplicated = self._duplicated(keep=keep)
+        # error: Value of type "IndexOpsMixin" is not indexable
+        return self[~duplicated]  # type: ignore[index]
+
+    @final
+    def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
+        arr = self._values
+        if isinstance(arr, ExtensionArray):
+            return arr.duplicated(keep=keep)
+        return algorithms.duplicated(arr, keep=keep)
+
+    def _arith_method(self, other, op):
+        res_name = ops.get_op_result_name(self, other)
+
+        lvalues = self._values
+        rvalues = extract_array(other, extract_numpy=True, extract_range=True)
+        rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
+        rvalues = ensure_wrapped_if_datetimelike(rvalues)
+        if isinstance(rvalues, range):
+            rvalues = np.arange(rvalues.start, rvalues.stop, rvalues.step)
+
+        with np.errstate(all="ignore"):
+            result = ops.arithmetic_op(lvalues, rvalues, op)
+
+        return self._construct_result(result, name=res_name, other=other)
+
+    def _construct_result(self, result, name, other):
+        """
+        Construct an appropriately-wrapped result from the ArrayLike result
+        of an arithmetic-like operation.
+        """
+        raise AbstractMethodError(self)
diff --git a/pandas/core/col.py b/pandas/core/col.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2c064a17f2531d23958c1455247977c2895bbce
--- /dev/null
+++ b/pandas/core/col.py
@@ -0,0 +1,374 @@
+from __future__ import annotations
+
+from collections.abc import (
+    Callable,
+    Hashable,
+)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from pandas.util._decorators import set_module
+
+if TYPE_CHECKING:
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+# Used only for generating the str repr of expressions.
+_OP_SYMBOLS = {
+    "__add__": "+",
+    "__radd__": "+",
+    "__sub__": "-",
+    "__rsub__": "-",
+    "__mul__": "*",
+    "__rmul__": "*",
+    "__truediv__": "/",
+    "__rtruediv__": "/",
+    "__floordiv__": "//",
+    "__rfloordiv__": "//",
+    "__mod__": "%",
+    "__rmod__": "%",
+    "__ge__": ">=",
+    "__gt__": ">",
+    "__le__": "<=",
+    "__lt__": "<",
+    "__eq__": "==",
+    "__ne__": "!=",
+    "__and__": "&",
+    "__rand__": "&",
+    "__or__": "|",
+    "__ror__": "|",
+    "__xor__": "^",
+    "__rxor__": "^",
+}
+
+
+def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]:
+    # Parse `args`, evaluating any expressions we encounter.
+    return tuple(
+        [x._eval_expression(df) if isinstance(x, Expression) else x for x in args]
+    )
+
+
+def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]:
+    # Parse `kwargs`, evaluating any expressions we encounter.
+    return {
+        key: val._eval_expression(df) if isinstance(val, Expression) else val
+        for key, val in kwargs.items()
+    }
+
+
+def _pretty_print_args_kwargs(*args: Any, **kwargs: Any) -> str:
+    inputs_repr = ", ".join(repr(arg) for arg in args)
+    kwargs_repr = ", ".join(f"{k}={v!r}" for k, v in kwargs.items())
+
+    all_args = []
+    if inputs_repr:
+        all_args.append(inputs_repr)
+    if kwargs_repr:
+        all_args.append(kwargs_repr)
+
+    return ", ".join(all_args)
+
+
+@set_module("pandas.api.typing")
+class Expression:
+    """
+    Class representing a deferred column.
+
+    This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`.
+    """
+
+    def __init__(
+        self,
+        func: Callable[[DataFrame], Any],
+        repr_str: str,
+        needs_parenthese: bool = False,
+    ) -> None:
+        self._func = func
+        self._repr_str = repr_str
+        self._needs_parentheses = needs_parenthese
+
+    def _eval_expression(self, df: DataFrame) -> Any:
+        return self._func(df)
+
+    def _with_op(
+        self, op: str, other: Any, repr_str: str, needs_parentheses: bool = True
+    ) -> Expression:
+        if isinstance(other, Expression):
+            return Expression(
+                lambda df: getattr(self._eval_expression(df), op)(
+                    other._eval_expression(df)
+                ),
+                repr_str,
+                needs_parenthese=needs_parentheses,
+            )
+        else:
+            return Expression(
+                lambda df: getattr(self._eval_expression(df), op)(other),
+                repr_str,
+                needs_parenthese=needs_parentheses,
+            )
+
+    def _maybe_wrap_parentheses(self, other: Any) -> tuple[str, str]:
+        if self._needs_parentheses:
+            self_repr = f"({self!r})"
+        else:
+            self_repr = f"{self!r}"
+        if isinstance(other, Expression) and other._needs_parentheses:
+            other_repr = f"({other!r})"
+        else:
+            other_repr = f"{other!r}"
+        return self_repr, other_repr
+
+    # Binary ops
+    def __add__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__add__", other, f"{self_repr} + {other_repr}")
+
+    def __radd__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__radd__", other, f"{other_repr} + {self_repr}")
+
+    def __sub__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__sub__", other, f"{self_repr} - {other_repr}")
+
+    def __rsub__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rsub__", other, f"{other_repr} - {self_repr}")
+
+    def __mul__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__mul__", other, f"{self_repr} * {other_repr}")
+
+    def __rmul__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rmul__", other, f"{other_repr} * {self_repr}")
+
+    def __truediv__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__truediv__", other, f"{self_repr} / {other_repr}")
+
+    def __rtruediv__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rtruediv__", other, f"{other_repr} / {self_repr}")
+
+    def __floordiv__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__floordiv__", other, f"{self_repr} // {other_repr}")
+
+    def __rfloordiv__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rfloordiv__", other, f"{other_repr} // {self_repr}")
+
+    def __ge__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__ge__", other, f"{self_repr} >= {other_repr}")
+
+    def __gt__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__gt__", other, f"{self_repr} > {other_repr}")
+
+    def __le__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__le__", other, f"{self_repr} <= {other_repr}")
+
+    def __lt__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__lt__", other, f"{self_repr} < {other_repr}")
+
+    def __eq__(self, other: object) -> Expression:  # type: ignore[override]
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__eq__", other, f"{self_repr} == {other_repr}")
+
+    def __ne__(self, other: object) -> Expression:  # type: ignore[override]
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__ne__", other, f"{self_repr} != {other_repr}")
+
+    def __mod__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__mod__", other, f"{self_repr} % {other_repr}")
+
+    def __rmod__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rmod__", other, f"{other_repr} % {self_repr}")
+
+    # Logical ops
+    def __and__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__and__", other, f"{self_repr} & {other_repr}")
+
+    def __rand__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rand__", other, f"{other_repr} & {self_repr}")
+
+    def __or__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__or__", other, f"{self_repr} | {other_repr}")
+
+    def __ror__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__ror__", other, f"{other_repr} | {self_repr}")
+
+    def __xor__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__xor__", other, f"{self_repr} ^ {other_repr}")
+
+    def __rxor__(self, other: Any) -> Expression:
+        self_repr, other_repr = self._maybe_wrap_parentheses(other)
+        return self._with_op("__rxor__", other, f"{other_repr} ^ {self_repr}")
+
+    def __invert__(self) -> Expression:
+        return Expression(
+            lambda df: ~self._eval_expression(df),
+            f"~{self._repr_str}",
+            needs_parenthese=True,
+        )
+
+    def __neg__(self) -> Expression:
+        if self._needs_parentheses:
+            repr_str = f"-({self._repr_str})"
+        else:
+            repr_str = f"-{self._repr_str}"
+        return Expression(
+            lambda df: -self._eval_expression(df),
+            repr_str,
+            needs_parenthese=True,
+        )
+
+    def __pos__(self) -> Expression:
+        if self._needs_parentheses:
+            repr_str = f"+({self._repr_str})"
+        else:
+            repr_str = f"+{self._repr_str}"
+        return Expression(
+            lambda df: +self._eval_expression(df),
+            repr_str,
+            needs_parenthese=True,
+        )
+
+    def __abs__(self) -> Expression:
+        return Expression(
+            lambda df: abs(self._eval_expression(df)),
+            f"abs({self._repr_str})",
+            needs_parenthese=True,
+        )
+
+    def __array_ufunc__(
+        self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any
+    ) -> Expression:
+        def func(df: DataFrame) -> Any:
+            parsed_inputs = _parse_args(df, *inputs)
+            parsed_kwargs = _parse_kwargs(df, *kwargs)
+            return ufunc(*parsed_inputs, **parsed_kwargs)
+
+        args_str = _pretty_print_args_kwargs(*inputs, **kwargs)
+        repr_str = f"{ufunc.__name__}({args_str})"
+
+        return Expression(func, repr_str)
+
+    def __getitem__(self, item: Any) -> Expression:
+        return self._with_op(
+            "__getitem__", item, f"{self!r}[{item!r}]", needs_parentheses=True
+        )
+
+    def _call_with_func(self, func: Callable, **kwargs: Any) -> Expression:
+        def wrapped(df: DataFrame) -> Any:
+            parsed_kwargs = _parse_kwargs(df, **kwargs)
+            return func(**parsed_kwargs)
+
+        args_str = _pretty_print_args_kwargs(**kwargs)
+        repr_str = func.__name__ + "(" + args_str + ")"
+
+        return Expression(wrapped, repr_str)
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Expression:
+        def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any:
+            parsed_args = _parse_args(df, *args)
+            parsed_kwargs = _parse_kwargs(df, **kwargs)
+            return self._eval_expression(df)(*parsed_args, **parsed_kwargs)
+
+        args_str = _pretty_print_args_kwargs(*args, **kwargs)
+        repr_str = f"{self._repr_str}({args_str})"
+        return Expression(lambda df: func(df, *args, **kwargs), repr_str)
+
+    def __getattr__(self, name: str, /) -> Any:
+        repr_str = f"{self!r}"
+        if self._needs_parentheses:
+            repr_str = f"({repr_str})"
+        repr_str += f".{name}"
+        return Expression(lambda df: getattr(self._eval_expression(df), name), repr_str)
+
+    def __repr__(self) -> str:
+        return self._repr_str or "Expr(...)"
+
+
+@set_module("pandas")
+def col(col_name: Hashable) -> Expression:
+    """
+    Generate deferred object representing a column of a DataFrame.
+
+    Any place which accepts ``lambda df: df[col_name]``, such as
+    :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept
+    ``pd.col(col_name)``.
+
+    .. versionadded:: 3.0.0
+
+    Parameters
+    ----------
+    col_name : Hashable
+        Column name.
+
+    Returns
+    -------
+    `pandas.api.typing.Expression`
+        A deferred object representing a column of a DataFrame.
+
+    See Also
+    --------
+    DataFrame.query : Query columns of a dataframe using string expressions.
+
+    Examples
+    --------
+
+    You can use `col` in `assign`.
+
+    >>> df = pd.DataFrame({"name": ["beluga", "narwhal"], "speed": [100, 110]})
+    >>> df.assign(name_titlecase=pd.col("name").str.title())
+          name  speed name_titlecase
+    0   beluga    100         Beluga
+    1  narwhal    110        Narwhal
+
+    You can also use it for filtering.
+
+    >>> df.loc[pd.col("speed") > 105]
+          name  speed
+    1  narwhal    110
+    """
+    if not isinstance(col_name, Hashable):
+        msg = f"Expected Hashable, got: {type(col_name)}"
+        raise TypeError(msg)
+
+    def func(df: DataFrame) -> Series:
+        if col_name not in df.columns:
+            columns_str = str(df.columns.tolist())
+            max_len = 90
+            if len(columns_str) > max_len:
+                columns_str = columns_str[:max_len] + "...]"
+
+            msg = (
+                f"Column '{col_name}' not found in given DataFrame.\n\n"
+                f"Hint: did you mean one of {columns_str} instead?"
+            )
+            raise ValueError(msg)
+        return df[col_name]
+
+    return Expression(func, f"col({col_name!r})")
+
+
+__all__ = ["Expression", "col"]
diff --git a/pandas/core/common.py b/pandas/core/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ca6586222ca13845e7313eeaf51ee036e5f9f9d
--- /dev/null
+++ b/pandas/core/common.py
@@ -0,0 +1,685 @@
+"""
+Misc tools for implementing data structures
+
+Note: pandas.core.common is *not* part of the public API.
+"""
+
+from __future__ import annotations
+
+import builtins
+from collections import (
+    abc,
+    defaultdict,
+)
+from collections.abc import (
+    Callable,
+    Collection,
+    Generator,
+    Hashable,
+    Iterable,
+    Sequence,
+)
+import contextlib
+from functools import partial
+import inspect
+import sys
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Concatenate,
+    TypeVar,
+    cast,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_integer,
+)
+from pandas.core.dtypes.generic import (
+    ABCExtensionArray,
+    ABCIndex,
+    ABCMultiIndex,
+    ABCNumpyExtensionArray,
+    ABCSeries,
+)
+from pandas.core.dtypes.inference import iterable_not_string
+
+from pandas.core.col import Expression
+
+if TYPE_CHECKING:
+    from pandas._typing import (
+        AnyArrayLike,
+        ArrayLike,
+        NpDtype,
+        P,
+        RandomState,
+        T,
+    )
+
+    from pandas import Index
+
+
+def flatten(line):
+    """
+    Flatten an arbitrarily nested sequence.
+
+    Parameters
+    ----------
+    line : sequence
+        The non string sequence to flatten
+
+    Notes
+    -----
+    This doesn't consider strings sequences.
+
+    Returns
+    -------
+    flattened : generator
+    """
+    for element in line:
+        if iterable_not_string(element):
+            yield from flatten(element)
+        else:
+            yield element
+
+
+def consensus_name_attr(objs):
+    name = objs[0].name
+    for obj in objs[1:]:
+        try:
+            if obj.name != name:
+                name = None
+                break
+        except ValueError:
+            name = None
+            break
+    return name
+
+
+def is_bool_indexer(key: Any) -> bool:
+    """
+    Check whether `key` is a valid boolean indexer.
+
+    Parameters
+    ----------
+    key : Any
+        Only list-likes may be considered boolean indexers.
+        All other types are not considered a boolean indexer.
+        For array-like input, boolean ndarrays or ExtensionArrays
+        with ``_is_boolean`` set are considered boolean indexers.
+
+    Returns
+    -------
+    bool
+        Whether `key` is a valid boolean indexer.
+
+    Raises
+    ------
+    ValueError
+        When the array is an object-dtype ndarray or ExtensionArray
+        and contains missing values.
+
+    See Also
+    --------
+    check_array_indexer : Check that `key` is a valid array to index,
+        and convert to an ndarray.
+    """
+    if isinstance(
+        key,
+        (ABCSeries, np.ndarray, ABCIndex, ABCExtensionArray, ABCNumpyExtensionArray),
+    ) and not isinstance(key, ABCMultiIndex):
+        if key.dtype == np.object_:
+            key_array = np.asarray(key)
+
+            if not lib.is_bool_array(key_array):
+                na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
+                if lib.is_bool_array(key_array, skipna=True):
+                    # Don't raise on e.g. ["A", "B", np.nan], see
+                    #  test_loc_getitem_list_of_labels_categoricalindex_with_na
+                    raise ValueError(na_msg)
+                return False
+            return True
+        elif is_bool_dtype(key.dtype):
+            return True
+    elif isinstance(key, list):
+        # check if np.array(key).dtype would be bool
+        if len(key) > 0:
+            if type(key) is not list:
+                # GH#42461 cython will raise TypeError if we pass a subclass
+                key = list(key)
+            return lib.is_bool_list(key)
+
+    return False
+
+
+def cast_scalar_indexer(val):
+    """
+    Disallow indexing with a float key, even if that key is a round number.
+
+    Parameters
+    ----------
+    val : scalar
+
+    Returns
+    -------
+    outval : scalar
+    """
+    # assumes lib.is_scalar(val)
+    if lib.is_float(val) and val.is_integer():
+        raise IndexError(
+            # GH#34193
+            "Indexing with a float is no longer supported. Manually convert "
+            "to an integer key instead."
+        )
+    return val
+
+
+def not_none(*args):
+    """
+    Returns a generator consisting of the arguments that are not None.
+    """
+    return (arg for arg in args if arg is not None)
+
+
+def any_none(*args) -> bool:
+    """
+    Returns a boolean indicating if any argument is None.
+    """
+    return any(arg is None for arg in args)
+
+
+def all_none(*args) -> bool:
+    """
+    Returns a boolean indicating if all arguments are None.
+    """
+    return all(arg is None for arg in args)
+
+
+def any_not_none(*args) -> bool:
+    """
+    Returns a boolean indicating if any argument is not None.
+    """
+    return any(arg is not None for arg in args)
+
+
+def all_not_none(*args) -> bool:
+    """
+    Returns a boolean indicating if all arguments are not None.
+    """
+    return all(arg is not None for arg in args)
+
+
+def count_not_none(*args) -> int:
+    """
+    Returns the count of arguments that are not None.
+    """
+    return sum(x is not None for x in args)
+
+
+@overload
+def asarray_tuplesafe(
+    values: ArrayLike | list | tuple | zip, dtype: NpDtype | None = ...
+) -> np.ndarray:
+    # ExtensionArray can only be returned when values is an Index, all other iterables
+    # will return np.ndarray. Unfortunately "all other" cannot be encoded in a type
+    # signature, so instead we special-case some common types.
+    ...
+
+
+@overload
+def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = ...) -> ArrayLike: ...
+
+
+def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLike:
+    if not (isinstance(values, (list, tuple)) or hasattr(values, "__array__")):
+        values = list(values)
+    elif isinstance(values, ABCIndex):
+        return values._values
+    elif isinstance(values, ABCSeries):
+        return values._values
+
+    if isinstance(values, list) and dtype in [np.object_, object]:
+        return construct_1d_object_array_from_listlike(values)
+
+    try:
+        result = np.asarray(values, dtype=dtype)
+    except ValueError:
+        # Using try/except since it's more performant than checking is_list_like
+        # over each element
+        # error: Argument 1 to "construct_1d_object_array_from_listlike"
+        # has incompatible type "Iterable[Any]"; expected "Sized"
+        return construct_1d_object_array_from_listlike(values)  # type: ignore[arg-type]
+
+    if issubclass(result.dtype.type, str):
+        result = np.asarray(values, dtype=object)
+
+    if result.ndim == 2:
+        # Avoid building an array of arrays:
+        values = [tuple(x) for x in values]
+        result = construct_1d_object_array_from_listlike(values)
+
+    return result
+
+
+def index_labels_to_array(
+    labels: np.ndarray | Iterable, dtype: NpDtype | None = None
+) -> np.ndarray:
+    """
+    Transform label or iterable of labels to array, for use in Index.
+
+    Parameters
+    ----------
+    dtype : dtype
+        If specified, use as dtype of the resulting array, otherwise infer.
+
+    Returns
+    -------
+    array
+    """
+    if isinstance(labels, (str, tuple)):
+        labels = [labels]
+
+    if not isinstance(labels, (list, np.ndarray)):
+        try:
+            labels = list(labels)
+        except TypeError:  # non-iterable
+            labels = [labels]
+
+    rlabels = asarray_tuplesafe(labels, dtype=dtype)
+
+    return rlabels
+
+
+def maybe_make_list(obj):
+    if obj is not None and not isinstance(obj, (tuple, list)):
+        return [obj]
+    return obj
+
+
+def maybe_iterable_to_list(obj: Iterable[T] | T) -> Collection[T] | T:
+    """
+    If obj is Iterable but not list-like, consume into list.
+    """
+    if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized):
+        return list(obj)
+    obj = cast(Collection, obj)
+    return obj
+
+
+def is_null_slice(obj) -> bool:
+    """
+    We have a null slice.
+    """
+    return (
+        isinstance(obj, slice)
+        and obj.start is None
+        and obj.stop is None
+        and obj.step is None
+    )
+
+
+def is_empty_slice(obj) -> bool:
+    """
+    We have an empty slice, e.g. no values are selected.
+    """
+    return (
+        isinstance(obj, slice)
+        and obj.start is not None
+        and obj.stop is not None
+        and obj.start == obj.stop
+    )
+
+
+def is_true_slices(line: abc.Iterable) -> abc.Generator[bool, None, None]:
+    """
+    Find non-trivial slices in "line": yields a bool.
+    """
+    for k in line:
+        yield isinstance(k, slice) and not is_null_slice(k)
+
+
+# TODO: used only once in indexing; belongs elsewhere?
+def is_full_slice(obj, line: int) -> bool:
+    """
+    We have a full length slice.
+    """
+    return (
+        isinstance(obj, slice)
+        and obj.start == 0
+        and obj.stop == line
+        and obj.step is None
+    )
+
+
+def get_callable_name(obj):
+    # typical case has name
+    if hasattr(obj, "__name__"):
+        return obj.__name__
+    # some objects don't; could recurse
+    if isinstance(obj, partial):
+        return get_callable_name(obj.func)
+    # fall back to class name
+    if callable(obj):
+        return type(obj).__name__
+    # everything failed (probably because the argument
+    # wasn't actually callable); we return None
+    # instead of the empty string in this case to allow
+    # distinguishing between no name and a name of ''
+    return None
+
+
+def apply_if_callable(maybe_callable, obj, **kwargs):
+    """
+    Evaluate possibly callable input using obj and kwargs if it is callable,
+    otherwise return as it is.
+
+    Parameters
+    ----------
+    maybe_callable : possibly a callable
+    obj : NDFrame
+    **kwargs
+    """
+    if isinstance(maybe_callable, Expression):
+        return maybe_callable._eval_expression(obj, **kwargs)
+    elif callable(maybe_callable):
+        return maybe_callable(obj, **kwargs)
+
+    return maybe_callable
+
+
+def standardize_mapping(into):
+    """
+    Helper function to standardize a supplied mapping.
+
+    Parameters
+    ----------
+    into : instance or subclass of collections.abc.Mapping
+        Must be a class, an initialized collections.defaultdict,
+        or an instance of a collections.abc.Mapping subclass.
+
+    Returns
+    -------
+    mapping : a collections.abc.Mapping subclass or other constructor
+        a callable object that can accept an iterator to create
+        the desired Mapping.
+
+    See Also
+    --------
+    DataFrame.to_dict
+    Series.to_dict
+    """
+    if not inspect.isclass(into):
+        if isinstance(into, defaultdict):
+            return partial(defaultdict, into.default_factory)
+        into = type(into)
+    if not issubclass(into, abc.Mapping):
+        raise TypeError(f"unsupported type: {into}")
+    if into == defaultdict:
+        raise TypeError("to_dict() only accepts initialized defaultdicts")
+    return into
+
+
+@overload
+def random_state(state: np.random.Generator) -> np.random.Generator: ...
+
+
+@overload
+def random_state(
+    state: int | np.ndarray | np.random.BitGenerator | np.random.RandomState | None,
+) -> np.random.RandomState: ...
+
+
+def random_state(state: RandomState | None = None):
+    """
+    Helper function for processing random_state arguments.
+
+    Parameters
+    ----------
+    state : int, array-like, BitGenerator, Generator, np.random.RandomState, None.
+        If receives an int, array-like, or BitGenerator, passes to
+        np.random.RandomState() as seed.
+        If receives an np.random RandomState or Generator, just returns that unchanged.
+        If receives `None`, returns np.random.
+        If receives anything else, raises an informative ValueError.
+
+        Default None.
+
+    Returns
+    -------
+    np.random.RandomState or np.random.Generator. If state is None, returns np.random
+
+    """
+    if is_integer(state) or isinstance(state, (np.ndarray, np.random.BitGenerator)):
+        return np.random.RandomState(state)
+    elif isinstance(state, np.random.RandomState):
+        return state
+    elif isinstance(state, np.random.Generator):
+        return state
+    elif state is None:
+        return np.random
+    else:
+        raise ValueError(
+            "random_state must be an integer, array-like, a BitGenerator, Generator, "
+            "a numpy RandomState, or None"
+        )
+
+
+_T = TypeVar("_T")  # Secondary TypeVar for use in pipe's type hints
+
+
+@overload
+def pipe(
+    obj: _T,
+    func: Callable[Concatenate[_T, P], T],
+    *args: P.args,
+    **kwargs: P.kwargs,
+) -> T: ...
+
+
+@overload
+def pipe(
+    obj: Any,
+    func: tuple[Callable[..., T], str],
+    *args: Any,
+    **kwargs: Any,
+) -> T: ...
+
+
+def pipe(
+    obj: _T,
+    func: Callable[Concatenate[_T, P], T] | tuple[Callable[..., T], str],
+    *args: Any,
+    **kwargs: Any,
+) -> T:
+    """
+    Apply a function ``func`` to object ``obj`` either by passing obj as the
+    first argument to the function or, in the case that the func is a tuple,
+    interpret the first element of the tuple as a function and pass the obj to
+    that function as a keyword argument whose key is the value of the second
+    element of the tuple.
+
+    Parameters
+    ----------
+    func : callable or tuple of (callable, str)
+        Function to apply to this object or, alternatively, a
+        ``(callable, data_keyword)`` tuple where ``data_keyword`` is a
+        string indicating the keyword of ``callable`` that expects the
+        object.
+    *args : iterable, optional
+        Positional arguments passed into ``func``.
+    **kwargs : dict, optional
+        A dictionary of keyword arguments passed into ``func``.
+
+    Returns
+    -------
+    object : the return type of ``func``.
+    """
+    if isinstance(func, tuple):
+        # Assigning to func_ so pyright understands that it's a callable
+        func_, target = func
+        if target in kwargs:
+            msg = f"{target} is both the pipe target and a keyword argument"
+            raise ValueError(msg)
+        kwargs[target] = obj
+        return func_(*args, **kwargs)
+    else:
+        return func(obj, *args, **kwargs)
+
+
+def get_rename_function(mapper):
+    """
+    Returns a function that will map names/labels, dependent if mapper
+    is a dict, Series or just a function.
+    """
+
+    def f(x):
+        if x in mapper:
+            return mapper[x]
+        else:
+            return x
+
+    return f if isinstance(mapper, (abc.Mapping, ABCSeries)) else mapper
+
+
+def convert_to_list_like(
+    values: Hashable | Iterable | AnyArrayLike,
+) -> list | AnyArrayLike:
+    """
+    Convert list-like or scalar input to list-like. List, numpy and pandas array-like
+    inputs are returned unmodified whereas others are converted to list.
+    """
+    if isinstance(values, (list, np.ndarray, ABCIndex, ABCSeries, ABCExtensionArray)):
+        return values
+    elif isinstance(values, abc.Iterable) and not isinstance(values, str):
+        return list(values)
+
+    return [values]
+
+
+@contextlib.contextmanager
+def temp_setattr(obj, attr: str, value, condition: bool = True) -> Generator[None]:
+    """
+    Temporarily set attribute on an object.
+
+    Parameters
+    ----------
+    obj : object
+        Object whose attribute will be modified.
+    attr : str
+        Attribute to modify.
+    value : Any
+        Value to temporarily set attribute to.
+    condition : bool, default True
+        Whether to set the attribute. Provided in order to not have to
+        conditionally use this context manager.
+
+    Yields
+    ------
+    object : obj with modified attribute.
+    """
+    if condition:
+        old_value = getattr(obj, attr)
+        setattr(obj, attr, value)
+    try:
+        yield obj
+    finally:
+        if condition:
+            setattr(obj, attr, old_value)
+
+
+def require_length_match(data, index: Index) -> None:
+    """
+    Check the length of data matches the length of the index.
+    """
+    if len(data) != len(index):
+        raise ValueError(
+            "Length of values "
+            f"({len(data)}) "
+            "does not match length of index "
+            f"({len(index)})"
+        )
+
+
+_cython_table = {
+    builtins.sum: "sum",
+    builtins.max: "max",
+    builtins.min: "min",
+    np.all: "all",
+    np.any: "any",
+    np.sum: "sum",
+    np.nansum: "sum",
+    np.mean: "mean",
+    np.nanmean: "mean",
+    np.prod: "prod",
+    np.nanprod: "prod",
+    np.std: "std",
+    np.nanstd: "std",
+    np.var: "var",
+    np.nanvar: "var",
+    np.median: "median",
+    np.nanmedian: "median",
+    np.max: "max",
+    np.nanmax: "max",
+    np.min: "min",
+    np.nanmin: "min",
+    np.cumprod: "cumprod",
+    np.nancumprod: "cumprod",
+    np.cumsum: "cumsum",
+    np.nancumsum: "cumsum",
+}
+
+
+def get_cython_func(arg: Callable) -> str | None:
+    """
+    if we define an internal function for this argument, return it
+    """
+    return _cython_table.get(arg)
+
+
+def fill_missing_names(names: Sequence[Hashable | None]) -> list[Hashable]:
+    """
+    If a name is missing then replace it by level_n, where n is the count
+
+    Parameters
+    ----------
+    names : list-like
+        list of column names or None values.
+
+    Returns
+    -------
+    list
+        list of column names with the None values replaced.
+    """
+    return [f"level_{i}" if name is None else name for i, name in enumerate(names)]
+
+
+def is_local_in_caller_frame(obj):
+    """
+    Helper function used in detecting chained assignment.
+
+    If the pandas object (DataFrame/Series) is a local variable
+    in the caller's frame, it should not be a case of chained
+    assignment or method call.
+
+    For example:
+
+    def test():
+        df = pd.DataFrame(...)
+        df["a"] = 1  # not chained assignment
+
+    Inside ``df.__setitem__``, we call this function to check whether `df`
+    (`self`) is a local variable in `test` frame (the frame calling setitem). If
+    so, we know it is not a case of chained assignment (even when the refcount
+    of `df` is below the threshold due to optimization of local variables).
+    """
+    frame = sys._getframe(2)
+    for v in frame.f_locals.values():
+        if v is obj:
+            return True
+    return False
diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcb7e1b9fff0ae92ca57f7bd7d669217f7d02cd4
--- /dev/null
+++ b/pandas/core/config_init.py
@@ -0,0 +1,923 @@
+"""
+This module is imported from the pandas package __init__.py file
+in order to ensure that the core.config options registered here will
+be available as soon as the user loads the package. if register_option
+is invoked inside specific modules, they will not be registered until that
+module is imported, which may or may not be a problem.
+
+If you need to make sure options are available even before a certain
+module is imported, register them here rather than in the module.
+
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+import os
+from typing import Any
+
+import pandas._config.config as cf
+from pandas._config.config import (
+    is_bool,
+    is_callable,
+    is_instance_factory,
+    is_int,
+    is_nonnegative_int,
+    is_one_of_factory,
+    is_str,
+    is_text,
+)
+
+from pandas.errors import Pandas4Warning
+
+# compute
+
+use_bottleneck_doc = """
+: bool
+    Use the bottleneck library to accelerate if it is installed,
+    the default is True
+    Valid values: False,True
+"""
+
+
+def use_bottleneck_cb(key: str) -> None:
+    from pandas.core import nanops
+
+    nanops.set_use_bottleneck(cf.get_option(key))
+
+
+use_numexpr_doc = """
+: bool
+    Use the numexpr library to accelerate computation if it is installed,
+    the default is True
+    Valid values: False,True
+"""
+
+
+def use_numexpr_cb(key: str) -> None:
+    from pandas.core.computation import expressions
+
+    expressions.set_use_numexpr(cf.get_option(key))
+
+
+use_numba_doc = """
+: bool
+    Use the numba engine option for select operations if it is installed,
+    the default is False
+    Valid values: False,True
+"""
+
+
+def use_numba_cb(key: str) -> None:
+    from pandas.core.util import numba_
+
+    numba_.set_use_numba(cf.get_option(key))
+
+
+with cf.config_prefix("compute"):
+    cf.register_option(
+        "use_bottleneck",
+        True,
+        use_bottleneck_doc,
+        validator=is_bool,
+        cb=use_bottleneck_cb,
+    )
+    cf.register_option(
+        "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb
+    )
+    cf.register_option(
+        "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb
+    )
+#
+# options from the "display" namespace
+
+pc_precision_doc = """
+: int
+    Floating point output precision in terms of number of places after the
+    decimal, for regular formatting as well as scientific notation. Similar
+    to ``precision`` in :meth:`numpy.set_printoptions`.
+"""
+
+pc_max_rows_doc = """
+: int
+    If max_rows is exceeded, switch to truncate view. Depending on
+    `large_repr`, objects are either centrally truncated or printed as
+    a summary view.
+
+    'None' value means unlimited. Beware that printing a large number of rows
+    could cause your rendering environment (the browser, etc.) to crash.
+
+    In case python/IPython is running in a terminal and `large_repr`
+    equals 'truncate' this can be set to 0 and pandas will auto-detect
+    the height of the terminal and print a truncated object which fits
+    the screen height. The IPython notebook, IPython qtconsole, or
+    IDLE do not run in a terminal and hence it is not possible to do
+    correct auto-detection.
+"""
+
+pc_min_rows_doc = """
+: int
+    The numbers of rows to show in a truncated view (when `max_rows` is
+    exceeded). Ignored when `max_rows` is set to None or 0. When set to
+    None, follows the value of `max_rows`.
+"""
+
+pc_max_cols_doc = """
+: int
+    If max_cols is exceeded, switch to truncate view. Depending on
+    `large_repr`, objects are either centrally truncated or printed as
+    a summary view.
+
+    'None' value means unlimited. Beware that printing a large number of
+    columns could cause your rendering environment (the browser, etc.) to
+    crash.
+
+    In case python/IPython is running in a terminal and `large_repr`
+    equals 'truncate' this can be set to 0 or None and pandas will auto-detect
+    the width of the terminal and print a truncated object which fits
+    the screen width. The IPython notebook, IPython qtconsole, or IDLE
+    do not run in a terminal and hence it is not possible to do
+    correct auto-detection and defaults to 20.
+"""
+
+pc_max_categories_doc = """
+: int
+    This sets the maximum number of categories pandas should output when
+    printing out a `Categorical` or a Series of dtype "category".
+"""
+
+pc_max_info_cols_doc = """
+: int
+    max_info_columns is used in DataFrame.info method to decide if
+    per column information will be printed.
+"""
+
+pc_nb_repr_h_doc = """
+: boolean
+    When True, IPython notebook will use html representation for
+    pandas objects (if it is available).
+"""
+
+pc_pprint_nest_depth = """
+: int
+    Controls the number of nested levels to process when pretty-printing
+"""
+
+pc_multi_sparse_doc = """
+: boolean
+    "sparsify" MultiIndex display (don't display repeated
+    elements in outer levels within groups)
+"""
+
+float_format_doc = """
+: callable
+    The callable should accept a floating point number and return
+    a string with the desired format of the number. This is used
+    in some places like SeriesFormatter.
+    See formats.format.EngFormatter for an example.
+"""
+
+max_colwidth_doc = """
+: int or None
+    The maximum width in characters of a column in the repr of
+    a pandas data structure. When the column overflows, a "..."
+    placeholder is embedded in the output. A 'None' value means unlimited.
+"""
+
+colheader_justify_doc = """
+: 'left'/'right'
+    Controls the justification of column headers. used by DataFrameFormatter.
+"""
+
+pc_expand_repr_doc = """
+: boolean
+    Whether to print out the full DataFrame repr for wide DataFrames across
+    multiple lines, `max_columns` is still respected, but the output will
+    wrap-around across multiple "pages" if its width exceeds `display.width`.
+"""
+
+pc_show_dimensions_doc = """
+: boolean or 'truncate'
+    Whether to print out dimensions at the end of DataFrame repr.
+    If 'truncate' is specified, only print out the dimensions if the
+    frame is truncated (e.g. not display all rows and/or columns)
+"""
+
+pc_east_asian_width_doc = """
+: boolean
+    Whether to use the Unicode East Asian Width to calculate the display text
+    width.
+    Enabling this may affect to the performance (default: False)
+"""
+
+
+pc_table_schema_doc = """
+: boolean
+    Whether to publish a Table Schema representation for frontends
+    that support it.
+    (default: False)
+"""
+
+pc_html_border_doc = """
+: int
+    A ``border=value`` attribute is inserted in the ``<table>`` tag
+    for the DataFrame HTML repr.
+"""
+
+pc_html_use_mathjax_doc = """\
+: boolean
+    When True, Jupyter notebook will process table contents using MathJax,
+    rendering mathematical expressions enclosed by the dollar symbol.
+    (default: True)
+"""
+
+pc_max_dir_items = """\
+: int
+    The number of items that will be added to `dir(...)`. 'None' value means
+    unlimited. Because dir is cached, changing this option will not immediately
+    affect already existing dataframes until a column is deleted or added.
+
+    This is for instance used to suggest columns from a dataframe to tab
+    completion.
+"""
+
+pc_width_doc = """
+: int
+    Width of the display in characters. In case python/IPython is running in
+    a terminal this can be set to None and pandas will correctly auto-detect
+    the width.
+    Note that the IPython notebook, IPython qtconsole, or IDLE do not run in a
+    terminal and hence it is not possible to correctly detect the width.
+"""
+
+pc_chop_threshold_doc = """
+: float or None
+    if set to a float value, all float values smaller than the given threshold
+    will be displayed as exactly 0 by repr and friends.
+"""
+
+pc_max_seq_items = """
+: int or None
+    When pretty-printing a long sequence, no more then `max_seq_items`
+    will be printed. If items are omitted, they will be denoted by the
+    addition of "..." to the resulting string.
+
+    If set to None, the number of items to be printed is unlimited.
+"""
+
+pc_max_info_rows_doc = """
+: int
+    df.info() will usually show null-counts for each column.
+    For large frames this can be quite slow. max_info_rows and max_info_cols
+    limit this null check only to frames with smaller dimensions than
+    specified.
+"""
+
+pc_large_repr_doc = """
+: 'truncate'/'info'
+    For DataFrames exceeding max_rows/max_cols, the repr (and HTML repr) can
+    show a truncated table, or switch to the view from
+    df.info() (the behaviour in earlier versions of pandas).
+"""
+
+pc_memory_usage_doc = """
+: bool, string or None
+    This specifies if the memory usage of a DataFrame should be displayed when
+    df.info() is called. Valid values True,False,'deep'
+"""
+
+
+def table_schema_cb(key: str) -> None:
+    from pandas.io.formats.printing import enable_data_resource_formatter
+
+    enable_data_resource_formatter(cf.get_option(key))
+
+
+def is_terminal() -> bool:
+    """
+    Detect if Python is running in a terminal.
+
+    Returns True if Python is running in a terminal or False if not.
+    """
+    try:
+        # error: Name 'get_ipython' is not defined
+        ip = get_ipython()  # type: ignore[name-defined]
+    except NameError:  # assume standard Python interpreter in a terminal
+        return True
+    else:
+        if hasattr(ip, "kernel"):  # IPython as a Jupyter kernel
+            return False
+        else:  # IPython in a terminal
+            return True
+
+
+with cf.config_prefix("display"):
+    cf.register_option("precision", 6, pc_precision_doc, validator=is_nonnegative_int)
+    cf.register_option(
+        "float_format",
+        None,
+        float_format_doc,
+        validator=is_one_of_factory([None, is_callable]),
+    )
+    cf.register_option(
+        "max_info_rows",
+        1690785,
+        pc_max_info_rows_doc,
+        validator=is_int,
+    )
+    cf.register_option("max_rows", 60, pc_max_rows_doc, validator=is_nonnegative_int)
+    cf.register_option(
+        "min_rows",
+        10,
+        pc_min_rows_doc,
+        validator=is_instance_factory((type(None), int)),
+    )
+    cf.register_option("max_categories", 8, pc_max_categories_doc, validator=is_int)
+
+    cf.register_option(
+        "max_colwidth",
+        50,
+        max_colwidth_doc,
+        validator=is_nonnegative_int,
+    )
+    if is_terminal():
+        max_cols = 0  # automatically determine optimal number of columns
+    else:
+        max_cols = 20  # cannot determine optimal number of columns
+    cf.register_option(
+        "max_columns", max_cols, pc_max_cols_doc, validator=is_nonnegative_int
+    )
+    cf.register_option(
+        "large_repr",
+        "truncate",
+        pc_large_repr_doc,
+        validator=is_one_of_factory(["truncate", "info"]),
+    )
+    cf.register_option("max_info_columns", 100, pc_max_info_cols_doc, validator=is_int)
+    cf.register_option(
+        "colheader_justify", "right", colheader_justify_doc, validator=is_text
+    )
+    cf.register_option("notebook_repr_html", True, pc_nb_repr_h_doc, validator=is_bool)
+    cf.register_option("pprint_nest_depth", 3, pc_pprint_nest_depth, validator=is_int)
+    cf.register_option("multi_sparse", True, pc_multi_sparse_doc, validator=is_bool)
+    cf.register_option("expand_frame_repr", True, pc_expand_repr_doc)
+    cf.register_option(
+        "show_dimensions",
+        "truncate",
+        pc_show_dimensions_doc,
+        validator=is_one_of_factory([True, False, "truncate"]),
+    )
+    cf.register_option("chop_threshold", None, pc_chop_threshold_doc)
+    cf.register_option("max_seq_items", 100, pc_max_seq_items)
+    cf.register_option(
+        "width", 80, pc_width_doc, validator=is_instance_factory((type(None), int))
+    )
+    cf.register_option(
+        "memory_usage",
+        True,
+        pc_memory_usage_doc,
+        validator=is_one_of_factory([None, True, False, "deep"]),
+    )
+    cf.register_option(
+        "unicode.east_asian_width", False, pc_east_asian_width_doc, validator=is_bool
+    )
+    cf.register_option(
+        "unicode.ambiguous_as_wide", False, pc_east_asian_width_doc, validator=is_bool
+    )
+    cf.register_option(
+        "html.table_schema",
+        False,
+        pc_table_schema_doc,
+        validator=is_bool,
+        cb=table_schema_cb,
+    )
+    cf.register_option("html.border", 1, pc_html_border_doc, validator=is_int)
+    cf.register_option(
+        "html.use_mathjax", True, pc_html_use_mathjax_doc, validator=is_bool
+    )
+    cf.register_option(
+        "max_dir_items", 100, pc_max_dir_items, validator=is_nonnegative_int
+    )
+
+tc_sim_interactive_doc = """
+: boolean
+    Whether to simulate interactive mode for purposes of testing
+"""
+
+with cf.config_prefix("mode"):
+    cf.register_option("sim_interactive", False, tc_sim_interactive_doc)
+
+
+copy_on_write_doc = """
+: bool
+    Use new copy-view behaviour using Copy-on-Write. No longer used,
+    pandas now always uses Copy-on-Write behavior. This option will
+    be removed in pandas 4.0.
+"""
+
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "copy_on_write",
+        # Get the default from an environment variable, if set, otherwise defaults
+        # to False. This environment variable can be set for testing.
+        "warn"
+        if os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "warn"
+        else os.environ.get("PANDAS_COPY_ON_WRITE", "1") == "1",
+        copy_on_write_doc,
+        validator=is_one_of_factory([True, False, "warn"]),
+    )
+
+
+# user warnings
+chained_assignment = """
+: string
+    Raise an exception, warn, or no action if trying to use chained assignment,
+    The default is warn
+"""
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "chained_assignment",
+        "warn",
+        chained_assignment,
+        validator=is_one_of_factory([None, "warn", "raise"]),
+    )
+
+performance_warnings = """
+: boolean
+    Whether to show or hide PerformanceWarnings.
+"""
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "performance_warnings",
+        True,
+        performance_warnings,
+        validator=is_bool,
+    )
+
+
+string_storage_doc = """
+: string
+    The default storage for StringDtype.
+"""
+
+
+def is_valid_string_storage(value: Any) -> None:
+    legal_values = ["auto", "python", "pyarrow"]
+    if value not in legal_values:
+        msg = "Value must be one of python|pyarrow"
+        raise ValueError(msg)
+
+
+with cf.config_prefix("mode"):
+    cf.register_option(
+        "string_storage",
+        "auto",
+        string_storage_doc,
+        # validator=is_one_of_factory(["python", "pyarrow"]),
+        validator=is_valid_string_storage,
+    )
+
+
+# Set up the io.excel specific reader configuration.
+reader_engine_doc = """
+: string
+    The default Excel reader engine for '{ext}' files. Available options:
+    auto, {others}.
+"""
+
+_xls_options = ["xlrd", "calamine"]
+_xlsm_options = ["xlrd", "openpyxl", "calamine"]
+_xlsx_options = ["xlrd", "openpyxl", "calamine"]
+_ods_options = ["odf", "calamine"]
+_xlsb_options = ["pyxlsb", "calamine"]
+
+
+with cf.config_prefix("io.excel.xls"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="xls", others=", ".join(_xls_options)),
+        validator=is_one_of_factory([*_xls_options, "auto"]),
+    )
+
+with cf.config_prefix("io.excel.xlsm"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
+        validator=is_one_of_factory([*_xlsm_options, "auto"]),
+    )
+
+
+with cf.config_prefix("io.excel.xlsx"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
+        validator=is_one_of_factory([*_xlsx_options, "auto"]),
+    )
+
+
+with cf.config_prefix("io.excel.ods"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
+        validator=is_one_of_factory([*_ods_options, "auto"]),
+    )
+
+with cf.config_prefix("io.excel.xlsb"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
+        validator=is_one_of_factory([*_xlsb_options, "auto"]),
+    )
+
+# Set up the io.excel specific writer configuration.
+writer_engine_doc = """
+: string
+    The default Excel writer engine for '{ext}' files. Available options:
+    auto, {others}.
+"""
+
+_xlsm_options = ["openpyxl"]
+_xlsx_options = ["openpyxl", "xlsxwriter"]
+_ods_options = ["odf"]
+
+
+with cf.config_prefix("io.excel.xlsm"):
+    cf.register_option(
+        "writer",
+        "auto",
+        writer_engine_doc.format(ext="xlsm", others=", ".join(_xlsm_options)),
+        validator=str,
+    )
+
+
+with cf.config_prefix("io.excel.xlsx"):
+    cf.register_option(
+        "writer",
+        "auto",
+        writer_engine_doc.format(ext="xlsx", others=", ".join(_xlsx_options)),
+        validator=str,
+    )
+
+
+with cf.config_prefix("io.excel.ods"):
+    cf.register_option(
+        "writer",
+        "auto",
+        writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)),
+        validator=str,
+    )
+
+
+# Set up the io.parquet specific configuration.
+parquet_engine_doc = """
+: string
+    The default parquet reader/writer engine. Available options:
+    'auto', 'pyarrow', 'fastparquet', the default is 'auto'
+"""
+
+with cf.config_prefix("io.parquet"):
+    cf.register_option(
+        "engine",
+        "auto",
+        parquet_engine_doc,
+        validator=is_one_of_factory(["auto", "pyarrow", "fastparquet"]),
+    )
+
+
+# Set up the io.sql specific configuration.
+sql_engine_doc = """
+: string
+    The default sql reader/writer engine. Available options:
+    'auto', 'sqlalchemy', the default is 'auto'
+"""
+
+with cf.config_prefix("io.sql"):
+    cf.register_option(
+        "engine",
+        "auto",
+        sql_engine_doc,
+        validator=is_one_of_factory(["auto", "sqlalchemy"]),
+    )
+
+# --------
+# Plotting
+# ---------
+
+plotting_backend_doc = """
+: str
+    The plotting backend to use. The default value is "matplotlib", the
+    backend provided with pandas. Other backends can be specified by
+    providing the name of the module that implements the backend.
+"""
+
+
+def register_plotting_backend_cb(key: str | None) -> None:
+    if key == "matplotlib":
+        # We defer matplotlib validation, since it's the default
+        return
+    from pandas.plotting._core import _get_plot_backend
+
+    _get_plot_backend(key)
+
+
+with cf.config_prefix("plotting"):
+    cf.register_option(
+        "backend",
+        defval="matplotlib",
+        doc=plotting_backend_doc,
+        validator=register_plotting_backend_cb,  # type: ignore[arg-type]
+    )
+
+
+register_converter_doc = """
+: bool or 'auto'.
+    Whether to register converters with matplotlib's units registry for
+    dates, times, datetimes, and Periods. Toggling to False will remove
+    the converters, restoring any converters that pandas overwrote.
+"""
+
+
+def register_converter_cb(key: str) -> None:
+    from pandas.plotting import (
+        deregister_matplotlib_converters,
+        register_matplotlib_converters,
+    )
+
+    if cf.get_option(key):
+        register_matplotlib_converters()
+    else:
+        deregister_matplotlib_converters()
+
+
+with cf.config_prefix("plotting.matplotlib"):
+    cf.register_option(
+        "register_converters",
+        "auto",
+        register_converter_doc,
+        validator=is_one_of_factory(["auto", True, False]),
+        cb=register_converter_cb,
+    )
+
+# ------
+# Styler
+# ------
+
+styler_sparse_index_doc = """
+: bool
+    Whether to sparsify the display of a hierarchical index. Setting to False will
+    display each explicit level element in a hierarchical key for each row.
+"""
+
+styler_sparse_columns_doc = """
+: bool
+    Whether to sparsify the display of hierarchical columns. Setting to False will
+    display each explicit level element in a hierarchical key for each column.
+"""
+
+styler_render_repr = """
+: str
+    Determine which output to use in Jupyter Notebook in {"html", "latex"}.
+"""
+
+styler_max_elements = """
+: int
+    The maximum number of data-cell (<td>) elements that will be rendered before
+    trimming will occur over columns, rows or both if needed.
+"""
+
+styler_max_rows = """
+: int, optional
+    The maximum number of rows that will be rendered. May still be reduced to
+    satisfy ``max_elements``, which takes precedence.
+"""
+
+styler_max_columns = """
+: int, optional
+    The maximum number of columns that will be rendered. May still be reduced to
+    satisfy ``max_elements``, which takes precedence.
+"""
+
+styler_precision = """
+: int
+    The precision for floats and complex numbers.
+"""
+
+styler_decimal = """
+: str
+    The character representation for the decimal separator for floats and complex.
+"""
+
+styler_thousands = """
+: str, optional
+    The character representation for thousands separator for floats, int and complex.
+"""
+
+styler_na_rep = """
+: str, optional
+    The string representation for values identified as missing.
+"""
+
+styler_escape = """
+: str, optional
+    Whether to escape certain characters according to the given context; html or latex.
+"""
+
+styler_formatter = """
+: str, callable, dict, optional
+    A formatter object to be used as default within ``Styler.format``.
+"""
+
+styler_multirow_align = """
+: {"c", "t", "b"}
+    The specifier for vertical alignment of sparsified LaTeX multirows.
+"""
+
+styler_multicol_align = r"""
+: {"r", "c", "l", "naive-l", "naive-r"}
+    The specifier for horizontal alignment of sparsified LaTeX multicolumns. Pipe
+    decorators can also be added to non-naive values to draw vertical
+    rules, e.g. "\|r" will draw a rule on the left side of right aligned merged cells.
+"""
+
+styler_hrules = """
+: bool
+    Whether to add horizontal rules on top and bottom and below the headers.
+"""
+
+styler_environment = """
+: str
+    The environment to replace ``\\begin{table}``. If "longtable" is used results
+    in a specific longtable environment format.
+"""
+
+styler_encoding = """
+: str
+    The encoding used for output HTML and LaTeX files.
+"""
+
+styler_mathjax = """
+: bool
+    If False will render special CSS classes to table attributes that indicate Mathjax
+    will not be used in Jupyter Notebook.
+"""
+
+with cf.config_prefix("styler"):
+    cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=is_bool)
+
+    cf.register_option(
+        "sparse.columns", True, styler_sparse_columns_doc, validator=is_bool
+    )
+
+    cf.register_option(
+        "render.repr",
+        "html",
+        styler_render_repr,
+        validator=is_one_of_factory(["html", "latex"]),
+    )
+
+    cf.register_option(
+        "render.max_elements",
+        2**18,
+        styler_max_elements,
+        validator=is_nonnegative_int,
+    )
+
+    cf.register_option(
+        "render.max_rows",
+        None,
+        styler_max_rows,
+        validator=is_nonnegative_int,
+    )
+
+    cf.register_option(
+        "render.max_columns",
+        None,
+        styler_max_columns,
+        validator=is_nonnegative_int,
+    )
+
+    cf.register_option("render.encoding", "utf-8", styler_encoding, validator=is_str)
+
+    cf.register_option("format.decimal", ".", styler_decimal, validator=is_str)
+
+    cf.register_option(
+        "format.precision", 6, styler_precision, validator=is_nonnegative_int
+    )
+
+    cf.register_option(
+        "format.thousands",
+        None,
+        styler_thousands,
+        validator=is_instance_factory((type(None), str)),
+    )
+
+    cf.register_option(
+        "format.na_rep",
+        None,
+        styler_na_rep,
+        validator=is_instance_factory((type(None), str)),
+    )
+
+    cf.register_option(
+        "format.escape",
+        None,
+        styler_escape,
+        validator=is_one_of_factory([None, "html", "latex", "latex-math"]),
+    )
+
+    # error: Argument 1 to "is_instance_factory" has incompatible type "tuple[
+    # ..., <typing special form>, ...]"; expected "type | tuple[type, ...]"
+    cf.register_option(
+        "format.formatter",
+        None,
+        styler_formatter,
+        validator=is_instance_factory(
+            (type(None), dict, Callable, str)  # type: ignore[arg-type]
+        ),
+    )
+
+    cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool)
+
+    cf.register_option(
+        "latex.multirow_align",
+        "c",
+        styler_multirow_align,
+        validator=is_one_of_factory(["c", "t", "b", "naive"]),
+    )
+
+    val_mca = ["r", "|r|", "|r", "r|", "c", "|c|", "|c", "c|", "l", "|l|", "|l", "l|"]
+    val_mca += ["naive-l", "naive-r"]
+    cf.register_option(
+        "latex.multicol_align",
+        "r",
+        styler_multicol_align,
+        validator=is_one_of_factory(val_mca),
+    )
+
+    cf.register_option("latex.hrules", False, styler_hrules, validator=is_bool)
+
+    cf.register_option(
+        "latex.environment",
+        None,
+        styler_environment,
+        validator=is_instance_factory((type(None), str)),
+    )
+
+
+with cf.config_prefix("future"):
+    cf.register_option(
+        "infer_string",
+        False if os.environ.get("PANDAS_FUTURE_INFER_STRING", "1") == "0" else True,
+        "Whether to infer sequence of str objects as pyarrow string "
+        "dtype, which will be the default in pandas 3.0 "
+        "(at which point this option will be deprecated).",
+        validator=is_one_of_factory([True, False]),
+    )
+
+    cf.register_option(
+        "no_silent_downcasting",
+        False,
+        "This option is deprecated and will be removed in a future version. "
+        "It has no effect.",
+        validator=is_one_of_factory([True, False]),
+    )
+
+    cf.register_option(
+        "distinguish_nan_and_na",
+        os.environ.get("PANDAS_FUTURE_DISTINGUISH_NAN_AND_NA", "0") == "1",
+        "Whether to treat NaN entries as distinct from pd.NA in "
+        "numpy-nullable and pyarrow float dtypes. By default treats both "
+        "interchangeable as missing values (NaN will be coerced to NA). "
+        "See discussion in "
+        "https://github.com/pandas-dev/pandas/issues/32265",
+        validator=is_one_of_factory([True, False]),
+    )
+
+    cf.register_option(
+        "python_scalars",
+        False if os.environ.get("PANDAS_FUTURE_PYTHON_SCALARS", "0") == "0" else True,
+        "Whether to return Python scalars instead of NumPy or PyArrow scalars. "
+        "Currently experimental, setting to True is not recommended for end users.",
+        validator=is_one_of_factory([True, False]),
+    )
+
+
+# GH#59502
+cf.deprecate_option("future.no_silent_downcasting", Pandas4Warning)
+cf.deprecate_option(
+    "mode.copy_on_write",
+    Pandas4Warning,
+    msg=(
+        "The 'mode.copy_on_write' option is deprecated. Copy-on-Write can no longer "
+        "be disabled (it is always enabled with pandas >= 3.0), and setting the option "
+        "has no impact. This option will be removed in pandas 4.0."
+    ),
+)
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
new file mode 100644
index 0000000000000000000000000000000000000000..953309e03fac8b5c722663477af6db46d8be4f94
--- /dev/null
+++ b/pandas/core/construction.py
@@ -0,0 +1,852 @@
+"""
+Constructor functions intended to be shared by pd.array, Series.__init__,
+and Index.__new__.
+
+These should not depend on core.internals.
+"""
+
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    cast,
+    overload,
+)
+
+import numpy as np
+from numpy import ma
+
+from pandas._config import using_string_dtype
+
+from pandas._libs import lib
+from pandas._libs.tslibs import (
+    get_supported_dtype,
+    is_supported_dtype,
+)
+from pandas.util._decorators import set_module
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.cast import (
+    construct_1d_arraylike_from_scalar,
+    construct_1d_object_array_from_listlike,
+    maybe_cast_to_datetime,
+    maybe_cast_to_integer_array,
+    maybe_convert_platform,
+    maybe_promote,
+)
+from pandas.core.dtypes.common import (
+    ensure_object,
+    is_list_like,
+    is_object_dtype,
+    pandas_dtype,
+)
+from pandas.core.dtypes.dtypes import NumpyEADtype
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCExtensionArray,
+    ABCIndex,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import isna
+
+import pandas.core.common as com
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from pandas._typing import (
+        AnyArrayLike,
+        ArrayLike,
+        Dtype,
+        DtypeObj,
+        T,
+    )
+
+    from pandas import (
+        Index,
+        Series,
+    )
+    from pandas.core.arrays import (
+        DatetimeArray,
+        ExtensionArray,
+        TimedeltaArray,
+    )
+
+
+@set_module("pandas")
+def array(
+    data: Sequence[object] | AnyArrayLike,
+    dtype: Dtype | None = None,
+    copy: bool = True,
+) -> ExtensionArray:
+    """
+    Create an array.
+
+    This method constructs an array using pandas extension types when possible.
+    If `dtype` is specified, it determines the type of array returned. Otherwise,
+    pandas attempts to infer the appropriate dtype based on `data`.
+
+    Parameters
+    ----------
+    data : Sequence of objects
+        The scalars inside `data` should be instances of the
+        scalar type for `dtype`. It's expected that `data`
+        represents a 1-dimensional array of data.
+
+        When `data` is an Index or Series, the underlying array
+        will be extracted from `data`.
+
+    dtype : str, np.dtype, or ExtensionDtype, optional
+        The dtype to use for the array. This may be a NumPy
+        dtype or an extension type registered with pandas using
+        :meth:`pandas.api.extensions.register_extension_dtype`.
+
+        If not specified, there are two possibilities:
+
+        1. When `data` is a :class:`Series`, :class:`Index`, or
+           :class:`ExtensionArray`, the `dtype` will be taken
+           from the data.
+        2. Otherwise, pandas will attempt to infer the `dtype`
+           from the data.
+
+        Note that when `data` is a NumPy array, ``data.dtype`` is
+        *not* used for inferring the array type. This is because
+        NumPy cannot represent all the types of data that can be
+        held in extension arrays.
+
+        Currently, pandas will infer an extension dtype for sequences of
+
+        ============================== =======================================
+        Scalar Type                    Array Type
+        ============================== =======================================
+        :class:`pandas.Interval`       :class:`pandas.arrays.IntervalArray`
+        :class:`pandas.Period`         :class:`pandas.arrays.PeriodArray`
+        :class:`datetime.datetime`     :class:`pandas.arrays.DatetimeArray`
+        :class:`datetime.timedelta`    :class:`pandas.arrays.TimedeltaArray`
+        :class:`int`                   :class:`pandas.arrays.IntegerArray`
+        :class:`float`                 :class:`pandas.arrays.FloatingArray`
+        :class:`str`                   :class:`pandas.arrays.StringArray` or
+                                       :class:`pandas.arrays.ArrowStringArray`
+        :class:`bool`                  :class:`pandas.arrays.BooleanArray`
+        ============================== =======================================
+
+        The ExtensionArray created when the scalar type is :class:`str` is determined by
+        ``pd.options.mode.string_storage`` if the dtype is not explicitly given.
+
+        For all other cases, NumPy's usual inference rules will be used.
+    copy : bool, default True
+        Whether to copy the data, even if not necessary. Depending
+        on the type of `data`, creating the new array may require
+        copying data, even if ``copy=False``.
+
+    Returns
+    -------
+    ExtensionArray
+        The newly created array.
+
+    Raises
+    ------
+    ValueError
+        When `data` is not 1-dimensional.
+
+    See Also
+    --------
+    numpy.array : Construct a NumPy array.
+    Series : Construct a pandas Series.
+    Index : Construct a pandas Index.
+    arrays.NumpyExtensionArray : ExtensionArray wrapping a NumPy array.
+    Series.array : Extract the array stored within a Series.
+
+    Notes
+    -----
+    Omitting the `dtype` argument means pandas will attempt to infer the
+    best array type from the values in the data. As new array types are
+    added by pandas and 3rd party libraries, the "best" array type may
+    change. We recommend specifying `dtype` to ensure that
+
+    1. the correct array type for the data is returned
+    2. the returned array type doesn't change as new extension types
+       are added by pandas and third-party libraries
+
+    Additionally, if the underlying memory representation of the returned
+    array matters, we recommend specifying the `dtype` as a concrete object
+    rather than a string alias or allowing it to be inferred. For example,
+    a future version of pandas or a 3rd-party library may include a
+    dedicated ExtensionArray for string data. In this event, the following
+    would no longer return a :class:`arrays.NumpyExtensionArray` backed by a
+    NumPy array.
+
+    >>> pd.array(["a", "b"], dtype=str)
+    <ArrowStringArray>
+    ['a', 'b']
+    Length: 2, dtype: str
+
+    This would instead return the new ExtensionArray dedicated for string
+    data. If you really need the new array to be backed by a  NumPy array,
+    specify that in the dtype.
+
+    >>> pd.array(["a", "b"], dtype=np.dtype("<U1"))
+    <NumpyExtensionArray>
+    ['a', 'b']
+    Length: 2, dtype: str32
+
+    Finally, Pandas has arrays that mostly overlap with NumPy
+
+      * :class:`arrays.DatetimeArray`
+      * :class:`arrays.TimedeltaArray`
+
+    When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is
+    passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray``
+    rather than a ``NumpyExtensionArray``. This is for symmetry with the case of
+    timezone-aware data, which NumPy does not natively support.
+
+    >>> pd.array(["2015", "2016"], dtype="datetime64[ns]")
+    <DatetimeArray>
+    ['2015-01-01 00:00:00', '2016-01-01 00:00:00']
+    Length: 2, dtype: datetime64[ns]
+
+    >>> pd.array(["1h", "2h"], dtype="timedelta64[ns]")
+    <TimedeltaArray>
+    ['0 days 01:00:00', '0 days 02:00:00']
+    Length: 2, dtype: timedelta64[ns]
+
+    Examples
+    --------
+    If a dtype is not specified, pandas will infer the best dtype from the values.
+    See the description of `dtype` for the types pandas infers for.
+
+    >>> pd.array([1, 2])
+    <IntegerArray>
+    [1, 2]
+    Length: 2, dtype: Int64
+
+    >>> pd.array([1, 2, np.nan])
+    <IntegerArray>
+    [1, 2, <NA>]
+    Length: 3, dtype: Int64
+
+    >>> pd.array([1.1, 2.2])
+    <FloatingArray>
+    [1.1, 2.2]
+    Length: 2, dtype: Float64
+
+    >>> pd.array(["a", None, "c"])
+    <ArrowStringArray>
+    ['a', <NA>, 'c']
+    Length: 3, dtype: string
+
+    >>> with pd.option_context("string_storage", "python"):
+    ...     arr = pd.array(["a", None, "c"])
+    >>> arr
+    <StringArray>
+    ['a', <NA>, 'c']
+    Length: 3, dtype: string
+
+    >>> pd.array([pd.Period("2000", freq="D"), pd.Period("2000", freq="D")])
+    <PeriodArray>
+    ['2000-01-01', '2000-01-01']
+    Length: 2, dtype: period[D]
+
+    You can use the string alias for `dtype`
+
+    >>> pd.array(["a", "b", "a"], dtype="category")
+    ['a', 'b', 'a']
+    Categories (2, str): ['a', 'b']
+
+    Or specify the actual dtype
+
+    >>> pd.array(
+    ...     ["a", "b", "a"], dtype=pd.CategoricalDtype(["a", "b", "c"], ordered=True)
+    ... )
+    ['a', 'b', 'a']
+    Categories (3, str): ['a' < 'b' < 'c']
+
+    If pandas does not infer a dedicated extension type a
+    :class:`arrays.NumpyExtensionArray` is returned.
+
+    >>> pd.array([1 + 1j, 3 + 2j])
+    <NumpyExtensionArray>
+    [(1+1j), (3+2j)]
+    Length: 2, dtype: complex128
+
+    As mentioned in the "Notes" section, new extension types may be added
+    in the future (by pandas or 3rd party libraries), causing the return
+    value to no longer be a :class:`arrays.NumpyExtensionArray`. Specify the
+    `dtype` as a NumPy dtype if you need to ensure there's no future change in
+    behavior.
+
+    >>> pd.array([1, 2], dtype=np.dtype("int32"))
+    <NumpyExtensionArray>
+    [1, 2]
+    Length: 2, dtype: int32
+
+    `data` must be 1-dimensional. A ValueError is raised when the input
+    has the wrong dimensionality.
+
+    >>> pd.array(1)
+    Traceback (most recent call last):
+      ...
+    ValueError: Cannot pass scalar '1' to 'pandas.array'.
+    """
+    from pandas.core.arrays import (
+        BooleanArray,
+        DatetimeArray,
+        ExtensionArray,
+        FloatingArray,
+        IntegerArray,
+        NumpyExtensionArray,
+        TimedeltaArray,
+    )
+    from pandas.core.arrays.string_ import StringDtype
+
+    if lib.is_scalar(data):
+        msg = f"Cannot pass scalar '{data}' to 'pandas.array'."
+        raise ValueError(msg)
+    elif isinstance(data, ABCDataFrame):
+        raise TypeError("Cannot pass DataFrame to 'pandas.array'")
+
+    if dtype is None and isinstance(data, (ABCSeries, ABCIndex, ExtensionArray)):
+        # Note: we exclude np.ndarray here, will do type inference on it
+        dtype = data.dtype
+
+    data = extract_array(data, extract_numpy=True)
+
+    # this returns None for not-found dtypes.
+    if dtype is not None:
+        dtype = pandas_dtype(dtype)
+
+    if isinstance(data, ExtensionArray) and (dtype is None or data.dtype == dtype):
+        # e.g. TimedeltaArray[s], avoid casting to NumpyExtensionArray
+        if copy:
+            return data.copy()
+        return data
+
+    if isinstance(dtype, ExtensionDtype):
+        cls = dtype.construct_array_type()
+        return cls._from_sequence(data, dtype=dtype, copy=copy)
+
+    if dtype is None:
+        was_ndarray = isinstance(data, np.ndarray)
+        # error: Item "Sequence[object]" of "Sequence[object] | ExtensionArray |
+        # ndarray[Any, Any]" has no attribute "dtype"
+        if not was_ndarray or data.dtype == object:  # type: ignore[union-attr]
+            result = lib.maybe_convert_objects(
+                ensure_object(data),
+                convert_non_numeric=True,
+                convert_to_nullable_dtype=True,
+                dtype_if_all_nat=np.dtype("M8[s]"),
+            )
+            result = ensure_wrapped_if_datetimelike(result)
+            if isinstance(result, np.ndarray):
+                if len(result) == 0 and not was_ndarray:
+                    # e.g. empty list
+                    return FloatingArray._from_sequence(data, dtype="Float64")
+                return NumpyExtensionArray._from_sequence(
+                    data, dtype=result.dtype, copy=copy
+                )
+            if result is data and copy:
+                return result.copy()
+            return result
+
+        data = cast(np.ndarray, data)
+        result = ensure_wrapped_if_datetimelike(data)
+        if result is not data:
+            result = cast("DatetimeArray | TimedeltaArray", result)
+            if copy and result.dtype == data.dtype:
+                return result.copy()
+            return result
+
+        if data.dtype.kind in "SU":
+            # StringArray/ArrowStringArray depending on pd.options.mode.string_storage
+            dtype = StringDtype()
+            cls = dtype.construct_array_type()
+            return cls._from_sequence(data, dtype=dtype, copy=copy)
+
+        elif data.dtype.kind in "iu":
+            dtype = IntegerArray._dtype_cls._get_dtype_mapping()[data.dtype]
+            return IntegerArray._from_sequence(data, dtype=dtype, copy=copy)
+        elif data.dtype.kind == "f":
+            # GH#44715 Exclude np.float16 bc FloatingArray does not support it;
+            #  we will fall back to NumpyExtensionArray.
+            if data.dtype == np.float16:
+                return NumpyExtensionArray._from_sequence(
+                    data, dtype=data.dtype, copy=copy
+                )
+            dtype = FloatingArray._dtype_cls._get_dtype_mapping()[data.dtype]
+            return FloatingArray._from_sequence(data, dtype=dtype, copy=copy)
+
+        elif data.dtype.kind == "b":
+            return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)
+        else:
+            # e.g. complex
+            return NumpyExtensionArray._from_sequence(data, dtype=data.dtype, copy=copy)
+
+    # Pandas overrides NumPy for
+    #   1. datetime64[ns,us,ms,s]
+    #   2. timedelta64[ns,us,ms,s]
+    # so that a DatetimeArray is returned.
+    if lib.is_np_dtype(dtype, "M") and is_supported_dtype(dtype):
+        return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy)
+    if lib.is_np_dtype(dtype, "m") and is_supported_dtype(dtype):
+        return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy)
+
+    elif lib.is_np_dtype(dtype, "mM"):
+        raise ValueError(
+            # GH#53817
+            r"datetime64 and timedelta64 dtype resolutions other than "
+            r"'s', 'ms', 'us', and 'ns' are no longer supported."
+        )
+
+    return NumpyExtensionArray._from_sequence(data, dtype=dtype, copy=copy)
+
+
+_typs = frozenset(
+    {
+        "index",
+        "rangeindex",
+        "multiindex",
+        "datetimeindex",
+        "timedeltaindex",
+        "periodindex",
+        "categoricalindex",
+        "intervalindex",
+        "series",
+    }
+)
+
+
+@overload
+def extract_array(
+    obj: Series | Index, extract_numpy: bool = ..., extract_range: bool = ...
+) -> ArrayLike: ...
+
+
+@overload
+def extract_array(
+    obj: T, extract_numpy: bool = ..., extract_range: bool = ...
+) -> T | ArrayLike: ...
+
+
+def extract_array(
+    obj: T, extract_numpy: bool = False, extract_range: bool = False
+) -> T | ArrayLike:
+    """
+    Extract the ndarray or ExtensionArray from a Series or Index.
+
+    For all other types, `obj` is just returned as is.
+
+    Parameters
+    ----------
+    obj : object
+        For Series / Index, the underlying ExtensionArray is unboxed.
+
+    extract_numpy : bool, default False
+        Whether to extract the ndarray from a NumpyExtensionArray.
+
+    extract_range : bool, default False
+        If we have a RangeIndex, return range._values if True
+        (which is a materialized integer ndarray), otherwise return unchanged.
+
+    Returns
+    -------
+    arr : object
+
+    Examples
+    --------
+    >>> extract_array(pd.Series(["a", "b", "c"], dtype="category"))
+    ['a', 'b', 'c']
+    Categories (3, str): ['a', 'b', 'c']
+
+    Other objects like lists, arrays, and DataFrames are just passed through.
+
+    >>> extract_array([1, 2, 3])
+    [1, 2, 3]
+
+    For an ndarray-backed Series / Index the ndarray is returned.
+
+    >>> extract_array(pd.Series([1, 2, 3]))
+    array([1, 2, 3])
+
+    To extract all the way down to the ndarray, pass ``extract_numpy=True``.
+
+    >>> extract_array(pd.Series([1, 2, 3]), extract_numpy=True)
+    array([1, 2, 3])
+    """
+    typ = getattr(obj, "_typ", None)
+    if typ in _typs:
+        # i.e. isinstance(obj, (ABCIndex, ABCSeries))
+        if typ == "rangeindex":
+            if extract_range:
+                # error: "T" has no attribute "_values"
+                return obj._values  # type: ignore[attr-defined]
+            return obj
+
+        # error: "T" has no attribute "_values"
+        return obj._values  # type: ignore[attr-defined]
+
+    elif extract_numpy and typ == "npy_extension":
+        # i.e. isinstance(obj, ABCNumpyExtensionArray)
+        # error: "T" has no attribute "to_numpy"
+        return obj.to_numpy()  # type: ignore[attr-defined]
+
+    return obj
+
+
+def ensure_wrapped_if_datetimelike(arr):
+    """
+    Wrap datetime64 and timedelta64 ndarrays in DatetimeArray/TimedeltaArray.
+    """
+    if isinstance(arr, np.ndarray):
+        if arr.dtype.kind == "M":
+            from pandas.core.arrays import DatetimeArray
+
+            dtype = get_supported_dtype(arr.dtype)
+            return DatetimeArray._from_sequence(arr, dtype=dtype)
+
+        elif arr.dtype.kind == "m":
+            from pandas.core.arrays import TimedeltaArray
+
+            dtype = get_supported_dtype(arr.dtype)
+            return TimedeltaArray._from_sequence(arr, dtype=dtype)
+
+    return arr
+
+
+def sanitize_masked_array(data: ma.MaskedArray) -> np.ndarray:
+    """
+    Convert numpy MaskedArray to ensure mask is softened.
+    """
+    mask = ma.getmaskarray(data)
+    if mask.any():
+        dtype, fill_value = maybe_promote(data.dtype, np.nan)
+        dtype = cast(np.dtype, dtype)
+        data = ma.asarray(data.astype(dtype, copy=True))
+        data.soften_mask()  # set hardmask False if it was True
+        data[mask] = fill_value
+    else:
+        data = data.copy()
+    return data
+
+
+def sanitize_array(
+    data,
+    index: Index | None,
+    dtype: DtypeObj | None = None,
+    copy: bool = False,
+    *,
+    allow_2d: bool = False,
+) -> ArrayLike:
+    """
+    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
+    coerce to the dtype if specified.
+
+    Parameters
+    ----------
+    data : Any
+    index : Index or None, default None
+    dtype : np.dtype, ExtensionDtype, or None, default None
+    copy : bool, default False
+    allow_2d : bool, default False
+        If False, raise if we have a 2D Arraylike.
+
+    Returns
+    -------
+    np.ndarray or ExtensionArray
+    """
+    original_dtype = dtype
+    if isinstance(data, ma.MaskedArray):
+        data = sanitize_masked_array(data)
+
+    if isinstance(dtype, NumpyEADtype):
+        # Avoid ending up with a NumpyExtensionArray
+        dtype = dtype.numpy_dtype
+
+    infer_object = not isinstance(data, (ABCIndex, ABCSeries))
+
+    # extract ndarray or ExtensionArray, ensure we have no NumpyExtensionArray
+    data = extract_array(data, extract_numpy=True, extract_range=True)
+
+    if isinstance(data, np.ndarray) and data.ndim == 0:
+        if dtype is None:
+            dtype = data.dtype
+        data = lib.item_from_zerodim(data)
+    elif isinstance(data, range):
+        # GH#16804
+        data = range_to_ndarray(data)
+        copy = False
+
+    if not is_list_like(data):
+        if index is None:
+            raise ValueError("index must be specified when data is not list-like")
+        if isinstance(data, str) and using_string_dtype() and original_dtype is None:
+            from pandas.core.arrays.string_ import StringDtype
+
+            dtype = StringDtype(na_value=np.nan)
+        data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
+
+        return data
+
+    elif isinstance(data, ABCExtensionArray):
+        # it is already ensured above this is not a NumpyExtensionArray
+        # Until GH#49309 is fixed this check needs to come before the
+        #  ExtensionDtype check
+        if dtype is not None:
+            subarr = data.astype(dtype, copy=copy)
+        elif copy:
+            subarr = data.copy()
+        else:
+            subarr = data
+
+    elif isinstance(dtype, ExtensionDtype):
+        # create an extension array from its dtype
+        _sanitize_non_ordered(data)
+        cls = dtype.construct_array_type()
+        if not hasattr(data, "__array__"):
+            data = list(data)
+        subarr = cls._from_sequence(data, dtype=dtype, copy=copy)
+
+    # GH#846
+    elif isinstance(data, np.ndarray):
+        if isinstance(data, np.matrix):
+            data = data.A
+
+        if dtype is None:
+            subarr = data
+            if data.dtype == object and infer_object:
+                subarr = lib.maybe_convert_objects(
+                    data,
+                    # Here we do not convert numeric dtypes, as if we wanted that,
+                    #  numpy would have done it for us.
+                    convert_numeric=False,
+                    convert_non_numeric=True,
+                    convert_to_nullable_dtype=False,
+                    dtype_if_all_nat=np.dtype("M8[s]"),
+                )
+            elif data.dtype.kind == "U" and using_string_dtype():
+                from pandas.core.arrays.string_ import StringDtype
+
+                dtype = StringDtype(na_value=np.nan)
+                subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
+
+            if (
+                subarr is data
+                or (subarr.dtype == "str" and subarr.dtype.storage == "python")  # type: ignore[union-attr]
+            ) and copy:
+                subarr = subarr.copy()
+
+        else:
+            # we will try to copy by-definition here
+            subarr = _try_cast(data, dtype, copy)
+
+    elif hasattr(data, "__array__"):
+        # e.g. dask array GH#38645
+        if not copy:
+            data = np.asarray(data)
+        else:
+            data = np.array(data, copy=copy)
+        return sanitize_array(
+            data,
+            index=index,
+            dtype=dtype,
+            copy=False,
+            allow_2d=allow_2d,
+        )
+
+    else:
+        _sanitize_non_ordered(data)
+        # materialize e.g. generators, convert e.g. tuples, abc.ValueView
+        data = list(data)
+
+        if len(data) == 0 and dtype is None:
+            # We default to float64, matching numpy
+            subarr = np.array([], dtype=np.float64)
+
+        elif dtype is not None:
+            subarr = _try_cast(data, dtype, copy)
+
+        else:
+            subarr = maybe_convert_platform(data)
+            if subarr.dtype == object:
+                subarr = cast(np.ndarray, subarr)
+                subarr = lib.maybe_convert_objects(
+                    subarr,
+                    # Here we do not convert numeric dtypes, as if we wanted that,
+                    #  numpy would have done it for us.
+                    convert_numeric=False,
+                    convert_non_numeric=True,
+                    convert_to_nullable_dtype=False,
+                    dtype_if_all_nat=np.dtype("M8[s]"),
+                )
+
+    subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d)
+
+    if isinstance(subarr, np.ndarray):
+        # at this point we should have dtype be None or subarr.dtype == dtype
+        dtype = cast(np.dtype, dtype)
+        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)
+
+    return subarr
+
+
+def range_to_ndarray(rng: range) -> np.ndarray:
+    """
+    Cast a range object to ndarray.
+    """
+    # GH#30171 perf avoid realizing range as a list in np.array
+    try:
+        arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64")
+    except OverflowError:
+        # GH#30173 handling for ranges that overflow int64
+        if (rng.start >= 0 and rng.step > 0) or (rng.step < 0 <= rng.stop):
+            try:
+                arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64")
+            except OverflowError:
+                arr = construct_1d_object_array_from_listlike(list(rng))
+        else:
+            arr = construct_1d_object_array_from_listlike(list(rng))
+    return arr
+
+
+def _sanitize_non_ordered(data) -> None:
+    """
+    Raise only for unordered sets, e.g., not for dict_keys
+    """
+    if isinstance(data, (set, frozenset)):
+        raise TypeError(f"'{type(data).__name__}' type is unordered")
+
+
+def _sanitize_ndim(
+    result: ArrayLike,
+    data,
+    dtype: DtypeObj | None,
+    index: Index | None,
+    *,
+    allow_2d: bool = False,
+) -> ArrayLike:
+    """
+    Ensure we have a 1-dimensional result array.
+    """
+    if getattr(result, "ndim", 0) == 0:
+        raise ValueError("result should be arraylike with ndim > 0")
+
+    if result.ndim == 1:
+        # the result that we want
+        result = _maybe_repeat(result, index)
+
+    elif result.ndim > 1:
+        if isinstance(data, np.ndarray):
+            if allow_2d:
+                return result
+            raise ValueError(
+                f"Data must be 1-dimensional, got ndarray of shape {data.shape} instead"
+            )
+        if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype):
+            # i.e. NumpyEADtype("O")
+
+            result = com.asarray_tuplesafe(data, dtype=np.dtype("object"))
+            cls = dtype.construct_array_type()
+            result = cls._from_sequence(result, dtype=dtype)
+        else:
+            # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type
+            # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[str,
+            # dtype[Any], None]"
+            result = com.asarray_tuplesafe(data, dtype=dtype)  # type: ignore[arg-type]
+    return result
+
+
+def _sanitize_str_dtypes(
+    result: np.ndarray, data, dtype: np.dtype | None, copy: bool
+) -> np.ndarray:
+    """
+    Ensure we have a dtype that is supported by pandas.
+    """
+
+    # This is to prevent mixed-type Series getting all casted to
+    # NumPy string type, e.g. NaN --> '-1#IND'.
+    if issubclass(result.dtype.type, str):
+        # GH#16605
+        # If not empty convert the data to dtype
+        # GH#19853: If data is a scalar, result has already the result
+        if not lib.is_scalar(data):
+            if not np.all(isna(data)):
+                data = np.asarray(data, dtype=dtype)
+            if not copy:
+                result = np.asarray(data, dtype=object)
+            else:
+                result = np.array(data, dtype=object, copy=copy)
+    return result
+
+
+def _maybe_repeat(arr: ArrayLike, index: Index | None) -> ArrayLike:
+    """
+    If we have a length-1 array and an index describing how long we expect
+    the result to be, repeat the array.
+    """
+    if index is not None:
+        if 1 == len(arr) != len(index):
+            arr = arr.repeat(len(index))
+    return arr
+
+
+def _try_cast(
+    arr: list | np.ndarray,
+    dtype: np.dtype,
+    copy: bool,
+) -> ArrayLike:
+    """
+    Convert input to numpy ndarray and optionally cast to a given dtype.
+
+    Parameters
+    ----------
+    arr : ndarray or list
+        Excludes: ExtensionArray, Series, Index.
+    dtype : np.dtype
+    copy : bool
+        If False, don't copy the data if not needed.
+
+    Returns
+    -------
+    np.ndarray or ExtensionArray
+    """
+    is_ndarray = isinstance(arr, np.ndarray)
+
+    if dtype == object:
+        if not is_ndarray:
+            subarr = construct_1d_object_array_from_listlike(arr)
+            return subarr
+        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)
+
+    elif dtype.kind == "U":
+        # TODO: test cases with arr.dtype.kind in "mM"
+        if is_ndarray:
+            arr = cast(np.ndarray, arr)
+            shape = arr.shape
+            if arr.ndim > 1:
+                arr = arr.ravel()
+        else:
+            shape = (len(arr),)
+        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy).reshape(
+            shape
+        )
+
+    elif dtype.kind in "mM":
+        if is_ndarray:
+            arr = cast(np.ndarray, arr)
+            if arr.ndim == 2 and arr.shape[1] == 1:
+                # GH#60081: DataFrame Constructor converts 1D data to array of
+                # shape (N, 1), but maybe_cast_to_datetime assumes 1D input
+                return maybe_cast_to_datetime(arr[:, 0], dtype).reshape(arr.shape)
+        return maybe_cast_to_datetime(arr, dtype)
+
+    # GH#15832: Check if we are requesting a numeric dtype and
+    # that we can convert the data to the requested dtype.
+    elif dtype.kind in "iu":
+        # this will raise if we have e.g. floats
+
+        subarr = maybe_cast_to_integer_array(arr, dtype)
+    elif not copy:
+        subarr = np.asarray(arr, dtype=dtype)
+    else:
+        subarr = np.array(arr, dtype=dtype, copy=copy)
+
+    return subarr
diff --git a/pandas/core/flags.py b/pandas/core/flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6088e3f40b1be470cf2e0ef138355d2f0239031
--- /dev/null
+++ b/pandas/core/flags.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+import weakref
+
+from pandas.util._decorators import set_module
+
+if TYPE_CHECKING:
+    from pandas.core.generic import NDFrame
+
+
+@set_module("pandas")
+class Flags:
+    """
+    Flags that apply to pandas objects.
+
+    “Flags” differ from “metadata”. Flags reflect properties of the pandas
+    object (the Series or DataFrame). Metadata refer to properties of the
+    dataset, and should be stored in DataFrame.attrs.
+
+    Parameters
+    ----------
+    obj : Series or DataFrame
+        The object these flags are associated with.
+    allows_duplicate_labels : bool, default True
+        Whether to allow duplicate labels in this object. By default,
+        duplicate labels are permitted. Setting this to ``False`` will
+        cause an :class:`errors.DuplicateLabelError` to be raised when
+        `index` (or columns for DataFrame) is not unique, or any
+        subsequent operation on introduces duplicates.
+        See :ref:`duplicates.disallow` for more.
+
+        .. warning::
+
+           This is an experimental feature. Currently, many methods fail to
+           propagate the ``allows_duplicate_labels`` value. In future versions
+           it is expected that every method taking or returning one or more
+           DataFrame or Series objects will propagate ``allows_duplicate_labels``.
+
+    See Also
+    --------
+    DataFrame.attrs : Dictionary of global attributes of this dataset.
+    Series.attrs : Dictionary of global attributes of this dataset.
+
+    Examples
+    --------
+    Attributes can be set in two ways:
+
+    >>> df = pd.DataFrame()
+    >>> df.flags
+    <Flags(allows_duplicate_labels=True)>
+    >>> df.flags.allows_duplicate_labels = False
+    >>> df.flags
+    <Flags(allows_duplicate_labels=False)>
+
+    >>> df.flags["allows_duplicate_labels"] = True
+    >>> df.flags
+    <Flags(allows_duplicate_labels=True)>
+    """
+
+    _keys: set[str] = {"allows_duplicate_labels"}
+
+    def __init__(self, obj: NDFrame, *, allows_duplicate_labels: bool) -> None:
+        self._allows_duplicate_labels = allows_duplicate_labels
+        self._obj = weakref.ref(obj)
+
+    @property
+    def allows_duplicate_labels(self) -> bool:
+        """
+        Whether this object allows duplicate labels.
+
+        Setting ``allows_duplicate_labels=False`` ensures that the
+        index (and columns of a DataFrame) are unique. Most methods
+        that accept and return a Series or DataFrame will propagate
+        the value of ``allows_duplicate_labels``.
+
+        See :ref:`duplicates` for more.
+
+        See Also
+        --------
+        DataFrame.attrs : Set global metadata on this object.
+        DataFrame.set_flags : Set global flags on this object.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2]}, index=["a", "a"])
+        >>> df.flags.allows_duplicate_labels
+        True
+        >>> df.flags.allows_duplicate_labels = False
+        Traceback (most recent call last):
+            ...
+        pandas.errors.DuplicateLabelError: Index has duplicates.
+              positions
+        label
+        a        [0, 1]
+        """
+        return self._allows_duplicate_labels
+
+    @allows_duplicate_labels.setter
+    def allows_duplicate_labels(self, value: bool) -> None:
+        value = bool(value)
+        obj = self._obj()
+        if obj is None:
+            raise ValueError("This flag's object has been deleted.")
+
+        if not value:
+            for ax in obj.axes:
+                ax._maybe_check_unique()
+
+        self._allows_duplicate_labels = value
+
+    def __getitem__(self, key: str):
+        if key not in self._keys:
+            raise KeyError(key)
+
+        return getattr(self, key)
+
+    def __setitem__(self, key: str, value) -> None:
+        if key not in self._keys:
+            raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}")
+        setattr(self, key, value)
+
+    def __repr__(self) -> str:
+        return f"<Flags(allows_duplicate_labels={self.allows_duplicate_labels})>"
+
+    def __eq__(self, other: object) -> bool:
+        if isinstance(other, type(self)):
+            return self.allows_duplicate_labels == other.allows_duplicate_labels
+        return False
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
new file mode 100644
index 0000000000000000000000000000000000000000..79e3cf246fe262487f97cb8181dda466951d74fe
--- /dev/null
+++ b/pandas/core/frame.py
@@ -0,0 +1,16710 @@
+"""
+DataFrame
+---------
+An efficient 2D container for potentially mixed-type time series or other
+labeled data series.
+
+Similar to its R counterpart, data.frame, except providing automatic data
+alignment and a host of useful data manipulation methods having to do with the
+labeling information
+"""
+
+from __future__ import annotations
+
+import collections
+from collections import abc
+from collections.abc import (
+    Callable,
+    Hashable,
+    Iterable,
+    Iterator,
+    Mapping,
+    Sequence,
+)
+import functools
+from io import StringIO
+import itertools
+import operator
+import sys
+from textwrap import dedent
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    Self,
+    cast,
+    overload,
+)
+import warnings
+
+import numpy as np
+from numpy import ma
+
+from pandas._config import get_option
+
+from pandas._libs import (
+    algos as libalgos,
+    lib,
+    properties,
+)
+from pandas._libs.hashtable import duplicated
+from pandas._libs.lib import is_range_indexer
+from pandas.compat import CHAINED_WARNING_DISABLED
+from pandas.compat._constants import (
+    REF_COUNT,
+    REF_COUNT_METHOD,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.compat.numpy import function as nv
+from pandas.errors import (
+    ChainedAssignmentError,
+    InvalidIndexError,
+    Pandas4Warning,
+)
+from pandas.errors.cow import (
+    _chained_assignment_method_update_msg,
+    _chained_assignment_msg,
+)
+from pandas.util._decorators import (
+    Appender,
+    Substitution,
+    deprecate_nonkeyword_arguments,
+    set_module,
+)
+from pandas.util._exceptions import (
+    find_stack_level,
+)
+from pandas.util._validators import (
+    validate_ascending,
+    validate_bool_kwarg,
+    validate_percentile,
+)
+
+from pandas.core.dtypes.cast import (
+    LossySetitemError,
+    can_hold_element,
+    construct_1d_arraylike_from_scalar,
+    construct_2d_arraylike_from_scalar,
+    find_common_type,
+    infer_dtype_from_scalar,
+    invalidate_string_dtypes,
+    maybe_downcast_to_dtype,
+    maybe_unbox_numpy_scalar,
+)
+from pandas.core.dtypes.common import (
+    infer_dtype_from_object,
+    is_1d_only_ea_dtype,
+    is_array_like,
+    is_bool_dtype,
+    is_dataclass,
+    is_dict_like,
+    is_float,
+    is_float_dtype,
+    is_hashable,
+    is_integer,
+    is_integer_dtype,
+    is_iterator,
+    is_list_like,
+    is_scalar,
+    is_sequence,
+    is_string_dtype,
+    needs_i8_conversion,
+    pandas_dtype,
+)
+from pandas.core.dtypes.concat import concat_compat
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    BaseMaskedDtype,
+    ExtensionDtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCIndex,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    notna,
+)
+
+from pandas.core import (
+    algorithms,
+    common as com,
+    nanops,
+    ops,
+    roperator,
+)
+from pandas.core.accessor import Accessor
+from pandas.core.apply import reconstruct_and_relabel_result
+from pandas.core.array_algos.take import take_2d_multi
+from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays import (
+    BaseMaskedArray,
+    DatetimeArray,
+    ExtensionArray,
+    PeriodArray,
+    TimedeltaArray,
+)
+from pandas.core.arrays.sparse import SparseFrameAccessor
+from pandas.core.arrays.string_ import StringDtype
+from pandas.core.construction import (
+    ensure_wrapped_if_datetimelike,
+    sanitize_array,
+    sanitize_masked_array,
+)
+from pandas.core.generic import NDFrame
+from pandas.core.indexers import check_key_length
+from pandas.core.indexes.api import (
+    DatetimeIndex,
+    Index,
+    PeriodIndex,
+    default_index,
+    ensure_index,
+    ensure_index_from_sequences,
+)
+from pandas.core.indexes.multi import (
+    MultiIndex,
+    maybe_droplevels,
+)
+from pandas.core.indexing import (
+    check_bool_indexer,
+    check_dict_or_set_indexers,
+)
+from pandas.core.internals import BlockManager
+from pandas.core.internals.construction import (
+    arrays_to_mgr,
+    dataclasses_to_dicts,
+    dict_to_mgr,
+    ndarray_to_mgr,
+    nested_data_to_arrays,
+    rec_array_to_mgr,
+    reorder_arrays,
+    to_arrays,
+    treat_as_nested,
+)
+from pandas.core.methods import selectn
+from pandas.core.reshape.melt import melt
+from pandas.core.series import Series
+from pandas.core.shared_docs import _shared_docs
+from pandas.core.sorting import (
+    get_group_index,
+    lexsort_indexer,
+    nargsort,
+)
+
+from pandas.io.common import get_handle
+from pandas.io.formats import (
+    console,
+    format as fmt,
+)
+from pandas.io.formats.info import DataFrameInfo
+import pandas.plotting
+
+if TYPE_CHECKING:
+    import datetime
+
+    from pandas._libs.internals import BlockValuesRefs
+    from pandas._typing import (
+        AggFuncType,
+        AnyAll,
+        AnyArrayLike,
+        ArrayLike,
+        ArrowArrayExportable,
+        ArrowStreamExportable,
+        Axes,
+        Axis,
+        AxisInt,
+        ColspaceArgType,
+        CompressionOptions,
+        CorrelationMethod,
+        DropKeep,
+        Dtype,
+        DtypeObj,
+        FilePath,
+        FloatFormatType,
+        FormattersType,
+        Frequency,
+        FromDictOrient,
+        HashableT,
+        HashableT2,
+        IgnoreRaise,
+        IndexKeyFunc,
+        IndexLabel,
+        JoinValidate,
+        Level,
+        ListLike,
+        MergeHow,
+        MergeValidate,
+        MutableMappingT,
+        NaPosition,
+        NsmallestNlargestKeep,
+        ParquetCompressionOptions,
+        PythonFuncType,
+        QuantileInterpolation,
+        ReadBuffer,
+        ReindexMethod,
+        Renamer,
+        Scalar,
+        SequenceNotStr,
+        SortKind,
+        StorageOptions,
+        Suffixes,
+        T,
+        ToStataByteorder,
+        ToTimestampHow,
+        UpdateJoin,
+        ValueKeyFunc,
+        WriteBuffer,
+        XMLParsers,
+        npt,
+    )
+
+    from pandas.core.groupby.generic import DataFrameGroupBy
+    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
+    from pandas.core.internals.managers import SingleBlockManager
+
+    from pandas.io.formats.style import Styler
+
+# ---------------------------------------------------------------------
+# Docstring templates
+
+_shared_doc_kwargs = {
+    "axes": "index, columns",
+    "klass": "DataFrame",
+    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
+    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
+        If 0 or 'index': apply function to each column.
+        If 1 or 'columns': apply function to each row.""",
+    "inplace": """
+    inplace : bool, default False
+        Whether to modify the DataFrame rather than creating a new one.""",
+    "optional_by": """
+by : str or list of str
+    Name or list of names to sort by.
+
+    - if `axis` is 0 or `'index'` then `by` may contain index
+      levels and/or column labels.
+    - if `axis` is 1 or `'columns'` then `by` may contain column
+      levels and/or index labels.""",
+    "optional_reindex": """
+labels : array-like, optional
+    New labels / index to conform the axis specified by 'axis' to.
+index : array-like, optional
+    New labels for the index. Preferably an Index object to avoid
+    duplicating data.
+columns : array-like, optional
+    New labels for the columns. Preferably an Index object to avoid
+    duplicating data.
+axis : int or str, optional
+    Axis to target. Can be either the axis name ('index', 'columns')
+    or number (0, 1).""",
+}
+
+_merge_doc = """
+Merge DataFrame or named Series objects with a database-style join.
+
+A named Series object is treated as a DataFrame with a single named column.
+
+The join is done on columns or indexes. If joining columns on
+columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
+on indexes or indexes on a column or columns, the index will be passed on.
+When performing a cross merge, no column specifications to merge on are
+allowed.
+
+.. warning::
+
+    If both key columns contain rows where the key is a null value, those
+    rows will be matched against each other. This is different from usual SQL
+    join behaviour and can lead to unexpected results.
+
+Parameters
+----------%s
+right : DataFrame or named Series
+    Object to merge with.
+how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
+    default 'inner'
+    Type of merge to be performed.
+
+    * left: use only keys from left frame, similar to a SQL left outer join;
+      preserve key order.
+    * right: use only keys from right frame, similar to a SQL right outer join;
+      preserve key order.
+    * outer: use union of keys from both frames, similar to a SQL full outer
+      join; sort keys lexicographically.
+    * inner: use intersection of keys from both frames, similar to a SQL inner
+      join; preserve the order of the left keys.
+    * cross: creates the cartesian product from both frames, preserves the order
+      of the left keys.
+    * left_anti: use only keys from left frame that are not in right frame, similar
+      to SQL left anti join; preserve key order.
+
+      .. versionadded:: 3.0
+    * right_anti: use only keys from right frame that are not in left frame, similar
+      to SQL right anti join; preserve key order.
+
+      .. versionadded:: 3.0
+on : Hashable or a sequence of the previous
+    Column or index level names to join on. These must be found in both
+    DataFrames. If `on` is None and not merging on indexes then this defaults
+    to the intersection of the columns in both DataFrames.
+left_on : Hashable or a sequence of the previous, or array-like
+    Column or index level names to join on in the left DataFrame. Can also
+    be an array or list of arrays of the length of the left DataFrame.
+    These arrays are treated as if they are columns.
+right_on : Hashable or a sequence of the previous, or array-like
+    Column or index level names to join on in the right DataFrame. Can also
+    be an array or list of arrays of the length of the right DataFrame.
+    These arrays are treated as if they are columns.
+left_index : bool, default False
+    Use the index from the left DataFrame as the join key(s). If it is a
+    MultiIndex, the number of keys in the other DataFrame (either the index
+    or a number of columns) must match the number of levels.
+right_index : bool, default False
+    Use the index from the right DataFrame as the join key. Same caveats as
+    left_index.
+sort : bool, default False
+    Sort the join keys lexicographically in the result DataFrame. If False,
+    the order of the join keys depends on the join type (how keyword).
+suffixes : list-like, default is ("_x", "_y")
+    A length-2 sequence where each element is optionally a string
+    indicating the suffix to add to overlapping column names in
+    `left` and `right` respectively. Pass a value of `None` instead
+    of a string to indicate that the column name from `left` or
+    `right` should be left as-is, with no suffix. At least one of the
+    values must not be None.
+copy : bool, default False
+    This keyword is now ignored; changing its value will have no
+    impact on the method.
+
+    .. deprecated:: 3.0.0
+
+        This keyword is ignored and will be removed in pandas 4.0. Since
+        pandas 3.0, this method always returns a new object using a lazy
+        copy mechanism that defers copies until necessary
+        (Copy-on-Write). See the `user guide on Copy-on-Write
+        <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+        for more details.
+
+indicator : bool or str, default False
+    If True, adds a column to the output DataFrame called "_merge" with
+    information on the source of each row. The column can be given a different
+    name by providing a string argument. The column will have a Categorical
+    type with the value of "left_only" for observations whose merge key only
+    appears in the left DataFrame, "right_only" for observations
+    whose merge key only appears in the right DataFrame, and "both"
+    if the observation's merge key is found in both DataFrames.
+
+validate : str, optional
+    If specified, checks if merge is of specified type.
+
+    * "one_to_one" or "1:1": check if merge keys are unique in both
+      left and right datasets.
+    * "one_to_many" or "1:m": check if merge keys are unique in left
+      dataset.
+    * "many_to_one" or "m:1": check if merge keys are unique in right
+      dataset.
+    * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+Returns
+-------
+DataFrame
+    A DataFrame of the two merged objects.
+
+See Also
+--------
+merge_ordered : Merge with optional filling/interpolation.
+merge_asof : Merge on nearest keys.
+DataFrame.join : Similar method using indices.
+
+Examples
+--------
+>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+...                     'value': [1, 2, 3, 5]})
+>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+...                     'value': [5, 6, 7, 8]})
+>>> df1
+    lkey value
+0   foo      1
+1   bar      2
+2   baz      3
+3   foo      5
+>>> df2
+    rkey value
+0   foo      5
+1   bar      6
+2   baz      7
+3   foo      8
+
+Merge df1 and df2 on the lkey and rkey columns. The value columns have
+the default suffixes, _x and _y, appended.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey')
+  lkey  value_x rkey  value_y
+0  foo        1  foo        5
+1  foo        1  foo        8
+2  bar        2  bar        6
+3  baz        3  baz        7
+4  foo        5  foo        5
+5  foo        5  foo        8
+
+Merge DataFrames df1 and df2 with specified left and right suffixes
+appended to any overlapping columns.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey',
+...           suffixes=('_left', '_right'))
+  lkey  value_left rkey  value_right
+0  foo           1  foo            5
+1  foo           1  foo            8
+2  bar           2  bar            6
+3  baz           3  baz            7
+4  foo           5  foo            5
+5  foo           5  foo            8
+
+Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
+any overlapping columns.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
+Traceback (most recent call last):
+...
+ValueError: columns overlap but no suffix specified:
+    Index(['value'], dtype='object')
+
+>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
+>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
+>>> df1
+      a  b
+0   foo  1
+1   bar  2
+>>> df2
+      a  c
+0   foo  3
+1   baz  4
+
+>>> df1.merge(df2, how='inner', on='a')
+      a  b  c
+0   foo  1  3
+
+>>> df1.merge(df2, how='left', on='a')
+      a  b  c
+0   foo  1  3.0
+1   bar  2  NaN
+
+>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
+>>> df2 = pd.DataFrame({'right': [7, 8]})
+>>> df1
+    left
+0   foo
+1   bar
+>>> df2
+    right
+0   7
+1   8
+
+>>> df1.merge(df2, how='cross')
+   left  right
+0   foo      7
+1   foo      8
+2   bar      7
+3   bar      8
+"""
+
+
+# -----------------------------------------------------------------------
+# DataFrame class
+
+
+@set_module("pandas")
+class DataFrame(NDFrame, OpsMixin):
+    """
+    Two-dimensional, size-mutable, potentially heterogeneous tabular data.
+
+    Data structure also contains labeled axes (rows and columns).
+    Arithmetic operations align on both row and column labels. Can be
+    thought of as a dict-like container for Series objects. The primary
+    pandas data structure.
+
+    Parameters
+    ----------
+    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
+        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
+        data is a dict, column order follows insertion-order. If a dict contains Series
+        which have an index defined, it is aligned by its index. This alignment also
+        occurs if data is a Series or a DataFrame itself. Alignment is done on
+        Series/DataFrame inputs.
+
+        If data is a list of dicts, column order follows insertion-order.
+
+    index : Index or array-like
+        Index to use for resulting frame. Will default to RangeIndex if
+        no indexing information part of input data and no index provided.
+    columns : Index or array-like
+        Column labels to use for resulting frame when data does not have them,
+        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
+        will perform column selection instead.
+    dtype : dtype, default None
+        Data type to force. Only a single dtype is allowed. If None, infer.
+        If ``data`` is DataFrame then is ignored.
+    copy : bool or None, default None
+        Copy data from inputs.
+        For dict data, the default of None behaves like ``copy=True``.  For DataFrame
+        or 2d ndarray input, the default of None behaves like ``copy=False``.
+        If data is a dict containing one or more Series (possibly of different dtypes),
+        ``copy=False`` will ensure that these inputs are not copied.
+
+    See Also
+    --------
+    DataFrame.from_records : Constructor from tuples, also record arrays.
+    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
+    read_csv : Read a comma-separated values (csv) file into DataFrame.
+    read_table : Read general delimited file into DataFrame.
+    read_clipboard : Read text from clipboard into DataFrame.
+
+    Notes
+    -----
+    Please reference the :ref:`User Guide <basics.dataframe>` for more information.
+
+    Examples
+    --------
+    Constructing DataFrame from a dictionary.
+
+    >>> d = {"col1": [1, 2], "col2": [3, 4]}
+    >>> df = pd.DataFrame(data=d)
+    >>> df
+       col1  col2
+    0     1     3
+    1     2     4
+
+    Notice that the inferred dtype is int64.
+
+    >>> df.dtypes
+    col1    int64
+    col2    int64
+    dtype: object
+
+    To enforce a single dtype:
+
+    >>> df = pd.DataFrame(data=d, dtype=np.int8)
+    >>> df.dtypes
+    col1    int8
+    col2    int8
+    dtype: object
+
+    Constructing DataFrame from a dictionary including Series:
+
+    >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])}
+    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
+       col1  col2
+    0     0   NaN
+    1     1   NaN
+    2     2   2.0
+    3     3   3.0
+
+    Constructing DataFrame from numpy ndarray:
+
+    >>> df2 = pd.DataFrame(
+    ...     np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"]
+    ... )
+    >>> df2
+       a  b  c
+    0  1  2  3
+    1  4  5  6
+    2  7  8  9
+
+    Constructing DataFrame from a numpy ndarray that has labeled columns:
+
+    >>> data = np.array(
+    ...     [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
+    ...     dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")],
+    ... )
+    >>> df3 = pd.DataFrame(data, columns=["c", "a"])
+    >>> df3
+       c  a
+    0  3  1
+    1  6  4
+    2  9  7
+
+    Constructing DataFrame from dataclass:
+
+    >>> from dataclasses import make_dataclass
+    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
+    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
+       x  y
+    0  0  0
+    1  0  3
+    2  2  3
+
+    Constructing DataFrame from Series/DataFrame:
+
+    >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
+    >>> df = pd.DataFrame(data=ser, index=["a", "c"])
+    >>> df
+       0
+    a  1
+    c  3
+
+    >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
+    >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
+    >>> df2
+       x
+    a  1
+    c  3
+    """
+
+    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
+    _typ = "dataframe"
+    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
+    _accessors: set[str] = {"sparse"}
+    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
+    _mgr: BlockManager
+
+    # similar to __array_priority__, positions DataFrame before Series, Index,
+    #  and ExtensionArray.  Should NOT be overridden by subclasses.
+    __pandas_priority__ = 4000
+
+    @property
+    def _constructor(self) -> type[DataFrame]:
+        return DataFrame
+
+    def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
+        df = DataFrame._from_mgr(mgr, axes=axes)
+
+        if type(self) is DataFrame:
+            # This would also work `if self._constructor is DataFrame`, but
+            #  this check is slightly faster, benefiting the most-common case.
+            return df
+
+        elif type(self).__name__ == "GeoDataFrame":
+            # Shim until geopandas can override their _constructor_from_mgr
+            #  bc they have different behavior for Managers than for DataFrames
+            return self._constructor(mgr)
+
+        # We assume that the subclass __init__ knows how to handle a
+        #  pd.DataFrame object.
+        return self._constructor(df)
+
+    _constructor_sliced: Callable[..., Series] = Series
+
+    def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
+        ser = Series._from_mgr(mgr, axes)
+        ser._name = None  # caller is responsible for setting real name
+
+        if type(self) is DataFrame:
+            # This would also work `if self._constructor_sliced is Series`, but
+            #  this check is slightly faster, benefiting the most-common case.
+            return ser
+
+        # We assume that the subclass __init__ knows how to handle a
+        #  pd.Series object.
+        return self._constructor_sliced(ser)
+
+    # ----------------------------------------------------------------------
+    # Constructors
+
+    def __init__(
+        self,
+        data=None,
+        index: Axes | None = None,
+        columns: Axes | None = None,
+        dtype: Dtype | None = None,
+        copy: bool | None = None,
+    ) -> None:
+        allow_mgr = False
+        if dtype is not None:
+            dtype = self._validate_dtype(dtype)
+
+        if isinstance(data, DataFrame):
+            data = data._mgr
+            allow_mgr = True
+            if not copy:
+                # if not copying data, ensure to still return a shallow copy
+                # to avoid the result sharing the same Manager
+                data = data.copy(deep=False)
+
+        if isinstance(data, BlockManager):
+            if not allow_mgr:
+                # GH#52419
+                warnings.warn(
+                    f"Passing a {type(data).__name__} to {type(self).__name__} "
+                    "is deprecated and will raise in a future version. "
+                    "Use public APIs instead.",
+                    Pandas4Warning,
+                    stacklevel=2,
+                )
+
+            data = data.copy(deep=False)
+            # first check if a Manager is passed without any other arguments
+            # -> use fastpath (without checking Manager type)
+            if index is None and columns is None and dtype is None and not copy:
+                # GH#33357 fastpath
+                NDFrame.__init__(self, data)
+                return
+
+        # GH47215
+        if isinstance(index, set):
+            raise ValueError("index cannot be a set")
+        if isinstance(columns, set):
+            raise ValueError("columns cannot be a set")
+
+        if copy is None:
+            if isinstance(data, dict):
+                # retain pre-GH#38939 default behavior
+                copy = True
+            elif not isinstance(data, (Index, DataFrame, Series)):
+                copy = True
+            else:
+                copy = False
+
+        if data is None:
+            index = index if index is not None else default_index(0)
+            columns = columns if columns is not None else default_index(0)
+            dtype = dtype if dtype is not None else pandas_dtype(object)
+            data = []
+
+        if isinstance(data, BlockManager):
+            mgr = self._init_mgr(
+                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
+            )
+
+        elif isinstance(data, dict):
+            # GH#38939 de facto copy defaults to False only in non-dict cases
+            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy)
+        elif isinstance(data, ma.MaskedArray):
+            from numpy.ma import mrecords
+
+            # masked recarray
+            if isinstance(data, mrecords.MaskedRecords):
+                raise TypeError(
+                    "MaskedRecords are not supported. Pass "
+                    "{name: data[name] for name in data.dtype.names} "
+                    "instead"
+                )
+
+            # a masked array
+            data = sanitize_masked_array(data)
+            mgr = ndarray_to_mgr(
+                data,
+                index,
+                columns,
+                dtype=dtype,
+                copy=copy,
+            )
+
+        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
+            if data.dtype.names:
+                # i.e. numpy structured array
+                data = cast(np.ndarray, data)
+                mgr = rec_array_to_mgr(
+                    data,
+                    index,
+                    columns,
+                    dtype,
+                    copy,
+                )
+            elif isinstance(data, (ABCSeries, ABCIndex)) and data.name is not None:
+                # i.e. Series/Index with non-None name
+                mgr = dict_to_mgr(
+                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
+                    # attribute "name"
+                    {data.name: data},
+                    index,
+                    columns,
+                    dtype=dtype,
+                    copy=copy,
+                )
+            else:
+                mgr = ndarray_to_mgr(
+                    data,
+                    index,
+                    columns,
+                    dtype=dtype,
+                    copy=copy,
+                )
+
+        # For data is list-like, or Iterable (will consume into list)
+        elif is_list_like(data):
+            if not isinstance(data, abc.Sequence):
+                if hasattr(data, "__array__"):
+                    # GH#44616 big perf improvement for e.g. pytorch tensor
+                    data = np.asarray(data)
+                else:
+                    data = list(data)
+            if len(data) > 0:
+                if is_dataclass(data[0]):
+                    data = dataclasses_to_dicts(data)
+                if not isinstance(data, np.ndarray) and treat_as_nested(data):
+                    # exclude ndarray as we may have cast it a few lines above
+                    if columns is not None:
+                        columns = ensure_index(columns)
+                    arrays, columns, index = nested_data_to_arrays(
+                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
+                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
+                        data,
+                        columns,
+                        index,  # type: ignore[arg-type]
+                        dtype,
+                    )
+                    mgr = arrays_to_mgr(
+                        arrays,
+                        columns,
+                        index,
+                        dtype=dtype,
+                    )
+                else:
+                    mgr = ndarray_to_mgr(
+                        data,
+                        index,
+                        columns,
+                        dtype=dtype,
+                        copy=copy,
+                    )
+            else:
+                mgr = dict_to_mgr(
+                    {},
+                    index,
+                    columns if columns is not None else default_index(0),
+                    dtype=dtype,
+                )
+        # For data is scalar
+        else:
+            if index is None or columns is None:
+                raise ValueError("DataFrame constructor not properly called!")
+
+            index = ensure_index(index)
+            columns = ensure_index(columns)
+
+            if not dtype:
+                dtype, _ = infer_dtype_from_scalar(data)
+
+            # For data is a scalar extension dtype
+            if isinstance(dtype, ExtensionDtype):
+                # TODO(EA2D): special case not needed with 2D EAs
+
+                values = [
+                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
+                    for _ in range(len(columns))
+                ]
+                mgr = arrays_to_mgr(values, columns, index, dtype=None)
+            else:
+                arr2d = construct_2d_arraylike_from_scalar(
+                    data,
+                    len(index),
+                    len(columns),
+                    dtype,
+                    copy,
+                )
+
+                mgr = ndarray_to_mgr(
+                    arr2d,
+                    index,
+                    columns,
+                    dtype=arr2d.dtype,
+                    copy=False,
+                )
+
+        NDFrame.__init__(self, mgr)
+
+    # ----------------------------------------------------------------------
+
+    def __dataframe__(
+        self, nan_as_null: bool = False, allow_copy: bool = True
+    ) -> DataFrameXchg:
+        """
+        Return the dataframe interchange object implementing the interchange protocol.
+
+        .. deprecated:: 3.0.0
+
+            The Dataframe Interchange Protocol is deprecated.
+            For dataframe-agnostic code, you may want to look into:
+
+            - `Arrow PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_
+            - `Narwhals <https://github.com/narwhals-dev/narwhals>`_
+
+        .. note::
+
+           For new development, we highly recommend using the Arrow C Data Interface
+           alongside the Arrow PyCapsule Interface instead of the interchange protocol
+
+        .. warning::
+
+            Due to severe implementation issues, we recommend only considering using the
+            interchange protocol in the following cases:
+
+            - converting to pandas: for pandas >= 2.0.3
+            - converting from pandas: for pandas >= 3.0.0
+
+        Parameters
+        ----------
+        nan_as_null : bool, default False
+            `nan_as_null` is DEPRECATED and has no effect. Please avoid using
+            it; it will be removed in a future release.
+        allow_copy : bool, default True
+            Whether to allow memory copying when exporting. If set to False
+            it would cause non-zero-copy exports to fail.
+
+        Returns
+        -------
+        DataFrame interchange object
+            The object which consuming library can use to ingress the dataframe.
+
+        See Also
+        --------
+        DataFrame.from_records : Constructor from tuples, also record arrays.
+        DataFrame.from_dict : From dicts of Series, arrays, or dicts.
+
+        Notes
+        -----
+        Details on the interchange protocol:
+        https://data-apis.org/dataframe-protocol/latest/index.html
+
+        Examples
+        --------
+        >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
+        >>> interchange_object.column_names()
+        Index(['A', 'B'], dtype='str')
+        >>> df_pandas = pd.api.interchange.from_dataframe(
+        ...     interchange_object.select_columns_by_name(["A"])
+        ... )
+        >>> df_pandas
+             A
+        0    1
+        1    2
+
+        These methods (``column_names``, ``select_columns_by_name``) should work
+        for any dataframe library which implements the interchange protocol.
+        """
+        warnings.warn(
+            "The Dataframe Interchange Protocol is deprecated.\n"
+            "For dataframe-agnostic code, you may want to look into:\n"
+            "- Arrow PyCapsule Interface: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html\n"
+            "- Narwhals: https://github.com/narwhals-dev/narwhals\n",
+            Pandas4Warning,
+            stacklevel=find_stack_level(),
+        )
+        from pandas.core.interchange.dataframe import PandasDataFrameXchg
+
+        return PandasDataFrameXchg(self, allow_copy=allow_copy)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas DataFrame as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas DataFrame to the Arrow
+        format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
+        in its handling of the index, i.e. store the index as a column except
+        for RangeIndex).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be casted, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if requested_schema is not None:
+            requested_schema = pa.Schema._import_from_c_capsule(requested_schema)
+        table = pa.Table.from_pandas(self, schema=requested_schema)
+        return table.__arrow_c_stream__()
+
+    # ----------------------------------------------------------------------
+
+    @property
+    def axes(self) -> list[Index]:
+        """
+        Return a list representing the axes of the DataFrame.
+
+        It has the row axis labels and column axis labels as the only members.
+        They are returned in that order.
+
+        See Also
+        --------
+        DataFrame.index: The index (row labels) of the DataFrame.
+        DataFrame.columns: The column labels of the DataFrame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df.axes
+        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], dtype='str')]
+        """
+        return [self.index, self.columns]
+
+    @property
+    def shape(self) -> tuple[int, int]:
+        """
+        Return a tuple representing the dimensionality of the DataFrame.
+
+        Unlike the `len()` method, which only returns the number of rows, `shape`
+        provides both row and column counts, making it a more informative method for
+        understanding dataset size.
+
+        See Also
+        --------
+        numpy.ndarray.shape : Tuple of array dimensions.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df.shape
+        (2, 2)
+
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]})
+        >>> df.shape
+        (2, 3)
+        """
+        return len(self.index), len(self.columns)
+
+    @property
+    def _is_homogeneous_type(self) -> bool:
+        """
+        Whether all the columns in a DataFrame have the same type.
+
+        Returns
+        -------
+        bool
+
+        Examples
+        --------
+        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
+        True
+        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
+        False
+
+        Items with the same type but different sizes are considered
+        different types.
+
+        >>> DataFrame(
+        ...     {
+        ...         "A": np.array([1, 2], dtype=np.int32),
+        ...         "B": np.array([1, 2], dtype=np.int64),
+        ...     }
+        ... )._is_homogeneous_type
+        False
+        """
+        # The "<" part of "<=" here is for empty DataFrame cases
+        return len({block.values.dtype for block in self._mgr.blocks}) <= 1
+
+    @property
+    def _can_fast_transpose(self) -> bool:
+        """
+        Can we transpose this DataFrame without creating any new array objects.
+        """
+        blocks = self._mgr.blocks
+        if len(blocks) != 1:
+            return False
+
+        dtype = blocks[0].dtype
+        # TODO(EA2D) special case would be unnecessary with 2D EAs
+        return not is_1d_only_ea_dtype(dtype)
+
+    @property
+    def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
+        """
+        Analogue to ._values that may return a 2D ExtensionArray.
+        """
+        mgr = self._mgr
+
+        blocks = mgr.blocks
+        if len(blocks) != 1:
+            return ensure_wrapped_if_datetimelike(self.values)
+
+        arr = blocks[0].values
+        if arr.ndim == 1:
+            # non-2D ExtensionArray
+            return self.values
+
+        # more generally, whatever we allow in NDArrayBackedExtensionBlock
+        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
+        return arr.T
+
+    # ----------------------------------------------------------------------
+    # Rendering Methods
+
+    def _repr_fits_vertical_(self) -> bool:
+        """
+        Check length against max_rows.
+        """
+        max_rows = get_option("display.max_rows")
+        return len(self) <= max_rows
+
+    def _repr_fits_horizontal_(self) -> bool:
+        """
+        Check if full repr fits in horizontal boundaries imposed by the display
+        options width and max_columns.
+        """
+        width, height = console.get_console_size()
+        max_columns = get_option("display.max_columns")
+        nb_columns = len(self.columns)
+
+        # exceed max columns
+        if (max_columns and nb_columns > max_columns) or (
+            width and nb_columns > (width // 2)
+        ):
+            return False
+
+        # used by repr_html under IPython notebook or scripts ignore terminal
+        # dims
+        if width is None or not console.in_interactive_session():
+            return True
+
+        if get_option("display.width") is not None or console.in_ipython_frontend():
+            # check at least the column row for excessive width
+            max_rows = 1
+        else:
+            max_rows = get_option("display.max_rows")
+
+        # when auto-detecting, so width=None and not in ipython front end
+        # check whether repr fits horizontal by actually checking
+        # the width of the rendered repr
+        buf = StringIO()
+
+        # only care about the stuff we'll actually print out
+        # and to_string on entire frame may be expensive
+        d = self
+
+        if max_rows is not None:  # unlimited rows
+            # min of two, where one may be None
+            d = d.iloc[: min(max_rows, len(d))]
+        else:
+            return True
+
+        d.to_string(buf=buf)
+        value = buf.getvalue()
+        repr_width = max(len(line) for line in value.split("\n"))
+
+        return repr_width < width
+
+    def _info_repr(self) -> bool:
+        """
+        True if the repr should show the info view.
+        """
+        info_repr_option = get_option("display.large_repr") == "info"
+        return info_repr_option and not (
+            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
+        )
+
+    def __repr__(self) -> str:
+        """
+        Return a string representation for a particular DataFrame.
+        """
+        if self._info_repr():
+            buf = StringIO()
+            self.info(buf=buf)
+            return buf.getvalue()
+
+        repr_params = fmt.get_dataframe_repr_params()
+        return self.to_string(**repr_params)
+
+    def _repr_html_(self) -> str | None:
+        """
+        Return a html representation for a particular DataFrame.
+
+        Mainly for IPython notebook.
+        """
+        if self._info_repr():
+            buf = StringIO()
+            self.info(buf=buf)
+            # need to escape the <class>, should be the first line.
+            val = buf.getvalue().replace("<", r"&lt;", 1)
+            val = val.replace(">", r"&gt;", 1)
+            return f"<pre>{val}</pre>"
+
+        if get_option("display.notebook_repr_html"):
+            max_rows = get_option("display.max_rows")
+            min_rows = get_option("display.min_rows")
+            max_cols = get_option("display.max_columns")
+            show_dimensions = get_option("display.show_dimensions")
+            show_floats = get_option("display.float_format")
+
+            formatter = fmt.DataFrameFormatter(
+                self,
+                columns=None,
+                col_space=None,
+                na_rep="NaN",
+                formatters=None,
+                float_format=show_floats,
+                sparsify=None,
+                justify=None,
+                index_names=True,
+                header=True,
+                index=True,
+                bold_rows=True,
+                escape=True,
+                max_rows=max_rows,
+                min_rows=min_rows,
+                max_cols=max_cols,
+                show_dimensions=show_dimensions,
+                decimal=".",
+            )
+            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
+        else:
+            return None
+
+    @overload
+    def to_string(
+        self,
+        buf: None = ...,
+        *,
+        columns: Axes | None = ...,
+        col_space: int | list[int] | dict[Hashable, int] | None = ...,
+        header: bool | SequenceNotStr[str] = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: fmt.FormattersType | None = ...,
+        float_format: fmt.FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool = ...,
+        decimal: str = ...,
+        line_width: int | None = ...,
+        min_rows: int | None = ...,
+        max_colwidth: int | None = ...,
+        encoding: str | None = ...,
+    ) -> str: ...
+
+    @overload
+    def to_string(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        columns: Axes | None = ...,
+        col_space: int | list[int] | dict[Hashable, int] | None = ...,
+        header: bool | SequenceNotStr[str] = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: fmt.FormattersType | None = ...,
+        float_format: fmt.FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool = ...,
+        decimal: str = ...,
+        line_width: int | None = ...,
+        min_rows: int | None = ...,
+        max_colwidth: int | None = ...,
+        encoding: str | None = ...,
+    ) -> None: ...
+
+    @Substitution(
+        header_type="bool or list of str",
+        header="Write out the column names. If a list of columns "
+        "is given, it is assumed to be aliases for the "
+        "column names",
+        col_space_type="int, list or dict of int",
+        col_space="The minimum width of each column. If a list of ints is given "
+        "every integers corresponds with one column. If a dict is given, the key "
+        "references the column, while the value defines the space to use.",
+    )
+    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+    def to_string(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        columns: Axes | None = None,
+        col_space: int | list[int] | dict[Hashable, int] | None = None,
+        header: bool | SequenceNotStr[str] = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters: fmt.FormattersType | None = None,
+        float_format: fmt.FloatFormatType | None = None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        justify: str | None = None,
+        max_rows: int | None = None,
+        max_cols: int | None = None,
+        show_dimensions: bool = False,
+        decimal: str = ".",
+        line_width: int | None = None,
+        min_rows: int | None = None,
+        max_colwidth: int | None = None,
+        encoding: str | None = None,
+    ) -> str | None:
+        """
+        Render a DataFrame to a console-friendly tabular output.
+        %(shared_params)s
+        line_width : int, optional
+            Width to wrap a line in characters.
+        min_rows : int, optional
+            The number of rows to display in the console in a truncated repr
+            (when number of rows is above `max_rows`).
+        max_colwidth : int, optional
+            Max width to truncate each column in characters. By default, no limit.
+        encoding : str, default "utf-8"
+            Set character encoding.
+        %(returns)s
+        See Also
+        --------
+        to_html : Convert DataFrame to HTML.
+
+        Examples
+        --------
+        >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]}
+        >>> df = pd.DataFrame(d)
+        >>> print(df.to_string())
+           col1  col2
+        0     1     4
+        1     2     5
+        2     3     6
+        """
+        from pandas import option_context
+
+        with option_context("display.max_colwidth", max_colwidth):
+            formatter = fmt.DataFrameFormatter(
+                self,
+                columns=columns,
+                col_space=col_space,
+                na_rep=na_rep,
+                formatters=formatters,
+                float_format=float_format,
+                sparsify=sparsify,
+                justify=justify,
+                index_names=index_names,
+                header=header,
+                index=index,
+                min_rows=min_rows,
+                max_rows=max_rows,
+                max_cols=max_cols,
+                show_dimensions=show_dimensions,
+                decimal=decimal,
+            )
+            return fmt.DataFrameRenderer(formatter).to_string(
+                buf=buf,
+                encoding=encoding,
+                line_width=line_width,
+            )
+
+    def _get_values_for_csv(
+        self,
+        *,
+        float_format: FloatFormatType | None,
+        date_format: str | None,
+        decimal: str,
+        na_rep: str,
+        quoting,  # int csv.QUOTE_FOO from stdlib
+    ) -> DataFrame:
+        # helper used by to_csv
+        mgr = self._mgr.get_values_for_csv(
+            float_format=float_format,
+            date_format=date_format,
+            decimal=decimal,
+            na_rep=na_rep,
+            quoting=quoting,
+        )
+        return self._constructor_from_mgr(mgr, axes=mgr.axes)
+
+    # ----------------------------------------------------------------------
+
+    @property
+    def style(self) -> Styler:
+        """
+        Returns a Styler object.
+
+        Contains methods for building a styled HTML representation of the DataFrame.
+
+        See Also
+        --------
+        io.formats.style.Styler : Helps style a DataFrame or Series according to the
+            data with HTML and CSS.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2, 3]})
+        >>> df.style  # doctest: +SKIP
+
+        Please see
+        `Table Visualization <../../user_guide/style.ipynb>`_ for more examples.
+        """
+        # Raise AttributeError so that inspect works even if jinja2 is not installed.
+        has_jinja2 = import_optional_dependency("jinja2", errors="ignore")
+        if not has_jinja2:
+            raise AttributeError("The '.style' accessor requires jinja2")
+
+        from pandas.io.formats.style import Styler
+
+        return Styler(self)
+
+    _shared_docs["items"] = r"""
+        Iterate over (column name, Series) pairs.
+
+        Iterates over the DataFrame columns, returning a tuple with
+        the column name and the content as a Series.
+
+        Yields
+        ------
+        label : object
+            The column names for the DataFrame being iterated over.
+        content : Series
+            The column entries belonging to each label, as a Series.
+
+        See Also
+        --------
+        DataFrame.iterrows : Iterate over DataFrame rows as
+            (index, Series) pairs.
+        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
+            of the values.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
+        ...                   'population': [1864, 22000, 80000]},
+        ...                   index=['panda', 'polar', 'koala'])
+        >>> df
+                species   population
+        panda   bear      1864
+        polar   bear      22000
+        koala   marsupial 80000
+        >>> for label, content in df.items():
+        ...     print(f'label: {label}')
+        ...     print(f'content: {content}', sep='\n')
+        ...
+        label: species
+        content:
+        panda         bear
+        polar         bear
+        koala    marsupial
+        Name: species, dtype: str
+        label: population
+        content:
+        panda     1864
+        polar    22000
+        koala    80000
+        Name: population, dtype: int64
+        """
+
+    def items(self) -> Iterable[tuple[Hashable, Series]]:
+        r"""
+        Iterate over (column name, Series) pairs.
+
+        Iterates over the DataFrame columns, returning a tuple with
+        the column name and the content as a Series.
+
+        Yields
+        ------
+        label : object
+            The column names for the DataFrame being iterated over.
+        content : Series
+            The column entries belonging to each label, as a Series.
+
+        See Also
+        --------
+        DataFrame.iterrows : Iterate over DataFrame rows as
+            (index, Series) pairs.
+        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
+            of the values.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "species": ["bear", "bear", "marsupial"],
+        ...         "population": [1864, 22000, 80000],
+        ...     },
+        ...     index=["panda", "polar", "koala"],
+        ... )
+        >>> df
+                species   population
+        panda   bear      1864
+        polar   bear      22000
+        koala   marsupial 80000
+        >>> for label, content in df.items():
+        ...     print(f"label: {label}")
+        ...     print(f"content: {content}", sep="\n")
+        label: species
+        content:
+        panda         bear
+        polar         bear
+        koala    marsupial
+        Name: species, dtype: str
+        label: population
+        content:
+        panda     1864
+        polar    22000
+        koala    80000
+        Name: population, dtype: int64
+        """
+        for i, k in enumerate(self.columns):
+            yield k, self._ixs(i, axis=1)
+
+    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
+        """
+        Iterate over DataFrame rows as (index, Series) pairs.
+
+        Yields
+        ------
+        index : label or tuple of label
+            The index of the row. A tuple for a `MultiIndex`.
+        data : Series
+            The data of the row as a Series.
+
+        See Also
+        --------
+        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
+        DataFrame.items : Iterate over (column name, Series) pairs.
+
+        Notes
+        -----
+        1. Because ``iterrows`` returns a Series for each row,
+           it does **not** preserve dtypes across the rows (dtypes are
+           preserved across columns for DataFrames).
+
+           To preserve dtypes while iterating over the rows, it is better
+           to use :meth:`itertuples` which returns namedtuples of the values
+           and which is generally faster than ``iterrows``.
+
+        2. You should **never modify** something you are iterating over.
+           This is not guaranteed to work in all cases. Depending on the
+           data types, the iterator returns a copy and not a view, and writing
+           to it will have no effect.
+
+        Examples
+        --------
+
+        >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"])
+        >>> row = next(df.iterrows())[1]
+        >>> row
+        int      1.0
+        float    1.5
+        Name: 0, dtype: float64
+        >>> print(row["int"].dtype)
+        float64
+        >>> print(df["int"].dtype)
+        int64
+        """
+        columns = self.columns
+        klass = self._constructor_sliced
+        for k, v in zip(self.index, self.values, strict=True):
+            s = klass(v, index=columns, name=k).__finalize__(self)
+            if self._mgr.is_single_block:
+                s._mgr.add_references(self._mgr)
+            yield k, s
+
+    def itertuples(
+        self, index: bool = True, name: str | None = "Pandas"
+    ) -> Iterable[tuple[Any, ...]]:
+        """
+        Iterate over DataFrame rows as namedtuples.
+
+        Parameters
+        ----------
+        index : bool, default True
+            If True, return the index as the first element of the tuple.
+        name : str or None, default "Pandas"
+            The name of the returned namedtuples or None to return regular
+            tuples.
+
+        Returns
+        -------
+        iterator
+            An object to iterate over namedtuples for each row in the
+            DataFrame with the first field possibly being the index and
+            following fields being the column values.
+
+        See Also
+        --------
+        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
+            pairs.
+        DataFrame.items : Iterate over (column name, Series) pairs.
+
+        Notes
+        -----
+        The column names will be renamed to positional names if they are
+        invalid Python identifiers, repeated, or start with an underscore.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"]
+        ... )
+        >>> df
+              num_legs  num_wings
+        dog          4          0
+        hawk         2          2
+        >>> for row in df.itertuples():
+        ...     print(row)
+        Pandas(Index='dog', num_legs=4, num_wings=0)
+        Pandas(Index='hawk', num_legs=2, num_wings=2)
+
+        By setting the `index` parameter to False we can remove the index
+        as the first element of the tuple:
+
+        >>> for row in df.itertuples(index=False):
+        ...     print(row)
+        Pandas(num_legs=4, num_wings=0)
+        Pandas(num_legs=2, num_wings=2)
+
+        With the `name` parameter set we set a custom name for the yielded
+        namedtuples:
+
+        >>> for row in df.itertuples(name="Animal"):
+        ...     print(row)
+        Animal(Index='dog', num_legs=4, num_wings=0)
+        Animal(Index='hawk', num_legs=2, num_wings=2)
+        """
+        arrays = []
+        fields = list(self.columns)
+        if index:
+            arrays.append(self.index)
+            fields.insert(0, "Index")
+
+        # use integer indexing because of possible duplicate column names
+        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
+
+        if name is not None:
+            # https://github.com/python/mypy/issues/9046
+            # error: namedtuple() expects a string literal as the first argument
+            itertuple = collections.namedtuple(  # type: ignore[misc]
+                name, fields, rename=True
+            )
+            return map(itertuple._make, zip(*arrays, strict=True))
+
+        # fallback to regular tuples
+        return zip(*arrays, strict=True)
+
+    def __len__(self) -> int:
+        """
+        Returns length of info axis, but here we use the index.
+        """
+        return len(self.index)
+
+    @overload
+    def dot(self, other: Series) -> Series: ...
+
+    @overload
+    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ...
+
+    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
+        """
+        Compute the matrix multiplication between the DataFrame and other.
+
+        This method computes the matrix product between the DataFrame and the
+        values of an other Series, DataFrame or a numpy array.
+
+        It can also be called using ``self @ other``.
+
+        Parameters
+        ----------
+        other : Series, DataFrame or array-like
+            The other object to compute the matrix product with.
+
+        Returns
+        -------
+        Series or DataFrame
+            If other is a Series, return the matrix product between self and
+            other as a Series. If other is a DataFrame or a numpy.array, return
+            the matrix product of self and other in a DataFrame of a np.array.
+
+        See Also
+        --------
+        Series.dot: Similar method for Series.
+
+        Notes
+        -----
+        The dimensions of DataFrame and other must be compatible in order to
+        compute the matrix multiplication. In addition, the column names of
+        DataFrame and the index of other must contain the same values, as they
+        will be aligned prior to the multiplication.
+
+        The dot method for Series computes the inner product, instead of the
+        matrix product here.
+
+        Examples
+        --------
+        Here we multiply a DataFrame with a Series.
+
+        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+        >>> s = pd.Series([1, 1, 2, 1])
+        >>> df.dot(s)
+        0    -4
+        1     5
+        dtype: int64
+
+        Here we multiply a DataFrame with another DataFrame.
+
+        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+        >>> df.dot(other)
+            0   1
+        0   1   4
+        1   2   2
+
+        Note that the dot method give the same result as @
+
+        >>> df @ other
+            0   1
+        0   1   4
+        1   2   2
+
+        The dot method works also if other is an np.array.
+
+        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+        >>> df.dot(arr)
+            0   1
+        0   1   4
+        1   2   2
+
+        Note how shuffling of the objects does not change the result.
+
+        >>> s2 = s.reindex([1, 0, 2, 3])
+        >>> df.dot(s2)
+        0    -4
+        1     5
+        dtype: int64
+        """
+        if isinstance(other, (Series, DataFrame)):
+            common = self.columns.union(other.index)
+            if len(common) > len(self.columns) or len(common) > len(other.index):
+                raise ValueError("matrices are not aligned")
+
+            left = self.reindex(columns=common)
+            right = other.reindex(index=common)
+            lvals = left.values
+            rvals = right._values
+        else:
+            left = self
+            lvals = self.values
+            rvals = np.asarray(other)
+            if lvals.shape[1] != rvals.shape[0]:
+                raise ValueError(
+                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
+                )
+
+        if isinstance(other, DataFrame):
+            common_type = find_common_type(list(self.dtypes) + list(other.dtypes))
+            return self._constructor(
+                np.dot(lvals, rvals),
+                index=left.index,
+                columns=other.columns,
+                copy=False,
+                dtype=common_type,
+            )
+        elif isinstance(other, Series):
+            common_type = find_common_type([*list(self.dtypes), other.dtypes])
+            return self._constructor_sliced(
+                np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type
+            )
+        elif isinstance(rvals, (np.ndarray, Index)):
+            result = np.dot(lvals, rvals)
+            if result.ndim == 2:
+                return self._constructor(result, index=left.index, copy=False)
+            else:
+                return self._constructor_sliced(result, index=left.index, copy=False)
+        else:  # pragma: no cover
+            raise TypeError(f"unsupported type: {type(other)}")
+
+    @overload
+    def __matmul__(self, other: Series) -> Series: ...
+
+    @overload
+    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ...
+
+    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
+        """
+        Matrix multiplication using binary `@` operator.
+        """
+        return self.dot(other)
+
+    def __rmatmul__(self, other) -> DataFrame:
+        """
+        Matrix multiplication using binary `@` operator.
+        """
+        try:
+            return self.T.dot(np.transpose(other)).T
+        except ValueError as err:
+            if "shape mismatch" not in str(err):
+                raise
+            # GH#21581 give exception message for original shapes
+            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
+            raise ValueError(msg) from err
+
+    # ----------------------------------------------------------------------
+    # IO methods (to / from other formats)
+
+    @classmethod
+    def from_arrow(
+        cls, data: ArrowArrayExportable | ArrowStreamExportable
+    ) -> DataFrame:
+        """
+        Construct a DataFrame from a tabular Arrow object.
+
+        This function accepts any Arrow-compatible tabular object implementing
+        the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
+        or ``__arrow_c_stream__`` method).
+
+        This function currently relies on ``pyarrow`` to convert the tabular
+        object in Arrow format to pandas.
+
+        .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+
+        .. versionadded:: 3.0
+
+        Parameters
+        ----------
+        data : pyarrow.Table or Arrow-compatible table
+            Any tabular object implementing the Arrow PyCapsule Protocol
+            (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
+            method).
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        Series.from_arrow : Construct a Series from an Arrow object.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
+        >>> pd.DataFrame.from_arrow(table)
+           a  b
+        0  1  x
+        1  2  y
+        2  3  z
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if not isinstance(data, pa.Table):
+            if not (
+                hasattr(data, "__arrow_c_array__")
+                or hasattr(data, "__arrow_c_stream__")
+            ):
+                # explicitly test this, because otherwise we would accept variour other
+                # input types through the pa.table(..) call
+                raise TypeError(
+                    "Expected an Arrow-compatible tabular object (i.e. having an "
+                    "'_arrow_c_array__' or '__arrow_c_stream__' method), got "
+                    f"'{type(data).__name__}' instead."
+                )
+            pa_table = pa.table(data)
+        else:
+            pa_table = data
+
+        df = pa_table.to_pandas()
+        return df
+
+    @classmethod
+    def from_dict(
+        cls,
+        data: dict,
+        orient: FromDictOrient = "columns",
+        dtype: Dtype | None = None,
+        columns: Axes | None = None,
+    ) -> DataFrame:
+        """
+        Construct DataFrame from dict of array-like or dicts.
+
+        Creates DataFrame object from dictionary by columns or by index
+        allowing dtype specification.
+
+        Parameters
+        ----------
+        data : dict
+            Of the form {field : array-like} or {field : dict}.
+        orient : {'columns', 'index', 'tight'}, default 'columns'
+            The "orientation" of the data. If the keys of the passed dict
+            should be the columns of the resulting DataFrame, pass 'columns'
+            (default). Otherwise if the keys should be rows, pass 'index'.
+            If 'tight', assume a dict with keys ['index', 'columns', 'data',
+            'index_names', 'column_names'].
+
+        dtype : dtype, default None
+            Data type to force after DataFrame construction, otherwise infer.
+        columns : list, default None
+            Column labels to use when ``orient='index'``. Raises a ValueError
+            if used with ``orient='columns'`` or ``orient='tight'``.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_records : DataFrame from structured ndarray, sequence
+            of tuples or dicts, or DataFrame.
+        DataFrame : DataFrame object creation using constructor.
+        DataFrame.to_dict : Convert the DataFrame to a dictionary.
+
+        Examples
+        --------
+        By default the keys of the dict become the DataFrame columns:
+
+        >>> data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
+        >>> pd.DataFrame.from_dict(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Specify ``orient='index'`` to create the DataFrame using dictionary
+        keys as rows:
+
+        >>> data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}
+        >>> pd.DataFrame.from_dict(data, orient="index")
+               0  1  2  3
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        When using the 'index' orientation, the column names can be
+        specified manually:
+
+        >>> pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
+               A  B  C  D
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
+        format:
+
+        >>> data = {
+        ...     "index": [("a", "b"), ("a", "c")],
+        ...     "columns": [("x", 1), ("y", 2)],
+        ...     "data": [[1, 3], [2, 4]],
+        ...     "index_names": ["n1", "n2"],
+        ...     "column_names": ["z1", "z2"],
+        ... }
+        >>> pd.DataFrame.from_dict(data, orient="tight")
+        z1     x  y
+        z2     1  2
+        n1 n2
+        a  b   1  3
+           c   2  4
+        """
+        index: list | Index | None = None
+        orient = orient.lower()  # type: ignore[assignment]
+        if orient == "index":
+            if len(data) > 0:
+                # TODO speed up Series case
+                if isinstance(next(iter(data.values())), (Series, dict)):
+                    data = _from_nested_dict(data)
+                else:
+                    index = list(data.keys())
+                    # error: Incompatible types in assignment (expression has type
+                    # "List[Any]", variable has type "Dict[Any, Any]")
+                    data = list(data.values())  # type: ignore[assignment]
+        elif orient in ("columns", "tight"):
+            if columns is not None:
+                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
+        else:  # pragma: no cover
+            raise ValueError(
+                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
+                f"Got '{orient}' instead"
+            )
+
+        if orient != "tight":
+            return cls(data, index=index, columns=columns, dtype=dtype)
+        else:
+            realdata = data["data"]
+
+            def create_index(indexlist, namelist) -> Index:
+                index: Index
+                if len(namelist) > 1:
+                    index = MultiIndex.from_tuples(indexlist, names=namelist)
+                else:
+                    index = Index(indexlist, name=namelist[0])
+                return index
+
+            index = create_index(data["index"], data["index_names"])
+            columns = create_index(data["columns"], data["column_names"])
+            return cls(realdata, index=index, columns=columns, dtype=dtype)
+
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value: object = lib.no_default,
+    ) -> np.ndarray:
+        """
+        Convert the DataFrame to a NumPy array.
+
+        By default, the dtype of the returned array will be the common NumPy
+        dtype of all types in the DataFrame. For example, if the dtypes are
+        ``float16`` and ``float32``, the results dtype will be ``float32``.
+        This may require copying data and coercing values, which may be
+        expensive.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :meth:`numpy.asarray`.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that
+            a copy is made, even if not strictly necessary.
+        na_value : Any, optional
+            The value to use for missing values. The default value depends
+            on `dtype` and the dtypes of the DataFrame columns.
+
+        Returns
+        -------
+        numpy.ndarray
+            The NumPy array representing the values in the DataFrame.
+
+        See Also
+        --------
+        Series.to_numpy : Similar method for Series.
+
+        Examples
+        --------
+        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
+        array([[1, 3],
+               [2, 4]])
+
+        With heterogeneous data, the lowest common type will have to
+        be used.
+
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
+        >>> df.to_numpy()
+        array([[1. , 3. ],
+               [2. , 4.5]])
+
+        For a mix of numeric and non-numeric types, the output array will
+        have object dtype.
+
+        >>> df["C"] = pd.date_range("2000", periods=2)
+        >>> df.to_numpy()
+        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
+               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
+        """
+        if dtype is not None:
+            dtype = np.dtype(dtype)
+        result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
+        if result.dtype is not dtype:
+            result = np.asarray(result, dtype=dtype)
+
+        return result
+
+    @overload
+    def to_dict(
+        self,
+        orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
+        *,
+        into: type[MutableMappingT] | MutableMappingT,
+        index: bool = ...,
+    ) -> MutableMappingT: ...
+
+    @overload
+    def to_dict(
+        self,
+        orient: Literal["records"],
+        *,
+        into: type[MutableMappingT] | MutableMappingT,
+        index: bool = ...,
+    ) -> list[MutableMappingT]: ...
+
+    @overload
+    def to_dict(
+        self,
+        orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
+        *,
+        into: type[dict] = ...,
+        index: bool = ...,
+    ) -> dict: ...
+
+    @overload
+    def to_dict(
+        self,
+        orient: Literal["records"],
+        *,
+        into: type[dict] = ...,
+        index: bool = ...,
+    ) -> list[dict]: ...
+
+    # error: Incompatible default for argument "into" (default has type "type
+    # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
+    def to_dict(
+        self,
+        orient: Literal[
+            "dict", "list", "series", "split", "tight", "records", "index"
+        ] = "dict",
+        *,
+        into: type[MutableMappingT] | MutableMappingT = dict,  # type: ignore[assignment]
+        index: bool = True,
+    ) -> MutableMappingT | list[MutableMappingT]:
+        """
+        Convert the DataFrame to a dictionary.
+
+        The type of the key-value pairs can be customized with the parameters
+        (see below).
+
+        Parameters
+        ----------
+        orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
+            Determines the type of the values of the dictionary.
+
+            - 'dict' (default) : dict like {column -> {index -> value}}
+            - 'list' : dict like {column -> [values]}
+            - 'series' : dict like {column -> Series(values)}
+            - 'split' : dict like
+              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
+            - 'tight' : dict like
+              {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
+              'index_names' -> [index.names], 'column_names' -> [column.names]}
+            - 'records' : list like
+              [{column -> value}, ... , {column -> value}]
+            - 'index' : dict like {index -> {column -> value}}
+
+        into : class, default dict
+            The collections.abc.MutableMapping subclass used for all Mappings
+            in the return value.  Can be the actual class or an empty
+            instance of the mapping type you want.  If you want a
+            collections.defaultdict, you must pass it initialized.
+
+        index : bool, default True
+            Whether to include the index item (and index_names item if `orient`
+            is 'tight') in the returned dictionary. Can only be ``False``
+            when `orient` is 'split' or 'tight'. Note that when `orient` is
+            'records', this parameter does not take effect (index item always
+            not included).
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        dict, list or collections.abc.MutableMapping
+            Return a collections.abc.MutableMapping object representing the
+            DataFrame. The resulting transformation depends on the `orient`
+            parameter.
+
+        See Also
+        --------
+        DataFrame.from_dict: Create a DataFrame from a dictionary.
+        DataFrame.to_json: Convert a DataFrame to JSON format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"]
+        ... )
+        >>> df
+              col1  col2
+        row1     1  0.50
+        row2     2  0.75
+        >>> df.to_dict()
+        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+        You can specify the return orientation.
+
+        >>> df.to_dict("series")
+        {'col1': row1    1
+                 row2    2
+        Name: col1, dtype: int64,
+        'col2': row1    0.50
+                row2    0.75
+        Name: col2, dtype: float64}
+
+        >>> df.to_dict("split")
+        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+         'data': [[1, 0.5], [2, 0.75]]}
+
+        >>> df.to_dict("records")
+        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+        >>> df.to_dict("index")
+        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+        >>> df.to_dict("tight")
+        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+         'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
+
+        You can also specify the mapping type.
+
+        >>> from collections import OrderedDict, defaultdict
+        >>> df.to_dict(into=OrderedDict)
+        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+        If you want a `defaultdict`, you need to initialize it:
+
+        >>> dd = defaultdict(list)
+        >>> df.to_dict("records", into=dd)
+        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+        """
+        from pandas.core.methods.to_dict import to_dict
+
+        return to_dict(self, orient, into=into, index=index)
+
+    @classmethod
+    def from_records(
+        cls,
+        data,
+        index=None,
+        exclude=None,
+        columns=None,
+        coerce_float: bool = False,
+        nrows: int | None = None,
+    ) -> DataFrame:
+        """
+        Convert structured or record ndarray to DataFrame.
+
+        Creates a DataFrame object from a structured ndarray, or iterable of
+        tuples or dicts.
+
+        Parameters
+        ----------
+        data : structured ndarray, iterable of tuples or dicts
+            Structured input data.
+        index : str, list of fields, array-like
+            Field of array to use as the index, alternately a specific set of
+            input labels to use.
+        exclude : sequence, default None
+            Columns or fields to exclude.
+        columns : sequence, default None
+            Column names to use. If the passed data do not have names
+            associated with them, this argument provides names for the
+            columns. Otherwise, this argument indicates the order of the columns
+            in the result (any names not found in the data will become all-NA
+            columns) and limits the data to these columns if not all column names
+            are provided.
+        coerce_float : bool, default False
+            Attempt to convert values of non-string, non-numeric objects (like
+            decimal.Decimal) to floating point, useful for SQL result sets.
+        nrows : int, default None
+            Number of rows to read if data is an iterator.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_dict : DataFrame from dict of array-like or dicts.
+        DataFrame : DataFrame object creation using constructor.
+
+        Examples
+        --------
+        Data can be provided as a structured ndarray:
+
+        >>> data = np.array(
+        ...     [(3, "a"), (2, "b"), (1, "c"), (0, "d")],
+        ...     dtype=[("col_1", "i4"), ("col_2", "U1")],
+        ... )
+        >>> pd.DataFrame.from_records(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Data can be provided as a list of dicts:
+
+        >>> data = [
+        ...     {"col_1": 3, "col_2": "a"},
+        ...     {"col_1": 2, "col_2": "b"},
+        ...     {"col_1": 1, "col_2": "c"},
+        ...     {"col_1": 0, "col_2": "d"},
+        ... ]
+        >>> pd.DataFrame.from_records(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Data can be provided as a list of tuples with corresponding columns:
+
+        >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")]
+        >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"])
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+        """
+        if isinstance(data, DataFrame):
+            raise TypeError(
+                "Passing a DataFrame to DataFrame.from_records is not supported. Use "
+                "set_index and/or drop to modify the DataFrame instead.",
+            )
+
+        result_index = None
+
+        # Make a copy of the input columns so we can modify it
+        if columns is not None:
+            columns = ensure_index(columns)
+
+        def maybe_reorder(
+            arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
+        ) -> tuple[list[ArrayLike], Index, Index | None]:
+            """
+            If our desired 'columns' do not match the data's pre-existing 'arr_columns',
+            we re-order our arrays.  This is like a preemptive (cheap) reindex.
+            """
+            if len(arrays):
+                length = len(arrays[0])
+            else:
+                length = 0
+
+            result_index = None
+            if len(arrays) == 0 and index is None and length == 0:
+                result_index = default_index(0)
+
+            arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
+            return arrays, arr_columns, result_index
+
+        if is_iterator(data):
+            if nrows == 0:
+                return cls(index=index, columns=columns)
+
+            try:
+                first_row = next(data)
+            except StopIteration:
+                return cls(index=index, columns=columns)
+
+            dtype = None
+            if hasattr(first_row, "dtype") and first_row.dtype.names:
+                dtype = first_row.dtype
+
+            values = [first_row]
+
+            if nrows is None:
+                values += data
+            else:
+                values.extend(itertools.islice(data, nrows - 1))
+
+            if dtype is not None:
+                data = np.array(values, dtype=dtype)
+            else:
+                data = values
+
+        if isinstance(data, dict):
+            if columns is None:
+                columns = arr_columns = ensure_index(sorted(data))
+                arrays = [data[k] for k in columns]
+            else:
+                arrays = []
+                arr_columns_list = []
+                for k, v in data.items():
+                    if k in columns:
+                        arr_columns_list.append(k)
+                        arrays.append(v)
+
+                arr_columns = Index(arr_columns_list)
+                arrays, arr_columns, result_index = maybe_reorder(
+                    arrays, arr_columns, columns, index
+                )
+
+        elif isinstance(data, np.ndarray):
+            arrays, columns = to_arrays(data, columns)
+            arr_columns = columns
+        else:
+            arrays, arr_columns = to_arrays(data, columns)
+            if coerce_float:
+                for i, arr in enumerate(arrays):
+                    if arr.dtype == object:
+                        # error: Argument 1 to "maybe_convert_objects" has
+                        # incompatible type "Union[ExtensionArray, ndarray]";
+                        # expected "ndarray"
+                        arrays[i] = lib.maybe_convert_objects(
+                            arr,  # type: ignore[arg-type]
+                            try_float=True,
+                        )
+
+            arr_columns = ensure_index(arr_columns)
+            if columns is None:
+                columns = arr_columns
+            else:
+                arrays, arr_columns, result_index = maybe_reorder(
+                    arrays, arr_columns, columns, index
+                )
+
+        if exclude is None:
+            exclude = set()
+        else:
+            exclude = set(exclude)
+
+        if index is not None:
+            if isinstance(index, str) or not hasattr(index, "__iter__"):
+                i = columns.get_loc(index)
+                exclude.add(index)
+                if len(arrays) > 0:
+                    result_index = Index(arrays[i], name=index)
+                else:
+                    result_index = Index([], name=index)
+            else:
+                try:
+                    index_data = [arrays[arr_columns.get_loc(field)] for field in index]
+                except (KeyError, TypeError):
+                    # raised by get_loc, see GH#29258
+                    result_index = index
+                else:
+                    result_index = ensure_index_from_sequences(index_data, names=index)
+                    exclude.update(index)
+
+        if any(exclude):
+            arr_exclude = (x for x in exclude if x in arr_columns)
+            to_remove = {arr_columns.get_loc(col) for col in arr_exclude}  # pyright: ignore[reportUnhashable]
+            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
+
+            columns = columns.drop(exclude)
+
+        mgr = arrays_to_mgr(arrays, columns, result_index)
+        df = DataFrame._from_mgr(mgr, axes=mgr.axes)
+        if cls is not DataFrame:
+            return cls(df, copy=False)
+        return df
+
+    def to_records(
+        self, index: bool = True, column_dtypes=None, index_dtypes=None
+    ) -> np.rec.recarray:
+        """
+        Convert DataFrame to a NumPy record array.
+
+        Index will be included as the first field of the record array if
+        requested.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Include index in resulting record array, stored in 'index'
+            field or using the index label, if set.
+        column_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all columns. If
+            a dictionary, a mapping of column names and indices (zero-indexed)
+            to specific data types.
+        index_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all index levels. If
+            a dictionary, a mapping of index level names and indices
+            (zero-indexed) to specific data types.
+
+            This mapping is applied only if `index=True`.
+
+        Returns
+        -------
+        numpy.rec.recarray
+            NumPy ndarray with the DataFrame labels as fields and each row
+            of the DataFrame as entries.
+
+        See Also
+        --------
+        DataFrame.from_records: Convert structured or record ndarray
+            to DataFrame.
+        numpy.rec.recarray: An ndarray that allows field access using
+            attributes, analogous to typed columns in a
+            spreadsheet.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])
+        >>> df
+           A     B
+        a  1  0.50
+        b  2  0.75
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        If the DataFrame index has no label then the recarray field name
+        is set to 'index'. If the index has a label then this is used as the
+        field name:
+
+        >>> df.index = df.index.rename("I")
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The index can be excluded from the record array:
+
+        >>> df.to_records(index=False)
+        rec.array([(1, 0.5 ), (2, 0.75)],
+                  dtype=[('A', '<i8'), ('B', '<f8')])
+
+        Data types can be specified for the columns:
+
+        >>> df.to_records(column_dtypes={"A": "int32"})
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+        As well as for the index:
+
+        >>> df.to_records(index_dtypes="<S2")
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+        >>> index_dtypes = f"<S{df.index.str.len().max()}"
+        >>> df.to_records(index_dtypes=index_dtypes)
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+        """
+        if index:
+            ix_vals = [
+                np.asarray(self.index.get_level_values(i))
+                for i in range(self.index.nlevels)
+            ]
+
+            arrays = ix_vals + [
+                np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
+            ]
+
+            index_names = list(self.index.names)
+
+            if isinstance(self.index, MultiIndex):
+                index_names = com.fill_missing_names(index_names)
+            elif index_names[0] is None:
+                index_names = ["index"]
+
+            names = [str(name) for name in itertools.chain(index_names, self.columns)]
+        else:
+            arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
+            names = [str(c) for c in self.columns]
+            index_names = []
+
+        index_len = len(index_names)
+        formats = []
+
+        for i, v in enumerate(arrays):
+            index_int = i
+
+            # When the names and arrays are collected, we
+            # first collect those in the DataFrame's index,
+            # followed by those in its columns.
+            #
+            # Thus, the total length of the array is:
+            # len(index_names) + len(DataFrame.columns).
+            #
+            # This check allows us to see whether we are
+            # handling a name / array in the index or column.
+            if index_int < index_len:
+                dtype_mapping = index_dtypes
+                name = index_names[index_int]
+            else:
+                index_int -= index_len
+                dtype_mapping = column_dtypes
+                name = self.columns[index_int]
+
+            # We have a dictionary, so we get the data type
+            # associated with the index or column (which can
+            # be denoted by its name in the DataFrame or its
+            # position in DataFrame's array of indices or
+            # columns, whichever is applicable.
+            if is_dict_like(dtype_mapping):
+                if name in dtype_mapping:
+                    dtype_mapping = dtype_mapping[name]
+                elif index_int in dtype_mapping:
+                    dtype_mapping = dtype_mapping[index_int]
+                else:
+                    dtype_mapping = None
+
+            # If no mapping can be found, use the array's
+            # dtype attribute for formatting.
+            #
+            # A valid dtype must either be a type or
+            # string naming a type.
+            if dtype_mapping is None:
+                formats.append(v.dtype)
+            elif isinstance(dtype_mapping, (type, np.dtype, str)):
+                # error: Argument 1 to "append" of "list" has incompatible
+                # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
+                formats.append(dtype_mapping)  # type: ignore[arg-type]
+            else:
+                element = "row" if i < index_len else "column"
+                msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
+                raise ValueError(msg)
+
+        return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
+
+    @classmethod
+    def _from_arrays(
+        cls,
+        arrays,
+        columns,
+        index,
+        dtype: Dtype | None = None,
+        verify_integrity: bool = True,
+    ) -> Self:
+        """
+        Create DataFrame from a list of arrays corresponding to the columns.
+
+        Parameters
+        ----------
+        arrays : list-like of arrays
+            Each array in the list corresponds to one column, in order.
+        columns : list-like, Index
+            The column names for the resulting DataFrame.
+        index : list-like, Index
+            The rows labels for the resulting DataFrame.
+        dtype : dtype, optional
+            Optional dtype to enforce for all arrays.
+        verify_integrity : bool, default True
+            Validate and homogenize all input. If set to False, it is assumed
+            that all elements of `arrays` are actual arrays how they will be
+            stored in a block (numpy ndarray or ExtensionArray), have the same
+            length as and are aligned with the index, and that `columns` and
+            `index` are ensured to be an Index object.
+
+        Returns
+        -------
+        DataFrame
+        """
+        if dtype is not None:
+            dtype = pandas_dtype(dtype)
+
+        columns = ensure_index(columns)
+        if len(columns) != len(arrays):
+            raise ValueError("len(columns) must match len(arrays)")
+        mgr = arrays_to_mgr(
+            arrays,
+            columns,
+            index,
+            dtype=dtype,
+            verify_integrity=verify_integrity,
+        )
+        return cls._from_mgr(mgr, axes=mgr.axes)
+
+    def to_stata(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        convert_dates: dict[Hashable, str] | None = None,
+        write_index: bool = True,
+        byteorder: ToStataByteorder | None = None,
+        time_stamp: datetime.datetime | None = None,
+        data_label: str | None = None,
+        variable_labels: dict[Hashable, str] | None = None,
+        version: int | None = 114,
+        convert_strl: Sequence[Hashable] | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
+    ) -> None:
+        """
+        Export DataFrame object to Stata dta format.
+
+        Writes the DataFrame to a Stata dataset file.
+        "dta" files contain a Stata dataset.
+
+        Parameters
+        ----------
+        path : str, path object, or buffer
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function.
+
+        convert_dates : dict
+            Dictionary mapping columns containing datetime types to stata
+            internal format to use when writing the dates. Options are 'tc',
+            'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
+            or a name. Datetime columns that do not have a conversion type
+            specified will be converted to 'tc'. Raises NotImplementedError if
+            a datetime column has timezone information.
+        write_index : bool
+            Write the index to Stata dataset.
+        byteorder : str
+            Can be ">", "<", "little", or "big". default is `sys.byteorder`.
+        time_stamp : datetime
+            A datetime to use as file creation date.  Default is the current
+            time.
+        data_label : str, optional
+            A label for the data set.  Must be 80 characters or smaller.
+        variable_labels : dict
+            Dictionary containing columns as keys and variable labels as
+            values. Each label must be 80 characters or smaller.
+        version : {{114, 117, 118, 119, None}}, default 114
+            Version to use in the output dta file. Set to None to let pandas
+            decide between 118 or 119 formats depending on the number of
+            columns in the frame. Version 114 can be read by Stata 10 and
+            later. Version 117 can be read by Stata 13 or later. Version 118
+            is supported in Stata 14 and later. Version 119 is supported in
+            Stata 15 and later. Version 114 limits string variables to 244
+            characters or fewer while versions 117 and later allow strings
+            with lengths up to 2,000,000 characters. Versions 118 and 119
+            support Unicode characters, and version 119 supports more than
+            32,767 variables.
+
+            Version 119 should usually only be used when the number of
+            variables exceeds the capacity of dta format 118. Exporting
+            smaller datasets in format 119 may have unintended consequences,
+            and, as of November 2020, Stata SE cannot read version 119 files.
+
+        convert_strl : list, optional
+            List of column names to convert to string columns to Stata StrL
+            format. Only available if version is 117.  Storing strings in the
+            StrL format can produce smaller dta files if strings have more than
+            8 characters and values are repeated.
+
+        compression : str or dict, default 'infer'
+            For on-the-fly compression of the output data. If 'infer' and 'path' is
+            path-like, then detect compression from the following extensions: '.gz',
+            '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+            (otherwise no compression).
+            Set to ``None`` for no compression.
+            Can also be a dict with key ``'method'`` set to one of
+            {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+            other key-value pairs are forwarded to
+            ``zipfile.ZipFile``, ``gzip.GzipFile``,
+            ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+            ``tarfile.TarFile``, respectively.
+            As an example, the following could be passed for faster compression and
+            to create a reproducible gzip archive:
+            ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        value_labels : dict of dicts
+            Dictionary containing columns as keys and dictionaries of column value
+            to labels as values. Labels for a single variable must be 32,000
+            characters or smaller.
+
+        Raises
+        ------
+        NotImplementedError
+            * If datetimes contain timezone information
+            * Column dtype is not representable in Stata
+        ValueError
+            * Columns listed in convert_dates are neither datetime64[ns]
+              or datetime.datetime
+            * Column listed in convert_dates is not in DataFrame
+            * Categorical label contains more than 32,000 characters
+
+        See Also
+        --------
+        read_stata : Import Stata data files.
+        io.stata.StataWriter : Low-level writer for Stata data files.
+        io.stata.StataWriter117 : Low-level writer for version 117 files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [["falcon", 350], ["parrot", 18]], columns=["animal", "parrot"]
+        ... )
+        >>> df.to_stata("animals.dta")  # doctest: +SKIP
+        """
+        if version not in (114, 117, 118, 119, None):
+            raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
+        if version == 114:
+            if convert_strl is not None:
+                raise ValueError("strl is not supported in format 114")
+            from pandas.io.stata import StataWriter as statawriter
+        elif version == 117:
+            # Incompatible import of "statawriter" (imported name has type
+            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
+            from pandas.io.stata import (  # type: ignore[assignment]
+                StataWriter117 as statawriter,
+            )
+        else:  # versions 118 and 119
+            # Incompatible import of "statawriter" (imported name has type
+            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
+            from pandas.io.stata import (  # type: ignore[assignment]
+                StataWriterUTF8 as statawriter,
+            )
+
+        kwargs: dict[str, Any] = {}
+        if version is None or version >= 117:
+            # strl conversion is only supported >= 117
+            kwargs["convert_strl"] = convert_strl
+        if version is None or version >= 118:
+            # Specifying the version is only supported for UTF8 (118 or 119)
+            kwargs["version"] = version
+
+        writer = statawriter(
+            path,
+            self,
+            convert_dates=convert_dates,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            write_index=write_index,
+            variable_labels=variable_labels,
+            compression=compression,
+            storage_options=storage_options,
+            value_labels=value_labels,
+            **kwargs,
+        )
+        writer.write_file()
+
+    def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
+        """
+        Write a DataFrame to the binary Feather format.
+
+        Parameters
+        ----------
+        path : str, path object, file-like object
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. If a string or a path,
+            it will be used as Root Directory path when writing a partitioned dataset.
+        **kwargs :
+            Additional keywords passed to :func:`pyarrow.feather.write_feather`.
+            This includes the `compression`, `compression_level`, `chunksize`
+            and `version` keywords.
+
+        See Also
+        --------
+        DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+        DataFrame.to_excel : Write object to an Excel sheet.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_json : Convert the object to a JSON string.
+        DataFrame.to_html : Render a DataFrame as an HTML table.
+        DataFrame.to_string : Convert DataFrame to a string.
+
+        Notes
+        -----
+        This function writes the dataframe as a `feather file
+        <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
+        index. For saving the DataFrame with your custom index use a method that
+        supports custom indices e.g. `to_parquet`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+        >>> df.to_feather("file.feather")  # doctest: +SKIP
+        """
+        from pandas.io.feather_format import to_feather
+
+        to_feather(self, path, **kwargs)
+
+    @overload
+    def to_markdown(
+        self,
+        buf: None = ...,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> None: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str] | None,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str | None: ...
+
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        mode: str = "wt",
+        index: bool = True,
+        storage_options: StorageOptions | None = None,
+        **kwargs,
+    ) -> str | None:
+        """
+        Print DataFrame in Markdown-friendly format.
+
+        Parameters
+        ----------
+        buf : str, Path or StringIO-like, optional, default None
+            Buffer to write to. If None, the output is returned as a string.
+        mode : str, optional
+            Mode in which file is opened, "wt" by default.
+        index : bool, optional, default True
+            Add index (row) labels.
+
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        **kwargs
+            These parameters will be passed to `tabulate <https://pypi.org/project/tabulate>`_.
+
+        Returns
+        -------
+        str
+            DataFrame in Markdown-friendly format.
+
+        See Also
+        --------
+        DataFrame.to_html : Render DataFrame to HTML-formatted table.
+        DataFrame.to_latex : Render DataFrame to LaTeX-formatted table.
+
+        Notes
+        -----
+        Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
+        ... )
+        >>> print(df.to_markdown())
+        |    | animal_1   | animal_2   |
+        |---:|:-----------|:-----------|
+        |  0 | elk        | dog        |
+        |  1 | pig        | quetzal    |
+
+        Output markdown with a tabulate option.
+
+        >>> print(df.to_markdown(tablefmt="grid"))
+        +----+------------+------------+
+        |    | animal_1   | animal_2   |
+        +====+============+============+
+        |  0 | elk        | dog        |
+        +----+------------+------------+
+        |  1 | pig        | quetzal    |
+        +----+------------+------------+
+        """
+        if "showindex" in kwargs:
+            raise ValueError("Pass 'index' instead of 'showindex")
+
+        kwargs.setdefault("headers", "keys")
+        kwargs.setdefault("tablefmt", "pipe")
+        kwargs.setdefault("showindex", index)
+        tabulate = import_optional_dependency("tabulate")
+        result = tabulate.tabulate(self, **kwargs)
+        if buf is None:
+            return result
+
+        with get_handle(buf, mode, storage_options=storage_options) as handles:
+            handles.handle.write(result)
+        return None
+
+    @overload
+    def to_parquet(
+        self,
+        path: None = ...,
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
+        compression: ParquetCompressionOptions = ...,
+        index: bool | None = ...,
+        partition_cols: list[str] | None = ...,
+        storage_options: StorageOptions = ...,
+        filesystem: Any = ...,
+        **kwargs,
+    ) -> bytes: ...
+
+    @overload
+    def to_parquet(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
+        compression: ParquetCompressionOptions = ...,
+        index: bool | None = ...,
+        partition_cols: list[str] | None = ...,
+        storage_options: StorageOptions = ...,
+        filesystem: Any = ...,
+        **kwargs,
+    ) -> None: ...
+
+    def to_parquet(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
+        compression: ParquetCompressionOptions = "snappy",
+        index: bool | None = None,
+        partition_cols: list[str] | None = None,
+        storage_options: StorageOptions | None = None,
+        filesystem: Any = None,
+        **kwargs,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the binary parquet format.
+
+        This function writes the dataframe as a `parquet file
+        <https://parquet.apache.org/>`_. You can choose different parquet
+        backends, and have the option of compression. See
+        :ref:`the user guide <io.parquet>` for more details.
+
+        Parameters
+        ----------
+        path : str, path object, file-like object, or None, default None
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. If None, the result is
+            returned as bytes. If a string or path, it will be used as Root Directory
+            path when writing a partitioned dataset.
+        engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+            Parquet library to use. If 'auto', then the option
+            ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+            behavior is to try 'pyarrow', falling back to 'fastparquet' if
+            'pyarrow' is unavailable.
+        compression : str or None, default 'snappy'
+            Name of the compression to use. Use ``None`` for no compression.
+            Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
+        index : bool, default None
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``True`` the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        partition_cols : list, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            Must be None if path is not a string.
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+        filesystem : fsspec or pyarrow filesystem, default None
+            Filesystem object to use when reading the parquet file. Only implemented
+            for ``engine="pyarrow"``.
+
+            .. versionadded:: 2.1.0
+
+        **kwargs
+            Additional arguments passed to the parquet library. See
+            :ref:`pandas io <io.parquet>` for more details.
+
+        Returns
+        -------
+        bytes if no path argument is provided else None
+            Returns the DataFrame converted to the binary parquet format as bytes if no
+            path argument. Returns None and writes the DataFrame to the specified
+            location in the Parquet format if the path argument is provided.
+
+        See Also
+        --------
+        read_parquet : Read a parquet file.
+        DataFrame.to_orc : Write an orc file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        * This function requires either the `fastparquet
+          <https://pypi.org/project/fastparquet>`_ or `pyarrow
+          <https://arrow.apache.org/docs/python/>`_ library.
+        * When saving a DataFrame with categorical columns to parquet,
+          the file size may increase due to the inclusion of all possible
+          categories, not just those present in the data. This behavior
+          is expected and consistent with pandas' handling of categorical data.
+          To manage file size and ensure a more predictable roundtrip process,
+          consider using :meth:`Categorical.remove_unused_categories` on the
+          DataFrame before saving.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
+        >>> df.to_parquet("df.parquet.gzip", compression="gzip")  # doctest: +SKIP
+        >>> pd.read_parquet("df.parquet.gzip")  # doctest: +SKIP
+           col1  col2
+        0     1     3
+        1     2     4
+
+        If you want to get a buffer to the parquet content you can use a io.BytesIO
+        object, as long as you don't use partition_cols, which creates multiple files.
+
+        >>> import io
+        >>> f = io.BytesIO()
+        >>> df.to_parquet(f)
+        >>> f.seek(0)
+        0
+        >>> content = f.read()
+        """
+        from pandas.io.parquet import to_parquet
+
+        return to_parquet(
+            self,
+            path,
+            engine,
+            compression=compression,
+            index=index,
+            partition_cols=partition_cols,
+            storage_options=storage_options,
+            filesystem=filesystem,
+            **kwargs,
+        )
+
+    @overload
+    def to_orc(
+        self,
+        path: None = ...,
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> bytes: ...
+
+    @overload
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> None: ...
+
+    @overload
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None,
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> bytes | None: ...
+
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["pyarrow"] = "pyarrow",
+        index: bool | None = None,
+        engine_kwargs: dict[str, Any] | None = None,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the Optimized Row Columnar (ORC) format.
+
+        Parameters
+        ----------
+        path : str, file-like object or None, default None
+            If a string, it will be used as Root Directory path
+            when writing a partitioned dataset. By file-like object,
+            we refer to objects with a write() method, such as a file handle
+            (e.g. via builtin open function). If path is None,
+            a bytes object is returned.
+        engine : {'pyarrow'}, default 'pyarrow'
+            ORC library to use.
+        index : bool, optional
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``infer`` the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        engine_kwargs : dict[str, Any] or None, default None
+            Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
+
+        Returns
+        -------
+        bytes if no ``path`` argument is provided else None
+            Bytes object with DataFrame data if ``path`` is not specified else None.
+
+        Raises
+        ------
+        NotImplementedError
+            Dtype of one or more columns is category, unsigned integers, interval,
+            period or sparse.
+        ValueError
+            engine is not pyarrow.
+
+        See Also
+        --------
+        read_orc : Read a ORC file.
+        DataFrame.to_parquet : Write a parquet file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        * Find more information on ORC
+          `here <https://en.wikipedia.org/wiki/Apache_ORC>`__.
+        * Before using this function you should read the :ref:`user guide about
+          ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
+        * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
+          library.
+        * For supported dtypes please refer to `supported ORC features in Arrow
+          <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+        * Currently timezones in datetime columns are not preserved when a
+          dataframe is converted into ORC files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> df.to_orc("df.orc")  # doctest: +SKIP
+        >>> pd.read_orc("df.orc")  # doctest: +SKIP
+           col1  col2
+        0     1     4
+        1     2     3
+
+        If you want to get a buffer to the orc content you can write it to io.BytesIO
+
+        >>> import io
+        >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
+        >>> b.seek(0)  # doctest: +SKIP
+        0
+        >>> content = b.read()  # doctest: +SKIP
+        """
+        from pandas.io.orc import to_orc
+
+        return to_orc(
+            self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
+        )
+
+    @overload
+    def to_html(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        columns: Axes | None = ...,
+        col_space: ColspaceArgType | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool | str = ...,
+        decimal: str = ...,
+        bold_rows: bool = ...,
+        classes: str | list | tuple | None = ...,
+        escape: bool = ...,
+        notebook: bool = ...,
+        border: int | bool | None = ...,
+        table_id: str | None = ...,
+        render_links: bool = ...,
+        encoding: str | None = ...,
+    ) -> None: ...
+
+    @overload
+    def to_html(
+        self,
+        buf: None = ...,
+        *,
+        columns: Axes | None = ...,
+        col_space: ColspaceArgType | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool | str = ...,
+        decimal: str = ...,
+        bold_rows: bool = ...,
+        classes: str | list | tuple | None = ...,
+        escape: bool = ...,
+        notebook: bool = ...,
+        border: int | bool | None = ...,
+        table_id: str | None = ...,
+        render_links: bool = ...,
+        encoding: str | None = ...,
+    ) -> str: ...
+
+    @Substitution(
+        header_type="bool",
+        header="Whether to print column labels, default True",
+        col_space_type="str or int, list or dict of int or str",
+        col_space="The minimum width of each column in CSS length "
+        "units.  An int is assumed to be px units.",
+    )
+    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+    def to_html(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        columns: Axes | None = None,
+        col_space: ColspaceArgType | None = None,
+        header: bool = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters: FormattersType | None = None,
+        float_format: FloatFormatType | None = None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        justify: str | None = None,
+        max_rows: int | None = None,
+        max_cols: int | None = None,
+        show_dimensions: bool | str = False,
+        decimal: str = ".",
+        bold_rows: bool = True,
+        classes: str | list | tuple | None = None,
+        escape: bool = True,
+        notebook: bool = False,
+        border: int | bool | None = None,
+        table_id: str | None = None,
+        render_links: bool = False,
+        encoding: str | None = None,
+    ) -> str | None:
+        """
+        Render a DataFrame as an HTML table.
+        %(shared_params)s
+        bold_rows : bool, default True
+            Make the row labels bold in the output.
+        classes : str or list or tuple, default None
+            CSS class(es) to apply to the resulting html table.
+        escape : bool, default True
+            Convert the characters <, >, and & to HTML-safe sequences.
+        notebook : {True, False}, default False
+            Whether the generated HTML is for IPython Notebook.
+        border : int or bool
+            When an integer value is provided, it sets the border attribute in
+            the opening tag, specifying the thickness of the border.
+            If ``False`` or ``0`` is passed, the border attribute will not
+            be present in the ``<table>`` tag.
+            The default value for this parameter is governed by
+            ``pd.options.display.html.border``.
+        table_id : str, optional
+            A css id is included in the opening `<table>` tag if specified.
+        render_links : bool, default False
+            Convert URLs to HTML links.
+        encoding : str, default "utf-8"
+            Set character encoding.
+        %(returns)s
+        See Also
+        --------
+        to_string : Convert DataFrame to a string.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> html_string = df.to_html()
+        >>> print(html_string)
+        <table border="1" class="dataframe">
+          <thead>
+            <tr style="text-align: right;">
+              <th></th>
+              <th>col1</th>
+              <th>col2</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr>
+              <th>0</th>
+              <td>1</td>
+              <td>4</td>
+            </tr>
+            <tr>
+              <th>1</th>
+              <td>2</td>
+              <td>3</td>
+            </tr>
+          </tbody>
+        </table>
+
+        HTML output
+
+        +----+-----+-----+
+        |    |col1 |col2 |
+        +====+=====+=====+
+        |0   |1    |4    |
+        +----+-----+-----+
+        |1   |2    |3    |
+        +----+-----+-----+
+
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> html_string = df.to_html(index=False)
+        >>> print(html_string)
+        <table border="1" class="dataframe">
+          <thead>
+            <tr style="text-align: right;">
+              <th>col1</th>
+              <th>col2</th>
+            </tr>
+          </thead>
+          <tbody>
+            <tr>
+              <td>1</td>
+              <td>4</td>
+            </tr>
+            <tr>
+              <td>2</td>
+              <td>3</td>
+            </tr>
+          </tbody>
+        </table>
+
+        HTML output
+
+        +-----+-----+
+        |col1 |col2 |
+        +=====+=====+
+        |1    |4    |
+        +-----+-----+
+        |2    |3    |
+        +-----+-----+
+        """
+        if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS:
+            raise ValueError("Invalid value for justify parameter")
+
+        formatter = fmt.DataFrameFormatter(
+            self,
+            columns=columns,
+            col_space=col_space,
+            na_rep=na_rep,
+            header=header,
+            index=index,
+            formatters=formatters,
+            float_format=float_format,
+            bold_rows=bold_rows,
+            sparsify=sparsify,
+            justify=justify,
+            index_names=index_names,
+            escape=escape,
+            decimal=decimal,
+            max_rows=max_rows,
+            max_cols=max_cols,
+            show_dimensions=show_dimensions,
+        )
+        # TODO: a generic formatter wld b in DataFrameFormatter
+        return fmt.DataFrameRenderer(formatter).to_html(
+            buf=buf,
+            classes=classes,
+            notebook=notebook,
+            border=border,
+            encoding=encoding,
+            table_id=table_id,
+            render_links=render_links,
+        )
+
+    @overload
+    def to_xml(
+        self,
+        path_or_buffer: None = ...,
+        *,
+        index: bool = ...,
+        root_name: str | None = ...,
+        row_name: str | None = ...,
+        na_rep: str | None = ...,
+        attr_cols: list[str] | None = ...,
+        elem_cols: list[str] | None = ...,
+        namespaces: dict[str | None, str] | None = ...,
+        prefix: str | None = ...,
+        encoding: str = ...,
+        xml_declaration: bool | None = ...,
+        pretty_print: bool | None = ...,
+        parser: XMLParsers | None = ...,
+        stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
+        compression: CompressionOptions = ...,
+        storage_options: StorageOptions | None = ...,
+    ) -> str: ...
+
+    @overload
+    def to_xml(
+        self,
+        path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
+        *,
+        index: bool = ...,
+        root_name: str | None = ...,
+        row_name: str | None = ...,
+        na_rep: str | None = ...,
+        attr_cols: list[str] | None = ...,
+        elem_cols: list[str] | None = ...,
+        namespaces: dict[str | None, str] | None = ...,
+        prefix: str | None = ...,
+        encoding: str = ...,
+        xml_declaration: bool | None = ...,
+        pretty_print: bool | None = ...,
+        parser: XMLParsers | None = ...,
+        stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
+        compression: CompressionOptions = ...,
+        storage_options: StorageOptions | None = ...,
+    ) -> None: ...
+
+    def to_xml(
+        self,
+        path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
+        *,
+        index: bool = True,
+        root_name: str | None = "data",
+        row_name: str | None = "row",
+        na_rep: str | None = None,
+        attr_cols: list[str] | None = None,
+        elem_cols: list[str] | None = None,
+        namespaces: dict[str | None, str] | None = None,
+        prefix: str | None = None,
+        encoding: str = "utf-8",
+        xml_declaration: bool | None = True,
+        pretty_print: bool | None = True,
+        parser: XMLParsers | None = "lxml",
+        stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+    ) -> str | None:
+        """
+        Render a DataFrame to an XML document.
+
+        Parameters
+        ----------
+        path_or_buffer : str, path object, file-like object, or None, default None
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a ``write()`` function. If None, the result is returned
+            as a string.
+        index : bool, default True
+            Whether to include index in XML document.
+        root_name : str, default 'data'
+            The name of root element in XML document.
+        row_name : str, default 'row'
+            The name of row element in XML document.
+        na_rep : str, optional
+            Missing data representation.
+        attr_cols : list-like, optional
+            List of columns to write as attributes in row element.
+            Hierarchical columns will be flattened with underscore
+            delimiting the different levels.
+        elem_cols : list-like, optional
+            List of columns to write as children in row element. By default,
+            all columns output as children of row element. Hierarchical
+            columns will be flattened with underscore delimiting the
+            different levels.
+        namespaces : dict, optional
+            All namespaces to be defined in root element. Keys of dict
+            should be prefix names and values of dict corresponding URIs.
+            Default namespaces should be given empty string key. For
+            example, ::
+
+                namespaces = {{"": "https://example.com"}}
+
+        prefix : str, optional
+            Namespace prefix to be used for every element and/or attribute
+            in document. This should be one of the keys in ``namespaces``
+            dict.
+        encoding : str, default 'utf-8'
+            Encoding of the resulting document.
+        xml_declaration : bool, default True
+            Whether to include the XML declaration at start of document.
+        pretty_print : bool, default True
+            Whether output should be pretty printed with indentation and
+            line breaks.
+        parser : {{'lxml','etree'}}, default 'lxml'
+            Parser module to use for building of tree. Only 'lxml' and
+            'etree' are supported. With 'lxml', the ability to use XSLT
+            stylesheet is supported.
+        stylesheet : str, path object or file-like object, optional
+            A URL, file-like object, or a raw string containing an XSLT
+            script used to transform the raw XML output. Script should use
+            layout of elements and attributes from original output. This
+            argument requires ``lxml`` to be installed. Only XSLT 1.0
+            scripts and not later versions is currently supported.
+        compression : str or dict, default 'infer'
+            For on-the-fly compression of the output data. If 'infer' and
+            'path_or_buffer' is
+            path-like, then detect compression from the following extensions: '.gz',
+            '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+            (otherwise no compression).
+            Set to ``None`` for no compression.
+            Can also be a dict with key ``'method'`` set to one of
+            {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+            other key-value pairs are forwarded to
+            ``zipfile.ZipFile``, ``gzip.GzipFile``,
+            ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+            ``tarfile.TarFile``, respectively.
+            As an example, the following could be passed for faster compression and
+            to create a reproducible gzip archive:
+            ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        Returns
+        -------
+        None or str
+            If ``io`` is None, returns the resulting XML format as a
+            string. Otherwise returns None.
+
+        See Also
+        --------
+        to_json : Convert the pandas object to a JSON string.
+        to_html : Convert DataFrame to a html.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [["square", 360, 4], ["circle", 360, np.nan], ["triangle", 180, 3]],
+        ...     columns=["shape", "degrees", "sides"],
+        ... )
+
+        >>> df.to_xml()  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row>
+            <index>0</index>
+            <shape>square</shape>
+            <degrees>360</degrees>
+            <sides>4.0</sides>
+          </row>
+          <row>
+            <index>1</index>
+            <shape>circle</shape>
+            <degrees>360</degrees>
+            <sides/>
+          </row>
+          <row>
+            <index>2</index>
+            <shape>triangle</shape>
+            <degrees>180</degrees>
+            <sides>3.0</sides>
+          </row>
+        </data>
+
+        >>> df.to_xml(
+        ...     attr_cols=["index", "shape", "degrees", "sides"]
+        ... )  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row index="0" shape="square" degrees="360" sides="4.0"/>
+          <row index="1" shape="circle" degrees="360"/>
+          <row index="2" shape="triangle" degrees="180" sides="3.0"/>
+        </data>
+
+        >>> df.to_xml(
+        ...     namespaces={{"doc": "https://example.com"}}, prefix="doc"
+        ... )  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <doc:data xmlns:doc="https://example.com">
+          <doc:row>
+            <doc:index>0</doc:index>
+            <doc:shape>square</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides>4.0</doc:sides>
+          </doc:row>
+          <doc:row>
+            <doc:index>1</doc:index>
+            <doc:shape>circle</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides/>
+          </doc:row>
+          <doc:row>
+            <doc:index>2</doc:index>
+            <doc:shape>triangle</doc:shape>
+            <doc:degrees>180</doc:degrees>
+            <doc:sides>3.0</doc:sides>
+          </doc:row>
+        </doc:data>
+        """
+
+        from pandas.io.formats.xml import (
+            EtreeXMLFormatter,
+            LxmlXMLFormatter,
+        )
+
+        lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
+        TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter]
+
+        if parser == "lxml":
+            if lxml is not None:
+                TreeBuilder = LxmlXMLFormatter
+            else:
+                raise ImportError(
+                    "lxml not found, please install or use the etree parser."
+                )
+
+        elif parser == "etree":
+            TreeBuilder = EtreeXMLFormatter
+
+        else:
+            raise ValueError("Values for parser can only be lxml or etree.")
+
+        xml_formatter = TreeBuilder(
+            self,
+            path_or_buffer=path_or_buffer,
+            index=index,
+            root_name=root_name,
+            row_name=row_name,
+            na_rep=na_rep,
+            attr_cols=attr_cols,
+            elem_cols=elem_cols,
+            namespaces=namespaces,
+            prefix=prefix,
+            encoding=encoding,
+            xml_declaration=xml_declaration,
+            pretty_print=pretty_print,
+            stylesheet=stylesheet,
+            compression=compression,
+            storage_options=storage_options,
+        )
+
+        return xml_formatter.write_output()
+
+    def to_iceberg(
+        self,
+        table_identifier: str,
+        catalog_name: str | None = None,
+        *,
+        catalog_properties: dict[str, Any] | None = None,
+        location: str | None = None,
+        append: bool = False,
+        snapshot_properties: dict[str, str] | None = None,
+    ) -> None:
+        """
+        Write a DataFrame to an Apache Iceberg table.
+
+        .. versionadded:: 3.0.0
+
+        .. warning::
+
+           to_iceberg is experimental and may change without warning.
+
+        Parameters
+        ----------
+        table_identifier : str
+            Table identifier.
+        catalog_name : str, optional
+            The name of the catalog.
+        catalog_properties : dict of {str: str}, optional
+            The properties that are used next to the catalog configuration.
+        location : str, optional
+            Location for the table.
+        append : bool, default False
+            If ``True``, append data to the table, instead of replacing the content.
+        snapshot_properties : dict of {str: str}, optional
+            Custom properties to be added to the snapshot summary
+
+        See Also
+        --------
+        read_iceberg : Read an Apache Iceberg table.
+        DataFrame.to_parquet : Write a DataFrame in Parquet format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> df.to_iceberg("my_table", catalog_name="my_catalog")  # doctest: +SKIP
+        """
+        from pandas.io.iceberg import to_iceberg
+
+        to_iceberg(
+            self,
+            table_identifier,
+            catalog_name,
+            catalog_properties=catalog_properties,
+            location=location,
+            append=append,
+            snapshot_properties=snapshot_properties,
+        )
+
+    # ----------------------------------------------------------------------
+    def info(
+        self,
+        verbose: bool | None = None,
+        buf: WriteBuffer[str] | None = None,
+        max_cols: int | None = None,
+        memory_usage: bool | str | None = None,
+        show_counts: bool | None = None,
+    ) -> None:
+        """
+        Print a concise summary of a DataFrame.
+
+        This method prints information about a DataFrame including
+        the index dtype and columns, non-NA values and memory usage.
+
+        Parameters
+        ----------
+        verbose : bool, optional
+            Whether to print the full summary. By default, the setting in
+            ``pandas.options.display.max_info_columns`` is followed.
+        buf : writable buffer, defaults to sys.stdout
+            Where to send the output. By default, the output is printed to
+            sys.stdout. Pass a writable buffer if you need to further process
+            the output.
+        max_cols : int, optional
+            When to switch from the verbose to the truncated output. If the
+            DataFrame has more than `max_cols` columns, the truncated output
+            is used. By default, the setting in
+            ``pandas.options.display.max_info_columns`` is used.
+        memory_usage : bool, str, optional
+            Specifies whether total memory usage of the DataFrame
+            elements (including the index) should be displayed. By default,
+            this follows the ``pandas.options.display.memory_usage`` setting.
+
+            True always show memory usage. False never shows memory usage.
+            A value of 'deep' is equivalent to "True with deep introspection".
+            Memory usage is shown in human-readable units (base-2
+            representation). Without deep introspection a memory estimation is
+            made based in column dtype and number of rows assuming values
+            consume the same memory amount for corresponding dtypes. With deep
+            memory introspection, a real memory usage calculation is performed
+            at the cost of computational resources. See the
+            :ref:`Frequently Asked Questions <df-memory-usage>` for more
+            details.
+        show_counts : bool, optional
+            Whether to show the non-null counts. By default, this is shown
+            only if the DataFrame is smaller than
+            ``pandas.options.display.max_info_rows`` and
+            ``pandas.options.display.max_info_columns``. A value of True always
+            shows the counts, and False never shows the counts.
+
+        Returns
+        -------
+        None
+            This method prints a summary of a DataFrame and returns None.
+
+        See Also
+        --------
+        DataFrame.describe: Generate descriptive statistics of DataFrame
+            columns.
+        DataFrame.memory_usage: Memory usage of DataFrame columns.
+
+        Examples
+        --------
+        >>> int_values = [1, 2, 3, 4, 5]
+        >>> text_values = ["alpha", "beta", "gamma", "delta", "epsilon"]
+        >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "int_col": int_values,
+        ...         "text_col": text_values,
+        ...         "float_col": float_values,
+        ...     }
+        ... )
+        >>> df
+            int_col text_col  float_col
+        0        1    alpha       0.00
+        1        2     beta       0.25
+        2        3    gamma       0.50
+        3        4    delta       0.75
+        4        5  epsilon       1.00
+
+        Prints information of all columns:
+
+        >>> df.info(verbose=True)
+        <class 'pandas.DataFrame'>
+        RangeIndex: 5 entries, 0 to 4
+        Data columns (total 3 columns):
+         #   Column     Non-Null Count  Dtype
+        ---  ------     --------------  -----
+         0   int_col    5 non-null      int64
+         1   text_col   5 non-null      str
+         2   float_col  5 non-null      float64
+        dtypes: float64(1), int64(1), str(1)
+        memory usage: 278.0 bytes
+
+        Prints a summary of columns count and its dtypes but not per column
+        information:
+
+        >>> df.info(verbose=False)
+        <class 'pandas.DataFrame'>
+        RangeIndex: 5 entries, 0 to 4
+        Columns: 3 entries, int_col to float_col
+        dtypes: float64(1), int64(1), str(1)
+        memory usage: 278.0 bytes
+
+        Pipe output of DataFrame.info to buffer instead of sys.stdout, get
+        buffer content and writes to a text file:
+
+        >>> import io
+        >>> buffer = io.StringIO()
+        >>> df.info(buf=buffer)
+        >>> s = buffer.getvalue()
+        >>> with open("df_info.txt", "w", encoding="utf-8") as f:  # doctest: +SKIP
+        ...     f.write(s)
+        260
+
+        The `memory_usage` parameter allows deep introspection mode, specially
+        useful for big DataFrames and fine-tune memory optimization:
+
+        >>> random_strings_array = np.random.choice(["a", "b", "c"], 10**6)
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "column_1": np.random.choice(["a", "b", "c"], 10**6),
+        ...         "column_2": np.random.choice(["a", "b", "c"], 10**6),
+        ...         "column_3": np.random.choice(["a", "b", "c"], 10**6),
+        ...     }
+        ... )
+        >>> df.info()
+        <class 'pandas.DataFrame'>
+        RangeIndex: 1000000 entries, 0 to 999999
+        Data columns (total 3 columns):
+         #   Column    Non-Null Count    Dtype
+        ---  ------    --------------    -----
+         0   column_1  1000000 non-null  str
+         1   column_2  1000000 non-null  str
+         2   column_3  1000000 non-null  str
+        dtypes: str(3)
+        memory usage: 25.7 MB
+
+        >>> df.info(memory_usage="deep")
+        <class 'pandas.DataFrame'>
+        RangeIndex: 1000000 entries, 0 to 999999
+        Data columns (total 3 columns):
+         #   Column    Non-Null Count    Dtype
+        ---  ------    --------------    -----
+         0   column_1  1000000 non-null  str
+         1   column_2  1000000 non-null  str
+         2   column_3  1000000 non-null  str
+        dtypes: str(3)
+        memory usage: 25.7 MB
+        """
+        info = DataFrameInfo(
+            data=self,
+            memory_usage=memory_usage,
+        )
+        info.render(
+            buf=buf,
+            max_cols=max_cols,
+            verbose=verbose,
+            show_counts=show_counts,
+        )
+
+    def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
+        """
+        Return the memory usage of each column in bytes.
+
+        The memory usage can optionally include the contribution of
+        the index and elements of `object` dtype.
+
+        This value is displayed in `DataFrame.info` by default. This can be
+        suppressed by setting ``pandas.options.display.memory_usage`` to False.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Specifies whether to include the memory usage of the DataFrame's
+            index in returned Series. If ``index=True``, the memory usage of
+            the index is the first item in the output.
+        deep : bool, default False
+            If True, introspect the data deeply by interrogating
+            `object` dtypes for system-level memory consumption, and include
+            it in the returned values.
+
+        Returns
+        -------
+        Series
+            A Series whose index is the original column names and whose values
+            is the memory usage of each column in bytes.
+
+        See Also
+        --------
+        numpy.ndarray.nbytes : Total bytes consumed by the elements of an
+            ndarray.
+        Series.memory_usage : Bytes consumed by a Series.
+        Categorical : Memory-efficient array for string values with
+            many repeated values.
+        DataFrame.info : Concise summary of a DataFrame.
+
+        Notes
+        -----
+        See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
+        details.
+
+        Examples
+        --------
+        >>> dtypes = ["int64", "float64", "complex128", "object", "bool"]
+        >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) for t in dtypes])
+        >>> df = pd.DataFrame(data)
+        >>> df.head()
+           int64  float64            complex128  object  bool
+        0      1      1.0              1.0+0.0j       1  True
+        1      1      1.0              1.0+0.0j       1  True
+        2      1      1.0              1.0+0.0j       1  True
+        3      1      1.0              1.0+0.0j       1  True
+        4      1      1.0              1.0+0.0j       1  True
+
+        >>> df.memory_usage()
+        Index           132
+        int64         40000
+        float64       40000
+        complex128    80000
+        object        40000
+        bool           5000
+        dtype: int64
+
+        >>> df.memory_usage(index=False)
+        int64         40000
+        float64       40000
+        complex128    80000
+        object        40000
+        bool           5000
+        dtype: int64
+
+        The memory footprint of `object` dtype columns is ignored by default:
+
+        >>> df.memory_usage(deep=True)
+        Index            132
+        int64          40000
+        float64        40000
+        complex128     80000
+        object        180000
+        bool            5000
+        dtype: int64
+
+        Use a Categorical for efficient storage of an object-dtype column with
+        many repeated values.
+
+        >>> df["object"].astype("category").memory_usage(deep=True)
+        5140
+        """
+        result = self._constructor_sliced(
+            [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
+            index=self.columns,
+            dtype=np.intp,
+        )
+        if index:
+            index_memory_usage = self._constructor_sliced(
+                self.index.memory_usage(deep=deep), index=["Index"]
+            )
+            result = index_memory_usage._append_internal(result)
+        return result
+
+    def transpose(
+        self,
+        *args,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> DataFrame:
+        """
+        Transpose index and columns.
+
+        Reflect the DataFrame over its main diagonal by writing rows as columns
+        and vice-versa. The property :attr:`.T` is an accessor to the method
+        :meth:`transpose`.
+
+        Parameters
+        ----------
+        *args : tuple, optional
+            Accepted for compatibility with NumPy.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            Note that a copy is always required for mixed dtype DataFrames,
+            or for DataFrames with any extension types.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        DataFrame
+            The transposed DataFrame.
+
+        See Also
+        --------
+        numpy.transpose : Permute the dimensions of a given array.
+
+        Notes
+        -----
+        Transposing a DataFrame with mixed dtypes will result in a homogeneous
+        DataFrame with the `object` dtype. In such a case, a copy of the data
+        is always made.
+
+        Examples
+        --------
+        **Square DataFrame with homogeneous dtype**
+
+        >>> d1 = {"col1": [1, 2], "col2": [3, 4]}
+        >>> df1 = pd.DataFrame(data=d1)
+        >>> df1
+           col1  col2
+        0     1     3
+        1     2     4
+
+        >>> df1_transposed = df1.T  # or df1.transpose()
+        >>> df1_transposed
+              0  1
+        col1  1  2
+        col2  3  4
+
+        When the dtype is homogeneous in the original DataFrame, we get a
+        transposed DataFrame with the same dtype:
+
+        >>> df1.dtypes
+        col1    int64
+        col2    int64
+        dtype: object
+        >>> df1_transposed.dtypes
+        0    int64
+        1    int64
+        dtype: object
+
+        **Non-square DataFrame with mixed dtypes**
+
+        >>> d2 = {
+        ...     "name": ["Alice", "Bob"],
+        ...     "score": [9.5, 8],
+        ...     "employed": [False, True],
+        ...     "kids": [0, 0],
+        ... }
+        >>> df2 = pd.DataFrame(data=d2)
+        >>> df2
+            name  score  employed  kids
+        0  Alice    9.5     False     0
+        1    Bob    8.0      True     0
+
+        >>> df2_transposed = df2.T  # or df2.transpose()
+        >>> df2_transposed
+                      0     1
+        name      Alice   Bob
+        score       9.5   8.0
+        employed  False  True
+        kids          0     0
+
+        When the DataFrame has mixed dtypes, we get a transposed DataFrame with
+        the `object` dtype:
+
+        >>> df2.dtypes
+        name            str
+        score       float64
+        employed       bool
+        kids          int64
+        dtype: object
+        >>> df2_transposed.dtypes
+        0    object
+        1    object
+        dtype: object
+        """
+        self._check_copy_deprecation(copy)
+        nv.validate_transpose(args, {})
+        # construct the args
+
+        first_dtype = self.dtypes.iloc[0] if len(self.columns) else None
+
+        if self._can_fast_transpose:
+            # Note: tests pass without this, but this improves perf quite a bit.
+            new_vals = self._values.T
+
+            result = self._constructor(
+                new_vals,
+                index=self.columns,
+                columns=self.index,
+                copy=False,
+                dtype=new_vals.dtype,
+            )
+            if len(self) > 0:
+                result._mgr.add_references(self._mgr)
+
+        elif (
+            self._is_homogeneous_type
+            and first_dtype is not None
+            and isinstance(first_dtype, ExtensionDtype)
+        ):
+            new_values: list
+            if isinstance(first_dtype, BaseMaskedDtype):
+                # We have masked arrays with the same dtype. We can transpose faster.
+                from pandas.core.arrays.masked import (
+                    transpose_homogeneous_masked_arrays,
+                )
+
+                new_values = transpose_homogeneous_masked_arrays(
+                    cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
+                )
+            elif isinstance(first_dtype, ArrowDtype):
+                # We have arrow EAs with the same dtype. We can transpose faster.
+                from pandas.core.arrays.arrow.array import (
+                    ArrowExtensionArray,
+                    transpose_homogeneous_pyarrow,
+                )
+
+                new_values = transpose_homogeneous_pyarrow(
+                    cast(Sequence[ArrowExtensionArray], self._iter_column_arrays())
+                )
+            else:
+                # We have other EAs with the same dtype. We preserve dtype in transpose.
+                arr_typ = first_dtype.construct_array_type()
+                values = self.values
+                new_values = [
+                    arr_typ._from_sequence(row, dtype=first_dtype) for row in values
+                ]
+
+            result = type(self)._from_arrays(
+                new_values,
+                index=self.columns,
+                columns=self.index,
+                verify_integrity=False,
+            )
+
+        else:
+            new_arr = self.values.T
+            result = self._constructor(
+                new_arr,
+                index=self.columns,
+                columns=self.index,
+                dtype=new_arr.dtype,
+                # We already made a copy (more than one block)
+                copy=False,
+            )
+
+        return result.__finalize__(self, method="transpose")
+
+    @property
+    def T(self) -> DataFrame:
+        """
+        The transpose of the DataFrame.
+
+        Returns
+        -------
+        DataFrame
+            The transposed DataFrame.
+
+        See Also
+        --------
+        DataFrame.transpose : Transpose index and columns.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df
+           col1  col2
+        0     1     3
+        1     2     4
+
+        >>> df.T
+              0  1
+        col1  1  2
+        col2  3  4
+        """
+        return self.transpose()
+
+    # ----------------------------------------------------------------------
+    # Indexing Methods
+
+    def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
+        """
+        Parameters
+        ----------
+        i : int
+        axis : int
+
+        Returns
+        -------
+        Series
+        """
+        # irow
+        if axis == 0:
+            new_mgr = self._mgr.fast_xs(i)
+
+            result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
+            result._name = self.index[i]
+            return result.__finalize__(self)
+
+        # icol
+        else:
+            col_mgr = self._mgr.iget(i)
+            return self._box_col_values(col_mgr, i)
+
+    def _get_column_array(self, i: int) -> ArrayLike:
+        """
+        Get the values of the i'th column (ndarray or ExtensionArray, as stored
+        in the Block)
+
+        Warning! The returned array is a view but doesn't handle Copy-on-Write,
+        so this should be used with caution (for read-only purposes).
+        """
+        return self._mgr.iget_values(i)
+
+    def _iter_column_arrays(self) -> Iterator[ArrayLike]:
+        """
+        Iterate over the arrays of all columns in order.
+        This returns the values as stored in the Block (ndarray or ExtensionArray).
+
+        Warning! The returned array is a view but doesn't handle Copy-on-Write,
+        so this should be used with caution (for read-only purposes).
+        """
+        for i in range(len(self.columns)):
+            yield self._get_column_array(i)
+
+    def __getitem__(self, key):
+        check_dict_or_set_indexers(key)
+        key = lib.item_from_zerodim(key)
+        key = com.apply_if_callable(key, self)
+
+        if is_hashable(key, allow_slice=False) and not is_iterator(key):
+            # is_iterator to exclude generator e.g. test_getitem_listlike
+            # As of Python 3.12, slice is hashable which breaks MultiIndex (GH#57500)
+
+            # shortcut if the key is in columns
+            is_mi = isinstance(self.columns, MultiIndex)
+            # GH#45316 Return view if key is not duplicated
+            # Only use drop_duplicates with duplicates for performance
+            if not is_mi and (
+                (self.columns.is_unique and key in self.columns)
+                or key in self.columns.drop_duplicates(keep=False)
+            ):
+                return self._get_item(key)
+
+            elif is_mi and self.columns.is_unique and key in self.columns:
+                return self._getitem_multilevel(key)
+
+        # Do we have a slicer (on rows)?
+        if isinstance(key, slice):
+            return self._getitem_slice(key)
+
+        # Do we have a (boolean) DataFrame?
+        if isinstance(key, DataFrame):
+            return self.where(key)
+
+        # Do we have a (boolean) 1d indexer?
+        if com.is_bool_indexer(key):
+            return self._getitem_bool_array(key)
+
+        # We are left with two options: a single key, and a collection of keys,
+        # We interpret tuples as collections only for non-MultiIndex
+        is_single_key = isinstance(key, tuple) or not is_list_like(key)
+
+        if is_single_key:
+            if self.columns.nlevels > 1:
+                return self._getitem_multilevel(key)
+            indexer = self.columns.get_loc(key)
+            if is_integer(indexer):
+                indexer = [indexer]
+        else:
+            if is_iterator(key):
+                key = list(key)
+            indexer = self.columns._get_indexer_strict(key, "columns")[1]
+
+        # take() does not accept boolean indexers
+        if getattr(indexer, "dtype", None) == bool:
+            indexer = np.where(indexer)[0]
+
+        if isinstance(indexer, slice):
+            return self._slice(indexer, axis=1)
+
+        data = self.take(indexer, axis=1)
+
+        if is_single_key:
+            # What does looking for a single key in a non-unique index return?
+            # The behavior is inconsistent. It returns a Series, except when
+            # - the key itself is repeated (test on data.shape, #9519), or
+            # - we have a MultiIndex on columns (test on self.columns, #21309)
+            if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
+                # GH#26490 using data[key] can cause RecursionError
+                return data._get_item(key)
+
+        return data
+
+    def _getitem_bool_array(self, key):
+        # also raises Exception if object array with NA values
+        # warning here just in case -- previously __setitem__ was
+        # reindexing but __getitem__ was not; it seems more reasonable to
+        # go with the __setitem__ behavior since that is more consistent
+        # with all other indexing behavior
+        if isinstance(key, Series) and not key.index.equals(self.index):
+            warnings.warn(
+                "Boolean Series key will be reindexed to match DataFrame index.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+        elif len(key) != len(self.index):
+            raise ValueError(
+                f"Item wrong length {len(key)} instead of {len(self.index)}."
+            )
+
+        # check_bool_indexer will throw exception if Series key cannot
+        # be reindexed to match DataFrame rows
+        key = check_bool_indexer(self.index, key)
+
+        if key.all():
+            return self.copy(deep=False)
+
+        indexer = key.nonzero()[0]
+        return self.take(indexer, axis=0)
+
+    def _getitem_multilevel(self, key):
+        # self.columns is a MultiIndex
+        loc = self.columns.get_loc(key)
+        if isinstance(loc, (slice, np.ndarray)):
+            new_columns = self.columns[loc]
+            result_columns = maybe_droplevels(new_columns, key)
+            result = self.iloc[:, loc]
+            result.columns = result_columns
+
+            # If there is only one column being returned, and its name is
+            # either an empty string, or a tuple with an empty string as its
+            # first element, then treat the empty string as a placeholder
+            # and return the column as if the user had provided that empty
+            # string in the key. If the result is a Series, exclude the
+            # implied empty string from its name.
+            if len(result.columns) == 1:
+                # e.g. test_frame_getitem_multicolumn_empty_level,
+                #  test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
+                top = result.columns[0]
+                if isinstance(top, tuple):
+                    top = top[0]
+                if top == "":
+                    result = result[""]
+                    if isinstance(result, Series):
+                        result = self._constructor_sliced(
+                            result, index=self.index, name=key
+                        )
+
+            return result
+        else:
+            # loc is neither a slice nor ndarray, so must be an int
+            return self._ixs(loc, axis=1)
+
+    def _get_value(self, index, col, takeable: bool = False) -> Scalar:
+        """
+        Quickly retrieve single value at passed column and index.
+
+        Parameters
+        ----------
+        index : row label
+        col : column label
+        takeable : interpret the index/col as indexers, default False
+
+        Returns
+        -------
+        scalar
+
+        Notes
+        -----
+        Assumes that both `self.index._index_as_unique` and
+        `self.columns._index_as_unique`; Caller is responsible for checking.
+        """
+        if takeable:
+            series = self._ixs(col, axis=1)
+            return series._values[index]
+
+        series = self._get_item(col)
+
+        if not isinstance(self.index, MultiIndex):
+            # CategoricalIndex: Trying to use the engine fastpath may give incorrect
+            #  results if our categories are integers that dont match our codes
+            # IntervalIndex: IntervalTree has no get_loc
+            row = self.index.get_loc(index)
+            return series._values[row]
+
+        # For MultiIndex going through engine effectively restricts us to
+        #  same-length tuples; see test_get_set_value_no_partial_indexing
+        loc = self.index._engine.get_loc(index)
+        return series._values[loc]
+
+    def isetitem(self, loc, value) -> None:
+        """
+        Set the given value in the column with position `loc`.
+
+        This is a positional analogue to ``__setitem__``.
+
+        Parameters
+        ----------
+        loc : int or sequence of ints
+            Index position for the column.
+        value : scalar or arraylike
+            Value(s) for the column.
+
+        See Also
+        --------
+        DataFrame.iloc : Purely integer-location based indexing for selection by
+            position.
+
+        Notes
+        -----
+        ``frame.isetitem(loc, value)`` is an in-place method as it will
+        modify the DataFrame in place (not returning a new object). In contrast to
+        ``frame.iloc[:, i] = value`` which will try to update the existing values in
+        place, ``frame.isetitem(loc, value)`` will not update the values of the column
+        itself in place, it will instead insert a new array.
+
+        In cases where ``frame.columns`` is unique, this is equivalent to
+        ``frame[frame.columns[i]] = value``.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        >>> df.isetitem(1, [5, 6])
+        >>> df
+              A  B
+        0     1  5
+        1     2  6
+        """
+        if isinstance(value, DataFrame):
+            if is_integer(loc):
+                loc = [loc]
+
+            if len(loc) != len(value.columns):
+                raise ValueError(
+                    f"Got {len(loc)} positions but value has {len(value.columns)} "
+                    f"columns."
+                )
+
+            for i, idx in enumerate(loc):
+                arraylike, refs = self._sanitize_column(value.iloc[:, i])
+                self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs)
+            return
+
+        arraylike, refs = self._sanitize_column(value)
+        self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs)
+
+    def __setitem__(self, key, value) -> None:
+        """
+        Set item(s) in DataFrame by key.
+
+        This method allows you to set the values of one or more columns in the
+        DataFrame using a key. If the key does not exist, a new
+        column will be created.
+
+        Parameters
+        ----------
+        key : The object(s) in the index which are to be assigned to
+            Column label(s) to set. Can be a single column name, list of column names,
+            or tuple for MultiIndex columns.
+        value : scalar, array-like, Series, or DataFrame
+            Value(s) to set for the specified key(s).
+
+        Returns
+        -------
+        None
+            This method does not return a value.
+
+        See Also
+        --------
+        DataFrame.loc : Access and set values by label-based indexing.
+        DataFrame.iloc : Access and set values by position-based indexing.
+        DataFrame.assign : Assign new columns to a DataFrame.
+
+        Notes
+        -----
+        When assigning a Series to a DataFrame column, pandas aligns the Series
+        by index labels, not by position. This means:
+
+        * Values from the Series are matched to DataFrame rows by index label
+        * If a Series index label doesn't exist in the DataFrame index, it's ignored
+        * If a DataFrame index label doesn't exist in the Series index, NaN is assigned
+        * The order of values in the Series doesn't matter; only the index labels matter
+
+        Examples
+        --------
+        Basic column assignment:
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3]})
+        >>> df["B"] = [4, 5, 6]  # Assigns by position
+        >>> df
+            A  B
+        0  1  4
+        1  2  5
+        2  3  6
+
+        Series assignment with index alignment:
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 2])
+        >>> s = pd.Series([10, 20], index=[1, 3])  # Note: index 3 doesn't exist in df
+        >>> df["B"] = s  # Assigns by index label, not position
+        >>> df
+           A     B
+        0  1   NaN
+        1  2  10.0
+        2  3   NaN
+
+        Series assignment with partial index match:
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3, 4]}, index=["a", "b", "c", "d"])
+        >>> s = pd.Series([100, 200], index=["b", "d"])
+        >>> df["B"] = s
+        >>> df
+           A      B
+        a  1    NaN
+        b  2  100.0
+        c  3    NaN
+        d  4  200.0
+
+        Series index labels NOT in DataFrame, ignored:
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=["x", "y", "z"])
+        >>> s = pd.Series([10, 20, 30, 40, 50], index=["x", "y", "a", "b", "z"])
+        >>> df["B"] = s
+        >>> df
+           A   B
+        x  1  10
+        y  2  20
+        z  3  50
+        """
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(self) <= REF_COUNT and not com.is_local_in_caller_frame(
+                self
+            ):
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+
+        key = com.apply_if_callable(key, self)
+
+        # see if we can slice the rows
+        if isinstance(key, slice):
+            slc = self.index._convert_slice_indexer(key, kind="getitem")
+            return self._setitem_slice(slc, value)
+
+        if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
+            self._setitem_frame(key, value)
+        elif isinstance(key, (Series, np.ndarray, list, Index)):
+            self._setitem_array(key, value)
+        elif isinstance(value, DataFrame):
+            self._set_item_frame_value(key, value)
+        elif (
+            is_list_like(value)
+            and not self.columns.is_unique
+            and 1 < len(self.columns.get_indexer_for([key])) == len(value)
+        ):
+            # Column to set is duplicated
+            self._setitem_array([key], value)
+        else:
+            # set column
+            self._set_item(key, value)
+
+    def _setitem_slice(self, key: slice, value) -> None:
+        # NB: we can't just use self.loc[key] = value because that
+        #  operates on labels and we need to operate positional for
+        #  backwards-compat, xref GH#31469
+        self.iloc[key] = value
+
+    def _setitem_array(self, key, value) -> None:
+        # also raises Exception if object array with NA values
+        if com.is_bool_indexer(key):
+            # bool indexer is indexing along rows
+            if len(key) != len(self.index):
+                raise ValueError(
+                    f"Item wrong length {len(key)} instead of {len(self.index)}!"
+                )
+            key = check_bool_indexer(self.index, key)
+            indexer = key.nonzero()[0]
+            if isinstance(value, DataFrame):
+                # GH#39931 reindex since iloc does not align
+                value = value.reindex(self.index.take(indexer))
+            self.iloc[indexer] = value
+
+        # Note: unlike self.iloc[:, indexer] = value, this will
+        #  never try to overwrite values inplace
+
+        elif isinstance(value, DataFrame):
+            check_key_length(self.columns, key, value)
+            for k1, k2 in zip(key, value.columns, strict=False):
+                self[k1] = value[k2]
+
+        elif not is_list_like(value):
+            for col in key:
+                self[col] = value
+
+        elif isinstance(value, np.ndarray) and value.ndim == 2:
+            self._iset_not_inplace(key, value)
+
+        elif np.ndim(value) > 1:
+            # list of lists
+            value = DataFrame(value).values
+            self._setitem_array(key, value)
+
+        else:
+            self._iset_not_inplace(key, value)
+
+    def _iset_not_inplace(self, key, value) -> None:
+        # GH#39510 when setting with df[key] = obj with a list-like key and
+        #  list-like value, we iterate over those listlikes and set columns
+        #  one at a time.  This is different from dispatching to
+        #  `self.loc[:, key]= value`  because loc.__setitem__ may overwrite
+        #  data inplace, whereas this will insert new arrays.
+
+        def igetitem(obj, i: int):
+            # Note: we catch DataFrame obj before getting here, but
+            #  hypothetically would return obj.iloc[:, i]
+            if isinstance(obj, np.ndarray):
+                return obj[..., i]
+            else:
+                return obj[i]
+
+        if self.columns.is_unique:
+            if np.shape(value)[-1] != len(key):
+                raise ValueError("Columns must be same length as key")
+
+            for i, col in enumerate(key):
+                self[col] = igetitem(value, i)
+
+        else:
+            ilocs = self.columns.get_indexer_non_unique(key)[0]
+            if (ilocs < 0).any():
+                # key entries not in self.columns
+                raise NotImplementedError
+
+            if np.shape(value)[-1] != len(ilocs):
+                raise ValueError("Columns must be same length as key")
+
+            assert np.ndim(value) <= 2
+
+            orig_columns = self.columns
+
+            # Using self.iloc[:, i] = ... may set values inplace, which
+            #  by convention we do not do in __setitem__
+            try:
+                self.columns = Index(range(len(self.columns)))
+                for i, iloc in enumerate(ilocs):
+                    self[iloc] = igetitem(value, i)
+            finally:
+                self.columns = orig_columns
+
+    def _setitem_frame(self, key, value) -> None:
+        # support boolean setting with DataFrame input, e.g.
+        # df[df > df2] = 0
+        if isinstance(key, np.ndarray):
+            if key.shape != self.shape:
+                raise ValueError("Array conditional must be same shape as self")
+            key = self._constructor(key, **self._construct_axes_dict(), copy=False)
+
+        if key.size and not all(is_bool_dtype(blk.dtype) for blk in key._mgr.blocks):
+            raise TypeError(
+                "Must pass DataFrame or 2-d ndarray with boolean values only"
+            )
+
+        self._where(-key, value, inplace=True)
+
+    def _set_item_frame_value(self, key, value: DataFrame) -> None:
+        self._ensure_valid_index(value)
+
+        # align columns
+        if key in self.columns:
+            loc = self.columns.get_loc(key)
+            cols = self.columns[loc]
+            len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
+            if len_cols != len(value.columns):
+                raise ValueError("Columns must be same length as key")
+
+            # align right-hand-side columns if self.columns
+            # is multi-index and self[key] is a sub-frame
+            if isinstance(self.columns, MultiIndex) and isinstance(
+                loc, (slice, Series, np.ndarray, Index)
+            ):
+                cols_droplevel = maybe_droplevels(cols, key)
+                if (
+                    not isinstance(cols_droplevel, MultiIndex)
+                    and is_string_dtype(cols_droplevel.dtype)
+                    and not cols_droplevel.any()
+                ):
+                    # if cols_droplevel contains only empty strings,
+                    # value.reindex(cols_droplevel, axis=1) would be full of NaNs
+                    # see GH#62518 and GH#61841
+                    return
+                if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
+                    value = value.reindex(cols_droplevel, axis=1)
+
+                for col, col_droplevel in zip(cols, cols_droplevel, strict=True):
+                    self[col] = value[col_droplevel]
+                return
+
+            if is_scalar(cols):
+                self[cols] = value[value.columns[0]]
+                return
+
+            locs: np.ndarray | list
+            if isinstance(loc, slice):
+                locs = np.arange(loc.start, loc.stop, loc.step)
+            elif is_scalar(loc):
+                locs = [loc]
+            else:
+                locs = loc.nonzero()[0]
+
+            return self.isetitem(locs, value)
+
+        if len(value.columns) > 1:
+            raise ValueError(
+                "Cannot set a DataFrame with multiple columns to the single "
+                f"column {key}"
+            )
+        elif len(value.columns) == 0:
+            raise ValueError(
+                f"Cannot set a DataFrame without columns to the column {key}"
+            )
+
+        self[key] = value[value.columns[0]]
+
+    def _iset_item_mgr(
+        self,
+        loc: int | slice | np.ndarray,
+        value,
+        inplace: bool = False,
+        refs: BlockValuesRefs | None = None,
+    ) -> None:
+        # when called from _set_item_mgr loc can be anything returned from get_loc
+        self._mgr.iset(loc, value, inplace=inplace, refs=refs)
+
+    def _set_item_mgr(
+        self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
+    ) -> None:
+        try:
+            loc = self._info_axis.get_loc(key)
+        except KeyError:
+            # This item wasn't present, just insert at end
+            self._mgr.insert(len(self._info_axis), key, value, refs)
+        else:
+            self._iset_item_mgr(loc, value, refs=refs)
+
+    def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None:
+        # We are only called from _replace_columnwise which guarantees that
+        # no reindex is necessary
+        self._iset_item_mgr(loc, value._values, inplace=inplace, refs=value._references)
+
+    def _set_item(self, key, value) -> None:
+        """
+        Add series to DataFrame in specified column.
+
+        If series is a numpy-array (not a Series/TimeSeries), it must be the
+        same length as the DataFrames index or an error will be thrown.
+
+        Series/TimeSeries will be conformed to the DataFrames index to
+        ensure homogeneity.
+        """
+        value, refs = self._sanitize_column(value)
+
+        if (
+            key in self.columns
+            and value.ndim == 1
+            and not isinstance(value.dtype, ExtensionDtype)
+        ):
+            # broadcast across multiple columns if necessary
+            if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
+                existing_piece = self[key]
+                if isinstance(existing_piece, DataFrame):
+                    value = np.tile(value, (len(existing_piece.columns), 1)).T
+                    refs = None
+
+        self._set_item_mgr(key, value, refs)
+
+    def _set_value(
+        self, index: IndexLabel, col, value: Scalar, takeable: bool = False
+    ) -> None:
+        """
+        Put single value at passed column and index.
+
+        Parameters
+        ----------
+        index : Label
+            row label
+        col : Label
+            column label
+        value : scalar
+        takeable : bool, default False
+            Sets whether or not index/col interpreted as indexers
+        """
+        try:
+            if takeable:
+                icol = col
+                iindex = cast(int, index)
+            else:
+                icol = self.columns.get_loc(col)
+                iindex = self.index.get_loc(index)
+            self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
+
+        except (KeyError, TypeError, ValueError, LossySetitemError):
+            # get_loc might raise a KeyError for missing labels (falling back
+            #  to (i)loc will do expansion of the index)
+            # column_setitem will do validation that may raise TypeError,
+            #  ValueError, or LossySetitemError
+            # set using a non-recursive method & reset the cache
+            if takeable:
+                self.iloc[index, col] = value
+            else:
+                self.loc[index, col] = value
+
+        except InvalidIndexError as ii_err:
+            # GH48729: Seems like you are trying to assign a value to a
+            # row when only scalar options are permitted
+            raise InvalidIndexError(
+                f"You can only assign a scalar value not a {type(value)}"
+            ) from ii_err
+
+    def _ensure_valid_index(self, value) -> None:
+        """
+        Ensure that if we don't have an index, that we can create one from the
+        passed value.
+        """
+        # GH5632, make sure that we are a Series convertible
+        if not len(self.index) and is_list_like(value) and len(value):
+            if not isinstance(value, DataFrame):
+                try:
+                    value = Series(value)
+                except (ValueError, NotImplementedError, TypeError) as err:
+                    raise ValueError(
+                        "Cannot set a frame with no defined index "
+                        "and a value that cannot be converted to a Series"
+                    ) from err
+
+            # GH31368 preserve name of index
+            index_copy = value.index.copy()
+            if self.index.name is not None:
+                index_copy.name = self.index.name
+
+            self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
+
+    def _box_col_values(self, values: SingleBlockManager, loc: int) -> Series:
+        """
+        Provide boxed values for a column.
+        """
+        # Lookup in columns so that if e.g. a str datetime was passed
+        #  we attach the Timestamp object as the name.
+        name = self.columns[loc]
+        # We get index=self.index bc values is a SingleBlockManager
+        obj = self._constructor_sliced_from_mgr(values, axes=values.axes)
+        obj._name = name
+        return obj.__finalize__(self)
+
+    def _get_item(self, item: Hashable) -> Series:
+        loc = self.columns.get_loc(item)
+        return self._ixs(loc, axis=1)
+
+    # ----------------------------------------------------------------------
+    # Unsorted
+
+    @overload
+    def query(
+        self,
+        expr: str,
+        *,
+        parser: Literal["pandas", "python"] = ...,
+        engine: Literal["python", "numexpr"] | None = ...,
+        local_dict: dict[str, Any] | None = ...,
+        global_dict: dict[str, Any] | None = ...,
+        resolvers: list[Mapping] | None = ...,
+        level: int = ...,
+        inplace: Literal[False] = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def query(
+        self,
+        expr: str,
+        *,
+        parser: Literal["pandas", "python"] = ...,
+        engine: Literal["python", "numexpr"] | None = ...,
+        local_dict: dict[str, Any] | None = ...,
+        global_dict: dict[str, Any] | None = ...,
+        resolvers: list[Mapping] | None = ...,
+        level: int = ...,
+        inplace: Literal[True],
+    ) -> None: ...
+
+    @overload
+    def query(
+        self,
+        expr: str,
+        *,
+        parser: Literal["pandas", "python"] = ...,
+        engine: Literal["python", "numexpr"] | None = ...,
+        local_dict: dict[str, Any] | None = ...,
+        global_dict: dict[str, Any] | None = ...,
+        resolvers: list[Mapping] | None = ...,
+        level: int = ...,
+        inplace: bool = ...,
+    ) -> DataFrame | None: ...
+
+    def query(
+        self,
+        expr: str,
+        *,
+        parser: Literal["pandas", "python"] = "pandas",
+        engine: Literal["python", "numexpr"] | None = None,
+        local_dict: dict[str, Any] | None = None,
+        global_dict: dict[str, Any] | None = None,
+        resolvers: list[Mapping] | None = None,
+        level: int = 0,
+        inplace: bool = False,
+    ) -> DataFrame | None:
+        """
+        Query the columns of a DataFrame with a boolean expression.
+
+        .. warning::
+
+            This method can run arbitrary code which can make you vulnerable to code
+            injection if you pass user input to this function.
+
+        Parameters
+        ----------
+        expr : str
+            The query string to evaluate.
+
+            See the documentation for :func:`eval` for details of
+            supported operations and functions in the query string.
+
+            See the documentation for :meth:`DataFrame.eval` for details on
+            referring to column names and variables in the query string.
+        parser : {'pandas', 'python'}, default 'pandas'
+            The parser to use to construct the syntax tree from the expression. The
+            default of ``'pandas'`` parses code slightly different than standard
+            Python. Alternatively, you can parse an expression using the
+            ``'python'`` parser to retain strict Python semantics.  See the
+            :ref:`enhancing performance <enhancingperf.eval>` documentation for
+            more details.
+        engine : {'python', 'numexpr'}, default 'numexpr'
+
+            The engine used to evaluate the expression. Supported engines are
+
+            - None : tries to use ``numexpr``, falls back to ``python``
+            - ``'numexpr'`` : This default engine evaluates pandas objects using
+              numexpr for large speed ups in complex expressions with large frames.
+            - ``'python'`` : Performs operations as if you had ``eval``'d in top
+              level python. This engine is generally not that useful.
+
+            More backends may be available in the future.
+        local_dict : dict or None, optional
+            A dictionary of local variables, taken from locals() by default.
+        global_dict : dict or None, optional
+            A dictionary of global variables, taken from globals() by default.
+        resolvers : list of dict-like or None, optional
+            A list of objects implementing the ``__getitem__`` special method that
+            you can use to inject an additional collection of namespaces to use for
+            variable lookup. For example, this is used in the
+            :meth:`~DataFrame.query` method to inject the
+            ``DataFrame.index`` and ``DataFrame.columns``
+            variables that refer to their respective :class:`~pandas.DataFrame`
+            instance attributes.
+        level : int, optional
+            The number of prior stack frames to traverse and add to the current
+            scope. Most users will **not** need to change this parameter.
+        inplace : bool
+            Whether to modify the DataFrame rather than creating a new one.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame resulting from the provided query expression or
+            None if ``inplace=True``.
+
+        See Also
+        --------
+        eval : Evaluate a string describing operations on
+            DataFrame columns.
+        DataFrame.eval : Evaluate a string describing operations on
+            DataFrame columns.
+
+        Notes
+        -----
+        The result of the evaluation of this expression is first passed to
+        :attr:`DataFrame.loc` and if that fails because of a
+        multidimensional key (e.g., a DataFrame) then the result will be passed
+        to :meth:`DataFrame.__getitem__`.
+
+        This method uses the top-level :func:`eval` function to
+        evaluate the passed query.
+
+        The :meth:`~pandas.DataFrame.query` method uses a slightly
+        modified Python syntax by default. For example, the ``&`` and ``|``
+        (bitwise) operators have the precedence of their boolean cousins,
+        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
+        however the semantics are different.
+
+        You can change the semantics of the expression by passing the keyword
+        argument ``parser='python'``. This enforces the same semantics as
+        evaluation in Python space. Likewise, you can pass ``engine='python'``
+        to evaluate an expression using Python itself as a backend. This is not
+        recommended as it is inefficient compared to using ``numexpr`` as the
+        engine.
+
+        The :attr:`DataFrame.index` and
+        :attr:`DataFrame.columns` attributes of the
+        :class:`~pandas.DataFrame` instance are placed in the query namespace
+        by default, which allows you to treat both the index and columns of the
+        frame as a column in the frame.
+        The identifier ``index`` is used for the frame index; you can also
+        use the name of the index to identify it in a query. Please note that
+        Python keywords may not be used as identifiers.
+
+        For further details and examples see the ``query`` documentation in
+        :ref:`indexing <indexing.query>`.
+
+        *Backtick quoted variables*
+
+        Backtick quoted variables are parsed as literal Python code and
+        are converted internally to a Python valid identifier.
+        This can lead to the following problems.
+
+        During parsing a number of disallowed characters inside the backtick
+        quoted string are replaced by strings that are allowed as a Python identifier.
+        These characters include all operators in Python, the space character, the
+        question mark, the exclamation mark, the dollar sign, and the euro sign.
+
+        A backtick can be escaped by double backticks.
+
+        See also the `Python documentation about lexical analysis
+        <https://docs.python.org/3/reference/lexical_analysis.html>`__
+        in combination with the source code in :mod:`pandas.core.computation.parsing`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)}
+        ... )
+        >>> df
+           A   B  C&C
+        0  1  10   10
+        1  2   8    9
+        2  3   6    8
+        3  4   4    7
+        4  5   2    6
+        >>> df.query("A > B")
+           A  B  C&C
+        4  5  2    6
+
+        The previous expression is equivalent to
+
+        >>> df[df.A > df.B]
+           A  B  C&C
+        4  5  2    6
+
+        For columns with spaces in their name, you can use backtick quoting.
+
+        >>> df.query("B == `C&C`")
+           A   B  C&C
+        0  1  10   10
+
+        The previous expression is equivalent to
+
+        >>> df[df.B == df["C&C"]]
+           A   B  C&C
+        0  1  10   10
+
+        Using local variable:
+
+        >>> local_var = 2
+        >>> df.query("A <= @local_var")
+        A   B  C&C
+        0  1  10   10
+        1  2   8    9
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if not isinstance(expr, str):
+            msg = f"expr must be a string to be evaluated, {type(expr)} given"
+            raise ValueError(msg)
+
+        res = self.eval(
+            expr,
+            level=level + 1,
+            parser=parser,
+            target=None,
+            engine=engine,
+            local_dict=local_dict,
+            global_dict=global_dict,
+            resolvers=resolvers or (),
+        )
+
+        try:
+            result = self.loc[res]
+        except ValueError:
+            # when res is multi-dimensional loc raises, but this is sometimes a
+            # valid query
+            result = self[res]
+
+        if inplace:
+            self._update_inplace(result)
+            return None
+        else:
+            return result
+
+    @overload
+    def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: ...
+
+    @overload
+    def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ...
+
+    def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
+        """
+        Evaluate a string describing operations on DataFrame columns.
+
+        .. warning::
+
+            This method can run arbitrary code which can make you vulnerable to code
+            injection if you pass user input to this function.
+
+        Operates on columns only, not specific rows or elements.  This allows
+        `eval` to run arbitrary code, which can make you vulnerable to code
+        injection if you pass user input to this function.
+
+        Parameters
+        ----------
+        expr : str
+            The expression string to evaluate.
+
+            You can refer to variables
+            in the environment by prefixing them with an '@' character like
+            ``@a + b``.
+
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuation (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "if", "for", "import", etc) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
+
+            See the documentation for :func:`eval` for full details of
+            supported operations and functions in the expression string.
+        inplace : bool, default False
+            If the expression contains an assignment, whether to perform the
+            operation inplace and mutate the existing DataFrame. Otherwise,
+            a new DataFrame is returned.
+        **kwargs
+            See the documentation for :func:`eval` for complete details
+            on the keyword arguments accepted by
+            :meth:`~pandas.DataFrame.eval`.
+
+        Returns
+        -------
+        ndarray, scalar, pandas object, or None
+            The result of the evaluation or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.query : Evaluates a boolean expression to query the columns
+            of a frame.
+        DataFrame.assign : Can evaluate an expression or function to create new
+            values for a column.
+        eval : Evaluate a Python expression as a string using various
+            backends.
+
+        Notes
+        -----
+        For more details see the API documentation for :func:`~eval`.
+        For detailed examples see :ref:`enhancing performance with eval
+        <enhancingperf.eval>`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)}
+        ... )
+        >>> df
+           A   B  C&C
+        0  1  10   10
+        1  2   8    9
+        2  3   6    8
+        3  4   4    7
+        4  5   2    6
+        >>> df.eval("A + B")
+        0    11
+        1    10
+        2     9
+        3     8
+        4     7
+        dtype: int64
+
+        Assignment is allowed though by default the original DataFrame is not
+        modified.
+
+        >>> df.eval("D = A + B")
+           A   B  C&C   D
+        0  1  10   10  11
+        1  2   8    9  10
+        2  3   6    8   9
+        3  4   4    7   8
+        4  5   2    6   7
+        >>> df
+           A   B  C&C
+        0  1  10   10
+        1  2   8    9
+        2  3   6    8
+        3  4   4    7
+        4  5   2    6
+
+        Multiple columns can be assigned to using multi-line expressions:
+
+        >>> df.eval(
+        ...     '''
+        ... D = A + B
+        ... E = A - B
+        ... '''
+        ... )
+           A   B  C&C   D  E
+        0  1  10   10  11 -9
+        1  2   8    9  10 -6
+        2  3   6    8   9 -3
+        3  4   4    7   8  0
+        4  5   2    6   7  3
+
+        For columns with spaces or other disallowed characters in their name, you can
+        use backtick quoting.
+
+        >>> df.eval("B * `C&C`")
+        0    100
+        1     72
+        2     48
+        3     28
+        4     12
+        dtype: int64
+
+        Local variables shall be explicitly referenced using ``@``
+        character in front of the name:
+
+        >>> local_var = 2
+        >>> df.eval("@local_var * A")
+        0     2
+        1     4
+        2     6
+        3     8
+        4    10
+        Name: A, dtype: int64
+        """
+        from pandas.core.computation.eval import eval as _eval
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        kwargs["level"] = kwargs.pop("level", 0) + 1
+        index_resolvers = self._get_index_resolvers()
+        column_resolvers = self._get_cleaned_column_resolvers()
+        resolvers = column_resolvers, index_resolvers
+        if "target" not in kwargs:
+            kwargs["target"] = self
+        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
+
+        return _eval(expr, inplace=inplace, **kwargs)
+
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        """
+        Return a subset of the DataFrame's columns based on the column dtypes.
+
+        This method allows for filtering columns based on their data types.
+        It is useful when working with heterogeneous DataFrames where operations
+        need to be performed on a specific subset of data types.
+
+        Parameters
+        ----------
+        include, exclude : scalar or list-like
+            A selection of dtypes or strings to be included/excluded. At least
+            one of these parameters must be supplied.
+
+        Returns
+        -------
+        DataFrame
+            The subset of the frame including the dtypes in ``include`` and
+            excluding the dtypes in ``exclude``.
+
+        Raises
+        ------
+        ValueError
+            * If both of ``include`` and ``exclude`` are empty
+            * If ``include`` and ``exclude`` have overlapping elements
+        TypeError
+            * If any kind of string dtype is passed in.
+
+        See Also
+        --------
+        DataFrame.dtypes: Return Series with the data type of each column.
+
+        Notes
+        -----
+        * To select all *numeric* types, use ``np.number`` or ``'number'``
+        * To select strings you must use the ``object`` dtype, but note that
+          this will return *all* object dtype columns. With
+          ``pd.options.future.infer_string`` enabled, using ``"str"`` will
+          work to select all string columns.
+        * See the `numpy dtype hierarchy
+          <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
+        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
+          ``'datetime64'``
+        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
+          ``'timedelta64'``
+        * To select Pandas categorical dtypes, use ``'category'``
+        * To select Pandas datetimetz dtypes, use ``'datetimetz'``
+          or ``'datetime64[ns, tz]'``
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"a": [1, 2] * 3, "b": [True, False] * 3, "c": [1.0, 2.0] * 3}
+        ... )
+        >>> df
+                a      b  c
+        0       1   True  1.0
+        1       2  False  2.0
+        2       1   True  1.0
+        3       2  False  2.0
+        4       1   True  1.0
+        5       2  False  2.0
+
+        >>> df.select_dtypes(include="bool")
+           b
+        0  True
+        1  False
+        2  True
+        3  False
+        4  True
+        5  False
+
+        >>> df.select_dtypes(include=["float64"])
+           c
+        0  1.0
+        1  2.0
+        2  1.0
+        3  2.0
+        4  1.0
+        5  2.0
+
+        >>> df.select_dtypes(exclude=["int64"])
+               b    c
+        0   True  1.0
+        1  False  2.0
+        2   True  1.0
+        3  False  2.0
+        4   True  1.0
+        5  False  2.0
+        """
+        if not is_list_like(include):
+            include = (include,) if include is not None else ()
+        if not is_list_like(exclude):
+            exclude = (exclude,) if exclude is not None else ()
+
+        selection = (frozenset(include), frozenset(exclude))
+
+        if not any(selection):
+            raise ValueError("at least one of include or exclude must be nonempty")
+
+        # convert the myriad valid dtypes object to a single representation
+        def check_int_infer_dtype(dtypes):
+            converted_dtypes: list[type] = []
+            for dtype in dtypes:
+                # Numpy maps int to different types (int32, in64) on Windows and Linux
+                # see https://github.com/numpy/numpy/issues/9464
+                if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
+                    converted_dtypes.append(np.int32)
+                    converted_dtypes.append(np.int64)
+                elif dtype == "float" or dtype is float:
+                    # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
+                    converted_dtypes.extend([np.float64, np.float32])
+                else:
+                    converted_dtypes.append(infer_dtype_from_object(dtype))
+            return frozenset(converted_dtypes)
+
+        include = check_int_infer_dtype(include)
+        exclude = check_int_infer_dtype(exclude)
+
+        for dtypes in (include, exclude):
+            invalidate_string_dtypes(dtypes)
+
+        # can't both include AND exclude!
+        if not include.isdisjoint(exclude):
+            raise ValueError(f"include and exclude overlap on {(include & exclude)}")
+
+        def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
+            # GH 46870: BooleanDtype._is_numeric == True but should be excluded
+            dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
+            return (
+                issubclass(dtype.type, tuple(dtypes_set))
+                or (
+                    np.number in dtypes_set
+                    and getattr(dtype, "_is_numeric", False)
+                    and not is_bool_dtype(dtype)
+                )
+                # backwards compat for the default `str` dtype being selected by object
+                or (
+                    isinstance(dtype, StringDtype)
+                    and dtype.na_value is np.nan
+                    and np.object_ in dtypes_set
+                )
+            )
+
+        def predicate(arr: ArrayLike) -> bool:
+            dtype = arr.dtype
+            if include:
+                if not dtype_predicate(dtype, include):
+                    return False
+
+            if exclude:
+                if dtype_predicate(dtype, exclude):
+                    return False
+
+            return True
+
+        blk_dtypes = [blk.dtype for blk in self._mgr.blocks]
+        if (
+            np.object_ in include
+            and str not in include
+            and str not in exclude
+            and any(
+                isinstance(dtype, StringDtype) and dtype.na_value is np.nan
+                for dtype in blk_dtypes
+            )
+        ):
+            # GH#61916
+            warnings.warn(
+                "For backward compatibility, 'str' dtypes are included by "
+                "select_dtypes when 'object' dtype is specified. "
+                "This behavior is deprecated and will be removed in a future "
+                "version. Explicitly pass 'str' to `include` to select them, "
+                "or to `exclude` to remove them and silence this warning.\nSee "
+                "https://pandas.pydata.org/docs/user_guide/migration-3-strings.html"
+                "#string-migration-select-dtypes for details on how to write code "
+                "that works with pandas 2 and 3.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
+        mgr = self._mgr._get_data_subset(predicate).copy(deep=False)
+        return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self)
+
+    def insert(
+        self,
+        loc: int,
+        column: Hashable,
+        value: object,
+        allow_duplicates: bool | lib.NoDefault = lib.no_default,
+    ) -> None:
+        """
+        Insert column into DataFrame at specified location.
+
+        Raises a ValueError if `column` is already contained in the DataFrame,
+        unless `allow_duplicates` is set to True.
+
+        Parameters
+        ----------
+        loc : int
+            Insertion index. Must verify 0 <= loc <= len(columns).
+        column : str, number, or hashable object
+            Label of the inserted column.
+        value : Scalar, Series, or array-like
+            Content of the inserted column.
+        allow_duplicates : bool, optional, default lib.no_default
+            Allow duplicate column labels to be created.
+
+        See Also
+        --------
+        Index.insert : Insert new item by index.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df
+           col1  col2
+        0     1     3
+        1     2     4
+        >>> df.insert(1, "newcol", [99, 99])
+        >>> df
+           col1  newcol  col2
+        0     1      99     3
+        1     2      99     4
+        >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
+        >>> df
+           col1  col1  newcol  col2
+        0   100     1      99     3
+        1   100     2      99     4
+
+        Notice that pandas uses index alignment in case of `value` from type `Series`:
+
+        >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
+        >>> df
+           col0  col1  col1  newcol  col2
+        0   NaN   100     1      99     3
+        1   5.0   100     2      99     4
+        """
+        if allow_duplicates is lib.no_default:
+            allow_duplicates = False
+        if allow_duplicates and not self.flags.allows_duplicate_labels:
+            raise ValueError(
+                "Cannot specify 'allow_duplicates=True' when "
+                "'self.flags.allows_duplicate_labels' is False."
+            )
+        if not allow_duplicates and column in self.columns:
+            # Should this be a different kind of error??
+            raise ValueError(f"cannot insert {column}, already exists")
+        if not is_integer(loc):
+            raise TypeError("loc must be int")
+        # convert non stdlib ints to satisfy typing checks
+        loc = int(loc)
+        if isinstance(value, DataFrame) and len(value.columns) > 1:
+            raise ValueError(
+                f"Expected a one-dimensional object, got a DataFrame with "
+                f"{len(value.columns)} columns instead."
+            )
+        elif isinstance(value, DataFrame):
+            value = value.iloc[:, 0]
+
+        value, refs = self._sanitize_column(value)
+        self._mgr.insert(loc, column, value, refs=refs)
+
+    def assign(self, **kwargs) -> DataFrame:
+        r"""
+        Assign new columns to a DataFrame.
+
+        Returns a new object with all original columns in addition to new ones.
+        Existing columns that are re-assigned will be overwritten.
+
+        Parameters
+        ----------
+        **kwargs : callable or Series
+            The column names are keywords. If the values are
+            callable, they are computed on the DataFrame and
+            assigned to the new columns. The callable must not
+            change input DataFrame (though pandas doesn't check it).
+            If the values are not callable, (e.g. a Series, scalar, or array),
+            they are simply assigned.
+
+        Returns
+        -------
+        DataFrame
+            A new DataFrame with the new columns in addition to
+            all the existing columns.
+
+        See Also
+        --------
+        DataFrame.loc : Select a subset of a DataFrame by labels.
+        DataFrame.iloc : Select a subset of a DataFrame by positions.
+
+        Notes
+        -----
+        Assigning multiple columns within the same ``assign`` is possible.
+        Later items in '\*\*kwargs' may refer to newly created or modified
+        columns in 'df'; items are computed and assigned into 'df' in order.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"])
+        >>> df
+                  temp_c
+        Portland    17.0
+        Berkeley    25.0
+
+        Where the value is a callable, evaluated on `df`:
+
+        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
+                  temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        Alternatively, the same behavior can be achieved by directly
+        referencing an existing Series or sequence:
+
+        >>> df.assign(temp_f=df["temp_c"] * 9 / 5 + 32)
+                  temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        or by using :meth:`pandas.col`:
+
+        >>> df.assign(temp_f=pd.col("temp_c") * 9 / 5 + 32)
+                  temp_c  temp_f
+        Portland    17.0    62.6
+        Berkeley    25.0    77.0
+
+        You can create multiple columns within the same assign where one
+        of the columns depends on another one defined within the same assign:
+
+        >>> df.assign(
+        ...     temp_f=lambda x: x["temp_c"] * 9 / 5 + 32,
+        ...     temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9,
+        ... )
+                  temp_c  temp_f  temp_k
+        Portland    17.0    62.6  290.15
+        Berkeley    25.0    77.0  298.15
+        """
+        data = self.copy(deep=False)
+
+        for k, v in kwargs.items():
+            data[k] = com.apply_if_callable(v, data)
+        return data
+
+    def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
+        """
+        Ensures new columns (which go into the BlockManager as new blocks) are
+        always copied (or a reference is being tracked to them under CoW)
+        and converted into an array.
+
+        Parameters
+        ----------
+        value : scalar, Series, or array-like
+
+        Returns
+        -------
+        tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
+        """
+        self._ensure_valid_index(value)
+
+        # Using a DataFrame would mean coercing values to one dtype
+        assert not isinstance(value, DataFrame)
+        if is_dict_like(value):
+            if not isinstance(value, Series):
+                value = Series(value)
+            return _reindex_for_setitem(value, self.index)
+
+        if is_list_like(value):
+            com.require_length_match(value, self.index)
+        return sanitize_array(value, self.index, copy=True, allow_2d=True), None
+
+    @property
+    def _series(self):
+        return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}
+
+    # ----------------------------------------------------------------------
+    # Reindexing and alignment
+
+    def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame:
+        """
+        We are guaranteed non-Nones in the axes.
+        """
+
+        new_index, row_indexer = self.index.reindex(axes["index"])
+        new_columns, col_indexer = self.columns.reindex(axes["columns"])
+
+        if row_indexer is not None and col_indexer is not None:
+            # Fastpath. By doing two 'take's at once we avoid making an
+            #  unnecessary copy.
+            # We only get here with `self._can_fast_transpose`, which (almost)
+            #  ensures that self.values is cheap. It may be worth making this
+            #  condition more specific.
+            indexer = row_indexer, col_indexer
+            new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
+            return self._constructor(
+                new_values, index=new_index, columns=new_columns, copy=False
+            )
+        else:
+            return self._reindex_with_indexers(
+                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
+                fill_value=fill_value,
+            )
+
+    def set_axis(
+        self,
+        labels,
+        *,
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> DataFrame:
+        """
+        Assign desired index to given axis.
+
+        Indexes for column or row labels can be changed by assigning
+        a list-like or Index.
+
+        Parameters
+        ----------
+        labels : list-like, Index
+            The values for the new index.
+
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to update. The value 0 identifies the rows. For `Series`
+            this parameter is unused and defaults to 0.
+
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        DataFrame
+            An object of type DataFrame.
+
+        See Also
+        --------
+        DataFrame.rename_axis : Alter the name of the index or columns.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+
+        Change the row labels.
+
+        >>> df.set_axis(["a", "b", "c"], axis="index")
+            A  B
+        a  1  4
+        b  2  5
+        c  3  6
+
+        Change the column labels.
+
+        >>> df.set_axis(["I", "II"], axis="columns")
+            I  II
+        0  1   4
+        1  2   5
+        2  3   6
+        """
+        return super().set_axis(labels, axis=axis, copy=copy)
+
+    def reindex(
+        self,
+        labels=None,
+        *,
+        index=None,
+        columns=None,
+        axis: Axis | None = None,
+        method: ReindexMethod | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        level: Level | None = None,
+        fill_value: Scalar | None = np.nan,
+        limit: int | None = None,
+        tolerance=None,
+    ) -> DataFrame:
+        """
+        Conform DataFrame to new index with optional filling logic.
+
+        Places NA/NaN in locations having no value in the previous index. A new object
+        is produced unless the new index is equivalent to the current one and
+        ``copy=False``.
+
+        Parameters
+        ----------
+
+        labels : array-like, optional
+            New labels / index to conform the axis specified by 'axis' to.
+        index : array-like, optional
+            New labels for the index. Preferably an Index object to avoid
+            duplicating data.
+        columns : array-like, optional
+            New labels for the columns. Preferably an Index object to avoid
+            duplicating data.
+        axis : int or str, optional
+            Axis to target. Can be either the axis name ('index', 'columns')
+            or number (0, 1).
+        method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
+            Method to use for filling holes in reindexed DataFrame.
+            Please note: this is only applicable to DataFrames/Series with a
+            monotonically increasing/decreasing index.
+
+            * None (default): don't fill gaps
+            * pad / ffill: Propagate last valid observation forward to next
+              valid.
+            * backfill / bfill: Use next valid observation to fill gap.
+            * nearest: Use nearest valid observations to fill gap.
+
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : scalar, default np.nan
+            Value to use for missing values. Defaults to NaN, but can be any
+            "compatible" value.
+        limit : int, default None
+            Maximum number of consecutive elements to forward or backward fill.
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations most
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. List-like includes list, tuple, array, Series, and must be
+            the same size as the index and its dtype must exactly match the
+            index's type.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with changed index.
+
+        See Also
+        --------
+        DataFrame.set_index : Set row labels.
+        DataFrame.reset_index : Remove row labels or move them to new columns.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+        Examples
+        --------
+        ``DataFrame.reindex`` supports two calling conventions
+
+        * ``(index=index_labels, columns=column_labels, ...)``
+        * ``(labels, axis={'index', 'columns'}, ...)``
+
+        We *highly* recommend using keyword arguments to clarify your
+        intent.
+
+        Create a DataFrame with some fictional data.
+
+        >>> index = ["Firefox", "Chrome", "Safari", "IE10", "Konqueror"]
+        >>> columns = ["http_status", "response_time"]
+        >>> df = pd.DataFrame(
+        ...     [[200, 0.04], [200, 0.02], [404, 0.07], [404, 0.08], [301, 1.0]],
+        ...     columns=columns,
+        ...     index=index,
+        ... )
+        >>> df
+                   http_status  response_time
+        Firefox            200           0.04
+        Chrome             200           0.02
+        Safari             404           0.07
+        IE10               404           0.08
+        Konqueror          301           1.00
+
+        Create a new index and reindex the DataFrame. By default
+        values in the new index that do not have corresponding
+        records in the DataFrame are assigned ``NaN``.
+
+        >>> new_index = ["Safari", "Iceweasel", "Comodo Dragon", "IE10", "Chrome"]
+        >>> df.reindex(new_index)
+                       http_status  response_time
+        Safari               404.0           0.07
+        Iceweasel              NaN            NaN
+        Comodo Dragon          NaN            NaN
+        IE10                 404.0           0.08
+        Chrome               200.0           0.02
+
+        We can fill in the missing values by passing a value to
+        the keyword ``fill_value``. Because the index is not monotonically
+        increasing or decreasing, we cannot use arguments to the keyword
+        ``method`` to fill the ``NaN`` values.
+
+        >>> df.reindex(new_index, fill_value=0)
+                       http_status  response_time
+        Safari                 404           0.07
+        Iceweasel                0           0.00
+        Comodo Dragon            0           0.00
+        IE10                   404           0.08
+        Chrome                 200           0.02
+
+        >>> df.reindex(new_index, fill_value="missing")
+                      http_status response_time
+        Safari                404          0.07
+        Iceweasel         missing       missing
+        Comodo Dragon     missing       missing
+        IE10                  404          0.08
+        Chrome                200          0.02
+
+        We can also reindex the columns.
+
+        >>> df.reindex(columns=["http_status", "user_agent"])
+                   http_status  user_agent
+        Firefox            200         NaN
+        Chrome             200         NaN
+        Safari             404         NaN
+        IE10               404         NaN
+        Konqueror          301         NaN
+
+        Or we can use "axis-style" keyword arguments
+
+        >>> df.reindex(["http_status", "user_agent"], axis="columns")
+                   http_status  user_agent
+        Firefox            200         NaN
+        Chrome             200         NaN
+        Safari             404         NaN
+        IE10               404         NaN
+        Konqueror          301         NaN
+
+        To further illustrate the filling functionality in
+        ``reindex``, we will create a DataFrame with a
+        monotonically increasing index (for example, a sequence
+        of dates).
+
+        >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D")
+        >>> df2 = pd.DataFrame(
+        ...     {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index
+        ... )
+        >>> df2
+                    prices
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+
+        Suppose we decide to expand the DataFrame to cover a wider
+        date range.
+
+        >>> date_index2 = pd.date_range("12/29/2009", periods=10, freq="D")
+        >>> df2.reindex(date_index2)
+                    prices
+        2009-12-29     NaN
+        2009-12-30     NaN
+        2009-12-31     NaN
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+        2010-01-07     NaN
+
+        The index entries that did not have a value in the original data frame
+        (for example, '2009-12-29') are by default filled with ``NaN``.
+        If desired, we can fill in the missing values using one of several
+        options.
+
+        For example, to back-propagate the last valid value to fill the ``NaN``
+        values, pass ``bfill`` as an argument to the ``method`` keyword.
+
+        >>> df2.reindex(date_index2, method="bfill")
+                    prices
+        2009-12-29   100.0
+        2009-12-30   100.0
+        2009-12-31   100.0
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+        2010-01-07     NaN
+
+        Please note that the ``NaN`` value present in the original DataFrame
+        (at index value 2010-01-03) will not be filled by any of the
+        value propagation schemes. This is because filling while reindexing
+        does not look at DataFrame values, but only compares the original and
+        desired indexes. If you do want to fill in the ``NaN`` values present
+        in the original DataFrame, use the ``fillna()`` method.
+
+        See the :ref:`user guide <basics.reindexing>` for more.
+        """
+        return super().reindex(
+            labels=labels,
+            index=index,
+            columns=columns,
+            axis=axis,
+            method=method,
+            level=level,
+            fill_value=fill_value,
+            limit=limit,
+            tolerance=tolerance,
+            copy=copy,
+        )
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level = ...,
+        inplace: Literal[True],
+        errors: IgnoreRaise = ...,
+    ) -> None: ...
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level = ...,
+        inplace: Literal[False] = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level = ...,
+        inplace: bool = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame | None: ...
+
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = None,
+        *,
+        axis: Axis = 0,
+        index: IndexLabel | ListLike = None,
+        columns: IndexLabel | ListLike = None,
+        level: Level | None = None,
+        inplace: bool = False,
+        errors: IgnoreRaise = "raise",
+    ) -> DataFrame | None:
+        """
+        Drop specified labels from rows or columns.
+
+        Remove rows or columns by specifying label names and corresponding
+        axis, or by directly specifying index or column names. When using a
+        multi-index, labels on different levels can be removed by specifying
+        the level. See the :ref:`user guide <advanced.shown_levels>`
+        for more information about the now unused levels.
+
+        Parameters
+        ----------
+        labels : single label or iterable of labels
+            Index or column labels to drop. A tuple will be used as a single
+            label and not treated as an iterable.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Whether to drop labels from the index (0 or 'index') or
+            columns (1 or 'columns').
+        index : single label or iterable of labels
+            Alternative to specifying axis (``labels, axis=0``
+            is equivalent to ``index=labels``).
+        columns : single label or iterable of labels
+            Alternative to specifying axis (``labels, axis=1``
+            is equivalent to ``columns=labels``).
+        level : int or level name, optional
+            For MultiIndex, level from which the labels will be removed.
+        inplace : bool, default False
+            If False, return a copy. Otherwise, do operation
+            in place and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress error and only existing labels are
+            dropped.
+
+        Returns
+        -------
+        DataFrame or None
+            Returns DataFrame or None DataFrame with the specified
+            index or column labels removed or None if inplace=True.
+
+        Raises
+        ------
+        KeyError
+            If any of the labels is not found in the selected axis.
+
+        See Also
+        --------
+        DataFrame.loc : Label-location based indexer for selection by label.
+        DataFrame.dropna : Return DataFrame with labels on given axis omitted
+            where (all or any) data are missing.
+        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
+            removed, optionally only considering certain columns.
+        Series.drop : Return Series with specified index labels removed.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"])
+        >>> df
+           A  B   C   D
+        0  0  1   2   3
+        1  4  5   6   7
+        2  8  9  10  11
+
+        Drop columns
+
+        >>> df.drop(["B", "C"], axis=1)
+           A   D
+        0  0   3
+        1  4   7
+        2  8  11
+
+        >>> df.drop(columns=["B", "C"])
+           A   D
+        0  0   3
+        1  4   7
+        2  8  11
+
+        Drop a row by index
+
+        >>> df.drop([0, 1])
+           A  B   C   D
+        2  8  9  10  11
+
+        Drop columns and/or rows of MultiIndex DataFrame
+
+        >>> midx = pd.MultiIndex(
+        ...     levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]],
+        ...     codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
+        ... )
+        >>> df = pd.DataFrame(
+        ...     index=midx,
+        ...     columns=["big", "small"],
+        ...     data=[
+        ...         [45, 30],
+        ...         [200, 100],
+        ...         [1.5, 1],
+        ...         [30, 20],
+        ...         [250, 150],
+        ...         [1.5, 0.8],
+        ...         [320, 250],
+        ...         [1, 0.8],
+        ...         [0.3, 0.2],
+        ...     ],
+        ... )
+        >>> df
+                        big     small
+        llama   speed   45.0    30.0
+                weight  200.0   100.0
+                length  1.5     1.0
+        cow     speed   30.0    20.0
+                weight  250.0   150.0
+                length  1.5     0.8
+        falcon  speed   320.0   250.0
+                weight  1.0     0.8
+                length  0.3     0.2
+
+        Drop a specific index combination from the MultiIndex
+        DataFrame, i.e., drop the combination ``'falcon'`` and
+        ``'weight'``, which deletes only the corresponding row
+
+        >>> df.drop(index=("falcon", "weight"))
+                        big     small
+        llama   speed   45.0    30.0
+                weight  200.0   100.0
+                length  1.5     1.0
+        cow     speed   30.0    20.0
+                weight  250.0   150.0
+                length  1.5     0.8
+        falcon  speed   320.0   250.0
+                length  0.3     0.2
+
+        >>> df.drop(index="cow", columns="small")
+                        big
+        llama   speed   45.0
+                weight  200.0
+                length  1.5
+        falcon  speed   320.0
+                weight  1.0
+                length  0.3
+
+        >>> df.drop(index="length", level=1)
+                        big     small
+        llama   speed   45.0    30.0
+                weight  200.0   100.0
+        cow     speed   30.0    20.0
+                weight  250.0   150.0
+        falcon  speed   320.0   250.0
+                weight  1.0     0.8
+        """
+        return super().drop(
+            labels=labels,
+            axis=axis,
+            index=index,
+            columns=columns,
+            level=level,
+            inplace=inplace,
+            errors=errors,
+        )
+
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: Literal[True],
+        level: Level = ...,
+        errors: IgnoreRaise = ...,
+    ) -> None: ...
+
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: Literal[False] = ...,
+        level: Level = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: bool = ...,
+        level: Level = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame | None: ...
+
+    def rename(
+        self,
+        mapper: Renamer | None = None,
+        *,
+        index: Renamer | None = None,
+        columns: Renamer | None = None,
+        axis: Axis | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: bool = False,
+        level: Level | None = None,
+        errors: IgnoreRaise = "ignore",
+    ) -> DataFrame | None:
+        """
+        Rename columns or index labels.
+
+        Function / dict values must be unique (1-to-1). Labels not contained in
+        a dict / Series will be left as-is. Extra labels listed don't throw an
+        error.
+
+        See the :ref:`user guide <basics.rename>` for more.
+
+        Parameters
+        ----------
+        mapper : dict-like or function
+            Dict-like or function transformations to apply to
+            that axis' values. Use either ``mapper`` and ``axis`` to
+            specify the axis to target with ``mapper``, or ``index`` and
+            ``columns``.
+        index : dict-like or function
+            Alternative to specifying axis (``mapper, axis=0``
+            is equivalent to ``index=mapper``).
+        columns : dict-like or function
+            Alternative to specifying axis (``mapper, axis=1``
+            is equivalent to ``columns=mapper``).
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Axis to target with ``mapper``. Can be either the axis name
+            ('index', 'columns') or number (0, 1). The default is 'index'.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+            If True then value of copy is ignored.
+        level : int or level name, default None
+            In case of a MultiIndex, only rename labels in the specified
+            level.
+        errors : {'ignore', 'raise'}, default 'ignore'
+            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
+            or `columns` contains labels that are not present in the Index
+            being transformed.
+            If 'ignore', existing keys will be renamed and extra keys will be
+            ignored.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with the renamed axis labels or None if ``inplace=True``.
+
+        Raises
+        ------
+        KeyError
+            If any of the labels is not found in the selected axis and
+            "errors='raise'".
+
+        See Also
+        --------
+        DataFrame.rename_axis : Set the name of the axis.
+
+        Examples
+        --------
+        ``DataFrame.rename`` supports two calling conventions
+
+        * ``(index=index_mapper, columns=columns_mapper, ...)``
+        * ``(mapper, axis={'index', 'columns'}, ...)``
+
+        We *highly* recommend using keyword arguments to clarify your
+        intent.
+
+        Rename columns using a mapping:
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        >>> df.rename(columns={"A": "a", "B": "c"})
+           a  c
+        0  1  4
+        1  2  5
+        2  3  6
+
+        Rename index using a mapping:
+
+        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
+           A  B
+        x  1  4
+        y  2  5
+        z  3  6
+
+        Cast index labels to a different type:
+
+        >>> df.index
+        RangeIndex(start=0, stop=3, step=1)
+        >>> df.rename(index=str).index
+        Index(['0', '1', '2'], dtype='str')
+
+        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
+        Traceback (most recent call last):
+        KeyError: ['C'] not found in axis
+
+        Using axis-style parameters:
+
+        >>> df.rename(str.lower, axis="columns")
+           a  b
+        0  1  4
+        1  2  5
+        2  3  6
+
+        >>> df.rename({1: 2, 2: 4}, axis="index")
+           A  B
+        0  1  4
+        2  2  5
+        4  3  6
+        """
+        self._check_copy_deprecation(copy)
+        return super()._rename(
+            mapper=mapper,
+            index=index,
+            columns=columns,
+            axis=axis,
+            inplace=inplace,
+            level=level,
+            errors=errors,
+        )
+
+    def pop(self, item: Hashable) -> Series:
+        """
+        Return item and drop it from DataFrame. Raise KeyError if not found.
+
+        Parameters
+        ----------
+        item : label
+            Label of column to be popped.
+
+        Returns
+        -------
+        Series
+            Series representing the item that is dropped.
+
+        See Also
+        --------
+        DataFrame.drop: Drop specified labels from rows or columns.
+        DataFrame.drop_duplicates: Return DataFrame with duplicate rows removed.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         ("falcon", "bird", 389.0),
+        ...         ("parrot", "bird", 24.0),
+        ...         ("lion", "mammal", 80.5),
+        ...         ("monkey", "mammal", np.nan),
+        ...     ],
+        ...     columns=("name", "class", "max_speed"),
+        ... )
+        >>> df
+             name   class  max_speed
+        0  falcon    bird      389.0
+        1  parrot    bird       24.0
+        2    lion  mammal       80.5
+        3  monkey  mammal        NaN
+
+        >>> df.pop("class")
+        0      bird
+        1      bird
+        2    mammal
+        3    mammal
+        Name: class, dtype: str
+
+        >>> df
+             name  max_speed
+        0  falcon      389.0
+        1  parrot       24.0
+        2    lion       80.5
+        3  monkey        NaN
+        """
+        return super().pop(item=item)
+
+    def _replace_columnwise(
+        self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
+    ) -> Self:
+        """
+        Dispatch to Series.replace column-wise.
+
+        Parameters
+        ----------
+        mapping : dict
+            of the form {col: (target, value)}
+        inplace : bool
+        regex : bool or same types as `to_replace` in DataFrame.replace
+
+        Returns
+        -------
+        DataFrame
+        """
+        # Operate column-wise
+        res = self if inplace else self.copy(deep=False)
+        ax = self.columns
+
+        for i, ax_value in enumerate(ax):
+            if ax_value in mapping:
+                ser = self.iloc[:, i]
+
+                target, value = mapping[ax_value]
+                newobj = ser.replace(target, value, regex=regex)
+
+                res._iset_item(i, newobj, inplace=inplace)
+
+        return res if inplace else res.__finalize__(self)
+
+    def shift(
+        self,
+        periods: int | Sequence[int] = 1,
+        freq: Frequency | None = None,
+        axis: Axis = 0,
+        fill_value: Hashable = lib.no_default,
+        suffix: str | None = None,
+    ) -> DataFrame:
+        """
+        Shift index by desired number of periods with an optional time `freq`.
+
+        When `freq` is not passed, shift the index without realigning the data.
+        If `freq` is passed (in this case, the index must be date or datetime,
+        or it will raise a `NotImplementedError`), the index will be
+        increased using the periods and the `freq`. `freq` can be inferred
+        when specified as "infer" as long as either freq or inferred_freq
+        attribute is set in the index.
+
+        Parameters
+        ----------
+        periods : int or Sequence
+            Number of periods to shift. Can be positive or negative.
+            If an iterable of ints, the data will be shifted once by each int.
+            This is equivalent to shifting by one value at a time and
+            concatenating all resulting frames. The resulting columns will have
+            the shift suffixed to their column names. For multiple periods,
+            axis must not be 1.
+        freq : DateOffset, tseries.offsets, timedelta, or str, optional
+            Offset to use from the tseries module or time rule (e.g. 'EOM').
+            If `freq` is specified then the index values are shifted but the
+            data is not realigned. That is, use `freq` if you would like to
+            extend the index when shifting and preserve the original data.
+            If `freq` is specified as "infer" then it will be inferred from
+            the freq or inferred_freq attributes of the index. If neither of
+            those attributes exist, a ValueError is thrown.
+        axis : {0 or 'index', 1 or 'columns', None}, default None
+            Shift direction. For `Series` this parameter is unused and defaults to 0.
+        fill_value : object, optional
+            The scalar value to use for newly introduced missing values.
+            the default depends on the dtype of `self`.
+            For Boolean and numeric NumPy data types, ``np.nan`` is used.
+            For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
+            For extension dtypes, ``self.dtype.na_value`` is used.
+        suffix : str, optional
+            If str and periods is an iterable, this is added after the column
+            name and before the shift value for each shifted column name.
+            For `Series` this parameter is unused and defaults to `None`.
+
+        Returns
+        -------
+        DataFrame
+            Copy of input object, shifted.
+
+        See Also
+        --------
+        Index.shift : Shift values of Index.
+        DatetimeIndex.shift : Shift values of DatetimeIndex.
+        PeriodIndex.shift : Shift values of PeriodIndex.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [[10, 13, 17], [20, 23, 27], [15, 18, 22], [30, 33, 37], [45, 48, 52]],
+        ...     columns=["Col1", "Col2", "Col3"],
+        ...     index=pd.date_range("2020-01-01", "2020-01-05"),
+        ... )
+        >>> df
+                    Col1  Col2  Col3
+        2020-01-01    10    13    17
+        2020-01-02    20    23    27
+        2020-01-03    15    18    22
+        2020-01-04    30    33    37
+        2020-01-05    45    48    52
+
+        >>> df.shift(periods=3)
+                    Col1  Col2  Col3
+        2020-01-01   NaN   NaN   NaN
+        2020-01-02   NaN   NaN   NaN
+        2020-01-03   NaN   NaN   NaN
+        2020-01-04  10.0  13.0  17.0
+        2020-01-05  20.0  23.0  27.0
+
+        >>> df.shift(periods=1, axis="columns")
+                    Col1  Col2  Col3
+        2020-01-01   NaN    10    13
+        2020-01-02   NaN    20    23
+        2020-01-03   NaN    15    18
+        2020-01-04   NaN    30    33
+        2020-01-05   NaN    45    48
+
+        >>> df.shift(periods=3, fill_value=0)
+                    Col1  Col2  Col3
+        2020-01-01     0     0     0
+        2020-01-02     0     0     0
+        2020-01-03     0     0     0
+        2020-01-04    10    13    17
+        2020-01-05    20    23    27
+
+        >>> df.shift(periods=3, freq="D")
+                    Col1  Col2  Col3
+        2020-01-04    10    13    17
+        2020-01-05    20    23    27
+        2020-01-06    15    18    22
+        2020-01-07    30    33    37
+        2020-01-08    45    48    52
+
+        >>> df.shift(periods=3, freq="infer")
+                    Col1  Col2  Col3
+        2020-01-04    10    13    17
+        2020-01-05    20    23    27
+        2020-01-06    15    18    22
+        2020-01-07    30    33    37
+        2020-01-08    45    48    52
+
+        >>> df["Col1"].shift(periods=[0, 1, 2])
+                    Col1_0  Col1_1  Col1_2
+        2020-01-01      10     NaN     NaN
+        2020-01-02      20    10.0     NaN
+        2020-01-03      15    20.0    10.0
+        2020-01-04      30    15.0    20.0
+        2020-01-05      45    30.0    15.0
+        """
+        if freq is not None and fill_value is not lib.no_default:
+            # GH#53832
+            raise ValueError(
+                "Passing a 'freq' together with a 'fill_value' is not allowed."
+            )
+
+        if self.empty and freq is None:
+            return self.copy()
+
+        axis = self._get_axis_number(axis)
+
+        if is_list_like(periods):
+            periods = cast(Sequence, periods)
+            if axis == 1:
+                raise ValueError(
+                    "If `periods` contains multiple shifts, `axis` cannot be 1."
+                )
+            if len(periods) == 0:
+                raise ValueError("If `periods` is an iterable, it cannot be empty.")
+            from pandas.core.reshape.concat import concat
+
+            shifted_dataframes = []
+            for period in periods:
+                if not is_integer(period):
+                    raise TypeError(
+                        f"Periods must be integer, but {period} is {type(period)}."
+                    )
+                period = cast(int, period)
+                shifted_dataframes.append(
+                    super()
+                    .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
+                    .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
+                )
+            return concat(shifted_dataframes, axis=1, sort=False)
+        elif suffix:
+            raise ValueError("Cannot specify `suffix` if `periods` is an int.")
+        periods = cast(int, periods)
+
+        ncols = len(self.columns)
+        if axis == 1 and periods != 0 and ncols > 0 and freq is None:
+            if fill_value is lib.no_default:
+                # We will infer fill_value to match the closest column
+
+                # Use a column that we know is valid for our column's dtype GH#38434
+                label = self.columns[0]
+
+                if periods > 0:
+                    result = self.iloc[:, :-periods]
+                    for col in range(min(ncols, abs(periods))):
+                        # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
+                        # Define filler inside loop so we get a copy
+                        filler = self.iloc[:, 0].shift(len(self))
+                        result.insert(0, label, filler, allow_duplicates=True)
+                else:
+                    result = self.iloc[:, -periods:]
+                    for col in range(min(ncols, abs(periods))):
+                        # Define filler inside loop so we get a copy
+                        filler = self.iloc[:, -1].shift(len(self))
+                        result.insert(
+                            len(result.columns), label, filler, allow_duplicates=True
+                        )
+
+                result.columns = self.columns.copy()
+                return result
+            elif len(self._mgr.blocks) > 1 or (
+                # If we only have one block and we know that we can't
+                #  keep the same dtype (i.e. the _can_hold_element check)
+                #  then we can go through the reindex_indexer path
+                #  (and avoid casting logic in the Block method).
+                not can_hold_element(self._mgr.blocks[0].values, fill_value)
+            ):
+                # GH#35488 we need to watch out for multi-block cases
+                # We only get here with fill_value not-lib.no_default
+                nper = abs(periods)
+                nper = min(nper, ncols)
+                if periods > 0:
+                    indexer = np.array(
+                        [-1] * nper + list(range(ncols - periods)), dtype=np.intp
+                    )
+                else:
+                    indexer = np.array(
+                        list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
+                    )
+                mgr = self._mgr.reindex_indexer(
+                    self.columns,
+                    indexer,
+                    axis=0,
+                    fill_value=fill_value,
+                    allow_dups=True,
+                )
+                res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
+                return res_df.__finalize__(self, method="shift")
+            else:
+                return self.T.shift(periods=periods, fill_value=fill_value).T
+
+        return super().shift(
+            periods=periods, freq=freq, axis=axis, fill_value=fill_value
+        )
+
+    @overload
+    def set_index(
+        self,
+        keys,
+        *,
+        drop: bool = ...,
+        append: bool = ...,
+        inplace: Literal[False] = ...,
+        verify_integrity: bool | lib.NoDefault = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def set_index(
+        self,
+        keys,
+        *,
+        drop: bool = ...,
+        append: bool = ...,
+        inplace: Literal[True],
+        verify_integrity: bool | lib.NoDefault = ...,
+    ) -> None: ...
+
+    def set_index(
+        self,
+        keys,
+        *,
+        drop: bool = True,
+        append: bool = False,
+        inplace: bool = False,
+        verify_integrity: bool | lib.NoDefault = lib.no_default,
+    ) -> DataFrame | None:
+        """
+        Set the DataFrame index using existing columns.
+
+        Set the DataFrame index (row labels) using one or more existing
+        columns or arrays (of the correct length). The index can replace the
+        existing index or expand on it.
+
+        Parameters
+        ----------
+        keys : label or array-like or list of labels/arrays
+            This parameter can be either a single column key, a single array of
+            the same length as the calling DataFrame, or a list containing an
+            arbitrary combination of column keys and arrays. Here, "array"
+            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
+            instances of :class:`~collections.abc.Iterator`.
+        drop : bool, default True
+            Delete columns to be used as the new index.
+        append : bool, default False
+            Whether to append columns to existing index.
+            Setting to True will add the new columns to existing index.
+            When set to False, the current index will be dropped from the DataFrame.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        verify_integrity : bool, default False
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method.
+
+            .. deprecated:: 3.0.0
+
+        Returns
+        -------
+        DataFrame or None
+            Changed row labels or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.reset_index : Opposite of set_index.
+        DataFrame.reindex : Change to new indices or expand indices.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "month": [1, 4, 7, 10],
+        ...         "year": [2012, 2014, 2013, 2014],
+        ...         "sale": [55, 40, 84, 31],
+        ...     }
+        ... )
+        >>> df
+           month  year  sale
+        0      1  2012    55
+        1      4  2014    40
+        2      7  2013    84
+        3     10  2014    31
+
+        Set the index to become the 'month' column:
+
+        >>> df.set_index("month")
+               year  sale
+        month
+        1      2012    55
+        4      2014    40
+        7      2013    84
+        10     2014    31
+
+        Create a MultiIndex using columns 'year' and 'month':
+
+        >>> df.set_index(["year", "month"])
+                    sale
+        year  month
+        2012  1     55
+        2014  4     40
+        2013  7     84
+        2014  10    31
+
+        Create a MultiIndex using an Index and a column:
+
+        >>> df.set_index([pd.Index([1, 2, 3, 4]), "year"])
+                 month  sale
+           year
+        1  2012  1      55
+        2  2014  4      40
+        3  2013  7      84
+        4  2014  10     31
+
+        Create a MultiIndex using two Series:
+
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> df.set_index([s, s**2])
+              month  year  sale
+        1 1       1  2012    55
+        2 4       4  2014    40
+        3 9       7  2013    84
+        4 16     10  2014    31
+
+        Append a column to the existing index:
+
+        >>> df = df.set_index("month")
+        >>> df.set_index("year", append=True)
+                      sale
+        month  year
+        1      2012    55
+        4      2014    40
+        7      2013    84
+        10     2014    31
+
+        >>> df.set_index("year", append=False)
+               sale
+        year
+        2012    55
+        2014    40
+        2013    84
+        2014    31
+        """
+        if verify_integrity is not lib.no_default:
+            # GH#62919
+            warnings.warn(
+                "The 'verify_integrity' keyword in DataFrame.set_index is "
+                "deprecated and will be removed in a future version. "
+                "Directly check the result.index.is_unique instead.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+        else:
+            verify_integrity = False
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        self._check_inplace_and_allows_duplicate_labels(inplace)
+        if not isinstance(keys, list):
+            keys = [keys]
+
+        err_msg = (
+            'The parameter "keys" may be a column key, one-dimensional '
+            "array, or a list containing only valid column keys and "
+            "one-dimensional arrays."
+        )
+
+        missing: list[Hashable] = []
+        for col in keys:
+            if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
+                # arrays are fine as long as they are one-dimensional
+                # iterators get converted to list below
+                if getattr(col, "ndim", 1) != 1:
+                    raise ValueError(err_msg)
+            else:
+                # everything else gets tried as a key; see GH 24969
+                try:
+                    found = col in self.columns
+                except TypeError as err:
+                    raise TypeError(
+                        f"{err_msg}. Received column of type {type(col)}"
+                    ) from err
+                else:
+                    if not found:
+                        missing.append(col)
+
+        if missing:
+            raise KeyError(f"None of {missing} are in the columns")
+
+        if inplace:
+            frame = self
+        else:
+            frame = self.copy(deep=False)
+
+        arrays: list[Index] = []
+        names: list[Hashable] = []
+        if append:
+            names = list(self.index.names)
+            if isinstance(self.index, MultiIndex):
+                arrays.extend(
+                    self.index._get_level_values(i) for i in range(self.index.nlevels)
+                )
+            else:
+                arrays.append(self.index)
+
+        to_remove: set[Hashable] = set()
+        for col in keys:
+            if isinstance(col, MultiIndex):
+                arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
+                names.extend(col.names)
+            elif isinstance(col, (Index, Series)):
+                # if Index then not MultiIndex (treated above)
+
+                # error: Argument 1 to "append" of "list" has incompatible type
+                #  "Union[Index, Series]"; expected "Index"
+                arrays.append(col)  # type: ignore[arg-type]
+                names.append(col.name)
+            elif isinstance(col, (list, np.ndarray)):
+                # error: Argument 1 to "append" of "list" has incompatible type
+                # "Union[List[Any], ndarray]"; expected "Index"
+                arrays.append(col)  # type: ignore[arg-type]
+                names.append(None)
+            elif isinstance(col, abc.Iterator):
+                # error: Argument 1 to "append" of "list" has incompatible type
+                # "List[Any]"; expected "Index"
+                arrays.append(list(col))  # type: ignore[arg-type]
+                names.append(None)
+            # from here, col can only be a column label
+            else:
+                arrays.append(frame[col])
+                names.append(col)
+                if drop:
+                    to_remove.add(col)
+
+            if len(arrays[-1]) != len(self):
+                # check newest element against length of calling frame, since
+                # ensure_index_from_sequences would not raise for append=False.
+                raise ValueError(
+                    f"Length mismatch: Expected {len(self)} rows, "
+                    f"received array of length {len(arrays[-1])}"
+                )
+
+        index = ensure_index_from_sequences(arrays, names)
+
+        if verify_integrity and not index.is_unique:
+            duplicates = index[index.duplicated()].unique()
+            raise ValueError(f"Index has duplicate keys: {duplicates}")
+
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in to_remove:
+            del frame[c]
+
+        # clear up memory usage
+        index._cleanup()
+
+        frame.index = index
+
+        if not inplace:
+            return frame
+        return None
+
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        inplace: Literal[False] = ...,
+        col_level: Hashable = ...,
+        col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> DataFrame: ...
+
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        inplace: Literal[True],
+        col_level: Hashable = ...,
+        col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> None: ...
+
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        inplace: bool = ...,
+        col_level: Hashable = ...,
+        col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> DataFrame | None: ...
+
+    def reset_index(
+        self,
+        level: IndexLabel | None = None,
+        *,
+        drop: bool = False,
+        inplace: bool = False,
+        col_level: Hashable = 0,
+        col_fill: Hashable = "",
+        allow_duplicates: bool | lib.NoDefault = lib.no_default,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> DataFrame | None:
+        """
+        Reset the index, or a level of it.
+
+        Reset the index of the DataFrame, and use the default one instead.
+        If the DataFrame has a MultiIndex, this method can remove one or more
+        levels.
+
+        Parameters
+        ----------
+        level : int, str, tuple, or list, default None
+            Only remove the given levels from the index. Removes all levels by
+            default.
+        drop : bool, default False
+            Do not try to insert index into dataframe columns. This resets
+            the index to the default integer index.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        col_level : int or str, default 0
+            If the columns have multiple levels, determines which level the
+            labels are inserted into. By default it is inserted into the first
+            level.
+        col_fill : object, default ''
+            If the columns have multiple levels, determines how the other
+            levels are named. If None then the index name is repeated.
+        allow_duplicates : bool, optional, default lib.no_default
+            Allow duplicate column labels to be created.
+        names : int, str or 1-dimensional list, default None
+            Using the given string, rename the DataFrame column which contains the
+            index data. If the DataFrame has a MultiIndex, this has to be a list
+            with length equal to the number of levels.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with the new index or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.set_index : Opposite of reset_index.
+        DataFrame.reindex : Change to new indices or expand indices.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)],
+        ...     index=["falcon", "parrot", "lion", "monkey"],
+        ...     columns=("class", "max_speed"),
+        ... )
+        >>> df
+                 class  max_speed
+        falcon    bird      389.0
+        parrot    bird       24.0
+        lion    mammal       80.5
+        monkey  mammal        NaN
+
+        When we reset the index, the old index is added as a column, and a
+        new sequential index is used:
+
+        >>> df.reset_index()
+            index   class  max_speed
+        0  falcon    bird      389.0
+        1  parrot    bird       24.0
+        2    lion  mammal       80.5
+        3  monkey  mammal        NaN
+
+        We can use the `drop` parameter to avoid the old index being added as
+        a column:
+
+        >>> df.reset_index(drop=True)
+            class  max_speed
+        0    bird      389.0
+        1    bird       24.0
+        2  mammal       80.5
+        3  mammal        NaN
+
+        You can also use `reset_index` with `MultiIndex`.
+
+        >>> index = pd.MultiIndex.from_tuples(
+        ...     [
+        ...         ("bird", "falcon"),
+        ...         ("bird", "parrot"),
+        ...         ("mammal", "lion"),
+        ...         ("mammal", "monkey"),
+        ...     ],
+        ...     names=["class", "name"],
+        ... )
+        >>> columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")])
+        >>> df = pd.DataFrame(
+        ...     [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")],
+        ...     index=index,
+        ...     columns=columns,
+        ... )
+        >>> df
+                       speed species
+                         max    type
+        class  name
+        bird   falcon  389.0     fly
+               parrot   24.0     fly
+        mammal lion     80.5     run
+               monkey    NaN    jump
+
+        Using the `names` parameter, choose a name for the index column:
+
+        >>> df.reset_index(names=["classes", "names"])
+          classes   names  speed species
+                             max    type
+        0    bird  falcon  389.0     fly
+        1    bird  parrot   24.0     fly
+        2  mammal    lion   80.5     run
+        3  mammal  monkey    NaN    jump
+
+        If the index has multiple levels, we can reset a subset of them:
+
+        >>> df.reset_index(level="class")
+                 class  speed species
+                          max    type
+        name
+        falcon    bird  389.0     fly
+        parrot    bird   24.0     fly
+        lion    mammal   80.5     run
+        monkey  mammal    NaN    jump
+
+        If we are not dropping the index, by default, it is placed in the top
+        level. We can place it in another level:
+
+        >>> df.reset_index(level="class", col_level=1)
+                        speed species
+                 class    max    type
+        name
+        falcon    bird  389.0     fly
+        parrot    bird   24.0     fly
+        lion    mammal   80.5     run
+        monkey  mammal    NaN    jump
+
+        When the index is inserted under another level, we can specify under
+        which one with the parameter `col_fill`:
+
+        >>> df.reset_index(level="class", col_level=1, col_fill="species")
+                      species  speed species
+                        class    max    type
+        name
+        falcon           bird  389.0     fly
+        parrot           bird   24.0     fly
+        lion           mammal   80.5     run
+        monkey         mammal    NaN    jump
+
+        If we specify a nonexistent level for `col_fill`, it is created:
+
+        >>> df.reset_index(level="class", col_level=1, col_fill="genus")
+                        genus  speed species
+                        class    max    type
+        name
+        falcon           bird  389.0     fly
+        parrot           bird   24.0     fly
+        lion           mammal   80.5     run
+        monkey         mammal    NaN    jump
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        self._check_inplace_and_allows_duplicate_labels(inplace)
+        if inplace:
+            new_obj = self
+        else:
+            new_obj = self.copy(deep=False)
+        if allow_duplicates is not lib.no_default:
+            allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
+
+        new_index = default_index(len(new_obj))
+        if level is not None:
+            if not isinstance(level, (tuple, list)):
+                level = [level]
+            level = [self.index._get_level_number(lev) for lev in level]
+            if len(level) < self.index.nlevels:
+                new_index = self.index.droplevel(level)
+
+        if not drop:
+            to_insert: Iterable[tuple[Any, Any | None]]
+
+            default = "index" if "index" not in self else "level_0"
+            names = self.index._get_default_index_names(names, default)
+
+            if isinstance(self.index, MultiIndex):
+                to_insert = zip(
+                    reversed(self.index.levels),
+                    reversed(self.index.codes),
+                    strict=True,
+                )
+            else:
+                to_insert = ((self.index, None),)
+
+            multi_col = isinstance(self.columns, MultiIndex)
+            for j, (lev, lab) in enumerate(to_insert, start=1):
+                i = self.index.nlevels - j
+                if level is not None and i not in level:
+                    continue
+                name = names[i]
+                if multi_col:
+                    col_name = list(name) if isinstance(name, tuple) else [name]
+                    if col_fill is None:
+                        if len(col_name) not in (1, self.columns.nlevels):
+                            raise ValueError(
+                                "col_fill=None is incompatible "
+                                f"with incomplete column name {name}"
+                            )
+                        col_fill = col_name[0]
+
+                    lev_num = self.columns._get_level_number(col_level)
+                    name_lst = [col_fill] * lev_num + col_name
+                    missing = self.columns.nlevels - len(name_lst)
+                    name_lst += [col_fill] * missing
+                    name = tuple(name_lst)
+
+                # to ndarray and maybe infer different dtype
+                level_values = lev._values
+                if level_values.dtype == np.object_:
+                    level_values = lib.maybe_convert_objects(level_values)
+
+                if lab is not None:
+                    # if we have the codes, extract the values with a mask
+                    level_values = algorithms.take(
+                        level_values, lab, allow_fill=True, fill_value=lev._na_value
+                    )
+
+                new_obj.insert(
+                    0,
+                    name,
+                    level_values,
+                    allow_duplicates=allow_duplicates,
+                )
+
+        new_obj.index = new_index
+        if not inplace:
+            return new_obj
+
+        return None
+
+    # ----------------------------------------------------------------------
+    # Reindex-based selection methods
+
+    def isna(self) -> DataFrame:
+        """
+        Detect missing values.
+
+        Return a boolean same-sized object indicating if the values are NA.
+        NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
+        values.
+        Everything else gets mapped to False values. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is an NA value.
+
+        See Also
+        --------
+        Series.isnull : Alias of isna.
+        DataFrame.isnull : Alias of isna.
+        Series.notna : Boolean inverse of isna.
+        DataFrame.notna : Boolean inverse of isna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        isna : Top-level isna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.isna()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        Show which entries in a Series are NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.isna()
+        0    False
+        1    False
+        2     True
+        dtype: bool
+        """
+        res_mgr = self._mgr.isna(func=isna)
+        result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
+        return result.__finalize__(self, method="isna")
+
+    def isnull(self) -> DataFrame:
+        """
+        DataFrame.isnull is an alias for DataFrame.isna.
+
+        Detect missing values.
+
+        Return a boolean same-sized object indicating if the values are NA.
+        NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
+        values.
+        Everything else gets mapped to False values. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is an NA value.
+
+        See Also
+        --------
+        Series.isnull : Alias of isna.
+        DataFrame.isnull : Alias of isna.
+        Series.notna : Boolean inverse of isna.
+        DataFrame.notna : Boolean inverse of isna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        isna : Top-level isna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.isna()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        Show which entries in a Series are NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.isna()
+        0    False
+        1    False
+        2     True
+        dtype: bool
+        """
+        return self.isna()
+
+    def notna(self) -> DataFrame:
+        """
+        Detect existing (non-missing) values.
+
+        Return a boolean same-sized object indicating if the values are not NA.
+        Non-missing values get mapped to True. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to False
+        values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is not an NA value.
+
+        See Also
+        --------
+        Series.notnull : Alias of notna.
+        DataFrame.notnull : Alias of notna.
+        Series.isna : Boolean inverse of notna.
+        DataFrame.isna : Boolean inverse of notna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        notna : Top-level notna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are not NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.notna()
+             age   born  name    toy
+        0   True  False  True  False
+        1   True   True  True   True
+        2  False   True  True   True
+
+        Show which entries in a Series are not NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.notna()
+        0     True
+        1     True
+        2    False
+        dtype: bool
+        """
+        return ~self.isna()
+
+    def notnull(self) -> DataFrame:
+        """
+        DataFrame.notnull is an alias for DataFrame.notna.
+
+        Detect existing (non-missing) values.
+
+        Return a boolean same-sized object indicating if the values are not NA.
+        Non-missing values get mapped to True. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to False
+        values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is not an NA value.
+
+        See Also
+        --------
+        Series.notnull : Alias of notna.
+        DataFrame.notnull : Alias of notna.
+        Series.isna : Boolean inverse of notna.
+        DataFrame.isna : Boolean inverse of notna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        notna : Top-level notna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are not NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.notnull()
+             age   born  name    toy
+        0   True  False  True  False
+        1   True   True  True   True
+        2  False   True  True   True
+
+        Show which entries in a Series are not NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.notnull()
+        0     True
+        1     True
+        2    False
+        dtype: bool
+        """
+        return ~self.isna()
+
+    @overload
+    def dropna(
+        self,
+        *,
+        axis: Axis = ...,
+        how: AnyAll | lib.NoDefault = ...,
+        thresh: int | lib.NoDefault = ...,
+        subset: IndexLabel = ...,
+        inplace: Literal[False] = ...,
+        ignore_index: bool = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def dropna(
+        self,
+        *,
+        axis: Axis = ...,
+        how: AnyAll | lib.NoDefault = ...,
+        thresh: int | lib.NoDefault = ...,
+        subset: IndexLabel = ...,
+        inplace: Literal[True],
+        ignore_index: bool = ...,
+    ) -> None: ...
+
+    def dropna(
+        self,
+        *,
+        axis: Axis = 0,
+        how: AnyAll | lib.NoDefault = lib.no_default,
+        thresh: int | lib.NoDefault = lib.no_default,
+        subset: IndexLabel | AnyArrayLike | None = None,
+        inplace: bool = False,
+        ignore_index: bool = False,
+    ) -> DataFrame | None:
+        """
+        Remove missing values.
+
+        See the :ref:`User Guide <missing_data>` for more on which values are
+        considered missing, and how to work with missing data.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Determine if rows or columns which contain missing values are
+            removed.
+
+            * 0, or 'index' : Drop rows which contain missing values.
+            * 1, or 'columns' : Drop columns which contain missing value.
+
+            Only a single axis is allowed.
+
+        how : {'any', 'all'}, default 'any'
+            Determine if row or column is removed from DataFrame, when we have
+            at least one NA or all NA.
+
+            * 'any' : If any NA values are present, drop that row or column.
+            * 'all' : If all values are NA, drop that row or column.
+
+        thresh : int, optional
+            Require that many non-NA values. Cannot be combined with how.
+        subset : column label or iterable of labels, optional
+            Labels along other axis to consider, e.g. if you are dropping rows
+            these would be a list of columns to include.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with NA entries dropped from it or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.isna: Indicate missing values.
+        DataFrame.notna : Indicate existing (non-missing) values.
+        DataFrame.fillna : Replace missing values.
+        Series.dropna : Drop missing values.
+        Index.dropna : Drop missing indices.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "name": ["Alfred", "Batman", "Catwoman"],
+        ...         "toy": [np.nan, "Batmobile", "Bullwhip"],
+        ...         "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
+        ...     }
+        ... )
+        >>> df
+               name        toy       born
+        0    Alfred        NaN        NaT
+        1    Batman  Batmobile 1940-04-25
+        2  Catwoman   Bullwhip        NaT
+
+        Drop the rows where at least one element is missing.
+
+        >>> df.dropna()
+             name        toy       born
+        1  Batman  Batmobile 1940-04-25
+
+        Drop the columns where at least one element is missing.
+
+        >>> df.dropna(axis="columns")
+               name
+        0    Alfred
+        1    Batman
+        2  Catwoman
+
+        Drop the rows where all elements are missing.
+
+        >>> df.dropna(how="all")
+               name        toy       born
+        0    Alfred        NaN        NaT
+        1    Batman  Batmobile 1940-04-25
+        2  Catwoman   Bullwhip        NaT
+
+        Keep only the rows with at least 2 non-NA values.
+
+        >>> df.dropna(thresh=2)
+               name        toy       born
+        1    Batman  Batmobile 1940-04-25
+        2  Catwoman   Bullwhip        NaT
+
+        Define in which columns to look for missing values.
+
+        >>> df.dropna(subset=["name", "toy"])
+               name        toy       born
+        1    Batman  Batmobile 1940-04-25
+        2  Catwoman   Bullwhip        NaT
+        """
+        if (how is not lib.no_default) and (thresh is not lib.no_default):
+            raise TypeError(
+                "You cannot set both the how and thresh arguments at the same time."
+            )
+
+        if how is lib.no_default:
+            how = "any"
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if isinstance(axis, (tuple, list)):
+            # GH20987
+            raise TypeError("supplying multiple axes to axis is no longer supported.")
+
+        axis = self._get_axis_number(axis)
+        agg_axis = 1 - axis
+
+        agg_obj = self
+        if subset is not None:
+            # subset needs to be list
+            if not is_list_like(subset):
+                subset = [cast(Hashable, subset)]
+            ax = self._get_axis(agg_axis)
+            indices = ax.get_indexer_for(subset)
+            check = indices == -1
+            if check.any():
+                raise KeyError(np.array(subset)[check].tolist())
+            agg_obj = self.take(indices, axis=agg_axis)
+
+        if thresh is not lib.no_default:
+            count = agg_obj.count(axis=agg_axis)
+            mask = count >= thresh
+        elif how == "any":
+            # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
+            mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
+        elif how == "all":
+            # faster equivalent to 'agg_obj.count(agg_axis) > 0'
+            mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
+        else:
+            raise ValueError(f"invalid how option: {how}")
+
+        if np.all(mask):
+            result = self.copy(deep=False)
+        else:
+            result = self.loc(axis=axis)[mask]
+
+        if ignore_index:
+            result.index = default_index(len(result))
+
+        if not inplace:
+            return result
+        self._update_inplace(result)
+        return None
+
+    @overload
+    def drop_duplicates(
+        self,
+        subset: Hashable | Iterable[Hashable] | None = ...,
+        *,
+        keep: DropKeep = ...,
+        inplace: Literal[True],
+        ignore_index: bool = ...,
+    ) -> None: ...
+
+    @overload
+    def drop_duplicates(
+        self,
+        subset: Hashable | Iterable[Hashable] | None = ...,
+        *,
+        keep: DropKeep = ...,
+        inplace: Literal[False] = ...,
+        ignore_index: bool = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def drop_duplicates(
+        self,
+        subset: Hashable | Iterable[Hashable] | None = ...,
+        *,
+        keep: DropKeep = ...,
+        inplace: bool = ...,
+        ignore_index: bool = ...,
+    ) -> DataFrame | None: ...
+
+    def drop_duplicates(
+        self,
+        subset: Hashable | Iterable[Hashable] | None = None,
+        *,
+        keep: DropKeep = "first",
+        inplace: bool = False,
+        ignore_index: bool = False,
+    ) -> DataFrame | None:
+        """
+        Return DataFrame with duplicate rows removed.
+
+        Considering certain columns is optional. Indexes, including time indexes
+        are ignored.
+
+        Parameters
+        ----------
+        subset : column label or iterable of labels, optional
+            Only consider certain columns for identifying duplicates, by
+            default use all of the columns.
+        keep : {'first', 'last', ``False``}, default 'first'
+            Determines which duplicates (if any) to keep.
+
+            - 'first' : Drop duplicates except for the first occurrence.
+            - 'last' : Drop duplicates except for the last occurrence.
+            - ``False`` : Drop all duplicates.
+
+        inplace : bool, default ``False``
+            Whether to modify the DataFrame rather than creating a new one.
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with duplicates removed or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.value_counts: Count unique combinations of columns.
+
+        Notes
+        -----
+        This method requires columns specified by ``subset`` to be of hashable type.
+        Passing unhashable columns will raise a ``TypeError``.
+
+        Examples
+        --------
+        Consider dataset containing ramen rating.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"],
+        ...         "style": ["cup", "cup", "cup", "pack", "pack"],
+        ...         "rating": [4, 4, 3.5, 15, 5],
+        ...     }
+        ... )
+        >>> df
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        1  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        3  Indomie  pack    15.0
+        4  Indomie  pack     5.0
+
+        By default, it removes duplicate rows based on all columns.
+
+        >>> df.drop_duplicates()
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        3  Indomie  pack    15.0
+        4  Indomie  pack     5.0
+
+        To remove duplicates on specific column(s), use ``subset``.
+
+        >>> df.drop_duplicates(subset=["brand"])
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+
+        To remove duplicates and keep last occurrences, use ``keep``.
+
+        >>> df.drop_duplicates(subset=["brand", "style"], keep="last")
+            brand style  rating
+        1  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        4  Indomie  pack     5.0
+        """
+        if self.empty:
+            return self.copy(deep=False)
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
+
+        result = self[-self.duplicated(subset, keep=keep)]
+        if ignore_index:
+            result.index = default_index(len(result))
+
+        if inplace:
+            self._update_inplace(result)
+            return None
+        else:
+            return result
+
+    def duplicated(
+        self,
+        subset: Hashable | Iterable[Hashable] | None = None,
+        keep: DropKeep = "first",
+    ) -> Series:
+        """
+        Return boolean Series denoting duplicate rows.
+
+        Considering certain columns is optional.
+
+        Parameters
+        ----------
+        subset : column label or iterable of labels, optional
+            Only consider certain columns for identifying duplicates, by
+            default use all of the columns.
+        keep : {'first', 'last', False}, default 'first'
+            Determines which duplicates (if any) to mark.
+
+            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
+            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
+            - False : Mark all duplicates as ``True``.
+
+        Returns
+        -------
+        Series
+            Boolean series for each duplicated rows.
+
+        See Also
+        --------
+        Index.duplicated : Equivalent method on index.
+        Series.duplicated : Equivalent method on Series.
+        Series.drop_duplicates : Remove duplicate values from Series.
+        DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
+
+        Examples
+        --------
+        Consider dataset containing ramen rating.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"],
+        ...         "style": ["cup", "cup", "cup", "pack", "pack"],
+        ...         "rating": [4, 4, 3.5, 15, 5],
+        ...     }
+        ... )
+        >>> df
+            brand style  rating
+        0  Yum Yum   cup     4.0
+        1  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        3  Indomie  pack    15.0
+        4  Indomie  pack     5.0
+
+        By default, for each set of duplicated values, the first occurrence
+        is set on False and all others on True.
+
+        >>> df.duplicated()
+        0    False
+        1     True
+        2    False
+        3    False
+        4    False
+        dtype: bool
+
+        By using 'last', the last occurrence of each set of duplicated values
+        is set on False and all others on True.
+
+        >>> df.duplicated(keep="last")
+        0     True
+        1    False
+        2    False
+        3    False
+        4    False
+        dtype: bool
+
+        By setting ``keep`` on False, all duplicates are True.
+
+        >>> df.duplicated(keep=False)
+        0     True
+        1     True
+        2    False
+        3    False
+        4    False
+        dtype: bool
+
+        To find duplicates on specific column(s), use ``subset``.
+
+        >>> df.duplicated(subset=["brand"])
+        0    False
+        1     True
+        2    False
+        3     True
+        4     True
+        dtype: bool
+        """
+
+        if self.empty:
+            return self._constructor_sliced(dtype=bool)
+
+        def f(vals) -> tuple[np.ndarray, int]:
+            labels, shape = algorithms.factorize(vals, size_hint=len(self))
+            return labels.astype("i8"), len(shape)
+
+        if subset is None:
+            subset = self.columns
+        elif (
+            not np.iterable(subset)
+            or isinstance(subset, str)
+            or (isinstance(subset, tuple) and subset in self.columns)
+        ):
+            subset = (subset,)
+
+        #  needed for mypy since can't narrow types using np.iterable
+        subset = cast(Sequence, subset)
+
+        # Verify all columns in subset exist in the queried dataframe
+        # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
+        # key that doesn't exist.
+        diff = set(subset) - set(self.columns)
+        if diff:
+            raise KeyError(Index(diff))
+
+        if len(subset) == 1 and self.columns.is_unique:
+            # GH#45236 This is faster than get_group_index below
+            result = self[next(iter(subset))].duplicated(keep)
+            result.name = None
+        else:
+            vals = (col.values for name, col in self.items() if name in subset)
+            labels, shape = map(list, zip(*map(f, vals), strict=True))
+
+            ids = get_group_index(labels, tuple(shape), sort=False, xnull=False)
+            result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
+        return result.__finalize__(self, method="duplicated")
+
+    # ----------------------------------------------------------------------
+    # Sorting
+    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def sort_values(
+        self,
+        by: IndexLabel,
+        *,
+        axis: Axis = ...,
+        ascending=...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def sort_values(
+        self,
+        by: IndexLabel,
+        *,
+        axis: Axis = ...,
+        ascending=...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: str = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> None: ...
+
+    def sort_values(
+        self,
+        by: IndexLabel,
+        *,
+        axis: Axis = 0,
+        ascending: bool | list[bool] | tuple[bool, ...] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: str = "last",
+        ignore_index: bool = False,
+        key: ValueKeyFunc | None = None,
+    ) -> DataFrame | None:
+        """
+        Sort by the values along either axis.
+
+        Parameters
+        ----------
+        by : str or list of str
+            Name or list of names to sort by.
+
+            - if `axis` is 0 or `'index'` then `by` may contain index
+              levels and/or column labels.
+            - if `axis` is 1 or `'columns'` then `by` may contain column
+              levels and/or index labels.
+        axis : "{0 or 'index', 1 or 'columns'}", default 0
+             Axis to be sorted.
+        ascending : bool or list of bool, default True
+             Sort ascending vs. descending. Specify list for multiple sort
+             orders.  If this is a list of bools, must match the length of
+             the by.
+        inplace : bool, default False
+             If True, perform operation in-place.
+        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+             Choice of sorting algorithm. See also :func:`numpy.sort` for more
+             information. `mergesort` and `stable` are the only stable algorithms. For
+             DataFrames, this option is only applied when sorting on a single
+             column or label.
+        na_position : {'first', 'last'}, default 'last'
+             Puts NaNs at the beginning if `first`; `last` puts NaNs at the
+             end.
+        ignore_index : bool, default False
+             If True, the resulting axis will be labeled 0, 1, …, n - 1.
+        key : callable, optional
+            Apply the key function to the values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect a
+            ``Series`` and return a Series with the same shape as the input.
+            It will be applied to each column in `by` independently. The values in the
+            returned Series will be used as the keys for sorting.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with sorted values or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.sort_index : Sort a DataFrame by the index.
+        Series.sort_values : Similar method for a Series.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "col1": ["A", "A", "B", np.nan, "D", "C"],
+        ...         "col2": [2, 1, 9, 8, 7, 4],
+        ...         "col3": [0, 1, 9, 4, 2, 3],
+        ...         "col4": ["a", "B", "c", "D", "e", "F"],
+        ...     }
+        ... )
+        >>> df
+          col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+
+        **Sort by a single column**
+
+        In this case, we are sorting the rows according to values in ``col1``:
+
+        >>> df.sort_values(by=["col1"])
+          col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        5    C     4     3    F
+        4    D     7     2    e
+        3  NaN     8     4    D
+
+        **Sort by multiple columns**
+
+        You can also provide multiple columns to ``by`` argument, as shown below.
+        In this example, the rows are first sorted according to ``col1``, and then
+        the rows that have an identical value in ``col1`` are sorted according
+        to ``col2``.
+
+        >>> df.sort_values(by=["col1", "col2"])
+          col1  col2  col3 col4
+        1    A     1     1    B
+        0    A     2     0    a
+        2    B     9     9    c
+        5    C     4     3    F
+        4    D     7     2    e
+        3  NaN     8     4    D
+
+        **Sort in a descending order**
+
+        The sort order can be reversed using ``ascending`` argument, as shown below:
+
+        >>> df.sort_values(by="col1", ascending=False)
+          col1  col2  col3 col4
+        4    D     7     2    e
+        5    C     4     3    F
+        2    B     9     9    c
+        0    A     2     0    a
+        1    A     1     1    B
+        3  NaN     8     4    D
+
+        **Placing any** ``NA`` **first**
+
+        Note that in the above example, the rows that contain an ``NA`` value in their
+        ``col1`` are placed at the end of the dataframe. This behavior can be modified
+        via ``na_position`` argument, as shown below:
+
+        >>> df.sort_values(by="col1", ascending=False, na_position="first")
+          col1  col2  col3 col4
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+        2    B     9     9    c
+        0    A     2     0    a
+        1    A     1     1    B
+
+        **Customized sort order**
+
+        The ``key`` argument allows for a further customization of sorting behaviour.
+        For example, you may want
+        to ignore the `letter's case <https://en.wikipedia.org/wiki/Letter_case>`__
+        when sorting strings:
+
+        >>> df.sort_values(by="col4", key=lambda col: col.str.lower())
+           col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+
+        Another typical example is
+        `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__.
+        This can be done using
+        ``natsort`` `package <https://github.com/SethMMorton/natsort>`__,
+        which provides a function to generate a key
+        to sort data in their natural order:
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "hours": ["0hr", "128hr", "0hr", "64hr", "64hr", "128hr"],
+        ...         "mins": [
+        ...             "10mins",
+        ...             "40mins",
+        ...             "40mins",
+        ...             "40mins",
+        ...             "10mins",
+        ...             "10mins",
+        ...         ],
+        ...         "value": [10, 20, 30, 40, 50, 60],
+        ...     }
+        ... )
+        >>> df
+           hours    mins  value
+        0    0hr  10mins     10
+        1  128hr  40mins     20
+        2    0hr  40mins     30
+        3   64hr  40mins     40
+        4   64hr  10mins     50
+        5  128hr  10mins     60
+        >>> from natsort import natsort_keygen
+        >>> df.sort_values(
+        ...     by=["hours", "mins"],
+        ...     key=natsort_keygen(),
+        ... )
+           hours    mins  value
+        0    0hr  10mins     10
+        2    0hr  40mins     30
+        4   64hr  10mins     50
+        3   64hr  40mins     40
+        5  128hr  10mins     60
+        1  128hr  40mins     20
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        axis = self._get_axis_number(axis)
+        ascending = validate_ascending(ascending)
+        if not isinstance(by, list):
+            by = [by]
+        # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
+        # expected "Sized"
+        if is_sequence(ascending) and (
+            len(by) != len(ascending)  # type: ignore[arg-type]
+        ):
+            # error: Argument 1 to "len" has incompatible type "Union[bool,
+            # List[bool]]"; expected "Sized"
+            raise ValueError(
+                f"Length of ascending ({len(ascending)})"  # type: ignore[arg-type]
+                f" != length of by ({len(by)})"
+            )
+        if len(by) > 1:
+            keys = (self._get_label_or_level_values(x, axis=axis) for x in by)
+
+            # need to rewrap columns in Series to apply key function
+            if key is not None:
+                keys_data = [
+                    Series(k, name=name) for (k, name) in zip(keys, by, strict=True)
+                ]
+            else:
+                # error: Argument 1 to "list" has incompatible type
+                # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
+                # expected "Iterable[Series]"
+                keys_data = list(keys)  # type: ignore[arg-type]
+
+            indexer = lexsort_indexer(
+                keys_data, orders=ascending, na_position=na_position, key=key
+            )
+        elif by:
+            # len(by) == 1
+
+            k = self._get_label_or_level_values(by[0], axis=axis)
+
+            # need to rewrap column in Series to apply key function
+            if key is not None:
+                # error: Incompatible types in assignment (expression has type
+                # "Series", variable has type "ndarray")
+                k = Series(k, name=by[0])  # type: ignore[assignment]
+
+            if isinstance(ascending, (tuple, list)):
+                ascending = ascending[0]
+
+            indexer = nargsort(
+                k, kind=kind, ascending=ascending, na_position=na_position, key=key
+            )
+        elif inplace:
+            return self._update_inplace(self)
+        else:
+            return self.copy(deep=False)
+
+        if is_range_indexer(indexer, len(indexer)):
+            result = self.copy(deep=False)
+            if ignore_index:
+                result.index = default_index(len(result))
+
+            if inplace:
+                return self._update_inplace(result)
+            else:
+                return result
+
+        new_data = self._mgr.take(
+            indexer, axis=self._get_block_manager_axis(axis), verify=False
+        )
+
+        if ignore_index:
+            new_data.set_axis(
+                self._get_block_manager_axis(axis), default_index(len(indexer))
+            )
+
+        result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result.__finalize__(self, method="sort_values")
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> None: ...
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: bool = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> DataFrame | None: ...
+
+    def sort_index(
+        self,
+        *,
+        axis: Axis = 0,
+        level: IndexLabel | None = None,
+        ascending: bool | Sequence[bool] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: NaPosition = "last",
+        sort_remaining: bool = True,
+        ignore_index: bool = False,
+        key: IndexKeyFunc | None = None,
+    ) -> DataFrame | None:
+        """
+        Sort object by labels (along an axis).
+
+        Returns a new DataFrame sorted by label if `inplace` argument is
+        ``False``, otherwise updates the original DataFrame and returns None.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis along which to sort.  The value 0 identifies the rows,
+            and 1 identifies the columns.
+        level : int or level name or list of ints or list of level names
+            If not None, sort on values in specified index level(s).
+        ascending : bool or list-like of bools, default True
+            Sort ascending vs. descending. When the index is a MultiIndex the
+            sort direction can be controlled for each level individually.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+            Choice of sorting algorithm. See also :func:`numpy.sort` for more
+            information. `mergesort` and `stable` are the only stable algorithms. For
+            DataFrames, this option is only applied when sorting on a single
+            column or label.
+        na_position : {'first', 'last'}, default 'last'
+            Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
+            Not implemented for MultiIndex.
+        sort_remaining : bool, default True
+            If True and sorting by level and index is multilevel, sort by other
+            levels too (in order) after sorting by specified level.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+        key : callable, optional
+            If not None, apply the key function to the index values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect an
+            ``Index`` and return an ``Index`` of the same shape. For MultiIndex
+            inputs, the key is applied *per level*.
+
+        Returns
+        -------
+        DataFrame or None
+            The original DataFrame sorted by the labels or None if ``inplace=True``.
+
+        See Also
+        --------
+        Series.sort_index : Sort Series by the index.
+        DataFrame.sort_values : Sort DataFrame by the value.
+        Series.sort_values : Sort Series by the value.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=["A"]
+        ... )
+        >>> df.sort_index()
+             A
+        1    4
+        29   2
+        100  1
+        150  5
+        234  3
+
+        By default, it sorts in ascending order, to sort in descending order,
+        use ``ascending=False``
+
+        >>> df.sort_index(ascending=False)
+             A
+        234  3
+        150  5
+        100  1
+        29   2
+        1    4
+
+        A key function can be specified which is applied to the index before
+        sorting. For a ``MultiIndex`` this is applied to each level separately.
+
+        >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=["A", "b", "C", "d"])
+        >>> df.sort_index(key=lambda x: x.str.lower())
+           a
+        A  1
+        b  2
+        C  3
+        d  4
+        """
+        return super().sort_index(
+            axis=axis,
+            level=level,
+            ascending=ascending,
+            inplace=inplace,
+            kind=kind,
+            na_position=na_position,
+            sort_remaining=sort_remaining,
+            ignore_index=ignore_index,
+            key=key,
+        )
+
+    def value_counts(
+        self,
+        subset: IndexLabel | None = None,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        dropna: bool = True,
+    ) -> Series:
+        """
+        Return a Series containing the frequency of each distinct row in the DataFrame.
+
+        Parameters
+        ----------
+        subset : Hashable or a sequence of the previous, optional
+            Columns to use when counting unique combinations.
+        normalize : bool, default False
+            Return proportions rather than frequencies.
+        sort : bool, default True
+            Stable sort by frequencies when True. Preserve the order of the data
+            when False.
+
+            .. versionchanged:: 3.0.0
+
+                Prior to 3.0.0, ``sort=False`` would sort by the columns values.
+
+            .. versionchanged:: 3.0.0
+
+                Prior to 3.0.0, the sort was unstable.
+        ascending : bool, default False
+            Sort in ascending order.
+        dropna : bool, default True
+            Do not include counts of rows that contain NA values.
+
+        Returns
+        -------
+        Series
+            Series containing the frequency of each distinct row in the DataFrame.
+
+        See Also
+        --------
+        Series.value_counts: Equivalent method on Series.
+
+        Notes
+        -----
+        The returned Series will have a MultiIndex with one level per input
+        column but an Index (non-multi) for a single label. By default, rows
+        that contain any NA values are omitted from the result. By default,
+        the resulting Series will be sorted by frequencies in descending order so that
+        the first element is the most frequently-occurring row.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        ...     index=["falcon", "dog", "cat", "ant"],
+        ... )
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+        cat            4          0
+        ant            6          0
+
+        >>> df.value_counts()
+        num_legs  num_wings
+        4         0            2
+        2         2            1
+        6         0            1
+        Name: count, dtype: int64
+
+        >>> df.value_counts(sort=False)
+        num_legs  num_wings
+        2         2            1
+        4         0            2
+        6         0            1
+        Name: count, dtype: int64
+
+        >>> df.value_counts(ascending=True)
+        num_legs  num_wings
+        2         2            1
+        6         0            1
+        4         0            2
+        Name: count, dtype: int64
+
+        >>> df.value_counts(normalize=True)
+        num_legs  num_wings
+        4         0            0.50
+        2         2            0.25
+        6         0            0.25
+        Name: proportion, dtype: float64
+
+        With `dropna` set to `False` we can also count rows with NA values.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "first_name": ["John", "Anne", "John", "Beth"],
+        ...         "middle_name": ["Smith", pd.NA, pd.NA, "Louise"],
+        ...     }
+        ... )
+        >>> df
+          first_name middle_name
+        0       John       Smith
+        1       Anne         NaN
+        2       John         NaN
+        3       Beth      Louise
+
+        >>> df.value_counts()
+        first_name  middle_name
+        John        Smith          1
+        Beth        Louise         1
+        Name: count, dtype: int64
+
+        >>> df.value_counts(dropna=False)
+        first_name  middle_name
+        John        Smith          1
+        Anne        NaN            1
+        John        NaN            1
+        Beth        Louise         1
+        Name: count, dtype: int64
+
+        >>> df.value_counts("first_name")
+        first_name
+        John    2
+        Anne    1
+        Beth    1
+        Name: count, dtype: int64
+        """
+        if subset is None:
+            subset = self.columns.tolist()
+
+        name = "proportion" if normalize else "count"
+        counts = self.groupby(
+            subset, sort=False, dropna=dropna, observed=False
+        )._grouper.size()
+        counts.name = name
+
+        if sort:
+            counts = counts.sort_values(ascending=ascending, kind="stable")
+        if normalize:
+            counts /= counts.sum()
+
+        # Force MultiIndex for a list_like subset with a single column
+        if is_list_like(subset) and len(subset) == 1:  # type: ignore[arg-type]
+            counts.index = MultiIndex.from_arrays(
+                [counts.index], names=[counts.index.name]
+            )
+
+        return counts
+
+    def nlargest(
+        self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
+    ) -> DataFrame:
+        """
+        Return the first `n` rows ordered by `columns` in descending order.
+
+        Return the first `n` rows with the largest values in `columns`, in
+        descending order. The columns that are not specified are returned as
+        well, but not used for ordering.
+
+        This method is equivalent to
+        ``df.sort_values(columns, ascending=False).head(n)``, but more
+        performant.
+
+        Parameters
+        ----------
+        n : int
+            Number of rows to return.
+        columns : Hashable or a sequence of the previous
+            Column label(s) to order by.
+        keep : {'first', 'last', 'all'}, default 'first'
+            Where there are duplicate values:
+
+            - ``first`` : prioritize the first occurrence(s)
+            - ``last`` : prioritize the last occurrence(s)
+            - ``all`` : keep all the ties of the smallest item even if it means
+              selecting more than ``n`` items.
+
+        Returns
+        -------
+        DataFrame
+            The first `n` rows ordered by the given columns in descending
+            order.
+
+        See Also
+        --------
+        DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
+            ascending order.
+        DataFrame.sort_values : Sort DataFrame by the values.
+        DataFrame.head : Return the first `n` rows without re-ordering.
+
+        Notes
+        -----
+        This function cannot be used with all column types. For example, when
+        specifying columns with `object` or `category` dtypes, ``TypeError`` is
+        raised.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "population": [
+        ...             59000000,
+        ...             65000000,
+        ...             434000,
+        ...             434000,
+        ...             434000,
+        ...             337000,
+        ...             11300,
+        ...             11300,
+        ...             11300,
+        ...         ],
+        ...         "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
+        ...         "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
+        ...     },
+        ...     index=[
+        ...         "Italy",
+        ...         "France",
+        ...         "Malta",
+        ...         "Maldives",
+        ...         "Brunei",
+        ...         "Iceland",
+        ...         "Nauru",
+        ...         "Tuvalu",
+        ...         "Anguilla",
+        ...     ],
+        ... )
+        >>> df
+                  population      GDP alpha-2
+        Italy       59000000  1937894      IT
+        France      65000000  2583560      FR
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+        Iceland       337000    17036      IS
+        Nauru          11300      182      NR
+        Tuvalu         11300       38      TV
+        Anguilla       11300      311      AI
+
+        In the following example, we will use ``nlargest`` to select the three
+        rows having the largest values in column "population".
+
+        >>> df.nlargest(3, "population")
+                population      GDP alpha-2
+        France    65000000  2583560      FR
+        Italy     59000000  1937894      IT
+        Malta       434000    12011      MT
+
+        When using ``keep='last'``, ties are resolved in reverse order:
+
+        >>> df.nlargest(3, "population", keep="last")
+                population      GDP alpha-2
+        France    65000000  2583560      FR
+        Italy     59000000  1937894      IT
+        Brunei      434000    12128      BN
+
+        When using ``keep='all'``, the number of element kept can go beyond ``n``
+        if there are duplicate values for the smallest element, all the
+        ties are kept:
+
+        >>> df.nlargest(3, "population", keep="all")
+                  population      GDP alpha-2
+        France      65000000  2583560      FR
+        Italy       59000000  1937894      IT
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+
+        However, ``nlargest`` does not keep ``n`` distinct largest elements:
+
+        >>> df.nlargest(5, "population", keep="all")
+                  population      GDP alpha-2
+        France      65000000  2583560      FR
+        Italy       59000000  1937894      IT
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+
+        To order by the largest values in column "population" and then "GDP",
+        we can specify multiple columns like in the next example.
+
+        >>> df.nlargest(3, ["population", "GDP"])
+                population      GDP alpha-2
+        France    65000000  2583560      FR
+        Italy     59000000  1937894      IT
+        Brunei      434000    12128      BN
+        """
+        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
+
+    def nsmallest(
+        self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
+    ) -> DataFrame:
+        """
+        Return the first `n` rows ordered by `columns` in ascending order.
+
+        Return the first `n` rows with the smallest values in `columns`, in
+        ascending order. The columns that are not specified are returned as
+        well, but not used for ordering.
+
+        This method is equivalent to
+        ``df.sort_values(columns, ascending=True).head(n)``, but more
+        performant.
+
+        Parameters
+        ----------
+        n : int
+            Number of items to retrieve.
+        columns : list or str
+            Column name or names to order by.
+        keep : {'first', 'last', 'all'}, default 'first'
+            Where there are duplicate values:
+
+            - ``first`` : take the first occurrence.
+            - ``last`` : take the last occurrence.
+            - ``all`` : keep all the ties of the largest item even if it means
+              selecting more than ``n`` items.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with the first `n` rows ordered by `columns` in ascending order.
+
+        See Also
+        --------
+        DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
+            descending order.
+        DataFrame.sort_values : Sort DataFrame by the values.
+        DataFrame.head : Return the first `n` rows without re-ordering.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "population": [
+        ...             59000000,
+        ...             65000000,
+        ...             434000,
+        ...             434000,
+        ...             434000,
+        ...             337000,
+        ...             337000,
+        ...             11300,
+        ...             11300,
+        ...         ],
+        ...         "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
+        ...         "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
+        ...     },
+        ...     index=[
+        ...         "Italy",
+        ...         "France",
+        ...         "Malta",
+        ...         "Maldives",
+        ...         "Brunei",
+        ...         "Iceland",
+        ...         "Nauru",
+        ...         "Tuvalu",
+        ...         "Anguilla",
+        ...     ],
+        ... )
+        >>> df
+                  population      GDP alpha-2
+        Italy       59000000  1937894      IT
+        France      65000000  2583560      FR
+        Malta         434000    12011      MT
+        Maldives      434000     4520      MV
+        Brunei        434000    12128      BN
+        Iceland       337000    17036      IS
+        Nauru         337000      182      NR
+        Tuvalu         11300       38      TV
+        Anguilla       11300      311      AI
+
+        In the following example, we will use ``nsmallest`` to select the
+        three rows having the smallest values in column "population".
+
+        >>> df.nsmallest(3, "population")
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+
+        When using ``keep='last'``, ties are resolved in reverse order:
+
+        >>> df.nsmallest(3, "population", keep="last")
+                  population  GDP alpha-2
+        Anguilla       11300  311      AI
+        Tuvalu         11300   38      TV
+        Nauru         337000  182      NR
+
+        When using ``keep='all'``, the number of element kept can go beyond ``n``
+        if there are duplicate values for the largest element, all the
+        ties are kept.
+
+        >>> df.nsmallest(3, "population", keep="all")
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+        Nauru         337000    182      NR
+
+        However, ``nsmallest`` does not keep ``n`` distinct
+        smallest elements:
+
+        >>> df.nsmallest(4, "population", keep="all")
+                  population    GDP alpha-2
+        Tuvalu         11300     38      TV
+        Anguilla       11300    311      AI
+        Iceland       337000  17036      IS
+        Nauru         337000    182      NR
+
+        To order by the smallest values in column "population" and then "GDP", we can
+        specify multiple columns like in the next example.
+
+        >>> df.nsmallest(3, ["population", "GDP"])
+                  population  GDP alpha-2
+        Tuvalu         11300   38      TV
+        Anguilla       11300  311      AI
+        Nauru         337000  182      NR
+        """
+        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
+
+    def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
+        """
+        Swap levels i and j in a :class:`MultiIndex`.
+
+        Default is to swap the two innermost levels of the index.
+
+        Parameters
+        ----------
+        i, j : int or str
+            Levels of the indices to be swapped. Can pass level name as string.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+                    The axis to swap levels on. 0 or 'index' for row-wise, 1 or
+                    'columns' for column-wise.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with levels swapped in MultiIndex.
+
+        See Also
+        --------
+        DataFrame.reorder_levels: Reorder levels of MultiIndex.
+        DataFrame.sort_index: Sort MultiIndex.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"Grade": ["A", "B", "A", "C"]},
+        ...     index=[
+        ...         ["Final exam", "Final exam", "Coursework", "Coursework"],
+        ...         ["History", "Geography", "History", "Geography"],
+        ...         ["January", "February", "March", "April"],
+        ...     ],
+        ... )
+        >>> df
+                                            Grade
+        Final exam  History     January      A
+                    Geography   February     B
+        Coursework  History     March        A
+                    Geography   April        C
+
+        In the following example, we will swap the levels of the indices.
+        Here, we will swap the levels column-wise, but levels can be swapped row-wise
+        in a similar manner. Note that column-wise is the default behaviour.
+        By not supplying any arguments for i and j, we swap the last and second to
+        last indices.
+
+        >>> df.swaplevel()
+                                            Grade
+        Final exam  January     History         A
+                    February    Geography       B
+        Coursework  March       History         A
+                    April       Geography       C
+
+        By supplying one argument, we can choose which index to swap the last
+        index with. We can for example swap the first index with the last one as
+        follows.
+
+        >>> df.swaplevel(0)
+                                            Grade
+        January     History     Final exam      A
+        February    Geography   Final exam      B
+        March       History     Coursework      A
+        April       Geography   Coursework      C
+
+        We can also define explicitly which indices we want to swap by supplying values
+        for both i and j. Here, we for example swap the first and second indices.
+
+        >>> df.swaplevel(0, 1)
+                                            Grade
+        History     Final exam  January         A
+        Geography   Final exam  February        B
+        History     Coursework  March           A
+        Geography   Coursework  April           C
+        """
+        result = self.copy(deep=False)
+
+        axis = self._get_axis_number(axis)
+
+        if not isinstance(result._get_axis(axis), MultiIndex):  # pragma: no cover
+            raise TypeError("Can only swap levels on a hierarchical axis.")
+
+        if axis == 0:
+            assert isinstance(result.index, MultiIndex)
+            result.index = result.index.swaplevel(i, j)
+        else:
+            assert isinstance(result.columns, MultiIndex)
+            result.columns = result.columns.swaplevel(i, j)
+        return result
+
+    def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
+        """
+        Rearrange index or column levels using input ``order``.
+
+        May not drop or duplicate levels.
+
+        Parameters
+        ----------
+        order : list of int or list of str
+            List representing new level order. Reference level by number
+            (position) or by key (label).
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Where to reorder levels.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with indices or columns with reordered levels.
+
+        See Also
+        --------
+            DataFrame.swaplevel : Swap levels i and j in a MultiIndex.
+
+        Examples
+        --------
+        >>> data = {
+        ...     "class": ["Mammals", "Mammals", "Reptiles"],
+        ...     "diet": ["Omnivore", "Carnivore", "Carnivore"],
+        ...     "species": ["Humans", "Dogs", "Snakes"],
+        ... }
+        >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
+        >>> df = df.set_index(["class", "diet"])
+        >>> df
+                                          species
+        class      diet
+        Mammals    Omnivore                Humans
+                   Carnivore                 Dogs
+        Reptiles   Carnivore               Snakes
+
+        Let's reorder the levels of the index:
+
+        >>> df.reorder_levels(["diet", "class"])
+                                          species
+        diet      class
+        Omnivore  Mammals                  Humans
+        Carnivore Mammals                    Dogs
+                  Reptiles                 Snakes
+        """
+        axis = self._get_axis_number(axis)
+        if not isinstance(self._get_axis(axis), MultiIndex):  # pragma: no cover
+            raise TypeError("Can only reorder levels on a hierarchical axis.")
+
+        result = self.copy(deep=False)
+
+        if axis == 0:
+            assert isinstance(result.index, MultiIndex)
+            result.index = result.index.reorder_levels(order)
+        else:
+            assert isinstance(result.columns, MultiIndex)
+            result.columns = result.columns.reorder_levels(order)
+        return result
+
+    # ----------------------------------------------------------------------
+    # Arithmetic Methods
+
+    def _cmp_method(self, other, op):
+        axis: Literal[1] = 1  # only relevant for Series other case
+
+        self, other = self._align_for_op(other, axis, flex=False, level=None)
+
+        # See GH#4537 for discussion of scalar op behavior
+        new_data = self._dispatch_frame_op(other, op, axis=axis)
+        return self._construct_result(new_data, other=other)
+
+    def _arith_method(self, other, op):
+        if self._should_reindex_frame_op(other, op, 1, None, None):
+            return self._arith_method_with_reindex(other, op)
+
+        axis: Literal[1] = 1  # only relevant for Series other case
+        other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
+
+        self, other = self._align_for_op(other, axis, flex=True, level=None)
+
+        with np.errstate(all="ignore"):
+            new_data = self._dispatch_frame_op(other, op, axis=axis)
+        return self._construct_result(new_data, other=other)
+
+    _logical_method = _arith_method
+
+    def _dispatch_frame_op(
+        self, right, func: Callable, axis: AxisInt | None = None
+    ) -> DataFrame:
+        """
+        Evaluate the frame operation func(left, right) by evaluating
+        column-by-column, dispatching to the Series implementation.
+
+        Parameters
+        ----------
+        right : scalar, Series, or DataFrame
+        func : arithmetic or comparison operator
+        axis : {None, 0, 1}
+
+        Returns
+        -------
+        DataFrame
+
+        Notes
+        -----
+        Caller is responsible for setting np.errstate where relevant.
+        """
+        # Get the appropriate array-op to apply to each column/block's values.
+        array_op = ops.get_array_op(func)
+
+        right = lib.item_from_zerodim(right)
+        if not is_list_like(right):
+            # i.e. scalar, faster than checking np.ndim(right) == 0
+            bm = self._mgr.apply(array_op, right=right)
+            return self._constructor_from_mgr(bm, axes=bm.axes)
+
+        elif isinstance(right, DataFrame):
+            assert self.index.equals(right.index)
+            assert self.columns.equals(right.columns)
+            # TODO: The previous assertion `assert right._indexed_same(self)`
+            #  fails in cases with empty columns reached via
+            #  _frame_arith_method_with_reindex
+
+            # TODO operate_blockwise expects a manager of the same type
+            bm = self._mgr.operate_blockwise(
+                right._mgr,
+                array_op,
+            )
+            return self._constructor_from_mgr(bm, axes=bm.axes)
+
+        elif isinstance(right, Series) and axis == 1:
+            # axis=1 means we want to operate row-by-row
+            assert right.index.equals(self.columns)
+
+            right = right._values
+            # maybe_align_as_frame ensures we do not have an ndarray here
+            assert not isinstance(right, np.ndarray)
+
+            arrays = [
+                array_op(_left, _right)
+                for _left, _right in zip(self._iter_column_arrays(), right, strict=True)
+            ]
+
+        elif isinstance(right, Series):
+            assert right.index.equals(self.index)
+            right = right._values
+
+            arrays = [array_op(left, right) for left in self._iter_column_arrays()]
+
+        else:
+            raise NotImplementedError(right)
+
+        return type(self)._from_arrays(
+            arrays, self.columns, self.index, verify_integrity=False
+        )
+
+    def _combine_frame(self, other: DataFrame, func, fill_value=None):
+        # at this point we have `self._indexed_same(other)`
+
+        if fill_value is None:
+            # since _arith_op may be called in a loop, avoid function call
+            #  overhead if possible by doing this check once
+            _arith_op = func
+
+        else:
+
+            def _arith_op(left, right):
+                # for the mixed_type case where we iterate over columns,
+                # _arith_op(left, right) is equivalent to
+                # left._binop(right, func, fill_value=fill_value)
+                left, right = ops.fill_binop(left, right, fill_value)
+                return func(left, right)
+
+        new_data = self._dispatch_frame_op(other, _arith_op)
+        return new_data
+
+    def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
+        """
+        For DataFrame-with-DataFrame operations that require reindexing,
+        operate only on shared columns, then reindex.
+
+        Parameters
+        ----------
+        right : DataFrame
+        op : binary operator
+
+        Returns
+        -------
+        DataFrame
+        """
+        left = self
+
+        # GH#31623, only operate on shared columns
+        cols, lcol_indexer, rcol_indexer = left.columns.join(
+            right.columns, how="inner", return_indexers=True
+        )
+
+        new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
+        new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
+
+        # GH#60498 For MultiIndex column alignment
+        if isinstance(cols, MultiIndex):
+            # When overwriting column names, make a shallow copy so as to not modify
+            # the input DFs
+            new_left = new_left.copy(deep=False)
+            new_right = new_right.copy(deep=False)
+            new_left.columns = cols
+            new_right.columns = cols
+
+        result = op(new_left, new_right)
+
+        # Do the join on the columns instead of using left._align_for_op
+        #  to avoid constructing two potentially large/sparse DataFrames
+        join_columns = left.columns.join(right.columns, how="outer")
+
+        if result.columns.has_duplicates:
+            # Avoid reindexing with a duplicate axis.
+            # https://github.com/pandas-dev/pandas/issues/35194
+            indexer, _ = result.columns.get_indexer_non_unique(join_columns)
+            indexer = algorithms.unique1d(indexer)
+            result = result._reindex_with_indexers(
+                {1: [join_columns, indexer]}, allow_dups=True
+            )
+        else:
+            result = result.reindex(join_columns, axis=1)
+
+        return result
+
+    def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
+        """
+        Check if this is an operation between DataFrames that will need to reindex.
+        """
+        if op is operator.pow or op is roperator.rpow:
+            # GH#32685 pow has special semantics for operating with null values
+            return False
+
+        if not isinstance(right, DataFrame):
+            return False
+
+        if (
+            (
+                isinstance(self.columns, MultiIndex)
+                or isinstance(right.columns, MultiIndex)
+            )
+            and not self.columns.equals(right.columns)
+            and fill_value is None
+        ):
+            # GH#60498 Reindex if MultiIndexe columns are not matching
+            # GH#60903 Don't reindex if fill_value is provided
+            return True
+
+        if fill_value is None and level is None and axis == 1:
+            # TODO: any other cases we should handle here?
+
+            # Intersection is always unique so we have to check the unique columns
+            left_uniques = self.columns.unique()
+            right_uniques = right.columns.unique()
+            cols = left_uniques.intersection(right_uniques)
+            if len(cols) and not (
+                len(cols) == len(left_uniques) and len(cols) == len(right_uniques)
+            ):
+                # TODO: is there a shortcut available when len(cols) == 0?
+                return True
+
+        return False
+
+    def _align_for_op(
+        self,
+        other,
+        axis: AxisInt,
+        flex: bool | None = False,
+        level: Level | None = None,
+    ):
+        """
+        Convert rhs to meet lhs dims if input is list, tuple or np.ndarray.
+
+        Parameters
+        ----------
+        other : Any
+        axis : int
+        flex : bool or None, default False
+            Whether this is a flex op, in which case we reindex.
+            None indicates not to check for alignment.
+        level : int or level name, default None
+
+        Returns
+        -------
+        left : DataFrame
+        right : Any
+        """
+        left, right = self, other
+
+        def to_series(right):
+            msg = (
+                "Unable to coerce to Series, "
+                "length must be {req_len}: given {given_len}"
+            )
+
+            # pass dtype to avoid doing inference, which would break consistency
+            #  with Index/Series ops
+            dtype = None
+            if getattr(right, "dtype", None) == object:
+                # can't pass right.dtype unconditionally as that would break on e.g.
+                #  datetime64[h] ndarray
+                dtype = object
+
+            if axis == 0:
+                if len(left.index) != len(right):
+                    raise ValueError(
+                        msg.format(req_len=len(left.index), given_len=len(right))
+                    )
+                right = left._constructor_sliced(right, index=left.index, dtype=dtype)
+            else:
+                if len(left.columns) != len(right):
+                    raise ValueError(
+                        msg.format(req_len=len(left.columns), given_len=len(right))
+                    )
+                right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
+            return right
+
+        if isinstance(right, np.ndarray):
+            if right.ndim == 1:
+                right = to_series(right)
+
+            elif right.ndim == 2:
+                # We need to pass dtype=right.dtype to retain object dtype
+                #  otherwise we lose consistency with Index and array ops
+                dtype = None
+                if right.dtype == object:
+                    # can't pass right.dtype unconditionally as that would break on e.g.
+                    #  datetime64[h] ndarray
+                    dtype = object
+
+                if right.shape == left.shape:
+                    right = left._constructor(
+                        right, index=left.index, columns=left.columns, dtype=dtype
+                    )
+
+                elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
+                    # Broadcast across columns
+                    right = np.broadcast_to(right, left.shape)
+                    right = left._constructor(
+                        right, index=left.index, columns=left.columns, dtype=dtype
+                    )
+
+                elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
+                    # Broadcast along rows
+                    right = to_series(right[0, :])
+
+                else:
+                    raise ValueError(
+                        "Unable to coerce to DataFrame, shape "
+                        f"must be {left.shape}: given {right.shape}"
+                    )
+
+            elif right.ndim > 2:
+                raise ValueError(
+                    "Unable to coerce to Series/DataFrame, "
+                    f"dimension must be <= 2: {right.shape}"
+                )
+
+        elif is_list_like(right) and not isinstance(right, (Series, DataFrame)):
+            # GH#36702. Raise when attempting arithmetic with list of array-like.
+            if any(is_array_like(el) for el in right):
+                raise ValueError(
+                    f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
+                )
+            # GH#17901
+            right = to_series(right)
+
+        if flex is not None and isinstance(right, DataFrame):
+            if not left._indexed_same(right):
+                if flex:
+                    left, right = left.align(right, join="outer", level=level)
+                else:
+                    raise ValueError(
+                        "Can only compare identically-labeled (both index and columns) "
+                        "DataFrame objects"
+                    )
+        elif isinstance(right, Series):
+            # axis=1 is default for DataFrame-with-Series op
+            axis = axis if axis is not None else 1
+            if not flex:
+                if not left.axes[axis].equals(right.index):
+                    raise ValueError(
+                        "Operands are not aligned. Do "
+                        "`left, right = left.align(right, axis=1)` "
+                        "before operating."
+                    )
+
+            left, right = left.align(
+                right,
+                join="outer",
+                axis=axis,
+                level=level,
+            )
+            right = left._maybe_align_series_as_frame(right, axis)
+        return left, right
+
+    def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
+        """
+        If the Series operand is not EA-dtype, we can broadcast to 2D and operate
+        blockwise.
+        """
+        rvalues = series._values
+        if not isinstance(rvalues, np.ndarray):
+            # TODO(EA2D): no need to special-case with 2D EAs
+            if lib.is_np_dtype(rvalues.dtype, "mM"):
+                # i.e. DatetimeArray[tznaive] or TimedeltaArray
+                # We can losslessly+cheaply cast to ndarray
+                rvalues = np.asarray(rvalues)
+            else:
+                return series
+
+        if axis == 0:
+            rvalues = rvalues.reshape(-1, 1)
+        else:
+            rvalues = rvalues.reshape(1, -1)
+
+        rvalues = np.broadcast_to(rvalues, self.shape)
+        # pass dtype to avoid doing inference
+        return self._constructor(
+            rvalues,
+            index=self.index,
+            columns=self.columns,
+            dtype=rvalues.dtype,
+        ).__finalize__(series)
+
+    def _flex_arith_method(
+        self, other, op, *, axis: Axis = "columns", level=None, fill_value=None
+    ):
+        axis = self._get_axis_number(axis) if axis is not None else 1
+
+        if self._should_reindex_frame_op(other, op, axis, fill_value, level):
+            return self._arith_method_with_reindex(other, op)
+
+        if isinstance(other, Series) and fill_value is not None:
+            # TODO: We could allow this in cases where we end up going
+            #  through the DataFrame path
+            raise NotImplementedError(f"fill_value {fill_value} not supported.")
+
+        other = ops.maybe_prepare_scalar_for_op(other, self.shape)
+        self, other = self._align_for_op(other, axis, flex=True, level=level)
+
+        with np.errstate(all="ignore"):
+            if isinstance(other, DataFrame):
+                # Another DataFrame
+                new_data = self._combine_frame(other, op, fill_value)
+
+            elif isinstance(other, Series):
+                new_data = self._dispatch_frame_op(other, op, axis=axis)
+            else:
+                # in this case we always have `np.ndim(other) == 0`
+                if fill_value is not None:
+                    self = self.fillna(fill_value)
+
+                new_data = self._dispatch_frame_op(other, op)
+
+        return self._construct_result(new_data, other=other)
+
+    def _construct_result(self, result, other) -> DataFrame:
+        """
+        Wrap the result of an arithmetic, comparison, or logical operation.
+
+        Parameters
+        ----------
+        result : DataFrame
+
+        Returns
+        -------
+        DataFrame
+        """
+        out = self._constructor(result, copy=False).__finalize__(self)
+        # Pin columns instead of passing to constructor for compat with
+        #  non-unique columns case
+        out.columns = self.columns
+        out.index = self.index
+        out = out.__finalize__(other)
+        return out
+
+    def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
+        # Naive implementation, room for optimization
+        div = self // other
+        mod = self - div * other
+        return div, mod
+
+    def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
+        # Naive implementation, room for optimization
+        div = other // self
+        mod = other - div * self
+        return div, mod
+
+    def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None):
+        axis = self._get_axis_number(axis) if axis is not None else 1
+
+        self, other = self._align_for_op(other, axis, flex=True, level=level)
+
+        new_data = self._dispatch_frame_op(other, op, axis=axis)
+        return self._construct_result(new_data, other=other)
+
+    def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame:
+        """
+        Get Not equal to of dataframe and other, element-wise (binary operator `eq`).
+
+        Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
+        operators.
+
+        Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis
+        (rows or columns) and level for comparison.
+
+        Parameters
+        ----------
+        other : scalar, sequence, Series, or DataFrame
+            Any single or multiple element data structure, or list-like object.
+        axis : {0 or 'index', 1 or 'columns'}, default 'columns'
+            Whether to compare by the index (0 or 'index') or columns
+            (1 or 'columns').
+        level : int or label
+            Broadcast across a level, matching Index values on the passed
+            MultiIndex level.
+
+        Returns
+        -------
+        DataFrame of bool
+            Result of the comparison.
+
+        See Also
+        --------
+        DataFrame.eq : Compare DataFrames for equality elementwise.
+        DataFrame.ne : Compare DataFrames for inequality elementwise.
+        DataFrame.le : Compare DataFrames for less than inequality
+            or equality elementwise.
+        DataFrame.lt : Compare DataFrames for strictly less than
+            inequality elementwise.
+        DataFrame.ge : Compare DataFrames for greater than inequality
+            or equality elementwise.
+        DataFrame.gt : Compare DataFrames for strictly greater than
+            inequality elementwise.
+
+        Notes
+        -----
+        Mismatched indices will be unioned together.
+        `NaN` values are considered different (i.e. `NaN` != `NaN`).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"cost": [250, 150, 100], "revenue": [100, 250, 300]},
+        ...     index=["A", "B", "C"],
+        ... )
+        >>> df
+           cost  revenue
+        A   250      100
+        B   150      250
+        C   100      300
+
+        Comparison with a scalar, using either the operator or method:
+
+        >>> df == 100
+            cost  revenue
+        A  False     True
+        B  False    False
+        C   True    False
+
+        >>> df.eq(100)
+            cost  revenue
+        A  False     True
+        B  False    False
+        C   True    False
+
+        When `other` is a :class:`Series`, the columns of a DataFrame are aligned
+        with the index of `other` and broadcast:
+
+        >>> df != pd.Series([100, 250], index=["cost", "revenue"])
+            cost  revenue
+        A   True     True
+        B   True    False
+        C  False     True
+
+        Use the method to control the broadcast axis:
+
+        >>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis="index")
+           cost  revenue
+        A  True    False
+        B  True     True
+        C  True     True
+        D  True     True
+
+        When comparing to an arbitrary sequence, the number of columns must
+        match the number elements in `other`:
+
+        >>> df == [250, 100]
+            cost  revenue
+        A   True     True
+        B  False    False
+        C  False    False
+
+        Use the method to control the axis:
+
+        >>> df.eq([250, 250, 100], axis="index")
+            cost  revenue
+        A   True    False
+        B  False     True
+        C   True    False
+
+        Compare to a DataFrame of different shape.
+
+        >>> other = pd.DataFrame(
+        ...     {"revenue": [300, 250, 100, 150]}, index=["A", "B", "C", "D"]
+        ... )
+        >>> other
+           revenue
+        A      300
+        B      250
+        C      100
+        D      150
+
+        >>> df.gt(other)
+            cost  revenue
+        A  False    False
+        B  False    False
+        C  False     True
+        D  False    False
+
+        Compare to a MultiIndex by level.
+
+        >>> df_multindex = pd.DataFrame(
+        ...     {
+        ...         "cost": [250, 150, 100, 150, 300, 220],
+        ...         "revenue": [100, 250, 300, 200, 175, 225],
+        ...     },
+        ...     index=[
+        ...         ["Q1", "Q1", "Q1", "Q2", "Q2", "Q2"],
+        ...         ["A", "B", "C", "A", "B", "C"],
+        ...     ],
+        ... )
+        >>> df_multindex
+              cost  revenue
+        Q1 A   250      100
+           B   150      250
+           C   100      300
+        Q2 A   150      200
+           B   300      175
+           C   220      225
+
+        >>> df.le(df_multindex, level=1)
+               cost  revenue
+        Q1 A   True     True
+           B   True     True
+           C   True     True
+        Q2 A  False     True
+           B   True    False
+           C   True    False
+        """
+        return self._flex_cmp_method(other, operator.eq, axis=axis, level=level)
+
+    def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame:
+        """
+        Get Not equal to of dataframe and other, element-wise (binary operator `ne`).
+
+        Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison
+        operators.
+
+        Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis
+        (rows or columns) and level for comparison.
+
+        Parameters
+        ----------
+        other : scalar, sequence, Series, or DataFrame
+            Any single or multiple element data structure, or list-like object.
+        axis : {0 or 'index', 1 or 'columns'}, default 'columns'
+            Whether to compare by the index (0 or 'index') or columns
+            (1 or 'columns').
+        level : int or label
+            Broadcast across a level, matching Index values on the passed
+            MultiIndex level.
+
+        Returns
+        -------
+        DataFrame of bool
+            Result of the comparison.
+
+        See Also
+        --------
+        DataFrame.eq : Compare DataFrames for equality elementwise.
+        DataFrame.ne : Compare DataFrames for inequality elementwise.
+        DataFrame.le : Compare DataFrames for less than inequality
+            or equality elementwise.
+        DataFrame.lt : Compare DataFrames for strictly less than
+            inequality elementwise.
+        DataFrame.ge : Compare DataFrames for greater than inequality
+            or equality elementwise.
+        DataFrame.gt : Compare DataFrames for strictly greater than
+            inequality elementwise.
+
+        Notes
+        -----
+        Mismatched indices will be unioned together.
+        `NaN` values are considered different (i.e. `NaN` != `NaN`).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"cost": [250, 150, 100], "revenue": [100, 250, 300]},
+        ...     index=["A", "B", "C"],
+        ... )
+        >>> df
+           cost  revenue
+        A   250      100
+        B   150      250
+        C   100      300
+
+        Comparison with a scalar, using either the operator or method:
+
+        >>> df == 100
+            cost  revenue
+        A  False     True
+        B  False    False
+        C   True    False
+
+        >>> df.eq(100)
+            cost  revenue
+        A  False     True
+        B  False    False
+        C   True    False
+
+        When `other` is a :class:`Series`, the columns of a DataFrame are aligned
+        with the index of `other` and broadcast:
+
+        >>> df != pd.Series([100, 250], index=["cost", "revenue"])
+            cost  revenue
+        A   True     True
+        B   True    False
+        C  False     True
+
+        Use the method to control the broadcast axis:
+
+        >>> df.ne(pd.Series([100, 300], index=["A", "D"]), axis="index")
+           cost  revenue
+        A  True    False
+        B  True     True
+        C  True     True
+        D  True     True
+
+        When comparing to an arbitrary sequence, the number of columns must
+        match the number elements in `other`:
+
+        >>> df == [250, 100]
+            cost  revenue
+        A   True     True
+        B  False    False
+        C  False    False
+
+        Use the method to control the axis:
+
+        >>> df.eq([250, 250, 100], axis="index")
+            cost  revenue
+        A   True    False
+        B  False     True
+        C   True    False
+
+        Compare to a DataFrame of different shape.
+
+        >>> other = pd.DataFrame(
+        ...     {"revenue": [300, 250, 100, 150]}, index=["A", "B", "C", "D"]
+        ... )
+        >>> other
+           revenue
+        A      300
+        B      250
+        C      100
+        D      150
+
+        >>> df.gt(other)
+            cost  revenue
+        A  False    False
+        B  False    False
+        C  False     True
+        D  False    False
+
+        Compare to a MultiIndex by level.
+
+        >>> df_multindex = pd.DataFrame(
+        ...     {
+        ...         "cost": [250, 150, 100, 150, 300, 220],
+        ...         "revenue": [100, 250, 300, 200, 175, 225],
+        ...     },
+        ...     index=[
+        ...         ["Q1", "Q1", "Q1", "Q2", "Q2", "Q2"],
+        ...         ["A", "B", "C", "A", "B", "C"],
+        ...     ],
+        ... )
+        >>> df_multindex
+              cost  revenue
+        Q1 A   250      100
+           B   150      250
+           C   100      300
+        Q2 A   150      200
+           B   300      175
+           C   220      225
+
+        >>> df.le(df_multindex, level=1)
+               cost  revenue
+        Q1 A   True     True
+           B   True     True
+           C   True     True
+        Q2 A  False     True
+           B   True    False
+           C   True    False
+        """
+        return self._flex_cmp_method(other, operator.ne, axis=axis, level=level)
+
+    @Appender(ops.make_flex_doc("le", "dataframe"))
+    def le(self, other, axis: Axis = "columns", level=None) -> DataFrame:
+        return self._flex_cmp_method(other, operator.le, axis=axis, level=level)
+
+    @Appender(ops.make_flex_doc("lt", "dataframe"))
+    def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
+        return self._flex_cmp_method(other, operator.lt, axis=axis, level=level)
+
+    @Appender(ops.make_flex_doc("ge", "dataframe"))
+    def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame:
+        return self._flex_cmp_method(other, operator.ge, axis=axis, level=level)
+
+    @Appender(ops.make_flex_doc("gt", "dataframe"))
+    def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
+        return self._flex_cmp_method(other, operator.gt, axis=axis, level=level)
+
+    @Appender(ops.make_flex_doc("add", "dataframe"))
+    def add(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.add, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("radd", "dataframe"))
+    def radd(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.radd, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("sub", "dataframe"))
+    def sub(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.sub, level=level, fill_value=fill_value, axis=axis
+        )
+
+    subtract = sub
+
+    @Appender(ops.make_flex_doc("rsub", "dataframe"))
+    def rsub(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rsub, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("mul", "dataframe"))
+    def mul(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.mul, level=level, fill_value=fill_value, axis=axis
+        )
+
+    multiply = mul
+
+    @Appender(ops.make_flex_doc("rmul", "dataframe"))
+    def rmul(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("truediv", "dataframe"))
+    def truediv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.truediv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    div = truediv
+    divide = truediv
+
+    @Appender(ops.make_flex_doc("rtruediv", "dataframe"))
+    def rtruediv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    rdiv = rtruediv
+
+    @Appender(ops.make_flex_doc("floordiv", "dataframe"))
+    def floordiv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rfloordiv", "dataframe"))
+    def rfloordiv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("mod", "dataframe"))
+    def mod(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.mod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rmod", "dataframe"))
+    def rmod(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rmod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("pow", "dataframe"))
+    def pow(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.pow, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rpow", "dataframe"))
+    def rpow(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rpow, level=level, fill_value=fill_value, axis=axis
+        )
+
+    # ----------------------------------------------------------------------
+    # Combination-Related
+
+    def compare(
+        self,
+        other: DataFrame,
+        align_axis: Axis = 1,
+        keep_shape: bool = False,
+        keep_equal: bool = False,
+        result_names: Suffixes = ("self", "other"),
+    ) -> DataFrame:
+        """
+        Compare to another DataFrame and show the differences.
+
+        Parameters
+        ----------
+        other : DataFrame
+            Object to compare with.
+
+        align_axis : {0 or 'index', 1 or 'columns'}, default 1
+            Determine which axis to align the comparison on.
+
+            * 0, or 'index' : Resulting differences are stacked vertically
+              with rows drawn alternately from self and other.
+            * 1, or 'columns' : Resulting differences are aligned horizontally
+              with columns drawn alternately from self and other.
+
+        keep_shape : bool, default False
+            If true, all rows and columns are kept.
+            Otherwise, only the ones with different values are kept.
+
+        keep_equal : bool, default False
+            If true, the result keeps values that are equal.
+            Otherwise, equal values are shown as NaNs.
+
+        result_names : tuple, default ('self', 'other')
+            Set the dataframes names in the comparison.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame that shows the differences stacked side by side.
+
+            The resulting index will be a MultiIndex with 'self' and 'other'
+            stacked alternately at the inner level.
+
+        Raises
+        ------
+        ValueError
+            When the two DataFrames don't have identical labels or shape.
+
+        See Also
+        --------
+        Series.compare : Compare with another Series and show differences.
+        DataFrame.equals : Test whether two objects contain the same elements.
+
+        Notes
+        -----
+        Matching NaNs will not appear as a difference.
+
+        Can only compare identically-labeled
+        (i.e. same shape, identical row and column labels) DataFrames
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "col1": ["a", "a", "b", "b", "a"],
+        ...         "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
+        ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0],
+        ...     },
+        ...     columns=["col1", "col2", "col3"],
+        ... )
+        >>> df
+          col1  col2  col3
+        0    a   1.0   1.0
+        1    a   2.0   2.0
+        2    b   3.0   3.0
+        3    b   NaN   4.0
+        4    a   5.0   5.0
+
+        >>> df2 = df.copy()
+        >>> df2.loc[0, "col1"] = "c"
+        >>> df2.loc[2, "col3"] = 4.0
+        >>> df2
+          col1  col2  col3
+        0    c   1.0   1.0
+        1    a   2.0   2.0
+        2    b   3.0   4.0
+        3    b   NaN   4.0
+        4    a   5.0   5.0
+
+        Align the differences on columns
+
+        >>> df.compare(df2)
+          col1       col3
+          self other self other
+        0    a     c  NaN   NaN
+        2  NaN   NaN  3.0   4.0
+
+        Assign result_names
+
+        >>> df.compare(df2, result_names=("left", "right"))
+          col1       col3
+          left right left right
+        0    a     c  NaN   NaN
+        2  NaN   NaN  3.0   4.0
+
+        Stack the differences on rows
+
+        >>> df.compare(df2, align_axis=0)
+                col1  col3
+        0 self     a   NaN
+          other    c   NaN
+        2 self   NaN   3.0
+          other  NaN   4.0
+
+        Keep the equal values
+
+        >>> df.compare(df2, keep_equal=True)
+          col1       col3
+          self other self other
+        0    a     c  1.0   1.0
+        2    b     b  3.0   4.0
+
+        Keep all original rows and columns
+
+        >>> df.compare(df2, keep_shape=True)
+          col1       col2       col3
+          self other self other self other
+        0    a     c  NaN   NaN  NaN   NaN
+        1  NaN   NaN  NaN   NaN  NaN   NaN
+        2  NaN   NaN  NaN   NaN  3.0   4.0
+        3  NaN   NaN  NaN   NaN  NaN   NaN
+        4  NaN   NaN  NaN   NaN  NaN   NaN
+
+        Keep all original rows and columns and also all original values
+
+        >>> df.compare(df2, keep_shape=True, keep_equal=True)
+          col1       col2       col3
+          self other self other self other
+        0    a     c  1.0   1.0  1.0   1.0
+        1    a     a  2.0   2.0  2.0   2.0
+        2    b     b  3.0   3.0  3.0   4.0
+        3    b     b  NaN   NaN  4.0   4.0
+        4    a     a  5.0   5.0  5.0   5.0
+        """
+        return super().compare(
+            other=other,
+            align_axis=align_axis,
+            keep_shape=keep_shape,
+            keep_equal=keep_equal,
+            result_names=result_names,
+        )
+
+    def combine(
+        self,
+        other: DataFrame,
+        func: Callable[[Series, Series], Series | Hashable],
+        fill_value=None,
+        overwrite: bool = True,
+    ) -> DataFrame:
+        """
+        Perform column-wise combine with another DataFrame.
+
+        Combines a DataFrame with `other` DataFrame using `func`
+        to element-wise combine columns. The row and column indexes of the
+        resulting DataFrame will be the union of the two.
+
+        Parameters
+        ----------
+        other : DataFrame
+            The DataFrame to merge column-wise.
+        func : function
+            Function that takes two series as inputs and return a Series or a
+            scalar. Used to merge the two dataframes column by columns.
+        fill_value : scalar value, default None
+            The value to fill NaNs with prior to passing any column to the
+            merge func.
+        overwrite : bool, default True
+            If True, columns in `self` that do not exist in `other` will be
+            overwritten with NaNs.
+
+        Returns
+        -------
+        DataFrame
+            Combination of the provided DataFrames.
+
+        See Also
+        --------
+        DataFrame.combine_first : Combine two DataFrame objects and default to
+            non-null values in frame calling the method.
+
+        Examples
+        --------
+        Combine using a simple function that chooses the smaller column.
+
+        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]})
+        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
+        >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
+        >>> df1.combine(df2, take_smaller)
+           A  B
+        0  0  3
+        1  0  3
+
+        Example using a true element-wise combine function.
+
+        >>> df1 = pd.DataFrame({"A": [5, 0], "B": [2, 4]})
+        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
+        >>> df1.combine(df2, np.minimum)
+           A  B
+        0  1  2
+        1  0  3
+
+        Using `fill_value` fills Nones prior to passing the column to the
+        merge function.
+
+        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]})
+        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
+        >>> df1.combine(df2, take_smaller, fill_value=-5)
+           A    B
+        0  0 -5.0
+        1  0  4.0
+
+        Example that demonstrates the use of `overwrite` and behavior when
+        the axis differ between the dataframes.
+
+        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]})
+        >>> df2 = pd.DataFrame(
+        ...     {
+        ...         "B": [3, 3],
+        ...         "C": [-10, 1],
+        ...     },
+        ...     index=[1, 2],
+        ... )
+        >>> df1.combine(df2, take_smaller)
+             A    B     C
+        0  NaN  NaN   NaN
+        1  NaN  3.0 -10.0
+        2  NaN  3.0   1.0
+
+        >>> df1.combine(df2, take_smaller, overwrite=False)
+             A    B     C
+        0  0.0  NaN   NaN
+        1  0.0  3.0 -10.0
+        2  NaN  3.0   1.0
+
+        Demonstrating the preference of the passed in dataframe.
+
+        >>> df2 = pd.DataFrame(
+        ...     {
+        ...         "B": [3, 3],
+        ...         "C": [1, 1],
+        ...     },
+        ...     index=[1, 2],
+        ... )
+        >>> df2.combine(df1, take_smaller)
+             B    C   A
+        0  NaN  NaN 0.0
+        1  3.0  NaN 0.0
+        2  3.0  NaN NaN
+
+        >>> df2.combine(df1, take_smaller, overwrite=False)
+             B    C   A
+        0  NaN  NaN 0.0
+        1  3.0  1.0 0.0
+        2  3.0  1.0 NaN
+        """
+        other_idxlen = len(other.index)  # save for compare
+        other_columns = other.columns
+
+        this, other = self.align(other)
+        new_index = this.index
+
+        if other.empty and len(new_index) == len(self.index):
+            return self.copy()
+
+        if self.empty and len(other) == other_idxlen:
+            return other.copy()
+
+        # preserve column order
+        new_columns = self.columns.union(other_columns, sort=False)
+        this = this.reindex(new_columns, axis=1)
+        other = other.reindex(new_columns, axis=1)
+
+        do_fill = fill_value is not None
+        result = {}
+        for i in range(this.shape[1]):
+            series = this.iloc[:, i]
+            other_series = other.iloc[:, i]
+
+            this_dtype = series.dtype
+            other_dtype = other_series.dtype
+
+            this_mask = isna(series)
+            other_mask = isna(other_series)
+
+            # don't overwrite columns unnecessarily
+            # DO propagate if this column is not in the intersection
+            if not overwrite and other_mask.all():
+                result[i] = series.copy()
+                continue
+
+            if do_fill:
+                series = series.copy()
+                other_series = other_series.copy()
+                series[this_mask] = fill_value
+                other_series[other_mask] = fill_value
+
+            if new_columns[i] not in self.columns:
+                # If self DataFrame does not have col in other DataFrame,
+                # try to promote series, which is all NaN, as other_dtype.
+                new_dtype = other_dtype
+                try:
+                    series = series.astype(new_dtype)
+                except ValueError:
+                    # e.g. new_dtype is integer types
+                    pass
+            else:
+                # if we have different dtypes, possibly promote
+                new_dtype = find_common_type([this_dtype, other_dtype])
+                series = series.astype(new_dtype)
+                other_series = other_series.astype(new_dtype)
+
+            arr = func(series, other_series)
+            if isinstance(new_dtype, np.dtype):
+                # if new_dtype is an EA Dtype, then `func` is expected to return
+                # the correct dtype without any additional casting
+                # error: No overload variant of "maybe_downcast_to_dtype" matches
+                # argument types "Union[Series, Hashable]", "dtype[Any]"
+                arr = maybe_downcast_to_dtype(  # type: ignore[call-overload]
+                    arr, new_dtype
+                )
+
+            result[i] = arr
+
+        frame_result = self._constructor(result, index=new_index)
+        frame_result.columns = new_columns
+        return frame_result.__finalize__(self, method="combine")
+
+    def combine_first(self, other: DataFrame) -> DataFrame:
+        """
+        Update null elements with value in the same location in `other`.
+
+        Combine two DataFrame objects by filling null values in one DataFrame
+        with non-null values from other DataFrame. The row and column indexes
+        of the resulting DataFrame will be the union of the two. The resulting
+        dataframe contains the 'first' dataframe values and overrides the
+        second one values where both first.loc[index, col] and
+        second.loc[index, col] are not missing values, upon calling
+        first.combine_first(second).
+
+        Parameters
+        ----------
+        other : DataFrame
+            Provided DataFrame to use to fill null values.
+
+        Returns
+        -------
+        DataFrame
+            The result of combining the provided DataFrame with the other object.
+
+        See Also
+        --------
+        DataFrame.combine : Perform series-wise operation on two DataFrames
+            using a given function.
+
+        Examples
+        --------
+        >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]})
+        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
+        >>> df1.combine_first(df2)
+             A    B
+        0  1.0  3.0
+        1  0.0  4.0
+
+        Null values still persist if the location of that null value
+        does not exist in `other`
+
+        >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]})
+        >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2])
+        >>> df1.combine_first(df2)
+             A    B    C
+        0  NaN  4.0  NaN
+        1  0.0  3.0  1.0
+        2  NaN  3.0  1.0
+        """
+
+        def combiner(x: Series, y: Series):
+            # GH#60128 The combiner is supposed to preserve EA Dtypes.
+            return y if y.name not in self.columns else y.where(x.isna(), x)
+
+        if len(other) == 0:
+            combined = self.reindex(
+                self.columns.append(other.columns.difference(self.columns)), axis=1
+            )
+            combined = combined.astype(other.dtypes)
+        else:
+            combined = self.combine(other, combiner, overwrite=False)
+
+        dtypes = {
+            # Check for isinstance(..., (np.dtype, ExtensionDtype))
+            #  to prevent raising on non-unique columns see GH#29135.
+            #  Note we will just not-cast in these cases.
+            col: find_common_type([self.dtypes[col], other.dtypes[col]])
+            for col in self.columns.intersection(other.columns)
+            if isinstance(combined.dtypes[col], (np.dtype, ExtensionDtype))
+            and isinstance(self.dtypes[col], (np.dtype, ExtensionDtype))
+            and combined.dtypes[col] != self.dtypes[col]
+        }
+
+        if dtypes:
+            combined = combined.astype(dtypes)
+
+        return combined.__finalize__(self, method="combine_first")
+
+    def update(
+        self,
+        other,
+        join: UpdateJoin = "left",
+        overwrite: bool = True,
+        filter_func=None,
+        errors: IgnoreRaise = "ignore",
+    ) -> None:
+        """
+        Modify in place using non-NA values from another DataFrame.
+
+        Aligns on indices. There is no return value.
+
+        Parameters
+        ----------
+        other : DataFrame, or object coercible into a DataFrame
+            Should have at least one matching index/column label
+            with the original DataFrame. If a Series is passed,
+            its name attribute must be set, and that will be
+            used as the column name to align with the original DataFrame.
+        join : {'left'}, default 'left'
+            Only left join is implemented, keeping the index and columns of the
+            original object.
+        overwrite : bool, default True
+            How to handle non-NA values for overlapping keys:
+
+            * True: overwrite original DataFrame's values
+              with values from `other`.
+            * False: only update values that are NA in
+              the original DataFrame.
+
+        filter_func : callable(1d-array) -> bool 1d-array, optional
+            Can choose to replace values other than NA. Return True for values
+            that should be updated.
+        errors : {'raise', 'ignore'}, default 'ignore'
+            If 'raise', will raise a ValueError if the DataFrame and `other`
+            both contain non-NA data in the same place.
+
+        Returns
+        -------
+        None
+            This method directly changes calling object.
+
+        Raises
+        ------
+        ValueError
+            * When `errors='raise'` and there's overlapping non-NA data.
+            * When `errors` is not either `'ignore'` or `'raise'`
+        NotImplementedError
+            * If `join != 'left'`
+
+        See Also
+        --------
+        dict.update : Similar method for dictionaries.
+        DataFrame.merge : For column(s)-on-column(s) operations.
+
+        Notes
+        -----
+        1. Duplicate indices on `other` are not supported and raises `ValueError`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]})
+        >>> new_df = pd.DataFrame({"B": [4, 5, 6], "C": [7, 8, 9]})
+        >>> df.update(new_df)
+        >>> df
+           A  B
+        0  1  4
+        1  2  5
+        2  3  6
+
+        The DataFrame's length does not increase as a result of the update,
+        only values at matching index/column labels are updated.
+
+        >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]})
+        >>> new_df = pd.DataFrame({"B": ["d", "e", "f", "g", "h", "i"]})
+        >>> df.update(new_df)
+        >>> df
+           A  B
+        0  a  d
+        1  b  e
+        2  c  f
+
+        >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]})
+        >>> new_df = pd.DataFrame({"B": ["d", "f"]}, index=[0, 2])
+        >>> df.update(new_df)
+        >>> df
+           A  B
+        0  a  d
+        1  b  y
+        2  c  f
+
+        For Series, its name attribute must be set.
+
+        >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]})
+        >>> new_column = pd.Series(["d", "e", "f"], name="B")
+        >>> df.update(new_column)
+        >>> df
+           A  B
+        0  a  d
+        1  b  e
+        2  c  f
+
+        If `other` contains NaNs the corresponding values are not updated
+        in the original dataframe.
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400.0, 500.0, 600.0]})
+        >>> new_df = pd.DataFrame({"B": [4, np.nan, 6]})
+        >>> df.update(new_df)
+        >>> df
+           A      B
+        0  1    4.0
+        1  2  500.0
+        2  3    6.0
+        """
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(
+                self
+            ) <= REF_COUNT_METHOD and not com.is_local_in_caller_frame(self):
+                warnings.warn(
+                    _chained_assignment_method_update_msg,
+                    ChainedAssignmentError,
+                    stacklevel=2,
+                )
+
+        # TODO: Support other joins
+        if join != "left":  # pragma: no cover
+            raise NotImplementedError("Only left join is supported")
+        if errors not in ["ignore", "raise"]:
+            raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
+
+        if not isinstance(other, DataFrame):
+            other = DataFrame(other)
+
+        if other.index.has_duplicates:
+            raise ValueError("Update not allowed with duplicate indexes on other.")
+
+        index_intersection = other.index.intersection(self.index)
+        if index_intersection.empty:
+            return
+        other = other.reindex(index_intersection)
+        this_data = self.loc[index_intersection]
+
+        for col in self.columns.intersection(other.columns):
+            this = this_data[col]
+            that = other[col]
+
+            if filter_func is not None:
+                mask = ~filter_func(this) | isna(that)
+            else:
+                if errors == "raise":
+                    mask_this = notna(that)
+                    mask_that = notna(this)
+                    if any(mask_this & mask_that):
+                        raise ValueError("Data overlaps.")
+
+                if overwrite:
+                    mask = isna(that)
+                else:
+                    mask = notna(this)
+
+            # don't overwrite columns unnecessarily
+            if mask.all():
+                continue
+
+            self.loc[index_intersection, col] = this.where(mask, that)
+
+    # ----------------------------------------------------------------------
+    # Data reshaping
+    @deprecate_nonkeyword_arguments(
+        Pandas4Warning, allowed_args=["self", "by", "level"], name="groupby"
+    )
+    def groupby(
+        self,
+        by=None,
+        level: IndexLabel | None = None,
+        as_index: bool = True,
+        sort: bool = True,
+        group_keys: bool = True,
+        observed: bool = True,
+        dropna: bool = True,
+    ) -> DataFrameGroupBy:
+        """
+        Group DataFrame using a mapper or by a Series of columns.
+
+        A groupby operation involves some combination of splitting the
+        object, applying a function, and combining the results. This can be
+        used to group large amounts of data and compute operations on these
+        groups.
+
+        Parameters
+        ----------
+        by : mapping, function, label, pd.Grouper or list of such
+            Used to determine the groups for the groupby.
+            If ``by`` is a function, it's called on each value of the object's
+            index. If a dict or Series is passed, the Series or dict VALUES
+            will be used to determine the groups (the Series' values are first
+            aligned; see ``.align()`` method). If a list or ndarray of length
+            equal to the number of rows is passed (see the `groupby user guide
+            <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
+            the values are used as-is to determine the groups. A label or list
+            of labels may be passed to group by the columns in ``self``.
+            Notice that a tuple is interpreted as a (single) key.
+        level : int, level name, or sequence of such, default None
+            If the axis is a MultiIndex (hierarchical), group by a particular
+            level or levels. Do not specify both ``by`` and ``level``.
+        as_index : bool, default True
+            Return object with group labels as the
+            index. Only relevant for DataFrame input. as_index=False is
+            effectively "SQL-style" grouped output. This argument has no effect
+            on filtrations (see the `filtrations in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration>`_),
+            such as ``head()``, ``tail()``, ``nth()`` and in transformations
+            (see the `transformations in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation>`_).
+        sort : bool, default True
+            Sort group keys. Get better performance by turning this off.
+            Note this does not influence the order of observations within each
+            group. Groupby preserves the order of rows within each group. If False,
+            the groups will appear in the same order as they did in the original
+            DataFrame.
+            This argument has no effect on filtrations (see the `filtrations
+            in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration>`_),
+            such as ``head()``, ``tail()``, ``nth()`` and in transformations
+            (see the `transformations in the user guide
+            <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation>`_).
+
+            .. versionchanged:: 2.0.0
+
+                Specifying ``sort=False`` with an ordered categorical grouper will no
+                longer sort the values.
+
+        group_keys : bool, default True
+            When calling apply and the ``by`` argument produces a like-indexed
+            (i.e. :ref:`a transform <groupby.transform>`) result, add group keys to
+            index to identify pieces. By default group keys are not included
+            when the result's index (and column) labels match the inputs, and
+            are included otherwise.
+
+            .. versionchanged:: 2.0.0
+
+               ``group_keys`` now defaults to ``True``.
+
+        observed : bool, default True
+            This only applies if any of the groupers are Categoricals.
+            If True: only show observed values for categorical groupers.
+            If False: show all values for categorical groupers.
+
+            .. versionchanged:: 3.0.0
+
+                The default value is now ``True``.
+
+        dropna : bool, default True
+            If True, and if group keys contain NA values, NA values together
+            with row/column will be dropped.
+            If False, NA values will also be treated as the key in groups.
+
+        Returns
+        -------
+        pandas.api.typing.DataFrameGroupBy
+            Returns a groupby object that contains information about the groups.
+
+        See Also
+        --------
+        resample : Convenience method for frequency conversion and resampling
+            of time series.
+
+        Notes
+        -----
+        See the `user guide
+        <https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
+        detailed usage and examples, including splitting an object into groups,
+        iterating through groups, selecting a group, aggregation, and more.
+
+        The implementation of groupby is hash-based, meaning in particular that
+        objects that compare as equal will be considered to be in the same group.
+        An exception to this is that pandas has special handling of NA values:
+        any NA values will be collapsed to a single group, regardless of how
+        they compare. See the user guide linked above for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+        ...         "Max Speed": [380.0, 370.0, 24.0, 26.0],
+        ...     }
+        ... )
+        >>> df
+           Animal  Max Speed
+        0  Falcon      380.0
+        1  Falcon      370.0
+        2  Parrot       24.0
+        3  Parrot       26.0
+        >>> df.groupby(["Animal"]).mean()
+                Max Speed
+        Animal
+        Falcon      375.0
+        Parrot       25.0
+
+        **Hierarchical Indexes**
+
+        We can groupby different levels of a hierarchical index
+        using the `level` parameter:
+
+        >>> arrays = [
+        ...     ["Falcon", "Falcon", "Parrot", "Parrot"],
+        ...     ["Captive", "Wild", "Captive", "Wild"],
+        ... ]
+        >>> index = pd.MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
+        >>> df = pd.DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
+        >>> df
+                        Max Speed
+        Animal Type
+        Falcon Captive      390.0
+               Wild         350.0
+        Parrot Captive       30.0
+               Wild          20.0
+        >>> df.groupby(level=0).mean()
+                Max Speed
+        Animal
+        Falcon      370.0
+        Parrot       25.0
+        >>> df.groupby(level="Type").mean()
+                 Max Speed
+        Type
+        Captive      210.0
+        Wild         185.0
+
+        We can also choose to include NA in group keys or not by setting
+        `dropna` parameter, the default setting is `True`.
+
+        >>> arr = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+        >>> df = pd.DataFrame(arr, columns=["a", "b", "c"])
+
+        >>> df.groupby(by=["b"]).sum()
+            a   c
+        b
+        1.0 2   3
+        2.0 2   5
+
+        >>> df.groupby(by=["b"], dropna=False).sum()
+            a   c
+        b
+        1.0 2   3
+        2.0 2   5
+        NaN 1   4
+
+        >>> arr = [["a", 12, 12], [None, 12.3, 33.0], ["b", 12.3, 123], ["a", 1, 1]]
+        >>> df = pd.DataFrame(arr, columns=["a", "b", "c"])
+
+        >>> df.groupby(by="a").sum()
+            b     c
+        a
+        a   13.0   13.0
+        b   12.3  123.0
+
+        >>> df.groupby(by="a", dropna=False).sum()
+            b     c
+        a
+        a   13.0   13.0
+        b   12.3  123.0
+        NaN 12.3   33.0
+
+        When using ``.apply()``, use ``group_keys`` to include or exclude the
+        group keys. The ``group_keys`` argument defaults to ``True`` (include).
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+        ...         "Max Speed": [380.0, 370.0, 24.0, 26.0],
+        ...     }
+        ... )
+        >>> df.groupby("Animal", group_keys=True)[["Max Speed"]].apply(lambda x: x)
+                  Max Speed
+        Animal
+        Falcon 0      380.0
+               1      370.0
+        Parrot 2       24.0
+               3       26.0
+
+        >>> df.groupby("Animal", group_keys=False)[["Max Speed"]].apply(lambda x: x)
+           Max Speed
+        0      380.0
+        1      370.0
+        2       24.0
+        3       26.0
+        """
+        from pandas.core.groupby.generic import DataFrameGroupBy
+
+        if level is None and by is None:
+            raise TypeError("You have to supply one of 'by' and 'level'")
+
+        return DataFrameGroupBy(
+            obj=self,
+            keys=by,
+            level=level,
+            as_index=as_index,
+            sort=sort,
+            group_keys=group_keys,
+            observed=observed,
+            dropna=dropna,
+        )
+
+    _shared_docs["pivot"] = """
+        Return reshaped DataFrame organized by given index / column values.
+
+        Reshape data (produce a "pivot" table) based on column values. Uses
+        unique values from specified `index` / `columns` to form axes of the
+        resulting DataFrame. This function does not support data
+        aggregation, multiple values will result in a MultiIndex in the
+        columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
+
+        Parameters
+        ----------%s
+        columns : Hashable or a sequence of the previous
+            Column to use to make new frame's columns.
+        index : Hashable or a sequence of the previous, optional
+            Column to use to make new frame's index. If not given, uses existing index.
+        values : Hashable or a sequence of the previous, optional
+            Column(s) to use for populating new frame's values. If not
+            specified, all remaining columns will be used and the result will
+            have hierarchically indexed columns.
+
+        Returns
+        -------
+        DataFrame
+            Returns reshaped DataFrame.
+
+        Raises
+        ------
+        ValueError:
+            When there are any `index`, `columns` combinations with multiple
+            values. `DataFrame.pivot_table` when you need to aggregate.
+
+        See Also
+        --------
+        DataFrame.pivot_table : Generalization of pivot that can handle
+            duplicate values for one index/column pair.
+        DataFrame.unstack : Pivot based on the index values instead of a
+            column.
+        wide_to_long : Wide panel to long format. Less flexible but more
+            user-friendly than melt.
+
+        Notes
+        -----
+        For finer-tuned control, see hierarchical indexing documentation along
+        with the related stack/unstack methods.
+
+        Reference :ref:`the user guide <reshaping.pivot>` for more examples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
+        ...                            'two'],
+        ...                    'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
+        ...                    'baz': [1, 2, 3, 4, 5, 6],
+        ...                    'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
+        >>> df
+            foo   bar  baz  zoo
+        0   one   A    1    x
+        1   one   B    2    y
+        2   one   C    3    z
+        3   two   A    4    q
+        4   two   B    5    w
+        5   two   C    6    t
+
+        >>> df.pivot(index='foo', columns='bar', values='baz')
+        bar  A   B   C
+        foo
+        one  1   2   3
+        two  4   5   6
+
+        >>> df.pivot(index='foo', columns='bar')['baz']
+        bar  A   B   C
+        foo
+        one  1   2   3
+        two  4   5   6
+
+        >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
+              baz       zoo
+        bar   A  B  C   A  B  C
+        foo
+        one   1  2  3   x  y  z
+        two   4  5  6   q  w  t
+
+        You could also assign a list of column names or a list of index names.
+
+        >>> df = pd.DataFrame({
+        ...                   "lev1": [1, 1, 1, 2, 2, 2],
+        ...                   "lev2": [1, 1, 2, 1, 1, 2],
+        ...                   "lev3": [1, 2, 1, 2, 1, 2],
+        ...                   "lev4": [1, 2, 3, 4, 5, 6],
+        ...                   "values": [0, 1, 2, 3, 4, 5]})
+        >>> df
+            lev1 lev2 lev3 lev4 values
+        0   1    1    1    1    0
+        1   1    1    2    2    1
+        2   1    2    1    3    2
+        3   2    1    2    4    3
+        4   2    1    1    5    4
+        5   2    2    2    6    5
+
+        >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
+        lev2    1         2
+        lev3    1    2    1    2
+        lev1
+        1     0.0  1.0  2.0  NaN
+        2     4.0  3.0  NaN  5.0
+
+        >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
+              lev3    1    2
+        lev1  lev2
+           1     1  0.0  1.0
+                 2  2.0  NaN
+           2     1  4.0  3.0
+                 2  NaN  5.0
+
+        A ValueError is raised if there are any duplicates.
+
+        >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
+        ...                    "bar": ['A', 'A', 'B', 'C'],
+        ...                    "baz": [1, 2, 3, 4]})
+        >>> df
+           foo bar  baz
+        0  one   A    1
+        1  one   A    2
+        2  two   B    3
+        3  two   C    4
+
+        Notice that the first two rows are the same for our `index`
+        and `columns` arguments.
+
+        >>> df.pivot(index='foo', columns='bar', values='baz')
+        Traceback (most recent call last):
+           ...
+        ValueError: Index contains duplicate entries, cannot reshape
+        """
+
+    @Substitution("")
+    @Appender(_shared_docs["pivot"])
+    def pivot(
+        self, *, columns, index=lib.no_default, values=lib.no_default
+    ) -> DataFrame:
+        from pandas.core.reshape.pivot import pivot
+
+        return pivot(self, index=index, columns=columns, values=values)
+
+    _shared_docs["pivot_table"] = """
+        Create a spreadsheet-style pivot table as a DataFrame.
+
+        The levels in the pivot table will be stored in MultiIndex objects
+        (hierarchical indexes) on the index and columns of the result DataFrame.
+
+        Parameters
+        ----------%s
+        values : list-like or scalar, optional
+            Column or columns to aggregate.
+        index : column, Grouper, array, or sequence of the previous
+            Keys to group by on the pivot table index. If a list is passed,
+            it can contain any of the other types (except list). If an array is
+            passed, it must be the same length as the data and will be used in
+            the same manner as column values.
+        columns : column, Grouper, array, or sequence of the previous
+            Keys to group by on the pivot table column. If a list is passed,
+            it can contain any of the other types (except list). If an array is
+            passed, it must be the same length as the data and will be used in
+            the same manner as column values.
+        aggfunc : function, list of functions, dict, default "mean"
+            If a list of functions is passed, the resulting pivot table will have
+            hierarchical columns whose top level are the function names
+            (inferred from the function objects themselves).
+            If a dict is passed, the key is column to aggregate and the value is
+            function or list of functions. If ``margin=True``, aggfunc will be
+            used to calculate the partial aggregates.
+        fill_value : scalar, default None
+            Value to replace missing values with (in the resulting pivot table,
+            after aggregation).
+        margins : bool, default False
+            If ``margins=True``, special ``All`` columns and rows
+            will be added with partial group aggregates across the categories
+            on the rows and columns.
+        dropna : bool, default True
+            Do not include columns whose entries are all NaN. If True,
+
+            * rows with an NA value in any column will be omitted before computing
+              margins,
+            * index/column keys containing NA values will be dropped (see ``dropna``
+              parameter in :meth:`DataFrame.groupby`).
+
+        margins_name : str, default 'All'
+            Name of the row / column that will contain the totals
+            when margins is True.
+        observed : bool, default False
+            This only applies if any of the groupers are Categoricals.
+            If True: only show observed values for categorical groupers.
+            If False: show all values for categorical groupers.
+
+            .. versionchanged:: 3.0.0
+
+                The default value is now ``True``.
+
+        sort : bool, default True
+            Specifies if the result should be sorted.
+
+        **kwargs : dict
+            Optional keyword arguments to pass to ``aggfunc``.
+
+        Returns
+        -------
+        DataFrame
+            An Excel style pivot table.
+
+        See Also
+        --------
+        DataFrame.pivot : Pivot without aggregation that can handle
+            non-numeric data.
+        DataFrame.melt: Unpivot a DataFrame from wide to long format,
+            optionally leaving identifiers set.
+        wide_to_long : Wide panel to long format. Less flexible but more
+            user-friendly than melt.
+
+        Notes
+        -----
+        Reference :ref:`the user guide <reshaping.pivot>` for more examples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
+        ...                          "bar", "bar", "bar", "bar"],
+        ...                    "B": ["one", "one", "one", "two", "two",
+        ...                          "one", "one", "two", "two"],
+        ...                    "C": ["small", "large", "large", "small",
+        ...                          "small", "large", "small", "small",
+        ...                          "large"],
+        ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+        ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
+        >>> df
+             A    B      C  D  E
+        0  foo  one  small  1  2
+        1  foo  one  large  2  4
+        2  foo  one  large  2  5
+        3  foo  two  small  3  5
+        4  foo  two  small  3  6
+        5  bar  one  large  4  6
+        6  bar  one  small  5  8
+        7  bar  two  small  6  9
+        8  bar  two  large  7  9
+
+        This first example aggregates values by taking the sum.
+
+        >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+        ...                        columns=['C'], aggfunc="sum")
+        >>> table
+        C        large  small
+        A   B
+        bar one    4.0    5.0
+            two    7.0    6.0
+        foo one    4.0    1.0
+            two    NaN    6.0
+
+        We can also fill missing values using the `fill_value` parameter.
+
+        >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
+        ...                        columns=['C'], aggfunc="sum", fill_value=0)
+        >>> table
+        C        large  small
+        A   B
+        bar one      4      5
+            two      7      6
+        foo one      4      1
+            two      0      6
+
+        The next example aggregates by taking the mean across multiple columns.
+
+        >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+        ...                        aggfunc={'D': "mean", 'E': "mean"})
+        >>> table
+                        D         E
+        A   C
+        bar large  5.500000  7.500000
+            small  5.500000  8.500000
+        foo large  2.000000  4.500000
+            small  2.333333  4.333333
+
+        We can also calculate multiple types of aggregations for any given
+        value column.
+
+        >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
+        ...                        aggfunc={'D': "mean",
+        ...                                 'E': ["min", "max", "mean"]})
+        >>> table
+                          D   E
+                       mean max      mean  min
+        A   C
+        bar large  5.500000   9  7.500000    6
+            small  5.500000   9  8.500000    8
+        foo large  2.000000   5  4.500000    4
+            small  2.333333   6  4.333333    2
+        """
+
+    @Substitution("")
+    @Appender(_shared_docs["pivot_table"])
+    def pivot_table(
+        self,
+        values=None,
+        index=None,
+        columns=None,
+        aggfunc: AggFuncType = "mean",
+        fill_value=None,
+        margins: bool = False,
+        dropna: bool = True,
+        margins_name: Level = "All",
+        observed: bool = True,
+        sort: bool = True,
+        **kwargs,
+    ) -> DataFrame:
+        from pandas.core.reshape.pivot import pivot_table
+
+        return pivot_table(
+            self,
+            values=values,
+            index=index,
+            columns=columns,
+            aggfunc=aggfunc,
+            fill_value=fill_value,
+            margins=margins,
+            dropna=dropna,
+            margins_name=margins_name,
+            observed=observed,
+            sort=sort,
+            **kwargs,
+        )
+
+    def stack(
+        self,
+        level: IndexLabel = -1,
+        dropna: bool | lib.NoDefault = lib.no_default,
+        sort: bool | lib.NoDefault = lib.no_default,
+        future_stack: bool = True,
+    ):
+        """
+        Stack the prescribed level(s) from columns to index.
+
+        Return a reshaped DataFrame or Series having a multi-level
+        index with one or more new inner-most levels compared to the current
+        DataFrame. The new inner-most levels are created by pivoting the
+        columns of the current dataframe:
+
+        - if the columns have a single level, the output is a Series;
+        - if the columns have multiple levels, the new index level(s) is (are)
+          taken from the prescribed level(s) and the output is a DataFrame.
+
+        Parameters
+        ----------
+        level : int, str, list, default -1
+            Level(s) to stack from the column axis onto the index
+            axis, defined as one index or label, or a list of indices
+            or labels.
+        dropna : bool, default True
+            Whether to drop rows in the resulting Frame/Series with
+            missing values. Stacking a column level onto the index
+            axis can create combinations of index and column values
+            that are missing from the original dataframe. See Examples
+            section.
+        sort : bool, default True
+            Whether to sort the levels of the resulting MultiIndex.
+        future_stack : bool, default True
+            Whether to use the new implementation that will replace the current
+            implementation in pandas 3.0. When True, dropna and sort have no impact
+            on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release
+            notes <whatsnew_210.enhancements.new_stack>` for more details.
+
+        Returns
+        -------
+        DataFrame or Series
+            Stacked dataframe or series.
+
+        See Also
+        --------
+        DataFrame.unstack : Unstack prescribed level(s) from index axis
+             onto column axis.
+        DataFrame.pivot : Reshape dataframe from long format to wide
+             format.
+        DataFrame.pivot_table : Create a spreadsheet-style pivot table
+             as a DataFrame.
+
+        Notes
+        -----
+        The function is named by analogy with a collection of books
+        being reorganized from being side by side on a horizontal
+        position (the columns of the dataframe) to being stacked
+        vertically on top of each other (in the index of the
+        dataframe).
+
+        Reference :ref:`the user guide <reshaping.stacking>` for more examples.
+
+        Examples
+        --------
+        **Single level columns**
+
+        >>> df_single_level_cols = pd.DataFrame(
+        ...     [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"]
+        ... )
+
+        Stacking a dataframe with a single level column axis returns a Series:
+
+        >>> df_single_level_cols
+             weight height
+        cat       0      1
+        dog       2      3
+        >>> df_single_level_cols.stack()
+        cat  weight    0
+             height    1
+        dog  weight    2
+             height    3
+        dtype: int64
+
+        **Multi level columns: simple case**
+
+        >>> multicol1 = pd.MultiIndex.from_tuples(
+        ...     [("weight", "kg"), ("weight", "pounds")]
+        ... )
+        >>> df_multi_level_cols1 = pd.DataFrame(
+        ...     [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1
+        ... )
+
+        Stacking a dataframe with a multi-level column axis:
+
+        >>> df_multi_level_cols1
+             weight
+                 kg    pounds
+        cat       1        2
+        dog       2        4
+        >>> df_multi_level_cols1.stack()
+                    weight
+        cat kg           1
+            pounds       2
+        dog kg           2
+            pounds       4
+
+        **Missing values**
+
+        >>> multicol2 = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
+        >>> df_multi_level_cols2 = pd.DataFrame(
+        ...     [[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=multicol2
+        ... )
+
+        It is common to have missing values when stacking a dataframe
+        with multi-level columns, as the stacked dataframe typically
+        has more values than the original dataframe. Missing values
+        are filled with NaNs:
+
+        >>> df_multi_level_cols2
+            weight height
+                kg      m
+        cat    1.0    2.0
+        dog    3.0    4.0
+        >>> df_multi_level_cols2.stack()
+                weight  height
+        cat kg     1.0     NaN
+            m      NaN     2.0
+        dog kg     3.0     NaN
+            m      NaN     4.0
+
+        **Prescribing the level(s) to be stacked**
+
+        The first parameter controls which level or levels are stacked:
+
+        >>> df_multi_level_cols2.stack(0)
+                     kg    m
+        cat weight  1.0  NaN
+            height  NaN  2.0
+        dog weight  3.0  NaN
+            height  NaN  4.0
+        >>> df_multi_level_cols2.stack([0, 1])
+        cat  weight  kg    1.0
+             height  m     2.0
+        dog  weight  kg    3.0
+             height  m     4.0
+        dtype: float64
+        """
+        if not future_stack:
+            from pandas.core.reshape.reshape import (
+                stack,
+                stack_multiple,
+            )
+
+            warnings.warn(
+                "The previous implementation of stack is deprecated and will be "
+                "removed in a future version of pandas. See the What's New notes "
+                "for pandas 2.1.0 for details. Do not specify the future_stack "
+                "argument to adopt the new implementation and silence this warning.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
+            if dropna is lib.no_default:
+                dropna = True
+            if sort is lib.no_default:
+                sort = True
+
+            if isinstance(level, (tuple, list)):
+                result = stack_multiple(self, level, dropna=dropna, sort=sort)
+            else:
+                result = stack(self, level, dropna=dropna, sort=sort)
+        else:
+            from pandas.core.reshape.reshape import stack_v3
+
+            if dropna is not lib.no_default:
+                raise ValueError(
+                    "dropna must be unspecified as the new "
+                    "implementation does not introduce rows of NA values. This "
+                    "argument will be removed in a future version of pandas."
+                )
+
+            if sort is not lib.no_default:
+                raise ValueError(
+                    "Cannot specify sort, this argument will be "
+                    "removed in a future version of pandas. Sort the result using "
+                    ".sort_index instead."
+                )
+
+            if (
+                isinstance(level, (tuple, list))
+                and not all(lev in self.columns.names for lev in level)
+                and not all(isinstance(lev, int) for lev in level)
+            ):
+                raise ValueError(
+                    "level should contain all level names or all level "
+                    "numbers, not a mixture of the two."
+                )
+
+            if not isinstance(level, (tuple, list)):
+                level = [level]
+            level = [self.columns._get_level_number(lev) for lev in level]
+            result = stack_v3(self, level)
+
+        return result.__finalize__(self, method="stack")
+
+    def explode(
+        self,
+        column: IndexLabel,
+        ignore_index: bool = False,
+    ) -> DataFrame:
+        """
+        Transform each element of a list-like to a row, replicating index values.
+
+        Parameters
+        ----------
+        column : IndexLabel
+            Column(s) to explode.
+            For multiple columns, specify a non-empty list with each element
+            be str or tuple, and all specified columns their list-like data
+            on same row of the frame must have matching length.
+
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        DataFrame
+            Exploded lists to rows of the subset columns;
+            index will be duplicated for these rows.
+
+        Raises
+        ------
+        ValueError :
+            * If columns of the frame are not unique.
+            * If specified columns to explode is empty list.
+            * If specified columns to explode have not matching count of
+              elements rowwise in the frame.
+
+        See Also
+        --------
+        DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
+            index labels.
+        DataFrame.melt : Unpivot a DataFrame from wide format to long format.
+        Series.explode : Explode a DataFrame from list-like columns to long format.
+
+        Notes
+        -----
+        This routine will explode list-likes including lists, tuples, sets,
+        Series, and np.ndarray. The result dtype of the subset rows will
+        be object. Scalars will be returned unchanged, and empty list-likes will
+        result in a np.nan for that row. In addition, the ordering of rows in the
+        output will be non-deterministic when exploding sets.
+
+        Reference :ref:`the user guide <reshaping.explode>` for more examples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": [[0, 1, 2], "foo", [], [3, 4]],
+        ...         "B": 1,
+        ...         "C": [["a", "b", "c"], np.nan, [], ["d", "e"]],
+        ...     }
+        ... )
+        >>> df
+                   A  B          C
+        0  [0, 1, 2]  1  [a, b, c]
+        1        foo  1        NaN
+        2         []  1         []
+        3     [3, 4]  1     [d, e]
+
+        Single-column explode.
+
+        >>> df.explode("A")
+             A  B          C
+        0    0  1  [a, b, c]
+        0    1  1  [a, b, c]
+        0    2  1  [a, b, c]
+        1  foo  1        NaN
+        2  NaN  1         []
+        3    3  1     [d, e]
+        3    4  1     [d, e]
+
+        Multi-column explode.
+
+        >>> df.explode(list("AC"))
+             A  B    C
+        0    0  1    a
+        0    1  1    b
+        0    2  1    c
+        1  foo  1  NaN
+        2  NaN  1  NaN
+        3    3  1    d
+        3    4  1    e
+        """
+        if not self.columns.is_unique:
+            duplicate_cols = self.columns[self.columns.duplicated()].tolist()
+            raise ValueError(
+                f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
+            )
+
+        columns: list[Hashable]
+        if is_scalar(column) or isinstance(column, tuple):
+            columns = [column]
+        elif isinstance(column, list) and all(
+            is_scalar(c) or isinstance(c, tuple) for c in column
+        ):
+            if not column:
+                raise ValueError("column must be nonempty")
+            if len(column) > len(set(column)):
+                raise ValueError("column must be unique")
+            columns = column
+        else:
+            raise ValueError("column must be a scalar, tuple, or list thereof")
+
+        df = self.reset_index(drop=True)
+        if len(columns) == 1:
+            result = df[columns[0]].explode()
+        else:
+            mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
+            counts0 = self[columns[0]].apply(mylen)
+            for c in columns[1:]:
+                if not all(counts0 == self[c].apply(mylen)):
+                    raise ValueError("columns must have matching element counts")
+            result = DataFrame({c: df[c].explode() for c in columns})
+        result = df.drop(columns, axis=1).join(result)
+        if ignore_index:
+            result.index = default_index(len(result))
+        else:
+            result.index = self.index.take(result.index)
+        result = result.reindex(columns=self.columns)
+
+        return result.__finalize__(self, method="explode")
+
+    def unstack(
+        self, level: IndexLabel = -1, fill_value=None, sort: bool = True
+    ) -> DataFrame | Series:
+        """
+        Pivot a level of the (necessarily hierarchical) index labels.
+
+        Returns a DataFrame having a new level of column labels whose inner-most level
+        consists of the pivoted index labels.
+
+        If the index is not a MultiIndex, the output will be a Series
+        (the analogue of stack when the columns are not a MultiIndex).
+
+        Parameters
+        ----------
+        level : int, str, or list of these, default -1 (last level)
+            Level(s) of index to unstack, can pass level name.
+        fill_value : scalar
+            Replace NaN with this value if the unstack produces missing values.
+        sort : bool, default True
+            Sort the level(s) in the resulting MultiIndex columns.
+
+        Returns
+        -------
+        Series or DataFrame
+            If index is a MultiIndex: DataFrame with pivoted index labels as new
+            inner-most level column labels, else Series.
+
+        See Also
+        --------
+        DataFrame.pivot : Pivot a table based on column values.
+        DataFrame.stack : Pivot a level of the column labels (inverse operation
+            from `unstack`).
+
+        Notes
+        -----
+        Reference :ref:`the user guide <reshaping.stacking>` for more examples.
+
+        Examples
+        --------
+        >>> index = pd.MultiIndex.from_tuples(
+        ...     [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")]
+        ... )
+        >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
+        >>> s
+        one  a   1.0
+             b   2.0
+        two  a   3.0
+             b   4.0
+        dtype: float64
+
+        >>> s.unstack(level=-1)
+             a   b
+        one  1.0  2.0
+        two  3.0  4.0
+
+        >>> s.unstack(level=0)
+           one  two
+        a  1.0   3.0
+        b  2.0   4.0
+
+        >>> df = s.unstack(level=0)
+        >>> df.unstack()
+        one  a  1.0
+             b  2.0
+        two  a  3.0
+             b  4.0
+        dtype: float64
+        """
+        from pandas.core.reshape.reshape import unstack
+
+        result = unstack(self, level, fill_value, sort)
+
+        return result.__finalize__(self, method="unstack")
+
+    def melt(
+        self,
+        id_vars=None,
+        value_vars=None,
+        var_name=None,
+        value_name: Hashable = "value",
+        col_level: Level | None = None,
+        ignore_index: bool = True,
+    ) -> DataFrame:
+        """
+        Unpivot DataFrame from wide to long format, optionally leaving identifiers set.
+
+        This function is useful to massage a DataFrame into a format where one
+        or more columns are identifier variables (`id_vars`), while all other
+        columns, considered measured variables (`value_vars`), are "unpivoted" to
+        the row axis, leaving just two non-identifier columns, 'variable' and
+        'value'.
+
+        Parameters
+        ----------
+        id_vars : scalar, tuple, list, or ndarray, optional
+            Column(s) to use as identifier variables.
+        value_vars : scalar, tuple, list, or ndarray, optional
+            Column(s) to unpivot. If not specified, uses all columns that
+            are not set as `id_vars`.
+        var_name : scalar, default None
+            Name to use for the 'variable' column. If None it uses
+            ``frame.columns.name`` or 'variable'.
+        value_name : scalar, default 'value'
+            Name to use for the 'value' column, can't be an existing column label.
+        col_level : scalar, optional
+            If columns are a MultiIndex then use this level to melt.
+        ignore_index : bool, default True
+            If True, original index is ignored. If False, original index is retained.
+            Index labels will be repeated as necessary.
+
+        Returns
+        -------
+        DataFrame
+            Unpivoted DataFrame.
+
+        See Also
+        --------
+        melt : Identical method.
+        pivot_table : Create a spreadsheet-style pivot table as a DataFrame.
+        DataFrame.pivot : Return reshaped DataFrame organized
+            by given index / column values.
+        DataFrame.explode : Explode a DataFrame from list-like
+                columns to long format.
+
+        Notes
+        -----
+        Reference :ref:`the user guide <reshaping.melt>` for more examples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": {0: "a", 1: "b", 2: "c"},
+        ...         "B": {0: 1, 1: 3, 2: 5},
+        ...         "C": {0: 2, 1: 4, 2: 6},
+        ...     }
+        ... )
+        >>> df
+        A  B  C
+        0  a  1  2
+        1  b  3  4
+        2  c  5  6
+
+        >>> df.melt(id_vars=["A"], value_vars=["B"])
+        A variable  value
+        0  a        B      1
+        1  b        B      3
+        2  c        B      5
+
+        >>> df.melt(id_vars=["A"], value_vars=["B", "C"])
+        A variable  value
+        0  a        B      1
+        1  b        B      3
+        2  c        B      5
+        3  a        C      2
+        4  b        C      4
+        5  c        C      6
+
+        The names of 'variable' and 'value' columns can be customized:
+
+        >>> df.melt(
+        ...     id_vars=["A"],
+        ...     value_vars=["B"],
+        ...     var_name="myVarname",
+        ...     value_name="myValname",
+        ... )
+        A myVarname  myValname
+        0  a         B          1
+        1  b         B          3
+        2  c         B          5
+
+        Original index values can be kept around:
+
+        >>> df.melt(id_vars=["A"], value_vars=["B", "C"], ignore_index=False)
+        A variable  value
+        0  a        B      1
+        1  b        B      3
+        2  c        B      5
+        0  a        C      2
+        1  b        C      4
+        2  c        C      6
+
+        If you have multi-index columns:
+
+        >>> df.columns = [list("ABC"), list("DEF")]
+        >>> df
+        A  B  C
+        D  E  F
+        0  a  1  2
+        1  b  3  4
+        2  c  5  6
+
+        >>> df.melt(col_level=0, id_vars=["A"], value_vars=["B"])
+        A variable  value
+        0  a        B      1
+        1  b        B      3
+        2  c        B      5
+
+        >>> df.melt(id_vars=[("A", "D")], value_vars=[("B", "E")])
+        (A, D) variable_0 variable_1  value
+        0      a          B          E      1
+        1      b          B          E      3
+        2      c          B          E      5
+        """
+        return melt(
+            self,
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name=var_name,
+            value_name=value_name,
+            col_level=col_level,
+            ignore_index=ignore_index,
+        ).__finalize__(self, method="melt")
+
+    # ----------------------------------------------------------------------
+    # Time series-related
+
+    def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
+        """
+        First discrete difference of element.
+
+        Calculates the difference of a DataFrame element compared with another
+        element in the DataFrame (default is element in previous row).
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Periods to shift for calculating difference, accepts negative
+            values.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Take difference over rows (0) or columns (1).
+
+        Returns
+        -------
+        DataFrame
+            First differences of the Series.
+
+        See Also
+        --------
+        DataFrame.pct_change: Percent change over given number of periods.
+        DataFrame.shift: Shift index by desired number of periods with an
+            optional time freq.
+        Series.diff: First discrete difference of object.
+
+        Notes
+        -----
+        For boolean dtypes, this uses :meth:`operator.xor` rather than
+        :meth:`operator.sub`.
+        The result is calculated according to current dtype in DataFrame,
+        however dtype of the result is always float64.
+
+        Examples
+        --------
+
+        Difference with previous row
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "a": [1, 2, 3, 4, 5, 6],
+        ...         "b": [1, 1, 2, 3, 5, 8],
+        ...         "c": [1, 4, 9, 16, 25, 36],
+        ...     }
+        ... )
+        >>> df
+           a  b   c
+        0  1  1   1
+        1  2  1   4
+        2  3  2   9
+        3  4  3  16
+        4  5  5  25
+        5  6  8  36
+        >>> df.diff()
+             a    b     c
+        0  NaN  NaN   NaN
+        1  1.0  0.0   3.0
+        2  1.0  1.0   5.0
+        3  1.0  1.0   7.0
+        4  1.0  2.0   9.0
+        5  1.0  3.0  11.0
+
+        Difference with previous column
+
+        >>> df.diff(axis=1)
+            a  b   c
+        0 NaN  0   0
+        1 NaN -1   3
+        2 NaN -1   7
+        3 NaN -1  13
+        4 NaN  0  20
+        5 NaN  2  28
+
+        Difference with 3rd previous row
+
+        >>> df.diff(periods=3)
+             a    b     c
+        0  NaN  NaN   NaN
+        1  NaN  NaN   NaN
+        2  NaN  NaN   NaN
+        3  3.0  2.0  15.0
+        4  3.0  4.0  21.0
+        5  3.0  6.0  27.0
+
+        Difference with following row
+
+        >>> df.diff(periods=-1)
+             a    b     c
+        0 -1.0  0.0  -3.0
+        1 -1.0 -1.0  -5.0
+        2 -1.0 -1.0  -7.0
+        3 -1.0 -2.0  -9.0
+        4 -1.0 -3.0 -11.0
+        5  NaN  NaN   NaN
+
+        Overflow in input dtype
+
+        >>> df = pd.DataFrame({"a": [1, 0]}, dtype=np.uint8)
+        >>> df.diff()
+               a
+        0    NaN
+        1  255.0
+        """
+        if not lib.is_integer(periods):
+            if not (is_float(periods) and periods.is_integer()):
+                raise ValueError("periods must be an integer")
+            periods = int(periods)
+
+        axis = self._get_axis_number(axis)
+        if axis == 1:
+            if periods != 0:
+                # in the periods == 0 case, this is equivalent diff of 0 periods
+                #  along axis=0, and the Manager method may be somewhat more
+                #  performant, so we dispatch in that case.
+                return self - self.shift(periods, axis=axis)
+            # With periods=0 this is equivalent to a diff with axis=0
+            axis = 0
+
+        new_data = self._mgr.diff(n=periods)
+        res_df = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        return res_df.__finalize__(self, "diff")
+
+    # ----------------------------------------------------------------------
+    # Function application
+
+    def _gotitem(
+        self,
+        key: IndexLabel,
+        ndim: int,
+        subset: DataFrame | Series | None = None,
+    ) -> DataFrame | Series:
+        """
+        Sub-classes to define. Return a sliced object.
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : {1, 2}
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        if subset is None:
+            subset = self
+        elif subset.ndim == 1:  # is Series
+            return subset
+
+        # TODO: _shallow_copy(subset)?
+        return subset[key]
+
+    _agg_see_also_doc = dedent(
+        """
+    See Also
+    --------
+    DataFrame.apply : Perform any type of operations.
+    DataFrame.transform : Perform transformation type operations.
+    DataFrame.groupby : Perform operations over groups.
+    DataFrame.resample : Perform operations over resampled bins.
+    DataFrame.rolling : Perform operations over rolling window.
+    DataFrame.expanding : Perform operations over expanding window.
+    core.window.ewm.ExponentialMovingWindow : Perform operation over exponential
+        weighted window.
+    """
+    )
+
+    _agg_examples_doc = dedent(
+        """
+    Examples
+    --------
+    >>> df = pd.DataFrame([[1, 2, 3],
+    ...                    [4, 5, 6],
+    ...                    [7, 8, 9],
+    ...                    [np.nan, np.nan, np.nan]],
+    ...                   columns=['A', 'B', 'C'])
+
+    Aggregate these functions over the rows.
+
+    >>> df.agg(['sum', 'min'])
+            A     B     C
+    sum  12.0  15.0  18.0
+    min   1.0   2.0   3.0
+
+    Different aggregations per column.
+
+    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+            A    B
+    sum  12.0  NaN
+    min   1.0  2.0
+    max   NaN  8.0
+
+    Aggregate different functions over the columns and rename the index
+    of the resulting DataFrame.
+
+    >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
+         A    B    C
+    x  7.0  NaN  NaN
+    y  NaN  2.0  NaN
+    z  NaN  NaN  6.0
+
+    Aggregate over the columns.
+
+    >>> df.agg("mean", axis="columns")
+    0    2.0
+    1    5.0
+    2    8.0
+    3    NaN
+    dtype: float64
+    """
+    )
+
+    def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
+        """
+        Aggregate using one or more operations over the specified axis.
+
+        Parameters
+        ----------
+        func : function, str, list or dict
+            Function to use for aggregating the data. If a function, must either
+            work when passed a DataFrame or when passed to DataFrame.apply.
+
+            Accepted combinations are:
+
+            - function
+            - string function name
+            - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
+            - dict of axis labels -> functions, function names or list of such.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+                If 0 or 'index': apply function to each column.
+                If 1 or 'columns': apply function to each row.
+        *args
+            Positional arguments to pass to `func`.
+        **kwargs
+            Keyword arguments to pass to `func`.
+
+        Returns
+        -------
+        scalar, Series or DataFrame
+
+            The return can be:
+
+            * scalar : when Series.agg is called with single function
+            * Series : when DataFrame.agg is called with a single function
+            * DataFrame : when DataFrame.agg is called with several functions
+
+        See Also
+        --------
+        DataFrame.apply : Perform any type of operations.
+        DataFrame.transform : Perform transformation type operations.
+        DataFrame.groupby : Perform operations over groups.
+        DataFrame.resample : Perform operations over resampled bins.
+        DataFrame.rolling : Perform operations over rolling window.
+        DataFrame.expanding : Perform operations over expanding window.
+        core.window.ewm.ExponentialMovingWindow : Perform operation over exponential
+            weighted window.
+
+        Notes
+        -----
+        The aggregation operations are always performed over an axis, either the
+        index (default) or the column axis. This behavior is different from
+        `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
+        `var`), where the default is to compute the aggregation of the flattened
+        array, e.g., ``numpy.mean(arr_2d)`` as opposed to
+        ``numpy.mean(arr_2d, axis=0)``.
+
+        `agg` is an alias for `aggregate`. Use the alias.
+
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        A passed user-defined-function will be passed a Series for evaluation.
+
+        If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan, np.nan, np.nan]],
+        ...     columns=["A", "B", "C"],
+        ... )
+
+        Aggregate these functions over the rows.
+
+        >>> df.agg(["sum", "min"])
+                A     B     C
+        sum  12.0  15.0  18.0
+        min   1.0   2.0   3.0
+
+        Different aggregations per column.
+
+        >>> df.agg({"A": ["sum", "min"], "B": ["min", "max"]})
+                A    B
+        sum  12.0  NaN
+        min   1.0  2.0
+        max   NaN  8.0
+
+        Aggregate different functions over the columns and rename the index of
+        the resulting DataFrame.
+
+        >>> df.agg(x=("A", "max"), y=("B", "min"), z=("C", "mean"))
+             A    B    C
+        x  7.0  NaN  NaN
+        y  NaN  2.0  NaN
+        z  NaN  NaN  6.0
+
+        Aggregate over the columns.
+
+        >>> df.agg("mean", axis="columns")
+        0    2.0
+        1    5.0
+        2    8.0
+        3    NaN
+        dtype: float64
+        """
+        from pandas.core.apply import frame_apply
+
+        axis = self._get_axis_number(axis)
+
+        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
+        result = op.agg()
+        result = reconstruct_and_relabel_result(result, func, **kwargs)
+        return result
+
+    agg = aggregate
+
+    def transform(
+        self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
+    ) -> DataFrame:
+        """
+        Call ``func`` on self producing a DataFrame with the same axis shape as self.
+
+        Parameters
+        ----------
+        func : function, str, list-like or dict-like
+            Function to use for transforming the data. If a function, must either
+            work when passed a DataFrame or when passed to DataFrame.apply. If func
+            is both list-like and dict-like, dict-like behavior takes precedence.
+
+            Accepted combinations are:
+
+            - function
+            - string function name
+            - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
+            - dict-like of axis labels -> functions, function names or list-like
+              of such.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+                If 0 or 'index': apply function to each column.
+                If 1 or 'columns': apply function to each row.
+        *args
+            Positional arguments to pass to `func`.
+        **kwargs
+            Keyword arguments to pass to `func`.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame that must have the same length as self.
+
+        Raises
+        ------
+        ValueError : If the returned DataFrame has a different length than self.
+
+        See Also
+        --------
+        DataFrame.agg : Only perform aggregating type operations.
+        DataFrame.apply : Invoke function on a DataFrame.
+
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": range(3), "B": range(1, 4)})
+        >>> df
+           A  B
+        0  0  1
+        1  1  2
+        2  2  3
+        >>> df.transform(lambda x: x + 1)
+           A  B
+        0  1  2
+        1  2  3
+        2  3  4
+
+        Even though the resulting DataFrame must have the same length as the
+        input DataFrame, it is possible to provide several input functions:
+
+        >>> s = pd.Series(range(3))
+        >>> s
+        0    0
+        1    1
+        2    2
+        dtype: int64
+        >>> s.transform([np.sqrt, np.exp])
+               sqrt        exp
+        0  0.000000   1.000000
+        1  1.000000   2.718282
+        2  1.414214   7.389056
+
+        You can call transform on a GroupBy object:
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Date": [
+        ...             "2015-05-08",
+        ...             "2015-05-07",
+        ...             "2015-05-06",
+        ...             "2015-05-05",
+        ...             "2015-05-08",
+        ...             "2015-05-07",
+        ...             "2015-05-06",
+        ...             "2015-05-05",
+        ...         ],
+        ...         "Data": [5, 8, 6, 1, 50, 100, 60, 120],
+        ...     }
+        ... )
+        >>> df
+                 Date  Data
+        0  2015-05-08     5
+        1  2015-05-07     8
+        2  2015-05-06     6
+        3  2015-05-05     1
+        4  2015-05-08    50
+        5  2015-05-07   100
+        6  2015-05-06    60
+        7  2015-05-05   120
+        >>> df.groupby("Date")["Data"].transform("sum")
+        0     55
+        1    108
+        2     66
+        3    121
+        4     55
+        5    108
+        6     66
+        7    121
+        Name: Data, dtype: int64
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "c": [1, 1, 1, 2, 2, 2, 2],
+        ...         "type": ["m", "n", "o", "m", "m", "n", "n"],
+        ...     }
+        ... )
+        >>> df
+           c type
+        0  1    m
+        1  1    n
+        2  1    o
+        3  2    m
+        4  2    m
+        5  2    n
+        6  2    n
+        >>> df["size"] = df.groupby("c")["type"].transform(len)
+        >>> df
+           c type size
+        0  1    m    3
+        1  1    n    3
+        2  1    o    3
+        3  2    m    4
+        4  2    m    4
+        5  2    n    4
+        6  2    n    4
+        """
+        from pandas.core.apply import frame_apply
+
+        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
+        result = op.transform()
+        assert isinstance(result, DataFrame)
+        return result
+
+    def apply(
+        self,
+        func: AggFuncType,
+        axis: Axis = 0,
+        raw: bool = False,
+        result_type: Literal["expand", "reduce", "broadcast"] | None = None,
+        args=(),
+        by_row: Literal[False, "compat"] = "compat",
+        engine: Callable | None | Literal["python", "numba"] = None,
+        engine_kwargs: dict[str, bool] | None = None,
+        **kwargs,
+    ):
+        """
+        Apply a function along an axis of the DataFrame.
+
+        Objects passed to the function are Series objects whose index is
+        either the DataFrame's index (``axis=0``) or the DataFrame's columns
+        (``axis=1``). By default (``result_type=None``), the final return type
+        is inferred from the return type of the applied function. Otherwise,
+        it depends on the `result_type` argument. The return type of the applied
+        function is inferred based on the first computed result obtained after
+        applying the function to a Series object.
+
+        Parameters
+        ----------
+        func : function
+            Function to apply to each column or row.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Axis along which the function is applied:
+
+            * 0 or 'index': apply function to each column.
+            * 1 or 'columns': apply function to each row.
+
+        raw : bool, default False
+            Determines if row or column is passed as a Series or ndarray object:
+
+            * ``False`` : passes each row or column as a Series to the
+              function.
+            * ``True`` : the passed function will receive ndarray objects
+              instead.
+              If you are just applying a NumPy reduction function this will
+              achieve much better performance.
+
+         .. note::
+
+                When ``raw=True``, the result dtype is inferred from the **first**
+                returned value.
+
+        result_type : {'expand', 'reduce', 'broadcast', None}, default None
+            These only act when ``axis=1`` (columns):
+
+            * 'expand' : list-like results will be turned into columns.
+            * 'reduce' : returns a Series if possible rather than expanding
+              list-like results. This is the opposite of 'expand'.
+            * 'broadcast' : results will be broadcast to the original shape
+              of the DataFrame, the original index and columns will be
+              retained.
+
+            The default behaviour (None) depends on the return value of the
+            applied function: list-like results will be returned as a Series
+            of those. However if the apply function returns a Series these
+            are expanded to columns.
+        args : tuple
+            Positional arguments to pass to `func` in addition to the
+            array/series.
+        by_row : False or "compat", default "compat"
+            Only has an effect when ``func`` is a listlike or dictlike of funcs
+            and the func isn't a string.
+            If "compat", will if possible first translate the func into pandas
+            methods (e.g. ``Series().apply(np.sum)`` will be translated to
+            ``Series().sum()``). If that doesn't work, will try call to apply again with
+            ``by_row=True`` and if that fails, will call apply again with
+            ``by_row=False`` (backward compatible).
+            If False, the funcs will be passed the whole Series at once.
+
+            .. versionadded:: 2.1.0
+
+        engine : decorator or {'python', 'numba'}, optional
+            Choose the execution engine to use. If not provided the function
+            will be executed by the regular Python interpreter.
+
+            Other options include JIT compilers such Numba and Bodo, which in some
+            cases can speed up the execution. To use an executor you can provide
+            the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
+            also provide the decorator with parameters, like ``numba.jit(nogit=True)``.
+
+            Not all functions can be executed with all execution engines. In general,
+            JIT compilers will require type stability in the function (no variable
+            should change data type during the execution). And not all pandas and
+            NumPy APIs are supported. Check the engine documentation [1]_ and [2]_
+            for limitations.
+
+            .. warning::
+
+                String parameters will stop being supported in a future pandas version.
+
+            .. versionadded:: 2.2.0
+
+        engine_kwargs : dict
+            Pass keyword arguments to the engine.
+            This is currently only used by the numba engine,
+            see the documentation for the engine argument for more information.
+
+        **kwargs
+            Additional keyword arguments to pass as keywords arguments to
+            `func`.
+
+        Returns
+        -------
+        Series or DataFrame
+            Result of applying ``func`` along the given axis of the
+            DataFrame.
+
+        See Also
+        --------
+        DataFrame.map: For elementwise operations.
+        DataFrame.aggregate: Only perform aggregating type operations.
+        DataFrame.transform: Only perform transforming type operations.
+
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        References
+        ----------
+        .. [1] `Numba documentation
+                <https://numba.readthedocs.io/en/stable/index.html>`_
+        .. [2] `Bodo documentation
+                <https://docs.bodo.ai/latest/>`/
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
+        >>> df
+           A  B
+        0  4  9
+        1  4  9
+        2  4  9
+
+        Using a numpy universal function (in this case the same as
+        ``np.sqrt(df)``):
+
+        >>> df.apply(np.sqrt)
+             A    B
+        0  2.0  3.0
+        1  2.0  3.0
+        2  2.0  3.0
+
+        Using a reducing function on either axis
+
+        >>> df.apply(np.sum, axis=0)
+        A    12
+        B    27
+        dtype: int64
+
+        >>> df.apply(np.sum, axis=1)
+        0    13
+        1    13
+        2    13
+        dtype: int64
+
+        Returning a list-like will result in a Series
+
+        >>> df.apply(lambda x: [1, 2], axis=1)
+        0    [1, 2]
+        1    [1, 2]
+        2    [1, 2]
+        dtype: object
+
+        Passing ``result_type='expand'`` will expand list-like results
+        to columns of a Dataframe
+
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type="expand")
+           0  1
+        0  1  2
+        1  1  2
+        2  1  2
+
+        Returning a Series inside the function is similar to passing
+        ``result_type='expand'``. The resulting column names
+        will be the Series index.
+
+        >>> df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)
+           foo  bar
+        0    1    2
+        1    1    2
+        2    1    2
+
+        Passing ``result_type='broadcast'`` will ensure the same shape
+        result, whether list-like or scalar is returned by the function,
+        and broadcast it along the axis. The resulting column names will
+        be the originals.
+
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type="broadcast")
+           A  B
+        0  1  2
+        1  1  2
+        2  1  2
+
+        Advanced users can speed up their code by using a Just-in-time (JIT) compiler
+        with ``apply``. The main JIT compilers available for pandas are Numba and Bodo.
+        In general, JIT compilation is only possible when the function passed to
+        ``apply`` has type stability (variables in the function do not change their
+        type during the execution).
+
+        >>> import bodo  # doctest: +SKIP
+        >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit)  # doctest: +SKIP
+
+        Note that JIT compilation is only recommended for functions that take a
+        significant amount of time to run. Fast functions are unlikely to run faster
+        with JIT compilation.
+        """
+        if engine is None or isinstance(engine, str):
+            from pandas.core.apply import frame_apply
+
+            if engine is None:
+                engine = "python"
+
+            if engine not in ["python", "numba"]:
+                raise ValueError(f"Unknown engine '{engine}'")
+
+            op = frame_apply(
+                self,
+                func=func,
+                axis=axis,
+                raw=raw,
+                result_type=result_type,
+                by_row=by_row,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+                args=args,
+                kwargs=kwargs,
+            )
+            return op.apply().__finalize__(self, method="apply")
+        elif hasattr(engine, "__pandas_udf__"):
+            if result_type is not None:
+                raise NotImplementedError(
+                    f"{result_type=} only implemented for the default engine"
+                )
+
+            agg_axis = self._get_agg_axis(self._get_axis_number(axis))
+
+            # one axis is empty
+            if not all(self.shape):
+                func = cast(Callable, func)
+                try:
+                    if axis == 0:
+                        r = func(Series([], dtype=np.float64), *args, **kwargs)
+                    else:
+                        r = func(
+                            Series(index=self.columns, dtype=np.float64),
+                            *args,
+                            **kwargs,
+                        )
+                except Exception:
+                    pass
+                else:
+                    if not isinstance(r, Series):
+                        if len(agg_axis):
+                            r = func(Series([], dtype=np.float64), *args, **kwargs)
+                        else:
+                            r = np.nan
+
+                        return self._constructor_sliced(r, index=agg_axis)
+                return self.copy()
+
+            data: DataFrame | np.ndarray = self
+            if raw:
+                # This will upcast the whole DataFrame to the same type,
+                # and likely result in an object 2D array.
+                # We should probably pass a list of 1D arrays instead, at
+                # lest for ``axis=0``
+                data = self.values
+            result = engine.__pandas_udf__.apply(
+                data=data,
+                func=func,
+                args=args,
+                kwargs=kwargs,
+                decorator=engine,
+                axis=axis,
+            )
+            if raw:
+                if result.ndim == 2:
+                    return self._constructor(
+                        result, index=self.index, columns=self.columns
+                    )
+                else:
+                    return self._constructor_sliced(result, index=agg_axis)
+            return result
+        else:
+            raise ValueError(f"Unknown engine {engine}")
+
+    def map(
+        self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs
+    ) -> DataFrame:
+        """
+        Apply a function to a Dataframe elementwise.
+
+        .. versionadded:: 2.1.0
+
+           DataFrame.applymap was deprecated and renamed to DataFrame.map.
+
+        This method applies a function that accepts and returns a scalar
+        to every element of a DataFrame.
+
+        Parameters
+        ----------
+        func : callable
+            Python function, returns a single value from a single value.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NaN values, without passing them to func.
+        **kwargs
+            Additional keyword arguments to pass as keywords arguments to
+            `func`.
+
+        Returns
+        -------
+        DataFrame
+            Transformed DataFrame.
+
+        See Also
+        --------
+        DataFrame.apply : Apply a function along input axis of DataFrame.
+        DataFrame.replace: Replace values given in `to_replace` with `value`.
+        Series.map : Apply a function elementwise on a Series.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
+        >>> df
+               0      1
+        0  1.000  2.120
+        1  3.356  4.567
+
+        >>> df.map(lambda x: len(str(x)))
+           0  1
+        0  3  4
+        1  5  5
+
+        Like Series.map, NA values can be ignored:
+
+        >>> df_copy = df.copy()
+        >>> df_copy.iloc[0, 0] = pd.NA
+        >>> df_copy.map(lambda x: len(str(x)), na_action="ignore")
+             0  1
+        0  NaN  4
+        1  5.0  5
+
+        It is also possible to use `map` with functions that are not
+        `lambda` functions:
+
+        >>> df.map(round, ndigits=1)
+             0    1
+        0  1.0  2.1
+        1  3.4  4.6
+
+        Note that a vectorized version of `func` often exists, which will
+        be much faster. You could square each number elementwise.
+
+        >>> df.map(lambda x: x**2)
+                   0          1
+        0   1.000000   4.494400
+        1  11.262736  20.857489
+
+        But it's better to avoid map in that case.
+
+        >>> df**2
+                   0          1
+        0   1.000000   4.494400
+        1  11.262736  20.857489
+        """
+        if na_action not in {"ignore", None}:
+            raise ValueError(f"na_action must be 'ignore' or None. Got {na_action!r}")
+
+        if self.empty:
+            return self.copy()
+
+        func = functools.partial(func, **kwargs)
+
+        def infer(x):
+            return x._map_values(func, na_action=na_action)
+
+        return self.apply(infer).__finalize__(self, "map")
+
+    # ----------------------------------------------------------------------
+    # Merging / joining methods
+
+    def _append_internal(
+        self,
+        other: Series,
+        ignore_index: bool = False,
+    ) -> DataFrame:
+        assert isinstance(other, Series), type(other)
+
+        if other.name is None and not ignore_index:
+            raise TypeError(
+                "Can only append a Series if ignore_index=True "
+                "or if the Series has a name"
+            )
+
+        index = Index(
+            [other.name],
+            name=(
+                self.index.names
+                if isinstance(self.index, MultiIndex)
+                else self.index.name
+            ),
+        )
+
+        row_df = other.to_frame().T
+        if isinstance(self.index.dtype, ExtensionDtype):
+            # GH#41626 retain e.g. CategoricalDtype if reached via
+            #  df.loc[key] = item
+            row_df.index = self.index.array._cast_pointwise_result(row_df.index._values)
+
+        # infer_objects is needed for
+        #  test_append_empty_frame_to_series_with_dateutil_tz
+        row_df = row_df.infer_objects().rename_axis(index.names)
+
+        from pandas.core.reshape.concat import concat
+
+        result = concat(
+            [self, row_df],
+            ignore_index=ignore_index,
+        )
+        return result.__finalize__(self, method="append")
+
+    def join(
+        self,
+        other: DataFrame | Series | Iterable[DataFrame | Series],
+        on: IndexLabel | None = None,
+        how: MergeHow = "left",
+        lsuffix: str = "",
+        rsuffix: str = "",
+        sort: bool = False,
+        validate: JoinValidate | None = None,
+    ) -> DataFrame:
+        """
+        Join columns of another DataFrame.
+
+        Join columns with `other` DataFrame either on index or on a key
+        column. Efficiently join multiple DataFrame objects by index at once by
+        passing a list.
+
+        Parameters
+        ----------
+        other : DataFrame, Series, or a list containing any combination of them
+            Index should be similar to one of the columns in this one. If a
+            Series is passed, its name attribute must be set, and that will be
+            used as the column name in the resulting joined DataFrame.
+        on : str, list of str, or array-like, optional
+            Column or index level name(s) in the caller to join on the index
+            in `other`, otherwise joins index-on-index. If multiple
+            values given, the `other` DataFrame must have a MultiIndex. Can
+            pass an array as the join key if it is not already contained in
+            the calling DataFrame. Like an Excel VLOOKUP operation.
+        how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
+            default 'left'
+            How to handle the operation of the two objects.
+
+            * left: use calling frame's index (or column if on is specified)
+            * right: use `other`'s index.
+            * outer: form union of calling frame's index (or column if on is
+              specified) with `other`'s index, and sort it lexicographically.
+            * inner: form intersection of calling frame's index (or column if
+              on is specified) with `other`'s index, preserving the order
+              of the calling's one.
+            * cross: creates the cartesian product from both frames, preserves the order
+              of the left keys.
+            * left_anti: use set difference of calling frame's index and `other`'s
+              index.
+            * right_anti: use set difference of `other`'s index and calling frame's
+              index.
+        lsuffix : str, default ''
+            Suffix to use from left frame's overlapping columns.
+        rsuffix : str, default ''
+            Suffix to use from right frame's overlapping columns.
+        sort : bool, default False
+            Order result DataFrame lexicographically by the join key. If False,
+            the order of the join key depends on the join type (how keyword).
+        validate : str, optional
+            If specified, checks if join is of specified type.
+
+            * "one_to_one" or "1:1": check if join keys are unique in both left
+              and right datasets.
+            * "one_to_many" or "1:m": check if join keys are unique in left dataset.
+            * "many_to_one" or "m:1": check if join keys are unique in right dataset.
+            * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+        Returns
+        -------
+        DataFrame
+            A dataframe containing columns from both the caller and `other`.
+
+        See Also
+        --------
+        DataFrame.merge : For column(s)-on-column(s) operations.
+
+        Notes
+        -----
+        Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
+        passing a list of `DataFrame` objects.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "key": ["K0", "K1", "K2", "K3", "K4", "K5"],
+        ...         "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
+        ...     }
+        ... )
+
+        >>> df
+          key   A
+        0  K0  A0
+        1  K1  A1
+        2  K2  A2
+        3  K3  A3
+        4  K4  A4
+        5  K5  A5
+
+        >>> other = pd.DataFrame({"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]})
+
+        >>> other
+          key   B
+        0  K0  B0
+        1  K1  B1
+        2  K2  B2
+
+        Join DataFrames using their indexes.
+
+        >>> df.join(other, lsuffix="_caller", rsuffix="_other")
+          key_caller   A key_other    B
+        0         K0  A0        K0   B0
+        1         K1  A1        K1   B1
+        2         K2  A2        K2   B2
+        3         K3  A3       NaN  NaN
+        4         K4  A4       NaN  NaN
+        5         K5  A5       NaN  NaN
+
+        If we want to join using the key columns, we need to set key to be
+        the index in both `df` and `other`. The joined DataFrame will have
+        key as its index.
+
+        >>> df.set_index("key").join(other.set_index("key"))
+              A    B
+        key
+        K0   A0   B0
+        K1   A1   B1
+        K2   A2   B2
+        K3   A3  NaN
+        K4   A4  NaN
+        K5   A5  NaN
+
+        Another option to join using the key columns is to use the `on`
+        parameter. DataFrame.join always uses `other`'s index but we can use
+        any column in `df`. This method preserves the original DataFrame's
+        index in the result.
+
+        >>> df.join(other.set_index("key"), on="key")
+          key   A    B
+        0  K0  A0   B0
+        1  K1  A1   B1
+        2  K2  A2   B2
+        3  K3  A3  NaN
+        4  K4  A4  NaN
+        5  K5  A5  NaN
+
+        Using non-unique key values shows how they are matched.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "key": ["K0", "K1", "K1", "K3", "K0", "K1"],
+        ...         "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
+        ...     }
+        ... )
+
+        >>> df
+          key   A
+        0  K0  A0
+        1  K1  A1
+        2  K1  A2
+        3  K3  A3
+        4  K0  A4
+        5  K1  A5
+
+        >>> df.join(other.set_index("key"), on="key", validate="m:1")
+          key   A    B
+        0  K0  A0   B0
+        1  K1  A1   B1
+        2  K1  A2   B1
+        3  K3  A3  NaN
+        4  K0  A4   B0
+        5  K1  A5   B1
+        """
+        from pandas.core.reshape.concat import concat
+        from pandas.core.reshape.merge import merge
+
+        if isinstance(other, Series):
+            if other.name is None:
+                raise ValueError("Other Series must have a name")
+            other = DataFrame({other.name: other})
+
+        if isinstance(other, DataFrame):
+            if how == "cross":
+                return merge(
+                    self,
+                    other,
+                    how=how,
+                    on=on,
+                    suffixes=(lsuffix, rsuffix),
+                    sort=sort,
+                    validate=validate,
+                )
+            return merge(
+                self,
+                other,
+                left_on=on,
+                how=how,
+                left_index=on is None,
+                right_index=True,
+                suffixes=(lsuffix, rsuffix),
+                sort=sort,
+                validate=validate,
+            )
+        else:
+            if on is not None:
+                raise ValueError(
+                    "Joining multiple DataFrames only supported for joining on index"
+                )
+
+            if rsuffix or lsuffix:
+                raise ValueError(
+                    "Suffixes not supported when joining multiple DataFrames"
+                )
+
+            # Mypy thinks the RHS is a
+            # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
+            # the LHS is an "Iterable[DataFrame]", but in reality both types are
+            # "Iterable[Union[DataFrame, Series]]" due to the if statements
+            frames = [cast("DataFrame | Series", self), *list(other)]
+
+            can_concat = all(df.index.is_unique for df in frames)
+
+            # join indexes only using concat
+            if can_concat:
+                if how in {"left", "right"}:
+                    res = concat(
+                        frames, axis=1, join="outer", verify_integrity=True, sort=sort
+                    )
+                    index = self.index if how == "left" else frames[-1].index
+                    if sort:
+                        index = index.sort_values()
+                    result = res.reindex(index)
+                    return result
+                else:
+                    if how == "outer":
+                        sort = True
+                    return concat(
+                        frames, axis=1, join=how, verify_integrity=True, sort=sort
+                    )
+
+            joined = frames[0]
+
+            for frame in frames[1:]:
+                joined = merge(
+                    joined,
+                    frame,
+                    sort=sort,
+                    how=how,
+                    left_index=True,
+                    right_index=True,
+                    validate=validate,
+                )
+
+            return joined
+
+    @Substitution("")
+    @Appender(_merge_doc, indents=2)
+    def merge(
+        self,
+        right: DataFrame | Series,
+        how: MergeHow = "inner",
+        on: IndexLabel | AnyArrayLike | None = None,
+        left_on: IndexLabel | AnyArrayLike | None = None,
+        right_on: IndexLabel | AnyArrayLike | None = None,
+        left_index: bool = False,
+        right_index: bool = False,
+        sort: bool = False,
+        suffixes: Suffixes = ("_x", "_y"),
+        copy: bool | lib.NoDefault = lib.no_default,
+        indicator: str | bool = False,
+        validate: MergeValidate | None = None,
+    ) -> DataFrame:
+        self._check_copy_deprecation(copy)
+
+        from pandas.core.reshape.merge import merge
+
+        return merge(
+            self,
+            right,
+            how=how,
+            on=on,
+            left_on=left_on,
+            right_on=right_on,
+            left_index=left_index,
+            right_index=right_index,
+            sort=sort,
+            suffixes=suffixes,
+            indicator=indicator,
+            validate=validate,
+        )
+
+    def round(
+        self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
+    ) -> DataFrame:
+        """
+        Round numeric columns in a DataFrame to a variable number of decimal places.
+
+        Parameters
+        ----------
+        decimals : int, dict, Series
+            Number of decimal places to round each column to. If an int is
+            given, round each column to the same number of places.
+            Otherwise dict and Series round to variable numbers of places.
+            Column names should be in the keys if `decimals` is a
+            dict-like, or in the index if `decimals` is a Series. Any
+            columns not included in `decimals` will be left as is. Elements
+            of `decimals` which are not columns of the input will be
+            ignored.
+        *args
+            Additional keywords have no effect but might be accepted for
+            compatibility with numpy.
+        **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with numpy.
+
+        Returns
+        -------
+        DataFrame
+            A DataFrame with the affected columns rounded to the specified
+            number of decimal places.
+
+        See Also
+        --------
+        numpy.around : Round a numpy array to the given number of decimals.
+        Series.round : Round a Series to the given number of decimals.
+
+        Notes
+        -----
+        For values exactly halfway between rounded decimal values, pandas rounds
+        to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5
+        round to 2.0, etc.).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)],
+        ...     columns=["dogs", "cats"],
+        ... )
+        >>> df
+            dogs  cats
+        0  0.21  0.32
+        1  0.01  0.67
+        2  0.66  0.03
+        3  0.21  0.18
+
+        By providing an integer each column is rounded to the same number
+        of decimal places
+
+        >>> df.round(1)
+            dogs  cats
+        0   0.2   0.3
+        1   0.0   0.7
+        2   0.7   0.0
+        3   0.2   0.2
+
+        With a dict, the number of places for specific columns can be
+        specified with the column names as key and the number of decimal
+        places as value
+
+        >>> df.round({"dogs": 1, "cats": 0})
+            dogs  cats
+        0   0.2   0.0
+        1   0.0   1.0
+        2   0.7   0.0
+        3   0.2   0.0
+
+        Using a Series, the number of places for specific columns can be
+        specified with the column names as index and the number of
+        decimal places as value
+
+        >>> decimals = pd.Series([0, 1], index=["cats", "dogs"])
+        >>> df.round(decimals)
+            dogs  cats
+        0   0.2   0.0
+        1   0.0   1.0
+        2   0.7   0.0
+        3   0.2   0.0
+        """
+        from pandas.core.reshape.concat import concat
+
+        def _dict_round(df: DataFrame, decimals) -> Iterator[Series]:
+            for col, vals in df.items():
+                try:
+                    yield _series_round(vals, decimals[col])
+                except KeyError:
+                    yield vals
+
+        def _series_round(ser: Series, decimals: int) -> Series:
+            if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
+                return ser.round(decimals)
+            elif isinstance(ser._values, (DatetimeArray, TimedeltaArray, PeriodArray)):
+                # GH#57781
+                # TODO: also the ArrowDtype analogues?
+                warnings.warn(
+                    "obj.round has no effect with datetime, timedelta, "
+                    "or period dtypes. Use obj.dt.round(...) instead.",
+                    UserWarning,
+                    stacklevel=find_stack_level(),
+                )
+            return ser
+
+        nv.validate_round(args, kwargs)
+
+        if isinstance(decimals, (dict, Series)):
+            if isinstance(decimals, Series) and not decimals.index.is_unique:
+                raise ValueError("Index of decimals must be unique")
+            if is_dict_like(decimals) and not all(
+                is_integer(value) for _, value in decimals.items()
+            ):
+                raise TypeError("Values in decimals must be integers")
+            new_cols = list(_dict_round(self, decimals))
+        elif is_integer(decimals):
+            # Dispatch to Block.round
+            # Argument "decimals" to "round" of "BaseBlockManager" has incompatible
+            # type "Union[int, integer[Any]]"; expected "int"
+            new_mgr = self._mgr.round(
+                decimals=decimals,  # type: ignore[arg-type]
+            )
+            return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
+                self, method="round"
+            )
+        else:
+            raise TypeError("decimals must be an integer, a dict-like or a Series")
+
+        if new_cols is not None and len(new_cols) > 0:
+            return self._constructor(
+                concat(new_cols, axis=1), index=self.index, columns=self.columns
+            ).__finalize__(self, method="round")
+        else:
+            return self.copy(deep=False)
+
+    # ----------------------------------------------------------------------
+    # Statistical methods, etc.
+
+    def corr(
+        self,
+        method: CorrelationMethod = "pearson",
+        min_periods: int = 1,
+        numeric_only: bool = False,
+    ) -> DataFrame:
+        """
+        Compute pairwise correlation of columns, excluding NA/null values.
+
+        Parameters
+        ----------
+        method : {'pearson', 'kendall', 'spearman'} or callable
+            Method of correlation:
+
+            * pearson : standard correlation coefficient
+            * kendall : Kendall Tau correlation coefficient
+            * spearman : Spearman rank correlation
+            * callable: callable with input two 1d ndarrays
+                and returning a float. Note that the returned matrix from corr
+                will have 1 along the diagonals and will be symmetric
+                regardless of the callable's behavior.
+        min_periods : int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result. Currently only available for Pearson
+            and Spearman correlation.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+                The default value of ``numeric_only`` is now ``False``.
+
+        Returns
+        -------
+        DataFrame
+            Correlation matrix.
+
+        See Also
+        --------
+        DataFrame.corrwith : Compute pairwise correlation with another
+            DataFrame or Series.
+        Series.corr : Compute the correlation between two Series.
+
+        Notes
+        -----
+        Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
+
+        * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
+        * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
+        * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
+
+        Examples
+        --------
+        >>> def histogram_intersection(a, b):
+        ...     v = np.minimum(a, b).sum().round(decimals=1)
+        ...     return v
+        >>> df = pd.DataFrame(
+        ...     [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)],
+        ...     columns=["dogs", "cats"],
+        ... )
+        >>> df.corr(method=histogram_intersection)
+              dogs  cats
+        dogs   1.0   0.3
+        cats   0.3   1.0
+
+        >>> df = pd.DataFrame(
+        ...     [(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], columns=["dogs", "cats"]
+        ... )
+        >>> df.corr(min_periods=3)
+              dogs  cats
+        dogs   1.0   NaN
+        cats   NaN   1.0
+        """  # noqa: E501
+        data = self._get_numeric_data() if numeric_only else self
+        cols = data.columns
+        idx = cols.copy()
+        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
+
+        if method == "pearson":
+            correl = libalgos.nancorr(mat, minp=min_periods)
+        elif method == "spearman":
+            correl = libalgos.nancorr_spearman(mat, minp=min_periods)
+        elif method == "kendall" or callable(method):
+            if min_periods is None:
+                min_periods = 1
+            mat = mat.T
+            corrf = nanops.get_corr_func(method)
+            K = len(cols)
+            correl = np.empty((K, K), dtype=float)
+            mask = np.isfinite(mat)
+            for i, ac in enumerate(mat):
+                for j, bc in enumerate(mat):
+                    if i > j:
+                        continue
+
+                    valid = mask[i] & mask[j]
+                    if valid.sum() < min_periods:
+                        c = np.nan
+                    elif i == j:
+                        c = 1.0
+                    elif not valid.all():
+                        c = corrf(ac[valid], bc[valid])
+                    else:
+                        c = corrf(ac, bc)
+                    correl[i, j] = c
+                    correl[j, i] = c
+        else:
+            raise ValueError(
+                "method must be either 'pearson', "
+                "'spearman', 'kendall', or a callable, "
+                f"'{method}' was supplied"
+            )
+
+        result = self._constructor(correl, index=idx, columns=cols, copy=False)
+        return result.__finalize__(self, method="corr")
+
+    def cov(
+        self,
+        min_periods: int | None = None,
+        ddof: int | None = 1,
+        numeric_only: bool = False,
+    ) -> DataFrame:
+        """
+        Compute pairwise covariance of columns, excluding NA/null values.
+
+        Compute the pairwise covariance among the series of a DataFrame.
+        The returned data frame is the `covariance matrix
+        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
+        of the DataFrame.
+
+        Both NA and null values are automatically excluded from the
+        calculation. (See the note below about bias from missing values.)
+        A threshold can be set for the minimum number of
+        observations for each value created. Comparisons with observations
+        below this threshold will be returned as ``NaN``.
+
+        This method is generally used for the analysis of time series data to
+        understand the relationship between different measures
+        across time.
+
+        Parameters
+        ----------
+        min_periods : int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result.
+
+        ddof : int, default 1
+            Delta degrees of freedom.  The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.
+            This argument is applicable only when no ``nan`` is in the dataframe.
+
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+                The default value of ``numeric_only`` is now ``False``.
+
+        Returns
+        -------
+        DataFrame
+            The covariance matrix of the series of the DataFrame.
+
+        See Also
+        --------
+        Series.cov : Compute covariance with another Series.
+        core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
+            covariance.
+        core.window.expanding.Expanding.cov : Expanding sample covariance.
+        core.window.rolling.Rolling.cov : Rolling sample covariance.
+
+        Notes
+        -----
+        Returns the covariance matrix of the DataFrame's time series.
+        The covariance is normalized by N-ddof.
+
+        For DataFrames that have Series that are missing data (assuming that
+        data is `missing at random
+        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
+        the returned covariance matrix will be an unbiased estimate
+        of the variance and covariance between the member Series.
+
+        However, for many applications this estimate may not be acceptable
+        because the estimate covariance matrix is not guaranteed to be positive
+        semi-definite. This could lead to estimate correlations having
+        absolute values which are greater than one, and/or a non-invertible
+        covariance matrix. See `Estimation of covariance matrices
+        <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
+        matrices>`__ for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [(1, 2), (0, 3), (2, 0), (1, 1)], columns=["dogs", "cats"]
+        ... )
+        >>> df.cov()
+                  dogs      cats
+        dogs  0.666667 -1.000000
+        cats -1.000000  1.666667
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(
+        ...     np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]
+        ... )
+        >>> df.cov()
+                  a         b         c         d         e
+        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
+        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
+        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
+        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
+        e  0.014144  0.009826 -0.000271 -0.013692  0.977795
+
+        **Minimum number of periods**
+
+        This method also supports an optional ``min_periods`` keyword
+        that specifies the required minimum number of non-NA observations for
+        each column pair in order to have a valid result:
+
+        >>> np.random.seed(42)
+        >>> df = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
+        >>> df.loc[df.index[:5], "a"] = np.nan
+        >>> df.loc[df.index[5:10], "b"] = np.nan
+        >>> df.cov(min_periods=12)
+                  a         b         c
+        a  0.316741       NaN -0.150812
+        b       NaN  1.248003  0.191417
+        c -0.150812  0.191417  0.895202
+        """
+        data = self._get_numeric_data() if numeric_only else self
+        if any(blk.dtype.kind in "mM" for blk in self._mgr.blocks):
+            msg = (
+                "DataFrame contains columns with dtype datetime64 "
+                "or timedelta64, which are not supported for cov."
+            )
+            raise TypeError(msg)
+        cols = data.columns
+        idx = cols.copy()
+        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
+
+        if notna(mat).all():
+            if min_periods is not None and min_periods > len(mat):
+                base_cov = np.empty((mat.shape[1], mat.shape[1]))
+                base_cov.fill(np.nan)
+            else:
+                base_cov = np.cov(mat.T, ddof=ddof)
+            base_cov = base_cov.reshape((len(cols), len(cols)))
+        else:
+            base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
+
+        result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
+        return result.__finalize__(self, method="cov")
+
+    def corrwith(
+        self,
+        other: DataFrame | Series,
+        axis: Axis = 0,
+        drop: bool = False,
+        method: CorrelationMethod = "pearson",
+        numeric_only: bool = False,
+        min_periods: int | None = None,
+    ) -> Series:
+        """
+        Compute pairwise correlation.
+
+        Pairwise correlation is computed between rows or columns of
+        DataFrame with rows or columns of Series or DataFrame. DataFrames
+        are first aligned along both axes before computing the
+        correlations.
+
+        Parameters
+        ----------
+        other : DataFrame, Series
+            Object with which to compute correlations.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
+            column-wise.
+        drop : bool, default False
+            Drop missing indices from result.
+        method : {'pearson', 'kendall', 'spearman'} or callable
+            Method of correlation:
+
+            * pearson : standard correlation coefficient
+            * kendall : Kendall Tau correlation coefficient
+            * spearman : Spearman rank correlation
+            * callable: callable with input two 1d ndarrays
+                and returning a float.
+
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result.
+
+            .. versionchanged:: 2.0.0
+                The default value of ``numeric_only`` is now ``False``.
+
+        Returns
+        -------
+        Series
+            Pairwise correlations.
+
+        See Also
+        --------
+        DataFrame.corr : Compute pairwise correlation of columns.
+
+        Examples
+        --------
+        >>> index = ["a", "b", "c", "d", "e"]
+        >>> columns = ["one", "two", "three", "four"]
+        >>> df1 = pd.DataFrame(
+        ...     np.arange(20).reshape(5, 4), index=index, columns=columns
+        ... )
+        >>> df2 = pd.DataFrame(
+        ...     np.arange(16).reshape(4, 4), index=index[:4], columns=columns
+        ... )
+        >>> df1.corrwith(df2)
+        one      1.0
+        two      1.0
+        three    1.0
+        four     1.0
+        dtype: float64
+
+        >>> df2.corrwith(df1, axis=1)
+        a    1.0
+        b    1.0
+        c    1.0
+        d    1.0
+        e    NaN
+        dtype: float64
+        """
+        axis = self._get_axis_number(axis)
+        this = self._get_numeric_data() if numeric_only else self
+
+        if isinstance(other, Series):
+            return this.apply(
+                lambda x: other.corr(x, method=method, min_periods=min_periods),
+                axis=axis,
+            )
+
+        if numeric_only:
+            other = other._get_numeric_data()
+        left, right = this.align(other, join="inner")
+
+        if axis == 1:
+            left = left.T
+            right = right.T
+
+        if method == "pearson":
+            # mask missing values
+            left = left + right * 0
+            right = right + left * 0
+
+            # demeaned data
+            ldem = left - left.mean(numeric_only=numeric_only)
+            rdem = right - right.mean(numeric_only=numeric_only)
+
+            num = (ldem * rdem).sum()
+            dom = (
+                (left.count() - 1)
+                * left.std(numeric_only=numeric_only)
+                * right.std(numeric_only=numeric_only)
+            )
+
+            correl = num / dom
+
+        elif method in ["kendall", "spearman"] or callable(method):
+
+            def c(x):
+                return nanops.nancorr(x[0], x[1], method=method)
+
+            correl = self._constructor_sliced(
+                map(c, zip(left.values.T, right.values.T, strict=True)),
+                index=left.columns,
+                copy=False,
+            )
+
+        else:
+            raise ValueError(
+                f"Invalid method {method} was passed, "
+                "valid methods are: 'pearson', 'kendall', "
+                "'spearman', or callable"
+            )
+
+        if not drop:
+            # Find non-matching labels along the given axis
+            # and append missing correlations (GH 22375)
+            raxis: AxisInt = 1 if axis == 0 else 0
+            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
+            idx_diff = result_index.difference(correl.index)
+
+            if len(idx_diff) > 0:
+                correl = correl._append_internal(
+                    Series([np.nan] * len(idx_diff), index=idx_diff)
+                )
+
+        return correl
+
+    # ----------------------------------------------------------------------
+    # ndarray-like stats methods
+
+    def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series:
+        """
+        Count non-NA cells for each column or row.
+
+        The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            If 0 or 'index' counts are generated for each column.
+            If 1 or 'columns' counts are generated for each row.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+        Returns
+        -------
+        Series
+            For each column/row the number of non-NA/null entries.
+
+        See Also
+        --------
+        Series.count: Number of non-NA elements in a Series.
+        DataFrame.value_counts: Count unique combinations of columns.
+        DataFrame.shape: Number of DataFrame rows and columns (including NA
+            elements).
+        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
+            elements.
+
+        Examples
+        --------
+        Constructing DataFrame from a dictionary:
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Person": ["John", "Myla", "Lewis", "John", "Myla"],
+        ...         "Age": [24.0, np.nan, 21.0, 33, 26],
+        ...         "Single": [False, True, True, True, False],
+        ...     }
+        ... )
+        >>> df
+           Person   Age  Single
+        0    John  24.0   False
+        1    Myla   NaN    True
+        2   Lewis  21.0    True
+        3    John  33.0    True
+        4    Myla  26.0   False
+
+        Notice the uncounted NA values:
+
+        >>> df.count()
+        Person    5
+        Age       4
+        Single    5
+        dtype: int64
+
+        Counts for each **row**:
+
+        >>> df.count(axis="columns")
+        0    3
+        1    2
+        2    3
+        3    3
+        4    3
+        dtype: int64
+        """
+        axis = self._get_axis_number(axis)
+
+        if numeric_only:
+            frame = self._get_numeric_data()
+        else:
+            frame = self
+
+        # GH #423
+        if len(frame._get_axis(axis)) == 0:
+            result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
+        else:
+            result = notna(frame).sum(axis=axis)
+
+        return result.astype("int64").__finalize__(self, method="count")
+
+    def _reduce(
+        self,
+        op,
+        name: str,
+        *,
+        axis: Axis = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        filter_type=None,
+        **kwds,
+    ):
+        assert filter_type is None or filter_type == "bool", filter_type
+        out_dtype = "bool" if filter_type == "bool" else None
+
+        if axis is not None:
+            axis = self._get_axis_number(axis)
+
+        def func(values: np.ndarray):
+            # We only use this in the case that operates on self.values
+            return op(values, axis=axis, skipna=skipna, **kwds)
+
+        def blk_func(values, axis: Axis = 1):
+            if isinstance(values, ExtensionArray):
+                if not is_1d_only_ea_dtype(values.dtype):
+                    return values._reduce(name, axis=1, skipna=skipna, **kwds)
+                return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
+            else:
+                return op(values, axis=axis, skipna=skipna, **kwds)
+
+        def _get_data() -> DataFrame:
+            if filter_type is None:
+                data = self._get_numeric_data()
+            else:
+                # GH#25101, GH#24434
+                assert filter_type == "bool"
+                data = self._get_bool_data()
+            return data
+
+        # Case with EAs see GH#35881
+        df = self
+        if numeric_only:
+            df = _get_data()
+        if axis is None:
+            dtype = find_common_type([block.values.dtype for block in df._mgr.blocks])
+            if isinstance(dtype, ExtensionDtype):
+                df = df.astype(dtype)
+                arr = concat_compat(list(df._iter_column_arrays()))
+                return arr._reduce(name, skipna=skipna, keepdims=False, **kwds)
+            return maybe_unbox_numpy_scalar(func(df.values))
+        elif axis == 1:
+            if len(df.index) == 0:
+                # Taking a transpose would result in no columns, losing the dtype.
+                # In the empty case, reducing along axis 0 or 1 gives the same
+                # result dtype, so reduce with axis=0 and ignore values
+                result = df._reduce(
+                    op,
+                    name,
+                    axis=0,
+                    skipna=skipna,
+                    numeric_only=False,
+                    filter_type=filter_type,
+                    **kwds,
+                ).iloc[:0]
+                result.index = df.index
+                return result
+
+            if df.shape[1]:
+                dtype = find_common_type(
+                    [block.values.dtype for block in df._mgr.blocks]
+                )
+                if isinstance(dtype, ExtensionDtype):
+                    # GH 54341: fastpath for EA-backed axis=1 reductions
+                    # This flattens the frame into a single 1D array while keeping
+                    # track of the row and column indices of the original frame. Once
+                    # flattened, grouping by the row indices and aggregating should
+                    # be equivalent to transposing the original frame and aggregating
+                    # with axis=0.
+                    name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
+                    df = df.astype(dtype)
+                    arr = concat_compat(list(df._iter_column_arrays()))
+                    nrows, ncols = df.shape
+                    row_index = np.tile(np.arange(nrows), ncols)
+                    col_index = np.repeat(np.arange(ncols), nrows)
+                    ser = Series(arr, index=col_index, copy=False)
+                    if name == "all":
+                        # Behavior here appears incorrect; preserving
+                        # for backwards compatibility for now.
+                        # See https://github.com/pandas-dev/pandas/issues/57171
+                        skipna = True
+                    result = ser.groupby(row_index).agg(name, **kwds, skipna=skipna)
+                    result.index = df.index
+                    return result
+
+            df = df.T
+
+        # After possibly _get_data and transposing, we are now in the
+        #  simple case where we can use BlockManager.reduce
+        res = df._mgr.reduce(blk_func)
+        out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
+        out.name = None
+        if out_dtype is not None and out.dtype != "boolean":
+            out = out.astype(out_dtype)
+        elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]:
+            out = out.astype(object)
+        elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"):
+            # Even if we are object dtype, follow numpy and return
+            #  float64, see test_apply_funcs_over_empty
+            out = out.astype(np.float64)
+
+        return out
+
+    def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
+        """
+        Special case for _reduce to try to avoid a potentially-expensive transpose.
+
+        Apply the reduction block-wise along axis=1 and then reduce the resulting
+        1D arrays.
+        """
+        if name == "all":
+            result = np.ones(len(self), dtype=bool)
+            ufunc = np.logical_and
+        elif name == "any":
+            result = np.zeros(len(self), dtype=bool)
+            # error: Incompatible types in assignment
+            # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
+            # Literal[20], Literal[False]]", variable has type
+            # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
+            # Literal[True]]")
+            ufunc = np.logical_or  # type: ignore[assignment]
+        else:
+            raise NotImplementedError(name)
+
+        for blocks in self._mgr.blocks:
+            middle = func(blocks.values, axis=0, skipna=skipna)
+            result = ufunc(result, middle)
+
+        res_ser = self._constructor_sliced(result, index=self.index, copy=False)
+        return res_ser
+
+    # error: Signature of "any" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def any(
+        self,
+        *,
+        axis: Axis = ...,
+        bool_only: bool = ...,
+        skipna: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def any(
+        self,
+        *,
+        axis: None,
+        bool_only: bool = ...,
+        skipna: bool = ...,
+        **kwargs,
+    ) -> bool: ...
+
+    @overload
+    def any(
+        self,
+        *,
+        axis: Axis | None,
+        bool_only: bool = ...,
+        skipna: bool = ...,
+        **kwargs,
+    ) -> Series | bool: ...
+
+    def any(
+        self,
+        *,
+        axis: Axis | None = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> Series | bool:
+        """
+        Return whether any element is True, potentially over an axis.
+
+        Returns False unless there is at least one element within a series or
+        along a Dataframe axis that is True or equivalent (e.g. non-zero or
+        non-empty).
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Indicate which axis or axes should be reduced. For `Series` this parameter
+            is unused and defaults to 0.
+
+            * 0 / 'index' : reduce the index, return a Series whose index is the
+              original column labels.
+            * 1 / 'columns' : reduce the columns, return a Series whose index is the
+              original index.
+            * None : reduce all axes, return a scalar.
+
+        bool_only : bool, default False
+            Include only boolean columns. Not implemented for Series.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire row/column is NA and skipna is
+            True, then the result will be False, as for an empty row/column.
+            If skipna is False, then NA are treated as True, because these are not
+            equal to zero.
+        **kwargs : any, default None
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or scalar
+            If axis=None, then a scalar boolean is returned.
+            Otherwise a Series is returned with index matching the index argument.
+
+        See Also
+        --------
+        numpy.any : Numpy version of this method.
+        Series.any : Return whether any element is True.
+        Series.all : Return whether all elements are True.
+        DataFrame.any : Return whether any element is True over requested axis.
+        DataFrame.all : Return whether all elements are True over requested axis.
+
+        Examples
+        --------
+        **Series**
+
+        For Series input, the output is a scalar indicating whether any element
+        is True.
+
+        >>> pd.Series([False, False]).any()
+        False
+        >>> pd.Series([True, False]).any()
+        True
+        >>> pd.Series([], dtype="float64").any()
+        False
+        >>> pd.Series([np.nan]).any()
+        False
+        >>> pd.Series([np.nan]).any(skipna=False)
+        True
+
+        **DataFrame**
+
+        Whether each column contains at least one True element (the default).
+
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
+        >>> df
+           A  B  C
+        0  1  0  0
+        1  2  2  0
+
+        >>> df.any()
+        A     True
+        B     True
+        C    False
+        dtype: bool
+
+        Aggregating over the columns.
+
+        >>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
+        >>> df
+               A  B
+        0   True  1
+        1  False  2
+
+        >>> df.any(axis="columns")
+        0    True
+        1    True
+        dtype: bool
+
+        >>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
+        >>> df
+               A  B
+        0   True  1
+        1  False  0
+
+        >>> df.any(axis="columns")
+        0    True
+        1    False
+        dtype: bool
+
+        Aggregating over the entire DataFrame with ``axis=None``.
+
+        >>> df.any(axis=None)
+        True
+
+        `any` for an empty DataFrame is an empty Series.
+
+        >>> pd.DataFrame([]).any()
+        Series([], dtype: bool)
+        """
+        result = self._logical_func(
+            "any", nanops.nanany, axis, bool_only, skipna, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="any")
+        return result
+
+    @overload
+    def all(
+        self,
+        *,
+        axis: Axis = ...,
+        bool_only: bool = ...,
+        skipna: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def all(
+        self,
+        *,
+        axis: None,
+        bool_only: bool = ...,
+        skipna: bool = ...,
+        **kwargs,
+    ) -> bool: ...
+
+    @overload
+    def all(
+        self,
+        *,
+        axis: Axis | None,
+        bool_only: bool = ...,
+        skipna: bool = ...,
+        **kwargs,
+    ) -> Series | bool: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="all")
+    def all(
+        self,
+        axis: Axis | None = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> Series | bool:
+        """
+        Return whether all elements are True, potentially over an axis.
+
+        Returns True unless there at least one element within a series or
+        along a Dataframe axis that is False or equivalent (e.g. zero or
+        empty).
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Indicate which axis or axes should be reduced. For `Series` this parameter
+            is unused and defaults to 0.
+
+            * 0 / 'index' : reduce the index, return a Series whose index is the
+              original column labels.
+            * 1 / 'columns' : reduce the columns, return a Series whose index is the
+              original index.
+            * None : reduce all axes, return a scalar.
+
+        bool_only : bool, default False
+            Include only boolean columns. Not implemented for Series.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire row/column is NA and skipna is
+            True, then the result will be True, as for an empty row/column.
+            If skipna is False, then NA are treated as True, because these are not
+            equal to zero.
+        **kwargs : any, default None
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or scalar
+            If axis=None, then a scalar boolean is returned.
+            Otherwise a Series is returned with index matching the index argument.
+
+        See Also
+        --------
+        Series.all : Return True if all elements are True.
+        DataFrame.any : Return True if one (or more) elements are True.
+
+        Examples
+        --------
+        **Series**
+
+        >>> pd.Series([True, True]).all()
+        True
+        >>> pd.Series([True, False]).all()
+        False
+        >>> pd.Series([], dtype="float64").all()
+        True
+        >>> pd.Series([np.nan]).all()
+        True
+        >>> pd.Series([np.nan]).all(skipna=False)
+        True
+
+        **DataFrames**
+
+        Create a DataFrame from a dictionary.
+
+        >>> df = pd.DataFrame({"col1": [True, True], "col2": [True, False]})
+        >>> df
+           col1   col2
+        0  True   True
+        1  True  False
+
+        Default behaviour checks if values in each column all return True.
+
+        >>> df.all()
+        col1     True
+        col2    False
+        dtype: bool
+
+        Specify ``axis='columns'`` to check if values in each row all return True.
+
+        >>> df.all(axis="columns")
+        0     True
+        1    False
+        dtype: bool
+
+        Or ``axis=None`` for whether every value is True.
+
+        >>> df.all(axis=None)
+        False
+        """
+        result = self._logical_func(
+            "all", nanops.nanall, axis, bool_only, skipna, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="all")
+        return result
+
+    # error: Signature of "min" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def min(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def min(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def min(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="min")
+    def min(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return the minimum of the values over the requested axis.
+
+        If you want the *index* of the minimum, use ``idxmin``.
+        This is the equivalent of the ``numpy.ndarray`` method ``argmin``.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Value containing the calculation referenced in the description.
+
+        See Also
+        --------
+        Series.sum : Return the sum.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.sum : Return the sum over the requested axis.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.min()
+        0
+        """
+        result = super().min(
+            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="min")
+        return result
+
+    # error: Signature of "max" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def max(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def max(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def max(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="max")
+    def max(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return the maximum of the values over the requested axis.
+
+        If you want the *index* of the maximum, use ``idxmax``.
+        This is the equivalent of the ``numpy.ndarray`` method ``argmax``.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Value containing the calculation referenced in the description.
+
+        See Also
+        --------
+        Series.sum : Return the sum.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.sum : Return the sum over the requested axis.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.max()
+        8
+        """
+        result = super().max(
+            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="max")
+        return result
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="sum")
+    def sum(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ) -> Series:
+        """
+        Return the sum of the values over the requested axis.
+
+        This is equivalent to the method ``numpy.sum``.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.sum with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer than
+            ``min_count`` non-NA values are present the result will be NA.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Sum over requested axis.
+
+        See Also
+        --------
+        Series.sum : Return the sum over Series values.
+        DataFrame.mean : Return the mean of the values over the requested axis.
+        DataFrame.median : Return the median of the values over the requested axis.
+        DataFrame.mode : Get the mode(s) of each element along the requested axis.
+        DataFrame.std : Return the standard deviation of the values over the
+            requested axis.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.sum()
+        14
+
+        By default, the sum of an empty or all-NA Series is ``0``.
+
+        >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
+        0.0
+
+        This can be controlled with the ``min_count`` parameter. For example, if
+        you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
+
+        >>> pd.Series([], dtype="float64").sum(min_count=1)
+        nan
+
+        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+        empty series identically.
+
+        >>> pd.Series([np.nan]).sum()
+        0.0
+
+        >>> pd.Series([np.nan]).sum(min_count=1)
+        nan
+        """
+        result = super().sum(
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="sum")
+        return result
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="prod")
+    def prod(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ) -> Series:
+        """
+        Return the product of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.prod with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer than
+            ``min_count`` non-NA values are present the result will be NA.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            The product of the values over the requested axis.
+
+        See Also
+        --------
+        Series.sum : Return the sum.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.sum : Return the sum over the requested axis.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        By default, the product of an empty or all-NA Series is ``1``
+
+        >>> pd.Series([], dtype="float64").prod()
+        1.0
+
+        This can be controlled with the ``min_count`` parameter
+
+        >>> pd.Series([], dtype="float64").prod(min_count=1)
+        nan
+
+        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+        empty series identically.
+
+        >>> pd.Series([np.nan]).prod()
+        1.0
+
+        >>> pd.Series([np.nan]).prod(min_count=1)
+        nan
+        """
+        result = super().prod(
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="prod")
+        return result
+
+    # error: Signature of "mean" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def mean(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def mean(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def mean(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="mean")
+    def mean(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return the mean of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Value containing the calculation referenced in the description.
+
+        See Also
+        --------
+        Series.sum : Return the sum.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.sum : Return the sum over the requested axis.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.mean()
+        2.0
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])
+        >>> df
+               a   b
+        tiger  1   2
+        zebra  2   3
+        >>> df.mean()
+        a   1.5
+        b   2.5
+        dtype: float64
+
+        Using axis=1
+
+        >>> df.mean(axis=1)
+        tiger   1.5
+        zebra   2.5
+        dtype: float64
+
+        In this case, `numeric_only` should be set to `True` to avoid
+        getting an error.
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])
+        >>> df.mean(numeric_only=True)
+        a   1.5
+        dtype: float64
+        """
+        result = super().mean(
+            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="mean")
+        return result
+
+    # error: Signature of "median" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def median(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def median(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def median(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(
+        Pandas4Warning, allowed_args=["self"], name="median"
+    )
+    def median(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return the median of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Value containing the calculation referenced in the description.
+
+        See Also
+        --------
+        Series.sum : Return the sum.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.sum : Return the sum over the requested axis.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.median()
+        2.0
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])
+        >>> df
+               a   b
+        tiger  1   2
+        zebra  2   3
+        >>> df.median()
+        a   1.5
+        b   2.5
+        dtype: float64
+
+        Using axis=1
+
+        >>> df.median(axis=1)
+        tiger   1.5
+        zebra   2.5
+        dtype: float64
+
+        In this case, `numeric_only` should be set to `True`
+        to avoid getting an error.
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])
+        >>> df.median(numeric_only=True)
+        a   1.5
+        dtype: float64
+        """
+        result = super().median(
+            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="median")
+        return result
+
+    # error: Signature of "sem" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def sem(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def sem(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def sem(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="sem")
+    def sem(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return unbiased standard error of the mean over requested axis.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.sem with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        **kwargs :
+            Additional keywords passed.
+
+        Returns
+        -------
+        Series or DataFrame (if level specified)
+            Unbiased standard error of the mean over requested axis.
+
+        See Also
+        --------
+        DataFrame.var : Return unbiased variance over requested axis.
+        DataFrame.std : Returns sample standard deviation over requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> round(s.sem(), 6)
+        0.57735
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])
+        >>> df
+               a   b
+        tiger  1   2
+        zebra  2   3
+        >>> df.sem()
+        a   0.5
+        b   0.5
+        dtype: float64
+
+        Using axis=1
+
+        >>> df.sem(axis=1)
+        tiger   0.5
+        zebra   0.5
+        dtype: float64
+
+        In this case, `numeric_only` should be set to `True`
+        to avoid getting an error.
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])
+        >>> df.sem(numeric_only=True)
+        a   0.5
+        dtype: float64
+        """
+        result = super().sem(
+            axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="sem")
+        return result
+
+    # error: Signature of "var" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def var(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def var(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def var(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="var")
+    def var(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return unbiased variance over requested axis.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.var with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        **kwargs :
+            Additional keywords passed.
+
+        Returns
+        -------
+        Series or scalaer
+            Unbiased variance over requested axis.
+
+        See Also
+        --------
+        numpy.var : Equivalent function in NumPy.
+        Series.var : Return unbiased variance over Series values.
+        Series.std : Return standard deviation over Series values.
+        DataFrame.std : Return standard deviation of the values over
+            the requested axis.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "person_id": [0, 1, 2, 3],
+        ...         "age": [21, 25, 62, 43],
+        ...         "height": [1.61, 1.87, 1.49, 2.01],
+        ...     }
+        ... ).set_index("person_id")
+        >>> df
+                   age  height
+        person_id
+        0           21    1.61
+        1           25    1.87
+        2           62    1.49
+        3           43    2.01
+
+        >>> df.var()
+        age       352.916667
+        height      0.056367
+        dtype: float64
+
+        Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
+
+        >>> df.var(ddof=0)
+        age       264.687500
+        height      0.042275
+        dtype: float64
+        """
+        result = super().var(
+            axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="var")
+        return result
+
+    # error: Signature of "std" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def std(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def std(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def std(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        ddof: int = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="std")
+    def std(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return sample standard deviation over requested axis.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.std with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        **kwargs : dict
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Standard deviation over requested axis.
+
+        See Also
+        --------
+        Series.std : Return standard deviation over Series values.
+        DataFrame.mean : Return the mean of the values over the requested axis.
+        DataFrame.median : Return the median of the values over the requested axis.
+        DataFrame.mode : Get the mode(s) of each element along the requested axis.
+        DataFrame.sum : Return the sum of the values over the requested axis.
+
+        Notes
+        -----
+        To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
+        default `ddof=1`)
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "person_id": [0, 1, 2, 3],
+        ...         "age": [21, 25, 62, 43],
+        ...         "height": [1.61, 1.87, 1.49, 2.01],
+        ...     }
+        ... ).set_index("person_id")
+        >>> df
+                   age  height
+        person_id
+        0           21    1.61
+        1           25    1.87
+        2           62    1.49
+        3           43    2.01
+
+        The standard deviation of the columns can be found as follows:
+
+        >>> df.std()
+        age       18.786076
+        height     0.237417
+        dtype: float64
+
+        Alternatively, `ddof=0` can be set to normalize by N instead of N-1:
+
+        >>> df.std(ddof=0)
+        age       16.269219
+        height     0.205609
+        dtype: float64
+        """
+        result = super().std(
+            axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="std")
+        return result
+
+    # error: Signature of "skew" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def skew(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def skew(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def skew(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="skew")
+    def skew(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return unbiased skew over requested axis.
+
+        Normalized by N-1.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Unbiased skew over requested axis.
+
+        See Also
+        --------
+        Dataframe.kurt : Returns unbiased kurtosis over requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.skew()
+        0.0
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame(
+        ...     {"a": [1, 2, 3], "b": [2, 3, 4], "c": [1, 3, 5]},
+        ...     index=["tiger", "zebra", "cow"],
+        ... )
+        >>> df
+                a   b   c
+        tiger   1   2   1
+        zebra   2   3   3
+        cow     3   4   5
+        >>> df.skew()
+        a   0.0
+        b   0.0
+        c   0.0
+        dtype: float64
+
+        Using axis=1
+
+        >>> df.skew(axis=1)
+        tiger   1.732051
+        zebra  -1.732051
+        cow     0.000000
+        dtype: float64
+
+        In this case, `numeric_only` should be set to `True` to avoid
+        getting an error.
+
+        >>> df = pd.DataFrame(
+        ...     {"a": [1, 2, 3], "b": ["T", "Z", "X"]}, index=["tiger", "zebra", "cow"]
+        ... )
+        >>> df.skew(numeric_only=True)
+        a   0.0
+        dtype: float64
+        """
+        result = super().skew(
+            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="skew")
+        return result
+
+    # error: Signature of "kurt" incompatible with supertype "NDFrame"
+    @overload  # type: ignore[override]
+    def kurt(
+        self,
+        *,
+        axis: Axis = ...,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series: ...
+
+    @overload
+    def kurt(
+        self,
+        *,
+        axis: None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Any: ...
+
+    @overload
+    def kurt(
+        self,
+        *,
+        axis: Axis | None,
+        skipna: bool = ...,
+        numeric_only: bool = ...,
+        **kwargs,
+    ) -> Series | Any: ...
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="kurt")
+    def kurt(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | Any:
+        """
+        Return unbiased kurtosis over requested axis.
+
+        Kurtosis obtained using Fisher's definition of
+        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+
+        Parameters
+        ----------
+        axis : {index (0), columns (1)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        Series or scalar
+            Unbiased kurtosis over requested axis.
+
+        See Also
+        --------
+        Dataframe.kurtosis : Returns unbiased kurtosis over requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"])
+        >>> s
+        cat    1
+        dog    2
+        dog    2
+        mouse  3
+        dtype: int64
+        >>> s.kurt()
+        1.5
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame(
+        ...     {"a": [1, 2, 2, 3], "b": [3, 4, 4, 4]},
+        ...     index=["cat", "dog", "dog", "mouse"],
+        ... )
+        >>> df
+               a   b
+          cat  1   3
+          dog  2   4
+          dog  2   4
+        mouse  3   4
+        >>> df.kurt()
+        a   1.5
+        b   4.0
+        dtype: float64
+
+        With axis=None
+
+        >>> df.kurt(axis=None)
+        -0.9886927196984727
+
+        Using axis=1
+
+        >>> df = pd.DataFrame(
+        ...     {"a": [1, 2], "b": [3, 4], "c": [3, 4], "d": [1, 2]},
+        ...     index=["cat", "dog"],
+        ... )
+        >>> df.kurt(axis=1)
+        cat   -6.0
+        dog   -6.0
+        dtype: float64
+        """
+        result = super().kurt(
+            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+        if isinstance(result, Series):
+            result = result.__finalize__(self, method="kurt")
+        return result
+
+    # error: Incompatible types in assignment
+    kurtosis = kurt  # type: ignore[assignment]
+    product = prod
+
+    def cummin(
+        self,
+        axis: Axis = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        *args,
+        **kwargs,
+    ) -> Self:
+        """
+        Return cumulative minimum over a DataFrame or Series axis.
+
+        Returns a DataFrame or Series of the same size containing the cumulative
+        minimum.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The index or the name of the axis. 0 is equivalent to None or 'index'.
+            For `Series` this parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return cumulative minimum of Series or DataFrame.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.min : Similar functionality
+            but ignores ``NaN`` values.
+        DataFrame.min : Return the minimum over
+            DataFrame axis.
+        DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+        DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+        DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+        DataFrame.cumprod : Return cumulative product over DataFrame axis.
+
+        Examples
+        --------
+        **Series**
+
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cummin()
+        0    2.0
+        1    NaN
+        2    2.0
+        3   -1.0
+        4   -1.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cummin(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+
+        **DataFrame**
+
+        >>> df = pd.DataFrame(
+        ...     [[2.0, 1.0], [3.0, np.nan], [1.0, 0.0]], columns=list("AB")
+        ... )
+        >>> df
+             A    B
+        0  2.0  1.0
+        1  3.0  NaN
+        2  1.0  0.0
+
+        By default, iterates over rows and finds the minimum
+        in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+        >>> df.cummin()
+             A    B
+        0  2.0  1.0
+        1  2.0  NaN
+        2  1.0  0.0
+
+        To iterate over columns and find the minimum in each row,
+        use ``axis=1``
+
+        >>> df.cummin(axis=1)
+             A    B
+        0  2.0  1.0
+        1  3.0  NaN
+        2  1.0  0.0
+        """
+        data = self._get_numeric_data() if numeric_only else self
+        return NDFrame.cummin(data, axis, skipna, *args, **kwargs)
+
+    def cummax(
+        self,
+        axis: Axis = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        *args,
+        **kwargs,
+    ) -> Self:
+        """
+        Return cumulative maximum over a DataFrame or Series axis.
+
+        Returns a DataFrame or Series of the same size containing the cumulative
+        maximum.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The index or the name of the axis. 0 is equivalent to None or 'index'.
+            For `Series` this parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return cumulative maximum of Series or DataFrame.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.max : Similar functionality
+            but ignores ``NaN`` values.
+        DataFrame.max : Return the maximum over
+            DataFrame axis.
+        DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+        DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+        DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+        DataFrame.cumprod : Return cumulative product over DataFrame axis.
+
+        Examples
+        --------
+        **Series**
+
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cummax()
+        0    2.0
+        1    NaN
+        2    5.0
+        3    5.0
+        4    5.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cummax(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+
+        **DataFrame**
+
+        >>> df = pd.DataFrame(
+        ...     [[2.0, 1.0], [3.0, np.nan], [1.0, 0.0]], columns=list("AB")
+        ... )
+        >>> df
+             A    B
+        0  2.0  1.0
+        1  3.0  NaN
+        2  1.0  0.0
+
+        By default, iterates over rows and finds the maximum
+        in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+        >>> df.cummax()
+             A    B
+        0  2.0  1.0
+        1  3.0  NaN
+        2  3.0  1.0
+
+        To iterate over columns and find the maximum in each row,
+        use ``axis=1``
+
+        >>> df.cummax(axis=1)
+             A    B
+        0  2.0  2.0
+        1  3.0  NaN
+        2  1.0  1.0
+        """
+        data = self._get_numeric_data() if numeric_only else self
+        return NDFrame.cummax(data, axis, skipna, *args, **kwargs)
+
+    def cumsum(
+        self,
+        axis: Axis = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        *args,
+        **kwargs,
+    ) -> Self:
+        """
+        Return cumulative sum over a DataFrame or Series axis.
+
+        Returns a DataFrame or Series of the same size containing the cumulative
+        sum.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The index or the name of the axis. 0 is equivalent to None or 'index'.
+            For `Series` this parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return cumulative sum of Series or DataFrame.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.sum : Similar functionality
+            but ignores ``NaN`` values.
+        DataFrame.sum : Return the sum over
+            DataFrame axis.
+        DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+        DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+        DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+        DataFrame.cumprod : Return cumulative product over DataFrame axis.
+
+        Examples
+        --------
+        **Series**
+
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cumsum()
+        0    2.0
+        1    NaN
+        2    7.0
+        3    6.0
+        4    6.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cumsum(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+
+        **DataFrame**
+
+        >>> df = pd.DataFrame(
+        ...     [[2.0, 1.0], [3.0, np.nan], [1.0, 0.0]], columns=list("AB")
+        ... )
+        >>> df
+             A    B
+        0  2.0  1.0
+        1  3.0  NaN
+        2  1.0  0.0
+
+        By default, iterates over rows and finds the sum
+        in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+        >>> df.cumsum()
+             A    B
+        0  2.0  1.0
+        1  5.0  NaN
+        2  6.0  1.0
+
+        To iterate over columns and find the sum in each row,
+        use ``axis=1``
+
+        >>> df.cumsum(axis=1)
+             A    B
+        0  2.0  3.0
+        1  3.0  NaN
+        2  1.0  1.0
+        """
+        data = self._get_numeric_data() if numeric_only else self
+        return NDFrame.cumsum(data, axis, skipna, *args, **kwargs)
+
+    def cumprod(
+        self,
+        axis: Axis = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        *args,
+        **kwargs,
+    ) -> Self:
+        """
+        Return cumulative product over a DataFrame or Series axis.
+
+        Returns a DataFrame or Series of the same size containing the cumulative
+        product.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The index or the name of the axis. 0 is equivalent to None or 'index'.
+            For `Series` this parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or DataFrame
+            Return cumulative product of Series or DataFrame.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.prod : Similar functionality
+            but ignores ``NaN`` values.
+        DataFrame.prod : Return the product over
+            DataFrame axis.
+        DataFrame.cummax : Return cumulative maximum over DataFrame axis.
+        DataFrame.cummin : Return cumulative minimum over DataFrame axis.
+        DataFrame.cumsum : Return cumulative sum over DataFrame axis.
+        DataFrame.cumprod : Return cumulative product over DataFrame axis.
+
+        Examples
+        --------
+        **Series**
+
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cumprod()
+        0     2.0
+        1     NaN
+        2    10.0
+        3   -10.0
+        4    -0.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cumprod(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+
+        **DataFrame**
+
+        >>> df = pd.DataFrame(
+        ...     [[2.0, 1.0], [3.0, np.nan], [1.0, 0.0]], columns=list("AB")
+        ... )
+        >>> df
+             A    B
+        0  2.0  1.0
+        1  3.0  NaN
+        2  1.0  0.0
+
+        By default, iterates over rows and finds the product
+        in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+        >>> df.cumprod()
+             A    B
+        0  2.0  1.0
+        1  6.0  NaN
+        2  6.0  0.0
+
+        To iterate over columns and find the product in each row,
+        use ``axis=1``
+
+        >>> df.cumprod(axis=1)
+             A    B
+        0  2.0  2.0
+        1  3.0  NaN
+        2  1.0  0.0
+        """
+        data = self._get_numeric_data() if numeric_only else self
+        return NDFrame.cumprod(data, axis, skipna, *args, **kwargs)
+
+    def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
+        """
+        Count number of distinct elements in specified axis.
+
+        Return Series with number of distinct elements. Can ignore NaN
+        values.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
+            column-wise.
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        Series
+            Series with counts of unique values per row or column, depending on `axis`.
+
+        See Also
+        --------
+        Series.nunique: Method nunique for Series.
+        DataFrame.count: Count non-NA cells for each column or row.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]})
+        >>> df.nunique()
+        A    3
+        B    2
+        dtype: int64
+
+        >>> df.nunique(axis=1)
+        0    1
+        1    2
+        2    2
+        dtype: int64
+        """
+        return self.apply(Series.nunique, axis=axis, dropna=dropna)
+
+    def idxmin(
+        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
+    ) -> Series:
+        """
+        Return index of first occurrence of minimum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {{0 or 'index', 1 or 'columns'}}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+        Returns
+        -------
+        Series
+            Indexes of minima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmin : Return index of the minimum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmin``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51         37.20
+        Wheat Products       103.11         19.66
+        Beef                  55.48       1712.00
+
+        By default, it returns the index for the minimum value in each column.
+
+        >>> df.idxmin()
+        consumption                Pork
+        co2_emissions    Wheat Products
+        dtype: str
+
+        To return the index for the minimum value in each row, use ``axis="columns"``.
+
+        >>> df.idxmin(axis="columns")
+        Pork                consumption
+        Wheat Products    co2_emissions
+        Beef                consumption
+        dtype: str
+        """
+        axis = self._get_axis_number(axis)
+
+        if self.empty and len(self.axes[axis]):
+            axis_dtype = self.axes[axis].dtype
+            return self._constructor_sliced(dtype=axis_dtype)
+
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
+
+        res = data._reduce(
+            nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
+        )
+        indices = res._values
+        # indices will always be np.ndarray since axis is not N
+
+        if (indices == -1).any():
+            if skipna:
+                msg = "Encountered all NA values"
+            else:
+                msg = "Encountered an NA values with skipna=False"
+            raise ValueError(msg)
+
+        index = data._get_axis(axis)
+        result = algorithms.take(
+            index._values, indices, allow_fill=True, fill_value=index._na_value
+        )
+        final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
+        return final_result.__finalize__(self, method="idxmin")
+
+    def idxmax(
+        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
+    ) -> Series:
+        """
+        Return index of first occurrence of maximum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {{0 or 'index', 1 or 'columns'}}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+        Returns
+        -------
+        Series
+            Indexes of maxima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmax : Return index of the maximum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmax``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51         37.20
+        Wheat Products       103.11         19.66
+        Beef                  55.48       1712.00
+
+        By default, it returns the index for the maximum value in each column.
+
+        >>> df.idxmax()
+        consumption      Wheat Products
+        co2_emissions              Beef
+        dtype: str
+
+        To return the index for the maximum value in each row, use ``axis="columns"``.
+
+        >>> df.idxmax(axis="columns")
+        Pork              co2_emissions
+        Wheat Products     consumption
+        Beef              co2_emissions
+        dtype: str
+        """
+        axis = self._get_axis_number(axis)
+
+        if self.empty and len(self.axes[axis]):
+            axis_dtype = self.axes[axis].dtype
+            return self._constructor_sliced(dtype=axis_dtype)
+
+        if numeric_only:
+            data = self._get_numeric_data()
+        else:
+            data = self
+
+        res = data._reduce(
+            nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
+        )
+        indices = res._values
+        # indices will always be 1d array since axis is not None
+
+        if (indices == -1).any():
+            if skipna:
+                msg = "Encountered all NA values"
+            else:
+                msg = "Encountered an NA values with skipna=False"
+            raise ValueError(msg)
+
+        index = data._get_axis(axis)
+        result = algorithms.take(
+            index._values, indices, allow_fill=True, fill_value=index._na_value
+        )
+        final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
+        return final_result.__finalize__(self, method="idxmax")
+
+    def _get_agg_axis(self, axis_num: int) -> Index:
+        """
+        Let's be explicit about this.
+        """
+        if axis_num == 0:
+            return self.columns
+        elif axis_num == 1:
+            return self.index
+        else:
+            raise ValueError(f"Axis must be 0 or 1 (got {axis_num!r})")
+
+    def mode(
+        self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
+    ) -> DataFrame:
+        """
+        Get the mode(s) of each element along the selected axis.
+
+        The mode of a set of values is the value that appears most often.
+        It can be multiple values.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to iterate over while searching for the mode:
+
+            * 0 or 'index' : get mode of each column
+            * 1 or 'columns' : get mode of each row.
+
+        numeric_only : bool, default False
+            If True, only apply to numeric columns.
+        dropna : bool, default True
+            Don't consider counts of NaN/NaT.
+
+        Returns
+        -------
+        DataFrame
+            The modes of each column or row.
+
+        See Also
+        --------
+        Series.mode : Return the highest frequency value in a Series.
+        Series.value_counts : Return the counts of values in a Series.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         ("bird", 2, 2),
+        ...         ("mammal", 4, np.nan),
+        ...         ("arthropod", 8, 0),
+        ...         ("bird", 2, np.nan),
+        ...     ],
+        ...     index=("falcon", "horse", "spider", "ostrich"),
+        ...     columns=("species", "legs", "wings"),
+        ... )
+        >>> df
+                   species  legs  wings
+        falcon        bird     2    2.0
+        horse       mammal     4    NaN
+        spider   arthropod     8    0.0
+        ostrich       bird     2    NaN
+
+        By default, missing values are not considered, and the mode of wings
+        are both 0 and 2. Because the resulting DataFrame has two rows,
+        the second row of ``species`` and ``legs`` contains ``NaN``.
+
+        >>> df.mode()
+          species  legs  wings
+        0    bird   2.0    0.0
+        1     NaN   NaN    2.0
+
+        Setting ``dropna=False`` ``NaN`` values are considered and they can be
+        the mode (like for wings).
+
+        >>> df.mode(dropna=False)
+          species  legs  wings
+        0    bird     2    NaN
+
+        Setting ``numeric_only=True``, only the mode of numeric columns is
+        computed, and columns of other types are ignored.
+
+        >>> df.mode(numeric_only=True)
+           legs  wings
+        0   2.0    0.0
+        1   NaN    2.0
+
+        To compute the mode over columns and not rows, use the axis parameter:
+
+        >>> df.mode(axis="columns", numeric_only=True)
+                   0    1
+        falcon   2.0  NaN
+        horse    4.0  NaN
+        spider   0.0  8.0
+        ostrich  2.0  NaN
+        """
+        data = self if not numeric_only else self._get_numeric_data()
+
+        def f(s):
+            return s.mode(dropna=dropna)
+
+        data = data.apply(f, axis=axis)
+        # Ensure index is type stable (should always use int index)
+        if data.empty:
+            data.index = default_index(0)
+
+        return data
+
+    @overload
+    def quantile(
+        self,
+        q: float = ...,
+        axis: Axis = ...,
+        numeric_only: bool = ...,
+        interpolation: QuantileInterpolation = ...,
+        method: Literal["single", "table"] = ...,
+    ) -> Series: ...
+
+    @overload
+    def quantile(
+        self,
+        q: AnyArrayLike | Sequence[float],
+        axis: Axis = ...,
+        numeric_only: bool = ...,
+        interpolation: QuantileInterpolation = ...,
+        method: Literal["single", "table"] = ...,
+    ) -> Series | DataFrame: ...
+
+    @overload
+    def quantile(
+        self,
+        q: float | AnyArrayLike | Sequence[float] = ...,
+        axis: Axis = ...,
+        numeric_only: bool = ...,
+        interpolation: QuantileInterpolation = ...,
+        method: Literal["single", "table"] = ...,
+    ) -> Series | DataFrame: ...
+
+    def quantile(
+        self,
+        q: float | AnyArrayLike | Sequence[float] = 0.5,
+        axis: Axis = 0,
+        numeric_only: bool = False,
+        interpolation: QuantileInterpolation = "linear",
+        method: Literal["single", "table"] = "single",
+    ) -> Series | DataFrame:
+        """
+        Return values at the given quantile over requested axis.
+
+        Parameters
+        ----------
+        q : float or array-like, default 0.5 (50% quantile)
+            Value between 0 <= q <= 1, the quantile(s) to compute.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+                The default value of ``numeric_only`` is now ``False``.
+
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            This optional parameter specifies the interpolation method to use,
+            when the desired quantile lies between two data points `i` and `j`:
+
+            * linear: `i + (j - i) * fraction`, where `fraction` is the
+              fractional part of the index surrounded by `i` and `j`.
+            * lower: `i`.
+            * higher: `j`.
+            * nearest: `i` or `j` whichever is nearest.
+            * midpoint: (`i` + `j`) / 2.
+        method : {'single', 'table'}, default 'single'
+            Whether to compute quantiles per-column ('single') or over all columns
+            ('table'). When 'table', the only allowed interpolation methods are
+            'nearest', 'lower', and 'higher'.
+
+        Returns
+        -------
+        Series or DataFrame
+
+            If ``q`` is an array, a DataFrame will be returned where the
+              index is ``q``, the columns are the columns of self, and the
+              values are the quantiles.
+            If ``q`` is a float, a Series will be returned where the
+              index is the columns of self and the values are the quantiles.
+
+        See Also
+        --------
+        core.window.rolling.Rolling.quantile: Rolling quantile.
+        numpy.percentile: Numpy function to compute the percentile.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=["a", "b"]
+        ... )
+        >>> df.quantile(0.1)
+        a    1.3
+        b    3.7
+        Name: 0.1, dtype: float64
+        >>> df.quantile([0.1, 0.5])
+               a     b
+        0.1  1.3   3.7
+        0.5  2.5  55.0
+
+        Specifying `method='table'` will compute the quantile over all columns.
+
+        >>> df.quantile(0.1, method="table", interpolation="nearest")
+        a    1
+        b    1
+        Name: 0.1, dtype: int64
+        >>> df.quantile([0.1, 0.5], method="table", interpolation="nearest")
+             a    b
+        0.1  1    1
+        0.5  3  100
+
+        Specifying `numeric_only=False` will compute the quantiles for all
+        columns.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": [1, 2],
+        ...         "B": [pd.Timestamp("2010"), pd.Timestamp("2011")],
+        ...         "C": [pd.Timedelta("1 days"), pd.Timedelta("2 days")],
+        ...     }
+        ... )
+        >>> df.quantile(0.5, numeric_only=False)
+        A                    1.5
+        B    2010-07-02 12:00:00
+        C        1 days 12:00:00
+        Name: 0.5, dtype: object
+        """
+        validate_percentile(q)
+        axis = self._get_axis_number(axis)
+
+        if not is_list_like(q):
+            # BlockManager.quantile expects listlike, so we wrap and unwrap here
+            # error: List item 0 has incompatible type "float | ExtensionArray |
+            # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
+            res_df = self.quantile(
+                [q],  # type: ignore[list-item]
+                axis=axis,
+                numeric_only=numeric_only,
+                interpolation=interpolation,
+                method=method,
+            )
+            if method == "single":
+                res = res_df.iloc[0]
+            else:
+                # cannot directly iloc over sparse arrays
+                res = res_df.T.iloc[:, 0]
+            if axis == 1 and len(self) == 0:
+                # GH#41544 try to get an appropriate dtype
+                dtype = find_common_type(list(self.dtypes))
+                if needs_i8_conversion(dtype):
+                    return res.astype(dtype)
+            return res
+
+        q = Index(q, dtype=np.float64)
+        data = self._get_numeric_data() if numeric_only else self
+
+        if axis == 1:
+            data = data.T
+
+        if len(data.columns) == 0:
+            # GH#23925 _get_numeric_data may have dropped all columns
+            cols = self.columns[:0]
+
+            dtype = np.float64
+            if axis == 1:
+                # GH#41544 try to get an appropriate dtype
+                cdtype = find_common_type(list(self.dtypes))
+                if needs_i8_conversion(cdtype):
+                    dtype = cdtype
+
+            res = self._constructor([], index=q, columns=cols, dtype=dtype)
+            return res.__finalize__(self, method="quantile")
+
+        valid_method = {"single", "table"}
+        if method not in valid_method:
+            raise ValueError(
+                f"Invalid method: {method}. Method must be in {valid_method}."
+            )
+        if method == "single":
+            res = data._mgr.quantile(qs=q, interpolation=interpolation)
+        elif method == "table":
+            valid_interpolation = {"nearest", "lower", "higher"}
+            if interpolation not in valid_interpolation:
+                raise ValueError(
+                    f"Invalid interpolation: {interpolation}. "
+                    f"Interpolation must be in {valid_interpolation}"
+                )
+            # handle degenerate case
+            if len(data) == 0:
+                if data.ndim == 2:
+                    dtype = find_common_type(list(self.dtypes))
+                else:
+                    dtype = self.dtype
+                return self._constructor([], index=q, columns=data.columns, dtype=dtype)
+
+            q_idx = np.quantile(np.arange(len(data)), q, method=interpolation)
+
+            by = data.columns
+            if len(by) > 1:
+                keys = [data._get_label_or_level_values(x) for x in by]
+                indexer = lexsort_indexer(keys)
+            else:
+                k = data._get_label_or_level_values(by[0])
+                indexer = nargsort(k)
+
+            res = data._mgr.take(indexer[q_idx], verify=False)
+            res.axes[1] = q
+
+        result = self._constructor_from_mgr(res, axes=res.axes)
+        return result.__finalize__(self, method="quantile")
+
+    def to_timestamp(
+        self,
+        freq: Frequency | None = None,
+        how: ToTimestampHow = "start",
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> DataFrame:
+        """
+        Cast PeriodIndex to DatetimeIndex of timestamps, at *beginning* of period.
+
+        This can be changed to the *end* of the period, by specifying `how="e"`.
+
+        Parameters
+        ----------
+        freq : str, default frequency of PeriodIndex
+            Desired frequency.
+        how : {'s', 'e', 'start', 'end'}
+            Convention for converting period to timestamp; start of period
+            vs. end.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to convert (the index by default).
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        DataFrame with DatetimeIndex
+            DataFrame with the PeriodIndex cast to DatetimeIndex.
+
+        See Also
+        --------
+        DataFrame.to_period: Inverse method to cast DatetimeIndex to PeriodIndex.
+        Series.to_timestamp: Equivalent method for Series.
+
+        Examples
+        --------
+        >>> idx = pd.PeriodIndex(["2023", "2024"], freq="Y")
+        >>> d = {"col1": [1, 2], "col2": [3, 4]}
+        >>> df1 = pd.DataFrame(data=d, index=idx)
+        >>> df1
+              col1   col2
+        2023     1      3
+        2024	 2      4
+
+        The resulting timestamps will be at the beginning of the year in this case
+
+        >>> df1 = df1.to_timestamp()
+        >>> df1
+                    col1   col2
+        2023-01-01     1      3
+        2024-01-01     2      4
+        >>> df1.index
+        DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[us]', freq=None)
+
+        Using `freq` which is the offset that the Timestamps will have
+
+        >>> df2 = pd.DataFrame(data=d, index=idx)
+        >>> df2 = df2.to_timestamp(freq="M")
+        >>> df2
+                    col1   col2
+        2023-01-31     1      3
+        2024-01-31     2      4
+        >>> df2.index
+        DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[us]', freq=None)
+        """
+        self._check_copy_deprecation(copy)
+        new_obj = self.copy(deep=False)
+
+        axis_name = self._get_axis_name(axis)
+        old_ax = getattr(self, axis_name)
+        if not isinstance(old_ax, PeriodIndex):
+            raise TypeError(f"unsupported Type {type(old_ax).__name__}")
+
+        new_ax = old_ax.to_timestamp(freq=freq, how=how)
+
+        setattr(new_obj, axis_name, new_ax)
+        return new_obj
+
+    def to_period(
+        self,
+        freq: Frequency | None = None,
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> DataFrame:
+        """
+        Convert DataFrame from DatetimeIndex to PeriodIndex.
+
+        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
+        frequency (inferred from index if not passed). Either index of columns can be
+        converted, depending on `axis` argument.
+
+        Parameters
+        ----------
+        freq : str, default
+            Frequency of the PeriodIndex.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to convert (the index by default).
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        DataFrame
+            The DataFrame with the converted PeriodIndex.
+
+        See Also
+        --------
+        Series.to_period: Equivalent method for Series.
+        Series.dt.to_period: Convert DateTime column values.
+
+        Examples
+        --------
+        >>> idx = pd.to_datetime(
+        ...     [
+        ...         "2001-03-31 00:00:00",
+        ...         "2002-05-31 00:00:00",
+        ...         "2003-08-31 00:00:00",
+        ...     ]
+        ... )
+
+        >>> idx
+        DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
+                      dtype='datetime64[us]', freq=None)
+
+        >>> idx.to_period("M")
+        PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
+
+        For the yearly frequency
+
+        >>> idx.to_period("Y")
+        PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]')
+        """
+        self._check_copy_deprecation(copy)
+        new_obj = self.copy(deep=False)
+
+        axis_name = self._get_axis_name(axis)
+        old_ax = getattr(self, axis_name)
+        if not isinstance(old_ax, DatetimeIndex):
+            raise TypeError(f"unsupported Type {type(old_ax).__name__}")
+
+        new_ax = old_ax.to_period(freq=freq)
+
+        setattr(new_obj, axis_name, new_ax)
+        return new_obj
+
+    def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
+        """
+        Whether each element in the DataFrame is contained in values.
+
+        Parameters
+        ----------
+        values : iterable, Series, DataFrame or dict
+            The result will only be true at a location if all the
+            labels match. If `values` is a Series, that's the index. If
+            `values` is a dict, the keys must be the column names,
+            which must match. If `values` is a DataFrame,
+            then both the index and column labels must match.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame of booleans showing whether each element in the DataFrame
+            is contained in values.
+
+        See Also
+        --------
+        DataFrame.eq: Equality test for DataFrame.
+        Series.isin: Equivalent method on Series.
+        Series.str.contains: Test if pattern or regex is contained within a
+            string of a Series or Index.
+
+        Notes
+        -----
+            ``__iter__`` is used (and not ``__contains__``) to iterate over values
+            when checking if it contains the elements in DataFrame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"num_legs": [2, 4], "num_wings": [2, 0]}, index=["falcon", "dog"]
+        ... )
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+
+        When ``values`` is a list check whether every value in the DataFrame
+        is present in the list (which animals have 0 or 2 legs or wings)
+
+        >>> df.isin([0, 2])
+                num_legs  num_wings
+        falcon      True       True
+        dog        False       True
+
+        To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
+
+        >>> ~df.isin([0, 2])
+                num_legs  num_wings
+        falcon     False      False
+        dog         True      False
+
+        When ``values`` is a dict, we can pass values to check for each
+        column separately:
+
+        >>> df.isin({"num_wings": [0, 3]})
+                num_legs  num_wings
+        falcon     False      False
+        dog        False       True
+
+        When ``values`` is a Series or DataFrame the index and column must
+        match. Note that 'falcon' does not match based on the number of legs
+        in other.
+
+        >>> other = pd.DataFrame(
+        ...     {"num_legs": [8, 3], "num_wings": [0, 2]}, index=["spider", "falcon"]
+        ... )
+        >>> df.isin(other)
+                num_legs  num_wings
+        falcon     False       True
+        dog        False      False
+        """
+        if isinstance(values, dict):
+            from pandas.core.reshape.concat import concat
+
+            values = collections.defaultdict(list, values)
+            result = concat(
+                (
+                    self.iloc[:, [i]].isin(values[col])
+                    for i, col in enumerate(self.columns)
+                ),
+                axis=1,
+            )
+        elif isinstance(values, Series):
+            if not values.index.is_unique:
+                raise ValueError("cannot compute isin with a duplicate axis.")
+            result = self.eq(values.reindex_like(self), axis="index")
+        elif isinstance(values, DataFrame):
+            if not (values.columns.is_unique and values.index.is_unique):
+                raise ValueError("cannot compute isin with a duplicate axis.")
+            result = self.eq(values.reindex_like(self))
+        else:
+            if not is_list_like(values):
+                raise TypeError(
+                    "only list-like or dict-like objects are allowed "
+                    "to be passed to DataFrame.isin(), "
+                    f"you passed a '{type(values).__name__}'"
+                )
+
+            def isin_(x):
+                # error: Argument 2 to "isin" has incompatible type "Union[Series,
+                # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected
+                # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index,
+                # Series], List[Any], range]"
+                result = algorithms.isin(
+                    x.ravel(),
+                    values,  # type: ignore[arg-type]
+                )
+                return result.reshape(x.shape)
+
+            res_mgr = self._mgr.apply(isin_)
+            result = self._constructor_from_mgr(
+                res_mgr,
+                axes=res_mgr.axes,
+            )
+        return result.__finalize__(self, method="isin")
+
+    # ----------------------------------------------------------------------
+    # Add index and columns
+    _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
+    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
+        **NDFrame._AXIS_TO_AXIS_NUMBER,
+        1: 1,
+        "columns": 1,
+    }
+    _AXIS_LEN = len(_AXIS_ORDERS)
+    _info_axis_number: Literal[1] = 1
+    _info_axis_name: Literal["columns"] = "columns"
+
+    index = properties.AxisProperty(
+        axis=1,
+        doc="""
+        The index (row labels) of the DataFrame.
+
+        The index of a DataFrame is a series of labels that identify each row.
+        The labels can be integers, strings, or any other hashable type. The index
+        is used for label-based access and alignment, and can be accessed or
+        modified using this attribute.
+
+        Returns
+        -------
+        pandas.Index
+            The index labels of the DataFrame.
+
+        See Also
+        --------
+        DataFrame.columns : The column labels of the DataFrame.
+        DataFrame.to_numpy : Convert the DataFrame to a NumPy array.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
+        ...                    'Age': [25, 30, 35],
+        ...                    'Location': ['Seattle', 'New York', 'Kona']},
+        ...                   index=([10, 20, 30]))
+        >>> df.index
+        Index([10, 20, 30], dtype='int64')
+
+        In this example, we create a DataFrame with 3 rows and 3 columns,
+        including Name, Age, and Location information. We set the index labels to
+        be the integers 10, 20, and 30. We then access the `index` attribute of the
+        DataFrame, which returns an `Index` object containing the index labels.
+
+        >>> df.index = [100, 200, 300]
+        >>> df
+            Name  Age Location
+        100  Alice   25  Seattle
+        200    Bob   30 New York
+        300  Aritra  35    Kona
+
+        In this example, we modify the index labels of the DataFrame by assigning
+        a new list of labels to the `index` attribute. The DataFrame is then
+        updated with the new labels, and the output shows the modified DataFrame.
+        """,
+    )
+    columns = properties.AxisProperty(
+        axis=0,
+        doc="""
+        The column labels of the DataFrame.
+
+        This property holds the column names as a pandas ``Index`` object.
+        It provides an immutable sequence of column labels that can be
+        used for data selection, renaming, and alignment in DataFrame operations.
+
+        Returns
+        -------
+        pandas.Index
+            The column labels of the DataFrame.
+
+        See Also
+        --------
+        DataFrame.index: The index (row labels) of the DataFrame.
+        DataFrame.axes: Return a list representing the axes of the DataFrame.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
+        >>> df
+                A  B
+        0    1  3
+        1    2  4
+        >>> df.columns
+        Index(['A', 'B'], dtype='str')
+        """,
+    )
+
+    # ----------------------------------------------------------------------
+    # Add plotting methods to DataFrame
+    plot = Accessor("plot", pandas.plotting.PlotAccessor)
+    hist = pandas.plotting.hist_frame
+    boxplot = pandas.plotting.boxplot_frame
+    sparse = Accessor("sparse", SparseFrameAccessor)
+
+    # ----------------------------------------------------------------------
+    # Internal Interface Methods
+
+    def _to_dict_of_blocks(self):
+        """
+        Return a dict of dtype -> Constructor Types that
+        each is a homogeneous dtype.
+
+        Internal ONLY.
+        """
+        mgr = self._mgr
+        return {
+            k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
+            for k, v in mgr.to_iter_dict()
+        }
+
+    @property
+    def values(self) -> np.ndarray:
+        """
+        Return a Numpy representation of the DataFrame.
+
+        .. warning::
+
+           We recommend using :meth:`DataFrame.to_numpy` instead.
+
+        Only the values in the DataFrame will be returned, the axes labels
+        will be removed.
+
+        Returns
+        -------
+        numpy.ndarray
+            The values of the DataFrame.
+
+        See Also
+        --------
+        DataFrame.to_numpy : Recommended alternative to this method.
+        DataFrame.index : Retrieve the index labels.
+        DataFrame.columns : Retrieving the column names.
+
+        Notes
+        -----
+        The dtype will be a lower-common-denominator dtype (implicit
+        upcasting); that is to say if the dtypes (even of numeric types)
+        are mixed, the one that accommodates all will be chosen. Use this
+        with care if you are not dealing with the blocks.
+
+        e.g. If the dtypes are float16 and float32, dtype will be upcast to
+        float32.  If dtypes are int32 and uint8, dtype will be upcast to
+        int32. By :func:`numpy.find_common_type` convention, mixing int64
+        and uint64 will result in a float64 dtype.
+
+        Examples
+        --------
+        A DataFrame where all columns are the same type (e.g., int64) results
+        in an array of the same type.
+
+        >>> df = pd.DataFrame(
+        ...     {"age": [3, 29], "height": [94, 170], "weight": [31, 115]}
+        ... )
+        >>> df
+           age  height  weight
+        0    3      94      31
+        1   29     170     115
+        >>> df.dtypes
+        age       int64
+        height    int64
+        weight    int64
+        dtype: object
+        >>> df.values
+        array([[  3,  94,  31],
+               [ 29, 170, 115]])
+
+        A DataFrame with mixed type columns(e.g., str/object, int64, float32)
+        results in an ndarray of the broadest type that accommodates these
+        mixed types (e.g., object).
+
+        >>> df2 = pd.DataFrame(
+        ...     [
+        ...         ("parrot", 24.0, "second"),
+        ...         ("lion", 80.5, 1),
+        ...         ("monkey", np.nan, None),
+        ...     ],
+        ...     columns=("name", "max_speed", "rank"),
+        ... )
+        >>> df2.dtypes
+        name             str
+        max_speed    float64
+        rank          object
+        dtype: object
+        >>> df2.values
+        array([['parrot', 24.0, 'second'],
+               ['lion', 80.5, 1],
+               ['monkey', nan, None]], dtype=object)
+        """
+        return self._mgr.as_array()
+
+
+def _from_nested_dict(
+    data: Mapping[HashableT, Mapping[HashableT2, T]],
+) -> collections.defaultdict[HashableT2, dict[HashableT, T]]:
+    new_data: collections.defaultdict[HashableT2, dict[HashableT, T]] = (
+        collections.defaultdict(dict)
+    )
+    for index, s in data.items():
+        for col, v in s.items():
+            new_data[col][index] = v
+    return new_data
+
+
+def _reindex_for_setitem(
+    value: DataFrame | Series, index: Index
+) -> tuple[ArrayLike, BlockValuesRefs | None]:
+    # reindex if necessary
+
+    if value.index.equals(index) or not len(index):
+        if isinstance(value, Series):
+            return value._values, value._references
+        return value._values.copy(), None
+
+    # GH#4107
+    try:
+        reindexed_value = value.reindex(index)._values
+    except ValueError as err:
+        # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
+        if not value.index.is_unique:
+            # duplicate axis
+            raise err
+
+        raise TypeError(
+            "incompatible index of inserted column with frame index"
+        ) from err
+    return reindexed_value, None
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a861ca8aeed7fae6eca5d772c53c91b4804b19d
--- /dev/null
+++ b/pandas/core/generic.py
@@ -0,0 +1,13769 @@
+# pyright: reportPropertyTypeMismatch=false
+from __future__ import annotations
+
+import collections
+from copy import deepcopy
+import datetime as dt
+from functools import partial
+from json import loads
+import operator
+import pickle
+import re
+import sys
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Concatenate,
+    Literal,
+    NoReturn,
+    Self,
+    cast,
+    final,
+    overload,
+)
+import warnings
+
+import numpy as np
+
+from pandas._config import config
+
+from pandas._libs import lib
+from pandas._libs.lib import is_range_indexer
+from pandas._libs.tslibs import (
+    Period,
+    Timestamp,
+    to_offset,
+)
+from pandas._typing import (
+    AlignJoin,
+    AnyArrayLike,
+    ArrayLike,
+    Axes,
+    Axis,
+    AxisInt,
+    CompressionOptions,
+    DtypeArg,
+    DtypeBackend,
+    DtypeObj,
+    FilePath,
+    FillnaOptions,
+    FloatFormatType,
+    FormattersType,
+    Frequency,
+    IgnoreRaise,
+    IndexKeyFunc,
+    IndexLabel,
+    InterpolateOptions,
+    IntervalClosedType,
+    JSONSerializable,
+    Level,
+    ListLike,
+    Manager,
+    NaPosition,
+    NDFrameT,
+    OpenFileErrors,
+    RandomState,
+    ReindexMethod,
+    Renamer,
+    Scalar,
+    SequenceNotStr,
+    SortKind,
+    StorageOptions,
+    Suffixes,
+    T,
+    TimeAmbiguous,
+    TimedeltaConvertibleTypes,
+    TimeNonexistent,
+    TimestampConvertibleTypes,
+    TimeUnit,
+    ValueKeyFunc,
+    WriteBuffer,
+    WriteExcelBuffer,
+    npt,
+)
+from pandas.compat import CHAINED_WARNING_DISABLED
+from pandas.compat._constants import (
+    REF_COUNT_METHOD,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.compat.numpy import function as nv
+from pandas.errors import (
+    AbstractMethodError,
+    ChainedAssignmentError,
+    InvalidIndexError,
+    Pandas4Warning,
+)
+from pandas.errors.cow import _chained_assignment_method_msg
+from pandas.util._decorators import (
+    deprecate_kwarg,
+    doc,
+)
+from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import (
+    check_dtype_backend,
+    validate_ascending,
+    validate_bool_kwarg,
+    validate_inclusive,
+)
+
+from pandas.core.dtypes.astype import astype_is_view
+from pandas.core.dtypes.cast import can_hold_element
+from pandas.core.dtypes.common import (
+    ensure_object,
+    ensure_platform_int,
+    ensure_str,
+    is_bool,
+    is_bool_dtype,
+    is_dict_like,
+    is_extension_array_dtype,
+    is_list_like,
+    is_number,
+    is_numeric_dtype,
+    is_re_compilable,
+    is_scalar,
+    pandas_dtype,
+)
+from pandas.core.dtypes.dtypes import (
+    DatetimeTZDtype,
+    ExtensionDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+from pandas.core.dtypes.inference import (
+    is_hashable,
+    is_nested_list_like,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    notna,
+)
+
+from pandas.core import (
+    algorithms as algos,
+    arraylike,
+    common,
+    indexing,
+    missing,
+    nanops,
+    sample,
+)
+from pandas.core.array_algos.replace import should_use_regex
+from pandas.core.arrays import ExtensionArray
+from pandas.core.base import PandasObject
+from pandas.core.construction import extract_array
+from pandas.core.flags import Flags
+from pandas.core.indexes.api import (
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    PeriodIndex,
+    default_index,
+    ensure_index,
+)
+from pandas.core.internals import BlockManager
+from pandas.core.methods.describe import describe_ndframe
+from pandas.core.missing import (
+    clean_fill_method,
+    clean_reindex_fill_method,
+    find_valid_index,
+)
+from pandas.core.reshape.concat import concat
+from pandas.core.shared_docs import _shared_docs
+from pandas.core.sorting import get_indexer_indexer
+from pandas.core.window import (
+    Expanding,
+    ExponentialMovingWindow,
+    Rolling,
+    Window,
+)
+
+from pandas.io.formats.format import (
+    DataFrameFormatter,
+    DataFrameRenderer,
+)
+from pandas.io.formats.printing import pprint_thing
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+        Iterator,
+        Mapping,
+        Sequence,
+    )
+
+    from pandas._libs.tslibs import BaseOffset
+    from pandas._typing import P
+
+    from pandas import (
+        DataFrame,
+        ExcelWriter,
+        HDFStore,
+        Series,
+    )
+    from pandas.core.indexers.objects import BaseIndexer
+    from pandas.core.resample import Resampler
+
+
+# goal is to be able to define the docs close to function, while still being
+# able to share
+_shared_docs = {**_shared_docs}
+_shared_doc_kwargs = {
+    "axes": "keywords for axes",
+    "klass": "Series/DataFrame",
+    "axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame",  # noqa: E501
+    "inplace": """
+    inplace : bool, default False
+        If True, performs operation inplace.""",
+    "optional_by": """
+        by : str or list of str
+            Name or list of names to sort by""",
+}
+
+
+class NDFrame(PandasObject, indexing.IndexingMixin):
+    """
+    N-dimensional analogue of DataFrame. Store multi-dimensional in a
+    size-mutable, labeled data structure
+
+    Parameters
+    ----------
+    data : BlockManager
+    axes : list
+    copy : bool, default False
+    """
+
+    _internal_names: list[str] = [
+        "_mgr",
+        "_cache",
+        "_name",
+        "_metadata",
+        "_flags",
+    ]
+    _internal_names_set: set[str] = set(_internal_names)
+    _accessors: set[str] = set()
+    _hidden_attrs: frozenset[str] = frozenset([])
+    _metadata: list[str] = []
+    _mgr: Manager
+    _attrs: dict[Hashable, Any]
+    _typ: str
+
+    # ----------------------------------------------------------------------
+    # Constructors
+
+    def __init__(self, data: Manager) -> None:
+        object.__setattr__(self, "_mgr", data)
+        object.__setattr__(self, "_attrs", {})
+        object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
+
+    @final
+    @classmethod
+    def _init_mgr(
+        cls,
+        mgr: Manager,
+        axes: dict[Literal["index", "columns"], Axes | None],
+        dtype: DtypeObj | None = None,
+        copy: bool = False,
+    ) -> Manager:
+        """passed a manager and a axes dict"""
+        for a, axe in axes.items():
+            if axe is not None:
+                axe = ensure_index(axe)
+                bm_axis = cls._get_block_manager_axis(a)
+                mgr = mgr.reindex_axis(axe, axis=bm_axis)
+
+        # make a copy if explicitly requested
+        if copy:
+            mgr = mgr.copy(deep=True)
+        if dtype is not None:
+            # avoid further copies if we can
+            if (
+                isinstance(mgr, BlockManager)
+                and len(mgr.blocks) == 1
+                and mgr.blocks[0].values.dtype == dtype
+            ):
+                pass
+            else:
+                mgr = mgr.astype(dtype=dtype)
+        return mgr
+
+    @final
+    @classmethod
+    def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
+        """
+        Construct a new object of this type from a Manager object and axes.
+
+        Parameters
+        ----------
+        mgr : Manager
+            Must have the same ndim as cls.
+        axes : list[Index]
+
+        Notes
+        -----
+        The axes must match mgr.axes, but are required for future-proofing
+        in the event that axes are refactored out of the Manager objects.
+        """
+        obj = cls.__new__(cls)
+        NDFrame.__init__(obj, mgr)
+        return obj
+
+    # ----------------------------------------------------------------------
+    # attrs and flags
+
+    @property
+    def attrs(self) -> dict[Hashable, Any]:
+        """
+        Dictionary of global attributes of this dataset.
+
+        .. warning::
+
+           attrs is experimental and may change without warning.
+
+        See Also
+        --------
+        DataFrame.flags : Global flags applying to this object.
+
+        Notes
+        -----
+        Many operations that create new datasets will copy ``attrs``. Copies
+        are always deep so that changing ``attrs`` will only affect the
+        present dataset. :func:`pandas.concat` and :func:`pandas.merge` will
+        only copy ``attrs`` if all input datasets have the same ``attrs``.
+
+        Examples
+        --------
+        For Series:
+
+        >>> ser = pd.Series([1, 2, 3])
+        >>> ser.attrs = {"A": [10, 20, 30]}
+        >>> ser.attrs
+        {'A': [10, 20, 30]}
+
+        For DataFrame:
+
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        >>> df.attrs = {"A": [10, 20, 30]}
+        >>> df.attrs
+        {'A': [10, 20, 30]}
+        """
+        return self._attrs
+
+    @attrs.setter
+    def attrs(self, value: Mapping[Hashable, Any]) -> None:
+        self._attrs = dict(value)
+
+    @final
+    @property
+    def flags(self) -> Flags:
+        """
+        Get the properties associated with this pandas object.
+
+        The available flags are
+
+        * :attr:`Flags.allows_duplicate_labels`
+
+        See Also
+        --------
+        Flags : Flags that apply to pandas objects.
+        DataFrame.attrs : Global metadata applying to this dataset.
+
+        Notes
+        -----
+        "Flags" differ from "metadata". Flags reflect properties of the
+        pandas object (the Series or DataFrame). Metadata refer to properties
+        of the dataset, and should be stored in :attr:`DataFrame.attrs`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2]})
+        >>> df.flags
+        <Flags(allows_duplicate_labels=True)>
+
+        Flags can be get or set using ``.``
+
+        >>> df.flags.allows_duplicate_labels
+        True
+        >>> df.flags.allows_duplicate_labels = False
+
+        Or by slicing with a key
+
+        >>> df.flags["allows_duplicate_labels"]
+        False
+        >>> df.flags["allows_duplicate_labels"] = True
+        """
+        return self._flags
+
+    @final
+    def set_flags(
+        self,
+        *,
+        copy: bool | lib.NoDefault = lib.no_default,
+        allows_duplicate_labels: bool | None = None,
+    ) -> Self:
+        """
+        Return a new object with updated flags.
+
+        This method creates a shallow copy of the original object, preserving its
+        underlying data while modifying its global flags. In particular, it allows
+        you to update properties such as whether duplicate labels are permitted. This
+        behavior is especially useful in method chains, where one wishes to
+        adjust DataFrame or Series characteristics without altering the original object.
+
+        Parameters
+        ----------
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        allows_duplicate_labels : bool, optional
+            Whether the returned object allows duplicate labels.
+
+        Returns
+        -------
+        Series or DataFrame
+            The same type as the caller.
+
+        See Also
+        --------
+        DataFrame.attrs : Global metadata applying to this dataset.
+        DataFrame.flags : Global flags applying to this object.
+
+        Notes
+        -----
+        This method returns a new object that's a view on the same data
+        as the input. Mutating the input or the output values will be reflected
+        in the other.
+
+        This method is intended to be used in method chains.
+
+        "Flags" differ from "metadata". Flags reflect properties of the
+        pandas object (the Series or DataFrame). Metadata refer to properties
+        of the dataset, and should be stored in :attr:`DataFrame.attrs`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2]})
+        >>> df.flags.allows_duplicate_labels
+        True
+        >>> df2 = df.set_flags(allows_duplicate_labels=False)
+        >>> df2.flags.allows_duplicate_labels
+        False
+        """
+        self._check_copy_deprecation(copy)
+        df = self.copy(deep=False)
+        if allows_duplicate_labels is not None:
+            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
+        return df
+
+    @final
+    @classmethod
+    def _validate_dtype(cls, dtype) -> DtypeObj | None:
+        """validate the passed dtype"""
+        if dtype is not None:
+            dtype = pandas_dtype(dtype)
+
+            # a compound dtype
+            if dtype.kind == "V" and not isinstance(dtype, ExtensionDtype):
+                raise NotImplementedError(
+                    "compound dtypes are not implemented "
+                    f"in the {cls.__name__} constructor"
+                )
+
+        return dtype
+
+    # ----------------------------------------------------------------------
+    # Construction
+
+    # error: Signature of "_constructor" incompatible with supertype "PandasObject"
+    @property
+    def _constructor(self) -> Callable[..., Self]:  # type: ignore[override]
+        """
+        Used when a manipulation result has the same dimensions as the
+        original.
+        """
+        raise AbstractMethodError(self)
+
+    # ----------------------------------------------------------------------
+    # Axis
+    _AXIS_ORDERS: list[Literal["index", "columns"]]
+    _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
+    _info_axis_number: int
+    _info_axis_name: Literal["index", "columns"]
+    _AXIS_LEN: int
+
+    @final
+    def _construct_axes_dict(
+        self, axes: Sequence[Axis] | None = None, **kwargs: AxisInt
+    ) -> dict:
+        """Return an axes dictionary for myself."""
+        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
+        # error: Argument 1 to "update" of "MutableMapping" has incompatible type
+        # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
+        d.update(kwargs)  # type: ignore[arg-type]
+        return d
+
+    @final
+    @classmethod
+    def _get_axis_number(cls, axis: Axis) -> AxisInt:
+        try:
+            return cls._AXIS_TO_AXIS_NUMBER[axis]
+        except KeyError as err:
+            raise ValueError(
+                f"No axis named {axis} for object type {cls.__name__}"
+            ) from err
+
+    @final
+    @classmethod
+    def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
+        axis_number = cls._get_axis_number(axis)
+        return cls._AXIS_ORDERS[axis_number]
+
+    @final
+    def _get_axis(self, axis: Axis) -> Index:
+        axis_number = self._get_axis_number(axis)
+        assert axis_number in {0, 1}
+        return self.index if axis_number == 0 else self.columns
+
+    @final
+    @classmethod
+    def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
+        """Map the axis to the block_manager axis."""
+        axis = cls._get_axis_number(axis)
+        ndim = cls._AXIS_LEN
+        if ndim == 2:
+            # i.e. DataFrame
+            return 1 - axis
+        return axis
+
+    @final
+    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
+        # index or columns
+        axis_index = getattr(self, axis)
+        d = {}
+        prefix = axis[0]
+
+        for i, name in enumerate(axis_index.names):
+            if name is not None:
+                key = level = name
+            else:
+                # prefix with 'i' or 'c' depending on the input axis
+                # e.g., you must do ilevel_0 for the 0th level of an unnamed
+                # multiiindex
+                key = f"{prefix}level_{i}"
+                level = i
+
+            level_values = axis_index.get_level_values(level)
+            s = level_values.to_series()
+            s.index = axis_index
+            d[key] = s
+
+        # put the index/columns itself in the dict
+        if isinstance(axis_index, MultiIndex):
+            dindex = axis_index
+        else:
+            dindex = axis_index.to_series()
+
+        d[axis] = dindex
+        return d
+
+    @final
+    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
+        from pandas.core.computation.parsing import clean_column_name
+
+        d: dict[str, Series | MultiIndex] = {}
+        for axis_name in self._AXIS_ORDERS:
+            d.update(self._get_axis_resolvers(axis_name))
+
+        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}
+
+    @final
+    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
+        """
+        Return the special character free column resolvers of a DataFrame.
+
+        Column names with special characters are 'cleaned up' so that they can
+        be referred to by backtick quoting.
+        Used in :meth:`DataFrame.eval`.
+        """
+        from pandas.core.computation.parsing import clean_column_name
+        from pandas.core.series import Series
+
+        if isinstance(self, ABCSeries):
+            return {clean_column_name(self.name): self}
+
+        dtypes = self.dtypes
+        return {
+            clean_column_name(k): Series(
+                v, copy=False, index=self.index, name=k, dtype=dtype
+            ).__finalize__(self)
+            for k, v, dtype in zip(
+                self.columns,
+                self._iter_column_arrays(),
+                dtypes,
+                strict=True,
+            )
+        }
+
+    @final
+    @property
+    def _info_axis(self) -> Index:
+        return getattr(self, self._info_axis_name)
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        """
+        Return a tuple of axis dimensions
+        """
+        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)
+
+    @property
+    def axes(self) -> list[Index]:
+        """
+        Return index label(s) of the internal NDFrame
+        """
+        # we do it this way because if we have reversed axes, then
+        # the block manager shows then reversed
+        return [self._get_axis(a) for a in self._AXIS_ORDERS]
+
+    @final
+    @property
+    def ndim(self) -> int:
+        """
+        Return an int representing the number of axes / array dimensions.
+
+        Return 1 if Series. Otherwise return 2 if DataFrame.
+
+        See Also
+        --------
+        numpy.ndarray.ndim : Number of array dimensions.
+
+        Examples
+        --------
+        >>> s = pd.Series({"a": 1, "b": 2, "c": 3})
+        >>> s.ndim
+        1
+
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df.ndim
+        2
+        """
+        return self._mgr.ndim
+
+    @final
+    @property
+    def size(self) -> int:
+        """
+        Return an int representing the number of elements in this object.
+
+        Return the number of rows if Series. Otherwise return the number of
+        rows times number of columns if DataFrame.
+
+        See Also
+        --------
+        numpy.ndarray.size : Number of elements in the array.
+
+        Examples
+        --------
+        >>> s = pd.Series({"a": 1, "b": 2, "c": 3})
+        >>> s.size
+        3
+
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df.size
+        4
+        """
+
+        return int(np.prod(self.shape))
+
+    def set_axis(
+        self,
+        labels,
+        *,
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> Self:
+        """
+        Assign desired index to given axis.
+
+        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
+        a list-like or Index.
+
+        Parameters
+        ----------
+        labels : list-like, Index
+            The values for the new index.
+
+        axis : %(axes_single_arg)s, default 0
+            The axis to update. The value 0 identifies the rows. For `Series`
+            this parameter is unused and defaults to 0.
+
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        %(klass)s
+            An object of type %(klass)s.
+
+        See Also
+        --------
+        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
+        """
+        self._check_copy_deprecation(copy)
+        return self._set_axis_nocheck(labels, axis, inplace=False)
+
+    @overload
+    def _set_axis_nocheck(
+        self, labels, axis: Axis, inplace: Literal[False]
+    ) -> Self: ...
+
+    @overload
+    def _set_axis_nocheck(self, labels, axis: Axis, inplace: Literal[True]) -> None: ...
+
+    @overload
+    def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool) -> Self | None: ...
+
+    @final
+    def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool) -> Self | None:
+        if inplace:
+            setattr(self, self._get_axis_name(axis), labels)
+            return None
+        obj = self.copy(deep=False)
+        setattr(obj, obj._get_axis_name(axis), labels)
+        return obj
+
+    @final
+    def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
+        """
+        This is called from the cython code when we set the `index` attribute
+        directly, e.g. `series.index = [1, 2, 3]`.
+        """
+        labels = ensure_index(labels)
+        self._mgr.set_axis(axis, labels)
+
+    @final
+    def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self:
+        """
+        Return Series/DataFrame with requested index / column level(s) removed.
+
+        Parameters
+        ----------
+        level : int, str, or list-like
+            If a string is given, must be the name of a level
+            If list-like, elements must be names or positional indexes
+            of levels.
+
+        axis : {{0 or 'index', 1 or 'columns'}}, default 0
+            Axis along which the level(s) is removed:
+
+            * 0 or 'index': remove level(s) in column.
+            * 1 or 'columns': remove level(s) in row.
+
+            For `Series` this parameter is unused and defaults to 0.
+
+        Returns
+        -------
+        Series/DataFrame
+            Series/DataFrame with requested index / column level(s) removed.
+
+        See Also
+        --------
+        DataFrame.replace : Replace values given in `to_replace` with `value`.
+        DataFrame.pivot : Return reshaped DataFrame organized by given
+            index / column values.
+
+        Examples
+        --------
+        >>> df = (
+        ...     pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
+        ...     .set_index([0, 1])
+        ...     .rename_axis(["a", "b"])
+        ... )
+
+        >>> df.columns = pd.MultiIndex.from_tuples(
+        ...     [("c", "e"), ("d", "f")], names=["level_1", "level_2"]
+        ... )
+
+        >>> df
+        level_1   c   d
+        level_2   e   f
+        a b
+        1 2      3   4
+        5 6      7   8
+        9 10    11  12
+
+        >>> df.droplevel("a")
+        level_1   c   d
+        level_2   e   f
+        b
+        2        3   4
+        6        7   8
+        10      11  12
+
+        >>> df.droplevel("level_2", axis=1)
+        level_1   c   d
+        a b
+        1 2      3   4
+        5 6      7   8
+        9 10    11  12
+        """
+        labels = self._get_axis(axis)
+        new_labels = labels.droplevel(level)
+        return self.set_axis(new_labels, axis=axis)
+
+    def pop(self, item: Hashable) -> Series | Any:
+        result = self[item]
+        del self[item]
+
+        return result
+
+    @final
+    def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
+        """
+        Squeeze 1 dimensional axis objects into scalars.
+
+        Series or DataFrames with a single element are squeezed to a scalar.
+        DataFrames with a single column or a single row are squeezed to a
+        Series. Otherwise the object is unchanged.
+
+        This method is most useful when you don't know if your
+        object is a Series or DataFrame, but you do know it has just a single
+        column. In that case you can safely call `squeeze` to ensure you have a
+        Series.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default None
+            A specific axis to squeeze. By default, all length-1 axes are
+            squeezed. For `Series` this parameter is unused and defaults to `None`.
+
+        Returns
+        -------
+        DataFrame, Series, or scalar
+            The projection after squeezing `axis` or all the axes.
+
+        See Also
+        --------
+        Series.iloc : Integer-location based indexing for selecting scalars.
+        DataFrame.iloc : Integer-location based indexing for selecting Series.
+        Series.to_frame : Inverse of DataFrame.squeeze for a
+            single-column DataFrame.
+
+        Examples
+        --------
+        >>> primes = pd.Series([2, 3, 5, 7])
+
+        Slicing might produce a Series with a single value:
+
+        >>> even_primes = primes[primes % 2 == 0]
+        >>> even_primes
+        0    2
+        dtype: int64
+
+        >>> even_primes.squeeze()
+        np.int64(2)
+
+        Squeezing objects with more than one value in every axis does nothing:
+
+        >>> odd_primes = primes[primes % 2 == 1]
+        >>> odd_primes
+        1    3
+        2    5
+        3    7
+        dtype: int64
+
+        >>> odd_primes.squeeze()
+        1    3
+        2    5
+        3    7
+        dtype: int64
+
+        Squeezing is even more effective when used with DataFrames.
+
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
+        >>> df
+           a  b
+        0  1  2
+        1  3  4
+
+        Slicing a single column will produce a DataFrame with the columns
+        having only one value:
+
+        >>> df_a = df[["a"]]
+        >>> df_a
+           a
+        0  1
+        1  3
+
+        So the columns can be squeezed down, resulting in a Series:
+
+        >>> df_a.squeeze("columns")
+        0    1
+        1    3
+        Name: a, dtype: int64
+
+        Slicing a single row from a single column will produce a single
+        scalar DataFrame:
+
+        >>> df_0a = df.loc[df.index < 1, ["a"]]
+        >>> df_0a
+           a
+        0  1
+
+        Squeezing the rows produces a single scalar Series:
+
+        >>> df_0a.squeeze("rows")
+        a    1
+        Name: 0, dtype: int64
+
+        Squeezing all axes will project directly into a scalar:
+
+        >>> df_0a.squeeze()
+        np.int64(1)
+        """
+        axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
+        result = self.iloc[
+            tuple(
+                0 if i in axes and len(a) == 1 else slice(None)
+                for i, a in enumerate(self.axes)
+            )
+        ]
+        if isinstance(result, NDFrame):
+            result = result.__finalize__(self, method="squeeze")
+        return result
+
+    # ----------------------------------------------------------------------
+    # Rename
+
+    @overload
+    def _rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        inplace: Literal[False] = ...,
+        level: Level | None = ...,
+        errors: str = ...,
+    ) -> Self: ...
+
+    @overload
+    def _rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        inplace: Literal[True],
+        level: Level | None = ...,
+        errors: str = ...,
+    ) -> None: ...
+
+    @overload
+    def _rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        inplace: bool,
+        level: Level | None = ...,
+        errors: str = ...,
+    ) -> Self | None: ...
+
+    @final
+    def _rename(
+        self,
+        mapper: Renamer | None = None,
+        *,
+        index: Renamer | None = None,
+        columns: Renamer | None = None,
+        axis: Axis | None = None,
+        inplace: bool = False,
+        level: Level | None = None,
+        errors: str = "ignore",
+    ) -> Self | None:
+        # called by Series.rename and DataFrame.rename
+
+        if mapper is None and index is None and columns is None:
+            raise TypeError("must pass an index to rename")
+
+        if index is not None or columns is not None:
+            if axis is not None:
+                raise TypeError(
+                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
+                )
+            if mapper is not None:
+                raise TypeError(
+                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
+                )
+        # use the mapper argument
+        elif axis and self._get_axis_number(axis) == 1:
+            columns = mapper
+        else:
+            index = mapper
+
+        self._check_inplace_and_allows_duplicate_labels(inplace)
+        result = self if inplace else self.copy(deep=False)
+
+        for axis_no, replacements in enumerate((index, columns)):
+            if replacements is None:
+                continue
+
+            ax = self._get_axis(axis_no)
+            f = common.get_rename_function(replacements)
+
+            if level is not None:
+                level = ax._get_level_number(level)
+
+            if isinstance(replacements, ABCSeries) and not replacements.index.is_unique:
+                # GH#58621
+                raise ValueError("Cannot rename with a Series with non-unique index.")
+
+            # GH 13473
+            if not callable(replacements):
+                if ax._is_multi and level is not None:
+                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
+                else:
+                    indexer = ax.get_indexer_for(replacements)
+
+                if errors == "raise" and len(indexer[indexer == -1]):
+                    missing_labels = [
+                        label
+                        for index, label in enumerate(replacements)
+                        if indexer[index] == -1
+                    ]
+                    raise KeyError(f"{missing_labels} not found in axis")
+
+            new_index = ax._transform_index(f, level=level)
+            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True)
+
+        if inplace:
+            self._update_inplace(result)
+            return None
+        else:
+            return result.__finalize__(self, method="rename")
+
+    @overload
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = ...,
+        *,
+        index=...,
+        columns=...,
+        axis: Axis = ...,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: Literal[False] = ...,
+    ) -> Self: ...
+
+    @overload
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = ...,
+        *,
+        index=...,
+        columns=...,
+        axis: Axis = ...,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: Literal[True],
+    ) -> None: ...
+
+    @overload
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = ...,
+        *,
+        index=...,
+        columns=...,
+        axis: Axis = ...,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: bool = ...,
+    ) -> Self | None: ...
+
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = lib.no_default,
+        *,
+        index=lib.no_default,
+        columns=lib.no_default,
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: bool = False,
+    ) -> Self | None:
+        """
+        Set the name of the axis for the index or columns.
+
+        Parameters
+        ----------
+        mapper : scalar, list-like, optional
+            Value to set the axis name attribute.
+
+            Use either ``mapper`` and ``axis`` to
+            specify the axis to target with ``mapper``, or ``index``
+            and/or ``columns``.
+        index : scalar, list-like, dict-like or function, optional
+            A scalar, list-like, dict-like or functions transformations to
+            apply to that axis' values.
+        columns : scalar, list-like, dict-like or function, optional
+            A scalar, list-like, dict-like or functions transformations to
+            apply to that axis' values.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to rename.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        inplace : bool, default False
+            Modifies the object directly, instead of creating a new Series
+            or DataFrame.
+
+        Returns
+        -------
+        DataFrame, or None
+            The same type as the caller or None if ``inplace=True``.
+
+        See Also
+        --------
+        Series.rename : Alter Series index labels or name.
+        DataFrame.rename : Alter DataFrame index labels or name.
+        Index.rename : Set new names on index.
+
+        Notes
+        -----
+        ``DataFrame.rename_axis`` supports two calling conventions
+
+        * ``(index=index_mapper, columns=columns_mapper, ...)``
+        * ``(mapper, axis={'index', 'columns'}, ...)``
+
+        The first calling convention will only modify the names of
+        the index and/or the names of the Index object that is the columns.
+        In this case, the parameter ``copy`` is ignored.
+
+        The second calling convention will modify the names of the
+        corresponding index if mapper is a list or a scalar.
+        However, if mapper is dict-like or a function, it will use the
+        deprecated behavior of modifying the axis *labels*.
+
+        We *highly* recommend using keyword arguments to clarify your
+        intent.
+
+        Examples
+        --------
+        **DataFrame**
+
+        >>> df = pd.DataFrame(
+        ...     {"num_legs": [4, 4, 2], "num_arms": [0, 0, 2]}, ["dog", "cat", "monkey"]
+        ... )
+        >>> df
+                num_legs  num_arms
+        dog            4         0
+        cat            4         0
+        monkey         2         2
+        >>> df = df.rename_axis("animal")
+        >>> df
+                num_legs  num_arms
+        animal
+        dog            4         0
+        cat            4         0
+        monkey         2         2
+        >>> df = df.rename_axis("limbs", axis="columns")
+        >>> df
+        limbs   num_legs  num_arms
+        animal
+        dog            4         0
+        cat            4         0
+        monkey         2         2
+
+        **MultiIndex**
+
+        >>> df.index = pd.MultiIndex.from_product(
+        ...     [["mammal"], ["dog", "cat", "monkey"]], names=["type", "name"]
+        ... )
+        >>> df
+        limbs          num_legs  num_arms
+        type   name
+        mammal dog            4         0
+               cat            4         0
+               monkey         2         2
+
+        >>> df.rename_axis(index={"type": "class"})
+        limbs          num_legs  num_arms
+        class  name
+        mammal dog            4         0
+               cat            4         0
+               monkey         2         2
+
+        >>> df.rename_axis(columns=str.upper)
+        LIMBS          num_legs  num_arms
+        type   name
+        mammal dog            4         0
+               cat            4         0
+               monkey         2         2
+        """
+        self._check_copy_deprecation(copy)
+        axes = {"index": index, "columns": columns}
+
+        if axis is not None:
+            axis = self._get_axis_number(axis)
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+
+        if mapper is not lib.no_default:
+            # Use v0.23 behavior if a scalar or list
+            non_mapper = is_scalar(mapper) or (
+                is_list_like(mapper) and not is_dict_like(mapper)
+            )
+            if non_mapper:
+                return self._set_axis_name(mapper, axis=axis, inplace=inplace)
+            else:
+                raise ValueError("Use `.rename` to alter labels with a mapper.")
+        else:
+            # Use new behavior.  Means that index and/or columns
+            # is specified
+            result = self if inplace else self.copy(deep=False)
+
+            for axis in range(self._AXIS_LEN):
+                v = axes.get(self._get_axis_name(axis))
+                if v is lib.no_default:
+                    continue
+                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
+                if non_mapper:
+                    newnames = v
+                else:
+                    f = common.get_rename_function(v)
+                    curnames = self._get_axis(axis).names
+                    newnames = [f(name) for name in curnames]
+                result._set_axis_name(newnames, axis=axis, inplace=True)
+            if not inplace:
+                return result
+            return None
+
+    @overload
+    def _set_axis_name(
+        self, name, axis: Axis = ..., *, inplace: Literal[False] = ...
+    ) -> Self: ...
+
+    @overload
+    def _set_axis_name(
+        self, name, axis: Axis = ..., *, inplace: Literal[True]
+    ) -> None: ...
+
+    @overload
+    def _set_axis_name(
+        self, name, axis: Axis = ..., *, inplace: bool
+    ) -> Self | None: ...
+
+    @final
+    def _set_axis_name(
+        self, name, axis: Axis = 0, *, inplace: bool = False
+    ) -> Self | None:
+        """
+        Set the name(s) of the axis.
+
+        Parameters
+        ----------
+        name : str or list of str
+            Name(s) to set.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to set the label. The value 0 or 'index' specifies index,
+            and the value 1 or 'columns' specifies columns.
+        inplace : bool, default False
+            If `True`, do operation inplace and return None.
+
+        Returns
+        -------
+        Series, DataFrame, or None
+            The same type as the caller or `None` if `inplace` is `True`.
+
+        See Also
+        --------
+        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
+        Series.rename : Alter the index labels or set the index name
+            of :class:`Series`.
+        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]}, ["dog", "cat", "monkey"])
+        >>> df
+                num_legs
+        dog            4
+        cat            4
+        monkey         2
+        >>> df._set_axis_name("animal")
+                num_legs
+        animal
+        dog            4
+        cat            4
+        monkey         2
+        >>> df.index = pd.MultiIndex.from_product(
+        ...     [["mammal"], ["dog", "cat", "monkey"]]
+        ... )
+        >>> df._set_axis_name(["type", "name"])
+                       num_legs
+        type   name
+        mammal dog        4
+               cat        4
+               monkey     2
+        """
+        axis = self._get_axis_number(axis)
+        idx = self._get_axis(axis).set_names(name)
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        renamed = self if inplace else self.copy(deep=False)
+        if axis == 0:
+            renamed.index = idx
+        else:
+            renamed.columns = idx
+
+        if not inplace:
+            return renamed
+        return None
+
+    # ----------------------------------------------------------------------
+    # Comparison Methods
+
+    @final
+    def _indexed_same(self, other) -> bool:
+        return all(
+            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
+        )
+
+    @final
+    def equals(self, other: object) -> bool:
+        """
+        Test whether two objects contain the same elements.
+
+        This function allows two Series or DataFrames to be compared against
+        each other to see if they have the same shape and elements. NaNs in
+        the same location are considered equal.
+
+        The row/column index do not need to have the same type, as long
+        as the values are considered equal. Corresponding columns and
+        index must be of the same dtype.
+
+        Parameters
+        ----------
+        other : Series or DataFrame
+            The other Series or DataFrame to be compared with the first.
+
+        Returns
+        -------
+        bool
+            True if all elements are the same in both objects, False
+            otherwise.
+
+        See Also
+        --------
+        Series.eq : Compare two Series objects of the same length
+            and return a Series where each element is True if the element
+            in each Series is equal, False otherwise.
+        DataFrame.eq : Compare two DataFrame objects of the same shape and
+            return a DataFrame where each element is True if the respective
+            element in each DataFrame is equal, False otherwise.
+        testing.assert_series_equal : Raises an AssertionError if left and
+            right are not equal. Provides an easy interface to ignore
+            inequality in dtypes, indexes and precision among others.
+        testing.assert_frame_equal : Like assert_series_equal, but targets
+            DataFrames.
+        numpy.array_equal : Return True if two arrays have the same shape
+            and elements, False otherwise.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({1: [10], 2: [20]})
+        >>> df
+            1   2
+        0  10  20
+
+        DataFrames df and exactly_equal have the same types and values for
+        their elements and column labels, which will return True.
+
+        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
+        >>> exactly_equal
+            1   2
+        0  10  20
+        >>> df.equals(exactly_equal)
+        True
+
+        DataFrames df and different_column_type have the same element
+        types and values, but have different types for the column labels,
+        which will still return True.
+
+        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
+        >>> different_column_type
+           1.0  2.0
+        0   10   20
+        >>> df.equals(different_column_type)
+        True
+
+        DataFrames df and different_data_type have different types for the
+        same values for their elements, and will return False even though
+        their column labels are the same values and types.
+
+        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
+        >>> different_data_type
+              1     2
+        0  10.0  20.0
+        >>> df.equals(different_data_type)
+        False
+
+        DataFrames with NaN in the same locations compare equal.
+
+        >>> df_nan1 = pd.DataFrame({"a": [1, np.nan], "b": [3, np.nan]})
+        >>> df_nan2 = pd.DataFrame({"a": [1, np.nan], "b": [3, np.nan]})
+        >>> df_nan1.equals(df_nan2)
+        True
+
+        If the NaN values are not in the same locations, they compare unequal.
+
+        >>> df_nan3 = pd.DataFrame({"a": [1, np.nan], "b": [3, 4]})
+        >>> df_nan1.equals(df_nan3)
+        False
+        """
+        if not (isinstance(other, type(self)) or isinstance(self, type(other))):
+            return False
+        other = cast(NDFrame, other)
+        return self._mgr.equals(other._mgr)
+
+    # -------------------------------------------------------------------------
+    # Unary Methods
+
+    @final
+    def __neg__(self) -> Self:
+        def blk_func(values: ArrayLike):
+            if is_bool_dtype(values.dtype):
+                # error: Argument 1 to "inv" has incompatible type "Union
+                # [ExtensionArray, ndarray[Any, Any]]"; expected
+                # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
+                return operator.inv(values)  # type: ignore[arg-type]
+            else:
+                # error: Argument 1 to "neg" has incompatible type "Union
+                # [ExtensionArray, ndarray[Any, Any]]"; expected
+                # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
+                return operator.neg(values)  # type: ignore[arg-type]
+
+        new_data = self._mgr.apply(blk_func)
+        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        return res.__finalize__(self, method="__neg__")
+
+    @final
+    def __pos__(self) -> Self:
+        def blk_func(values: ArrayLike):
+            if is_bool_dtype(values.dtype):
+                return values.copy()
+            else:
+                # error: Argument 1 to "pos" has incompatible type "Union
+                # [ExtensionArray, ndarray[Any, Any]]"; expected
+                # "_SupportsPos[ndarray[Any, dtype[Any]]]"
+                return operator.pos(values)  # type: ignore[arg-type]
+
+        new_data = self._mgr.apply(blk_func)
+        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        return res.__finalize__(self, method="__pos__")
+
+    @final
+    def __invert__(self) -> Self:
+        if not self.size:
+            # inv fails with 0 len
+            return self.copy(deep=False)
+
+        new_data = self._mgr.apply(operator.invert)
+        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        return res.__finalize__(self, method="__invert__")
+
+    @final
+    def __bool__(self) -> NoReturn:
+        raise ValueError(
+            f"The truth value of a {type(self).__name__} is ambiguous. "
+            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
+        )
+
+    @final
+    def abs(self) -> Self:
+        """
+        Return a Series/DataFrame with absolute numeric value of each element.
+
+        This function only applies to elements that are all numeric.
+
+        Returns
+        -------
+        abs
+            Series/DataFrame containing the absolute value of each element.
+
+        See Also
+        --------
+        numpy.absolute : Calculate the absolute value element-wise.
+
+        Notes
+        -----
+        For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
+        :math:`\\sqrt{ a^2 + b^2 }`.
+
+        Examples
+        --------
+        Absolute numeric values in a Series.
+
+        >>> s = pd.Series([-1.10, 2, -3.33, 4])
+        >>> s.abs()
+        0    1.10
+        1    2.00
+        2    3.33
+        3    4.00
+        dtype: float64
+
+        Absolute numeric values in a Series with complex numbers.
+
+        >>> s = pd.Series([1.2 + 1j])
+        >>> s.abs()
+        0    1.56205
+        dtype: float64
+
+        Absolute numeric values in a Series with a Timedelta element.
+
+        >>> s = pd.Series([pd.Timedelta("1 days")])
+        >>> s.abs()
+        0   1 days
+        dtype: timedelta64[us]
+
+        Select rows with data closest to certain value using argsort (from
+        `StackOverflow <https://stackoverflow.com/a/17758115>`__).
+
+        >>> df = pd.DataFrame(
+        ...     {"a": [4, 5, 6, 7], "b": [10, 20, 30, 40], "c": [100, 50, -30, -50]}
+        ... )
+        >>> df
+             a    b    c
+        0    4   10  100
+        1    5   20   50
+        2    6   30  -30
+        3    7   40  -50
+        >>> df.loc[(df.c - 43).abs().argsort()]
+             a    b    c
+        1    5   20   50
+        0    4   10  100
+        2    6   30  -30
+        3    7   40  -50
+        """
+        res_mgr = self._mgr.apply(np.abs)
+        return self._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__(
+            self, name="abs"
+        )
+
+    @final
+    def __abs__(self) -> Self:
+        return self.abs()
+
+    @final
+    def __round__(self, decimals: int = 0) -> Self:
+        return self.round(decimals).__finalize__(self, method="__round__")
+
+    # -------------------------------------------------------------------------
+    # Label or Level Combination Helpers
+    #
+    # A collection of helper methods for DataFrame/Series operations that
+    # accept a combination of column/index labels and levels.  All such
+    # operations should utilize/extend these methods when possible so that we
+    # have consistent precedence and validation logic throughout the library.
+
+    @final
+    def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool:
+        """
+        Test whether a key is a level reference for a given axis.
+
+        To be considered a level reference, `key` must be a string that:
+          - (axis=0): Matches the name of an index level and does NOT match
+            a column label.
+          - (axis=1): Matches the name of a column level and does NOT match
+            an index label.
+
+        Parameters
+        ----------
+        key : Hashable
+            Potential level name for the given axis
+        axis : int, default 0
+            Axis that levels are associated with (0 for index, 1 for columns)
+
+        Returns
+        -------
+        is_level : bool
+        """
+        axis_int = self._get_axis_number(axis)
+
+        return (
+            key is not None
+            and is_hashable(key)
+            and key in self.axes[axis_int].names
+            and not self._is_label_reference(key, axis=axis_int)
+        )
+
+    @final
+    def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool:
+        """
+        Test whether a key is a label reference for a given axis.
+
+        To be considered a label reference, `key` must be a string that:
+          - (axis=0): Matches a column label
+          - (axis=1): Matches an index label
+
+        Parameters
+        ----------
+        key : Hashable
+            Potential label name, i.e. Index entry.
+        axis : int, default 0
+            Axis perpendicular to the axis that labels are associated with
+            (0 means search for column labels, 1 means search for index labels)
+
+        Returns
+        -------
+        is_label: bool
+        """
+        axis_int = self._get_axis_number(axis)
+        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)
+
+        return is_hashable(key) and any(key in self.axes[ax] for ax in other_axes)
+
+    @final
+    def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool:
+        """
+        Test whether a key is a label or level reference for a given axis.
+
+        To be considered either a label or a level reference, `key` must be a
+        string that:
+          - (axis=0): Matches a column label or an index level
+          - (axis=1): Matches an index label or a column level
+
+        Parameters
+        ----------
+        key : Hashable
+            Potential label or level name
+        axis : int, default 0
+            Axis that levels are associated with (0 for index, 1 for columns)
+
+        Returns
+        -------
+        bool
+        """
+        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
+            key, axis=axis
+        )
+
+    @final
+    def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None:
+        """
+        Check whether `key` is ambiguous.
+
+        By ambiguous, we mean that it matches both a level of the input
+        `axis` and a label of the other axis.
+
+        Parameters
+        ----------
+        key : Hashable
+            Label or level name.
+        axis : int, default 0
+            Axis that levels are associated with (0 for index, 1 for columns).
+
+        Raises
+        ------
+        ValueError: `key` is ambiguous
+        """
+
+        axis_int = self._get_axis_number(axis)
+        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)
+
+        if (
+            key is not None
+            and is_hashable(key)
+            and key in self.axes[axis_int].names
+            and any(key in self.axes[ax] for ax in other_axes)
+        ):
+            # Build an informative and grammatical warning
+            level_article, level_type = (
+                ("an", "index") if axis_int == 0 else ("a", "column")
+            )
+
+            label_article, label_type = (
+                ("a", "column") if axis_int == 0 else ("an", "index")
+            )
+
+            msg = (
+                f"'{key}' is both {level_article} {level_type} level and "
+                f"{label_article} {label_type} label, which is ambiguous."
+            )
+            raise ValueError(msg)
+
+    @final
+    def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike:
+        """
+        Return a 1-D array of values associated with `key`, a label or level
+        from the given `axis`.
+
+        Retrieval logic:
+          - (axis=0): Return column values if `key` matches a column label.
+            Otherwise return index level values if `key` matches an index
+            level.
+          - (axis=1): Return row values if `key` matches an index label.
+            Otherwise return column level values if 'key' matches a column
+            level
+
+        Parameters
+        ----------
+        key : Hashable
+            Label or level name.
+        axis : int, default 0
+            Axis that levels are associated with (0 for index, 1 for columns)
+
+        Returns
+        -------
+        np.ndarray or ExtensionArray
+
+        Raises
+        ------
+        KeyError
+            if `key` matches neither a label nor a level
+        ValueError
+            if `key` matches multiple labels
+        """
+        axis = self._get_axis_number(axis)
+        first_other_axes = next(
+            (ax for ax in range(self._AXIS_LEN) if ax != axis), None
+        )
+
+        if self._is_label_reference(key, axis=axis):
+            self._check_label_or_level_ambiguity(key, axis=axis)
+            if first_other_axes is None:
+                raise ValueError("axis matched all axes")
+            values = self.xs(key, axis=first_other_axes)._values
+        elif self._is_level_reference(key, axis=axis):
+            values = self.axes[axis].get_level_values(key)._values
+        else:
+            raise KeyError(key)
+
+        # Check for duplicates
+        if values.ndim > 1:
+            if first_other_axes is not None and isinstance(
+                self._get_axis(first_other_axes), MultiIndex
+            ):
+                multi_message = (
+                    "\n"
+                    "For a multi-index, the label must be a "
+                    "tuple with elements corresponding to each level."
+                )
+            else:
+                multi_message = ""
+
+            label_axis_name = "column" if axis == 0 else "index"
+            raise ValueError(
+                f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
+            )
+
+        return values
+
+    @final
+    def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
+        """
+        Drop labels and/or levels for the given `axis`.
+
+        For each key in `keys`:
+          - (axis=0): If key matches a column label then drop the column.
+            Otherwise if key matches an index level then drop the level.
+          - (axis=1): If key matches an index label then drop the row.
+            Otherwise if key matches a column level then drop the level.
+
+        Parameters
+        ----------
+        keys : str or list of str
+            labels or levels to drop
+        axis : int, default 0
+            Axis that levels are associated with (0 for index, 1 for columns)
+
+        Returns
+        -------
+        dropped: DataFrame
+
+        Raises
+        ------
+        ValueError
+            if any `keys` match neither a label nor a level
+        """
+        axis = self._get_axis_number(axis)
+
+        # Validate keys
+        keys = common.maybe_make_list(keys)
+        invalid_keys = [
+            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
+        ]
+
+        if invalid_keys:
+            raise ValueError(
+                "The following keys are not valid labels or "
+                f"levels for axis {axis}: {invalid_keys}"
+            )
+
+        # Compute levels and labels to drop
+        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]
+
+        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]
+
+        # Perform copy upfront and then use inplace operations below.
+        # This ensures that we always perform exactly one copy.
+        # ``copy`` and/or ``inplace`` options could be added in the future.
+        dropped = self.copy(deep=False)
+
+        if axis == 0:
+            # Handle dropping index levels
+            if levels_to_drop:
+                dropped.reset_index(levels_to_drop, drop=True, inplace=True)
+
+            # Handle dropping columns labels
+            if labels_to_drop:
+                dropped.drop(labels_to_drop, axis=1, inplace=True)
+        else:
+            # Handle dropping column levels
+            if levels_to_drop:
+                if isinstance(dropped.columns, MultiIndex):
+                    # Drop the specified levels from the MultiIndex
+                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
+                else:
+                    # Drop the last level of Index by replacing with
+                    # a RangeIndex
+                    dropped.columns = default_index(dropped.columns.size)
+
+            # Handle dropping index labels
+            if labels_to_drop:
+                dropped.drop(labels_to_drop, axis=0, inplace=True)
+
+        return dropped
+
+    # ----------------------------------------------------------------------
+    # Iteration
+
+    # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
+    # Incompatible types in assignment (expression has type "None", base class
+    # "object" defined the type as "Callable[[object], int]")
+    __hash__: ClassVar[None]  # type: ignore[assignment]
+
+    def __iter__(self) -> Iterator:
+        """
+        Iterate over info axis.
+
+        Returns
+        -------
+        iterator
+            Info axis as iterator.
+
+        See Also
+        --------
+        DataFrame.items : Iterate over (column name, Series) pairs.
+        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+        >>> for x in df:
+        ...     print(x)
+        A
+        B
+        """
+        return iter(self._info_axis)
+
+    # can we get a better explanation of this?
+    def keys(self) -> Index:
+        """
+        Get the 'info axis' (see Indexing for more).
+
+        This is index for Series, columns for DataFrame.
+
+        Returns
+        -------
+        Index
+            Info axis.
+
+        See Also
+        --------
+        DataFrame.index : The index (row labels) of the DataFrame.
+        DataFrame.columns: The column labels of the DataFrame.
+
+        Examples
+        --------
+        >>> d = pd.DataFrame(
+        ...     data={"A": [1, 2, 3], "B": [0, 4, 8]}, index=["a", "b", "c"]
+        ... )
+        >>> d
+           A  B
+        a  1  0
+        b  2  4
+        c  3  8
+        >>> d.keys()
+        Index(['A', 'B'], dtype='str')
+        """
+        return self._info_axis
+
+    def items(self):
+        """
+        Iterate over (label, values) on info axis
+
+        This is index for Series and columns for DataFrame.
+
+        Returns
+        -------
+        Generator
+        """
+        for h in self._info_axis:
+            yield h, self[h]
+
+    def __len__(self) -> int:
+        """Returns length of info axis"""
+        return len(self._info_axis)
+
+    @final
+    def __contains__(self, key) -> bool:
+        """True if the key is in the info axis"""
+        return key in self._info_axis
+
+    @property
+    def empty(self) -> bool:
+        """
+        Indicator whether Series/DataFrame is empty.
+
+        True if Series/DataFrame is entirely empty (no items), meaning any of the
+        axes are of length 0.
+
+        Returns
+        -------
+        bool
+            If Series/DataFrame is empty, return True, if not return False.
+
+        See Also
+        --------
+        Series.dropna : Return series without null values.
+        DataFrame.dropna : Return DataFrame with labels on given axis omitted
+            where (all or any) data are missing.
+
+        Notes
+        -----
+        If Series/DataFrame contains only NaNs, it is still not considered empty. See
+        the example below.
+
+        Examples
+        --------
+        An example of an actual empty DataFrame. Notice the index is empty:
+
+        >>> df_empty = pd.DataFrame({"A": []})
+        >>> df_empty
+        Empty DataFrame
+        Columns: [A]
+        Index: []
+        >>> df_empty.empty
+        True
+
+        If we only have NaNs in our DataFrame, it is not considered empty! We
+        will need to drop the NaNs to make the DataFrame empty:
+
+        >>> df = pd.DataFrame({"A": [np.nan]})
+        >>> df
+            A
+        0 NaN
+        >>> df.empty
+        False
+        >>> df.dropna().empty
+        True
+
+        >>> ser_empty = pd.Series({"A": []})
+        >>> ser_empty
+        A    []
+        dtype: object
+        >>> ser_empty.empty
+        False
+        >>> ser_empty = pd.Series()
+        >>> ser_empty.empty
+        True
+        """
+        return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
+
+    # ----------------------------------------------------------------------
+    # Array Interface
+
+    # This is also set in IndexOpsMixin
+    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
+    __array_priority__: int = 1000
+
+    def __array__(
+        self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
+    ) -> np.ndarray:
+        if copy is False and not self._mgr.is_single_block and not self.empty:
+            # check this manually, otherwise ._values will already return a copy
+            # and np.array(values, copy=False) will not raise an error
+            raise ValueError(
+                "Unable to avoid copy while creating an array as requested."
+            )
+        values = self._values
+        if copy is None:
+            # Note: branch avoids `copy=None` for NumPy 1.x support
+            arr = np.asarray(values, dtype=dtype)
+        else:
+            arr = np.array(values, dtype=dtype, copy=copy)
+
+        if (
+            copy is not True
+            and astype_is_view(values.dtype, arr.dtype)
+            and self._mgr.is_single_block
+        ):
+            # Check if both conversions can be done without a copy
+            if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
+                values.dtype, arr.dtype
+            ):
+                arr = arr.view()
+                arr.flags.writeable = False
+        return arr
+
+    @final
+    def __array_ufunc__(
+        self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
+    ):
+        return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
+
+    # ----------------------------------------------------------------------
+    # Picklability
+
+    @final
+    def __getstate__(self) -> dict[str, Any]:
+        meta = {k: getattr(self, k, None) for k in self._metadata}
+        return {
+            "_mgr": self._mgr,
+            "_typ": self._typ,
+            "_metadata": self._metadata,
+            "attrs": self.attrs,
+            "_flags": {k: self.flags[k] for k in self.flags._keys},
+            **meta,
+        }
+
+    @final
+    def __setstate__(self, state) -> None:
+        if isinstance(state, BlockManager):
+            self._mgr = state
+        elif isinstance(state, dict):
+            if "_data" in state and "_mgr" not in state:
+                # compat for older pickles
+                state["_mgr"] = state.pop("_data")
+            typ = state.get("_typ")
+            if typ is not None:
+                attrs = state.get("_attrs", {})
+                if attrs is None:  # should not happen, but better be on the safe side
+                    attrs = {}
+                object.__setattr__(self, "_attrs", attrs)
+                flags = state.get("_flags", {"allows_duplicate_labels": True})
+                object.__setattr__(self, "_flags", Flags(self, **flags))
+
+                # set in the order of internal names
+                # to avoid definitional recursion
+                # e.g. say fill_value needing _mgr to be
+                # defined
+                meta = set(self._internal_names + self._metadata)
+                for k in meta:
+                    if k in state and k != "_flags":
+                        v = state[k]
+                        object.__setattr__(self, k, v)
+
+                for k, v in state.items():
+                    if k not in meta:
+                        object.__setattr__(self, k, v)
+
+            else:
+                raise NotImplementedError("Pre-0.12 pickles are no longer supported")
+        elif len(state) == 2:
+            raise NotImplementedError("Pre-0.12 pickles are no longer supported")
+
+    # ----------------------------------------------------------------------
+    # Rendering Methods
+
+    def __repr__(self) -> str:
+        # string representation based upon iterating over self
+        # (since, by definition, `PandasContainers` are iterable)
+        prepr = f"[{','.join(map(pprint_thing, self))}]"
+        return f"{type(self).__name__}({prepr})"
+
+    @final
+    def _repr_latex_(self):
+        """
+        Returns a LaTeX representation for a particular object.
+        Mainly for use with nbconvert (jupyter notebook conversion to pdf).
+        """
+        if config.get_option("styler.render.repr") == "latex":
+            return self.to_latex()
+        else:
+            return None
+
+    @final
+    def _repr_data_resource_(self):
+        """
+        Not a real Jupyter special repr method, but we use the same
+        naming convention.
+        """
+        if config.get_option("display.html.table_schema"):
+            data = self.head(config.get_option("display.max_rows"))
+
+            as_json = data.to_json(orient="table")
+            as_json = cast(str, as_json)
+            return loads(as_json, object_pairs_hook=collections.OrderedDict)
+
+    # ----------------------------------------------------------------------
+    # I/O Methods
+
+    @final
+    def to_excel(
+        self,
+        excel_writer: FilePath | WriteExcelBuffer | ExcelWriter,
+        *,
+        sheet_name: str = "Sheet1",
+        na_rep: str = "",
+        float_format: str | None = None,
+        columns: Sequence[Hashable] | None = None,
+        header: Sequence[Hashable] | bool = True,
+        index: bool = True,
+        index_label: IndexLabel | None = None,
+        startrow: int = 0,
+        startcol: int = 0,
+        engine: Literal["openpyxl", "xlsxwriter"] | None = None,
+        merge_cells: bool = True,
+        inf_rep: str = "inf",
+        freeze_panes: tuple[int, int] | None = None,
+        storage_options: StorageOptions | None = None,
+        engine_kwargs: dict[str, Any] | None = None,
+        autofilter: bool = False,
+    ) -> None:
+        """
+        Write object to an Excel sheet.
+
+        To write a single object to an Excel .xlsx file it is only necessary to
+        specify a target file name. To write to multiple sheets it is necessary to
+        create an `ExcelWriter` object with a target file name, and specify a sheet
+        in the file to write to.
+
+        Multiple sheets may be written to by specifying unique `sheet_name`.
+        With all data written to the file it is necessary to save the changes.
+        Note that creating an `ExcelWriter` object with a file name that already exists
+        will overwrite the existing file because the default mode is write.
+
+        Parameters
+        ----------
+        excel_writer : path-like, file-like, or ExcelWriter object
+            File path or existing ExcelWriter.
+        sheet_name : str, default 'Sheet1'
+            Name of sheet which will contain DataFrame.
+        na_rep : str, default ''
+            Missing data representation.
+        float_format : str, optional
+            Format string for floating point numbers. For example
+            ``float_format="%.2f"`` will format 0.1234 to 0.12.
+        columns : sequence or list of str, optional
+            Columns to write.
+        header : bool or list of str, default True
+            Write out the column names. If a list of string is given it is
+            assumed to be aliases for the column names.
+        index : bool, default True
+            Write row names (index).
+        index_label : str or sequence, optional
+            Column label for index column(s) if desired. If not specified, and
+            `header` and `index` are True, then the index names are used. A
+            sequence should be given if the DataFrame uses MultiIndex.
+        startrow : int, default 0
+            Upper left cell row to dump data frame.
+        startcol : int, default 0
+            Upper left cell column to dump data frame.
+        engine : str, optional
+            Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
+            via the options ``io.excel.xlsx.writer`` or
+            ``io.excel.xlsm.writer``.
+        merge_cells : bool or 'columns', default False
+            If True, write MultiIndex index and columns as merged cells.
+            If 'columns', merge MultiIndex column cells only.
+        inf_rep : str, default 'inf'
+            Representation for infinity (there is no native representation for
+            infinity in Excel).
+        freeze_panes : tuple of int (length 2), optional
+            Specifies the one-based bottommost row and rightmost column that
+            is to be frozen.
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        engine_kwargs : dict, optional
+            Arbitrary keyword arguments passed to excel engine.
+        autofilter : bool, default False
+            If True, add automatic filters to all columns.
+
+        See Also
+        --------
+        to_csv : Write DataFrame to a comma-separated values (csv) file.
+        ExcelWriter : Class for writing DataFrame objects into excel sheets.
+        read_excel : Read an Excel file into a pandas DataFrame.
+        read_csv : Read a comma-separated values (csv) file into DataFrame.
+        io.formats.style.Styler.to_excel : Add styles to Excel sheet.
+
+        Notes
+        -----
+        For compatibility with :meth:`~DataFrame.to_csv`,
+        to_excel serializes lists and dicts to strings before writing.
+
+        Once a workbook has been saved it is not possible to write further
+        data without rewriting the whole workbook.
+
+        pandas will check the number of rows, columns,
+        and cell character count does not exceed Excel's limitations.
+        All other limitations must be checked by the user.
+
+        Examples
+        --------
+
+        Create, write to and save a workbook:
+
+        >>> df1 = pd.DataFrame(
+        ...     [["a", "b"], ["c", "d"]],
+        ...     index=["row 1", "row 2"],
+        ...     columns=["col 1", "col 2"],
+        ... )
+        >>> df1.to_excel("output.xlsx")  # doctest: +SKIP
+
+        To specify the sheet name:
+
+        >>> df1.to_excel("output.xlsx", sheet_name="Sheet_name_1")  # doctest: +SKIP
+
+        If you wish to write to more than one sheet in the workbook, it is
+        necessary to specify an ExcelWriter object:
+
+        >>> df2 = df1.copy()
+        >>> with pd.ExcelWriter("output.xlsx") as writer:  # doctest: +SKIP
+        ...     df1.to_excel(writer, sheet_name="Sheet_name_1")
+        ...     df2.to_excel(writer, sheet_name="Sheet_name_2")
+
+        ExcelWriter can also be used to append to an existing Excel file:
+
+        >>> with pd.ExcelWriter("output.xlsx", mode="a") as writer:  # doctest: +SKIP
+        ...     df1.to_excel(writer, sheet_name="Sheet_name_3")
+
+        To set the library that is used to write the Excel file,
+        you can pass the `engine` keyword (the default engine is
+        automatically chosen depending on the file extension):
+
+        >>> df1.to_excel("output1.xlsx", engine="xlsxwriter")  # doctest: +SKIP
+        """
+        if engine_kwargs is None:
+            engine_kwargs = {}
+
+        df = self if isinstance(self, ABCDataFrame) else self.to_frame()
+
+        from pandas.io.formats.excel import ExcelFormatter
+
+        formatter = ExcelFormatter(
+            df,
+            na_rep=na_rep,
+            cols=columns,
+            header=header,
+            float_format=float_format,
+            index=index,
+            index_label=index_label,
+            merge_cells=merge_cells,
+            inf_rep=inf_rep,
+            autofilter=autofilter,
+        )
+        formatter.write(
+            excel_writer,
+            sheet_name=sheet_name,
+            startrow=startrow,
+            startcol=startcol,
+            freeze_panes=freeze_panes,
+            engine=engine,
+            storage_options=storage_options,
+            engine_kwargs=engine_kwargs,
+        )
+
+    @final
+    def to_json(
+        self,
+        path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
+        *,
+        orient: Literal["split", "records", "index", "table", "columns", "values"]
+        | None = None,
+        date_format: str | None = None,
+        double_precision: int = 10,
+        force_ascii: bool = True,
+        date_unit: TimeUnit = "ms",
+        default_handler: Callable[[Any], JSONSerializable] | None = None,
+        lines: bool = False,
+        compression: CompressionOptions = "infer",
+        index: bool | None = None,
+        indent: int | None = None,
+        storage_options: StorageOptions | None = None,
+        mode: Literal["a", "w"] = "w",
+    ) -> str | None:
+        """
+        Convert the object to a JSON string.
+
+        Note NaN's and None will be converted to null and datetime objects
+        will be converted to UNIX timestamps.
+
+        Parameters
+        ----------
+        path_or_buf : str, path object, file-like object, or None, default None
+            String, path object (implementing os.PathLike[str]), or file-like
+            object implementing a write() function. If None, the result is
+            returned as a string.
+        orient : str
+            Indication of expected JSON string format.
+
+            * Series:
+
+                - default is 'index'
+                - allowed values are: {{'split', 'records', 'index', 'table'}}.
+
+            * DataFrame:
+
+                - default is 'columns'
+                - allowed values are: {{'split', 'records', 'index', 'columns',
+                  'values', 'table'}}.
+
+            * The format of the JSON string:
+
+                - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
+                  'data' -> [values]}}
+                - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
+                - 'index' : dict like {{index -> {{column -> value}}}}
+                - 'columns' : dict like {{column -> {{index -> value}}}}
+                - 'values' : just the values array
+                - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
+
+                Describing the data, where data component is like ``orient='records'``.
+
+        date_format : {{None, 'epoch', 'iso'}}
+            Type of date conversion. 'epoch' = epoch milliseconds,
+            'iso' = ISO8601. The default depends on the `orient`. For
+            ``orient='table'``, the default is 'iso'. For all other orients,
+            the default is 'epoch'.
+
+            .. deprecated:: 3.0.0
+                'epoch' date format is deprecated and will be removed in a future
+                version, please use 'iso' instead.
+
+        double_precision : int, default 10
+            The number of decimal places to use when encoding
+            floating point values. The possible maximal value is 15.
+            Passing double_precision greater than 15 will raise a ValueError.
+        force_ascii : bool, default True
+            Force encoded string to be ASCII.
+        date_unit : str, default 'ms' (milliseconds)
+            The time unit to encode to, governs timestamp and ISO8601
+            precision.  One of 's', 'ms', 'us', 'ns' for second, millisecond,
+            microsecond, and nanosecond respectively.
+        default_handler : callable, default None
+            Handler to call if object cannot otherwise be converted to a
+            suitable format for JSON. Should receive a single argument which is
+            the object to convert and return a serialisable object.
+        lines : bool, default False
+            If 'orient' is 'records' write out line-delimited json format. Will
+            throw ValueError if incorrect 'orient' since others are not
+            list-like.
+
+        compression : str or dict, default 'infer'
+            For on-the-fly compression of the output data. If 'infer' and
+            'path_or_buf' is path-like, then detect compression from the following
+            extensions: '.gz',
+            '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+            (otherwise no compression).
+            Set to ``None`` for no compression.
+            Can also be a dict with key ``'method'`` set to one of
+            {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+            other key-value pairs are forwarded to
+            ``zipfile.ZipFile``, ``gzip.GzipFile``,
+            ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+            ``tarfile.TarFile``, respectively.
+            As an example, the following could be passed for faster compression and
+            to create a reproducible gzip archive:
+            ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+        index : bool or None, default None
+            The index is only used when 'orient' is 'split', 'index', 'column',
+            or 'table'. Of these, 'index' and 'column' do not support
+            `index=False`. The string 'index' as a column name with empty :class:`Index`
+            or if it is 'index' will raise a ``ValueError``.
+
+        indent : int, optional
+           Length of whitespace used to indent each record.
+
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        mode : str, default 'w' (writing)
+            Specify the IO mode for output when supplying a path_or_buf.
+            Accepted args are 'w' (writing) and 'a' (append) only.
+            mode='a' is only supported when lines is True and orient is 'records'.
+
+        Returns
+        -------
+        None or str
+            If path_or_buf is None, returns the resulting json format as a
+            string. Otherwise returns None.
+
+        See Also
+        --------
+        read_json : Convert a JSON string to pandas object.
+
+        Notes
+        -----
+        The behavior of ``indent=0`` varies from the stdlib, which does not
+        indent the output but does insert newlines. Currently, ``indent=0``
+        and the default ``indent=None`` are equivalent in pandas, though this
+        may change in a future release.
+
+        ``orient='table'`` contains a 'pandas_version' field under 'schema'.
+        This stores the version of `pandas` used in the latest revision of the
+        schema.
+
+        Examples
+        --------
+        >>> from json import loads, dumps
+        >>> df = pd.DataFrame(
+        ...     [["a", "b"], ["c", "d"]],
+        ...     index=["row 1", "row 2"],
+        ...     columns=["col 1", "col 2"],
+        ... )
+
+        >>> result = df.to_json(orient="split")
+        >>> parsed = loads(result)
+        >>> dumps(parsed, indent=4)  # doctest: +SKIP
+        {{
+            "columns": [
+                "col 1",
+                "col 2"
+            ],
+            "index": [
+                "row 1",
+                "row 2"
+            ],
+            "data": [
+                [
+                    "a",
+                    "b"
+                ],
+                [
+                    "c",
+                    "d"
+                ]
+            ]
+        }}
+
+        Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
+        Note that index labels are not preserved with this encoding.
+
+        >>> result = df.to_json(orient="records")
+        >>> parsed = loads(result)
+        >>> dumps(parsed, indent=4)  # doctest: +SKIP
+        [
+            {{
+                "col 1": "a",
+                "col 2": "b"
+            }},
+            {{
+                "col 1": "c",
+                "col 2": "d"
+            }}
+        ]
+
+        Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
+
+        >>> result = df.to_json(orient="index")
+        >>> parsed = loads(result)
+        >>> dumps(parsed, indent=4)  # doctest: +SKIP
+        {{
+            "row 1": {{
+                "col 1": "a",
+                "col 2": "b"
+            }},
+            "row 2": {{
+                "col 1": "c",
+                "col 2": "d"
+            }}
+        }}
+
+        Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
+
+        >>> result = df.to_json(orient="columns")
+        >>> parsed = loads(result)
+        >>> dumps(parsed, indent=4)  # doctest: +SKIP
+        {{
+            "col 1": {{
+                "row 1": "a",
+                "row 2": "c"
+            }},
+            "col 2": {{
+                "row 1": "b",
+                "row 2": "d"
+            }}
+        }}
+
+        Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
+
+        >>> result = df.to_json(orient="values")
+        >>> parsed = loads(result)
+        >>> dumps(parsed, indent=4)  # doctest: +SKIP
+        [
+            [
+                "a",
+                "b"
+            ],
+            [
+                "c",
+                "d"
+            ]
+        ]
+
+        Encoding with Table Schema:
+
+        >>> result = df.to_json(orient="table")
+        >>> parsed = loads(result)
+        >>> dumps(parsed, indent=4)  # doctest: +SKIP
+        {{
+            "schema": {{
+                "fields": [
+                    {{
+                        "name": "index",
+                        "type": "string"
+                    }},
+                    {{
+                        "name": "col 1",
+                        "type": "string"
+                    }},
+                    {{
+                        "name": "col 2",
+                        "type": "string"
+                    }}
+                ],
+                "primaryKey": [
+                    "index"
+                ],
+                "pandas_version": "1.4.0"
+            }},
+            "data": [
+                {{
+                    "index": "row 1",
+                    "col 1": "a",
+                    "col 2": "b"
+                }},
+                {{
+                    "index": "row 2",
+                    "col 1": "c",
+                    "col 2": "d"
+                }}
+            ]
+        }}
+        """
+        from pandas.io import json
+
+        if date_format is None and orient == "table":
+            date_format = "iso"
+        elif date_format is None:
+            date_format = "epoch"
+            dtypes = self.dtypes if self.ndim == 2 else [self.dtype]
+            if any(dtype.kind in "mM" for dtype in dtypes):
+                warnings.warn(
+                    "The default 'epoch' date format is deprecated and will be removed "
+                    "in a future version, please use 'iso' date format instead.",
+                    Pandas4Warning,
+                    stacklevel=find_stack_level(),
+                )
+        elif date_format == "epoch":
+            # GH#57063
+            warnings.warn(
+                "'epoch' date format is deprecated and will be removed in a future "
+                "version, please use 'iso' date format instead.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
+        config.is_nonnegative_int(indent)
+        indent = indent or 0
+
+        return json.to_json(
+            path_or_buf=path_or_buf,
+            obj=self,
+            orient=orient,
+            date_format=date_format,
+            double_precision=double_precision,
+            force_ascii=force_ascii,
+            date_unit=date_unit,
+            default_handler=default_handler,
+            lines=lines,
+            compression=compression,
+            index=index,
+            indent=indent,
+            storage_options=storage_options,
+            mode=mode,
+        )
+
+    @final
+    def to_hdf(
+        self,
+        path_or_buf: FilePath | HDFStore,
+        *,
+        key: str,
+        mode: Literal["a", "w", "r+"] = "a",
+        complevel: int | None = None,
+        complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None,
+        append: bool = False,
+        format: Literal["fixed", "table"] | None = None,
+        index: bool = True,
+        min_itemsize: int | dict[str, int] | None = None,
+        nan_rep=None,
+        dropna: bool | None = None,
+        data_columns: Literal[True] | list[str] | None = None,
+        errors: OpenFileErrors = "strict",
+        encoding: str = "UTF-8",
+    ) -> None:
+        """
+        Write the contained data to an HDF5 file using HDFStore.
+
+        Hierarchical Data Format (HDF) is self-describing, allowing an
+        application to interpret the structure and contents of a file with
+        no outside information. One HDF file can hold a mix of related objects
+        which can be accessed as a group or as individual objects.
+
+        In order to add another DataFrame or Series to an existing HDF file
+        please use append mode and a different a key.
+
+        .. warning::
+
+           One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
+           but the type of the subclass is lost upon storing.
+
+        For more information see the :ref:`user guide <io.hdf5>`.
+
+        Parameters
+        ----------
+        path_or_buf : str or pandas.HDFStore
+            File path or HDFStore object.
+        key : str
+            Identifier for the group in the store.
+        mode : {'a', 'w', 'r+'}, default 'a'
+            Mode to open file:
+
+            - 'w': write, a new file is created (an existing file with
+              the same name would be deleted).
+            - 'a': append, an existing file is opened for reading and
+              writing, and if the file does not exist it is created.
+            - 'r+': similar to 'a', but the file must already exist.
+        complevel : {0-9}, default None
+            Specifies a compression level for data.
+            A value of 0 or None disables compression.
+        complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
+            Specifies the compression library to be used.
+            These additional compressors for Blosc are supported
+            (default if no compressor specified: 'blosc:blosclz'):
+            {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+            'blosc:zlib', 'blosc:zstd'}.
+            Specifying a compression library which is not available issues
+            a ValueError.
+        append : bool, default False
+            For Table formats, append the input data to the existing.
+        format : {'fixed', 'table', None}, default 'fixed'
+            Possible values:
+
+            - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
+              nor searchable.
+            - 'table': Table format. Write as a PyTables Table structure
+              which may perform worse but allow more flexible operations
+              like searching / selecting subsets of the data.
+            - If None, pd.get_option('io.hdf.default_format') is checked,
+              followed by fallback to "fixed".
+        index : bool, default True
+            Write DataFrame index as a column.
+        min_itemsize : dict or int, optional
+            Map column names to minimum string sizes for columns.
+        nan_rep : Any, optional
+            How to represent null values as str.
+            Not allowed with append=True.
+        dropna : bool, default False, optional
+            Remove missing values.
+        data_columns : list of columns or True, optional
+            List of columns to create as indexed data columns for on-disk
+            queries, or True to use all columns. By default only the axes
+            of the object are indexed. See
+            :ref:`Query via data columns<io.hdf5-query-data-columns>`. for
+            more information.
+            Applicable only to format='table'.
+        errors : str, default 'strict'
+            Specifies how encoding and decoding errors are to be handled.
+            See the errors argument for :func:`open` for a full list
+            of options.
+        encoding : str, default "UTF-8"
+            Set character encoding.
+
+        See Also
+        --------
+        read_hdf : Read from HDF file.
+        DataFrame.to_orc : Write a DataFrame to the binary orc format.
+        DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+        DataFrame.to_sql : Write to a SQL table.
+        DataFrame.to_feather : Write out feather-format for DataFrames.
+        DataFrame.to_csv : Write out to a csv file.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"]
+        ... )  # doctest: +SKIP
+        >>> df.to_hdf("data.h5", key="df", mode="w")  # doctest: +SKIP
+
+        We can add another object to the same file:
+
+        >>> s = pd.Series([1, 2, 3, 4])  # doctest: +SKIP
+        >>> s.to_hdf("data.h5", key="s")  # doctest: +SKIP
+
+        Reading from HDF file:
+
+        >>> pd.read_hdf("data.h5", "df")  # doctest: +SKIP
+        A  B
+        a  1  4
+        b  2  5
+        c  3  6
+        >>> pd.read_hdf("data.h5", "s")  # doctest: +SKIP
+        0    1
+        1    2
+        2    3
+        3    4
+        dtype: int64
+        """
+        from pandas.io import pytables
+
+        # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
+        # "Union[DataFrame, Series]" [arg-type]
+        pytables.to_hdf(
+            path_or_buf,
+            key,
+            self,  # type: ignore[arg-type]
+            mode=mode,
+            complevel=complevel,
+            complib=complib,
+            append=append,
+            format=format,
+            index=index,
+            min_itemsize=min_itemsize,
+            nan_rep=nan_rep,
+            dropna=dropna,
+            data_columns=data_columns,
+            errors=errors,
+            encoding=encoding,
+        )
+
+    @final
+    def to_sql(
+        self,
+        name: str,
+        con,
+        *,
+        schema: str | None = None,
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+        index: bool = True,
+        index_label: IndexLabel | None = None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        method: Literal["multi"] | Callable | None = None,
+    ) -> int | None:
+        """
+        Write records stored in a DataFrame to a SQL database.
+
+        Databases supported by SQLAlchemy [1]_ are supported. Tables can be
+        newly created, appended to, or overwritten.
+
+        .. warning::
+            The pandas library does not attempt to sanitize inputs provided via a to_sql call.
+            Please refer to the documentation for the underlying database driver to see if it
+            will properly prevent injection, or alternatively be advised of a security risk when
+            executing arbitrary commands in a to_sql call.
+
+        Parameters
+        ----------
+        name : str
+            Name of SQL table.
+        con : ADBC connection, sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
+            ADBC provides high performance I/O with native type support, where available.
+            Using SQLAlchemy makes it possible to use any DB supported by that
+            library. Legacy support is provided for sqlite3.Connection objects. The user
+            is responsible for engine disposal and connection closure for the SQLAlchemy
+            connectable. See `here \
+                <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
+            If passing a sqlalchemy.engine.Connection which is already in a transaction,
+            the transaction will not be committed.  If passing a sqlite3.Connection,
+            it will not be possible to roll back the record insertion.
+
+        schema : str, optional
+            Specify the schema (if database flavor supports this). If None, use
+            default schema.
+        if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
+            How to behave if the table already exists.
+
+            * fail: Raise a ValueError.
+            * replace: Drop the table before inserting new values.
+            * append: Insert new values to the existing table.
+            * delete_rows: If a table exists, delete all records and insert data.
+
+        index : bool, default True
+            Write DataFrame index as a column. Uses `index_label` as the column
+            name in the table. Creates a table index for this column.
+        index_label : str or sequence, default None
+            Column label for index column(s). If None is given (default) and
+            `index` is True, then the index names are used.
+            A sequence should be given if the DataFrame uses MultiIndex.
+        chunksize : int, optional
+            Specify the number of rows in each batch to be written to the database connection at a time.
+            By default, all rows will be written at once. Also see the method keyword.
+        dtype : dict or scalar, optional
+            Specifying the datatype for columns. If a dictionary is used, the
+            keys should be the column names and the values should be the
+            SQLAlchemy types or strings for the sqlite3 legacy mode. If a
+            scalar is provided, it will be applied to all columns.
+        method : {None, 'multi', callable}, optional
+            Controls the SQL insertion clause used:
+
+            * None : Uses standard SQL ``INSERT`` clause (one per row).
+            * 'multi': Pass multiple values in a single ``INSERT`` clause.
+            * callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+            Details and a sample callable implementation can be found in the
+            section :ref:`insert method <io.sql.method>`.
+
+        Returns
+        -------
+        None or int
+            Number of rows affected by to_sql. None is returned if the callable
+            passed into ``method`` does not return an integer number of rows.
+
+            The number of returned rows affected is the sum of the ``rowcount``
+            attribute of ``sqlite3.Cursor`` or SQLAlchemy connectable which may not
+            reflect the exact number of written rows as stipulated in the
+            `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
+            `SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>`__.
+
+        Raises
+        ------
+        ValueError
+            When the table already exists and `if_exists` is 'fail' (the
+            default).
+
+        See Also
+        --------
+        read_sql : Read a DataFrame from a table.
+
+        Notes
+        -----
+        Timezone aware datetime columns will be written as
+        ``Timestamp with timezone`` type with SQLAlchemy if supported by the
+        database. Otherwise, the datetimes will be stored as timezone unaware
+        timestamps local to the original timezone.
+
+        Not all datastores support ``method="multi"``. Oracle, for example,
+        does not support multi-value insert.
+
+        References
+        ----------
+        .. [1] https://docs.sqlalchemy.org
+        .. [2] https://www.python.org/dev/peps/pep-0249/
+
+        Examples
+        --------
+        Create an in-memory SQLite database.
+
+        >>> from sqlalchemy import create_engine
+        >>> engine = create_engine('sqlite://', echo=False)
+
+        Create a table from scratch with 3 rows.
+
+        >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
+        >>> df
+             name
+        0  User 1
+        1  User 2
+        2  User 3
+
+        >>> df.to_sql(name='users', con=engine)
+        3
+        >>> from sqlalchemy import text
+        >>> with engine.connect() as conn:
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
+        [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
+
+        An `sqlalchemy.engine.Connection` can also be passed to `con`:
+
+        >>> with engine.begin() as connection:
+        ...     df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
+        ...     df1.to_sql(name='users', con=connection, if_exists='append')
+        2
+
+        This is allowed to support operations that require that the same
+        DBAPI connection is used for the entire operation.
+
+        >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
+        >>> df2.to_sql(name='users', con=engine, if_exists='append')
+        2
+        >>> with engine.connect() as conn:
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
+        [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
+         (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
+         (1, 'User 7')]
+
+        Overwrite the table with just ``df2``.
+
+        >>> df2.to_sql(name='users', con=engine, if_exists='replace',
+        ...            index_label='id')
+        2
+        >>> with engine.connect() as conn:
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
+        [(0, 'User 6'), (1, 'User 7')]
+
+        Delete all rows before inserting new records with ``df3``
+
+        >>> df3 = pd.DataFrame({"name": ['User 8', 'User 9']})
+        >>> df3.to_sql(name='users', con=engine, if_exists='delete_rows',
+        ...            index_label='id')
+        2
+        >>> with engine.connect() as conn:
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
+        [(0, 'User 8'), (1, 'User 9')]
+
+        Use ``method`` to define a callable insertion method to do nothing
+        if there's a primary key conflict on a table in a PostgreSQL database.
+
+        >>> from sqlalchemy.dialects.postgresql import insert
+        >>> def insert_on_conflict_nothing(table, conn, keys, data_iter):
+        ...     # "a" is the primary key in "conflict_table"
+        ...     data = [dict(zip(keys, row)) for row in data_iter]
+        ...     stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"])
+        ...     result = conn.execute(stmt)
+        ...     return result.rowcount
+        >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append",  # noqa: F821
+        ...                    method=insert_on_conflict_nothing)  # doctest: +SKIP
+        0
+
+        For MySQL, a callable to update columns ``b`` and ``c`` if there's a conflict
+        on a primary key.
+
+        >>> from sqlalchemy.dialects.mysql import insert   # noqa: F811
+        >>> def insert_on_conflict_update(table, conn, keys, data_iter):
+        ...     # update columns "b" and "c" on primary key conflict
+        ...     data = [dict(zip(keys, row)) for row in data_iter]
+        ...     stmt = (
+        ...         insert(table.table)
+        ...         .values(data)
+        ...     )
+        ...     stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c)
+        ...     result = conn.execute(stmt)
+        ...     return result.rowcount
+        >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append",  # noqa: F821
+        ...                    method=insert_on_conflict_update)  # doctest: +SKIP
+        2
+
+        Specify the dtype (especially useful for integers with missing values).
+        Notice that while pandas is forced to store the data as floating point,
+        the database supports nullable integers. When fetching the data with
+        Python, we get back integer scalars.
+
+        >>> df = pd.DataFrame({"A": [1, None, 2]})
+        >>> df
+             A
+        0  1.0
+        1  NaN
+        2  2.0
+
+        >>> from sqlalchemy.types import Integer
+        >>> df.to_sql(name='integers', con=engine, index=False,
+        ...           dtype={"A": Integer()})
+        3
+
+        >>> with engine.connect() as conn:
+        ...     conn.execute(text("SELECT * FROM integers")).fetchall()
+        [(1,), (None,), (2,)]
+
+        .. versionadded:: 2.2.0
+
+           pandas now supports writing via ADBC drivers
+
+        >>> df = pd.DataFrame({'name' : ['User 10', 'User 11', 'User 12']})
+        >>> df
+              name
+        0  User 10
+        1  User 11
+        2  User 12
+
+        >>> from adbc_driver_sqlite import dbapi  # doctest:+SKIP
+        >>> with dbapi.connect("sqlite://") as conn:  # doctest:+SKIP
+        ...     df.to_sql(name="users", con=conn)
+        3
+        """  # noqa: E501
+        from pandas.io import sql
+
+        return sql.to_sql(
+            self,
+            name,
+            con,
+            schema=schema,
+            if_exists=if_exists,
+            index=index,
+            index_label=index_label,
+            chunksize=chunksize,
+            dtype=dtype,
+            method=method,
+        )
+
+    @final
+    def to_pickle(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        compression: CompressionOptions = "infer",
+        protocol: int = pickle.HIGHEST_PROTOCOL,
+        storage_options: StorageOptions | None = None,
+    ) -> None:
+        """
+        Pickle (serialize) object to file.
+
+        Parameters
+        ----------
+        path : str, path object, or file-like object
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. File path where
+            the pickled object will be stored.
+
+        compression : str or dict, default 'infer'
+            For on-the-fly compression of the output data. If 'infer' and
+            'path_or_buf' is path-like, then detect compression from the following
+            extensions: '.gz',
+            '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+            (otherwise no compression).
+            Set to ``None`` for no compression.
+            Can also be a dict with key ``'method'`` set to one of
+            {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+            other key-value pairs are forwarded to
+            ``zipfile.ZipFile``, ``gzip.GzipFile``,
+            ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+            ``tarfile.TarFile``, respectively.
+            As an example, the following could be passed for faster compression and
+            to create a reproducible gzip archive:
+            ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+        protocol : int
+            Int which indicates which protocol should be used by the pickler,
+            default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
+            values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
+            parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
+
+            .. [1] https://docs.python.org/3/library/pickle.html.
+
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        See Also
+        --------
+        read_pickle : Load pickled pandas object (or any object) from file.
+        DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+        DataFrame.to_sql : Write DataFrame to a SQL database.
+        DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+
+        Examples
+        --------
+        >>> original_df = pd.DataFrame(
+        ...     {{"foo": range(5), "bar": range(5, 10)}}
+        ... )  # doctest: +SKIP
+        >>> original_df  # doctest: +SKIP
+           foo  bar
+        0    0    5
+        1    1    6
+        2    2    7
+        3    3    8
+        4    4    9
+        >>> original_df.to_pickle("./dummy.pkl")  # doctest: +SKIP
+
+        >>> unpickled_df = pd.read_pickle("./dummy.pkl")  # doctest: +SKIP
+        >>> unpickled_df  # doctest: +SKIP
+           foo  bar
+        0    0    5
+        1    1    6
+        2    2    7
+        3    3    8
+        4    4    9
+        """
+        from pandas.io.pickle import to_pickle
+
+        to_pickle(
+            self,
+            path,
+            compression=compression,
+            protocol=protocol,
+            storage_options=storage_options,
+        )
+
+    @final
+    def to_clipboard(
+        self, *, excel: bool = True, sep: str | None = None, **kwargs
+    ) -> None:
+        r"""
+        Copy object to the system clipboard.
+
+        Write a text representation of object to the system clipboard.
+        This can be pasted into Excel, for example.
+
+        Parameters
+        ----------
+        excel : bool, default True
+            Produce output in a csv format for easy pasting into excel.
+
+            - True, use the provided separator for csv pasting.
+            - False, write a string representation of the object to the clipboard.
+
+        sep : str, default ``'\t'``
+            Field delimiter.
+        **kwargs
+            These parameters will be passed to DataFrame.to_csv.
+
+        See Also
+        --------
+        DataFrame.to_csv : Write a DataFrame to a comma-separated values
+            (csv) file.
+        read_clipboard : Read text from clipboard and pass to read_csv.
+
+        Notes
+        -----
+        Requirements for your platform.
+
+          - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
+          - Windows : none
+          - macOS : none
+
+        This method uses the processes developed for the package `pyperclip`. A
+        solution to render any output string format is given in the examples.
+
+        Examples
+        --------
+        Copy the contents of a DataFrame to the clipboard.
+
+        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
+
+        >>> df.to_clipboard(sep=",")  # doctest: +SKIP
+        ... # Wrote the following to the system clipboard:
+        ... # ,A,B,C
+        ... # 0,1,2,3
+        ... # 1,4,5,6
+
+        We can omit the index by passing the keyword `index` and setting
+        it to false.
+
+        >>> df.to_clipboard(sep=",", index=False)  # doctest: +SKIP
+        ... # Wrote the following to the system clipboard:
+        ... # A,B,C
+        ... # 1,2,3
+        ... # 4,5,6
+
+        Using the original `pyperclip` package for any string output format.
+
+        .. code-block:: python
+
+           import pyperclip
+
+           html = df.style.to_html()
+           pyperclip.copy(html)
+        """
+        from pandas.io import clipboards
+
+        clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
+
+    @final
+    def to_xarray(self):
+        """
+        Return an xarray object from the pandas object.
+
+        Returns
+        -------
+        xarray.DataArray or xarray.Dataset
+            Data in the pandas structure converted to Dataset if the object is
+            a DataFrame, or a DataArray if the object is a Series.
+
+        See Also
+        --------
+        DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+        DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+
+        Notes
+        -----
+        See the `xarray docs <https://xarray.pydata.org/en/stable/>`__
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         ("falcon", "bird", 389.0, 2),
+        ...         ("parrot", "bird", 24.0, 2),
+        ...         ("lion", "mammal", 80.5, 4),
+        ...         ("monkey", "mammal", np.nan, 4),
+        ...     ],
+        ...     columns=["name", "class", "max_speed", "num_legs"],
+        ... )
+        >>> df
+             name   class  max_speed  num_legs
+        0  falcon    bird      389.0         2
+        1  parrot    bird       24.0         2
+        2    lion  mammal       80.5         4
+        3  monkey  mammal        NaN         4
+
+        >>> df.to_xarray()  # doctest: +SKIP
+        <xarray.Dataset>
+        Dimensions:    (index: 4)
+        Coordinates:
+          * index      (index) int64 32B 0 1 2 3
+        Data variables:
+            name       (index) object 32B 'falcon' 'parrot' 'lion' 'monkey'
+            class      (index) object 32B 'bird' 'bird' 'mammal' 'mammal'
+            max_speed  (index) float64 32B 389.0 24.0 80.5 nan
+            num_legs   (index) int64 32B 2 2 4 4
+
+        >>> df["max_speed"].to_xarray()  # doctest: +SKIP
+        <xarray.DataArray 'max_speed' (index: 4)>
+        array([389. ,  24. ,  80.5,   nan])
+        Coordinates:
+          * index    (index) int64 0 1 2 3
+
+        >>> dates = pd.to_datetime(
+        ...     ["2018-01-01", "2018-01-01", "2018-01-02", "2018-01-02"]
+        ... )
+        >>> df_multiindex = pd.DataFrame(
+        ...     {
+        ...         "date": dates,
+        ...         "animal": ["falcon", "parrot", "falcon", "parrot"],
+        ...         "speed": [350, 18, 361, 15],
+        ...     }
+        ... )
+        >>> df_multiindex = df_multiindex.set_index(["date", "animal"])
+
+        >>> df_multiindex
+                           speed
+        date       animal
+        2018-01-01 falcon    350
+                   parrot     18
+        2018-01-02 falcon    361
+                   parrot     15
+
+        >>> df_multiindex.to_xarray()  # doctest: +SKIP
+        <xarray.Dataset>
+        Dimensions:  (date: 2, animal: 2)
+        Coordinates:
+          * date     (date) datetime64[s] 2018-01-01 2018-01-02
+          * animal   (animal) object 'falcon' 'parrot'
+        Data variables:
+            speed    (date, animal) int64 350 18 361 15
+        """
+        xarray = import_optional_dependency("xarray")
+
+        if self.ndim == 1:
+            return xarray.DataArray.from_series(self)
+        else:
+            return xarray.Dataset.from_dataframe(self)
+
+    @overload
+    def to_latex(
+        self,
+        buf: None = ...,
+        *,
+        columns: Sequence[Hashable] | None = ...,
+        header: bool | SequenceNotStr[str] = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        bold_rows: bool = ...,
+        column_format: str | None = ...,
+        longtable: bool | None = ...,
+        escape: bool | None = ...,
+        encoding: str | None = ...,
+        decimal: str = ...,
+        multicolumn: bool | None = ...,
+        multicolumn_format: str | None = ...,
+        multirow: bool | None = ...,
+        caption: str | tuple[str, str] | None = ...,
+        label: str | None = ...,
+        position: str | None = ...,
+    ) -> str: ...
+
+    @overload
+    def to_latex(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        columns: Sequence[Hashable] | None = ...,
+        header: bool | SequenceNotStr[str] = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        bold_rows: bool = ...,
+        column_format: str | None = ...,
+        longtable: bool | None = ...,
+        escape: bool | None = ...,
+        encoding: str | None = ...,
+        decimal: str = ...,
+        multicolumn: bool | None = ...,
+        multicolumn_format: str | None = ...,
+        multirow: bool | None = ...,
+        caption: str | tuple[str, str] | None = ...,
+        label: str | None = ...,
+        position: str | None = ...,
+    ) -> None: ...
+
+    @final
+    def to_latex(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        columns: Sequence[Hashable] | None = None,
+        header: bool | SequenceNotStr[str] = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters: FormattersType | None = None,
+        float_format: FloatFormatType | None = None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        bold_rows: bool = False,
+        column_format: str | None = None,
+        longtable: bool | None = None,
+        escape: bool | None = None,
+        encoding: str | None = None,
+        decimal: str = ".",
+        multicolumn: bool | None = None,
+        multicolumn_format: str | None = None,
+        multirow: bool | None = None,
+        caption: str | tuple[str, str] | None = None,
+        label: str | None = None,
+        position: str | None = None,
+    ) -> str | None:
+        r"""
+        Render object to a LaTeX tabular, longtable, or nested table.
+
+        Requires ``\usepackage{booktabs}``.  The output can be copy/pasted
+        into a main LaTeX document or read from an external file
+        with ``\input{table.tex}``.
+
+        .. versionchanged:: 2.0.0
+           Refactored to use the Styler implementation via jinja2 templating.
+
+        Parameters
+        ----------
+        buf : str, Path or StringIO-like, optional, default None
+            Buffer to write to. If None, the output is returned as a string.
+        columns : list of label, optional
+            The subset of columns to write. Writes all columns by default.
+        header : bool or list of str, default True
+            Write out the column names. If a list of strings is given,
+            it is assumed to be aliases for the column names. Braces must be escaped.
+        index : bool, default True
+            Write row names (index).
+        na_rep : str, default 'NaN'
+            Missing data representation.
+        formatters : list of functions or dict of {str: function}, optional
+            Formatter functions to apply to columns' elements by position or
+            name. The result of each function must be a unicode string.
+            List must be of length equal to the number of columns.
+        float_format : one-parameter function or str, optional, default None
+            Formatter for floating point numbers. For example
+            ``float_format="%.2f"`` and ``float_format="{:0.2f}".format`` will
+            both result in 0.1234 being formatted as 0.12.
+        sparsify : bool, optional
+            Set to False for a DataFrame with a hierarchical index to print
+            every multiindex key at each row. By default, the value will be
+            read from the config module.
+        index_names : bool, default True
+            Prints the names of the indexes.
+        bold_rows : bool, default False
+            Make the row labels bold in the output.
+        column_format : str, optional
+            The columns format as specified in `LaTeX table format
+            <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
+            columns. By default, 'l' will be used for all columns except
+            columns of numbers, which default to 'r'.
+        longtable : bool, optional
+            Use a longtable environment instead of tabular. Requires
+            adding a \usepackage{longtable} to your LaTeX preamble.
+            By default, the value will be read from the pandas config
+            module, and set to `True` if the option ``styler.latex.environment`` is
+            `"longtable"`.
+
+            .. versionchanged:: 2.0.0
+               The pandas option affecting this argument has changed.
+        escape : bool, optional
+            By default, the value will be read from the pandas config
+            module and set to `True` if the option ``styler.format.escape`` is
+            `"latex"`. When set to False prevents from escaping latex special
+            characters in column names.
+
+            .. versionchanged:: 2.0.0
+               The pandas option affecting this argument has changed, as has the
+               default value to `False`.
+        encoding : str, optional
+            A string representing the encoding to use in the output file,
+            defaults to 'utf-8'.
+        decimal : str, default '.'
+            Character recognized as decimal separator, e.g. ',' in Europe.
+        multicolumn : bool, default True
+            Use \multicolumn to enhance MultiIndex columns.
+            The default will be read from the config module, and is set
+            as the option ``styler.sparse.columns``.
+
+            .. versionchanged:: 2.0.0
+               The pandas option affecting this argument has changed.
+        multicolumn_format : str, default 'r'
+            The alignment for multicolumns, similar to `column_format`
+            The default will be read from the config module, and is set as the option
+            ``styler.latex.multicol_align``.
+
+            .. versionchanged:: 2.0.0
+               The pandas option affecting this argument has changed, as has the
+               default value to "r".
+        multirow : bool, default True
+            Use \multirow to enhance MultiIndex rows. Requires adding a
+            \usepackage{multirow} to your LaTeX preamble. Will print
+            centered labels (instead of top-aligned) across the contained
+            rows, separating groups via clines. The default will be read
+            from the pandas config module, and is set as the option
+            ``styler.sparse.index``.
+
+            .. versionchanged:: 2.0.0
+               The pandas option affecting this argument has changed, as has the
+               default value to `True`.
+        caption : str or tuple, optional
+            Tuple (full_caption, short_caption),
+            which results in ``\caption[short_caption]{full_caption}``;
+            if a single string is passed, no short caption will be set.
+        label : str, optional
+            The LaTeX label to be placed inside ``\label{}`` in the output.
+            This is used with ``\ref{}`` in the main ``.tex`` file.
+
+        position : str, optional
+            The LaTeX positional argument for tables, to be placed after
+            ``\begin{}`` in the output.
+
+        Returns
+        -------
+        str or None
+            If buf is None, returns the result as a string. Otherwise returns None.
+
+        See Also
+        --------
+        io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
+            with conditional formatting.
+        DataFrame.to_string : Render a DataFrame to a console-friendly
+            tabular output.
+        DataFrame.to_html : Render a DataFrame as an HTML table.
+
+        Notes
+        -----
+        As of v2.0.0 this method has changed to use the Styler implementation as
+        part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means
+        that ``jinja2`` is a requirement, and needs to be installed, for this method
+        to function. It is advised that users switch to using Styler, since that
+        implementation is more frequently updated and contains much more
+        flexibility with the output.
+
+        Examples
+        --------
+        Convert a general DataFrame to LaTeX with formatting:
+
+        >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
+        ...                        age=[26, 45],
+        ...                        height=[181.23, 177.65]))
+        >>> print(df.to_latex(index=False,
+        ...                   formatters={"name": str.upper},
+        ...                   float_format="{:.1f}".format,
+        ...                   ))  # doctest: +SKIP
+        \begin{tabular}{lrr}
+        \toprule
+        name & age & height \\
+        \midrule
+        RAPHAEL & 26 & 181.2 \\
+        DONATELLO & 45 & 177.7 \\
+        \bottomrule
+        \end{tabular}
+        """
+        # Get defaults from the pandas config
+        if self.ndim == 1:
+            self = self.to_frame()
+        if longtable is None:
+            longtable = config.get_option("styler.latex.environment") == "longtable"
+        if escape is None:
+            escape = config.get_option("styler.format.escape") == "latex"
+        if multicolumn is None:
+            multicolumn = config.get_option("styler.sparse.columns")
+        if multicolumn_format is None:
+            multicolumn_format = config.get_option("styler.latex.multicol_align")
+        if multirow is None:
+            multirow = config.get_option("styler.sparse.index")
+
+        if column_format is not None and not isinstance(column_format, str):
+            raise ValueError("`column_format` must be str or unicode")
+        length = len(self.columns) if columns is None else len(columns)
+        if isinstance(header, (list, tuple)) and len(header) != length:
+            raise ValueError(f"Writing {length} cols but got {len(header)} aliases")
+
+        # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure
+        base_format_ = {
+            "na_rep": na_rep,
+            "escape": "latex" if escape else None,
+            "decimal": decimal,
+        }
+        index_format_: dict[str, Any] = {"axis": 0, **base_format_}
+        column_format_: dict[str, Any] = {"axis": 1, **base_format_}
+
+        if isinstance(float_format, str):
+            float_format_: Callable | None = lambda x: float_format % x
+        else:
+            float_format_ = float_format
+
+        def _wrap(x, alt_format_):
+            if isinstance(x, (float, complex)) and float_format_ is not None:
+                return float_format_(x)
+            else:
+                return alt_format_(x)
+
+        formatters_: list | tuple | dict | Callable | None = None
+        if isinstance(formatters, list):
+            formatters_ = {
+                c: partial(_wrap, alt_format_=formatters[i])
+                for i, c in enumerate(self.columns)
+            }
+        elif isinstance(formatters, dict):
+            index_formatter = formatters.pop("__index__", None)
+            column_formatter = formatters.pop("__columns__", None)
+            if index_formatter is not None:
+                index_format_.update({"formatter": index_formatter})
+            if column_formatter is not None:
+                column_format_.update({"formatter": column_formatter})
+
+            formatters_ = formatters
+            float_columns = self.select_dtypes(include="float").columns
+            for col in float_columns:
+                if col not in formatters.keys():
+                    formatters_.update({col: float_format_})
+        elif formatters is None and float_format is not None:
+            formatters_ = partial(_wrap, alt_format_=lambda v: v)
+        format_index_ = [index_format_, column_format_]
+        format_index_names_ = [index_format_, column_format_]
+
+        # Deal with hiding indexes and relabelling column names
+        hide_: list[dict] = []
+        relabel_index_: list[dict] = []
+        if columns:
+            hide_.append(
+                {
+                    "subset": [c for c in self.columns if c not in columns],
+                    "axis": "columns",
+                }
+            )
+        if header is False:
+            hide_.append({"axis": "columns"})
+        elif isinstance(header, (list, tuple)):
+            relabel_index_.append({"labels": header, "axis": "columns"})
+            format_index_ = [index_format_]  # column_format is overwritten
+
+        if index is False:
+            hide_.append({"axis": "index"})
+        if index_names is False:
+            hide_.append({"names": True, "axis": "index"})
+
+        render_kwargs_ = {
+            "hrules": True,
+            "sparse_index": sparsify,
+            "sparse_columns": sparsify,
+            "environment": "longtable" if longtable else None,
+            "multicol_align": multicolumn_format
+            if multicolumn
+            else f"naive-{multicolumn_format}",
+            "multirow_align": "t" if multirow else "naive",
+            "encoding": encoding,
+            "caption": caption,
+            "label": label,
+            "position": position,
+            "column_format": column_format,
+            "clines": "skip-last;data"
+            if (multirow and isinstance(self.index, MultiIndex))
+            else None,
+            "bold_rows": bold_rows,
+        }
+
+        return self._to_latex_via_styler(
+            buf,
+            hide=hide_,
+            relabel_index=relabel_index_,
+            format={"formatter": formatters_, **base_format_},
+            format_index=format_index_,
+            format_index_names=format_index_names_,
+            render_kwargs=render_kwargs_,
+        )
+
+    @final
+    def _to_latex_via_styler(
+        self,
+        buf=None,
+        *,
+        hide: dict | list[dict] | None = None,
+        relabel_index: dict | list[dict] | None = None,
+        format: dict | list[dict] | None = None,
+        format_index: dict | list[dict] | None = None,
+        format_index_names: dict | list[dict] | None = None,
+        render_kwargs: dict | None = None,
+    ):
+        """
+        Render object to a LaTeX tabular, longtable, or nested table.
+
+        Uses the ``Styler`` implementation with the following, ordered, method chaining:
+
+        .. code-block:: python
+           styler = Styler(DataFrame)
+           styler.hide(**hide)
+           styler.relabel_index(**relabel_index)
+           styler.format(**format)
+           styler.format_index(**format_index)
+           styler.to_latex(buf=buf, **render_kwargs)
+
+        Parameters
+        ----------
+        buf : str, Path or StringIO-like, optional, default None
+            Buffer to write to. If None, the output is returned as a string.
+        hide : dict, list of dict
+            Keyword args to pass to the method call of ``Styler.hide``. If a list will
+            call the method numerous times.
+        relabel_index : dict, list of dict
+            Keyword args to pass to the method of ``Styler.relabel_index``. If a list
+            will call the method numerous times.
+        format : dict, list of dict
+            Keyword args to pass to the method call of ``Styler.format``. If a list will
+            call the method numerous times.
+        format_index : dict, list of dict
+            Keyword args to pass to the method call of ``Styler.format_index``. If a
+            list will call the method numerous times.
+        render_kwargs : dict
+            Keyword args to pass to the method call of ``Styler.to_latex``.
+
+        Returns
+        -------
+        str or None
+            If buf is None, returns the result as a string. Otherwise returns None.
+        """
+        from pandas.io.formats.style import Styler
+
+        self = cast("DataFrame", self)
+        styler = Styler(self, uuid="")
+
+        for kw_name in [
+            "hide",
+            "relabel_index",
+            "format",
+            "format_index",
+            "format_index_names",
+        ]:
+            kw = vars()[kw_name]
+            if isinstance(kw, dict):
+                getattr(styler, kw_name)(**kw)
+            elif isinstance(kw, list):
+                for sub_kw in kw:
+                    getattr(styler, kw_name)(**sub_kw)
+
+        # bold_rows is not a direct kwarg of Styler.to_latex
+        render_kwargs = {} if render_kwargs is None else render_kwargs
+        if render_kwargs.pop("bold_rows"):
+            styler.map_index(lambda v: "textbf:--rwrap;")
+
+        return styler.to_latex(buf=buf, **render_kwargs)
+
+    @overload
+    def to_csv(
+        self,
+        path_or_buf: None = ...,
+        *,
+        sep: str = ...,
+        na_rep: str = ...,
+        float_format: str | Callable | None = ...,
+        columns: Sequence[Hashable] | None = ...,
+        header: bool | list[str] = ...,
+        index: bool = ...,
+        index_label: IndexLabel | None = ...,
+        mode: str = ...,
+        encoding: str | None = ...,
+        compression: CompressionOptions = ...,
+        quoting: int | None = ...,
+        quotechar: str = ...,
+        lineterminator: str | None = ...,
+        chunksize: int | None = ...,
+        date_format: str | None = ...,
+        doublequote: bool = ...,
+        escapechar: str | None = ...,
+        decimal: str = ...,
+        errors: OpenFileErrors = ...,
+        storage_options: StorageOptions = ...,
+    ) -> str: ...
+
+    @overload
+    def to_csv(
+        self,
+        path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
+        *,
+        sep: str = ...,
+        na_rep: str = ...,
+        float_format: str | Callable | None = ...,
+        columns: Sequence[Hashable] | None = ...,
+        header: bool | list[str] = ...,
+        index: bool = ...,
+        index_label: IndexLabel | None = ...,
+        mode: str = ...,
+        encoding: str | None = ...,
+        compression: CompressionOptions = ...,
+        quoting: int | None = ...,
+        quotechar: str = ...,
+        lineterminator: str | None = ...,
+        chunksize: int | None = ...,
+        date_format: str | None = ...,
+        doublequote: bool = ...,
+        escapechar: str | None = ...,
+        decimal: str = ...,
+        errors: OpenFileErrors = ...,
+        storage_options: StorageOptions = ...,
+    ) -> None: ...
+
+    @final
+    def to_csv(
+        self,
+        path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
+        *,
+        sep: str = ",",
+        na_rep: str = "",
+        float_format: str | Callable | None = None,
+        columns: Sequence[Hashable] | None = None,
+        header: bool | list[str] = True,
+        index: bool = True,
+        index_label: IndexLabel | None = None,
+        mode: str = "w",
+        encoding: str | None = None,
+        compression: CompressionOptions = "infer",
+        quoting: int | None = None,
+        quotechar: str = '"',
+        lineterminator: str | None = None,
+        chunksize: int | None = None,
+        date_format: str | None = None,
+        doublequote: bool = True,
+        escapechar: str | None = None,
+        decimal: str = ".",
+        errors: OpenFileErrors = "strict",
+        storage_options: StorageOptions | None = None,
+    ) -> str | None:
+        r"""
+        Write object to a comma-separated values (csv) file.
+
+        Parameters
+        ----------
+        path_or_buf : str, path object, file-like object, or None, default None
+            String, path object (implementing os.PathLike[str]), or file-like
+            object implementing a write() function. If None, the result is
+            returned as a string. If a non-binary file object is passed, it should
+            be opened with `newline=''`, disabling universal newlines. If a binary
+            file object is passed, `mode` might need to contain a `'b'`.
+        sep : str, default ','
+            String of length 1. Field delimiter for the output file.
+        na_rep : str, default ''
+            Missing data representation.
+        float_format : str, Callable, default None
+            Format string for floating point numbers. If a Callable is given, it takes
+            precedence over other numeric formatting parameters, like decimal.
+        columns : sequence, optional
+            Columns to write.
+        header : bool or list of str, default True
+            Write out the column names. If a list of strings is given it is
+            assumed to be aliases for the column names.
+        index : bool, default True
+            Write row names (index).
+        index_label : str or sequence, or False, default None
+            Column label for index column(s) if desired. If None is given, and
+            `header` and `index` are True, then the index names are used. A
+            sequence should be given if the object uses MultiIndex. If
+            False do not print fields for index names. Use index_label=False
+            for easier importing in R.
+        mode : {{'w', 'x', 'a'}}, default 'w'
+            Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
+            the file opening. Typical values include:
+
+            - 'w', truncate the file first.
+            - 'x', exclusive creation, failing if the file already exists.
+            - 'a', append to the end of file if it exists.
+
+        encoding : str, optional
+            A string representing the encoding to use in the output file,
+            defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
+            is a non-binary file object.
+
+        compression : str or dict, default 'infer'
+            For on-the-fly compression of the output data. If 'infer' and
+            'path_or_buf' is path-like, then detect compression from the following
+            extensions: '.gz',
+            '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+            (otherwise no compression).
+            Set to ``None`` for no compression.
+            Can also be a dict with key ``'method'`` set to one of
+            {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+            other key-value pairs are forwarded to
+            ``zipfile.ZipFile``, ``gzip.GzipFile``,
+            ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+            ``tarfile.TarFile``, respectively.
+            As an example, the following could be passed for faster compression and
+            to create a reproducible gzip archive:
+            ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+               May be a dict with key 'method' as compression mode
+               and other entries as additional compression options if
+               compression mode is 'zip'.
+
+               Passing compression options as keys in dict is
+               supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
+        quoting : optional constant from csv module
+            Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
+            then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
+            will treat them as non-numeric.
+        quotechar : str, default '\"'
+            String of length 1. Character used to quote fields.
+        lineterminator : str, optional
+            The newline character or character sequence to use in the output
+            file. Defaults to `os.linesep`, which depends on the OS in which
+            this method is called ('\\n' for linux, '\\r\\n' for Windows, i.e.).
+        chunksize : int or None
+            Rows to write at a time.
+        date_format : str, default None
+            Format string for datetime objects.
+        doublequote : bool, default True
+            Control quoting of `quotechar` inside a field.
+        escapechar : str, default None
+            String of length 1. Character used to escape `sep` and `quotechar`
+            when appropriate.
+        decimal : str, default '.'
+            Character recognized as decimal separator. E.g. use ',' for
+            European data.
+        errors : str, default 'strict'
+            Specifies how encoding and decoding errors are to be handled.
+            See the errors argument for :func:`open` for a full list
+            of options.
+
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        Returns
+        -------
+        None or str
+            If path_or_buf is None, returns the resulting csv format as a
+            string. Otherwise returns None.
+
+        See Also
+        --------
+        read_csv : Load a CSV file into a DataFrame.
+        to_excel : Write DataFrame to an Excel file.
+
+        Examples
+        --------
+        Create 'out.csv' containing 'df' without indices
+
+        >>> df = pd.DataFrame(
+        ...     [["Raphael", "red", "sai"], ["Donatello", "purple", "bo staff"]],
+        ...     columns=["name", "mask", "weapon"],
+        ... )
+        >>> df.to_csv("out.csv", index=False)  # doctest: +SKIP
+
+        Create 'out.zip' containing 'out.csv'
+
+        >>> df.to_csv(index=False)
+        'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
+        >>> compression_opts = dict(
+        ...     method="zip", archive_name="out.csv"
+        ... )  # doctest: +SKIP
+        >>> df.to_csv(
+        ...     "out.zip", index=False, compression=compression_opts
+        ... )  # doctest: +SKIP
+
+        To write a csv file to a new folder or nested folder you will first
+        need to create it using either Pathlib or os:
+
+        >>> from pathlib import Path  # doctest: +SKIP
+        >>> filepath = Path("folder/subfolder/out.csv")  # doctest: +SKIP
+        >>> filepath.parent.mkdir(parents=True, exist_ok=True)  # doctest: +SKIP
+        >>> df.to_csv(filepath)  # doctest: +SKIP
+
+        >>> import os  # doctest: +SKIP
+        >>> os.makedirs("folder/subfolder", exist_ok=True)  # doctest: +SKIP
+        >>> df.to_csv("folder/subfolder/out.csv")  # doctest: +SKIP
+
+        Format floats to two decimal places:
+
+        >>> df.to_csv("out1.csv", float_format="%.2f")  # doctest: +SKIP
+
+        Format floats using scientific notation:
+
+        >>> df.to_csv("out2.csv", float_format="{{:.2e}}".format)  # doctest: +SKIP
+        """
+        df = self if isinstance(self, ABCDataFrame) else self.to_frame()
+
+        formatter = DataFrameFormatter(
+            frame=df,
+            header=header,
+            index=index,
+            na_rep=na_rep,
+            float_format=float_format,
+            decimal=decimal,
+        )
+
+        return DataFrameRenderer(formatter).to_csv(
+            path_or_buf,
+            lineterminator=lineterminator,
+            sep=sep,
+            encoding=encoding,
+            errors=errors,
+            compression=compression,
+            quoting=quoting,
+            columns=columns,
+            index_label=index_label,
+            mode=mode,
+            chunksize=chunksize,
+            quotechar=quotechar,
+            date_format=date_format,
+            doublequote=doublequote,
+            escapechar=escapechar,
+            storage_options=storage_options,
+        )
+
+    # ----------------------------------------------------------------------
+    # Indexing Methods
+
+    @final
+    def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
+        """
+        Return the elements in the given *positional* indices along an axis.
+
+        This means that we are not indexing according to actual values in
+        the index attribute of the object. We are indexing according to the
+        actual position of the element in the object.
+
+        Parameters
+        ----------
+        indices : array-like
+            An array of ints indicating which positions to take.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis on which to select elements. ``0`` means that we are
+            selecting rows, ``1`` means that we are selecting columns.
+            For `Series` this parameter is unused and defaults to 0.
+        **kwargs
+            For compatibility with :meth:`numpy.take`. Has no effect on the
+            output.
+
+        Returns
+        -------
+        same type as caller
+            An array-like containing the elements taken from the object.
+
+        See Also
+        --------
+        DataFrame.loc : Select a subset of a DataFrame by labels.
+        DataFrame.iloc : Select a subset of a DataFrame by positions.
+        numpy.take : Take elements from an array along an axis.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         ("falcon", "bird", 389.0),
+        ...         ("parrot", "bird", 24.0),
+        ...         ("lion", "mammal", 80.5),
+        ...         ("monkey", "mammal", np.nan),
+        ...     ],
+        ...     columns=["name", "class", "max_speed"],
+        ...     index=[0, 2, 3, 1],
+        ... )
+        >>> df
+             name   class  max_speed
+        0  falcon    bird      389.0
+        2  parrot    bird       24.0
+        3    lion  mammal       80.5
+        1  monkey  mammal        NaN
+
+        Take elements at positions 0 and 3 along the axis 0 (default).
+
+        Note how the actual indices selected (0 and 1) do not correspond to
+        our selected indices 0 and 3. That's because we are selecting the 0th
+        and 3rd rows, not rows whose indices equal 0 and 3.
+
+        >>> df.take([0, 3])
+             name   class  max_speed
+        0  falcon    bird      389.0
+        1  monkey  mammal        NaN
+
+        Take elements at indices 1 and 2 along the axis 1 (column selection).
+
+        >>> df.take([1, 2], axis=1)
+            class  max_speed
+        0    bird      389.0
+        2    bird       24.0
+        3  mammal       80.5
+        1  mammal        NaN
+
+        We may take elements using negative integers for positive indices,
+        starting from the end of the object, just like with Python lists.
+
+        >>> df.take([-1, -2])
+             name   class  max_speed
+        1  monkey  mammal        NaN
+        3    lion  mammal       80.5
+        """
+
+        nv.validate_take((), kwargs)
+
+        if isinstance(indices, slice):
+            raise TypeError(
+                f"{type(self).__name__}.take requires a sequence of integers, "
+                "not slice."
+            )
+        indices = np.asarray(indices, dtype=np.intp)
+        if axis == 0 and indices.ndim == 1 and is_range_indexer(indices, len(self)):
+            return self.copy(deep=False)
+
+        new_data = self._mgr.take(
+            indices,
+            axis=self._get_block_manager_axis(axis),
+            verify=True,
+        )
+        return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
+            self, method="take"
+        )
+
+    @final
+    def xs(
+        self,
+        key: IndexLabel,
+        axis: Axis = 0,
+        level: IndexLabel | None = None,
+        drop_level: bool = True,
+    ) -> Self:
+        """
+        Return cross-section from the Series/DataFrame.
+
+        This method takes a `key` argument to select data at a particular
+        level of a MultiIndex.
+
+        Parameters
+        ----------
+        key : label or tuple of label
+            Label contained in the index, or partially in a MultiIndex.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Axis to retrieve cross-section on.
+        level : object, defaults to first n levels (n=1 or len(key))
+            In case of a key partially contained in a MultiIndex, indicate
+            which levels are used. Levels can be referred by label or position.
+        drop_level : bool, default True
+            If False, returns object with same levels as self.
+
+        Returns
+        -------
+        Series or DataFrame
+            Cross-section from the original Series or DataFrame
+            corresponding to the selected index levels.
+
+        See Also
+        --------
+        DataFrame.loc : Access a group of rows and columns
+            by label(s) or a boolean array.
+        DataFrame.iloc : Purely integer-location based indexing
+            for selection by position.
+
+        Notes
+        -----
+        `xs` can not be used to set values.
+
+        MultiIndex Slicers is a generic way to get/set values on
+        any level or levels.
+        It is a superset of `xs` functionality, see
+        :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
+
+        Examples
+        --------
+        >>> d = {
+        ...     "num_legs": [4, 4, 2, 2],
+        ...     "num_wings": [0, 0, 2, 2],
+        ...     "class": ["mammal", "mammal", "mammal", "bird"],
+        ...     "animal": ["cat", "dog", "bat", "penguin"],
+        ...     "locomotion": ["walks", "walks", "flies", "walks"],
+        ... }
+        >>> df = pd.DataFrame(data=d)
+        >>> df = df.set_index(["class", "animal", "locomotion"])
+        >>> df
+                                   num_legs  num_wings
+        class  animal  locomotion
+        mammal cat     walks              4          0
+               dog     walks              4          0
+               bat     flies              2          2
+        bird   penguin walks              2          2
+
+        Get values at specified index
+
+        >>> df.xs("mammal")
+                           num_legs  num_wings
+        animal locomotion
+        cat    walks              4          0
+        dog    walks              4          0
+        bat    flies              2          2
+
+        Get values at several indexes
+
+        >>> df.xs(("mammal", "dog", "walks"))
+        num_legs     4
+        num_wings    0
+        Name: (mammal, dog, walks), dtype: int64
+
+        Get values at specified index and level
+
+        >>> df.xs("cat", level=1)
+                           num_legs  num_wings
+        class  locomotion
+        mammal walks              4          0
+
+        Get values at several indexes and levels
+
+        >>> df.xs(("bird", "walks"), level=[0, "locomotion"])
+                 num_legs  num_wings
+        animal
+        penguin         2          2
+
+        Get values at specified column and axis
+
+        >>> df.xs("num_wings", axis=1)
+        class   animal   locomotion
+        mammal  cat      walks         0
+                dog      walks         0
+                bat      flies         2
+        bird    penguin  walks         2
+        Name: num_wings, dtype: int64
+        """
+        axis = self._get_axis_number(axis)
+        labels = self._get_axis(axis)
+
+        if isinstance(key, list):
+            raise TypeError("list keys are not supported in xs, pass a tuple instead")
+
+        if level is not None:
+            if not isinstance(labels, MultiIndex):
+                raise TypeError("Index must be a MultiIndex")
+            loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
+
+            # create the tuple of the indexer
+            _indexer = [slice(None)] * self.ndim
+            _indexer[axis] = loc
+            indexer = tuple(_indexer)
+
+            result = self.iloc[indexer]
+            setattr(result, result._get_axis_name(axis), new_ax)
+            return result
+
+        if axis == 1:
+            if drop_level:
+                return self[key]
+            index = self.columns
+        else:
+            index = self.index
+
+        if isinstance(index, MultiIndex):
+            loc, new_index = index._get_loc_level(key, level=0)
+            if not drop_level:
+                if lib.is_integer(loc):
+                    # Slice index must be an integer or None
+                    new_index = index[loc : loc + 1]
+                else:
+                    new_index = index[loc]
+        else:
+            loc = index.get_loc(key)
+
+            if isinstance(loc, np.ndarray):
+                if loc.dtype == np.bool_:
+                    (inds,) = loc.nonzero()
+                    return self.take(inds, axis=axis)
+                else:
+                    return self.take(loc, axis=axis)
+
+            if not is_scalar(loc):
+                new_index = index[loc]
+
+        if is_scalar(loc) and axis == 0:
+            # In this case loc should be an integer
+            if self.ndim == 1:
+                # if we encounter an array-like and we only have 1 dim
+                # that means that their are list/ndarrays inside the Series!
+                # so just return them (GH 6394)
+                return self._values[loc]
+
+            new_mgr = self._mgr.fast_xs(loc)
+
+            result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
+            result._name = self.index[loc]
+            result = result.__finalize__(self)
+        elif is_scalar(loc):
+            result = self.iloc[:, slice(loc, loc + 1)]
+        elif axis == 1:
+            result = self.iloc[:, loc]
+        else:
+            result = self.iloc[loc]
+            result.index = new_index
+
+        return result
+
+    def __getitem__(self, item):
+        raise AbstractMethodError(self)
+
+    @final
+    def _getitem_slice(self, key: slice) -> Self:
+        """
+        __getitem__ for the case where the key is a slice object.
+        """
+        # _convert_slice_indexer to determine if this slice is positional
+        #  or label based, and if the latter, convert to positional
+        slobj = self.index._convert_slice_indexer(key, kind="getitem")
+        if isinstance(slobj, np.ndarray):
+            # reachable with DatetimeIndex
+            indexer = lib.maybe_indices_to_slice(slobj.astype(np.intp), len(self))
+            if isinstance(indexer, np.ndarray):
+                # GH#43223 If we can not convert, use take
+                return self.take(indexer, axis=0)
+            slobj = indexer
+        return self._slice(slobj)
+
+    def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
+        """
+        Construct a slice of this container.
+
+        Slicing with this method is *always* positional.
+        """
+        assert isinstance(slobj, slice), type(slobj)
+        axis = self._get_block_manager_axis(axis)
+        new_mgr = self._mgr.get_slice(slobj, axis=axis)
+        result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+        result = result.__finalize__(self)
+        return result
+
+    @final
+    def __delitem__(self, key) -> None:
+        """
+        Delete item
+        """
+        deleted = False
+
+        maybe_shortcut = False
+        if self.ndim == 2 and isinstance(self.columns, MultiIndex):
+            try:
+                # By using engine's __contains__ we effectively
+                # restrict to same-length tuples
+                maybe_shortcut = key not in self.columns._engine
+            except TypeError:
+                pass
+
+        if maybe_shortcut:
+            # Allow shorthand to delete all columns whose first len(key)
+            # elements match key:
+            if not isinstance(key, tuple):
+                key = (key,)
+            for col in self.columns:
+                if isinstance(col, tuple) and col[: len(key)] == key:
+                    del self[col]
+                    deleted = True
+        if not deleted:
+            # If the above loop ran and didn't delete anything because
+            # there was no match, this call should raise the appropriate
+            # exception:
+            loc = self.axes[-1].get_loc(key)
+            self._mgr = self._mgr.idelete(loc)
+
+    # ----------------------------------------------------------------------
+    # Unsorted
+
+    @final
+    def _check_inplace_and_allows_duplicate_labels(self, inplace: bool) -> None:
+        if inplace and not self.flags.allows_duplicate_labels:
+            raise ValueError(
+                "Cannot specify 'inplace=True' when "
+                "'self.flags.allows_duplicate_labels' is False."
+            )
+
+    @final
+    def get(self, key, default=None):
+        """
+        Get item from object for given key (ex: DataFrame column).
+
+        Returns ``default`` value if not found.
+
+        Parameters
+        ----------
+        key : object
+            Key for which item should be returned.
+        default : object, default None
+            Default value to return if key is not found.
+
+        Returns
+        -------
+        same type as items contained in object
+            Item for given key or ``default`` value, if key is not found.
+
+        See Also
+        --------
+        DataFrame.get : Get item from object for given key (ex: DataFrame column).
+        Series.get : Get item from object for given key (ex: DataFrame column).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         [24.3, 75.7, "high"],
+        ...         [31, 87.8, "high"],
+        ...         [22, 71.6, "medium"],
+        ...         [35, 95, "medium"],
+        ...     ],
+        ...     columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
+        ...     index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
+        ... )
+
+        >>> df
+                    temp_celsius  temp_fahrenheit windspeed
+        2014-02-12          24.3             75.7      high
+        2014-02-13          31.0             87.8      high
+        2014-02-14          22.0             71.6    medium
+        2014-02-15          35.0             95.0    medium
+
+        >>> df.get(["temp_celsius", "windspeed"])
+                    temp_celsius windspeed
+        2014-02-12          24.3      high
+        2014-02-13          31.0      high
+        2014-02-14          22.0    medium
+        2014-02-15          35.0    medium
+
+        >>> ser = df["windspeed"]
+        >>> ser.get("2014-02-13")
+        'high'
+
+        If the key isn't found, the default value will be used.
+
+        >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
+        'default_value'
+
+        >>> ser.get("2014-02-10", "[unknown]")
+        '[unknown]'
+        """
+        try:
+            return self[key]
+        except (KeyError, ValueError, IndexError):
+            return default
+
+    @staticmethod
+    def _check_copy_deprecation(copy):
+        if copy is not lib.no_default:
+            warnings.warn(
+                "The copy keyword is deprecated and will be removed in a future "
+                "version. Copy-on-Write is active in pandas since 3.0 which utilizes "
+                "a lazy copy mechanism that defers copies until necessary. Use "
+                ".copy() to make an eager copy if necessary.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
+    # issue 58667
+    @deprecate_kwarg(Pandas4Warning, "method", new_arg_name=None)
+    @final
+    def reindex_like(
+        self,
+        other,
+        method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        limit: int | None = None,
+        tolerance=None,
+    ) -> Self:
+        """
+        Return an object with matching indices as other object.
+
+        Conform the object to the same index on all axes. Optional
+        filling logic, placing NaN in locations having no value
+        in the previous index. A new object is produced unless the
+        new index is equivalent to the current one and copy=False.
+
+        Parameters
+        ----------
+        other : Object of the same data type
+            Its row and column indices are used to define the new indices
+            of this object.
+        method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
+            Method to use for filling holes in reindexed DataFrame.
+            Please note: this is only applicable to DataFrames/Series with a
+            monotonically increasing/decreasing index.
+
+            .. deprecated:: 3.0.0
+
+            * None (default): don't fill gaps
+            * pad / ffill: propagate last valid observation forward to next
+              valid
+            * backfill / bfill: use next valid observation to fill gap
+            * nearest: use nearest valid observations to fill gap.
+
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        limit : int, default None
+            Maximum number of consecutive labels to fill for inexact matches.
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations must
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. List-like includes list, tuple, array, Series, and must be
+            the same size as the index and its dtype must exactly match the
+            index's type.
+
+        Returns
+        -------
+        Series or DataFrame
+            Same type as caller, but with changed indices on each axis.
+
+        See Also
+        --------
+        DataFrame.set_index : Set row labels.
+        DataFrame.reset_index : Remove row labels or move them to new columns.
+        DataFrame.reindex : Change to new indices or expand indices.
+
+        Notes
+        -----
+        Same as calling
+        ``.reindex(index=other.index, columns=other.columns,...)``.
+
+        Examples
+        --------
+        >>> df1 = pd.DataFrame(
+        ...     [
+        ...         [24.3, 75.7, "high"],
+        ...         [31, 87.8, "high"],
+        ...         [22, 71.6, "medium"],
+        ...         [35, 95, "medium"],
+        ...     ],
+        ...     columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
+        ...     index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
+        ... )
+
+        >>> df1
+                    temp_celsius  temp_fahrenheit windspeed
+        2014-02-12          24.3             75.7      high
+        2014-02-13          31.0             87.8      high
+        2014-02-14          22.0             71.6    medium
+        2014-02-15          35.0             95.0    medium
+
+        >>> df2 = pd.DataFrame(
+        ...     [[28, "low"], [30, "low"], [35.1, "medium"]],
+        ...     columns=["temp_celsius", "windspeed"],
+        ...     index=pd.DatetimeIndex(["2014-02-12", "2014-02-13", "2014-02-15"]),
+        ... )
+
+        >>> df2
+                    temp_celsius windspeed
+        2014-02-12          28.0       low
+        2014-02-13          30.0       low
+        2014-02-15          35.1    medium
+
+        >>> df2.reindex_like(df1)
+                    temp_celsius  temp_fahrenheit windspeed
+        2014-02-12          28.0              NaN       low
+        2014-02-13          30.0              NaN       low
+        2014-02-14           NaN              NaN       NaN
+        2014-02-15          35.1              NaN    medium
+        """
+        self._check_copy_deprecation(copy)
+        d = other._construct_axes_dict(
+            axes=self._AXIS_ORDERS,
+            method=method,
+            limit=limit,
+            tolerance=tolerance,
+        )
+
+        return self.reindex(**d)
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level | None = ...,
+        inplace: Literal[True],
+        errors: IgnoreRaise = ...,
+    ) -> None: ...
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level | None = ...,
+        inplace: Literal[False] = ...,
+        errors: IgnoreRaise = ...,
+    ) -> Self: ...
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level | None = ...,
+        inplace: bool = ...,
+        errors: IgnoreRaise = ...,
+    ) -> Self | None: ...
+
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = None,
+        *,
+        axis: Axis = 0,
+        index: IndexLabel | ListLike = None,
+        columns: IndexLabel | ListLike = None,
+        level: Level | None = None,
+        inplace: bool = False,
+        errors: IgnoreRaise = "raise",
+    ) -> Self | None:
+        inplace = validate_bool_kwarg(inplace, "inplace")
+
+        if labels is not None:
+            if index is not None or columns is not None:
+                raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
+            axis_name = self._get_axis_name(axis)
+            axes = {axis_name: labels}
+        elif index is not None or columns is not None:
+            if axis == 1:
+                raise ValueError("Cannot specify both 'axis' and 'index'/'columns'")
+            axes = {"index": index}
+            if self.ndim == 2:
+                axes["columns"] = columns
+        else:
+            raise ValueError(
+                "Need to specify at least one of 'labels', 'index' or 'columns'"
+            )
+
+        obj = self
+
+        for axis, labels in axes.items():
+            if labels is not None:
+                obj = obj._drop_axis(labels, axis, level=level, errors=errors)
+
+        if inplace:
+            self._update_inplace(obj)
+            return None
+        else:
+            return obj
+
+    @final
+    def _drop_axis(
+        self,
+        labels,
+        axis,
+        level=None,
+        errors: IgnoreRaise = "raise",
+        only_slice: bool = False,
+    ) -> Self:
+        """
+        Drop labels from specified axis. Used in the ``drop`` method
+        internally.
+
+        Parameters
+        ----------
+        labels : single label or list-like
+        axis : int or axis name
+        level : int or level name, default None
+            For MultiIndex
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress error and existing labels are dropped.
+        only_slice : bool, default False
+            Whether indexing along columns should be view-only.
+
+        """
+        axis_num = self._get_axis_number(axis)
+        axis = self._get_axis(axis)
+
+        if axis.is_unique:
+            if level is not None:
+                if not isinstance(axis, MultiIndex):
+                    raise AssertionError("axis must be a MultiIndex")
+                new_axis = axis.drop(labels, level=level, errors=errors)
+            else:
+                new_axis = axis.drop(labels, errors=errors)
+            indexer = axis.get_indexer(new_axis)
+
+        # Case for non-unique axis
+        else:
+            is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
+            labels = ensure_object(common.index_labels_to_array(labels))
+            if level is not None:
+                if not isinstance(axis, MultiIndex):
+                    raise AssertionError("axis must be a MultiIndex")
+                mask = ~axis.get_level_values(level).isin(labels)
+
+                # GH 18561 MultiIndex.drop should raise if label is absent
+                if errors == "raise" and mask.all():
+                    raise KeyError(f"{labels} not found in axis")
+            elif (
+                isinstance(axis, MultiIndex)
+                and labels.dtype == "object"
+                and not is_tuple_labels
+            ):
+                # Set level to zero in case of MultiIndex and label is string,
+                #  because isin can't handle strings for MultiIndexes GH#36293
+                # In case of tuples we get dtype object but have to use isin GH#42771
+                mask = ~axis.get_level_values(0).isin(labels)
+            else:
+                mask = ~axis.isin(labels)
+                # Check if label doesn't exist along axis
+                labels_missing = (axis.get_indexer_for(labels) == -1).any()
+                if errors == "raise" and labels_missing:
+                    raise KeyError(f"{labels} not found in axis")
+
+            if isinstance(mask.dtype, ExtensionDtype):
+                # GH#45860
+                mask = mask.to_numpy(dtype=bool)
+
+            indexer = mask.nonzero()[0]
+            new_axis = axis.take(indexer)
+
+        bm_axis = self.ndim - axis_num - 1
+        new_mgr = self._mgr.reindex_indexer(
+            new_axis,
+            indexer,
+            axis=bm_axis,
+            allow_dups=True,
+            only_slice=only_slice,
+        )
+        result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+        if self.ndim == 1:
+            result._name = self.name
+
+        return result.__finalize__(self)
+
+    @final
+    def _update_inplace(self, result) -> None:
+        """
+        Replace self internals with result.
+
+        Parameters
+        ----------
+        result : same type as self
+        """
+        # NOTE: This does *not* call __finalize__ and that's an explicit
+        # decision that we may revisit in the future.
+        self._mgr = result._mgr
+
+    @final
+    def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self:
+        """
+        Prefix labels with string `prefix`.
+
+        For Series, the row labels are prefixed.
+        For DataFrame, the column labels are prefixed.
+
+        Parameters
+        ----------
+        prefix : str
+            The string to add before each label.
+        axis : {0 or 'index', 1 or 'columns', None}, default None
+            Axis to add prefix on
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        Series or DataFrame
+            New Series or DataFrame with updated labels.
+
+        See Also
+        --------
+        Series.add_suffix: Suffix row labels with string `suffix`.
+        DataFrame.add_suffix: Suffix column labels with string `suffix`.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s
+        0    1
+        1    2
+        2    3
+        3    4
+        dtype: int64
+
+        >>> s.add_prefix("item_")
+        item_0    1
+        item_1    2
+        item_2    3
+        item_3    4
+        dtype: int64
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]})
+        >>> df
+           A  B
+        0  1  3
+        1  2  4
+        2  3  5
+        3  4  6
+
+        >>> df.add_prefix("col_")
+             col_A  col_B
+        0       1       3
+        1       2       4
+        2       3       5
+        3       4       6
+        """
+        f = lambda x: f"{prefix}{x}"
+
+        axis_name = self._info_axis_name
+        if axis is not None:
+            axis_name = self._get_axis_name(axis)
+
+        mapper = {axis_name: f}
+
+        # error: Keywords must be strings
+        # error: No overload variant of "_rename" of "NDFrame" matches
+        # argument type "dict[Literal['index', 'columns'], Callable[[Any], str]]"
+        return self._rename(**mapper)  # type: ignore[call-overload, misc]
+
+    @final
+    def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self:
+        """
+        Suffix labels with string `suffix`.
+
+        For Series, the row labels are suffixed.
+        For DataFrame, the column labels are suffixed.
+
+        Parameters
+        ----------
+        suffix : str
+            The string to add after each label.
+        axis : {0 or 'index', 1 or 'columns', None}, default None
+            Axis to add suffix on
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        Series or DataFrame
+            New Series or DataFrame with updated labels.
+
+        See Also
+        --------
+        Series.add_prefix: Prefix row labels with string `prefix`.
+        DataFrame.add_prefix: Prefix column labels with string `prefix`.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s
+        0    1
+        1    2
+        2    3
+        3    4
+        dtype: int64
+
+        >>> s.add_suffix("_item")
+        0_item    1
+        1_item    2
+        2_item    3
+        3_item    4
+        dtype: int64
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]})
+        >>> df
+           A  B
+        0  1  3
+        1  2  4
+        2  3  5
+        3  4  6
+
+        >>> df.add_suffix("_col")
+             A_col  B_col
+        0       1       3
+        1       2       4
+        2       3       5
+        3       4       6
+        """
+        f = lambda x: f"{x}{suffix}"
+
+        axis_name = self._info_axis_name
+        if axis is not None:
+            axis_name = self._get_axis_name(axis)
+
+        mapper = {axis_name: f}
+        # error: Keywords must be strings
+        # error: No overload variant of "_rename" of "NDFrame" matches argument
+        # type "dict[Literal['index', 'columns'], Callable[[Any], str]]"
+        return self._rename(**mapper)  # type: ignore[call-overload, misc]
+
+    @overload
+    def sort_values(
+        self,
+        *,
+        axis: Axis = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> Self: ...
+
+    @overload
+    def sort_values(
+        self,
+        *,
+        axis: Axis = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> None: ...
+
+    @overload
+    def sort_values(
+        self,
+        *,
+        axis: Axis = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: bool = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> Self | None: ...
+
+    def sort_values(
+        self,
+        *,
+        axis: Axis = 0,
+        ascending: bool | Sequence[bool] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: NaPosition = "last",
+        ignore_index: bool = False,
+        key: ValueKeyFunc | None = None,
+    ) -> Self | None:
+        """
+        Sort by the values along either axis.
+
+        Parameters
+        ----------%(optional_by)s
+        axis : %(axes_single_arg)s, default 0
+             Axis to be sorted.
+        ascending : bool or list of bool, default True
+             Sort ascending vs. descending. Specify list for multiple sort
+             orders.  If this is a list of bools, must match the length of
+             the by.
+        inplace : bool, default False
+             If True, perform operation in-place.
+        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+             Choice of sorting algorithm. See also :func:`numpy.sort` for more
+             information. `mergesort` and `stable` are the only stable algorithms. For
+             DataFrames, this option is only applied when sorting on a single
+             column or label.
+        na_position : {'first', 'last'}, default 'last'
+             Puts NaNs at the beginning if `first`; `last` puts NaNs at the
+             end.
+        ignore_index : bool, default False
+             If True, the resulting axis will be labeled 0, 1, …, n - 1.
+        key : callable, optional
+            Apply the key function to the values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect a
+            ``Series`` and return a Series with the same shape as the input.
+            It will be applied to each column in `by` independently. The values in the
+            returned Series will be used as the keys for sorting.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with sorted values or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.sort_index : Sort a DataFrame by the index.
+        Series.sort_values : Similar method for a Series.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "col1": ["A", "A", "B", np.nan, "D", "C"],
+        ...         "col2": [2, 1, 9, 8, 7, 4],
+        ...         "col3": [0, 1, 9, 4, 2, 3],
+        ...         "col4": ["a", "B", "c", "D", "e", "F"],
+        ...     }
+        ... )
+        >>> df
+          col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+
+        Sort by col1
+
+        >>> df.sort_values(by=["col1"])
+          col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        5    C     4     3    F
+        4    D     7     2    e
+        3  NaN     8     4    D
+
+        Sort by multiple columns
+
+        >>> df.sort_values(by=["col1", "col2"])
+          col1  col2  col3 col4
+        1    A     1     1    B
+        0    A     2     0    a
+        2    B     9     9    c
+        5    C     4     3    F
+        4    D     7     2    e
+        3  NaN     8     4    D
+
+        Sort Descending
+
+        >>> df.sort_values(by="col1", ascending=False)
+          col1  col2  col3 col4
+        4    D     7     2    e
+        5    C     4     3    F
+        2    B     9     9    c
+        0    A     2     0    a
+        1    A     1     1    B
+        3  NaN     8     4    D
+
+        Putting NAs first
+
+        >>> df.sort_values(by="col1", ascending=False, na_position="first")
+          col1  col2  col3 col4
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+        2    B     9     9    c
+        0    A     2     0    a
+        1    A     1     1    B
+
+        Sorting with a key function
+
+        >>> df.sort_values(by="col4", key=lambda col: col.str.lower())
+           col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+
+        Natural sort with the key argument,
+        using the `natsort <https://github.com/SethMMorton/natsort>` package.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "hours": ["0hr", "128hr", "0hr", "64hr", "64hr", "128hr"],
+        ...         "mins": [
+        ...             "10mins",
+        ...             "40mins",
+        ...             "40mins",
+        ...             "40mins",
+        ...             "10mins",
+        ...             "10mins",
+        ...         ],
+        ...         "value": [10, 20, 30, 40, 50, 60],
+        ...     }
+        ... )
+        >>> df
+           hours    mins  value
+        0    0hr  10mins     10
+        1  128hr  40mins     20
+        2    0hr  40mins     30
+        3   64hr  40mins     40
+        4   64hr  10mins     50
+        5  128hr  10mins     60
+        >>> from natsort import natsort_keygen
+        >>> df.sort_values(
+        ...     by=["hours", "mins"],
+        ...     key=natsort_keygen(),
+        ... )
+           hours    mins  value
+        0    0hr  10mins     10
+        2    0hr  40mins     30
+        4   64hr  10mins     50
+        3   64hr  40mins     40
+        5  128hr  10mins     60
+        1  128hr  40mins     20
+        """
+        raise AbstractMethodError(self)
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> None: ...
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> Self: ...
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: bool = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> Self | None: ...
+
+    def sort_index(
+        self,
+        *,
+        axis: Axis = 0,
+        level: IndexLabel | None = None,
+        ascending: bool | Sequence[bool] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: NaPosition = "last",
+        sort_remaining: bool = True,
+        ignore_index: bool = False,
+        key: IndexKeyFunc | None = None,
+    ) -> Self | None:
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        axis = self._get_axis_number(axis)
+        ascending = validate_ascending(ascending)
+
+        target = self._get_axis(axis)
+
+        indexer = get_indexer_indexer(
+            target, level, ascending, kind, na_position, sort_remaining, key
+        )
+
+        if indexer is None:
+            if inplace:
+                result = self
+            else:
+                result = self.copy(deep=False)
+
+            if ignore_index:
+                if axis == 1:
+                    result.columns = default_index(len(self.columns))
+                else:
+                    result.index = default_index(len(self))
+            if inplace:
+                return None
+            else:
+                return result
+
+        baxis = self._get_block_manager_axis(axis)
+        new_data = self._mgr.take(indexer, axis=baxis, verify=False)
+
+        # reconstruct axis if needed
+        if not ignore_index:
+            new_axis = new_data.axes[baxis]._sort_levels_monotonic()
+        else:
+            new_axis = default_index(len(indexer))
+        new_data.set_axis(baxis, new_axis)
+
+        result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result.__finalize__(self, method="sort_index")
+
+    def reindex(
+        self,
+        labels=None,
+        *,
+        index=None,
+        columns=None,
+        axis: Axis | None = None,
+        method: ReindexMethod | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        level: Level | None = None,
+        fill_value: Scalar | None = np.nan,
+        limit: int | None = None,
+        tolerance=None,
+    ) -> Self:
+        """
+        Conform Series/DataFrame to new index with optional filling logic.
+
+        Places NA/NaN in locations having no value in the previous index. A new object
+        is produced unless the new index is equivalent to the current one and
+        ``copy=False``.
+
+        Parameters
+        ----------
+        method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
+            Method to use for filling holes in reindexed DataFrame.
+            Please note: this is only applicable to DataFrames/Series with a
+            monotonically increasing/decreasing index.
+
+            * None (default): don't fill gaps
+            * pad / ffill: Propagate last valid observation forward to next
+              valid.
+            * backfill / bfill: Use next valid observation to fill gap.
+            * nearest: Use nearest valid observations to fill gap.
+
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : scalar, default np.nan
+            Value to use for missing values. Defaults to NaN, but can be any
+            "compatible" value.
+        limit : int, default None
+            Maximum number of consecutive elements to forward or backward fill.
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations most
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. List-like includes list, tuple, array, Series, and must be
+            the same size as the index and its dtype must exactly match the
+            index's type.
+
+        Returns
+        -------
+        Series/DataFrame
+            Series/DataFrame with changed index.
+
+        See Also
+        --------
+        DataFrame.set_index : Set row labels.
+        DataFrame.reset_index : Remove row labels or move them to new columns.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+        Examples
+        --------
+        ``DataFrame.reindex`` supports two calling conventions
+
+        * ``(index=index_labels, columns=column_labels, ...)``
+        * ``(labels, axis={{'index', 'columns'}}, ...)``
+
+        We *highly* recommend using keyword arguments to clarify your
+        intent.
+
+        Create a DataFrame with some fictional data.
+
+        >>> index = ["Firefox", "Chrome", "Safari", "IE10", "Konqueror"]
+        >>> columns = ["http_status", "response_time"]
+        >>> df = pd.DataFrame(
+        ...     [[200, 0.04], [200, 0.02], [404, 0.07], [404, 0.08], [301, 1.0]],
+        ...     columns=columns,
+        ...     index=index,
+        ... )
+        >>> df
+                   http_status  response_time
+        Firefox            200           0.04
+        Chrome             200           0.02
+        Safari             404           0.07
+        IE10               404           0.08
+        Konqueror          301           1.00
+
+        Create a new index and reindex the DataFrame. By default
+        values in the new index that do not have corresponding
+        records in the DataFrame are assigned ``NaN``.
+
+        >>> new_index = ["Safari", "Iceweasel", "Comodo Dragon", "IE10", "Chrome"]
+        >>> df.reindex(new_index)
+                       http_status  response_time
+        Safari               404.0           0.07
+        Iceweasel              NaN            NaN
+        Comodo Dragon          NaN            NaN
+        IE10                 404.0           0.08
+        Chrome               200.0           0.02
+
+        We can fill in the missing values by passing a value to
+        the keyword ``fill_value``. Because the index is not monotonically
+        increasing or decreasing, we cannot use arguments to the keyword
+        ``method`` to fill the ``NaN`` values.
+
+        >>> df.reindex(new_index, fill_value=0)
+                       http_status  response_time
+        Safari                 404           0.07
+        Iceweasel                0           0.00
+        Comodo Dragon            0           0.00
+        IE10                   404           0.08
+        Chrome                 200           0.02
+
+        >>> df.reindex(new_index, fill_value="missing")
+                      http_status response_time
+        Safari                404          0.07
+        Iceweasel         missing       missing
+        Comodo Dragon     missing       missing
+        IE10                  404          0.08
+        Chrome                200          0.02
+
+        We can also reindex the columns.
+
+        >>> df.reindex(columns=["http_status", "user_agent"])
+                   http_status  user_agent
+        Firefox            200         NaN
+        Chrome             200         NaN
+        Safari             404         NaN
+        IE10               404         NaN
+        Konqueror          301         NaN
+
+        Or we can use "axis-style" keyword arguments
+
+        >>> df.reindex(["http_status", "user_agent"], axis="columns")
+                   http_status  user_agent
+        Firefox            200         NaN
+        Chrome             200         NaN
+        Safari             404         NaN
+        IE10               404         NaN
+        Konqueror          301         NaN
+
+        To further illustrate the filling functionality in
+        ``reindex``, we will create a DataFrame with a
+        monotonically increasing index (for example, a sequence
+        of dates).
+
+        >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D")
+        >>> df2 = pd.DataFrame(
+        ...     {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index
+        ... )
+        >>> df2
+                    prices
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+
+        Suppose we decide to expand the DataFrame to cover a wider
+        date range.
+
+        >>> date_index2 = pd.date_range("12/29/2009", periods=10, freq="D")
+        >>> df2.reindex(date_index2)
+                    prices
+        2009-12-29     NaN
+        2009-12-30     NaN
+        2009-12-31     NaN
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+        2010-01-07     NaN
+
+        The index entries that did not have a value in the original data frame
+        (for example, '2009-12-29') are by default filled with ``NaN``.
+        If desired, we can fill in the missing values using one of several
+        options.
+
+        For example, to back-propagate the last valid value to fill the ``NaN``
+        values, pass ``bfill`` as an argument to the ``method`` keyword.
+
+        >>> df2.reindex(date_index2, method="bfill")
+                    prices
+        2009-12-29   100.0
+        2009-12-30   100.0
+        2009-12-31   100.0
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+        2010-01-07     NaN
+
+        Please note that the ``NaN`` value present in the original DataFrame
+        (at index value 2010-01-03) will not be filled by any of the
+        value propagation schemes. This is because filling while reindexing
+        does not look at DataFrame values, but only compares the original and
+        desired indexes. If you do want to fill in the ``NaN`` values present
+        in the original DataFrame, use the ``fillna()`` method.
+
+        See the :ref:`user guide <basics.reindexing>` for more.
+        """
+        # TODO: Decide if we care about having different examples for different
+        # kinds
+
+        # Automatically detect matching level when reindexing from Index to MultiIndex.
+        # This prevents values from being incorrectly set to NaN when the source index
+        # name matches a index name in the target MultiIndex
+        if (
+            level is None
+            and index is not None
+            and isinstance(index, MultiIndex)
+            and not isinstance(self.index, MultiIndex)
+            and self.index.name in index.names
+        ):
+            level = self.index.name
+        self._check_copy_deprecation(copy)
+
+        if index is not None and columns is not None and labels is not None:
+            raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.")
+        elif index is not None or columns is not None:
+            if axis is not None:
+                raise TypeError(
+                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
+                )
+            if labels is not None:
+                if index is not None:
+                    columns = labels
+                else:
+                    index = labels
+        elif axis and self._get_axis_number(axis) == 1:
+            columns = labels
+        else:
+            index = labels
+        axes: dict[Literal["index", "columns"], Any] = {
+            "index": index,
+            "columns": columns,
+        }
+        method = clean_reindex_fill_method(method)
+
+        # if all axes that are requested to reindex are equal, then only copy
+        # if indicated must have index names equal here as well as values
+        if all(
+            self._get_axis(axis_name).identical(ax)
+            for axis_name, ax in axes.items()
+            if ax is not None
+        ):
+            return self.copy(deep=False)
+
+        # check if we are a multi reindex
+        if self._needs_reindex_multi(axes, method, level):
+            return self._reindex_multi(axes, fill_value)
+
+        # perform the reindex on the axes
+        return self._reindex_axes(
+            axes, level, limit, tolerance, method, fill_value
+        ).__finalize__(self, method="reindex")
+
+    @final
+    def _reindex_axes(
+        self,
+        axes,
+        level: Level | None,
+        limit: int | None,
+        tolerance,
+        method,
+        fill_value: Scalar | None,
+    ) -> Self:
+        """Perform the reindex for all the axes."""
+        obj = self
+        for a in self._AXIS_ORDERS:
+            labels = axes[a]
+            if labels is None:
+                continue
+
+            ax = self._get_axis(a)
+            new_index, indexer = ax.reindex(
+                labels, level=level, limit=limit, tolerance=tolerance, method=method
+            )
+
+            axis = self._get_axis_number(a)
+            obj = obj._reindex_with_indexers(
+                {axis: [new_index, indexer]},
+                fill_value=fill_value,
+                allow_dups=False,
+            )
+
+        return obj
+
+    def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool:
+        """Check if we do need a multi reindex."""
+        return (
+            (common.count_not_none(*axes.values()) == self._AXIS_LEN)
+            and method is None
+            and level is None
+            # reindex_multi calls self.values, so we only want to go
+            #  down that path when doing so is cheap.
+            and self._can_fast_transpose
+        )
+
+    def _reindex_multi(self, axes, fill_value):
+        raise AbstractMethodError(self)
+
+    @final
+    def _reindex_with_indexers(
+        self,
+        reindexers,
+        fill_value=None,
+        allow_dups: bool = False,
+    ) -> Self:
+        """allow_dups indicates an internal call here"""
+        # reindex doing multiple operations on different axes if indicated
+        new_data = self._mgr
+        for axis in sorted(reindexers.keys()):
+            index, indexer = reindexers[axis]
+            baxis = self._get_block_manager_axis(axis)
+
+            if index is None:
+                continue
+
+            index = ensure_index(index)
+            if indexer is not None:
+                indexer = ensure_platform_int(indexer)
+
+            # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
+            new_data = new_data.reindex_indexer(
+                index,
+                indexer,
+                axis=baxis,
+                fill_value=fill_value,
+                allow_dups=allow_dups,
+            )
+
+        if new_data is self._mgr:
+            new_data = new_data.copy(deep=False)
+
+        return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
+            self
+        )
+
+    def filter(
+        self,
+        items=None,
+        like: str | None = None,
+        regex: str | None = None,
+        axis: Axis | None = None,
+    ) -> Self:
+        """
+        Subset the DataFrame or Series according to the specified index labels.
+
+        For DataFrame, filter rows or columns depending on ``axis`` argument.
+        Note that this routine does not filter based on content.
+        The filter is applied to the labels of the index.
+
+        Parameters
+        ----------
+        items : list-like
+            Keep labels from axis which are in items.
+        like : str
+            Keep labels from axis for which "like in label == True".
+        regex : str (regular expression)
+            Keep labels from axis for which re.search(regex, label) == True.
+        axis : {0 or 'index', 1 or 'columns', None}, default None
+            The axis to filter on, expressed either as an index (int)
+            or axis name (str). By default this is the info axis, 'columns' for
+            ``DataFrame``. For ``Series`` this parameter is unused and defaults to
+            ``None``.
+
+        Returns
+        -------
+        Same type as caller
+            The filtered subset of the DataFrame or Series.
+
+        See Also
+        --------
+        DataFrame.loc : Access a group of rows and columns
+            by label(s) or a boolean array.
+
+        Notes
+        -----
+        The ``items``, ``like``, and ``regex`` parameters are
+        enforced to be mutually exclusive.
+
+        ``axis`` defaults to the info axis that is used when indexing
+        with ``[]``.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     np.array(([1, 2, 3], [4, 5, 6])),
+        ...     index=["mouse", "rabbit"],
+        ...     columns=["one", "two", "three"],
+        ... )
+        >>> df
+                one  two  three
+        mouse     1    2      3
+        rabbit    4    5      6
+
+        >>> # select columns by name
+        >>> df.filter(items=["one", "three"])
+                 one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select columns by regular expression
+        >>> df.filter(regex="e$", axis=1)
+                 one  three
+        mouse     1      3
+        rabbit    4      6
+
+        >>> # select rows containing 'bbi'
+        >>> df.filter(like="bbi", axis=0)
+                 one  two  three
+        rabbit    4    5      6
+        """
+        nkw = common.count_not_none(items, like, regex)
+        if nkw > 1:
+            raise TypeError(
+                "Keyword arguments `items`, `like`, or `regex` are mutually exclusive"
+            )
+
+        if axis is None:
+            axis = self._info_axis_name
+        labels = self._get_axis(axis)
+
+        if items is not None:
+            name = self._get_axis_name(axis)
+            items = Index(items).intersection(labels)
+            if len(items) == 0:
+                # Keep the dtype of labels when we are empty
+                items = items.astype(labels.dtype)
+            # error: Keywords must be strings
+            return self.reindex(**{name: items})  # type: ignore[misc]
+        elif like:
+
+            def f(x) -> bool:
+                assert like is not None  # needed for mypy
+                return like in ensure_str(x)
+
+            values = labels.map(f)
+            return self.loc(axis=axis)[values]
+        elif regex:
+
+            def f(x) -> bool:
+                return matcher.search(ensure_str(x)) is not None
+
+            matcher = re.compile(regex)
+            values = labels.map(f)
+            return self.loc(axis=axis)[values]
+        else:
+            raise TypeError("Must pass either `items`, `like`, or `regex`")
+
+    @final
+    def head(self, n: int = 5) -> Self:
+        """
+        Return the first `n` rows.
+
+        This function exhibits the same behavior as ``df[:n]``, returning the
+        first ``n`` rows based on position. It is useful for quickly checking
+        if your object has the right type of data in it.
+
+        When ``n`` is positive, it returns the first ``n`` rows. For ``n`` equal to 0,
+        it returns an empty object. When ``n`` is negative, it returns
+        all rows except the last ``|n|`` rows, mirroring the behavior of ``df[:n]``.
+
+        If ``n`` is larger than the number of rows, this function returns all rows.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Number of rows to select.
+
+        Returns
+        -------
+        same type as caller
+            The first `n` rows of the caller object.
+
+        See Also
+        --------
+        DataFrame.tail: Returns the last `n` rows.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "animal": [
+        ...             "alligator",
+        ...             "bee",
+        ...             "falcon",
+        ...             "lion",
+        ...             "monkey",
+        ...             "parrot",
+        ...             "shark",
+        ...             "whale",
+        ...             "zebra",
+        ...         ]
+        ...     }
+        ... )
+        >>> df
+              animal
+        0  alligator
+        1        bee
+        2     falcon
+        3       lion
+        4     monkey
+        5     parrot
+        6      shark
+        7      whale
+        8      zebra
+
+        Viewing the first 5 lines
+
+        >>> df.head()
+              animal
+        0  alligator
+        1        bee
+        2     falcon
+        3       lion
+        4     monkey
+
+        Viewing the first `n` lines (three in this case)
+
+        >>> df.head(3)
+              animal
+        0  alligator
+        1        bee
+        2     falcon
+
+        For negative values of `n`
+
+        >>> df.head(-3)
+              animal
+        0  alligator
+        1        bee
+        2     falcon
+        3       lion
+        4     monkey
+        5     parrot
+        """
+        return self.iloc[:n].copy()
+
+    @final
+    def tail(self, n: int = 5) -> Self:
+        """
+        Return the last `n` rows.
+
+        This function returns last `n` rows from the object based on
+        position. It is useful for quickly verifying data, for example,
+        after sorting or appending rows.
+
+        For negative values of `n`, this function returns all rows except
+        the first `|n|` rows, equivalent to ``df[|n|:]``.
+
+        If ``n`` is larger than the number of rows, this function returns all rows.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Number of rows to select.
+
+        Returns
+        -------
+        type of caller
+            The last `n` rows of the caller object.
+
+        See Also
+        --------
+        DataFrame.head : The first `n` rows of the caller object.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "animal": [
+        ...             "alligator",
+        ...             "bee",
+        ...             "falcon",
+        ...             "lion",
+        ...             "monkey",
+        ...             "parrot",
+        ...             "shark",
+        ...             "whale",
+        ...             "zebra",
+        ...         ]
+        ...     }
+        ... )
+        >>> df
+              animal
+        0  alligator
+        1        bee
+        2     falcon
+        3       lion
+        4     monkey
+        5     parrot
+        6      shark
+        7      whale
+        8      zebra
+
+        Viewing the last 5 lines
+
+        >>> df.tail()
+           animal
+        4  monkey
+        5  parrot
+        6   shark
+        7   whale
+        8   zebra
+
+        Viewing the last `n` lines (three in this case)
+
+        >>> df.tail(3)
+          animal
+        6  shark
+        7  whale
+        8  zebra
+
+        For negative values of `n`
+
+        >>> df.tail(-3)
+           animal
+        3    lion
+        4  monkey
+        5  parrot
+        6   shark
+        7   whale
+        8   zebra
+        """
+        if n == 0:
+            return self.iloc[0:0].copy()
+        return self.iloc[-n:].copy()
+
+    @final
+    def sample(
+        self,
+        n: int | None = None,
+        frac: float | None = None,
+        replace: bool = False,
+        weights=None,
+        random_state: RandomState | None = None,
+        axis: Axis | None = None,
+        ignore_index: bool = False,
+    ) -> Self:
+        """
+        Return a random sample of items from an axis of object.
+
+        You can use `random_state` for reproducibility.
+
+        Parameters
+        ----------
+        n : int, optional
+            Number of items from axis to return. Cannot be used with `frac`.
+            Default = 1 if `frac` = None.
+        frac : float, optional
+            Fraction of axis items to return. Cannot be used with `n`.
+        replace : bool, default False
+            Allow or disallow sampling of the same row more than once.
+        weights : str or ndarray-like, optional
+            Default ``None`` results in equal probability weighting.
+            If passed a Series, will align with target object on index. Index
+            values in weights not found in sampled object will be ignored and
+            index values in sampled object not in weights will be assigned
+            weights of zero.
+            If called on a DataFrame, will accept the name of a column
+            when axis = 0.
+            Unless weights are a Series, weights must be same length as axis
+            being sampled.
+            If weights do not sum to 1, they will be normalized to sum to 1.
+            Missing values in the weights column will be treated as zero.
+            Infinite values not allowed.
+            When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``
+            in order to avoid biased results. See the Notes below for more details.
+        random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
+            If int, array-like, or BitGenerator, seed for random number generator.
+            If np.random.RandomState or np.random.Generator, use as given.
+            Default ``None`` results in sampling with the current state of np.random.
+        axis : {0 or 'index', 1 or 'columns', None}, default None
+            Axis to sample. Accepts axis number or name. Default is stat axis
+            for given data type. For `Series` this parameter is unused and defaults to `None`.
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        Series or DataFrame
+            A new object of same type as caller containing `n` items randomly
+            sampled from the caller object.
+
+        See Also
+        --------
+        DataFrameGroupBy.sample: Generates random samples from each group of a
+            DataFrame object.
+        SeriesGroupBy.sample: Generates random samples from each group of a
+            Series object.
+        numpy.random.choice: Generates a random sample from a given 1-D numpy
+            array.
+
+        Notes
+        -----
+        If `frac` > 1, `replacement` should be set to `True`.
+
+        When replace = False will not allow ``(n * max(weights) / sum(weights)) > 1``,
+        since that would cause results to be biased. E.g. sampling 2 items without replacement
+        with weights [100, 1, 1] would yield two last items in 1/2 of cases, instead of 1/102.
+        This is similar to specifying `n=4` without replacement on a Series with 3 elements.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "num_legs": [2, 4, 8, 0],
+        ...         "num_wings": [2, 0, 0, 0],
+        ...         "num_specimen_seen": [10, 2, 1, 8],
+        ...     },
+        ...     index=["falcon", "dog", "spider", "fish"],
+        ... )
+        >>> df
+                num_legs  num_wings  num_specimen_seen
+        falcon         2          2                 10
+        dog            4          0                  2
+        spider         8          0                  1
+        fish           0          0                  8
+
+        Extract 3 random elements from the ``Series`` ``df['num_legs']``:
+        Note that we use `random_state` to ensure the reproducibility of
+        the examples.
+
+        >>> df["num_legs"].sample(n=3, random_state=1)
+        fish      0
+        spider    8
+        falcon    2
+        Name: num_legs, dtype: int64
+
+        A random 50% sample of the ``DataFrame`` with replacement:
+
+        >>> df.sample(frac=0.5, replace=True, random_state=1)
+              num_legs  num_wings  num_specimen_seen
+        dog          4          0                  2
+        fish         0          0                  8
+
+        An upsample sample of the ``DataFrame`` with replacement:
+        Note that `replace` parameter has to be `True` for `frac` parameter > 1.
+
+        >>> df.sample(frac=2, replace=True, random_state=1)
+                num_legs  num_wings  num_specimen_seen
+        dog            4          0                  2
+        fish           0          0                  8
+        falcon         2          2                 10
+        falcon         2          2                 10
+        fish           0          0                  8
+        dog            4          0                  2
+        fish           0          0                  8
+        dog            4          0                  2
+
+        Using a DataFrame column as weights. Rows with larger value in the
+        `num_specimen_seen` column are more likely to be sampled.
+
+        >>> df.sample(n=2, weights="num_specimen_seen", random_state=1)
+                num_legs  num_wings  num_specimen_seen
+        falcon         2          2                 10
+        fish           0          0                  8
+        """  # noqa: E501
+        if axis is None:
+            axis = 0
+
+        axis = self._get_axis_number(axis)
+        obj_len = self.shape[axis]
+
+        # Process random_state argument
+        rs = common.random_state(random_state)
+
+        size = sample.process_sampling_size(n, frac, replace)
+        if size is None:
+            assert frac is not None
+            size = round(frac * obj_len)
+
+        if weights is not None:
+            weights = sample.preprocess_weights(self, weights, axis)
+
+        sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
+        result = self.take(sampled_indices, axis=axis)
+
+        if ignore_index:
+            result.index = default_index(len(result))
+
+        return result
+
+    @overload
+    def pipe(
+        self,
+        func: Callable[Concatenate[Self, P], T],
+        *args: P.args,
+        **kwargs: P.kwargs,
+    ) -> T: ...
+
+    @overload
+    def pipe(
+        self,
+        func: tuple[Callable[..., T], str],
+        *args: Any,
+        **kwargs: Any,
+    ) -> T: ...
+
+    @final
+    def pipe(
+        self,
+        func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str],
+        *args: Any,
+        **kwargs: Any,
+    ) -> T:
+        r"""
+        Apply chainable functions that expect Series or DataFrames.
+
+        Parameters
+        ----------
+        func : function
+            Function to apply to the Series/DataFrame.
+            ``args``, and ``kwargs`` are passed into ``func``.
+            Alternatively a ``(callable, data_keyword)`` tuple where
+            ``data_keyword`` is a string indicating the keyword of
+            ``callable`` that expects the Series/DataFrame.
+        *args : iterable, optional
+            Positional arguments passed into ``func``.
+        **kwargs : mapping, optional
+            A dictionary of keyword arguments passed into ``func``.
+
+        Returns
+        -------
+        The return type of ``func``.
+            The result of applying ``func`` to the Series or DataFrame.
+
+        See Also
+        --------
+        DataFrame.apply : Apply a function along input axis of DataFrame.
+        DataFrame.map : Apply a function elementwise on a whole DataFrame.
+        Series.map : Apply a mapping correspondence on a
+            :class:`~pandas.Series`.
+
+        Notes
+        -----
+        Use ``.pipe`` when chaining together functions that expect
+        Series, DataFrames or GroupBy objects.
+
+        Examples
+        --------
+        Constructing an income DataFrame from a dictionary.
+
+        >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]]
+        >>> df = pd.DataFrame(data, columns=["Salary", "Others"])
+        >>> df
+           Salary  Others
+        0    8000  1000.0
+        1    9500     NaN
+        2    5000  2000.0
+
+        Functions that perform tax reductions on an income DataFrame.
+
+        >>> def subtract_federal_tax(df):
+        ...     return df * 0.9
+        >>> def subtract_state_tax(df, rate):
+        ...     return df * (1 - rate)
+        >>> def subtract_national_insurance(df, rate, rate_increase):
+        ...     new_rate = rate + rate_increase
+        ...     return df * (1 - new_rate)
+
+        Instead of writing
+
+        >>> subtract_national_insurance(
+        ...     subtract_state_tax(subtract_federal_tax(df), rate=0.12),
+        ...     rate=0.05,
+        ...     rate_increase=0.02,
+        ... )  # doctest: +SKIP
+
+        You can write
+
+        >>> (
+        ...     df.pipe(subtract_federal_tax)
+        ...     .pipe(subtract_state_tax, rate=0.12)
+        ...     .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02)
+        ... )
+            Salary   Others
+        0  5892.48   736.56
+        1  6997.32      NaN
+        2  3682.80  1473.12
+
+        If you have a function that takes the data as (say) the second
+        argument, pass a tuple indicating which keyword expects the
+        data. For example, suppose ``national_insurance`` takes its data as ``df``
+        in the second argument:
+
+        >>> def subtract_national_insurance(rate, df, rate_increase):
+        ...     new_rate = rate + rate_increase
+        ...     return df * (1 - new_rate)
+        >>> (
+        ...     df.pipe(subtract_federal_tax)
+        ...     .pipe(subtract_state_tax, rate=0.12)
+        ...     .pipe(
+        ...         (subtract_national_insurance, "df"), rate=0.05, rate_increase=0.02
+        ...     )
+        ... )
+            Salary   Others
+        0  5892.48   736.56
+        1  6997.32      NaN
+        2  3682.80  1473.12
+        """
+        return common.pipe(self.copy(deep=False), func, *args, **kwargs)
+
+    # ----------------------------------------------------------------------
+    # Attribute access
+
+    @final
+    def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
+        """
+        Propagate metadata from other to self.
+
+        This is the default implementation. Subclasses may override this method to
+        implement their own metadata handling.
+
+        Parameters
+        ----------
+        other : the object from which to get the attributes that we are going
+            to propagate. If ``other`` has an ``input_objs`` attribute, then
+            this attribute must contain an iterable of objects, each with an
+            ``attrs`` attribute.
+        method : str, optional
+            A passed method name providing context on where ``__finalize__``
+            was called.
+
+            .. warning::
+
+               The value passed as `method` are not currently considered
+               stable across pandas releases.
+
+        Notes
+        -----
+        In case ``other`` has an ``input_objs`` attribute, this method only
+        propagates its metadata if each object in ``input_objs`` has the exact
+        same metadata as the others.
+        """
+        if isinstance(other, NDFrame):
+            if other.attrs:
+                # We want attrs propagation to have minimal performance
+                # impact if attrs are not used; i.e. attrs is an empty dict.
+                # One could make the deepcopy unconditionally, but a deepcopy
+                # of an empty dict is 50x more expensive than the empty check.
+                self.attrs = deepcopy(other.attrs)
+            self.flags.allows_duplicate_labels = (
+                self.flags.allows_duplicate_labels
+                and other.flags.allows_duplicate_labels
+            )
+            # For subclasses using _metadata.
+            for name in set(self._metadata) & set(other._metadata):
+                assert isinstance(name, str)
+                object.__setattr__(self, name, getattr(other, name, None))
+
+        elif hasattr(other, "input_objs"):
+            objs = other.input_objs
+            # propagate attrs only if all inputs have the same attrs
+            if all(bool(obj.attrs) for obj in objs):
+                # all inputs have non-empty attrs
+                attrs = objs[0].attrs
+                have_same_attrs = all(obj.attrs == attrs for obj in objs[1:])
+                if have_same_attrs:
+                    self.attrs = deepcopy(attrs)
+
+            allows_duplicate_labels = all(x.flags.allows_duplicate_labels for x in objs)
+            self.flags.allows_duplicate_labels = allows_duplicate_labels
+
+        return self
+
+    @final
+    def __getattr__(self, name: str):
+        """
+        After regular attribute access, try looking up the name
+        This allows simpler access to columns for interactive use.
+        """
+        # Note: obj.x will always call obj.__getattribute__('x') prior to
+        # calling obj.__getattr__('x').
+        if (
+            name not in self._internal_names_set
+            and name not in self._metadata
+            and name not in self._accessors
+            and self._info_axis._can_hold_identifiers_and_holds_name(name)
+        ):
+            return self[name]
+        return object.__getattribute__(self, name)
+
+    @final
+    def __setattr__(self, name: str, value) -> None:
+        """
+        After regular attribute access, try setting the name
+        This allows simpler access to columns for interactive use.
+        """
+        # first try regular attribute access via __getattribute__, so that
+        # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
+        # the same attribute.
+
+        try:
+            object.__getattribute__(self, name)
+            return object.__setattr__(self, name, value)
+        except AttributeError:
+            pass
+
+        # if this fails, go on to more involved attribute setting
+        # (note that this matches __getattr__, above).
+        if name in self._internal_names_set:
+            object.__setattr__(self, name, value)
+        elif name in self._metadata:
+            object.__setattr__(self, name, value)
+        else:
+            try:
+                existing = getattr(self, name)
+                if isinstance(existing, Index):
+                    object.__setattr__(self, name, value)
+                elif name in self._info_axis:
+                    self[name] = value
+                else:
+                    object.__setattr__(self, name, value)
+            except (AttributeError, TypeError):
+                if isinstance(self, ABCDataFrame) and (is_list_like(value)):
+                    warnings.warn(
+                        "Pandas doesn't allow columns to be "
+                        "created via a new attribute name - see "
+                        "https://pandas.pydata.org/pandas-docs/"
+                        "stable/indexing.html#attribute-access",
+                        stacklevel=find_stack_level(),
+                    )
+                object.__setattr__(self, name, value)
+
+    @final
+    def _dir_additions(self) -> set[str]:
+        """
+        add the string-like attributes from the info_axis.
+        If info_axis is a MultiIndex, its first level values are used.
+        """
+        additions = super()._dir_additions()
+        if self._info_axis._can_hold_strings:
+            additions.update(self._info_axis._dir_additions_for_owner)
+        return additions
+
+    # ----------------------------------------------------------------------
+    # Consolidation of internals
+
+    @final
+    def _consolidate_inplace(self) -> None:
+        """Consolidate data in place and return None"""
+
+        self._mgr = self._mgr.consolidate()
+
+    @final
+    def _consolidate(self):
+        """
+        Compute NDFrame with "consolidated" internals (data of each dtype
+        grouped together in a single ndarray).
+
+        Returns
+        -------
+        consolidated : same type as caller
+        """
+        cons_data = self._mgr.consolidate()
+        return self._constructor_from_mgr(cons_data, axes=cons_data.axes).__finalize__(
+            self
+        )
+
+    @final
+    @property
+    def _is_mixed_type(self) -> bool:
+        if self._mgr.is_single_block:
+            # Includes all Series cases
+            return False
+
+        if self._mgr.any_extension_types:
+            # Even if they have the same dtype, we can't consolidate them,
+            #  so we pretend this is "mixed'"
+            return True
+
+        return self.dtypes.nunique() > 1
+
+    @final
+    def _get_numeric_data(self) -> Self:
+        new_mgr = self._mgr.get_numeric_data()
+        return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
+
+    @final
+    def _get_bool_data(self):
+        new_mgr = self._mgr.get_bool_data()
+        return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
+
+    # ----------------------------------------------------------------------
+    # Internal Interface Methods
+
+    @property
+    def values(self):
+        raise AbstractMethodError(self)
+
+    @property
+    def _values(self) -> ArrayLike:
+        """internal implementation"""
+        raise AbstractMethodError(self)
+
+    @property
+    def dtypes(self):
+        """
+        Return the dtypes in the DataFrame.
+
+        This returns a Series with the data type of each column.
+        The result's index is the original DataFrame's columns. Columns
+        with mixed types are stored with the ``object`` dtype. See
+        :ref:`the User Guide <basics.dtypes>` for more.
+
+        Returns
+        -------
+        pandas.Series
+            The data type of each column.
+
+        See Also
+        --------
+        Series.dtypes : Return the dtype object of the underlying data.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "float": [1.0],
+        ...         "int": [1],
+        ...         "datetime": [pd.Timestamp("20180310")],
+        ...         "string": ["foo"],
+        ...     }
+        ... )
+        >>> df.dtypes
+        float              float64
+        int                  int64
+        datetime    datetime64[us]
+        string              str
+        dtype: object
+        """
+        data = self._mgr.get_dtypes()
+        return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
+
+    @final
+    def astype(
+        self,
+        dtype,
+        copy: bool | lib.NoDefault = lib.no_default,
+        errors: IgnoreRaise = "raise",
+    ) -> Self:
+        """
+        Cast a pandas object to a specified dtype ``dtype``.
+
+        This method allows the conversion of the data types of pandas objects,
+        including DataFrames and Series, to the specified dtype. It supports casting
+        entire objects to a single data type or applying different data types to
+        individual columns using a mapping.
+
+        Parameters
+        ----------
+        dtype : str, data type, Series or Mapping of column name -> data type
+            Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
+            cast entire pandas object to the same type. Alternatively, use a
+            mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
+            a numpy.dtype or Python type to cast one or more of the DataFrame's
+            columns to column-specific types.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        errors : {'raise', 'ignore'}, default 'raise'
+            Control raising of exceptions on invalid data for provided dtype.
+
+            - ``raise`` : allow exceptions to be raised
+            - ``ignore`` : suppress exceptions. On error return original object.
+
+        Returns
+        -------
+        same type as caller
+            The pandas object casted to the specified ``dtype``.
+
+        See Also
+        --------
+        to_datetime : Convert argument to datetime.
+        to_timedelta : Convert argument to timedelta.
+        to_numeric : Convert argument to a numeric type.
+        numpy.ndarray.astype : Cast a numpy array to a specified type.
+
+        Notes
+        -----
+        .. versionchanged:: 2.0.0
+
+            Using ``astype`` to convert from timezone-naive dtype to
+            timezone-aware dtype will raise an exception.
+            Use :meth:`Series.dt.tz_localize` instead.
+
+        Examples
+        --------
+        Create a DataFrame:
+
+        >>> d = {"col1": [1, 2], "col2": [3, 4]}
+        >>> df = pd.DataFrame(data=d)
+        >>> df.dtypes
+        col1    int64
+        col2    int64
+        dtype: object
+
+        Cast all columns to int32:
+
+        >>> df.astype("int32").dtypes
+        col1    int32
+        col2    int32
+        dtype: object
+
+        Cast col1 to int32 using a dictionary:
+
+        >>> df.astype({"col1": "int32"}).dtypes
+        col1    int32
+        col2    int64
+        dtype: object
+
+        Create a series:
+
+        >>> ser = pd.Series([1, 2], dtype="int32")
+        >>> ser
+        0    1
+        1    2
+        dtype: int32
+        >>> ser.astype("int64")
+        0    1
+        1    2
+        dtype: int64
+
+        Convert to categorical type:
+
+        >>> ser.astype("category")
+        0    1
+        1    2
+        dtype: category
+        Categories (2, int32): [1, 2]
+
+        Convert to ordered categorical type with custom ordering:
+
+        >>> from pandas.api.types import CategoricalDtype
+        >>> cat_dtype = CategoricalDtype(categories=[2, 1], ordered=True)
+        >>> ser.astype(cat_dtype)
+        0    1
+        1    2
+        dtype: category
+        Categories (2, int64): [2 < 1]
+
+        Create a series of dates:
+
+        >>> ser_date = pd.Series(pd.date_range("20200101", periods=3))
+        >>> ser_date
+        0   2020-01-01
+        1   2020-01-02
+        2   2020-01-03
+        dtype: datetime64[us]
+        """
+        self._check_copy_deprecation(copy)
+        if is_dict_like(dtype):
+            if self.ndim == 1:  # i.e. Series
+                if len(dtype) > 1 or self.name not in dtype:
+                    raise KeyError(
+                        "Only the Series name can be used for "
+                        "the key in Series dtype mappings."
+                    )
+                new_type = dtype[self.name]
+                return self.astype(new_type, errors=errors)
+
+            # GH#44417 cast to Series so we can use .iat below, which will be
+            #  robust in case we
+            from pandas import Series
+
+            dtype_ser = Series(dtype, dtype=object)
+
+            for col_name in dtype_ser.index:
+                if col_name not in self:
+                    raise KeyError(
+                        "Only a column name can be used for the "
+                        "key in a dtype mappings argument. "
+                        f"'{col_name}' not found in columns."
+                    )
+
+            dtype_ser = dtype_ser.reindex(self.columns, fill_value=None)
+
+            results = []
+            for i, (col_name, col) in enumerate(self.items()):
+                cdt = dtype_ser.iat[i]
+                if isna(cdt):
+                    res_col = col.copy(deep=False)
+                else:
+                    try:
+                        res_col = col.astype(dtype=cdt, errors=errors)
+                    except ValueError as ex:
+                        ex.args = (
+                            f"{ex}: Error while type casting for column '{col_name}'",
+                        )
+                        raise
+                results.append(res_col)
+
+        elif is_extension_array_dtype(dtype) and self.ndim > 1:
+            # TODO(EA2D): special case not needed with 2D EAs
+            dtype = pandas_dtype(dtype)
+            if isinstance(dtype, ExtensionDtype) and all(
+                block.values.dtype == dtype for block in self._mgr.blocks
+            ):
+                return self.copy(deep=False)
+            # GH 18099/22869: columnwise conversion to extension dtype
+            # GH 24704: self.items handles duplicate column names
+            results = [ser.astype(dtype, errors=errors) for _, ser in self.items()]
+
+        else:
+            # else, only a single dtype is given
+            new_data = self._mgr.astype(dtype=dtype, errors=errors)
+            res = self._constructor_from_mgr(new_data, axes=new_data.axes)
+            return res.__finalize__(self, method="astype")
+
+        # GH 33113: handle empty frame or series
+        if not results:
+            return self.copy(deep=False)
+
+        # GH 19920: retain column metadata after concat
+        result = concat(results, axis=1)
+        # GH#40810 retain subclass
+        # error: Incompatible types in assignment
+        # (expression has type "Self", variable has type "DataFrame")
+        result = self._constructor(result)  # type: ignore[assignment]
+        result.columns = self.columns
+        result = result.__finalize__(self, method="astype")
+        # https://github.com/python/mypy/issues/8354
+        return cast(Self, result)
+
+    @final
+    def copy(self, deep: bool = True) -> Self:
+        """
+        Make a copy of this object's indices and data.
+
+        When ``deep=True`` (default), a new object will be created with a
+        copy of the calling object's data and indices. Modifications to
+        the data or indices of the copy will not be reflected in the
+        original object (see notes below).
+
+        When ``deep=False``, a new object will be created without copying
+        the calling object's data or index (only references to the data
+        and index are copied). With Copy-on-Write, changes to the original
+        will *not* be reflected in the shallow copy (and vice versa). The
+        shallow copy uses a lazy (deferred) copy mechanism that copies the
+        data only when any changes to the original or shallow copy are made,
+        ensuring memory efficiency while maintaining data integrity.
+
+        .. note::
+            In pandas versions prior to 3.0, the default behavior without
+            Copy-on-Write was different: changes to the original *were* reflected
+            in the shallow copy (and vice versa). See the :ref:`Copy-on-Write
+            user guide <copy_on_write>` for more information.
+
+        Parameters
+        ----------
+        deep : bool, default True
+            Make a deep copy, including a copy of the data and the indices.
+            With ``deep=False`` neither the indices nor the data are copied.
+
+        Returns
+        -------
+        Series or DataFrame
+            Object type matches caller.
+
+        See Also
+        --------
+        copy.copy : Return a shallow copy of an object.
+        copy.deepcopy : Return a deep copy of an object.
+
+        Notes
+        -----
+        When ``deep=True``, data is copied but actual Python objects
+        will not be copied recursively, only the reference to the object.
+        This is in contrast to `copy.deepcopy` in the Standard Library,
+        which recursively copies object data (see examples below).
+
+        While ``Index`` objects are copied when ``deep=True``, the underlying
+        numpy array is not copied for performance reasons. Since ``Index`` is
+        immutable, the underlying data can be safely shared and a copy
+        is not needed.
+
+        Since pandas is not thread safe, see the
+        :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
+        environment.
+
+        Copy-on-Write protects shallow copies against accidental modifications.
+        This means that any changes to the copied data would make a new copy
+        of the data upon write (and vice versa). Changes made to either the
+        original or copied variable would not be reflected in the counterpart.
+        See :ref:`Copy_on_Write <copy_on_write>` for more information.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2], index=["a", "b"])
+        >>> s
+        a    1
+        b    2
+        dtype: int64
+
+        >>> s_copy = s.copy(deep=True)
+        >>> s_copy
+        a    1
+        b    2
+        dtype: int64
+
+        Due to Copy-on-Write, shallow copies still protect data modifications.
+        Note shallow does not get modified below.
+
+        >>> s = pd.Series([1, 2], index=["a", "b"])
+        >>> shallow = s.copy(deep=False)
+        >>> s.iloc[1] = 200
+        >>> shallow
+        a    1
+        b    2
+        dtype: int64
+
+        When the data has object dtype, even a deep copy does not copy the
+        underlying Python objects. Updating a nested data object will be
+        reflected in the deep copy.
+
+        >>> s = pd.Series([[1, 2], [3, 4]])
+        >>> deep = s.copy()
+        >>> s[0][0] = 10
+        >>> s
+        0    [10, 2]
+        1     [3, 4]
+        dtype: object
+        >>> deep
+        0    [10, 2]
+        1     [3, 4]
+        dtype: object
+        """
+        data = self._mgr.copy(deep=deep)
+        return self._constructor_from_mgr(data, axes=data.axes).__finalize__(
+            self, method="copy"
+        )
+
+    @final
+    def __copy__(self) -> Self:
+        return self.copy(deep=False)
+
+    @final
+    def __deepcopy__(self, memo=None) -> Self:
+        """
+        Parameters
+        ----------
+        memo, default None
+            Standard signature. Unused
+        """
+        return self.copy(deep=True)
+
+    @final
+    def infer_objects(self, copy: bool | lib.NoDefault = lib.no_default) -> Self:
+        """
+        Attempt to infer better dtypes for object columns.
+
+        Attempts soft conversion of object-dtyped
+        columns, leaving non-object and unconvertible
+        columns unchanged. The inference rules are the
+        same as during normal Series/DataFrame construction.
+
+        Parameters
+        ----------
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        same type as input object
+            Returns an object of the same type as the input object.
+
+        See Also
+        --------
+        to_datetime : Convert argument to datetime.
+        to_timedelta : Convert argument to timedelta.
+        to_numeric : Convert argument to numeric type.
+        convert_dtypes : Convert argument to best possible dtype.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
+        >>> df = df.iloc[1:]
+        >>> df
+           A
+        1  1
+        2  2
+        3  3
+
+        >>> df.dtypes
+        A    object
+        dtype: object
+
+        >>> df.infer_objects().dtypes
+        A    int64
+        dtype: object
+        """
+        self._check_copy_deprecation(copy)
+        new_mgr = self._mgr.convert()
+        res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+        return res.__finalize__(self, method="infer_objects")
+
+    @final
+    def convert_dtypes(
+        self,
+        infer_objects: bool = True,
+        convert_string: bool = True,
+        convert_integer: bool = True,
+        convert_boolean: bool = True,
+        convert_floating: bool = True,
+        dtype_backend: DtypeBackend = "numpy_nullable",
+    ) -> Self:
+        """
+        Convert columns from numpy dtypes to the best dtypes that support ``pd.NA``.
+
+        Parameters
+        ----------
+        infer_objects : bool, default True
+            Whether object dtypes should be converted to the best possible types.
+        convert_string : bool, default True
+            Whether object dtypes should be converted to ``StringDtype()``.
+        convert_integer : bool, default True
+            Whether, if possible, conversion can be done to integer extension types.
+        convert_boolean : bool, defaults True
+            Whether object dtypes should be converted to ``BooleanDtypes()``.
+        convert_floating : bool, defaults True
+            Whether, if possible, conversion can be done to floating extension types.
+            If `convert_integer` is also True, preference will be give to integer
+            dtypes if the floats can be faithfully casted to integers.
+        dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+            Back-end data type applied to the resultant :class:`DataFrame` or
+            :class:`Series` (still experimental). Behaviour is as follows:
+
+            * ``"numpy_nullable"``: returns nullable-dtype-backed
+              :class:`DataFrame` or :class:`Serires`.
+            * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
+              :class:`DataFrame` or :class:`Series`.
+
+            .. versionadded:: 2.0
+
+        Returns
+        -------
+        Series or DataFrame
+            Copy of input object with new dtype.
+
+        See Also
+        --------
+        infer_objects : Infer dtypes of objects.
+        to_datetime : Convert argument to datetime.
+        to_timedelta : Convert argument to timedelta.
+        to_numeric : Convert argument to a numeric type.
+
+        Notes
+        -----
+        By default, ``convert_dtypes`` will attempt to convert a Series (or each
+        Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
+        ``convert_string``, ``convert_integer``, ``convert_boolean`` and
+        ``convert_floating``, it is possible to turn off individual conversions
+        to ``StringDtype``, the integer extension types, ``BooleanDtype``
+        or floating extension types, respectively.
+
+        For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
+        rules as during normal Series/DataFrame construction.  Then, if possible,
+        convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
+        or floating extension type, otherwise leave as ``object``.
+
+        If the dtype is integer, convert to an appropriate integer extension type.
+
+        If the dtype is numeric, and consists of all integers, convert to an
+        appropriate integer extension type. Otherwise, convert to an
+        appropriate floating extension type.
+
+        In the future, as new dtypes are added that support ``pd.NA``, the results
+        of this method will change to support those new dtypes.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
+        ...         "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
+        ...         "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
+        ...         "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
+        ...         "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
+        ...         "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
+        ...     }
+        ... )
+
+        Start with a DataFrame with default dtypes.
+
+        >>> df
+           a  b      c    d     e      f
+        0  1  x   True    h  10.0    NaN
+        1  2  y  False    i   NaN  100.5
+        2  3  z    NaN  NaN  20.0  200.0
+
+        >>> df.dtypes
+        a      int32
+        b     object
+        c     object
+        d     object
+        e    float64
+        f    float64
+        dtype: object
+
+        Convert the DataFrame to use best possible dtypes.
+
+        >>> dfn = df.convert_dtypes()
+        >>> dfn
+           a  b      c     d     e      f
+        0  1  x   True     h    10   <NA>
+        1  2  y  False     i  <NA>  100.5
+        2  3  z   <NA>  <NA>    20  200.0
+
+        >>> dfn.dtypes
+        a      Int32
+        b     string
+        c    boolean
+        d     string
+        e      Int64
+        f    Float64
+        dtype: object
+
+        Start with a Series of strings and missing data represented by ``np.nan``.
+
+        >>> s = pd.Series(["a", "b", np.nan])
+        >>> s
+        0      a
+        1      b
+        2    NaN
+        dtype: str
+
+        Obtain a Series with dtype ``StringDtype``.
+
+        >>> s.convert_dtypes()
+        0       a
+        1       b
+        2    <NA>
+        dtype: string
+        """
+        check_dtype_backend(dtype_backend)
+        new_mgr = self._mgr.convert_dtypes(
+            infer_objects=infer_objects,
+            convert_string=convert_string,
+            convert_integer=convert_integer,
+            convert_boolean=convert_boolean,
+            convert_floating=convert_floating,
+            dtype_backend=dtype_backend,
+        )
+        res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+        return res.__finalize__(self, method="convert_dtypes")
+
+    # ----------------------------------------------------------------------
+    # Filling NA's
+
+    @final
+    def _pad_or_backfill(
+        self,
+        method: Literal["ffill", "bfill", "pad", "backfill"],
+        *,
+        axis: None | Axis = None,
+        inplace: bool = False,
+        limit: None | int = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+    ):
+        if axis is None:
+            axis = 0
+        axis = self._get_axis_number(axis)
+        method = clean_fill_method(method)
+
+        if axis == 1:
+            if not self._mgr.is_single_block and inplace:
+                raise NotImplementedError
+            # e.g. test_align_fill_method
+            result = self.T._pad_or_backfill(
+                method=method, limit=limit, limit_area=limit_area
+            ).T
+
+            return result
+
+        new_mgr = self._mgr.pad_or_backfill(
+            method=method,
+            limit=limit,
+            limit_area=limit_area,
+            inplace=inplace,
+        )
+        result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+        if inplace:
+            self._update_inplace(result)
+            return self
+        else:
+            return result.__finalize__(self, method="fillna")
+
+    @final
+    def fillna(
+        self,
+        value: Hashable | Mapping | Series | DataFrame,
+        *,
+        axis: Axis | None = None,
+        inplace: bool = False,
+        limit: int | None = None,
+    ) -> Self:
+        """
+        Fill NA/NaN values with `value`.
+
+        Parameters
+        ----------
+        value : scalar, dict, Series, or DataFrame
+            Value to use to fill holes (e.g. 0), alternately a
+            dict/Series/DataFrame of values specifying which value to use for
+            each index (for a Series) or column (for a DataFrame).  Values not
+            in the dict/Series/DataFrame will not be filled. This value cannot
+            be a list.
+        axis : {0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame
+            Axis along which to fill missing values. For `Series`
+            this parameter is unused and defaults to 0.
+        inplace : bool, default False
+            If True, fill in-place. Note: this will modify any
+            other views on this object (e.g., a no-copy slice for a column in a
+            DataFrame).
+        limit : int, default None
+            This is the maximum number of entries along the entire axis
+            where NaNs will be filled. Must be greater than 0 if not None.
+
+        Returns
+        -------
+        Series/DataFrame
+            Object with missing values filled.
+
+        See Also
+        --------
+        ffill : Fill values by propagating the last valid observation to next valid.
+        bfill : Fill values by using the next valid observation to fill the gap.
+        interpolate : Fill NaN values using interpolation.
+        reindex : Conform object to new index.
+        asfreq : Convert TimeSeries to specified frequency.
+
+        Notes
+        -----
+        For non-object dtype, ``value=None`` will use the NA value of the dtype.
+        See more details in the :ref:`Filling missing data<missing_data.fillna>`
+        section.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         [np.nan, 2, np.nan, 0],
+        ...         [3, 4, np.nan, 1],
+        ...         [np.nan, np.nan, np.nan, np.nan],
+        ...         [np.nan, 3, np.nan, 4],
+        ...     ],
+        ...     columns=list("ABCD"),
+        ... )
+        >>> df
+             A    B   C    D
+        0  NaN  2.0 NaN  0.0
+        1  3.0  4.0 NaN  1.0
+        2  NaN  NaN NaN  NaN
+        3  NaN  3.0 NaN  4.0
+
+        Replace all NaN elements with 0s.
+
+        >>> df.fillna(0)
+             A    B    C    D
+        0  0.0  2.0  0.0  0.0
+        1  3.0  4.0  0.0  1.0
+        2  0.0  0.0  0.0  0.0
+        3  0.0  3.0  0.0  4.0
+
+        Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
+        2, and 3 respectively.
+
+        >>> values = {"A": 0, "B": 1, "C": 2, "D": 3}
+        >>> df.fillna(value=values)
+             A    B    C    D
+        0  0.0  2.0  2.0  0.0
+        1  3.0  4.0  2.0  1.0
+        2  0.0  1.0  2.0  3.0
+        3  0.0  3.0  2.0  4.0
+
+        Only replace the first NaN element.
+
+        >>> df.fillna(value=values, limit=1)
+             A    B    C    D
+        0  0.0  2.0  2.0  0.0
+        1  3.0  4.0  NaN  1.0
+        2  NaN  1.0  NaN  3.0
+        3  NaN  3.0  NaN  4.0
+
+        When filling using a DataFrame, replacement happens along
+        the same column names and same indices
+
+        >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
+        >>> df.fillna(df2)
+             A    B    C    D
+        0  0.0  2.0  0.0  0.0
+        1  3.0  4.0  0.0  1.0
+        2  0.0  0.0  0.0  NaN
+        3  0.0  3.0  0.0  4.0
+
+        Note that column D is not affected since it is not present in df2.
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        if isinstance(value, (list, tuple)):
+            raise TypeError(
+                '"value" parameter must be a scalar or dict, but '
+                f'you passed a "{type(value).__name__}"'
+            )
+
+        # set the default here, so functions examining the signature
+        # can detect if something was set (e.g. in groupby) (GH9221)
+        if axis is None:
+            axis = 0
+        axis = self._get_axis_number(axis)
+
+        if self.ndim == 1:
+            if isinstance(value, (dict, ABCSeries)):
+                if not len(value):
+                    # test_fillna_nonscalar
+                    return self if inplace else self.copy(deep=False)
+                from pandas import Series
+
+                value = Series(value)
+                value = value.reindex(self.index)
+                value = value._values
+            elif not is_list_like(value):
+                pass
+            else:
+                raise TypeError(
+                    '"value" parameter must be a scalar, dict '
+                    "or Series, but you passed a "
+                    f'"{type(value).__name__}"'
+                )
+
+            new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
+
+        elif isinstance(value, (dict, ABCSeries)):
+            result = self if inplace else self.copy(deep=False)
+            if axis == 1:
+                # Check that all columns in result have the same dtype
+                # otherwise don't bother with fillna and losing accurate dtypes
+                unique_dtypes = algos.unique(self._mgr.get_dtypes())
+                if len(unique_dtypes) > 1:
+                    raise ValueError(
+                        "All columns must have the same dtype, but got dtypes: "
+                        f"{list(unique_dtypes)}"
+                    )
+                # Use the first column, which we have already validated has the
+                # same dtypes as the other columns.
+                if not can_hold_element(result.iloc[:, 0], value):
+                    frame_dtype = unique_dtypes.item()
+                    raise ValueError(
+                        f"{value} not a suitable type to fill into {frame_dtype}"
+                    )
+                result = result.T.fillna(value=value).T
+                if inplace:
+                    self._update_inplace(result)
+                    result = self
+            else:
+                for k, v in value.items():
+                    if k not in result:
+                        continue
+
+                    res_k = result[k].fillna(v, limit=limit)
+
+                    if not inplace:
+                        result[k] = res_k
+                    # We can write into our existing column(s) iff dtype
+                    #  was preserved.
+                    elif isinstance(res_k, ABCSeries):
+                        # i.e. 'k' only shows up once in self.columns
+                        if res_k.dtype == result[k].dtype:
+                            result.loc[:, k] = res_k
+                        else:
+                            # Different dtype -> no way to do inplace.
+                            result[k] = res_k
+                    else:
+                        # see test_fillna_dict_inplace_nonunique_columns
+                        locs = result.columns.get_loc(k)
+                        if isinstance(locs, slice):
+                            locs = range(self.shape[1])[locs]
+                        elif isinstance(locs, np.ndarray) and locs.dtype.kind == "b":
+                            locs = locs.nonzero()[0]
+                        elif not (
+                            isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
+                        ):
+                            # Should never be reached, but let's cover our bases
+                            raise NotImplementedError(
+                                "Unexpected get_loc result, please report a bug at "
+                                "https://github.com/pandas-dev/pandas"
+                            )
+
+                        for i, loc in enumerate(locs):
+                            res_loc = res_k.iloc[:, i]
+                            target = self.iloc[:, loc]
+
+                            if res_loc.dtype == target.dtype:
+                                result.iloc[:, loc] = res_loc
+                            else:
+                                result.isetitem(loc, res_loc)
+            return result
+
+        elif not is_list_like(value):
+            if axis == 1:
+                result = self.T.fillna(value=value, limit=limit).T
+                new_data = result._mgr
+            else:
+                new_data = self._mgr.fillna(value=value, limit=limit, inplace=inplace)
+        elif isinstance(value, ABCDataFrame) and self.ndim == 2:
+            new_data = self.where(self.notna(), value)._mgr
+        else:
+            raise ValueError(f"invalid fill value with a {type(value)}")
+
+        result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        if inplace:
+            self._update_inplace(result)
+            return self
+        else:
+            return result.__finalize__(self, method="fillna")
+
+    @final
+    def ffill(
+        self,
+        *,
+        axis: None | Axis = None,
+        inplace: bool = False,
+        limit: None | int = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+    ) -> Self:
+        """
+        Fill NA/NaN values by propagating the last valid observation to next valid.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame
+            Axis along which to fill missing values. For `Series`
+            this parameter is unused and defaults to 0.
+        inplace : bool, default False
+            If True, fill in-place. Note: this will modify any
+            other views on this object (e.g., a no-copy slice for a column in a
+            DataFrame).
+        limit : int, default None
+            If method is specified, this is the maximum number of consecutive
+            NaN values to forward/backward fill. In other words, if there is
+            a gap with more than this number of consecutive NaNs, it will only
+            be partially filled. If method is not specified, this is the
+            maximum number of entries along the entire axis where NaNs will be
+            filled. Must be greater than 0 if not None.
+        limit_area : {{`None`, 'inside', 'outside'}}, default None
+            If limit is specified, consecutive NaNs will be filled with this
+            restriction.
+
+            * ``None``: No fill restriction.
+            * 'inside': Only fill NaNs surrounded by valid values
+              (interpolate).
+            * 'outside': Only fill NaNs outside valid values (extrapolate).
+
+            .. versionadded:: 2.2.0
+
+        Returns
+        -------
+        Series/DataFrame
+            Object with missing values filled.
+
+        See Also
+        --------
+        DataFrame.bfill : Fill NA/NaN values by using the next valid observation
+            to fill the gap.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         [np.nan, 2, np.nan, 0],
+        ...         [3, 4, np.nan, 1],
+        ...         [np.nan, np.nan, np.nan, np.nan],
+        ...         [np.nan, 3, np.nan, 4],
+        ...     ],
+        ...     columns=list("ABCD"),
+        ... )
+        >>> df
+             A    B   C    D
+        0  NaN  2.0 NaN  0.0
+        1  3.0  4.0 NaN  1.0
+        2  NaN  NaN NaN  NaN
+        3  NaN  3.0 NaN  4.0
+
+        >>> df.ffill()
+             A    B   C    D
+        0  NaN  2.0 NaN  0.0
+        1  3.0  4.0 NaN  1.0
+        2  3.0  4.0 NaN  1.0
+        3  3.0  3.0 NaN  4.0
+
+        >>> ser = pd.Series([1, np.nan, 2, 3])
+        >>> ser.ffill()
+        0   1.0
+        1   1.0
+        2   2.0
+        3   3.0
+        dtype: float64
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        return self._pad_or_backfill(
+            "ffill",
+            axis=axis,
+            inplace=inplace,
+            limit=limit,
+            limit_area=limit_area,
+        )
+
+    @final
+    def bfill(
+        self,
+        *,
+        axis: None | Axis = None,
+        inplace: bool = False,
+        limit: None | int = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+    ) -> Self:
+        """
+        Fill NA/NaN values by using the next valid observation to fill the gap.
+
+        This method fills missing values in a backward direction along the
+        specified axis, propagating non-null values from later positions to
+        earlier positions containing NaN.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame
+            Axis along which to fill missing values. For `Series`
+            this parameter is unused and defaults to 0.
+        inplace : bool, default False
+            If True, fill in-place. Note: this will modify any
+            other views on this object (e.g., a no-copy slice for a column in a
+            DataFrame).
+        limit : int, default None
+            If method is specified, this is the maximum number of consecutive
+            NaN values to forward/backward fill. In other words, if there is
+            a gap with more than this number of consecutive NaNs, it will only
+            be partially filled. If method is not specified, this is the
+            maximum number of entries along the entire axis where NaNs will be
+            filled. Must be greater than 0 if not None.
+        limit_area : {{`None`, 'inside', 'outside'}}, default None
+            If limit is specified, consecutive NaNs will be filled with this
+            restriction.
+
+            * ``None``: No fill restriction.
+            * 'inside': Only fill NaNs surrounded by valid values
+              (interpolate).
+            * 'outside': Only fill NaNs outside valid values (extrapolate).
+
+            .. versionadded:: 2.2.0
+
+        Returns
+        -------
+        Series/DataFrame
+            Object with missing values filled.
+
+        See Also
+        --------
+        DataFrame.ffill : Fill NA/NaN values by propagating the last valid
+            observation to next valid.
+
+        Examples
+        --------
+        For Series:
+
+        >>> s = pd.Series([1, None, None, 2])
+        >>> s.bfill()
+        0    1.0
+        1    2.0
+        2    2.0
+        3    2.0
+        dtype: float64
+        >>> s.bfill(limit=1)
+        0    1.0
+        1    NaN
+        2    2.0
+        3    2.0
+        dtype: float64
+
+        With DataFrame:
+
+        >>> df = pd.DataFrame({"A": [1, None, None, 4], "B": [None, 5, None, 7]})
+        >>> df
+              A     B
+        0   1.0	  NaN
+        1   NaN	  5.0
+        2   NaN   NaN
+        3   4.0   7.0
+        >>> df.bfill()
+              A     B
+        0   1.0   5.0
+        1   4.0   5.0
+        2   4.0   7.0
+        3   4.0   7.0
+        >>> df.bfill(limit=1)
+              A     B
+        0   1.0   5.0
+        1   NaN   5.0
+        2   4.0   7.0
+        3   4.0   7.0
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        return self._pad_or_backfill(
+            "bfill",
+            axis=axis,
+            inplace=inplace,
+            limit=limit,
+            limit_area=limit_area,
+        )
+
+    @final
+    def replace(
+        self,
+        to_replace=None,
+        value=lib.no_default,
+        *,
+        inplace: bool = False,
+        regex: bool = False,
+    ) -> Self:
+        """
+        Replace values given in `to_replace` with `value`.
+
+        Values of the Series/DataFrame are replaced with other values dynamically.
+        This differs from updating with ``.loc`` or ``.iloc``, which require
+        you to specify a location to update with some value.
+
+        Parameters
+        ----------
+        to_replace : str, regex, list, dict, Series, int, float, or None
+            How to find the values that will be replaced.
+
+            * numeric, str or regex:
+
+                - numeric: numeric values equal to `to_replace` will be
+                  replaced with `value`
+                - str: string exactly matching `to_replace` will be replaced
+                  with `value`
+                - regex: regexes matching `to_replace` will be replaced with
+                  `value`
+
+            * list of str, regex, or numeric:
+
+                - First, if `to_replace` and `value` are both lists, they
+                  **must** be the same length.
+                - Second, if ``regex=True`` then all of the strings in **both**
+                  lists will be interpreted as regexes otherwise they will match
+                  directly. This doesn't matter much for `value` since there
+                  are only a few possible substitution regexes you can use.
+                - str, regex and numeric rules apply as above.
+
+            * dict:
+
+                - Dicts can be used to specify different replacement values
+                  for different existing values. For example,
+                  ``{'a': 'b', 'y': 'z'}`` replaces the value 'a' with 'b' and
+                  'y' with 'z'. To use a dict in this way, the optional `value`
+                  parameter should not be given.
+                - For a DataFrame a dict can specify that different values
+                  should be replaced in different columns. For example,
+                  ``{'a': 1, 'b': 'z'}`` looks for the value 1 in column 'a'
+                  and the value 'z' in column 'b' and replaces these values
+                  with whatever is specified in `value`. The `value` parameter
+                  should not be ``None`` in this case. You can treat this as a
+                  special case of passing two lists except that you are
+                  specifying the column to search in.
+                - For a DataFrame nested dictionaries, e.g.,
+                  ``{'a': {'b': np.nan}}``, are read as follows: look in column
+                  'a' for the value 'b' and replace it with NaN. The optional `value`
+                  parameter should not be specified to use a nested dict in this
+                  way. You can nest regular expressions as well. Note that
+                  column names (the top-level dictionary keys in a nested
+                  dictionary) **cannot** be regular expressions.
+
+            * None:
+
+                - This means that the `regex` argument must be a string,
+                  compiled regular expression, or list, dict, ndarray or
+                  Series of such elements. If `value` is also ``None`` then
+                  this **must** be a nested dictionary or Series.
+
+            See the examples section for examples of each of these.
+        value : scalar, dict, list, str, regex, default None
+            Value to replace any values matching `to_replace` with.
+            For a DataFrame a dict of values can be used to specify which
+            value to use for each column (columns not in the dict will not be
+            filled). Regular expressions, strings and lists or dicts of such
+            objects are also allowed.
+
+        inplace : bool, default False
+            If True, performs operation inplace.
+        regex : bool or same types as `to_replace`, default False
+            Whether to interpret `to_replace` and/or `value` as regular
+            expressions. Alternatively, this could be a regular expression or a
+            list, dict, or array of regular expressions in which case
+            `to_replace` must be ``None``.
+
+        Returns
+        -------
+        Series/DataFrame
+            Object after replacement.
+
+        Raises
+        ------
+        AssertionError
+            * If `regex` is not a ``bool`` and `to_replace` is not
+              ``None``.
+
+        TypeError
+            * If `to_replace` is not a scalar, array-like, ``dict``, or ``None``
+            * If `to_replace` is a ``dict`` and `value` is not a ``list``,
+              ``dict``, ``ndarray``, or ``Series``
+            * If `to_replace` is ``None`` and `regex` is not compilable
+              into a regular expression or is a list, dict, ndarray, or
+              Series.
+            * When replacing multiple ``bool`` or ``datetime64`` objects and
+              the arguments to `to_replace` does not match the type of the
+              value being replaced
+
+        ValueError
+            * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
+              `value` but they are not the same length.
+
+        See Also
+        --------
+        Series.fillna : Fill NA values.
+        DataFrame.fillna : Fill NA values.
+        Series.where : Replace values based on boolean condition.
+        DataFrame.where : Replace values based on boolean condition.
+        DataFrame.map: Apply a function to a Dataframe elementwise.
+        Series.map: Map values of Series according to an input mapping or function.
+        Series.str.replace : Simple string replacement.
+
+        Notes
+        -----
+        * Regex substitution is performed under the hood with ``re.sub``. The
+          rules for substitution for ``re.sub`` are the same.
+        * Regular expressions will only substitute on strings, meaning you
+          cannot provide, for example, a regular expression matching floating
+          point numbers and expect the columns in your frame that have a
+          numeric dtype to be matched. However, if those floating point
+          numbers *are* strings, then you can do this.
+        * This method has *a lot* of options. You are encouraged to experiment
+          and play with this method to gain intuition about how it works.
+        * When dict is used as the `to_replace` value, it is like
+          key(s) in the dict are the to_replace part and
+          value(s) in the dict are the value parameter.
+
+        Examples
+        --------
+
+        **Scalar `to_replace` and `value`**
+
+        >>> s = pd.Series([1, 2, 3, 4, 5])
+        >>> s.replace(1, 5)
+        0    5
+        1    2
+        2    3
+        3    4
+        4    5
+        dtype: int64
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": [0, 1, 2, 3, 4],
+        ...         "B": [5, 6, 7, 8, 9],
+        ...         "C": ["a", "b", "c", "d", "e"],
+        ...     }
+        ... )
+        >>> df.replace(0, 5)
+            A  B  C
+        0  5  5  a
+        1  1  6  b
+        2  2  7  c
+        3  3  8  d
+        4  4  9  e
+
+        **List-like `to_replace`**
+
+        >>> df.replace([0, 1, 2, 3], 4)
+            A  B  C
+        0  4  5  a
+        1  4  6  b
+        2  4  7  c
+        3  4  8  d
+        4  4  9  e
+
+        >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
+            A  B  C
+        0  4  5  a
+        1  3  6  b
+        2  2  7  c
+        3  1  8  d
+        4  4  9  e
+
+        **dict-like `to_replace`**
+
+        >>> df.replace({0: 10, 1: 100})
+                A  B  C
+        0   10  5  a
+        1  100  6  b
+        2    2  7  c
+        3    3  8  d
+        4    4  9  e
+
+        >>> df.replace({"A": 0, "B": 5}, 100)
+                A    B  C
+        0  100  100  a
+        1    1    6  b
+        2    2    7  c
+        3    3    8  d
+        4    4    9  e
+
+        >>> df.replace({"A": {0: 100, 4: 400}})
+                A  B  C
+        0  100  5  a
+        1    1  6  b
+        2    2  7  c
+        3    3  8  d
+        4  400  9  e
+
+        **Regular expression `to_replace`**
+
+        >>> df = pd.DataFrame({"A": ["bat", "foo", "bait"], "B": ["abc", "bar", "xyz"]})
+        >>> df.replace(to_replace=r"^ba.$", value="new", regex=True)
+                A    B
+        0   new  abc
+        1   foo  new
+        2  bait  xyz
+
+        >>> df.replace({"A": r"^ba.$"}, {"A": "new"}, regex=True)
+                A    B
+        0   new  abc
+        1   foo  bar
+        2  bait  xyz
+
+        >>> df.replace(regex=r"^ba.$", value="new")
+                A    B
+        0   new  abc
+        1   foo  new
+        2  bait  xyz
+
+        >>> df.replace(regex={r"^ba.$": "new", "foo": "xyz"})
+                A    B
+        0   new  abc
+        1   xyz  new
+        2  bait  xyz
+
+        >>> df.replace(regex=[r"^ba.$", "foo"], value="new")
+                A    B
+        0   new  abc
+        1   new  new
+        2  bait  xyz
+
+        Compare the behavior of ``s.replace({'a': None})`` and
+        ``s.replace('a', None)`` to understand the peculiarities
+        of the `to_replace` parameter:
+
+        >>> s = pd.Series([10, "a", "a", "b", "a"])
+
+        When one uses a dict as the `to_replace` value, it is like the
+        value(s) in the dict are equal to the `value` parameter.
+        ``s.replace({'a': None})`` is equivalent to
+        ``s.replace(to_replace={'a': None}, value=None)``:
+
+        >>> s.replace({"a": None})
+        0      10
+        1    None
+        2    None
+        3       b
+        4    None
+        dtype: object
+
+        If ``None`` is explicitly passed for ``value``, it will be respected:
+
+        >>> s.replace("a", None)
+        0      10
+        1    None
+        2    None
+        3       b
+        4    None
+        dtype: object
+
+        When ``regex=True``, ``value`` is not ``None`` and `to_replace` is a string,
+        the replacement will be applied in all columns of the DataFrame.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": [0, 1, 2, 3, 4],
+        ...         "B": ["a", "b", "c", "d", "e"],
+        ...         "C": ["f", "g", "h", "i", "j"],
+        ...     }
+        ... )
+
+        >>> df.replace(to_replace="^[a-g]", value="e", regex=True)
+            A  B  C
+        0  0  e  e
+        1  1  e  e
+        2  2  e  h
+        3  3  e  i
+        4  4  e  j
+
+        If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary
+        keys will be the DataFrame columns that the replacement will be applied.
+
+        >>> df.replace(to_replace={"B": "^[a-c]", "C": "^[h-j]"}, value="e", regex=True)
+            A  B  C
+        0  0  e  f
+        1  1  e  g
+        2  2  e  e
+        3  3  d  e
+        4  4  e  e
+        """
+        if not is_bool(regex) and to_replace is not None:
+            raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
+
+        if not (
+            is_scalar(to_replace)
+            or is_re_compilable(to_replace)
+            or is_list_like(to_replace)
+        ):
+            raise TypeError(
+                "Expecting 'to_replace' to be either a scalar, array-like, "
+                "dict or None, got invalid type "
+                f"{type(to_replace).__name__!r}"
+            )
+
+        if value is lib.no_default and not (
+            is_dict_like(to_replace) or is_dict_like(regex)
+        ):
+            raise ValueError(
+                # GH#33302
+                f"{type(self).__name__}.replace must specify either 'value', "
+                "a dict-like 'to_replace', or dict-like 'regex'."
+            )
+
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        if value is lib.no_default:
+            if not is_dict_like(to_replace):
+                # In this case we have checked above that
+                #  1) regex is dict-like and 2) to_replace is None
+                to_replace = regex
+                regex = True
+
+            items = list(to_replace.items())
+            if items:
+                keys, values = zip(*items, strict=True)
+            else:
+                keys, values = ([], [])  # type: ignore[assignment]
+
+            are_mappings = [is_dict_like(v) for v in values]
+
+            if any(are_mappings):
+                if not all(are_mappings):
+                    raise TypeError(
+                        "If a nested mapping is passed, all values "
+                        "of the top level mapping must be mappings"
+                    )
+                # passed a nested dict/Series
+                to_rep_dict = {}
+                value_dict = {}
+
+                for k, v in items:
+                    # error: Incompatible types in assignment (expression has type
+                    # "list[Never]", variable has type "tuple[Any, ...]")
+                    keys, values = list(zip(*v.items(), strict=True)) or (  # type: ignore[assignment]
+                        [],
+                        [],
+                    )
+
+                    to_rep_dict[k] = list(keys)
+                    value_dict[k] = list(values)
+
+                to_replace, value = to_rep_dict, value_dict
+            else:
+                to_replace, value = keys, values
+
+            return self.replace(to_replace, value, inplace=inplace, regex=regex)
+        else:
+            # need a non-zero len on all axes
+            if not self.size:
+                return self if inplace else self.copy(deep=False)
+            if is_dict_like(to_replace):
+                if is_dict_like(value):  # {'A' : NA} -> {'A' : 0}
+                    if isinstance(self, ABCSeries):
+                        raise ValueError(
+                            "to_replace and value cannot be dict-like for "
+                            "Series.replace"
+                        )
+                    # Note: Checking below for `in foo.keys()` instead of
+                    #  `in foo` is needed for when we have a Series and not dict
+                    mapping = {
+                        col: (to_replace[col], value[col])
+                        for col in to_replace.keys()
+                        if col in value.keys() and col in self
+                    }
+                    return self._replace_columnwise(mapping, inplace, regex)
+
+                # {'A': NA} -> 0
+                elif not is_list_like(value):
+                    # Operate column-wise
+                    if self.ndim == 1:
+                        raise ValueError(
+                            "Series.replace cannot specify both a dict-like "
+                            "'to_replace' and a 'value'"
+                        )
+                    mapping = {
+                        col: (to_rep, value) for col, to_rep in to_replace.items()
+                    }
+                    return self._replace_columnwise(mapping, inplace, regex)
+                else:
+                    raise TypeError("value argument must be scalar, dict, or Series")
+
+            elif is_list_like(to_replace):
+                if not is_list_like(value):
+                    # e.g. to_replace = [NA, ''] and value is 0,
+                    #  so we replace NA with 0 and then replace '' with 0
+                    value = [value] * len(to_replace)
+
+                # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
+                if len(to_replace) != len(value):
+                    raise ValueError(
+                        f"Replacement lists must match in length. "
+                        f"Expecting {len(to_replace)} got {len(value)} "
+                    )
+                new_data = self._mgr.replace_list(
+                    src_list=to_replace,
+                    dest_list=value,
+                    inplace=inplace,
+                    regex=regex,
+                )
+
+            elif to_replace is None:
+                if not (
+                    is_re_compilable(regex)
+                    or is_list_like(regex)
+                    or is_dict_like(regex)
+                ):
+                    raise TypeError(
+                        f"'regex' must be a string or a compiled regular expression "
+                        f"or a list or dict of strings or regular expressions, "
+                        f"you passed a {type(regex).__name__!r}"
+                    )
+                return self.replace(regex, value, inplace=inplace, regex=True)
+            # dest iterable dict-like
+            elif is_dict_like(value):  # NA -> {'A' : 0, 'B' : -1}
+                # Operate column-wise
+                if self.ndim == 1:
+                    raise ValueError(
+                        "Series.replace cannot use dict-value and non-None to_replace"
+                    )
+                mapping = {col: (to_replace, val) for col, val in value.items()}
+                return self._replace_columnwise(mapping, inplace, regex)
+
+            elif not is_list_like(value):  # NA -> 0
+                regex = should_use_regex(regex, to_replace)
+                if regex:
+                    new_data = self._mgr.replace_regex(
+                        to_replace=to_replace,
+                        value=value,
+                        inplace=inplace,
+                    )
+                else:
+                    new_data = self._mgr.replace(
+                        to_replace=to_replace, value=value, inplace=inplace
+                    )
+            else:
+                raise TypeError(
+                    f'Invalid "to_replace" type: {type(to_replace).__name__!r}'
+                )
+
+        result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        if inplace:
+            self._update_inplace(result)
+            return self
+        else:
+            return result.__finalize__(self, method="replace")
+
+    @final
+    def interpolate(
+        self,
+        method: InterpolateOptions = "linear",
+        *,
+        axis: Axis = 0,
+        limit: int | None = None,
+        inplace: bool = False,
+        limit_direction: Literal["forward", "backward", "both"] | None = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+        **kwargs,
+    ) -> Self:
+        """
+        Fill NaN values using an interpolation method.
+
+        Please note that only ``method='linear'`` is supported for
+        DataFrame/Series with a MultiIndex.
+
+        Parameters
+        ----------
+        method : str, default 'linear'
+            Interpolation technique to use. One of:
+
+            * 'linear': Ignore the index and treat the values as equally
+              spaced. This is the only method supported on MultiIndexes.
+            * 'time': Works on daily and higher resolution data to interpolate
+              given length of interval. This interpolates values based on
+              time interval between observations.
+            * 'index': The interpolation uses the numerical values
+              of the DataFrame's index to linearly calculate missing values.
+            * 'values': Interpolation based on the numerical values
+              in the DataFrame, treating them as equally spaced along the index.
+            * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
+              'barycentric', 'polynomial': Passed to
+              `scipy.interpolate.interp1d`, whereas 'spline' is passed to
+              `scipy.interpolate.UnivariateSpline`. These methods use the numerical
+              values of the index.  Both 'polynomial' and 'spline' require that
+              you also specify an `order` (int), e.g.
+              ``df.interpolate(method='polynomial', order=5)``. Note that,
+              `slinear` method in Pandas refers to the Scipy first order `spline`
+              instead of Pandas first order `spline`.
+            * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
+              'cubicspline': Wrappers around the SciPy interpolation methods of
+              similar names. See `Notes`.
+            * 'from_derivatives': Refers to
+              `scipy.interpolate.BPoly.from_derivatives`.
+
+        axis : {{0 or 'index', 1 or 'columns', None}}, default None
+            Axis to interpolate along. For `Series` this parameter is unused
+            and defaults to 0.
+        limit : int, optional
+            Maximum number of consecutive NaNs to fill. Must be greater than
+            0.
+        inplace : bool, default False
+            Update the data in place if possible.
+        limit_direction : {{'forward', 'backward', 'both'}}, optional, default 'forward'
+            Consecutive NaNs will be filled in this direction.
+
+        limit_area : {{`None`, 'inside', 'outside'}}, default None
+            If limit is specified, consecutive NaNs will be filled with this
+            restriction.
+
+            * ``None``: No fill restriction.
+            * 'inside': Only fill NaNs surrounded by valid values
+              (interpolate).
+            * 'outside': Only fill NaNs outside valid values (extrapolate).
+
+        **kwargs : optional
+            Keyword arguments to pass on to the interpolating function.
+
+        Returns
+        -------
+        Series or DataFrame
+            Returns the same object type as the caller, interpolated at
+            some or all ``NaN`` values.
+
+        See Also
+        --------
+        fillna : Fill missing values using different methods.
+        scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
+            (Akima interpolator).
+        scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
+            Bernstein basis.
+        scipy.interpolate.interp1d : Interpolate a 1-D function.
+        scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
+            interpolator).
+        scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
+            interpolation.
+        scipy.interpolate.CubicSpline : Cubic spline data interpolator.
+
+        Notes
+        -----
+        The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
+        methods are wrappers around the respective SciPy implementations of
+        similar names. These use the actual numerical values of the index.
+        For more information on their behavior, see the
+        `SciPy documentation
+        <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.
+
+        Examples
+        --------
+        Filling in ``NaN`` in a :class:`~pandas.Series` via linear
+        interpolation.
+
+        >>> s = pd.Series([0, 1, np.nan, 3])
+        >>> s
+        0    0.0
+        1    1.0
+        2    NaN
+        3    3.0
+        dtype: float64
+        >>> s.interpolate()
+        0    0.0
+        1    1.0
+        2    2.0
+        3    3.0
+        dtype: float64
+
+        Filling in ``NaN`` in a Series via polynomial interpolation or splines:
+        Both 'polynomial' and 'spline' methods require that you also specify
+        an ``order`` (int).
+
+        >>> s = pd.Series([0, 2, np.nan, 8])
+        >>> s.interpolate(method="polynomial", order=2)
+        0    0.000000
+        1    2.000000
+        2    4.666667
+        3    8.000000
+        dtype: float64
+
+        Fill the DataFrame forward (that is, going down) along each column
+        using linear interpolation.
+
+        Note how the last entry in column 'a' is interpolated differently,
+        because there is no entry after it to use for interpolation.
+        Note how the first entry in column 'b' remains ``NaN``, because there
+        is no entry before it to use for interpolation.
+
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         (0.0, np.nan, -1.0, 1.0),
+        ...         (np.nan, 2.0, np.nan, np.nan),
+        ...         (2.0, 3.0, np.nan, 9.0),
+        ...         (np.nan, 4.0, -4.0, 16.0),
+        ...     ],
+        ...     columns=list("abcd"),
+        ... )
+        >>> df
+             a    b    c     d
+        0  0.0  NaN -1.0   1.0
+        1  NaN  2.0  NaN   NaN
+        2  2.0  3.0  NaN   9.0
+        3  NaN  4.0 -4.0  16.0
+        >>> df.interpolate(method="linear", limit_direction="forward", axis=0)
+             a    b    c     d
+        0  0.0  NaN -1.0   1.0
+        1  1.0  2.0 -2.0   5.0
+        2  2.0  3.0 -3.0   9.0
+        3  2.0  4.0 -4.0  16.0
+
+        Using polynomial interpolation.
+
+        >>> df["d"].interpolate(method="polynomial", order=2)
+        0     1.0
+        1     4.0
+        2     9.0
+        3    16.0
+        Name: d, dtype: float64
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        axis = self._get_axis_number(axis)
+
+        if self.empty:
+            return self if inplace else self.copy()
+
+        if not isinstance(method, str):
+            raise ValueError("'method' should be a string, not None.")
+
+        obj, should_transpose = (self.T, True) if axis == 1 else (self, False)
+
+        if isinstance(obj.index, MultiIndex) and method != "linear":
+            raise ValueError(
+                "Only `method=linear` interpolation is supported on MultiIndexes."
+            )
+
+        limit_direction = missing.infer_limit_direction(limit_direction, method)
+
+        index = missing.get_interp_index(method, obj.index)
+        new_data = obj._mgr.interpolate(
+            method=method,
+            index=index,
+            limit=limit,
+            limit_direction=limit_direction,
+            limit_area=limit_area,
+            inplace=inplace,
+            **kwargs,
+        )
+
+        result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        if should_transpose:
+            result = result.T
+        if inplace:
+            self._update_inplace(result)
+            return self
+        else:
+            return result.__finalize__(self, method="interpolate")
+
+    # ----------------------------------------------------------------------
+    # Timeseries methods Methods
+
+    @final
+    def asof(self, where, subset=None):
+        """
+        Return the last row(s) without any NaNs before `where`.
+
+        The last row (for each element in `where`, if list) without any
+        NaN is taken.
+        In case of a :class:`~pandas.DataFrame`, the last row without NaN
+        considering only the subset of columns (if not `None`)
+
+        If there is no good value, NaN is returned for a Series or
+        a Series of NaN values for a DataFrame
+
+        Parameters
+        ----------
+        where : date or array-like of dates
+            Date(s) before which the last row(s) are returned.
+        subset : str or array-like of str, default `None`
+            For DataFrame, if not `None`, only use these columns to
+            check for NaNs.
+
+        Returns
+        -------
+        scalar, Series, or DataFrame
+
+            The return can be:
+
+            * scalar : when `self` is a Series and `where` is a scalar
+            * Series: when `self` is a Series and `where` is an array-like,
+              or when `self` is a DataFrame and `where` is a scalar
+            * DataFrame : when `self` is a DataFrame and `where` is an
+              array-like
+
+        See Also
+        --------
+        merge_asof : Perform an asof merge. Similar to left join.
+
+        Notes
+        -----
+        Dates are assumed to be sorted. Raises if this is not the case.
+
+        Examples
+        --------
+        A Series and a scalar `where`.
+
+        >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
+        >>> s
+        10    1.0
+        20    2.0
+        30    NaN
+        40    4.0
+        dtype: float64
+
+        >>> s.asof(20)
+        np.float64(2.0)
+
+        For a sequence `where`, a Series is returned. The first value is
+        NaN, because the first element of `where` is before the first
+        index value.
+
+        >>> s.asof([5, 20])
+        5     NaN
+        20    2.0
+        dtype: float64
+
+        Missing values are not considered. The following is ``2.0``, not
+        NaN, even though NaN is at the index location for ``30``.
+
+        >>> s.asof(30)
+        np.float64(2.0)
+
+        Take all columns into consideration
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "a": [10.0, 20.0, 30.0, 40.0, 50.0],
+        ...         "b": [None, None, None, None, 500],
+        ...     },
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2018-02-27 09:01:00",
+        ...             "2018-02-27 09:02:00",
+        ...             "2018-02-27 09:03:00",
+        ...             "2018-02-27 09:04:00",
+        ...             "2018-02-27 09:05:00",
+        ...         ]
+        ...     ),
+        ... )
+        >>> df.asof(pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"]))
+                              a   b
+        2018-02-27 09:03:30 NaN NaN
+        2018-02-27 09:04:30 NaN NaN
+
+        Take a single column into consideration
+
+        >>> df.asof(
+        ...     pd.DatetimeIndex(["2018-02-27 09:03:30", "2018-02-27 09:04:30"]),
+        ...     subset=["a"],
+        ... )
+                                a   b
+        2018-02-27 09:03:30  30.0 NaN
+        2018-02-27 09:04:30  40.0 NaN
+        """
+        if isinstance(where, str):
+            where = Timestamp(where)
+
+        if not self.index.is_monotonic_increasing:
+            raise ValueError("asof requires a sorted index")
+
+        is_series = isinstance(self, ABCSeries)
+        if is_series:
+            if subset is not None:
+                raise ValueError("subset is not valid for Series")
+        else:
+            if subset is None:
+                subset = self.columns
+            if not is_list_like(subset):
+                subset = [subset]
+
+        is_list = is_list_like(where)
+        if not is_list:
+            start = self.index[0]
+            if isinstance(self.index, PeriodIndex):
+                where = Period(where, freq=self.index.freq)
+
+            if where < start:
+                if not is_series:
+                    return self._constructor_sliced(
+                        index=self.columns, name=where, dtype=np.float64
+                    )
+                return np.nan
+
+            # It's always much faster to use a *while* loop here for
+            # Series than pre-computing all the NAs. However a
+            # *while* loop is extremely expensive for DataFrame
+            # so we later pre-compute all the NAs and use the same
+            # code path whether *where* is a scalar or list.
+            # See PR: https://github.com/pandas-dev/pandas/pull/14476
+            if is_series:
+                loc = self.index.searchsorted(where, side="right")
+                if loc > 0:
+                    loc -= 1
+
+                values = self._values
+                while loc > 0 and isna(values[loc]):
+                    loc -= 1
+                return values[loc]
+
+        if not isinstance(where, Index):
+            where = Index(where) if is_list else Index([where])
+
+        nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
+        if nulls.all():
+            if is_series:
+                self = cast("Series", self)
+                return self._constructor(np.nan, index=where, name=self.name)
+            elif is_list:
+                self = cast("DataFrame", self)
+                return self._constructor(np.nan, index=where, columns=self.columns)
+            else:
+                self = cast("DataFrame", self)
+                return self._constructor_sliced(
+                    np.nan, index=self.columns, name=where[0]
+                )
+
+        # error: Unsupported operand type for
+        # ~ ("ExtensionArray | ndarray[Any, Any] | Any")
+        locs = self.index.asof_locs(where, ~nulls._values)  # type: ignore[operator]
+
+        # mask the missing
+        mask = locs == -1
+        data = self.take(locs)
+        data.index = where
+        if mask.any():
+            # GH#16063 only do this setting when necessary, otherwise
+            #  we'd cast e.g. bools to floats
+            data.loc[mask] = np.nan
+        return data if is_list else data.iloc[-1]
+
+    # ----------------------------------------------------------------------
+    # Action Methods
+
+    def isna(self) -> Self:
+        """
+        Detect missing values.
+
+        Return a boolean same-sized object indicating if the values are NA.
+        NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
+        values.
+        Everything else gets mapped to False values. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is an NA value.
+
+        See Also
+        --------
+        Series.isnull : Alias of isna.
+        DataFrame.isnull : Alias of isna.
+        Series.notna : Boolean inverse of isna.
+        DataFrame.notna : Boolean inverse of isna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        isna : Top-level isna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.isna()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        Show which entries in a Series are NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.isna()
+        0    False
+        1    False
+        2     True
+        dtype: bool
+        """
+        return isna(self).__finalize__(self, method="isna")
+
+    def isnull(self) -> Self:
+        """
+        Detect missing values.
+
+        Return a boolean same-sized object indicating if the values are NA.
+        NA values, such as None or :attr:`numpy.NaN`, gets mapped to True
+        values.
+        Everything else gets mapped to False values. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is an NA value.
+
+        See Also
+        --------
+        Series.isna : Alias of isnull.
+        DataFrame.isna : Alias of isnull.
+        Series.notna : Boolean inverse of isnull.
+        DataFrame.notna : Boolean inverse of isnull.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        isna : Top-level isna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.isna()
+             age   born   name    toy
+        0  False   True  False   True
+        1  False  False  False  False
+        2   True  False  False  False
+
+        Show which entries in a Series are NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.isna()
+        0    False
+        1    False
+        2     True
+        dtype: bool
+        """
+        return isna(self).__finalize__(self, method="isnull")
+
+    def notna(self) -> Self:
+        """
+        Detect existing (non-missing) values.
+
+        Return a boolean same-sized object indicating if the values are not NA.
+        Non-missing values get mapped to True. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to False
+        values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is not an NA value.
+
+        See Also
+        --------
+        Series.notnull : Alias of notna.
+        DataFrame.notnull : Alias of notna.
+        Series.isna : Boolean inverse of notna.
+        DataFrame.isna : Boolean inverse of notna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        notna : Top-level notna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are not NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.notna()
+             age   born  name    toy
+        0   True  False  True  False
+        1   True   True  True   True
+        2  False   True  True   True
+
+        Show which entries in a Series are not NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.notna()
+        0     True
+        1     True
+        2    False
+        dtype: bool
+        """
+        return notna(self).__finalize__(self, method="notna")
+
+    def notnull(self) -> Self:
+        """
+        Detect existing (non-missing) values.
+
+        Return a boolean same-sized object indicating if the values are not NA.
+        Non-missing values get mapped to True. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to False
+        values.
+
+        Returns
+        -------
+        Series/DataFrame
+            Mask of bool values for each element in Series/DataFrame
+            that indicates whether an element is not an NA value.
+
+        See Also
+        --------
+        Series.notnull : Alias of notna.
+        DataFrame.notnull : Alias of notna.
+        Series.isna : Boolean inverse of notna.
+        DataFrame.isna : Boolean inverse of notna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        notna : Top-level notna.
+
+        Examples
+        --------
+        Show which entries in a DataFrame are not NA.
+
+        >>> df = pd.DataFrame(
+        ...     dict(
+        ...         age=[5, 6, np.nan],
+        ...         born=[
+        ...             pd.NaT,
+        ...             pd.Timestamp("1939-05-27"),
+        ...             pd.Timestamp("1940-04-25"),
+        ...         ],
+        ...         name=["Alfred", "Batman", ""],
+        ...         toy=[None, "Batmobile", "Joker"],
+        ...     )
+        ... )
+        >>> df
+           age       born    name        toy
+        0  5.0        NaT  Alfred        NaN
+        1  6.0 1939-05-27  Batman  Batmobile
+        2  NaN 1940-04-25              Joker
+
+        >>> df.notna()
+             age   born  name    toy
+        0   True  False  True  False
+        1   True   True  True   True
+        2  False   True  True   True
+
+        Show which entries in a Series are not NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+
+        >>> ser.notna()
+        0     True
+        1     True
+        2    False
+        dtype: bool
+        """
+        return notna(self).__finalize__(self, method="notnull")
+
+    @final
+    def _clip_with_scalar(self, lower, upper, inplace: bool = False):
+        if (lower is not None and np.any(isna(lower))) or (
+            upper is not None and np.any(isna(upper))
+        ):
+            raise ValueError("Cannot use an NA value as a clip threshold")
+
+        result = self
+        mask = self.isna()
+
+        if lower is not None:
+            cond = mask | (self >= lower)
+            result = result.where(cond, lower, inplace=inplace)
+        if upper is not None:
+            cond = mask | (self <= upper)
+            result = result.where(cond, upper, inplace=inplace)
+
+        return result
+
+    @final
+    def _clip_with_one_bound(self, threshold, method, axis, inplace):
+        if axis is not None:
+            axis = self._get_axis_number(axis)
+
+        # method is self.le for upper bound and self.ge for lower bound
+        if is_scalar(threshold) and is_number(threshold):
+            if method.__name__ == "le":
+                return self._clip_with_scalar(None, threshold, inplace=inplace)
+            return self._clip_with_scalar(threshold, None, inplace=inplace)
+
+        # GH #15390
+        # In order for where method to work, the threshold must
+        # be transformed to NDFrame from other array like structure.
+        if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
+            if isinstance(self, ABCSeries):
+                threshold = self._constructor(threshold, index=self.index)
+            else:
+                threshold = self._align_for_op(threshold, axis, flex=None)[1]
+
+        # GH 40420
+        # Treat missing thresholds as no bounds, not clipping the values
+        if is_list_like(threshold):
+            fill_value = np.inf if method.__name__ == "le" else -np.inf
+            threshold_inf = threshold.fillna(fill_value)
+        else:
+            threshold_inf = threshold
+
+        subset = method(threshold_inf, axis=axis) | isna(self)
+
+        # GH 40420
+        return self.where(subset, threshold, axis=axis, inplace=inplace)
+
+    @final
+    def clip(
+        self,
+        lower=None,
+        upper=None,
+        *,
+        axis: Axis | None = None,
+        inplace: bool = False,
+        **kwargs,
+    ) -> Self:
+        """
+        Trim values at input threshold(s).
+
+        Assigns values outside boundary to boundary values. Thresholds
+        can be singular values or array like, and in the latter case
+        the clipping is performed element-wise in the specified axis.
+
+        Parameters
+        ----------
+        lower : float or array-like, default None
+            Minimum threshold value. All values below this
+            threshold will be set to it. A missing
+            threshold (e.g `NA`) will not clip the value.
+        upper : float or array-like, default None
+            Maximum threshold value. All values above this
+            threshold will be set to it. A missing
+            threshold (e.g `NA`) will not clip the value.
+        axis : {{0 or 'index', 1 or 'columns', None}}, default None
+            Align object with lower and upper along the given axis.
+            For `Series` this parameter is unused and defaults to `None`.
+        inplace : bool, default False
+            Whether to perform the operation in place on the data.
+        **kwargs
+            Additional keywords have no effect but might be accepted
+            for compatibility with numpy.
+
+        Returns
+        -------
+        Series or DataFrame
+            Same type as calling object with the values outside the
+            clip boundaries replaced.
+
+        See Also
+        --------
+        Series.clip : Trim values at input threshold in series.
+        DataFrame.clip : Trim values at input threshold in DataFrame.
+        numpy.clip : Clip (limit) the values in an array.
+
+        Examples
+        --------
+        >>> data = {"col_0": [9, -3, 0, -1, 5], "col_1": [-2, -7, 6, 8, -5]}
+        >>> df = pd.DataFrame(data)
+        >>> df
+           col_0  col_1
+        0      9     -2
+        1     -3     -7
+        2      0      6
+        3     -1      8
+        4      5     -5
+
+        Clips per column using lower and upper thresholds:
+
+        >>> df.clip(-4, 6)
+           col_0  col_1
+        0      6     -2
+        1     -3     -4
+        2      0      6
+        3     -1      6
+        4      5     -4
+
+        Clips using specific lower and upper thresholds per column:
+
+        >>> df.clip([-2, -1], [4, 5])
+            col_0  col_1
+        0      4     -1
+        1     -2     -1
+        2      0      5
+        3     -1      5
+        4      4     -1
+
+        Clips using specific lower and upper thresholds per column element:
+
+        >>> t = pd.Series([2, -4, -1, 6, 3])
+        >>> t
+        0    2
+        1   -4
+        2   -1
+        3    6
+        4    3
+        dtype: int64
+
+        >>> df.clip(t, t + 4, axis=0)
+           col_0  col_1
+        0      6      2
+        1     -3     -4
+        2      0      3
+        3      6      8
+        4      5      3
+
+        Clips using specific lower threshold per column element, with missing values:
+
+        >>> t = pd.Series([2, -4, np.nan, 6, 3])
+        >>> t
+        0    2.0
+        1   -4.0
+        2    NaN
+        3    6.0
+        4    3.0
+        dtype: float64
+
+        >>> df.clip(t, axis=0)
+        col_0  col_1
+        0      9.0    2.0
+        1     -3.0   -4.0
+        2      0.0    6.0
+        3      6.0    8.0
+        4      5.0    3.0
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        axis = nv.validate_clip_with_axis(axis, (), kwargs)
+        if axis is not None:
+            axis = self._get_axis_number(axis)
+
+        # GH 17276
+        # numpy doesn't like NaN as a clip value
+        # so ignore
+        # GH 19992
+        # numpy doesn't drop a list-like bound containing NaN
+        isna_lower = isna(lower)
+        if not is_list_like(lower):
+            if np.any(isna_lower):
+                lower = None
+        elif np.all(isna_lower):
+            lower = None
+        isna_upper = isna(upper)
+        if not is_list_like(upper):
+            if np.any(isna_upper):
+                upper = None
+        elif np.all(isna_upper):
+            upper = None
+
+        # GH 2747 (arguments were reversed)
+        if (
+            lower is not None
+            and upper is not None
+            and is_scalar(lower)
+            and is_scalar(upper)
+        ):
+            lower, upper = min(lower, upper), max(lower, upper)
+
+        # fast-path for scalars
+        if (lower is None or is_number(lower)) and (upper is None or is_number(upper)):
+            return self._clip_with_scalar(lower, upper, inplace=inplace)
+
+        result = self
+        if lower is not None:
+            result = result._clip_with_one_bound(
+                lower, method=self.ge, axis=axis, inplace=inplace
+            )
+        if upper is not None:
+            if inplace:
+                result = self
+            result = result._clip_with_one_bound(
+                upper, method=self.le, axis=axis, inplace=inplace
+            )
+
+        return result
+
+    @final
+    def asfreq(
+        self,
+        freq: Frequency,
+        method: FillnaOptions | None = None,
+        how: Literal["start", "end"] | None = None,
+        normalize: bool = False,
+        fill_value: Hashable | None = None,
+    ) -> Self:
+        """
+        Convert time series to specified frequency.
+
+        Returns the original data conformed to a new index with the specified
+        frequency.
+
+        If the index of this Series/DataFrame is a :class:`~pandas.PeriodIndex`, the
+        new index is the result of transforming the original index with
+        :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
+        will map one-to-one to the new index).
+
+        Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
+        freq=freq)`` where ``start`` and ``end`` are, respectively, the min and
+        max entries in the original index (see :func:`pandas.date_range`). The
+        values corresponding to any timesteps in the new index which were not present
+        in the original index will be null (``NaN``), unless a method for filling
+        such unknowns is provided (see the ``method`` parameter below).
+
+        The :meth:`resample` method is more appropriate if an operation on each group of
+        timesteps (such as an aggregate) is necessary to represent the data at the new
+        frequency.
+
+        Parameters
+        ----------
+        freq : DateOffset or str
+            Frequency DateOffset or string.
+        method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
+            Method to use for filling holes in reindexed Series (note this
+            does not fill NaNs that already were present):
+
+            * 'pad' / 'ffill': propagate last valid observation forward to next
+              valid based on the order of the index
+            * 'backfill' / 'bfill': use NEXT valid observation to fill.
+        how : {{'start', 'end'}}, default end
+            For PeriodIndex only (see PeriodIndex.asfreq).
+        normalize : bool, default False
+            Whether to reset output index to midnight.
+        fill_value : scalar, optional
+            Value to use for missing values, applied during upsampling (note
+            this does not fill NaNs that already were present).
+
+        Returns
+        -------
+        Series/DataFrame
+            Series/DataFrame object reindexed to the specified frequency.
+
+        See Also
+        --------
+        reindex : Conform DataFrame to new index with optional filling logic.
+
+        Notes
+        -----
+        To learn more about the frequency strings, please see
+        :ref:`this link<timeseries.offset_aliases>`.
+
+        Examples
+        --------
+        Start by creating a series with 4 one minute timestamps.
+
+        >>> index = pd.date_range("1/1/2000", periods=4, freq="min")
+        >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
+        >>> df = pd.DataFrame({"s": series})
+        >>> df
+                               s
+        2000-01-01 00:00:00    0.0
+        2000-01-01 00:01:00    NaN
+        2000-01-01 00:02:00    2.0
+        2000-01-01 00:03:00    3.0
+
+        Upsample the series into 30 second bins.
+
+        >>> df.asfreq(freq="30s")
+                               s
+        2000-01-01 00:00:00    0.0
+        2000-01-01 00:00:30    NaN
+        2000-01-01 00:01:00    NaN
+        2000-01-01 00:01:30    NaN
+        2000-01-01 00:02:00    2.0
+        2000-01-01 00:02:30    NaN
+        2000-01-01 00:03:00    3.0
+
+        Upsample again, providing a ``fill value``.
+
+        >>> df.asfreq(freq="30s", fill_value=9.0)
+                               s
+        2000-01-01 00:00:00    0.0
+        2000-01-01 00:00:30    9.0
+        2000-01-01 00:01:00    NaN
+        2000-01-01 00:01:30    9.0
+        2000-01-01 00:02:00    2.0
+        2000-01-01 00:02:30    9.0
+        2000-01-01 00:03:00    3.0
+
+        Upsample again, providing a ``method``.
+
+        >>> df.asfreq(freq="30s", method="bfill")
+                               s
+        2000-01-01 00:00:00    0.0
+        2000-01-01 00:00:30    NaN
+        2000-01-01 00:01:00    NaN
+        2000-01-01 00:01:30    2.0
+        2000-01-01 00:02:00    2.0
+        2000-01-01 00:02:30    3.0
+        2000-01-01 00:03:00    3.0
+        """
+        from pandas.core.resample import asfreq
+
+        return asfreq(
+            self,
+            freq,
+            method=method,
+            how=how,
+            normalize=normalize,
+            fill_value=fill_value,
+        )
+
+    @final
+    def at_time(self, time, asof: bool = False, axis: Axis | None = None) -> Self:
+        """
+        Select values at particular time of day (e.g., 9:30AM).
+
+        Parameters
+        ----------
+        time : datetime.time or str
+            The values to select.
+        asof : bool, default False
+            This parameter is currently not supported.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            For `Series` this parameter is unused and defaults to 0.
+
+        Returns
+        -------
+        Series or DataFrame
+            The values with the specified time.
+
+        Raises
+        ------
+        TypeError
+            If the index is not  a :class:`DatetimeIndex`
+
+        See Also
+        --------
+        between_time : Select values between particular times of the day.
+        first : Select initial periods of time series based on a date offset.
+        last : Select final periods of time series based on a date offset.
+        DatetimeIndex.indexer_at_time : Get just the index locations for
+            values at particular time of the day.
+
+        Examples
+        --------
+        >>> i = pd.date_range("2018-04-09", periods=4, freq="12h")
+        >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i)
+        >>> ts
+                             A
+        2018-04-09 00:00:00  1
+        2018-04-09 12:00:00  2
+        2018-04-10 00:00:00  3
+        2018-04-10 12:00:00  4
+
+        >>> ts.at_time("12:00")
+                             A
+        2018-04-09 12:00:00  2
+        2018-04-10 12:00:00  4
+        """
+        if axis is None:
+            axis = 0
+        axis = self._get_axis_number(axis)
+
+        index = self._get_axis(axis)
+
+        if not isinstance(index, DatetimeIndex):
+            raise TypeError("Index must be DatetimeIndex")
+
+        indexer = index.indexer_at_time(time, asof=asof)
+        return self.take(indexer, axis=axis)
+
+    @final
+    def between_time(
+        self,
+        start_time,
+        end_time,
+        inclusive: IntervalClosedType = "both",
+        axis: Axis | None = None,
+    ) -> Self:
+        """
+        Select values between particular times of the day (e.g., 9:00-9:30 AM).
+
+        By setting ``start_time`` to be later than ``end_time``,
+        you can get the times that are *not* between the two times.
+
+        Parameters
+        ----------
+        start_time : datetime.time or str
+            Initial time as a time filter limit.
+        end_time : datetime.time or str
+            End time as a time filter limit.
+        inclusive : {"both", "neither", "left", "right"}, default "both"
+            Include boundaries; whether to set each bound as closed or open.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Determine range time on index or columns value.
+            For `Series` this parameter is unused and defaults to 0.
+
+        Returns
+        -------
+        Series or DataFrame
+            Data from the original object filtered to the specified dates range.
+
+        Raises
+        ------
+        TypeError
+            If the index is not  a :class:`DatetimeIndex`
+
+        See Also
+        --------
+        at_time : Select values at a particular time of the day.
+        first : Select initial periods of time series based on a date offset.
+        last : Select final periods of time series based on a date offset.
+        DatetimeIndex.indexer_between_time : Get just the index locations for
+            values between particular times of the day.
+
+        Examples
+        --------
+        >>> i = pd.date_range("2018-04-09", periods=4, freq="1D20min")
+        >>> ts = pd.DataFrame({"A": [1, 2, 3, 4]}, index=i)
+        >>> ts
+                             A
+        2018-04-09 00:00:00  1
+        2018-04-10 00:20:00  2
+        2018-04-11 00:40:00  3
+        2018-04-12 01:00:00  4
+
+        >>> ts.between_time("0:15", "0:45")
+                             A
+        2018-04-10 00:20:00  2
+        2018-04-11 00:40:00  3
+
+        You get the times that are *not* between two times by setting
+        ``start_time`` later than ``end_time``:
+
+        >>> ts.between_time("0:45", "0:15")
+                             A
+        2018-04-09 00:00:00  1
+        2018-04-12 01:00:00  4
+        """
+        if axis is None:
+            axis = 0
+        axis = self._get_axis_number(axis)
+
+        index = self._get_axis(axis)
+        if not isinstance(index, DatetimeIndex):
+            raise TypeError("Index must be DatetimeIndex")
+
+        left_inclusive, right_inclusive = validate_inclusive(inclusive)
+        indexer = index.indexer_between_time(
+            start_time,
+            end_time,
+            include_start=left_inclusive,
+            include_end=right_inclusive,
+        )
+        return self.take(indexer, axis=axis)
+
+    @final
+    def resample(
+        self,
+        rule,
+        closed: Literal["right", "left"] | None = None,
+        label: Literal["right", "left"] | None = None,
+        convention: Literal["start", "end", "s", "e"] = "start",
+        on: Level | None = None,
+        level: Level | None = None,
+        origin: str | TimestampConvertibleTypes = "start_day",
+        offset: TimedeltaConvertibleTypes | None = None,
+        group_keys: bool = False,
+    ) -> Resampler:
+        """
+        Resample time-series data.
+
+        Convenience method for frequency conversion and resampling of time series.
+        The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
+        or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
+        series/index to the ``on``/``level`` keyword parameter.
+
+        Parameters
+        ----------
+        rule : DateOffset, Timedelta or str
+            The offset string or object representing target conversion.
+        closed : {{'right', 'left'}}, default None
+            Which side of bin interval is closed. The default is 'left'
+            for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
+            'BA', 'BQE', and 'W' which all have a default of 'right'.
+        label : {{'right', 'left'}}, default None
+            Which bin edge label to label bucket with. The default is 'left'
+            for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
+            'BA', 'BQE', and 'W' which all have a default of 'right'.
+        convention : {{'start', 'end', 's', 'e'}}, default 'start'
+            For `PeriodIndex` only, controls whether to use the start or
+            end of `rule`.
+        on : str, optional
+            For a DataFrame, column to use instead of index for resampling.
+            Column must be datetime-like.
+        level : str or int, optional
+            For a MultiIndex, level (name or number) to use for
+            resampling. `level` must be datetime-like.
+        origin : Timestamp or str, default 'start_day'
+            The timestamp on which to adjust the grouping. The timezone of origin
+            must match the timezone of the index.
+            If string, must be Timestamp convertible or one of the following:
+
+            - 'epoch': `origin` is 1970-01-01
+            - 'start': `origin` is the first value of the timeseries
+            - 'start_day': `origin` is the first day at midnight of the timeseries
+
+            - 'end': `origin` is the last value of the timeseries
+            - 'end_day': `origin` is the ceiling midnight of the last day
+
+            .. note::
+
+                Only takes effect for Tick-frequencies (i.e. fixed frequencies like
+                days, hours, and minutes, rather than months or quarters).
+        offset : Timedelta or str, default is None
+            An offset timedelta added to the origin.
+
+        group_keys : bool, default False
+            Whether to include the group keys in the result index when using
+            ``.apply()`` on the resampled object.
+
+            .. versionchanged:: 2.0.0
+
+                ``group_keys`` now defaults to ``False``.
+
+        Returns
+        -------
+        pandas.api.typing.Resampler
+            :class:`~pandas.core.Resampler` object.
+
+        See Also
+        --------
+        Series.resample : Resample a Series.
+        DataFrame.resample : Resample a DataFrame.
+        groupby : Group Series/DataFrame by mapping, function, label, or list of labels.
+        asfreq : Reindex a Series/DataFrame with the given frequency without grouping.
+
+        Notes
+        -----
+        See the `user guide
+        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
+        for more.
+
+        To learn more about the offset strings, please see `this link
+        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
+
+        Examples
+        --------
+        Start by creating a series with 9 one minute timestamps.
+
+        >>> index = pd.date_range("1/1/2000", periods=9, freq="min")
+        >>> series = pd.Series(range(9), index=index)
+        >>> series
+        2000-01-01 00:00:00    0
+        2000-01-01 00:01:00    1
+        2000-01-01 00:02:00    2
+        2000-01-01 00:03:00    3
+        2000-01-01 00:04:00    4
+        2000-01-01 00:05:00    5
+        2000-01-01 00:06:00    6
+        2000-01-01 00:07:00    7
+        2000-01-01 00:08:00    8
+        Freq: min, dtype: int64
+
+        Downsample the series into 3 minute bins and sum the values
+        of the timestamps falling into a bin.
+
+        >>> series.resample("3min").sum()
+        2000-01-01 00:00:00     3
+        2000-01-01 00:03:00    12
+        2000-01-01 00:06:00    21
+        Freq: 3min, dtype: int64
+
+        Downsample the series into 3 minute bins as above, but label each
+        bin using the right edge instead of the left. Please note that the
+        value in the bucket used as the label is not included in the bucket,
+        which it labels. For example, in the original series the
+        bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
+        value in the resampled bucket with the label ``2000-01-01 00:03:00``
+        does not include 3 (if it did, the summed value would be 6, not 3).
+
+        >>> series.resample("3min", label="right").sum()
+        2000-01-01 00:03:00     3
+        2000-01-01 00:06:00    12
+        2000-01-01 00:09:00    21
+        Freq: 3min, dtype: int64
+
+        To include this value close the right side of the bin interval,
+        as shown below.
+
+        >>> series.resample("3min", label="right", closed="right").sum()
+        2000-01-01 00:00:00     0
+        2000-01-01 00:03:00     6
+        2000-01-01 00:06:00    15
+        2000-01-01 00:09:00    15
+        Freq: 3min, dtype: int64
+
+        Upsample the series into 30 second bins.
+
+        >>> series.resample("30s").asfreq()[0:5]  # Select first 5 rows
+        2000-01-01 00:00:00   0.0
+        2000-01-01 00:00:30   NaN
+        2000-01-01 00:01:00   1.0
+        2000-01-01 00:01:30   NaN
+        2000-01-01 00:02:00   2.0
+        Freq: 30s, dtype: float64
+
+        Upsample the series into 30 second bins and fill the ``NaN``
+        values using the ``ffill`` method.
+
+        >>> series.resample("30s").ffill()[0:5]
+        2000-01-01 00:00:00    0
+        2000-01-01 00:00:30    0
+        2000-01-01 00:01:00    1
+        2000-01-01 00:01:30    1
+        2000-01-01 00:02:00    2
+        Freq: 30s, dtype: int64
+
+        Upsample the series into 30 second bins and fill the
+        ``NaN`` values using the ``bfill`` method.
+
+        >>> series.resample("30s").bfill()[0:5]
+        2000-01-01 00:00:00    0
+        2000-01-01 00:00:30    1
+        2000-01-01 00:01:00    1
+        2000-01-01 00:01:30    2
+        2000-01-01 00:02:00    2
+        Freq: 30s, dtype: int64
+
+        Pass a custom function via ``apply``
+
+        >>> def custom_resampler(arraylike):
+        ...     return np.sum(arraylike) + 5
+        >>> series.resample("3min").apply(custom_resampler)
+        2000-01-01 00:00:00     8
+        2000-01-01 00:03:00    17
+        2000-01-01 00:06:00    26
+        Freq: 3min, dtype: int64
+
+        For a Series with a PeriodIndex, the keyword `convention` can be
+        used to control whether to use the start or end of `rule`.
+
+        Resample a year by quarter using 'start' `convention`. Values are
+        assigned to the first quarter of the period.
+
+        >>> s = pd.Series(
+        ...     [1, 2], index=pd.period_range("2012-01-01", freq="Y", periods=2)
+        ... )
+        >>> s
+        2012    1
+        2013    2
+        Freq: Y-DEC, dtype: int64
+        >>> s.resample("Q", convention="start").asfreq()
+        2012Q1    1.0
+        2012Q2    NaN
+        2012Q3    NaN
+        2012Q4    NaN
+        2013Q1    2.0
+        2013Q2    NaN
+        2013Q3    NaN
+        2013Q4    NaN
+        Freq: Q-DEC, dtype: float64
+
+        Resample quarters by month using 'end' `convention`. Values are
+        assigned to the last month of the period.
+
+        >>> q = pd.Series(
+        ...     [1, 2, 3, 4], index=pd.period_range("2018-01-01", freq="Q", periods=4)
+        ... )
+        >>> q
+        2018Q1    1
+        2018Q2    2
+        2018Q3    3
+        2018Q4    4
+        Freq: Q-DEC, dtype: int64
+        >>> q.resample("M", convention="end").asfreq()
+        2018-03    1.0
+        2018-04    NaN
+        2018-05    NaN
+        2018-06    2.0
+        2018-07    NaN
+        2018-08    NaN
+        2018-09    3.0
+        2018-10    NaN
+        2018-11    NaN
+        2018-12    4.0
+        Freq: M, dtype: float64
+
+        For DataFrame objects, the keyword `on` can be used to specify the
+        column instead of the index for resampling.
+
+        >>> df = pd.DataFrame([10, 11, 9, 13, 14, 18, 17, 19], columns=["price"])
+        >>> df["volume"] = [50, 60, 40, 100, 50, 100, 40, 50]
+        >>> df["week_starting"] = pd.date_range("01/01/2018", periods=8, freq="W")
+        >>> df
+           price  volume week_starting
+        0     10      50    2018-01-07
+        1     11      60    2018-01-14
+        2      9      40    2018-01-21
+        3     13     100    2018-01-28
+        4     14      50    2018-02-04
+        5     18     100    2018-02-11
+        6     17      40    2018-02-18
+        7     19      50    2018-02-25
+        >>> df.resample("ME", on="week_starting").mean()
+                       price  volume
+        week_starting
+        2018-01-31     10.75    62.5
+        2018-02-28     17.00    60.0
+
+        For a DataFrame with MultiIndex, the keyword `level` can be used to
+        specify on which level the resampling needs to take place.
+
+        >>> days = pd.date_range("1/1/2000", periods=4, freq="D")
+        >>> df2 = pd.DataFrame(
+        ...     [
+        ...         [10, 50],
+        ...         [11, 60],
+        ...         [9, 40],
+        ...         [13, 100],
+        ...         [14, 50],
+        ...         [18, 100],
+        ...         [17, 40],
+        ...         [19, 50],
+        ...     ],
+        ...     columns=["price", "volume"],
+        ...     index=pd.MultiIndex.from_product([days, ["morning", "afternoon"]]),
+        ... )
+        >>> df2
+                              price  volume
+        2000-01-01 morning       10      50
+                   afternoon     11      60
+        2000-01-02 morning        9      40
+                   afternoon     13     100
+        2000-01-03 morning       14      50
+                   afternoon     18     100
+        2000-01-04 morning       17      40
+                   afternoon     19      50
+        >>> df2.resample("D", level=0).sum()
+                    price  volume
+        2000-01-01     21     110
+        2000-01-02     22     140
+        2000-01-03     32     150
+        2000-01-04     36      90
+
+        If you want to adjust the start of the bins based on a fixed timestamp:
+
+        >>> start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
+        >>> rng = pd.date_range(start, end, freq="7min")
+        >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
+        >>> ts
+        2000-10-01 23:30:00     0
+        2000-10-01 23:37:00     3
+        2000-10-01 23:44:00     6
+        2000-10-01 23:51:00     9
+        2000-10-01 23:58:00    12
+        2000-10-02 00:05:00    15
+        2000-10-02 00:12:00    18
+        2000-10-02 00:19:00    21
+        2000-10-02 00:26:00    24
+        Freq: 7min, dtype: int64
+
+        >>> ts.resample("17min").sum()
+        2000-10-01 23:14:00     0
+        2000-10-01 23:31:00     9
+        2000-10-01 23:48:00    21
+        2000-10-02 00:05:00    54
+        2000-10-02 00:22:00    24
+        Freq: 17min, dtype: int64
+
+        >>> ts.resample("17min", origin="epoch").sum()
+        2000-10-01 23:18:00     0
+        2000-10-01 23:35:00    18
+        2000-10-01 23:52:00    27
+        2000-10-02 00:09:00    39
+        2000-10-02 00:26:00    24
+        Freq: 17min, dtype: int64
+
+        >>> ts.resample("17min", origin="2000-01-01").sum()
+        2000-10-01 23:24:00     3
+        2000-10-01 23:41:00    15
+        2000-10-01 23:58:00    45
+        2000-10-02 00:15:00    45
+        Freq: 17min, dtype: int64
+
+        If you want to adjust the start of the bins with an `offset` Timedelta, the two
+        following lines are equivalent:
+
+        >>> ts.resample("17min", origin="start").sum()
+        2000-10-01 23:30:00     9
+        2000-10-01 23:47:00    21
+        2000-10-02 00:04:00    54
+        2000-10-02 00:21:00    24
+        Freq: 17min, dtype: int64
+
+        >>> ts.resample("17min", offset="23h30min").sum()
+        2000-10-01 23:30:00     9
+        2000-10-01 23:47:00    21
+        2000-10-02 00:04:00    54
+        2000-10-02 00:21:00    24
+        Freq: 17min, dtype: int64
+
+        If you want to take the largest Timestamp as the end of the bins:
+
+        >>> ts.resample("17min", origin="end").sum()
+        2000-10-01 23:35:00     0
+        2000-10-01 23:52:00    18
+        2000-10-02 00:09:00    27
+        2000-10-02 00:26:00    63
+        Freq: 17min, dtype: int64
+
+        In contrast with the `start_day`, you can use `end_day` to take the ceiling
+        midnight of the largest Timestamp as the end of the bins and drop the bins
+        not containing data:
+
+        >>> ts.resample("17min", origin="end_day").sum()
+        2000-10-01 23:38:00     3
+        2000-10-01 23:55:00    15
+        2000-10-02 00:12:00    45
+        2000-10-02 00:29:00    45
+        Freq: 17min, dtype: int64
+        """
+        from pandas.core.resample import get_resampler
+
+        return get_resampler(
+            cast("Series | DataFrame", self),
+            freq=rule,
+            label=label,
+            closed=closed,
+            convention=convention,
+            key=on,
+            level=level,
+            origin=origin,
+            offset=offset,
+            group_keys=group_keys,
+        )
+
+    @final
+    def rank(
+        self,
+        axis: Axis = 0,
+        method: Literal["average", "min", "max", "first", "dense"] = "average",
+        numeric_only: bool = False,
+        na_option: Literal["keep", "top", "bottom"] = "keep",
+        ascending: bool = True,
+        pct: bool = False,
+    ) -> Self:
+        """
+        Compute numerical data ranks (1 through n) along axis.
+
+        By default, equal values are assigned a rank that is the average of the
+        ranks of those values.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Index to direct ranking.
+            For `Series` this parameter is unused and defaults to 0.
+        method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
+            How to rank the group of records that have the same value (i.e. ties):
+
+            * average: average rank of the group
+            * min: lowest rank in the group
+            * max: highest rank in the group
+            * first: ranks assigned in order they appear in the array
+            * dense: like 'min', but rank always increases by 1 between groups.
+
+        numeric_only : bool, default False
+            For DataFrame objects, rank only numeric columns if set to True.
+
+            .. versionchanged:: 2.0.0
+                The default value of ``numeric_only`` is now ``False``.
+
+        na_option : {'keep', 'top', 'bottom'}, default 'keep'
+            How to rank NaN values:
+
+            * keep: assign NaN rank to NaN values
+            * top: assign lowest rank to NaN values
+            * bottom: assign highest rank to NaN values
+
+        ascending : bool, default True
+            Whether or not the elements should be ranked in ascending order.
+        pct : bool, default False
+            Whether or not to display the returned rankings in percentile
+            form.
+
+        Returns
+        -------
+        same type as caller
+            Return a Series or DataFrame with data ranks as values.
+
+        See Also
+        --------
+        core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
+        core.groupby.SeriesGroupBy.rank : Rank of values within each group.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     data={
+        ...         "Animal": ["cat", "penguin", "dog", "spider", "snake"],
+        ...         "Number_legs": [4, 2, 4, 8, np.nan],
+        ...     }
+        ... )
+        >>> df
+            Animal  Number_legs
+        0      cat          4.0
+        1  penguin          2.0
+        2      dog          4.0
+        3   spider          8.0
+        4    snake          NaN
+
+        Ties are assigned the mean of the ranks (by default) for the group.
+
+        >>> s = pd.Series(range(5), index=list("abcde"))
+        >>> s["d"] = s["b"]
+        >>> s.rank()
+        a    1.0
+        b    2.5
+        c    4.0
+        d    2.5
+        e    5.0
+        dtype: float64
+
+        The following example shows how the method behaves with the above
+        parameters:
+
+        * default_rank: this is the default behaviour obtained without using
+          any parameter.
+        * max_rank: setting ``method = 'max'`` the records that have the
+          same values are ranked using the highest rank (e.g.: since 'cat'
+          and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned.)
+        * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
+          with NaN values they are placed at the bottom of the ranking.
+        * pct_rank: when setting ``pct = True``, the ranking is expressed as
+          percentile rank.
+
+        >>> df["default_rank"] = df["Number_legs"].rank()
+        >>> df["max_rank"] = df["Number_legs"].rank(method="max")
+        >>> df["NA_bottom"] = df["Number_legs"].rank(na_option="bottom")
+        >>> df["pct_rank"] = df["Number_legs"].rank(pct=True)
+        >>> df
+            Animal  Number_legs  default_rank  max_rank  NA_bottom  pct_rank
+        0      cat          4.0           2.5       3.0        2.5     0.625
+        1  penguin          2.0           1.0       1.0        1.0     0.250
+        2      dog          4.0           2.5       3.0        2.5     0.625
+        3   spider          8.0           4.0       4.0        4.0     1.000
+        4    snake          NaN           NaN       NaN        5.0       NaN
+        """
+        axis_int = self._get_axis_number(axis)
+
+        if na_option not in {"keep", "top", "bottom"}:
+            msg = "na_option must be one of 'keep', 'top', or 'bottom'"
+            raise ValueError(msg)
+
+        def ranker(data):
+            if data.ndim == 2:
+                # i.e. DataFrame, we cast to ndarray
+                values = data.values
+            else:
+                # i.e. Series, can dispatch to EA
+                values = data._values
+
+            if isinstance(values, ExtensionArray):
+                ranks = values._rank(
+                    axis=axis_int,
+                    method=method,
+                    ascending=ascending,
+                    na_option=na_option,
+                    pct=pct,
+                )
+            else:
+                ranks = algos.rank(
+                    values,
+                    axis=axis_int,
+                    method=method,
+                    ascending=ascending,
+                    na_option=na_option,
+                    pct=pct,
+                )
+
+            ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
+            return ranks_obj.__finalize__(self, method="rank")
+
+        if numeric_only:
+            if self.ndim == 1 and not is_numeric_dtype(self.dtype):
+                # GH#47500
+                raise TypeError(
+                    "Series.rank does not allow numeric_only=True with "
+                    "non-numeric dtype."
+                )
+            data = self._get_numeric_data()
+        else:
+            data = self
+
+        return ranker(data)
+
+    def compare(
+        self,
+        other: Self,
+        align_axis: Axis = 1,
+        keep_shape: bool = False,
+        keep_equal: bool = False,
+        result_names: Suffixes = ("self", "other"),
+    ):
+        """
+        Compare to another Series/DataFrame and show the differences.
+
+        Parameters
+        ----------
+        other : Series/DataFrame
+            Object to compare with.
+
+        align_axis : {0 or 'index', 1 or 'columns'}, default 1
+            Determine which axis to align the comparison on.
+
+            * 0, or 'index' : Resulting differences are stacked vertically
+              with rows drawn alternately from self and other.
+            * 1, or 'columns' : Resulting differences are aligned horizontally
+              with columns drawn alternately from self and other.
+
+        keep_shape : bool, default False
+            If true, all rows and columns are kept.
+            Otherwise, only the ones with different values are kept.
+
+        keep_equal : bool, default False
+            If true, the result keeps values that are equal.
+            Otherwise, equal values are shown as NaNs.
+
+        result_names : tuple, default ('self', 'other')
+            Set the dataframes names in the comparison.
+        """
+        if type(self) is not type(other):
+            cls_self, cls_other = type(self).__name__, type(other).__name__
+            raise TypeError(
+                f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
+            )
+
+        # error: Unsupported left operand type for & ("Self")
+        mask = ~((self == other) | (self.isna() & other.isna()))  # type: ignore[operator]
+        mask.fillna(True, inplace=True)
+
+        if not keep_equal:
+            self = self.where(mask)
+            other = other.where(mask)
+
+        if not keep_shape:
+            if isinstance(self, ABCDataFrame):
+                cmask = mask.any()
+                rmask = mask.any(axis=1)
+                self = self.loc[rmask, cmask]
+                other = other.loc[rmask, cmask]
+            else:
+                self = self[mask]
+                other = other[mask]
+        if not isinstance(result_names, tuple):
+            raise TypeError(
+                f"Passing 'result_names' as a {type(result_names)} is not "
+                "supported. Provide 'result_names' as a tuple instead."
+            )
+
+        if align_axis in (1, "columns"):  # This is needed for Series
+            axis = 1
+        else:
+            axis = self._get_axis_number(align_axis)
+
+        # error: List item 0 has incompatible type "NDFrame"; expected
+        #  "Union[Series, DataFrame]"
+        diff = concat(
+            [self, other],  # type: ignore[list-item]
+            axis=axis,
+            keys=result_names,
+        )
+
+        if axis >= self.ndim:
+            # No need to reorganize data if stacking on new axis
+            # This currently applies for stacking two Series on columns
+            return diff
+
+        ax = diff._get_axis(axis)
+        ax_names = np.array(ax.names)
+
+        # set index names to positions to avoid confusion
+        ax.names = np.arange(len(ax_names))
+
+        # bring self-other to inner level
+        order = [*range(1, ax.nlevels), 0]
+        if isinstance(diff, ABCDataFrame):
+            diff = diff.reorder_levels(order, axis=axis)
+        else:
+            diff = diff.reorder_levels(order)
+
+        # restore the index names in order
+        diff._get_axis(axis=axis).names = ax_names[order]
+
+        # reorder axis to keep things organized
+        indices = (
+            np.arange(diff.shape[axis])
+            .reshape([2, diff.shape[axis] // 2])
+            .T.reshape(-1)
+        )
+        diff = diff.take(indices, axis=axis)
+
+        return diff
+
+    @final
+    def align(
+        self,
+        other: NDFrameT,
+        join: AlignJoin = "outer",
+        axis: Axis | None = None,
+        level: Level | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        fill_value: Hashable | None = None,
+    ) -> tuple[Self, NDFrameT]:
+        """
+        Align two objects on their axes with the specified join method.
+
+        Join method is specified for each axis Index.
+
+        Parameters
+        ----------
+        other : DataFrame or Series
+            The object to align with.
+        join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
+            Type of alignment to be performed.
+
+            * left: use only keys from left frame, preserve key order.
+            * right: use only keys from right frame, preserve key order.
+            * outer: use union of keys from both frames, sort keys lexicographically.
+            * inner: use intersection of keys from both frames,
+              preserve the order of the left keys.
+
+        axis : allowed axis of the other object, default None
+            Align on index (0), columns (1), or both (None).
+        level : int or level name, default None
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        fill_value : scalar, default np.nan
+            Value to use for missing values. Defaults to NaN, but can be any
+            "compatible" value.
+
+        Returns
+        -------
+        tuple of (Series/DataFrame, type of other)
+            Aligned objects.
+
+        See Also
+        --------
+        Series.align : Align two objects on their axes with specified join method.
+        DataFrame.align : Align two objects on their axes with specified join method.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
+        ... )
+        >>> other = pd.DataFrame(
+        ...     [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
+        ...     columns=["A", "B", "C", "D"],
+        ...     index=[2, 3, 4],
+        ... )
+        >>> df
+           D  B  E  A
+        1  1  2  3  4
+        2  6  7  8  9
+        >>> other
+            A    B    C    D
+        2   10   20   30   40
+        3   60   70   80   90
+        4  600  700  800  900
+
+        Align on columns:
+
+        >>> left, right = df.align(other, join="outer", axis=1)
+        >>> left
+           A  B   C  D  E
+        1  4  2 NaN  1  3
+        2  9  7 NaN  6  8
+        >>> right
+            A    B    C    D   E
+        2   10   20   30   40 NaN
+        3   60   70   80   90 NaN
+        4  600  700  800  900 NaN
+
+        We can also align on the index:
+
+        >>> left, right = df.align(other, join="outer", axis=0)
+        >>> left
+            D    B    E    A
+        1  1.0  2.0  3.0  4.0
+        2  6.0  7.0  8.0  9.0
+        3  NaN  NaN  NaN  NaN
+        4  NaN  NaN  NaN  NaN
+        >>> right
+            A      B      C      D
+        1    NaN    NaN    NaN    NaN
+        2   10.0   20.0   30.0   40.0
+        3   60.0   70.0   80.0   90.0
+        4  600.0  700.0  800.0  900.0
+
+        Finally, the default `axis=None` will align on both index and columns:
+
+        >>> left, right = df.align(other, join="outer", axis=None)
+        >>> left
+             A    B   C    D    E
+        1  4.0  2.0 NaN  1.0  3.0
+        2  9.0  7.0 NaN  6.0  8.0
+        3  NaN  NaN NaN  NaN  NaN
+        4  NaN  NaN NaN  NaN  NaN
+        >>> right
+               A      B      C      D   E
+        1    NaN    NaN    NaN    NaN NaN
+        2   10.0   20.0   30.0   40.0 NaN
+        3   60.0   70.0   80.0   90.0 NaN
+        4  600.0  700.0  800.0  900.0 NaN
+        """
+        self._check_copy_deprecation(copy)
+
+        _right: DataFrame | Series
+        if axis is not None:
+            axis = self._get_axis_number(axis)
+        if isinstance(other, ABCDataFrame):
+            left, _right, join_index = self._align_frame(
+                other,
+                join=join,
+                axis=axis,
+                level=level,
+                fill_value=fill_value,
+            )
+
+        elif isinstance(other, ABCSeries):
+            left, _right, join_index = self._align_series(
+                other,
+                join=join,
+                axis=axis,
+                level=level,
+                fill_value=fill_value,
+            )
+        else:  # pragma: no cover
+            raise TypeError(f"unsupported type: {type(other)}")
+
+        right = cast(NDFrameT, _right)
+        if self.ndim == 1 or axis == 0:
+            # If we are aligning timezone-aware DatetimeIndexes and the timezones
+            #  do not match, convert both to UTC.
+            if isinstance(left.index.dtype, DatetimeTZDtype):
+                if left.index.tz != right.index.tz:
+                    if join_index is not None:
+                        # GH#33671 copy to ensure we don't change the index on
+                        #  our original Series
+                        left = left.copy(deep=False)
+                        right = right.copy(deep=False)
+                        left.index = join_index
+                        right.index = join_index
+
+        left = left.__finalize__(self)
+        right = right.__finalize__(other)
+        return left, right
+
+    @final
+    def _align_frame(
+        self,
+        other: DataFrame,
+        join: AlignJoin = "outer",
+        axis: Axis | None = None,
+        level=None,
+        fill_value=None,
+    ) -> tuple[Self, DataFrame, Index | None]:
+        # defaults
+        join_index, join_columns = None, None
+        ilidx, iridx = None, None
+        clidx, cridx = None, None
+
+        is_series = isinstance(self, ABCSeries)
+
+        if (axis is None or axis == 0) and not self.index.equals(other.index):
+            join_index, ilidx, iridx = self.index.join(
+                other.index, how=join, level=level, return_indexers=True
+            )
+
+        if (
+            (axis is None or axis == 1)
+            and not is_series
+            and not self.columns.equals(other.columns)
+        ):
+            join_columns, clidx, cridx = self.columns.join(
+                other.columns, how=join, level=level, return_indexers=True
+            )
+
+        if is_series:
+            reindexers = {0: [join_index, ilidx]}
+        else:
+            reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
+
+        left = self._reindex_with_indexers(
+            reindexers, fill_value=fill_value, allow_dups=True
+        )
+        # other must be always DataFrame
+        right = other._reindex_with_indexers(
+            {0: [join_index, iridx], 1: [join_columns, cridx]},
+            fill_value=fill_value,
+            allow_dups=True,
+        )
+        return left, right, join_index
+
+    @final
+    def _align_series(
+        self,
+        other: Series,
+        join: AlignJoin = "outer",
+        axis: Axis | None = None,
+        level=None,
+        fill_value=None,
+    ) -> tuple[Self, Series, Index | None]:
+        is_series = isinstance(self, ABCSeries)
+
+        if (not is_series and axis is None) or axis not in [None, 0, 1]:
+            raise ValueError("Must specify axis=0 or 1")
+
+        if is_series and axis == 1:
+            raise ValueError("cannot align series to a series other than axis 0")
+
+        # series/series compat, other must always be a Series
+        if not axis:
+            # equal
+            if self.index.equals(other.index):
+                join_index, lidx, ridx = None, None, None
+            else:
+                join_index, lidx, ridx = self.index.join(
+                    other.index, how=join, level=level, return_indexers=True
+                )
+
+            if is_series:
+                left = self._reindex_indexer(join_index, lidx)
+            elif lidx is None or join_index is None:
+                left = self.copy(deep=False)
+            else:
+                new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1)
+                left = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+
+            right = other._reindex_indexer(join_index, ridx)
+
+        else:
+            # one has > 1 ndim
+            fdata = self._mgr
+            join_index = self.axes[1]
+            lidx, ridx = None, None
+            if not join_index.equals(other.index):
+                join_index, lidx, ridx = join_index.join(
+                    other.index, how=join, level=level, return_indexers=True
+                )
+
+            if lidx is not None:
+                bm_axis = self._get_block_manager_axis(1)
+                fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
+
+            left = self._constructor_from_mgr(fdata, axes=fdata.axes)
+
+            right = other._reindex_indexer(join_index, ridx)
+
+        # fill
+        fill_na = notna(fill_value)
+        if fill_na:
+            left = left.fillna(fill_value)
+            right = right.fillna(fill_value)
+
+        return left, right, join_index
+
+    @final
+    def _where(
+        self,
+        cond,
+        other=lib.no_default,
+        *,
+        inplace: bool = False,
+        axis: Axis | None = None,
+        level=None,
+    ) -> Self:
+        """
+        Equivalent to public method `where`, except that `other` is not
+        applied as a function even if callable. Used in __setitem__.
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+
+        if axis is not None:
+            axis = self._get_axis_number(axis)
+
+        # align the cond to same shape as myself
+        cond = common.apply_if_callable(cond, self)
+        if isinstance(cond, NDFrame):
+            # CoW: Make sure reference is not kept alive
+            if cond.ndim == 1 and self.ndim == 2:
+                cond = cond._constructor_expanddim(
+                    dict.fromkeys(range(len(self.columns)), cond),
+                    copy=False,
+                )
+                cond.columns = self.columns
+            cond = cond.align(self, join="right")[0]
+        else:
+            if not hasattr(cond, "shape"):
+                cond = np.asanyarray(cond)
+            if cond.shape != self.shape:
+                raise ValueError("Array conditional must be same shape as self")
+            cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)
+
+        # make sure we are boolean
+        fill_value = bool(inplace)
+        cond = cond.fillna(fill_value)
+        cond = cond.infer_objects()
+
+        msg = "Boolean array expected for the condition, not {dtype}"
+
+        if not cond.empty:
+            if not isinstance(cond, ABCDataFrame):
+                # This is a single-dimensional object.
+                if not is_bool_dtype(cond):
+                    raise TypeError(msg.format(dtype=cond.dtype))
+            else:
+                for block in cond._mgr.blocks:
+                    if not is_bool_dtype(block.dtype):
+                        raise TypeError(msg.format(dtype=block.dtype))
+                if cond._mgr.any_extension_types:
+                    # GH51574: avoid object ndarray conversion later on
+                    cond = cond._constructor(
+                        cond.to_numpy(dtype=bool, na_value=fill_value),
+                        **cond._construct_axes_dict(),
+                    )
+        else:
+            # GH#21947 we have an empty DataFrame/Series, could be object-dtype
+            cond = cond.astype(bool)
+
+        cond = -cond if inplace else cond
+        cond = cond.reindex(self._info_axis, axis=self._info_axis_number)
+
+        # try to align with other
+        if isinstance(other, NDFrame):
+            # align with me
+            if other.ndim <= self.ndim:
+                # CoW: Make sure reference is not kept alive
+                other = self.align(
+                    other,
+                    join="left",
+                    axis=axis,
+                    level=level,
+                    fill_value=None,
+                )[1]
+
+                # if we are NOT aligned, raise as we cannot where index
+                if axis is None and not other._indexed_same(self):
+                    raise InvalidIndexError
+
+                if other.ndim < self.ndim:
+                    other = other._values
+                    if isinstance(other, np.ndarray):
+                        # TODO(EA2D): could also do this for NDArrayBackedEA cases?
+                        if axis == 0:
+                            other = np.reshape(other, (-1, 1))
+                        elif axis == 1:
+                            other = np.reshape(other, (1, -1))
+
+                        other = np.broadcast_to(other, self.shape)
+                    else:
+                        # GH#38729, GH#62038 avoid lossy casting or object-casting
+                        if axis == 0:
+                            res_cols = [
+                                self.iloc[:, i]._where(
+                                    cond.iloc[:, i],
+                                    other,
+                                )
+                                for i in range(self.shape[1])
+                            ]
+                        elif axis == 1:
+                            # TODO: can we use a zero-copy alternative to "repeat"?
+                            res_cols = [
+                                self.iloc[:, i]._where(
+                                    cond.iloc[:, i],
+                                    other[i : i + 1].repeat(len(self)),
+                                )
+                                for i in range(self.shape[1])
+                            ]
+                        res = self._constructor(dict(enumerate(res_cols)))
+                        res.index = self.index
+                        res.columns = self.columns
+                        if inplace:
+                            self._update_inplace(res)
+                            return self
+                        return res.__finalize__(self)
+
+            # slice me out of the other
+            else:
+                raise NotImplementedError(
+                    "cannot align with a higher dimensional NDFrame"
+                )
+
+        elif not isinstance(other, (MultiIndex, NDFrame)):
+            # mainly just catching Index here
+            other = extract_array(other, extract_numpy=True)
+
+        if isinstance(other, (np.ndarray, ExtensionArray)):
+            if other.shape != self.shape:
+                if self.ndim != 1:
+                    # In the ndim == 1 case we may have
+                    #  other length 1, which we treat as scalar (GH#2745, GH#4192)
+                    #  or len(other) == icond.sum(), which we treat like
+                    #  __setitem__ (GH#3235)
+                    raise ValueError(
+                        "other must be the same shape as self when an ndarray"
+                    )
+
+            # we are the same shape, so create an actual object for alignment
+            else:
+                other = self._constructor(
+                    other, **self._construct_axes_dict(), copy=False
+                )
+
+        if axis is None:
+            axis = 0
+
+        if self.ndim == getattr(other, "ndim", 0):
+            align = True
+        else:
+            align = self._get_axis_number(axis) == 1
+
+        if inplace:
+            # we may have different type blocks come out of putmask, so
+            # reconstruct the block manager
+
+            new_data = self._mgr.putmask(mask=cond, new=other, align=align)
+            result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+            self._update_inplace(result)
+            return self
+
+        else:
+            new_data = self._mgr.where(
+                other=other,
+                cond=cond,
+                align=align,
+            )
+            result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+            return result.__finalize__(self)
+
+    @final
+    def where(
+        self,
+        cond,
+        other=lib.no_default,
+        *,
+        inplace: bool = False,
+        axis: Axis | None = None,
+        level: Level | None = None,
+    ) -> Self:
+        """
+        Replace values where the condition is False.
+
+        This method allows conditional replacement of values. Where the
+        condition evaluates to True, the original values are retained; where
+        it evaluates to False, values are replaced with corresponding entries
+        from ``other``.
+
+        Parameters
+        ----------
+        cond : bool Series/DataFrame, array-like, or callable
+            Where `cond` is True, keep the original value. Where
+            False, replace with corresponding value from `other`.
+            If `cond` is callable, it is computed on the Series/DataFrame and
+            should return boolean Series/DataFrame or array. The callable must
+            not change input Series/DataFrame (though pandas doesn't check it).
+        other : scalar, Series/DataFrame, or callable
+            Entries where `cond` is False are replaced with
+            corresponding value from `other`.
+            If other is callable, it is computed on the Series/DataFrame and
+            should return scalar or Series/DataFrame. The callable must not
+            change input Series/DataFrame (though pandas doesn't check it).
+            If not specified, entries will be filled with the corresponding
+            NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
+            dtypes).
+        inplace : bool, default False
+            Whether to perform the operation in place on the data.
+        axis : int, default None
+            Alignment axis if needed. For `Series` this parameter is
+            unused and defaults to 0.
+        level : int, default None
+            Alignment level if needed.
+
+        Returns
+        -------
+        Series or DataFrame
+            When applied to a Series, the function will return a Series,
+            and when applied to a DataFrame, it will return a DataFrame.
+
+        See Also
+        --------
+        :func:`DataFrame.mask` : Return an object of same shape as caller.
+        :func:`Series.mask` : Return an object of same shape as caller.
+
+        Notes
+        -----
+        The where method is an application of the if-then idiom. For each
+        element in the caller, if ``cond`` is ``True`` the
+        element is used; otherwise the corresponding element from
+        ``other`` is used. If the axis of ``other`` does not align with axis of
+        ``cond`` Series/DataFrame, the values of ``cond`` on misaligned index positions
+        will be filled with False.
+
+        The signature for :func:`Series.where` or
+        :func:`DataFrame.where` differs from :func:`numpy.where`.
+        Roughly ``df1.where(m, df2)`` is equivalent to ``np.where(m, df1, df2)``.
+
+        For further details and examples see the ``where`` documentation in
+        :ref:`indexing <indexing.where_mask>`.
+
+        The dtype of the object takes precedence. The fill value is casted to
+        the object's dtype, if this can be done losslessly.
+
+        Examples
+        --------
+        >>> s = pd.Series(range(5))
+        >>> s.where(s > 0)
+        0    NaN
+        1    1.0
+        2    2.0
+        3    3.0
+        4    4.0
+        dtype: float64
+        >>> s.mask(s > 0)
+        0    0.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+
+        >>> s = pd.Series(range(5))
+        >>> t = pd.Series([True, False])
+        >>> s.where(t, 99)
+        0     0
+        1    99
+        2    99
+        3    99
+        4    99
+        dtype: int64
+        >>> s.mask(t, 99)
+        0    99
+        1     1
+        2    99
+        3    99
+        4    99
+        dtype: int64
+
+        >>> s.where(s > 1, 10)
+        0    10
+        1    10
+        2    2
+        3    3
+        4    4
+        dtype: int64
+        >>> s.mask(s > 1, 10)
+        0     0
+        1     1
+        2    10
+        3    10
+        4    10
+        dtype: int64
+
+        >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"])
+        >>> df
+           A  B
+        0  0  1
+        1  2  3
+        2  4  5
+        3  6  7
+        4  8  9
+        >>> m = df % 3 == 0
+        >>> df.where(m, -df)
+           A  B
+        0  0 -1
+        1 -2  3
+        2 -4 -5
+        3  6 -7
+        4 -8  9
+        >>> df.where(m, -df) == np.where(m, df, -df)
+              A     B
+        0  True  True
+        1  True  True
+        2  True  True
+        3  True  True
+        4  True  True
+        >>> df.where(m, -df) == df.mask(~m, -df)
+              A     B
+        0  True  True
+        1  True  True
+        2  True  True
+        3  True  True
+        4  True  True
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        other = common.apply_if_callable(other, self)
+        return self._where(cond, other, inplace=inplace, axis=axis, level=level)
+
+    @final
+    def mask(
+        self,
+        cond,
+        other=lib.no_default,
+        *,
+        inplace: bool = False,
+        axis: Axis | None = None,
+        level: Level | None = None,
+    ) -> Self:
+        """
+        Replace values where the condition is True.
+
+        Parameters
+        ----------
+        cond : bool Series/DataFrame, array-like, or callable
+            Where `cond` is False, keep the original value. Where
+            True, replace with corresponding value from `other`.
+            If `cond` is callable, it is computed on the Series/DataFrame and
+            should return boolean Series/DataFrame or array. The callable must
+            not change input Series/DataFrame (though pandas doesn't check it).
+        other : scalar, Series/DataFrame, or callable
+            Entries where `cond` is True are replaced with
+            corresponding value from `other`.
+            If other is callable, it is computed on the Series/DataFrame and
+            should return scalar or Series/DataFrame. The callable must not
+            change input Series/DataFrame (though pandas doesn't check it).
+            If not specified, entries will be filled with the corresponding
+            NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
+            dtypes).
+        inplace : bool, default False
+            Whether to perform the operation in place on the data.
+        axis : int, default None
+            Alignment axis if needed. For `Series` this parameter is
+            unused and defaults to 0.
+        level : int, default None
+            Alignment level if needed.
+
+        Returns
+        -------
+        Series or DataFrame
+            When applied to a Series, the function will return a Series,
+            and when applied to a DataFrame, it will return a DataFrame.
+
+        See Also
+        --------
+        :func:`DataFrame.where` : Return an object of same shape as caller.
+        :func:`Series.where` : Return an object of same shape as caller.
+
+        Notes
+        -----
+        The mask method is an application of the if-then idiom. For each
+        element in the caller, if ``cond`` is ``False`` the
+        element is used; otherwise the corresponding element from
+        ``other`` is used. If the axis of ``other`` does not align with axis of
+        ``cond`` Series/DataFrame, the values of ``cond`` on misaligned index positions
+        will be filled with True.
+
+        The signature for :func:`Series.where` or
+        :func:`DataFrame.where` differs from :func:`numpy.where`.
+        Roughly ``df1.where(m, df2)`` is equivalent to ``np.where(m, df1, df2)``.
+
+        For further details and examples see the ``mask`` documentation in
+        :ref:`indexing <indexing.where_mask>`.
+
+        The dtype of the object takes precedence. The fill value is casted to
+        the object's dtype, if this can be done losslessly.
+
+        Examples
+        --------
+        >>> s = pd.Series(range(5))
+        >>> s.where(s > 0)
+        0    NaN
+        1    1.0
+        2    2.0
+        3    3.0
+        4    4.0
+        dtype: float64
+        >>> s.mask(s > 0)
+        0    0.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+
+        >>> s = pd.Series(range(5))
+        >>> t = pd.Series([True, False])
+        >>> s.where(t, 99)
+        0     0
+        1    99
+        2    99
+        3    99
+        4    99
+        dtype: int64
+        >>> s.mask(t, 99)
+        0    99
+        1     1
+        2    99
+        3    99
+        4    99
+        dtype: int64
+
+        >>> s.where(s > 1, 10)
+        0    10
+        1    10
+        2    2
+        3    3
+        4    4
+        dtype: int64
+        >>> s.mask(s > 1, 10)
+        0     0
+        1     1
+        2    10
+        3    10
+        4    10
+        dtype: int64
+
+        >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=["A", "B"])
+        >>> df
+           A  B
+        0  0  1
+        1  2  3
+        2  4  5
+        3  6  7
+        4  8  9
+        >>> m = df % 3 == 0
+        >>> df.where(m, -df)
+           A  B
+        0  0 -1
+        1 -2  3
+        2 -4 -5
+        3  6 -7
+        4 -8  9
+        >>> df.where(m, -df) == np.where(m, df, -df)
+              A     B
+        0  True  True
+        1  True  True
+        2  True  True
+        3  True  True
+        4  True  True
+        >>> df.where(m, -df) == df.mask(~m, -df)
+              A     B
+        0  True  True
+        1  True  True
+        2  True  True
+        3  True  True
+        4  True  True
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if inplace:
+            if not CHAINED_WARNING_DISABLED:
+                if sys.getrefcount(
+                    self
+                ) <= REF_COUNT_METHOD and not common.is_local_in_caller_frame(self):
+                    warnings.warn(
+                        _chained_assignment_method_msg,
+                        ChainedAssignmentError,
+                        stacklevel=2,
+                    )
+
+        cond = common.apply_if_callable(cond, self)
+        other = common.apply_if_callable(other, self)
+
+        # see gh-21891
+        if not hasattr(cond, "__invert__"):
+            cond = np.array(cond)
+
+        return self._where(
+            ~cond,
+            other=other,
+            inplace=inplace,
+            axis=axis,
+            level=level,
+        )
+
+    def shift(
+        self,
+        periods: int | Sequence[int] = 1,
+        freq=None,
+        axis: Axis = 0,
+        fill_value: Hashable = lib.no_default,
+        suffix: str | None = None,
+    ) -> Self | DataFrame:
+        """
+        Shift index by desired number of periods with an optional time `freq`.
+
+        When `freq` is not passed, shift the index without realigning the data.
+        If `freq` is passed (in this case, the index must be date or datetime,
+        or it will raise a `NotImplementedError`), the index will be
+        increased using the periods and the `freq`. `freq` can be inferred
+        when specified as "infer" as long as either freq or inferred_freq
+        attribute is set in the index.
+
+        Parameters
+        ----------
+        periods : int or Sequence
+            Number of periods to shift. Can be positive or negative.
+            If an iterable of ints, the data will be shifted once by each int.
+            This is equivalent to shifting by one value at a time and
+            concatenating all resulting frames. The resulting columns will have
+            the shift suffixed to their column names. For multiple periods,
+            axis must not be 1.
+        freq : DateOffset, tseries.offsets, timedelta, or str, optional
+            Offset to use from the tseries module or time rule (e.g. 'EOM').
+            If `freq` is specified then the index values are shifted but the
+            data is not realigned. That is, use `freq` if you would like to
+            extend the index when shifting and preserve the original data.
+            If `freq` is specified as "infer" then it will be inferred from
+            the freq or inferred_freq attributes of the index. If neither of
+            those attributes exist, a ValueError is thrown.
+        axis : {{0 or 'index', 1 or 'columns', None}}, default None
+            Shift direction. For `Series` this parameter is unused and defaults to 0.
+        fill_value : object, optional
+            The scalar value to use for newly introduced missing values.
+            the default depends on the dtype of `self`.
+            For Boolean and numeric NumPy data types, ``np.nan`` is used.
+            For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
+            For extension dtypes, ``self.dtype.na_value`` is used.
+        suffix : str, optional
+            If str and periods is an iterable, this is added after the column
+            name and before the shift value for each shifted column name.
+            For `Series` this parameter is unused and defaults to `None`.
+
+        Returns
+        -------
+        Series/DataFrame
+            Copy of input object, shifted.
+
+        See Also
+        --------
+        Index.shift : Shift values of Index.
+        DatetimeIndex.shift : Shift values of DatetimeIndex.
+        PeriodIndex.shift : Shift values of PeriodIndex.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [[10, 13, 17], [20, 23, 27], [15, 18, 22], [30, 33, 37], [45, 48, 52]],
+        ...     columns=["Col1", "Col2", "Col3"],
+        ...     index=pd.date_range("2020-01-01", "2020-01-05"),
+        ... )
+        >>> df
+                    Col1  Col2  Col3
+        2020-01-01    10    13    17
+        2020-01-02    20    23    27
+        2020-01-03    15    18    22
+        2020-01-04    30    33    37
+        2020-01-05    45    48    52
+
+        >>> df.shift(periods=3)
+                    Col1  Col2  Col3
+        2020-01-01   NaN   NaN   NaN
+        2020-01-02   NaN   NaN   NaN
+        2020-01-03   NaN   NaN   NaN
+        2020-01-04  10.0  13.0  17.0
+        2020-01-05  20.0  23.0  27.0
+
+        >>> df.shift(periods=1, axis="columns")
+                    Col1  Col2  Col3
+        2020-01-01   NaN    10    13
+        2020-01-02   NaN    20    23
+        2020-01-03   NaN    15    18
+        2020-01-04   NaN    30    33
+        2020-01-05   NaN    45    48
+
+        >>> df.shift(periods=3, fill_value=0)
+                    Col1  Col2  Col3
+        2020-01-01     0     0     0
+        2020-01-02     0     0     0
+        2020-01-03     0     0     0
+        2020-01-04    10    13    17
+        2020-01-05    20    23    27
+
+        >>> df.shift(periods=3, freq="D")
+                    Col1  Col2  Col3
+        2020-01-04    10    13    17
+        2020-01-05    20    23    27
+        2020-01-06    15    18    22
+        2020-01-07    30    33    37
+        2020-01-08    45    48    52
+
+        >>> df.shift(periods=3, freq="infer")
+                    Col1  Col2  Col3
+        2020-01-04    10    13    17
+        2020-01-05    20    23    27
+        2020-01-06    15    18    22
+        2020-01-07    30    33    37
+        2020-01-08    45    48    52
+
+        >>> df["Col1"].shift(periods=[0, 1, 2])
+                    Col1_0  Col1_1  Col1_2
+        2020-01-01      10     NaN     NaN
+        2020-01-02      20    10.0     NaN
+        2020-01-03      15    20.0    10.0
+        2020-01-04      30    15.0    20.0
+        2020-01-05      45    30.0    15.0
+        """
+        axis = self._get_axis_number(axis)
+
+        if freq is not None and fill_value is not lib.no_default:
+            # GH#53832
+            raise ValueError(
+                "Passing a 'freq' together with a 'fill_value' is not allowed."
+            )
+
+        if periods == 0:
+            return self.copy(deep=False)
+
+        if is_list_like(periods) and isinstance(self, ABCSeries):
+            return self.to_frame().shift(
+                periods=periods, freq=freq, axis=axis, fill_value=fill_value
+            )
+        periods = cast(int, periods)
+
+        if freq is None:
+            # when freq is None, data is shifted, index is not
+            axis = self._get_axis_number(axis)
+            assert axis == 0  # axis == 1 cases handled in DataFrame.shift
+            new_data = self._mgr.shift(periods=periods, fill_value=fill_value)
+            return self._constructor_from_mgr(
+                new_data, axes=new_data.axes
+            ).__finalize__(self, method="shift")
+
+        return self._shift_with_freq(periods, axis, freq)
+
+    @final
+    def _shift_with_freq(self, periods: int, axis: int, freq) -> Self:
+        # see shift.__doc__
+        # when freq is given, index is shifted, data is not
+        index = self._get_axis(axis)
+
+        if freq == "infer":
+            freq = getattr(index, "freq", None)
+
+            if freq is None:
+                freq = getattr(index, "inferred_freq", None)
+
+            if freq is None:
+                msg = "Freq was not set in the index hence cannot be inferred"
+                raise ValueError(msg)
+
+        elif isinstance(freq, str):
+            is_period = isinstance(index, PeriodIndex)
+            freq = to_offset(freq, is_period=is_period)
+
+        if isinstance(index, PeriodIndex):
+            orig_freq = to_offset(index.freq)
+            if freq != orig_freq:
+                assert orig_freq is not None  # for mypy
+                raise ValueError(
+                    f"Given freq {PeriodDtype(freq)._freqstr} "
+                    f"does not match PeriodIndex freq "
+                    f"{PeriodDtype(orig_freq)._freqstr}"
+                )
+            new_ax: Index = index.shift(periods)
+        else:
+            new_ax = index.shift(periods, freq)
+
+        result = self.set_axis(new_ax, axis=axis)
+        return result.__finalize__(self, method="shift")
+
+    @final
+    def truncate(
+        self,
+        before=None,
+        after=None,
+        axis: Axis | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> Self:
+        """
+        Truncate a Series or DataFrame before and after some index value.
+
+        This is a useful shorthand for boolean indexing based on index
+        values above or below certain thresholds.
+
+        Parameters
+        ----------
+        before : date, str, int
+            Truncate all rows before this index value.
+        after : date, str, int
+            Truncate all rows after this index value.
+        axis : {0 or 'index', 1 or 'columns'}, optional
+            Axis to truncate. Truncates the index (rows) by default.
+            For `Series` this parameter is unused and defaults to 0.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        type of caller
+            The truncated Series or DataFrame.
+
+        See Also
+        --------
+        DataFrame.loc : Select a subset of a DataFrame by label.
+        DataFrame.iloc : Select a subset of a DataFrame by position.
+
+        Notes
+        -----
+        If the index being truncated contains only datetime values,
+        `before` and `after` may be specified as strings instead of
+        Timestamps.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": ["a", "b", "c", "d", "e"],
+        ...         "B": ["f", "g", "h", "i", "j"],
+        ...         "C": ["k", "l", "m", "n", "o"],
+        ...     },
+        ...     index=[1, 2, 3, 4, 5],
+        ... )
+        >>> df
+           A  B  C
+        1  a  f  k
+        2  b  g  l
+        3  c  h  m
+        4  d  i  n
+        5  e  j  o
+
+        >>> df.truncate(before=2, after=4)
+           A  B  C
+        2  b  g  l
+        3  c  h  m
+        4  d  i  n
+
+        The columns of a DataFrame can be truncated.
+
+        >>> df.truncate(before="A", after="B", axis="columns")
+           A  B
+        1  a  f
+        2  b  g
+        3  c  h
+        4  d  i
+        5  e  j
+
+        For Series, only rows can be truncated.
+
+        >>> df["A"].truncate(before=2, after=4)
+        2    b
+        3    c
+        4    d
+        Name: A, dtype: str
+
+        The index values in ``truncate`` can be datetimes or string
+        dates.
+
+        >>> dates = pd.date_range("2016-01-01", "2016-02-01", freq="s")
+        >>> df = pd.DataFrame(index=dates, data={"A": 1})
+        >>> df.tail()
+                             A
+        2016-01-31 23:59:56  1
+        2016-01-31 23:59:57  1
+        2016-01-31 23:59:58  1
+        2016-01-31 23:59:59  1
+        2016-02-01 00:00:00  1
+
+        >>> df.truncate(
+        ...     before=pd.Timestamp("2016-01-05"), after=pd.Timestamp("2016-01-10")
+        ... ).tail()
+                             A
+        2016-01-09 23:59:56  1
+        2016-01-09 23:59:57  1
+        2016-01-09 23:59:58  1
+        2016-01-09 23:59:59  1
+        2016-01-10 00:00:00  1
+
+        Because the index is a DatetimeIndex containing only dates, we can
+        specify `before` and `after` as strings. They will be coerced to
+        Timestamps before truncation.
+
+        >>> df.truncate("2016-01-05", "2016-01-10").tail()
+                             A
+        2016-01-09 23:59:56  1
+        2016-01-09 23:59:57  1
+        2016-01-09 23:59:58  1
+        2016-01-09 23:59:59  1
+        2016-01-10 00:00:00  1
+
+        Note that ``truncate`` assumes a 0 value for any unspecified time
+        component (midnight). This differs from partial string slicing, which
+        returns any partially matching dates.
+
+        >>> df.loc["2016-01-05":"2016-01-10", :].tail()
+                             A
+        2016-01-10 23:59:55  1
+        2016-01-10 23:59:56  1
+        2016-01-10 23:59:57  1
+        2016-01-10 23:59:58  1
+        2016-01-10 23:59:59  1
+        """
+        self._check_copy_deprecation(copy)
+
+        if axis is None:
+            axis = 0
+        axis = self._get_axis_number(axis)
+        ax = self._get_axis(axis)
+
+        # GH 17935
+        # Check that index is sorted
+        if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
+            raise ValueError("truncate requires a sorted index")
+
+        # if we have a date index, convert to dates, otherwise
+        # treat like a slice
+        if ax._is_all_dates:
+            from pandas.core.tools.datetimes import to_datetime
+
+            if before is not None:
+                # Avoid converting to NaT
+                before = to_datetime(before)
+            if after is not None:
+                # Avoid converting to NaT
+                after = to_datetime(after)
+
+        if before is not None and after is not None and before > after:
+            raise ValueError(f"Truncate: {after} must be after {before}")
+
+        if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
+            before, after = after, before
+
+        slicer = [slice(None, None)] * self._AXIS_LEN
+        slicer[axis] = slice(before, after)
+        result = self.loc[tuple(slicer)]
+
+        if isinstance(ax, MultiIndex):
+            setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
+
+        result = result.copy(deep=False)
+
+        return result
+
+    @final
+    def tz_convert(
+        self,
+        tz,
+        axis: Axis = 0,
+        level=None,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> Self:
+        """
+        Convert tz-aware axis to target time zone.
+
+        Parameters
+        ----------
+        tz : str or tzinfo object or None
+            Target time zone. Passing ``None`` will convert to
+            UTC and remove the timezone information.
+        axis : {{0 or 'index', 1 or 'columns'}}, default 0
+            The axis to convert
+        level : int, str, default None
+            If axis is a MultiIndex, convert a specific level. Otherwise
+            must be None.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        Series/DataFrame
+            Object with time zone converted axis.
+
+        Raises
+        ------
+        TypeError
+            If the axis is tz-naive.
+
+        See Also
+        --------
+        DataFrame.tz_localize: Localize tz-naive index of DataFrame to target time zone.
+        Series.tz_localize: Localize tz-naive index of Series to target time zone.
+
+        Examples
+        --------
+        Change to another time zone:
+
+        >>> s = pd.Series(
+        ...     [1],
+        ...     index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]),
+        ... )
+        >>> s.tz_convert("Asia/Shanghai")
+        2018-09-15 07:30:00+08:00    1
+        dtype: int64
+
+        Pass None to convert to UTC and get a tz-naive index:
+
+        >>> s = pd.Series([1], index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]))
+        >>> s.tz_convert(None)
+        2018-09-14 23:30:00    1
+        dtype: int64
+        """
+        self._check_copy_deprecation(copy)
+        axis = self._get_axis_number(axis)
+        ax = self._get_axis(axis)
+
+        def _tz_convert(ax, tz):
+            if not hasattr(ax, "tz_convert"):
+                if len(ax) > 0:
+                    ax_name = self._get_axis_name(axis)
+                    raise TypeError(
+                        f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
+                    )
+                ax = DatetimeIndex([], tz=tz)
+            else:
+                ax = ax.tz_convert(tz)
+            return ax
+
+        # if a level is given it must be a MultiIndex level or
+        # equivalent to the axis name
+        if isinstance(ax, MultiIndex):
+            level = ax._get_level_number(level)
+            new_level = _tz_convert(ax.levels[level], tz)
+            ax = ax.set_levels(new_level, level=level)
+        else:
+            if level not in (None, 0, ax.name):
+                raise ValueError(f"The level {level} is not valid")
+            ax = _tz_convert(ax, tz)
+
+        result = self.copy(deep=False)
+        result = result.set_axis(ax, axis=axis)
+        return result.__finalize__(self, method="tz_convert")
+
+    @final
+    def tz_localize(
+        self,
+        tz,
+        axis: Axis = 0,
+        level=None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        ambiguous: TimeAmbiguous = "raise",
+        nonexistent: TimeNonexistent = "raise",
+    ) -> Self:
+        """
+        Localize time zone naive index of a Series or DataFrame to target time zone.
+
+        This operation localizes the Index. To localize the values in a
+        time zone naive Series, use :meth:`Series.dt.tz_localize`.
+
+        Parameters
+        ----------
+        tz : str or tzinfo or None
+            Time zone to localize. Passing ``None`` will remove the
+            time zone information and preserve local time.
+        axis : {{0 or 'index', 1 or 'columns'}}, default 0
+            The axis to localize
+        level : int, str, default None
+            If axis ia a MultiIndex, localize a specific level. Otherwise
+            must be None.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        ambiguous : 'infer', bool, bool-ndarray, 'NaT', default 'raise'
+            When clocks moved backward due to DST, ambiguous times may arise.
+            For example in Central European Time (UTC+01), when going from
+            03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
+            00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
+            `ambiguous` parameter dictates how ambiguous times should be
+            handled.
+
+            - 'infer' will attempt to infer fall dst-transition hours based on
+              order
+            - bool (or bool-ndarray) where True signifies a DST time, False designates
+              a non-DST time (note that this flag is only applicable for
+              ambiguous times)
+            - 'NaT' will return NaT where there are ambiguous times
+            - 'raise' will raise a ValueError if there are ambiguous
+              times.
+        nonexistent : str, default 'raise'
+            A nonexistent time does not exist in a particular timezone
+            where clocks moved forward due to DST. Valid values are:
+
+            - 'shift_forward' will shift the nonexistent time forward to the
+              closest existing time
+            - 'shift_backward' will shift the nonexistent time backward to the
+              closest existing time
+            - 'NaT' will return NaT where there are nonexistent times
+            - timedelta objects will shift nonexistent times by the timedelta
+            - 'raise' will raise a ValueError if there are
+              nonexistent times.
+
+        Returns
+        -------
+        Series/DataFrame
+            Same type as the input, with time zone naive or aware index, depending on
+            ``tz``.
+
+        Raises
+        ------
+        TypeError
+            If the TimeSeries is tz-aware and tz is not None.
+
+        See Also
+        --------
+        Series.dt.tz_localize: Localize the values in a time zone naive Series.
+        Timestamp.tz_localize: Localize the Timestamp to a timezone.
+
+        Examples
+        --------
+        Localize local times:
+
+        >>> s = pd.Series(
+        ...     [1],
+        ...     index=pd.DatetimeIndex(["2018-09-15 01:30:00"]),
+        ... )
+        >>> s.tz_localize("CET")
+        2018-09-15 01:30:00+02:00    1
+        dtype: int64
+
+        Pass None to convert to tz-naive index and preserve local time:
+
+        >>> s = pd.Series([1], index=pd.DatetimeIndex(["2018-09-15 01:30:00+02:00"]))
+        >>> s.tz_localize(None)
+        2018-09-15 01:30:00    1
+        dtype: int64
+
+        Be careful with DST changes. When there is sequential data, pandas
+        can infer the DST time:
+
+        >>> s = pd.Series(
+        ...     range(7),
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2018-10-28 01:30:00",
+        ...             "2018-10-28 02:00:00",
+        ...             "2018-10-28 02:30:00",
+        ...             "2018-10-28 02:00:00",
+        ...             "2018-10-28 02:30:00",
+        ...             "2018-10-28 03:00:00",
+        ...             "2018-10-28 03:30:00",
+        ...         ]
+        ...     ),
+        ... )
+        >>> s.tz_localize("CET", ambiguous="infer")
+        2018-10-28 01:30:00+02:00    0
+        2018-10-28 02:00:00+02:00    1
+        2018-10-28 02:30:00+02:00    2
+        2018-10-28 02:00:00+01:00    3
+        2018-10-28 02:30:00+01:00    4
+        2018-10-28 03:00:00+01:00    5
+        2018-10-28 03:30:00+01:00    6
+        dtype: int64
+
+        In some cases, inferring the DST is impossible. In such cases, you can
+        pass an ndarray to the ambiguous parameter to set the DST explicitly
+
+        >>> s = pd.Series(
+        ...     range(3),
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2018-10-28 01:20:00",
+        ...             "2018-10-28 02:36:00",
+        ...             "2018-10-28 03:46:00",
+        ...         ]
+        ...     ),
+        ... )
+        >>> s.tz_localize("CET", ambiguous=np.array([True, True, False]))
+        2018-10-28 01:20:00+02:00    0
+        2018-10-28 02:36:00+02:00    1
+        2018-10-28 03:46:00+01:00    2
+        dtype: int64
+
+        If the DST transition causes nonexistent times, you can shift these
+        dates forward or backward with a timedelta object or `'shift_forward'`
+        or `'shift_backward'`.
+
+        >>> dti = pd.DatetimeIndex(
+        ...     ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]"
+        ... )
+        >>> s = pd.Series(range(2), index=dti)
+        >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward")
+        2015-03-29 03:00:00+02:00    0
+        2015-03-29 03:30:00+02:00    1
+        dtype: int64
+        >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_backward")
+        2015-03-29 01:59:59.999999999+01:00    0
+        2015-03-29 03:30:00+02:00              1
+        dtype: int64
+        >>> s.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta("1h"))
+        2015-03-29 03:30:00+02:00    0
+        2015-03-29 03:30:00+02:00    1
+        dtype: int64
+        """
+        self._check_copy_deprecation(copy)
+        nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
+        if nonexistent not in nonexistent_options and not isinstance(
+            nonexistent, dt.timedelta
+        ):
+            raise ValueError(
+                "The nonexistent argument must be one of 'raise', "
+                "'NaT', 'shift_forward', 'shift_backward' or "
+                "a timedelta object"
+            )
+
+        axis = self._get_axis_number(axis)
+        ax = self._get_axis(axis)
+
+        def _tz_localize(ax, tz, ambiguous, nonexistent):
+            if not hasattr(ax, "tz_localize"):
+                if len(ax) > 0:
+                    ax_name = self._get_axis_name(axis)
+                    raise TypeError(
+                        f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
+                    )
+                ax = DatetimeIndex([], tz=tz)
+            else:
+                ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
+            return ax
+
+        # if a level is given it must be a MultiIndex level or
+        # equivalent to the axis name
+        if isinstance(ax, MultiIndex):
+            level = ax._get_level_number(level)
+            new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
+            ax = ax.set_levels(new_level, level=level)
+        else:
+            if level not in (None, 0, ax.name):
+                raise ValueError(f"The level {level} is not valid")
+            ax = _tz_localize(ax, tz, ambiguous, nonexistent)
+
+        result = self.copy(deep=False)
+        result = result.set_axis(ax, axis=axis)
+        return result.__finalize__(self, method="tz_localize")
+
+    # ----------------------------------------------------------------------
+    # Numeric Methods
+
+    @final
+    def describe(
+        self,
+        percentiles=None,
+        include=None,
+        exclude=None,
+    ) -> Self:
+        """
+        Generate descriptive statistics.
+
+        Descriptive statistics include those that summarize the central
+        tendency, dispersion and shape of a
+        dataset's distribution, excluding ``NaN`` values.
+
+        Analyzes both numeric and object series, as well
+        as ``DataFrame`` column sets of mixed data types. The output
+        will vary depending on what is provided. Refer to the notes
+        below for more detail.
+
+        Parameters
+        ----------
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output. All should
+            fall between 0 and 1. The default, ``None``, will automatically
+            return the 25th, 50th, and 75th percentiles.
+        include : 'all', list-like of dtypes or None (default), optional
+            A white list of data types to include in the result. Ignored
+            for ``Series``. Here are the options:
+
+            - 'all' : All columns of the input will be included in the output.
+            - A list-like of dtypes : Limits the results to the
+              provided data types.
+              To limit the result to numeric types submit
+              ``numpy.number``. To limit it instead to object columns submit
+              the ``numpy.object`` data type. Strings
+              can also be used in the style of
+              ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
+              select pandas categorical columns, use ``'category'``
+            - None (default) : The result will include all numeric columns.
+        exclude : list-like of dtypes or None (default), optional,
+            A black list of data types to omit from the result. Ignored
+            for ``Series``. Here are the options:
+
+            - A list-like of dtypes : Excludes the provided data types
+              from the result. To exclude numeric types submit
+              ``numpy.number``. To exclude object columns submit the data
+              type ``numpy.object``. Strings can also be used in the style of
+              ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
+              exclude pandas categorical columns, use ``'category'``
+            - None (default) : The result will exclude nothing.
+
+        Returns
+        -------
+        Series or DataFrame
+            Summary statistics of the Series or Dataframe provided.
+
+        See Also
+        --------
+        DataFrame.count: Count number of non-NA/null observations.
+        DataFrame.max: Maximum of the values in the object.
+        DataFrame.min: Minimum of the values in the object.
+        DataFrame.mean: Mean of the values.
+        DataFrame.std: Standard deviation of the observations.
+        DataFrame.select_dtypes: Subset of a DataFrame including/excluding
+            columns based on their dtype.
+
+        Notes
+        -----
+        For numeric data, the result's index will include ``count``,
+        ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
+        upper percentiles. By default the lower percentile is ``25`` and the
+        upper percentile is ``75``. The ``50`` percentile is the
+        same as the median.
+
+        For object data (e.g. strings), the result's index
+        will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
+        is the most common value. The ``freq`` is the most common value's
+        frequency.
+
+        If multiple object values have the highest count, then the
+        ``count`` and ``top`` results will be arbitrarily chosen from
+        among those with the highest count.
+
+        For mixed data types provided via a ``DataFrame``, the default is to
+        return only an analysis of numeric columns. If the DataFrame consists
+        only of object and categorical data without any numeric columns, the
+        default is to return an analysis of both the object and categorical
+        columns. If ``include='all'`` is provided as an option, the result
+        will include a union of attributes of each type.
+
+        The `include` and `exclude` parameters can be used to limit
+        which columns in a ``DataFrame`` are analyzed for the output.
+        The parameters are ignored when analyzing a ``Series``.
+
+        Examples
+        --------
+        Describing a numeric ``Series``.
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.describe()
+        count    3.0
+        mean     2.0
+        std      1.0
+        min      1.0
+        25%      1.5
+        50%      2.0
+        75%      2.5
+        max      3.0
+        dtype: float64
+
+        Describing a categorical ``Series``.
+
+        >>> s = pd.Series(["a", "a", "b", "c"])
+        >>> s.describe()
+        count     4
+        unique    3
+        top       a
+        freq      2
+        dtype: object
+
+        Describing a timestamp ``Series``.
+
+        >>> s = pd.Series(
+        ...     [
+        ...         np.datetime64("2000-01-01"),
+        ...         np.datetime64("2010-01-01"),
+        ...         np.datetime64("2010-01-01"),
+        ...     ]
+        ... )
+        >>> s.describe()
+        count                      3
+        mean     2006-09-01 08:00:00
+        min      2000-01-01 00:00:00
+        25%      2004-12-31 12:00:00
+        50%      2010-01-01 00:00:00
+        75%      2010-01-01 00:00:00
+        max      2010-01-01 00:00:00
+        dtype: object
+
+        Describing a ``DataFrame``. By default only numeric fields
+        are returned.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "categorical": pd.Categorical(["d", "e", "f"]),
+        ...         "numeric": [1, 2, 3],
+        ...         "object": ["a", "b", "c"],
+        ...     }
+        ... )
+        >>> df.describe()
+               numeric
+        count      3.0
+        mean       2.0
+        std        1.0
+        min        1.0
+        25%        1.5
+        50%        2.0
+        75%        2.5
+        max        3.0
+
+        Describing all columns of a ``DataFrame`` regardless of data type.
+
+        >>> df.describe(include="all")  # doctest: +SKIP
+               categorical  numeric object
+        count            3      3.0      3
+        unique           3      NaN      3
+        top              f      NaN      a
+        freq             1      NaN      1
+        mean           NaN      2.0    NaN
+        std            NaN      1.0    NaN
+        min            NaN      1.0    NaN
+        25%            NaN      1.5    NaN
+        50%            NaN      2.0    NaN
+        75%            NaN      2.5    NaN
+        max            NaN      3.0    NaN
+
+        Describing a column from a ``DataFrame`` by accessing it as
+        an attribute.
+
+        >>> df.numeric.describe()
+        count    3.0
+        mean     2.0
+        std      1.0
+        min      1.0
+        25%      1.5
+        50%      2.0
+        75%      2.5
+        max      3.0
+        Name: numeric, dtype: float64
+
+        Including only numeric columns in a ``DataFrame`` description.
+
+        >>> df.describe(include=[np.number])
+               numeric
+        count      3.0
+        mean       2.0
+        std        1.0
+        min        1.0
+        25%        1.5
+        50%        2.0
+        75%        2.5
+        max        3.0
+
+        Including only string columns in a ``DataFrame`` description.
+
+        >>> df.describe(include=[object])  # doctest: +SKIP
+               object
+        count       3
+        unique      3
+        top         a
+        freq        1
+
+        Including only categorical columns from a ``DataFrame`` description.
+
+        >>> df.describe(include=["category"])
+               categorical
+        count            3
+        unique           3
+        top              d
+        freq             1
+
+        Excluding numeric columns from a ``DataFrame`` description.
+
+        >>> df.describe(exclude=[np.number])  # doctest: +SKIP
+               categorical object
+        count            3      3
+        unique           3      3
+        top              f      a
+        freq             1      1
+
+        Excluding object columns from a ``DataFrame`` description.
+
+        >>> df.describe(exclude=[object])  # doctest: +SKIP
+               categorical  numeric
+        count            3      3.0
+        unique           3      NaN
+        top              f      NaN
+        freq             1      NaN
+        mean           NaN      2.0
+        std            NaN      1.0
+        min            NaN      1.0
+        25%            NaN      1.5
+        50%            NaN      2.0
+        75%            NaN      2.5
+        max            NaN      3.0
+        """
+        return describe_ndframe(
+            obj=self,
+            include=include,
+            exclude=exclude,
+            percentiles=percentiles,
+        ).__finalize__(self, method="describe")
+
+    @final
+    def pct_change(
+        self,
+        periods: int = 1,
+        fill_method: None = None,
+        freq=None,
+        **kwargs,
+    ) -> Self:
+        """
+        Fractional change between the current and a prior element.
+
+        Computes the fractional change from the immediately previous row by
+        default. This is useful in comparing the fraction of change in a time
+        series of elements.
+
+        .. note::
+
+            Despite the name of this method, it calculates fractional change
+            (also known as per unit change or relative change) and not
+            percentage change. If you need the percentage change, multiply
+            these values by 100.
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Periods to shift for forming percent change.
+        fill_method : None
+            Must be None. This argument will be removed in a future version of pandas.
+        freq : DateOffset, timedelta, or str, optional
+            Increment to use from time series API (e.g. 'ME' or BDay()).
+        **kwargs
+            Additional keyword arguments are passed into
+            `DataFrame.shift` or `Series.shift`.
+
+        Returns
+        -------
+        Series or DataFrame
+            The same type as the calling object.
+
+        See Also
+        --------
+        Series.diff : Compute the difference of two elements in a Series.
+        DataFrame.diff : Compute the difference of two elements in a DataFrame.
+        Series.shift : Shift the index by some number of periods.
+        DataFrame.shift : Shift the index by some number of periods.
+
+        Examples
+        --------
+        **Series**
+
+        >>> s = pd.Series([90, 91, 85])
+        >>> s
+        0    90
+        1    91
+        2    85
+        dtype: int64
+
+        >>> s.pct_change()
+        0         NaN
+        1    0.011111
+        2   -0.065934
+        dtype: float64
+
+        >>> s.pct_change(periods=2)
+        0         NaN
+        1         NaN
+        2   -0.055556
+        dtype: float64
+
+        See the percentage change in a Series where filling NAs with last
+        valid observation forward to next valid.
+
+        >>> s = pd.Series([90, 91, None, 85])
+        >>> s
+        0    90.0
+        1    91.0
+        2     NaN
+        3    85.0
+        dtype: float64
+
+        >>> s.ffill().pct_change()
+        0         NaN
+        1    0.011111
+        2    0.000000
+        3   -0.065934
+        dtype: float64
+
+        **DataFrame**
+
+        Percentage change in French franc, Deutsche Mark, and Italian lira from
+        1980-01-01 to 1980-03-01.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "FR": [4.0405, 4.0963, 4.3149],
+        ...         "GR": [1.7246, 1.7482, 1.8519],
+        ...         "IT": [804.74, 810.01, 860.13],
+        ...     },
+        ...     index=["1980-01-01", "1980-02-01", "1980-03-01"],
+        ... )
+        >>> df
+                        FR      GR      IT
+        1980-01-01  4.0405  1.7246  804.74
+        1980-02-01  4.0963  1.7482  810.01
+        1980-03-01  4.3149  1.8519  860.13
+
+        >>> df.pct_change()
+                          FR        GR        IT
+        1980-01-01       NaN       NaN       NaN
+        1980-02-01  0.013810  0.013684  0.006549
+        1980-03-01  0.053365  0.059318  0.061876
+
+        Percentage of change in GOOG and APPL stock volume. Shows computing
+        the percentage change between columns.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "2016": [1769950, 30586265],
+        ...         "2015": [1500923, 40912316],
+        ...         "2014": [1371819, 41403351],
+        ...     },
+        ...     index=["GOOG", "APPL"],
+        ... )
+        >>> df
+                  2016      2015      2014
+        GOOG   1769950   1500923   1371819
+        APPL  30586265  40912316  41403351
+
+        >>> df.pct_change(axis="columns", periods=-1)
+                  2016      2015  2014
+        GOOG  0.179241  0.094112   NaN
+        APPL -0.252395 -0.011860   NaN
+        """
+        # GH#53491
+        if fill_method is not None:
+            raise ValueError(f"fill_method must be None; got {fill_method=}.")
+
+        axis = self._get_axis_number(kwargs.pop("axis", "index"))
+        shifted = self.shift(periods=periods, freq=freq, axis=axis, **kwargs)
+        # Unsupported left operand type for / ("Self")
+        rs = self / shifted - 1  # type: ignore[operator]
+        if freq is not None:
+            # Shift method is implemented differently when freq is not None
+            # We want to restore the original index
+            rs = rs.loc[~rs.index.duplicated()]
+            rs = rs.reindex_like(self)
+        return rs.__finalize__(self, method="pct_change")
+
+    @final
+    def _logical_func(
+        self,
+        name: str,
+        func,
+        axis: Axis | None = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> Series | bool:
+        nv.validate_logical_func((), kwargs, fname=name)
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
+
+        if self.ndim > 1 and axis is None:
+            # Reduce along one dimension then the other, to simplify DataFrame._reduce
+            res = self._logical_func(
+                name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
+            )
+            # error: Item "bool" of "Series | bool" has no attribute "_logical_func"
+            return res._logical_func(  # type: ignore[union-attr]
+                name, func, skipna=skipna, **kwargs
+            )
+        elif axis is None:
+            axis = 0
+
+        if (
+            self.ndim > 1
+            and axis == 1
+            and len(self._mgr.blocks) > 1
+            # TODO(EA2D): special-case not needed
+            and all(block.values.ndim == 2 for block in self._mgr.blocks)
+            and not kwargs
+        ):
+            # Fastpath avoiding potentially expensive transpose
+            obj = self
+            if bool_only:
+                obj = self._get_bool_data()
+            return obj._reduce_axis1(name, func, skipna=skipna)
+
+        return self._reduce(
+            func,
+            name=name,
+            axis=axis,
+            skipna=skipna,
+            numeric_only=bool_only,
+            filter_type="bool",
+        )
+
+    def any(
+        self,
+        *,
+        axis: Axis | None = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> Series | bool:
+        return self._logical_func(
+            "any", nanops.nanany, axis, bool_only, skipna, **kwargs
+        )
+
+    def all(
+        self,
+        *,
+        axis: Axis = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> Series | bool:
+        return self._logical_func(
+            "all", nanops.nanall, axis, bool_only, skipna, **kwargs
+        )
+
+    @final
+    def _accum_func(
+        self,
+        name: str,
+        func,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        *args,
+        **kwargs,
+    ):
+        skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
+        if axis is None:
+            axis = 0
+        else:
+            axis = self._get_axis_number(axis)
+
+        if axis == 1:
+            return self.T._accum_func(
+                name,
+                func,
+                axis=0,
+                skipna=skipna,
+                *args,  # noqa: B026
+                **kwargs,
+            ).T
+
+        def block_accum_func(blk_values):
+            values = blk_values.T if hasattr(blk_values, "T") else blk_values
+
+            result: np.ndarray | ExtensionArray
+            if isinstance(values, ExtensionArray):
+                result = values._accumulate(name, skipna=skipna, **kwargs)
+            else:
+                result = nanops.na_accum_func(values, func, skipna=skipna)
+
+            result = result.T if hasattr(result, "T") else result
+            return result
+
+        result = self._mgr.apply(block_accum_func)
+
+        return self._constructor_from_mgr(result, axes=result.axes).__finalize__(
+            self, method=name
+        )
+
+    def cummax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        return self._accum_func(
+            "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
+        )
+
+    def cummin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        return self._accum_func(
+            "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
+        )
+
+    def cumsum(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
+
+    def cumprod(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
+
+    @final
+    def _stat_function_ddof(
+        self,
+        name: str,
+        func,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        nv.validate_stat_ddof_func((), kwargs, fname=name)
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
+
+        return self._reduce(
+            func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
+        )
+
+    def sem(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function_ddof(
+            "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs
+        )
+
+    def var(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function_ddof(
+            "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs
+        )
+
+    def std(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function_ddof(
+            "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs
+        )
+
+    @final
+    def _stat_function(
+        self,
+        name: str,
+        func,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        assert name in ["median", "mean", "min", "max", "kurt", "skew"], name
+        nv.validate_func(name, (), kwargs)
+
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
+
+        return self._reduce(
+            func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
+        )
+
+    def min(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        return self._stat_function(
+            "min",
+            nanops.nanmin,
+            axis,
+            skipna,
+            numeric_only,
+            **kwargs,
+        )
+
+    def max(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        return self._stat_function(
+            "max",
+            nanops.nanmax,
+            axis,
+            skipna,
+            numeric_only,
+            **kwargs,
+        )
+
+    def mean(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function(
+            "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
+        )
+
+    def median(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function(
+            "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
+        )
+
+    def skew(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function(
+            "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
+        )
+
+    def kurt(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Series | float:
+        return self._stat_function(
+            "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs
+        )
+
+    kurtosis = kurt
+
+    @final
+    def _min_count_stat_function(
+        self,
+        name: str,
+        func,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ):
+        assert name in ["sum", "prod"], name
+        nv.validate_func(name, (), kwargs)
+
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
+
+        return self._reduce(
+            func,
+            name=name,
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+        )
+
+    def sum(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ):
+        return self._min_count_stat_function(
+            "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs
+        )
+
+    def prod(
+        self,
+        *,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ):
+        return self._min_count_stat_function(
+            "prod",
+            nanops.nanprod,
+            axis,
+            skipna,
+            numeric_only,
+            min_count,
+            **kwargs,
+        )
+
+    product = prod
+
+    @final
+    def rolling(
+        self,
+        window: int | dt.timedelta | str | BaseOffset | BaseIndexer,
+        min_periods: int | None = None,
+        center: bool = False,
+        win_type: str | None = None,
+        on: str | None = None,
+        closed: IntervalClosedType | None = None,
+        step: int | None = None,
+        method: str = "single",
+    ) -> Window | Rolling:
+        """
+        Provide rolling window calculations.
+
+        Parameters
+        ----------
+        window : int, timedelta, str, offset, or BaseIndexer subclass
+            Interval of the moving window.
+
+            If an integer, the delta between the start and end of each window.
+            The number of points in the window depends on the ``closed`` argument.
+
+            If a timedelta, str, or offset, the time period of each window. Each
+            window will be a variable sized based on the observations included in
+            the time-period. This is only valid for datetimelike indexes.
+            To learn more about the offsets & frequency strings, please see
+            :ref:`this link<timeseries.offset_aliases>`.
+
+            If a BaseIndexer subclass, the window boundaries
+            based on the defined ``get_window_bounds`` method. Additional rolling
+            keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
+            ``step`` will be passed to ``get_window_bounds``.
+
+        min_periods : int, default None
+            Minimum number of observations in window required to have a value;
+            otherwise, result is ``np.nan``.
+
+            For a window that is specified by an offset, ``min_periods`` will default
+            to 1.
+
+            For a window that is specified by an integer, ``min_periods`` will default
+            to the size of the window.
+
+        center : bool, default False
+            If False, set the window labels as the right edge of the window index.
+
+            If True, set the window labels as the center of the window index.
+
+        win_type : str, default None
+            If ``None``, all points are evenly weighted.
+
+            If a string, it must be a valid `scipy.signal window function
+            <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
+
+            Certain Scipy window types require additional parameters to be passed
+            in the aggregation function. The additional parameters must match
+            the keywords specified in the Scipy window type method signature.
+
+        on : str, optional
+            For a DataFrame, a column label or Index level on which
+            to calculate the rolling window, rather than the DataFrame's index.
+
+            Provided integer column is ignored and excluded from result since
+            an integer index is not used to calculate the rolling window.
+
+        closed : str, default None
+            Determines the inclusivity of points in the window
+
+            If ``'right'``, uses the window (first, last] meaning the last point
+            is included in the calculations.
+
+            If ``'left'``, uses the window [first, last) meaning the first point
+            is included in the calculations.
+
+            If ``'both'``, uses the window [first, last] meaning all points in
+            the window are included in the calculations.
+
+            If ``'neither'``, uses the window (first, last) meaning the first
+            and last points in the window are excluded from calculations.
+
+            () and [] are referencing open and closed set
+            notation respetively.
+
+            Default ``None`` (``'right'``).
+
+        step : int, default None
+            Evaluate the window at every ``step`` result, equivalent to slicing as
+            ``[::step]``. ``window`` must be an integer. Using a step argument other
+            than None or 1 will produce a result with a different shape than the input.
+
+        method : str {'single', 'table'}, default 'single'
+
+            Execute the rolling operation per single column or row (``'single'``)
+            or over the entire object (``'table'``).
+
+            This argument is only implemented when specifying ``engine='numba'``
+            in the method call.
+
+        Returns
+        -------
+        pandas.api.typing.Window or pandas.api.typing.Rolling
+            An instance of Window is returned if ``win_type`` is passed. Otherwise,
+            an instance of Rolling is returned.
+
+        See Also
+        --------
+        expanding : Provides expanding transformations.
+        ewm : Provides exponential weighted functions.
+
+        Notes
+        -----
+        See :ref:`Windowing Operations <window.generic>` for further usage details
+        and examples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
+        >>> df
+             B
+        0  0.0
+        1  1.0
+        2  2.0
+        3  NaN
+        4  4.0
+
+        **window**
+
+        Rolling sum with a window length of 2 observations.
+
+        >>> df.rolling(2).sum()
+             B
+        0  NaN
+        1  1.0
+        2  3.0
+        3  NaN
+        4  NaN
+
+        Rolling sum with a window span of 2 seconds.
+
+        >>> df_time = pd.DataFrame(
+        ...     {"B": [0, 1, 2, np.nan, 4]},
+        ...     index=[
+        ...         pd.Timestamp("20130101 09:00:00"),
+        ...         pd.Timestamp("20130101 09:00:02"),
+        ...         pd.Timestamp("20130101 09:00:03"),
+        ...         pd.Timestamp("20130101 09:00:05"),
+        ...         pd.Timestamp("20130101 09:00:06"),
+        ...     ],
+        ... )
+
+        >>> df_time
+                               B
+        2013-01-01 09:00:00  0.0
+        2013-01-01 09:00:02  1.0
+        2013-01-01 09:00:03  2.0
+        2013-01-01 09:00:05  NaN
+        2013-01-01 09:00:06  4.0
+
+        >>> df_time.rolling("2s").sum()
+                               B
+        2013-01-01 09:00:00  0.0
+        2013-01-01 09:00:02  1.0
+        2013-01-01 09:00:03  3.0
+        2013-01-01 09:00:05  NaN
+        2013-01-01 09:00:06  4.0
+
+        Rolling sum with forward looking windows with 2 observations.
+
+        >>> indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=2)
+        >>> df.rolling(window=indexer, min_periods=1).sum()
+             B
+        0  1.0
+        1  3.0
+        2  2.0
+        3  4.0
+        4  4.0
+
+        **min_periods**
+
+        Rolling sum with a window length of 2 observations, but only needs a minimum
+        of 1 observation to calculate a value.
+
+        >>> df.rolling(2, min_periods=1).sum()
+             B
+        0  0.0
+        1  1.0
+        2  3.0
+        3  2.0
+        4  4.0
+
+        **center**
+
+        Rolling sum with the result assigned to the center of the window index.
+
+        >>> df.rolling(3, min_periods=1, center=True).sum()
+             B
+        0  1.0
+        1  3.0
+        2  3.0
+        3  6.0
+        4  4.0
+
+        >>> df.rolling(3, min_periods=1, center=False).sum()
+             B
+        0  0.0
+        1  1.0
+        2  3.0
+        3  3.0
+        4  6.0
+
+        **step**
+
+        Rolling sum with a window length of 2 observations, minimum of 1 observation to
+        calculate a value, and a step of 2.
+
+        >>> df.rolling(2, min_periods=1, step=2).sum()
+             B
+        0  0.0
+        2  3.0
+        4  4.0
+
+        **win_type**
+
+        Rolling sum with a window length of 2, using the Scipy ``'gaussian'``
+        window type. ``std`` is required in the aggregation function.
+
+        >>> df.rolling(2, win_type="gaussian").sum(std=3)
+                  B
+        0        NaN
+        1   0.986207
+        2   2.958621
+        3        NaN
+        4        NaN
+
+        **on**
+
+        Rolling sum with a window length of 2 days.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "A": [
+        ...             pd.to_datetime("2020-01-01"),
+        ...             pd.to_datetime("2020-01-01"),
+        ...             pd.to_datetime("2020-01-02"),
+        ...         ],
+        ...         "B": [1, 2, 3],
+        ...     },
+        ...     index=pd.date_range("2020", periods=3),
+        ... )
+
+        >>> df
+                            A  B
+        2020-01-01 2020-01-01  1
+        2020-01-02 2020-01-01  2
+        2020-01-03 2020-01-02  3
+
+        >>> df.rolling("2D", on="A").sum()
+                            A    B
+        2020-01-01 2020-01-01  1.0
+        2020-01-02 2020-01-01  3.0
+        2020-01-03 2020-01-02  6.0
+        """
+        if win_type is not None:
+            return Window(
+                self,
+                window=window,
+                min_periods=min_periods,
+                center=center,
+                win_type=win_type,
+                on=on,
+                closed=closed,
+                step=step,
+                method=method,
+            )
+
+        return Rolling(
+            self,
+            window=window,
+            min_periods=min_periods,
+            center=center,
+            win_type=win_type,
+            on=on,
+            closed=closed,
+            step=step,
+            method=method,
+        )
+
+    @final
+    def expanding(
+        self,
+        min_periods: int = 1,
+        method: Literal["single", "table"] = "single",
+    ) -> Expanding:
+        """
+        Provide expanding window calculations.
+
+        An expanding window yields the value of an aggregation statistic with all
+        the data available up to that point in time.
+
+        Parameters
+        ----------
+        min_periods : int, default 1
+            Minimum number of observations in window required to have a value;
+            otherwise, result is ``np.nan``.
+
+        method : str {'single', 'table'}, default 'single'
+            Execute the rolling operation per single column or row (``'single'``)
+            or over the entire object (``'table'``).
+
+            This argument is only implemented when specifying ``engine='numba'``
+            in the method call.
+
+        Returns
+        -------
+        pandas.api.typing.Expanding
+            An instance of Expanding for further expanding window calculations,
+            e.g. using the ``sum`` method.
+
+        See Also
+        --------
+        rolling : Provides rolling window calculations.
+        ewm : Provides exponential weighted functions.
+
+        Notes
+        -----
+        See :ref:`Windowing Operations <window.expanding>` for further usage details
+        and examples.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
+        >>> df
+             B
+        0  0.0
+        1  1.0
+        2  2.0
+        3  NaN
+        4  4.0
+
+        **min_periods**
+
+        Expanding sum with 1 vs 3 observations needed to calculate a value.
+
+        >>> df.expanding(1).sum()
+             B
+        0  0.0
+        1  1.0
+        2  3.0
+        3  3.0
+        4  7.0
+        >>> df.expanding(3).sum()
+             B
+        0  NaN
+        1  NaN
+        2  3.0
+        3  3.0
+        4  7.0
+        """
+        return Expanding(self, min_periods=min_periods, method=method)
+
+    @final
+    @doc(ExponentialMovingWindow)
+    def ewm(
+        self,
+        com: float | None = None,
+        span: float | None = None,
+        halflife: float | TimedeltaConvertibleTypes | None = None,
+        alpha: float | None = None,
+        min_periods: int | None = 0,
+        adjust: bool = True,
+        ignore_na: bool = False,
+        times: np.ndarray | DataFrame | Series | None = None,
+        method: Literal["single", "table"] = "single",
+    ) -> ExponentialMovingWindow:
+        return ExponentialMovingWindow(
+            self,
+            com=com,
+            span=span,
+            halflife=halflife,
+            alpha=alpha,
+            min_periods=min_periods,
+            adjust=adjust,
+            ignore_na=ignore_na,
+            times=times,
+            method=method,
+        )
+
+    # ----------------------------------------------------------------------
+    # Arithmetic Methods
+
+    @final
+    def _inplace_method(self, other, op) -> Self:
+        """
+        Wrap arithmetic method to operate inplace.
+        """
+        result = op(self, other)
+
+        # this makes sure that we are aligned like the input
+        # we are updating inplace
+        self._update_inplace(result.reindex_like(self))
+        return self
+
+    @final
+    def __iadd__(self, other) -> Self:
+        # error: Unsupported left operand type for + ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__add__)  # type: ignore[operator]
+
+    @final
+    def __isub__(self, other) -> Self:
+        # error: Unsupported left operand type for - ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__sub__)  # type: ignore[operator]
+
+    @final
+    def __imul__(self, other) -> Self:
+        # error: Unsupported left operand type for * ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__mul__)  # type: ignore[operator]
+
+    @final
+    def __itruediv__(self, other) -> Self:
+        # error: Unsupported left operand type for / ("Type[NDFrame]")
+        return self._inplace_method(
+            other,
+            type(self).__truediv__,  # type: ignore[operator]
+        )
+
+    @final
+    def __ifloordiv__(self, other) -> Self:
+        # error: Unsupported left operand type for // ("Type[NDFrame]")
+        return self._inplace_method(
+            other,
+            type(self).__floordiv__,  # type: ignore[operator]
+        )
+
+    @final
+    def __imod__(self, other) -> Self:
+        # error: Unsupported left operand type for % ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__mod__)  # type: ignore[operator]
+
+    @final
+    def __ipow__(self, other) -> Self:
+        # error: Unsupported left operand type for ** ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__pow__)  # type: ignore[operator]
+
+    @final
+    def __iand__(self, other) -> Self:
+        # error: Unsupported left operand type for & ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__and__)  # type: ignore[operator]
+
+    @final
+    def __ior__(self, other) -> Self:
+        return self._inplace_method(other, type(self).__or__)
+
+    @final
+    def __ixor__(self, other) -> Self:
+        # error: Unsupported left operand type for ^ ("Type[NDFrame]")
+        return self._inplace_method(other, type(self).__xor__)  # type: ignore[operator]
+
+    # ----------------------------------------------------------------------
+    # Misc methods
+
+    @final
+    def _find_valid_index(self, *, how: str) -> Hashable:
+        """
+        Retrieves the index of the first valid value.
+
+        Parameters
+        ----------
+        how : {'first', 'last'}
+            Use this parameter to change between the first or last valid index.
+
+        Returns
+        -------
+        idx_first_valid : type of index
+        """
+        is_valid = self.notna().values
+        idxpos = find_valid_index(how=how, is_valid=is_valid)
+        if idxpos is None:
+            return None
+        return self.index[idxpos]
+
+    @final
+    def first_valid_index(self) -> Hashable:
+        """
+        Return index for first non-missing value or None, if no value is found.
+
+        See the :ref:`User Guide <missing_data>` for more information
+        on which values are considered missing.
+
+        Returns
+        -------
+        type of index
+            Index of first non-missing value.
+
+        See Also
+        --------
+        DataFrame.last_valid_index : Return index for last non-NA value or None, if
+            no non-NA value is found.
+        Series.last_valid_index : Return index for last non-NA value or None, if no
+            non-NA value is found.
+        DataFrame.isna : Detect missing values.
+
+        Examples
+        --------
+        For Series:
+
+        >>> s = pd.Series([None, 3, 4])
+        >>> s.first_valid_index()
+        1
+        >>> s.last_valid_index()
+        2
+
+        >>> s = pd.Series([None, None])
+        >>> print(s.first_valid_index())
+        None
+        >>> print(s.last_valid_index())
+        None
+
+        If all elements in Series are NA/null, returns None.
+
+        >>> s = pd.Series()
+        >>> print(s.first_valid_index())
+        None
+        >>> print(s.last_valid_index())
+        None
+
+        If Series is empty, returns None.
+
+        For DataFrame:
+
+        >>> df = pd.DataFrame({"A": [None, None, 2], "B": [None, 3, 4]})
+        >>> df
+             A      B
+        0  NaN    NaN
+        1  NaN    3.0
+        2  2.0    4.0
+        >>> df.first_valid_index()
+        1
+        >>> df.last_valid_index()
+        2
+
+        >>> df = pd.DataFrame({"A": [None, None, None], "B": [None, None, None]})
+        >>> df
+             A      B
+        0  None   None
+        1  None   None
+        2  None   None
+        >>> print(df.first_valid_index())
+        None
+        >>> print(df.last_valid_index())
+        None
+
+        If all elements in DataFrame are NA/null, returns None.
+
+        >>> df = pd.DataFrame()
+        >>> df
+        Empty DataFrame
+        Columns: []
+        Index: []
+        >>> print(df.first_valid_index())
+        None
+        >>> print(df.last_valid_index())
+        None
+
+        If DataFrame is empty, returns None.
+        """
+        return self._find_valid_index(how="first")
+
+    @final
+    def last_valid_index(self) -> Hashable:
+        """
+        Return index for last non-missing value or None, if no value is found.
+
+        See the :ref:`User Guide <missing_data>` for more information
+        on which values are considered missing.
+
+        Returns
+        -------
+        type of index
+            Index of last non-missing value.
+
+        See Also
+        --------
+        DataFrame.first_valid_index : Return index for first non-NA value or None, if
+            no non-NA value is found.
+        Series.first_valid_index : Return index for first non-NA value or None, if no
+            non-NA value is found.
+        DataFrame.isna : Detect missing values.
+
+        Examples
+        --------
+        For Series:
+
+        >>> s = pd.Series([None, 3, 4])
+        >>> s.first_valid_index()
+        1
+        >>> s.last_valid_index()
+        2
+
+        >>> s = pd.Series([None, None])
+        >>> print(s.first_valid_index())
+        None
+        >>> print(s.last_valid_index())
+        None
+
+        If all elements in Series are NA/null, returns None.
+
+        >>> s = pd.Series()
+        >>> print(s.first_valid_index())
+        None
+        >>> print(s.last_valid_index())
+        None
+
+        If Series is empty, returns None.
+
+        For DataFrame:
+
+        >>> df = pd.DataFrame({"A": [None, None, 2], "B": [None, 3, 4]})
+        >>> df
+             A      B
+        0  NaN    NaN
+        1  NaN    3.0
+        2  2.0    4.0
+        >>> df.first_valid_index()
+        1
+        >>> df.last_valid_index()
+        2
+
+        >>> df = pd.DataFrame({"A": [None, None, None], "B": [None, None, None]})
+        >>> df
+             A      B
+        0  None   None
+        1  None   None
+        2  None   None
+        >>> print(df.first_valid_index())
+        None
+        >>> print(df.last_valid_index())
+        None
+
+        If all elements in DataFrame are NA/null, returns None.
+
+        >>> df = pd.DataFrame()
+        >>> df
+        Empty DataFrame
+        Columns: []
+        Index: []
+        >>> print(df.first_valid_index())
+        None
+        >>> print(df.last_valid_index())
+        None
+
+        If DataFrame is empty, returns None.
+        """
+        return self._find_valid_index(how="last")
+
+
+_num_doc = """
+{desc}
+
+Parameters
+----------
+axis : {axis_descr}
+    Axis for the function to be applied on.
+    For `Series` this parameter is unused and defaults to 0.
+
+    For DataFrames, specifying ``axis=None`` will apply the aggregation
+    across both axes.
+
+    .. versionadded:: 2.0.0
+
+skipna : bool, default True
+    Exclude NA/null values when computing the result.
+numeric_only : bool, default False
+    Include only float, int, boolean columns.
+
+{min_count}\
+**kwargs
+    Additional keyword arguments to be passed to the function.
+
+Returns
+-------
+{name1} or scalar\
+
+    Value containing the calculation referenced in the description.\
+{see_also}\
+{examples}
+"""
+
+_sum_prod_doc = """
+{desc}
+
+Parameters
+----------
+axis : {axis_descr}
+    Axis for the function to be applied on.
+    For `Series` this parameter is unused and defaults to 0.
+
+    .. warning::
+
+        The behavior of DataFrame.{name} with ``axis=None`` is deprecated,
+        in a future version this will reduce over both axes and return a scalar
+        To retain the old behavior, pass axis=0 (or do not pass axis).
+
+    .. versionadded:: 2.0.0
+
+skipna : bool, default True
+    Exclude NA/null values when computing the result.
+numeric_only : bool, default False
+    Include only float, int, boolean columns. Not implemented for Series.
+
+{min_count}\
+**kwargs
+    Additional keyword arguments to be passed to the function.
+
+Returns
+-------
+{name1} or scalar\
+
+    Value containing the calculation referenced in the description.\
+{see_also}\
+{examples}
+"""
+
+_num_ddof_doc = """
+{desc}
+
+Parameters
+----------
+axis : {axis_descr}
+    For `Series` this parameter is unused and defaults to 0.
+
+    .. warning::
+
+        The behavior of DataFrame.{name} with ``axis=None`` is deprecated,
+        in a future version this will reduce over both axes and return a scalar
+        To retain the old behavior, pass axis=0 (or do not pass axis).
+
+skipna : bool, default True
+    Exclude NA/null values. If an entire row/column is NA, the result
+    will be NA.
+ddof : int, default 1
+    Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+    where N represents the number of elements.
+numeric_only : bool, default False
+    Include only float, int, boolean columns. Not implemented for Series.
+**kwargs :
+    Additional keywords have no effect but might be accepted
+    for compatibility with NumPy.
+
+Returns
+-------
+{name1} or {name2} (if level specified)
+    {return_desc}
+
+See Also
+--------
+{see_also}\
+{notes}\
+{examples}
+"""
+
+_sem_see_also = """\
+scipy.stats.sem : Compute standard error of the mean.
+{name2}.std : Return sample standard deviation over requested axis.
+{name2}.var : Return unbiased variance over requested axis.
+{name2}.mean : Return the mean of the values over the requested axis.
+{name2}.median : Return the median of the values over the requested axis.
+{name2}.mode : Return the mode(s) of the Series."""
+
+_sem_return_desc = """\
+Unbiased standard error of the mean over requested axis."""
+
+_std_see_also = """\
+numpy.std : Compute the standard deviation along the specified axis.
+{name2}.var : Return unbiased variance over requested axis.
+{name2}.sem : Return unbiased standard error of the mean over requested axis.
+{name2}.mean : Return the mean of the values over the requested axis.
+{name2}.median : Return the median of the values over the requested axis.
+{name2}.mode : Return the mode(s) of the Series."""
+
+_std_return_desc = """\
+Standard deviation over requested axis."""
+
+_std_notes = """
+
+Notes
+-----
+To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
+default `ddof=1`)"""
+
+_std_examples = """
+
+Examples
+--------
+>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
+...                    'age': [21, 25, 62, 43],
+...                    'height': [1.61, 1.87, 1.49, 2.01]}
+...                   ).set_index('person_id')
+>>> df
+           age  height
+person_id
+0           21    1.61
+1           25    1.87
+2           62    1.49
+3           43    2.01
+
+The standard deviation of the columns can be found as follows:
+
+>>> df.std()
+age       18.786076
+height     0.237417
+dtype: float64
+
+Alternatively, `ddof=0` can be set to normalize by N instead of N-1:
+
+>>> df.std(ddof=0)
+age       16.269219
+height     0.205609
+dtype: float64"""
+
+_var_examples = """
+
+Examples
+--------
+>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
+...                    'age': [21, 25, 62, 43],
+...                    'height': [1.61, 1.87, 1.49, 2.01]}
+...                   ).set_index('person_id')
+>>> df
+           age  height
+person_id
+0           21    1.61
+1           25    1.87
+2           62    1.49
+3           43    2.01
+
+>>> df.var()
+age       352.916667
+height      0.056367
+dtype: float64
+
+Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
+
+>>> df.var(ddof=0)
+age       264.687500
+height      0.042275
+dtype: float64"""
+
+_bool_doc = """
+{desc}
+
+Parameters
+----------
+axis : {{0 or 'index', 1 or 'columns', None}}, default 0
+    Indicate which axis or axes should be reduced. For `Series` this parameter
+    is unused and defaults to 0.
+
+    * 0 / 'index' : reduce the index, return a Series whose index is the
+      original column labels.
+    * 1 / 'columns' : reduce the columns, return a Series whose index is the
+      original index.
+    * None : reduce all axes, return a scalar.
+
+bool_only : bool, default False
+    Include only boolean columns. Not implemented for Series.
+skipna : bool, default True
+    Exclude NA/null values. If the entire row/column is NA and skipna is
+    True, then the result will be {empty_value}, as for an empty row/column.
+    If skipna is False, then NA are treated as True, because these are not
+    equal to zero.
+**kwargs : any, default None
+    Additional keywords have no effect but might be accepted for
+    compatibility with NumPy.
+
+Returns
+-------
+{name2} or {name1}
+    If axis=None, then a scalar boolean is returned.
+    Otherwise a Series is returned with index matching the index argument.
+
+{see_also}
+{examples}"""
+
+_all_desc = """\
+Return whether all elements are True, potentially over an axis.
+
+Returns True unless there at least one element within a series or
+along a Dataframe axis that is False or equivalent (e.g. zero or
+empty)."""
+
+_all_examples = """\
+Examples
+--------
+**Series**
+
+>>> pd.Series([True, True]).all()
+True
+>>> pd.Series([True, False]).all()
+False
+>>> pd.Series([], dtype="float64").all()
+True
+>>> pd.Series([np.nan]).all()
+True
+>>> pd.Series([np.nan]).all(skipna=False)
+True
+
+**DataFrames**
+
+Create a DataFrame from a dictionary.
+
+>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
+>>> df
+   col1   col2
+0  True   True
+1  True  False
+
+Default behaviour checks if values in each column all return True.
+
+>>> df.all()
+col1     True
+col2    False
+dtype: bool
+
+Specify ``axis='columns'`` to check if values in each row all return True.
+
+>>> df.all(axis='columns')
+0     True
+1    False
+dtype: bool
+
+Or ``axis=None`` for whether every value is True.
+
+>>> df.all(axis=None)
+False
+"""
+
+_all_see_also = """\
+See Also
+--------
+Series.all : Return True if all elements are True.
+DataFrame.any : Return True if one (or more) elements are True.
+"""
+
+_cnum_pd_doc = """
+Return cumulative {desc} over a DataFrame or Series axis.
+
+Returns a DataFrame or Series of the same size containing the cumulative
+{desc}.
+
+Parameters
+----------
+axis : {{0 or 'index', 1 or 'columns'}}, default 0
+    The index or the name of the axis. 0 is equivalent to None or 'index'.
+    For `Series` this parameter is unused and defaults to 0.
+skipna : bool, default True
+    Exclude NA/null values. If an entire row/column is NA, the result
+    will be NA.
+numeric_only : bool, default False
+    Include only float, int, boolean columns.
+*args, **kwargs
+    Additional keywords have no effect but might be accepted for
+    compatibility with NumPy.
+
+Returns
+-------
+{name1} or {name2}
+    Return cumulative {desc} of {name1} or {name2}.
+
+See Also
+--------
+core.window.expanding.Expanding.{accum_func_name} : Similar functionality
+    but ignores ``NaN`` values.
+{name2}.{accum_func_name} : Return the {desc} over
+    {name2} axis.
+{name2}.cummax : Return cumulative maximum over {name2} axis.
+{name2}.cummin : Return cumulative minimum over {name2} axis.
+{name2}.cumsum : Return cumulative sum over {name2} axis.
+{name2}.cumprod : Return cumulative product over {name2} axis.
+
+{examples}"""
+
+_cnum_series_doc = """
+Return cumulative {desc} over a DataFrame or Series axis.
+
+Returns a DataFrame or Series of the same size containing the cumulative
+{desc}.
+
+Parameters
+----------
+axis : {{0 or 'index', 1 or 'columns'}}, default 0
+    The index or the name of the axis. 0 is equivalent to None or 'index'.
+    For `Series` this parameter is unused and defaults to 0.
+skipna : bool, default True
+    Exclude NA/null values. If an entire row/column is NA, the result
+    will be NA.
+*args, **kwargs
+    Additional keywords have no effect but might be accepted for
+    compatibility with NumPy.
+
+Returns
+-------
+{name1} or {name2}
+    Return cumulative {desc} of {name1} or {name2}.
+
+See Also
+--------
+core.window.expanding.Expanding.{accum_func_name} : Similar functionality
+    but ignores ``NaN`` values.
+{name2}.{accum_func_name} : Return the {desc} over
+    {name2} axis.
+{name2}.cummax : Return cumulative maximum over {name2} axis.
+{name2}.cummin : Return cumulative minimum over {name2} axis.
+{name2}.cumsum : Return cumulative sum over {name2} axis.
+{name2}.cumprod : Return cumulative product over {name2} axis.
+
+{examples}"""
+
+_cummin_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0    2.0
+1    NaN
+2    5.0
+3   -1.0
+4    0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cummin()
+0    2.0
+1    NaN
+2    2.0
+3   -1.0
+4   -1.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cummin(skipna=False)
+0    2.0
+1    NaN
+2    NaN
+3    NaN
+4    NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+...                    [3.0, np.nan],
+...                    [1.0, 0.0]],
+...                   columns=list('AB'))
+>>> df
+     A    B
+0  2.0  1.0
+1  3.0  NaN
+2  1.0  0.0
+
+By default, iterates over rows and finds the minimum
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cummin()
+     A    B
+0  2.0  1.0
+1  2.0  NaN
+2  1.0  0.0
+
+To iterate over columns and find the minimum in each row,
+use ``axis=1``
+
+>>> df.cummin(axis=1)
+     A    B
+0  2.0  1.0
+1  3.0  NaN
+2  1.0  0.0
+"""
+
+_cumsum_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0    2.0
+1    NaN
+2    5.0
+3   -1.0
+4    0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cumsum()
+0    2.0
+1    NaN
+2    7.0
+3    6.0
+4    6.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cumsum(skipna=False)
+0    2.0
+1    NaN
+2    NaN
+3    NaN
+4    NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+...                    [3.0, np.nan],
+...                    [1.0, 0.0]],
+...                   columns=list('AB'))
+>>> df
+     A    B
+0  2.0  1.0
+1  3.0  NaN
+2  1.0  0.0
+
+By default, iterates over rows and finds the sum
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cumsum()
+     A    B
+0  2.0  1.0
+1  5.0  NaN
+2  6.0  1.0
+
+To iterate over columns and find the sum in each row,
+use ``axis=1``
+
+>>> df.cumsum(axis=1)
+     A    B
+0  2.0  3.0
+1  3.0  NaN
+2  1.0  1.0
+"""
+
+_cumprod_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0    2.0
+1    NaN
+2    5.0
+3   -1.0
+4    0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cumprod()
+0     2.0
+1     NaN
+2    10.0
+3   -10.0
+4    -0.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cumprod(skipna=False)
+0    2.0
+1    NaN
+2    NaN
+3    NaN
+4    NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+...                    [3.0, np.nan],
+...                    [1.0, 0.0]],
+...                   columns=list('AB'))
+>>> df
+     A    B
+0  2.0  1.0
+1  3.0  NaN
+2  1.0  0.0
+
+By default, iterates over rows and finds the product
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cumprod()
+     A    B
+0  2.0  1.0
+1  6.0  NaN
+2  6.0  0.0
+
+To iterate over columns and find the product in each row,
+use ``axis=1``
+
+>>> df.cumprod(axis=1)
+     A    B
+0  2.0  2.0
+1  3.0  NaN
+2  1.0  0.0
+"""
+
+_cummax_examples = """\
+Examples
+--------
+**Series**
+
+>>> s = pd.Series([2, np.nan, 5, -1, 0])
+>>> s
+0    2.0
+1    NaN
+2    5.0
+3   -1.0
+4    0.0
+dtype: float64
+
+By default, NA values are ignored.
+
+>>> s.cummax()
+0    2.0
+1    NaN
+2    5.0
+3    5.0
+4    5.0
+dtype: float64
+
+To include NA values in the operation, use ``skipna=False``
+
+>>> s.cummax(skipna=False)
+0    2.0
+1    NaN
+2    NaN
+3    NaN
+4    NaN
+dtype: float64
+
+**DataFrame**
+
+>>> df = pd.DataFrame([[2.0, 1.0],
+...                    [3.0, np.nan],
+...                    [1.0, 0.0]],
+...                   columns=list('AB'))
+>>> df
+     A    B
+0  2.0  1.0
+1  3.0  NaN
+2  1.0  0.0
+
+By default, iterates over rows and finds the maximum
+in each column. This is equivalent to ``axis=None`` or ``axis='index'``.
+
+>>> df.cummax()
+     A    B
+0  2.0  1.0
+1  3.0  NaN
+2  3.0  1.0
+
+To iterate over columns and find the maximum in each row,
+use ``axis=1``
+
+>>> df.cummax(axis=1)
+     A    B
+0  2.0  2.0
+1  3.0  NaN
+2  1.0  1.0
+"""
+
+_any_see_also = """\
+See Also
+--------
+numpy.any : Numpy version of this method.
+Series.any : Return whether any element is True.
+Series.all : Return whether all elements are True.
+DataFrame.any : Return whether any element is True over requested axis.
+DataFrame.all : Return whether all elements are True over requested axis.
+"""
+
+_any_desc = """\
+Return whether any element is True, potentially over an axis.
+
+Returns False unless there is at least one element within a series or
+along a Dataframe axis that is True or equivalent (e.g. non-zero or
+non-empty)."""
+
+_any_examples = """\
+Examples
+--------
+**Series**
+
+For Series input, the output is a scalar indicating whether any element
+is True.
+
+>>> pd.Series([False, False]).any()
+False
+>>> pd.Series([True, False]).any()
+True
+>>> pd.Series([], dtype="float64").any()
+False
+>>> pd.Series([np.nan]).any()
+False
+>>> pd.Series([np.nan]).any(skipna=False)
+True
+
+**DataFrame**
+
+Whether each column contains at least one True element (the default).
+
+>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
+>>> df
+   A  B  C
+0  1  0  0
+1  2  2  0
+
+>>> df.any()
+A     True
+B     True
+C    False
+dtype: bool
+
+Aggregating over the columns.
+
+>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
+>>> df
+       A  B
+0   True  1
+1  False  2
+
+>>> df.any(axis='columns')
+0    True
+1    True
+dtype: bool
+
+>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
+>>> df
+       A  B
+0   True  1
+1  False  0
+
+>>> df.any(axis='columns')
+0    True
+1    False
+dtype: bool
+
+Aggregating over the entire DataFrame with ``axis=None``.
+
+>>> df.any(axis=None)
+True
+
+`any` for an empty DataFrame is an empty Series.
+
+>>> pd.DataFrame([]).any()
+Series([], dtype: bool)
+"""
+
+_shared_docs["stat_func_example"] = """
+
+Examples
+--------
+>>> idx = pd.MultiIndex.from_arrays([
+...     ['warm', 'warm', 'cold', 'cold'],
+...     ['dog', 'falcon', 'fish', 'spider']],
+...     names=['blooded', 'animal'])
+>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
+>>> s
+blooded  animal
+warm     dog       4
+         falcon    2
+cold     fish      0
+         spider    8
+Name: legs, dtype: int64
+
+>>> s.{stat_func}()
+{default_output}"""
+
+_sum_examples = _shared_docs["stat_func_example"].format(
+    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
+)
+
+_sum_examples += """
+
+By default, the sum of an empty or all-NA Series is ``0``.
+
+>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
+0.0
+
+This can be controlled with the ``min_count`` parameter. For example, if
+you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
+
+>>> pd.Series([], dtype="float64").sum(min_count=1)
+nan
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).sum()
+0.0
+
+>>> pd.Series([np.nan]).sum(min_count=1)
+nan"""
+
+_max_examples: str = _shared_docs["stat_func_example"].format(
+    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
+)
+
+_min_examples: str = _shared_docs["stat_func_example"].format(
+    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
+)
+
+_skew_see_also = """
+
+See Also
+--------
+Series.skew : Return unbiased skew over requested axis.
+Series.var : Return unbiased variance over requested axis.
+Series.std : Return unbiased standard deviation over requested axis."""
+
+_stat_func_see_also = """
+
+See Also
+--------
+Series.sum : Return the sum.
+Series.min : Return the minimum.
+Series.max : Return the maximum.
+Series.idxmin : Return the index of the minimum.
+Series.idxmax : Return the index of the maximum.
+DataFrame.sum : Return the sum over the requested axis.
+DataFrame.min : Return the minimum over the requested axis.
+DataFrame.max : Return the maximum over the requested axis.
+DataFrame.idxmin : Return the index of the minimum over the requested axis.
+DataFrame.idxmax : Return the index of the maximum over the requested axis."""
+
+_prod_examples = """
+
+Examples
+--------
+By default, the product of an empty or all-NA Series is ``1``
+
+>>> pd.Series([], dtype="float64").prod()
+1.0
+
+This can be controlled with the ``min_count`` parameter
+
+>>> pd.Series([], dtype="float64").prod(min_count=1)
+nan
+
+Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+empty series identically.
+
+>>> pd.Series([np.nan]).prod()
+1.0
+
+>>> pd.Series([np.nan]).prod(min_count=1)
+nan"""
+
+_min_count_stub = """\
+min_count : int, default 0
+    The required number of valid values to perform the operation. If fewer than
+    ``min_count`` non-NA values are present the result will be NA.
+"""
+
+
+def make_doc(name: str, ndim: int) -> str:
+    """
+    Generate the docstring for a Series/DataFrame reduction.
+    """
+    if ndim == 1:
+        name1 = "scalar"
+        name2 = "Series"
+        axis_descr = "{index (0)}"
+    else:
+        name1 = "Series"
+        name2 = "DataFrame"
+        axis_descr = "{index (0), columns (1)}"
+
+    if name == "any":
+        base_doc = _bool_doc
+        desc = _any_desc
+        see_also = _any_see_also
+        examples = _any_examples
+        kwargs = {"empty_value": "False"}
+    elif name == "all":
+        base_doc = _bool_doc
+        desc = _all_desc
+        see_also = _all_see_also
+        examples = _all_examples
+        kwargs = {"empty_value": "True"}
+    elif name == "min":
+        base_doc = _num_doc
+        desc = (
+            "Return the minimum of the values over the requested axis.\n\n"
+            "If you want the *index* of the minimum, use ``idxmin``. This is "
+            "the equivalent of the ``numpy.ndarray`` method ``argmin``."
+        )
+        see_also = _stat_func_see_also
+        examples = _min_examples
+        kwargs = {"min_count": ""}
+    elif name == "max":
+        base_doc = _num_doc
+        desc = (
+            "Return the maximum of the values over the requested axis.\n\n"
+            "If you want the *index* of the maximum, use ``idxmax``. This is "
+            "the equivalent of the ``numpy.ndarray`` method ``argmax``."
+        )
+        see_also = _stat_func_see_also
+        examples = _max_examples
+        kwargs = {"min_count": ""}
+
+    elif name == "sum":
+        base_doc = _sum_prod_doc
+        desc = (
+            "Return the sum of the values over the requested axis.\n\n"
+            "This is equivalent to the method ``numpy.sum``."
+        )
+        see_also = _stat_func_see_also
+        examples = _sum_examples
+        kwargs = {"min_count": _min_count_stub}
+
+    elif name == "prod":
+        base_doc = _sum_prod_doc
+        desc = "Return the product of the values over the requested axis."
+        see_also = _stat_func_see_also
+        examples = _prod_examples
+        kwargs = {"min_count": _min_count_stub}
+
+    elif name == "median":
+        base_doc = _num_doc
+        desc = "Return the median of the values over the requested axis."
+        see_also = _stat_func_see_also
+        examples = """
+
+            Examples
+            --------
+            >>> s = pd.Series([1, 2, 3])
+            >>> s.median()
+            2.0
+
+            With a DataFrame
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
+            >>> df
+                   a   b
+            tiger  1   2
+            zebra  2   3
+            >>> df.median()
+            a   1.5
+            b   2.5
+            dtype: float64
+
+            Using axis=1
+
+            >>> df.median(axis=1)
+            tiger   1.5
+            zebra   2.5
+            dtype: float64
+
+            In this case, `numeric_only` should be set to `True`
+            to avoid getting an error.
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
+            ...                   index=['tiger', 'zebra'])
+            >>> df.median(numeric_only=True)
+            a   1.5
+            dtype: float64"""
+        kwargs = {"min_count": ""}
+
+    elif name == "mean":
+        base_doc = _num_doc
+        desc = "Return the mean of the values over the requested axis."
+        see_also = _stat_func_see_also
+        examples = """
+
+            Examples
+            --------
+            >>> s = pd.Series([1, 2, 3])
+            >>> s.mean()
+            2.0
+
+            With a DataFrame
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
+            >>> df
+                   a   b
+            tiger  1   2
+            zebra  2   3
+            >>> df.mean()
+            a   1.5
+            b   2.5
+            dtype: float64
+
+            Using axis=1
+
+            >>> df.mean(axis=1)
+            tiger   1.5
+            zebra   2.5
+            dtype: float64
+
+            In this case, `numeric_only` should be set to `True` to avoid
+            getting an error.
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
+            ...                   index=['tiger', 'zebra'])
+            >>> df.mean(numeric_only=True)
+            a   1.5
+            dtype: float64"""
+        kwargs = {"min_count": ""}
+
+    elif name == "var":
+        base_doc = _num_ddof_doc
+        desc = (
+            "Return unbiased variance over requested axis.\n\nNormalized by "
+            "N-1 by default. This can be changed using the ddof argument."
+        )
+        examples = _var_examples
+        see_also = ""
+        kwargs = {"notes": ""}
+
+    elif name == "std":
+        base_doc = _num_ddof_doc
+        desc = (
+            "Return sample standard deviation over requested axis."
+            "\n\nNormalized by N-1 by default. This can be changed using the "
+            "ddof argument."
+        )
+        examples = _std_examples
+        see_also = _std_see_also.format(name2=name2)
+        kwargs = {"notes": "", "return_desc": _std_return_desc}
+
+    elif name == "sem":
+        base_doc = _num_ddof_doc
+        desc = (
+            "Return unbiased standard error of the mean over requested "
+            "axis.\n\nNormalized by N-1 by default. This can be changed "
+            "using the ddof argument"
+        )
+        examples = """
+
+            Examples
+            --------
+            >>> s = pd.Series([1, 2, 3])
+            >>> round(s.sem(), 6)
+            0.57735
+
+            With a DataFrame
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
+            >>> df
+                   a   b
+            tiger  1   2
+            zebra  2   3
+            >>> df.sem()
+            a   0.5
+            b   0.5
+            dtype: float64
+
+            Using axis=1
+
+            >>> df.sem(axis=1)
+            tiger   0.5
+            zebra   0.5
+            dtype: float64
+
+            In this case, `numeric_only` should be set to `True`
+            to avoid getting an error.
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
+            ...                   index=['tiger', 'zebra'])
+            >>> df.sem(numeric_only=True)
+            a   0.5
+            dtype: float64"""
+        see_also = _sem_see_also.format(name2=name2)
+        kwargs = {"notes": "", "return_desc": _sem_return_desc}
+
+    elif name == "skew":
+        base_doc = _num_doc
+        desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1."
+        see_also = _skew_see_also
+        examples = """
+
+            Examples
+            --------
+            >>> s = pd.Series([1, 2, 3])
+            >>> s.skew()
+            0.0
+
+            With a DataFrame
+
+            >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]},
+            ...                   index=['tiger', 'zebra', 'cow'])
+            >>> df
+                    a   b   c
+            tiger   1   2   1
+            zebra   2   3   3
+            cow     3   4   5
+            >>> df.skew()
+            a   0.0
+            b   0.0
+            c   0.0
+            dtype: float64
+
+            Using axis=1
+
+            >>> df.skew(axis=1)
+            tiger   1.732051
+            zebra  -1.732051
+            cow     0.000000
+            dtype: float64
+
+            In this case, `numeric_only` should be set to `True` to avoid
+            getting an error.
+
+            >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']},
+            ...                   index=['tiger', 'zebra', 'cow'])
+            >>> df.skew(numeric_only=True)
+            a   0.0
+            dtype: float64"""
+        kwargs = {"min_count": ""}
+
+    elif name == "kurt":
+        base_doc = _num_doc
+        desc = (
+            "Return unbiased kurtosis over requested axis.\n\n"
+            "Kurtosis obtained using Fisher's definition of\n"
+            "kurtosis (kurtosis of normal == 0.0). Normalized "
+            "by N-1."
+        )
+        see_also = ""
+        examples = """
+
+            Examples
+            --------
+            >>> s = pd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse'])
+            >>> s
+            cat    1
+            dog    2
+            dog    2
+            mouse  3
+            dtype: int64
+            >>> s.kurt()
+            1.5
+
+            With a DataFrame
+
+            >>> df = pd.DataFrame({'a': [1, 2, 2, 3], 'b': [3, 4, 4, 4]},
+            ...                   index=['cat', 'dog', 'dog', 'mouse'])
+            >>> df
+                   a   b
+              cat  1   3
+              dog  2   4
+              dog  2   4
+            mouse  3   4
+            >>> df.kurt()
+            a   1.5
+            b   4.0
+            dtype: float64
+
+            With axis=None
+
+            >>> df.kurt(axis=None)
+            -0.9886927196984727
+
+            Using axis=1
+
+            >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [3, 4], 'd': [1, 2]},
+            ...                   index=['cat', 'dog'])
+            >>> df.kurt(axis=1)
+            cat   -6.0
+            dog   -6.0
+            dtype: float64"""
+        kwargs = {"min_count": ""}
+
+    elif name == "cumsum":
+        if ndim == 1:
+            base_doc = _cnum_series_doc
+        else:
+            base_doc = _cnum_pd_doc
+
+        desc = "sum"
+        see_also = ""
+        examples = _cumsum_examples
+        kwargs = {"accum_func_name": "sum"}
+
+    elif name == "cumprod":
+        if ndim == 1:
+            base_doc = _cnum_series_doc
+        else:
+            base_doc = _cnum_pd_doc
+
+        desc = "product"
+        see_also = ""
+        examples = _cumprod_examples
+        kwargs = {"accum_func_name": "prod"}
+
+    elif name == "cummin":
+        if ndim == 1:
+            base_doc = _cnum_series_doc
+        else:
+            base_doc = _cnum_pd_doc
+
+        desc = "minimum"
+        see_also = ""
+        examples = _cummin_examples
+        kwargs = {"accum_func_name": "min"}
+
+    elif name == "cummax":
+        if ndim == 1:
+            base_doc = _cnum_series_doc
+        else:
+            base_doc = _cnum_pd_doc
+
+        desc = "maximum"
+        see_also = ""
+        examples = _cummax_examples
+        kwargs = {"accum_func_name": "max"}
+
+    else:
+        raise NotImplementedError
+
+    docstr = base_doc.format(
+        desc=desc,
+        name=name,
+        name1=name1,
+        name2=name2,
+        axis_descr=axis_descr,
+        see_also=see_also,
+        examples=examples,
+        **kwargs,
+    )
+    return docstr
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3a92cea53615e763c35198e5361bd78ae2f8fb4
--- /dev/null
+++ b/pandas/core/indexing.py
@@ -0,0 +1,2796 @@
+from __future__ import annotations
+
+from contextlib import suppress
+import sys
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Self,
+    cast,
+    final,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs.indexing import NDFrameIndexerBase
+from pandas._libs.lib import item_from_zerodim
+from pandas.compat import CHAINED_WARNING_DISABLED
+from pandas.compat._constants import REF_COUNT_IDX
+from pandas.errors import (
+    AbstractMethodError,
+    ChainedAssignmentError,
+    IndexingError,
+    InvalidIndexError,
+    LossySetitemError,
+)
+from pandas.errors.cow import _chained_assignment_msg
+from pandas.util._decorators import (
+    doc,
+)
+
+from pandas.core.dtypes.cast import (
+    can_hold_element,
+    maybe_promote,
+)
+from pandas.core.dtypes.common import (
+    is_array_like,
+    is_bool_dtype,
+    is_hashable,
+    is_integer,
+    is_iterator,
+    is_list_like,
+    is_numeric_dtype,
+    is_object_dtype,
+    is_scalar,
+    is_sequence,
+)
+from pandas.core.dtypes.concat import concat_compat
+from pandas.core.dtypes.dtypes import ExtensionDtype
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import (
+    construct_1d_array_from_inferred_fill_value,
+    infer_fill_value,
+    is_valid_na_for_dtype,
+    isna,
+    na_value_for_dtype,
+)
+
+from pandas.core import algorithms as algos
+import pandas.core.common as com
+from pandas.core.construction import (
+    array as pd_array,
+    extract_array,
+)
+from pandas.core.indexers import (
+    check_array_indexer,
+    is_list_like_indexer,
+    is_scalar_indexer,
+    length_of_indexer,
+)
+from pandas.core.indexes.api import (
+    Index,
+    MultiIndex,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Hashable,
+        Sequence,
+    )
+
+    from pandas._typing import (
+        Axis,
+        AxisInt,
+        T,
+        npt,
+    )
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+# "null slice"
+_NS = slice(None, None)
+_one_ellipsis_message = "indexer may only contain one '...' entry"
+
+
+# the public IndexSlicerMaker
+class _IndexSlice:
+    """
+    Create an object to more easily perform multi-index slicing.
+
+    See Also
+    --------
+    MultiIndex.remove_unused_levels : New MultiIndex with no unused levels.
+
+    Notes
+    -----
+    See :ref:`Defined Levels <advanced.shown_levels>`
+    for further info on slicing a MultiIndex.
+
+    Examples
+    --------
+    >>> midx = pd.MultiIndex.from_product([["A0", "A1"], ["B0", "B1", "B2", "B3"]])
+    >>> columns = ["foo", "bar"]
+    >>> dfmi = pd.DataFrame(
+    ...     np.arange(16).reshape((len(midx), len(columns))),
+    ...     index=midx,
+    ...     columns=columns,
+    ... )
+
+    Using the default slice command:
+
+    >>> dfmi.loc[(slice(None), slice("B0", "B1")), :]
+               foo  bar
+        A0 B0    0    1
+           B1    2    3
+        A1 B0    8    9
+           B1   10   11
+
+    Using the IndexSlice class for a more intuitive command:
+
+    >>> idx = pd.IndexSlice
+    >>> dfmi.loc[idx[:, "B0":"B1"], :]
+               foo  bar
+        A0 B0    0    1
+           B1    2    3
+        A1 B0    8    9
+           B1   10   11
+    """
+
+    def __getitem__(self, arg):
+        return arg
+
+
+IndexSlice = _IndexSlice()
+IndexSlice.__module__ = "pandas"
+
+
+class IndexingMixin:
+    """
+    Mixin for adding .loc/.iloc/.at/.iat to Dataframes and Series.
+    """
+
+    @property
+    def iloc(self) -> _iLocIndexer:
+        """
+        Purely integer-location based indexing for selection by position.
+
+        .. versionchanged:: 3.0
+
+           Callables which return a tuple are deprecated as input.
+
+        ``.iloc[]`` is primarily integer position based (from ``0`` to
+        ``length-1`` of the axis), but may also be used with a boolean
+        array.
+
+        Allowed inputs are:
+
+        - An integer, e.g. ``5``.
+        - A list or array of integers, e.g. ``[4, 3, 0]``.
+        - A slice object with ints, e.g. ``1:7``.
+        - A boolean array.
+        - A ``callable`` function with one argument (the calling Series or
+          DataFrame) and that returns valid output for indexing (one of the above).
+          This is useful in method chains, when you don't have a reference to the
+          calling object, but would like to base your selection on
+          some value.
+        - A tuple of row and column indexes. The tuple elements consist of one of the
+          above inputs, e.g. ``(0, 1)``.
+
+        ``.iloc`` will raise ``IndexError`` if a requested indexer is
+        out-of-bounds, except *slice* indexers which allow out-of-bounds
+        indexing (this conforms with python/numpy *slice* semantics).
+
+        See more at :ref:`Selection by Position <indexing.integer>`.
+
+        See Also
+        --------
+        DataFrame.iat : Fast integer location scalar accessor.
+        DataFrame.loc : Purely label-location based indexer for selection by label.
+        Series.iloc : Purely integer-location based indexing for
+                       selection by position.
+
+        Examples
+        --------
+        >>> mydict = [
+        ...     {"a": 1, "b": 2, "c": 3, "d": 4},
+        ...     {"a": 100, "b": 200, "c": 300, "d": 400},
+        ...     {"a": 1000, "b": 2000, "c": 3000, "d": 4000},
+        ... ]
+        >>> df = pd.DataFrame(mydict)
+        >>> df
+              a     b     c     d
+        0     1     2     3     4
+        1   100   200   300   400
+        2  1000  2000  3000  4000
+
+        **Indexing just the rows**
+
+        With a scalar integer.
+
+        >>> type(df.iloc[0])
+        <class 'pandas.Series'>
+        >>> df.iloc[0]
+        a    1
+        b    2
+        c    3
+        d    4
+        Name: 0, dtype: int64
+
+        With a list of integers.
+
+        >>> df.iloc[[0]]
+           a  b  c  d
+        0  1  2  3  4
+        >>> type(df.iloc[[0]])
+        <class 'pandas.DataFrame'>
+
+        >>> df.iloc[[0, 1]]
+             a    b    c    d
+        0    1    2    3    4
+        1  100  200  300  400
+
+        With a `slice` object.
+
+        >>> df.iloc[:3]
+              a     b     c     d
+        0     1     2     3     4
+        1   100   200   300   400
+        2  1000  2000  3000  4000
+
+        With a boolean mask the same length as the index.
+
+        >>> df.iloc[[True, False, True]]
+              a     b     c     d
+        0     1     2     3     4
+        2  1000  2000  3000  4000
+
+        With a callable, useful in method chains. The `x` passed
+        to the ``lambda`` is the DataFrame being sliced. This selects
+        the rows whose index label even.
+
+        >>> df.iloc[lambda x: x.index % 2 == 0]
+              a     b     c     d
+        0     1     2     3     4
+        2  1000  2000  3000  4000
+
+        **Indexing both axes**
+
+        You can mix the indexer types for the index and columns. Use ``:`` to
+        select the entire axis.
+
+        With scalar integers.
+
+        >>> df.iloc[0, 1]
+        np.int64(2)
+
+        With lists of integers.
+
+        >>> df.iloc[[0, 2], [1, 3]]
+              b     d
+        0     2     4
+        2  2000  4000
+
+        With `slice` objects.
+
+        >>> df.iloc[1:3, 0:3]
+              a     b     c
+        1   100   200   300
+        2  1000  2000  3000
+
+        With a boolean array whose length matches the columns.
+
+        >>> df.iloc[:, [True, False, True, False]]
+              a     c
+        0     1     3
+        1   100   300
+        2  1000  3000
+
+        With a callable function that expects the Series or DataFrame.
+
+        >>> df.iloc[:, lambda df: [0, 2]]
+              a     c
+        0     1     3
+        1   100   300
+        2  1000  3000
+        """
+        return _iLocIndexer("iloc", self)
+
+    @property
+    def loc(self) -> _LocIndexer:
+        """
+        Access a group of rows and columns by label(s) or a boolean array.
+
+        ``.loc[]`` is primarily label based, but may also be used with a
+        boolean array.
+
+        Allowed inputs are:
+
+        - A single label, e.g. ``5`` or ``'a'``, (note that ``5`` is
+          interpreted as a *label* of the index, and **never** as an
+          integer position along the index).
+        - A list or array of labels, e.g. ``['a', 'b', 'c']``.
+        - A slice object with labels, e.g. ``'a':'f'``.
+
+          .. warning:: Note that contrary to usual python slices, **both** the
+              start and the stop are included
+
+        - A boolean array of the same length as the axis being sliced,
+          e.g. ``[True, False, True]``.
+        - An alignable boolean Series. The index of the key will be aligned before
+          masking.
+        - An alignable Index. The Index of the returned selection will be the input.
+        - A ``callable`` function with one argument (the calling Series or
+          DataFrame) and that returns valid output for indexing (one of the above)
+
+        See more at :ref:`Selection by Label <indexing.label>`.
+
+        Raises
+        ------
+        KeyError
+            If any items are not found.
+        IndexingError
+            If an indexed key is passed and its index is unalignable to the frame index.
+
+        See Also
+        --------
+        DataFrame.at : Access a single value for a row/column label pair.
+        DataFrame.iloc : Access group of rows and columns by integer position(s).
+        DataFrame.xs : Returns a cross-section (row(s) or column(s)) from the
+                       Series/DataFrame.
+        Series.loc : Access group of values using labels.
+
+        Examples
+        --------
+        **Getting values**
+
+        >>> df = pd.DataFrame(
+        ...     [[1, 2], [4, 5], [7, 8]],
+        ...     index=["cobra", "viper", "sidewinder"],
+        ...     columns=["max_speed", "shield"],
+        ... )
+        >>> df
+                    max_speed  shield
+        cobra               1       2
+        viper               4       5
+        sidewinder          7       8
+
+        Single label. Note this returns the row as a Series.
+
+        >>> df.loc["viper"]
+        max_speed    4
+        shield       5
+        Name: viper, dtype: int64
+
+        List of labels. Note using ``[[]]`` returns a DataFrame.
+
+        >>> df.loc[["viper", "sidewinder"]]
+                    max_speed  shield
+        viper               4       5
+        sidewinder          7       8
+
+        Single label for row and column
+
+        >>> df.loc["cobra", "shield"]
+        np.int64(2)
+
+        Slice with labels for row and single label for column. As mentioned
+        above, note that both the start and stop of the slice are included.
+
+        >>> df.loc["cobra":"viper", "max_speed"]
+        cobra    1
+        viper    4
+        Name: max_speed, dtype: int64
+
+        Boolean list with the same length as the row axis
+
+        >>> df.loc[[False, False, True]]
+                    max_speed  shield
+        sidewinder          7       8
+
+        Alignable boolean Series:
+
+        >>> df.loc[
+        ...     pd.Series([False, True, False], index=["viper", "sidewinder", "cobra"])
+        ... ]
+                             max_speed  shield
+        sidewinder          7       8
+
+        Index (same behavior as ``df.reindex``)
+
+        >>> df.loc[pd.Index(["cobra", "viper"], name="foo")]
+               max_speed  shield
+        foo
+        cobra          1       2
+        viper          4       5
+
+        Conditional that returns a boolean Series
+
+        >>> df.loc[df["shield"] > 6]
+                    max_speed  shield
+        sidewinder          7       8
+
+        Conditional that returns a boolean Series with column labels specified
+
+        >>> df.loc[df["shield"] > 6, ["max_speed"]]
+                    max_speed
+        sidewinder          7
+
+        Multiple conditional using ``&`` that returns a boolean Series
+
+        >>> df.loc[(df["max_speed"] > 1) & (df["shield"] < 8)]
+                    max_speed  shield
+        viper          4       5
+
+        Multiple conditional using ``|`` that returns a boolean Series
+
+        >>> df.loc[(df["max_speed"] > 4) | (df["shield"] < 5)]
+                    max_speed  shield
+        cobra               1       2
+        sidewinder          7       8
+
+        Please ensure that each condition is wrapped in parentheses ``()``.
+        See the :ref:`user guide<indexing.boolean>`
+        for more details and explanations of Boolean indexing.
+
+        .. note::
+            If you find yourself using 3 or more conditionals in ``.loc[]``,
+            consider using :ref:`advanced indexing<advanced.advanced_hierarchical>`.
+
+            See below for using ``.loc[]`` on MultiIndex DataFrames.
+
+        Callable that returns a boolean Series
+
+        >>> df.loc[lambda df: df["shield"] == 8]
+                    max_speed  shield
+        sidewinder          7       8
+
+        **Setting values**
+
+        Set value for all items matching the list of labels
+
+        >>> df.loc[["viper", "sidewinder"], ["shield"]] = 50
+        >>> df
+                    max_speed  shield
+        cobra               1       2
+        viper               4      50
+        sidewinder          7      50
+
+        Set value for an entire row
+
+        >>> df.loc["cobra"] = 10
+        >>> df
+                    max_speed  shield
+        cobra              10      10
+        viper               4      50
+        sidewinder          7      50
+
+        Set value for an entire column
+
+        >>> df.loc[:, "max_speed"] = 30
+        >>> df
+                    max_speed  shield
+        cobra              30      10
+        viper              30      50
+        sidewinder         30      50
+
+        Set value for rows matching callable condition
+
+        >>> df.loc[df["shield"] > 35] = 0
+        >>> df
+                    max_speed  shield
+        cobra              30      10
+        viper               0       0
+        sidewinder          0       0
+
+        Add value matching location
+
+        >>> df.loc["viper", "shield"] += 5
+        >>> df
+                    max_speed  shield
+        cobra              30      10
+        viper               0       5
+        sidewinder          0       0
+
+        Setting using a ``Series`` or a ``DataFrame`` sets the values matching the
+        index labels, not the index positions.
+
+        >>> shuffled_df = df.loc[["viper", "cobra", "sidewinder"]]
+        >>> df.loc[:] += shuffled_df
+        >>> df
+                    max_speed  shield
+        cobra              60      20
+        viper               0      10
+        sidewinder          0       0
+
+        **Getting values on a DataFrame with an index that has integer labels**
+
+        Another example using integers for the index
+
+        >>> df = pd.DataFrame(
+        ...     [[1, 2], [4, 5], [7, 8]],
+        ...     index=[7, 8, 9],
+        ...     columns=["max_speed", "shield"],
+        ... )
+        >>> df
+           max_speed  shield
+        7          1       2
+        8          4       5
+        9          7       8
+
+        Slice with integer labels for rows. As mentioned above, note that both
+        the start and stop of the slice are included.
+
+        >>> df.loc[7:9]
+           max_speed  shield
+        7          1       2
+        8          4       5
+        9          7       8
+
+        **Getting values with a MultiIndex**
+
+        A number of examples using a DataFrame with a MultiIndex
+
+        >>> tuples = [
+        ...     ("cobra", "mark i"),
+        ...     ("cobra", "mark ii"),
+        ...     ("sidewinder", "mark i"),
+        ...     ("sidewinder", "mark ii"),
+        ...     ("viper", "mark ii"),
+        ...     ("viper", "mark iii"),
+        ... ]
+        >>> index = pd.MultiIndex.from_tuples(tuples)
+        >>> values = [[12, 2], [0, 4], [10, 20], [1, 4], [7, 1], [16, 36]]
+        >>> df = pd.DataFrame(values, columns=["max_speed", "shield"], index=index)
+        >>> df
+                             max_speed  shield
+        cobra      mark i           12       2
+                   mark ii           0       4
+        sidewinder mark i           10      20
+                   mark ii           1       4
+        viper      mark ii           7       1
+                   mark iii         16      36
+
+        Single label. Note this returns a DataFrame with a single index.
+
+        >>> df.loc["cobra"]
+                 max_speed  shield
+        mark i          12       2
+        mark ii          0       4
+
+        Single index tuple. Note this returns a Series.
+
+        >>> df.loc[("cobra", "mark ii")]
+        max_speed    0
+        shield       4
+        Name: (cobra, mark ii), dtype: int64
+
+        Single label for row and column. Similar to passing in a tuple, this
+        returns a Series.
+
+        >>> df.loc["cobra", "mark i"]
+        max_speed    12
+        shield        2
+        Name: (cobra, mark i), dtype: int64
+
+        Single tuple. Note using ``[[]]`` returns a DataFrame.
+
+        >>> df.loc[[("cobra", "mark ii")]]
+                       max_speed  shield
+        cobra mark ii          0       4
+
+        Single tuple for the index with a single label for the column
+
+        >>> df.loc[("cobra", "mark i"), "shield"]
+        np.int64(2)
+
+        Slice from index tuple to single label
+
+        >>> df.loc[("cobra", "mark i") : "viper"]
+                             max_speed  shield
+        cobra      mark i           12       2
+                   mark ii           0       4
+        sidewinder mark i           10      20
+                   mark ii           1       4
+        viper      mark ii           7       1
+                   mark iii         16      36
+
+        Slice from index tuple to index tuple
+
+        >>> df.loc[("cobra", "mark i") : ("viper", "mark ii")]
+                            max_speed  shield
+        cobra      mark i          12       2
+                   mark ii          0       4
+        sidewinder mark i          10      20
+                   mark ii          1       4
+        viper      mark ii          7       1
+
+        Please see the :ref:`user guide<advanced.advanced_hierarchical>`
+        for more details and explanations of advanced indexing.
+
+        **Assignment with Series**
+
+        When assigning a Series to .loc[row_indexer, col_indexer], pandas aligns
+        the Series by index labels, not by order or position.
+
+        Series assignment with .loc and index alignment:
+
+        >>> df = pd.DataFrame({"A": [1, 2, 3]}, index=[0, 1, 2])
+        >>> s = pd.Series([10, 20], index=[1, 0])  # Note reversed order
+        >>> df.loc[:, "B"] = s  # Aligns by index, not order
+        >>> df
+           A   B
+        0  1  20.0
+        1  2  10.0
+        2  3 NaN
+        """
+        return _LocIndexer("loc", self)
+
+    @property
+    def at(self) -> _AtIndexer:
+        """
+        Access a single value for a row/column label pair.
+
+        Similar to ``loc``, in that both provide label-based lookups. Use
+        ``at`` if you only need to get or set a single value in a DataFrame
+        or Series.
+
+        Raises
+        ------
+        KeyError
+            If getting a value and 'label' does not exist in a DataFrame or Series.
+
+        ValueError
+            If row/column label pair is not a tuple or if any label
+            from the pair is not a scalar for DataFrame.
+            If label is list-like (*excluding* NamedTuple) for Series.
+
+        See Also
+        --------
+        DataFrame.at : Access a single value for a row/column pair by label.
+        DataFrame.iat : Access a single value for a row/column pair by integer
+            position.
+        DataFrame.loc : Access a group of rows and columns by label(s).
+        DataFrame.iloc : Access a group of rows and columns by integer
+            position(s).
+        Series.at : Access a single value by label.
+        Series.iat : Access a single value by integer position.
+        Series.loc : Access a group of rows by label(s).
+        Series.iloc : Access a group of rows by integer position(s).
+
+        Notes
+        -----
+        See :ref:`Fast scalar value getting and setting <indexing.basics.get_value>`
+        for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [[0, 2, 3], [0, 4, 1], [10, 20, 30]],
+        ...     index=[4, 5, 6],
+        ...     columns=["A", "B", "C"],
+        ... )
+        >>> df
+            A   B   C
+        4   0   2   3
+        5   0   4   1
+        6  10  20  30
+
+        Get value at specified row/column pair
+
+        >>> df.at[4, "B"]
+        np.int64(2)
+
+        Set value at specified row/column pair
+
+        >>> df.at[4, "B"] = 10
+        >>> df.at[4, "B"]
+        np.int64(10)
+
+        Get value within a Series
+
+        >>> df.loc[5].at["B"]
+        np.int64(4)
+        """
+        return _AtIndexer("at", self)
+
+    @property
+    def iat(self) -> _iAtIndexer:
+        """
+        Access a single value for a row/column pair by integer position.
+
+        Similar to ``iloc``, in that both provide integer-based lookups. Use
+        ``iat`` if you only need to get or set a single value in a DataFrame
+        or Series.
+
+        Raises
+        ------
+        IndexError
+            When integer position is out of bounds.
+
+        See Also
+        --------
+        DataFrame.at : Access a single value for a row/column label pair.
+        DataFrame.loc : Access a group of rows and columns by label(s).
+        DataFrame.iloc : Access a group of rows and columns by integer position(s).
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [[0, 2, 3], [0, 4, 1], [10, 20, 30]], columns=["A", "B", "C"]
+        ... )
+        >>> df
+            A   B   C
+        0   0   2   3
+        1   0   4   1
+        2  10  20  30
+
+        Get value at specified row/column pair
+
+        >>> df.iat[1, 2]
+        np.int64(1)
+
+        Set value at specified row/column pair
+
+        >>> df.iat[1, 2] = 10
+        >>> df.iat[1, 2]
+        np.int64(10)
+
+        Get value within a series
+
+        >>> df.loc[0].iat[1]
+        np.int64(2)
+        """
+        return _iAtIndexer("iat", self)
+
+
+class _LocationIndexer(NDFrameIndexerBase):
+    _valid_types: str
+    axis: AxisInt | None = None
+
+    # sub-classes need to set _takeable
+    _takeable: bool
+
+    @final
+    def __call__(self, axis: Axis | None = None) -> Self:
+        # we need to return a copy of ourselves
+        new_self = type(self)(self.name, self.obj)
+
+        if axis is not None:
+            axis_int_none = self.obj._get_axis_number(axis)
+        else:
+            axis_int_none = axis
+        new_self.axis = axis_int_none
+        return new_self
+
+    def _get_setitem_indexer(self, key):
+        """
+        Convert a potentially-label-based key into a positional indexer.
+        """
+        if self.name == "loc":
+            # always holds here bc iloc overrides _get_setitem_indexer
+            self._ensure_listlike_indexer(key, axis=self.axis)
+
+        if isinstance(key, tuple):
+            for x in key:
+                check_dict_or_set_indexers(x)
+
+        if self.axis is not None:
+            key = _tupleize_axis_indexer(self.ndim, self.axis, key)
+
+        ax = self.obj._get_axis(0)
+
+        if (
+            isinstance(ax, MultiIndex)
+            and self.name != "iloc"
+            and is_hashable(key, allow_slice=False)
+        ):
+            with suppress(KeyError, InvalidIndexError):
+                # TypeError e.g. passed a bool
+                return ax.get_loc(key)
+
+        if isinstance(key, tuple):
+            with suppress(IndexingError):
+                # suppress "Too many indexers"
+                return self._convert_tuple(key)
+
+        if isinstance(key, range):
+            # GH#45479 test_loc_setitem_range_key
+            key = list(key)
+
+        return self._convert_to_indexer(key, axis=0)
+
+    @final
+    def _maybe_mask_setitem_value(self, indexer, value):
+        """
+        If we have obj.iloc[mask] = series_or_frame and series_or_frame has the
+        same length as obj, we treat this as obj.iloc[mask] = series_or_frame[mask],
+        similar to Series.__setitem__.
+
+        Note this is only for loc, not iloc.
+        """
+
+        if (
+            isinstance(indexer, tuple)
+            and len(indexer) == 2
+            and isinstance(value, (ABCSeries, ABCDataFrame))
+        ):
+            pi, icols = indexer
+            ndim = value.ndim
+            if com.is_bool_indexer(pi) and len(value) == len(pi):
+                newkey = pi.nonzero()[0]
+
+                if is_scalar_indexer(icols, self.ndim - 1) and ndim == 1:
+                    # e.g. test_loc_setitem_boolean_mask_allfalse
+                    if len(newkey) == 0:
+                        value = value.iloc[:0]
+                    else:
+                        # test_loc_setitem_ndframe_values_alignment
+                        value = self.obj.iloc._align_series(indexer, value)
+                    indexer = (newkey, icols)
+
+                elif (
+                    isinstance(icols, np.ndarray)
+                    and icols.dtype.kind == "i"
+                    and len(icols) == 1
+                ):
+                    if ndim == 1:
+                        # We implicitly broadcast, though numpy does not, see
+                        # github.com/pandas-dev/pandas/pull/45501#discussion_r789071825
+                        # test_loc_setitem_ndframe_values_alignment
+                        value = self.obj.iloc._align_series(indexer, value)
+                        indexer = (newkey, icols)
+
+                    elif ndim == 2 and value.shape[1] == 1:
+                        if len(newkey) == 0:
+                            value = value.iloc[:0]
+                        else:
+                            # test_loc_setitem_ndframe_values_alignment
+                            value = self.obj.iloc._align_frame(indexer, value)
+                        indexer = (newkey, icols)
+        elif com.is_bool_indexer(indexer):
+            indexer = indexer.nonzero()[0]
+
+        return indexer, value
+
+    @final
+    def _ensure_listlike_indexer(self, key, axis=None, value=None) -> None:
+        """
+        Ensure that a list-like of column labels are all present by adding them if
+        they do not already exist.
+
+        Parameters
+        ----------
+        key : list-like of column labels
+            Target labels.
+        axis : key axis if known
+        """
+        column_axis = 1
+
+        # column only exists in 2-dimensional DataFrame
+        if self.ndim != 2:
+            return
+
+        if isinstance(key, tuple) and len(key) > 1:
+            # key may be a tuple if we are .loc
+            # if length of key is > 1 set key to column part
+            # unless axis is already specified, then go with that
+            if axis is None:
+                axis = column_axis
+            key = key[axis]
+
+        if (
+            axis == column_axis
+            and not isinstance(self.obj.columns, MultiIndex)
+            and is_list_like_indexer(key)
+            and not com.is_bool_indexer(key)
+            and all(is_hashable(k) for k in key)
+        ):
+            # GH#38148
+            keys = self.obj.columns.union(key, sort=False)
+            diff = Index(key, copy=False).difference(self.obj.columns, sort=False)
+
+            if len(diff):
+                # e.g. if we are doing df.loc[:, ["A", "B"]] = 7 and "B"
+                #  is a new column, add the new columns with dtype=np.void
+                #  so that later when we go through setitem_single_column
+                #  we will use isetitem. Without this, the reindex_axis
+                #  below would create float64 columns in this example, which
+                #  would successfully hold 7, so we would end up with the wrong
+                #  dtype.
+                indexer = np.arange(len(keys), dtype=np.intp)
+                indexer[len(self.obj.columns) :] = -1
+                new_mgr = self.obj._mgr.reindex_indexer(
+                    keys, indexer=indexer, axis=0, only_slice=True, use_na_proxy=True
+                )
+                self.obj._mgr = new_mgr
+                return
+
+            self.obj._mgr = self.obj._mgr.reindex_axis(keys, axis=0, only_slice=True)
+
+    @final
+    def __setitem__(self, key, value) -> None:
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(self.obj) <= REF_COUNT_IDX:
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+
+        check_dict_or_set_indexers(key)
+        if isinstance(key, tuple):
+            key = (list(x) if is_iterator(x) else x for x in key)
+            key = tuple(com.apply_if_callable(x, self.obj) for x in key)
+        else:
+            maybe_callable = com.apply_if_callable(key, self.obj)
+            key = self._raise_callable_usage(key, maybe_callable)
+        indexer = self._get_setitem_indexer(key)
+        self._has_valid_setitem_indexer(key)
+
+        iloc: _iLocIndexer = (
+            cast("_iLocIndexer", self) if self.name == "iloc" else self.obj.iloc
+        )
+        iloc._setitem_with_indexer(indexer, value, self.name)
+
+    def _validate_key(self, key, axis: AxisInt) -> None:
+        """
+        Ensure that key is valid for current indexer.
+
+        Parameters
+        ----------
+        key : scalar, slice or list-like
+            Key requested.
+        axis : int
+            Dimension on which the indexing is being made.
+
+        Raises
+        ------
+        TypeError
+            If the key (or some element of it) has wrong type.
+        IndexError
+            If the key (or some element of it) is out of bounds.
+        KeyError
+            If the key was not found.
+        """
+        raise AbstractMethodError(self)
+
+    @final
+    def _expand_ellipsis(self, tup: tuple) -> tuple:
+        """
+        If a tuple key includes an Ellipsis, replace it with an appropriate
+        number of null slices.
+        """
+        if any(x is Ellipsis for x in tup):
+            if tup.count(Ellipsis) > 1:
+                raise IndexingError(_one_ellipsis_message)
+
+            if len(tup) == self.ndim:
+                # It is unambiguous what axis this Ellipsis is indexing,
+                #  treat as a single null slice.
+                i = tup.index(Ellipsis)
+                # FIXME: this assumes only one Ellipsis
+                new_key = (*tup[:i], _NS, *tup[i + 1 :])
+                return new_key
+
+            # TODO: other cases?  only one test gets here, and that is covered
+            #  by _validate_key_length
+        return tup
+
+    @final
+    def _validate_tuple_indexer(self, key: tuple) -> tuple:
+        """
+        Check the key for valid keys across my indexer.
+        """
+        key = self._validate_key_length(key)
+        key = self._expand_ellipsis(key)
+        for i, k in enumerate(key):
+            try:
+                self._validate_key(k, i)
+            except ValueError as err:
+                raise ValueError(
+                    f"Location based indexing can only have [{self._valid_types}] types"
+                ) from err
+        return key
+
+    @final
+    def _is_nested_tuple_indexer(self, tup: tuple) -> bool:
+        """
+        Returns
+        -------
+        bool
+        """
+        if any(isinstance(ax, MultiIndex) for ax in self.obj.axes):
+            return any(is_nested_tuple(tup, ax) for ax in self.obj.axes)
+        return False
+
+    @final
+    def _convert_tuple(self, key: tuple) -> tuple:
+        # Note: we assume _tupleize_axis_indexer has been called, if necessary.
+        self._validate_key_length(key)
+        keyidx = [self._convert_to_indexer(k, axis=i) for i, k in enumerate(key)]
+        return tuple(keyidx)
+
+    @final
+    def _validate_key_length(self, key: tuple) -> tuple:
+        if len(key) > self.ndim:
+            if key[0] is Ellipsis:
+                # e.g. Series.iloc[..., 3] reduces to just Series.iloc[3]
+                key = key[1:]
+                if Ellipsis in key:
+                    raise IndexingError(_one_ellipsis_message)
+                return self._validate_key_length(key)
+            raise IndexingError("Too many indexers")
+        return key
+
+    @final
+    def _getitem_tuple_same_dim(self, tup: tuple):
+        """
+        Index with indexers that should return an object of the same dimension
+        as self.obj.
+
+        This is only called after a failed call to _getitem_lowerdim.
+        """
+        retval = self.obj
+        # Selecting columns before rows is significantly faster
+        start_val = (self.ndim - len(tup)) + 1
+        for i, key in enumerate(reversed(tup)):
+            i = self.ndim - i - start_val
+            if com.is_null_slice(key):
+                continue
+
+            retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
+            # We should never have retval.ndim < self.ndim, as that should
+            #  be handled by the _getitem_lowerdim call above.
+            assert retval.ndim == self.ndim
+
+        if retval is self.obj:
+            # if all axes were a null slice (`df.loc[:, :]`), ensure we still
+            # return a new object (https://github.com/pandas-dev/pandas/pull/49469)
+            retval = retval.copy(deep=False)
+
+        return retval
+
+    @final
+    def _getitem_lowerdim(self, tup: tuple):
+        # we can directly get the axis result since the axis is specified
+        if self.axis is not None:
+            axis = self.obj._get_axis_number(self.axis)
+            return self._getitem_axis(tup, axis=axis)
+
+        # we may have a nested tuples indexer here
+        if self._is_nested_tuple_indexer(tup):
+            return self._getitem_nested_tuple(tup)
+
+        # we maybe be using a tuple to represent multiple dimensions here
+        ax0 = self.obj._get_axis(0)
+        # ...but iloc should handle the tuple as simple integer-location
+        # instead of checking it as multiindex representation (GH 13797)
+        if (
+            isinstance(ax0, MultiIndex)
+            and self.name != "iloc"
+            and not any(isinstance(x, slice) for x in tup)
+        ):
+            # Note: in all extant test cases, replacing the slice condition with
+            #  `all(is_hashable(x) or com.is_null_slice(x) for x in tup)`
+            #  is equivalent.
+            #  (see the other place where we call _handle_lowerdim_multi_index_axis0)
+            with suppress(IndexingError):
+                return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(tup)
+
+        tup = self._validate_key_length(tup)
+
+        # Reverse tuple so that we are indexing along columns before rows
+        # and avoid unintended dtype inference. # GH60600
+        for i, key in zip(range(len(tup) - 1, -1, -1), reversed(tup), strict=True):
+            if is_label_like(key) or is_list_like(key):
+                # We don't need to check for tuples here because those are
+                #  caught by the _is_nested_tuple_indexer check above.
+                section = self._getitem_axis(key, axis=i)
+
+                # We should never have a scalar section here, because
+                #  _getitem_lowerdim is only called after a check for
+                #  is_scalar_access, which that would be.
+                if section.ndim == self.ndim:
+                    # we're in the middle of slicing through a MultiIndex
+                    # revise the key wrt to `section` by inserting an _NS
+                    new_key = (*tup[:i], _NS, *tup[i + 1 :])
+
+                else:
+                    # Note: the section.ndim == self.ndim check above
+                    #  rules out having DataFrame here, so we dont need to worry
+                    #  about transposing.
+                    new_key = tup[:i] + tup[i + 1 :]
+
+                    if len(new_key) == 1:
+                        new_key = new_key[0]
+
+                # Slices should return views, but calling iloc/loc with a null
+                # slice returns a new object.
+                if com.is_null_slice(new_key):
+                    return section
+                # This is an elided recursive call to iloc/loc
+                return getattr(section, self.name)[new_key]
+
+        raise IndexingError("not applicable")
+
+    @final
+    def _getitem_nested_tuple(self, tup: tuple):
+        # we have a nested tuple so have at least 1 multi-index level
+        # we should be able to match up the dimensionality here
+
+        for key in tup:
+            check_dict_or_set_indexers(key)
+
+        # we have too many indexers for our dim, but have at least 1
+        # multi-index dimension, try to see if we have something like
+        # a tuple passed to a series with a multi-index
+        if len(tup) > self.ndim:
+            if self.name != "loc":
+                # This should never be reached, but let's be explicit about it
+                raise ValueError("Too many indices")  # pragma: no cover
+            if all(
+                is_hashable(x, allow_slice=False) or com.is_null_slice(x) for x in tup
+            ):
+                # GH#10521 Series should reduce MultiIndex dimensions instead of
+                #  DataFrame, IndexingError is not raised when slice(None,None,None)
+                #  with one row.
+                with suppress(IndexingError):
+                    return cast(_LocIndexer, self)._handle_lowerdim_multi_index_axis0(
+                        tup
+                    )
+            elif isinstance(self.obj, ABCSeries) and any(
+                isinstance(k, tuple) for k in tup
+            ):
+                # GH#35349 Raise if tuple in tuple for series
+                # Do this after the all-hashable-or-null-slice check so that
+                #  we are only getting non-hashable tuples, in particular ones
+                #  that themselves contain a slice entry
+                # See test_loc_series_getitem_too_many_dimensions
+                raise IndexingError("Too many indexers")
+
+            # this is a series with a multi-index specified a tuple of
+            # selectors
+            axis = self.axis or 0
+            return self._getitem_axis(tup, axis=axis)
+
+        # handle the multi-axis by taking sections and reducing
+        # this is iterative
+        obj = self.obj
+        # GH#41369 Loop in reverse order ensures indexing along columns before rows
+        # which selects only necessary blocks which avoids dtype conversion if possible
+        axis = len(tup) - 1
+        for key in reversed(tup):
+            if com.is_null_slice(key):
+                axis -= 1
+                continue
+
+            obj = getattr(obj, self.name)._getitem_axis(key, axis=axis)
+            axis -= 1
+
+            # if we have a scalar, we are done
+            if is_scalar(obj) or not hasattr(obj, "ndim"):
+                break
+
+        return obj
+
+    def _convert_to_indexer(self, key, axis: AxisInt):
+        raise AbstractMethodError(self)
+
+    def _raise_callable_usage(self, key: Any, maybe_callable: T) -> T:
+        # GH53533
+        if self.name == "iloc" and callable(key) and isinstance(maybe_callable, tuple):
+            raise ValueError(
+                "Returning a tuple from a callable with iloc is not allowed.",
+            )
+        return maybe_callable
+
+    @final
+    def __getitem__(self, key):
+        check_dict_or_set_indexers(key)
+        if type(key) is tuple:
+            key = (list(x) if is_iterator(x) else x for x in key)
+            key = tuple(com.apply_if_callable(x, self.obj) for x in key)
+            if self._is_scalar_access(key):
+                return self.obj._get_value(*key, takeable=self._takeable)
+            return self._getitem_tuple(key)
+        else:
+            # we by definition only have the 0th axis
+            axis = self.axis or 0
+
+            maybe_callable = com.apply_if_callable(key, self.obj)
+            maybe_callable = self._raise_callable_usage(key, maybe_callable)
+            return self._getitem_axis(maybe_callable, axis=axis)
+
+    def _is_scalar_access(self, key: tuple):
+        raise NotImplementedError
+
+    def _getitem_tuple(self, tup: tuple):
+        raise AbstractMethodError(self)
+
+    def _getitem_axis(self, key, axis: AxisInt):
+        raise NotImplementedError
+
+    def _has_valid_setitem_indexer(self, indexer) -> bool:
+        raise AbstractMethodError(self)
+
+    @final
+    def _getbool_axis(self, key, axis: AxisInt):
+        # caller is responsible for ensuring non-None axis
+        labels = self.obj._get_axis(axis)
+        key = check_bool_indexer(labels, key)
+        inds = key.nonzero()[0]
+        return self.obj.take(inds, axis=axis)
+
+
+@doc(IndexingMixin.loc)
+class _LocIndexer(_LocationIndexer):
+    _takeable: bool = False
+    _valid_types = (
+        "labels (MUST BE IN THE INDEX), slices of labels (BOTH "
+        "endpoints included! Can be slices of integers if the "
+        "index is integers), listlike of labels, boolean"
+    )
+
+    # -------------------------------------------------------------------
+    # Key Checks
+
+    @doc(_LocationIndexer._validate_key)
+    def _validate_key(self, key, axis: Axis) -> None:
+        # valid for a collection of labels (we check their presence later)
+        # slice of labels (where start-end in labels)
+        # slice of integers (only if in the labels)
+        # boolean not in slice and with boolean index
+        ax = self.obj._get_axis(axis)
+        if isinstance(key, bool) and not (
+            is_bool_dtype(ax.dtype)
+            or ax.dtype.name == "boolean"
+            or (
+                isinstance(ax, MultiIndex)
+                and is_bool_dtype(ax.get_level_values(0).dtype)
+            )
+        ):
+            raise KeyError(
+                f"{key}: boolean label can not be used without a boolean index"
+            )
+
+        if isinstance(key, slice) and (
+            isinstance(key.start, bool) or isinstance(key.stop, bool)
+        ):
+            raise TypeError(f"{key}: boolean values can not be used in a slice")
+
+    def _has_valid_setitem_indexer(self, indexer) -> bool:
+        return True
+
+    def _is_scalar_access(self, key: tuple) -> bool:
+        """
+        Returns
+        -------
+        bool
+        """
+        # this is a shortcut accessor to both .loc and .iloc
+        # that provide the equivalent access of .at and .iat
+        # a) avoid getting things via sections and (to minimize dtype changes)
+        # b) provide a performant path
+        if len(key) != self.ndim:
+            return False
+
+        for i, k in enumerate(key):
+            if not is_scalar(k):
+                return False
+
+            ax = self.obj.axes[i]
+            if isinstance(ax, MultiIndex):
+                return False
+
+            if isinstance(k, str) and ax._supports_partial_string_indexing:
+                # partial string indexing, df.loc['2000', 'A']
+                # should not be considered scalar
+                return False
+
+            if not ax._index_as_unique:
+                return False
+
+        return True
+
+    # -------------------------------------------------------------------
+    # MultiIndex Handling
+
+    def _multi_take_opportunity(self, tup: tuple) -> bool:
+        """
+        Check whether there is the possibility to use ``_multi_take``.
+
+        Currently the limit is that all axes being indexed, must be indexed with
+        list-likes.
+
+        Parameters
+        ----------
+        tup : tuple
+            Tuple of indexers, one per axis.
+
+        Returns
+        -------
+        bool
+            Whether the current indexing,
+            can be passed through `_multi_take`.
+        """
+        if not all(is_list_like_indexer(x) for x in tup):
+            return False
+
+        # just too complicated
+        return not any(com.is_bool_indexer(x) for x in tup)
+
+    def _multi_take(self, tup: tuple):
+        """
+        Create the indexers for the passed tuple of keys, and
+        executes the take operation. This allows the take operation to be
+        executed all at once, rather than once for each dimension.
+        Improving efficiency.
+
+        Parameters
+        ----------
+        tup : tuple
+            Tuple of indexers, one per axis.
+
+        Returns
+        -------
+        values: same type as the object being indexed
+        """
+        # GH 836
+        d = {
+            axis: self._get_listlike_indexer(key, axis)
+            for (key, axis) in zip(tup, self.obj._AXIS_ORDERS, strict=True)
+        }
+        return self.obj._reindex_with_indexers(d, allow_dups=True)
+
+    # -------------------------------------------------------------------
+
+    def _getitem_iterable(self, key, axis: AxisInt):
+        """
+        Index current object with an iterable collection of keys.
+
+        Parameters
+        ----------
+        key : iterable
+            Targeted labels.
+        axis : int
+            Dimension on which the indexing is being made.
+
+        Raises
+        ------
+        KeyError
+            If no key was found. Will change in the future to raise if not all
+            keys were found.
+
+        Returns
+        -------
+        scalar, DataFrame, or Series: indexed value(s).
+        """
+        # we assume that not com.is_bool_indexer(key), as that is
+        #  handled before we get here.
+        self._validate_key(key, axis)
+
+        # A collection of keys
+        keyarr, indexer = self._get_listlike_indexer(key, axis)
+        return self.obj._reindex_with_indexers(
+            {axis: [keyarr, indexer]}, allow_dups=True
+        )
+
+    def _getitem_tuple(self, tup: tuple):
+        with suppress(IndexingError):
+            tup = self._expand_ellipsis(tup)
+            return self._getitem_lowerdim(tup)
+
+        # no multi-index, so validate all of the indexers
+        tup = self._validate_tuple_indexer(tup)
+
+        # ugly hack for GH #836
+        if self._multi_take_opportunity(tup):
+            return self._multi_take(tup)
+
+        return self._getitem_tuple_same_dim(tup)
+
+    def _get_label(self, label, axis: AxisInt):
+        # GH#5567 this will fail if the label is not present in the axis.
+        return self.obj.xs(label, axis=axis)
+
+    def _handle_lowerdim_multi_index_axis0(self, tup: tuple):
+        # we have an axis0 multi-index, handle or raise
+        axis = self.axis or 0
+        try:
+            # fast path for series or for tup devoid of slices
+            return self._get_label(tup, axis=axis)
+
+        except KeyError as ek:
+            # raise KeyError if number of indexers match
+            # else IndexingError will be raised
+            if self.ndim < len(tup) <= self.obj.index.nlevels:
+                raise ek
+            raise IndexingError("No label returned") from ek
+
+    def _getitem_axis(self, key, axis: AxisInt):
+        key = item_from_zerodim(key)
+        if is_iterator(key):
+            key = list(key)
+        if key is Ellipsis:
+            key = slice(None)
+
+        labels = self.obj._get_axis(axis)
+
+        if isinstance(key, tuple) and isinstance(labels, MultiIndex):
+            key = tuple(key)
+
+        if isinstance(key, slice):
+            self._validate_key(key, axis)
+            return self._get_slice_axis(key, axis=axis)
+        elif com.is_bool_indexer(key):
+            return self._getbool_axis(key, axis=axis)
+        elif is_list_like_indexer(key):
+            # an iterable multi-selection
+            if not (isinstance(key, tuple) and isinstance(labels, MultiIndex)):
+                if hasattr(key, "ndim") and key.ndim > 1:
+                    raise ValueError("Cannot index with multidimensional key")
+
+                return self._getitem_iterable(key, axis=axis)
+
+            # nested tuple slicing
+            if is_nested_tuple(key, labels):
+                locs = labels.get_locs(key)
+                indexer: list[slice | npt.NDArray[np.intp]] = [slice(None)] * self.ndim
+                indexer[axis] = locs
+                return self.obj.iloc[tuple(indexer)]
+
+        # fall thru to straight lookup
+        self._validate_key(key, axis)
+        return self._get_label(key, axis=axis)
+
+    def _get_slice_axis(self, slice_obj: slice, axis: AxisInt):
+        """
+        This is pretty simple as we just have to deal with labels.
+        """
+        # caller is responsible for ensuring non-None axis
+        obj = self.obj
+        if not need_slice(slice_obj):
+            return obj.copy(deep=False)
+
+        labels = obj._get_axis(axis)
+        indexer = labels.slice_indexer(slice_obj.start, slice_obj.stop, slice_obj.step)
+
+        if isinstance(indexer, slice):
+            return self.obj._slice(indexer, axis=axis)
+        else:
+            # DatetimeIndex overrides Index.slice_indexer and may
+            #  return a DatetimeIndex instead of a slice object.
+            return self.obj.take(indexer, axis=axis)
+
+    def _convert_to_indexer(self, key, axis: AxisInt):
+        """
+        Convert indexing key into something we can use to do actual fancy
+        indexing on an ndarray.
+
+        Examples
+        ix[:5] -> slice(0, 5)
+        ix[[1,2,3]] -> [1,2,3]
+        ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz)
+
+        Going by Zen of Python?
+        'In the face of ambiguity, refuse the temptation to guess.'
+        raise AmbiguousIndexError with integer labels?
+        - No, prefer label-based indexing
+        """
+        labels = self.obj._get_axis(axis)
+
+        if isinstance(key, slice):
+            return labels._convert_slice_indexer(key, kind="loc")
+
+        if (
+            isinstance(key, tuple)
+            and not isinstance(labels, MultiIndex)
+            and self.ndim < 2
+            and len(key) > 1
+        ):
+            raise IndexingError("Too many indexers")
+
+        # Slices are not valid keys passed in by the user,
+        # even though they are hashable in Python 3.12
+        contains_slice = False
+        if isinstance(key, tuple):
+            contains_slice = any(isinstance(v, slice) for v in key)
+
+        if is_scalar(key) or (
+            isinstance(labels, MultiIndex) and is_hashable(key) and not contains_slice
+        ):
+            # Otherwise get_loc will raise InvalidIndexError
+
+            # if we are a label return me
+            try:
+                return labels.get_loc(key)
+            except LookupError:
+                if isinstance(key, tuple) and isinstance(labels, MultiIndex):
+                    if len(key) == labels.nlevels:
+                        return {"key": key}
+                    raise
+            except InvalidIndexError:
+                # GH35015, using datetime as column indices raises exception
+                if not isinstance(labels, MultiIndex):
+                    raise
+            except ValueError:
+                if not is_integer(key):
+                    raise
+                return {"key": key}
+
+        if is_nested_tuple(key, labels):
+            if self.ndim == 1 and any(isinstance(k, tuple) for k in key):
+                # GH#35349 Raise if tuple in tuple for series
+                raise IndexingError("Too many indexers")
+            return labels.get_locs(key)
+
+        elif is_list_like_indexer(key):
+            if is_iterator(key):
+                key = list(key)
+
+            if com.is_bool_indexer(key):
+                key = check_bool_indexer(labels, key)
+                return key
+            else:
+                return self._get_listlike_indexer(key, axis)[1]
+        else:
+            try:
+                return labels.get_loc(key)
+            except LookupError:
+                # allow a not found key only if we are a setter
+                if not is_list_like_indexer(key):
+                    return {"key": key}
+                raise
+
+    def _get_listlike_indexer(self, key, axis: AxisInt):
+        """
+        Transform a list-like of keys into a new index and an indexer.
+
+        Parameters
+        ----------
+        key : list-like
+            Targeted labels.
+        axis:  int
+            Dimension on which the indexing is being made.
+
+        Raises
+        ------
+        KeyError
+            If at least one key was requested but none was found.
+
+        Returns
+        -------
+        keyarr: Index
+            New index (coinciding with 'key' if the axis is unique).
+        values : array-like
+            Indexer for the return object, -1 denotes keys not found.
+        """
+        ax = self.obj._get_axis(axis)
+        axis_name = self.obj._get_axis_name(axis)
+
+        keyarr, indexer = ax._get_indexer_strict(key, axis_name)
+
+        return keyarr, indexer
+
+
+@doc(IndexingMixin.iloc)
+class _iLocIndexer(_LocationIndexer):
+    _valid_types = (
+        "integer, integer slice (START point is INCLUDED, END "
+        "point is EXCLUDED), listlike of integers, boolean array"
+    )
+    _takeable = True
+
+    # -------------------------------------------------------------------
+    # Key Checks
+
+    def _validate_key(self, key, axis: AxisInt) -> None:
+        if com.is_bool_indexer(key):
+            if hasattr(key, "index") and isinstance(key.index, Index):
+                if key.index.inferred_type == "integer":
+                    return
+                raise ValueError(
+                    "iLocation based boolean indexing cannot use an indexable as a mask"
+                )
+            return
+
+        if isinstance(key, slice):
+            return
+        elif is_integer(key):
+            self._validate_integer(key, axis)
+        elif isinstance(key, tuple):
+            # a tuple should already have been caught by this point
+            # so don't treat a tuple as a valid indexer
+            raise IndexingError("Too many indexers")
+        elif is_list_like_indexer(key):
+            if isinstance(key, ABCSeries):
+                arr = key._values
+            elif is_array_like(key):
+                arr = key
+            else:
+                arr = np.array(key)
+            len_axis = len(self.obj._get_axis(axis))
+
+            # check that the key has a numeric dtype
+            if not is_numeric_dtype(arr.dtype):
+                raise IndexError(f".iloc requires numeric indexers, got {arr}")
+
+            if len(arr):
+                if isinstance(arr.dtype, ExtensionDtype):
+                    arr_max = arr._reduce("max")
+                    arr_min = arr._reduce("min")
+                else:
+                    arr_max = np.max(arr)
+                    arr_min = np.min(arr)
+
+                # check that the key does not exceed the maximum size
+                if arr_max >= len_axis or arr_min < -len_axis:
+                    raise IndexError("positional indexers are out-of-bounds")
+        else:
+            raise ValueError(f"Can only index by location with a [{self._valid_types}]")
+
+    def _has_valid_setitem_indexer(self, indexer) -> bool:
+        """
+        Validate that a positional indexer cannot enlarge its target
+        will raise if needed, does not modify the indexer externally.
+
+        Returns
+        -------
+        bool
+        """
+        if isinstance(indexer, dict):
+            raise IndexError("iloc cannot enlarge its target object")
+
+        if isinstance(indexer, ABCDataFrame):
+            raise TypeError(
+                "DataFrame indexer for .iloc is not supported. "
+                "Consider using .loc with a DataFrame indexer for automatic alignment.",
+            )
+
+        if not isinstance(indexer, tuple):
+            indexer = _tuplify(self.ndim, indexer)
+
+        for ax, i in zip(self.obj.axes, indexer, strict=False):
+            if isinstance(i, slice):
+                # should check the stop slice?
+                pass
+            elif is_list_like_indexer(i):
+                # should check the elements?
+                pass
+            elif is_integer(i):
+                if i >= len(ax):
+                    raise IndexError("iloc cannot enlarge its target object")
+            elif isinstance(i, dict):
+                raise IndexError("iloc cannot enlarge its target object")
+
+        return True
+
+    def _is_scalar_access(self, key: tuple) -> bool:
+        """
+        Returns
+        -------
+        bool
+        """
+        # this is a shortcut accessor to both .loc and .iloc
+        # that provide the equivalent access of .at and .iat
+        # a) avoid getting things via sections and (to minimize dtype changes)
+        # b) provide a performant path
+        if len(key) != self.ndim:
+            return False
+
+        return all(is_integer(k) for k in key)
+
+    def _validate_integer(self, key: int | np.integer, axis: AxisInt) -> None:
+        """
+        Check that 'key' is a valid position in the desired axis.
+
+        Parameters
+        ----------
+        key : int
+            Requested position.
+        axis : int
+            Desired axis.
+
+        Raises
+        ------
+        IndexError
+            If 'key' is not a valid position in axis 'axis'.
+        """
+        len_axis = len(self.obj._get_axis(axis))
+        if key >= len_axis or key < -len_axis:
+            raise IndexError("single positional indexer is out-of-bounds")
+
+    # -------------------------------------------------------------------
+
+    def _getitem_tuple(self, tup: tuple):
+        tup = self._validate_tuple_indexer(tup)
+        with suppress(IndexingError):
+            return self._getitem_lowerdim(tup)
+
+        return self._getitem_tuple_same_dim(tup)
+
+    def _get_list_axis(self, key, axis: AxisInt):
+        """
+        Return Series values by list or array of integers.
+
+        Parameters
+        ----------
+        key : list-like positional indexer
+        axis : int
+
+        Returns
+        -------
+        Series object
+
+        Notes
+        -----
+        `axis` can only be zero.
+        """
+        try:
+            return self.obj.take(key, axis=axis)
+        except IndexError as err:
+            # re-raise with different error message, e.g. test_getitem_ndarray_3d
+            raise IndexError("positional indexers are out-of-bounds") from err
+
+    def _getitem_axis(self, key, axis: AxisInt):
+        if key is Ellipsis:
+            key = slice(None)
+        elif isinstance(key, ABCDataFrame):
+            raise IndexError(
+                "DataFrame indexer is not allowed for .iloc\n"
+                "Consider using .loc for automatic alignment."
+            )
+
+        if isinstance(key, slice):
+            return self._get_slice_axis(key, axis=axis)
+
+        if is_iterator(key):
+            key = list(key)
+
+        if isinstance(key, list):
+            key = np.asarray(key)
+
+        if com.is_bool_indexer(key):
+            self._validate_key(key, axis)
+            return self._getbool_axis(key, axis=axis)
+
+        # a list of integers
+        elif is_list_like_indexer(key):
+            return self._get_list_axis(key, axis=axis)
+
+        # a single integer
+        else:
+            key = item_from_zerodim(key)
+            if not is_integer(key):
+                raise TypeError("Cannot index by location index with a non-integer key")
+
+            # validate the location
+            self._validate_integer(key, axis)
+
+            return self.obj._ixs(key, axis=axis)
+
+    def _get_slice_axis(self, slice_obj: slice, axis: AxisInt):
+        # caller is responsible for ensuring non-None axis
+        obj = self.obj
+
+        if not need_slice(slice_obj):
+            return obj.copy(deep=False)
+
+        labels = obj._get_axis(axis)
+        labels._validate_positional_slice(slice_obj)
+        return self.obj._slice(slice_obj, axis=axis)
+
+    def _convert_to_indexer(self, key: T, axis: AxisInt) -> T:
+        """
+        Much simpler as we only have to deal with our valid types.
+        """
+        return key
+
+    def _get_setitem_indexer(self, key):
+        # GH#32257 Fall through to let numpy do validation
+        if is_iterator(key):
+            key = list(key)
+
+        if self.axis is not None:
+            key = _tupleize_axis_indexer(self.ndim, self.axis, key)
+
+        return key
+
+    # -------------------------------------------------------------------
+
+    def _decide_split_path(self, indexer, value) -> bool:
+        """
+        Decide whether we will take a block-by-block path.
+        """
+        take_split_path = not self.obj._mgr.is_single_block
+
+        if not take_split_path and isinstance(value, ABCDataFrame):
+            # Avoid cast of values
+            take_split_path = not value._mgr.is_single_block
+
+        # if there is only one block/type, still have to take split path
+        # unless the block is one-dimensional or it can hold the value
+        if not take_split_path and len(self.obj._mgr.blocks) and self.ndim > 1:
+            # in case of dict, keys are indices
+            val = list(value.values()) if isinstance(value, dict) else value
+            arr = self.obj._mgr.blocks[0].values
+            take_split_path = not can_hold_element(
+                arr, extract_array(val, extract_numpy=True)
+            )
+
+        # if we have any multi-indexes that have non-trivial slices
+        # (not null slices) then we must take the split path, xref
+        # GH 10360, GH 27841
+        if isinstance(indexer, tuple) and len(indexer) == len(self.obj.axes):
+            for i, ax in zip(indexer, self.obj.axes, strict=True):
+                if isinstance(ax, MultiIndex) and not (
+                    is_integer(i) or com.is_null_slice(i)
+                ):
+                    take_split_path = True
+                    break
+
+        return take_split_path
+
+    def _setitem_new_column(self, indexer, key, value, name: str) -> None:
+        """
+        _setitem_with_indexer cases that can go through DataFrame.__setitem__.
+        """
+        # add the new item, and set the value
+        # must have all defined axes if we have a scalar
+        # or a list-like on the non-info axes if we have a
+        # list-like
+        if not len(self.obj):
+            if not is_list_like_indexer(value):
+                raise ValueError(
+                    "cannot set a frame with no defined index and a scalar"
+                )
+            self.obj[key] = value
+            return
+
+        # add a new item with the dtype setup
+        if com.is_null_slice(indexer[0]):
+            # We are setting an entire column
+            self.obj[key] = value
+            return
+        elif is_array_like(value):
+            # GH#42099
+            arr = extract_array(value, extract_numpy=True)
+            taker = -1 * np.ones(len(self.obj), dtype=np.intp)
+            empty_value = algos.take_nd(arr, taker)
+            if not isinstance(value, ABCSeries):
+                # if not Series (in which case we need to align),
+                #  we can short-circuit
+                if isinstance(arr, np.ndarray) and arr.ndim == 1 and len(arr) == 1:
+                    # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
+                    arr = arr[0, ...]
+                empty_value[indexer[0]] = arr
+                self.obj[key] = empty_value
+                return
+
+            self.obj[key] = empty_value
+        elif not is_list_like(value):
+            self.obj[key] = construct_1d_array_from_inferred_fill_value(
+                value, len(self.obj)
+            )
+        else:
+            # FIXME: GH#42099#issuecomment-864326014
+            self.obj[key] = infer_fill_value(value)
+
+        new_indexer = convert_from_missing_indexer_tuple(indexer, self.obj.axes)
+        self._setitem_with_indexer(new_indexer, value, name)
+
+        return
+
+    def _setitem_with_indexer(self, indexer, value, name: str = "iloc") -> None:
+        """
+        _setitem_with_indexer is for setting values on a Series/DataFrame
+        using positional indexers.
+
+        If the relevant keys are not present, the Series/DataFrame may be
+        expanded.
+        """
+        info_axis = self.obj._info_axis_number
+        take_split_path = self._decide_split_path(indexer, value)
+
+        if isinstance(indexer, tuple):
+            nindexer = []
+            for i, idx in enumerate(indexer):
+                idx, missing = convert_missing_indexer(idx)
+                if missing:
+                    # reindex the axis to the new value
+                    # and set inplace
+                    key = idx
+
+                    # if this is the items axes, then take the main missing
+                    # path first
+                    # this correctly sets the dtype
+                    # essentially this separates out the block that is needed
+                    # to possibly be modified
+                    if self.ndim > 1 and i == info_axis:
+                        self._setitem_new_column(indexer, key, value, name=name)
+                        return
+
+                    # reindex the axis
+                    index = self.obj._get_axis(i)
+                    labels = index.insert(len(index), key)
+
+                    # We are expanding the Series/DataFrame values to match
+                    #  the length of the new index `labels`.  GH#40096 ensure
+                    #  this is valid even if the index has duplicates.
+                    taker = np.arange(len(index) + 1, dtype=np.intp)
+                    taker[-1] = -1
+                    reindexers = {i: (labels, taker)}
+                    new_obj = self.obj._reindex_with_indexers(
+                        reindexers, allow_dups=True
+                    )
+                    self.obj._mgr = new_obj._mgr
+
+                    nindexer.append(labels.get_loc(key))
+
+                else:
+                    nindexer.append(idx)
+
+            indexer = tuple(nindexer)
+        else:
+            indexer, missing = convert_missing_indexer(indexer)
+
+            if missing:
+                self._setitem_with_indexer_missing(indexer, value)
+                return
+
+        if name == "loc":
+            # must come after setting of missing
+            indexer, value = self._maybe_mask_setitem_value(indexer, value)
+
+        # align and set the values
+        if take_split_path:
+            # We have to operate column-wise
+            self._setitem_with_indexer_split_path(indexer, value, name)
+        else:
+            self._setitem_single_block(indexer, value, name)
+
+    def _setitem_with_indexer_split_path(self, indexer, value, name: str):
+        """
+        Setitem column-wise.
+        """
+        # Above we only set take_split_path to True for 2D cases
+        assert self.ndim == 2
+
+        if not isinstance(indexer, tuple):
+            indexer = _tuplify(self.ndim, indexer)
+        if len(indexer) > self.ndim:
+            raise IndexError("too many indices for array")
+        if isinstance(indexer[0], np.ndarray) and indexer[0].ndim > 2:
+            raise ValueError(r"Cannot set values with ndim > 2")
+
+        if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict):
+            from pandas import Series
+
+            value = self._align_series(indexer, Series(value))
+
+        # Ensure we have something we can iterate over
+        info_axis = indexer[1]
+        ilocs = self._ensure_iterable_column_indexer(info_axis)
+
+        pi = indexer[0]
+        lplane_indexer = length_of_indexer(pi, self.obj.index)
+        # lplane_indexer gives the expected length of obj[indexer[0]]
+
+        # we need an iterable, with an ndim of at least 1
+        # eg. don't pass through np.array(0)
+        if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0:
+            if isinstance(value, ABCDataFrame):
+                self._setitem_with_indexer_frame_value(indexer, value, name)
+
+            elif np.ndim(value) == 2:
+                # TODO: avoid np.ndim call in case it isn't an ndarray, since
+                #  that will construct an ndarray, which will be wasteful
+                self._setitem_with_indexer_2d_value(indexer, value)
+
+            elif len(ilocs) == 1 and lplane_indexer == len(value) and not is_scalar(pi):
+                # We are setting multiple rows in a single column.
+                self._setitem_single_column(ilocs[0], value, pi)
+
+            elif len(ilocs) == 1 and 0 != lplane_indexer != len(value):
+                # We are trying to set N values into M entries of a single
+                #  column, which is invalid for N != M
+                # Exclude zero-len for e.g. boolean masking that is all-false
+
+                if len(value) == 1 and not is_integer(info_axis):
+                    # This is a case like df.iloc[:3, [1]] = [0]
+                    #  where we treat as df.iloc[:3, 1] = 0
+                    return self._setitem_with_indexer((pi, info_axis[0]), value[0])
+
+                raise ValueError(
+                    "Must have equal len keys and value when setting with an iterable"
+                )
+
+            elif lplane_indexer == 0 and len(value) == len(self.obj.index):
+                # We get here in one case via .loc with an all-False mask
+                pass
+
+            elif self._is_scalar_access(indexer) and is_object_dtype(
+                self.obj.dtypes._values[ilocs[0]]
+            ):
+                # We are setting nested data, only possible for object dtype data
+                self._setitem_single_column(indexer[1], value, pi)
+
+            elif len(ilocs) == len(value):
+                # We are setting multiple columns in a single row.
+                for loc, v in zip(ilocs, value, strict=True):
+                    self._setitem_single_column(loc, v, pi)
+
+            elif len(ilocs) == 1 and com.is_null_slice(pi) and len(self.obj) == 0:
+                # This is a setitem-with-expansion, see
+                #  test_loc_setitem_empty_append_expands_rows_mixed_dtype
+                # e.g. df = DataFrame(columns=["x", "y"])
+                #  df["x"] = df["x"].astype(np.int64)
+                #  df.loc[:, "x"] = [1, 2, 3]
+                self._setitem_single_column(ilocs[0], value, pi)
+
+            else:
+                raise ValueError(
+                    "Must have equal len keys and value when setting with an iterable"
+                )
+
+        else:
+            # scalar value
+            for loc in ilocs:
+                self._setitem_single_column(loc, value, pi)
+
+    def _setitem_with_indexer_2d_value(self, indexer, value) -> None:
+        # We get here with np.ndim(value) == 2, excluding DataFrame,
+        #  which goes through _setitem_with_indexer_frame_value
+        pi = indexer[0]
+
+        ilocs = self._ensure_iterable_column_indexer(indexer[1])
+
+        if not is_array_like(value):
+            # cast lists to array
+            value = np.array(value, dtype=object)
+        if len(ilocs) != value.shape[1]:
+            raise ValueError(
+                "Must have equal len keys and value when setting with an ndarray"
+            )
+
+        for i, loc in enumerate(ilocs):
+            value_col = value[:, i]
+            if is_object_dtype(value_col.dtype):
+                # casting to list so that we do type inference in setitem_single_column
+                value_col = value_col.tolist()
+            self._setitem_single_column(loc, value_col, pi)
+
+    def _setitem_with_indexer_frame_value(
+        self, indexer, value: DataFrame, name: str
+    ) -> None:
+        ilocs = self._ensure_iterable_column_indexer(indexer[1])
+
+        sub_indexer = list(indexer)
+        pi = indexer[0]
+
+        multiindex_indexer = isinstance(self.obj.columns, MultiIndex)
+
+        unique_cols = value.columns.is_unique
+
+        # We do not want to align the value in case of iloc GH#37728
+        if name == "iloc":
+            for i, loc in enumerate(ilocs):
+                val = value.iloc[:, i]
+                self._setitem_single_column(loc, val, pi)
+
+        elif not unique_cols and value.columns.equals(self.obj.columns):
+            # We assume we are already aligned, see
+            # test_iloc_setitem_frame_duplicate_columns_multiple_blocks
+            for loc in ilocs:
+                item = self.obj.columns[loc]
+                if item in value:
+                    sub_indexer[1] = item
+                    val = self._align_series(
+                        tuple(sub_indexer),
+                        value.iloc[:, loc],
+                        multiindex_indexer,
+                    )
+                else:
+                    val = np.nan
+
+                self._setitem_single_column(loc, val, pi)
+
+        elif not unique_cols:
+            raise ValueError("Setting with non-unique columns is not allowed.")
+
+        else:
+            for loc in ilocs:
+                item = self.obj.columns[loc]
+                if item in value:
+                    sub_indexer[1] = item
+                    val = self._align_series(
+                        tuple(sub_indexer),
+                        value[item],
+                        multiindex_indexer,
+                        using_cow=True,
+                    )
+                else:
+                    val = np.nan
+
+                self._setitem_single_column(loc, val, pi)
+
+    def _setitem_single_column(self, loc: int, value, plane_indexer) -> None:
+        """
+
+        Parameters
+        ----------
+        loc : int
+            Indexer for column position
+        plane_indexer : int, slice, listlike[int]
+            The indexer we use for setitem along axis=0.
+        """
+        pi = plane_indexer
+
+        is_full_setter = com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj))
+
+        is_null_setter = com.is_empty_slice(pi) or (is_array_like(pi) and len(pi) == 0)
+
+        if is_null_setter:
+            # no-op, don't cast dtype later
+            return
+
+        elif is_full_setter:
+            try:
+                self.obj._mgr.column_setitem(
+                    loc, plane_indexer, value, inplace_only=True
+                )
+            except (ValueError, TypeError, LossySetitemError) as exc:
+                # If we're setting an entire column and we can't do it inplace,
+                #  then we can use value's dtype (or inferred dtype)
+                #  instead of object
+                dtype = self.obj.dtypes.iloc[loc]
+                if dtype not in (np.void, object) and not self.obj.empty:
+                    # - Exclude np.void, as that is a special case for expansion.
+                    #   We want to raise for
+                    #       df = pd.DataFrame({'a': [1, 2]})
+                    #       df.loc[:, 'a'] = .3
+                    #   but not for
+                    #       df = pd.DataFrame({'a': [1, 2]})
+                    #       df.loc[:, 'b'] = .3
+                    # - Exclude `object`, as then no upcasting happens.
+                    # - Exclude empty initial object with enlargement,
+                    #   as then there's nothing to be inconsistent with.
+                    raise TypeError(
+                        f"Invalid value '{value}' for dtype '{dtype}'"
+                    ) from exc
+                self.obj.isetitem(loc, value)
+        else:
+            # set value into the column (first attempting to operate inplace, then
+            #  falling back to casting if necessary)
+            dtype = self.obj.dtypes.iloc[loc]
+            if dtype == np.void:
+                # This means we're expanding, with multiple columns, e.g.
+                #     df = pd.DataFrame({'A': [1,2,3], 'B': [4,5,6]})
+                #     df.loc[df.index <= 2, ['F', 'G']] = (1, 'abc')
+                # Columns F and G will initially be set to np.void.
+                # Here, we replace those temporary `np.void` columns with
+                # columns of the appropriate dtype, based on `value`.
+                self.obj.iloc[:, loc] = construct_1d_array_from_inferred_fill_value(
+                    value, len(self.obj)
+                )
+            self.obj._mgr.column_setitem(loc, plane_indexer, value)
+
+    def _setitem_single_block(self, indexer, value, name: str) -> None:
+        """
+        _setitem_with_indexer for the case when we have a single Block.
+        """
+        from pandas import Series
+
+        if (isinstance(value, ABCSeries) and name != "iloc") or isinstance(value, dict):
+            # TODO(EA): ExtensionBlock.setitem this causes issues with
+            # setting for extensionarrays that store dicts. Need to decide
+            # if it's worth supporting that.
+            value = self._align_series(indexer, Series(value))
+
+        info_axis = self.obj._info_axis_number
+        item_labels = self.obj._get_axis(info_axis)
+        if isinstance(indexer, tuple):
+            # if we are setting on the info axis ONLY
+            # set using those methods to avoid block-splitting
+            # logic here
+            if (
+                self.ndim == len(indexer) == 2
+                and is_integer(indexer[1])
+                and com.is_null_slice(indexer[0])
+            ):
+                col = item_labels[indexer[info_axis]]
+                if len(item_labels.get_indexer_for([col])) == 1:
+                    # e.g. test_loc_setitem_empty_append_expands_rows
+                    loc = item_labels.get_loc(col)
+                    self._setitem_single_column(loc, value, indexer[0])
+                    return
+
+            indexer = maybe_convert_ix(*indexer)  # e.g. test_setitem_frame_align
+
+        if isinstance(value, ABCDataFrame) and name != "iloc":
+            value = self._align_frame(indexer, value)._values
+
+        # actually do the set
+        self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
+
+    def _setitem_with_indexer_missing(self, indexer, value):
+        """
+        Insert new row(s) or column(s) into the Series or DataFrame.
+        """
+        from pandas import Series
+
+        # reindex the axis to the new value
+        # and set inplace
+        if self.ndim == 1:
+            index = self.obj.index
+            new_index = index.insert(len(index), indexer)
+
+            # we have a coerced indexer, e.g. a float
+            # that matches in an int64 Index, so
+            # we will not create a duplicate index, rather
+            # index to that element
+            # e.g. 0.0 -> 0
+            # GH#12246
+            if index.is_unique:
+                # pass new_index[-1:] instead if [new_index[-1]]
+                #  so that we retain dtype
+                new_indexer = index.get_indexer(new_index[-1:])
+                if (new_indexer != -1).any():
+                    # We get only here with loc, so can hard code
+                    return self._setitem_with_indexer(new_indexer, value, "loc")
+
+            # this preserves dtype of the value and of the object
+            if not is_scalar(value):
+                new_dtype = None
+
+            elif is_valid_na_for_dtype(value, self.obj.dtype):
+                if not is_object_dtype(self.obj.dtype):
+                    # Every NA value is suitable for object, no conversion needed
+                    value = na_value_for_dtype(self.obj.dtype, compat=False)
+
+                new_dtype = maybe_promote(self.obj.dtype, value)[0]
+
+            elif isna(value):
+                new_dtype = None
+            elif not self.obj.empty and not is_object_dtype(self.obj.dtype):
+                # We should not cast, if we have object dtype because we can
+                # set timedeltas into object series
+                curr_dtype = self.obj.dtype
+                curr_dtype = getattr(curr_dtype, "numpy_dtype", curr_dtype)
+                new_dtype = maybe_promote(curr_dtype, value)[0]
+            else:
+                new_dtype = None
+
+            new_values = Series([value], dtype=new_dtype)._values
+
+            if len(self.obj._values):
+                # GH#22717 handle casting compatibility that np.concatenate
+                #  does incorrectly
+                new_values = concat_compat([self.obj._values, new_values])
+            self.obj._mgr = self.obj._constructor(
+                new_values, index=new_index, name=self.obj.name
+            )._mgr
+
+        elif self.ndim == 2:
+            if not len(self.obj.columns):
+                # no columns and scalar
+                raise ValueError("cannot set a frame with no defined columns")
+
+            has_dtype = hasattr(value, "dtype")
+            if isinstance(value, ABCSeries):
+                # append a Series
+                value = value.reindex(index=self.obj.columns)
+                value.name = indexer
+            elif isinstance(value, dict):
+                value = Series(
+                    value, index=self.obj.columns, name=indexer, dtype=object
+                )
+            else:
+                # a list-list
+                if is_list_like_indexer(value):
+                    # must have conforming columns
+                    if len(value) != len(self.obj.columns):
+                        raise ValueError("cannot set a row with mismatched columns")
+
+                value = Series(value, index=self.obj.columns, name=indexer)
+
+            if not len(self.obj):
+                # We will ignore the existing dtypes instead of using
+                #  internals.concat logic
+                df = value.to_frame().T
+
+                idx = self.obj.index
+                if isinstance(idx, MultiIndex):
+                    name = idx.names
+                else:
+                    name = idx.name
+
+                df.index = Index([indexer], name=name)
+                if not has_dtype:
+                    # i.e. if we already had a Series or ndarray, keep that
+                    #  dtype.  But if we had a list or dict, then do inference
+                    df = df.infer_objects()
+                self.obj._mgr = df._mgr
+            else:
+                self.obj._mgr = self.obj._append_internal(value)._mgr
+
+    def _ensure_iterable_column_indexer(self, column_indexer):
+        """
+        Ensure that our column indexer is something that can be iterated over.
+        """
+        ilocs: Sequence[int | np.integer] | np.ndarray | range
+        if is_integer(column_indexer):
+            ilocs = [column_indexer]
+        elif isinstance(column_indexer, slice):
+            ilocs = range(len(self.obj.columns))[column_indexer]
+        elif (
+            isinstance(column_indexer, np.ndarray) and column_indexer.dtype.kind == "b"
+        ):
+            ilocs = np.arange(len(column_indexer))[column_indexer]
+        else:
+            ilocs = column_indexer
+        return ilocs
+
+    def _align_series(
+        self,
+        indexer,
+        ser: Series,
+        multiindex_indexer: bool = False,
+        using_cow: bool = False,
+    ):
+        """
+        Parameters
+        ----------
+        indexer : tuple, slice, scalar
+            Indexer used to get the locations that will be set to `ser`.
+        ser : pd.Series
+            Values to assign to the locations specified by `indexer`.
+        multiindex_indexer : bool, optional
+            Defaults to False. Should be set to True if `indexer` was from
+            a `pd.MultiIndex`, to avoid unnecessary broadcasting.
+
+        Returns
+        -------
+        `np.array` of `ser` broadcast to the appropriate shape for assignment
+        to the locations selected by `indexer`
+        """
+        if isinstance(indexer, (slice, np.ndarray, list, Index)):
+            indexer = (indexer,)
+
+        if isinstance(indexer, tuple):
+            # flatten np.ndarray indexers
+            if (
+                len(indexer) == 2
+                and isinstance(indexer[1], np.ndarray)
+                and indexer[1].dtype == np.bool_
+            ):
+                indexer = (indexer[0], np.where(indexer[1])[0])
+
+            def ravel(i):
+                return i.ravel() if isinstance(i, np.ndarray) else i
+
+            indexer = tuple(map(ravel, indexer))
+            aligners = [not com.is_null_slice(idx) for idx in indexer]
+            sum_aligners = sum(aligners)
+            single_aligner = sum_aligners == 1
+            is_frame = self.ndim == 2
+            obj = self.obj
+
+            # are we a single alignable value on a non-primary
+            # dim (e.g. panel: 1,2, or frame: 0) ?
+            # hence need to align to a single axis dimension
+            # rather that find all valid dims
+
+            # frame
+            if is_frame:
+                single_aligner = single_aligner and aligners[0]
+
+            # we have a frame, with multiple indexers on both axes; and a
+            # series, so need to broadcast (see GH5206)
+            if all(is_sequence(_) or isinstance(_, slice) for _ in indexer):
+                ser_values = ser.reindex(obj.axes[0][indexer[0]])._values
+
+                # single indexer
+                if len(indexer) > 1 and not multiindex_indexer:
+                    if isinstance(indexer[1], slice):
+                        len_indexer = len(obj.axes[1][indexer[1]])
+                    else:
+                        len_indexer = len(indexer[1])
+                    ser_values = (
+                        np.tile(ser_values, len_indexer).reshape(len_indexer, -1).T
+                    )
+
+                return ser_values
+
+            for i, idx in enumerate(indexer):
+                ax = obj.axes[i]
+
+                # multiple aligners (or null slices)
+                if is_sequence(idx) or isinstance(idx, slice):
+                    if single_aligner and com.is_null_slice(idx):
+                        continue
+                    new_ix = ax[idx]
+                    if not is_list_like_indexer(new_ix):
+                        new_ix = Index([new_ix])
+                    else:
+                        new_ix = Index(new_ix)
+                    if not len(new_ix) or ser.index.equals(new_ix):
+                        if using_cow:
+                            return ser
+                        return ser._values.copy()
+
+                    return ser.reindex(new_ix)._values
+
+                # 2 dims
+                elif single_aligner:
+                    # reindex along index
+                    ax = self.obj.axes[1]
+                    if ser.index.equals(ax) or not len(ax):
+                        return ser._values.copy()
+                    return ser.reindex(ax)._values
+
+        elif is_integer(indexer) and self.ndim == 1:
+            if is_object_dtype(self.obj.dtype):
+                return ser
+            ax = self.obj._get_axis(0)
+
+            if ser.index.equals(ax):
+                return ser._values.copy()
+
+            return ser.reindex(ax)._values[indexer]
+
+        elif is_integer(indexer):
+            ax = self.obj._get_axis(1)
+
+            if ser.index.equals(ax):
+                return ser._values.copy()
+
+            return ser.reindex(ax)._values
+
+        raise ValueError("Incompatible indexer with Series")
+
+    def _align_frame(self, indexer, df: DataFrame) -> DataFrame:
+        is_frame = self.ndim == 2
+
+        if isinstance(indexer, tuple):
+            idx, cols = None, None
+            sindexers = []
+            for i, ix in enumerate(indexer):
+                ax = self.obj.axes[i]
+                if is_sequence(ix) or isinstance(ix, slice):
+                    if isinstance(ix, np.ndarray):
+                        ix = ix.reshape(-1)
+                    if idx is None:
+                        idx = ax[ix]
+                    elif cols is None:
+                        cols = ax[ix]
+                    else:
+                        break
+                else:
+                    sindexers.append(i)
+
+            if idx is not None and cols is not None:
+                if df.index.equals(idx) and df.columns.equals(cols):
+                    val = df.copy()
+                else:
+                    val = df.reindex(idx, columns=cols)
+                return val
+
+        elif (isinstance(indexer, slice) or is_list_like_indexer(indexer)) and is_frame:
+            ax = self.obj.index[indexer]
+            if df.index.equals(ax):
+                val = df.copy()
+            else:
+                # we have a multi-index and are trying to align
+                # with a particular, level GH3738
+                if (
+                    isinstance(ax, MultiIndex)
+                    and isinstance(df.index, MultiIndex)
+                    and ax.nlevels != df.index.nlevels
+                ):
+                    raise TypeError(
+                        "cannot align on a multi-index with out "
+                        "specifying the join levels"
+                    )
+
+                val = df.reindex(index=ax)
+            return val
+
+        raise ValueError("Incompatible indexer with DataFrame")
+
+
+class _ScalarAccessIndexer(NDFrameIndexerBase):
+    """
+    Access scalars quickly.
+    """
+
+    # sub-classes need to set _takeable
+    _takeable: bool
+
+    def _convert_key(self, key):
+        raise AbstractMethodError(self)
+
+    def __getitem__(self, key):
+        if not isinstance(key, tuple):
+            # we could have a convertible item here (e.g. Timestamp)
+            if not is_list_like_indexer(key):
+                key = (key,)
+            else:
+                raise ValueError("Invalid call for scalar access (getting)!")
+
+        key = self._convert_key(key)
+        return self.obj._get_value(*key, takeable=self._takeable)
+
+    def __setitem__(self, key, value) -> None:
+        if isinstance(key, tuple):
+            key = tuple(com.apply_if_callable(x, self.obj) for x in key)
+        else:
+            # scalar callable may return tuple
+            key = com.apply_if_callable(key, self.obj)
+
+        if not isinstance(key, tuple):
+            key = _tuplify(self.ndim, key)
+        key = list(self._convert_key(key))
+        if len(key) != self.ndim:
+            raise ValueError("Not enough indexers for scalar access (setting)!")
+
+        self.obj._set_value(*key, value=value, takeable=self._takeable)
+
+
+@doc(IndexingMixin.at)
+class _AtIndexer(_ScalarAccessIndexer):
+    _takeable = False
+
+    def _convert_key(self, key):
+        """
+        Require they keys to be the same type as the index. (so we don't
+        fallback)
+        """
+        # GH 26989
+        # For series, unpacking key needs to result in the label.
+        # This is already the case for len(key) == 1; e.g. (1,)
+        if self.ndim == 1 and len(key) > 1:
+            key = (key,)
+
+        return key
+
+    @property
+    def _axes_are_unique(self) -> bool:
+        # Only relevant for self.ndim == 2
+        assert self.ndim == 2
+        return self.obj.index.is_unique and self.obj.columns.is_unique
+
+    def __getitem__(self, key):
+        if self.ndim == 2 and not self._axes_are_unique:
+            # GH#33041 fall back to .loc
+            if not isinstance(key, tuple) or not all(is_scalar(x) for x in key):
+                raise ValueError("Invalid call for scalar access (getting)!")
+            return self.obj.loc[key]
+
+        return super().__getitem__(key)
+
+    def __setitem__(self, key, value) -> None:
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(self.obj) <= REF_COUNT_IDX:
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+
+        if self.ndim == 2 and not self._axes_are_unique:
+            # GH#33041 fall back to .loc
+            if not isinstance(key, tuple) or not all(is_scalar(x) for x in key):
+                raise ValueError("Invalid call for scalar access (setting)!")
+
+            self.obj.loc[key] = value
+            return
+
+        return super().__setitem__(key, value)
+
+
+@doc(IndexingMixin.iat)
+class _iAtIndexer(_ScalarAccessIndexer):
+    _takeable = True
+
+    def _convert_key(self, key):
+        """
+        Require integer args. (and convert to label arguments)
+        """
+        for i in key:
+            if not is_integer(i):
+                raise ValueError("iAt based indexing can only have integer indexers")
+        return key
+
+    def __setitem__(self, key, value) -> None:
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(self.obj) <= REF_COUNT_IDX:
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+
+        return super().__setitem__(key, value)
+
+
+def _tuplify(ndim: int, loc: Hashable) -> tuple[Hashable | slice, ...]:
+    """
+    Given an indexer for the first dimension, create an equivalent tuple
+    for indexing over all dimensions.
+
+    Parameters
+    ----------
+    ndim : int
+    loc : object
+
+    Returns
+    -------
+    tuple
+    """
+    _tup: list[Hashable | slice]
+    _tup = [slice(None, None) for _ in range(ndim)]
+    _tup[0] = loc
+    return tuple(_tup)
+
+
+def _tupleize_axis_indexer(ndim: int, axis: AxisInt, key) -> tuple:
+    """
+    If we have an axis, adapt the given key to be axis-independent.
+    """
+    new_key = [slice(None)] * ndim
+    new_key[axis] = key
+    return tuple(new_key)
+
+
+def check_bool_indexer(index: Index, key) -> np.ndarray:
+    """
+    Check if key is a valid boolean indexer for an object with such index and
+    perform reindexing or conversion if needed.
+
+    This function assumes that is_bool_indexer(key) == True.
+
+    Parameters
+    ----------
+    index : Index
+        Index of the object on which the indexing is done.
+    key : list-like
+        Boolean indexer to check.
+
+    Returns
+    -------
+    np.array
+        Resulting key.
+
+    Raises
+    ------
+    IndexError
+        If the key does not have the same length as index.
+    IndexingError
+        If the index of the key is unalignable to index.
+    """
+    result = key
+    if isinstance(key, ABCSeries) and not key.index.equals(index):
+        indexer = result.index.get_indexer_for(index)
+        if -1 in indexer:
+            raise IndexingError(
+                "Unalignable boolean Series provided as "
+                "indexer (index of the boolean Series and of "
+                "the indexed object do not match)."
+            )
+
+        result = result.take(indexer)
+
+        # fall through for boolean
+        if not isinstance(result.dtype, ExtensionDtype):
+            return result.astype(bool)._values
+
+    if is_object_dtype(key):
+        # key might be object-dtype bool, check_array_indexer needs bool array
+        result = np.asarray(result, dtype=bool)
+    elif not is_array_like(result):
+        # GH 33924
+        # key may contain nan elements, check_array_indexer needs bool array
+        result = pd_array(result, dtype=bool)
+    return check_array_indexer(index, result)
+
+
+def convert_missing_indexer(indexer):
+    """
+    Reverse convert a missing indexer, which is a dict
+    return the scalar indexer and a boolean indicating if we converted
+    """
+    if isinstance(indexer, dict):
+        # a missing key (but not a tuple indexer)
+        indexer = indexer["key"]
+
+        if isinstance(indexer, bool):
+            raise KeyError("cannot use a single bool to index into setitem")
+        return indexer, True
+
+    return indexer, False
+
+
+def convert_from_missing_indexer_tuple(indexer: tuple, axes: list[Index]) -> tuple:
+    """
+    Create a filtered indexer that doesn't have any missing indexers.
+    """
+
+    def get_indexer(_i, _idx):
+        return axes[_i].get_loc(_idx["key"]) if isinstance(_idx, dict) else _idx
+
+    return tuple(get_indexer(_i, _idx) for _i, _idx in enumerate(indexer))
+
+
+def maybe_convert_ix(*args):
+    """
+    We likely want to take the cross-product.
+    """
+    for arg in args:
+        if not isinstance(arg, (np.ndarray, list, ABCSeries, Index)):
+            return args
+    return np.ix_(*args)
+
+
+def is_nested_tuple(tup, labels) -> bool:
+    """
+    Returns
+    -------
+    bool
+    """
+    # check for a compatible nested tuple and multiindexes among the axes
+    if not isinstance(tup, tuple):
+        return False
+
+    for k in tup:
+        if is_list_like(k) or isinstance(k, slice):
+            return isinstance(labels, MultiIndex)
+
+    return False
+
+
+def is_label_like(key) -> bool:
+    """
+    Returns
+    -------
+    bool
+    """
+    # select a label or row
+    return (
+        not isinstance(key, slice)
+        and not is_list_like_indexer(key)
+        and key is not Ellipsis
+    )
+
+
+def need_slice(obj: slice) -> bool:
+    """
+    Returns
+    -------
+    bool
+    """
+    return (
+        obj.start is not None
+        or obj.stop is not None
+        or (obj.step is not None and obj.step != 1)
+    )
+
+
+def check_dict_or_set_indexers(key) -> None:
+    """
+    Check if the indexer is or contains a dict or set, which is no longer allowed.
+    """
+    if isinstance(key, set) or (
+        isinstance(key, tuple) and any(isinstance(x, set) for x in key)
+    ):
+        raise TypeError(
+            "Passing a set as an indexer is not supported. Use a list instead."
+        )
+
+    if isinstance(key, dict) or (
+        isinstance(key, tuple) and any(isinstance(x, dict) for x in key)
+    ):
+        raise TypeError(
+            "Passing a dict as an indexer is not supported. Use a list instead."
+        )
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0d789e9cb263205e639fcf7713e641c4d36dad5
--- /dev/null
+++ b/pandas/core/missing.py
@@ -0,0 +1,1103 @@
+"""
+Routines for filling missing data.
+"""
+
+from __future__ import annotations
+
+from functools import wraps
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    cast,
+    overload,
+)
+
+import numpy as np
+
+from pandas._config import is_nan_na
+
+from pandas._libs import (
+    NaT,
+    algos,
+    lib,
+)
+from pandas._typing import (
+    ArrayLike,
+    AxisInt,
+    F,
+    ReindexMethod,
+    npt,
+)
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.dtypes.cast import infer_dtype_from
+from pandas.core.dtypes.common import (
+    is_array_like,
+    is_bool_dtype,
+    is_numeric_dtype,
+    is_object_dtype,
+    needs_i8_conversion,
+)
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    BaseMaskedDtype,
+    DatetimeTZDtype,
+)
+from pandas.core.dtypes.missing import (
+    is_valid_na_for_dtype,
+    isna,
+    na_value_for_dtype,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from typing import TypeAlias
+
+    from pandas import Index
+
+    _CubicBC: TypeAlias = Literal["not-a-knot", "clamped", "natural", "periodic"]
+
+
+def check_value_size(value, mask: npt.NDArray[np.bool_], length: int):
+    """
+    Validate the size of the values passed to ExtensionArray.fillna.
+    """
+    if is_array_like(value):
+        if len(value) != length:
+            raise ValueError(
+                f"Length of 'value' does not match. Got ({len(value)}) "
+                f" expected {length}"
+            )
+        value = value[mask]
+
+    return value
+
+
+def mask_missing(arr: ArrayLike, value) -> npt.NDArray[np.bool_]:
+    """
+    Return a masking array of same size/shape as arr
+    with entries equaling value set to True.
+
+    Parameters
+    ----------
+    arr : ArrayLike
+    value : scalar-like
+        Caller has ensured `not is_list_like(value)` and that it can be held
+        by `arr`.
+
+    Returns
+    -------
+    np.ndarray[bool]
+    """
+    dtype, value = infer_dtype_from(value)
+
+    if (
+        isinstance(arr.dtype, (BaseMaskedDtype, ArrowDtype))
+        and lib.is_float(value)
+        and np.isnan(value)
+        and not is_nan_na()
+    ):
+        # TODO: this should be done in an EA method?
+        if arr.dtype.kind == "f":
+            # GH#55127
+            if isinstance(arr.dtype, BaseMaskedDtype):
+                # error: "ExtensionArray" has no attribute "_data"  [attr-defined]
+                mask = np.isnan(arr._data) & ~arr.isna()  # type: ignore[attr-defined,operator]
+                return mask
+            else:
+                # error: "ExtensionArray" has no attribute "_pa_array"  [attr-defined]
+                import pyarrow.compute as pc
+
+                mask = pc.is_nan(arr._pa_array).fill_null(False).to_numpy()  # type: ignore[attr-defined]
+                return mask
+
+        elif arr.dtype.kind in "iu":
+            # GH#51237
+            mask = np.zeros(arr.shape, dtype=bool)
+            return mask
+
+    if isna(value):
+        return isna(arr)
+
+    # GH 21977
+    mask = np.zeros(arr.shape, dtype=bool)
+    if (
+        is_numeric_dtype(arr.dtype)
+        and not is_bool_dtype(arr.dtype)
+        and lib.is_bool(value)
+    ):
+        # e.g. test_replace_ea_float_with_bool, see GH#62048
+        pass
+    elif (
+        is_bool_dtype(arr.dtype) and is_numeric_dtype(dtype) and not lib.is_bool(value)
+    ):
+        # e.g. test_replace_ea_float_with_bool, see GH#62048
+        pass
+    elif is_numeric_dtype(arr.dtype) and isinstance(value, str):
+        # GH#29553 prevent numpy deprecation warnings
+        pass
+    elif is_object_dtype(arr.dtype):
+        # pre-compute mask to avoid comparison to NA
+        # e.g. test_replace_na_in_obj_column
+        arr_mask = ~isna(arr)
+        mask[arr_mask] = arr[arr_mask] == value
+    else:
+        new_mask = arr == value
+
+        if not isinstance(new_mask, np.ndarray):
+            # usually BooleanArray
+            new_mask = new_mask.to_numpy(dtype=bool, na_value=False)
+        mask = new_mask
+
+    return mask
+
+
+@overload
+def clean_fill_method(
+    method: Literal["ffill", "pad", "bfill", "backfill"],
+    *,
+    allow_nearest: Literal[False] = ...,
+) -> Literal["pad", "backfill"]: ...
+
+
+@overload
+def clean_fill_method(
+    method: Literal["ffill", "pad", "bfill", "backfill", "nearest"],
+    *,
+    allow_nearest: Literal[True],
+) -> Literal["pad", "backfill", "nearest"]: ...
+
+
+def clean_fill_method(
+    method: Literal["ffill", "pad", "bfill", "backfill", "nearest"],
+    *,
+    allow_nearest: bool = False,
+) -> Literal["pad", "backfill", "nearest"]:
+    if isinstance(method, str):
+        # error: Incompatible types in assignment (expression has type "str", variable
+        # has type "Literal['ffill', 'pad', 'bfill', 'backfill', 'nearest']")
+        method = method.lower()  # type: ignore[assignment]
+        if method == "ffill":
+            method = "pad"
+        elif method == "bfill":
+            method = "backfill"
+
+    valid_methods = ["pad", "backfill"]
+    expecting = "pad (ffill) or backfill (bfill)"
+    if allow_nearest:
+        valid_methods.append("nearest")
+        expecting = "pad (ffill), backfill (bfill) or nearest"
+    if method not in valid_methods:
+        raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
+    return method
+
+
+# interpolation methods that dispatch to np.interp
+
+NP_METHODS = ["linear", "time", "index", "values"]
+
+# interpolation methods that dispatch to _interpolate_scipy_wrapper
+
+SP_METHODS = [
+    "nearest",
+    "zero",
+    "slinear",
+    "quadratic",
+    "cubic",
+    "barycentric",
+    "krogh",
+    "spline",
+    "polynomial",
+    "from_derivatives",
+    "piecewise_polynomial",
+    "pchip",
+    "akima",
+    "cubicspline",
+]
+
+
+def clean_interp_method(method: str, index: Index, **kwargs) -> str:
+    order = kwargs.get("order")
+
+    if method in ("spline", "polynomial") and order is None:
+        raise ValueError("You must specify the order of the spline or polynomial.")
+
+    valid = NP_METHODS + SP_METHODS
+    if method not in valid:
+        raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")
+
+    if method in ("krogh", "piecewise_polynomial", "pchip"):
+        if not index.is_monotonic_increasing:
+            raise ValueError(
+                f"{method} interpolation requires that the index be monotonic."
+            )
+
+    return method
+
+
+def find_valid_index(how: str, is_valid: npt.NDArray[np.bool_]) -> int | None:
+    """
+    Retrieves the positional index of the first valid value.
+
+    Parameters
+    ----------
+    how : {'first', 'last'}
+        Use this parameter to change between the first or last valid index.
+    is_valid: np.ndarray
+        Mask to find na_values.
+
+    Returns
+    -------
+    int or None
+    """
+    assert how in ["first", "last"]
+
+    if len(is_valid) == 0:  # early stop
+        return None
+
+    if is_valid.ndim == 2:
+        # reduce axis 1
+        is_valid = is_valid.any(axis=1)  # type: ignore[assignment]
+
+    if how == "first":
+        idxpos = is_valid[::].argmax()
+
+    elif how == "last":
+        idxpos = len(is_valid) - 1 - is_valid[::-1].argmax()
+
+    chk_notna = is_valid[idxpos]
+
+    if not chk_notna:
+        return None
+    # Incompatible return value type (got "signedinteger[Any]",
+    # expected "Optional[int]")
+    return idxpos  # type: ignore[return-value]
+
+
+def validate_limit_direction(
+    limit_direction: str,
+) -> Literal["forward", "backward", "both"]:
+    valid_limit_directions = ["forward", "backward", "both"]
+    limit_direction = limit_direction.lower()
+    if limit_direction not in valid_limit_directions:
+        raise ValueError(
+            "Invalid limit_direction: expecting one of "
+            f"{valid_limit_directions}, got '{limit_direction}'."
+        )
+    # error: Incompatible return value type (got "str", expected
+    # "Literal['forward', 'backward', 'both']")
+    return limit_direction  # type: ignore[return-value]
+
+
+def validate_limit_area(limit_area: str | None) -> Literal["inside", "outside"] | None:
+    if limit_area is not None:
+        valid_limit_areas = ["inside", "outside"]
+        limit_area = limit_area.lower()
+        if limit_area not in valid_limit_areas:
+            raise ValueError(
+                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
+                f"{limit_area}."
+            )
+    # error: Incompatible return value type (got "Optional[str]", expected
+    # "Optional[Literal['inside', 'outside']]")
+    return limit_area  # type: ignore[return-value]
+
+
+def infer_limit_direction(
+    limit_direction: Literal["backward", "forward", "both"] | None, method: str
+) -> Literal["backward", "forward", "both"]:
+    # Set `limit_direction` depending on `method`
+    if limit_direction is None:
+        if method in ("backfill", "bfill"):
+            limit_direction = "backward"
+        else:
+            limit_direction = "forward"
+    else:
+        if method in ("pad", "ffill") and limit_direction != "forward":
+            raise ValueError(
+                f"`limit_direction` must be 'forward' for method `{method}`"
+            )
+        if method in ("backfill", "bfill") and limit_direction != "backward":
+            raise ValueError(
+                f"`limit_direction` must be 'backward' for method `{method}`"
+            )
+    return limit_direction
+
+
+def get_interp_index(method, index: Index) -> Index:
+    # create/use the index
+    if method == "linear":
+        # prior default
+        from pandas import RangeIndex
+
+        index = RangeIndex(len(index))
+    else:
+        methods = {"index", "values", "nearest", "time"}
+        is_numeric_or_datetime = (
+            is_numeric_dtype(index.dtype)
+            or isinstance(index.dtype, DatetimeTZDtype)
+            or lib.is_np_dtype(index.dtype, "mM")
+        )
+        valid = NP_METHODS + SP_METHODS
+        if method in valid:
+            if method not in methods and not is_numeric_or_datetime:
+                raise ValueError(
+                    "Index column must be numeric or datetime type when "
+                    f"using {method} method other than linear. "
+                    "Try setting a numeric or datetime index column before "
+                    "interpolating."
+                )
+        else:
+            raise ValueError(f"Can not interpolate with method={method}.")
+
+    if isna(index).any():
+        raise NotImplementedError(
+            "Interpolation with NaNs in the index "
+            "has not been implemented. Try filling "
+            "those NaNs before interpolating."
+        )
+    return index
+
+
+def interpolate_2d_inplace(
+    data: np.ndarray,  # floating dtype
+    index: Index,
+    axis: AxisInt,
+    method: str = "linear",
+    limit: int | None = None,
+    limit_direction: str = "forward",
+    limit_area: str | None = None,
+    fill_value: Any | None = None,
+    mask=None,
+    **kwargs,
+) -> None:
+    """
+    Column-wise application of _interpolate_1d.
+
+    Notes
+    -----
+    Alters 'data' in-place.
+
+    The signature does differ from _interpolate_1d because it only
+    includes what is needed for Block.interpolate.
+    """
+    # validate the interp method
+    clean_interp_method(method, index, **kwargs)
+
+    if is_valid_na_for_dtype(fill_value, data.dtype):
+        fill_value = na_value_for_dtype(data.dtype, compat=False)
+
+    if method == "time":
+        if not needs_i8_conversion(index.dtype):
+            raise ValueError(
+                "time-weighted interpolation only works "
+                "on Series or DataFrames with a "
+                "DatetimeIndex"
+            )
+        method = "values"
+
+    limit_direction = validate_limit_direction(limit_direction)
+    limit_area_validated = validate_limit_area(limit_area)
+
+    # default limit is unlimited GH #16282
+    limit = algos.validate_limit(nobs=None, limit=limit)
+
+    indices = _index_to_interp_indices(index, method)
+
+    def func(yvalues: np.ndarray) -> None:
+        # process 1-d slices in the axis direction
+
+        _interpolate_1d(
+            indices=indices,
+            yvalues=yvalues,
+            method=method,
+            limit=limit,
+            limit_direction=limit_direction,
+            limit_area=limit_area_validated,
+            fill_value=fill_value,
+            bounds_error=False,
+            mask=mask,
+            **kwargs,
+        )
+
+    np.apply_along_axis(func, axis, data)
+
+
+def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
+    """
+    Convert Index to ndarray of indices to pass to NumPy/SciPy.
+    """
+    xarr = index._values
+    if needs_i8_conversion(xarr.dtype):
+        # GH#1646 for dt64tz
+        xarr = xarr.view("i8")
+
+    if method == "linear":
+        inds = xarr
+        inds = cast(np.ndarray, inds)
+    else:
+        inds = np.asarray(xarr)
+
+        if method in ("values", "index"):
+            if inds.dtype == np.object_:
+                inds = lib.maybe_convert_objects(inds)
+
+    return inds
+
+
+def _interpolate_1d(
+    indices: np.ndarray,
+    yvalues: np.ndarray,
+    method: str = "linear",
+    limit: int | None = None,
+    limit_direction: str = "forward",
+    limit_area: Literal["inside", "outside"] | None = None,
+    fill_value: Any | None = None,
+    bounds_error: bool = False,
+    order: int | None = None,
+    mask=None,
+    **kwargs,
+) -> None:
+    """
+    Logic for the 1-d interpolation.  The input
+    indices and yvalues will each be 1-d arrays of the same length.
+
+    Bounds_error is currently hardcoded to False since non-scipy ones don't
+    take it as an argument.
+
+    Notes
+    -----
+    Fills 'yvalues' in-place.
+    """
+    if mask is not None:
+        invalid = mask
+    else:
+        invalid = isna(yvalues)
+    valid = ~invalid
+
+    if not valid.any():
+        return
+
+    if valid.all():
+        return
+
+    # These index pointers to invalid values... i.e. {0, 1, etc...
+    all_nans = np.flatnonzero(invalid)
+
+    first_valid_index = find_valid_index(how="first", is_valid=valid)
+    if first_valid_index is None:  # no nan found in start
+        first_valid_index = 0
+    start_nans = np.arange(first_valid_index)
+
+    last_valid_index = find_valid_index(how="last", is_valid=valid)
+    if last_valid_index is None:  # no nan found in end
+        last_valid_index = len(yvalues)
+    end_nans = np.arange(1 + last_valid_index, len(valid))
+
+    # preserve_nans contains indices of invalid values,
+    # but in this case, it is the final set of indices that need to be
+    # preserved as NaN after the interpolation.
+
+    # For example if limit_direction='forward' then preserve_nans will
+    # contain indices of NaNs at the beginning of the series, and NaNs that
+    # are more than 'limit' away from the prior non-NaN.
+
+    # set preserve_nans based on direction using _interp_limit
+    if limit_direction == "forward":
+        preserve_nans = np.union1d(start_nans, _interp_limit(invalid, limit, 0))
+    elif limit_direction == "backward":
+        preserve_nans = np.union1d(end_nans, _interp_limit(invalid, 0, limit))
+    else:
+        # both directions... just use _interp_limit
+        preserve_nans = np.unique(_interp_limit(invalid, limit, limit))
+
+    # if limit_area is set, add either mid or outside indices
+    # to preserve_nans GH #16284
+    if limit_area == "inside":
+        # preserve NaNs on the outside
+        preserve_nans = np.union1d(preserve_nans, start_nans)
+        preserve_nans = np.union1d(preserve_nans, end_nans)
+    elif limit_area == "outside":
+        # preserve NaNs on the inside
+        mid_nans = np.setdiff1d(all_nans, start_nans, assume_unique=True)
+        mid_nans = np.setdiff1d(mid_nans, end_nans, assume_unique=True)
+        preserve_nans = np.union1d(preserve_nans, mid_nans)
+
+    is_datetimelike = yvalues.dtype.kind in "mM"
+
+    if is_datetimelike:
+        yvalues = yvalues.view("i8")
+
+    if method in NP_METHODS:
+        # np.interp requires sorted X values, #21037
+
+        indexer = np.argsort(indices[valid])
+        yvalues[invalid] = np.interp(
+            indices[invalid], indices[valid][indexer], yvalues[valid][indexer]
+        )
+    else:
+        yvalues[invalid] = _interpolate_scipy_wrapper(
+            indices[valid],
+            yvalues[valid],
+            indices[invalid],
+            method=method,
+            fill_value=fill_value,
+            bounds_error=bounds_error,
+            order=order,
+            **kwargs,
+        )
+
+    if mask is not None:
+        mask[:] = False
+        mask[preserve_nans] = True
+    elif is_datetimelike:
+        yvalues[preserve_nans] = NaT.value
+    else:
+        yvalues[preserve_nans] = np.nan
+    return
+
+
+def _interpolate_scipy_wrapper(
+    x: np.ndarray,
+    y: np.ndarray,
+    new_x: np.ndarray,
+    method: str,
+    fill_value=None,
+    bounds_error: bool = False,
+    order=None,
+    **kwargs,
+):
+    """
+    Passed off to scipy.interpolate.interp1d. method is scipy's kind.
+    Returns an array interpolated at new_x.  Add any new methods to
+    the list in _clean_interp_method.
+    """
+    extra = f"{method} interpolation requires SciPy."
+    import_optional_dependency("scipy", extra=extra)
+    from scipy import interpolate
+
+    new_x = np.asarray(new_x)
+
+    # ignores some kwargs that could be passed along.
+    alt_methods: dict[str, Callable[..., np.ndarray]] = {
+        "barycentric": interpolate.barycentric_interpolate,
+        "krogh": interpolate.krogh_interpolate,
+        "from_derivatives": _from_derivatives,
+        "piecewise_polynomial": _from_derivatives,
+        "cubicspline": _cubicspline_interpolate,
+        "akima": _akima_interpolate,
+        "pchip": interpolate.pchip_interpolate,
+    }
+
+    interp1d_methods = [
+        "nearest",
+        "zero",
+        "slinear",
+        "quadratic",
+        "cubic",
+        "polynomial",
+    ]
+    terp: Callable[..., np.ndarray] | None
+    if method in interp1d_methods:
+        if method == "polynomial":
+            kind = order
+        else:
+            kind = method
+        terp = interpolate.interp1d(
+            x, y, kind=kind, fill_value=fill_value, bounds_error=bounds_error
+        )
+        new_y = terp(new_x)
+    elif method == "spline":
+        # GH #10633, #24014
+        if isna(order) or (order <= 0):
+            raise ValueError(
+                f"order needs to be specified and greater than 0; got order: {order}"
+            )
+        terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
+        new_y = terp(new_x)
+    else:
+        # GH 7295: need to be able to write for some reason
+        # in some circumstances: check all three
+        if not x.flags.writeable:
+            x = x.copy()
+        if not y.flags.writeable:
+            y = y.copy()
+        if not new_x.flags.writeable:
+            new_x = new_x.copy()
+        terp = alt_methods.get(method, None)
+        if terp is None:
+            raise ValueError(f"Can not interpolate with method={method}.")
+
+        # Make sure downcast is not in kwargs for alt methods
+        kwargs.pop("downcast", None)
+        new_y = terp(x, y, new_x, **kwargs)
+    return new_y
+
+
+def _from_derivatives(
+    xi: np.ndarray,
+    yi: np.ndarray,
+    x: np.ndarray,
+    order=None,
+    der: int | list[int] | None = 0,
+    extrapolate: bool = False,
+):
+    """
+    Convenience function for interpolate.BPoly.from_derivatives.
+
+    Construct a piecewise polynomial in the Bernstein basis, compatible
+    with the specified values and derivatives at breakpoints.
+
+    Parameters
+    ----------
+    xi : array-like
+        sorted 1D array of x-coordinates
+    yi : array-like or list of array-likes
+        yi[i][j] is the j-th derivative known at xi[i]
+    order: None or int or array-like of ints. Default: None.
+        Specifies the degree of local polynomials. If not None, some
+        derivatives are ignored.
+    der : int or list
+        How many derivatives to extract; None for all potentially nonzero
+        derivatives (that is a number equal to the number of points), or a
+        list of derivatives to extract. This number includes the function
+        value as 0th derivative.
+     extrapolate : bool, optional
+        Whether to extrapolate to ouf-of-bounds points based on first and last
+        intervals, or to return NaNs. Default: True.
+
+    See Also
+    --------
+    scipy.interpolate.BPoly.from_derivatives
+
+    Returns
+    -------
+    y : scalar or array-like
+        The result, of length R or length M or M by R.
+    """
+    from scipy import interpolate
+
+    # return the method for compat with scipy version & backwards compat
+    method = interpolate.BPoly.from_derivatives
+    m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)
+
+    return m(x)
+
+
+def _akima_interpolate(
+    xi: np.ndarray,
+    yi: np.ndarray,
+    x: np.ndarray,
+    der: int = 0,
+    axis: AxisInt = 0,
+):
+    """
+    Convenience function for akima interpolation.
+    xi and yi are arrays of values used to approximate some function f,
+    with ``yi = f(xi)``.
+
+    See `Akima1DInterpolator` for details.
+
+    Parameters
+    ----------
+    xi : np.ndarray
+        A sorted list of x-coordinates, of length N.
+    yi : np.ndarray
+        A 1-D array of real values.  `yi`'s length along the interpolation
+        axis must be equal to the length of `xi`. If N-D array, use axis
+        parameter to select correct axis.
+    x : np.ndarray
+        Of length M.
+    der : int, optional
+        How many derivatives to extract. This number includes the function
+        value as 0th derivative.
+    axis : int, optional
+        Axis in the yi array corresponding to the x-coordinate values.
+
+    See Also
+    --------
+    scipy.interpolate.Akima1DInterpolator
+
+    Returns
+    -------
+    y : scalar or array-like
+        The result, of length R or length M or M by R,
+
+    """
+    from scipy import interpolate
+
+    P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
+
+    return P(x, nu=der)
+
+
+def _cubicspline_interpolate(
+    xi: np.ndarray,
+    yi: np.ndarray,
+    x: np.ndarray,
+    axis: AxisInt = 0,
+    bc_type: _CubicBC | tuple[Any, Any] = "not-a-knot",
+    extrapolate: Literal["periodic"] | bool | None = None,
+) -> np.ndarray:
+    """
+    Convenience function for cubic spline data interpolator.
+
+    See `scipy.interpolate.CubicSpline` for details.
+
+    Parameters
+    ----------
+    xi : np.ndarray, shape (n,)
+        1-d array containing values of the independent variable.
+        Values must be real, finite and in strictly increasing order.
+    yi : np.ndarray
+        Array containing values of the dependent variable. It can have
+        arbitrary number of dimensions, but the length along ``axis``
+        (see below) must match the length of ``x``. Values must be finite.
+    x : np.ndarray, shape (m,)
+    axis : int, optional
+        Axis along which `y` is assumed to be varying. Meaning that for
+        ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``.
+        Default is 0.
+    bc_type : string or 2-tuple, optional
+        Boundary condition type. Two additional equations, given by the
+        boundary conditions, are required to determine all coefficients of
+        polynomials on each segment [2]_.
+        If `bc_type` is a string, then the specified condition will be applied
+        at both ends of a spline. Available conditions are:
+        * 'not-a-knot' (default): The first and second segment at a curve end
+          are the same polynomial. It is a good default when there is no
+          information on boundary conditions.
+        * 'periodic': The interpolated functions is assumed to be periodic
+          of period ``x[-1] - x[0]``. The first and last value of `y` must be
+          identical: ``y[0] == y[-1]``. This boundary condition will result in
+          ``y'[0] == y'[-1]`` and ``y''[0] == y''[-1]``.
+        * 'clamped': The first derivative at curves ends are zero. Assuming
+          a 1D `y`, ``bc_type=((1, 0.0), (1, 0.0))`` is the same condition.
+        * 'natural': The second derivative at curve ends are zero. Assuming
+          a 1D `y`, ``bc_type=((2, 0.0), (2, 0.0))`` is the same condition.
+        If `bc_type` is a 2-tuple, the first and the second value will be
+        applied at the curve start and end respectively. The tuple values can
+        be one of the previously mentioned strings (except 'periodic') or a
+        tuple `(order, deriv_values)` allowing to specify arbitrary
+        derivatives at curve ends:
+        * `order`: the derivative order, 1 or 2.
+        * `deriv_value`: array-like containing derivative values, shape must
+          be the same as `y`, excluding ``axis`` dimension. For example, if
+          `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with
+          the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D
+          and have the shape (n0, n1).
+    extrapolate : {bool, 'periodic', None}, optional
+        If bool, determines whether to extrapolate to out-of-bounds points
+        based on first and last intervals, or to return NaNs. If 'periodic',
+        periodic extrapolation is used. If None (default), ``extrapolate`` is
+        set to 'periodic' for ``bc_type='periodic'`` and to True otherwise.
+
+    See Also
+    --------
+    scipy.interpolate.CubicHermiteSpline
+
+    Returns
+    -------
+    y : scalar or array-like
+        The result, of shape (m,)
+
+    References
+    ----------
+    .. [1] `Cubic Spline Interpolation
+            <https://en.wikiversity.org/wiki/Cubic_Spline_Interpolation>`_
+            on Wikiversity.
+    .. [2] Carl de Boor, "A Practical Guide to Splines", Springer-Verlag, 1978.
+    """
+    from scipy import interpolate
+
+    P = interpolate.CubicSpline(
+        xi, yi, axis=axis, bc_type=bc_type, extrapolate=extrapolate
+    )
+
+    return P(x)
+
+
+def pad_or_backfill_inplace(
+    values: np.ndarray,
+    method: Literal["pad", "backfill"] = "pad",
+    axis: AxisInt = 0,
+    limit: int | None = None,
+    limit_area: Literal["inside", "outside"] | None = None,
+) -> None:
+    """
+    Perform an actual interpolation of values, values will be make 2-d if
+    needed fills inplace, returns the result.
+
+    Parameters
+    ----------
+    values: np.ndarray
+        Input array.
+    method: str, default "pad"
+        Interpolation method. Could be "bfill" or "pad"
+    axis: 0 or 1
+        Interpolation axis
+    limit: int, optional
+        Index limit on interpolation.
+    limit_area: str, optional
+        Limit area for interpolation. Can be "inside" or "outside"
+
+    Notes
+    -----
+    Modifies values in-place.
+    """
+    transf = (lambda x: x) if axis == 0 else (lambda x: x.T)
+
+    # reshape a 1 dim if needed
+    if values.ndim == 1:
+        if axis != 0:  # pragma: no cover
+            raise AssertionError("cannot interpolate on an ndim == 1 with axis != 0")
+        values = values.reshape((1, *values.shape))
+
+    method = clean_fill_method(method)
+    tvalues = transf(values)
+
+    func = get_fill_func(method, ndim=2)
+    # _pad_2d and _backfill_2d both modify tvalues inplace
+    func(tvalues, limit=limit, limit_area=limit_area)
+
+
+def _fillna_prep(
+    values, mask: npt.NDArray[np.bool_] | None = None
+) -> npt.NDArray[np.bool_]:
+    # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
+
+    if mask is None:
+        mask = isna(values)
+
+    return mask
+
+
+def _datetimelike_compat(func: F) -> F:
+    """
+    Wrapper to handle datetime64 and timedelta64 dtypes.
+    """
+
+    @wraps(func)
+    def new_func(
+        values,
+        limit: int | None = None,
+        limit_area: Literal["inside", "outside"] | None = None,
+        mask=None,
+    ):
+        if needs_i8_conversion(values.dtype):
+            if mask is None:
+                # This needs to occur before casting to int64
+                mask = isna(values)
+
+            result, mask = func(
+                values.view("i8"), limit=limit, limit_area=limit_area, mask=mask
+            )
+            return result.view(values.dtype), mask
+
+        return func(values, limit=limit, limit_area=limit_area, mask=mask)
+
+    return cast(F, new_func)
+
+
+@_datetimelike_compat
+def _pad_1d(
+    values: np.ndarray,
+    limit: int | None = None,
+    limit_area: Literal["inside", "outside"] | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
+    mask = _fillna_prep(values, mask)
+    if limit_area is not None and not mask.all():
+        _fill_limit_area_1d(mask, limit_area)
+    algos.pad_inplace(values, mask, limit=limit)
+    return values, mask
+
+
+@_datetimelike_compat
+def _backfill_1d(
+    values: np.ndarray,
+    limit: int | None = None,
+    limit_area: Literal["inside", "outside"] | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
+    mask = _fillna_prep(values, mask)
+    if limit_area is not None and not mask.all():
+        _fill_limit_area_1d(mask, limit_area)
+    algos.backfill_inplace(values, mask, limit=limit)
+    return values, mask
+
+
+@_datetimelike_compat
+def _pad_2d(
+    values: np.ndarray,
+    limit: int | None = None,
+    limit_area: Literal["inside", "outside"] | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
+    mask = _fillna_prep(values, mask)
+    if limit_area is not None:
+        _fill_limit_area_2d(mask, limit_area)
+
+    if values.size:
+        algos.pad_2d_inplace(values, mask, limit=limit)
+    return values, mask
+
+
+@_datetimelike_compat
+def _backfill_2d(
+    values,
+    limit: int | None = None,
+    limit_area: Literal["inside", "outside"] | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+):
+    mask = _fillna_prep(values, mask)
+    if limit_area is not None:
+        _fill_limit_area_2d(mask, limit_area)
+
+    if values.size:
+        algos.backfill_2d_inplace(values, mask, limit=limit)
+    else:
+        # for test coverage
+        pass
+    return values, mask
+
+
+def _fill_limit_area_1d(
+    mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+    """Prepare 1d mask for ffill/bfill with limit_area.
+
+    Caller is responsible for checking at least one value of mask is False.
+    When called, mask will no longer faithfully represent when
+    the corresponding are NA or not.
+
+    Parameters
+    ----------
+    mask : np.ndarray[bool, ndim=1]
+        Mask representing NA values when filling.
+    limit_area : { "outside", "inside" }
+        Whether to limit filling to outside or inside the outer most non-NA value.
+    """
+    neg_mask = ~mask
+    first = neg_mask.argmax()
+    last = len(neg_mask) - neg_mask[::-1].argmax() - 1
+    if limit_area == "inside":
+        mask[:first] = False
+        mask[last + 1 :] = False
+    elif limit_area == "outside":
+        mask[first + 1 : last] = False
+
+
+def _fill_limit_area_2d(
+    mask: npt.NDArray[np.bool_], limit_area: Literal["outside", "inside"]
+) -> None:
+    """Prepare 2d mask for ffill/bfill with limit_area.
+
+    When called, mask will no longer faithfully represent when
+    the corresponding are NA or not.
+
+    Parameters
+    ----------
+    mask : np.ndarray[bool, ndim=1]
+        Mask representing NA values when filling.
+    limit_area : { "outside", "inside" }
+        Whether to limit filling to outside or inside the outer most non-NA value.
+    """
+    neg_mask = ~mask.T
+    if limit_area == "outside":
+        # Identify inside
+        la_mask = (
+            np.maximum.accumulate(neg_mask, axis=0)
+            & np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+        )
+    else:
+        # Identify outside
+        la_mask = (
+            ~np.maximum.accumulate(neg_mask, axis=0)
+            | ~np.maximum.accumulate(neg_mask[::-1], axis=0)[::-1]
+        )
+    mask[la_mask.T] = False
+
+
+_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}
+
+
+def get_fill_func(method, ndim: int = 1):
+    method = clean_fill_method(method)
+    if ndim == 1:
+        return _fill_methods[method]
+    return {"pad": _pad_2d, "backfill": _backfill_2d}[method]
+
+
+def clean_reindex_fill_method(method) -> ReindexMethod | None:
+    if method is None:
+        return None
+    return clean_fill_method(method, allow_nearest=True)
+
+
+def _interp_limit(
+    invalid: npt.NDArray[np.bool_], fw_limit: int | None, bw_limit: int | None
+) -> np.ndarray:
+    """
+    Get indexers of values that won't be filled
+    because they exceed the limits.
+
+    Parameters
+    ----------
+    invalid : np.ndarray[bool]
+    fw_limit : int or None
+        forward limit to index
+    bw_limit : int or None
+        backward limit to index
+
+    Returns
+    -------
+    set of indexers
+
+    Notes
+    -----
+    This is equivalent to the more readable, but slower
+
+    .. code-block:: python
+
+        def _interp_limit(invalid, fw_limit, bw_limit):
+            for x in np.where(invalid)[0]:
+                if invalid[max(0, x - fw_limit) : x + bw_limit + 1].all():
+                    yield x
+    """
+    # handle forward first; the backward direction is the same except
+    # 1. operate on the reversed array
+    # 2. subtract the returned indices from N - 1
+    N = len(invalid)
+    f_idx = np.array([], dtype=np.int64)
+    b_idx = np.array([], dtype=np.int64)
+    assume_unique = True
+
+    def inner(invalid, limit: int):
+        limit = min(limit, N)
+        windowed = np.lib.stride_tricks.sliding_window_view(invalid, limit + 1).all(1)
+        idx = np.union1d(
+            np.where(windowed)[0] + limit,
+            np.where((~invalid[: limit + 1]).cumsum() == 0)[0],
+        )
+        return idx
+
+    if fw_limit is not None:
+        if fw_limit == 0:
+            f_idx = np.where(invalid)[0]
+            assume_unique = False
+        else:
+            f_idx = inner(invalid, fw_limit)
+
+    if bw_limit is not None:
+        if bw_limit == 0:
+            # then we don't even need to care about backwards
+            # just use forwards
+            return f_idx
+        else:
+            b_idx = N - 1 - inner(invalid[::-1], bw_limit)
+            if fw_limit == 0:
+                return b_idx
+
+    return np.intersect1d(f_idx, b_idx, assume_unique=assume_unique)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
new file mode 100644
index 0000000000000000000000000000000000000000..9743ce10758441e54b62cfe91d6fce6479f4be66
--- /dev/null
+++ b/pandas/core/nanops.py
@@ -0,0 +1,1777 @@
+from __future__ import annotations
+
+import functools
+import itertools
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    cast,
+)
+import warnings
+
+import numpy as np
+
+from pandas._config import get_option
+
+from pandas._libs import (
+    NaT,
+    NaTType,
+    iNaT,
+    lib,
+)
+from pandas._typing import (
+    ArrayLike,
+    AxisInt,
+    CorrelationMethod,
+    Dtype,
+    DtypeObj,
+    F,
+    Scalar,
+    Shape,
+    npt,
+)
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.dtypes.common import (
+    is_complex,
+    is_float,
+    is_float_dtype,
+    is_integer,
+    is_numeric_dtype,
+    is_object_dtype,
+    needs_i8_conversion,
+    pandas_dtype,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    na_value_for_dtype,
+    notna,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+bn = import_optional_dependency("bottleneck", errors="warn")
+_BOTTLENECK_INSTALLED = bn is not None
+_USE_BOTTLENECK = False
+
+
+def set_use_bottleneck(v: bool = True) -> None:
+    # set/unset to use bottleneck
+    global _USE_BOTTLENECK
+    if _BOTTLENECK_INSTALLED:
+        _USE_BOTTLENECK = v
+
+
+set_use_bottleneck(get_option("compute.use_bottleneck"))
+
+
+class disallow:
+    def __init__(self, *dtypes: Dtype) -> None:
+        super().__init__()
+        self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
+
+    def check(self, obj) -> bool:
+        return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
+
+    def __call__(self, f: F) -> F:
+        @functools.wraps(f)
+        def _f(*args, **kwargs):
+            obj_iter = itertools.chain(args, kwargs.values())
+            if any(self.check(obj) for obj in obj_iter):
+                f_name = f.__name__.replace("nan", "")
+                raise TypeError(
+                    f"reduction operation '{f_name}' not allowed for this dtype"
+                )
+            try:
+                return f(*args, **kwargs)
+            except ValueError as e:
+                # we want to transform an object array
+                # ValueError message to the more typical TypeError
+                # e.g. this is normally a disallowed function on
+                # object arrays that contain strings
+                if is_object_dtype(args[0]):
+                    raise TypeError(e) from e
+                raise
+
+        return cast(F, _f)
+
+
+class bottleneck_switch:
+    def __init__(self, name=None, **kwargs) -> None:
+        self.name = name
+        self.kwargs = kwargs
+
+    def __call__(self, alt: F) -> F:
+        bn_name = self.name or alt.__name__
+
+        try:
+            bn_func = getattr(bn, bn_name)
+        except (AttributeError, NameError):  # pragma: no cover
+            bn_func = None
+
+        @functools.wraps(alt)
+        def f(
+            values: np.ndarray,
+            *,
+            axis: AxisInt | None = None,
+            skipna: bool = True,
+            **kwds,
+        ):
+            if len(self.kwargs) > 0:
+                for k, v in self.kwargs.items():
+                    if k not in kwds:
+                        kwds[k] = v
+
+            if values.size == 0 and kwds.get("min_count") is None:
+                # We are empty, returning NA for our type
+                # Only applies for the default `min_count` of None
+                # since that affects how empty arrays are handled.
+                # TODO(GH-18976) update all the nanops methods to
+                # correctly handle empty inputs and remove this check.
+                # It *may* just be `var`
+                return _na_for_min_count(values, axis)
+
+            if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
+                if kwds.get("mask", None) is None:
+                    # `mask` is not recognised by bottleneck, would raise
+                    #  TypeError if called
+                    kwds.pop("mask", None)
+                    result = bn_func(values, axis=axis, **kwds)
+
+                    # prefer to treat inf/-inf as NA, but must compute the func
+                    # twice :(
+                    if _has_infs(result):
+                        result = alt(values, axis=axis, skipna=skipna, **kwds)
+                else:
+                    result = alt(values, axis=axis, skipna=skipna, **kwds)
+            else:
+                result = alt(values, axis=axis, skipna=skipna, **kwds)
+
+            return result
+
+        return cast(F, f)
+
+
+def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
+    # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
+    if dtype != object and not needs_i8_conversion(dtype):
+        # GH 42878
+        # Bottleneck uses naive summation leading to O(n) loss of precision
+        # unlike numpy which implements pairwise summation, which has O(log(n)) loss
+        # crossref: https://github.com/pydata/bottleneck/issues/379
+
+        # GH 15507
+        # bottleneck does not properly upcast during the sum
+        # so can overflow
+
+        # GH 9422
+        # further we also want to preserve NaN when all elements
+        # are NaN, unlike bottleneck/numpy which consider this
+        # to be 0
+        return name not in ["nansum", "nanprod", "nanmean"]
+    return False
+
+
+def _has_infs(result) -> bool:
+    if isinstance(result, np.ndarray):
+        if result.dtype in ("f8", "f4"):
+            # Note: outside of a nanops-specific test, we always have
+            #  result.ndim == 1, so there is no risk of this ravel making a copy.
+            return lib.has_infs(result.ravel("K"))
+    try:
+        return np.isinf(result).any()
+    except (TypeError, NotImplementedError):
+        # if it doesn't support infs, then it can't have infs
+        return False
+
+
+def _get_fill_value(
+    dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None
+):
+    """return the correct fill value for the dtype of the values"""
+    if fill_value is not None:
+        return fill_value
+    if _na_ok_dtype(dtype):
+        if fill_value_typ is None:
+            return np.nan
+        elif fill_value_typ == "+inf":
+            return np.inf
+        else:
+            return -np.inf
+    elif fill_value_typ == "+inf":
+        # need the max int here
+        return lib.i8max
+    else:
+        return iNaT
+
+
+def _maybe_get_mask(
+    values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None
+) -> npt.NDArray[np.bool_] | None:
+    """
+    Compute a mask if and only if necessary.
+
+    This function will compute a mask iff it is necessary. Otherwise,
+    return the provided mask (potentially None) when a mask does not need to be
+    computed.
+
+    A mask is never necessary if the values array is of boolean or integer
+    dtypes, as these are incapable of storing NaNs. If passing a NaN-capable
+    dtype that is interpretable as either boolean or integer data (eg,
+    timedelta64), a mask must be provided.
+
+    If the skipna parameter is False, a new mask will not be computed.
+
+    The mask is computed using isna() by default. Setting invert=True selects
+    notna() as the masking function.
+
+    Parameters
+    ----------
+    values : ndarray
+        input array to potentially compute mask for
+    skipna : bool
+        boolean for whether NaNs should be skipped
+    mask : Optional[ndarray]
+        nan-mask if known
+
+    Returns
+    -------
+    Optional[np.ndarray[bool]]
+    """
+    if mask is None:
+        if values.dtype.kind in "biu":
+            # Boolean data cannot contain nulls, so signal via mask being None
+            return None
+
+        if skipna or values.dtype.kind in "mM":
+            mask = isna(values)
+
+    return mask
+
+
+def _get_values(
+    values: np.ndarray,
+    skipna: bool,
+    fill_value: Any = None,
+    fill_value_typ: str | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None]:
+    """
+    Utility to get the values view, mask, dtype, dtype_max, and fill_value.
+
+    If both mask and fill_value/fill_value_typ are not None and skipna is True,
+    the values array will be copied.
+
+    For input arrays of boolean or integer dtypes, copies will only occur if a
+    precomputed mask, a fill_value/fill_value_typ, and skipna=True are
+    provided.
+
+    Parameters
+    ----------
+    values : ndarray
+        input array to potentially compute mask for
+    skipna : bool
+        boolean for whether NaNs should be skipped
+    fill_value : Any
+        value to fill NaNs with
+    fill_value_typ : str
+        Set to '+inf' or '-inf' to handle dtype-specific infinities
+    mask : Optional[np.ndarray[bool]]
+        nan-mask if known
+
+    Returns
+    -------
+    values : ndarray
+        Potential copy of input value array
+    mask : Optional[ndarray[bool]]
+        Mask for values, if deemed necessary to compute
+    """
+    # In _get_values is only called from within nanops, and in all cases
+    #  with scalar fill_value.  This guarantee is important for the
+    #  np.where call below
+
+    mask = _maybe_get_mask(values, skipna, mask)
+
+    dtype = values.dtype
+
+    datetimelike = False
+    if values.dtype.kind in "mM":
+        # changing timedelta64/datetime64 to int64 needs to happen after
+        #  finding `mask` above
+        values = np.asarray(values.view("i8"))
+        datetimelike = True
+
+    if skipna and (mask is not None):
+        # get our fill value (in case we need to provide an alternative
+        # dtype for it)
+        fill_value = _get_fill_value(
+            dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
+        )
+
+        if fill_value is not None:
+            if mask.any():
+                if datetimelike or _na_ok_dtype(dtype):
+                    values = values.copy()
+                    np.putmask(values, mask, fill_value)
+                else:
+                    # np.where will promote if needed
+                    values = np.where(~mask, values, fill_value)
+
+    return values, mask
+
+
+def _get_dtype_max(dtype: np.dtype) -> np.dtype:
+    # return a platform independent precision dtype
+    dtype_max = dtype
+    if dtype.kind in "bi":
+        dtype_max = np.dtype(np.int64)
+    elif dtype.kind == "u":
+        dtype_max = np.dtype(np.uint64)
+    elif dtype.kind == "f":
+        dtype_max = np.dtype(np.float64)
+    return dtype_max
+
+
+def _na_ok_dtype(dtype: DtypeObj) -> bool:
+    if needs_i8_conversion(dtype):
+        return False
+    return not issubclass(dtype.type, np.integer)
+
+
+def _wrap_results(result, dtype: np.dtype, fill_value=None):
+    """wrap our results if needed"""
+    if result is NaT:
+        pass
+
+    elif dtype.kind == "M":
+        if fill_value is None:
+            # GH#24293
+            fill_value = iNaT
+        if not isinstance(result, np.ndarray):
+            assert not isna(fill_value), "Expected non-null fill_value"
+            if result == fill_value:
+                result = np.nan
+
+            if isna(result):
+                result = np.datetime64("NaT", "ns").astype(dtype)
+            else:
+                result = np.int64(result).view(dtype)
+            # retain original unit
+            result = result.astype(dtype, copy=False)
+        else:
+            # If we have float dtype, taking a view will give the wrong result
+            result = result.astype(dtype)
+    elif dtype.kind == "m":
+        if not isinstance(result, np.ndarray):
+            if result == fill_value or np.isnan(result):
+                result = np.timedelta64("NaT").astype(dtype)
+
+            elif np.fabs(result) > lib.i8max:
+                # raise if we have a timedelta64[ns] which is too large
+                raise ValueError("overflow in timedelta operation")
+            else:
+                # return a timedelta64 with the original unit
+                result = np.int64(result).astype(dtype, copy=False)
+
+        else:
+            result = result.astype("m8[ns]").view(dtype)
+
+    return result
+
+
+def _datetimelike_compat(func: F) -> F:
+    """
+    If we have datetime64 or timedelta64 values, ensure we have a correct
+    mask before calling the wrapped function, then cast back afterwards.
+    """
+
+    @functools.wraps(func)
+    def new_func(
+        values: np.ndarray,
+        *,
+        axis: AxisInt | None = None,
+        skipna: bool = True,
+        mask: npt.NDArray[np.bool_] | None = None,
+        **kwargs,
+    ):
+        orig_values = values
+
+        datetimelike = values.dtype.kind in "mM"
+        if datetimelike and mask is None:
+            mask = isna(values)
+
+        result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
+
+        if datetimelike:
+            result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
+            if not skipna:
+                assert mask is not None  # checked above
+                result = _mask_datetimelike_result(result, axis, mask, orig_values)
+
+        return result
+
+    return cast(F, new_func)
+
+
+def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.ndarray:
+    """
+    Return the missing value for `values`.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int or None
+        axis for the reduction, required if values.ndim > 1.
+
+    Returns
+    -------
+    result : scalar or ndarray
+        For 1-D values, returns a scalar of the correct missing type.
+        For 2-D values, returns a 1-D array where each element is missing.
+    """
+    # we either return np.nan or pd.NaT
+    if values.dtype.kind in "iufcb":
+        values = values.astype("float64")
+    fill_value = na_value_for_dtype(values.dtype)
+
+    if values.ndim == 1:
+        return fill_value
+    elif axis is None:
+        return fill_value
+    else:
+        result_shape = values.shape[:axis] + values.shape[axis + 1 :]
+
+        return np.full(result_shape, fill_value, dtype=values.dtype)
+
+
+def maybe_operate_rowwise(func: F) -> F:
+    """
+    NumPy operations on C-contiguous ndarrays with axis=1 can be
+    very slow if axis 1 >> axis 0.
+    Operate row-by-row and concatenate the results.
+    """
+
+    @functools.wraps(func)
+    def newfunc(values: np.ndarray, *, axis: AxisInt | None = None, **kwargs):
+        if (
+            axis == 1
+            and values.ndim == 2
+            and values.flags["C_CONTIGUOUS"]
+            # only takes this path for wide arrays (long dataframes), for threshold see
+            # https://github.com/pandas-dev/pandas/pull/43311#issuecomment-974891737
+            and (values.shape[1] / 1000) > values.shape[0]
+            and values.dtype not in (object, bool)
+        ):
+            arrs = list(values)
+            if kwargs.get("mask") is not None:
+                mask = kwargs.pop("mask")
+                results = [
+                    func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs))
+                ]
+            else:
+                results = [func(x, **kwargs) for x in arrs]
+            return np.array(results)
+
+        return func(values, axis=axis, **kwargs)
+
+    return cast(F, newfunc)
+
+
+def nanany(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> bool:
+    """
+    Check if any elements along an axis evaluate to True.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : bool
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, 2])
+    >>> nanops.nanany(s.values)
+    np.True_
+
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([np.nan])
+    >>> nanops.nanany(s.values)
+    np.False_
+    """
+    if values.dtype.kind in "iub" and mask is None:
+        # GH#26032 fastpath
+        # error: Incompatible return value type (got "Union[bool_, ndarray]",
+        # expected "bool")
+        return values.any(axis)  # type: ignore[return-value]
+
+    if values.dtype.kind == "M":
+        # GH#34479
+        raise TypeError("datetime64 type does not support operation 'any'")
+
+    values, _ = _get_values(values, skipna, fill_value=False, mask=mask)
+
+    # For object type, any won't necessarily return
+    # boolean values (numpy/numpy#4352)
+    if values.dtype == object:
+        values = values.astype(bool)
+
+    # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
+    # "bool")
+    return values.any(axis)  # type: ignore[return-value]
+
+
+def nanall(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> bool:
+    """
+    Check if all elements along an axis evaluate to True.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : bool
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, 2, np.nan])
+    >>> nanops.nanall(s.values)
+    np.True_
+
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, 0])
+    >>> nanops.nanall(s.values)
+    np.False_
+    """
+    if values.dtype.kind in "iub" and mask is None:
+        # GH#26032 fastpath
+        # error: Incompatible return value type (got "Union[bool_, ndarray]",
+        # expected "bool")
+        return values.all(axis)  # type: ignore[return-value]
+
+    if values.dtype.kind == "M":
+        # GH#34479
+        raise TypeError("datetime64 type does not support operation 'all'")
+
+    values, _ = _get_values(values, skipna, fill_value=True, mask=mask)
+
+    # For object type, all won't necessarily return
+    # boolean values (numpy/numpy#4352)
+    if values.dtype == object:
+        values = values.astype(bool)
+
+    # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
+    # "bool")
+    return values.all(axis)  # type: ignore[return-value]
+
+
+@disallow("M8")
+@_datetimelike_compat
+@maybe_operate_rowwise
+def nansum(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    min_count: int = 0,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> npt.NDArray[np.floating] | float | NaTType:
+    """
+    Sum the elements along an axis ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray[dtype]
+    axis : int, optional
+    skipna : bool, default True
+    min_count: int, default 0
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : dtype
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, 2, np.nan])
+    >>> nanops.nansum(s.values)
+    np.float64(3.0)
+    """
+    dtype = values.dtype
+    values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
+    dtype_sum = _get_dtype_max(dtype)
+    if dtype.kind == "f":
+        dtype_sum = dtype
+    elif dtype.kind == "m":
+        dtype_sum = np.dtype(np.float64)
+
+    the_sum = values.sum(axis, dtype=dtype_sum)
+    the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
+
+    return the_sum
+
+
+def _mask_datetimelike_result(
+    result: np.ndarray | np.datetime64 | np.timedelta64,
+    axis: AxisInt | None,
+    mask: npt.NDArray[np.bool_],
+    orig_values: np.ndarray,
+) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType:
+    if isinstance(result, np.ndarray):
+        # we need to apply the mask
+        result = result.astype("i8").view(orig_values.dtype)
+        axis_mask = mask.any(axis=axis)
+        result[axis_mask] = iNaT
+    elif mask.any():
+        return np.int64(iNaT).view(orig_values.dtype)
+    return result
+
+
+@bottleneck_switch()
+@_datetimelike_compat
+def nanmean(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> float:
+    """
+    Compute the mean of the element along an axis ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, 2, np.nan])
+    >>> nanops.nanmean(s.values)
+    np.float64(1.5)
+    """
+    if values.dtype == object and len(values) > 1_000 and mask is None:
+        # GH#54754 if we are going to fail, try to fail-fast
+        nanmean(values[:1000], axis=axis, skipna=skipna)
+
+    dtype = values.dtype
+    values, mask = _get_values(values, skipna, fill_value=0, mask=mask)
+    dtype_sum = _get_dtype_max(dtype)
+    dtype_count = np.dtype(np.float64)
+
+    # not using needs_i8_conversion because that includes period
+    if dtype.kind in "mM":
+        dtype_sum = np.dtype(np.float64)
+    elif dtype.kind in "iu":
+        dtype_sum = np.dtype(np.float64)
+    elif dtype.kind == "f":
+        dtype_sum = dtype
+        dtype_count = dtype
+
+    count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
+    the_sum = values.sum(axis, dtype=dtype_sum)
+    the_sum = _ensure_numeric(the_sum)
+
+    if axis is not None and getattr(the_sum, "ndim", False):
+        count = cast(np.ndarray, count)
+        with np.errstate(all="ignore"):
+            # suppress division by zero warnings
+            the_mean = the_sum / count
+        ct_mask = count == 0
+        if ct_mask.any():
+            the_mean[ct_mask] = np.nan
+    else:
+        the_mean = the_sum / count if count > 0 else np.nan
+
+    return the_mean
+
+
+@bottleneck_switch()
+def nanmedian(
+    values: np.ndarray, *, axis: AxisInt | None = None, skipna: bool = True, mask=None
+) -> float | np.ndarray:
+    """
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float | ndarray
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, np.nan, 2, 2])
+    >>> nanops.nanmedian(s.values)
+    2.0
+
+    >>> s = pd.Series([np.nan, np.nan, np.nan])
+    >>> nanops.nanmedian(s.values)
+    nan
+    """
+    # for floats without mask, the data already uses NaN as missing value
+    # indicator, and `mask` will be calculated from that below -> in those
+    # cases we never need to set NaN to the masked values
+    using_nan_sentinel = values.dtype.kind == "f" and mask is None
+
+    def get_median(x: np.ndarray, _mask=None):
+        if _mask is None:
+            _mask = notna(x)
+        else:
+            _mask = ~_mask
+        if not skipna and not _mask.all():
+            return np.nan
+        with warnings.catch_warnings():
+            # Suppress RuntimeWarning about All-NaN slice
+            warnings.filterwarnings(
+                "ignore", "All-NaN slice encountered", RuntimeWarning
+            )
+            warnings.filterwarnings("ignore", "Mean of empty slice", RuntimeWarning)
+            res = np.nanmedian(x[_mask])
+        return res
+
+    dtype = values.dtype
+    values, mask = _get_values(values, skipna, mask=mask, fill_value=None)
+    if values.dtype.kind != "f":
+        if values.dtype == object:
+            # GH#34671 avoid casting strings to numeric
+            inferred = lib.infer_dtype(values)
+            if inferred in ["string", "mixed"]:
+                raise TypeError(f"Cannot convert {values} to numeric")
+        try:
+            values = values.astype("f8")
+        except ValueError as err:
+            # e.g. "could not convert string to float: 'a'"
+            raise TypeError(str(err)) from err
+    if not using_nan_sentinel and mask is not None:
+        if not values.flags.writeable:
+            values = values.copy()
+        values[mask] = np.nan
+
+    notempty = values.size
+
+    res: float | np.ndarray
+
+    # an array from a frame
+    if values.ndim > 1 and axis is not None:
+        # there's a non-empty array to apply over otherwise numpy raises
+        if notempty:
+            if not skipna:
+                res = np.apply_along_axis(get_median, axis, values)
+
+            else:
+                # fastpath for the skipna case
+                with warnings.catch_warnings():
+                    # Suppress RuntimeWarning about All-NaN slice
+                    warnings.filterwarnings(
+                        "ignore", "All-NaN slice encountered", RuntimeWarning
+                    )
+                    if (values.shape[1] == 1 and axis == 0) or (
+                        values.shape[0] == 1 and axis == 1
+                    ):
+                        # GH52788: fastpath when squeezable, nanmedian for 2D array slow
+                        res = np.nanmedian(np.squeeze(values), keepdims=True)
+                    else:
+                        res = np.nanmedian(values, axis=axis)
+
+        else:
+            # must return the correct shape, but median is not defined for the
+            # empty set so return nans of shape "everything but the passed axis"
+            # since "axis" is where the reduction would occur if we had a nonempty
+            # array
+            res = _get_empty_reduction_result(values.shape, axis)
+
+    else:
+        # otherwise return a scalar value
+        res = get_median(values, mask) if notempty else np.nan
+    return _wrap_results(res, dtype)
+
+
+def _get_empty_reduction_result(
+    shape: Shape,
+    axis: AxisInt,
+) -> np.ndarray:
+    """
+    The result from a reduction on an empty ndarray.
+
+    Parameters
+    ----------
+    shape : Tuple[int, ...]
+    axis : int
+
+    Returns
+    -------
+    np.ndarray
+    """
+    shp = np.array(shape)
+    dims = np.arange(len(shape))
+    ret = np.empty(shp[dims != axis], dtype=np.float64)
+    ret.fill(np.nan)
+    return ret
+
+
+def _get_counts_nanvar(
+    values_shape: Shape,
+    mask: npt.NDArray[np.bool_] | None,
+    axis: AxisInt | None,
+    ddof: int,
+    dtype: np.dtype = np.dtype(np.float64),
+) -> tuple[float | np.ndarray, float | np.ndarray]:
+    """
+    Get the count of non-null values along an axis, accounting
+    for degrees of freedom.
+
+    Parameters
+    ----------
+    values_shape : Tuple[int, ...]
+        shape tuple from values ndarray, used if mask is None
+    mask : Optional[ndarray[bool]]
+        locations in values that should be considered missing
+    axis : Optional[int]
+        axis to count along
+    ddof : int
+        degrees of freedom
+    dtype : type, optional
+        type to use for count
+
+    Returns
+    -------
+    count : int, np.nan or np.ndarray
+    d : int, np.nan or np.ndarray
+    """
+    count = _get_counts(values_shape, mask, axis, dtype=dtype)
+    d = count - dtype.type(ddof)
+
+    # always return NaN, never inf
+    if is_float(count):
+        if count <= ddof:
+            # error: Incompatible types in assignment (expression has type
+            # "float", variable has type "Union[floating[Any], ndarray[Any,
+            # dtype[floating[Any]]]]")
+            count = np.nan  # type: ignore[assignment]
+            d = np.nan
+    else:
+        # count is not narrowed by is_float check
+        count = cast(np.ndarray, count)
+        mask = count <= ddof
+        if mask.any():
+            np.putmask(d, mask, np.nan)
+            np.putmask(count, mask, np.nan)
+    return count, d
+
+
+@bottleneck_switch(ddof=1)
+def nanstd(
+    values,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    ddof: int = 1,
+    mask=None,
+):
+    """
+    Compute the standard deviation along given axis while ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    ddof : int, default 1
+        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+        where N represents the number of elements.
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, np.nan, 2, 3])
+    >>> nanops.nanstd(s.values)
+    1.0
+    """
+    if values.dtype.kind == "M":
+        unit = np.datetime_data(values.dtype)[0]
+        values = values.view(f"m8[{unit}]")
+
+    orig_dtype = values.dtype
+    values, mask = _get_values(values, skipna, mask=mask)
+
+    result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
+    return _wrap_results(result, orig_dtype)
+
+
+@disallow("M8", "m8")
+@bottleneck_switch(ddof=1)
+def nanvar(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    ddof: int = 1,
+    mask=None,
+):
+    """
+    Compute the variance along given axis while ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    ddof : int, default 1
+        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+        where N represents the number of elements.
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, np.nan, 2, 3])
+    >>> nanops.nanvar(s.values)
+    1.0
+    """
+    dtype = values.dtype
+    mask = _maybe_get_mask(values, skipna, mask)
+    if dtype.kind in "iu":
+        values = values.astype("f8")
+        if mask is not None:
+            values[mask] = np.nan
+
+    if values.dtype.kind == "f":
+        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
+    else:
+        count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
+
+    if skipna and mask is not None:
+        values = values.copy()
+        np.putmask(values, mask, 0)
+
+    # xref GH10242
+    # Compute variance via two-pass algorithm, which is stable against
+    # cancellation errors and relatively accurate for small numbers of
+    # observations.
+    #
+    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
+    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
+    if axis is not None:
+        avg = np.expand_dims(avg, axis)
+    if values.dtype.kind == "c":
+        # Need to use absolute value for complex numbers.
+        sqr = _ensure_numeric(abs(avg - values) ** 2)
+    else:
+        sqr = _ensure_numeric((avg - values) ** 2)
+    if mask is not None:
+        np.putmask(sqr, mask, 0)
+    result = sqr.sum(axis=axis, dtype=np.float64) / d
+
+    # Return variance as np.float64 (the datatype used in the accumulator),
+    # unless we were dealing with a float array, in which case use the same
+    # precision as the original values array.
+    if dtype.kind == "f":
+        result = result.astype(dtype, copy=False)
+    return result
+
+
+@disallow("M8", "m8")
+def nansem(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    ddof: int = 1,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> float:
+    """
+    Compute the standard error in the mean along given axis while ignoring NaNs
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    ddof : int, default 1
+        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+        where N represents the number of elements.
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float64
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, np.nan, 2, 3])
+    >>> nanops.nansem(s.values)
+     np.float64(0.5773502691896258)
+    """
+    # This checks if non-numeric-like data is passed with numeric_only=False
+    # and raises a TypeError otherwise
+    nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
+
+    mask = _maybe_get_mask(values, skipna, mask)
+    if values.dtype.kind != "f":
+        values = values.astype("f8")
+
+    if not skipna and mask is not None and mask.any():
+        return np.nan
+
+    count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
+    var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
+
+    return np.sqrt(var) / np.sqrt(count)
+
+
+def _nanminmax(meth, fill_value_typ):
+    @bottleneck_switch(name=f"nan{meth}")
+    @_datetimelike_compat
+    def reduction(
+        values: np.ndarray,
+        *,
+        axis: AxisInt | None = None,
+        skipna: bool = True,
+        mask: npt.NDArray[np.bool_] | None = None,
+    ):
+        if values.size == 0:
+            return _na_for_min_count(values, axis)
+
+        dtype = values.dtype
+        values, mask = _get_values(
+            values, skipna, fill_value_typ=fill_value_typ, mask=mask
+        )
+        result = getattr(values, meth)(axis)
+        result = _maybe_null_out(
+            result, axis, mask, values.shape, datetimelike=dtype.kind in "mM"
+        )
+        return result
+
+    return reduction
+
+
+nanmin = _nanminmax("min", fill_value_typ="+inf")
+nanmax = _nanminmax("max", fill_value_typ="-inf")
+
+
+def nanargmax(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> int | np.ndarray:
+    """
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : int or ndarray[int]
+        The index/indices  of max value in specified axis or -1 in the NA case
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> arr = np.array([1, 2, 3, np.nan, 4])
+    >>> nanops.nanargmax(arr)
+    np.int64(4)
+
+    >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
+    >>> arr[2:, 2] = np.nan
+    >>> arr
+    array([[ 0.,  1.,  2.],
+           [ 3.,  4.,  5.],
+           [ 6.,  7., nan],
+           [ 9., 10., nan]])
+    >>> nanops.nanargmax(arr, axis=1)
+    array([2, 2, 1, 1])
+    """
+    values, mask = _get_values(values, True, fill_value_typ="-inf", mask=mask)
+    result = values.argmax(axis)
+    # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any |
+    # signedinteger[Any]"; expected "ndarray[Any, Any]"
+    result = _maybe_arg_null_out(result, axis, mask, skipna)  # type: ignore[arg-type]
+    return result
+
+
+def nanargmin(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> int | np.ndarray:
+    """
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : int or ndarray[int]
+        The index/indices of min value in specified axis or -1 in the NA case
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> arr = np.array([1, 2, 3, np.nan, 4])
+    >>> nanops.nanargmin(arr)
+    np.int64(0)
+
+    >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
+    >>> arr[2:, 0] = np.nan
+    >>> arr
+    array([[ 0.,  1.,  2.],
+           [ 3.,  4.,  5.],
+           [nan,  7.,  8.],
+           [nan, 10., 11.]])
+    >>> nanops.nanargmin(arr, axis=1)
+    array([0, 0, 1, 1])
+    """
+    values, mask = _get_values(values, True, fill_value_typ="+inf", mask=mask)
+    result = values.argmin(axis)
+    # error: Argument 1 to "_maybe_arg_null_out" has incompatible type "Any |
+    # signedinteger[Any]"; expected "ndarray[Any, Any]"
+    result = _maybe_arg_null_out(result, axis, mask, skipna)  # type: ignore[arg-type]
+    return result
+
+
+@disallow("M8", "m8")
+@maybe_operate_rowwise
+def nanskew(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> float:
+    """
+    Compute the sample skewness.
+
+    The statistic computed here is the adjusted Fisher-Pearson standardized
+    moment coefficient G1. The algorithm computes this coefficient directly
+    from the second and third central moment.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float64
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, np.nan, 1, 2])
+    >>> nanops.nanskew(s.values)
+    np.float64(1.7320508075688787)
+    """
+    mask = _maybe_get_mask(values, skipna, mask)
+    if values.dtype.kind != "f":
+        values = values.astype("f8")
+        count = _get_counts(values.shape, mask, axis)
+    else:
+        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
+
+    if skipna and mask is not None:
+        values = values.copy()
+        np.putmask(values, mask, 0)
+    elif not skipna and mask is not None and mask.any():
+        return np.nan
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        mean = values.sum(axis, dtype=np.float64) / count
+    if axis is not None:
+        mean = np.expand_dims(mean, axis)
+
+    adjusted = values - mean
+    if skipna and mask is not None:
+        np.putmask(adjusted, mask, 0)
+    adjusted2 = adjusted**2
+    adjusted3 = adjusted2 * adjusted
+    m2 = adjusted2.sum(axis, dtype=np.float64)
+    m3 = adjusted3.sum(axis, dtype=np.float64)
+
+    # floating point error. See comment in [nankurt]
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance3 = ((eps * max_abs) ** 3) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m3 = _zero_out_fperr(m3, constant_tolerance3)
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
+
+    dtype = values.dtype
+    if dtype.kind == "f":
+        result = result.astype(dtype, copy=False)
+
+    if isinstance(result, np.ndarray):
+        result = np.where(m2 == 0, 0, result)
+        result[count < 3] = np.nan
+    else:
+        result = dtype.type(0) if m2 == 0 else result
+        if count < 3:
+            return np.nan
+
+    return result
+
+
+@disallow("M8", "m8")
+@maybe_operate_rowwise
+def nankurt(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> float:
+    """
+    Compute the sample excess kurtosis
+
+    The statistic computed here is the adjusted Fisher-Pearson standardized
+    moment coefficient G2, computed directly from the second and fourth
+    central moment.
+
+    Parameters
+    ----------
+    values : ndarray
+    axis : int, optional
+    skipna : bool, default True
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    result : float64
+        Unless input is a float array, in which case use the same
+        precision as the input array.
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, np.nan, 1, 3, 2])
+    >>> nanops.nankurt(s.values)
+    np.float64(-1.2892561983471076)
+    """
+    mask = _maybe_get_mask(values, skipna, mask)
+    if values.dtype.kind != "f":
+        values = values.astype("f8")
+        count = _get_counts(values.shape, mask, axis)
+    else:
+        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
+
+    if skipna and mask is not None:
+        values = values.copy()
+        np.putmask(values, mask, 0)
+    elif not skipna and mask is not None and mask.any():
+        return np.nan
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        mean = values.sum(axis, dtype=np.float64) / count
+    if axis is not None:
+        mean = np.expand_dims(mean, axis)
+
+    adjusted = values - mean
+    if skipna and mask is not None:
+        np.putmask(adjusted, mask, 0)
+    adjusted2 = adjusted**2
+    adjusted4 = adjusted2**2
+    m2 = adjusted2.sum(axis, dtype=np.float64)
+    m4 = adjusted4.sum(axis, dtype=np.float64)
+
+    # Several floating point errors may occur during the summation due to rounding.
+    # This computation is similar to the one in Scipy
+    # https://github.com/scipy/scipy/blob/04d6d9c460b1fed83f2919ecec3d743cfa2e8317/scipy/stats/_stats_py.py#L1429
+    # With a few modifications, like using the maximum value instead of the averages
+    # and some adaptations because they use the average and we use the sum for `m2`.
+    # We need to estimate an upper bound to the error to consider the data constant.
+    # Let's call:
+    # x: true value in data
+    # y: floating point representation
+    # e: relative approximation error
+    # n: number of observations in array
+    #
+    # We have that:
+    # |x - y|/|x| <= e (See https://en.wikipedia.org/wiki/Machine_epsilon)
+    # (|x - y|/|x|)² <= e²
+    # Σ (|x - y|/|x|)² <= ne²
+    #
+    # Let's say that the fperr upper bound for m2 is constrained by the summation.
+    # |m2 - y|/|m2| <= ne²
+    # |m2 - y| <= n|m2|e²
+    #
+    # We will use max (x²) to estimate |m2|
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance4 = ((eps * max_abs) ** 4) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m4 = _zero_out_fperr(m4, constant_tolerance4)
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
+        numerator = count * (count + 1) * (count - 1) * m4
+        denominator = (count - 2) * (count - 3) * m2**2
+
+    if not isinstance(denominator, np.ndarray):
+        # if ``denom`` is a scalar, check these corner cases first before
+        # doing division
+        if count < 4:
+            return np.nan
+        if denominator == 0:
+            return values.dtype.type(0)
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        result = numerator / denominator - adj
+
+    dtype = values.dtype
+    if dtype.kind == "f":
+        result = result.astype(dtype, copy=False)
+
+    if isinstance(result, np.ndarray):
+        result = np.where(denominator == 0, 0, result)
+        result[count < 4] = np.nan
+
+    return result
+
+
+@disallow("M8", "m8")
+@maybe_operate_rowwise
+def nanprod(
+    values: np.ndarray,
+    *,
+    axis: AxisInt | None = None,
+    skipna: bool = True,
+    min_count: int = 0,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> float:
+    """
+    Parameters
+    ----------
+    values : ndarray[dtype]
+    axis : int, optional
+    skipna : bool, default True
+    min_count: int, default 0
+    mask : ndarray[bool], optional
+        nan-mask if known
+
+    Returns
+    -------
+    Dtype
+        The product of all elements on a given axis. ( NaNs are treated as 1)
+
+    Examples
+    --------
+    >>> from pandas.core import nanops
+    >>> s = pd.Series([1, 2, 3, np.nan])
+    >>> nanops.nanprod(s.values)
+    np.float64(6.0)
+    """
+    mask = _maybe_get_mask(values, skipna, mask)
+
+    if skipna and mask is not None:
+        values = values.copy()
+        values[mask] = 1
+    result = values.prod(axis)
+    # error: Incompatible return value type (got "Union[ndarray, float]", expected
+    # "float")
+    return _maybe_null_out(  # type: ignore[return-value]
+        result, axis, mask, values.shape, min_count=min_count
+    )
+
+
+def _maybe_arg_null_out(
+    result: np.ndarray,
+    axis: AxisInt | None,
+    mask: npt.NDArray[np.bool_] | None,
+    skipna: bool,
+) -> np.ndarray | int:
+    # helper function for nanargmin/nanargmax
+    if mask is None:
+        return result
+
+    if axis is None or not getattr(result, "ndim", False):
+        if skipna and mask.all():
+            raise ValueError("Encountered all NA values")
+        elif not skipna and mask.any():
+            raise ValueError("Encountered an NA value with skipna=False")
+    elif skipna and mask.all(axis).any():
+        raise ValueError("Encountered all NA values")
+    elif not skipna and mask.any(axis).any():
+        raise ValueError("Encountered an NA value with skipna=False")
+    return result
+
+
+def _get_counts(
+    values_shape: Shape,
+    mask: npt.NDArray[np.bool_] | None,
+    axis: AxisInt | None,
+    dtype: np.dtype[np.floating] = np.dtype(np.float64),
+) -> np.floating | npt.NDArray[np.floating]:
+    """
+    Get the count of non-null values along an axis
+
+    Parameters
+    ----------
+    values_shape : tuple of int
+        shape tuple from values ndarray, used if mask is None
+    mask : Optional[ndarray[bool]]
+        locations in values that should be considered missing
+    axis : Optional[int]
+        axis to count along
+    dtype : type, optional
+        type to use for count
+
+    Returns
+    -------
+    count : scalar or array
+    """
+    if axis is None:
+        if mask is not None:
+            n = mask.size - mask.sum()
+        else:
+            n = np.prod(values_shape)
+        return dtype.type(n)
+
+    if mask is not None:
+        count = mask.shape[axis] - mask.sum(axis)
+    else:
+        count = values_shape[axis]
+
+    if is_integer(count):
+        return dtype.type(count)
+    return count.astype(dtype, copy=False)
+
+
+def _maybe_null_out(
+    result: np.ndarray | float | NaTType,
+    axis: AxisInt | None,
+    mask: npt.NDArray[np.bool_] | None,
+    shape: tuple[int, ...],
+    min_count: int = 1,
+    datetimelike: bool = False,
+) -> np.ndarray | float | NaTType:
+    """
+    Returns
+    -------
+    Dtype
+        The product of all elements on a given axis. ( NaNs are treated as 1)
+    """
+    if mask is None and min_count == 0:
+        # nothing to check; short-circuit
+        return result
+
+    if axis is not None and isinstance(result, np.ndarray):
+        if mask is not None:
+            null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
+        else:
+            # we have no nulls, kept mask=None in _maybe_get_mask
+            below_count = shape[axis] - min_count < 0
+            new_shape = shape[:axis] + shape[axis + 1 :]
+            null_mask = np.broadcast_to(below_count, new_shape)
+
+        if np.any(null_mask):
+            if datetimelike:
+                # GH#60646 For datetimelike, no need to cast to float
+                result[null_mask] = iNaT
+            elif is_numeric_dtype(result):
+                if np.iscomplexobj(result):
+                    result = result.astype("c16")
+                elif not is_float_dtype(result):
+                    result = result.astype("f8", copy=False)
+                result[null_mask] = np.nan
+            else:
+                # GH12941, use None to auto cast null
+                result[null_mask] = None
+    elif result is not NaT:
+        if check_below_min_count(shape, mask, min_count):
+            result_dtype = getattr(result, "dtype", None)
+            if is_float_dtype(result_dtype):
+                # error: Item "None" of "Optional[Any]" has no attribute "type"
+                result = result_dtype.type("nan")  # type: ignore[union-attr]
+            else:
+                result = np.nan
+
+    return result
+
+
+def check_below_min_count(
+    shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int
+) -> bool:
+    """
+    Check for the `min_count` keyword. Returns True if below `min_count` (when
+    missing value should be returned from the reduction).
+
+    Parameters
+    ----------
+    shape : tuple
+        The shape of the values (`values.shape`).
+    mask : ndarray[bool] or None
+        Boolean numpy array (typically of same shape as `shape`) or None.
+    min_count : int
+        Keyword passed through from sum/prod call.
+
+    Returns
+    -------
+    bool
+    """
+    if min_count > 0:
+        if mask is None:
+            # no missing values, only check size
+            non_nulls = np.prod(shape)
+        else:
+            non_nulls = mask.size - mask.sum()
+        if non_nulls < min_count:
+            return True
+    return False
+
+
+def _zero_out_fperr(arg, tol: float | np.ndarray):
+    # #18044 reference this behavior to fix rolling skew/kurt issue
+    if isinstance(arg, np.ndarray):
+        return np.where(np.abs(arg) < tol, 0, arg)
+    else:
+        return arg.dtype.type(0) if np.abs(arg) < tol else arg
+
+
+@disallow("M8", "m8")
+def nancorr(
+    a: np.ndarray,
+    b: np.ndarray,
+    *,
+    method: CorrelationMethod = "pearson",
+    min_periods: int | None = None,
+) -> float:
+    """
+    a, b: ndarrays
+    """
+    if len(a) != len(b):
+        raise AssertionError("Operands to nancorr must have same size")
+
+    if min_periods is None:
+        min_periods = 1
+
+    valid = notna(a) & notna(b)
+    if not valid.all():
+        a = a[valid]
+        b = b[valid]
+
+    if len(a) < min_periods:
+        return np.nan
+
+    a = _ensure_numeric(a)
+    b = _ensure_numeric(b)
+
+    f = get_corr_func(method)
+    return f(a, b)
+
+
+def get_corr_func(
+    method: CorrelationMethod,
+) -> Callable[[np.ndarray, np.ndarray], float]:
+    if method == "kendall":
+        from scipy.stats import kendalltau
+
+        def func(a, b):
+            return kendalltau(a, b)[0]
+
+        return func
+    elif method == "spearman":
+        from scipy.stats import spearmanr
+
+        def func(a, b):
+            return spearmanr(a, b)[0]
+
+        return func
+    elif method == "pearson":
+
+        def func(a, b):
+            return np.corrcoef(a, b)[0, 1]
+
+        return func
+    elif callable(method):
+        return method
+
+    raise ValueError(
+        f"Unknown method '{method}', expected one of "
+        "'kendall', 'spearman', 'pearson', or callable"
+    )
+
+
+@disallow("M8", "m8")
+def nancov(
+    a: np.ndarray,
+    b: np.ndarray,
+    *,
+    min_periods: int | None = None,
+    ddof: int | None = 1,
+) -> float:
+    if len(a) != len(b):
+        raise AssertionError("Operands to nancov must have same size")
+
+    if min_periods is None:
+        min_periods = 1
+
+    valid = notna(a) & notna(b)
+    if not valid.all():
+        a = a[valid]
+        b = b[valid]
+
+    if len(a) < min_periods:
+        return np.nan
+
+    a = _ensure_numeric(a)
+    b = _ensure_numeric(b)
+
+    return np.cov(a, b, ddof=ddof)[0, 1]
+
+
+def _ensure_numeric(x):
+    if isinstance(x, np.ndarray):
+        if x.dtype.kind in "biu":
+            x = x.astype(np.float64)
+        elif x.dtype == object:
+            inferred = lib.infer_dtype(x)
+            if inferred in ["string", "mixed"]:
+                # GH#44008, GH#36703 avoid casting e.g. strings to numeric
+                raise TypeError(f"Could not convert {x} to numeric")
+            try:
+                x = x.astype(np.complex128)
+            except (TypeError, ValueError):
+                try:
+                    x = x.astype(np.float64)
+                except ValueError as err:
+                    # GH#29941 we get here with object arrays containing strs
+                    raise TypeError(f"Could not convert {x} to numeric") from err
+            else:
+                if not np.any(np.imag(x)):
+                    x = x.real
+    elif not (is_float(x) or is_integer(x) or is_complex(x)):
+        if isinstance(x, str):
+            # GH#44008, GH#36703 avoid casting e.g. strings to numeric
+            raise TypeError(f"Could not convert string '{x}' to numeric")
+        try:
+            x = float(x)
+        except (TypeError, ValueError):
+            # e.g. "1+1j" or "foo"
+            try:
+                x = complex(x)
+            except ValueError as err:
+                # e.g. "foo"
+                raise TypeError(f"Could not convert {x} to numeric") from err
+    return x
+
+
+def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
+    """
+    Cumulative function with skipna support.
+
+    Parameters
+    ----------
+    values : np.ndarray or ExtensionArray
+    accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
+    skipna : bool
+
+    Returns
+    -------
+    np.ndarray or ExtensionArray
+    """
+    mask_a, mask_b = {
+        np.cumprod: (1.0, np.nan),
+        np.maximum.accumulate: (-np.inf, np.nan),
+        np.cumsum: (0.0, np.nan),
+        np.minimum.accumulate: (np.inf, np.nan),
+    }[accum_func]
+
+    # This should go through ea interface
+    assert values.dtype.kind not in "mM"
+
+    # We will be applying this function to block values
+    if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
+        vals = values.copy()
+        mask = isna(vals)
+        vals[mask] = mask_a
+        result = accum_func(vals, axis=0)
+        result[mask] = mask_b
+    else:
+        result = accum_func(values, axis=0)
+
+    return result
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb60faf23b55d21a78a0a35b671a48bf28d93b5
--- /dev/null
+++ b/pandas/core/resample.py
@@ -0,0 +1,3150 @@
+from __future__ import annotations
+
+import copy
+from typing import (
+    TYPE_CHECKING,
+    Concatenate,
+    Literal,
+    Self,
+    cast,
+    final,
+    no_type_check,
+    overload,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.tslibs import (
+    BaseOffset,
+    IncompatibleFrequency,
+    NaT,
+    Period,
+    Timedelta,
+    Timestamp,
+    to_offset,
+)
+from pandas._typing import NDFrameT
+from pandas.errors import (
+    AbstractMethodError,
+    Pandas4Warning,
+)
+from pandas.util._decorators import set_module
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+
+import pandas.core.algorithms as algos
+from pandas.core.apply import ResamplerWindowApply
+from pandas.core.arrays import ArrowExtensionArray
+from pandas.core.base import (
+    PandasObject,
+    SelectionMixin,
+)
+from pandas.core.generic import (
+    NDFrame,
+)
+from pandas.core.groupby.groupby import (
+    BaseGroupBy,
+    GroupBy,
+    get_groupby,
+)
+from pandas.core.groupby.grouper import Grouper
+from pandas.core.groupby.ops import BinGrouper
+from pandas.core.indexes.api import MultiIndex
+from pandas.core.indexes.base import Index
+from pandas.core.indexes.datetimes import (
+    DatetimeIndex,
+    date_range,
+)
+from pandas.core.indexes.period import (
+    PeriodIndex,
+    period_range,
+)
+from pandas.core.indexes.timedeltas import (
+    TimedeltaIndex,
+    timedelta_range,
+)
+from pandas.core.reshape.concat import concat
+
+from pandas.tseries.frequencies import (
+    is_subperiod,
+    is_superperiod,
+)
+from pandas.tseries.offsets import (
+    Day,
+    Tick,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+    )
+
+    from pandas._typing import (
+        Any,
+        AnyArrayLike,
+        Axis,
+        FreqIndexT,
+        Frequency,
+        IndexLabel,
+        InterpolateOptions,
+        P,
+        T,
+        TimedeltaConvertibleTypes,
+        TimeGrouperOrigin,
+        TimestampConvertibleTypes,
+        TimeUnit,
+        npt,
+    )
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+    from pandas.core.generic import NDFrame
+
+_shared_docs_kwargs: dict[str, str] = {}
+
+
+@set_module("pandas.api.typing")
+class Resampler(BaseGroupBy, PandasObject):
+    """
+    Class for resampling datetimelike data, a groupby-like operation.
+    See aggregate, transform, and apply functions on this object.
+
+    It's easiest to use obj.resample(...) to use Resampler.
+
+    Parameters
+    ----------
+    obj : Series or DataFrame
+    groupby : TimeGrouper
+
+    Returns
+    -------
+    a Resampler of the appropriate type
+
+    Notes
+    -----
+    After resampling, see aggregate, apply, and transform functions.
+    """
+
+    _grouper: BinGrouper
+    _timegrouper: TimeGrouper
+    binner: DatetimeIndex | TimedeltaIndex | PeriodIndex  # depends on subclass
+    exclusions: frozenset[Hashable] = frozenset()  # for SelectionMixin compat
+    _internal_names_set = set({"obj", "ax", "_indexer"})
+
+    # to the groupby descriptor
+    _attributes = [
+        "freq",
+        "closed",
+        "label",
+        "convention",
+        "origin",
+        "offset",
+    ]
+
+    def __init__(
+        self,
+        obj: NDFrame,
+        timegrouper: TimeGrouper,
+        *,
+        gpr_index: Index,
+        group_keys: bool = False,
+        selection=None,
+        include_groups: bool = False,
+    ) -> None:
+        if include_groups:
+            raise ValueError("include_groups=True is no longer allowed.")
+        self._timegrouper = timegrouper
+        self.keys = None
+        self.sort = True
+        self.group_keys = group_keys
+        self.as_index = True
+
+        self.obj, self.ax, self._indexer = self._timegrouper._set_grouper(
+            self._convert_obj(obj), sort=True, gpr_index=gpr_index
+        )
+        self.binner, self._grouper = self._get_binner()
+        self._selection = selection
+        if self._timegrouper.key is not None:
+            self.exclusions = frozenset([self._timegrouper.key])
+        else:
+            self.exclusions = frozenset()
+
+    @final
+    def __str__(self) -> str:
+        """
+        Provide a nice str repr of our rolling object.
+        """
+        attrs = (
+            f"{k}={getattr(self._timegrouper, k)}"
+            for k in self._attributes
+            if getattr(self._timegrouper, k, None) is not None
+        )
+        return f"{type(self).__name__} [{', '.join(attrs)}]"
+
+    @final
+    def __getattr__(self, attr: str):
+        if attr in self._internal_names_set:
+            return object.__getattribute__(self, attr)
+        if attr in self._attributes:
+            return getattr(self._timegrouper, attr)
+        if attr in self.obj:
+            return self[attr]
+
+        return object.__getattribute__(self, attr)
+
+    @final
+    @property
+    def _from_selection(self) -> bool:
+        """
+        Is the resampling from a DataFrame column or MultiIndex level.
+        """
+        # upsampling and PeriodIndex resampling do not work
+        # with selection, this state used to catch and raise an error
+        return self._timegrouper is not None and (
+            self._timegrouper.key is not None or self._timegrouper.level is not None
+        )
+
+    def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
+        """
+        Provide any conversions for the object in order to correctly handle.
+
+        Parameters
+        ----------
+        obj : Series or DataFrame
+
+        Returns
+        -------
+        Series or DataFrame
+        """
+        return obj._consolidate()
+
+    def _get_binner_for_time(self):
+        raise AbstractMethodError(self)
+
+    @final
+    def _get_binner(self):
+        """
+        Create the BinGrouper, assume that self.set_grouper(obj)
+        has already been called.
+        """
+        binner, bins, binlabels = self._get_binner_for_time()
+        assert len(bins) == len(binlabels)
+        if self._timegrouper._arrow_dtype is not None:
+            binlabels = binlabels.astype(self._timegrouper._arrow_dtype)
+        bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer)
+        return binner, bin_grouper
+
+    @overload
+    def pipe(
+        self,
+        func: Callable[Concatenate[Self, P], T],
+        *args: P.args,
+        **kwargs: P.kwargs,
+    ) -> T: ...
+
+    @overload
+    def pipe(
+        self,
+        func: tuple[Callable[..., T], str],
+        *args: Any,
+        **kwargs: Any,
+    ) -> T: ...
+
+    @final
+    def pipe(
+        self,
+        func: Callable[Concatenate[Self, P], T] | tuple[Callable[..., T], str],
+        *args: Any,
+        **kwargs: Any,
+    ) -> T:
+        """
+        Apply a ``func`` with arguments to this Resampler object and return its result.
+
+        Use `.pipe` when you want to improve readability by chaining together
+        functions that expect Series, DataFrames, GroupBy or Resampler objects.
+        Instead of writing
+
+        >>> h = lambda x, arg2, arg3: x + 1 - arg2 * arg3
+        >>> g = lambda x, arg1: x * 5 / arg1
+        >>> f = lambda x: x**4
+        >>> df = pd.DataFrame([["a", 4], ["b", 5]], columns=["group", "value"])
+        >>> h(g(f(df.groupby("group")), arg1=1), arg2=2, arg3=3)  # doctest: +SKIP
+
+        You can write
+
+        >>> (
+        ...     df.groupby("group").pipe(f).pipe(g, arg1=1).pipe(h, arg2=2, arg3=3)
+        ... )  # doctest: +SKIP
+
+        which is much more readable.
+
+        Parameters
+        ----------
+        func : callable or tuple of (callable, str)
+            Function to apply to this Resampler object or, alternatively,
+            a `(callable, data_keyword)` tuple where `data_keyword` is a
+            string indicating the keyword of `callable` that expects the
+            Resampler object.
+        *args : iterable, optional
+            Positional arguments passed into `func`.
+        **kwargs : dict, optional
+                A dictionary of keyword arguments passed into `func`.
+
+        Returns
+        -------
+        any
+            The result of applying ``func`` to the Resampler object.
+
+        See Also
+        --------
+        Series.pipe : Apply a function with arguments to a series.
+        DataFrame.pipe: Apply a function with arguments to a dataframe.
+        apply : Apply function to each group instead of to the
+            full Resampler object.
+
+        Notes
+        -----
+        See more `here
+        <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"A": [1, 2, 3, 4]}, index=pd.date_range("2012-08-02", periods=4)
+        ... )
+        >>> df
+                    A
+        2012-08-02  1
+        2012-08-03  2
+        2012-08-04  3
+        2012-08-05  4
+
+        To get the difference between each 2-day period's maximum and minimum
+        value in one pass, you can do
+
+        >>> df.resample("2D").pipe(lambda x: x.max() - x.min())
+                    A
+        2012-08-02  1
+        2012-08-04  1
+        """
+        return super().pipe(func, *args, **kwargs)
+
+    @final
+    def aggregate(self, func=None, *args, **kwargs):
+        """
+        Aggregate using one or more operations over the specified axis.
+
+        Parameters
+        ----------
+        func : function, str, list or dict
+            Function to use for aggregating the data. If a function, must either
+            work when passed a DataFrame or when passed to DataFrame.apply.
+
+            Accepted combinations are:
+
+            - function
+            - string function name
+            - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
+            - dict of axis labels -> functions, function names or list of such.
+        *args
+            Positional arguments to pass to `func`.
+        **kwargs
+            Keyword arguments to pass to `func`.
+
+        Returns
+        -------
+        scalar, Series or DataFrame
+
+            The return can be:
+
+            * scalar : when Series.agg is called with single function
+            * Series : when DataFrame.agg is called with a single function
+            * DataFrame : when DataFrame.agg is called with several functions
+
+        See Also
+        --------
+        DataFrame.groupby.aggregate : Aggregate using callable, string, dict,
+            or list of string/callables.
+        DataFrame.resample.transform : Transforms the Series on each group
+            based on the given function.
+        DataFrame.aggregate: Aggregate using one or more
+            operations over the specified axis.
+
+        Notes
+        -----
+        The aggregation operations are always performed over an axis, either the
+        index (default) or the column axis. This behavior is different from
+        `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
+        `var`), where the default is to compute the aggregation of the flattened
+        array, e.g., ``numpy.mean(arr_2d)`` as opposed to
+        ``numpy.mean(arr_2d, axis=0)``.
+
+        `agg` is an alias for `aggregate`. Use the alias.
+
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        A passed user-defined-function will be passed a Series for evaluation.
+
+        If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     [1, 2, 3, 4, 5], index=pd.date_range("20130101", periods=5, freq="s")
+        ... )
+        >>> s
+        2013-01-01 00:00:00    1
+        2013-01-01 00:00:01    2
+        2013-01-01 00:00:02    3
+        2013-01-01 00:00:03    4
+        2013-01-01 00:00:04    5
+        Freq: s, dtype: int64
+
+        >>> r = s.resample("2s")
+
+        >>> r.agg("sum")
+        2013-01-01 00:00:00    3
+        2013-01-01 00:00:02    7
+        2013-01-01 00:00:04    5
+        Freq: 2s, dtype: int64
+
+        >>> r.agg(["sum", "mean", "max"])
+                            sum  mean  max
+        2013-01-01 00:00:00    3   1.5    2
+        2013-01-01 00:00:02    7   3.5    4
+        2013-01-01 00:00:04    5   5.0    5
+
+        >>> r.agg({"result": lambda x: x.mean() / x.std(), "total": "sum"})
+                            result  total
+        2013-01-01 00:00:00  2.121320      3
+        2013-01-01 00:00:02  4.949747      7
+        2013-01-01 00:00:04       NaN      5
+
+        >>> r.agg(average="mean", total="sum")
+                                average  total
+        2013-01-01 00:00:00      1.5      3
+        2013-01-01 00:00:02      3.5      7
+        2013-01-01 00:00:04      5.0      5
+        """
+        result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
+        if result is None:
+            how = func
+            result = self._groupby_and_aggregate(how, *args, **kwargs)
+
+        return result
+
+    agg = aggregate
+    apply = aggregate
+
+    @final
+    def transform(self, arg, *args, **kwargs):
+        """
+        Call function producing a like-indexed Series on each group.
+
+        Return a Series with the transformed values.
+
+        Parameters
+        ----------
+        arg : function
+            To apply to each group. Should return a Series with the same index.
+        *args, **kwargs
+            Additional arguments and keywords.
+
+        Returns
+        -------
+        Series
+            A Series with the transformed values, maintaining the same index as
+            the original object.
+
+        See Also
+        --------
+        core.resample.Resampler.apply : Apply a function along each group.
+        core.resample.Resampler.aggregate : Aggregate using one or more operations
+            over the specified axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2], index=pd.date_range("20180101", periods=2, freq="1h"))
+        >>> s
+        2018-01-01 00:00:00    1
+        2018-01-01 01:00:00    2
+        Freq: h, dtype: int64
+
+        >>> resampled = s.resample("15min")
+        >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
+        2018-01-01 00:00:00   NaN
+        2018-01-01 01:00:00   NaN
+        Freq: h, dtype: float64
+        """
+        return self._selected_obj.groupby(self._timegrouper).transform(
+            arg, *args, **kwargs
+        )
+
+    def _downsample(self, how, **kwargs):
+        raise AbstractMethodError(self)
+
+    def _upsample(self, f, limit: int | None = None, fill_value=None):
+        raise AbstractMethodError(self)
+
+    def _gotitem(self, key, ndim: int, subset=None):
+        """
+        Sub-classes to define. Return a sliced object.
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : {1, 2}
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        grouper = self._grouper
+        if subset is None:
+            subset = self.obj
+            if key is not None:
+                subset = subset[key]
+            else:
+                # reached via Apply.agg_dict_like with selection=None and ndim=1
+                assert subset.ndim == 1
+        if ndim == 1:
+            assert subset.ndim == 1
+
+        grouped = get_groupby(
+            subset, by=None, grouper=grouper, group_keys=self.group_keys
+        )
+        return grouped
+
+    def _groupby_and_aggregate(self, how, *args, **kwargs):
+        """
+        Re-evaluate the obj with a groupby aggregation.
+        """
+        grouper = self._grouper
+
+        # Excludes `on` column when provided
+        obj = self._obj_with_exclusions
+
+        grouped = get_groupby(obj, by=None, grouper=grouper, group_keys=self.group_keys)
+
+        try:
+            if callable(how):
+                # TODO: test_resample_apply_with_additional_args fails if we go
+                #  through the non-lambda path, not clear that it should.
+                func = lambda x: how(x, *args, **kwargs)
+                result = grouped.aggregate(func)
+            else:
+                result = grouped.aggregate(how, *args, **kwargs)
+        except (AttributeError, KeyError):
+            # we have a non-reducing function; try to evaluate
+            # alternatively we want to evaluate only a column of the input
+
+            # test_apply_to_one_column_of_df the function being applied references
+            #  a DataFrame column, but aggregate_item_by_item operates column-wise
+            #  on Series, raising AttributeError or KeyError
+            #  (depending on whether the column lookup uses getattr/__getitem__)
+            result = grouped.apply(how, *args, **kwargs)
+
+        except ValueError as err:
+            if "Must produce aggregated value" in str(err):
+                # raised in _aggregate_named
+                # see test_apply_without_aggregation, test_apply_with_mutated_index
+                pass
+            else:
+                raise
+
+            # we have a non-reducing function
+            # try to evaluate
+            result = grouped.apply(how, *args, **kwargs)
+
+        return self._wrap_result(result)
+
+    @final
+    def _get_resampler_for_grouping(
+        self,
+        groupby: GroupBy,
+        key,
+    ):
+        """
+        Return the correct class for resampling with groupby.
+        """
+        return self._resampler_for_grouping(
+            groupby=groupby,
+            key=key,
+            parent=self,
+        )
+
+    def _wrap_result(self, result):
+        """
+        Potentially wrap any results.
+        """
+        if isinstance(result, ABCSeries) and self._selection is not None:
+            result.name = self._selection
+
+        if isinstance(result, ABCSeries) and result.empty:
+            # When index is all NaT, result is empty but index is not
+            obj = self.obj
+            result.index = _asfreq_compat(obj.index[:0], freq=self.freq)
+            result.name = getattr(obj, "name", None)
+
+        if self._timegrouper._arrow_dtype is not None:
+            result.index = result.index.astype(self._timegrouper._arrow_dtype)
+            result.index.name = self.obj.index.name
+
+        return result
+
+    @final
+    def ffill(self, limit: int | None = None):
+        """
+        Forward fill the values.
+
+        This method fills missing values by propagating the last valid
+        observation forward, up to the next valid observation. It is commonly
+        used in time series analysis when resampling data to a higher frequency
+        (upsampling) and filling gaps in the resampled output.
+
+        Parameters
+        ----------
+        limit : int, optional
+            Limit of how many values to fill.
+
+        Returns
+        -------
+        Series
+            The resampled data with missing values filled forward.
+
+        See Also
+        --------
+        Series.fillna: Fill NA/NaN values using the specified method.
+        DataFrame.fillna: Fill NA/NaN values using the specified method.
+
+        Examples
+        --------
+        Here we only create a ``Series``.
+
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+
+        Example for ``ffill`` with downsampling (we have fewer dates after resampling):
+
+        >>> ser.resample("MS").ffill()
+        2023-01-01    1
+        2023-02-01    3
+        Freq: MS, dtype: int64
+
+        Example for ``ffill`` with upsampling (fill the new dates with
+        the previous value):
+
+        >>> ser.resample("W").ffill()
+        2023-01-01    1
+        2023-01-08    1
+        2023-01-15    2
+        2023-01-22    2
+        2023-01-29    2
+        2023-02-05    3
+        2023-02-12    3
+        2023-02-19    4
+        Freq: W-SUN, dtype: int64
+
+        With upsampling and limiting (only fill the first new date with the
+        previous value):
+
+        >>> ser.resample("W").ffill(limit=1)
+        2023-01-01    1.0
+        2023-01-08    1.0
+        2023-01-15    2.0
+        2023-01-22    2.0
+        2023-01-29    NaN
+        2023-02-05    3.0
+        2023-02-12    NaN
+        2023-02-19    4.0
+        Freq: W-SUN, dtype: float64
+        """
+        return self._upsample("ffill", limit=limit)
+
+    @final
+    def nearest(self, limit: int | None = None):
+        """
+        Resample by using the nearest value.
+
+        When resampling data, missing values may appear (e.g., when the
+        resampling frequency is higher than the original frequency).
+        The `nearest` method will replace ``NaN`` values that appeared in
+        the resampled data with the value from the nearest member of the
+        sequence, based on the index value.
+        Missing values that existed in the original data will not be modified.
+        If `limit` is given, fill only this many values in each direction for
+        each of the original values.
+
+        Parameters
+        ----------
+        limit : int, optional
+            Limit of how many values to fill.
+
+        Returns
+        -------
+        Series or DataFrame
+            An upsampled Series or DataFrame with ``NaN`` values filled with
+            their nearest value.
+
+        See Also
+        --------
+        bfill : Backward fill the new missing values in the resampled data.
+        ffill : Forward fill ``NaN`` values.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2], index=pd.date_range("20180101", periods=2, freq="1h"))
+        >>> s
+        2018-01-01 00:00:00    1
+        2018-01-01 01:00:00    2
+        Freq: h, dtype: int64
+
+        >>> s.resample("15min").nearest()
+        2018-01-01 00:00:00    1
+        2018-01-01 00:15:00    1
+        2018-01-01 00:30:00    2
+        2018-01-01 00:45:00    2
+        2018-01-01 01:00:00    2
+        Freq: 15min, dtype: int64
+
+        Limit the number of upsampled values imputed by the nearest:
+
+        >>> s.resample("15min").nearest(limit=1)
+        2018-01-01 00:00:00    1.0
+        2018-01-01 00:15:00    1.0
+        2018-01-01 00:30:00    NaN
+        2018-01-01 00:45:00    2.0
+        2018-01-01 01:00:00    2.0
+        Freq: 15min, dtype: float64
+        """
+        return self._upsample("nearest", limit=limit)
+
+    @final
+    def bfill(self, limit: int | None = None):
+        """
+        Backward fill the new missing values in the resampled data.
+
+        In statistics, imputation is the process of replacing missing data with
+        substituted values [1]_. When resampling data, missing values may
+        appear (e.g., when the resampling frequency is higher than the original
+        frequency). The backward fill will replace NaN values that appeared in
+        the resampled data with the next value in the original sequence.
+        Missing values that existed in the original data will not be modified.
+
+        Parameters
+        ----------
+        limit : int, optional
+            Limit of how many values to fill.
+
+        Returns
+        -------
+        Series, DataFrame
+            An upsampled Series or DataFrame with backward filled NaN values.
+
+        See Also
+        --------
+        nearest : Fill NaN values with nearest neighbor starting from center.
+        ffill : Forward fill NaN values.
+        Series.fillna : Fill NaN values in the Series using the
+            specified method, which can be 'backfill'.
+        DataFrame.fillna : Fill NaN values in the DataFrame using the
+            specified method, which can be 'backfill'.
+
+        References
+        ----------
+        .. [1] https://en.wikipedia.org/wiki/Imputation_%28statistics%29
+
+        Examples
+        --------
+        Resampling a Series:
+
+        >>> s = pd.Series(
+        ...     [1, 2, 3], index=pd.date_range("20180101", periods=3, freq="h")
+        ... )
+        >>> s
+        2018-01-01 00:00:00    1
+        2018-01-01 01:00:00    2
+        2018-01-01 02:00:00    3
+        Freq: h, dtype: int64
+
+        >>> s.resample("30min").bfill()
+        2018-01-01 00:00:00    1
+        2018-01-01 00:30:00    2
+        2018-01-01 01:00:00    2
+        2018-01-01 01:30:00    3
+        2018-01-01 02:00:00    3
+        Freq: 30min, dtype: int64
+
+        >>> s.resample("15min").bfill(limit=2)
+        2018-01-01 00:00:00    1.0
+        2018-01-01 00:15:00    NaN
+        2018-01-01 00:30:00    2.0
+        2018-01-01 00:45:00    2.0
+        2018-01-01 01:00:00    2.0
+        2018-01-01 01:15:00    NaN
+        2018-01-01 01:30:00    3.0
+        2018-01-01 01:45:00    3.0
+        2018-01-01 02:00:00    3.0
+        Freq: 15min, dtype: float64
+
+        Resampling a DataFrame that has missing values:
+
+        >>> df = pd.DataFrame(
+        ...     {"a": [2, np.nan, 6], "b": [1, 3, 5]},
+        ...     index=pd.date_range("20180101", periods=3, freq="h"),
+        ... )
+        >>> df
+                               a  b
+        2018-01-01 00:00:00  2.0  1
+        2018-01-01 01:00:00  NaN  3
+        2018-01-01 02:00:00  6.0  5
+
+        >>> df.resample("30min").bfill()
+                               a  b
+        2018-01-01 00:00:00  2.0  1
+        2018-01-01 00:30:00  NaN  3
+        2018-01-01 01:00:00  NaN  3
+        2018-01-01 01:30:00  6.0  5
+        2018-01-01 02:00:00  6.0  5
+
+        >>> df.resample("15min").bfill(limit=2)
+                               a    b
+        2018-01-01 00:00:00  2.0  1.0
+        2018-01-01 00:15:00  NaN  NaN
+        2018-01-01 00:30:00  NaN  3.0
+        2018-01-01 00:45:00  NaN  3.0
+        2018-01-01 01:00:00  NaN  3.0
+        2018-01-01 01:15:00  NaN  NaN
+        2018-01-01 01:30:00  6.0  5.0
+        2018-01-01 01:45:00  6.0  5.0
+        2018-01-01 02:00:00  6.0  5.0
+        """
+        return self._upsample("bfill", limit=limit)
+
+    @final
+    def interpolate(
+        self,
+        method: InterpolateOptions = "linear",
+        *,
+        axis: Axis = 0,
+        limit: int | None = None,
+        limit_direction: Literal["forward", "backward", "both"] = "forward",
+        limit_area=None,
+        **kwargs,
+    ):
+        """
+        Interpolate values between target timestamps according to different methods.
+
+        The original index is first reindexed to target timestamps
+        (see :meth:`core.resample.Resampler.asfreq`),
+        then the interpolation of ``NaN`` values via :meth:`DataFrame.interpolate`
+        happens.
+
+        Parameters
+        ----------
+        method : str, default 'linear'
+            Interpolation technique to use. One of:
+
+            * 'linear': Ignore the index and treat the values as equally
+              spaced. This is the only method supported on MultiIndexes.
+            * 'time': Works on daily and higher resolution data to interpolate
+              given length of interval.
+            * 'index', 'values': use the actual numerical values of the index.
+            * 'pad': Fill in NaNs using existing values.
+            * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
+              'barycentric', 'polynomial': Passed to
+              `scipy.interpolate.interp1d`, whereas 'spline' is passed to
+              `scipy.interpolate.UnivariateSpline`. These methods use the numerical
+              values of the index.  Both 'polynomial' and 'spline' require that
+              you also specify an `order` (int), e.g.
+              ``df.interpolate(method='polynomial', order=5)``. Note that,
+              `slinear` method in Pandas refers to the Scipy first order `spline`
+              instead of Pandas first order `spline`.
+            * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
+              'cubicspline': Wrappers around the SciPy interpolation methods of
+              similar names. See `Notes`.
+            * 'from_derivatives': Refers to
+              `scipy.interpolate.BPoly.from_derivatives`.
+
+        axis : {{0 or 'index', 1 or 'columns', None}}, default None
+            Axis to interpolate along. For `Series` this parameter is unused
+            and defaults to 0.
+        limit : int, optional
+            Maximum number of consecutive NaNs to fill. Must be greater than
+            0.
+        limit_direction : {{'forward', 'backward', 'both'}}, Optional
+            Consecutive NaNs will be filled in this direction.
+
+        limit_area : {{`None`, 'inside', 'outside'}}, default None
+            If limit is specified, consecutive NaNs will be filled with this
+            restriction.
+
+            * ``None``: No fill restriction.
+            * 'inside': Only fill NaNs surrounded by valid values
+              (interpolate).
+            * 'outside': Only fill NaNs outside valid values (extrapolate).
+
+        **kwargs : optional
+            Keyword arguments to pass on to the interpolating function.
+
+        Returns
+        -------
+        DataFrame or Series
+            Interpolated values at the specified freq.
+
+        See Also
+        --------
+        core.resample.Resampler.asfreq: Return the values at the new freq,
+            essentially a reindex.
+        DataFrame.interpolate: Fill NaN values using an interpolation method.
+        DataFrame.bfill : Backward fill NaN values in the resampled data.
+        DataFrame.ffill : Forward fill NaN values.
+
+        Notes
+        -----
+        For high-frequent or non-equidistant time-series with timestamps
+        the reindexing followed by interpolation may lead to information loss
+        as shown in the last example.
+
+        Examples
+        --------
+
+        >>> start = "2023-03-01T07:00:00"
+        >>> timesteps = pd.date_range(start, periods=5, freq="s")
+        >>> series = pd.Series(data=[1, -1, 2, 1, 3], index=timesteps)
+        >>> series
+        2023-03-01 07:00:00    1
+        2023-03-01 07:00:01   -1
+        2023-03-01 07:00:02    2
+        2023-03-01 07:00:03    1
+        2023-03-01 07:00:04    3
+        Freq: s, dtype: int64
+
+        Downsample the dataframe to 0.5Hz by providing the period time of 2s.
+
+        >>> series.resample("2s").interpolate("linear")
+        2023-03-01 07:00:00    1
+        2023-03-01 07:00:02    2
+        2023-03-01 07:00:04    3
+        Freq: 2s, dtype: int64
+
+        Upsample the dataframe to 2Hz by providing the period time of 500ms.
+
+        >>> series.resample("500ms").interpolate("linear")
+        2023-03-01 07:00:00.000    1.0
+        2023-03-01 07:00:00.500    0.0
+        2023-03-01 07:00:01.000   -1.0
+        2023-03-01 07:00:01.500    0.5
+        2023-03-01 07:00:02.000    2.0
+        2023-03-01 07:00:02.500    1.5
+        2023-03-01 07:00:03.000    1.0
+        2023-03-01 07:00:03.500    2.0
+        2023-03-01 07:00:04.000    3.0
+        Freq: 500ms, dtype: float64
+
+        Internal reindexing with ``asfreq()`` prior to interpolation leads to
+        an interpolated timeseries on the basis of the reindexed timestamps
+        (anchors). It is assured that all available datapoints from original
+        series become anchors, so it also works for resampling-cases that lead
+        to non-aligned timestamps, as in the following example:
+
+        >>> series.resample("400ms").interpolate("linear")
+        2023-03-01 07:00:00.000    1.000000
+        2023-03-01 07:00:00.400    0.333333
+        2023-03-01 07:00:00.800   -0.333333
+        2023-03-01 07:00:01.200    0.000000
+        2023-03-01 07:00:01.600    1.000000
+        2023-03-01 07:00:02.000    2.000000
+        2023-03-01 07:00:02.400    1.666667
+        2023-03-01 07:00:02.800    1.333333
+        2023-03-01 07:00:03.200    1.666667
+        2023-03-01 07:00:03.600    2.333333
+        2023-03-01 07:00:04.000    3.000000
+        Freq: 400ms, dtype: float64
+
+        Note that the series correctly decreases between two anchors
+        ``07:00:00`` and ``07:00:02``.
+        """
+        if "inplace" in kwargs:
+            # GH#58690
+            warnings.warn(
+                f"The 'inplace' keyword in {type(self).__name__}.interpolate "
+                "is deprecated and will be removed in a future version. "
+                "resample(...).interpolate is never inplace.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+            inplace = kwargs.pop("inplace")
+            if inplace:
+                raise ValueError("Cannot interpolate inplace on a resampled object.")
+
+        result = self._upsample("asfreq")
+
+        # If the original data has timestamps which are not aligned with the
+        # target timestamps, we need to add those points back to the data frame
+        # that is supposed to be interpolated. This does not work with
+        # PeriodIndex, so we skip this case. GH#21351
+        obj = self._selected_obj
+        is_period_index = isinstance(obj.index, PeriodIndex)
+
+        # Skip this step for PeriodIndex
+        if not is_period_index:
+            final_index = result.index
+            if isinstance(final_index, MultiIndex):
+                raise NotImplementedError(
+                    "Direct interpolation of MultiIndex data frames is not "
+                    "supported. If you tried to resample and interpolate on a "
+                    "grouped data frame, please use:\n"
+                    "`df.groupby(...).apply(lambda x: x.resample(...)."
+                    "interpolate(...))`"
+                    "\ninstead, as resampling and interpolation has to be "
+                    "performed for each group independently."
+                )
+
+            missing_data_points_index = obj.index.difference(final_index)
+            if len(missing_data_points_index) > 0:
+                result = concat(
+                    [result, obj.loc[missing_data_points_index]]
+                ).sort_index()
+
+        result_interpolated = result.interpolate(
+            method=method,
+            axis=axis,
+            limit=limit,
+            inplace=False,
+            limit_direction=limit_direction,
+            limit_area=limit_area,
+            **kwargs,
+        )
+
+        # No further steps if the original data has a PeriodIndex
+        if is_period_index:
+            return result_interpolated
+
+        # Make sure that original data points which do not align with the
+        # resampled index are removed
+        result_interpolated = result_interpolated.loc[final_index]
+
+        # Make sure frequency indexes are preserved
+        result_interpolated.index = final_index
+        return result_interpolated
+
+    @final
+    def asfreq(self, fill_value=None):
+        """
+        Return the values at the new freq, essentially a reindex.
+
+        Parameters
+        ----------
+        fill_value : scalar, optional
+            Value to use for missing values, applied during upsampling (note
+            this does not fill NaNs that already were present).
+
+        Returns
+        -------
+        DataFrame or Series
+            Values at the specified freq.
+
+        See Also
+        --------
+        Series.asfreq: Convert TimeSeries to specified frequency.
+        DataFrame.asfreq: Convert TimeSeries to specified frequency.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-31", "2023-02-01", "2023-02-28"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-31    2
+        2023-02-01    3
+        2023-02-28    4
+        dtype: int64
+        >>> ser.resample("MS").asfreq()
+        2023-01-01    1
+        2023-02-01    3
+        Freq: MS, dtype: int64
+        """
+        return self._upsample("asfreq", fill_value=fill_value)
+
+    @final
+    def sum(
+        self,
+        numeric_only: bool = False,
+        min_count: int = 0,
+    ):
+        """
+        Compute sum of group values.
+
+        This method provides a simple way to compute the sum of values within each
+        resampled group, particularly useful for aggregating time-based data into
+        daily, monthly, or yearly sums.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+
+        Returns
+        -------
+        Series or DataFrame
+            Computed sum of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.count : Compute count of group, excluding missing
+            values.
+        DataFrame.resample : Resample time-series data.
+        Series.sum : Return the sum of the values over the requested axis.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> ser.resample("MS").sum()
+        2023-01-01    3
+        2023-02-01    7
+        Freq: MS, dtype: int64
+        """
+        return self._downsample("sum", numeric_only=numeric_only, min_count=min_count)
+
+    @final
+    def prod(
+        self,
+        numeric_only: bool = False,
+        min_count: int = 0,
+    ):
+        """
+        Compute prod of group values.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+
+        Returns
+        -------
+        Series or DataFrame
+            Computed prod of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.sum : Compute sum of groups, excluding missing values.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> ser.resample("MS").prod()
+        2023-01-01    2
+        2023-02-01   12
+        Freq: MS, dtype: int64
+        """
+        return self._downsample("prod", numeric_only=numeric_only, min_count=min_count)
+
+    @final
+    def min(
+        self,
+        numeric_only: bool = False,
+        min_count: int = 0,
+    ):
+        """
+        Compute min value of group.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+
+        Returns
+        -------
+        Series or DataFrame
+            Compute the minimum value in the given Series or DataFrame.
+
+        See Also
+        --------
+        core.resample.Resampler.max : Compute max value of group.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> ser.resample("MS").min()
+        2023-01-01    1
+        2023-02-01    3
+        Freq: MS, dtype: int64
+        """
+        return self._downsample("min", numeric_only=numeric_only, min_count=min_count)
+
+    @final
+    def max(
+        self,
+        numeric_only: bool = False,
+        min_count: int = 0,
+    ):
+        """
+        Compute max value of group.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None``.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+
+        Returns
+        -------
+        Series or DataFrame
+            Computes the maximum value in the given Series or Dataframe.
+
+        See Also
+        --------
+        core.resample.Resampler.min : Compute min value of group.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> ser.resample("MS").max()
+        2023-01-01    2
+        2023-02-01    4
+        Freq: MS, dtype: int64
+        """
+        return self._downsample("max", numeric_only=numeric_only, min_count=min_count)
+
+    @final
+    def first(
+        self,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        skipna: bool = True,
+    ):
+        """
+        Compute the first non-null entry of each column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire group is NA, the result will be NA.
+
+        Returns
+        -------
+        Series or DataFrame
+            First values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.last : Compute the last non-null value in each group.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> s
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> s.resample("MS").first()
+        2023-01-01    1
+        2023-02-01    3
+        Freq: MS, dtype: int64
+        """
+        return self._downsample(
+            "first", numeric_only=numeric_only, min_count=min_count, skipna=skipna
+        )
+
+    @final
+    def last(
+        self,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        skipna: bool = True,
+    ):
+        """
+        Compute the last non-null entry of each column.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer
+            than ``min_count`` non-NA values are present the result will be NA.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire group is NA, the result will be NA.
+
+        Returns
+        -------
+        Series or DataFrame
+            Last of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.first : Compute the first non-null value in each group.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> s
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> s.resample("MS").last()
+        2023-01-01    2
+        2023-02-01    4
+        Freq: MS, dtype: int64
+        """
+        return self._downsample(
+            "last", numeric_only=numeric_only, min_count=min_count, skipna=skipna
+        )
+
+    @final
+    def median(self, numeric_only: bool = False):
+        """
+        Compute median of groups, excluding missing values.
+
+        For multiple groupings, the result index will be a MultiIndex
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only no longer accepts ``None`` and defaults to False.
+
+        Returns
+        -------
+        Series or DataFrame
+            Median of values within each group.
+
+        See Also
+        --------
+        Series.groupby : Apply a function groupby to a Series.
+        DataFrame.groupby : Apply a function groupby to each row or column of a
+            DataFrame.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 3, 4, 5],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").median()
+        2023-01-01    2.0
+        2023-02-01    4.0
+        Freq: MS, dtype: float64
+        """
+        return self._downsample("median", numeric_only=numeric_only)
+
+    @final
+    def mean(
+        self,
+        numeric_only: bool = False,
+    ):
+        """
+        Compute mean of groups, excluding missing values.
+
+        Parameters
+        ----------
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
+        Returns
+        -------
+        DataFrame or Series
+            Mean of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+        core.resample.Resampler.sum : Compute sum of groups, excluding missing values.
+        core.resample.Resampler.std : Compute standard deviation of groups, excluding
+            missing values.
+        core.resample.Resampler.var : Compute variance of groups, excluding missing
+            values.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> ser.resample("MS").mean()
+        2023-01-01    1.5
+        2023-02-01    3.5
+        Freq: MS, dtype: float64
+        """
+        return self._downsample("mean", numeric_only=numeric_only)
+
+    @final
+    def std(
+        self,
+        ddof: int = 1,
+        numeric_only: bool = False,
+    ):
+        """
+        Compute standard deviation of groups, excluding missing values.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Degrees of freedom.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
+        Returns
+        -------
+        DataFrame or Series
+            Standard deviation of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+        core.resample.Resampler.var : Compute variance of groups, excluding missing
+            values.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 3, 2, 4, 3, 8],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").std()
+        2023-01-01    1.000000
+        2023-02-01    2.645751
+        Freq: MS, dtype: float64
+        """
+        return self._downsample("std", ddof=ddof, numeric_only=numeric_only)
+
+    @final
+    def var(
+        self,
+        ddof: int = 1,
+        numeric_only: bool = False,
+    ):
+        """
+        Compute variance of groups, excluding missing values.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Degrees of freedom.
+
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
+        Returns
+        -------
+        DataFrame or Series
+            Variance of values within each group.
+
+        See Also
+        --------
+        core.resample.Resampler.std : Compute standard deviation of groups, excluding
+            missing values.
+        core.resample.Resampler.mean : Compute mean of groups, excluding missing values.
+        core.resample.Resampler.median : Compute median of groups, excluding missing
+            values.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 3, 2, 4, 3, 8],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").var()
+        2023-01-01    1.0
+        2023-02-01    7.0
+        Freq: MS, dtype: float64
+
+        >>> ser.resample("MS").var(ddof=0)
+        2023-01-01    0.666667
+        2023-02-01    4.666667
+        Freq: MS, dtype: float64
+        """
+        return self._downsample("var", ddof=ddof, numeric_only=numeric_only)
+
+    @final
+    def sem(
+        self,
+        ddof: int = 1,
+        numeric_only: bool = False,
+    ):
+        """
+        Compute standard error of the mean of groups, excluding missing values.
+
+        For multiple groupings, the result index will be a MultiIndex.
+
+        Parameters
+        ----------
+        ddof : int, default 1
+            Degrees of freedom.
+
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionchanged:: 2.0.0
+
+                numeric_only now defaults to ``False``.
+
+        Returns
+        -------
+        Series or DataFrame
+            Standard error of the mean of values within each group.
+
+        See Also
+        --------
+        DataFrame.sem : Return unbiased standard error of the mean over requested axis.
+        Series.sem : Return unbiased standard error of the mean over requested axis.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 3, 2, 4, 3, 8],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").sem()
+        2023-01-01    0.577350
+        2023-02-01    1.527525
+        Freq: MS, dtype: float64
+        """
+        return self._downsample("sem", ddof=ddof, numeric_only=numeric_only)
+
+    @final
+    def ohlc(self):
+        """
+        Compute open, high, low and close values of a group, excluding missing values.
+
+        Returns
+        -------
+        DataFrame
+            Open, high, low and close values within each group.
+
+        See Also
+        --------
+        DataFrame.agg : Aggregate using one or more operations over the specified axis.
+        DataFrame.resample : Resample time-series data.
+        DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 3, 2, 4, 3, 5],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").ohlc()
+                    open  high  low  close
+        2023-01-01     1     3    1      2
+        2023-02-01     4     5    3      5
+        """
+        ax = self.ax
+        obj = self._obj_with_exclusions
+        if len(ax) == 0:
+            # GH#42902
+            obj = obj.copy()
+            obj.index = _asfreq_compat(obj.index, self.freq)
+            if obj.ndim == 1:
+                obj = obj.to_frame()
+                obj = obj.reindex(["open", "high", "low", "close"], axis=1)
+            else:
+                mi = MultiIndex.from_product(
+                    [obj.columns, ["open", "high", "low", "close"]]
+                )
+                obj = obj.reindex(mi, axis=1)
+            return obj
+
+        return self._downsample("ohlc")
+
+    @final
+    def nunique(self):
+        """
+        Return number of unique elements in the group.
+
+        Returns
+        -------
+        Series
+            Number of unique values within each group.
+
+        See Also
+        --------
+        core.groupby.SeriesGroupBy.nunique : Method nunique for SeriesGroupBy.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 3],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    3
+        dtype: int64
+        >>> ser.resample("MS").nunique()
+        2023-01-01    2
+        2023-02-01    1
+        Freq: MS, dtype: int64
+        """
+        return self._downsample("nunique")
+
+    @final
+    def size(self):
+        """
+        Compute group sizes.
+
+        Returns
+        -------
+        Series
+            Number of rows in each group.
+
+        See Also
+        --------
+        Series.groupby : Apply a function groupby to a Series.
+        DataFrame.groupby : Apply a function groupby to each row
+            or column of a DataFrame.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3],
+        ...     index=pd.DatetimeIndex(["2023-01-01", "2023-01-15", "2023-02-01"]),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        dtype: int64
+        >>> ser.resample("MS").size()
+        2023-01-01    2
+        2023-02-01    1
+        Freq: MS, dtype: int64
+        """
+        result = self._downsample("size")
+
+        # If the result is a non-empty DataFrame we stack to get a Series
+        # GH 46826
+        if isinstance(result, ABCDataFrame) and not result.empty:
+            result = result.stack()
+
+        if not len(self.ax):
+            from pandas import Series
+
+            if self._selected_obj.ndim == 1:
+                name = self._selected_obj.name
+            else:
+                name = None
+            result = Series([], index=result.index, dtype="int64", name=name)
+        return result
+
+    @final
+    def count(self):
+        """
+        Compute count of group, excluding missing values.
+
+        Returns
+        -------
+        Series or DataFrame
+            Count of values within each group.
+
+        See Also
+        --------
+        Series.groupby : Apply a function groupby to a Series.
+        DataFrame.groupby : Apply a function groupby to each row
+            or column of a DataFrame.
+
+        Examples
+        --------
+        >>> ser = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.DatetimeIndex(
+        ...         ["2023-01-01", "2023-01-15", "2023-02-01", "2023-02-15"]
+        ...     ),
+        ... )
+        >>> ser
+        2023-01-01    1
+        2023-01-15    2
+        2023-02-01    3
+        2023-02-15    4
+        dtype: int64
+        >>> ser.resample("MS").count()
+        2023-01-01    2
+        2023-02-01    2
+        Freq: MS, dtype: int64
+        """
+        result = self._downsample("count")
+        if not len(self.ax):
+            if self._selected_obj.ndim == 1:
+                result = type(self._selected_obj)(
+                    [], index=result.index, dtype="int64", name=self._selected_obj.name
+                )
+            else:
+                from pandas import DataFrame
+
+                result = DataFrame(
+                    [], index=result.index, columns=result.columns, dtype="int64"
+                )
+
+        return result
+
+    @final
+    def quantile(self, q: float | list[float] | AnyArrayLike = 0.5, **kwargs):
+        """
+        Return value at the given quantile.
+
+        Computes the quantile of values within each resampled group.
+
+        Parameters
+        ----------
+        q : float or array-like, default 0.5 (50% quantile)
+            Value between 0 <= q <= 1, the quantile(s) to compute.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        DataFrame or Series
+            Quantile of values within each group.
+
+        See Also
+        --------
+        Series.quantile
+            Return a series, where the index is q and the values are the quantiles.
+        DataFrame.quantile
+            Return a DataFrame, where the columns are the columns of self,
+            and the values are the quantiles.
+        DataFrameGroupBy.quantile
+            Return a DataFrame, where the columns are groupby columns,
+            and the values are its quantiles.
+
+        Examples
+        --------
+
+        >>> ser = pd.Series(
+        ...     [1, 3, 2, 4, 3, 8],
+        ...     index=pd.DatetimeIndex(
+        ...         [
+        ...             "2023-01-01",
+        ...             "2023-01-10",
+        ...             "2023-01-15",
+        ...             "2023-02-01",
+        ...             "2023-02-10",
+        ...             "2023-02-15",
+        ...         ]
+        ...     ),
+        ... )
+        >>> ser.resample("MS").quantile()
+        2023-01-01    2.0
+        2023-02-01    4.0
+        Freq: MS, dtype: float64
+
+        >>> ser.resample("MS").quantile(0.25)
+        2023-01-01    1.5
+        2023-02-01    3.5
+        Freq: MS, dtype: float64
+        """
+        return self._downsample("quantile", q=q, **kwargs)
+
+
+class _GroupByMixin(PandasObject, SelectionMixin):
+    """
+    Provide the groupby facilities.
+    """
+
+    _attributes: list[str]  # in practice the same as Resampler._attributes
+    _selection: IndexLabel | None = None
+    _groupby: GroupBy
+    _timegrouper: TimeGrouper
+
+    def __init__(
+        self,
+        *,
+        parent: Resampler,
+        groupby: GroupBy,
+        key=None,
+        selection: IndexLabel | None = None,
+    ) -> None:
+        # reached via ._gotitem and _get_resampler_for_grouping
+
+        assert isinstance(groupby, GroupBy), type(groupby)
+
+        # parent is always a Resampler, sometimes a _GroupByMixin
+        assert isinstance(parent, Resampler), type(parent)
+
+        # initialize our GroupByMixin object with
+        # the resampler attributes
+        for attr in self._attributes:
+            setattr(self, attr, getattr(parent, attr))
+        self._selection = selection
+
+        self.binner = parent.binner
+        self.key = key
+
+        self._groupby = groupby
+        self._timegrouper = copy.copy(parent._timegrouper)
+
+        self.ax = parent.ax
+        self.obj = parent.obj
+
+    @no_type_check
+    def _apply(self, f, *args, **kwargs):
+        """
+        Dispatch to _upsample; we are stripping all of the _upsample kwargs and
+        performing the original function call on the grouped object.
+        """
+
+        def func(x):
+            x = self._resampler_cls(x, timegrouper=self._timegrouper, gpr_index=self.ax)
+
+            if isinstance(f, str):
+                return getattr(x, f)(**kwargs)
+
+            return x.apply(f, *args, **kwargs)
+
+        result = self._groupby.apply(func)
+
+        # GH 47705
+        if (
+            isinstance(result, ABCDataFrame)
+            and len(result) == 0
+            and not isinstance(result.index, PeriodIndex)
+        ):
+            result = result.set_index(
+                _asfreq_compat(self.obj.index[:0], freq=self.freq), append=True
+            )
+
+        return self._wrap_result(result)
+
+    _upsample = _apply
+    _downsample = _apply
+    _groupby_and_aggregate = _apply
+
+    @final
+    def _gotitem(self, key, ndim, subset=None):
+        """
+        Sub-classes to define. Return a sliced object.
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : {1, 2}
+            requested ndim of result
+        subset : object, default None
+            subset to act on
+        """
+        # create a new object to prevent aliasing
+        if subset is None:
+            subset = self.obj
+            if key is not None:
+                subset = subset[key]
+            else:
+                # reached via Apply.agg_dict_like with selection=None, ndim=1
+                assert subset.ndim == 1
+
+        # Try to select from a DataFrame, falling back to a Series
+        try:
+            if isinstance(key, list) and self.key not in key and self.key is not None:
+                key.append(self.key)
+            groupby = self._groupby[key]
+        except IndexError:
+            groupby = self._groupby
+
+        selection = self._infer_selection(key, subset)
+
+        new_rs = type(self)(
+            groupby=groupby,
+            parent=cast(Resampler, self),
+            selection=selection,
+        )
+        return new_rs
+
+
+class DatetimeIndexResampler(Resampler):
+    ax: DatetimeIndex
+
+    @property
+    def _resampler_for_grouping(self) -> type[DatetimeIndexResamplerGroupby]:
+        return DatetimeIndexResamplerGroupby
+
+    def _get_binner_for_time(self):
+        # this is how we are actually creating the bins
+        return self._timegrouper._get_time_bins(self.ax)
+
+    def _downsample(self, how, **kwargs):
+        """
+        Downsample the cython defined function.
+
+        Parameters
+        ----------
+        how : string / cython mapped function
+        **kwargs : kw args passed to how function
+        """
+        ax = self.ax
+
+        # Excludes `on` column when provided
+        obj = self._obj_with_exclusions
+
+        if not len(ax):
+            # reset to the new freq
+            obj = obj.copy()
+            obj.index = obj.index._with_freq(self.freq)
+            assert obj.index.freq == self.freq, (obj.index.freq, self.freq)
+            return obj
+
+        # we are downsampling
+        # we want to call the actual grouper method here
+        result = obj.groupby(self._grouper).aggregate(how, **kwargs)
+        return self._wrap_result(result)
+
+    def _adjust_binner_for_upsample(self, binner):
+        """
+        Adjust our binner when upsampling.
+
+        The range of a new index should not be outside specified range
+        """
+        if self.closed == "right":
+            binner = binner[1:]
+        else:
+            binner = binner[:-1]
+        return binner
+
+    def _upsample(self, method, limit: int | None = None, fill_value=None):
+        """
+        Parameters
+        ----------
+        method : string {'backfill', 'bfill', 'pad',
+            'ffill', 'asfreq'} method for upsampling
+        limit : int, default None
+            Maximum size gap to fill when reindexing
+        fill_value : scalar, default None
+            Value to use for missing values
+        """
+        if self._from_selection:
+            raise ValueError(
+                "Upsampling from level= or on= selection "
+                "is not supported, use .set_index(...) "
+                "to explicitly set index to datetime-like"
+            )
+
+        ax = self.ax
+        obj = self._selected_obj
+        binner = self.binner
+        res_index = self._adjust_binner_for_upsample(binner)
+
+        # if index exactly matches target grid (same freq & alignment), use fast path
+        if (
+            limit is None
+            and to_offset(ax.inferred_freq) == self.freq
+            and len(obj) == len(res_index)
+            and obj.index.equals(res_index)
+        ):
+            result = obj.copy()
+            result.index = res_index
+        else:
+            if method == "asfreq":
+                method = None
+            result = obj.reindex(
+                res_index, method=method, limit=limit, fill_value=fill_value
+            )
+
+        return self._wrap_result(result)
+
+    def _wrap_result(self, result):
+        result = super()._wrap_result(result)
+
+        # we may have a different kind that we were asked originally
+        # convert if needed
+        if isinstance(self.ax, PeriodIndex) and not isinstance(
+            result.index, PeriodIndex
+        ):
+            if isinstance(result.index, MultiIndex):
+                # GH 24103 - e.g. groupby resample
+                if not isinstance(result.index.levels[-1], PeriodIndex):
+                    new_level = result.index.levels[-1].to_period(self.freq)
+                    result.index = result.index.set_levels(new_level, level=-1)
+            else:
+                result.index = result.index.to_period(self.freq)
+        return result
+
+
+@set_module("pandas.api.typing")
+# error: Definition of "ax" in base class "_GroupByMixin" is incompatible
+# with definition in base class "DatetimeIndexResampler"
+class DatetimeIndexResamplerGroupby(  # type: ignore[misc]
+    _GroupByMixin, DatetimeIndexResampler
+):
+    """
+    Provides a resample of a groupby implementation
+    """
+
+    @property
+    def _resampler_cls(self):
+        return DatetimeIndexResampler
+
+
+class PeriodIndexResampler(DatetimeIndexResampler):
+    # error: Incompatible types in assignment (expression has type "PeriodIndex", base
+    # class "DatetimeIndexResampler" defined the type as "DatetimeIndex")
+    ax: PeriodIndex  # type: ignore[assignment]
+
+    @property
+    def _resampler_for_grouping(self):
+        return PeriodIndexResamplerGroupby
+
+    def _get_binner_for_time(self):
+        return self._timegrouper._get_period_bins(self.ax)
+
+    def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
+        obj = super()._convert_obj(obj)
+
+        if self._from_selection:
+            # see GH 14008, GH 12871
+            msg = (
+                "Resampling from level= or on= selection "
+                "with a PeriodIndex is not currently supported, "
+                "use .set_index(...) to explicitly set index"
+            )
+            raise NotImplementedError(msg)
+
+        return obj
+
+    def _downsample(self, how, **kwargs):
+        """
+        Downsample the cython defined function.
+
+        Parameters
+        ----------
+        how : string / cython mapped function
+        **kwargs : kw args passed to how function
+        """
+        ax = self.ax
+
+        if is_subperiod(ax.freq, self.freq):
+            # Downsampling
+            return self._groupby_and_aggregate(how, **kwargs)
+        elif is_superperiod(ax.freq, self.freq):
+            if how == "ohlc":
+                # GH #13083
+                # upsampling to subperiods is handled as an asfreq, which works
+                # for pure aggregating/reducing methods
+                # OHLC reduces along the time dimension, but creates multiple
+                # values for each period -> handle by _groupby_and_aggregate()
+                return self._groupby_and_aggregate(how)
+            return self.asfreq()
+        elif ax.freq == self.freq:
+            return self.asfreq()
+
+        raise IncompatibleFrequency(
+            f"Frequency {ax.freq} cannot be resampled to {self.freq}, "
+            "as they are not sub or super periods"
+        )
+
+    def _upsample(self, method, limit: int | None = None, fill_value=None):
+        """
+        Parameters
+        ----------
+        method : {'backfill', 'bfill', 'pad', 'ffill'}
+            Method for upsampling.
+        limit : int, default None
+            Maximum size gap to fill when reindexing.
+        fill_value : scalar, default None
+            Value to use for missing values.
+        """
+        ax = self.ax
+        obj = self.obj
+        new_index = self.binner
+
+        # Start vs. end of period
+        memb = ax.asfreq(self.freq, how=self.convention)
+
+        # Get the fill indexer
+        if method == "asfreq":
+            method = None
+        indexer = memb.get_indexer(new_index, method=method, limit=limit)
+        new_obj = _take_new_index(
+            obj,
+            indexer,
+            new_index,
+        )
+        return self._wrap_result(new_obj)
+
+
+@set_module("pandas.api.typing")
+# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with
+# definition in base class "PeriodIndexResampler"
+class PeriodIndexResamplerGroupby(  # type: ignore[misc]
+    _GroupByMixin, PeriodIndexResampler
+):
+    """
+    Provides a resample of a groupby implementation.
+    """
+
+    @property
+    def _resampler_cls(self):
+        return PeriodIndexResampler
+
+
+class TimedeltaIndexResampler(DatetimeIndexResampler):
+    # error: Incompatible types in assignment (expression has type "TimedeltaIndex",
+    # base class "DatetimeIndexResampler" defined the type as "DatetimeIndex")
+    ax: TimedeltaIndex  # type: ignore[assignment]
+
+    @property
+    def _resampler_for_grouping(self):
+        return TimedeltaIndexResamplerGroupby
+
+    def _get_binner_for_time(self):
+        return self._timegrouper._get_time_delta_bins(self.ax)
+
+    def _adjust_binner_for_upsample(self, binner):
+        """
+        Adjust our binner when upsampling.
+
+        The range of a new index is allowed to be greater than original range
+        so we don't need to change the length of a binner, GH 13022
+        """
+        return binner
+
+
+@set_module("pandas.api.typing")
+# error: Definition of "ax" in base class "_GroupByMixin" is incompatible with
+# definition in base class "DatetimeIndexResampler"
+class TimedeltaIndexResamplerGroupby(  # type: ignore[misc]
+    _GroupByMixin, TimedeltaIndexResampler
+):
+    """
+    Provides a resample of a groupby implementation.
+    """
+
+    @property
+    def _resampler_cls(self):
+        return TimedeltaIndexResampler
+
+
+def get_resampler(obj: Series | DataFrame, **kwds) -> Resampler:
+    """
+    Create a TimeGrouper and return our resampler.
+    """
+    tg = TimeGrouper(obj, **kwds)  # type: ignore[arg-type]
+    return tg._get_resampler(obj)
+
+
+get_resampler.__doc__ = Resampler.__doc__
+
+
+def get_resampler_for_grouping(
+    groupby: GroupBy,
+    rule,
+    how=None,
+    fill_method=None,
+    limit: int | None = None,
+    on=None,
+    **kwargs,
+) -> Resampler:
+    """
+    Return our appropriate resampler when grouping as well.
+    """
+    # .resample uses 'on' similar to how .groupby uses 'key'
+    tg = TimeGrouper(freq=rule, key=on, **kwargs)
+    resampler = tg._get_resampler(groupby.obj)
+    return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key)
+
+
+@set_module("pandas.api.typing")
+class TimeGrouper(Grouper):
+    """
+    Custom groupby class for time-interval grouping.
+
+    Parameters
+    ----------
+    freq : pandas date offset or offset alias for identifying bin edges
+    closed : closed end of interval; 'left' or 'right'
+    label : interval boundary to use for labeling; 'left' or 'right'
+    convention : {'start', 'end', 'e', 's'}
+        If axis is PeriodIndex
+    """
+
+    _attributes = (
+        *Grouper._attributes,
+        "closed",
+        "label",
+        "how",
+        "convention",
+        "origin",
+        "offset",
+    )
+
+    origin: TimeGrouperOrigin
+
+    def __init__(
+        self,
+        obj: Grouper | None = None,
+        freq: Frequency = "Min",
+        key: str | None = None,
+        closed: Literal["left", "right"] | None = None,
+        label: Literal["left", "right"] | None = None,
+        how: str = "mean",
+        fill_method=None,
+        limit: int | None = None,
+        convention: Literal["start", "end", "e", "s"] | None = None,
+        origin: (
+            Literal["epoch", "start", "start_day", "end", "end_day"]
+            | TimestampConvertibleTypes
+        ) = "start_day",
+        offset: TimedeltaConvertibleTypes | None = None,
+        group_keys: bool = False,
+        **kwargs,
+    ) -> None:
+        # Check for correctness of the keyword arguments which would
+        # otherwise silently use the default if misspelled
+        if label not in {None, "left", "right"}:
+            raise ValueError(f"Unsupported value {label} for `label`")
+        if closed not in {None, "left", "right"}:
+            raise ValueError(f"Unsupported value {closed} for `closed`")
+        if convention not in {None, "start", "end", "e", "s"}:
+            raise ValueError(f"Unsupported value {convention} for `convention`")
+
+        if (key is None and obj is not None and isinstance(obj.index, PeriodIndex)) or (  # type: ignore[attr-defined]
+            key is not None
+            and obj is not None
+            and getattr(obj[key], "dtype", None) == "period"  # type: ignore[index]
+        ):
+            freq = to_offset(freq, is_period=True)
+        else:
+            freq = to_offset(freq)
+
+        if not isinstance(freq, Tick):
+            if offset is not None:
+                warnings.warn(
+                    "The 'offset' keyword does not take effect when resampling "
+                    "with a 'freq' that is not Tick-like (h, m, s, ms, us, ns)",
+                    RuntimeWarning,
+                    stacklevel=find_stack_level(),
+                )
+            if origin != "start_day":
+                warnings.warn(
+                    "The 'origin' keyword does not take effect when resampling "
+                    "with a 'freq' that is not Tick-like (h, m, s, ms, us, ns)",
+                    RuntimeWarning,
+                    stacklevel=find_stack_level(),
+                )
+
+        end_types = {"ME", "YE", "QE", "BME", "BYE", "BQE", "W"}
+        rule = freq.rule_code
+        if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
+            if closed is None:
+                closed = "right"
+            if label is None:
+                label = "right"
+        # The backward resample sets ``closed`` to ``'right'`` by default
+        # since the last value should be considered as the edge point for
+        # the last bin. When origin in "end" or "end_day", the value for a
+        # specific ``Timestamp`` index stands for the resample result from
+        # the current ``Timestamp`` minus ``freq`` to the current
+        # ``Timestamp`` with a right close.
+        elif origin in ["end", "end_day"]:
+            if closed is None:
+                closed = "right"
+            if label is None:
+                label = "right"
+        else:
+            if closed is None:
+                closed = "left"
+            if label is None:
+                label = "left"
+
+        self.closed = closed
+        self.label = label
+        self.convention = convention if convention is not None else "e"
+        self.how = how
+        self.fill_method = fill_method
+        self.limit = limit
+        self.group_keys = group_keys
+        self._arrow_dtype: ArrowDtype | None = None
+
+        if origin in ("epoch", "start", "start_day", "end", "end_day"):
+            # error: Incompatible types in assignment (expression has type "Union[Union[
+            # Timestamp, datetime, datetime64, signedinteger[_64Bit], float, str],
+            # Literal['epoch', 'start', 'start_day', 'end', 'end_day']]", variable has
+            # type "Union[Timestamp, Literal['epoch', 'start', 'start_day', 'end',
+            # 'end_day']]")
+            self.origin = origin  # type: ignore[assignment]
+        else:
+            try:
+                self.origin = Timestamp(origin)
+            except (ValueError, TypeError) as err:
+                raise ValueError(
+                    "'origin' should be equal to 'epoch', 'start', 'start_day', "
+                    "'end', 'end_day' or "
+                    f"should be a Timestamp convertible type. Got '{origin}' instead."
+                ) from err
+
+        try:
+            self.offset = Timedelta(offset) if offset is not None else None
+        except (ValueError, TypeError) as err:
+            raise ValueError(
+                "'offset' should be a Timedelta convertible type. "
+                f"Got '{offset}' instead."
+            ) from err
+
+        # always sort time groupers
+        kwargs["sort"] = True
+
+        super().__init__(freq=freq, key=key, **kwargs)
+
+    def _get_resampler(self, obj: NDFrame) -> Resampler:
+        """
+        Return my resampler or raise if we have an invalid axis.
+
+        Parameters
+        ----------
+        obj : Series or DataFrame
+
+        Returns
+        -------
+        Resampler
+
+        Raises
+        ------
+        TypeError if incompatible axis
+
+        """
+        _, ax, _ = self._set_grouper(obj, gpr_index=None)
+        if isinstance(ax, DatetimeIndex):
+            return DatetimeIndexResampler(
+                obj,
+                timegrouper=self,
+                group_keys=self.group_keys,
+                gpr_index=ax,
+            )
+        elif isinstance(ax, PeriodIndex):
+            return PeriodIndexResampler(
+                obj,
+                timegrouper=self,
+                group_keys=self.group_keys,
+                gpr_index=ax,
+            )
+        elif isinstance(ax, TimedeltaIndex):
+            return TimedeltaIndexResampler(
+                obj,
+                timegrouper=self,
+                group_keys=self.group_keys,
+                gpr_index=ax,
+            )
+
+        raise TypeError(
+            "Only valid with DatetimeIndex, "
+            "TimedeltaIndex or PeriodIndex, "
+            f"but got an instance of '{type(ax).__name__}'"
+        )
+
+    def _get_grouper(
+        self, obj: NDFrameT, validate: bool = True, observed: bool = True
+    ) -> tuple[BinGrouper, NDFrameT]:
+        """
+        Parameters
+        ----------
+        obj : Series or DataFrame
+            Object being grouped.
+        validate : bool, default True
+            Unused. Only for compatibility with ``Grouper._get_grouper``.
+        observed : bool, default True
+            Unused. Only for compatibility with ``Grouper._get_grouper``.
+
+        Returns
+        -------
+        A tuple of grouper, obj (possibly sorted)
+        """
+        # create the resampler and return our binner
+        r = self._get_resampler(obj)
+        return r._grouper, cast(NDFrameT, r.obj)
+
+    def _get_time_bins(self, ax: DatetimeIndex):
+        if not isinstance(ax, DatetimeIndex):
+            raise TypeError(
+                "axis must be a DatetimeIndex, but got "
+                f"an instance of {type(ax).__name__}"
+            )
+
+        if len(ax) == 0:
+            binner = labels = DatetimeIndex(
+                data=[], freq=self.freq, name=ax.name, dtype=ax.dtype
+            )
+            return binner, [], labels
+
+        first, last = _get_timestamp_range_edges(
+            ax.min(),
+            ax.max(),
+            self.freq,
+            unit=ax.unit,
+            closed=self.closed,
+            origin=self.origin,
+            offset=self.offset,
+        )
+        # GH #12037
+        # use first/last directly instead of call replace() on them
+        # because replace() will swallow the nanosecond part
+        # thus last bin maybe slightly before the end if the end contains
+        # nanosecond part and lead to `Values falls after last bin` error
+        # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
+        # has noted that ambiguous=True provides the most sensible result
+        binner = labels = date_range(
+            freq=self.freq,
+            start=first,
+            end=last,
+            tz=ax.tz,
+            name=ax.name,
+            ambiguous=True,
+            nonexistent="shift_forward",
+            unit=ax.unit,
+        )
+
+        ax_values = ax.asi8
+        binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
+
+        # general version, knowing nothing about relative frequencies
+        bins = lib.generate_bins_dt64(
+            ax_values, bin_edges, self.closed, hasnans=ax.hasnans
+        )
+
+        if self.closed == "right":
+            labels = binner
+            if self.label == "right":
+                labels = labels[1:]
+        elif self.label == "right":
+            labels = labels[1:]
+
+        if ax.hasnans:
+            binner = binner.insert(0, NaT)
+            labels = labels.insert(0, NaT)
+
+        # if we end up with more labels than bins
+        # adjust the labels
+        # GH4076
+        if len(bins) < len(labels):
+            labels = labels[: len(bins)]
+
+        return binner, bins, labels
+
+    def _adjust_bin_edges(
+        self, binner: DatetimeIndex, ax_values: npt.NDArray[np.int64]
+    ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]:
+        # Some hacks for > daily data, see #1471, #1458, #1483
+
+        if self.freq.name in ("BME", "ME", "W") or self.freq.name.split("-")[0] in (
+            "BQE",
+            "BYE",
+            "QE",
+            "YE",
+            "W",
+        ):
+            # If the right end-point is on the last day of the month, roll forwards
+            # until the last moment of that day. Note that we only do this for offsets
+            # which correspond to the end of a super-daily period - "month start", for
+            # example, is excluded.
+            if self.closed == "right":
+                # GH 21459, GH 9119: Adjust the bins relative to the wall time
+                edges_dti = binner.tz_localize(None)
+                edges_dti = (
+                    edges_dti
+                    + Timedelta(days=1).as_unit(edges_dti.unit)
+                    - Timedelta(1, unit=edges_dti.unit).as_unit(edges_dti.unit)
+                )
+                bin_edges = edges_dti.tz_localize(binner.tz).asi8
+            else:
+                bin_edges = binner.asi8
+
+            # intraday values on last day
+            if bin_edges[-2] > ax_values.max():
+                bin_edges = bin_edges[:-1]
+                binner = binner[:-1]
+        else:
+            bin_edges = binner.asi8
+        return binner, bin_edges
+
+    def _get_time_delta_bins(self, ax: TimedeltaIndex):
+        if not isinstance(ax, TimedeltaIndex):
+            raise TypeError(
+                "axis must be a TimedeltaIndex, but got "
+                f"an instance of {type(ax).__name__}"
+            )
+
+        if not isinstance(self.freq, (Tick, Day)):
+            # GH#51896
+            raise ValueError(
+                "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+                f"e.g. '24h' or '3D', not {self.freq}"
+            )
+
+        if not len(ax):
+            binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name)
+            return binner, [], labels
+
+        start, end = ax.min(), ax.max()
+
+        if self.closed == "right":
+            end += self.freq
+
+        labels = binner = timedelta_range(
+            start=start, end=end, freq=self.freq, name=ax.name
+        )
+
+        end_stamps = labels
+        if self.closed == "left":
+            end_stamps += self.freq
+
+        bins = ax.searchsorted(end_stamps, side=self.closed)
+
+        if self.offset:
+            # GH 10530 & 31809
+            labels += self.offset
+
+        return binner, bins, labels
+
+    def _get_time_period_bins(self, ax: DatetimeIndex):
+        if not isinstance(ax, DatetimeIndex):
+            raise TypeError(
+                "axis must be a DatetimeIndex, but got "
+                f"an instance of {type(ax).__name__}"
+            )
+
+        freq = self.freq
+
+        if len(ax) == 0:
+            binner = labels = PeriodIndex(
+                data=[], freq=freq, name=ax.name, dtype=ax.dtype
+            )
+            return binner, [], labels
+
+        labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name)
+
+        end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp()
+        if ax.tz:
+            end_stamps = end_stamps.tz_localize(ax.tz)
+        bins = ax.searchsorted(end_stamps, side="left")
+
+        return binner, bins, labels
+
+    def _get_period_bins(self, ax: PeriodIndex):
+        if not isinstance(ax, PeriodIndex):
+            raise TypeError(
+                "axis must be a PeriodIndex, but got "
+                f"an instance of {type(ax).__name__}"
+            )
+
+        memb = ax.asfreq(self.freq, how=self.convention)
+
+        # NaT handling as in pandas._lib.lib.generate_bins_dt64()
+        nat_count = 0
+        if memb.hasnans:
+            # error: Incompatible types in assignment (expression has type
+            # "bool_", variable has type "int")  [assignment]
+            nat_count = np.sum(memb._isnan)  # type: ignore[assignment]
+            memb = memb[~memb._isnan]
+
+        if not len(memb):
+            # index contains no valid (non-NaT) values
+            bins = np.array([], dtype=np.int64)
+            binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
+            if len(ax) > 0:
+                # index is all NaT
+                binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax))
+            return binner, bins, labels
+
+        freq_mult = self.freq.n
+
+        start = ax.min().asfreq(self.freq, how=self.convention)
+        end = ax.max().asfreq(self.freq, how="end")
+        bin_shift = 0
+
+        if isinstance(self.freq, Tick):
+            # GH 23882 & 31809: get adjusted bin edge labels with 'origin'
+            # and 'origin' support. This call only makes sense if the freq is a
+            # Tick since offset and origin are only used in those cases.
+            # Not doing this check could create an extra empty bin.
+            p_start, end = _get_period_range_edges(
+                start,
+                end,
+                self.freq,
+                closed=self.closed,
+                origin=self.origin,
+                offset=self.offset,
+            )
+
+            # Get offset for bin edge (not label edge) adjustment
+            start_offset = Period(start, self.freq) - Period(p_start, self.freq)
+            # error: Item "Period" of "Union[Period, Any]" has no attribute "n"
+            bin_shift = start_offset.n % freq_mult  # type: ignore[union-attr]
+            start = p_start
+
+        labels = binner = period_range(
+            start=start, end=end, freq=self.freq, name=ax.name
+        )
+
+        i8 = memb.asi8
+
+        # when upsampling to subperiods, we need to generate enough bins
+        expected_bins_count = len(binner) * freq_mult
+        i8_extend = expected_bins_count - (i8[-1] - i8[0])
+        rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
+        rng += freq_mult
+        # adjust bin edge indexes to account for base
+        rng -= bin_shift
+
+        # Wrap in PeriodArray for PeriodArray.searchsorted
+        prng = type(memb._data)(rng, dtype=memb.dtype)
+        bins = memb.searchsorted(prng, side="left")
+
+        if nat_count > 0:
+            binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count)
+
+        return binner, bins, labels
+
+    def _set_grouper(
+        self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None
+    ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]:
+        obj, ax, indexer = super()._set_grouper(obj, sort, gpr_index=gpr_index)
+        if isinstance(ax.dtype, ArrowDtype) and ax.dtype.kind in "Mm":
+            self._arrow_dtype = ax.dtype
+            ax = Index(
+                cast(ArrowExtensionArray, ax.array)._maybe_convert_datelike_array()
+            )
+        return obj, ax, indexer
+
+
+@overload
+def _take_new_index(
+    obj: DataFrame, indexer: npt.NDArray[np.intp], new_index: Index
+) -> DataFrame: ...
+
+
+@overload
+def _take_new_index(
+    obj: Series, indexer: npt.NDArray[np.intp], new_index: Index
+) -> Series: ...
+
+
+def _take_new_index(
+    obj: DataFrame | Series,
+    indexer: npt.NDArray[np.intp],
+    new_index: Index,
+) -> DataFrame | Series:
+    if isinstance(obj, ABCSeries):
+        new_values = algos.take_nd(obj._values, indexer)
+        return obj._constructor(new_values, index=new_index, name=obj.name)
+    elif isinstance(obj, ABCDataFrame):
+        new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
+        return obj._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
+    else:
+        raise ValueError("'obj' should be either a Series or a DataFrame")
+
+
+def _get_timestamp_range_edges(
+    first: Timestamp,
+    last: Timestamp,
+    freq: BaseOffset,
+    unit: TimeUnit,
+    closed: Literal["right", "left"] = "left",
+    origin: TimeGrouperOrigin = "start_day",
+    offset: Timedelta | None = None,
+) -> tuple[Timestamp, Timestamp]:
+    """
+    Adjust the `first` Timestamp to the preceding Timestamp that resides on
+    the provided offset. Adjust the `last` Timestamp to the following
+    Timestamp that resides on the provided offset. Input Timestamps that
+    already reside on the offset will be adjusted depending on the type of
+    offset and the `closed` parameter.
+
+    Parameters
+    ----------
+    first : pd.Timestamp
+        The beginning Timestamp of the range to be adjusted.
+    last : pd.Timestamp
+        The ending Timestamp of the range to be adjusted.
+    freq : pd.DateOffset
+        The dateoffset to which the Timestamps will be adjusted.
+    closed : {'right', 'left'}, default "left"
+        Which side of bin interval is closed.
+    origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day'
+        The timestamp on which to adjust the grouping. The timezone of origin must
+        match the timezone of the index.
+        If a timestamp is not used, these values are also supported:
+
+        - 'epoch': `origin` is 1970-01-01
+        - 'start': `origin` is the first value of the timeseries
+        - 'start_day': `origin` is the first day at midnight of the timeseries
+    offset : pd.Timedelta, default is None
+        An offset timedelta added to the origin.
+
+    Returns
+    -------
+    A tuple of length 2, containing the adjusted pd.Timestamp objects.
+    """
+    if isinstance(freq, Tick):
+        index_tz = first.tz
+        if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
+            raise ValueError("The origin must have the same timezone as the index.")
+        if origin == "epoch":
+            # set the epoch based on the timezone to have similar bins results when
+            # resampling on the same kind of indexes on different timezones
+            origin = Timestamp("1970-01-01", tz=index_tz)
+
+        first, last = _adjust_dates_anchored(
+            first,
+            last,
+            freq,
+            closed=closed,
+            origin=origin,
+            offset=offset,
+            unit=unit,
+        )
+    else:
+        first = first.normalize()
+        last = last.normalize()
+
+        if closed == "left":
+            first = Timestamp(freq.rollback(first))
+        else:
+            first = Timestamp(first - freq)
+
+        last = Timestamp(last + freq)
+
+    return first, last
+
+
+def _get_period_range_edges(
+    first: Period,
+    last: Period,
+    freq: BaseOffset,
+    closed: Literal["right", "left"] = "left",
+    origin: TimeGrouperOrigin = "start_day",
+    offset: Timedelta | None = None,
+) -> tuple[Period, Period]:
+    """
+    Adjust the provided `first` and `last` Periods to the respective Period of
+    the given offset that encompasses them.
+
+    Parameters
+    ----------
+    first : pd.Period
+        The beginning Period of the range to be adjusted.
+    last : pd.Period
+        The ending Period of the range to be adjusted.
+    freq : pd.DateOffset
+        The freq to which the Periods will be adjusted.
+    closed : {'right', 'left'}, default "left"
+        Which side of bin interval is closed.
+    origin : {'epoch', 'start', 'start_day'}, Timestamp, default 'start_day'
+        The timestamp on which to adjust the grouping. The timezone of origin must
+        match the timezone of the index.
+
+        If a timestamp is not used, these values are also supported:
+
+        - 'epoch': `origin` is 1970-01-01
+        - 'start': `origin` is the first value of the timeseries
+        - 'start_day': `origin` is the first day at midnight of the timeseries
+    offset : pd.Timedelta, default is None
+        An offset timedelta added to the origin.
+
+    Returns
+    -------
+    A tuple of length 2, containing the adjusted pd.Period objects.
+    """
+    if not all(isinstance(obj, Period) for obj in [first, last]):
+        raise TypeError("'first' and 'last' must be instances of type Period")
+
+    # GH 23882
+    first_ts = first.to_timestamp()
+    last_ts = last.to_timestamp()
+    adjust_first = not freq.is_on_offset(first_ts)
+    adjust_last = freq.is_on_offset(last_ts)
+
+    first_ts, last_ts = _get_timestamp_range_edges(
+        first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset
+    )
+
+    first = (first_ts + int(adjust_first) * freq).to_period(freq)
+    last = (last_ts - int(adjust_last) * freq).to_period(freq)
+    return first, last
+
+
+def _insert_nat_bin(
+    binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int
+) -> tuple[PeriodIndex, np.ndarray, PeriodIndex]:
+    # NaT handling as in pandas._lib.lib.generate_bins_dt64()
+    # shift bins by the number of NaT
+    assert nat_count > 0
+    bins += nat_count
+    bins = np.insert(bins, 0, nat_count)
+
+    # Incompatible types in assignment (expression has type "Index", variable
+    # has type "PeriodIndex")
+    binner = binner.insert(0, NaT)  # type: ignore[assignment]
+    # Incompatible types in assignment (expression has type "Index", variable
+    # has type "PeriodIndex")
+    labels = labels.insert(0, NaT)  # type: ignore[assignment]
+    return binner, bins, labels
+
+
+def _adjust_dates_anchored(
+    first: Timestamp,
+    last: Timestamp,
+    freq: Tick,
+    closed: Literal["right", "left"] = "right",
+    origin: TimeGrouperOrigin = "start_day",
+    offset: Timedelta | None = None,
+    unit: TimeUnit = "ns",
+) -> tuple[Timestamp, Timestamp]:
+    # First and last offsets should be calculated from the start day to fix an
+    # error cause by resampling across multiple days when a one day period is
+    # not a multiple of the frequency. See GH 8683
+    # To handle frequencies that are not multiple or divisible by a day we let
+    # the possibility to define a fixed origin timestamp. See GH 31809
+    first = first.as_unit(unit)
+    last = last.as_unit(unit)
+    if offset is not None:
+        offset = offset.as_unit(unit)
+
+    freq_value = Timedelta(freq).as_unit(unit)._value
+
+    origin_timestamp = 0  # origin == "epoch"
+    if origin == "start_day":
+        origin_timestamp = first.normalize()._value
+    elif origin == "start":
+        origin_timestamp = first._value
+    elif isinstance(origin, Timestamp):
+        origin_timestamp = origin.as_unit(unit)._value
+    elif origin in ["end", "end_day"]:
+        origin_last = last if origin == "end" else last.ceil("D")
+        sub_freq_times = (origin_last._value - first._value) // freq_value
+        if closed == "left":
+            sub_freq_times += 1
+        first = origin_last - sub_freq_times * freq
+        origin_timestamp = first._value
+    origin_timestamp += offset._value if offset else 0
+
+    # GH 10117 & GH 19375. If first and last contain timezone information,
+    # Perform the calculation in UTC in order to avoid localizing on an
+    # Ambiguous or Nonexistent time.
+    first_tzinfo = first.tzinfo
+    last_tzinfo = last.tzinfo
+    if first_tzinfo is not None:
+        first = first.tz_convert("UTC")
+    if last_tzinfo is not None:
+        last = last.tz_convert("UTC")
+
+    foffset = (first._value - origin_timestamp) % freq_value
+    loffset = (last._value - origin_timestamp) % freq_value
+
+    if closed == "right":
+        if foffset > 0:
+            # roll back
+            fresult_int = first._value - foffset
+        else:
+            fresult_int = first._value - freq_value
+
+        if loffset > 0:
+            # roll forward
+            lresult_int = last._value + (freq_value - loffset)
+        else:
+            # already the end of the road
+            lresult_int = last._value
+    else:  # closed == 'left'
+        if foffset > 0:
+            fresult_int = first._value - foffset
+        else:
+            # start of the road
+            fresult_int = first._value
+
+        if loffset > 0:
+            # roll forward
+            lresult_int = last._value + (freq_value - loffset)
+        else:
+            lresult_int = last._value + freq_value
+    fresult = Timestamp(fresult_int, unit=unit)
+    lresult = Timestamp(lresult_int, unit=unit)
+    if first_tzinfo is not None:
+        fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
+    if last_tzinfo is not None:
+        lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo)
+    return fresult, lresult
+
+
+def asfreq(
+    obj: NDFrameT,
+    freq,
+    method=None,
+    how=None,
+    normalize: bool = False,
+    fill_value=None,
+) -> NDFrameT:
+    """
+    Utility frequency conversion method for Series/DataFrame.
+
+    See :meth:`pandas.NDFrame.asfreq` for full documentation.
+    """
+    if isinstance(obj.index, PeriodIndex):
+        if method is not None:
+            raise NotImplementedError("'method' argument is not supported")
+
+        if how is None:
+            how = "E"
+
+        if isinstance(freq, BaseOffset):
+            if hasattr(freq, "_period_dtype_code"):
+                freq = PeriodDtype(freq)._freqstr
+
+        new_obj = obj.copy()
+        new_obj.index = obj.index.asfreq(freq, how=how)
+
+    elif len(obj.index) == 0:
+        new_obj = obj.copy()
+
+        new_obj.index = _asfreq_compat(obj.index, freq)
+    else:
+        unit: TimeUnit = "ns"
+        if isinstance(obj.index, DatetimeIndex):
+            # TODO: should we disallow non-DatetimeIndex?
+            unit = obj.index.unit
+        dti = date_range(obj.index.min(), obj.index.max(), freq=freq, unit=unit)
+        dti.name = obj.index.name
+        new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
+        if normalize:
+            new_obj.index = new_obj.index.normalize()
+
+    return new_obj
+
+
+def _asfreq_compat(index: FreqIndexT, freq) -> FreqIndexT:
+    """
+    Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex.
+
+    Parameters
+    ----------
+    index : PeriodIndex, DatetimeIndex, or TimedeltaIndex
+    freq : DateOffset
+
+    Returns
+    -------
+    same type as index
+    """
+    if len(index) != 0:
+        # This should never be reached, always checked by the caller
+        raise ValueError(
+            "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex"
+        )
+    if isinstance(index, PeriodIndex):
+        new_index = index.asfreq(freq=freq)
+    elif isinstance(index, DatetimeIndex):
+        new_index = DatetimeIndex([], dtype=index.dtype, freq=freq, name=index.name)
+    elif isinstance(index, TimedeltaIndex):
+        new_index = TimedeltaIndex([], dtype=index.dtype, freq=freq, name=index.name)
+    else:  # pragma: no cover
+        raise TypeError(type(index))
+    return new_index
diff --git a/pandas/core/roperator.py b/pandas/core/roperator.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ea4bea41cdeaac7b0520cafc08656b1dbe5519d
--- /dev/null
+++ b/pandas/core/roperator.py
@@ -0,0 +1,63 @@
+"""
+Reversed Operations not available in the stdlib operator module.
+Defining these instead of using lambdas allows us to reference them by name.
+"""
+
+from __future__ import annotations
+
+import operator
+
+
+def radd(left, right):
+    return right + left
+
+
+def rsub(left, right):
+    return right - left
+
+
+def rmul(left, right):
+    return right * left
+
+
+def rdiv(left, right):
+    return right / left
+
+
+def rtruediv(left, right):
+    return right / left
+
+
+def rfloordiv(left, right):
+    return right // left
+
+
+def rmod(left, right):
+    # check if right is a string as % is the string
+    # formatting operation; this is a TypeError
+    # otherwise perform the op
+    if isinstance(right, str):
+        typ = type(left).__name__
+        raise TypeError(f"{typ} cannot perform the operation mod")
+
+    return right % left
+
+
+def rdivmod(left, right):
+    return divmod(right, left)
+
+
+def rpow(left, right):
+    return right**left
+
+
+def rand_(left, right):
+    return operator.and_(right, left)
+
+
+def ror_(left, right):
+    return operator.or_(right, left)
+
+
+def rxor(left, right):
+    return operator.xor(right, left)
diff --git a/pandas/core/sample.py b/pandas/core/sample.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f476540cf406af438306a124779dd1d233f14fb
--- /dev/null
+++ b/pandas/core/sample.py
@@ -0,0 +1,163 @@
+"""
+Module containing utilities for NDFrame.sample() and .GroupBy.sample()
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from pandas._libs import lib
+
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import AxisInt
+
+    from pandas.core.generic import NDFrame
+
+
+def preprocess_weights(obj: NDFrame, weights, axis: AxisInt) -> np.ndarray:
+    """
+    Process and validate the `weights` argument to `NDFrame.sample` and
+    `.GroupBy.sample`.
+
+    Returns `weights` as an ndarray[np.float64], validated except for normalizing
+    weights (because that must be done groupwise in groupby sampling).
+    """
+    # If a series, align with frame
+    if isinstance(weights, ABCSeries):
+        weights = weights.reindex(obj.axes[axis])
+
+    # Strings acceptable if a dataframe and axis = 0
+    if isinstance(weights, str):
+        if isinstance(obj, ABCDataFrame):
+            if axis == 0:
+                try:
+                    weights = obj[weights]
+                except KeyError as err:
+                    raise KeyError(
+                        "String passed to weights not a valid column"
+                    ) from err
+            else:
+                raise ValueError(
+                    "Strings can only be passed to "
+                    "weights when sampling from rows on "
+                    "a DataFrame"
+                )
+        else:
+            raise ValueError(
+                "Strings cannot be passed as weights when sampling from a Series."
+            )
+
+    if isinstance(obj, ABCSeries):
+        func = obj._constructor
+    else:
+        func = obj._constructor_sliced
+
+    weights = func(weights, dtype="float64")._values
+
+    if len(weights) != obj.shape[axis]:
+        raise ValueError("Weights and axis to be sampled must be of same length")
+
+    if lib.has_infs(weights):
+        raise ValueError("weight vector may not include `inf` values")
+
+    if (weights < 0).any():
+        raise ValueError("weight vector many not include negative values")
+
+    missing = np.isnan(weights)
+    if missing.any():
+        # Don't modify weights in place
+        weights = weights.copy()
+        weights[missing] = 0
+    return weights
+
+
+def process_sampling_size(
+    n: int | None, frac: float | None, replace: bool
+) -> int | None:
+    """
+    Process and validate the `n` and `frac` arguments to `NDFrame.sample` and
+    `.GroupBy.sample`.
+
+    Returns None if `frac` should be used (variable sampling sizes), otherwise returns
+    the constant sampling size.
+    """
+    # If no frac or n, default to n=1.
+    if n is None and frac is None:
+        n = 1
+    elif n is not None and frac is not None:
+        raise ValueError("Please enter a value for `frac` OR `n`, not both")
+    elif n is not None:
+        if n < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide `n` >= 0."
+            )
+        if n % 1 != 0:
+            raise ValueError("Only integers accepted as `n` values")
+    else:
+        assert frac is not None  # for mypy
+        if frac > 1 and not replace:
+            raise ValueError(
+                "Replace has to be set to `True` when "
+                "upsampling the population `frac` > 1."
+            )
+        if frac < 0:
+            raise ValueError(
+                "A negative number of rows requested. Please provide `frac` >= 0."
+            )
+
+    return n
+
+
+def sample(
+    obj_len: int,
+    size: int,
+    replace: bool,
+    weights: np.ndarray | None,
+    random_state: np.random.RandomState | np.random.Generator,
+) -> np.ndarray:
+    """
+    Randomly sample `size` indices in `np.arange(obj_len)`.
+
+    Parameters
+    ----------
+    obj_len : int
+        The length of the indices being considered
+    size : int
+        The number of values to choose
+    replace : bool
+        Allow or disallow sampling of the same row more than once.
+    weights : np.ndarray[np.float64] or None
+        If None, equal probability weighting, otherwise weights according
+        to the vector normalized
+    random_state: np.random.RandomState or np.random.Generator
+        State used for the random sampling
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+    """
+    if weights is not None:
+        weight_sum = weights.sum()
+        if weight_sum != 0:
+            weights = weights / weight_sum
+        else:
+            raise ValueError("Invalid weights: weights sum to zero")
+
+        assert weights is not None  # for mypy
+        if not replace and size * weights.max() > 1:
+            raise ValueError(
+                "Weighted sampling cannot be achieved with replace=False. Either "
+                "set replace=True or use smaller weights. See the docstring of "
+                "sample for details."
+            )
+
+    return random_state.choice(obj_len, size=size, replace=replace, p=weights).astype(
+        np.intp, copy=False
+    )
diff --git a/pandas/core/series.py b/pandas/core/series.py
new file mode 100644
index 0000000000000000000000000000000000000000..d54cbbdc67bd611650f6aa8738016b98b8dd8c8e
--- /dev/null
+++ b/pandas/core/series.py
@@ -0,0 +1,8771 @@
+"""
+Data structure for 1-dimensional cross-sectional and time series data
+"""
+
+from __future__ import annotations
+
+from collections.abc import (
+    Callable,
+    Hashable,
+    Iterable,
+    Mapping,
+    Sequence,
+)
+import functools
+import operator
+import sys
+from textwrap import dedent
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    Self,
+    cast,
+    overload,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs import (
+    lib,
+    properties,
+    reshape,
+)
+from pandas._libs.lib import is_range_indexer
+from pandas.compat import CHAINED_WARNING_DISABLED
+from pandas.compat._constants import (
+    REF_COUNT,
+    REF_COUNT_METHOD,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.compat.numpy import function as nv
+from pandas.errors import (
+    ChainedAssignmentError,
+    InvalidIndexError,
+    Pandas4Warning,
+)
+from pandas.errors.cow import (
+    _chained_assignment_method_update_msg,
+    _chained_assignment_msg,
+)
+from pandas.util._decorators import (
+    Appender,
+    deprecate_nonkeyword_arguments,
+    doc,
+    set_module,
+)
+from pandas.util._exceptions import (
+    find_stack_level,
+)
+from pandas.util._validators import (
+    validate_ascending,
+    validate_bool_kwarg,
+    validate_percentile,
+)
+
+from pandas.core.dtypes.astype import astype_is_view
+from pandas.core.dtypes.cast import (
+    LossySetitemError,
+    construct_1d_arraylike_from_scalar,
+    find_common_type,
+    infer_dtype_from,
+    maybe_box_native,
+    maybe_unbox_numpy_scalar,
+)
+from pandas.core.dtypes.common import (
+    is_dict_like,
+    is_float,
+    is_integer,
+    is_iterator,
+    is_list_like,
+    is_object_dtype,
+    is_scalar,
+    pandas_dtype,
+    validate_all_hashable,
+)
+from pandas.core.dtypes.dtypes import (
+    ExtensionDtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+from pandas.core.dtypes.inference import is_hashable
+from pandas.core.dtypes.missing import (
+    isna,
+    na_value_for_dtype,
+    notna,
+    remove_na_arraylike,
+)
+
+from pandas.core import (
+    algorithms,
+    base,
+    common as com,
+    nanops,
+    ops,
+    roperator,
+)
+from pandas.core.accessor import Accessor
+from pandas.core.apply import SeriesApply
+from pandas.core.arrays import ExtensionArray
+from pandas.core.arrays.arrow import (
+    ListAccessor,
+    StructAccessor,
+)
+from pandas.core.arrays.categorical import CategoricalAccessor
+from pandas.core.arrays.sparse import SparseAccessor
+from pandas.core.construction import (
+    array as pd_array,
+    extract_array,
+    sanitize_array,
+)
+from pandas.core.generic import NDFrame
+from pandas.core.indexers import (
+    disallow_ndim_indexing,
+    unpack_1tuple,
+)
+from pandas.core.indexes.accessors import CombinedDatetimelikeProperties
+from pandas.core.indexes.api import (
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    PeriodIndex,
+    default_index,
+    ensure_index,
+    maybe_sequence_to_range,
+)
+import pandas.core.indexes.base as ibase
+from pandas.core.indexes.multi import maybe_droplevels
+from pandas.core.indexing import (
+    check_bool_indexer,
+    check_dict_or_set_indexers,
+)
+from pandas.core.internals import SingleBlockManager
+from pandas.core.methods import selectn
+from pandas.core.shared_docs import _shared_docs
+from pandas.core.sorting import (
+    ensure_key_mapped,
+    nargsort,
+)
+from pandas.core.strings.accessor import StringMethods
+from pandas.core.tools.datetimes import to_datetime
+
+import pandas.io.formats.format as fmt
+from pandas.io.formats.info import (
+    SeriesInfo,
+)
+import pandas.plotting
+
+if TYPE_CHECKING:
+    from pandas._libs.internals import BlockValuesRefs
+    from pandas._typing import (
+        AggFuncType,
+        AnyAll,
+        AnyArrayLike,
+        ArrayLike,
+        ArrowArrayExportable,
+        ArrowStreamExportable,
+        Axis,
+        AxisInt,
+        CorrelationMethod,
+        DropKeep,
+        Dtype,
+        DtypeObj,
+        FilePath,
+        Frequency,
+        IgnoreRaise,
+        IndexKeyFunc,
+        IndexLabel,
+        Level,
+        ListLike,
+        MutableMappingT,
+        NaPosition,
+        NumpySorter,
+        NumpyValueArrayLike,
+        QuantileInterpolation,
+        ReindexMethod,
+        Renamer,
+        Scalar,
+        SortKind,
+        StorageOptions,
+        Suffixes,
+        ValueKeyFunc,
+        WriteBuffer,
+        npt,
+    )
+
+    from pandas.core.frame import DataFrame
+    from pandas.core.groupby.generic import SeriesGroupBy
+
+__all__ = ["Series"]
+
+_shared_doc_kwargs = {
+    "axes": "index",
+    "klass": "Series",
+    "axes_single_arg": "{0 or 'index'}",
+    "axis": """axis : {0 or 'index'}
+        Unused. Parameter needed for compatibility with DataFrame.""",
+    "inplace": """inplace : bool, default False
+        If True, performs operation inplace and returns None.""",
+    "unique": "np.ndarray",
+    "duplicated": "Series",
+    "optional_by": "",
+    "optional_reindex": """
+index : array-like, optional
+    New labels for the index. Preferably an Index object to avoid
+    duplicating data.
+axis : int or str, optional
+    Unused.""",
+}
+
+# ----------------------------------------------------------------------
+# Series class
+
+
+# error: Cannot override final attribute "ndim" (previously declared in base
+# class "NDFrame")
+# error: Cannot override final attribute "size" (previously declared in base
+# class "NDFrame")
+# definition in base class "NDFrame"
+@set_module("pandas")
+class Series(base.IndexOpsMixin, NDFrame):  # type: ignore[misc]
+    """
+    One-dimensional ndarray with axis labels (including time series).
+
+    Labels need not be unique but must be a hashable type. The object
+    supports both integer- and label-based indexing and provides a host of
+    methods for performing operations involving the index. Statistical
+    methods from ndarray have been overridden to automatically exclude
+    missing data (currently represented as NaN).
+
+    Operations between Series (+, -, /, \\*, \\*\\*) align values based on their
+    associated index values-- they need not be the same length. The result
+    index will be the sorted union of the two indexes.
+
+    Parameters
+    ----------
+    data : array-like, Iterable, dict, or scalar value
+        Contains data stored in Series. If data is a dict, argument order is
+        maintained. Unordered sets are not supported.
+    index : array-like or Index (1d)
+        Values must be hashable and have the same length as `data`.
+        Non-unique index values are allowed. Will default to
+        RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like
+        and index is None, then the keys in the data are used as the index. If the
+        index is not None, the resulting Series is reindexed with the index values.
+    dtype : str, numpy.dtype, or ExtensionDtype, optional
+        Data type for the output Series. If not specified, this will be
+        inferred from `data`.
+        See the :ref:`user guide <basics.dtypes>` for more usages.
+    name : Hashable, default None
+        The name to give to the Series.
+    copy : bool, default None
+        Whether to copy input data, only relevant for array, Series, and Index
+        inputs (for other input, e.g. a list, a new array is created anyway).
+        Defaults to True for array input and False for Index/Series.
+        Even when False for Index/Series, a shallow copy of the data is made.
+        Set to False to avoid copying array input at your own risk (if you
+        know the input data won't be modified elsewhere).
+        Set to True to force copying Series/Index input up front.
+
+    See Also
+    --------
+    DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data.
+    Index : Immutable sequence used for indexing and alignment.
+
+    Notes
+    -----
+    Please reference the :ref:`User Guide <basics.series>` for more information.
+
+    Examples
+    --------
+    Constructing Series from a dictionary with an Index specified
+
+    >>> d = {"a": 1, "b": 2, "c": 3}
+    >>> ser = pd.Series(data=d, index=["a", "b", "c"])
+    >>> ser
+    a   1
+    b   2
+    c   3
+    dtype: int64
+
+    The keys of the dictionary match with the Index values, hence the Index
+    values have no effect.
+
+    >>> d = {"a": 1, "b": 2, "c": 3}
+    >>> ser = pd.Series(data=d, index=["x", "y", "z"])
+    >>> ser
+    x   NaN
+    y   NaN
+    z   NaN
+    dtype: float64
+
+    Note that the Index is first built with the keys from the dictionary.
+    After this the Series is reindexed with the given Index values, hence we
+    get all NaN as a result.
+
+    Constructing Series from a list with `copy=False`.
+
+    >>> r = [1, 2]
+    >>> ser = pd.Series(r, copy=False)
+    >>> ser.iloc[0] = 999
+    >>> r
+    [1, 2]
+    >>> ser
+    0    999
+    1      2
+    dtype: int64
+
+    Due to input data type the Series has a `copy` of
+    the original data even though `copy=False`, so
+    the data is unchanged.
+
+    Constructing Series from a 1d ndarray with `copy=False`.
+
+    >>> r = np.array([1, 2])
+    >>> ser = pd.Series(r, copy=False)
+    >>> ser.iloc[0] = 999
+    >>> r
+    array([999,   2])
+    >>> ser
+    0    999
+    1      2
+    dtype: int64
+
+    Due to input data type the Series has a `view` on
+    the original data, so
+    the data is changed as well.
+    """
+
+    _typ = "series"
+    _HANDLED_TYPES = (Index, ExtensionArray, np.ndarray)
+
+    _name: Hashable
+    _metadata: list[str] = ["_name"]
+    _internal_names_set = {"index", "name"} | NDFrame._internal_names_set
+    _accessors = {"dt", "cat", "str", "sparse"}
+    _hidden_attrs = (
+        base.IndexOpsMixin._hidden_attrs | NDFrame._hidden_attrs | frozenset([])
+    )
+
+    # similar to __array_priority__, positions Series after DataFrame
+    #  but before Index and ExtensionArray.  Should NOT be overridden by subclasses.
+    __pandas_priority__ = 3000
+
+    # Override cache_readonly bc Series is mutable
+    hasnans = property(
+        # error: "Callable[[IndexOpsMixin], bool]" has no attribute "fget"
+        base.IndexOpsMixin.hasnans.fget,  # type: ignore[attr-defined]
+        doc=base.IndexOpsMixin.hasnans.__doc__,
+    )
+    _mgr: SingleBlockManager
+
+    # ----------------------------------------------------------------------
+    # Constructors
+
+    def __init__(
+        self,
+        data=None,
+        index=None,
+        dtype: Dtype | None = None,
+        name=None,
+        copy: bool | None = None,
+    ) -> None:
+        allow_mgr = False
+        if (
+            isinstance(data, SingleBlockManager)
+            and index is None
+            and dtype is None
+            and (copy is False or copy is None)
+        ):
+            if not allow_mgr:
+                # GH#52419
+                warnings.warn(
+                    f"Passing a {type(data).__name__} to {type(self).__name__} "
+                    "is deprecated and will raise in a future version. "
+                    "Use public APIs instead.",
+                    Pandas4Warning,
+                    stacklevel=2,
+                )
+            data = data.copy(deep=False)
+            # GH#33357 called with just the SingleBlockManager
+            NDFrame.__init__(self, data)
+            self.name = name
+            return
+
+        if isinstance(data, (ExtensionArray, np.ndarray)):
+            if copy is not False:
+                if dtype is None or astype_is_view(data.dtype, pandas_dtype(dtype)):
+                    data = data.copy()
+                    copy = False
+        if copy is None:
+            copy = False
+
+        if isinstance(data, SingleBlockManager) and not copy:
+            data = data.copy(deep=False)
+
+            if not allow_mgr:
+                warnings.warn(
+                    f"Passing a {type(data).__name__} to {type(self).__name__} "
+                    "is deprecated and will raise in a future version. "
+                    "Use public APIs instead.",
+                    Pandas4Warning,
+                    stacklevel=2,
+                )
+                allow_mgr = True
+
+        name = ibase.maybe_extract_name(name, data, type(self))
+
+        if index is not None:
+            index = ensure_index(index)
+
+        if dtype is not None:
+            dtype = self._validate_dtype(dtype)
+
+        if data is None:
+            index = index if index is not None else default_index(0)
+            if len(index) or dtype is not None:
+                data = na_value_for_dtype(pandas_dtype(dtype), compat=False)
+            else:
+                data = []
+
+        if isinstance(data, MultiIndex):
+            raise NotImplementedError(
+                "initializing a Series from a MultiIndex is not supported"
+            )
+
+        refs = None
+        if isinstance(data, Index):
+            if dtype is not None:
+                data = data.astype(dtype)
+            if not copy:
+                refs = data._references
+
+        elif isinstance(data, np.ndarray):
+            if len(data.dtype):
+                # GH#13296 we are dealing with a compound dtype, which
+                #  should be treated as 2D
+                raise ValueError(
+                    "Cannot construct a Series from an ndarray with "
+                    "compound dtype.  Use DataFrame instead."
+                )
+        elif isinstance(data, Series):
+            if index is None:
+                index = data.index
+                data = data._mgr.copy(deep=False)
+            else:
+                data = data.reindex(index)
+                data = data._mgr
+                if data._has_no_reference(0):
+                    copy = False
+        elif isinstance(data, Mapping):
+            data, index = self._init_dict(data, index, dtype)
+            dtype = None
+            copy = False
+        elif isinstance(data, SingleBlockManager):
+            if index is None:
+                index = data.index
+            elif not data.index.equals(index) or copy:
+                # GH#19275 SingleBlockManager input should only be called
+                # internally
+                raise AssertionError(
+                    "Cannot pass both SingleBlockManager "
+                    "`data` argument and a different "
+                    "`index` argument. `copy` must be False."
+                )
+
+            if not allow_mgr:
+                warnings.warn(
+                    f"Passing a {type(data).__name__} to {type(self).__name__} "
+                    "is deprecated and will raise in a future version. "
+                    "Use public APIs instead.",
+                    Pandas4Warning,
+                    stacklevel=2,
+                )
+                allow_mgr = True
+
+        elif isinstance(data, ExtensionArray):
+            pass
+        else:
+            data = com.maybe_iterable_to_list(data)
+            if is_list_like(data) and not len(data) and dtype is None:
+                # GH 29405: Pre-2.0, this defaulted to float.
+                dtype = np.dtype(object)
+
+        if index is None:
+            if not is_list_like(data):
+                data = [data]
+            index = default_index(len(data))
+        elif is_list_like(data):
+            com.require_length_match(data, index)
+
+        # create/copy the manager
+        if isinstance(data, SingleBlockManager):
+            if dtype is not None:
+                if not astype_is_view(data.dtype, pandas_dtype(dtype)):
+                    copy = False
+                data = data.astype(dtype=dtype)
+            if copy:
+                data = data.copy(deep=True)
+        else:
+            data = sanitize_array(data, index, dtype, copy)
+            data = SingleBlockManager.from_array(data, index, refs=refs)
+
+        NDFrame.__init__(self, data)
+        self.name = name
+        self._set_axis(0, index)
+
+    def _init_dict(
+        self, data: Mapping, index: Index | None = None, dtype: DtypeObj | None = None
+    ):
+        """
+        Derive the "_mgr" and "index" attributes of a new Series from a
+        dictionary input.
+
+        Parameters
+        ----------
+        data : dict or dict-like
+            Data used to populate the new Series.
+        index : Index or None, default None
+            Index for the new Series: if None, use dict keys.
+        dtype : np.dtype, ExtensionDtype, or None, default None
+            The dtype for the new Series: if None, infer from data.
+
+        Returns
+        -------
+        _data : BlockManager for the new Series
+        index : index for the new Series
+        """
+        # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
+        # raises KeyError), so we iterate the entire dict, and align
+        if data:
+            # GH:34717, issue was using zip to extract key and values from data.
+            # using generators in effects the performance.
+            # Below is the new way of extracting the keys and values
+
+            keys = maybe_sequence_to_range(tuple(data.keys()))
+            values = list(data.values())  # Generating list of values- faster way
+        elif index is not None:
+            # fastpath for Series(data=None). Just use broadcasting a scalar
+            # instead of reindexing.
+            if len(index) or dtype is not None:
+                values = na_value_for_dtype(pandas_dtype(dtype), compat=False)
+            else:
+                values = []
+            keys = index
+        else:
+            keys, values = default_index(0), []
+
+        # Input is now list-like, so rely on "standard" construction:
+        s = Series(values, index=keys, dtype=dtype)
+
+        # Now we just make sure the order is respected, if any
+        if data and index is not None:
+            s = s.reindex(index)
+        return s._mgr, s.index
+
+    # ----------------------------------------------------------------------
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas Series as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas Series to the Arrow
+        format (and follows the default behavior of ``pyarrow.Array.from_pandas``
+        in its handling of the index, i.e. to ignore it).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be casted, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="16.0.0")
+        type = (
+            pa.DataType._import_from_c_capsule(requested_schema)
+            if requested_schema is not None
+            else None
+        )
+        ca = pa.array(self, type=type)
+        if not isinstance(ca, pa.ChunkedArray):
+            ca = pa.chunked_array([ca])
+        return ca.__arrow_c_stream__()
+
+    # ----------------------------------------------------------------------
+
+    @property
+    def _constructor(self) -> type[Series]:
+        return Series
+
+    def _constructor_from_mgr(self, mgr, axes):
+        ser = Series._from_mgr(mgr, axes=axes)
+        ser._name = None  # caller is responsible for setting real name
+
+        if type(self) is Series:
+            # This would also work `if self._constructor is Series`, but
+            #  this check is slightly faster, benefiting the most-common case.
+            return ser
+
+        # We assume that the subclass __init__ knows how to handle a
+        #  pd.Series object.
+        return self._constructor(ser)
+
+    @property
+    def _constructor_expanddim(self) -> Callable[..., DataFrame]:
+        """
+        Used when a manipulation result has one higher dimension as the
+        original, such as Series.to_frame()
+        """
+        from pandas.core.frame import DataFrame
+
+        return DataFrame
+
+    def _constructor_expanddim_from_mgr(self, mgr, axes):
+        from pandas.core.frame import DataFrame
+
+        df = DataFrame._from_mgr(mgr, axes=mgr.axes)
+
+        if type(self) is Series:
+            # This would also work `if self._constructor_expanddim is DataFrame`,
+            #  but this check is slightly faster, benefiting the most-common case.
+            return df
+
+        # We assume that the subclass __init__ knows how to handle a
+        #  pd.DataFrame object.
+        return self._constructor_expanddim(df)
+
+    # types
+    @property
+    def _can_hold_na(self) -> bool:
+        return self._mgr._can_hold_na
+
+    # ndarray compatibility
+    @property
+    def dtype(self) -> DtypeObj:
+        """
+        Return the dtype object of the underlying data.
+
+        See Also
+        --------
+        Series.dtypes : Return the dtype object of the underlying data.
+        Series.astype : Cast a pandas object to a specified dtype dtype.
+        Series.convert_dtypes : Convert columns to the best possible dtypes using dtypes
+            supporting pd.NA.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.dtype
+        dtype('int64')
+        """
+        return self._mgr.dtype
+
+    @property
+    def dtypes(self) -> DtypeObj:
+        """
+        Return the dtype object of the underlying data.
+
+        See Also
+        --------
+        DataFrame.dtypes :  Return the dtypes in the DataFrame.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.dtypes
+        dtype('int64')
+        """
+        # DataFrame compatibility
+        return self.dtype
+
+    @property
+    def name(self) -> Hashable:
+        """
+        Return the name of the Series.
+
+        The name of a Series becomes its index or column name if it is used
+        to form a DataFrame. It is also used whenever displaying the Series
+        using the interpreter.
+
+        Returns
+        -------
+        label (hashable object)
+            The name of the Series, also the column name if part of a DataFrame.
+
+        See Also
+        --------
+        Series.rename : Sets the Series name when given a scalar input.
+        Index.name : Corresponding Index property.
+
+        Examples
+        --------
+        The Series name can be set initially when calling the constructor.
+
+        >>> s = pd.Series([1, 2, 3], dtype=np.int64, name="Numbers")
+        >>> s
+        0    1
+        1    2
+        2    3
+        Name: Numbers, dtype: int64
+        >>> s.name = "Integers"
+        >>> s
+        0    1
+        1    2
+        2    3
+        Name: Integers, dtype: int64
+
+        The name of a Series within a DataFrame is its column name.
+
+        >>> df = pd.DataFrame(
+        ...     [[1, 2], [3, 4], [5, 6]], columns=["Odd Numbers", "Even Numbers"]
+        ... )
+        >>> df
+           Odd Numbers  Even Numbers
+        0            1             2
+        1            3             4
+        2            5             6
+        >>> df["Even Numbers"].name
+        'Even Numbers'
+        """
+        return self._name
+
+    @name.setter
+    def name(self, value: Hashable) -> None:
+        validate_all_hashable(value, error_name=f"{type(self).__name__}.name")
+        object.__setattr__(self, "_name", value)
+
+    @property
+    def values(self):
+        """
+        Return Series as ndarray or ndarray-like depending on the dtype.
+
+        .. warning::
+
+           We recommend using :attr:`Series.array` or
+           :meth:`Series.to_numpy`, depending on whether you need
+           a reference to the underlying data or a NumPy array.
+
+        Returns
+        -------
+        numpy.ndarray or ndarray-like
+
+        See Also
+        --------
+        Series.array : Reference to the underlying data.
+        Series.to_numpy : A NumPy array representing the underlying data.
+
+        Examples
+        --------
+        >>> pd.Series([1, 2, 3]).values
+        array([1, 2, 3])
+
+        >>> pd.Series(list("aabc")).values
+        <ArrowStringArray>
+        ['a', 'a', 'b', 'c']
+        Length: 4, dtype: str
+
+        >>> pd.Series(list("aabc")).astype("category").values
+        ['a', 'a', 'b', 'c']
+        Categories (3, str): ['a', 'b', 'c']
+
+        Timezone aware datetime data is converted to UTC:
+
+        >>> pd.Series(pd.date_range("20130101", periods=3, tz="US/Eastern")).values
+        array(['2013-01-01T05:00:00.000000',
+               '2013-01-02T05:00:00.000000',
+               '2013-01-03T05:00:00.000000'], dtype='datetime64[us]')
+        """
+        return self._mgr.external_values()
+
+    @property
+    def _values(self):
+        """
+        Return the internal repr of this data (defined by Block.interval_values).
+        This are the values as stored in the Block (ndarray or ExtensionArray
+        depending on the Block class), with datetime64[ns] and timedelta64[ns]
+        wrapped in ExtensionArrays to match Index._values behavior.
+
+        Differs from the public ``.values`` for certain data types, because of
+        historical backwards compatibility of the public attribute (e.g. period
+        returns object ndarray and datetimetz a datetime64[ns] ndarray for
+        ``.values`` while it returns an ExtensionArray for ``._values`` in those
+        cases).
+
+        Differs from ``.array`` in that this still returns the numpy array if
+        the Block is backed by a numpy array (except for datetime64 and
+        timedelta64 dtypes), while ``.array`` ensures to always return an
+        ExtensionArray.
+
+        Overview:
+
+        dtype       | values        | _values       | array                 |
+        ----------- | ------------- | ------------- | --------------------- |
+        Numeric     | ndarray       | ndarray       | NumpyExtensionArray   |
+        Category    | Categorical   | Categorical   | Categorical           |
+        dt64[ns]    | ndarray[M8ns] | DatetimeArray | DatetimeArray         |
+        dt64[ns tz] | ndarray[M8ns] | DatetimeArray | DatetimeArray         |
+        td64[ns]    | ndarray[m8ns] | TimedeltaArray| TimedeltaArray        |
+        Period      | ndarray[obj]  | PeriodArray   | PeriodArray           |
+        Nullable    | EA            | EA            | EA                    |
+
+        """
+        return self._mgr.internal_values()
+
+    @property
+    def _references(self) -> BlockValuesRefs:
+        return self._mgr._block.refs
+
+    @Appender(base.IndexOpsMixin.array.__doc__)  # type: ignore[prop-decorator]
+    @property
+    def array(self) -> ExtensionArray:
+        arr = self._mgr.array_values()
+        # TODO decide on read-only https://github.com/pandas-dev/pandas/issues/63099
+        # arr = arr.view()
+        # arr._readonly = True
+        return arr
+
+    def __len__(self) -> int:
+        """
+        Return the length of the Series.
+        """
+        return len(self._mgr)
+
+    # ----------------------------------------------------------------------
+    # NDArray Compat
+    def __array__(
+        self, dtype: npt.DTypeLike | None = None, copy: bool | None = None
+    ) -> np.ndarray:
+        """
+        Return the values as a NumPy array.
+
+        Users should not call this directly. Rather, it is invoked by
+        :func:`numpy.array` and :func:`numpy.asarray`.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to use for the resulting NumPy array. By default,
+            the dtype is inferred from the data.
+
+        copy : bool or None, optional
+            See :func:`numpy.asarray`.
+
+        Returns
+        -------
+        numpy.ndarray
+            The values in the series converted to a :class:`numpy.ndarray`
+            with the specified `dtype`.
+
+        See Also
+        --------
+        array : Create a new array from data.
+        Series.array : Zero-copy view to the array backing the Series.
+        Series.to_numpy : Series method for similar behavior.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1, 2, 3])
+        >>> np.asarray(ser)
+        array([1, 2, 3])
+
+        For timezone-aware data, the timezones may be retained with
+        ``dtype='object'``
+
+        >>> tzser = pd.Series(pd.date_range("2000", periods=2, tz="CET"))
+        >>> np.asarray(tzser, dtype="object")
+        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
+               Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
+              dtype=object)
+
+        Or the values may be localized to UTC and the tzinfo discarded with
+        ``dtype='datetime64[ns]'``
+
+        >>> np.asarray(tzser, dtype="datetime64[ns]")  # doctest: +ELLIPSIS
+        array(['1999-12-31T23:00:00.000000000', ...],
+              dtype='datetime64[ns]')
+        """
+        values = self._values
+        if copy is None:
+            # Note: branch avoids `copy=None` for NumPy 1.x support
+            arr = np.asarray(values, dtype=dtype)
+        else:
+            arr = np.array(values, dtype=dtype, copy=copy)
+
+        if copy is True:
+            return arr
+        if copy is False or astype_is_view(values.dtype, arr.dtype):
+            arr = arr.view()
+            arr.flags.writeable = False
+        return arr
+
+    # ----------------------------------------------------------------------
+
+    # indexers
+    @property
+    def axes(self) -> list[Index]:
+        """
+        Return a list of the row axis labels.
+        """
+        return [self.index]
+
+    # ----------------------------------------------------------------------
+    # Indexing Methods
+
+    def _ixs(self, i: int, axis: AxisInt = 0) -> Any:
+        """
+        Return the i-th value or values in the Series by location.
+
+        Parameters
+        ----------
+        i : int
+
+        Returns
+        -------
+        scalar
+        """
+        return self._values[i]
+
+    def _slice(self, slobj: slice, axis: AxisInt = 0) -> Series:
+        # axis kwarg is retained for compat with NDFrame method
+        #  _slice is *always* positional
+        mgr = self._mgr.get_slice(slobj, axis=axis)
+        out = self._constructor_from_mgr(mgr, axes=mgr.axes)
+        out._name = self._name
+        return out.__finalize__(self)
+
+    def __getitem__(self, key):
+        check_dict_or_set_indexers(key)
+        key = com.apply_if_callable(key, self)
+
+        if key is Ellipsis:
+            return self.copy(deep=False)
+
+        key_is_scalar = is_scalar(key)
+        if isinstance(key, (list, tuple)):
+            key = unpack_1tuple(key)
+
+        elif key_is_scalar:
+            # Note: GH#50617 in 3.0 we changed int key to always be treated as
+            #  a label, matching DataFrame behavior.
+            return self._get_value(key)
+
+        # Convert generator to list before going through hashable part
+        # (We will iterate through the generator there to check for slices)
+        if is_iterator(key):
+            key = list(key)
+
+        if is_hashable(key, allow_slice=False):
+            # Otherwise index.get_value will raise InvalidIndexError
+            try:
+                # For labels that don't resolve as scalars like tuples and frozensets
+                result = self._get_value(key)
+
+                return result
+
+            except (KeyError, TypeError, InvalidIndexError):
+                # InvalidIndexError for e.g. generator
+                #  see test_series_getitem_corner_generator
+                if isinstance(key, tuple) and isinstance(self.index, MultiIndex):
+                    # We still have the corner case where a tuple is a key
+                    # in the first level of our MultiIndex
+                    return self._get_values_tuple(key)
+
+        if isinstance(key, slice):
+            # Do slice check before somewhat-costly is_bool_indexer
+            return self._getitem_slice(key)
+
+        if com.is_bool_indexer(key):
+            key = check_bool_indexer(self.index, key)
+            key = np.asarray(key, dtype=bool)
+            return self._get_rows_with_mask(key)
+
+        return self._get_with(key)
+
+    def _get_with(self, key):
+        # other: fancy integer or otherwise
+        if isinstance(key, ABCDataFrame):
+            raise TypeError(
+                "Indexing a Series with DataFrame is not "
+                "supported, use the appropriate DataFrame column"
+            )
+        elif isinstance(key, tuple):
+            return self._get_values_tuple(key)
+
+        return self.loc[key]
+
+    def _get_values_tuple(self, key: tuple):
+        # mpl hackaround
+        if com.any_none(*key):
+            # mpl compat if we look up e.g. ser[:, np.newaxis];
+            #  see tests.series.timeseries.test_mpl_compat_hack
+            # the asarray is needed to avoid returning a 2D DatetimeArray
+            result = np.asarray(self._values[key])
+            disallow_ndim_indexing(result)
+            return result
+
+        if not isinstance(self.index, MultiIndex):
+            raise KeyError("key of type tuple not found and not a MultiIndex")
+
+        # If key is contained, would have returned by now
+        indexer, new_index = self.index.get_loc_level(key)
+        new_ser = self._constructor(self._values[indexer], index=new_index, copy=False)
+        if isinstance(indexer, slice):
+            new_ser._mgr.add_references(self._mgr)
+        return new_ser.__finalize__(self)
+
+    def _get_rows_with_mask(self, indexer: npt.NDArray[np.bool_]) -> Series:
+        new_mgr = self._mgr.get_rows_with_mask(indexer)
+        return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
+
+    def _get_value(self, label, takeable: bool = False):
+        """
+        Quickly retrieve single value at passed index label.
+
+        Parameters
+        ----------
+        label : object
+        takeable : interpret the index as indexers, default False
+
+        Returns
+        -------
+        scalar value
+        """
+        if takeable:
+            return self._values[label]
+
+        # Similar to Index.get_value, but we do not fall back to positional
+        loc = self.index.get_loc(label)
+
+        if is_integer(loc):
+            return self._values[loc]
+
+        if isinstance(self.index, MultiIndex):
+            mi = self.index
+            new_values = self._values[loc]
+            if len(new_values) == 1 and mi.nlevels == 1:
+                # If more than one level left, we can not return a scalar
+                return new_values[0]
+
+            new_index = mi[loc]
+            new_index = maybe_droplevels(new_index, label)
+            new_ser = self._constructor(
+                new_values, index=new_index, name=self.name, copy=False
+            )
+            if isinstance(loc, slice):
+                new_ser._mgr.add_references(self._mgr)
+            return new_ser.__finalize__(self)
+
+        else:
+            return self.iloc[loc]
+
+    def __setitem__(self, key, value) -> None:
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(self) <= REF_COUNT and not com.is_local_in_caller_frame(
+                self
+            ):
+                warnings.warn(
+                    _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
+                )
+
+        check_dict_or_set_indexers(key)
+        key = com.apply_if_callable(key, self)
+
+        if key is Ellipsis:
+            key = slice(None)
+
+        if isinstance(key, slice):
+            indexer = self.index._convert_slice_indexer(key, kind="getitem")
+            return self._set_values(indexer, value)
+
+        try:
+            self._set_with_engine(key, value)
+        except KeyError:
+            # We have a scalar (or for MultiIndex or object-dtype, scalar-like)
+            #  key that is not present in self.index.
+            # GH#12862 adding a new key to the Series
+            self.loc[key] = value
+
+        except (TypeError, ValueError, LossySetitemError):
+            # The key was OK, but we cannot set the value losslessly
+            indexer = self.index.get_loc(key)
+            self._set_values(indexer, value)
+
+        except InvalidIndexError as err:
+            if isinstance(key, tuple) and not isinstance(self.index, MultiIndex):
+                # cases with MultiIndex don't get here bc they raise KeyError
+                # e.g. test_basic_getitem_setitem_corner
+                raise KeyError(
+                    "key of type tuple not found and not a MultiIndex"
+                ) from err
+
+            if com.is_bool_indexer(key):
+                key = check_bool_indexer(self.index, key)
+                key = np.asarray(key, dtype=bool)
+
+                if (
+                    is_list_like(value)
+                    and len(value) != len(self)
+                    and not isinstance(value, Series)
+                    and not is_object_dtype(self.dtype)
+                ):
+                    # Series will be reindexed to have matching length inside
+                    #  _where call below
+                    # GH#44265
+                    indexer = key.nonzero()[0]
+                    self._set_values(indexer, value)
+                    return
+
+                # otherwise with listlike other we interpret series[mask] = other
+                #  as series[mask] = other[mask]
+                try:
+                    self._where(~key, value, inplace=True)
+                except InvalidIndexError:
+                    # test_where_dups
+                    self.iloc[key] = value
+                return
+
+            else:
+                self._set_with(key, value)
+
+    def _set_with_engine(self, key, value) -> None:
+        loc = self.index.get_loc(key)
+
+        # this is equivalent to self._values[key] = value
+        self._mgr.setitem_inplace(loc, value)
+
+    def _set_with(self, key, value) -> None:
+        # We got here via exception-handling off of InvalidIndexError, so
+        #  key should always be listlike at this point.
+        assert not isinstance(key, tuple)
+
+        if is_iterator(key):
+            # Without this, the call to infer_dtype will consume the generator
+            key = list(key)
+
+        self._set_labels(key, value)
+
+    def _set_labels(self, key, value) -> None:
+        key = com.asarray_tuplesafe(key)
+        indexer: np.ndarray = self.index.get_indexer(key)
+        mask = indexer == -1
+        if mask.any():
+            raise KeyError(f"{key[mask]} not in index")
+        self._set_values(indexer, value)
+
+    def _set_values(self, key, value) -> None:
+        if isinstance(key, (Index, Series)):
+            key = key._values
+
+        self._mgr = self._mgr.setitem(indexer=key, value=value)
+
+    def _set_value(self, label, value, takeable: bool = False) -> None:
+        """
+        Quickly set single value at passed label.
+
+        If label is not contained, a new object is created with the label
+        placed at the end of the result index.
+
+        Parameters
+        ----------
+        label : object
+            Partial indexing with MultiIndex not allowed.
+        value : object
+            Scalar value.
+        takeable : interpret the index as indexers, default False
+        """
+        if not takeable:
+            try:
+                loc = self.index.get_loc(label)
+            except KeyError:
+                # set using a non-recursive method
+                self.loc[label] = value
+                return
+        else:
+            loc = label
+
+        self._set_values(loc, value)
+
+    # ----------------------------------------------------------------------
+    # Unsorted
+
+    def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series:
+        """
+        Repeat elements of a Series.
+
+        Returns a new Series where each element of the current Series
+        is repeated consecutively a given number of times.
+
+        Parameters
+        ----------
+        repeats : int or array of ints
+            The number of repetitions for each element. This should be a
+            non-negative integer. Repeating 0 times will return an empty
+            Series.
+        axis : None
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            Newly created Series with repeated elements.
+
+        See Also
+        --------
+        Index.repeat : Equivalent function for Index.
+        numpy.repeat : Similar method for :class:`numpy.ndarray`.
+
+        Examples
+        --------
+        >>> s = pd.Series(["a", "b", "c"])
+        >>> s
+        0    a
+        1    b
+        2    c
+        dtype: str
+        >>> s.repeat(2)
+        0    a
+        0    a
+        1    b
+        1    b
+        2    c
+        2    c
+        dtype: str
+        >>> s.repeat([1, 2, 3])
+        0    a
+        1    b
+        1    b
+        2    c
+        2    c
+        2    c
+        dtype: str
+        """
+        nv.validate_repeat((), {"axis": axis})
+        new_index = self.index.repeat(repeats)
+        new_values = self._values.repeat(repeats)
+        return self._constructor(new_values, index=new_index, copy=False).__finalize__(
+            self, method="repeat"
+        )
+
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: Literal[False] = ...,
+        name: Level = ...,
+        inplace: Literal[False] = ...,
+        allow_duplicates: bool = ...,
+    ) -> DataFrame: ...
+
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: Literal[True],
+        name: Level = ...,
+        inplace: Literal[False] = ...,
+        allow_duplicates: bool = ...,
+    ) -> Series: ...
+
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        name: Level = ...,
+        inplace: Literal[True],
+        allow_duplicates: bool = ...,
+    ) -> None: ...
+
+    def reset_index(
+        self,
+        level: IndexLabel | None = None,
+        *,
+        drop: bool = False,
+        name: Level = lib.no_default,
+        inplace: bool = False,
+        allow_duplicates: bool = False,
+    ) -> DataFrame | Series | None:
+        """
+        Generate a new DataFrame or Series with the index reset.
+
+        This is useful when the index needs to be treated as a column, or
+        when the index is meaningless and needs to be reset to the default
+        before another operation.
+
+        Parameters
+        ----------
+        level : int, str, tuple, or list, default optional
+            For a Series with a MultiIndex, only remove the specified levels
+            from the index. Removes all levels by default.
+        drop : bool, default False
+            Just reset the index, without inserting it as a column in
+            the new DataFrame.
+        name : object, optional
+            The name to use for the column containing the original Series
+            values. Uses ``self.name`` by default. This argument is ignored
+            when `drop` is True.
+        inplace : bool, default False
+            Modify the Series in place (do not create a new object).
+        allow_duplicates : bool, default False
+            Allow duplicate column labels to be created.
+
+        Returns
+        -------
+        Series or DataFrame or None
+            When `drop` is False (the default), a DataFrame is returned.
+            The newly created columns will come first in the DataFrame,
+            followed by the original Series values.
+            When `drop` is True, a `Series` is returned.
+            In either case, if ``inplace=True``, no value is returned.
+
+        See Also
+        --------
+        DataFrame.reset_index: Analogous function for DataFrame.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     name="foo",
+        ...     index=pd.Index(["a", "b", "c", "d"], name="idx"),
+        ... )
+
+        Generate a DataFrame with default index.
+
+        >>> s.reset_index()
+          idx  foo
+        0   a    1
+        1   b    2
+        2   c    3
+        3   d    4
+
+        To specify the name of the new column use `name`.
+
+        >>> s.reset_index(name="values")
+          idx  values
+        0   a       1
+        1   b       2
+        2   c       3
+        3   d       4
+
+        To generate a new Series with the default set `drop` to True.
+
+        >>> s.reset_index(drop=True)
+        0    1
+        1    2
+        2    3
+        3    4
+        Name: foo, dtype: int64
+
+        The `level` parameter is interesting for Series with a multi-level
+        index.
+
+        >>> arrays = [
+        ...     np.array(["bar", "bar", "baz", "baz"]),
+        ...     np.array(["one", "two", "one", "two"]),
+        ... ]
+        >>> s2 = pd.Series(
+        ...     range(4),
+        ...     name="foo",
+        ...     index=pd.MultiIndex.from_arrays(arrays, names=["a", "b"]),
+        ... )
+
+        To remove a specific level from the Index, use `level`.
+
+        >>> s2.reset_index(level="a")
+               a  foo
+        b
+        one  bar    0
+        two  bar    1
+        one  baz    2
+        two  baz    3
+
+        If `level` is not set, all levels are removed from the Index.
+
+        >>> s2.reset_index()
+             a    b  foo
+        0  bar  one    0
+        1  bar  two    1
+        2  baz  one    2
+        3  baz  two    3
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        if drop:
+            new_index = default_index(len(self))
+            if level is not None:
+                level_list: Sequence[Hashable]
+                if not isinstance(level, (tuple, list)):
+                    level_list = [level]
+                else:
+                    level_list = level
+                level_list = [self.index._get_level_number(lev) for lev in level_list]
+                if len(level_list) < self.index.nlevels:
+                    new_index = self.index.droplevel(level_list)
+
+            if inplace:
+                self.index = new_index
+            else:
+                new_ser = self.copy(deep=False)
+                new_ser.index = new_index
+                return new_ser.__finalize__(self, method="reset_index")
+        elif inplace:
+            raise TypeError(
+                "Cannot reset_index inplace on a Series to create a DataFrame"
+            )
+        else:
+            if name is lib.no_default:
+                # For backwards compatibility, keep columns as [0] instead of
+                #  [None] when self.name is None
+                if self.name is None:
+                    name = 0
+                else:
+                    name = self.name
+
+            df = self.to_frame(name)
+            return df.reset_index(
+                level=level, drop=drop, allow_duplicates=allow_duplicates
+            )
+        return None
+
+    # ----------------------------------------------------------------------
+    # Rendering Methods
+
+    def __repr__(self) -> str:
+        """
+        Return a string representation for a particular Series.
+        """
+        repr_params = fmt.get_series_repr_params()
+        return self.to_string(**repr_params)
+
+    @overload
+    def to_string(
+        self,
+        buf: None = ...,
+        *,
+        na_rep: str = ...,
+        float_format: str | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        length: bool = ...,
+        dtype=...,
+        name=...,
+        max_rows: int | None = ...,
+        min_rows: int | None = ...,
+    ) -> str: ...
+
+    @overload
+    def to_string(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        na_rep: str = ...,
+        float_format: str | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        length: bool = ...,
+        dtype=...,
+        name=...,
+        max_rows: int | None = ...,
+        min_rows: int | None = ...,
+    ) -> None: ...
+
+    @deprecate_nonkeyword_arguments(
+        Pandas4Warning, allowed_args=["self", "buf"], name="to_string"
+    )
+    def to_string(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        na_rep: str = "NaN",
+        float_format: str | None = None,
+        header: bool = True,
+        index: bool = True,
+        length: bool = False,
+        dtype: bool = False,
+        name: bool = False,
+        max_rows: int | None = None,
+        min_rows: int | None = None,
+    ) -> str | None:
+        """
+        Render a string representation of the Series.
+
+        Parameters
+        ----------
+        buf : StringIO-like, optional
+            Buffer to write to.
+        na_rep : str, optional
+            String representation of NaN to use, default 'NaN'.
+        float_format : one-parameter function, optional
+            Formatter function to apply to columns' elements if they are
+            floats, default None.
+        header : bool, default True
+            Add the Series header (index name).
+        index : bool, optional
+            Add index (row) labels, default True.
+        length : bool, default False
+            Add the Series length.
+        dtype : bool, default False
+            Add the Series dtype.
+        name : bool, default False
+            Add the Series name if not None.
+        max_rows : int, optional
+            Maximum number of rows to show before truncating. If None, show
+            all.
+        min_rows : int, optional
+            The number of rows to display in a truncated repr (when number
+            of rows is above `max_rows`).
+
+        Returns
+        -------
+        str or None
+            String representation of Series if ``buf=None``, otherwise None.
+
+        See Also
+        --------
+        Series.to_dict : Convert Series to dict object.
+        Series.to_frame : Convert Series to DataFrame object.
+        Series.to_markdown : Print Series in Markdown-friendly format.
+        Series.to_timestamp : Cast to DatetimeIndex of Timestamps.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1, 2, 3]).to_string()
+        >>> ser
+        '0    1\\n1    2\\n2    3'
+        """
+        formatter = fmt.SeriesFormatter(
+            self,
+            name=name,
+            length=length,
+            header=header,
+            index=index,
+            dtype=dtype,
+            na_rep=na_rep,
+            float_format=float_format,
+            min_rows=min_rows,
+            max_rows=max_rows,
+        )
+        result = formatter.to_string()
+
+        # catch contract violations
+        if not isinstance(result, str):
+            raise AssertionError(
+                "result must be of type str, type "
+                f"of result is {type(result).__name__!r}"
+            )
+
+        if buf is None:
+            return result
+        elif hasattr(buf, "write"):
+            buf.write(result)
+        else:
+            with open(buf, "w", encoding="utf-8") as f:
+                f.write(result)
+        return None
+
+    @overload
+    def to_markdown(
+        self,
+        buf: None = ...,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: IO[str],
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> None: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: IO[str] | None,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str | None: ...
+
+    @deprecate_nonkeyword_arguments(
+        Pandas4Warning, allowed_args=["self", "buf"], name="to_markdown"
+    )
+    def to_markdown(
+        self,
+        buf: IO[str] | None = None,
+        mode: str = "wt",
+        index: bool = True,
+        storage_options: StorageOptions | None = None,
+        **kwargs,
+    ) -> str | None:
+        """
+        Print Series in Markdown-friendly format.
+
+        Parameters
+        ----------
+        buf : str, Path or StringIO-like, optional, default None
+            Buffer to write to. If None, the output is returned as a string.
+        mode : str, optional
+            Mode in which file is opened, "wt" by default.
+        index : bool, optional, default True
+            Add index (row) labels.
+
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+            are forwarded to ``urllib.request.Request`` as header options. For other
+            URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+            forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+            details, and for more examples on storage options refer `here
+            <https://pandas.pydata.org/docs/user_guide/io.html?
+            highlight=storage_options#reading-writing-remote-files>`_.
+
+        **kwargs
+            These parameters will be passed to `tabulate \
+                <https://pypi.org/project/tabulate>`_.
+
+        Returns
+        -------
+        str
+            Series in Markdown-friendly format.
+
+        See Also
+        --------
+        Series.to_frame : Rrite a text representation of object to the system clipboard.
+        Series.to_latex : Render Series to LaTeX-formatted table.
+
+        Notes
+        -----
+        Requires the `tabulate <https://pypi.org/project/tabulate>`_ package.
+
+        Examples
+            --------
+            >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal")
+            >>> print(s.to_markdown())
+            |    | animal   |
+            |---:|:---------|
+            |  0 | elk      |
+            |  1 | pig      |
+            |  2 | dog      |
+            |  3 | quetzal  |
+
+            Output markdown with a tabulate option.
+
+            >>> print(s.to_markdown(tablefmt="grid"))
+            +----+----------+
+            |    | animal   |
+            +====+==========+
+            |  0 | elk      |
+            +----+----------+
+            |  1 | pig      |
+            +----+----------+
+            |  2 | dog      |
+            +----+----------+
+            |  3 | quetzal  |
+            +----+----------+
+        """
+        return self.to_frame().to_markdown(
+            buf, mode=mode, index=index, storage_options=storage_options, **kwargs
+        )
+
+    # ----------------------------------------------------------------------
+
+    def items(self) -> Iterable[tuple[Hashable, Any]]:
+        """
+        Lazily iterate over (index, value) tuples.
+
+        This method returns an iterable tuple (index, value). This is
+        convenient if you want to create a lazy iterator.
+
+        Returns
+        -------
+        iterable
+            Iterable of tuples containing the (index, value) pairs from a
+            Series.
+
+        See Also
+        --------
+        DataFrame.items : Iterate over (column name, Series) pairs.
+        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) pairs.
+
+        Examples
+        --------
+        >>> s = pd.Series(["A", "B", "C"])
+        >>> for index, value in s.items():
+        ...     print(f"Index : {index}, Value : {value}")
+        Index : 0, Value : A
+        Index : 1, Value : B
+        Index : 2, Value : C
+        """
+        return zip(iter(self.index), iter(self), strict=True)
+
+    # ----------------------------------------------------------------------
+    # Misc public methods
+
+    def keys(self) -> Index:
+        """
+        Return alias for index.
+
+        Returns
+        -------
+        Index
+            Index of the Series.
+
+        See Also
+        --------
+        Series.index : The index (axis labels) of the Series.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3], index=[0, 1, 2])
+        >>> s.keys()
+        Index([0, 1, 2], dtype='int64')
+        """
+        return self.index
+
+    @overload
+    def to_dict(
+        self, *, into: type[MutableMappingT] | MutableMappingT
+    ) -> MutableMappingT: ...
+
+    @overload
+    def to_dict(self, *, into: type[dict] = ...) -> dict: ...
+
+    # error: Incompatible default for argument "into" (default has type "type[
+    # dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
+    def to_dict(
+        self,
+        *,
+        into: type[MutableMappingT] | MutableMappingT = dict,  # type: ignore[assignment]
+    ) -> MutableMappingT:
+        """
+        Convert Series to {label -> value} dict or dict-like object.
+
+        Parameters
+        ----------
+        into : class, default dict
+            The collections.abc.MutableMapping subclass to use as the return
+            object. Can be the actual class or an empty instance of the mapping
+            type you want.  If you want a collections.defaultdict, you must
+            pass it initialized.
+
+        Returns
+        -------
+        collections.abc.MutableMapping
+            Key-value representation of Series.
+
+        See Also
+        --------
+        Series.to_list: Converts Series to a list of the values.
+        Series.to_numpy: Converts Series to NumPy ndarray.
+        Series.array: ExtensionArray of the data backing this Series.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s.to_dict()
+        {0: 1, 1: 2, 2: 3, 3: 4}
+        >>> from collections import OrderedDict, defaultdict
+        >>> s.to_dict(into=OrderedDict)
+        OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
+        >>> dd = defaultdict(list)
+        >>> s.to_dict(into=dd)
+        defaultdict(<class 'list'>, {0: 1, 1: 2, 2: 3, 3: 4})
+        """
+        # GH16122
+        into_c = com.standardize_mapping(into)
+
+        if is_object_dtype(self.dtype) or isinstance(self.dtype, ExtensionDtype):
+            return into_c((k, maybe_box_native(v)) for k, v in self.items())
+        else:
+            # Not an object dtype => all types will be the same so let the default
+            # indexer return native python type
+            return into_c(self.items())
+
+    def to_frame(self, name: Hashable = lib.no_default) -> DataFrame:
+        """
+        Convert Series to DataFrame.
+
+        Parameters
+        ----------
+        name : object, optional
+            The passed name should substitute for the series name (if it has
+            one).
+
+        Returns
+        -------
+        DataFrame
+            DataFrame representation of Series.
+
+        See Also
+        --------
+        Series.to_dict : Convert Series to dict object.
+
+        Examples
+        --------
+        >>> s = pd.Series(["a", "b", "c"], name="vals")
+        >>> s.to_frame()
+          vals
+        0    a
+        1    b
+        2    c
+        """
+        columns: Index
+        if name is lib.no_default:
+            name = self.name
+            if name is None:
+                # default to [0], same as we would get with DataFrame(self)
+                columns = default_index(1)
+            else:
+                columns = Index([name])
+        else:
+            columns = Index([name])
+
+        mgr = self._mgr.to_2d_mgr(columns)
+        df = self._constructor_expanddim_from_mgr(mgr, axes=mgr.axes)
+        return df.__finalize__(self, method="to_frame")
+
+    @classmethod
+    def from_arrow(cls, data: ArrowArrayExportable | ArrowStreamExportable) -> Series:
+        """
+        Construct a Series from an array-like Arrow object.
+
+        This function accepts any Arrow-compatible array-like object implementing
+        the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
+        or ``__arrow_c_stream__`` method).
+
+        This function currently relies on ``pyarrow`` to convert the object
+        in Arrow format to pandas.
+
+        .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
+
+        .. versionadded:: 3.0
+
+        Parameters
+        ----------
+        data : pyarrow.Array or Arrow-compatible object
+            Any array-like object implementing the Arrow PyCapsule Protocol
+            (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
+            method).
+
+        Returns
+        -------
+        Series
+
+        See Also
+        --------
+        DataFrame.from_arrow : Construct a DataFrame from an Arrow object.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> arrow_array = pa.array([1, 2, 3])
+        >>> pd.Series.from_arrow(arrow_array)
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if not isinstance(data, (pa.Array, pa.ChunkedArray)):
+            if not (
+                hasattr(data, "__arrow_c_array__")
+                or hasattr(data, "__arrow_c_stream__")
+            ):
+                # explicitly test this, because otherwise we would accept variour other
+                # input types through the pa.chunked_array(..) call
+                raise TypeError(
+                    "Expected an Arrow-compatible array-like object (i.e. having an "
+                    "'_arrow_c_array__' or '__arrow_c_stream__' method), got "
+                    f"'{type(data).__name__}' instead."
+                )
+            # using chunked_array() as it works for both arrays and streams
+            pa_array = pa.chunked_array(data)
+        else:
+            pa_array = data
+
+        ser = pa_array.to_pandas()
+        return ser
+
+    def _set_name(self, name, inplace: bool = False) -> Series:
+        """
+        Set the Series name.
+
+        Parameters
+        ----------
+        name : str
+        inplace : bool
+            Whether to modify `self` directly or return a copy.
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        ser = self if inplace else self.copy(deep=False)
+        ser.name = name
+        return ser
+
+    @Appender(
+        dedent(
+            """
+        Examples
+        --------
+        >>> ser = pd.Series([390., 350., 30., 20.],
+        ...                 index=['Falcon', 'Falcon', 'Parrot', 'Parrot'],
+        ...                 name="Max Speed")
+        >>> ser
+        Falcon    390.0
+        Falcon    350.0
+        Parrot     30.0
+        Parrot     20.0
+        Name: Max Speed, dtype: float64
+
+        We can pass a list of values to group the Series data by custom labels:
+
+        >>> ser.groupby(["a", "b", "a", "b"]).mean()
+        a    210.0
+        b    185.0
+        Name: Max Speed, dtype: float64
+
+        Grouping by numeric labels yields similar results:
+
+        >>> ser.groupby([0, 1, 0, 1]).mean()
+        0    210.0
+        1    185.0
+        Name: Max Speed, dtype: float64
+
+        We can group by a level of the index:
+
+        >>> ser.groupby(level=0).mean()
+        Falcon    370.0
+        Parrot     25.0
+        Name: Max Speed, dtype: float64
+
+        We can group by a condition applied to the Series values:
+
+        >>> ser.groupby(ser > 100).mean()
+        Max Speed
+        False     25.0
+        True     370.0
+        Name: Max Speed, dtype: float64
+
+        **Grouping by Indexes**
+
+        We can groupby different levels of a hierarchical index
+        using the `level` parameter:
+
+        >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
+        ...           ['Captive', 'Wild', 'Captive', 'Wild']]
+        >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
+        >>> ser = pd.Series([390., 350., 30., 20.], index=index, name="Max Speed")
+        >>> ser
+        Animal  Type
+        Falcon  Captive    390.0
+                Wild       350.0
+        Parrot  Captive     30.0
+                Wild        20.0
+        Name: Max Speed, dtype: float64
+
+        >>> ser.groupby(level=0).mean()
+        Animal
+        Falcon    370.0
+        Parrot     25.0
+        Name: Max Speed, dtype: float64
+
+        We can also group by the 'Type' level of the hierarchical index
+        to get the mean speed for each type:
+
+        >>> ser.groupby(level="Type").mean()
+        Type
+        Captive    210.0
+        Wild       185.0
+        Name: Max Speed, dtype: float64
+
+        We can also choose to include `NA` in group keys or not by defining
+        `dropna` parameter, the default setting is `True`.
+
+        >>> ser = pd.Series([1, 2, 3, 3], index=["a", 'a', 'b', np.nan])
+        >>> ser.groupby(level=0).sum()
+        a    3
+        b    3
+        dtype: int64
+
+        To include `NA` values in the group keys, set `dropna=False`:
+
+        >>> ser.groupby(level=0, dropna=False).sum()
+        a    3
+        b    3
+        NaN  3
+        dtype: int64
+
+        We can also group by a custom list with NaN values to handle
+        missing group labels:
+
+        >>> arrays = ['Falcon', 'Falcon', 'Parrot', 'Parrot']
+        >>> ser = pd.Series([390., 350., 30., 20.], index=arrays, name="Max Speed")
+        >>> ser.groupby(["a", "b", "a", np.nan]).mean()
+        a    210.0
+        b    350.0
+        Name: Max Speed, dtype: float64
+
+        >>> ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()
+        a    210.0
+        b    350.0
+        NaN   20.0
+        Name: Max Speed, dtype: float64
+        """
+        )
+    )
+    @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
+    @deprecate_nonkeyword_arguments(
+        Pandas4Warning, allowed_args=["self", "by", "level"], name="groupby"
+    )
+    def groupby(
+        self,
+        by=None,
+        level: IndexLabel | None = None,
+        as_index: bool = True,
+        sort: bool = True,
+        group_keys: bool = True,
+        observed: bool = True,
+        dropna: bool = True,
+    ) -> SeriesGroupBy:
+        from pandas.core.groupby.generic import SeriesGroupBy
+
+        if level is None and by is None:
+            raise TypeError("You have to supply one of 'by' and 'level'")
+        if not as_index:
+            raise TypeError("as_index=False only valid with DataFrame")
+
+        return SeriesGroupBy(
+            obj=self,
+            keys=by,
+            level=level,
+            as_index=as_index,
+            sort=sort,
+            group_keys=group_keys,
+            observed=observed,
+            dropna=dropna,
+        )
+
+    # ----------------------------------------------------------------------
+    # Statistics, overridden ndarray methods
+
+    # TODO: integrate bottleneck
+    def count(self) -> int:
+        """
+        Return number of non-NA/null observations in the Series.
+
+        Returns
+        -------
+        int
+            Number of non-null values in the Series.
+
+        See Also
+        --------
+        DataFrame.count : Count non-NA cells for each column or row.
+
+        Examples
+        --------
+        >>> s = pd.Series([0.0, 1.0, np.nan])
+        >>> s.count()
+        2
+        """
+        return maybe_unbox_numpy_scalar(notna(self._values).sum().astype("int64"))
+
+    def mode(self, dropna: bool = True) -> Series:
+        """
+        Return the mode(s) of the Series.
+
+        The mode is the value that appears most often. There can be multiple modes.
+
+        Always returns Series even if only one value is returned.
+
+        Parameters
+        ----------
+        dropna : bool, default True
+            Don't consider counts of NaN/NaT.
+
+        Returns
+        -------
+        Series
+            Modes of the Series in sorted order.
+
+        See Also
+        --------
+        numpy.mode : Equivalent numpy function for computing median.
+        Series.sum : Sum of the values.
+        Series.median : Median of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> s = pd.Series([2, 4, 2, 2, 4, None])
+        >>> s.mode()
+        0    2.0
+        dtype: float64
+
+        More than one mode:
+
+        >>> s = pd.Series([2, 4, 8, 2, 4, None])
+        >>> s.mode()
+        0    2.0
+        1    4.0
+        dtype: float64
+
+        With and without considering null value:
+
+        >>> s = pd.Series([2, 4, None, None, 4, None])
+        >>> s.mode(dropna=False)
+        0   NaN
+        dtype: float64
+        >>> s = pd.Series([2, 4, None, None, 4, None])
+        >>> s.mode()
+        0    4.0
+        dtype: float64
+        """
+        # TODO: Add option for bins like value_counts()
+        values = self._values
+        if isinstance(values, np.ndarray):
+            res_values, _ = algorithms.mode(values, dropna=dropna)
+        else:
+            res_values = values._mode(dropna=dropna)
+
+        # Ensure index is type stable (should always use int index)
+        return self._constructor(
+            res_values,
+            index=range(len(res_values)),
+            name=self.name,
+            copy=False,
+            dtype=self.dtype,
+        ).__finalize__(self, method="mode")
+
+    def unique(self) -> ArrayLike:
+        """
+        Return unique values of Series object.
+
+        Uniques are returned in order of appearance. Hash table-based unique,
+        therefore does NOT sort.
+
+        Returns
+        -------
+        ndarray or ExtensionArray
+            The unique values returned as a NumPy array. See Notes.
+
+        See Also
+        --------
+        Series.drop_duplicates : Return Series with duplicate values removed.
+        unique : Top-level unique method for any 1-d array-like object.
+        Index.unique : Return Index with unique values from an Index object.
+
+        Notes
+        -----
+        Returns the unique values as a NumPy array. In case of an
+        extension-array backed Series, a new
+        :class:`~api.extensions.ExtensionArray` of that type with just
+        the unique values is returned. This includes
+
+            * Categorical
+            * Period
+            * Datetime with Timezone
+            * Datetime without Timezone
+            * Timedelta
+            * Interval
+            * Sparse
+            * IntegerNA
+
+        See Examples section.
+
+        Examples
+        --------
+        >>> pd.Series([2, 1, 3, 3], name="A").unique()
+        array([2, 1, 3])
+
+        >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique()
+        <DatetimeArray>
+        ['2016-01-01 00:00:00']
+        Length: 1, dtype: datetime64[us]
+
+        >>> pd.Series(
+        ...     [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)]
+        ... ).unique()
+        <DatetimeArray>
+        ['2016-01-01 00:00:00-05:00']
+        Length: 1, dtype: datetime64[us, US/Eastern]
+
+        A Categorical will return categories in the order of
+        appearance and with the same dtype.
+
+        >>> pd.Series(pd.Categorical(list("baabc"))).unique()
+        ['b', 'a', 'c']
+        Categories (3, str): ['a', 'b', 'c']
+        >>> pd.Series(
+        ...     pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
+        ... ).unique()
+        ['b', 'a', 'c']
+        Categories (3, str): ['a' < 'b' < 'c']
+        """
+        return super().unique()
+
+    @overload
+    def drop_duplicates(
+        self,
+        *,
+        keep: DropKeep = ...,
+        inplace: Literal[False] = ...,
+        ignore_index: bool = ...,
+    ) -> Series: ...
+
+    @overload
+    def drop_duplicates(
+        self, *, keep: DropKeep = ..., inplace: Literal[True], ignore_index: bool = ...
+    ) -> None: ...
+
+    @overload
+    def drop_duplicates(
+        self, *, keep: DropKeep = ..., inplace: bool = ..., ignore_index: bool = ...
+    ) -> Series | None: ...
+
+    def drop_duplicates(
+        self,
+        *,
+        keep: DropKeep = "first",
+        inplace: bool = False,
+        ignore_index: bool = False,
+    ) -> Series | None:
+        """
+        Return Series with duplicate values removed.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', ``False``}, default 'first'
+            Method to handle dropping duplicates:
+
+            - 'first' : Drop duplicates except for the first occurrence.
+            - 'last' : Drop duplicates except for the last occurrence.
+            - ``False`` : Drop all duplicates.
+
+        inplace : bool, default ``False``
+            If ``True``, performs operation inplace and returns None.
+
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        Series or None
+            Series with duplicates dropped or None if ``inplace=True``.
+
+        See Also
+        --------
+        Index.drop_duplicates : Equivalent method on Index.
+        DataFrame.drop_duplicates : Equivalent method on DataFrame.
+        Series.duplicated : Related method on Series, indicating duplicate
+            Series values.
+        Series.unique : Return unique values as an array.
+
+        Examples
+        --------
+        Generate a Series with duplicated entries.
+
+        >>> s = pd.Series(
+        ...     ["llama", "cow", "llama", "beetle", "llama", "hippo"], name="animal"
+        ... )
+        >>> s
+        0     llama
+        1       cow
+        2     llama
+        3    beetle
+        4     llama
+        5     hippo
+        Name: animal, dtype: str
+
+        With the 'keep' parameter, the selection behavior of duplicated values
+        can be changed. The value 'first' keeps the first occurrence for each
+        set of duplicated entries. The default value of keep is 'first'.
+
+        >>> s.drop_duplicates()
+        0     llama
+        1       cow
+        3    beetle
+        5     hippo
+        Name: animal, dtype: str
+
+        The value 'last' for parameter 'keep' keeps the last occurrence for
+        each set of duplicated entries.
+
+        >>> s.drop_duplicates(keep="last")
+        1       cow
+        3    beetle
+        4     llama
+        5     hippo
+        Name: animal, dtype: str
+
+        The value ``False`` for parameter 'keep' discards all sets of
+        duplicated entries.
+
+        >>> s.drop_duplicates(keep=False)
+        1       cow
+        3    beetle
+        5     hippo
+        Name: animal, dtype: str
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        result = super().drop_duplicates(keep=keep)
+
+        if ignore_index:
+            result.index = default_index(len(result))
+
+        if inplace:
+            self._update_inplace(result)
+            return None
+        else:
+            return result
+
+    def duplicated(self, keep: DropKeep = "first") -> Series:
+        """
+        Indicate duplicate Series values.
+
+        Duplicated values are indicated as ``True`` values in the resulting
+        Series. Either all duplicates, all except the first or all except the
+        last occurrence of duplicates can be indicated.
+
+        Parameters
+        ----------
+        keep : {'first', 'last', False}, default 'first'
+            Method to handle dropping duplicates:
+
+            - 'first' : Mark duplicates as ``True`` except for the first
+              occurrence.
+            - 'last' : Mark duplicates as ``True`` except for the last
+              occurrence.
+            - ``False`` : Mark all duplicates as ``True``.
+
+        Returns
+        -------
+        Series[bool]
+            Series indicating whether each value has occurred in the
+            preceding values.
+
+        See Also
+        --------
+        Index.duplicated : Equivalent method on pandas.Index.
+        DataFrame.duplicated : Equivalent method on pandas.DataFrame.
+        Series.drop_duplicates : Remove duplicate values from Series.
+
+        Examples
+        --------
+        By default, for each set of duplicated values, the first occurrence is
+        set on False and all others on True:
+
+        >>> animals = pd.Series(["llama", "cow", "llama", "beetle", "llama"])
+        >>> animals.duplicated()
+        0    False
+        1    False
+        2     True
+        3    False
+        4     True
+        dtype: bool
+
+        which is equivalent to
+
+        >>> animals.duplicated(keep="first")
+        0    False
+        1    False
+        2     True
+        3    False
+        4     True
+        dtype: bool
+
+        By using 'last', the last occurrence of each set of duplicated values
+        is set on False and all others on True:
+
+        >>> animals.duplicated(keep="last")
+        0     True
+        1    False
+        2     True
+        3    False
+        4    False
+        dtype: bool
+
+        By setting keep on ``False``, all duplicates are True:
+
+        >>> animals.duplicated(keep=False)
+        0     True
+        1    False
+        2     True
+        3    False
+        4     True
+        dtype: bool
+        """
+        res = self._duplicated(keep=keep)
+        result = self._constructor(res, index=self.index, copy=False)
+        return result.__finalize__(self, method="duplicated")
+
+    def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable:
+        """
+        Return the row label of the minimum value.
+
+        If multiple values equal the minimum, the first row label with that
+        value is returned.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire Series is NA, or if ``skipna=False``
+            and there is an NA value, this method will raise a ``ValueError``.
+        *args, **kwargs
+            Additional arguments and keywords have no effect but might be
+            accepted for compatibility with NumPy.
+
+        Returns
+        -------
+        Index
+            Label of the minimum value.
+
+        Raises
+        ------
+        ValueError
+            If the Series is empty.
+
+        See Also
+        --------
+        numpy.argmin : Return indices of the minimum values
+            along the given axis.
+        DataFrame.idxmin : Return index of first occurrence of minimum
+            over requested axis.
+        Series.idxmax : Return index *label* of the first occurrence
+            of maximum of values.
+
+        Notes
+        -----
+        This method is the Series version of ``ndarray.argmin``. This method
+        returns the label of the minimum, while ``ndarray.argmin`` returns
+        the position. To get the position, use ``series.values.argmin()``.
+
+        Examples
+        --------
+        >>> s = pd.Series(data=[1, None, 4, 1], index=["A", "B", "C", "D"])
+        >>> s
+        A    1.0
+        B    NaN
+        C    4.0
+        D    1.0
+        dtype: float64
+
+        >>> s.idxmin()
+        'A'
+        """
+        axis = self._get_axis_number(axis)
+        iloc = self.argmin(axis, skipna, *args, **kwargs)
+        return self.index[iloc]
+
+    def idxmax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable:
+        """
+        Return the row label of the maximum value.
+
+        If multiple values equal the maximum, the first row label with that
+        value is returned.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire Series is NA, or if ``skipna=False``
+            and there is an NA value, this method will raise a ``ValueError``.
+        *args, **kwargs
+            Additional arguments and keywords have no effect but might be
+            accepted for compatibility with NumPy.
+
+        Returns
+        -------
+        Index
+            Label of the maximum value.
+
+        Raises
+        ------
+        ValueError
+            If the Series is empty.
+
+        See Also
+        --------
+        numpy.argmax : Return indices of the maximum values
+            along the given axis.
+        DataFrame.idxmax : Return index of first occurrence of maximum
+            over requested axis.
+        Series.idxmin : Return index *label* of the first occurrence
+            of minimum of values.
+
+        Notes
+        -----
+        This method is the Series version of ``ndarray.argmax``. This method
+        returns the label of the maximum, while ``ndarray.argmax`` returns
+        the position. To get the position, use ``series.values.argmax()``.
+
+        Examples
+        --------
+        >>> s = pd.Series(data=[1, None, 4, 3, 4], index=["A", "B", "C", "D", "E"])
+        >>> s
+        A    1.0
+        B    NaN
+        C    4.0
+        D    3.0
+        E    4.0
+        dtype: float64
+
+        >>> s.idxmax()
+        'C'
+        """
+        axis = self._get_axis_number(axis)
+        iloc = self.argmax(axis, skipna, *args, **kwargs)
+        return self.index[iloc]
+
+    def round(self, decimals: int = 0, *args, **kwargs) -> Series:
+        """
+        Round each value in a Series to the given number of decimals.
+
+        Parameters
+        ----------
+        decimals : int, default 0
+            Number of decimal places to round to. If decimals is negative,
+            it specifies the number of positions to the left of the decimal point.
+        *args, **kwargs
+            Additional arguments and keywords have no effect but might be
+            accepted for compatibility with NumPy.
+
+        Returns
+        -------
+        Series
+            Rounded values of the Series.
+
+        See Also
+        --------
+        numpy.around : Round values of an np.array.
+        DataFrame.round : Round values of a DataFrame.
+        Series.dt.round : Round values of data to the specified freq.
+
+        Notes
+        -----
+        For values exactly halfway between rounded decimal values, pandas rounds
+        to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5
+        round to 2.0, etc.).
+
+        Examples
+        --------
+        >>> s = pd.Series([-0.5, 0.1, 2.5, 1.3, 2.7])
+        >>> s.round()
+        0   -0.0
+        1    0.0
+        2    2.0
+        3    1.0
+        4    3.0
+        dtype: float64
+        """
+
+        nv.validate_round(args, kwargs)
+
+        if len(self) == 0:
+            return self.copy()
+
+        if is_object_dtype(self.dtype):
+            values = self._values
+            result = lib.map_infer(values, lambda x: round(x, decimals), convert=False)
+            return self._constructor(result, index=self.index, copy=False).__finalize__(
+                self, method="round"
+            )
+        new_mgr = self._mgr.round(decimals=decimals)
+        return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
+            self, method="round"
+        )
+
+    @overload
+    def quantile(
+        self, q: float = ..., interpolation: QuantileInterpolation = ...
+    ) -> float: ...
+
+    @overload
+    def quantile(
+        self,
+        q: Sequence[float] | AnyArrayLike,
+        interpolation: QuantileInterpolation = ...,
+    ) -> Series: ...
+
+    @overload
+    def quantile(
+        self,
+        q: float | Sequence[float] | AnyArrayLike = ...,
+        interpolation: QuantileInterpolation = ...,
+    ) -> float | Series: ...
+
+    def quantile(
+        self,
+        q: float | Sequence[float] | AnyArrayLike = 0.5,
+        interpolation: QuantileInterpolation = "linear",
+    ) -> float | Series:
+        """
+        Return value at the given quantile.
+
+        Parameters
+        ----------
+        q : float or array-like, default 0.5 (50% quantile)
+            The quantile(s) to compute, which can lie in range: 0 <= q <= 1.
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            This optional parameter specifies the interpolation method to use,
+            when the desired quantile lies between two data points `i` and `j`:
+
+                * linear: `i + (j - i) * (x-i)/(j-i)`, where `(x-i)/(j-i)` is
+                  the fractional part of the index surrounded by `i > j`.
+                * lower: `i`.
+                * higher: `j`.
+                * nearest: `i` or `j` whichever is nearest.
+                * midpoint: (`i` + `j`) / 2.
+
+        Returns
+        -------
+        float or Series
+            If ``q`` is an array, a Series will be returned where the
+            index is ``q`` and the values are the quantiles, otherwise
+            a float will be returned.
+
+        See Also
+        --------
+        core.window.Rolling.quantile : Calculate the rolling quantile.
+        numpy.percentile : Returns the q-th percentile(s) of the array elements.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s.quantile(0.5)
+        2.5
+        >>> s.quantile([0.25, 0.5, 0.75])
+        0.25    1.75
+        0.50    2.50
+        0.75    3.25
+        dtype: float64
+        """
+        validate_percentile(q)
+
+        # We dispatch to DataFrame so that core.internals only has to worry
+        #  about 2D cases.
+        df = self.to_frame()
+
+        result = df.quantile(q=q, interpolation=interpolation, numeric_only=False)
+        if result.ndim == 2:
+            result = result.iloc[:, 0]
+
+        if is_list_like(q):
+            result.name = self.name
+            idx = Index(q, dtype=np.float64)
+            return self._constructor(result, index=idx, name=self.name)
+        else:
+            # scalar
+            return maybe_unbox_numpy_scalar(result.iloc[0])
+
+    def corr(
+        self,
+        other: Series,
+        method: CorrelationMethod = "pearson",
+        min_periods: int | None = None,
+    ) -> float:
+        """
+        Compute correlation with `other` Series, excluding missing values.
+
+        The two `Series` objects are not required to be the same length and will be
+        aligned internally before the correlation function is applied.
+
+        Parameters
+        ----------
+        other : Series
+            Series with which to compute the correlation.
+        method : {'pearson', 'kendall', 'spearman'} or callable
+            Method used to compute correlation:
+
+            - pearson : Standard correlation coefficient
+            - kendall : Kendall Tau correlation coefficient
+            - spearman : Spearman rank correlation
+            - callable: Callable with input two 1d ndarrays and returning a float.
+
+            .. warning::
+                Note that the returned matrix from corr will have 1 along the
+                diagonals and will be symmetric regardless of the callable's
+                behavior.
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result.
+
+        Returns
+        -------
+        float
+            Correlation with other.
+
+        See Also
+        --------
+        DataFrame.corr : Compute pairwise correlation between columns.
+        DataFrame.corrwith : Compute pairwise correlation with another
+            DataFrame or Series.
+
+        Notes
+        -----
+        Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
+
+        * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
+        * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
+        * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
+
+        Automatic data alignment: as with all pandas operations, automatic data alignment is performed for this method.
+        ``corr()`` automatically considers values with matching indices.
+
+        Examples
+        --------
+        >>> def histogram_intersection(a, b):
+        ...     v = np.minimum(a, b).sum().round(decimals=1)
+        ...     return v
+        >>> s1 = pd.Series([0.2, 0.0, 0.6, 0.2])
+        >>> s2 = pd.Series([0.3, 0.6, 0.0, 0.1])
+        >>> s1.corr(s2, method=histogram_intersection)
+        0.3
+
+        Pandas auto-aligns the values with matching indices
+
+        >>> s1 = pd.Series([1, 2, 3], index=[0, 1, 2])
+        >>> s2 = pd.Series([1, 2, 3], index=[2, 1, 0])
+        >>> s1.corr(s2)
+        -1.0
+
+        If the input is a constant array, the correlation is not defined in this case,
+        and ``np.nan`` is returned.
+
+        >>> s1 = pd.Series([0.45, 0.45])
+        >>> s1.corr(s1)
+        nan
+        """  # noqa: E501
+        this, other = self.align(other, join="inner")
+        if len(this) == 0:
+            return np.nan
+
+        this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
+        other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)
+
+        if method in ["pearson", "spearman", "kendall"] or callable(method):
+            result = nanops.nancorr(
+                this_values, other_values, method=method, min_periods=min_periods
+            )
+            result = maybe_unbox_numpy_scalar(result)
+            return result
+
+        raise ValueError(
+            "method must be either 'pearson', "
+            "'spearman', 'kendall', or a callable, "
+            f"'{method}' was supplied"
+        )
+
+    def cov(
+        self,
+        other: Series,
+        min_periods: int | None = None,
+        ddof: int | None = 1,
+    ) -> float:
+        """
+        Compute covariance with Series, excluding missing values.
+
+        The two `Series` objects are not required to be the same length and
+        will be aligned internally before the covariance is calculated.
+
+        Parameters
+        ----------
+        other : Series
+            Series with which to compute the covariance.
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result.
+        ddof : int, default 1
+            Delta degrees of freedom.  The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.
+
+        Returns
+        -------
+        float
+            Covariance between Series and other normalized by N-1
+            (unbiased estimator).
+
+        See Also
+        --------
+        DataFrame.cov : Compute pairwise covariance of columns.
+
+        Examples
+        --------
+        >>> s1 = pd.Series([0.90010907, 0.13484424, 0.62036035])
+        >>> s2 = pd.Series([0.12528585, 0.26962463, 0.51111198])
+        >>> s1.cov(s2)
+        -0.01685762652715874
+        """
+        this, other = self.align(other, join="inner")
+        if len(this) == 0:
+            return np.nan
+        this_values = this.to_numpy(dtype=float, na_value=np.nan, copy=False)
+        other_values = other.to_numpy(dtype=float, na_value=np.nan, copy=False)
+        result = nanops.nancov(
+            this_values, other_values, min_periods=min_periods, ddof=ddof
+        )
+        result = maybe_unbox_numpy_scalar(result)
+        return result
+
+    def diff(self, periods: int = 1) -> Series:
+        """
+        First discrete difference of Series elements.
+
+        Calculates the difference of a Series element compared with another
+        element in the Series (default is element in previous row).
+
+        Parameters
+        ----------
+        periods : int, default 1
+            Periods to shift for calculating difference, accepts negative
+            values.
+
+        Returns
+        -------
+        Series
+            First differences of the Series.
+
+        See Also
+        --------
+        Series.pct_change: Percent change over given number of periods.
+        Series.shift: Shift index by desired number of periods with an
+            optional time freq.
+        DataFrame.diff: First discrete difference of object.
+
+        Notes
+        -----
+        For boolean dtypes, this uses :meth:`operator.xor` rather than
+        :meth:`operator.sub`.
+        The result is calculated according to current dtype in Series,
+        however dtype of the result is always float64.
+
+        Examples
+        --------
+
+        Difference with previous row
+
+        >>> s = pd.Series([1, 1, 2, 3, 5, 8])
+        >>> s.diff()
+        0    NaN
+        1    0.0
+        2    1.0
+        3    1.0
+        4    2.0
+        5    3.0
+        dtype: float64
+
+        Difference with 3rd previous row
+
+        >>> s.diff(periods=3)
+        0    NaN
+        1    NaN
+        2    NaN
+        3    2.0
+        4    4.0
+        5    6.0
+        dtype: float64
+
+        Difference with following row
+
+        >>> s.diff(periods=-1)
+        0    0.0
+        1   -1.0
+        2   -1.0
+        3   -2.0
+        4   -3.0
+        5    NaN
+        dtype: float64
+
+        Overflow in input dtype
+
+        >>> s = pd.Series([1, 0], dtype=np.uint8)
+        >>> s.diff()
+        0      NaN
+        1    255.0
+        dtype: float64
+        """
+        if not lib.is_integer(periods):
+            if not (is_float(periods) and periods.is_integer()):
+                raise ValueError("periods must be an integer")
+        result = algorithms.diff(self._values, periods)
+        return self._constructor(
+            result, index=self.index.view(), copy=False
+        ).__finalize__(self, method="diff")
+
+    def autocorr(self, lag: int = 1) -> float:
+        """
+        Compute the lag-N autocorrelation.
+
+        This method computes the Pearson correlation between
+        the Series and its shifted self.
+
+        Parameters
+        ----------
+        lag : int, default 1
+            Number of lags to apply before performing autocorrelation.
+
+        Returns
+        -------
+        float
+            The Pearson correlation between self and self.shift(lag).
+
+        See Also
+        --------
+        Series.corr : Compute the correlation between two Series.
+        Series.shift : Shift index by desired number of periods.
+        DataFrame.corr : Compute pairwise correlation of columns.
+        DataFrame.corrwith : Compute pairwise correlation between rows or
+            columns of two DataFrame objects.
+
+        Notes
+        -----
+        If the Pearson correlation is not well defined return 'NaN'.
+
+        Examples
+        --------
+        >>> s = pd.Series([0.25, 0.5, 0.2, -0.05])
+        >>> s.autocorr()  # doctest: +ELLIPSIS
+        0.10355...
+        >>> s.autocorr(lag=2)  # doctest: +ELLIPSIS
+        -0.99999...
+
+        If the Pearson correlation is not well defined, then 'NaN' is returned.
+
+        >>> s = pd.Series([1, 0, 0, 0])
+        >>> s.autocorr()
+        nan
+        """
+        return self.corr(cast(Series, self.shift(lag)))
+
+    def dot(self, other: AnyArrayLike | DataFrame) -> Series | np.ndarray:
+        """
+        Compute the dot product between the Series and the columns of other.
+
+        This method computes the dot product between the Series and another
+        one, or the Series and each columns of a DataFrame, or the Series and
+        each columns of an array.
+
+        It can also be called using `self @ other`.
+
+        Parameters
+        ----------
+        other : Series, DataFrame or array-like
+            The other object to compute the dot product with its columns.
+
+        Returns
+        -------
+        scalar, Series or numpy.ndarray
+            Return the dot product of the Series and other if other is a
+            Series, the Series of the dot product of Series and each rows of
+            other if other is a DataFrame or a numpy.ndarray between the Series
+            and each columns of the numpy array.
+
+        See Also
+        --------
+        DataFrame.dot: Compute the matrix product with the DataFrame.
+        Series.mul: Multiplication of series and other, element-wise.
+
+        Notes
+        -----
+        The Series and other has to share the same index if other is a Series
+        or a DataFrame.
+
+        Examples
+        --------
+        >>> s = pd.Series([0, 1, 2, 3])
+        >>> other = pd.Series([-1, 2, -3, 4])
+        >>> s.dot(other)
+        8
+        >>> s @ other
+        8
+        >>> df = pd.DataFrame([[0, 1], [-2, 3], [4, -5], [6, 7]])
+        >>> s.dot(df)
+        0    24
+        1    14
+        dtype: int64
+        >>> arr = np.array([[0, 1], [-2, 3], [4, -5], [6, 7]])
+        >>> s.dot(arr)
+        array([24, 14])
+        """
+        if isinstance(other, (Series, ABCDataFrame)):
+            common = self.index.union(other.index)
+            if len(common) > len(self.index) or len(common) > len(other.index):
+                raise ValueError("matrices are not aligned")
+
+            left = self.reindex(index=common)
+            right = other.reindex(index=common)
+            lvals = left.values
+            rvals = right.values
+        else:
+            lvals = self.values
+            rvals = np.asarray(other)
+            if lvals.shape[0] != rvals.shape[0]:
+                raise Exception(
+                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
+                )
+
+        if isinstance(other, ABCDataFrame):
+            common_type = find_common_type([self.dtypes, *list(other.dtypes)])
+            return self._constructor(
+                np.dot(lvals, rvals), index=other.columns, copy=False, dtype=common_type
+            ).__finalize__(self, method="dot")
+        elif isinstance(other, Series):
+            result = np.dot(lvals, rvals)
+        elif isinstance(rvals, np.ndarray):
+            result = np.dot(lvals, rvals)
+        else:  # pragma: no cover
+            raise TypeError(f"unsupported type: {type(other)}")
+        return maybe_unbox_numpy_scalar(result)
+
+    def __matmul__(self, other):
+        """
+        Matrix multiplication using binary `@` operator.
+        """
+        return self.dot(other)
+
+    def __rmatmul__(self, other):
+        """
+        Matrix multiplication using binary `@` operator.
+        """
+        return self.dot(np.transpose(other))
+
+    # Signature of "searchsorted" incompatible with supertype "IndexOpsMixin"
+    def searchsorted(  # type: ignore[override]
+        self,
+        value: NumpyValueArrayLike | ExtensionArray,
+        side: Literal["left", "right"] = "left",
+        sorter: NumpySorter | None = None,
+    ) -> npt.NDArray[np.intp] | np.intp:
+        """
+        Find indices where elements should be inserted to maintain order.
+
+        Find the indices into a sorted Series `self` such that, if the
+        corresponding elements in `value` were inserted before the indices,
+        the order of `self` would be preserved.
+
+        .. note::
+            The Series *must* be monotonically sorted, otherwise
+            wrong locations will likely be returned. Pandas does *not*
+            check this for you.
+
+        Parameters
+        ----------
+        value : array-like or scalar
+            Values to insert into `self`.
+        side : {'left', 'right'}, optional
+            If 'left', the index of the first suitable location found is given.
+            If 'right', return the last such index.  If there is no suitable
+            index, return either 0 or N (where N is the length of `self`).
+        sorter : 1-D array-like, optional
+            Optional array of integer indices that sort `self` into ascending
+            order. They are typically the result of ``np.argsort``.
+
+        Returns
+        -------
+        int or array of int
+            A scalar or array of insertion points with the
+            same shape as `value`.
+
+        See Also
+        --------
+        sort_values : Sort by the values along either axis.
+        numpy.searchsorted : Similar method from NumPy.
+
+        Notes
+        -----
+        Binary search is used to find the required insertion points.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1, 2, 3])
+        >>> ser
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        >>> ser.searchsorted(4)
+        np.int64(3)
+        >>> ser.searchsorted([0, 4])
+        array([0, 3])
+        >>> ser.searchsorted([1, 3], side="left")
+        array([0, 2])
+        >>> ser.searchsorted([1, 3], side="right")
+        array([1, 3])
+        >>> ser = pd.Series(pd.to_datetime(["3/11/2000", "3/12/2000", "3/13/2000"]))
+        >>> ser
+        0   2000-03-11
+        1   2000-03-12
+        2   2000-03-13
+        dtype: datetime64[us]
+        >>> ser.searchsorted("3/14/2000")
+        np.int64(3)
+        >>> ser = pd.Categorical(
+        ...     ["apple", "bread", "bread", "cheese", "milk"], ordered=True
+        ... )
+        >>> ser
+        ['apple', 'bread', 'bread', 'cheese', 'milk']
+        Categories (4, str): ['apple' < 'bread' < 'cheese' < 'milk']
+        >>> ser.searchsorted("bread")
+        np.int64(1)
+        >>> ser.searchsorted(["bread"], side="right")
+        array([3])
+
+        If the values are not monotonically sorted, wrong locations
+        may be returned:
+
+        >>> ser = pd.Series([2, 1, 3])
+        >>> ser
+        0    2
+        1    1
+        2    3
+        dtype: int64
+        >>> ser.searchsorted(1)  # doctest: +SKIP
+        0  # wrong result, correct would be 1
+        """
+        return base.IndexOpsMixin.searchsorted(self, value, side=side, sorter=sorter)
+
+    # -------------------------------------------------------------------
+    # Combination
+
+    def _append_internal(self, to_append: Series, ignore_index: bool = False) -> Series:
+        from pandas.core.reshape.concat import concat
+
+        return concat([self, to_append], ignore_index=ignore_index)
+
+    def compare(
+        self,
+        other: Series,
+        align_axis: Axis = 1,
+        keep_shape: bool = False,
+        keep_equal: bool = False,
+        result_names: Suffixes = ("self", "other"),
+    ) -> DataFrame | Series:
+        """
+        Compare to another Series and show the differences.
+
+        Parameters
+        ----------
+        other : Series
+            Object to compare with.
+
+        align_axis : {{0 or 'index', 1 or 'columns'}}, default 1
+            Determine which axis to align the comparison on.
+
+            * 0, or 'index' : Resulting differences are stacked vertically
+              with rows drawn alternately from self and other.
+            * 1, or 'columns' : Resulting differences are aligned horizontally
+              with columns drawn alternately from self and other.
+
+        keep_shape : bool, default False
+            If true, all rows and columns are kept.
+            Otherwise, only the ones with different values are kept.
+
+        keep_equal : bool, default False
+            If true, the result keeps values that are equal.
+            Otherwise, equal values are shown as NaNs.
+
+        result_names : tuple, default ('self', 'other')
+            Set the dataframes names in the comparison.
+
+        Returns
+        -------
+        Series or DataFrame
+            If axis is 0 or 'index' the result will be a Series.
+            The resulting index will be a MultiIndex with 'self' and 'other'
+            stacked alternately at the inner level.
+
+            If axis is 1 or 'columns' the result will be a DataFrame.
+            It will have two columns namely 'self' and 'other'.
+
+        See Also
+        --------
+        DataFrame.compare : Compare with another DataFrame and show differences.
+
+        Notes
+        -----
+        Matching NaNs will not appear as a difference.
+
+        Examples
+        --------
+        >>> s1 = pd.Series(["a", "b", "c", "d", "e"])
+        >>> s2 = pd.Series(["a", "a", "c", "b", "e"])
+
+        Align the differences on columns
+
+        >>> s1.compare(s2)
+          self other
+        1    b     a
+        3    d     b
+
+        Stack the differences on indices
+
+        >>> s1.compare(s2, align_axis=0)
+        1  self     b
+           other    a
+        3  self     d
+           other    b
+        dtype: str
+
+        Keep all original rows
+
+        >>> s1.compare(s2, keep_shape=True)
+          self other
+        0  NaN   NaN
+        1    b     a
+        2  NaN   NaN
+        3    d     b
+        4  NaN   NaN
+
+        Keep all original rows and also all original values
+
+        >>> s1.compare(s2, keep_shape=True, keep_equal=True)
+          self other
+        0    a     a
+        1    b     a
+        2    c     c
+        3    d     b
+        4    e     e
+        """
+
+        return super().compare(
+            other=other,
+            align_axis=align_axis,
+            keep_shape=keep_shape,
+            keep_equal=keep_equal,
+            result_names=result_names,
+        )
+
+    def combine(
+        self,
+        other: Series | Hashable,
+        func: Callable[[Hashable, Hashable], Hashable],
+        fill_value: Hashable | None = None,
+    ) -> Series:
+        """
+        Combine the Series with a Series or scalar according to `func`.
+
+        Combine the Series and `other` using `func` to perform elementwise
+        selection for combined Series.
+        `fill_value` is assumed when value is not present at some index
+        from one of the two Series being combined.
+
+        Parameters
+        ----------
+        other : Series or scalar
+            The value(s) to be combined with the `Series`.
+        func : function
+            Function that takes two scalars as inputs and returns an element.
+        fill_value : scalar, optional
+            The value to assume when an index is missing from
+            one Series or the other. The default specifies to use the
+            appropriate NaN value for the underlying dtype of the Series.
+
+        Returns
+        -------
+        Series
+            The result of combining the Series with the other object.
+
+        See Also
+        --------
+        Series.combine_first : Combine Series values, choosing the calling
+            Series' values first.
+
+        Examples
+        --------
+        Consider 2 Datasets ``s1`` and ``s2`` containing
+        highest clocked speeds of different birds.
+
+        >>> s1 = pd.Series({"falcon": 330.0, "eagle": 160.0})
+        >>> s1
+        falcon    330.0
+        eagle     160.0
+        dtype: float64
+        >>> s2 = pd.Series({"falcon": 345.0, "eagle": 200.0, "duck": 30.0})
+        >>> s2
+        falcon    345.0
+        eagle     200.0
+        duck       30.0
+        dtype: float64
+
+        Now, to combine the two datasets and view the highest speeds
+        of the birds across the two datasets
+
+        >>> s1.combine(s2, max)
+        duck        NaN
+        eagle     200.0
+        falcon    345.0
+        dtype: float64
+
+        In the previous example, the resulting value for duck is missing,
+        because the maximum of a NaN and a float is a NaN.
+        So, in the example, we set ``fill_value=0``,
+        so the maximum value returned will be the value from some dataset.
+
+        >>> s1.combine(s2, max, fill_value=0)
+        duck       30.0
+        eagle     200.0
+        falcon    345.0
+        dtype: float64
+        """
+        if fill_value is None:
+            fill_value = na_value_for_dtype(self.dtype, compat=False)
+
+        if isinstance(other, Series):
+            # If other is a Series, result is based on union of Series,
+            # so do this element by element
+            new_index = self.index.union(other.index)
+            new_name = ops.get_op_result_name(self, other)
+            new_values = np.empty(len(new_index), dtype=object)
+            with np.errstate(all="ignore"):
+                for i, idx in enumerate(new_index):
+                    lv = self.get(idx, fill_value)
+                    rv = other.get(idx, fill_value)
+                    new_values[i] = func(lv, rv)
+        else:
+            # Assume that other is a scalar, so apply the function for
+            # each element in the Series
+            new_index = self.index
+            new_values = np.empty(len(new_index), dtype=object)
+            with np.errstate(all="ignore"):
+                new_values[:] = [func(lv, other) for lv in self._values]
+            new_name = self.name
+
+        res_values = self.array._cast_pointwise_result(new_values)
+        return self._constructor(
+            res_values,
+            dtype=res_values.dtype,
+            index=new_index,
+            name=new_name,
+            copy=False,
+        )
+
+    def combine_first(self, other) -> Series:
+        """
+        Update null elements with value in the same location in 'other'.
+
+        Combine two Series objects by filling null values in one Series with
+        non-null values from the other Series. Result index will be the union
+        of the two indexes.
+
+        Parameters
+        ----------
+        other : Series
+            The value(s) to be used for filling null values.
+
+        Returns
+        -------
+        Series
+            The result of combining the provided Series with the other object.
+
+        See Also
+        --------
+        Series.combine : Perform element-wise operation on two Series
+            using a given function.
+
+        Examples
+        --------
+        >>> s1 = pd.Series([1, np.nan])
+        >>> s2 = pd.Series([3, 4, 5])
+        >>> s1.combine_first(s2)
+        0    1.0
+        1    4.0
+        2    5.0
+        dtype: float64
+
+        Null values still persist if the location of that null value
+        does not exist in `other`
+
+        >>> s1 = pd.Series({"falcon": np.nan, "eagle": 160.0})
+        >>> s2 = pd.Series({"eagle": 200.0, "duck": 30.0})
+        >>> s1.combine_first(s2)
+        duck       30.0
+        eagle     160.0
+        falcon      NaN
+        dtype: float64
+        """
+        from pandas.core.reshape.concat import concat
+
+        if self.dtype == other.dtype:
+            if self.index.equals(other.index):
+                return self.mask(self.isna(), other)
+
+        new_index = self.index.union(other.index)
+
+        this = self
+        # identify the index subset to keep for each series
+        keep_other = other.index.difference(this.index[notna(this)])
+        keep_this = this.index.difference(keep_other)
+
+        this = this.reindex(keep_this)
+        other = other.reindex(keep_other)
+
+        if this.dtype.kind == "M" and other.dtype.kind != "M":
+            # TODO: try to match resos?
+            other = to_datetime(other)
+            warnings.warn(
+                # GH#62931
+                "Silently casting non-datetime 'other' to datetime in "
+                "Series.combine_first is deprecated and will be removed "
+                "in a future version. Explicitly cast before calling "
+                "combine_first instead.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+
+        combined = concat([this, other])
+        combined = combined.reindex(new_index)
+        return combined.__finalize__(self, method="combine_first")
+
+    def update(self, other: Series | Sequence | Mapping) -> None:
+        """
+        Modify Series in place using values from passed Series.
+
+        Uses non-NA values from passed Series to make updates. Aligns
+        on index.
+
+        Parameters
+        ----------
+        other : Series, or object coercible into Series
+            Other Series that provides values to update the current Series.
+
+        See Also
+        --------
+        Series.combine : Perform element-wise operation on two Series
+            using a given function.
+        Series.transform: Modify a Series using a function.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.update(pd.Series([4, 5, 6]))
+        >>> s
+        0    4
+        1    5
+        2    6
+        dtype: int64
+
+        >>> s = pd.Series(["a", "b", "c"])
+        >>> s.update(pd.Series(["d", "e"], index=[0, 2]))
+        >>> s
+        0    d
+        1    b
+        2    e
+        dtype: str
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.update(pd.Series([4, 5, 6, 7, 8]))
+        >>> s
+        0    4
+        1    5
+        2    6
+        dtype: int64
+
+        If ``other`` contains NaNs the corresponding values are not updated
+        in the original Series.
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.update(pd.Series([4, np.nan, 6]))
+        >>> s
+        0    4
+        1    2
+        2    6
+        dtype: int64
+
+        ``other`` can also be a non-Series object type
+        that is coercible into a Series
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.update([4, np.nan, 6])
+        >>> s
+        0    4
+        1    2
+        2    6
+        dtype: int64
+
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.update({1: 9})
+        >>> s
+        0    1
+        1    9
+        2    3
+        dtype: int64
+        """
+        if not CHAINED_WARNING_DISABLED:
+            if sys.getrefcount(
+                self
+            ) <= REF_COUNT_METHOD and not com.is_local_in_caller_frame(self):
+                warnings.warn(
+                    _chained_assignment_method_update_msg,
+                    ChainedAssignmentError,
+                    stacklevel=2,
+                )
+
+        if not isinstance(other, Series):
+            other = Series(other)
+
+        other = other.reindex_like(self)
+        mask = notna(other)
+
+        self._mgr = self._mgr.putmask(mask=mask, new=other)
+
+    # ----------------------------------------------------------------------
+    # Reindexing, sorting
+
+    @overload
+    def sort_values(
+        self,
+        *,
+        axis: Axis = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> Series: ...
+
+    @overload
+    def sort_values(
+        self,
+        *,
+        axis: Axis = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> None: ...
+
+    @overload
+    def sort_values(
+        self,
+        *,
+        axis: Axis = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: bool = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        ignore_index: bool = ...,
+        key: ValueKeyFunc = ...,
+    ) -> Series | None: ...
+
+    def sort_values(
+        self,
+        *,
+        axis: Axis = 0,
+        ascending: bool | Sequence[bool] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: NaPosition = "last",
+        ignore_index: bool = False,
+        key: ValueKeyFunc | None = None,
+    ) -> Series | None:
+        """
+        Sort by the values.
+
+        Sort a Series in ascending or descending order by some
+        criterion.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        ascending : bool or list of bools, default True
+            If True, sort values in ascending order, otherwise descending.
+        inplace : bool, default False
+            If True, perform operation in-place.
+        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+            Choice of sorting algorithm. See also :func:`numpy.sort` for more
+            information. 'mergesort' and 'stable' are the only stable  algorithms.
+        na_position : {'first' or 'last'}, default 'last'
+            Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at
+            the end.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+        key : callable, optional
+            If not None, apply the key function to the series values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect a
+            ``Series`` and return an array-like.
+
+        Returns
+        -------
+        Series or None
+            Series ordered by values or None if ``inplace=True``.
+
+        See Also
+        --------
+        Series.sort_index : Sort by the Series indices.
+        DataFrame.sort_values : Sort DataFrame by the values along either axis.
+        DataFrame.sort_index : Sort DataFrame by indices.
+
+        Examples
+        --------
+        >>> s = pd.Series([np.nan, 1, 3, 10, 5])
+        >>> s
+        0     NaN
+        1     1.0
+        2     3.0
+        3     10.0
+        4     5.0
+        dtype: float64
+
+        Sort values ascending order (default behavior)
+
+        >>> s.sort_values(ascending=True)
+        1     1.0
+        2     3.0
+        4     5.0
+        3    10.0
+        0     NaN
+        dtype: float64
+
+        Sort values descending order
+
+        >>> s.sort_values(ascending=False)
+        3    10.0
+        4     5.0
+        2     3.0
+        1     1.0
+        0     NaN
+        dtype: float64
+
+        Sort values putting NAs first
+
+        >>> s.sort_values(na_position="first")
+        0     NaN
+        1     1.0
+        2     3.0
+        4     5.0
+        3    10.0
+        dtype: float64
+
+        Sort a series of strings
+
+        >>> s = pd.Series(["z", "b", "d", "a", "c"])
+        >>> s
+        0    z
+        1    b
+        2    d
+        3    a
+        4    c
+        dtype: str
+
+        >>> s.sort_values()
+        3    a
+        1    b
+        4    c
+        2    d
+        0    z
+        dtype: str
+
+        Sort using a key function. Your `key` function will be
+        given the ``Series`` of values and should return an array-like.
+
+        >>> s = pd.Series(["a", "B", "c", "D", "e"])
+        >>> s.sort_values()
+        1    B
+        3    D
+        0    a
+        2    c
+        4    e
+        dtype: str
+        >>> s.sort_values(key=lambda x: x.str.lower())
+        0    a
+        1    B
+        2    c
+        3    D
+        4    e
+        dtype: str
+
+        NumPy ufuncs work well here. For example, we can
+        sort by the ``sin`` of the value
+
+        >>> s = pd.Series([-4, -2, 0, 2, 4])
+        >>> s.sort_values(key=np.sin)
+        1   -2
+        4    4
+        2    0
+        0   -4
+        3    2
+        dtype: int64
+
+        More complicated user-defined functions can be used,
+        as long as they expect a Series and return an array-like
+
+        >>> s.sort_values(key=lambda x: (np.tan(x.cumsum())))
+        0   -4
+        3    2
+        4    4
+        1   -2
+        2    0
+        dtype: int64
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        # Validate the axis parameter
+        self._get_axis_number(axis)
+
+        if is_list_like(ascending):
+            ascending = cast(Sequence[bool], ascending)
+            if len(ascending) != 1:
+                raise ValueError(
+                    f"Length of ascending ({len(ascending)}) must be 1 for Series"
+                )
+            ascending = ascending[0]
+
+        ascending = validate_ascending(ascending)
+
+        if na_position not in ["first", "last"]:
+            raise ValueError(f"invalid na_position: {na_position}")
+
+        # GH 35922. Make sorting stable by leveraging nargsort
+        if key:
+            values_to_sort = cast(Series, ensure_key_mapped(self, key))._values
+        else:
+            values_to_sort = self._values
+        sorted_index = nargsort(values_to_sort, kind, bool(ascending), na_position)
+
+        if is_range_indexer(sorted_index, len(sorted_index)):
+            if inplace:
+                return self._update_inplace(self)
+            return self.copy(deep=False)
+
+        result = self._constructor(
+            self._values[sorted_index], index=self.index[sorted_index], copy=False
+        )
+
+        if ignore_index:
+            result.index = default_index(len(sorted_index))
+
+        if not inplace:
+            return result.__finalize__(self, method="sort_values")
+        self._update_inplace(result)
+        return None
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> None: ...
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> Series: ...
+
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: bool = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> Series | None: ...
+
+    def sort_index(
+        self,
+        *,
+        axis: Axis = 0,
+        level: IndexLabel | None = None,
+        ascending: bool | Sequence[bool] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: NaPosition = "last",
+        sort_remaining: bool = True,
+        ignore_index: bool = False,
+        key: IndexKeyFunc | None = None,
+    ) -> Series | None:
+        """
+        Sort Series by index labels.
+
+        Returns a new Series sorted by label if `inplace` argument is
+        ``False``, otherwise updates the original series and returns None.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        level : int, optional
+            If not None, sort on values in specified index level(s).
+        ascending : bool or list-like of bools, default True
+            Sort ascending vs. descending. When the index is a MultiIndex the
+            sort direction can be controlled for each level individually.
+        inplace : bool, default False
+            If True, perform operation in-place.
+        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+            Choice of sorting algorithm. See also :func:`numpy.sort` for more
+            information. 'mergesort' and 'stable' are the only stable algorithms. For
+            DataFrames, this option is only applied when sorting on a single
+            column or label.
+        na_position : {'first', 'last'}, default 'last'
+            If 'first' puts NaNs at the beginning, 'last' puts NaNs at the end.
+            Not implemented for MultiIndex.
+        sort_remaining : bool, default True
+            If True and sorting by level and index is multilevel, sort by other
+            levels too (in order) after sorting by specified level.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+        key : callable, optional
+            If not None, apply the key function to the index values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect an
+            ``Index`` and return an ``Index`` of the same shape.
+
+        Returns
+        -------
+        Series or None
+            The original Series sorted by the labels or None if ``inplace=True``.
+
+        See Also
+        --------
+        DataFrame.sort_index: Sort DataFrame by the index.
+        DataFrame.sort_values: Sort DataFrame by the value.
+        Series.sort_values : Sort Series by the value.
+
+        Examples
+        --------
+        >>> s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, 4])
+        >>> s.sort_index()
+        1    c
+        2    b
+        3    a
+        4    d
+        dtype: str
+
+        Sort Descending
+
+        >>> s.sort_index(ascending=False)
+        4    d
+        3    a
+        2    b
+        1    c
+        dtype: str
+
+        By default NaNs are put at the end, but use `na_position` to place
+        them at the beginning
+
+        >>> s = pd.Series(["a", "b", "c", "d"], index=[3, 2, 1, np.nan])
+        >>> s.sort_index(na_position="first")
+        NaN     d
+         1.0    c
+         2.0    b
+         3.0    a
+        dtype: str
+
+        Specify index level to sort
+
+        >>> arrays = [
+        ...     np.array(["qux", "qux", "foo", "foo", "baz", "baz", "bar", "bar"]),
+        ...     np.array(["two", "one", "two", "one", "two", "one", "two", "one"]),
+        ... ]
+        >>> s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=arrays)
+        >>> s.sort_index(level=1)
+        bar  one    8
+        baz  one    6
+        foo  one    4
+        qux  one    2
+        bar  two    7
+        baz  two    5
+        foo  two    3
+        qux  two    1
+        dtype: int64
+
+        Does not sort by remaining levels when sorting by levels
+
+        >>> s.sort_index(level=1, sort_remaining=False)
+        qux  one    2
+        foo  one    4
+        baz  one    6
+        bar  one    8
+        qux  two    1
+        foo  two    3
+        baz  two    5
+        bar  two    7
+        dtype: int64
+
+        Apply a key function before sorting
+
+        >>> s = pd.Series([1, 2, 3, 4], index=["A", "b", "C", "d"])
+        >>> s.sort_index(key=lambda x: x.str.lower())
+        A    1
+        b    2
+        C    3
+        d    4
+        dtype: int64
+        """
+
+        return super().sort_index(
+            axis=axis,
+            level=level,
+            ascending=ascending,
+            inplace=inplace,
+            kind=kind,
+            na_position=na_position,
+            sort_remaining=sort_remaining,
+            ignore_index=ignore_index,
+            key=key,
+        )
+
+    def argsort(
+        self,
+        axis: Axis = 0,
+        kind: SortKind = "quicksort",
+        order: None = None,
+        stable: None = None,
+    ) -> Series:
+        """
+        Return the integer indices that would sort the Series values.
+
+        Override ndarray.argsort. Argsorts the value, omitting NA/null values,
+        and places the result in the same locations as the non-NA values.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort'
+            Choice of sorting algorithm. See :func:`numpy.sort` for more
+            information. 'mergesort' and 'stable' are the only stable algorithms.
+        order : None
+            Has no effect but is accepted for compatibility with numpy.
+        stable : None
+            Has no effect but is accepted for compatibility with numpy.
+
+        Returns
+        -------
+        Series[np.intp]
+            Positions of values within the sort order with -1 indicating
+            nan values.
+
+        See Also
+        --------
+        numpy.ndarray.argsort : Returns the indices that would sort this array.
+
+        Examples
+        --------
+        >>> s = pd.Series([3, 2, 1])
+        >>> s.argsort()
+        0    2
+        1    1
+        2    0
+        dtype: int64
+        """
+        if axis != -1:
+            # GH#54257 We allow -1 here so that np.argsort(series) works
+            self._get_axis_number(axis)
+
+        result = self.array.argsort(kind=kind)
+
+        res = self._constructor(
+            result, index=self.index, name=self.name, dtype=np.intp, copy=False
+        )
+        return res.__finalize__(self, method="argsort")
+
+    def nlargest(
+        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
+    ) -> Series:
+        """
+        Return the largest `n` elements.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Return this many descending sorted values.
+        keep : {'first', 'last', 'all'}, default 'first'
+            When there are duplicate values that cannot all fit in a
+            Series of `n` elements:
+
+            - ``first`` : return the first `n` occurrences in order
+              of appearance.
+            - ``last`` : return the last `n` occurrences in reverse
+              order of appearance.
+            - ``all`` : keep all occurrences. This can result in a Series of
+              size larger than `n`.
+
+        Returns
+        -------
+        Series
+            The `n` largest values in the Series, sorted in decreasing order.
+
+        See Also
+        --------
+        Series.nsmallest: Get the `n` smallest elements.
+        Series.sort_values: Sort Series by values.
+        Series.head: Return the first `n` rows.
+
+        Notes
+        -----
+        Faster than ``.sort_values(ascending=False).head(n)`` for small `n`
+        relative to the size of the ``Series`` object.
+
+        Examples
+        --------
+        >>> countries_population = {
+        ...     "Italy": 59000000,
+        ...     "France": 65000000,
+        ...     "Malta": 434000,
+        ...     "Maldives": 434000,
+        ...     "Brunei": 434000,
+        ...     "Iceland": 337000,
+        ...     "Nauru": 11300,
+        ...     "Tuvalu": 11300,
+        ...     "Anguilla": 11300,
+        ...     "Montserrat": 5200,
+        ... }
+        >>> s = pd.Series(countries_population)
+        >>> s
+        Italy       59000000
+        France      65000000
+        Malta         434000
+        Maldives      434000
+        Brunei        434000
+        Iceland       337000
+        Nauru          11300
+        Tuvalu         11300
+        Anguilla       11300
+        Montserrat      5200
+        dtype: int64
+
+        The `n` largest elements where ``n=5`` by default.
+
+        >>> s.nlargest()
+        France      65000000
+        Italy       59000000
+        Malta         434000
+        Maldives      434000
+        Brunei        434000
+        dtype: int64
+
+        The `n` largest elements where ``n=3``. Default `keep` value is 'first'
+        so Malta will be kept.
+
+        >>> s.nlargest(3)
+        France    65000000
+        Italy     59000000
+        Malta       434000
+        dtype: int64
+
+        The `n` largest elements where ``n=3`` and keeping the last duplicates.
+        Brunei will be kept since it is the last with value 434000 based on
+        the index order.
+
+        >>> s.nlargest(3, keep="last")
+        France      65000000
+        Italy       59000000
+        Brunei        434000
+        dtype: int64
+
+        The `n` largest elements where ``n=3`` with all duplicates kept. Note
+        that the returned Series has five elements due to the three duplicates.
+
+        >>> s.nlargest(3, keep="all")
+        France      65000000
+        Italy       59000000
+        Malta         434000
+        Maldives      434000
+        Brunei        434000
+        dtype: int64
+        """
+        return selectn.SelectNSeries(self, n=n, keep=keep).nlargest()
+
+    def nsmallest(
+        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
+    ) -> Series:
+        """
+        Return the smallest `n` elements.
+
+        Parameters
+        ----------
+        n : int, default 5
+            Return this many ascending sorted values.
+        keep : {'first', 'last', 'all'}, default 'first'
+            When there are duplicate values that cannot all fit in a
+            Series of `n` elements:
+
+            - ``first`` : return the first `n` occurrences in order
+              of appearance.
+            - ``last`` : return the last `n` occurrences in reverse
+              order of appearance.
+            - ``all`` : keep all occurrences. This can result in a Series of
+              size larger than `n`.
+
+        Returns
+        -------
+        Series
+            The `n` smallest values in the Series, sorted in increasing order.
+
+        See Also
+        --------
+        Series.nlargest: Get the `n` largest elements.
+        Series.sort_values: Sort Series by values.
+        Series.head: Return the first `n` rows.
+
+        Notes
+        -----
+        Faster than ``.sort_values().head(n)`` for small `n` relative to
+        the size of the ``Series`` object.
+
+        Examples
+        --------
+        >>> countries_population = {
+        ...     "Italy": 59000000,
+        ...     "France": 65000000,
+        ...     "Brunei": 434000,
+        ...     "Malta": 434000,
+        ...     "Maldives": 434000,
+        ...     "Iceland": 337000,
+        ...     "Nauru": 11300,
+        ...     "Tuvalu": 11300,
+        ...     "Anguilla": 11300,
+        ...     "Montserrat": 5200,
+        ... }
+        >>> s = pd.Series(countries_population)
+        >>> s
+        Italy       59000000
+        France      65000000
+        Brunei        434000
+        Malta         434000
+        Maldives      434000
+        Iceland       337000
+        Nauru          11300
+        Tuvalu         11300
+        Anguilla       11300
+        Montserrat      5200
+        dtype: int64
+
+        The `n` smallest elements where ``n=5`` by default.
+
+        >>> s.nsmallest()
+        Montserrat    5200
+        Nauru        11300
+        Tuvalu       11300
+        Anguilla     11300
+        Iceland     337000
+        dtype: int64
+
+        The `n` smallest elements where ``n=3``. Default `keep` value is
+        'first' so Nauru and Tuvalu will be kept.
+
+        >>> s.nsmallest(3)
+        Montserrat   5200
+        Nauru       11300
+        Tuvalu      11300
+        dtype: int64
+
+        The `n` smallest elements where ``n=3`` and keeping the last
+        duplicates. Anguilla and Tuvalu will be kept since they are the last
+        with value 11300 based on the index order.
+
+        >>> s.nsmallest(3, keep="last")
+        Montserrat   5200
+        Anguilla    11300
+        Tuvalu      11300
+        dtype: int64
+
+        The `n` smallest elements where ``n=3`` with all duplicates kept. Note
+        that the returned Series has four elements due to the three duplicates.
+
+        >>> s.nsmallest(3, keep="all")
+        Montserrat   5200
+        Nauru       11300
+        Tuvalu      11300
+        Anguilla    11300
+        dtype: int64
+        """
+        return selectn.SelectNSeries(self, n=n, keep=keep).nsmallest()
+
+    def swaplevel(
+        self, i: Level = -2, j: Level = -1, copy: bool | lib.NoDefault = lib.no_default
+    ) -> Series:
+        """
+        Swap levels i and j in a :class:`MultiIndex`.
+
+        Default is to swap the two innermost levels of the index.
+
+        Parameters
+        ----------
+        i, j : int or str
+            Levels of the indices to be swapped. Can pass level name as string.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        Series
+            Series with levels swapped in MultiIndex.
+
+        See Also
+        --------
+        DataFrame.swaplevel : Swap levels i and j in a :class:`DataFrame`.
+        Series.reorder_levels : Rearrange index levels using input order.
+        MultiIndex.swaplevel : Swap levels i and j in a :class:`MultiIndex`.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     ["A", "B", "A", "C"],
+        ...     index=[
+        ...         ["Final exam", "Final exam", "Coursework", "Coursework"],
+        ...         ["History", "Geography", "History", "Geography"],
+        ...         ["January", "February", "March", "April"],
+        ...     ],
+        ... )
+        >>> s
+        Final exam  History    January     A
+                    Geography  February    B
+        Coursework  History    March       A
+                    Geography  April       C
+        dtype: str
+
+        In the following example, we will swap the levels of the indices.
+        Here, we will swap the levels column-wise, but levels can be swapped row-wise
+        in a similar manner. Note that column-wise is the default behavior.
+        By not supplying any arguments for i and j, we swap the last and second to
+        last indices.
+
+        >>> s.swaplevel()
+        Final exam  January   History       A
+                    February  Geography     B
+        Coursework  March     History       A
+                    April     Geography     C
+        dtype: str
+
+        By supplying one argument, we can choose which index to swap the last
+        index with. We can for example swap the first index with the last one as
+        follows.
+
+        >>> s.swaplevel(0)
+        January     History     Final exam      A
+        February    Geography   Final exam      B
+        March       History     Coursework      A
+        April       Geography   Coursework      C
+        dtype: str
+
+        We can also define explicitly which indices we want to swap by supplying values
+        for both i and j. Here, we for example swap the first and second indices.
+
+        >>> s.swaplevel(0, 1)
+        History     Final exam  January         A
+        Geography   Final exam  February        B
+        History     Coursework  March           A
+        Geography   Coursework  April           C
+        dtype: str
+        """
+        self._check_copy_deprecation(copy)
+        assert isinstance(self.index, MultiIndex)
+        result = self.copy(deep=False)
+        result.index = self.index.swaplevel(i, j)
+        return result
+
+    def reorder_levels(self, order: Sequence[Level]) -> Series:
+        """
+        Rearrange index levels using input order.
+
+        May not drop or duplicate levels.
+
+        Parameters
+        ----------
+        order : list of int representing new level order
+            Reference level by number or key.
+
+        Returns
+        -------
+        Series
+            Type of caller with index as MultiIndex (new object).
+
+        See Also
+        --------
+        DataFrame.reorder_levels : Rearrange index or column levels using
+            input ``order``.
+
+        Examples
+        --------
+        >>> arrays = [
+        ...     np.array(["dog", "dog", "cat", "cat", "bird", "bird"]),
+        ...     np.array(["white", "black", "white", "black", "white", "black"]),
+        ... ]
+        >>> s = pd.Series([1, 2, 3, 3, 5, 2], index=arrays)
+        >>> s
+        dog   white    1
+              black    2
+        cat   white    3
+              black    3
+        bird  white    5
+              black    2
+        dtype: int64
+        >>> s.reorder_levels([1, 0])
+        white  dog     1
+        black  dog     2
+        white  cat     3
+        black  cat     3
+        white  bird    5
+        black  bird    2
+        dtype: int64
+        """
+        if not isinstance(self.index, MultiIndex):  # pragma: no cover
+            raise Exception("Can only reorder levels on a hierarchical axis.")
+
+        result = self.copy(deep=False)
+        assert isinstance(result.index, MultiIndex)
+        result.index = result.index.reorder_levels(order)
+        return result
+
+    def explode(self, ignore_index: bool = False) -> Series:
+        """
+        Transform each element of a list-like to a row.
+
+        Parameters
+        ----------
+        ignore_index : bool, default False
+            If True, the resulting index will be labeled 0, 1, …, n - 1.
+
+        Returns
+        -------
+        Series
+            Exploded lists to rows; index will be duplicated for these rows.
+
+        See Also
+        --------
+        Series.str.split : Split string values on specified separator.
+        Series.unstack : Unstack, a.k.a. pivot, Series with MultiIndex
+            to produce DataFrame.
+        DataFrame.melt : Unpivot a DataFrame from wide format to long format.
+        DataFrame.explode : Explode a DataFrame from list-like
+            columns to long format.
+
+        Notes
+        -----
+        This routine will explode list-likes including lists, tuples, sets,
+        Series, and np.ndarray. The result dtype of the subset rows will
+        be object. Scalars will be returned unchanged, and empty list-likes will
+        result in an np.nan for that row. In addition, the ordering of elements in
+        the output will be non-deterministic when exploding sets.
+
+        Reference :ref:`the user guide <reshaping.explode>` for more examples.
+
+        Examples
+        --------
+        >>> s = pd.Series([[1, 2, 3], "foo", [], [3, 4]])
+        >>> s
+        0    [1, 2, 3]
+        1          foo
+        2           []
+        3       [3, 4]
+        dtype: object
+
+        >>> s.explode()
+        0      1
+        0      2
+        0      3
+        1    foo
+        2    NaN
+        3      3
+        3      4
+        dtype: object
+        """
+        if isinstance(self.dtype, ExtensionDtype):
+            values, counts = self._values._explode()
+        elif len(self) and is_object_dtype(self.dtype):
+            values, counts = reshape.explode(np.asarray(self._values))
+        else:
+            result = self.copy()
+            return result.reset_index(drop=True) if ignore_index else result
+
+        if ignore_index:
+            index: Index = default_index(len(values))
+        else:
+            index = self.index.repeat(counts)
+
+        return self._constructor(values, index=index, name=self.name, copy=False)
+
+    def unstack(
+        self,
+        level: IndexLabel = -1,
+        fill_value: Hashable | None = None,
+        sort: bool = True,
+    ) -> DataFrame:
+        """
+        Unstack, also known as pivot, Series with MultiIndex to produce DataFrame.
+
+        Parameters
+        ----------
+        level : int, str, or list of these, default last level
+            Level(s) to unstack, can pass level name.
+        fill_value : scalar value, default None
+            Value to use when replacing NaN values.
+        sort : bool, default True
+            Sort the level(s) in the resulting MultiIndex columns.
+
+        Returns
+        -------
+        DataFrame
+            Unstacked Series.
+
+        See Also
+        --------
+        DataFrame.unstack : Pivot the MultiIndex of a DataFrame.
+
+        Notes
+        -----
+        Reference :ref:`the user guide <reshaping.stacking>` for more examples.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     [1, 2, 3, 4],
+        ...     index=pd.MultiIndex.from_product([["one", "two"], ["a", "b"]]),
+        ... )
+        >>> s
+        one  a    1
+             b    2
+        two  a    3
+             b    4
+        dtype: int64
+
+        >>> s.unstack(level=-1)
+             a  b
+        one  1  2
+        two  3  4
+
+        >>> s.unstack(level=0)
+           one  two
+        a    1    3
+        b    2    4
+        """
+        from pandas.core.reshape.reshape import unstack
+
+        return unstack(self, level, fill_value, sort)
+
+    # ----------------------------------------------------------------------
+    # function application
+
+    def map(
+        self,
+        func: Callable | Mapping | Series | None = None,
+        na_action: Literal["ignore"] | None = None,
+        engine: Callable | None = None,
+        **kwargs,
+    ) -> Series:
+        """
+        Map values of Series according to an input mapping or function.
+
+        Used for substituting each value in a Series with another value,
+        that may be derived from a function, a ``dict`` or
+        a :class:`Series`.
+
+        Parameters
+        ----------
+        func : function, collections.abc.Mapping subclass or Series
+            Function or mapping correspondence.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NaN values, without passing them to the
+            mapping correspondence.
+        engine : decorator, optional
+            Choose the execution engine to use to run the function. Only used for
+            functions. If ``map`` is called with a mapping or ``Series``, an
+            exception will be raised. If ``engine`` is not provided the function will
+            be executed by the regular Python interpreter.
+
+            Options include JIT compilers such as Numba, Bodo or Blosc2, which in some
+            cases can speed up the execution. To use an executor you can provide the
+            decorators ``numba.jit``, ``numba.njit``, ``bodo.jit`` or ``blosc2.jit``.
+            You can also provide the decorator with parameters, like
+            ``numba.jit(nogit=True)``.
+
+            Not all functions can be executed with all execution engines. In general,
+            JIT compilers will require type stability in the function (no variable
+            should change data type during the execution). And not all pandas and
+            NumPy APIs are supported. Check the engine documentation for limitations.
+
+            .. versionadded:: 3.0.0
+
+        **kwargs
+            Additional keyword arguments to pass as keywords arguments to
+            `arg`.
+
+            .. versionadded:: 3.0.0
+
+        Returns
+        -------
+        Series
+            Same index as caller.
+
+        See Also
+        --------
+        Series.apply : For applying more complex functions on a Series.
+        Series.replace: Replace values given in `to_replace` with `value`.
+        DataFrame.apply : Apply a function row-/column-wise.
+        DataFrame.map : Apply a function elementwise on a whole DataFrame.
+
+        Notes
+        -----
+        When ``arg`` is a dictionary, values in Series that are not in the
+        dictionary (as keys) are converted to ``NaN``. However, if the
+        dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e.
+        provides a method for default values), then this default is used
+        rather than ``NaN``.
+
+        Examples
+        --------
+        >>> s = pd.Series(["cat", "dog", np.nan, "rabbit"])
+        >>> s
+        0      cat
+        1      dog
+        2      NaN
+        3   rabbit
+        dtype: str
+
+        ``map`` accepts a ``dict`` or a ``Series``. Values that are not found
+        in the ``dict`` are converted to ``NaN``, unless the dict has a default
+        value (e.g. ``defaultdict``):
+
+        >>> s.map({"cat": "kitten", "dog": "puppy"})
+        0   kitten
+        1    puppy
+        2      NaN
+        3      NaN
+        dtype: str
+
+        It also accepts a function:
+
+        >>> s.map("I am a {}".format)
+        0       I am a cat
+        1       I am a dog
+        2       I am a nan
+        3    I am a rabbit
+        dtype: str
+
+        To avoid applying the function to missing values (and keep them as
+        ``NaN``) ``na_action='ignore'`` can be used:
+
+        >>> s.map("I am a {}".format, na_action="ignore")
+        0     I am a cat
+        1     I am a dog
+        2            NaN
+        3  I am a rabbit
+        dtype: str
+
+        For categorical data, the function is only applied to the categories:
+
+        >>> s = pd.Series(list("cabaa"))
+        >>> s.map(print)
+        c
+        a
+        b
+        a
+        a
+        0    None
+        1    None
+        2    None
+        3    None
+        4    None
+        dtype: object
+
+        >>> s_cat = s.astype("category")
+        >>> s_cat.map(print)  # function called once per unique category
+        a
+        b
+        c
+        0    None
+        1    None
+        2    None
+        3    None
+        4    None
+        dtype: object
+        """
+        if func is None:
+            if "arg" in kwargs:
+                # `.map(arg=my_func)`
+                func = kwargs.pop("arg")
+                # https://github.com/pandas-dev/pandas/pull/61264
+                warnings.warn(
+                    "The parameter `arg` has been renamed to `func`, and it "
+                    "will stop being supported in a future version of pandas.",
+                    Pandas4Warning,
+                    stacklevel=find_stack_level(),
+                )
+            else:
+                raise ValueError("The `func` parameter is required")
+
+        if engine is not None:
+            if not callable(func):
+                raise ValueError(
+                    "The engine argument can only be specified when func is a function"
+                )
+            if not hasattr(engine, "__pandas_udf__"):
+                raise ValueError(f"Not a valid engine: {engine!r}")
+            result = engine.__pandas_udf__.map(  # type: ignore[attr-defined]
+                data=self,
+                func=func,
+                args=(),
+                kwargs=kwargs,
+                decorator=engine,
+                skip_na=na_action == "ignore",
+            )
+            if not isinstance(result, Series):
+                result = Series(result, index=self.index, name=self.name)
+            return result.__finalize__(self, method="map")
+
+        if callable(func):
+            func = functools.partial(func, **kwargs)
+        new_values = self._map_values(func, na_action=na_action)
+        return self._constructor(new_values, index=self.index, copy=False).__finalize__(
+            self, method="map"
+        )
+
+    def _gotitem(self, key, ndim, subset=None) -> Self:
+        """
+        Sub-classes to define. Return a sliced object.
+
+        Parameters
+        ----------
+        key : string / list of selections
+        ndim : {1, 2}
+            Requested ndim of result.
+        subset : object, default None
+            Subset to act on.
+        """
+        return self
+
+    _agg_see_also_doc = dedent(
+        """
+    See Also
+    --------
+    Series.apply : Invoke function on a Series.
+    Series.transform : Transform function producing a Series with like indexes.
+    """
+    )
+
+    _agg_examples_doc = dedent(
+        """
+    Examples
+    --------
+    >>> s = pd.Series([1, 2, 3, 4])
+    >>> s
+    0    1
+    1    2
+    2    3
+    3    4
+    dtype: int64
+
+    >>> s.agg('min')
+    1
+
+    >>> s.agg(['min', 'max'])
+    min   1
+    max   4
+    dtype: int64
+    """
+    )
+
+    def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
+        """
+        Aggregate using one or more operations over the specified axis.
+
+        Parameters
+        ----------
+        func : function, str, list or dict
+            Function to use for aggregating the data. If a function, must either
+            work when passed a Series or when passed to Series.apply.
+
+            Accepted combinations are:
+
+            - function
+            - string function name
+            - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
+            - dict of axis labels -> functions, function names or list of such.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        *args
+            Positional arguments to pass to `func`.
+        **kwargs
+            Keyword arguments to pass to `func`.
+
+        Returns
+        -------
+        scalar, Series or DataFrame
+            The return can be:
+
+            * scalar : when Series.agg is called with single function
+            * Series : when DataFrame.agg is called with a single function
+            * DataFrame : when DataFrame.agg is called with several functions
+
+        See Also
+        --------
+        Series.apply : Invoke function on a Series.
+        Series.transform : Transform function producing a Series with like indexes.
+
+        Notes
+        -----
+        The aggregation operations are always performed over an axis, either the
+        index (default) or the column axis. This behavior is different from
+        `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
+        `var`), where the default is to compute the aggregation of the flattened
+        array, e.g., ``numpy.mean(arr_2d)`` as opposed to
+        ``numpy.mean(arr_2d, axis=0)``.
+
+        `agg` is an alias for `aggregate`. Use the alias.
+
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        A passed user-defined-function will be passed a Series for evaluation.
+
+        If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s
+        0    1
+        1    2
+        2    3
+        3    4
+        dtype: int64
+
+        >>> s.agg("min")
+        1
+
+        >>> s.agg(["min", "max"])
+        min   1
+        max   4
+        dtype: int64
+        """
+
+        # Validate the axis parameter
+        self._get_axis_number(axis)
+
+        # if func is None, will switch to user-provided "named aggregation" kwargs
+        if func is None:
+            func = dict(kwargs.items())
+
+        op = SeriesApply(self, func, args=args, kwargs=kwargs)
+        result = op.agg()
+        return result
+
+    agg = aggregate
+
+    def transform(
+        self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
+    ) -> DataFrame | Series:
+        """
+        Call ``func`` on self producing a Series with the same axis shape as self.
+
+        Parameters
+        ----------
+        func : function, str, list-like or dict-like
+            Function to use for transforming the data. If a function, must either
+            work when passed a Series or when passed to Series.apply. If func
+            is both list-like and dict-like, dict-like behavior takes precedence.
+
+            Accepted combinations are:
+
+            - function
+            - string function name
+            - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
+            - dict-like of axis labels -> functions, function names or list-like of such
+
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        *args
+            Positional arguments to pass to `func`.
+        **kwargs
+            Keyword arguments to pass to `func`.
+
+        Returns
+        -------
+        Series
+            A Series that must have the same length as self.
+
+        Raises
+        ------
+        ValueError : If the returned Series has a different length than self.
+
+        See Also
+        --------
+        Series.agg : Only perform aggregating type operations.
+        Series.apply : Invoke function on a Series.
+
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": range(3), "B": range(1, 4)})
+        >>> df
+        A  B
+        0  0  1
+        1  1  2
+        2  2  3
+        >>> df.transform(lambda x: x + 1)
+        A  B
+        0  1  2
+        1  2  3
+        2  3  4
+
+        Even though the resulting Series must have the same length as the
+        input Series, it is possible to provide several input functions:
+
+        >>> s = pd.Series(range(3))
+        >>> s
+        0    0
+        1    1
+        2    2
+        dtype: int64
+        >>> s.transform([np.sqrt, np.exp])
+            sqrt        exp
+        0  0.000000   1.000000
+        1  1.000000   2.718282
+        2  1.414214   7.389056
+
+        You can call transform on a GroupBy object:
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "Date": [
+        ...             "2015-05-08",
+        ...             "2015-05-07",
+        ...             "2015-05-06",
+        ...             "2015-05-05",
+        ...             "2015-05-08",
+        ...             "2015-05-07",
+        ...             "2015-05-06",
+        ...             "2015-05-05",
+        ...         ],
+        ...         "Data": [5, 8, 6, 1, 50, 100, 60, 120],
+        ...     }
+        ... )
+        >>> df
+                Date  Data
+        0  2015-05-08     5
+        1  2015-05-07     8
+        2  2015-05-06     6
+        3  2015-05-05     1
+        4  2015-05-08    50
+        5  2015-05-07   100
+        6  2015-05-06    60
+        7  2015-05-05   120
+        >>> df.groupby("Date")["Data"].transform("sum")
+        0     55
+        1    108
+        2     66
+        3    121
+        4     55
+        5    108
+        6     66
+        7    121
+        Name: Data, dtype: int64
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "c": [1, 1, 1, 2, 2, 2, 2],
+        ...         "type": ["m", "n", "o", "m", "m", "n", "n"],
+        ...     }
+        ... )
+        >>> df
+        c type
+        0  1    m
+        1  1    n
+        2  1    o
+        3  2    m
+        4  2    m
+        5  2    n
+        6  2    n
+        >>> df["size"] = df.groupby("c")["type"].transform(len)
+        >>> df
+        c type size
+        0  1    m    3
+        1  1    n    3
+        2  1    o    3
+        3  2    m    4
+        4  2    m    4
+        5  2    n    4
+        6  2    n    4
+        """
+        # Validate axis argument
+        self._get_axis_number(axis)
+        ser = self.copy(deep=False)
+        result = SeriesApply(ser, func=func, args=args, kwargs=kwargs).transform()
+        return result
+
+    def apply(
+        self,
+        func: AggFuncType,
+        args: tuple[Any, ...] = (),
+        *,
+        by_row: Literal[False, "compat"] = "compat",
+        **kwargs,
+    ) -> DataFrame | Series:
+        """
+        Invoke function on values of Series.
+
+        Can be ufunc (a NumPy function that applies to the entire Series)
+        or a Python function that only works on single values.
+
+        Parameters
+        ----------
+        func : function
+            Python function or NumPy ufunc to apply.
+        args : tuple
+            Positional arguments passed to func after the series value.
+        by_row : False or "compat", default "compat"
+            If ``"compat"`` and func is a callable, func will be passed each element of
+            the Series, like ``Series.map``. If func is a list or dict of
+            callables, will first try to translate each func into pandas methods. If
+            that doesn't work, will try call to apply again with ``by_row="compat"``
+            and if that fails, will call apply again with ``by_row=False``
+            (backward compatible).
+            If False, the func will be passed the whole Series at once.
+
+            ``by_row`` has no effect when ``func`` is a string.
+
+            .. versionadded:: 2.1.0
+        **kwargs
+            Additional keyword arguments passed to func.
+
+        Returns
+        -------
+        Series or DataFrame
+            If func returns a Series object the result will be a DataFrame.
+
+        See Also
+        --------
+        Series.map: For element-wise operations.
+        Series.agg: Only perform aggregating type operations.
+        Series.transform: Only perform transforming type operations.
+
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.
+
+        Examples
+        --------
+        Create a series with typical summer temperatures for each city.
+
+        >>> s = pd.Series([20, 21, 12], index=["London", "New York", "Helsinki"])
+        >>> s
+        London      20
+        New York    21
+        Helsinki    12
+        dtype: int64
+
+        Square the values by defining a function and passing it as an
+        argument to ``apply()``.
+
+        >>> def square(x):
+        ...     return x**2
+        >>> s.apply(square)
+        London      400
+        New York    441
+        Helsinki    144
+        dtype: int64
+
+        Square the values by passing an anonymous function as an
+        argument to ``apply()``.
+
+        >>> s.apply(lambda x: x**2)
+        London      400
+        New York    441
+        Helsinki    144
+        dtype: int64
+
+        Define a custom function that needs additional positional
+        arguments and pass these additional arguments using the
+        ``args`` keyword.
+
+        >>> def subtract_custom_value(x, custom_value):
+        ...     return x - custom_value
+
+        >>> s.apply(subtract_custom_value, args=(5,))
+        London      15
+        New York    16
+        Helsinki     7
+        dtype: int64
+
+        Define a custom function that takes keyword arguments
+        and pass these arguments to ``apply``.
+
+        >>> def add_custom_values(x, **kwargs):
+        ...     for month in kwargs:
+        ...         x += kwargs[month]
+        ...     return x
+
+        >>> s.apply(add_custom_values, june=30, july=20, august=25)
+        London      95
+        New York    96
+        Helsinki    87
+        dtype: int64
+
+        Use a function from the Numpy library.
+
+        >>> s.apply(np.log)
+        London      2.995732
+        New York    3.044522
+        Helsinki    2.484907
+        dtype: float64
+        """
+        return SeriesApply(
+            self,
+            func,
+            by_row=by_row,
+            args=args,
+            kwargs=kwargs,
+        ).apply()
+
+    def _reindex_indexer(
+        self,
+        new_index: Index | None,
+        indexer: npt.NDArray[np.intp] | None,
+    ) -> Series:
+        # Note: new_index is None iff indexer is None
+        # if not None, indexer is np.intp
+        if indexer is None and (
+            new_index is None or new_index.names == self.index.names
+        ):
+            return self.copy(deep=False)
+
+        new_values = algorithms.take_nd(
+            self._values, indexer, allow_fill=True, fill_value=None
+        )
+        return self._constructor(new_values, index=new_index, copy=False)
+
+    def _needs_reindex_multi(self, axes, method, level) -> bool:
+        """
+        Check if we do need a multi reindex; this is for compat with
+        higher dims.
+        """
+        return False
+
+    @overload
+    def rename(
+        self,
+        index: Renamer | Hashable | None = ...,
+        *,
+        axis: Axis | None = ...,
+        copy: bool | lib.NoDefault = ...,
+        inplace: Literal[True],
+        level: Level | None = ...,
+        errors: IgnoreRaise = ...,
+    ) -> Series | None: ...
+
+    @overload
+    def rename(
+        self,
+        index: Renamer | Hashable | None = ...,
+        *,
+        axis: Axis | None = ...,
+        copy: bool | lib.NoDefault = ...,
+        inplace: Literal[False] = ...,
+        level: Level | None = ...,
+        errors: IgnoreRaise = ...,
+    ) -> Series: ...
+
+    def rename(
+        self,
+        index: Renamer | Hashable | None = None,
+        *,
+        axis: Axis | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: bool = False,
+        level: Level | None = None,
+        errors: IgnoreRaise = "ignore",
+    ) -> Series | None:
+        """
+        Alter Series index labels or name.
+
+        Function / dict values must be unique (1-to-1). Labels not contained in
+        a dict / Series will be left as-is. Extra labels listed don't throw an
+        error.
+
+        Alternatively, change ``Series.name`` with a scalar value.
+
+        See the :ref:`user guide <basics.rename>` for more.
+
+        Parameters
+        ----------
+        index : scalar, hashable sequence, dict-like or function optional
+            Functions or dict-like are transformations to apply to
+            the index.
+            Scalar or hashable sequence-like will alter the ``Series.name``
+            attribute.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        inplace : bool, default False
+            Whether to return a new Series. If True the value of copy is ignored.
+        level : int or level name, default None
+            In case of MultiIndex, only rename labels in the specified level.
+        errors : {'ignore', 'raise'}, default 'ignore'
+            If 'raise', raise `KeyError` when a `dict-like mapper` or
+            `index` contains labels that are not present in the index being transformed.
+            If 'ignore', existing keys will be renamed and extra keys will be ignored.
+
+        Returns
+        -------
+        Series
+            A shallow copy with index labels or name altered, or the same object
+            if ``inplace=True`` and index is not a dict or callable else None.
+
+        See Also
+        --------
+        DataFrame.rename : Corresponding DataFrame method.
+        Series.rename_axis : Set the name of the axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        >>> s.rename("my_name")  # scalar, changes Series.name
+        0    1
+        1    2
+        2    3
+        Name: my_name, dtype: int64
+        >>> s.rename(lambda x: x**2)  # function, changes labels
+        0    1
+        1    2
+        4    3
+        dtype: int64
+        >>> s.rename({1: 3, 2: 5})  # mapping, changes labels
+        0    1
+        3    2
+        5    3
+        dtype: int64
+        """
+        self._check_copy_deprecation(copy)
+        if axis is not None:
+            # Make sure we raise if an invalid 'axis' is passed.
+            axis = self._get_axis_number(axis)
+
+        if callable(index) or is_dict_like(index):
+            # error: Argument 1 to "_rename" of "NDFrame" has incompatible
+            # type "Union[Union[Mapping[Any, Hashable], Callable[[Any],
+            # Hashable]], Hashable, None]"; expected "Union[Mapping[Any,
+            # Hashable], Callable[[Any], Hashable], None]"
+            return super()._rename(
+                index,  # type: ignore[arg-type]
+                inplace=inplace,
+                level=level,
+                errors=errors,
+            )
+        else:
+            return self._set_name(index, inplace=inplace)
+
+    def set_axis(
+        self,
+        labels,
+        *,
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> Series:
+        """
+        Assign desired index to given axis.
+
+        .. deprecated:: 3.0.0
+            This keyword is ignored and will be removed in pandas 4.0. Since
+            pandas 3.0, this method always returns a new object using a lazy
+            copy mechanism that defers copies until necessary
+            (Copy-on-Write). See the `user guide on Copy-on-Write
+            <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+            for more details.
+
+        Indexes for row labels can be changed by assigning a list-like or Index.
+
+        Parameters
+        ----------
+        labels : list-like or Index
+            The values for the new index.
+        axis : {0 or 'index'}, default 0
+            The axis to update. The value 0 identifies the rows. For `Series`
+            this parameter is unused and defaults to 0.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+        Returns
+        -------
+        Series
+            A shallow copy of the object with axis altered to the given index.
+
+        See Also
+        --------
+        Series.rename_axis : Alter the name of the index.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s
+        0    1
+        1    2
+        2    3
+        dtype: int64
+        >>> s.set_axis(["a", "b", "c"], axis=0)
+        a    1
+        b    2
+        c    3
+        dtype: int64
+        """
+
+        return super().set_axis(labels, axis=axis, copy=copy)
+
+    # error: Cannot determine type of 'reindex'
+
+    def reindex(  # type: ignore[override]
+        self,
+        index=None,
+        *,
+        axis: Axis | None = None,
+        method: ReindexMethod | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+        level: Level | None = None,
+        fill_value: Scalar | None = None,
+        limit: int | None = None,
+        tolerance=None,
+    ) -> Series:
+        """
+        Conform Series to new index with optional filling logic.
+
+        Places NA/NaN in locations having no value in the previous index. A new object
+        is produced unless the new index is equivalent to the current one and
+        ``copy=False``.
+
+        Parameters
+        ----------
+        index : scalar, list-like, dict-like or function, optional
+            A scalar, list-like, dict-like or functions transformations to
+            apply to that axis' values.
+        axis : {0 or 'index'}, default 0
+            The axis to rename. For `Series` this parameter is unused and defaults to 0.
+        method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
+            Method to use for filling holes in reindexed DataFrame.
+            Please note: this is only applicable to DataFrames/Series with a
+            monotonically increasing/decreasing index.
+
+            * None (default): don't fill gaps
+            * pad / ffill: Propagate last valid observation forward to next
+              valid.
+            * backfill / bfill: Use next valid observation to fill gap.
+            * nearest: Use nearest valid observations to fill gap.
+
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : scalar, default np.nan
+            Value to use for missing values. Defaults to NaN, but can be any
+            "compatible" value.
+        limit : int, default None
+            Maximum number of consecutive elements to forward or backward fill.
+        tolerance : optional
+            Maximum distance between original and new labels for inexact
+            matches. The values of the index at the matching locations most
+            satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
+
+            Tolerance may be a scalar value, which applies the same tolerance
+            to all values, or list-like, which applies variable tolerance per
+            element. List-like includes list, tuple, array, Series, and must be
+            the same size as the index and its dtype must exactly match the
+            index's type.
+
+        Returns
+        -------
+        Series
+            Series with changed index.
+
+        See Also
+        --------
+        DataFrame.set_index : Set row labels.
+        DataFrame.reset_index : Remove row labels or move them to new columns.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.
+
+        Examples
+        --------
+        ``DataFrame.reindex`` supports two calling conventions
+
+        * ``(index=index_labels, columns=column_labels, ...)``
+        * ``(labels, axis={{'index', 'columns'}}, ...)``
+
+        We *highly* recommend using keyword arguments to clarify your
+        intent.
+
+        Create a DataFrame with some fictional data.
+
+        >>> index = ["Firefox", "Chrome", "Safari", "IE10", "Konqueror"]
+        >>> columns = ["http_status", "response_time"]
+        >>> df = pd.DataFrame(
+        ...     [[200, 0.04], [200, 0.02], [404, 0.07], [404, 0.08], [301, 1.0]],
+        ...     columns=columns,
+        ...     index=index,
+        ... )
+        >>> df
+                   http_status  response_time
+        Firefox            200           0.04
+        Chrome             200           0.02
+        Safari             404           0.07
+        IE10               404           0.08
+        Konqueror          301           1.00
+
+        Create a new index and reindex the DataFrame. By default
+        values in the new index that do not have corresponding
+        records in the DataFrame are assigned ``NaN``.
+
+        >>> new_index = ["Safari", "Iceweasel", "Comodo Dragon", "IE10", "Chrome"]
+        >>> df.reindex(new_index)
+                       http_status  response_time
+        Safari               404.0           0.07
+        Iceweasel              NaN            NaN
+        Comodo Dragon          NaN            NaN
+        IE10                 404.0           0.08
+        Chrome               200.0           0.02
+
+        We can fill in the missing values by passing a value to
+        the keyword ``fill_value``. Because the index is not monotonically
+        increasing or decreasing, we cannot use arguments to the keyword
+        ``method`` to fill the ``NaN`` values.
+
+        >>> df.reindex(new_index, fill_value=0)
+                       http_status  response_time
+        Safari                 404           0.07
+        Iceweasel                0           0.00
+        Comodo Dragon            0           0.00
+        IE10                   404           0.08
+        Chrome                 200           0.02
+
+        >>> df.reindex(new_index, fill_value="missing")
+                      http_status response_time
+        Safari                404          0.07
+        Iceweasel         missing       missing
+        Comodo Dragon     missing       missing
+        IE10                  404          0.08
+        Chrome                200          0.02
+
+        We can also reindex the columns.
+
+        >>> df.reindex(columns=["http_status", "user_agent"])
+                   http_status  user_agent
+        Firefox            200         NaN
+        Chrome             200         NaN
+        Safari             404         NaN
+        IE10               404         NaN
+        Konqueror          301         NaN
+
+        Or we can use "axis-style" keyword arguments
+
+        >>> df.reindex(["http_status", "user_agent"], axis="columns")
+                   http_status  user_agent
+        Firefox            200         NaN
+        Chrome             200         NaN
+        Safari             404         NaN
+        IE10               404         NaN
+        Konqueror          301         NaN
+
+        To further illustrate the filling functionality in
+        ``reindex``, we will create a DataFrame with a
+        monotonically increasing index (for example, a sequence
+        of dates).
+
+        >>> date_index = pd.date_range("1/1/2010", periods=6, freq="D")
+        >>> df2 = pd.DataFrame(
+        ...     {"prices": [100, 101, np.nan, 100, 89, 88]}, index=date_index
+        ... )
+        >>> df2
+                    prices
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+
+        Suppose we decide to expand the DataFrame to cover a wider
+        date range.
+
+        >>> date_index2 = pd.date_range("12/29/2009", periods=10, freq="D")
+        >>> df2.reindex(date_index2)
+                    prices
+        2009-12-29     NaN
+        2009-12-30     NaN
+        2009-12-31     NaN
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+        2010-01-07     NaN
+
+        The index entries that did not have a value in the original data frame
+        (for example, '2009-12-29') are by default filled with ``NaN``.
+        If desired, we can fill in the missing values using one of several
+        options.
+
+        For example, to back-propagate the last valid value to fill the ``NaN``
+        values, pass ``bfill`` as an argument to the ``method`` keyword.
+
+        >>> df2.reindex(date_index2, method="bfill")
+                    prices
+        2009-12-29   100.0
+        2009-12-30   100.0
+        2009-12-31   100.0
+        2010-01-01   100.0
+        2010-01-02   101.0
+        2010-01-03     NaN
+        2010-01-04   100.0
+        2010-01-05    89.0
+        2010-01-06    88.0
+        2010-01-07     NaN
+
+        Please note that the ``NaN`` value present in the original DataFrame
+        (at index value 2010-01-03) will not be filled by any of the
+        value propagation schemes. This is because filling while reindexing
+        does not look at DataFrame values, but only compares the original and
+        desired indexes. If you do want to fill in the ``NaN`` values present
+        in the original DataFrame, use the ``fillna()`` method.
+
+        See the :ref:`user guide <basics.reindexing>` for more.
+        """
+        return super().reindex(
+            index=index,
+            method=method,
+            level=level,
+            fill_value=fill_value,
+            limit=limit,
+            tolerance=tolerance,
+            copy=copy,
+        )
+
+    @overload  # type: ignore[override]
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = ...,
+        *,
+        index=...,
+        axis: Axis = ...,
+        copy: bool | lib.NoDefault = ...,
+        inplace: Literal[True],
+    ) -> None: ...
+
+    @overload
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = ...,
+        *,
+        index=...,
+        axis: Axis = ...,
+        copy: bool | lib.NoDefault = ...,
+        inplace: Literal[False] = ...,
+    ) -> Self: ...
+
+    @overload
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = ...,
+        *,
+        index=...,
+        axis: Axis = ...,
+        copy: bool | lib.NoDefault = ...,
+        inplace: bool = ...,
+    ) -> Self | None: ...
+
+    def rename_axis(
+        self,
+        mapper: IndexLabel | lib.NoDefault = lib.no_default,
+        *,
+        index=lib.no_default,
+        axis: Axis = 0,
+        copy: bool | lib.NoDefault = lib.no_default,
+        inplace: bool = False,
+    ) -> Self | None:
+        """
+        Set the name of the axis for the index.
+
+        Parameters
+        ----------
+        mapper : scalar, list-like, optional
+            Value to set the axis name attribute.
+
+            Use either ``mapper`` and ``axis`` to
+            specify the axis to target with ``mapper``, or ``index``.
+
+        index : scalar, list-like, dict-like or function, optional
+            A scalar, list-like, dict-like or functions transformations to
+            apply to that axis' values.
+        axis : {0 or 'index'}, default 0
+            The axis to rename. For `Series` this parameter is unused and defaults to 0.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        inplace : bool, default False
+            Modifies the object directly, instead of creating a new Series
+            or DataFrame.
+
+        Returns
+        -------
+        Series, or None
+            The same type as the caller or None if ``inplace=True``.
+
+        See Also
+        --------
+        Series.rename : Alter Series index labels or name.
+        DataFrame.rename : Alter DataFrame index labels or name.
+        Index.rename : Set new names on index.
+
+        Examples
+        --------
+
+        >>> s = pd.Series(["dog", "cat", "monkey"])
+        >>> s
+        0       dog
+        1       cat
+        2    monkey
+        dtype: str
+        >>> s.rename_axis("animal")
+        animal
+        0    dog
+        1    cat
+        2    monkey
+        dtype: str
+        """
+        return super().rename_axis(
+            mapper=mapper,
+            index=index,
+            axis=axis,
+            inplace=inplace,
+            copy=copy,
+        )
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level | None = ...,
+        inplace: Literal[True],
+        errors: IgnoreRaise = ...,
+    ) -> None: ...
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level | None = ...,
+        inplace: Literal[False] = ...,
+        errors: IgnoreRaise = ...,
+    ) -> Series: ...
+
+    @overload
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = ...,
+        *,
+        axis: Axis = ...,
+        index: IndexLabel | ListLike = ...,
+        columns: IndexLabel | ListLike = ...,
+        level: Level | None = ...,
+        inplace: bool = ...,
+        errors: IgnoreRaise = ...,
+    ) -> Series | None: ...
+
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = None,
+        *,
+        axis: Axis = 0,
+        index: IndexLabel | ListLike = None,
+        columns: IndexLabel | ListLike = None,
+        level: Level | None = None,
+        inplace: bool = False,
+        errors: IgnoreRaise = "raise",
+    ) -> Series | None:
+        """
+        Return Series with specified index labels removed.
+
+        Remove elements of a Series based on specifying the index labels.
+        When using a multi-index, labels on different levels can be removed
+        by specifying the level.
+
+        Parameters
+        ----------
+        labels : single label or list-like
+            Index labels to drop.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        index : single label or list-like
+            Redundant for application on Series, but 'index' can be used instead
+            of 'labels'.
+        columns : single label or list-like
+            No change is made to the Series; use 'index' or 'labels' instead.
+        level : int or level name, optional
+            For MultiIndex, level for which the labels will be removed.
+        inplace : bool, default False
+            If True, do operation inplace and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress error and only existing labels are dropped.
+
+        Returns
+        -------
+        Series or None
+            Series with specified index labels removed or None if ``inplace=True``.
+
+        Raises
+        ------
+        KeyError
+            If none of the labels are found in the index.
+
+        See Also
+        --------
+        Series.reindex : Return only specified index labels of Series.
+        Series.dropna : Return series without null values.
+        Series.drop_duplicates : Return Series with duplicate values removed.
+        DataFrame.drop : Drop specified labels from rows or columns.
+
+        Examples
+        --------
+        >>> s = pd.Series(data=np.arange(3), index=["A", "B", "C"])
+        >>> s
+        A  0
+        B  1
+        C  2
+        dtype: int64
+
+        Drop labels B and C
+
+        >>> s.drop(labels=["B", "C"])
+        A  0
+        dtype: int64
+
+        Drop 2nd level label in MultiIndex Series
+
+        >>> midx = pd.MultiIndex(
+        ...     levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]],
+        ...     codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
+        ... )
+        >>> s = pd.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
+        >>> s
+        llama   speed      45.0
+                weight    200.0
+                length      1.2
+        cow     speed      30.0
+                weight    250.0
+                length      1.5
+        falcon  speed     320.0
+                weight      1.0
+                length      0.3
+        dtype: float64
+
+        >>> s.drop(labels="weight", level=1)
+        llama   speed      45.0
+                length      1.2
+        cow     speed      30.0
+                length      1.5
+        falcon  speed     320.0
+                length      0.3
+        dtype: float64
+        """
+        return super().drop(
+            labels=labels,
+            axis=axis,
+            index=index,
+            columns=columns,
+            level=level,
+            inplace=inplace,
+            errors=errors,
+        )
+
+    def pop(self, item: Hashable) -> Any:
+        """
+        Return item and drops from series. Raise KeyError if not found.
+
+        Parameters
+        ----------
+        item : label
+            Index of the element that needs to be removed.
+
+        Returns
+        -------
+        scalar
+            Value that is popped from series.
+
+        See Also
+        --------
+        Series.drop: Drop specified values from Series.
+        Series.drop_duplicates: Return Series with duplicate values removed.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1, 2, 3])
+
+        >>> ser.pop(0)
+        1
+
+        >>> ser
+        1    2
+        2    3
+        dtype: int64
+        """
+        return maybe_unbox_numpy_scalar(super().pop(item=item))
+
+    def info(
+        self,
+        verbose: bool | None = None,
+        buf: IO[str] | None = None,
+        max_cols: int | None = None,
+        memory_usage: bool | str | None = None,
+        show_counts: bool = True,
+    ) -> None:
+        """
+        Print a concise summary of a Series.
+
+        This method prints information about a Series including
+        the index dtype, non-NA values and memory usage.
+
+        Parameters
+        ----------
+        verbose : bool, optional
+            Whether to print the full summary. By default, the setting in
+            ``pandas.options.display.max_info_columns`` is followed.
+        buf : writable buffer, defaults to sys.stdout
+            Where to send the output. By default, the output is printed to
+            sys.stdout. Pass a writable buffer if you need to further process
+            the output.
+        max_cols : int, optional
+            Unused, exists only for compatibility with DataFrame.info.
+        memory_usage : bool, str, optional
+            Specifies whether total memory usage of the Series
+            elements (including the index) should be displayed. By default,
+            this follows the ``pandas.options.display.memory_usage`` setting.
+
+            True always show memory usage. False never shows memory usage.
+            A value of 'deep' is equivalent to "True with deep introspection".
+            Memory usage is shown in human-readable units (base-2
+            representation). Without deep introspection a memory estimation is
+            made based in column dtype and number of rows assuming values
+            consume the same memory amount for corresponding dtypes. With deep
+            memory introspection, a real memory usage calculation is performed
+            at the cost of computational resources. See the
+            :ref:`Frequently Asked Questions <df-memory-usage>` for more
+            details.
+        show_counts : bool, optional
+            Whether to show the non-null counts. By default, this is shown
+            only if the DataFrame is smaller than
+            ``pandas.options.display.max_info_rows`` and
+            ``pandas.options.display.max_info_columns``. A value of True always
+            shows the counts, and False never shows the counts.
+
+        Returns
+        -------
+        None
+            This method prints a summary of a Series and returns None.
+
+        See Also
+        --------
+        Series.describe: Generate descriptive statistics of Series.
+        Series.memory_usage: Memory usage of Series.
+
+        Examples
+        --------
+        >>> int_values = [1, 2, 3, 4, 5]
+        >>> text_values = ["alpha", "beta", "gamma", "delta", "epsilon"]
+        >>> s = pd.Series(text_values, index=int_values)
+        >>> s.info()
+        <class 'pandas.Series'>
+        Index: 5 entries, 1 to 5
+        Series name: None
+        Non-Null Count  Dtype
+        --------------  -----
+        5 non-null      str
+        dtypes: str(1)
+        memory usage: 106.0 bytes
+
+        Prints a summary excluding information about its values:
+
+        >>> s.info(verbose=False)
+        <class 'pandas.Series'>
+        Index: 5 entries, 1 to 5
+        dtypes: str(1)
+        memory usage: 106.0 bytes
+
+        Pipe output of Series.info to buffer instead of sys.stdout, get
+        buffer content and writes to a text file:
+
+        >>> import io
+        >>> buffer = io.StringIO()
+        >>> s.info(buf=buffer)
+        >>> s = buffer.getvalue()
+        >>> with open("df_info.txt", "w", encoding="utf-8") as f:  # doctest: +SKIP
+        ...     f.write(s)
+        260
+
+        The `memory_usage` parameter allows deep introspection mode, specially
+        useful for big Series and fine-tune memory optimization:
+
+        >>> random_strings_array = np.random.choice(["a", "b", "c"], 10**6)
+        >>> s = pd.Series(np.random.choice(["a", "b", "c"], 10**6))
+        >>> s.info()
+        <class 'pandas.Series'>
+        RangeIndex: 1000000 entries, 0 to 999999
+        Series name: None
+        Non-Null Count    Dtype
+        --------------    -----
+        1000000 non-null  str
+        dtypes: str(1)
+        memory usage: 8.6 MB
+
+        >>> s.info(memory_usage="deep")
+        <class 'pandas.Series'>
+        RangeIndex: 1000000 entries, 0 to 999999
+        Series name: None
+        Non-Null Count    Dtype
+        --------------    -----
+        1000000 non-null  str
+        dtypes: str(1)
+        memory usage: 8.6 MB
+        """
+        return SeriesInfo(self, memory_usage).render(
+            buf=buf,
+            max_cols=max_cols,
+            verbose=verbose,
+            show_counts=show_counts,
+        )
+
+    def memory_usage(self, index: bool = True, deep: bool = False) -> int:
+        """
+        Return the memory usage of the Series.
+
+        The memory usage can optionally include the contribution of
+        the index and of elements of `object` dtype.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Specifies whether to include the memory usage of the Series index.
+        deep : bool, default False
+            If True, introspect the data deeply by interrogating
+            `object` dtypes for system-level memory consumption, and include
+            it in the returned value.
+
+        Returns
+        -------
+        int
+            Bytes of memory consumed.
+
+        See Also
+        --------
+        numpy.ndarray.nbytes : Total bytes consumed by the elements of the
+            array.
+        DataFrame.memory_usage : Bytes consumed by a DataFrame.
+
+        Examples
+        --------
+        >>> s = pd.Series(range(3))
+        >>> s.memory_usage()
+        156
+
+        Not including the index gives the size of the rest of the data, which
+        is necessarily smaller:
+
+        >>> s.memory_usage(index=False)
+        24
+
+        The memory footprint of `object` values is ignored by default:
+
+        >>> s = pd.Series(["a", "b"])
+        >>> s.values
+        <ArrowStringArray>
+        ['a', 'b']
+        Length: 2, dtype: str
+        >>> s.memory_usage()
+        150
+        >>> s.memory_usage(deep=True)
+        150
+        """
+        v = self._memory_usage(deep=deep)
+        if index:
+            v += self.index.memory_usage(deep=deep)
+        return v
+
+    def isin(self, values) -> Series:
+        """
+        Whether elements in Series are contained in `values`.
+
+        Return a boolean Series showing whether each element in the Series
+        matches an element in the passed sequence of `values` exactly.
+
+        Parameters
+        ----------
+        values : set or list-like
+            The sequence of values to test. Passing in a single string will
+            raise a ``TypeError``. Instead, turn a single string into a
+            list of one element.
+
+        Returns
+        -------
+        Series
+            Series of booleans indicating if each element is in values.
+
+        Raises
+        ------
+        TypeError
+          * If `values` is a string
+
+        See Also
+        --------
+        DataFrame.isin : Equivalent method on DataFrame.
+
+        Examples
+        --------
+        >>> s = pd.Series(
+        ...     ["llama", "cow", "llama", "beetle", "llama", "hippo"], name="animal"
+        ... )
+        >>> s.isin(["cow", "llama"])
+        0     True
+        1     True
+        2     True
+        3    False
+        4     True
+        5    False
+        Name: animal, dtype: bool
+
+        To invert the boolean values, use the ``~`` operator:
+
+        >>> ~s.isin(["cow", "llama"])
+        0    False
+        1    False
+        2    False
+        3     True
+        4    False
+        5     True
+        Name: animal, dtype: bool
+
+        Passing a single string as ``s.isin('llama')`` will raise an error. Use
+        a list of one element instead:
+
+        >>> s.isin(["llama"])
+        0     True
+        1    False
+        2     True
+        3    False
+        4     True
+        5    False
+        Name: animal, dtype: bool
+
+        Strings and integers are distinct and are therefore not comparable:
+
+        >>> pd.Series([1]).isin(["1"])
+        0    False
+        dtype: bool
+        >>> pd.Series([1.1]).isin(["1.1"])
+        0    False
+        dtype: bool
+        """
+        result = algorithms.isin(self._values, values)
+        return self._constructor(result, index=self.index, copy=False).__finalize__(
+            self, method="isin"
+        )
+
+    def between(
+        self,
+        left,
+        right,
+        inclusive: Literal["both", "neither", "left", "right"] = "both",
+    ) -> Series:
+        """
+        Return boolean Series equivalent to left <= series <= right.
+
+        This function returns a boolean vector containing `True` wherever the
+        corresponding Series element is between the boundary values `left` and
+        `right`. NA values are treated as `False`.
+
+        Parameters
+        ----------
+        left : scalar or list-like
+            Left boundary.
+        right : scalar or list-like
+            Right boundary.
+        inclusive : {"both", "neither", "left", "right"}
+            Include boundaries. Whether to set each bound as closed or open.
+
+        Returns
+        -------
+        Series
+            Series representing whether each element is between left and
+            right (inclusive).
+
+        See Also
+        --------
+        Series.gt : Greater than of series and other.
+        Series.lt : Less than of series and other.
+
+        Notes
+        -----
+        This function is equivalent to ``(left <= ser) & (ser <= right)``
+
+        Examples
+        --------
+        >>> s = pd.Series([2, 0, 4, 8, np.nan])
+
+        Boundary values are included by default:
+
+        >>> s.between(1, 4)
+        0     True
+        1    False
+        2     True
+        3    False
+        4    False
+        dtype: bool
+
+        With `inclusive` set to ``"neither"`` boundary values are excluded:
+
+        >>> s.between(1, 4, inclusive="neither")
+        0     True
+        1    False
+        2    False
+        3    False
+        4    False
+        dtype: bool
+
+        `left` and `right` can be any scalar value:
+
+        >>> s = pd.Series(["Alice", "Bob", "Carol", "Eve"])
+        >>> s.between("Anna", "Daniel")
+        0    False
+        1     True
+        2     True
+        3    False
+        dtype: bool
+        """
+        if inclusive == "both":
+            lmask = self >= left
+            rmask = self <= right
+        elif inclusive == "left":
+            lmask = self >= left
+            rmask = self < right
+        elif inclusive == "right":
+            lmask = self > left
+            rmask = self <= right
+        elif inclusive == "neither":
+            lmask = self > left
+            rmask = self < right
+        else:
+            raise ValueError(
+                "Inclusive has to be either string of 'both',"
+                "'left', 'right', or 'neither'."
+            )
+
+        return lmask & rmask
+
+    def case_when(
+        self,
+        caselist: list[
+            tuple[
+                ArrayLike | Callable[[Series], Series | np.ndarray | Sequence[bool]],
+                ArrayLike | Scalar | Callable[[Series], Series | np.ndarray],
+            ],
+        ],
+    ) -> Series:
+        """
+        Replace values where the conditions are True.
+
+        .. versionadded:: 2.2.0
+
+        Parameters
+        ----------
+        caselist : A list of tuples of conditions and expected replacements
+            Takes the form:  ``(condition0, replacement0)``,
+            ``(condition1, replacement1)``, ... .
+            ``condition`` should be a 1-D boolean array-like object
+            or a callable. If ``condition`` is a callable,
+            it is computed on the Series
+            and should return a boolean Series or array.
+            The callable must not change the input Series
+            (though pandas doesn`t check it). ``replacement`` should be a
+            1-D array-like object, a scalar or a callable.
+            If ``replacement`` is a callable, it is computed on the Series
+            and should return a scalar or Series. The callable
+            must not change the input Series
+            (though pandas doesn`t check it).
+
+        Returns
+        -------
+        Series
+            A new Series with values replaced based on the provided conditions.
+
+        See Also
+        --------
+        Series.mask : Replace values where the condition is True.
+
+        Examples
+        --------
+        >>> c = pd.Series([6, 7, 8, 9], name="c")
+        >>> a = pd.Series([0, 0, 1, 2])
+        >>> b = pd.Series([0, 3, 4, 5])
+
+        >>> c.case_when(
+        ...     caselist=[
+        ...         (a.gt(0), a),  # condition, replacement
+        ...         (b.gt(0), b),
+        ...     ]
+        ... )
+        0    6
+        1    3
+        2    1
+        3    2
+        Name: c, dtype: int64
+        """
+        if not isinstance(caselist, list):
+            raise TypeError(
+                f"The caselist argument should be a list; instead got {type(caselist)}"
+            )
+
+        if not caselist:
+            raise ValueError(
+                "provide at least one boolean condition, "
+                "with a corresponding replacement."
+            )
+
+        for num, entry in enumerate(caselist):
+            if not isinstance(entry, tuple):
+                raise TypeError(
+                    f"Argument {num} must be a tuple; instead got {type(entry)}."
+                )
+            if len(entry) != 2:
+                raise ValueError(
+                    f"Argument {num} must have length 2; "
+                    "a condition and replacement; "
+                    f"instead got length {len(entry)}."
+                )
+        caselist = [
+            (
+                com.apply_if_callable(condition, self),
+                com.apply_if_callable(replacement, self),
+            )
+            for condition, replacement in caselist
+        ]
+        default = self.copy(deep=False)
+        conditions, replacements = zip(*caselist, strict=True)
+        common_dtypes = [infer_dtype_from(arg)[0] for arg in [*replacements, default]]
+        if len(set(common_dtypes)) > 1:
+            common_dtype = find_common_type(common_dtypes)
+            updated_replacements = []
+            for condition, replacement in zip(conditions, replacements, strict=True):
+                if is_scalar(replacement):
+                    replacement = construct_1d_arraylike_from_scalar(
+                        value=replacement, length=len(condition), dtype=common_dtype
+                    )
+                elif isinstance(replacement, ABCSeries):
+                    replacement = replacement.astype(common_dtype)
+                else:
+                    replacement = pd_array(replacement, dtype=common_dtype)
+                updated_replacements.append(replacement)
+            replacements = updated_replacements
+            default = default.astype(common_dtype)
+
+        counter = range(len(conditions) - 1, -1, -1)
+        for position, condition, replacement in zip(
+            counter, reversed(conditions), reversed(replacements), strict=True
+        ):
+            try:
+                default = default.mask(
+                    condition, other=replacement, axis=0, inplace=False, level=None
+                )
+            except Exception as error:
+                raise ValueError(
+                    f"Failed to apply condition{position} and replacement{position}."
+                ) from error
+        return default
+
+    # error: Cannot determine type of 'isna'
+    def isna(self) -> Series:
+        """
+        Detect missing values.
+
+        Return a boolean same-sized Series indicating if the values are NA.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to True
+        values.
+        Everything else gets mapped to False values. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+
+        Returns
+        -------
+        Series
+            Mask of bool values for each element in Series that
+            indicates whether an element is an NA value.
+
+        See Also
+        --------
+        DataFrame.isna : Detect missing values.
+        DataFrame.isnull : Alias of isna.
+        Series.notna : Boolean inverse of isna.
+        DataFrame.notna : Boolean inverse of isna.
+        Series.notnull : Alias of notna.
+        DataFrame.notnull : Alias of notna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        isna : Top-level isna.
+
+        Examples
+        --------
+        Show which entries in a Series are NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+        >>> ser.isna()
+        0    False
+        1    False
+        2     True
+        dtype: bool
+        """
+        return NDFrame.isna(self)
+
+    # error: Cannot determine type of 'isna'
+    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
+    def isnull(self) -> Series:
+        """
+        Series.isnull is an alias for Series.isna.
+        """
+        return super().isnull()
+
+    # error: Cannot determine type of 'notna'
+    def notna(self) -> Series:
+        """
+        Detect existing (non-missing) values.
+
+        Return a boolean same-sized Series indicating if the values are not NA.
+        Non-missing values get mapped to True. Characters such as empty
+        strings ``''`` or :attr:`numpy.inf` are not considered NA values.
+        NA values, such as None or :attr:`numpy.NaN`, get mapped to False
+        values.
+
+        Returns
+        -------
+        Series
+            Mask of bool values for each element in Series that
+            indicates whether an element is not an NA value.
+
+        See Also
+        --------
+        Series.isna : Detect missing values.
+        DataFrame.isna : Detect missing values.
+        Series.isnull : Alias of isna.
+        DataFrame.isnull : Alias of isna.
+        DataFrame.notna : Boolean inverse of isna.
+        DataFrame.notnull : Alias of notna.
+        Series.dropna : Omit axes labels with missing values.
+        DataFrame.dropna : Omit axes labels with missing values.
+        notna : Top-level notna.
+
+        Examples
+        --------
+        Show which entries in a Series are not NA.
+
+        >>> ser = pd.Series([5, 6, np.nan])
+        >>> ser
+        0    5.0
+        1    6.0
+        2    NaN
+        dtype: float64
+        >>> ser.notna()
+        0     True
+        1     True
+        2    False
+        dtype: bool
+        """
+        return super().notna()
+
+    # error: Cannot determine type of 'notna'
+    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
+    def notnull(self) -> Series:
+        """
+        Series.notnull is an alias for Series.notna.
+        """
+        return super().notnull()
+
+    @overload
+    def dropna(
+        self,
+        *,
+        axis: Axis = ...,
+        inplace: Literal[False] = ...,
+        how: AnyAll | None = ...,
+        ignore_index: bool = ...,
+    ) -> Series: ...
+
+    @overload
+    def dropna(
+        self,
+        *,
+        axis: Axis = ...,
+        inplace: Literal[True],
+        how: AnyAll | None = ...,
+        ignore_index: bool = ...,
+    ) -> None: ...
+
+    def dropna(
+        self,
+        *,
+        axis: Axis = 0,
+        inplace: bool = False,
+        how: AnyAll | None = None,
+        ignore_index: bool = False,
+    ) -> Series | None:
+        """
+        Return a new Series with missing values removed.
+
+        See the :ref:`User Guide <missing_data>` for more on which values are
+        considered missing, and how to work with missing data.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+        inplace : bool, default False
+            If True, do operation inplace and return None.
+        how : str, optional
+            Not in use. Kept for compatibility.
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        Series or None
+            Series with NA entries dropped from it or None if ``inplace=True``.
+
+        See Also
+        --------
+        Series.isna: Indicate missing values.
+        Series.notna : Indicate existing (non-missing) values.
+        Series.fillna : Replace missing values.
+        DataFrame.dropna : Drop rows or columns which contain NA values.
+        Index.dropna : Drop missing indices.
+
+        Examples
+        --------
+        >>> ser = pd.Series([1.0, 2.0, np.nan])
+        >>> ser
+        0    1.0
+        1    2.0
+        2    NaN
+        dtype: float64
+
+        Drop NA values from a Series.
+
+        >>> ser.dropna()
+        0    1.0
+        1    2.0
+        dtype: float64
+
+        Empty strings are not considered NA values. ``None`` is considered an
+        NA value.
+
+        >>> ser = pd.Series([np.nan, 2, pd.NaT, "", None, "I stay"])
+        >>> ser
+        0       NaN
+        1         2
+        2       NaT
+        3
+        4      None
+        5    I stay
+        dtype: object
+        >>> ser.dropna()
+        1         2
+        3
+        5    I stay
+        dtype: object
+        """
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
+        # Validate the axis parameter
+        self._get_axis_number(axis or 0)
+
+        if self._can_hold_na:
+            result = remove_na_arraylike(self)
+        elif not inplace:
+            result = self.copy(deep=False)
+        else:
+            result = self
+
+        if ignore_index:
+            result.index = default_index(len(result))
+
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result
+
+    # ----------------------------------------------------------------------
+    # Time series-oriented methods
+
+    def to_timestamp(
+        self,
+        freq: Frequency | None = None,
+        how: Literal["s", "e", "start", "end"] = "start",
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> Series:
+        """
+        Cast to DatetimeIndex of Timestamps, at *beginning* of period.
+
+        This can be changed to the *end* of the period, by specifying `how="e"`.
+
+        Parameters
+        ----------
+        freq : str, default frequency of PeriodIndex
+            Desired frequency.
+        how : {'s', 'e', 'start', 'end'}
+            Convention for converting period to timestamp; start of period
+            vs. end.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        Series with DatetimeIndex
+            Series with the PeriodIndex cast to DatetimeIndex.
+
+        See Also
+        --------
+        Series.to_period: Inverse method to cast DatetimeIndex to PeriodIndex.
+        DataFrame.to_timestamp: Equivalent method for DataFrame.
+
+        Examples
+        --------
+        >>> idx = pd.PeriodIndex(["2023", "2024", "2025"], freq="Y")
+        >>> s1 = pd.Series([1, 2, 3], index=idx)
+        >>> s1
+        2023    1
+        2024    2
+        2025    3
+        Freq: Y-DEC, dtype: int64
+
+        The resulting frequency of the Timestamps is `YearBegin`
+
+        >>> s1 = s1.to_timestamp()
+        >>> s1
+        2023-01-01    1
+        2024-01-01    2
+        2025-01-01    3
+        Freq: YS-JAN, dtype: int64
+
+        Using `freq` which is the offset that the Timestamps will have
+
+        >>> s2 = pd.Series([1, 2, 3], index=idx)
+        >>> s2 = s2.to_timestamp(freq="M")
+        >>> s2
+        2023-01-31    1
+        2024-01-31    2
+        2025-01-31    3
+        Freq: YE-JAN, dtype: int64
+        """
+        self._check_copy_deprecation(copy)
+        if not isinstance(self.index, PeriodIndex):
+            raise TypeError(f"unsupported Type {type(self.index).__name__}")
+
+        new_obj = self.copy(deep=False)
+        new_index = self.index.to_timestamp(freq=freq, how=how)
+        setattr(new_obj, "index", new_index)
+        return new_obj
+
+    def to_period(
+        self,
+        freq: str | None = None,
+        copy: bool | lib.NoDefault = lib.no_default,
+    ) -> Series:
+        """
+        Convert Series from DatetimeIndex to PeriodIndex.
+
+        Parameters
+        ----------
+        freq : str, default None
+            Frequency associated with the PeriodIndex.
+        copy : bool, default False
+            This keyword is now ignored; changing its value will have no
+            impact on the method.
+
+            .. deprecated:: 3.0.0
+
+                This keyword is ignored and will be removed in pandas 4.0. Since
+                pandas 3.0, this method always returns a new object using a lazy
+                copy mechanism that defers copies until necessary
+                (Copy-on-Write). See the `user guide on Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                for more details.
+
+        Returns
+        -------
+        Series
+            Series with index converted to PeriodIndex.
+
+        See Also
+        --------
+        DataFrame.to_period: Equivalent method for DataFrame.
+        Series.dt.to_period: Convert DateTime column values.
+
+        Examples
+        --------
+        >>> idx = pd.DatetimeIndex(["2023", "2024", "2025"])
+        >>> s = pd.Series([1, 2, 3], index=idx)
+        >>> s = s.to_period()
+        >>> s
+        2023    1
+        2024    2
+        2025    3
+        Freq: Y-DEC, dtype: int64
+
+        Viewing the index
+
+        >>> s.index
+        PeriodIndex(['2023', '2024', '2025'], dtype='period[Y-DEC]')
+        """
+        self._check_copy_deprecation(copy)
+        if not isinstance(self.index, DatetimeIndex):
+            raise TypeError(f"unsupported Type {type(self.index).__name__}")
+
+        new_obj = self.copy(deep=False)
+        new_index = self.index.to_period(freq=freq)
+        setattr(new_obj, "index", new_index)
+        return new_obj
+
+    # ----------------------------------------------------------------------
+    # Add index
+    _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index"]
+    _AXIS_LEN = len(_AXIS_ORDERS)
+    _info_axis_number: Literal[0] = 0
+    _info_axis_name: Literal["index"] = "index"
+
+    index = properties.AxisProperty(
+        axis=0,
+        doc="""
+        The index (axis labels) of the Series.
+
+        The index of a Series is used to label and identify each element of the
+        underlying data. The index can be thought of as an immutable ordered set
+        (technically a multi-set, as it may contain duplicate labels), and is
+        used to index and align data in pandas.
+
+        Returns
+        -------
+        Index
+            The index labels of the Series.
+
+        See Also
+        --------
+        Series.reindex : Conform Series to new index.
+        Index : The base pandas index type.
+
+        Notes
+        -----
+        For more information on pandas indexing, see the `indexing user guide
+        <https://pandas.pydata.org/docs/user_guide/indexing.html>`__.
+
+        Examples
+        --------
+        To create a Series with a custom index and view the index labels:
+
+        >>> cities = ['Kolkata', 'Chicago', 'Toronto', 'Lisbon']
+        >>> populations = [14.85, 2.71, 2.93, 0.51]
+        >>> city_series = pd.Series(populations, index=cities)
+        >>> city_series.index
+        Index(['Kolkata', 'Chicago', 'Toronto', 'Lisbon'], dtype='object')
+
+        To change the index labels of an existing Series:
+
+        >>> city_series.index = ['KOL', 'CHI', 'TOR', 'LIS']
+        >>> city_series.index
+        Index(['KOL', 'CHI', 'TOR', 'LIS'], dtype='object')
+        """,
+    )
+
+    # ----------------------------------------------------------------------
+    # Accessor Methods
+    # ----------------------------------------------------------------------
+    str = Accessor("str", StringMethods)
+    dt = Accessor("dt", CombinedDatetimelikeProperties)
+    cat = Accessor("cat", CategoricalAccessor)
+    plot = Accessor("plot", pandas.plotting.PlotAccessor)
+    sparse = Accessor("sparse", SparseAccessor)
+    struct = Accessor("struct", StructAccessor)
+    list = Accessor("list", ListAccessor)
+
+    # ----------------------------------------------------------------------
+    # Add plotting methods to Series
+    hist = pandas.plotting.hist_series
+
+    # ----------------------------------------------------------------------
+    # Template-Based Arithmetic/Comparison Methods
+
+    def _cmp_method(self, other, op):
+        res_name = ops.get_op_result_name(self, other)
+
+        if isinstance(other, Series) and not self._indexed_same(other):
+            raise ValueError("Can only compare identically-labeled Series objects")
+
+        lvalues = self._values
+        rvalues = extract_array(other, extract_numpy=True, extract_range=True)
+
+        res_values = ops.comparison_op(lvalues, rvalues, op)
+
+        return self._construct_result(res_values, name=res_name, other=other)
+
+    def _logical_method(self, other, op):
+        res_name = ops.get_op_result_name(self, other)
+        self, other = self._align_for_op(other, align_asobject=True)
+
+        lvalues = self._values
+        rvalues = extract_array(other, extract_numpy=True, extract_range=True)
+
+        res_values = ops.logical_op(lvalues, rvalues, op)
+        return self._construct_result(res_values, name=res_name, other=other)
+
+    def _arith_method(self, other, op):
+        self, other = self._align_for_op(other)
+        return base.IndexOpsMixin._arith_method(self, other, op)
+
+    def _align_for_op(self, right, align_asobject: bool = False):
+        """align lhs and rhs Series"""
+        # TODO: Different from DataFrame._align_for_op, list, tuple and ndarray
+        # are not coerced here
+        # because Series has inconsistencies described in GH#13637
+        left = self
+
+        if isinstance(right, Series):
+            # avoid repeated alignment
+            if not left.index.equals(right.index):
+                if align_asobject:
+                    if left.dtype not in (object, np.bool_) or right.dtype not in (
+                        object,
+                        np.bool_,
+                    ):
+                        pass
+                        # GH#52538 no longer cast in these cases
+                    else:
+                        # to keep original value's dtype for bool ops
+                        left = left.astype(object)
+                        right = right.astype(object)
+
+                left, right = left.align(right)
+
+        return left, right
+
+    def _binop(self, other: Series, func, level=None, fill_value=None) -> Series:
+        """
+        Perform generic binary operation with optional fill value.
+
+        Parameters
+        ----------
+        other : Series
+        func : binary operator
+        fill_value : float or object
+            Value to substitute for NA/null values. If both Series are NA in a
+            location, the result will be NA regardless of the passed fill value.
+        level : int or level name, default None
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+
+        Returns
+        -------
+        Series
+        """
+        this = self
+
+        if not self.index.equals(other.index):
+            this, other = self.align(other, level=level, join="outer")
+
+        this_vals, other_vals = ops.fill_binop(this._values, other._values, fill_value)
+
+        with np.errstate(all="ignore"):
+            result = func(this_vals, other_vals)
+
+        name = ops.get_op_result_name(self, other)
+
+        out = this._construct_result(result, name, other)
+        return cast(Series, out)
+
+    def _construct_result(
+        self,
+        result: ArrayLike | tuple[ArrayLike, ArrayLike],
+        name: Hashable,
+        other: AnyArrayLike | DataFrame,
+    ) -> Series | tuple[Series, Series]:
+        """
+        Construct an appropriately-labelled Series from the result of an op.
+
+        Parameters
+        ----------
+        result : ndarray or ExtensionArray
+        name : Label
+        other : Series, DataFrame or array-like
+
+        Returns
+        -------
+        Series
+            In the case of __divmod__ or __rdivmod__, a 2-tuple of Series.
+        """
+        if isinstance(result, tuple):
+            # produced by divmod or rdivmod
+
+            res1 = self._construct_result(result[0], name=name, other=other)
+            res2 = self._construct_result(result[1], name=name, other=other)
+
+            # GH#33427 assertions to keep mypy happy
+            assert isinstance(res1, Series)
+            assert isinstance(res2, Series)
+            return (res1, res2)
+
+        # TODO: result should always be ArrayLike, but this fails for some
+        #  JSONArray tests
+        dtype = getattr(result, "dtype", None)
+        out = self._constructor(result, index=self.index, dtype=dtype, copy=False)
+        out = out.__finalize__(self)
+        out = out.__finalize__(other)
+
+        # Set the result's name after __finalize__ is called because __finalize__
+        #  would set it back to self.name
+        out.name = name
+        return out
+
+    def _flex_method(self, other, op, *, level=None, fill_value=None, axis: Axis = 0):
+        if axis is not None:
+            self._get_axis_number(axis)
+
+        res_name = ops.get_op_result_name(self, other)
+
+        if isinstance(other, Series):
+            return self._binop(other, op, level=level, fill_value=fill_value)
+        elif isinstance(other, (np.ndarray, list, tuple, ExtensionArray)):
+            if len(other) != len(self):
+                raise ValueError("Lengths must be equal")
+            other = self._constructor(other, self.index, copy=False)
+            result = self._binop(other, op, level=level, fill_value=fill_value)
+            result._name = res_name
+            return result
+        elif isinstance(other, ABCDataFrame):
+            # GH#46179
+            raise TypeError(
+                f"Series.{op.__name__.strip('_')} does not support a DataFrame "
+                f"`other`. Use df.{op.__name__.strip('_')}(ser) instead."
+            )
+        else:
+            if fill_value is not None:
+                if isna(other):
+                    return op(self, fill_value)
+                self = self.fillna(fill_value)
+
+            return op(self, other)
+
+    def eq(
+        self,
+        other,
+        level: Level | None = None,
+        fill_value: float | None = None,
+        axis: Axis = 0,
+    ) -> Series:
+        """
+        Return Equal to of series and other, element-wise (binary operator `eq`).
+
+        Equivalent to ``series == other``, but with support to substitute a fill_value
+        for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : object
+            When a Series is provided, will align on indexes. For all other types,
+            will behave the same as ``==`` but with possibly different results due
+            to the other arguments.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.ge : Return elementwise Greater than or equal to of series and other.
+        Series.le : Return elementwise Less than or equal to of series and other.
+        Series.gt : Return elementwise Greater than of series and other.
+        Series.lt : Return elementwise Less than of series and other.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        dtype: float64
+        >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
+        >>> b
+        a    1.0
+        b    NaN
+        d    1.0
+        e    NaN
+        dtype: float64
+        >>> a.eq(b, fill_value=0)
+        a     True
+        b    False
+        c    False
+        d    False
+        e    False
+        dtype: bool
+        """
+        return self._flex_method(
+            other, operator.eq, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("ne", "series"))
+    def ne(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, operator.ne, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def le(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        """
+        Return Less than or equal to of series and other, \
+        element-wise (binary operator `le`).
+
+        Equivalent to ``series <= other``, but with support to substitute a
+        fill_value for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : object
+            When a Series is provided, will align on indexes. For all other types,
+            will behave the same as ``==`` but with possibly different results due
+            to the other arguments.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.ge : Return elementwise Greater than or equal to of series and other.
+        Series.lt : Return elementwise Less than of series and other.
+        Series.gt : Return elementwise Greater than of series and other.
+        Series.eq : Return elementwise equal to of series and other.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan, 1], index=['a', 'b', 'c', 'd', 'e'])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        e    1.0
+        dtype: float64
+        >>> b = pd.Series([0, 1, 2, np.nan, 1], index=['a', 'b', 'c', 'd', 'f'])
+        >>> b
+        a    0.0
+        b    1.0
+        c    2.0
+        d    NaN
+        f    1.0
+        dtype: float64
+        >>> a.le(b, fill_value=0)
+        a    False
+        b     True
+        c     True
+        d    False
+        e    False
+        f     True
+        dtype: bool
+        """
+        return self._flex_method(
+            other, operator.le, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("lt", "series"))
+    def lt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, operator.lt, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def ge(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        """
+        Return Greater than or equal to of series and other, \
+        element-wise (binary operator `ge`).
+
+        Equivalent to ``series >= other``, but with support to substitute a
+        fill_value for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : object
+            When a Series is provided, will align on indexes. For all other types,
+            will behave the same as ``==`` but with possibly different results due
+            to the other arguments.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.gt : Greater than comparison, element-wise.
+        Series.le : Less than or equal to comparison, element-wise.
+        Series.lt : Less than comparison, element-wise.
+        Series.eq : Equal to comparison, element-wise.
+        Series.ne : Not equal to comparison, element-wise.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan, 1], index=["a", "b", "c", "d", "e"])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        e    1.0
+        dtype: float64
+        >>> b = pd.Series([0, 1, 2, np.nan, 1], index=["a", "b", "c", "d", "f"])
+        >>> b
+        a    0.0
+        b    1.0
+        c    2.0
+        d    NaN
+        f    1.0
+        dtype: float64
+        >>> a.ge(b, fill_value=0)
+        a     True
+        b     True
+        c    False
+        d    False
+        e     True
+        f    False
+        dtype: bool
+        """
+        return self._flex_method(
+            other, operator.ge, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("gt", "series"))
+    def gt(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, operator.gt, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def add(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        """
+        Return Addition of series and other, element-wise (binary operator `add`).
+
+        Equivalent to ``series + other``, but with support to substitute a fill_value
+        for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : Series or scalar value
+            With which to compute the addition.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.radd : Reverse of the Addition operator, see
+            `Python documentation
+            <https://docs.python.org/3/reference/datamodel.html#emulating-numeric-types>`_
+            for more details.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        dtype: float64
+        >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
+        >>> b
+        a    1.0
+        b    NaN
+        d    1.0
+        e    NaN
+        dtype: float64
+        >>> a.add(b, fill_value=0)
+        a    2.0
+        b    1.0
+        c    1.0
+        d    1.0
+        e    NaN
+        dtype: float64
+        """
+        return self._flex_method(
+            other, operator.add, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("radd", "series"))
+    def radd(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.radd, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("sub", "series"))
+    def sub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, operator.sub, level=level, fill_value=fill_value, axis=axis
+        )
+
+    subtract = sub
+
+    @Appender(ops.make_flex_doc("rsub", "series"))
+    def rsub(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rsub, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def mul(
+        self,
+        other,
+        level: Level | None = None,
+        fill_value: float | None = None,
+        axis: Axis = 0,
+    ) -> Series:
+        """
+        Return Multiplication of series and other, element-wise (binary operator `mul`).
+
+        Equivalent to ``series * other``, but with support to substitute
+        a fill_value for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : Series or scalar value
+            With which to compute the multiplication.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.rmul : Reverse of the Multiplication operator, see
+            `Python documentation
+            <https://docs.python.org/3/reference/datamodel.html#emulating-numeric-types>`_
+            for more details.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        dtype: float64
+        >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
+        >>> b
+        a    1.0
+        b    NaN
+        d    1.0
+        e    NaN
+        dtype: float64
+        >>> a.multiply(b, fill_value=0)
+        a    1.0
+        b    0.0
+        c    0.0
+        d    0.0
+        e    NaN
+        dtype: float64
+        >>> a.mul(5, fill_value=0)
+        a    5.0
+        b    5.0
+        c    5.0
+        d    0.0
+        dtype: float64
+        """
+        return self._flex_method(
+            other, operator.mul, level=level, fill_value=fill_value, axis=axis
+        )
+
+    multiply = mul
+
+    @Appender(ops.make_flex_doc("rmul", "series"))
+    def rmul(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def truediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        """
+        Return Floating division of series and other, \
+        element-wise (binary operator `truediv`).
+
+        Equivalent to ``series / other``, but with support to substitute a
+        fill_value for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : Series or scalar value
+            Series with which to compute division.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.rtruediv : Reverse of the Floating division operator, see
+            `Python documentation
+            <https://docs.python.org/3/reference/datamodel.html#emulating-numeric-types>`_
+            for more details.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        dtype: float64
+        >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
+        >>> b
+        a    1.0
+        b    NaN
+        d    1.0
+        e    NaN
+        dtype: float64
+        >>> a.divide(b, fill_value=0)
+        a    1.0
+        b    inf
+        c    inf
+        d    0.0
+        e    NaN
+        dtype: float64
+        """
+        return self._flex_method(
+            other, operator.truediv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    div = truediv
+    divide = truediv
+
+    @Appender(ops.make_flex_doc("rtruediv", "series"))
+    def rtruediv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    rdiv = rtruediv
+
+    @Appender(ops.make_flex_doc("floordiv", "series"))
+    def floordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rfloordiv", "series"))
+    def rfloordiv(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def mod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        """
+        Return Modulo of series and other, element-wise (binary operator `mod`).
+
+        Equivalent to ``series % other``, but with support to substitute a
+        fill_value for missing data in either one of the inputs.
+
+        Parameters
+        ----------
+        other : Series or scalar value
+            Series with which to compute modulo.
+        level : int or name
+            Broadcast across a level, matching Index values on the
+            passed MultiIndex level.
+        fill_value : None or float value, default None (NaN)
+            Fill existing missing (NaN) values, and any new element needed for
+            successful Series alignment, with this value before computation.
+            If data in both corresponding Series locations is missing
+            the result of filling (at that location) will be missing.
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
+
+        Returns
+        -------
+        Series
+            The result of the operation.
+
+        See Also
+        --------
+        Series.rmod : Reverse of the Modulo operator, see
+            `Python documentation
+            <https://docs.python.org/3/reference/datamodel.html#emulating-numeric-types>`_
+            for more details.
+
+        Examples
+        --------
+        >>> a = pd.Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
+        >>> a
+        a    1.0
+        b    1.0
+        c    1.0
+        d    NaN
+        dtype: float64
+        >>> b = pd.Series([1, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
+        >>> b
+        a    1.0
+        b    NaN
+        d    1.0
+        e    NaN
+        dtype: float64
+        >>> a.mod(b, fill_value=0)
+        a    0.0
+        b    NaN
+        c    NaN
+        d    0.0
+        e    NaN
+        dtype: float64
+        """
+        return self._flex_method(
+            other, operator.mod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rmod", "series"))
+    def rmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rmod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("pow", "series"))
+    def pow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, operator.pow, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rpow", "series"))
+    def rpow(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rpow, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("divmod", "series"))
+    def divmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, divmod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    @Appender(ops.make_flex_doc("rdivmod", "series"))
+    def rdivmod(self, other, level=None, fill_value=None, axis: Axis = 0) -> Series:
+        return self._flex_method(
+            other, roperator.rdivmod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    # ----------------------------------------------------------------------
+    # Reductions
+
+    def _reduce(
+        self,
+        op,
+        # error: Variable "pandas.core.series.Series.str" is not valid as a type
+        name: str,  # type: ignore[valid-type]
+        *,
+        axis: Axis = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        filter_type=None,
+        **kwds,
+    ):
+        """
+        Perform a reduction operation.
+
+        If we have an ndarray as a value, then simply perform the operation,
+        otherwise delegate to the object.
+        """
+        delegate = self._values
+
+        if axis is not None:
+            self._get_axis_number(axis)
+
+        if isinstance(delegate, ExtensionArray):
+            # dispatch to ExtensionArray interface
+            result = delegate._reduce(name, skipna=skipna, **kwds)
+
+        else:
+            # dispatch to numpy arrays
+            if numeric_only and self.dtype.kind not in "iufcb":
+                # i.e. not is_numeric_dtype(self.dtype)
+                kwd_name = "numeric_only"
+                if name in ["any", "all"]:
+                    kwd_name = "bool_only"
+                # GH#47500 - change to TypeError to match other methods
+                raise TypeError(
+                    f"Series.{name} does not allow {kwd_name}={numeric_only} "
+                    "with non-numeric dtypes."
+                )
+            result = op(delegate, skipna=skipna, **kwds)
+
+        result = maybe_unbox_numpy_scalar(result)
+        return result
+
+    # error: Signature of "any" incompatible with supertype "NDFrame"
+    def any(  # type: ignore[override]
+        self,
+        *,
+        axis: Axis = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> bool:
+        """
+        Return whether any element is True, potentially over an axis.
+
+        Returns False unless there is at least one element within a series or
+        along a Dataframe axis that is True or equivalent (e.g. non-zero or
+        non-empty).
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Indicate which axis or axes should be reduced. For `Series` this parameter
+            is unused and defaults to 0.
+
+            * 0 / 'index' : reduce the index, return a Series whose index is the
+              original column labels.
+            * 1 / 'columns' : reduce the columns, return a Series whose index is the
+              original index.
+            * None : reduce all axes, return a scalar.
+
+        bool_only : bool, default False
+            Include only boolean columns. Not implemented for Series.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire row/column is NA and skipna is
+            True, then the result will be False, as for an empty row/column.
+            If skipna is False, then NA are treated as True, because these are not
+            equal to zero.
+        **kwargs : any, default None
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or scalar
+            If axis=None, then a scalar boolean is returned.
+            Otherwise a Series is returned with index matching the index argument.
+
+        See Also
+        --------
+        numpy.any : Numpy version of this method.
+        Series.any : Return whether any element is True.
+        Series.all : Return whether all elements are True.
+        DataFrame.any : Return whether any element is True over requested axis.
+        DataFrame.all : Return whether all elements are True over requested axis.
+
+        Examples
+        --------
+        **Series**
+
+        For Series input, the output is a scalar indicating whether any element
+        is True.
+
+        >>> pd.Series([False, False]).any()
+        False
+        >>> pd.Series([True, False]).any()
+        True
+        >>> pd.Series([], dtype="float64").any()
+        False
+        >>> pd.Series([np.nan]).any()
+        False
+        >>> pd.Series([np.nan]).any(skipna=False)
+        True
+
+        **DataFrame**
+
+        Whether each column contains at least one True element (the default).
+
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
+        >>> df
+           A  B  C
+        0  1  0  0
+        1  2  2  0
+
+        >>> df.any()
+        A     True
+        B     True
+        C    False
+        dtype: bool
+
+        Aggregating over the columns.
+
+        >>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
+        >>> df
+               A  B
+        0   True  1
+        1  False  2
+
+        >>> df.any(axis="columns")
+        0    True
+        1    True
+        dtype: bool
+
+        >>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
+        >>> df
+               A  B
+        0   True  1
+        1  False  0
+
+        >>> df.any(axis="columns")
+        0    True
+        1    False
+        dtype: bool
+
+        Aggregating over the entire DataFrame with ``axis=None``.
+
+        >>> df.any(axis=None)
+        True
+
+        `any` for an empty DataFrame is an empty Series.
+
+        >>> pd.DataFrame([]).any()
+        Series([], dtype: bool)
+        """
+        nv.validate_logical_func((), kwargs, fname="any")
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
+        return self._reduce(
+            nanops.nanany,
+            name="any",
+            axis=axis,
+            numeric_only=bool_only,
+            skipna=skipna,
+            filter_type="bool",
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="all")
+    def all(
+        self,
+        axis: Axis = 0,
+        bool_only: bool = False,
+        skipna: bool = True,
+        **kwargs,
+    ) -> bool:
+        """
+        Return whether all elements are True, potentially over an axis.
+
+        Returns True unless there at least one element within a series or
+        along a Dataframe axis that is False or equivalent (e.g. zero or
+        empty).
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns', None}, default 0
+            Indicate which axis or axes should be reduced. For `Series` this parameter
+            is unused and defaults to 0.
+
+            * 0 / 'index' : reduce the index, return a Series whose index is the
+              original column labels.
+            * 1 / 'columns' : reduce the columns, return a Series whose index is the
+              original index.
+            * None : reduce all axes, return a scalar.
+
+        bool_only : bool, default False
+            Include only boolean columns. Not implemented for Series.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire row/column is NA and skipna is
+            True, then the result will be True, as for an empty row/column.
+            If skipna is False, then NA are treated as True, because these are not
+            equal to zero.
+        **kwargs : any, default None
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series or scalar
+            If axis=None, then a scalar boolean is returned.
+            Otherwise a Series is returned with index matching the index argument.
+
+        See Also
+        --------
+        Series.all : Return True if all elements are True.
+        DataFrame.any : Return True if one (or more) elements are True.
+
+        Examples
+        --------
+        **Series**
+
+        >>> pd.Series([True, True]).all()
+        True
+        >>> pd.Series([True, False]).all()
+        False
+        >>> pd.Series([], dtype="float64").all()
+        True
+        >>> pd.Series([np.nan]).all()
+        True
+        >>> pd.Series([np.nan]).all(skipna=False)
+        True
+
+        **DataFrames**
+
+        Create a DataFrame from a dictionary.
+
+        >>> df = pd.DataFrame({"col1": [True, True], "col2": [True, False]})
+        >>> df
+           col1   col2
+        0  True   True
+        1  True  False
+
+        Default behaviour checks if values in each column all return True.
+
+        >>> df.all()
+        col1     True
+        col2    False
+        dtype: bool
+
+        Specify ``axis='columns'`` to check if values in each row all return True.
+
+        >>> df.all(axis="columns")
+        0     True
+        1    False
+        dtype: bool
+
+        Or ``axis=None`` for whether every value is True.
+
+        >>> df.all(axis=None)
+        False
+        """
+        nv.validate_logical_func((), kwargs, fname="all")
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
+        return self._reduce(
+            nanops.nanall,
+            name="all",
+            axis=axis,
+            numeric_only=bool_only,
+            skipna=skipna,
+            filter_type="bool",
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="min")
+    def min(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return the minimum of the values over the requested axis.
+
+        If you want the *index* of the minimum, use ``idxmin``.
+        This is the equivalent of the ``numpy.ndarray`` method ``argmin``.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            The minimum of the values in the Series.
+
+        See Also
+        --------
+        numpy.min : Equivalent numpy function for arrays.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.min()
+        0
+        """
+        return NDFrame.min(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="max")
+    def max(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return the maximum of the values over the requested axis.
+
+        If you want the *index* of the maximum, use ``idxmax``.
+        This is the equivalent of the ``numpy.ndarray`` method ``argmax``.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            The maximum of the values in the Series.
+
+        See Also
+        --------
+        numpy.max : Equivalent numpy function for arrays.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.max()
+        8
+        """
+        return NDFrame.max(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="sum")
+    def sum(
+        self,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ):
+        """
+        Return the sum of the values over the requested axis.
+
+        This is equivalent to the method ``numpy.sum``.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.sum with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer than
+            ``min_count`` non-NA values are present the result will be NA.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Sum of the values for the requested axis.
+
+        See Also
+        --------
+        numpy.sum : Equivalent numpy function for computing sum.
+        Series.mean : Mean of the values.
+        Series.median : Median of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> idx = pd.MultiIndex.from_arrays(
+        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
+        ...     names=["blooded", "animal"],
+        ... )
+        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
+        >>> s
+        blooded  animal
+        warm     dog       4
+                 falcon    2
+        cold     fish      0
+                 spider    8
+        Name: legs, dtype: int64
+
+        >>> s.sum()
+        14
+
+        By default, the sum of an empty or all-NA Series is ``0``.
+
+        >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
+        0.0
+
+        This can be controlled with the ``min_count`` parameter. For example, if
+        you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
+
+        >>> pd.Series([], dtype="float64").sum(min_count=1)
+        nan
+
+        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+        empty series identically.
+
+        >>> pd.Series([np.nan]).sum()
+        0.0
+
+        >>> pd.Series([np.nan]).sum(min_count=1)
+        nan
+        """
+        return NDFrame.sum(
+            self,
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="prod")
+    def prod(
+        self,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        min_count: int = 0,
+        **kwargs,
+    ):
+        """
+        Return the product of the values over the requested axis.
+
+        By default, missing values are skipped. To include them in the calculation,
+        set ``skipna`` parameter to False.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+                The behavior of DataFrame.prod with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+            .. versionadded:: 2.0.0
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        min_count : int, default 0
+            The required number of valid values to perform the operation. If fewer than
+            ``min_count`` non-NA values are present the result will be NA.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar
+            Value containing the calculation referenced in the description.
+
+        See Also
+        --------
+        Series.sum : Return the sum.
+        Series.min : Return the minimum.
+        Series.max : Return the maximum.
+        Series.idxmin : Return the index of the minimum.
+        Series.idxmax : Return the index of the maximum.
+
+        DataFrame.sum : Return the sum over the requested axis.
+        DataFrame.min : Return the minimum over the requested axis.
+        DataFrame.max : Return the maximum over the requested axis.
+        DataFrame.idxmin : Return the index of the minimum over the requested axis.
+        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+
+        Examples
+        --------
+        By default, the product of an empty or all-NA Series is ``1``
+
+        >>> pd.Series([], dtype="float64").prod()
+        1.0
+
+        This can be controlled with the ``min_count`` parameter
+
+        >>> pd.Series([], dtype="float64").prod(min_count=1)
+        nan
+
+        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
+        empty series identically.
+
+        >>> pd.Series([np.nan]).prod()
+        1.0
+        >>> pd.Series([np.nan]).prod(min_count=1)
+        nan
+        """
+        return NDFrame.prod(
+            self,
+            axis=axis,
+            skipna=skipna,
+            numeric_only=numeric_only,
+            min_count=min_count,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="mean")
+    def mean(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Any:
+        """
+        Return the mean of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Mean of the values for the requested axis.
+
+        See Also
+        --------
+        numpy.median : Equivalent numpy function for computing median.
+        Series.sum : Sum of the values.
+        Series.median : Median of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.mean()
+        2.0
+        """
+        return NDFrame.mean(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    @deprecate_nonkeyword_arguments(
+        Pandas4Warning, allowed_args=["self"], name="median"
+    )
+    def median(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ) -> Any:
+        """
+        Return the median of the values over the requested axis.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Median of the values for the requested axis.
+
+        See Also
+        --------
+        numpy.median : Equivalent numpy function for computing median.
+        Series.sum : Sum of the values.
+        Series.median : Median of the values.
+        Series.std : Standard deviation of the values.
+        Series.var : Variance of the values.
+        Series.min : Minimum value.
+        Series.max : Maximum value.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.median()
+        2.0
+
+        With a DataFrame
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])
+        >>> df
+               a   b
+        tiger  1   2
+        zebra  2   3
+        >>> df.median()
+        a   1.5
+        b   2.5
+        dtype: float64
+
+        Using axis=1
+
+        >>> df.median(axis=1)
+        tiger   1.5
+        zebra   2.5
+        dtype: float64
+
+        In this case, `numeric_only` should be set to `True`
+        to avoid getting an error.
+
+        >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])
+        >>> df.median(numeric_only=True)
+        a   1.5
+        dtype: float64
+        """
+        return NDFrame.median(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="sem")
+    def sem(
+        self,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return unbiased standard error of the mean over requested axis.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        **kwargs :
+            Additional keywords have no effect but might be accepted
+            for compatibility with NumPy.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Unbiased standard error of the mean over requested axis.
+
+        See Also
+        --------
+        scipy.stats.sem : Compute standard error of the mean.
+        Series.std : Return sample standard deviation over requested axis.
+        Series.var : Return unbiased variance over requested axis.
+        Series.mean : Return the mean of the values over the requested axis.
+        Series.median : Return the median of the values over the requested axis.
+        Series.mode : Return the mode(s) of the Series.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> round(s.sem(), 6)
+        0.57735
+        """
+        return NDFrame.sem(
+            self,
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="var")
+    def var(
+        self,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return unbiased variance over requested axis.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            For `Series` this parameter is unused and defaults to 0.
+
+            .. warning::
+
+                The behavior of DataFrame.var with ``axis=None`` is deprecated,
+                in a future version this will reduce over both axes and return a scalar
+                To retain the old behavior, pass axis=0 (or do not pass axis).
+
+        skipna : bool, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns. Not implemented for Series.
+        **kwargs :
+            Additional keywords passed.
+
+        Returns
+        -------
+        scalar or Series (if level specified)
+            Unbiased variance over requested axis.
+
+        See Also
+        --------
+        numpy.var : Equivalent function in NumPy.
+        Series.std : Returns the standard deviation of the Series.
+        DataFrame.var : Returns the variance of the DataFrame.
+        DataFrame.std : Return standard deviation of the values over
+            the requested axis.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "person_id": [0, 1, 2, 3],
+        ...         "age": [21, 25, 62, 43],
+        ...         "height": [1.61, 1.87, 1.49, 2.01],
+        ...     }
+        ... ).set_index("person_id")
+        >>> df
+                   age  height
+        person_id
+        0           21    1.61
+        1           25    1.87
+        2           62    1.49
+        3           43    2.01
+
+        >>> df.var()
+        age       352.916667
+        height      0.056367
+        dtype: float64
+
+        Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
+
+        >>> df.var(ddof=0)
+        age       264.687500
+        height      0.042275
+        dtype: float64
+        """
+        return NDFrame.var(
+            self,
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="std")
+    def std(
+        self,
+        axis: Axis | None = None,
+        skipna: bool = True,
+        ddof: int = 1,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return sample standard deviation.
+
+        Normalized by N-1 by default. This can be changed using the ddof argument.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If Series is NA, the result
+            will be NA.
+        ddof : int, default 1
+            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
+            where N represents the number of elements.
+        numeric_only : bool, default False
+            Not implemented for Series.
+        **kwargs :
+            Additional keywords have no effect but might be accepted
+            for compatibility with NumPy.
+
+        Returns
+        -------
+        scalar
+            Standard deviation over all values in the Series.
+
+        See Also
+        --------
+        numpy.std : Compute the standard deviation along the specified axis.
+        Series.var : Return unbiased variance over requested axis.
+        Series.sem : Return unbiased standard error of the mean over requested axis.
+        Series.mean : Return the mean of the values over the requested axis.
+        Series.median : Return the median of the values over the requested axis.
+        Series.mode : Return the mode(s) of the Series.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.std()
+        1.0
+
+        Alternatively, ``ddof=0`` can be set to normalize by $N$ instead of $N-1$:
+
+        >>> s.std(ddof=0)
+        0.816496580927726
+        """
+        return NDFrame.std(
+            self,
+            axis=axis,
+            skipna=skipna,
+            ddof=ddof,
+            numeric_only=numeric_only,
+            **kwargs,
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="skew")
+    def skew(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return unbiased skew over requested axis.
+
+        Normalized by N-1.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Unused.
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar
+            Unbiased skew of the Series.
+
+        See Also
+        --------
+
+        Series.var : Return unbiased variance over requested axis.
+        Series.std : Return unbiased standard deviation over requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 3])
+        >>> s.skew()
+        0.0
+        """
+        return NDFrame.skew(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    @deprecate_nonkeyword_arguments(Pandas4Warning, allowed_args=["self"], name="kurt")
+    def kurt(
+        self,
+        axis: Axis | None = 0,
+        skipna: bool = True,
+        numeric_only: bool = False,
+        **kwargs,
+    ):
+        """
+        Return unbiased kurtosis over requested axis.
+
+        Kurtosis obtained using Fisher's definition of
+        kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
+
+        Parameters
+        ----------
+        axis : {index (0)}
+            Axis for the function to be applied on.
+            For `Series` this parameter is unused and defaults to 0.
+
+            For DataFrames, specifying ``axis=None`` will apply the aggregation
+            across both axes.
+
+            .. versionadded:: 2.0.0
+
+        skipna : bool, default True
+            Exclude NA/null values when computing the result.
+        numeric_only : bool, default False
+            Include only float, int, boolean columns.
+
+        **kwargs
+            Additional keyword arguments to be passed to the function.
+
+        Returns
+        -------
+        scalar
+            Unbiased kurtosis.
+
+        See Also
+        --------
+        Series.skew : Return unbiased skew over requested axis.
+        Series.var : Return unbiased variance over requested axis.
+        Series.std : Return unbiased standard deviation over requested axis.
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"])
+        >>> s
+        cat    1
+        dog    2
+        dog    2
+        mouse  3
+        dtype: int64
+        >>> s.kurt()
+        1.5
+        """
+        return NDFrame.kurt(
+            self, axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+        )
+
+    kurtosis = kurt
+    product = prod
+
+    def cummin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        """
+        Return cumulative minimum over a Series.
+
+        Returns a Series of the same size containing the cumulative
+        minimum.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}, default 0
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            If the entire series is NA, the result will be NA.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series
+            Return cumulative minimum of the Series.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.min : Similar functionality
+            but ignores ``NaN`` values.
+        Series.min : Return the minimum value of the Series.
+        Series.cummax : Return cumulative maximum.
+        Series.cumsum : Return cumulative sum.
+        Series.cumprod : Return cumulative product.
+
+        Examples
+        --------
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cummin()
+        0    2.0
+        1    NaN
+        2    2.0
+        3   -1.0
+        4   -1.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cummin(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+        """
+        return NDFrame.cummin(self, axis, skipna, *args, **kwargs)
+
+    def cummax(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        """
+        Return cumulative maximum over a Series.
+
+        Returns a Series of the same size containing the cumulative
+        maximum.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}, default 0
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If the series is NA, the result is NA.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series
+            Return cumulative maximum of Series.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.max : Similar functionality
+            but ignores ``NaN`` values.
+        Series.max : Return the maximum over a Series.
+        Series.cummin : Return cumulative minimum.
+        Series.cumsum : Return cumulative sum.
+        Series.cumprod : Return cumulative product.
+
+        Examples
+        --------
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cummax()
+        0    2.0
+        1    NaN
+        2    5.0
+        3    5.0
+        4    5.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cummax(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+        """
+        return NDFrame.cummax(self, axis, skipna, *args, **kwargs)
+
+    def cumsum(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        """
+        Return cumulative sum over a Series.
+
+        Returns a Series of the same size containing the cumulative sum.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}, default 0
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If entire series is NA, the result will be NA.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series
+            Return cumulative sum of Series.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.sum : Similar functionality
+            but ignores ``NaN`` values.
+        Series.sum : Return the sum over Series.
+        Series.cummax : Return cumulative maximum.
+        Series.cummin : Return cumulative minimum.
+        Series.cumprod : Return cumulative product.
+
+        Examples
+        --------
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cumsum()
+        0    2.0
+        1    NaN
+        2    7.0
+        3    6.0
+        4    6.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cumsum(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+        """
+        return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)
+
+    def cumprod(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Self:
+        """
+        Return cumulative product over a Series.
+
+        Returns a Series of the same size containing the cumulative
+        product.
+
+        Parameters
+        ----------
+        axis : {0 or 'index'}, default 0
+            This parameter is unused and defaults to 0.
+        skipna : bool, default True
+            Exclude NA/null values. If entire Series is NA, the result will be NA.
+        *args, **kwargs
+            Additional keywords have no effect but might be accepted for
+            compatibility with NumPy.
+
+        Returns
+        -------
+        Series
+            Return cumulative product of Series.
+
+        See Also
+        --------
+        core.window.expanding.Expanding.prod : Similar functionality
+            but ignores ``NaN`` values.
+        Series.prod : Return the product over Series.
+        Series.cummax : Return cumulative maximum.
+        Series.cummin : Return cumulative minimum.
+        Series.cumsum : Return cumulative sum.
+
+        Examples
+        --------
+        >>> s = pd.Series([2, np.nan, 5, -1, 0])
+        >>> s
+        0    2.0
+        1    NaN
+        2    5.0
+        3   -1.0
+        4    0.0
+        dtype: float64
+
+        By default, NA values are ignored.
+
+        >>> s.cumprod()
+        0     2.0
+        1     NaN
+        2    10.0
+        3   -10.0
+        4    -0.0
+        dtype: float64
+
+        To include NA values in the operation, use ``skipna=False``
+
+        >>> s.cumprod(skipna=False)
+        0    2.0
+        1    NaN
+        2    NaN
+        3    NaN
+        4    NaN
+        dtype: float64
+        """
+        return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)
diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f91443b8cda655d07f9cf2dc1aedb8bcce873da
--- /dev/null
+++ b/pandas/core/shared_docs.py
@@ -0,0 +1,639 @@
+from __future__ import annotations
+
+_shared_docs: dict[str, str] = {}
+
+_shared_docs["aggregate"] = """
+Aggregate using one or more operations over the specified axis.
+
+Parameters
+----------
+func : function, str, list or dict
+    Function to use for aggregating the data. If a function, must either
+    work when passed a {klass} or when passed to {klass}.apply.
+
+    Accepted combinations are:
+
+    - function
+    - string function name
+    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
+    - dict of axis labels -> functions, function names or list of such.
+{axis}
+*args
+    Positional arguments to pass to `func`.
+**kwargs
+    Keyword arguments to pass to `func`.
+
+Returns
+-------
+scalar, Series or DataFrame
+
+    The return can be:
+
+    * scalar : when Series.agg is called with single function
+    * Series : when DataFrame.agg is called with a single function
+    * DataFrame : when DataFrame.agg is called with several functions
+{see_also}
+Notes
+-----
+The aggregation operations are always performed over an axis, either the
+index (default) or the column axis. This behavior is different from
+`numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
+`var`), where the default is to compute the aggregation of the flattened
+array, e.g., ``numpy.mean(arr_2d)`` as opposed to
+``numpy.mean(arr_2d, axis=0)``.
+
+`agg` is an alias for `aggregate`. Use the alias.
+
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+for more details.
+
+A passed user-defined-function will be passed a Series for evaluation.
+
+If ``func`` defines an index relabeling, ``axis`` must be ``0`` or ``index``.
+{examples}"""
+
+_shared_docs["compare"] = """
+Compare to another {klass} and show the differences.
+
+Parameters
+----------
+other : {klass}
+    Object to compare with.
+
+align_axis : {{0 or 'index', 1 or 'columns'}}, default 1
+    Determine which axis to align the comparison on.
+
+    * 0, or 'index' : Resulting differences are stacked vertically
+      with rows drawn alternately from self and other.
+    * 1, or 'columns' : Resulting differences are aligned horizontally
+      with columns drawn alternately from self and other.
+
+keep_shape : bool, default False
+    If true, all rows and columns are kept.
+    Otherwise, only the ones with different values are kept.
+
+keep_equal : bool, default False
+    If true, the result keeps values that are equal.
+    Otherwise, equal values are shown as NaNs.
+
+result_names : tuple, default ('self', 'other')
+    Set the dataframes names in the comparison.
+"""
+
+_shared_docs["groupby"] = """
+Group %(klass)s using a mapper or by a Series of columns.
+
+A groupby operation involves some combination of splitting the
+object, applying a function, and combining the results. This can be
+used to group large amounts of data and compute operations on these
+groups.
+
+Parameters
+----------
+by : mapping, function, label, pd.Grouper or list of such
+    Used to determine the groups for the groupby.
+    If ``by`` is a function, it's called on each value of the object's
+    index. If a dict or Series is passed, the Series or dict VALUES
+    will be used to determine the groups (the Series' values are first
+    aligned; see ``.align()`` method). If a list or ndarray of length
+    equal to the selected axis is passed (see the `groupby user guide
+    <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#splitting-an-object-into-groups>`_),
+    the values are used as-is to determine the groups. A label or list
+    of labels may be passed to group by the columns in ``self``.
+    Notice that a tuple is interpreted as a (single) key.
+level : int, level name, or sequence of such, default None
+    If the axis is a MultiIndex (hierarchical), group by a particular
+    level or levels. Do not specify both ``by`` and ``level``.
+as_index : bool, default True
+    Return object with group labels as the
+    index. Only relevant for DataFrame input. as_index=False is
+    effectively "SQL-style" grouped output. This argument has no effect
+    on filtrations (see the `filtrations in the user guide
+    <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration>`_),
+    such as ``head()``, ``tail()``, ``nth()`` and in transformations
+    (see the `transformations in the user guide
+    <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation>`_).
+sort : bool, default True
+    Sort group keys. Get better performance by turning this off.
+    Note this does not influence the order of observations within each
+    group. Groupby preserves the order of rows within each group. If False,
+    the groups will appear in the same order as they did in the original DataFrame.
+    This argument has no effect on filtrations (see the `filtrations in the user guide
+    <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#filtration>`_),
+    such as ``head()``, ``tail()``, ``nth()`` and in transformations
+    (see the `transformations in the user guide
+    <https://pandas.pydata.org/docs/dev/user_guide/groupby.html#transformation>`_).
+
+    .. versionchanged:: 2.0.0
+
+        Specifying ``sort=False`` with an ordered categorical grouper will no
+        longer sort the values.
+
+group_keys : bool, default True
+    When calling apply and the ``by`` argument produces a like-indexed
+    (i.e. :ref:`a transform <groupby.transform>`) result, add group keys to
+    index to identify pieces. By default group keys are not included
+    when the result's index (and column) labels match the inputs, and
+    are included otherwise.
+
+    .. versionchanged:: 2.0.0
+
+       ``group_keys`` now defaults to ``True``.
+
+observed : bool, default True
+    This only applies if any of the groupers are Categoricals.
+    If True: only show observed values for categorical groupers.
+    If False: show all values for categorical groupers.
+
+    .. versionchanged:: 3.0.0
+
+        The default value is now ``True``.
+
+dropna : bool, default True
+    If True, and if group keys contain NA values, NA values together
+    with row/column will be dropped.
+    If False, NA values will also be treated as the key in groups.
+
+Returns
+-------
+pandas.api.typing.%(klass)sGroupBy
+    Returns a groupby object that contains information about the groups.
+
+See Also
+--------
+resample : Convenience method for frequency conversion and resampling
+    of time series.
+
+Notes
+-----
+See the `user guide
+<https://pandas.pydata.org/pandas-docs/stable/groupby.html>`__ for more
+detailed usage and examples, including splitting an object into groups,
+iterating through groups, selecting a group, aggregation, and more.
+
+The implementation of groupby is hash-based, meaning in particular that
+objects that compare as equal will be considered to be in the same group.
+An exception to this is that pandas has special handling of NA values:
+any NA values will be collapsed to a single group, regardless of how
+they compare. See the user guide linked above for more details.
+"""
+
+_shared_docs["transform"] = """
+Call ``func`` on self producing a {klass} with the same axis shape as self.
+
+Parameters
+----------
+func : function, str, list-like or dict-like
+    Function to use for transforming the data. If a function, must either
+    work when passed a {klass} or when passed to {klass}.apply. If func
+    is both list-like and dict-like, dict-like behavior takes precedence.
+
+    Accepted combinations are:
+
+    - function
+    - string function name
+    - list-like of functions and/or function names, e.g. ``[np.exp, 'sqrt']``
+    - dict-like of axis labels -> functions, function names or list-like of such.
+{axis}
+*args
+    Positional arguments to pass to `func`.
+**kwargs
+    Keyword arguments to pass to `func`.
+
+Returns
+-------
+{klass}
+    A {klass} that must have the same length as self.
+
+Raises
+------
+ValueError : If the returned {klass} has a different length than self.
+
+See Also
+--------
+{klass}.agg : Only perform aggregating type operations.
+{klass}.apply : Invoke function on a {klass}.
+
+Notes
+-----
+Functions that mutate the passed object can produce unexpected
+behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+for more details.
+
+Examples
+--------
+>>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}})
+>>> df
+   A  B
+0  0  1
+1  1  2
+2  2  3
+>>> df.transform(lambda x: x + 1)
+   A  B
+0  1  2
+1  2  3
+2  3  4
+
+Even though the resulting {klass} must have the same length as the
+input {klass}, it is possible to provide several input functions:
+
+>>> s = pd.Series(range(3))
+>>> s
+0    0
+1    1
+2    2
+dtype: int64
+>>> s.transform([np.sqrt, np.exp])
+       sqrt        exp
+0  0.000000   1.000000
+1  1.000000   2.718282
+2  1.414214   7.389056
+
+You can call transform on a GroupBy object:
+
+>>> df = pd.DataFrame({{
+...     "Date": [
+...         "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05",
+...         "2015-05-08", "2015-05-07", "2015-05-06", "2015-05-05"],
+...     "Data": [5, 8, 6, 1, 50, 100, 60, 120],
+... }})
+>>> df
+         Date  Data
+0  2015-05-08     5
+1  2015-05-07     8
+2  2015-05-06     6
+3  2015-05-05     1
+4  2015-05-08    50
+5  2015-05-07   100
+6  2015-05-06    60
+7  2015-05-05   120
+>>> df.groupby('Date')['Data'].transform('sum')
+0     55
+1    108
+2     66
+3    121
+4     55
+5    108
+6     66
+7    121
+Name: Data, dtype: int64
+
+>>> df = pd.DataFrame({{
+...     "c": [1, 1, 1, 2, 2, 2, 2],
+...     "type": ["m", "n", "o", "m", "m", "n", "n"]
+... }})
+>>> df
+   c type
+0  1    m
+1  1    n
+2  1    o
+3  2    m
+4  2    m
+5  2    n
+6  2    n
+>>> df['size'] = df.groupby('c')['type'].transform(len)
+>>> df
+   c type size
+0  1    m    3
+1  1    n    3
+2  1    o    3
+3  2    m    4
+4  2    m    4
+5  2    n    4
+6  2    n    4
+"""
+
+_shared_docs["storage_options"] = """storage_options : dict, optional
+    Extra options that make sense for a particular storage connection, e.g.
+    host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+    are forwarded to ``urllib.request.Request`` as header options. For other
+    URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+    forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+    details, and for more examples on storage options refer `here
+    <https://pandas.pydata.org/docs/user_guide/io.html?
+    highlight=storage_options#reading-writing-remote-files>`_."""
+
+_shared_docs["compression_options"] = """compression : str or dict, default 'infer'
+    For on-the-fly compression of the output data. If 'infer' and '%s' is
+    path-like, then detect compression from the following extensions: '.gz',
+    '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+    (otherwise no compression).
+    Set to ``None`` for no compression.
+    Can also be a dict with key ``'method'`` set
+    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+    other key-value pairs are forwarded to
+    ``zipfile.ZipFile``, ``gzip.GzipFile``,
+    ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+    ``tarfile.TarFile``, respectively.
+    As an example, the following could be passed for faster compression and to create
+    a reproducible gzip archive:
+    ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+    .. versionadded:: 1.5.0
+        Added support for `.tar` files."""
+
+_shared_docs["decompression_options"] = """compression : str or dict, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer' and '%s' is
+    path-like, then detect compression from the following extensions: '.gz',
+    '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+    (otherwise no compression).
+    If using 'zip' or 'tar', the ZIP file must contain only one data file to be read in.
+    Set to ``None`` for no decompression.
+    Can also be a dict with key ``'method'`` set
+    to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+    other key-value pairs are forwarded to
+    ``zipfile.ZipFile``, ``gzip.GzipFile``,
+    ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+    ``tarfile.TarFile``, respectively.
+    As an example, the following could be passed for Zstandard decompression using a
+    custom compression dictionary:
+    ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+    .. versionadded:: 1.5.0
+        Added support for `.tar` files."""
+
+_shared_docs["replace"] = """
+    Replace values given in `to_replace` with `value`.
+
+    Values of the {klass} are replaced with other values dynamically.
+    This differs from updating with ``.loc`` or ``.iloc``, which require
+    you to specify a location to update with some value.
+
+    Parameters
+    ----------
+    to_replace : str, regex, list, dict, Series, int, float, or None
+        How to find the values that will be replaced.
+
+        * numeric, str or regex:
+
+            - numeric: numeric values equal to `to_replace` will be
+              replaced with `value`
+            - str: string exactly matching `to_replace` will be replaced
+              with `value`
+            - regex: regexes matching `to_replace` will be replaced with
+              `value`
+
+        * list of str, regex, or numeric:
+
+            - First, if `to_replace` and `value` are both lists, they
+              **must** be the same length.
+            - Second, if ``regex=True`` then all of the strings in **both**
+              lists will be interpreted as regexes otherwise they will match
+              directly. This doesn't matter much for `value` since there
+              are only a few possible substitution regexes you can use.
+            - str, regex and numeric rules apply as above.
+
+        * dict:
+
+            - Dicts can be used to specify different replacement values
+              for different existing values. For example,
+              ``{{'a': 'b', 'y': 'z'}}`` replaces the value 'a' with 'b' and
+              'y' with 'z'. To use a dict in this way, the optional `value`
+              parameter should not be given.
+            - For a DataFrame a dict can specify that different values
+              should be replaced in different columns. For example,
+              ``{{'a': 1, 'b': 'z'}}`` looks for the value 1 in column 'a'
+              and the value 'z' in column 'b' and replaces these values
+              with whatever is specified in `value`. The `value` parameter
+              should not be ``None`` in this case. You can treat this as a
+              special case of passing two lists except that you are
+              specifying the column to search in.
+            - For a DataFrame nested dictionaries, e.g.,
+              ``{{'a': {{'b': np.nan}}}}``, are read as follows: look in column
+              'a' for the value 'b' and replace it with NaN. The optional `value`
+              parameter should not be specified to use a nested dict in this
+              way. You can nest regular expressions as well. Note that
+              column names (the top-level dictionary keys in a nested
+              dictionary) **cannot** be regular expressions.
+
+        * None:
+
+            - This means that the `regex` argument must be a string,
+              compiled regular expression, or list, dict, ndarray or
+              Series of such elements. If `value` is also ``None`` then
+              this **must** be a nested dictionary or Series.
+
+        See the examples section for examples of each of these.
+    value : scalar, dict, list, str, regex, default None
+        Value to replace any values matching `to_replace` with.
+        For a DataFrame a dict of values can be used to specify which
+        value to use for each column (columns not in the dict will not be
+        filled). Regular expressions, strings and lists or dicts of such
+        objects are also allowed.
+    {inplace}
+    regex : bool or same types as `to_replace`, default False
+        Whether to interpret `to_replace` and/or `value` as regular
+        expressions. Alternatively, this could be a regular expression or a
+        list, dict, or array of regular expressions in which case
+        `to_replace` must be ``None``.
+
+    Returns
+    -------
+    {klass}
+        Object after replacement.
+
+    Raises
+    ------
+    AssertionError
+        * If `regex` is not a ``bool`` and `to_replace` is not
+          ``None``.
+
+    TypeError
+        * If `to_replace` is not a scalar, array-like, ``dict``, or ``None``
+        * If `to_replace` is a ``dict`` and `value` is not a ``list``,
+          ``dict``, ``ndarray``, or ``Series``
+        * If `to_replace` is ``None`` and `regex` is not compilable
+          into a regular expression or is a list, dict, ndarray, or
+          Series.
+        * When replacing multiple ``bool`` or ``datetime64`` objects and
+          the arguments to `to_replace` does not match the type of the
+          value being replaced
+
+    ValueError
+        * If a ``list`` or an ``ndarray`` is passed to `to_replace` and
+          `value` but they are not the same length.
+
+    See Also
+    --------
+    Series.fillna : Fill NA values.
+    DataFrame.fillna : Fill NA values.
+    Series.where : Replace values based on boolean condition.
+    DataFrame.where : Replace values based on boolean condition.
+    DataFrame.map: Apply a function to a Dataframe elementwise.
+    Series.map: Map values of Series according to an input mapping or function.
+    Series.str.replace : Simple string replacement.
+
+    Notes
+    -----
+    * Regex substitution is performed under the hood with ``re.sub``. The
+      rules for substitution for ``re.sub`` are the same.
+    * Regular expressions will only substitute on strings, meaning you
+      cannot provide, for example, a regular expression matching floating
+      point numbers and expect the columns in your frame that have a
+      numeric dtype to be matched. However, if those floating point
+      numbers *are* strings, then you can do this.
+    * This method has *a lot* of options. You are encouraged to experiment
+      and play with this method to gain intuition about how it works.
+    * When dict is used as the `to_replace` value, it is like
+      key(s) in the dict are the to_replace part and
+      value(s) in the dict are the value parameter.
+
+    Examples
+    --------
+
+    **Scalar `to_replace` and `value`**
+
+    >>> s = pd.Series([1, 2, 3, 4, 5])
+    >>> s.replace(1, 5)
+    0    5
+    1    2
+    2    3
+    3    4
+    4    5
+    dtype: int64
+
+    >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4],
+    ...                    'B': [5, 6, 7, 8, 9],
+    ...                    'C': ['a', 'b', 'c', 'd', 'e']}})
+    >>> df.replace(0, 5)
+        A  B  C
+    0  5  5  a
+    1  1  6  b
+    2  2  7  c
+    3  3  8  d
+    4  4  9  e
+
+    **List-like `to_replace`**
+
+    >>> df.replace([0, 1, 2, 3], 4)
+        A  B  C
+    0  4  5  a
+    1  4  6  b
+    2  4  7  c
+    3  4  8  d
+    4  4  9  e
+
+    >>> df.replace([0, 1, 2, 3], [4, 3, 2, 1])
+        A  B  C
+    0  4  5  a
+    1  3  6  b
+    2  2  7  c
+    3  1  8  d
+    4  4  9  e
+
+    **dict-like `to_replace`**
+
+    >>> df.replace({{0: 10, 1: 100}})
+            A  B  C
+    0   10  5  a
+    1  100  6  b
+    2    2  7  c
+    3    3  8  d
+    4    4  9  e
+
+    >>> df.replace({{'A': 0, 'B': 5}}, 100)
+            A    B  C
+    0  100  100  a
+    1    1    6  b
+    2    2    7  c
+    3    3    8  d
+    4    4    9  e
+
+    >>> df.replace({{'A': {{0: 100, 4: 400}}}})
+            A  B  C
+    0  100  5  a
+    1    1  6  b
+    2    2  7  c
+    3    3  8  d
+    4  400  9  e
+
+    **Regular expression `to_replace`**
+
+    >>> df = pd.DataFrame({{'A': ['bat', 'foo', 'bait'],
+    ...                    'B': ['abc', 'bar', 'xyz']}})
+    >>> df.replace(to_replace=r'^ba.$', value='new', regex=True)
+            A    B
+    0   new  abc
+    1   foo  new
+    2  bait  xyz
+
+    >>> df.replace({{'A': r'^ba.$'}}, {{'A': 'new'}}, regex=True)
+            A    B
+    0   new  abc
+    1   foo  bar
+    2  bait  xyz
+
+    >>> df.replace(regex=r'^ba.$', value='new')
+            A    B
+    0   new  abc
+    1   foo  new
+    2  bait  xyz
+
+    >>> df.replace(regex={{r'^ba.$': 'new', 'foo': 'xyz'}})
+            A    B
+    0   new  abc
+    1   xyz  new
+    2  bait  xyz
+
+    >>> df.replace(regex=[r'^ba.$', 'foo'], value='new')
+            A    B
+    0   new  abc
+    1   new  new
+    2  bait  xyz
+
+    Compare the behavior of ``s.replace({{'a': None}})`` and
+    ``s.replace('a', None)`` to understand the peculiarities
+    of the `to_replace` parameter:
+
+    >>> s = pd.Series([10, 'a', 'a', 'b', 'a'])
+
+    When one uses a dict as the `to_replace` value, it is like the
+    value(s) in the dict are equal to the `value` parameter.
+    ``s.replace({{'a': None}})`` is equivalent to
+    ``s.replace(to_replace={{'a': None}}, value=None)``:
+
+    >>> s.replace({{'a': None}})
+    0      10
+    1    None
+    2    None
+    3       b
+    4    None
+    dtype: object
+
+    If ``None`` is explicitly passed for ``value``, it will be respected:
+
+    >>> s.replace('a', None)
+    0      10
+    1    None
+    2    None
+    3       b
+    4    None
+    dtype: object
+
+    When ``regex=True``, ``value`` is not ``None`` and `to_replace` is a string,
+    the replacement will be applied in all columns of the DataFrame.
+
+    >>> df = pd.DataFrame({{'A': [0, 1, 2, 3, 4],
+    ...                    'B': ['a', 'b', 'c', 'd', 'e'],
+    ...                    'C': ['f', 'g', 'h', 'i', 'j']}})
+
+    >>> df.replace(to_replace='^[a-g]', value='e', regex=True)
+        A  B  C
+    0  0  e  e
+    1  1  e  e
+    2  2  e  h
+    3  3  e  i
+    4  4  e  j
+
+    If ``value`` is not ``None`` and `to_replace` is a dictionary, the dictionary
+    keys will be the DataFrame columns that the replacement will be applied.
+
+    >>> df.replace(to_replace={{'B': '^[a-c]', 'C': '^[h-j]'}}, value='e', regex=True)
+        A  B  C
+    0  0  e  f
+    1  1  e  g
+    2  2  e  e
+    3  3  d  e
+    4  4  e  e
+"""
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
new file mode 100644
index 0000000000000000000000000000000000000000..fecc2ca9e2e0ba13e3d98a2c29c7183a9b0e5fa5
--- /dev/null
+++ b/pandas/core/sorting.py
@@ -0,0 +1,736 @@
+"""miscellaneous sorting / groupby utilities"""
+
+from __future__ import annotations
+
+import itertools
+from typing import (
+    TYPE_CHECKING,
+    cast,
+)
+
+import numpy as np
+
+from pandas._libs import (
+    algos,
+    hashtable,
+    lib,
+)
+from pandas._libs.hashtable import unique_label_indices
+
+from pandas.core.dtypes.common import (
+    ensure_int64,
+    ensure_platform_int,
+)
+from pandas.core.dtypes.generic import (
+    ABCMultiIndex,
+    ABCRangeIndex,
+)
+from pandas.core.dtypes.missing import isna
+
+from pandas.core.construction import extract_array
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+        Sequence,
+    )
+
+    from pandas._typing import (
+        ArrayLike,
+        AxisInt,
+        IndexKeyFunc,
+        Level,
+        NaPosition,
+        Shape,
+        SortKind,
+        npt,
+    )
+
+    from pandas import (
+        MultiIndex,
+        Series,
+    )
+    from pandas.core.arrays import ExtensionArray
+    from pandas.core.indexes.base import Index
+
+
+def get_indexer_indexer(
+    target: Index,
+    level: Level | list[Level] | None,
+    ascending: list[bool] | bool,
+    kind: SortKind,
+    na_position: NaPosition,
+    sort_remaining: bool,
+    key: IndexKeyFunc,
+) -> npt.NDArray[np.intp] | None:
+    """
+    Helper method that return the indexer according to input parameters for
+    the sort_index method of DataFrame and Series.
+
+    Parameters
+    ----------
+    target : Index
+    level : int or level name or list of ints or list of level names
+    ascending : bool or list of bools, default True
+    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}
+    na_position : {'first', 'last'}
+    sort_remaining : bool
+    key : callable, optional
+
+    Returns
+    -------
+    Optional[ndarray[intp]]
+        The indexer for the new index.
+    """
+
+    # error: Incompatible types in assignment (expression has type
+    # "Union[ExtensionArray, ndarray[Any, Any], Index, Series]", variable has
+    # type "Index")
+    target = ensure_key_mapped(target, key, levels=level)  # type: ignore[assignment]
+    target = target._sort_levels_monotonic()
+
+    if level is not None:
+        _, indexer = target.sortlevel(
+            level,
+            ascending=ascending,
+            sort_remaining=sort_remaining,
+            na_position=na_position,
+        )
+    elif (np.all(ascending) and target.is_monotonic_increasing) or (
+        not np.any(ascending) and target.is_monotonic_decreasing
+    ):
+        # Check monotonic-ness before sort an index (GH 11080)
+        return None
+    elif isinstance(target, ABCMultiIndex):
+        codes = [lev.codes for lev in target._get_codes_for_sorting()]
+        indexer = lexsort_indexer(
+            codes, orders=ascending, na_position=na_position, codes_given=True
+        )
+    else:
+        # ascending can only be a Sequence for MultiIndex
+        indexer = nargsort(
+            target,
+            kind=kind,
+            ascending=cast(bool, ascending),
+            na_position=na_position,
+        )
+    return indexer
+
+
+def get_group_index(
+    labels, shape: Shape, sort: bool, xnull: bool
+) -> npt.NDArray[np.int64]:
+    """
+    For the particular label_list, gets the offsets into the hypothetical list
+    representing the totally ordered cartesian product of all possible label
+    combinations, *as long as* this space fits within int64 bounds;
+    otherwise, though group indices identify unique combinations of
+    labels, they cannot be deconstructed.
+    - If `sort`, rank of returned ids preserve lexical ranks of labels.
+      i.e. returned id's can be used to do lexical sort on labels;
+    - If `xnull` nulls (-1 labels) are passed through.
+
+    Parameters
+    ----------
+    labels : sequence of arrays
+        Integers identifying levels at each location
+    shape : tuple[int, ...]
+        Number of unique levels at each location
+    sort : bool
+        If the ranks of returned ids should match lexical ranks of labels
+    xnull : bool
+        If true nulls are excluded. i.e. -1 values in the labels are
+        passed through.
+
+    Returns
+    -------
+    An array of type int64 where two elements are equal if their corresponding
+    labels are equal at all location.
+
+    Notes
+    -----
+    The length of `labels` and `shape` must be identical.
+    """
+
+    def _int64_cut_off(shape) -> int:
+        acc = 1
+        for i, mul in enumerate(shape):
+            acc *= int(mul)
+            if not acc < lib.i8max:
+                return i
+        return len(shape)
+
+    def maybe_lift(lab, size: int) -> tuple[np.ndarray, int]:
+        # promote nan values (assigned -1 label in lab array)
+        # so that all output values are non-negative
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = [ensure_int64(x) for x in labels]
+    lshape = list(shape)
+    if not xnull:
+        for i, (lab, size) in enumerate(zip(labels, shape, strict=True)):
+            labels[i], lshape[i] = maybe_lift(lab, size)
+
+    # Iteratively process all the labels in chunks sized so less
+    # than lib.i8max unique int ids will be required for each chunk
+    while True:
+        # how many levels can be done without overflow:
+        nlev = _int64_cut_off(lshape)
+
+        # compute flat ids for the first `nlev` levels
+        stride = np.prod(lshape[1:nlev], dtype="i8")
+        out = stride * labels[0].astype("i8", subok=False, copy=False)
+
+        for i in range(1, nlev):
+            if lshape[i] == 0:
+                stride = np.int64(0)
+            else:
+                stride //= lshape[i]
+            out += labels[i] * stride
+
+        if xnull:  # exclude nulls
+            mask = labels[0] == -1
+            for lab in labels[1:nlev]:
+                mask |= lab == -1
+            out[mask] = -1
+
+        if nlev == len(lshape):  # all levels done!
+            break
+
+        # compress what has been done so far in order to avoid overflow
+        # to retain lexical ranks, obs_ids should be sorted
+        comp_ids, obs_ids = compress_group_index(out, sort=sort)
+
+        labels = [comp_ids, *labels[nlev:]]
+        lshape = [len(obs_ids), *lshape[nlev:]]
+
+    return out
+
+
+def get_compressed_ids(
+    labels, sizes: Shape
+) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.int64]]:
+    """
+    Group_index is offsets into cartesian product of all possible labels. This
+    space can be huge, so this function compresses it, by computing offsets
+    (comp_ids) into the list of unique labels (obs_group_ids).
+
+    Parameters
+    ----------
+    labels : list of label arrays
+    sizes : tuple[int] of size of the levels
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+        comp_ids
+    np.ndarray[np.int64]
+        obs_group_ids
+    """
+    ids = get_group_index(labels, sizes, sort=True, xnull=False)
+    return compress_group_index(ids, sort=True)
+
+
+def is_int64_overflow_possible(shape: Shape) -> bool:
+    the_prod = 1
+    for x in shape:
+        the_prod *= int(x)
+
+    return the_prod >= lib.i8max
+
+
+def _decons_group_index(
+    comp_labels: npt.NDArray[np.intp], shape: Shape
+) -> list[npt.NDArray[np.intp]]:
+    # reconstruct labels
+    if is_int64_overflow_possible(shape):
+        # at some point group indices are factorized,
+        # and may not be deconstructed here! wrong path!
+        raise ValueError("cannot deconstruct factorized group indices!")
+
+    label_list = []
+    factor = 1
+    y = np.array(0)
+    x = comp_labels
+    for i in reversed(range(len(shape))):
+        labels = (x - y) % (factor * shape[i]) // factor
+        np.putmask(labels, comp_labels < 0, -1)
+        label_list.append(labels)
+        y = labels * factor
+        factor *= shape[i]
+    return label_list[::-1]
+
+
+def decons_obs_group_ids(
+    comp_ids: npt.NDArray[np.intp],
+    obs_ids: npt.NDArray[np.intp],
+    shape: Shape,
+    labels: Sequence[npt.NDArray[np.signedinteger]],
+    xnull: bool,
+) -> list[npt.NDArray[np.intp]]:
+    """
+    Reconstruct labels from observed group ids.
+
+    Parameters
+    ----------
+    comp_ids : np.ndarray[np.intp]
+    obs_ids: np.ndarray[np.intp]
+    shape : tuple[int]
+    labels : Sequence[np.ndarray[np.signedinteger]]
+    xnull : bool
+        If nulls are excluded; i.e. -1 labels are passed through.
+    """
+    if not xnull:
+        lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
+        arr_shape = np.asarray(shape, dtype=np.intp) + lift
+        shape = tuple(arr_shape)
+
+    if not is_int64_overflow_possible(shape):
+        # obs ids are deconstructable! take the fast route!
+        out = _decons_group_index(obs_ids, shape)
+        return (
+            out
+            if xnull or not lift.any()
+            else [x - y for x, y in zip(out, lift, strict=True)]
+        )
+
+    indexer = unique_label_indices(comp_ids)
+    return [lab[indexer].astype(np.intp, subok=False, copy=True) for lab in labels]
+
+
+def lexsort_indexer(
+    keys: Sequence[ArrayLike | Index | Series],
+    orders=None,
+    na_position: str = "last",
+    key: Callable | None = None,
+    codes_given: bool = False,
+) -> npt.NDArray[np.intp]:
+    """
+    Performs lexical sorting on a set of keys
+
+    Parameters
+    ----------
+    keys : Sequence[ArrayLike | Index | Series]
+        Sequence of arrays to be sorted by the indexer
+        Sequence[Series] is only if key is not None.
+    orders : bool or list of booleans, optional
+        Determines the sorting order for each element in keys. If a list,
+        it must be the same length as keys. This determines whether the
+        corresponding element in keys should be sorted in ascending
+        (True) or descending (False) order. if bool, applied to all
+        elements as above. if None, defaults to True.
+    na_position : {'first', 'last'}, default 'last'
+        Determines placement of NA elements in the sorted list ("last" or "first")
+    key : Callable, optional
+        Callable key function applied to every element in keys before sorting
+    codes_given: bool, False
+        Avoid categorical materialization if codes are already provided.
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+    """
+    from pandas.core.arrays import Categorical
+
+    if na_position not in ["last", "first"]:
+        raise ValueError(f"invalid na_position: {na_position}")
+
+    if isinstance(orders, bool):
+        orders = itertools.repeat(orders, len(keys))
+    elif orders is None:
+        orders = itertools.repeat(True, len(keys))
+    else:
+        orders = reversed(orders)
+
+    labels = []
+
+    for k, order in zip(reversed(keys), orders, strict=True):
+        k = ensure_key_mapped(k, key)
+        if codes_given:
+            codes = cast(np.ndarray, k)
+            n = codes.max() + 1 if len(codes) else 0
+        else:
+            cat = Categorical(k, ordered=True)
+            codes = cat.codes
+            n = len(cat.categories)
+
+        mask = codes == -1
+
+        if na_position == "last" and mask.any():
+            codes = np.where(mask, n, codes)
+
+        # not order means descending
+        if not order:
+            codes = np.where(mask, codes, n - codes - 1)
+
+        labels.append(codes)
+
+    return np.lexsort(labels)
+
+
+def nargsort(
+    items: ArrayLike | Index | Series,
+    kind: SortKind = "quicksort",
+    ascending: bool = True,
+    na_position: str = "last",
+    key: Callable | None = None,
+    mask: npt.NDArray[np.bool_] | None = None,
+) -> npt.NDArray[np.intp]:
+    """
+    Intended to be a drop-in replacement for np.argsort which handles NaNs.
+
+    Adds ascending, na_position, and key parameters.
+
+    (GH #6399, #5231, #27237)
+
+    Parameters
+    ----------
+    items : np.ndarray, ExtensionArray, Index, or Series
+    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+    ascending : bool, default True
+    na_position : {'first', 'last'}, default 'last'
+    key : Optional[Callable], default None
+    mask : Optional[np.ndarray[bool]], default None
+        Passed when called by ExtensionArray.argsort.
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+    """
+
+    if key is not None:
+        # see TestDataFrameSortKey, TestRangeIndex::test_sort_values_key
+        items = ensure_key_mapped(items, key)
+        return nargsort(
+            items,
+            kind=kind,
+            ascending=ascending,
+            na_position=na_position,
+            key=None,
+            mask=mask,
+        )
+
+    if isinstance(items, ABCRangeIndex):
+        return items.argsort(ascending=ascending)
+    elif not isinstance(items, ABCMultiIndex):
+        items = extract_array(items)
+    else:
+        raise TypeError(
+            "nargsort does not support MultiIndex. Use index.sort_values instead."
+        )
+
+    if mask is None:
+        mask = np.asarray(isna(items))
+
+    if not isinstance(items, np.ndarray):
+        # i.e. ExtensionArray
+        return items.argsort(
+            ascending=ascending,
+            kind=kind,
+            na_position=na_position,
+        )
+
+    idx = np.arange(len(items))
+    non_nans = items[~mask]
+    non_nan_idx = idx[~mask]
+
+    nan_idx = np.nonzero(mask)[0]
+    if not ascending:
+        non_nans = non_nans[::-1]
+        non_nan_idx = non_nan_idx[::-1]
+    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
+    if not ascending:
+        indexer = indexer[::-1]
+    # Finally, place the NaNs at the end or the beginning according to
+    # na_position
+    if na_position == "last":
+        indexer = np.concatenate([indexer, nan_idx])
+    elif na_position == "first":
+        indexer = np.concatenate([nan_idx, indexer])
+    else:
+        raise ValueError(f"invalid na_position: {na_position}")
+    return ensure_platform_int(indexer)
+
+
+def nargminmax(values: ExtensionArray, method: str, axis: AxisInt = 0):
+    """
+    Implementation of np.argmin/argmax but for ExtensionArray and which
+    handles missing values.
+
+    Parameters
+    ----------
+    values : ExtensionArray
+    method : {"argmax", "argmin"}
+    axis : int, default 0
+
+    Returns
+    -------
+    int
+    """
+    assert method in {"argmax", "argmin"}
+    func = np.argmax if method == "argmax" else np.argmin
+
+    mask = np.asarray(isna(values))
+    arr_values = values._values_for_argsort()
+
+    if arr_values.ndim > 1:
+        if mask.any():
+            if axis == 1:
+                zipped = zip(arr_values, mask, strict=True)
+            else:
+                zipped = zip(arr_values.T, mask.T, strict=True)
+            return np.array([_nanargminmax(v, m, func) for v, m in zipped])
+        return func(arr_values, axis=axis)
+
+    return _nanargminmax(arr_values, mask, func)
+
+
+def _nanargminmax(values: np.ndarray, mask: npt.NDArray[np.bool_], func) -> int:
+    """
+    See nanargminmax.__doc__.
+    """
+    idx = np.arange(values.shape[0])
+    non_nans = values[~mask]
+    non_nan_idx = idx[~mask]
+
+    return non_nan_idx[func(non_nans)]
+
+
+def _ensure_key_mapped_multiindex(
+    index: MultiIndex, key: Callable, level=None
+) -> MultiIndex:
+    """
+    Returns a new MultiIndex in which key has been applied
+    to all levels specified in level (or all levels if level
+    is None). Used for key sorting for MultiIndex.
+
+    Parameters
+    ----------
+    index : MultiIndex
+        Index to which to apply the key function on the
+        specified levels.
+    key : Callable
+        Function that takes an Index and returns an Index of
+        the same shape. This key is applied to each level
+        separately. The name of the level can be used to
+        distinguish different levels for application.
+    level : list-like, int or str, default None
+        Level or list of levels to apply the key function to.
+        If None, key function is applied to all levels. Other
+        levels are left unchanged.
+
+    Returns
+    -------
+    labels : MultiIndex
+        Resulting MultiIndex with modified levels.
+    """
+
+    if level is not None:
+        if isinstance(level, (str, int)):
+            level_iter = [level]
+        else:
+            level_iter = level
+
+        sort_levels: range | set = {index._get_level_number(lev) for lev in level_iter}
+    else:
+        sort_levels = range(index.nlevels)
+
+    mapped = [
+        (
+            ensure_key_mapped(index._get_level_values(level), key)
+            if level in sort_levels
+            else index._get_level_values(level)
+        )
+        for level in range(index.nlevels)
+    ]
+
+    return type(index).from_arrays(mapped)
+
+
+def ensure_key_mapped(
+    values: ArrayLike | Index | Series, key: Callable | None, levels=None
+) -> ArrayLike | Index | Series:
+    """
+    Applies a callable key function to the values function and checks
+    that the resulting value has the same shape. Can be called on Index
+    subclasses, Series, DataFrames, or ndarrays.
+
+    Parameters
+    ----------
+    values : Series, DataFrame, Index subclass, or ndarray
+    key : Optional[Callable], key to be called on the values array
+    levels : Optional[List], if values is a MultiIndex, list of levels to
+    apply the key to.
+    """
+    from pandas.core.indexes.api import Index
+
+    if not key:
+        return values
+
+    if isinstance(values, ABCMultiIndex):
+        return _ensure_key_mapped_multiindex(values, key, level=levels)
+
+    result = key(values.copy())
+    if len(result) != len(values):
+        raise ValueError(
+            "User-provided `key` function must not change the shape of the array."
+        )
+
+    try:
+        if isinstance(
+            values, Index
+        ):  # convert to a new Index subclass, not necessarily the same
+            result = Index(result, tupleize_cols=False)
+        else:
+            # try to revert to original type otherwise
+            type_of_values = type(values)
+            #  error: Too many arguments for "ExtensionArray"
+            result = type_of_values(result)  # type: ignore[call-arg]
+    except TypeError as err:
+        raise TypeError(
+            f"User-provided `key` function returned an invalid type {type(result)} \
+            which could not be converted to {type(values)}."
+        ) from err
+
+    return result
+
+
+def get_indexer_dict(
+    label_list: list[np.ndarray], keys: list[Index]
+) -> dict[Hashable, npt.NDArray[np.intp]]:
+    """
+    Returns
+    -------
+    dict:
+        Labels mapped to indexers.
+    """
+    shape = tuple(len(x) for x in keys)
+
+    group_index = get_group_index(label_list, shape, sort=True, xnull=True)
+    if np.all(group_index == -1):
+        # Short-circuit, lib.indices_fast will return the same
+        return {}
+    ngroups = (
+        ((group_index.size and group_index.max()) + 1)
+        if is_int64_overflow_possible(shape)
+        else np.prod(shape, dtype="i8")
+    )
+
+    sorter = get_group_index_sorter(group_index, ngroups)
+
+    sorted_labels = [lab.take(sorter) for lab in label_list]
+    group_index = group_index.take(sorter)
+
+    return lib.indices_fast(sorter, group_index, keys, sorted_labels)
+
+
+# ----------------------------------------------------------------------
+# sorting levels...cleverly?
+
+
+def get_group_index_sorter(
+    group_index: npt.NDArray[np.intp], ngroups: int | None = None
+) -> npt.NDArray[np.intp]:
+    """
+    algos.groupsort_indexer implements `counting sort` and it is at least
+    O(ngroups), where
+        ngroups = prod(shape)
+        shape = map(len, keys)
+    that is, linear in the number of combinations (cartesian product) of unique
+    values of groupby keys. This can be huge when doing multi-key groupby.
+    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
+    length of the data-frame;
+    Both algorithms are `stable` sort and that is necessary for correctness of
+    groupby operations. e.g. consider:
+        df.groupby(key)[col].transform('first')
+
+    Parameters
+    ----------
+    group_index : np.ndarray[np.intp]
+        signed integer dtype
+    ngroups : int or None, default None
+
+    Returns
+    -------
+    np.ndarray[np.intp]
+    """
+    if ngroups is None:
+        ngroups = 1 + group_index.max()
+    count = len(group_index)
+    alpha = 0.0  # taking complexities literally; there may be
+    beta = 1.0  # some room for fine-tuning these parameters
+    do_groupsort = count > 0 and ((alpha + beta * ngroups) < (count * np.log(count)))
+    if do_groupsort:
+        sorter, _ = algos.groupsort_indexer(
+            ensure_platform_int(group_index),
+            ngroups,
+        )
+        # sorter _should_ already be intp, but mypy is not yet able to verify
+    else:
+        sorter = group_index.argsort(kind="mergesort")
+    return ensure_platform_int(sorter)
+
+
+def compress_group_index(
+    group_index: npt.NDArray[np.int64], sort: bool = True
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
+    """
+    Group_index is offsets into cartesian product of all possible labels. This
+    space can be huge, so this function compresses it, by computing offsets
+    (comp_ids) into the list of unique labels (obs_group_ids).
+    """
+    if len(group_index) and np.all(group_index[1:] >= group_index[:-1]):
+        # GH 53806: fast path for sorted group_index
+        unique_mask = np.concatenate(
+            [group_index[:1] > -1, group_index[1:] != group_index[:-1]]
+        )
+        comp_ids = unique_mask.cumsum()
+        comp_ids -= 1
+        obs_group_ids = group_index[unique_mask]
+    else:
+        size_hint = len(group_index)
+        table = hashtable.Int64HashTable(size_hint)
+
+        group_index = ensure_int64(group_index)
+
+        # note, group labels come out ascending (ie, 1,2,3 etc)
+        comp_ids, obs_group_ids = table.get_labels_groupby(group_index)
+
+        if sort and len(obs_group_ids) > 0:
+            obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
+
+    return ensure_int64(comp_ids), ensure_int64(obs_group_ids)
+
+
+def _reorder_by_uniques(
+    uniques: npt.NDArray[np.int64], labels: npt.NDArray[np.intp]
+) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.intp]]:
+    """
+    Parameters
+    ----------
+    uniques : np.ndarray[np.int64]
+    labels : np.ndarray[np.intp]
+
+    Returns
+    -------
+    np.ndarray[np.int64]
+    np.ndarray[np.intp]
+    """
+    # sorter is index where elements ought to go
+    sorter = uniques.argsort()
+
+    # reverse_indexer is where elements came from
+    reverse_indexer = np.empty(len(sorter), dtype=np.intp)
+    reverse_indexer.put(sorter, np.arange(len(sorter)))
+
+    mask = labels < 0
+
+    # move labels to right locations (ie, unsort ascending labels)
+    labels = reverse_indexer.take(labels)
+    np.putmask(labels, mask, -1)
+
+    # sort observed ids
+    uniques = uniques.take(sorter)
+
+    return uniques, labels
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..18edb61c8a3f3e2202b6823c86091b20a5f30b6b
--- /dev/null
+++ b/pandas/errors/__init__.py
@@ -0,0 +1,1087 @@
+"""
+Expose public exceptions & warnings
+"""
+
+from __future__ import annotations
+
+import abc
+import ctypes
+
+from pandas._config.config import OptionError
+
+from pandas._libs.tslibs import (
+    IncompatibleFrequency,
+    OutOfBoundsDatetime,
+    OutOfBoundsTimedelta,
+)
+
+from pandas.util.version import InvalidVersion
+
+
+class IntCastingNaNError(ValueError):
+    """
+    Exception raised when converting (``astype``) an array with NaN to an integer type.
+
+    This error occurs when attempting to cast a data structure containing non-finite
+    values (such as NaN or infinity) to an integer data type. Integer types do not
+    support non-finite values, so such conversions are explicitly disallowed to
+    prevent silent data corruption or unexpected behavior.
+
+    See Also
+    --------
+    DataFrame.astype : Method to cast a pandas DataFrame object to a specified dtype.
+    Series.astype : Method to cast a pandas Series object to a specified dtype.
+
+    Examples
+    --------
+    >>> pd.DataFrame(np.array([[1, np.nan], [2, 3]]), dtype="i8")
+    Traceback (most recent call last):
+    IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
+    """
+
+
+class NullFrequencyError(ValueError):
+    """
+    Exception raised when a ``freq`` cannot be null.
+
+    Particularly ``DatetimeIndex.shift``, ``TimedeltaIndex.shift``,
+    ``PeriodIndex.shift``.
+
+    See Also
+    --------
+    Index.shift : Shift values of Index.
+    Series.shift : Shift values of Series.
+
+    Examples
+    --------
+    >>> df = pd.DatetimeIndex(["2011-01-01 10:00", "2011-01-01"], freq=None)
+    >>> df.shift(2)
+    Traceback (most recent call last):
+    NullFrequencyError: Cannot shift with no freq
+    """
+
+
+class PerformanceWarning(Warning):
+    """
+    Warning raised when there is a possible performance impact.
+
+    See Also
+    --------
+    DataFrame.set_index : Set the DataFrame index using existing columns.
+    DataFrame.loc : Access a group of rows and columns by label(s) \
+    or a boolean array.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": [1, 2, 3, 4]}
+    ... )
+    >>> df = df.set_index(["jim", "joe"])
+    >>> df
+              jolie
+    jim  joe
+    0    x    1
+         x    2
+    1    z    3
+         y    4
+    >>> df.loc[(1, "z")]  # doctest: +SKIP
+    # PerformanceWarning: indexing past lexsort depth may impact performance.
+    df.loc[(1, 'z')]
+              jolie
+    jim  joe
+    1    z        3
+    """
+
+
+class PandasChangeWarning(Warning):
+    """
+    Warning raised for any upcoming change.
+
+    See Also
+    --------
+    errors.PandasPendingDeprecationWarning : Class for deprecations that will raise a
+        PendingDeprecationWarning.
+    errors.PandasDeprecationWarning : Class for deprecations that will raise a
+        DeprecationWarning.
+    errors.PandasFutureWarning : Class for deprecations that will raise a FutureWarning.
+
+    Examples
+    --------
+    >>> pd.errors.PandasChangeWarning
+    <class 'pandas.errors.PandasChangeWarning'>
+    """
+
+    @classmethod
+    @abc.abstractmethod
+    def version(cls) -> str:
+        """Version where change will be enforced."""
+
+
+class PandasPendingDeprecationWarning(PandasChangeWarning, PendingDeprecationWarning):
+    """
+    Warning raised for an upcoming change that is a PendingDeprecationWarning.
+
+    See Also
+    --------
+    errors.PandasChangeWarning: Class for deprecations that will raise any warning.
+    errors.PandasDeprecationWarning : Class for deprecations that will raise a
+        DeprecationWarning.
+    errors.PandasFutureWarning : Class for deprecations that will raise a FutureWarning.
+
+    Examples
+    --------
+    >>> pd.errors.PandasPendingDeprecationWarning
+    <class 'pandas.errors.PandasPendingDeprecationWarning'>
+    """
+
+
+class PandasDeprecationWarning(PandasChangeWarning, DeprecationWarning):
+    """
+    Warning raised for an upcoming change that is a DeprecationWarning.
+
+    See Also
+    --------
+    errors.PandasChangeWarning: Class for deprecations that will raise any warning.
+    errors.PandasPendingDeprecationWarning : Class for deprecations that will raise a
+        PendingDeprecationWarning.
+    errors.PandasFutureWarning : Class for deprecations that will raise a FutureWarning.
+
+    Examples
+    --------
+    >>> pd.errors.PandasDeprecationWarning
+    <class 'pandas.errors.PandasDeprecationWarning'>
+    """
+
+
+class PandasFutureWarning(PandasChangeWarning, FutureWarning):
+    """
+    Warning raised for an upcoming change that is a FutureWarning.
+
+    See Also
+    --------
+    errors.PandasChangeWarning: Class for deprecations that will raise any warning.
+    errors.PandasPendingDeprecationWarning : Class for deprecations that will raise a
+        PendingDeprecationWarning.
+    errors.PandasDeprecationWarning : Class for deprecations that will raise a
+        DeprecationWarning.
+
+    Examples
+    --------
+    >>> pd.errors.PandasFutureWarning
+    <class 'pandas.errors.PandasFutureWarning'>
+    """
+
+
+class Pandas4Warning(PandasDeprecationWarning):
+    """
+    Warning raised for an upcoming change that will be enforced in pandas 4.0.
+
+    See Also
+    --------
+    errors.PandasChangeWarning: Class for deprecations that will raise any warning.
+    errors.PandasPendingDeprecationWarning : Class for deprecations that will raise a
+        PendingDeprecationWarning.
+    errors.PandasDeprecationWarning : Class for deprecations that will raise a
+        DeprecationWarning.
+    errors.PandasFutureWarning : Class for deprecations that will raise a FutureWarning.
+
+    Examples
+    --------
+    >>> pd.errors.Pandas4Warning
+    <class 'pandas.errors.Pandas4Warning'>
+    """
+
+    @classmethod
+    def version(cls) -> str:
+        """Version where change will be enforced."""
+        return "4.0"
+
+
+class Pandas5Warning(PandasPendingDeprecationWarning):
+    """
+    Warning raised for an upcoming change that will be enforced in pandas 5.0.
+
+    See Also
+    --------
+    errors.PandasChangeWarning: Class for deprecations that will raise any warning.
+    errors.PandasPendingDeprecationWarning : Class for deprecations that will raise a
+        PendingDeprecationWarning.
+    errors.PandasDeprecationWarning : Class for deprecations that will raise a
+        DeprecationWarning.
+    errors.PandasFutureWarning : Class for deprecations that will raise a FutureWarning.
+
+    Examples
+    --------
+    >>> pd.errors.Pandas5Warning
+    <class 'pandas.errors.Pandas5Warning'>
+    """
+
+    @classmethod
+    def version(cls) -> str:
+        """Version where change will be enforced."""
+        return "5.0"
+
+
+_CurrentDeprecationWarning = Pandas4Warning
+
+
+class UnsupportedFunctionCall(ValueError):
+    """
+    Exception raised when attempting to call a unsupported numpy function.
+
+    For example, ``np.cumsum(groupby_object)``.
+
+    See Also
+    --------
+    DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns.
+    Series.groupby : Group Series using a mapper or by a Series of columns.
+    core.groupby.GroupBy.cumsum : Compute cumulative sum for each group.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {"A": [0, 0, 1, 1], "B": ["x", "x", "z", "y"], "C": [1, 2, 3, 4]}
+    ... )
+    >>> np.cumsum(df.groupby(["A"]))
+    Traceback (most recent call last):
+    UnsupportedFunctionCall: numpy operations are not valid with groupby.
+    Use .groupby(...).cumsum() instead
+    """
+
+
+class UnsortedIndexError(KeyError):
+    """
+    Error raised when slicing a MultiIndex which has not been lexsorted.
+
+    Subclass of `KeyError`.
+
+    See Also
+    --------
+    DataFrame.sort_index : Sort a DataFrame by its index.
+    DataFrame.set_index : Set the DataFrame index using existing columns.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "cat": [0, 0, 1, 1],
+    ...         "color": ["white", "white", "brown", "black"],
+    ...         "lives": [4, 4, 3, 7],
+    ...     },
+    ... )
+    >>> df = df.set_index(["cat", "color"])
+    >>> df
+                lives
+    cat  color
+    0    white    4
+         white    4
+    1    brown    3
+         black    7
+    >>> df.loc[(0, "black") : (1, "white")]
+    Traceback (most recent call last):
+    UnsortedIndexError: 'Key length (2) was greater
+    than MultiIndex lexsort depth (1)'
+    """
+
+
+class ParserError(ValueError):
+    """
+    Exception that is raised by an error encountered in parsing file contents.
+
+    This is a generic error raised for errors encountered when functions like
+    `read_csv` or `read_html` are parsing contents of a file.
+
+    See Also
+    --------
+    read_csv : Read CSV (comma-separated) file into a DataFrame.
+    read_html : Read HTML table into a DataFrame.
+
+    Examples
+    --------
+    >>> data = '''a,b,c
+    ... cat,foo,bar
+    ... dog,foo,"baz'''
+    >>> from io import StringIO
+    >>> pd.read_csv(StringIO(data), skipfooter=1, engine="python")
+    Traceback (most recent call last):
+    ParserError: ',' expected after '"'. Error could possibly be due
+    to parsing errors in the skipped footer rows
+    """
+
+
+class DtypeWarning(Warning):
+    """
+    Warning raised when reading different dtypes in a column from a file.
+
+    Raised for a dtype incompatibility. This can happen whenever `read_csv`
+    or `read_table` encounter non-uniform dtypes in a column(s) of a given
+    CSV file.
+
+    See Also
+    --------
+    read_csv : Read CSV (comma-separated) file into a DataFrame.
+    read_table : Read general delimited file into a DataFrame.
+
+    Notes
+    -----
+    This warning is issued when dealing with larger files because the dtype
+    checking happens per chunk read.
+
+    Despite the warning, the CSV file is read with mixed types in a single
+    column which will be an object type. See the examples below to better
+    understand this issue.
+
+    Examples
+    --------
+    This example creates and reads a large CSV file with a column that contains
+    `int` and `str`.
+
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "a": (["1"] * 100000 + ["X"] * 100000 + ["1"] * 100000),
+    ...         "b": ["b"] * 300000,
+    ...     }
+    ... )  # doctest: +SKIP
+    >>> df.to_csv("test.csv", index=False)  # doctest: +SKIP
+    >>> df2 = pd.read_csv("test.csv")  # doctest: +SKIP
+    ... # DtypeWarning: Columns (0: a) have mixed types
+
+    Important to notice that ``df2`` will contain both `str` and `int` for the
+    same input, '1'.
+
+    >>> df2.iloc[262140, 0]  # doctest: +SKIP
+    '1'
+    >>> type(df2.iloc[262140, 0])  # doctest: +SKIP
+    <class 'str'>
+    >>> df2.iloc[262150, 0]  # doctest: +SKIP
+    1
+    >>> type(df2.iloc[262150, 0])  # doctest: +SKIP
+    <class 'int'>
+
+    One way to solve this issue is using the `dtype` parameter in the
+    `read_csv` and `read_table` functions to explicit the conversion:
+
+    >>> df2 = pd.read_csv("test.csv", sep=",", dtype={"a": str})  # doctest: +SKIP
+
+    No warning was issued.
+    """
+
+
+class EmptyDataError(ValueError):
+    """
+    Exception raised in ``pd.read_csv`` when empty data or header is encountered.
+
+    This error is typically encountered when attempting to read an empty file or
+    an invalid file where no data or headers are present.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (CSV) file into DataFrame.
+    errors.ParserError : Exception that is raised by an error encountered in parsing
+        file contents.
+    errors.DtypeWarning : Warning raised when reading different dtypes in a column
+        from a file.
+
+    Examples
+    --------
+    >>> from io import StringIO
+    >>> empty = StringIO()
+    >>> pd.read_csv(empty)
+    Traceback (most recent call last):
+    EmptyDataError: No columns to parse from file
+    """
+
+
+class ParserWarning(Warning):
+    """
+    Warning raised when reading a file that doesn't use the default 'c' parser.
+
+    Raised by `pd.read_csv` and `pd.read_table` when it is necessary to change
+    parsers, generally from the default 'c' parser to 'python'.
+
+    It happens due to a lack of support or functionality for parsing a
+    particular attribute of a CSV file with the requested engine.
+
+    Currently, 'c' unsupported options include the following parameters:
+
+    1. `sep` other than a single character (e.g. regex separators)
+    2. `skipfooter` higher than 0
+
+    The warning can be avoided by adding `engine='python'` as a parameter in
+    `pd.read_csv` and `pd.read_table` methods.
+
+    See Also
+    --------
+    pd.read_csv : Read CSV (comma-separated) file into DataFrame.
+    pd.read_table : Read general delimited file into DataFrame.
+
+    Examples
+    --------
+    Using a `sep` in `pd.read_csv` other than a single character:
+
+    >>> import io
+    >>> csv = '''a;b;c
+    ...           1;1,8
+    ...           1;2,1'''
+    >>> df = pd.read_csv(io.StringIO(csv), sep="[;,]")  # doctest: +SKIP
+    ... # ParserWarning: Falling back to the 'python' engine...
+
+    Adding `engine='python'` to `pd.read_csv` removes the Warning:
+
+    >>> df = pd.read_csv(io.StringIO(csv), sep="[;,]", engine="python")
+    """
+
+
+class MergeError(ValueError):
+    """
+    Exception raised when merging data.
+
+    Subclass of ``ValueError``.
+
+    See Also
+    --------
+    DataFrame.join : For joining DataFrames on their indexes.
+    merge : For merging two DataFrames on a common set of keys.
+
+    Examples
+    --------
+    >>> left = pd.DataFrame(
+    ...     {"a": ["a", "b", "b", "d"], "b": ["cat", "dog", "weasel", "horse"]},
+    ...     index=range(4),
+    ... )
+    >>> right = pd.DataFrame(
+    ...     {"a": ["a", "b", "c", "d"], "c": ["meow", "bark", "chirp", "nay"]},
+    ...     index=range(4),
+    ... ).set_index("a")
+    >>> left.join(
+    ...     right,
+    ...     on="a",
+    ...     validate="one_to_one",
+    ... )
+    Traceback (most recent call last):
+    MergeError: Merge keys are not unique in left dataset; not a one-to-one merge
+    """
+
+
+class AbstractMethodError(NotImplementedError):
+    """
+    Raise this error instead of NotImplementedError for abstract methods.
+
+    The `AbstractMethodError` is designed for use in classes that follow an abstract
+    base class pattern. By raising this error in the method, it ensures that a subclass
+    must implement the method to provide specific functionality. This is useful in a
+    framework or library where certain methods must be implemented by the user to
+    ensure correct behavior.
+
+    Parameters
+    ----------
+    class_instance : object
+        The instance of the class where the abstract method is being called.
+    methodtype : str, default "method"
+        A string indicating the type of method that is abstract.
+        Must be one of {"method", "classmethod", "staticmethod", "property"}.
+
+    See Also
+    --------
+    api.extensions.ExtensionArray
+        An example of a pandas extension mechanism that requires implementing
+        specific abstract methods.
+    NotImplementedError
+        A built-in exception that can also be used for abstract methods but lacks
+        the specificity of `AbstractMethodError` in indicating the need for subclass
+        implementation.
+
+    Examples
+    --------
+    >>> class Foo:
+    ...     @classmethod
+    ...     def classmethod(cls):
+    ...         raise pd.errors.AbstractMethodError(cls, methodtype="classmethod")
+    ...
+    ...     def method(self):
+    ...         raise pd.errors.AbstractMethodError(self)
+    >>> test = Foo.classmethod()
+    Traceback (most recent call last):
+    AbstractMethodError: This classmethod must be defined in the concrete class Foo
+
+    >>> test2 = Foo().method()
+    Traceback (most recent call last):
+    AbstractMethodError: This classmethod must be defined in the concrete class Foo
+    """
+
+    def __init__(self, class_instance, methodtype: str = "method") -> None:
+        types = {"method", "classmethod", "staticmethod", "property"}
+        if methodtype not in types:
+            raise ValueError(
+                f"methodtype must be one of {types}, got {methodtype} instead."
+            )
+        self.methodtype = methodtype
+        self.class_instance = class_instance
+
+    def __str__(self) -> str:
+        if self.methodtype == "classmethod":
+            name = self.class_instance.__name__
+        else:
+            name = type(self.class_instance).__name__
+        return f"This {self.methodtype} must be defined in the concrete class {name}"
+
+
+class NumbaUtilError(Exception):
+    """
+    Error raised for unsupported Numba engine routines.
+
+    See Also
+    --------
+    DataFrame.groupby : Group DataFrame using a mapper or by a Series of columns.
+    Series.groupby : Group Series using a mapper or by a Series of columns.
+    DataFrame.agg : Aggregate using one or more operations over the specified axis.
+    Series.agg : Aggregate using one or more operations over the specified axis.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(
+    ...     {"key": ["a", "a", "b", "b"], "data": [1, 2, 3, 4]}, columns=["key", "data"]
+    ... )
+    >>> def incorrect_function(x):
+    ...     return sum(x) * 2.7
+    >>> df.groupby("key").agg(incorrect_function, engine="numba")
+    Traceback (most recent call last):
+    NumbaUtilError: The first 2 arguments to incorrect_function
+    must be ['values', 'index']
+    """
+
+
+class DuplicateLabelError(ValueError):
+    """
+    Error raised when an operation would introduce duplicate labels.
+
+    This error is typically encountered when performing operations on objects
+    with `allows_duplicate_labels=False` and the operation would result in
+    duplicate labels in the index. Duplicate labels can lead to ambiguities
+    in indexing and reduce data integrity.
+
+    See Also
+    --------
+    Series.set_flags : Return a new ``Series`` object with updated flags.
+    DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags.
+    Series.reindex : Conform ``Series`` object to new index with optional filling logic.
+    DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling
+        logic.
+
+    Examples
+    --------
+    >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags(
+    ...     allows_duplicate_labels=False
+    ... )
+    >>> s.reindex(["a", "a", "b"])
+    Traceback (most recent call last):
+       ...
+    DuplicateLabelError: Index has duplicates.
+          positions
+    label
+    a        [0, 1]
+    """
+
+
+class InvalidIndexError(Exception):
+    """
+    Exception raised when attempting to use an invalid index key.
+
+    This exception is triggered when a user attempts to access or manipulate
+    data in a pandas DataFrame or Series using an index key that is not valid
+    for the given object. This may occur in cases such as using a malformed
+    slice, a mismatched key for a ``MultiIndex``, or attempting to access an index
+    element that does not exist.
+
+    See Also
+    --------
+    MultiIndex : A multi-level, or hierarchical, index object for pandas objects.
+
+    Examples
+    --------
+    >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]])
+    >>> df = pd.DataFrame([[1, 1, 2, 2], [3, 3, 4, 4]], columns=idx)
+    >>> df
+        x       y
+        0   1   0   1
+    0   1   1   2   2
+    1   3   3   4   4
+    >>> df[:, 0]
+    Traceback (most recent call last):
+    InvalidIndexError: (slice(None, None, None), 0)
+    """
+
+
+class DataError(Exception):
+    """
+    Exception raised when performing an operation on non-numerical data.
+
+    For example, calling ``ohlc`` on a non-numerical column or a function
+    on a rolling window.
+
+    See Also
+    --------
+    Series.rolling : Provide rolling window calculations on Series object.
+    DataFrame.rolling : Provide rolling window calculations on DataFrame object.
+
+    Examples
+    --------
+    >>> ser = pd.Series(["a", "b", "c"])
+    >>> ser.rolling(2).sum()
+    Traceback (most recent call last):
+    DataError: No numeric types to aggregate
+    """
+
+
+class SpecificationError(Exception):
+    """
+    Exception raised by ``agg`` when the functions are ill-specified.
+
+    The exception raised in two scenarios.
+
+    The first way is calling ``agg`` on a
+    Dataframe or Series using a nested renamer (dict-of-dict).
+
+    The second way is calling ``agg`` on a Dataframe with duplicated functions
+    names without assigning column name.
+
+    See Also
+    --------
+    DataFrame.agg : Aggregate using one or more operations over the specified axis.
+    Series.agg : Aggregate using one or more operations over the specified axis.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})
+    >>> df.groupby("A").B.agg({"foo": "count"})  # doctest: +SKIP
+    ... # SpecificationError: nested renamer is not supported
+
+    >>> df.groupby("A").agg({"B": {"foo": ["sum", "max"]}})  # doctest: +SKIP
+    ... # SpecificationError: nested renamer is not supported
+
+    >>> df.groupby("A").agg(["min", "min"])  # doctest: +SKIP
+    ... # SpecificationError: nested renamer is not supported
+    """
+
+
+class ChainedAssignmentError(Warning):
+    """
+    Warning raised when trying to set using chained assignment.
+
+    With Copy-on-Write now always enabled, chained assignment can
+    never work. In such a situation, we are always setting into a temporary
+    object that is the result of an indexing operation (getitem), which under
+    Copy-on-Write always behaves as a copy. Thus, assigning through a chain
+    can never update the original Series or DataFrame.
+
+    For more information on Copy-on-Write,
+    see :ref:`the user guide<copy_on_write>`.
+
+    See Also
+    --------
+    DataFrame.loc : Access a group of rows and columns by label(s) or a boolean array.
+    DataFrame.iloc : Purely integer-location based indexing for selection by position.
+    Series.loc : Access a group of rows by label(s) or a boolean array.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2]}, columns=["A"])
+    >>> df["A"][0:3] = 10  # doctest: +SKIP
+    ... # ChainedAssignmentError: ...
+    """
+
+
+class NumExprClobberingError(NameError):
+    """
+    Exception raised when trying to use a built-in numexpr name as a variable name.
+
+    ``eval`` or ``query`` will throw the error if the engine is set
+    to 'numexpr'. 'numexpr' is the default engine value for these methods if the
+    numexpr package is installed.
+
+    See Also
+    --------
+    eval : Evaluate a Python expression as a string using various backends.
+    DataFrame.query : Query the columns of a DataFrame with a boolean expression.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"abs": [1, 1, 1]})
+    >>> df.query("abs > 2")  # doctest: +SKIP
+    ... # NumExprClobberingError: Variables in expression "(abs) > (2)" overlap...
+    >>> sin, a = 1, 2
+    >>> pd.eval("sin + a", engine="numexpr")  # doctest: +SKIP
+    ... # NumExprClobberingError: Variables in expression "(sin) + (a)" overlap...
+    """
+
+
+class UndefinedVariableError(NameError):
+    """
+    Exception raised by ``query`` or ``eval`` when using an undefined variable name.
+
+    It will also specify whether the undefined variable is local or not.
+
+    Parameters
+    ----------
+    name : str
+        The name of the undefined variable.
+    is_local : bool or None, optional
+        Indicates whether the undefined variable is considered a local variable.
+        If ``True``, the error message specifies it as a local variable.
+        If ``False`` or ``None``, the variable is treated as a non-local name.
+
+    See Also
+    --------
+    DataFrame.query : Query the columns of a DataFrame with a boolean expression.
+    DataFrame.eval : Evaluate a string describing operations on DataFrame columns.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"A": [1, 1, 1]})
+    >>> df.query("A > x")  # doctest: +SKIP
+    ... # UndefinedVariableError: name 'x' is not defined
+    >>> df.query("A > @y")  # doctest: +SKIP
+    ... # UndefinedVariableError: local variable 'y' is not defined
+    >>> pd.eval("x + 1")  # doctest: +SKIP
+    ... # UndefinedVariableError: name 'x' is not defined
+    """
+
+    def __init__(self, name: str, is_local: bool | None = None) -> None:
+        base_msg = f"{name!r} is not defined"
+        if is_local:
+            msg = f"local variable {base_msg}"
+        else:
+            msg = f"name {base_msg}"
+        super().__init__(msg)
+
+
+class IndexingError(Exception):
+    """
+    Exception is raised when trying to index and there is a mismatch in dimensions.
+
+    Raised by properties like :attr:`.pandas.DataFrame.iloc` when
+    an indexer is out of bounds or :attr:`.pandas.DataFrame.loc` when its index is
+    unalignable to the frame index.
+
+    See Also
+    --------
+    DataFrame.iloc : Purely integer-location based indexing for \
+    selection by position.
+    DataFrame.loc : Access a group of rows and columns by label(s) \
+    or a boolean array.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"A": [1, 1, 1]})
+    >>> df.loc[..., ..., "A"]  # doctest: +SKIP
+    ... # IndexingError: indexer may only contain one '...' entry
+    >>> df = pd.DataFrame({"A": [1, 1, 1]})
+    >>> df.loc[1, ..., ...]  # doctest: +SKIP
+    ... # IndexingError: Too many indexers
+    >>> df[pd.Series([True], dtype=bool)]  # doctest: +SKIP
+    ... # IndexingError: Unalignable boolean Series provided as indexer...
+    >>> s = pd.Series(range(2), index=pd.MultiIndex.from_product([["a", "b"], ["c"]]))
+    >>> s.loc["a", "c", "d"]  # doctest: +SKIP
+    ... # IndexingError: Too many indexers
+    """
+
+
+class PyperclipException(RuntimeError):
+    """
+    Exception raised when clipboard functionality is unsupported.
+
+    Raised by ``to_clipboard()`` and ``read_clipboard()``.
+    """
+
+
+class PyperclipWindowsException(PyperclipException):
+    """
+    Exception raised when clipboard functionality is unsupported by Windows.
+
+    Access to the clipboard handle would be denied due to some other
+    window process is accessing it.
+    """
+
+    def __init__(self, message: str) -> None:
+        # attr only exists on Windows, so typing fails on other platforms
+        message += f" ({ctypes.WinError()})"  # type: ignore[attr-defined]
+        super().__init__(message)
+
+
+class CSSWarning(UserWarning):
+    """
+    Warning is raised when converting css styling fails.
+
+    This can be due to the styling not having an equivalent value or because the
+    styling isn't properly formatted.
+
+    See Also
+    --------
+    DataFrame.style : Returns a Styler object for applying CSS-like styles.
+    io.formats.style.Styler : Helps style a DataFrame or Series according to the
+        data with HTML and CSS.
+    io.formats.style.Styler.to_excel : Export styled DataFrame to Excel.
+    io.formats.style.Styler.to_html : Export styled DataFrame to HTML.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"A": [1, 1, 1]})
+    >>> df.style.map(lambda x: "background-color: blueGreenRed;").to_excel(
+    ...     "styled.xlsx"
+    ... )  # doctest: +SKIP
+    CSSWarning: Unhandled color format: 'blueGreenRed'
+    >>> df.style.map(lambda x: "border: 1px solid red red;").to_excel(
+    ...     "styled.xlsx"
+    ... )  # doctest: +SKIP
+    CSSWarning: Unhandled color format: 'blueGreenRed'
+    """
+
+
+class PossibleDataLossError(Exception):
+    """
+    Exception raised when trying to open an HDFStore file when already opened.
+
+    This error is triggered when there is a potential risk of data loss due to
+    conflicting operations on an HDFStore file. It serves to prevent unintended
+    overwrites or data corruption by enforcing exclusive access to the file.
+
+    See Also
+    --------
+    HDFStore : Dict-like IO interface for storing pandas objects in PyTables.
+    HDFStore.open : Open an HDFStore file in the specified mode.
+
+    Examples
+    --------
+    >>> store = pd.HDFStore("my-store", "a")  # doctest: +SKIP
+    >>> store.open("w")  # doctest: +SKIP
+    """
+
+
+class ClosedFileError(Exception):
+    """
+    Exception is raised when trying to perform an operation on a closed HDFStore file.
+
+    ``ClosedFileError`` is specific to operations on ``HDFStore`` objects. Once an
+    HDFStore is closed, its resources are no longer available, and any further attempt
+    to access data or perform file operations will raise this exception.
+
+    See Also
+    --------
+    HDFStore.close : Closes the PyTables file handle.
+    HDFStore.open : Opens the file in the specified mode.
+    HDFStore.is_open : Returns a boolean indicating whether the file is open.
+
+    Examples
+    --------
+    >>> store = pd.HDFStore("my-store", "a")  # doctest: +SKIP
+    >>> store.close()  # doctest: +SKIP
+    >>> store.keys()  # doctest: +SKIP
+    ... # ClosedFileError: my-store file is not open!
+    """
+
+
+class IncompatibilityWarning(Warning):
+    """
+    Warning raised when trying to use where criteria on an incompatible HDF5 file.
+    """
+
+
+class AttributeConflictWarning(Warning):
+    """
+    Warning raised when index attributes conflict when using HDFStore.
+
+    Occurs when attempting to append an index with a different
+    name than the existing index on an HDFStore or attempting to append an index with a
+    different frequency than the existing index on an HDFStore.
+
+    See Also
+    --------
+    HDFStore : Dict-like IO interface for storing pandas objects in PyTables.
+    DataFrame.to_hdf : Write the contained data to an HDF5 file using HDFStore.
+    read_hdf : Read from an HDF5 file into a DataFrame.
+
+    Examples
+    --------
+    >>> idx1 = pd.Index(["a", "b"], name="name1")
+    >>> df1 = pd.DataFrame([[1, 2], [3, 4]], index=idx1)
+    >>> df1.to_hdf("file", "data", "w", append=True)  # doctest: +SKIP
+    >>> idx2 = pd.Index(["c", "d"], name="name2")
+    >>> df2 = pd.DataFrame([[5, 6], [7, 8]], index=idx2)
+    >>> df2.to_hdf("file", "data", "a", append=True)  # doctest: +SKIP
+    AttributeConflictWarning: the [index_name] attribute of the existing index is
+    [name1] which conflicts with the new [name2]...
+    """
+
+
+class DatabaseError(OSError):
+    """
+    Error is raised when executing SQL with bad syntax or SQL that throws an error.
+
+    Raised by :func:`.pandas.read_sql` when a bad SQL statement is passed in.
+
+    See Also
+    --------
+    read_sql : Read SQL query or database table into a DataFrame.
+
+    Examples
+    --------
+    >>> from sqlite3 import connect
+    >>> conn = connect(":memory:")
+    >>> pd.read_sql("select * test", conn)  # doctest: +SKIP
+    """
+
+
+class PossiblePrecisionLoss(Warning):
+    """
+    Warning raised by to_stata on a column with a value outside or equal to int64.
+
+    When the column value is outside or equal to the int64 value the column is
+    converted to a float64 dtype.
+
+    See Also
+    --------
+    DataFrame.to_stata : Export DataFrame object to Stata dta format.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"s": pd.Series([1, 2**53], dtype=np.int64)})
+    >>> df.to_stata("test")  # doctest: +SKIP
+    """
+
+
+class ValueLabelTypeMismatch(Warning):
+    """
+    Warning raised by to_stata on a category column that contains non-string values.
+
+    When exporting data to Stata format using the `to_stata` method, category columns
+    must have string values as labels. If a category column contains non-string values
+    (e.g., integers, floats, or other types), this warning is raised to indicate that
+    the Stata file may not correctly represent the data.
+
+    See Also
+    --------
+    DataFrame.to_stata : Export DataFrame object to Stata dta format.
+    Series.cat : Accessor for categorical properties of the Series values.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"categories": pd.Series(["a", 2], dtype="category")})
+    >>> df.to_stata("test")  # doctest: +SKIP
+    """
+
+
+class InvalidColumnName(Warning):
+    """
+    Warning raised by to_stata the column contains a non-valid stata name.
+
+    Because the column name is an invalid Stata variable, the name needs to be
+    converted.
+
+    See Also
+    --------
+    DataFrame.to_stata : Export DataFrame object to Stata dta format.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame({"0categories": pd.Series([2, 2])})
+    >>> df.to_stata("test")  # doctest: +SKIP
+    """
+
+
+class CategoricalConversionWarning(Warning):
+    """
+    Warning is raised when reading a partial labeled Stata file using an iterator.
+
+    This warning helps ensure data integrity and alerts users to potential issues
+    during the incremental reading of Stata files with labeled data, allowing for
+    additional checks and adjustments as necessary.
+
+    See Also
+    --------
+    read_stata : Read a Stata file into a DataFrame.
+    Categorical : Represents a categorical variable in pandas.
+
+    Examples
+    --------
+    >>> from pandas.io.stata import StataReader
+    >>> with StataReader("dta_file", chunksize=2) as reader:  # doctest: +SKIP
+    ...     for i, block in enumerate(reader):
+    ...         print(i, block)
+    ... # CategoricalConversionWarning: One or more series with value labels...
+    """
+
+
+class LossySetitemError(Exception):
+    """
+    Raised when trying to do a __setitem__ on an np.ndarray that is not lossless.
+
+    Notes
+    -----
+    This is an internal error.
+    """
+
+
+class NoBufferPresent(Exception):
+    """
+    Exception is raised in _get_data_buffer to signal that there is no requested buffer.
+    """
+
+
+class InvalidComparison(Exception):
+    """
+    Exception is raised by _validate_comparison_value to indicate an invalid comparison.
+
+    Notes
+    -----
+    This is an internal error.
+    """
+
+
+__all__ = [
+    "AbstractMethodError",
+    "AttributeConflictWarning",
+    "CSSWarning",
+    "CategoricalConversionWarning",
+    "ChainedAssignmentError",
+    "ClosedFileError",
+    "DataError",
+    "DatabaseError",
+    "DtypeWarning",
+    "DuplicateLabelError",
+    "EmptyDataError",
+    "IncompatibilityWarning",
+    "IncompatibleFrequency",
+    "IndexingError",
+    "IntCastingNaNError",
+    "InvalidColumnName",
+    "InvalidComparison",
+    "InvalidIndexError",
+    "InvalidVersion",
+    "LossySetitemError",
+    "MergeError",
+    "NoBufferPresent",
+    "NullFrequencyError",
+    "NumExprClobberingError",
+    "NumbaUtilError",
+    "OptionError",
+    "OutOfBoundsDatetime",
+    "OutOfBoundsTimedelta",
+    "Pandas4Warning",
+    "Pandas5Warning",
+    "PandasChangeWarning",
+    "PandasDeprecationWarning",
+    "PandasFutureWarning",
+    "PandasPendingDeprecationWarning",
+    "ParserError",
+    "ParserWarning",
+    "PerformanceWarning",
+    "PossibleDataLossError",
+    "PossiblePrecisionLoss",
+    "PyperclipException",
+    "PyperclipWindowsException",
+    "SpecificationError",
+    "UndefinedVariableError",
+    "UnsortedIndexError",
+    "UnsupportedFunctionCall",
+    "ValueLabelTypeMismatch",
+]
diff --git a/pandas/errors/cow.py b/pandas/errors/cow.py
new file mode 100644
index 0000000000000000000000000000000000000000..8516c33b9d9dcc85e3aeb1bd74068e7a298c9c68
--- /dev/null
+++ b/pandas/errors/cow.py
@@ -0,0 +1,43 @@
+_chained_assignment_msg = (
+    "A value is being set on a copy of a DataFrame or Series "
+    "through chained assignment.\n"
+    "Such chained assignment never works to update the original DataFrame or "
+    "Series, because the intermediate object on which we are setting values "
+    "always behaves as a copy (due to Copy-on-Write).\n\n"
+    "Try using '.loc[row_indexer, col_indexer] = value' instead, to perform "
+    "the assignment in a single step.\n\n"
+    "See the documentation for a more detailed explanation: "
+    "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
+    "copy_on_write.html#chained-assignment"
+)
+
+
+_chained_assignment_method_msg = (
+    "A value is being set on a copy of a DataFrame or Series "
+    "through chained assignment using an inplace method.\n"
+    "Such inplace method never works to update the original DataFrame or Series, "
+    "because the intermediate object on which we are setting values always "
+    "behaves as a copy (due to Copy-on-Write).\n\n"
+    "For example, when doing 'df[col].method(value, inplace=True)', try "
+    "using 'df.method({col: value}, inplace=True)' instead, to perform "
+    "the operation inplace on the original object, or try to avoid an inplace "
+    "operation using 'df[col] = df[col].method(value)'.\n\n"
+    "See the documentation for a more detailed explanation: "
+    "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
+    "copy_on_write.html"
+)
+
+
+_chained_assignment_method_update_msg = (
+    "A value is being set on a copy of a DataFrame or Series "
+    "through chained assignment using an inplace method.\n"
+    "Such inplace method never works to update the original DataFrame or Series, "
+    "because the intermediate object on which we are setting values always "
+    "behaves as a copy (due to Copy-on-Write).\n\n"
+    "For example, when doing 'df[col].update(other)', try "
+    "using 'df.update({col: other})' instead, to perform "
+    "the operation inplace on the original object.\n\n"
+    "See the documentation for a more detailed explanation: "
+    "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
+    "copy_on_write.html"
+)
diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c7e531debb1426624186453b622cfccd11d44ef
--- /dev/null
+++ b/pandas/io/__init__.py
@@ -0,0 +1,13 @@
+# ruff: noqa: TC004
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    # import modules that have public classes/functions
+    from pandas.io import (
+        formats,
+        json,
+        stata,
+    )
+
+    # mark only those modules as public
+    __all__ = ["formats", "json", "stata"]
diff --git a/pandas/io/_util.py b/pandas/io/_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..da9ac3913cbbd45b80c47987dd5b3c523da8e8b4
--- /dev/null
+++ b/pandas/io/_util.py
@@ -0,0 +1,191 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+)
+
+import numpy as np
+
+from pandas._config import using_string_dtype
+
+from pandas._libs import lib
+from pandas.compat import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.core.dtypes.common import pandas_dtype
+
+import pandas as pd
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+        Sequence,
+    )
+
+    import pyarrow
+
+    from pandas._typing import (
+        DtypeArg,
+        DtypeBackend,
+    )
+
+
+def _arrow_dtype_mapping() -> dict:
+    pa = import_optional_dependency("pyarrow")
+    return {
+        pa.int8(): pd.Int8Dtype(),
+        pa.int16(): pd.Int16Dtype(),
+        pa.int32(): pd.Int32Dtype(),
+        pa.int64(): pd.Int64Dtype(),
+        pa.uint8(): pd.UInt8Dtype(),
+        pa.uint16(): pd.UInt16Dtype(),
+        pa.uint32(): pd.UInt32Dtype(),
+        pa.uint64(): pd.UInt64Dtype(),
+        pa.bool_(): pd.BooleanDtype(),
+        pa.string(): pd.StringDtype(),
+        pa.float32(): pd.Float32Dtype(),
+        pa.float64(): pd.Float64Dtype(),
+        pa.string(): pd.StringDtype(),
+        pa.large_string(): pd.StringDtype(),
+    }
+
+
+def _arrow_string_types_mapper() -> Callable:
+    pa = import_optional_dependency("pyarrow")
+
+    mapping = {
+        pa.string(): pd.StringDtype(na_value=np.nan),
+        pa.large_string(): pd.StringDtype(na_value=np.nan),
+    }
+    if not pa_version_under18p0:
+        mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
+
+    return mapping.get
+
+
+def arrow_table_to_pandas(
+    table: pyarrow.Table,
+    dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
+    null_to_int64: bool = False,
+    to_pandas_kwargs: dict | None = None,
+    dtype: DtypeArg | None = None,
+    names: Sequence[Hashable] | None = None,
+) -> pd.DataFrame:
+    pa = import_optional_dependency("pyarrow")
+
+    to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs
+
+    types_mapper: type[pd.ArrowDtype] | None | Callable
+    if dtype_backend == "numpy_nullable":
+        mapping = _arrow_dtype_mapping()
+        if null_to_int64:
+            # Modify the default mapping to also map null to Int64
+            # (to match other engines - only for CSV parser)
+            mapping[pa.null()] = pd.Int64Dtype()
+        types_mapper = mapping.get
+    elif dtype_backend == "pyarrow":
+        types_mapper = pd.ArrowDtype
+    elif using_string_dtype():
+        if pa_version_under19p0:
+            types_mapper = _arrow_string_types_mapper()
+        elif dtype is not None:
+            # GH#56136 Avoid lossy conversion to float64
+            # We'll convert to numpy below if
+            types_mapper = {
+                pa.int8(): pd.Int8Dtype(),
+                pa.int16(): pd.Int16Dtype(),
+                pa.int32(): pd.Int32Dtype(),
+                pa.int64(): pd.Int64Dtype(),
+            }.get
+        else:
+            types_mapper = None
+    elif dtype_backend is lib.no_default or dtype_backend == "numpy":
+        if dtype is not None:
+            # GH#56136 Avoid lossy conversion to float64
+            # We'll convert to numpy below if
+            types_mapper = {
+                pa.int8(): pd.Int8Dtype(),
+                pa.int16(): pd.Int16Dtype(),
+                pa.int32(): pd.Int32Dtype(),
+                pa.int64(): pd.Int64Dtype(),
+            }.get
+        else:
+            types_mapper = None
+    else:
+        raise NotImplementedError
+
+    df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
+    return _post_convert_dtypes(df, dtype_backend, dtype, names)
+
+
+def _post_convert_dtypes(
+    df: pd.DataFrame,
+    dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault,
+    dtype: DtypeArg | None,
+    names: Sequence[Hashable] | None,
+) -> pd.DataFrame:
+    if dtype is not None and (
+        dtype_backend is lib.no_default or dtype_backend == "numpy"
+    ):
+        # GH#56136 apply any user-provided dtype, and convert any IntegerDtype
+        #  columns the user didn't explicitly ask for.
+        if isinstance(dtype, dict):
+            if names is not None:
+                df.columns = names
+
+            cmp_dtypes = {
+                pd.Int8Dtype(),
+                pd.Int16Dtype(),
+                pd.Int32Dtype(),
+                pd.Int64Dtype(),
+            }
+            for col in df.columns:
+                if col not in dtype and df[col].dtype in cmp_dtypes:
+                    # Any key that the user didn't explicitly specify
+                    #  that got converted to IntegerDtype now gets converted
+                    #  to numpy dtype.
+                    dtype[col] = df[col].dtype.numpy_dtype
+
+            # Ignore non-existent columns from dtype mapping
+            # like other parsers do
+            dtype = {
+                key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns
+            }
+
+        else:
+            dtype = pandas_dtype(dtype)
+
+        try:
+            df = df.astype(dtype)
+        except TypeError as err:
+            # GH#44901 reraise to keep api consistent
+            raise ValueError(str(err)) from err
+
+    if (
+        not using_string_dtype()
+        and dtype != "str"
+        and (dtype_backend is lib.no_default or dtype_backend == "numpy")
+    ):
+        # Convert any StringDtype columns back to object dtype (pyarrow always
+        # uses string dtype even when the infer_string option is False)
+        for col, dtype in zip(df.columns, df.dtypes, strict=True):
+            if isinstance(dtype, pd.StringDtype) and dtype.na_value is np.nan:
+                df[col] = df[col].astype("object").fillna(None)
+            if isinstance(dtype, pd.CategoricalDtype):
+                cat_dtype = dtype.categories.dtype
+                if (
+                    isinstance(cat_dtype, pd.StringDtype)
+                    and cat_dtype.na_value is np.nan
+                ):
+                    cat_dtype = pd.CategoricalDtype(
+                        categories=dtype.categories.astype("object"),
+                        ordered=dtype.ordered,
+                    )
+                    df[col] = df[col].astype(cat_dtype)
+
+    return df
diff --git a/pandas/io/api.py b/pandas/io/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d9f38999f787cdc96f3162934bfdcba251ccf5d
--- /dev/null
+++ b/pandas/io/api.py
@@ -0,0 +1,65 @@
+"""
+Data I/O API
+"""
+
+from pandas.io.clipboards import read_clipboard
+from pandas.io.excel import (
+    ExcelFile,
+    ExcelWriter,
+    read_excel,
+)
+from pandas.io.feather_format import read_feather
+from pandas.io.html import read_html
+from pandas.io.iceberg import read_iceberg
+from pandas.io.json import read_json
+from pandas.io.orc import read_orc
+from pandas.io.parquet import read_parquet
+from pandas.io.parsers import (
+    read_csv,
+    read_fwf,
+    read_table,
+)
+from pandas.io.pickle import (
+    read_pickle,
+    to_pickle,
+)
+from pandas.io.pytables import (
+    HDFStore,
+    read_hdf,
+)
+from pandas.io.sas import read_sas
+from pandas.io.spss import read_spss
+from pandas.io.sql import (
+    read_sql,
+    read_sql_query,
+    read_sql_table,
+)
+from pandas.io.stata import read_stata
+from pandas.io.xml import read_xml
+
+__all__ = [
+    "ExcelFile",
+    "ExcelWriter",
+    "HDFStore",
+    "read_clipboard",
+    "read_csv",
+    "read_excel",
+    "read_feather",
+    "read_fwf",
+    "read_hdf",
+    "read_html",
+    "read_iceberg",
+    "read_json",
+    "read_orc",
+    "read_parquet",
+    "read_pickle",
+    "read_sas",
+    "read_spss",
+    "read_sql",
+    "read_sql_query",
+    "read_sql_table",
+    "read_stata",
+    "read_table",
+    "read_xml",
+    "to_pickle",
+]
diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a562481f0e98726a67acedb3b7f48183d676057
--- /dev/null
+++ b/pandas/io/clipboards.py
@@ -0,0 +1,200 @@
+"""io on the clipboard"""
+
+from __future__ import annotations
+
+from io import StringIO
+from typing import TYPE_CHECKING
+import warnings
+
+from pandas._libs import lib
+from pandas.util._decorators import set_module
+from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.dtypes.generic import ABCDataFrame
+
+from pandas import (
+    get_option,
+    option_context,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import DtypeBackend
+
+
+@set_module("pandas")
+def read_clipboard(
+    sep: str = r"\s+",
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    **kwargs,
+):  # pragma: no cover
+    r"""
+    Read text from clipboard and pass to :func:`~pandas.read_csv`.
+
+    Parses clipboard contents similar to how CSV files are parsed
+    using :func:`~pandas.read_csv`.
+
+    Parameters
+    ----------
+    sep : str, default '\\s+'
+        A string or regex delimiter. The default of ``'\\s+'`` denotes
+        one or more whitespace characters.
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    **kwargs
+        See :func:`~pandas.read_csv` for the full argument list.
+
+    Returns
+    -------
+    DataFrame
+        A parsed :class:`~pandas.DataFrame` object.
+
+    See Also
+    --------
+    DataFrame.to_clipboard : Copy object to the system clipboard.
+    read_csv : Read a comma-separated values (csv) file into DataFrame.
+    read_fwf : Read a table of fixed-width formatted lines into DataFrame.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
+    >>> df.to_clipboard()  # doctest: +SKIP
+    >>> pd.read_clipboard()  # doctest: +SKIP
+         A  B  C
+    0    1  2  3
+    1    4  5  6
+    """
+    encoding = kwargs.pop("encoding", "utf-8")
+
+    # only utf-8 is valid for passed value because that's what clipboard
+    # supports
+    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
+        raise NotImplementedError("reading from clipboard only supports utf-8 encoding")
+
+    check_dtype_backend(dtype_backend)
+
+    from pandas.io.clipboard import clipboard_get
+    from pandas.io.parsers import read_csv
+
+    text = clipboard_get()
+
+    # Try to decode (if needed, as "text" might already be a string here).
+    try:
+        text = text.decode(kwargs.get("encoding") or get_option("display.encoding"))
+    except AttributeError:
+        pass
+
+    # Excel copies into clipboard with \t separation
+    # inspect no more then the 10 first lines, if they
+    # all contain an equal number (>0) of tabs, infer
+    # that this came from excel and set 'sep' accordingly
+    lines = text[:10000].split("\n")[:-1][:10]
+
+    # Need to remove leading white space, since read_csv
+    # accepts:
+    #    a  b
+    # 0  1  2
+    # 1  3  4
+
+    counts = {x.lstrip(" ").count("\t") for x in lines}
+    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
+        sep = "\t"
+        # check the number of leading tabs in the first line
+        # to account for index columns
+        index_length = len(lines[0]) - len(lines[0].lstrip(" \t"))
+        if index_length != 0:
+            kwargs.setdefault("index_col", list(range(index_length)))
+
+    elif not isinstance(sep, str):
+        raise ValueError(f"{sep=} must be a string")
+
+    # Regex separator currently only works with python engine.
+    # Default to python if separator is multi-character (regex)
+    if len(sep) > 1 and kwargs.get("engine") is None:
+        kwargs["engine"] = "python"
+    elif len(sep) > 1 and kwargs.get("engine") == "c":
+        warnings.warn(
+            "read_clipboard with regex separator does not work properly with c engine.",
+            stacklevel=find_stack_level(),
+        )
+
+    return read_csv(StringIO(text), sep=sep, dtype_backend=dtype_backend, **kwargs)
+
+
+def to_clipboard(
+    obj, excel: bool | None = True, sep: str | None = None, **kwargs
+) -> None:  # pragma: no cover
+    """
+    Attempt to write text representation of object to the system clipboard
+    The clipboard can be then pasted into Excel for example.
+
+    Parameters
+    ----------
+    obj : the object to write to the clipboard
+    excel : bool, defaults to True
+            if True, use the provided separator, writing in a csv
+            format for allowing easy pasting into excel.
+            if False, write a string representation of the object
+            to the clipboard
+    sep : optional, defaults to tab
+    other keywords are passed to to_csv
+
+    Notes
+    -----
+    Requirements for your platform
+      - Linux: xclip, or xsel (with PyQt4 modules)
+      - Windows:
+      - OS X:
+    """
+    encoding = kwargs.pop("encoding", "utf-8")
+
+    # testing if an invalid encoding is passed to clipboard
+    if encoding is not None and encoding.lower().replace("-", "") != "utf8":
+        raise ValueError("clipboard only supports utf-8 encoding")
+
+    from pandas.io.clipboard import clipboard_set
+
+    if excel is None:
+        excel = True
+
+    if excel:
+        try:
+            if sep is None:
+                sep = "\t"
+            buf = StringIO()
+
+            # clipboard_set (pyperclip) expects unicode
+            obj.to_csv(buf, sep=sep, encoding="utf-8", **kwargs)
+            text = buf.getvalue()
+
+            clipboard_set(text)
+            return
+        except TypeError:
+            warnings.warn(
+                "to_clipboard in excel mode requires a single character separator.",
+                stacklevel=find_stack_level(),
+            )
+    elif sep is not None:
+        warnings.warn(
+            "to_clipboard with excel=False ignores the sep argument.",
+            stacklevel=find_stack_level(),
+        )
+
+    if isinstance(obj, ABCDataFrame):
+        # str(df) has various unhelpful defaults, like truncation
+        with option_context("display.max_colwidth", None):
+            objstr = obj.to_string(**kwargs)
+    else:
+        objstr = str(obj)
+    clipboard_set(objstr)
diff --git a/pandas/io/common.py b/pandas/io/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f4f9f604786e8e8b3eab852c51e212cf50b9c8
--- /dev/null
+++ b/pandas/io/common.py
@@ -0,0 +1,1327 @@
+"""Common I/O API utilities"""
+
+from __future__ import annotations
+
+from abc import (
+    ABC,
+    abstractmethod,
+)
+import codecs
+from collections import defaultdict
+from collections.abc import (
+    Hashable,
+    Mapping,
+    Sequence,
+)
+import dataclasses
+import functools
+import gzip
+from io import (
+    BufferedIOBase,
+    BytesIO,
+    RawIOBase,
+    StringIO,
+    TextIOBase,
+    TextIOWrapper,
+)
+import mmap
+import os
+from pathlib import Path
+import re
+import tarfile
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    AnyStr,
+    DefaultDict,
+    Generic,
+    Literal,
+    TypeVar,
+    cast,
+    overload,
+)
+from urllib.parse import (
+    urljoin,
+    urlparse as parse_url,
+    uses_netloc,
+    uses_params,
+    uses_relative,
+)
+import warnings
+import zipfile
+
+from pandas._typing import (
+    BaseBuffer,
+    ReadCsvBuffer,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_file_like,
+    is_integer,
+    is_list_like,
+)
+from pandas.core.dtypes.generic import ABCMultiIndex
+
+_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
+_VALID_URLS.discard("")
+_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://")
+
+BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
+
+
+if TYPE_CHECKING:
+    from types import TracebackType
+
+    from pandas._typing import (
+        CompressionDict,
+        CompressionOptions,
+        FilePath,
+        ReadBuffer,
+        StorageOptions,
+        WriteBuffer,
+    )
+
+    from pandas import MultiIndex
+
+
+@dataclasses.dataclass
+class IOArgs:
+    """
+    Return value of io/common.py:_get_filepath_or_buffer.
+    """
+
+    filepath_or_buffer: str | BaseBuffer
+    encoding: str
+    mode: str
+    compression: CompressionDict
+    should_close: bool = False
+
+
+@dataclasses.dataclass
+class IOHandles(Generic[AnyStr]):
+    """
+    Return value of io/common.py:get_handle
+
+    Can be used as a context manager.
+
+    This is used to easily close created buffers and to handle corner cases when
+    TextIOWrapper is inserted.
+
+    handle: The file handle to be used.
+    created_handles: All file handles that are created by get_handle
+    is_wrapped: Whether a TextIOWrapper needs to be detached.
+    """
+
+    # handle might not implement the IO-interface
+    handle: IO[AnyStr]
+    compression: CompressionDict
+    created_handles: list[IO[bytes] | IO[str]] = dataclasses.field(default_factory=list)
+    is_wrapped: bool = False
+
+    def close(self) -> None:
+        """
+        Close all created buffers.
+
+        Note: If a TextIOWrapper was inserted, it is flushed and detached to
+        avoid closing the potentially user-created buffer.
+        """
+        if self.is_wrapped:
+            assert isinstance(self.handle, TextIOWrapper)
+            self.handle.flush()
+            self.handle.detach()
+            self.created_handles.remove(self.handle)
+        for handle in self.created_handles:
+            handle.close()
+        self.created_handles = []
+        self.is_wrapped = False
+
+    def __enter__(self) -> IOHandles[AnyStr]:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> None:
+        self.close()
+
+
+def is_url(url: object) -> bool:
+    """
+    Check to see if a URL has a valid protocol.
+
+    Parameters
+    ----------
+    url : str or unicode
+
+    Returns
+    -------
+    isurl : bool
+        If `url` has a valid protocol return True otherwise False.
+    """
+    if not isinstance(url, str):
+        return False
+    return parse_url(url).scheme in _VALID_URLS
+
+
+@overload
+def _expand_user(filepath_or_buffer: str) -> str: ...
+
+
+@overload
+def _expand_user(filepath_or_buffer: BaseBufferT) -> BaseBufferT: ...
+
+
+def _expand_user(filepath_or_buffer: str | BaseBufferT) -> str | BaseBufferT:
+    """
+    Return the argument with an initial component of ~ or ~user
+    replaced by that user's home directory.
+
+    Parameters
+    ----------
+    filepath_or_buffer : object to be converted if possible
+
+    Returns
+    -------
+    expanded_filepath_or_buffer : an expanded filepath or the
+                                  input if not expandable
+    """
+    if isinstance(filepath_or_buffer, str):
+        return os.path.expanduser(filepath_or_buffer)
+    return filepath_or_buffer
+
+
+def validate_header_arg(header: object) -> None:
+    if header is None:
+        return
+    if is_integer(header):
+        header = cast(int, header)
+        if header < 0:
+            # GH 27779
+            raise ValueError(
+                "Passing negative integer to header is invalid. "
+                "For no header, use header=None instead"
+            )
+        return
+    if is_list_like(header, allow_sets=False):
+        header = cast(Sequence, header)
+        if not all(map(is_integer, header)):
+            raise ValueError("header must be integer or list of integers")
+        if any(i < 0 for i in header):
+            raise ValueError("cannot specify multi-index header with negative integers")
+        return
+    if is_bool(header):
+        raise TypeError(
+            "Passing a bool to header is invalid. Use header=None for no header or "
+            "header=int or list-like of ints to specify "
+            "the row(s) making up the column names"
+        )
+    # GH 16338
+    raise ValueError("header must be integer or list of integers")
+
+
+@overload
+def stringify_path(
+    filepath_or_buffer: FilePath, convert_file_like: bool = ...
+) -> str: ...
+
+
+@overload
+def stringify_path(
+    filepath_or_buffer: BaseBufferT, convert_file_like: bool = ...
+) -> BaseBufferT: ...
+
+
+def stringify_path(
+    filepath_or_buffer: FilePath | BaseBufferT,
+    convert_file_like: bool = False,
+) -> str | BaseBufferT:
+    """
+    Attempt to convert a path-like object to a string.
+
+    Parameters
+    ----------
+    filepath_or_buffer : object to be converted
+
+    Returns
+    -------
+    str_filepath_or_buffer : maybe a string version of the object
+
+    Notes
+    -----
+    Objects supporting the fspath protocol are coerced
+    according to its __fspath__ method.
+
+    Any other object is passed through unchanged, which includes bytes,
+    strings, buffers, or anything else that's not even path-like.
+    """
+    if not convert_file_like and is_file_like(filepath_or_buffer):
+        # GH 38125: some fsspec objects implement os.PathLike but have already opened a
+        # file. This prevents opening the file a second time. infer_compression calls
+        # this function with convert_file_like=True to infer the compression.
+        return cast(BaseBufferT, filepath_or_buffer)
+
+    if isinstance(filepath_or_buffer, os.PathLike):
+        filepath_or_buffer = filepath_or_buffer.__fspath__()
+    return _expand_user(filepath_or_buffer)
+
+
+def urlopen(*args: Any, **kwargs: Any) -> Any:
+    """
+    Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
+    the stdlib.
+    """
+    import urllib.request
+
+    return urllib.request.urlopen(*args, **kwargs)  # noqa: TID251
+
+
+def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
+    """
+    Returns true if the given URL looks like
+    something fsspec can handle
+    """
+    return (
+        isinstance(url, str)
+        and bool(_FSSPEC_URL_PATTERN.match(url))
+        and not url.startswith(("http://", "https://"))
+    )
+
+
+def _get_filepath_or_buffer(
+    filepath_or_buffer: FilePath | BaseBuffer,
+    encoding: str = "utf-8",
+    compression: CompressionOptions | None = None,
+    mode: str = "r",
+    storage_options: StorageOptions | None = None,
+) -> IOArgs:
+    """
+    If the filepath_or_buffer is a url, translate and return the buffer.
+    Otherwise passthrough.
+
+    Parameters
+    ----------
+    filepath_or_buffer : a url, filepath (str or pathlib.Path),
+                         or buffer
+
+    compression : str or dict, default 'infer'
+        For on-the-fly compression of the output data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        Set to ``None`` for no compression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for faster compression and to
+        create a reproducible gzip archive:
+        ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+    encoding : the encoding to use to decode bytes, default is 'utf-8'
+    mode : str, optional
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+
+    Returns the dataclass IOArgs.
+    """
+    filepath_or_buffer = stringify_path(filepath_or_buffer)
+
+    # handle compression dict
+    compression_method, compression = get_compression_method(compression)
+    compression_method = infer_compression(filepath_or_buffer, compression_method)
+
+    # GH21227 internal compression is not used for non-binary handles.
+    if compression_method and hasattr(filepath_or_buffer, "write") and "b" not in mode:
+        warnings.warn(
+            "compression has no effect when passing a non-binary object as input.",
+            RuntimeWarning,
+            stacklevel=find_stack_level(),
+        )
+        compression_method = None
+
+    compression = dict(compression, method=compression_method)
+
+    # bz2 and xz do not write the byte order mark for utf-16 and utf-32
+    # print a warning when writing such files
+    if (
+        "w" in mode
+        and compression_method in ["bz2", "xz"]
+        and encoding in ["utf-16", "utf-32"]
+    ):
+        warnings.warn(
+            f"{compression} will not write the byte order mark for {encoding}",
+            UnicodeWarning,
+            stacklevel=find_stack_level(),
+        )
+
+    if "a" in mode and compression_method in ["zip", "tar"]:
+        # GH56778
+        warnings.warn(
+            "zip and tar do not support mode 'a' properly. "
+            "This combination will result in multiple files with same name "
+            "being added to the archive.",
+            RuntimeWarning,
+            stacklevel=find_stack_level(),
+        )
+
+    # Use binary mode when converting path-like objects to file-like objects (fsspec)
+    # except when text mode is explicitly requested. The original mode is returned if
+    # fsspec is not used.
+    fsspec_mode = mode
+    if "t" not in fsspec_mode and "b" not in fsspec_mode:
+        fsspec_mode += "b"
+
+    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
+        # TODO: fsspec can also handle HTTP via requests, but leaving this
+        # unchanged. using fsspec appears to break the ability to infer if the
+        # server responded with gzipped data
+        storage_options = storage_options or {}
+
+        # waiting until now for importing to match intended lazy logic of
+        # urlopen function defined elsewhere in this module
+        import urllib.request
+
+        # assuming storage_options is to be interpreted as headers
+        req_info = urllib.request.Request(filepath_or_buffer, headers=storage_options)
+        with urlopen(req_info) as req:
+            content_encoding = req.headers.get("Content-Encoding", None)
+            if content_encoding == "gzip":
+                # Override compression based on Content-Encoding header
+                compression = {"method": "gzip"}
+            reader = BytesIO(req.read())
+        return IOArgs(
+            filepath_or_buffer=reader,
+            encoding=encoding,
+            compression=compression,
+            should_close=True,
+            mode=fsspec_mode,
+        )
+
+    if is_fsspec_url(filepath_or_buffer):
+        assert isinstance(
+            filepath_or_buffer, str
+        )  # just to appease mypy for this branch
+        # two special-case s3-like protocols; these have special meaning in Hadoop,
+        # but are equivalent to just "s3" from fsspec's point of view
+        # cc #11071
+        if filepath_or_buffer.startswith("s3a://"):
+            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
+        if filepath_or_buffer.startswith("s3n://"):
+            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
+        fsspec = import_optional_dependency("fsspec")
+
+        # If botocore is installed we fallback to reading with anon=True
+        # to allow reads from public buckets
+        err_types_to_retry_with_anon: list[Any] = []
+        try:
+            import_optional_dependency("botocore")
+            from botocore.exceptions import (
+                ClientError,
+                NoCredentialsError,
+            )
+
+            err_types_to_retry_with_anon = [
+                ClientError,
+                NoCredentialsError,
+                PermissionError,
+            ]
+        except ImportError:
+            pass
+
+        try:
+            file_obj = fsspec.open(
+                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
+            ).open()
+        # GH 34626 Reads from Public Buckets without Credentials needs anon=True
+        except tuple(err_types_to_retry_with_anon):
+            if storage_options is None:
+                storage_options = {"anon": True}
+            else:
+                # don't mutate user input.
+                storage_options = dict(storage_options)
+                storage_options["anon"] = True
+            file_obj = fsspec.open(
+                filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
+            ).open()
+
+        return IOArgs(
+            filepath_or_buffer=file_obj,
+            encoding=encoding,
+            compression=compression,
+            should_close=True,
+            mode=fsspec_mode,
+        )
+    elif storage_options:
+        raise ValueError(
+            "storage_options passed with file object or non-fsspec file path"
+        )
+
+    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
+        return IOArgs(
+            filepath_or_buffer=_expand_user(filepath_or_buffer),
+            encoding=encoding,
+            compression=compression,
+            should_close=False,
+            mode=mode,
+        )
+
+    # is_file_like requires (read | write) & __iter__ but __iter__ is only
+    # needed for read_csv(engine=python)
+    if not (
+        hasattr(filepath_or_buffer, "read") or hasattr(filepath_or_buffer, "write")
+    ):
+        msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}"
+        raise ValueError(msg)
+
+    return IOArgs(
+        filepath_or_buffer=filepath_or_buffer,
+        encoding=encoding,
+        compression=compression,
+        should_close=False,
+        mode=mode,
+    )
+
+
+def file_path_to_url(path: str) -> str:
+    """
+    converts an absolute native path to a FILE URL.
+
+    Parameters
+    ----------
+    path : a path in native format
+
+    Returns
+    -------
+    a valid FILE URL
+    """
+    # lazify expensive import (~30ms)
+    from urllib.request import pathname2url
+
+    return urljoin("file:", pathname2url(path))
+
+
+extension_to_compression = {
+    ".tar": "tar",
+    ".tar.gz": "tar",
+    ".tar.bz2": "tar",
+    ".tar.xz": "tar",
+    ".gz": "gzip",
+    ".bz2": "bz2",
+    ".zip": "zip",
+    ".xz": "xz",
+    ".zst": "zstd",
+}
+_supported_compressions = set(extension_to_compression.values())
+
+
+def get_compression_method(
+    compression: CompressionOptions,
+) -> tuple[str | None, CompressionDict]:
+    """
+    Simplifies a compression argument to a compression method string and
+    a mapping containing additional arguments.
+
+    Parameters
+    ----------
+    compression : str or mapping
+        If string, specifies the compression method. If mapping, value at key
+        'method' specifies compression method.
+
+    Returns
+    -------
+    tuple of ({compression method}, Optional[str]
+              {compression arguments}, Dict[str, Any])
+
+    Raises
+    ------
+    ValueError on mapping missing 'method' key
+    """
+    compression_method: str | None
+    if isinstance(compression, Mapping):
+        compression_args = dict(compression)
+        try:
+            compression_method = compression_args.pop("method")
+        except KeyError as err:
+            raise ValueError("If mapping, compression must have key 'method'") from err
+    else:
+        compression_args = {}
+        compression_method = compression
+    return compression_method, compression_args
+
+
+def infer_compression(
+    filepath_or_buffer: FilePath | BaseBuffer, compression: str | None
+) -> str | None:
+    """
+    Get the compression method for filepath_or_buffer. If compression='infer',
+    the inferred compression method is returned. Otherwise, the input
+    compression method is returned unchanged, unless it's invalid, in which
+    case an error is raised.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str or file handle
+        File path or object.
+
+    compression : str or dict, default 'infer'
+        For on-the-fly compression of the output data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        Set to ``None`` for no compression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for faster compression and to
+        create a reproducible gzip archive:
+        ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+    Returns
+    -------
+    string or None
+
+    Raises
+    ------
+    ValueError on invalid compression specified.
+    """
+    if compression is None:
+        return None
+
+    # Infer compression
+    if compression == "infer":
+        # Convert all path types (e.g. pathlib.Path) to strings
+        if isinstance(filepath_or_buffer, str) and "::" in filepath_or_buffer:
+            # chained URLs contain ::
+            filepath_or_buffer = filepath_or_buffer.split("::")[0]
+        filepath_or_buffer = stringify_path(filepath_or_buffer, convert_file_like=True)
+        if not isinstance(filepath_or_buffer, str):
+            # Cannot infer compression of a buffer, assume no compression
+            return None
+
+        # Infer compression from the filename/URL extension
+        for extension, compression in extension_to_compression.items():
+            if filepath_or_buffer.lower().endswith(extension):
+                return compression
+        return None
+
+    # Compression has been specified. Check that it's valid
+    if compression in _supported_compressions:
+        return compression
+
+    valid = ["infer", None, *sorted(_supported_compressions)]
+    msg = (
+        f"Unrecognized compression type: {compression}\n"
+        f"Valid compression types are {valid}"
+    )
+    raise ValueError(msg)
+
+
+def check_parent_directory(path: Path | str) -> None:
+    """
+    Check if parent directory of a file exists, raise OSError if it does not
+
+    Parameters
+    ----------
+    path: Path or str
+        Path to check parent directory of
+    """
+    parent = Path(path).parent
+    if not parent.is_dir():
+        raise OSError(rf"Cannot save file into a non-existent directory: '{parent}'")
+
+
+@overload
+def get_handle(
+    path_or_buf: FilePath | BaseBuffer,
+    mode: str,
+    *,
+    encoding: str | None = ...,
+    compression: CompressionOptions = ...,
+    memory_map: bool = ...,
+    is_text: Literal[False],
+    errors: str | None = ...,
+    storage_options: StorageOptions = ...,
+) -> IOHandles[bytes]: ...
+
+
+@overload
+def get_handle(
+    path_or_buf: FilePath | BaseBuffer,
+    mode: str,
+    *,
+    encoding: str | None = ...,
+    compression: CompressionOptions = ...,
+    memory_map: bool = ...,
+    is_text: Literal[True] = ...,
+    errors: str | None = ...,
+    storage_options: StorageOptions = ...,
+) -> IOHandles[str]: ...
+
+
+@overload
+def get_handle(
+    path_or_buf: FilePath | BaseBuffer,
+    mode: str,
+    *,
+    encoding: str | None = ...,
+    compression: CompressionOptions = ...,
+    memory_map: bool = ...,
+    is_text: bool = ...,
+    errors: str | None = ...,
+    storage_options: StorageOptions = ...,
+) -> IOHandles[str] | IOHandles[bytes]: ...
+
+
+def get_handle(
+    path_or_buf: FilePath | BaseBuffer,
+    mode: str,
+    *,
+    encoding: str | None = None,
+    compression: CompressionOptions | None = None,
+    memory_map: bool = False,
+    is_text: bool = True,
+    errors: str | None = None,
+    storage_options: StorageOptions | None = None,
+) -> IOHandles[str] | IOHandles[bytes]:
+    """
+    Get file handle for given path/buffer and mode.
+
+    Parameters
+    ----------
+    path_or_buf : str or file handle
+        File path or object.
+    mode : str
+        Mode to open path_or_buf with.
+    encoding : str or None
+        Encoding to use.
+    compression : str or dict, default 'infer'
+        For on-the-fly compression of the output data. If 'infer' and 'path_or_buf'
+        is path-like, then detect compression from the following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        Set to ``None`` for no compression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for faster compression and to
+        create a reproducible gzip archive:
+        ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+
+           May be a dict with key 'method' as compression mode
+           and other keys as compression options if compression
+           mode is 'zip'.
+
+           Passing compression options as keys in dict is
+           supported for compression modes 'gzip', 'bz2', 'zstd' and 'zip'.
+
+    memory_map : bool, default False
+        See parsers._parser_params for more information. Only used by read_csv.
+    is_text : bool, default True
+        Whether the type of the content passed to the file/buffer is string or
+        bytes. This is not the same as `"b" not in mode`. If a string content is
+        passed to a binary file/buffer, a wrapper is inserted.
+    errors : str, default 'strict'
+        Specifies how encoding and decoding errors are to be handled.
+        See the errors argument for :func:`open` for a full list
+        of options.
+    storage_options: StorageOptions = None
+        Passed to _get_filepath_or_buffer
+
+    Returns the dataclass IOHandles
+    """
+    # Windows does not default to utf-8. Set to utf-8 for a consistent behavior
+    encoding = encoding or "utf-8"
+
+    errors = errors or "strict"
+
+    # read_csv does not know whether the buffer is opened in binary/text mode
+    if _is_binary_mode(path_or_buf, mode) and "b" not in mode:
+        mode += "b"
+
+    # validate encoding and errors
+    codecs.lookup(encoding)
+    if isinstance(errors, str):
+        codecs.lookup_error(errors)
+
+    # open URLs
+    ioargs = _get_filepath_or_buffer(
+        path_or_buf,
+        encoding=encoding,
+        compression=compression,
+        mode=mode,
+        storage_options=storage_options,
+    )
+
+    handle = ioargs.filepath_or_buffer
+    handles: list[BaseBuffer]
+
+    # memory mapping needs to be the first step
+    # only used for read_csv
+    handle, memory_map, handles = _maybe_memory_map(handle, memory_map)
+
+    is_path = isinstance(handle, str)
+    compression_args = dict(ioargs.compression)
+    compression = compression_args.pop("method")
+
+    # Only for write methods
+    if "r" not in mode and is_path:
+        check_parent_directory(str(handle))
+
+    if compression:
+        if compression != "zstd":
+            # compression libraries do not like an explicit text-mode
+            ioargs.mode = ioargs.mode.replace("t", "")
+        elif compression == "zstd" and "b" not in ioargs.mode:
+            # python-zstandard defaults to text mode, but we always expect
+            # compression libraries to use binary mode.
+            ioargs.mode += "b"
+
+        # GZ Compression
+        if compression == "gzip":
+            if isinstance(handle, str):
+                # error: Incompatible types in assignment (expression has type
+                # "GzipFile", variable has type "Union[str, BaseBuffer]")
+                handle = gzip.GzipFile(  # type: ignore[assignment]
+                    filename=handle,
+                    mode=ioargs.mode,
+                    **compression_args,
+                )
+            else:
+                handle = gzip.GzipFile(
+                    # No overload variant of "GzipFile" matches argument types
+                    # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
+                    fileobj=handle,  # type: ignore[call-overload]
+                    mode=ioargs.mode,
+                    **compression_args,
+                )
+
+        # BZ Compression
+        elif compression == "bz2":
+            import bz2
+
+            # Overload of "BZ2File" to handle pickle protocol 5
+            # "Union[str, BaseBuffer]", "str", "Dict[str, Any]"
+            handle = bz2.BZ2File(  # type: ignore[call-overload]
+                handle,
+                mode=ioargs.mode,
+                **compression_args,
+            )
+
+        # ZIP Compression
+        elif compression == "zip":
+            # error: Argument 1 to "_BytesZipFile" has incompatible type
+            # "Union[str, BaseBuffer]"; expected "Union[Union[str, PathLike[str]],
+            # ReadBuffer[bytes], WriteBuffer[bytes]]"
+            handle = _BytesZipFile(
+                handle,  # type: ignore[arg-type]
+                ioargs.mode,
+                **compression_args,
+            )
+            if handle.buffer.mode == "r":
+                handles.append(handle)
+                zip_names = handle.buffer.namelist()
+                if len(zip_names) == 1:
+                    handle = handle.buffer.open(zip_names.pop())
+                elif not zip_names:
+                    raise ValueError(f"Zero files found in ZIP file {path_or_buf}")
+                else:
+                    raise ValueError(
+                        "Multiple files found in ZIP file. "
+                        f"Only one file per ZIP: {zip_names}"
+                    )
+
+        # TAR Encoding
+        elif compression == "tar":
+            compression_args.setdefault("mode", ioargs.mode)
+            if isinstance(handle, str):
+                handle = _BytesTarFile(name=handle, **compression_args)
+            else:
+                # error: Argument "fileobj" to "_BytesTarFile" has incompatible
+                # type "BaseBuffer"; expected "Union[ReadBuffer[bytes],
+                # WriteBuffer[bytes], None]"
+                handle = _BytesTarFile(
+                    fileobj=handle,  # type: ignore[arg-type]
+                    **compression_args,
+                )
+            assert isinstance(handle, _BytesTarFile)
+            if "r" in handle.buffer.mode:
+                handles.append(handle)
+                files = handle.buffer.getnames()
+                if len(files) == 1:
+                    file = handle.buffer.extractfile(files[0])
+                    assert file is not None
+                    handle = file
+                elif not files:
+                    raise ValueError(f"Zero files found in TAR archive {path_or_buf}")
+                else:
+                    raise ValueError(
+                        "Multiple files found in TAR archive. "
+                        f"Only one file per TAR archive: {files}"
+                    )
+
+        # XZ Compression
+        elif compression == "xz":
+            # error: Argument 1 to "LZMAFile" has incompatible type "Union[str,
+            # BaseBuffer]"; expected "Optional[Union[Union[str, bytes, PathLike[str],
+            # PathLike[bytes]], IO[bytes]], None]"
+            import lzma
+
+            handle = lzma.LZMAFile(
+                handle,  # type: ignore[arg-type]
+                ioargs.mode,
+                **compression_args,
+            )
+
+        # Zstd Compression
+        elif compression == "zstd":
+            zstd = import_optional_dependency("zstandard")
+            if "r" in ioargs.mode:
+                open_args = {"dctx": zstd.ZstdDecompressor(**compression_args)}
+            else:
+                open_args = {"cctx": zstd.ZstdCompressor(**compression_args)}
+            handle = zstd.open(
+                handle,
+                mode=ioargs.mode,
+                **open_args,
+            )
+
+        # Unrecognized Compression
+        else:
+            msg = f"Unrecognized compression type: {compression}"
+            raise ValueError(msg)
+
+        assert not isinstance(handle, str)
+        handles.append(handle)
+
+    elif isinstance(handle, str):
+        # Check whether the filename is to be opened in binary mode.
+        # Binary mode does not support 'encoding' and 'newline'.
+        if ioargs.encoding and "b" not in ioargs.mode:
+            # Encoding
+            handle = open(
+                handle,
+                ioargs.mode,
+                encoding=ioargs.encoding,
+                errors=errors,
+                newline="",
+            )
+        else:
+            # Binary mode
+            handle = open(handle, ioargs.mode)
+        handles.append(handle)
+
+    # Convert BytesIO or file objects passed with an encoding
+    is_wrapped = False
+    if not is_text and ioargs.mode == "rb" and isinstance(handle, TextIOBase):
+        # not added to handles as it does not open/buffer resources
+        handle = _BytesIOWrapper(
+            handle,
+            encoding=ioargs.encoding,
+        )
+    elif is_text and (
+        compression or memory_map or _is_binary_mode(handle, ioargs.mode)
+    ):
+        if (
+            not hasattr(handle, "readable")
+            or not hasattr(handle, "writable")
+            or not hasattr(handle, "seekable")
+        ):
+            handle = _IOWrapper(handle)
+        # error: Value of type variable "_BufferT_co" of "TextIOWrapper" cannot
+        # be "_IOWrapper | BaseBuffer" [type-var]
+        handle = TextIOWrapper(
+            handle,  # type: ignore[type-var]
+            encoding=ioargs.encoding,
+            errors=errors,
+            newline="",
+        )
+        handles.append(handle)
+        # only marked as wrapped when the caller provided a handle
+        is_wrapped = not (
+            isinstance(ioargs.filepath_or_buffer, str) or ioargs.should_close
+        )
+
+    if "r" in ioargs.mode and not hasattr(handle, "read"):
+        raise TypeError(
+            "Expected file path name or file-like object, "
+            f"got {type(ioargs.filepath_or_buffer)} type"
+        )
+
+    handles.reverse()  # close the most recently added buffer first
+    if ioargs.should_close:
+        assert not isinstance(ioargs.filepath_or_buffer, str)
+        handles.append(ioargs.filepath_or_buffer)
+
+    return IOHandles(
+        # error: Argument "handle" to "IOHandles" has incompatible type
+        # "Union[TextIOWrapper, GzipFile, BaseBuffer, typing.IO[bytes],
+        # typing.IO[Any]]"; expected "pandas._typing.IO[Any]"
+        handle=handle,  # type: ignore[arg-type]
+        # error: Argument "created_handles" to "IOHandles" has incompatible type
+        # "List[BaseBuffer]"; expected "List[Union[IO[bytes], IO[str]]]"
+        created_handles=handles,  # type: ignore[arg-type]
+        is_wrapped=is_wrapped,
+        compression=ioargs.compression,
+    )
+
+
+class _BufferedWriter(BytesIO, ABC):
+    """
+    Some objects do not support multiple .write() calls (TarFile and ZipFile).
+    This wrapper writes to the underlying buffer on close.
+    """
+
+    buffer = BytesIO()
+
+    @abstractmethod
+    def write_to_buffer(self) -> None: ...
+
+    def close(self) -> None:
+        if self.closed:
+            # already closed
+            return
+        if self.getbuffer().nbytes:
+            # write to buffer
+            self.seek(0)
+            with self.buffer:
+                self.write_to_buffer()
+        else:
+            self.buffer.close()
+        super().close()
+
+
+class _BytesTarFile(_BufferedWriter):
+    def __init__(
+        self,
+        name: str | None = None,
+        mode: Literal["r", "a", "w", "x"] = "r",
+        fileobj: ReadBuffer[bytes] | WriteBuffer[bytes] | None = None,
+        archive_name: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__()
+        self.archive_name = archive_name
+        self.name = name
+        #  error: No overload variant of "open" of "TarFile" matches argument
+        # types "str | None", "str", "ReadBuffer[bytes] | WriteBuffer[bytes] | None",
+        # "dict[str, Any]"
+        # error: Incompatible types in assignment (expression has type "TarFile",
+        #  base class "_BufferedWriter" defined the type as "BytesIO")
+        self.buffer: tarfile.TarFile = tarfile.TarFile.open(  # type: ignore[call-overload, assignment]
+            name=name,
+            mode=self.extend_mode(mode),
+            fileobj=fileobj,
+            **kwargs,
+        )
+
+    def extend_mode(self, mode: str) -> str:
+        mode = mode.replace("b", "")
+        if mode != "w":
+            return mode
+        if self.name is not None:
+            suffix = Path(self.name).suffix
+            if suffix in (".gz", ".xz", ".bz2"):
+                mode = f"{mode}:{suffix[1:]}"
+        return mode
+
+    def infer_filename(self) -> str | None:
+        """
+        If an explicit archive_name is not given, we still want the file inside the zip
+        file not to be named something.tar, because that causes confusion (GH39465).
+        """
+        if self.name is None:
+            return None
+
+        filename = Path(self.name)
+        if filename.suffix == ".tar":
+            return filename.with_suffix("").name
+        elif filename.suffix in (".tar.gz", ".tar.bz2", ".tar.xz"):
+            return filename.with_suffix("").with_suffix("").name
+        return filename.name
+
+    def write_to_buffer(self) -> None:
+        # TarFile needs a non-empty string
+        archive_name = self.archive_name or self.infer_filename() or "tar"
+        tarinfo = tarfile.TarInfo(name=archive_name)
+        tarinfo.size = len(self.getvalue())
+        self.buffer.addfile(tarinfo, self)
+
+
+class _BytesZipFile(_BufferedWriter):
+    def __init__(
+        self,
+        file: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
+        mode: str,
+        archive_name: str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__()
+        mode = mode.replace("b", "")
+        self.archive_name = archive_name
+
+        kwargs.setdefault("compression", zipfile.ZIP_DEFLATED)
+        # error: No overload variant of "ZipFile" matches argument types
+        # "str | PathLike[str] | ReadBuffer[bytes] | WriteBuffer[bytes]",
+        # "str", "dict[str, Any]"
+        # error: Incompatible types in assignment (expression has type "ZipFile",
+        # base class "_BufferedWriter" defined the type as "BytesIO")
+        self.buffer: zipfile.ZipFile = zipfile.ZipFile(  # type: ignore[call-overload, assignment]
+            file, mode, **kwargs
+        )
+
+    def infer_filename(self) -> str | None:
+        """
+        If an explicit archive_name is not given, we still want the file inside the zip
+        file not to be named something.zip, because that causes confusion (GH39465).
+        """
+        if isinstance(self.buffer.filename, (os.PathLike, str)):
+            filename = Path(self.buffer.filename)
+            if filename.suffix == ".zip":
+                return filename.with_suffix("").name
+            return filename.name
+        return None
+
+    def write_to_buffer(self) -> None:
+        # ZipFile needs a non-empty string
+        archive_name = self.archive_name or self.infer_filename() or "zip"
+        self.buffer.writestr(archive_name, self.getvalue())
+
+
+class _IOWrapper:
+    # TextIOWrapper is overly strict: it request that the buffer has seekable, readable,
+    # and writable. If we have a read-only buffer, we shouldn't need writable and vice
+    # versa. Some buffers, are seek/read/writ-able but they do not have the "-able"
+    # methods, e.g., tempfile.SpooledTemporaryFile.
+    # If a buffer does not have the above "-able" methods, we simple assume they are
+    # seek/read/writ-able.
+    def __init__(self, buffer: BaseBuffer) -> None:
+        self.buffer = buffer
+
+    def __getattr__(self, name: str) -> Any:
+        return getattr(self.buffer, name)
+
+    def readable(self) -> bool:
+        if hasattr(self.buffer, "readable"):
+            return self.buffer.readable()
+        return True
+
+    def seekable(self) -> bool:
+        if hasattr(self.buffer, "seekable"):
+            return self.buffer.seekable()
+        return True
+
+    def writable(self) -> bool:
+        if hasattr(self.buffer, "writable"):
+            return self.buffer.writable()
+        return True
+
+
+class _BytesIOWrapper:
+    # Wrapper that wraps a StringIO buffer and reads bytes from it
+    # Created for compat with pyarrow read_csv
+    def __init__(self, buffer: StringIO | TextIOBase, encoding: str = "utf-8") -> None:
+        self.buffer = buffer
+        self.encoding = encoding
+        # Because a character can be represented by more than 1 byte,
+        # it is possible that reading will produce more bytes than n
+        # We store the extra bytes in this overflow variable, and append the
+        # overflow to the front of the bytestring the next time reading is performed
+        self.overflow = b""
+
+    def __getattr__(self, attr: str) -> Any:
+        return getattr(self.buffer, attr)
+
+    def read(self, n: int | None = -1) -> bytes:
+        assert self.buffer is not None
+        bytestring = self.buffer.read(n).encode(self.encoding)
+        # When n=-1/n greater than remaining bytes: Read entire file/rest of file
+        combined_bytestring = self.overflow + bytestring
+        if n is None or n < 0 or n >= len(combined_bytestring):
+            self.overflow = b""
+            return combined_bytestring
+        else:
+            to_return = combined_bytestring[:n]
+            self.overflow = combined_bytestring[n:]
+            return to_return
+
+
+def _maybe_memory_map(
+    handle: str | BaseBuffer, memory_map: bool
+) -> tuple[str | BaseBuffer, bool, list[BaseBuffer]]:
+    """Try to memory map file/buffer."""
+    handles: list[BaseBuffer] = []
+    memory_map &= hasattr(handle, "fileno") or isinstance(handle, str)
+    if not memory_map:
+        return handle, memory_map, handles
+
+    # mmap used by only read_csv
+    handle = cast(ReadCsvBuffer, handle)
+
+    # need to open the file first
+    if isinstance(handle, str):
+        handle = open(handle, "rb")
+        handles.append(handle)
+
+    try:
+        # open mmap and adds *-able
+        # error: Argument 1 to "_IOWrapper" has incompatible type "mmap";
+        # expected "BaseBuffer"
+        wrapped = _IOWrapper(
+            mmap.mmap(
+                handle.fileno(),
+                0,
+                access=mmap.ACCESS_READ,  # type: ignore[arg-type]
+            )
+        )
+    finally:
+        for handle in reversed(handles):
+            # error: "BaseBuffer" has no attribute "close"
+            handle.close()  # type: ignore[attr-defined]
+
+    return wrapped, memory_map, [wrapped]
+
+
+def file_exists(filepath_or_buffer: FilePath | BaseBuffer) -> bool:
+    """Test whether file exists."""
+    exists = False
+    filepath_or_buffer = stringify_path(filepath_or_buffer)
+    if not isinstance(filepath_or_buffer, str):
+        return exists
+    try:
+        exists = os.path.exists(filepath_or_buffer)
+        # gh-5874: if the filepath is too long will raise here
+    except (TypeError, ValueError):
+        pass
+    return exists
+
+
+def _is_binary_mode(handle: FilePath | BaseBuffer, mode: str) -> bool:
+    """Whether the handle is opened in binary mode"""
+    # specified by user
+    if "t" in mode or "b" in mode:
+        return "b" in mode
+
+    # exceptions
+    text_classes = (
+        # classes that expect string but have 'b' in mode
+        codecs.StreamWriter,
+        codecs.StreamReader,
+        codecs.StreamReaderWriter,
+    )
+    if issubclass(type(handle), text_classes):
+        return False
+
+    return isinstance(handle, _get_binary_io_classes()) or "b" in getattr(
+        handle, "mode", mode
+    )
+
+
+@functools.lru_cache
+def _get_binary_io_classes() -> tuple[type, ...]:
+    """IO classes that that expect bytes"""
+    binary_classes: tuple[type, ...] = (BufferedIOBase, RawIOBase)
+
+    # python-zstandard doesn't use any of the builtin base classes; instead we
+    # have to use the `zstd.ZstdDecompressionReader` class for isinstance checks.
+    # Unfortunately `zstd.ZstdDecompressionReader` isn't exposed by python-zstandard
+    # so we have to get it from a `zstd.ZstdDecompressor` instance.
+    # See also https://github.com/indygreg/python-zstandard/pull/165.
+    zstd = import_optional_dependency("zstandard", errors="ignore")
+    if zstd is not None:
+        with zstd.ZstdDecompressor().stream_reader(b"") as reader:
+            binary_classes += (type(reader),)
+
+    return binary_classes
+
+
+def is_potential_multi_index(
+    columns: Sequence[Hashable] | MultiIndex,
+    index_col: bool | Sequence[int] | None = None,
+) -> bool:
+    """
+    Check whether or not the `columns` parameter
+    could be converted into a MultiIndex.
+
+    Parameters
+    ----------
+    columns : array-like
+        Object which may or may not be convertible into a MultiIndex
+    index_col : None, bool or list, optional
+        Column or columns to use as the (possibly hierarchical) index
+
+    Returns
+    -------
+    bool : Whether or not columns could become a MultiIndex
+    """
+    if index_col is None or isinstance(index_col, bool):
+        index_columns = set()
+    else:
+        index_columns = set(index_col)
+
+    return bool(
+        len(columns)
+        and not isinstance(columns, ABCMultiIndex)
+        and all(isinstance(c, tuple) for c in columns if c not in index_columns)
+    )
+
+
+def dedup_names(
+    names: Sequence[Hashable], is_potential_multiindex: bool
+) -> Sequence[Hashable]:
+    """
+    Rename column names if duplicates exist.
+
+    Currently the renaming is done by appending a period and an autonumeric,
+    but a custom pattern may be supported in the future.
+
+    Examples
+    --------
+    >>> dedup_names(["x", "y", "x", "x"], is_potential_multiindex=False)
+    ['x', 'y', 'x.1', 'x.2']
+    """
+    names = list(names)  # so we can index
+    counts: DefaultDict[Hashable, int] = defaultdict(int)
+
+    for i, col in enumerate(names):
+        cur_count = counts[col]
+
+        while cur_count > 0:
+            counts[col] = cur_count + 1
+
+            if is_potential_multiindex:
+                # for mypy
+                assert isinstance(col, tuple)
+                col = (*col[:-1], f"{col[-1]}.{cur_count}")
+            else:
+                col = f"{col}.{cur_count}"
+            cur_count = counts[col]
+
+        names[i] = col
+        counts[col] = cur_count + 1
+
+    return names
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..750df6143aa56fe429bf4240bc813b5f4df4b5ac
--- /dev/null
+++ b/pandas/io/feather_format.py
@@ -0,0 +1,181 @@
+"""feather-format compat"""
+
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+import warnings
+
+import numpy as np
+
+from pandas._config import using_string_dtype
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import Pandas4Warning
+from pandas.util._decorators import set_module
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.api import DataFrame
+from pandas.core.arrays.string_ import StringDtype
+
+from pandas.io._util import arrow_table_to_pandas
+from pandas.io.common import get_handle
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Hashable,
+        Sequence,
+    )
+
+    from pandas._typing import (
+        DtypeBackend,
+        FilePath,
+        ReadBuffer,
+        StorageOptions,
+        WriteBuffer,
+    )
+
+
+def to_feather(
+    df: DataFrame,
+    path: FilePath | WriteBuffer[bytes],
+    storage_options: StorageOptions | None = None,
+    **kwargs: Any,
+) -> None:
+    """
+    Write a DataFrame to the binary Feather format.
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str, path object, or file-like object
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+    **kwargs :
+        Additional keywords passed to `pyarrow.feather.write_feather`.
+
+    """
+    import_optional_dependency("pyarrow")
+    from pyarrow import feather
+
+    if not isinstance(df, DataFrame):
+        raise ValueError("feather only support IO with DataFrames")
+
+    with get_handle(
+        path, "wb", storage_options=storage_options, is_text=False
+    ) as handles:
+        feather.write_feather(df, handles.handle, **kwargs)
+
+
+@set_module("pandas")
+def read_feather(
+    path: FilePath | ReadBuffer[bytes],
+    columns: Sequence[Hashable] | None = None,
+    use_threads: bool = True,
+    storage_options: StorageOptions | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+) -> DataFrame:
+    """
+    Load a feather-format object from the file path.
+
+    Feather is particularly useful for scenarios that require efficient
+    serialization and deserialization of tabular data. It supports
+    schema preservation, making it a reliable choice for use cases
+    such as sharing data between Python and R, or persisting intermediate
+    results during data processing pipelines. This method provides additional
+    flexibility with options for selective column reading, thread parallelism,
+    and choosing the backend for data types.
+
+    Parameters
+    ----------
+    path : str, path object, or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``read()`` function. The string could be a URL.
+        Valid URL schemes include http, ftp, s3, gs and file. For file URLs, a host is
+        expected. A local file could be: ``file://localhost/path/to/table.feather``.
+    columns : sequence, default None
+        If not provided, all columns are read.
+    use_threads : bool, default True
+        Whether to parallelize reading using multiple threads.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`.
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    type of object stored in file
+        DataFrame object stored in the file.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (csv) file into a pandas DataFrame.
+    read_excel : Read an Excel file into a pandas DataFrame.
+    read_spss : Read an SPSS file into a pandas DataFrame.
+    read_orc : Load an ORC object into a pandas DataFrame.
+    read_sas : Read SAS file into a pandas DataFrame.
+
+    Examples
+    --------
+    >>> df = pd.read_feather("path/to/file.feather")  # doctest: +SKIP
+    """
+    import_optional_dependency("pyarrow")
+    from pyarrow import feather
+
+    # import utils to register the pyarrow extension types
+    import pandas.core.arrays.arrow.extension_types  # pyright: ignore[reportUnusedImport] # noqa: F401
+
+    check_dtype_backend(dtype_backend)
+
+    with get_handle(
+        path, "rb", storage_options=storage_options, is_text=False
+    ) as handles:
+        if dtype_backend is lib.no_default and not using_string_dtype():
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    "make_block is deprecated",
+                    Pandas4Warning,
+                )
+
+                df = feather.read_feather(
+                    handles.handle, columns=columns, use_threads=bool(use_threads)
+                )
+                # Convert any StringDtype columns to object dtype (pyarrow always
+                # uses string dtype even when the infer_string option is False)
+                for col, dtype in zip(df.columns, df.dtypes, strict=True):
+                    if isinstance(dtype, StringDtype) and dtype.na_value is np.nan:
+                        df[col] = df[col].astype("object")
+                return df
+
+        pa_table = feather.read_table(
+            handles.handle, columns=columns, use_threads=bool(use_threads)
+        )
+        return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
diff --git a/pandas/io/html.py b/pandas/io/html.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ceba63dea7690573ae66575748074ff126a12af
--- /dev/null
+++ b/pandas/io/html.py
@@ -0,0 +1,1245 @@
+"""
+:mod:`pandas.io.html` is a module containing functionality for dealing with
+HTML IO.
+
+"""
+
+from __future__ import annotations
+
+from collections import abc
+import errno
+import numbers
+import os
+import re
+from re import Pattern
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    cast,
+)
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import (
+    AbstractMethodError,
+    EmptyDataError,
+)
+from pandas.util._decorators import set_module
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas import isna
+from pandas.core.indexes.base import Index
+from pandas.core.indexes.multi import MultiIndex
+from pandas.core.series import Series
+
+from pandas.io.common import (
+    get_handle,
+    is_url,
+    stringify_path,
+    validate_header_arg,
+)
+from pandas.io.formats.printing import pprint_thing
+from pandas.io.parsers import TextParser
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Iterable,
+        Sequence,
+    )
+
+    from pandas._typing import (
+        BaseBuffer,
+        DtypeBackend,
+        FilePath,
+        HTMLFlavors,
+        ReadBuffer,
+        StorageOptions,
+    )
+
+    from pandas import DataFrame
+
+#############
+# READ HTML #
+#############
+_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
+
+
+def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
+    """
+    Replace extra whitespace inside of a string with a single space.
+
+    Parameters
+    ----------
+    s : str or unicode
+        The string from which to remove extra whitespace.
+    regex : re.Pattern
+        The regular expression to use to remove extra whitespace.
+
+    Returns
+    -------
+    subd : str or unicode
+        `s` with all extra whitespace replaced with a single space.
+    """
+    return regex.sub(" ", s.strip())
+
+
+def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
+    """
+    Get an iterator given an integer, slice or container.
+
+    Parameters
+    ----------
+    skiprows : int, slice, container
+        The iterator to use to skip rows; can also be a slice.
+
+    Raises
+    ------
+    TypeError
+        * If `skiprows` is not a slice, integer, or Container
+
+    Returns
+    -------
+    it : iterable
+        A proper iterator to use to skip rows of a DataFrame.
+    """
+    if isinstance(skiprows, slice):
+        start, step = skiprows.start or 0, skiprows.step or 1
+        return list(range(start, skiprows.stop, step))
+    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
+        return cast("int | Sequence[int]", skiprows)
+    elif skiprows is None:
+        return 0
+    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
+
+
+def _read(
+    obj: FilePath | BaseBuffer,
+    encoding: str | None,
+    storage_options: StorageOptions | None,
+) -> str | bytes:
+    """
+    Try to read from a url, file or string.
+
+    Parameters
+    ----------
+    obj : str, unicode, path object, or file-like object
+
+    Returns
+    -------
+    raw_text : str
+    """
+    try:
+        with get_handle(
+            obj, "r", encoding=encoding, storage_options=storage_options
+        ) as handles:
+            return handles.handle.read()
+    except OSError as err:
+        if not is_url(obj):
+            raise FileNotFoundError(
+                f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}"
+            ) from err
+        raise
+
+
+class _HtmlFrameParser:
+    """
+    Base class for parsers that parse HTML into DataFrames.
+
+    Parameters
+    ----------
+    io : str or file-like
+        This can be either a string path, a valid URL using the HTTP,
+        FTP, or FILE protocols or a file-like object.
+
+    match : str or regex
+        The text to match in the document.
+
+    attrs : dict
+        List of HTML <table> element attributes to match.
+
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+    Attributes
+    ----------
+    io : str or file-like
+        raw HTML, URL, or file-like object
+
+    match : regex
+        The text to match in the raw HTML
+
+    attrs : dict-like
+        A dictionary of valid table attributes to use to search for table
+        elements.
+
+    encoding : str
+        Encoding to be used by parser
+
+    displayed_only : bool
+        Whether or not items with "display:none" should be ignored
+
+    extract_links : {None, "all", "header", "body", "footer"}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following methods:
+        * :func:`_build_doc`
+        * :func:`_attr_getter`
+        * :func:`_href_getter`
+        * :func:`_text_getter`
+        * :func:`_parse_td`
+        * :func:`_parse_thead_tr`
+        * :func:`_parse_tbody_tr`
+        * :func:`_parse_tfoot_tr`
+        * :func:`_parse_tables`
+        * :func:`_equals_tag`
+    See each method's respective documentation for details on their
+    functionality.
+    """
+
+    def __init__(
+        self,
+        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
+        match: str | Pattern,
+        attrs: dict[str, str] | None,
+        encoding: str,
+        displayed_only: bool,
+        extract_links: Literal["header", "footer", "body", "all"] | None,
+        storage_options: StorageOptions = None,
+    ) -> None:
+        self.io = io
+        self.match = match
+        self.attrs = attrs
+        self.encoding = encoding
+        self.displayed_only = displayed_only
+        self.extract_links = extract_links
+        self.storage_options = storage_options
+
+    def parse_tables(self):
+        """
+        Parse and return all tables from the DOM.
+
+        Returns
+        -------
+        list of parsed (header, body, footer) tuples from tables.
+        """
+        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
+        return (self._parse_thead_tbody_tfoot(table) for table in tables)
+
+    def _attr_getter(self, obj, attr):
+        """
+        Return the attribute value of an individual DOM node.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        attr : str or unicode
+            The attribute, such as "colspan"
+
+        Returns
+        -------
+        str or unicode
+            The attribute value.
+        """
+        # Both lxml and BeautifulSoup have the same implementation:
+        return obj.get(attr)
+
+    def _href_getter(self, obj) -> str | None:
+        """
+        Return an href if the DOM node contains a child <a> or None.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        href : str or unicode
+            The href from the <a> child of the DOM node.
+        """
+        raise AbstractMethodError(self)
+
+    def _text_getter(self, obj):
+        """
+        Return the text of an individual DOM node.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        Returns
+        -------
+        text : str or unicode
+            The text from an individual DOM node.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_td(self, obj):
+        """
+        Return the td elements from a row element.
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM <tr> node.
+
+        Returns
+        -------
+        list of node-like
+            These are the elements of each row, i.e., the columns.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_thead_tr(self, table):
+        """
+        Return the list of thead row elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : a table element that contains zero or more thead elements.
+
+        Returns
+        -------
+        list of node-like
+            These are the <tr> row elements of a table.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_tbody_tr(self, table):
+        """
+        Return the list of tbody row elements from the parsed table element.
+
+        HTML5 table bodies consist of either 0 or more <tbody> elements (which
+        only contain <tr> elements) or 0 or more <tr> elements. This method
+        checks for both structures.
+
+        Parameters
+        ----------
+        table : a table element that contains row elements.
+
+        Returns
+        -------
+        list of node-like
+            These are the <tr> row elements of a table.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_tfoot_tr(self, table):
+        """
+        Return the list of tfoot row elements from the parsed table element.
+
+        Parameters
+        ----------
+        table : a table element that contains row elements.
+
+        Returns
+        -------
+        list of node-like
+            These are the <tr> row elements of a table.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_tables(self, document, match, attrs):
+        """
+        Return all tables from the parsed DOM.
+
+        Parameters
+        ----------
+        document : the DOM from which to parse the table element.
+
+        match : str or regular expression
+            The text to search for in the DOM tree.
+
+        attrs : dict
+            A dictionary of table attributes that can be used to disambiguate
+            multiple tables on a page.
+
+        Raises
+        ------
+        ValueError : `match` does not match any text in the document.
+
+        Returns
+        -------
+        list of node-like
+            HTML <table> elements to be parsed into raw data.
+        """
+        raise AbstractMethodError(self)
+
+    def _equals_tag(self, obj, tag) -> bool:
+        """
+        Return whether an individual DOM node matches a tag
+
+        Parameters
+        ----------
+        obj : node-like
+            A DOM node.
+
+        tag : str
+            Tag name to be checked for equality.
+
+        Returns
+        -------
+        boolean
+            Whether `obj`'s tag name is `tag`
+        """
+        raise AbstractMethodError(self)
+
+    def _build_doc(self):
+        """
+        Return a tree-like object that can be used to iterate over the DOM.
+
+        Returns
+        -------
+        node-like
+            The DOM from which to parse the table element.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_thead_tbody_tfoot(self, table_html):
+        """
+        Given a table, return parsed header, body, and foot.
+
+        Parameters
+        ----------
+        table_html : node-like
+
+        Returns
+        -------
+        tuple of (header, body, footer), each a list of list-of-text rows.
+
+        Notes
+        -----
+        Header and body are lists-of-lists. Top level list is a list of
+        rows. Each row is a list of str text.
+
+        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
+               header, body, and footer, otherwise:
+               - Put all rows into body
+               - Move rows from top of body to header only if
+                 all elements inside row are <th>
+               - Move rows from bottom of body to footer only if
+                 all elements inside row are <th>
+        """
+        header_rows = self._parse_thead_tr(table_html)
+        body_rows = self._parse_tbody_tr(table_html)
+        footer_rows = self._parse_tfoot_tr(table_html)
+
+        def row_is_all_th(row):
+            return all(self._equals_tag(t, "th") for t in self._parse_td(row))
+
+        if not header_rows:
+            # The table has no <thead>. Move the top all-<th> rows from
+            # body_rows to header_rows. (This is a common case because many
+            # tables in the wild have no <thead> or <tfoot>
+            while body_rows and row_is_all_th(body_rows[0]):
+                header_rows.append(body_rows.pop(0))
+
+        header, rem = self._expand_colspan_rowspan(header_rows, section="header")
+        body, rem = self._expand_colspan_rowspan(
+            body_rows,
+            section="body",
+            remainder=rem,
+            overflow=len(footer_rows) > 0,
+        )
+        footer, _ = self._expand_colspan_rowspan(
+            footer_rows, section="footer", remainder=rem, overflow=False
+        )
+
+        return header, body, footer
+
+    def _expand_colspan_rowspan(
+        self,
+        rows,
+        section: Literal["header", "footer", "body"],
+        remainder: list[tuple[int, str | tuple, int]] | None = None,
+        overflow: bool = True,
+    ) -> tuple[list[list], list[tuple[int, str | tuple, int]]]:
+        """
+        Given a list of <tr>s, return a list of text rows.
+
+        Parameters
+        ----------
+        rows : list of node-like
+            List of <tr>s
+        section : the section that the rows belong to (header, body or footer).
+        remainder: list[tuple[int, str | tuple, int]] | None
+            Any remainder from the expansion of previous section
+        overflow: bool
+            If true, return any partial rows as 'remainder'. If not, use up any
+            partial rows. True by default.
+
+        Returns
+        -------
+        list of list
+            Each returned row is a list of str text, or tuple (text, link)
+            if extract_links is not None.
+        remainder
+            Remaining partial rows if any. If overflow is False, an empty list
+            is returned.
+
+        Notes
+        -----
+        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
+        to subsequent cells.
+        """
+        all_texts = []  # list of rows, each a list of str
+        text: str | tuple
+        remainder = remainder if remainder is not None else []
+
+        for tr in rows:
+            texts = []  # the output for this row
+            next_remainder = []
+
+            index = 0
+            tds = self._parse_td(tr)
+            for td in tds:
+                # Append texts from previous rows with rowspan>1 that come
+                # before this <td>
+                while remainder and remainder[0][0] <= index:
+                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+                    index += 1
+
+                # Append the text from this <td>, colspan times
+                text = _remove_whitespace(self._text_getter(td))
+                if self.extract_links in ("all", section):
+                    href = self._href_getter(td)
+                    text = (text, href)
+                rowspan = int(self._attr_getter(td, "rowspan") or 1)
+                colspan = int(self._attr_getter(td, "colspan") or 1)
+
+                for _ in range(colspan):
+                    texts.append(text)
+                    if rowspan > 1:
+                        next_remainder.append((index, text, rowspan - 1))
+                    index += 1
+
+            # Append texts from previous rows at the final position
+            for prev_i, prev_text, prev_rowspan in remainder:
+                texts.append(prev_text)
+                if prev_rowspan > 1:
+                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+
+            all_texts.append(texts)
+            remainder = next_remainder
+
+        if not overflow:
+            # Append rows that only appear because the previous row had non-1
+            # rowspan
+            while remainder:
+                next_remainder = []
+                texts = []
+                for prev_i, prev_text, prev_rowspan in remainder:
+                    texts.append(prev_text)
+                    if prev_rowspan > 1:
+                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
+                all_texts.append(texts)
+                remainder = next_remainder
+
+        return all_texts, remainder
+
+    def _handle_hidden_tables(self, tbl_list, attr_name: str):
+        """
+        Return list of tables, potentially removing hidden elements
+
+        Parameters
+        ----------
+        tbl_list : list of node-like
+            Type of list elements will vary depending upon parser used
+        attr_name : str
+            Name of the accessor for retrieving HTML attributes
+
+        Returns
+        -------
+        list of node-like
+            Return type matches `tbl_list`
+        """
+        if not self.displayed_only:
+            return tbl_list
+
+        return [
+            x
+            for x in tbl_list
+            if "display:none"
+            not in getattr(x, attr_name).get("style", "").replace(" ", "")
+        ]
+
+
+class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
+    """
+    HTML to DataFrame parser that uses BeautifulSoup under the hood.
+
+    See Also
+    --------
+    pandas.io.html._HtmlFrameParser
+    pandas.io.html._LxmlFrameParser
+
+    Notes
+    -----
+    Documentation strings for this class are in the base class
+    :class:`pandas.io.html._HtmlFrameParser`.
+    """
+
+    def _parse_tables(self, document, match, attrs):
+        element_name = "table"
+        tables = document.find_all(element_name, attrs=attrs)
+        if not tables:
+            raise ValueError("No tables found")
+
+        result = []
+        unique_tables = set()
+        tables = self._handle_hidden_tables(tables, "attrs")
+
+        for table in tables:
+            if self.displayed_only:
+                for elem in table.find_all("style"):
+                    elem.decompose()
+
+                for elem in table.find_all(style=re.compile(r"display:\s*none")):
+                    elem.decompose()
+
+            if table not in unique_tables and table.find(string=match) is not None:
+                result.append(table)
+            unique_tables.add(table)
+        if not result:
+            raise ValueError(f"No tables found matching pattern {match.pattern!r}")
+        return result
+
+    def _href_getter(self, obj) -> str | None:
+        a = obj.find("a", href=True)
+        return None if not a else a["href"]
+
+    def _text_getter(self, obj):
+        return obj.text
+
+    def _equals_tag(self, obj, tag) -> bool:
+        return obj.name == tag
+
+    def _parse_td(self, row):
+        return row.find_all(("td", "th"), recursive=False)
+
+    def _parse_thead_tr(self, table):
+        return table.select("thead tr")
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.select("tbody tr")
+        from_root = table.find_all("tr", recursive=False)
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.select("tfoot tr")
+
+    def _setup_build_doc(self):
+        raw_text = _read(self.io, self.encoding, self.storage_options)
+        if not raw_text:
+            raise ValueError(f"No text parsed from document: {self.io}")
+        return raw_text
+
+    def _build_doc(self):
+        from bs4 import BeautifulSoup
+
+        bdoc = self._setup_build_doc()
+        if isinstance(bdoc, bytes) and self.encoding is not None:
+            udoc = bdoc.decode(self.encoding)
+            from_encoding = None
+        else:
+            udoc = bdoc
+            from_encoding = self.encoding
+
+        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)
+
+        for br in soup.find_all("br"):
+            br.replace_with("\n" + br.text)
+
+        return soup
+
+
+def _build_xpath_expr(attrs) -> str:
+    """
+    Build an xpath expression to simulate bs4's ability to pass in kwargs to
+    search for attributes when using the lxml parser.
+
+    Parameters
+    ----------
+    attrs : dict
+        A dict of HTML attributes. These are NOT checked for validity.
+
+    Returns
+    -------
+    expr : unicode
+        An XPath expression that checks for the given HTML attributes.
+    """
+    # give class attribute as class_ because class is a python keyword
+    if "class_" in attrs:
+        attrs["class"] = attrs.pop("class_")
+
+    s = " and ".join([f"@{k}={v!r}" for k, v in attrs.items()])
+    return f"[{s}]"
+
+
+_re_namespace = {"re": "http://exslt.org/regular-expressions"}
+
+
+class _LxmlFrameParser(_HtmlFrameParser):
+    """
+    HTML to DataFrame parser that uses lxml under the hood.
+
+    Warning
+    -------
+    This parser can only handle HTTP, FTP, and FILE urls.
+
+    See Also
+    --------
+    _HtmlFrameParser
+    _BeautifulSoupLxmlFrameParser
+
+    Notes
+    -----
+    Documentation strings for this class are in the base class
+    :class:`_HtmlFrameParser`.
+    """
+
+    def _href_getter(self, obj) -> str | None:
+        href = obj.xpath(".//a/@href")
+        return None if not href else href[0]
+
+    def _text_getter(self, obj):
+        return obj.text_content()
+
+    def _parse_td(self, row):
+        # Look for direct children only: the "row" element here may be a
+        # <thead> or <tfoot> (see _parse_thead_tr).
+        return row.xpath("./td|./th")
+
+    def _parse_tables(self, document, match, kwargs):
+        pattern = match.pattern
+
+        # 1. check all descendants for the given pattern and only search tables
+        # GH 49929
+        xpath_expr = f"//table[.//text()[re:test(., {pattern!r})]]"
+
+        # if any table attributes were given build an xpath expression to
+        # search for them
+        if kwargs:
+            xpath_expr += _build_xpath_expr(kwargs)
+
+        tables = document.xpath(xpath_expr, namespaces=_re_namespace)
+
+        tables = self._handle_hidden_tables(tables, "attrib")
+        if self.displayed_only:
+            for table in tables:
+                # lxml utilizes XPATH 1.0 which does not have regex
+                # support. As a result, we find all elements with a style
+                # attribute and iterate them to check for display:none
+                for elem in table.xpath(".//style"):
+                    elem.drop_tree()
+                for elem in table.xpath(".//*[@style]"):
+                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
+                        elem.drop_tree()
+        if not tables:
+            raise ValueError(f"No tables found matching regex {pattern!r}")
+        return tables
+
+    def _equals_tag(self, obj, tag) -> bool:
+        return obj.tag == tag
+
+    def _build_doc(self):
+        """
+        Raises
+        ------
+        ValueError
+            * If a URL that lxml cannot parse is passed.
+
+        Exception
+            * Any other ``Exception`` thrown. For example, trying to parse a
+              URL that is syntactically correct on a machine with no internet
+              connection will fail.
+
+        See Also
+        --------
+        pandas.io.html._HtmlFrameParser._build_doc
+        """
+        from lxml.etree import XMLSyntaxError
+        from lxml.html import (
+            HTMLParser,
+            parse,
+        )
+
+        parser = HTMLParser(recover=True, encoding=self.encoding)
+
+        if is_url(self.io):
+            with get_handle(self.io, "r", storage_options=self.storage_options) as f:
+                r = parse(f.handle, parser=parser)
+        else:
+            # try to parse the input in the simplest way
+            try:
+                r = parse(self.io, parser=parser)
+            except OSError as err:
+                raise FileNotFoundError(
+                    f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}"
+                ) from err
+        try:
+            r = r.getroot()
+        except AttributeError:
+            pass
+        else:
+            if not hasattr(r, "text_content"):
+                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
+
+        for br in r.xpath("*//br"):
+            br.tail = "\n" + (br.tail or "")
+
+        return r
+
+    def _parse_thead_tr(self, table):
+        rows = []
+
+        for thead in table.xpath(".//thead"):
+            rows.extend(thead.xpath("./tr"))
+
+            # HACK: lxml does not clean up the clearly-erroneous
+            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
+            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
+            # children as though it's a <tr>.
+            #
+            # Better solution would be to use html5lib.
+            elements_at_root = thead.xpath("./td|./th")
+            if elements_at_root:
+                rows.append(thead)
+
+        return rows
+
+    def _parse_tbody_tr(self, table):
+        from_tbody = table.xpath(".//tbody//tr")
+        from_root = table.xpath("./tr")
+        # HTML spec: at most one of these lists has content
+        return from_tbody + from_root
+
+    def _parse_tfoot_tr(self, table):
+        return table.xpath(".//tfoot//tr")
+
+
+def _expand_elements(body) -> None:
+    data = [len(elem) for elem in body]
+    lens = Series(data)
+    lens_max = lens.max()
+    not_max = lens[lens != lens_max]
+
+    empty = [""]
+    for ind, length in not_max.items():
+        body[ind] += empty * (lens_max - length)
+
+
+def _data_to_frame(**kwargs):
+    head, body, foot = kwargs.pop("data")
+    header = kwargs.pop("header")
+    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
+    if head:
+        body = head + body
+
+        # Infer header when there is a <thead> or top <th>-only rows
+        if header is None:
+            if len(head) == 1:
+                header = 0
+            else:
+                # ignore all-empty-text rows
+                header = [i for i, row in enumerate(head) if any(text for text in row)]
+
+    if foot:
+        body += foot
+
+    # fill out elements of body that are "ragged"
+    _expand_elements(body)
+    with TextParser(body, header=header, **kwargs) as tp:
+        return tp.read()
+
+
+_valid_parsers = {
+    "lxml": _LxmlFrameParser,
+    None: _LxmlFrameParser,
+    "html5lib": _BeautifulSoupHtml5LibFrameParser,
+    "bs4": _BeautifulSoupHtml5LibFrameParser,
+}
+
+
+def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]:
+    """
+    Choose the parser based on the input flavor.
+
+    Parameters
+    ----------
+    flavor : {{"lxml", "html5lib", "bs4"}} or None
+        The type of parser to use. This must be a valid backend.
+
+    Returns
+    -------
+    cls : _HtmlFrameParser subclass
+        The parser class based on the requested input flavor.
+
+    Raises
+    ------
+    ValueError
+        * If `flavor` is not a valid backend.
+    ImportError
+        * If you do not have the requested `flavor`
+    """
+    valid_parsers = list(_valid_parsers.keys())
+    if flavor not in valid_parsers:
+        raise ValueError(
+            f"{flavor!r} is not a valid flavor, valid flavors are {valid_parsers}"
+        )
+
+    if flavor in ("bs4", "html5lib"):
+        import_optional_dependency("html5lib")
+        import_optional_dependency("bs4")
+    else:
+        import_optional_dependency("lxml.etree")
+    return _valid_parsers[flavor]
+
+
+def _print_as_set(s) -> str:
+    arg = ", ".join([pprint_thing(el) for el in s])
+    return f"{{{arg}}}"
+
+
+def _validate_flavor(flavor):
+    if flavor is None:
+        flavor = "lxml", "bs4"
+    elif isinstance(flavor, str):
+        flavor = (flavor,)
+    elif isinstance(flavor, abc.Iterable):
+        if not all(isinstance(flav, str) for flav in flavor):
+            raise TypeError(
+                f"Object of type {type(flavor).__name__!r} "
+                f"is not an iterable of strings"
+            )
+    else:
+        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
+        msg += " is not a valid flavor"
+        raise ValueError(msg)
+
+    flavor = tuple(flavor)
+    valid_flavors = set(_valid_parsers)
+    flavor_set = set(flavor)
+
+    if not flavor_set & valid_flavors:
+        raise ValueError(
+            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
+            f"flavors are {_print_as_set(valid_flavors)}"
+        )
+    return flavor
+
+
+def _parse(
+    flavor,
+    io,
+    match,
+    attrs,
+    encoding,
+    displayed_only,
+    extract_links,
+    storage_options,
+    **kwargs,
+):
+    flavor = _validate_flavor(flavor)
+    compiled_match = re.compile(match)  # you can pass a compiled regex here
+
+    retained = None
+    for flav in flavor:
+        parser = _parser_dispatch(flav)
+        p = parser(
+            io,
+            compiled_match,
+            attrs,
+            encoding,
+            displayed_only,
+            extract_links,
+            storage_options,
+        )
+
+        try:
+            tables = p.parse_tables()
+        except ValueError as caught:
+            # if `io` is an io-like object, check if it's seekable
+            # and try to rewind it before trying the next parser
+            if hasattr(io, "seekable") and io.seekable():
+                io.seek(0)
+            elif hasattr(io, "seekable") and not io.seekable():
+                # if we couldn't rewind it, let the user know
+                raise ValueError(
+                    f"The flavor {flav} failed to parse your input. "
+                    "Since you passed a non-rewindable file "
+                    "object, we can't rewind it to try "
+                    "another parser. Try read_html() with a different flavor."
+                ) from caught
+
+            retained = caught
+        else:
+            break
+    else:
+        assert retained is not None  # for mypy
+        raise retained
+
+    ret = []
+    for table in tables:
+        try:
+            df = _data_to_frame(data=table, **kwargs)
+            # Cast MultiIndex header to an Index of tuples when extracting header
+            # links and replace nan with None (therefore can't use mi.to_flat_index()).
+            # This maintains consistency of selection (e.g. df.columns.str[1])
+            if extract_links in ("all", "header") and isinstance(
+                df.columns, MultiIndex
+            ):
+                df.columns = Index(
+                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
+                    tupleize_cols=False,
+                )
+
+            ret.append(df)
+        except EmptyDataError:  # empty table
+            continue
+    return ret
+
+
+@set_module("pandas")
+def read_html(
+    io: FilePath | ReadBuffer[str],
+    *,
+    match: str | Pattern = ".+",
+    flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None,
+    header: int | Sequence[int] | None = None,
+    index_col: int | Sequence[int] | None = None,
+    skiprows: int | Sequence[int] | slice | None = None,
+    attrs: dict[str, str] | None = None,
+    parse_dates: bool = False,
+    thousands: str | None = ",",
+    encoding: str | None = None,
+    decimal: str = ".",
+    converters: dict | None = None,
+    na_values: Iterable[object] | None = None,
+    keep_default_na: bool = True,
+    displayed_only: bool = True,
+    extract_links: Literal["header", "footer", "body", "all"] | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    storage_options: StorageOptions = None,
+) -> list[DataFrame]:
+    r"""
+    Read HTML tables into a ``list`` of ``DataFrame`` objects.
+
+    Parameters
+    ----------
+    io : str, path object, or file-like object
+        String path, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a string ``read()`` function.
+        The string can represent a URL. Note that
+        lxml only accepts the http, ftp and file url protocols. If you have a
+        URL that starts with ``'https'`` you might try removing the ``'s'``.
+
+    match : str or compiled regular expression, optional
+        The set of tables containing text matching this regex or string will be
+        returned. Unless the HTML is extremely simple you will probably need to
+        pass a non-empty string here. Defaults to '.+' (match any non-empty
+        string). The default value will return all tables contained on a page.
+        This value is converted to a regular expression so that there is
+        consistent behavior between Beautiful Soup and lxml.
+
+    flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional
+        The parsing engine (or list of parsing engines) to use. 'bs4' and
+        'html5lib' are synonymous with each other, they are both there for
+        backwards compatibility. The default of ``None`` tries to use ``lxml``
+        to parse and if that fails it falls back on ``bs4`` + ``html5lib``.
+
+    header : int or list-like, optional
+        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
+        make the columns headers.
+
+    index_col : int or list-like, optional
+        The column (or list of columns) to use to create the index.
+
+    skiprows : int, list-like or slice, optional
+        Number of rows to skip after parsing the column integer. 0-based. If a
+        sequence of integers or a slice is given, will skip the rows indexed by
+        that sequence.  Note that a single element sequence means 'skip the nth
+        row' whereas an integer means 'skip n rows'.
+
+    attrs : dict, optional
+        This is a dictionary of attributes that you can pass to use to identify
+        the table in the HTML. These are not checked for validity before being
+        passed to lxml or Beautiful Soup. However, these attributes must be
+        valid HTML table attributes to work correctly. For example, ::
+
+            attrs = {{"id": "table"}}
+
+        is a valid attribute dictionary because the 'id' HTML tag attribute is
+        a valid HTML attribute for *any* HTML tag as per `this document
+        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::
+
+            attrs = {{"asdf": "table"}}
+
+        is *not* a valid attribute dictionary because 'asdf' is not a valid
+        HTML attribute even if it is a valid XML attribute.  Valid HTML 4.01
+        table attributes can be found `here
+        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
+        working draft of the HTML 5 spec can be found `here
+        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
+        latest information on table attributes for the modern web.
+
+    parse_dates : bool, optional
+        See :func:`~read_csv` for more details.
+
+    thousands : str, optional
+        Separator to use to parse thousands. Defaults to ``','``.
+
+    encoding : str, optional
+        The encoding used to decode the web page. Defaults to ``None``.``None``
+        preserves the previous encoding behavior, which depends on the
+        underlying parser library (e.g., the parser library will try to use
+        the encoding provided by the document).
+
+    decimal : str, default '.'
+        Character to recognize as decimal point (e.g. use ',' for European
+        data).
+
+    converters : dict, default None
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels, values are functions that take one
+        input argument, the cell (not column) content, and return the
+        transformed content.
+
+    na_values : iterable, default None
+        Custom NA values.
+
+    keep_default_na : bool, default True
+        If na_values are specified and keep_default_na is False the default NaN
+        values are overridden, otherwise they're appended to.
+
+    displayed_only : bool, default True
+        Whether elements with "display: none" should be parsed.
+
+    extract_links : {{None, "all", "header", "body", "footer"}}
+        Table elements in the specified section(s) with <a> tags will have their
+        href extracted.
+
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+        .. versionadded:: 2.1.0
+
+    Returns
+    -------
+    dfs
+        A list of DataFrames.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (csv) file into DataFrame.
+
+    Notes
+    -----
+    Before using this function you should read the :ref:`gotchas about the
+    HTML parsing libraries <io.html.gotchas>`.
+
+    Expect to do some cleanup after you call this function. For example, you
+    might need to manually assign column names if the column names are
+    converted to NaN when you pass the `header=0` argument. We try to assume as
+    little as possible about the structure of the table and push the
+    idiosyncrasies of the HTML contained in the table to the user.
+
+    This function searches for ``<table>`` elements and only for ``<tr>``
+    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
+    element in the table. ``<td>`` stands for "table data". This function
+    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
+    If the function has a ``<thead>`` argument, it is used to construct
+    the header, otherwise the function attempts to find the header within
+    the body (by putting rows with only ``<th>`` elements into the header).
+
+    Similar to :func:`~read_csv` the `header` argument is applied
+    **after** `skiprows` is applied.
+
+    This function will *always* return a list of :class:`DataFrame` *or*
+    it will fail, i.e., it will *not* return an empty list, save for some
+    rare cases.
+    It might return an empty list in case of inputs with single row and
+    ``<td>`` containing only whitespaces.
+
+    Examples
+    --------
+    See the :ref:`read_html documentation in the IO section of the docs
+    <io.read_html>` for some examples of reading in HTML tables.
+    """
+    # Type check here. We don't want to parse only to fail because of an
+    # invalid value of an integer skiprows.
+    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
+        raise ValueError(
+            "cannot skip rows starting from the end of the "
+            "data (you passed a negative value)"
+        )
+    if extract_links not in [None, "header", "footer", "body", "all"]:
+        raise ValueError(
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got '
+            f'"{extract_links}"'
+        )
+
+    validate_header_arg(header)
+    check_dtype_backend(dtype_backend)
+
+    io = stringify_path(io)
+
+    return _parse(
+        flavor=flavor,
+        io=io,
+        match=match,
+        header=header,
+        index_col=index_col,
+        skiprows=skiprows,
+        parse_dates=parse_dates,
+        thousands=thousands,
+        attrs=attrs,
+        encoding=encoding,
+        decimal=decimal,
+        converters=converters,
+        na_values=na_values,
+        keep_default_na=keep_default_na,
+        displayed_only=displayed_only,
+        extract_links=extract_links,
+        dtype_backend=dtype_backend,
+        storage_options=storage_options,
+    )
diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4361b000524e0701fd97c2f3632b422f29265bc
--- /dev/null
+++ b/pandas/io/iceberg.py
@@ -0,0 +1,155 @@
+from typing import (
+    Any,
+)
+
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._decorators import set_module
+
+from pandas import DataFrame
+
+
+@set_module("pandas")
+def read_iceberg(
+    table_identifier: str,
+    catalog_name: str | None = None,
+    *,
+    catalog_properties: dict[str, Any] | None = None,
+    columns: list[str] | None = None,
+    row_filter: str | None = None,
+    case_sensitive: bool = True,
+    snapshot_id: int | None = None,
+    limit: int | None = None,
+    scan_properties: dict[str, Any] | None = None,
+) -> DataFrame:
+    """
+    Read an Apache Iceberg table into a pandas DataFrame.
+
+    .. versionadded:: 3.0.0
+
+    .. warning::
+
+       read_iceberg is experimental and may change without warning.
+
+    Parameters
+    ----------
+    table_identifier : str
+        Table identifier.
+    catalog_name : str, optional
+        The name of the catalog.
+    catalog_properties : dict of {str: str}, optional
+        The properties that are used next to the catalog configuration.
+    columns : list of str, optional
+        A list of strings representing the column names to return in the output
+        dataframe.
+    row_filter : str, optional
+        A string that describes the desired rows.
+    case_sensitive : bool, default True
+        If True column matching is case sensitive.
+    snapshot_id : int, optional
+        Snapshot ID to time travel to. By default the table will be scanned as of the
+        current snapshot ID.
+    limit : int, optional
+        An integer representing the number of rows to return in the scan result.
+        By default all matching rows will be fetched.
+    scan_properties : dict of {str: obj}, optional
+        Additional Table properties as a dictionary of string key value pairs to use
+        for this scan.
+
+    Returns
+    -------
+    DataFrame
+        DataFrame based on the Iceberg table.
+
+    See Also
+    --------
+    read_parquet : Read a Parquet file.
+
+    Examples
+    --------
+    >>> df = pd.read_iceberg(
+    ...     table_identifier="my_table",
+    ...     catalog_name="my_catalog",
+    ...     catalog_properties={"s3.secret-access-key": "my-secret"},
+    ...     row_filter="trip_distance >= 10.0",
+    ...     columns=["VendorID", "tpep_pickup_datetime"],
+    ... )  # doctest: +SKIP
+    """
+    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+    pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions")
+    if catalog_properties is None:
+        catalog_properties = {}
+    catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+    table = catalog.load_table(table_identifier)
+    if row_filter is None:
+        row_filter = pyiceberg_expressions.AlwaysTrue()
+    if columns is None:
+        selected_fields = ("*",)
+    else:
+        selected_fields = tuple(columns)  # type: ignore[assignment]
+    if scan_properties is None:
+        scan_properties = {}
+    result = table.scan(
+        row_filter=row_filter,
+        selected_fields=selected_fields,
+        case_sensitive=case_sensitive,
+        snapshot_id=snapshot_id,
+        options=scan_properties,
+        limit=limit,
+    )
+    return result.to_pandas()
+
+
+def to_iceberg(
+    df: DataFrame,
+    table_identifier: str,
+    catalog_name: str | None = None,
+    *,
+    catalog_properties: dict[str, Any] | None = None,
+    location: str | None = None,
+    append: bool = False,
+    snapshot_properties: dict[str, str] | None = None,
+) -> None:
+    """
+    Write a DataFrame to an Apache Iceberg table.
+
+    .. versionadded:: 3.0.0
+
+    Parameters
+    ----------
+    table_identifier : str
+        Table identifier.
+    catalog_name : str, optional
+        The name of the catalog.
+    catalog_properties : dict of {str: str}, optional
+        The properties that are used next to the catalog configuration.
+    location : str, optional
+        Location for the table.
+    append : bool, default False
+        If ``True``, append data to the table, instead of replacing the content.
+    snapshot_properties : dict of {str: str}, optional
+        Custom properties to be added to the snapshot summary
+
+    See Also
+    --------
+    read_iceberg : Read an Apache Iceberg table.
+    DataFrame.to_parquet : Write a DataFrame in Parquet format.
+    """
+    pa = import_optional_dependency("pyarrow")
+    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+    if catalog_properties is None:
+        catalog_properties = {}
+    catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
+    arrow_table = pa.Table.from_pandas(df)
+    table = catalog.create_table_if_not_exists(
+        identifier=table_identifier,
+        schema=arrow_table.schema,
+        location=location,
+        # we could add `partition_spec`, `sort_order` and `properties` in the
+        # future, but it may not be trivial without exposing PyIceberg objects
+    )
+    if snapshot_properties is None:
+        snapshot_properties = {}
+    if append:
+        table.append(arrow_table, snapshot_properties=snapshot_properties)
+    else:
+        table.overwrite(arrow_table, snapshot_properties=snapshot_properties)
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8851532508c7e1d55df8b7da01008c95f78b09d6
--- /dev/null
+++ b/pandas/io/orc.py
@@ -0,0 +1,243 @@
+"""orc compat"""
+
+from __future__ import annotations
+
+import io
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+)
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._decorators import set_module
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.indexes.api import default_index
+
+from pandas.io._util import arrow_table_to_pandas
+from pandas.io.common import (
+    get_handle,
+    is_fsspec_url,
+)
+
+if TYPE_CHECKING:
+    import fsspec
+    import pyarrow.fs
+
+    from pandas._typing import (
+        DtypeBackend,
+        FilePath,
+        ReadBuffer,
+        WriteBuffer,
+    )
+
+    from pandas.core.frame import DataFrame
+
+
+@set_module("pandas")
+def read_orc(
+    path: FilePath | ReadBuffer[bytes],
+    columns: list[str] | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None,
+    **kwargs: Any,
+) -> DataFrame:
+    """
+    Load an ORC object from the file path, returning a DataFrame.
+
+    This method reads an ORC (Optimized Row Columnar) file into a pandas
+    DataFrame using the `pyarrow.orc` library. ORC is a columnar storage format
+    that provides efficient compression and fast retrieval for analytical workloads.
+    It allows reading specific columns, handling different filesystem
+    types (such as local storage, cloud storage via fsspec, or pyarrow filesystem),
+    and supports different data type backends, including `numpy_nullable` and `pyarrow`.
+
+    Parameters
+    ----------
+    path : str, path object, or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``read()`` function. The string could be a URL.
+        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        expected. A local file could be:
+        ``file://localhost/path/to/table.orc``.
+    columns : list, default None
+        If not None, only these columns will be read from the file.
+        Output always follows the ordering of the file and not the columns list.
+        This mirrors the original behaviour of
+        :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when reading the orc file.
+
+        .. versionadded:: 2.1.0
+
+    **kwargs
+        Any additional kwargs are passed to pyarrow.
+
+    Returns
+    -------
+    DataFrame
+        DataFrame based on the ORC file.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (csv) file into a pandas DataFrame.
+    read_excel : Read an Excel file into a pandas DataFrame.
+    read_spss : Read an SPSS file into a pandas DataFrame.
+    read_sas : Load a SAS file into a pandas DataFrame.
+    read_feather : Load a feather-format object into a pandas DataFrame.
+
+    Notes
+    -----
+    Before using this function you should read the :ref:`user guide about ORC <io.orc>`
+    and :ref:`install optional dependencies <install.warn_orc>`.
+
+    If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"),
+    a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a
+    pyarrow or fsspec filesystem object into the filesystem keyword to override this
+    behavior.
+
+    Examples
+    --------
+    >>> result = pd.read_orc("example_pa.orc")  # doctest: +SKIP
+    """
+    # we require a newer version of pyarrow than we support for orc
+
+    orc = import_optional_dependency("pyarrow.orc")
+
+    check_dtype_backend(dtype_backend)
+
+    with get_handle(path, "rb", is_text=False) as handles:
+        source = handles.handle
+        if is_fsspec_url(path) and filesystem is None:
+            pa = import_optional_dependency("pyarrow")
+            pa_fs = import_optional_dependency("pyarrow.fs")
+            try:
+                filesystem, source = pa_fs.FileSystem.from_uri(path)
+            except (TypeError, pa.ArrowInvalid):
+                pass
+
+        pa_table = orc.read_table(
+            source=source, columns=columns, filesystem=filesystem, **kwargs
+        )
+    return arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
+
+
+def to_orc(
+    df: DataFrame,
+    path: FilePath | WriteBuffer[bytes] | None = None,
+    *,
+    engine: Literal["pyarrow"] = "pyarrow",
+    index: bool | None = None,
+    engine_kwargs: dict[str, Any] | None = None,
+) -> bytes | None:
+    """
+    Write a DataFrame to the ORC format.
+
+    Parameters
+    ----------
+    df : DataFrame
+        The dataframe to be written to ORC. Raises NotImplementedError
+        if dtype of one or more columns is category, unsigned integers,
+        intervals, periods or sparse.
+    path : str, file-like object or None, default None
+        If a string, it will be used as Root Directory path
+        when writing a partitioned dataset. By file-like object,
+        we refer to objects with a write() method, such as a file handle
+        (e.g. via builtin open function). If path is None,
+        a bytes object is returned.
+    engine : str, default 'pyarrow'
+        ORC library to use.
+    index : bool, optional
+        If ``True``, include the dataframe's index(es) in the file output. If
+        ``False``, they will not be written to the file.
+        If ``None``, similar to ``infer`` the dataframe's index(es)
+        will be saved. However, instead of being saved as values,
+        the RangeIndex will be stored as a range in the metadata so it
+        doesn't require much space and is faster. Other indexes will
+        be included as columns in the file output.
+    engine_kwargs : dict[str, Any] or None, default None
+        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
+
+    Raises
+    ------
+    NotImplementedError
+        Dtype of one or more columns is category, unsigned integers, interval,
+        period or sparse.
+    ValueError
+        engine is not pyarrow.
+
+    Notes
+    -----
+    * Before using this function you should read the
+      :ref:`user guide about ORC <io.orc>` and
+      :ref:`install optional dependencies <install.warn_orc>`.
+    * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
+      library.
+    * For supported dtypes please refer to `supported ORC features in Arrow
+      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+    * Currently timezones in datetime columns are not preserved when a
+      dataframe is converted into ORC files.
+    """
+    if index is None:
+        index = df.index.names[0] is not None
+    if engine_kwargs is None:
+        engine_kwargs = {}
+
+    # validate index
+    # --------------
+
+    # validate that we have only a default index
+    # raise on anything else as we don't serialize the index
+
+    if not df.index.equals(default_index(len(df))):
+        raise ValueError(
+            "orc does not support serializing a non-default index for the index; "
+            "you can .reset_index() to make the index into column(s)"
+        )
+
+    if df.index.name is not None:
+        raise ValueError("orc does not serialize index meta-data on a default index")
+
+    if engine != "pyarrow":
+        raise ValueError("engine must be 'pyarrow'")
+    pa = import_optional_dependency("pyarrow")
+    orc = import_optional_dependency("pyarrow.orc")
+
+    was_none = path is None
+    if was_none:
+        path = io.BytesIO()
+    assert path is not None  # For mypy
+    with get_handle(path, "wb", is_text=False) as handles:
+        try:
+            orc.write_table(
+                pa.Table.from_pandas(df, preserve_index=index),
+                handles.handle,
+                **engine_kwargs,
+            )
+        except (TypeError, pa.ArrowNotImplementedError) as e:
+            raise NotImplementedError(
+                "The dtype of one or more columns is not supported yet."
+            ) from e
+
+    if was_none:
+        assert isinstance(path, io.BytesIO)  # For mypy
+        return path.getvalue()
+    return None
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..218002ebb3f6a0bef4c994b152c8ff51c0852440
--- /dev/null
+++ b/pandas/io/parquet.py
@@ -0,0 +1,680 @@
+"""parquet compat"""
+
+from __future__ import annotations
+
+import io
+import json
+import os
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+)
+from warnings import (
+    catch_warnings,
+    filterwarnings,
+)
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import (
+    AbstractMethodError,
+    Pandas4Warning,
+)
+from pandas.util._decorators import set_module
+from pandas.util._validators import check_dtype_backend
+
+from pandas import (
+    DataFrame,
+    get_option,
+)
+
+from pandas.io._util import arrow_table_to_pandas
+from pandas.io.common import (
+    IOHandles,
+    get_handle,
+    is_fsspec_url,
+    is_url,
+    stringify_path,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import (
+        DtypeBackend,
+        FilePath,
+        ParquetCompressionOptions,
+        ReadBuffer,
+        StorageOptions,
+        WriteBuffer,
+    )
+
+
+def get_engine(engine: str) -> BaseImpl:
+    """return our implementation"""
+    if engine == "auto":
+        engine = get_option("io.parquet.engine")
+
+    if engine == "auto":
+        # try engines in this order
+        engine_classes = [PyArrowImpl, FastParquetImpl]
+
+        error_msgs = ""
+        for engine_class in engine_classes:
+            try:
+                return engine_class()
+            except ImportError as err:
+                error_msgs += "\n - " + str(err)
+
+        raise ImportError(
+            "Unable to find a usable engine; "
+            "tried using: 'pyarrow', 'fastparquet'.\n"
+            "A suitable version of "
+            "pyarrow or fastparquet is required for parquet "
+            "support.\n"
+            "Trying to import the above resulted in these errors:"
+            f"{error_msgs}"
+        )
+
+    if engine == "pyarrow":
+        return PyArrowImpl()
+    elif engine == "fastparquet":
+        return FastParquetImpl()
+
+    raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
+
+
+def _get_path_or_handle(
+    path: FilePath | ReadBuffer[bytes] | WriteBuffer[bytes],
+    fs: Any,
+    storage_options: StorageOptions | None = None,
+    mode: str = "rb",
+    is_dir: bool = False,
+) -> tuple[
+    FilePath | ReadBuffer[bytes] | WriteBuffer[bytes], IOHandles[bytes] | None, Any
+]:
+    """File handling for PyArrow."""
+    path_or_handle = stringify_path(path)
+    if fs is not None:
+        pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
+        fsspec = import_optional_dependency("fsspec", errors="ignore")
+        if pa_fs is not None and isinstance(fs, pa_fs.FileSystem):
+            if storage_options:
+                raise NotImplementedError(
+                    "storage_options not supported with a pyarrow FileSystem."
+                )
+        elif fsspec is not None and isinstance(fs, fsspec.spec.AbstractFileSystem):
+            pass
+        else:
+            raise ValueError(
+                f"filesystem must be a pyarrow or fsspec FileSystem, "
+                f"not a {type(fs).__name__}"
+            )
+    if is_fsspec_url(path_or_handle) and fs is None:
+        if storage_options is None:
+            pa = import_optional_dependency("pyarrow")
+            pa_fs = import_optional_dependency("pyarrow.fs")
+
+            try:
+                fs, path_or_handle = pa_fs.FileSystem.from_uri(path)
+            except (TypeError, pa.ArrowInvalid):
+                pass
+        if fs is None:
+            fsspec = import_optional_dependency("fsspec")
+            fs, path_or_handle = fsspec.core.url_to_fs(
+                path_or_handle, **(storage_options or {})
+            )
+    elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
+        # can't write to a remote url
+        # without making use of fsspec at the moment
+        raise ValueError("storage_options passed with buffer, or non-supported URL")
+
+    handles = None
+    if (
+        not fs
+        and not is_dir
+        and isinstance(path_or_handle, str)
+        and not os.path.isdir(path_or_handle)
+    ):
+        # use get_handle only when we are very certain that it is not a directory
+        # fsspec resources can also point to directories
+        # this branch is used for example when reading from non-fsspec URLs
+        handles = get_handle(
+            path_or_handle, mode, is_text=False, storage_options=storage_options
+        )
+        fs = None
+        path_or_handle = handles.handle
+    return path_or_handle, handles, fs
+
+
+class BaseImpl:
+    @staticmethod
+    def validate_dataframe(df: DataFrame) -> None:
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only supports IO with DataFrames")
+
+    def write(self, df: DataFrame, path, compression, **kwargs) -> None:
+        raise AbstractMethodError(self)
+
+    def read(self, path, columns=None, **kwargs) -> DataFrame:
+        raise AbstractMethodError(self)
+
+
+class PyArrowImpl(BaseImpl):
+    def __init__(self) -> None:
+        import_optional_dependency(
+            "pyarrow", extra="pyarrow is required for parquet support."
+        )
+        import pyarrow.parquet
+
+        # import utils to register the pyarrow extension types
+        import pandas.core.arrays.arrow.extension_types  # pyright: ignore[reportUnusedImport] # noqa: F401
+
+        self.api = pyarrow
+
+    def write(
+        self,
+        df: DataFrame,
+        path: FilePath | WriteBuffer[bytes],
+        compression: ParquetCompressionOptions = "snappy",
+        index: bool | None = None,
+        storage_options: StorageOptions | None = None,
+        partition_cols: list[str] | None = None,
+        filesystem=None,
+        **kwargs,
+    ) -> None:
+        self.validate_dataframe(df)
+
+        from_pandas_kwargs: dict[str, Any] = {"schema": kwargs.pop("schema", None)}
+        if index is not None:
+            from_pandas_kwargs["preserve_index"] = index
+
+        table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
+
+        if df.attrs:
+            df_metadata = {"PANDAS_ATTRS": json.dumps(df.attrs)}
+            existing_metadata = table.schema.metadata
+            merged_metadata = {**existing_metadata, **df_metadata}
+            table = table.replace_schema_metadata(merged_metadata)
+
+        path_or_handle, handles, filesystem = _get_path_or_handle(
+            path,
+            filesystem,
+            storage_options=storage_options,
+            mode="wb",
+            is_dir=partition_cols is not None,
+        )
+        if (
+            isinstance(path_or_handle, io.BufferedWriter)
+            and hasattr(path_or_handle, "name")
+            and isinstance(path_or_handle.name, (str, bytes))
+        ):
+            if isinstance(path_or_handle.name, bytes):
+                path_or_handle = path_or_handle.name.decode()
+            else:
+                path_or_handle = path_or_handle.name
+
+        try:
+            if partition_cols is not None:
+                # writes to multiple files under the given path
+                self.api.parquet.write_to_dataset(
+                    table,
+                    path_or_handle,
+                    compression=compression,
+                    partition_cols=partition_cols,
+                    filesystem=filesystem,
+                    **kwargs,
+                )
+            else:
+                # write to single output file
+                self.api.parquet.write_table(
+                    table,
+                    path_or_handle,
+                    compression=compression,
+                    filesystem=filesystem,
+                    **kwargs,
+                )
+        finally:
+            if handles is not None:
+                handles.close()
+
+    def read(
+        self,
+        path,
+        columns=None,
+        filters=None,
+        dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+        storage_options: StorageOptions | None = None,
+        filesystem=None,
+        to_pandas_kwargs: dict[str, Any] | None = None,
+        **kwargs,
+    ) -> DataFrame:
+        kwargs["use_pandas_metadata"] = True
+
+        path_or_handle, handles, filesystem = _get_path_or_handle(
+            path,
+            filesystem,
+            storage_options=storage_options,
+            mode="rb",
+        )
+        try:
+            pa_table = self.api.parquet.read_table(
+                path_or_handle,
+                columns=columns,
+                filesystem=filesystem,
+                filters=filters,
+                **kwargs,
+            )
+            with catch_warnings():
+                filterwarnings(
+                    "ignore",
+                    "make_block is deprecated",
+                    Pandas4Warning,
+                )
+                result = arrow_table_to_pandas(
+                    pa_table,
+                    dtype_backend=dtype_backend,
+                    to_pandas_kwargs=to_pandas_kwargs,
+                )
+
+            if pa_table.schema.metadata:
+                if b"PANDAS_ATTRS" in pa_table.schema.metadata:
+                    df_metadata = pa_table.schema.metadata[b"PANDAS_ATTRS"]
+                    result.attrs = json.loads(df_metadata)
+            return result
+        finally:
+            if handles is not None:
+                handles.close()
+
+
+class FastParquetImpl(BaseImpl):
+    def __init__(self) -> None:
+        # since pandas is a dependency of fastparquet
+        # we need to import on first use
+        fastparquet = import_optional_dependency(
+            "fastparquet", extra="fastparquet is required for parquet support."
+        )
+        self.api = fastparquet
+
+    def write(
+        self,
+        df: DataFrame,
+        path,
+        compression: Literal["snappy", "gzip", "brotli"] | None = "snappy",
+        index=None,
+        partition_cols=None,
+        storage_options: StorageOptions | None = None,
+        filesystem=None,
+        **kwargs,
+    ) -> None:
+        self.validate_dataframe(df)
+
+        if "partition_on" in kwargs and partition_cols is not None:
+            raise ValueError(
+                "Cannot use both partition_on and "
+                "partition_cols. Use partition_cols for partitioning data"
+            )
+        if "partition_on" in kwargs:
+            partition_cols = kwargs.pop("partition_on")
+
+        if partition_cols is not None:
+            kwargs["file_scheme"] = "hive"
+
+        if filesystem is not None:
+            raise NotImplementedError(
+                "filesystem is not implemented for the fastparquet engine."
+            )
+
+        # cannot use get_handle as write() does not accept file buffers
+        path = stringify_path(path)
+        if is_fsspec_url(path):
+            fsspec = import_optional_dependency("fsspec")
+
+            # if filesystem is provided by fsspec, file must be opened in 'wb' mode.
+            kwargs["open_with"] = lambda path, _: fsspec.open(
+                path, "wb", **(storage_options or {})
+            ).open()
+        elif storage_options:
+            raise ValueError(
+                "storage_options passed with file object or non-fsspec file path"
+            )
+
+        with catch_warnings(record=True):
+            self.api.write(
+                path,
+                df,
+                compression=compression,
+                write_index=index,
+                partition_on=partition_cols,
+                **kwargs,
+            )
+
+    def read(
+        self,
+        path,
+        columns=None,
+        filters=None,
+        storage_options: StorageOptions | None = None,
+        filesystem=None,
+        to_pandas_kwargs: dict | None = None,
+        **kwargs,
+    ) -> DataFrame:
+        parquet_kwargs: dict[str, Any] = {}
+        dtype_backend = kwargs.pop("dtype_backend", lib.no_default)
+        # We are disabling nullable dtypes for fastparquet pending discussion
+        parquet_kwargs["pandas_nulls"] = False
+        if dtype_backend is not lib.no_default:
+            raise ValueError(
+                "The 'dtype_backend' argument is not supported for the "
+                "fastparquet engine"
+            )
+        if filesystem is not None:
+            raise NotImplementedError(
+                "filesystem is not implemented for the fastparquet engine."
+            )
+        if to_pandas_kwargs is not None:
+            raise NotImplementedError(
+                "to_pandas_kwargs is not implemented for the fastparquet engine."
+            )
+        path = stringify_path(path)
+        handles = None
+        if is_fsspec_url(path):
+            fsspec = import_optional_dependency("fsspec")
+
+            parquet_kwargs["fs"] = fsspec.open(path, "rb", **(storage_options or {})).fs
+        elif isinstance(path, str) and not os.path.isdir(path):
+            # use get_handle only when we are very certain that it is not a directory
+            # fsspec resources can also point to directories
+            # this branch is used for example when reading from non-fsspec URLs
+            handles = get_handle(
+                path, "rb", is_text=False, storage_options=storage_options
+            )
+            path = handles.handle
+
+        try:
+            parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
+            with catch_warnings():
+                filterwarnings(
+                    "ignore",
+                    "make_block is deprecated",
+                    Pandas4Warning,
+                )
+                return parquet_file.to_pandas(
+                    columns=columns, filters=filters, **kwargs
+                )
+        finally:
+            if handles is not None:
+                handles.close()
+
+
+def to_parquet(
+    df: DataFrame,
+    path: FilePath | WriteBuffer[bytes] | None = None,
+    engine: str = "auto",
+    compression: ParquetCompressionOptions = "snappy",
+    index: bool | None = None,
+    storage_options: StorageOptions | None = None,
+    partition_cols: list[str] | None = None,
+    filesystem: Any = None,
+    **kwargs,
+) -> bytes | None:
+    """
+    Write a DataFrame to the parquet format.
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str, path object, file-like object, or None, default None
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``write()`` function. If None, the result
+        is returned as bytes. If a string, it will be used as Root Directory
+        path when writing a partitioned dataset. The engine fastparquet does
+        not accept file-like objects.
+    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+        Parquet library to use. If 'auto', then the option
+        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+        behavior is to try 'pyarrow', falling back to 'fastparquet' if
+        'pyarrow' is unavailable.
+
+        When using the ``'pyarrow'`` engine and no storage options are provided
+        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
+        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
+        Use the filesystem keyword with an instantiated fsspec filesystem
+        if you wish to use its implementation.
+    compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
+        default 'snappy'. Name of the compression to use. Use ``None``
+        for no compression.
+    index : bool, default None
+        If ``True``, include the dataframe's index(es) in the file output. If
+        ``False``, they will not be written to the file.
+        If ``None``, similar to ``True`` the dataframe's index(es)
+        will be saved. However, instead of being saved as values,
+        the RangeIndex will be stored as a range in the metadata so it
+        doesn't require much space and is faster. Other indexes will
+        be included as columns in the file output.
+    partition_cols : str or list, optional, default None
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+        Must be None if path is not a string.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value
+        pairs are forwarded to ``urllib.request.Request`` as header options.
+        For other URLs (e.g. starting with "s3://", and "gcs://") the
+        key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec``
+        and ``urllib`` for more details, and for more examples on storage
+        options refer `here <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when reading the parquet file. Only implemented
+        for ``engine="pyarrow"``.
+
+        .. versionadded:: 2.1.0
+
+    **kwargs
+        Additional keyword arguments passed to the engine:
+
+        * For ``engine="pyarrow"``: passed to :func:`pyarrow.parquet.write_table`
+          or :func:`pyarrow.parquet.write_to_dataset` (when using partition_cols)
+        * For ``engine="fastparquet"``: passed to :func:`fastparquet.write`
+
+    Returns
+    -------
+    bytes if no path argument is provided else None
+    """
+    if isinstance(partition_cols, str):
+        partition_cols = [partition_cols]
+    impl = get_engine(engine)
+
+    path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
+
+    impl.write(
+        df,
+        path_or_buf,
+        compression=compression,
+        index=index,
+        partition_cols=partition_cols,
+        storage_options=storage_options,
+        filesystem=filesystem,
+        **kwargs,
+    )
+
+    if path is None:
+        assert isinstance(path_or_buf, io.BytesIO)
+        return path_or_buf.getvalue()
+    else:
+        return None
+
+
+@set_module("pandas")
+def read_parquet(
+    path: FilePath | ReadBuffer[bytes],
+    engine: str = "auto",
+    columns: list[str] | None = None,
+    storage_options: StorageOptions | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    filesystem: Any = None,
+    filters: list[tuple] | list[list[tuple]] | None = None,
+    to_pandas_kwargs: dict | None = None,
+    **kwargs,
+) -> DataFrame:
+    """
+    Load a parquet object from the file path, returning a DataFrame.
+
+    The function automatically handles reading the data from a parquet file
+    and creates a DataFrame with the appropriate structure.
+
+    Parameters
+    ----------
+    path : str, path object or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``read()`` function.
+        The string could be a URL. Valid URL schemes include http, ftp, s3,
+        gs, and file. For file URLs, a host is expected. A local file could be:
+        ``file://localhost/path/to/table.parquet``.
+        A file URL can also be a path to a directory that contains multiple
+        partitioned parquet files. Both pyarrow and fastparquet support
+        paths to directories as well as file URLs. A directory path could be:
+        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
+    engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+        Parquet library to use. If 'auto', then the option
+        ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+        behavior is to try 'pyarrow', falling back to 'fastparquet' if
+        'pyarrow' is unavailable.
+
+        When using the ``'pyarrow'`` engine and no storage options are provided
+        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
+        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
+        Use the filesystem keyword with an instantiated fsspec filesystem
+        if you wish to use its implementation.
+    columns : list, default=None
+        If not None, only these columns will be read from the file.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value
+        pairs are forwarded to ``urllib.request.Request`` as header options.
+        For other URLs (e.g. starting with "s3://", and "gcs://") the
+        key-value pairs are forwarded to ``fsspec.open``. Please see ``fsspec``
+        and ``urllib`` for more details, and for more examples on storage
+        options refer `here <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when reading the parquet file. Only implemented
+        for ``engine="pyarrow"``.
+
+        .. versionadded:: 2.1.0
+
+    filters : List[Tuple] or List[List[Tuple]], default None
+        To filter out data.
+        Filter syntax: [[(column, op, val), ...],...]
+        where op is [==, =, >, >=, <, <=, !=, in, not in]
+        The innermost tuples are transposed into a set of filters applied
+        through an `AND` operation.
+        The outer list combines these sets of filters through an `OR`
+        operation.
+        A single list of tuples can also be used, meaning that no `OR`
+        operation between set of filters is to be conducted.
+
+        Using this argument will NOT result in row-wise filtering of the final
+        partitions unless ``engine="pyarrow"`` is also specified.  For
+        other engines, filtering is only performed at the partition level, that is,
+        to prevent the loading of some row-groups and/or files.
+
+        .. versionadded:: 2.1.0
+
+    to_pandas_kwargs : dict | None, default None
+        Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas`
+        when ``engine="pyarrow"``.
+
+        .. versionadded:: 3.0.0
+
+    **kwargs
+        Additional keyword arguments passed to the engine:
+
+        * For ``engine="pyarrow"``: passed to :func:`pyarrow.parquet.read_table`
+        * For ``engine="fastparquet"``: passed to
+          :meth:`fastparquet.ParquetFile.to_pandas`
+
+    Returns
+    -------
+    DataFrame
+        DataFrame based on parquet file.
+
+    See Also
+    --------
+    DataFrame.to_parquet : Create a parquet object that serializes a DataFrame.
+
+    Examples
+    --------
+    >>> original_df = pd.DataFrame({"foo": range(5), "bar": range(5, 10)})
+    >>> original_df
+       foo  bar
+    0    0    5
+    1    1    6
+    2    2    7
+    3    3    8
+    4    4    9
+    >>> df_parquet_bytes = original_df.to_parquet()
+    >>> from io import BytesIO
+    >>> restored_df = pd.read_parquet(BytesIO(df_parquet_bytes))
+    >>> restored_df
+       foo  bar
+    0    0    5
+    1    1    6
+    2    2    7
+    3    3    8
+    4    4    9
+    >>> restored_df.equals(original_df)
+    True
+    >>> restored_bar = pd.read_parquet(BytesIO(df_parquet_bytes), columns=["bar"])
+    >>> restored_bar
+        bar
+    0    5
+    1    6
+    2    7
+    3    8
+    4    9
+    >>> restored_bar.equals(original_df[["bar"]])
+    True
+
+    The function uses `kwargs` that are passed directly to the engine.
+    In the following example, we use the `filters` argument of the pyarrow
+    engine to filter the rows of the DataFrame.
+
+    Since `pyarrow` is the default engine, we can omit the `engine` argument.
+    Note that the `filters` argument is implemented by the `pyarrow` engine,
+    which can benefit from multithreading and also potentially be more
+    economical in terms of memory.
+
+    >>> sel = [("foo", ">", 2)]
+    >>> restored_part = pd.read_parquet(BytesIO(df_parquet_bytes), filters=sel)
+    >>> restored_part
+        foo  bar
+    0    3    8
+    1    4    9
+    """
+
+    impl = get_engine(engine)
+    check_dtype_backend(dtype_backend)
+
+    return impl.read(
+        path,
+        columns=columns,
+        filters=filters,
+        storage_options=storage_options,
+        dtype_backend=dtype_backend,
+        filesystem=filesystem,
+        to_pandas_kwargs=to_pandas_kwargs,
+        **kwargs,
+    )
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f2b380bc70bf273b2b4cd400281b4b7d2d5fc15
--- /dev/null
+++ b/pandas/io/pickle.py
@@ -0,0 +1,239 @@
+"""pickle compat"""
+
+from __future__ import annotations
+
+import pickle
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+import warnings
+
+from pandas.compat import pickle_compat
+from pandas.util._decorators import set_module
+
+from pandas.io.common import get_handle
+
+if TYPE_CHECKING:
+    from pandas._typing import (
+        CompressionOptions,
+        FilePath,
+        ReadPickleBuffer,
+        StorageOptions,
+        WriteBuffer,
+    )
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+@set_module("pandas")
+def to_pickle(
+    obj: Any,
+    filepath_or_buffer: FilePath | WriteBuffer[bytes],
+    compression: CompressionOptions = "infer",
+    protocol: int = pickle.HIGHEST_PROTOCOL,
+    storage_options: StorageOptions | None = None,
+) -> None:
+    """
+    Pickle (serialize) object to file.
+
+    Parameters
+    ----------
+    obj : any object
+        Any python object.
+    filepath_or_buffer : str, path object, or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``write()`` function.
+        Also accepts URL. URL has to be of S3 or GCS.
+    compression : str or dict, default 'infer'
+        For on-the-fly compression of the output data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        Set to ``None`` for no compression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``,
+        ``'tar'``} and other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for faster compression
+        and to create a reproducible gzip archive:
+        ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+    protocol : int
+        Int which indicates which protocol should be used by the pickler,
+        default HIGHEST_PROTOCOL (see [1], paragraph 12.1.2). The possible
+        values for this parameter depend on the version of Python. For Python
+        2.x, possible values are 0, 1, 2. For Python>=3.0, 3 is a valid value.
+        For Python >= 3.4, 4 is a valid value. A negative value for the
+        protocol parameter is equivalent to setting its value to
+        HIGHEST_PROTOCOL.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+        .. [1] https://docs.python.org/3/library/pickle.html
+
+    See Also
+    --------
+    read_pickle : Load pickled pandas object (or any object) from file.
+    DataFrame.to_hdf : Write DataFrame to an HDF5 file.
+    DataFrame.to_sql : Write DataFrame to a SQL database.
+    DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
+
+    Examples
+    --------
+    >>> original_df = pd.DataFrame(
+    ...     {{"foo": range(5), "bar": range(5, 10)}}
+    ... )  # doctest: +SKIP
+    >>> original_df  # doctest: +SKIP
+       foo  bar
+    0    0    5
+    1    1    6
+    2    2    7
+    3    3    8
+    4    4    9
+    >>> pd.to_pickle(original_df, "./dummy.pkl")  # doctest: +SKIP
+
+    >>> unpickled_df = pd.read_pickle("./dummy.pkl")  # doctest: +SKIP
+    >>> unpickled_df  # doctest: +SKIP
+       foo  bar
+    0    0    5
+    1    1    6
+    2    2    7
+    3    3    8
+    4    4    9
+    """
+    if protocol < 0:
+        protocol = pickle.HIGHEST_PROTOCOL
+
+    with get_handle(
+        filepath_or_buffer,
+        "wb",
+        compression=compression,
+        is_text=False,
+        storage_options=storage_options,
+    ) as handles:
+        # letting pickle write directly to the buffer is more memory-efficient
+        pickle.dump(obj, handles.handle, protocol=protocol)
+
+
+@set_module("pandas")
+def read_pickle(
+    filepath_or_buffer: FilePath | ReadPickleBuffer,
+    compression: CompressionOptions = "infer",
+    storage_options: StorageOptions | None = None,
+) -> DataFrame | Series:
+    """
+    Load pickled pandas object (or any object) from file and return unpickled object.
+
+    .. warning::
+
+       Loading pickled data received from untrusted sources can be
+       unsafe. See `here <https://docs.python.org/3/library/pickle.html>`__.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object, or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``readlines()`` function.
+        Also accepts URL. URL is not limited to S3 and GCS.
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data file
+        to be read in.
+        Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``,
+        ``'tar'``} and other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression
+        using a custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    Returns
+    -------
+    object
+        The unpickled pandas object (or any object) that was stored in file.
+
+    See Also
+    --------
+    DataFrame.to_pickle : Pickle (serialize) DataFrame object to file.
+    Series.to_pickle : Pickle (serialize) Series object to file.
+    read_hdf : Read HDF5 file into a DataFrame.
+    read_sql : Read SQL query or database table into a DataFrame.
+    read_parquet : Load a parquet object, returning a DataFrame.
+
+    Notes
+    -----
+    read_pickle is only guaranteed to be backwards compatible to pandas 1.0
+    provided the object was serialized with to_pickle.
+
+    Examples
+    --------
+    >>> original_df = pd.DataFrame(
+    ...     {{"foo": range(5), "bar": range(5, 10)}}
+    ... )  # doctest: +SKIP
+    >>> original_df  # doctest: +SKIP
+       foo  bar
+    0    0    5
+    1    1    6
+    2    2    7
+    3    3    8
+    4    4    9
+    >>> pd.to_pickle(original_df, "./dummy.pkl")  # doctest: +SKIP
+
+    >>> unpickled_df = pd.read_pickle("./dummy.pkl")  # doctest: +SKIP
+    >>> unpickled_df  # doctest: +SKIP
+       foo  bar
+    0    0    5
+    1    1    6
+    2    2    7
+    3    3    8
+    4    4    9
+    """
+    # TypeError for Cython complaints about object.__new__ vs Tick.__new__
+    excs_to_catch = (AttributeError, ImportError, ModuleNotFoundError, TypeError)
+    with get_handle(
+        filepath_or_buffer,
+        "rb",
+        compression=compression,
+        is_text=False,
+        storage_options=storage_options,
+    ) as handles:
+        # 1) try standard library Pickle
+        # 2) try pickle_compat (older pandas version) to handle subclass changes
+        try:
+            with warnings.catch_warnings(record=True):
+                # We want to silence any warnings about, e.g. moved modules.
+                warnings.simplefilter("ignore", Warning)
+                return pickle.load(handles.handle)
+        except excs_to_catch:
+            # e.g.
+            #  "No module named 'pandas.core.sparse.series'"
+            #  "Can't get attribute '_nat_unpickle' on <module 'pandas._libs.tslib"
+            handles.handle.seek(0)
+            return pickle_compat.Unpickler(handles.handle).load()
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
new file mode 100644
index 0000000000000000000000000000000000000000..398d36fa467aad03862de65db351694b7e6cd57d
--- /dev/null
+++ b/pandas/io/pytables.py
@@ -0,0 +1,5599 @@
+"""
+High level interface to PyTables for reading and writing pandas data structures
+to disk
+"""
+
+from __future__ import annotations
+
+from contextlib import suppress
+import copy
+from datetime import (
+    date,
+    tzinfo,
+)
+import itertools
+import os
+import re
+from textwrap import dedent
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Final,
+    Literal,
+    Self,
+    TypeAlias,
+    cast,
+    overload,
+)
+import warnings
+
+import numpy as np
+
+from pandas._config import (
+    config,
+    get_option,
+    using_string_dtype,
+)
+
+from pandas._libs import (
+    lib,
+    writers as libwriters,
+)
+from pandas._libs.lib import is_string_array
+from pandas._libs.tslibs import timezones
+from pandas.compat import HAS_PYARROW
+from pandas.compat._optional import import_optional_dependency
+from pandas.compat.pickle_compat import patch_pickle
+from pandas.errors import (
+    AttributeConflictWarning,
+    ClosedFileError,
+    IncompatibilityWarning,
+    PerformanceWarning,
+    PossibleDataLossError,
+)
+from pandas.util._decorators import (
+    cache_readonly,
+    set_module,
+)
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.common import (
+    ensure_object,
+    is_bool_dtype,
+    is_complex_dtype,
+    is_list_like,
+    is_string_dtype,
+    needs_i8_conversion,
+)
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+    ExtensionDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.missing import array_equivalent
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    PeriodIndex,
+    RangeIndex,
+    Series,
+    StringDtype,
+    TimedeltaIndex,
+    concat,
+    isna,
+)
+from pandas.core.arrays import (
+    Categorical,
+    DatetimeArray,
+    PeriodArray,
+)
+from pandas.core.arrays.datetimes import tz_to_dtype
+from pandas.core.arrays.string_ import BaseStringArray
+import pandas.core.common as com
+from pandas.core.computation.pytables import (
+    PyTablesExpr,
+    maybe_expression,
+)
+from pandas.core.construction import (
+    array as pd_array,
+    extract_array,
+)
+from pandas.core.indexes.api import ensure_index
+
+from pandas.io.common import stringify_path
+from pandas.io.formats.printing import (
+    adjoin,
+    pprint_thing,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+        Iterator,
+        Sequence,
+    )
+    from types import (
+        ModuleType,
+        TracebackType,
+    )
+
+    from tables import (
+        Col,
+        File,
+        Node,
+    )
+
+    from pandas._typing import (
+        AnyArrayLike,
+        ArrayLike,
+        AxisInt,
+        DtypeArg,
+        FilePath,
+        TimeUnit,
+        npt,
+    )
+
+    from pandas.core.internals import Block
+
+# versioning attribute
+_version = "0.15.2"
+
+# encoding
+_default_encoding = "UTF-8"
+
+
+def _ensure_encoding(encoding: str | None) -> str:
+    # set the encoding if we need
+    if encoding is None:
+        encoding = _default_encoding
+
+    return encoding
+
+
+def _ensure_str(name):
+    """
+    Ensure that an index / column name is a str (python 3); otherwise they
+    may be np.string dtype. Non-string dtypes are passed through unchanged.
+
+    https://github.com/pandas-dev/pandas/issues/13492
+    """
+    if isinstance(name, str):
+        name = str(name)
+    return name
+
+
+Term: TypeAlias = PyTablesExpr
+
+
+def _ensure_term(where, scope_level: int):
+    """
+    Ensure that the where is a Term or a list of Term.
+
+    This makes sure that we are capturing the scope of variables that are
+    passed create the terms here with a frame_level=2 (we are 2 levels down)
+    """
+    # only consider list/tuple here as an ndarray is automatically a coordinate
+    # list
+    level = scope_level + 1
+    if isinstance(where, (list, tuple)):
+        where = [
+            Term(term, scope_level=level + 1) if maybe_expression(term) else term
+            for term in where
+            if term is not None
+        ]
+    elif maybe_expression(where):
+        where = Term(where, scope_level=level)
+    return where if where is None or len(where) else None
+
+
+incompatibility_doc: Final = """
+where criteria is being ignored as this version [%s] is too old (or
+not-defined), read the file in and write it out to a new file to upgrade (with
+the copy_to method)
+"""
+
+attribute_conflict_doc: Final = """
+the [%s] attribute of the existing index is [%s] which conflicts with the new
+[%s], resetting the attribute to None
+"""
+
+performance_doc: Final = """
+your performance may suffer as PyTables will pickle object types that it cannot
+map directly to c-types [inferred_type->%s,key->%s] [items->%s]
+"""
+
+# formats
+_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}
+
+# axes map
+_AXES_MAP = {DataFrame: [0]}
+
+# register our configuration options
+dropna_doc: Final = """
+: boolean
+    drop ALL nan rows when appending to a table
+"""
+format_doc: Final = """
+: format
+    default format writing format, if None, then
+    put will default to 'fixed' and append will default to 'table'
+"""
+
+with config.config_prefix("io.hdf"):
+    config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
+    config.register_option(
+        "default_format",
+        None,
+        format_doc,
+        validator=config.is_one_of_factory(["fixed", "table", None]),
+    )
+
+# oh the troubles to reduce import time
+_table_mod: ModuleType | None = None
+_table_file_open_policy_is_strict = False
+
+
+def _tables():
+    global _table_mod
+    global _table_file_open_policy_is_strict
+    if _table_mod is None:
+        import tables
+
+        _table_mod = tables
+
+        # set the file open policy
+        # return the file open policy; this changes as of pytables 3.1
+        # depending on the HDF5 version
+        with suppress(AttributeError):
+            _table_file_open_policy_is_strict = (
+                tables.file._FILE_OPEN_POLICY == "strict"
+            )
+
+    return _table_mod
+
+
+# interface to/from ###
+
+
+def to_hdf(
+    path_or_buf: FilePath | HDFStore,
+    key: str,
+    value: DataFrame | Series,
+    mode: str = "a",
+    complevel: int | None = None,
+    complib: str | None = None,
+    append: bool = False,
+    format: str | None = None,
+    index: bool = True,
+    min_itemsize: int | dict[str, int] | None = None,
+    nan_rep=None,
+    dropna: bool | None = None,
+    data_columns: Literal[True] | list[str] | None = None,
+    errors: str = "strict",
+    encoding: str = "UTF-8",
+) -> None:
+    """store this object, close it if we opened it"""
+    if append:
+        f = lambda store: store.append(
+            key,
+            value,
+            format=format,
+            index=index,
+            min_itemsize=min_itemsize,
+            nan_rep=nan_rep,
+            dropna=dropna,
+            data_columns=data_columns,
+            errors=errors,
+            encoding=encoding,
+        )
+    else:
+        # NB: dropna is not passed to `put`
+        f = lambda store: store.put(
+            key,
+            value,
+            format=format,
+            index=index,
+            min_itemsize=min_itemsize,
+            nan_rep=nan_rep,
+            data_columns=data_columns,
+            errors=errors,
+            encoding=encoding,
+            dropna=dropna,
+        )
+
+    if isinstance(path_or_buf, HDFStore):
+        f(path_or_buf)
+    else:
+        path_or_buf = stringify_path(path_or_buf)
+        with HDFStore(
+            path_or_buf, mode=mode, complevel=complevel, complib=complib
+        ) as store:
+            f(store)
+
+
+@set_module("pandas")
+def read_hdf(
+    path_or_buf: FilePath | HDFStore,
+    key=None,
+    mode: str = "r",
+    errors: str = "strict",
+    where: str | list | None = None,
+    start: int | None = None,
+    stop: int | None = None,
+    columns: list[str] | None = None,
+    iterator: bool = False,
+    chunksize: int | None = None,
+    **kwargs,
+):
+    """
+    Read from the store, close it if we opened it.
+
+    Retrieve pandas object stored in file, optionally based on where
+    criteria.
+
+    .. warning::
+
+       Pandas uses PyTables for reading and writing HDF5 files, which allows
+       serializing object-dtype data with pickle when using the "fixed" format.
+       Loading pickled data received from untrusted sources can be unsafe.
+
+       See: https://docs.python.org/3/library/pickle.html for more.
+
+    Parameters
+    ----------
+    path_or_buf : str, path object, pandas.HDFStore
+        Any valid string path is acceptable. Only supports the local file system,
+        remote URLs and file-like objects are not supported.
+
+        If you want to pass in a path object, pandas accepts any
+        ``os.PathLike``.
+
+        Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
+
+    key : object, optional
+        The group identifier in the store. Can be omitted if the HDF file
+        contains a single pandas object.
+    mode : {'r', 'r+', 'a'}, default 'r'
+        Mode to use when opening the file. Ignored if path_or_buf is a
+        :class:`pandas.HDFStore`. Default is 'r'.
+    errors : str, default 'strict'
+        Specifies how encoding and decoding errors are to be handled.
+        See the errors argument for :func:`open` for a full list
+        of options.
+    where : list, optional
+        A list of Term (or convertible) objects.
+    start : int, optional
+        Row number to start selection.
+    stop : int, optional
+        Row number to stop selection.
+    columns : list, optional
+        A list of columns names to return.
+    iterator : bool, optional
+        Return an iterator object.
+    chunksize : int, optional
+        Number of rows to include in an iteration when using an iterator.
+    **kwargs
+        Additional keyword arguments passed to HDFStore.
+
+    Returns
+    -------
+    object
+        The selected object. Return type depends on the object stored.
+
+    See Also
+    --------
+    DataFrame.to_hdf : Write an HDF file from a DataFrame.
+    HDFStore : Low-level access to HDF files.
+
+    Notes
+    -----
+    When ``errors="surrogatepass"``, ``pd.options.future.infer_string`` is true,
+    and PyArrow is installed, if a UTF-16 surrogate is encountered when decoding
+    to UTF-8, the resulting dtype will be
+    ``pd.StringDtype(storage="python", na_value=np.nan)``.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame([[1, 1.0, "a"]], columns=["x", "y", "z"])  # doctest: +SKIP
+    >>> df.to_hdf("./store.h5", "data")  # doctest: +SKIP
+    >>> reread = pd.read_hdf("./store.h5")  # doctest: +SKIP
+    """
+    if mode not in ["r", "r+", "a"]:
+        raise ValueError(
+            f"mode {mode} is not allowed while performing a read. "
+            f"Allowed modes are r, r+ and a."
+        )
+    # grab the scope
+    if where is not None:
+        where = _ensure_term(where, scope_level=1)
+
+    if isinstance(path_or_buf, HDFStore):
+        if not path_or_buf.is_open:
+            raise OSError("The HDFStore must be open for reading.")
+
+        store = path_or_buf
+        auto_close = False
+    else:
+        path_or_buf = stringify_path(path_or_buf)
+        if not isinstance(path_or_buf, str):
+            raise NotImplementedError(
+                "Support for generic buffers has not been implemented."
+            )
+        try:
+            exists = os.path.exists(path_or_buf)
+
+        # if filepath is too long
+        except (TypeError, ValueError):
+            exists = False
+
+        if not exists:
+            raise FileNotFoundError(f"File {path_or_buf} does not exist")
+
+        store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
+        # can't auto open/close if we are using an iterator
+        # so delegate to the iterator
+        auto_close = True
+
+    try:
+        if key is None:
+            groups = store.groups()
+            if len(groups) == 0:
+                raise ValueError(
+                    "Dataset(s) incompatible with Pandas data types, "
+                    "not table, or no datasets found in HDF5 file."
+                )
+            candidate_only_group = groups[0]
+
+            # For the HDF file to have only one dataset, all other groups
+            # should then be metadata groups for that candidate group. (This
+            # assumes that the groups() method enumerates parent groups
+            # before their children.)
+            for group_to_check in groups[1:]:
+                if not _is_metadata_of(group_to_check, candidate_only_group):
+                    raise ValueError(
+                        "key must be provided when HDF5 "
+                        "file contains multiple datasets."
+                    )
+            key = candidate_only_group._v_pathname
+        return store.select(
+            key,
+            where=where,
+            start=start,
+            stop=stop,
+            columns=columns,
+            iterator=iterator,
+            chunksize=chunksize,
+            auto_close=auto_close,
+        )
+    except (ValueError, TypeError, LookupError):
+        if not isinstance(path_or_buf, HDFStore):
+            # if there is an error, close the store if we opened it.
+            with suppress(AttributeError):
+                store.close()
+
+        raise
+
+
+def _is_metadata_of(group: Node, parent_group: Node) -> bool:
+    """Check if a given group is a metadata group for a given parent_group."""
+    if group._v_depth <= parent_group._v_depth:
+        return False
+
+    current = group
+    while current._v_depth > 1:
+        parent = current._v_parent
+        if parent == parent_group and current._v_name == "meta":
+            return True
+        current = current._v_parent
+    return False
+
+
+@set_module("pandas")
+class HDFStore:
+    """
+    Dict-like IO interface for storing pandas objects in PyTables.
+
+    Either Fixed or Table format.
+
+    .. warning::
+
+       Pandas uses PyTables for reading and writing HDF5 files, which allows
+       serializing object-dtype data with pickle when using the "fixed" format.
+       Loading pickled data received from untrusted sources can be unsafe.
+
+       See: https://docs.python.org/3/library/pickle.html for more.
+
+    Parameters
+    ----------
+    path : str
+        File path to HDF5 file.
+    mode : {'a', 'w', 'r', 'r+'}, default 'a'
+
+        ``'r'``
+            Read-only; no data can be modified.
+        ``'w'``
+            Write; a new file is created (an existing file with the same
+            name would be deleted).
+        ``'a'``
+            Append; an existing file is opened for reading and writing,
+            and if the file does not exist it is created.
+        ``'r+'``
+            It is similar to ``'a'``, but the file must already exist.
+    complevel : int, 0-9, default None
+        Specifies a compression level for data.
+        A value of 0 or None disables compression.
+    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
+        Specifies the compression library to be used.
+        These additional compressors for Blosc are supported
+        (default if no compressor specified: 'blosc:blosclz'):
+        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
+         'blosc:zlib', 'blosc:zstd'}.
+        Specifying a compression library which is not available issues
+        a ValueError.
+    fletcher32 : bool, default False
+        If applying compression use the fletcher32 checksum.
+    **kwargs
+        These parameters will be passed to the PyTables open_file method.
+
+    Examples
+    --------
+    >>> bar = pd.DataFrame(np.random.randn(10, 4))
+    >>> store = pd.HDFStore("test.h5")
+    >>> store["foo"] = bar  # write to HDF5
+    >>> bar = store["foo"]  # retrieve
+    >>> store.close()
+
+    **Create or load HDF5 file in-memory**
+
+    When passing the `driver` option to the PyTables open_file method through
+    **kwargs, the HDF5 file is loaded or created in-memory and will only be
+    written when closed:
+
+    >>> bar = pd.DataFrame(np.random.randn(10, 4))
+    >>> store = pd.HDFStore("test.h5", driver="H5FD_CORE")
+    >>> store["foo"] = bar
+    >>> store.close()  # only now, data is written to disk
+    """
+
+    _handle: File | None
+    _mode: str
+
+    def __init__(
+        self,
+        path,
+        mode: str = "a",
+        complevel: int | None = None,
+        complib=None,
+        fletcher32: bool = False,
+        **kwargs,
+    ) -> None:
+        if "format" in kwargs:
+            raise ValueError("format is not a defined argument for HDFStore")
+
+        tables = import_optional_dependency("tables")
+
+        if complib is not None and complib not in tables.filters.all_complibs:
+            raise ValueError(
+                f"complib only supports {tables.filters.all_complibs} compression."
+            )
+
+        if complib is None and complevel is not None:
+            complib = tables.filters.default_complib
+
+        self._path = stringify_path(path)
+        if mode is None:
+            mode = "a"
+        self._mode = mode
+        self._handle = None
+        self._complevel = complevel if complevel else 0
+        self._complib = complib
+        self._fletcher32 = fletcher32
+        self._filters = None
+        self.open(mode=mode, **kwargs)
+
+    def __fspath__(self) -> str:
+        return self._path
+
+    @property
+    def root(self):
+        """return the root node"""
+        self._check_if_open()
+        assert self._handle is not None  # for mypy
+        return self._handle.root
+
+    @property
+    def filename(self) -> str:
+        return self._path
+
+    def __getitem__(self, key: str):
+        return self.get(key)
+
+    def __setitem__(self, key: str, value) -> None:
+        self.put(key, value)
+
+    def __delitem__(self, key: str) -> int | None:
+        return self.remove(key)
+
+    def __getattr__(self, name: str):
+        """allow attribute access to get stores"""
+        try:
+            return self.get(name)
+        except (KeyError, ClosedFileError):
+            pass
+        raise AttributeError(
+            f"'{type(self).__name__}' object has no attribute '{name}'"
+        )
+
+    def __contains__(self, key: str) -> bool:
+        """
+        check for existence of this key
+        can match the exact pathname or the pathnm w/o the leading '/'
+        """
+        node = self.get_node(key)
+        if node is not None:
+            name = node._v_pathname
+            if key in (name, name[1:]):
+                return True
+        return False
+
+    def __len__(self) -> int:
+        return len(self.groups())
+
+    def __repr__(self) -> str:
+        pstr = pprint_thing(self._path)
+        return f"{type(self)}\nFile path: {pstr}\n"
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> None:
+        self.close()
+
+    def keys(self, include: str = "pandas") -> list[str]:
+        """
+        Return a list of keys corresponding to objects stored in HDFStore.
+
+        Parameters
+        ----------
+
+        include : str, default 'pandas'
+                When kind equals 'pandas' return pandas objects.
+                When kind equals 'native' return native HDF5 Table objects.
+
+        Returns
+        -------
+        list
+            List of ABSOLUTE path-names (e.g. have the leading '/').
+
+        Raises
+        ------
+        raises ValueError if kind has an illegal value
+
+        See Also
+        --------
+        HDFStore.info : Prints detailed information on the store.
+        HDFStore.get_node : Returns the node with the key.
+        HDFStore.get_storer : Returns the storer object for a key.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df)  # doctest: +SKIP
+        >>> store.get("data")  # doctest: +SKIP
+        >>> print(store.keys())  # doctest: +SKIP
+        ['/data1', '/data2']
+        >>> store.close()  # doctest: +SKIP
+        """
+        if include == "pandas":
+            return [n._v_pathname for n in self.groups()]
+
+        elif include == "native":
+            assert self._handle is not None  # mypy
+            return [
+                n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
+            ]
+        raise ValueError(
+            f"`include` should be either 'pandas' or 'native' but is '{include}'"
+        )
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.keys())
+
+    def items(self) -> Iterator[tuple[str, list]]:
+        """
+        iterate on key->group
+        """
+        for g in self.groups():
+            yield g._v_pathname, g
+
+    def open(self, mode: str = "a", **kwargs) -> None:
+        """
+        Open the file in the specified mode
+
+        Parameters
+        ----------
+        mode : {'a', 'w', 'r', 'r+'}, default 'a'
+            See HDFStore docstring or tables.open_file for info about modes
+        **kwargs
+            These parameters will be passed to the PyTables open_file method.
+        """
+        tables = _tables()
+
+        if self._mode != mode:
+            # if we are changing a write mode to read, ok
+            if self._mode in ["a", "w"] and mode in ["r", "r+"]:
+                pass
+            elif mode in ["w"]:
+                # this would truncate, raise here
+                if self.is_open:
+                    raise PossibleDataLossError(
+                        f"Re-opening the file [{self._path}] with mode [{self._mode}] "
+                        "will delete the current file!"
+                    )
+
+            self._mode = mode
+
+        # close and reopen the handle
+        if self.is_open:
+            self.close()
+
+        if self._complevel and self._complevel > 0:
+            self._filters = _tables().Filters(
+                self._complevel, self._complib, fletcher32=self._fletcher32
+            )
+
+        if _table_file_open_policy_is_strict and self.is_open:
+            msg = (
+                "Cannot open HDF5 file, which is already opened, "
+                "even in read-only mode."
+            )
+            raise ValueError(msg)
+
+        self._handle = tables.open_file(self._path, self._mode, **kwargs)
+
+    def close(self) -> None:
+        """
+        Close the PyTables file handle
+        """
+        if self._handle is not None:
+            self._handle.close()
+        self._handle = None
+
+    @property
+    def is_open(self) -> bool:
+        """
+        return a boolean indicating whether the file is open
+        """
+        if self._handle is None:
+            return False
+        return bool(self._handle.isopen)
+
+    def flush(self, fsync: bool = False) -> None:
+        """
+        Force all buffered modifications to be written to disk.
+
+        Parameters
+        ----------
+        fsync : bool (default False)
+          call ``os.fsync()`` on the file handle to force writing to disk.
+
+        Notes
+        -----
+        Without ``fsync=True``, flushing may not guarantee that the OS writes
+        to disk. With fsync, the operation will block until the OS claims the
+        file has been written; however, other caching layers may still
+        interfere.
+        """
+        if self._handle is not None:
+            self._handle.flush()
+            if fsync:
+                with suppress(OSError):
+                    os.fsync(self._handle.fileno())
+
+    def get(self, key: str):
+        """
+        Retrieve pandas object stored in file.
+
+        Parameters
+        ----------
+        key : str
+            Object to retrieve from file. Raises KeyError if not found.
+
+        Returns
+        -------
+        object
+            Same type as object stored in file.
+
+        See Also
+        --------
+        HDFStore.get_node : Returns the node with the key.
+        HDFStore.get_storer : Returns the storer object for a key.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df)  # doctest: +SKIP
+        >>> store.get("data")  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+        """
+        with patch_pickle():
+            # GH#31167 Without this patch, pickle doesn't know how to unpickle
+            #  old DateOffset objects now that they are cdef classes.
+            group = self.get_node(key)
+            if group is None:
+                raise KeyError(f"No object named {key} in the file")
+            return self._read_group(group)
+
+    def select(
+        self,
+        key: str,
+        where=None,
+        start=None,
+        stop=None,
+        columns=None,
+        iterator: bool = False,
+        chunksize: int | None = None,
+        auto_close: bool = False,
+    ):
+        """
+        Retrieve pandas object stored in file, optionally based on where criteria.
+
+        .. warning::
+
+           Pandas uses PyTables for reading and writing HDF5 files, which allows
+           serializing object-dtype data with pickle when using the "fixed" format.
+           Loading pickled data received from untrusted sources can be unsafe.
+
+           See: https://docs.python.org/3/library/pickle.html for more.
+
+        Parameters
+        ----------
+        key : str
+            Object being retrieved from file.
+        where : list or None
+            List of Term (or convertible) objects, optional.
+        start : int or None
+            Row number to start selection.
+        stop : int, default None
+            Row number to stop selection.
+        columns : list or None
+            A list of columns that if not None, will limit the return columns.
+        iterator : bool or False
+            Returns an iterator.
+        chunksize : int or None
+            Number or rows to include in iteration, return an iterator.
+        auto_close : bool or False
+            Should automatically close the store when finished.
+
+        Returns
+        -------
+        object
+            Retrieved object from file.
+
+        See Also
+        --------
+        HDFStore.select_as_coordinates : Returns the selection as an index.
+        HDFStore.select_column : Returns a single column from the table.
+        HDFStore.select_as_multiple : Retrieves pandas objects from multiple tables.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df)  # doctest: +SKIP
+        >>> store.get("data")  # doctest: +SKIP
+        >>> print(store.keys())  # doctest: +SKIP
+        ['/data1', '/data2']
+        >>> store.select("/data1")  # doctest: +SKIP
+           A  B
+        0  1  2
+        1  3  4
+        >>> store.select("/data1", where="columns == A")  # doctest: +SKIP
+           A
+        0  1
+        1  3
+        >>> store.close()  # doctest: +SKIP
+        """
+        group = self.get_node(key)
+        if group is None:
+            raise KeyError(f"No object named {key} in the file")
+
+        # create the storer and axes
+        where = _ensure_term(where, scope_level=1)
+        s = self._create_storer(group)
+        s.infer_axes()
+
+        # function to call on iteration
+        def func(_start, _stop, _where):
+            return s.read(start=_start, stop=_stop, where=_where, columns=columns)
+
+        # create the iterator
+        it = TableIterator(
+            self,
+            s,
+            func,
+            where=where,
+            nrows=s.nrows,
+            start=start,
+            stop=stop,
+            iterator=iterator,
+            chunksize=chunksize,
+            auto_close=auto_close,
+        )
+
+        return it.get_result()
+
+    def select_as_coordinates(
+        self,
+        key: str,
+        where=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ):
+        """
+        return the selection as an Index
+
+        .. warning::
+
+           Pandas uses PyTables for reading and writing HDF5 files, which allows
+           serializing object-dtype data with pickle when using the "fixed" format.
+           Loading pickled data received from untrusted sources can be unsafe.
+
+           See: https://docs.python.org/3/library/pickle.html for more.
+
+
+        Parameters
+        ----------
+        key : str
+        where : list of Term (or convertible) objects, optional
+        start : integer (defaults to None), row number to start selection
+        stop  : integer (defaults to None), row number to stop selection
+        """
+        where = _ensure_term(where, scope_level=1)
+        tbl = self.get_storer(key)
+        if not isinstance(tbl, Table):
+            raise TypeError("can only read_coordinates with a table")
+        return tbl.read_coordinates(where=where, start=start, stop=stop)
+
+    def select_column(
+        self,
+        key: str,
+        column: str,
+        start: int | None = None,
+        stop: int | None = None,
+    ):
+        """
+        return a single column from the table. This is generally only useful to
+        select an indexable
+
+        .. warning::
+
+           Pandas uses PyTables for reading and writing HDF5 files, which allows
+           serializing object-dtype data with pickle when using the "fixed" format.
+           Loading pickled data received from untrusted sources can be unsafe.
+
+           See: https://docs.python.org/3/library/pickle.html for more.
+
+        Parameters
+        ----------
+        key : str
+        column : str
+            The column of interest.
+        start : int or None, default None
+        stop : int or None, default None
+
+        Raises
+        ------
+        raises KeyError if the column is not found (or key is not a valid
+            store)
+        raises ValueError if the column can not be extracted individually (it
+            is part of a data block)
+
+        """
+        tbl = self.get_storer(key)
+        if not isinstance(tbl, Table):
+            raise TypeError("can only read_column with a table")
+        return tbl.read_column(column=column, start=start, stop=stop)
+
+    def select_as_multiple(
+        self,
+        keys,
+        where=None,
+        selector=None,
+        columns=None,
+        start=None,
+        stop=None,
+        iterator: bool = False,
+        chunksize: int | None = None,
+        auto_close: bool = False,
+    ):
+        """
+        Retrieve pandas objects from multiple tables.
+
+        .. warning::
+
+           Pandas uses PyTables for reading and writing HDF5 files, which allows
+           serializing object-dtype data with pickle when using the "fixed" format.
+           Loading pickled data received from untrusted sources can be unsafe.
+
+           See: https://docs.python.org/3/library/pickle.html for more.
+
+        Parameters
+        ----------
+        keys : a list of the tables
+        selector : the table to apply the where criteria (defaults to keys[0]
+            if not supplied)
+        columns : the columns I want back
+        start : integer (defaults to None), row number to start selection
+        stop  : integer (defaults to None), row number to stop selection
+        iterator : bool, return an iterator, default False
+        chunksize : nrows to include in iteration, return an iterator
+        auto_close : bool, default False
+            Should automatically close the store when finished.
+
+        Raises
+        ------
+        raises KeyError if keys or selector is not found or keys is empty
+        raises TypeError if keys is not a list or tuple
+        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
+        """
+        # default to single select
+        where = _ensure_term(where, scope_level=1)
+        if isinstance(keys, (list, tuple)) and len(keys) == 1:
+            keys = keys[0]
+        if isinstance(keys, str):
+            return self.select(
+                key=keys,
+                where=where,
+                columns=columns,
+                start=start,
+                stop=stop,
+                iterator=iterator,
+                chunksize=chunksize,
+                auto_close=auto_close,
+            )
+
+        if not isinstance(keys, (list, tuple)):
+            raise TypeError("keys must be a list/tuple")
+
+        if not len(keys):
+            raise ValueError("keys must have a non-zero length")
+
+        if selector is None:
+            selector = keys[0]
+
+        # collect the tables
+        tbls = [self.get_storer(k) for k in keys]
+        s = self.get_storer(selector)
+
+        # validate rows
+        nrows = None
+        for t, k in itertools.chain([(s, selector)], zip(tbls, keys, strict=True)):
+            if t is None:
+                raise KeyError(f"Invalid table [{k}]")
+            if not t.is_table:
+                raise TypeError(
+                    f"object [{t.pathname}] is not a table, and cannot be used in all "
+                    "select as multiple"
+                )
+
+            if nrows is None:
+                nrows = t.nrows
+            elif t.nrows != nrows:
+                raise ValueError("all tables must have exactly the same nrows!")
+
+        # The isinstance checks here are redundant with the check above,
+        #  but necessary for mypy; see GH#29757
+        _tbls = [x for x in tbls if isinstance(x, Table)]
+
+        # axis is the concentration axes
+        axis = {t.non_index_axes[0][0] for t in _tbls}.pop()
+
+        def func(_start, _stop, _where):
+            # retrieve the objs, _where is always passed as a set of
+            # coordinates here
+            objs = [
+                t.read(where=_where, columns=columns, start=_start, stop=_stop)
+                for t in tbls
+            ]
+
+            # concat and return
+            return concat(objs, axis=axis, verify_integrity=False)._consolidate()
+
+        # create the iterator
+        it = TableIterator(
+            self,
+            s,
+            func,
+            where=where,
+            nrows=nrows,
+            start=start,
+            stop=stop,
+            iterator=iterator,
+            chunksize=chunksize,
+            auto_close=auto_close,
+        )
+
+        return it.get_result(coordinates=True)
+
+    def put(
+        self,
+        key: str,
+        value: DataFrame | Series,
+        format=None,
+        index: bool = True,
+        append: bool = False,
+        complib=None,
+        complevel: int | None = None,
+        min_itemsize: int | dict[str, int] | None = None,
+        nan_rep=None,
+        data_columns: Literal[True] | list[str] | None = None,
+        encoding=None,
+        errors: str = "strict",
+        track_times: bool = True,
+        dropna: bool = False,
+    ) -> None:
+        """
+        Store object in HDFStore.
+
+        This method writes a pandas DataFrame or Series into an HDF5 file using
+        either the fixed or table format. The `table` format allows additional
+        operations like incremental appends and queries but may have performance
+        trade-offs. The `fixed` format provides faster read/write operations but
+        does not support appends or queries.
+
+        Parameters
+        ----------
+        key : str
+            Key of object to store in file.
+        value : {Series, DataFrame}
+            Value of object to store in file.
+        format : 'fixed(f)|table(t)', default is 'fixed'
+            Format to use when storing object in HDFStore. Value can be one of:
+
+            ``'fixed'``
+                Fixed format.  Fast writing/reading. Not-appendable, nor searchable.
+            ``'table'``
+                Table format.  Write as a PyTables Table structure which may perform
+                worse but allow more flexible operations like searching / selecting
+                subsets of the data.
+        index : bool, default True
+            Write DataFrame index as a column.
+        append : bool, default False
+            This will force Table format, append the input data to the existing.
+        complib : default None
+            This parameter is currently not accepted.
+        complevel : int, 0-9, default None
+            Specifies a compression level for data.
+            A value of 0 or None disables compression.
+        min_itemsize : int, dict, or None
+            Dict of columns that specify minimum str sizes.
+        nan_rep : str
+            Str to use as str nan representation.
+        data_columns : list of columns or True, default None
+            List of columns to create as data columns, or True to use all columns.
+            See `here
+            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
+        encoding : str, default None
+            Provide an encoding for strings.
+        errors : str, default 'strict'
+            The error handling scheme to use for encoding errors.
+            The default is 'strict' meaning that encoding errors raise a
+            UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
+            'xmlcharrefreplace' as well as any other name registered with
+            codecs.register_error that can handle UnicodeEncodeErrors.
+        track_times : bool, default True
+            Parameter is propagated to 'create_table' method of 'PyTables'.
+            If set to False it enables to have the same h5 files (same hashes)
+            independent on creation time.
+        dropna : bool, default False, optional
+            Remove missing values.
+
+        See Also
+        --------
+        HDFStore.info : Prints detailed information on the store.
+        HDFStore.get_storer : Returns the storer object for a key.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df)  # doctest: +SKIP
+        """
+        if format is None:
+            format = get_option("io.hdf.default_format") or "fixed"
+        format = self._validate_format(format)
+        self._write_to_group(
+            key,
+            value,
+            format=format,
+            index=index,
+            append=append,
+            complib=complib,
+            complevel=complevel,
+            min_itemsize=min_itemsize,
+            nan_rep=nan_rep,
+            data_columns=data_columns,
+            encoding=encoding,
+            errors=errors,
+            track_times=track_times,
+            dropna=dropna,
+        )
+
+    def remove(self, key: str, where=None, start=None, stop=None) -> int | None:
+        """
+        Remove pandas object partially by specifying the where condition
+
+        Parameters
+        ----------
+        key : str
+            Node to remove or delete rows from
+        where : list of Term (or convertible) objects, optional
+        start : integer (defaults to None), row number to start selection
+        stop  : integer (defaults to None), row number to stop selection
+
+        Returns
+        -------
+        number of rows removed (or None if not a Table)
+
+        Raises
+        ------
+        raises KeyError if key is not a valid store
+
+        """
+        where = _ensure_term(where, scope_level=1)
+        try:
+            s = self.get_storer(key)
+        except KeyError:
+            # the key is not a valid store, re-raising KeyError
+            raise
+        except AssertionError:
+            # surface any assertion errors for e.g. debugging
+            raise
+        except Exception as err:
+            # In tests we get here with ClosedFileError, TypeError, and
+            #  _table_mod.NoSuchNodeError.  TODO: Catch only these?
+
+            if where is not None:
+                raise ValueError(
+                    "trying to remove a node with a non-None where clause!"
+                ) from err
+
+            # we are actually trying to remove a node (with children)
+            node = self.get_node(key)
+            if node is not None:
+                node._f_remove(recursive=True)
+                return None
+
+        # remove the node
+        if com.all_none(where, start, stop):
+            s.group._f_remove(recursive=True)
+            return None
+
+        # delete from the table
+        if not s.is_table:
+            raise ValueError("can only remove with where on objects written as tables")
+        return s.delete(where=where, start=start, stop=stop)
+
+    def append(
+        self,
+        key: str,
+        value: DataFrame | Series,
+        format=None,
+        axes=None,
+        index: bool | list[str] = True,
+        append: bool = True,
+        complib=None,
+        complevel: int | None = None,
+        columns=None,
+        min_itemsize: int | dict[str, int] | None = None,
+        nan_rep=None,
+        chunksize: int | None = None,
+        expectedrows=None,
+        dropna: bool | None = None,
+        data_columns: Literal[True] | list[str] | None = None,
+        encoding=None,
+        errors: str = "strict",
+    ) -> None:
+        """
+        Append to Table in file.
+
+        Node must already exist and be Table format.
+
+        Parameters
+        ----------
+        key : str
+            Key of object to append.
+        value : {Series, DataFrame}
+            Value of object to append.
+        format : 'table' is the default
+            Format to use when storing object in HDFStore.  Value can be one of:
+
+            ``'table'``
+                Table format. Write as a PyTables Table structure which may perform
+                worse but allow more flexible operations like searching / selecting
+                subsets of the data.
+        axes : default None
+            This parameter is currently not accepted.
+        index : bool, default True
+            Write DataFrame index as a column.
+        append : bool, default True
+            Append the input data to the existing.
+        complib : default None
+            This parameter is currently not accepted.
+        complevel : int, 0-9, default None
+            Specifies a compression level for data.
+            A value of 0 or None disables compression.
+        columns : default None
+            This parameter is currently not accepted, try data_columns.
+        min_itemsize : int, dict, or None
+            Dict of columns that specify minimum str sizes.
+        nan_rep : str
+            Str to use as str nan representation.
+        chunksize : int or None
+            Size to chunk the writing.
+        expectedrows : int
+            Expected TOTAL row size of this table.
+        dropna : bool, default False, optional
+            Do not write an ALL nan row to the store settable
+            by the option 'io.hdf.dropna_table'.
+        data_columns : list of columns, or True, default None
+            List of columns to create as indexed data columns for on-disk
+            queries, or True to use all columns. By default only the axes
+            of the object are indexed. See `here
+            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
+        encoding : default None
+            Provide an encoding for str.
+        errors : str, default 'strict'
+            The error handling scheme to use for encoding errors.
+            The default is 'strict' meaning that encoding errors raise a
+            UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
+            'xmlcharrefreplace' as well as any other name registered with
+            codecs.register_error that can handle UnicodeEncodeErrors.
+
+        See Also
+        --------
+        HDFStore.append_to_multiple : Append to multiple tables.
+
+        Notes
+        -----
+        Does *not* check if data being appended overlaps with existing
+        data in the table, so be careful
+
+        Examples
+        --------
+        >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df1, format="table")  # doctest: +SKIP
+        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"])
+        >>> store.append("data", df2)  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+           A  B
+        0  1  2
+        1  3  4
+        0  5  6
+        1  7  8
+        """
+        if columns is not None:
+            raise TypeError(
+                "columns is not a supported keyword in append, try data_columns"
+            )
+
+        if dropna is None:
+            dropna = get_option("io.hdf.dropna_table")
+        if format is None:
+            format = get_option("io.hdf.default_format") or "table"
+        format = self._validate_format(format)
+        self._write_to_group(
+            key,
+            value,
+            format=format,
+            axes=axes,
+            index=index,
+            append=append,
+            complib=complib,
+            complevel=complevel,
+            min_itemsize=min_itemsize,
+            nan_rep=nan_rep,
+            chunksize=chunksize,
+            expectedrows=expectedrows,
+            dropna=dropna,
+            data_columns=data_columns,
+            encoding=encoding,
+            errors=errors,
+        )
+
+    def append_to_multiple(
+        self,
+        d: dict,
+        value,
+        selector,
+        data_columns=None,
+        axes=None,
+        dropna: bool = False,
+        **kwargs,
+    ) -> None:
+        """
+        Append to multiple tables
+
+        Parameters
+        ----------
+        d : a dict of table_name to table_columns, None is acceptable as the
+            values of one node (this will get all the remaining columns)
+        value : a pandas object
+        selector : a string that designates the indexable table; all of its
+            columns will be designed as data_columns, unless data_columns is
+            passed, in which case these are used
+        data_columns : list of columns to create as data columns, or True to
+            use all columns
+        dropna : if evaluates to True, drop rows from all tables if any single
+                 row in each table has all NaN. Default False.
+
+        Notes
+        -----
+        axes parameter is currently not accepted
+
+        """
+        if axes is not None:
+            raise TypeError(
+                "axes is currently not accepted as a parameter to append_to_multiple; "
+                "you can create the tables independently instead"
+            )
+
+        if not isinstance(d, dict):
+            raise ValueError(
+                "append_to_multiple must have a dictionary specified as the "
+                "way to split the value"
+            )
+
+        if selector not in d:
+            raise ValueError(
+                "append_to_multiple requires a selector that is in passed dict"
+            )
+
+        # figure out the splitting axis (the non_index_axis)
+        axis = next(iter(set(range(value.ndim)) - set(_AXES_MAP[type(value)])))
+
+        # figure out how to split the value
+        remain_key = None
+        remain_values: list = []
+        for k, v in d.items():
+            if v is None:
+                if remain_key is not None:
+                    raise ValueError(
+                        "append_to_multiple can only have one value in d that is None"
+                    )
+                remain_key = k
+            else:
+                remain_values.extend(v)
+        if remain_key is not None:
+            ordered = value.axes[axis]
+            ordd = ordered.difference(Index(remain_values))
+            ordd = sorted(ordered.get_indexer(ordd))
+            d[remain_key] = ordered.take(ordd)
+
+        # data_columns
+        if data_columns is None:
+            data_columns = d[selector]
+
+        # ensure rows are synchronized across the tables
+        if dropna:
+            idxs = (value[cols].dropna(how="all").index for cols in d.values())
+            valid_index = next(idxs)
+            for index in idxs:
+                valid_index = valid_index.intersection(index)
+            value = value.loc[valid_index]
+
+        min_itemsize = kwargs.pop("min_itemsize", None)
+
+        # append
+        for k, v in d.items():
+            dc = data_columns if k == selector else None
+
+            # compute the val
+            val = value.reindex(v, axis=axis)
+
+            filtered = (
+                {key: value for (key, value) in min_itemsize.items() if key in v}
+                if min_itemsize is not None
+                else None
+            )
+            self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)
+
+    def create_table_index(
+        self,
+        key: str,
+        columns=None,
+        optlevel: int | None = None,
+        kind: str | None = None,
+    ) -> None:
+        """
+        Create a pytables index on the table.
+
+        Parameters
+        ----------
+        key : str
+        columns : None, bool, or listlike[str]
+            Indicate which columns to create an index on.
+
+            * False : Do not create any indexes.
+            * True : Create indexes on all columns.
+            * None : Create indexes on all columns.
+            * listlike : Create indexes on the given columns.
+
+        optlevel : int or None, default None
+            Optimization level, if None, pytables defaults to 6.
+        kind : str or None, default None
+            Kind of index, if None, pytables defaults to "medium".
+
+        Raises
+        ------
+        TypeError: raises if the node is not a table
+        """
+        # version requirements
+        _tables()
+        s = self.get_storer(key)
+        if s is None:
+            return
+
+        if not isinstance(s, Table):
+            raise TypeError("cannot create table index on a Fixed format store")
+        s.create_index(columns=columns, optlevel=optlevel, kind=kind)
+
+    def groups(self) -> list:
+        """
+        Return a list of all the top-level nodes.
+
+        Each node returned is not a pandas storage object.
+
+        Returns
+        -------
+        list
+            List of objects.
+
+        See Also
+        --------
+        HDFStore.get_node : Returns the node with the key.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df)  # doctest: +SKIP
+        >>> print(store.groups())  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+        [/data (Group) ''
+          children := ['axis0' (Array), 'axis1' (Array), 'block0_values' (Array),
+          'block0_items' (Array)]]
+        """
+        _tables()
+        self._check_if_open()
+        assert self._handle is not None  # for mypy
+        assert _table_mod is not None  # for mypy
+        return [
+            g
+            for g in self._handle.walk_groups()
+            if (
+                not isinstance(g, _table_mod.link.Link)
+                and (
+                    getattr(g._v_attrs, "pandas_type", None)
+                    or getattr(g, "table", None)
+                    or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
+                )
+            )
+        ]
+
+    def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
+        """
+        Walk the pytables group hierarchy for pandas objects.
+
+        This generator will yield the group path, subgroups and pandas object
+        names for each group.
+
+        Any non-pandas PyTables objects that are not a group will be ignored.
+
+        The `where` group itself is listed first (preorder), then each of its
+        child groups (following an alphanumerical order) is also traversed,
+        following the same procedure.
+
+        Parameters
+        ----------
+        where : str, default "/"
+            Group where to start walking.
+
+        Yields
+        ------
+        path : str
+            Full path to a group (without trailing '/').
+        groups : list
+            Names (strings) of the groups contained in `path`.
+        leaves : list
+            Names (strings) of the pandas objects contained in `path`.
+
+        See Also
+        --------
+        HDFStore.info : Prints detailed information on the store.
+
+        Examples
+        --------
+        >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data", df1, format="table")  # doctest: +SKIP
+        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["A", "B"])
+        >>> store.append("data", df2)  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+        >>> for group in store.walk():  # doctest: +SKIP
+        ...     print(group)  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+        """
+        _tables()
+        self._check_if_open()
+        assert self._handle is not None  # for mypy
+        assert _table_mod is not None  # for mypy
+
+        for g in self._handle.walk_groups(where):
+            if getattr(g._v_attrs, "pandas_type", None) is not None:
+                continue
+
+            groups = []
+            leaves = []
+            for child in g._v_children.values():
+                pandas_type = getattr(child._v_attrs, "pandas_type", None)
+                if pandas_type is None:
+                    if isinstance(child, _table_mod.group.Group):
+                        groups.append(child._v_name)
+                else:
+                    leaves.append(child._v_name)
+
+            yield (g._v_pathname.rstrip("/"), groups, leaves)
+
+    def get_node(self, key: str) -> Node | None:
+        """return the node with the key or None if it does not exist"""
+        self._check_if_open()
+        if not key.startswith("/"):
+            key = "/" + key
+
+        assert self._handle is not None
+        assert _table_mod is not None  # for mypy
+        try:
+            node = self._handle.get_node(self.root, key)
+        except _table_mod.exceptions.NoSuchNodeError:
+            return None
+
+        assert isinstance(node, _table_mod.Node), type(node)
+        return node
+
+    def get_storer(self, key: str) -> GenericFixed | Table:
+        """return the storer object for a key, raise if not in the file"""
+        group = self.get_node(key)
+        if group is None:
+            raise KeyError(f"No object named {key} in the file")
+
+        s = self._create_storer(group)
+        s.infer_axes()
+        return s
+
+    def copy(
+        self,
+        file,
+        mode: str = "w",
+        propindexes: bool = True,
+        keys=None,
+        complib=None,
+        complevel: int | None = None,
+        fletcher32: bool = False,
+        overwrite: bool = True,
+    ) -> HDFStore:
+        """
+        Copy the existing store to a new file, updating in place.
+
+        Parameters
+        ----------
+        propindexes : bool, default True
+            Restore indexes in copied file.
+        keys : list, optional
+            List of keys to include in the copy (defaults to all).
+        overwrite : bool, default True
+            Whether to overwrite (remove and replace) existing nodes in the new store.
+        mode, complib, complevel, fletcher32 same as in HDFStore.__init__
+
+        Returns
+        -------
+        open file handle of the new store
+        """
+        new_store = HDFStore(
+            file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
+        )
+        if keys is None:
+            keys = list(self.keys())
+        if not isinstance(keys, (tuple, list)):
+            keys = [keys]
+        for k in keys:
+            s = self.get_storer(k)
+            if s is not None:
+                if k in new_store:
+                    if overwrite:
+                        new_store.remove(k)
+
+                data = self.select(k)
+                if isinstance(s, Table):
+                    index: bool | list[str] = False
+                    if propindexes:
+                        index = [a.name for a in s.axes if a.is_indexed]
+                    new_store.append(
+                        k,
+                        data,
+                        index=index,
+                        data_columns=getattr(s, "data_columns", None),
+                        encoding=s.encoding,
+                    )
+                else:
+                    new_store.put(k, data, encoding=s.encoding)
+
+        return new_store
+
+    def info(self) -> str:
+        """
+        Print detailed information on the store.
+
+        Returns
+        -------
+        str
+            A String containing the python pandas class name, filepath to the HDF5
+            file and all the object keys along with their respective dataframe shapes.
+
+        See Also
+        --------
+        HDFStore.get_storer : Returns the storer object for a key.
+
+        Examples
+        --------
+        >>> df1 = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+        >>> df2 = pd.DataFrame([[5, 6], [7, 8]], columns=["C", "D"])
+        >>> store = pd.HDFStore("store.h5", "w")  # doctest: +SKIP
+        >>> store.put("data1", df1)  # doctest: +SKIP
+        >>> store.put("data2", df2)  # doctest: +SKIP
+        >>> print(store.info())  # doctest: +SKIP
+        >>> store.close()  # doctest: +SKIP
+        <class 'pandas.io.pytables.HDFStore'>
+        File path: store.h5
+        /data1            frame        (shape->[2,2])
+        /data2            frame        (shape->[2,2])
+        """
+        path = pprint_thing(self._path)
+        output = f"{type(self)}\nFile path: {path}\n"
+
+        if self.is_open:
+            lkeys = sorted(self.keys())
+            if lkeys:
+                keys = []
+                values = []
+
+                for k in lkeys:
+                    try:
+                        s = self.get_storer(k)
+                        if s is not None:
+                            keys.append(pprint_thing(s.pathname or k))
+                            values.append(pprint_thing(s or "invalid_HDFStore node"))
+                    except AssertionError:
+                        # surface any assertion errors for e.g. debugging
+                        raise
+                    except Exception as detail:
+                        keys.append(k)
+                        dstr = pprint_thing(detail)
+                        values.append(f"[invalid_HDFStore node: {dstr}]")
+
+                output += adjoin(12, keys, values)
+            else:
+                output += "Empty"
+        else:
+            output += "File is CLOSED"
+
+        return output
+
+    # ------------------------------------------------------------------------
+    # private methods
+
+    def _check_if_open(self) -> None:
+        if not self.is_open:
+            raise ClosedFileError(f"{self._path} file is not open!")
+
+    def _validate_format(self, format: str) -> str:
+        """validate / deprecate formats"""
+        # validate
+        try:
+            format = _FORMAT_MAP[format.lower()]
+        except KeyError as err:
+            raise TypeError(f"invalid HDFStore format specified [{format}]") from err
+
+        return format
+
+    def _create_storer(
+        self,
+        group,
+        format=None,
+        value: DataFrame | Series | None = None,
+        encoding: str = "UTF-8",
+        errors: str = "strict",
+    ) -> GenericFixed | Table:
+        """return a suitable class to operate"""
+        cls: type[GenericFixed | Table]
+
+        if value is not None and not isinstance(value, (Series, DataFrame)):
+            raise TypeError("value must be None, Series, or DataFrame")
+
+        pt = getattr(group._v_attrs, "pandas_type", None)
+        tt = getattr(group._v_attrs, "table_type", None)
+
+        # infer the pt from the passed value
+        if pt is None:
+            if value is None:
+                _tables()
+                assert _table_mod is not None  # for mypy
+                if getattr(group, "table", None) or isinstance(
+                    group, _table_mod.table.Table
+                ):
+                    pt = "frame_table"
+                    tt = "generic_table"
+                else:
+                    raise TypeError(
+                        "cannot create a storer if the object is not existing "
+                        "nor a value are passed"
+                    )
+            else:
+                if isinstance(value, Series):
+                    pt = "series"
+                else:
+                    pt = "frame"
+
+                # we are actually a table
+                if format == "table":
+                    pt += "_table"
+
+        # a storer node
+        if "table" not in pt:
+            _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
+            try:
+                cls = _STORER_MAP[pt]
+            except KeyError as err:
+                raise TypeError(
+                    f"cannot properly create the storer for: [_STORER_MAP] [group->"
+                    f"{group},value->{type(value)},format->{format}"
+                ) from err
+            return cls(self, group, encoding=encoding, errors=errors)
+
+        # existing node (and must be a table)
+        if tt is None:
+            # if we are a writer, determine the tt
+            if value is not None:
+                if pt == "series_table":
+                    index = getattr(value, "index", None)
+                    if index is not None:
+                        if index.nlevels == 1:
+                            tt = "appendable_series"
+                        elif index.nlevels > 1:
+                            tt = "appendable_multiseries"
+                elif pt == "frame_table":
+                    index = getattr(value, "index", None)
+                    if index is not None:
+                        if index.nlevels == 1:
+                            tt = "appendable_frame"
+                        elif index.nlevels > 1:
+                            tt = "appendable_multiframe"
+
+        _TABLE_MAP = {
+            "generic_table": GenericTable,
+            "appendable_series": AppendableSeriesTable,
+            "appendable_multiseries": AppendableMultiSeriesTable,
+            "appendable_frame": AppendableFrameTable,
+            "appendable_multiframe": AppendableMultiFrameTable,
+            "worm": WORMTable,
+        }
+        try:
+            cls = _TABLE_MAP[tt]  # type: ignore[index]
+        except KeyError as err:
+            raise TypeError(
+                f"cannot properly create the storer for: [_TABLE_MAP] [group->"
+                f"{group},value->{type(value)},format->{format}"
+            ) from err
+
+        return cls(self, group, encoding=encoding, errors=errors)
+
+    def _write_to_group(
+        self,
+        key: str,
+        value: DataFrame | Series,
+        format,
+        axes=None,
+        index: bool | list[str] = True,
+        append: bool = False,
+        complib=None,
+        complevel: int | None = None,
+        fletcher32=None,
+        min_itemsize: int | dict[str, int] | None = None,
+        chunksize: int | None = None,
+        expectedrows=None,
+        dropna: bool = False,
+        nan_rep=None,
+        data_columns=None,
+        encoding=None,
+        errors: str = "strict",
+        track_times: bool = True,
+    ) -> None:
+        # we don't want to store a table node at all if our object is 0-len
+        # as there are not dtypes
+        if getattr(value, "empty", None) and (format == "table" or append):
+            return
+
+        group = self._identify_group(key, append)
+
+        s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
+        if append:
+            # raise if we are trying to append to a Fixed format,
+            #       or a table that exists (and we are putting)
+            if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
+                raise ValueError("Can only append to Tables")
+            if not s.is_exists:
+                s.set_object_info()
+        else:
+            s.set_object_info()
+
+        if not s.is_table and complib:
+            raise ValueError("Compression not supported on Fixed format stores")
+
+        # write the object
+        s.write(
+            obj=value,
+            axes=axes,
+            append=append,
+            complib=complib,
+            complevel=complevel,
+            fletcher32=fletcher32,
+            min_itemsize=min_itemsize,
+            chunksize=chunksize,
+            expectedrows=expectedrows,
+            dropna=dropna,
+            nan_rep=nan_rep,
+            data_columns=data_columns,
+            track_times=track_times,
+        )
+
+        if isinstance(s, Table) and index:
+            s.create_index(columns=index)
+
+    def _read_group(self, group: Node):
+        s = self._create_storer(group)
+        s.infer_axes()
+        return s.read()
+
+    def _identify_group(self, key: str, append: bool) -> Node:
+        """Identify HDF5 group based on key, delete/create group if needed."""
+        group = self.get_node(key)
+
+        # we make this assertion for mypy; the get_node call will already
+        # have raised if this is incorrect
+        assert self._handle is not None
+
+        # remove the node if we are not appending
+        if group is not None and not append:
+            self._handle.remove_node(group, recursive=True)
+            group = None
+
+        if group is None:
+            group = self._create_nodes_and_group(key)
+
+        return group
+
+    def _create_nodes_and_group(self, key: str) -> Node:
+        """Create nodes from key and return group name."""
+        # assertion for mypy
+        assert self._handle is not None
+
+        paths = key.split("/")
+        # recursively create the groups
+        path = "/"
+        for p in paths:
+            if not len(p):
+                continue
+            new_path = path
+            if not path.endswith("/"):
+                new_path += "/"
+            new_path += p
+            group = self.get_node(new_path)
+            if group is None:
+                group = self._handle.create_group(path, p)
+            path = new_path
+        return group
+
+
+class TableIterator:
+    """
+    Define the iteration interface on a table
+
+    Parameters
+    ----------
+    store : HDFStore
+    s     : the referred storer
+    func  : the function to execute the query
+    where : the where of the query
+    nrows : the rows to iterate on
+    start : the passed start value (default is None)
+    stop  : the passed stop value (default is None)
+    iterator : bool, default False
+        Whether to use the default iterator.
+    chunksize : the passed chunking value (default is 100000)
+    auto_close : bool, default False
+        Whether to automatically close the store at the end of iteration.
+    """
+
+    chunksize: int | None
+    store: HDFStore
+    s: GenericFixed | Table
+
+    def __init__(
+        self,
+        store: HDFStore,
+        s: GenericFixed | Table,
+        func,
+        where,
+        nrows,
+        start=None,
+        stop=None,
+        iterator: bool = False,
+        chunksize: int | None = None,
+        auto_close: bool = False,
+    ) -> None:
+        self.store = store
+        self.s = s
+        self.func = func
+        self.where = where
+
+        # set start/stop if they are not set if we are a table
+        if self.s.is_table:
+            if nrows is None:
+                nrows = 0
+            if start is None:
+                start = 0
+            if stop is None:
+                stop = nrows
+            stop = min(nrows, stop)
+
+        self.nrows = nrows
+        self.start = start
+        self.stop = stop
+
+        self.coordinates = None
+        if iterator or chunksize is not None:
+            if chunksize is None:
+                chunksize = 100000
+            self.chunksize = int(chunksize)
+        else:
+            self.chunksize = None
+
+        self.auto_close = auto_close
+
+    def __iter__(self) -> Iterator:
+        # iterate
+        current = self.start
+        if self.coordinates is None:
+            raise ValueError("Cannot iterate until get_result is called.")
+        while current < self.stop:
+            stop = min(current + self.chunksize, self.stop)
+            value = self.func(None, None, self.coordinates[current:stop])
+            current = stop
+            if value is None or not len(value):
+                continue
+
+            yield value
+
+        self.close()
+
+    def close(self) -> None:
+        if self.auto_close:
+            self.store.close()
+
+    def get_result(self, coordinates: bool = False):
+        #  return the actual iterator
+        if self.chunksize is not None:
+            if not isinstance(self.s, Table):
+                raise TypeError("can only use an iterator or chunksize on a table")
+
+            self.coordinates = self.s.read_coordinates(where=self.where)
+
+            return self
+
+        # if specified read via coordinates (necessary for multiple selections
+        if coordinates:
+            if not isinstance(self.s, Table):
+                raise TypeError("can only read_coordinates on a table")
+            where = self.s.read_coordinates(
+                where=self.where, start=self.start, stop=self.stop
+            )
+        else:
+            where = self.where
+
+        # directly return the result
+        results = self.func(self.start, self.stop, where)
+        self.close()
+        return results
+
+
+class IndexCol:
+    """
+    an index column description class
+
+    Parameters
+    ----------
+    axis   : axis which I reference
+    values : the ndarray like converted values
+    kind   : a string description of this type
+    typ    : the pytables type
+    pos    : the position in the pytables
+
+    """
+
+    is_an_indexable: bool = True
+    is_data_indexable: bool = True
+    _info_fields = ["freq", "tz", "index_name"]
+
+    def __init__(
+        self,
+        name: str,
+        values=None,
+        kind=None,
+        typ=None,
+        cname: str | None = None,
+        axis=None,
+        pos=None,
+        freq=None,
+        tz=None,
+        index_name=None,
+        ordered=None,
+        table=None,
+        meta=None,
+        metadata=None,
+    ) -> None:
+        if not isinstance(name, str):
+            raise ValueError("`name` must be a str.")
+
+        self.values = values
+        self.kind = kind
+        self.typ = typ
+        self.name = name
+        self.cname = cname or name
+        self.axis = axis
+        self.pos = pos
+        self.freq = freq
+        self.tz = tz
+        self.index_name = index_name
+        self.ordered = ordered
+        self.table = table
+        self.meta = meta
+        self.metadata = metadata
+
+        if pos is not None:
+            self.set_pos(pos)
+
+        # These are ensured as long as the passed arguments match the
+        #  constructor annotations.
+        assert isinstance(self.name, str)
+        assert isinstance(self.cname, str)
+
+    @property
+    def itemsize(self) -> int:
+        # Assumes self.typ has already been initialized
+        return self.typ.itemsize
+
+    @property
+    def kind_attr(self) -> str:
+        return f"{self.name}_kind"
+
+    def set_pos(self, pos: int) -> None:
+        """set the position of this column in the Table"""
+        self.pos = pos
+        if pos is not None and self.typ is not None:
+            self.typ._v_pos = pos
+
+    def __repr__(self) -> str:
+        temp = tuple(
+            map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
+        )
+        return ",".join(
+            [
+                f"{key}->{value}"
+                for key, value in zip(
+                    ["name", "cname", "axis", "pos", "kind"], temp, strict=True
+                )
+            ]
+        )
+
+    def __eq__(self, other: object) -> bool:
+        """compare 2 col items"""
+        return all(
+            getattr(self, a, None) == getattr(other, a, None)
+            for a in ["name", "cname", "axis", "pos"]
+        )
+
+    def __ne__(self, other) -> bool:
+        return not self.__eq__(other)
+
+    @property
+    def is_indexed(self) -> bool:
+        """return whether I am an indexed column"""
+        if not hasattr(self.table, "cols"):
+            # e.g. if infer hasn't been called yet, self.table will be None.
+            return False
+        return getattr(self.table.cols, self.cname).is_indexed
+
+    def convert(
+        self, values: np.ndarray, nan_rep, encoding: str, errors: str
+    ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
+        """
+        Convert the data from this selection to the appropriate pandas type.
+        """
+        assert isinstance(values, np.ndarray), type(values)
+
+        # values is a recarray
+        if values.dtype.fields is not None:
+            # Copy, otherwise values will be a view
+            # preventing the original recarry from being free'ed
+            values = values[self.cname].copy()
+
+        val_kind = self.kind
+        values = _maybe_convert(values, val_kind, encoding, errors)
+        kwargs = {}
+        kwargs["name"] = self.index_name
+
+        if self.freq is not None:
+            kwargs["freq"] = self.freq
+
+        factory: type[Index | DatetimeIndex] = Index
+        if lib.is_np_dtype(values.dtype, "M") or isinstance(
+            values.dtype, DatetimeTZDtype
+        ):
+            factory = DatetimeIndex
+        elif values.dtype == "i8" and "freq" in kwargs:
+            # PeriodIndex data is stored as i8
+            # error: Incompatible types in assignment (expression has type
+            # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
+            # "Union[Type[Index], Type[DatetimeIndex]]")
+            factory = lambda x, **kwds: PeriodIndex.from_ordinals(  # type: ignore[assignment]
+                x, freq=kwds.get("freq", None)
+            )._rename(kwds["name"])
+
+        # making an Index instance could throw a number of different errors
+        try:
+            new_pd_index = factory(values, **kwargs)
+        except UnicodeEncodeError as err:
+            if (
+                errors == "surrogatepass"
+                and using_string_dtype()
+                and str(err).endswith("surrogates not allowed")
+                and HAS_PYARROW
+            ):
+                new_pd_index = factory(
+                    values,
+                    dtype=StringDtype(storage="python", na_value=np.nan),
+                    **kwargs,
+                )
+            else:
+                raise
+        except ValueError:
+            # if the output freq is different that what we recorded,
+            # it should be None (see also 'doc example part 2')
+            if "freq" in kwargs:
+                kwargs["freq"] = None
+            new_pd_index = factory(values, **kwargs)
+
+        final_pd_index: Index
+        if self.tz is not None and isinstance(new_pd_index, DatetimeIndex):
+            final_pd_index = new_pd_index.tz_localize("UTC").tz_convert(self.tz)
+        else:
+            final_pd_index = new_pd_index
+        return final_pd_index, final_pd_index
+
+    def take_data(self):
+        """return the values"""
+        return self.values
+
+    @property
+    def attrs(self):
+        return self.table._v_attrs
+
+    @property
+    def description(self):
+        return self.table.description
+
+    @property
+    def col(self):
+        """return my current col description"""
+        return getattr(self.description, self.cname, None)
+
+    @property
+    def cvalues(self):
+        """return my cython values"""
+        return self.values
+
+    def __iter__(self) -> Iterator:
+        return iter(self.values)
+
+    def maybe_set_size(self, min_itemsize=None) -> None:
+        """
+        maybe set a string col itemsize:
+            min_itemsize can be an integer or a dict with this columns name
+            with an integer size
+        """
+        if self.kind == "string":
+            if isinstance(min_itemsize, dict):
+                min_itemsize = min_itemsize.get(self.name)
+
+            if min_itemsize is not None and self.typ.itemsize < min_itemsize:
+                self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)
+
+    def validate_names(self) -> None:
+        pass
+
+    def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
+        self.table = handler.table
+        self.validate_col()
+        self.validate_attr(append)
+        self.validate_metadata(handler)
+        self.write_metadata(handler)
+        self.set_attr()
+
+    def validate_col(self, itemsize=None):
+        """validate this column: return the compared against itemsize"""
+        # validate this column for string truncation (or reset to the max size)
+        if self.kind == "string":
+            c = self.col
+            if c is not None:
+                if itemsize is None:
+                    itemsize = self.itemsize
+                if c.itemsize < itemsize:
+                    raise ValueError(
+                        f"Trying to store a string with len [{itemsize}] in "
+                        f"[{self.cname}] column but\nthis column has a limit of "
+                        f"[{c.itemsize}]!\nConsider using min_itemsize to "
+                        "preset the sizes on these columns"
+                    )
+                return c.itemsize
+
+        return None
+
+    def validate_attr(self, append: bool) -> None:
+        # check for backwards incompatibility
+        if append:
+            existing_kind = getattr(self.attrs, self.kind_attr, None)
+            if existing_kind is not None and existing_kind != self.kind:
+                raise TypeError(
+                    f"incompatible kind in col [{existing_kind} - {self.kind}]"
+                )
+
+    def update_info(self, info) -> None:
+        """
+        set/update the info for this indexable with the key/value
+        if there is a conflict raise/warn as needed
+        """
+        for key in self._info_fields:
+            value = getattr(self, key, None)
+            idx = info.setdefault(self.name, {})
+
+            existing_value = idx.get(key)
+            if key in idx and value is not None and existing_value != value:
+                # frequency/name just warn
+                if key in ["freq", "index_name"]:
+                    ws = attribute_conflict_doc % (key, existing_value, value)
+                    warnings.warn(
+                        ws, AttributeConflictWarning, stacklevel=find_stack_level()
+                    )
+
+                    # reset
+                    idx[key] = None
+                    setattr(self, key, None)
+
+                else:
+                    raise ValueError(
+                        f"invalid info for [{self.name}] for [{key}], "
+                        f"existing_value [{existing_value}] conflicts with "
+                        f"new value [{value}]"
+                    )
+            elif value is not None or existing_value is not None:
+                idx[key] = value
+
+    def set_info(self, info) -> None:
+        """set my state from the passed info"""
+        idx = info.get(self.name)
+        if idx is not None:
+            self.__dict__.update(idx)
+
+    def set_attr(self) -> None:
+        """set the kind for this column"""
+        setattr(self.attrs, self.kind_attr, self.kind)
+
+    def validate_metadata(self, handler: AppendableTable) -> None:
+        """validate that kind=category does not change the categories"""
+        if self.meta == "category":
+            new_metadata = self.metadata
+            cur_metadata = handler.read_metadata(self.cname)
+            if (
+                new_metadata is not None
+                and cur_metadata is not None
+                and not array_equivalent(
+                    new_metadata, cur_metadata, strict_nan=True, dtype_equal=True
+                )
+            ):
+                raise ValueError(
+                    "cannot append a categorical with "
+                    "different categories to the existing"
+                )
+
+    def write_metadata(self, handler: AppendableTable) -> None:
+        """set the meta data"""
+        if self.metadata is not None:
+            handler.write_metadata(self.cname, self.metadata)
+
+
+class GenericIndexCol(IndexCol):
+    """an index which is not represented in the data of the table"""
+
+    @property
+    def is_indexed(self) -> bool:
+        return False
+
+    def convert(
+        self, values: np.ndarray, nan_rep, encoding: str, errors: str
+    ) -> tuple[Index, Index]:
+        """
+        Convert the data from this selection to the appropriate pandas type.
+
+        Parameters
+        ----------
+        values : np.ndarray
+        nan_rep : str
+        encoding : str
+        errors : str
+        """
+        assert isinstance(values, np.ndarray), type(values)
+
+        index = RangeIndex(len(values))
+        return index, index
+
+    def set_attr(self) -> None:
+        pass
+
+
+class DataCol(IndexCol):
+    """
+    a data holding column, by definition this is not indexable
+
+    Parameters
+    ----------
+    data   : the actual data
+    cname  : the column name in the table to hold the data (typically
+                values)
+    meta   : a string description of the metadata
+    metadata : the actual metadata
+    """
+
+    is_an_indexable = False
+    is_data_indexable = False
+    _info_fields = ["tz", "ordered"]
+
+    def __init__(
+        self,
+        name: str,
+        values=None,
+        kind=None,
+        typ=None,
+        cname: str | None = None,
+        pos=None,
+        tz=None,
+        ordered=None,
+        table=None,
+        meta=None,
+        metadata=None,
+        dtype: DtypeArg | None = None,
+        data=None,
+    ) -> None:
+        super().__init__(
+            name=name,
+            values=values,
+            kind=kind,
+            typ=typ,
+            pos=pos,
+            cname=cname,
+            tz=tz,
+            ordered=ordered,
+            table=table,
+            meta=meta,
+            metadata=metadata,
+        )
+        self.dtype = dtype
+        self.data = data
+
+    @property
+    def dtype_attr(self) -> str:
+        return f"{self.name}_dtype"
+
+    @property
+    def meta_attr(self) -> str:
+        return f"{self.name}_meta"
+
+    def __repr__(self) -> str:
+        temp = tuple(
+            map(
+                pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
+            )
+        )
+        return ",".join(
+            [
+                f"{key}->{value}"
+                for key, value in zip(
+                    ["name", "cname", "dtype", "kind", "shape"], temp, strict=True
+                )
+            ]
+        )
+
+    def __eq__(self, other: object) -> bool:
+        """compare 2 col items"""
+        return all(
+            getattr(self, a, None) == getattr(other, a, None)
+            for a in ["name", "cname", "dtype", "pos"]
+        )
+
+    def set_data(self, data: ArrayLike) -> None:
+        assert data is not None
+        assert self.dtype is None
+
+        data, dtype_name = _get_data_and_dtype_name(data)
+
+        self.data = data
+        self.dtype = dtype_name
+        self.kind = _dtype_to_kind(dtype_name)
+
+    def take_data(self):
+        """return the data"""
+        return self.data
+
+    @classmethod
+    def _get_atom(cls, values: ArrayLike) -> Col:
+        """
+        Get an appropriately typed and shaped pytables.Col object for values.
+        """
+        dtype = values.dtype
+        # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
+        # attribute "itemsize"
+        itemsize = dtype.itemsize  # type: ignore[union-attr]
+
+        shape = values.shape
+        if values.ndim == 1:
+            # EA, use block shape pretending it is 2D
+            # TODO(EA2D): not necessary with 2D EAs
+            shape = (1, values.size)
+
+        if isinstance(values, Categorical):
+            codes = values.codes
+            atom = cls.get_atom_data(shape, kind=codes.dtype.name)
+        elif lib.is_np_dtype(dtype, "M") or isinstance(dtype, DatetimeTZDtype):
+            atom = cls.get_atom_datetime64(shape)
+        elif lib.is_np_dtype(dtype, "m"):
+            atom = cls.get_atom_timedelta64(shape)
+        elif is_complex_dtype(dtype):
+            atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
+        elif is_string_dtype(dtype):
+            atom = cls.get_atom_string(shape, itemsize)
+        else:
+            atom = cls.get_atom_data(shape, kind=dtype.name)
+
+        return atom
+
+    @classmethod
+    def get_atom_string(cls, shape, itemsize):
+        return _tables().StringCol(itemsize=itemsize, shape=shape[0])
+
+    @classmethod
+    def get_atom_coltype(cls, kind: str) -> type[Col]:
+        """return the PyTables column class for this column"""
+        if kind.startswith("uint"):
+            k4 = kind[4:]
+            col_name = f"UInt{k4}Col"
+        elif kind.startswith("period"):
+            # we store as integer
+            col_name = "Int64Col"
+        else:
+            kcap = kind.capitalize()
+            col_name = f"{kcap}Col"
+
+        return getattr(_tables(), col_name)
+
+    @classmethod
+    def get_atom_data(cls, shape, kind: str) -> Col:
+        return cls.get_atom_coltype(kind=kind)(shape=shape[0])
+
+    @classmethod
+    def get_atom_datetime64(cls, shape):
+        return _tables().Int64Col(shape=shape[0])
+
+    @classmethod
+    def get_atom_timedelta64(cls, shape):
+        return _tables().Int64Col(shape=shape[0])
+
+    @property
+    def shape(self):
+        return getattr(self.data, "shape", None)
+
+    @property
+    def cvalues(self):
+        """return my cython values"""
+        return self.data
+
+    def validate_attr(self, append) -> None:
+        """validate that we have the same order as the existing & same dtype"""
+        if append:
+            existing_fields = getattr(self.attrs, self.kind_attr, None)
+            if existing_fields is not None and existing_fields != list(self.values):
+                raise ValueError("appended items do not match existing items in table!")
+
+            existing_dtype = getattr(self.attrs, self.dtype_attr, None)
+            if existing_dtype is not None and existing_dtype != self.dtype:
+                raise ValueError(
+                    "appended items dtype do not match existing items dtype in table!"
+                )
+
+    def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
+        """
+        Convert the data from this selection to the appropriate pandas type.
+
+        Parameters
+        ----------
+        values : np.ndarray
+        nan_rep :
+        encoding : str
+        errors : str
+
+        Returns
+        -------
+        index : listlike to become an Index
+        data : ndarraylike to become a column
+        """
+        assert isinstance(values, np.ndarray), type(values)
+
+        # values is a recarray
+        if values.dtype.fields is not None:
+            values = values[self.cname]
+
+        assert self.typ is not None
+        if self.dtype is None:
+            # Note: in tests we never have timedelta64 or datetime64,
+            #  so the _get_data_and_dtype_name may be unnecessary
+            converted, dtype_name = _get_data_and_dtype_name(values)
+            kind = _dtype_to_kind(dtype_name)
+        else:
+            converted = values
+            dtype_name = self.dtype
+            kind = self.kind
+
+        assert isinstance(converted, np.ndarray)  # for mypy
+
+        # use the meta if needed
+        meta = self.meta
+        metadata = self.metadata
+        ordered = self.ordered
+        tz = self.tz
+
+        assert dtype_name is not None
+        # convert to the correct dtype
+        dtype = dtype_name
+
+        # reverse converts
+        if dtype.startswith("datetime64"):
+            # recreate with tz if indicated
+            if dtype == "datetime64":
+                dtype = "datetime64[ns]"
+            converted = _set_tz(converted, tz, dtype)
+
+        elif dtype.startswith("timedelta64"):
+            if dtype == "timedelta64":
+                # from before we started storing timedelta64 unit
+                converted = np.asarray(converted, dtype="m8[ns]")
+            else:
+                converted = np.asarray(converted, dtype=dtype)
+        elif dtype == "date":
+            try:
+                converted = np.asarray(
+                    [date.fromordinal(v) for v in converted], dtype=object
+                )
+            except ValueError:
+                converted = np.asarray(
+                    [date.fromtimestamp(v) for v in converted], dtype=object
+                )
+
+        elif meta == "category":
+            # we have a categorical
+            categories = metadata
+            codes = converted.ravel()
+
+            # if we have stored a NaN in the categories
+            # then strip it; in theory we could have BOTH
+            # -1s in the codes and nulls :<
+            if categories is None:
+                # Handle case of NaN-only categorical columns in which case
+                # the categories are an empty array; when this is stored,
+                # pytables cannot write a zero-len array, so on readback
+                # the categories would be None and `read_hdf()` would fail.
+                categories = Index([], dtype=np.float64)
+            else:
+                mask = isna(categories)
+                if mask.any():
+                    categories = categories[~mask]
+                    codes[codes != -1] -= mask.astype(int).cumsum()._values
+
+            converted = Categorical.from_codes(
+                codes, categories=categories, ordered=ordered, validate=False
+            )
+
+        else:
+            try:
+                converted = converted.astype(dtype, copy=False)
+            except TypeError:
+                converted = converted.astype("O", copy=False)
+
+        # convert nans / decode
+        if kind == "string":
+            converted = _unconvert_string_array(
+                converted, nan_rep=nan_rep, encoding=encoding, errors=errors
+            )
+
+        return self.values, converted
+
+    def set_attr(self) -> None:
+        """set the data for this column"""
+        setattr(self.attrs, self.kind_attr, self.values)
+        setattr(self.attrs, self.meta_attr, self.meta)
+        assert self.dtype is not None
+        setattr(self.attrs, self.dtype_attr, self.dtype)
+
+
+class DataIndexableCol(DataCol):
+    """represent a data column that can be indexed"""
+
+    is_data_indexable = True
+
+    def validate_names(self) -> None:
+        if not is_string_dtype(Index(self.values).dtype):
+            # TODO: should the message here be more specifically non-str?
+            raise ValueError("cannot have non-object label DataIndexableCol")
+
+    @classmethod
+    def get_atom_string(cls, shape, itemsize):
+        return _tables().StringCol(itemsize=itemsize)
+
+    @classmethod
+    def get_atom_data(cls, shape, kind: str) -> Col:
+        return cls.get_atom_coltype(kind=kind)()
+
+    @classmethod
+    def get_atom_datetime64(cls, shape):
+        return _tables().Int64Col()
+
+    @classmethod
+    def get_atom_timedelta64(cls, shape):
+        return _tables().Int64Col()
+
+
+class GenericDataIndexableCol(DataIndexableCol):
+    """represent a generic pytables data column"""
+
+
+class Fixed:
+    """
+    represent an object in my store
+    facilitate read/write of various types of objects
+    this is an abstract base class
+
+    Parameters
+    ----------
+    parent : HDFStore
+    group : Node
+        The group node where the table resides.
+    """
+
+    pandas_kind: str
+    format_type: str = "fixed"  # GH#30962 needed by dask
+    obj_type: type[DataFrame | Series]
+    ndim: int
+    parent: HDFStore
+    is_table: bool = False
+
+    def __init__(
+        self,
+        parent: HDFStore,
+        group: Node,
+        encoding: str | None = "UTF-8",
+        errors: str = "strict",
+    ) -> None:
+        assert isinstance(parent, HDFStore), type(parent)
+        assert _table_mod is not None  # needed for mypy
+        assert isinstance(group, _table_mod.Node), type(group)
+        self.parent = parent
+        self.group = group
+        self.encoding = _ensure_encoding(encoding)
+        self.errors = errors
+
+    @property
+    def is_old_version(self) -> bool:
+        return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1
+
+    @property
+    def version(self) -> tuple[int, int, int]:
+        """compute and set our version"""
+        version = getattr(self.group._v_attrs, "pandas_version", None)
+        if isinstance(version, str):
+            version_tup = tuple(int(x) for x in version.split("."))
+            if len(version_tup) == 2:
+                version_tup = (*version_tup, 0)
+            assert len(version_tup) == 3  # needed for mypy
+            return version_tup
+        else:
+            return (0, 0, 0)
+
+    @property
+    def pandas_type(self):
+        return getattr(self.group._v_attrs, "pandas_type", None)
+
+    def __repr__(self) -> str:
+        """return a pretty representation of myself"""
+        self.infer_axes()
+        s = self.shape
+        if s is not None:
+            if isinstance(s, (list, tuple)):
+                jshape = ",".join([pprint_thing(x) for x in s])
+                s = f"[{jshape}]"
+            return f"{self.pandas_type:12.12} (shape->{s})"
+        return self.pandas_type
+
+    def set_object_info(self) -> None:
+        """set my pandas type & version"""
+        self.attrs.pandas_type = str(self.pandas_kind)
+        self.attrs.pandas_version = str(_version)
+
+    def copy(self) -> Fixed:
+        new_self = copy.copy(self)
+        return new_self
+
+    @property
+    def shape(self):
+        return self.nrows
+
+    @property
+    def pathname(self):
+        return self.group._v_pathname
+
+    @property
+    def _handle(self):
+        return self.parent._handle
+
+    @property
+    def _filters(self):
+        return self.parent._filters
+
+    @property
+    def _complevel(self) -> int:
+        return self.parent._complevel
+
+    @property
+    def _fletcher32(self) -> bool:
+        return self.parent._fletcher32
+
+    @property
+    def attrs(self):
+        return self.group._v_attrs
+
+    def set_attrs(self) -> None:
+        """set our object attributes"""
+
+    def get_attrs(self) -> None:
+        """get our object attributes"""
+
+    @property
+    def storable(self):
+        """return my storable"""
+        return self.group
+
+    @property
+    def is_exists(self) -> bool:
+        return False
+
+    @property
+    def nrows(self):
+        return getattr(self.storable, "nrows", None)
+
+    def validate(self, other) -> Literal[True] | None:
+        """validate against an existing storable"""
+        if other is None:
+            return None
+        return True
+
+    def validate_version(self, where=None) -> None:
+        """are we trying to operate on an old version?"""
+
+    def infer_axes(self) -> bool:
+        """
+        infer the axes of my storer
+        return a boolean indicating if we have a valid storer or not
+        """
+        s = self.storable
+        if s is None:
+            return False
+        self.get_attrs()
+        return True
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ) -> Series | DataFrame:
+        raise NotImplementedError(
+            "cannot read on an abstract storer: subclasses should implement"
+        )
+
+    def write(self, obj, **kwargs) -> None:
+        raise NotImplementedError(
+            "cannot write on an abstract storer: subclasses should implement"
+        )
+
+    def delete(
+        self, where=None, start: int | None = None, stop: int | None = None
+    ) -> int | None:
+        """
+        support fully deleting the node in its entirety (only) - where
+        specification must be None
+        """
+        if com.all_none(where, start, stop):
+            self._handle.remove_node(self.group, recursive=True)
+            return None
+
+        raise TypeError("cannot delete on an abstract storer")
+
+
+class GenericFixed(Fixed):
+    """a generified fixed version"""
+
+    _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
+    _reverse_index_map = {v: k for k, v in _index_type_map.items()}
+    attributes: list[str] = []
+
+    # indexer helpers
+    def _class_to_alias(self, cls) -> str:
+        return self._index_type_map.get(cls, "")
+
+    def _alias_to_class(self, alias):
+        if isinstance(alias, type):  # pragma: no cover
+            # compat: for a short period of time master stored types
+            return alias
+        return self._reverse_index_map.get(alias, Index)
+
+    def _get_index_factory(self, attrs):
+        index_class = self._alias_to_class(getattr(attrs, "index_class", ""))
+
+        factory: Callable
+
+        kwargs = {}
+        if index_class == DatetimeIndex:
+
+            def f(values, freq=None, tz=None):
+                # data are already in UTC, localize and convert if tz present
+                dta = DatetimeArray._simple_new(
+                    values.values, dtype=values.dtype, freq=freq
+                )
+                result = DatetimeIndex._simple_new(dta, name=None)
+                if tz is not None:
+                    result = result.tz_localize("UTC").tz_convert(tz)
+                return result
+
+            factory = f
+        elif index_class == PeriodIndex:
+
+            def f(values, freq=None, tz=None):
+                dtype = PeriodDtype(freq)
+                parr = PeriodArray._simple_new(values, dtype=dtype)
+                return PeriodIndex._simple_new(parr, name=None)
+
+            factory = f
+        else:
+            factory = index_class
+            kwargs["copy"] = False
+
+        if "freq" in attrs:
+            kwargs["freq"] = attrs["freq"]
+            if index_class is Index:
+                # DTI/PI would be gotten by _alias_to_class
+                factory = TimedeltaIndex
+
+        if "tz" in attrs:
+            kwargs["tz"] = attrs["tz"]
+            assert index_class is DatetimeIndex  # just checking
+
+        return factory, kwargs
+
+    def validate_read(self, columns, where) -> None:
+        """
+        raise if any keywords are passed which are not-None
+        """
+        if columns is not None:
+            raise TypeError(
+                "cannot pass a column specification when reading "
+                "a Fixed format store. this store must be selected in its entirety"
+            )
+        if where is not None:
+            raise TypeError(
+                "cannot pass a where specification when reading "
+                "from a Fixed format store. this store must be selected in its entirety"
+            )
+
+    @property
+    def is_exists(self) -> bool:
+        return True
+
+    def set_attrs(self) -> None:
+        """set our object attributes"""
+        self.attrs.encoding = self.encoding
+        self.attrs.errors = self.errors
+
+    def get_attrs(self) -> None:
+        """retrieve our attributes"""
+        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
+        self.errors = getattr(self.attrs, "errors", "strict")
+        for n in self.attributes:
+            setattr(self, n, getattr(self.attrs, n, None))
+
+    def write(self, obj, **kwargs) -> None:
+        self.set_attrs()
+
+    def read_array(self, key: str, start: int | None = None, stop: int | None = None):
+        """read an array for the specified node (off of group"""
+        import tables
+
+        node = getattr(self.group, key)
+        attrs = node._v_attrs
+
+        transposed = getattr(attrs, "transposed", False)
+
+        if isinstance(node, tables.VLArray):
+            ret = node[0][start:stop]
+            dtype = getattr(attrs, "value_type", None)
+            if dtype is not None:
+                ret = pd_array(ret, dtype=dtype)
+        else:
+            dtype = getattr(attrs, "value_type", None)
+            shape = getattr(attrs, "shape", None)
+
+            if shape is not None:
+                # length 0 axis
+                ret = np.empty(shape, dtype=dtype)
+            else:
+                ret = node[start:stop]
+
+            if dtype and dtype.startswith("datetime64"):
+                # reconstruct a timezone if indicated
+                if dtype == "datetime64":
+                    dtype = "datetime64[ns]"
+                tz = getattr(attrs, "tz", None)
+                ret = _set_tz(ret, tz, dtype)
+
+            elif dtype and dtype.startswith("timedelta64"):
+                if dtype == "timedelta64":
+                    # This was written back before we started writing
+                    # timedelta64 units
+                    ret = np.asarray(ret, dtype="m8[ns]")
+                else:
+                    ret = np.asarray(ret, dtype=dtype)
+
+        if transposed:
+            return ret.T
+        else:
+            return ret
+
+    def read_index(
+        self, key: str, start: int | None = None, stop: int | None = None
+    ) -> Index:
+        variety = getattr(self.attrs, f"{key}_variety")
+
+        if variety == "multi":
+            return self.read_multi_index(key, start=start, stop=stop)
+        elif variety == "regular":
+            node = getattr(self.group, key)
+            index = self.read_index_node(node, start=start, stop=stop)
+            return index
+        else:  # pragma: no cover
+            raise TypeError(f"unrecognized index variety: {variety}")
+
+    def write_index(self, key: str, index: Index) -> None:
+        if isinstance(index, MultiIndex):
+            setattr(self.attrs, f"{key}_variety", "multi")
+            self.write_multi_index(key, index)
+        else:
+            setattr(self.attrs, f"{key}_variety", "regular")
+            converted = _convert_index("index", index, self.encoding, self.errors)
+
+            self.write_array(key, converted.values)
+
+            node = getattr(self.group, key)
+            node._v_attrs.kind = converted.kind
+            node._v_attrs.name = index.name
+
+            if isinstance(index, (DatetimeIndex, PeriodIndex)):
+                node._v_attrs.index_class = self._class_to_alias(type(index))
+
+            if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+                node._v_attrs.freq = index.freq
+
+            if isinstance(index, DatetimeIndex) and index.tz is not None:
+                node._v_attrs.tz = _get_tz(index.tz)
+
+    def write_multi_index(self, key: str, index: MultiIndex) -> None:
+        setattr(self.attrs, f"{key}_nlevels", index.nlevels)
+
+        for i, (lev, level_codes, name) in enumerate(
+            zip(index.levels, index.codes, index.names, strict=True)
+        ):
+            # write the level
+            if isinstance(lev.dtype, ExtensionDtype) and not isinstance(
+                lev.dtype, StringDtype
+            ):
+                raise NotImplementedError(
+                    "Saving a MultiIndex with an extension dtype is not supported."
+                )
+            level_key = f"{key}_level{i}"
+            conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
+            self.write_array(level_key, conv_level.values)
+            node = getattr(self.group, level_key)
+            node._v_attrs.kind = conv_level.kind
+            node._v_attrs.name = name
+
+            # write the name
+            setattr(node._v_attrs, f"{key}_name{name}", name)
+
+            # write the labels
+            label_key = f"{key}_label{i}"
+            self.write_array(label_key, level_codes)
+
+    def read_multi_index(
+        self, key: str, start: int | None = None, stop: int | None = None
+    ) -> MultiIndex:
+        nlevels = getattr(self.attrs, f"{key}_nlevels")
+
+        levels = []
+        codes = []
+        names: list[Hashable] = []
+        for i in range(nlevels):
+            level_key = f"{key}_level{i}"
+            node = getattr(self.group, level_key)
+            lev = self.read_index_node(node, start=start, stop=stop)
+            levels.append(lev)
+            names.append(lev.name)
+
+            label_key = f"{key}_label{i}"
+            level_codes = self.read_array(label_key, start=start, stop=stop)
+            codes.append(level_codes)
+
+        return MultiIndex(
+            levels=levels, codes=codes, names=names, verify_integrity=True
+        )
+
+    def read_index_node(
+        self, node: Node, start: int | None = None, stop: int | None = None
+    ) -> Index:
+        data = node[start:stop]
+        # If the index was an empty array write_array_empty() will
+        # have written a sentinel. Here we replace it with the original.
+        if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
+            data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
+        kind = node._v_attrs.kind
+        name = None
+
+        if "name" in node._v_attrs:
+            name = _ensure_str(node._v_attrs.name)
+
+        attrs = node._v_attrs
+        factory, kwargs = self._get_index_factory(attrs)
+
+        if kind in ("date", "object"):
+            index = factory(
+                _unconvert_index(
+                    data, kind, encoding=self.encoding, errors=self.errors
+                ),
+                dtype=object,
+                **kwargs,
+            )
+        else:
+            try:
+                index = factory(
+                    _unconvert_index(
+                        data, kind, encoding=self.encoding, errors=self.errors
+                    ),
+                    **kwargs,
+                )
+            except UnicodeEncodeError as err:
+                if (
+                    self.errors == "surrogatepass"
+                    and using_string_dtype()
+                    and str(err).endswith("surrogates not allowed")
+                    and HAS_PYARROW
+                ):
+                    index = factory(
+                        _unconvert_index(
+                            data, kind, encoding=self.encoding, errors=self.errors
+                        ),
+                        dtype=StringDtype(storage="python", na_value=np.nan),
+                        **kwargs,
+                    )
+                else:
+                    raise
+
+        index.name = name
+
+        return index
+
+    def write_array_empty(self, key: str, value: ArrayLike) -> None:
+        """write a 0-len array"""
+        # ugly hack for length 0 axes
+        arr = np.empty((1,) * value.ndim)
+        self._handle.create_array(self.group, key, arr)
+        node = getattr(self.group, key)
+        node._v_attrs.value_type = str(value.dtype)
+        node._v_attrs.shape = value.shape
+
+    def write_array(
+        self, key: str, obj: AnyArrayLike, items: Index | None = None
+    ) -> None:
+        # TODO: we only have a few tests that get here, the only EA
+        #  that gets passed is DatetimeArray, and we never have
+        #  both self._filters and EA
+
+        value = extract_array(obj, extract_numpy=True)
+
+        if key in self.group:
+            self._handle.remove_node(self.group, key)
+
+        # Transform needed to interface with pytables row/col notation
+        empty_array = value.size == 0
+        transposed = False
+
+        if isinstance(value.dtype, CategoricalDtype):
+            raise NotImplementedError(
+                "Cannot store a category dtype in an HDF5 dataset that uses format="
+                '"fixed". Use format="table".'
+            )
+        if not empty_array:
+            if hasattr(value, "T"):
+                # ExtensionArrays (1d) may not have transpose.
+                value = value.T
+                transposed = True
+
+        atom = None
+        if self._filters is not None:
+            with suppress(ValueError):
+                # get the atom for this datatype
+                atom = _tables().Atom.from_dtype(value.dtype)
+
+        if atom is not None:
+            # We only get here if self._filters is non-None and
+            #  the Atom.from_dtype call succeeded
+
+            # create an empty chunked array and fill it from value
+            if not empty_array:
+                ca = self._handle.create_carray(
+                    self.group, key, atom, value.shape, filters=self._filters
+                )
+                ca[:] = value
+
+            else:
+                self.write_array_empty(key, value)
+
+        elif value.dtype.type == np.object_:
+            # infer the type, warn if we have a non-string type here (for
+            # performance)
+            inferred_type = lib.infer_dtype(value, skipna=False)
+            if empty_array:
+                pass
+            elif inferred_type == "string":
+                pass
+            elif get_option("performance_warnings"):
+                ws = performance_doc % (inferred_type, key, items)
+                warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())
+
+            vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
+            vlarr.append(value)
+
+        elif lib.is_np_dtype(value.dtype, "M"):
+            self._handle.create_array(self.group, key, value.view("i8"))
+            getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
+        elif isinstance(value.dtype, DatetimeTZDtype):
+            # store as UTC
+            # with a zone
+
+            # error: "ExtensionArray" has no attribute "asi8"
+            self._handle.create_array(
+                self.group,
+                key,
+                value.asi8,  # type: ignore[attr-defined]
+            )
+
+            node = getattr(self.group, key)
+            # error: "ExtensionArray" has no attribute "tz"
+            node._v_attrs.tz = _get_tz(value.tz)  # type: ignore[attr-defined]
+            node._v_attrs.value_type = f"datetime64[{value.dtype.unit}]"
+        elif lib.is_np_dtype(value.dtype, "m"):
+            self._handle.create_array(self.group, key, value.view("i8"))
+            getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
+        elif isinstance(value, BaseStringArray):
+            vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
+            vlarr.append(value.to_numpy())
+            node = getattr(self.group, key)
+            node._v_attrs.value_type = str(value.dtype)
+        elif empty_array:
+            self.write_array_empty(key, value)
+        else:
+            self._handle.create_array(self.group, key, value)
+
+        getattr(self.group, key)._v_attrs.transposed = transposed
+
+
+class SeriesFixed(GenericFixed):
+    pandas_kind = "series"
+    attributes = ["name"]
+
+    name: Hashable
+
+    @property
+    def shape(self) -> tuple[int] | None:
+        try:
+            return (len(self.group.values),)
+        except (TypeError, AttributeError):
+            return None
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ) -> Series:
+        self.validate_read(columns, where)
+        index = self.read_index("index", start=start, stop=stop)
+        values = self.read_array("values", start=start, stop=stop)
+        try:
+            result = Series(values, index=index, name=self.name, copy=False)
+        except UnicodeEncodeError as err:
+            if (
+                self.errors == "surrogatepass"
+                and using_string_dtype()
+                and str(err).endswith("surrogates not allowed")
+                and HAS_PYARROW
+            ):
+                result = Series(
+                    values,
+                    index=index,
+                    name=self.name,
+                    copy=False,
+                    dtype=StringDtype(storage="python", na_value=np.nan),
+                )
+            else:
+                raise
+        return result
+
+    def write(self, obj, **kwargs) -> None:
+        super().write(obj, **kwargs)
+        self.write_index("index", obj.index)
+        self.write_array("values", obj)
+        self.attrs.name = obj.name
+
+
+class BlockManagerFixed(GenericFixed):
+    attributes = ["ndim", "nblocks"]
+
+    nblocks: int
+
+    @property
+    def shape(self) -> list[int] | None:
+        try:
+            ndim = self.ndim
+
+            # items
+            items = 0
+            for i in range(self.nblocks):
+                node = getattr(self.group, f"block{i}_items")
+                shape = getattr(node, "shape", None)
+                if shape is not None:
+                    items += shape[0]
+
+            # data shape
+            node = self.group.block0_values
+            shape = getattr(node, "shape", None)
+            if shape is not None:
+                shape = list(shape[0 : (ndim - 1)])
+            else:
+                shape = []
+
+            shape.append(items)
+
+            return shape
+        except AttributeError:
+            return None
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ) -> DataFrame:
+        # start, stop applied to rows, so 0th axis only
+        self.validate_read(columns, where)
+        select_axis = self.obj_type()._get_block_manager_axis(0)
+
+        axes = []
+        for i in range(self.ndim):
+            _start, _stop = (start, stop) if i == select_axis else (None, None)
+            ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
+            axes.append(ax)
+
+        items = axes[0]
+        dfs = []
+
+        for i in range(self.nblocks):
+            blk_items = self.read_index(f"block{i}_items")
+            values = self.read_array(f"block{i}_values", start=_start, stop=_stop)
+
+            columns = items[items.get_indexer(blk_items)]
+            df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
+            if (
+                using_string_dtype()
+                and isinstance(values, np.ndarray)
+                and is_string_array(values, skipna=True)
+            ):
+                df = df.astype(StringDtype(na_value=np.nan))
+            dfs.append(df)
+
+        if len(dfs) > 0:
+            out = concat(dfs, axis=1).copy()
+            return out.reindex(columns=items)
+
+        return DataFrame(columns=axes[0], index=axes[1])
+
+    def write(self, obj, **kwargs) -> None:
+        super().write(obj, **kwargs)
+
+        data = obj._mgr
+        if not data.is_consolidated():
+            data = data.consolidate()
+
+        self.attrs.ndim = data.ndim
+        for i, ax in enumerate(data.axes):
+            if i == 0 and (not ax.is_unique):
+                raise ValueError("Columns index has to be unique for fixed format")
+            self.write_index(f"axis{i}", ax)
+
+        # Supporting mixed-type DataFrame objects...nontrivial
+        self.attrs.nblocks = len(data.blocks)
+        for i, blk in enumerate(data.blocks):
+            # I have no idea why, but writing values before items fixed #2299
+            blk_items = data.items.take(blk.mgr_locs)
+            self.write_array(f"block{i}_values", blk.values, items=blk_items)
+            self.write_index(f"block{i}_items", blk_items)
+
+
+class FrameFixed(BlockManagerFixed):
+    pandas_kind = "frame"
+    obj_type = DataFrame
+
+
+class Table(Fixed):
+    """
+    represent a table:
+        facilitate read/write of various types of tables
+
+    Attrs in Table Node
+    -------------------
+    These are attributes that are store in the main table node, they are
+    necessary to recreate these tables when read back in.
+
+    index_axes    : a list of tuples of the (original indexing axis and
+        index column)
+    non_index_axes: a list of tuples of the (original index axis and
+        columns on a non-indexing axis)
+    values_axes   : a list of the columns which comprise the data of this
+        table
+    data_columns  : a list of the columns that we are allowing indexing
+        (these become single columns in values_axes)
+    nan_rep       : the string to use for nan representations for string
+        objects
+    levels        : the names of levels
+    metadata      : the names of the metadata columns
+    """
+
+    pandas_kind = "wide_table"
+    format_type: str = "table"  # GH#30962 needed by dask
+    table_type: str
+    levels: int | list[Hashable] = 1
+    is_table = True
+
+    metadata: list
+
+    def __init__(
+        self,
+        parent: HDFStore,
+        group: Node,
+        encoding: str | None = None,
+        errors: str = "strict",
+        index_axes: list[IndexCol] | None = None,
+        non_index_axes: list[tuple[AxisInt, Any]] | None = None,
+        values_axes: list[DataCol] | None = None,
+        data_columns: list | None = None,
+        info: dict | None = None,
+        nan_rep=None,
+    ) -> None:
+        super().__init__(parent, group, encoding=encoding, errors=errors)
+        self.index_axes = index_axes or []
+        self.non_index_axes = non_index_axes or []
+        self.values_axes = values_axes or []
+        self.data_columns = data_columns or []
+        self.info = info or {}
+        self.nan_rep = nan_rep
+
+    @property
+    def table_type_short(self) -> str:
+        return self.table_type.split("_")[0]
+
+    def __repr__(self) -> str:
+        """return a pretty representation of myself"""
+        self.infer_axes()
+        jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
+        dc = f",dc->[{jdc}]"
+
+        ver = ""
+        if self.is_old_version:
+            jver = ".".join([str(x) for x in self.version])
+            ver = f"[{jver}]"
+
+        jindex_axes = ",".join([a.name for a in self.index_axes])
+        return (
+            f"{self.pandas_type:12.12}{ver} "
+            f"(typ->{self.table_type_short},nrows->{self.nrows},"
+            f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
+        )
+
+    def __getitem__(self, c: str):
+        """return the axis for c"""
+        for a in self.axes:
+            if c == a.name:
+                return a
+        return None
+
+    def validate(self, other) -> None:
+        """validate against an existing table"""
+        if other is None:
+            return
+
+        if other.table_type != self.table_type:
+            raise TypeError(
+                "incompatible table_type with existing "
+                f"[{other.table_type} - {self.table_type}]"
+            )
+
+        for c in ["index_axes", "non_index_axes", "values_axes"]:
+            sv = getattr(self, c, None)
+            ov = getattr(other, c, None)
+            if sv != ov:
+                # show the error for the specific axes
+                # Argument 1 to "enumerate" has incompatible type
+                # "Optional[Any]"; expected "Iterable[Any]"  [arg-type]
+                for i, sax in enumerate(sv):  # type: ignore[arg-type]
+                    # Value of type "Optional[Any]" is not indexable  [index]
+                    oax = ov[i]  # type: ignore[index]
+                    if sax != oax:
+                        if c == "values_axes" and sax.kind != oax.kind:
+                            raise ValueError(
+                                f"Cannot serialize the column [{oax.values[0]}] "
+                                f"because its data contents are not [{sax.kind}] "
+                                f"but [{oax.kind}] object dtype"
+                            )
+                        raise ValueError(
+                            f"invalid combination of [{c}] on appending data "
+                            f"[{sax}] vs current table [{oax}]"
+                        )
+
+                # should never get here
+                raise Exception(
+                    f"invalid combination of [{c}] on appending data [{sv}] vs "
+                    f"current table [{ov}]"
+                )
+
+    @property
+    def is_multi_index(self) -> bool:
+        """the levels attribute is 1 or a list in the case of a multi-index"""
+        return isinstance(self.levels, list)
+
+    def validate_multiindex(
+        self, obj: DataFrame | Series
+    ) -> tuple[DataFrame, list[Hashable]]:
+        """
+        validate that we can store the multi-index; reset and return the
+        new object
+        """
+        levels = com.fill_missing_names(obj.index.names)
+        try:
+            reset_obj = obj.reset_index()
+        except ValueError as err:
+            raise ValueError(
+                "duplicate names/columns in the multi-index when storing as a table"
+            ) from err
+        assert isinstance(reset_obj, DataFrame)  # for mypy
+        return reset_obj, levels
+
+    @property
+    def nrows_expected(self) -> int:
+        """based on our axes, compute the expected nrows"""
+        return np.prod([i.cvalues.shape[0] for i in self.index_axes])
+
+    @property
+    def is_exists(self) -> bool:
+        """has this table been created"""
+        return "table" in self.group
+
+    @property
+    def storable(self):
+        return getattr(self.group, "table", None)
+
+    @property
+    def table(self):
+        """return the table group (this is my storable)"""
+        return self.storable
+
+    @property
+    def dtype(self):
+        return self.table.dtype
+
+    @property
+    def description(self):
+        return self.table.description
+
+    @property
+    def axes(self) -> itertools.chain[IndexCol]:
+        return itertools.chain(self.index_axes, self.values_axes)
+
+    @property
+    def ncols(self) -> int:
+        """the number of total columns in the values axes"""
+        return sum(len(a.values) for a in self.values_axes)
+
+    @property
+    def is_transposed(self) -> bool:
+        return False
+
+    @property
+    def data_orientation(self) -> tuple[int, ...]:
+        """return a tuple of my permutated axes, non_indexable at the front"""
+        return tuple(
+            itertools.chain(
+                [int(a[0]) for a in self.non_index_axes],
+                [int(a.axis) for a in self.index_axes],
+            )
+        )
+
+    def queryables(self) -> dict[str, Any]:
+        """return a dict of the kinds allowable columns for this object"""
+        # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
+        axis_names = {0: "index", 1: "columns"}
+
+        # compute the values_axes queryables
+        d1 = [(a.cname, a) for a in self.index_axes]
+        d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
+        d3 = [
+            (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
+        ]
+
+        return dict(d1 + d2 + d3)
+
+    def index_cols(self) -> list[tuple[Any, Any]]:
+        """return a list of my index cols"""
+        # Note: each `i.cname` below is assured to be a str.
+        return [(i.axis, i.cname) for i in self.index_axes]
+
+    def values_cols(self) -> list[str]:
+        """return a list of my values cols"""
+        return [i.cname for i in self.values_axes]
+
+    def _get_metadata_path(self, key: str) -> str:
+        """return the metadata pathname for this key"""
+        group = self.group._v_pathname
+        return f"{group}/meta/{key}/meta"
+
+    def write_metadata(self, key: str, values: np.ndarray) -> None:
+        """
+        Write out a metadata array to the key as a fixed-format Series.
+
+        Parameters
+        ----------
+        key : str
+        values : ndarray
+        """
+        self.parent.put(
+            self._get_metadata_path(key),
+            Series(values, copy=False),
+            format="table",
+            encoding=self.encoding,
+            errors=self.errors,
+            nan_rep=self.nan_rep,
+        )
+
+    def read_metadata(self, key: str):
+        """return the meta data array for this key"""
+        if getattr(getattr(self.group, "meta", None), key, None) is not None:
+            return self.parent.select(self._get_metadata_path(key))
+        return None
+
+    def set_attrs(self) -> None:
+        """set our table type & indexables"""
+        self.attrs.table_type = str(self.table_type)
+        self.attrs.index_cols = self.index_cols()
+        self.attrs.values_cols = self.values_cols()
+        self.attrs.non_index_axes = self.non_index_axes
+        self.attrs.data_columns = self.data_columns
+        self.attrs.nan_rep = self.nan_rep
+        self.attrs.encoding = self.encoding
+        self.attrs.errors = self.errors
+        self.attrs.levels = self.levels
+        self.attrs.info = self.info
+
+    def get_attrs(self) -> None:
+        """retrieve our attributes"""
+        self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
+        self.data_columns = getattr(self.attrs, "data_columns", None) or []
+        self.info = getattr(self.attrs, "info", None) or {}
+        self.nan_rep = getattr(self.attrs, "nan_rep", None)
+        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
+        self.errors = getattr(self.attrs, "errors", "strict")
+        self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
+        self.index_axes = [a for a in self.indexables if a.is_an_indexable]
+        self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
+
+    def validate_version(self, where=None) -> None:
+        """are we trying to operate on an old version?"""
+        if where is not None:
+            if self.is_old_version:
+                ws = incompatibility_doc % ".".join([str(x) for x in self.version])
+                warnings.warn(
+                    ws,
+                    IncompatibilityWarning,
+                    stacklevel=find_stack_level(),
+                )
+
+    def validate_min_itemsize(self, min_itemsize) -> None:
+        """
+        validate the min_itemsize doesn't contain items that are not in the
+        axes this needs data_columns to be defined
+        """
+        if min_itemsize is None:
+            return
+        if not isinstance(min_itemsize, dict):
+            return
+
+        q = self.queryables()
+        for k in min_itemsize:
+            # ok, apply generally
+            if k == "values":
+                continue
+            if k not in q:
+                raise ValueError(
+                    f"min_itemsize has the key [{k}] which is not an axis or "
+                    "data_column"
+                )
+
+    @cache_readonly
+    def indexables(self):
+        """create/cache the indexables if they don't exist"""
+        _indexables = []
+
+        desc = self.description
+        table_attrs = self.table.attrs
+
+        # Note: each of the `name` kwargs below are str, ensured
+        #  by the definition in index_cols.
+        # index columns
+        for i, (axis, name) in enumerate(self.attrs.index_cols):
+            atom = getattr(desc, name)
+            md = self.read_metadata(name)
+            meta = "category" if md is not None else None
+
+            kind_attr = f"{name}_kind"
+            kind = getattr(table_attrs, kind_attr, None)
+
+            index_col = IndexCol(
+                name=name,
+                axis=axis,
+                pos=i,
+                kind=kind,
+                typ=atom,
+                table=self.table,
+                meta=meta,
+                metadata=md,
+            )
+            _indexables.append(index_col)
+
+        # values columns
+        dc = set(self.data_columns)
+        base_pos = len(_indexables)
+
+        def f(i, c: str) -> DataCol:
+            assert isinstance(c, str)
+            klass = DataCol
+            if c in dc:
+                klass = DataIndexableCol
+
+            atom = getattr(desc, c)
+            adj_name = _maybe_adjust_name(c, self.version)
+
+            # TODO: why kind_attr here?
+            values = getattr(table_attrs, f"{adj_name}_kind", None)
+            dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
+            # Argument 1 to "_dtype_to_kind" has incompatible type
+            # "Optional[Any]"; expected "str"  [arg-type]
+            kind = _dtype_to_kind(dtype)  # type: ignore[arg-type]
+
+            md = self.read_metadata(c)
+            # TODO: figure out why these two versions of `meta` dont always match.
+            #  meta = "category" if md is not None else None
+            meta = getattr(table_attrs, f"{adj_name}_meta", None)
+
+            obj = klass(
+                name=adj_name,
+                cname=c,
+                values=values,
+                kind=kind,
+                pos=base_pos + i,
+                typ=atom,
+                table=self.table,
+                meta=meta,
+                metadata=md,
+                dtype=dtype,
+            )
+            return obj
+
+        # Note: the definition of `values_cols` ensures that each
+        #  `c` below is a str.
+        _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])
+
+        return _indexables
+
+    def create_index(
+        self, columns=None, optlevel=None, kind: str | None = None
+    ) -> None:
+        """
+        Create a pytables index on the specified columns.
+
+        Parameters
+        ----------
+        columns : None, bool, or listlike[str]
+            Indicate which columns to create an index on.
+
+            * False : Do not create any indexes.
+            * True : Create indexes on all columns.
+            * None : Create indexes on all columns.
+            * listlike : Create indexes on the given columns.
+
+        optlevel : int or None, default None
+            Optimization level, if None, pytables defaults to 6.
+        kind : str or None, default None
+            Kind of index, if None, pytables defaults to "medium".
+
+        Raises
+        ------
+        TypeError if trying to create an index on a complex-type column.
+
+        Notes
+        -----
+        Cannot index Time64Col or ComplexCol.
+        Pytables must be >= 3.0.
+        """
+        if not self.infer_axes():
+            return
+        if columns is False:
+            return
+
+        # index all indexables and data_columns
+        if columns is None or columns is True:
+            columns = [a.cname for a in self.axes if a.is_data_indexable]
+        if not isinstance(columns, (tuple, list)):
+            columns = [columns]
+
+        kw = {}
+        if optlevel is not None:
+            kw["optlevel"] = optlevel
+        if kind is not None:
+            kw["kind"] = kind
+
+        table = self.table
+        for c in columns:
+            v = getattr(table.cols, c, None)
+            if v is not None:
+                # remove the index if the kind/optlevel have changed
+                if v.is_indexed:
+                    index = v.index
+                    cur_optlevel = index.optlevel
+                    cur_kind = index.kind
+
+                    if kind is not None and cur_kind != kind:
+                        v.remove_index()
+                    else:
+                        kw["kind"] = cur_kind
+
+                    if optlevel is not None and cur_optlevel != optlevel:
+                        v.remove_index()
+                    else:
+                        kw["optlevel"] = cur_optlevel
+
+                # create the index
+                if not v.is_indexed:
+                    if v.type.startswith("complex"):
+                        raise TypeError(
+                            "Columns containing complex values can be stored but "
+                            "cannot be indexed when using table format. Either use "
+                            "fixed format, set index=False, or do not include "
+                            "the columns containing complex values to "
+                            "data_columns when initializing the table."
+                        )
+                    v.create_index(**kw)
+            elif c in self.non_index_axes[0][1]:
+                # GH 28156
+                raise AttributeError(
+                    f"column {c} is not a data_column.\n"
+                    f"In order to read column {c} you must reload the dataframe \n"
+                    f"into HDFStore and include {c} with the data_columns argument."
+                )
+
+    def _read_axes(
+        self, where, start: int | None = None, stop: int | None = None
+    ) -> list[tuple[np.ndarray, np.ndarray] | tuple[Index, Index]]:
+        """
+        Create the axes sniffed from the table.
+
+        Parameters
+        ----------
+        where : ???
+        start : int or None, default None
+        stop : int or None, default None
+
+        Returns
+        -------
+        List[Tuple[index_values, column_values]]
+        """
+        # create the selection
+        selection = Selection(self, where=where, start=start, stop=stop)
+        values = selection.select()
+
+        results = []
+        # convert the data
+        for a in self.axes:
+            a.set_info(self.info)
+            res = a.convert(
+                values,
+                nan_rep=self.nan_rep,
+                encoding=self.encoding,
+                errors=self.errors,
+            )
+            results.append(res)
+
+        return results
+
+    @classmethod
+    def get_object(cls, obj, transposed: bool):
+        """return the data for this obj"""
+        return obj
+
+    def validate_data_columns(self, data_columns, min_itemsize, non_index_axes) -> list:
+        """
+        take the input data_columns and min_itemize and create a data
+        columns spec
+        """
+        if not len(non_index_axes):
+            return []
+
+        axis, axis_labels = non_index_axes[0]
+        info = self.info.get(axis, {})
+        if info.get("type") == "MultiIndex" and data_columns:
+            raise ValueError(
+                f"cannot use a multi-index on axis [{axis}] with "
+                f"data_columns {data_columns}"
+            )
+
+        # evaluate the passed data_columns, True == use all columns
+        # take only valid axis labels
+        if data_columns is True:
+            data_columns = list(axis_labels)
+        elif data_columns is None:
+            data_columns = []
+
+        # if min_itemsize is a dict, add the keys (exclude 'values')
+        if isinstance(min_itemsize, dict):
+            existing_data_columns = set(data_columns)
+            data_columns = list(data_columns)  # ensure we do not modify
+            data_columns.extend(
+                [
+                    k
+                    for k in min_itemsize.keys()
+                    if k != "values" and k not in existing_data_columns
+                ]
+            )
+
+        # return valid columns in the order of our axis
+        return [c for c in data_columns if c in axis_labels]
+
+    def _create_axes(
+        self,
+        axes,
+        obj: DataFrame,
+        validate: bool = True,
+        nan_rep=None,
+        data_columns=None,
+        min_itemsize=None,
+    ):
+        """
+        Create and return the axes.
+
+        Parameters
+        ----------
+        axes: list or None
+            The names or numbers of the axes to create.
+        obj : DataFrame
+            The object to create axes on.
+        validate: bool, default True
+            Whether to validate the obj against an existing object already written.
+        nan_rep :
+            A value to use for string column nan_rep.
+        data_columns : List[str], True, or None, default None
+            Specify the columns that we want to create to allow indexing on.
+
+            * True : Use all available columns.
+            * None : Use no columns.
+            * List[str] : Use the specified columns.
+
+        min_itemsize: Dict[str, int] or None, default None
+            The min itemsize for a column in bytes.
+        """
+        if not isinstance(obj, DataFrame):
+            group = self.group._v_name
+            raise TypeError(
+                f"cannot properly create the storer for: [group->{group},"
+                f"value->{type(obj)}]"
+            )
+
+        # set the default axes if needed
+        if axes is None:
+            axes = [0]
+
+        # map axes to numbers
+        axes = [obj._get_axis_number(a) for a in axes]
+
+        # do we have an existing table (if so, use its axes & data_columns)
+        if self.infer_axes():
+            table_exists = True
+            axes = [a.axis for a in self.index_axes]
+            data_columns = list(self.data_columns)
+            nan_rep = self.nan_rep
+            # TODO: do we always have validate=True here?
+        else:
+            table_exists = False
+
+        new_info = self.info
+
+        assert self.ndim == 2  # with next check, we must have len(axes) == 1
+        # currently support on ndim-1 axes
+        if len(axes) != self.ndim - 1:
+            raise ValueError(
+                "currently only support ndim-1 indexers in an AppendableTable"
+            )
+
+        # create according to the new data
+        new_non_index_axes: list = []
+
+        # nan_representation
+        if nan_rep is None:
+            nan_rep = "nan"
+
+        # We construct the non-index-axis first, since that alters new_info
+        idx = next(x for x in [0, 1] if x not in axes)
+
+        a = obj.axes[idx]
+        # we might be able to change the axes on the appending data if necessary
+        append_axis = list(a)
+        if table_exists:
+            indexer = len(new_non_index_axes)  # i.e. 0
+            exist_axis = self.non_index_axes[indexer][1]
+            if not array_equivalent(
+                np.array(append_axis),
+                np.array(exist_axis),
+                strict_nan=True,
+                dtype_equal=True,
+            ):
+                # ahah! -> reindex
+                if array_equivalent(
+                    np.array(sorted(append_axis)),
+                    np.array(sorted(exist_axis)),
+                    strict_nan=True,
+                    dtype_equal=True,
+                ):
+                    append_axis = exist_axis
+
+        # the non_index_axes info
+        info = new_info.setdefault(idx, {})
+        info["names"] = list(a.names)
+        info["type"] = type(a).__name__
+
+        new_non_index_axes.append((idx, append_axis))
+
+        # Now we can construct our new index axis
+        idx = axes[0]
+        a = obj.axes[idx]
+        axis_name = obj._get_axis_name(idx)
+        new_index = _convert_index(axis_name, a, self.encoding, self.errors)
+        new_index.axis = idx
+
+        # Because we are always 2D, there is only one new_index, so
+        #  we know it will have pos=0
+        new_index.set_pos(0)
+        new_index.update_info(new_info)
+        new_index.maybe_set_size(min_itemsize)  # check for column conflicts
+
+        new_index_axes = [new_index]
+        j = len(new_index_axes)  # i.e. 1
+        assert j == 1
+
+        # reindex by our non_index_axes & compute data_columns
+        assert len(new_non_index_axes) == 1
+        for a in new_non_index_axes:
+            obj = _reindex_axis(obj, a[0], a[1])
+
+        transposed = new_index.axis == 1
+
+        # figure out data_columns and get out blocks
+        data_columns = self.validate_data_columns(
+            data_columns, min_itemsize, new_non_index_axes
+        )
+
+        frame = self.get_object(obj, transposed)._consolidate()
+
+        blocks, blk_items = self._get_blocks_and_items(
+            frame, table_exists, new_non_index_axes, self.values_axes, data_columns
+        )
+
+        # add my values
+        vaxes = []
+        for i, (blk, b_items) in enumerate(zip(blocks, blk_items, strict=True)):
+            # shape of the data column are the indexable axes
+            klass = DataCol
+            name = None
+
+            # we have a data_column
+            if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
+                klass = DataIndexableCol
+                name = b_items[0]
+                if not (name is None or isinstance(name, str)):
+                    # TODO: should the message here be more specifically non-str?
+                    raise ValueError("cannot have non-object label DataIndexableCol")
+
+            # make sure that we match up the existing columns
+            # if we have an existing table
+            existing_col: DataCol | None
+
+            if table_exists and validate:
+                try:
+                    existing_col = self.values_axes[i]
+                except (IndexError, KeyError) as err:
+                    raise ValueError(
+                        f"Incompatible appended table [{blocks}]"
+                        f"with existing table [{self.values_axes}]"
+                    ) from err
+            else:
+                existing_col = None
+
+            new_name = name or f"values_block_{i}"
+            data_converted = _maybe_convert_for_string_atom(
+                new_name,
+                blk.values,
+                existing_col=existing_col,
+                min_itemsize=min_itemsize,
+                nan_rep=nan_rep,
+                encoding=self.encoding,
+                errors=self.errors,
+                columns=b_items,
+            )
+            adj_name = _maybe_adjust_name(new_name, self.version)
+
+            typ = klass._get_atom(data_converted)
+            kind = _dtype_to_kind(data_converted.dtype.name)
+            tz = None
+            if getattr(data_converted, "tz", None) is not None:
+                tz = _get_tz(data_converted.tz)
+
+            meta = metadata = ordered = None
+            if isinstance(data_converted.dtype, CategoricalDtype):
+                ordered = data_converted.ordered
+                meta = "category"
+                metadata = np.asarray(data_converted.categories).ravel()
+            elif isinstance(blk.dtype, StringDtype):
+                meta = str(blk.dtype)
+
+            data, dtype_name = _get_data_and_dtype_name(data_converted)
+
+            col = klass(
+                name=adj_name,
+                cname=new_name,
+                values=list(b_items),
+                typ=typ,
+                pos=j,
+                kind=kind,
+                tz=tz,
+                ordered=ordered,
+                meta=meta,
+                metadata=metadata,
+                dtype=dtype_name,
+                data=data,
+            )
+            col.update_info(new_info)
+
+            vaxes.append(col)
+
+            j += 1
+
+        dcs = [col.name for col in vaxes if col.is_data_indexable]
+
+        new_table = type(self)(
+            parent=self.parent,
+            group=self.group,
+            encoding=self.encoding,
+            errors=self.errors,
+            index_axes=new_index_axes,
+            non_index_axes=new_non_index_axes,
+            values_axes=vaxes,
+            data_columns=dcs,
+            info=new_info,
+            nan_rep=nan_rep,
+        )
+        if hasattr(self, "levels"):
+            # TODO: get this into constructor, only for appropriate subclass
+            new_table.levels = self.levels
+
+        new_table.validate_min_itemsize(min_itemsize)
+
+        if validate and table_exists:
+            new_table.validate(self)
+
+        return new_table
+
+    @staticmethod
+    def _get_blocks_and_items(
+        frame: DataFrame,
+        table_exists: bool,
+        new_non_index_axes,
+        values_axes,
+        data_columns,
+    ):
+        # Helper to clarify non-state-altering parts of _create_axes
+        def get_blk_items(mgr):
+            return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]
+
+        mgr = frame._mgr
+        blocks: list[Block] = list(mgr.blocks)
+        blk_items: list[Index] = get_blk_items(mgr)
+
+        if len(data_columns):
+            # TODO: prove that we only get here with axis == 1?
+            #  It is the case in all extant tests, but NOT the case
+            #  outside this `if len(data_columns)` check.
+
+            axis, axis_labels = new_non_index_axes[0]
+            new_labels = Index(axis_labels).difference(Index(data_columns))
+            mgr = frame.reindex(new_labels, axis=axis)._mgr
+
+            blocks = list(mgr.blocks)
+            blk_items = get_blk_items(mgr)
+            for c in data_columns:
+                # This reindex would raise ValueError if we had a duplicate
+                #  index, so we can infer that (as long as axis==1) we
+                #  get a single column back, so a single block.
+                mgr = frame.reindex([c], axis=axis)._mgr
+                blocks.extend(mgr.blocks)
+                blk_items.extend(get_blk_items(mgr))
+
+        # reorder the blocks in the same order as the existing table if we can
+        if table_exists:
+            by_items = {
+                tuple(b_items.tolist()): (b, b_items)
+                for b, b_items in zip(blocks, blk_items, strict=True)
+            }
+            new_blocks: list[Block] = []
+            new_blk_items = []
+            for ea in values_axes:
+                items = tuple(ea.values)
+                try:
+                    b, b_items = by_items.pop(items)
+                    new_blocks.append(b)
+                    new_blk_items.append(b_items)
+                except (IndexError, KeyError) as err:
+                    jitems = ",".join([pprint_thing(item) for item in items])
+                    raise ValueError(
+                        f"cannot match existing table structure for [{jitems}] "
+                        "on appending data"
+                    ) from err
+            blocks = new_blocks
+            blk_items = new_blk_items
+
+        return blocks, blk_items
+
+    def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
+        """process axes filters"""
+        # make a copy to avoid side effects
+        if columns is not None:
+            columns = list(columns)
+
+        # make sure to include levels if we have them
+        if columns is not None and self.is_multi_index:
+            assert isinstance(self.levels, list)  # assured by is_multi_index
+            for n in self.levels:
+                if n not in columns:
+                    columns.insert(0, n)
+
+        # reorder by any non_index_axes & limit to the select columns
+        for axis, labels in self.non_index_axes:
+            obj = _reindex_axis(obj, axis, labels, columns)
+
+            def process_filter(field, filt, op):
+                for axis_name in obj._AXIS_ORDERS:
+                    axis_number = obj._get_axis_number(axis_name)
+                    axis_values = obj._get_axis(axis_name)
+                    assert axis_number is not None
+
+                    # see if the field is the name of an axis
+                    if field == axis_name:
+                        # if we have a multi-index, then need to include
+                        # the levels
+                        if self.is_multi_index:
+                            filt = filt.union(Index(self.levels))
+
+                        takers = op(axis_values, filt)
+                        return obj.loc(axis=axis_number)[takers]
+
+                    # this might be the name of a file IN an axis
+                    elif field in axis_values:
+                        # we need to filter on this dimension
+                        values = ensure_index(getattr(obj, field).values)
+                        filt = ensure_index(filt)
+
+                        # hack until we support reversed dim flags
+                        if isinstance(obj, DataFrame):
+                            axis_number = 1 - axis_number
+
+                        takers = op(values, filt)
+                        return obj.loc(axis=axis_number)[takers]
+
+                raise ValueError(f"cannot find the field [{field}] for filtering!")
+
+        # apply the selection filters (but keep in the same order)
+        if selection.filter is not None:
+            for field, op, filt in selection.filter.format():
+                obj = process_filter(field, filt, op)
+
+        return obj
+
+    def create_description(
+        self,
+        complib,
+        complevel: int | None,
+        fletcher32: bool,
+        expectedrows: int | None,
+    ) -> dict[str, Any]:
+        """create the description of the table from the axes & values"""
+        # provided expected rows if its passed
+        if expectedrows is None:
+            expectedrows = max(self.nrows_expected, 10000)
+
+        d = {"name": "table", "expectedrows": expectedrows}
+
+        # description from the axes & values
+        d["description"] = {a.cname: a.typ for a in self.axes}
+
+        if complib:
+            if complevel is None:
+                complevel = self._complevel or 9
+            filters = _tables().Filters(
+                complevel=complevel,
+                complib=complib,
+                fletcher32=fletcher32 or self._fletcher32,
+            )
+            d["filters"] = filters
+        elif self._filters is not None:
+            d["filters"] = self._filters
+
+        return d
+
+    def read_coordinates(
+        self, where=None, start: int | None = None, stop: int | None = None
+    ):
+        """
+        select coordinates (row numbers) from a table; return the
+        coordinates object
+        """
+        # validate the version
+        self.validate_version(where)
+
+        # infer the data kind
+        if not self.infer_axes():
+            return False
+
+        # create the selection
+        selection = Selection(self, where=where, start=start, stop=stop)
+        coords = selection.select_coords()
+        if selection.filter is not None:
+            for field, op, filt in selection.filter.format():
+                data = self.read_column(
+                    field, start=coords.min(), stop=coords.max() + 1
+                )
+                coords = coords[op(data.iloc[coords - coords.min()], filt).values]
+
+        return Index(coords, copy=False)
+
+    def read_column(
+        self,
+        column: str,
+        where=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ):
+        """
+        return a single column from the table, generally only indexables
+        are interesting
+        """
+        # validate the version
+        self.validate_version()
+
+        # infer the data kind
+        if not self.infer_axes():
+            return False
+
+        if where is not None:
+            raise TypeError("read_column does not currently accept a where clause")
+
+        # find the axes
+        for a in self.axes:
+            if column == a.name:
+                if not a.is_data_indexable:
+                    raise ValueError(
+                        f"column [{column}] can not be extracted individually; "
+                        "it is not data indexable"
+                    )
+
+                # column must be an indexable or a data column
+                c = getattr(self.table.cols, column)
+                a.set_info(self.info)
+                col_values = a.convert(
+                    c[start:stop],
+                    nan_rep=self.nan_rep,
+                    encoding=self.encoding,
+                    errors=self.errors,
+                )
+                cvs = col_values[1]
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                return Series(cvs, name=column, copy=False, dtype=dtype)
+
+        raise KeyError(f"column [{column}] not found in the table")
+
+
+class WORMTable(Table):
+    """
+    a write-once read-many table: this format DOES NOT ALLOW appending to a
+    table. writing is a one-time operation the data are stored in a format
+    that allows for searching the data on disk
+    """
+
+    table_type = "worm"
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ):
+        """
+        read the indices and the indexing array, calculate offset rows and return
+        """
+        raise NotImplementedError("WORMTable needs to implement read")
+
+    def write(self, obj, **kwargs) -> None:
+        """
+        write in a format that we can search later on (but cannot append
+        to): write out the indices and the values using _write_array
+        (e.g. a CArray) create an indexing table so that we can search
+        """
+        raise NotImplementedError("WORMTable needs to implement write")
+
+
+class AppendableTable(Table):
+    """support the new appendable table formats"""
+
+    table_type = "appendable"
+
+    # error: Signature of "write" incompatible with supertype "Fixed"
+    def write(  # type: ignore[override]
+        self,
+        obj,
+        axes=None,
+        append: bool = False,
+        complib=None,
+        complevel=None,
+        fletcher32=None,
+        min_itemsize=None,
+        chunksize: int | None = None,
+        expectedrows=None,
+        dropna: bool = False,
+        nan_rep=None,
+        data_columns=None,
+        track_times: bool = True,
+    ) -> None:
+        if not append and self.is_exists:
+            self._handle.remove_node(self.group, "table")
+
+        # create the axes
+        table = self._create_axes(
+            axes=axes,
+            obj=obj,
+            validate=append,
+            min_itemsize=min_itemsize,
+            nan_rep=nan_rep,
+            data_columns=data_columns,
+        )
+
+        for a in table.axes:
+            a.validate_names()
+
+        if not table.is_exists:
+            # create the table
+            options = table.create_description(
+                complib=complib,
+                complevel=complevel,
+                fletcher32=fletcher32,
+                expectedrows=expectedrows,
+            )
+
+            # set the table attributes
+            table.set_attrs()
+
+            options["track_times"] = track_times
+
+            # create the table
+            table._handle.create_table(table.group, **options)
+
+        # update my info
+        table.attrs.info = table.info
+
+        # validate the axes and set the kinds
+        for a in table.axes:
+            a.validate_and_set(table, append)
+
+        # add the rows
+        table.write_data(chunksize, dropna=dropna)
+
+    def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
+        """
+        we form the data into a 2-d including indexes,values,mask write chunk-by-chunk
+        """
+        names = self.dtype.names
+        nrows = self.nrows_expected
+
+        # if dropna==True, then drop ALL nan rows
+        masks = []
+        if dropna:
+            for a in self.values_axes:
+                # figure the mask: only do if we can successfully process this
+                # column, otherwise ignore the mask
+                mask = isna(a.data).all(axis=0)
+                if isinstance(mask, np.ndarray):
+                    masks.append(mask.astype("u1", copy=False))
+
+        # consolidate masks
+        if masks:
+            mask = masks[0]
+            for m in masks[1:]:
+                mask = mask & m
+            mask = mask.ravel()
+        else:
+            mask = None
+
+        # broadcast the indexes if needed
+        indexes = [a.cvalues for a in self.index_axes]
+        nindexes = len(indexes)
+        assert nindexes == 1, nindexes  # ensures we dont need to broadcast
+
+        # transpose the values so first dimension is last
+        # reshape the values if needed
+        values = [a.take_data() for a in self.values_axes]
+        values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
+        bvalues = []
+        for i, v in enumerate(values):
+            new_shape = (nrows, *self.dtype[names[nindexes + i]].shape)
+            bvalues.append(v.reshape(new_shape))
+
+        # write the chunks
+        if chunksize is None:
+            chunksize = 100000
+
+        rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
+        chunks = nrows // chunksize + 1
+        for i in range(chunks):
+            start_i = i * chunksize
+            end_i = min((i + 1) * chunksize, nrows)
+            if start_i >= end_i:
+                break
+
+            self.write_data_chunk(
+                rows,
+                indexes=[a[start_i:end_i] for a in indexes],
+                mask=mask[start_i:end_i] if mask is not None else None,
+                values=[v[start_i:end_i] for v in bvalues],
+            )
+
+    def write_data_chunk(
+        self,
+        rows: np.ndarray,
+        indexes: list[np.ndarray],
+        mask: npt.NDArray[np.bool_] | None,
+        values: list[np.ndarray],
+    ) -> None:
+        """
+        Parameters
+        ----------
+        rows : an empty memory space where we are putting the chunk
+        indexes : an array of the indexes
+        mask : an array of the masks
+        values : an array of the values
+        """
+        # 0 len
+        for v in values:
+            if not np.prod(v.shape):
+                return
+
+        nrows = indexes[0].shape[0]
+        if nrows != len(rows):
+            rows = np.empty(nrows, dtype=self.dtype)
+        names = self.dtype.names
+        nindexes = len(indexes)
+
+        # indexes
+        for i, idx in enumerate(indexes):
+            rows[names[i]] = idx
+
+        # values
+        for i, v in enumerate(values):
+            rows[names[i + nindexes]] = v
+
+        # mask
+        if mask is not None:
+            m = ~mask.ravel().astype(bool, copy=False)
+            if not m.all():
+                rows = rows[m]
+
+        if len(rows):
+            self.table.append(rows)
+            self.table.flush()
+
+    def delete(
+        self, where=None, start: int | None = None, stop: int | None = None
+    ) -> int | None:
+        # delete all rows (and return the nrows)
+        if where is None or not len(where):
+            if start is None and stop is None:
+                nrows = self.nrows
+                self._handle.remove_node(self.group, recursive=True)
+            else:
+                # pytables<3.0 would remove a single row with stop=None
+                if stop is None:
+                    stop = self.nrows
+                nrows = self.table.remove_rows(start=start, stop=stop)
+                self.table.flush()
+            return nrows
+
+        # infer the data kind
+        if not self.infer_axes():
+            return None
+
+        # create the selection
+        table = self.table
+        selection = Selection(self, where, start=start, stop=stop)
+        values = selection.select_coords()
+
+        # delete the rows in reverse order
+        sorted_series = Series(values, copy=False).sort_values()
+        ln = len(sorted_series)
+
+        if ln:
+            # construct groups of consecutive rows
+            diff = sorted_series.diff()
+            groups = list(diff[diff > 1].index)
+
+            # 1 group
+            if not groups:
+                groups = [0]
+
+            # final element
+            if groups[-1] != ln:
+                groups.append(ln)
+
+            # initial element
+            if groups[0] != 0:
+                groups.insert(0, 0)
+
+            # we must remove in reverse order!
+            pg = groups.pop()
+            for g in reversed(groups):
+                rows = sorted_series.take(range(g, pg))
+                table.remove_rows(
+                    start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
+                )
+                pg = g
+
+            self.table.flush()
+
+        # return the number of rows removed
+        return ln
+
+
+class AppendableFrameTable(AppendableTable):
+    """support the new appendable table formats"""
+
+    pandas_kind = "frame_table"
+    table_type = "appendable_frame"
+    ndim = 2
+    obj_type: type[DataFrame | Series] = DataFrame
+
+    @property
+    def is_transposed(self) -> bool:
+        return self.index_axes[0].axis == 1
+
+    @classmethod
+    def get_object(cls, obj, transposed: bool):
+        """these are written transposed"""
+        if transposed:
+            obj = obj.T
+        return obj
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ):
+        # validate the version
+        self.validate_version(where)
+
+        # infer the data kind
+        if not self.infer_axes():
+            return None
+
+        result = self._read_axes(where=where, start=start, stop=stop)
+
+        info = (
+            self.info.get(self.non_index_axes[0][0], {})
+            if len(self.non_index_axes)
+            else {}
+        )
+
+        inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
+        assert len(inds) == 1
+        ind = inds[0]
+
+        index = result[ind][0]
+
+        frames = []
+        for i, a in enumerate(self.axes):
+            if a not in self.values_axes:
+                continue
+            index_vals, cvalues = result[i]
+
+            # we could have a multi-index constructor here
+            # ensure_index doesn't recognized our list-of-tuples here
+            if info.get("type") != "MultiIndex":
+                cols = Index(index_vals)
+            else:
+                cols = MultiIndex.from_tuples(index_vals)
+
+            names = info.get("names")
+            if names is not None:
+                cols.set_names(names, inplace=True)
+
+            if self.is_transposed:
+                values = cvalues
+                index_ = cols
+                cols_ = Index(index, name=getattr(index, "name", None))
+            else:
+                values = cvalues.T
+                index_ = Index(index, name=getattr(index, "name", None))
+                cols_ = cols
+
+            # if we have a DataIndexableCol, its shape will only be 1 dim
+            if values.ndim == 1 and isinstance(values, np.ndarray):
+                values = values.reshape((1, values.shape[0]))
+
+            if isinstance(values, (np.ndarray, DatetimeArray)):
+                try:
+                    df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
+                except UnicodeEncodeError as err:
+                    if (
+                        self.errors == "surrogatepass"
+                        and using_string_dtype()
+                        and str(err).endswith("surrogates not allowed")
+                        and HAS_PYARROW
+                    ):
+                        df = DataFrame(
+                            values.T,
+                            columns=cols_,
+                            index=index_,
+                            copy=False,
+                            dtype=StringDtype(storage="python", na_value=np.nan),
+                        )
+                    else:
+                        raise
+            elif isinstance(values, Index):
+                df = DataFrame(values, columns=cols_, index=index_)
+            else:
+                # Categorical
+                df = DataFrame._from_arrays([values], columns=cols_, index=index_)
+            if not (using_string_dtype() and values.dtype.kind == "O"):
+                assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
+
+            # If str / string dtype is stored in meta, use that.
+            for column in cols_:
+                dtype = getattr(self.table.attrs, f"{column}_meta", None)
+                if dtype in ["str", "string"]:
+                    df[column] = df[column].astype(dtype)
+            frames.append(df)
+
+        if len(frames) == 1:
+            df = frames[0]
+        else:
+            df = concat(frames, axis=1)
+
+        selection = Selection(self, where=where, start=start, stop=stop)
+        # apply the selection filters & axis orderings
+        df = self.process_axes(df, selection=selection, columns=columns)
+        return df
+
+
+class AppendableSeriesTable(AppendableFrameTable):
+    """support the new appendable table formats"""
+
+    pandas_kind = "series_table"
+    table_type = "appendable_series"
+    ndim = 2
+    obj_type = Series
+
+    @property
+    def is_transposed(self) -> bool:
+        return False
+
+    @classmethod
+    def get_object(cls, obj, transposed: bool):
+        return obj
+
+    # error: Signature of "write" incompatible with supertype "Fixed"
+    def write(self, obj, data_columns=None, **kwargs) -> None:  # type: ignore[override]
+        """we are going to write this as a frame table"""
+        if not isinstance(obj, DataFrame):
+            name = obj.name or "values"
+            obj = obj.to_frame(name)
+        super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ) -> Series:
+        is_multi_index = self.is_multi_index
+        if columns is not None and is_multi_index:
+            assert isinstance(self.levels, list)  # needed for mypy
+            for n in self.levels:
+                if n not in columns:
+                    columns.insert(0, n)
+        s = super().read(where=where, columns=columns, start=start, stop=stop)
+        if is_multi_index:
+            s.set_index(self.levels, inplace=True)
+
+        s = s.iloc[:, 0]
+
+        # remove the default name
+        if s.name == "values":
+            s.name = None
+        return s
+
+
+class AppendableMultiSeriesTable(AppendableSeriesTable):
+    """support the new appendable table formats"""
+
+    pandas_kind = "series_table"
+    table_type = "appendable_multiseries"
+
+    #  error: Signature of "write" incompatible with supertype "Fixed"
+    def write(self, obj, **kwargs) -> None:  # type: ignore[override]
+        """we are going to write this as a frame table"""
+        name = obj.name or "values"
+        newobj, self.levels = self.validate_multiindex(obj)
+        assert isinstance(self.levels, list)  # for mypy
+        cols = list(self.levels)
+        cols.append(name)
+        newobj.columns = Index(cols)
+        super().write(obj=newobj, **kwargs)
+
+
+class GenericTable(AppendableFrameTable):
+    """a table that read/writes the generic pytables table format"""
+
+    pandas_kind = "frame_table"
+    table_type = "generic_table"
+    ndim = 2
+    obj_type = DataFrame
+    levels: list[Hashable]
+
+    @property
+    def pandas_type(self) -> str:
+        return self.pandas_kind
+
+    @property
+    def storable(self):
+        return getattr(self.group, "table", None) or self.group
+
+    def get_attrs(self) -> None:
+        """retrieve our attributes"""
+        self.non_index_axes = []
+        self.nan_rep = None
+        self.levels = []
+
+        self.index_axes = [a for a in self.indexables if a.is_an_indexable]
+        self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
+        self.data_columns = [a.name for a in self.values_axes]
+
+    @cache_readonly
+    def indexables(self):
+        """create the indexables from the table description"""
+        d = self.description
+
+        # TODO: can we get a typ for this?  AFAICT it is the only place
+        #  where we aren't passing one
+        # the index columns is just a simple index
+        md = self.read_metadata("index")
+        meta = "category" if md is not None else None
+        index_col = GenericIndexCol(
+            name="index", axis=0, table=self.table, meta=meta, metadata=md
+        )
+
+        _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]
+
+        for i, n in enumerate(d._v_names):
+            assert isinstance(n, str)
+
+            atom = getattr(d, n)
+            md = self.read_metadata(n)
+            meta = "category" if md is not None else None
+            dc = GenericDataIndexableCol(
+                name=n,
+                pos=i,
+                values=[n],
+                typ=atom,
+                table=self.table,
+                meta=meta,
+                metadata=md,
+            )
+            _indexables.append(dc)
+
+        return _indexables
+
+    # error: Signature of "write" incompatible with supertype "AppendableTable"
+    def write(self, **kwargs) -> None:  # type: ignore[override]
+        raise NotImplementedError("cannot write on a generic table")
+
+
+class AppendableMultiFrameTable(AppendableFrameTable):
+    """a frame with a multi-index"""
+
+    table_type = "appendable_multiframe"
+    obj_type = DataFrame
+    ndim = 2
+    _re_levels = re.compile(r"^level_\d+$")
+
+    @property
+    def table_type_short(self) -> str:
+        return "appendable_multi"
+
+    # error: Signature of "write" incompatible with supertype "Fixed"
+    def write(self, obj, data_columns=None, **kwargs) -> None:  # type: ignore[override]
+        if data_columns is None:
+            data_columns = []
+        elif data_columns is True:
+            data_columns = obj.columns.tolist()
+        obj, self.levels = self.validate_multiindex(obj)
+        assert isinstance(self.levels, list)  # for mypy
+        for n in self.levels:
+            if n not in data_columns:
+                data_columns.insert(0, n)
+        super().write(obj=obj, data_columns=data_columns, **kwargs)
+
+    def read(
+        self,
+        where=None,
+        columns=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ) -> DataFrame:
+        df = super().read(where=where, columns=columns, start=start, stop=stop)
+        df = df.set_index(self.levels)
+
+        # remove names for 'level_%d'
+        df.index = df.index.set_names(
+            [None if self._re_levels.search(name) else name for name in df.index.names]
+        )
+
+        return df
+
+
+def _reindex_axis(
+    obj: DataFrame, axis: AxisInt, labels: Index, other=None
+) -> DataFrame:
+    ax = obj._get_axis(axis)
+    labels = ensure_index(labels)
+
+    # try not to reindex even if other is provided
+    # if it equals our current index
+    if other is not None:
+        other = ensure_index(other)
+    if (other is None or labels.equals(other)) and labels.equals(ax):
+        return obj
+
+    labels = ensure_index(labels.unique())
+    if other is not None:
+        labels = ensure_index(other.unique()).intersection(labels, sort=False)
+    if not labels.equals(ax):
+        slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
+        slicer[axis] = labels
+        obj = obj.loc[tuple(slicer)]
+    return obj
+
+
+# tz to/from coercion
+
+
+def _get_tz(tz: tzinfo) -> str | tzinfo:
+    """for a tz-aware type, return an encoded zone"""
+    zone = timezones.get_timezone(tz)
+    return zone
+
+
+def _set_tz(
+    values: npt.NDArray[np.int64], tz: str | tzinfo | None, datetime64_dtype: str
+) -> DatetimeArray:
+    """
+    Coerce the values to a DatetimeArray with appropriate tz.
+
+    Parameters
+    ----------
+    values : ndarray[int64]
+    tz : str, tzinfo, or None
+    datetime64_dtype : str, e.g. "datetime64[ns]", "datetime64[25s]"
+    """
+    assert values.dtype == "i8", values.dtype
+    # Argument "tz" to "tz_to_dtype" has incompatible type "str | tzinfo | None";
+    # expected "tzinfo"
+    unit, _ = np.datetime_data(datetime64_dtype)  # parsing dtype: unit, count
+    unit = cast("TimeUnit", unit)
+    # error: Argument "tz" to "tz_to_dtype" has incompatible type
+    #  "str | tzinfo | None"; expected "tzinfo"
+    dtype = tz_to_dtype(tz=tz, unit=unit)  # type: ignore[arg-type]
+    dta = DatetimeArray._from_sequence(values, dtype=dtype)
+    return dta
+
+
+def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
+    assert isinstance(name, str)
+
+    index_name = index.name
+    # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
+    # expected "Union[ExtensionArray, ndarray]"
+    converted, dtype_name = _get_data_and_dtype_name(index)  # type: ignore[arg-type]
+    kind = _dtype_to_kind(dtype_name)
+    atom = DataIndexableCol._get_atom(converted)
+
+    if (
+        lib.is_np_dtype(index.dtype, "iu")
+        or needs_i8_conversion(index.dtype)
+        or is_bool_dtype(index.dtype)
+    ):
+        # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
+        #  in which case "kind" is "integer", "integer", "datetime64",
+        #  "timedelta64", and "integer", respectively.
+        return IndexCol(
+            name,
+            values=converted,
+            kind=kind,
+            typ=atom,
+            freq=getattr(index, "freq", None),
+            tz=getattr(index, "tz", None),
+            index_name=index_name,
+        )
+
+    if isinstance(index, MultiIndex):
+        raise TypeError("MultiIndex not supported here!")
+
+    inferred_type = lib.infer_dtype(index, skipna=False)
+    # we won't get inferred_type of "datetime64" or "timedelta64" as these
+    #  would go through the DatetimeIndex/TimedeltaIndex paths above
+
+    values = np.asarray(index)
+
+    if inferred_type == "date":
+        converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
+        return IndexCol(
+            name, converted, "date", _tables().Time32Col(), index_name=index_name
+        )
+    elif inferred_type == "string":
+        converted = _convert_string_array(values, encoding, errors)
+        itemsize = converted.dtype.itemsize
+        return IndexCol(
+            name,
+            converted,
+            "string",
+            _tables().StringCol(itemsize),
+            index_name=index_name,
+        )
+
+    elif inferred_type in ["integer", "floating"]:
+        return IndexCol(
+            name, values=converted, kind=kind, typ=atom, index_name=index_name
+        )
+    else:
+        assert isinstance(converted, np.ndarray) and converted.dtype == object
+        assert kind == "object", kind
+        atom = _tables().ObjectAtom()
+        return IndexCol(name, converted, kind, atom, index_name=index_name)
+
+
+def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
+    index: Index | np.ndarray
+
+    if kind.startswith("datetime64"):
+        if kind == "datetime64":
+            # created before we stored resolution information
+            index = DatetimeIndex(data, copy=False)
+        else:
+            index = DatetimeIndex(data.view(kind), copy=False)
+    elif kind.startswith("timedelta64"):
+        if kind == "timedelta64":
+            # created before we stored resolution information
+            index = TimedeltaIndex(data, copy=False)
+        else:
+            index = TimedeltaIndex(data.view(kind), copy=False)
+    elif kind == "date":
+        try:
+            index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
+        except ValueError:
+            index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
+    elif kind in ("integer", "float", "bool"):
+        index = np.asarray(data)
+    elif kind in ("string"):
+        index = _unconvert_string_array(
+            data, nan_rep=None, encoding=encoding, errors=errors
+        )
+    elif kind == "object":
+        index = np.asarray(data[0])
+    else:  # pragma: no cover
+        raise ValueError(f"unrecognized index type {kind}")
+    return index
+
+
+def _maybe_convert_for_string_atom(
+    name: str,
+    bvalues: ArrayLike,
+    existing_col,
+    min_itemsize,
+    nan_rep,
+    encoding,
+    errors,
+    columns: list[str],
+):
+    if isinstance(bvalues.dtype, StringDtype):
+        bvalues = bvalues.to_numpy()
+    if bvalues.dtype != object:
+        return bvalues
+
+    bvalues = cast(np.ndarray, bvalues)
+
+    dtype_name = bvalues.dtype.name
+    inferred_type = lib.infer_dtype(bvalues, skipna=False)
+
+    if inferred_type == "date":
+        raise TypeError("[date] is not implemented as a table column")
+    if inferred_type == "datetime":
+        # after GH#8260
+        # this only would be hit for a multi-timezone dtype which is an error
+        raise TypeError(
+            "too many timezones in this block, create separate data columns"
+        )
+
+    if not (inferred_type == "string" or dtype_name == "object"):
+        return bvalues
+
+    mask = isna(bvalues)
+    data = bvalues.copy()
+    data[mask] = nan_rep
+
+    if existing_col and mask.any() and len(nan_rep) > existing_col.itemsize:
+        raise ValueError("NaN representation is too large for existing column size")
+
+    # see if we have a valid string type
+    inferred_type = lib.infer_dtype(data, skipna=False)
+    if inferred_type != "string":
+        # we cannot serialize this data, so report an exception on a column
+        # by column basis
+
+        # expected behaviour:
+        # search block for a non-string object column by column
+        for i in range(data.shape[0]):
+            col = data[i]
+            inferred_type = lib.infer_dtype(col, skipna=False)
+            if inferred_type != "string":
+                error_column_label = columns[i] if len(columns) > i else f"No.{i}"
+                raise TypeError(
+                    f"Cannot serialize the column [{error_column_label}]\n"
+                    f"because its data contents are not [string] but "
+                    f"[{inferred_type}] object dtype"
+                )
+
+    # itemsize is the maximum length of a string (along any dimension)
+
+    data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
+    itemsize = data_converted.itemsize
+
+    # specified min_itemsize?
+    if isinstance(min_itemsize, dict):
+        min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
+    itemsize = max(min_itemsize or 0, itemsize)
+
+    # check for column in the values conflicts
+    if existing_col is not None:
+        eci = existing_col.validate_col(itemsize)
+        if eci is not None and eci > itemsize:
+            itemsize = eci
+
+    data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
+    return data_converted
+
+
+def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
+    """
+    Take a string-like that is object dtype and coerce to a fixed size string type.
+
+    Parameters
+    ----------
+    data : np.ndarray[object]
+    encoding : str
+    errors : str
+        Handler for encoding errors.
+
+    Returns
+    -------
+    np.ndarray[fixed-length-string]
+    """
+    # encode if needed
+    if len(data):
+        data = (
+            Series(data.ravel(), copy=False, dtype="object")
+            .str.encode(encoding, errors)
+            ._values.reshape(data.shape)
+        )
+
+    # create the sized dtype
+    ensured = ensure_object(data.ravel())
+    itemsize = max(1, libwriters.max_len_string_array(ensured))
+
+    data = np.asarray(data, dtype=f"S{itemsize}")
+    return data
+
+
+def _unconvert_string_array(
+    data: np.ndarray, nan_rep, encoding: str, errors: str
+) -> np.ndarray:
+    """
+    Inverse of _convert_string_array.
+
+    Parameters
+    ----------
+    data : np.ndarray[fixed-length-string]
+    nan_rep : the storage repr of NaN
+    encoding : str
+    errors : str
+        Handler for encoding errors.
+
+    Returns
+    -------
+    np.ndarray[object]
+        Decoded data.
+    """
+    shape = data.shape
+    data = np.asarray(data.ravel(), dtype=object)
+
+    if len(data):
+        itemsize = libwriters.max_len_string_array(ensure_object(data))
+        dtype = f"U{itemsize}"
+
+        if isinstance(data[0], bytes):
+            ser = Series(data, copy=False).str.decode(
+                encoding, errors=errors, dtype="object"
+            )
+            data = ser.to_numpy()
+            data.flags.writeable = True
+        else:
+            data = data.astype(dtype, copy=False).astype(object, copy=False)
+
+    if nan_rep is None:
+        nan_rep = "nan"
+
+    libwriters.string_array_replace_from_nan_rep(data, nan_rep)
+    return data.reshape(shape)
+
+
+def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
+    assert isinstance(val_kind, str), type(val_kind)
+    if _need_convert(val_kind):
+        conv = _get_converter(val_kind, encoding, errors)
+        values = conv(values)
+    return values
+
+
+def _get_converter(kind: str, encoding: str, errors: str):
+    if kind == "datetime64":
+        return lambda x: np.asarray(x, dtype="M8[ns]")
+    elif "datetime64" in kind:
+        return lambda x: np.asarray(x, dtype=kind)
+    elif kind == "string":
+        return lambda x: _unconvert_string_array(
+            x, nan_rep=None, encoding=encoding, errors=errors
+        )
+    else:  # pragma: no cover
+        raise ValueError(f"invalid kind {kind}")
+
+
+def _need_convert(kind: str) -> bool:
+    if kind in ("datetime64", "string") or "datetime64" in kind:
+        return True
+    return False
+
+
+def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
+    """
+    Prior to 0.10.1, we named values blocks like: values_block_0 and the
+    name values_0, adjust the given name if necessary.
+
+    Parameters
+    ----------
+    name : str
+    version : Tuple[int, int, int]
+
+    Returns
+    -------
+    str
+    """
+    if isinstance(version, str) or len(version) < 3:
+        raise ValueError("Version is incorrect, expected sequence of 3 integers.")
+
+    if version[0] == 0 and version[1] <= 10 and version[2] == 0:
+        m = re.search(r"values_block_(\d+)", name)
+        if m:
+            grp = m.groups()[0]
+            name = f"values_{grp}"
+    return name
+
+
+def _dtype_to_kind(dtype_str: str) -> str:
+    """
+    Find the "kind" string describing the given dtype name.
+    """
+    if dtype_str.startswith(("string", "bytes")):
+        kind = "string"
+    elif dtype_str.startswith("float"):
+        kind = "float"
+    elif dtype_str.startswith("complex"):
+        kind = "complex"
+    elif dtype_str.startswith(("int", "uint")):
+        kind = "integer"
+    elif dtype_str.startswith("datetime64"):
+        kind = dtype_str
+    elif dtype_str.startswith("timedelta"):
+        kind = dtype_str
+    elif dtype_str.startswith("bool"):
+        kind = "bool"
+    elif dtype_str.startswith("category"):
+        kind = "category"
+    elif dtype_str.startswith("period"):
+        # We store the `freq` attr so we can restore from integers
+        kind = "integer"
+    elif dtype_str == "object":
+        kind = "object"
+    elif dtype_str == "str":
+        kind = "str"
+    else:
+        raise ValueError(f"cannot interpret dtype of [{dtype_str}]")
+
+    return kind
+
+
+def _get_data_and_dtype_name(data: ArrayLike):
+    """
+    Convert the passed data into a storable form and a dtype string.
+    """
+    if isinstance(data, Categorical):
+        data = data.codes
+
+    if isinstance(data.dtype, DatetimeTZDtype):
+        # For datetime64tz we need to drop the TZ in tests TODO: why?
+        dtype_name = f"datetime64[{data.dtype.unit}]"
+    else:
+        dtype_name = data.dtype.name
+
+    if data.dtype.kind in "mM":
+        data = np.asarray(data.view("i8"))
+        # TODO: we used to reshape for the dt64tz case, but no longer
+        #  doing that doesn't seem to break anything.  why?
+
+    elif isinstance(data, PeriodIndex):
+        data = data.asi8
+
+    data = np.asarray(data)
+    return data, dtype_name
+
+
+class Selection:
+    """
+    Carries out a selection operation on a tables.Table object.
+
+    Parameters
+    ----------
+    table : a Table object
+    where : list of Terms (or convertible to)
+    start, stop: indices to start and/or stop selection
+
+    """
+
+    def __init__(
+        self,
+        table: Table,
+        where=None,
+        start: int | None = None,
+        stop: int | None = None,
+    ) -> None:
+        self.table = table
+        self.where = where
+        self.start = start
+        self.stop = stop
+        self.condition = None
+        self.filter = None
+        self.terms = None
+        self.coordinates = None
+
+        if is_list_like(where):
+            # see if we have a passed coordinate like
+            with suppress(ValueError):
+                inferred = lib.infer_dtype(where, skipna=False)
+                if inferred in ("integer", "boolean"):
+                    where = np.asarray(where)
+                    if where.dtype == np.bool_:
+                        start, stop = self.start, self.stop
+                        if start is None:
+                            start = 0
+                        if stop is None:
+                            stop = self.table.nrows
+                        self.coordinates = np.arange(start, stop)[where]
+                    elif issubclass(where.dtype.type, np.integer):
+                        if (self.start is not None and (where < self.start).any()) or (
+                            self.stop is not None and (where >= self.stop).any()
+                        ):
+                            raise ValueError(
+                                "where must have index locations >= start and < stop"
+                            )
+                        self.coordinates = where
+
+        if self.coordinates is None:
+            self.terms = self.generate(where)
+
+            # create the numexpr & the filter
+            if self.terms is not None:
+                self.condition, self.filter = self.terms.evaluate()
+
+    @overload
+    def generate(self, where: dict | list | tuple | str) -> PyTablesExpr: ...
+
+    @overload
+    def generate(self, where: None) -> None: ...
+
+    def generate(self, where: dict | list | tuple | str | None) -> PyTablesExpr | None:
+        """where can be a : dict,list,tuple,string"""
+        if where is None:
+            return None
+
+        q = self.table.queryables()
+        try:
+            return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
+        except NameError as err:
+            # raise a nice message, suggesting that the user should use
+            # data_columns
+            qkeys = ",".join(q.keys())
+            msg = dedent(
+                f"""\
+                The passed where expression: {where}
+                            contains an invalid variable reference
+                            all of the variable references must be a reference to
+                            an axis (e.g. 'index' or 'columns'), or a data_column
+                            The currently defined references are: {qkeys}
+                """
+            )
+            raise ValueError(msg) from err
+
+    def select(self):
+        """
+        generate the selection
+        """
+        if self.condition is not None:
+            return self.table.table.read_where(
+                self.condition.format(), start=self.start, stop=self.stop
+            )
+        elif self.coordinates is not None:
+            return self.table.table.read_coordinates(self.coordinates)
+        return self.table.table.read(start=self.start, stop=self.stop)
+
+    def select_coords(self):
+        """
+        generate the selection
+        """
+        start, stop = self.start, self.stop
+        nrows = self.table.nrows
+        if start is None:
+            start = 0
+        elif start < 0:
+            start += nrows
+        if stop is None:
+            stop = nrows
+        elif stop < 0:
+            stop += nrows
+
+        if self.condition is not None:
+            return self.table.table.get_where_list(
+                self.condition.format(), start=start, stop=stop, sort=True
+            )
+        elif self.coordinates is not None:
+            return self.coordinates
+
+        return np.arange(start, stop)
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
new file mode 100644
index 0000000000000000000000000000000000000000..522c7206a2ae55322232f2be1031cbfd30d1fdfd
--- /dev/null
+++ b/pandas/io/spss.py
@@ -0,0 +1,95 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._decorators import set_module
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.dtypes.inference import is_list_like
+
+from pandas.io.common import stringify_path
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+    from pandas._typing import DtypeBackend
+
+    from pandas import DataFrame
+
+
+@set_module("pandas")
+def read_spss(
+    path: str | Path,
+    usecols: Sequence[str] | None = None,
+    convert_categoricals: bool = True,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    **kwargs: Any,
+) -> DataFrame:
+    """
+    Load an SPSS file from the file path, returning a DataFrame.
+
+    Parameters
+    ----------
+    path : str or Path
+        File path.
+    usecols : list-like, optional
+        Return a subset of the columns. If None, return all columns.
+    convert_categoricals : bool, default is True
+        Convert categorical columns into pd.Categorical.
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed
+          nullable :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+    **kwargs
+        Additional keyword arguments that can be passed to :func:`pyreadstat.read_sav`.
+
+        .. versionadded:: 3.0
+
+    Returns
+    -------
+    DataFrame
+        DataFrame based on the SPSS file.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (csv) file into a pandas DataFrame.
+    read_excel : Read an Excel file into a pandas DataFrame.
+    read_sas : Read an SAS file into a pandas DataFrame.
+    read_orc : Load an ORC object into a pandas DataFrame.
+    read_feather : Load a feather-format object into a pandas DataFrame.
+
+    Examples
+    --------
+    >>> df = pd.read_spss("spss_data.sav")  # doctest: +SKIP
+    """
+    pyreadstat = import_optional_dependency("pyreadstat")
+    check_dtype_backend(dtype_backend)
+
+    if usecols is not None:
+        if not is_list_like(usecols):
+            raise TypeError("usecols must be list-like.")
+        usecols = list(usecols)  # pyreadstat requires a list
+
+    df, metadata = pyreadstat.read_sav(
+        stringify_path(path),
+        usecols=usecols,
+        apply_value_formats=convert_categoricals,
+        **kwargs,
+    )
+    df.attrs = metadata.__dict__
+    if dtype_backend is not lib.no_default:
+        df = df.convert_dtypes(dtype_backend=dtype_backend)
+    return df
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
new file mode 100644
index 0000000000000000000000000000000000000000..52adbd42c4479804cfbbb30bf3e769f5f3645106
--- /dev/null
+++ b/pandas/io/sql.py
@@ -0,0 +1,2960 @@
+"""
+Collection of query wrappers / abstractions to both facilitate data
+retrieval and to reduce dependency on DB-specific API.
+"""
+
+from __future__ import annotations
+
+from abc import (
+    ABC,
+    abstractmethod,
+)
+from contextlib import (
+    ExitStack,
+    contextmanager,
+)
+from datetime import (
+    date,
+    datetime,
+    time,
+)
+from functools import partial
+import re
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    Self,
+    cast,
+    overload,
+)
+import warnings
+
+import numpy as np
+
+from pandas._config import using_string_dtype
+
+from pandas._libs import lib
+from pandas.compat._optional import (
+    VERSIONS,
+    import_optional_dependency,
+)
+from pandas.errors import (
+    AbstractMethodError,
+    DatabaseError,
+)
+from pandas.util._decorators import set_module
+from pandas.util._exceptions import find_stack_level
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.dtypes.common import (
+    is_dict_like,
+    is_list_like,
+    is_object_dtype,
+    is_string_dtype,
+)
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.missing import isna
+
+from pandas import get_option
+from pandas.core.api import (
+    DataFrame,
+    Series,
+)
+from pandas.core.arrays import ArrowExtensionArray
+from pandas.core.arrays.string_ import StringDtype
+from pandas.core.base import PandasObject
+import pandas.core.common as com
+from pandas.core.common import maybe_make_list
+from pandas.core.internals.construction import convert_object_array
+from pandas.core.tools.datetimes import to_datetime
+
+from pandas.io._util import arrow_table_to_pandas
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Generator,
+        Iterator,
+        Mapping,
+    )
+
+    from sqlalchemy import Table
+    from sqlalchemy.sql.expression import (
+        Delete,
+        Select,
+        TextClause,
+    )
+
+    from pandas._typing import (
+        DtypeArg,
+        DtypeBackend,
+        IndexLabel,
+    )
+
+    from pandas import Index
+
+# -----------------------------------------------------------------------------
+# -- Helper functions
+
+
+def _process_parse_dates_argument(parse_dates):
+    """Process parse_dates argument for read_sql functions"""
+    # handle non-list entries for parse_dates gracefully
+    if parse_dates is True or parse_dates is None or parse_dates is False:
+        parse_dates = []
+
+    elif not hasattr(parse_dates, "__iter__"):
+        parse_dates = [parse_dates]
+    return parse_dates
+
+
+def _handle_date_column(
+    col, utc: bool = False, format: str | dict[str, Any] | None = None
+):
+    if isinstance(format, dict):
+        # GH35185 Allow custom error values in parse_dates argument of
+        # read_sql like functions.
+        # Format can take on custom to_datetime argument values such as
+        # {"errors": "coerce"} or {"dayfirst": True}
+        return to_datetime(col, **format)
+    else:
+        # Allow passing of formatting string for integers
+        # GH17855
+        if format is None and (
+            issubclass(col.dtype.type, np.floating)
+            or issubclass(col.dtype.type, np.integer)
+        ):
+            format = "s"
+        if format in ["D", "d", "h", "m", "s", "ms", "us", "ns"]:
+            return to_datetime(col, errors="coerce", unit=format, utc=utc)
+        elif isinstance(col.dtype, DatetimeTZDtype):
+            # coerce to UTC timezone
+            # GH11216
+            return to_datetime(col, utc=True)
+        else:
+            return to_datetime(col, errors="coerce", format=format, utc=utc)
+
+
+def _parse_date_columns(data_frame: DataFrame, parse_dates) -> DataFrame:
+    """
+    Force non-datetime columns to be read as such.
+    Supports both string formatted and integer timestamp columns.
+    """
+    parse_dates = _process_parse_dates_argument(parse_dates)
+
+    # we want to coerce datetime64_tz dtypes for now to UTC
+    # we could in theory do a 'nice' conversion from a FixedOffset tz
+    # GH11216
+    for i, (col_name, df_col) in enumerate(data_frame.items()):
+        if isinstance(df_col.dtype, DatetimeTZDtype) or col_name in parse_dates:
+            try:
+                fmt = parse_dates[col_name]
+            except (KeyError, TypeError):
+                fmt = None
+            data_frame.isetitem(i, _handle_date_column(df_col, format=fmt))
+
+    return data_frame
+
+
+def _convert_arrays_to_dataframe(
+    data,
+    columns,
+    coerce_float: bool = True,
+    dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+) -> DataFrame:
+    content = lib.to_object_array_tuples(data)
+    idx_len = content.shape[0]
+    arrays = convert_object_array(
+        list(content.T),
+        dtype=None,
+        coerce_float=coerce_float,
+        dtype_backend=dtype_backend,
+    )
+    if dtype_backend == "pyarrow":
+        pa = import_optional_dependency("pyarrow")
+
+        result_arrays = []
+        for arr in arrays:
+            pa_array = pa.array(arr, from_pandas=True)
+            if arr.dtype == "string":
+                # TODO: Arrow still infers strings arrays as regular strings instead
+                # of large_string, which is what we preserver everywhere else for
+                # dtype_backend="pyarrow". We may want to reconsider this
+                pa_array = pa_array.cast(pa.string())
+            result_arrays.append(ArrowExtensionArray(pa_array))
+        arrays = result_arrays  # type: ignore[assignment]
+    if arrays:
+        return DataFrame._from_arrays(
+            arrays, columns=columns, index=range(idx_len), verify_integrity=False
+        )
+    else:
+        return DataFrame(columns=columns)
+
+
+def _wrap_result(
+    data,
+    columns,
+    index_col=None,
+    coerce_float: bool = True,
+    parse_dates=None,
+    dtype: DtypeArg | None = None,
+    dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+) -> DataFrame:
+    """Wrap result set of a SQLAlchemy query in a DataFrame."""
+    frame = _convert_arrays_to_dataframe(data, columns, coerce_float, dtype_backend)
+
+    if dtype:
+        frame = frame.astype(dtype)
+
+    frame = _parse_date_columns(frame, parse_dates)
+
+    if index_col is not None:
+        frame = frame.set_index(index_col)
+
+    return frame
+
+
+def _wrap_result_adbc(
+    df: DataFrame,
+    *,
+    index_col=None,
+    parse_dates=None,
+    dtype: DtypeArg | None = None,
+    dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+) -> DataFrame:
+    """Wrap result set of a SQLAlchemy query in a DataFrame."""
+    if dtype:
+        df = df.astype(dtype)
+
+    df = _parse_date_columns(df, parse_dates)
+
+    if index_col is not None:
+        df = df.set_index(index_col)
+
+    return df
+
+
+# -----------------------------------------------------------------------------
+# -- Read and write to DataFrames
+
+
+@overload
+def read_sql_table(  # pyright: ignore[reportOverlappingOverload]
+    table_name: str,
+    con,
+    schema=...,
+    index_col: str | list[str] | None = ...,
+    coerce_float=...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
+    columns: list[str] | None = ...,
+    chunksize: None = ...,
+    dtype_backend: DtypeBackend | lib.NoDefault = ...,
+) -> DataFrame: ...
+
+
+@overload
+def read_sql_table(
+    table_name: str,
+    con,
+    schema=...,
+    index_col: str | list[str] | None = ...,
+    coerce_float=...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
+    columns: list[str] | None = ...,
+    chunksize: int = ...,
+    dtype_backend: DtypeBackend | lib.NoDefault = ...,
+) -> Iterator[DataFrame]: ...
+
+
+@set_module("pandas")
+def read_sql_table(
+    table_name: str,
+    con,
+    schema: str | None = None,
+    index_col: str | list[str] | None = None,
+    coerce_float: bool = True,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None,
+    columns: list[str] | None = None,
+    chunksize: int | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+) -> DataFrame | Iterator[DataFrame]:
+    """
+    Read SQL database table into a DataFrame.
+
+    Given a table name and a SQLAlchemy connectable, returns a DataFrame.
+    This function does not support DBAPI connections.
+
+    Parameters
+    ----------
+    table_name : str
+        Name of SQL table in database.
+    con : SQLAlchemy connectable or str
+        A database URI could be provided as str.
+        SQLite DBAPI connection mode not supported.
+    schema : str, default None
+        Name of SQL schema in database to query (if database flavor
+        supports this). Uses default schema if None (default).
+    index_col : str or list of str, optional, default: None
+        Column(s) to set as index(MultiIndex).
+    coerce_float : bool, default True
+        Attempts to convert values of non-string, non-numeric objects (like
+        decimal.Decimal) to floating point. Can result in loss of Precision.
+    parse_dates : list or dict, default None
+        - List of column names to parse as dates.
+        - Dict of ``{column_name: format string}`` where format string is
+          strftime compatible in case of parsing string times or is one of
+          (D, s, ns, ms, us) in case of parsing integer timestamps.
+        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
+          to the keyword arguments of :func:`pandas.to_datetime`
+          Especially useful with databases without native Datetime support,
+          such as SQLite.
+    columns : list, default None
+        List of column names to select from SQL table.
+    chunksize : int, default None
+        If specified, returns an iterator where `chunksize` is the number of
+        rows to include in each chunk.
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    DataFrame or Iterator[DataFrame]
+        A SQL table is returned as two-dimensional data structure with labeled
+        axes.
+
+    See Also
+    --------
+    read_sql_query : Read SQL query into a DataFrame.
+    read_sql : Read SQL query or database table into a DataFrame.
+
+    Notes
+    -----
+    Any datetime values with time zone information will be converted to UTC.
+
+    Examples
+    --------
+    >>> pd.read_sql_table("table_name", "postgres:///db_name")  # doctest:+SKIP
+    """
+
+    check_dtype_backend(dtype_backend)
+    if dtype_backend is lib.no_default:
+        dtype_backend = "numpy"  # type: ignore[assignment]
+    assert dtype_backend is not lib.no_default
+
+    with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql:
+        if not pandas_sql.has_table(table_name):
+            raise ValueError(f"Table {table_name} not found")
+
+        table = pandas_sql.read_table(
+            table_name,
+            index_col=index_col,
+            coerce_float=coerce_float,
+            parse_dates=parse_dates,
+            columns=columns,
+            chunksize=chunksize,
+            dtype_backend=dtype_backend,
+        )
+
+    if table is not None:
+        return table
+    else:
+        raise ValueError(f"Table {table_name} not found", con)
+
+
+@overload
+def read_sql_query(  # pyright: ignore[reportOverlappingOverload]
+    sql,
+    con,
+    index_col: str | list[str] | None = ...,
+    coerce_float=...,
+    params: list[Any] | Mapping[str, Any] | None = ...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
+    chunksize: None = ...,
+    dtype: DtypeArg | None = ...,
+    dtype_backend: DtypeBackend | lib.NoDefault = ...,
+) -> DataFrame: ...
+
+
+@overload
+def read_sql_query(
+    sql,
+    con,
+    index_col: str | list[str] | None = ...,
+    coerce_float=...,
+    params: list[Any] | Mapping[str, Any] | None = ...,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = ...,
+    chunksize: int = ...,
+    dtype: DtypeArg | None = ...,
+    dtype_backend: DtypeBackend | lib.NoDefault = ...,
+) -> Iterator[DataFrame]: ...
+
+
+@set_module("pandas")
+def read_sql_query(
+    sql,
+    con,
+    index_col: str | list[str] | None = None,
+    coerce_float: bool = True,
+    params: list[Any] | Mapping[str, Any] | None = None,
+    parse_dates: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None = None,
+    chunksize: int | None = None,
+    dtype: DtypeArg | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+) -> DataFrame | Iterator[DataFrame]:
+    """
+    Read SQL query into a DataFrame.
+
+    Returns a DataFrame corresponding to the result set of the query
+    string. Optionally provide an `index_col` parameter to use one of the
+    columns as the index, otherwise default integer index will be used.
+
+    Parameters
+    ----------
+    sql : str SQL query or SQLAlchemy Selectable (select or text object)
+        SQL query to be executed.
+    con : SQLAlchemy connectable, str, or sqlite3 connection
+        Using SQLAlchemy makes it possible to use any DB supported by that
+        library. If a DBAPI2 object, only sqlite3 is supported.
+    index_col : str or list of str, optional, default: None
+        Column(s) to set as index(MultiIndex).
+    coerce_float : bool, default True
+        Attempts to convert values of non-string, non-numeric objects (like
+        decimal.Decimal) to floating point. Useful for SQL result sets.
+    params : list, tuple or mapping, optional, default: None
+        List of parameters to pass to execute method.  The syntax used
+        to pass parameters is database driver dependent. Check your
+        database driver documentation for which of the five syntax styles,
+        described in PEP 249's paramstyle, is supported.
+        Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}.
+    parse_dates : list or dict, default: None
+        - List of column names to parse as dates.
+        - Dict of ``{column_name: format string}`` where format string is
+          strftime compatible in case of parsing string times, or is one of
+          (D, s, ns, ms, us) in case of parsing integer timestamps.
+        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
+          to the keyword arguments of :func:`pandas.to_datetime`
+          Especially useful with databases without native Datetime support,
+          such as SQLite.
+    chunksize : int, default None
+        If specified, return an iterator where `chunksize` is the number of
+        rows to include in each chunk.
+    dtype : Type name or dict of columns
+        Data type for data or columns. E.g. np.float64 or
+        {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    DataFrame or Iterator[DataFrame]
+        Returns a DataFrame object that contains the result set of the
+        executed SQL query, in relation to the specified database connection.
+
+    See Also
+    --------
+    read_sql_table : Read SQL database table into a DataFrame.
+    read_sql : Read SQL query or database table into a DataFrame.
+
+    Notes
+    -----
+    Any datetime values with time zone information parsed via the `parse_dates`
+    parameter will be converted to UTC.
+
+    Examples
+    --------
+    >>> from sqlalchemy import create_engine  # doctest: +SKIP
+    >>> engine = create_engine("sqlite:///database.db")  # doctest: +SKIP
+    >>> sql_query = "SELECT int_column FROM test_data"  # doctest: +SKIP
+    >>> with engine.connect() as conn, conn.begin():  # doctest: +SKIP
+    ...     data = pd.read_sql_query(sql_query, conn)  # doctest: +SKIP
+    """
+
+    check_dtype_backend(dtype_backend)
+    if dtype_backend is lib.no_default:
+        dtype_backend = "numpy"  # type: ignore[assignment]
+    assert dtype_backend is not lib.no_default
+
+    with pandasSQL_builder(con) as pandas_sql:
+        return pandas_sql.read_query(
+            sql,
+            index_col=index_col,
+            params=params,
+            coerce_float=coerce_float,
+            parse_dates=parse_dates,
+            chunksize=chunksize,
+            dtype=dtype,
+            dtype_backend=dtype_backend,
+        )
+
+
+@overload
+def read_sql(  # pyright: ignore[reportOverlappingOverload]
+    sql,
+    con,
+    index_col: str | list[str] | None = ...,
+    coerce_float=...,
+    params=...,
+    parse_dates=...,
+    columns: list[str] = ...,
+    chunksize: None = ...,
+    dtype_backend: DtypeBackend | lib.NoDefault = ...,
+    dtype: DtypeArg | None = None,
+) -> DataFrame: ...
+
+
+@overload
+def read_sql(
+    sql,
+    con,
+    index_col: str | list[str] | None = ...,
+    coerce_float=...,
+    params=...,
+    parse_dates=...,
+    columns: list[str] = ...,
+    chunksize: int = ...,
+    dtype_backend: DtypeBackend | lib.NoDefault = ...,
+    dtype: DtypeArg | None = None,
+) -> Iterator[DataFrame]: ...
+
+
+@set_module("pandas")
+def read_sql(
+    sql,
+    con,
+    index_col: str | list[str] | None = None,
+    coerce_float: bool = True,
+    params=None,
+    parse_dates=None,
+    columns: list[str] | None = None,
+    chunksize: int | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    dtype: DtypeArg | None = None,
+) -> DataFrame | Iterator[DataFrame]:
+    """
+    Read SQL query or database table into a DataFrame.
+
+    This function is a convenience wrapper around ``read_sql_table`` and
+    ``read_sql_query`` (for backward compatibility). It will delegate
+    to the specific function depending on the provided input. A SQL query
+    will be routed to ``read_sql_query``, while a database table name will
+    be routed to ``read_sql_table``. Note that the delegated function might
+    have more specific notes about their functionality not listed here.
+
+    Parameters
+    ----------
+    sql : str or SQLAlchemy Selectable (select or text object)
+        SQL query to be executed or a table name.
+    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
+        ADBC provides high performance I/O with native type support, where available.
+        Using SQLAlchemy makes it possible to use any DB supported by that
+        library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
+        for engine disposal and connection closure for the ADBC connection and
+        SQLAlchemy connectable; str connections are closed automatically. See
+        `here <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
+    index_col : str or list of str, optional, default: None
+        Column(s) to set as index(MultiIndex).
+    coerce_float : bool, default True
+        Attempts to convert values of non-string, non-numeric objects (like
+        decimal.Decimal) to floating point, useful for SQL result sets.
+    params : list, tuple or dict, optional, default: None
+        List of parameters to pass to execute method.  The syntax used
+        to pass parameters is database driver dependent. Check your
+        database driver documentation for which of the five syntax styles,
+        described in PEP 249's paramstyle, is supported.
+        Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}.
+    parse_dates : list or dict, default: None
+        - List of column names to parse as dates.
+        - Dict of ``{column_name: format string}`` where format string is
+          strftime compatible in case of parsing string times, or is one of
+          (D, s, ns, ms, us) in case of parsing integer timestamps.
+        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
+          to the keyword arguments of :func:`pandas.to_datetime`
+          Especially useful with databases without native Datetime support,
+          such as SQLite.
+    columns : list, default: None
+        List of column names to select from SQL table (only used when reading
+        a table).
+    chunksize : int, default None
+        If specified, return an iterator where `chunksize` is the
+        number of rows to include in each chunk.
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+    dtype : Type name or dict of columns
+        Data type for data or columns. E.g. np.float64 or
+        {'a': np.float64, 'b': np.int32, 'c': 'Int64'}.
+        The argument is ignored if a table is passed instead of a query.
+
+        .. versionadded:: 2.0.0
+
+    Returns
+    -------
+    DataFrame or Iterator[DataFrame]
+        Returns a DataFrame object that contains the result set of the
+        executed SQL query or an SQL Table based on the provided input,
+        in relation to the specified database connection.
+
+    See Also
+    --------
+    read_sql_table : Read SQL database table into a DataFrame.
+    read_sql_query : Read SQL query into a DataFrame.
+
+    Notes
+    -----
+    ``pandas`` does not attempt to sanitize SQL statements;
+    instead it simply forwards the statement you are executing
+    to the underlying driver, which may or may not sanitize from there.
+    Please refer to the underlying driver documentation for any details.
+    Generally, be wary when accepting statements from arbitrary sources.
+
+    Examples
+    --------
+    Read data from SQL via either a SQL query or a SQL tablename.
+    When using a SQLite database only SQL queries are accepted,
+    providing only the SQL tablename will result in an error.
+
+    >>> from sqlite3 import connect
+    >>> conn = connect(":memory:")
+    >>> df = pd.DataFrame(
+    ...     data=[[0, "10/11/12"], [1, "12/11/10"]],
+    ...     columns=["int_column", "date_column"],
+    ... )
+    >>> df.to_sql(name="test_data", con=conn)
+    2
+
+    >>> pd.read_sql("SELECT int_column, date_column FROM test_data", conn)
+       int_column date_column
+    0           0    10/11/12
+    1           1    12/11/10
+
+    >>> pd.read_sql("test_data", "postgres:///db_name")  # doctest:+SKIP
+
+    For parameterized query, using ``params`` is recommended over string interpolation.
+
+    >>> from sqlalchemy import text
+    >>> sql = text(
+    ...     "SELECT int_column, date_column FROM test_data WHERE int_column=:int_val"
+    ... )
+    >>> pd.read_sql(sql, conn, params={"int_val": 1})  # doctest:+SKIP
+       int_column date_column
+    0           1    12/11/10
+
+    Apply date parsing to columns through the ``parse_dates`` argument
+    The ``parse_dates`` argument calls ``pd.to_datetime`` on the provided columns.
+    Custom argument values for applying ``pd.to_datetime`` on a column are specified
+    via a dictionary format:
+
+    >>> pd.read_sql(
+    ...     "SELECT int_column, date_column FROM test_data",
+    ...     conn,
+    ...     parse_dates={"date_column": {"format": "%d/%m/%y"}},
+    ... )
+       int_column date_column
+    0           0  2012-11-10
+    1           1  2010-11-12
+
+    .. versionadded:: 2.2.0
+
+       pandas now supports reading via ADBC drivers
+
+    >>> from adbc_driver_postgresql import dbapi  # doctest:+SKIP
+    >>> with dbapi.connect("postgres:///db_name") as conn:  # doctest:+SKIP
+    ...     pd.read_sql("SELECT int_column FROM test_data", conn)
+       int_column
+    0           0
+    1           1
+    """
+
+    check_dtype_backend(dtype_backend)
+    if dtype_backend is lib.no_default:
+        dtype_backend = "numpy"  # type: ignore[assignment]
+    assert dtype_backend is not lib.no_default
+
+    with pandasSQL_builder(con) as pandas_sql:
+        if isinstance(pandas_sql, SQLiteDatabase):
+            return pandas_sql.read_query(
+                sql,
+                index_col=index_col,
+                params=params,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                chunksize=chunksize,
+                dtype_backend=dtype_backend,
+                dtype=dtype,
+            )
+
+        try:
+            _is_table_name = pandas_sql.has_table(sql)
+        except Exception:
+            # using generic exception to catch errors from sql drivers (GH24988)
+            _is_table_name = False
+
+        if _is_table_name:
+            return pandas_sql.read_table(
+                sql,
+                index_col=index_col,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                columns=columns,
+                chunksize=chunksize,
+                dtype_backend=dtype_backend,
+            )
+        else:
+            return pandas_sql.read_query(
+                sql,
+                index_col=index_col,
+                params=params,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                chunksize=chunksize,
+                dtype_backend=dtype_backend,
+                dtype=dtype,
+            )
+
+
+def to_sql(
+    frame,
+    name: str,
+    con,
+    schema: str | None = None,
+    if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+    index: bool = True,
+    index_label: IndexLabel | None = None,
+    chunksize: int | None = None,
+    dtype: DtypeArg | None = None,
+    method: Literal["multi"] | Callable | None = None,
+    engine: str = "auto",
+    **engine_kwargs,
+) -> int | None:
+    """
+    Write records stored in a DataFrame to a SQL database.
+
+    .. warning::
+        The pandas library does not attempt to sanitize inputs provided via a to_sql call.
+        Please refer to the documentation for the underlying database driver to see if it
+        will properly prevent injection, or alternatively be advised of a security risk when
+        executing arbitrary commands in a to_sql call.
+
+    Parameters
+    ----------
+    frame : DataFrame, Series
+    name : str
+        Name of SQL table.
+    con : ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
+        or sqlite3 DBAPI2 connection
+        ADBC provides high performance I/O with native type support, where available.
+        Using SQLAlchemy makes it possible to use any DB supported by that
+        library.
+        If a DBAPI2 object, only sqlite3 is supported.
+    schema : str, optional
+        Name of SQL schema in database to write to (if database flavor
+        supports this). If None, use default schema (default).
+    if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
+        - fail: If table exists, do nothing.
+        - replace: If table exists, drop it, recreate it, and insert data.
+        - append: If table exists, insert data. Create if does not exist.
+        - delete_rows: If a table exists, delete all records and insert data.
+    index : bool, default True
+        Write DataFrame index as a column.
+    index_label : str or sequence, optional
+        Column label for index column(s). If None is given (default) and
+        `index` is True, then the index names are used.
+        A sequence should be given if the DataFrame uses MultiIndex.
+    chunksize : int, optional
+        Specify the number of rows in each batch to be written at a time.
+        By default, all rows will be written at once.
+    dtype : dict or scalar, optional
+        Specifying the datatype for columns. If a dictionary is used, the
+        keys should be the column names and the values should be the
+        SQLAlchemy types or strings for the sqlite3 fallback mode. If a
+        scalar is provided, it will be applied to all columns.
+    method : {None, 'multi', callable}, optional
+        Controls the SQL insertion clause used:
+
+        - None : Uses standard SQL ``INSERT`` clause (one per row).
+        - ``'multi'``: Pass multiple values in a single ``INSERT`` clause.
+        - callable with signature ``(pd_table, conn, keys, data_iter) -> int | None``.
+
+        Details and a sample callable implementation can be found in the
+        section :ref:`insert method <io.sql.method>`.
+    engine : {'auto', 'sqlalchemy'}, default 'auto'
+        SQL engine library to use. If 'auto', then the option
+        ``io.sql.engine`` is used. The default ``io.sql.engine``
+        behavior is 'sqlalchemy'
+
+    **engine_kwargs
+        Any additional kwargs are passed to the engine.
+
+    Returns
+    -------
+    None or int
+        Number of rows affected by to_sql. None is returned if the callable
+        passed into ``method`` does not return an integer number of rows.
+
+    Notes
+    -----
+    The returned rows affected is the sum of the ``rowcount`` attribute of ``sqlite3.Cursor``
+    or SQLAlchemy connectable. If using ADBC the returned rows are the result
+    of ``Cursor.adbc_ingest``. The returned value may not reflect the exact number of written
+    rows as stipulated in the
+    `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
+    `SQLAlchemy <https://docs.sqlalchemy.org/en/14/core/connections.html#sqlalchemy.engine.BaseCursorResult.rowcount>`__
+    """  # noqa: E501
+    if if_exists not in ("fail", "replace", "append", "delete_rows"):
+        raise ValueError(f"'{if_exists}' is not valid for if_exists")
+
+    if isinstance(frame, Series):
+        frame = frame.to_frame()
+    elif not isinstance(frame, DataFrame):
+        raise NotImplementedError(
+            "'frame' argument should be either a Series or a DataFrame"
+        )
+
+    with pandasSQL_builder(con, schema=schema, need_transaction=True) as pandas_sql:
+        return pandas_sql.to_sql(
+            frame,
+            name,
+            if_exists=if_exists,
+            index=index,
+            index_label=index_label,
+            schema=schema,
+            chunksize=chunksize,
+            dtype=dtype,
+            method=method,
+            engine=engine,
+            **engine_kwargs,
+        )
+
+
+def has_table(table_name: str, con, schema: str | None = None) -> bool:
+    """
+    Check if DataBase has named table.
+
+    Parameters
+    ----------
+    table_name: string
+        Name of SQL table.
+    con: ADBC Connection, SQLAlchemy connectable, str, or sqlite3 connection
+        ADBC provides high performance I/O with native type support, where available.
+        Using SQLAlchemy makes it possible to use any DB supported by that
+        library.
+        If a DBAPI2 object, only sqlite3 is supported.
+    schema : string, default None
+        Name of SQL schema in database to write to (if database flavor supports
+        this). If None, use default schema (default).
+
+    Returns
+    -------
+    boolean
+    """
+    with pandasSQL_builder(con, schema=schema) as pandas_sql:
+        return pandas_sql.has_table(table_name)
+
+
+table_exists = has_table
+
+
+def pandasSQL_builder(
+    con,
+    schema: str | None = None,
+    need_transaction: bool = False,
+) -> PandasSQL:
+    """
+    Convenience function to return the correct PandasSQL subclass based on the
+    provided parameters.  Also creates a sqlalchemy connection and transaction
+    if necessary.
+    """
+    import sqlite3
+
+    if isinstance(con, sqlite3.Connection) or con is None:
+        return SQLiteDatabase(con)
+
+    sqlalchemy = import_optional_dependency("sqlalchemy", errors="ignore")
+
+    if isinstance(con, str) and sqlalchemy is None:
+        raise ImportError(
+            f"Using URI string without version '{VERSIONS['sqlalchemy']}' or newer "
+            "of 'sqlalchemy' installed."
+        )
+
+    if sqlalchemy is not None and isinstance(con, (str, sqlalchemy.engine.Connectable)):
+        return SQLDatabase(con, schema, need_transaction)
+
+    adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+    if adbc and isinstance(con, adbc.Connection):
+        return ADBCDatabase(con)
+
+    warnings.warn(
+        "pandas only supports SQLAlchemy connectable (engine/connection) or "
+        "database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 "
+        "objects are not tested. Please consider using SQLAlchemy.",
+        UserWarning,
+        stacklevel=find_stack_level(),
+    )
+    return SQLiteDatabase(con)
+
+
+class SQLTable(PandasObject):
+    """
+    For mapping Pandas tables to SQL tables.
+    Uses fact that table is reflected by SQLAlchemy to
+    do better type conversions.
+    Also holds various flags needed to avoid having to
+    pass them between functions all the time.
+    """
+
+    # TODO: support for multiIndex
+
+    def __init__(
+        self,
+        name: str,
+        pandas_sql_engine,
+        frame=None,
+        index: bool | str | list[str] | None = True,
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+        prefix: str = "pandas",
+        index_label=None,
+        schema=None,
+        keys=None,
+        dtype: DtypeArg | None = None,
+    ) -> None:
+        self.name = name
+        self.pd_sql = pandas_sql_engine
+        self.prefix = prefix
+        self.frame = frame
+        self.index = self._index_name(index, index_label)
+        self.schema = schema
+        self.if_exists = if_exists
+        self.keys = keys
+        self.dtype = dtype
+
+        if frame is not None:
+            # We want to initialize based on a dataframe
+            self.table = self._create_table_setup()
+        else:
+            # no data provided, read-only mode
+            self.table = self.pd_sql.get_table(self.name, self.schema)
+
+        if self.table is None:
+            raise ValueError(f"Could not init table '{name}'")
+
+        if not len(self.name):
+            raise ValueError("Empty table name specified")
+
+    def exists(self):
+        return self.pd_sql.has_table(self.name, self.schema)
+
+    def sql_schema(self) -> str:
+        from sqlalchemy.schema import CreateTable
+
+        return str(CreateTable(self.table).compile(self.pd_sql.con))
+
+    def _execute_create(self) -> None:
+        # Inserting table into database, add to MetaData object
+        self.table = self.table.to_metadata(self.pd_sql.meta)
+        with self.pd_sql.run_transaction():
+            self.table.create(bind=self.pd_sql.con)
+
+    def create(self) -> None:
+        if self.exists():
+            if self.if_exists == "fail":
+                raise ValueError(f"Table '{self.name}' already exists.")
+            elif self.if_exists == "replace":
+                self.pd_sql.drop_table(self.name, self.schema)
+                self._execute_create()
+            elif self.if_exists == "append":
+                pass
+            elif self.if_exists == "delete_rows":
+                self.pd_sql.delete_rows(self.name, self.schema)
+            else:
+                raise ValueError(f"'{self.if_exists}' is not valid for if_exists")
+        else:
+            self._execute_create()
+
+    def _execute_insert(self, conn, keys: list[str], data_iter) -> int:
+        """
+        Execute SQL statement inserting data
+
+        Parameters
+        ----------
+        conn : sqlalchemy.engine.Engine or sqlalchemy.engine.Connection
+        keys : list of str
+           Column names
+        data_iter : generator of list
+           Each item contains a list of values to be inserted
+        """
+        data = [dict(zip(keys, row, strict=True)) for row in data_iter]
+        result = self.pd_sql.execute(self.table.insert(), data)
+        return result.rowcount
+
+    def _execute_insert_multi(self, conn, keys: list[str], data_iter) -> int:
+        """
+        Alternative to _execute_insert for DBs support multi-value INSERT.
+
+        Note: multi-value insert is usually faster for analytics DBs
+        and tables containing a few columns
+        but performance degrades quickly with increase of columns.
+
+        """
+
+        from sqlalchemy import insert
+
+        data = [dict(zip(keys, row, strict=True)) for row in data_iter]
+        stmt = insert(self.table).values(data)
+        result = self.pd_sql.execute(stmt)
+        return result.rowcount
+
+    def insert_data(self) -> tuple[list[str], list[np.ndarray]]:
+        if self.index is not None:
+            temp = self.frame.copy(deep=False)
+            temp.index.names = self.index
+            try:
+                temp.reset_index(inplace=True)
+            except ValueError as err:
+                raise ValueError(f"duplicate name in index/columns: {err}") from err
+        else:
+            temp = self.frame
+
+        column_names = list(map(str, temp.columns))
+        ncols = len(column_names)
+        # this just pre-allocates the list: None's will be replaced with ndarrays
+        # error: List item 0 has incompatible type "None"; expected "ndarray"
+        data_list: list[np.ndarray] = [None] * ncols  # type: ignore[list-item]
+
+        for i, (_, ser) in enumerate(temp.items()):
+            if ser.dtype.kind == "M":
+                if isinstance(ser._values, ArrowExtensionArray):
+                    import pyarrow as pa
+
+                    if pa.types.is_date(ser.dtype.pyarrow_dtype):
+                        # GH#53854 to_pydatetime not supported for pyarrow date dtypes
+                        d = ser._values.to_numpy(dtype=object)
+                    else:
+                        d = ser.dt.to_pydatetime()._values
+                else:
+                    d = ser._values.to_pydatetime()
+            elif ser.dtype.kind == "m":
+                vals = ser._values
+                if isinstance(vals, ArrowExtensionArray):
+                    vals = vals.to_numpy(dtype=np.dtype("m8[ns]"))
+                # store as integers, see GH#6921, GH#7076
+                d = vals.view("i8").astype(object)
+            else:
+                d = ser._values.astype(object)
+
+            assert isinstance(d, np.ndarray), type(d)
+
+            if ser._can_hold_na:
+                # Note: this will miss timedeltas since they are converted to int
+                mask = isna(d)
+                d[mask] = None
+
+            data_list[i] = d
+
+        return column_names, data_list
+
+    def insert(
+        self,
+        chunksize: int | None = None,
+        method: Literal["multi"] | Callable | None = None,
+    ) -> int | None:
+        # set insert method
+        if method is None:
+            exec_insert = self._execute_insert
+        elif method == "multi":
+            exec_insert = self._execute_insert_multi
+        elif callable(method):
+            exec_insert = partial(method, self)
+        else:
+            raise ValueError(f"Invalid parameter `method`: {method}")
+
+        keys, data_list = self.insert_data()
+
+        nrows = len(self.frame)
+
+        if nrows == 0:
+            return 0
+
+        if chunksize is None:
+            chunksize = nrows
+        elif chunksize == 0:
+            raise ValueError("chunksize argument should be non-zero")
+
+        chunks = (nrows // chunksize) + 1
+        total_inserted = None
+        with self.pd_sql.run_transaction() as conn:
+            for i in range(chunks):
+                start_i = i * chunksize
+                end_i = min((i + 1) * chunksize, nrows)
+                if start_i >= end_i:
+                    break
+
+                chunk_iter = zip(
+                    *(arr[start_i:end_i] for arr in data_list), strict=True
+                )
+                num_inserted = exec_insert(conn, keys, chunk_iter)
+                # GH 46891
+                if num_inserted is not None:
+                    if total_inserted is None:
+                        total_inserted = num_inserted
+                    else:
+                        total_inserted += num_inserted
+        return total_inserted
+
+    def _query_iterator(
+        self,
+        result,
+        exit_stack: ExitStack,
+        chunksize: int | None,
+        columns,
+        coerce_float: bool = True,
+        parse_dates=None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> Generator[DataFrame]:
+        """Return generator through chunked result set."""
+        has_read_data = False
+        with exit_stack:
+            while True:
+                data = result.fetchmany(chunksize)
+                if not data:
+                    if not has_read_data:
+                        yield DataFrame.from_records(
+                            [], columns=columns, coerce_float=coerce_float
+                        )
+                    break
+
+                has_read_data = True
+                self.frame = _convert_arrays_to_dataframe(
+                    data, columns, coerce_float, dtype_backend
+                )
+
+                self._harmonize_columns(
+                    parse_dates=parse_dates, dtype_backend=dtype_backend
+                )
+
+                if self.index is not None:
+                    self.frame.set_index(self.index, inplace=True)
+
+                yield self.frame
+
+    def read(
+        self,
+        exit_stack: ExitStack,
+        coerce_float: bool = True,
+        parse_dates=None,
+        columns=None,
+        chunksize: int | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        from sqlalchemy import select
+
+        if columns is not None and len(columns) > 0:
+            cols = [self.table.c[n] for n in columns]
+            if self.index is not None:
+                for idx in self.index[::-1]:
+                    cols.insert(0, self.table.c[idx])
+            sql_select = select(*cols)
+        else:
+            sql_select = select(self.table)
+        result = self.pd_sql.execute(sql_select)
+        column_names = result.keys()
+
+        if chunksize is not None:
+            return self._query_iterator(
+                result,
+                exit_stack,
+                chunksize,
+                column_names,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                dtype_backend=dtype_backend,
+            )
+        else:
+            data = result.fetchall()
+            self.frame = _convert_arrays_to_dataframe(
+                data, column_names, coerce_float, dtype_backend
+            )
+
+            self._harmonize_columns(
+                parse_dates=parse_dates, dtype_backend=dtype_backend
+            )
+
+            if self.index is not None:
+                self.frame.set_index(self.index, inplace=True)
+
+            return self.frame
+
+    def _index_name(self, index, index_label):
+        # for writing: index=True to include index in sql table
+        if index is True:
+            nlevels = self.frame.index.nlevels
+            # if index_label is specified, set this as index name(s)
+            if index_label is not None:
+                if not isinstance(index_label, list):
+                    index_label = [index_label]
+                if len(index_label) != nlevels:
+                    raise ValueError(
+                        "Length of 'index_label' should match number of "
+                        f"levels, which is {nlevels}"
+                    )
+                return index_label
+            # return the used column labels for the index columns
+            if (
+                nlevels == 1
+                and "index" not in self.frame.columns
+                and self.frame.index.name is None
+            ):
+                return ["index"]
+            else:
+                return com.fill_missing_names(self.frame.index.names)
+
+        # for reading: index=(list of) string to specify column to set as index
+        elif isinstance(index, str):
+            return [index]
+        elif isinstance(index, list):
+            return index
+        else:
+            return None
+
+    def _get_column_names_and_types(self, dtype_mapper):
+        column_names_and_types = []
+        if self.index is not None:
+            for i, idx_label in enumerate(self.index):
+                idx_type = dtype_mapper(self.frame.index._get_level_values(i))
+                column_names_and_types.append((str(idx_label), idx_type, True))
+
+        column_names_and_types += [
+            (str(self.frame.columns[i]), dtype_mapper(self.frame.iloc[:, i]), False)
+            for i in range(len(self.frame.columns))
+        ]
+
+        return column_names_and_types
+
+    def _create_table_setup(self):
+        from sqlalchemy import (
+            Column,
+            PrimaryKeyConstraint,
+            Table,
+        )
+        from sqlalchemy.schema import MetaData
+
+        column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type)
+
+        columns: list[Any] = [
+            Column(name, typ, index=is_index)
+            for name, typ, is_index in column_names_and_types
+        ]
+
+        if self.keys is not None:
+            if not is_list_like(self.keys):
+                keys = [self.keys]
+            else:
+                keys = self.keys
+            pkc = PrimaryKeyConstraint(*keys, name=self.name + "_pk")
+            columns.append(pkc)
+
+        schema = self.schema or self.pd_sql.meta.schema
+
+        # At this point, attach to new metadata, only attach to self.meta
+        # once table is created.
+        meta = MetaData()
+        return Table(self.name, meta, *columns, schema=schema)
+
+    def _harmonize_columns(
+        self,
+        parse_dates=None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> None:
+        """
+        Make the DataFrame's column types align with the SQL table
+        column types.
+        Need to work around limited NA value support. Floats are always
+        fine, ints must always be floats if there are Null values.
+        Booleans are hard because converting bool column with None replaces
+        all Nones with false. Therefore only convert bool if there are no
+        NA values.
+        Datetimes should already be converted to np.datetime64 if supported,
+        but here we also force conversion if required.
+        """
+        parse_dates = _process_parse_dates_argument(parse_dates)
+
+        for sql_col in self.table.columns:
+            col_name = sql_col.name
+            try:
+                df_col = self.frame[col_name]
+
+                # Handle date parsing upfront; don't try to convert columns
+                # twice
+                if col_name in parse_dates:
+                    try:
+                        fmt = parse_dates[col_name]
+                    except TypeError:
+                        fmt = None
+                    self.frame[col_name] = _handle_date_column(df_col, format=fmt)
+                    continue
+
+                # the type the dataframe column should have
+                col_type = self._get_dtype(sql_col.type)
+
+                if (
+                    col_type is datetime
+                    or col_type is date
+                    or col_type is DatetimeTZDtype
+                ):
+                    # Convert tz-aware Datetime SQL columns to UTC
+                    utc = col_type is DatetimeTZDtype
+                    self.frame[col_name] = _handle_date_column(df_col, utc=utc)
+                elif dtype_backend == "numpy" and col_type is float:
+                    # floats support NA, can always convert!
+                    self.frame[col_name] = df_col.astype(col_type)
+                elif (
+                    using_string_dtype()
+                    and is_string_dtype(col_type)
+                    and is_object_dtype(self.frame[col_name])
+                ):
+                    self.frame[col_name] = df_col.astype(col_type)
+                elif dtype_backend == "numpy" and len(df_col) == df_col.count():
+                    # No NA values, can convert ints and bools
+                    if col_type is np.dtype("int64") or col_type is bool:
+                        self.frame[col_name] = df_col.astype(col_type)
+            except KeyError:
+                pass  # this column not in results
+
+    def _sqlalchemy_type(self, col: Index | Series):
+        dtype: DtypeArg = self.dtype or {}
+        if is_dict_like(dtype):
+            dtype = cast(dict, dtype)
+            if col.name in dtype:
+                return dtype[col.name]
+
+        # Infer type of column, while ignoring missing values.
+        # Needed for inserting typed data containing NULLs, GH 8778.
+        col_type = lib.infer_dtype(col, skipna=True)
+
+        from sqlalchemy.types import (
+            TIMESTAMP,
+            BigInteger,
+            Boolean,
+            Date,
+            DateTime,
+            Float,
+            Integer,
+            SmallInteger,
+            Text,
+            Time,
+        )
+
+        if col_type in ("datetime64", "datetime"):
+            # GH 9086: TIMESTAMP is the suggested type if the column contains
+            # timezone information
+            try:
+                # error: Item "Index" of "Union[Index, Series]" has no attribute "dt"
+                if col.dt.tz is not None:  # type: ignore[union-attr]
+                    return TIMESTAMP(timezone=True)
+            except AttributeError:
+                # The column is actually a DatetimeIndex
+                # GH 26761 or an Index with date-like data e.g. 9999-01-01
+                if getattr(col, "tz", None) is not None:
+                    return TIMESTAMP(timezone=True)
+            return DateTime
+        if col_type == "timedelta64":
+            warnings.warn(
+                "the 'timedelta' type is not supported, and will be "
+                "written as integer values (ns frequency) to the database.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+            return BigInteger
+        elif col_type == "floating":
+            if col.dtype == "float32":
+                return Float(precision=23)
+            else:
+                return Float(precision=53)
+        elif col_type == "integer":
+            # GH35076 Map pandas integer to optimal SQLAlchemy integer type
+            if col.dtype.name.lower() in ("int8", "uint8", "int16"):
+                return SmallInteger
+            elif col.dtype.name.lower() in ("uint16", "int32"):
+                return Integer
+            elif col.dtype.name.lower() == "uint64":
+                raise ValueError("Unsigned 64 bit integer datatype is not supported")
+            else:
+                return BigInteger
+        elif col_type == "boolean":
+            return Boolean
+        elif col_type == "date":
+            return Date
+        elif col_type == "time":
+            return Time
+        elif col_type == "complex":
+            raise ValueError("Complex datatypes not supported")
+
+        return Text
+
+    def _get_dtype(self, sqltype):
+        from sqlalchemy.types import (
+            TIMESTAMP,
+            Boolean,
+            Date,
+            DateTime,
+            Float,
+            Integer,
+            String,
+        )
+
+        if isinstance(sqltype, Float):
+            return float
+        elif isinstance(sqltype, Integer):
+            # TODO: Refine integer size.
+            return np.dtype("int64")
+        elif isinstance(sqltype, TIMESTAMP):
+            # we have a timezone capable type
+            if not sqltype.timezone:
+                return datetime
+            return DatetimeTZDtype
+        elif isinstance(sqltype, DateTime):
+            # Caution: np.datetime64 is also a subclass of np.number.
+            return datetime
+        elif isinstance(sqltype, Date):
+            return date
+        elif isinstance(sqltype, Boolean):
+            return bool
+        elif isinstance(sqltype, String):
+            if using_string_dtype():
+                return StringDtype(na_value=np.nan)
+
+        return object
+
+
+class PandasSQL(PandasObject, ABC):
+    """
+    Subclasses Should define read_query and to_sql.
+    """
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(self, *args) -> None:
+        pass
+
+    def read_table(
+        self,
+        table_name: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        columns=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        raise NotImplementedError
+
+    @abstractmethod
+    def read_query(
+        self,
+        sql: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        params=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        pass
+
+    @abstractmethod
+    def to_sql(
+        self,
+        frame,
+        name: str,
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+        index: bool = True,
+        index_label=None,
+        schema=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        method: Literal["multi"] | Callable | None = None,
+        engine: str = "auto",
+        **engine_kwargs,
+    ) -> int | None:
+        pass
+
+    @abstractmethod
+    def execute(self, sql: str | Select | TextClause, params=None):
+        pass
+
+    @abstractmethod
+    def has_table(self, name: str, schema: str | None = None) -> bool:
+        pass
+
+    @abstractmethod
+    def _create_sql_schema(
+        self,
+        frame: DataFrame,
+        table_name: str,
+        keys: list[str] | None = None,
+        dtype: DtypeArg | None = None,
+        schema: str | None = None,
+    ) -> str:
+        pass
+
+
+class BaseEngine:
+    def insert_records(
+        self,
+        table: SQLTable,
+        con,
+        frame,
+        name: str,
+        index: bool | str | list[str] | None = True,
+        schema=None,
+        chunksize: int | None = None,
+        method=None,
+        **engine_kwargs,
+    ) -> int | None:
+        """
+        Inserts data into already-prepared table
+        """
+        raise AbstractMethodError(self)
+
+
+class SQLAlchemyEngine(BaseEngine):
+    def __init__(self) -> None:
+        import_optional_dependency(
+            "sqlalchemy", extra="sqlalchemy is required for SQL support."
+        )
+
+    def insert_records(
+        self,
+        table: SQLTable,
+        con,
+        frame,
+        name: str,
+        index: bool | str | list[str] | None = True,
+        schema=None,
+        chunksize: int | None = None,
+        method=None,
+        **engine_kwargs,
+    ) -> int | None:
+        from sqlalchemy import exc
+
+        try:
+            return table.insert(chunksize=chunksize, method=method)
+        except exc.StatementError as err:
+            # GH34431
+            # https://stackoverflow.com/a/67358288/6067848
+            msg = r"""(\(1054, "Unknown column 'inf(e0)?' in 'field list'"\))(?#
+            )|inf can not be used with MySQL"""
+            err_text = str(err.orig)
+            if re.search(msg, err_text):
+                raise ValueError("inf cannot be used with MySQL") from err
+            raise err
+
+
+def get_engine(engine: str) -> BaseEngine:
+    """return our implementation"""
+    if engine == "auto":
+        engine = get_option("io.sql.engine")
+
+    if engine == "auto":
+        # try engines in this order
+        engine_classes = [SQLAlchemyEngine]
+
+        error_msgs = ""
+        for engine_class in engine_classes:
+            try:
+                return engine_class()
+            except ImportError as err:
+                error_msgs += "\n - " + str(err)
+
+        raise ImportError(
+            "Unable to find a usable engine; "
+            "tried using: 'sqlalchemy'.\n"
+            "A suitable version of "
+            "sqlalchemy is required for sql I/O "
+            "support.\n"
+            "Trying to import the above resulted in these errors:"
+            f"{error_msgs}"
+        )
+
+    if engine == "sqlalchemy":
+        return SQLAlchemyEngine()
+
+    raise ValueError("engine must be one of 'auto', 'sqlalchemy'")
+
+
+class SQLDatabase(PandasSQL):
+    """
+    This class enables conversion between DataFrame and SQL databases
+    using SQLAlchemy to handle DataBase abstraction.
+
+    Parameters
+    ----------
+    con : SQLAlchemy Connectable or URI string.
+        Connectable to connect with the database. Using SQLAlchemy makes it
+        possible to use any DB supported by that library.
+    schema : string, default None
+        Name of SQL schema in database to write to (if database flavor
+        supports this). If None, use default schema (default).
+    need_transaction : bool, default False
+        If True, SQLDatabase will create a transaction.
+
+    """
+
+    def __init__(
+        self, con, schema: str | None = None, need_transaction: bool = False
+    ) -> None:
+        from sqlalchemy import create_engine
+        from sqlalchemy.engine import Engine
+        from sqlalchemy.schema import MetaData
+
+        # self.exit_stack cleans up the Engine and Connection and commits the
+        # transaction if any of those objects was created below.
+        # Cleanup happens either in self.__exit__ or at the end of the iterator
+        # returned by read_sql when chunksize is not None.
+        self.exit_stack = ExitStack()
+        if isinstance(con, str):
+            con = create_engine(con)
+            self.exit_stack.callback(con.dispose)
+        if isinstance(con, Engine):
+            con = self.exit_stack.enter_context(con.connect())
+        if need_transaction and not con.in_transaction():
+            self.exit_stack.enter_context(con.begin())
+        self.con = con
+        self.meta = MetaData(schema=schema)
+        self.returns_generator = False
+
+    def __exit__(self, *args) -> None:
+        if not self.returns_generator:
+            self.exit_stack.close()
+
+    @contextmanager
+    def run_transaction(self):
+        if not self.con.in_transaction():
+            with self.con.begin():
+                yield self.con
+        else:
+            yield self.con
+
+    def execute(self, sql: str | Select | TextClause | Delete, params=None):
+        """Simple passthrough to SQLAlchemy connectable"""
+        from sqlalchemy.exc import SQLAlchemyError
+
+        args = [] if params is None else [params]
+        if isinstance(sql, str):
+            execute_function = self.con.exec_driver_sql
+        else:
+            execute_function = self.con.execute
+
+        try:
+            return execute_function(sql, *args)
+        except SQLAlchemyError as exc:
+            raise DatabaseError(f"Execution failed on sql '{sql}': {exc}") from exc
+
+    def read_table(
+        self,
+        table_name: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        columns=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        """
+        Read SQL database table into a DataFrame.
+
+        Parameters
+        ----------
+        table_name : str
+            Name of SQL table in database.
+        index_col : string, optional, default: None
+            Column to set as index.
+        coerce_float : bool, default True
+            Attempts to convert values of non-string, non-numeric objects
+            (like decimal.Decimal) to floating point. This can result in
+            loss of precision.
+        parse_dates : list or dict, default: None
+            - List of column names to parse as dates.
+            - Dict of ``{column_name: format string}`` where format string is
+              strftime compatible in case of parsing string times, or is one of
+              (D, s, ns, ms, us) in case of parsing integer timestamps.
+            - Dict of ``{column_name: arg}``, where the arg corresponds
+              to the keyword arguments of :func:`pandas.to_datetime`.
+              Especially useful with databases without native Datetime support,
+              such as SQLite.
+        columns : list, default: None
+            List of column names to select from SQL table.
+        schema : string, default None
+            Name of SQL schema in database to query (if database flavor
+            supports this).  If specified, this overwrites the default
+            schema of the SQL database object.
+        chunksize : int, default None
+            If specified, return an iterator where `chunksize` is the number
+            of rows to include in each chunk.
+        dtype_backend : {'numpy_nullable', 'pyarrow'}
+            Back-end data type applied to the resultant :class:`DataFrame`
+            (still experimental). If not specified, the default behavior
+            is to not use nullable data types. If specified, the behavior
+            is as follows:
+
+            * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+            * ``"pyarrow"``: returns pyarrow-backed nullable
+              :class:`ArrowDtype` :class:`DataFrame`
+
+            .. versionadded:: 2.0
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        pandas.read_sql_table
+        SQLDatabase.read_query
+
+        """
+        self.meta.reflect(bind=self.con, only=[table_name], views=True)
+        table = SQLTable(table_name, self, index=index_col, schema=schema)
+        if chunksize is not None:
+            self.returns_generator = True
+        return table.read(
+            self.exit_stack,
+            coerce_float=coerce_float,
+            parse_dates=parse_dates,
+            columns=columns,
+            chunksize=chunksize,
+            dtype_backend=dtype_backend,
+        )
+
+    @staticmethod
+    def _query_iterator(
+        result,
+        exit_stack: ExitStack,
+        chunksize: int,
+        columns,
+        index_col=None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> Generator[DataFrame]:
+        """Return generator through chunked result set"""
+        has_read_data = False
+        with exit_stack:
+            while True:
+                data = result.fetchmany(chunksize)
+                if not data:
+                    if not has_read_data:
+                        yield _wrap_result(
+                            [],
+                            columns,
+                            index_col=index_col,
+                            coerce_float=coerce_float,
+                            parse_dates=parse_dates,
+                            dtype=dtype,
+                            dtype_backend=dtype_backend,
+                        )
+                    break
+
+                has_read_data = True
+                yield _wrap_result(
+                    data,
+                    columns,
+                    index_col=index_col,
+                    coerce_float=coerce_float,
+                    parse_dates=parse_dates,
+                    dtype=dtype,
+                    dtype_backend=dtype_backend,
+                )
+
+    def read_query(
+        self,
+        sql: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        params=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        """
+        Read SQL query into a DataFrame.
+
+        Parameters
+        ----------
+        sql : str
+            SQL query to be executed.
+        index_col : string, optional, default: None
+            Column name to use as index for the returned DataFrame object.
+        coerce_float : bool, default True
+            Attempt to convert values of non-string, non-numeric objects (like
+            decimal.Decimal) to floating point, useful for SQL result sets.
+        params : list, tuple or dict, optional, default: None
+            List of parameters to pass to execute method.  The syntax used
+            to pass parameters is database driver dependent. Check your
+            database driver documentation for which of the five syntax styles,
+            described in PEP 249's paramstyle, is supported.
+            Eg. for psycopg2, uses %(name)s so use params={'name' : 'value'}
+        parse_dates : list or dict, default: None
+            - List of column names to parse as dates.
+            - Dict of ``{column_name: format string}`` where format string is
+              strftime compatible in case of parsing string times, or is one of
+              (D, s, ns, ms, us) in case of parsing integer timestamps.
+            - Dict of ``{column_name: arg dict}``, where the arg dict
+              corresponds to the keyword arguments of
+              :func:`pandas.to_datetime` Especially useful with databases
+              without native Datetime support, such as SQLite.
+        chunksize : int, default None
+            If specified, return an iterator where `chunksize` is the number
+            of rows to include in each chunk.
+        dtype : Type name or dict of columns
+            Data type for data or columns. E.g. np.float64 or
+            {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        read_sql_table : Read SQL database table into a DataFrame.
+        read_sql
+
+        """
+        result = self.execute(sql, params)
+        columns = result.keys()
+
+        if chunksize is not None:
+            self.returns_generator = True
+            return self._query_iterator(
+                result,
+                self.exit_stack,
+                chunksize,
+                columns,
+                index_col=index_col,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                dtype=dtype,
+                dtype_backend=dtype_backend,
+            )
+        else:
+            data = result.fetchall()
+            frame = _wrap_result(
+                data,
+                columns,
+                index_col=index_col,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                dtype=dtype,
+                dtype_backend=dtype_backend,
+            )
+            return frame
+
+    read_sql = read_query
+
+    def prep_table(
+        self,
+        frame,
+        name: str,
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+        index: bool | str | list[str] | None = True,
+        index_label=None,
+        schema=None,
+        dtype: DtypeArg | None = None,
+    ) -> SQLTable:
+        """
+        Prepares table in the database for data insertion. Creates it if needed, etc.
+        """
+        if dtype:
+            if not is_dict_like(dtype):
+                # error: Value expression in dictionary comprehension has incompatible
+                # type "Union[ExtensionDtype, str, dtype[Any], Type[object],
+                # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
+                # Type[str], Type[float], Type[int], Type[complex], Type[bool],
+                # Type[object]]]]"; expected type "Union[ExtensionDtype, str,
+                # dtype[Any], Type[object]]"
+                dtype = dict.fromkeys(frame, dtype)  # type: ignore[arg-type]
+            else:
+                dtype = cast(dict, dtype)
+
+            from sqlalchemy.types import TypeEngine
+
+            for col, my_type in dtype.items():
+                if isinstance(my_type, type) and issubclass(my_type, TypeEngine):
+                    pass
+                elif isinstance(my_type, TypeEngine):
+                    pass
+                else:
+                    raise ValueError(f"The type of {col} is not a SQLAlchemy type")
+
+        table = SQLTable(
+            name,
+            self,
+            frame=frame,
+            index=index,
+            if_exists=if_exists,
+            index_label=index_label,
+            schema=schema,
+            dtype=dtype,
+        )
+        table.create()
+        return table
+
+    def check_case_sensitive(
+        self,
+        name: str,
+        schema: str | None,
+    ) -> None:
+        """
+        Checks table name for issues with case-sensitivity.
+        Method is called after data is inserted.
+        """
+        if not name.isdigit() and not name.islower():
+            # check for potentially case sensitivity issues (GH7815)
+            # Only check when name is not a number and name is not lower case
+            from sqlalchemy import inspect as sqlalchemy_inspect
+
+            insp = sqlalchemy_inspect(self.con)
+            table_names = insp.get_table_names(schema=schema or self.meta.schema)
+            if name not in table_names:
+                msg = (
+                    f"The provided table name '{name}' is not found exactly as "
+                    "such in the database after writing the table, possibly "
+                    "due to case sensitivity issues. Consider using lower "
+                    "case table names."
+                )
+                warnings.warn(
+                    msg,
+                    UserWarning,
+                    stacklevel=find_stack_level(),
+                )
+
+    def to_sql(
+        self,
+        frame,
+        name: str,
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+        index: bool = True,
+        index_label=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        method: Literal["multi"] | Callable | None = None,
+        engine: str = "auto",
+        **engine_kwargs,
+    ) -> int | None:
+        """
+        Write records stored in a DataFrame to a SQL database.
+
+        Parameters
+        ----------
+        frame : DataFrame
+        name : string
+            Name of SQL table.
+        if_exists : {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
+            - fail: If table exists, do nothing.
+            - replace: If table exists, drop it, recreate it, and insert data.
+            - append: If table exists, insert data. Create if does not exist.
+            - delete_rows: If a table exists, delete all records and insert data.
+        index : boolean, default True
+            Write DataFrame index as a column.
+        index_label : string or sequence, default None
+            Column label for index column(s). If None is given (default) and
+            `index` is True, then the index names are used.
+            A sequence should be given if the DataFrame uses MultiIndex.
+        schema : string, default None
+            Name of SQL schema in database to write to (if database flavor
+            supports this). If specified, this overwrites the default
+            schema of the SQLDatabase object.
+        chunksize : int, default None
+            If not None, then rows will be written in batches of this size at a
+            time.  If None, all rows will be written at once.
+        dtype : single type or dict of column name to SQL type, default None
+            Optional specifying the datatype for columns. The SQL type should
+            be a SQLAlchemy type. If all columns are of the same type, one
+            single value can be used.
+        method : {None', 'multi', callable}, default None
+            Controls the SQL insertion clause used:
+
+            * None : Uses standard SQL ``INSERT`` clause (one per row).
+            * 'multi': Pass multiple values in a single ``INSERT`` clause.
+            * callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+            Details and a sample callable implementation can be found in the
+            section :ref:`insert method <io.sql.method>`.
+        engine : {'auto', 'sqlalchemy'}, default 'auto'
+            SQL engine library to use. If 'auto', then the option
+            ``io.sql.engine`` is used. The default ``io.sql.engine``
+            behavior is 'sqlalchemy'
+
+        **engine_kwargs
+            Any additional kwargs are passed to the engine.
+        """
+        sql_engine = get_engine(engine)
+
+        table = self.prep_table(
+            frame=frame,
+            name=name,
+            if_exists=if_exists,
+            index=index,
+            index_label=index_label,
+            schema=schema,
+            dtype=dtype,
+        )
+
+        total_inserted = sql_engine.insert_records(
+            table=table,
+            con=self.con,
+            frame=frame,
+            name=name,
+            index=index,
+            schema=schema,
+            chunksize=chunksize,
+            method=method,
+            **engine_kwargs,
+        )
+
+        self.check_case_sensitive(name=name, schema=schema)
+        return total_inserted
+
+    @property
+    def tables(self):
+        return self.meta.tables
+
+    def has_table(self, name: str, schema: str | None = None) -> bool:
+        from sqlalchemy import inspect as sqlalchemy_inspect
+
+        insp = sqlalchemy_inspect(self.con)
+        return insp.has_table(name, schema or self.meta.schema)
+
+    def get_table(self, table_name: str, schema: str | None = None) -> Table:
+        from sqlalchemy import (
+            Numeric,
+            Table,
+        )
+
+        schema = schema or self.meta.schema
+        tbl = Table(table_name, self.meta, autoload_with=self.con, schema=schema)
+        for column in tbl.columns:
+            if isinstance(column.type, Numeric):
+                column.type.asdecimal = False
+        return tbl
+
+    def drop_table(self, table_name: str, schema: str | None = None) -> None:
+        schema = schema or self.meta.schema
+        if self.has_table(table_name, schema):
+            self.meta.reflect(
+                bind=self.con, only=[table_name], schema=schema, views=True
+            )
+            with self.run_transaction():
+                self.get_table(table_name, schema).drop(bind=self.con)
+            self.meta.clear()
+
+    def delete_rows(self, table_name: str, schema: str | None = None) -> None:
+        schema = schema or self.meta.schema
+        if self.has_table(table_name, schema):
+            self.meta.reflect(
+                bind=self.con, only=[table_name], schema=schema, views=True
+            )
+            table = self.get_table(table_name, schema)
+            self.execute(table.delete()).close()
+            self.meta.clear()
+
+    def _create_sql_schema(
+        self,
+        frame: DataFrame,
+        table_name: str,
+        keys: list[str] | None = None,
+        dtype: DtypeArg | None = None,
+        schema: str | None = None,
+    ) -> str:
+        table = SQLTable(
+            table_name,
+            self,
+            frame=frame,
+            index=False,
+            keys=keys,
+            dtype=dtype,
+            schema=schema,
+        )
+        return str(table.sql_schema())
+
+
+# ---- SQL without SQLAlchemy ---
+
+
+class ADBCDatabase(PandasSQL):
+    """
+    This class enables conversion between DataFrame and SQL databases
+    using ADBC to handle DataBase abstraction.
+
+    Parameters
+    ----------
+    con : adbc_driver_manager.dbapi.Connection
+    """
+
+    def __init__(self, con) -> None:
+        self.con = con
+
+    @contextmanager
+    def run_transaction(self):
+        with self.con.cursor() as cur:
+            try:
+                yield cur
+            except Exception:
+                self.con.rollback()
+                raise
+            self.con.commit()
+
+    def execute(self, sql: str | Select | TextClause, params=None):
+        from adbc_driver_manager import Error
+
+        if not isinstance(sql, str):
+            raise TypeError("Query must be a string unless using sqlalchemy.")
+        args = [] if params is None else [params]
+        cur = self.con.cursor()
+        try:
+            cur.execute(sql, *args)
+            return cur
+        except Error as exc:
+            try:
+                self.con.rollback()
+            except Error as inner_exc:  # pragma: no cover
+                ex = DatabaseError(
+                    f"Execution failed on sql: {sql}\n{exc}\nunable to rollback"
+                )
+                raise ex from inner_exc
+
+            ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
+            raise ex from exc
+
+    def read_table(
+        self,
+        table_name: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        columns=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        """
+        Read SQL database table into a DataFrame.
+
+        Parameters
+        ----------
+        table_name : str
+            Name of SQL table in database.
+        coerce_float : bool, default True
+            Raises NotImplementedError
+        parse_dates : list or dict, default: None
+            - List of column names to parse as dates.
+            - Dict of ``{column_name: format string}`` where format string is
+              strftime compatible in case of parsing string times, or is one of
+              (D, s, ns, ms, us) in case of parsing integer timestamps.
+            - Dict of ``{column_name: arg}``, where the arg corresponds
+              to the keyword arguments of :func:`pandas.to_datetime`.
+              Especially useful with databases without native Datetime support,
+              such as SQLite.
+        columns : list, default: None
+            List of column names to select from SQL table.
+        schema : string, default None
+            Name of SQL schema in database to query (if database flavor
+            supports this).  If specified, this overwrites the default
+            schema of the SQL database object.
+        chunksize : int, default None
+            Raises NotImplementedError
+        dtype_backend : {'numpy_nullable', 'pyarrow'}
+            Back-end data type applied to the resultant :class:`DataFrame`
+            (still experimental). If not specified, the default behavior
+            is to not use nullable data types. If specified, the behavior
+            is as follows:
+
+            * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+            * ``"pyarrow"``: returns pyarrow-backed nullable
+              :class:`ArrowDtype` :class:`DataFrame`
+
+            .. versionadded:: 2.0
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        pandas.read_sql_table
+        SQLDatabase.read_query
+
+        """
+        if coerce_float is not True:
+            raise NotImplementedError(
+                "'coerce_float' is not implemented for ADBC drivers"
+            )
+        if chunksize:
+            raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
+
+        if columns:
+            if index_col:
+                index_select = maybe_make_list(index_col)
+            else:
+                index_select = []
+            to_select = index_select + columns
+            select_list = ", ".join(f'"{x}"' for x in to_select)
+        else:
+            select_list = "*"
+        if schema:
+            stmt = f"SELECT {select_list} FROM {schema}.{table_name}"
+        else:
+            stmt = f"SELECT {select_list} FROM {table_name}"
+
+        with self.execute(stmt) as cur:
+            pa_table = cur.fetch_arrow_table()
+            df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
+
+        return _wrap_result_adbc(
+            df,
+            index_col=index_col,
+            parse_dates=parse_dates,
+        )
+
+    def read_query(
+        self,
+        sql: str,
+        index_col: str | list[str] | None = None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        params=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        """
+        Read SQL query into a DataFrame.
+
+        Parameters
+        ----------
+        sql : str
+            SQL query to be executed.
+        index_col : string, optional, default: None
+            Column name to use as index for the returned DataFrame object.
+        coerce_float : bool, default True
+            Raises NotImplementedError
+        params : list, tuple or dict, optional, default: None
+            Raises NotImplementedError
+        parse_dates : list or dict, default: None
+            - List of column names to parse as dates.
+            - Dict of ``{column_name: format string}`` where format string is
+              strftime compatible in case of parsing string times, or is one of
+              (D, s, ns, ms, us) in case of parsing integer timestamps.
+            - Dict of ``{column_name: arg dict}``, where the arg dict
+              corresponds to the keyword arguments of
+              :func:`pandas.to_datetime` Especially useful with databases
+              without native Datetime support, such as SQLite.
+        chunksize : int, default None
+            Raises NotImplementedError
+        dtype : Type name or dict of columns
+            Data type for data or columns. E.g. np.float64 or
+            {'a': np.float64, 'b': np.int32, 'c': 'Int64'}
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        read_sql_table : Read SQL database table into a DataFrame.
+        read_sql
+
+        """
+        if coerce_float is not True:
+            raise NotImplementedError(
+                "'coerce_float' is not implemented for ADBC drivers"
+            )
+        if params:
+            raise NotImplementedError("'params' is not implemented for ADBC drivers")
+        if chunksize:
+            raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
+
+        with self.execute(sql) as cur:
+            pa_table = cur.fetch_arrow_table()
+            df = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
+
+        return _wrap_result_adbc(
+            df,
+            index_col=index_col,
+            parse_dates=parse_dates,
+            dtype=dtype,
+        )
+
+    read_sql = read_query
+
+    def to_sql(
+        self,
+        frame,
+        name: str,
+        if_exists: Literal["fail", "replace", "append", "delete_rows"] = "fail",
+        index: bool = True,
+        index_label=None,
+        schema: str | None = None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        method: Literal["multi"] | Callable | None = None,
+        engine: str = "auto",
+        **engine_kwargs,
+    ) -> int | None:
+        """
+        Write records stored in a DataFrame to a SQL database.
+
+        Parameters
+        ----------
+        frame : DataFrame
+        name : string
+            Name of SQL table.
+        if_exists : {'fail', 'replace', 'append'}, default 'fail'
+            - fail: If table exists, do nothing.
+            - replace: If table exists, drop it, recreate it, and insert data.
+            - append: If table exists, insert data. Create if does not exist.
+            - delete_rows: If a table exists, delete all records and insert data.
+        index : boolean, default True
+            Write DataFrame index as a column.
+        index_label : string or sequence, default None
+            Raises NotImplementedError
+        schema : string, default None
+            Name of SQL schema in database to write to (if database flavor
+            supports this). If specified, this overwrites the default
+            schema of the SQLDatabase object.
+        chunksize : int, default None
+            Raises NotImplementedError
+        dtype : single type or dict of column name to SQL type, default None
+            Raises NotImplementedError
+        method : {None', 'multi', callable}, default None
+            Raises NotImplementedError
+        engine : {'auto', 'sqlalchemy'}, default 'auto'
+            Raises NotImplementedError if not set to 'auto'
+        """
+        pa = import_optional_dependency("pyarrow")
+        from adbc_driver_manager import Error
+
+        if index_label:
+            raise NotImplementedError(
+                "'index_label' is not implemented for ADBC drivers"
+            )
+        if chunksize:
+            raise NotImplementedError("'chunksize' is not implemented for ADBC drivers")
+        if dtype:
+            raise NotImplementedError("'dtype' is not implemented for ADBC drivers")
+        if method:
+            raise NotImplementedError("'method' is not implemented for ADBC drivers")
+        if engine != "auto":
+            raise NotImplementedError(
+                "engine != 'auto' not implemented for ADBC drivers"
+            )
+
+        if schema:
+            table_name = f"{schema}.{name}"
+        else:
+            table_name = name
+
+        # pandas if_exists="append" will still create the
+        # table if it does not exist; ADBC is more explicit with append/create
+        # as applicable modes, so the semantics get blurred across
+        # the libraries
+        mode = "create"
+        if self.has_table(name, schema):
+            if if_exists == "fail":
+                raise ValueError(f"Table '{table_name}' already exists.")
+            elif if_exists == "replace":
+                sql_statement = f"DROP TABLE {table_name}"
+                self.execute(sql_statement).close()
+            elif if_exists == "append":
+                mode = "append"
+            elif if_exists == "delete_rows":
+                mode = "append"
+                self.delete_rows(name, schema)
+
+        try:
+            tbl = pa.Table.from_pandas(frame, preserve_index=index)
+        except pa.ArrowNotImplementedError as exc:
+            raise ValueError("datatypes not supported") from exc
+
+        with self.con.cursor() as cur:
+            try:
+                total_inserted = cur.adbc_ingest(
+                    table_name=name, data=tbl, mode=mode, db_schema_name=schema
+                )
+            except Error as exc:
+                raise DatabaseError(
+                    f"Failed to insert records on table={name} with {mode=}"
+                ) from exc
+
+        self.con.commit()
+        return total_inserted
+
+    def has_table(self, name: str, schema: str | None = None) -> bool:
+        meta = self.con.adbc_get_objects(
+            db_schema_filter=schema, table_name_filter=name
+        ).read_all()
+
+        for catalog_schema in meta["catalog_db_schemas"].to_pylist():
+            if not catalog_schema:
+                continue
+            for schema_record in catalog_schema:
+                if not schema_record:
+                    continue
+
+                for table_record in schema_record["db_schema_tables"]:
+                    if table_record["table_name"] == name:
+                        return True
+
+        return False
+
+    def delete_rows(self, name: str, schema: str | None = None) -> None:
+        table_name = f"{schema}.{name}" if schema else name
+        if self.has_table(name, schema):
+            self.execute(f"DELETE FROM {table_name}").close()
+
+    def _create_sql_schema(
+        self,
+        frame: DataFrame,
+        table_name: str,
+        keys: list[str] | None = None,
+        dtype: DtypeArg | None = None,
+        schema: str | None = None,
+    ) -> str:
+        raise NotImplementedError("not implemented for adbc")
+
+
+# sqlite-specific sql strings and handler class
+# dictionary used for readability purposes
+_SQL_TYPES = {
+    "string": "TEXT",
+    "floating": "REAL",
+    "integer": "INTEGER",
+    "datetime": "TIMESTAMP",
+    "date": "DATE",
+    "time": "TIME",
+    "boolean": "INTEGER",
+}
+
+
+def _get_unicode_name(name: object) -> str:
+    try:
+        uname = str(name).encode("utf-8", "strict").decode("utf-8")
+    except UnicodeError as err:
+        raise ValueError(f"Cannot convert identifier to UTF-8: '{name}'") from err
+    return uname
+
+
+def _get_valid_sqlite_name(name: object) -> str:
+    # See https://stackoverflow.com/questions/6514274/how-do-you-escape-strings\
+    # -for-sqlite-table-column-names-in-python
+    # Ensure the string can be encoded as UTF-8.
+    # Ensure the string does not include any NUL characters.
+    # Replace all " with "".
+    # Wrap the entire thing in double quotes.
+
+    uname = _get_unicode_name(name)
+    if not len(uname):
+        raise ValueError("Empty table or column name specified")
+
+    nul_index = uname.find("\x00")
+    if nul_index >= 0:
+        raise ValueError("SQLite identifier cannot contain NULs")
+    return '"' + uname.replace('"', '""') + '"'
+
+
+class SQLiteTable(SQLTable):
+    """
+    Patch the SQLTable for fallback support.
+    Instead of a table variable just use the Create Table statement.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+        self._register_date_adapters()
+
+    def _register_date_adapters(self) -> None:
+        # GH 8341
+        # register an adapter callable for datetime.time object
+        import sqlite3
+
+        # this will transform time(12,34,56,789) into '12:34:56.000789'
+        # (this is what sqlalchemy does)
+        def _adapt_time(t) -> str:
+            # This is faster than strftime
+            return f"{t.hour:02d}:{t.minute:02d}:{t.second:02d}.{t.microsecond:06d}"
+
+        # Also register adapters for date/datetime and co
+        # xref https://docs.python.org/3.12/library/sqlite3.html#adapter-and-converter-recipes
+        # Python 3.12+ doesn't auto-register adapters for us anymore
+
+        adapt_date_iso = lambda val: val.isoformat()
+        adapt_datetime_iso = lambda val: val.isoformat(" ")
+
+        sqlite3.register_adapter(time, _adapt_time)
+
+        sqlite3.register_adapter(date, adapt_date_iso)
+        sqlite3.register_adapter(datetime, adapt_datetime_iso)
+
+        convert_date = lambda val: date.fromisoformat(val.decode())
+        convert_timestamp = lambda val: datetime.fromisoformat(val.decode())
+
+        sqlite3.register_converter("date", convert_date)
+        sqlite3.register_converter("timestamp", convert_timestamp)
+
+    def sql_schema(self) -> str:
+        return str(";\n".join(self.table))
+
+    def _execute_create(self) -> None:
+        with self.pd_sql.run_transaction() as cur:
+            for stmt in self.table:
+                cur.execute(stmt)
+
+    def insert_statement(self, *, num_rows: int) -> str:
+        names = list(map(str, self.frame.columns))
+        wld = "?"  # wildcard char
+        escape = _get_valid_sqlite_name
+
+        if self.index is not None:
+            for idx in self.index[::-1]:
+                names.insert(0, idx)
+
+        bracketed_names = [escape(column) for column in names]
+        col_names = ",".join(bracketed_names)
+
+        row_wildcards = ",".join([wld] * len(names))
+        wildcards = ",".join([f"({row_wildcards})" for _ in range(num_rows)])
+        insert_statement = (
+            f"INSERT INTO {escape(self.name)} ({col_names}) VALUES {wildcards}"
+        )
+        return insert_statement
+
+    def _execute_insert(self, conn, keys, data_iter) -> int:
+        from sqlite3 import Error
+
+        data_list = list(data_iter)
+        try:
+            conn.executemany(self.insert_statement(num_rows=1), data_list)
+        except Error as exc:
+            raise DatabaseError("Execution failed") from exc
+        return conn.rowcount
+
+    def _execute_insert_multi(self, conn, keys, data_iter) -> int:
+        data_list = list(data_iter)
+        flattened_data = [x for row in data_list for x in row]
+        conn.execute(self.insert_statement(num_rows=len(data_list)), flattened_data)
+        return conn.rowcount
+
+    def _create_table_setup(self):
+        """
+        Return a list of SQL statements that creates a table reflecting the
+        structure of a DataFrame.  The first entry will be a CREATE TABLE
+        statement while the rest will be CREATE INDEX statements.
+        """
+        column_names_and_types = self._get_column_names_and_types(self._sql_type_name)
+        escape = _get_valid_sqlite_name
+
+        create_tbl_stmts = [
+            escape(cname) + " " + ctype for cname, ctype, _ in column_names_and_types
+        ]
+
+        if self.keys is not None and len(self.keys):
+            if not is_list_like(self.keys):
+                keys = [self.keys]
+            else:
+                keys = self.keys
+            cnames_br = ", ".join([escape(c) for c in keys])
+            create_tbl_stmts.append(
+                f"CONSTRAINT {self.name}_pk PRIMARY KEY ({cnames_br})"
+            )
+        if self.schema:
+            schema_name = self.schema + "."
+        else:
+            schema_name = ""
+        create_stmts = [
+            "CREATE TABLE "
+            + schema_name
+            + escape(self.name)
+            + " (\n"
+            + ",\n  ".join(create_tbl_stmts)
+            + "\n)"
+        ]
+
+        ix_cols = [cname for cname, _, is_index in column_names_and_types if is_index]
+        if ix_cols:
+            cnames = "_".join(ix_cols)
+            cnames_br = ",".join([escape(c) for c in ix_cols])
+            create_stmts.append(
+                "CREATE INDEX "
+                + escape("ix_" + self.name + "_" + cnames)
+                + "ON "
+                + escape(self.name)
+                + " ("
+                + cnames_br
+                + ")"
+            )
+
+        return create_stmts
+
+    def _sql_type_name(self, col):
+        dtype: DtypeArg = self.dtype or {}
+        if is_dict_like(dtype):
+            dtype = cast(dict, dtype)
+            if col.name in dtype:
+                return dtype[col.name]
+
+        # Infer type of column, while ignoring missing values.
+        # Needed for inserting typed data containing NULLs, GH 8778.
+        col_type = lib.infer_dtype(col, skipna=True)
+
+        if col_type == "timedelta64":
+            warnings.warn(
+                "the 'timedelta' type is not supported, and will be "
+                "written as integer values (ns frequency) to the database.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+            col_type = "integer"
+
+        elif col_type == "datetime64":
+            col_type = "datetime"
+
+        elif col_type == "empty":
+            col_type = "string"
+
+        elif col_type == "complex":
+            raise ValueError("Complex datatypes not supported")
+
+        if col_type not in _SQL_TYPES:
+            col_type = "string"
+
+        return _SQL_TYPES[col_type]
+
+
+class SQLiteDatabase(PandasSQL):
+    """
+    Version of SQLDatabase to support SQLite connections (fallback without
+    SQLAlchemy). This should only be used internally.
+
+    Parameters
+    ----------
+    con : sqlite connection object
+
+    """
+
+    def __init__(self, con) -> None:
+        self.con = con
+
+    @contextmanager
+    def run_transaction(self):
+        cur = self.con.cursor()
+        try:
+            yield cur
+            self.con.commit()
+        except Exception:
+            self.con.rollback()
+            raise
+        finally:
+            cur.close()
+
+    def execute(self, sql: str | Select | TextClause, params=None):
+        from sqlite3 import Error
+
+        if not isinstance(sql, str):
+            raise TypeError("Query must be a string unless using sqlalchemy.")
+        args = [] if params is None else [params]
+        cur = self.con.cursor()
+        try:
+            cur.execute(sql, *args)
+            return cur
+        except Error as exc:
+            try:
+                self.con.rollback()
+            except Error as inner_exc:  # pragma: no cover
+                ex = DatabaseError(
+                    f"Execution failed on sql: {sql}\n{exc}\nunable to rollback"
+                )
+                raise ex from inner_exc
+
+            ex = DatabaseError(f"Execution failed on sql '{sql}': {exc}")
+            raise ex from exc
+
+    @staticmethod
+    def _query_iterator(
+        cursor,
+        chunksize: int,
+        columns,
+        index_col=None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> Generator[DataFrame]:
+        """Return generator through chunked result set"""
+        has_read_data = False
+        while True:
+            data = cursor.fetchmany(chunksize)
+            if type(data) == tuple:
+                data = list(data)
+            if not data:
+                cursor.close()
+                if not has_read_data:
+                    result = DataFrame.from_records(
+                        [], columns=columns, coerce_float=coerce_float
+                    )
+                    if dtype:
+                        result = result.astype(dtype)
+                    yield result
+                break
+
+            has_read_data = True
+            yield _wrap_result(
+                data,
+                columns,
+                index_col=index_col,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                dtype=dtype,
+                dtype_backend=dtype_backend,
+            )
+
+    def read_query(
+        self,
+        sql,
+        index_col=None,
+        coerce_float: bool = True,
+        parse_dates=None,
+        params=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
+    ) -> DataFrame | Iterator[DataFrame]:
+        cursor = self.execute(sql, params)
+        columns = [col_desc[0] for col_desc in cursor.description]
+
+        if chunksize is not None:
+            return self._query_iterator(
+                cursor,
+                chunksize,
+                columns,
+                index_col=index_col,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                dtype=dtype,
+                dtype_backend=dtype_backend,
+            )
+        else:
+            data = self._fetchall_as_list(cursor)
+            cursor.close()
+
+            frame = _wrap_result(
+                data,
+                columns,
+                index_col=index_col,
+                coerce_float=coerce_float,
+                parse_dates=parse_dates,
+                dtype=dtype,
+                dtype_backend=dtype_backend,
+            )
+            return frame
+
+    def _fetchall_as_list(self, cur):
+        result = cur.fetchall()
+        if not isinstance(result, list):
+            result = list(result)
+        return result
+
+    def to_sql(
+        self,
+        frame,
+        name: str,
+        if_exists: str = "fail",
+        index: bool = True,
+        index_label=None,
+        schema=None,
+        chunksize: int | None = None,
+        dtype: DtypeArg | None = None,
+        method: Literal["multi"] | Callable | None = None,
+        engine: str = "auto",
+        **engine_kwargs,
+    ) -> int | None:
+        """
+        Write records stored in a DataFrame to a SQL database.
+
+        Parameters
+        ----------
+        frame: DataFrame
+        name: string
+            Name of SQL table.
+        if_exists: {'fail', 'replace', 'append', 'delete_rows'}, default 'fail'
+            fail: If table exists, do nothing.
+            replace: If table exists, drop it, recreate it, and insert data.
+            append: If table exists, insert data. Create if it does not exist.
+            delete_rows: If a table exists, delete all records and insert data.
+        index : bool, default True
+            Write DataFrame index as a column
+        index_label : string or sequence, default None
+            Column label for index column(s). If None is given (default) and
+            `index` is True, then the index names are used.
+            A sequence should be given if the DataFrame uses MultiIndex.
+        schema : string, default None
+            Ignored parameter included for compatibility with SQLAlchemy
+            version of ``to_sql``.
+        chunksize : int, default None
+            If not None, then rows will be written in batches of this
+            size at a time. If None, all rows will be written at once.
+        dtype : single type or dict of column name to SQL type, default None
+            Optional specifying the datatype for columns. The SQL type should
+            be a string. If all columns are of the same type, one single value
+            can be used.
+        method : {None, 'multi', callable}, default None
+            Controls the SQL insertion clause used:
+
+            * None : Uses standard SQL ``INSERT`` clause (one per row).
+            * 'multi': Pass multiple values in a single ``INSERT`` clause.
+            * callable with signature ``(pd_table, conn, keys, data_iter)``.
+
+            Details and a sample callable implementation can be found in the
+            section :ref:`insert method <io.sql.method>`.
+        """
+        if dtype:
+            if not is_dict_like(dtype):
+                # error: Value expression in dictionary comprehension has incompatible
+                # type "Union[ExtensionDtype, str, dtype[Any], Type[object],
+                # Dict[Hashable, Union[ExtensionDtype, Union[str, dtype[Any]],
+                # Type[str], Type[float], Type[int], Type[complex], Type[bool],
+                # Type[object]]]]"; expected type "Union[ExtensionDtype, str,
+                # dtype[Any], Type[object]]"
+                dtype = dict.fromkeys(frame, dtype)  # type: ignore[arg-type]
+            else:
+                dtype = cast(dict, dtype)
+
+            for col, my_type in dtype.items():
+                if not isinstance(my_type, str):
+                    raise ValueError(f"{col} ({my_type}) not a string")
+
+        table = SQLiteTable(
+            name,
+            self,
+            frame=frame,
+            index=index,
+            if_exists=if_exists,
+            index_label=index_label,
+            dtype=dtype,
+        )
+        table.create()
+        return table.insert(chunksize, method)
+
+    def has_table(self, name: str, schema: str | None = None) -> bool:
+        wld = "?"
+        query = f"""
+        SELECT
+            name
+        FROM
+            sqlite_master
+        WHERE
+            type IN ('table', 'view')
+            AND name={wld};
+        """
+
+        return len(self.execute(query, [name]).fetchall()) > 0
+
+    def get_table(self, table_name: str, schema: str | None = None) -> None:
+        return None  # not supported in fallback mode
+
+    def drop_table(self, name: str, schema: str | None = None) -> None:
+        drop_sql = f"DROP TABLE {_get_valid_sqlite_name(name)}"
+        self.execute(drop_sql).close()
+
+    def delete_rows(self, name: str, schema: str | None = None) -> None:
+        delete_sql = f"DELETE FROM {_get_valid_sqlite_name(name)}"
+        if self.has_table(name, schema):
+            self.execute(delete_sql).close()
+
+    def _create_sql_schema(
+        self,
+        frame,
+        table_name: str,
+        keys=None,
+        dtype: DtypeArg | None = None,
+        schema: str | None = None,
+    ) -> str:
+        table = SQLiteTable(
+            table_name,
+            self,
+            frame=frame,
+            index=False,
+            keys=keys,
+            dtype=dtype,
+            schema=schema,
+        )
+        return str(table.sql_schema())
+
+
+def get_schema(
+    frame,
+    name: str,
+    keys=None,
+    con=None,
+    dtype: DtypeArg | None = None,
+    schema: str | None = None,
+) -> str:
+    """
+    Get the SQL db table schema for the given frame.
+
+    Parameters
+    ----------
+    frame : DataFrame
+    name : str
+        name of SQL table
+    keys : string or sequence, default: None
+        columns to use a primary key
+    con: ADBC Connection, SQLAlchemy connectable, sqlite3 connection, default: None
+        ADBC provides high performance I/O with native type support, where available.
+        Using SQLAlchemy makes it possible to use any DB supported by that
+        library
+        If a DBAPI2 object, only sqlite3 is supported.
+    dtype : dict of column name to SQL type, default None
+        Optional specifying the datatype for columns. The SQL type should
+        be a SQLAlchemy type, or a string for sqlite3 fallback connection.
+    schema: str, default: None
+        Optional specifying the schema to be used in creating the table.
+    """
+    with pandasSQL_builder(con=con) as pandas_sql:
+        return pandas_sql._create_sql_schema(
+            frame, name, keys=keys, dtype=dtype, schema=schema
+        )
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b2dc9fd831b75f69c85dcac06004c6cdee1af8a
--- /dev/null
+++ b/pandas/io/stata.py
@@ -0,0 +1,3925 @@
+"""
+Module contains tools for processing Stata files into DataFrames
+
+The StataReader below was originally written by Joe Presbrey as part of PyDTA.
+It has been extended and improved by Skipper Seabold from the Statsmodels
+project who also developed the StataWriter and was finally added to pandas in
+a once again improved version.
+
+You can find more information on http://presbrey.mit.edu/PyDTA and
+https://www.statsmodels.org/devel/
+"""
+
+from __future__ import annotations
+
+from collections import abc
+from datetime import (
+    datetime,
+    timedelta,
+)
+from io import BytesIO
+import os
+import struct
+import sys
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    AnyStr,
+    Final,
+    Self,
+    cast,
+)
+import warnings
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.lib import infer_dtype
+from pandas._libs.writers import max_len_string_array
+from pandas.errors import (
+    CategoricalConversionWarning,
+    InvalidColumnName,
+    Pandas4Warning,
+    PossiblePrecisionLoss,
+    ValueLabelTypeMismatch,
+)
+from pandas.util._decorators import (
+    set_module,
+)
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.common import (
+    ensure_object,
+    is_numeric_dtype,
+    is_string_dtype,
+)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+from pandas import (
+    Categorical,
+    DatetimeIndex,
+    NaT,
+    Timestamp,
+    isna,
+    to_datetime,
+)
+from pandas.core.frame import DataFrame
+from pandas.core.indexes.base import Index
+from pandas.core.indexes.range import RangeIndex
+from pandas.core.series import Series
+from pandas.core.shared_docs import _shared_docs
+
+from pandas.io.common import get_handle
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+        Sequence,
+    )
+    from types import TracebackType
+    from typing import Literal
+
+    from pandas._typing import (
+        CompressionOptions,
+        FilePath,
+        ReadBuffer,
+        StorageOptions,
+        WriteBuffer,
+    )
+
+_version_error = (
+    "Version of given Stata file is {version}. pandas supports importing "
+    "versions 102, 103, 104, 105, 108, 110 (Stata 7), 111 (Stata 7SE),  "
+    "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), "
+    "118 (Stata 14/15/16), and 119 (Stata 15/16, over 32,767 variables)."
+)
+
+_statafile_processing_params1 = """\
+convert_dates : bool, default True
+    Convert date variables to DataFrame time values.
+convert_categoricals : bool, default True
+    Read value labels and convert columns to Categorical/Factor variables."""
+
+_statafile_processing_params2 = """\
+index_col : str, optional
+    Column to set as index.
+convert_missing : bool, default False
+    Flag indicating whether to convert missing values to their Stata
+    representations.  If False, missing values are replaced with nan.
+    If True, columns containing missing values are returned with
+    object data types and missing values are represented by
+    StataMissingValue objects.
+preserve_dtypes : bool, default True
+    Preserve Stata datatypes. If False, numeric data are upcast to pandas
+    default types for foreign data (float64 or int64).
+columns : list or None
+    Columns to retain.  Columns will be returned in the given order.  None
+    returns all columns.
+order_categoricals : bool, default True
+    Flag indicating whether converted categorical data are ordered."""
+
+_chunksize_params = """\
+chunksize : int, default None
+    Return StataReader object for iterations, returns chunks with
+    given number of lines."""
+
+_reader_notes = """\
+Notes
+-----
+Categorical variables read through an iterator may not have the same
+categories and dtype. This occurs when  a variable stored in a DTA
+file is associated to an incomplete set of value labels that only
+label a strict subset of the values."""
+
+_stata_reader_doc = f"""\
+Class for reading Stata dta files.
+
+Parameters
+----------
+path_or_buf : path (string), buffer or path object
+    string, pathlib.Path or object
+    implementing a binary read() functions.
+{_statafile_processing_params1}
+{_statafile_processing_params2}
+{_chunksize_params}
+{_shared_docs["decompression_options"]}
+{_shared_docs["storage_options"]}
+
+{_reader_notes}
+"""
+
+
+_date_formats = ["%tc", "%tC", "%td", "%d", "%tw", "%tm", "%tq", "%th", "%ty"]
+
+
+stata_epoch: Final = datetime(1960, 1, 1)
+unix_epoch: Final = datetime(1970, 1, 1)
+
+
+def _stata_elapsed_date_to_datetime_vec(dates: Series, fmt: str) -> Series:
+    """
+    Convert from SIF to datetime. https://www.stata.com/help.cgi?datetime
+
+    Parameters
+    ----------
+    dates : Series
+        The Stata Internal Format date to convert to datetime according to fmt
+    fmt : str
+        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
+        Returns
+
+    Returns
+    -------
+    converted : Series
+        The converted dates
+
+    Examples
+    --------
+    >>> dates = pd.Series([52])
+    >>> _stata_elapsed_date_to_datetime_vec(dates, "%tw")
+    0   1961-01-01
+    dtype: datetime64[s]
+
+    Notes
+    -----
+    datetime/c - tc
+        milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day
+    datetime/C - tC - NOT IMPLEMENTED
+        milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds
+    date - td
+        days since 01jan1960 (01jan1960 = 0)
+    weekly date - tw
+        weeks since 1960w1
+        This assumes 52 weeks in a year, then adds 7 * remainder of the weeks.
+        The datetime value is the start of the week in terms of days in the
+        year, not ISO calendar weeks.
+    monthly date - tm
+        months since 1960m1
+    quarterly date - tq
+        quarters since 1960q1
+    half-yearly date - th
+        half-years since 1960h1 yearly
+    date - ty
+        years since 0000
+    """
+
+    if fmt.startswith(("%tc", "tc")):
+        # Delta ms relative to base
+        td = np.timedelta64(stata_epoch - unix_epoch, "ms")
+        res = np.array(dates._values, dtype="M8[ms]") + td
+        return Series(res, index=dates.index)
+
+    elif fmt.startswith(("%td", "td", "%d", "d")):
+        # Delta days relative to base
+        td = np.timedelta64(stata_epoch - unix_epoch, "D")
+        res = np.array(dates._values, dtype="M8[D]") + td
+        return Series(res, index=dates.index)
+
+    elif fmt.startswith(("%tm", "tm")):
+        # Delta months relative to base
+        ordinals = dates + (stata_epoch.year - unix_epoch.year) * 12
+        res = np.array(ordinals, dtype="M8[M]").astype("M8[s]")
+        return Series(res, index=dates.index)
+
+    elif fmt.startswith(("%tq", "tq")):
+        # Delta quarters relative to base
+        ordinals = dates + (stata_epoch.year - unix_epoch.year) * 4
+        res = np.array(ordinals, dtype="M8[3M]").astype("M8[s]")
+        return Series(res, index=dates.index)
+
+    elif fmt.startswith(("%th", "th")):
+        # Delta half-years relative to base
+        ordinals = dates + (stata_epoch.year - unix_epoch.year) * 2
+        res = np.array(ordinals, dtype="M8[6M]").astype("M8[s]")
+        return Series(res, index=dates.index)
+
+    elif fmt.startswith(("%ty", "ty")):
+        # Years -- not delta
+        ordinals = dates - 1970
+        res = np.array(ordinals, dtype="M8[Y]").astype("M8[s]")
+        return Series(res, index=dates.index)
+
+    bad_locs = np.isnan(dates)
+    has_bad_values = False
+    if bad_locs.any():
+        has_bad_values = True
+        dates._values[bad_locs] = 1.0  # Replace with NaT
+    dates = dates.astype(np.int64)
+
+    if fmt.startswith(("%tC", "tC")):
+        warnings.warn(
+            "Encountered %tC format. Leaving in Stata Internal Format.",
+            stacklevel=find_stack_level(),
+        )
+        conv_dates = Series(dates, dtype=object)
+        if has_bad_values:
+            conv_dates[bad_locs] = NaT
+        return conv_dates
+    # does not count leap days - 7 days is a week.
+    # 52nd week may have more than 7 days
+    elif fmt.startswith(("%tw", "tw")):
+        year = stata_epoch.year + dates // 52
+        days = (dates % 52) * 7
+        per_y = (year - 1970).array.view("Period[Y]")
+        per_d = per_y.asfreq("D", how="S")
+        per_d_shifted = per_d + days._values
+        per_s = per_d_shifted.asfreq("s", how="S")
+        conv_dates_arr = per_s.view("M8[s]")
+        conv_dates = Series(conv_dates_arr, index=dates.index)
+
+    else:
+        raise ValueError(f"Date fmt {fmt} not understood")
+
+    if has_bad_values:  # Restore NaT for bad values
+        conv_dates[bad_locs] = NaT
+
+    return conv_dates
+
+
+def _datetime_to_stata_elapsed_vec(dates: Series, fmt: str) -> Series:
+    """
+    Convert from datetime to SIF. https://www.stata.com/help.cgi?datetime
+
+    Parameters
+    ----------
+    dates : Series
+        Series or array containing datetime or datetime64[ns] to
+        convert to the Stata Internal Format given by fmt
+    fmt : str
+        The format to convert to. Can be, tc, td, tw, tm, tq, th, ty
+    """
+    index = dates.index
+    NS_PER_DAY = 24 * 3600 * 1000 * 1000 * 1000
+    US_PER_DAY = NS_PER_DAY / 1000
+    MS_PER_DAY = NS_PER_DAY / 1_000_000
+
+    def parse_dates_safe(
+        dates: Series, delta: bool = False, year: bool = False, days: bool = False
+    ) -> DataFrame:
+        d = {}
+        if lib.is_np_dtype(dates.dtype, "M"):
+            if delta:
+                time_delta = dates.dt.as_unit("ms") - Timestamp(stata_epoch).as_unit(
+                    "ms"
+                )
+                d["delta"] = time_delta._values.view(np.int64)
+            if days or year:
+                date_index = DatetimeIndex(dates)
+                d["year"] = date_index._data.year
+                d["month"] = date_index._data.month
+            if days:
+                year_start = np.asarray(dates).astype("M8[Y]").astype(dates.dtype)
+                diff = dates - year_start
+                d["days"] = np.asarray(diff).astype("m8[D]").view("int64")
+
+        elif infer_dtype(dates, skipna=False) == "datetime":
+            warnings.warn(
+                # GH#56536
+                "Converting object-dtype columns of datetimes to datetime64 when "
+                "writing to stata is deprecated. Call "
+                "`df=df.infer_objects(copy=False)` before writing to stata instead.",
+                Pandas4Warning,
+                stacklevel=find_stack_level(),
+            )
+            if delta:
+                delta = dates._values - stata_epoch
+
+                def f(x: timedelta) -> float:
+                    return US_PER_DAY * x.days + 1_000_000 * x.seconds + x.microseconds
+
+                v = np.vectorize(f)
+                d["delta"] = v(delta) // 1_000  # convert back to ms
+            if year:
+                year_month = dates.apply(lambda x: 100 * x.year + x.month)
+                d["year"] = year_month._values // 100
+                d["month"] = year_month._values - d["year"] * 100
+            if days:
+
+                def g(x: datetime) -> int:
+                    return (x - datetime(x.year, 1, 1)).days
+
+                v = np.vectorize(g)
+                d["days"] = v(dates)
+        else:
+            raise ValueError(
+                "Columns containing dates must contain either "
+                "datetime64, datetime or null values."
+            )
+
+        return DataFrame(d, index=index)
+
+    bad_loc = isna(dates)
+    index = dates.index
+    if bad_loc.any():
+        if lib.is_np_dtype(dates.dtype, "M"):
+            dates._values[bad_loc] = to_datetime(stata_epoch)
+        else:
+            dates._values[bad_loc] = stata_epoch
+
+    if fmt in ["%tc", "tc"]:
+        d = parse_dates_safe(dates, delta=True)
+        conv_dates = d.delta
+    elif fmt in ["%tC", "tC"]:
+        warnings.warn(
+            "Stata Internal Format tC not supported.",
+            stacklevel=find_stack_level(),
+        )
+        conv_dates = dates
+    elif fmt in ["%td", "td"]:
+        d = parse_dates_safe(dates, delta=True)
+        conv_dates = d.delta // MS_PER_DAY
+    elif fmt in ["%tw", "tw"]:
+        d = parse_dates_safe(dates, year=True, days=True)
+        conv_dates = 52 * (d.year - stata_epoch.year) + d.days // 7
+    elif fmt in ["%tm", "tm"]:
+        d = parse_dates_safe(dates, year=True)
+        conv_dates = 12 * (d.year - stata_epoch.year) + d.month - 1
+    elif fmt in ["%tq", "tq"]:
+        d = parse_dates_safe(dates, year=True)
+        conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3
+    elif fmt in ["%th", "th"]:
+        d = parse_dates_safe(dates, year=True)
+        conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int)
+    elif fmt in ["%ty", "ty"]:
+        d = parse_dates_safe(dates, year=True)
+        conv_dates = d.year
+    else:
+        raise ValueError(f"Format {fmt} is not a known Stata date format")
+
+    conv_dates = Series(conv_dates, dtype=np.float64, copy=False)
+    missing_value = struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
+    conv_dates[bad_loc] = missing_value
+
+    return Series(conv_dates, index=index, copy=False)
+
+
+excessive_string_length_error: Final = """
+Fixed width strings in Stata .dta files are limited to 244 (or fewer)
+characters.  Column '{0}' does not satisfy this restriction. Use the
+'version=117' parameter to write the newer (Stata 13 and later) format.
+"""
+
+
+precision_loss_doc: Final = """
+Column converted from {0} to {1}, and some data are outside of the lossless
+conversion range. This may result in a loss of precision in the saved data.
+"""
+
+
+value_label_mismatch_doc: Final = """
+Stata value labels (pandas categories) must be strings. Column {0} contains
+non-string labels which will be converted to strings.  Please check that the
+Stata data file created has not lost information due to duplicate labels.
+"""
+
+
+invalid_name_doc: Final = """
+Not all pandas column names were valid Stata variable names.
+The following replacements have been made:
+
+    {0}
+
+If this is not what you expect, please make sure you have Stata-compliant
+column names in your DataFrame (strings only, max 32 characters, only
+alphanumerics and underscores, no Stata reserved words)
+"""
+
+
+categorical_conversion_warning: Final = """
+One or more series with value labels are not fully labeled. Reading this
+dataset with an iterator results in categorical variable with different
+categories. This occurs since it is not possible to know all possible values
+until the entire dataset has been read. To avoid this warning, you can either
+read dataset without an iterator, or manually convert categorical data by
+``convert_categoricals`` to False and then accessing the variable labels
+through the value_labels method of the reader.
+"""
+
+
+def _cast_to_stata_types(data: DataFrame) -> DataFrame:
+    """
+    Checks the dtypes of the columns of a pandas DataFrame for
+    compatibility with the data types and ranges supported by Stata, and
+    converts if necessary.
+
+    Parameters
+    ----------
+    data : DataFrame
+        The DataFrame to check and convert
+
+    Notes
+    -----
+    Numeric columns in Stata must be one of int8, int16, int32, float32 or
+    float64, with some additional value restrictions.  int8 and int16 columns
+    are checked for violations of the value restrictions and upcast if needed.
+    int64 data is not usable in Stata, and so it is downcast to int32 whenever
+    the value are in the int32 range, and sidecast to float64 when larger than
+    this range.  If the int64 values are outside of the range of those
+    perfectly representable as float64 values, a warning is raised.
+
+    bool columns are cast to int8.  uint columns are converted to int of the
+    same size if there is no loss in precision, otherwise are upcast to a
+    larger type.  uint64 is currently not supported since it is concerted to
+    object in a DataFrame.
+    """
+    ws = ""
+    # original, if small, if large
+    conversion_data: tuple[
+        tuple[type, type, type],
+        tuple[type, type, type],
+        tuple[type, type, type],
+        tuple[type, type, type],
+        tuple[type, type, type],
+    ] = (
+        (np.bool_, np.int8, np.int8),
+        (np.uint8, np.int8, np.int16),
+        (np.uint16, np.int16, np.int32),
+        (np.uint32, np.int32, np.int64),
+        (np.uint64, np.int64, np.float64),
+    )
+
+    float32_max = struct.unpack("<f", b"\xff\xff\xff\x7e")[0]
+    float64_max = struct.unpack("<d", b"\xff\xff\xff\xff\xff\xff\xdf\x7f")[0]
+
+    for col in data:
+        # Cast from unsupported types to supported types
+        is_nullable_int = (
+            isinstance(data[col].dtype, ExtensionDtype)
+            and data[col].dtype.kind in "iub"
+        )
+        # We need to find orig_missing before altering data below
+        orig_missing = data[col].isna()
+        if is_nullable_int:
+            fv = 0 if data[col].dtype.kind in "iu" else False
+            # Replace with NumPy-compatible column
+            data[col] = data[col].fillna(fv).astype(data[col].dtype.numpy_dtype)
+        elif isinstance(data[col].dtype, ExtensionDtype):
+            if getattr(data[col].dtype, "numpy_dtype", None) is not None:
+                data[col] = data[col].astype(data[col].dtype.numpy_dtype)
+            elif is_string_dtype(data[col].dtype):
+                # TODO could avoid converting string dtype to object here,
+                # but handle string dtype in _encode_strings
+                data[col] = data[col].astype("object")
+                # generate_table checks for None values
+                data.loc[data[col].isna(), col] = None
+
+        dtype = data[col].dtype
+        empty_df = data.shape[0] == 0
+        for c_data in conversion_data:
+            if dtype == c_data[0]:
+                if empty_df or data[col].max() <= np.iinfo(c_data[1]).max:
+                    dtype = c_data[1]
+                else:
+                    dtype = c_data[2]
+                if c_data[2] == np.int64:  # Warn if necessary
+                    if data[col].max() >= 2**53:
+                        ws = precision_loss_doc.format("uint64", "float64")
+
+                data[col] = data[col].astype(dtype)
+
+        # Check values and upcast if necessary
+
+        if dtype == np.int8 and not empty_df:
+            if data[col].max() > 100 or data[col].min() < -127:
+                data[col] = data[col].astype(np.int16)
+        elif dtype == np.int16 and not empty_df:
+            if data[col].max() > 32740 or data[col].min() < -32767:
+                data[col] = data[col].astype(np.int32)
+        elif dtype == np.int64:
+            if empty_df or (
+                data[col].max() <= 2147483620 and data[col].min() >= -2147483647
+            ):
+                data[col] = data[col].astype(np.int32)
+            else:
+                data[col] = data[col].astype(np.float64)
+                if data[col].max() >= 2**53 or data[col].min() <= -(2**53):
+                    ws = precision_loss_doc.format("int64", "float64")
+        elif dtype in (np.float32, np.float64):
+            if np.isinf(data[col]).any():
+                raise ValueError(
+                    f"Column {col} contains infinity or -infinity"
+                    "which is outside the range supported by Stata."
+                )
+            value = data[col].max()
+            if dtype == np.float32 and value > float32_max:
+                data[col] = data[col].astype(np.float64)
+            elif dtype == np.float64:
+                if value > float64_max:
+                    raise ValueError(
+                        f"Column {col} has a maximum value ({value}) outside the range "
+                        f"supported by Stata ({float64_max})"
+                    )
+        if is_nullable_int:
+            if orig_missing.any():
+                # Replace missing by Stata sentinel value
+                sentinel = StataMissingValue.BASE_MISSING_VALUES[data[col].dtype.name]
+                data.loc[orig_missing, col] = sentinel
+    if ws:
+        warnings.warn(
+            ws,
+            PossiblePrecisionLoss,
+            stacklevel=find_stack_level(),
+        )
+
+    return data
+
+
+class StataValueLabel:
+    """
+    Parse a categorical column and prepare formatted output
+
+    Parameters
+    ----------
+    catarray : Series
+        Categorical Series to encode
+    encoding : {"latin-1", "utf-8"}
+        Encoding to use for value labels.
+    """
+
+    def __init__(
+        self, catarray: Series, encoding: Literal["latin-1", "utf-8"] = "latin-1"
+    ) -> None:
+        if encoding not in ("latin-1", "utf-8"):
+            raise ValueError("Only latin-1 and utf-8 are supported.")
+        self.labname = catarray.name
+        self._encoding = encoding
+        categories = catarray.cat.categories
+        self.value_labels = enumerate(categories)
+
+        self._prepare_value_labels()
+
+    def _prepare_value_labels(self) -> None:
+        """Encode value labels."""
+
+        self.text_len = 0
+        self.txt: list[bytes] = []
+        self.n = 0
+        # Offsets (length of categories), converted to int32
+        self.off = np.array([], dtype=np.int32)
+        # Values, converted to int32
+        self.val = np.array([], dtype=np.int32)
+        self.len = 0
+
+        # Compute lengths and setup lists of offsets and labels
+        offsets: list[int] = []
+        values: list[float] = []
+        for vl in self.value_labels:
+            category: str | bytes = vl[1]
+            if not isinstance(category, str):
+                category = str(category)
+                warnings.warn(
+                    value_label_mismatch_doc.format(self.labname),
+                    ValueLabelTypeMismatch,
+                    stacklevel=find_stack_level(),
+                )
+            category = category.encode(self._encoding)
+            offsets.append(self.text_len)
+            self.text_len += len(category) + 1  # +1 for the padding
+            values.append(vl[0])
+            self.txt.append(category)
+            self.n += 1
+
+        # Ensure int32
+        self.off = np.array(offsets, dtype=np.int32)
+        self.val = np.array(values, dtype=np.int32)
+
+        # Total length
+        self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
+
+    def generate_value_label(self, byteorder: str) -> bytes:
+        """
+        Generate the binary representation of the value labels.
+
+        Parameters
+        ----------
+        byteorder : str
+            Byte order of the output
+
+        Returns
+        -------
+        value_label : bytes
+            Bytes containing the formatted value label
+        """
+        encoding = self._encoding
+        bio = BytesIO()
+        null_byte = b"\x00"
+
+        # len
+        bio.write(struct.pack(byteorder + "i", self.len))
+
+        # labname
+        labname = str(self.labname)[:32].encode(encoding)
+        lab_len = 32 if encoding not in ("utf-8", "utf8") else 128
+        labname = _pad_bytes(labname, lab_len + 1)
+        bio.write(labname)
+
+        # padding - 3 bytes
+        for i in range(3):
+            bio.write(struct.pack("c", null_byte))
+
+        # value_label_table
+        # n - int32
+        bio.write(struct.pack(byteorder + "i", self.n))
+
+        # textlen  - int32
+        bio.write(struct.pack(byteorder + "i", self.text_len))
+
+        # off - int32 array (n elements)
+        for offset in self.off:
+            bio.write(struct.pack(byteorder + "i", offset))
+
+        # val - int32 array (n elements)
+        for value in self.val:
+            bio.write(struct.pack(byteorder + "i", value))
+
+        # txt - Text labels, null terminated
+        for text in self.txt:
+            bio.write(text + null_byte)
+
+        return bio.getvalue()
+
+
+class StataNonCatValueLabel(StataValueLabel):
+    """
+    Prepare formatted version of value labels
+
+    Parameters
+    ----------
+    labname : str
+        Value label name
+    value_labels: Dictionary
+        Mapping of values to labels
+    encoding : {"latin-1", "utf-8"}
+        Encoding to use for value labels.
+    """
+
+    def __init__(
+        self,
+        labname: str,
+        value_labels: dict[float, str],
+        encoding: Literal["latin-1", "utf-8"] = "latin-1",
+    ) -> None:
+        if encoding not in ("latin-1", "utf-8"):
+            raise ValueError("Only latin-1 and utf-8 are supported.")
+
+        self.labname = labname
+        self._encoding = encoding
+        self.value_labels = sorted(  # type: ignore[assignment]
+            value_labels.items(), key=lambda x: x[0]
+        )
+        self._prepare_value_labels()
+
+
+class StataMissingValue:
+    """
+    An observation's missing value.
+
+    Parameters
+    ----------
+    value : {int, float}
+        The Stata missing value code
+
+    Notes
+    -----
+    More information: <https://www.stata.com/help.cgi?missing>
+
+    Integer missing values make the code '.', '.a', ..., '.z' to the ranges
+    101 ... 127 (for int8), 32741 ... 32767  (for int16) and 2147483621 ...
+    2147483647 (for int32).  Missing values for floating point data types are
+    more complex but the pattern is simple to discern from the following table.
+
+    np.float32 missing values (float in Stata)
+    0000007f    .
+    0008007f    .a
+    0010007f    .b
+    ...
+    00c0007f    .x
+    00c8007f    .y
+    00d0007f    .z
+
+    np.float64 missing values (double in Stata)
+    000000000000e07f    .
+    000000000001e07f    .a
+    000000000002e07f    .b
+    ...
+    000000000018e07f    .x
+    000000000019e07f    .y
+    00000000001ae07f    .z
+    """
+
+    # Construct a dictionary of missing values
+    MISSING_VALUES: dict[float, str] = {}
+    bases: Final = (101, 32741, 2147483621)
+    for b in bases:
+        # Conversion to long to avoid hash issues on 32 bit platforms #8968
+        MISSING_VALUES[b] = "."
+        for i in range(1, 27):
+            MISSING_VALUES[i + b] = "." + chr(96 + i)
+
+    float32_base: bytes = b"\x00\x00\x00\x7f"
+    increment_32: int = struct.unpack("<i", b"\x00\x08\x00\x00")[0]
+    for i in range(27):
+        key = struct.unpack("<f", float32_base)[0]
+        MISSING_VALUES[key] = "."
+        if i > 0:
+            MISSING_VALUES[key] += chr(96 + i)
+        int_value = struct.unpack("<i", struct.pack("<f", key))[0] + increment_32
+        float32_base = struct.pack("<i", int_value)
+
+    float64_base: bytes = b"\x00\x00\x00\x00\x00\x00\xe0\x7f"
+    increment_64 = struct.unpack("q", b"\x00\x00\x00\x00\x00\x01\x00\x00")[0]
+    for i in range(27):
+        key = struct.unpack("<d", float64_base)[0]
+        MISSING_VALUES[key] = "."
+        if i > 0:
+            MISSING_VALUES[key] += chr(96 + i)
+        int_value = struct.unpack("q", struct.pack("<d", key))[0] + increment_64
+        float64_base = struct.pack("q", int_value)
+
+    BASE_MISSING_VALUES: Final = {
+        "int8": 101,
+        "int16": 32741,
+        "int32": 2147483621,
+        "float32": struct.unpack("<f", float32_base)[0],
+        "float64": struct.unpack("<d", float64_base)[0],
+    }
+
+    def __init__(self, value: float) -> None:
+        self._value = value
+        # Conversion to int to avoid hash issues on 32 bit platforms #8968
+        value = int(value) if value < 2147483648 else float(value)
+        self._str = self.MISSING_VALUES[value]
+
+    @property
+    def string(self) -> str:
+        """
+        The Stata representation of the missing value: '.', '.a'..'.z'
+
+        Returns
+        -------
+        str
+            The representation of the missing value.
+        """
+        return self._str
+
+    @property
+    def value(self) -> float:
+        """
+        The binary representation of the missing value.
+
+        Returns
+        -------
+        {int, float}
+            The binary representation of the missing value.
+        """
+        return self._value
+
+    def __str__(self) -> str:
+        return self.string
+
+    def __repr__(self) -> str:
+        return f"{type(self)}({self})"
+
+    def __eq__(self, other: object) -> bool:
+        return (
+            isinstance(other, type(self))
+            and self.string == other.string
+            and self.value == other.value
+        )
+
+    @classmethod
+    def get_base_missing_value(cls, dtype: np.dtype) -> float:
+        if dtype.type is np.int8:
+            value = cls.BASE_MISSING_VALUES["int8"]
+        elif dtype.type is np.int16:
+            value = cls.BASE_MISSING_VALUES["int16"]
+        elif dtype.type is np.int32:
+            value = cls.BASE_MISSING_VALUES["int32"]
+        elif dtype.type is np.float32:
+            value = cls.BASE_MISSING_VALUES["float32"]
+        elif dtype.type is np.float64:
+            value = cls.BASE_MISSING_VALUES["float64"]
+        else:
+            raise ValueError("Unsupported dtype")
+        return value
+
+
+class StataParser:
+    def __init__(self) -> None:
+        # type          code.
+        # --------------------
+        # str1        1 = 0x01
+        # str2        2 = 0x02
+        # ...
+        # str244    244 = 0xf4
+        # byte      251 = 0xfb  (sic)
+        # int       252 = 0xfc
+        # long      253 = 0xfd
+        # float     254 = 0xfe
+        # double    255 = 0xff
+        # --------------------
+        # NOTE: the byte type seems to be reserved for categorical variables
+        # with a label, but the underlying variable is -127 to 100
+        # we're going to drop the label and cast to int
+        self.DTYPE_MAP = dict(
+            [(i, np.dtype(f"S{i}")) for i in range(1, 245)]
+            + [
+                (251, np.dtype(np.int8)),
+                (252, np.dtype(np.int16)),
+                (253, np.dtype(np.int32)),
+                (254, np.dtype(np.float32)),
+                (255, np.dtype(np.float64)),
+            ]
+        )
+        self.DTYPE_MAP_XML: dict[int, np.dtype] = {
+            32768: np.dtype(np.uint8),  # Keys to GSO
+            65526: np.dtype(np.float64),
+            65527: np.dtype(np.float32),
+            65528: np.dtype(np.int32),
+            65529: np.dtype(np.int16),
+            65530: np.dtype(np.int8),
+        }
+        self.TYPE_MAP = list(tuple(range(251)) + tuple("bhlfd"))
+        self.TYPE_MAP_XML = {
+            # Not really a Q, unclear how to handle byteswap
+            32768: "Q",
+            65526: "d",
+            65527: "f",
+            65528: "l",
+            65529: "h",
+            65530: "b",
+        }
+        # NOTE: technically, some of these are wrong. there are more numbers
+        # that can be represented. it's the 27 ABOVE and BELOW the max listed
+        # numeric data type in [U] 12.2.2 of the 11.2 manual
+        float32_min = b"\xff\xff\xff\xfe"
+        float32_max = b"\xff\xff\xff\x7e"
+        float64_min = b"\xff\xff\xff\xff\xff\xff\xef\xff"
+        float64_max = b"\xff\xff\xff\xff\xff\xff\xdf\x7f"
+        self.VALID_RANGE = {
+            "b": (-127, 100),
+            "h": (-32767, 32740),
+            "l": (-2147483647, 2147483620),
+            "f": (
+                np.float32(struct.unpack("<f", float32_min)[0]),
+                np.float32(struct.unpack("<f", float32_max)[0]),
+            ),
+            "d": (
+                np.float64(struct.unpack("<d", float64_min)[0]),
+                np.float64(struct.unpack("<d", float64_max)[0]),
+            ),
+        }
+        self.OLD_VALID_RANGE = {
+            "b": (-128, 126),
+            "h": (-32768, 32766),
+            "l": (-2147483648, 2147483646),
+            "f": (
+                np.float32(struct.unpack("<f", float32_min)[0]),
+                np.float32(struct.unpack("<f", float32_max)[0]),
+            ),
+            "d": (
+                np.float64(struct.unpack("<d", float64_min)[0]),
+                np.float64(struct.unpack("<d", float64_max)[0]),
+            ),
+        }
+
+        self.OLD_TYPE_MAPPING = {
+            98: 251,  # byte
+            105: 252,  # int
+            108: 253,  # long
+            102: 254,  # float
+            100: 255,  # double
+        }
+
+        # These missing values are the generic '.' in Stata, and are used
+        # to replace nans
+        self.MISSING_VALUES: dict[str, int | np.float32 | np.float64] = {
+            "b": 101,
+            "h": 32741,
+            "l": 2147483621,
+            "f": np.float32(struct.unpack("<f", b"\x00\x00\x00\x7f")[0]),
+            "d": np.float64(
+                struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
+            ),
+        }
+        self.NUMPY_TYPE_MAP = {
+            "b": "i1",
+            "h": "i2",
+            "l": "i4",
+            "f": "f4",
+            "d": "f8",
+            "Q": "u8",
+        }
+
+        # Reserved words cannot be used as variable names
+        self.RESERVED_WORDS = {
+            "aggregate",
+            "array",
+            "boolean",
+            "break",
+            "byte",
+            "case",
+            "catch",
+            "class",
+            "colvector",
+            "complex",
+            "const",
+            "continue",
+            "default",
+            "delegate",
+            "delete",
+            "do",
+            "double",
+            "else",
+            "eltypedef",
+            "end",
+            "enum",
+            "explicit",
+            "export",
+            "external",
+            "float",
+            "for",
+            "friend",
+            "function",
+            "global",
+            "goto",
+            "if",
+            "inline",
+            "int",
+            "local",
+            "long",
+            "NULL",
+            "pragma",
+            "protected",
+            "quad",
+            "rowvector",
+            "short",
+            "typedef",
+            "typename",
+            "virtual",
+            "_all",
+            "_N",
+            "_skip",
+            "_b",
+            "_pi",
+            "str#",
+            "in",
+            "_pred",
+            "strL",
+            "_coef",
+            "_rc",
+            "using",
+            "_cons",
+            "_se",
+            "with",
+            "_n",
+        }
+
+
+@set_module("pandas.api.typing")
+class StataReader(StataParser, abc.Iterator):
+    __doc__ = _stata_reader_doc
+
+    _path_or_buf: IO[bytes]  # Will be assigned by `_open_file`.
+
+    def __init__(
+        self,
+        path_or_buf: FilePath | ReadBuffer[bytes],
+        convert_dates: bool = True,
+        convert_categoricals: bool = True,
+        index_col: str | None = None,
+        convert_missing: bool = False,
+        preserve_dtypes: bool = True,
+        columns: Sequence[str] | None = None,
+        order_categoricals: bool = True,
+        chunksize: int | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+    ) -> None:
+        super().__init__()
+
+        # Arguments to the reader (can be temporarily overridden in
+        # calls to read).
+        self._convert_dates = convert_dates
+        self._convert_categoricals = convert_categoricals
+        self._index_col = index_col
+        self._convert_missing = convert_missing
+        self._preserve_dtypes = preserve_dtypes
+        self._columns = columns
+        self._order_categoricals = order_categoricals
+        self._original_path_or_buf = path_or_buf
+        self._compression = compression
+        self._storage_options = storage_options
+        self._encoding = ""
+        self._chunksize = chunksize
+        self._using_iterator = False
+        self._entered = False
+        if self._chunksize is None:
+            self._chunksize = 1
+        elif not isinstance(chunksize, int) or chunksize <= 0:
+            raise ValueError("chunksize must be a positive integer when set.")
+
+        # State variables for the file
+        self._close_file: Callable[[], None] | None = None
+        self._column_selector_set = False
+        self._value_label_dict: dict[str, dict[int, str]] = {}
+        self._value_labels_read = False
+        self._dtype: np.dtype | None = None
+        self._lines_read = 0
+
+        self._native_byteorder = _set_endianness(sys.byteorder)
+
+    def _ensure_open(self) -> None:
+        """
+        Ensure the file has been opened and its header data read.
+        """
+        if not hasattr(self, "_path_or_buf"):
+            self._open_file()
+
+    def _open_file(self) -> None:
+        """
+        Open the file (with compression options, etc.), and read header information.
+        """
+        if not self._entered:
+            warnings.warn(
+                "StataReader is being used without using a context manager. "
+                "Using StataReader as a context manager is the only supported method.",
+                ResourceWarning,
+                stacklevel=find_stack_level(),
+            )
+        handles = get_handle(
+            self._original_path_or_buf,
+            "rb",
+            storage_options=self._storage_options,
+            is_text=False,
+            compression=self._compression,
+        )
+        if hasattr(handles.handle, "seekable") and handles.handle.seekable():
+            # If the handle is directly seekable, use it without an extra copy.
+            self._path_or_buf = handles.handle
+            self._close_file = handles.close
+        else:
+            # Copy to memory, and ensure no encoding.
+            with handles:
+                self._path_or_buf = BytesIO(handles.handle.read())
+            self._close_file = self._path_or_buf.close
+
+        self._read_header()
+        self._setup_dtype()
+
+    def __enter__(self) -> Self:
+        """enter context manager"""
+        self._entered = True
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> None:
+        if self._close_file:
+            self._close_file()
+
+    def _set_encoding(self) -> None:
+        """
+        Set string encoding which depends on file version
+        """
+        if self._format_version < 118:
+            self._encoding = "latin-1"
+        else:
+            self._encoding = "utf-8"
+
+    def _read_int8(self) -> int:
+        return struct.unpack("b", self._path_or_buf.read(1))[0]
+
+    def _read_uint8(self) -> int:
+        return struct.unpack("B", self._path_or_buf.read(1))[0]
+
+    def _read_uint16(self) -> int:
+        return struct.unpack(f"{self._byteorder}H", self._path_or_buf.read(2))[0]
+
+    def _read_uint32(self) -> int:
+        return struct.unpack(f"{self._byteorder}I", self._path_or_buf.read(4))[0]
+
+    def _read_uint64(self) -> int:
+        return struct.unpack(f"{self._byteorder}Q", self._path_or_buf.read(8))[0]
+
+    def _read_int16(self) -> int:
+        return struct.unpack(f"{self._byteorder}h", self._path_or_buf.read(2))[0]
+
+    def _read_int32(self) -> int:
+        return struct.unpack(f"{self._byteorder}i", self._path_or_buf.read(4))[0]
+
+    def _read_int64(self) -> int:
+        return struct.unpack(f"{self._byteorder}q", self._path_or_buf.read(8))[0]
+
+    def _read_char8(self) -> bytes:
+        return struct.unpack("c", self._path_or_buf.read(1))[0]
+
+    def _read_int16_count(self, count: int) -> tuple[int, ...]:
+        return struct.unpack(
+            f"{self._byteorder}{'h' * count}",
+            self._path_or_buf.read(2 * count),
+        )
+
+    def _read_header(self) -> None:
+        first_char = self._read_char8()
+        if first_char == b"<":
+            self._read_new_header()
+        else:
+            self._read_old_header(first_char)
+
+    def _read_new_header(self) -> None:
+        # The first part of the header is common to 117 - 119.
+        self._path_or_buf.read(27)  # stata_dta><header><release>
+        self._format_version = int(self._path_or_buf.read(3))
+        if self._format_version not in [117, 118, 119]:
+            raise ValueError(_version_error.format(version=self._format_version))
+        self._set_encoding()
+        self._path_or_buf.read(21)  # </release><byteorder>
+        self._byteorder = ">" if self._path_or_buf.read(3) == b"MSF" else "<"
+        self._path_or_buf.read(15)  # </byteorder><K>
+        self._nvar = (
+            self._read_uint16() if self._format_version <= 118 else self._read_uint32()
+        )
+        self._path_or_buf.read(7)  # </K><N>
+
+        self._nobs = self._get_nobs()
+        self._path_or_buf.read(11)  # </N><label>
+        self._data_label = self._get_data_label()
+        self._path_or_buf.read(19)  # </label><timestamp>
+        self._time_stamp = self._get_time_stamp()
+        self._path_or_buf.read(26)  # </timestamp></header><map>
+        self._path_or_buf.read(8)  # 0x0000000000000000
+        self._path_or_buf.read(8)  # position of <map>
+
+        self._seek_vartypes = self._read_int64() + 16
+        self._seek_varnames = self._read_int64() + 10
+        self._seek_sortlist = self._read_int64() + 10
+        self._seek_formats = self._read_int64() + 9
+        self._seek_value_label_names = self._read_int64() + 19
+
+        # Requires version-specific treatment
+        self._seek_variable_labels = self._get_seek_variable_labels()
+
+        self._path_or_buf.read(8)  # <characteristics>
+        self._data_location = self._read_int64() + 6
+        self._seek_strls = self._read_int64() + 7
+        self._seek_value_labels = self._read_int64() + 14
+
+        self._typlist, self._dtyplist = self._get_dtypes(self._seek_vartypes)
+
+        self._path_or_buf.seek(self._seek_varnames)
+        self._varlist = self._get_varlist()
+
+        self._path_or_buf.seek(self._seek_sortlist)
+        self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]
+
+        self._path_or_buf.seek(self._seek_formats)
+        self._fmtlist = self._get_fmtlist()
+
+        self._path_or_buf.seek(self._seek_value_label_names)
+        self._lbllist = self._get_lbllist()
+
+        self._path_or_buf.seek(self._seek_variable_labels)
+        self._variable_labels = self._get_variable_labels()
+
+    # Get data type information, works for versions 117-119.
+    def _get_dtypes(
+        self, seek_vartypes: int
+    ) -> tuple[list[int | str], list[str | np.dtype]]:
+        self._path_or_buf.seek(seek_vartypes)
+        typlist = []
+        dtyplist = []
+        for _ in range(self._nvar):
+            typ = self._read_uint16()
+            if typ <= 2045:
+                typlist.append(typ)
+                dtyplist.append(str(typ))
+            else:
+                try:
+                    typlist.append(self.TYPE_MAP_XML[typ])  # type: ignore[arg-type]
+                    dtyplist.append(self.DTYPE_MAP_XML[typ])  # type: ignore[arg-type]
+                except KeyError as err:
+                    raise ValueError(f"cannot convert stata types [{typ}]") from err
+
+        return typlist, dtyplist  # type: ignore[return-value]
+
+    def _get_varlist(self) -> list[str]:
+        # 33 in order formats, 129 in formats 118 and 119
+        b = 33 if self._format_version < 118 else 129
+        return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
+
+    # Returns the format list
+    def _get_fmtlist(self) -> list[str]:
+        if self._format_version >= 118:
+            b = 57
+        elif self._format_version > 113:
+            b = 49
+        elif self._format_version > 104:
+            b = 12
+        else:
+            b = 7
+
+        return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
+
+    # Returns the label list
+    def _get_lbllist(self) -> list[str]:
+        if self._format_version >= 118:
+            b = 129
+        elif self._format_version > 108:
+            b = 33
+        else:
+            b = 9
+        return [self._decode(self._path_or_buf.read(b)) for _ in range(self._nvar)]
+
+    def _get_variable_labels(self) -> list[str]:
+        if self._format_version >= 118:
+            vlblist = [
+                self._decode(self._path_or_buf.read(321)) for _ in range(self._nvar)
+            ]
+        elif self._format_version > 105:
+            vlblist = [
+                self._decode(self._path_or_buf.read(81)) for _ in range(self._nvar)
+            ]
+        else:
+            vlblist = [
+                self._decode(self._path_or_buf.read(32)) for _ in range(self._nvar)
+            ]
+        return vlblist
+
+    def _get_nobs(self) -> int:
+        if self._format_version >= 118:
+            return self._read_uint64()
+        elif self._format_version >= 103:
+            return self._read_uint32()
+        else:
+            return self._read_uint16()
+
+    def _get_data_label(self) -> str:
+        if self._format_version >= 118:
+            strlen = self._read_uint16()
+            return self._decode(self._path_or_buf.read(strlen))
+        elif self._format_version == 117:
+            strlen = self._read_int8()
+            return self._decode(self._path_or_buf.read(strlen))
+        elif self._format_version > 105:
+            return self._decode(self._path_or_buf.read(81))
+        else:
+            return self._decode(self._path_or_buf.read(32))
+
+    def _get_time_stamp(self) -> str:
+        if self._format_version >= 118:
+            strlen = self._read_int8()
+            return self._path_or_buf.read(strlen).decode("utf-8")
+        elif self._format_version == 117:
+            strlen = self._read_int8()
+            return self._decode(self._path_or_buf.read(strlen))
+        elif self._format_version > 104:
+            return self._decode(self._path_or_buf.read(18))
+        else:
+            raise ValueError
+
+    def _get_seek_variable_labels(self) -> int:
+        if self._format_version == 117:
+            self._path_or_buf.read(8)  # <variable_labels>, throw away
+            # Stata 117 data files do not follow the described format.  This is
+            # a work around that uses the previous label, 33 bytes for each
+            # variable, 20 for the closing tag and 17 for the opening tag
+            return self._seek_value_label_names + (33 * self._nvar) + 20 + 17
+        elif self._format_version >= 118:
+            return self._read_int64() + 17
+        else:
+            raise ValueError
+
+    def _read_old_header(self, first_char: bytes) -> None:
+        self._format_version = int(first_char[0])
+        if self._format_version not in [
+            102,
+            103,
+            104,
+            105,
+            108,
+            110,
+            111,
+            113,
+            114,
+            115,
+        ]:
+            raise ValueError(_version_error.format(version=self._format_version))
+        self._set_encoding()
+        # Note 102 format will have a zero in this header position, so support
+        # relies on little-endian being set whenever this value isn't one,
+        # even though for later releases strictly speaking the value should
+        # be either one or two to be valid
+        self._byteorder = ">" if self._read_int8() == 0x1 else "<"
+        self._filetype = self._read_int8()
+        self._path_or_buf.read(1)  # unused
+
+        self._nvar = self._read_uint16()
+        self._nobs = self._get_nobs()
+
+        self._data_label = self._get_data_label()
+
+        if self._format_version >= 105:
+            self._time_stamp = self._get_time_stamp()
+
+        # descriptors
+        if self._format_version >= 111:
+            typlist = [int(c) for c in self._path_or_buf.read(self._nvar)]
+        else:
+            buf = self._path_or_buf.read(self._nvar)
+            typlistb = np.frombuffer(buf, dtype=np.uint8)
+            typlist = []
+            for tp in typlistb:
+                if tp in self.OLD_TYPE_MAPPING:
+                    typlist.append(self.OLD_TYPE_MAPPING[tp])
+                else:
+                    typlist.append(tp - 127)  # bytes
+
+        try:
+            self._typlist = [self.TYPE_MAP[typ] for typ in typlist]
+        except ValueError as err:
+            invalid_types = ",".join([str(x) for x in typlist])
+            raise ValueError(f"cannot convert stata types [{invalid_types}]") from err
+        try:
+            self._dtyplist = [self.DTYPE_MAP[typ] for typ in typlist]
+        except ValueError as err:
+            invalid_dtypes = ",".join([str(x) for x in typlist])
+            raise ValueError(f"cannot convert stata dtypes [{invalid_dtypes}]") from err
+
+        if self._format_version > 108:
+            self._varlist = [
+                self._decode(self._path_or_buf.read(33)) for _ in range(self._nvar)
+            ]
+        else:
+            self._varlist = [
+                self._decode(self._path_or_buf.read(9)) for _ in range(self._nvar)
+            ]
+        self._srtlist = self._read_int16_count(self._nvar + 1)[:-1]
+
+        self._fmtlist = self._get_fmtlist()
+
+        self._lbllist = self._get_lbllist()
+
+        self._variable_labels = self._get_variable_labels()
+
+        # ignore expansion fields (Format 105 and later)
+        # When reading, read five bytes; the last four bytes now tell you
+        # the size of the next read, which you discard.  You then continue
+        # like this until you read 5 bytes of zeros.
+
+        if self._format_version > 104:
+            while True:
+                data_type = self._read_int8()
+                if self._format_version > 108:
+                    data_len = self._read_int32()
+                else:
+                    data_len = self._read_int16()
+                if data_type == 0:
+                    break
+                self._path_or_buf.read(data_len)
+
+        # necessary data to continue parsing
+        self._data_location = self._path_or_buf.tell()
+
+    def _setup_dtype(self) -> np.dtype:
+        """Map between numpy and state dtypes"""
+        if self._dtype is not None:
+            return self._dtype
+
+        dtypes = []  # Convert struct data types to numpy data type
+        for i, typ in enumerate(self._typlist):
+            if typ in self.NUMPY_TYPE_MAP:
+                typ = cast(str, typ)  # only strs in NUMPY_TYPE_MAP
+                dtypes.append((f"s{i}", f"{self._byteorder}{self.NUMPY_TYPE_MAP[typ]}"))
+            else:
+                dtypes.append((f"s{i}", f"S{typ}"))
+        self._dtype = np.dtype(dtypes)
+
+        return self._dtype
+
+    def _decode(self, s: bytes) -> str:
+        # have bytes not strings, so must decode
+        s = s.partition(b"\0")[0]
+        try:
+            return s.decode(self._encoding)
+        except UnicodeDecodeError:
+            # GH 25960, fallback to handle incorrect format produced when 117
+            # files are converted to 118 files in Stata
+            encoding = self._encoding
+            msg = f"""
+One or more strings in the dta file could not be decoded using {encoding}, and
+so the fallback encoding of latin-1 is being used.  This can happen when a file
+has been incorrectly encoded by Stata or some other software. You should verify
+the string values returned are correct."""
+            warnings.warn(
+                msg,
+                UnicodeWarning,
+                stacklevel=find_stack_level(),
+            )
+            return s.decode("latin-1")
+
+    def _read_new_value_labels(self) -> None:
+        """Reads value labels with variable length strings (108 and later format)"""
+        if self._format_version >= 117:
+            self._path_or_buf.seek(self._seek_value_labels)
+        else:
+            assert self._dtype is not None
+            offset = self._nobs * self._dtype.itemsize
+            self._path_or_buf.seek(self._data_location + offset)
+
+        while True:
+            if self._format_version >= 117:
+                if self._path_or_buf.read(5) == b"</val":  # <lbl>
+                    break  # end of value label table
+
+            slength = self._path_or_buf.read(4)
+            if not slength:
+                break  # end of value label table (format < 117), or end-of-file
+            if self._format_version == 108:
+                labname = self._decode(self._path_or_buf.read(9))
+            elif self._format_version <= 117:
+                labname = self._decode(self._path_or_buf.read(33))
+            else:
+                labname = self._decode(self._path_or_buf.read(129))
+            self._path_or_buf.read(3)  # padding
+
+            n = self._read_uint32()
+            txtlen = self._read_uint32()
+            off = np.frombuffer(
+                self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
+            )
+            val = np.frombuffer(
+                self._path_or_buf.read(4 * n), dtype=f"{self._byteorder}i4", count=n
+            )
+            ii = np.argsort(off)
+            off = off[ii]
+            val = val[ii]
+            txt = self._path_or_buf.read(txtlen)
+            self._value_label_dict[labname] = {}
+            for i in range(n):
+                end = off[i + 1] if i < n - 1 else txtlen
+                self._value_label_dict[labname][val[i]] = self._decode(
+                    txt[off[i] : end]
+                )
+
+            if self._format_version >= 117:
+                self._path_or_buf.read(6)  # </lbl>
+
+    def _read_old_value_labels(self) -> None:
+        """Reads value labels with fixed-length strings (105 and earlier format)"""
+        assert self._dtype is not None
+        offset = self._nobs * self._dtype.itemsize
+        self._path_or_buf.seek(self._data_location + offset)
+
+        while True:
+            if not self._path_or_buf.read(2):
+                # end-of-file may have been reached, if so stop here
+                break
+
+            # otherwise back up and read again, taking byteorder into account
+            self._path_or_buf.seek(-2, os.SEEK_CUR)
+            n = self._read_uint16()
+            labname = self._decode(self._path_or_buf.read(9))
+            self._path_or_buf.read(1)  # padding
+            codes = np.frombuffer(
+                self._path_or_buf.read(2 * n), dtype=f"{self._byteorder}i2", count=n
+            )
+            self._value_label_dict[labname] = {}
+            for i in range(n):
+                self._value_label_dict[labname][codes[i]] = self._decode(
+                    self._path_or_buf.read(8)
+                )
+
+    def _read_value_labels(self) -> None:
+        self._ensure_open()
+        if self._value_labels_read:
+            # Don't read twice
+            return
+
+        if self._format_version >= 108:
+            self._read_new_value_labels()
+        else:
+            self._read_old_value_labels()
+        self._value_labels_read = True
+
+    def _read_strls(self) -> None:
+        self._path_or_buf.seek(self._seek_strls)
+        # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
+        self.GSO = {"0": ""}
+        while True:
+            if self._path_or_buf.read(3) != b"GSO":
+                break
+
+            if self._format_version == 117:
+                v_o = self._read_uint64()
+            else:
+                buf = self._path_or_buf.read(12)
+                # Only tested on little endian machine.
+                v_size = 2 if self._format_version == 118 else 3
+                if self._byteorder == "<":
+                    buf = buf[0:v_size] + buf[4 : (12 - v_size)]
+                else:
+                    buf = buf[4 - v_size : 4] + buf[(4 + v_size) :]
+                v_o = struct.unpack(f"{self._byteorder}Q", buf)[0]
+            typ = self._read_uint8()
+            length = self._read_uint32()
+            va = self._path_or_buf.read(length)
+            if typ == 130:
+                decoded_va = va[0:-1].decode(self._encoding)
+            else:
+                # Stata says typ 129 can be binary, so use str
+                decoded_va = str(va)
+                # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
+            self.GSO[str(v_o)] = decoded_va
+
+    def __next__(self) -> DataFrame:
+        self._using_iterator = True
+        return self.read(nrows=self._chunksize)
+
+    def get_chunk(self, size: int | None = None) -> DataFrame:
+        """
+        Reads lines from Stata file and returns as dataframe
+
+        Parameters
+        ----------
+        size : int, defaults to None
+            Number of lines to read.  If None, reads whole file.
+
+        Returns
+        -------
+        DataFrame
+        """
+        if size is None:
+            size = self._chunksize
+        return self.read(nrows=size)
+
+    def read(
+        self,
+        nrows: int | None = None,
+        convert_dates: bool | None = None,
+        convert_categoricals: bool | None = None,
+        index_col: str | None = None,
+        convert_missing: bool | None = None,
+        preserve_dtypes: bool | None = None,
+        columns: Sequence[str] | None = None,
+        order_categoricals: bool | None = None,
+    ) -> DataFrame:
+        """
+        Reads observations from Stata file, converting them into a dataframe
+
+        Parameters
+        ----------
+        nrows : int
+            Number of lines to read from data file, if None read whole file.
+        convert_dates : bool, default True
+            Convert date variables to DataFrame time values.
+        convert_categoricals : bool, default True
+            Read value labels and convert columns to Categorical/Factor variables.
+        index_col : str, optional
+            Column to set as index.
+        convert_missing : bool, default False
+            Flag indicating whether to convert missing values to their Stata
+            representations.  If False, missing values are replaced with nan.
+            If True, columns containing missing values are returned with
+            object data types and missing values are represented by
+            StataMissingValue objects.
+        preserve_dtypes : bool, default True
+            Preserve Stata datatypes. If False, numeric data are upcast to pandas
+            default types for foreign data (float64 or int64).
+        columns : list or None
+            Columns to retain.  Columns will be returned in the given order.  None
+            returns all columns.
+        order_categoricals : bool, default True
+            Flag indicating whether converted categorical data are ordered.
+
+        Returns
+        -------
+        DataFrame
+        """
+        self._ensure_open()
+
+        # Handle options
+        if convert_dates is None:
+            convert_dates = self._convert_dates
+        if convert_categoricals is None:
+            convert_categoricals = self._convert_categoricals
+        if convert_missing is None:
+            convert_missing = self._convert_missing
+        if preserve_dtypes is None:
+            preserve_dtypes = self._preserve_dtypes
+        if columns is None:
+            columns = self._columns
+        if order_categoricals is None:
+            order_categoricals = self._order_categoricals
+        if index_col is None:
+            index_col = self._index_col
+        if nrows is None:
+            nrows = self._nobs
+
+        # Handle empty file or chunk.  If reading incrementally raise
+        # StopIteration.  If reading the whole thing return an empty
+        # data frame.
+        if (self._nobs == 0) and nrows == 0:
+            data = DataFrame(columns=self._varlist)
+            # Apply dtypes correctly
+            for i, col in enumerate(data.columns):
+                dt = self._dtyplist[i]
+                if isinstance(dt, np.dtype):
+                    if dt.char != "S":
+                        data[col] = data[col].astype(dt)
+            if columns is not None:
+                data = self._do_select_columns(data, columns)
+            return data
+
+        if (self._format_version >= 117) and (not self._value_labels_read):
+            self._read_strls()
+
+        # Read data
+        assert self._dtype is not None
+        dtype = self._dtype
+        max_read_len = (self._nobs - self._lines_read) * dtype.itemsize
+        read_len = nrows * dtype.itemsize
+        read_len = min(read_len, max_read_len)
+        if read_len <= 0:
+            # Iterator has finished, should never be here unless
+            # we are reading the file incrementally
+            if convert_categoricals:
+                self._read_value_labels()
+            raise StopIteration
+        offset = self._lines_read * dtype.itemsize
+        self._path_or_buf.seek(self._data_location + offset)
+        read_lines = min(nrows, self._nobs - self._lines_read)
+        raw_data = np.frombuffer(
+            self._path_or_buf.read(read_len), dtype=dtype, count=read_lines
+        )
+
+        self._lines_read += read_lines
+
+        # if necessary, swap the byte order to native here
+        if self._byteorder != self._native_byteorder:
+            raw_data = raw_data.byteswap().view(raw_data.dtype.newbyteorder())
+
+        if convert_categoricals:
+            self._read_value_labels()
+
+        if len(raw_data) == 0:
+            data = DataFrame(columns=self._varlist)
+        else:
+            data = DataFrame.from_records(raw_data)
+            data.columns = Index(self._varlist)
+
+        # If index is not specified, use actual row number rather than
+        # restarting at 0 for each chunk.
+        if index_col is None:
+            data.index = RangeIndex(
+                self._lines_read - read_lines, self._lines_read
+            )  # set attr instead of set_index to avoid copy
+
+        if columns is not None:
+            data = self._do_select_columns(data, columns)
+
+        # Decode strings
+        for col, typ in zip(data, self._typlist, strict=True):
+            if isinstance(typ, int):
+                data[col] = data[col].apply(self._decode)
+
+        data = self._insert_strls(data)
+
+        # Convert columns (if needed) to match input type
+        valid_dtypes = [i for i, dtyp in enumerate(self._dtyplist) if dtyp is not None]
+        object_type = np.dtype(object)
+        for idx in valid_dtypes:
+            dtype = data.iloc[:, idx].dtype
+            if dtype not in (object_type, self._dtyplist[idx]):
+                data.isetitem(idx, data.iloc[:, idx].astype(dtype))
+
+        data = self._do_convert_missing(data, convert_missing)
+
+        if convert_dates:
+            for i, fmt in enumerate(self._fmtlist):
+                if any(fmt.startswith(date_fmt) for date_fmt in _date_formats):
+                    data.isetitem(
+                        i, _stata_elapsed_date_to_datetime_vec(data.iloc[:, i], fmt)
+                    )
+
+        if convert_categoricals:
+            data = self._do_convert_categoricals(
+                data, self._value_label_dict, self._lbllist, order_categoricals
+            )
+
+        if not preserve_dtypes:
+            retyped_data = []
+            convert = False
+            for col in data:
+                dtype = data[col].dtype
+                if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
+                    dtype = np.dtype(np.float64)
+                    convert = True
+                elif dtype in (
+                    np.dtype(np.int8),
+                    np.dtype(np.int16),
+                    np.dtype(np.int32),
+                ):
+                    dtype = np.dtype(np.int64)
+                    convert = True
+                retyped_data.append((col, data[col].astype(dtype)))
+            if convert:
+                data = DataFrame.from_dict(dict(retyped_data))
+
+        if index_col is not None:
+            data = data.set_index(data.pop(index_col))
+
+        return data
+
+    def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFrame:
+        # missing code for double was different in version 105 and prior
+        old_missingdouble = float.fromhex("0x1.0p333")
+
+        # Check for missing values, and replace if found
+        replacements = {}
+        for i in range(len(data.columns)):
+            fmt = self._typlist[i]
+            # recode instances of the old missing code to the currently used value
+            if self._format_version <= 105 and fmt == "d":
+                data.iloc[:, i] = data.iloc[:, i].replace(
+                    old_missingdouble, self.MISSING_VALUES["d"]
+                )
+
+            if self._format_version <= 111:
+                if fmt not in self.OLD_VALID_RANGE:
+                    continue
+
+                fmt = cast(str, fmt)  # only strs in OLD_VALID_RANGE
+                nmin, nmax = self.OLD_VALID_RANGE[fmt]
+            else:
+                if fmt not in self.VALID_RANGE:
+                    continue
+
+                fmt = cast(str, fmt)  # only strs in VALID_RANGE
+                nmin, nmax = self.VALID_RANGE[fmt]
+            series = data.iloc[:, i]
+
+            # appreciably faster to do this with ndarray instead of Series
+            svals = series._values
+            missing = (svals < nmin) | (svals > nmax)
+
+            if not missing.any():
+                continue
+
+            if convert_missing:  # Replacement follows Stata notation
+                missing_loc = np.nonzero(np.asarray(missing))[0]
+                umissing, umissing_loc = np.unique(series[missing], return_inverse=True)
+                replacement = Series(series, dtype=object)
+                for j, um in enumerate(umissing):
+                    if self._format_version <= 111:
+                        missing_value = StataMissingValue(
+                            float(self.MISSING_VALUES[fmt])
+                        )
+                    else:
+                        missing_value = StataMissingValue(um)
+
+                    loc = missing_loc[umissing_loc == j]
+                    replacement.iloc[loc] = missing_value
+            else:  # All replacements are identical
+                dtype = series.dtype
+                if dtype not in (np.float32, np.float64):
+                    dtype = np.float64
+                replacement = Series(series, dtype=dtype)
+                # Note: operating on ._values is much faster than directly
+                # TODO: can we fix that?
+                replacement._values[missing] = np.nan
+            replacements[i] = replacement
+        if replacements:
+            for idx, value in replacements.items():
+                data.isetitem(idx, value)
+        return data
+
+    def _insert_strls(self, data: DataFrame) -> DataFrame:
+        if not hasattr(self, "GSO") or len(self.GSO) == 0:
+            return data
+        for i, typ in enumerate(self._typlist):
+            if typ != "Q":
+                continue
+            # Wrap v_o in a string to allow uint64 values as keys on 32bit OS
+            data.isetitem(i, [self.GSO[str(k)] for k in data.iloc[:, i]])
+        return data
+
+    def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFrame:
+        if not self._column_selector_set:
+            column_set = set(columns)
+            if len(column_set) != len(columns):
+                raise ValueError("columns contains duplicate entries")
+            unmatched = column_set.difference(data.columns)
+            if unmatched:
+                joined = ", ".join(list(unmatched))
+                raise ValueError(
+                    "The following columns were not "
+                    f"found in the Stata data set: {joined}"
+                )
+            # Copy information for retained columns for later processing
+            dtyplist = []
+            typlist = []
+            fmtlist = []
+            lbllist = []
+            for col in columns:
+                i = data.columns.get_loc(col)  # type: ignore[no-untyped-call]
+                dtyplist.append(self._dtyplist[i])
+                typlist.append(self._typlist[i])
+                fmtlist.append(self._fmtlist[i])
+                lbllist.append(self._lbllist[i])
+
+            self._dtyplist = dtyplist
+            self._typlist = typlist
+            self._fmtlist = fmtlist
+            self._lbllist = lbllist
+            self._column_selector_set = True
+
+        return data[columns]
+
+    def _do_convert_categoricals(
+        self,
+        data: DataFrame,
+        value_label_dict: dict[str, dict[int, str]],
+        lbllist: Sequence[str],
+        order_categoricals: bool,
+    ) -> DataFrame:
+        """
+        Converts categorical columns to Categorical type.
+        """
+        if not value_label_dict:
+            return data
+        cat_converted_data = []
+        for col, label in zip(data, lbllist, strict=True):
+            if label in value_label_dict:
+                # Explicit call with ordered=True
+                vl = value_label_dict[label]
+                keys = np.array(list(vl.keys()))
+                column = data[col]
+                key_matches = column.isin(keys)
+                if self._using_iterator and key_matches.all():
+                    initial_categories: np.ndarray | None = keys
+                    # If all categories are in the keys and we are iterating,
+                    # use the same keys for all chunks. If some are missing
+                    # value labels, then we will fall back to the categories
+                    # varying across chunks.
+                else:
+                    if self._using_iterator:
+                        # warn is using an iterator
+                        warnings.warn(
+                            categorical_conversion_warning,
+                            CategoricalConversionWarning,
+                            stacklevel=find_stack_level(),
+                        )
+                    initial_categories = None
+                cat_data = Categorical(
+                    column, categories=initial_categories, ordered=order_categoricals
+                )
+                if initial_categories is None:
+                    # If None here, then we need to match the cats in the Categorical
+                    categories = []
+                    for category in cat_data.categories:
+                        if category in vl:
+                            categories.append(vl[category])
+                        else:
+                            categories.append(category)
+                else:
+                    # If all cats are matched, we can use the values
+                    categories = list(vl.values())
+                try:
+                    # Try to catch duplicate categories
+                    # TODO: if we get a non-copying rename_categories, use that
+                    cat_data = cat_data.rename_categories(categories)
+                except ValueError as err:
+                    vc = Series(categories, copy=False).value_counts()
+                    repeated_cats = list(vc.index[vc > 1])
+                    repeats = "-" * 80 + "\n" + "\n".join(repeated_cats)
+                    # GH 25772
+                    msg = f"""
+Value labels for column {col} are not unique. These cannot be converted to
+pandas categoricals.
+
+Either read the file with `convert_categoricals` set to False or use the
+low level interface in `StataReader` to separately read the values and the
+value_labels.
+
+The repeated labels are:
+{repeats}
+"""
+                    raise ValueError(msg) from err
+                # TODO: is the next line needed above in the data(...) method?
+                cat_series = Series(cat_data, index=data.index, copy=False)
+                cat_converted_data.append((col, cat_series))
+            else:
+                cat_converted_data.append((col, data[col]))
+        data = DataFrame(dict(cat_converted_data), copy=False)
+        return data
+
+    @property
+    def data_label(self) -> str:
+        """
+        Return data label of Stata file.
+
+        The data label is a descriptive string associated with the dataset
+        stored in the Stata file. This property provides access to that
+        label, if one is present.
+
+        See Also
+        --------
+        io.stata.StataReader.variable_labels : Return a dict associating each variable
+            name with corresponding label.
+        DataFrame.to_stata : Export DataFrame object to Stata dta format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([(1,)], columns=["variable"])
+        >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
+        >>> data_label = "This is a data file."
+        >>> path = "/My_path/filename.dta"
+        >>> df.to_stata(
+        ...     path,
+        ...     time_stamp=time_stamp,  # doctest: +SKIP
+        ...     data_label=data_label,  # doctest: +SKIP
+        ...     version=None,
+        ... )  # doctest: +SKIP
+        >>> with pd.io.stata.StataReader(path) as reader:  # doctest: +SKIP
+        ...     print(reader.data_label)  # doctest: +SKIP
+        This is a data file.
+        """
+        self._ensure_open()
+        return self._data_label
+
+    @property
+    def time_stamp(self) -> str:
+        """
+        Return time stamp of Stata file.
+        """
+        self._ensure_open()
+        return self._time_stamp
+
+    def variable_labels(self) -> dict[str, str]:
+        """
+        Return a dict associating each variable name with corresponding label.
+
+        This method retrieves variable labels from a Stata file. Variable labels are
+        mappings between variable names and their corresponding descriptive labels
+        in a Stata dataset.
+
+        Returns
+        -------
+        dict
+            A python dictionary.
+
+        See Also
+        --------
+        read_stata : Read Stata file into DataFrame.
+        DataFrame.to_stata : Export DataFrame object to Stata dta format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"])
+        >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
+        >>> path = "/My_path/filename.dta"
+        >>> variable_labels = {"col_1": "This is an example"}
+        >>> df.to_stata(
+        ...     path,
+        ...     time_stamp=time_stamp,  # doctest: +SKIP
+        ...     variable_labels=variable_labels,
+        ...     version=None,
+        ... )  # doctest: +SKIP
+        >>> with pd.io.stata.StataReader(path) as reader:  # doctest: +SKIP
+        ...     print(reader.variable_labels())  # doctest: +SKIP
+        {'index': '', 'col_1': 'This is an example', 'col_2': ''}
+        >>> pd.read_stata(path)  # doctest: +SKIP
+            index col_1 col_2
+        0       0    1    2
+        1       1    3    4
+        """
+        self._ensure_open()
+        return dict(zip(self._varlist, self._variable_labels, strict=True))
+
+    def value_labels(self) -> dict[str, dict[int, str]]:
+        """
+        Return a nested dict associating each variable name to its value and label.
+
+        This method retrieves the value labels from a Stata file. Value labels are
+        mappings between the coded values and their corresponding descriptive labels
+        in a Stata dataset.
+
+        Returns
+        -------
+        dict
+            A python dictionary.
+
+        See Also
+        --------
+        read_stata : Read Stata file into DataFrame.
+        DataFrame.to_stata : Export DataFrame object to Stata dta format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["col_1", "col_2"])
+        >>> time_stamp = pd.Timestamp(2000, 2, 29, 14, 21)
+        >>> path = "/My_path/filename.dta"
+        >>> value_labels = {"col_1": {3: "x"}}
+        >>> df.to_stata(
+        ...     path,
+        ...     time_stamp=time_stamp,  # doctest: +SKIP
+        ...     value_labels=value_labels,
+        ...     version=None,
+        ... )  # doctest: +SKIP
+        >>> with pd.io.stata.StataReader(path) as reader:  # doctest: +SKIP
+        ...     print(reader.value_labels())  # doctest: +SKIP
+        {'col_1': {3: 'x'}}
+        >>> pd.read_stata(path)  # doctest: +SKIP
+            index col_1 col_2
+        0       0    1    2
+        1       1    x    4
+        """
+        if not self._value_labels_read:
+            self._read_value_labels()
+
+        return self._value_label_dict
+
+
+@set_module("pandas")
+def read_stata(
+    filepath_or_buffer: FilePath | ReadBuffer[bytes],
+    *,
+    convert_dates: bool = True,
+    convert_categoricals: bool = True,
+    index_col: str | None = None,
+    convert_missing: bool = False,
+    preserve_dtypes: bool = True,
+    columns: Sequence[str] | None = None,
+    order_categoricals: bool = True,
+    chunksize: int | None = None,
+    iterator: bool = False,
+    compression: CompressionOptions = "infer",
+    storage_options: StorageOptions | None = None,
+) -> DataFrame | StataReader:
+    """
+    Read Stata file into DataFrame.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file. For file URLs, a host is
+        expected. A local file could be: ``file://localhost/path/to/table.dta``.
+
+        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
+
+        By file-like object, we refer to objects with a ``read()`` method,
+        such as a file handle (e.g. via builtin ``open`` function)
+        or ``StringIO``.
+    convert_dates : bool, default True
+        Convert date variables to DataFrame time values.
+    convert_categoricals : bool, default True
+        Read value labels and convert columns to Categorical/Factor variables.
+    index_col : str, optional
+        Column to set as index.
+    convert_missing : bool, default False
+        Flag indicating whether to convert missing values to their Stata
+        representations.  If False, missing values are replaced with nan.
+        If True, columns containing missing values are returned with
+        object data types and missing values are represented by
+        StataMissingValue objects.
+    preserve_dtypes : bool, default True
+        Preserve Stata datatypes. If False, numeric data are upcast to pandas
+        default types for foreign data (float64 or int64).
+    columns : list or None
+        Columns to retain.  Columns will be returned in the given order.  None
+        returns all columns.
+    order_categoricals : bool, default True
+        Flag indicating whether converted categorical data are ordered.
+    chunksize : int, default None
+        Return StataReader object for iterations, returns chunks with
+        given number of lines.
+    iterator : bool, default False
+        Return StataReader object.
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'filepath_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one
+        data file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``} and
+        other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard decompression using a
+        custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    Returns
+    -------
+    DataFrame, pandas.api.typing.StataReader
+        If iterator or chunksize, returns StataReader, else DataFrame.
+
+    See Also
+    --------
+    io.stata.StataReader : Low-level reader for Stata data files.
+    DataFrame.to_stata: Export Stata data files.
+
+    Notes
+    -----
+    Categorical variables read through an iterator may not have the same
+    categories and dtype. This occurs when  a variable stored in a DTA
+    file is associated to an incomplete set of value labels that only
+    label a strict subset of the values.
+
+    Examples
+    --------
+
+    Creating a dummy stata for this example
+
+    >>> df = pd.DataFrame(
+    ...     {
+    ...         "animal": ["falcon", "parrot", "falcon", "parrot"],
+    ...         "speed": [350, 18, 361, 15],
+    ...     }
+    ... )  # doctest: +SKIP
+    >>> df.to_stata("animals.dta")  # doctest: +SKIP
+
+    Read a Stata dta file:
+
+    >>> df = pd.read_stata("animals.dta")  # doctest: +SKIP
+
+    Read a Stata dta file in 10,000 line chunks:
+
+    >>> values = np.random.randint(
+    ...     0, 10, size=(20_000, 1), dtype="uint8"
+    ... )  # doctest: +SKIP
+    >>> df = pd.DataFrame(values, columns=["i"])  # doctest: +SKIP
+    >>> df.to_stata("filename.dta")  # doctest: +SKIP
+
+    >>> with pd.read_stata('filename.dta', chunksize=10000) as itr:  # doctest: +SKIP
+    >>>     for chunk in itr:
+    ...         # Operate on a single chunk, e.g., chunk.mean()
+    ...         pass  # doctest: +SKIP
+    """
+    reader = StataReader(
+        filepath_or_buffer,
+        convert_dates=convert_dates,
+        convert_categoricals=convert_categoricals,
+        index_col=index_col,
+        convert_missing=convert_missing,
+        preserve_dtypes=preserve_dtypes,
+        columns=columns,
+        order_categoricals=order_categoricals,
+        chunksize=chunksize,
+        storage_options=storage_options,
+        compression=compression,
+    )
+
+    if iterator or chunksize:
+        return reader
+
+    with reader:
+        return reader.read()
+
+
+def _set_endianness(endianness: str) -> str:
+    if endianness.lower() in ["<", "little"]:
+        return "<"
+    elif endianness.lower() in [">", "big"]:
+        return ">"
+    else:  # pragma : no cover
+        raise ValueError(f"Endianness {endianness} not understood")
+
+
+def _pad_bytes(name: AnyStr, length: int) -> AnyStr:
+    """
+    Take a char string and pads it with null bytes until it's length chars.
+    """
+    if isinstance(name, bytes):
+        return name + b"\x00" * (length - len(name))
+    return name + "\x00" * (length - len(name))
+
+
+def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
+    """
+    Convert from one of the stata date formats to a type in TYPE_MAP.
+    """
+    if fmt in [
+        "tc",
+        "%tc",
+        "td",
+        "%td",
+        "tw",
+        "%tw",
+        "tm",
+        "%tm",
+        "tq",
+        "%tq",
+        "th",
+        "%th",
+        "ty",
+        "%ty",
+    ]:
+        return np.dtype(np.float64)  # Stata expects doubles for SIFs
+    else:
+        raise NotImplementedError(f"Format {fmt} not implemented")
+
+
+def _maybe_convert_to_int_keys(convert_dates: dict, varlist: list[Hashable]) -> dict:
+    new_dict = {}
+    for key, value in convert_dates.items():
+        if not value.startswith("%"):  # make sure proper fmts
+            convert_dates[key] = "%" + value
+        if key in varlist:
+            new_dict[varlist.index(key)] = convert_dates[key]
+        else:
+            if not isinstance(key, int):
+                raise ValueError("convert_dates key must be a column or an integer")
+            new_dict[key] = convert_dates[key]
+    return new_dict
+
+
+def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int:
+    """
+    Convert dtype types to stata types. Returns the byte of the given ordinal.
+    See TYPE_MAP and comments for an explanation. This is also explained in
+    the dta spec.
+    1 - 244 are strings of this length
+                         Pandas    Stata
+    251 - for int8      byte
+    252 - for int16     int
+    253 - for int32     long
+    254 - for float32   float
+    255 - for double    double
+
+    If there are dates to convert, then dtype will already have the correct
+    type inserted.
+    """
+    # TODO: expand to handle datetime to integer conversion
+    if dtype.type is np.object_:  # try to coerce it to the biggest string
+        # not memory efficient, what else could we
+        # do?
+        itemsize = max_len_string_array(ensure_object(column._values))
+        return max(itemsize, 1)
+    elif dtype.type is np.float64:
+        return 255
+    elif dtype.type is np.float32:
+        return 254
+    elif dtype.type is np.int32:
+        return 253
+    elif dtype.type is np.int16:
+        return 252
+    elif dtype.type is np.int8:
+        return 251
+    else:  # pragma : no cover
+        raise NotImplementedError(f"Data type {dtype} not supported.")
+
+
+def _dtype_to_default_stata_fmt(
+    dtype: np.dtype, column: Series, dta_version: int = 114, force_strl: bool = False
+) -> str:
+    """
+    Map numpy dtype to stata's default format for this type. Not terribly
+    important since users can change this in Stata. Semantics are
+
+    object  -> "%DDs" where DD is the length of the string.  If not a string,
+                raise ValueError
+    float64 -> "%10.0g"
+    float32 -> "%9.0g"
+    int64   -> "%9.0g"
+    int32   -> "%12.0g"
+    int16   -> "%8.0g"
+    int8    -> "%8.0g"
+    strl    -> "%9s"
+    """
+    # TODO: Refactor to combine type with format
+    # TODO: expand this to handle a default datetime format?
+    if dta_version < 117:
+        max_str_len = 244
+    else:
+        max_str_len = 2045
+        if force_strl:
+            return "%9s"
+    if dtype.type is np.object_:
+        itemsize = max_len_string_array(ensure_object(column._values))
+        if itemsize > max_str_len:
+            if dta_version >= 117:
+                return "%9s"
+            else:
+                raise ValueError(excessive_string_length_error.format(column.name))
+        return "%" + str(max(itemsize, 1)) + "s"
+    elif dtype == np.float64:
+        return "%10.0g"
+    elif dtype == np.float32:
+        return "%9.0g"
+    elif dtype == np.int32:
+        return "%12.0g"
+    elif dtype in (np.int8, np.int16):
+        return "%8.0g"
+    else:  # pragma : no cover
+        raise NotImplementedError(f"Data type {dtype} not supported.")
+
+
+class StataWriter(StataParser):
+    """
+    A class for writing Stata binary dta files
+
+    Parameters
+    ----------
+    fname : path (string), buffer or path object
+        string, pathlib.Path or
+        object implementing a binary write() functions. If using a buffer
+        then the buffer will not be automatically closed after the file
+        is written.
+    data : DataFrame
+        Input to save
+    convert_dates : dict
+        Dictionary mapping columns containing datetime types to stata internal
+        format to use when writing the dates. Options are 'tc', 'td', 'tm',
+        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+        Datetime columns that do not have a conversion type specified will be
+        converted to 'tc'. Raises NotImplementedError if a datetime column has
+        timezone information
+    write_index : bool
+        Write the index to Stata dataset.
+    byteorder : str
+        Can be ">", "<", "little", or "big". default is `sys.byteorder`
+    time_stamp : datetime
+        A datetime to use as file creation date.  Default is the current time
+    data_label : str
+        A label for the data set.  Must be 80 characters or smaller.
+    variable_labels : dict
+        Dictionary containing columns as keys and variable labels as values.
+        Each label must be 80 characters or smaller.
+    compression : str or dict, default 'infer'
+        For on-the-fly compression of the output data. If 'infer' and 'fname' is
+        path-like, then detect compression from the following extensions: '.gz',
+        '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'
+        (otherwise no compression).
+        Set to ``None`` for no compression.
+        Can also be a dict with key ``'method'`` set
+        to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to
+        ``zipfile.ZipFile``, ``gzip.GzipFile``,
+        ``bz2.BZ2File``, ``zstandard.ZstdCompressor``, ``lzma.LZMAFile`` or
+        ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for faster compression and to
+        create a reproducible gzip archive:
+        ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc. For HTTP(S) URLs the key-value pairs
+        are forwarded to ``urllib.request.Request`` as header options. For other
+        URLs (e.g. starting with "s3://", and "gcs://") the key-value pairs are
+        forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more
+        details, and for more examples on storage options refer `here
+        <https://pandas.pydata.org/docs/user_guide/io.html?
+        highlight=storage_options#reading-writing-remote-files>`_.
+
+    value_labels : dict of dicts
+        Dictionary containing columns as keys and dictionaries of column value
+        to labels as values. The combined length of all labels for a single
+        variable must be 32,000 characters or smaller.
+
+    Returns
+    -------
+    writer : StataWriter instance
+        The StataWriter instance has a write_file method, which will
+        write the file to the given `fname`.
+
+    Raises
+    ------
+    NotImplementedError
+        * If datetimes contain timezone information
+    ValueError
+        * Columns listed in convert_dates are neither datetime64[ns]
+          or datetime
+        * Column dtype is not representable in Stata
+        * Column listed in convert_dates is not in DataFrame
+        * Categorical label contains more than 32,000 characters
+
+    Examples
+    --------
+    >>> data = pd.DataFrame([[1.0, 1]], columns=["a", "b"])
+    >>> writer = StataWriter("./data_file.dta", data)
+    >>> writer.write_file()
+
+    Directly write a zip file
+    >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
+    >>> writer = StataWriter("./data_file.zip", data, compression=compression)
+    >>> writer.write_file()
+
+    Save a DataFrame with dates
+    >>> from datetime import datetime
+    >>> data = pd.DataFrame([[datetime(2000, 1, 1)]], columns=["date"])
+    >>> writer = StataWriter("./date_data_file.dta", data, {"date": "tw"})
+    >>> writer.write_file()
+    """
+
+    _max_string_length = 244
+    _encoding: Literal["latin-1", "utf-8"] = "latin-1"
+
+    def __init__(
+        self,
+        fname: FilePath | WriteBuffer[bytes],
+        data: DataFrame,
+        convert_dates: dict[Hashable, str] | None = None,
+        write_index: bool = True,
+        byteorder: str | None = None,
+        time_stamp: datetime | None = None,
+        data_label: str | None = None,
+        variable_labels: dict[Hashable, str] | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+        *,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
+    ) -> None:
+        super().__init__()
+        self.data = data
+        self._convert_dates = {} if convert_dates is None else convert_dates
+        self._write_index = write_index
+        self._time_stamp = time_stamp
+        self._data_label = data_label
+        self._variable_labels = variable_labels
+        self._non_cat_value_labels = value_labels
+        self._value_labels: list[StataValueLabel] = []
+        self._has_value_labels = np.array([], dtype=bool)
+        self._compression = compression
+        self._output_file: IO[bytes] | None = None
+        self._converted_names: dict[Hashable, str] = {}
+        # attach nobs, nvars, data, varlist, typlist
+        self._prepare_pandas(data)
+        self.storage_options = storage_options
+
+        if byteorder is None:
+            byteorder = sys.byteorder
+        self._byteorder = _set_endianness(byteorder)
+        self._fname = fname
+        self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
+
+    def _write(self, to_write: str) -> None:
+        """
+        Helper to call encode before writing to file for Python 3 compat.
+        """
+        self.handles.handle.write(to_write.encode(self._encoding))
+
+    def _write_bytes(self, value: bytes) -> None:
+        """
+        Helper to assert file is open before writing.
+        """
+        self.handles.handle.write(value)
+
+    def _prepare_non_cat_value_labels(
+        self, data: DataFrame
+    ) -> list[StataNonCatValueLabel]:
+        """
+        Check for value labels provided for non-categorical columns. Value
+        labels
+        """
+        non_cat_value_labels: list[StataNonCatValueLabel] = []
+        if self._non_cat_value_labels is None:
+            return non_cat_value_labels
+
+        for labname, labels in self._non_cat_value_labels.items():
+            if labname in self._converted_names:
+                colname = self._converted_names[labname]
+            elif labname in data.columns:
+                colname = str(labname)
+            else:
+                raise KeyError(
+                    f"Can't create value labels for {labname}, it wasn't "
+                    "found in the dataset."
+                )
+
+            if not is_numeric_dtype(data[colname].dtype):
+                # Labels should not be passed explicitly for categorical
+                # columns that will be converted to int
+                raise ValueError(
+                    f"Can't create value labels for {labname}, value labels "
+                    "can only be applied to numeric columns."
+                )
+            svl = StataNonCatValueLabel(colname, labels, self._encoding)
+            non_cat_value_labels.append(svl)
+        return non_cat_value_labels
+
+    def _prepare_categoricals(self, data: DataFrame) -> DataFrame:
+        """
+        Check for categorical columns, retain categorical information for
+        Stata file and convert categorical data to int
+        """
+        is_cat = [isinstance(dtype, CategoricalDtype) for dtype in data.dtypes]
+        if not any(is_cat):
+            return data
+
+        self._has_value_labels |= np.array(is_cat)
+
+        get_base_missing_value = StataMissingValue.get_base_missing_value
+        data_formatted = []
+        for col, col_is_cat in zip(data, is_cat, strict=True):
+            if col_is_cat:
+                svl = StataValueLabel(data[col], encoding=self._encoding)
+                self._value_labels.append(svl)
+                dtype = data[col].cat.codes.dtype
+                if dtype == np.int64:
+                    raise ValueError(
+                        "It is not possible to export "
+                        "int64-based categorical data to Stata."
+                    )
+                values = data[col].cat.codes._values.copy()
+
+                # Upcast if needed so that correct missing values can be set
+                if values.max() >= get_base_missing_value(dtype):
+                    if dtype == np.int8:
+                        dtype = np.dtype(np.int16)
+                    elif dtype == np.int16:
+                        dtype = np.dtype(np.int32)
+                    else:
+                        dtype = np.dtype(np.float64)
+                    values = np.array(values, dtype=dtype)
+
+                # Replace missing values with Stata missing value for type
+                values[values == -1] = get_base_missing_value(dtype)
+                data_formatted.append((col, values))
+            else:
+                data_formatted.append((col, data[col]))
+        return DataFrame.from_dict(dict(data_formatted))
+
+    def _replace_nans(self, data: DataFrame) -> DataFrame:
+        # return data
+        """
+        Checks floating point data columns for nans, and replaces these with
+        the generic Stata for missing value (.)
+        """
+        for c in data:
+            dtype = data[c].dtype
+            if dtype in (np.float32, np.float64):
+                if dtype == np.float32:
+                    replacement = self.MISSING_VALUES["f"]
+                else:
+                    replacement = self.MISSING_VALUES["d"]
+                data[c] = data[c].fillna(replacement)
+
+        return data
+
+    def _update_strl_names(self) -> None:
+        """No-op, forward compatibility"""
+
+    def _validate_variable_name(self, name: str) -> str:
+        """
+        Validate variable names for Stata export.
+
+        Parameters
+        ----------
+        name : str
+            Variable name
+
+        Returns
+        -------
+        str
+            The validated name with invalid characters replaced with
+            underscores.
+
+        Notes
+        -----
+        Stata 114 and 117 support ascii characters in a-z, A-Z, 0-9
+        and _.
+        """
+        for c in name:
+            if (
+                (c < "A" or c > "Z")
+                and (c < "a" or c > "z")
+                and (c < "0" or c > "9")
+                and c != "_"
+            ):
+                name = name.replace(c, "_")
+        return name
+
+    def _check_column_names(self, data: DataFrame) -> DataFrame:
+        """
+        Checks column names to ensure that they are valid Stata column names.
+        This includes checks for:
+            * Non-string names
+            * Stata keywords
+            * Variables that start with numbers
+            * Variables with names that are too long
+
+        When an illegal variable name is detected, it is converted, and if
+        dates are exported, the variable name is propagated to the date
+        conversion dictionary
+        """
+        converted_names: dict[Hashable, str] = {}
+        columns = list(data.columns)
+        original_columns = columns[:]
+
+        duplicate_var_id = 0
+        for j, name in enumerate(columns):
+            orig_name = name
+            if not isinstance(name, str):
+                name = str(name)
+
+            name = self._validate_variable_name(name)
+
+            # Variable name must not be a reserved word
+            if name in self.RESERVED_WORDS:
+                name = "_" + name
+
+            # Variable name may not start with a number
+            if "0" <= name[0] <= "9":
+                name = "_" + name
+
+            name = name[: min(len(name), 32)]
+
+            if not name == orig_name:
+                # check for duplicates
+                while columns.count(name) > 0:
+                    # prepend ascending number to avoid duplicates
+                    name = "_" + str(duplicate_var_id) + name
+                    name = name[: min(len(name), 32)]
+                    duplicate_var_id += 1
+                converted_names[orig_name] = name
+
+            columns[j] = name
+
+        data.columns = Index(columns)
+
+        # Check date conversion, and fix key if needed
+        if self._convert_dates:
+            for c, o in zip(columns, original_columns, strict=True):
+                if c != o:
+                    self._convert_dates[c] = self._convert_dates[o]
+                    del self._convert_dates[o]
+
+        if converted_names:
+            conversion_warning = []
+            for orig_name, name in converted_names.items():
+                msg = f"{orig_name}   ->   {name}"
+                conversion_warning.append(msg)
+
+            ws = invalid_name_doc.format("\n    ".join(conversion_warning))
+            warnings.warn(
+                ws,
+                InvalidColumnName,
+                stacklevel=find_stack_level(),
+            )
+
+        self._converted_names = converted_names
+        self._update_strl_names()
+
+        return data
+
+    def _set_formats_and_types(self, dtypes: Series) -> None:
+        self.fmtlist: list[str] = []
+        self.typlist: list[int] = []
+        for col, dtype in dtypes.items():
+            self.fmtlist.append(_dtype_to_default_stata_fmt(dtype, self.data[col]))
+            self.typlist.append(_dtype_to_stata_type(dtype, self.data[col]))
+
+    def _prepare_pandas(self, data: DataFrame) -> None:
+        # NOTE: we might need a different API / class for pandas objects so
+        # we can set different semantics - handle this with a PR to pandas.io
+
+        data = data.copy()
+
+        if self._write_index:
+            temp = data.reset_index()
+            if isinstance(temp, DataFrame):
+                data = temp
+
+        # Ensure column names are strings
+        data = self._check_column_names(data)
+
+        # Check columns for compatibility with stata, upcast if necessary
+        # Raise if outside the supported range
+        data = _cast_to_stata_types(data)
+
+        # Replace NaNs with Stata missing values
+        data = self._replace_nans(data)
+
+        # Set all columns to initially unlabelled
+        self._has_value_labels = np.repeat(False, data.shape[1])
+
+        # Create value labels for non-categorical data
+        non_cat_value_labels = self._prepare_non_cat_value_labels(data)
+
+        non_cat_columns = [svl.labname for svl in non_cat_value_labels]
+        has_non_cat_val_labels = data.columns.isin(non_cat_columns)
+        self._has_value_labels |= has_non_cat_val_labels
+        self._value_labels.extend(non_cat_value_labels)
+
+        # Convert categoricals to int data, and strip labels
+        data = self._prepare_categoricals(data)
+
+        self.nobs, self.nvar = data.shape
+        self.data = data
+        self.varlist = data.columns.tolist()
+
+        dtypes = data.dtypes
+
+        # Ensure all date columns are converted
+        for col in data:
+            if col in self._convert_dates:
+                continue
+            if lib.is_np_dtype(data[col].dtype, "M"):
+                self._convert_dates[col] = "tc"
+
+        self._convert_dates = _maybe_convert_to_int_keys(
+            self._convert_dates, self.varlist
+        )
+        for key in self._convert_dates:
+            new_type = _convert_datetime_to_stata_type(self._convert_dates[key])
+            dtypes.iloc[key] = np.dtype(new_type)
+
+        # Verify object arrays are strings and encode to bytes
+        self._encode_strings()
+
+        self._set_formats_and_types(dtypes)
+
+        # set the given format for the datetime cols
+        if self._convert_dates is not None:
+            for key in self._convert_dates:
+                if isinstance(key, int):
+                    self.fmtlist[key] = self._convert_dates[key]
+
+    def _encode_strings(self) -> None:
+        """
+        Encode strings in dta-specific encoding
+
+        Do not encode columns marked for date conversion or for strL
+        conversion. The strL converter independently handles conversion and
+        also accepts empty string arrays.
+        """
+        convert_dates = self._convert_dates
+        # _convert_strl is not available in dta 114
+        convert_strl = getattr(self, "_convert_strl", [])
+        for i, col in enumerate(self.data):
+            # Skip columns marked for date conversion or strl conversion
+            if i in convert_dates or col in convert_strl:
+                continue
+            column = self.data[col]
+            dtype = column.dtype
+            # TODO could also handle string dtype here specifically
+            if dtype.type is np.object_:
+                inferred_dtype = infer_dtype(column, skipna=True)
+                if not ((inferred_dtype == "string") or len(column) == 0):
+                    col = column.name
+                    raise ValueError(
+                        f"""\
+Column `{col}` cannot be exported.\n\nOnly string-like object arrays
+containing all strings or a mix of strings and None can be exported.
+Object arrays containing only null values are prohibited. Other object
+types cannot be exported and must first be converted to one of the
+supported types."""
+                    )
+                encoded = self.data[col].str.encode(self._encoding)
+                # If larger than _max_string_length do nothing
+                if (
+                    max_len_string_array(ensure_object(self.data[col]._values))
+                    <= self._max_string_length
+                ):
+                    self.data[col] = encoded
+
+    def write_file(self) -> None:
+        """
+        Export DataFrame object to Stata dta format.
+
+        This method writes the contents of a pandas DataFrame to a `.dta` file
+        compatible with Stata. It includes features for handling value labels,
+        variable types, and metadata like timestamps and data labels. The output
+        file can then be read and used in Stata or other compatible statistical
+        tools.
+
+        See Also
+        --------
+        read_stata : Read Stata file into DataFrame.
+        DataFrame.to_stata : Export DataFrame object to Stata dta format.
+        io.stata.StataWriter : A class for writing Stata binary dta files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "fully_labelled": [1, 2, 3, 3, 1],
+        ...         "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
+        ...         "Y": [7, 7, 9, 8, 10],
+        ...         "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
+        ...     }
+        ... )
+        >>> path = "/My_path/filename.dta"
+        >>> labels = {
+        ...     "fully_labelled": {1: "one", 2: "two", 3: "three"},
+        ...     "partially_labelled": {1.0: "one", 2.0: "two"},
+        ... }
+        >>> writer = pd.io.stata.StataWriter(
+        ...     path, df, value_labels=labels
+        ... )  # doctest: +SKIP
+        >>> writer.write_file()  # doctest: +SKIP
+        >>> df = pd.read_stata(path)  # doctest: +SKIP
+        >>> df  # doctest: +SKIP
+            index fully_labelled  partially_labeled  Y  Z
+        0       0            one                one  7  j
+        1       1            two                two  7  k
+        2       2          three                NaN  9  l
+        3       3          three                9.0  8  k
+        4       4            one                NaN 10  j
+        """
+        with get_handle(
+            self._fname,
+            "wb",
+            compression=self._compression,
+            is_text=False,
+            storage_options=self.storage_options,
+        ) as self.handles:
+            if self.handles.compression["method"] is not None:
+                # ZipFile creates a file (with the same name) for each write call.
+                # Write it first into a buffer and then write the buffer to the ZipFile.
+                self._output_file, self.handles.handle = self.handles.handle, BytesIO()
+                self.handles.created_handles.append(self.handles.handle)
+
+            try:
+                self._write_header(
+                    data_label=self._data_label, time_stamp=self._time_stamp
+                )
+                self._write_map()
+                self._write_variable_types()
+                self._write_varnames()
+                self._write_sortlist()
+                self._write_formats()
+                self._write_value_label_names()
+                self._write_variable_labels()
+                self._write_expansion_fields()
+                self._write_characteristics()
+                records = self._prepare_data()
+                self._write_data(records)
+                self._write_strls()
+                self._write_value_labels()
+                self._write_file_close_tag()
+                self._write_map()
+                self._close()
+            except Exception as exc:
+                self.handles.close()
+                if isinstance(self._fname, (str, os.PathLike)) and os.path.isfile(
+                    self._fname
+                ):
+                    try:
+                        os.unlink(self._fname)
+                    except OSError:
+                        warnings.warn(
+                            f"This save was not successful but {self._fname} could not "
+                            "be deleted. This file is not valid.",
+                            ResourceWarning,
+                            stacklevel=find_stack_level(),
+                        )
+                raise exc
+
+    def _close(self) -> None:
+        """
+        Close the file if it was created by the writer.
+
+        If a buffer or file-like object was passed in, for example a GzipFile,
+        then leave this file open for the caller to close.
+        """
+        # write compression
+        if self._output_file is not None:
+            assert isinstance(self.handles.handle, BytesIO)
+            bio, self.handles.handle = self.handles.handle, self._output_file
+            self.handles.handle.write(bio.getvalue())
+
+    def _write_map(self) -> None:
+        """No-op, future compatibility"""
+
+    def _write_file_close_tag(self) -> None:
+        """No-op, future compatibility"""
+
+    def _write_characteristics(self) -> None:
+        """No-op, future compatibility"""
+
+    def _write_strls(self) -> None:
+        """No-op, future compatibility"""
+
+    def _write_expansion_fields(self) -> None:
+        """Write 5 zeros for expansion fields"""
+        self._write(_pad_bytes("", 5))
+
+    def _write_value_labels(self) -> None:
+        for vl in self._value_labels:
+            self._write_bytes(vl.generate_value_label(self._byteorder))
+
+    def _write_header(
+        self,
+        data_label: str | None = None,
+        time_stamp: datetime | None = None,
+    ) -> None:
+        byteorder = self._byteorder
+        # ds_format - just use 114
+        self._write_bytes(struct.pack("b", 114))
+        # byteorder
+        self._write((byteorder == ">" and "\x01") or "\x02")
+        # filetype
+        self._write("\x01")
+        # unused
+        self._write("\x00")
+        # number of vars, 2 bytes
+        self._write_bytes(struct.pack(byteorder + "h", self.nvar)[:2])
+        # number of obs, 4 bytes
+        self._write_bytes(struct.pack(byteorder + "i", self.nobs)[:4])
+        # data label 81 bytes, char, null terminated
+        if data_label is None:
+            self._write_bytes(self._null_terminate_bytes(_pad_bytes("", 80)))
+        else:
+            self._write_bytes(
+                self._null_terminate_bytes(_pad_bytes(data_label[:80], 80))
+            )
+        # time stamp, 18 bytes, char, null terminated
+        # format dd Mon yyyy hh:mm
+        if time_stamp is None:
+            time_stamp = datetime.now()
+        elif not isinstance(time_stamp, datetime):
+            raise ValueError("time_stamp should be datetime type")
+        # GH #13856
+        # Avoid locale-specific month conversion
+        months = [
+            "Jan",
+            "Feb",
+            "Mar",
+            "Apr",
+            "May",
+            "Jun",
+            "Jul",
+            "Aug",
+            "Sep",
+            "Oct",
+            "Nov",
+            "Dec",
+        ]
+        month_lookup = {i + 1: month for i, month in enumerate(months)}
+        ts = (
+            time_stamp.strftime("%d ")
+            + month_lookup[time_stamp.month]
+            + time_stamp.strftime(" %Y %H:%M")
+        )
+        self._write_bytes(self._null_terminate_bytes(ts))
+
+    def _write_variable_types(self) -> None:
+        for typ in self.typlist:
+            self._write_bytes(struct.pack("B", typ))
+
+    def _write_varnames(self) -> None:
+        # varlist names are checked by _check_column_names
+        # varlist, requires null terminated
+        for name in self.varlist:
+            name = self._null_terminate_str(name)
+            name = _pad_bytes(name[:32], 33)
+            self._write(name)
+
+    def _write_sortlist(self) -> None:
+        # srtlist, 2*(nvar+1), int array, encoded by byteorder
+        srtlist = _pad_bytes("", 2 * (self.nvar + 1))
+        self._write(srtlist)
+
+    def _write_formats(self) -> None:
+        # fmtlist, 49*nvar, char array
+        for fmt in self.fmtlist:
+            self._write(_pad_bytes(fmt, 49))
+
+    def _write_value_label_names(self) -> None:
+        # lbllist, 33*nvar, char array
+        for i in range(self.nvar):
+            # Use variable name when categorical
+            if self._has_value_labels[i]:
+                name = self.varlist[i]
+                name = self._null_terminate_str(name)
+                name = _pad_bytes(name[:32], 33)
+                self._write(name)
+            else:  # Default is empty label
+                self._write(_pad_bytes("", 33))
+
+    def _write_variable_labels(self) -> None:
+        # Missing labels are 80 blank characters plus null termination
+        blank = _pad_bytes("", 81)
+
+        if self._variable_labels is None:
+            for i in range(self.nvar):
+                self._write(blank)
+            return
+
+        for col in self.data:
+            if col in self._variable_labels:
+                label = self._variable_labels[col]
+                if len(label) > 80:
+                    raise ValueError("Variable labels must be 80 characters or fewer")
+                is_latin1 = all(ord(c) < 256 for c in label)
+                if not is_latin1:
+                    raise ValueError(
+                        "Variable labels must contain only characters that "
+                        "can be encoded in Latin-1"
+                    )
+                self._write(_pad_bytes(label, 81))
+            else:
+                self._write(blank)
+
+    def _convert_strls(self, data: DataFrame) -> DataFrame:
+        """No-op, future compatibility"""
+        return data
+
+    def _prepare_data(self) -> np.rec.recarray:
+        data = self.data
+        typlist = self.typlist
+        convert_dates = self._convert_dates
+        # 1. Convert dates
+        if self._convert_dates is not None:
+            for i, col in enumerate(data):
+                if i in convert_dates:
+                    data[col] = _datetime_to_stata_elapsed_vec(
+                        data[col], self.fmtlist[i]
+                    )
+        # 2. Convert strls
+        data = self._convert_strls(data)
+
+        # 3. Convert bad string data to '' and pad to correct length
+        dtypes = {}
+        native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
+        for i, col in enumerate(data):
+            typ = typlist[i]
+            if typ <= self._max_string_length:
+                dc = data[col].fillna("")
+                data[col] = dc.apply(_pad_bytes, args=(typ,))
+                stype = f"S{typ}"
+                dtypes[col] = stype
+                data[col] = data[col].astype(stype)
+            else:
+                dtype = data[col].dtype
+                if not native_byteorder:
+                    dtype = dtype.newbyteorder(self._byteorder)
+                dtypes[col] = dtype
+
+        return data.to_records(index=False, column_dtypes=dtypes)
+
+    def _write_data(self, records: np.rec.recarray) -> None:
+        self._write_bytes(records.tobytes())
+
+    @staticmethod
+    def _null_terminate_str(s: str) -> str:
+        s += "\x00"
+        return s
+
+    def _null_terminate_bytes(self, s: str) -> bytes:
+        return self._null_terminate_str(s).encode(self._encoding)
+
+
+def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) -> int:
+    """
+    Converts dtype types to stata types. Returns the byte of the given ordinal.
+    See TYPE_MAP and comments for an explanation. This is also explained in
+    the dta spec.
+    1 - 2045 are strings of this length
+                Pandas    Stata
+    32768 - for object    strL
+    65526 - for int8      byte
+    65527 - for int16     int
+    65528 - for int32     long
+    65529 - for float32   float
+    65530 - for double    double
+
+    If there are dates to convert, then dtype will already have the correct
+    type inserted.
+    """
+    # TODO: expand to handle datetime to integer conversion
+    if force_strl:
+        return 32768
+    if dtype.type is np.object_:  # try to coerce it to the biggest string
+        # not memory efficient, what else could we
+        # do?
+        itemsize = max_len_string_array(ensure_object(column._values))
+        itemsize = max(itemsize, 1)
+        if itemsize <= 2045:
+            return itemsize
+        return 32768
+    elif dtype.type is np.float64:
+        return 65526
+    elif dtype.type is np.float32:
+        return 65527
+    elif dtype.type is np.int32:
+        return 65528
+    elif dtype.type is np.int16:
+        return 65529
+    elif dtype.type is np.int8:
+        return 65530
+    else:  # pragma : no cover
+        raise NotImplementedError(f"Data type {dtype} not supported.")
+
+
+def _pad_bytes_new(name: str | bytes, length: int) -> bytes:
+    """
+    Takes a bytes instance and pads it with null bytes until it's length chars.
+    """
+    if isinstance(name, str):
+        name = bytes(name, "utf-8")
+    return name + b"\x00" * (length - len(name))
+
+
+class StataStrLWriter:
+    """
+    Converter for Stata StrLs
+
+    Stata StrLs map 8 byte values to strings which are stored using a
+    dictionary-like format where strings are keyed to two values.
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame to convert
+    columns : Sequence[str]
+        List of columns names to convert to StrL
+    version : int, optional
+        dta version.  Currently supports 117, 118 and 119
+    byteorder : str, optional
+        Can be ">", "<", "little", or "big". default is `sys.byteorder`
+
+    Notes
+    -----
+    Supports creation of the StrL block of a dta file for dta versions
+    117, 118 and 119.  These differ in how the GSO is stored.  118 and
+    119 store the GSO lookup value as a uint32 and a uint64, while 117
+    uses two uint32s. 118 and 119 also encode all strings as unicode
+    which is required by the format.  117 uses 'latin-1' a fixed width
+    encoding that extends the 7-bit ascii table with an additional 128
+    characters.
+    """
+
+    def __init__(
+        self,
+        df: DataFrame,
+        columns: Sequence[str],
+        version: int = 117,
+        byteorder: str | None = None,
+    ) -> None:
+        if version not in (117, 118, 119):
+            raise ValueError("Only dta versions 117, 118 and 119 supported")
+        self._dta_ver = version
+
+        self.df = df
+        self.columns = columns
+        self._gso_table = {"": (0, 0)}
+        if byteorder is None:
+            byteorder = sys.byteorder
+        self._byteorder = _set_endianness(byteorder)
+        # Flag whether chosen byteorder matches the system on which we're running
+        self._native_byteorder = self._byteorder == _set_endianness(sys.byteorder)
+
+        gso_v_type = "I"  # uint32
+        gso_o_type = "Q"  # uint64
+        self._encoding = "utf-8"
+        if version == 117:
+            o_size = 4
+            gso_o_type = "I"  # 117 used uint32
+            self._encoding = "latin-1"
+        elif version == 118:
+            o_size = 6
+        else:  # version == 119
+            o_size = 5
+        if self._native_byteorder:
+            self._o_offet = 2 ** (8 * (8 - o_size))
+        else:
+            self._o_offet = 2 ** (8 * o_size)
+        self._gso_o_type = gso_o_type
+        self._gso_v_type = gso_v_type
+
+    def _convert_key(self, key: tuple[int, int]) -> int:
+        v, o = key
+        if self._native_byteorder:
+            return v + self._o_offet * o
+        else:
+            # v, o will be swapped when applying byteorder
+            return o + self._o_offet * v
+
+    def generate_table(self) -> tuple[dict[str, tuple[int, int]], DataFrame]:
+        """
+        Generates the GSO lookup table for the DataFrame
+
+        Returns
+        -------
+        gso_table : dict
+            Ordered dictionary using the string found as keys
+            and their lookup position (v,o) as values
+        gso_df : DataFrame
+            DataFrame where strl columns have been converted to
+            (v,o) values
+
+        Notes
+        -----
+        Modifies the DataFrame in-place.
+
+        The DataFrame returned encodes the (v,o) values as uint64s. The
+        encoding depends on the dta version, and can be expressed as
+
+        enc = v + o * 2 ** (o_size * 8)
+
+        so that v is stored in the lower bits and o is in the upper
+        bits. o_size is
+
+          * 117: 4
+          * 118: 6
+          * 119: 5
+        """
+        gso_table = self._gso_table
+        gso_df = self.df
+        columns = list(gso_df.columns)
+        selected = gso_df[self.columns]
+        col_index = [(col, columns.index(col)) for col in self.columns]
+        keys = np.empty(selected.shape, dtype=np.uint64)
+        for o, (idx, row) in enumerate(selected.iterrows()):
+            for j, (col, v) in enumerate(col_index):
+                val = row[col]
+                # Allow columns with mixed str and None or pd.NA (GH 23633)
+                val = "" if isna(val) else val
+                key = gso_table.get(val, None)
+                if key is None:
+                    # Stata prefers human numbers
+                    key = (v + 1, o + 1)
+                    gso_table[val] = key
+                keys[o, j] = self._convert_key(key)
+        for i, col in enumerate(self.columns):
+            gso_df[col] = keys[:, i]
+
+        return gso_table, gso_df
+
+    def generate_blob(self, gso_table: dict[str, tuple[int, int]]) -> bytes:
+        """
+        Generates the binary blob of GSOs that is written to the dta file.
+
+        Parameters
+        ----------
+        gso_table : dict
+            Ordered dictionary (str, vo)
+
+        Returns
+        -------
+        gso : bytes
+            Binary content of dta file to be placed between strl tags
+
+        Notes
+        -----
+        Output format depends on dta version.  117 uses two uint32s to
+        express v and o while 118+ uses a uint32 for v and a uint64 for o.
+        """
+        # Format information
+        # Length includes null term
+        # 117
+        # GSOvvvvooootllllxxxxxxxxxxxxxxx...x
+        #  3  u4  u4 u1 u4  string + null term
+        #
+        # 118, 119
+        # GSOvvvvooooooootllllxxxxxxxxxxxxxxx...x
+        #  3  u4   u8   u1 u4    string + null term
+
+        bio = BytesIO()
+        gso = bytes("GSO", "ascii")
+        gso_type = struct.pack(self._byteorder + "B", 130)
+        null = struct.pack(self._byteorder + "B", 0)
+        v_type = self._byteorder + self._gso_v_type
+        o_type = self._byteorder + self._gso_o_type
+        len_type = self._byteorder + "I"
+        for strl, vo in gso_table.items():
+            if vo == (0, 0):
+                continue
+            v, o = vo
+
+            # GSO
+            bio.write(gso)
+
+            # vvvv
+            bio.write(struct.pack(v_type, v))
+
+            # oooo / oooooooo
+            bio.write(struct.pack(o_type, o))
+
+            # t
+            bio.write(gso_type)
+
+            # llll
+            if isinstance(strl, str):
+                strl_convert = bytes(strl, "utf-8")
+            else:
+                strl_convert = strl
+
+            bio.write(struct.pack(len_type, len(strl_convert) + 1))
+
+            # xxx...xxx
+            bio.write(strl_convert)
+            bio.write(null)
+
+        return bio.getvalue()
+
+
+class StataWriter117(StataWriter):
+    """
+    A class for writing Stata binary dta files in Stata 13 format (117)
+
+    Parameters
+    ----------
+    fname : path (string), buffer or path object
+        string, pathlib.Path or
+        object implementing a binary write() functions. If using a buffer
+        then the buffer will not be automatically closed after the file
+        is written.
+    data : DataFrame
+        Input to save
+    convert_dates : dict
+        Dictionary mapping columns containing datetime types to stata internal
+        format to use when writing the dates. Options are 'tc', 'td', 'tm',
+        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+        Datetime columns that do not have a conversion type specified will be
+        converted to 'tc'. Raises NotImplementedError if a datetime column has
+        timezone information
+    write_index : bool
+        Write the index to Stata dataset.
+    byteorder : str
+        Can be ">", "<", "little", or "big". default is `sys.byteorder`
+    time_stamp : datetime
+        A datetime to use as file creation date.  Default is the current time
+    data_label : str
+        A label for the data set.  Must be 80 characters or smaller.
+    variable_labels : dict
+        Dictionary containing columns as keys and variable labels as values.
+        Each label must be 80 characters or smaller.
+    convert_strl : list
+        List of columns names to convert to Stata StrL format.  Columns with
+        more than 2045 characters are automatically written as StrL.
+        Smaller columns can be converted by including the column name.  Using
+        StrLs can reduce output file size when strings are longer than 8
+        characters, and either frequently repeated or sparse.
+    {compression_options}
+
+    value_labels : dict of dicts
+        Dictionary containing columns as keys and dictionaries of column value
+        to labels as values. The combined length of all labels for a single
+        variable must be 32,000 characters or smaller.
+
+    Returns
+    -------
+    writer : StataWriter117 instance
+        The StataWriter117 instance has a write_file method, which will
+        write the file to the given `fname`.
+
+    Raises
+    ------
+    NotImplementedError
+        * If datetimes contain timezone information
+    ValueError
+        * Columns listed in convert_dates are neither datetime64[ns]
+          or datetime
+        * Column dtype is not representable in Stata
+        * Column listed in convert_dates is not in DataFrame
+        * Categorical label contains more than 32,000 characters
+
+    Examples
+    --------
+    >>> data = pd.DataFrame([[1.0, 1, "a"]], columns=["a", "b", "c"])
+    >>> writer = pd.io.stata.StataWriter117("./data_file.dta", data)
+    >>> writer.write_file()
+
+    Directly write a zip file
+    >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
+    >>> writer = pd.io.stata.StataWriter117(
+    ...     "./data_file.zip", data, compression=compression
+    ... )
+    >>> writer.write_file()
+
+    Or with long strings stored in strl format
+    >>> data = pd.DataFrame(
+    ...     [["A relatively long string"], [""], [""]], columns=["strls"]
+    ... )
+    >>> writer = pd.io.stata.StataWriter117(
+    ...     "./data_file_with_long_strings.dta", data, convert_strl=["strls"]
+    ... )
+    >>> writer.write_file()
+    """
+
+    _max_string_length = 2045
+    _dta_version = 117
+
+    def __init__(
+        self,
+        fname: FilePath | WriteBuffer[bytes],
+        data: DataFrame,
+        convert_dates: dict[Hashable, str] | None = None,
+        write_index: bool = True,
+        byteorder: str | None = None,
+        time_stamp: datetime | None = None,
+        data_label: str | None = None,
+        variable_labels: dict[Hashable, str] | None = None,
+        convert_strl: Sequence[Hashable] | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+        *,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
+    ) -> None:
+        # Copy to new list since convert_strl might be modified later
+        self._convert_strl: list[Hashable] = []
+        if convert_strl is not None:
+            self._convert_strl.extend(convert_strl)
+
+        super().__init__(
+            fname,
+            data,
+            convert_dates,
+            write_index,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            variable_labels=variable_labels,
+            value_labels=value_labels,
+            compression=compression,
+            storage_options=storage_options,
+        )
+        self._map: dict[str, int] = {}
+        self._strl_blob = b""
+
+    @staticmethod
+    def _tag(val: str | bytes, tag: str) -> bytes:
+        """Surround val with <tag></tag>"""
+        if isinstance(val, str):
+            val = bytes(val, "utf-8")
+        return bytes("<" + tag + ">", "utf-8") + val + bytes("</" + tag + ">", "utf-8")
+
+    def _update_map(self, tag: str) -> None:
+        """Update map location for tag with file position"""
+        assert self.handles.handle is not None
+        self._map[tag] = self.handles.handle.tell()
+
+    def _write_header(
+        self,
+        data_label: str | None = None,
+        time_stamp: datetime | None = None,
+    ) -> None:
+        """Write the file header"""
+        byteorder = self._byteorder
+        self._write_bytes(bytes("<stata_dta>", "utf-8"))
+        bio = BytesIO()
+        # ds_format - 117
+        bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
+        # byteorder
+        bio.write(self._tag((byteorder == ">" and "MSF") or "LSF", "byteorder"))
+        # number of vars, 2 bytes in 117 and 118, 4 byte in 119
+        nvar_type = "H" if self._dta_version <= 118 else "I"
+        bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
+        # 117 uses 4 bytes, 118 uses 8
+        nobs_size = "I" if self._dta_version == 117 else "Q"
+        bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
+        # data label 81 bytes, char, null terminated
+        label = data_label[:80] if data_label is not None else ""
+        encoded_label = label.encode(self._encoding)
+        label_size = "B" if self._dta_version == 117 else "H"
+        label_len = struct.pack(byteorder + label_size, len(encoded_label))
+        encoded_label = label_len + encoded_label
+        bio.write(self._tag(encoded_label, "label"))
+        # time stamp, 18 bytes, char, null terminated
+        # format dd Mon yyyy hh:mm
+        if time_stamp is None:
+            time_stamp = datetime.now()
+        elif not isinstance(time_stamp, datetime):
+            raise ValueError("time_stamp should be datetime type")
+        # Avoid locale-specific month conversion
+        months = [
+            "Jan",
+            "Feb",
+            "Mar",
+            "Apr",
+            "May",
+            "Jun",
+            "Jul",
+            "Aug",
+            "Sep",
+            "Oct",
+            "Nov",
+            "Dec",
+        ]
+        month_lookup = {i + 1: month for i, month in enumerate(months)}
+        ts = (
+            time_stamp.strftime("%d ")
+            + month_lookup[time_stamp.month]
+            + time_stamp.strftime(" %Y %H:%M")
+        )
+        # '\x11' added due to inspection of Stata file
+        stata_ts = b"\x11" + bytes(ts, "utf-8")
+        bio.write(self._tag(stata_ts, "timestamp"))
+        self._write_bytes(self._tag(bio.getvalue(), "header"))
+
+    def _write_map(self) -> None:
+        """
+        Called twice during file write. The first populates the values in
+        the map with 0s.  The second call writes the final map locations when
+        all blocks have been written.
+        """
+        if not self._map:
+            self._map = {
+                "stata_data": 0,
+                "map": self.handles.handle.tell(),
+                "variable_types": 0,
+                "varnames": 0,
+                "sortlist": 0,
+                "formats": 0,
+                "value_label_names": 0,
+                "variable_labels": 0,
+                "characteristics": 0,
+                "data": 0,
+                "strls": 0,
+                "value_labels": 0,
+                "stata_data_close": 0,
+                "end-of-file": 0,
+            }
+        # Move to start of map
+        self.handles.handle.seek(self._map["map"])
+        bio = BytesIO()
+        for val in self._map.values():
+            bio.write(struct.pack(self._byteorder + "Q", val))
+        self._write_bytes(self._tag(bio.getvalue(), "map"))
+
+    def _write_variable_types(self) -> None:
+        self._update_map("variable_types")
+        bio = BytesIO()
+        for typ in self.typlist:
+            bio.write(struct.pack(self._byteorder + "H", typ))
+        self._write_bytes(self._tag(bio.getvalue(), "variable_types"))
+
+    def _write_varnames(self) -> None:
+        self._update_map("varnames")
+        bio = BytesIO()
+        # 118 scales by 4 to accommodate utf-8 data worst case encoding
+        vn_len = 32 if self._dta_version == 117 else 128
+        for name in self.varlist:
+            name = self._null_terminate_str(name)
+            name = _pad_bytes_new(name[:32].encode(self._encoding), vn_len + 1)
+            bio.write(name)
+        self._write_bytes(self._tag(bio.getvalue(), "varnames"))
+
+    def _write_sortlist(self) -> None:
+        self._update_map("sortlist")
+        sort_size = 2 if self._dta_version < 119 else 4
+        self._write_bytes(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
+
+    def _write_formats(self) -> None:
+        self._update_map("formats")
+        bio = BytesIO()
+        fmt_len = 49 if self._dta_version == 117 else 57
+        for fmt in self.fmtlist:
+            bio.write(_pad_bytes_new(fmt.encode(self._encoding), fmt_len))
+        self._write_bytes(self._tag(bio.getvalue(), "formats"))
+
+    def _write_value_label_names(self) -> None:
+        self._update_map("value_label_names")
+        bio = BytesIO()
+        # 118 scales by 4 to accommodate utf-8 data worst case encoding
+        vl_len = 32 if self._dta_version == 117 else 128
+        for i in range(self.nvar):
+            # Use variable name when categorical
+            name = ""  # default name
+            if self._has_value_labels[i]:
+                name = self.varlist[i]
+            name = self._null_terminate_str(name)
+            encoded_name = _pad_bytes_new(name[:32].encode(self._encoding), vl_len + 1)
+            bio.write(encoded_name)
+        self._write_bytes(self._tag(bio.getvalue(), "value_label_names"))
+
+    def _write_variable_labels(self) -> None:
+        # Missing labels are 80 blank characters plus null termination
+        self._update_map("variable_labels")
+        bio = BytesIO()
+        # 118 scales by 4 to accommodate utf-8 data worst case encoding
+        vl_len = 80 if self._dta_version == 117 else 320
+        blank = _pad_bytes_new("", vl_len + 1)
+
+        if self._variable_labels is None:
+            for _ in range(self.nvar):
+                bio.write(blank)
+            self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))
+            return
+
+        for col in self.data:
+            if col in self._variable_labels:
+                label = self._variable_labels[col]
+                if len(label) > 80:
+                    raise ValueError("Variable labels must be 80 characters or fewer")
+                try:
+                    encoded = label.encode(self._encoding)
+                except UnicodeEncodeError as err:
+                    raise ValueError(
+                        "Variable labels must contain only characters that "
+                        f"can be encoded in {self._encoding}"
+                    ) from err
+
+                bio.write(_pad_bytes_new(encoded, vl_len + 1))
+            else:
+                bio.write(blank)
+        self._write_bytes(self._tag(bio.getvalue(), "variable_labels"))
+
+    def _write_characteristics(self) -> None:
+        self._update_map("characteristics")
+        self._write_bytes(self._tag(b"", "characteristics"))
+
+    def _write_data(self, records: np.rec.recarray) -> None:
+        self._update_map("data")
+        self._write_bytes(b"<data>")
+        self._write_bytes(records.tobytes())
+        self._write_bytes(b"</data>")
+
+    def _write_strls(self) -> None:
+        self._update_map("strls")
+        self._write_bytes(self._tag(self._strl_blob, "strls"))
+
+    def _write_expansion_fields(self) -> None:
+        """No-op in dta 117+"""
+
+    def _write_value_labels(self) -> None:
+        self._update_map("value_labels")
+        bio = BytesIO()
+        for vl in self._value_labels:
+            lab = vl.generate_value_label(self._byteorder)
+            lab = self._tag(lab, "lbl")
+            bio.write(lab)
+        self._write_bytes(self._tag(bio.getvalue(), "value_labels"))
+
+    def _write_file_close_tag(self) -> None:
+        self._update_map("stata_data_close")
+        self._write_bytes(bytes("</stata_dta>", "utf-8"))
+        self._update_map("end-of-file")
+
+    def _update_strl_names(self) -> None:
+        """
+        Update column names for conversion to strl if they might have been
+        changed to comply with Stata naming rules
+        """
+        # Update convert_strl if names changed
+        for orig, new in self._converted_names.items():
+            if orig in self._convert_strl:
+                idx = self._convert_strl.index(orig)
+                self._convert_strl[idx] = new
+
+    def _convert_strls(self, data: DataFrame) -> DataFrame:
+        """
+        Convert columns to StrLs if either very large or in the
+        convert_strl variable
+        """
+        convert_cols = [
+            col
+            for i, col in enumerate(data)
+            if self.typlist[i] == 32768 or col in self._convert_strl
+        ]
+
+        if convert_cols:
+            ssw = StataStrLWriter(
+                data, convert_cols, version=self._dta_version, byteorder=self._byteorder
+            )
+            tab, new_data = ssw.generate_table()
+            data = new_data
+            self._strl_blob = ssw.generate_blob(tab)
+        return data
+
+    def _set_formats_and_types(self, dtypes: Series) -> None:
+        self.typlist = []
+        self.fmtlist = []
+        for col, dtype in dtypes.items():
+            force_strl = col in self._convert_strl
+            fmt = _dtype_to_default_stata_fmt(
+                dtype,
+                self.data[col],
+                dta_version=self._dta_version,
+                force_strl=force_strl,
+            )
+            self.fmtlist.append(fmt)
+            self.typlist.append(
+                _dtype_to_stata_type_117(dtype, self.data[col], force_strl)
+            )
+
+
+class StataWriterUTF8(StataWriter117):
+    """
+    Stata binary dta file writing in Stata 15 (118) and 16 (119) formats
+
+    DTA 118 and 119 format files support unicode string data (both fixed
+    and strL) format. Unicode is also supported in value labels, variable
+    labels and the dataset label. Format 119 is automatically used if the
+    file contains more than 32,767 variables.
+
+    Parameters
+    ----------
+    fname : path (string), buffer or path object
+        string, pathlib.Path or
+        object implementing a binary write() functions. If using a buffer
+        then the buffer will not be automatically closed after the file
+        is written.
+    data : DataFrame
+        Input to save
+    convert_dates : dict, default None
+        Dictionary mapping columns containing datetime types to stata internal
+        format to use when writing the dates. Options are 'tc', 'td', 'tm',
+        'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
+        Datetime columns that do not have a conversion type specified will be
+        converted to 'tc'. Raises NotImplementedError if a datetime column has
+        timezone information
+    write_index : bool, default True
+        Write the index to Stata dataset.
+    byteorder : str, default None
+        Can be ">", "<", "little", or "big". default is `sys.byteorder`
+    time_stamp : datetime, default None
+        A datetime to use as file creation date.  Default is the current time
+    data_label : str, default None
+        A label for the data set.  Must be 80 characters or smaller.
+    variable_labels : dict, default None
+        Dictionary containing columns as keys and variable labels as values.
+        Each label must be 80 characters or smaller.
+    convert_strl : list, default None
+        List of columns names to convert to Stata StrL format.  Columns with
+        more than 2045 characters are automatically written as StrL.
+        Smaller columns can be converted by including the column name.  Using
+        StrLs can reduce output file size when strings are longer than 8
+        characters, and either frequently repeated or sparse.
+    version : int, default None
+        The dta version to use. By default, uses the size of data to determine
+        the version. 118 is used if data.shape[1] <= 32767, and 119 is used
+        for storing larger DataFrames.
+    {compression_options}
+
+    value_labels : dict of dicts
+        Dictionary containing columns as keys and dictionaries of column value
+        to labels as values. The combined length of all labels for a single
+        variable must be 32,000 characters or smaller.
+
+    Returns
+    -------
+    StataWriterUTF8
+        The instance has a write_file method, which will write the file to the
+        given `fname`.
+
+    Raises
+    ------
+    NotImplementedError
+        * If datetimes contain timezone information
+    ValueError
+        * Columns listed in convert_dates are neither datetime64[ns]
+          or datetime
+        * Column dtype is not representable in Stata
+        * Column listed in convert_dates is not in DataFrame
+        * Categorical label contains more than 32,000 characters
+
+    Examples
+    --------
+    Using Unicode data and column names
+
+    >>> from pandas.io.stata import StataWriterUTF8
+    >>> data = pd.DataFrame([[1.0, 1, "ᴬ"]], columns=["a", "β", "ĉ"])
+    >>> writer = StataWriterUTF8("./data_file.dta", data)
+    >>> writer.write_file()
+
+    Directly write a zip file
+    >>> compression = {"method": "zip", "archive_name": "data_file.dta"}
+    >>> writer = StataWriterUTF8("./data_file.zip", data, compression=compression)
+    >>> writer.write_file()
+
+    Or with long strings stored in strl format
+
+    >>> data = pd.DataFrame(
+    ...     [["ᴀ relatively long ŝtring"], [""], [""]], columns=["strls"]
+    ... )
+    >>> writer = StataWriterUTF8(
+    ...     "./data_file_with_long_strings.dta", data, convert_strl=["strls"]
+    ... )
+    >>> writer.write_file()
+    """
+
+    _encoding: Literal["utf-8"] = "utf-8"
+
+    def __init__(
+        self,
+        fname: FilePath | WriteBuffer[bytes],
+        data: DataFrame,
+        convert_dates: dict[Hashable, str] | None = None,
+        write_index: bool = True,
+        byteorder: str | None = None,
+        time_stamp: datetime | None = None,
+        data_label: str | None = None,
+        variable_labels: dict[Hashable, str] | None = None,
+        convert_strl: Sequence[Hashable] | None = None,
+        version: int | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+        *,
+        value_labels: dict[Hashable, dict[float, str]] | None = None,
+    ) -> None:
+        if version is None:
+            version = 118 if data.shape[1] <= 32767 else 119
+        elif version not in (118, 119):
+            raise ValueError("version must be either 118 or 119.")
+        elif version == 118 and data.shape[1] > 32767:
+            raise ValueError(
+                "You must use version 119 for data sets containing more than"
+                "32,767 variables"
+            )
+
+        super().__init__(
+            fname,
+            data,
+            convert_dates=convert_dates,
+            write_index=write_index,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            variable_labels=variable_labels,
+            value_labels=value_labels,
+            convert_strl=convert_strl,
+            compression=compression,
+            storage_options=storage_options,
+        )
+        # Override version set in StataWriter117 init
+        self._dta_version = version
+
+    def _validate_variable_name(self, name: str) -> str:
+        """
+        Validate variable names for Stata export.
+
+        Parameters
+        ----------
+        name : str
+            Variable name
+
+        Returns
+        -------
+        str
+            The validated name with invalid characters replaced with
+            underscores.
+
+        Notes
+        -----
+        Stata 118+ support most unicode characters. The only limitation is in
+        the ascii range where the characters supported are a-z, A-Z, 0-9 and _.
+        """
+        # High code points appear to be acceptable
+        for c in name:
+            if (
+                (
+                    ord(c) < 128
+                    and (c < "A" or c > "Z")
+                    and (c < "a" or c > "z")
+                    and (c < "0" or c > "9")
+                    and c != "_"
+                )
+                or 128 <= ord(c) < 192
+                or c in {"×", "÷"}  # noqa: RUF001
+            ):
+                name = name.replace(c, "_")
+
+        return name
diff --git a/pandas/io/xml.py b/pandas/io/xml.py
new file mode 100644
index 0000000000000000000000000000000000000000..96a2c6cc5d126c02e91faeef24fc100dd028c1f2
--- /dev/null
+++ b/pandas/io/xml.py
@@ -0,0 +1,1155 @@
+"""
+:mod:``pandas.io.xml`` is a module for reading XML.
+"""
+
+from __future__ import annotations
+
+import io
+from os import PathLike
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from pandas._libs import lib
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import (
+    AbstractMethodError,
+    ParserError,
+)
+from pandas.util._decorators import set_module
+from pandas.util._validators import check_dtype_backend
+
+from pandas.core.dtypes.common import is_list_like
+
+from pandas.io.common import (
+    get_handle,
+    infer_compression,
+    is_fsspec_url,
+    is_url,
+    stringify_path,
+)
+from pandas.io.parsers import TextParser
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Sequence,
+    )
+    from xml.etree.ElementTree import Element
+
+    from lxml import etree
+
+    from pandas._typing import (
+        CompressionOptions,
+        ConvertersArg,
+        DtypeArg,
+        DtypeBackend,
+        FilePath,
+        ParseDatesArg,
+        ReadBuffer,
+        StorageOptions,
+        XMLParsers,
+    )
+
+    from pandas import DataFrame
+
+
+class _XMLFrameParser:
+    """
+    Internal subclass to parse XML into DataFrames.
+
+    Parameters
+    ----------
+    path_or_buffer : a valid JSON ``str``, path object or file-like object
+        Any valid string path is acceptable. The string could be a URL. Valid
+        URL schemes include http, ftp, s3, and file.
+
+    xpath : str or regex
+        The ``XPath`` expression to parse required set of nodes for
+        migration to :class:`~pandas.DataFrame`. ``etree`` supports limited ``XPath``.
+
+    namespaces : dict
+        The namespaces defined in XML document (``xmlns:namespace='URI'``)
+        as dicts with key being namespace and value the URI.
+
+    elems_only : bool
+        Parse only the child elements at the specified ``xpath``.
+
+    attrs_only : bool
+        Parse only the attributes at the specified ``xpath``.
+
+    names : list
+        Column names for :class:`~pandas.DataFrame` of parsed XML data.
+
+    dtype : dict
+        Data type for data or columns. E.g. {{'a': np.float64,
+        'b': np.int32, 'c': 'Int64'}}
+
+    converters : dict, optional
+        Dict of functions for converting values in certain columns. Keys can
+        either be integers or column labels.
+
+    parse_dates : bool or list of int or names or list of lists or dict
+        Converts either index or select columns to datetimes
+
+    encoding : str
+        Encoding of xml object or document.
+
+    stylesheet : str or file-like
+        URL, file, file-like object, or a raw string containing XSLT,
+        ``etree`` does not support XSLT but retained for consistency.
+
+    iterparse : dict, optional
+        Dict with row element as key and list of descendant elements
+        and/or attributes as value to be retrieved in iterparsing of
+        XML document.
+
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'path_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data
+        file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to ``zipfile.ZipFile``,
+        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
+        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard
+        decompression using a custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc. For HTTP(S) URLs the
+        key-value pairs are forwarded to ``urllib.request.Request`` as header
+        options. For other URLs (e.g. starting with "s3://", and "gcs://")
+        the key-value pairs are forwarded to ``fsspec.open``. Please see
+        ``fsspec`` and ``urllib`` for more details, and for more examples on
+        storage options refer `here <https://pandas.pydata.org/docs/
+        user_guide/io.html?highlight=storage_options#reading-writing-remote-
+        files>`_.
+
+    See also
+    --------
+    pandas.io.xml._EtreeFrameParser
+    pandas.io.xml._LxmlFrameParser
+
+    Notes
+    -----
+    To subclass this class effectively you must override the following methods:`
+        * :func:`parse_data`
+        * :func:`_parse_nodes`
+        * :func:`_iterparse_nodes`
+        * :func:`_parse_doc`
+        * :func:`_validate_names`
+        * :func:`_validate_path`
+
+
+    See each method's respective documentation for details on their
+    functionality.
+    """
+
+    def __init__(
+        self,
+        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+        xpath: str,
+        namespaces: dict[str, str] | None,
+        elems_only: bool,
+        attrs_only: bool,
+        names: Sequence[str] | None,
+        dtype: DtypeArg | None,
+        converters: ConvertersArg | None,
+        parse_dates: ParseDatesArg | None,
+        encoding: str | None,
+        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
+        iterparse: dict[str, list[str]] | None,
+        compression: CompressionOptions,
+        storage_options: StorageOptions,
+    ) -> None:
+        self.path_or_buffer = path_or_buffer
+        self.xpath = xpath
+        self.namespaces = namespaces
+        self.elems_only = elems_only
+        self.attrs_only = attrs_only
+        self.names = names
+        self.dtype = dtype
+        self.converters = converters
+        self.parse_dates = parse_dates
+        self.encoding = encoding
+        self.stylesheet = stylesheet
+        self.iterparse = iterparse
+        self.compression: CompressionOptions = compression
+        self.storage_options = storage_options
+
+    def parse_data(self) -> list[dict[str, str | None]]:
+        """
+        Parse xml data.
+
+        This method will call the other internal methods to
+        validate ``xpath``, names, parse and return specific nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
+        """
+        Parse xml nodes.
+
+        This method will parse the children and attributes of elements
+        in ``xpath``, conditionally for only elements, only attributes
+        or both while optionally renaming node names.
+
+        Raises
+        ------
+        ValueError
+            * If only elements and only attributes are specified.
+
+        Notes
+        -----
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes compared to siblings
+        will have optional keys filled with None values.
+        """
+
+        dicts: list[dict[str, str | None]]
+
+        if self.elems_only and self.attrs_only:
+            raise ValueError("Either element or attributes can be parsed not both.")
+        if self.elems_only:
+            if self.names:
+                dicts = [
+                    {
+                        **(
+                            {el.tag: el.text}
+                            if el.text and not el.text.isspace()
+                            else {}
+                        ),
+                        **{
+                            nm: ch.text if ch.text else None
+                            for nm, ch in zip(self.names, el.findall("*"), strict=True)
+                        },
+                    }
+                    for el in elems
+                ]
+            else:
+                dicts = [
+                    {ch.tag: ch.text if ch.text else None for ch in el.findall("*")}
+                    for el in elems
+                ]
+
+        elif self.attrs_only:
+            dicts = [
+                {k: v if v else None for k, v in el.attrib.items()} for el in elems
+            ]
+
+        elif self.names:
+            dicts = [
+                {
+                    **el.attrib,
+                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
+                    **{
+                        nm: ch.text if ch.text else None
+                        for nm, ch in zip(self.names, el.findall("*"), strict=False)
+                    },
+                }
+                for el in elems
+            ]
+
+        else:
+            dicts = [
+                {
+                    **el.attrib,
+                    **({el.tag: el.text} if el.text and not el.text.isspace() else {}),
+                    **{ch.tag: ch.text if ch.text else None for ch in el.findall("*")},
+                }
+                for el in elems
+            ]
+
+        dicts = [
+            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
+        ]
+
+        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
+        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
+
+        if self.names:
+            dicts = [dict(zip(self.names, d.values(), strict=True)) for d in dicts]
+
+        return dicts
+
+    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
+        """
+        Iterparse xml nodes.
+
+        This method will read in local disk, decompressed XML files for elements
+        and underlying descendants using iterparse, a method to iterate through
+        an XML tree without holding entire XML tree in memory.
+
+        Raises
+        ------
+        TypeError
+            * If ``iterparse`` is not a dict or its dict value is not list-like.
+        ParserError
+            * If ``path_or_buffer`` is not a physical file on disk or file-like object.
+            * If no data is returned from selected items in ``iterparse``.
+
+        Notes
+        -----
+        Namespace URIs will be removed from return node values. Also,
+        elements with missing children or attributes in submitted list
+        will have optional keys filled with None values.
+        """
+
+        dicts: list[dict[str, str | None]] = []
+        row: dict[str, str | None] | None = None
+
+        if not isinstance(self.iterparse, dict):
+            raise TypeError(
+                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
+            )
+
+        row_node = next(iter(self.iterparse.keys())) if self.iterparse else ""
+        if not is_list_like(self.iterparse[row_node]):
+            raise TypeError(
+                f"{type(self.iterparse[row_node])} is not a valid type "
+                "for value in iterparse"
+            )
+
+        if (not hasattr(self.path_or_buffer, "read")) and (
+            not isinstance(self.path_or_buffer, (str, PathLike))
+            or is_url(self.path_or_buffer)
+            or is_fsspec_url(self.path_or_buffer)
+            or (
+                isinstance(self.path_or_buffer, str)
+                and self.path_or_buffer.startswith(("<?xml", "<"))
+            )
+            or infer_compression(self.path_or_buffer, "infer") is not None
+        ):
+            raise ParserError(
+                "iterparse is designed for large XML files that are fully extracted on "
+                "local disk and not as compressed files or online sources."
+            )
+
+        iterparse_repeats = len(self.iterparse[row_node]) != len(
+            set(self.iterparse[row_node])
+        )
+
+        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
+            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag
+
+            if event == "start":
+                if curr_elem == row_node:
+                    row = {}
+
+            if row is not None:
+                if self.names and iterparse_repeats:
+                    for col, nm in zip(
+                        self.iterparse[row_node], self.names, strict=True
+                    ):
+                        if curr_elem == col:
+                            elem_val = elem.text if elem.text else None
+                            if elem_val not in row.values() and nm not in row:
+                                row[nm] = elem_val
+
+                        if col in elem.attrib:
+                            if elem.attrib[col] not in row.values() and nm not in row:
+                                row[nm] = elem.attrib[col]
+                else:
+                    for col in self.iterparse[row_node]:
+                        if curr_elem == col:
+                            row[col] = elem.text if elem.text else None
+                        if col in elem.attrib:
+                            row[col] = elem.attrib[col]
+
+            if event == "end":
+                if curr_elem == row_node and row is not None:
+                    dicts.append(row)
+                    row = None
+
+                elem.clear()
+                if hasattr(elem, "getprevious"):
+                    while (
+                        elem.getprevious() is not None and elem.getparent() is not None
+                    ):
+                        del elem.getparent()[0]
+
+        if dicts == []:
+            raise ParserError("No result from selected items in iterparse.")
+
+        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
+        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]
+
+        if self.names:
+            dicts = [dict(zip(self.names, d.values(), strict=True)) for d in dicts]
+
+        return dicts
+
+    def _validate_path(self) -> list[Any]:
+        """
+        Validate ``xpath``.
+
+        This method checks for syntax, evaluation, or empty nodes return.
+
+        Raises
+        ------
+        SyntaxError
+            * If xpah is not supported or issues with namespaces.
+
+        ValueError
+            * If xpah does not return any nodes.
+        """
+
+        raise AbstractMethodError(self)
+
+    def _validate_names(self) -> None:
+        """
+        Validate names.
+
+        This method will check if names is a list-like and aligns
+        with length of parse nodes.
+
+        Raises
+        ------
+        ValueError
+            * If value is not a list and less then length of nodes.
+        """
+        raise AbstractMethodError(self)
+
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> Element | etree._Element:
+        """
+        Build tree from path_or_buffer.
+
+        This method will parse XML object into tree
+        either from string/bytes or file location.
+        """
+        raise AbstractMethodError(self)
+
+
+class _EtreeFrameParser(_XMLFrameParser):
+    """
+    Internal class to parse XML into DataFrames with the Python
+    standard library XML module: `xml.etree.ElementTree`.
+    """
+
+    def parse_data(self) -> list[dict[str, str | None]]:
+        from xml.etree.ElementTree import iterparse
+
+        if self.stylesheet is not None:
+            raise ValueError(
+                "To use stylesheet, you need lxml installed and selected as parser."
+            )
+
+        if self.iterparse is None:
+            self.xml_doc = self._parse_doc(self.path_or_buffer)
+            elems = self._validate_path()
+
+        self._validate_names()
+
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes(elems)
+            if self.iterparse is None
+            else self._iterparse_nodes(iterparse)
+        )
+
+        return xml_dicts
+
+    def _validate_path(self) -> list[Any]:
+        """
+        Notes
+        -----
+        ``etree`` supports limited ``XPath``. If user attempts a more complex
+        expression syntax error will raise.
+        """
+
+        msg = (
+            "xpath does not return any nodes or attributes. "
+            "Be sure to specify in `xpath` the parent nodes of "
+            "children and attributes to parse. "
+            "If document uses namespaces denoted with "
+            "xmlns, be sure to define namespaces and "
+            "use them in xpath."
+        )
+        try:
+            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
+            children = [ch for el in elems for ch in el.findall("*")]
+            attrs = {k: v for el in elems for k, v in el.attrib.items()}
+
+            if elems is None:
+                raise ValueError(msg)
+
+            if elems is not None:
+                if self.elems_only and children == []:
+                    raise ValueError(msg)
+                if self.attrs_only and attrs == {}:
+                    raise ValueError(msg)
+                if children == [] and attrs == {}:
+                    raise ValueError(msg)
+
+        except (KeyError, SyntaxError) as err:
+            raise SyntaxError(
+                "You have used an incorrect or unsupported XPath "
+                "expression for etree library or you used an "
+                "undeclared namespace prefix."
+            ) from err
+
+        return elems
+
+    def _validate_names(self) -> None:
+        children: list[Any]
+
+        if self.names:
+            if self.iterparse:
+                children = self.iterparse[next(iter(self.iterparse))]
+            else:
+                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
+                children = parent.findall("*") if parent is not None else []
+
+            if is_list_like(self.names):
+                if len(self.names) < len(children):
+                    raise ValueError(
+                        "names does not match length of child elements in xpath."
+                    )
+            else:
+                raise TypeError(
+                    f"{type(self.names).__name__} is not a valid type for names"
+                )
+
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> Element:
+        from xml.etree.ElementTree import (
+            XMLParser,
+            parse,
+        )
+
+        handle_data = get_data_from_filepath(
+            filepath_or_buffer=raw_doc,
+            encoding=self.encoding,
+            compression=self.compression,
+            storage_options=self.storage_options,
+        )
+
+        with handle_data as xml_data:
+            curr_parser = XMLParser(encoding=self.encoding)
+            document = parse(xml_data, parser=curr_parser)
+
+        return document.getroot()
+
+
+class _LxmlFrameParser(_XMLFrameParser):
+    """
+    Internal class to parse XML into :class:`~pandas.DataFrame` with third-party
+    full-featured XML library, ``lxml``, that supports
+    ``XPath`` 1.0 and XSLT 1.0.
+    """
+
+    def parse_data(self) -> list[dict[str, str | None]]:
+        """
+        Parse xml data.
+
+        This method will call the other internal methods to
+        validate ``xpath``, names, optionally parse and run XSLT,
+        and parse original or transformed XML and return specific nodes.
+        """
+        from lxml.etree import iterparse
+
+        if self.iterparse is None:
+            self.xml_doc = self._parse_doc(self.path_or_buffer)
+
+            if self.stylesheet:
+                self.xsl_doc = self._parse_doc(self.stylesheet)
+                self.xml_doc = self._transform_doc()
+
+            elems = self._validate_path()
+
+        self._validate_names()
+
+        xml_dicts: list[dict[str, str | None]] = (
+            self._parse_nodes(elems)
+            if self.iterparse is None
+            else self._iterparse_nodes(iterparse)
+        )
+
+        return xml_dicts
+
+    def _validate_path(self) -> list[Any]:
+        msg = (
+            "xpath does not return any nodes or attributes. "
+            "Be sure to specify in `xpath` the parent nodes of "
+            "children and attributes to parse. "
+            "If document uses namespaces denoted with "
+            "xmlns, be sure to define namespaces and "
+            "use them in xpath."
+        )
+
+        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
+        children = [ch for el in elems for ch in el.xpath("*")]
+        attrs = {k: v for el in elems for k, v in el.attrib.items()}
+
+        if elems == []:
+            raise ValueError(msg)
+
+        if elems != []:
+            if self.elems_only and children == []:
+                raise ValueError(msg)
+            if self.attrs_only and attrs == {}:
+                raise ValueError(msg)
+            if children == [] and attrs == {}:
+                raise ValueError(msg)
+
+        return elems
+
+    def _validate_names(self) -> None:
+        children: list[Any]
+
+        if self.names:
+            if self.iterparse:
+                children = self.iterparse[next(iter(self.iterparse))]
+            else:
+                children = self.xml_doc.xpath(
+                    self.xpath + "[1]/*", namespaces=self.namespaces
+                )
+
+            if is_list_like(self.names):
+                if len(self.names) < len(children):
+                    raise ValueError(
+                        "names does not match length of child elements in xpath."
+                    )
+            else:
+                raise TypeError(
+                    f"{type(self.names).__name__} is not a valid type for names"
+                )
+
+    def _parse_doc(
+        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
+    ) -> etree._Element:
+        from lxml.etree import (
+            XMLParser,
+            fromstring,
+            parse,
+        )
+
+        handle_data = get_data_from_filepath(
+            filepath_or_buffer=raw_doc,
+            encoding=self.encoding,
+            compression=self.compression,
+            storage_options=self.storage_options,
+        )
+
+        with handle_data as xml_data:
+            curr_parser = XMLParser(encoding=self.encoding)
+
+            if isinstance(xml_data, io.StringIO):
+                if self.encoding is None:
+                    raise TypeError(
+                        "Can not pass encoding None when input is StringIO."
+                    )
+
+                document = fromstring(
+                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
+                )
+            else:
+                document = parse(xml_data, parser=curr_parser)
+
+        return document
+
+    def _transform_doc(self) -> etree._XSLTResultTree:
+        """
+        Transform original tree using stylesheet.
+
+        This method will transform original xml using XSLT script into
+        am ideally flatter xml document for easier parsing and migration
+        to Data Frame.
+        """
+        from lxml.etree import XSLT
+
+        transformer = XSLT(self.xsl_doc)
+        new_doc = transformer(self.xml_doc)
+
+        return new_doc
+
+
+def get_data_from_filepath(
+    filepath_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+    encoding: str | None,
+    compression: CompressionOptions,
+    storage_options: StorageOptions,
+):
+    """
+    Extract raw XML data.
+
+    The method accepts two input types:
+        1. filepath (string-like)
+        2. file-like object (e.g. open file object, StringIO)
+    """
+    filepath_or_buffer = stringify_path(filepath_or_buffer)
+    with get_handle(
+        filepath_or_buffer,
+        "r",
+        encoding=encoding,
+        compression=compression,
+        storage_options=storage_options,
+    ) as handle_obj:
+        return (
+            preprocess_data(handle_obj.handle.read())
+            if hasattr(handle_obj.handle, "read")
+            else handle_obj.handle
+        )
+
+
+def preprocess_data(
+    data: str | bytes | io.StringIO | io.BytesIO,
+) -> io.StringIO | io.BytesIO:
+    """
+    Convert extracted raw data.
+
+    This method will return underlying data of extracted XML content.
+    The data either has a `read` attribute (e.g. a file object or a
+    StringIO/BytesIO) or is a string or bytes that is an XML document.
+    """
+
+    if isinstance(data, str):
+        data = io.StringIO(data)
+
+    elif isinstance(data, bytes):
+        data = io.BytesIO(data)
+
+    return data
+
+
+def _data_to_frame(data: list[dict[str, str | None]], **kwargs) -> DataFrame:
+    """
+    Convert parsed data to Data Frame.
+
+    This method will bind xml dictionary data of keys and values
+    into named columns of Data Frame using the built-in TextParser
+    class that build Data Frame and infers specific dtypes.
+    """
+
+    tags = next(iter(data))
+    nodes = [list(d.values()) for d in data]
+
+    try:
+        with TextParser(nodes, names=tags, **kwargs) as tp:
+            return tp.read()
+    except ParserError as err:
+        raise ParserError(
+            "XML document may be too complex for import. "
+            "Try to flatten document and use distinct "
+            "element and attribute names."
+        ) from err
+
+
+def _parse(
+    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+    xpath: str,
+    namespaces: dict[str, str] | None,
+    elems_only: bool,
+    attrs_only: bool,
+    names: Sequence[str] | None,
+    dtype: DtypeArg | None,
+    converters: ConvertersArg | None,
+    parse_dates: ParseDatesArg | None,
+    encoding: str | None,
+    parser: XMLParsers,
+    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
+    iterparse: dict[str, list[str]] | None,
+    compression: CompressionOptions,
+    storage_options: StorageOptions,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    **kwargs,
+) -> DataFrame:
+    """
+    Call internal parsers.
+
+    This method will conditionally call internal parsers:
+    LxmlFrameParser and/or EtreeParser.
+
+    Raises
+    ------
+    ImportError
+        * If lxml is not installed if selected as parser.
+
+    ValueError
+        * If parser is not lxml or etree.
+    """
+
+    p: _EtreeFrameParser | _LxmlFrameParser
+
+    if parser == "lxml":
+        lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
+        if lxml is not None:
+            p = _LxmlFrameParser(
+                path_or_buffer,
+                xpath,
+                namespaces,
+                elems_only,
+                attrs_only,
+                names,
+                dtype,
+                converters,
+                parse_dates,
+                encoding,
+                stylesheet,
+                iterparse,
+                compression,
+                storage_options,
+            )
+        else:
+            raise ImportError("lxml not found, please install or use the etree parser.")
+
+    elif parser == "etree":
+        p = _EtreeFrameParser(
+            path_or_buffer,
+            xpath,
+            namespaces,
+            elems_only,
+            attrs_only,
+            names,
+            dtype,
+            converters,
+            parse_dates,
+            encoding,
+            stylesheet,
+            iterparse,
+            compression,
+            storage_options,
+        )
+    else:
+        raise ValueError("Values for parser can only be lxml or etree.")
+
+    data_dicts = p.parse_data()
+
+    return _data_to_frame(
+        data=data_dicts,
+        dtype=dtype,
+        converters=converters,
+        parse_dates=parse_dates,
+        dtype_backend=dtype_backend,
+        **kwargs,
+    )
+
+
+@set_module("pandas")
+def read_xml(
+    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
+    *,
+    xpath: str = "./*",
+    namespaces: dict[str, str] | None = None,
+    elems_only: bool = False,
+    attrs_only: bool = False,
+    names: Sequence[str] | None = None,
+    dtype: DtypeArg | None = None,
+    converters: ConvertersArg | None = None,
+    parse_dates: ParseDatesArg | None = None,
+    # encoding can not be None for lxml and StringIO input
+    encoding: str | None = "utf-8",
+    parser: XMLParsers = "lxml",
+    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
+    iterparse: dict[str, list[str]] | None = None,
+    compression: CompressionOptions = "infer",
+    storage_options: StorageOptions | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+) -> DataFrame:
+    r"""
+    Read XML document into a :class:`~pandas.DataFrame` object.
+
+    Parameters
+    ----------
+    path_or_buffer : str, path object, or file-like object
+        String path, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a ``read()`` function. The string can be a path.
+        The string can further be a URL. Valid URL schemes
+        include http, ftp, s3, and file.
+
+    xpath : str, optional, default './\*'
+        The ``XPath`` to parse required set of nodes for migration to
+        :class:`~pandas.DataFrame`.``XPath`` should return a collection of elements
+        and not a single element. Note: The ``etree`` parser supports limited ``XPath``
+        expressions. For more complex ``XPath``, use ``lxml`` which requires
+        installation.
+
+    namespaces : dict, optional
+        The namespaces defined in XML document as dicts with key being
+        namespace prefix and value the URI. There is no need to include all
+        namespaces in XML, only the ones used in ``xpath`` expression.
+        Note: if XML document uses default namespace denoted as
+        `xmlns='<URI>'` without a prefix, you must assign any temporary
+        namespace prefix such as 'doc' to the URI in order to parse
+        underlying nodes and/or attributes.
+
+    elems_only : bool, optional, default False
+        Parse only the child elements at the specified ``xpath``. By default,
+        all child elements and non-empty text nodes are returned.
+
+    attrs_only :  bool, optional, default False
+        Parse only the attributes at the specified ``xpath``.
+        By default, all attributes are returned.
+
+    names :  list-like, optional
+        Column names for DataFrame of parsed XML data. Use this parameter to
+        rename original element names and distinguish same named elements and
+        attributes.
+
+    dtype : Type name or dict of column -> type, optional
+        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
+        'c': 'Int64'}}
+        Use `str` or `object` together with suitable `na_values` settings
+        to preserve and not interpret dtype.
+        If converters are specified, they will be applied INSTEAD
+        of dtype conversion.
+
+    converters : dict, optional
+        Dict of functions for converting values in certain columns. Keys can either
+        be integers or column labels.
+
+    parse_dates : bool or list of int or names or list of lists or dict, default False
+        Identifiers to parse index or columns to datetime. The behavior is as follows:
+
+        * boolean. If True -> try parsing the index.
+        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
+          each as a separate date column.
+        * list of lists. e.g.  If [[1, 3]] -> combine columns 1 and 3 and parse as
+          a single date column.
+        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
+          result 'foo'
+
+    encoding : str, optional, default 'utf-8'
+        Encoding of XML document.
+
+    parser : {{'lxml','etree'}}, default 'lxml'
+        Parser module to use for retrieval of data. Only 'lxml' and
+        'etree' are supported. With 'lxml' more complex ``XPath`` searches
+        and ability to use XSLT stylesheet are supported.
+
+    stylesheet : str, path object or file-like object
+        A URL, file-like object, or a string path containing an XSLT script.
+        This stylesheet should flatten complex, deeply nested XML documents
+        for easier parsing. To use this feature you must have ``lxml`` module
+        installed and specify 'lxml' as ``parser``. The ``xpath`` must
+        reference nodes of transformed XML document generated after XSLT
+        transformation and not the original XML document. Only XSLT 1.0
+        scripts and not later versions is currently supported.
+
+    iterparse : dict, optional
+        The nodes or attributes to retrieve in iterparsing of XML document
+        as a dict with key being the name of repeating element and value being
+        list of elements or attribute names that are descendants of the repeated
+        element. Note: If this option is used, it will replace ``xpath`` parsing
+        and unlike ``xpath``, descendants do not need to relate to each other but can
+        exist any where in document under the repeating element. This memory-
+        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
+        For example, ``{{"row_element": ["child_elem", "attr", "grandchild_elem"]}}``.
+
+    compression : str or dict, default 'infer'
+        For on-the-fly decompression of on-disk data. If 'infer' and
+        'path_or_buffer' is path-like, then detect compression from the
+        following extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar',
+        '.tar.gz', '.tar.xz' or '.tar.bz2' (otherwise no compression).
+        If using 'zip' or 'tar', the ZIP file must contain only one data
+        file to be read in. Set to ``None`` for no decompression.
+        Can also be a dict with key ``'method'`` set to one of
+        {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'xz'``, ``'tar'``}
+        and other key-value pairs are forwarded to ``zipfile.ZipFile``,
+        ``gzip.GzipFile``, ``bz2.BZ2File``, ``zstandard.ZstdDecompressor``,
+        ``lzma.LZMAFile`` or ``tarfile.TarFile``, respectively.
+        As an example, the following could be passed for Zstandard
+        decompression using a custom compression dictionary:
+        ``compression={'method': 'zstd', 'dict_data': my_compression_dict}``.
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection,
+        e.g. host, port, username, password, etc. For HTTP(S) URLs the
+        key-value pairs are forwarded to ``urllib.request.Request`` as header
+        options. For other URLs (e.g. starting with "s3://", and "gcs://")
+        the key-value pairs are forwarded to ``fsspec.open``. Please see
+        ``fsspec`` and ``urllib`` for more details, and for more examples on
+        storage options refer `here <https://pandas.pydata.org/docs/
+        user_guide/io.html?highlight=storage_options#reading-writing-remote-
+        files>`_.
+
+    dtype_backend : {{'numpy_nullable', 'pyarrow'}}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+        * ``"pyarrow"``: returns pyarrow-backed nullable
+          :class:`ArrowDtype` :class:`DataFrame`
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    df
+        A DataFrame.
+
+    See Also
+    --------
+    read_json : Convert a JSON string to pandas object.
+    read_html : Read HTML tables into a list of DataFrame objects.
+
+    Notes
+    -----
+    This method is best designed to import shallow XML documents in
+    following format which is the ideal fit for the two-dimensions of a
+    ``DataFrame`` (row by column). ::
+
+            <root>
+                <row>
+                  <column1>data</column1>
+                  <column2>data</column2>
+                  <column3>data</column3>
+                  ...
+               </row>
+               <row>
+                  ...
+               </row>
+               ...
+            </root>
+
+    As a file format, XML documents can be designed any way including
+    layout of elements and attributes as long as it conforms to W3C
+    specifications. Therefore, this method is a convenience handler for
+    a specific flatter design and not all possible XML structures.
+
+    However, for more complex XML documents, ``stylesheet`` allows you to
+    temporarily redesign original document with XSLT (a special purpose
+    language) for a flatter version for migration to a DataFrame.
+
+    This function will *always* return a single :class:`DataFrame` or raise
+    exceptions due to issues with XML document, ``xpath``, or other
+    parameters.
+
+    See the :ref:`read_xml documentation in the IO section of the docs
+    <io.read_xml>` for more information in using this method to parse XML
+    files to DataFrames.
+
+    Examples
+    --------
+    >>> from io import StringIO
+    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
+    ... <data xmlns="http://example.com">
+    ...  <row>
+    ...    <shape>square</shape>
+    ...    <degrees>360</degrees>
+    ...    <sides>4.0</sides>
+    ...  </row>
+    ...  <row>
+    ...    <shape>circle</shape>
+    ...    <degrees>360</degrees>
+    ...    <sides/>
+    ...  </row>
+    ...  <row>
+    ...    <shape>triangle</shape>
+    ...    <degrees>180</degrees>
+    ...    <sides>3.0</sides>
+    ...  </row>
+    ... </data>'''
+
+    >>> df = pd.read_xml(StringIO(xml))
+    >>> df
+          shape  degrees  sides
+    0    square      360    4.0
+    1    circle      360    NaN
+    2  triangle      180    3.0
+
+    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
+    ... <data>
+    ...   <row shape="square" degrees="360" sides="4.0"/>
+    ...   <row shape="circle" degrees="360"/>
+    ...   <row shape="triangle" degrees="180" sides="3.0"/>
+    ... </data>'''
+
+    >>> df = pd.read_xml(StringIO(xml), xpath=".//row")
+    >>> df
+          shape  degrees  sides
+    0    square      360    4.0
+    1    circle      360    NaN
+    2  triangle      180    3.0
+
+    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
+    ... <doc:data xmlns:doc="https://example.com">
+    ...   <doc:row>
+    ...     <doc:shape>square</doc:shape>
+    ...     <doc:degrees>360</doc:degrees>
+    ...     <doc:sides>4.0</doc:sides>
+    ...   </doc:row>
+    ...   <doc:row>
+    ...     <doc:shape>circle</doc:shape>
+    ...     <doc:degrees>360</doc:degrees>
+    ...     <doc:sides/>
+    ...   </doc:row>
+    ...   <doc:row>
+    ...     <doc:shape>triangle</doc:shape>
+    ...     <doc:degrees>180</doc:degrees>
+    ...     <doc:sides>3.0</doc:sides>
+    ...   </doc:row>
+    ... </doc:data>'''
+
+    >>> df = pd.read_xml(
+    ...     StringIO(xml),
+    ...     xpath="//doc:row",
+    ...     namespaces={"doc": "https://example.com"},
+    ... )
+    >>> df
+          shape  degrees  sides
+    0    square      360    4.0
+    1    circle      360    NaN
+    2  triangle      180    3.0
+
+    >>> xml_data = '''
+    ...         <data>
+    ...            <row>
+    ...               <index>0</index>
+    ...               <a>1</a>
+    ...               <b>2.5</b>
+    ...               <c>True</c>
+    ...               <d>a</d>
+    ...               <e>2019-12-31 00:00:00</e>
+    ...            </row>
+    ...            <row>
+    ...               <index>1</index>
+    ...               <b>4.5</b>
+    ...               <c>False</c>
+    ...               <d>b</d>
+    ...               <e>2019-12-31 00:00:00</e>
+    ...            </row>
+    ...         </data>
+    ...         '''
+
+    >>> df = pd.read_xml(
+    ...     StringIO(xml_data), dtype_backend="numpy_nullable", parse_dates=["e"]
+    ... )
+    >>> df
+       index     a    b      c  d          e
+    0      0     1  2.5   True  a 2019-12-31
+    1      1  <NA>  4.5  False  b 2019-12-31
+    """
+    check_dtype_backend(dtype_backend)
+
+    return _parse(
+        path_or_buffer=path_or_buffer,
+        xpath=xpath,
+        namespaces=namespaces,
+        elems_only=elems_only,
+        attrs_only=attrs_only,
+        names=names,
+        dtype=dtype,
+        converters=converters,
+        parse_dates=parse_dates,
+        encoding=encoding,
+        parser=parser,
+        stylesheet=stylesheet,
+        iterparse=iterparse,
+        compression=compression,
+        storage_options=storage_options,
+        dtype_backend=dtype_backend,
+    )
diff --git a/pandas/plotting/__init__.py b/pandas/plotting/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..837bfaf82ca272672cb6bd91adb83154e66a508e
--- /dev/null
+++ b/pandas/plotting/__init__.py
@@ -0,0 +1,99 @@
+"""
+Plotting public API.
+
+Authors of third-party plotting backends should implement a module with a
+public ``plot(data, kind, **kwargs)``. The parameter `data` will contain
+the data structure and can be a `Series` or a `DataFrame`. For example,
+for ``df.plot()`` the parameter `data` will contain the DataFrame `df`.
+In some cases, the data structure is transformed before being sent to
+the backend (see PlotAccessor.__call__ in pandas/plotting/_core.py for
+the exact transformations).
+
+The parameter `kind` will be one of:
+
+- line
+- bar
+- barh
+- box
+- hist
+- kde
+- area
+- pie
+- scatter
+- hexbin
+
+See the pandas API reference for documentation on each kind of plot.
+
+Any other keyword argument is currently assumed to be backend specific,
+but some parameters may be unified and added to the signature in the
+future (e.g. `title` which should be useful for any backend).
+
+Currently, all the Matplotlib functions in pandas are accessed through
+the selected backend. For example, `pandas.plotting.boxplot` (equivalent
+to `DataFrame.boxplot`) is also accessed in the selected backend. This
+is expected to change, and the exact API is under discussion. But with
+the current version, backends are expected to implement the next functions:
+
+- plot (describe above, used for `Series.plot` and `DataFrame.plot`)
+- hist_series and hist_frame (for `Series.hist` and `DataFrame.hist`)
+- boxplot (`pandas.plotting.boxplot(df)` equivalent to `DataFrame.boxplot`)
+- boxplot_frame and boxplot_frame_groupby
+- register and deregister (register converters for the tick formats)
+- Plots not called as `Series` and `DataFrame` methods:
+  - table
+  - andrews_curves
+  - autocorrelation_plot
+  - bootstrap_plot
+  - lag_plot
+  - parallel_coordinates
+  - radviz
+  - scatter_matrix
+
+Use the code in pandas/plotting/_matplotib.py and
+https://github.com/pyviz/hvplot as a reference on how to write a backend.
+
+For the discussion about the API see
+https://github.com/pandas-dev/pandas/issues/26747.
+"""
+
+from pandas.plotting._core import (
+    PlotAccessor,
+    boxplot,
+    boxplot_frame,
+    boxplot_frame_groupby,
+    hist_frame,
+    hist_series,
+)
+from pandas.plotting._misc import (
+    andrews_curves,
+    autocorrelation_plot,
+    bootstrap_plot,
+    deregister as deregister_matplotlib_converters,
+    lag_plot,
+    parallel_coordinates,
+    plot_params,
+    radviz,
+    register as register_matplotlib_converters,
+    scatter_matrix,
+    table,
+)
+
+__all__ = [
+    "PlotAccessor",
+    "andrews_curves",
+    "autocorrelation_plot",
+    "bootstrap_plot",
+    "boxplot",
+    "boxplot_frame",
+    "boxplot_frame_groupby",
+    "deregister_matplotlib_converters",
+    "hist_frame",
+    "hist_series",
+    "lag_plot",
+    "parallel_coordinates",
+    "plot_params",
+    "radviz",
+    "register_matplotlib_converters",
+    "scatter_matrix",
+    "table",
+]
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
new file mode 100644
index 0000000000000000000000000000000000000000..e75bb32313b03da9aeee2cfbcd9227e10edcd780
--- /dev/null
+++ b/pandas/plotting/_core.py
@@ -0,0 +1,2255 @@
+from __future__ import annotations
+
+import importlib
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+)
+
+from pandas._config import get_option
+
+from pandas.util._decorators import set_module
+
+from pandas.core.dtypes.common import (
+    is_integer,
+    is_list_like,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCSeries,
+)
+
+from pandas.core.base import PandasObject
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Hashable,
+        Sequence,
+    )
+    import types
+
+    from matplotlib.axes import Axes
+    import numpy as np
+
+    from pandas._typing import IndexLabel
+
+    from pandas import (
+        DataFrame,
+        Index,
+        Series,
+    )
+    from pandas.core.groupby.generic import DataFrameGroupBy
+
+
+def holds_integer(column: Index) -> bool:
+    return column.dtype.kind in "iu"
+
+
+@set_module("pandas.plotting")
+def hist_series(
+    self: Series,
+    by=None,
+    ax=None,
+    grid: bool = True,
+    xlabelsize: int | None = None,
+    xrot: float | None = None,
+    ylabelsize: int | None = None,
+    yrot: float | None = None,
+    figsize: tuple[int, int] | None = None,
+    bins: int | Sequence[int] = 10,
+    backend: str | None = None,
+    legend: bool = False,
+    **kwargs,
+):
+    """
+    Draw histogram of the input series using matplotlib.
+
+    Parameters
+    ----------
+    by : object, optional
+        If passed, then used to form histograms for separate groups.
+    ax : matplotlib axis object
+        If not passed, uses gca().
+    grid : bool, default True
+        Whether to show axis grid lines.
+    xlabelsize : int, default None
+        If specified changes the x-axis label size.
+    xrot : float, default None
+        Rotation of x axis labels.
+    ylabelsize : int, default None
+        If specified changes the y-axis label size.
+    yrot : float, default None
+        Rotation of y axis labels.
+    figsize : tuple, default None
+        Figure size in inches by default.
+    bins : int or sequence, default 10
+        Number of histogram bins to be used. If an integer is given, bins + 1
+        bin edges are calculated and returned. If bins is a sequence, gives
+        bin edges, including left edge of first bin and right edge of last
+        bin. In this case, bins is returned unmodified.
+    backend : str, default None
+        Backend to use instead of the backend specified in the option
+        ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
+        specify the ``plotting.backend`` for the whole session, set
+        ``pd.options.plotting.backend``.
+    legend : bool, default False
+        Whether to show the legend.
+
+    **kwargs
+        To be passed to the actual plotting function.
+
+    Returns
+    -------
+    matplotlib.axes.Axes
+        A histogram plot.
+
+    See Also
+    --------
+    matplotlib.axes.Axes.hist : Plot a histogram using matplotlib.
+
+    Examples
+    --------
+    For Series:
+
+    .. plot::
+        :context: close-figs
+
+        >>> lst = ["a", "a", "a", "b", "b", "b"]
+        >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst)
+        >>> hist = ser.hist()
+
+    For Groupby:
+
+    .. plot::
+        :context: close-figs
+
+        >>> lst = ["a", "a", "a", "b", "b", "b"]
+        >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst)
+        >>> hist = ser.groupby(level=0).hist()
+    """
+    plot_backend = _get_plot_backend(backend)
+    return plot_backend.hist_series(
+        self,
+        by=by,
+        ax=ax,
+        grid=grid,
+        xlabelsize=xlabelsize,
+        xrot=xrot,
+        ylabelsize=ylabelsize,
+        yrot=yrot,
+        figsize=figsize,
+        bins=bins,
+        legend=legend,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def hist_frame(
+    data: DataFrame,
+    column: IndexLabel | None = None,
+    by=None,
+    grid: bool = True,
+    xlabelsize: int | None = None,
+    xrot: float | None = None,
+    ylabelsize: int | None = None,
+    yrot: float | None = None,
+    ax=None,
+    sharex: bool = False,
+    sharey: bool = False,
+    figsize: tuple[int, int] | None = None,
+    layout: tuple[int, int] | None = None,
+    bins: int | Sequence[int] = 10,
+    backend: str | None = None,
+    legend: bool = False,
+    **kwargs,
+):
+    """
+    Make a histogram of the DataFrame's columns.
+
+    A `histogram`_ is a representation of the distribution of data.
+    This function calls :meth:`matplotlib.pyplot.hist`, on each series in
+    the DataFrame, resulting in one histogram per column.
+
+    .. _histogram: https://en.wikipedia.org/wiki/Histogram
+
+    Parameters
+    ----------
+    data : DataFrame
+        The pandas object holding the data.
+    column : str or sequence, optional
+        If passed, will be used to limit data to a subset of columns.
+    by : object, optional
+        If passed, then used to form histograms for separate groups.
+    grid : bool, default True
+        Whether to show axis grid lines.
+    xlabelsize : int, default None
+        If specified changes the x-axis label size.
+    xrot : float, default None
+        Rotation of x axis labels. For example, a value of 90 displays the
+        x labels rotated 90 degrees clockwise.
+    ylabelsize : int, default None
+        If specified changes the y-axis label size.
+    yrot : float, default None
+        Rotation of y axis labels. For example, a value of 90 displays the
+        y labels rotated 90 degrees clockwise.
+    ax : Matplotlib axes object, default None
+        The axes to plot the histogram on.
+    sharex : bool, default True if ax is None else False
+        In case subplots=True, share x axis and set some x axis labels to
+        invisible; defaults to True if ax is None otherwise False if an ax
+        is passed in.
+        Note that passing in both an ax and sharex=True will alter all x axis
+        labels for all subplots in a figure.
+    sharey : bool, default False
+        In case subplots=True, share y axis and set some y axis labels to
+        invisible.
+    figsize : tuple, optional
+        The size in inches of the figure to create. Uses the value in
+        `matplotlib.rcParams` by default.
+    layout : tuple, optional
+        Tuple of (rows, columns) for the layout of the histograms.
+    bins : int or sequence, default 10
+        Number of histogram bins to be used. If an integer is given, bins + 1
+        bin edges are calculated and returned. If bins is a sequence, gives
+        bin edges, including left edge of first bin and right edge of last
+        bin. In this case, bins is returned unmodified.
+
+    backend : str, default None
+        Backend to use instead of the backend specified in the option
+        ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
+        specify the ``plotting.backend`` for the whole session, set
+        ``pd.options.plotting.backend``.
+
+    legend : bool, default False
+        Whether to show the legend.
+
+    **kwargs
+        All other plotting keyword arguments to be passed to
+        :meth:`matplotlib.pyplot.hist`.
+
+    Returns
+    -------
+    np.ndarray
+        2D NumPy Array of :class:`matplotlib.axes.Axes`.
+
+    See Also
+    --------
+    matplotlib.pyplot.hist : Plot a histogram using matplotlib.
+
+    Examples
+    --------
+    This example draws a histogram based on the length and width of
+    some animals, displayed in three bins
+
+    .. plot::
+        :context: close-figs
+
+        >>> data = {
+        ...     "length": [1.5, 0.5, 1.2, 0.9, 3],
+        ...     "width": [0.7, 0.2, 0.15, 0.2, 1.1],
+        ... }
+        >>> index = ["pig", "rabbit", "duck", "chicken", "horse"]
+        >>> df = pd.DataFrame(data, index=index)
+        >>> hist = df.hist(bins=3)
+    """
+    plot_backend = _get_plot_backend(backend)
+    return plot_backend.hist_frame(
+        data,
+        column=column,
+        by=by,
+        grid=grid,
+        xlabelsize=xlabelsize,
+        xrot=xrot,
+        ylabelsize=ylabelsize,
+        yrot=yrot,
+        ax=ax,
+        sharex=sharex,
+        sharey=sharey,
+        figsize=figsize,
+        layout=layout,
+        legend=legend,
+        bins=bins,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def boxplot(
+    data: DataFrame,
+    column: str | list[str] | None = None,
+    by: str | list[str] | None = None,
+    ax: Axes | None = None,
+    fontsize: float | str | None = None,
+    rot: int = 0,
+    grid: bool = True,
+    figsize: tuple[float, float] | None = None,
+    layout: tuple[int, int] | None = None,
+    return_type: str | None = None,
+    **kwargs,
+):
+    """
+    Make a box plot from DataFrame columns.
+
+    Make a box-and-whisker plot from DataFrame columns, optionally grouped
+    by some other columns. A box plot is a method for graphically depicting
+    groups of numerical data through their quartiles.
+    The box extends from the Q1 to Q3 quartile values of the data,
+    with a line at the median (Q2). The whiskers extend from the edges
+    of box to show the range of the data. By default, they extend no more than
+    `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest
+    data point within that interval. Outliers are plotted as separate dots.
+
+    For further details see
+    Wikipedia's entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`_.
+
+    Parameters
+    ----------
+    data : DataFrame
+        The data to visualize.
+    column : str or list of str, optional
+        Column name or list of names, or vector.
+        Can be any valid input to :meth:`pandas.DataFrame.groupby`.
+    by : str or array-like, optional
+        Column in the DataFrame to :meth:`pandas.DataFrame.groupby`.
+        One box-plot will be done per value of columns in `by`.
+    ax : object of class matplotlib.axes.Axes, optional
+        The matplotlib axes to be used by boxplot.
+    fontsize : float or str
+        Tick label font size in points or as a string (e.g., `large`).
+    rot : float, default 0
+        The rotation angle of labels (in degrees)
+        with respect to the screen coordinate system.
+    grid : bool, default True
+        Setting this to True will show the grid.
+    figsize : A tuple (width, height) in inches
+        The size of the figure to create in matplotlib.
+    layout : tuple (rows, columns), optional
+        For example, (3, 5) will display the subplots
+        using 3 rows and 5 columns, starting from the top-left.
+    return_type : {'axes', 'dict', 'both'} or None, default 'axes'
+        The kind of object to return. The default is ``axes``.
+
+        * 'axes' returns the matplotlib axes the boxplot is drawn on.
+        * 'dict' returns a dictionary whose values are the matplotlib
+          lines of the boxplot.
+        * 'both' returns a namedtuple with the axes and dict.
+        * when grouping with ``by``, a Series mapping columns to
+          ``return_type`` is returned.
+
+        If ``return_type`` is `None`, a NumPy array
+        of axes with the same shape as ``layout`` is returned.
+
+    **kwargs
+        All other plotting keyword arguments to be passed to
+        :func:`matplotlib.pyplot.boxplot`.
+
+    Returns
+    -------
+    result
+        See Notes.
+
+    See Also
+    --------
+    Series.plot.hist: Make a histogram.
+    matplotlib.pyplot.boxplot : Matplotlib equivalent plot.
+
+    Notes
+    -----
+    The return type depends on the `return_type` parameter:
+
+    * 'axes' : object of class matplotlib.axes.Axes
+    * 'dict' : dict of matplotlib.lines.Line2D objects
+    * 'both' : a namedtuple with structure (ax, lines)
+
+    For data grouped with ``by``, return a Series of the above or a numpy
+    array:
+
+    * :class:`~pandas.Series`
+    * :class:`~numpy.array` (for ``return_type = None``)
+
+    Use ``return_type='dict'`` when you want to tweak the appearance
+    of the lines after plotting. In this case a dict containing the Lines
+    making up the boxes, caps, fliers, medians, and whiskers is returned.
+
+    Examples
+    --------
+
+    Boxplots can be created for every column in the dataframe
+    by ``df.boxplot()`` or indicating the columns to be used:
+
+    .. plot::
+        :context: close-figs
+
+        >>> np.random.seed(1234)
+        >>> df = pd.DataFrame(
+        ...     np.random.randn(10, 4), columns=["Col1", "Col2", "Col3", "Col4"]
+        ... )
+        >>> boxplot = df.boxplot(column=["Col1", "Col2", "Col3"])  # doctest: +SKIP
+
+    Boxplots of variables distributions grouped by the values of a third
+    variable can be created using the option ``by``. For instance:
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(np.random.randn(10, 2), columns=["Col1", "Col2"])
+        >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        >>> boxplot = df.boxplot(by="X")
+
+    A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot
+    in order to group the data by combination of the variables in the x-axis:
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(np.random.randn(10, 3), columns=["Col1", "Col2", "Col3"])
+        >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        >>> df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"])
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"])
+
+    The layout of boxplot can be adjusted giving a tuple to ``layout``:
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", layout=(2, 1))
+
+    Additional formatting can be done to the boxplot, like suppressing the grid
+    (``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``)
+    or changing the fontsize (i.e. ``fontsize=15``):
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15)  # doctest: +SKIP
+
+    The parameter ``return_type`` can be used to select the type of element
+    returned by `boxplot`.  When ``return_type='axes'`` is selected,
+    the matplotlib axes on which the boxplot is drawn are returned:
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], return_type="axes")
+        >>> type(boxplot)
+        <class 'matplotlib.axes._axes.Axes'>
+
+    When grouping with ``by``, a Series mapping columns to ``return_type``
+    is returned:
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type="axes")
+        >>> type(boxplot)
+        <class 'pandas.Series'>
+
+    If ``return_type`` is `None`, a NumPy array of axes with the same shape
+    as ``layout`` is returned:
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type=None)
+        >>> type(boxplot)
+        <class 'numpy.ndarray'>
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.boxplot(
+        data,
+        column=column,
+        by=by,
+        ax=ax,
+        fontsize=fontsize,
+        rot=rot,
+        grid=grid,
+        figsize=figsize,
+        layout=layout,
+        return_type=return_type,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def boxplot_frame(
+    self: DataFrame,
+    column=None,
+    by=None,
+    ax=None,
+    fontsize: int | None = None,
+    rot: int = 0,
+    grid: bool = True,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    return_type=None,
+    backend=None,
+    **kwargs,
+):
+    """
+    Make a box plot from DataFrame columns.
+
+    Make a box-and-whisker plot from DataFrame columns, optionally grouped
+    by some other columns. A box plot is a method for graphically depicting
+    groups of numerical data through their quartiles.
+    The box extends from the Q1 to Q3 quartile values of the data,
+    with a line at the median (Q2). The whiskers extend from the edges
+    of box to show the range of the data. By default, they extend no more than
+    `1.5 * IQR (IQR = Q3 - Q1)` from the edges of the box, ending at the farthest
+    data point within that interval. Outliers are plotted as separate dots.
+
+    For further details see
+    Wikipedia's entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`_.
+
+    Parameters
+    ----------
+    column : str or list of str, optional
+        Column name or list of names, or vector.
+        Can be any valid input to :meth:`pandas.DataFrame.groupby`.
+    by : str or array-like, optional
+        Column in the DataFrame to :meth:`pandas.DataFrame.groupby`.
+        One box-plot will be done per value of columns in `by`.
+    ax : object of class matplotlib.axes.Axes, optional
+        The matplotlib axes to be used by boxplot.
+    fontsize : float or str
+        Tick label font size in points or as a string (e.g., `large`).
+    rot : float, default 0
+        The rotation angle of labels (in degrees)
+        with respect to the screen coordinate system.
+    grid : bool, default True
+        Setting this to True will show the grid.
+    figsize : A tuple (width, height) in inches
+        The size of the figure to create in matplotlib.
+    layout : tuple (rows, columns), optional
+        For example, (3, 5) will display the subplots
+        using 3 rows and 5 columns, starting from the top-left.
+    return_type : {'axes', 'dict', 'both'} or None, default 'axes'
+        The kind of object to return. The default is ``axes``.
+
+        * 'axes' returns the matplotlib axes the boxplot is drawn on.
+        * 'dict' returns a dictionary whose values are the matplotlib
+          lines of the boxplot.
+        * 'both' returns a namedtuple with the axes and dict.
+        * when grouping with ``by``, a Series mapping columns to
+          ``return_type`` is returned.
+
+        If ``return_type`` is `None`, a NumPy array
+        of axes with the same shape as ``layout`` is returned.
+    backend : str, default None
+        Backend to use instead of the backend specified in the option
+        ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
+        specify the ``plotting.backend`` for the whole session, set
+        ``pd.options.plotting.backend``.
+
+    **kwargs
+        All other plotting keyword arguments to be passed to
+        :func:`matplotlib.pyplot.boxplot`.
+
+    Returns
+    -------
+    result
+        See Notes.
+
+    See Also
+    --------
+    Series.plot.hist: Make a histogram.
+    matplotlib.pyplot.boxplot : Matplotlib equivalent plot.
+
+    Notes
+    -----
+    The return type depends on the `return_type` parameter:
+
+    * 'axes' : object of class matplotlib.axes.Axes
+    * 'dict' : dict of matplotlib.lines.Line2D objects
+    * 'both' : a namedtuple with structure (ax, lines)
+
+    For data grouped with ``by``, return a Series of the above or a numpy
+    array:
+
+    * :class:`~pandas.Series`
+    * :class:`~numpy.array` (for ``return_type = None``)
+
+    Use ``return_type='dict'`` when you want to tweak the appearance
+    of the lines after plotting. In this case a dict containing the Lines
+    making up the boxes, caps, fliers, medians, and whiskers is returned.
+
+    Examples
+    --------
+
+    Boxplots can be created for every column in the dataframe
+    by ``df.boxplot()`` or indicating the columns to be used:
+
+    .. plot::
+        :context: close-figs
+
+        >>> np.random.seed(1234)
+        >>> df = pd.DataFrame(
+        ...     np.random.randn(10, 4), columns=["Col1", "Col2", "Col3", "Col4"]
+        ... )
+        >>> boxplot = df.boxplot(column=["Col1", "Col2", "Col3"])  # doctest: +SKIP
+
+    Boxplots of variables distributions grouped by the values of a third
+    variable can be created using the option ``by``. For instance:
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(np.random.randn(10, 2), columns=["Col1", "Col2"])
+        >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        >>> boxplot = df.boxplot(by="X")
+
+    A list of strings (i.e. ``['X', 'Y']``) can be passed to boxplot
+    in order to group the data by combination of the variables in the x-axis:
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(np.random.randn(10, 3), columns=["Col1", "Col2", "Col3"])
+        >>> df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        >>> df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"])
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"])
+
+    The layout of boxplot can be adjusted giving a tuple to ``layout``:
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", layout=(2, 1))
+
+    Additional formatting can be done to the boxplot, like suppressing the grid
+    (``grid=False``), rotating the labels in the x-axis (i.e. ``rot=45``)
+    or changing the fontsize (i.e. ``fontsize=15``):
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(grid=False, rot=45, fontsize=15)  # doctest: +SKIP
+
+    The parameter ``return_type`` can be used to select the type of element
+    returned by `boxplot`.  When ``return_type='axes'`` is selected,
+    the matplotlib axes on which the boxplot is drawn are returned:
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], return_type="axes")
+        >>> type(boxplot)
+        <class 'matplotlib.axes._axes.Axes'>
+
+    When grouping with ``by``, a Series mapping columns to ``return_type``
+    is returned:
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type="axes")
+        >>> type(boxplot)
+        <class 'pandas.Series'>
+
+    If ``return_type`` is `None`, a NumPy array of axes with the same shape
+    as ``layout`` is returned:
+
+    .. plot::
+        :context: close-figs
+
+        >>> boxplot = df.boxplot(column=["Col1", "Col2"], by="X", return_type=None)
+        >>> type(boxplot)
+        <class 'numpy.ndarray'>
+    """
+
+    plot_backend = _get_plot_backend(backend)
+    return plot_backend.boxplot_frame(
+        self,
+        column=column,
+        by=by,
+        ax=ax,
+        fontsize=fontsize,
+        rot=rot,
+        grid=grid,
+        figsize=figsize,
+        layout=layout,
+        return_type=return_type,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def boxplot_frame_groupby(
+    grouped: DataFrameGroupBy,
+    subplots: bool = True,
+    column=None,
+    fontsize: int | None = None,
+    rot: int = 0,
+    grid: bool = True,
+    ax=None,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    sharex: bool = False,
+    sharey: bool = True,
+    backend=None,
+    **kwargs,
+):
+    """
+    Make box plots from DataFrameGroupBy data.
+
+    Parameters
+    ----------
+    grouped : DataFrameGroupBy
+        The grouped DataFrame object over which to create the box plots.
+    subplots : bool
+        * ``False`` - no subplots will be used
+        * ``True`` - create a subplot for each group.
+    column : column name or list of names, or vector
+        Can be any valid input to groupby.
+    fontsize : float or str
+        Font size for the labels.
+    rot : float
+        Rotation angle of labels (in degrees) on the x-axis.
+    grid : bool
+        Whether to show grid lines on the plot.
+    ax : Matplotlib axis object, default None
+        The axes on which to draw the plots. If None, uses the current axes.
+    figsize : tuple of (float, float)
+        The figure size in inches (width, height).
+    layout : tuple (optional)
+        The layout of the plot: (rows, columns).
+    sharex : bool, default False
+        Whether x-axes will be shared among subplots.
+    sharey : bool, default True
+        Whether y-axes will be shared among subplots.
+    backend : str, default None
+        Backend to use instead of the backend specified in the option
+        ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
+        specify the ``plotting.backend`` for the whole session, set
+        ``pd.options.plotting.backend``.
+    **kwargs
+        All other plotting keyword arguments to be passed to
+        matplotlib's boxplot function.
+
+    Returns
+    -------
+    dict or DataFrame.boxplot return value
+        If ``subplots=True``, returns a dictionary of group keys to the boxplot
+        return values. If ``subplots=False``, returns the boxplot return value
+        of a single DataFrame.
+
+    See Also
+    --------
+    DataFrame.boxplot : Create a box plot from a DataFrame.
+    Series.plot : Plot a Series.
+
+    Examples
+    --------
+    You can create boxplots for grouped data and show them as separate subplots:
+
+    .. plot::
+        :context: close-figs
+
+        >>> import itertools
+        >>> tuples = [t for t in itertools.product(range(1000), range(4))]
+        >>> index = pd.MultiIndex.from_tuples(tuples, names=["lvl0", "lvl1"])
+        >>> data = np.random.randn(len(index), 4)
+        >>> df = pd.DataFrame(data, columns=list("ABCD"), index=index)
+        >>> grouped = df.groupby(level="lvl1")
+        >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8, 10))  # doctest: +SKIP
+
+    The ``subplots=False`` option shows the boxplots in a single figure.
+
+    .. plot::
+        :context: close-figs
+
+        >>> grouped.boxplot(subplots=False, rot=45, fontsize=12)  # doctest: +SKIP
+    """
+    plot_backend = _get_plot_backend(backend)
+    return plot_backend.boxplot_frame_groupby(
+        grouped,
+        subplots=subplots,
+        column=column,
+        fontsize=fontsize,
+        rot=rot,
+        grid=grid,
+        ax=ax,
+        figsize=figsize,
+        layout=layout,
+        sharex=sharex,
+        sharey=sharey,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+class PlotAccessor(PandasObject):
+    """
+    Make plots of Series or DataFrame.
+
+    Uses the backend specified by the
+    option ``plotting.backend``. By default, matplotlib is used.
+
+    Parameters
+    ----------
+    data : Series or DataFrame
+        The object for which the method is called.
+
+    Attributes
+    ----------
+    x : label or position, default None
+        Only used if data is a DataFrame.
+    y : label, position or list of label, positions, default None
+        Allows plotting of one column versus another. Only used if data is a
+        DataFrame.
+    kind : str
+        The kind of plot to produce:
+
+        - 'line' : line plot (default)
+        - 'bar' : vertical bar plot
+        - 'barh' : horizontal bar plot
+        - 'hist' : histogram
+        - 'box' : boxplot
+        - 'kde' : Kernel Density Estimation plot
+        - 'density' : same as 'kde'
+        - 'area' : area plot
+        - 'pie' : pie plot
+        - 'scatter' : scatter plot (DataFrame only)
+        - 'hexbin' : hexbin plot (DataFrame only)
+    ax : matplotlib axes object, default None
+        An axes of the current figure.
+    subplots : bool or sequence of iterables, default False
+        Whether to group columns into subplots:
+
+        - ``False`` : No subplots will be used
+        - ``True`` : Make separate subplots for each column.
+        - sequence of iterables of column labels: Create a subplot for each
+          group of columns. For example `[('a', 'c'), ('b', 'd')]` will
+          create 2 subplots: one with columns 'a' and 'c', and one
+          with columns 'b' and 'd'. Remaining columns that aren't specified
+          will be plotted in additional subplots (one per column).
+
+    sharex : bool, default True if ax is None else False
+        In case ``subplots=True``, share x axis and set some x axis labels
+        to invisible; defaults to True if ax is None otherwise False if
+        an ax is passed in; Be aware, that passing in both an ax and
+        ``sharex=True`` will alter all x axis labels for all axis in a figure.
+    sharey : bool, default False
+        In case ``subplots=True``, share y axis and set some y axis labels to invisible.
+    layout : tuple, optional
+        (rows, columns) for the layout of subplots.
+    figsize : a tuple (width, height) in inches
+        Size of a figure object.
+    use_index : bool, default True
+        Use index as ticks for x axis.
+    title : str or list
+        Title to use for the plot. If a string is passed, print the string
+        at the top of the figure. If a list is passed and `subplots` is
+        True, print each item in the list above the corresponding subplot.
+    grid : bool, default None (matlab style default)
+        Axis grid lines.
+    legend : bool or {'reverse'}
+        Place legend on axis subplots.
+    style : list or dict
+        The matplotlib line style per column.
+    logx : bool or 'sym', default False
+        Use log scaling or symlog scaling on x axis.
+
+    logy : bool or 'sym' default False
+        Use log scaling or symlog scaling on y axis.
+
+    loglog : bool or 'sym', default False
+        Use log scaling or symlog scaling on both x and y axes.
+
+    xticks : sequence
+        Values to use for the xticks.
+    yticks : sequence
+        Values to use for the yticks.
+    xlim : 2-tuple/list
+        Set the x limits of the current axes.
+    ylim : 2-tuple/list
+        Set the y limits of the current axes.
+    xlabel : label, optional
+        Name to use for the xlabel on x-axis. Default uses index name as xlabel, or the
+        x-column name for planar plots.
+
+        .. versionchanged:: 2.0.0
+
+            Now applicable to histograms.
+
+    ylabel : label, optional
+        Name to use for the ylabel on y-axis. Default will show no ylabel, or the
+        y-column name for planar plots.
+
+        .. versionchanged:: 2.0.0
+
+            Now applicable to histograms.
+
+    rot : float, default None
+        Rotation for ticks (xticks for vertical, yticks for horizontal
+        plots).
+    fontsize : float, default None
+        Font size for xticks and yticks.
+    colormap : str or matplotlib colormap object, default None
+        Colormap to select colors from. If string, load colormap with that
+        name from matplotlib.
+    colorbar : bool, optional
+        If True, plot colorbar (only relevant for 'scatter' and 'hexbin'
+        plots).
+    position : float
+        Specify relative alignments for bar plot layout.
+        From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
+        (center).
+    table : bool, Series or DataFrame, default False
+        If True, draw a table using the data in the DataFrame and the data
+        will be transposed to meet matplotlib's default layout.
+        If a Series or DataFrame is passed, use passed data to draw a
+        table.
+    yerr : DataFrame, Series, array-like, dict and str
+        See :ref:`Plotting with Error Bars <visualization.errorbars>` for
+        detail.
+    xerr : DataFrame, Series, array-like, dict and str
+        Equivalent to yerr.
+    stacked : bool, default False in line and bar plots, and True in area plot
+        If True, create stacked plot.
+    secondary_y : bool or sequence, default False
+        Whether to plot on the secondary y-axis if a list/tuple, which
+        columns to plot on secondary y-axis.
+    mark_right : bool, default True
+        When using a secondary_y axis, automatically mark the column
+        labels with "(right)" in the legend.
+    include_bool : bool, default is False
+        If True, boolean values can be plotted.
+    backend : str, default None
+        Backend to use instead of the backend specified in the option
+        ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
+        specify the ``plotting.backend`` for the whole session, set
+        ``pd.options.plotting.backend``.
+    **kwargs
+        Options to pass to matplotlib plotting method.
+
+    Returns
+    -------
+    :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+        If the backend is not the default matplotlib one, the return value
+        will be the object returned by the backend.
+
+    See Also
+    --------
+    matplotlib.pyplot.plot : Plot y versus x as lines and/or markers.
+    DataFrame.hist : Make a histogram.
+    DataFrame.boxplot : Make a box plot.
+    DataFrame.plot.scatter : Make a scatter plot with varying marker
+        point size and color.
+    DataFrame.plot.hexbin : Make a hexagonal binning plot of
+        two variables.
+    DataFrame.plot.kde : Make Kernel Density Estimate plot using
+        Gaussian kernels.
+    DataFrame.plot.area : Make a stacked area plot.
+    DataFrame.plot.bar : Make a bar plot.
+    DataFrame.plot.barh : Make a horizontal bar plot.
+
+    Notes
+    -----
+    - See matplotlib documentation online for more on this subject
+    - If `kind` = 'bar' or 'barh', you can specify relative alignments
+      for bar plot layout by `position` keyword.
+      From 0 (left/bottom-end) to 1 (right/top-end). Default is 0.5
+      (center)
+
+    Examples
+    --------
+    For Series:
+
+    .. plot::
+        :context: close-figs
+
+        >>> ser = pd.Series([1, 2, 3, 3])
+        >>> plot = ser.plot(kind="hist", title="My plot")
+
+    For DataFrame:
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "length": [1.5, 0.5, 1.2, 0.9, 3],
+        ...         "width": [0.7, 0.2, 0.15, 0.2, 1.1],
+        ...     },
+        ...     index=["pig", "rabbit", "duck", "chicken", "horse"],
+        ... )
+        >>> plot = df.plot(title="DataFrame Plot")
+
+    For SeriesGroupBy:
+
+    .. plot::
+        :context: close-figs
+
+        >>> lst = [-1, -2, -3, 1, 2, 3]
+        >>> ser = pd.Series([1, 2, 2, 4, 6, 6], index=lst)
+        >>> plot = ser.groupby(lambda x: x > 0).plot(title="SeriesGroupBy Plot")
+
+    For DataFrameGroupBy:
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame({"col1": [1, 2, 3, 4], "col2": ["A", "B", "A", "B"]})
+        >>> plot = df.groupby("col2").plot(kind="bar", title="DataFrameGroupBy Plot")
+    """
+
+    _common_kinds = ("line", "bar", "barh", "kde", "density", "area", "hist", "box")
+    _series_kinds = ("pie",)
+    _dataframe_kinds = ("scatter", "hexbin")
+    _kind_aliases = {"density": "kde"}
+    _all_kinds = _common_kinds + _series_kinds + _dataframe_kinds
+
+    def __init__(self, data: Series | DataFrame) -> None:
+        self._parent = data
+
+    @staticmethod
+    def _get_call_args(backend_name: str, data: Series | DataFrame, args, kwargs):
+        """
+        This function makes calls to this accessor `__call__` method compatible
+        with the previous `SeriesPlotMethods.__call__` and
+        `DataFramePlotMethods.__call__`. Those had slightly different
+        signatures, since `DataFramePlotMethods` accepted `x` and `y`
+        parameters.
+        """
+        if isinstance(data, ABCSeries):
+            arg_def = [
+                ("kind", "line"),
+                ("ax", None),
+                ("figsize", None),
+                ("use_index", True),
+                ("title", None),
+                ("grid", None),
+                ("legend", False),
+                ("style", None),
+                ("logx", False),
+                ("logy", False),
+                ("loglog", False),
+                ("xticks", None),
+                ("yticks", None),
+                ("xlim", None),
+                ("ylim", None),
+                ("rot", None),
+                ("fontsize", None),
+                ("colormap", None),
+                ("table", False),
+                ("yerr", None),
+                ("xerr", None),
+                ("label", None),
+                ("secondary_y", False),
+                ("xlabel", None),
+                ("ylabel", None),
+            ]
+        elif isinstance(data, ABCDataFrame):
+            arg_def = [
+                ("x", None),
+                ("y", None),
+                ("kind", "line"),
+                ("ax", None),
+                ("subplots", False),
+                ("sharex", None),
+                ("sharey", False),
+                ("layout", None),
+                ("figsize", None),
+                ("use_index", True),
+                ("title", None),
+                ("grid", None),
+                ("legend", True),
+                ("style", None),
+                ("logx", False),
+                ("logy", False),
+                ("loglog", False),
+                ("xticks", None),
+                ("yticks", None),
+                ("xlim", None),
+                ("ylim", None),
+                ("rot", None),
+                ("fontsize", None),
+                ("colormap", None),
+                ("table", False),
+                ("yerr", None),
+                ("xerr", None),
+                ("secondary_y", False),
+                ("xlabel", None),
+                ("ylabel", None),
+            ]
+        else:
+            raise TypeError(
+                f"Called plot accessor for type {type(data).__name__}, "
+                "expected Series or DataFrame"
+            )
+
+        if args and isinstance(data, ABCSeries):
+            positional_args = str(args)[1:-1]
+            keyword_args = ", ".join(
+                [
+                    f"{name}={value!r}"
+                    for (name, _), value in zip(arg_def, args, strict=False)
+                ]
+            )
+            msg = (
+                "`Series.plot()` should not be called with positional "
+                "arguments, only keyword arguments. The order of "
+                "positional arguments will change in the future. "
+                f"Use `Series.plot({keyword_args})` instead of "
+                f"`Series.plot({positional_args})`."
+            )
+            raise TypeError(msg)
+
+        pos_args = {
+            name: value for (name, _), value in zip(arg_def, args, strict=False)
+        }
+        if backend_name == "pandas.plotting._matplotlib":
+            kwargs = dict(arg_def, **pos_args, **kwargs)
+        else:
+            kwargs = dict(pos_args, **kwargs)
+
+        x = kwargs.pop("x", None)
+        y = kwargs.pop("y", None)
+        kind = kwargs.pop("kind", "line")
+        return x, y, kind, kwargs
+
+    def __call__(self, *args, **kwargs):
+        plot_backend = _get_plot_backend(kwargs.pop("backend", None))
+
+        x, y, kind, kwargs = self._get_call_args(
+            plot_backend.__name__, self._parent, args, kwargs
+        )
+
+        kind = self._kind_aliases.get(kind, kind)
+
+        # when using another backend, get out of the way
+        if plot_backend.__name__ != "pandas.plotting._matplotlib":
+            return plot_backend.plot(self._parent, x=x, y=y, kind=kind, **kwargs)
+
+        if kind not in self._all_kinds:
+            raise ValueError(
+                f"{kind} is not a valid plot kind Valid plot kinds: {self._all_kinds}"
+            )
+
+        data = self._parent
+
+        if isinstance(data, ABCSeries):
+            kwargs["reuse_plot"] = True
+
+        if kind in self._dataframe_kinds:
+            if isinstance(data, ABCDataFrame):
+                return plot_backend.plot(data, x=x, y=y, kind=kind, **kwargs)
+            else:
+                raise ValueError(f"plot kind {kind} can only be used for data frames")
+        elif kind in self._series_kinds:
+            if isinstance(data, ABCDataFrame):
+                if y is None and kwargs.get("subplots") is False:
+                    raise ValueError(
+                        f"{kind} requires either y column or 'subplots=True'"
+                    )
+                if y is not None:
+                    if is_integer(y) and not holds_integer(data.columns):
+                        y = data.columns[y]
+                    # converted to series actually. copy to not modify
+                    data = data[y].copy(deep=False)
+                    data.index.name = y
+        elif isinstance(data, ABCDataFrame):
+            data_cols = data.columns
+            if x is not None:
+                if is_integer(x) and not holds_integer(data.columns):
+                    x = data_cols[x]
+                elif not isinstance(data[x], ABCSeries):
+                    raise ValueError("x must be a label or position")
+                data = data.set_index(x)
+            if y is not None:
+                # check if we have y as int or list of ints
+                int_ylist = is_list_like(y) and all(is_integer(c) for c in y)
+                int_y_arg = is_integer(y) or int_ylist
+                if int_y_arg and not holds_integer(data.columns):
+                    y = data_cols[y]
+
+                label_kw = kwargs["label"] if "label" in kwargs else False
+                for kw in ["xerr", "yerr"]:
+                    if kw in kwargs and (
+                        isinstance(kwargs[kw], str) or is_integer(kwargs[kw])
+                    ):
+                        try:
+                            kwargs[kw] = data[kwargs[kw]]
+                        except (IndexError, KeyError, TypeError):
+                            pass
+
+                data = data[y]
+
+                if isinstance(data, ABCSeries):
+                    label_name = label_kw or y
+                    data.name = label_name
+                else:
+                    # error: Argument 1 to "len" has incompatible type "Any | bool";
+                    # expected "Sized"  [arg-type]
+                    match = is_list_like(label_kw) and len(label_kw) == len(y)  # type: ignore[arg-type]
+                    if label_kw and not match:
+                        raise ValueError(
+                            "label should be list-like and same length as y"
+                        )
+                    label_name = label_kw or data.columns
+                    data.columns = label_name
+
+        return plot_backend.plot(data, kind=kind, **kwargs)
+
+    __call__.__doc__ = __doc__
+
+    def line(
+        self,
+        x: Hashable | None = None,
+        y: Hashable | None = None,
+        color: str | Sequence[str] | dict | None = None,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Plot Series or DataFrame as lines.
+
+        This function is useful to plot lines using DataFrame's values
+        as coordinates.
+
+        Parameters
+        ----------
+        x : label or position, optional
+            Allows plotting of one column versus another. If not specified,
+            the index of the DataFrame is used.
+        y : label or position, optional
+            Allows plotting of one column versus another. If not specified,
+            all numerical columns are used.
+        color : str, array-like, or dict, optional
+            The color for each of the DataFrame's columns. Possible values are:
+
+            - A single color string referred to by name, RGB or RGBA code,
+              for instance 'red' or '#a98d19'.
+
+            - A sequence of color strings referred to by name, RGB or RGBA
+              code, which will be used for each column recursively. For
+              instance ['green','yellow'] each column's line will be filled in
+              green or yellow, alternatively. If there is only a single column to
+              be plotted, then only the first color from the color list will be
+              used.
+
+            - A dict of the form {column name : color}, so that each column will be
+              colored accordingly. For example, if your columns are called `a` and
+              `b`, then passing {'a': 'green', 'b': 'red'} will color lines for
+              column `a` in green and lines for column `b` in red.
+
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes or np.ndarray of them
+            An ndarray is returned with one :class:`matplotlib.axes.Axes`
+            per column when ``subplots=True``.
+
+        See Also
+        --------
+        matplotlib.pyplot.plot : Plot y versus x as lines and/or markers.
+
+        Examples
+        --------
+
+        .. plot::
+            :context: close-figs
+
+            >>> s = pd.Series([1, 3, 2])
+            >>> s.plot.line()  # doctest: +SKIP
+
+        .. plot::
+            :context: close-figs
+
+            The following example shows the populations for some animals
+            over the years.
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "pig": [20, 18, 489, 675, 1776],
+            ...         "horse": [4, 25, 281, 600, 1900],
+            ...     },
+            ...     index=[1990, 1997, 2003, 2009, 2014],
+            ... )
+            >>> lines = df.plot.line()
+
+        .. plot::
+            :context: close-figs
+
+            An example with subplots, so an array of axes is returned.
+
+            >>> axes = df.plot.line(subplots=True)
+            >>> type(axes)
+            <class 'numpy.ndarray'>
+
+        .. plot::
+            :context: close-figs
+
+            Let's repeat the same example, but specifying colors for
+            each column (in this case, for each animal).
+
+            >>> axes = df.plot.line(
+            ...     subplots=True, color={"pig": "pink", "horse": "#742802"}
+            ... )
+
+        .. plot::
+            :context: close-figs
+
+            The following example shows the relationship between both
+            populations.
+
+            >>> lines = df.plot.line(x="pig", y="horse")
+        """
+        if color is not None:
+            kwargs["color"] = color
+        return self(kind="line", x=x, y=y, **kwargs)
+
+    def bar(
+        self,
+        x: Hashable | None = None,
+        y: Hashable | None = None,
+        color: str | Sequence[str] | dict | None = None,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Vertical bar plot.
+
+        A bar plot is a plot that presents categorical data with
+        rectangular bars with lengths proportional to the values that they
+        represent. A bar plot shows comparisons among discrete categories. One
+        axis of the plot shows the specific categories being compared, and the
+        other axis represents a measured value.
+
+        Parameters
+        ----------
+        x : label or position, optional
+            Allows plotting of one column versus another. If not specified,
+            the index of the DataFrame is used.
+        y : label or position, optional
+            Allows plotting of one column versus another. If not specified,
+            all numerical columns are used.
+        color : str, array-like, or dict, optional
+            The color for each of the DataFrame's columns. Possible values are:
+
+            - A single color string referred to by name, RGB or RGBA code,
+              for instance 'red' or '#a98d19'.
+
+            - A sequence of color strings referred to by name, RGB or RGBA
+              code, which will be used for each column recursively. For
+              instance ['green','yellow'] each column's bar will be filled in
+              green or yellow, alternatively. If there is only a single column to
+              be plotted, then only the first color from the color list will be
+              used.
+
+            - A dict of the form {column name : color}, so that each column will be
+              colored accordingly. For example, if your columns are called `a` and
+              `b`, then passing {'a': 'green', 'b': 'red'} will color bars for
+              column `a` in green and bars for column `b` in red.
+
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes or np.ndarray of them
+            An ndarray is returned with one :class:`matplotlib.axes.Axes`
+            per column when ``subplots=True``.
+
+        See Also
+        --------
+        DataFrame.plot.barh : Horizontal bar plot.
+        DataFrame.plot : Make plots of a DataFrame.
+        matplotlib.pyplot.bar : Make a bar plot with matplotlib.
+
+        Examples
+        --------
+        Basic plot.
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
+            >>> ax = df.plot.bar(x="lab", y="val", rot=0)
+
+        Plot a whole dataframe to a bar plot. Each column is assigned a
+        distinct color, and each row is nested in a group along the
+        horizontal axis.
+
+        .. plot::
+            :context: close-figs
+
+            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+            >>> index = [
+            ...     "snail",
+            ...     "pig",
+            ...     "elephant",
+            ...     "rabbit",
+            ...     "giraffe",
+            ...     "coyote",
+            ...     "horse",
+            ... ]
+            >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index)
+            >>> ax = df.plot.bar(rot=0)
+
+        Plot stacked bar charts for the DataFrame
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.bar(stacked=True)
+
+        Instead of nesting, the figure can be split by column with
+        ``subplots=True``. In this case, a :class:`numpy.ndarray` of
+        :class:`matplotlib.axes.Axes` are returned.
+
+        .. plot::
+            :context: close-figs
+
+            >>> axes = df.plot.bar(rot=0, subplots=True)
+            >>> axes[1].legend(loc=2)  # doctest: +SKIP
+
+        If you don't like the default colours, you can specify how you'd
+        like each column to be colored.
+
+        .. plot::
+            :context: close-figs
+
+            >>> axes = df.plot.bar(
+            ...     rot=0,
+            ...     subplots=True,
+            ...     color={"speed": "red", "lifespan": "green"},
+            ... )
+            >>> axes[1].legend(loc=2)  # doctest: +SKIP
+
+        Plot a single column.
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.bar(y="speed", rot=0)
+
+        Plot only selected categories for the DataFrame.
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.bar(x="lifespan", rot=0)
+        """
+        if color is not None:
+            kwargs["color"] = color
+        return self(kind="bar", x=x, y=y, **kwargs)
+
+    def barh(
+        self,
+        x: Hashable | None = None,
+        y: Hashable | None = None,
+        color: str | Sequence[str] | dict | None = None,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Make a horizontal bar plot.
+
+        A horizontal bar plot is a plot that presents quantitative data with
+        rectangular bars with lengths proportional to the values that they
+        represent. A bar plot shows comparisons among discrete categories. One
+        axis of the plot shows the specific categories being compared, and the
+        other axis represents a measured value.
+
+        Parameters
+        ----------
+        x : label or position, optional
+            Allows plotting of one column versus another. If not specified,
+            the index of the DataFrame is used.
+        y : label or position, optional
+            Allows plotting of one column versus another. If not specified,
+            all numerical columns are used.
+        color : str, array-like, or dict, optional
+            The color for each of the DataFrame's columns. Possible values are:
+
+            - A single color string referred to by name, RGB or RGBA code,
+              for instance 'red' or '#a98d19'.
+
+            - A sequence of color strings referred to by name, RGB or RGBA
+              code, which will be used for each column recursively. For
+              instance ['green','yellow'] each column's bar will be filled in
+              green or yellow, alternatively. If there is only a single column to
+              be plotted, then only the first color from the color list will be
+              used.
+
+            - A dict of the form {column name : color}, so that each column will be
+              colored accordingly. For example, if your columns are called `a` and
+              `b`, then passing {'a': 'green', 'b': 'red'} will color bars for
+              column `a` in green and bars for column `b` in red.
+
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes or np.ndarray of them
+            An ndarray is returned with one :class:`matplotlib.axes.Axes`
+            per column when ``subplots=True``.
+
+        See Also
+        --------
+        DataFrame.plot.bar : Vertical bar plot.
+        DataFrame.plot : Make plots of DataFrame using matplotlib.
+        matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.
+
+        Examples
+        --------
+        Basic example
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame({"lab": ["A", "B", "C"], "val": [10, 30, 20]})
+            >>> ax = df.plot.barh(x="lab", y="val")
+
+        Plot a whole DataFrame to a horizontal bar plot
+
+        .. plot::
+            :context: close-figs
+
+            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+            >>> index = [
+            ...     "snail",
+            ...     "pig",
+            ...     "elephant",
+            ...     "rabbit",
+            ...     "giraffe",
+            ...     "coyote",
+            ...     "horse",
+            ... ]
+            >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index)
+            >>> ax = df.plot.barh()
+
+        Plot stacked barh charts for the DataFrame
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.barh(stacked=True)
+
+        We can specify colors for each column
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.barh(color={"speed": "red", "lifespan": "green"})
+
+        Plot a column of the DataFrame to a horizontal bar plot
+
+        .. plot::
+            :context: close-figs
+
+            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+            >>> index = [
+            ...     "snail",
+            ...     "pig",
+            ...     "elephant",
+            ...     "rabbit",
+            ...     "giraffe",
+            ...     "coyote",
+            ...     "horse",
+            ... ]
+            >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index)
+            >>> ax = df.plot.barh(y="speed")
+
+        Plot DataFrame versus the desired column
+
+        .. plot::
+            :context: close-figs
+
+            >>> speed = [0.1, 17.5, 40, 48, 52, 69, 88]
+            >>> lifespan = [2, 8, 70, 1.5, 25, 12, 28]
+            >>> index = [
+            ...     "snail",
+            ...     "pig",
+            ...     "elephant",
+            ...     "rabbit",
+            ...     "giraffe",
+            ...     "coyote",
+            ...     "horse",
+            ... ]
+            >>> df = pd.DataFrame({"speed": speed, "lifespan": lifespan}, index=index)
+            >>> ax = df.plot.barh(x="lifespan")
+        """
+        if color is not None:
+            kwargs["color"] = color
+        return self(kind="barh", x=x, y=y, **kwargs)
+
+    def box(self, by: IndexLabel | None = None, **kwargs) -> PlotAccessor:
+        r"""
+        Make a box plot of the DataFrame columns.
+
+        A box plot is a method for graphically depicting groups of numerical
+        data through their quartiles.
+        The box extends from the Q1 to Q3 quartile values of the data,
+        with a line at the median (Q2). The whiskers extend from the edges
+        of box to show the range of the data. The position of the whiskers
+        is set by default to 1.5*IQR (IQR = Q3 - Q1) from the edges of the
+        box. Outlier points are those past the end of the whiskers.
+
+        For further details see Wikipedia's
+        entry for `boxplot <https://en.wikipedia.org/wiki/Box_plot>`__.
+
+        A consideration when using this chart is that the box and the whiskers
+        can overlap, which is very common when plotting small sets of data.
+
+        Parameters
+        ----------
+        by : str or sequence
+            Column in the DataFrame to group by.
+
+        **kwargs
+            Additional keywords are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+            The matplotlib axes containing the box plot.
+
+        See Also
+        --------
+        DataFrame.boxplot: Another method to draw a box plot.
+        Series.plot.box: Draw a box plot from a Series object.
+        matplotlib.pyplot.boxplot: Draw a box plot in matplotlib.
+
+        Examples
+        --------
+        Draw a box plot from a DataFrame with four columns of randomly
+        generated data.
+
+        .. plot::
+            :context: close-figs
+
+            >>> data = np.random.randn(25, 4)
+            >>> df = pd.DataFrame(data, columns=list("ABCD"))
+            >>> ax = df.plot.box()
+
+        You can also generate groupings if you specify the `by` parameter (which
+        can take a column name, or a list or tuple of column names):
+
+        .. plot::
+            :context: close-figs
+
+            >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
+            >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
+            >>> ax = df.plot.box(column="age", by="gender", figsize=(10, 8))
+        """
+        return self(kind="box", by=by, **kwargs)
+
+    def hist(
+        self, by: IndexLabel | None = None, bins: int = 10, **kwargs
+    ) -> PlotAccessor:
+        """
+        Draw one histogram of the DataFrame's columns.
+
+        A histogram is a representation of the distribution of data.
+        This function groups the values of all given Series in the DataFrame
+        into bins and draws all bins in one :class:`matplotlib.axes.Axes`.
+        This is useful when the DataFrame's Series are in a similar scale.
+
+        Parameters
+        ----------
+        by : str or sequence, optional
+            Column in the DataFrame to group by.
+        bins : int, default 10
+            Number of histogram bins to be used.
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        :class:`matplotlib.axes.Axes`
+            Return a histogram plot.
+
+        See Also
+        --------
+        DataFrame.hist : Draw histograms per DataFrame's Series.
+        Series.hist : Draw a histogram with Series' data.
+
+        Examples
+        --------
+        When we roll a die 6000 times, we expect to get each value around 1000
+        times. But when we roll two dice and sum the result, the distribution
+        is going to be quite different. A histogram illustrates those
+        distributions.
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=["one"])
+            >>> df["two"] = df["one"] + np.random.randint(1, 7, 6000)
+            >>> ax = df.plot.hist(bins=12, alpha=0.5)
+
+        A grouped histogram can be generated by providing the parameter `by` (which
+        can be a column name, or a list of column names):
+
+        .. plot::
+            :context: close-figs
+
+            >>> age_list = [8, 10, 12, 14, 72, 74, 76, 78, 20, 25, 30, 35, 60, 85]
+            >>> df = pd.DataFrame({"gender": list("MMMMMMMMFFFFFF"), "age": age_list})
+            >>> ax = df.plot.hist(column=["age"], by="gender", figsize=(10, 8))
+        """
+        return self(kind="hist", by=by, bins=bins, **kwargs)
+
+    def kde(
+        self,
+        bw_method: Literal["scott", "silverman"] | float | Callable | None = None,
+        ind: np.ndarray | int | None = None,
+        weights: np.ndarray | None = None,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Generate Kernel Density Estimate plot using Gaussian kernels.
+
+        In statistics, `kernel density estimation`_ (KDE) is a non-parametric
+        way to estimate the probability density function (PDF) of a random
+        variable. This function uses Gaussian kernels and includes automatic
+        bandwidth determination.
+
+        .. _kernel density estimation:
+            https://en.wikipedia.org/wiki/Kernel_density_estimation
+
+        Parameters
+        ----------
+        bw_method : str, scalar or callable, optional
+            The method used to calculate the estimator bandwidth. This can be
+            'scott', 'silverman', a scalar constant or a callable.
+            If None (default), 'scott' is used.
+            See :class:`scipy.stats.gaussian_kde` for more information.
+        ind : NumPy array or int, optional
+            Evaluation points for the estimated PDF. If None (default),
+            1000 equally spaced points are used. If `ind` is a NumPy array, the
+            KDE is evaluated at the points passed. If `ind` is an integer,
+            `ind` number of equally spaced points are used.
+        weights : NumPy array, optional
+            Weights of datapoints. This must be the same shape as datapoints.
+            If None (default), the samples are assumed to be equally weighted.
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes or numpy.ndarray of them
+            The matplotlib axes containing the KDE plot.
+
+        See Also
+        --------
+        scipy.stats.gaussian_kde : Representation of a kernel-density
+            estimate using Gaussian kernels. This is the function used
+            internally to estimate the PDF.
+
+        Examples
+        --------
+        Given a Series of points randomly sampled from an unknown
+        distribution, estimate its PDF using KDE with automatic
+        bandwidth determination and plot the results, evaluating them at
+        1000 equally spaced points (default):
+
+        .. plot::
+            :context: close-figs
+
+            >>> s = pd.Series([1, 2, 2.5, 3, 3.5, 4, 5])
+            >>> ax = s.plot.kde()
+
+        A scalar bandwidth can be specified. Using a small bandwidth value can
+        lead to over-fitting, while using a large bandwidth value may result
+        in under-fitting:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = s.plot.kde(bw_method=0.3)
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = s.plot.kde(bw_method=3)
+
+        Finally, the `ind` parameter determines the evaluation points for the
+        plot of the estimated PDF:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = s.plot.kde(ind=[1, 2, 3, 4, 5])
+
+        For DataFrame, it works in the same way:
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "x": [1, 2, 2.5, 3, 3.5, 4, 5],
+            ...         "y": [4, 4, 4.5, 5, 5.5, 6, 6],
+            ...     }
+            ... )
+            >>> ax = df.plot.kde()
+
+        A scalar bandwidth can be specified. Using a small bandwidth value can
+        lead to over-fitting, while using a large bandwidth value may result
+        in under-fitting:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.kde(bw_method=0.3)
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.kde(bw_method=3)
+
+        Finally, the `ind` parameter determines the evaluation points for the
+        plot of the estimated PDF:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.kde(ind=[1, 2, 3, 4, 5, 6])
+        """
+        return self(kind="kde", bw_method=bw_method, ind=ind, weights=weights, **kwargs)
+
+    density = kde
+
+    def area(
+        self,
+        x: Hashable | None = None,
+        y: Hashable | None = None,
+        stacked: bool = True,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Draw a stacked area plot.
+
+        An area plot displays quantitative data visually.
+        This function wraps the matplotlib area function.
+
+        Parameters
+        ----------
+        x : label or position, optional
+            Coordinates for the X axis. By default uses the index.
+        y : label or position, optional
+            Column to plot. By default uses all columns.
+        stacked : bool, default True
+            Area plots are stacked by default. Set to False to create a
+            unstacked plot.
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes or numpy.ndarray
+            Area plot, or array of area plots if subplots is True.
+
+        See Also
+        --------
+        DataFrame.plot : Make plots of DataFrame using matplotlib.
+
+        Examples
+        --------
+        Draw an area plot based on basic business metrics:
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "sales": [3, 2, 3, 9, 10, 6],
+            ...         "signups": [5, 5, 6, 12, 14, 13],
+            ...         "visits": [20, 42, 28, 62, 81, 50],
+            ...     },
+            ...     index=pd.date_range(
+            ...         start="2018/01/01", end="2018/07/01", freq="ME"
+            ...     ),
+            ... )
+            >>> ax = df.plot.area()
+
+        Area plots are stacked by default. To produce an unstacked plot,
+        pass ``stacked=False``:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.area(stacked=False)
+
+        Draw an area plot for a single column:
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax = df.plot.area(y="sales")
+
+        Draw with a different `x`:
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "sales": [3, 2, 3],
+            ...         "visits": [20, 42, 28],
+            ...         "day": [1, 2, 3],
+            ...     }
+            ... )
+            >>> ax = df.plot.area(x="day")
+        """
+        return self(kind="area", x=x, y=y, stacked=stacked, **kwargs)
+
+    def pie(self, y: IndexLabel | None = None, **kwargs) -> PlotAccessor:
+        """
+        Generate a pie plot.
+
+        A pie plot is a proportional representation of the numerical data in a
+        column. This function wraps :meth:`matplotlib.pyplot.pie` for the
+        specified column. If no column reference is passed and
+        ``subplots=True`` a pie plot is drawn for each numerical column
+        independently.
+
+        Parameters
+        ----------
+        y : int or label, optional
+            Label or position of the column to plot.
+            If not provided, ``subplots=True`` argument must be passed.
+        **kwargs
+            Keyword arguments to pass on to :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.axes.Axes or np.ndarray of them
+            A NumPy array is returned when `subplots` is True.
+
+        See Also
+        --------
+        Series.plot.pie : Generate a pie plot for a Series.
+        DataFrame.plot : Make plots of a DataFrame.
+
+        Examples
+        --------
+        In the example below we have a DataFrame with the information about
+        planet's mass and radius. We pass the 'mass' column to the
+        pie function to get a pie plot.
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame(
+            ...     {"mass": [0.330, 4.87, 5.97], "radius": [2439.7, 6051.8, 6378.1]},
+            ...     index=["Mercury", "Venus", "Earth"],
+            ... )
+            >>> plot = df.plot.pie(y="mass", figsize=(5, 5))
+
+        .. plot::
+            :context: close-figs
+
+            >>> plot = df.plot.pie(subplots=True, figsize=(11, 6))
+        """
+        if y is not None:
+            kwargs["y"] = y
+        if (
+            isinstance(self._parent, ABCDataFrame)
+            and kwargs.get("y", None) is None
+            and not kwargs.get("subplots", False)
+        ):
+            raise ValueError("pie requires either y column or 'subplots=True'")
+        return self(kind="pie", **kwargs)
+
+    def scatter(
+        self,
+        x: Hashable,
+        y: Hashable,
+        s: Hashable | Sequence[Hashable] | None = None,
+        c: Hashable | Sequence[Hashable] | None = None,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Create a scatter plot with varying marker point size and color.
+
+        The coordinates of each point are defined by two dataframe columns and
+        filled circles are used to represent each point. This kind of plot is
+        useful to see complex correlations between two variables. Points could
+        be for instance natural 2D coordinates like longitude and latitude in
+        a map or, in general, any pair of metrics that can be plotted against
+        each other.
+
+        Parameters
+        ----------
+        x : int or str
+            The column name or column position to be used as horizontal
+            coordinates for each point.
+        y : int or str
+            The column name or column position to be used as vertical
+            coordinates for each point.
+        s : str, scalar or array-like, optional
+            The size of each point. Possible values are:
+
+            - A string with the name of the column to be used for marker's size.
+
+            - A single scalar so all points have the same size.
+
+            - A sequence of scalars, which will be used for each point's size
+              recursively. For instance, when passing [2,14] all points size
+              will be either 2 or 14, alternatively.
+
+        c : str, int or array-like, optional
+            The color of each point. Possible values are:
+
+            - A single color string referred to by name, RGB or RGBA code,
+              for instance 'red' or '#a98d19'.
+
+            - A sequence of color strings referred to by name, RGB or RGBA
+              code, which will be used for each point's color recursively. For
+              instance ['green','yellow'] all points will be filled in green or
+              yellow, alternatively.
+
+            - A column name or position whose values will be used to color the
+              marker points according to a colormap.
+
+        **kwargs
+            Keyword arguments to pass on to :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        :class:`matplotlib.axes.Axes` or numpy.ndarray of them
+            The matplotlib axes containing the scatter plot.
+
+        See Also
+        --------
+        matplotlib.pyplot.scatter : Scatter plot using multiple input data
+            formats.
+
+        Examples
+        --------
+        Let's see how to draw a scatter plot using coordinates from the values
+        in a DataFrame's columns.
+
+        .. plot::
+            :context: close-figs
+
+            >>> df = pd.DataFrame(
+            ...     [
+            ...         [5.1, 3.5, 0],
+            ...         [4.9, 3.0, 0],
+            ...         [7.0, 3.2, 1],
+            ...         [6.4, 3.2, 1],
+            ...         [5.9, 3.0, 2],
+            ...     ],
+            ...     columns=["length", "width", "species"],
+            ... )
+            >>> ax1 = df.plot.scatter(x="length", y="width", c="DarkBlue")
+
+        And now with the color determined by a column as well.
+
+        .. plot::
+            :context: close-figs
+
+            >>> ax2 = df.plot.scatter(
+            ...     x="length", y="width", c="species", colormap="viridis"
+            ... )
+        """
+        return self(kind="scatter", x=x, y=y, s=s, c=c, **kwargs)
+
+    def hexbin(
+        self,
+        x: Hashable,
+        y: Hashable,
+        C: Hashable | None = None,
+        reduce_C_function: Callable | None = None,
+        gridsize: int | tuple[int, int] | None = None,
+        **kwargs,
+    ) -> PlotAccessor:
+        """
+        Generate a hexagonal binning plot.
+
+        Generate a hexagonal binning plot of `x` versus `y`. If `C` is `None`
+        (the default), this is a histogram of the number of occurrences
+        of the observations at ``(x[i], y[i])``.
+
+        If `C` is specified, specifies values at given coordinates
+        ``(x[i], y[i])``. These values are accumulated for each hexagonal
+        bin and then reduced according to `reduce_C_function`,
+        having as default the NumPy's mean function (:meth:`numpy.mean`).
+        (If `C` is specified, it must also be a 1-D sequence
+        of the same length as `x` and `y`, or a column label.)
+
+        Parameters
+        ----------
+        x : int or str
+            The column label or position for x points.
+        y : int or str
+            The column label or position for y points.
+        C : int or str, optional
+            The column label or position for the value of `(x, y)` point.
+        reduce_C_function : callable, default `np.mean`
+            Function of one argument that reduces all the values in a bin to
+            a single number (e.g. `np.mean`, `np.max`, `np.sum`, `np.std`).
+        gridsize : int or tuple of (int, int), default 100
+            The number of hexagons in the x-direction.
+            The corresponding number of hexagons in the y-direction is
+            chosen in a way that the hexagons are approximately regular.
+            Alternatively, gridsize can be a tuple with two elements
+            specifying the number of hexagons in the x-direction and the
+            y-direction.
+        **kwargs
+            Additional keyword arguments are documented in
+            :meth:`DataFrame.plot`.
+
+        Returns
+        -------
+        matplotlib.Axes
+            The matplotlib ``Axes`` on which the hexbin is plotted.
+
+        See Also
+        --------
+        DataFrame.plot : Make plots of a DataFrame.
+        matplotlib.pyplot.hexbin : Hexagonal binning plot using matplotlib,
+            the matplotlib function that is used under the hood.
+
+        Examples
+        --------
+        The following examples are generated with random data from
+        a normal distribution.
+
+        .. plot::
+            :context: close-figs
+
+            >>> n = 10000
+            >>> df = pd.DataFrame({"x": np.random.randn(n), "y": np.random.randn(n)})
+            >>> ax = df.plot.hexbin(x="x", y="y", gridsize=20)
+
+        The next example uses `C` and `np.sum` as `reduce_C_function`.
+        Note that `'observations'` values ranges from 1 to 5 but the result
+        plot shows values up to more than 25. This is because of the
+        `reduce_C_function`.
+
+        .. plot::
+            :context: close-figs
+
+            >>> n = 500
+            >>> df = pd.DataFrame(
+            ...     {
+            ...         "coord_x": np.random.uniform(-3, 3, size=n),
+            ...         "coord_y": np.random.uniform(30, 50, size=n),
+            ...         "observations": np.random.randint(1, 5, size=n),
+            ...     }
+            ... )
+            >>> ax = df.plot.hexbin(
+            ...     x="coord_x",
+            ...     y="coord_y",
+            ...     C="observations",
+            ...     reduce_C_function=np.sum,
+            ...     gridsize=10,
+            ...     cmap="viridis",
+            ... )
+        """
+        if reduce_C_function is not None:
+            kwargs["reduce_C_function"] = reduce_C_function
+        if gridsize is not None:
+            kwargs["gridsize"] = gridsize
+
+        return self(kind="hexbin", x=x, y=y, C=C, **kwargs)
+
+
+_backends: dict[str, types.ModuleType] = {}
+
+
+def _load_backend(backend: str) -> types.ModuleType:
+    """
+    Load a pandas plotting backend.
+
+    Parameters
+    ----------
+    backend : str
+        The identifier for the backend. Either an entrypoint item registered
+        with importlib.metadata, "matplotlib", or a module name.
+
+    Returns
+    -------
+    types.ModuleType
+        The imported backend.
+    """
+    from importlib.metadata import entry_points
+
+    if backend == "matplotlib":
+        # Because matplotlib is an optional dependency and first-party backend,
+        # we need to attempt an import here to raise an ImportError if needed.
+        try:
+            module = importlib.import_module("pandas.plotting._matplotlib")
+        except ImportError:
+            raise ImportError(
+                "matplotlib is required for plotting when the "
+                'default backend "matplotlib" is selected.'
+            ) from None
+        return module
+
+    found_backend = False
+
+    eps = entry_points()
+    key = "pandas_plotting_backends"
+    # entry_points lost dict API ~ PY 3.10
+    # https://github.com/python/importlib_metadata/issues/298
+    if hasattr(eps, "select"):
+        entry = eps.select(group=key)
+    else:
+        # Argument 2 to "get" of "dict" has incompatible type "Tuple[]";
+        # expected "EntryPoints"  [arg-type]
+        entry = eps.get(key, ())  # type: ignore[arg-type]
+    for entry_point in entry:
+        found_backend = entry_point.name == backend
+        if found_backend:
+            module = entry_point.load()
+            break
+
+    if not found_backend:
+        # Fall back to unregistered, module name approach.
+        try:
+            module = importlib.import_module(backend)
+            found_backend = True
+        except ImportError:
+            # We re-raise later on.
+            pass
+
+    if found_backend:
+        if hasattr(module, "plot"):
+            # Validate that the interface is implemented when the option is set,
+            # rather than at plot time.
+            return module
+
+    raise ValueError(
+        f"Could not find plotting backend '{backend}'. Ensure that you've "
+        f"installed the package providing the '{backend}' entrypoint, or that "
+        "the package has a top-level `.plot` method."
+    )
+
+
+def _get_plot_backend(backend: str | None = None):
+    """
+    Return the plotting backend to use (e.g. `pandas.plotting._matplotlib`).
+
+    The plotting system of pandas uses matplotlib by default, but the idea here
+    is that it can also work with other third-party backends. This function
+    returns the module which provides a top-level `.plot` method that will
+    actually do the plotting. The backend is specified from a string, which
+    either comes from the keyword argument `backend`, or, if not specified, from
+    the option `pandas.options.plotting.backend`. All the rest of the code in
+    this file uses the backend specified there for the plotting.
+
+    The backend is imported lazily, as matplotlib is a soft dependency, and
+    pandas can be used without it being installed.
+
+    Notes
+    -----
+    Modifies `_backends` with imported backend as a side effect.
+    """
+    backend_str: str = backend or get_option("plotting.backend")
+
+    if backend_str in _backends:
+        return _backends[backend_str]
+
+    module = _load_backend(backend_str)
+    _backends[backend_str] = module
+    return module
diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff28868aa003326355f0e3e4b5b7914edb63121c
--- /dev/null
+++ b/pandas/plotting/_matplotlib/__init__.py
@@ -0,0 +1,93 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pandas.plotting._matplotlib.boxplot import (
+    BoxPlot,
+    boxplot,
+    boxplot_frame,
+    boxplot_frame_groupby,
+)
+from pandas.plotting._matplotlib.converter import (
+    deregister,
+    register,
+)
+from pandas.plotting._matplotlib.core import (
+    AreaPlot,
+    BarhPlot,
+    BarPlot,
+    HexBinPlot,
+    LinePlot,
+    PiePlot,
+    ScatterPlot,
+)
+from pandas.plotting._matplotlib.hist import (
+    HistPlot,
+    KdePlot,
+    hist_frame,
+    hist_series,
+)
+from pandas.plotting._matplotlib.misc import (
+    andrews_curves,
+    autocorrelation_plot,
+    bootstrap_plot,
+    lag_plot,
+    parallel_coordinates,
+    radviz,
+    scatter_matrix,
+)
+from pandas.plotting._matplotlib.tools import table
+
+if TYPE_CHECKING:
+    from pandas.plotting._matplotlib.core import MPLPlot
+
+PLOT_CLASSES: dict[str, type[MPLPlot]] = {
+    "line": LinePlot,
+    "bar": BarPlot,
+    "barh": BarhPlot,
+    "box": BoxPlot,
+    "hist": HistPlot,
+    "kde": KdePlot,
+    "area": AreaPlot,
+    "pie": PiePlot,
+    "scatter": ScatterPlot,
+    "hexbin": HexBinPlot,
+}
+
+
+def plot(data, kind, **kwargs):
+    # Importing pyplot at the top of the file (before the converters are
+    # registered) causes problems in matplotlib 2 (converters seem to not
+    # work)
+    import matplotlib.pyplot as plt
+
+    if kwargs.pop("reuse_plot", False):
+        ax = kwargs.get("ax")
+        if ax is None and len(plt.get_fignums()) > 0:
+            with plt.rc_context():
+                ax = plt.gca()
+            kwargs["ax"] = getattr(ax, "left_ax", ax)
+    plot_obj = PLOT_CLASSES[kind](data, **kwargs)
+    plot_obj.generate()
+    plt.draw_if_interactive()
+    return plot_obj.result
+
+
+__all__ = [
+    "andrews_curves",
+    "autocorrelation_plot",
+    "bootstrap_plot",
+    "boxplot",
+    "boxplot_frame",
+    "boxplot_frame_groupby",
+    "deregister",
+    "hist_frame",
+    "hist_series",
+    "lag_plot",
+    "parallel_coordinates",
+    "plot",
+    "radviz",
+    "register",
+    "scatter_matrix",
+    "table",
+]
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bb185c51478f9c892631934f0b818effc5a5c96
--- /dev/null
+++ b/pandas/plotting/_matplotlib/boxplot.py
@@ -0,0 +1,563 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    NamedTuple,
+)
+import warnings
+
+import matplotlib as mpl
+import numpy as np
+
+from pandas._libs import lib
+from pandas.util._decorators import cache_readonly
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.common import is_dict_like
+from pandas.core.dtypes.generic import ABCSeries
+from pandas.core.dtypes.missing import remove_na_arraylike
+
+import pandas as pd
+import pandas.core.common as com
+from pandas.util.version import Version
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._matplotlib.core import (
+    LinePlot,
+    MPLPlot,
+)
+from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
+from pandas.plotting._matplotlib.style import get_standard_colors
+from pandas.plotting._matplotlib.tools import (
+    create_subplots,
+    flatten_axes,
+    maybe_adjust_figure,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Collection
+
+    from matplotlib.axes import Axes
+    from matplotlib.figure import Figure
+    from matplotlib.lines import Line2D
+
+    from pandas._typing import MatplotlibColor
+
+
+def _set_ticklabels(ax: Axes, labels: list[str], is_vertical: bool, **kwargs) -> None:
+    """Set the tick labels of a given axis.
+
+    Due to https://github.com/matplotlib/matplotlib/pull/17266, we need to handle the
+    case of repeated ticks (due to `FixedLocator`) and thus we duplicate the number of
+    labels.
+    """
+    ticks = ax.get_xticks() if is_vertical else ax.get_yticks()
+    if len(ticks) != len(labels):
+        i, remainder = divmod(len(ticks), len(labels))
+        if Version(mpl.__version__) < Version("3.10"):
+            assert remainder == 0, remainder
+        labels *= i
+    if is_vertical:
+        ax.set_xticklabels(labels, **kwargs)
+    else:
+        ax.set_yticklabels(labels, **kwargs)
+
+
+class BoxPlot(LinePlot):
+    @property
+    def _kind(self) -> Literal["box"]:
+        return "box"
+
+    _layout_type = "horizontal"
+
+    _valid_return_types = (None, "axes", "dict", "both")
+
+    class BP(NamedTuple):
+        # namedtuple to hold results
+        ax: Axes
+        lines: dict[str, list[Line2D]]
+
+    def __init__(self, data, return_type: str = "axes", **kwargs) -> None:
+        if return_type not in self._valid_return_types:
+            raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}")
+
+        self.return_type = return_type
+        # Do not call LinePlot.__init__ which may fill nan
+        MPLPlot.__init__(self, data, **kwargs)
+
+        if self.subplots:
+            # Disable label ax sharing. Otherwise, all subplots shows last
+            # column label
+            if self.orientation == "vertical":
+                self.sharex = False
+            else:
+                self.sharey = False
+
+    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
+    @classmethod
+    def _plot(  # type: ignore[override]
+        cls, ax: Axes, y: np.ndarray, column_num=None, return_type: str = "axes", **kwds
+    ):
+        ys: np.ndarray | list[np.ndarray]
+        if y.ndim == 2:
+            ys = [remove_na_arraylike(v) for v in y]
+            # Boxplot fails with empty arrays, so need to add a NaN
+            #   if any cols are empty
+            # GH 8181
+            ys = [v if v.size > 0 else np.array([np.nan]) for v in ys]
+        else:
+            ys = remove_na_arraylike(y)
+        bp = ax.boxplot(ys, **kwds)
+
+        if return_type == "dict":
+            return bp, bp
+        elif return_type == "both":
+            return cls.BP(ax=ax, lines=bp), bp
+        else:
+            return ax, bp
+
+    def _validate_color_args(self, color, colormap):
+        if color is lib.no_default:
+            return None
+
+        if colormap is not None:
+            warnings.warn(
+                "'color' and 'colormap' cannot be used simultaneously. Using 'color'",
+                stacklevel=find_stack_level(),
+            )
+
+        if isinstance(color, dict):
+            valid_keys = ["boxes", "whiskers", "medians", "caps"]
+            for key in color:
+                if key not in valid_keys:
+                    raise ValueError(
+                        f"color dict contains invalid key '{key}'. "
+                        f"The key must be either {valid_keys}"
+                    )
+        return color
+
+    @cache_readonly
+    def _color_attrs(self):
+        # get standard colors for default
+        # use 2 colors by default, for box/whisker and median
+        # flier colors isn't needed here
+        # because it can be specified by ``sym`` kw
+        return get_standard_colors(num_colors=3, colormap=self.colormap, color=None)
+
+    @cache_readonly
+    def _boxes_c(self):
+        return self._color_attrs[0]
+
+    @cache_readonly
+    def _whiskers_c(self):
+        return self._color_attrs[0]
+
+    @cache_readonly
+    def _medians_c(self):
+        return self._color_attrs[2]
+
+    @cache_readonly
+    def _caps_c(self):
+        return self._color_attrs[0]
+
+    def _get_colors(
+        self,
+        num_colors=None,
+        color_kwds: dict[str, MatplotlibColor]
+        | MatplotlibColor
+        | Collection[MatplotlibColor]
+        | None = "color",
+    ) -> None:
+        pass
+
+    def maybe_color_bp(self, bp) -> None:
+        if isinstance(self.color, dict):
+            boxes = self.color.get("boxes", self._boxes_c)
+            whiskers = self.color.get("whiskers", self._whiskers_c)
+            medians = self.color.get("medians", self._medians_c)
+            caps = self.color.get("caps", self._caps_c)
+        else:
+            # Other types are forwarded to matplotlib
+            # If None, use default colors
+            boxes = self.color or self._boxes_c
+            whiskers = self.color or self._whiskers_c
+            medians = self.color or self._medians_c
+            caps = self.color or self._caps_c
+
+        color_tup = (boxes, whiskers, medians, caps)
+        maybe_color_bp(bp, color_tup=color_tup, **self.kwds)
+
+    def _make_plot(self, fig: Figure) -> None:
+        if self.subplots:
+            obj_axes = []
+            obj_labels = []
+
+            # Re-create iterated data if `by` is assigned by users
+            data = (
+                create_iter_data_given_by(self.data, self._kind)
+                if self.by is not None
+                else self.data
+            )
+
+            for i, (label, y) in enumerate(self._iter_data(data=data)):
+                ax = self._get_ax(i)
+                kwds = self.kwds.copy()
+
+                # When by is applied, show title for subplots to know which group it is
+                # just like df.boxplot, and need to apply T on y to provide right input
+                if self.by is not None:
+                    y = y.T
+                    ax.set_title(pprint_thing(label))
+
+                    # When `by` is assigned, the ticklabels will become unique grouped
+                    # values, instead of label which is used as subtitle in this case.
+                    # error: "Index" has no attribute "levels"; maybe "nlevels"?
+                    levels = self.data.columns.levels  # type: ignore[attr-defined]
+                    ticklabels = [pprint_thing(col) for col in levels[0]]
+                else:
+                    ticklabels = [pprint_thing(label)]
+
+                ret, bp = self._plot(
+                    ax, y, column_num=i, return_type=self.return_type, **kwds
+                )
+                self.maybe_color_bp(bp)
+                obj_axes.append(ret)
+                obj_labels.append(label)
+                _set_ticklabels(
+                    ax=ax, labels=ticklabels, is_vertical=self.orientation == "vertical"
+                )
+            self._return_obj = pd.Series(obj_axes, index=obj_labels, dtype=object)
+        else:
+            y = self.data.values.T
+            ax = self._get_ax(0)
+            kwds = self.kwds.copy()
+
+            ret, bp = self._plot(
+                ax, y, column_num=0, return_type=self.return_type, **kwds
+            )
+            self.maybe_color_bp(bp)
+            self._return_obj = ret
+
+            labels = [pprint_thing(left) for left in self.data.columns]
+            if not self.use_index:
+                labels = [pprint_thing(key) for key in range(len(labels))]
+            _set_ticklabels(
+                ax=ax, labels=labels, is_vertical=self.orientation == "vertical"
+            )
+
+    def _make_legend(self) -> None:
+        pass
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        # GH 45465: make sure that the boxplot doesn't ignore xlabel/ylabel
+        if self.xlabel:
+            ax.set_xlabel(pprint_thing(self.xlabel))
+        if self.ylabel:
+            ax.set_ylabel(pprint_thing(self.ylabel))
+
+    @property
+    def orientation(self) -> Literal["horizontal", "vertical"]:
+        if self.kwds.get("vert", True):
+            return "vertical"
+        else:
+            return "horizontal"
+
+    @property
+    def result(self):
+        if self.return_type is None:
+            return super().result
+        else:
+            return self._return_obj
+
+
+def maybe_color_bp(bp, color_tup, **kwds) -> None:
+    # GH#30346, when users specifying those arguments explicitly, our defaults
+    # for these four kwargs should be overridden; if not, use Pandas settings
+    if not kwds.get("boxprops"):
+        mpl.artist.setp(bp["boxes"], color=color_tup[0], alpha=1)
+    if not kwds.get("whiskerprops"):
+        mpl.artist.setp(bp["whiskers"], color=color_tup[1], alpha=1)
+    if not kwds.get("medianprops"):
+        mpl.artist.setp(bp["medians"], color=color_tup[2], alpha=1)
+    if not kwds.get("capprops"):
+        mpl.artist.setp(bp["caps"], color=color_tup[3], alpha=1)
+
+
+def _grouped_plot_by_column(
+    plotf,
+    data,
+    columns=None,
+    by=None,
+    numeric_only: bool = True,
+    grid: bool = False,
+    figsize: tuple[float, float] | None = None,
+    ax=None,
+    layout=None,
+    return_type=None,
+    **kwargs,
+):
+    grouped = data.groupby(by, observed=False)
+    if columns is None:
+        if not isinstance(by, (list, tuple)):
+            by = [by]
+        columns = data._get_numeric_data().columns.difference(by)
+    naxes = len(columns)
+    fig, axes = create_subplots(
+        naxes=naxes,
+        sharex=kwargs.pop("sharex", True),
+        sharey=kwargs.pop("sharey", True),
+        figsize=figsize,
+        ax=ax,
+        layout=layout,
+    )
+
+    # GH 45465: move the "by" label based on "vert"
+    xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None)
+    if kwargs.get("vert", True):
+        xlabel = xlabel or by
+    else:
+        ylabel = ylabel or by
+
+    ax_values = []
+
+    for ax, col in zip(flatten_axes(axes), columns, strict=False):
+        gp_col = grouped[col]
+        keys, values = zip(*gp_col, strict=True)
+        re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs)
+        ax.set_title(col)
+        ax_values.append(re_plotf)
+        ax.grid(grid)
+
+    result = pd.Series(ax_values, index=columns, copy=False)
+
+    # Return axes in multiplot case, maybe revisit later # 985
+    if return_type is None:
+        result = axes
+
+    byline = by[0] if len(by) == 1 else by
+    fig.suptitle(f"Boxplot grouped by {byline}")
+    maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
+
+    return result
+
+
+def boxplot(
+    data,
+    column=None,
+    by=None,
+    ax=None,
+    fontsize: int | None = None,
+    rot: int = 0,
+    grid: bool = True,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    return_type=None,
+    **kwds,
+):
+    import matplotlib.pyplot as plt
+
+    # validate return_type:
+    if return_type not in BoxPlot._valid_return_types:
+        raise ValueError("return_type must be {'axes', 'dict', 'both'}")
+
+    if isinstance(data, ABCSeries):
+        data = data.to_frame("x")
+        column = "x"
+
+    def _get_colors():
+        #  num_colors=3 is required as method maybe_color_bp takes the colors
+        #  in positions 0 and 2.
+        #  if colors not provided, use same defaults as DataFrame.plot.box
+        result_list = get_standard_colors(num_colors=3)
+        result = np.take(result_list, [0, 0, 2])
+        result = np.append(result, "k")
+
+        colors = kwds.pop("color", None)
+        if colors:
+            if is_dict_like(colors):
+                # replace colors in result array with user-specified colors
+                # taken from the colors dict parameter
+                # "boxes" value placed in position 0, "whiskers" in 1, etc.
+                valid_keys = ["boxes", "whiskers", "medians", "caps"]
+                key_to_index = dict(zip(valid_keys, range(4), strict=True))
+                for key, value in colors.items():
+                    if key in valid_keys:
+                        result[key_to_index[key]] = value
+                    else:
+                        raise ValueError(
+                            f"color dict contains invalid key '{key}'. "
+                            f"The key must be either {valid_keys}"
+                        )
+            else:
+                result.fill(colors)
+
+        return result
+
+    def plot_group(keys, values, ax: Axes, **kwds):
+        # GH 45465: xlabel/ylabel need to be popped out before plotting happens
+        xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None)
+        if xlabel:
+            ax.set_xlabel(pprint_thing(xlabel))
+        if ylabel:
+            ax.set_ylabel(pprint_thing(ylabel))
+
+        keys = [pprint_thing(x) for x in keys]
+        values = [remove_na_arraylike(v) for v in values]
+        bp = ax.boxplot(values, **kwds)
+        if fontsize is not None:
+            ax.tick_params(axis="both", labelsize=fontsize)
+
+        # GH 45465: x/y are flipped when "vert" changes
+        _set_ticklabels(
+            ax=ax, labels=keys, is_vertical=kwds.get("vert", True), rotation=rot
+        )
+        maybe_color_bp(bp, color_tup=colors, **kwds)
+
+        # Return axes in multiplot case, maybe revisit later # 985
+        if return_type == "dict":
+            return bp
+        elif return_type == "both":
+            return BoxPlot.BP(ax=ax, lines=bp)
+        else:
+            return ax
+
+    colors = _get_colors()
+    if column is None:
+        columns = None
+    elif isinstance(column, (list, tuple)):
+        columns = column
+    else:
+        columns = [column]
+
+    if by is not None:
+        # Prefer array return type for 2-D plots to match the subplot layout
+        # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580
+        result = _grouped_plot_by_column(
+            plot_group,
+            data,
+            columns=columns,
+            by=by,
+            grid=grid,
+            figsize=figsize,
+            ax=ax,
+            layout=layout,
+            return_type=return_type,
+            **kwds,
+        )
+    else:
+        if return_type is None:
+            return_type = "axes"
+        if layout is not None:
+            raise ValueError("The 'layout' keyword is not supported when 'by' is None")
+
+        if ax is None:
+            rc = {"figure.figsize": figsize} if figsize is not None else {}
+            with mpl.rc_context(rc):
+                ax = plt.gca()
+        data = data._get_numeric_data()
+        naxes = len(data.columns)
+        if naxes == 0:
+            raise ValueError(
+                "boxplot method requires numerical columns, nothing to plot."
+            )
+        if columns is None:
+            columns = data.columns
+        else:
+            data = data[columns]
+
+        result = plot_group(columns, data.values.T, ax, **kwds)
+        ax.grid(grid)
+
+    return result
+
+
+def boxplot_frame(
+    self,
+    column=None,
+    by=None,
+    ax=None,
+    fontsize: int | None = None,
+    rot: int = 0,
+    grid: bool = True,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    return_type=None,
+    **kwds,
+):
+    import matplotlib.pyplot as plt
+
+    ax = boxplot(
+        self,
+        column=column,
+        by=by,
+        ax=ax,
+        fontsize=fontsize,
+        grid=grid,
+        rot=rot,
+        figsize=figsize,
+        layout=layout,
+        return_type=return_type,
+        **kwds,
+    )
+    plt.draw_if_interactive()
+    return ax
+
+
+def boxplot_frame_groupby(
+    grouped,
+    subplots: bool = True,
+    column=None,
+    fontsize: int | None = None,
+    rot: int = 0,
+    grid: bool = True,
+    ax=None,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    sharex: bool = False,
+    sharey: bool = True,
+    **kwds,
+):
+    if subplots is True:
+        naxes = len(grouped)
+        fig, axes = create_subplots(
+            naxes=naxes,
+            squeeze=False,
+            ax=ax,
+            sharex=sharex,
+            sharey=sharey,
+            figsize=figsize,
+            layout=layout,
+        )
+        data = {}
+        for (key, group), ax in zip(grouped, flatten_axes(axes), strict=False):
+            d = group.boxplot(
+                ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds
+            )
+            ax.set_title(pprint_thing(key))
+            data[key] = d
+        ret = pd.Series(data)
+        maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
+    else:
+        keys, frames = zip(*grouped, strict=True)
+        df = pd.concat(frames, keys=keys, axis=1)
+
+        # GH 16748, DataFrameGroupby fails when subplots=False and `column` argument
+        # is assigned, and in this case, since `df` here becomes MI after groupby,
+        # so we need to couple the keys (grouped values) and column (original df
+        # column) together to search for subset to plot
+        if column is not None:
+            column = com.convert_to_list_like(column)
+            multi_key = pd.MultiIndex.from_product([keys, column])
+            column = list(multi_key.values)
+        ret = df.boxplot(
+            column=column,
+            fontsize=fontsize,
+            rot=rot,
+            grid=grid,
+            ax=ax,
+            figsize=figsize,
+            layout=layout,
+            **kwds,
+        )
+    return ret
diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..813bd984cf2972ad950c35aa3f6511d3cfd1aa0d
--- /dev/null
+++ b/pandas/plotting/_matplotlib/converter.py
@@ -0,0 +1,1130 @@
+from __future__ import annotations
+
+import contextlib
+import datetime as pydt
+from datetime import (
+    datetime,
+    tzinfo,
+)
+import functools
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    cast,
+)
+import warnings
+
+import matplotlib as mpl
+import matplotlib.dates as mdates
+import matplotlib.units as munits
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.tslibs import (
+    Timestamp,
+    to_offset,
+)
+from pandas._libs.tslibs.dtypes import (
+    FreqGroup,
+    periods_per_day,
+)
+from pandas._typing import (
+    F,
+    npt,
+)
+
+from pandas.core.dtypes.common import (
+    is_float,
+    is_float_dtype,
+    is_integer,
+    is_integer_dtype,
+    is_nested_list_like,
+)
+
+from pandas import (
+    Index,
+    Series,
+    get_option,
+)
+import pandas.core.common as com
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import (
+    Period,
+    PeriodIndex,
+    period_range,
+)
+import pandas.core.tools.datetimes as tools
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from matplotlib.axis import Axis
+
+    from pandas._libs.tslibs.offsets import BaseOffset
+    from pandas._typing import TimeUnit
+
+
+_mpl_units: dict = {}  # Cache for units overwritten by us
+
+
+def get_pairs() -> list[tuple[type, type[mdates.DateConverter]]]:
+    pairs = [
+        (Timestamp, DatetimeConverter),
+        (Period, PeriodConverter),
+        (pydt.datetime, DatetimeConverter),
+        (pydt.date, DatetimeConverter),
+        (pydt.time, TimeConverter),
+        (np.datetime64, DatetimeConverter),
+    ]
+    return pairs
+
+
+def register_pandas_matplotlib_converters(func: F) -> F:
+    """
+    Decorator applying pandas_converters.
+    """
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        with pandas_converters():
+            return func(*args, **kwargs)
+
+    return cast(F, wrapper)
+
+
+@contextlib.contextmanager
+def pandas_converters() -> Generator[None]:
+    """
+    Context manager registering pandas' converters for a plot.
+
+    See Also
+    --------
+    register_pandas_matplotlib_converters : Decorator that applies this.
+    """
+    value = get_option("plotting.matplotlib.register_converters")
+
+    if value:
+        # register for True or "auto"
+        register()
+    try:
+        yield
+    finally:
+        if value == "auto":
+            # only deregister for "auto"
+            deregister()
+
+
+def register() -> None:
+    pairs = get_pairs()
+    for type_, cls in pairs:
+        # Cache previous converter if present
+        if type_ in munits.registry and not isinstance(munits.registry[type_], cls):
+            previous = munits.registry[type_]
+            _mpl_units[type_] = previous
+        # Replace with pandas converter
+        munits.registry[type_] = cls()
+
+
+def deregister() -> None:
+    # Renamed in pandas.plotting.__init__
+    for type_, cls in get_pairs():
+        # We use type to catch our classes directly, no inheritance
+        if type(munits.registry.get(type_)) is cls:
+            munits.registry.pop(type_)
+
+    # restore the old keys
+    for unit, formatter in _mpl_units.items():
+        if type(formatter) not in {DatetimeConverter, PeriodConverter, TimeConverter}:
+            # make it idempotent by excluding ours.
+            munits.registry[unit] = formatter
+
+
+def _to_ordinalf(tm: pydt.time) -> float:
+    tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + tm.microsecond / 10**6
+    return tot_sec
+
+
+def time2num(d):
+    if isinstance(d, str):
+        parsed = Timestamp(d)
+        return _to_ordinalf(parsed.time())
+    if isinstance(d, pydt.time):
+        return _to_ordinalf(d)
+    return d
+
+
+class TimeConverter(munits.ConversionInterface):
+    @staticmethod
+    def convert(value, unit, axis):
+        valid_types = (str, pydt.time)
+        if isinstance(value, valid_types) or is_integer(value) or is_float(value):
+            return time2num(value)
+        if isinstance(value, Index):
+            return value.map(time2num)
+        if isinstance(value, (list, tuple, np.ndarray, Index)):
+            return [time2num(x) for x in value]
+        return value
+
+    @staticmethod
+    def axisinfo(unit, axis) -> munits.AxisInfo | None:
+        if unit != "time":
+            return None
+
+        majloc = mpl.ticker.AutoLocator()  # pyright: ignore[reportAttributeAccessIssue]
+        majfmt = TimeFormatter(majloc)
+        return munits.AxisInfo(majloc=majloc, majfmt=majfmt, label="time")
+
+    @staticmethod
+    def default_units(x, axis) -> str:
+        return "time"
+
+
+# time formatter
+class TimeFormatter(mpl.ticker.Formatter):  # pyright: ignore[reportAttributeAccessIssue]
+    def __init__(self, locs) -> None:
+        self.locs = locs
+
+    def __call__(self, x, pos: int | None = 0) -> str:
+        """
+        Return the time of day as a formatted string.
+
+        Parameters
+        ----------
+        x : float
+            The time of day specified as seconds since 00:00 (midnight),
+            with up to microsecond precision.
+        pos
+            Unused
+
+        Returns
+        -------
+        str
+            A string in HH:MM:SS.mmmuuu format. Microseconds,
+            milliseconds and seconds are only displayed if non-zero.
+        """
+        fmt = "%H:%M:%S.%f"
+        s = int(x)
+        msus = round((x - s) * 10**6)
+        ms = msus // 1000
+        us = msus % 1000
+        m, s = divmod(s, 60)
+        h, m = divmod(m, 60)
+        _, h = divmod(h, 24)
+        if us != 0:
+            return pydt.time(h, m, s, msus).strftime(fmt)
+        elif ms != 0:
+            return pydt.time(h, m, s, msus).strftime(fmt)[:-3]
+        elif s != 0:
+            return pydt.time(h, m, s).strftime("%H:%M:%S")
+
+        return pydt.time(h, m).strftime("%H:%M")
+
+
+# Period Conversion
+
+
+class PeriodConverter(mdates.DateConverter):
+    @staticmethod
+    def convert(values, unit, axis: Axis):
+        # Reached via e.g. `ax.set_xlim`
+
+        # In tests as of 2025-09-24, unit is always None except for 3 tests
+        #  that directly call this with unit="";
+        #  axis is always specifically a matplotlib.axis.XAxis
+
+        if not hasattr(axis, "freq"):
+            raise TypeError("Axis must have `freq` set to convert to Periods")
+        freq = to_offset(axis.freq, is_period=True)  # pyright: ignore[reportAttributeAccessIssue]
+        return PeriodConverter.convert_from_freq(values, freq)
+
+    @staticmethod
+    def convert_from_freq(values, freq: BaseOffset):
+        if is_nested_list_like(values):
+            values = [PeriodConverter._convert_1d(v, freq) for v in values]
+        else:
+            values = PeriodConverter._convert_1d(values, freq)
+        return values
+
+    @staticmethod
+    def _convert_1d(values, freq: BaseOffset):
+        valid_types = (str, datetime, Period, pydt.date, np.datetime64)
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Period with BDay freq is deprecated", category=FutureWarning
+            )
+            warnings.filterwarnings(
+                "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning
+            )
+            if (
+                isinstance(values, valid_types)
+                or is_integer(values)
+                or is_float(values)
+            ):
+                return _get_datevalue(values, freq)
+            elif isinstance(values, PeriodIndex):
+                return values.asfreq(freq).asi8
+            elif isinstance(values, Index):
+                return values.map(lambda x: _get_datevalue(x, freq))
+            elif lib.infer_dtype(values, skipna=False) == "period":
+                # https://github.com/pandas-dev/pandas/issues/24304
+                # convert ndarray[period] -> PeriodIndex
+                return PeriodIndex(values, freq=freq).asi8
+            elif isinstance(values, (list, tuple, np.ndarray)):
+                return [_get_datevalue(x, freq) for x in values]
+        return values
+
+
+def _get_datevalue(date, freq: BaseOffset):
+    if isinstance(date, Period):
+        return date.asfreq(freq).ordinal
+    elif isinstance(date, (str, datetime, pydt.date, np.datetime64)):
+        return Period(date, freq).ordinal  # pyright: ignore[reportAttributeAccessIssue]
+    elif is_integer(date) or is_float(date):
+        return date
+    elif date is None:
+        return None
+    raise ValueError(f"Unrecognizable date '{date}'")
+
+
+# Datetime Conversion
+class DatetimeConverter(mdates.DateConverter):
+    @staticmethod
+    def convert(values, unit, axis: Axis):
+        # Reached via e.g. `ax.set_xlim`
+
+        # In tests as of 2025-09-24, unit is always None except for 3 tests
+        #  that directly call this with unit="";
+        #  axis is always specifically a matplotlib.axis.XAxis
+
+        # values might be a 1-d array, or a list-like of arrays.
+        if is_nested_list_like(values):
+            values = [DatetimeConverter._convert_1d(v, unit, axis) for v in values]
+        else:
+            values = DatetimeConverter._convert_1d(values, unit, axis)
+        return values
+
+    @staticmethod
+    def _convert_1d(values, unit, axis):
+        def try_parse(values):
+            try:
+                return mdates.date2num(tools.to_datetime(values))
+            except Exception:
+                return values
+
+        if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)):
+            return mdates.date2num(values)
+        elif is_integer(values) or is_float(values):
+            return values
+        elif isinstance(values, str):
+            return try_parse(values)
+        elif isinstance(values, (list, tuple, np.ndarray, Index, Series)):
+            if isinstance(values, Series):
+                # https://github.com/matplotlib/matplotlib/issues/11391
+                # Series was skipped. Convert to DatetimeIndex to get asi8
+                values = Index(values)
+            if isinstance(values, Index):
+                values = values.values
+            if not isinstance(values, np.ndarray):
+                values = com.asarray_tuplesafe(values)
+
+            if is_integer_dtype(values) or is_float_dtype(values):
+                return values
+
+            try:
+                values = tools.to_datetime(values)
+            except Exception:
+                pass
+
+            values = mdates.date2num(values)
+
+        return values
+
+    @staticmethod
+    def axisinfo(unit: tzinfo | None, axis) -> munits.AxisInfo:
+        """
+        Return the :class:`~matplotlib.units.AxisInfo` for *unit*.
+
+        *unit* is a tzinfo instance or None.
+        The *axis* argument is required but not used.
+        """
+        tz = unit
+
+        majloc = PandasAutoDateLocator(tz=tz)
+        majfmt = PandasAutoDateFormatter(majloc, tz=tz)
+        datemin = pydt.date(2000, 1, 1)
+        datemax = pydt.date(2010, 1, 1)
+
+        return munits.AxisInfo(
+            majloc=majloc, majfmt=majfmt, label="", default_limits=(datemin, datemax)
+        )
+
+
+class PandasAutoDateFormatter(mdates.AutoDateFormatter):
+    def __init__(self, locator, tz=None, defaultfmt: str = "%Y-%m-%d") -> None:
+        mdates.AutoDateFormatter.__init__(self, locator, tz, defaultfmt)
+
+
+class PandasAutoDateLocator(mdates.AutoDateLocator):
+    def get_locator(self, dmin, dmax):
+        """Pick the best locator based on a distance."""
+        tot_sec = (dmax - dmin).total_seconds()
+
+        if abs(tot_sec) < self.minticks:
+            self._freq = -1
+            locator = MilliSecondLocator(self.tz)
+            locator.set_axis(self.axis)
+
+            # error: Item "None" of "Axis | _DummyAxis | _AxisWrapper | None"
+            # has no attribute "get_data_interval"
+            locator.axis.set_view_interval(  # type: ignore[union-attr]
+                *self.axis.get_view_interval()  # type: ignore[union-attr]
+            )
+            locator.axis.set_data_interval(  # type: ignore[union-attr]
+                *self.axis.get_data_interval()  # type: ignore[union-attr]
+            )
+            return locator
+
+        return mdates.AutoDateLocator.get_locator(self, dmin, dmax)
+
+    def _get_unit(self):
+        return MilliSecondLocator.get_unit_generic(self._freq)
+
+
+class MilliSecondLocator(mdates.DateLocator):
+    UNIT = 1.0 / (24 * 3600 * 1000)
+
+    def __init__(self, tz) -> None:
+        mdates.DateLocator.__init__(self, tz)
+        self._interval = 1.0
+
+    def _get_unit(self):
+        return self.get_unit_generic(-1)
+
+    @staticmethod
+    def get_unit_generic(freq):
+        unit = mdates.RRuleLocator.get_unit_generic(freq)
+        if unit < 0:
+            return MilliSecondLocator.UNIT
+        return unit
+
+    def __call__(self):
+        # if no data have been set, this will tank with a ValueError
+        try:
+            dmin, dmax = self.viewlim_to_dt()
+        except ValueError:
+            return []
+
+        # We need to cap at the endpoints of valid datetime
+        nmax, nmin = mdates.date2num((dmax, dmin))
+
+        num = (nmax - nmin) * 86400 * 1000
+        max_millis_ticks = 6
+        for interval in [1, 10, 50, 100, 200, 500]:
+            if num <= interval * (max_millis_ticks - 1):
+                self._interval = interval
+                break
+            # We went through the whole loop without breaking, default to 1
+            self._interval = 1000.0
+
+        estimate = (nmax - nmin) / (self._get_unit() * self._get_interval())
+
+        if estimate > self.MAXTICKS * 2:
+            raise RuntimeError(
+                "MillisecondLocator estimated to generate "
+                f"{estimate:d} ticks from {dmin} to {dmax}: exceeds Locator.MAXTICKS"
+                f"* 2 ({self.MAXTICKS * 2:d}) "
+            )
+
+        interval = self._get_interval()
+        freq = f"{interval}ms"
+        tz = self.tz.tzname(None)
+        st = dmin.replace(tzinfo=None)
+        ed = dmax.replace(tzinfo=None)
+        all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object)
+
+        try:
+            if len(all_dates) > 0:
+                locs = self.raise_if_exceeds(mdates.date2num(all_dates))
+                return locs
+        except Exception:  # pragma: no cover
+            pass
+
+        lims = mdates.date2num([dmin, dmax])
+        return lims
+
+    def _get_interval(self):
+        return self._interval
+
+    def autoscale(self):
+        """
+        Set the view limits to include the data range.
+        """
+        # We need to cap at the endpoints of valid datetime
+        dmin, dmax = self.datalim_to_dt()
+
+        vmin = mdates.date2num(dmin)
+        vmax = mdates.date2num(dmax)
+
+        return self.nonsingular(vmin, vmax)
+
+
+# Fixed frequency dynamic tick locators and formatters
+
+# -------------------------------------------------------------------------
+# --- Locators ---
+# -------------------------------------------------------------------------
+
+
+def _get_default_annual_spacing(nyears) -> tuple[int, int]:
+    """
+    Returns a default spacing between consecutive ticks for annual data.
+    """
+    if nyears < 11:
+        (min_spacing, maj_spacing) = (1, 1)
+    elif nyears < 20:
+        (min_spacing, maj_spacing) = (1, 2)
+    elif nyears < 50:
+        (min_spacing, maj_spacing) = (1, 5)
+    elif nyears < 100:
+        (min_spacing, maj_spacing) = (5, 10)
+    elif nyears < 200:
+        (min_spacing, maj_spacing) = (5, 25)
+    elif nyears < 600:
+        (min_spacing, maj_spacing) = (10, 50)
+    else:
+        factor = nyears // 1000 + 1
+        (min_spacing, maj_spacing) = (factor * 20, factor * 100)
+    return (min_spacing, maj_spacing)
+
+
+def _period_break(dates: PeriodIndex, period: str) -> npt.NDArray[np.intp]:
+    """
+    Returns the indices where the given period changes.
+
+    Parameters
+    ----------
+    dates : PeriodIndex
+        Array of intervals to monitor.
+    period : str
+        Name of the period to monitor.
+    """
+    mask = _period_break_mask(dates, period)
+    return np.nonzero(mask)[0]
+
+
+def _period_break_mask(dates: PeriodIndex, period: str) -> npt.NDArray[np.bool_]:
+    current = getattr(dates, period)
+    previous = getattr(dates - 1 * dates.freq, period)
+    return current != previous
+
+
+def has_level_label(label_flags: npt.NDArray[np.intp], vmin: float) -> bool:
+    """
+    Returns true if the ``label_flags`` indicate there is at least one label
+    for this level.
+
+    if the minimum view limit is not an exact integer, then the first tick
+    label won't be shown, so we must adjust for that.
+    """
+    if label_flags.size == 0 or (
+        label_flags.size == 1 and label_flags[0] == 0 and vmin % 1 > 0.0
+    ):
+        return False
+    else:
+        return True
+
+
+def _get_periods_per_ymd(freq: BaseOffset) -> tuple[int, int, int]:
+    # error: "BaseOffset" has no attribute "_period_dtype_code"
+    dtype_code = freq._period_dtype_code  # type: ignore[attr-defined]
+    freq_group = FreqGroup.from_period_dtype_code(dtype_code)
+
+    ppd = -1  # placeholder for above-day freqs
+
+    if dtype_code >= FreqGroup.FR_HR.value:  # pyright: ignore[reportAttributeAccessIssue]
+        # error: "BaseOffset" has no attribute "_creso"
+        ppd = periods_per_day(freq._creso)  # type: ignore[attr-defined]
+        ppm = 28 * ppd
+        ppy = 365 * ppd
+    elif freq_group == FreqGroup.FR_BUS:
+        ppm = 19
+        ppy = 261
+    elif freq_group == FreqGroup.FR_DAY:
+        ppm = 28
+        ppy = 365
+    elif freq_group == FreqGroup.FR_WK:
+        ppm = 3
+        ppy = 52
+    elif freq_group == FreqGroup.FR_MTH:
+        ppm = 1
+        ppy = 12
+    elif freq_group == FreqGroup.FR_QTR:
+        ppm = -1  # placerholder
+        ppy = 4
+    elif freq_group == FreqGroup.FR_ANN:
+        ppm = -1  # placeholder
+        ppy = 1
+    else:
+        raise NotImplementedError(f"Unsupported frequency: {dtype_code}")
+
+    return ppd, ppm, ppy
+
+
+@functools.cache
+def _daily_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray:
+    # error: "BaseOffset" has no attribute "_period_dtype_code"
+    dtype_code = freq._period_dtype_code  # type: ignore[attr-defined]
+
+    periodsperday, periodspermonth, periodsperyear = _get_periods_per_ymd(freq)
+
+    # save this for later usage
+    vmin_orig = vmin
+    (vmin, vmax) = (int(vmin), int(vmax))
+    span = vmax - vmin + 1
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore", "Period with BDay freq is deprecated", category=FutureWarning
+        )
+        warnings.filterwarnings(
+            "ignore", r"PeriodDtype\[B\] is deprecated", category=FutureWarning
+        )
+        dates_ = period_range(
+            start=Period(ordinal=vmin, freq=freq),
+            end=Period(ordinal=vmax, freq=freq),
+            freq=freq,
+        )
+
+    # Initialize the output
+    info = np.zeros(
+        span, dtype=[("val", np.int64), ("maj", bool), ("min", bool), ("fmt", "|S20")]
+    )
+    info["val"][:] = dates_.asi8
+    info["fmt"][:] = ""
+    info["maj"][[0, -1]] = True
+    # .. and set some shortcuts
+    info_maj = info["maj"]
+    info_min = info["min"]
+    info_fmt = info["fmt"]
+
+    def first_label(label_flags):
+        if (label_flags[0] == 0) and (label_flags.size > 1) and ((vmin_orig % 1) > 0.0):
+            return label_flags[1]
+        else:
+            return label_flags[0]
+
+    # Case 1. Less than a month
+    if span <= periodspermonth:
+        day_start = _period_break(dates_, "day")
+        month_start = _period_break(dates_, "month")
+        year_start = _period_break(dates_, "year")
+
+        def _hour_finder(label_interval: int, force_year_start: bool) -> None:
+            target = dates_.hour
+            mask = _period_break_mask(dates_, "hour")
+            info_maj[day_start] = True
+            info_min[mask & (target % label_interval == 0)] = True
+            info_fmt[mask & (target % label_interval == 0)] = "%H:%M"
+            info_fmt[day_start] = "%H:%M\n%d-%b"
+            info_fmt[year_start] = "%H:%M\n%d-%b\n%Y"
+            if force_year_start and not has_level_label(year_start, vmin_orig):
+                info_fmt[first_label(day_start)] = "%H:%M\n%d-%b\n%Y"
+
+        def _minute_finder(label_interval: int) -> None:
+            target = dates_.minute
+            hour_start = _period_break(dates_, "hour")
+            mask = _period_break_mask(dates_, "minute")
+            info_maj[hour_start] = True
+            info_min[mask & (target % label_interval == 0)] = True
+            info_fmt[mask & (target % label_interval == 0)] = "%H:%M"
+            info_fmt[day_start] = "%H:%M\n%d-%b"
+            info_fmt[year_start] = "%H:%M\n%d-%b\n%Y"
+
+        def _second_finder(label_interval: int) -> None:
+            target = dates_.second
+            minute_start = _period_break(dates_, "minute")
+            mask = _period_break_mask(dates_, "second")
+            info_maj[minute_start] = True
+            info_min[mask & (target % label_interval == 0)] = True
+            info_fmt[mask & (target % label_interval == 0)] = "%H:%M:%S"
+            info_fmt[day_start] = "%H:%M:%S\n%d-%b"
+            info_fmt[year_start] = "%H:%M:%S\n%d-%b\n%Y"
+
+        if span < periodsperday / 12000:
+            _second_finder(1)
+        elif span < periodsperday / 6000:
+            _second_finder(2)
+        elif span < periodsperday / 2400:
+            _second_finder(5)
+        elif span < periodsperday / 1200:
+            _second_finder(10)
+        elif span < periodsperday / 800:
+            _second_finder(15)
+        elif span < periodsperday / 400:
+            _second_finder(30)
+        elif span < periodsperday / 150:
+            _minute_finder(1)
+        elif span < periodsperday / 70:
+            _minute_finder(2)
+        elif span < periodsperday / 24:
+            _minute_finder(5)
+        elif span < periodsperday / 12:
+            _minute_finder(15)
+        elif span < periodsperday / 6:
+            _minute_finder(30)
+        elif span < periodsperday / 2.5:
+            _hour_finder(1, False)
+        elif span < periodsperday / 1.5:
+            _hour_finder(2, False)
+        elif span < periodsperday * 1.25:
+            _hour_finder(3, False)
+        elif span < periodsperday * 2.5:
+            _hour_finder(6, True)
+        elif span < periodsperday * 4:
+            _hour_finder(12, True)
+        else:
+            info_maj[month_start] = True
+            info_min[day_start] = True
+            info_fmt[day_start] = "%d"
+            info_fmt[month_start] = "%d\n%b"
+            info_fmt[year_start] = "%d\n%b\n%Y"
+            if not has_level_label(year_start, vmin_orig):
+                if not has_level_label(month_start, vmin_orig):
+                    info_fmt[first_label(day_start)] = "%d\n%b\n%Y"
+                else:
+                    info_fmt[first_label(month_start)] = "%d\n%b\n%Y"
+
+    # Case 2. Less than three months
+    elif span <= periodsperyear // 4:
+        month_start = _period_break(dates_, "month")
+        info_maj[month_start] = True
+        if dtype_code < FreqGroup.FR_HR.value:  # pyright: ignore[reportAttributeAccessIssue]
+            info["min"] = True
+        else:
+            day_start = _period_break(dates_, "day")
+            info["min"][day_start] = True
+        week_start = _period_break(dates_, "week")
+        year_start = _period_break(dates_, "year")
+        info_fmt[week_start] = "%d"
+        info_fmt[month_start] = "\n\n%b"
+        info_fmt[year_start] = "\n\n%b\n%Y"
+        if not has_level_label(year_start, vmin_orig):
+            if not has_level_label(month_start, vmin_orig):
+                info_fmt[first_label(week_start)] = "\n\n%b\n%Y"
+            else:
+                info_fmt[first_label(month_start)] = "\n\n%b\n%Y"
+    # Case 3. Less than 14 months ...............
+    elif span <= 1.15 * periodsperyear:
+        year_start = _period_break(dates_, "year")
+        month_start = _period_break(dates_, "month")
+        week_start = _period_break(dates_, "week")
+        info_maj[month_start] = True
+        info_min[week_start] = True
+        info_min[year_start] = False
+        info_min[month_start] = False
+        info_fmt[month_start] = "%b"
+        info_fmt[year_start] = "%b\n%Y"
+        if not has_level_label(year_start, vmin_orig):
+            info_fmt[first_label(month_start)] = "%b\n%Y"
+    # Case 4. Less than 2.5 years ...............
+    elif span <= 2.5 * periodsperyear:
+        year_start = _period_break(dates_, "year")
+        quarter_start = _period_break(dates_, "quarter")
+        month_start = _period_break(dates_, "month")
+        info_maj[quarter_start] = True
+        info_min[month_start] = True
+        info_fmt[quarter_start] = "%b"
+        info_fmt[year_start] = "%b\n%Y"
+    # Case 4. Less than 4 years .................
+    elif span <= 4 * periodsperyear:
+        year_start = _period_break(dates_, "year")
+        month_start = _period_break(dates_, "month")
+        info_maj[year_start] = True
+        info_min[month_start] = True
+        info_min[year_start] = False
+
+        month_break = dates_[month_start].month
+        jan_or_jul = month_start[(month_break == 1) | (month_break == 7)]
+        info_fmt[jan_or_jul] = "%b"
+        info_fmt[year_start] = "%b\n%Y"
+    # Case 5. Less than 11 years ................
+    elif span <= 11 * periodsperyear:
+        year_start = _period_break(dates_, "year")
+        quarter_start = _period_break(dates_, "quarter")
+        info_maj[year_start] = True
+        info_min[quarter_start] = True
+        info_min[year_start] = False
+        info_fmt[year_start] = "%Y"
+    # Case 6. More than 12 years ................
+    else:
+        year_start = _period_break(dates_, "year")
+        year_break = dates_[year_start].year
+        nyears = span / periodsperyear
+        (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
+        major_idx = year_start[(year_break % maj_anndef == 0)]
+        info_maj[major_idx] = True
+        minor_idx = year_start[(year_break % min_anndef == 0)]
+        info_min[minor_idx] = True
+        info_fmt[major_idx] = "%Y"
+
+    return info
+
+
+@functools.cache
+def _monthly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray:
+    _, _, periodsperyear = _get_periods_per_ymd(freq)
+
+    vmin_orig = vmin
+    (vmin, vmax) = (int(vmin), int(vmax))
+    span = vmax - vmin + 1
+
+    # Initialize the output
+    info = np.zeros(
+        span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")]
+    )
+    info["val"] = np.arange(vmin, vmax + 1)
+    dates_ = info["val"]
+    info["fmt"] = ""
+    year_start = (dates_ % 12 == 0).nonzero()[0]
+    info_maj = info["maj"]
+    info_fmt = info["fmt"]
+
+    if span <= 1.15 * periodsperyear:
+        info_maj[year_start] = True
+        info["min"] = True
+
+        info_fmt[:] = "%b"
+        info_fmt[year_start] = "%b\n%Y"
+
+        if not has_level_label(year_start, vmin_orig):
+            if dates_.size > 1:
+                idx = 1
+            else:
+                idx = 0
+            info_fmt[idx] = "%b\n%Y"
+
+    elif span <= 2.5 * periodsperyear:
+        quarter_start = (dates_ % 3 == 0).nonzero()
+        info_maj[year_start] = True
+        # TODO: Check the following : is it really info['fmt'] ?
+        #  2023-09-15 this is reached in test_finder_monthly
+        info["fmt"][quarter_start] = True
+        info["min"] = True
+
+        info_fmt[quarter_start] = "%b"
+        info_fmt[year_start] = "%b\n%Y"
+
+    elif span <= 4 * periodsperyear:
+        info_maj[year_start] = True
+        info["min"] = True
+
+        jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6)
+        info_fmt[jan_or_jul] = "%b"
+        info_fmt[year_start] = "%b\n%Y"
+
+    elif span <= 11 * periodsperyear:
+        quarter_start = (dates_ % 3 == 0).nonzero()
+        info_maj[year_start] = True
+        info["min"][quarter_start] = True
+
+        info_fmt[year_start] = "%Y"
+
+    else:
+        nyears = span / periodsperyear
+        (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
+        years = dates_[year_start] // 12 + 1
+        major_idx = year_start[(years % maj_anndef == 0)]
+        info_maj[major_idx] = True
+        info["min"][year_start[(years % min_anndef == 0)]] = True
+
+        info_fmt[major_idx] = "%Y"
+
+    return info
+
+
+@functools.cache
+def _quarterly_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray:
+    _, _, periodsperyear = _get_periods_per_ymd(freq)
+    vmin_orig = vmin
+    (vmin, vmax) = (int(vmin), int(vmax))
+    span = vmax - vmin + 1
+
+    info = np.zeros(
+        span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")]
+    )
+    info["val"] = np.arange(vmin, vmax + 1)
+    info["fmt"] = ""
+    dates_ = info["val"]
+    info_maj = info["maj"]
+    info_fmt = info["fmt"]
+    year_start = (dates_ % 4 == 0).nonzero()[0]
+
+    if span <= 3.5 * periodsperyear:
+        info_maj[year_start] = True
+        info["min"] = True
+
+        info_fmt[:] = "Q%q"
+        info_fmt[year_start] = "Q%q\n%F"
+        if not has_level_label(year_start, vmin_orig):
+            if dates_.size > 1:
+                idx = 1
+            else:
+                idx = 0
+            info_fmt[idx] = "Q%q\n%F"
+
+    elif span <= 11 * periodsperyear:
+        info_maj[year_start] = True
+        info["min"] = True
+        info_fmt[year_start] = "%F"
+
+    else:
+        # https://github.com/pandas-dev/pandas/pull/47602
+        years = dates_[year_start] // 4 + 1970
+        nyears = span / periodsperyear
+        (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears)
+        major_idx = year_start[(years % maj_anndef == 0)]
+        info_maj[major_idx] = True
+        info["min"][year_start[(years % min_anndef == 0)]] = True
+        info_fmt[major_idx] = "%F"
+
+    return info
+
+
+@functools.cache
+def _annual_finder(vmin: float, vmax: float, freq: BaseOffset) -> np.ndarray:
+    # Note: small difference here vs other finders in adding 1 to vmax
+    (vmin, vmax) = (int(vmin), int(vmax + 1))
+    span = vmax - vmin + 1
+
+    info = np.zeros(
+        span, dtype=[("val", int), ("maj", bool), ("min", bool), ("fmt", "|S8")]
+    )
+    info["val"] = np.arange(vmin, vmax + 1)
+    info["fmt"] = ""
+    dates_ = info["val"]
+
+    (min_anndef, maj_anndef) = _get_default_annual_spacing(span)
+    major_idx = dates_ % maj_anndef == 0
+    minor_idx = dates_ % min_anndef == 0
+    info["maj"][major_idx] = True
+    info["min"][minor_idx] = True
+    info["fmt"][major_idx] = "%Y"
+
+    return info
+
+
+def get_finder(freq: BaseOffset):
+    # error: "BaseOffset" has no attribute "_period_dtype_code"
+    dtype_code = freq._period_dtype_code  # type: ignore[attr-defined]
+    fgroup = FreqGroup.from_period_dtype_code(dtype_code)
+
+    if fgroup == FreqGroup.FR_ANN:
+        return _annual_finder
+    elif fgroup == FreqGroup.FR_QTR:
+        return _quarterly_finder
+    elif fgroup == FreqGroup.FR_MTH:
+        return _monthly_finder
+    elif (dtype_code >= FreqGroup.FR_BUS.value) or fgroup == FreqGroup.FR_WK:  # pyright: ignore[reportAttributeAccessIssue]
+        return _daily_finder
+    else:  # pragma: no cover
+        raise NotImplementedError(f"Unsupported frequency: {dtype_code}")
+
+
+class TimeSeries_DateLocator(mpl.ticker.Locator):  # pyright: ignore[reportAttributeAccessIssue]
+    """
+    Locates the ticks along an axis controlled by a :class:`Series`.
+
+    Parameters
+    ----------
+    freq : BaseOffset
+        Valid frequency specifier.
+    minor_locator : {False, True}, optional
+        Whether the locator is for minor ticks (True) or not.
+    dynamic_mode : {True, False}, optional
+        Whether the locator should work in dynamic mode.
+    base : {int}, optional
+    quarter : {int}, optional
+    month : {int}, optional
+    day : {int}, optional
+    """
+
+    axis: Axis
+
+    def __init__(
+        self,
+        freq: BaseOffset,
+        minor_locator: bool = False,
+        dynamic_mode: bool = True,
+        base: int = 1,
+        quarter: int = 1,
+        month: int = 1,
+        day: int = 1,
+        plot_obj=None,
+    ) -> None:
+        freq = to_offset(freq, is_period=True)
+        self.freq = freq
+        self.base = base
+        (self.quarter, self.month, self.day) = (quarter, month, day)
+        self.isminor = minor_locator
+        self.isdynamic = dynamic_mode
+        self.offset = 0
+        self.plot_obj = plot_obj
+        self.finder = get_finder(freq)
+
+    def _get_default_locs(self, vmin, vmax):
+        """Returns the default locations of ticks."""
+        locator = self.finder(vmin, vmax, self.freq)
+
+        if self.isminor:
+            return np.compress(locator["min"], locator["val"])
+        return np.compress(locator["maj"], locator["val"])
+
+    def __call__(self):
+        """Return the locations of the ticks."""
+        # axis calls Locator.set_axis inside set_m<xxxx>_formatter
+
+        vi = tuple(self.axis.get_view_interval())
+        vmin, vmax = vi
+        if vmax < vmin:
+            vmin, vmax = vmax, vmin
+        if self.isdynamic:
+            locs = self._get_default_locs(vmin, vmax)
+        else:  # pragma: no cover
+            base = self.base
+            (d, m) = divmod(vmin, base)
+            vmin = (d + 1) * base
+            # error: No overload variant of "range" matches argument types "float",
+            # "float", "int"
+            locs = list(range(vmin, vmax + 1, base))  # type: ignore[call-overload]
+        return locs
+
+    def autoscale(self):
+        """
+        Sets the view limits to the nearest multiples of base that contain the
+        data.
+        """
+        # requires matplotlib >= 0.98.0
+        (vmin, vmax) = self.axis.get_data_interval()
+
+        locs = self._get_default_locs(vmin, vmax)
+        (vmin, vmax) = locs[[0, -1]]
+        if vmin == vmax:
+            vmin -= 1
+            vmax += 1
+        return mpl.transforms.nonsingular(vmin, vmax)
+
+
+# -------------------------------------------------------------------------
+# --- Formatter ---
+# -------------------------------------------------------------------------
+
+
+class TimeSeries_DateFormatter(mpl.ticker.Formatter):  # pyright: ignore[reportAttributeAccessIssue]
+    """
+    Formats the ticks along an axis controlled by a :class:`PeriodIndex`.
+
+    Parameters
+    ----------
+    freq : BaseOffset
+        Valid frequency specifier.
+    minor_locator : bool, default False
+        Whether the current formatter should apply to minor ticks (True) or
+        major ticks (False).
+    dynamic_mode : bool, default True
+        Whether the formatter works in dynamic mode or not.
+    """
+
+    axis: Axis
+
+    def __init__(
+        self,
+        freq: BaseOffset,
+        minor_locator: bool = False,
+        dynamic_mode: bool = True,
+        plot_obj=None,
+    ) -> None:
+        freq = to_offset(freq, is_period=True)
+        self.format = None
+        self.freq = freq
+        self.locs: list[Any] = []  # unused, for matplotlib compat
+        self.formatdict: dict[Any, Any] | None = None
+        self.isminor = minor_locator
+        self.isdynamic = dynamic_mode
+        self.offset = 0
+        self.plot_obj = plot_obj
+        self.finder = get_finder(freq)
+
+    def _set_default_format(self, vmin, vmax):
+        """Returns the default ticks spacing."""
+        info = self.finder(vmin, vmax, self.freq)
+
+        if self.isminor:
+            format = np.compress(info["min"] & np.logical_not(info["maj"]), info)
+        else:
+            format = np.compress(info["maj"], info)
+        self.formatdict = {x: f for (x, _, _, f) in format}
+        return self.formatdict
+
+    def set_locs(self, locs) -> None:
+        """Sets the locations of the ticks"""
+        # don't actually use the locs. This is just needed to work with
+        # matplotlib. Force to use vmin, vmax
+
+        self.locs = locs
+
+        (vmin, vmax) = tuple(self.axis.get_view_interval())
+        if vmax < vmin:
+            (vmin, vmax) = (vmax, vmin)
+        self._set_default_format(vmin, vmax)
+
+    def __call__(self, x, pos: int | None = 0) -> str:
+        if self.formatdict is None:
+            return ""
+        else:
+            fmt = self.formatdict.pop(x, "")
+            if isinstance(fmt, np.bytes_):
+                fmt = fmt.decode("utf-8")
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    "Period with BDay freq is deprecated",
+                    category=FutureWarning,
+                )
+                period = Period(ordinal=int(x), freq=self.freq)
+            assert isinstance(period, Period)
+            return period.strftime(fmt)
+
+
+class TimeSeries_TimedeltaFormatter(mpl.ticker.Formatter):  # pyright: ignore[reportAttributeAccessIssue]
+    """
+    Formats the ticks along an axis controlled by a :class:`TimedeltaIndex`.
+    """
+
+    def __init__(self, unit: TimeUnit = "ns"):
+        self.unit = unit
+        super().__init__()
+
+    axis: Axis
+
+    @staticmethod
+    def format_timedelta_ticks(x, pos, n_decimals: int, exp: int = 9) -> str:
+        """
+        Convert seconds to 'D days HH:MM:SS.F'
+        """
+        s, ns = divmod(x, 10**exp)
+        m, s = divmod(s, 60)
+        h, m = divmod(m, 60)
+        d, h = divmod(h, 24)
+        decimals = int(ns * 10 ** (n_decimals - exp))
+        s = f"{int(h):02d}:{int(m):02d}:{int(s):02d}"
+        if n_decimals > 0:
+            s += f".{decimals:0{n_decimals}d}"
+        if d != 0:
+            s = f"{int(d):d} days {s}"
+        return s
+
+    def __call__(self, x, pos: int | None = 0) -> str:
+        exp = {"ns": 9, "us": 6, "ms": 3, "s": 0}[self.unit]
+        (vmin, vmax) = tuple(self.axis.get_view_interval())
+        n_decimals = min(int(np.ceil(np.log10(100 * 10**exp / abs(vmax - vmin)))), exp)
+        return self.format_timedelta_ticks(x, pos, n_decimals, exp)
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
new file mode 100644
index 0000000000000000000000000000000000000000..0834501c4429db97e2ebb0a7f7eeb88416aedc0a
--- /dev/null
+++ b/pandas/plotting/_matplotlib/core.py
@@ -0,0 +1,2207 @@
+from __future__ import annotations
+
+from abc import (
+    ABC,
+    abstractmethod,
+)
+from collections.abc import (
+    Hashable,
+    Iterable,
+    Iterator,
+    Sequence,
+)
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    cast,
+    final,
+)
+import warnings
+
+import matplotlib as mpl
+import numpy as np
+
+from pandas._libs import lib
+from pandas.errors import AbstractMethodError
+from pandas.util._decorators import cache_readonly
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.common import (
+    is_any_real_numeric_dtype,
+    is_bool,
+    is_float,
+    is_float_dtype,
+    is_hashable,
+    is_integer,
+    is_integer_dtype,
+    is_iterator,
+    is_list_like,
+    is_number,
+    is_numeric_dtype,
+)
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    ExtensionDtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCDatetimeIndex,
+    ABCIndex,
+    ABCMultiIndex,
+    ABCPeriodIndex,
+    ABCSeries,
+)
+from pandas.core.dtypes.missing import isna
+
+import pandas.core.common as com
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._matplotlib import tools
+from pandas.plotting._matplotlib.converter import (
+    PeriodConverter,
+    register_pandas_matplotlib_converters,
+)
+from pandas.plotting._matplotlib.groupby import reconstruct_data_with_by
+from pandas.plotting._matplotlib.misc import unpack_single_str_list
+from pandas.plotting._matplotlib.style import get_standard_colors
+from pandas.plotting._matplotlib.timeseries import (
+    format_dateaxis,
+    maybe_convert_index,
+    prepare_ts_data,
+    use_dynamic_x,
+)
+from pandas.plotting._matplotlib.tools import (
+    create_subplots,
+    flatten_axes,
+    format_date_labels,
+    get_all_lines,
+    get_xlim,
+    handle_shared_axes,
+)
+
+if TYPE_CHECKING:
+    from matplotlib.artist import Artist
+    from matplotlib.axes import Axes
+    from matplotlib.axis import Axis
+    from matplotlib.figure import Figure
+
+    from pandas._typing import (
+        IndexLabel,
+        NDFrameT,
+        PlottingOrientation,
+        npt,
+    )
+
+    from pandas import (
+        DataFrame,
+        Index,
+        Series,
+    )
+
+
+def holds_integer(column: Index) -> bool:
+    return column.dtype.kind in "iu"
+
+
+def _color_in_style(style: str) -> bool:
+    """
+    Check if there is a color letter in the style string.
+    """
+    return not set(mpl.colors.BASE_COLORS).isdisjoint(style)
+
+
+class MPLPlot(ABC):
+    """
+    Base class for assembling a pandas plot using matplotlib
+
+    Parameters
+    ----------
+    data :
+
+    """
+
+    @property
+    @abstractmethod
+    def _kind(self) -> str:
+        """Specify kind str. Must be overridden in child class"""
+        raise NotImplementedError
+
+    _layout_type = "vertical"
+    _default_rot = 0
+
+    @property
+    def orientation(self) -> str | None:
+        return None
+
+    data: DataFrame
+
+    def __init__(
+        self,
+        data,
+        kind=None,
+        by: IndexLabel | None = None,
+        subplots: bool | Sequence[Sequence[str]] = False,
+        sharex: bool | None = None,
+        sharey: bool = False,
+        use_index: bool = True,
+        figsize: tuple[float, float] | None = None,
+        grid=None,
+        legend: bool | str = True,
+        rot=None,
+        ax=None,
+        fig=None,
+        title=None,
+        xlim=None,
+        ylim=None,
+        xticks=None,
+        yticks=None,
+        xlabel: Hashable | None = None,
+        ylabel: Hashable | None = None,
+        fontsize: int | None = None,
+        secondary_y: bool | tuple | list | np.ndarray = False,
+        colormap=None,
+        table: bool = False,
+        layout=None,
+        include_bool: bool = False,
+        column: IndexLabel | None = None,
+        *,
+        logx: bool | None | Literal["sym"] = False,
+        logy: bool | None | Literal["sym"] = False,
+        loglog: bool | None | Literal["sym"] = False,
+        mark_right: bool = True,
+        stacked: bool = False,
+        label: Hashable | None = None,
+        style=None,
+        **kwds,
+    ) -> None:
+        # if users assign an empty list or tuple, raise `ValueError`
+        # similar to current `df.box` and `df.hist` APIs.
+        if by in ([], ()):
+            raise ValueError("No group keys passed!")
+        self.by = com.maybe_make_list(by)
+
+        # Assign the rest of columns into self.columns if by is explicitly defined
+        # while column is not, only need `columns` in hist/box plot when it's DF
+        # TODO: Might deprecate `column` argument in future PR (#28373)
+        if isinstance(data, ABCDataFrame):
+            if column:
+                self.columns = com.maybe_make_list(column)
+            elif self.by is None:
+                self.columns = [
+                    col for col in data.columns if is_numeric_dtype(data[col])
+                ]
+            else:
+                self.columns = [
+                    col
+                    for col in data.columns
+                    if col not in self.by and is_numeric_dtype(data[col])
+                ]
+
+        # For `hist` plot, need to get grouped original data before `self.data` is
+        # updated later
+        if self.by is not None and self._kind == "hist":
+            self._grouped = data.groupby(unpack_single_str_list(self.by))
+
+        self.kind = kind
+
+        self.subplots = type(self)._validate_subplots_kwarg(
+            subplots, data, kind=self._kind
+        )
+
+        self.sharex = type(self)._validate_sharex(sharex, ax, by)
+        self.sharey = sharey
+        self.figsize = figsize
+        self.layout = layout
+
+        self.xticks = xticks
+        self.yticks = yticks
+        self.xlim = xlim
+        self.ylim = ylim
+        self.title = title
+        self.use_index = use_index
+        self.xlabel = xlabel
+        self.ylabel = ylabel
+
+        self.fontsize = fontsize
+
+        if rot is not None:
+            self.rot = rot
+            # need to know for format_date_labels since it's rotated to 30 by
+            # default
+            self._rot_set = True
+        else:
+            self._rot_set = False
+            self.rot = self._default_rot
+
+        if grid is None:
+            grid = False if secondary_y else mpl.rcParams["axes.grid"]
+
+        self.grid = grid
+        self.legend = legend
+        self.legend_handles: list[Artist] = []
+        self.legend_labels: list[Hashable] = []
+
+        self.logx = type(self)._validate_log_kwd("logx", logx)
+        self.logy = type(self)._validate_log_kwd("logy", logy)
+        self.loglog = type(self)._validate_log_kwd("loglog", loglog)
+        self.label = label
+        self.style = style
+        self.mark_right = mark_right
+        self.stacked = stacked
+
+        # ax may be an Axes object or (if self.subplots) an ndarray of
+        #  Axes objects
+        self.ax = ax
+        # TODO: deprecate fig keyword as it is ignored, not passed in tests
+        #  as of 2023-11-05
+
+        # parse errorbar input if given
+        xerr = kwds.pop("xerr", None)
+        yerr = kwds.pop("yerr", None)
+        nseries = self._get_nseries(data)
+        xerr, data = type(self)._parse_errorbars("xerr", xerr, data, nseries)
+        yerr, data = type(self)._parse_errorbars("yerr", yerr, data, nseries)
+        self.errors = {"xerr": xerr, "yerr": yerr}
+        self.data = data
+
+        if not isinstance(secondary_y, (bool, tuple, list, np.ndarray, ABCIndex)):
+            secondary_y = [secondary_y]
+        self.secondary_y = secondary_y
+
+        # ugly TypeError if user passes matplotlib's `cmap` name.
+        # Probably better to accept either.
+        if "cmap" in kwds and colormap:
+            raise TypeError("Only specify one of `cmap` and `colormap`.")
+        if "cmap" in kwds:
+            self.colormap = kwds.pop("cmap")
+        else:
+            self.colormap = colormap
+
+        self.table = table
+        self.include_bool = include_bool
+
+        self.kwds = kwds
+
+        color = kwds.pop("color", lib.no_default)
+        self.color = self._validate_color_args(color, self.colormap)
+        assert "color" not in self.kwds
+
+        self.data = self._ensure_frame(self.data)
+
+        from pandas.plotting import plot_params
+
+        self.x_compat = plot_params["x_compat"]
+        if "x_compat" in self.kwds:
+            self.x_compat = bool(self.kwds.pop("x_compat"))
+
+    @final
+    def _is_ts_plot(self) -> bool:
+        # this is slightly deceptive
+        return not self.x_compat and self.use_index and self._use_dynamic_x()
+
+    @final
+    def _use_dynamic_x(self) -> bool:
+        return use_dynamic_x(self._get_ax(0), self.data.index)
+
+    @final
+    @staticmethod
+    def _validate_sharex(sharex: bool | None, ax, by) -> bool:
+        if sharex is None:
+            # if by is defined, subplots are used and sharex should be False
+            if ax is None and by is None:
+                sharex = True
+            else:
+                # if we get an axis, the users should do the visibility
+                # setting...
+                sharex = False
+        elif not is_bool(sharex):
+            raise TypeError("sharex must be a bool or None")
+        return bool(sharex)
+
+    @classmethod
+    def _validate_log_kwd(
+        cls,
+        kwd: str,
+        value: bool | None | Literal["sym"],
+    ) -> bool | None | Literal["sym"]:
+        if (
+            value is None
+            or isinstance(value, bool)
+            or (isinstance(value, str) and value == "sym")
+        ):
+            return value
+        raise ValueError(
+            f"keyword '{kwd}' should be bool, None, or 'sym', not '{value}'"
+        )
+
+    @final
+    @staticmethod
+    def _validate_subplots_kwarg(
+        subplots: bool | Sequence[Sequence[str]], data: Series | DataFrame, kind: str
+    ) -> bool | list[tuple[int, ...]]:
+        """
+        Validate the subplots parameter
+
+        - check type and content
+        - check for duplicate columns
+        - check for invalid column names
+        - convert column names into indices
+        - add missing columns in a group of their own
+        See comments in code below for more details.
+
+        Parameters
+        ----------
+        subplots : subplots parameters as passed to PlotAccessor
+
+        Returns
+        -------
+        validated subplots : a bool or a list of tuples of column indices. Columns
+        in the same tuple will be grouped together in the resulting plot.
+        """
+
+        if isinstance(subplots, bool):
+            return subplots
+        elif not isinstance(subplots, Iterable):
+            raise ValueError("subplots should be a bool or an iterable")
+
+        supported_kinds = (
+            "line",
+            "bar",
+            "barh",
+            "hist",
+            "kde",
+            "density",
+            "area",
+            "pie",
+        )
+        if kind not in supported_kinds:
+            raise ValueError(
+                "When subplots is an iterable, kind must be "
+                f"one of {', '.join(supported_kinds)}. Got {kind}."
+            )
+
+        if isinstance(data, ABCSeries):
+            raise NotImplementedError(
+                "An iterable subplots for a Series is not supported."
+            )
+
+        columns = data.columns
+        if isinstance(columns, ABCMultiIndex):
+            raise NotImplementedError(
+                "An iterable subplots for a DataFrame with a MultiIndex column "
+                "is not supported."
+            )
+
+        if columns.nunique() != len(columns):
+            raise NotImplementedError(
+                "An iterable subplots for a DataFrame with non-unique column "
+                "labels is not supported."
+            )
+
+        # subplots is a list of tuples where each tuple is a group of
+        # columns to be grouped together (one ax per group).
+        # we consolidate the subplots list such that:
+        # - the tuples contain indices instead of column names
+        # - the columns that aren't yet in the list are added in a group
+        #   of their own.
+        # For example with columns from a to g, and
+        # subplots = [(a, c), (b, f, e)],
+        # we end up with [(ai, ci), (bi, fi, ei), (di,), (gi,)]
+        # This way, we can handle self.subplots in a homogeneous manner
+        # later.
+        # TODO: also accept indices instead of just names?
+
+        out = []
+        seen_columns: set[Hashable] = set()
+        for group in subplots:
+            if not is_list_like(group):
+                raise ValueError(
+                    "When subplots is an iterable, each entry "
+                    "should be a list/tuple of column names."
+                )
+            idx_locs = columns.get_indexer_for(group)
+            if (idx_locs == -1).any():
+                bad_labels = np.extract(idx_locs == -1, group)
+                raise ValueError(
+                    f"Column label(s) {list(bad_labels)} not found in the DataFrame."
+                )
+            unique_columns = set(group)
+            duplicates = seen_columns.intersection(unique_columns)
+            if duplicates:
+                raise ValueError(
+                    "Each column should be in only one subplot. "
+                    f"Columns {duplicates} were found in multiple subplots."
+                )
+            seen_columns = seen_columns.union(unique_columns)
+            out.append(tuple(idx_locs))
+
+        unseen_columns = columns.difference(seen_columns)
+        for column in unseen_columns:
+            idx_loc = columns.get_loc(column)
+            out.append((idx_loc,))
+        return out
+
+    def _validate_color_args(self, color, colormap):
+        if color is lib.no_default:
+            # It was not provided by the user
+            if "colors" in self.kwds and colormap is not None:
+                warnings.warn(
+                    "'color' and 'colormap' cannot be used simultaneously. "
+                    "Using 'color'",
+                    stacklevel=find_stack_level(),
+                )
+            return None
+        if self.nseries == 1 and color is not None and not is_list_like(color):
+            # support series.plot(color='green')
+            color = [color]
+
+        if isinstance(color, tuple) and self.nseries == 1 and len(color) in (3, 4):
+            # support RGB and RGBA tuples in series plot
+            color = [color]
+
+        if colormap is not None:
+            warnings.warn(
+                "'color' and 'colormap' cannot be used simultaneously. Using 'color'",
+                stacklevel=find_stack_level(),
+            )
+
+        if self.style is not None:
+            if isinstance(self.style, dict):
+                styles = [self.style[col] for col in self.columns if col in self.style]
+            elif is_list_like(self.style):
+                styles = self.style
+            else:
+                styles = [self.style]
+            # need only a single match
+            for s in styles:
+                if _color_in_style(s):
+                    raise ValueError(
+                        "Cannot pass 'style' string with a color symbol and "
+                        "'color' keyword argument. Please use one or the "
+                        "other or pass 'style' without a color symbol"
+                    )
+        return color
+
+    @final
+    @staticmethod
+    def _iter_data(
+        data: DataFrame | dict[Hashable, Series | DataFrame],
+    ) -> Iterator[tuple[Hashable, np.ndarray]]:
+        for col, values in data.items():
+            # This was originally written to use values.values before EAs
+            #  were implemented; adding np.asarray(...) to keep consistent
+            #  typing.
+            yield col, np.asarray(values.values)
+
+    def _get_nseries(self, data: Series | DataFrame) -> int:
+        # When `by` is explicitly assigned, grouped data size will be defined, and
+        # this will determine number of subplots to have, aka `self.nseries`
+        if data.ndim == 1:
+            return 1
+        elif self.by is not None and self._kind == "hist":
+            return len(self._grouped)
+        elif self.by is not None and self._kind == "box":
+            return len(self.columns)
+        else:
+            return data.shape[1]
+
+    @final
+    @property
+    def nseries(self) -> int:
+        return self._get_nseries(self.data)
+
+    @final
+    def generate(self) -> None:
+        self._compute_plot_data()
+        fig = self.fig
+        self._make_plot(fig)
+        self._add_table()
+        self._make_legend()
+        self._adorn_subplots(fig)
+
+        for ax in self.axes:
+            self._post_plot_logic_common(ax)
+            self._post_plot_logic(ax, self.data)
+
+    @final
+    @staticmethod
+    def _has_plotted_object(ax: Axes) -> bool:
+        """check whether ax has data"""
+        return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0
+
+    @final
+    def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes:
+        if not self.on_right(axes_num):
+            # secondary axes may be passed via ax kw
+            return self._get_ax_layer(ax)
+
+        if hasattr(ax, "right_ax"):
+            # if it has right_ax property, ``ax`` must be left axes
+            return ax.right_ax
+        elif hasattr(ax, "left_ax"):
+            # if it has left_ax property, ``ax`` must be right axes
+            return ax
+        else:
+            # otherwise, create twin axes
+            orig_ax, new_ax = ax, ax.twinx()
+            # TODO: use Matplotlib public API when available
+            new_ax._get_lines = orig_ax._get_lines  # type: ignore[attr-defined]
+            # TODO #54485
+            new_ax._get_patches_for_fill = (  # type: ignore[attr-defined]
+                orig_ax._get_patches_for_fill  # type: ignore[attr-defined]
+            )
+            # TODO #54485
+            orig_ax.right_ax, new_ax.left_ax = (  # type: ignore[attr-defined]
+                new_ax,
+                orig_ax,
+            )
+
+            if not self._has_plotted_object(orig_ax):  # no data on left y
+                orig_ax.get_yaxis().set_visible(False)
+
+            if self.logy is True or self.loglog is True:
+                new_ax.set_yscale("log")
+            elif self.logy == "sym" or self.loglog == "sym":
+                new_ax.set_yscale("symlog")
+            return new_ax
+
+    @final
+    @cache_readonly
+    def fig(self) -> Figure:
+        return self._axes_and_fig[1]
+
+    @final
+    @cache_readonly
+    # TODO: can we annotate this as both a Sequence[Axes] and ndarray[object]?
+    def axes(self) -> Sequence[Axes]:
+        return self._axes_and_fig[0]
+
+    @final
+    @cache_readonly
+    def _axes_and_fig(self) -> tuple[Sequence[Axes], Figure]:
+        import matplotlib.pyplot as plt
+
+        if self.subplots:
+            naxes = (
+                self.nseries if isinstance(self.subplots, bool) else len(self.subplots)
+            )
+            fig, axes = create_subplots(
+                naxes=naxes,
+                sharex=self.sharex,
+                sharey=self.sharey,
+                figsize=self.figsize,
+                ax=self.ax,
+                layout=self.layout,
+                layout_type=self._layout_type,
+            )
+        elif self.ax is None:
+            fig = plt.figure(figsize=self.figsize)
+            axes = fig.add_subplot(111)
+        else:
+            fig = self.ax.get_figure()
+            if self.figsize is not None:
+                fig.set_size_inches(self.figsize)
+            axes = self.ax
+
+        axes = np.fromiter(flatten_axes(axes), dtype=object)
+
+        if self.logx is True or self.loglog is True:
+            [a.set_xscale("log") for a in axes]
+        elif self.logx == "sym" or self.loglog == "sym":
+            [a.set_xscale("symlog") for a in axes]
+
+        if self.logy is True or self.loglog is True:
+            [a.set_yscale("log") for a in axes]
+        elif self.logy == "sym" or self.loglog == "sym":
+            [a.set_yscale("symlog") for a in axes]
+
+        axes_seq = cast(Sequence["Axes"], axes)
+        return axes_seq, fig
+
+    @property
+    def result(self):
+        """
+        Return result axes
+        """
+        if self.subplots:
+            if self.layout is not None and not is_list_like(self.ax):
+                # error: "Sequence[Any]" has no attribute "reshape"
+                return self.axes.reshape(*self.layout)  # type: ignore[attr-defined]
+            else:
+                return self.axes
+        else:
+            sec_true = isinstance(self.secondary_y, bool) and self.secondary_y
+            # error: Argument 1 to "len" has incompatible type "Union[bool,
+            # Tuple[Any, ...], List[Any], ndarray[Any, Any]]"; expected "Sized"
+            all_sec = (
+                is_list_like(self.secondary_y) and len(self.secondary_y) == self.nseries  # type: ignore[arg-type]
+            )
+            if sec_true or all_sec:
+                # if all data is plotted on secondary, return right axes
+                return self._get_ax_layer(self.axes[0], primary=False)
+            else:
+                return self.axes[0]
+
+    @final
+    @staticmethod
+    def _convert_to_ndarray(data):
+        # GH31357: categorical columns are processed separately
+        if isinstance(data.dtype, CategoricalDtype):
+            return data
+
+        # GH32073: cast to float if values contain nulled integers
+        if (is_integer_dtype(data.dtype) or is_float_dtype(data.dtype)) and isinstance(
+            data.dtype, ExtensionDtype
+        ):
+            return data.to_numpy(dtype="float", na_value=np.nan)
+
+        # GH25587: cast ExtensionArray of pandas (IntegerArray, etc.) to
+        # np.ndarray before plot.
+        if len(data) > 0:
+            return np.asarray(data)
+
+        return data
+
+    @final
+    def _ensure_frame(self, data) -> DataFrame:
+        if isinstance(data, ABCSeries):
+            label = self.label
+            if label is None and data.name is None:
+                label = ""
+            if label is None:
+                # We'll end up with columns of [0] instead of [None]
+                data = data.to_frame()
+            else:
+                data = data.to_frame(name=label)
+        elif self._kind in ("hist", "box"):
+            cols = self.columns if self.by is None else self.columns + self.by
+            data = data.loc[:, cols]
+        return data
+
+    @final
+    def _compute_plot_data(self) -> None:
+        data = self.data
+
+        # GH15079 reconstruct data if by is defined
+        if self.by is not None:
+            self.subplots = True
+            data = reconstruct_data_with_by(self.data, by=self.by, cols=self.columns)
+
+        # GH16953, infer_objects is needed as fallback, for ``Series``
+        # with ``dtype == object``
+        data = data.infer_objects()
+        include_type = [np.number, "datetime", "datetimetz", "timedelta"]
+
+        # GH23719, allow plotting boolean
+        if self.include_bool is True:
+            include_type.append(np.bool_)
+
+        # GH22799, exclude datetime-like type for boxplot
+        exclude_type = None
+        if self._kind == "box":
+            # TODO: change after solving issue 27881
+            include_type = [np.number]
+            exclude_type = ["timedelta"]
+
+        # GH 18755, include object and category type for scatter plot
+        if self._kind == "scatter":
+            include_type.extend(["object", "category", "string"])
+
+        numeric_data = data.select_dtypes(include=include_type, exclude=exclude_type)
+
+        is_empty = numeric_data.shape[-1] == 0
+        # no non-numeric frames or series allowed
+        if is_empty:
+            raise TypeError("no numeric data to plot")
+
+        self.data = numeric_data.apply(type(self)._convert_to_ndarray)
+
+    def _make_plot(self, fig: Figure) -> None:
+        raise AbstractMethodError(self)
+
+    @final
+    def _add_table(self) -> None:
+        if self.table is False:
+            return
+        elif self.table is True:
+            data = self.data.transpose()
+        else:
+            data = self.table
+        ax = self._get_ax(0)
+        tools.table(ax, data)
+
+    @final
+    def _post_plot_logic_common(self, ax: Axes) -> None:
+        """Common post process for each axes"""
+        if self.orientation == "vertical" or self.orientation is None:
+            type(self)._apply_axis_properties(
+                ax.xaxis, rot=self.rot, fontsize=self.fontsize
+            )
+            type(self)._apply_axis_properties(ax.yaxis, fontsize=self.fontsize)
+
+            if hasattr(ax, "right_ax"):
+                type(self)._apply_axis_properties(
+                    ax.right_ax.yaxis, fontsize=self.fontsize
+                )
+
+        elif self.orientation == "horizontal":
+            type(self)._apply_axis_properties(
+                ax.yaxis, rot=self.rot, fontsize=self.fontsize
+            )
+            type(self)._apply_axis_properties(ax.xaxis, fontsize=self.fontsize)
+
+            if hasattr(ax, "right_ax"):
+                type(self)._apply_axis_properties(
+                    ax.right_ax.yaxis, fontsize=self.fontsize
+                )
+        else:  # pragma no cover
+            raise ValueError
+
+    @abstractmethod
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        """Post process for each axes. Overridden in child classes"""
+
+    @final
+    def _adorn_subplots(self, fig: Figure) -> None:
+        """Common post process unrelated to data"""
+        if len(self.axes) > 0:
+            all_axes = self._get_subplots(fig)
+            nrows, ncols = self._get_axes_layout(fig)
+            handle_shared_axes(
+                axarr=all_axes,
+                nplots=len(all_axes),
+                naxes=nrows * ncols,
+                nrows=nrows,
+                ncols=ncols,
+                sharex=self.sharex,
+                sharey=self.sharey,
+            )
+
+        for ax in self.axes:
+            ax = getattr(ax, "right_ax", ax)
+            if self.yticks is not None:
+                ax.set_yticks(self.yticks)
+
+            if self.xticks is not None:
+                ax.set_xticks(self.xticks)
+
+            if self.ylim is not None:
+                ax.set_ylim(self.ylim)
+
+            if self.xlim is not None:
+                ax.set_xlim(self.xlim)
+
+            # GH9093, currently Pandas does not show ylabel, so if users provide
+            # ylabel will set it as ylabel in the plot.
+            if self.ylabel is not None:
+                ax.set_ylabel(pprint_thing(self.ylabel))
+
+            ax.grid(self.grid)
+
+        if self.title:
+            if self.subplots:
+                if is_list_like(self.title):
+                    if not isinstance(self.subplots, bool):
+                        if len(self.subplots) != len(self.title):
+                            raise ValueError(
+                                f"The number of titles ({len(self.title)}) must equal "
+                                f"the number of subplots ({len(self.subplots)})."
+                            )
+                    elif len(self.title) != self.nseries:
+                        raise ValueError(
+                            "The length of `title` must equal the number "
+                            "of columns if using `title` of type `list` "
+                            "and `subplots=True`.\n"
+                            f"length of title = {len(self.title)}\n"
+                            f"number of columns = {self.nseries}"
+                        )
+
+                    for ax, title in zip(self.axes, self.title, strict=False):
+                        ax.set_title(title)
+                else:
+                    fig.suptitle(self.title)
+            else:
+                if is_list_like(self.title):
+                    msg = (
+                        "Using `title` of type `list` is not supported "
+                        "unless `subplots=True` is passed"
+                    )
+                    raise ValueError(msg)
+                self.axes[0].set_title(self.title)
+
+    @final
+    @staticmethod
+    def _apply_axis_properties(
+        axis: Axis, rot=None, fontsize: int | None = None
+    ) -> None:
+        """
+        Tick creation within matplotlib is reasonably expensive and is
+        internally deferred until accessed as Ticks are created/destroyed
+        multiple times per draw. It's therefore beneficial for us to avoid
+        accessing unless we will act on the Tick.
+        """
+        if rot is not None or fontsize is not None:
+            # rot=0 is a valid setting, hence the explicit None check
+            labels = axis.get_majorticklabels() + axis.get_minorticklabels()
+            for label in labels:
+                if rot is not None:
+                    label.set_rotation(rot)
+                if fontsize is not None:
+                    label.set_fontsize(fontsize)
+
+    @final
+    @property
+    def legend_title(self) -> str | None:
+        if not isinstance(self.data.columns, ABCMultiIndex):
+            name = self.data.columns.name
+            if name is not None:
+                name = pprint_thing(name)
+            return name
+        else:
+            stringified = map(pprint_thing, self.data.columns.names)
+            return ",".join(stringified)
+
+    @final
+    def _mark_right_label(self, label: str, index: int) -> str:
+        """
+        Append ``(right)`` to the label of a line if it's plotted on the right axis.
+
+        Note that ``(right)`` is only appended when ``subplots=False``.
+        """
+        if not self.subplots and self.mark_right and self.on_right(index):
+            label += " (right)"
+        return label
+
+    @final
+    def _append_legend_handles_labels(self, handle: Artist, label: str) -> None:
+        """
+        Append current handle and label to ``legend_handles`` and ``legend_labels``.
+
+        These will be used to make the legend.
+        """
+        self.legend_handles.append(handle)
+        self.legend_labels.append(label)
+
+    def _make_legend(self) -> None:
+        ax, leg = self._get_ax_legend(self.axes[0])
+
+        handles = []
+        labels = []
+        title = ""
+
+        if not self.subplots:
+            if leg is not None:
+                title = leg.get_title().get_text()
+                # Replace leg.legend_handles because it misses marker info
+                handles = leg.legend_handles
+                labels = [x.get_text() for x in leg.get_texts()]
+
+            if self.legend:
+                if self.legend == "reverse":
+                    handles += reversed(self.legend_handles)
+                    labels += reversed(self.legend_labels)
+                else:
+                    handles += self.legend_handles
+                    labels += self.legend_labels
+
+                if self.legend_title is not None:
+                    title = self.legend_title
+
+            if len(handles) > 0:
+                ax.legend(handles, labels, loc="best", title=title)
+
+        elif self.subplots and self.legend:
+            for ax in self.axes:
+                if ax.get_visible():
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "No artists with labels found to put in legend.",
+                            UserWarning,
+                        )
+                        ax.legend(loc="best")
+
+    @final
+    @staticmethod
+    def _get_ax_legend(ax: Axes):
+        """
+        Take in axes and return ax and legend under different scenarios
+        """
+        leg = ax.get_legend()
+
+        other_ax = cast(
+            "Axes", getattr(ax, "left_ax", None) or getattr(ax, "right_ax", None)
+        )
+        other_leg = None
+        if other_ax is not None:
+            other_leg = other_ax.get_legend()
+        if leg is None and other_leg is not None:
+            leg = other_leg
+            ax = other_ax
+        return ax, leg
+
+    _need_to_set_index = False
+
+    @final
+    def _get_xticks(self):
+        index = self.data.index
+        is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time")
+
+        # TODO: be stricter about x?
+        x: list[int] | np.ndarray
+        if self.use_index:
+            if isinstance(index, ABCPeriodIndex):
+                # test_mixed_freq_irreg_period
+                x = index.to_timestamp()._mpl_repr()
+                # TODO: why do we need to do to_timestamp() here but not other
+                #  places where we call mpl_repr?
+            elif is_any_real_numeric_dtype(index.dtype):
+                # Matplotlib supports numeric values or datetime objects as
+                # xaxis values. Taking LBYL approach here, by the time
+                # matplotlib raises exception when using non numeric/datetime
+                # values for xaxis, several actions are already taken by plt.
+                x = index._mpl_repr()
+            elif isinstance(index, ABCDatetimeIndex) or is_datetype:
+                x = index._mpl_repr()
+            else:
+                self._need_to_set_index = True
+                x = list(range(len(index)))
+        else:
+            x = list(range(len(index)))
+
+        return x
+
+    @classmethod
+    @register_pandas_matplotlib_converters
+    def _plot(
+        cls, ax: Axes, x, y: np.ndarray, style=None, is_errorbar: bool = False, **kwds
+    ):
+        mask = isna(y)
+        if mask.any():
+            y = np.ma.array(y)
+            y = np.ma.masked_where(mask, y)
+
+        if isinstance(x, ABCIndex):
+            x = x._mpl_repr()
+
+        if is_errorbar:
+            if "xerr" in kwds:
+                kwds["xerr"] = np.array(kwds.get("xerr"))
+            if "yerr" in kwds:
+                kwds["yerr"] = np.array(kwds.get("yerr"))
+            return ax.errorbar(x, y, **kwds)
+        else:
+            # prevent style kwarg from going to errorbar, where it is unsupported
+            args = (x, y, style) if style is not None else (x, y)
+            return ax.plot(*args, **kwds)
+
+    def _get_custom_index_name(self):
+        """Specify whether xlabel/ylabel should be used to override index name"""
+        return self.xlabel
+
+    @final
+    def _get_index_name(self) -> str | None:
+        if isinstance(self.data.index, ABCMultiIndex):
+            name = self.data.index.names
+            if com.any_not_none(*name):
+                name = ",".join([pprint_thing(x) for x in name])
+            else:
+                name = None
+        else:
+            name = self.data.index.name
+            if name is not None:
+                name = pprint_thing(name)
+
+        # GH 45145, override the default axis label if one is provided.
+        index_name = self._get_custom_index_name()
+        if index_name is not None:
+            name = pprint_thing(index_name)
+
+        return name
+
+    @final
+    @classmethod
+    def _get_ax_layer(cls, ax, primary: bool = True):
+        """get left (primary) or right (secondary) axes"""
+        if primary:
+            return getattr(ax, "left_ax", ax)
+        else:
+            return getattr(ax, "right_ax", ax)
+
+    @final
+    def _col_idx_to_axis_idx(self, col_idx: int) -> int:
+        """Return the index of the axis where the column at col_idx should be plotted"""
+        if isinstance(self.subplots, list):
+            # Subplots is a list: some columns will be grouped together in the same ax
+            return next(
+                group_idx
+                for (group_idx, group) in enumerate(self.subplots)
+                if col_idx in group
+            )
+        else:
+            # subplots is True: one ax per column
+            return col_idx
+
+    @final
+    def _get_ax(self, i: int) -> Axes:
+        # get the twinx ax if appropriate
+        if self.subplots:
+            i = self._col_idx_to_axis_idx(i)
+            ax = self.axes[i]
+            ax = self._maybe_right_yaxis(ax, i)
+            # error: Unsupported target for indexed assignment ("Sequence[Any]")
+            self.axes[i] = ax  # type: ignore[index]
+        else:
+            ax = self.axes[0]
+            ax = self._maybe_right_yaxis(ax, i)
+
+        ax.get_yaxis().set_visible(True)
+        return ax
+
+    @final
+    def on_right(self, i: int) -> bool:
+        if isinstance(self.secondary_y, bool):
+            return self.secondary_y
+
+        if isinstance(self.secondary_y, (tuple, list, np.ndarray, ABCIndex)):
+            return self.data.columns[i] in self.secondary_y
+
+    @final
+    def _apply_style_colors(
+        self, colors, kwds: dict[str, Any], col_num: int, label: str
+    ):
+        """
+        Manage style and color based on column number and its label.
+        Returns tuple of appropriate style and kwds which "color" may be added.
+        """
+        style = None
+        if self.style is not None:
+            if isinstance(self.style, list):
+                try:
+                    style = self.style[col_num]
+                except IndexError:
+                    pass
+            elif isinstance(self.style, dict):
+                style = self.style.get(label, style)
+            else:
+                style = self.style
+
+        has_color = "color" in kwds or self.colormap is not None
+        nocolor_style = style is None or not _color_in_style(style)
+        if (has_color or self.subplots) and nocolor_style:
+            if isinstance(colors, dict):
+                kwds["color"] = colors[label]
+            else:
+                kwds["color"] = colors[col_num % len(colors)]
+        return style, kwds
+
+    def _get_colors(
+        self,
+        num_colors: int | None = None,
+        color_kwds: str = "color",
+    ):
+        if num_colors is None:
+            num_colors = self.nseries
+        if color_kwds == "color":
+            color = self.color
+        else:
+            color = self.kwds.get(color_kwds)
+        return get_standard_colors(
+            num_colors=num_colors,
+            colormap=self.colormap,
+            color=color,
+        )
+
+    # TODO: tighter typing for first return?
+    @final
+    @staticmethod
+    def _parse_errorbars(
+        label: str, err, data: NDFrameT, nseries: int
+    ) -> tuple[Any, NDFrameT]:
+        """
+        Look for error keyword arguments and return the actual errorbar data
+        or return the error DataFrame/dict
+
+        Error bars can be specified in several ways:
+            Series: the user provides a pandas.Series object of the same
+                    length as the data
+            ndarray: provides an np.ndarray of the same length as the data
+            DataFrame/dict: error values are paired with keys matching the
+                    key in the plotted DataFrame
+            str: the name of the column within the plotted DataFrame
+
+        Asymmetrical error bars are also supported, however raw error values
+        must be provided in this case. For an ``N`` length :class:`Series`, a
+        ``2xN`` array should be provided indicating lower and upper (or left
+        and right) errors. For an ``MxN`` :class:`DataFrame`, asymmetrical errors
+        should be in an ``Mx2xN`` array.
+        """
+        if err is None:
+            return None, data
+
+        def match_labels(data, e):
+            e = e.reindex(data.index)
+            return e
+
+        # key-matched DataFrame
+        if isinstance(err, ABCDataFrame):
+            err = match_labels(data, err)
+        # key-matched dict
+        elif isinstance(err, dict):
+            pass
+
+        # Series of error values
+        elif isinstance(err, ABCSeries):
+            # broadcast error series across data
+            err = match_labels(data, err)
+            err = np.atleast_2d(err)
+            err = np.tile(err, (nseries, 1))
+
+        # errors are a column in the dataframe
+        elif isinstance(err, str):
+            evalues = data[err].values
+            data = data[data.columns.drop(err)]
+            err = np.atleast_2d(evalues)
+            err = np.tile(err, (nseries, 1))
+
+        elif is_list_like(err):
+            if is_iterator(err):
+                err = np.atleast_2d(list(err))
+            else:
+                # raw error values
+                err = np.atleast_2d(err)
+
+            err_shape = err.shape
+
+            # asymmetrical error bars
+            if isinstance(data, ABCSeries) and err_shape[0] == 2:
+                err = np.expand_dims(err, 0)
+                err_shape = err.shape
+                if err_shape[2] != len(data):
+                    raise ValueError(
+                        "Asymmetrical error bars should be provided "
+                        f"with the shape (2, {len(data)})"
+                    )
+            elif isinstance(data, ABCDataFrame) and err.ndim == 3:
+                if (
+                    (err_shape[0] != nseries)
+                    or (err_shape[1] != 2)
+                    or (err_shape[2] != len(data))
+                ):
+                    raise ValueError(
+                        "Asymmetrical error bars should be provided "
+                        f"with the shape ({nseries}, 2, {len(data)})"
+                    )
+
+            # broadcast errors to each data series
+            if len(err) == 1:
+                err = np.tile(err, (nseries, 1))
+
+        elif is_number(err):
+            err = np.tile(
+                [err],
+                (nseries, len(data)),
+            )
+
+        else:
+            msg = f"No valid {label} detected"
+            raise ValueError(msg)
+
+        return err, data
+
+    @final
+    def _get_errorbars(
+        self, label=None, index=None, xerr: bool = True, yerr: bool = True
+    ) -> dict[str, Any]:
+        errors = {}
+
+        for kw, flag in zip(["xerr", "yerr"], [xerr, yerr], strict=True):
+            if flag:
+                err = self.errors[kw]
+                # user provided label-matched dataframe of errors
+                if isinstance(err, (ABCDataFrame, dict)):
+                    if label is not None and label in err.keys():
+                        err = err[label]
+                    else:
+                        err = None
+                elif index is not None and err is not None:
+                    err = err[index]
+
+                if err is not None:
+                    errors[kw] = err
+        return errors
+
+    @final
+    def _get_subplots(self, fig: Figure) -> list[Axes]:
+        return [
+            ax
+            for ax in fig.get_axes()
+            if (isinstance(ax, mpl.axes.Axes) and ax.get_subplotspec() is not None)
+        ]
+
+    @final
+    def _get_axes_layout(self, fig: Figure) -> tuple[int, int]:
+        axes = self._get_subplots(fig)
+        x_set = set()
+        y_set = set()
+        for ax in axes:
+            # check axes coordinates to estimate layout
+            points = ax.get_position().get_points()
+            x_set.add(points[0][0])
+            y_set.add(points[0][1])
+        return (len(y_set), len(x_set))
+
+
+class PlanePlot(MPLPlot, ABC):
+    """
+    Abstract class for plotting on plane, currently scatter and hexbin.
+    """
+
+    _layout_type = "single"
+
+    def __init__(self, data, x, y, **kwargs) -> None:
+        MPLPlot.__init__(self, data, **kwargs)
+        if x is None or y is None:
+            raise ValueError(self._kind + " requires an x and y column")
+        if is_integer(x) and not holds_integer(self.data.columns):
+            x = self.data.columns[x]
+        if is_integer(y) and not holds_integer(self.data.columns):
+            y = self.data.columns[y]
+
+        self.x = x
+        self.y = y
+
+    @final
+    def _get_nseries(self, data: Series | DataFrame) -> int:
+        return 1
+
+    @final
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        x, y = self.x, self.y
+        xlabel = self.xlabel if self.xlabel is not None else pprint_thing(x)
+        ylabel = self.ylabel if self.ylabel is not None else pprint_thing(y)
+        # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible
+        # type "Hashable"; expected "str"
+        ax.set_xlabel(xlabel)  # type: ignore[arg-type]
+        ax.set_ylabel(ylabel)  # type: ignore[arg-type]
+
+    @final
+    def _plot_colorbar(self, ax: Axes, *, fig: Figure, **kwds):
+        # Addresses issues #10611 and #10678:
+        # When plotting scatterplots and hexbinplots in IPython
+        # inline backend the colorbar axis height tends not to
+        # exactly match the parent axis height.
+        # The difference is due to small fractional differences
+        # in floating points with similar representation.
+        # To deal with this, this method forces the colorbar
+        # height to take the height of the parent axes.
+        # For a more detailed description of the issue
+        # see the following link:
+        # https://github.com/ipython/ipython/issues/11215
+
+        # GH33389, if ax is used multiple times, we should always
+        # use the last one which contains the latest information
+        # about the ax
+        img = ax.collections[-1]
+        return fig.colorbar(img, ax=ax, **kwds)
+
+
+class ScatterPlot(PlanePlot):
+    @property
+    def _kind(self) -> Literal["scatter"]:
+        return "scatter"
+
+    def __init__(
+        self,
+        data,
+        x,
+        y,
+        s=None,
+        c=None,
+        *,
+        colorbar: bool | lib.NoDefault = lib.no_default,
+        norm=None,
+        **kwargs,
+    ) -> None:
+        if s is None:
+            # hide the matplotlib default for size, in case we want to change
+            # the handling of this argument later
+            s = 20
+        elif is_hashable(s) and s in data.columns:
+            s = data[s]
+        self.s = s
+
+        self.colorbar = colorbar
+        self.norm = norm
+
+        super().__init__(data, x, y, **kwargs)
+        if is_integer(c) and not holds_integer(self.data.columns):
+            c = self.data.columns[c]
+        self.c = c
+
+    @register_pandas_matplotlib_converters
+    def _make_plot(self, fig: Figure) -> None:
+        x, y, c, data = self.x, self.y, self.c, self.data
+        ax = self.axes[0]
+
+        from pandas import Series
+
+        x_data = data[x]
+        s = Series(index=x_data)
+        if use_dynamic_x(ax, s.index):
+            s = maybe_convert_index(ax, s)
+            freq, s = prepare_ts_data(s, ax, self.kwds)
+            x_data = s.index
+
+        c_is_column = is_hashable(c) and c in self.data.columns
+
+        color_by_categorical = c_is_column and isinstance(
+            self.data[c].dtype, CategoricalDtype
+        )
+
+        color = self.color
+        c_values = self._get_c_values(color, color_by_categorical, c_is_column)
+        norm, cmap = self._get_norm_and_cmap(c_values, color_by_categorical)
+        cb = self._get_colorbar(c_values, c_is_column)
+
+        if self.legend:
+            label = self.label
+        else:
+            label = None
+
+        # if a list of non-color strings is passed in as c, color points
+        # by uniqueness of the strings, such same strings get same color
+        create_colors = not self._are_valid_colors(c_values)
+        if create_colors:
+            color_mapping = self._get_color_mapping(c_values)
+            c_values = [color_mapping[s] for s in c_values]
+
+            # build legend for labeling custom colors
+            ax.legend(
+                handles=[
+                    mpl.patches.Circle((0, 0), facecolor=c, label=s)
+                    for s, c in color_mapping.items()
+                ]
+            )
+
+        scatter = ax.scatter(
+            x_data.values,
+            data[y].values,
+            c=c_values,
+            label=label,
+            cmap=cmap,
+            norm=norm,
+            s=self.s,
+            **self.kwds,
+        )
+
+        if cb:
+            cbar_label = c if c_is_column else ""
+            cbar = self._plot_colorbar(ax, fig=fig, label=cbar_label)
+            if color_by_categorical:
+                n_cats = len(self.data[c].cat.categories)
+                cbar.set_ticks(np.linspace(0.5, n_cats - 0.5, n_cats))
+                cbar.ax.set_yticklabels(self.data[c].cat.categories)
+
+        if label is not None:
+            self._append_legend_handles_labels(
+                # error: Argument 2 to "_append_legend_handles_labels" of
+                # "MPLPlot" has incompatible type "Hashable"; expected "str"
+                scatter,
+                label,  # type: ignore[arg-type]
+            )
+
+        errors_x = self._get_errorbars(label=x, index=0, yerr=False)
+        errors_y = self._get_errorbars(label=y, index=0, xerr=False)
+        if len(errors_x) > 0 or len(errors_y) > 0:
+            err_kwds = dict(errors_x, **errors_y)
+            err_kwds["ecolor"] = scatter.get_facecolor()[0]
+            ax.errorbar(data[x].values, data[y].values, linestyle="none", **err_kwds)
+
+    def _get_c_values(self, color, color_by_categorical: bool, c_is_column: bool):
+        c = self.c
+        if c is not None and color is not None:
+            raise TypeError("Specify exactly one of `c` and `color`")
+        if c is None and color is None:
+            c_values = mpl.rcParams["patch.facecolor"]
+        elif color is not None:
+            c_values = color
+        elif color_by_categorical:
+            c_values = self.data[c].cat.codes
+        elif c_is_column:
+            c_values = self.data[c].values
+        else:
+            c_values = c
+        return c_values
+
+    def _are_valid_colors(self, c_values: Series) -> bool:
+        # check if c_values contains strings and if these strings are valid mpl colors.
+        # no need to check numerics as these (and mpl colors) will be validated for us
+        # in .Axes.scatter._parse_scatter_color_args(...)
+        unique = np.unique(c_values)
+        try:
+            if len(c_values) and all(isinstance(c, str) for c in unique):
+                mpl.colors.to_rgba_array(unique)
+
+            return True
+
+        except (TypeError, ValueError) as _:
+            return False
+
+    def _get_color_mapping(self, c_values: Series) -> dict[str, np.ndarray]:
+        unique = np.unique(c_values)
+        n_colors = len(unique)
+
+        # passing `None` here will default to :rc:`image.cmap`
+        cmap = mpl.colormaps.get_cmap(self.colormap)
+        colors = cmap(np.linspace(0, 1, n_colors))  # RGB tuples
+
+        return dict(zip(unique, colors, strict=True))
+
+    def _get_norm_and_cmap(self, c_values, color_by_categorical: bool):
+        c = self.c
+        if self.colormap is not None:
+            cmap = mpl.colormaps.get_cmap(self.colormap)
+        # cmap is only used if c_values are integers, otherwise UserWarning.
+        # GH-53908: additionally call isinstance() because is_integer_dtype
+        # returns True for "b" (meaning "blue" and not int8 in this context)
+        elif not isinstance(c_values, str) and is_integer_dtype(c_values):
+            # pandas uses colormap, matplotlib uses cmap.
+            cmap = mpl.colormaps["Greys"]
+        else:
+            cmap = None
+
+        if color_by_categorical and cmap is not None:
+            n_cats = len(self.data[c].cat.categories)
+            cmap = mpl.colors.ListedColormap([cmap(i) for i in range(cmap.N)])
+            bounds = np.linspace(0, n_cats, n_cats + 1)
+            norm = mpl.colors.BoundaryNorm(bounds, cmap.N)
+            # TODO: warn that we are ignoring self.norm if user specified it?
+            #  Doesn't happen in any tests 2023-11-09
+        else:
+            norm = self.norm
+        return norm, cmap
+
+    def _get_colorbar(self, c_values, c_is_column: bool) -> bool:
+        # plot colorbar if
+        # 1. colormap is assigned, and
+        # 2.`c` is a column containing only numeric values
+        plot_colorbar = self.colormap or c_is_column
+        cb = self.colorbar
+        if cb is lib.no_default:
+            return is_numeric_dtype(c_values) and plot_colorbar
+        return cb
+
+
+class HexBinPlot(PlanePlot):
+    @property
+    def _kind(self) -> Literal["hexbin"]:
+        return "hexbin"
+
+    def __init__(self, data, x, y, C=None, *, colorbar: bool = True, **kwargs) -> None:
+        super().__init__(data, x, y, **kwargs)
+        if is_integer(C) and not holds_integer(self.data.columns):
+            C = self.data.columns[C]
+        self.C = C
+
+        self.colorbar = colorbar
+
+        # Scatter plot allows to plot objects data
+        if len(self.data[self.x]._get_numeric_data()) == 0:
+            raise ValueError(self._kind + " requires x column to be numeric")
+        if len(self.data[self.y]._get_numeric_data()) == 0:
+            raise ValueError(self._kind + " requires y column to be numeric")
+
+    def _make_plot(self, fig: Figure) -> None:
+        x, y, data, C = self.x, self.y, self.data, self.C
+        ax = self.axes[0]
+        # pandas uses colormap, matplotlib uses cmap.
+        cmap = self.colormap or "BuGn"
+        cmap = mpl.colormaps.get_cmap(cmap)
+        cb = self.colorbar
+
+        if C is None:
+            c_values = None
+        else:
+            c_values = data[C].values
+
+        ax.hexbin(data[x].values, data[y].values, C=c_values, cmap=cmap, **self.kwds)
+        if cb:
+            self._plot_colorbar(ax, fig=fig)
+
+    def _make_legend(self) -> None:
+        pass
+
+
+class LinePlot(MPLPlot):
+    _default_rot = 0
+
+    @property
+    def orientation(self) -> PlottingOrientation:
+        return "vertical"
+
+    @property
+    def _kind(self) -> Literal["line", "area", "hist", "kde", "box"]:
+        return "line"
+
+    def __init__(self, data, **kwargs) -> None:
+        MPLPlot.__init__(self, data, **kwargs)
+        if self.stacked:
+            self.data = self.data.fillna(value=0)
+
+    def _make_plot(self, fig: Figure) -> None:
+        if self._is_ts_plot():
+            data = maybe_convert_index(self._get_ax(0), self.data)
+
+            x = data.index  # dummy, not used
+            plotf = self._ts_plot
+            it = data.items()
+        else:
+            x = self._get_xticks()
+            # error: Incompatible types in assignment (expression has type
+            # "Callable[[Any, Any, Any, Any, Any, Any, KwArg(Any)], Any]", variable has
+            # type "Callable[[Any, Any, Any, Any, KwArg(Any)], Any]")
+            plotf = self._plot  # type: ignore[assignment]
+            # error: Incompatible types in assignment (expression has type
+            # "Iterator[tuple[Hashable, ndarray[Any, Any]]]", variable has
+            # type "Iterable[tuple[Hashable, Series]]")
+            it = self._iter_data(data=self.data)  # type: ignore[assignment]
+
+        stacking_id = self._get_stacking_id()
+        is_errorbar = com.any_not_none(*self.errors.values())
+
+        colors = self._get_colors()
+        for i, (label, y) in enumerate(it):
+            ax = self._get_ax(i)
+            kwds = self.kwds.copy()
+            if self.color is not None:
+                kwds["color"] = self.color
+            style, kwds = self._apply_style_colors(
+                colors,
+                kwds,
+                i,
+                # error: Argument 4 to "_apply_style_colors" of "MPLPlot" has
+                # incompatible type "Hashable"; expected "str"
+                label,  # type: ignore[arg-type]
+            )
+
+            errors = self._get_errorbars(label=label, index=i)
+            kwds = dict(kwds, **errors)
+
+            label = pprint_thing(label)
+            label = self._mark_right_label(label, index=i)
+            kwds["label"] = label
+
+            newlines = plotf(
+                ax,
+                x,
+                y,
+                style=style,
+                column_num=i,
+                stacking_id=stacking_id,
+                is_errorbar=is_errorbar,
+                **kwds,
+            )
+            self._append_legend_handles_labels(newlines[0], label)
+
+            if self._is_ts_plot():
+                # reset of xlim should be used for ts data
+                # TODO: GH28021, should find a way to change view limit on xaxis
+                lines = get_all_lines(ax)
+                left, right = get_xlim(lines)
+                ax.set_xlim(left, right)
+
+    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
+    @classmethod
+    def _plot(  # type: ignore[override]
+        cls,
+        ax: Axes,
+        x,
+        y: np.ndarray,
+        style=None,
+        column_num=None,
+        stacking_id=None,
+        **kwds,
+    ):
+        # column_num is used to get the target column from plotf in line and
+        # area plots
+        if column_num == 0:
+            cls._initialize_stacker(ax, stacking_id, len(y))
+        y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"])
+        lines = MPLPlot._plot(ax, x, y_values, style=style, **kwds)
+        cls._update_stacker(ax, stacking_id, y)
+        return lines
+
+    @final
+    def _ts_plot(self, ax: Axes, x, data: Series, style=None, **kwds):
+        # accept x to be consistent with normal plot func,
+        # x is not passed to tsplot as it uses data.index as x coordinate
+        # column_num must be in kwds for stacking purpose
+        freq, data = prepare_ts_data(data, ax, kwds)
+
+        # TODO #54485
+        ax._plot_data.append((data, self._kind, kwds))  # type: ignore[attr-defined]
+
+        lines = self._plot(ax, data.index, np.asarray(data.values), style=style, **kwds)
+        # set date formatter, locators and rescale limits
+        # TODO #54485
+        format_dateaxis(ax, ax.freq, data.index)  # type: ignore[arg-type, attr-defined]
+        return lines
+
+    @final
+    def _get_stacking_id(self) -> int | None:
+        if self.stacked:
+            return id(self.data)
+        else:
+            return None
+
+    @final
+    @classmethod
+    def _initialize_stacker(cls, ax: Axes, stacking_id, n: int) -> None:
+        if stacking_id is None:
+            return
+        if not hasattr(ax, "_stacker_pos_prior"):
+            # TODO #54485
+            ax._stacker_pos_prior = {}  # type: ignore[attr-defined]
+        if not hasattr(ax, "_stacker_neg_prior"):
+            # TODO #54485
+            ax._stacker_neg_prior = {}  # type: ignore[attr-defined]
+        # TODO #54485
+        ax._stacker_pos_prior[stacking_id] = np.zeros(n)  # type: ignore[attr-defined]
+        # TODO #54485
+        ax._stacker_neg_prior[stacking_id] = np.zeros(n)  # type: ignore[attr-defined]
+
+    @final
+    @classmethod
+    def _get_stacked_values(
+        cls, ax: Axes, stacking_id: int | None, values: np.ndarray, label
+    ) -> np.ndarray:
+        if stacking_id is None:
+            return values
+        if not hasattr(ax, "_stacker_pos_prior"):
+            # stacker may not be initialized for subplots
+            cls._initialize_stacker(ax, stacking_id, len(values))
+
+        if (values >= 0).all():
+            # TODO #54485
+            return (
+                ax._stacker_pos_prior[stacking_id]  # type: ignore[attr-defined]
+                + values
+            )
+        elif (values <= 0).all():
+            # TODO #54485
+            return (
+                ax._stacker_neg_prior[stacking_id]  # type: ignore[attr-defined]
+                + values
+            )
+
+        raise ValueError(
+            "When stacked is True, each column must be either "
+            "all positive or all negative. "
+            f"Column '{label}' contains both positive and negative values"
+        )
+
+    @final
+    @classmethod
+    def _update_stacker(cls, ax: Axes, stacking_id: int | None, values) -> None:
+        if stacking_id is None:
+            return
+        if (values >= 0).all():
+            # TODO #54485
+            ax._stacker_pos_prior[stacking_id] += values  # type: ignore[attr-defined]
+        elif (values <= 0).all():
+            # TODO #54485
+            ax._stacker_neg_prior[stacking_id] += values  # type: ignore[attr-defined]
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        def get_label(i):
+            if is_float(i) and i.is_integer():
+                i = int(i)
+            try:
+                return pprint_thing(data.index[i])
+            except Exception:
+                return ""
+
+        if self._need_to_set_index:
+            xticks = ax.get_xticks()
+            xticklabels = [get_label(x) for x in xticks]
+            # error: Argument 1 to "FixedLocator" has incompatible type "ndarray[Any,
+            # Any]"; expected "Sequence[float]"
+            ax.xaxis.set_major_locator(mpl.ticker.FixedLocator(xticks))  # type: ignore[arg-type]
+            ax.set_xticklabels(xticklabels)
+
+        # If the index is an irregular time series, then by default
+        # we rotate the tick labels. The exception is if there are
+        # subplots which don't share their x-axes, in which we case
+        # we don't rotate the ticklabels as by default the subplots
+        # would be too close together.
+        condition = (
+            not self._use_dynamic_x()
+            and (data.index._is_all_dates and self.use_index)
+            and (not self.subplots or (self.subplots and self.sharex))
+        )
+
+        index_name = self._get_index_name()
+
+        if condition:
+            # irregular TS rotated 30 deg. by default
+            # probably a better place to check / set this.
+            if not self._rot_set:
+                self.rot = 30
+            format_date_labels(ax, rot=self.rot)
+
+        if index_name is not None and self.use_index:
+            ax.set_xlabel(index_name)
+
+
+class AreaPlot(LinePlot):
+    @property
+    def _kind(self) -> Literal["area"]:
+        return "area"
+
+    def __init__(self, data, **kwargs) -> None:
+        kwargs.setdefault("stacked", True)
+        data = data.fillna(value=0)
+        LinePlot.__init__(self, data, **kwargs)
+
+        if not self.stacked:
+            # use smaller alpha to distinguish overlap
+            self.kwds.setdefault("alpha", 0.5)
+
+        if self.logy or self.loglog:
+            raise ValueError("Log-y scales are not supported in area plot")
+
+    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
+    @classmethod
+    def _plot(  # type: ignore[override]
+        cls,
+        ax: Axes,
+        x,
+        y: np.ndarray,
+        style=None,
+        column_num=None,
+        stacking_id=None,
+        is_errorbar: bool = False,
+        **kwds,
+    ):
+        if column_num == 0:
+            cls._initialize_stacker(ax, stacking_id, len(y))
+        y_values = cls._get_stacked_values(ax, stacking_id, y, kwds["label"])
+
+        # need to remove label, because subplots uses mpl legend as it is
+        line_kwds = kwds.copy()
+        line_kwds.pop("label")
+        lines = MPLPlot._plot(ax, x, y_values, style=style, **line_kwds)
+
+        # get data from the line to get coordinates for fill_between
+        xdata, y_values = lines[0].get_data(orig=False)
+
+        # unable to use ``_get_stacked_values`` here to get starting point
+        if stacking_id is None:
+            start = np.zeros(len(y))
+        elif (y >= 0).all():
+            # TODO #54485
+            start = ax._stacker_pos_prior[stacking_id]  # type: ignore[attr-defined]
+        elif (y <= 0).all():
+            # TODO #54485
+            start = ax._stacker_neg_prior[stacking_id]  # type: ignore[attr-defined]
+        else:
+            start = np.zeros(len(y))
+
+        if "color" not in kwds:
+            kwds["color"] = lines[0].get_color()
+
+        rect = ax.fill_between(xdata, start, y_values, **kwds)
+        cls._update_stacker(ax, stacking_id, y)
+
+        # LinePlot expects list of artists
+        res = [rect]
+        return res
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        LinePlot._post_plot_logic(self, ax, data)
+
+        is_shared_y = len(list(ax.get_shared_y_axes())) > 0
+        # do not override the default axis behaviour in case of shared y axes
+        if self.ylim is None and not is_shared_y:
+            if (data >= 0).all().all():
+                ax.set_ylim(0, None)
+            elif (data <= 0).all().all():
+                ax.set_ylim(None, 0)
+
+
+class BarPlot(MPLPlot):
+    @property
+    def _kind(self) -> Literal["bar", "barh"]:
+        return "bar"
+
+    _default_rot = 90
+
+    @property
+    def orientation(self) -> PlottingOrientation:
+        return "vertical"
+
+    def __init__(
+        self,
+        data,
+        *,
+        align="center",
+        bottom=0,
+        left=0,
+        width=0.5,
+        position=0.5,
+        log=False,
+        **kwargs,
+    ) -> None:
+        # we have to treat a series differently than a
+        # 1-column DataFrame w.r.t. color handling
+        self._is_series = isinstance(data, ABCSeries)
+        self.bar_width = width
+        self._align = align
+        self._position = position
+
+        if is_list_like(bottom):
+            bottom = np.array(bottom)
+        if is_list_like(left):
+            left = np.array(left)
+        self.bottom = bottom
+        self.left = left
+
+        self.log = log
+
+        MPLPlot.__init__(self, data, **kwargs)
+
+        if self._is_ts_plot():
+            self.tick_pos = np.array(
+                PeriodConverter.convert_from_freq(
+                    self._get_xticks(),
+                    data.index.freq,
+                )
+            )
+        else:
+            self.tick_pos = np.arange(len(data))
+
+    @cache_readonly
+    def ax_pos(self) -> np.ndarray:
+        return self.tick_pos - self.tickoffset
+
+    @cache_readonly
+    def tickoffset(self):
+        if self.stacked or self.subplots:
+            return self.bar_width * self._position
+        elif self._align == "edge":
+            w = self.bar_width / self.nseries
+            return self.bar_width * (self._position - 0.5) + w * 0.5
+        else:
+            return self.bar_width * self._position
+
+    @cache_readonly
+    def lim_offset(self):
+        if self.stacked or self.subplots:
+            if self._align == "edge":
+                return self.bar_width / 2
+            else:
+                return 0
+        elif self._align == "edge":
+            w = self.bar_width / self.nseries
+            return w * 0.5
+        else:
+            return 0
+
+    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
+    @classmethod
+    @register_pandas_matplotlib_converters
+    def _plot(  # type: ignore[override]
+        cls,
+        ax: Axes,
+        x,
+        y: np.ndarray,
+        w,
+        start: int | npt.NDArray[np.intp] = 0,
+        log: bool = False,
+        **kwds,
+    ):
+        return ax.bar(x, y, w, bottom=start, log=log, **kwds)
+
+    @property
+    def _start_base(self):
+        return self.bottom
+
+    def _make_plot(self, fig: Figure) -> None:
+        colors = self._get_colors()
+        ncolors = len(colors)
+
+        pos_prior = neg_prior = np.zeros(len(self.data))
+        K = self.nseries
+
+        data = self.data.fillna(0)
+
+        _stacked_subplots_ind: dict[int, int] = {}
+        _stacked_subplots_offsets = []
+
+        self.subplots: list[Any]
+
+        if not isinstance(self.subplots, bool):
+            if bool(self.subplots) and self.stacked:
+                for i, sub_plot in enumerate(self.subplots):
+                    if len(sub_plot) <= 1:
+                        continue
+                    for plot in sub_plot:
+                        _stacked_subplots_ind[int(plot)] = i
+                    _stacked_subplots_offsets.append([0, 0])
+
+        for i, (label, y) in enumerate(self._iter_data(data=data)):
+            ax = self._get_ax(i)
+            kwds = self.kwds.copy()
+            if self._is_series:
+                kwds["color"] = colors
+            elif isinstance(colors, dict):
+                kwds["color"] = colors[label]
+            else:
+                kwds["color"] = colors[i % ncolors]
+
+            errors = self._get_errorbars(label=label, index=i)
+            kwds = dict(kwds, **errors)
+
+            label = pprint_thing(label)
+            label = self._mark_right_label(label, index=i)
+
+            if (("yerr" in kwds) or ("xerr" in kwds)) and (kwds.get("ecolor") is None):
+                kwds["ecolor"] = mpl.rcParams["xtick.color"]
+
+            start = 0
+            if self.log and (y >= 1).all():
+                start = 1
+            start = start + self._start_base
+
+            kwds["align"] = self._align
+
+            if i in _stacked_subplots_ind:
+                offset_index = _stacked_subplots_ind[i]
+                pos_prior, neg_prior = _stacked_subplots_offsets[offset_index]  # type: ignore[assignment]
+                mask = y >= 0
+                start = np.where(mask, pos_prior, neg_prior) + self._start_base
+                w = self.bar_width / 2
+                rect = self._plot(
+                    ax,
+                    self.ax_pos + w,
+                    y,
+                    self.bar_width,
+                    start=start,
+                    label=label,
+                    log=self.log,
+                    **kwds,
+                )
+                pos_new = pos_prior + np.where(mask, y, 0)
+                neg_new = neg_prior + np.where(mask, 0, y)
+                _stacked_subplots_offsets[offset_index] = [pos_new, neg_new]
+
+            elif self.subplots:
+                w = self.bar_width / 2
+                rect = self._plot(
+                    ax,
+                    self.ax_pos + w,
+                    y,
+                    self.bar_width,
+                    start=start,
+                    label=label,
+                    log=self.log,
+                    **kwds,
+                )
+                ax.set_title(label)
+            elif self.stacked:
+                mask = y >= 0
+                start = np.where(mask, pos_prior, neg_prior) + self._start_base
+                w = self.bar_width / 2
+                rect = self._plot(
+                    ax,
+                    self.ax_pos + w,
+                    y,
+                    self.bar_width,
+                    start=start,
+                    label=label,
+                    log=self.log,
+                    **kwds,
+                )
+                pos_prior = pos_prior + np.where(mask, y, 0)
+                neg_prior = neg_prior + np.where(mask, 0, y)
+            else:
+                w = self.bar_width / K
+                rect = self._plot(
+                    ax,
+                    self.ax_pos + (i + 0.5) * w,
+                    y,
+                    w,
+                    start=start,
+                    label=label,
+                    log=self.log,
+                    **kwds,
+                )
+            self._append_legend_handles_labels(rect, label)
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        if self.use_index:
+            str_index = [pprint_thing(key) for key in data.index]
+        else:
+            str_index = [pprint_thing(key) for key in range(data.shape[0])]
+
+        s_edge = self.ax_pos[0] - 0.25 + self.lim_offset
+        e_edge = self.ax_pos[-1] + 0.25 + self.bar_width + self.lim_offset
+
+        self._decorate_ticks(ax, self._get_index_name(), str_index, s_edge, e_edge)
+
+    def _decorate_ticks(
+        self,
+        ax: Axes,
+        name: str | None,
+        ticklabels: list[str],
+        start_edge: float,
+        end_edge: float,
+    ) -> None:
+        ax.set_xlim((start_edge, end_edge))
+
+        if self.xticks is not None:
+            ax.set_xticks(np.array(self.xticks))
+        else:
+            ax.set_xticks(self.tick_pos)
+            ax.set_xticklabels(ticklabels)
+
+        if name is not None and self.use_index:
+            ax.set_xlabel(name)
+
+
+class BarhPlot(BarPlot):
+    @property
+    def _kind(self) -> Literal["barh"]:
+        return "barh"
+
+    _default_rot = 0
+
+    @property
+    def orientation(self) -> Literal["horizontal"]:
+        return "horizontal"
+
+    @property
+    def _start_base(self):
+        return self.left
+
+    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
+    @classmethod
+    def _plot(  # type: ignore[override]
+        cls,
+        ax: Axes,
+        x,
+        y: np.ndarray,
+        w,
+        start: int | npt.NDArray[np.intp] = 0,
+        log: bool = False,
+        **kwds,
+    ):
+        return ax.barh(x, y, w, left=start, log=log, **kwds)
+
+    def _get_custom_index_name(self):
+        return self.ylabel
+
+    def _decorate_ticks(
+        self,
+        ax: Axes,
+        name: str | None,
+        ticklabels: list[str],
+        start_edge: float,
+        end_edge: float,
+    ) -> None:
+        # horizontal bars
+        ax.set_ylim((start_edge, end_edge))
+        ax.set_yticks(self.tick_pos)
+        ax.set_yticklabels(ticklabels)
+        if name is not None and self.use_index:
+            ax.set_ylabel(name)
+        # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible type
+        # "Hashable | None"; expected "str"
+        ax.set_xlabel(self.xlabel)  # type: ignore[arg-type]
+
+
+class PiePlot(MPLPlot):
+    @property
+    def _kind(self) -> Literal["pie"]:
+        return "pie"
+
+    _layout_type = "horizontal"
+
+    def __init__(self, data: Series | DataFrame, kind=None, **kwargs) -> None:
+        data = data.fillna(value=0)
+        lt_zero = data < 0
+        if isinstance(data, ABCDataFrame) and lt_zero.any().any():
+            raise ValueError(f"{self._kind} plot doesn't allow negative values")
+        elif isinstance(data, ABCSeries) and lt_zero.any():
+            raise ValueError(f"{self._kind} plot doesn't allow negative values")
+        MPLPlot.__init__(self, data, kind=kind, **kwargs)
+
+    @classmethod
+    def _validate_log_kwd(
+        cls,
+        kwd: str,
+        value: bool | None | Literal["sym"],
+    ) -> bool | None | Literal["sym"]:
+        super()._validate_log_kwd(kwd=kwd, value=value)
+        if value is not False:
+            warnings.warn(
+                f"PiePlot ignores the '{kwd}' keyword",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+        return False
+
+    def _validate_color_args(self, color, colormap) -> None:
+        # TODO: warn if color is passed and ignored?
+        return None
+
+    def _make_plot(self, fig: Figure) -> None:
+        colors = self._get_colors(num_colors=len(self.data), color_kwds="colors")
+        self.kwds.setdefault("colors", colors)
+
+        for i, (label, y) in enumerate(self._iter_data(data=self.data)):
+            ax = self._get_ax(i)
+
+            kwds = self.kwds.copy()
+
+            def blank_labeler(label, value):
+                if value == 0:
+                    return ""
+                else:
+                    return label
+
+            idx = [pprint_thing(v) for v in self.data.index]
+            labels = kwds.pop("labels", idx)
+            # labels is used for each wedge's labels
+            # Blank out labels for values of 0 so they don't overlap
+            # with nonzero wedges
+            if labels is not None:
+                blabels = [
+                    blank_labeler(left, value)
+                    for left, value in zip(labels, y, strict=True)
+                ]
+            else:
+                blabels = None
+            results = ax.pie(y, labels=blabels, **kwds)
+
+            if kwds.get("autopct", None) is not None:
+                # error: Need more than 2 values to unpack (3 expected)
+                patches, texts, autotexts = results  # type: ignore[misc]
+            else:
+                # error: Too many values to unpack (2 expected, 3 provided)
+                patches, texts = results  # type: ignore[misc]
+                autotexts = []
+
+            if self.fontsize is not None:
+                for t in texts + autotexts:
+                    t.set_fontsize(self.fontsize)
+
+            # leglabels is used for legend labels
+            leglabels = labels if labels is not None else idx
+            for _patch, _leglabel in zip(patches, leglabels, strict=True):
+                self._append_legend_handles_labels(_patch, _leglabel)
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        pass
diff --git a/pandas/plotting/_matplotlib/groupby.py b/pandas/plotting/_matplotlib/groupby.py
new file mode 100644
index 0000000000000000000000000000000000000000..783f79710097c7e471d3f531fac3be8cd711014a
--- /dev/null
+++ b/pandas/plotting/_matplotlib/groupby.py
@@ -0,0 +1,141 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from pandas.core.dtypes.missing import remove_na_arraylike
+
+from pandas import (
+    MultiIndex,
+    concat,
+)
+
+from pandas.plotting._matplotlib.misc import unpack_single_str_list
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable
+
+    from pandas._typing import IndexLabel
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+def create_iter_data_given_by(
+    data: DataFrame, kind: str = "hist"
+) -> dict[Hashable, DataFrame | Series]:
+    """
+    Create data for iteration given `by` is assigned or not, and it is only
+    used in both hist and boxplot.
+
+    If `by` is assigned, return a dictionary of DataFrames in which the key of
+    dictionary is the values in groups.
+    If `by` is not assigned, return input as is, and this preserves current
+    status of iter_data.
+
+    Parameters
+    ----------
+    data : reformatted grouped data from `_compute_plot_data` method.
+    kind : str, plot kind. This function is only used for `hist` and `box` plots.
+
+    Returns
+    -------
+    iter_data : DataFrame or Dictionary of DataFrames
+
+    Examples
+    --------
+    If `by` is assigned:
+
+    >>> import numpy as np
+    >>> tuples = [("h1", "a"), ("h1", "b"), ("h2", "a"), ("h2", "b")]
+    >>> mi = pd.MultiIndex.from_tuples(tuples)
+    >>> value = [[1, 3, np.nan, np.nan], [3, 4, np.nan, np.nan], [np.nan, np.nan, 5, 6]]
+    >>> data = pd.DataFrame(value, columns=mi)
+    >>> create_iter_data_given_by(data)
+    {'h1':     h1
+         a    b
+    0  1.0  3.0
+    1  3.0  4.0
+    2  NaN  NaN, 'h2':     h2
+         a    b
+    0  NaN  NaN
+    1  NaN  NaN
+    2  5.0  6.0}
+    """
+
+    # For `hist` plot, before transformation, the values in level 0 are values
+    # in groups and subplot titles, and later used for column subselection and
+    # iteration; For `box` plot, values in level 1 are column names to show,
+    # and are used for iteration and as subplots titles.
+    if kind == "hist":
+        level = 0
+    else:
+        level = 1
+
+    # Select sub-columns based on the value of level of MI, and if `by` is
+    # assigned, data must be a MI DataFrame
+    assert isinstance(data.columns, MultiIndex)
+    return {
+        col: data.loc[:, data.columns.get_level_values(level) == col]
+        for col in data.columns.levels[level]
+    }
+
+
+def reconstruct_data_with_by(
+    data: DataFrame, by: IndexLabel, cols: IndexLabel
+) -> DataFrame:
+    """
+    Internal function to group data, and reassign multiindex column names onto the
+    result in order to let grouped data be used in _compute_plot_data method.
+
+    Parameters
+    ----------
+    data : Original DataFrame to plot
+    by : grouped `by` parameter selected by users
+    cols : columns of data set (excluding columns used in `by`)
+
+    Returns
+    -------
+    Output is the reconstructed DataFrame with MultiIndex columns. The first level
+    of MI is unique values of groups, and second level of MI is the columns
+    selected by users.
+
+    Examples
+    --------
+    >>> d = {"h": ["h1", "h1", "h2"], "a": [1, 3, 5], "b": [3, 4, 6]}
+    >>> df = pd.DataFrame(d)
+    >>> reconstruct_data_with_by(df, by="h", cols=["a", "b"])
+       h1      h2
+       a     b     a     b
+    0  1.0   3.0   NaN   NaN
+    1  3.0   4.0   NaN   NaN
+    2  NaN   NaN   5.0   6.0
+    """
+    by_modified = unpack_single_str_list(by)
+    grouped = data.groupby(by_modified)
+
+    data_list = []
+    for key, group in grouped:
+        # error: List item 1 has incompatible type "Union[Hashable,
+        # Sequence[Hashable]]"; expected "Iterable[Hashable]"
+        columns = MultiIndex.from_product([[key], cols])  # type: ignore[list-item]
+        sub_group = group[cols]
+        sub_group.columns = columns
+        data_list.append(sub_group)
+
+    data = concat(data_list, axis=1)
+    return data
+
+
+def reformat_hist_y_given_by(y: np.ndarray, by: IndexLabel | None) -> np.ndarray:
+    """Internal function to reformat y given `by` is applied or not for hist plot.
+
+    If by is None, input y is 1-d with NaN removed; and if by is not None, groupby
+    will take place and input y is multi-dimensional array.
+    """
+    if by is not None and len(y.shape) > 1:
+        return np.array([remove_na_arraylike(col) for col in y.T]).T
+    return remove_na_arraylike(y)
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
new file mode 100644
index 0000000000000000000000000000000000000000..029db85b315fd5d7849cd4441e26d0e50a046f01
--- /dev/null
+++ b/pandas/plotting/_matplotlib/hist.py
@@ -0,0 +1,574 @@
+from __future__ import annotations
+
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    final,
+)
+
+import numpy as np
+
+from pandas.core.dtypes.common import (
+    is_integer,
+    is_list_like,
+)
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCIndex,
+)
+from pandas.core.dtypes.missing import (
+    isna,
+    remove_na_arraylike,
+)
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._matplotlib.core import (
+    LinePlot,
+    MPLPlot,
+)
+from pandas.plotting._matplotlib.groupby import (
+    create_iter_data_given_by,
+    reformat_hist_y_given_by,
+)
+from pandas.plotting._matplotlib.misc import unpack_single_str_list
+from pandas.plotting._matplotlib.tools import (
+    create_subplots,
+    flatten_axes,
+    maybe_adjust_figure,
+    set_ticks_props,
+)
+
+if TYPE_CHECKING:
+    from matplotlib.axes import Axes
+    from matplotlib.container import BarContainer
+    from matplotlib.figure import Figure
+    from matplotlib.patches import Polygon
+
+    from pandas._typing import PlottingOrientation
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+class HistPlot(LinePlot):
+    @property
+    def _kind(self) -> Literal["hist", "kde"]:
+        return "hist"
+
+    def __init__(
+        self,
+        data,
+        bins: int | np.ndarray | list[np.ndarray] = 10,
+        bottom: int | np.ndarray = 0,
+        *,
+        range=None,
+        weights=None,
+        **kwargs,
+    ) -> None:
+        if is_list_like(bottom):
+            bottom = np.array(bottom)
+        self.bottom = bottom
+
+        self._bin_range = range
+        self.weights = weights
+
+        self.xlabel = kwargs.get("xlabel")
+        self.ylabel = kwargs.get("ylabel")
+        # Do not call LinePlot.__init__ which may fill nan
+        MPLPlot.__init__(self, data, **kwargs)
+
+        self.bins = self._adjust_bins(bins)
+
+    def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]):
+        if is_integer(bins):
+            if self.by is not None:
+                by_modified = unpack_single_str_list(self.by)
+                grouped = self.data.groupby(by_modified)[self.columns]
+                bins = [self._calculate_bins(group, bins) for key, group in grouped]
+            else:
+                bins = self._calculate_bins(self.data, bins)
+        return bins
+
+    def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray:
+        """Calculate bins given data"""
+        nd_values = data.infer_objects()._get_numeric_data()
+        values = nd_values.values
+        if nd_values.ndim == 2:
+            values = values.reshape(-1)
+        values = values[~isna(values)]
+
+        return np.histogram_bin_edges(values, bins=bins, range=self._bin_range)
+
+    # error: Signature of "_plot" incompatible with supertype "LinePlot"
+    @classmethod
+    def _plot(  # type: ignore[override]
+        cls,
+        ax: Axes,
+        y: np.ndarray,
+        style=None,
+        bottom: int | np.ndarray = 0,
+        column_num: int = 0,
+        stacking_id=None,
+        *,
+        bins,
+        **kwds,
+        # might return a subset from the possible return types of Axes.hist(...)[2]?
+    ) -> BarContainer | Polygon | list[BarContainer | Polygon]:
+        if column_num == 0:
+            cls._initialize_stacker(ax, stacking_id, len(bins) - 1)
+
+        base = np.zeros(len(bins) - 1)
+        bottom = bottom + cls._get_stacked_values(ax, stacking_id, base, kwds["label"])
+        # ignore style
+        n, bins, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds)
+        cls._update_stacker(ax, stacking_id, n)
+        return patches
+
+    def _make_plot(self, fig: Figure) -> None:
+        colors = self._get_colors()
+        stacking_id = self._get_stacking_id()
+
+        # Re-create iterated data if `by` is assigned by users
+        data = (
+            create_iter_data_given_by(self.data, self._kind)
+            if self.by is not None
+            else self.data
+        )
+        for i, (label, y) in enumerate(self._iter_data(data=data)):
+            ax = self._get_ax(i)
+
+            kwds = self.kwds.copy()
+            if self.color is not None:
+                kwds["color"] = self.color
+
+            label = pprint_thing(label)
+            label = self._mark_right_label(label, index=i)
+            kwds["label"] = label
+
+            style, kwds = self._apply_style_colors(colors, kwds, i, label)
+            if style is not None:
+                kwds["style"] = style
+
+            self._make_plot_keywords(kwds, y)
+
+            # the bins is multi-dimension array now and each plot need only 1-d and
+            # when by is applied, label should be columns that are grouped
+            if self.by is not None:
+                kwds["bins"] = kwds["bins"][i]
+                kwds["label"] = self.columns
+                kwds.pop("color")
+
+            if self.weights is not None:
+                kwds["weights"] = type(self)._get_column_weights(self.weights, i, y)
+
+            y = reformat_hist_y_given_by(y, self.by)
+
+            artists = self._plot(ax, y, column_num=i, stacking_id=stacking_id, **kwds)
+
+            # when by is applied, show title for subplots to know which group it is
+            if self.by is not None:
+                ax.set_title(pprint_thing(label))
+
+            # error: Value of type "Polygon" is not indexable
+            self._append_legend_handles_labels(artists[0], label)  # type: ignore[index,arg-type]
+
+    def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None:
+        """merge BoxPlot/KdePlot properties to passed kwds"""
+        # y is required for KdePlot
+        kwds["bottom"] = self.bottom
+        kwds["bins"] = self.bins
+
+    @final
+    @staticmethod
+    def _get_column_weights(weights, i: int, y):
+        # We allow weights to be a multi-dimensional array, e.g. a (10, 2) array,
+        # and each sub-array (10,) will be called in each iteration. If users only
+        # provide 1D array, we assume the same weights is used for all iterations
+        if weights is not None:
+            if np.ndim(weights) != 1 and np.shape(weights)[-1] != 1:
+                try:
+                    weights = weights[:, i]
+                except IndexError as err:
+                    raise ValueError(
+                        "weights must have the same shape as data, "
+                        "or be a single column"
+                    ) from err
+            weights = weights[~isna(y)]
+        return weights
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        if self.orientation == "horizontal":
+            # error: Argument 1 to "set_xlabel" of "_AxesBase" has incompatible
+            # type "Hashable"; expected "str"
+            ax.set_xlabel(
+                "Frequency" if self.xlabel is None else self.xlabel  # type: ignore[arg-type]
+            )
+            ax.set_ylabel(self.ylabel)  # type: ignore[arg-type]
+        else:
+            ax.set_xlabel(self.xlabel)  # type: ignore[arg-type]
+            ax.set_ylabel(
+                "Frequency" if self.ylabel is None else self.ylabel  # type: ignore[arg-type]
+            )
+
+    @property
+    def orientation(self) -> PlottingOrientation:
+        if self.kwds.get("orientation", None) == "horizontal":
+            return "horizontal"
+        else:
+            return "vertical"
+
+
+class KdePlot(HistPlot):
+    @property
+    def _kind(self) -> Literal["kde"]:
+        return "kde"
+
+    @property
+    def orientation(self) -> Literal["vertical"]:
+        return "vertical"
+
+    def __init__(
+        self, data, bw_method=None, ind=None, *, weights=None, **kwargs
+    ) -> None:
+        # Do not call LinePlot.__init__ which may fill nan
+        MPLPlot.__init__(self, data, **kwargs)
+        self.bw_method = bw_method
+        self.ind = ind
+        self.weights = weights
+
+    @staticmethod
+    def _get_ind(y: np.ndarray, ind):
+        if ind is None:
+            # np.nanmax() and np.nanmin() ignores the missing values
+            sample_range = np.nanmax(y) - np.nanmin(y)
+            ind = np.linspace(
+                np.nanmin(y) - 0.5 * sample_range,
+                np.nanmax(y) + 0.5 * sample_range,
+                1000,
+            )
+        elif is_integer(ind):
+            sample_range = np.nanmax(y) - np.nanmin(y)
+            ind = np.linspace(
+                np.nanmin(y) - 0.5 * sample_range,
+                np.nanmax(y) + 0.5 * sample_range,
+                ind,
+            )
+        return ind
+
+    @classmethod
+    # error: Signature of "_plot" incompatible with supertype "MPLPlot"
+    def _plot(  # type: ignore[override]
+        cls,
+        ax: Axes,
+        y: np.ndarray,
+        style=None,
+        bw_method=None,
+        weights=None,
+        ind=None,
+        column_num=None,
+        stacking_id: int | None = None,
+        **kwds,
+    ):
+        from scipy.stats import gaussian_kde
+
+        y = remove_na_arraylike(y)
+        gkde = gaussian_kde(y, bw_method=bw_method, weights=weights)
+
+        # gaussian_kde.evaluate(None) raises TypeError, so pyright requires this check
+        assert ind is not None
+        y = gkde.evaluate(ind)
+        lines = MPLPlot._plot(ax, ind, y, style=style, **kwds)
+        return lines
+
+    def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None:
+        kwds["bw_method"] = self.bw_method
+        kwds["ind"] = type(self)._get_ind(y, ind=self.ind)
+
+    def _post_plot_logic(self, ax: Axes, data) -> None:
+        ax.set_ylabel("Density")
+
+
+def _grouped_plot(
+    plotf,
+    data: Series | DataFrame,
+    column=None,
+    by=None,
+    numeric_only: bool = True,
+    figsize: tuple[float, float] | None = None,
+    sharex: bool = True,
+    sharey: bool = True,
+    layout=None,
+    rot: float = 0,
+    ax=None,
+    **kwargs,
+):
+    # error: Non-overlapping equality check (left operand type: "Optional[Tuple[float,
+    # float]]", right operand type: "Literal['default']")
+    if figsize == "default":  # type: ignore[comparison-overlap]
+        # allowed to specify mpl default with 'default'
+        raise ValueError(
+            "figsize='default' is no longer supported. "
+            "Specify figure size by tuple instead"
+        )
+
+    grouped = data.groupby(by)
+    if column is not None:
+        grouped = grouped[column]
+
+    naxes = len(grouped)
+    fig, axes = create_subplots(
+        naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout
+    )
+
+    for ax, (key, group) in zip(flatten_axes(axes), grouped, strict=False):
+        if numeric_only and isinstance(group, ABCDataFrame):
+            group = group._get_numeric_data()
+        plotf(group, ax, **kwargs)
+        ax.set_title(pprint_thing(key))
+
+    return fig, axes
+
+
+def _grouped_hist(
+    data: Series | DataFrame,
+    column=None,
+    by=None,
+    ax=None,
+    bins: int = 50,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    sharex: bool = False,
+    sharey: bool = False,
+    rot: float = 90,
+    grid: bool = True,
+    xlabelsize: int | None = None,
+    xrot=None,
+    ylabelsize: int | None = None,
+    yrot=None,
+    legend: bool = False,
+    **kwargs,
+):
+    """
+    Grouped histogram
+
+    Parameters
+    ----------
+    data : Series/DataFrame
+    column : object, optional
+    by : object, optional
+    ax : axes, optional
+    bins : int, default 50
+    figsize : tuple, optional
+    layout : optional
+    sharex : bool, default False
+    sharey : bool, default False
+    rot : float, default 90
+    grid : bool, default True
+    legend: : bool, default False
+    kwargs : dict, keyword arguments passed to matplotlib.Axes.hist
+
+    Returns
+    -------
+    collection of Matplotlib Axes
+    """
+    if legend:
+        assert "label" not in kwargs
+        if data.ndim == 1:
+            kwargs["label"] = data.name
+        elif column is None:
+            kwargs["label"] = data.columns
+        else:
+            kwargs["label"] = column
+
+    def plot_group(group, ax) -> None:
+        ax.hist(group.dropna().values, bins=bins, **kwargs)
+        if legend:
+            ax.legend()
+
+    if xrot is None:
+        xrot = rot
+
+    fig, axes = _grouped_plot(
+        plot_group,
+        data,
+        column=column,
+        by=by,
+        sharex=sharex,
+        sharey=sharey,
+        ax=ax,
+        figsize=figsize,
+        layout=layout,
+        rot=rot,
+    )
+
+    set_ticks_props(
+        axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
+    )
+
+    maybe_adjust_figure(
+        fig, bottom=0.15, top=0.9, left=0.1, right=0.9, hspace=0.5, wspace=0.3
+    )
+    return axes
+
+
+def hist_series(
+    self: Series,
+    by=None,
+    ax=None,
+    grid: bool = True,
+    xlabelsize: int | None = None,
+    xrot=None,
+    ylabelsize: int | None = None,
+    yrot=None,
+    figsize: tuple[float, float] | None = None,
+    bins: int = 10,
+    legend: bool = False,
+    **kwds,
+):
+    import matplotlib.pyplot as plt
+
+    if legend and "label" in kwds:
+        raise ValueError("Cannot use both legend and label")
+
+    if by is None:
+        if kwds.get("layout", None) is not None:
+            raise ValueError("The 'layout' keyword is not supported when 'by' is None")
+        # hack until the plotting interface is a bit more unified
+        fig = kwds.pop(
+            "figure", plt.gcf() if plt.get_fignums() else plt.figure(figsize=figsize)
+        )
+        if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()):
+            fig.set_size_inches(*figsize, forward=True)
+        if ax is None:
+            ax = fig.gca()
+        elif ax.get_figure() != fig:
+            raise AssertionError("passed axis not bound to passed figure")
+        values = self.dropna().values
+        if legend:
+            kwds["label"] = self.name
+        ax.hist(values, bins=bins, **kwds)
+        if legend:
+            ax.legend()
+        ax.grid(grid)
+        axes = np.array([ax])
+
+        set_ticks_props(
+            axes,
+            xlabelsize=xlabelsize,
+            xrot=xrot,
+            ylabelsize=ylabelsize,
+            yrot=yrot,
+        )
+
+    else:
+        if "figure" in kwds:
+            raise ValueError(
+                "Cannot pass 'figure' when using the "
+                "'by' argument, since a new 'Figure' instance will be created"
+            )
+        axes = _grouped_hist(
+            self,
+            by=by,
+            ax=ax,
+            grid=grid,
+            figsize=figsize,
+            bins=bins,
+            xlabelsize=xlabelsize,
+            xrot=xrot,
+            ylabelsize=ylabelsize,
+            yrot=yrot,
+            legend=legend,
+            **kwds,
+        )
+
+    if hasattr(axes, "ndim"):
+        if axes.ndim == 1 and len(axes) == 1:
+            return axes[0]
+    return axes
+
+
+def hist_frame(
+    data: DataFrame,
+    column=None,
+    by=None,
+    grid: bool = True,
+    xlabelsize: int | None = None,
+    xrot=None,
+    ylabelsize: int | None = None,
+    yrot=None,
+    ax=None,
+    sharex: bool = False,
+    sharey: bool = False,
+    figsize: tuple[float, float] | None = None,
+    layout=None,
+    bins: int = 10,
+    legend: bool = False,
+    **kwds,
+):
+    if legend and "label" in kwds:
+        raise ValueError("Cannot use both legend and label")
+    if by is not None:
+        axes = _grouped_hist(
+            data,
+            column=column,
+            by=by,
+            ax=ax,
+            grid=grid,
+            figsize=figsize,
+            sharex=sharex,
+            sharey=sharey,
+            layout=layout,
+            bins=bins,
+            xlabelsize=xlabelsize,
+            xrot=xrot,
+            ylabelsize=ylabelsize,
+            yrot=yrot,
+            legend=legend,
+            **kwds,
+        )
+        return axes
+
+    if column is not None:
+        if not isinstance(column, (list, np.ndarray, ABCIndex)):
+            column = [column]
+        data = data[column]
+    # GH32590
+    data = data.select_dtypes(
+        include=(np.number, "datetime64", "datetimetz"), exclude="timedelta"
+    )
+    naxes = len(data.columns)
+
+    if naxes == 0:
+        raise ValueError(
+            "hist method requires numerical or datetime columns, nothing to plot."
+        )
+
+    fig, axes = create_subplots(
+        naxes=naxes,
+        ax=ax,
+        squeeze=False,
+        sharex=sharex,
+        sharey=sharey,
+        figsize=figsize,
+        layout=layout,
+    )
+    can_set_label = "label" not in kwds
+
+    for ax, col in zip(flatten_axes(axes), data.columns, strict=False):
+        if legend and can_set_label:
+            kwds["label"] = col
+        ax.hist(data[col].dropna().values, bins=bins, **kwds)
+        ax.set_title(col)
+        ax.grid(grid)
+        if legend:
+            ax.legend()
+
+    set_ticks_props(
+        axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
+    )
+    maybe_adjust_figure(fig, wspace=0.3, hspace=0.3)
+
+    return axes
diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..271b8f1dc7dc9733fa47d6c132389ed5b66e5c24
--- /dev/null
+++ b/pandas/plotting/_matplotlib/misc.py
@@ -0,0 +1,480 @@
+from __future__ import annotations
+
+import random
+from typing import TYPE_CHECKING
+
+import matplotlib as mpl
+import numpy as np
+
+from pandas.core.dtypes.missing import notna
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._matplotlib.style import get_standard_colors
+from pandas.plotting._matplotlib.tools import (
+    create_subplots,
+    do_adjust_figure,
+    maybe_adjust_figure,
+    set_ticks_props,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Hashable
+
+    from matplotlib.axes import Axes
+    from matplotlib.figure import Figure
+
+    from pandas import (
+        DataFrame,
+        Index,
+        Series,
+    )
+
+
+def scatter_matrix(
+    frame: DataFrame,
+    alpha: float = 0.5,
+    figsize: tuple[float, float] | None = None,
+    ax=None,
+    grid: bool = False,
+    diagonal: str = "hist",
+    marker: str = ".",
+    density_kwds=None,
+    hist_kwds=None,
+    range_padding: float = 0.05,
+    **kwds,
+):
+    df = frame._get_numeric_data()
+    n = df.columns.size
+    naxes = n * n
+    fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)
+
+    # no gaps between subplots
+    maybe_adjust_figure(fig, wspace=0, hspace=0)
+
+    mask = notna(df)
+
+    marker = _get_marker_compat(marker)
+
+    hist_kwds = hist_kwds or {}
+    density_kwds = density_kwds or {}
+
+    # GH 14855
+    kwds.setdefault("edgecolors", "none")
+
+    boundaries_list = []
+    for a in df.columns:
+        values = df[a].values[mask[a].values]
+        rmin_, rmax_ = np.min(values), np.max(values)
+        rdelta_ext = (rmax_ - rmin_) * range_padding / 2
+        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
+
+    for i, a in enumerate(df.columns):
+        for j, b in enumerate(df.columns):
+            ax = axes[i, j]
+
+            if i == j:
+                values = df[a].values[mask[a].values]
+
+                # Deal with the diagonal by drawing a histogram there.
+                if diagonal == "hist":
+                    ax.hist(values, **hist_kwds)
+
+                elif diagonal in ("kde", "density"):
+                    from scipy.stats import gaussian_kde
+
+                    y = values
+                    gkde = gaussian_kde(y)
+                    ind = np.linspace(y.min(), y.max(), 1000)
+                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)
+
+                ax.set_xlim(boundaries_list[i])
+
+            else:
+                common = (mask[a] & mask[b]).values
+
+                ax.scatter(
+                    df[b][common], df[a][common], marker=marker, alpha=alpha, **kwds
+                )
+
+                ax.set_xlim(boundaries_list[j])
+                ax.set_ylim(boundaries_list[i])
+
+            ax.set_xlabel(b)
+            ax.set_ylabel(a)
+
+            if j != 0:
+                ax.yaxis.set_visible(False)
+            if i != n - 1:
+                ax.xaxis.set_visible(False)
+
+    if len(df.columns) > 1:
+        lim1 = boundaries_list[0]
+        locs = axes[0][1].yaxis.get_majorticklocs()
+        locs = locs[(lim1[0] <= locs) & (locs <= lim1[1])]
+        adj = (locs - lim1[0]) / (lim1[1] - lim1[0])
+
+        lim0 = axes[0][0].get_ylim()
+        adj = adj * (lim0[1] - lim0[0]) + lim0[0]
+        axes[0][0].yaxis.set_ticks(adj)
+
+        if np.all(locs == locs.astype(int)):
+            # if all ticks are int
+            locs = locs.astype(int)
+        axes[0][0].yaxis.set_ticklabels(locs)
+
+    set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+
+    return axes
+
+
+def _get_marker_compat(marker):
+    if marker not in mpl.lines.lineMarkers:
+        return "o"
+    return marker
+
+
+def radviz(
+    frame: DataFrame,
+    class_column,
+    ax: Axes | None = None,
+    color=None,
+    colormap=None,
+    **kwds,
+) -> Axes:
+    import matplotlib.pyplot as plt
+
+    def normalize(series):
+        a = min(series)
+        b = max(series)
+        return (series - a) / (b - a)
+
+    n = len(frame)
+    classes = frame[class_column].drop_duplicates()
+    class_col = frame[class_column]
+    df = frame.drop(class_column, axis=1).apply(normalize)
+
+    if ax is None:
+        ax = plt.gca()
+        ax.set_xlim(-1, 1)
+        ax.set_ylim(-1, 1)
+
+    to_plot: dict[Hashable, list[list]] = {}
+    colors = get_standard_colors(
+        num_colors=len(classes), colormap=colormap, color_type="random", color=color
+    )
+
+    for kls in classes:
+        to_plot[kls] = [[], []]
+
+    m = len(frame.columns) - 1
+    s = np.array(
+        [(np.cos(t), np.sin(t)) for t in [2 * np.pi * (i / m) for i in range(m)]]
+    )
+
+    for i in range(n):
+        row = df.iloc[i].values
+        row_ = np.repeat(np.expand_dims(row, axis=1), 2, axis=1)
+        y = (s * row_).sum(axis=0) / row.sum()
+        kls = class_col.iat[i]
+        to_plot[kls][0].append(y[0])
+        to_plot[kls][1].append(y[1])
+
+    for i, kls in enumerate(classes):
+        ax.scatter(
+            to_plot[kls][0],
+            to_plot[kls][1],
+            color=colors[i],
+            label=pprint_thing(kls),
+            **kwds,
+        )
+    ax.legend()
+
+    ax.add_patch(mpl.patches.Circle((0.0, 0.0), radius=1.0, facecolor="none"))
+
+    for xy, name in zip(s, df.columns, strict=True):
+        ax.add_patch(mpl.patches.Circle(xy, radius=0.025, facecolor="gray"))
+
+        if xy[0] < 0.0 and xy[1] < 0.0:
+            ax.text(
+                xy[0] - 0.025, xy[1] - 0.025, name, ha="right", va="top", size="small"
+            )
+        elif xy[0] < 0.0 <= xy[1]:
+            ax.text(
+                xy[0] - 0.025,
+                xy[1] + 0.025,
+                name,
+                ha="right",
+                va="bottom",
+                size="small",
+            )
+        elif xy[1] < 0.0 <= xy[0]:
+            ax.text(
+                xy[0] + 0.025, xy[1] - 0.025, name, ha="left", va="top", size="small"
+            )
+        elif xy[0] >= 0.0 and xy[1] >= 0.0:
+            ax.text(
+                xy[0] + 0.025, xy[1] + 0.025, name, ha="left", va="bottom", size="small"
+            )
+
+    ax.axis("equal")
+    return ax
+
+
+def andrews_curves(
+    frame: DataFrame,
+    class_column,
+    ax: Axes | None = None,
+    samples: int = 200,
+    color=None,
+    colormap=None,
+    **kwds,
+) -> Axes:
+    import matplotlib.pyplot as plt
+
+    def function(amplitudes):
+        def f(t):
+            x1 = amplitudes[0]
+            result = x1 / np.sqrt(2.0)
+
+            # Take the rest of the coefficients and resize them
+            # appropriately. Take a copy of amplitudes as otherwise numpy
+            # deletes the element from amplitudes itself.
+            coeffs = np.delete(np.copy(amplitudes), 0)
+            coeffs = np.resize(coeffs, (int((coeffs.size + 1) / 2), 2))
+
+            # Generate the harmonics and arguments for the sin and cos
+            # functions.
+            harmonics = np.arange(0, coeffs.shape[0]) + 1
+            trig_args = np.outer(harmonics, t)
+
+            result += np.sum(
+                coeffs[:, 0, np.newaxis] * np.sin(trig_args)
+                + coeffs[:, 1, np.newaxis] * np.cos(trig_args),
+                axis=0,
+            )
+            return result
+
+        return f
+
+    n = len(frame)
+    class_col = frame[class_column]
+    classes = frame[class_column].drop_duplicates()
+    df = frame.drop(class_column, axis=1)
+    t = np.linspace(-np.pi, np.pi, samples)
+    used_legends: set[str] = set()
+
+    color_values = get_standard_colors(
+        num_colors=len(classes), colormap=colormap, color_type="random", color=color
+    )
+    colors = dict(zip(classes, color_values, strict=False))
+    if ax is None:
+        ax = plt.gca()
+        ax.set_xlim(-np.pi, np.pi)
+    for i in range(n):
+        row = df.iloc[i].values
+        f = function(row)
+        y = f(t)
+        kls = class_col.iat[i]
+        label = pprint_thing(kls)
+        if label not in used_legends:
+            used_legends.add(label)
+            ax.plot(t, y, color=colors[kls], label=label, **kwds)
+        else:
+            ax.plot(t, y, color=colors[kls], **kwds)
+
+    ax.legend(loc="upper right")
+    ax.grid()
+    return ax
+
+
+def bootstrap_plot(
+    series: Series,
+    fig: Figure | None = None,
+    size: int = 50,
+    samples: int = 500,
+    **kwds,
+) -> Figure:
+    import matplotlib.pyplot as plt
+
+    # TODO: is the failure mentioned below still relevant?
+    # random.sample(ndarray, int) fails on python 3.3, sigh
+    data = list(series.values)
+    samplings = [random.sample(data, size) for _ in range(samples)]
+
+    means = np.array([np.mean(sampling) for sampling in samplings])
+    medians = np.array([np.median(sampling) for sampling in samplings])
+    midranges = np.array(
+        [(min(sampling) + max(sampling)) * 0.5 for sampling in samplings]
+    )
+    if fig is None:
+        fig = plt.figure()
+    x = list(range(samples))
+    axes = []
+    ax1 = fig.add_subplot(2, 3, 1)
+    ax1.set_xlabel("Sample")
+    axes.append(ax1)
+    ax1.plot(x, means, **kwds)
+    ax2 = fig.add_subplot(2, 3, 2)
+    ax2.set_xlabel("Sample")
+    axes.append(ax2)
+    ax2.plot(x, medians, **kwds)
+    ax3 = fig.add_subplot(2, 3, 3)
+    ax3.set_xlabel("Sample")
+    axes.append(ax3)
+    ax3.plot(x, midranges, **kwds)
+    ax4 = fig.add_subplot(2, 3, 4)
+    ax4.set_xlabel("Mean")
+    axes.append(ax4)
+    ax4.hist(means, **kwds)
+    ax5 = fig.add_subplot(2, 3, 5)
+    ax5.set_xlabel("Median")
+    axes.append(ax5)
+    ax5.hist(medians, **kwds)
+    ax6 = fig.add_subplot(2, 3, 6)
+    ax6.set_xlabel("Midrange")
+    axes.append(ax6)
+    ax6.hist(midranges, **kwds)
+    for axis in axes:
+        plt.setp(axis.get_xticklabels(), fontsize=8)
+        plt.setp(axis.get_yticklabels(), fontsize=8)
+    if do_adjust_figure(fig):
+        plt.tight_layout()
+    return fig
+
+
+def parallel_coordinates(
+    frame: DataFrame,
+    class_column,
+    cols=None,
+    ax: Axes | None = None,
+    color=None,
+    use_columns: bool = False,
+    xticks=None,
+    colormap=None,
+    axvlines: bool = True,
+    axvlines_kwds=None,
+    sort_labels: bool = False,
+    **kwds,
+) -> Axes:
+    import matplotlib.pyplot as plt
+
+    if axvlines_kwds is None:
+        axvlines_kwds = {"linewidth": 1, "color": "black"}
+
+    n = len(frame)
+    classes = frame[class_column].drop_duplicates()
+    class_col = frame[class_column]
+
+    if cols is None:
+        df = frame.drop(class_column, axis=1)
+    else:
+        df = frame[cols]
+
+    used_legends: set[str] = set()
+
+    ncols = len(df.columns)
+
+    # determine values to use for xticks
+    x: list[int] | Index
+    if use_columns is True:
+        if not np.all(np.isreal(list(df.columns))):
+            raise ValueError("Columns must be numeric to be used as xticks")
+        x = df.columns
+    elif xticks is not None:
+        if not np.all(np.isreal(xticks)):
+            raise ValueError("xticks specified must be numeric")
+        if len(xticks) != ncols:
+            raise ValueError("Length of xticks must match number of columns")
+        x = xticks
+    else:
+        x = list(range(ncols))
+
+    if ax is None:
+        ax = plt.gca()
+
+    color_values = get_standard_colors(
+        num_colors=len(classes), colormap=colormap, color_type="random", color=color
+    )
+
+    if sort_labels:
+        classes = sorted(classes)
+        color_values = sorted(color_values)
+    colors = dict(zip(classes, color_values, strict=True))
+
+    for i in range(n):
+        y = df.iloc[i].values
+        kls = class_col.iat[i]
+        label = pprint_thing(kls)
+        if label not in used_legends:
+            used_legends.add(label)
+            ax.plot(x, y, color=colors[kls], label=label, **kwds)
+        else:
+            ax.plot(x, y, color=colors[kls], **kwds)
+
+    if axvlines:
+        for i in x:
+            ax.axvline(i, **axvlines_kwds)
+
+    ax.set_xticks(x)
+    ax.set_xticklabels(df.columns)
+    ax.set_xlim(x[0], x[-1])
+    ax.legend(loc="upper right")
+    ax.grid()
+    return ax
+
+
+def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes:
+    # workaround because `c='b'` is hardcoded in matplotlib's scatter method
+    import matplotlib.pyplot as plt
+
+    kwds.setdefault("c", plt.rcParams["patch.facecolor"])
+
+    data = series.values
+    y1 = data[:-lag]
+    y2 = data[lag:]
+    if ax is None:
+        ax = plt.gca()
+    ax.set_xlabel("y(t)")
+    ax.set_ylabel(f"y(t + {lag})")
+    ax.scatter(y1, y2, **kwds)
+    return ax
+
+
+def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwds) -> Axes:
+    import matplotlib.pyplot as plt
+
+    n = len(series)
+    data = np.asarray(series)
+    if ax is None:
+        ax = plt.gca()
+        ax.set_xlim(1, n)
+        ax.set_ylim(-1.0, 1.0)
+    mean = np.mean(data)
+    c0 = np.sum((data - mean) ** 2) / n
+
+    def r(h):
+        return ((data[: n - h] - mean) * (data[h:] - mean)).sum() / n / c0
+
+    x = np.arange(n) + 1
+    y = [r(loc) for loc in x]
+    z95 = 1.959963984540054
+    z99 = 2.5758293035489004
+    ax.axhline(y=z99 / np.sqrt(n), linestyle="--", color="grey")
+    ax.axhline(y=z95 / np.sqrt(n), color="grey")
+    ax.axhline(y=0.0, color="black")
+    ax.axhline(y=-z95 / np.sqrt(n), color="grey")
+    ax.axhline(y=-z99 / np.sqrt(n), linestyle="--", color="grey")
+    ax.set_xlabel("Lag")
+    ax.set_ylabel("Autocorrelation")
+    ax.plot(x, y, **kwds)
+    if "label" in kwds:
+        ax.legend()
+    ax.grid()
+    return ax
+
+
+def unpack_single_str_list(keys):
+    # GH 42795
+    if isinstance(keys, list) and len(keys) == 1:
+        keys = keys[0]
+    return keys
diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cf63c86213924927524c7018bf6dad87f2de636
--- /dev/null
+++ b/pandas/plotting/_matplotlib/style.py
@@ -0,0 +1,293 @@
+from __future__ import annotations
+
+from collections.abc import (
+    Collection,
+    Iterator,
+    Sequence,
+)
+import itertools
+from typing import (
+    TYPE_CHECKING,
+    cast,
+    overload,
+)
+import warnings
+
+import matplotlib as mpl
+import matplotlib.colors
+import numpy as np
+
+from pandas._typing import MatplotlibColor as Color
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.common import is_list_like
+
+if TYPE_CHECKING:
+    from matplotlib.colors import Colormap
+
+
+@overload
+def get_standard_colors(
+    num_colors: int,
+    colormap: Colormap | None = ...,
+    color_type: str = ...,
+    *,
+    color: dict[str, Color],
+) -> dict[str, Color]: ...
+
+
+@overload
+def get_standard_colors(
+    num_colors: int,
+    colormap: Colormap | None = ...,
+    color_type: str = ...,
+    *,
+    color: Color | Sequence[Color] | None = ...,
+) -> list[Color]: ...
+
+
+@overload
+def get_standard_colors(
+    num_colors: int,
+    colormap: Colormap | None = ...,
+    color_type: str = ...,
+    *,
+    color: dict[str, Color] | Color | Sequence[Color] | None = ...,
+) -> dict[str, Color] | list[Color]: ...
+
+
+def get_standard_colors(
+    num_colors: int,
+    colormap: Colormap | None = None,
+    color_type: str = "default",
+    *,
+    color: dict[str, Color] | Color | Sequence[Color] | None = None,
+) -> dict[str, Color] | list[Color]:
+    """
+    Get standard colors based on `colormap`, `color_type` or `color` inputs.
+
+    Parameters
+    ----------
+    num_colors : int
+        Minimum number of colors to be returned.
+        Ignored if `color` is a dictionary.
+    colormap : :py:class:`matplotlib.colors.Colormap`, optional
+        Matplotlib colormap.
+        When provided, the resulting colors will be derived from the colormap.
+    color_type : {"default", "random"}, optional
+        Type of colors to derive. Used if provided `color` and `colormap` are None.
+        Ignored if either `color` or `colormap` are not None.
+    color : dict or str or sequence, optional
+        Color(s) to be used for deriving sequence of colors.
+        Can be either be a dictionary, or a single color (single color string,
+        or sequence of floats representing a single color),
+        or a sequence of colors.
+
+    Returns
+    -------
+    dict or list
+        Standard colors. Can either be a mapping if `color` was a dictionary,
+        or a list of colors with a length of `num_colors` or more.
+
+    Warns
+    -----
+    UserWarning
+        If both `colormap` and `color` are provided.
+        Parameter `color` will override.
+    """
+    if isinstance(color, dict):
+        return color
+
+    colors = _derive_colors(
+        color=color,
+        colormap=colormap,
+        color_type=color_type,
+        num_colors=num_colors,
+    )
+
+    return list(_cycle_colors(colors, num_colors=num_colors))
+
+
+def _derive_colors(
+    *,
+    color: Color | Collection[Color] | None,
+    colormap: str | Colormap | None,
+    color_type: str,
+    num_colors: int,
+) -> list[Color]:
+    """
+    Derive colors from either `colormap`, `color_type` or `color` inputs.
+
+    Get a list of colors either from `colormap`, or from `color`,
+    or from `color_type` (if both `colormap` and `color` are None).
+
+    Parameters
+    ----------
+    color : str or sequence, optional
+        Color(s) to be used for deriving sequence of colors.
+        Can be either be a single color (single color string, or sequence of floats
+        representing a single color), or a sequence of colors.
+    colormap : :py:class:`matplotlib.colors.Colormap`, optional
+        Matplotlib colormap.
+        When provided, the resulting colors will be derived from the colormap.
+    color_type : {"default", "random"}, optional
+        Type of colors to derive. Used if provided `color` and `colormap` are None.
+        Ignored if either `color` or `colormap`` are not None.
+    num_colors : int
+        Number of colors to be extracted.
+
+    Returns
+    -------
+    list
+        List of colors extracted.
+
+    Warns
+    -----
+    UserWarning
+        If both `colormap` and `color` are provided.
+        Parameter `color` will override.
+    """
+    if color is None and colormap is not None:
+        return _get_colors_from_colormap(colormap, num_colors=num_colors)
+    elif color is not None:
+        if colormap is not None:
+            warnings.warn(
+                "'color' and 'colormap' cannot be used simultaneously. Using 'color'",
+                stacklevel=find_stack_level(),
+            )
+        return _get_colors_from_color(color)
+    else:
+        return _get_colors_from_color_type(color_type, num_colors=num_colors)
+
+
+def _cycle_colors(colors: list[Color], num_colors: int) -> Iterator[Color]:
+    """Cycle colors until achieving max of `num_colors` or length of `colors`.
+
+    Extra colors will be ignored by matplotlib if there are more colors
+    than needed and nothing needs to be done here.
+    """
+    max_colors = max(num_colors, len(colors))
+    yield from itertools.islice(itertools.cycle(colors), max_colors)
+
+
+def _get_colors_from_colormap(
+    colormap: str | Colormap,
+    num_colors: int,
+) -> list[Color]:
+    """Get colors from colormap."""
+    cmap = _get_cmap_instance(colormap)
+    return [cmap(num) for num in np.linspace(0, 1, num=num_colors)]
+
+
+def _get_cmap_instance(colormap: str | Colormap) -> Colormap:
+    """Get instance of matplotlib colormap."""
+    if isinstance(colormap, str):
+        cmap = colormap
+        colormap = mpl.colormaps[colormap]
+        if colormap is None:
+            raise ValueError(f"Colormap {cmap} is not recognized")
+    return colormap
+
+
+def _get_colors_from_color(
+    color: Color | Collection[Color],
+) -> list[Color]:
+    """Get colors from user input color."""
+    if len(color) == 0:
+        raise ValueError(f"Invalid color argument: {color}")
+
+    if _is_single_color(color):
+        color = cast(Color, color)
+        return [color]
+
+    color = cast(Collection[Color], color)
+    return list(_gen_list_of_colors_from_iterable(color))
+
+
+def _is_single_color(color: Color | Collection[Color]) -> bool:
+    """Check if `color` is a single color, not a sequence of colors.
+
+    Single color is of these kinds:
+        - Named color "red", "C0", "firebrick"
+        - Alias "g"
+        - Sequence of floats, such as (0.1, 0.2, 0.3) or (0.1, 0.2, 0.3, 0.4).
+
+    See Also
+    --------
+    _is_single_string_color
+    """
+    if isinstance(color, str) and _is_single_string_color(color):
+        # GH #36972
+        return True
+
+    if _is_floats_color(color):
+        return True
+
+    return False
+
+
+def _gen_list_of_colors_from_iterable(color: Collection[Color]) -> Iterator[Color]:
+    """
+    Yield colors from string of several letters or from collection of colors.
+    """
+    for x in color:
+        if _is_single_color(x):
+            yield x
+        else:
+            raise ValueError(f"Invalid color {x}")
+
+
+def _is_floats_color(color: Color | Collection[Color]) -> bool:
+    """Check if color comprises a sequence of floats representing color."""
+    return bool(
+        is_list_like(color)
+        and (len(color) == 3 or len(color) == 4)
+        and all(isinstance(x, (int, float)) for x in color)
+    )
+
+
+def _get_colors_from_color_type(color_type: str, num_colors: int) -> list[Color]:
+    """Get colors from user input color type."""
+    if color_type == "default":
+        prop_cycle = mpl.rcParams["axes.prop_cycle"]
+        return [
+            c["color"]
+            for c in itertools.islice(prop_cycle, min(num_colors, len(prop_cycle)))
+        ]
+    elif color_type == "random":
+        return np.random.default_rng(num_colors).random((num_colors, 3)).tolist()
+    else:
+        raise ValueError("color_type must be either 'default' or 'random'")
+
+
+def _is_single_string_color(color: Color) -> bool:
+    """Check if `color` is a single string color.
+
+    Examples of single string colors:
+        - 'r'
+        - 'g'
+        - 'red'
+        - 'green'
+        - 'C3'
+        - 'firebrick'
+
+    Parameters
+    ----------
+    color : Color
+        Color string or sequence of floats.
+
+    Returns
+    -------
+    bool
+        True if `color` looks like a valid color.
+        False otherwise.
+    """
+    conv = matplotlib.colors.ColorConverter()
+    try:
+        # error: Argument 1 to "to_rgba" of "ColorConverter" has incompatible type
+        # "str | Sequence[float]"; expected "tuple[float, float, float] | ..."
+        conv.to_rgba(color)  # type: ignore[arg-type]
+    except ValueError:
+        return False
+    else:
+        return True
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
new file mode 100644
index 0000000000000000000000000000000000000000..5023867445adb844f0d5c3e28183bbf2474e027a
--- /dev/null
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -0,0 +1,364 @@
+# TODO: Use the fact that axis can have units to simplify the process
+
+from __future__ import annotations
+
+import functools
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+import warnings
+
+from pandas._libs.tslibs import (
+    BaseOffset,
+    Period,
+    to_offset,
+)
+from pandas._libs.tslibs.dtypes import (
+    OFFSET_TO_PERIOD_FREQSTR,
+    FreqGroup,
+)
+
+from pandas.core.dtypes.generic import (
+    ABCDatetimeIndex,
+    ABCPeriodIndex,
+    ABCTimedeltaIndex,
+)
+
+from pandas.io.formats.printing import pprint_thing
+from pandas.plotting._matplotlib.converter import (
+    TimeSeries_DateFormatter,
+    TimeSeries_DateLocator,
+    TimeSeries_TimedeltaFormatter,
+)
+from pandas.tseries.frequencies import (
+    get_period_alias,
+    is_subperiod,
+    is_superperiod,
+)
+
+if TYPE_CHECKING:
+    from datetime import timedelta
+
+    from matplotlib.axes import Axes
+
+    from pandas._typing import NDFrameT
+
+    from pandas import (
+        DatetimeIndex,
+        Index,
+        PeriodIndex,
+        Series,
+    )
+
+# ---------------------------------------------------------------------
+# Plotting functions and monkey patches
+
+
+def maybe_resample(series: Series, ax: Axes, kwargs: dict[str, Any]):
+    # resample against axes freq if necessary
+
+    if "how" in kwargs:
+        raise ValueError(
+            "'how' is not a valid keyword for plotting functions. If plotting "
+            "multiple objects on shared axes, resample manually first."
+        )
+
+    freq, ax_freq = _get_freq(ax, series)
+
+    if freq is None:  # pragma: no cover
+        raise ValueError("Cannot use dynamic axis without frequency info")
+
+    # Convert DatetimeIndex to PeriodIndex
+    if isinstance(series.index, ABCDatetimeIndex):
+        series = series.to_period(freq=freq)
+
+    if ax_freq is not None and freq != ax_freq:
+        if is_superperiod(freq, ax_freq):  # upsample input
+            series = series.copy(deep=False)
+            # error: "Index" has no attribute "asfreq"
+            series.index = series.index.asfreq(  # type: ignore[attr-defined]
+                ax_freq, how="s"
+            )
+            freq = ax_freq
+        elif _is_sup(freq, ax_freq):  # one is weekly
+            how = "last"
+            series = getattr(series.resample("D"), how)().dropna()
+            series = getattr(series.resample(ax_freq), how)().dropna()
+            freq = ax_freq
+        elif is_subperiod(freq, ax_freq) or _is_sub(freq, ax_freq):
+            _upsample_others(ax, freq, kwargs)
+        else:  # pragma: no cover
+            raise ValueError("Incompatible frequency conversion")
+    return freq, series
+
+
+def _is_sub(f1: str, f2: str) -> bool:
+    return (f1.startswith("W") and is_subperiod("D", f2)) or (
+        f2.startswith("W") and is_subperiod(f1, "D")
+    )
+
+
+def _is_sup(f1: str, f2: str) -> bool:
+    return (f1.startswith("W") and is_superperiod("D", f2)) or (
+        f2.startswith("W") and is_superperiod(f1, "D")
+    )
+
+
+def _upsample_others(ax: Axes, freq: BaseOffset, kwargs: dict[str, Any]) -> None:
+    legend = ax.get_legend()
+    lines, labels = _replot_ax(ax, freq)
+    _replot_ax(ax, freq)
+
+    other_ax = None
+    if hasattr(ax, "left_ax"):
+        other_ax = ax.left_ax
+    if hasattr(ax, "right_ax"):
+        other_ax = ax.right_ax
+
+    if other_ax is not None:
+        rlines, rlabels = _replot_ax(other_ax, freq)
+        lines.extend(rlines)
+        labels.extend(rlabels)
+
+    if legend is not None and kwargs.get("legend", True) and len(lines) > 0:
+        title: str | None = legend.get_title().get_text()
+        if title == "None":
+            title = None
+        ax.legend(lines, labels, loc="best", title=title)
+
+
+def _replot_ax(ax: Axes, freq: BaseOffset):
+    data = getattr(ax, "_plot_data", None)
+
+    # clear current axes and data
+    # TODO #54485
+    ax._plot_data = []  # type: ignore[attr-defined]
+    ax.clear()
+
+    decorate_axes(ax, freq)
+
+    lines = []
+    labels = []
+    if data is not None:
+        for series, plotf, kwds in data:
+            series = series.copy(deep=False)
+            idx = series.index.asfreq(freq, how="S")
+            series.index = idx
+            # TODO #54485
+            ax._plot_data.append((series, plotf, kwds))  # type: ignore[attr-defined]
+
+            # for tsplot
+            if isinstance(plotf, str):
+                from pandas.plotting._matplotlib import PLOT_CLASSES
+
+                plotf = PLOT_CLASSES[plotf]._plot
+
+            lines.append(plotf(ax, series.index._mpl_repr(), series.values, **kwds)[0])
+            labels.append(pprint_thing(series.name))
+
+    return lines, labels
+
+
+def decorate_axes(ax: Axes, freq: BaseOffset) -> None:
+    """Initialize axes for time-series plotting"""
+    if not hasattr(ax, "_plot_data"):
+        # TODO #54485
+        ax._plot_data = []  # type: ignore[attr-defined]
+
+    # TODO #54485
+    ax.freq = freq  # type: ignore[attr-defined]
+    xaxis = ax.get_xaxis()
+    # TODO #54485
+    xaxis.freq = freq  # type: ignore[attr-defined]
+
+
+def _get_ax_freq(ax: Axes):
+    """
+    Get the freq attribute of the ax object if set.
+    Also checks shared axes (eg when using secondary yaxis, sharex=True
+    or twinx)
+    """
+    ax_freq = getattr(ax, "freq", None)
+    if ax_freq is None:
+        # check for left/right ax in case of secondary yaxis
+        if hasattr(ax, "left_ax"):
+            ax_freq = getattr(ax.left_ax, "freq", None)
+        elif hasattr(ax, "right_ax"):
+            ax_freq = getattr(ax.right_ax, "freq", None)
+    if ax_freq is None:
+        # check if a shared ax (sharex/twinx) has already freq set
+        shared_axes = ax.get_shared_x_axes().get_siblings(ax)
+        if len(shared_axes) > 1:
+            for shared_ax in shared_axes:
+                ax_freq = getattr(shared_ax, "freq", None)
+                if ax_freq is not None:
+                    break
+    return ax_freq
+
+
+def _get_period_alias(freq: timedelta | BaseOffset | str) -> str | None:
+    if isinstance(freq, BaseOffset):
+        freqstr = freq.name
+    else:
+        freqstr = to_offset(freq, is_period=True).rule_code
+
+    return get_period_alias(freqstr)
+
+
+def _get_freq(ax: Axes, series: Series):
+    # get frequency from data
+    freq = getattr(series.index, "freq", None)
+    if freq is None:
+        freq = getattr(series.index, "inferred_freq", None)
+        freq = to_offset(freq, is_period=True)
+
+    ax_freq = _get_ax_freq(ax)
+
+    # use axes freq if no data freq
+    if freq is None:
+        freq = ax_freq
+
+    # get the period frequency
+    freq = _get_period_alias(freq)
+    return freq, ax_freq
+
+
+def use_dynamic_x(ax: Axes, index: Index) -> bool:
+    freq = _get_index_freq(index)
+    ax_freq = _get_ax_freq(ax)
+
+    if freq is None:  # convert irregular if axes has freq info
+        freq = ax_freq
+    # do not use tsplot if irregular was plotted first
+    elif (ax_freq is None) and (len(ax.get_lines()) > 0):
+        return False
+
+    if freq is None:
+        return False
+
+    freq_str = _get_period_alias(freq)
+
+    if freq_str is None:
+        return False
+
+    # FIXME: hack this for 0.10.1, creating more technical debt...sigh
+    if isinstance(index, ABCDatetimeIndex):
+        # error: "BaseOffset" has no attribute "_period_dtype_code"
+        freq_str = OFFSET_TO_PERIOD_FREQSTR.get(freq_str, freq_str)
+        base = to_offset(freq_str, is_period=True)._period_dtype_code  # type: ignore[attr-defined]
+        if base <= FreqGroup.FR_DAY.value:
+            return index[:1].is_normalized
+        period = Period(index[0], freq_str)
+        assert isinstance(period, Period)
+        return period.to_timestamp().tz_localize(index.tz) == index[0]
+    return True
+
+
+def _get_index_freq(index: Index) -> BaseOffset | None:
+    freq = getattr(index, "freq", None)
+    if freq is None:
+        freq = getattr(index, "inferred_freq", None)
+        freq = to_offset(freq)
+    return freq
+
+
+def maybe_convert_index(ax: Axes, data: NDFrameT) -> NDFrameT:
+    # tsplot converts automatically, but don't want to convert index
+    # over and over for DataFrames
+    if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)):
+        freq = _get_index_freq(data.index)
+
+        if freq is None:
+            freq = _get_ax_freq(ax)
+
+        if freq is None:
+            raise ValueError("Could not get frequency alias for plotting")
+
+        freq_str = _get_period_alias(freq)
+
+        with warnings.catch_warnings():
+            # suppress Period[B] deprecation warning
+            # TODO: need to find an alternative to this before the deprecation
+            #  is enforced!
+            warnings.filterwarnings(
+                "ignore",
+                r"PeriodDtype\[B\] is deprecated",
+                category=FutureWarning,
+            )
+
+            if isinstance(data.index, ABCDatetimeIndex):
+                data = data.tz_localize(None).to_period(freq=freq_str)
+            elif isinstance(data.index, ABCPeriodIndex):
+                data.index = data.index.asfreq(freq=freq_str, how="start")
+    return data
+
+
+# Patch methods for subplot.
+
+
+def _format_coord(freq: BaseOffset, t, y) -> str:
+    time_period = Period(ordinal=int(t), freq=freq)
+    return f"t = {time_period}  y = {y:8f}"
+
+
+def format_dateaxis(
+    subplot, freq: BaseOffset, index: DatetimeIndex | PeriodIndex
+) -> None:
+    """
+    Pretty-formats the date axis (x-axis).
+
+    Major and minor ticks are automatically set for the frequency of the
+    current underlying series.  As the dynamic mode is activated by
+    default, changing the limits of the x axis will intelligently change
+    the positions of the ticks.
+    """
+    import matplotlib.pyplot as plt
+
+    # handle index specific formatting
+    # Note: DatetimeIndex does not use this
+    # interface. DatetimeIndex uses matplotlib.date directly
+    if isinstance(index, ABCPeriodIndex):
+        majlocator = TimeSeries_DateLocator(
+            freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot
+        )
+        minlocator = TimeSeries_DateLocator(
+            freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot
+        )
+        subplot.xaxis.set_major_locator(majlocator)
+        subplot.xaxis.set_minor_locator(minlocator)
+
+        majformatter = TimeSeries_DateFormatter(
+            freq, dynamic_mode=True, minor_locator=False, plot_obj=subplot
+        )
+        minformatter = TimeSeries_DateFormatter(
+            freq, dynamic_mode=True, minor_locator=True, plot_obj=subplot
+        )
+        subplot.xaxis.set_major_formatter(majformatter)
+        subplot.xaxis.set_minor_formatter(minformatter)
+
+        # x and y coord info
+        subplot.format_coord = functools.partial(_format_coord, freq)
+
+    elif isinstance(index, ABCTimedeltaIndex):
+        subplot.xaxis.set_major_formatter(TimeSeries_TimedeltaFormatter(index.unit))
+    else:
+        raise TypeError("index type not supported")
+
+    plt.draw_if_interactive()
+
+
+def prepare_ts_data(
+    series: Series, ax: Axes, kwargs: dict[str, Any]
+) -> tuple[BaseOffset | str, Series]:
+    freq, data = maybe_resample(series, ax, kwargs)
+
+    # Set ax with freq info
+    decorate_axes(ax, freq)
+    # digging deeper
+    if hasattr(ax, "left_ax"):
+        decorate_axes(ax.left_ax, freq)
+    if hasattr(ax, "right_ax"):
+        decorate_axes(ax.right_ax, freq)
+
+    return freq, data
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ee75e7fe553ee7ddae5418da9d32dd768857a78
--- /dev/null
+++ b/pandas/plotting/_matplotlib/tools.py
@@ -0,0 +1,491 @@
+# being a bit too dynamic
+from __future__ import annotations
+
+from math import ceil
+from typing import TYPE_CHECKING
+import warnings
+
+import matplotlib as mpl
+import numpy as np
+
+from pandas.util._exceptions import find_stack_level
+
+from pandas.core.dtypes.common import is_list_like
+from pandas.core.dtypes.generic import (
+    ABCDataFrame,
+    ABCIndex,
+    ABCSeries,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Generator,
+        Iterable,
+    )
+
+    from matplotlib.axes import Axes
+    from matplotlib.axis import Axis
+    from matplotlib.figure import Figure
+    from matplotlib.lines import Line2D
+    from matplotlib.table import Table
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+def do_adjust_figure(fig: Figure) -> bool:
+    """Whether fig has constrained_layout enabled."""
+    if not hasattr(fig, "get_constrained_layout"):
+        return False
+    return not fig.get_constrained_layout()
+
+
+def maybe_adjust_figure(fig: Figure, *args, **kwargs) -> None:
+    """Call fig.subplots_adjust unless fig has constrained_layout enabled."""
+    if do_adjust_figure(fig):
+        fig.subplots_adjust(*args, **kwargs)
+
+
+def format_date_labels(ax: Axes, rot) -> None:
+    # mini version of autofmt_xdate
+    for label in ax.get_xticklabels():
+        label.set_horizontalalignment("right")
+        label.set_rotation(rot)
+    fig = ax.get_figure()
+    if fig is not None:
+        # should always be a Figure but can technically be None
+        maybe_adjust_figure(fig, bottom=0.2)  # type: ignore[arg-type]
+
+
+def table(
+    ax, data: DataFrame | Series, rowLabels=None, colLabels=None, **kwargs
+) -> Table:
+    if isinstance(data, ABCSeries):
+        data = data.to_frame()
+    elif isinstance(data, ABCDataFrame):
+        pass
+    else:
+        raise ValueError("Input data must be DataFrame or Series")
+
+    if rowLabels is None:
+        rowLabels = data.index
+
+    if colLabels is None:
+        colLabels = data.columns
+
+    cellText = data.values
+
+    # error: Argument "cellText" to "table" has incompatible type "ndarray[Any,
+    # Any]"; expected "Sequence[Sequence[str]] | None"
+    return mpl.table.table(
+        ax,
+        cellText=cellText,  # type: ignore[arg-type]
+        rowLabels=rowLabels,
+        colLabels=colLabels,
+        **kwargs,
+    )
+
+
+def _get_layout(
+    nplots: int,
+    layout: tuple[int, int] | None = None,
+    layout_type: str = "box",
+) -> tuple[int, int]:
+    if layout is not None:
+        if not isinstance(layout, (tuple, list)) or len(layout) != 2:
+            raise ValueError("Layout must be a tuple of (rows, columns)")
+
+        nrows, ncols = layout
+
+        if nrows == -1 and ncols > 0:
+            layout = (ceil(nplots / ncols), ncols)
+        elif ncols == -1 and nrows > 0:
+            layout = (nrows, ceil(nplots / nrows))
+        elif ncols <= 0 and nrows <= 0:
+            msg = "At least one dimension of layout must be positive"
+            raise ValueError(msg)
+
+        nrows, ncols = layout
+        if nrows * ncols < nplots:
+            raise ValueError(
+                f"Layout of {nrows}x{ncols} must be larger than required size {nplots}"
+            )
+
+        return layout
+
+    if layout_type == "single":
+        return (1, 1)
+    elif layout_type == "horizontal":
+        return (1, nplots)
+    elif layout_type == "vertical":
+        return (nplots, 1)
+
+    layouts = {1: (1, 1), 2: (1, 2), 3: (2, 2), 4: (2, 2)}
+    try:
+        return layouts[nplots]
+    except KeyError:
+        k = 1
+        while k**2 < nplots:
+            k += 1
+
+        if (k - 1) * k >= nplots:
+            return k, (k - 1)
+        else:
+            return k, k
+
+
+# copied from matplotlib/pyplot.py and modified for pandas.plotting
+
+
+def create_subplots(
+    naxes: int,
+    sharex: bool = False,
+    sharey: bool = False,
+    squeeze: bool = True,
+    subplot_kw=None,
+    ax=None,
+    layout=None,
+    layout_type: str = "box",
+    **fig_kw,
+):
+    """
+    Create a figure with a set of subplots already made.
+
+    This utility wrapper makes it convenient to create common layouts of
+    subplots, including the enclosing figure object, in a single call.
+
+    Parameters
+    ----------
+    naxes : int
+      Number of required axes. Exceeded axes are set invisible. Default is
+      nrows * ncols.
+
+    sharex : bool
+      If True, the X axis will be shared amongst all subplots.
+
+    sharey : bool
+      If True, the Y axis will be shared amongst all subplots.
+
+    squeeze : bool
+
+      If True, extra dimensions are squeezed out from the returned axis object:
+        - if only one subplot is constructed (nrows=ncols=1), the resulting
+        single Axis object is returned as a scalar.
+        - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object
+        array of Axis objects are returned as numpy 1-d arrays.
+        - for NxM subplots with N>1 and M>1 are returned as a 2d array.
+
+      If False, no squeezing is done: the returned axis object is always
+      a 2-d array containing Axis instances, even if it ends up being 1x1.
+
+    subplot_kw : dict
+      Dict with keywords passed to the add_subplot() call used to create each
+      subplots.
+
+    ax : Matplotlib axis object, optional
+
+    layout : tuple
+      Number of rows and columns of the subplot grid.
+      If not specified, calculated from naxes and layout_type
+
+    layout_type : {'box', 'horizontal', 'vertical'}, default 'box'
+      Specify how to layout the subplot grid.
+
+    fig_kw : Other keyword arguments to be passed to the figure() call.
+        Note that all keywords not recognized above will be
+        automatically included here.
+
+    Returns
+    -------
+    fig, ax : tuple
+      - fig is the Matplotlib Figure object
+      - ax can be either a single axis object or an array of axis objects if
+      more than one subplot was created.  The dimensions of the resulting array
+      can be controlled with the squeeze keyword, see above.
+
+    Examples
+    --------
+    x = np.linspace(0, 2*np.pi, 400)
+    y = np.sin(x**2)
+
+    # Just a figure and one subplot
+    f, ax = plt.subplots()
+    ax.plot(x, y)
+    ax.set_title('Simple plot')
+
+    # Two subplots, unpack the output array immediately
+    f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
+    ax1.plot(x, y)
+    ax1.set_title('Sharing Y axis')
+    ax2.scatter(x, y)
+
+    # Four polar axes
+    plt.subplots(2, 2, subplot_kw=dict(polar=True))
+    """
+    import matplotlib.pyplot as plt
+
+    if subplot_kw is None:
+        subplot_kw = {}
+
+    if ax is None:
+        fig = plt.figure(**fig_kw)
+    else:
+        if is_list_like(ax):
+            if squeeze:
+                ax = np.fromiter(flatten_axes(ax), dtype=object)
+            if layout is not None:
+                warnings.warn(
+                    "When passing multiple axes, layout keyword is ignored.",
+                    UserWarning,
+                    stacklevel=find_stack_level(),
+                )
+            if sharex or sharey:
+                warnings.warn(
+                    "When passing multiple axes, sharex and sharey "
+                    "are ignored. These settings must be specified when creating axes.",
+                    UserWarning,
+                    stacklevel=find_stack_level(),
+                )
+            if ax.size == naxes:
+                fig = ax.flat[0].get_figure()
+                return fig, ax
+            else:
+                raise ValueError(
+                    f"The number of passed axes must be {naxes}, the "
+                    "same as the output plot"
+                )
+
+        fig = ax.get_figure()
+        # if ax is passed and a number of subplots is 1, return ax as it is
+        if naxes == 1:
+            if squeeze:
+                return fig, ax
+            else:
+                return fig, np.fromiter(flatten_axes(ax), dtype=object)
+        else:
+            warnings.warn(
+                "To output multiple subplots, the figure containing "
+                "the passed axes is being cleared.",
+                UserWarning,
+                stacklevel=find_stack_level(),
+            )
+            fig.clear()
+
+    nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type)
+    nplots = nrows * ncols
+
+    # Create empty object array to hold all axes.  It's easiest to make it 1-d
+    # so we can just append subplots upon creation, and then
+    axarr = np.empty(nplots, dtype=object)
+
+    # Create first subplot separately, so we can share it if requested
+    ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)
+
+    if sharex:
+        subplot_kw["sharex"] = ax0
+    if sharey:
+        subplot_kw["sharey"] = ax0
+    axarr[0] = ax0
+
+    # Note off-by-one counting because add_subplot uses the MATLAB 1-based
+    # convention.
+    for i in range(1, nplots):
+        kwds = subplot_kw.copy()
+        # Set sharex and sharey to None for blank/dummy axes, these can
+        # interfere with proper axis limits on the visible axes if
+        # they share axes e.g. issue #7528
+        if i >= naxes:
+            kwds["sharex"] = None
+            kwds["sharey"] = None
+        ax = fig.add_subplot(nrows, ncols, i + 1, **kwds)
+        axarr[i] = ax
+
+    if naxes != nplots:
+        for ax in axarr[naxes:]:
+            ax.set_visible(False)
+
+    handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)
+
+    if squeeze:
+        # Reshape the array to have the final desired dimension (nrow,ncol),
+        # though discarding unneeded dimensions that equal 1.  If we only have
+        # one subplot, just return it instead of a 1-element array.
+        if nplots == 1:
+            axes = axarr[0]
+        else:
+            axes = axarr.reshape(nrows, ncols).squeeze()
+    else:
+        # returned axis array will be always 2-d, even if nrows=ncols=1
+        axes = axarr.reshape(nrows, ncols)
+
+    return fig, axes
+
+
+def _remove_labels_from_axis(axis: Axis) -> None:
+    for t in axis.get_majorticklabels():
+        t.set_visible(False)
+
+    # set_visible will not be effective if
+    # minor axis has NullLocator and NullFormatter (default)
+    if isinstance(axis.get_minor_locator(), mpl.ticker.NullLocator):
+        axis.set_minor_locator(mpl.ticker.AutoLocator())
+    if isinstance(axis.get_minor_formatter(), mpl.ticker.NullFormatter):
+        axis.set_minor_formatter(mpl.ticker.FormatStrFormatter(""))
+    for t in axis.get_minorticklabels():
+        t.set_visible(False)
+
+    axis.get_label().set_visible(False)
+
+
+def _has_externally_shared_axis(ax1: Axes, compare_axis: str) -> bool:
+    """
+    Return whether an axis is externally shared.
+
+    Parameters
+    ----------
+    ax1 : matplotlib.axes.Axes
+        Axis to query.
+    compare_axis : str
+        `"x"` or `"y"` according to whether the X-axis or Y-axis is being
+        compared.
+
+    Returns
+    -------
+    bool
+        `True` if the axis is externally shared. Otherwise `False`.
+
+    Notes
+    -----
+    If two axes with different positions are sharing an axis, they can be
+    referred to as *externally* sharing the common axis.
+
+    If two axes sharing an axis also have the same position, they can be
+    referred to as *internally* sharing the common axis (a.k.a twinning).
+
+    _handle_shared_axes() is only interested in axes externally sharing an
+    axis, regardless of whether either of the axes is also internally sharing
+    with a third axis.
+    """
+    if compare_axis == "x":
+        axes = ax1.get_shared_x_axes()
+    elif compare_axis == "y":
+        axes = ax1.get_shared_y_axes()
+    else:
+        raise ValueError(
+            "_has_externally_shared_axis() needs 'x' or 'y' as a second parameter"
+        )
+
+    axes_siblings = axes.get_siblings(ax1)
+
+    # Retain ax1 and any of its siblings which aren't in the same position as it
+    ax1_points = ax1.get_position().get_points()
+
+    for ax2 in axes_siblings:
+        if not np.array_equal(ax1_points, ax2.get_position().get_points()):
+            return True
+
+    return False
+
+
+def handle_shared_axes(
+    axarr: Iterable[Axes],
+    nplots: int,
+    naxes: int,
+    nrows: int,
+    ncols: int,
+    sharex: bool,
+    sharey: bool,
+) -> None:
+    if nplots > 1:
+        row_num = lambda x: x.get_subplotspec().rowspan.start
+        col_num = lambda x: x.get_subplotspec().colspan.start
+
+        is_first_col = lambda x: x.get_subplotspec().is_first_col()
+
+        if nrows > 1:
+            try:
+                # first find out the ax layout,
+                # so that we can correctly handle 'gaps"
+                layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool_)
+                for ax in axarr:
+                    layout[row_num(ax), col_num(ax)] = ax.get_visible()
+
+                for ax in axarr:
+                    # only the last row of subplots should get x labels -> all
+                    # other off layout handles the case that the subplot is
+                    # the last in the column, because below is no subplot/gap.
+                    if not layout[row_num(ax) + 1, col_num(ax)]:
+                        continue
+                    if sharex or _has_externally_shared_axis(ax, "x"):
+                        _remove_labels_from_axis(ax.xaxis)
+
+            except IndexError:
+                # if gridspec is used, ax.rowNum and ax.colNum may different
+                # from layout shape. in this case, use last_row logic
+                is_last_row = lambda x: x.get_subplotspec().is_last_row()
+                for ax in axarr:
+                    if is_last_row(ax):
+                        continue
+                    if sharex or _has_externally_shared_axis(ax, "x"):
+                        _remove_labels_from_axis(ax.xaxis)
+
+        if ncols > 1:
+            for ax in axarr:
+                # only the first column should get y labels -> set all other to
+                # off as we only have labels in the first column and we always
+                # have a subplot there, we can skip the layout test
+                if is_first_col(ax):
+                    continue
+                if sharey or _has_externally_shared_axis(ax, "y"):
+                    _remove_labels_from_axis(ax.yaxis)
+
+
+def flatten_axes(axes: Axes | Iterable[Axes]) -> Generator[Axes]:
+    if not is_list_like(axes):
+        yield axes  # type: ignore[misc]
+    elif isinstance(axes, (np.ndarray, ABCIndex)):
+        yield from np.asarray(axes).reshape(-1)
+    else:
+        yield from axes  # type: ignore[misc]
+
+
+def set_ticks_props(
+    axes: Axes | Iterable[Axes],
+    xlabelsize: int | None = None,
+    xrot=None,
+    ylabelsize: int | None = None,
+    yrot=None,
+):
+    for ax in flatten_axes(axes):
+        if xlabelsize is not None:
+            mpl.artist.setp(ax.get_xticklabels(), fontsize=xlabelsize)  # type: ignore[arg-type]
+        if xrot is not None:
+            mpl.artist.setp(ax.get_xticklabels(), rotation=xrot)  # type: ignore[arg-type]
+        if ylabelsize is not None:
+            mpl.artist.setp(ax.get_yticklabels(), fontsize=ylabelsize)  # type: ignore[arg-type]
+        if yrot is not None:
+            mpl.artist.setp(ax.get_yticklabels(), rotation=yrot)  # type: ignore[arg-type]
+    return axes
+
+
+def get_all_lines(ax: Axes) -> list[Line2D]:
+    lines = ax.get_lines()
+
+    if hasattr(ax, "right_ax"):
+        lines += ax.right_ax.get_lines()
+
+    if hasattr(ax, "left_ax"):
+        lines += ax.left_ax.get_lines()
+
+    return lines
+
+
+def get_xlim(lines: Iterable[Line2D]) -> tuple[float, float]:
+    left, right = np.inf, -np.inf
+    for line in lines:
+        x = line.get_xdata(orig=False)
+        left = min(np.nanmin(x), left)
+        right = max(np.nanmax(x), right)
+    return left, right
diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c445c570ae33ddf1022eb953319bf8badd84ad9
--- /dev/null
+++ b/pandas/plotting/_misc.py
@@ -0,0 +1,780 @@
+from __future__ import annotations
+
+from contextlib import contextmanager
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+
+from pandas.util._decorators import set_module
+
+from pandas.plotting._core import _get_plot_backend
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Generator,
+        Mapping,
+    )
+
+    from matplotlib.axes import Axes
+    from matplotlib.colors import Colormap
+    from matplotlib.figure import Figure
+    from matplotlib.table import Table
+    import numpy as np
+
+    from pandas import (
+        DataFrame,
+        Series,
+    )
+
+
+@set_module("pandas.plotting")
+def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table:
+    """
+    Helper function to convert DataFrame and Series to matplotlib.table.
+
+    This method provides an easy way to visualize tabular data within a Matplotlib
+    figure. It automatically extracts index and column labels from the DataFrame
+    or Series, unless explicitly specified. This function is particularly useful
+    when displaying summary tables alongside other plots or when creating static
+    reports. It utilizes the `matplotlib.pyplot.table` backend and allows
+    customization through various styling options available in Matplotlib.
+
+    Parameters
+    ----------
+    ax : Matplotlib axes object
+        The axes on which to draw the table.
+    data : DataFrame or Series
+        Data for table contents.
+    **kwargs
+        Keyword arguments to be passed to matplotlib.table.table.
+        If `rowLabels` or `colLabels` is not specified, data index or column
+        names will be used.
+
+    Returns
+    -------
+    matplotlib table object
+        The created table as a matplotlib Table object.
+
+    See Also
+    --------
+    DataFrame.plot : Make plots of DataFrame using matplotlib.
+    matplotlib.pyplot.table : Create a table from data in a Matplotlib plot.
+
+    Examples
+    --------
+
+    .. plot::
+            :context: close-figs
+
+            >>> import matplotlib.pyplot as plt
+            >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+            >>> fig, ax = plt.subplots()
+            >>> ax.axis("off")
+            (np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(1.0))
+            >>> table = pd.plotting.table(
+            ...     ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2]
+            ... )
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.table(
+        ax=ax, data=data, rowLabels=None, colLabels=None, **kwargs
+    )
+
+
+@set_module("pandas.plotting")
+def register() -> None:
+    """
+    Register pandas formatters and converters with matplotlib.
+
+    This function modifies the global ``matplotlib.units.registry``
+    dictionary. pandas adds custom converters for
+
+    * pd.Timestamp
+    * pd.Period
+    * np.datetime64
+    * datetime.datetime
+    * datetime.date
+    * datetime.time
+
+    See Also
+    --------
+    deregister_matplotlib_converters : Remove pandas formatters and converters.
+
+    Examples
+    --------
+    .. plot::
+       :context: close-figs
+
+        The following line is done automatically by pandas so
+        the plot can be rendered:
+
+        >>> pd.plotting.register_matplotlib_converters()
+
+        >>> df = pd.DataFrame(
+        ...     {"ts": pd.period_range("2020", periods=2, freq="M"), "y": [1, 2]}
+        ... )
+        >>> plot = df.plot.line(x="ts", y="y")
+
+    Unsetting the register manually an error will be raised:
+
+    >>> pd.set_option(
+    ...     "plotting.matplotlib.register_converters", False
+    ... )  # doctest: +SKIP
+    >>> df.plot.line(x="ts", y="y")  # doctest: +SKIP
+    Traceback (most recent call last):
+    TypeError: float() argument must be a string or a real number, not 'Period'
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    plot_backend.register()
+
+
+@set_module("pandas.plotting")
+def deregister() -> None:
+    """
+    Remove pandas formatters and converters.
+
+    Removes the custom converters added by :func:`register`. This
+    attempts to set the state of the registry back to the state before
+    pandas registered its own units. Converters for pandas' own types like
+    Timestamp and Period are removed completely. Converters for types
+    pandas overwrites, like ``datetime.datetime``, are restored to their
+    original value.
+
+    See Also
+    --------
+    register_matplotlib_converters : Register pandas formatters and converters
+        with matplotlib.
+
+    Examples
+    --------
+    .. plot::
+       :context: close-figs
+
+        The following line is done automatically by pandas so
+        the plot can be rendered:
+
+        >>> pd.plotting.register_matplotlib_converters()
+
+        >>> df = pd.DataFrame(
+        ...     {"ts": pd.period_range("2020", periods=2, freq="M"), "y": [1, 2]}
+        ... )
+        >>> plot = df.plot.line(x="ts", y="y")
+
+    Unsetting the register manually an error will be raised:
+
+    >>> pd.set_option(
+    ...     "plotting.matplotlib.register_converters", False
+    ... )  # doctest: +SKIP
+    >>> df.plot.line(x="ts", y="y")  # doctest: +SKIP
+    Traceback (most recent call last):
+    TypeError: float() argument must be a string or a real number, not 'Period'
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    plot_backend.deregister()
+
+
+@set_module("pandas.plotting")
+def scatter_matrix(
+    frame: DataFrame,
+    alpha: float = 0.5,
+    figsize: tuple[float, float] | None = None,
+    ax: Axes | None = None,
+    grid: bool = False,
+    diagonal: str = "hist",
+    marker: str = ".",
+    density_kwds: Mapping[str, Any] | None = None,
+    hist_kwds: Mapping[str, Any] | None = None,
+    range_padding: float = 0.05,
+    **kwargs,
+) -> np.ndarray:
+    """
+    Draw a matrix of scatter plots.
+
+    Each pair of numeric columns in the DataFrame is plotted against each other,
+    resulting in a matrix of scatter plots. The diagonal plots can display either
+    histograms or Kernel Density Estimation (KDE) plots for each variable.
+
+    Parameters
+    ----------
+    frame : DataFrame
+        The data to be plotted.
+    alpha : float, optional
+        Amount of transparency applied.
+    figsize : (float,float), optional
+        A tuple (width, height) in inches.
+    ax : Matplotlib axis object, optional
+        An existing Matplotlib axis object for the plots. If None, a new axis is
+        created.
+    grid : bool, optional
+        Setting this to True will show the grid.
+    diagonal : {'hist', 'kde'}
+        Pick between 'kde' and 'hist' for either Kernel Density Estimation or
+        Histogram plot in the diagonal.
+    marker : str, optional
+        Matplotlib marker type, default '.'.
+    density_kwds : keywords
+        Keyword arguments to be passed to kernel density estimate plot.
+    hist_kwds : keywords
+        Keyword arguments to be passed to hist function.
+    range_padding : float, default 0.05
+        Relative extension of axis range in x and y with respect to
+        (x_max - x_min) or (y_max - y_min).
+    **kwargs
+        Keyword arguments to be passed to scatter function.
+
+    Returns
+    -------
+    numpy.ndarray
+        A matrix of scatter plots.
+
+    See Also
+    --------
+    plotting.parallel_coordinates : Plots parallel coordinates for multivariate data.
+    plotting.andrews_curves : Generates Andrews curves for visualizing clusters of
+        multivariate data.
+    plotting.radviz : Creates a RadViz visualization.
+    plotting.bootstrap_plot : Visualizes uncertainty in data via bootstrap sampling.
+
+    Examples
+    --------
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(np.random.randn(1000, 4), columns=["A", "B", "C", "D"])
+        >>> pd.plotting.scatter_matrix(df, alpha=0.2)
+        array([[<Axes: xlabel='A', ylabel='A'>, <Axes: xlabel='B', ylabel='A'>,
+                <Axes: xlabel='C', ylabel='A'>, <Axes: xlabel='D', ylabel='A'>],
+               [<Axes: xlabel='A', ylabel='B'>, <Axes: xlabel='B', ylabel='B'>,
+                <Axes: xlabel='C', ylabel='B'>, <Axes: xlabel='D', ylabel='B'>],
+               [<Axes: xlabel='A', ylabel='C'>, <Axes: xlabel='B', ylabel='C'>,
+                <Axes: xlabel='C', ylabel='C'>, <Axes: xlabel='D', ylabel='C'>],
+               [<Axes: xlabel='A', ylabel='D'>, <Axes: xlabel='B', ylabel='D'>,
+                <Axes: xlabel='C', ylabel='D'>, <Axes: xlabel='D', ylabel='D'>]],
+              dtype=object)
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.scatter_matrix(
+        frame=frame,
+        alpha=alpha,
+        figsize=figsize,
+        ax=ax,
+        grid=grid,
+        diagonal=diagonal,
+        marker=marker,
+        density_kwds=density_kwds,
+        hist_kwds=hist_kwds,
+        range_padding=range_padding,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def radviz(
+    frame: DataFrame,
+    class_column: str,
+    ax: Axes | None = None,
+    color: list[str] | tuple[str, ...] | None = None,
+    colormap: Colormap | str | None = None,
+    **kwds,
+) -> Axes:
+    """
+    Plot a multidimensional dataset in 2D.
+
+    Each Series in the DataFrame is represented as an evenly distributed
+    slice on a circle. Each data point is rendered in the circle according to
+    the value on each Series. Highly correlated `Series` in the `DataFrame`
+    are placed closer on the unit circle.
+
+    RadViz allow to project an N-dimensional data set into a 2D space where the
+    influence of each dimension can be interpreted as a balance between the
+    influence of all dimensions.
+
+    More info available at the `original article
+    <https://doi.org/10.1145/331770.331775>`_
+    describing RadViz.
+
+    Parameters
+    ----------
+    frame : `DataFrame`
+        Object holding the data.
+    class_column : str
+        Column name containing the name of the data point category.
+    ax : :class:`matplotlib.axes.Axes`, optional
+        A plot instance to which to add the information.
+    color : list[str] or tuple[str], optional
+        Assign a color to each category. Example: ['blue', 'green'].
+    colormap : str or :class:`matplotlib.colors.Colormap`, default None
+        Colormap to select colors from. If string, load colormap with that
+        name from matplotlib.
+    **kwds
+        Options to pass to matplotlib scatter plotting method.
+
+    Returns
+    -------
+    :class:`matplotlib.axes.Axes`
+        The Axes object from Matplotlib.
+
+    See Also
+    --------
+    plotting.andrews_curves : Plot clustering visualization.
+
+    Examples
+    --------
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "SepalLength": [6.5, 7.7, 5.1, 5.8, 7.6, 5.0, 5.4, 4.6, 6.7, 4.6],
+        ...         "SepalWidth": [3.0, 3.8, 3.8, 2.7, 3.0, 2.3, 3.0, 3.2, 3.3, 3.6],
+        ...         "PetalLength": [5.5, 6.7, 1.9, 5.1, 6.6, 3.3, 4.5, 1.4, 5.7, 1.0],
+        ...         "PetalWidth": [1.8, 2.2, 0.4, 1.9, 2.1, 1.0, 1.5, 0.2, 2.1, 0.2],
+        ...         "Category": [
+        ...             "virginica",
+        ...             "virginica",
+        ...             "setosa",
+        ...             "virginica",
+        ...             "virginica",
+        ...             "versicolor",
+        ...             "versicolor",
+        ...             "setosa",
+        ...             "virginica",
+        ...             "setosa",
+        ...         ],
+        ...     }
+        ... )
+        >>> pd.plotting.radviz(df, "Category")  # doctest: +SKIP
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.radviz(
+        frame=frame,
+        class_column=class_column,
+        ax=ax,
+        color=color,
+        colormap=colormap,
+        **kwds,
+    )
+
+
+@set_module("pandas.plotting")
+def andrews_curves(
+    frame: DataFrame,
+    class_column: str,
+    ax: Axes | None = None,
+    samples: int = 200,
+    color: list[str] | tuple[str, ...] | None = None,
+    colormap: Colormap | str | None = None,
+    **kwargs,
+) -> Axes:
+    """
+    Generate a matplotlib plot for visualizing clusters of multivariate data.
+
+    Andrews curves have the functional form:
+
+    .. math::
+        f(t) = \\frac{x_1}{\\sqrt{2}} + x_2 \\sin(t) + x_3 \\cos(t) +
+        x_4 \\sin(2t) + x_5 \\cos(2t) + \\cdots
+
+    Where :math:`x` coefficients correspond to the values of each dimension
+    and :math:`t` is linearly spaced between :math:`-\\pi` and :math:`+\\pi`.
+    Each row of frame then corresponds to a single curve.
+
+    Parameters
+    ----------
+    frame : DataFrame
+        Data to be plotted, preferably normalized to (0.0, 1.0).
+    class_column : label
+        Name of the column containing class names.
+    ax : axes object, default None
+        Axes to use.
+    samples : int
+        Number of points to plot in each curve.
+    color : str, list[str] or tuple[str], optional
+        Colors to use for the different classes. Colors can be strings
+        or 3-element floating point RGB values.
+    colormap : str or matplotlib colormap object, default None
+        Colormap to select colors from. If a string, load colormap with that
+        name from matplotlib.
+    **kwargs
+        Options to pass to matplotlib plotting method.
+
+    Returns
+    -------
+    :class:`matplotlib.axes.Axes`
+        The matplotlib Axes object with the plot.
+
+    See Also
+    --------
+    plotting.parallel_coordinates : Plot parallel coordinates chart.
+    DataFrame.plot : Make plots of Series or DataFrame.
+
+    Examples
+    --------
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.read_csv(
+        ...     "https://raw.githubusercontent.com/pandas-dev/"
+        ...     "pandas/main/pandas/tests/io/data/csv/iris.csv"
+        ... )  # doctest: +SKIP
+        >>> pd.plotting.andrews_curves(df, "Name")  # doctest: +SKIP
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.andrews_curves(
+        frame=frame,
+        class_column=class_column,
+        ax=ax,
+        samples=samples,
+        color=color,
+        colormap=colormap,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def bootstrap_plot(
+    series: Series,
+    fig: Figure | None = None,
+    size: int = 50,
+    samples: int = 500,
+    **kwds,
+) -> Figure:
+    """
+    Bootstrap plot on mean, median and mid-range statistics.
+
+    The bootstrap plot is used to estimate the uncertainty of a statistic
+    by relying on random sampling with replacement [1]_. This function will
+    generate bootstrapping plots for mean, median and mid-range statistics
+    for the given number of samples of the given size.
+
+    .. [1] "Bootstrapping (statistics)" in \
+    https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29
+
+    Parameters
+    ----------
+    series : pandas.Series
+        Series from where to get the samplings for the bootstrapping.
+    fig : matplotlib.figure.Figure, default None
+        If given, it will use the `fig` reference for plotting instead of
+        creating a new one with default parameters.
+    size : int, default 50
+        Number of data points to consider during each sampling. It must be
+        less than or equal to the length of the `series`.
+    samples : int, default 500
+        Number of times the bootstrap procedure is performed.
+    **kwds
+        Options to pass to matplotlib plotting method.
+
+    Returns
+    -------
+    matplotlib.figure.Figure
+        Matplotlib figure.
+
+    See Also
+    --------
+    DataFrame.plot : Basic plotting for DataFrame objects.
+    Series.plot : Basic plotting for Series objects.
+
+    Examples
+    --------
+    This example draws a basic bootstrap plot for a Series.
+
+    .. plot::
+        :context: close-figs
+
+        >>> s = pd.Series(np.random.uniform(size=100))
+        >>> pd.plotting.bootstrap_plot(s)  # doctest: +SKIP
+        <Figure size 640x480 with 6 Axes>
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.bootstrap_plot(
+        series=series, fig=fig, size=size, samples=samples, **kwds
+    )
+
+
+@set_module("pandas.plotting")
+def parallel_coordinates(
+    frame: DataFrame,
+    class_column: str,
+    cols: list[str] | None = None,
+    ax: Axes | None = None,
+    color: list[str] | tuple[str, ...] | None = None,
+    use_columns: bool = False,
+    xticks: list | tuple | None = None,
+    colormap: Colormap | str | None = None,
+    axvlines: bool = True,
+    axvlines_kwds: Mapping[str, Any] | None = None,
+    sort_labels: bool = False,
+    **kwargs,
+) -> Axes:
+    """
+    Parallel coordinates plotting.
+
+    Parameters
+    ----------
+    frame : DataFrame
+        The DataFrame to be plotted.
+    class_column : str
+        Column name containing class names.
+    cols : list, optional
+        A list of column names to use.
+    ax : matplotlib.axis, optional
+        Matplotlib axis object.
+    color : list or tuple, optional
+        Colors to use for the different classes.
+    use_columns : bool, optional
+        If true, columns will be used as xticks.
+    xticks : list or tuple, optional
+        A list of values to use for xticks.
+    colormap : str or matplotlib colormap, default None
+        Colormap to use for line colors.
+    axvlines : bool, optional
+        If true, vertical lines will be added at each xtick.
+    axvlines_kwds : keywords, optional
+        Options to be passed to axvline method for vertical lines.
+    sort_labels : bool, default False
+        Sort class_column labels, useful when assigning colors.
+    **kwargs
+        Options to pass to matplotlib plotting method.
+
+    Returns
+    -------
+    matplotlib.axes.Axes
+        The matplotlib axes containing the parallel coordinates plot.
+
+    See Also
+    --------
+    plotting.andrews_curves : Generate a matplotlib plot for visualizing clusters
+        of multivariate data.
+    plotting.radviz : Plot a multidimensional dataset in 2D.
+
+    Examples
+    --------
+
+    .. plot::
+        :context: close-figs
+
+        >>> df = pd.read_csv(
+        ...     "https://raw.githubusercontent.com/pandas-dev/"
+        ...     "pandas/main/pandas/tests/io/data/csv/iris.csv"
+        ... )  # doctest: +SKIP
+        >>> pd.plotting.parallel_coordinates(
+        ...     df, "Name", color=("#556270", "#4ECDC4", "#C7F464")
+        ... )  # doctest: +SKIP
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.parallel_coordinates(
+        frame=frame,
+        class_column=class_column,
+        cols=cols,
+        ax=ax,
+        color=color,
+        use_columns=use_columns,
+        xticks=xticks,
+        colormap=colormap,
+        axvlines=axvlines,
+        axvlines_kwds=axvlines_kwds,
+        sort_labels=sort_labels,
+        **kwargs,
+    )
+
+
+@set_module("pandas.plotting")
+def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Axes:
+    """
+    Lag plot for time series.
+
+    A lag plot is a scatter plot of a time series against a lag of itself. It helps
+    in visualizing the temporal dependence between observations by plotting the values
+    at time `t` on the x-axis and the values at time `t + lag` on the y-axis.
+
+    Parameters
+    ----------
+    series : Series
+        The time series to visualize.
+    lag : int, default 1
+        Lag length of the scatter plot.
+    ax : Matplotlib axis object, optional
+        The matplotlib axis object to use.
+    **kwds
+        Matplotlib scatter method keyword arguments.
+
+    Returns
+    -------
+    matplotlib.axes.Axes
+        The matplotlib Axes object containing the lag plot.
+
+    See Also
+    --------
+    plotting.autocorrelation_plot : Autocorrelation plot for time series.
+    matplotlib.pyplot.scatter : A scatter plot of y vs. x with varying marker size
+        and/or color in Matplotlib.
+
+    Examples
+    --------
+    Lag plots are most commonly used to look for patterns in time series data.
+
+    Given the following time series
+
+    .. plot::
+        :context: close-figs
+
+        >>> np.random.seed(5)
+        >>> x = np.cumsum(np.random.normal(loc=1, scale=5, size=50))
+        >>> s = pd.Series(x)
+        >>> s.plot()  # doctest: +SKIP
+
+    A lag plot with ``lag=1`` returns
+
+    .. plot::
+        :context: close-figs
+
+        >>> _ = pd.plotting.lag_plot(s, lag=1)
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds)
+
+
+@set_module("pandas.plotting")
+def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Axes:
+    """
+    Autocorrelation plot for time series.
+
+    This method generates an autocorrelation plot for a given time series,
+    which helps to identify any periodic structure or correlation within the
+    data across various lags. It shows the correlation of a time series with a
+    delayed copy of itself as a function of delay. Autocorrelation plots are useful for
+    checking randomness in a data set. If the data are random, the autocorrelations
+    should be near zero for any and all time-lag separations. If the data are not
+    random, then one or more of the autocorrelations will be significantly
+    non-zero.
+
+    Parameters
+    ----------
+    series : Series
+        The time series to visualize.
+    ax : Matplotlib axis object, optional
+        The matplotlib axis object to use.
+    **kwargs
+        Options to pass to matplotlib plotting method.
+
+    Returns
+    -------
+    matplotlib.axes.Axes
+        The matplotlib axes containing the autocorrelation plot.
+
+    See Also
+    --------
+    Series.autocorr : Compute the lag-N autocorrelation for a Series.
+    plotting.lag_plot : Lag plot for time series.
+
+    Examples
+    --------
+    The horizontal lines in the plot correspond to 95% and 99% confidence bands.
+
+    The dashed line is 99% confidence band.
+
+    .. plot::
+        :context: close-figs
+
+        >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000)
+        >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing))
+        >>> pd.plotting.autocorrelation_plot(s)  # doctest: +SKIP
+    """
+    plot_backend = _get_plot_backend("matplotlib")
+    return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs)
+
+
+class _Options(dict):
+    """
+    Stores pandas plotting options.
+
+    Allows for parameter aliasing so you can just use parameter names that are
+    the same as the plot function parameters, but is stored in a canonical
+    format that makes it easy to breakdown into groups later.
+
+    See Also
+    --------
+    plotting.register_matplotlib_converters : Register pandas formatters and
+        converters with matplotlib.
+    plotting.bootstrap_plot : Bootstrap plot on mean, median and mid-range statistics.
+    plotting.autocorrelation_plot : Autocorrelation plot for time series.
+    plotting.lag_plot : Lag plot for time series.
+
+    Examples
+    --------
+
+    .. plot::
+            :context: close-figs
+
+             >>> np.random.seed(42)
+             >>> df = pd.DataFrame(
+             ...     {"A": np.random.randn(10), "B": np.random.randn(10)},
+             ...     index=pd.date_range("1/1/2000", freq="4MS", periods=10),
+             ... )
+             >>> with pd.plotting.plot_params.use("x_compat", True):
+             ...     _ = df["A"].plot(color="r")
+             ...     _ = df["B"].plot(color="g")
+    """
+
+    # alias so the names are same as plotting method parameter names
+    _ALIASES = {"x_compat": "xaxis.compat"}
+    _DEFAULT_KEYS = ["xaxis.compat"]
+
+    def __init__(self) -> None:
+        super().__setitem__("xaxis.compat", False)
+
+    def __getitem__(self, key):
+        key = self._get_canonical_key(key)
+        if key not in self:
+            raise ValueError(f"{key} is not a valid pandas plotting option")
+        return super().__getitem__(key)
+
+    def __setitem__(self, key, value) -> None:
+        key = self._get_canonical_key(key)
+        super().__setitem__(key, value)
+
+    def __delitem__(self, key) -> None:
+        key = self._get_canonical_key(key)
+        if key in self._DEFAULT_KEYS:
+            raise ValueError(f"Cannot remove default parameter {key}")
+        super().__delitem__(key)
+
+    def __contains__(self, key) -> bool:
+        key = self._get_canonical_key(key)
+        return super().__contains__(key)
+
+    def reset(self) -> None:
+        """
+        Reset the option store to its initial state
+
+        Returns
+        -------
+        None
+        """
+        # error: Cannot access "__init__" directly
+        self.__init__()  # type: ignore[misc]
+
+    def _get_canonical_key(self, key: str) -> str:
+        return self._ALIASES.get(key, key)
+
+    @contextmanager
+    def use(self, key, value) -> Generator[_Options]:
+        """
+        Temporarily set a parameter value using the with statement.
+        Aliasing allowed.
+        """
+        old_value = self[key]
+        try:
+            self[key] = value
+            yield self
+        finally:
+            self[key] = old_value
+
+
+plot_params = _Options()
+plot_params.__module__ = "pandas.plotting"
diff --git a/pandas/tests/__init__.py b/pandas/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/api/__init__.py b/pandas/tests/api/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4eefb1163352c16e4d4a263a8af1a457ad510599
--- /dev/null
+++ b/pandas/tests/api/test_api.py
@@ -0,0 +1,577 @@
+from __future__ import annotations
+
+import importlib
+import inspect
+import pathlib
+import pkgutil
+
+import pytest
+
+import pandas as pd
+from pandas import api
+import pandas._testing as tm
+from pandas.api import (
+    executors as api_executors,
+    extensions as api_extensions,
+    indexers as api_indexers,
+    interchange as api_interchange,
+    types as api_types,
+    typing as api_typing,
+)
+from pandas.api.typing import aliases as api_aliases
+
+
+class Base:
+    def check(self, namespace, expected, ignored=None):
+        # see which names are in the namespace, minus optional
+        # ignored ones
+        # compare vs the expected
+
+        result = sorted(
+            f for f in dir(namespace) if not f.startswith("__") and f != "annotations"
+        )
+        if ignored is not None:
+            result = sorted(set(result) - set(ignored))
+
+        expected = sorted(expected)
+        tm.assert_almost_equal(result, expected)
+
+
+class TestPDApi(Base):
+    # these are optionally imported based on testing
+    # & need to be ignored
+    ignored = ["tests", "locale", "conftest", "_version_meson"]
+
+    # top-level sub-packages
+    public_lib = [
+        "api",
+        "arrays",
+        "options",
+        "test",
+        "testing",
+        "errors",
+        "plotting",
+        "io",
+        "tseries",
+    ]
+    private_lib = ["compat", "core", "pandas", "util", "_built_with_meson"]
+
+    # misc
+    misc = ["IndexSlice", "NaT", "NA"]
+
+    # top-level classes
+    classes = [
+        "ArrowDtype",
+        "Categorical",
+        "CategoricalIndex",
+        "DataFrame",
+        "DateOffset",
+        "DatetimeIndex",
+        "ExcelFile",
+        "ExcelWriter",
+        "Flags",
+        "Grouper",
+        "HDFStore",
+        "Index",
+        "MultiIndex",
+        "Period",
+        "PeriodIndex",
+        "RangeIndex",
+        "Series",
+        "SparseDtype",
+        "StringDtype",
+        "Timedelta",
+        "TimedeltaIndex",
+        "Timestamp",
+        "Interval",
+        "IntervalIndex",
+        "CategoricalDtype",
+        "PeriodDtype",
+        "IntervalDtype",
+        "DatetimeTZDtype",
+        "BooleanDtype",
+        "Int8Dtype",
+        "Int16Dtype",
+        "Int32Dtype",
+        "Int64Dtype",
+        "UInt8Dtype",
+        "UInt16Dtype",
+        "UInt32Dtype",
+        "UInt64Dtype",
+        "Float32Dtype",
+        "Float64Dtype",
+        "NamedAgg",
+    ]
+
+    # these are already deprecated; awaiting removal
+    deprecated_classes: list[str] = []
+
+    # external modules exposed in pandas namespace
+    modules: list[str] = []
+
+    # top-level functions
+    funcs = [
+        "array",
+        "bdate_range",
+        "col",
+        "concat",
+        "crosstab",
+        "cut",
+        "date_range",
+        "interval_range",
+        "eval",
+        "factorize",
+        "get_dummies",
+        "from_dummies",
+        "infer_freq",
+        "isna",
+        "isnull",
+        "lreshape",
+        "melt",
+        "notna",
+        "notnull",
+        "offsets",
+        "merge",
+        "merge_ordered",
+        "merge_asof",
+        "period_range",
+        "pivot",
+        "pivot_table",
+        "qcut",
+        "show_versions",
+        "timedelta_range",
+        "unique",
+        "wide_to_long",
+    ]
+
+    # top-level option funcs
+    funcs_option = [
+        "reset_option",
+        "describe_option",
+        "get_option",
+        "option_context",
+        "set_option",
+        "set_eng_float_format",
+    ]
+
+    # top-level read_* funcs
+    funcs_read = [
+        "read_clipboard",
+        "read_csv",
+        "read_excel",
+        "read_fwf",
+        "read_hdf",
+        "read_html",
+        "read_xml",
+        "read_json",
+        "read_pickle",
+        "read_sas",
+        "read_sql",
+        "read_sql_query",
+        "read_sql_table",
+        "read_stata",
+        "read_table",
+        "read_feather",
+        "read_parquet",
+        "read_orc",
+        "read_spss",
+        "read_iceberg",
+    ]
+
+    # top-level json funcs
+    funcs_json = ["json_normalize"]
+
+    # top-level to_* funcs
+    funcs_to = ["to_datetime", "to_numeric", "to_pickle", "to_timedelta"]
+
+    # top-level to deprecate in the future
+    deprecated_funcs_in_future: list[str] = []
+
+    # these are already deprecated; awaiting removal
+    deprecated_funcs: list[str] = []
+
+    # private modules in pandas namespace
+    private_modules = [
+        "_config",
+        "_libs",
+        "_is_numpy_dev",
+        "_pandas_datetime_CAPI",
+        "_pandas_parser_CAPI",
+        "_testing",
+        "_typing",
+    ]
+    if not pd._built_with_meson:
+        private_modules.append("_version")
+
+    def test_api(self):
+        checkthese = (
+            self.public_lib
+            + self.private_lib
+            + self.misc
+            + self.modules
+            + self.classes
+            + self.funcs
+            + self.funcs_option
+            + self.funcs_read
+            + self.funcs_json
+            + self.funcs_to
+            + self.private_modules
+        )
+        self.check(namespace=pd, expected=checkthese, ignored=self.ignored)
+
+    def test_api_all(self):
+        expected = set(
+            self.public_lib
+            + self.misc
+            + self.modules
+            + self.classes
+            + self.funcs
+            + self.funcs_option
+            + self.funcs_read
+            + self.funcs_json
+            + self.funcs_to
+        ) - set(self.deprecated_classes)
+        actual = set(pd.__all__)
+
+        extraneous = actual - expected
+        assert not extraneous
+
+        missing = expected - actual
+        assert not missing
+
+    def test_depr(self):
+        deprecated_list = (
+            self.deprecated_classes
+            + self.deprecated_funcs
+            + self.deprecated_funcs_in_future
+        )
+        for depr in deprecated_list:
+            with tm.assert_produces_warning(FutureWarning):
+                _ = getattr(pd, depr)
+
+
+class TestApi(Base):
+    allowed_api_dirs = [
+        "executors",
+        "types",
+        "extensions",
+        "indexers",
+        "interchange",
+        "typing",
+        "internals",
+    ]
+    allowed_typing = [
+        "DataFrameGroupBy",
+        "DatetimeIndexResamplerGroupby",
+        "Expanding",
+        "ExpandingGroupby",
+        "ExponentialMovingWindow",
+        "ExponentialMovingWindowGroupby",
+        "Expression",
+        "FrozenList",
+        "JsonReader",
+        "NaTType",
+        "NAType",
+        "NoDefault",
+        "PeriodIndexResamplerGroupby",
+        "Resampler",
+        "Rolling",
+        "RollingGroupby",
+        "SeriesGroupBy",
+        "StataReader",
+        "SASReader",
+        "TimedeltaIndexResamplerGroupby",
+        "TimeGrouper",
+        "Window",
+        "aliases",
+    ]
+    allowed_api_types = [
+        "is_any_real_numeric_dtype",
+        "is_array_like",
+        "is_bool",
+        "is_bool_dtype",
+        "is_categorical_dtype",
+        "is_complex",
+        "is_complex_dtype",
+        "is_datetime64_any_dtype",
+        "is_datetime64_dtype",
+        "is_datetime64_ns_dtype",
+        "is_datetime64tz_dtype",
+        "is_dict_like",
+        "is_dtype_equal",
+        "is_extension_array_dtype",
+        "is_file_like",
+        "is_float",
+        "is_float_dtype",
+        "is_hashable",
+        "is_int64_dtype",
+        "is_integer",
+        "is_integer_dtype",
+        "is_interval_dtype",
+        "is_iterator",
+        "is_list_like",
+        "is_named_tuple",
+        "is_number",
+        "is_numeric_dtype",
+        "is_object_dtype",
+        "is_period_dtype",
+        "is_re",
+        "is_re_compilable",
+        "is_scalar",
+        "is_signed_integer_dtype",
+        "is_sparse",
+        "is_string_dtype",
+        "is_timedelta64_dtype",
+        "is_timedelta64_ns_dtype",
+        "is_unsigned_integer_dtype",
+        "pandas_dtype",
+        "infer_dtype",
+        "union_categoricals",
+        "CategoricalDtype",
+        "DatetimeTZDtype",
+        "IntervalDtype",
+        "PeriodDtype",
+    ]
+    allowed_api_interchange = ["from_dataframe", "DataFrame"]
+    allowed_api_indexers = [
+        "check_array_indexer",
+        "BaseIndexer",
+        "FixedForwardWindowIndexer",
+        "VariableOffsetWindowIndexer",
+    ]
+    allowed_api_extensions = [
+        "no_default",
+        "ExtensionDtype",
+        "register_extension_dtype",
+        "register_dataframe_accessor",
+        "register_index_accessor",
+        "register_series_accessor",
+        "take",
+        "ExtensionArray",
+        "ExtensionScalarOpsMixin",
+    ]
+    allowed_api_executors = ["BaseExecutionEngine"]
+    allowed_api_aliases = [
+        "AggFuncType",
+        "AlignJoin",
+        "AnyAll",
+        "AnyArrayLike",
+        "ArrayLike",
+        "AstypeArg",
+        "Axes",
+        "Axis",
+        "CSVEngine",
+        "ColspaceArgType",
+        "CompressionOptions",
+        "CorrelationMethod",
+        "DropKeep",
+        "Dtype",
+        "DtypeArg",
+        "DtypeBackend",
+        "DtypeObj",
+        "ExcelWriterIfSheetExists",
+        "ExcelWriterMergeCells",
+        "FilePath",
+        "FillnaOptions",
+        "FloatFormatType",
+        "FormattersType",
+        "FromDictOrient",
+        "HTMLFlavors",
+        "IgnoreRaise",
+        "IndexLabel",
+        "InterpolateOptions",
+        "IntervalClosedType",
+        "IntervalLeftRight",
+        "JSONEngine",
+        "JSONSerializable",
+        "JoinHow",
+        "JoinValidate",
+        "ListLike",
+        "MergeHow",
+        "MergeValidate",
+        "NaPosition",
+        "NsmallestNlargestKeep",
+        "OpenFileErrors",
+        "Ordered",
+        "ParquetCompressionOptions",
+        "QuantileInterpolation",
+        "ReadBuffer",
+        "ReadCsvBuffer",
+        "ReadPickleBuffer",
+        "ReindexMethod",
+        "Scalar",
+        "ScalarIndexer",
+        "SequenceIndexer",
+        "SequenceNotStr",
+        "SliceType",
+        "SortKind",
+        "StorageOptions",
+        "Suffixes",
+        "TakeIndexer",
+        "TimeAmbiguous",
+        "TimeGrouperOrigin",
+        "TimeNonexistent",
+        "TimeUnit",
+        "TimedeltaConvertibleTypes",
+        "TimestampConvertibleTypes",
+        "ToStataByteorder",
+        "ToTimestampHow",
+        "UpdateJoin",
+        "UsecolsArgType",
+        "WindowingRankType",
+        "WriteBuffer",
+        "WriteExcelBuffer",
+        "XMLParsers",
+    ]
+
+    def test_api(self):
+        self.check(api, self.allowed_api_dirs)
+
+    def test_api_typing(self):
+        self.check(api_typing, self.allowed_typing)
+
+    def test_api_types(self):
+        self.check(api_types, self.allowed_api_types)
+
+    def test_api_interchange(self):
+        self.check(api_interchange, self.allowed_api_interchange)
+
+    def test_api_indexers(self):
+        self.check(api_indexers, self.allowed_api_indexers)
+
+    def test_api_extensions(self):
+        self.check(api_extensions, self.allowed_api_extensions)
+
+    def test_api_executors(self):
+        self.check(api_executors, self.allowed_api_executors)
+
+    def test_api_typing_aliases(self):
+        self.check(api_aliases, self.allowed_api_aliases)
+
+
+class TestErrors(Base):
+    def test_errors(self):
+        ignored = ["_CurrentDeprecationWarning", "abc", "ctypes", "cow"]
+        self.check(pd.errors, pd.errors.__all__, ignored=ignored)
+
+
+class TestUtil(Base):
+    def test_util(self):
+        self.check(
+            pd.util,
+            ["hash_array", "hash_pandas_object"],
+            ignored=[
+                "_decorators",
+                "_test_decorators",
+                "_exceptions",
+                "_validators",
+                "capitalize_first_letter",
+                "version",
+                "_print_versions",
+                "_tester",
+            ],
+        )
+
+
+class TestTesting(Base):
+    funcs = [
+        "assert_frame_equal",
+        "assert_series_equal",
+        "assert_index_equal",
+        "assert_extension_array_equal",
+    ]
+
+    def test_testing(self):
+        from pandas import testing
+
+        self.check(testing, self.funcs)
+
+    def test_util_in_top_level(self):
+        with pytest.raises(AttributeError, match="foo"):
+            pd.util.foo
+
+
+def get_pandas_objects(
+    module_name: str, recurse: bool
+) -> list[tuple[str, str, object]]:
+    """
+    Get all pandas objects within a module.
+
+    An object is determined to be part of pandas if it has a string
+    __module__ attribute that starts with ``"pandas"``.
+
+    Parameters
+    ----------
+    module_name : str
+        Name of the module to search.
+    recurse : bool
+        Whether to search submodules.
+
+    Returns
+    -------
+        List of all objects that are determined to be a part of pandas.
+    """
+    module = importlib.import_module(module_name)
+    objs = []
+
+    for name, obj in inspect.getmembers(module):
+        module_dunder = getattr(obj, "__module__", None)
+        if isinstance(module_dunder, str) and module_dunder.startswith("pandas"):
+            objs.append((module_name, name, obj))
+
+    if not recurse:
+        return objs
+
+    # __file__ can, but shouldn't, be None
+    assert isinstance(module.__file__, str)
+    paths = [pathlib.Path(module.__file__).parent]
+    for module_info in pkgutil.walk_packages(paths):
+        name = module_info.name
+        if name.startswith("_") or name == "internals":
+            continue
+        objs.extend(
+            get_pandas_objects(f"{module.__name__}.{name}", recurse=module_info.ispkg)
+        )
+    return objs
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize(
+    "module_name",
+    [
+        "pandas",
+        "pandas.api",
+        "pandas.arrays",
+        "pandas.errors",
+        pytest.param("pandas.io", marks=pytest.mark.xfail(reason="Private imports")),
+        "pandas.plotting",
+        "pandas.testing",
+    ],
+)
+def test_attributes_module(module_name):
+    """
+    Ensures that all public objects have their __module__ set to the public import path.
+    """
+    recurse = module_name not in ["pandas", "pandas.testing"]
+    objs = get_pandas_objects(module_name, recurse=recurse)
+    failures = [
+        (module_name, name, type(obj), obj.__module__)
+        for module_name, name, obj in objs
+        if not (
+            obj.__module__ == module_name
+            # Explicit exceptions
+            or ("Dtype" in name and obj.__module__ == "pandas")
+            or (name == "Categorical" and obj.__module__ == "pandas")
+        )
+    ]
+    assert len(failures) == 0, "\n".join(str(e) for e in failures)
+
+    # Check that all objects can indeed be imported from their __module__
+    failures = []
+    for module_name, name, obj in objs:
+        module = importlib.import_module(obj.__module__)
+        try:
+            getattr(module, name)
+        except Exception:
+            failures.append((module_name, name, type(obj), obj.__module__))
+    assert len(failures) == 0, "\n".join(str(e) for e in failures)
diff --git a/pandas/tests/api/test_types.py b/pandas/tests/api/test_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf39370c49d76762760a98e820ad2985a8a81222
--- /dev/null
+++ b/pandas/tests/api/test_types.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+import pandas._testing as tm
+from pandas.api import types
+from pandas.tests.api.test_api import Base
+
+
+class TestTypes(Base):
+    allowed = [
+        "is_any_real_numeric_dtype",
+        "is_bool",
+        "is_bool_dtype",
+        "is_categorical_dtype",
+        "is_complex",
+        "is_complex_dtype",
+        "is_datetime64_any_dtype",
+        "is_datetime64_dtype",
+        "is_datetime64_ns_dtype",
+        "is_datetime64tz_dtype",
+        "is_dtype_equal",
+        "is_float",
+        "is_float_dtype",
+        "is_int64_dtype",
+        "is_integer",
+        "is_integer_dtype",
+        "is_number",
+        "is_numeric_dtype",
+        "is_object_dtype",
+        "is_scalar",
+        "is_sparse",
+        "is_string_dtype",
+        "is_signed_integer_dtype",
+        "is_timedelta64_dtype",
+        "is_timedelta64_ns_dtype",
+        "is_unsigned_integer_dtype",
+        "is_period_dtype",
+        "is_interval_dtype",
+        "is_re",
+        "is_re_compilable",
+        "is_dict_like",
+        "is_iterator",
+        "is_file_like",
+        "is_list_like",
+        "is_hashable",
+        "is_array_like",
+        "is_named_tuple",
+        "pandas_dtype",
+        "union_categoricals",
+        "infer_dtype",
+        "is_extension_array_dtype",
+    ]
+    deprecated: list[str] = []
+    dtypes = ["CategoricalDtype", "DatetimeTZDtype", "PeriodDtype", "IntervalDtype"]
+
+    def test_types(self):
+        self.check(types, self.allowed + self.dtypes + self.deprecated)
+
+    def test_deprecated_from_api_types(self):
+        for t in self.deprecated:
+            with tm.assert_produces_warning(FutureWarning):
+                getattr(types, t)(1)
diff --git a/pandas/tests/apply/__init__.py b/pandas/tests/apply/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/apply/common.py b/pandas/tests/apply/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4d153df54059ca2a82f336e19afb4297eb218a2
--- /dev/null
+++ b/pandas/tests/apply/common.py
@@ -0,0 +1,7 @@
+from pandas.core.groupby.base import transformation_kernels
+
+# There is no Series.cumcount or DataFrame.cumcount
+series_transform_kernels = [
+    x for x in sorted(transformation_kernels) if x != "cumcount"
+]
+frame_transform_kernels = [x for x in sorted(transformation_kernels) if x != "cumcount"]
diff --git a/pandas/tests/apply/conftest.py b/pandas/tests/apply/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..aecf82f5a941948da66c9dda09ec9a826a2706ca
--- /dev/null
+++ b/pandas/tests/apply/conftest.py
@@ -0,0 +1,63 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+from pandas.api.executors import BaseExecutionEngine
+
+
+class MockExecutionEngine(BaseExecutionEngine):
+    """
+    Execution Engine to test if the execution engine interface receives and
+    uses all parameters provided by the user.
+
+    Making this engine work as the default Python engine by calling it, no extra
+    functionality is implemented here.
+
+    When testing, this will be called when this engine is provided, and then the
+    same pandas.map and pandas.apply function will be called, but without engine,
+    executing the default behavior from the python engine.
+    """
+
+    def map(data, func, args, kwargs, decorator, skip_na):
+        kwargs_to_pass = kwargs if isinstance(data, DataFrame) else {}
+        return data.map(func, na_action="ignore" if skip_na else None, **kwargs_to_pass)
+
+    def apply(data, func, args, kwargs, decorator, axis):
+        if isinstance(data, Series):
+            return data.apply(func, convert_dtype=True, args=args, by_row=False)
+        elif isinstance(data, DataFrame):
+            return data.apply(
+                func,
+                axis=axis,
+                raw=False,
+                result_type=None,
+                args=args,
+                by_row="compat",
+                **kwargs,
+            )
+        else:
+            assert isinstance(data, np.ndarray)
+
+            def wrap_function(func):
+                # https://github.com/numpy/numpy/issues/8352
+                def wrapper(*args, **kwargs):
+                    result = func(*args, **kwargs)
+                    if isinstance(result, str):
+                        result = np.array(result, dtype=object)
+                    return result
+
+                return wrapper
+
+            return np.apply_along_axis(wrap_function(func), axis, data, *args, **kwargs)
+
+
+class MockEngineDecorator:
+    __pandas_udf__ = MockExecutionEngine
+
+
+@pytest.fixture(params=[None, MockEngineDecorator])
+def engine(request):
+    return request.param
diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c16425ac2ac73f2ea96173fe6ec97c4b8ef0cb9
--- /dev/null
+++ b/pandas/tests/apply/test_frame_apply.py
@@ -0,0 +1,1875 @@
+from datetime import datetime
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_arm
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.tests.apply.conftest import MockEngineDecorator
+from pandas.tests.frame.common import zip_frames
+from pandas.util.version import Version
+
+
+@pytest.fixture
+def int_frame_const_col():
+    """
+    Fixture for DataFrame of ints which are constant per column
+
+    Columns are ['A', 'B', 'C'], with values (per column): [1, 2, 3]
+    """
+    df = DataFrame(
+        np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1,
+        columns=["A", "B", "C"],
+    )
+    return df
+
+
+@pytest.fixture(
+    params=[
+        "python",
+        pytest.param("numba", marks=pytest.mark.single_cpu),
+        MockEngineDecorator,
+    ]
+)
+def engine(request):
+    if request.param == "numba":
+        pytest.importorskip("numba")
+    return request.param
+
+
+def test_apply(float_frame, engine, request):
+    if engine == "numba":
+        mark = pytest.mark.xfail(reason="numba engine not supporting numpy ufunc yet")
+        request.node.add_marker(mark)
+    with np.errstate(all="ignore"):
+        # ufunc
+        result = np.sqrt(float_frame["A"])
+        expected = float_frame.apply(np.sqrt, engine=engine)["A"]
+        tm.assert_series_equal(result, expected)
+
+        # aggregator
+        result = float_frame.apply(np.mean, engine=engine)["A"]
+        expected = np.mean(float_frame["A"])
+        assert result == expected
+
+        d = float_frame.index[0]
+        result = float_frame.apply(np.mean, axis=1, engine=engine)
+        expected = np.mean(float_frame.xs(d))
+        assert result[d] == expected
+        assert result.index is float_frame.index
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("raw", [True, False])
+@pytest.mark.parametrize("nopython", [True, False])
+def test_apply_args(float_frame, axis, raw, engine, nopython):
+    numba = pytest.importorskip("numba")
+    if (
+        engine == "numba"
+        and Version(numba.__version__) == Version("0.61")
+        and is_platform_arm()
+    ):
+        pytest.skip(f"Segfaults on ARM platforms with numba {numba.__version__}")
+    engine_kwargs = {"nopython": nopython}
+    result = float_frame.apply(
+        lambda x, y: x + y,
+        axis,
+        args=(1,),
+        raw=raw,
+        engine=engine,
+        engine_kwargs=engine_kwargs,
+    )
+    expected = float_frame + 1
+    tm.assert_frame_equal(result, expected)
+
+    # GH:58712
+    result = float_frame.apply(
+        lambda x, a, b: x + a + b,
+        args=(1,),
+        b=2,
+        raw=raw,
+        engine=engine,
+        engine_kwargs=engine_kwargs,
+    )
+    expected = float_frame + 3
+    tm.assert_frame_equal(result, expected)
+
+    if engine == "numba":
+        # py signature binding
+        with pytest.raises(TypeError, match="missing a required argument: 'a'"):
+            float_frame.apply(
+                lambda x, a: x + a,
+                b=2,
+                raw=raw,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+            )
+
+        # keyword-only arguments are not supported in numba
+        with pytest.raises(
+            pd.errors.NumbaUtilError,
+            match="numba does not support keyword-only arguments",
+        ):
+            float_frame.apply(
+                lambda x, a, *, b: x + a + b,
+                args=(1,),
+                b=2,
+                raw=raw,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+            )
+
+        with pytest.raises(
+            pd.errors.NumbaUtilError,
+            match="numba does not support keyword-only arguments",
+        ):
+            float_frame.apply(
+                lambda *x, b: x[0] + x[1] + b,
+                args=(1,),
+                b=2,
+                raw=raw,
+                engine=engine,
+                engine_kwargs=engine_kwargs,
+            )
+
+
+def test_apply_categorical_func():
+    # GH 9573
+    df = DataFrame({"c0": ["A", "A", "B", "B"], "c1": ["C", "C", "D", "D"]})
+    result = df.apply(lambda ts: ts.astype("category"))
+
+    assert result.shape == (4, 2)
+    assert isinstance(result["c0"].dtype, CategoricalDtype)
+    assert isinstance(result["c1"].dtype, CategoricalDtype)
+
+
+def test_apply_axis1_with_ea():
+    # GH#36785
+    expected = DataFrame({"A": [Timestamp("2013-01-01", tz="UTC")]})
+    result = expected.apply(lambda x: x, axis=1)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, dtype",
+    [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)],
+)
+def test_agg_axis1_duplicate_index(data, dtype):
+    # GH 42380
+    expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype)
+    result = expected.agg(lambda x: x, axis=1)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_mixed_datetimelike():
+    # mixed datetimelike
+    # GH 7778
+    expected = DataFrame(
+        {
+            "A": date_range("20130101", periods=3),
+            "B": pd.to_timedelta(np.arange(3), unit="s"),
+        }
+    )
+    result = expected.apply(lambda x: x, axis=1)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", [np.sqrt, np.mean])
+def test_apply_empty(func, engine):
+    # empty
+    empty_frame = DataFrame()
+
+    result = empty_frame.apply(func, engine=engine)
+    assert result.empty
+
+
+def test_apply_float_frame(float_frame, engine):
+    no_rows = float_frame[:0]
+    result = no_rows.apply(lambda x: x.mean(), engine=engine)
+    expected = Series(np.nan, index=float_frame.columns)
+    tm.assert_series_equal(result, expected)
+
+    no_cols = float_frame.loc[:, []]
+    result = no_cols.apply(lambda x: x.mean(), axis=1, engine=engine)
+    expected = Series(np.nan, index=float_frame.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_empty_except_index(engine):
+    # GH 2476
+    expected = DataFrame(index=["a"])
+    result = expected.apply(lambda x: x["a"], axis=1, engine=engine)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_with_reduce_empty():
+    # reduce with an empty DataFrame
+    empty_frame = DataFrame()
+
+    x = []
+    result = empty_frame.apply(x.append, axis=1, result_type="expand")
+    tm.assert_frame_equal(result, empty_frame)
+    result = empty_frame.apply(x.append, axis=1, result_type="reduce")
+    expected = Series([], dtype=np.float64)
+    tm.assert_series_equal(result, expected)
+
+    empty_with_cols = DataFrame(columns=["a", "b", "c"])
+    result = empty_with_cols.apply(x.append, axis=1, result_type="expand")
+    tm.assert_frame_equal(result, empty_with_cols)
+    result = empty_with_cols.apply(x.append, axis=1, result_type="reduce")
+    expected = Series([], dtype=np.float64)
+    tm.assert_series_equal(result, expected)
+
+    # Ensure that x.append hasn't been called
+    assert x == []
+
+
+@pytest.mark.parametrize("func", ["sum", "prod", "any", "all"])
+def test_apply_funcs_over_empty(func):
+    # GH 28213
+    df = DataFrame(columns=["a", "b", "c"])
+
+    result = df.apply(getattr(np, func))
+    expected = getattr(df, func)()
+    if func in ("sum", "prod"):
+        expected = expected.astype(float)
+    tm.assert_series_equal(result, expected)
+
+
+def test_nunique_empty():
+    # GH 28213
+    df = DataFrame(columns=["a", "b", "c"])
+
+    result = df.nunique()
+    expected = Series(0, index=df.columns)
+    tm.assert_series_equal(result, expected)
+
+    result = df.T.nunique()
+    expected = Series([], dtype=np.float64)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_standard_nonunique():
+    df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])
+
+    result = df.apply(lambda s: s[0], axis=1)
+    expected = Series([1, 4, 7], ["a", "a", "c"])
+    tm.assert_series_equal(result, expected)
+
+    result = df.T.apply(lambda s: s[0], axis=0)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_broadcast_scalars(float_frame):
+    # scalars
+    result = float_frame.apply(np.mean, result_type="broadcast")
+    expected = DataFrame([float_frame.mean()], index=float_frame.index)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_broadcast_scalars_axis1(float_frame):
+    result = float_frame.apply(np.mean, axis=1, result_type="broadcast")
+    m = float_frame.mean(axis=1)
+    expected = DataFrame(dict.fromkeys(float_frame.columns, m))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_broadcast_lists_columns(float_frame):
+    # lists
+    result = float_frame.apply(
+        lambda x: list(range(len(float_frame.columns))),
+        axis=1,
+        result_type="broadcast",
+    )
+    m = list(range(len(float_frame.columns)))
+    expected = DataFrame(
+        [m] * len(float_frame.index),
+        dtype="float64",
+        index=float_frame.index,
+        columns=float_frame.columns,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_broadcast_lists_index(float_frame):
+    result = float_frame.apply(
+        lambda x: list(range(len(float_frame.index))), result_type="broadcast"
+    )
+    m = list(range(len(float_frame.index)))
+    expected = DataFrame(
+        dict.fromkeys(float_frame.columns, m),
+        dtype="float64",
+        index=float_frame.index,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_broadcast_list_lambda_func(int_frame_const_col):
+    # preserve columns
+    df = int_frame_const_col
+    result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="broadcast")
+    tm.assert_frame_equal(result, df)
+
+
+def test_apply_broadcast_series_lambda_func(int_frame_const_col):
+    df = int_frame_const_col
+    result = df.apply(
+        lambda x: Series([1, 2, 3], index=list("abc")),
+        axis=1,
+        result_type="broadcast",
+    )
+    expected = df.copy()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+def test_apply_raw_float_frame(float_frame, axis, engine):
+    if engine == "numba":
+        pytest.skip("numba can't handle when UDF returns None.")
+
+    def _assert_raw(x):
+        assert isinstance(x, np.ndarray)
+        assert x.ndim == 1
+
+    float_frame.apply(_assert_raw, axis=axis, engine=engine, raw=True)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+def test_apply_raw_float_frame_lambda(float_frame, axis, engine):
+    result = float_frame.apply(np.mean, axis=axis, engine=engine, raw=True)
+    expected = float_frame.apply(lambda x: x.values.mean(), axis=axis)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_raw_float_frame_no_reduction(float_frame, engine):
+    # no reduction
+    result = float_frame.apply(lambda x: x * 2, engine=engine, raw=True)
+    expected = float_frame * 2
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+def test_apply_raw_mixed_type_frame(axis, engine):
+    if engine == "numba":
+        pytest.skip("isinstance check doesn't work with numba")
+
+    def _assert_raw(x):
+        assert isinstance(x, np.ndarray)
+        assert x.ndim == 1
+
+    # Mixed dtype (GH-32423)
+    df = DataFrame(
+        {
+            "a": 1.0,
+            "b": 2,
+            "c": "foo",
+            "float32": np.array([1.0] * 10, dtype="float32"),
+            "int32": np.array([1] * 10, dtype="int32"),
+        },
+        index=np.arange(10),
+    )
+    df.apply(_assert_raw, axis=axis, engine=engine, raw=True)
+
+
+def test_apply_axis1(float_frame):
+    d = float_frame.index[0]
+    result = float_frame.apply(np.mean, axis=1)[d]
+    expected = np.mean(float_frame.xs(d))
+    assert result == expected
+
+
+def test_apply_mixed_dtype_corner():
+    df = DataFrame({"A": ["foo"], "B": [1.0]})
+    result = df[:0].apply(np.mean, axis=1)
+    # the result here is actually kind of ambiguous, should it be a Series
+    # or a DataFrame?
+    expected = Series(dtype=np.float64)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_mixed_dtype_corner_indexing():
+    df = DataFrame({"A": ["foo"], "B": [1.0]})
+    result = df.apply(lambda x: x["A"], axis=1)
+    expected = Series(["foo"], index=range(1))
+    tm.assert_series_equal(result, expected)
+
+    result = df.apply(lambda x: x["B"], axis=1)
+    expected = Series([1.0], index=range(1))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore::RuntimeWarning")
+@pytest.mark.parametrize("ax", ["index", "columns"])
+@pytest.mark.parametrize(
+    "func", [lambda x: x, lambda x: x.mean()], ids=["identity", "mean"]
+)
+@pytest.mark.parametrize("raw", [True, False])
+@pytest.mark.parametrize("axis", [0, 1])
+def test_apply_empty_infer_type(ax, func, raw, axis, engine, request):
+    df = DataFrame(**{ax: ["a", "b", "c"]})
+
+    with np.errstate(all="ignore"):
+        test_res = func(np.array([], dtype="f8"))
+        is_reduction = not isinstance(test_res, np.ndarray)
+
+        result = df.apply(func, axis=axis, engine=engine, raw=raw)
+        if is_reduction:
+            agg_axis = df._get_agg_axis(axis)
+            assert isinstance(result, Series)
+            assert result.index is agg_axis
+        else:
+            assert isinstance(result, DataFrame)
+
+
+def test_apply_empty_infer_type_broadcast():
+    no_cols = DataFrame(index=["a", "b", "c"])
+    result = no_cols.apply(lambda x: x.mean(), result_type="broadcast")
+    assert isinstance(result, DataFrame)
+
+
+def test_apply_with_args_kwds_add_some(float_frame):
+    def add_some(x, howmuch=0):
+        return x + howmuch
+
+    result = float_frame.apply(add_some, howmuch=2)
+    expected = float_frame.apply(lambda x: x + 2)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_with_args_kwds_agg_and_add(float_frame):
+    def agg_and_add(x, howmuch=0):
+        return x.mean() + howmuch
+
+    result = float_frame.apply(agg_and_add, howmuch=2)
+    expected = float_frame.apply(lambda x: x.mean() + 2)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_with_args_kwds_subtract_and_divide(float_frame):
+    def subtract_and_divide(x, sub, divide=1):
+        return (x - sub) / divide
+
+    result = float_frame.apply(subtract_and_divide, args=(2,), divide=2)
+    expected = float_frame.apply(lambda x: (x - 2.0) / 2.0)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_yield_list(float_frame):
+    result = float_frame.apply(list)
+    tm.assert_frame_equal(result, float_frame)
+
+
+def test_apply_reduce_Series(float_frame):
+    float_frame.iloc[::2, float_frame.columns.get_loc("A")] = np.nan
+    expected = float_frame.mean(axis=1)
+    result = float_frame.apply(np.mean, axis=1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_reduce_to_dict():
+    # GH 25196 37544
+    data = DataFrame([[1, 2], [3, 4]], columns=["c0", "c1"], index=["i0", "i1"])
+
+    result = data.apply(dict, axis=0)
+    expected = Series([{"i0": 1, "i1": 3}, {"i0": 2, "i1": 4}], index=data.columns)
+    tm.assert_series_equal(result, expected)
+
+    result = data.apply(dict, axis=1)
+    expected = Series([{"c0": 1, "c1": 2}, {"c0": 3, "c1": 4}], index=data.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_differently_indexed():
+    df = DataFrame(np.random.default_rng(2).standard_normal((20, 10)))
+
+    result = df.apply(Series.describe, axis=0)
+    expected = DataFrame({i: v.describe() for i, v in df.items()}, columns=df.columns)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.apply(Series.describe, axis=1)
+    expected = DataFrame({i: v.describe() for i, v in df.T.items()}, columns=df.index).T
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_bug():
+    # GH 6125
+    positions = DataFrame(
+        [
+            [1, "ABC0", 50],
+            [1, "YUM0", 20],
+            [1, "DEF0", 20],
+            [2, "ABC1", 50],
+            [2, "YUM1", 20],
+            [2, "DEF1", 20],
+        ],
+        columns=["a", "market", "position"],
+    )
+
+    def f(r):
+        return r["market"]
+
+    expected = positions.apply(f, axis=1)
+
+    positions = DataFrame(
+        [
+            [datetime(2013, 1, 1), "ABC0", 50],
+            [datetime(2013, 1, 2), "YUM0", 20],
+            [datetime(2013, 1, 3), "DEF0", 20],
+            [datetime(2013, 1, 4), "ABC1", 50],
+            [datetime(2013, 1, 5), "YUM1", 20],
+            [datetime(2013, 1, 6), "DEF1", 20],
+        ],
+        columns=["a", "market", "position"],
+    )
+    result = positions.apply(f, axis=1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_convert_objects():
+    expected = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+
+    result = expected.apply(lambda x: x, axis=1)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_attach_name(float_frame):
+    result = float_frame.apply(lambda x: x.name)
+    expected = Series(float_frame.columns, index=float_frame.columns)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_attach_name_axis1(float_frame):
+    result = float_frame.apply(lambda x: x.name, axis=1)
+    expected = Series(float_frame.index, index=float_frame.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_attach_name_non_reduction(float_frame):
+    # non-reductions
+    result = float_frame.apply(lambda x: np.repeat(x.name, len(x)))
+    expected = DataFrame(
+        np.tile(float_frame.columns, (len(float_frame.index), 1)),
+        index=float_frame.index,
+        columns=float_frame.columns,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_attach_name_non_reduction_axis1(float_frame):
+    result = float_frame.apply(lambda x: np.repeat(x.name, len(x)), axis=1)
+    expected = Series(
+        np.repeat(t[0], len(float_frame.columns)) for t in float_frame.itertuples()
+    )
+    expected.index = float_frame.index
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_multi_index():
+    index = MultiIndex.from_arrays([["a", "a", "b"], ["c", "d", "d"]])
+    s = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["col1", "col2"])
+    result = s.apply(lambda x: Series({"min": min(x), "max": max(x)}), 1)
+    expected = DataFrame([[1, 2], [3, 4], [5, 6]], index=index, columns=["min", "max"])
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+@pytest.mark.parametrize(
+    "df, dicts",
+    [
+        [
+            DataFrame([["foo", "bar"], ["spam", "eggs"]]),
+            Series([{0: "foo", 1: "spam"}, {0: "bar", 1: "eggs"}]),
+        ],
+        [DataFrame([[0, 1], [2, 3]]), Series([{0: 0, 1: 2}, {0: 1, 1: 3}])],
+    ],
+)
+def test_apply_dict(df, dicts):
+    # GH 8735
+    fn = lambda x: x.to_dict()
+    reduce_true = df.apply(fn, result_type="reduce")
+    reduce_false = df.apply(fn, result_type="expand")
+    reduce_none = df.apply(fn)
+
+    tm.assert_series_equal(reduce_true, dicts)
+    tm.assert_frame_equal(reduce_false, df)
+    tm.assert_series_equal(reduce_none, dicts)
+
+
+def test_apply_non_numpy_dtype():
+    # GH 12244
+    df = DataFrame({"dt": date_range("2015-01-01", periods=3, tz="Europe/Brussels")})
+    result = df.apply(lambda x: x)
+    tm.assert_frame_equal(result, df)
+
+    result = df.apply(lambda x: x + pd.Timedelta("1day"))
+    expected = DataFrame(
+        {"dt": date_range("2015-01-02", periods=3, tz="Europe/Brussels")}
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_non_numpy_dtype_category():
+    df = DataFrame({"dt": ["a", "b", "c", "a"]}, dtype="category")
+    result = df.apply(lambda x: x)
+    tm.assert_frame_equal(result, df)
+
+
+def test_apply_dup_names_multi_agg():
+    # GH 21063
+    df = DataFrame([[0, 1], [2, 3]], columns=["a", "a"])
+    expected = DataFrame([[0, 1]], columns=["a", "a"], index=["min"])
+    result = df.agg(["min"])
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", ["apply", "agg"])
+def test_apply_nested_result_axis_1(op):
+    # GH 13820
+    def apply_list(row):
+        return [2 * row["A"], 2 * row["C"], 2 * row["B"]]
+
+    df = DataFrame(np.zeros((4, 4)), columns=list("ABCD"))
+    result = getattr(df, op)(apply_list, axis=1)
+    expected = Series(
+        [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_noreduction_tzaware_object():
+    # https://github.com/pandas-dev/pandas/issues/31505
+    expected = DataFrame(
+        {"foo": [Timestamp("2020", tz="UTC")]}, dtype="datetime64[ns, UTC]"
+    )
+    result = expected.apply(lambda x: x)
+    tm.assert_frame_equal(result, expected)
+    result = expected.apply(lambda x: x.copy())
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_function_runs_once():
+    # https://github.com/pandas-dev/pandas/issues/30815
+
+    df = DataFrame({"a": [1, 2, 3]})
+    names = []  # Save row names function is applied to
+
+    def reducing_function(row):
+        names.append(row.name)
+
+    def non_reducing_function(row):
+        names.append(row.name)
+        return row
+
+    for func in [reducing_function, non_reducing_function]:
+        del names[:]
+
+        df.apply(func, axis=1)
+        assert names == list(df.index)
+
+
+def test_apply_raw_function_runs_once(engine):
+    # https://github.com/pandas-dev/pandas/issues/34506
+    if engine == "numba":
+        pytest.skip("appending to list outside of numba func is not supported")
+
+    df = DataFrame({"a": [1, 2, 3]})
+    values = []  # Save row values function is applied to
+
+    def reducing_function(row):
+        values.extend(row)
+
+    def non_reducing_function(row):
+        values.extend(row)
+        return row
+
+    for func in [reducing_function, non_reducing_function]:
+        del values[:]
+
+        df.apply(func, engine=engine, raw=True, axis=1)
+        assert values == list(df.a.to_list())
+
+
+def test_apply_with_byte_string():
+    # GH 34529
+    df = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"])
+    expected = DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object)
+    # After we make the apply we expect a dataframe just
+    # like the original but with the object datatype
+    result = df.apply(lambda x: x.astype("object"))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", ["asd", 12, None, np.nan])
+def test_apply_category_equalness(val):
+    # Check if categorical comparisons on apply, GH 21239
+    df_values = ["asd", None, 12, "asd", "cde", np.nan]
+    df = DataFrame({"a": df_values}, dtype="category")
+
+    result = df.a.apply(lambda x: x == val)
+    expected = Series(
+        [False if pd.isnull(x) else x == val for x in df_values], name="a"
+    )
+    # False since behavior of NaN for categorical dtype has been changed (GH 59966)
+    tm.assert_series_equal(result, expected)
+
+
+# the user has supplied an opaque UDF where
+# they are transforming the input that requires
+# us to infer the output
+
+
+def test_infer_row_shape():
+    # GH 17437
+    # if row shape is changing, infer it
+    df = DataFrame(np.random.default_rng(2).random((10, 2)))
+    result = df.apply(np.fft.fft, axis=0).shape
+    assert result == (10, 2)
+
+    result = df.apply(np.fft.rfft, axis=0).shape
+    assert result == (6, 2)
+
+
+@pytest.mark.parametrize(
+    "ops, by_row, expected",
+    [
+        ({"a": lambda x: x + 1}, "compat", DataFrame({"a": [2, 3]})),
+        ({"a": lambda x: x + 1}, False, DataFrame({"a": [2, 3]})),
+        ({"a": lambda x: x.sum()}, "compat", Series({"a": 3})),
+        ({"a": lambda x: x.sum()}, False, Series({"a": 3})),
+        (
+            {"a": ["sum", np.sum, lambda x: x.sum()]},
+            "compat",
+            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
+        ),
+        (
+            {"a": ["sum", np.sum, lambda x: x.sum()]},
+            False,
+            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
+        ),
+        ({"a": lambda x: 1}, "compat", DataFrame({"a": [1, 1]})),
+        ({"a": lambda x: 1}, False, Series({"a": 1})),
+    ],
+)
+def test_dictlike_lambda(ops, by_row, expected):
+    # GH53601
+    df = DataFrame({"a": [1, 2]})
+    result = df.apply(ops, by_row=by_row)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops",
+    [
+        {"a": lambda x: x + 1},
+        {"a": lambda x: x.sum()},
+        {"a": ["sum", np.sum, lambda x: x.sum()]},
+        {"a": lambda x: 1},
+    ],
+)
+def test_dictlike_lambda_raises(ops):
+    # GH53601
+    df = DataFrame({"a": [1, 2]})
+    with pytest.raises(ValueError, match="by_row=True not allowed"):
+        df.apply(ops, by_row=True)
+
+
+def test_with_dictlike_columns():
+    # GH 17602
+    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
+    result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1)
+    expected = Series([{"s": 3} for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+    df["tm"] = [
+        Timestamp("2017-05-01 00:00:00"),
+        Timestamp("2017-05-02 00:00:00"),
+    ]
+    result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1)
+    tm.assert_series_equal(result, expected)
+
+    # compose a series
+    result = (df["a"] + df["b"]).apply(lambda x: {"s": x})
+    expected = Series([{"s": 3}, {"s": 3}])
+    tm.assert_series_equal(result, expected)
+
+
+def test_with_dictlike_columns_with_datetime():
+    # GH 18775
+    df = DataFrame()
+    df["author"] = ["X", "Y", "Z"]
+    df["publisher"] = ["BBC", "NBC", "N24"]
+    df["date"] = pd.to_datetime(
+        ["17-10-2010 07:15:30", "13-05-2011 08:20:35", "15-01-2013 09:09:09"],
+        dayfirst=True,
+    )
+    result = df.apply(lambda x: {}, axis=1)
+    expected = Series([{}, {}, {}])
+    tm.assert_series_equal(result, expected)
+
+
+def test_with_dictlike_columns_with_infer():
+    # GH 17602
+    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
+    result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand")
+    expected = DataFrame({"s": [3, 3]})
+    tm.assert_frame_equal(result, expected)
+
+    df["tm"] = [
+        Timestamp("2017-05-01 00:00:00"),
+        Timestamp("2017-05-02 00:00:00"),
+    ]
+    result = df.apply(lambda x: {"s": x["a"] + x["b"]}, axis=1, result_type="expand")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, by_row, expected",
+    [
+        ([lambda x: x + 1], "compat", DataFrame({("a", "<lambda>"): [2, 3]})),
+        ([lambda x: x + 1], False, DataFrame({("a", "<lambda>"): [2, 3]})),
+        ([lambda x: x.sum()], "compat", DataFrame({"a": [3]}, index=["<lambda>"])),
+        ([lambda x: x.sum()], False, DataFrame({"a": [3]}, index=["<lambda>"])),
+        (
+            ["sum", np.sum, lambda x: x.sum()],
+            "compat",
+            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
+        ),
+        (
+            ["sum", np.sum, lambda x: x.sum()],
+            False,
+            DataFrame({"a": [3, 3, 3]}, index=["sum", "sum", "<lambda>"]),
+        ),
+        (
+            [lambda x: x + 1, lambda x: 3],
+            "compat",
+            DataFrame([[2, 3], [3, 3]], columns=[["a", "a"], ["<lambda>", "<lambda>"]]),
+        ),
+        (
+            [lambda x: 2, lambda x: 3],
+            False,
+            DataFrame({"a": [2, 3]}, ["<lambda>", "<lambda>"]),
+        ),
+    ],
+)
+def test_listlike_lambda(ops, by_row, expected):
+    # GH53601
+    df = DataFrame({"a": [1, 2]})
+    result = df.apply(ops, by_row=by_row)
+    tm.assert_equal(result, expected)
+
+
+def test_listlike_datetime_index_unsorted():
+    # https://github.com/pandas-dev/pandas/pull/62843
+    values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)]
+    df = DataFrame({"a": [1, 2]}, index=[values[1], values[0]])
+    result = df.apply([lambda x: x, lambda x: x.shift(freq="D")], by_row=False)
+    expected = DataFrame(
+        [[1.0, 2.0], [2.0, np.nan], [np.nan, 1.0]],
+        index=[values[1], values[0], values[2]],
+        columns=MultiIndex([["a"], ["<lambda>"]], codes=[[0, 0], [0, 0]]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dictlike_datetime_index_unsorted():
+    # https://github.com/pandas-dev/pandas/pull/62843
+    values = [datetime(2024, 1, 1), datetime(2024, 1, 2), datetime(2024, 1, 3)]
+    df = DataFrame({"a": [1, 2], "b": [3, 4]}, index=[values[1], values[0]])
+    result = df.apply(
+        {"a": lambda x: x, "b": lambda x: x.shift(freq="D")}, by_row=False
+    )
+    expected = DataFrame(
+        {
+            "a": [1.0, 2.0, np.nan],
+            "b": [4.0, np.nan, 3.0],
+        },
+        index=[values[1], values[0], values[2]],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops",
+    [
+        [lambda x: x + 1],
+        [lambda x: x.sum()],
+        ["sum", np.sum, lambda x: x.sum()],
+        [lambda x: x + 1, lambda x: 3],
+    ],
+)
+def test_listlike_lambda_raises(ops):
+    # GH53601
+    df = DataFrame({"a": [1, 2]})
+    with pytest.raises(ValueError, match="by_row=True not allowed"):
+        df.apply(ops, by_row=True)
+
+
+def test_with_listlike_columns():
+    # GH 17348
+    df = DataFrame(
+        {
+            "a": Series(np.random.default_rng(2).standard_normal(4)),
+            "b": ["a", "list", "of", "words"],
+            "ts": date_range("2016-10-01", periods=4, freq="h"),
+        }
+    )
+
+    result = df[["a", "b"]].apply(tuple, axis=1)
+    expected = Series([t[1:] for t in df[["a", "b"]].itertuples()])
+    tm.assert_series_equal(result, expected)
+
+    result = df[["a", "ts"]].apply(tuple, axis=1)
+    expected = Series([t[1:] for t in df[["a", "ts"]].itertuples()])
+    tm.assert_series_equal(result, expected)
+
+
+def test_with_listlike_columns_returning_list():
+    # GH 18919
+    df = DataFrame({"x": Series([["a", "b"], ["q"]]), "y": Series([["z"], ["q", "t"]])})
+    df.index = MultiIndex.from_tuples([("i0", "j0"), ("i1", "j1")])
+
+    result = df.apply(lambda row: [el for el in row["x"] if el in row["y"]], axis=1)
+    expected = Series([[], ["q"]], index=df.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_infer_output_shape_columns():
+    # GH 18573
+
+    df = DataFrame(
+        {
+            "number": [1.0, 2.0],
+            "string": ["foo", "bar"],
+            "datetime": [
+                Timestamp("2017-11-29 03:30:00"),
+                Timestamp("2017-11-29 03:45:00"),
+            ],
+        }
+    )
+    result = df.apply(lambda row: (row.number, row.string), axis=1)
+    expected = Series([(t.number, t.string) for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+
+def test_infer_output_shape_listlike_columns():
+    # GH 16353
+
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((6, 3)), columns=["A", "B", "C"]
+    )
+
+    result = df.apply(lambda x: [1, 2, 3], axis=1)
+    expected = Series([[1, 2, 3] for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+    result = df.apply(lambda x: [1, 2], axis=1)
+    expected = Series([[1, 2] for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", [1, 2])
+def test_infer_output_shape_listlike_columns_np_func(val):
+    # GH 17970
+    df = DataFrame({"a": [1, 2, 3]}, index=list("abc"))
+
+    result = df.apply(lambda row: np.ones(val), axis=1)
+    expected = Series([np.ones(val) for t in df.itertuples()], index=df.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_infer_output_shape_listlike_columns_with_timestamp():
+    # GH 17892
+    df = DataFrame(
+        {
+            "a": [
+                Timestamp("2010-02-01"),
+                Timestamp("2010-02-04"),
+                Timestamp("2010-02-05"),
+                Timestamp("2010-02-06"),
+            ],
+            "b": [9, 5, 4, 3],
+            "c": [5, 3, 4, 2],
+            "d": [1, 2, 3, 4],
+        }
+    )
+
+    def fun(x):
+        return (1, 2)
+
+    result = df.apply(fun, axis=1)
+    expected = Series([(1, 2) for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("lst", [[1, 2, 3], [1, 2]])
+def test_consistent_coerce_for_shapes(lst):
+    # we want column names to NOT be propagated
+    # just because the shape matches the input shape
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((4, 3)), columns=["A", "B", "C"]
+    )
+
+    result = df.apply(lambda x: lst, axis=1)
+    expected = Series([lst for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+
+def test_consistent_names(int_frame_const_col):
+    # if a Series is returned, we should use the resulting index names
+    df = int_frame_const_col
+
+    result = df.apply(
+        lambda x: Series([1, 2, 3], index=["test", "other", "cols"]), axis=1
+    )
+    expected = int_frame_const_col.rename(
+        columns={"A": "test", "B": "other", "C": "cols"}
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.apply(lambda x: Series([1, 2], index=["test", "other"]), axis=1)
+    expected = expected[["test", "other"]]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_result_type(int_frame_const_col):
+    # result_type should be consistent no matter which
+    # path we take in the code
+    df = int_frame_const_col
+
+    result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
+    expected = df.copy()
+    expected.columns = range(3)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_result_type_shorter_list(int_frame_const_col):
+    # result_type should be consistent no matter which
+    # path we take in the code
+    df = int_frame_const_col
+    result = df.apply(lambda x: [1, 2], axis=1, result_type="expand")
+    expected = df[["A", "B"]].copy()
+    expected.columns = range(2)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_result_type_broadcast(int_frame_const_col, request, engine):
+    # result_type should be consistent no matter which
+    # path we take in the code
+    if engine == "numba":
+        mark = pytest.mark.xfail(reason="numba engine doesn't support list return")
+        request.node.add_marker(mark)
+    df = int_frame_const_col
+    if engine is MockEngineDecorator:
+        with pytest.raises(
+            NotImplementedError,
+            match="result_type='broadcast' only implemented for the default engine",
+        ):
+            df.apply(
+                lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
+            )
+    else:
+        # broadcast result
+        result = df.apply(
+            lambda x: [1, 2, 3], axis=1, result_type="broadcast", engine=engine
+        )
+        expected = df.copy()
+        tm.assert_frame_equal(result, expected)
+
+
+def test_result_type_broadcast_series_func(int_frame_const_col, engine, request):
+    # result_type should be consistent no matter which
+    # path we take in the code
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="numba Series constructor only support ndarrays not list data"
+        )
+        request.node.add_marker(mark)
+    df = int_frame_const_col
+    columns = ["other", "col", "names"]
+
+    if engine is MockEngineDecorator:
+        with pytest.raises(
+            NotImplementedError,
+            match="result_type='broadcast' only implemented for the default engine",
+        ):
+            df.apply(
+                lambda x: Series([1, 2, 3], index=columns),
+                axis=1,
+                result_type="broadcast",
+                engine=engine,
+            )
+    else:
+        result = df.apply(
+            lambda x: Series([1, 2, 3], index=columns),
+            axis=1,
+            result_type="broadcast",
+            engine=engine,
+        )
+        expected = df.copy()
+        tm.assert_frame_equal(result, expected)
+
+
+def test_result_type_series_result(int_frame_const_col, engine, request):
+    # result_type should be consistent no matter which
+    # path we take in the code
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="numba Series constructor only support ndarrays not list data"
+        )
+        request.node.add_marker(mark)
+    df = int_frame_const_col
+    # series result
+    result = df.apply(lambda x: Series([1, 2, 3], index=x.index), axis=1, engine=engine)
+    expected = df.copy()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_result_type_series_result_other_index(int_frame_const_col, engine, request):
+    # result_type should be consistent no matter which
+    # path we take in the code
+
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="no support in numba Series constructor for list of columns"
+        )
+        request.node.add_marker(mark)
+    df = int_frame_const_col
+    # series result with other index
+    columns = ["other", "col", "names"]
+    result = df.apply(lambda x: Series([1, 2, 3], index=columns), axis=1, engine=engine)
+    expected = df.copy()
+    expected.columns = columns
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "box",
+    [lambda x: list(x), lambda x: tuple(x), lambda x: np.array(x, dtype="int64")],
+    ids=["list", "tuple", "array"],
+)
+def test_consistency_for_boxed(box, int_frame_const_col):
+    # passing an array or list should not affect the output shape
+    df = int_frame_const_col
+
+    result = df.apply(lambda x: box([1, 2]), axis=1)
+    expected = Series([box([1, 2]) for t in df.itertuples()])
+    tm.assert_series_equal(result, expected)
+
+    result = df.apply(lambda x: box([1, 2]), axis=1, result_type="expand")
+    expected = int_frame_const_col[["A", "B"]].rename(columns={"A": 0, "B": 1})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_transform(axis, float_frame):
+    other_axis = 1 if axis in {0, "index"} else 0
+
+    with np.errstate(all="ignore"):
+        f_abs = np.abs(float_frame)
+        f_sqrt = np.sqrt(float_frame)
+
+        # ufunc
+        expected = f_sqrt.copy()
+        result = float_frame.apply(np.sqrt, axis=axis)
+        tm.assert_frame_equal(result, expected)
+
+        # list-like
+        result = float_frame.apply([np.sqrt], axis=axis)
+        expected = f_sqrt.copy()
+        if axis in {0, "index"}:
+            expected.columns = MultiIndex.from_product([float_frame.columns, ["sqrt"]])
+        else:
+            expected.index = MultiIndex.from_product([float_frame.index, ["sqrt"]])
+        tm.assert_frame_equal(result, expected)
+
+        # multiple items in list
+        # these are in the order as if we are applying both
+        # functions per series and then concatting
+        result = float_frame.apply([np.abs, np.sqrt], axis=axis)
+        expected = zip_frames([f_abs, f_sqrt], axis=other_axis)
+        if axis in {0, "index"}:
+            expected.columns = MultiIndex.from_product(
+                [float_frame.columns, ["absolute", "sqrt"]]
+            )
+        else:
+            expected.index = MultiIndex.from_product(
+                [float_frame.index, ["absolute", "sqrt"]]
+            )
+        tm.assert_frame_equal(result, expected)
+
+
+def test_demo():
+    # demonstration tests
+    df = DataFrame({"A": range(5), "B": 5})
+
+    result = df.agg(["min", "max"])
+    expected = DataFrame(
+        {"A": [0, 4], "B": [5, 5]}, columns=["A", "B"], index=["min", "max"]
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_demo_dict_agg():
+    # demonstration tests
+    df = DataFrame({"A": range(5), "B": 5})
+    result = df.agg({"A": ["min", "max"], "B": ["sum", "max"]})
+    expected = DataFrame(
+        {"A": [4.0, 0.0, np.nan], "B": [5.0, np.nan, 25.0]},
+        columns=["A", "B"],
+        index=["max", "min", "sum"],
+    )
+    tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+
+def test_agg_with_name_as_column_name():
+    # GH 36212 - Column name is "name"
+    data = {"name": ["foo", "bar"]}
+    df = DataFrame(data)
+
+    # result's name should be None
+    result = df.agg({"name": "count"})
+    expected = Series({"name": 2})
+    tm.assert_series_equal(result, expected)
+
+    # Check if name is still preserved when aggregating series instead
+    result = df["name"].agg({"name": "count"})
+    expected = Series({"name": 2}, name="name")
+    tm.assert_series_equal(result, expected)
+
+
+def test_agg_multiple_mixed():
+    # GH 20909
+    mdf = DataFrame(
+        {
+            "A": [1, 2, 3],
+            "B": [1.0, 2.0, 3.0],
+            "C": ["foo", "bar", "baz"],
+        }
+    )
+    expected = DataFrame(
+        {
+            "A": [1, 6],
+            "B": [1.0, 6.0],
+            "C": ["bar", "foobarbaz"],
+        },
+        index=["min", "sum"],
+    )
+    # sorted index
+    result = mdf.agg(["min", "sum"])
+    tm.assert_frame_equal(result, expected)
+
+    result = mdf[["C", "B", "A"]].agg(["sum", "min"])
+    # GH40420: the result of .agg should have an index that is sorted
+    # according to the arguments provided to agg.
+    expected = expected[["C", "B", "A"]].reindex(["sum", "min"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_multiple_mixed_raises():
+    # GH 20909
+    mdf = DataFrame(
+        {
+            "A": [1, 2, 3],
+            "B": [1.0, 2.0, 3.0],
+            "C": ["foo", "bar", "baz"],
+            "D": date_range("20130101", periods=3),
+        }
+    )
+
+    # sorted index
+    msg = "does not support operation"
+    with pytest.raises(TypeError, match=msg):
+        mdf.agg(["min", "sum"])
+
+    with pytest.raises(TypeError, match=msg):
+        mdf[["D", "C", "B", "A"]].agg(["sum", "min"])
+
+
+def test_agg_reduce(axis, float_frame):
+    other_axis = 1 if axis in {0, "index"} else 0
+    name1, name2 = float_frame.axes[other_axis].unique()[:2].sort_values()
+
+    # all reducers
+    expected = pd.concat(
+        [
+            float_frame.mean(axis=axis),
+            float_frame.max(axis=axis),
+            float_frame.sum(axis=axis),
+        ],
+        axis=1,
+    )
+    expected.columns = ["mean", "max", "sum"]
+    expected = expected.T if axis in {0, "index"} else expected
+
+    result = float_frame.agg(["mean", "max", "sum"], axis=axis)
+    tm.assert_frame_equal(result, expected)
+
+    # dict input with scalars
+    func = {name1: "mean", name2: "sum"}
+    result = float_frame.agg(func, axis=axis)
+    expected = Series(
+        [
+            float_frame.loc(other_axis)[name1].mean(),
+            float_frame.loc(other_axis)[name2].sum(),
+        ],
+        index=[name1, name2],
+    )
+    tm.assert_series_equal(result, expected)
+
+    # dict input with lists
+    func = {name1: ["mean"], name2: ["sum"]}
+    result = float_frame.agg(func, axis=axis)
+    expected = DataFrame(
+        {
+            name1: Series([float_frame.loc(other_axis)[name1].mean()], index=["mean"]),
+            name2: Series([float_frame.loc(other_axis)[name2].sum()], index=["sum"]),
+        }
+    )
+    expected = expected.T if axis in {1, "columns"} else expected
+    tm.assert_frame_equal(result, expected)
+
+    # dict input with lists with multiple
+    func = {name1: ["mean", "sum"], name2: ["sum", "max"]}
+    result = float_frame.agg(func, axis=axis)
+    expected = pd.concat(
+        {
+            name1: Series(
+                [
+                    float_frame.loc(other_axis)[name1].mean(),
+                    float_frame.loc(other_axis)[name1].sum(),
+                ],
+                index=["mean", "sum"],
+            ),
+            name2: Series(
+                [
+                    float_frame.loc(other_axis)[name2].sum(),
+                    float_frame.loc(other_axis)[name2].max(),
+                ],
+                index=["sum", "max"],
+            ),
+        },
+        axis=1,
+    )
+    expected = expected.T if axis in {1, "columns"} else expected
+    tm.assert_frame_equal(result, expected)
+
+
+def test_named_agg_reduce_axis1_raises(float_frame):
+    name1, name2 = float_frame.axes[0].unique()[:2].sort_values()
+    msg = "Named aggregation is not supported when axis=1."
+    for axis in [1, "columns"]:
+        with pytest.raises(NotImplementedError, match=msg):
+            float_frame.agg(row1=(name1, "sum"), row2=(name2, "max"), axis=axis)
+
+
+def test_nuiscance_columns():
+    # GH 15015
+    df = DataFrame(
+        {
+            "A": [1, 2, 3],
+            "B": [1.0, 2.0, 3.0],
+            "C": ["foo", "bar", "baz"],
+            "D": date_range("20130101", periods=3),
+        }
+    )
+
+    result = df.agg("min")
+    expected = Series([1, 1.0, "bar", Timestamp("20130101")], index=df.columns)
+    tm.assert_series_equal(result, expected)
+
+    result = df.agg(["min"])
+    expected = DataFrame(
+        [[1, 1.0, "bar", Timestamp("20130101")]],
+        index=["min"],
+        columns=df.columns,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    msg = "does not support operation"
+    with pytest.raises(TypeError, match=msg):
+        df.agg("sum")
+
+    result = df[["A", "B", "C"]].agg("sum")
+    expected = Series([6, 6.0, "foobarbaz"], index=["A", "B", "C"])
+    tm.assert_series_equal(result, expected)
+
+    msg = "does not support operation"
+    with pytest.raises(TypeError, match=msg):
+        df.agg(["sum"])
+
+
+@pytest.mark.parametrize("how", ["agg", "apply"])
+def test_non_callable_aggregates(how):
+    # GH 16405
+    # 'size' is a property of frame/series
+    # validate that this is working
+    # GH 39116 - expand to apply
+    df = DataFrame(
+        {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
+    )
+
+    # Function aggregate
+    result = getattr(df, how)({"A": "count"})
+    expected = Series({"A": 2})
+
+    tm.assert_series_equal(result, expected)
+
+    # Non-function aggregate
+    result = getattr(df, how)({"A": "size"})
+    expected = Series({"A": 3})
+
+    tm.assert_series_equal(result, expected)
+
+    # Mix function and non-function aggs
+    result1 = getattr(df, how)(["count", "size"])
+    result2 = getattr(df, how)(
+        {"A": ["count", "size"], "B": ["count", "size"], "C": ["count", "size"]}
+    )
+    expected = DataFrame(
+        {
+            "A": {"count": 2, "size": 3},
+            "B": {"count": 2, "size": 3},
+            "C": {"count": 2, "size": 3},
+        }
+    )
+
+    tm.assert_frame_equal(result1, result2, check_like=True)
+    tm.assert_frame_equal(result2, expected, check_like=True)
+
+    # Just functional string arg is same as calling df.arg()
+    result = getattr(df, how)("count")
+    expected = df.count()
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["agg", "apply"])
+def test_size_as_str(how, axis):
+    # GH 39934
+    df = DataFrame(
+        {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
+    )
+    # Just a string attribute arg same as calling df.arg
+    # on the columns
+    result = getattr(df, how)("size", axis=axis)
+    if axis in (0, "index"):
+        expected = Series(df.shape[0], index=df.columns)
+    else:
+        expected = Series(df.shape[1], index=df.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_agg_listlike_result():
+    # GH-29587 user defined function returning list-likes
+    df = DataFrame({"A": [2, 2, 3], "B": [1.5, np.nan, 1.5], "C": ["foo", None, "bar"]})
+
+    def func(group_col):
+        return list(group_col.dropna().unique())
+
+    result = df.agg(func)
+    expected = Series([[2, 3], [1.5], ["foo", "bar"]], index=["A", "B", "C"])
+    tm.assert_series_equal(result, expected)
+
+    result = df.agg([func])
+    expected = expected.to_frame("func").T
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize(
+    "args, kwargs",
+    [
+        ((1, 2, 3), {}),
+        ((8, 7, 15), {}),
+        ((1, 2), {}),
+        ((1,), {"b": 2}),
+        ((), {"a": 1, "b": 2}),
+        ((), {"a": 2, "b": 1}),
+        ((), {"a": 1, "b": 2, "c": 3}),
+    ],
+)
+def test_agg_args_kwargs(axis, args, kwargs):
+    def f(x, a, b, c=3):
+        return x.sum() + (a + b) / c
+
+    df = DataFrame([[1, 2], [3, 4]])
+
+    if axis == 0:
+        expected = Series([5.0, 7.0])
+    else:
+        expected = Series([4.0, 8.0])
+
+    result = df.agg(f, axis, *args, **kwargs)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("num_cols", [2, 3, 5])
+def test_frequency_is_original(num_cols, engine, request):
+    # GH 22150
+    if engine == "numba":
+        mark = pytest.mark.xfail(reason="numba engine only supports numeric indices")
+        request.node.add_marker(mark)
+    index = pd.DatetimeIndex(["1950-06-30", "1952-10-24", "1953-05-29"])
+    original = index.copy()
+    df = DataFrame(1, index=index, columns=range(num_cols))
+    df.apply(lambda x: x, engine=engine)
+    assert index.freq == original.freq
+
+
+def test_apply_datetime_tz_issue(engine, request):
+    # GH 29052
+
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="numba engine doesn't support non-numeric indexes"
+        )
+        request.node.add_marker(mark)
+
+    timestamps = [
+        Timestamp("2019-03-15 12:34:31.909000+0000", tz="UTC"),
+        Timestamp("2019-03-15 12:34:34.359000+0000", tz="UTC"),
+        Timestamp("2019-03-15 12:34:34.660000+0000", tz="UTC"),
+    ]
+    df = DataFrame(data=[0, 1, 2], index=timestamps)
+    result = df.apply(lambda x: x.name, axis=1, engine=engine)
+    expected = Series(index=timestamps, data=timestamps)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("df", [DataFrame({"A": ["a", None], "B": ["c", "d"]})])
+@pytest.mark.parametrize("method", ["min", "max", "sum"])
+def test_mixed_column_raises(df, method, using_infer_string):
+    # GH 16832
+    if method == "sum":
+        msg = r'can only concatenate str \(not "int"\) to str|does not support'
+    else:
+        msg = "not supported between instances of 'str' and 'float'"
+    if not using_infer_string:
+        with pytest.raises(TypeError, match=msg):
+            getattr(df, method)()
+    else:
+        getattr(df, method)()
+
+
+@pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan])
+def test_apply_dtype(col):
+    # GH 31466
+    df = DataFrame([[1.0, col]], columns=["a", "b"])
+    result = df.apply(lambda x: x.dtype)
+    expected = df.dtypes
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_mutating():
+    # GH#35462 case where applied func pins a new BlockManager to a row
+    df = DataFrame({"a": range(10), "b": range(10, 20)})
+    df_orig = df.copy()
+
+    def func(row):
+        mgr = row._mgr
+        row.loc["a"] += 1
+        assert row._mgr is not mgr
+        return row
+
+    expected = df.copy()
+    expected["a"] += 1
+
+    result = df.apply(func, axis=1)
+
+    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_apply_empty_list_reduce():
+    # GH#35683 get columns correct
+    df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"])
+
+    result = df.apply(lambda x: [], result_type="reduce")
+    expected = Series({"a": [], "b": []}, dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_no_suffix_index(engine, request):
+    # GH36189
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="numba engine doesn't support list-likes/dict-like callables"
+        )
+        request.node.add_marker(mark)
+    pdf = DataFrame([[4, 9]] * 3, columns=["A", "B"])
+    result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], engine=engine)
+    expected = DataFrame(
+        {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "<lambda>", "<lambda>"]
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_raw_returns_string(engine):
+    # https://github.com/pandas-dev/pandas/issues/35940
+    if engine == "numba":
+        pytest.skip("No object dtype support in numba")
+    df = DataFrame({"A": ["aa", "bbb"]})
+    result = df.apply(lambda x: x[0], engine=engine, axis=1, raw=True)
+    expected = Series(["aa", "bbb"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_aggregation_func_column_order():
+    # GH40420: the result of .agg should have an index that is sorted
+    # according to the arguments provided to agg.
+    df = DataFrame(
+        [
+            (1, 0, 0),
+            (2, 0, 0),
+            (3, 0, 0),
+            (4, 5, 4),
+            (5, 6, 6),
+            (6, 7, 7),
+        ],
+        columns=("att1", "att2", "att3"),
+    )
+
+    def sum_div2(s):
+        return s.sum() / 2
+
+    aggs = ["sum", sum_div2, "count", "min"]
+    result = df.agg(aggs)
+    expected = DataFrame(
+        {
+            "att1": [21.0, 10.5, 6.0, 1.0],
+            "att2": [18.0, 9.0, 6.0, 0.0],
+            "att3": [17.0, 8.5, 6.0, 0.0],
+        },
+        index=["sum", "sum_div2", "count", "min"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_getitem_axis_1(engine, request):
+    # GH 13427
+    if engine == "numba":
+        mark = pytest.mark.xfail(
+            reason="numba engine not supporting duplicate index values"
+        )
+        request.node.add_marker(mark)
+    df = DataFrame({"a": [0, 1, 2], "b": [1, 2, 3]})
+    result = df[["a", "a"]].apply(
+        lambda x: x.iloc[0] + x.iloc[1], axis=1, engine=engine
+    )
+    expected = Series([0, 2, 4])
+    tm.assert_series_equal(result, expected)
+
+
+def test_nuisance_depr_passes_through_warnings():
+    # GH 43740
+    # DataFrame.agg with list-likes may emit warnings for both individual
+    # args and for entire columns, but we only want to emit once. We
+    # catch and suppress the warnings for individual args, but need to make
+    # sure if some other warnings were raised, they get passed through to
+    # the user.
+
+    def expected_warning(x):
+        warnings.warn("Hello, World!")
+        return x.sum()
+
+    df = DataFrame({"a": [1, 2, 3]})
+    with tm.assert_produces_warning(UserWarning, match="Hello, World!"):
+        df.agg([expected_warning])
+
+
+def test_apply_type():
+    # GH 46719
+    df = DataFrame(
+        {"col1": [3, "string", float], "col2": [0.25, datetime(2020, 1, 1), np.nan]},
+        index=["a", "b", "c"],
+    )
+
+    # axis=0
+    result = df.apply(type, axis=0)
+    expected = Series({"col1": Series, "col2": Series})
+    tm.assert_series_equal(result, expected)
+
+    # axis=1
+    result = df.apply(type, axis=1)
+    expected = Series({"a": Series, "b": Series, "c": Series})
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_on_empty_dataframe(engine):
+    # GH 39111
+    df = DataFrame({"a": [1, 2], "b": [3, 0]})
+    result = df.head(0).apply(lambda x: max(x["a"], x["b"]), axis=1, engine=engine)
+    expected = Series([], dtype=np.float64)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_return_list():
+    df = DataFrame({"a": [1, 2], "b": [2, 3]})
+    result = df.apply(lambda x: [x.values])
+    expected = DataFrame({"a": [[1, 2]], "b": [[2, 3]]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "test, constant",
+    [
+        ({"a": [1, 2, 3], "b": [1, 1, 1]}, {"a": [1, 2, 3], "b": [1]}),
+        ({"a": [2, 2, 2], "b": [1, 1, 1]}, {"a": [2], "b": [1]}),
+    ],
+)
+def test_unique_agg_type_is_series(test, constant):
+    # GH#22558
+    df1 = DataFrame(test)
+    expected = Series(data=constant, index=["a", "b"], dtype="object")
+    aggregation = {"a": "unique", "b": "unique"}
+
+    result = df1.agg(aggregation)
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_any_apply_keyword_non_zero_axis_regression():
+    # https://github.com/pandas-dev/pandas/issues/48656
+    df = DataFrame({"A": [1, 2, 0], "B": [0, 2, 0], "C": [0, 0, 0]})
+    expected = Series([True, True, False])
+    tm.assert_series_equal(df.any(axis=1), expected)
+
+    result = df.apply("any", axis=1)
+    tm.assert_series_equal(result, expected)
+
+    result = df.apply("any", 1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_agg_mapping_func_deprecated():
+    # GH 53325
+    df = DataFrame({"x": [1, 2, 3]})
+
+    def foo1(x, a=1, c=0):
+        return x + a + c
+
+    def foo2(x, b=2, c=0):
+        return x + b + c
+
+    # single func already takes the vectorized path
+    result = df.agg(foo1, 0, 3, c=4)
+    expected = df + 7
+    tm.assert_frame_equal(result, expected)
+
+    result = df.agg([foo1, foo2], 0, 3, c=4)
+    expected = DataFrame(
+        [[8, 8], [9, 9], [10, 10]], columns=[["x", "x"], ["foo1", "foo2"]]
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # TODO: the result below is wrong, should be fixed (GH53325)
+    result = df.agg({"x": foo1}, 0, 3, c=4)
+    expected = DataFrame([2, 3, 4], columns=["x"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_std():
+    df = DataFrame(np.arange(6).reshape(3, 2), columns=["A", "B"])
+
+    result = df.agg(np.std, ddof=1)
+    expected = Series({"A": 2.0, "B": 2.0}, dtype=float)
+    tm.assert_series_equal(result, expected)
+
+    result = df.agg([np.std], ddof=1)
+    expected = DataFrame({"A": 2.0, "B": 2.0}, index=["std"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_np_size():
+    # GH#42203, GH#48328
+    df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
+
+    result = df.agg({"A": [np.size]})
+    expected = DataFrame({"A": [3]}, index=["size"])
+    tm.assert_frame_equal(result, expected)
+
+    result = df.agg({"A": np.size})
+    expected = Series({"A": 3})
+    tm.assert_series_equal(result, expected)
+
+    result = df.agg({"A": [np.mean, np.size]})
+    expected = DataFrame({"A": [4.0, 3.0]}, index=["mean", "size"])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_dist_like_and_nonunique_columns():
+    # GH#51099
+    df = DataFrame(
+        {"A": [None, 2, 3], "B": [1.0, np.nan, 3.0], "C": ["foo", None, "bar"]}
+    )
+    df.columns = ["A", "A", "C"]
+
+    result = df.agg({"A": "count"})
+    expected = df["A"].count()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("engine_name", ["unknown", 25])
+def test_wrong_engine(engine_name):
+    with pytest.raises(ValueError, match="Unknown engine "):
+        DataFrame().apply(lambda x: x, engine=engine_name)
diff --git a/pandas/tests/apply/test_frame_apply_relabeling.py b/pandas/tests/apply/test_frame_apply_relabeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..86918ec09aa97d7db9af0a8655e3273a53b7aad0
--- /dev/null
+++ b/pandas/tests/apply/test_frame_apply_relabeling.py
@@ -0,0 +1,105 @@
+import numpy as np
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_agg_relabel():
+    # GH 26513
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
+
+    # simplest case with one column, one func
+    result = df.agg(foo=("B", "sum"))
+    expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"]))
+    tm.assert_frame_equal(result, expected)
+
+    # test on same column with different methods
+    result = df.agg(foo=("B", "sum"), bar=("B", "min"))
+    expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"]))
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_relabel_multi_columns_multi_methods():
+    # GH 26513, test on multiple columns with multiple methods
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
+    result = df.agg(
+        foo=("A", "sum"),
+        bar=("B", "mean"),
+        cat=("A", "min"),
+        dat=("B", "max"),
+        f=("A", "max"),
+        g=("C", "min"),
+    )
+    expected = pd.DataFrame(
+        {
+            "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan],
+            "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan],
+            "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0],
+        },
+        index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_relabel_partial_functions():
+    # GH 26513, test on partial, functools or more complex cases
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
+    result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min))
+    expected = pd.DataFrame(
+        {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.agg(
+        foo=("A", min),
+        bar=("B", np.min),
+        cat=("B", max),
+        dat=("C", "min"),
+        f=("B", np.sum),
+        kk=("B", lambda x: min(x)),
+    )
+    expected = pd.DataFrame(
+        {
+            "A": [1.0, np.nan, np.nan, np.nan, np.nan, np.nan],
+            "B": [np.nan, 1.0, 4.0, np.nan, 10.0, 1.0],
+            "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan],
+        },
+        index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_namedtuple():
+    # GH 26513
+    df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
+    result = df.agg(
+        foo=pd.NamedAgg("B", "sum"),
+        bar=pd.NamedAgg("B", "min"),
+        cat=pd.NamedAgg(column="B", aggfunc="count"),
+        fft=pd.NamedAgg("B", aggfunc="max"),
+    )
+
+    expected = pd.DataFrame(
+        {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"])
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.agg(
+        foo=pd.NamedAgg("A", "min"),
+        bar=pd.NamedAgg(column="B", aggfunc="max"),
+        cat=pd.NamedAgg(column="A", aggfunc="max"),
+    )
+    expected = pd.DataFrame(
+        {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]},
+        index=pd.Index(["foo", "bar", "cat"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_reconstruct_func():
+    # GH 28472, test to ensure reconstruct_func isn't moved;
+    # This method is used by other libraries (e.g. dask)
+    result = pd.core.apply.reconstruct_func("min")
+    expected = (False, "min", None, None)
+    tm.assert_equal(result, expected)
diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..558d76ae8fdc4b95d46bbe94e15822779bd7c53f
--- /dev/null
+++ b/pandas/tests/apply/test_frame_transform.py
@@ -0,0 +1,264 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.apply.common import frame_transform_kernels
+from pandas.tests.frame.common import zip_frames
+
+
+def unpack_obj(obj, klass, axis):
+    """
+    Helper to ensure we have the right type of object for a test parametrized
+    over frame_or_series.
+    """
+    if klass is not DataFrame:
+        obj = obj["A"]
+        if axis != 0:
+            pytest.skip(f"Test is only for DataFrame with axis={axis}")
+    return obj
+
+
+def test_transform_ufunc(axis, float_frame, frame_or_series):
+    # GH 35964
+    obj = unpack_obj(float_frame, frame_or_series, axis)
+
+    with np.errstate(all="ignore"):
+        f_sqrt = np.sqrt(obj)
+
+    # ufunc
+    result = obj.transform(np.sqrt, axis=axis)
+    expected = f_sqrt
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, names",
+    [
+        ([np.sqrt], ["sqrt"]),
+        ([np.abs, np.sqrt], ["absolute", "sqrt"]),
+        (np.array([np.sqrt]), ["sqrt"]),
+        (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
+    ],
+)
+def test_transform_listlike(axis, float_frame, ops, names):
+    # GH 35964
+    other_axis = 1 if axis in {0, "index"} else 0
+    with np.errstate(all="ignore"):
+        expected = zip_frames([op(float_frame) for op in ops], axis=other_axis)
+    if axis in {0, "index"}:
+        expected.columns = MultiIndex.from_product([float_frame.columns, names])
+    else:
+        expected.index = MultiIndex.from_product([float_frame.index, names])
+    result = float_frame.transform(ops, axis=axis)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("ops", [[], np.array([])])
+def test_transform_empty_listlike(float_frame, ops, frame_or_series):
+    obj = unpack_obj(float_frame, frame_or_series, 0)
+
+    with pytest.raises(ValueError, match="No transform functions were provided"):
+        obj.transform(ops)
+
+
+def test_transform_listlike_func_with_args():
+    # GH 50624
+    df = DataFrame({"x": [1, 2, 3]})
+
+    def foo1(x, a=1, c=0):
+        return x + a + c
+
+    def foo2(x, b=2, c=0):
+        return x + b + c
+
+    msg = r"foo1\(\) got an unexpected keyword argument 'b'"
+    with pytest.raises(TypeError, match=msg):
+        df.transform([foo1, foo2], 0, 3, b=3, c=4)
+
+    result = df.transform([foo1, foo2], 0, 3, c=4)
+    expected = DataFrame(
+        [[8, 8], [9, 9], [10, 10]],
+        columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("box", [dict, Series])
+def test_transform_dictlike(axis, float_frame, box):
+    # GH 35964
+    if axis in (0, "index"):
+        e = float_frame.columns[0]
+        expected = float_frame[[e]].transform(np.abs)
+    else:
+        e = float_frame.index[0]
+        expected = float_frame.iloc[[0]].transform(np.abs)
+    result = float_frame.transform(box({e: np.abs}), axis=axis)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_transform_dictlike_mixed():
+    # GH 40018 - mix of lists and non-lists in values of a dictionary
+    df = DataFrame({"a": [1, 2], "b": [1, 4], "c": [1, 4]})
+    result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"})
+    expected = DataFrame(
+        [[1.0, 1, 1.0], [2.0, 4, 2.0]],
+        columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops",
+    [
+        {},
+        {"A": []},
+        {"A": [], "B": "cumsum"},
+        {"A": "cumsum", "B": []},
+        {"A": [], "B": ["cumsum"]},
+        {"A": ["cumsum"], "B": []},
+    ],
+)
+def test_transform_empty_dictlike(float_frame, ops, frame_or_series):
+    obj = unpack_obj(float_frame, frame_or_series, 0)
+
+    with pytest.raises(ValueError, match="No transform functions were provided"):
+        obj.transform(ops)
+
+
+@pytest.mark.parametrize("use_apply", [True, False])
+def test_transform_udf(axis, float_frame, use_apply, frame_or_series):
+    # GH 35964
+    obj = unpack_obj(float_frame, frame_or_series, axis)
+
+    # transform uses UDF either via apply or passing the entire DataFrame
+    def func(x):
+        # transform is using apply iff x is not a DataFrame
+        if use_apply == isinstance(x, frame_or_series):
+            # Force transform to fallback
+            raise ValueError
+        return x + 1
+
+    result = obj.transform(func, axis=axis)
+    expected = obj + 1
+    tm.assert_equal(result, expected)
+
+
+wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"]
+frame_kernels_raise = [x for x in frame_transform_kernels if x not in wont_fail]
+
+
+@pytest.mark.parametrize("op", [*frame_kernels_raise, lambda x: x + 1])
+def test_transform_bad_dtype(op, frame_or_series, request):
+    # GH 35964
+    if op == "ngroup":
+        request.applymarker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
+
+    obj = DataFrame({"A": 3 * [object]})  # DataFrame that will fail on most transforms
+    obj = tm.get_obj(obj, frame_or_series)
+    error = TypeError
+    msg = "|".join(
+        [
+            "not supported between instances of 'type' and 'type'",
+            "unsupported operand type",
+        ]
+    )
+
+    with pytest.raises(error, match=msg):
+        obj.transform(op)
+    with pytest.raises(error, match=msg):
+        obj.transform([op])
+    with pytest.raises(error, match=msg):
+        obj.transform({"A": op})
+    with pytest.raises(error, match=msg):
+        obj.transform({"A": [op]})
+
+
+@pytest.mark.parametrize("op", frame_kernels_raise)
+def test_transform_failure_typeerror(request, op):
+    # GH 35964
+
+    if op == "ngroup":
+        request.applymarker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
+
+    # Using object makes most transform kernels fail
+    df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]})
+    error = TypeError
+    msg = "|".join(
+        [
+            "not supported between instances of 'type' and 'type'",
+            "unsupported operand type",
+        ]
+    )
+
+    with pytest.raises(error, match=msg):
+        df.transform([op])
+
+    with pytest.raises(error, match=msg):
+        df.transform({"A": op, "B": op})
+
+    with pytest.raises(error, match=msg):
+        df.transform({"A": [op], "B": [op]})
+
+    with pytest.raises(error, match=msg):
+        df.transform({"A": [op, "shift"], "B": [op]})
+
+
+def test_transform_failure_valueerror():
+    # GH 40211
+    def op(x):
+        if np.sum(np.sum(x)) < 10:
+            raise ValueError
+        return x
+
+    df = DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]})
+    msg = "Transform function failed"
+
+    with pytest.raises(ValueError, match=msg):
+        df.transform([op])
+
+    with pytest.raises(ValueError, match=msg):
+        df.transform({"A": op, "B": op})
+
+    with pytest.raises(ValueError, match=msg):
+        df.transform({"A": [op], "B": [op]})
+
+    with pytest.raises(ValueError, match=msg):
+        df.transform({"A": [op, "shift"], "B": [op]})
+
+
+@pytest.mark.parametrize("use_apply", [True, False])
+def test_transform_passes_args(use_apply, frame_or_series):
+    # GH 35964
+    # transform uses UDF either via apply or passing the entire DataFrame
+    expected_args = [1, 2]
+    expected_kwargs = {"c": 3}
+
+    def f(x, a, b, c):
+        # transform is using apply iff x is not a DataFrame
+        if use_apply == isinstance(x, frame_or_series):
+            # Force transform to fallback
+            raise ValueError
+        assert [a, b] == expected_args
+        assert c == expected_kwargs["c"]
+        return x
+
+    frame_or_series([1]).transform(f, 0, *expected_args, **expected_kwargs)
+
+
+def test_transform_empty_dataframe():
+    # https://github.com/pandas-dev/pandas/issues/39636
+    df = DataFrame([], columns=["col1", "col2"])
+    result = df.transform(lambda x: x + 10)
+    tm.assert_frame_equal(result, df)
+
+    result = df["col1"].transform(lambda x: x + 10)
+    tm.assert_series_equal(result, df["col1"])
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
new file mode 100644
index 0000000000000000000000000000000000000000..0503bf9166ec7b6c06edf95293cc286140787d60
--- /dev/null
+++ b/pandas/tests/apply/test_invalid_arg.py
@@ -0,0 +1,375 @@
+# Tests specifically aimed at detecting bad arguments.
+# This file is organized by reason for exception.
+#     1. always invalid argument values
+#     2. missing column(s)
+#     3. incompatible ops/dtype/args/kwargs
+#     4. invalid result shape/type
+# If your test does not fit into one of these categories, add to this list.
+
+from itertools import chain
+import re
+
+import numpy as np
+import pytest
+
+from pandas.errors import SpecificationError
+
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("result_type", ["foo", 1])
+def test_result_type_error(result_type):
+    # allowed result_type
+    df = DataFrame(
+        np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1,
+        columns=["A", "B", "C"],
+    )
+
+    msg = (
+        "invalid value for result_type, must be one of "
+        "{None, 'reduce', 'broadcast', 'expand'}"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.apply(lambda x: [1, 2, 3], axis=1, result_type=result_type)
+
+
+def test_apply_invalid_axis_value():
+    df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=["a", "a", "c"])
+    msg = "No axis named 2 for object type DataFrame"
+    with pytest.raises(ValueError, match=msg):
+        df.apply(lambda x: x, 2)
+
+
+def test_agg_raises():
+    # GH 26513
+    df = DataFrame({"A": [0, 1], "B": [1, 2]})
+    msg = "Must provide"
+
+    with pytest.raises(TypeError, match=msg):
+        df.agg()
+
+
+def test_map_with_invalid_na_action_raises():
+    # https://github.com/pandas-dev/pandas/issues/32815
+    s = Series([1, 2, 3])
+    msg = "na_action must either be 'ignore' or None"
+    with pytest.raises(ValueError, match=msg):
+        s.map(lambda x: x, na_action="____")
+
+
+@pytest.mark.parametrize("input_na_action", ["____", True])
+def test_map_arg_is_dict_with_invalid_na_action_raises(input_na_action):
+    # https://github.com/pandas-dev/pandas/issues/46588
+    s = Series([1, 2, 3])
+    msg = f"na_action must either be 'ignore' or None, {input_na_action} was passed"
+    with pytest.raises(ValueError, match=msg):
+        s.map({1: 2}, na_action=input_na_action)
+
+
+@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
+@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
+def test_nested_renamer(frame_or_series, method, func):
+    # GH 35964
+    obj = frame_or_series({"A": [1]})
+    match = "nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=match):
+        getattr(obj, method)(func)
+
+
+@pytest.mark.parametrize(
+    "renamer",
+    [{"foo": ["min", "max"]}, {"foo": ["min", "max"], "bar": ["sum", "mean"]}],
+)
+def test_series_nested_renamer(renamer):
+    s = Series(range(6), dtype="int64", name="series")
+    msg = "nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        s.agg(renamer)
+
+
+def test_apply_dict_depr():
+    tsdf = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 3)),
+        columns=["A", "B", "C"],
+        index=date_range("1/1/2000", periods=10),
+    )
+    msg = "nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        tsdf.A.agg({"foo": ["sum", "mean"]})
+
+
+@pytest.mark.parametrize("method", ["agg", "transform"])
+def test_dict_nested_renaming_depr(method):
+    df = DataFrame({"A": range(5), "B": 5})
+
+    # nested renaming
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        getattr(df, method)({"A": {"foo": "min"}, "B": {"bar": "max"}})
+
+
+@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
+@pytest.mark.parametrize("func", [{"B": "sum"}, {"B": ["sum"]}])
+def test_missing_column(method, func):
+    # GH 40004
+    obj = DataFrame({"A": [1]})
+    msg = r"Label\(s\) \['B'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        getattr(obj, method)(func)
+
+
+def test_transform_mixed_column_name_dtypes():
+    # GH39025
+    df = DataFrame({"a": ["1"]})
+    msg = r"Label\(s\) \[1, 'b'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        df.transform({"a": int, 1: str, "b": int})
+
+
+@pytest.mark.parametrize(
+    "how, args", [("pct_change", ()), ("nsmallest", (1, ["a", "b"])), ("tail", 1)]
+)
+def test_apply_str_axis_1_raises(how, args):
+    # GH 39211 - some ops don't support axis=1
+    df = DataFrame({"a": [1, 2], "b": [3, 4]})
+    msg = f"Operation {how} does not support axis=1"
+    with pytest.raises(ValueError, match=msg):
+        df.apply(how, axis=1, args=args)
+
+
+def test_transform_axis_1_raises():
+    # GH 35964
+    msg = "No axis named 1 for object type Series"
+    with pytest.raises(ValueError, match=msg):
+        Series([1]).transform("sum", axis=1)
+
+
+def test_apply_modify_traceback():
+    data = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+
+    data.loc[4, "C"] = np.nan
+
+    def transform(row):
+        if row["C"].startswith("shin") and row["A"] == "foo":
+            row["D"] = 7
+        return row
+
+    msg = "'float' object has no attribute 'startswith'"
+    with pytest.raises(AttributeError, match=msg):
+        data.apply(transform, axis=1)
+
+
+@pytest.mark.parametrize(
+    "df, func, expected",
+    tm.get_cython_table_params(
+        DataFrame([["a", "b"], ["b", "a"]]), [["cumprod", TypeError]]
+    ),
+)
+def test_agg_cython_table_raises_frame(df, func, expected, axis, using_infer_string):
+    # GH 21224
+    if using_infer_string:
+        expected = (expected, NotImplementedError)
+
+    msg = (
+        "can't multiply sequence by non-int of type 'str'"
+        "|cannot perform cumprod with type str"  # NotImplementedError python backend
+        "|operation 'cumprod' not supported for dtype 'str'"  # TypeError pyarrow
+    )
+    warn = None if isinstance(func, str) else FutureWarning
+    with pytest.raises(expected, match=msg):
+        with tm.assert_produces_warning(warn, match="using DataFrame.cumprod"):
+            df.agg(func, axis=axis)
+
+
+@pytest.mark.parametrize(
+    "series, func, expected",
+    chain(
+        tm.get_cython_table_params(
+            Series("a b c".split()),
+            [
+                ("mean", TypeError),  # mean raises TypeError
+                ("prod", TypeError),
+                ("std", TypeError),
+                ("var", TypeError),
+                ("median", TypeError),
+                ("cumprod", TypeError),
+            ],
+        )
+    ),
+)
+def test_agg_cython_table_raises_series(series, func, expected, using_infer_string):
+    # GH21224
+    msg = r"[Cc]ould not convert|can't multiply sequence by non-int of type"
+    if func == "median" or func is np.nanmedian or func is np.median:
+        msg = r"Cannot convert \['a' 'b' 'c'\] to numeric"
+
+    if using_infer_string and func == "cumprod":
+        expected = (expected, NotImplementedError)
+
+    msg = (
+        msg + "|does not support|has no kernel|Cannot perform|cannot perform|operation"
+    )
+    warn = None if isinstance(func, str) else FutureWarning
+
+    with pytest.raises(expected, match=msg):
+        # e.g. Series('a b'.split()).cumprod() will raise
+        with tm.assert_produces_warning(warn, match="is currently using Series.*"):
+            series.agg(func)
+
+
+def test_agg_none_to_type():
+    # GH 40543
+    df = DataFrame({"a": [None]})
+    msg = re.escape("int() argument must be a string")
+    with pytest.raises(TypeError, match=msg):
+        df.agg({"a": lambda x: int(x.iloc[0])})
+
+
+def test_transform_none_to_type():
+    # GH#34377
+    df = DataFrame({"a": [None]})
+    msg = "argument must be a"
+    with pytest.raises(TypeError, match=msg):
+        df.transform({"a": lambda x: int(x.iloc[0])})
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda x: np.array([1, 2]).reshape(-1, 2),
+        lambda x: [1, 2],
+        lambda x: Series([1, 2]),
+    ],
+)
+def test_apply_broadcast_error(func):
+    df = DataFrame(
+        np.tile(np.arange(3, dtype="int64"), 6).reshape(6, -1) + 1,
+        columns=["A", "B", "C"],
+    )
+
+    # > 1 ndim
+    msg = "too many dims to broadcast|cannot broadcast result"
+    with pytest.raises(ValueError, match=msg):
+        df.apply(func, axis=1, result_type="broadcast")
+
+
+def test_transform_and_agg_err_agg(axis, float_frame):
+    # cannot both transform and agg
+    msg = "cannot combine transform and aggregation operations"
+    with pytest.raises(ValueError, match=msg):
+        with np.errstate(all="ignore"):
+            float_frame.agg(["max", "sqrt"], axis=axis)
+
+
+@pytest.mark.filterwarnings("ignore::FutureWarning")  # GH53325
+@pytest.mark.parametrize(
+    "func, msg",
+    [
+        (["sqrt", "max"], "cannot combine transform and aggregation"),
+        (
+            {"foo": np.sqrt, "bar": "sum"},
+            "cannot perform both aggregation and transformation",
+        ),
+    ],
+)
+def test_transform_and_agg_err_series(string_series, func, msg):
+    # we are trying to transform with an aggregator
+    with pytest.raises(ValueError, match=msg):
+        with np.errstate(all="ignore"):
+            string_series.agg(func)
+
+
+@pytest.mark.parametrize("func", [["max", "min"], ["max", "sqrt"]])
+def test_transform_wont_agg_frame(axis, float_frame, func):
+    # GH 35964
+    # cannot both transform and agg
+    msg = "Function did not transform"
+    with pytest.raises(ValueError, match=msg):
+        float_frame.transform(func, axis=axis)
+
+
+@pytest.mark.parametrize("func", [["min", "max"], ["sqrt", "max"]])
+def test_transform_wont_agg_series(string_series, func):
+    # GH 35964
+    # we are trying to transform with an aggregator
+    msg = "Function did not transform"
+
+    with pytest.raises(ValueError, match=msg):
+        string_series.transform(func)
+
+
+@pytest.mark.parametrize(
+    "op_wrapper", [lambda x: x, lambda x: [x], lambda x: {"A": x}, lambda x: {"A": [x]}]
+)
+def test_transform_reducer_raises(all_reductions, frame_or_series, op_wrapper):
+    # GH 35964
+    op = op_wrapper(all_reductions)
+
+    obj = DataFrame({"A": [1, 2, 3]})
+    obj = tm.get_obj(obj, frame_or_series)
+
+    msg = "Function did not transform"
+    with pytest.raises(ValueError, match=msg):
+        obj.transform(op)
+
+
+def test_transform_missing_labels_raises():
+    # GH 58474
+    df = DataFrame({"foo": [2, 4, 6], "bar": [1, 2, 3]}, index=["A", "B", "C"])
+    msg = r"Label\(s\) \['A', 'B'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        df.transform({"A": lambda x: x + 2, "B": lambda x: x * 2}, axis=0)
+
+    msg = r"Label\(s\) \['bar', 'foo'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        df.transform({"foo": lambda x: x + 2, "bar": lambda x: x * 2}, axis=1)
diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..75bc3f5b74b9deff5587a6c0b0a3c25a266f9a1e
--- /dev/null
+++ b/pandas/tests/apply/test_numba.py
@@ -0,0 +1,129 @@
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_arm
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+
+pytestmark = [td.skip_if_no("numba"), pytest.mark.single_cpu, pytest.mark.skipif()]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
+
+
+@pytest.fixture(params=[0, 1])
+def apply_axis(request):
+    return request.param
+
+
+def test_numba_vs_python_noop(float_frame, apply_axis):
+    func = lambda x: x
+    result = float_frame.apply(func, engine="numba", axis=apply_axis)
+    expected = float_frame.apply(func, engine="python", axis=apply_axis)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_numba_vs_python_string_index():
+    # GH#56189
+    df = DataFrame(
+        1,
+        index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+        columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)),
+    )
+    func = lambda x: x
+    result = df.apply(func, engine="numba", axis=0)
+    expected = df.apply(func, engine="python", axis=0)
+    tm.assert_frame_equal(
+        result, expected, check_column_type=False, check_index_type=False
+    )
+
+
+def test_numba_vs_python_indexing():
+    frame = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]},
+        index=Index(["A", "B", "C"]),
+    )
+    row_func = lambda x: x["c"]
+    result = frame.apply(row_func, engine="numba", axis=1)
+    expected = frame.apply(row_func, engine="python", axis=1)
+    tm.assert_series_equal(result, expected)
+
+    col_func = lambda x: x["A"]
+    result = frame.apply(col_func, engine="numba", axis=0)
+    expected = frame.apply(col_func, engine="python", axis=0)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "reduction",
+    [lambda x: x.mean(), lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()],
+)
+def test_numba_vs_python_reductions(reduction, apply_axis):
+    df = DataFrame(np.ones((4, 4), dtype=np.float64))
+    result = df.apply(reduction, engine="numba", axis=apply_axis)
+    expected = df.apply(reduction, engine="python", axis=apply_axis)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("colnames", [[1, 2, 3], [1.0, 2.0, 3.0]])
+def test_numba_numeric_colnames(colnames):
+    # Check that numeric column names lower properly and can be indexed on
+    df = DataFrame(
+        np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int64), columns=colnames
+    )
+    first_col = colnames[0]
+    f = lambda x: x[first_col]  # Get the first column
+    result = df.apply(f, engine="numba", axis=1)
+    expected = df.apply(f, engine="python", axis=1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_numba_parallel_unsupported(float_frame):
+    f = lambda x: x
+    with pytest.raises(
+        NotImplementedError,
+        match="Parallel apply is not supported when raw=False and engine='numba'",
+    ):
+        float_frame.apply(f, engine="numba", engine_kwargs={"parallel": True})
+
+
+def test_numba_nonunique_unsupported(apply_axis):
+    f = lambda x: x
+    df = DataFrame({"a": [1, 2]}, index=Index(["a", "a"]))
+    with pytest.raises(
+        NotImplementedError,
+        match="The index/columns must be unique when raw=False and engine='numba'",
+    ):
+        df.apply(f, engine="numba", axis=apply_axis)
+
+
+def test_numba_unsupported_dtypes(apply_axis):
+    pytest.importorskip("pyarrow")
+    f = lambda x: x
+    df = DataFrame({"a": [1, 2], "b": ["a", "b"], "c": [4, 5]})
+    df["c"] = df["c"].astype("double[pyarrow]")
+
+    with pytest.raises(
+        ValueError,
+        match="Column b must have a numeric dtype. Found 'object|str' instead",
+    ):
+        df.apply(f, engine="numba", axis=apply_axis)
+
+    with pytest.raises(
+        ValueError,
+        match="Column c is backed by an extension array, "
+        "which is not supported by the numba engine.",
+    ):
+        df["c"].to_frame().apply(f, engine="numba", axis=apply_axis)
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..cea6fb793c0c7b4687bbacc6b57b5e13dd7a2aee
--- /dev/null
+++ b/pandas/tests/apply/test_series_apply.py
@@ -0,0 +1,669 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    concat,
+    date_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.tests.apply.common import series_transform_kernels
+
+
+@pytest.fixture(params=[False, "compat"])
+def by_row(request):
+    return request.param
+
+
+def test_series_map_box_timedelta(by_row):
+    # GH#11349
+    ser = Series(timedelta_range("1 day 1 s", periods=3, freq="h"))
+
+    def f(x):
+        return x.total_seconds() if by_row else x.dt.total_seconds()
+
+    result = ser.apply(f, by_row=by_row)
+
+    expected = ser.map(lambda x: x.total_seconds())
+    tm.assert_series_equal(result, expected)
+
+    expected = Series([86401.0, 90001.0, 93601.0])
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply(datetime_series, by_row):
+    result = datetime_series.apply(np.sqrt, by_row=by_row)
+    with np.errstate(all="ignore"):
+        expected = np.sqrt(datetime_series)
+    tm.assert_series_equal(result, expected)
+
+    # element-wise apply (ufunc)
+    result = datetime_series.apply(np.exp, by_row=by_row)
+    expected = np.exp(datetime_series)
+    tm.assert_series_equal(result, expected)
+
+    # empty series
+    s = Series(dtype=object, name="foo", index=Index([], name="bar"))
+    rs = s.apply(lambda x: x, by_row=by_row)
+    tm.assert_series_equal(s, rs)
+
+    # check all metadata (GH 9322)
+    assert s is not rs
+    assert s.index is rs.index
+    assert s.dtype == rs.dtype
+    assert s.name == rs.name
+
+    # index but no data
+    s = Series(index=[1, 2, 3], dtype=np.float64)
+    rs = s.apply(lambda x: x, by_row=by_row)
+    tm.assert_series_equal(s, rs)
+
+
+def test_apply_map_same_length_inference_bug():
+    s = Series([1, 2])
+
+    def f(x):
+        return (x, x + 1)
+
+    result = s.apply(f, by_row="compat")
+    expected = s.map(f)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_args():
+    s = Series(["foo,bar"])
+
+    result = s.apply(str.split, args=(",",))
+    assert result[0] == ["foo", "bar"]
+    assert isinstance(result[0], list)
+
+
+@pytest.mark.parametrize(
+    "args, kwargs, increment",
+    [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)],
+)
+def test_agg_args(args, kwargs, increment):
+    # GH 43357
+    def f(x, a=0, b=0, c=0):
+        return x + a + 10 * b + 100 * c
+
+    s = Series([1, 2])
+    result = s.agg(f, 0, *args, **kwargs)
+    expected = s + increment
+    tm.assert_series_equal(result, expected)
+
+
+def test_agg_mapping_func_deprecated():
+    # GH 53325
+    s = Series([1, 2, 3])
+
+    def foo1(x, a=1, c=0):
+        return x + a + c
+
+    def foo2(x, b=2, c=0):
+        return x + b + c
+
+    s.agg(foo1, 0, 3, c=4)
+    s.agg([foo1, foo2], 0, 3, c=4)
+    s.agg({"a": foo1, "b": foo2}, 0, 3, c=4)
+
+
+def test_series_apply_map_box_timestamps(by_row):
+    # GH#2689, GH#2627
+    ser = Series(date_range("1/1/2000", periods=10))
+
+    def func(x):
+        return (x.hour, x.day, x.month)
+
+    if not by_row:
+        msg = "Series' object has no attribute 'hour'"
+        with pytest.raises(AttributeError, match=msg):
+            ser.apply(func, by_row=by_row)
+        return
+
+    result = ser.apply(func, by_row=by_row)
+    expected = ser.map(func)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_box_dt64():
+    # ufunc will not be boxed. Same test cases as the test_map_box
+    vals = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02")]
+    ser = Series(vals, dtype="M8[ns]")
+    assert ser.dtype == "datetime64[ns]"
+    # boxed value must be Timestamp instance
+    res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat")
+    exp = Series(["Timestamp_1_None", "Timestamp_2_None"])
+    tm.assert_series_equal(res, exp)
+
+
+def test_apply_box_dt64tz():
+    vals = [
+        pd.Timestamp("2011-01-01", tz="US/Eastern"),
+        pd.Timestamp("2011-01-02", tz="US/Eastern"),
+    ]
+    ser = Series(vals, dtype="M8[ns, US/Eastern]")
+    assert ser.dtype == "datetime64[ns, US/Eastern]"
+    res = ser.apply(lambda x: f"{type(x).__name__}_{x.day}_{x.tz}", by_row="compat")
+    exp = Series(["Timestamp_1_US/Eastern", "Timestamp_2_US/Eastern"])
+    tm.assert_series_equal(res, exp)
+
+
+def test_apply_box_td64():
+    # timedelta
+    vals = [pd.Timedelta("1 days"), pd.Timedelta("2 days")]
+    ser = Series(vals)
+    assert ser.dtype == "timedelta64[us]"
+    res = ser.apply(lambda x: f"{type(x).__name__}_{x.days}", by_row="compat")
+    exp = Series(["Timedelta_1", "Timedelta_2"])
+    tm.assert_series_equal(res, exp)
+
+
+def test_apply_box_period():
+    # period
+    vals = [pd.Period("2011-01-01", freq="M"), pd.Period("2011-01-02", freq="M")]
+    ser = Series(vals)
+    assert ser.dtype == "Period[M]"
+    res = ser.apply(lambda x: f"{type(x).__name__}_{x.freqstr}", by_row="compat")
+    exp = Series(["Period_M", "Period_M"])
+    tm.assert_series_equal(res, exp)
+
+
+def test_apply_datetimetz(by_row):
+    values = date_range("2011-01-01", "2011-01-02", freq="h").tz_localize("Asia/Tokyo")
+    s = Series(values, name="XX")
+
+    result = s.apply(lambda x: x + pd.offsets.Day(), by_row=by_row)
+    exp_values = date_range("2011-01-02", "2011-01-03", freq="h").tz_localize(
+        "Asia/Tokyo"
+    )
+    exp = Series(exp_values, name="XX")
+    tm.assert_series_equal(result, exp)
+
+    result = s.apply(lambda x: x.hour if by_row else x.dt.hour, by_row=by_row)
+    exp = Series([*list(range(24)), 0], name="XX", dtype="int64" if by_row else "int32")
+    tm.assert_series_equal(result, exp)
+
+    # not vectorized
+    def f(x):
+        return str(x.tz) if by_row else str(x.dt.tz)
+
+    result = s.apply(f, by_row=by_row)
+    if by_row:
+        exp = Series(["Asia/Tokyo"] * 25, name="XX")
+        tm.assert_series_equal(result, exp)
+    else:
+        assert result == "Asia/Tokyo"
+
+
+def test_apply_categorical(by_row, using_infer_string):
+    values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
+    ser = Series(values, name="XX", index=list("abcdefg"))
+
+    if not by_row:
+        msg = "Series' object has no attribute 'lower"
+        with pytest.raises(AttributeError, match=msg):
+            ser.apply(lambda x: x.lower(), by_row=by_row)
+        assert ser.apply(lambda x: "A", by_row=by_row) == "A"
+        return
+
+    result = ser.apply(lambda x: x.lower(), by_row=by_row)
+
+    # should be categorical dtype when the number of categories are
+    # the same
+    values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
+    exp = Series(values, name="XX", index=list("abcdefg"))
+    tm.assert_series_equal(result, exp)
+    tm.assert_categorical_equal(result.values, exp.values)
+
+    result = ser.apply(lambda x: "A")
+    exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
+    tm.assert_series_equal(result, exp)
+    assert result.dtype == object if not using_infer_string else "str"
+
+
+@pytest.mark.parametrize("series", [["1-1", "1-1", np.nan], ["1-1", "1-2", np.nan]])
+def test_apply_categorical_with_nan_values(series, by_row):
+    # GH 20714 bug fixed in: GH 24275
+    s = Series(series, dtype="category")
+    if not by_row:
+        msg = "'Series' object has no attribute 'split'"
+        with pytest.raises(AttributeError, match=msg):
+            s.apply(lambda x: x.split("-")[0], by_row=by_row)
+        return
+    # NaN for cat dtype fixed in (GH 59966)
+    result = s.apply(lambda x: x.split("-")[0] if pd.notna(x) else False, by_row=by_row)
+    result = result.astype(object)
+    expected = Series(["1", "1", False], dtype="category")
+    expected = expected.astype(object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_empty_integer_series_with_datetime_index(by_row):
+    # GH 21245
+    s = Series([], index=date_range(start="2018-01-01", periods=0), dtype=int)
+    result = s.apply(lambda x: x, by_row=by_row)
+    tm.assert_series_equal(result, s)
+
+
+def test_apply_dataframe_iloc():
+    uintDF = DataFrame(np.uint64([1, 2, 3, 4, 5]), columns=["Numbers"])
+    indexDF = DataFrame([2, 3, 2, 1, 2], columns=["Indices"])
+
+    def retrieve(targetRow, targetDF):
+        val = targetDF["Numbers"].iloc[targetRow]
+        return val
+
+    result = indexDF["Indices"].apply(retrieve, args=(uintDF,))
+    expected = Series([3, 4, 3, 2, 3], name="Indices", dtype="uint64")
+    tm.assert_series_equal(result, expected)
+
+
+def test_transform(string_series, by_row):
+    # transforming functions
+
+    with np.errstate(all="ignore"):
+        f_sqrt = np.sqrt(string_series)
+        f_abs = np.abs(string_series)
+
+        # ufunc
+        result = string_series.apply(np.sqrt, by_row=by_row)
+        expected = f_sqrt.copy()
+        tm.assert_series_equal(result, expected)
+
+        # list-like
+        result = string_series.apply([np.sqrt], by_row=by_row)
+        expected = f_sqrt.to_frame().copy()
+        expected.columns = ["sqrt"]
+        tm.assert_frame_equal(result, expected)
+
+        result = string_series.apply(["sqrt"], by_row=by_row)
+        tm.assert_frame_equal(result, expected)
+
+        # multiple items in list
+        # these are in the order as if we are applying both functions per
+        # series and then concatting
+        expected = concat([f_sqrt, f_abs], axis=1)
+        expected.columns = ["sqrt", "absolute"]
+        result = string_series.apply([np.sqrt, np.abs], by_row=by_row)
+        tm.assert_frame_equal(result, expected)
+
+        # dict, provide renaming
+        expected = concat([f_sqrt, f_abs], axis=1)
+        expected.columns = ["foo", "bar"]
+        expected = expected.unstack().rename("series")
+
+        result = string_series.apply({"foo": np.sqrt, "bar": np.abs}, by_row=by_row)
+        tm.assert_series_equal(result.reindex_like(expected), expected)
+
+
+@pytest.mark.parametrize("op", series_transform_kernels)
+def test_transform_partial_failure(op, request):
+    # GH 35964
+    if op in ("ffill", "bfill", "shift"):
+        request.applymarker(
+            pytest.mark.xfail(reason=f"{op} is successful on any dtype")
+        )
+
+    # Using object makes most transform kernels fail
+    ser = Series(3 * [object])
+
+    if op in ("fillna", "ngroup"):
+        error = ValueError
+        msg = "Transform function failed"
+    else:
+        error = TypeError
+        msg = "|".join(
+            [
+                "not supported between instances of 'type' and 'type'",
+                "unsupported operand type",
+            ]
+        )
+
+    with pytest.raises(error, match=msg):
+        ser.transform([op, "shift"])
+
+    with pytest.raises(error, match=msg):
+        ser.transform({"A": op, "B": "shift"})
+
+    with pytest.raises(error, match=msg):
+        ser.transform({"A": [op], "B": ["shift"]})
+
+    with pytest.raises(error, match=msg):
+        ser.transform({"A": [op, "shift"], "B": [op]})
+
+
+def test_transform_partial_failure_valueerror():
+    # GH 40211
+    def noop(x):
+        return x
+
+    def raising_op(_):
+        raise ValueError
+
+    ser = Series(3 * [object])
+    msg = "Transform function failed"
+
+    with pytest.raises(ValueError, match=msg):
+        ser.transform([noop, raising_op])
+
+    with pytest.raises(ValueError, match=msg):
+        ser.transform({"A": raising_op, "B": noop})
+
+    with pytest.raises(ValueError, match=msg):
+        ser.transform({"A": [raising_op], "B": [noop]})
+
+    with pytest.raises(ValueError, match=msg):
+        ser.transform({"A": [noop, raising_op], "B": [noop]})
+
+
+def test_demo():
+    # demonstration tests
+    s = Series(range(6), dtype="int64", name="series")
+
+    result = s.agg(["min", "max"])
+    expected = Series([0, 5], index=["min", "max"], name="series")
+    tm.assert_series_equal(result, expected)
+
+    result = s.agg({"foo": "min"})
+    expected = Series([0], index=["foo"], name="series")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", [str, lambda x: str(x)])
+def test_apply_map_evaluate_lambdas_the_same(string_series, func, by_row, engine):
+    # test that we are evaluating row-by-row first if by_row="compat"
+    # else vectorized evaluation
+    result = string_series.apply(func, by_row=by_row)
+
+    if by_row:
+        expected = string_series.map(func, engine=engine)
+        tm.assert_series_equal(result, expected)
+    else:
+        assert result == str(string_series)
+
+
+def test_agg_evaluate_lambdas(string_series):
+    # GH53325
+    result = string_series.agg(lambda x: type(x))
+    assert result is Series
+
+    result = string_series.agg(type)
+    assert result is Series
+
+
+@pytest.mark.parametrize("op_name", ["agg", "apply"])
+def test_with_nested_series(datetime_series, op_name):
+    # GH 2316 & GH52123
+    # .agg with a reducer and a transform, what to do
+    result = getattr(datetime_series, op_name)(
+        lambda x: Series([x, x**2], index=["x", "x^2"])
+    )
+    if op_name == "apply":
+        expected = DataFrame({"x": datetime_series, "x^2": datetime_series**2})
+        tm.assert_frame_equal(result, expected)
+    else:
+        expected = Series([datetime_series, datetime_series**2], index=["x", "x^2"])
+        tm.assert_series_equal(result, expected)
+
+
+def test_replicate_describe(string_series):
+    # this also tests a result set that is all scalars
+    expected = string_series.describe()
+    result = string_series.apply(
+        {
+            "count": "count",
+            "mean": "mean",
+            "std": "std",
+            "min": "min",
+            "25%": lambda x: x.quantile(0.25),
+            "50%": "median",
+            "75%": lambda x: x.quantile(0.75),
+            "max": "max",
+        },
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_reduce(string_series):
+    # reductions with named functions
+    result = string_series.agg(["sum", "mean"])
+    expected = Series(
+        [string_series.sum(), string_series.mean()],
+        ["sum", "mean"],
+        name=string_series.name,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "how, kwds",
+    [("agg", {}), ("apply", {"by_row": "compat"}), ("apply", {"by_row": False})],
+)
+def test_non_callable_aggregates(how, kwds):
+    # test agg using non-callable series attributes
+    # GH 39116 - expand to apply
+    s = Series([1, 2, None])
+
+    # Calling agg w/ just a string arg same as calling s.arg
+    result = getattr(s, how)("size", **kwds)
+    expected = s.size
+    assert result == expected
+
+    # test when mixed w/ callable reducers
+    result = getattr(s, how)(["size", "count", "mean"], **kwds)
+    expected = Series({"size": 3.0, "count": 2.0, "mean": 1.5})
+    tm.assert_series_equal(result, expected)
+
+    result = getattr(s, how)({"size": "size", "count": "count", "mean": "mean"}, **kwds)
+    tm.assert_series_equal(result, expected)
+
+
+def test_series_apply_no_suffix_index(by_row):
+    # GH36189
+    s = Series([4] * 3)
+    result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()], by_row=by_row)
+    expected = Series([12, 12, 12], index=["sum", "<lambda>", "<lambda>"])
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dti,exp",
+    [
+        (
+            Series([1, 2], index=pd.DatetimeIndex([0, 31536000000])),
+            DataFrame(np.repeat([[1, 2]], 2, axis=0), dtype="int64"),
+        ),
+        (
+            Series(
+                np.arange(10, dtype=np.float64),
+                index=date_range("2020-01-01", periods=10),
+                name="ts",
+            ),
+            DataFrame(np.repeat([[1, 2]], 10, axis=0), dtype="int64"),
+        ),
+    ],
+)
+@pytest.mark.parametrize("aware", [True, False])
+def test_apply_series_on_date_time_index_aware_series(dti, exp, aware):
+    # GH 25959
+    # Calling apply on a localized time series should not cause an error
+    if aware:
+        index = dti.tz_localize("UTC").index
+    else:
+        index = dti.index
+    result = Series(index).apply(lambda x: Series([1, 2]))
+    tm.assert_frame_equal(result, exp)
+
+
+@pytest.mark.parametrize(
+    "by_row, expected", [("compat", Series(np.ones(10), dtype="int64")), (False, 1)]
+)
+def test_apply_scalar_on_date_time_index_aware_series(by_row, expected):
+    # GH 25959
+    # Calling apply on a localized time series should not cause an error
+    series = Series(
+        np.arange(10, dtype=np.float64),
+        index=date_range("2020-01-01", periods=10, tz="UTC"),
+    )
+    result = Series(series.index).apply(lambda x: 1, by_row=by_row)
+    tm.assert_equal(result, expected)
+
+
+def test_apply_to_timedelta(by_row):
+    list_of_valid_strings = ["00:00:01", "00:00:02"]
+    a = pd.to_timedelta(list_of_valid_strings)
+    b = Series(list_of_valid_strings).apply(pd.to_timedelta, by_row=by_row)
+    tm.assert_series_equal(Series(a), b)
+
+    list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]
+
+    a = pd.to_timedelta(list_of_strings)
+    ser = Series(list_of_strings)
+    b = ser.apply(pd.to_timedelta, by_row=by_row)
+    tm.assert_series_equal(Series(a), b)
+
+
+@pytest.mark.parametrize(
+    "ops, names",
+    [
+        ([np.sum], ["sum"]),
+        ([np.sum, np.mean], ["sum", "mean"]),
+        (np.array([np.sum]), ["sum"]),
+        (np.array([np.sum, np.mean]), ["sum", "mean"]),
+    ],
+)
+@pytest.mark.parametrize(
+    "how, kwargs",
+    [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]],
+)
+def test_apply_listlike_reducer(string_series, ops, names, how, kwargs):
+    # GH 39140
+    expected = Series(
+        {name: op(string_series) for name, op in zip(names, ops, strict=True)}
+    )
+    expected.name = "series"
+    result = getattr(string_series, how)(ops, **kwargs)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops",
+    [
+        {"A": np.sum},
+        {"A": np.sum, "B": np.mean},
+        Series({"A": np.sum}),
+        Series({"A": np.sum, "B": np.mean}),
+    ],
+)
+@pytest.mark.parametrize(
+    "how, kwargs",
+    [["agg", {}], ["apply", {"by_row": "compat"}], ["apply", {"by_row": False}]],
+)
+def test_apply_dictlike_reducer(string_series, ops, how, kwargs, by_row):
+    # GH 39140
+    expected = Series({name: op(string_series) for name, op in ops.items()})
+    expected.name = string_series.name
+    result = getattr(string_series, how)(ops, **kwargs)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, names",
+    [
+        ([np.sqrt], ["sqrt"]),
+        ([np.abs, np.sqrt], ["absolute", "sqrt"]),
+        (np.array([np.sqrt]), ["sqrt"]),
+        (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
+    ],
+)
+def test_apply_listlike_transformer(string_series, ops, names, by_row):
+    # GH 39140
+    with np.errstate(all="ignore"):
+        expected = concat([op(string_series) for op in ops], axis=1)
+        expected.columns = names
+        result = string_series.apply(ops, by_row=by_row)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, expected",
+    [
+        ([lambda x: x], DataFrame({"<lambda>": [1, 2, 3]})),
+        ([lambda x: x.sum()], Series([6], index=["<lambda>"])),
+    ],
+)
+def test_apply_listlike_lambda(ops, expected, by_row):
+    # GH53400
+    ser = Series([1, 2, 3])
+    result = ser.apply(ops, by_row=by_row)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops",
+    [
+        {"A": np.sqrt},
+        {"A": np.sqrt, "B": np.exp},
+        Series({"A": np.sqrt}),
+        Series({"A": np.sqrt, "B": np.exp}),
+    ],
+)
+def test_apply_dictlike_transformer(string_series, ops, by_row):
+    # GH 39140
+    with np.errstate(all="ignore"):
+        expected = concat({name: op(string_series) for name, op in ops.items()})
+        expected.name = string_series.name
+        result = string_series.apply(ops, by_row=by_row)
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, expected",
+    [
+        (
+            {"a": lambda x: x},
+            Series([1, 2, 3], index=MultiIndex.from_arrays([["a"] * 3, range(3)])),
+        ),
+        ({"a": lambda x: x.sum()}, Series([6], index=["a"])),
+    ],
+)
+def test_apply_dictlike_lambda(ops, by_row, expected):
+    # GH53400
+    ser = Series([1, 2, 3])
+    result = ser.apply(ops, by_row=by_row)
+    tm.assert_equal(result, expected)
+
+
+def test_apply_retains_column_name(by_row):
+    # GH 16380
+    df = DataFrame({"x": range(3)}, Index(range(3), name="x"))
+    result = df.x.apply(lambda x: Series(range(x + 1), Index(range(x + 1), name="y")))
+    expected = DataFrame(
+        [[0.0, np.nan, np.nan], [0.0, 1.0, np.nan], [0.0, 1.0, 2.0]],
+        columns=Index(range(3), name="y"),
+        index=Index(range(3), name="x"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_type():
+    # GH 46719
+    s = Series([3, "string", float], index=["a", "b", "c"])
+    result = s.apply(type)
+    expected = Series([int, str, type], index=["a", "b", "c"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_series_apply_unpack_nested_data():
+    # GH#55189
+    ser = Series([[1, 2, 3], [4, 5, 6, 7]])
+    result = ser.apply(lambda x: Series(x))
+    expected = DataFrame({0: [1.0, 4.0], 1: [2.0, 5.0], 2: [3.0, 6.0], 3: [np.nan, 7]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/apply/test_series_apply_relabeling.py b/pandas/tests/apply/test_series_apply_relabeling.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0a285e6eb38cc26da155755108ef2c814229384
--- /dev/null
+++ b/pandas/tests/apply/test_series_apply_relabeling.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_relabel_no_duplicated_method():
+    # this is to test there is no duplicated method used in agg
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
+
+    result = df["A"].agg(foo="sum")
+    expected = df["A"].agg({"foo": "sum"})
+    tm.assert_series_equal(result, expected)
+
+    result = df["B"].agg(foo="min", bar="max")
+    expected = df["B"].agg({"foo": "min", "bar": "max"})
+    tm.assert_series_equal(result, expected)
+
+    result = df["B"].agg(foo=sum, bar=min, cat="max")
+    expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"})
+    tm.assert_series_equal(result, expected)
+
+
+def test_relabel_duplicated_method():
+    # this is to test with nested renaming, duplicated method can be used
+    # if they are assigned with different new names
+    df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
+
+    result = df["A"].agg(foo="sum", bar="sum")
+    expected = pd.Series([6, 6], index=["foo", "bar"], name="A")
+    tm.assert_series_equal(result, expected)
+
+    result = df["B"].agg(foo=min, bar="min")
+    expected = pd.Series([1, 1], index=["foo", "bar"], name="B")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/apply/test_series_transform.py b/pandas/tests/apply/test_series_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..82592c4711ece5a7f4b6d421d743e1adbd78c345
--- /dev/null
+++ b/pandas/tests/apply/test_series_transform.py
@@ -0,0 +1,84 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    concat,
+)
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "args, kwargs, increment",
+    [((), {}, 0), ((), {"a": 1}, 1), ((2, 3), {}, 32), ((1,), {"c": 2}, 201)],
+)
+def test_agg_args(args, kwargs, increment):
+    # GH 43357
+    def f(x, a=0, b=0, c=0):
+        return x + a + 10 * b + 100 * c
+
+    s = Series([1, 2])
+    result = s.transform(f, 0, *args, **kwargs)
+    expected = s + increment
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ops, names",
+    [
+        ([np.sqrt], ["sqrt"]),
+        ([np.abs, np.sqrt], ["absolute", "sqrt"]),
+        (np.array([np.sqrt]), ["sqrt"]),
+        (np.array([np.abs, np.sqrt]), ["absolute", "sqrt"]),
+    ],
+)
+def test_transform_listlike(string_series, ops, names):
+    # GH 35964
+    with np.errstate(all="ignore"):
+        expected = concat([op(string_series) for op in ops], axis=1)
+        expected.columns = names
+        result = string_series.transform(ops)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_transform_listlike_func_with_args():
+    # GH 50624
+
+    s = Series([1, 2, 3])
+
+    def foo1(x, a=1, c=0):
+        return x + a + c
+
+    def foo2(x, b=2, c=0):
+        return x + b + c
+
+    msg = r"foo1\(\) got an unexpected keyword argument 'b'"
+    with pytest.raises(TypeError, match=msg):
+        s.transform([foo1, foo2], 0, 3, b=3, c=4)
+
+    result = s.transform([foo1, foo2], 0, 3, c=4)
+    expected = DataFrame({"foo1": [8, 9, 10], "foo2": [8, 9, 10]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("box", [dict, Series])
+def test_transform_dictlike(string_series, box):
+    # GH 35964
+    with np.errstate(all="ignore"):
+        expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1)
+    expected.columns = ["foo", "bar"]
+    result = string_series.transform(box({"foo": np.sqrt, "bar": np.abs}))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_transform_dictlike_mixed():
+    # GH 40018 - mix of lists and non-lists in values of a dictionary
+    df = Series([1, 4])
+    result = df.transform({"b": ["sqrt", "abs"], "c": "sqrt"})
+    expected = DataFrame(
+        [[1.0, 1, 1.0], [2.0, 4, 2.0]],
+        columns=MultiIndex([("b", "c"), ("sqrt", "abs")], [(0, 0, 1), (0, 1, 0)]),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/apply/test_str.py b/pandas/tests/apply/test_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5a9492630b13a8ac03e976a699f8e58752887f2
--- /dev/null
+++ b/pandas/tests/apply/test_str.py
@@ -0,0 +1,307 @@
+from itertools import chain
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    WASM,
+)
+
+from pandas.core.dtypes.common import is_number
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.apply.common import (
+    frame_transform_kernels,
+    series_transform_kernels,
+)
+
+
+@pytest.mark.parametrize("func", ["sum", "mean", "min", "max", "std"])
+@pytest.mark.parametrize(
+    "kwds",
+    [
+        pytest.param({}, id="no_kwds"),
+        pytest.param({"axis": 1}, id="on_axis"),
+        pytest.param({"numeric_only": True}, id="func_kwds"),
+        pytest.param({"axis": 1, "numeric_only": True}, id="axis_and_func_kwds"),
+    ],
+)
+@pytest.mark.parametrize("how", ["agg", "apply"])
+def test_apply_with_string_funcs(float_frame, func, kwds, how):
+    result = getattr(float_frame, how)(func, **kwds)
+    expected = getattr(float_frame, func)(**kwds)
+    tm.assert_series_equal(result, expected)
+
+
+def test_with_string_args(datetime_series, all_numeric_reductions):
+    result = datetime_series.apply(all_numeric_reductions)
+    expected = getattr(datetime_series, all_numeric_reductions)()
+    assert result == expected
+
+
+@pytest.mark.parametrize("op", ["mean", "median", "std", "var"])
+@pytest.mark.parametrize("how", ["agg", "apply"])
+def test_apply_np_reducer(op, how):
+    # GH 39116
+    float_frame = DataFrame({"a": [1, 2], "b": [3, 4]})
+    result = getattr(float_frame, how)(op)
+    # pandas ddof defaults to 1, numpy to 0
+    kwargs = {"ddof": 1} if op in ("std", "var") else {}
+    expected = Series(
+        getattr(np, op)(float_frame, axis=0, **kwargs), index=float_frame.columns
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.skipif(WASM, reason="No fp exception support in wasm")
+@pytest.mark.parametrize(
+    "op", ["abs", "ceil", "cos", "cumsum", "exp", "log", "sqrt", "square"]
+)
+@pytest.mark.parametrize("how", ["transform", "apply"])
+def test_apply_np_transformer(float_frame, op, how):
+    # GH 39116
+
+    # float_frame will _usually_ have negative values, which will
+    #  trigger the warning here, but let's put one in just to be sure
+    float_frame.iloc[0, 0] = -1.0
+    warn = None
+    if op in ["log", "sqrt"]:
+        warn = RuntimeWarning
+
+    with tm.assert_produces_warning(warn, check_stacklevel=False):
+        # float_frame fixture is defined in conftest.py, so we don't check the
+        # stacklevel as otherwise the test would fail.
+        result = getattr(float_frame, how)(op)
+        expected = getattr(np, op)(float_frame)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "series, func, expected",
+    chain(
+        tm.get_cython_table_params(
+            Series(dtype=np.float64),
+            [
+                ("sum", 0),
+                ("max", np.nan),
+                ("min", np.nan),
+                ("all", True),
+                ("any", False),
+                ("mean", np.nan),
+                ("prod", 1),
+                ("std", np.nan),
+                ("var", np.nan),
+                ("median", np.nan),
+            ],
+        ),
+        tm.get_cython_table_params(
+            Series([np.nan, 1, 2, 3]),
+            [
+                ("sum", 6),
+                ("max", 3),
+                ("min", 1),
+                ("all", True),
+                ("any", True),
+                ("mean", 2),
+                ("prod", 6),
+                ("std", 1),
+                ("var", 1),
+                ("median", 2),
+            ],
+        ),
+        tm.get_cython_table_params(
+            Series("a b c".split()),
+            [
+                ("sum", "abc"),
+                ("max", "c"),
+                ("min", "a"),
+                ("all", True),
+                ("any", True),
+            ],
+        ),
+    ),
+)
+def test_agg_cython_table_series(series, func, expected):
+    # GH21224
+    # test reducing functions in
+    # pandas.core.base.SelectionMixin._cython_table
+    warn = None if isinstance(func, str) else FutureWarning
+    with tm.assert_produces_warning(warn, match="is currently using Series.*"):
+        result = series.agg(func)
+    if is_number(expected):
+        assert np.isclose(result, expected, equal_nan=True)
+    else:
+        assert result == expected
+
+
+@pytest.mark.parametrize(
+    "series, func, expected",
+    chain(
+        tm.get_cython_table_params(
+            Series(dtype=np.float64),
+            [
+                ("cumprod", Series([], dtype=np.float64)),
+                ("cumsum", Series([], dtype=np.float64)),
+            ],
+        ),
+        tm.get_cython_table_params(
+            Series([np.nan, 1, 2, 3]),
+            [
+                ("cumprod", Series([np.nan, 1, 2, 6])),
+                ("cumsum", Series([np.nan, 1, 3, 6])),
+            ],
+        ),
+        tm.get_cython_table_params(
+            Series("a b c".split()), [("cumsum", Series(["a", "ab", "abc"]))]
+        ),
+    ),
+)
+def test_agg_cython_table_transform_series(series, func, expected):
+    # GH21224
+    # test transforming functions in
+    # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+    warn = None if isinstance(func, str) else FutureWarning
+    with tm.assert_produces_warning(warn, match="is currently using Series.*"):
+        result = series.agg(func)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "df, func, expected",
+    chain(
+        tm.get_cython_table_params(
+            DataFrame(),
+            [
+                ("sum", Series(dtype="float64")),
+                ("max", Series(dtype="float64")),
+                ("min", Series(dtype="float64")),
+                ("all", Series(dtype=bool)),
+                ("any", Series(dtype=bool)),
+                ("mean", Series(dtype="float64")),
+                ("prod", Series(dtype="float64")),
+                ("std", Series(dtype="float64")),
+                ("var", Series(dtype="float64")),
+                ("median", Series(dtype="float64")),
+            ],
+        ),
+        tm.get_cython_table_params(
+            DataFrame([[np.nan, 1], [1, 2]]),
+            [
+                ("sum", Series([1.0, 3])),
+                ("max", Series([1.0, 2])),
+                ("min", Series([1.0, 1])),
+                ("all", Series([True, True])),
+                ("any", Series([True, True])),
+                ("mean", Series([1, 1.5])),
+                ("prod", Series([1.0, 2])),
+                ("std", Series([np.nan, 0.707107])),
+                ("var", Series([np.nan, 0.5])),
+                ("median", Series([1, 1.5])),
+            ],
+        ),
+    ),
+)
+def test_agg_cython_table_frame(df, func, expected, axis):
+    # GH 21224
+    # test reducing functions in
+    # pandas.core.base.SelectionMixin._cython_table
+    warn = None if isinstance(func, str) else FutureWarning
+    with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"):
+        # GH#53425
+        result = df.agg(func, axis=axis)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "df, func, expected",
+    chain(
+        tm.get_cython_table_params(
+            DataFrame(), [("cumprod", DataFrame()), ("cumsum", DataFrame())]
+        ),
+        tm.get_cython_table_params(
+            DataFrame([[np.nan, 1], [1, 2]]),
+            [
+                ("cumprod", DataFrame([[np.nan, 1], [1, 2]])),
+                ("cumsum", DataFrame([[np.nan, 1], [1, 3]])),
+            ],
+        ),
+    ),
+)
+def test_agg_cython_table_transform_frame(df, func, expected, axis):
+    # GH 21224
+    # test transforming functions in
+    # pandas.core.base.SelectionMixin._cython_table (cumprod, cumsum)
+    if axis in ("columns", 1):
+        # operating blockwise doesn't let us preserve dtypes
+        expected = expected.astype("float64")
+
+    warn = None if isinstance(func, str) else FutureWarning
+    with tm.assert_produces_warning(warn, match="is currently using DataFrame.*"):
+        # GH#53425
+        result = df.agg(func, axis=axis)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", series_transform_kernels)
+def test_transform_groupby_kernel_series(request, string_series, op):
+    # GH 35964
+    if op == "ngroup":
+        request.applymarker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
+    args = [0.0] if op == "fillna" else []
+    ones = np.ones(string_series.shape[0])
+
+    warn = FutureWarning if op == "fillna" else None
+    msg = "SeriesGroupBy.fillna is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = string_series.groupby(ones).transform(op, *args)
+    result = string_series.transform(op, 0, *args)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", frame_transform_kernels)
+def test_transform_groupby_kernel_frame(request, float_frame, op):
+    if op == "ngroup":
+        request.applymarker(
+            pytest.mark.xfail(raises=ValueError, reason="ngroup not valid for NDFrame")
+        )
+
+    # GH 35964
+
+    args = [0.0] if op == "fillna" else []
+    ones = np.ones(float_frame.shape[0])
+    gb = float_frame.groupby(ones)
+
+    warn = FutureWarning if op == "fillna" else None
+    op_msg = "DataFrameGroupBy.fillna is deprecated"
+    with tm.assert_produces_warning(warn, match=op_msg):
+        expected = gb.transform(op, *args)
+
+    result = float_frame.transform(op, 0, *args)
+    tm.assert_frame_equal(result, expected)
+
+    # same thing, but ensuring we have multiple blocks
+    assert "E" not in float_frame.columns
+    float_frame["E"] = float_frame["A"].copy()
+    assert len(float_frame._mgr.blocks) > 1
+
+    ones = np.ones(float_frame.shape[0])
+    gb2 = float_frame.groupby(ones)
+    expected2 = gb2.transform(op, *args)
+    result2 = float_frame.transform(op, 0, *args)
+    tm.assert_frame_equal(result2, expected2)
+
+
+@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"])
+def test_transform_method_name(method):
+    # GH 19760
+    df = DataFrame({"A": [-1, 2]})
+    result = df.transform(method)
+    expected = operator.methodcaller(method)(df)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/arithmetic/__init__.py b/pandas/tests/arithmetic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ea9d2b0ee23ad14168a4366e332e1d49d3c0c85
--- /dev/null
+++ b/pandas/tests/arithmetic/common.py
@@ -0,0 +1,158 @@
+"""
+Assertion helpers for arithmetic tests.
+"""
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    array,
+)
+import pandas._testing as tm
+from pandas.core.arrays import (
+    BooleanArray,
+    NumpyExtensionArray,
+)
+
+
+def assert_cannot_add(left, right, msg="cannot add"):
+    """
+    Helper function to assert that two objects cannot be added.
+
+    Parameters
+    ----------
+    left : object
+        The first operand.
+    right : object
+        The second operand.
+    msg : str, default "cannot add"
+        The error message expected in the TypeError.
+    """
+    with pytest.raises(TypeError, match=msg):
+        left + right
+    with pytest.raises(TypeError, match=msg):
+        right + left
+
+
+def assert_invalid_addsub_type(left, right, msg=None):
+    """
+    Helper function to assert that two objects can
+    neither be added nor subtracted.
+
+    Parameters
+    ----------
+    left : object
+        The first operand.
+    right : object
+        The second operand.
+    msg : str or None, default None
+        The error message expected in the TypeError.
+    """
+    with pytest.raises(TypeError, match=msg):
+        left + right
+    with pytest.raises(TypeError, match=msg):
+        right + left
+    with pytest.raises(TypeError, match=msg):
+        left - right
+    with pytest.raises(TypeError, match=msg):
+        right - left
+
+
+def get_upcast_box(left, right, is_cmp: bool = False):
+    """
+    Get the box to use for 'expected' in an arithmetic or comparison operation.
+
+    Parameters
+    left : Any
+    right : Any
+    is_cmp : bool, default False
+        Whether the operation is a comparison method.
+    """
+
+    if isinstance(left, DataFrame) or isinstance(right, DataFrame):
+        return DataFrame
+    if isinstance(left, Series) or isinstance(right, Series):
+        if is_cmp and isinstance(left, Index):
+            # Index does not defer for comparisons
+            return np.array
+        return Series
+    if isinstance(left, Index) or isinstance(right, Index):
+        if is_cmp:
+            return np.array
+        return Index
+    return tm.to_array
+
+
+def assert_invalid_comparison(left, right, box):
+    """
+    Assert that comparison operations with mismatched types behave correctly.
+
+    Parameters
+    ----------
+    left : np.ndarray, ExtensionArray, Index, or Series
+    right : object
+    box : {pd.DataFrame, pd.Series, pd.Index, pd.array, tm.to_array}
+    """
+    # Not for tznaive-tzaware comparison
+
+    # Note: not quite the same as how we do this for tm.box_expected
+    xbox = box if box not in [Index, array] else np.array
+
+    def xbox2(x):
+        # Eventually we'd like this to be tighter, but for now we'll
+        #  just exclude NumpyExtensionArray[bool]
+        if isinstance(x, NumpyExtensionArray):
+            return x._ndarray
+        if isinstance(x, BooleanArray):
+            # NB: we are assuming no pd.NAs for now
+            return x.astype(bool)
+        return x
+
+    result = xbox2(left == right)
+    expected = xbox(np.zeros(result.shape, dtype=np.bool_))
+
+    tm.assert_equal(result, expected)
+
+    result = xbox2(right == left)
+    tm.assert_equal(result, xbox(expected))
+
+    result = xbox2(left != right)
+    tm.assert_equal(result, ~expected)
+
+    result = xbox2(right != left)
+    tm.assert_equal(result, xbox(~expected))
+
+    msg = "|".join(
+        [
+            "Invalid comparison between",
+            "Cannot compare type",
+            "not supported between",
+            "invalid type promotion",
+            (
+                # GH#36706 npdev 1.20.0 2020-09-28
+                r"The DTypes <class 'numpy.dtype\[datetime64\]'> and "
+                r"<class 'numpy.dtype\[int64\]'> do not have a common DType. "
+                "For example they cannot be stored in a single array unless the "
+                "dtype is `object`."
+            ),
+        ]
+    )
+    with pytest.raises(TypeError, match=msg):
+        left < right
+    with pytest.raises(TypeError, match=msg):
+        left <= right
+    with pytest.raises(TypeError, match=msg):
+        left > right
+    with pytest.raises(TypeError, match=msg):
+        left >= right
+    with pytest.raises(TypeError, match=msg):
+        right < left
+    with pytest.raises(TypeError, match=msg):
+        right <= left
+    with pytest.raises(TypeError, match=msg):
+        right > left
+    with pytest.raises(TypeError, match=msg):
+        right >= left
diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..44838394f2183dd7f18c4a960ff60a9f5e3f29cc
--- /dev/null
+++ b/pandas/tests/arithmetic/conftest.py
@@ -0,0 +1,139 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Index
+
+
+@pytest.fixture(params=[1, np.array(1, dtype=np.int64)])
+def one(request):
+    """
+    Several variants of integer value 1. The zero-dim integer array
+    behaves like an integer.
+
+    This fixture can be used to check that datetimelike indexes handle
+    addition and subtraction of integers and zero-dimensional arrays
+    of integers.
+
+    Examples
+    --------
+    dti = pd.date_range('2016-01-01', periods=2, freq='h')
+    dti
+    DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00'],
+    dtype='datetime64[ns]', freq='h')
+    dti + one
+    DatetimeIndex(['2016-01-01 01:00:00', '2016-01-01 02:00:00'],
+    dtype='datetime64[ns]', freq='h')
+    """
+    return request.param
+
+
+zeros = [
+    box_cls([0] * 5, dtype=dtype)
+    for box_cls in [Index, np.array, pd.array]
+    for dtype in [np.int64, np.uint64, np.float64]
+]
+zeros.extend([box_cls([-0.0] * 5, dtype=np.float64) for box_cls in [Index, np.array]])
+zeros.extend([np.array(0, dtype=dtype) for dtype in [np.int64, np.uint64, np.float64]])
+zeros.extend([np.array(-0.0, dtype=np.float64)])
+zeros.extend([0, 0.0, -0.0])
+
+
+@pytest.fixture(params=zeros)
+def zero(request):
+    """
+    Several types of scalar zeros and length 5 vectors of zeros.
+
+    This fixture can be used to check that numeric-dtype indexes handle
+    division by any zero numeric-dtype.
+
+    Uses vector of length 5 for broadcasting with `numeric_idx` fixture,
+    which creates numeric-dtype vectors also of length 5.
+
+    Examples
+    --------
+    arr = RangeIndex(5)
+    arr / zeros
+    Index([nan, inf, inf, inf, inf], dtype='float64')
+    """
+    return request.param
+
+
+# ------------------------------------------------------------------
+# Scalar Fixtures
+
+
+@pytest.fixture(
+    params=[
+        pd.Timedelta("10m7s").to_pytimedelta(),
+        pd.Timedelta("10m7s"),
+        pd.Timedelta("10m7s").to_timedelta64(),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def scalar_td(request):
+    """
+    Several variants of Timedelta scalars representing 10 minutes and 7 seconds.
+    """
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        pd.offsets.Day(3),
+        pd.offsets.Hour(72),
+        pd.Timedelta(days=3).to_pytimedelta(),
+        pd.Timedelta("72:00:00"),
+        np.timedelta64(3, "D"),
+        np.timedelta64(72, "h"),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def three_days(request):
+    """
+    Several timedelta-like and DateOffset objects that each represent
+    a 3-day timedelta
+    """
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        pd.offsets.Hour(2),
+        pd.offsets.Minute(120),
+        pd.Timedelta(hours=2).to_pytimedelta(),
+        pd.Timedelta(seconds=2 * 3600),
+        np.timedelta64(2, "h"),
+        np.timedelta64(120, "m"),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def two_hours(request):
+    """
+    Several timedelta-like and DateOffset objects that each represent
+    a 2-hour timedelta
+    """
+    return request.param
+
+
+_common_mismatch = [
+    pd.offsets.YearBegin(2),
+    pd.offsets.MonthBegin(1),
+    pd.offsets.Minute(),
+]
+
+
+@pytest.fixture(
+    params=[
+        np.timedelta64(4, "h"),
+        pd.Timedelta(hours=23).to_pytimedelta(),
+        pd.Timedelta("23:00:00"),
+        *_common_mismatch,
+    ]
+)
+def not_daily(request):
+    """
+    Several timedelta-like and DateOffset instances that are _not_
+    compatible with Daily frequencies.
+    """
+    return request.param
diff --git a/pandas/tests/arithmetic/test_array_ops.py b/pandas/tests/arithmetic/test_array_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..f026092e0d1133ea861f9cdb2699924e3618298e
--- /dev/null
+++ b/pandas/tests/arithmetic/test_array_ops.py
@@ -0,0 +1,78 @@
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.missing import isna
+
+import pandas._testing as tm
+from pandas.core.ops.array_ops import (
+    comparison_op,
+    na_logical_op,
+)
+
+
+def test_na_logical_op_2d():
+    left = np.arange(8).reshape(4, 2)
+    right = left.astype(object)
+    right[0, 0] = np.nan
+
+    # Check that we fall back to the vec_binop branch
+    with pytest.raises(TypeError, match="unsupported operand type"):
+        operator.or_(left, right)
+
+    result = na_logical_op(left, right, operator.or_)
+    expected = right
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_object_comparison_2d():
+    left = np.arange(9).reshape(3, 3).astype(object)
+    right = left.T
+
+    result = comparison_op(left, right, operator.eq)
+    expected = np.eye(3).astype(bool)
+    tm.assert_numpy_array_equal(result, expected)
+
+    # Ensure that cython doesn't raise on non-writeable arg, which
+    #  we can get from np.broadcast_to
+    right.flags.writeable = False
+    result = comparison_op(left, right, operator.ne)
+    tm.assert_numpy_array_equal(result, ~expected)
+
+
+@pytest.mark.parametrize("rvalues", [1, [1, 1, 1], np.nan, None])
+@pytest.mark.parametrize(
+    "op", [operator.eq, operator.ne, operator.lt, operator.le, operator.gt, operator.ge]
+)
+def test_comparison_for_subclasses(rvalues, op):
+    # GH#63205 Ensure subclasses of ndarray are correctly handled in comparison_op
+    # Define a custom ndarray subclass
+    class TestArray(np.ndarray):
+        def __new__(cls, input_array):
+            return np.asarray(input_array).view(cls)
+
+        def __array_finalize__(self, obj) -> None:
+            self._is_test_array = True
+
+    def expected_with_na_handling(lvalues, rvalues, op):
+        # Similar to comparison_op, handle zerodim arrays with na value separately
+        if (rvalues.ndim == 0) and isna(rvalues.item()):
+            # numpy does not like comparisons vs None
+            if op is operator.ne:
+                return np.ones(lvalues.shape, dtype=bool)
+            else:
+                return np.zeros(lvalues.shape, dtype=bool)
+        return op(lvalues, rvalues)
+
+    # Define test data
+    lvalues = [1, 2, 3]
+
+    # Test with both ndarray and TestArray
+    result = comparison_op(np.array(lvalues), np.array(rvalues), op)
+    expected = expected_with_na_handling(np.array(lvalues), np.array(rvalues), op)
+    tm.assert_numpy_array_equal(result, expected)
+
+    result = comparison_op(TestArray(lvalues), TestArray(rvalues), op)
+    expected = expected_with_na_handling(TestArray(lvalues), TestArray(rvalues), op)
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_bool.py b/pandas/tests/arithmetic/test_bool.py
new file mode 100644
index 0000000000000000000000000000000000000000..3723b7042a3ce77bcf21d34c77fff01ed31eceb4
--- /dev/null
+++ b/pandas/tests/arithmetic/test_bool.py
@@ -0,0 +1,28 @@
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+
+
+def test_divmod_bool_raises(box_with_array):
+    # GH#46043 // raises, so divmod should too
+    ser = Series([True, False])
+    obj = tm.box_expected(ser, box_with_array)
+
+    msg = "operator 'floordiv' not implemented for bool dtypes"
+    with pytest.raises(NotImplementedError, match=msg):
+        obj // obj
+
+    if box_with_array is DataFrame:
+        msg = "operator 'floordiv' not implemented for bool dtypes"
+    else:
+        msg = "operator 'divmod' not implemented for bool dtypes"
+    with pytest.raises(NotImplementedError, match=msg):
+        divmod(obj, obj)
+
+    # go through __rdivmod__
+    with pytest.raises(NotImplementedError, match=msg):
+        divmod(True, obj)
diff --git a/pandas/tests/arithmetic/test_categorical.py b/pandas/tests/arithmetic/test_categorical.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6f3a13ce670596a12ca10b9e8d02d69d63c96fb
--- /dev/null
+++ b/pandas/tests/arithmetic/test_categorical.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+from pandas import (
+    Categorical,
+    Series,
+)
+import pandas._testing as tm
+
+
+class TestCategoricalComparisons:
+    def test_categorical_nan_equality(self):
+        cat = Series(Categorical(["a", "b", "c", np.nan]))
+        expected = Series([True, True, True, False])
+        result = cat == cat
+        tm.assert_series_equal(result, expected)
+
+    def test_categorical_tuple_equality(self):
+        # GH 18050
+        ser = Series([(0, 0), (0, 1), (0, 0), (1, 0), (1, 1)])
+        expected = Series([True, False, True, False, False])
+        result = ser == (0, 0)
+        tm.assert_series_equal(result, expected)
+
+        result = ser.astype("category") == (0, 0)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
new file mode 100644
index 0000000000000000000000000000000000000000..05d0a9c0626af83b3eb43a683a94a0229376ff88
--- /dev/null
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -0,0 +1,2500 @@
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for datetime64 and datetime64tz dtypes
+from datetime import (
+    datetime,
+    time,
+    timedelta,
+    timezone,
+)
+from itertools import (
+    product,
+)
+import operator
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.conversion import localize_pydatetime
+from pandas._libs.tslibs.offsets import shift_months
+
+import pandas as pd
+from pandas import (
+    DateOffset,
+    DatetimeIndex,
+    NaT,
+    Period,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.core import roperator
+from pandas.tests.arithmetic.common import (
+    assert_cannot_add,
+    assert_invalid_addsub_type,
+    assert_invalid_comparison,
+    get_upcast_box,
+)
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestDatetime64ArrayLikeComparisons:
+    # Comparison tests for datetime64 vectors fully parametrized over
+    #  DataFrame/Series/DatetimeIndex/DatetimeArray.  Ideally all comparison
+    #  tests will eventually end up here.
+
+    def test_compare_zerodim(self, tz_naive_fixture, box_with_array):
+        # Test comparison with zero-dimensional array is unboxed
+        tz = tz_naive_fixture
+        box = box_with_array
+        dti = date_range("20130101", periods=3, tz=tz)
+
+        other = np.array(dti.to_numpy()[0])
+
+        dtarr = tm.box_expected(dti, box)
+        xbox = get_upcast_box(dtarr, other, True)
+        result = dtarr <= other
+        expected = np.array([True, False, False])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            "foo",
+            -1,
+            99,
+            4.0,
+            object(),
+            timedelta(days=2),
+            # GH#19800, GH#19301 datetime.date comparison raises to
+            #  match DatetimeIndex/Timestamp.  This also matches the behavior
+            #  of stdlib datetime.datetime
+            datetime(2001, 1, 1).date(),
+            # GH#19301 None and NaN are *not* cast to NaT for comparisons
+            None,
+            np.nan,
+        ],
+    )
+    def test_dt64arr_cmp_scalar_invalid(self, other, tz_naive_fixture, box_with_array):
+        # GH#22074, GH#15966
+        tz = tz_naive_fixture
+
+        rng = date_range("1/1/2000", periods=10, tz=tz)
+        dtarr = tm.box_expected(rng, box_with_array)
+        assert_invalid_comparison(dtarr, other, box_with_array)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            # GH#4968 invalid date/int comparisons
+            list(range(10)),
+            np.arange(10),
+            np.arange(10).astype(np.float32),
+            np.arange(10).astype(object),
+            pd.timedelta_range("1ns", periods=10).array,
+            np.array(pd.timedelta_range("1ns", periods=10)),
+            list(pd.timedelta_range("1ns", periods=10)),
+            pd.timedelta_range("1 Day", periods=10).astype(object),
+            pd.period_range("1971-01-01", freq="D", periods=10).array,
+            pd.period_range("1971-01-01", freq="D", periods=10).astype(object),
+        ],
+    )
+    def test_dt64arr_cmp_arraylike_invalid(
+        self, other, tz_naive_fixture, box_with_array
+    ):
+        tz = tz_naive_fixture
+
+        dta = date_range("1970-01-01", freq="ns", periods=10, tz=tz)._data
+        obj = tm.box_expected(dta, box_with_array)
+        assert_invalid_comparison(obj, other, box_with_array)
+
+    def test_dt64arr_cmp_mixed_invalid(self, tz_naive_fixture):
+        tz = tz_naive_fixture
+
+        dta = date_range("1970-01-01", freq="h", periods=5, tz=tz)._data
+
+        other = np.array([0, 1, 2, dta[3], Timedelta(days=1)])
+        result = dta == other
+        expected = np.array([False, False, False, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = dta != other
+        tm.assert_numpy_array_equal(result, ~expected)
+
+        msg = "Invalid comparison between|Cannot compare type|not supported between"
+        with pytest.raises(TypeError, match=msg):
+            dta < other
+        with pytest.raises(TypeError, match=msg):
+            dta > other
+        with pytest.raises(TypeError, match=msg):
+            dta <= other
+        with pytest.raises(TypeError, match=msg):
+            dta >= other
+
+    def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array):
+        # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly
+        tz = tz_naive_fixture
+        box = box_with_array
+
+        ts = Timestamp("2021-01-01", tz=tz)
+        ser = Series([ts, NaT])
+
+        obj = tm.box_expected(ser, box)
+        xbox = get_upcast_box(obj, ts, True)
+
+        expected = Series([True, False], dtype=np.bool_)
+        expected = tm.box_expected(expected, xbox)
+
+        result = obj == ts
+        tm.assert_equal(result, expected)
+
+
+class TestDatetime64SeriesComparison:
+    # TODO: moved from tests.series.test_operators; needs cleanup
+
+    @pytest.mark.parametrize(
+        "pair",
+        [
+            (
+                [Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")],
+                [NaT, NaT, Timestamp("2011-01-03")],
+            ),
+            (
+                [Timedelta("1 days"), NaT, Timedelta("3 days")],
+                [NaT, NaT, Timedelta("3 days")],
+            ),
+            (
+                [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")],
+                [NaT, NaT, Period("2011-03", freq="M")],
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("reverse", [True, False])
+    @pytest.mark.parametrize("dtype", [None, object])
+    @pytest.mark.parametrize(
+        "op, expected",
+        [
+            (operator.eq, [False, False, True]),
+            (operator.ne, [True, True, False]),
+            (operator.lt, [False, False, False]),
+            (operator.gt, [False, False, False]),
+            (operator.ge, [False, False, True]),
+            (operator.le, [False, False, True]),
+        ],
+    )
+    def test_nat_comparisons(
+        self,
+        dtype,
+        index_or_series,
+        reverse,
+        pair,
+        op,
+        expected,
+    ):
+        box = index_or_series
+        lhs, rhs = pair
+        if reverse:
+            # add lhs / rhs switched data
+            lhs, rhs = rhs, lhs
+
+        left = Series(lhs, dtype=dtype)
+        right = box(rhs, dtype=dtype)
+
+        result = op(left, right)
+
+        tm.assert_series_equal(result, Series(expected))
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            [Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")],
+            [Timedelta("1 days"), NaT, Timedelta("3 days")],
+            [Period("2011-01", freq="M"), NaT, Period("2011-03", freq="M")],
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_nat_comparisons_scalar(self, dtype, data, box_with_array):
+        box = box_with_array
+
+        left = Series(data, dtype=dtype)
+        left = tm.box_expected(left, box)
+        xbox = get_upcast_box(left, NaT, True)
+
+        expected = [False, False, False]
+        expected = tm.box_expected(expected, xbox)
+        if box is pd.array and dtype is object:
+            expected = pd.array(expected, dtype="bool")
+
+        tm.assert_equal(left == NaT, expected)
+        tm.assert_equal(NaT == left, expected)
+
+        expected = [True, True, True]
+        expected = tm.box_expected(expected, xbox)
+        if box is pd.array and dtype is object:
+            expected = pd.array(expected, dtype="bool")
+        tm.assert_equal(left != NaT, expected)
+        tm.assert_equal(NaT != left, expected)
+
+        expected = [False, False, False]
+        expected = tm.box_expected(expected, xbox)
+        if box is pd.array and dtype is object:
+            expected = pd.array(expected, dtype="bool")
+        tm.assert_equal(left < NaT, expected)
+        tm.assert_equal(NaT > left, expected)
+        tm.assert_equal(left <= NaT, expected)
+        tm.assert_equal(NaT >= left, expected)
+
+        tm.assert_equal(left > NaT, expected)
+        tm.assert_equal(NaT < left, expected)
+        tm.assert_equal(left >= NaT, expected)
+        tm.assert_equal(NaT <= left, expected)
+
+    @pytest.mark.parametrize("val", [datetime(2000, 1, 4), datetime(2000, 1, 5)])
+    def test_series_comparison_scalars(self, val):
+        series = Series(date_range("1/1/2000", periods=10))
+
+        result = series > val
+        expected = Series([x > val for x in series])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "left,right", [("lt", "gt"), ("le", "ge"), ("eq", "eq"), ("ne", "ne")]
+    )
+    def test_timestamp_compare_series(self, left, right):
+        # see gh-4982
+        # Make sure we can compare Timestamps on the right AND left hand side.
+        ser = Series(date_range("20010101", periods=10), name="dates")
+        s_nat = ser.copy(deep=True)
+
+        ser[0] = Timestamp("nat")
+        ser[3] = Timestamp("nat")
+
+        left_f = getattr(operator, left)
+        right_f = getattr(operator, right)
+
+        # No NaT
+        expected = left_f(ser, Timestamp("20010109"))
+        result = right_f(Timestamp("20010109"), ser)
+        tm.assert_series_equal(result, expected)
+
+        # NaT
+        expected = left_f(ser, Timestamp("nat"))
+        result = right_f(Timestamp("nat"), ser)
+        tm.assert_series_equal(result, expected)
+
+        # Compare to Timestamp with series containing NaT
+        expected = left_f(s_nat, Timestamp("20010109"))
+        result = right_f(Timestamp("20010109"), s_nat)
+        tm.assert_series_equal(result, expected)
+
+        # Compare to NaT with series containing NaT
+        expected = left_f(s_nat, NaT)
+        result = right_f(NaT, s_nat)
+        tm.assert_series_equal(result, expected)
+
+    def test_dt64arr_timestamp_equality(self, box_with_array):
+        # GH#11034
+        box = box_with_array
+
+        ser = Series([Timestamp("2000-01-29 01:59:00"), Timestamp("2000-01-30"), NaT])
+        ser = tm.box_expected(ser, box)
+        xbox = get_upcast_box(ser, ser, True)
+
+        result = ser != ser
+        expected = tm.box_expected([False, False, True], xbox)
+        tm.assert_equal(result, expected)
+
+        if box is pd.DataFrame:
+            # alignment for frame vs series comparisons deprecated
+            #  in GH#46795 enforced 2.0
+            with pytest.raises(ValueError, match="not aligned"):
+                ser != ser[0]
+
+        else:
+            result = ser != ser[0]
+            expected = tm.box_expected([False, True, True], xbox)
+            tm.assert_equal(result, expected)
+
+        if box is pd.DataFrame:
+            # alignment for frame vs series comparisons deprecated
+            #  in GH#46795 enforced 2.0
+            with pytest.raises(ValueError, match="not aligned"):
+                ser != ser[2]
+        else:
+            result = ser != ser[2]
+            expected = tm.box_expected([True, True, True], xbox)
+            tm.assert_equal(result, expected)
+
+        result = ser == ser
+        expected = tm.box_expected([True, True, False], xbox)
+        tm.assert_equal(result, expected)
+
+        if box is pd.DataFrame:
+            # alignment for frame vs series comparisons deprecated
+            #  in GH#46795 enforced 2.0
+            with pytest.raises(ValueError, match="not aligned"):
+                ser == ser[0]
+        else:
+            result = ser == ser[0]
+            expected = tm.box_expected([True, False, False], xbox)
+            tm.assert_equal(result, expected)
+
+        if box is pd.DataFrame:
+            # alignment for frame vs series comparisons deprecated
+            #  in GH#46795 enforced 2.0
+            with pytest.raises(ValueError, match="not aligned"):
+                ser == ser[2]
+        else:
+            result = ser == ser[2]
+            expected = tm.box_expected([False, False, False], xbox)
+            tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "datetimelike",
+        [
+            Timestamp("20130101"),
+            datetime(2013, 1, 1),
+            np.datetime64("2013-01-01T00:00", "ns"),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "op,expected",
+        [
+            (operator.lt, [True, False, False, False]),
+            (operator.le, [True, True, False, False]),
+            (operator.eq, [False, True, False, False]),
+            (operator.gt, [False, False, False, True]),
+        ],
+    )
+    def test_dt64_compare_datetime_scalar(self, datetimelike, op, expected):
+        # GH#17965, test for ability to compare datetime64[ns] columns
+        #  to datetimelike
+        ser = Series(
+            [
+                Timestamp("20120101"),
+                Timestamp("20130101"),
+                np.nan,
+                Timestamp("20130103"),
+            ],
+            name="A",
+        )
+        result = op(ser, datetimelike)
+        expected = Series(expected, name="A")
+        tm.assert_series_equal(result, expected)
+
+    def test_ts_series_numpy_maximum(self):
+        # GH#50864, test numpy.maximum does not fail
+        # given a TimeStamp and Series(with dtype datetime64) comparison
+        ts = Timestamp("2024-07-01")
+        ts_series = Series(
+            ["2024-06-01", "2024-07-01", "2024-08-01"],
+            dtype="datetime64[us]",
+        )
+
+        expected = Series(
+            ["2024-07-01", "2024-07-01", "2024-08-01"],
+            dtype="datetime64[us]",
+        )
+
+        tm.assert_series_equal(expected, np.maximum(ts, ts_series))
+
+
+class TestDatetimeIndexComparisons:
+    # TODO: moved from tests.indexes.test_base; parametrize and de-duplicate
+    def test_comparators(self, comparison_op):
+        index = date_range("2020-01-01", periods=10)
+        element = index[len(index) // 2]
+        element = Timestamp(element).to_datetime64()
+
+        arr = np.array(index)
+        arr_result = comparison_op(arr, element)
+        index_result = comparison_op(index, element)
+
+        assert isinstance(index_result, np.ndarray)
+        tm.assert_numpy_array_equal(arr_result, index_result)
+
+    @pytest.mark.parametrize(
+        "other",
+        [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")],
+    )
+    def test_dti_cmp_datetimelike(self, other, tz_naive_fixture):
+        tz = tz_naive_fixture
+        dti = date_range("2016-01-01", periods=2, tz=tz)
+        if tz is not None:
+            if isinstance(other, np.datetime64):
+                pytest.skip(f"{type(other).__name__} is not tz aware")
+            other = localize_pydatetime(other, dti.tzinfo)
+
+        result = dti == other
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = dti > other
+        expected = np.array([False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = dti >= other
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = dti < other
+        expected = np.array([False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = dti <= other
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_dti_cmp_nat(self, dtype, box_with_array):
+        left = DatetimeIndex([Timestamp("2011-01-01"), NaT, Timestamp("2011-01-03")])
+        right = DatetimeIndex([NaT, NaT, Timestamp("2011-01-03")])
+
+        left = tm.box_expected(left, box_with_array)
+        right = tm.box_expected(right, box_with_array)
+        xbox = get_upcast_box(left, right, True)
+
+        lhs, rhs = left, right
+        if dtype is object:
+            lhs, rhs = left.astype(object), right.astype(object)
+
+        result = rhs == lhs
+        expected = np.array([False, False, True])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(result, expected)
+
+        result = lhs != rhs
+        expected = np.array([True, True, False])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(result, expected)
+
+        expected = np.array([False, False, False])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(lhs == NaT, expected)
+        tm.assert_equal(NaT == rhs, expected)
+
+        expected = np.array([True, True, True])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(lhs != NaT, expected)
+        tm.assert_equal(NaT != lhs, expected)
+
+        expected = np.array([False, False, False])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(lhs < NaT, expected)
+        tm.assert_equal(NaT > lhs, expected)
+
+    def test_dti_cmp_nat_behaves_like_float_cmp_nan(self):
+        fidx1 = pd.Index([1.0, np.nan, 3.0, np.nan, 5.0, 7.0])
+        fidx2 = pd.Index([2.0, 3.0, np.nan, np.nan, 6.0, 7.0])
+
+        didx1 = DatetimeIndex(
+            ["2014-01-01", NaT, "2014-03-01", NaT, "2014-05-01", "2014-07-01"]
+        )
+        didx2 = DatetimeIndex(
+            ["2014-02-01", "2014-03-01", NaT, NaT, "2014-06-01", "2014-07-01"]
+        )
+        darr = np.array(
+            [
+                np.datetime64("2014-02-01 00:00"),
+                np.datetime64("2014-03-01 00:00"),
+                np.datetime64("nat"),
+                np.datetime64("nat"),
+                np.datetime64("2014-06-01 00:00"),
+                np.datetime64("2014-07-01 00:00"),
+            ]
+        )
+
+        cases = [(fidx1, fidx2), (didx1, didx2), (didx1, darr)]
+
+        # Check pd.NaT is handles as the same as np.nan
+        with tm.assert_produces_warning(None):
+            for idx1, idx2 in cases:
+                result = idx1 < idx2
+                expected = np.array([True, False, False, False, True, False])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx2 > idx1
+                expected = np.array([True, False, False, False, True, False])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 <= idx2
+                expected = np.array([True, False, False, False, True, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx2 >= idx1
+                expected = np.array([True, False, False, False, True, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 == idx2
+                expected = np.array([False, False, False, False, False, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 != idx2
+                expected = np.array([True, True, True, True, True, False])
+                tm.assert_numpy_array_equal(result, expected)
+
+        with tm.assert_produces_warning(None):
+            for idx1, val in [(fidx1, np.nan), (didx1, NaT)]:
+                result = idx1 < val
+                expected = np.array([False, False, False, False, False, False])
+                tm.assert_numpy_array_equal(result, expected)
+                result = idx1 > val
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 <= val
+                tm.assert_numpy_array_equal(result, expected)
+                result = idx1 >= val
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 == val
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 != val
+                expected = np.array([True, True, True, True, True, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+        # Check pd.NaT is handles as the same as np.nan
+        with tm.assert_produces_warning(None):
+            for idx1, val in [(fidx1, 3), (didx1, datetime(2014, 3, 1))]:
+                result = idx1 < val
+                expected = np.array([True, False, False, False, False, False])
+                tm.assert_numpy_array_equal(result, expected)
+                result = idx1 > val
+                expected = np.array([False, False, False, False, True, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 <= val
+                expected = np.array([True, False, True, False, False, False])
+                tm.assert_numpy_array_equal(result, expected)
+                result = idx1 >= val
+                expected = np.array([False, False, True, False, True, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 == val
+                expected = np.array([False, False, True, False, False, False])
+                tm.assert_numpy_array_equal(result, expected)
+
+                result = idx1 != val
+                expected = np.array([True, True, False, True, True, True])
+                tm.assert_numpy_array_equal(result, expected)
+
+    def test_comparison_tzawareness_compat(self, comparison_op, box_with_array):
+        # GH#18162
+        op = comparison_op
+        box = box_with_array
+
+        dr = date_range("2016-01-01", periods=6, unit="ns")
+        dz = dr.tz_localize("US/Pacific")
+
+        dr = tm.box_expected(dr, box)
+        dz = tm.box_expected(dz, box)
+
+        if box is pd.DataFrame:
+            tolist = lambda x: x.astype(object).values.tolist()[0]
+        else:
+            tolist = list
+
+        if op not in [operator.eq, operator.ne]:
+            msg = (
+                r"Invalid comparison between dtype=datetime64\[ns.*\] "
+                "and (Timestamp|DatetimeArray|list|ndarray)"
+            )
+            with pytest.raises(TypeError, match=msg):
+                op(dr, dz)
+
+            with pytest.raises(TypeError, match=msg):
+                op(dr, tolist(dz))
+            with pytest.raises(TypeError, match=msg):
+                op(dr, np.array(tolist(dz), dtype=object))
+            with pytest.raises(TypeError, match=msg):
+                op(dz, dr)
+
+            with pytest.raises(TypeError, match=msg):
+                op(dz, tolist(dr))
+            with pytest.raises(TypeError, match=msg):
+                op(dz, np.array(tolist(dr), dtype=object))
+
+        # The aware==aware and naive==naive comparisons should *not* raise
+        assert np.all(dr == dr)
+        assert np.all(dr == tolist(dr))
+        assert np.all(tolist(dr) == dr)
+        assert np.all(np.array(tolist(dr), dtype=object) == dr)
+        assert np.all(dr == np.array(tolist(dr), dtype=object))
+
+        assert np.all(dz == dz)
+        assert np.all(dz == tolist(dz))
+        assert np.all(tolist(dz) == dz)
+        assert np.all(np.array(tolist(dz), dtype=object) == dz)
+        assert np.all(dz == np.array(tolist(dz), dtype=object))
+
+    def test_comparison_tzawareness_compat_scalars(self, comparison_op, box_with_array):
+        # GH#18162
+        op = comparison_op
+
+        dr = date_range("2016-01-01", periods=6, unit="ns")
+        dz = dr.tz_localize("US/Pacific")
+
+        dr = tm.box_expected(dr, box_with_array)
+        dz = tm.box_expected(dz, box_with_array)
+
+        # Check comparisons against scalar Timestamps
+        ts = Timestamp("2000-03-14 01:59")
+        ts_tz = Timestamp("2000-03-14 01:59", tz="Europe/Amsterdam")
+
+        assert np.all(dr > ts)
+        msg = r"Invalid comparison between dtype=datetime64\[ns.*\] and Timestamp"
+        if op not in [operator.eq, operator.ne]:
+            with pytest.raises(TypeError, match=msg):
+                op(dr, ts_tz)
+
+        assert np.all(dz > ts_tz)
+        if op not in [operator.eq, operator.ne]:
+            with pytest.raises(TypeError, match=msg):
+                op(dz, ts)
+
+        if op not in [operator.eq, operator.ne]:
+            # GH#12601: Check comparison against Timestamps and DatetimeIndex
+            with pytest.raises(TypeError, match=msg):
+                op(ts, dz)
+
+    @pytest.mark.parametrize(
+        "other",
+        [datetime(2016, 1, 1), Timestamp("2016-01-01"), np.datetime64("2016-01-01")],
+    )
+    # Bug in NumPy? https://github.com/numpy/numpy/issues/13841
+    # Raising in __eq__ will fallback to NumPy, which warns, fails,
+    # then re-raises the original exception. So we just need to ignore.
+    @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning")
+    def test_scalar_comparison_tzawareness(
+        self, comparison_op, other, tz_aware_fixture, box_with_array
+    ):
+        op = comparison_op
+        tz = tz_aware_fixture
+        dti = date_range("2016-01-01", periods=2, tz=tz, unit="ns")
+
+        dtarr = tm.box_expected(dti, box_with_array)
+        xbox = get_upcast_box(dtarr, other, True)
+        if op in [operator.eq, operator.ne]:
+            exbool = op is operator.ne
+            expected = np.array([exbool, exbool], dtype=bool)
+            expected = tm.box_expected(expected, xbox)
+
+            result = op(dtarr, other)
+            tm.assert_equal(result, expected)
+
+            result = op(other, dtarr)
+            tm.assert_equal(result, expected)
+        else:
+            msg = (
+                r"Invalid comparison between dtype=datetime64\[ns, .*\] "
+                f"and {type(other).__name__}"
+            )
+            with pytest.raises(TypeError, match=msg):
+                op(dtarr, other)
+            with pytest.raises(TypeError, match=msg):
+                op(other, dtarr)
+
+    def test_nat_comparison_tzawareness(self, comparison_op):
+        # GH#19276
+        # tzaware DatetimeIndex should not raise when compared to NaT
+        op = comparison_op
+
+        dti = DatetimeIndex(
+            ["2014-01-01", NaT, "2014-03-01", NaT, "2014-05-01", "2014-07-01"]
+        )
+        expected = np.array([op == operator.ne] * len(dti))
+        result = op(dti, NaT)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = op(dti.tz_localize("US/Pacific"), NaT)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_dti_cmp_str(self, tz_naive_fixture):
+        # GH#22074
+        # regardless of tz, we expect these comparisons are valid
+        tz = tz_naive_fixture
+        rng = date_range("1/1/2000", periods=10, tz=tz)
+        other = "1/1/2000"
+
+        result = rng == other
+        expected = np.array([True] + [False] * 9)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng != other
+        expected = np.array([False] + [True] * 9)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng < other
+        expected = np.array([False] * 10)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng <= other
+        expected = np.array([True] + [False] * 9)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng > other
+        expected = np.array([False] + [True] * 9)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng >= other
+        expected = np.array([True] * 10)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_dti_cmp_list(self):
+        rng = date_range("1/1/2000", periods=10)
+
+        result = rng == list(rng)
+        expected = rng == rng
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            pd.timedelta_range("1D", periods=10),
+            pd.timedelta_range("1D", periods=10).to_series(),
+            pd.timedelta_range("1D", periods=10).asi8.view("m8[ns]"),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_dti_cmp_tdi_tzawareness(self, other):
+        # GH#22074
+        # reversion test that we _don't_ call _assert_tzawareness_compat
+        # when comparing against TimedeltaIndex
+        dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo")
+
+        result = dti == other
+        expected = np.array([False] * 10)
+        if isinstance(other, Series):
+            tm.assert_series_equal(result, Series(expected, index=other.index))
+        else:
+            tm.assert_numpy_array_equal(result, expected)
+
+        result = dti != other
+        expected = np.array([True] * 10)
+        if isinstance(other, Series):
+            tm.assert_series_equal(result, Series(expected, index=other.index))
+        else:
+            tm.assert_numpy_array_equal(result, expected)
+
+        msg = "Invalid comparison between"
+        with pytest.raises(TypeError, match=msg):
+            dti < other
+        with pytest.raises(TypeError, match=msg):
+            dti <= other
+        with pytest.raises(TypeError, match=msg):
+            dti > other
+        with pytest.raises(TypeError, match=msg):
+            dti >= other
+
+    def test_dti_cmp_object_dtype(self):
+        # GH#22074
+        dti = date_range("2000-01-01", periods=10, tz="Asia/Tokyo")
+
+        other = dti.astype("O")
+
+        result = dti == other
+        expected = np.array([True] * 10)
+        tm.assert_numpy_array_equal(result, expected)
+
+        other = dti.tz_localize(None)
+        result = dti != other
+        tm.assert_numpy_array_equal(result, expected)
+
+        other = np.array(list(dti[:5]) + [Timedelta(days=1)] * 5)
+        result = dti == other
+        expected = np.array([True] * 5 + [False] * 5)
+        tm.assert_numpy_array_equal(result, expected)
+        msg = ">=' not supported between instances of 'Timestamp' and 'Timedelta'"
+        with pytest.raises(TypeError, match=msg):
+            dti >= other
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+
+class TestDatetime64Arithmetic:
+    # This class is intended for "finished" tests that are fully parametrized
+    #  over DataFrame/Series/Index/DatetimeArray
+
+    # -------------------------------------------------------------
+    # Addition/Subtraction of timedelta-like
+
+    @pytest.mark.arm_slow
+    def test_dt64arr_add_timedeltalike_scalar(
+        self, tz_naive_fixture, two_hours, box_with_array
+    ):
+        # GH#22005, GH#22163 check DataFrame doesn't raise TypeError
+        tz = tz_naive_fixture
+
+        rng = date_range("2000-01-01", "2000-02-01", tz=tz, unit="ns")
+        expected = date_range("2000-01-01 02:00", "2000-02-01 02:00", tz=tz, unit="ns")
+        if tz is not None:
+            expected = expected._with_freq(None)
+
+        rng = tm.box_expected(rng, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = rng + two_hours
+        tm.assert_equal(result, expected)
+
+        result = two_hours + rng
+        tm.assert_equal(result, expected)
+
+        rng += two_hours
+        tm.assert_equal(rng, expected)
+
+    def test_dt64arr_sub_timedeltalike_scalar(
+        self, tz_naive_fixture, two_hours, box_with_array
+    ):
+        tz = tz_naive_fixture
+
+        rng = date_range("2000-01-01", "2000-02-01", tz=tz, unit="ns")
+        expected = date_range("1999-12-31 22:00", "2000-01-31 22:00", tz=tz, unit="ns")
+        if tz is not None:
+            expected = expected._with_freq(None)
+
+        rng = tm.box_expected(rng, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = rng - two_hours
+        tm.assert_equal(result, expected)
+
+        rng -= two_hours
+        tm.assert_equal(rng, expected)
+
+    def test_dt64_array_sub_dt_with_different_timezone(self, box_with_array):
+        t1 = date_range("20130101", periods=3).tz_localize("US/Eastern")
+        t1 = tm.box_expected(t1, box_with_array)
+        t2 = Timestamp("20130101").tz_localize("CET")
+        tnaive = Timestamp(20130101)
+
+        result = t1 - t2
+        expected = TimedeltaIndex(
+            ["0 days 06:00:00", "1 days 06:00:00", "2 days 06:00:00"]
+        )
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        result = t2 - t1
+        expected = TimedeltaIndex(
+            ["-1 days +18:00:00", "-2 days +18:00:00", "-3 days +18:00:00"]
+        )
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        msg = "Cannot subtract tz-naive and tz-aware datetime-like objects"
+        with pytest.raises(TypeError, match=msg):
+            t1 - tnaive
+
+        with pytest.raises(TypeError, match=msg):
+            tnaive - t1
+
+    def test_dt64_array_sub_dt64_array_with_different_timezone(self, box_with_array):
+        t1 = date_range("20130101", periods=3).tz_localize("US/Eastern")
+        t1 = tm.box_expected(t1, box_with_array)
+        t2 = date_range("20130101", periods=3).tz_localize("CET")
+        t2 = tm.box_expected(t2, box_with_array)
+        tnaive = date_range("20130101", periods=3)
+
+        result = t1 - t2
+        expected = TimedeltaIndex(
+            ["0 days 06:00:00", "0 days 06:00:00", "0 days 06:00:00"]
+        )
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        result = t2 - t1
+        expected = TimedeltaIndex(
+            ["-1 days +18:00:00", "-1 days +18:00:00", "-1 days +18:00:00"]
+        )
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        msg = "Cannot subtract tz-naive and tz-aware datetime-like objects"
+        with pytest.raises(TypeError, match=msg):
+            t1 - tnaive
+
+        with pytest.raises(TypeError, match=msg):
+            tnaive - t1
+
+    def test_dt64arr_add_sub_td64_nat(self, box_with_array, tz_naive_fixture):
+        # GH#23320 special handling for timedelta64("NaT")
+        tz = tz_naive_fixture
+
+        dti = date_range("1994-04-01", periods=9, tz=tz, freq="QS", unit="ns")
+        other = np.timedelta64("NaT")
+        expected = DatetimeIndex(["NaT"] * 9, tz=tz).as_unit("ns")
+
+        obj = tm.box_expected(dti, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = obj + other
+        tm.assert_equal(result, expected)
+        result = other + obj
+        tm.assert_equal(result, expected)
+        result = obj - other
+        tm.assert_equal(result, expected)
+        msg = "cannot subtract"
+        with pytest.raises(TypeError, match=msg):
+            other - obj
+
+    def test_dt64arr_add_sub_td64ndarray(self, tz_naive_fixture, box_with_array):
+        tz = tz_naive_fixture
+        dti = date_range("2016-01-01", periods=3, tz=tz)
+        tdi = TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"])
+        tdarr = tdi.values
+
+        expected = date_range("2015-12-31", "2016-01-02", periods=3, tz=tz)
+
+        dtarr = tm.box_expected(dti, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = dtarr + tdarr
+        tm.assert_equal(result, expected)
+        result = tdarr + dtarr
+        tm.assert_equal(result, expected)
+
+        expected = date_range("2016-01-02", "2016-01-04", periods=3, tz=tz)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = dtarr - tdarr
+        tm.assert_equal(result, expected)
+        msg = "|".join(
+            [
+                "cannot subtract DatetimeArray from ndarray",
+                "cannot subtract a datelike from a TimedeltaArray",
+                "cannot subtract DatetimeArray from Timedelta",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            tdarr - dtarr
+
+    # -----------------------------------------------------------------
+    # Subtraction of datetime-like scalars
+
+    @pytest.mark.parametrize(
+        "ts",
+        [
+            Timestamp("2013-01-01"),
+            Timestamp("2013-01-01").to_pydatetime(),
+            Timestamp("2013-01-01").to_datetime64(),
+            # GH#7996, GH#22163 ensure non-nano datetime64 is converted to nano
+            #  for DataFrame operation
+            np.datetime64("2013-01-01", "D"),
+        ],
+    )
+    def test_dt64arr_sub_dtscalar(self, box_with_array, ts):
+        # GH#8554, GH#22163 DataFrame op should _not_ return dt64 dtype
+        idx = date_range("2013-01-01", periods=3)._with_freq(None)
+        idx = tm.box_expected(idx, box_with_array)
+
+        expected = TimedeltaIndex(["0 Days", "1 Day", "2 Days"])
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = idx - ts
+        tm.assert_equal(result, expected)
+
+        result = ts - idx
+        tm.assert_equal(result, -expected)
+        tm.assert_equal(result, -expected)
+
+    def test_dt64arr_sub_timestamp_tzaware(self, box_with_array):
+        ser = date_range("2014-03-17", periods=2, freq="D", tz="US/Eastern", unit="ns")
+        ser = ser._with_freq(None)
+        ts = ser[0]
+
+        ser = tm.box_expected(ser, box_with_array)
+
+        delta_series = Series(
+            [np.timedelta64(0, "D"), np.timedelta64(1, "D")], dtype="m8[ns]"
+        )
+        expected = tm.box_expected(delta_series, box_with_array)
+
+        tm.assert_equal(ser - ts, expected)
+        tm.assert_equal(ts - ser, -expected)
+
+    def test_dt64arr_sub_NaT(self, box_with_array, unit):
+        # GH#18808
+        dti = DatetimeIndex([NaT, Timestamp("19900315")]).as_unit(unit)
+        ser = tm.box_expected(dti, box_with_array)
+
+        result = ser - NaT
+        expected = Series([NaT, NaT], dtype=f"timedelta64[{unit}]")
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        dti_tz = dti.tz_localize("Asia/Tokyo")
+        ser_tz = tm.box_expected(dti_tz, box_with_array)
+
+        result = ser_tz - NaT
+        expected = Series([NaT, NaT], dtype=f"timedelta64[{unit}]")
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+    # -------------------------------------------------------------
+    # Subtraction of datetime-like array-like
+
+    def test_dt64arr_sub_dt64object_array(
+        self, performance_warning, box_with_array, tz_naive_fixture
+    ):
+        dti = date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
+        expected = dti - dti
+
+        obj = tm.box_expected(dti, box_with_array)
+        expected = tm.box_expected(expected, box_with_array).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            result = obj - obj.astype(object)
+        tm.assert_equal(result, expected)
+
+    def test_dt64arr_naive_sub_dt64ndarray(self, box_with_array):
+        dti = date_range("2016-01-01", periods=3, tz=None)
+        dt64vals = dti.values
+
+        dtarr = tm.box_expected(dti, box_with_array)
+
+        expected = dtarr - dtarr
+        result = dtarr - dt64vals
+        tm.assert_equal(result, expected)
+        result = dt64vals - dtarr
+        tm.assert_equal(result, expected)
+
+    def test_dt64arr_aware_sub_dt64ndarray_raises(
+        self, tz_aware_fixture, box_with_array
+    ):
+        tz = tz_aware_fixture
+        dti = date_range("2016-01-01", periods=3, tz=tz)
+        dt64vals = dti.values
+
+        dtarr = tm.box_expected(dti, box_with_array)
+        msg = "Cannot subtract tz-naive and tz-aware datetime"
+        with pytest.raises(TypeError, match=msg):
+            dtarr - dt64vals
+        with pytest.raises(TypeError, match=msg):
+            dt64vals - dtarr
+
+    # -------------------------------------------------------------
+    # Addition of datetime-like others (invalid)
+
+    def test_dt64arr_add_dtlike_raises(self, tz_naive_fixture, box_with_array):
+        # GH#22163 ensure DataFrame doesn't cast Timestamp to i8
+        # GH#9631
+        tz = tz_naive_fixture
+
+        dti = date_range("2016-01-01", periods=3, tz=tz)
+        if tz is None:
+            dti2 = dti.tz_localize("US/Eastern")
+        else:
+            dti2 = dti.tz_localize(None)
+        dtarr = tm.box_expected(dti, box_with_array)
+
+        assert_cannot_add(dtarr, dti.values)
+        assert_cannot_add(dtarr, dti)
+        assert_cannot_add(dtarr, dtarr)
+        assert_cannot_add(dtarr, dti[0])
+        assert_cannot_add(dtarr, dti[0].to_pydatetime())
+        assert_cannot_add(dtarr, dti[0].to_datetime64())
+        assert_cannot_add(dtarr, dti2[0])
+        assert_cannot_add(dtarr, dti2[0].to_pydatetime())
+        assert_cannot_add(dtarr, np.datetime64("2011-01-01", "D"))
+
+    # -------------------------------------------------------------
+    # Other Invalid Addition/Subtraction
+
+    # Note: freq here includes both Tick and non-Tick offsets; this is
+    #  relevant because historically integer-addition was allowed if we had
+    #  a freq.
+    @pytest.mark.parametrize("freq", ["h", "D", "W", "2ME", "MS", "QE", "B", None])
+    @pytest.mark.parametrize("dtype", [None, "uint8"])
+    def test_dt64arr_addsub_intlike(
+        self, dtype, index_or_series_or_array, freq, tz_naive_fixture
+    ):
+        # GH#19959, GH#19123, GH#19012
+        # GH#55860 use index_or_series_or_array instead of box_with_array
+        #  bc DataFrame alignment makes it inapplicable
+        tz = tz_naive_fixture
+
+        if freq is None:
+            dti = DatetimeIndex(["NaT", "2017-04-05 06:07:08"], tz=tz)
+        else:
+            dti = date_range("2016-01-01", periods=2, freq=freq, tz=tz)
+
+        obj = index_or_series_or_array(dti)
+        other = np.array([4, -1])
+        if dtype is not None:
+            other = other.astype(dtype)
+
+        msg = "|".join(
+            [
+                "Addition/subtraction of integers",
+                "cannot subtract DatetimeArray from",
+                # IntegerArray
+                "can only perform ops with numeric values",
+                "unsupported operand type.*Categorical",
+                r"unsupported operand type\(s\) for -: 'int' and 'Timestamp'",
+            ]
+        )
+        assert_invalid_addsub_type(obj, 1, msg)
+        assert_invalid_addsub_type(obj, np.int64(2), msg)
+        assert_invalid_addsub_type(obj, np.array(3, dtype=np.int64), msg)
+        assert_invalid_addsub_type(obj, other, msg)
+        assert_invalid_addsub_type(obj, np.array(other), msg)
+        assert_invalid_addsub_type(obj, pd.array(other), msg)
+        assert_invalid_addsub_type(obj, pd.Categorical(other), msg)
+        assert_invalid_addsub_type(obj, pd.Index(other), msg)
+        assert_invalid_addsub_type(obj, Series(other), msg)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            3.14,
+            np.array([2.0, 3.0]),
+            # GH#13078 datetime +/- Period is invalid
+            Period("2011-01-01", freq="D"),
+            # https://github.com/pandas-dev/pandas/issues/10329
+            time(1, 2, 3),
+        ],
+    )
+    @pytest.mark.parametrize("dti_freq", [None, "D"])
+    def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array):
+        dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq)
+        dtarr = tm.box_expected(dti, box_with_array)
+        msg = "|".join(
+            [
+                "unsupported operand type",
+                "cannot (add|subtract)",
+                "cannot use operands with types",
+                "ufunc '?(add|subtract)'? cannot use operands with types",
+                "Concatenation operation is not implemented for NumPy arrays",
+            ]
+        )
+        assert_invalid_addsub_type(dtarr, other, msg)
+
+    @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"])
+    @pytest.mark.parametrize("dti_freq", [None, "D"])
+    def test_dt64arr_add_sub_parr(
+        self, dti_freq, pi_freq, box_with_array, box_with_array2
+    ):
+        # GH#20049 subtracting PeriodIndex should raise TypeError
+        dti = DatetimeIndex(["2011-01-01", "2011-01-02"], freq=dti_freq)
+        pi = dti.to_period(pi_freq)
+
+        dtarr = tm.box_expected(dti, box_with_array)
+        parr = tm.box_expected(pi, box_with_array2)
+        msg = "|".join(
+            [
+                "cannot (add|subtract)",
+                "unsupported operand",
+                "descriptor.*requires",
+                "ufunc.*cannot use operands",
+            ]
+        )
+        assert_invalid_addsub_type(dtarr, parr, msg)
+
+    @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
+    def test_dt64arr_addsub_time_objects_raises(self, box_with_array, tz_naive_fixture):
+        # https://github.com/pandas-dev/pandas/issues/10329
+
+        tz = tz_naive_fixture
+
+        obj1 = date_range("2012-01-01", periods=3, tz=tz)
+        obj2 = [time(i, i, i) for i in range(3)]
+
+        obj1 = tm.box_expected(obj1, box_with_array)
+        obj2 = tm.box_expected(obj2, box_with_array)
+
+        msg = "|".join(
+            [
+                "unsupported operand",
+                "cannot subtract DatetimeArray from ndarray",
+            ]
+        )
+        # pandas.errors.PerformanceWarning: Non-vectorized DateOffset being
+        # applied to Series or DatetimeIndex
+        # we aren't testing that here, so ignore.
+        assert_invalid_addsub_type(obj1, obj2, msg=msg)
+
+    # -------------------------------------------------------------
+    # Other invalid operations
+
+    @pytest.mark.parametrize(
+        "dt64_series",
+        [
+            Series([Timestamp("19900315"), Timestamp("19900315")]),
+            Series([NaT, Timestamp("19900315")]),
+            Series([NaT, NaT], dtype="datetime64[ns]"),
+        ],
+    )
+    @pytest.mark.parametrize("one", [1, 1.0, np.array(1)])
+    def test_dt64_mul_div_numeric_invalid(self, one, dt64_series, box_with_array):
+        obj = tm.box_expected(dt64_series, box_with_array)
+
+        msg = "cannot perform .* with this index type"
+
+        # multiplication
+        with pytest.raises(TypeError, match=msg):
+            obj * one
+        with pytest.raises(TypeError, match=msg):
+            one * obj
+
+        # division
+        with pytest.raises(TypeError, match=msg):
+            obj / one
+        with pytest.raises(TypeError, match=msg):
+            one / obj
+
+
+class TestDatetime64DateOffsetArithmetic:
+    # -------------------------------------------------------------
+    # Tick DateOffsets
+
+    # TODO: parametrize over timezone?
+    def test_dt64arr_series_add_tick_DateOffset(self, box_with_array, unit):
+        # GH#4532
+        # operate with pd.offsets
+        ser = Series(
+            [Timestamp("20130101 9:01"), Timestamp("20130101 9:02")]
+        ).dt.as_unit(unit)
+        expected = Series(
+            [Timestamp("20130101 9:01:05"), Timestamp("20130101 9:02:05")]
+        ).dt.as_unit(unit)
+
+        ser = tm.box_expected(ser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = ser + pd.offsets.Second(5)
+        tm.assert_equal(result, expected)
+
+        result2 = pd.offsets.Second(5) + ser
+        tm.assert_equal(result2, expected)
+
+    def test_dt64arr_series_sub_tick_DateOffset(self, box_with_array):
+        # GH#4532
+        # operate with pd.offsets
+        ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")])
+        expected = Series(
+            [Timestamp("20130101 9:00:55"), Timestamp("20130101 9:01:55")]
+        )
+
+        ser = tm.box_expected(ser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = ser - pd.offsets.Second(5)
+        tm.assert_equal(result, expected)
+
+        result2 = -pd.offsets.Second(5) + ser
+        tm.assert_equal(result2, expected)
+        msg = "cannot subtract DatetimeArray from Second"
+        with pytest.raises(TypeError, match=msg):
+            pd.offsets.Second(5) - ser
+
+    @pytest.mark.parametrize(
+        "cls_name", ["Day", "Hour", "Minute", "Second", "Milli", "Micro", "Nano"]
+    )
+    def test_dt64arr_add_sub_tick_DateOffset_smoke(self, cls_name, box_with_array):
+        # GH#4532
+        # smoke tests for valid DateOffsets
+        ser = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")])
+        ser = tm.box_expected(ser, box_with_array)
+
+        offset_cls = getattr(pd.offsets, cls_name)
+        ser + offset_cls(5)
+        offset_cls(5) + ser
+        ser - offset_cls(5)
+
+    def test_dti_add_tick_tzaware(self, tz_aware_fixture, box_with_array):
+        # GH#21610, GH#22163 ensure DataFrame doesn't return object-dtype
+        tz = tz_aware_fixture
+        if tz == "US/Pacific":
+            dates = date_range("2012-11-01", periods=3, tz=tz, unit="ns")
+            offset = dates + pd.offsets.Hour(5)
+            assert dates[0] + pd.offsets.Hour(5) == offset[0]
+
+        dates = date_range("2010-11-01 00:00", periods=3, tz=tz, freq="h", unit="ns")
+        expected = DatetimeIndex(
+            ["2010-11-01 05:00", "2010-11-01 06:00", "2010-11-01 07:00"],
+            freq="h",
+            tz=tz,
+        ).as_unit("ns")
+
+        dates = tm.box_expected(dates, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        for scalar in [pd.offsets.Hour(5), np.timedelta64(5, "h"), timedelta(hours=5)]:
+            offset = dates + scalar
+            tm.assert_equal(offset, expected)
+            offset = scalar + dates
+            tm.assert_equal(offset, expected)
+
+            roundtrip = offset - scalar
+            tm.assert_equal(roundtrip, dates)
+
+            msg = "cannot subtract DatetimeArray from"
+            with pytest.raises(TypeError, match=msg):
+                scalar - dates
+
+    # -------------------------------------------------------------
+    # RelativeDelta DateOffsets
+
+    def test_dt64arr_add_sub_relativedelta_offsets(self, box_with_array, unit):
+        # GH#10699
+        vec = DatetimeIndex(
+            [
+                Timestamp("2000-01-05 00:15:00"),
+                Timestamp("2000-01-31 00:23:00"),
+                Timestamp("2000-01-01"),
+                Timestamp("2000-03-31"),
+                Timestamp("2000-02-29"),
+                Timestamp("2000-12-31"),
+                Timestamp("2000-05-15"),
+                Timestamp("2001-06-15"),
+            ]
+        ).as_unit(unit)
+        vec = tm.box_expected(vec, box_with_array)
+        vec_items = vec.iloc[0] if box_with_array is pd.DataFrame else vec
+
+        # DateOffset relativedelta fastpath
+        relative_kwargs = [
+            ("years", 2),
+            ("months", 5),
+            ("days", 3),
+            ("hours", 5),
+            ("minutes", 10),
+            ("seconds", 2),
+            ("microseconds", 5),
+        ]
+        for i, (offset_unit, value) in enumerate(relative_kwargs):
+            off = DateOffset(**{offset_unit: value})
+
+            exp_unit = unit
+            if offset_unit == "microseconds" and unit != "ns":
+                exp_unit = "us"
+
+            # TODO(GH#55564): as_unit will be unnecessary
+            expected = DatetimeIndex([x + off for x in vec_items]).as_unit(exp_unit)
+            expected = tm.box_expected(expected, box_with_array)
+            tm.assert_equal(expected, vec + off)
+
+            expected = DatetimeIndex([x - off for x in vec_items]).as_unit(exp_unit)
+            expected = tm.box_expected(expected, box_with_array)
+            tm.assert_equal(expected, vec - off)
+
+            off = DateOffset(**dict(relative_kwargs[: i + 1]))
+
+            expected = DatetimeIndex([x + off for x in vec_items]).as_unit(exp_unit)
+            expected = tm.box_expected(expected, box_with_array)
+            tm.assert_equal(expected, vec + off)
+
+            expected = DatetimeIndex([x - off for x in vec_items]).as_unit(exp_unit)
+            expected = tm.box_expected(expected, box_with_array)
+            tm.assert_equal(expected, vec - off)
+            msg = "cannot subtract DatetimeArray from"
+            with pytest.raises(TypeError, match=msg):
+                off - vec
+
+    # -------------------------------------------------------------
+    # Non-Tick, Non-RelativeDelta DateOffsets
+
+    # TODO: redundant with test_dt64arr_add_sub_DateOffset?  that includes
+    #  tz-aware cases which this does not
+    @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
+    @pytest.mark.parametrize(
+        "cls_and_kwargs",
+        [
+            "YearBegin",
+            ("YearBegin", {"month": 5}),
+            "YearEnd",
+            ("YearEnd", {"month": 5}),
+            "MonthBegin",
+            "MonthEnd",
+            "SemiMonthEnd",
+            "SemiMonthBegin",
+            "Week",
+            ("Week", {"weekday": 3}),
+            ("Week", {"weekday": 6}),
+            "BusinessDay",
+            "BDay",
+            "QuarterEnd",
+            "QuarterBegin",
+            "CustomBusinessDay",
+            "CDay",
+            "CBMonthEnd",
+            "CBMonthBegin",
+            "BMonthBegin",
+            "BMonthEnd",
+            "BusinessHour",
+            "BYearBegin",
+            "BYearEnd",
+            "BQuarterBegin",
+            ("LastWeekOfMonth", {"weekday": 2}),
+            (
+                "FY5253Quarter",
+                {
+                    "qtr_with_extra_week": 1,
+                    "startingMonth": 1,
+                    "weekday": 2,
+                    "variation": "nearest",
+                },
+            ),
+            ("FY5253", {"weekday": 0, "startingMonth": 2, "variation": "nearest"}),
+            ("WeekOfMonth", {"weekday": 2, "week": 2}),
+            "Easter",
+            ("DateOffset", {"day": 4}),
+            ("DateOffset", {"month": 5}),
+        ],
+    )
+    @pytest.mark.parametrize("normalize", [True, False])
+    @pytest.mark.parametrize("n", [0, 5])
+    @pytest.mark.parametrize("tz", [None, "US/Central"])
+    def test_dt64arr_add_sub_DateOffsets(
+        self, box_with_array, n, normalize, cls_and_kwargs, unit, tz
+    ):
+        # GH#10699
+        # assert vectorized operation matches pointwise operations
+
+        if isinstance(cls_and_kwargs, tuple):
+            # If cls_name param is a tuple, then 2nd entry is kwargs for
+            # the offset constructor
+            cls_name, kwargs = cls_and_kwargs
+        else:
+            cls_name = cls_and_kwargs
+            kwargs = {}
+
+        if n == 0 and cls_name in [
+            "WeekOfMonth",
+            "LastWeekOfMonth",
+            "FY5253Quarter",
+            "FY5253",
+        ]:
+            # passing n = 0 is invalid for these offset classes
+            return
+
+        vec = (
+            DatetimeIndex(
+                [
+                    Timestamp("2000-01-05 00:15:00"),
+                    Timestamp("2000-01-31 00:23:00"),
+                    Timestamp("2000-01-01"),
+                    Timestamp("2000-03-31"),
+                    Timestamp("2000-02-29"),
+                    Timestamp("2000-12-31"),
+                    Timestamp("2000-05-15"),
+                    Timestamp("2001-06-15"),
+                ]
+            )
+            .as_unit(unit)
+            .tz_localize(tz)
+        )
+        vec = tm.box_expected(vec, box_with_array)
+        vec_items = vec.iloc[0] if box_with_array is pd.DataFrame else vec
+
+        offset_cls = getattr(pd.offsets, cls_name)
+        offset = offset_cls(n, normalize=normalize, **kwargs)
+
+        # TODO(GH#55564): as_unit will be unnecessary
+        expected = DatetimeIndex([x + offset for x in vec_items]).as_unit(unit)
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(expected, vec + offset)
+        tm.assert_equal(expected, offset + vec)
+
+        expected = DatetimeIndex([x - offset for x in vec_items]).as_unit(unit)
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(expected, vec - offset)
+
+        expected = DatetimeIndex([offset + x for x in vec_items]).as_unit(unit)
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(expected, offset + vec)
+        msg = "cannot subtract DatetimeArray from"
+        with pytest.raises(TypeError, match=msg):
+            offset - vec
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            [pd.offsets.MonthEnd(), pd.offsets.Day(n=2)],
+            [pd.offsets.DateOffset(years=1), pd.offsets.MonthEnd()],
+            # matching offsets
+            [pd.offsets.DateOffset(years=1), pd.offsets.DateOffset(years=1)],
+        ],
+    )
+    @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub])
+    def test_dt64arr_add_sub_offset_array(
+        self, performance_warning, tz_naive_fixture, box_with_array, op, other
+    ):
+        # GH#18849
+        # GH#10699 array of offsets
+
+        tz = tz_naive_fixture
+        dti = date_range("2017-01-01", periods=2, tz=tz)
+        dtarr = tm.box_expected(dti, box_with_array)
+        other = np.array(other)
+        expected = DatetimeIndex([op(dti[n], other[n]) for n in range(len(dti))])
+        expected = tm.box_expected(expected, box_with_array).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = op(dtarr, other)
+        tm.assert_equal(res, expected)
+
+        # Same thing but boxing other
+        other = tm.box_expected(other, box_with_array)
+        if box_with_array is pd.array and op is roperator.radd:
+            # We expect a NumpyExtensionArray, not ndarray[object] here
+            expected = pd.array(expected, dtype=object)
+        with tm.assert_produces_warning(performance_warning):
+            res = op(dtarr, other)
+        tm.assert_equal(res, expected)
+
+    @pytest.mark.parametrize(
+        "op, offset, exp, exp_freq",
+        [
+            (
+                "__add__",
+                DateOffset(months=3, days=10),
+                [
+                    Timestamp("2014-04-11"),
+                    Timestamp("2015-04-11"),
+                    Timestamp("2016-04-11"),
+                    Timestamp("2017-04-11"),
+                ],
+                None,
+            ),
+            (
+                "__add__",
+                DateOffset(months=3),
+                [
+                    Timestamp("2014-04-01"),
+                    Timestamp("2015-04-01"),
+                    Timestamp("2016-04-01"),
+                    Timestamp("2017-04-01"),
+                ],
+                "YS-APR",
+            ),
+            (
+                "__sub__",
+                DateOffset(months=3, days=10),
+                [
+                    Timestamp("2013-09-21"),
+                    Timestamp("2014-09-21"),
+                    Timestamp("2015-09-21"),
+                    Timestamp("2016-09-21"),
+                ],
+                None,
+            ),
+            (
+                "__sub__",
+                DateOffset(months=3),
+                [
+                    Timestamp("2013-10-01"),
+                    Timestamp("2014-10-01"),
+                    Timestamp("2015-10-01"),
+                    Timestamp("2016-10-01"),
+                ],
+                "YS-OCT",
+            ),
+        ],
+    )
+    def test_dti_add_sub_nonzero_mth_offset(
+        self, op, offset, exp, exp_freq, tz_aware_fixture, box_with_array
+    ):
+        # GH 26258
+        tz = tz_aware_fixture
+        date = date_range(
+            start="01 Jan 2014", end="01 Jan 2017", freq="YS", tz=tz, unit="ns"
+        )
+        date = tm.box_expected(date, box_with_array, False)
+        mth = getattr(date, op)
+        result = mth(offset)
+
+        expected = DatetimeIndex(exp, tz=tz).as_unit("ns")
+        expected = tm.box_expected(expected, box_with_array, False)
+        tm.assert_equal(result, expected)
+
+    def test_dt64arr_series_add_DateOffset_with_milli(self):
+        # GH 57529
+        dti = DatetimeIndex(
+            [
+                "2000-01-01 00:00:00.012345678",
+                "2000-01-31 00:00:00.012345678",
+                "2000-02-29 00:00:00.012345678",
+            ],
+            dtype="datetime64[ns]",
+        )
+        result = dti + DateOffset(milliseconds=4)
+        expected = DatetimeIndex(
+            [
+                "2000-01-01 00:00:00.016345678",
+                "2000-01-31 00:00:00.016345678",
+                "2000-02-29 00:00:00.016345678",
+            ],
+            dtype="datetime64[ns]",
+        )
+        tm.assert_index_equal(result, expected)
+
+        result = dti + DateOffset(days=1, milliseconds=4)
+        expected = DatetimeIndex(
+            [
+                "2000-01-02 00:00:00.016345678",
+                "2000-02-01 00:00:00.016345678",
+                "2000-03-01 00:00:00.016345678",
+            ],
+            dtype="datetime64[ns]",
+        )
+        tm.assert_index_equal(result, expected)
+
+
+class TestDatetime64OverflowHandling:
+    # TODO: box + de-duplicate
+
+    def test_dt64_overflow_masking(self, box_with_array):
+        # GH#25317
+        left = Series([Timestamp("1969-12-31")], dtype="M8[ns]")
+        right = Series([NaT])
+
+        left = tm.box_expected(left, box_with_array)
+        right = tm.box_expected(right, box_with_array)
+
+        expected = TimedeltaIndex([NaT], dtype="m8[ns]")
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = left - right
+        tm.assert_equal(result, expected)
+
+    def test_dt64_series_arith_overflow(self):
+        # GH#12534, fixed by GH#19024
+        dt = Timestamp("1700-01-31")
+        td = Timedelta("20000 Days")
+        dti = date_range("1949-09-30", freq="100YE", periods=4, unit="ns")
+        ser = Series(dti)
+        msg = "Overflow in int64 addition"
+        with pytest.raises(OverflowError, match=msg):
+            ser - dt
+        with pytest.raises(OverflowError, match=msg):
+            dt - ser
+        with pytest.raises(OverflowError, match=msg):
+            ser + td
+        with pytest.raises(OverflowError, match=msg):
+            td + ser
+
+        ser.iloc[-1] = NaT
+        expected = Series(
+            ["2004-10-03", "2104-10-04", "2204-10-04", "NaT"], dtype="datetime64[ns]"
+        )
+        res = ser + td
+        tm.assert_series_equal(res, expected)
+        res = td + ser
+        tm.assert_series_equal(res, expected)
+
+        ser.iloc[1:] = NaT
+        expected = Series(["91279 Days", "NaT", "NaT", "NaT"], dtype="timedelta64[ns]")
+        res = ser - dt
+        tm.assert_series_equal(res, expected)
+        res = dt - ser
+        tm.assert_series_equal(res, -expected)
+
+    def test_datetimeindex_sub_timestamp_overflow(self):
+        dtimax = pd.to_datetime(["2021-12-28 17:19", Timestamp.max]).as_unit("ns")
+        dtimin = pd.to_datetime(["2021-12-28 17:19", Timestamp.min]).as_unit("ns")
+
+        tsneg = Timestamp("1950-01-01").as_unit("ns")
+        ts_neg_variants = [
+            tsneg,
+            tsneg.to_pydatetime(),
+            tsneg.to_datetime64().astype("datetime64[ns]"),
+            tsneg.to_datetime64().astype("datetime64[D]"),
+        ]
+
+        tspos = Timestamp("1980-01-01").as_unit("ns")
+        ts_pos_variants = [
+            tspos,
+            tspos.to_pydatetime(),
+            tspos.to_datetime64().astype("datetime64[ns]"),
+            tspos.to_datetime64().astype("datetime64[D]"),
+        ]
+        msg = "Overflow in int64 addition"
+        for variant in ts_neg_variants:
+            with pytest.raises(OverflowError, match=msg):
+                dtimax - variant
+
+        expected = Timestamp.max._value - tspos._value
+        for variant in ts_pos_variants:
+            res = dtimax - variant
+            assert res[1]._value == expected
+
+        expected = Timestamp.min._value - tsneg._value
+        for variant in ts_neg_variants:
+            res = dtimin - variant
+            assert res[1]._value == expected
+
+        for variant in ts_pos_variants:
+            with pytest.raises(OverflowError, match=msg):
+                dtimin - variant
+
+    def test_datetimeindex_sub_datetimeindex_overflow(self):
+        # GH#22492, GH#22508
+        dtimax = pd.to_datetime(["2021-12-28 17:19", Timestamp.max]).as_unit("ns")
+        dtimin = pd.to_datetime(["2021-12-28 17:19", Timestamp.min]).as_unit("ns")
+
+        ts_neg = pd.to_datetime(["1950-01-01", "1950-01-01"]).as_unit("ns")
+        ts_pos = pd.to_datetime(["1980-01-01", "1980-01-01"]).as_unit("ns")
+
+        # General tests
+        expected = Timestamp.max._value - ts_pos[1]._value
+        result = dtimax - ts_pos
+        assert result[1]._value == expected
+
+        expected = Timestamp.min._value - ts_neg[1]._value
+        result = dtimin - ts_neg
+        assert result[1]._value == expected
+        msg = "Overflow in int64 addition"
+        with pytest.raises(OverflowError, match=msg):
+            dtimax - ts_neg
+
+        with pytest.raises(OverflowError, match=msg):
+            dtimin - ts_pos
+
+        # Edge cases
+        tmin = pd.to_datetime([Timestamp.min])
+        t1 = tmin + Timedelta.max + Timedelta("1us")
+        with pytest.raises(OverflowError, match=msg):
+            t1 - tmin
+
+        tmax = pd.to_datetime([Timestamp.max])
+        t2 = tmax + Timedelta.min - Timedelta("1us")
+        with pytest.raises(OverflowError, match=msg):
+            tmax - t2
+
+
+class TestTimestampSeriesArithmetic:
+    def test_empty_series_add_sub(self, box_with_array):
+        # GH#13844
+        a = Series(dtype="M8[ns]")
+        b = Series(dtype="m8[ns]")
+        a = box_with_array(a)
+        b = box_with_array(b)
+        tm.assert_equal(a, a + b)
+        tm.assert_equal(a, a - b)
+        tm.assert_equal(a, b + a)
+        msg = "cannot subtract"
+        with pytest.raises(TypeError, match=msg):
+            b - a
+
+    def test_operators_datetimelike(self):
+        # ## timedelta64 ###
+        td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+        td1.iloc[2] = np.nan
+
+        # ## datetime64 ###
+        dt1 = Series(
+            [
+                Timestamp("20111230"),
+                Timestamp("20120101"),
+                Timestamp("20120103"),
+            ]
+        )
+        dt1.iloc[2] = np.nan
+        dt2 = Series(
+            [
+                Timestamp("20111231"),
+                Timestamp("20120102"),
+                Timestamp("20120104"),
+            ]
+        )
+        dt1 - dt2
+        dt2 - dt1
+
+        # datetime64 with timetimedelta
+        dt1 + td1
+        td1 + dt1
+        dt1 - td1
+
+        # timetimedelta with datetime64
+        td1 + dt1
+        dt1 + td1
+
+    def test_dt64ser_sub_datetime_dtype(self, unit):
+        ts = Timestamp(datetime(1993, 1, 7, 13, 30, 00))
+        dt = datetime(1993, 6, 22, 13, 30)
+        ser = Series([ts], dtype=f"M8[{unit}]")
+        result = ser - dt
+
+        # the expected unit is the max of `unit` and the unit imputed to `dt`,
+        #  which is "us"
+        exp_unit = tm.get_finest_unit(unit, "us")
+        assert result.dtype == f"timedelta64[{exp_unit}]"
+
+    # -------------------------------------------------------------
+    # TODO: This next block of tests came from tests.series.test_operators,
+    # needs to be de-duplicated and parametrized over `box` classes
+
+    @pytest.mark.parametrize(
+        "left, right, op_fail",
+        [
+            [
+                [Timestamp("20111230"), Timestamp("20120101"), NaT],
+                [Timestamp("20111231"), Timestamp("20120102"), Timestamp("20120104")],
+                ["__sub__", "__rsub__"],
+            ],
+            [
+                [Timestamp("20111230"), Timestamp("20120101"), NaT],
+                [timedelta(minutes=5, seconds=3), timedelta(minutes=5, seconds=3), NaT],
+                ["__add__", "__radd__", "__sub__"],
+            ],
+            [
+                [
+                    Timestamp("20111230", tz="US/Eastern"),
+                    Timestamp("20111230", tz="US/Eastern"),
+                    NaT,
+                ],
+                [timedelta(minutes=5, seconds=3), NaT, timedelta(minutes=5, seconds=3)],
+                ["__add__", "__radd__", "__sub__"],
+            ],
+        ],
+    )
+    def test_operators_datetimelike_invalid(
+        self, left, right, op_fail, all_arithmetic_operators
+    ):
+        # these are all TypeError ops
+        op_str = all_arithmetic_operators
+        arg1 = Series(left)
+        arg2 = Series(right)
+        # check that we are getting a TypeError
+        # with 'operate' (from core/ops.py) for the ops that are not
+        # defined
+        op = getattr(arg1, op_str, None)
+        # Previously, _validate_for_numeric_binop in core/indexes/base.py
+        # did this for us.
+        if op_str not in op_fail:
+            with pytest.raises(
+                TypeError, match="operate|[cC]annot|unsupported operand"
+            ):
+                op(arg2)
+        else:
+            # Smoke test
+            op(arg2)
+
+    def test_sub_single_tz(self, unit):
+        # GH#12290
+        s1 = Series([Timestamp("2016-02-10", tz="America/Sao_Paulo")]).dt.as_unit(unit)
+        s2 = Series([Timestamp("2016-02-08", tz="America/Sao_Paulo")]).dt.as_unit(unit)
+        result = s1 - s2
+        expected = Series([Timedelta("2days")]).dt.as_unit(unit)
+        tm.assert_series_equal(result, expected)
+        result = s2 - s1
+        expected = Series([Timedelta("-2days")]).dt.as_unit(unit)
+        tm.assert_series_equal(result, expected)
+
+    def test_dt64tz_series_sub_dtitz(self):
+        # GH#19071 subtracting tzaware DatetimeIndex from tzaware Series
+        # (with same tz) raises, fixed by #19024
+        dti = date_range("1999-09-30", periods=10, tz="US/Pacific")
+        ser = Series(dti)
+        expected = Series(TimedeltaIndex(["0days"] * 10))
+
+        res = dti - ser
+        tm.assert_series_equal(res, expected)
+        res = ser - dti
+        tm.assert_series_equal(res, expected)
+
+    def test_sub_datetime_compat(self, unit):
+        # see GH#14088
+        ser = Series([datetime(2016, 8, 23, 12, tzinfo=timezone.utc), NaT]).dt.as_unit(
+            unit
+        )
+        dt = datetime(2016, 8, 22, 12, tzinfo=timezone.utc)
+        # The datetime object has "us" so we upcast lower units
+        exp_unit = tm.get_finest_unit(unit, "us")
+        exp = Series([Timedelta("1 days"), NaT]).dt.as_unit(exp_unit)
+        result = ser - dt
+        tm.assert_series_equal(result, exp)
+        result2 = ser - Timestamp(dt)
+        tm.assert_series_equal(result2, exp)
+
+    def test_dt64_series_add_mixed_tick_DateOffset(self):
+        # GH#4532
+        # operate with pd.offsets
+        s = Series([Timestamp("20130101 9:01"), Timestamp("20130101 9:02")])
+
+        result = s + pd.offsets.Milli(5)
+        result2 = pd.offsets.Milli(5) + s
+        expected = Series(
+            [Timestamp("20130101 9:01:00.005"), Timestamp("20130101 9:02:00.005")]
+        )
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result2, expected)
+
+        result = s + pd.offsets.Minute(5) + pd.offsets.Milli(5)
+        expected = Series(
+            [Timestamp("20130101 9:06:00.005"), Timestamp("20130101 9:07:00.005")]
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_datetime64_ops_nat(self, unit):
+        # GH#11349
+        datetime_series = Series([NaT, Timestamp("19900315")]).dt.as_unit(unit)
+        nat_series_dtype_timestamp = Series([NaT, NaT], dtype=f"datetime64[{unit}]")
+        single_nat_dtype_datetime = Series([NaT], dtype=f"datetime64[{unit}]")
+
+        # subtraction
+        tm.assert_series_equal(-NaT + datetime_series, nat_series_dtype_timestamp)
+        msg = "bad operand type for unary -: 'DatetimeArray'"
+        with pytest.raises(TypeError, match=msg):
+            -single_nat_dtype_datetime + datetime_series
+
+        tm.assert_series_equal(
+            -NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp
+        )
+        with pytest.raises(TypeError, match=msg):
+            -single_nat_dtype_datetime + nat_series_dtype_timestamp
+
+        # addition
+        tm.assert_series_equal(
+            nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp
+        )
+        tm.assert_series_equal(
+            NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timestamp + NaT, nat_series_dtype_timestamp
+        )
+        tm.assert_series_equal(
+            NaT + nat_series_dtype_timestamp, nat_series_dtype_timestamp
+        )
+
+    # -------------------------------------------------------------
+    # Timezone-Centric Tests
+
+    def test_operators_datetimelike_with_timezones(self):
+        tz = "US/Eastern"
+        dt1 = Series(date_range("2000-01-01 09:00:00", periods=5, tz=tz), name="foo")
+        dt2 = dt1.copy()
+        dt2.iloc[2] = np.nan
+
+        td1 = Series(pd.timedelta_range("1 days 1 min", periods=5, freq="h"))
+        td2 = td1.copy()
+        td2.iloc[1] = np.nan
+        assert td2._values.freq is None
+
+        result = dt1 + td1[0]
+        exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        result = dt2 + td2[0]
+        exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        # odd numpy behavior with scalar timedeltas
+        result = td1[0] + dt1
+        exp = (dt1.dt.tz_localize(None) + td1[0]).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        result = td2[0] + dt2
+        exp = (dt2.dt.tz_localize(None) + td2[0]).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        result = dt1 - td1[0]
+        exp = (dt1.dt.tz_localize(None) - td1[0]).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+        msg = "cannot subtract DatetimeArray from"
+        with pytest.raises(TypeError, match=msg):
+            td1[0] - dt1
+
+        result = dt2 - td2[0]
+        exp = (dt2.dt.tz_localize(None) - td2[0]).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+        with pytest.raises(TypeError, match=msg):
+            td2[0] - dt2
+
+        result = dt1 + td1
+        exp = (dt1.dt.tz_localize(None) + td1).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        result = dt2 + td2
+        exp = (dt2.dt.tz_localize(None) + td2).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        result = dt1 - td1
+        exp = (dt1.dt.tz_localize(None) - td1).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+
+        result = dt2 - td2
+        exp = (dt2.dt.tz_localize(None) - td2).dt.tz_localize(tz)
+        tm.assert_series_equal(result, exp)
+        msg = "cannot (add|subtract)"
+        with pytest.raises(TypeError, match=msg):
+            td1 - dt1
+        with pytest.raises(TypeError, match=msg):
+            td2 - dt2
+
+
+class TestDatetimeIndexArithmetic:
+    # -------------------------------------------------------------
+    # Binary operations DatetimeIndex and TimedeltaIndex/array
+
+    def test_dti_add_tdi(self, tz_naive_fixture):
+        # GH#17558
+        tz = tz_naive_fixture
+        dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10)
+        tdi = pd.timedelta_range("0 days", periods=10)
+        expected = date_range("2017-01-01", periods=10, tz=tz)
+        expected = expected._with_freq(None)
+
+        # add with TimedeltaIndex
+        result = dti + tdi
+        tm.assert_index_equal(result, expected)
+
+        result = tdi + dti
+        tm.assert_index_equal(result, expected)
+
+        # add with timedelta64 array
+        result = dti + tdi.values
+        tm.assert_index_equal(result, expected)
+
+        result = tdi.values + dti
+        tm.assert_index_equal(result, expected)
+
+    def test_dti_iadd_tdi(self, tz_naive_fixture):
+        # GH#17558
+        tz = tz_naive_fixture
+        dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10)
+        tdi = pd.timedelta_range("0 days", periods=10)
+        expected = date_range("2017-01-01", periods=10, tz=tz)
+        expected = expected._with_freq(None)
+
+        # iadd with TimedeltaIndex
+        result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10)
+        result += tdi
+        tm.assert_index_equal(result, expected)
+
+        result = pd.timedelta_range("0 days", periods=10)
+        result += dti
+        tm.assert_index_equal(result, expected)
+
+        # iadd with timedelta64 array
+        result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10)
+        result += tdi.values
+        tm.assert_index_equal(result, expected)
+
+        result = pd.timedelta_range("0 days", periods=10)
+        result += dti
+        tm.assert_index_equal(result, expected)
+
+    def test_dti_sub_tdi(self, tz_naive_fixture):
+        # GH#17558
+        tz = tz_naive_fixture
+        dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10)
+        tdi = pd.timedelta_range("0 days", periods=10)
+        expected = date_range("2017-01-01", periods=10, tz=tz, freq="-1D")
+        expected = expected._with_freq(None)
+
+        # sub with TimedeltaIndex
+        result = dti - tdi
+        tm.assert_index_equal(result, expected)
+
+        msg = "cannot subtract .*TimedeltaArray"
+        with pytest.raises(TypeError, match=msg):
+            tdi - dti
+
+        # sub with timedelta64 array
+        result = dti - tdi.values
+        tm.assert_index_equal(result, expected)
+
+        msg = "cannot subtract a datelike from a TimedeltaArray"
+        with pytest.raises(TypeError, match=msg):
+            tdi.values - dti
+
+    def test_dti_isub_tdi(self, tz_naive_fixture, unit):
+        # GH#17558
+        tz = tz_naive_fixture
+        dti = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit)
+        tdi = pd.timedelta_range("0 days", periods=10, unit=unit)
+        expected = date_range("2017-01-01", periods=10, tz=tz, freq="-1D", unit=unit)
+        expected = expected._with_freq(None)
+
+        # isub with TimedeltaIndex
+        result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit)
+        result -= tdi
+        tm.assert_index_equal(result, expected)
+
+        # DTA.__isub__ GH#43904
+        dta = dti._data.copy()
+        dta -= tdi
+        tm.assert_datetime_array_equal(dta, expected._data)
+
+        out = dti._data.copy()
+        np.subtract(out, tdi, out=out)
+        tm.assert_datetime_array_equal(out, expected._data)
+
+        msg = "cannot subtract a datelike from a TimedeltaArray"
+        with pytest.raises(TypeError, match=msg):
+            tdi -= dti
+
+        # isub with timedelta64 array
+        result = DatetimeIndex([Timestamp("2017-01-01", tz=tz)] * 10).as_unit(unit)
+        result -= tdi.values
+        tm.assert_index_equal(result, expected)
+
+        with pytest.raises(TypeError, match=msg):
+            tdi.values -= dti
+
+        with pytest.raises(TypeError, match=msg):
+            tdi._values -= dti
+
+    # -------------------------------------------------------------
+    # Binary Operations DatetimeIndex and datetime-like
+    # TODO: A couple other tests belong in this section.  Move them in
+    # A PR where there isn't already a giant diff.
+
+    # -------------------------------------------------------------
+
+    def test_dta_add_sub_index(self, tz_naive_fixture):
+        # Check that DatetimeArray defers to Index classes
+        dti = date_range("20130101", periods=3, tz=tz_naive_fixture)
+        dta = dti.array
+        result = dta - dti
+        expected = dti - dti
+        tm.assert_index_equal(result, expected)
+
+        tdi = result
+        result = dta + tdi
+        expected = dti + tdi
+        tm.assert_index_equal(result, expected)
+
+        result = dta - tdi
+        expected = dti - tdi
+        tm.assert_index_equal(result, expected)
+
+    def test_sub_dti_dti(self, unit):
+        # previously performed setop (deprecated in 0.16.0), now changed to
+        # return subtraction -> TimeDeltaIndex (GH ...)
+
+        dti = date_range("20130101", periods=3, unit=unit)
+        dti_tz = date_range("20130101", periods=3, unit=unit).tz_localize("US/Eastern")
+        expected = TimedeltaIndex([0, 0, 0]).as_unit(unit)
+
+        result = dti - dti
+        tm.assert_index_equal(result, expected)
+
+        result = dti_tz - dti_tz
+        tm.assert_index_equal(result, expected)
+        msg = "Cannot subtract tz-naive and tz-aware datetime-like objects"
+        with pytest.raises(TypeError, match=msg):
+            dti_tz - dti
+
+        with pytest.raises(TypeError, match=msg):
+            dti - dti_tz
+
+        # isub
+        dti -= dti
+        tm.assert_index_equal(dti, expected)
+
+        # different length raises ValueError
+        dti1 = date_range("20130101", periods=3, unit=unit)
+        dti2 = date_range("20130101", periods=4, unit=unit)
+        msg = "cannot add indices of unequal length"
+        with pytest.raises(ValueError, match=msg):
+            dti1 - dti2
+
+        # NaN propagation
+        dti1 = DatetimeIndex(["2012-01-01", np.nan, "2012-01-03"]).as_unit(unit)
+        dti2 = DatetimeIndex(["2012-01-02", "2012-01-03", np.nan]).as_unit(unit)
+        expected = TimedeltaIndex(["1 days", np.nan, np.nan]).as_unit(unit)
+        result = dti2 - dti1
+        tm.assert_index_equal(result, expected)
+
+    # -------------------------------------------------------------------
+    # TODO: Most of this block is moved from series or frame tests, needs
+    # cleanup, box-parametrization, and de-duplication
+
+    @pytest.mark.parametrize("op", [operator.add, operator.sub])
+    def test_timedelta64_equal_timedelta_supported_ops(self, op, box_with_array):
+        ser = Series(
+            [
+                Timestamp("20130301"),
+                Timestamp("20130228 23:00:00"),
+                Timestamp("20130228 22:00:00"),
+                Timestamp("20130228 21:00:00"),
+            ]
+        )
+        obj = box_with_array(ser)
+
+        intervals = ["D", "h", "m", "s", "us"]
+
+        def timedelta64(*args):
+            # see casting notes in NumPy gh-12927
+            return np.sum(list(map(np.timedelta64, args, intervals)))
+
+        for d, h, m, s, us in product(*([range(2)] * 5)):
+            nptd = timedelta64(d, h, m, s, us)
+            pytd = timedelta(days=d, hours=h, minutes=m, seconds=s, microseconds=us)
+            lhs = op(obj, nptd)
+            rhs = op(obj, pytd)
+
+            tm.assert_equal(lhs, rhs)
+
+    def test_ops_nat_mixed_datetime64_timedelta64(self):
+        # GH#11349
+        timedelta_series = Series([NaT, Timedelta("1s")])
+        datetime_series = Series([NaT, Timestamp("19900315")])
+        nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[ns]")
+        nat_series_dtype_timestamp = Series([NaT, NaT], dtype="datetime64[ns]")
+        single_nat_dtype_datetime = Series([NaT], dtype="datetime64[ns]")
+        single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[ns]")
+
+        # subtraction
+        tm.assert_series_equal(
+            datetime_series - single_nat_dtype_datetime, nat_series_dtype_timedelta
+        )
+
+        tm.assert_series_equal(
+            datetime_series - single_nat_dtype_timedelta, nat_series_dtype_timestamp
+        )
+        tm.assert_series_equal(
+            -single_nat_dtype_timedelta + datetime_series, nat_series_dtype_timestamp
+        )
+
+        # without a Series wrapping the NaT, it is ambiguous
+        # whether it is a datetime64 or timedelta64
+        # defaults to interpreting it as timedelta64
+        tm.assert_series_equal(
+            nat_series_dtype_timestamp - single_nat_dtype_datetime,
+            nat_series_dtype_timedelta,
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timestamp - single_nat_dtype_timedelta,
+            nat_series_dtype_timestamp,
+        )
+        tm.assert_series_equal(
+            -single_nat_dtype_timedelta + nat_series_dtype_timestamp,
+            nat_series_dtype_timestamp,
+        )
+        msg = "cannot subtract a datelike"
+        with pytest.raises(TypeError, match=msg):
+            timedelta_series - single_nat_dtype_datetime
+
+        # addition
+        tm.assert_series_equal(
+            nat_series_dtype_timestamp + single_nat_dtype_timedelta,
+            nat_series_dtype_timestamp,
+        )
+        tm.assert_series_equal(
+            single_nat_dtype_timedelta + nat_series_dtype_timestamp,
+            nat_series_dtype_timestamp,
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timestamp + single_nat_dtype_timedelta,
+            nat_series_dtype_timestamp,
+        )
+        tm.assert_series_equal(
+            single_nat_dtype_timedelta + nat_series_dtype_timestamp,
+            nat_series_dtype_timestamp,
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timedelta + single_nat_dtype_datetime,
+            nat_series_dtype_timestamp,
+        )
+        tm.assert_series_equal(
+            single_nat_dtype_datetime + nat_series_dtype_timedelta,
+            nat_series_dtype_timestamp,
+        )
+
+    def test_ufunc_coercions(self, unit):
+        idx = date_range("2011-01-01", periods=3, freq="2D", name="x", unit=unit)
+
+        delta = np.timedelta64(1, "D")
+        exp = date_range("2011-01-02", periods=3, freq="2D", name="x", unit=unit)
+        for result in [idx + delta, np.add(idx, delta)]:
+            assert isinstance(result, DatetimeIndex)
+            tm.assert_index_equal(result, exp)
+            assert result.freq == "2D"
+
+        exp = date_range("2010-12-31", periods=3, freq="2D", name="x", unit=unit)
+
+        for result in [idx - delta, np.subtract(idx, delta)]:
+            assert isinstance(result, DatetimeIndex)
+            tm.assert_index_equal(result, exp)
+            assert result.freq == "2D"
+
+        # When adding/subtracting an ndarray (which has no .freq), the result
+        #  does not infer freq
+        idx = idx._with_freq(None)
+        delta = np.array(
+            [np.timedelta64(1, "D"), np.timedelta64(2, "D"), np.timedelta64(3, "D")]
+        )
+        exp = DatetimeIndex(
+            ["2011-01-02", "2011-01-05", "2011-01-08"], name="x"
+        ).as_unit(unit)
+
+        for result in [idx + delta, np.add(idx, delta)]:
+            tm.assert_index_equal(result, exp)
+            assert result.freq == exp.freq
+
+        exp = DatetimeIndex(
+            ["2010-12-31", "2011-01-01", "2011-01-02"], name="x"
+        ).as_unit(unit)
+        for result in [idx - delta, np.subtract(idx, delta)]:
+            assert isinstance(result, DatetimeIndex)
+            tm.assert_index_equal(result, exp)
+            assert result.freq == exp.freq
+
+    def test_dti_add_series(self, tz_naive_fixture, names):
+        # GH#13905
+        tz = tz_naive_fixture
+        index = DatetimeIndex(
+            ["2016-06-28 05:30", "2016-06-28 05:31"], tz=tz, name=names[0]
+        ).as_unit("ns")
+        ser = Series([Timedelta(seconds=5)] * 2, index=index, name=names[1])
+        expected = Series(index + Timedelta(seconds=5), index=index, name=names[2])
+
+        # passing name arg isn't enough when names[2] is None
+        expected.name = names[2]
+        assert expected.dtype == index.dtype
+        result = ser + index
+        tm.assert_series_equal(result, expected)
+        result2 = index + ser
+        tm.assert_series_equal(result2, expected)
+
+        expected = index + Timedelta(seconds=5)
+        result3 = ser.values + index
+        tm.assert_index_equal(result3, expected)
+        result4 = index + ser.values
+        tm.assert_index_equal(result4, expected)
+
+    @pytest.mark.parametrize("op", [operator.add, roperator.radd, operator.sub])
+    def test_dti_addsub_offset_arraylike(
+        self, performance_warning, tz_naive_fixture, names, op, index_or_series
+    ):
+        # GH#18849, GH#19744
+        other_box = index_or_series
+
+        tz = tz_naive_fixture
+        dti = date_range("2017-01-01", periods=2, tz=tz, name=names[0])
+        other = other_box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)], name=names[1])
+
+        xbox = get_upcast_box(dti, other)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = op(dti, other)
+
+        expected = DatetimeIndex(
+            [op(dti[n], other[n]) for n in range(len(dti))], name=names[2], freq="infer"
+        )
+        expected = tm.box_expected(expected, xbox).astype(object)
+        tm.assert_equal(res, expected)
+
+    @pytest.mark.parametrize("other_box", [pd.Index, np.array])
+    def test_dti_addsub_object_arraylike(
+        self, performance_warning, tz_naive_fixture, box_with_array, other_box
+    ):
+        tz = tz_naive_fixture
+
+        dti = date_range("2017-01-01", periods=2, tz=tz)
+        dtarr = tm.box_expected(dti, box_with_array)
+        other = other_box([pd.offsets.MonthEnd(), Timedelta(days=4)])
+        xbox = get_upcast_box(dtarr, other)
+
+        expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture)
+        expected = tm.box_expected(expected, xbox).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            result = dtarr + other
+        tm.assert_equal(result, expected)
+
+        expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture)
+        expected = tm.box_expected(expected, xbox).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            result = dtarr - other
+        tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("years", [-1, 0, 1])
+@pytest.mark.parametrize("months", [-2, 0, 2])
+def test_shift_months(years, months, unit):
+    dti = DatetimeIndex(
+        [
+            Timestamp("2000-01-05 00:15:00"),
+            Timestamp("2000-01-31 00:23:00"),
+            Timestamp("2000-01-01"),
+            Timestamp("2000-02-29"),
+            Timestamp("2000-12-31"),
+        ]
+    ).as_unit(unit)
+    shifted = shift_months(dti.asi8, years * 12 + months, reso=dti._data._creso)
+    shifted_dt64 = shifted.view(f"M8[{dti.unit}]")
+    actual = DatetimeIndex(shifted_dt64)
+
+    raw = [x + pd.offsets.DateOffset(years=years, months=months) for x in dti]
+    expected = DatetimeIndex(raw).as_unit(dti.unit)
+    tm.assert_index_equal(actual, expected)
+
+
+def test_dt64arr_addsub_object_dtype_2d(performance_warning):
+    # block-wise DataFrame operations will require operating on 2D
+    #  DatetimeArray/TimedeltaArray, so check that specifically.
+    dti = date_range("1994-02-13", freq="2W", periods=4)
+    dta = dti._data.reshape((4, 1))
+
+    other = np.array([[pd.offsets.Day(n)] for n in range(4)])
+    assert other.shape == dta.shape
+
+    with tm.assert_produces_warning(performance_warning):
+        result = dta + other
+    with tm.assert_produces_warning(performance_warning):
+        expected = (dta[:, 0] + other[:, 0]).reshape(-1, 1)
+
+    tm.assert_numpy_array_equal(result, expected)
+
+    with tm.assert_produces_warning(performance_warning):
+        # Case where we expect to get a TimedeltaArray back
+        result2 = dta - dta.astype(object)
+
+    assert result2.shape == (4, 1)
+    assert all(td._value == 0 for td in result2.ravel())
+
+
+def test_non_nano_dt64_addsub_np_nat_scalars():
+    # GH 52295
+    ser = Series([1233242342344, 232432434324, 332434242344], dtype="datetime64[ms]")
+    result = ser - np.datetime64("nat", "ms")
+    expected = Series([NaT] * 3, dtype="timedelta64[ms]")
+    tm.assert_series_equal(result, expected)
+
+    result = ser + np.timedelta64("nat", "ms")
+    expected = Series([NaT] * 3, dtype="datetime64[ms]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_non_nano_dt64_addsub_np_nat_scalars_unitless():
+    # GH 52295
+    # TODO: Can we default to the ser unit?
+    ser = Series([1233242342344, 232432434324, 332434242344], dtype="datetime64[ms]")
+    result = ser - np.datetime64("nat")
+    expected = Series([NaT] * 3, dtype="timedelta64[ms]")
+    tm.assert_series_equal(result, expected)
+
+    result = ser + np.timedelta64("nat")
+    expected = Series([NaT] * 3, dtype="datetime64[ms]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_non_nano_dt64_addsub_np_nat_scalars_unsupported_unit():
+    # GH 52295
+    ser = Series([12332, 23243, 33243], dtype="datetime64[s]")
+    result = ser - np.datetime64("nat", "D")
+    expected = Series([NaT] * 3, dtype="timedelta64[s]")
+    tm.assert_series_equal(result, expected)
+
+    result = ser + np.timedelta64("nat", "D")
+    expected = Series([NaT] * 3, dtype="datetime64[s]")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2353da0dad568875ba150570b52262aa81d0ea4
--- /dev/null
+++ b/pandas/tests/arithmetic/test_interval.py
@@ -0,0 +1,308 @@
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_list_like
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    Index,
+    Interval,
+    IntervalIndex,
+    Period,
+    Series,
+    Timedelta,
+    Timestamp,
+    date_range,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.core.arrays import (
+    BooleanArray,
+    IntervalArray,
+)
+from pandas.tests.arithmetic.common import get_upcast_box
+
+
+@pytest.fixture(
+    params=[
+        (Index([0, 2, 4, 4]), Index([1, 3, 5, 8])),
+        (Index([0.0, 1.0, 2.0, np.nan]), Index([1.0, 2.0, 3.0, np.nan])),
+        (
+            timedelta_range("0 days", periods=3).insert(3, pd.NaT),
+            timedelta_range("1 day", periods=3).insert(3, pd.NaT),
+        ),
+        (
+            date_range("20170101", periods=3).insert(3, pd.NaT),
+            date_range("20170102", periods=3).insert(3, pd.NaT),
+        ),
+        (
+            date_range("20170101", periods=3, tz="US/Eastern").insert(3, pd.NaT),
+            date_range("20170102", periods=3, tz="US/Eastern").insert(3, pd.NaT),
+        ),
+    ],
+    ids=lambda x: str(x[0].dtype),
+)
+def left_right_dtypes(request):
+    """
+    Fixture for building an IntervalArray from various dtypes
+    """
+    return request.param
+
+
+@pytest.fixture
+def interval_array(left_right_dtypes):
+    """
+    Fixture to generate an IntervalArray of various dtypes containing NA if possible
+    """
+    left, right = left_right_dtypes
+    return IntervalArray.from_arrays(left, right)
+
+
+def create_categorical_intervals(left, right, closed="right"):
+    return Categorical(IntervalIndex.from_arrays(left, right, closed))
+
+
+def create_series_intervals(left, right, closed="right"):
+    return Series(IntervalArray.from_arrays(left, right, closed))
+
+
+def create_series_categorical_intervals(left, right, closed="right"):
+    return Series(Categorical(IntervalIndex.from_arrays(left, right, closed)))
+
+
+class TestComparison:
+    @pytest.fixture(params=[operator.eq, operator.ne])
+    def op(self, request):
+        return request.param
+
+    @pytest.fixture(
+        params=[
+            IntervalArray.from_arrays,
+            IntervalIndex.from_arrays,
+            create_categorical_intervals,
+            create_series_intervals,
+            create_series_categorical_intervals,
+        ],
+        ids=[
+            "IntervalArray",
+            "IntervalIndex",
+            "Categorical[Interval]",
+            "Series[Interval]",
+            "Series[Categorical[Interval]]",
+        ],
+    )
+    def interval_constructor(self, request):
+        """
+        Fixture for all pandas native interval constructors.
+        To be used as the LHS of IntervalArray comparisons.
+        """
+        return request.param
+
+    def elementwise_comparison(self, op, interval_array, other):
+        """
+        Helper that performs elementwise comparisons between `array` and `other`
+        """
+        other = other if is_list_like(other) else [other] * len(interval_array)
+        expected = np.array(
+            [op(x, y) for x, y in zip(interval_array, other, strict=True)]
+        )
+        if isinstance(other, Series):
+            return Series(expected, index=other.index)
+        return expected
+
+    def test_compare_scalar_interval(self, op, interval_array):
+        # matches first interval
+        other = interval_array[0]
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # matches on a single endpoint but not both
+        other = Interval(interval_array.left[0], interval_array.right[1])
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_compare_scalar_interval_mixed_closed(self, op, closed, other_closed):
+        interval_array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed)
+        other = Interval(0, 1, closed=other_closed)
+
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_compare_scalar_na(self, op, interval_array, nulls_fixture, box_with_array):
+        box = box_with_array
+        obj = tm.box_expected(interval_array, box)
+        result = op(obj, nulls_fixture)
+
+        if nulls_fixture is pd.NA:
+            # GH#31882
+            exp = np.ones(interval_array.shape, dtype=bool)
+            expected = BooleanArray(exp, exp)
+        else:
+            expected = self.elementwise_comparison(op, interval_array, nulls_fixture)
+
+        if not (box is Index and nulls_fixture is pd.NA):
+            # don't cast expected from BooleanArray to ndarray[object]
+            xbox = get_upcast_box(obj, nulls_fixture, True)
+            expected = tm.box_expected(expected, xbox)
+
+        tm.assert_equal(result, expected)
+
+        rev = op(nulls_fixture, obj)
+        tm.assert_equal(rev, expected)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            0,
+            1.0,
+            True,
+            "foo",
+            Timestamp("2017-01-01"),
+            Timestamp("2017-01-01", tz="US/Eastern"),
+            Timedelta("0 days"),
+            Period("2017-01-01", "D"),
+        ],
+    )
+    def test_compare_scalar_other(self, op, interval_array, other):
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_compare_list_like_interval(self, op, interval_array, interval_constructor):
+        # same endpoints
+        other = interval_constructor(interval_array.left, interval_array.right)
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_equal(result, expected)
+
+        # different endpoints
+        other = interval_constructor(
+            interval_array.left[::-1], interval_array.right[::-1]
+        )
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_equal(result, expected)
+
+        # all nan endpoints
+        other = interval_constructor([np.nan] * 4, [np.nan] * 4)
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_equal(result, expected)
+
+    def test_compare_list_like_interval_mixed_closed(
+        self, op, interval_constructor, closed, other_closed
+    ):
+        interval_array = IntervalArray.from_arrays(range(2), range(1, 3), closed=closed)
+        other = interval_constructor(range(2), range(1, 3), closed=other_closed)
+
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            (
+                Interval(0, 1),
+                Interval(Timedelta("1 day"), Timedelta("2 days")),
+                Interval(4, 5, "both"),
+                Interval(10, 20, "neither"),
+            ),
+            (0, 1.5, Timestamp("20170103"), np.nan),
+            (
+                Timestamp("20170102", tz="US/Eastern"),
+                Timedelta("2 days"),
+                "baz",
+                pd.NaT,
+            ),
+        ],
+    )
+    def test_compare_list_like_object(self, op, interval_array, other):
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_compare_list_like_nan(self, op, interval_array, nulls_fixture):
+        other = [nulls_fixture] * 4
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            np.arange(4, dtype="int64"),
+            np.arange(4, dtype="float64"),
+            date_range("2017-01-01", periods=4),
+            date_range("2017-01-01", periods=4, tz="US/Eastern"),
+            timedelta_range("0 days", periods=4),
+            period_range("2017-01-01", periods=4, freq="D"),
+            Categorical(list("abab")),
+            Categorical(date_range("2017-01-01", periods=4)),
+            pd.array(list("abcd")),
+            pd.array(["foo", 3.14, None, object()], dtype=object),
+        ],
+        ids=lambda x: str(x.dtype),
+    )
+    def test_compare_list_like_other(self, op, interval_array, other):
+        result = op(interval_array, other)
+        expected = self.elementwise_comparison(op, interval_array, other)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("length", [1, 3, 5])
+    @pytest.mark.parametrize("other_constructor", [IntervalArray, list])
+    def test_compare_length_mismatch_errors(self, op, other_constructor, length):
+        interval_array = IntervalArray.from_arrays(range(4), range(1, 5))
+        other = other_constructor([Interval(0, 1)] * length)
+        with pytest.raises(ValueError, match="Lengths must match to compare"):
+            op(interval_array, other)
+
+    @pytest.mark.parametrize(
+        "constructor, expected_type, assert_func",
+        [
+            (IntervalIndex, np.array, tm.assert_numpy_array_equal),
+            (Series, Series, tm.assert_series_equal),
+        ],
+    )
+    def test_index_series_compat(self, op, constructor, expected_type, assert_func):
+        # IntervalIndex/Series that rely on IntervalArray for comparisons
+        breaks = range(4)
+        index = constructor(IntervalIndex.from_breaks(breaks))
+
+        # scalar comparisons
+        other = index[0]
+        result = op(index, other)
+        expected = expected_type(self.elementwise_comparison(op, index, other))
+        assert_func(result, expected)
+
+        other = breaks[0]
+        result = op(index, other)
+        expected = expected_type(self.elementwise_comparison(op, index, other))
+        assert_func(result, expected)
+
+        # list-like comparisons
+        other = IntervalArray.from_breaks(breaks)
+        result = op(index, other)
+        expected = expected_type(self.elementwise_comparison(op, index, other))
+        assert_func(result, expected)
+
+        other = [index[0], breaks[0], "foo"]
+        result = op(index, other)
+        expected = expected_type(self.elementwise_comparison(op, index, other))
+        assert_func(result, expected)
+
+    @pytest.mark.parametrize("scalars", ["a", False, 1, 1.0, None])
+    def test_comparison_operations(self, scalars):
+        # GH #28981
+        expected = Series([False, False])
+        s = Series([Interval(0, 1), Interval(1, 2)], dtype="interval")
+        result = s == scalars
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
new file mode 100644
index 0000000000000000000000000000000000000000..5878246126d617e96dbb11b2d437e78fe87a3aec
--- /dev/null
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -0,0 +1,1585 @@
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for numeric dtypes
+from __future__ import annotations
+
+from collections import abc
+from datetime import timedelta
+from decimal import Decimal
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    Index,
+    RangeIndex,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    array,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.core import ops
+from pandas.core.computation import expressions as expr
+from pandas.tests.arithmetic.common import (
+    assert_invalid_addsub_type,
+    assert_invalid_comparison,
+)
+
+
+@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"])
+def switch_numexpr_min_elements(request, monkeypatch):
+    with monkeypatch.context() as m:
+        m.setattr(expr, "_MIN_ELEMENTS", request.param)
+        yield request.param
+
+
+@pytest.fixture(
+    params=[
+        # TODO: add more  dtypes here
+        Index(np.arange(5, dtype="float64")),
+        Index(np.arange(5, dtype="int64")),
+        Index(np.arange(5, dtype="uint64")),
+        RangeIndex(5),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def numeric_idx(request):
+    """
+    Several types of numeric-dtypes Index objects
+    """
+    return request.param
+
+
+def adjust_negative_zero(zero, expected):
+    """
+    Helper to adjust the expected result if we are dividing by -0.0
+    as opposed to 0.0
+    """
+    if np.signbit(np.array(zero)).any():
+        # All entries in the `zero` fixture should be either
+        #  all-negative or no-negative.
+        assert np.signbit(np.array(zero)).all()
+
+        expected *= -1
+
+    return expected
+
+
+def compare_op(series, other, op):
+    left = np.abs(series) if op in (ops.rpow, operator.pow) else series
+    right = np.abs(other) if op in (ops.rpow, operator.pow) else other
+
+    cython_or_numpy = op(left, right)
+    python = left.combine(right, op)
+    if isinstance(other, Series) and not other.index.equals(series.index):
+        python.index = python.index._with_freq(None)
+    tm.assert_series_equal(cython_or_numpy, python)
+
+
+# TODO: remove this kludge once mypy stops giving false positives here
+# List comprehension has incompatible type List[PandasObject]; expected List[RangeIndex]
+#  See GH#29725
+_ldtypes = ["i1", "i2", "i4", "i8", "u1", "u2", "u4", "u8", "f2", "f4", "f8"]
+lefts: list[Index | Series] = [RangeIndex(10, 40, 10)]
+lefts.extend([Series([10, 20, 30], dtype=dtype) for dtype in _ldtypes])
+lefts.extend([Index([10, 20, 30], dtype=dtype) for dtype in _ldtypes if dtype != "f2"])
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestNumericComparisons:
+    def test_operator_series_comparison_zerorank(self):
+        # GH#13006
+        result = np.float64(0) > Series([1, 2, 3])
+        expected = 0.0 > Series([1, 2, 3])
+        tm.assert_series_equal(result, expected)
+        result = Series([1, 2, 3]) < np.float64(0)
+        expected = Series([1, 2, 3]) < 0.0
+        tm.assert_series_equal(result, expected)
+        result = np.array([0, 1, 2])[0] > Series([0, 1, 2])
+        expected = 0.0 > Series([1, 2, 3])
+        tm.assert_series_equal(result, expected)
+
+    def test_df_numeric_cmp_dt64_raises(self, box_with_array, fixed_now_ts):
+        # GH#8932, GH#22163
+        ts = fixed_now_ts
+        obj = np.array(range(5))
+        obj = tm.box_expected(obj, box_with_array)
+
+        assert_invalid_comparison(obj, ts, box_with_array)
+
+    def test_compare_invalid(self):
+        # GH#8058
+        # ops testing
+        a = Series(np.random.default_rng(2).standard_normal(5), name=0)
+        b = Series(np.random.default_rng(2).standard_normal(5))
+        b.name = pd.Timestamp("2000-01-01")
+        tm.assert_series_equal(a / b, 1 / (b / a))
+
+    def test_numeric_cmp_string_numexpr_path(self, box_with_array, monkeypatch):
+        # GH#36377, GH#35700
+        box = box_with_array
+        xbox = box if box is not Index else np.ndarray
+
+        obj = Series(np.random.default_rng(2).standard_normal(51))
+        obj = tm.box_expected(obj, box, transpose=False)
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 50)
+            result = obj == "a"
+
+        expected = Series(np.zeros(51, dtype=bool))
+        expected = tm.box_expected(expected, xbox, transpose=False)
+        tm.assert_equal(result, expected)
+
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 50)
+            result = obj != "a"
+        tm.assert_equal(result, ~expected)
+
+        msg = "Invalid comparison between dtype=float64 and str"
+        with pytest.raises(TypeError, match=msg):
+            obj < "a"
+
+
+# ------------------------------------------------------------------
+# Numeric dtypes Arithmetic with Datetime/Timedelta Scalar
+
+
+class TestNumericArraylikeArithmeticWithDatetimeLike:
+    def test_mul_timedelta_list(self, box_with_array):
+        # GH#62524
+        box = box_with_array
+        left = np.array([3, 4])
+        left = tm.box_expected(left, box)
+
+        right = [Timedelta(days=1), Timedelta(days=2)]
+
+        result = left * right
+
+        expected = TimedeltaIndex([Timedelta(days=3), Timedelta(days=8)])
+        expected = tm.box_expected(expected, box)
+        tm.assert_equal(result, expected)
+
+        result2 = right * left
+        tm.assert_equal(result2, expected)
+
+    @pytest.mark.parametrize("box_cls", [np.array, Index, Series])
+    @pytest.mark.parametrize(
+        "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
+    )
+    def test_mul_td64arr(self, left, box_cls):
+        # GH#22390
+        right = np.array([1, 2, 3], dtype="m8[s]")
+        right = box_cls(right)
+
+        expected = TimedeltaIndex(["10s", "40s", "90s"], dtype=right.dtype)
+
+        if isinstance(left, Series) or box_cls is Series:
+            expected = Series(expected)
+        assert expected.dtype == right.dtype
+
+        result = left * right
+        tm.assert_equal(result, expected)
+
+        result = right * left
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("box_cls", [np.array, Index, Series])
+    @pytest.mark.parametrize(
+        "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
+    )
+    def test_div_td64arr(self, left, box_cls):
+        # GH#22390
+        right = np.array([10, 40, 90], dtype="m8[s]")
+        right = box_cls(right)
+
+        expected = TimedeltaIndex(["1s", "2s", "3s"], dtype=right.dtype)
+        if isinstance(left, Series) or box_cls is Series:
+            expected = Series(expected)
+        assert expected.dtype == right.dtype
+
+        result = right / left
+        tm.assert_equal(result, expected)
+
+        result = right // left
+        tm.assert_equal(result, expected)
+
+        # (true_) needed for min-versions build 2022-12-26
+        msg = "ufunc '(true_)?divide' cannot use operands with types"
+        with pytest.raises(TypeError, match=msg):
+            left / right
+
+        msg = "ufunc 'floor_divide' cannot use operands with types"
+        with pytest.raises(TypeError, match=msg):
+            left // right
+
+    # TODO: also test Tick objects;
+    #  see test_numeric_arr_rdiv_tdscalar for note on these failing
+    @pytest.mark.parametrize(
+        "scalar_td",
+        [
+            Timedelta(days=1).as_unit("ns"),
+            Timedelta(days=1).as_unit("ns").to_timedelta64(),
+            Timedelta(days=1).to_pytimedelta(),
+            Timedelta(days=1).to_timedelta64().astype("timedelta64[s]"),
+            Timedelta(days=1).to_timedelta64().astype("timedelta64[ms]"),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array):
+        # GH#19333
+        box = box_with_array
+        index = numeric_idx
+        expected = TimedeltaIndex(
+            [Timedelta(days=n) for n in range(len(index))], dtype="m8[ns]"
+        )
+        if isinstance(scalar_td, np.timedelta64):
+            dtype = scalar_td.dtype
+            expected = expected.astype(dtype)
+        elif type(scalar_td) is timedelta:
+            expected = expected.astype("m8[us]")
+
+        index = tm.box_expected(index, box)
+        expected = tm.box_expected(expected, box)
+
+        result = index * scalar_td
+        tm.assert_equal(result, expected)
+
+        commute = scalar_td * index
+        tm.assert_equal(commute, expected)
+
+    @pytest.mark.parametrize(
+        "scalar_td",
+        [
+            Timedelta(days=1).as_unit("ns"),
+            Timedelta(days=1).as_unit("ns").to_timedelta64(),
+            Timedelta(days=1).as_unit("ns").to_pytimedelta(),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    @pytest.mark.parametrize("dtype", [np.int64, np.float64])
+    def test_numeric_arr_mul_tdscalar_numexpr_path(
+        self, dtype, scalar_td, box_with_array
+    ):
+        # GH#44772 for the float64 case
+        box = box_with_array
+
+        arr_i8 = np.arange(2 * 10**4).astype(np.int64, copy=False)
+        arr = arr_i8.astype(dtype, copy=False)
+        obj = tm.box_expected(arr, box, transpose=False)
+
+        expected = arr_i8.view("timedelta64[D]").astype("timedelta64[ns]")
+        if type(scalar_td) is timedelta:
+            expected = expected.astype("timedelta64[us]")
+
+        expected = tm.box_expected(expected, box, transpose=False)
+
+        result = obj * scalar_td
+        tm.assert_equal(result, expected)
+
+        result = scalar_td * obj
+        tm.assert_equal(result, expected)
+
+    def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array):
+        box = box_with_array
+
+        index = numeric_idx[1:3]
+
+        expected = TimedeltaIndex(["3 Days", "36 Hours"])
+        if isinstance(three_days, np.timedelta64):
+            dtype = three_days.dtype
+            if dtype < np.dtype("m8[s]"):
+                # i.e. resolution is lower -> use lowest supported resolution
+                dtype = np.dtype("m8[s]")
+            expected = expected.astype(dtype)
+        elif type(three_days) is timedelta or (
+            isinstance(three_days, Timedelta) and three_days.unit == "us"
+        ):
+            expected = expected.astype("m8[us]")
+        elif isinstance(
+            three_days,
+            (pd.offsets.Day, pd.offsets.Hour, pd.offsets.Minute, pd.offsets.Second),
+        ):
+            # closest reso is Second
+            expected = expected.astype("m8[s]")
+
+        index = tm.box_expected(index, box)
+        expected = tm.box_expected(expected, box)
+
+        if isinstance(three_days, pd.offsets.Day):
+            # GH#41943 Day is no longer timedelta-like
+            msg = "unsupported operand type"
+            with pytest.raises(TypeError, match=msg):
+                three_days / index
+        else:
+            result = three_days / index
+            tm.assert_equal(result, expected)
+            msg = "cannot use operands with types dtype"
+
+        with pytest.raises(TypeError, match=msg):
+            index / three_days
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            Timedelta(hours=31),
+            Timedelta(hours=31).to_pytimedelta(),
+            Timedelta(hours=31).to_timedelta64(),
+            Timedelta(hours=31).to_timedelta64().astype("m8[h]"),
+            np.timedelta64("NaT"),
+            np.timedelta64("NaT", "D"),
+            pd.offsets.Minute(3),
+            pd.offsets.Second(0),
+            # GH#28080 numeric+datetimelike should raise; Timestamp used
+            #  to raise NullFrequencyError but that behavior was removed in 1.0
+            pd.Timestamp("2021-01-01", tz="Asia/Tokyo"),
+            pd.Timestamp("2021-01-01"),
+            pd.Timestamp("2021-01-01").to_pydatetime(),
+            pd.Timestamp("2021-01-01", tz="UTC").to_pydatetime(),
+            pd.Timestamp("2021-01-01").to_datetime64(),
+            np.datetime64("NaT", "ns"),
+            pd.NaT,
+        ],
+        ids=repr,
+    )
+    def test_add_sub_datetimedeltalike_invalid(
+        self, numeric_idx, other, box_with_array
+    ):
+        box = box_with_array
+
+        left = tm.box_expected(numeric_idx, box)
+        msg = "|".join(
+            [
+                "unsupported operand type",
+                "Addition/subtraction of integers and integer-arrays",
+                "Instead of adding/subtracting",
+                "cannot use operands with types dtype",
+                "Concatenation operation is not implemented for NumPy arrays",
+                "Cannot (add|subtract) NaT (to|from) ndarray",
+                # pd.array vs np.datetime64 case
+                r"operand type\(s\) all returned NotImplemented from __array_ufunc__",
+                "can only perform ops with numeric values",
+                "cannot subtract DatetimeArray from ndarray",
+                # pd.Timedelta(1) + Index([0, 1, 2])
+                "Cannot add or subtract Timedelta from integers",
+            ]
+        )
+        assert_invalid_addsub_type(left, other, msg)
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+
+class TestDivisionByZero:
+    def test_div_zero(self, zero, numeric_idx):
+        idx = numeric_idx
+
+        expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64)
+        # We only adjust for Index, because Series does not yet apply
+        #  the adjustment correctly.
+        expected2 = adjust_negative_zero(zero, expected)
+
+        result = idx / zero
+        tm.assert_index_equal(result, expected2)
+        ser_compat = Series(idx).astype("i8") / np.array(zero).astype("i8")
+        tm.assert_series_equal(ser_compat, Series(expected))
+
+    def test_floordiv_zero(self, zero, numeric_idx):
+        idx = numeric_idx
+
+        expected = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64)
+        # We only adjust for Index, because Series does not yet apply
+        #  the adjustment correctly.
+        expected2 = adjust_negative_zero(zero, expected)
+
+        result = idx // zero
+        tm.assert_index_equal(result, expected2)
+        ser_compat = Series(idx).astype("i8") // np.array(zero).astype("i8")
+        tm.assert_series_equal(ser_compat, Series(expected))
+
+    def test_mod_zero(self, zero, numeric_idx):
+        idx = numeric_idx
+
+        expected = Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64)
+        result = idx % zero
+        tm.assert_index_equal(result, expected)
+        ser_compat = Series(idx).astype("i8") % np.array(zero).astype("i8")
+        tm.assert_series_equal(ser_compat, Series(result))
+
+    def test_divmod_zero(self, zero, numeric_idx):
+        idx = numeric_idx
+
+        exleft = Index([np.nan, np.inf, np.inf, np.inf, np.inf], dtype=np.float64)
+        exright = Index([np.nan, np.nan, np.nan, np.nan, np.nan], dtype=np.float64)
+        exleft = adjust_negative_zero(zero, exleft)
+
+        result = divmod(idx, zero)
+        tm.assert_index_equal(result[0], exleft)
+        tm.assert_index_equal(result[1], exright)
+
+    @pytest.mark.parametrize("op", [operator.truediv, operator.floordiv])
+    def test_div_negative_zero(self, zero, numeric_idx, op):
+        # Check that -1 / -0.0 returns np.inf, not -np.inf
+        if numeric_idx.dtype == np.uint64:
+            pytest.skip(f"Div by negative 0 not relevant for {numeric_idx.dtype}")
+        idx = numeric_idx - 3
+
+        expected = Index([-np.inf, -np.inf, -np.inf, np.nan, np.inf], dtype=np.float64)
+        expected = adjust_negative_zero(zero, expected)
+
+        result = op(idx, zero)
+        tm.assert_index_equal(result, expected)
+
+    # ------------------------------------------------------------------
+
+    @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64])
+    def test_ser_div_ser(
+        self,
+        switch_numexpr_min_elements,
+        dtype1,
+        any_real_numpy_dtype,
+    ):
+        # no longer do integer div for any ops, but deal with the 0's
+        dtype2 = any_real_numpy_dtype
+
+        first = Series([3, 4, 5, 8], name="first").astype(dtype1)
+        second = Series([0, 0, 0, 3], name="second").astype(dtype2)
+
+        with np.errstate(all="ignore"):
+            expected = Series(
+                first.values.astype(np.float64) / second.values,
+                dtype="float64",
+                name=None,
+            )
+        expected.iloc[0:3] = np.inf
+        if first.dtype == "int64" and second.dtype == "float32":
+            # when using numexpr, the casting rules are slightly different
+            # and int64/float32 combo results in float32 instead of float64
+            if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0:
+                expected = expected.astype("float32")
+
+        result = first / second
+        tm.assert_series_equal(result, expected)
+        assert not result.equals(second / first)
+
+    @pytest.mark.parametrize("dtype1", [np.int64, np.float64, np.uint64])
+    def test_ser_divmod_zero(self, dtype1, any_real_numpy_dtype):
+        # GH#26987
+        dtype2 = any_real_numpy_dtype
+        left = Series([1, 1]).astype(dtype1)
+        right = Series([0, 2]).astype(dtype2)
+
+        # GH#27321 pandas convention is to set 1 // 0 to np.inf, as opposed
+        #  to numpy which sets to np.nan; patch `expected[0]` below
+        expected = left // right, left % right
+        expected = list(expected)
+        expected[0] = expected[0].astype(np.float64)
+        expected[0][0] = np.inf
+        result = divmod(left, right)
+
+        tm.assert_series_equal(result[0], expected[0])
+        tm.assert_series_equal(result[1], expected[1])
+
+        # rdivmod case
+        result = divmod(left.values, right)
+        tm.assert_series_equal(result[0], expected[0])
+        tm.assert_series_equal(result[1], expected[1])
+
+    def test_ser_divmod_inf(self):
+        left = Series([np.inf, 1.0])
+        right = Series([np.inf, 2.0])
+
+        expected = left // right, left % right
+        result = divmod(left, right)
+
+        tm.assert_series_equal(result[0], expected[0])
+        tm.assert_series_equal(result[1], expected[1])
+
+        # rdivmod case
+        result = divmod(left.values, right)
+        tm.assert_series_equal(result[0], expected[0])
+        tm.assert_series_equal(result[1], expected[1])
+
+    def test_rdiv_zero_compat(self):
+        # GH#8674
+        zero_array = np.array([0] * 5)
+        data = np.random.default_rng(2).standard_normal(5)
+        expected = Series([0.0] * 5)
+
+        result = zero_array / Series(data)
+        tm.assert_series_equal(result, expected)
+
+        result = Series(zero_array) / data
+        tm.assert_series_equal(result, expected)
+
+        result = Series(zero_array) / Series(data)
+        tm.assert_series_equal(result, expected)
+
+    def test_div_zero_inf_signs(self):
+        # GH#9144, inf signing
+        ser = Series([-1, 0, 1], name="first")
+        expected = Series([-np.inf, np.nan, np.inf], name="first")
+
+        result = ser / 0
+        tm.assert_series_equal(result, expected)
+
+    def test_rdiv_zero(self):
+        # GH#9144
+        ser = Series([-1, 0, 1], name="first")
+        expected = Series([0.0, np.nan, 0.0], name="first")
+
+        result = 0 / ser
+        tm.assert_series_equal(result, expected)
+
+    def test_floordiv_div(self):
+        # GH#9144
+        ser = Series([-1, 0, 1], name="first")
+
+        result = ser // 0
+        expected = Series([-np.inf, np.nan, np.inf], name="first")
+        tm.assert_series_equal(result, expected)
+
+    def test_df_div_zero_df(self):
+        # integer div, but deal with the 0's (GH#9144)
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+        result = df / df
+
+        first = Series([1.0, 1.0, 1.0, 1.0])
+        second = Series([np.nan, np.nan, np.nan, 1])
+        expected = pd.DataFrame({"first": first, "second": second})
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_div_zero_array(self):
+        # integer div, but deal with the 0's (GH#9144)
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+
+        first = Series([1.0, 1.0, 1.0, 1.0])
+        second = Series([np.nan, np.nan, np.nan, 1])
+        expected = pd.DataFrame({"first": first, "second": second})
+
+        with np.errstate(all="ignore"):
+            arr = df.values.astype("float") / df.values
+        result = pd.DataFrame(arr, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_div_zero_int(self):
+        # integer div, but deal with the 0's (GH#9144)
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+
+        result = df / 0
+        expected = pd.DataFrame(np.inf, index=df.index, columns=df.columns)
+        expected.iloc[0:3, 1] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+        # numpy has a slightly different (wrong) treatment
+        with np.errstate(all="ignore"):
+            arr = df.values.astype("float64") / 0
+        result2 = pd.DataFrame(arr, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(result2, expected)
+
+    def test_df_div_zero_series_does_not_commute(self):
+        # integer div, but deal with the 0's (GH#9144)
+        df = pd.DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
+        ser = df[0]
+        res = ser / df
+        res2 = df / ser
+        assert not res.fillna(0).equals(res2.fillna(0))
+
+    # ------------------------------------------------------------------
+    # Mod By Zero
+
+    def test_df_mod_zero_df(self):
+        # GH#3590, modulo as ints
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+        # this is technically wrong, as the integer portion is coerced to float
+        first = Series([0, 0, 0, 0])
+        first = first.astype("float64")
+        second = Series([np.nan, np.nan, np.nan, 0])
+        expected = pd.DataFrame({"first": first, "second": second})
+        result = df % df
+        tm.assert_frame_equal(result, expected)
+
+        # GH#38939 If we dont pass copy=False, df is consolidated and
+        #  result["first"] is float64 instead of int64
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]}, copy=False)
+        first = Series([0, 0, 0, 0], dtype="int64")
+        second = Series([np.nan, np.nan, np.nan, 0])
+        expected = pd.DataFrame({"first": first, "second": second})
+        result = df % df
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_mod_zero_array(self):
+        # GH#3590, modulo as ints
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+
+        # this is technically wrong, as the integer portion is coerced to float
+        # ###
+        first = Series([0, 0, 0, 0], dtype="float64")
+        second = Series([np.nan, np.nan, np.nan, 0])
+        expected = pd.DataFrame({"first": first, "second": second})
+
+        # numpy has a slightly different (wrong) treatment
+        with np.errstate(all="ignore"):
+            arr = df.values % df.values
+        result2 = pd.DataFrame(arr, index=df.index, columns=df.columns, dtype="float64")
+        result2.iloc[0:3, 1] = np.nan
+        tm.assert_frame_equal(result2, expected)
+
+    def test_df_mod_zero_int(self):
+        # GH#3590, modulo as ints
+        df = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+
+        result = df % 0
+        expected = pd.DataFrame(np.nan, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(result, expected)
+
+        # numpy has a slightly different (wrong) treatment
+        with np.errstate(all="ignore"):
+            arr = df.values.astype("float64") % 0
+        result2 = pd.DataFrame(arr, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(result2, expected)
+
+    def test_df_mod_zero_series_does_not_commute(self):
+        # GH#3590, modulo as ints
+        # not commutative with series
+        df = pd.DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
+        ser = df[0]
+        res = ser % df
+        res2 = df % ser
+        assert not res.fillna(0).equals(res2.fillna(0))
+
+
+class TestMultiplicationDivision:
+    # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__
+    # for non-timestamp/timedelta/period dtypes
+
+    def test_divide_decimal(self, box_with_array):
+        # resolves issue GH#9787
+        box = box_with_array
+        ser = Series([Decimal(10)])
+        expected = Series([Decimal(5)])
+
+        ser = tm.box_expected(ser, box)
+        expected = tm.box_expected(expected, box)
+
+        result = ser / Decimal(2)
+
+        tm.assert_equal(result, expected)
+
+        result = ser // Decimal(2)
+        tm.assert_equal(result, expected)
+
+    def test_div_equiv_binop(self):
+        # Test Series.div as well as Series.__div__
+        # float/integer issue
+        # GH#7785
+        first = Series([1, 0], name="first")
+        second = Series([-0.01, -0.02], name="second")
+        expected = Series([-0.01, -np.inf])
+
+        result = second.div(first)
+        tm.assert_series_equal(result, expected, check_names=False)
+
+        result = second / first
+        tm.assert_series_equal(result, expected)
+
+    def test_div_int(self, numeric_idx):
+        idx = numeric_idx
+        result = idx / 1
+        expected = idx.astype("float64")
+        tm.assert_index_equal(result, expected)
+
+        result = idx / 2
+        expected = Index(idx.values / 2)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("op", [operator.mul, ops.rmul, operator.floordiv])
+    def test_mul_int_identity(self, op, numeric_idx, box_with_array):
+        idx = numeric_idx
+        idx = tm.box_expected(idx, box_with_array)
+
+        result = op(idx, 1)
+        tm.assert_equal(result, idx)
+
+    def test_mul_int_array(self, numeric_idx):
+        idx = numeric_idx
+        didx = idx * idx
+
+        result = idx * np.array(5, dtype="int64")
+        tm.assert_index_equal(result, idx * 5)
+
+        arr_dtype = "uint64" if idx.dtype == np.uint64 else "int64"
+        result = idx * np.arange(5, dtype=arr_dtype)
+        tm.assert_index_equal(result, didx)
+
+    def test_mul_int_series(self, numeric_idx):
+        idx = numeric_idx
+        didx = idx * idx
+
+        arr_dtype = "uint64" if idx.dtype == np.uint64 else "int64"
+        result = idx * Series(np.arange(5, dtype=arr_dtype))
+        tm.assert_series_equal(result, Series(didx))
+
+    def test_mul_float_series(self, numeric_idx):
+        idx = numeric_idx
+        rng5 = np.arange(5, dtype="float64")
+
+        result = idx * Series(rng5 + 0.1)
+        expected = Series(rng5 * (rng5 + 0.1))
+        tm.assert_series_equal(result, expected)
+
+    def test_mul_index(self, numeric_idx):
+        idx = numeric_idx
+
+        result = idx * idx
+        tm.assert_index_equal(result, idx**2)
+
+    def test_mul_datelike_raises(self, numeric_idx):
+        idx = numeric_idx
+        msg = "cannot perform __rmul__ with this index type"
+        with pytest.raises(TypeError, match=msg):
+            idx * date_range("20130101", periods=5)
+
+    def test_mul_size_mismatch_raises(self, numeric_idx):
+        idx = numeric_idx
+        msg = "operands could not be broadcast together"
+        with pytest.raises(ValueError, match=msg):
+            idx * idx[0:3]
+        with pytest.raises(ValueError, match=msg):
+            idx * np.array([1, 2])
+
+    @pytest.mark.parametrize("op", [operator.pow, ops.rpow])
+    def test_pow_float(self, op, numeric_idx, box_with_array):
+        # test power calculations both ways, GH#14973
+        box = box_with_array
+        idx = numeric_idx
+        expected = Index(op(idx.values, 2.0))
+
+        idx = tm.box_expected(idx, box)
+        expected = tm.box_expected(expected, box)
+
+        result = op(idx, 2.0)
+        tm.assert_equal(result, expected)
+
+    def test_modulo(self, numeric_idx, box_with_array):
+        # GH#9244
+        box = box_with_array
+        idx = numeric_idx
+        expected = Index(idx.values % 2)
+
+        idx = tm.box_expected(idx, box)
+        expected = tm.box_expected(expected, box)
+
+        result = idx % 2
+        tm.assert_equal(result, expected)
+
+    def test_divmod_scalar(self, numeric_idx):
+        idx = numeric_idx
+
+        result = divmod(idx, 2)
+        with np.errstate(all="ignore"):
+            div, mod = divmod(idx.values, 2)
+
+        expected = Index(div), Index(mod)
+        for r, e in zip(result, expected, strict=True):
+            tm.assert_index_equal(r, e)
+
+    def test_divmod_ndarray(self, numeric_idx):
+        idx = numeric_idx
+        other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2
+
+        result = divmod(idx, other)
+        with np.errstate(all="ignore"):
+            div, mod = divmod(idx.values, other)
+
+        expected = Index(div), Index(mod)
+        for r, e in zip(result, expected, strict=True):
+            tm.assert_index_equal(r, e)
+
+    def test_divmod_series(self, numeric_idx):
+        idx = numeric_idx
+        other = np.ones(idx.values.shape, dtype=idx.values.dtype) * 2
+
+        result = divmod(idx, Series(other))
+        with np.errstate(all="ignore"):
+            div, mod = divmod(idx.values, other)
+
+        expected = Series(div), Series(mod)
+        for r, e in zip(result, expected, strict=True):
+            tm.assert_series_equal(r, e)
+
+    @pytest.mark.parametrize("other", [np.nan, 7, -23, 2.718, -3.14, np.inf])
+    def test_ops_np_scalar(self, other):
+        vals = np.random.default_rng(2).standard_normal((5, 3))
+        f = lambda x: pd.DataFrame(
+            x, index=list("ABCDE"), columns=["jim", "joe", "jolie"]
+        )
+
+        df = f(vals)
+
+        tm.assert_frame_equal(df / np.array(other), f(vals / other))
+        tm.assert_frame_equal(np.array(other) * df, f(vals * other))
+        tm.assert_frame_equal(df + np.array(other), f(vals + other))
+        tm.assert_frame_equal(np.array(other) - df, f(other - vals))
+
+    # TODO: This came from series.test.test_operators, needs cleanup
+    def test_operators_frame(self):
+        # rpow does not work with DataFrame
+        ts = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        ts.name = "ts"
+
+        df = pd.DataFrame({"A": ts})
+
+        tm.assert_series_equal(ts + ts, ts + df["A"], check_names=False)
+        tm.assert_series_equal(ts**ts, ts ** df["A"], check_names=False)
+        tm.assert_series_equal(ts < ts, ts < df["A"], check_names=False)
+        tm.assert_series_equal(ts / ts, ts / df["A"], check_names=False)
+
+    # TODO: this came from tests.series.test_analytics, needs cleanup and
+    #  de-duplication with test_modulo above
+    def test_modulo2(self):
+        with np.errstate(all="ignore"):
+            # GH#3590, modulo as ints
+            p = pd.DataFrame({"first": [3, 4, 5, 8], "second": [0, 0, 0, 3]})
+            result = p["first"] % p["second"]
+            expected = Series(p["first"].values % p["second"].values, dtype="float64")
+            expected.iloc[0:3] = np.nan
+            tm.assert_series_equal(result, expected)
+
+            result = p["first"] % 0
+            expected = Series(np.nan, index=p.index, name="first")
+            tm.assert_series_equal(result, expected)
+
+            p = p.astype("float64")
+            result = p["first"] % p["second"]
+            expected = Series(p["first"].values % p["second"].values)
+            tm.assert_series_equal(result, expected)
+
+            p = p.astype("float64")
+            result = p["first"] % p["second"]
+            result2 = p["second"] % p["first"]
+            assert not result.equals(result2)
+
+    def test_modulo_zero_int(self):
+        # GH#9144
+        with np.errstate(all="ignore"):
+            s = Series([0, 1])
+
+            result = s % 0
+            expected = Series([np.nan, np.nan])
+            tm.assert_series_equal(result, expected)
+
+            result = 0 % s
+            expected = Series([np.nan, 0.0])
+            tm.assert_series_equal(result, expected)
+
+    def test_non_1d_ea_raises_notimplementederror(self):
+        # GH#61866
+        ea_array = array([1, 2, 3, 4, 5], dtype="Int64").reshape(5, 1)
+        np_array = np.array([1, 2, 3, 4, 5], dtype=np.int64).reshape(5, 1)
+
+        msg = "can only perform ops with 1-d structures"
+
+        with pytest.raises(NotImplementedError, match=msg):
+            ea_array * np_array
+
+        with pytest.raises(NotImplementedError, match=msg):
+            np_array * ea_array
+
+
+class TestAdditionSubtraction:
+    # __add__, __sub__, __radd__, __rsub__, __iadd__, __isub__
+    # for non-timestamp/timedelta/period dtypes
+
+    @pytest.mark.parametrize(
+        "first, second, expected",
+        [
+            (
+                Series([1, 2, 3], index=list("ABC"), name="x"),
+                Series([2, 2, 2], index=list("ABD"), name="x"),
+                Series([3.0, 4.0, np.nan, np.nan], index=list("ABCD"), name="x"),
+            ),
+            (
+                Series([1, 2, 3], index=list("ABC"), name="x"),
+                Series([2, 2, 2, 2], index=list("ABCD"), name="x"),
+                Series([3, 4, 5, np.nan], index=list("ABCD"), name="x"),
+            ),
+        ],
+    )
+    def test_add_series(self, first, second, expected):
+        # GH#1134
+        tm.assert_series_equal(first + second, expected)
+        tm.assert_series_equal(second + first, expected)
+
+    @pytest.mark.parametrize(
+        "first, second, expected",
+        [
+            (
+                pd.DataFrame({"x": [1, 2, 3]}, index=list("ABC")),
+                pd.DataFrame({"x": [2, 2, 2]}, index=list("ABD")),
+                pd.DataFrame({"x": [3.0, 4.0, np.nan, np.nan]}, index=list("ABCD")),
+            ),
+            (
+                pd.DataFrame({"x": [1, 2, 3]}, index=list("ABC")),
+                pd.DataFrame({"x": [2, 2, 2, 2]}, index=list("ABCD")),
+                pd.DataFrame({"x": [3, 4, 5, np.nan]}, index=list("ABCD")),
+            ),
+        ],
+    )
+    def test_add_frames(self, first, second, expected):
+        # GH#1134
+        tm.assert_frame_equal(first + second, expected)
+        tm.assert_frame_equal(second + first, expected)
+
+    # TODO: This came from series.test.test_operators, needs cleanup
+    def test_series_frame_radd_bug(self, fixed_now_ts):
+        # GH#353
+        vals = Series([str(i) for i in range(5)])
+        result = "foo_" + vals
+        expected = vals.map(lambda x: "foo_" + x)
+        tm.assert_series_equal(result, expected)
+
+        frame = pd.DataFrame({"vals": vals})
+        result = "foo_" + frame
+        expected = pd.DataFrame({"vals": vals.map(lambda x: "foo_" + x)})
+        tm.assert_frame_equal(result, expected)
+
+        ts = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+
+        # really raise this time
+        fix_now = fixed_now_ts.to_pydatetime()
+        msg = "|".join(
+            [
+                "unsupported operand type",
+                # wrong error message, see https://github.com/numpy/numpy/issues/18832
+                "Concatenation operation",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            fix_now + ts
+
+        with pytest.raises(TypeError, match=msg):
+            ts + fix_now
+
+    # TODO: This came from series.test.test_operators, needs cleanup
+    def test_datetime64_with_index(self):
+        # arithmetic integer ops with an index
+        ser = Series(np.random.default_rng(2).standard_normal(5))
+        expected = ser - ser.index.to_series()
+        result = ser - ser.index
+        tm.assert_series_equal(result, expected)
+
+        # GH#4629
+        # arithmetic datetime64 ops with an index
+        ser = Series(
+            date_range("20130101", periods=5),
+            index=date_range("20130101", periods=5),
+        )
+        expected = ser - ser.index.to_series()
+        result = ser - ser.index
+        tm.assert_series_equal(result, expected)
+
+        msg = "cannot subtract PeriodArray from DatetimeArray"
+        with pytest.raises(TypeError, match=msg):
+            # GH#18850
+            result = ser - ser.index.to_period()
+
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)),
+            index=date_range("20130101", periods=5),
+        )
+        df["date"] = pd.Timestamp("20130102")
+        df["expected"] = df["date"] - df.index.to_series()
+        df["result"] = df["date"] - df.index
+        tm.assert_series_equal(df["result"], df["expected"], check_names=False)
+
+    # TODO: taken from tests.frame.test_operators, needs cleanup
+    def test_frame_operators(self, float_frame):
+        frame = float_frame
+
+        garbage = np.random.default_rng(2).random(4)
+        colSeries = Series(garbage, index=np.array(frame.columns))
+
+        idSum = frame + frame
+        seriesSum = frame + colSeries
+
+        for col, series in idSum.items():
+            for idx, val in series.items():
+                origVal = frame[col][idx] * 2
+                if not np.isnan(val):
+                    assert val == origVal
+                else:
+                    assert np.isnan(origVal)
+
+        for col, series in seriesSum.items():
+            for idx, val in series.items():
+                origVal = frame[col][idx] + colSeries[col]
+                if not np.isnan(val):
+                    assert val == origVal
+                else:
+                    assert np.isnan(origVal)
+
+    def test_frame_operators_col_align(self, float_frame):
+        frame2 = pd.DataFrame(float_frame, columns=["D", "C", "B", "A"])
+        added = frame2 + frame2
+        expected = frame2 * 2
+        tm.assert_frame_equal(added, expected)
+
+    def test_frame_operators_none_to_nan(self):
+        df = pd.DataFrame({"a": ["a", None, "b"]})
+        tm.assert_frame_equal(df + df, pd.DataFrame({"a": ["aa", np.nan, "bb"]}))
+
+    @pytest.mark.parametrize("dtype", ("float", "int64"))
+    def test_frame_operators_empty_like(self, dtype):
+        # Test for issue #10181
+        frames = [
+            pd.DataFrame(dtype=dtype),
+            pd.DataFrame(columns=["A"], dtype=dtype),
+            pd.DataFrame(index=[0], dtype=dtype),
+        ]
+        for df in frames:
+            assert (df + df).equals(df)
+            tm.assert_frame_equal(df + df, df)
+
+    @pytest.mark.parametrize(
+        "func",
+        [lambda x: x * 2, lambda x: x[::2], lambda x: 5],
+        ids=["multiply", "slice", "constant"],
+    )
+    def test_series_operators_arithmetic(self, all_arithmetic_functions, func):
+        op = all_arithmetic_functions
+        series = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        other = func(series)
+        compare_op(series, other, op)
+
+    @pytest.mark.parametrize(
+        "func", [lambda x: x + 1, lambda x: 5], ids=["add", "constant"]
+    )
+    def test_series_operators_compare(self, comparison_op, func):
+        op = comparison_op
+        series = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        other = func(series)
+        compare_op(series, other, op)
+
+    @pytest.mark.parametrize(
+        "func",
+        [lambda x: x * 2, lambda x: x[::2], lambda x: 5],
+        ids=["multiply", "slice", "constant"],
+    )
+    def test_divmod(self, func):
+        series = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        other = func(series)
+        results = divmod(series, other)
+        if isinstance(other, abc.Iterable) and len(series) != len(other):
+            # if the lengths don't match, this is the test where we use
+            # `tser[::2]`. Pad every other value in `other_np` with nan.
+            other_np = []
+            for n in other:
+                other_np.append(n)
+                other_np.append(np.nan)
+        else:
+            other_np = other
+        other_np = np.asarray(other_np)
+        with np.errstate(all="ignore"):
+            expecteds = divmod(series.values, np.asarray(other_np))
+
+        for result, expected in zip(results, expecteds, strict=True):
+            # check the values, name, and index separately
+            tm.assert_almost_equal(np.asarray(result), expected)
+
+            assert result.name == series.name
+            tm.assert_index_equal(result.index, series.index._with_freq(None))
+
+    def test_series_divmod_zero(self):
+        # Check that divmod uses pandas convention for division by zero,
+        #  which does not match numpy.
+        # pandas convention has
+        #  1/0 == np.inf
+        #  -1/0 == -np.inf
+        #  1/-0.0 == -np.inf
+        #  -1/-0.0 == np.inf
+        tser = Series(
+            np.arange(1, 11, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        other = tser * 0
+
+        result = divmod(tser, other)
+        exp1 = Series([np.inf] * len(tser), index=tser.index, name="ts")
+        exp2 = Series([np.nan] * len(tser), index=tser.index, name="ts")
+        tm.assert_series_equal(result[0], exp1)
+        tm.assert_series_equal(result[1], exp2)
+
+
+class TestUFuncCompat:
+    # TODO: add more dtypes
+    @pytest.mark.parametrize("holder", [Index, RangeIndex, Series])
+    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
+    def test_ufunc_compat(self, holder, dtype):
+        box = Series if holder is Series else Index
+
+        if holder is RangeIndex:
+            if dtype != np.int64:
+                pytest.skip(f"dtype {dtype} not relevant for RangeIndex")
+            idx = RangeIndex(0, 5, name="foo")
+        else:
+            idx = holder(np.arange(5, dtype=dtype), name="foo")
+        result = np.sin(idx)
+        expected = box(np.sin(np.arange(5, dtype=dtype)), name="foo")
+        tm.assert_equal(result, expected)
+
+    # TODO: add more dtypes
+    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
+    def test_ufunc_coercions(self, index_or_series, dtype):
+        idx = index_or_series([1, 2, 3, 4, 5], dtype=dtype, name="x")
+        box = index_or_series
+
+        result = np.sqrt(idx)
+        assert result.dtype == "f8" and isinstance(result, box)
+        exp = Index(np.sqrt(np.array([1, 2, 3, 4, 5], dtype=np.float64)), name="x")
+        exp = tm.box_expected(exp, box)
+        tm.assert_equal(result, exp)
+
+        result = np.divide(idx, 2.0)
+        assert result.dtype == "f8" and isinstance(result, box)
+        exp = Index([0.5, 1.0, 1.5, 2.0, 2.5], dtype=np.float64, name="x")
+        exp = tm.box_expected(exp, box)
+        tm.assert_equal(result, exp)
+
+        # _evaluate_numeric_binop
+        result = idx + 2.0
+        assert result.dtype == "f8" and isinstance(result, box)
+        exp = Index([3.0, 4.0, 5.0, 6.0, 7.0], dtype=np.float64, name="x")
+        exp = tm.box_expected(exp, box)
+        tm.assert_equal(result, exp)
+
+        result = idx - 2.0
+        assert result.dtype == "f8" and isinstance(result, box)
+        exp = Index([-1.0, 0.0, 1.0, 2.0, 3.0], dtype=np.float64, name="x")
+        exp = tm.box_expected(exp, box)
+        tm.assert_equal(result, exp)
+
+        result = idx * 1.0
+        assert result.dtype == "f8" and isinstance(result, box)
+        exp = Index([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float64, name="x")
+        exp = tm.box_expected(exp, box)
+        tm.assert_equal(result, exp)
+
+        result = idx / 2.0
+        assert result.dtype == "f8" and isinstance(result, box)
+        exp = Index([0.5, 1.0, 1.5, 2.0, 2.5], dtype=np.float64, name="x")
+        exp = tm.box_expected(exp, box)
+        tm.assert_equal(result, exp)
+
+    # TODO: add more dtypes
+    @pytest.mark.parametrize("holder", [Index, Series])
+    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
+    def test_ufunc_multiple_return_values(self, holder, dtype):
+        obj = holder([1, 2, 3], dtype=dtype, name="x")
+        box = Series if holder is Series else Index
+
+        result = np.modf(obj)
+        assert isinstance(result, tuple)
+        exp1 = Index([0.0, 0.0, 0.0], dtype=np.float64, name="x")
+        exp2 = Index([1.0, 2.0, 3.0], dtype=np.float64, name="x")
+        tm.assert_equal(result[0], tm.box_expected(exp1, box))
+        tm.assert_equal(result[1], tm.box_expected(exp2, box))
+
+    def test_ufunc_at(self):
+        s = Series([0, 1, 2], index=[1, 2, 3], name="x")
+        np.add.at(s, [0, 2], 10)
+        expected = Series([10, 1, 12], index=[1, 2, 3], name="x")
+        tm.assert_series_equal(s, expected)
+
+
+class TestObjectDtypeEquivalence:
+    # Tests that arithmetic operations match operations executed elementwise
+
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_numarr_with_dtype_add_nan(self, dtype, box_with_array):
+        box = box_with_array
+        ser = Series([1, 2, 3], dtype=dtype)
+        expected = Series([np.nan, np.nan, np.nan], dtype=dtype)
+
+        ser = tm.box_expected(ser, box)
+        expected = tm.box_expected(expected, box)
+
+        result = np.nan + ser
+        tm.assert_equal(result, expected)
+
+        result = ser + np.nan
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_numarr_with_dtype_add_int(self, dtype, box_with_array):
+        box = box_with_array
+        ser = Series([1, 2, 3], dtype=dtype)
+        expected = Series([2, 3, 4], dtype=dtype)
+
+        ser = tm.box_expected(ser, box)
+        expected = tm.box_expected(expected, box)
+
+        result = 1 + ser
+        tm.assert_equal(result, expected)
+
+        result = ser + 1
+        tm.assert_equal(result, expected)
+
+    # TODO: moved from tests.series.test_operators; needs cleanup
+    @pytest.mark.parametrize(
+        "op",
+        [operator.add, operator.sub, operator.mul, operator.truediv, operator.floordiv],
+    )
+    def test_operators_reverse_object(self, op):
+        # GH#56
+        arr = Series(
+            np.random.default_rng(2).standard_normal(10),
+            index=np.arange(10),
+            dtype=object,
+        )
+
+        result = op(1.0, arr)
+        expected = op(1.0, arr.astype(float))
+        tm.assert_series_equal(result.astype(float), expected)
+
+
+class TestNumericArithmeticUnsorted:
+    # Tests in this class have been moved from type-specific test modules
+    #  but not yet sorted, parametrized, and de-duplicated
+    @pytest.mark.parametrize(
+        "op",
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.floordiv,
+            operator.truediv,
+        ],
+    )
+    @pytest.mark.parametrize(
+        "idx1",
+        [
+            RangeIndex(0, 10, 1),
+            RangeIndex(0, 20, 2),
+            RangeIndex(-10, 10, 2),
+            RangeIndex(5, -5, -1),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "idx2",
+        [
+            RangeIndex(0, 10, 1),
+            RangeIndex(0, 20, 2),
+            RangeIndex(-10, 10, 2),
+            RangeIndex(5, -5, -1),
+        ],
+    )
+    def test_binops_index(self, op, idx1, idx2):
+        idx1 = idx1._rename("foo")
+        idx2 = idx2._rename("bar")
+        result = op(idx1, idx2)
+        expected = op(Index(idx1.to_numpy()), Index(idx2.to_numpy()))
+        tm.assert_index_equal(result, expected, exact="equiv")
+
+    @pytest.mark.parametrize(
+        "op",
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.floordiv,
+            operator.truediv,
+        ],
+    )
+    @pytest.mark.parametrize(
+        "idx",
+        [
+            RangeIndex(0, 10, 1),
+            RangeIndex(0, 20, 2),
+            RangeIndex(-10, 10, 2),
+            RangeIndex(5, -5, -1),
+        ],
+    )
+    @pytest.mark.parametrize("scalar", [-1, 1, 2])
+    def test_binops_index_scalar(self, op, idx, scalar):
+        result = op(idx, scalar)
+        expected = op(Index(idx.to_numpy()), scalar)
+        tm.assert_index_equal(result, expected, exact="equiv")
+
+    @pytest.mark.parametrize("idx1", [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)])
+    @pytest.mark.parametrize("idx2", [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)])
+    def test_binops_index_pow(self, idx1, idx2):
+        # numpy does not allow powers of negative integers so test separately
+        # https://github.com/numpy/numpy/pull/8127
+        idx1 = idx1._rename("foo")
+        idx2 = idx2._rename("bar")
+        result = pow(idx1, idx2)
+        expected = pow(Index(idx1.to_numpy()), Index(idx2.to_numpy()))
+        tm.assert_index_equal(result, expected, exact="equiv")
+
+    @pytest.mark.parametrize("idx", [RangeIndex(0, 10, 1), RangeIndex(0, 20, 2)])
+    @pytest.mark.parametrize("scalar", [1, 2])
+    def test_binops_index_scalar_pow(self, idx, scalar):
+        # numpy does not allow powers of negative integers so test separately
+        # https://github.com/numpy/numpy/pull/8127
+        result = pow(idx, scalar)
+        expected = pow(Index(idx.to_numpy()), scalar)
+        tm.assert_index_equal(result, expected, exact="equiv")
+
+    # TODO: divmod?
+    @pytest.mark.parametrize(
+        "op",
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.floordiv,
+            operator.truediv,
+            operator.pow,
+            operator.mod,
+        ],
+    )
+    def test_arithmetic_with_frame_or_series(self, op):
+        # check that we return NotImplemented when operating with Series
+        # or DataFrame
+        index = RangeIndex(5)
+        other = Series(np.random.default_rng(2).standard_normal(5))
+
+        expected = op(Series(index), other)
+        result = op(index, other)
+        tm.assert_series_equal(result, expected)
+
+        other = pd.DataFrame(np.random.default_rng(2).standard_normal((2, 5)))
+        expected = op(pd.DataFrame([index, index]), other)
+        result = op(index, other)
+        tm.assert_frame_equal(result, expected)
+
+    def test_numeric_compat2(self):
+        # validate that we are handling the RangeIndex overrides to numeric ops
+        # and returning RangeIndex where possible
+
+        idx = RangeIndex(0, 10, 2)
+
+        result = idx * 2
+        expected = RangeIndex(0, 20, 4)
+        tm.assert_index_equal(result, expected, exact=True)
+
+        result = idx + 2
+        expected = RangeIndex(2, 12, 2)
+        tm.assert_index_equal(result, expected, exact=True)
+
+        result = idx - 2
+        expected = RangeIndex(-2, 8, 2)
+        tm.assert_index_equal(result, expected, exact=True)
+
+        result = idx / 2
+        expected = RangeIndex(0, 5, 1).astype("float64")
+        tm.assert_index_equal(result, expected, exact=True)
+
+        result = idx / 4
+        expected = RangeIndex(0, 10, 2) / 4
+        tm.assert_index_equal(result, expected, exact=True)
+
+        result = idx // 1
+        expected = idx
+        tm.assert_index_equal(result, expected, exact=True)
+
+        # __mul__
+        result = idx * idx
+        expected = Index(idx.values * idx.values)
+        tm.assert_index_equal(result, expected, exact=True)
+
+        # __pow__
+        idx = RangeIndex(0, 1000, 2)
+        result = idx**2
+        expected = Index(idx._values) ** 2
+        tm.assert_index_equal(Index(result.values), expected, exact=True)
+
+    @pytest.mark.parametrize(
+        "idx, div, expected",
+        [
+            # TODO: add more dtypes
+            (RangeIndex(0, 1000, 2), 2, RangeIndex(0, 500, 1)),
+            (RangeIndex(-99, -201, -3), -3, RangeIndex(33, 67, 1)),
+            (
+                RangeIndex(0, 1000, 1),
+                2,
+                Index(RangeIndex(0, 1000, 1)._values) // 2,
+            ),
+            (
+                RangeIndex(0, 100, 1),
+                2.0,
+                Index(RangeIndex(0, 100, 1)._values) // 2.0,
+            ),
+            (RangeIndex(0), 50, RangeIndex(0)),
+            (RangeIndex(2, 4, 2), 3, RangeIndex(0, 1, 1)),
+            (RangeIndex(-5, -10, -6), 4, RangeIndex(-2, -1, 1)),
+            (RangeIndex(-100, -200, 3), 2, RangeIndex(0)),
+        ],
+    )
+    def test_numeric_compat2_floordiv(self, idx, div, expected):
+        # __floordiv__
+        tm.assert_index_equal(idx // div, expected, exact=True)
+
+    @pytest.mark.parametrize("dtype", [np.int64, np.float64])
+    @pytest.mark.parametrize("delta", [1, 0, -1])
+    def test_addsub_arithmetic(self, dtype, delta):
+        # GH#8142
+        delta = dtype(delta)
+        index = Index([10, 11, 12], dtype=dtype)
+        result = index + delta
+        expected = Index(index.values + delta, dtype=dtype)
+        tm.assert_index_equal(result, expected)
+
+        # this subtraction used to fail
+        result = index - delta
+        expected = Index(index.values - delta, dtype=dtype)
+        tm.assert_index_equal(result, expected)
+
+        tm.assert_index_equal(index + index, 2 * index)
+        tm.assert_index_equal(index - index, 0 * index)
+        assert not (index - index).empty
+
+    def test_pow_nan_with_zero(self, box_with_array):
+        left = Index([np.nan, np.nan, np.nan])
+        right = Index([0, 0, 0])
+        expected = Index([1.0, 1.0, 1.0])
+
+        left = tm.box_expected(left, box_with_array)
+        right = tm.box_expected(right, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = left**right
+        tm.assert_equal(result, expected)
+
+
+def test_fill_value_inf_masking():
+    # GH #27464 make sure we mask 0/1 with Inf and not NaN
+    df = pd.DataFrame({"A": [0, 1, 2], "B": [1.1, None, 1.1]})
+
+    other = pd.DataFrame({"A": [1.1, 1.2, 1.3]}, index=[0, 2, 3])
+
+    result = df.rfloordiv(other, fill_value=1)
+
+    expected = pd.DataFrame(
+        {"A": [np.inf, 1.0, 0.0, 1.0], "B": [0.0, np.nan, 0.0, np.nan]}
+    )
+    tm.assert_frame_equal(result, expected, check_index_type=False)
+
+
+def test_dataframe_div_silenced():
+    # GH#26793
+    pdf1 = pd.DataFrame(
+        {
+            "A": np.arange(10),
+            "B": [np.nan, 1, 2, 3, 4] * 2,
+            "C": [np.nan] * 10,
+            "D": np.arange(10),
+        },
+        index=list("abcdefghij"),
+        columns=list("ABCD"),
+    )
+    pdf2 = pd.DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        index=list("abcdefghjk"),
+        columns=list("ABCX"),
+    )
+    with tm.assert_produces_warning(None):
+        pdf1.div(pdf2, fill_value=0)
+
+
+@pytest.mark.parametrize(
+    "data, expected_data",
+    [([0, 1, 2], [0, 2, 4])],
+)
+@pytest.mark.parametrize("box_pandas_1d_array", [Index, Series, tm.to_array])
+@pytest.mark.parametrize("box_1d_array", [Index, Series, tm.to_array, np.array, list])
+def test_integer_array_add_list_like(
+    box_pandas_1d_array, box_1d_array, data, expected_data
+):
+    # GH22606 Verify operators with IntegerArray and list-likes
+    arr = array(data, dtype="Int64")
+    container = box_pandas_1d_array(arr)
+    left = container + box_1d_array(data)
+    right = box_1d_array(data) + container
+
+    if Series in [box_1d_array, box_pandas_1d_array]:
+        cls = Series
+    elif Index in [box_1d_array, box_pandas_1d_array]:
+        cls = Index
+    else:
+        cls = array
+
+    expected = cls(expected_data, dtype="Int64")
+
+    tm.assert_equal(left, expected)
+    tm.assert_equal(right, expected)
+
+
+def test_sub_multiindex_swapped_levels():
+    # GH 9952
+    df = pd.DataFrame(
+        {"a": np.random.default_rng(2).standard_normal(6)},
+        index=pd.MultiIndex.from_product(
+            [["a", "b"], [0, 1, 2]], names=["levA", "levB"]
+        ),
+    )
+    df2 = df.copy()
+    df2.index = df2.index.swaplevel(0, 1)
+    result = df - df2
+    expected = pd.DataFrame([0.0] * 6, columns=["a"], index=df.index)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("power", [1, 2, 5])
+@pytest.mark.parametrize("string_size", [0, 1, 2, 5])
+def test_empty_str_comparison(power, string_size):
+    # GH 37348
+    a = np.array(range(10**power))
+    right = pd.DataFrame(a, dtype=np.int64)
+    left = " " * string_size
+
+    result = right == left
+    expected = pd.DataFrame(np.zeros(right.shape, dtype=bool))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_series_add_sub_with_UInt64():
+    # GH 22023
+    series1 = Series([1, 2, 3])
+    series2 = Series([2, 1, 3], dtype="UInt64")
+
+    result = series1 + series2
+    expected = Series([3, 3, 6], dtype="Float64")
+    tm.assert_series_equal(result, expected)
+
+    result = series1 - series2
+    expected = Series([-1, 1, 0], dtype="Float64")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0f78d3aa01af5353bc9385848fb49a7784020d
--- /dev/null
+++ b/pandas/tests/arithmetic/test_object.py
@@ -0,0 +1,410 @@
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for object dtype
+import datetime
+from decimal import Decimal
+import operator
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    Series,
+    Timestamp,
+    option_context,
+)
+import pandas._testing as tm
+from pandas.core import ops
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestObjectComparisons:
+    def test_comparison_object_numeric_nas(self, comparison_op):
+        ser = Series(np.random.default_rng(2).standard_normal(10), dtype=object)
+        shifted = ser.shift(2)
+
+        func = comparison_op
+
+        result = func(ser, shifted)
+        expected = func(ser.astype(float), shifted.astype(float))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+    )
+    def test_object_comparisons(self, infer_string):
+        with option_context("future.infer_string", infer_string):
+            ser = Series(["a", "b", np.nan, "c", "a"])
+
+            result = ser == "a"
+            expected = Series([True, False, False, False, True])
+            tm.assert_series_equal(result, expected)
+
+            result = ser < "a"
+            expected = Series([False, False, False, False, False])
+            tm.assert_series_equal(result, expected)
+
+            result = ser != "a"
+            expected = -(ser == "a")
+            tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_more_na_comparisons(self, dtype):
+        left = Series(["a", np.nan, "c"], dtype=dtype)
+        right = Series(["a", np.nan, "d"], dtype=dtype)
+
+        result = left == right
+        expected = Series([True, False, False])
+        tm.assert_series_equal(result, expected)
+
+        result = left != right
+        expected = Series([False, True, True])
+        tm.assert_series_equal(result, expected)
+
+        result = left == np.nan
+        expected = Series([False, False, False])
+        tm.assert_series_equal(result, expected)
+
+        result = left != np.nan
+        expected = Series([True, True, True])
+        tm.assert_series_equal(result, expected)
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+
+class TestArithmetic:
+    def test_add_period_to_array_of_offset(self):
+        # GH#50162
+        per = pd.Period("2012-1-1", freq="D")
+        pi = pd.period_range("2012-1-1", periods=10, freq="D")
+        idx = per - pi
+
+        expected = pd.Index([x + per for x in idx], dtype=object)
+        result = idx + per
+        tm.assert_index_equal(result, expected)
+
+        result = per + idx
+        tm.assert_index_equal(result, expected)
+
+    # TODO: parametrize
+    def test_pow_ops_object(self):
+        # GH#22922
+        # pow is weird with masking & 1, so testing here
+        a = Series([1, np.nan, 1, np.nan], dtype=object)
+        b = Series([1, np.nan, np.nan, 1], dtype=object)
+        result = a**b
+        expected = Series(a.values**b.values, dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        result = b**a
+        expected = Series(b.values**a.values, dtype=object)
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("op", [operator.add, ops.radd])
+    @pytest.mark.parametrize("other", ["category", "Int64"])
+    def test_add_extension_scalar(self, other, box_with_array, op):
+        # GH#22378
+        # Check that scalars satisfying is_extension_array_dtype(obj)
+        # do not incorrectly try to dispatch to an ExtensionArray operation
+
+        arr = Series(["a", "b", "c"])
+        expected = Series([op(x, other) for x in arr])
+
+        arr = tm.box_expected(arr, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = op(arr, other)
+        tm.assert_equal(result, expected)
+
+    def test_objarr_add_str(self, box_with_array):
+        ser = Series(["x", np.nan, "x"])
+        expected = Series(["xa", np.nan, "xa"])
+
+        ser = tm.box_expected(ser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = ser + "a"
+        tm.assert_equal(result, expected)
+
+    def test_objarr_radd_str(self, box_with_array):
+        ser = Series(["x", np.nan, "x"])
+        expected = Series(["ax", np.nan, "ax"])
+
+        ser = tm.box_expected(ser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = "a" + ser
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            [1, 2, 3],
+            [1.1, 2.2, 3.3],
+            [Timestamp("2011-01-01"), Timestamp("2011-01-02"), pd.NaT],
+            ["x", "y", 1],
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_objarr_radd_str_invalid(self, dtype, data, box_with_array):
+        ser = Series(data, dtype=dtype)
+
+        ser = tm.box_expected(ser, box_with_array)
+        msg = "|".join(
+            [
+                "can only concatenate str",
+                "did not contain a loop with signature matching types",
+                "unsupported operand type",
+                "must be str",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            "foo_" + ser
+
+    @pytest.mark.parametrize("op", [operator.add, ops.radd, operator.sub, ops.rsub])
+    def test_objarr_add_invalid(self, op, box_with_array):
+        # invalid ops
+        box = box_with_array
+
+        obj_ser = Series(list("abc"), dtype=object, name="objects")
+
+        obj_ser = tm.box_expected(obj_ser, box)
+        msg = "|".join(
+            [
+                "can only concatenate str",
+                "unsupported operand type",
+                "must be str",
+                "has no kernel",
+            ]
+        )
+        with pytest.raises(Exception, match=msg):
+            op(obj_ser, 1)
+        with pytest.raises(Exception, match=msg):
+            op(obj_ser, np.array(1, dtype=np.int64))
+
+    # TODO: Moved from tests.series.test_operators; needs cleanup
+    def test_operators_na_handling(self):
+        ser = Series(["foo", "bar", "baz", np.nan])
+        result = "prefix_" + ser
+        expected = Series(["prefix_foo", "prefix_bar", "prefix_baz", np.nan])
+        tm.assert_series_equal(result, expected)
+
+        result = ser + "_suffix"
+        expected = Series(["foo_suffix", "bar_suffix", "baz_suffix", np.nan])
+        tm.assert_series_equal(result, expected)
+
+    # TODO: parametrize over box
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_series_with_dtype_radd_timedelta(self, dtype):
+        # note this test is _not_ aimed at timedelta64-dtyped Series
+        # as of 2.0 we retain object dtype when ser.dtype == object
+        ser = Series(
+            [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.Timedelta("3 days")],
+            dtype=dtype,
+        )
+        expected = Series(
+            [pd.Timedelta("4 days"), pd.Timedelta("5 days"), pd.Timedelta("6 days")],
+            dtype=dtype,
+        )
+
+        result = pd.Timedelta("3 days") + ser
+        tm.assert_series_equal(result, expected)
+
+        result = ser + pd.Timedelta("3 days")
+        tm.assert_series_equal(result, expected)
+
+    # TODO: cleanup & parametrize over box
+    def test_mixed_timezone_series_ops_object(self):
+        # GH#13043
+        ser = Series(
+            [
+                Timestamp("2015-01-01", tz="US/Eastern"),
+                Timestamp("2015-01-01", tz="Asia/Tokyo"),
+            ],
+            name="xxx",
+        )
+        assert ser.dtype == object
+
+        exp = Series(
+            [
+                Timestamp("2015-01-02", tz="US/Eastern"),
+                Timestamp("2015-01-02", tz="Asia/Tokyo"),
+            ],
+            name="xxx",
+        )
+        tm.assert_series_equal(ser + pd.Timedelta("1 days"), exp)
+        tm.assert_series_equal(pd.Timedelta("1 days") + ser, exp)
+
+        # object series & object series
+        ser2 = Series(
+            [
+                Timestamp("2015-01-03", tz="US/Eastern"),
+                Timestamp("2015-01-05", tz="Asia/Tokyo"),
+            ],
+            name="xxx",
+        )
+        assert ser2.dtype == object
+        exp = Series(
+            [pd.Timedelta("2 days"), pd.Timedelta("4 days")], name="xxx", dtype=object
+        )
+        tm.assert_series_equal(ser2 - ser, exp)
+        tm.assert_series_equal(ser - ser2, -exp)
+
+        ser = Series(
+            [pd.Timedelta("01:00:00"), pd.Timedelta("02:00:00")],
+            name="xxx",
+            dtype=object,
+        )
+        assert ser.dtype == object
+
+        exp = Series(
+            [pd.Timedelta("01:30:00"), pd.Timedelta("02:30:00")],
+            name="xxx",
+            dtype=object,
+        )
+        tm.assert_series_equal(ser + pd.Timedelta("00:30:00"), exp)
+        tm.assert_series_equal(pd.Timedelta("00:30:00") + ser, exp)
+
+    # TODO: cleanup & parametrize over box
+    def test_iadd_preserves_name(self):
+        # GH#17067, GH#19723 __iadd__ and __isub__ should preserve index name
+        ser = Series([1, 2, 3])
+        ser.index.name = "foo"
+
+        ser.index += 1
+        assert ser.index.name == "foo"
+
+        ser.index -= 1
+        assert ser.index.name == "foo"
+
+    def test_add_string(self):
+        # from bug report
+        index = pd.Index(["a", "b", "c"])
+        index2 = index + "foo"
+
+        assert "a" not in index2
+        assert "afoo" in index2
+
+    def test_iadd_string(self):
+        index = pd.Index(["a", "b", "c"])
+        # doesn't fail test unless there is a check before `+=`
+        assert "a" in index
+
+        index += "_x"
+        assert "a_x" in index
+
+    def test_add(self):
+        index = pd.Index([str(i) for i in range(10)])
+        expected = pd.Index(index.values * 2)
+        tm.assert_index_equal(index + index, expected)
+        tm.assert_index_equal(index + index.tolist(), expected)
+        tm.assert_index_equal(index.tolist() + index, expected)
+
+        # test add and radd
+        index = pd.Index(list("abc"))
+        expected = pd.Index(["a1", "b1", "c1"])
+        tm.assert_index_equal(index + "1", expected)
+        expected = pd.Index(["1a", "1b", "1c"])
+        tm.assert_index_equal("1" + index, expected)
+
+    def test_sub_fail(self):
+        index = pd.Index([str(i) for i in range(10)])
+
+        msg = "unsupported operand type|Cannot broadcast|sub' not supported"
+        with pytest.raises(TypeError, match=msg):
+            index - "a"
+        with pytest.raises(TypeError, match=msg):
+            index - index
+        with pytest.raises(TypeError, match=msg):
+            index - index.tolist()
+        with pytest.raises(TypeError, match=msg):
+            index.tolist() - index
+
+    def test_sub_object(self):
+        # GH#19369
+        index = pd.Index([Decimal(1), Decimal(2)])
+        expected = pd.Index([Decimal(0), Decimal(1)])
+
+        result = index - Decimal(1)
+        tm.assert_index_equal(result, expected)
+
+        result = index - pd.Index([Decimal(1), Decimal(1)])
+        tm.assert_index_equal(result, expected)
+
+        msg = "unsupported operand type"
+        with pytest.raises(TypeError, match=msg):
+            index - "foo"
+
+        with pytest.raises(TypeError, match=msg):
+            index - np.array([2, "foo"], dtype=object)
+
+    def test_rsub_object(self, fixed_now_ts):
+        # GH#19369
+        index = pd.Index([Decimal(1), Decimal(2)])
+        expected = pd.Index([Decimal(1), Decimal(0)])
+
+        result = Decimal(2) - index
+        tm.assert_index_equal(result, expected)
+
+        result = np.array([Decimal(2), Decimal(2)]) - index
+        tm.assert_index_equal(result, expected)
+
+        msg = "unsupported operand type"
+        with pytest.raises(TypeError, match=msg):
+            "foo" - index
+
+        with pytest.raises(TypeError, match=msg):
+            np.array([True, fixed_now_ts]) - index
+
+
+class MyIndex(pd.Index):
+    # Simple index subclass that tracks ops calls.
+
+    _calls: int
+
+    @classmethod
+    def _simple_new(cls, values, name=None, dtype=None):
+        result = object.__new__(cls)
+        result._data = values
+        result._name = name
+        result._calls = 0
+        result._reset_identity()
+
+        return result
+
+    def __add__(self, other):
+        self._calls += 1
+        return self._simple_new(self._data)
+
+    def __radd__(self, other):
+        return self.__add__(other)
+
+
+@pytest.mark.parametrize(
+    "other",
+    [
+        [datetime.timedelta(1), datetime.timedelta(2)],
+        [datetime.datetime(2000, 1, 1), datetime.datetime(2000, 1, 2)],
+        [pd.Period("2000"), pd.Period("2001")],
+        ["a", "b"],
+    ],
+    ids=["timedelta", "datetime", "period", "object"],
+)
+def test_index_ops_defer_to_unknown_subclasses(other):
+    # https://github.com/pandas-dev/pandas/issues/31109
+    values = np.array(
+        [datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)], dtype=object
+    )
+    a = MyIndex._simple_new(values)
+    other = pd.Index(other)
+    result = other + a
+    assert isinstance(result, MyIndex)
+    assert a._calls == 1
diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py
new file mode 100644
index 0000000000000000000000000000000000000000..24733f3b3e5634e96742e8035d00bcb3edf7ccd7
--- /dev/null
+++ b/pandas/tests/arithmetic/test_period.py
@@ -0,0 +1,1679 @@
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+# Specifically for Period dtype
+import operator
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import (
+    IncompatibleFrequency,
+    Period,
+    Timestamp,
+    to_offset,
+)
+
+import pandas as pd
+from pandas import (
+    PeriodIndex,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    period_range,
+)
+import pandas._testing as tm
+from pandas.core import ops
+from pandas.core.arrays import TimedeltaArray
+from pandas.tests.arithmetic.common import (
+    assert_invalid_addsub_type,
+    assert_invalid_comparison,
+    get_upcast_box,
+)
+
+_common_mismatch = [
+    pd.offsets.YearBegin(2),
+    pd.offsets.MonthBegin(1),
+    pd.offsets.Minute(),
+]
+
+
+@pytest.fixture(
+    params=[
+        Timedelta(minutes=30).to_pytimedelta(),
+        np.timedelta64(30, "s"),
+        Timedelta(seconds=30),
+        *_common_mismatch,
+    ]
+)
+def not_hourly(request):
+    """
+    Several timedelta-like and DateOffset instances that are _not_
+    compatible with Hourly frequencies.
+    """
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        np.timedelta64(365, "D"),
+        Timedelta(days=365).to_pytimedelta(),
+        Timedelta(days=365),
+        *_common_mismatch,
+    ]
+)
+def mismatched_freq(request):
+    """
+    Several timedelta-like and DateOffset instances that are _not_
+    compatible with Monthly or Annual frequencies.
+    """
+    return request.param
+
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestPeriodArrayLikeComparisons:
+    # Comparison tests for PeriodDtype vectors fully parametrized over
+    #  DataFrame/Series/PeriodIndex/PeriodArray.  Ideally all comparison
+    #  tests will eventually end up here.
+
+    @pytest.mark.parametrize("other", ["2017", Period("2017", freq="D")])
+    def test_eq_scalar(self, other, box_with_array):
+        idx = PeriodIndex(["2017", "2017", "2018"], freq="D")
+        idx = tm.box_expected(idx, box_with_array)
+        xbox = get_upcast_box(idx, other, True)
+
+        expected = np.array([True, True, False])
+        expected = tm.box_expected(expected, xbox)
+
+        result = idx == other
+
+        tm.assert_equal(result, expected)
+
+    def test_compare_zerodim(self, box_with_array):
+        # GH#26689 make sure we unbox zero-dimensional arrays
+
+        pi = period_range("2000", periods=4)
+        other = np.array(pi.to_numpy()[0])
+
+        pi = tm.box_expected(pi, box_with_array)
+        xbox = get_upcast_box(pi, other, True)
+
+        result = pi <= other
+        expected = np.array([True, False, False, False])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "scalar",
+        [
+            "foo",
+            Timestamp("2021-01-01"),
+            Timedelta(days=4),
+            9,
+            9.5,
+            2000,  # specifically don't consider 2000 to match Period("2000", "D")
+            False,
+            None,
+        ],
+    )
+    def test_compare_invalid_scalar(self, box_with_array, scalar):
+        # GH#28980
+        # comparison with scalar that cannot be interpreted as a Period
+        pi = period_range("2000", periods=4)
+        parr = tm.box_expected(pi, box_with_array)
+        assert_invalid_comparison(parr, scalar, box_with_array)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            pd.date_range("2000", periods=4).array,
+            pd.timedelta_range("1D", periods=4).array,
+            np.arange(4),
+            np.arange(4).astype(np.float64),
+            list(range(4)),
+            # match Period semantics by not treating integers as Periods
+            [2000, 2001, 2002, 2003],
+            np.arange(2000, 2004),
+            np.arange(2000, 2004).astype(object),
+            pd.Index([2000, 2001, 2002, 2003]),
+        ],
+    )
+    def test_compare_invalid_listlike(self, box_with_array, other):
+        pi = period_range("2000", periods=4)
+        parr = tm.box_expected(pi, box_with_array)
+        assert_invalid_comparison(parr, other, box_with_array)
+
+    @pytest.mark.parametrize("other_box", [list, np.array, lambda x: x.astype(object)])
+    def test_compare_object_dtype(self, box_with_array, other_box):
+        pi = period_range("2000", periods=5)
+        parr = tm.box_expected(pi, box_with_array)
+
+        other = other_box(pi)
+        xbox = get_upcast_box(parr, other, True)
+
+        expected = np.array([True, True, True, True, True])
+        expected = tm.box_expected(expected, xbox)
+
+        result = parr == other
+        tm.assert_equal(result, expected)
+        result = parr <= other
+        tm.assert_equal(result, expected)
+        result = parr >= other
+        tm.assert_equal(result, expected)
+
+        result = parr != other
+        tm.assert_equal(result, ~expected)
+        result = parr < other
+        tm.assert_equal(result, ~expected)
+        result = parr > other
+        tm.assert_equal(result, ~expected)
+
+        other = other_box(pi[::-1])
+
+        expected = np.array([False, False, True, False, False])
+        expected = tm.box_expected(expected, xbox)
+        result = parr == other
+        tm.assert_equal(result, expected)
+
+        expected = np.array([True, True, True, False, False])
+        expected = tm.box_expected(expected, xbox)
+        result = parr <= other
+        tm.assert_equal(result, expected)
+
+        expected = np.array([False, False, True, True, True])
+        expected = tm.box_expected(expected, xbox)
+        result = parr >= other
+        tm.assert_equal(result, expected)
+
+        expected = np.array([True, True, False, True, True])
+        expected = tm.box_expected(expected, xbox)
+        result = parr != other
+        tm.assert_equal(result, expected)
+
+        expected = np.array([True, True, False, False, False])
+        expected = tm.box_expected(expected, xbox)
+        result = parr < other
+        tm.assert_equal(result, expected)
+
+        expected = np.array([False, False, False, True, True])
+        expected = tm.box_expected(expected, xbox)
+        result = parr > other
+        tm.assert_equal(result, expected)
+
+
+class TestPeriodIndexComparisons:
+    # TODO: parameterize over boxes
+
+    def test_pi_cmp_period(self):
+        idx = period_range("2007-01", periods=20, freq="M")
+        per = idx[10]
+
+        result = idx < per
+        exp = idx.values < idx.values[10]
+        tm.assert_numpy_array_equal(result, exp)
+
+        # Tests Period.__richcmp__ against ndarray[object, ndim=2]
+        result = idx.values.reshape(10, 2) < per
+        tm.assert_numpy_array_equal(result, exp.reshape(10, 2))
+
+        # Tests Period.__richcmp__ against ndarray[object, ndim=0]
+        result = idx < np.array(per)
+        tm.assert_numpy_array_equal(result, exp)
+
+    # TODO: moved from test_datetime64; de-duplicate with version below
+    def test_parr_cmp_period_scalar2(self, box_with_array):
+        pi = period_range("2000-01-01", periods=10, freq="D")
+
+        val = pi[3]
+        expected = [x > val for x in pi]
+
+        ser = tm.box_expected(pi, box_with_array)
+        xbox = get_upcast_box(ser, val, True)
+
+        expected = tm.box_expected(expected, xbox)
+        result = ser > val
+        tm.assert_equal(result, expected)
+
+        val = pi[5]
+        result = ser > val
+        expected = [x > val for x in pi]
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
+    def test_parr_cmp_period_scalar(self, freq, box_with_array):
+        # GH#13200
+        base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq)
+        base = tm.box_expected(base, box_with_array)
+        per = Period("2011-02", freq=freq)
+        xbox = get_upcast_box(base, per, True)
+
+        exp = np.array([False, True, False, False])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base == per, exp)
+        tm.assert_equal(per == base, exp)
+
+        exp = np.array([True, False, True, True])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base != per, exp)
+        tm.assert_equal(per != base, exp)
+
+        exp = np.array([False, False, True, True])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base > per, exp)
+        tm.assert_equal(per < base, exp)
+
+        exp = np.array([True, False, False, False])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base < per, exp)
+        tm.assert_equal(per > base, exp)
+
+        exp = np.array([False, True, True, True])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base >= per, exp)
+        tm.assert_equal(per <= base, exp)
+
+        exp = np.array([True, True, False, False])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base <= per, exp)
+        tm.assert_equal(per >= base, exp)
+
+    @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
+    def test_parr_cmp_pi(self, freq, box_with_array):
+        # GH#13200
+        base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq)
+        base = tm.box_expected(base, box_with_array)
+
+        # TODO: could also box idx?
+        idx = PeriodIndex(["2011-02", "2011-01", "2011-03", "2011-05"], freq=freq)
+
+        xbox = get_upcast_box(base, idx, True)
+
+        exp = np.array([False, False, True, False])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base == idx, exp)
+
+        exp = np.array([True, True, False, True])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base != idx, exp)
+
+        exp = np.array([False, True, False, False])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base > idx, exp)
+
+        exp = np.array([True, False, False, True])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base < idx, exp)
+
+        exp = np.array([False, True, True, False])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base >= idx, exp)
+
+        exp = np.array([True, False, True, True])
+        exp = tm.box_expected(exp, xbox)
+        tm.assert_equal(base <= idx, exp)
+
+    @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
+    def test_parr_cmp_pi_mismatched_freq(self, freq, box_with_array):
+        # GH#13200
+        # different base freq
+        base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq)
+        base = tm.box_expected(base, box_with_array)
+
+        msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period"
+        with pytest.raises(TypeError, match=msg):
+            base <= Period("2011", freq="Y")
+
+        with pytest.raises(TypeError, match=msg):
+            Period("2011", freq="Y") >= base
+
+        # TODO: Could parametrize over boxes for idx?
+        idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="Y")
+        rev_msg = r"Invalid comparison between dtype=period\[Y-DEC\] and PeriodArray"
+        idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg
+        with pytest.raises(TypeError, match=idx_msg):
+            base <= idx
+
+        # Different frequency
+        msg = rf"Invalid comparison between dtype=period\[{freq}\] and Period"
+        with pytest.raises(TypeError, match=msg):
+            base <= Period("2011", freq="4M")
+
+        with pytest.raises(TypeError, match=msg):
+            Period("2011", freq="4M") >= base
+
+        idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M")
+        rev_msg = r"Invalid comparison between dtype=period\[4M\] and PeriodArray"
+        idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg
+        with pytest.raises(TypeError, match=idx_msg):
+            base <= idx
+
+    @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
+    def test_pi_cmp_nat(self, freq):
+        idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq)
+        per = idx1[1]
+
+        result = idx1 > per
+        exp = np.array([False, False, False, True])
+        tm.assert_numpy_array_equal(result, exp)
+        result = per < idx1
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = idx1 == pd.NaT
+        exp = np.array([False, False, False, False])
+        tm.assert_numpy_array_equal(result, exp)
+        result = pd.NaT == idx1
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = idx1 != pd.NaT
+        exp = np.array([True, True, True, True])
+        tm.assert_numpy_array_equal(result, exp)
+        result = pd.NaT != idx1
+        tm.assert_numpy_array_equal(result, exp)
+
+        idx2 = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq=freq)
+        result = idx1 < idx2
+        exp = np.array([True, False, False, False])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = idx1 == idx2
+        exp = np.array([False, False, False, False])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = idx1 != idx2
+        exp = np.array([True, True, True, True])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = idx1 == idx1
+        exp = np.array([True, True, False, True])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = idx1 != idx1
+        exp = np.array([False, False, True, False])
+        tm.assert_numpy_array_equal(result, exp)
+
+    @pytest.mark.parametrize("freq", ["M", "2M", "3M"])
+    def test_pi_cmp_nat_mismatched_freq_raises(self, freq):
+        idx1 = PeriodIndex(["2011-01", "2011-02", "NaT", "2011-05"], freq=freq)
+
+        diff = PeriodIndex(["2011-02", "2011-01", "2011-04", "NaT"], freq="4M")
+        msg = rf"Invalid comparison between dtype=period\[{freq}\] and PeriodArray"
+        with pytest.raises(TypeError, match=msg):
+            idx1 > diff
+
+        result = idx1 == diff
+        expected = np.array([False, False, False, False], dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+    # TODO: De-duplicate with test_pi_cmp_nat
+    @pytest.mark.parametrize("dtype", [object, None])
+    def test_comp_nat(self, dtype):
+        left = PeriodIndex([Period("2011-01-01"), pd.NaT, Period("2011-01-03")])
+        right = PeriodIndex([pd.NaT, pd.NaT, Period("2011-01-03")])
+
+        if dtype is not None:
+            left = left.astype(dtype)
+            right = right.astype(dtype)
+
+        result = left == right
+        expected = np.array([False, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = left != right
+        expected = np.array([True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        expected = np.array([False, False, False])
+        tm.assert_numpy_array_equal(left == pd.NaT, expected)
+        tm.assert_numpy_array_equal(pd.NaT == right, expected)
+
+        expected = np.array([True, True, True])
+        tm.assert_numpy_array_equal(left != pd.NaT, expected)
+        tm.assert_numpy_array_equal(pd.NaT != left, expected)
+
+        expected = np.array([False, False, False])
+        tm.assert_numpy_array_equal(left < pd.NaT, expected)
+        tm.assert_numpy_array_equal(pd.NaT > left, expected)
+
+
+class TestPeriodSeriesComparisons:
+    def test_cmp_series_period_series_mixed_freq(self):
+        # GH#13200
+        base = Series(
+            [
+                Period("2011", freq="Y"),
+                Period("2011-02", freq="M"),
+                Period("2013", freq="Y"),
+                Period("2011-04", freq="M"),
+            ]
+        )
+
+        ser = Series(
+            [
+                Period("2012", freq="Y"),
+                Period("2011-01", freq="M"),
+                Period("2013", freq="Y"),
+                Period("2011-05", freq="M"),
+            ]
+        )
+
+        exp = Series([False, False, True, False])
+        tm.assert_series_equal(base == ser, exp)
+
+        exp = Series([True, True, False, True])
+        tm.assert_series_equal(base != ser, exp)
+
+        exp = Series([False, True, False, False])
+        tm.assert_series_equal(base > ser, exp)
+
+        exp = Series([True, False, False, True])
+        tm.assert_series_equal(base < ser, exp)
+
+        exp = Series([False, True, True, False])
+        tm.assert_series_equal(base >= ser, exp)
+
+        exp = Series([True, False, True, True])
+        tm.assert_series_equal(base <= ser, exp)
+
+
+class TestPeriodIndexSeriesComparisonConsistency:
+    """Test PeriodIndex and Period Series Ops consistency"""
+
+    # TODO: needs parametrization+de-duplication
+
+    def _check(self, values, func, expected):
+        # Test PeriodIndex and Period Series Ops consistency
+
+        idx = PeriodIndex(values)
+        result = func(idx)
+
+        # check that we don't pass an unwanted type to tm.assert_equal
+        assert isinstance(expected, (pd.Index, np.ndarray))
+        tm.assert_equal(result, expected)
+
+        s = Series(values)
+        result = func(s)
+
+        exp = Series(expected, name=values.name)
+        tm.assert_series_equal(result, exp)
+
+    def test_pi_comp_period(self):
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx"
+        )
+        per = idx[2]
+
+        f = lambda x: x == per
+        exp = np.array([False, False, True, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+        f = lambda x: per == x
+        self._check(idx, f, exp)
+
+        f = lambda x: x != per
+        exp = np.array([True, True, False, True], dtype=np.bool_)
+        self._check(idx, f, exp)
+        f = lambda x: per != x
+        self._check(idx, f, exp)
+
+        f = lambda x: per >= x
+        exp = np.array([True, True, True, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+        f = lambda x: x > per
+        exp = np.array([False, False, False, True], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+        f = lambda x: per >= x
+        exp = np.array([True, True, True, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+    def test_pi_comp_period_nat(self):
+        idx = PeriodIndex(
+            ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx"
+        )
+        per = idx[2]
+
+        f = lambda x: x == per
+        exp = np.array([False, False, True, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+        f = lambda x: per == x
+        self._check(idx, f, exp)
+
+        f = lambda x: x == pd.NaT
+        exp = np.array([False, False, False, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+        f = lambda x: pd.NaT == x
+        self._check(idx, f, exp)
+
+        f = lambda x: x != per
+        exp = np.array([True, True, False, True], dtype=np.bool_)
+        self._check(idx, f, exp)
+        f = lambda x: per != x
+        self._check(idx, f, exp)
+
+        f = lambda x: x != pd.NaT
+        exp = np.array([True, True, True, True], dtype=np.bool_)
+        self._check(idx, f, exp)
+        f = lambda x: pd.NaT != x
+        self._check(idx, f, exp)
+
+        f = lambda x: per >= x
+        exp = np.array([True, False, True, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+        f = lambda x: x < per
+        exp = np.array([True, False, False, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+        f = lambda x: x > pd.NaT
+        exp = np.array([False, False, False, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+        f = lambda x: pd.NaT >= x
+        exp = np.array([False, False, False, False], dtype=np.bool_)
+        self._check(idx, f, exp)
+
+
+# ------------------------------------------------------------------
+# Arithmetic
+
+
+class TestPeriodFrameArithmetic:
+    def test_ops_frame_period(self):
+        # GH#13043
+        df = pd.DataFrame(
+            {
+                "A": [Period("2015-01", freq="M"), Period("2015-02", freq="M")],
+                "B": [Period("2014-01", freq="M"), Period("2014-02", freq="M")],
+            }
+        )
+        assert df["A"].dtype == "Period[M]"
+        assert df["B"].dtype == "Period[M]"
+
+        p = Period("2015-03", freq="M")
+        off = p.freq
+        # dtype will be object because of original dtype
+        exp = pd.DataFrame(
+            {
+                "A": np.array([2 * off, 1 * off], dtype=object),
+                "B": np.array([14 * off, 13 * off], dtype=object),
+            }
+        )
+        tm.assert_frame_equal(p - df, exp)
+        tm.assert_frame_equal(df - p, -1 * exp)
+
+        df2 = pd.DataFrame(
+            {
+                "A": [Period("2015-05", freq="M"), Period("2015-06", freq="M")],
+                "B": [Period("2015-05", freq="M"), Period("2015-06", freq="M")],
+            }
+        )
+        assert df2["A"].dtype == "Period[M]"
+        assert df2["B"].dtype == "Period[M]"
+
+        exp = pd.DataFrame(
+            {
+                "A": np.array([4 * off, 4 * off], dtype=object),
+                "B": np.array([16 * off, 16 * off], dtype=object),
+            }
+        )
+        tm.assert_frame_equal(df2 - df, exp)
+        tm.assert_frame_equal(df - df2, -1 * exp)
+
+
+class TestPeriodIndexArithmetic:
+    # ---------------------------------------------------------------
+    # __add__/__sub__ with PeriodIndex
+    # PeriodIndex + other is defined for integers and timedelta-like others
+    # PeriodIndex - other is defined for integers, timedelta-like others,
+    #   and PeriodIndex (with matching freq)
+
+    def test_parr_add_iadd_parr_raises(self, box_with_array):
+        rng = period_range("1/1/2000", freq="D", periods=5)
+        other = period_range("1/6/2000", freq="D", periods=5)
+        # TODO: parametrize over boxes for other?
+
+        rng = tm.box_expected(rng, box_with_array)
+        # An earlier implementation of PeriodIndex addition performed
+        # a set operation (union).  This has since been changed to
+        # raise a TypeError. See GH#14164 and GH#13077 for historical
+        # reference.
+        msg = r"unsupported operand type\(s\) for \+: .* and .*"
+        with pytest.raises(TypeError, match=msg):
+            rng + other
+
+        with pytest.raises(TypeError, match=msg):
+            rng += other
+
+    def test_pi_sub_isub_pi(self):
+        # GH#20049
+        # For historical reference see GH#14164, GH#13077.
+        # PeriodIndex subtraction originally performed set difference,
+        # then changed to raise TypeError before being implemented in GH#20049
+        rng = period_range("1/1/2000", freq="D", periods=5)
+        other = period_range("1/6/2000", freq="D", periods=5)
+
+        off = rng.freq
+        expected = pd.Index([-5 * off] * 5)
+        result = rng - other
+        tm.assert_index_equal(result, expected)
+
+        rng -= other
+        tm.assert_index_equal(rng, expected)
+
+    def test_pi_sub_pi_with_nat(self):
+        rng = period_range("1/1/2000", freq="D", periods=5)
+        other = rng[1:].insert(0, pd.NaT)
+        assert other[1:].equals(rng[1:])
+
+        result = rng - other
+        off = rng.freq
+        expected = pd.Index([pd.NaT, 0 * off, 0 * off, 0 * off, 0 * off])
+        tm.assert_index_equal(result, expected)
+
+    def test_parr_sub_pi_mismatched_freq(self, box_with_array, box_with_array2):
+        rng = period_range("1/1/2000", freq="D", periods=5)
+        other = period_range("1/6/2000", freq="h", periods=5)
+
+        rng = tm.box_expected(rng, box_with_array)
+        other = tm.box_expected(other, box_with_array2)
+        msg = r"Input has different freq=[hD] from PeriodArray\(freq=[Dh]\)"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng - other
+
+    @pytest.mark.parametrize("n", [1, 2, 3, 4])
+    def test_sub_n_gt_1_ticks(self, tick_classes, n):
+        # GH 23878
+        p1_d = "19910905"
+        p2_d = "19920406"
+        p1 = PeriodIndex([p1_d], freq=tick_classes(n))
+        p2 = PeriodIndex([p2_d], freq=tick_classes(n))
+
+        expected = PeriodIndex([p2_d], freq=p2.freq.base) - PeriodIndex(
+            [p1_d], freq=p1.freq.base
+        )
+
+        tm.assert_index_equal((p2 - p1), expected)
+
+    @pytest.mark.parametrize("n", [1, 2, 3, 4])
+    @pytest.mark.parametrize(
+        "offset, kwd_name",
+        [
+            (pd.offsets.YearEnd, "month"),
+            (pd.offsets.QuarterEnd, "startingMonth"),
+            (pd.offsets.MonthEnd, None),
+            (pd.offsets.Week, "weekday"),
+        ],
+    )
+    def test_sub_n_gt_1_offsets(self, offset, kwd_name, n):
+        # GH 23878
+        kwds = {kwd_name: 3} if kwd_name is not None else {}
+        p1_d = "19910905"
+        p2_d = "19920406"
+        freq = offset(n, normalize=False, **kwds)
+        p1 = PeriodIndex([p1_d], freq=freq)
+        p2 = PeriodIndex([p2_d], freq=freq)
+
+        result = p2 - p1
+        expected = PeriodIndex([p2_d], freq=freq.base) - PeriodIndex(
+            [p1_d], freq=freq.base
+        )
+
+        tm.assert_index_equal(result, expected)
+
+    # -------------------------------------------------------------
+    # Invalid Operations
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            # datetime scalars
+            Timestamp("2016-01-01"),
+            Timestamp("2016-01-01").to_pydatetime(),
+            Timestamp("2016-01-01").to_datetime64(),
+            # datetime-like arrays
+            pd.date_range("2016-01-01", periods=3, freq="h"),
+            pd.date_range("2016-01-01", periods=3, tz="Europe/Brussels"),
+            pd.date_range("2016-01-01", periods=3, freq="s")._data,
+            pd.date_range("2016-01-01", periods=3, tz="Asia/Tokyo")._data,
+            # Miscellaneous invalid types
+            3.14,
+            np.array([2.0, 3.0, 4.0]),
+        ],
+    )
+    def test_parr_add_sub_invalid(self, other, box_with_array):
+        # GH#23215
+        rng = period_range("1/1/2000", freq="D", periods=3)
+        rng = tm.box_expected(rng, box_with_array)
+
+        msg = "|".join(
+            [
+                r"(:?cannot add PeriodArray and .*)",
+                r"(:?cannot subtract .* from (:?a\s)?.*)",
+                r"(:?unsupported operand type\(s\) for \+: .* and .*)",
+                r"unsupported operand type\(s\) for [+-]: .* and .*",
+            ]
+        )
+        assert_invalid_addsub_type(rng, other, msg)
+        with pytest.raises(TypeError, match=msg):
+            rng + other
+        with pytest.raises(TypeError, match=msg):
+            other + rng
+        with pytest.raises(TypeError, match=msg):
+            rng - other
+        with pytest.raises(TypeError, match=msg):
+            other - rng
+
+    # -----------------------------------------------------------------
+    # __add__/__sub__ with ndarray[datetime64] and ndarray[timedelta64]
+
+    def test_pi_add_sub_td64_array_non_tick_raises(self):
+        rng = period_range("1/1/2000", freq="Q", periods=3)
+        tdi = TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"])
+        tdarr = tdi.values
+
+        msg = r"Cannot add or subtract timedelta64\[ns\] dtype from period\[Q-DEC\]"
+        with pytest.raises(TypeError, match=msg):
+            rng + tdarr
+        with pytest.raises(TypeError, match=msg):
+            tdarr + rng
+
+        with pytest.raises(TypeError, match=msg):
+            rng - tdarr
+        msg = r"cannot subtract PeriodArray from TimedeltaArray"
+        with pytest.raises(TypeError, match=msg):
+            tdarr - rng
+
+    def test_pi_add_sub_td64_array_tick(self):
+        # PeriodIndex + Timedelta-like is allowed only with
+        #   tick-like frequencies
+        rng = period_range("1/1/2000", freq="90D", periods=3)
+        tdi = TimedeltaIndex(["-1 Day", "-1 Day", "-1 Day"])
+        tdarr = tdi.values
+
+        expected = period_range("12/31/1999", freq="90D", periods=3)
+        result = rng + tdi
+        tm.assert_index_equal(result, expected)
+        result = rng + tdarr
+        tm.assert_index_equal(result, expected)
+        result = tdi + rng
+        tm.assert_index_equal(result, expected)
+        result = tdarr + rng
+        tm.assert_index_equal(result, expected)
+
+        expected = period_range("1/2/2000", freq="90D", periods=3)
+
+        result = rng - tdi
+        tm.assert_index_equal(result, expected)
+        result = rng - tdarr
+        tm.assert_index_equal(result, expected)
+
+        msg = r"cannot subtract .* from .*"
+        with pytest.raises(TypeError, match=msg):
+            tdarr - rng
+
+        with pytest.raises(TypeError, match=msg):
+            tdi - rng
+
+    @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"])
+    @pytest.mark.parametrize("tdi_freq", [None, "h"])
+    def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq):
+        box = box_with_array
+        xbox = box if box not in [pd.array, tm.to_array] else pd.Index
+
+        tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq)
+        dti = Timestamp("2018-03-07 17:16:40") + tdi
+        pi = dti.to_period(pi_freq)
+
+        # TODO: parametrize over box for pi?
+        td64obj = tm.box_expected(tdi, box)
+
+        if pi_freq == "h":
+            result = pi - td64obj
+            expected = (pi.to_timestamp("s") - tdi).to_period(pi_freq)
+            expected = tm.box_expected(expected, xbox)
+            tm.assert_equal(result, expected)
+
+            # Subtract from scalar
+            result = pi[0] - td64obj
+            expected = (pi[0].to_timestamp("s") - tdi).to_period(pi_freq)
+            expected = tm.box_expected(expected, box)
+            tm.assert_equal(result, expected)
+
+        elif pi_freq == "D":
+            # Tick, but non-compatible
+            msg = (
+                "Cannot add/subtract timedelta-like from PeriodArray that is "
+                "not an integer multiple of the PeriodArray's freq."
+            )
+            with pytest.raises(IncompatibleFrequency, match=msg):
+                pi - td64obj
+
+            with pytest.raises(IncompatibleFrequency, match=msg):
+                pi[0] - td64obj
+
+        else:
+            # With non-Tick freq, we could not add timedelta64 array regardless
+            #  of what its resolution is
+            msg = "Cannot add or subtract timedelta64"
+            with pytest.raises(TypeError, match=msg):
+                pi - td64obj
+            with pytest.raises(TypeError, match=msg):
+                pi[0] - td64obj
+
+    # -----------------------------------------------------------------
+    # operations with array/Index of DateOffset objects
+
+    @pytest.mark.parametrize("box", [np.array, pd.Index])
+    def test_pi_add_offset_array(self, performance_warning, box):
+        # GH#18849
+        pi = PeriodIndex([Period("2015Q1"), Period("2016Q2")])
+        offs = box(
+            [
+                pd.offsets.QuarterEnd(n=1, startingMonth=12),
+                pd.offsets.QuarterEnd(n=-2, startingMonth=12),
+            ]
+        )
+        expected = PeriodIndex([Period("2015Q2"), Period("2015Q4")]).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = pi + offs
+        tm.assert_index_equal(res, expected)
+
+        with tm.assert_produces_warning(performance_warning):
+            res2 = offs + pi
+        tm.assert_index_equal(res2, expected)
+
+        unanchored = np.array([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)])
+        # addition/subtraction ops with incompatible offsets should issue
+        # a PerformanceWarning and _then_ raise a TypeError.
+        msg = r"Input cannot be converted to Period\(freq=Q-DEC\)"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                pi + unanchored
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                unanchored + pi
+
+    @pytest.mark.parametrize("box", [np.array, pd.Index])
+    def test_pi_sub_offset_array(self, performance_warning, box):
+        # GH#18824
+        pi = PeriodIndex([Period("2015Q1"), Period("2016Q2")])
+        other = box(
+            [
+                pd.offsets.QuarterEnd(n=1, startingMonth=12),
+                pd.offsets.QuarterEnd(n=-2, startingMonth=12),
+            ]
+        )
+
+        expected = PeriodIndex([pi[n] - other[n] for n in range(len(pi))])
+        expected = expected.astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = pi - other
+        tm.assert_index_equal(res, expected)
+
+        anchored = box([pd.offsets.MonthEnd(), pd.offsets.Day(n=2)])
+
+        # addition/subtraction ops with anchored offsets should issue
+        # a PerformanceWarning and _then_ raise a TypeError.
+        msg = r"Input has different freq=-1M from Period\(freq=Q-DEC\)"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                pi - anchored
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                anchored - pi
+
+    def test_pi_add_iadd_int(self, one):
+        # Variants of `one` for #19012
+        rng = period_range("2000-01-01 09:00", freq="h", periods=10)
+        result = rng + one
+        expected = period_range("2000-01-01 10:00", freq="h", periods=10)
+        tm.assert_index_equal(result, expected)
+        rng += one
+        tm.assert_index_equal(rng, expected)
+
+    def test_pi_sub_isub_int(self, one):
+        """
+        PeriodIndex.__sub__ and __isub__ with several representations of
+        the integer 1, e.g. int, np.int64, np.uint8, ...
+        """
+        rng = period_range("2000-01-01 09:00", freq="h", periods=10)
+        result = rng - one
+        expected = period_range("2000-01-01 08:00", freq="h", periods=10)
+        tm.assert_index_equal(result, expected)
+        rng -= one
+        tm.assert_index_equal(rng, expected)
+
+    @pytest.mark.parametrize("five", [5, np.array(5, dtype=np.int64)])
+    def test_pi_sub_intlike(self, five):
+        rng = period_range("2007-01", periods=50)
+
+        result = rng - five
+        exp = rng + (-five)
+        tm.assert_index_equal(result, exp)
+
+    def test_pi_add_sub_int_array_freqn_gt1(self):
+        # GH#47209 test adding array of ints when freq.n > 1 matches
+        #  scalar behavior
+        pi = period_range("2016-01-01", periods=10, freq="2D")
+        arr = np.arange(10)
+        result = pi + arr
+        expected = pd.Index([x + y for x, y in zip(pi, arr, strict=True)])
+        tm.assert_index_equal(result, expected)
+
+        result = pi - arr
+        expected = pd.Index([x - y for x, y in zip(pi, arr, strict=True)])
+        tm.assert_index_equal(result, expected)
+
+    def test_pi_sub_isub_offset(self):
+        # offset
+        # DateOffset
+        rng = period_range("2014", "2024", freq="Y")
+        result = rng - pd.offsets.YearEnd(5)
+        expected = period_range("2009", "2019", freq="Y")
+        tm.assert_index_equal(result, expected)
+        rng -= pd.offsets.YearEnd(5)
+        tm.assert_index_equal(rng, expected)
+
+        rng = period_range("2014-01", "2016-12", freq="M")
+        result = rng - pd.offsets.MonthEnd(5)
+        expected = period_range("2013-08", "2016-07", freq="M")
+        tm.assert_index_equal(result, expected)
+
+        rng -= pd.offsets.MonthEnd(5)
+        tm.assert_index_equal(rng, expected)
+
+    @pytest.mark.parametrize("transpose", [True, False])
+    def test_pi_add_offset_n_gt1(self, box_with_array, transpose):
+        # GH#23215
+        # add offset to PeriodIndex with freq.n > 1
+
+        per = Period("2016-01", freq="2M")
+        pi = PeriodIndex([per])
+
+        expected = PeriodIndex(["2016-03"], freq="2M")
+
+        pi = tm.box_expected(pi, box_with_array, transpose=transpose)
+        expected = tm.box_expected(expected, box_with_array, transpose=transpose)
+
+        result = pi + per.freq
+        tm.assert_equal(result, expected)
+
+        result = per.freq + pi
+        tm.assert_equal(result, expected)
+
+    def test_pi_add_offset_n_gt1_not_divisible(self, box_with_array):
+        # GH#23215
+        # PeriodIndex with freq.n > 1 add offset with offset.n % freq.n != 0
+        pi = PeriodIndex(["2016-01"], freq="2M")
+        expected = PeriodIndex(["2016-04"], freq="2M")
+
+        pi = tm.box_expected(pi, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = pi + to_offset("3ME")
+        tm.assert_equal(result, expected)
+
+        result = to_offset("3ME") + pi
+        tm.assert_equal(result, expected)
+
+    # ---------------------------------------------------------------
+    # __add__/__sub__ with integer arrays
+
+    @pytest.mark.parametrize("int_holder", [np.array, pd.Index])
+    @pytest.mark.parametrize("op", [operator.add, ops.radd])
+    def test_pi_add_intarray(self, int_holder, op):
+        # GH#19959
+        pi = PeriodIndex([Period("2015Q1"), Period("NaT")])
+        other = int_holder([4, -1])
+
+        result = op(pi, other)
+        expected = PeriodIndex([Period("2016Q1"), Period("NaT")])
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("int_holder", [np.array, pd.Index])
+    def test_pi_sub_intarray(self, int_holder):
+        # GH#19959
+        pi = PeriodIndex([Period("2015Q1"), Period("NaT")])
+        other = int_holder([4, -1])
+
+        result = pi - other
+        expected = PeriodIndex([Period("2014Q1"), Period("NaT")])
+        tm.assert_index_equal(result, expected)
+
+        msg = r"bad operand type for unary -: 'PeriodArray'"
+        with pytest.raises(TypeError, match=msg):
+            other - pi
+
+    # ---------------------------------------------------------------
+    # Timedelta-like (timedelta, timedelta64, Timedelta, Tick)
+    # TODO: Some of these are misnomers because of non-Tick DateOffsets
+
+    def test_parr_add_timedeltalike_minute_gt1(self, three_days, box_with_array):
+        # GH#23031 adding a time-delta-like offset to a PeriodArray that has
+        # minute frequency with n != 1.  A more general case is tested below
+        # in test_pi_add_timedeltalike_tick_gt1, but here we write out the
+        # expected result more explicitly.
+        other = three_days
+        rng = period_range("2014-05-01", periods=3, freq="2D")
+        rng = tm.box_expected(rng, box_with_array)
+
+        expected = PeriodIndex(["2014-05-04", "2014-05-06", "2014-05-08"], freq="2D")
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = rng + other
+        tm.assert_equal(result, expected)
+
+        result = other + rng
+        tm.assert_equal(result, expected)
+
+        # subtraction
+        expected = PeriodIndex(["2014-04-28", "2014-04-30", "2014-05-02"], freq="2D")
+        expected = tm.box_expected(expected, box_with_array)
+        result = rng - other
+        tm.assert_equal(result, expected)
+
+        msg = "|".join(
+            [
+                r"bad operand type for unary -: 'PeriodArray'",
+                r"cannot subtract PeriodArray from timedelta64\[[hD]\]",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            other - rng
+
+    @pytest.mark.parametrize("freqstr", ["5ns", "5us", "5ms", "5s", "5min", "5h", "5D"])
+    def test_parr_add_timedeltalike_tick_gt1(self, three_days, freqstr, box_with_array):
+        # GH#23031 adding a time-delta-like offset to a PeriodArray that has
+        # tick-like frequency with n != 1
+        other = three_days
+        rng = period_range("2014-05-01", periods=6, freq=freqstr)
+        first = rng[0]
+        rng = tm.box_expected(rng, box_with_array)
+
+        expected = period_range(first + other, periods=6, freq=freqstr)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = rng + other
+        tm.assert_equal(result, expected)
+
+        result = other + rng
+        tm.assert_equal(result, expected)
+
+        # subtraction
+        expected = period_range(first - other, periods=6, freq=freqstr)
+        expected = tm.box_expected(expected, box_with_array)
+        result = rng - other
+        tm.assert_equal(result, expected)
+        msg = "|".join(
+            [
+                r"bad operand type for unary -: 'PeriodArray'",
+                r"cannot subtract PeriodArray from timedelta64\[[hD]\]",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            other - rng
+
+    def test_pi_add_iadd_timedeltalike_daily(self, three_days):
+        # Tick
+        other = three_days
+        rng = period_range("2014-05-01", "2014-05-15", freq="D")
+        expected = period_range("2014-05-04", "2014-05-18", freq="D")
+
+        result = rng + other
+        tm.assert_index_equal(result, expected)
+
+        rng += other
+        tm.assert_index_equal(rng, expected)
+
+    def test_pi_sub_isub_timedeltalike_daily(self, three_days):
+        # Tick-like 3 Days
+        other = three_days
+        rng = period_range("2014-05-01", "2014-05-15", freq="D")
+        expected = period_range("2014-04-28", "2014-05-12", freq="D")
+
+        result = rng - other
+        tm.assert_index_equal(result, expected)
+
+        rng -= other
+        tm.assert_index_equal(rng, expected)
+
+    def test_parr_add_sub_timedeltalike_freq_mismatch_daily(
+        self, not_daily, box_with_array
+    ):
+        other = not_daily
+        rng = period_range("2014-05-01", "2014-05-15", freq="D")
+        rng = tm.box_expected(rng, box_with_array)
+
+        msg = "|".join(
+            [
+                # non-timedelta-like DateOffset
+                "Input has different freq(=.+)? from Period.*?\\(freq=D\\)",
+                # timedelta/td64/Timedelta but not a multiple of 24H
+                "Cannot add/subtract timedelta-like from PeriodArray that is "
+                "not an integer multiple of the PeriodArray's freq.",
+            ]
+        )
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng + other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng += other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng - other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng -= other
+
+    def test_pi_add_iadd_timedeltalike_hourly(self, two_hours):
+        other = two_hours
+        rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h")
+        expected = period_range("2014-01-01 12:00", "2014-01-05 12:00", freq="h")
+
+        result = rng + other
+        tm.assert_index_equal(result, expected)
+
+        rng += other
+        tm.assert_index_equal(rng, expected)
+
+    def test_parr_add_timedeltalike_mismatched_freq_hourly(
+        self, not_hourly, box_with_array
+    ):
+        other = not_hourly
+        rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h")
+        rng = tm.box_expected(rng, box_with_array)
+        msg = "|".join(
+            [
+                # non-timedelta-like DateOffset
+                "Input has different freq(=.+)? from Period.*?\\(freq=h\\)",
+                # timedelta/td64/Timedelta but not a multiple of 24H
+                "Cannot add/subtract timedelta-like from PeriodArray that is "
+                "not an integer multiple of the PeriodArray's freq.",
+            ]
+        )
+
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng + other
+
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng += other
+
+    def test_pi_sub_isub_timedeltalike_hourly(self, two_hours):
+        other = two_hours
+        rng = period_range("2014-01-01 10:00", "2014-01-05 10:00", freq="h")
+        expected = period_range("2014-01-01 08:00", "2014-01-05 08:00", freq="h")
+
+        result = rng - other
+        tm.assert_index_equal(result, expected)
+
+        rng -= other
+        tm.assert_index_equal(rng, expected)
+
+    def test_add_iadd_timedeltalike_annual(self):
+        # offset
+        # DateOffset
+        rng = period_range("2014", "2024", freq="Y")
+        result = rng + pd.offsets.YearEnd(5)
+        expected = period_range("2019", "2029", freq="Y")
+        tm.assert_index_equal(result, expected)
+        rng += pd.offsets.YearEnd(5)
+        tm.assert_index_equal(rng, expected)
+
+    def test_pi_add_sub_timedeltalike_freq_mismatch_annual(self, mismatched_freq):
+        other = mismatched_freq
+        rng = period_range("2014", "2024", freq="Y")
+        msg = "Input has different freq(=.+)? from Period.*?\\(freq=Y-DEC\\)"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng + other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng += other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng - other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng -= other
+
+    def test_pi_add_iadd_timedeltalike_M(self):
+        rng = period_range("2014-01", "2016-12", freq="M")
+        expected = period_range("2014-06", "2017-05", freq="M")
+
+        result = rng + pd.offsets.MonthEnd(5)
+        tm.assert_index_equal(result, expected)
+
+        rng += pd.offsets.MonthEnd(5)
+        tm.assert_index_equal(rng, expected)
+
+    def test_pi_add_sub_timedeltalike_freq_mismatch_monthly(self, mismatched_freq):
+        other = mismatched_freq
+        rng = period_range("2014-01", "2016-12", freq="M")
+        msg = "Input has different freq(=.+)? from Period.*?\\(freq=M\\)"
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng + other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng += other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng - other
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            rng -= other
+
+    @pytest.mark.parametrize("transpose", [True, False])
+    def test_parr_add_sub_td64_nat(self, box_with_array, transpose):
+        # GH#23320 special handling for timedelta64("NaT")
+        pi = period_range("1994-04-01", periods=9, freq="19D")
+        other = np.timedelta64("NaT")
+        expected = PeriodIndex(["NaT"] * 9, freq="19D")
+
+        obj = tm.box_expected(pi, box_with_array, transpose=transpose)
+        expected = tm.box_expected(expected, box_with_array, transpose=transpose)
+
+        result = obj + other
+        tm.assert_equal(result, expected)
+        result = other + obj
+        tm.assert_equal(result, expected)
+        result = obj - other
+        tm.assert_equal(result, expected)
+        msg = r"cannot subtract .* from .*"
+        with pytest.raises(TypeError, match=msg):
+            other - obj
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            np.array(["NaT"] * 9, dtype="m8[ns]"),
+            TimedeltaArray._from_sequence(["NaT"] * 9, dtype="m8[ns]"),
+        ],
+    )
+    def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other):
+        pi = period_range("1994-04-01", periods=9, freq="19D")
+        expected = PeriodIndex(["NaT"] * 9, freq="19D")
+
+        obj = tm.box_expected(pi, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = obj + other
+        tm.assert_equal(result, expected)
+        result = other + obj
+        tm.assert_equal(result, expected)
+        result = obj - other
+        tm.assert_equal(result, expected)
+        msg = r"cannot subtract .* from .*"
+        with pytest.raises(TypeError, match=msg):
+            other - obj
+
+        # some but not *all* NaT
+        other = other.copy()
+        other[0] = np.timedelta64(0, "ns")
+        expected = PeriodIndex([pi[0]] + ["NaT"] * 8, freq="19D")
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = obj + other
+        tm.assert_equal(result, expected)
+        result = other + obj
+        tm.assert_equal(result, expected)
+        result = obj - other
+        tm.assert_equal(result, expected)
+        with pytest.raises(TypeError, match=msg):
+            other - obj
+
+    # ---------------------------------------------------------------
+    # Unsorted
+
+    def test_parr_add_sub_index(self):
+        # Check that PeriodArray defers to Index on arithmetic ops
+        pi = period_range("2000-12-31", periods=3)
+        parr = pi.array
+
+        result = parr - pi
+        expected = pi - pi
+        tm.assert_index_equal(result, expected)
+
+    def test_parr_add_sub_object_array(self, performance_warning):
+        pi = period_range("2000-12-31", periods=3, freq="D")
+        parr = pi.array
+
+        other = np.array([Timedelta(days=1), pd.offsets.Day(2), 3])
+
+        with tm.assert_produces_warning(performance_warning):
+            result = parr + other
+
+        expected = PeriodIndex(
+            ["2001-01-01", "2001-01-03", "2001-01-05"], freq="D"
+        )._data.astype(object)
+        tm.assert_equal(result, expected)
+
+        with tm.assert_produces_warning(performance_warning):
+            result = parr - other
+
+        expected = PeriodIndex(["2000-12-30"] * 3, freq="D")._data.astype(object)
+        tm.assert_equal(result, expected)
+
+    def test_period_add_timestamp_raises(self, box_with_array):
+        # GH#17983
+        ts = Timestamp("2017")
+        per = Period("2017", freq="M")
+
+        arr = pd.Index([per], dtype="Period[M]")
+        arr = tm.box_expected(arr, box_with_array)
+
+        msg = "cannot add PeriodArray and Timestamp"
+        with pytest.raises(TypeError, match=msg):
+            arr + ts
+        with pytest.raises(TypeError, match=msg):
+            ts + arr
+
+        msg = "cannot add PeriodArray and DatetimeArray"
+        with pytest.raises(TypeError, match=msg):
+            arr + Series([ts])
+        with pytest.raises(TypeError, match=msg):
+            Series([ts]) + arr
+        with pytest.raises(TypeError, match=msg):
+            arr + pd.Index([ts])
+        with pytest.raises(TypeError, match=msg):
+            pd.Index([ts]) + arr
+
+        if box_with_array is pd.DataFrame:
+            msg = "cannot add PeriodArray and DatetimeArray"
+        else:
+            msg = r"unsupported operand type\(s\) for \+: 'Period' and 'DatetimeArray"
+        with pytest.raises(TypeError, match=msg):
+            arr + pd.DataFrame([ts])
+        if box_with_array is pd.DataFrame:
+            msg = "cannot add PeriodArray and DatetimeArray"
+        else:
+            msg = r"unsupported operand type\(s\) for \+: 'DatetimeArray' and 'Period'"
+        with pytest.raises(TypeError, match=msg):
+            pd.DataFrame([ts]) + arr
+
+
+class TestPeriodSeriesArithmetic:
+    def test_parr_add_timedeltalike_scalar(self, three_days, box_with_array):
+        # GH#13043
+        ser = Series(
+            [Period("2015-01-01", freq="D"), Period("2015-01-02", freq="D")],
+            name="xxx",
+        )
+        assert ser.dtype == "Period[D]"
+
+        expected = Series(
+            [Period("2015-01-04", freq="D"), Period("2015-01-05", freq="D")],
+            name="xxx",
+        )
+
+        obj = tm.box_expected(ser, box_with_array)
+        if box_with_array is pd.DataFrame:
+            assert (obj.dtypes == "Period[D]").all()
+
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = obj + three_days
+        tm.assert_equal(result, expected)
+
+        result = three_days + obj
+        tm.assert_equal(result, expected)
+
+    def test_ops_series_period(self):
+        # GH#13043
+        ser = Series(
+            [Period("2015-01-01", freq="D"), Period("2015-01-02", freq="D")],
+            name="xxx",
+        )
+        assert ser.dtype == "Period[D]"
+
+        per = Period("2015-01-10", freq="D")
+        off = per.freq
+        # dtype will be object because of original dtype
+        expected = Series([9 * off, 8 * off], name="xxx", dtype=object)
+        tm.assert_series_equal(per - ser, expected)
+        tm.assert_series_equal(ser - per, -1 * expected)
+
+        s2 = Series(
+            [Period("2015-01-05", freq="D"), Period("2015-01-04", freq="D")],
+            name="xxx",
+        )
+        assert s2.dtype == "Period[D]"
+
+        expected = Series([4 * off, 2 * off], name="xxx", dtype=object)
+        tm.assert_series_equal(s2 - ser, expected)
+        tm.assert_series_equal(ser - s2, -1 * expected)
+
+
+class TestPeriodIndexSeriesMethods:
+    """Test PeriodIndex and Period Series Ops consistency"""
+
+    def _check(self, values, func, expected):
+        idx = PeriodIndex(values)
+        result = func(idx)
+        tm.assert_equal(result, expected)
+
+        ser = Series(values)
+        result = func(ser)
+
+        exp = Series(expected, name=values.name)
+        tm.assert_series_equal(result, exp)
+
+    def test_pi_ops(self):
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx"
+        )
+
+        expected = PeriodIndex(
+            ["2011-03", "2011-04", "2011-05", "2011-06"], freq="M", name="idx"
+        )
+
+        self._check(idx, lambda x: x + 2, expected)
+        self._check(idx, lambda x: 2 + x, expected)
+
+        self._check(idx + 2, lambda x: x - 2, idx)
+
+        result = idx - Period("2011-01", freq="M")
+        off = idx.freq
+        exp = pd.Index([0 * off, 1 * off, 2 * off, 3 * off], name="idx")
+        tm.assert_index_equal(result, exp)
+
+        result = Period("2011-01", freq="M") - idx
+        exp = pd.Index([0 * off, -1 * off, -2 * off, -3 * off], name="idx")
+        tm.assert_index_equal(result, exp)
+
+    @pytest.mark.parametrize("ng", ["str", 1.5])
+    @pytest.mark.parametrize(
+        "func",
+        [
+            lambda obj, ng: obj + ng,
+            lambda obj, ng: ng + obj,
+            lambda obj, ng: obj - ng,
+            lambda obj, ng: ng - obj,
+            lambda obj, ng: np.add(obj, ng),
+            lambda obj, ng: np.add(ng, obj),
+            lambda obj, ng: np.subtract(obj, ng),
+            lambda obj, ng: np.subtract(ng, obj),
+        ],
+    )
+    def test_parr_ops_errors(self, ng, func, box_with_array):
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx"
+        )
+        obj = tm.box_expected(idx, box_with_array)
+        msg = "|".join(
+            [
+                r"unsupported operand type\(s\)",
+                "can only concatenate",
+                r"must be str",
+                "object to str implicitly",
+            ]
+        )
+
+        with pytest.raises(TypeError, match=msg):
+            func(obj, ng)
+
+    def test_pi_ops_nat(self):
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx"
+        )
+        expected = PeriodIndex(
+            ["2011-03", "2011-04", "NaT", "2011-06"], freq="M", name="idx"
+        )
+
+        self._check(idx, lambda x: x + 2, expected)
+        self._check(idx, lambda x: 2 + x, expected)
+        self._check(idx, lambda x: np.add(x, 2), expected)
+
+        self._check(idx + 2, lambda x: x - 2, idx)
+        self._check(idx + 2, lambda x: np.subtract(x, 2), idx)
+
+        # freq with mult
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "NaT", "2011-04"], freq="2M", name="idx"
+        )
+        expected = PeriodIndex(
+            ["2011-07", "2011-08", "NaT", "2011-10"], freq="2M", name="idx"
+        )
+
+        self._check(idx, lambda x: x + 3, expected)
+        self._check(idx, lambda x: 3 + x, expected)
+        self._check(idx, lambda x: np.add(x, 3), expected)
+
+        self._check(idx + 3, lambda x: x - 3, idx)
+        self._check(idx + 3, lambda x: np.subtract(x, 3), idx)
+
+    def test_pi_ops_array_int(self):
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx"
+        )
+        f = lambda x: x + np.array([1, 2, 3, 4])
+        exp = PeriodIndex(
+            ["2011-02", "2011-04", "NaT", "2011-08"], freq="M", name="idx"
+        )
+        self._check(idx, f, exp)
+
+        f = lambda x: np.add(x, np.array([4, -1, 1, 2]))
+        exp = PeriodIndex(
+            ["2011-05", "2011-01", "NaT", "2011-06"], freq="M", name="idx"
+        )
+        self._check(idx, f, exp)
+
+        f = lambda x: x - np.array([1, 2, 3, 4])
+        exp = PeriodIndex(
+            ["2010-12", "2010-12", "NaT", "2010-12"], freq="M", name="idx"
+        )
+        self._check(idx, f, exp)
+
+        f = lambda x: np.subtract(x, np.array([3, 2, 3, -2]))
+        exp = PeriodIndex(
+            ["2010-10", "2010-12", "NaT", "2011-06"], freq="M", name="idx"
+        )
+        self._check(idx, f, exp)
+
+    def test_pi_ops_offset(self):
+        idx = PeriodIndex(
+            ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"],
+            freq="D",
+            name="idx",
+        )
+        f = lambda x: x + pd.offsets.Day()
+        exp = PeriodIndex(
+            ["2011-01-02", "2011-02-02", "2011-03-02", "2011-04-02"],
+            freq="D",
+            name="idx",
+        )
+        self._check(idx, f, exp)
+
+        f = lambda x: x + pd.offsets.Day(2)
+        exp = PeriodIndex(
+            ["2011-01-03", "2011-02-03", "2011-03-03", "2011-04-03"],
+            freq="D",
+            name="idx",
+        )
+        self._check(idx, f, exp)
+
+        f = lambda x: x - pd.offsets.Day(2)
+        exp = PeriodIndex(
+            ["2010-12-30", "2011-01-30", "2011-02-27", "2011-03-30"],
+            freq="D",
+            name="idx",
+        )
+        self._check(idx, f, exp)
+
+    def test_pi_offset_errors(self):
+        idx = PeriodIndex(
+            ["2011-01-01", "2011-02-01", "2011-03-01", "2011-04-01"],
+            freq="D",
+            name="idx",
+        )
+        ser = Series(idx)
+
+        msg = (
+            "Cannot add/subtract timedelta-like from PeriodArray that is not "
+            "an integer multiple of the PeriodArray's freq"
+        )
+        for obj in [idx, ser]:
+            with pytest.raises(IncompatibleFrequency, match=msg):
+                obj + pd.offsets.Hour(2)
+
+            with pytest.raises(IncompatibleFrequency, match=msg):
+                pd.offsets.Hour(2) + obj
+
+            with pytest.raises(IncompatibleFrequency, match=msg):
+                obj - pd.offsets.Hour(2)
+
+    def test_pi_sub_period(self):
+        # GH#13071
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "2011-03", "2011-04"], freq="M", name="idx"
+        )
+
+        result = idx - Period("2012-01", freq="M")
+        off = idx.freq
+        exp = pd.Index([-12 * off, -11 * off, -10 * off, -9 * off], name="idx")
+        tm.assert_index_equal(result, exp)
+
+        result = np.subtract(idx, Period("2012-01", freq="M"))
+        tm.assert_index_equal(result, exp)
+
+        result = Period("2012-01", freq="M") - idx
+        exp = pd.Index([12 * off, 11 * off, 10 * off, 9 * off], name="idx")
+        tm.assert_index_equal(result, exp)
+
+        result = np.subtract(Period("2012-01", freq="M"), idx)
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex(
+            [np.nan, np.nan, np.nan, np.nan], name="idx", dtype="m8[ns]"
+        )
+        result = idx - Period("NaT", freq="M")
+        tm.assert_index_equal(result, exp)
+        assert result.freq == exp.freq
+
+        result = Period("NaT", freq="M") - idx
+        tm.assert_index_equal(result, exp)
+        assert result.freq == exp.freq
+
+    def test_pi_sub_pdnat(self):
+        # GH#13071, GH#19389
+        idx = PeriodIndex(
+            ["2011-01", "2011-02", "NaT", "2011-04"], freq="M", name="idx"
+        )
+        exp = TimedeltaIndex([pd.NaT] * 4, name="idx", dtype="m8[ns]")
+        tm.assert_index_equal(pd.NaT - idx, exp)
+        tm.assert_index_equal(idx - pd.NaT, exp)
+
+    def test_pi_sub_period_nat(self):
+        # GH#13071
+        idx = PeriodIndex(
+            ["2011-01", "NaT", "2011-03", "2011-04"], freq="M", name="idx"
+        )
+
+        result = idx - Period("2012-01", freq="M")
+        off = idx.freq
+        exp = pd.Index([-12 * off, pd.NaT, -10 * off, -9 * off], name="idx")
+        tm.assert_index_equal(result, exp)
+
+        result = Period("2012-01", freq="M") - idx
+        exp = pd.Index([12 * off, pd.NaT, 10 * off, 9 * off], name="idx")
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex(
+            [np.nan, np.nan, np.nan, np.nan], name="idx", dtype="m8[ns]"
+        )
+        tm.assert_index_equal(idx - Period("NaT", freq="M"), exp)
+        tm.assert_index_equal(Period("NaT", freq="M") - idx, exp)
diff --git a/pandas/tests/arithmetic/test_string.py b/pandas/tests/arithmetic/test_string.py
new file mode 100644
index 0000000000000000000000000000000000000000..46a3d1e8386eb29153700937bb82e43ddf23883d
--- /dev/null
+++ b/pandas/tests/arithmetic/test_string.py
@@ -0,0 +1,472 @@
+import operator
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from pandas.compat import HAS_PYARROW
+from pandas.errors import Pandas4Warning
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    NA,
+    ArrowDtype,
+    Series,
+    StringDtype,
+)
+import pandas._testing as tm
+from pandas.core.construction import extract_array
+
+
+def string_dtype_highest_priority(dtype1, dtype2):
+    if HAS_PYARROW:
+        DTYPE_HIERARCHY = [
+            StringDtype("python", na_value=np.nan),
+            StringDtype("pyarrow", na_value=np.nan),
+            StringDtype("python", na_value=NA),
+            StringDtype("pyarrow", na_value=NA),
+        ]
+    else:
+        DTYPE_HIERARCHY = [
+            StringDtype("python", na_value=np.nan),
+            StringDtype("python", na_value=NA),
+        ]
+
+    h1 = DTYPE_HIERARCHY.index(dtype1)
+    h2 = DTYPE_HIERARCHY.index(dtype2)
+    return DTYPE_HIERARCHY[max(h1, h2)]
+
+
+def test_eq_all_na():
+    pytest.importorskip("pyarrow")
+    a = pd.array([NA, NA], dtype=StringDtype("pyarrow"))
+    result = a == a
+    expected = pd.array([NA, NA], dtype="boolean[pyarrow]")
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_reversed_logical_ops(any_string_dtype):
+    # GH#60234
+    dtype = any_string_dtype
+    warn = None if dtype == object else Pandas4Warning
+    left = Series([True, False, False, True])
+    right = Series(["", "", "b", "c"], dtype=dtype)
+
+    msg = "operations between boolean dtype and"
+    with tm.assert_produces_warning(warn, match=msg):
+        result = left | right
+    expected = left | right.astype(bool)
+    tm.assert_series_equal(result, expected)
+
+    with tm.assert_produces_warning(warn, match=msg):
+        result = left & right
+    expected = left & right.astype(bool)
+    tm.assert_series_equal(result, expected)
+
+    with tm.assert_produces_warning(warn, match=msg):
+        result = left ^ right
+    expected = left ^ right.astype(bool)
+    tm.assert_series_equal(result, expected)
+
+
+def test_pathlib_path_division(any_string_dtype, request):
+    # GH#61940
+    if any_string_dtype == object:
+        mark = pytest.mark.xfail(
+            reason="with NA present we go through _masked_arith_op which "
+            "raises TypeError bc Path is not recognized by lib.is_scalar."
+        )
+        request.applymarker(mark)
+
+    item = Path("/Users/Irv/")
+    ser = Series(["A", "B", NA], dtype=any_string_dtype)
+
+    result = item / ser
+    expected = Series([item / "A", item / "B", ser.dtype.na_value], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    result = ser / item
+    expected = Series(["A" / item, "B" / item, ser.dtype.na_value], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_mixed_object_comparison(any_string_dtype):
+    # GH#60228
+    dtype = any_string_dtype
+    ser = Series(["a", "b"], dtype=dtype)
+
+    mixed = Series([1, "b"], dtype=object)
+
+    result = ser == mixed
+    expected = Series([False, True], dtype=bool)
+    if dtype == object:
+        pass
+    elif dtype.storage == "python" and dtype.na_value is NA:
+        expected = expected.astype("boolean")
+    elif dtype.storage == "pyarrow" and dtype.na_value is NA:
+        expected = expected.astype("bool[pyarrow]")
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_pyarrow_numpy_string_invalid():
+    # GH#56008
+    pa = pytest.importorskip("pyarrow")
+    ser = Series([False, True])
+    ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan))
+    result = ser == ser2
+    expected_eq = Series(False, index=ser.index)
+    tm.assert_series_equal(result, expected_eq)
+
+    result = ser != ser2
+    expected_ne = Series(True, index=ser.index)
+    tm.assert_series_equal(result, expected_ne)
+
+    with pytest.raises(TypeError, match="Invalid comparison"):
+        ser > ser2
+
+    # GH#59505
+    ser3 = ser2.astype("string[pyarrow]")
+    result3_eq = ser3 == ser
+    tm.assert_series_equal(result3_eq, expected_eq.astype("bool[pyarrow]"))
+    result3_ne = ser3 != ser
+    tm.assert_series_equal(result3_ne, expected_ne.astype("bool[pyarrow]"))
+
+    with pytest.raises(TypeError, match="Invalid comparison"):
+        ser > ser3
+
+    ser4 = ser2.astype(ArrowDtype(pa.string()))
+    result4_eq = ser4 == ser
+    tm.assert_series_equal(result4_eq, expected_eq.astype("bool[pyarrow]"))
+    result4_ne = ser4 != ser
+    tm.assert_series_equal(result4_ne, expected_ne.astype("bool[pyarrow]"))
+
+    with pytest.raises(TypeError, match="Invalid comparison"):
+        ser > ser4
+
+
+def test_mul_bool_invalid(any_string_dtype):
+    # GH#62595
+    dtype = any_string_dtype
+    ser = Series(["a", "b", "c"], dtype=dtype)
+
+    if dtype == object:
+        pytest.skip("This is not expect to raise")
+    elif dtype.storage == "python":
+        msg = "Cannot multiply StringArray by bools. Explicitly cast to integers"
+    else:
+        msg = "Can only string multiply by an integer"
+
+    with pytest.raises(TypeError, match=msg):
+        False * ser
+    with pytest.raises(TypeError, match=msg):
+        ser * True
+    with pytest.raises(TypeError, match=msg):
+        ser * np.array([True, False, True], dtype=bool)
+    with pytest.raises(TypeError, match=msg):
+        np.array([True, False, True], dtype=bool) * ser
+
+
+def test_add(any_string_dtype, request):
+    dtype = any_string_dtype
+    if dtype == object:
+        mark = pytest.mark.xfail(
+            reason="Need to update expected for numpy object dtype"
+        )
+        request.applymarker(mark)
+
+    a = Series(["a", "b", "c", None, None], dtype=dtype)
+    b = Series(["x", "y", None, "z", None], dtype=dtype)
+
+    result = a + b
+    expected = Series(["ax", "by", None, None, None], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = a.add(b)
+    tm.assert_series_equal(result, expected)
+
+    result = a.radd(b)
+    expected = Series(["xa", "yb", None, None, None], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = a.add(b, fill_value="-")
+    expected = Series(["ax", "by", "c-", "-z", None], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_add_2d(any_string_dtype, request):
+    dtype = any_string_dtype
+
+    if dtype == object or dtype.storage == "pyarrow":
+        reason = "Failed: DID NOT RAISE <class 'ValueError'>"
+        mark = pytest.mark.xfail(raises=None, reason=reason)
+        request.applymarker(mark)
+
+    a = pd.array(["a", "b", "c"], dtype=dtype)
+    b = np.array([["a", "b", "c"]], dtype=object)
+    with pytest.raises(ValueError, match="3 != 1"):
+        a + b
+
+    s = Series(a)
+    with pytest.raises(ValueError, match="3 != 1"):
+        s + b
+
+
+def test_add_sequence(any_string_dtype, request, using_infer_string):
+    dtype = any_string_dtype
+    if (
+        dtype != object
+        and dtype.storage == "python"
+        and dtype.na_value is np.nan
+        and HAS_PYARROW
+        and using_infer_string
+    ):
+        mark = pytest.mark.xfail(
+            reason="As of GH#62522, the list gets wrapped with sanitize_array, "
+            "which casts to a higher-priority StringArray, so we get "
+            "NotImplemented."
+        )
+        request.applymarker(mark)
+    if dtype == np.dtype(object) and using_infer_string:
+        mark = pytest.mark.xfail(reason="Cannot broadcast list")
+        request.applymarker(mark)
+
+    a = pd.array(["a", "b", None, None], dtype=dtype)
+    other = ["x", None, "y", None]
+
+    result = a + other
+    expected = pd.array(["ax", None, None, None], dtype=dtype)
+    tm.assert_extension_array_equal(result, expected)
+
+    result = other + a
+    expected = pd.array(["xa", None, None, None], dtype=dtype)
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_mul(any_string_dtype):
+    dtype = any_string_dtype
+    a = pd.array(["a", "b", None], dtype=dtype)
+    result = a * 2
+    expected = pd.array(["aa", "bb", None], dtype=dtype)
+    tm.assert_extension_array_equal(result, expected)
+
+    result = 2 * a
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_add_strings(any_string_dtype, request):
+    dtype = any_string_dtype
+    if dtype != np.dtype(object):
+        mark = pytest.mark.xfail(reason="GH-28527")
+        request.applymarker(mark)
+    arr = pd.array(["a", "b", "c", "d"], dtype=dtype)
+    df = pd.DataFrame([["t", "y", "v", "w"]], dtype=object)
+    assert arr.__add__(df) is NotImplemented
+
+    result = arr + df
+    expected = pd.DataFrame([["at", "by", "cv", "dw"]]).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+    result = df + arr
+    expected = pd.DataFrame([["ta", "yb", "vc", "wd"]]).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="GH-28527")
+def test_add_frame(dtype):
+    arr = pd.array(["a", "b", np.nan, np.nan], dtype=dtype)
+    df = pd.DataFrame([["x", np.nan, "y", np.nan]])
+
+    assert arr.__add__(df) is NotImplemented
+
+    result = arr + df
+    expected = pd.DataFrame([["ax", np.nan, np.nan, np.nan]]).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+    result = df + arr
+    expected = pd.DataFrame([["xa", np.nan, np.nan, np.nan]]).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_comparison_methods_scalar(comparison_op, any_string_dtype):
+    dtype = any_string_dtype
+    op_name = f"__{comparison_op.__name__}__"
+    a = pd.array(["a", None, "c"], dtype=dtype)
+    other = "a"
+    result = getattr(a, op_name)(other)
+    if dtype == object or dtype.na_value is np.nan:
+        expected = np.array([getattr(item, op_name)(other) for item in a])
+        if comparison_op == operator.ne:
+            expected[1] = True
+        else:
+            expected[1] = False
+        result = extract_array(result, extract_numpy=True)
+        tm.assert_numpy_array_equal(result, expected.astype(np.bool_))
+    else:
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = np.array([getattr(item, op_name)(other) for item in a], dtype=object)
+        expected = pd.array(expected, dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
+
+
+def test_comparison_methods_scalar_pd_na(comparison_op, any_string_dtype):
+    dtype = any_string_dtype
+    op_name = f"__{comparison_op.__name__}__"
+    a = pd.array(["a", None, "c"], dtype=dtype)
+    result = getattr(a, op_name)(NA)
+
+    if dtype == np.dtype(object) or dtype.na_value is np.nan:
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, True])
+        else:
+            expected = np.array([False, False, False])
+        result = extract_array(result, extract_numpy=True)
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = pd.array([None, None, None], dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
+        tm.assert_extension_array_equal(result, expected)
+
+
+def test_comparison_methods_scalar_not_string(comparison_op, any_string_dtype):
+    op_name = f"__{comparison_op.__name__}__"
+    dtype = any_string_dtype
+
+    a = pd.array(["a", None, "c"], dtype=dtype)
+    other = 42
+
+    if op_name not in ["__eq__", "__ne__"]:
+        with pytest.raises(TypeError, match="Invalid comparison|not supported between"):
+            getattr(a, op_name)(other)
+
+        return
+
+    result = getattr(a, op_name)(other)
+    result = extract_array(result, extract_numpy=True)
+
+    if dtype == np.dtype(object) or dtype.na_value is np.nan:
+        expected_data = {
+            "__eq__": [False, False, False],
+            "__ne__": [True, True, True],
+        }[op_name]
+        expected = np.array(expected_data)
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        expected_data = {"__eq__": [False, None, False], "__ne__": [True, None, True]}[
+            op_name
+        ]
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = pd.array(expected_data, dtype=expected_dtype)
+        tm.assert_extension_array_equal(result, expected)
+
+
+def test_comparison_methods_array(comparison_op, any_string_dtype, any_string_dtype2):
+    op_name = f"__{comparison_op.__name__}__"
+    dtype = any_string_dtype
+    dtype2 = any_string_dtype2
+
+    a = pd.array(["a", None, "c"], dtype=dtype)
+    other = pd.array([None, None, "c"], dtype=dtype2)
+    result = comparison_op(a, other)
+    result = extract_array(result, extract_numpy=True)
+
+    # ensure operation is commutative
+    result2 = comparison_op(other, a)
+    result2 = extract_array(result2, extract_numpy=True)
+    tm.assert_equal(result, result2)
+
+    if (dtype == object or dtype.na_value is np.nan) and (
+        dtype2 == object or dtype2.na_value is np.nan
+    ):
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, False])
+        else:
+            expected = np.array([False, False, False])
+            expected[-1] = getattr(other[-1], op_name)(a[-1])
+        result = extract_array(result, extract_numpy=True)
+        tm.assert_numpy_array_equal(result, expected)
+
+    else:
+        if dtype == object:
+            max_dtype = dtype2
+        elif dtype2 == object:
+            max_dtype = dtype
+        else:
+            max_dtype = string_dtype_highest_priority(dtype, dtype2)
+        if max_dtype.storage == "python":
+            expected_dtype = "boolean"
+        else:
+            expected_dtype = "bool[pyarrow]"
+
+        expected = np.full(len(a), fill_value=None, dtype="object")
+        expected[-1] = getattr(other[-1], op_name)(a[-1])
+        expected = pd.array(expected, dtype=expected_dtype)
+        tm.assert_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_comparison_methods_array_arrow_extension(comparison_op, any_string_dtype):
+    # Test pd.ArrowDtype(pa.string()) against other string arrays
+    import pyarrow as pa
+
+    dtype2 = any_string_dtype
+
+    op_name = f"__{comparison_op.__name__}__"
+    dtype = ArrowDtype(pa.string())
+    a = pd.array(["a", None, "c"], dtype=dtype)
+    other = pd.array([None, None, "c"], dtype=dtype2)
+    result = comparison_op(a, other)
+
+    # ensure operation is commutative
+    result2 = comparison_op(other, a)
+    tm.assert_equal(result, result2)
+
+    expected = pd.array([None, None, True], dtype="bool[pyarrow]")
+    expected[-1] = getattr(other[-1], op_name)(a[-1])
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("box", [pd.array, pd.Index, Series])
+def test_comparison_methods_list(comparison_op, any_string_dtype, box, request):
+    dtype = any_string_dtype
+
+    if box is pd.array and dtype != object and dtype.na_value is np.nan:
+        mark = pytest.mark.xfail(
+            reason="After wrapping list, op returns NotImplemented, see GH#62522"
+        )
+        request.applymarker(mark)
+
+    op_name = f"__{comparison_op.__name__}__"
+
+    a = box(pd.array(["a", None, "c"], dtype=dtype))
+    item = "c"
+    other = [None, None, "c"]
+    result = comparison_op(a, other)
+
+    # ensure operation is commutative
+    result2 = comparison_op(other, a)
+    tm.assert_equal(result, result2)
+
+    if dtype == np.dtype(object) or dtype.na_value is np.nan:
+        if operator.ne == comparison_op:
+            expected = np.array([True, True, False])
+        else:
+            expected = np.array([False, False, False])
+            expected[-1] = getattr(item, op_name)(item)
+        if box is not pd.Index:
+            # if GH#62766 is addressed this check can be removed
+            expected = box(expected, dtype=expected.dtype)
+        tm.assert_equal(result, expected)
+
+    else:
+        expected_dtype = "boolean[pyarrow]" if dtype.storage == "pyarrow" else "boolean"
+        expected = np.full(len(a), fill_value=None, dtype="object")
+        expected[-1] = getattr(item, op_name)(item)
+        expected = pd.array(expected, dtype=expected_dtype)
+        expected = extract_array(expected, extract_numpy=True)
+        if box is not pd.Index:
+            # if GH#62766 is addressed this check can be removed
+            expected = tm.box_expected(expected, box)
+        tm.assert_equal(result, expected)
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
new file mode 100644
index 0000000000000000000000000000000000000000..89a9148bed5575f24b99c0446f85fe4826062518
--- /dev/null
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -0,0 +1,2331 @@
+# Arithmetic tests for DataFrame/Series/Index/Array classes that should
+# behave identically.
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import timezones
+from pandas.compat import WASM
+from pandas.errors import OutOfBoundsDatetime
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    NaT,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    Timestamp,
+    offsets,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.core.arrays import NumpyExtensionArray
+from pandas.tests.arithmetic.common import (
+    assert_invalid_addsub_type,
+    assert_invalid_comparison,
+    get_upcast_box,
+)
+
+
+def assert_dtype(obj, expected_dtype):
+    """
+    Helper to check the dtype for a Series, Index, or single-column DataFrame.
+    """
+    dtype = tm.get_dtype(obj)
+
+    assert dtype == expected_dtype
+
+
+def get_expected_name(box, names):
+    if box is DataFrame:
+        # Since we are operating with a DataFrame and a non-DataFrame,
+        # the non-DataFrame is cast to Series and its name ignored.
+        exname = names[0]
+    elif box in [tm.to_array, pd.array]:
+        exname = names[1]
+    else:
+        exname = names[2]
+    return exname
+
+
+# ------------------------------------------------------------------
+# Timedelta64[ns] dtype Comparisons
+
+
+class TestTimedelta64ArrayLikeComparisons:
+    # Comparison tests for timedelta64[ns] vectors fully parametrized over
+    #  DataFrame/Series/TimedeltaIndex/TimedeltaArray.  Ideally all comparison
+    #  tests will eventually end up here.
+
+    def test_compare_timedelta64_zerodim(self, box_with_array):
+        # GH#26689 should unbox when comparing with zerodim array
+        box = box_with_array
+        xbox = box_with_array if box_with_array not in [Index, pd.array] else np.ndarray
+
+        tdi = timedelta_range("2h", periods=4)
+        other = np.array(tdi.to_numpy()[0])
+
+        tdi = tm.box_expected(tdi, box)
+        res = tdi <= other
+        expected = np.array([True, False, False, False])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(res, expected)
+
+    @pytest.mark.parametrize(
+        "td_scalar",
+        [
+            timedelta(days=1),
+            Timedelta(days=1),
+            Timedelta(days=1).to_timedelta64(),
+            offsets.Hour(24),
+        ],
+    )
+    def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar):
+        # regression test for GH#5963
+        box = box_with_array
+        xbox = box if box not in [Index, pd.array] else np.ndarray
+
+        ser = Series([timedelta(days=1), timedelta(days=2)])
+        ser = tm.box_expected(ser, box)
+        actual = ser > td_scalar
+        expected = Series([False, True])
+        expected = tm.box_expected(expected, xbox)
+        tm.assert_equal(actual, expected)
+
+    @pytest.mark.parametrize(
+        "invalid",
+        [
+            345600000000000,
+            "a",
+            Timestamp("2021-01-01"),
+            Timestamp("2021-01-01").now("UTC"),
+            Timestamp("2021-01-01").now().to_datetime64(),
+            Timestamp("2021-01-01").now().to_pydatetime(),
+            Timestamp("2021-01-01").date(),
+            np.array(4),  # zero-dim mismatched dtype
+        ],
+    )
+    def test_td64_comparisons_invalid(self, box_with_array, invalid):
+        # GH#13624 for str
+        box = box_with_array
+
+        rng = timedelta_range("1 days", periods=10)
+        obj = tm.box_expected(rng, box)
+
+        assert_invalid_comparison(obj, invalid, box)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            list(range(10)),
+            np.arange(10),
+            np.arange(10).astype(np.float32),
+            np.arange(10).astype(object),
+            pd.date_range("1970-01-01", periods=10, tz="UTC").array,
+            np.array(pd.date_range("1970-01-01", periods=10)),
+            list(pd.date_range("1970-01-01", periods=10)),
+            pd.date_range("1970-01-01", periods=10).astype(object),
+            pd.period_range("1971-01-01", freq="D", periods=10).array,
+            pd.period_range("1971-01-01", freq="D", periods=10).astype(object),
+        ],
+    )
+    def test_td64arr_cmp_arraylike_invalid(self, other, box_with_array):
+        # We don't parametrize this over box_with_array because listlike
+        #  other plays poorly with assert_invalid_comparison reversed checks
+
+        rng = timedelta_range("1 days", periods=10)._data
+        rng = tm.box_expected(rng, box_with_array)
+        assert_invalid_comparison(rng, other, box_with_array)
+
+    def test_td64arr_cmp_mixed_invalid(self):
+        rng = timedelta_range("1 days", periods=5)._data
+        other = np.array([0, 1, 2, rng[3], Timestamp("2021-01-01")])
+
+        result = rng == other
+        expected = np.array([False, False, False, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng != other
+        tm.assert_numpy_array_equal(result, ~expected)
+
+        msg = "Invalid comparison between|Cannot compare type|not supported between"
+        with pytest.raises(TypeError, match=msg):
+            rng < other
+        with pytest.raises(TypeError, match=msg):
+            rng > other
+        with pytest.raises(TypeError, match=msg):
+            rng <= other
+        with pytest.raises(TypeError, match=msg):
+            rng >= other
+
+
+class TestTimedelta64ArrayComparisons:
+    # TODO: All of these need to be parametrized over box
+
+    @pytest.mark.parametrize("dtype", [None, object])
+    def test_comp_nat(self, dtype):
+        left = TimedeltaIndex([Timedelta("1 days"), NaT, Timedelta("3 days")])
+        right = TimedeltaIndex([NaT, NaT, Timedelta("3 days")])
+
+        lhs, rhs = left, right
+        if dtype is object:
+            lhs, rhs = left.astype(object), right.astype(object)
+
+        result = rhs == lhs
+        expected = np.array([False, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rhs != lhs
+        expected = np.array([True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        expected = np.array([False, False, False])
+        tm.assert_numpy_array_equal(lhs == NaT, expected)
+        tm.assert_numpy_array_equal(NaT == rhs, expected)
+
+        expected = np.array([True, True, True])
+        tm.assert_numpy_array_equal(lhs != NaT, expected)
+        tm.assert_numpy_array_equal(NaT != lhs, expected)
+
+        expected = np.array([False, False, False])
+        tm.assert_numpy_array_equal(lhs < NaT, expected)
+        tm.assert_numpy_array_equal(NaT > lhs, expected)
+
+    @pytest.mark.parametrize(
+        "idx2",
+        [
+            TimedeltaIndex(
+                ["2 day", "2 day", NaT, NaT, "1 day 00:00:02", "5 days 00:00:03"]
+            ),
+            np.array(
+                [
+                    np.timedelta64(2, "D"),
+                    np.timedelta64(2, "D"),
+                    np.timedelta64("nat"),
+                    np.timedelta64("nat"),
+                    np.timedelta64(1, "D") + np.timedelta64(2, "s"),
+                    np.timedelta64(5, "D") + np.timedelta64(3, "s"),
+                ]
+            ),
+        ],
+    )
+    def test_comparisons_nat(self, idx2):
+        idx1 = TimedeltaIndex(
+            [
+                "1 day",
+                NaT,
+                "1 day 00:00:01",
+                NaT,
+                "1 day 00:00:01",
+                "5 day 00:00:03",
+            ]
+        )
+        # Check pd.NaT is handles as the same as np.nan
+        result = idx1 < idx2
+        expected = np.array([True, False, False, False, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = idx2 > idx1
+        expected = np.array([True, False, False, False, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = idx1 <= idx2
+        expected = np.array([True, False, False, False, True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = idx2 >= idx1
+        expected = np.array([True, False, False, False, True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = idx1 == idx2
+        expected = np.array([False, False, False, False, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = idx1 != idx2
+        expected = np.array([True, True, True, True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+    # TODO: better name
+    def test_comparisons_coverage(self):
+        rng = timedelta_range("1 days", periods=10)
+
+        result = rng < rng[3]
+        expected = np.array([True, True, True] + [False] * 7)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = rng == list(rng)
+        exp = rng == rng
+        tm.assert_numpy_array_equal(result, exp)
+
+
+# ------------------------------------------------------------------
+# Timedelta64[ns] dtype Arithmetic Operations
+
+
+class TestTimedelta64ArithmeticUnsorted:
+    # Tests moved from type-specific test files but not
+    #  yet sorted/parametrized/de-duplicated
+
+    def test_td64_op_with_list(self, box_with_array):
+        # GH#62353
+        box = box_with_array
+
+        left = TimedeltaIndex(["2D", "4D"])
+        left = tm.box_expected(left, box)
+
+        right = [Timestamp("2016-01-01"), Timestamp("2016-02-01")]
+
+        result = left + right
+        expected = DatetimeIndex(["2016-01-03", "2016-02-05"], dtype="M8[us]")
+        expected = tm.box_expected(expected, box)
+        tm.assert_equal(result, expected)
+
+        result2 = right + left
+        tm.assert_equal(result2, expected)
+
+    def test_ufunc_coercions(self):
+        # normal ops are also tested in tseries/test_timedeltas.py
+        idx = TimedeltaIndex(["2h", "4h", "6h", "8h", "10h"], freq="2h", name="x")
+
+        for result in [idx * 2, np.multiply(idx, 2)]:
+            assert isinstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(["4h", "8h", "12h", "16h", "20h"], freq="4h", name="x")
+            tm.assert_index_equal(result, exp)
+            assert result.freq == "4h"
+
+        for result in [idx / 2, np.divide(idx, 2)]:
+            assert isinstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(["1h", "2h", "3h", "4h", "5h"], freq="h", name="x")
+            tm.assert_index_equal(result, exp)
+            assert result.freq == "h"
+
+        for result in [-idx, np.negative(idx)]:
+            assert isinstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(
+                ["-2h", "-4h", "-6h", "-8h", "-10h"], freq="-2h", name="x"
+            )
+            tm.assert_index_equal(result, exp)
+            assert result.freq == "-2h"
+
+        idx = TimedeltaIndex(["-2h", "-1h", "0h", "1h", "2h"], freq="h", name="x")
+        for result in [abs(idx), np.absolute(idx)]:
+            assert isinstance(result, TimedeltaIndex)
+            exp = TimedeltaIndex(["2h", "1h", "0h", "1h", "2h"], freq=None, name="x")
+            tm.assert_index_equal(result, exp)
+            assert result.freq is None
+
+    def test_subtraction_ops(self):
+        # with datetimes/timedelta and tdi/dti
+        tdi = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo")
+        dti = pd.date_range("20130101", periods=3, name="bar")
+        td = Timedelta("1 days")
+        dt = Timestamp("20130101")
+
+        msg = "cannot subtract a datelike from a TimedeltaArray"
+        with pytest.raises(TypeError, match=msg):
+            tdi - dt
+        with pytest.raises(TypeError, match=msg):
+            tdi - dti
+
+        msg = r"unsupported operand type\(s\) for -"
+        with pytest.raises(TypeError, match=msg):
+            td - dt
+
+        msg = "cannot subtract DatetimeArray from Timedelta"
+        with pytest.raises(TypeError, match=msg):
+            td - dti
+
+        result = dt - dti
+        expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"], name="bar")
+        tm.assert_index_equal(result, expected)
+
+        result = dti - dt
+        expected = TimedeltaIndex(["0 days", "1 days", "2 days"], name="bar")
+        tm.assert_index_equal(result, expected)
+
+        result = tdi - td
+        expected = TimedeltaIndex(["0 days", NaT, "1 days"], name="foo")
+        tm.assert_index_equal(result, expected)
+
+        result = td - tdi
+        expected = TimedeltaIndex(["0 days", NaT, "-1 days"], name="foo")
+        tm.assert_index_equal(result, expected)
+
+        result = dti - td
+        expected = DatetimeIndex(
+            ["20121231", "20130101", "20130102"], dtype="M8[us]", freq="D", name="bar"
+        )
+        tm.assert_index_equal(result, expected)
+
+        result = dt - tdi
+        expected = DatetimeIndex(
+            ["20121231", NaT, "20121230"], dtype="M8[us]", name="foo"
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_subtraction_ops_with_tz(self, box_with_array):
+        # check that dt/dti subtraction ops with tz are validated
+        dti = pd.date_range("20130101", periods=3)
+        dti = tm.box_expected(dti, box_with_array)
+        ts = Timestamp("20130101")
+        dt = ts.to_pydatetime()
+        dti_tz = pd.date_range("20130101", periods=3).tz_localize("US/Eastern")
+        dti_tz = tm.box_expected(dti_tz, box_with_array)
+        ts_tz = Timestamp("20130101").tz_localize("US/Eastern")
+        ts_tz2 = Timestamp("20130101").tz_localize("CET")
+        dt_tz = ts_tz.to_pydatetime()
+        td = Timedelta("1 days")
+
+        def _check(result, expected):
+            assert result == expected
+            assert isinstance(result, Timedelta)
+
+        # scalars
+        result = ts - ts
+        expected = Timedelta("0 days")
+        _check(result, expected)
+
+        result = dt_tz - ts_tz
+        expected = Timedelta("0 days")
+        _check(result, expected)
+
+        result = ts_tz - dt_tz
+        expected = Timedelta("0 days")
+        _check(result, expected)
+
+        # tz mismatches
+        msg = "Cannot subtract tz-naive and tz-aware datetime-like objects."
+        with pytest.raises(TypeError, match=msg):
+            dt_tz - ts
+        msg = "can't subtract offset-naive and offset-aware datetimes"
+        with pytest.raises(TypeError, match=msg):
+            dt_tz - dt
+        msg = "can't subtract offset-naive and offset-aware datetimes"
+        with pytest.raises(TypeError, match=msg):
+            dt - dt_tz
+        msg = "Cannot subtract tz-naive and tz-aware datetime-like objects."
+        with pytest.raises(TypeError, match=msg):
+            ts - dt_tz
+        with pytest.raises(TypeError, match=msg):
+            ts_tz2 - ts
+        with pytest.raises(TypeError, match=msg):
+            ts_tz2 - dt
+
+        msg = "Cannot subtract tz-naive and tz-aware"
+        # with dti
+        with pytest.raises(TypeError, match=msg):
+            dti - ts_tz
+        with pytest.raises(TypeError, match=msg):
+            dti_tz - ts
+
+        result = dti_tz - dt_tz
+        expected = TimedeltaIndex(["0 days", "1 days", "2 days"])
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        result = dt_tz - dti_tz
+        expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"])
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        result = dti_tz - ts_tz
+        expected = TimedeltaIndex(["0 days", "1 days", "2 days"])
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        result = ts_tz - dti_tz
+        expected = TimedeltaIndex(["0 days", "-1 days", "-2 days"])
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+        result = td - td
+        expected = Timedelta("0 days")
+        _check(result, expected)
+
+        result = dti_tz - td
+        expected = DatetimeIndex(["20121231", "20130101", "20130102"], tz="US/Eastern")
+        expected = tm.box_expected(expected, box_with_array)
+        tm.assert_equal(result, expected)
+
+    def test_dti_tdi_numeric_ops(self):
+        # These are normally union/diff set-like ops
+        tdi = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo")
+        dti = pd.date_range("20130101", periods=3, name="bar")
+
+        result = tdi - tdi
+        expected = TimedeltaIndex(["0 days", NaT, "0 days"], name="foo")
+        tm.assert_index_equal(result, expected)
+
+        result = tdi + tdi
+        expected = TimedeltaIndex(["2 days", NaT, "4 days"], name="foo")
+        tm.assert_index_equal(result, expected)
+
+        result = dti - tdi  # name will be reset
+        expected = DatetimeIndex(["20121231", NaT, "20130101"], dtype="M8[us]")
+        tm.assert_index_equal(result, expected)
+
+    def test_addition_ops(self):
+        # with datetimes/timedelta and tdi/dti
+        tdi = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo")
+        dti = pd.date_range("20130101", periods=3, name="bar")
+        td = Timedelta("1 days")
+        dt = Timestamp("20130101")
+
+        result = tdi + dt
+        expected = DatetimeIndex(
+            ["20130102", NaT, "20130103"], dtype="M8[us]", name="foo"
+        )
+        tm.assert_index_equal(result, expected)
+
+        result = dt + tdi
+        expected = DatetimeIndex(
+            ["20130102", NaT, "20130103"], dtype="M8[us]", name="foo"
+        )
+        tm.assert_index_equal(result, expected)
+
+        result = td + tdi
+        expected = TimedeltaIndex(["2 days", NaT, "3 days"], name="foo")
+        tm.assert_index_equal(result, expected)
+
+        result = tdi + td
+        expected = TimedeltaIndex(["2 days", NaT, "3 days"], name="foo")
+        tm.assert_index_equal(result, expected)
+
+        # unequal length
+        msg = "cannot add indices of unequal length"
+        with pytest.raises(ValueError, match=msg):
+            tdi + dti[0:1]
+        with pytest.raises(ValueError, match=msg):
+            tdi[0:1] + dti
+
+        # random indexes
+        msg = "Addition/subtraction of integers and integer-arrays"
+        with pytest.raises(TypeError, match=msg):
+            tdi + Index([1, 2, 3], dtype=np.int64)
+
+        # this is a union!
+        # FIXME: don't leave commented-out
+        # pytest.raises(TypeError, lambda : Index([1,2,3]) + tdi)
+
+        result = tdi + dti  # name will be reset
+        expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[us]")
+        tm.assert_index_equal(result, expected)
+
+        result = dti + tdi  # name will be reset
+        expected = DatetimeIndex(["20130102", NaT, "20130105"], dtype="M8[us]")
+        tm.assert_index_equal(result, expected)
+
+        result = dt + td
+        expected = Timestamp("20130102")
+        assert result == expected
+
+        result = td + dt
+        expected = Timestamp("20130102")
+        assert result == expected
+
+    # TODO: Needs more informative name, probably split up into
+    # more targeted tests
+    @pytest.mark.parametrize("freq", ["D", "B"])
+    def test_timedelta(self, freq):
+        index = pd.date_range("1/1/2000", periods=50, freq=freq, unit="ns")
+
+        shifted = index + timedelta(1)
+        back = shifted + timedelta(-1)
+        back = back._with_freq("infer")
+        tm.assert_index_equal(index, back)
+
+        if freq == "D":
+            expected = pd.tseries.offsets.Day(1)
+            assert index.freq == expected
+            assert shifted.freq == expected
+            assert back.freq == expected
+        else:  # freq == 'B'
+            assert index.freq == pd.tseries.offsets.BusinessDay(1)
+            assert shifted.freq is None
+            assert back.freq == pd.tseries.offsets.BusinessDay(1)
+
+        result = index - timedelta(1)
+        expected = index + timedelta(-1)
+        tm.assert_index_equal(result, expected)
+
+    def test_timedelta_tick_arithmetic(self):
+        # GH#4134, buggy with timedeltas
+        rng = pd.date_range("2013", "2014")
+        s = Series(rng)
+        result1 = rng - offsets.Hour(1)
+        result2 = DatetimeIndex(s - np.timedelta64(100000000))
+        result3 = rng - np.timedelta64(100000000)
+        result4 = DatetimeIndex(s - offsets.Hour(1))
+
+        assert result1.freq == rng.freq
+        result1 = result1._with_freq(None)
+        tm.assert_index_equal(result1, result4)
+
+        assert result3.freq == rng.freq
+        result3 = result3._with_freq(None)
+        tm.assert_index_equal(result2, result3)
+
+    def test_tda_add_sub_index(self):
+        # Check that TimedeltaArray defers to Index on arithmetic ops
+        tdi = TimedeltaIndex(["1 days", NaT, "2 days"])
+        tda = tdi.array
+
+        dti = pd.date_range("1999-12-31", periods=3, freq="D")
+
+        result = tda + dti
+        expected = tdi + dti
+        tm.assert_index_equal(result, expected)
+
+        result = tda + tdi
+        expected = tdi + tdi
+        tm.assert_index_equal(result, expected)
+
+        result = tda - tdi
+        expected = tdi - tdi
+        tm.assert_index_equal(result, expected)
+
+    def test_tda_add_dt64_object_array(
+        self, performance_warning, box_with_array, tz_naive_fixture
+    ):
+        # Result should be cast back to DatetimeArray
+        box = box_with_array
+
+        dti = pd.date_range("2016-01-01", periods=3, tz=tz_naive_fixture)
+        dti = dti._with_freq(None)
+        tdi = dti - dti
+
+        obj = tm.box_expected(tdi, box)
+        other = tm.box_expected(dti, box)
+
+        with tm.assert_produces_warning(performance_warning):
+            result = obj + other.astype(object)
+        tm.assert_equal(result, other.astype(object))
+
+    # -------------------------------------------------------------
+    # Binary operations TimedeltaIndex and timedelta-like
+
+    def test_tdi_iadd_timedeltalike(self, two_hours, box_with_array):
+        # only test adding/sub offsets as + is now numeric
+        rng = timedelta_range("1 days", "10 days")
+        expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D")
+        if (
+            isinstance(two_hours, Timedelta)
+            and two_hours.unit == "ns"
+            and box_with_array is not pd.array
+        ):
+            # The EA op has to be _actually_ inplace so does not cast to a
+            #  new dtype.  For the others, the op can assign a new array
+            #  and get the dtype that normally results from `rng + two_hours`
+            expected = expected.as_unit("ns")
+
+        rng = tm.box_expected(rng, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        orig_rng = rng
+        rng += two_hours
+        tm.assert_equal(rng, expected)
+        if box_with_array is not Index:
+            # Check that operation is actually inplace
+            tm.assert_equal(orig_rng, expected)
+
+    def test_tdi_isub_timedeltalike(self, two_hours, box_with_array):
+        # only test adding/sub offsets as - is now numeric
+        rng = timedelta_range("1 days", "10 days")
+        expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00")
+        if (
+            isinstance(two_hours, Timedelta)
+            and two_hours.unit == "ns"
+            and box_with_array is not pd.array
+        ):
+            # The EA op has to be _actually_ inplace so does not cast to a
+            #  new dtype.  For the others, the op can assign a new array
+            #  and get the dtype that normally results from `rng - two_hours`
+            expected = expected.as_unit("ns")
+
+        rng = tm.box_expected(rng, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        orig_rng = rng
+        rng -= two_hours
+        tm.assert_equal(rng, expected)
+        if box_with_array is not Index:
+            # Check that operation is actually inplace
+            tm.assert_equal(orig_rng, expected)
+
+    # -------------------------------------------------------------
+
+    def test_tdi_ops_attributes(self):
+        rng = timedelta_range("2 days", periods=5, freq="2D", name="x")
+
+        result = rng + 1 * rng.freq
+        exp = timedelta_range("4 days", periods=5, freq="2D", name="x")
+        tm.assert_index_equal(result, exp)
+        assert result.freq == "2D"
+
+        result = rng - 2 * rng.freq
+        exp = timedelta_range("-2 days", periods=5, freq="2D", name="x")
+        tm.assert_index_equal(result, exp)
+        assert result.freq == "2D"
+
+        result = rng * 2
+        exp = timedelta_range("4 days", periods=5, freq="4D", name="x")
+        tm.assert_index_equal(result, exp)
+        assert result.freq == "4D"
+
+        result = rng / 2
+        exp = timedelta_range("1 days", periods=5, freq="D", name="x")
+        tm.assert_index_equal(result, exp)
+        assert result.freq == "D"
+
+        result = -rng
+        exp = timedelta_range("-2 days", periods=5, freq="-2D", name="x")
+        tm.assert_index_equal(result, exp)
+        assert result.freq == "-2D"
+
+        rng = timedelta_range("-2 days", periods=5, freq="D", name="x")
+
+        result = abs(rng)
+        exp = TimedeltaIndex(
+            ["2 days", "1 days", "0 days", "1 days", "2 days"], name="x"
+        )
+        tm.assert_index_equal(result, exp)
+        assert result.freq is None
+
+
+class TestAddSubNaTMasking:
+    # TODO: parametrize over boxes
+
+    @pytest.mark.parametrize("str_ts", ["1950-01-01", "1980-01-01"])
+    def test_tdarr_add_timestamp_nat_masking(self, box_with_array, str_ts):
+        # GH#17991 checking for overflow-masking with NaT
+        tdinat = pd.to_timedelta(["24658 days 11:15:00", "NaT"])
+        tdobj = tm.box_expected(tdinat, box_with_array)
+
+        ts = Timestamp(str_ts)
+        ts_variants = [
+            ts,
+            ts.to_pydatetime(),
+            ts.to_datetime64().astype("datetime64[ns]"),
+            ts.to_datetime64().astype("datetime64[D]"),
+        ]
+
+        for variant in ts_variants:
+            res = tdobj + variant
+            if box_with_array is DataFrame:
+                assert res.iloc[1, 1] is NaT
+            else:
+                assert res[1] is NaT
+
+    def test_tdi_add_overflow(self):
+        # See GH#14068
+        # preliminary test scalar analogue of vectorized tests below
+        # TODO: Make raised error message more informative and test
+        ts = Timestamp("2000").as_unit("ns")
+        with pytest.raises(OutOfBoundsDatetime, match="10155196800000000000"):
+            pd.to_timedelta(106580, "D") + ts
+        with pytest.raises(OutOfBoundsDatetime, match="10155196800000000000"):
+            ts + pd.to_timedelta(106580, "D")
+
+        _NaT = NaT._value + 1
+        td = pd.to_timedelta([106580], "D").as_unit("ns")
+        msg = "Overflow in int64 addition"
+        with pytest.raises(OverflowError, match=msg):
+            td + Timestamp("2000")
+        with pytest.raises(OverflowError, match=msg):
+            Timestamp("2000") + td
+        with pytest.raises(OverflowError, match=msg):
+            pd.to_timedelta([_NaT]) - Timedelta("1 days")
+        with pytest.raises(OverflowError, match=msg):
+            pd.to_timedelta(["5 days", _NaT]) - Timedelta("1 days")
+        with pytest.raises(OverflowError, match=msg):
+            (
+                pd.to_timedelta([_NaT, "5 days", "1 hours"])
+                - pd.to_timedelta(["7 seconds", _NaT, "4 hours"])
+            )
+
+        # These should not overflow!
+        exp = TimedeltaIndex([NaT], dtype="m8[us]")
+        result = pd.to_timedelta([NaT]) - Timedelta("1 days")
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex(["4 days", NaT])
+        result = pd.to_timedelta(["5 days", NaT]) - Timedelta("1 days")
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex([NaT, NaT, "5 hours"])
+        result = pd.to_timedelta([NaT, "5 days", "1 hours"]) + pd.to_timedelta(
+            ["7 seconds", NaT, "4 hours"]
+        )
+        tm.assert_index_equal(result, exp)
+
+
+class TestTimedeltaArraylikeAddSubOps:
+    # Tests for timedelta64[ns] __add__, __sub__, __radd__, __rsub__
+
+    def test_sub_nat_retain_unit(self):
+        ser = pd.to_timedelta(Series(["00:00:01"])).astype("m8[s]")
+
+        result = ser - NaT
+        expected = Series([NaT], dtype="m8[s]")
+        tm.assert_series_equal(result, expected)
+
+    # TODO: moved from tests.indexes.timedeltas.test_arithmetic; needs
+    #  parametrization+de-duplication
+    def test_timedelta_ops_with_missing_values(self):
+        # setup
+        s1 = pd.to_timedelta(Series(["00:00:01"]))
+        s2 = pd.to_timedelta(Series(["00:00:02"]))
+
+        sn = pd.to_timedelta(Series([NaT], dtype="m8[us]"))
+
+        df1 = DataFrame(["00:00:01"]).apply(pd.to_timedelta)
+        df2 = DataFrame(["00:00:02"]).apply(pd.to_timedelta)
+
+        dfn = DataFrame([NaT._value]).apply(pd.to_timedelta).astype("m8[us]")
+
+        scalar1 = pd.to_timedelta("00:00:01")
+        scalar2 = pd.to_timedelta("00:00:02")
+        timedelta_NaT = pd.to_timedelta("NaT")
+
+        actual = scalar1 + scalar1
+        assert actual == scalar2
+        actual = scalar2 - scalar1
+        assert actual == scalar1
+
+        actual = s1 + s1
+        tm.assert_series_equal(actual, s2)
+        actual = s2 - s1
+        tm.assert_series_equal(actual, s1)
+
+        actual = s1 + scalar1
+        tm.assert_series_equal(actual, s2)
+        actual = scalar1 + s1
+        tm.assert_series_equal(actual, s2)
+        actual = s2 - scalar1
+        tm.assert_series_equal(actual, s1)
+        actual = -scalar1 + s2
+        tm.assert_series_equal(actual, s1)
+
+        actual = s1 + timedelta_NaT
+        tm.assert_series_equal(actual, sn)
+        actual = timedelta_NaT + s1
+        tm.assert_series_equal(actual, sn)
+        actual = s1 - timedelta_NaT
+        tm.assert_series_equal(actual, sn)
+        actual = -timedelta_NaT + s1
+        tm.assert_series_equal(actual, sn)
+
+        msg = "unsupported operand type"
+        with pytest.raises(TypeError, match=msg):
+            s1 + np.nan
+        with pytest.raises(TypeError, match=msg):
+            np.nan + s1
+        with pytest.raises(TypeError, match=msg):
+            s1 - np.nan
+        with pytest.raises(TypeError, match=msg):
+            -np.nan + s1
+
+        actual = s1 + NaT
+        tm.assert_series_equal(actual, sn)
+        actual = s2 - NaT
+        tm.assert_series_equal(actual, sn)
+
+        actual = s1 + df1
+        tm.assert_frame_equal(actual, df2)
+        actual = s2 - df1
+        tm.assert_frame_equal(actual, df1)
+        actual = df1 + s1
+        tm.assert_frame_equal(actual, df2)
+        actual = df2 - s1
+        tm.assert_frame_equal(actual, df1)
+
+        actual = df1 + df1
+        tm.assert_frame_equal(actual, df2)
+        actual = df2 - df1
+        tm.assert_frame_equal(actual, df1)
+
+        actual = df1 + scalar1
+        tm.assert_frame_equal(actual, df2)
+        actual = df2 - scalar1
+        tm.assert_frame_equal(actual, df1)
+
+        actual = df1 + timedelta_NaT
+        tm.assert_frame_equal(actual, dfn)
+        actual = df1 - timedelta_NaT
+        tm.assert_frame_equal(actual, dfn)
+
+        msg = "cannot subtract a datelike from|unsupported operand type"
+        with pytest.raises(TypeError, match=msg):
+            df1 + np.nan
+        with pytest.raises(TypeError, match=msg):
+            df1 - np.nan
+
+        actual = df1 + NaT  # NaT is datetime, not timedelta
+        tm.assert_frame_equal(actual, dfn)
+        actual = df1 - NaT
+        tm.assert_frame_equal(actual, dfn)
+
+    # TODO: moved from tests.series.test_operators, needs splitting, cleanup,
+    # de-duplication, box-parametrization...
+    def test_operators_timedelta64(self):
+        # series ops
+        v1 = pd.date_range("2012-1-1", periods=3, freq="D", unit="ns")
+        v2 = pd.date_range("2012-1-2", periods=3, freq="D", unit="ns")
+        rs = Series(v2) - Series(v1)
+        xp = Series(1e9 * 3600 * 24, rs.index).astype("int64").astype("timedelta64[ns]")
+        tm.assert_series_equal(rs, xp)
+        assert rs.dtype == "timedelta64[ns]"
+
+        df = DataFrame({"A": v1})
+        td = Series([timedelta(days=i) for i in range(3)], dtype="m8[ns]")
+        assert td.dtype == "timedelta64[ns]"
+
+        # series on the rhs
+        result = df["A"] - df["A"].shift()
+        assert result.dtype == "timedelta64[ns]"
+
+        result = df["A"] + td
+        assert result.dtype == "M8[ns]"
+
+        # scalar Timestamp on rhs
+        maxa = df["A"].max()
+        assert isinstance(maxa, Timestamp)
+
+        resultb = df["A"] - df["A"].max()
+        assert resultb.dtype == "timedelta64[ns]"
+
+        # timestamp on lhs
+        result = resultb + df["A"]
+        values = [Timestamp("20111230"), Timestamp("20120101"), Timestamp("20120103")]
+        expected = Series(values, dtype="M8[ns]", name="A")
+        tm.assert_series_equal(result, expected)
+
+        # datetimes on rhs
+        result = df["A"] - datetime(2001, 1, 1)
+        expected = Series(
+            [timedelta(days=4017 + i) for i in range(3)], name="A", dtype="m8[ns]"
+        )
+        tm.assert_series_equal(result, expected)
+        assert result.dtype == "m8[ns]"
+
+        d = datetime(2001, 1, 1, 3, 4)
+        resulta = df["A"] - d
+        assert resulta.dtype == "m8[ns]"
+
+        # roundtrip
+        resultb = resulta + d
+        tm.assert_series_equal(df["A"], resultb)
+
+        # timedeltas on rhs
+        td = timedelta(days=1)
+        resulta = df["A"] + td
+        resultb = resulta - td
+        tm.assert_series_equal(resultb, df["A"])
+        assert resultb.dtype == "M8[ns]"
+
+        # roundtrip
+        td = timedelta(minutes=5, seconds=3)
+        resulta = df["A"] + td
+        resultb = resulta - td
+        tm.assert_series_equal(df["A"], resultb)
+        assert resultb.dtype == "M8[ns]"
+
+        # inplace
+        value = rs[2] + np.timedelta64(timedelta(minutes=5, seconds=1))
+        rs[2] += np.timedelta64(timedelta(minutes=5, seconds=1))
+        assert rs[2] == value
+
+    def test_timedelta64_ops_nat(self):
+        # GH 11349
+        timedelta_series = Series([NaT, Timedelta("1s")])
+        nat_series_dtype_timedelta = Series([NaT, NaT], dtype="timedelta64[us]")
+        single_nat_dtype_timedelta = Series([NaT], dtype="timedelta64[us]")
+
+        # subtraction
+        tm.assert_series_equal(timedelta_series - NaT, nat_series_dtype_timedelta)
+        tm.assert_series_equal(-NaT + timedelta_series, nat_series_dtype_timedelta)
+
+        tm.assert_series_equal(
+            timedelta_series - single_nat_dtype_timedelta, nat_series_dtype_timedelta
+        )
+        tm.assert_series_equal(
+            -single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta
+        )
+
+        # addition
+        tm.assert_series_equal(
+            nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta
+        )
+        tm.assert_series_equal(
+            NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timedelta + single_nat_dtype_timedelta,
+            nat_series_dtype_timedelta,
+        )
+        tm.assert_series_equal(
+            single_nat_dtype_timedelta + nat_series_dtype_timedelta,
+            nat_series_dtype_timedelta,
+        )
+
+        tm.assert_series_equal(timedelta_series + NaT, nat_series_dtype_timedelta)
+        tm.assert_series_equal(NaT + timedelta_series, nat_series_dtype_timedelta)
+
+        tm.assert_series_equal(
+            timedelta_series + single_nat_dtype_timedelta, nat_series_dtype_timedelta
+        )
+        tm.assert_series_equal(
+            single_nat_dtype_timedelta + timedelta_series, nat_series_dtype_timedelta
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timedelta + NaT, nat_series_dtype_timedelta
+        )
+        tm.assert_series_equal(
+            NaT + nat_series_dtype_timedelta, nat_series_dtype_timedelta
+        )
+
+        tm.assert_series_equal(
+            nat_series_dtype_timedelta + single_nat_dtype_timedelta,
+            nat_series_dtype_timedelta,
+        )
+        tm.assert_series_equal(
+            single_nat_dtype_timedelta + nat_series_dtype_timedelta,
+            nat_series_dtype_timedelta,
+        )
+
+        # multiplication
+        tm.assert_series_equal(
+            nat_series_dtype_timedelta * 1.0, nat_series_dtype_timedelta
+        )
+        tm.assert_series_equal(
+            1.0 * nat_series_dtype_timedelta, nat_series_dtype_timedelta
+        )
+
+        tm.assert_series_equal(timedelta_series * 1, timedelta_series)
+        tm.assert_series_equal(1 * timedelta_series, timedelta_series)
+
+        tm.assert_series_equal(timedelta_series * 1.5, Series([NaT, Timedelta("1.5s")]))
+        tm.assert_series_equal(1.5 * timedelta_series, Series([NaT, Timedelta("1.5s")]))
+
+        tm.assert_series_equal(timedelta_series * np.nan, nat_series_dtype_timedelta)
+        tm.assert_series_equal(np.nan * timedelta_series, nat_series_dtype_timedelta)
+
+        # division
+        tm.assert_series_equal(timedelta_series / 2, Series([NaT, Timedelta("0.5s")]))
+        tm.assert_series_equal(timedelta_series / 2.0, Series([NaT, Timedelta("0.5s")]))
+        tm.assert_series_equal(timedelta_series / np.nan, nat_series_dtype_timedelta)
+
+    # -------------------------------------------------------------
+    # Binary operations td64 arraylike and datetime-like
+
+    @pytest.mark.parametrize("cls", [Timestamp, datetime, np.datetime64])
+    def test_td64arr_add_sub_datetimelike_scalar(
+        self, cls, box_with_array, tz_naive_fixture
+    ):
+        # GH#11925, GH#29558, GH#23215
+        tz = tz_naive_fixture
+
+        dt_scalar = Timestamp("2012-01-01", tz=tz)
+        if cls is datetime:
+            ts = dt_scalar.to_pydatetime()
+        elif cls is np.datetime64:
+            if tz_naive_fixture is not None:
+                pytest.skip(f"{cls} doesn't support {tz_naive_fixture}")
+            ts = dt_scalar.to_datetime64()
+        else:
+            ts = dt_scalar
+
+        tdi = timedelta_range("1 day", periods=3)
+        expected = pd.date_range("2012-01-02", periods=3, tz=tz)
+        if tz is not None and not timezones.is_utc(expected.tz):
+            # Day is no longer preserved by timedelta add/sub in pandas3 because
+            #  it represents Calendar-Day instead of 24h
+            expected = expected._with_freq(None)
+
+        tdarr = tm.box_expected(tdi, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        tm.assert_equal(ts + tdarr, expected)
+        tm.assert_equal(tdarr + ts, expected)
+
+        expected2 = pd.date_range("2011-12-31", periods=3, freq="-1D", tz=tz)
+        if tz is not None and not timezones.is_utc(expected2.tz):
+            # Day is no longer preserved by timedelta add/sub in pandas3 because
+            #  it represents Calendar-Day instead of 24h
+            expected2 = expected2._with_freq(None)
+        expected2 = tm.box_expected(expected2, box_with_array)
+
+        tm.assert_equal(ts - tdarr, expected2)
+        tm.assert_equal(ts + (-tdarr), expected2)
+
+        msg = "cannot subtract a datelike"
+        with pytest.raises(TypeError, match=msg):
+            tdarr - ts
+
+    def test_td64arr_add_datetime64_nat(self, box_with_array):
+        # GH#23215
+        other = np.datetime64("NaT")
+
+        tdi = timedelta_range("1 day", periods=3)
+        expected = DatetimeIndex(["NaT", "NaT", "NaT"], dtype="M8[us]")
+
+        tdser = tm.box_expected(tdi, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        tm.assert_equal(tdser + other, expected)
+        tm.assert_equal(other + tdser, expected)
+
+    def test_td64arr_sub_dt64_array(self, box_with_array):
+        dti = pd.date_range("2016-01-01", periods=3)
+        tdi = TimedeltaIndex(["-1 Day"] * 3)
+        dtarr = dti.values
+        expected = DatetimeIndex(dtarr) - tdi
+
+        tdi = tm.box_expected(tdi, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        msg = "cannot subtract a datelike from"
+        with pytest.raises(TypeError, match=msg):
+            tdi - dtarr
+
+        # TimedeltaIndex.__rsub__
+        result = dtarr - tdi
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_add_dt64_array(self, box_with_array):
+        dti = pd.date_range("2016-01-01", periods=3)
+        tdi = TimedeltaIndex(["-1 Day"] * 3)
+        dtarr = dti.values
+        expected = DatetimeIndex(dtarr) + tdi
+
+        tdi = tm.box_expected(tdi, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = tdi + dtarr
+        tm.assert_equal(result, expected)
+        result = dtarr + tdi
+        tm.assert_equal(result, expected)
+
+    # ------------------------------------------------------------------
+    # Invalid __add__/__sub__ operations
+
+    @pytest.mark.parametrize("pi_freq", ["D", "W", "Q", "h"])
+    @pytest.mark.parametrize("tdi_freq", [None, "h"])
+    def test_td64arr_sub_periodlike(
+        self, box_with_array, box_with_array2, tdi_freq, pi_freq
+    ):
+        # GH#20049 subtracting PeriodIndex should raise TypeError
+        tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq)
+        dti = Timestamp("2018-03-07 17:16:40") + tdi
+        pi = dti.to_period(pi_freq)
+        per = pi[0]
+
+        tdi = tm.box_expected(tdi, box_with_array)
+        pi = tm.box_expected(pi, box_with_array2)
+        msg = "|".join(
+            [
+                "cannot subtract",
+                "unsupported operand type",
+                r"bad operand type for unary \-: 'PeriodArray'",
+                r"Input has different freq=-1h from PeriodArray\(.*\)",
+                "Cannot add/subtract timedelta-like from PeriodArray",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            tdi - pi
+
+        # GH#13078 subtraction of Period scalar not supported
+        with pytest.raises(TypeError, match=msg):
+            tdi - per
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            # GH#12624 for str case
+            "a",
+            # GH#19123
+            1,
+            1.5,
+            np.array(2),
+        ],
+    )
+    def test_td64arr_addsub_numeric_scalar_invalid(self, box_with_array, other):
+        # vector-like others are tested in test_td64arr_add_sub_numeric_arr_invalid
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        tdarr = tm.box_expected(tdser, box_with_array)
+
+        assert_invalid_addsub_type(tdarr, other)
+
+    @pytest.mark.parametrize(
+        "vec",
+        [
+            np.array([1, 2, 3]),
+            Index([1, 2, 3]),
+            Series([1, 2, 3]),
+            DataFrame([[1, 2, 3]]),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_td64arr_addsub_numeric_arr_invalid(
+        self, box_with_array, vec, any_real_numpy_dtype
+    ):
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        tdarr = tm.box_expected(tdser, box_with_array)
+
+        vector = vec.astype(any_real_numpy_dtype)
+        assert_invalid_addsub_type(tdarr, vector)
+
+    def test_td64arr_add_sub_int(self, box_with_array, one):
+        # Variants of `one` for #19012, deprecated GH#22535
+        rng = timedelta_range("1 days 09:00:00", freq="h", periods=10)
+        tdarr = tm.box_expected(rng, box_with_array)
+
+        msg = "Addition/subtraction of integers"
+        assert_invalid_addsub_type(tdarr, one, msg)
+
+        # TODO: get inplace ops into assert_invalid_addsub_type
+        with pytest.raises(TypeError, match=msg):
+            tdarr += one
+        with pytest.raises(TypeError, match=msg):
+            tdarr -= one
+
+    def test_td64arr_add_sub_integer_array(self, box_with_array):
+        # GH#19959, deprecated GH#22535
+        # GH#22696 for DataFrame case, check that we don't dispatch to numpy
+        #  implementation, which treats int64 as m8[ns]
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        rng = timedelta_range("1 days 09:00:00", freq="h", periods=3)
+        tdarr = tm.box_expected(rng, box)
+        other = tm.box_expected([4, 3, 2], xbox)
+
+        msg = "Addition/subtraction of integers and integer-arrays"
+        assert_invalid_addsub_type(tdarr, other, msg)
+
+    def test_td64arr_addsub_integer_array_no_freq(self, box_with_array):
+        # GH#19959
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"])
+        tdarr = tm.box_expected(tdi, box)
+        other = tm.box_expected([14, -1, 16], xbox)
+
+        msg = "Addition/subtraction of integers"
+        assert_invalid_addsub_type(tdarr, other, msg)
+
+    # ------------------------------------------------------------------
+    # Operations with timedelta-like others
+
+    def test_td64arr_add_sub_td64_array(self, box_with_array):
+        box = box_with_array
+        dti = pd.date_range("2016-01-01", periods=3)
+        tdi = dti - dti.shift(1)
+        tdarr = tdi.values
+
+        expected = 2 * tdi
+        tdi = tm.box_expected(tdi, box)
+        expected = tm.box_expected(expected, box)
+
+        result = tdi + tdarr
+        tm.assert_equal(result, expected)
+        result = tdarr + tdi
+        tm.assert_equal(result, expected)
+
+        expected_sub = 0 * tdi
+        result = tdi - tdarr
+        tm.assert_equal(result, expected_sub)
+        result = tdarr - tdi
+        tm.assert_equal(result, expected_sub)
+
+    def test_td64arr_add_sub_tdi(self, box_with_array, names):
+        # GH#17250 make sure result dtype is correct
+        # GH#19043 make sure names are propagated correctly
+        box = box_with_array
+        exname = get_expected_name(box, names)
+
+        tdi = TimedeltaIndex(["0 days", "1 day"], name=names[1])
+        tdi = np.array(tdi) if box in [tm.to_array, pd.array] else tdi
+        ser = Series(
+            [Timedelta(hours=3), Timedelta(hours=4)], name=names[0], dtype="m8[ns]"
+        )
+        expected = Series(
+            [Timedelta(hours=3), Timedelta(days=1, hours=4)],
+            name=exname,
+            dtype="m8[ns]",
+        )
+
+        ser = tm.box_expected(ser, box)
+        expected = tm.box_expected(expected, box)
+
+        result = tdi + ser
+        tm.assert_equal(result, expected)
+        assert_dtype(result, "timedelta64[ns]")
+
+        result = ser + tdi
+        tm.assert_equal(result, expected)
+        assert_dtype(result, "timedelta64[ns]")
+
+        expected = Series(
+            [Timedelta(hours=-3), Timedelta(days=1, hours=-4)],
+            name=exname,
+            dtype="m8[ns]",
+        )
+        expected = tm.box_expected(expected, box)
+
+        result = tdi - ser
+        tm.assert_equal(result, expected)
+        assert_dtype(result, "timedelta64[ns]")
+
+        result = ser - tdi
+        tm.assert_equal(result, -expected)
+        assert_dtype(result, "timedelta64[ns]")
+
+    @pytest.mark.parametrize("tdnat", [np.timedelta64("NaT"), NaT])
+    def test_td64arr_add_sub_td64_nat(self, box_with_array, tdnat):
+        # GH#18808, GH#23320 special handling for timedelta64("NaT")
+        box = box_with_array
+        tdi = TimedeltaIndex([NaT, Timedelta("1s")])
+        expected = TimedeltaIndex(["NaT"] * 2).as_unit("us")
+
+        obj = tm.box_expected(tdi, box)
+        expected = tm.box_expected(expected, box)
+
+        result = obj + tdnat
+        tm.assert_equal(result, expected)
+        result = tdnat + obj
+        tm.assert_equal(result, expected)
+        result = obj - tdnat
+        tm.assert_equal(result, expected)
+        result = tdnat - obj
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_add_timedeltalike(self, two_hours, box_with_array):
+        # only test adding/sub offsets as + is now numeric
+        # GH#10699 for Tick cases
+        box = box_with_array
+        rng = timedelta_range("1 days", "10 days")
+        expected = timedelta_range("1 days 02:00:00", "10 days 02:00:00", freq="D")
+        if isinstance(two_hours, Timedelta) and two_hours.unit == "ns":
+            expected = expected.as_unit("ns")
+
+        rng = tm.box_expected(rng, box)
+        expected = tm.box_expected(expected, box)
+
+        result = rng + two_hours
+        tm.assert_equal(result, expected)
+
+        result = two_hours + rng
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_sub_timedeltalike(self, two_hours, box_with_array):
+        # only test adding/sub offsets as - is now numeric
+        # GH#10699 for Tick cases
+        box = box_with_array
+        rng = timedelta_range("1 days", "10 days")
+        expected = timedelta_range("0 days 22:00:00", "9 days 22:00:00")
+        if isinstance(two_hours, Timedelta) and two_hours.unit == "ns":
+            expected = expected.as_unit("ns")
+
+        rng = tm.box_expected(rng, box)
+        expected = tm.box_expected(expected, box)
+
+        result = rng - two_hours
+        tm.assert_equal(result, expected)
+
+        result = two_hours - rng
+        tm.assert_equal(result, -expected)
+
+    # ------------------------------------------------------------------
+    # __add__/__sub__ with DateOffsets and arrays of DateOffsets
+
+    def test_td64arr_add_sub_offset_index(
+        self, performance_warning, names, box_with_array
+    ):
+        # GH#18849, GH#19744
+        box = box_with_array
+        exname = get_expected_name(box, names)
+
+        tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0])
+        other = Index([offsets.Hour(n=1), offsets.Minute(n=-2)], name=names[1])
+        other = np.array(other) if box in [tm.to_array, pd.array] else other
+
+        expected = TimedeltaIndex(
+            [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=exname
+        )
+        expected_sub = TimedeltaIndex(
+            [tdi[n] - other[n] for n in range(len(tdi))], freq="infer", name=exname
+        )
+
+        tdi = tm.box_expected(tdi, box)
+        expected = tm.box_expected(expected, box).astype(object)
+        expected_sub = tm.box_expected(expected_sub, box).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = tdi + other
+        tm.assert_equal(res, expected)
+
+        with tm.assert_produces_warning(performance_warning):
+            res2 = other + tdi
+        tm.assert_equal(res2, expected)
+
+        with tm.assert_produces_warning(performance_warning):
+            res_sub = tdi - other
+        tm.assert_equal(res_sub, expected_sub)
+
+    def test_td64arr_add_sub_offset_array(self, performance_warning, box_with_array):
+        # GH#18849, GH#18824
+        box = box_with_array
+        tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"])
+        other = np.array([offsets.Hour(n=1), offsets.Minute(n=-2)])
+
+        expected = TimedeltaIndex(
+            [tdi[n] + other[n] for n in range(len(tdi))], freq="infer"
+        )
+        expected_sub = TimedeltaIndex(
+            [tdi[n] - other[n] for n in range(len(tdi))], freq="infer"
+        )
+
+        tdi = tm.box_expected(tdi, box)
+        expected = tm.box_expected(expected, box).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = tdi + other
+        tm.assert_equal(res, expected)
+
+        with tm.assert_produces_warning(performance_warning):
+            res2 = other + tdi
+        tm.assert_equal(res2, expected)
+
+        expected_sub = tm.box_expected(expected_sub, box_with_array).astype(object)
+        with tm.assert_produces_warning(performance_warning):
+            res_sub = tdi - other
+        tm.assert_equal(res_sub, expected_sub)
+
+    def test_td64arr_with_offset_series(
+        self, performance_warning, names, box_with_array
+    ):
+        # GH#18849
+        box = box_with_array
+        box2 = Series if box in [Index, tm.to_array, pd.array] else box
+        exname = get_expected_name(box, names)
+
+        tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0])
+        other = Series([offsets.Hour(n=1), offsets.Minute(n=-2)], name=names[1])
+
+        expected_add = Series(
+            [tdi[n] + other[n] for n in range(len(tdi))], name=exname, dtype=object
+        )
+        obj = tm.box_expected(tdi, box)
+        expected_add = tm.box_expected(expected_add, box2).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res = obj + other
+        tm.assert_equal(res, expected_add)
+
+        with tm.assert_produces_warning(performance_warning):
+            res2 = other + obj
+        tm.assert_equal(res2, expected_add)
+
+        expected_sub = Series(
+            [tdi[n] - other[n] for n in range(len(tdi))], name=exname, dtype=object
+        )
+        expected_sub = tm.box_expected(expected_sub, box2).astype(object)
+
+        with tm.assert_produces_warning(performance_warning):
+            res3 = obj - other
+        tm.assert_equal(res3, expected_sub)
+
+    @pytest.mark.parametrize("obox", [np.array, Index, Series])
+    def test_td64arr_addsub_anchored_offset_arraylike(
+        self, performance_warning, obox, box_with_array
+    ):
+        # GH#18824
+        tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"])
+        tdi = tm.box_expected(tdi, box_with_array)
+
+        anchored = obox([offsets.MonthEnd(), offsets.Day(n=2)])
+
+        # addition/subtraction ops with anchored offsets should issue
+        # a PerformanceWarning and _then_ raise a TypeError.
+        msg = "has incorrect type|cannot add the type MonthEnd"
+        with pytest.raises(TypeError, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                tdi + anchored
+        with pytest.raises(TypeError, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                anchored + tdi
+        with pytest.raises(TypeError, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                tdi - anchored
+        with pytest.raises(TypeError, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                anchored - tdi
+
+    # ------------------------------------------------------------------
+    # Unsorted
+
+    def test_td64arr_add_sub_object_array(self, performance_warning, box_with_array):
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        tdi = timedelta_range("1 day", periods=3, freq="D")
+        tdarr = tm.box_expected(tdi, box)
+
+        other = np.array([Timedelta(days=1), offsets.Day(2), Timestamp("2000-01-04")])
+
+        with tm.assert_produces_warning(performance_warning):
+            result = tdarr + other
+
+        expected = Index(
+            [Timedelta(days=2), Timedelta(days=4), Timestamp("2000-01-07")]
+        )
+        expected = tm.box_expected(expected, xbox).astype(object)
+        tm.assert_equal(result, expected)
+
+        msg = "unsupported operand type|cannot subtract a datelike"
+        with pytest.raises(TypeError, match=msg):
+            with tm.assert_produces_warning(performance_warning):
+                tdarr - other
+
+        with tm.assert_produces_warning(performance_warning):
+            result = other - tdarr
+
+        expected = Index([Timedelta(0), Timedelta(0), Timestamp("2000-01-01")])
+        expected = tm.box_expected(expected, xbox).astype(object)
+        tm.assert_equal(result, expected)
+
+
+class TestTimedeltaArraylikeMulDivOps:
+    # Tests for timedelta64[ns]
+    # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__
+
+    # ------------------------------------------------------------------
+    # Multiplication
+    # organized with scalar others first, then array-like
+
+    def test_td64arr_mul_int(self, box_with_array):
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        idx = tm.box_expected(idx, box_with_array)
+
+        result = idx * 1
+        tm.assert_equal(result, idx)
+
+        result = 1 * idx
+        tm.assert_equal(result, idx)
+
+    def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array):
+        rng = timedelta_range("1 days", "10 days", name="foo")
+        rng = tm.box_expected(rng, box_with_array)
+        msg = "|".join(
+            [
+                "argument must be an integer",
+                "cannot use operands with types dtype",
+                "Cannot multiply with",
+                r"unsupported operand type\(s\) for \*",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            rng * two_hours
+
+    def test_tdi_mul_int_array_zerodim(self, box_with_array):
+        rng5 = np.arange(5, dtype="int64")
+        idx = TimedeltaIndex(rng5)
+        expected = TimedeltaIndex(rng5 * 5)
+
+        idx = tm.box_expected(idx, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = idx * np.array(5, dtype="int64")
+        tm.assert_equal(result, expected)
+
+    def test_tdi_mul_int_array(self, box_with_array):
+        rng5 = np.arange(5, dtype="int64")
+        idx = TimedeltaIndex(rng5)
+        expected = TimedeltaIndex(rng5**2)
+
+        idx = tm.box_expected(idx, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = idx * rng5
+        tm.assert_equal(result, expected)
+
+    def test_tdi_mul_int_series(self, box_with_array):
+        box = box_with_array
+        xbox = Series if box in [Index, tm.to_array, pd.array] else box
+
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        expected = TimedeltaIndex(np.arange(5, dtype="int64") ** 2)
+
+        idx = tm.box_expected(idx, box)
+        expected = tm.box_expected(expected, xbox)
+
+        result = idx * Series(np.arange(5, dtype="int64"))
+        tm.assert_equal(result, expected)
+
+    def test_tdi_mul_float_series(self, box_with_array):
+        box = box_with_array
+        xbox = Series if box in [Index, tm.to_array, pd.array] else box
+
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        idx = tm.box_expected(idx, box)
+
+        rng5f = np.arange(5, dtype="float64")
+        expected = TimedeltaIndex(rng5f * (rng5f + 1.0))
+        expected = tm.box_expected(expected, xbox)
+
+        result = idx * Series(rng5f + 1.0)
+        tm.assert_equal(result, expected)
+
+    # TODO: Put Series/DataFrame in others?
+    @pytest.mark.parametrize(
+        "other",
+        [
+            np.arange(1, 11),
+            Index(np.arange(1, 11), np.int64),
+            Index(range(1, 11), np.uint64),
+            Index(range(1, 11), np.float64),
+            pd.RangeIndex(1, 11),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_tdi_rmul_arraylike(self, other, box_with_array):
+        box = box_with_array
+
+        tdi = TimedeltaIndex(["1 Day"] * 10)
+        expected = timedelta_range("1 days", "10 days")._with_freq(None)
+
+        tdi = tm.box_expected(tdi, box)
+        xbox = get_upcast_box(tdi, other)
+
+        expected = tm.box_expected(expected, xbox)
+
+        result = other * tdi
+        tm.assert_equal(result, expected)
+        commute = tdi * other
+        tm.assert_equal(commute, expected)
+
+    def test_td64arr_mul_bool_scalar_raises(self, box_with_array):
+        # GH#58054
+        ser = Series(np.arange(5) * timedelta(hours=1), dtype="m8[ns]")
+        obj = tm.box_expected(ser, box_with_array)
+
+        msg = r"Cannot multiply 'timedelta64\[ns\]' by bool"
+        with pytest.raises(TypeError, match=msg):
+            True * obj
+        with pytest.raises(TypeError, match=msg):
+            obj * True
+        with pytest.raises(TypeError, match=msg):
+            np.True_ * obj
+        with pytest.raises(TypeError, match=msg):
+            obj * np.True_
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            bool,
+            "boolean",
+            pytest.param("bool[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        ],
+    )
+    def test_td64arr_mul_bool_raises(self, dtype, box_with_array):
+        # GH#58054
+        ser = Series(np.arange(5) * timedelta(hours=1), dtype="m8[ns]")
+        obj = tm.box_expected(ser, box_with_array)
+
+        other = Series(np.arange(5) < 0.5, dtype=dtype)
+        other = tm.box_expected(other, box_with_array)
+
+        msg = r"Cannot multiply 'timedelta64\[ns\]' by bool"
+        with pytest.raises(TypeError, match=msg):
+            obj * other
+
+        msg2 = msg.replace("rmul", "mul")
+        if dtype == "bool[pyarrow]":
+            # We go through ArrowEA.__mul__ which gives a different message
+            msg2 = (
+                r"operation 'mul' not supported for dtype 'bool\[pyarrow\]' "
+                r"with dtype 'timedelta64\[ns\]'"
+            )
+        with pytest.raises(TypeError, match=msg2):
+            other * obj
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "Int64",
+            "Float64",
+            pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        ],
+    )
+    def test_td64arr_mul_masked(self, dtype, box_with_array):
+        ser = Series(np.arange(5) * timedelta(hours=1), dtype="m8[ns]")
+        obj = tm.box_expected(ser, box_with_array)
+
+        other = Series(np.arange(5), dtype=dtype)
+        other = tm.box_expected(other, box_with_array)
+
+        expected = Series([Timedelta(hours=n**2) for n in range(5)], dtype="m8[ns]")
+        expected = tm.box_expected(expected, box_with_array)
+        if dtype == "int64[pyarrow]":
+            expected = expected.astype("duration[ns][pyarrow]")
+
+        result = obj * other
+        tm.assert_equal(result, expected)
+        result = other * obj
+        tm.assert_equal(result, expected)
+
+    # ------------------------------------------------------------------
+    # __div__, __rdiv__
+
+    def test_td64arr_div_nat_invalid(self, box_with_array):
+        # don't allow division by NaT (maybe could in the future)
+        rng = timedelta_range("1 days", "10 days", name="foo")
+        rng = tm.box_expected(rng, box_with_array)
+
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            rng / NaT
+        with pytest.raises(TypeError, match="Cannot divide NaTType by"):
+            NaT / rng
+
+        dt64nat = np.datetime64("NaT", "ns")
+        msg = "|".join(
+            [
+                # 'divide' on npdev as of 2021-12-18
+                "ufunc '(true_divide|divide)' cannot use operands",
+                "cannot perform __r?truediv__",
+                "Cannot divide datetime64 by TimedeltaArray",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            rng / dt64nat
+        with pytest.raises(TypeError, match=msg):
+            dt64nat / rng
+
+    def test_td64arr_div_td64nat(self, box_with_array):
+        # GH#23829
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        rng = timedelta_range("1 days", "10 days")
+        rng = tm.box_expected(rng, box)
+
+        other = np.timedelta64("NaT")
+
+        expected = np.array([np.nan] * 10)
+        expected = tm.box_expected(expected, xbox)
+
+        result = rng / other
+        tm.assert_equal(result, expected)
+
+        result = other / rng
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_div_int(self, box_with_array):
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        idx = tm.box_expected(idx, box_with_array)
+
+        result = idx / 1
+        tm.assert_equal(result, idx)
+
+        with pytest.raises(TypeError, match="Cannot divide"):
+            # GH#23829
+            1 / idx
+
+    def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array):
+        # GH#20088, GH#22163 ensure DataFrame returns correct dtype
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        rng = timedelta_range("1 days", "10 days", name="foo")
+        expected = Index((np.arange(10) + 1) * 12, dtype=np.float64, name="foo")
+
+        rng = tm.box_expected(rng, box)
+        expected = tm.box_expected(expected, xbox)
+
+        result = rng / two_hours
+        tm.assert_equal(result, expected)
+
+        result = two_hours / rng
+        expected = 1 / expected
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("m", [1, 3, 10])
+    @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])
+    def test_td64arr_div_td64_scalar(self, m, unit, box_with_array):
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        ser = Series([Timedelta(days=59)] * 3)
+        ser[2] = np.nan
+        flat = ser
+        ser = tm.box_expected(ser, box)
+
+        # op
+        expected = Series([x / np.timedelta64(m, unit) for x in flat])
+        expected = tm.box_expected(expected, xbox)
+        result = ser / np.timedelta64(m, unit)
+        tm.assert_equal(result, expected)
+
+        # reverse op
+        expected = Series([Timedelta(np.timedelta64(m, unit)) / x for x in flat])
+        expected = tm.box_expected(expected, xbox)
+        result = np.timedelta64(m, unit) / ser
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array):
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        rng = TimedeltaIndex(["1 days", NaT, "2 days"], name="foo")
+        expected = Index([12, np.nan, 24], dtype=np.float64, name="foo")
+
+        rng = tm.box_expected(rng, box)
+        expected = tm.box_expected(expected, xbox)
+
+        result = rng / two_hours
+        tm.assert_equal(result, expected)
+
+        result = two_hours / rng
+        expected = 1 / expected
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_div_td64_ndarray(self, box_with_array):
+        # GH#22631
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        rng = TimedeltaIndex(["1 days", NaT, "2 days"])
+        expected = Index([12, np.nan, 24], dtype=np.float64)
+
+        rng = tm.box_expected(rng, box)
+        expected = tm.box_expected(expected, xbox)
+
+        other = np.array([2, 4, 2], dtype="m8[h]")
+        result = rng / other
+        tm.assert_equal(result, expected)
+
+        result = rng / tm.box_expected(other, box)
+        tm.assert_equal(result, expected)
+
+        result = rng / other.astype(object)
+        tm.assert_equal(result, expected.astype(object))
+
+        result = rng / list(other)
+        tm.assert_equal(result, expected)
+
+        # reversed op
+        expected = 1 / expected
+        result = other / rng
+        tm.assert_equal(result, expected)
+
+        result = tm.box_expected(other, box) / rng
+        tm.assert_equal(result, expected)
+
+        result = other.astype(object) / rng
+        tm.assert_equal(result, expected)
+
+        result = list(other) / rng
+        tm.assert_equal(result, expected)
+
+    def test_tdarr_div_length_mismatch(self, box_with_array):
+        rng = TimedeltaIndex(["1 days", NaT, "2 days"])
+        mismatched = [1, 2, 3, 4]
+
+        rng = tm.box_expected(rng, box_with_array)
+        msg = "Cannot divide vectors|Unable to coerce to Series"
+        for obj in [mismatched, mismatched[:2]]:
+            # one shorter, one longer
+            for other in [obj, np.array(obj), Index(obj)]:
+                with pytest.raises(ValueError, match=msg):
+                    rng / other
+                with pytest.raises(ValueError, match=msg):
+                    other / rng
+
+    def test_td64_div_object_mixed_result(self, box_with_array):
+        # Case where we having a NaT in the result inseat of timedelta64("NaT")
+        #  is misleading
+        orig = timedelta_range("1 Day", periods=3).insert(1, NaT)
+        tdi = tm.box_expected(orig, box_with_array, transpose=False)
+
+        other = np.array([orig[0], 1.5, 2.0, orig[2]], dtype=object)
+        other = tm.box_expected(other, box_with_array, transpose=False)
+
+        res = tdi / other
+
+        expected = Index([1.0, np.timedelta64("NaT", "us"), orig[0], 1.5], dtype=object)
+        expected = tm.box_expected(expected, box_with_array, transpose=False)
+        if isinstance(expected, NumpyExtensionArray):
+            expected = expected.to_numpy()
+        tm.assert_equal(res, expected)
+        if box_with_array is DataFrame:
+            # We have an np.timedelta64(NaT), not pd.NaT
+            assert isinstance(res.iloc[1, 0], np.timedelta64)
+
+        res = tdi // other
+
+        expected = Index([1, np.timedelta64("NaT", "us"), orig[0], 1], dtype=object)
+        expected = tm.box_expected(expected, box_with_array, transpose=False)
+        if isinstance(expected, NumpyExtensionArray):
+            expected = expected.to_numpy()
+        tm.assert_equal(res, expected)
+        if box_with_array is DataFrame:
+            # We have an np.timedelta64(NaT), not pd.NaT
+            assert isinstance(res.iloc[1, 0], np.timedelta64)
+
+    # ------------------------------------------------------------------
+    # __floordiv__, __rfloordiv__
+
+    @pytest.mark.skipif(WASM, reason="no fp exception support in wasm")
+    def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array):
+        # GH#35529
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+
+        left = Series([1000, 222330, 30], dtype="timedelta64[ns]")
+        right = Series([1000, 222330, None], dtype="timedelta64[ns]")
+
+        left = tm.box_expected(left, box)
+        right = tm.box_expected(right, box)
+
+        expected = np.array([1.0, 1.0, np.nan], dtype=np.float64)
+        expected = tm.box_expected(expected, xbox)
+
+        with tm.maybe_produces_warning(
+            RuntimeWarning, box is pd.array, check_stacklevel=False
+        ):
+            result = left // right
+
+        tm.assert_equal(result, expected)
+
+        # case that goes through __rfloordiv__ with arraylike
+        with tm.maybe_produces_warning(
+            RuntimeWarning, box is pd.array, check_stacklevel=False
+        ):
+            result = np.asarray(left) // right
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.filterwarnings("ignore:invalid value encountered:RuntimeWarning")
+    def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td):
+        # GH#18831, GH#19125
+        box = box_with_array
+        xbox = np.ndarray if box is pd.array else box
+        td = Timedelta("5m3s")  # i.e. (scalar_td - 1sec) / 2
+
+        td1 = Series([td, td, NaT], dtype="m8[ns]")
+        td1 = tm.box_expected(td1, box, transpose=False)
+
+        expected = Series([0, 0, np.nan])
+        expected = tm.box_expected(expected, xbox, transpose=False)
+
+        result = td1 // scalar_td
+        tm.assert_equal(result, expected)
+
+        # Reversed op
+        expected = Series([2, 2, np.nan])
+        expected = tm.box_expected(expected, xbox, transpose=False)
+
+        result = scalar_td // td1
+        tm.assert_equal(result, expected)
+
+        # same thing buts let's be explicit about calling __rfloordiv__
+        result = td1.__rfloordiv__(scalar_td)
+        tm.assert_equal(result, expected)
+
+    def test_td64arr_floordiv_int(self, box_with_array):
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        idx = tm.box_expected(idx, box_with_array)
+        result = idx // 1
+        tm.assert_equal(result, idx)
+
+        pattern = "floor_divide cannot use operands|Cannot divide int by Timedelta*"
+        with pytest.raises(TypeError, match=pattern):
+            1 // idx
+
+    # ------------------------------------------------------------------
+    # mod, divmod
+    # TODO: operations with timedelta-like arrays, numeric arrays,
+    #  reversed ops
+
+    def test_td64arr_mod_tdscalar(
+        self, performance_warning, box_with_array, three_days
+    ):
+        tdi = timedelta_range("1 Day", "9 days")
+        tdarr = tm.box_expected(tdi, box_with_array)
+
+        expected = TimedeltaIndex(["1 Day", "2 Days", "0 Days"] * 3)
+        expected = tm.box_expected(expected, box_with_array)
+
+        if isinstance(three_days, offsets.Day):
+            msg = "unsupported operand type"
+            with pytest.raises(TypeError, match=msg):
+                tdarr % three_days
+            with pytest.raises(TypeError, match=msg):
+                divmod(tdarr, three_days)
+            with pytest.raises(TypeError, match=msg):
+                tdarr // three_days
+            return
+
+        result = tdarr % three_days
+        tm.assert_equal(result, expected)
+
+        if box_with_array is DataFrame and isinstance(three_days, pd.DateOffset):
+            # TODO: making expected be object here a result of DataFrame.__divmod__
+            #  being defined in a naive way that does not dispatch to the underlying
+            #  array's __divmod__
+            expected = expected.astype(object)
+        else:
+            performance_warning = False
+
+        with tm.assert_produces_warning(performance_warning):
+            result = divmod(tdarr, three_days)
+
+        tm.assert_equal(result[1], expected)
+        tm.assert_equal(result[0], tdarr // three_days)
+
+    def test_td64arr_mod_int(self, box_with_array):
+        tdi = timedelta_range("1 ns", "10 ns", periods=10)
+        tdarr = tm.box_expected(tdi, box_with_array)
+
+        expected = TimedeltaIndex(["1 ns", "0 ns"] * 5)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = tdarr % 2
+        tm.assert_equal(result, expected)
+
+        msg = "Cannot divide int by"
+        with pytest.raises(TypeError, match=msg):
+            2 % tdarr
+
+        result = divmod(tdarr, 2)
+        tm.assert_equal(result[1], expected)
+        tm.assert_equal(result[0], tdarr // 2)
+
+    def test_td64arr_rmod_tdscalar(self, box_with_array, three_days):
+        tdi = timedelta_range("1 Day", "9 days")
+        tdarr = tm.box_expected(tdi, box_with_array)
+
+        expected = ["0 Days", "1 Day", "0 Days"] + ["3 Days"] * 6
+        expected = TimedeltaIndex(expected)
+        expected = tm.box_expected(expected, box_with_array)
+
+        if isinstance(three_days, offsets.Day):
+            msg = "Cannot divide Day by TimedeltaArray"
+            with pytest.raises(TypeError, match=msg):
+                three_days % tdarr
+            return
+
+        result = three_days % tdarr
+        tm.assert_equal(result, expected)
+
+        result = divmod(three_days, tdarr)
+        tm.assert_equal(result[1], expected)
+        tm.assert_equal(result[0], three_days // tdarr)
+
+    # ------------------------------------------------------------------
+    # Operations with invalid others
+
+    def test_td64arr_mul_tdscalar_invalid(self, box_with_array, scalar_td):
+        td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+        td1.iloc[2] = np.nan
+
+        td1 = tm.box_expected(td1, box_with_array)
+
+        # check that we are getting a TypeError
+        # with 'operate' (from core/ops.py) for the ops that are not
+        # defined
+        pattern = "operate|unsupported|cannot|not supported"
+        with pytest.raises(TypeError, match=pattern):
+            td1 * scalar_td
+        with pytest.raises(TypeError, match=pattern):
+            scalar_td * td1
+
+    def test_td64arr_mul_too_short_raises(self, box_with_array):
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        idx = tm.box_expected(idx, box_with_array)
+        msg = "|".join(
+            [
+                "cannot use operands with types dtype",
+                "Cannot multiply with unequal lengths",
+                "Unable to coerce to Series",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            # length check before dtype check
+            idx * idx[:3]
+        with pytest.raises(ValueError, match=msg):
+            idx * np.array([1, 2])
+
+    def test_td64arr_mul_td64arr_raises(self, box_with_array):
+        idx = TimedeltaIndex(np.arange(5, dtype="int64"))
+        idx = tm.box_expected(idx, box_with_array)
+        msg = "cannot use operands with types dtype"
+        with pytest.raises(TypeError, match=msg):
+            idx * idx
+
+    # ------------------------------------------------------------------
+    # Operations with numeric others
+
+    def test_td64arr_mul_numeric_scalar(self, box_with_array, one):
+        # GH#4521
+        # divide/multiply by integers
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        expected = Series(["-59 Days", "-59 Days", "NaT"], dtype="timedelta64[ns]")
+
+        tdser = tm.box_expected(tdser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = tdser * (-one)
+        tm.assert_equal(result, expected)
+        result = (-one) * tdser
+        tm.assert_equal(result, expected)
+
+        expected = Series(["118 Days", "118 Days", "NaT"], dtype="timedelta64[ns]")
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = tdser * (2 * one)
+        tm.assert_equal(result, expected)
+        result = (2 * one) * tdser
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)])
+    def test_td64arr_div_numeric_scalar(self, box_with_array, two):
+        # GH#4521
+        # divide/multiply by integers
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]")
+
+        tdser = tm.box_expected(tdser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = tdser / two
+        tm.assert_equal(result, expected)
+
+        with pytest.raises(TypeError, match="Cannot divide"):
+            two / tdser
+
+    @pytest.mark.parametrize("two", [2, 2.0, np.array(2), np.array(2.0)])
+    def test_td64arr_floordiv_numeric_scalar(self, box_with_array, two):
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        expected = Series(["29.5D", "29.5D", "NaT"], dtype="timedelta64[ns]")
+
+        tdser = tm.box_expected(tdser, box_with_array)
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = tdser // two
+        tm.assert_equal(result, expected)
+
+        with pytest.raises(TypeError, match="Cannot divide"):
+            two // tdser
+
+    @pytest.mark.parametrize(
+        "klass",
+        [np.array, Index, Series],
+        ids=lambda x: x.__name__,
+    )
+    def test_td64arr_rmul_numeric_array(
+        self,
+        box_with_array,
+        klass,
+        any_real_numpy_dtype,
+    ):
+        # GH#4521
+        # divide/multiply by integers
+
+        vector = klass([20, 30, 40])
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        vector = vector.astype(any_real_numpy_dtype)
+
+        expected = Series(["1180 Days", "1770 Days", "NaT"], dtype="timedelta64[ns]")
+
+        tdser = tm.box_expected(tdser, box_with_array)
+        xbox = get_upcast_box(tdser, vector)
+
+        expected = tm.box_expected(expected, xbox)
+
+        result = tdser * vector
+        tm.assert_equal(result, expected)
+
+        result = vector * tdser
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "klass",
+        [np.array, Index, Series],
+        ids=lambda x: x.__name__,
+    )
+    def test_td64arr_div_numeric_array(
+        self, box_with_array, klass, any_real_numpy_dtype
+    ):
+        # GH#4521
+        # divide/multiply by integers
+
+        vector = klass([20, 30, 40])
+        tdser = Series(["59 Days", "59 Days", "NaT"], dtype="m8[ns]")
+        vector = vector.astype(any_real_numpy_dtype)
+
+        expected = Series(["2.95D", "1D 23h 12m", "NaT"], dtype="timedelta64[ns]")
+
+        tdser = tm.box_expected(tdser, box_with_array)
+        xbox = get_upcast_box(tdser, vector)
+        expected = tm.box_expected(expected, xbox)
+
+        result = tdser / vector
+        tm.assert_equal(result, expected)
+
+        pattern = "|".join(
+            [
+                "true_divide'? cannot use operands",
+                "cannot perform __div__",
+                "cannot perform __truediv__",
+                "unsupported operand",
+                "Cannot divide",
+                "ufunc 'divide' cannot use operands with types",
+            ]
+        )
+        with pytest.raises(TypeError, match=pattern):
+            vector / tdser
+
+        result = tdser / vector.astype(object)
+        if box_with_array is DataFrame:
+            expected = [tdser.iloc[0, n] / vector[n] for n in range(len(vector))]
+            expected = tm.box_expected(expected, xbox).astype(object)
+            expected[2] = expected[2].fillna(np.timedelta64("NaT", "ns"))
+        else:
+            expected = [tdser[n] / vector[n] for n in range(len(tdser))]
+            expected = [
+                x if x is not NaT else np.timedelta64("NaT", "ns") for x in expected
+            ]
+            if xbox is tm.to_array:
+                expected = tm.to_array(expected).astype(object)
+            else:
+                expected = xbox(expected, dtype=object)
+
+        tm.assert_equal(result, expected)
+
+        with pytest.raises(TypeError, match=pattern):
+            vector.astype(object) / tdser
+
+    def test_td64arr_mul_int_series(self, box_with_array, names):
+        # GH#19042 test for correct name attachment
+        box = box_with_array
+        exname = get_expected_name(box, names)
+
+        tdi = TimedeltaIndex(
+            ["0days", "1day", "2days", "3days", "4days"], name=names[0]
+        )
+        # TODO: Should we be parametrizing over types for `ser` too?
+        ser = Series([0, 1, 2, 3, 4], dtype=np.int64, name=names[1])
+
+        expected = Series(
+            ["0days", "1day", "4days", "9days", "16days"],
+            dtype="timedelta64[us]",
+            name=exname,
+        )
+
+        tdi = tm.box_expected(tdi, box)
+        xbox = get_upcast_box(tdi, ser)
+
+        expected = tm.box_expected(expected, xbox)
+
+        result = ser * tdi
+        tm.assert_equal(result, expected)
+
+        result = tdi * ser
+        tm.assert_equal(result, expected)
+
+    # TODO: Should we be parametrizing over types for `ser` too?
+    def test_float_series_rdiv_td64arr(self, box_with_array, names):
+        # GH#19042 test for correct name attachment
+        box = box_with_array
+        tdi = TimedeltaIndex(
+            ["0days", "1day", "2days", "3days", "4days"], name=names[0]
+        )
+        ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1])
+
+        xname = names[2] if box not in [tm.to_array, pd.array] else names[1]
+        expected = Series(
+            [tdi[n] / ser[n] for n in range(len(ser))],
+            dtype="timedelta64[us]",
+            name=xname,
+        )
+
+        tdi = tm.box_expected(tdi, box)
+        xbox = get_upcast_box(tdi, ser)
+        expected = tm.box_expected(expected, xbox)
+
+        result = ser.__rtruediv__(tdi)
+        if box is DataFrame:
+            assert result is NotImplemented
+        else:
+            tm.assert_equal(result, expected)
+
+    def test_td64arr_all_nat_div_object_dtype_numeric(self, box_with_array):
+        # GH#39750 make sure we infer the result as td64
+        tdi = TimedeltaIndex([NaT, NaT], dtype="m8[ns]")
+
+        left = tm.box_expected(tdi, box_with_array)
+        right = np.array([2, 2.0], dtype=object)
+
+        tdnat = np.timedelta64("NaT", "ns")
+        expected = Index([tdnat] * 2, dtype=object)
+        if box_with_array is not Index:
+            expected = tm.box_expected(expected, box_with_array).astype(object)
+            if box_with_array in [Series, DataFrame]:
+                expected = expected.fillna(tdnat)  # GH#18463
+
+        result = left / right
+        tm.assert_equal(result, expected)
+
+        result = left // right
+        tm.assert_equal(result, expected)
+
+
+class TestTimedelta64ArrayLikeArithmetic:
+    # Arithmetic tests for timedelta64[ns] vectors fully parametrized over
+    #  DataFrame/Series/TimedeltaIndex/TimedeltaArray.  Ideally all arithmetic
+    #  tests will eventually end up here.
+
+    def test_td64arr_pow_invalid(self, scalar_td, box_with_array):
+        td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
+        td1.iloc[2] = np.nan
+
+        td1 = tm.box_expected(td1, box_with_array)
+
+        # check that we are getting a TypeError
+        # with 'operate' (from core/ops.py) for the ops that are not
+        # defined
+        pattern = "operate|unsupported|cannot|not supported"
+        with pytest.raises(TypeError, match=pattern):
+            scalar_td**td1
+
+        with pytest.raises(TypeError, match=pattern):
+            td1**scalar_td
+
+
+def test_add_timestamp_to_timedelta():
+    # GH: 35897
+    timestamp = Timestamp("2021-01-01")
+    result = timestamp + timedelta_range("0s", "1s", periods=31)
+    expected = DatetimeIndex(
+        [
+            timestamp
+            + (
+                pd.to_timedelta("0.033333s") * i
+                + pd.to_timedelta("0.000001s") * divmod(i, 3)[0]
+            )
+            for i in range(31)
+        ]
+    )
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/arrays/__init__.py b/pandas/tests/arrays/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/arrays/masked_shared.py b/pandas/tests/arrays/masked_shared.py
new file mode 100644
index 0000000000000000000000000000000000000000..545b14af2c98bcdfeea2969d859ca097e7e0db8b
--- /dev/null
+++ b/pandas/tests/arrays/masked_shared.py
@@ -0,0 +1,155 @@
+"""
+Tests shared by MaskedArray subclasses.
+"""
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.tests.extension.base import BaseOpsUtil
+
+
+class ComparisonOps(BaseOpsUtil):
+    def _compare_other(self, data, op, other):
+        # array
+        result = pd.Series(op(data, other))
+        expected = pd.Series(op(data._data, other), dtype="boolean")
+
+        # fill the nan locations
+        expected[data._mask] = pd.NA
+
+        tm.assert_series_equal(result, expected)
+
+        # series
+        ser = pd.Series(data)
+        result = op(ser, other)
+
+        # Set nullable dtype here to avoid upcasting when setting to pd.NA below
+        expected = op(pd.Series(data._data), other).astype("boolean")
+
+        # fill the nan locations
+        expected[data._mask] = pd.NA
+
+        tm.assert_series_equal(result, expected)
+
+    # subclass will override to parametrize 'other'
+    def test_scalar(self, other, comparison_op, dtype):
+        op = comparison_op
+        left = pd.array([1, 0, None], dtype=dtype)
+
+        result = op(left, other)
+
+        if other is pd.NA:
+            expected = pd.array([None, None, None], dtype="boolean")
+        else:
+            values = op(left._data, other)
+            expected = pd.arrays.BooleanArray(values, left._mask, copy=True)
+        tm.assert_extension_array_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        result[0] = pd.NA
+        tm.assert_extension_array_equal(left, pd.array([1, 0, None], dtype=dtype))
+
+
+class NumericOps:
+    # Shared by IntegerArray and FloatingArray, not BooleanArray
+
+    def test_searchsorted_nan(self, dtype):
+        # The base class casts to object dtype, for which searchsorted returns
+        #  0 from the left and 10 from the right.
+        arr = pd.array(range(10), dtype=dtype)
+
+        assert arr.searchsorted(np.nan, side="left") == 10
+        assert arr.searchsorted(np.nan, side="right") == 10
+
+    def test_no_shared_mask(self, data):
+        result = data + 1
+        assert not tm.shares_memory(result, data)
+
+    def test_array(self, comparison_op, dtype):
+        op = comparison_op
+
+        left = pd.array([0, 1, 2, None, None, None], dtype=dtype)
+        right = pd.array([0, 1, None, 0, 1, None], dtype=dtype)
+
+        result = op(left, right)
+        values = op(left._data, right._data)
+        mask = left._mask | right._mask
+
+        expected = pd.arrays.BooleanArray(values, mask)
+        tm.assert_extension_array_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        result[0] = pd.NA
+        tm.assert_extension_array_equal(
+            left, pd.array([0, 1, 2, None, None, None], dtype=dtype)
+        )
+        tm.assert_extension_array_equal(
+            right, pd.array([0, 1, None, 0, 1, None], dtype=dtype)
+        )
+
+    def test_compare_with_booleanarray(self, comparison_op, dtype):
+        op = comparison_op
+
+        left = pd.array([True, False, None] * 3, dtype="boolean")
+        right = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype=dtype)
+        other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean")
+
+        expected = op(left, other)
+        result = op(left, right)
+        tm.assert_extension_array_equal(result, expected)
+
+        # reversed op
+        expected = op(other, left)
+        result = op(right, left)
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_compare_to_string(self, dtype):
+        # GH#28930
+        ser = pd.Series([1, None], dtype=dtype)
+        result = ser == "a"
+        expected = pd.Series([False, pd.NA], dtype="boolean")
+
+        tm.assert_series_equal(result, expected)
+
+    def test_ufunc_with_out(self, dtype):
+        arr = pd.array([1, 2, 3], dtype=dtype)
+        arr2 = pd.array([1, 2, pd.NA], dtype=dtype)
+
+        mask = arr == arr
+        mask2 = arr2 == arr2
+
+        result = np.zeros(3, dtype=bool)
+        result |= mask
+        # If MaskedArray.__array_ufunc__ handled "out" appropriately,
+        #  `result` should still be an ndarray.
+        assert isinstance(result, np.ndarray)
+        assert result.all()
+
+        # result |= mask worked because mask could be cast losslessly to
+        #  boolean ndarray. mask2 can't, so this raises
+        result = np.zeros(3, dtype=bool)
+        msg = "Specify an appropriate 'na_value' for this dtype"
+        with pytest.raises(ValueError, match=msg):
+            result |= mask2
+
+        # addition
+        res = np.add(arr, arr2)
+        expected = pd.array([2, 4, pd.NA], dtype=dtype)
+        tm.assert_extension_array_equal(res, expected)
+
+        # when passing out=arr, we will modify 'arr' inplace.
+        res = np.add(arr, arr2, out=arr)
+        assert res is arr
+        tm.assert_extension_array_equal(res, expected)
+        tm.assert_extension_array_equal(arr, expected)
+
+    def test_mul_td64_array(self, dtype):
+        # GH#45622
+        arr = pd.array([1, 2, pd.NA], dtype=dtype)
+        other = np.arange(3, dtype=np.int64).view("m8[ns]")
+
+        result = arr * other
+        expected = pd.array([pd.Timedelta(0), pd.Timedelta(2), pd.NaT])
+        tm.assert_extension_array_equal(result, expected)
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..a02926dd5e158cd914a3eff0bc061a01cabea323
--- /dev/null
+++ b/pandas/tests/arrays/test_array.py
@@ -0,0 +1,539 @@
+import datetime
+import decimal
+import zoneinfo
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.extensions import register_extension_dtype
+from pandas.arrays import (
+    BooleanArray,
+    DatetimeArray,
+    FloatingArray,
+    IntegerArray,
+    IntervalArray,
+    SparseArray,
+    TimedeltaArray,
+)
+from pandas.core.arrays import (
+    NumpyExtensionArray,
+    period_array,
+)
+from pandas.tests.extension.decimal import (
+    DecimalArray,
+    DecimalDtype,
+    to_decimal,
+)
+
+
+@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]"])
+def test_dt64_array(dtype_unit):
+    # GH#53817
+    dtype_var = np.dtype(dtype_unit)
+    msg = (
+        r"datetime64 and timedelta64 dtype resolutions other than "
+        r"'s', 'ms', 'us', and 'ns' are no longer supported."
+    )
+    with pytest.raises(ValueError, match=msg):
+        pd.array([], dtype=dtype_var)
+
+
+@pytest.mark.parametrize(
+    "data, dtype, expected",
+    [
+        # Basic NumPy defaults.
+        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
+        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
+        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
+        (
+            [1, 2],
+            np.dtype("float32"),
+            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
+        ),
+        (
+            np.array([], dtype=object),
+            None,
+            NumpyExtensionArray(np.array([], dtype=object)),
+        ),
+        (
+            np.array([1, 2], dtype="int64"),
+            None,
+            IntegerArray._from_sequence([1, 2], dtype="Int64"),
+        ),
+        (
+            np.array([1.0, 2.0], dtype="float64"),
+            None,
+            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
+        ),
+        # String alias passes through to NumPy
+        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
+        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
+        # GH#44715 FloatingArray does not support float16, so fall
+        #  back to NumpyExtensionArray
+        (
+            np.array([1, 2], dtype=np.float16),
+            None,
+            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
+        ),
+        # idempotency with e.g. pd.array(pd.array([1, 2], dtype="int64"))
+        (
+            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
+            None,
+            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
+        ),
+        # Period alias
+        (
+            [pd.Period("2000", "D"), pd.Period("2001", "D")],
+            "Period[D]",
+            period_array(["2000", "2001"], freq="D"),
+        ),
+        # Period dtype
+        (
+            [pd.Period("2000", "D")],
+            pd.PeriodDtype("D"),
+            period_array(["2000"], freq="D"),
+        ),
+        # Datetime (naive)
+        (
+            [1, 2],
+            np.dtype("datetime64[ns]"),
+            DatetimeArray._from_sequence(
+                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
+            ),
+        ),
+        (
+            [1, 2],
+            np.dtype("datetime64[s]"),
+            DatetimeArray._from_sequence(
+                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
+            ),
+        ),
+        (
+            np.array([1, 2], dtype="datetime64[ns]"),
+            None,
+            DatetimeArray._from_sequence(
+                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
+            ),
+        ),
+        (
+            pd.DatetimeIndex(["2000", "2001"]),
+            np.dtype("datetime64[ns]"),
+            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
+        ),
+        (
+            pd.DatetimeIndex(["2000", "2001"]),
+            None,
+            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"),
+        ),
+        (
+            ["2000", "2001"],
+            np.dtype("datetime64[ns]"),
+            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
+        ),
+        (
+            [pd.NaT, pd.NaT],
+            None,
+            DatetimeArray._from_sequence([pd.NaT, pd.NaT], dtype="M8[s]"),
+        ),
+        # Datetime (tz-aware)
+        (
+            ["2000", "2001"],
+            pd.DatetimeTZDtype(tz="CET"),
+            DatetimeArray._from_sequence(
+                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
+            ),
+        ),
+        # Timedelta
+        (
+            ["1h", "2h"],
+            np.dtype("timedelta64[ns]"),
+            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
+        ),
+        (
+            pd.TimedeltaIndex(["1h", "2h"]),
+            np.dtype("timedelta64[ns]"),
+            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
+        ),
+        (
+            np.array([1, 2], dtype="m8[s]"),
+            np.dtype("timedelta64[s]"),
+            TimedeltaArray._from_sequence(
+                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
+            ),
+        ),
+        (
+            pd.TimedeltaIndex(["1h", "2h"]),
+            None,
+            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[us]"),
+        ),
+        (
+            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
+            TimedeltaArray._simple_new(
+                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
+            ),
+            None,
+            TimedeltaArray._simple_new(
+                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
+            ),
+        ),
+        (
+            # preserve non-nano, i.e. don't cast to NumpyExtensionArray
+            TimedeltaArray._simple_new(
+                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
+            ),
+            np.dtype("m8[s]"),
+            TimedeltaArray._simple_new(
+                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
+            ),
+        ),
+        # Category
+        (["a", "b"], "category", pd.Categorical(["a", "b"])),
+        (
+            ["a", "b"],
+            pd.CategoricalDtype(None, ordered=True),
+            pd.Categorical(["a", "b"], ordered=True),
+        ),
+        # Interval
+        (
+            [pd.Interval(1, 2), pd.Interval(3, 4)],
+            "interval",
+            IntervalArray.from_tuples([(1, 2), (3, 4)]),
+        ),
+        # Sparse
+        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
+        # IntegerNA
+        ([1, None], "Int16", pd.array([1, None], dtype="Int16")),
+        (
+            pd.Series([1, 2]),
+            None,
+            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
+        ),
+        # String
+        (
+            ["a", None],
+            "string",
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype()),
+        ),
+        (
+            ["a", None],
+            "str",
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan))
+            if using_string_dtype()
+            else NumpyExtensionArray(np.array(["a", "None"])),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype(),
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype()),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype(na_value=np.nan),
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype(),
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
+        ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype(na_value=np.nan),
+            pd.StringDtype(na_value=np.nan)
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)),
+        ),
+        # Boolean
+        (
+            [True, None],
+            "boolean",
+            BooleanArray._from_sequence([True, None], dtype="boolean"),
+        ),
+        (
+            [True, None],
+            pd.BooleanDtype(),
+            BooleanArray._from_sequence([True, None], dtype="boolean"),
+        ),
+        # Index
+        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
+        # Series[EA] returns the EA
+        (
+            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
+            None,
+            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
+        ),
+        # "3rd party" EAs work
+        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
+        # pass an ExtensionArray, but a different dtype
+        (
+            period_array(["2000", "2001"], freq="D"),
+            "category",
+            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
+        ),
+        # Complex
+        (
+            np.array([complex(1), complex(2)], dtype=np.complex128),
+            None,
+            NumpyExtensionArray(
+                np.array([complex(1), complex(2)], dtype=np.complex128)
+            ),
+        ),
+    ],
+)
+def test_array(data, dtype, expected):
+    result = pd.array(data, dtype=dtype)
+    tm.assert_equal(result, expected)
+
+
+def test_array_copy():
+    a = np.array([1, 2])
+    # default is to copy
+    b = pd.array(a, dtype=a.dtype)
+    assert not tm.shares_memory(a, b)
+
+    # copy=True
+    b = pd.array(a, dtype=a.dtype, copy=True)
+    assert not tm.shares_memory(a, b)
+
+    # copy=False
+    b = pd.array(a, dtype=a.dtype, copy=False)
+    assert tm.shares_memory(a, b)
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        # period
+        (
+            [pd.Period("2000", "D"), pd.Period("2001", "D")],
+            period_array(["2000", "2001"], freq="D"),
+        ),
+        # interval
+        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
+        # datetime
+        (
+            [pd.Timestamp("2000").as_unit("s"), pd.Timestamp("2001").as_unit("s")],
+            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"),
+        ),
+        (
+            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
+            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"),
+        ),
+        (
+            np.array([1, 2], dtype="M8[ns]"),
+            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
+        ),
+        (
+            np.array([1, 2], dtype="M8[us]"),
+            DatetimeArray._simple_new(
+                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
+            ),
+        ),
+        # datetimetz
+        (
+            [
+                pd.Timestamp("2000", tz="CET").as_unit("s"),
+                pd.Timestamp("2001", tz="CET").as_unit("s"),
+            ],
+            DatetimeArray._from_sequence(
+                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s")
+            ),
+        ),
+        (
+            [
+                datetime.datetime(
+                    2000, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin")
+                ),
+                datetime.datetime(
+                    2001, 1, 1, tzinfo=zoneinfo.ZoneInfo("Europe/Berlin")
+                ),
+            ],
+            DatetimeArray._from_sequence(
+                ["2000", "2001"],
+                dtype=pd.DatetimeTZDtype(
+                    tz=zoneinfo.ZoneInfo("Europe/Berlin"), unit="us"
+                ),
+            ),
+        ),
+        # timedelta
+        (
+            [pd.Timedelta("1h"), pd.Timedelta("2h")],
+            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[us]"),
+        ),
+        (
+            np.array([1, 2], dtype="m8[ns]"),
+            TimedeltaArray._from_sequence(
+                np.array([1, 2], dtype="m8[ns]"), dtype=np.dtype("m8[ns]")
+            ),
+        ),
+        (
+            np.array([1, 2], dtype="m8[us]"),
+            TimedeltaArray._from_sequence(
+                np.array([1, 2], dtype="m8[us]"), dtype=np.dtype("m8[us]")
+            ),
+        ),
+        # integer
+        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
+        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
+        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
+        ([1, np.nan], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
+        # float
+        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
+        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
+        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
+        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
+        # integer-like float
+        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
+        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
+        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
+        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
+        # mixed-integer-float
+        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
+        (
+            [1, np.nan, 2.0],
+            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
+        ),
+        # string
+        (
+            ["a", "b"],
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
+        ),
+        (
+            ["a", None],
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", None], dtype=pd.StringDtype()),
+        ),
+        (
+            # numpy array with string dtype
+            np.array(["a", "b"], dtype=str),
+            pd.StringDtype()
+            .construct_array_type()
+            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
+        ),
+        # Boolean
+        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
+        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
+    ],
+)
+def test_array_inference(data, expected):
+    result = pd.array(data)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # mix of frequencies
+        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
+        # mix of closed
+        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
+        # Mix of timezones
+        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
+        # Mix of tz-aware and tz-naive
+        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
+        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
+    ],
+)
+def test_array_inference_fails(data):
+    result = pd.array(data)
+    expected = NumpyExtensionArray(np.array(data, dtype=object))
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [np.array(0)])
+def test_nd_raises(data):
+    with pytest.raises(ValueError, match="NumpyExtensionArray must be 1-dimensional"):
+        pd.array(data, dtype="int64")
+
+
+def test_scalar_raises():
+    with pytest.raises(ValueError, match="Cannot pass scalar '1'"):
+        pd.array(1)
+
+
+def test_dataframe_raises():
+    # GH#51167 don't accidentally cast to StringArray by doing inference on columns
+    df = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+    msg = "Cannot pass DataFrame to 'pandas.array'"
+    with pytest.raises(TypeError, match=msg):
+        pd.array(df)
+
+
+def test_bounds_check():
+    # GH21796
+    with pytest.raises(
+        TypeError, match=r"cannot safely cast non-equivalent int(32|64) to uint16"
+    ):
+        pd.array([-1, 2, 3], dtype="UInt16")
+
+
+# ---------------------------------------------------------------------------
+# A couple dummy classes to ensure that Series and Indexes are unboxed before
+# getting to the EA classes.
+
+
+@register_extension_dtype
+class DecimalDtype2(DecimalDtype):
+    name = "decimal2"
+
+    def construct_array_type(self):
+        """
+        Return the array type associated with this dtype.
+
+        Returns
+        -------
+        type
+        """
+        return DecimalArray2
+
+
+class DecimalArray2(DecimalArray):
+    @classmethod
+    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
+        if isinstance(scalars, (pd.Series, pd.Index)):
+            raise TypeError("scalars should not be of type pd.Series or pd.Index")
+
+        return super()._from_sequence(scalars, dtype=dtype, copy=copy)
+
+
+def test_array_unboxes(index_or_series):
+    box = index_or_series
+
+    data = box([decimal.Decimal("1"), decimal.Decimal("2")])
+    dtype = DecimalDtype2()
+    # make sure it works
+    with pytest.raises(
+        TypeError, match="scalars should not be of type pd.Series or pd.Index"
+    ):
+        DecimalArray2._from_sequence(data, dtype=dtype)
+
+    result = pd.array(data, dtype="decimal2")
+    expected = DecimalArray2._from_sequence(data.values, dtype=dtype)
+    tm.assert_equal(result, expected)
+
+
+def test_array_to_numpy_na():
+    # GH#40638
+    arr = pd.array([pd.NA, 1], dtype="string[python]")
+    result = arr.to_numpy(na_value=True, dtype=bool)
+    expected = np.array([True, True])
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f6b3491a74693312e8c6c6e6f92faee127bc357
--- /dev/null
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -0,0 +1,1390 @@
+from __future__ import annotations
+
+import re
+import warnings
+
+import numpy as np
+import pytest
+
+from pandas._libs import (
+    NaT,
+    Timestamp,
+)
+from pandas._libs.tslibs import to_offset
+from pandas.compat.numpy import np_version_gt2
+
+from pandas.core.dtypes.dtypes import PeriodDtype
+
+import pandas as pd
+from pandas import (
+    DatetimeIndex,
+    Period,
+    PeriodIndex,
+    TimedeltaIndex,
+)
+import pandas._testing as tm
+from pandas.core.arrays import (
+    DatetimeArray,
+    NumpyExtensionArray,
+    PeriodArray,
+    TimedeltaArray,
+)
+
+
+# TODO: more freq variants
+@pytest.fixture(params=["D", "B", "W", "ME", "QE", "YE"])
+def freqstr(request):
+    """Fixture returning parametrized frequency in string format."""
+    return request.param
+
+
+@pytest.fixture
+def period_index(freqstr):
+    """
+    A fixture to provide PeriodIndex objects with different frequencies.
+
+    Most PeriodArray behavior is already tested in PeriodIndex tests,
+    so here we just test that the PeriodArray behavior matches
+    the PeriodIndex behavior.
+    """
+    # TODO: non-monotone indexes; NaTs, different start dates
+    with warnings.catch_warnings():
+        # suppress deprecation of Period[B]
+        warnings.filterwarnings(
+            "ignore", message="Period with BDay freq", category=FutureWarning
+        )
+        freqstr = PeriodDtype(to_offset(freqstr))._freqstr
+        pi = pd.period_range(start=Timestamp("2000-01-01"), periods=100, freq=freqstr)
+    return pi
+
+
+@pytest.fixture
+def datetime_index(freqstr):
+    """
+    A fixture to provide DatetimeIndex objects with different frequencies.
+
+    Most DatetimeArray behavior is already tested in DatetimeIndex tests,
+    so here we just test that the DatetimeArray behavior matches
+    the DatetimeIndex behavior.
+    """
+    # TODO: non-monotone indexes; NaTs, different start dates, timezones
+    dti = pd.date_range(
+        start=Timestamp("2000-01-01"), periods=100, freq=freqstr, unit="ns"
+    )
+    return dti
+
+
+@pytest.fixture
+def timedelta_index():
+    """
+    A fixture to provide TimedeltaIndex objects with different frequencies.
+     Most TimedeltaArray behavior is already tested in TimedeltaIndex tests,
+    so here we just test that the TimedeltaArray behavior matches
+    the TimedeltaIndex behavior.
+    """
+    # TODO: flesh this out
+    return TimedeltaIndex(["1 Day", "3 Hours", "NaT"])
+
+
+class SharedTests:
+    index_cls: type[DatetimeIndex | PeriodIndex | TimedeltaIndex]
+
+    @pytest.fixture
+    def arr1d(self):
+        """Fixture returning DatetimeArray with daily frequency."""
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        if self.array_cls is PeriodArray:
+            arr = self.array_cls(data, freq="D")
+        else:
+            arr = self.index_cls(data, freq="D")._data
+        return arr
+
+    def test_compare_len1_raises(self, arr1d):
+        # make sure we raise when comparing with different lengths, specific
+        #  to the case where one has length-1, which numpy would broadcast
+        arr = arr1d
+        idx = self.index_cls(arr)
+
+        with pytest.raises(ValueError, match="Lengths must match"):
+            arr == arr[:1]
+
+        # test the index classes while we're at it, GH#23078
+        with pytest.raises(ValueError, match="Lengths must match"):
+            idx <= idx[[0]]
+
+    @pytest.mark.parametrize(
+        "result",
+        [
+            pd.date_range("2020", periods=3),
+            pd.date_range("2020", periods=3, tz="UTC"),
+            pd.timedelta_range("0 days", periods=3),
+            pd.period_range("2020Q1", periods=3, freq="Q"),
+        ],
+    )
+    def test_compare_with_Categorical(self, result):
+        expected = pd.Categorical(result)
+        assert all(result == expected)
+        assert not any(result != expected)
+
+    @pytest.mark.parametrize("reverse", [True, False])
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_compare_categorical_dtype(self, arr1d, as_index, reverse, ordered):
+        other = pd.Categorical(arr1d, ordered=ordered)
+        if as_index:
+            other = pd.CategoricalIndex(other)
+
+        left, right = arr1d, other
+        if reverse:
+            left, right = right, left
+
+        ones = np.ones(arr1d.shape, dtype=bool)
+        zeros = ~ones
+
+        result = left == right
+        tm.assert_numpy_array_equal(result, ones)
+
+        result = left != right
+        tm.assert_numpy_array_equal(result, zeros)
+
+        if not reverse and not as_index:
+            # Otherwise Categorical raises TypeError bc it is not ordered
+            # TODO: we should probably get the same behavior regardless?
+            result = left < right
+            tm.assert_numpy_array_equal(result, zeros)
+
+            result = left <= right
+            tm.assert_numpy_array_equal(result, ones)
+
+            result = left > right
+            tm.assert_numpy_array_equal(result, zeros)
+
+            result = left >= right
+            tm.assert_numpy_array_equal(result, ones)
+
+    def test_take(self):
+        data = np.arange(100, dtype="i8") * 24 * 3600 * 10**9
+        np.random.default_rng(2).shuffle(data)
+
+        if self.array_cls is PeriodArray:
+            arr = PeriodArray(data, dtype="period[D]")
+        else:
+            arr = self.index_cls(data)._data
+        idx = self.index_cls._simple_new(arr)
+
+        takers = [1, 4, 94]
+        result = arr.take(takers)
+        expected = idx.take(takers)
+
+        tm.assert_index_equal(self.index_cls(result), expected)
+
+        takers = np.array([1, 4, 94])
+        result = arr.take(takers)
+        expected = idx.take(takers)
+
+        tm.assert_index_equal(self.index_cls(result), expected)
+
+    @pytest.mark.parametrize("fill_value", [2, 2.0, Timestamp(2021, 1, 1, 12).time])
+    def test_take_fill_raises(self, fill_value, arr1d):
+        msg = f"value should be a '{arr1d._scalar_type.__name__}' or 'NaT'. Got"
+        with pytest.raises(TypeError, match=msg):
+            arr1d.take([0, 1], allow_fill=True, fill_value=fill_value)
+
+    def test_take_fill(self, arr1d):
+        arr = arr1d
+
+        result = arr.take([-1, 1], allow_fill=True, fill_value=None)
+        assert result[0] is NaT
+
+        result = arr.take([-1, 1], allow_fill=True, fill_value=np.nan)
+        assert result[0] is NaT
+
+        result = arr.take([-1, 1], allow_fill=True, fill_value=NaT)
+        assert result[0] is NaT
+
+    @pytest.mark.filterwarnings(
+        "ignore:Period with BDay freq is deprecated:FutureWarning"
+    )
+    def test_take_fill_str(self, arr1d):
+        # Cast str fill_value matching other fill_value-taking methods
+        result = arr1d.take([-1, 1], allow_fill=True, fill_value=str(arr1d[-1]))
+        expected = arr1d[[-1, 1]]
+        tm.assert_equal(result, expected)
+
+        msg = f"value should be a '{arr1d._scalar_type.__name__}' or 'NaT'. Got"
+        with pytest.raises(TypeError, match=msg):
+            arr1d.take([-1, 1], allow_fill=True, fill_value="foo")
+
+    def test_concat_same_type(self, arr1d):
+        arr = arr1d
+        idx = self.index_cls(arr)
+        idx = idx.insert(0, NaT)
+        arr = arr1d
+
+        result = arr._concat_same_type([arr[:-1], arr[1:], arr])
+        arr2 = arr.astype(object)
+        expected = self.index_cls(np.concatenate([arr2[:-1], arr2[1:], arr2]))
+
+        tm.assert_index_equal(self.index_cls(result), expected)
+
+    def test_unbox_scalar(self, arr1d):
+        result = arr1d._unbox_scalar(arr1d[0])
+        expected = arr1d._ndarray.dtype.type
+        assert isinstance(result, expected)
+
+        result = arr1d._unbox_scalar(NaT)
+        assert isinstance(result, expected)
+
+        msg = f"'value' should be a {self.scalar_type.__name__}."
+        with pytest.raises(ValueError, match=msg):
+            arr1d._unbox_scalar("foo")
+
+    def test_check_compatible_with(self, arr1d):
+        arr1d._check_compatible_with(arr1d[0])
+        arr1d._check_compatible_with(arr1d[:1])
+        arr1d._check_compatible_with(NaT)
+
+    def test_scalar_from_string(self, arr1d):
+        result = arr1d._scalar_from_string(str(arr1d[0]))
+        assert result == arr1d[0]
+
+    def test_reduce_invalid(self, arr1d):
+        msg = "does not support operation 'not a method'"
+        with pytest.raises(TypeError, match=msg):
+            arr1d._reduce("not a method")
+
+    @pytest.mark.parametrize("method", ["pad", "backfill"])
+    def test_fillna_method_doesnt_change_orig(self, method):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        if self.array_cls is PeriodArray:
+            arr = self.array_cls(data, dtype="period[D]")
+        else:
+            dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]"
+            arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype))
+        arr[4] = NaT
+
+        fill_value = arr[3] if method == "pad" else arr[5]
+
+        result = arr._pad_or_backfill(method=method)
+        assert result[4] == fill_value
+
+        # check that the original was not changed
+        assert arr[4] is NaT
+
+    def test_searchsorted(self):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        if self.array_cls is PeriodArray:
+            arr = self.array_cls(data, dtype="period[D]")
+        else:
+            dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]"
+            arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype))
+
+        # scalar
+        result = arr.searchsorted(arr[1])
+        assert result == 1
+
+        result = arr.searchsorted(arr[2], side="right")
+        assert result == 3
+
+        # own-type
+        result = arr.searchsorted(arr[1:3])
+        expected = np.array([1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = arr.searchsorted(arr[1:3], side="right")
+        expected = np.array([2, 3], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # GH#29884 match numpy convention on whether NaT goes
+        #  at the end or the beginning
+        result = arr.searchsorted(NaT)
+        assert result == 10
+
+    @pytest.mark.parametrize("box", [None, "index", "series"])
+    def test_searchsorted_castable_strings(
+        self, arr1d, box, string_storage, using_infer_string
+    ):
+        arr = arr1d
+        if box is None:
+            pass
+        elif box == "index":
+            # Test the equivalent Index.searchsorted method while we're here
+            arr = self.index_cls(arr)
+        else:
+            # Test the equivalent Series.searchsorted method while we're here
+            arr = pd.Series(arr)
+
+        # scalar
+        result = arr.searchsorted(str(arr[1]))
+        assert result == 1
+
+        result = arr.searchsorted(str(arr[2]), side="right")
+        assert result == 3
+
+        result = arr.searchsorted([str(x) for x in arr[1:3]])
+        expected = np.array([1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        with pytest.raises(
+            TypeError,
+            match=re.escape(
+                f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
+                "or array of those. Got 'str' instead."
+            ),
+        ):
+            arr.searchsorted("foo")
+
+        msg = re.escape(
+            f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', "
+            "or array of those. Got str array instead."
+        )
+        if not using_infer_string:
+            msg = msg.replace("str", "string")
+        with pd.option_context("string_storage", string_storage):
+            with pytest.raises(
+                TypeError,
+                match=msg,
+            ):
+                arr.searchsorted([str(arr[1]), "baz"])
+
+    def test_getitem_near_implementation_bounds(self):
+        # We only check tz-naive for DTA bc the bounds are slightly different
+        #  for other tzs
+        i8vals = np.asarray([NaT._value + n for n in range(1, 5)], dtype="i8")
+        if self.array_cls is PeriodArray:
+            arr = self.array_cls(i8vals, dtype="period[ns]")
+        else:
+            arr = self.index_cls(i8vals, freq="ns")._data
+        arr[0]  # should not raise OutOfBoundsDatetime
+
+        index = pd.Index(arr)
+        index[0]  # should not raise OutOfBoundsDatetime
+
+        ser = pd.Series(arr)
+        ser[0]  # should not raise OutOfBoundsDatetime
+
+    def test_getitem_2d(self, arr1d):
+        # 2d slicing on a 1D array
+        expected = type(arr1d)._simple_new(
+            arr1d._ndarray[:, np.newaxis], dtype=arr1d.dtype
+        )
+        result = arr1d[:, np.newaxis]
+        tm.assert_equal(result, expected)
+
+        # Lookup on a 2D array
+        arr2d = expected
+        expected = type(arr2d)._simple_new(arr2d._ndarray[:3, 0], dtype=arr2d.dtype)
+        result = arr2d[:3, 0]
+        tm.assert_equal(result, expected)
+
+        # Scalar lookup
+        result = arr2d[-1, 0]
+        expected = arr1d[-1]
+        assert result == expected
+
+    def test_iter_2d(self, arr1d):
+        data2d = arr1d._ndarray[:3, np.newaxis]
+        arr2d = type(arr1d)._simple_new(data2d, dtype=arr1d.dtype)
+        result = list(arr2d)
+        assert len(result) == 3
+        for x in result:
+            assert isinstance(x, type(arr1d))
+            assert x.ndim == 1
+            assert x.dtype == arr1d.dtype
+
+    def test_repr_2d(self, arr1d):
+        data2d = arr1d._ndarray[:3, np.newaxis]
+        arr2d = type(arr1d)._simple_new(data2d, dtype=arr1d.dtype)
+
+        result = repr(arr2d)
+
+        if isinstance(arr2d, TimedeltaArray):
+            expected = (
+                f"<{type(arr2d).__name__}>\n"
+                "[\n"
+                f"['{arr1d[0]._repr_base()}'],\n"
+                f"['{arr1d[1]._repr_base()}'],\n"
+                f"['{arr1d[2]._repr_base()}']\n"
+                "]\n"
+                f"Shape: (3, 1), dtype: {arr1d.dtype}"
+            )
+        else:
+            expected = (
+                f"<{type(arr2d).__name__}>\n"
+                "[\n"
+                f"['{arr1d[0]}'],\n"
+                f"['{arr1d[1]}'],\n"
+                f"['{arr1d[2]}']\n"
+                "]\n"
+                f"Shape: (3, 1), dtype: {arr1d.dtype}"
+            )
+
+        assert result == expected
+
+    def test_setitem(self):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        if self.array_cls is PeriodArray:
+            arr = self.array_cls(data, dtype="period[D]")
+        else:
+            arr = self.index_cls(data, freq="D")._data
+
+        arr[0] = arr[1]
+        expected = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        expected[0] = expected[1]
+
+        tm.assert_numpy_array_equal(arr.asi8, expected)
+
+        arr[:2] = arr[-2:]
+        expected[:2] = expected[-2:]
+        tm.assert_numpy_array_equal(arr.asi8, expected)
+
+    def test_setitem_list_of_nats(self, arr1d):
+        # GH#63420
+        arr1d[:] = [NaT] * len(arr1d)
+        assert arr1d.isna().all()
+
+    @pytest.mark.parametrize(
+        "box",
+        [
+            pd.Index,
+            pd.Series,
+            np.array,
+            list,
+            NumpyExtensionArray,
+        ],
+    )
+    def test_setitem_object_dtype(self, box, arr1d):
+        expected = arr1d.copy()[::-1]
+        if expected.dtype.kind in ["m", "M"]:
+            expected = expected._with_freq(None)
+
+        vals = expected
+        if box is list:
+            vals = list(vals)
+        elif box is np.array:
+            # if we do np.array(x).astype(object) then dt64 and td64 cast to ints
+            vals = np.array(vals.astype(object))
+        elif box is NumpyExtensionArray:
+            vals = box(np.asarray(vals, dtype=object))
+        else:
+            vals = box(vals).astype(object)
+
+        arr1d[:] = vals
+
+        tm.assert_equal(arr1d, expected)
+
+    def test_setitem_strs(self, arr1d):
+        # Check that we parse strs in both scalar and listlike
+
+        # Setting list-like of strs
+        expected = arr1d.copy()
+        expected[[0, 1]] = arr1d[-2:]
+
+        result = arr1d.copy()
+        result[:2] = [str(x) for x in arr1d[-2:]]
+        tm.assert_equal(result, expected)
+
+        # Same thing but now for just a scalar str
+        expected = arr1d.copy()
+        expected[0] = arr1d[-1]
+
+        result = arr1d.copy()
+        result[0] = str(arr1d[-1])
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_setitem_categorical(self, arr1d, as_index):
+        expected = arr1d.copy()[::-1]
+        if not isinstance(expected, PeriodArray):
+            expected = expected._with_freq(None)
+
+        cat = pd.Categorical(arr1d)
+        if as_index:
+            cat = pd.CategoricalIndex(cat)
+
+        arr1d[:] = cat[::-1]
+
+        tm.assert_equal(arr1d, expected)
+
+    def test_setitem_raises(self, arr1d):
+        arr = arr1d[:10]
+        val = arr[0]
+
+        with pytest.raises(IndexError, match="index 12 is out of bounds"):
+            arr[12] = val
+
+        with pytest.raises(TypeError, match="value should be a.* 'object'"):
+            arr[0] = object()
+
+        msg = "cannot set using a list-like indexer with a different length"
+        with pytest.raises(ValueError, match=msg):
+            # GH#36339
+            arr[[]] = [arr[1]]
+
+        msg = "cannot set using a slice indexer with a different length than"
+        with pytest.raises(ValueError, match=msg):
+            # GH#36339
+            arr[1:1] = arr[:3]
+
+    @pytest.mark.parametrize("box", [list, np.array, pd.Index, pd.Series])
+    def test_setitem_numeric_raises(self, arr1d, box):
+        # We dont case e.g. int64 to our own dtype for setitem
+
+        msg = (
+            f"value should be a '{arr1d._scalar_type.__name__}', "
+            "'NaT', or array of those. Got"
+        )
+        with pytest.raises(TypeError, match=msg):
+            arr1d[:2] = box([0, 1])
+
+        with pytest.raises(TypeError, match=msg):
+            arr1d[:2] = box([0.0, 1.0])
+
+    def test_inplace_arithmetic(self):
+        # GH#24115 check that iadd and isub are actually in-place
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        if self.array_cls is PeriodArray:
+            arr = self.array_cls(data, dtype="period[D]")
+        else:
+            arr = self.index_cls(data, freq="D")._data
+
+        expected = arr + pd.Timedelta(days=1)
+        arr += pd.Timedelta(days=1)
+        tm.assert_equal(arr, expected)
+
+        expected = arr - pd.Timedelta(days=1)
+        arr -= pd.Timedelta(days=1)
+        tm.assert_equal(arr, expected)
+
+    def test_shift_fill_int_deprecated(self, arr1d):
+        # GH#31971, enforced in 2.0
+        with pytest.raises(TypeError, match="value should be a"):
+            arr1d.shift(1, fill_value=1)
+
+    def test_median(self, arr1d):
+        arr = arr1d
+        if len(arr) % 2 == 0:
+            # make it easier to define `expected`
+            arr = arr[:-1]
+
+        expected = arr[len(arr) // 2]
+
+        result = arr.median()
+        assert type(result) is type(expected)
+        assert result == expected
+
+        arr[len(arr) // 2] = NaT
+        if not isinstance(expected, Period):
+            expected = arr[len(arr) // 2 - 1 : len(arr) // 2 + 2].mean()
+
+        assert arr.median(skipna=False) is NaT
+
+        result = arr.median()
+        assert type(result) is type(expected)
+        assert result == expected
+
+        assert arr[:0].median() is NaT
+        assert arr[:0].median(skipna=False) is NaT
+
+        # 2d Case
+        arr2 = arr.reshape(-1, 1)
+
+        result = arr2.median(axis=None)
+        assert type(result) is type(expected)
+        assert result == expected
+
+        assert arr2.median(axis=None, skipna=False) is NaT
+
+        result = arr2.median(axis=0)
+        expected2 = type(arr)._from_sequence([expected], dtype=arr.dtype)
+        tm.assert_equal(result, expected2)
+
+        result = arr2.median(axis=0, skipna=False)
+        expected2 = type(arr)._from_sequence([NaT], dtype=arr.dtype)
+        tm.assert_equal(result, expected2)
+
+        result = arr2.median(axis=1)
+        tm.assert_equal(result, arr)
+
+        result = arr2.median(axis=1, skipna=False)
+        tm.assert_equal(result, arr)
+
+    def test_from_integer_array(self):
+        arr = np.array([1, 2, 3], dtype=np.int64)
+        data = pd.array(arr, dtype="Int64")
+        if self.array_cls is PeriodArray:
+            expected = self.array_cls(arr, dtype=self.example_dtype)
+            result = self.array_cls(data, dtype=self.example_dtype)
+        else:
+            expected = self.array_cls._from_sequence(arr, dtype=self.example_dtype)
+            result = self.array_cls._from_sequence(data, dtype=self.example_dtype)
+
+        tm.assert_extension_array_equal(result, expected)
+
+
+class TestDatetimeArray(SharedTests):
+    index_cls = DatetimeIndex
+    array_cls = DatetimeArray
+    scalar_type = Timestamp
+    example_dtype = "M8[ns]"
+
+    @pytest.fixture
+    def arr1d(self, tz_naive_fixture, freqstr):
+        """
+        Fixture returning DatetimeArray with parametrized frequency and
+        timezones
+        """
+        tz = tz_naive_fixture
+        dti = pd.date_range(
+            "2016-01-01 01:01:00", periods=5, freq=freqstr, tz=tz, unit="ns"
+        )
+        dta = dti._data
+        return dta
+
+    def test_round(self, arr1d):
+        # GH#24064
+        dti = self.index_cls(arr1d)
+
+        result = dti.round(freq="2min")
+        expected = dti - pd.Timedelta(minutes=1)
+        expected = expected._with_freq(None)
+        tm.assert_index_equal(result, expected)
+
+        dta = dti._data
+        result = dta.round(freq="2min")
+        expected = expected._data._with_freq(None)
+        tm.assert_datetime_array_equal(result, expected)
+
+    def test_array_interface(self, datetime_index):
+        arr = datetime_index._data
+        copy_false = None if np_version_gt2 else False
+
+        # default asarray gives the same underlying data (for tz naive)
+        result = np.asarray(arr)
+        expected = arr._ndarray
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+        result = np.array(arr, copy=copy_false)
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+
+        # specifying M8[ns] gives the same result as default
+        result = np.asarray(arr, dtype="datetime64[ns]")
+        expected = arr._ndarray
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+        result = np.array(arr, dtype="datetime64[ns]", copy=copy_false)
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+        result = np.array(arr, dtype="datetime64[ns]")
+        if not np_version_gt2:
+            # TODO: GH 57739
+            assert result is not expected
+        tm.assert_numpy_array_equal(result, expected)
+
+        # to object dtype
+        result = np.asarray(arr, dtype=object)
+        expected = np.array(list(arr), dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # to other dtype always copies
+        result = np.asarray(arr, dtype="int64")
+        assert result is not arr.asi8
+        assert not np.may_share_memory(arr, result)
+        expected = arr.asi8.copy()
+        tm.assert_numpy_array_equal(result, expected)
+
+        # other dtypes handled by numpy
+        for dtype in ["float64", str]:
+            result = np.asarray(arr, dtype=dtype)
+            expected = np.asarray(arr).astype(dtype)
+            tm.assert_numpy_array_equal(result, expected)
+
+    def test_array_object_dtype(self, arr1d):
+        # GH#23524
+        arr = arr1d
+        dti = self.index_cls(arr1d)
+
+        expected = np.array(list(dti))
+
+        result = np.array(arr, dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # also test the DatetimeIndex method while we're at it
+        result = np.array(dti, dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_array_tz(self, arr1d):
+        # GH#23524
+        arr = arr1d
+        dti = self.index_cls(arr1d, copy=False)
+        copy_false = None if np_version_gt2 else False
+
+        expected = dti.asi8.view("M8[ns]")
+        result = np.array(arr, dtype="M8[ns]")
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.array(arr, dtype="datetime64[ns]")
+        tm.assert_numpy_array_equal(result, expected)
+
+        # check that we are not making copies when setting copy=copy_false
+        result = np.array(arr, dtype="M8[ns]", copy=copy_false)
+        assert result.base is expected.base
+        assert result.base is not None
+        result = np.array(arr, dtype="datetime64[ns]", copy=copy_false)
+        assert result.base is expected.base
+        assert result.base is not None
+
+    def test_array_i8_dtype(self, arr1d):
+        arr = arr1d
+        dti = self.index_cls(arr1d)
+        copy_false = None if np_version_gt2 else False
+
+        expected = dti.asi8
+        result = np.array(arr, dtype="i8")
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.array(arr, dtype=np.int64)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # check that we are still making copies when setting copy=copy_false
+        result = np.array(arr, dtype="i8", copy=copy_false)
+        assert result.base is not expected.base
+        assert result.base is None
+
+    def test_from_array_keeps_base(self):
+        # Ensure that DatetimeArray._ndarray.base isn't lost.
+        arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
+        dta = DatetimeArray._from_sequence(arr, dtype=arr.dtype)
+
+        assert dta._ndarray is arr
+        dta = DatetimeArray._from_sequence(arr[:0], dtype=arr.dtype)
+        assert dta._ndarray.base is arr
+
+    def test_from_dti(self, arr1d):
+        arr = arr1d
+        dti = self.index_cls(arr1d)
+        assert list(dti) == list(arr)
+
+        # Check that Index.__new__ knows what to do with DatetimeArray
+        dti2 = pd.Index(arr)
+        assert isinstance(dti2, DatetimeIndex)
+        assert list(dti2) == list(arr)
+
+    def test_astype_object(self, arr1d):
+        arr = arr1d
+        dti = self.index_cls(arr1d)
+
+        asobj = arr.astype("O")
+        assert isinstance(asobj, np.ndarray)
+        assert asobj.dtype == "O"
+        assert list(asobj) == list(dti)
+
+    @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
+    def test_to_period(self, datetime_index, freqstr):
+        dti = datetime_index
+        arr = dti._data
+
+        freqstr = PeriodDtype(to_offset(freqstr))._freqstr
+        expected = dti.to_period(freq=freqstr)
+        result = arr.to_period(freq=freqstr)
+        assert isinstance(result, PeriodArray)
+
+        tm.assert_equal(result, expected._data)
+
+    def test_to_period_2d(self, arr1d):
+        arr2d = arr1d.reshape(1, -1)
+
+        warn = None if arr1d.tz is None else UserWarning
+        with tm.assert_produces_warning(warn, match="will drop timezone information"):
+            result = arr2d.to_period("D")
+            expected = arr1d.to_period("D").reshape(1, -1)
+        tm.assert_period_array_equal(result, expected)
+
+    @pytest.mark.parametrize("propname", DatetimeArray._bool_ops)
+    def test_bool_properties(self, arr1d, propname):
+        # in this case _bool_ops is just `is_leap_year`
+        dti = self.index_cls(arr1d)
+        arr = arr1d
+        assert dti.freq == arr.freq
+
+        result = getattr(arr, propname)
+        expected = np.array(getattr(dti, propname), dtype=result.dtype)
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("propname", DatetimeArray._field_ops)
+    def test_int_properties(self, arr1d, propname):
+        dti = self.index_cls(arr1d)
+        arr = arr1d
+
+        result = getattr(arr, propname)
+        expected = np.array(getattr(dti, propname), dtype=result.dtype)
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_take_fill_valid(self, arr1d, fixed_now_ts):
+        arr = arr1d
+        dti = self.index_cls(arr1d)
+
+        now = fixed_now_ts.tz_localize(dti.tz)
+        result = arr.take([-1, 1], allow_fill=True, fill_value=now)
+        assert result[0] == now
+
+        msg = f"value should be a '{arr1d._scalar_type.__name__}' or 'NaT'. Got"
+        with pytest.raises(TypeError, match=msg):
+            # fill_value Timedelta invalid
+            arr.take([-1, 1], allow_fill=True, fill_value=now - now)
+
+        with pytest.raises(TypeError, match=msg):
+            # fill_value Period invalid
+            arr.take([-1, 1], allow_fill=True, fill_value=Period("2014Q1"))
+
+        tz = None if dti.tz is not None else "US/Eastern"
+        now = fixed_now_ts.tz_localize(tz)
+        msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
+        with pytest.raises(TypeError, match=msg):
+            # Timestamp with mismatched tz-awareness
+            arr.take([-1, 1], allow_fill=True, fill_value=now)
+
+        value = NaT._value
+        msg = f"value should be a '{arr1d._scalar_type.__name__}' or 'NaT'. Got"
+        with pytest.raises(TypeError, match=msg):
+            # require NaT, not iNaT, as it could be confused with an integer
+            arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+        value = np.timedelta64("NaT", "ns")
+        with pytest.raises(TypeError, match=msg):
+            # require appropriate-dtype if we have an NA value
+            arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+        if arr.tz is not None:
+            # GH#37356
+            # Assuming here that arr1d fixture does not include Australia/Melbourne
+            value = fixed_now_ts.tz_localize("Australia/Melbourne")
+            result = arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+            expected = arr.take(
+                [-1, 1],
+                allow_fill=True,
+                fill_value=value.tz_convert(arr.dtype.tz),
+            )
+            tm.assert_equal(result, expected)
+
+    def test_concat_same_type_invalid(self, arr1d):
+        # different timezones
+        arr = arr1d
+
+        if arr.tz is None:
+            other = arr.tz_localize("UTC")
+        else:
+            other = arr.tz_localize(None)
+
+        with pytest.raises(ValueError, match="to_concat must have the same"):
+            arr._concat_same_type([arr, other])
+
+    def test_concat_same_type_different_freq(self, unit):
+        # we *can* concatenate DTI with different freqs.
+        a = pd.date_range("2000", periods=2, freq="D", tz="US/Central", unit=unit)._data
+        b = pd.date_range("2000", periods=2, freq="h", tz="US/Central", unit=unit)._data
+        result = DatetimeArray._concat_same_type([a, b])
+        expected = (
+            pd.to_datetime(
+                [
+                    "2000-01-01 00:00:00",
+                    "2000-01-02 00:00:00",
+                    "2000-01-01 00:00:00",
+                    "2000-01-01 01:00:00",
+                ]
+            )
+            .tz_localize("US/Central")
+            .as_unit(unit)
+            ._data
+        )
+
+        tm.assert_datetime_array_equal(result, expected)
+
+    def test_strftime(self, arr1d, using_infer_string):
+        arr = arr1d
+
+        result = arr.strftime("%Y %b")
+        expected = np.array([ts.strftime("%Y %b") for ts in arr], dtype=object)
+        if using_infer_string:
+            expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+        tm.assert_equal(result, expected)
+
+    def test_strftime_nat(self, using_infer_string):
+        # GH 29578
+        arr = DatetimeIndex(["2019-01-01", NaT])._data
+
+        result = arr.strftime("%Y-%m-%d")
+        expected = np.array(["2019-01-01", np.nan], dtype=object)
+        if using_infer_string:
+            expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+        tm.assert_equal(result, expected)
+
+
+class TestTimedeltaArray(SharedTests):
+    index_cls = TimedeltaIndex
+    array_cls = TimedeltaArray
+    scalar_type = pd.Timedelta
+    example_dtype = "m8[ns]"
+
+    def test_from_tdi(self):
+        tdi = TimedeltaIndex(["1 Day", "3 Hours"])
+        arr = tdi._data
+        assert list(arr) == list(tdi)
+
+        # Check that Index.__new__ knows what to do with TimedeltaArray
+        tdi2 = pd.Index(arr)
+        assert isinstance(tdi2, TimedeltaIndex)
+        assert list(tdi2) == list(arr)
+
+    def test_astype_object(self):
+        tdi = TimedeltaIndex(["1 Day", "3 Hours"])
+        arr = tdi._data
+        asobj = arr.astype("O")
+        assert isinstance(asobj, np.ndarray)
+        assert asobj.dtype == "O"
+        assert list(asobj) == list(tdi)
+
+    def test_to_pytimedelta(self, timedelta_index):
+        tdi = timedelta_index
+        arr = tdi._data
+
+        expected = tdi.to_pytimedelta()
+        result = arr.to_pytimedelta()
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_total_seconds(self, timedelta_index):
+        tdi = timedelta_index
+        arr = tdi._data
+
+        expected = tdi.total_seconds()
+        result = arr.total_seconds()
+
+        tm.assert_numpy_array_equal(result, expected.values)
+
+    @pytest.mark.parametrize("propname", TimedeltaArray._field_ops)
+    def test_int_properties(self, timedelta_index, propname):
+        tdi = timedelta_index
+        arr = tdi._data
+
+        result = getattr(arr, propname)
+        expected = np.array(getattr(tdi, propname), dtype=result.dtype)
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_array_interface(self, timedelta_index):
+        arr = timedelta_index._data
+        copy_false = None if np_version_gt2 else False
+
+        # default asarray gives the same underlying data
+        result = np.asarray(arr)
+        expected = arr._ndarray
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+        result = np.array(arr, copy=copy_false)
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+
+        # specifying m8[us] gives the same result as default
+        result = np.asarray(arr, dtype="timedelta64[us]")
+        expected = arr._ndarray
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+        result = np.array(arr, dtype="timedelta64[us]", copy=copy_false)
+        assert result is expected
+        tm.assert_numpy_array_equal(result, expected)
+        result = np.array(arr, dtype="timedelta64[us]")
+        if not np_version_gt2:
+            # TODO: GH 57739
+            assert result is not expected
+        tm.assert_numpy_array_equal(result, expected)
+
+        # to object dtype
+        result = np.asarray(arr, dtype=object)
+        expected = np.array(list(arr), dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # to other dtype always copies
+        result = np.asarray(arr, dtype="int64")
+        assert result is not arr.asi8
+        assert not np.may_share_memory(arr, result)
+        expected = arr.asi8.copy()
+        tm.assert_numpy_array_equal(result, expected)
+
+        # other dtypes handled by numpy
+        for dtype in ["float64", str]:
+            result = np.asarray(arr, dtype=dtype)
+            expected = np.asarray(arr).astype(dtype)
+            tm.assert_numpy_array_equal(result, expected)
+
+    def test_take_fill_valid(self, timedelta_index, fixed_now_ts):
+        tdi = timedelta_index
+        arr = tdi._data
+
+        td1 = pd.Timedelta(days=1)
+        result = arr.take([-1, 1], allow_fill=True, fill_value=td1)
+        assert result[0] == td1
+
+        value = fixed_now_ts
+        msg = f"value should be a '{arr._scalar_type.__name__}' or 'NaT'. Got"
+        with pytest.raises(TypeError, match=msg):
+            # fill_value Timestamp invalid
+            arr.take([0, 1], allow_fill=True, fill_value=value)
+
+        value = fixed_now_ts.to_period("D")
+        with pytest.raises(TypeError, match=msg):
+            # fill_value Period invalid
+            arr.take([0, 1], allow_fill=True, fill_value=value)
+
+        value = np.datetime64("NaT", "ns")
+        with pytest.raises(TypeError, match=msg):
+            # require appropriate-dtype if we have an NA value
+            arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+
+@pytest.mark.filterwarnings(r"ignore:Period with BDay freq is deprecated:FutureWarning")
+@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
+class TestPeriodArray(SharedTests):
+    index_cls = PeriodIndex
+    array_cls = PeriodArray
+    scalar_type = Period
+    example_dtype = PeriodIndex([], freq="W").dtype
+
+    @pytest.fixture
+    def arr1d(self, period_index):
+        """
+        Fixture returning DatetimeArray from parametrized PeriodIndex objects
+        """
+        return period_index._data
+
+    def test_from_pi(self, arr1d):
+        pi = self.index_cls(arr1d)
+        arr = arr1d
+        assert list(arr) == list(pi)
+
+        # Check that Index.__new__ knows what to do with PeriodArray
+        pi2 = pd.Index(arr)
+        assert isinstance(pi2, PeriodIndex)
+        assert list(pi2) == list(arr)
+
+    def test_astype_object(self, arr1d):
+        pi = self.index_cls(arr1d)
+        arr = arr1d
+        asobj = arr.astype("O")
+        assert isinstance(asobj, np.ndarray)
+        assert asobj.dtype == "O"
+        assert list(asobj) == list(pi)
+
+    def test_take_fill_valid(self, arr1d):
+        arr = arr1d
+
+        value = NaT._value
+        msg = f"value should be a '{arr1d._scalar_type.__name__}' or 'NaT'. Got"
+        with pytest.raises(TypeError, match=msg):
+            # require NaT, not iNaT, as it could be confused with an integer
+            arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+        value = np.timedelta64("NaT", "ns")
+        with pytest.raises(TypeError, match=msg):
+            # require appropriate-dtype if we have an NA value
+            arr.take([-1, 1], allow_fill=True, fill_value=value)
+
+    @pytest.mark.parametrize("how", ["S", "E"])
+    def test_to_timestamp(self, how, arr1d):
+        pi = self.index_cls(arr1d)
+        arr = arr1d
+
+        expected = DatetimeIndex(pi.to_timestamp(how=how))._data
+        result = arr.to_timestamp(how=how)
+        assert isinstance(result, DatetimeArray)
+
+        tm.assert_equal(result, expected)
+
+    def test_to_timestamp_roundtrip_bday(self):
+        # Case where infer_freq inside would choose "D" instead of "B"
+        dta = pd.date_range("2021-10-18", periods=3, freq="B", unit="ns")._data
+        parr = dta.to_period()
+        result = parr.to_timestamp()
+        assert result.freq == "B"
+        tm.assert_extension_array_equal(result, dta.as_unit("us"))
+
+        dta2 = dta[::2]
+        parr2 = dta2.to_period()
+        result2 = parr2.to_timestamp()
+        assert result2.freq == "2B"
+        tm.assert_extension_array_equal(result2, dta2.as_unit("us"))
+
+        parr3 = dta.to_period("2B")
+        result3 = parr3.to_timestamp()
+        assert result3.freq == "B"
+        tm.assert_extension_array_equal(result3, dta.as_unit("us"))
+
+    def test_to_timestamp_out_of_bounds(self):
+        # GH#19643 previously overflowed silently
+        pi = pd.period_range("1500", freq="Y", periods=3)
+        pi.to_timestamp()
+        dta = pi._data.to_timestamp()
+        assert dta[0] == Timestamp(1500, 1, 1)
+
+    @pytest.mark.parametrize("propname", PeriodArray._bool_ops)
+    def test_bool_properties(self, arr1d, propname):
+        # in this case _bool_ops is just `is_leap_year`
+        pi = self.index_cls(arr1d)
+        arr = arr1d
+
+        result = getattr(arr, propname)
+        expected = np.array(getattr(pi, propname))
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("propname", PeriodArray._field_ops)
+    def test_int_properties(self, arr1d, propname):
+        pi = self.index_cls(arr1d)
+        arr = arr1d
+
+        result = getattr(arr, propname)
+        expected = np.array(getattr(pi, propname))
+
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_array_interface(self, arr1d):
+        arr = arr1d
+
+        # default asarray gives objects
+        result = np.asarray(arr)
+        expected = np.array(list(arr), dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # to object dtype (same as default)
+        result = np.asarray(arr, dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # to int64 gives the underlying representation
+        result = np.asarray(arr, dtype="int64")
+        tm.assert_numpy_array_equal(result, arr.asi8)
+
+        result2 = np.asarray(arr, dtype="int64")
+        assert np.may_share_memory(result, result2)
+
+        result_copy1 = np.array(arr, dtype="int64", copy=True)
+        result_copy2 = np.array(arr, dtype="int64", copy=True)
+        assert not np.may_share_memory(result_copy1, result_copy2)
+
+        # to other dtypes
+        msg = r"float\(\) argument must be a string or a( real)? number, not 'Period'"
+        with pytest.raises(TypeError, match=msg):
+            np.asarray(arr, dtype="float64")
+
+        result = np.asarray(arr, dtype="S20")
+        expected = np.asarray(arr).astype("S20")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_strftime(self, arr1d, using_infer_string):
+        arr = arr1d
+
+        result = arr.strftime("%Y")
+        expected = np.array([per.strftime("%Y") for per in arr], dtype=object)
+        if using_infer_string:
+            expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+        tm.assert_equal(result, expected)
+
+    def test_strftime_nat(self, using_infer_string):
+        # GH 29578
+        arr = PeriodArray(PeriodIndex(["2019-01-01", NaT], dtype="period[D]"))
+
+        result = arr.strftime("%Y-%m-%d")
+        expected = np.array(["2019-01-01", np.nan], dtype=object)
+        if using_infer_string:
+            expected = pd.array(expected, dtype=pd.StringDtype(na_value=np.nan))
+        tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arr,casting_nats",
+    [
+        (
+            TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data,
+            (NaT, np.timedelta64("NaT", "ns")),
+        ),
+        (
+            pd.date_range("2000-01-01", periods=3, freq="D")._data,
+            (NaT, np.datetime64("NaT", "ns")),
+        ),
+        (pd.period_range("2000-01-01", periods=3, freq="D")._data, (NaT,)),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def test_casting_nat_setitem_array(arr, casting_nats):
+    expected = type(arr)._from_sequence([NaT, arr[1], arr[2]], dtype=arr.dtype)
+
+    for nat in casting_nats:
+        arr = arr.copy()
+        arr[0] = nat
+        tm.assert_equal(arr, expected)
+
+
+@pytest.mark.parametrize(
+    "arr,non_casting_nats",
+    [
+        (
+            TimedeltaIndex(["1 Day", "3 Hours", "NaT"])._data,
+            (np.datetime64("NaT", "ns"), NaT._value),
+        ),
+        (
+            pd.date_range("2000-01-01", periods=3, freq="D")._data,
+            (np.timedelta64("NaT", "ns"), NaT._value),
+        ),
+        (
+            pd.period_range("2000-01-01", periods=3, freq="D")._data,
+            (np.datetime64("NaT", "ns"), np.timedelta64("NaT", "ns"), NaT._value),
+        ),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def test_invalid_nat_setitem_array(arr, non_casting_nats):
+    msg = (
+        "value should be a '(Timestamp|Timedelta|Period)', 'NaT', or array of those. "
+        "Got '(timedelta64|datetime64|int)' instead."
+    )
+
+    for nat in non_casting_nats:
+        with pytest.raises(TypeError, match=msg):
+            arr[0] = nat
+
+
+@pytest.mark.parametrize(
+    "arr",
+    [
+        pd.date_range("2000", periods=4)._values,
+        pd.timedelta_range("2000", periods=4)._values,
+    ],
+)
+def test_to_numpy_extra(arr):
+    arr[0] = NaT
+    original = arr.copy()
+
+    result = arr.to_numpy()
+    assert np.isnan(result[0])
+
+    result = arr.to_numpy(dtype="int64")
+    assert result[0] == -9223372036854775808
+
+    result = arr.to_numpy(dtype="int64", na_value=0)
+    assert result[0] == 0
+
+    result = arr.to_numpy(na_value=arr[1].to_numpy())
+    assert result[0] == result[1]
+
+    result = arr.to_numpy(na_value=arr[1].to_numpy(copy=False))
+    assert result[0] == result[1]
+
+    tm.assert_equal(arr, original)
+
+
+@pytest.mark.parametrize(
+    "arr",
+    [
+        pd.date_range("2000", periods=4)._values,
+        pd.timedelta_range("2000", periods=4)._values,
+    ],
+)
+def test_to_numpy_extra_readonly(arr):
+    arr[0] = NaT
+    original = arr.copy()
+    arr._readonly = True
+
+    result = arr.to_numpy(dtype=object)
+    assert result.flags.writeable
+
+    # numpy does not do zero-copy conversion from M8 to i8
+    result = arr.to_numpy(dtype="int64")
+    assert result.flags.writeable
+
+    tm.assert_equal(arr, original)
+
+
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize(
+    "values",
+    [
+        pd.to_datetime(["2020-01-01", "2020-02-01"]),
+        pd.to_timedelta([1, 2], unit="D"),
+        PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"),
+    ],
+)
+@pytest.mark.parametrize(
+    "klass",
+    [
+        list,
+        np.array,
+        pd.array,
+        pd.Series,
+        pd.Index,
+        pd.Categorical,
+        pd.CategoricalIndex,
+    ],
+)
+def test_searchsorted_datetimelike_with_listlike(values, klass, as_index):
+    # https://github.com/pandas-dev/pandas/issues/32762
+    if not as_index:
+        values = values._data
+
+    result = values.searchsorted(klass(values))
+    expected = np.array([0, 1], dtype=result.dtype)
+
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        pd.to_datetime(["2020-01-01", "2020-02-01"]),
+        pd.to_timedelta([1, 2], unit="D"),
+        PeriodIndex(["2020-01-01", "2020-02-01"], freq="D"),
+    ],
+)
+@pytest.mark.parametrize(
+    "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2]
+)
+def test_searchsorted_datetimelike_with_listlike_invalid_dtype(values, arg):
+    # https://github.com/pandas-dev/pandas/issues/32762
+    msg = "[Unexpected type|Cannot compare]"
+    with pytest.raises(TypeError, match=msg):
+        values.searchsorted(arg)
+
+
+@pytest.mark.parametrize("klass", [list, tuple, np.array, pd.Series])
+def test_period_index_construction_from_strings(klass):
+    # https://github.com/pandas-dev/pandas/issues/26109
+    strings = ["2020Q1", "2020Q2"] * 2
+    data = klass(strings)
+    result = PeriodIndex(data, freq="Q")
+    expected = PeriodIndex([Period(s) for s in strings])
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
+def test_from_pandas_array(dtype):
+    # GH#24615
+    data = np.array([1, 2, 3], dtype=dtype)
+    arr = NumpyExtensionArray(data)
+
+    cls = {"M8[ns]": DatetimeArray, "m8[ns]": TimedeltaArray}[dtype]
+
+    result = cls._from_sequence(arr, dtype=dtype)
+    expected = cls._from_sequence(data, dtype=dtype)
+    tm.assert_extension_array_equal(result, expected)
+
+    func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype]
+    result = func(arr).array
+    expected = func(data).array
+    tm.assert_equal(result, expected)
+
+    # Let's check the Indexes while we're here
+    idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype]
+    result = idx_cls(arr)
+    expected = idx_cls(data)
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f1be30e5c0221d73abc148e15922db225d8dfed
--- /dev/null
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -0,0 +1,848 @@
+"""
+Tests for DatetimeArray
+"""
+
+from __future__ import annotations
+
+from datetime import timedelta
+import operator
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import tz_compare
+from pandas.errors import Pandas4Warning
+
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays import (
+    DatetimeArray,
+    TimedeltaArray,
+)
+
+
+class TestNonNano:
+    @pytest.fixture(params=["s", "ms", "us"])
+    def unit(self, request):
+        """Fixture returning parametrized time units"""
+        return request.param
+
+    @pytest.fixture
+    def dtype(self, unit, tz_naive_fixture):
+        tz = tz_naive_fixture
+        if tz is None:
+            return np.dtype(f"datetime64[{unit}]")
+        else:
+            return DatetimeTZDtype(unit=unit, tz=tz)
+
+    @pytest.fixture
+    def dta_dti(self, unit, dtype):
+        tz = getattr(dtype, "tz", None)
+
+        dti = pd.date_range("2016-01-01", periods=55, freq="D", tz=tz, unit="ns")
+        if tz is None:
+            arr = np.asarray(dti).astype(f"M8[{unit}]")
+        else:
+            arr = np.asarray(dti.tz_convert("UTC").tz_localize(None)).astype(
+                f"M8[{unit}]"
+            )
+
+        dta = DatetimeArray._simple_new(arr, dtype=dtype)
+        return dta, dti
+
+    @pytest.fixture
+    def dta(self, dta_dti):
+        dta, dti = dta_dti
+        return dta
+
+    def test_non_nano(self, unit, dtype):
+        arr = np.arange(5, dtype=np.int64).view(f"M8[{unit}]")
+        dta = DatetimeArray._simple_new(arr, dtype=dtype)
+
+        assert dta.dtype == dtype
+        assert dta[0].unit == unit
+        assert tz_compare(dta.tz, dta[0].tz)
+        assert (dta[0] == dta[:1]).all()
+
+    @pytest.mark.parametrize(
+        "field", DatetimeArray._field_ops + DatetimeArray._bool_ops
+    )
+    def test_fields(self, unit, field, dtype, dta_dti):
+        dta, dti = dta_dti
+
+        assert (dti == dta).all()
+
+        res = getattr(dta, field)
+        expected = getattr(dti._data, field)
+        tm.assert_numpy_array_equal(res, expected)
+
+    def test_normalize(self, unit):
+        dti = pd.date_range("2016-01-01 06:00:00", periods=55, freq="D")
+        arr = np.asarray(dti).astype(f"M8[{unit}]")
+
+        dta = DatetimeArray._simple_new(arr, dtype=arr.dtype)
+
+        assert not dta.is_normalized
+
+        # TODO: simplify once we can just .astype to other unit
+        exp = np.asarray(dti.normalize()).astype(f"M8[{unit}]")
+        expected = DatetimeArray._simple_new(exp, dtype=exp.dtype)
+
+        res = dta.normalize()
+        tm.assert_extension_array_equal(res, expected)
+
+    def test_normalize_overflow_raises(self):
+        # GH#60583
+        ts = pd.Timestamp.min
+        dta = DatetimeArray._from_sequence([ts], dtype="M8[ns]")
+
+        msg = "Cannot normalize Timestamp without integer overflow"
+        with pytest.raises(ValueError, match=msg):
+            dta.normalize()
+
+    def test_simple_new_requires_match(self, unit):
+        arr = np.arange(5, dtype=np.int64).view(f"M8[{unit}]")
+        dtype = DatetimeTZDtype(unit, "UTC")
+
+        dta = DatetimeArray._simple_new(arr, dtype=dtype)
+        assert dta.dtype == dtype
+
+        wrong = DatetimeTZDtype("ns", "UTC")
+        with pytest.raises(AssertionError, match="^$"):
+            DatetimeArray._simple_new(arr, dtype=wrong)
+
+    def test_std_non_nano(self, unit):
+        dti = pd.date_range("2016-01-01", periods=55, freq="D", unit="ns")
+        arr = np.asarray(dti).astype(f"M8[{unit}]")
+
+        dta = DatetimeArray._simple_new(arr, dtype=arr.dtype)
+
+        # we should match the nano-reso std, but floored to our reso.
+        res = dta.std()
+        assert res._creso == dta._creso
+        assert res == dti.std().floor(unit)
+
+    @pytest.mark.filterwarnings("ignore:Converting to PeriodArray.*:UserWarning")
+    def test_to_period(self, dta_dti):
+        dta, dti = dta_dti
+        result = dta.to_period("D")
+        expected = dti._data.to_period("D")
+
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_iter(self, dta):
+        res = next(iter(dta))
+        expected = dta[0]
+
+        assert type(res) is pd.Timestamp
+        assert res._value == expected._value
+        assert res._creso == expected._creso
+        assert res == expected
+
+    def test_astype_object(self, dta):
+        result = dta.astype(object)
+        assert all(x._creso == dta._creso for x in result)
+        assert all(x == y for x, y in zip(result, dta, strict=True))
+
+    def test_to_pydatetime(self, dta_dti):
+        dta, dti = dta_dti
+
+        result = dta.to_pydatetime()
+        expected = dti.to_pydatetime()
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("meth", ["time", "timetz", "date"])
+    def test_time_date(self, dta_dti, meth):
+        dta, dti = dta_dti
+
+        result = getattr(dta, meth)
+        expected = getattr(dti, meth)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_format_native_types(self, unit, dtype, dta_dti):
+        # In this case we should get the same formatted values with our nano
+        #  version dti._data as we do with the non-nano dta
+        dta, dti = dta_dti
+
+        res = dta._format_native_types()
+        exp = dti._data._format_native_types()
+        tm.assert_numpy_array_equal(res, exp)
+
+    def test_repr(self, dta_dti, unit):
+        dta, dti = dta_dti
+
+        assert repr(dta) == repr(dti._data).replace("[ns", f"[{unit}")
+
+    # TODO: tests with td64
+    def test_compare_mismatched_resolutions(self, comparison_op):
+        # comparison that numpy gets wrong bc of silent overflows
+        op = comparison_op
+
+        iinfo = np.iinfo(np.int64)
+        vals = np.array([iinfo.min, iinfo.min + 1, iinfo.max], dtype=np.int64)
+
+        # Construct so that arr2[1] < arr[1] < arr[2] < arr2[2]
+        arr = np.array(vals).view("M8[ns]")
+        arr2 = arr.view("M8[s]")
+
+        left = DatetimeArray._simple_new(arr, dtype=arr.dtype)
+        right = DatetimeArray._simple_new(arr2, dtype=arr2.dtype)
+
+        if comparison_op is operator.eq:
+            expected = np.array([False, False, False])
+        elif comparison_op is operator.ne:
+            expected = np.array([True, True, True])
+        elif comparison_op in [operator.lt, operator.le]:
+            expected = np.array([False, False, True])
+        else:
+            expected = np.array([False, True, False])
+
+        result = op(left, right)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = op(left[1], right)
+        tm.assert_numpy_array_equal(result, expected)
+
+        if op not in [operator.eq, operator.ne]:
+            # check that numpy still gets this wrong; if it is fixed we may be
+            #  able to remove compare_mismatched_resolutions
+            np_res = op(left._ndarray, right._ndarray)
+            tm.assert_numpy_array_equal(np_res[1:], ~expected[1:])
+
+    def test_add_mismatched_reso_doesnt_downcast(self):
+        # https://github.com/pandas-dev/pandas/pull/48748#issuecomment-1260181008
+        td = pd.Timedelta(microseconds=1)
+        dti = pd.date_range("2016-01-01", periods=3) - td
+        dta = dti._data.as_unit("us")
+
+        res = dta + td.as_unit("us")
+        # even though the result is an even number of days
+        #  (so we _could_ downcast to unit="s"), we do not.
+        assert res.unit == "us"
+
+    @pytest.mark.parametrize(
+        "scalar",
+        [
+            timedelta(hours=2),
+            pd.Timedelta(hours=2),
+            np.timedelta64(2, "h"),
+            np.timedelta64(2 * 3600 * 1000, "ms"),
+            pd.offsets.Minute(120),
+            pd.offsets.Hour(2),
+        ],
+    )
+    def test_add_timedeltalike_scalar_mismatched_reso(self, dta_dti, scalar):
+        dta, dti = dta_dti
+
+        td = pd.Timedelta(scalar)
+        exp_unit = tm.get_finest_unit(dta.unit, td.unit)
+
+        expected = (dti + td)._data.as_unit(exp_unit)
+        result = dta + scalar
+        tm.assert_extension_array_equal(result, expected)
+
+        result = scalar + dta
+        tm.assert_extension_array_equal(result, expected)
+
+        expected = (dti - td)._data.as_unit(exp_unit)
+        result = dta - scalar
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_sub_datetimelike_scalar_mismatch(self):
+        dti = pd.date_range("2016-01-01", periods=3)
+        dta = dti._data.as_unit("us")
+
+        ts = dta[0].as_unit("s")
+
+        result = dta - ts
+        expected = (dti - dti[0])._data.as_unit("us")
+        assert result.dtype == "m8[us]"
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_sub_datetime64_reso_mismatch(self):
+        dti = pd.date_range("2016-01-01", periods=3)
+        left = dti._data.as_unit("s")
+        right = left.as_unit("ms")
+
+        result = left - right
+        exp_values = np.array([0, 0, 0], dtype="m8[ms]")
+        expected = TimedeltaArray._simple_new(
+            exp_values,
+            dtype=exp_values.dtype,
+        )
+        tm.assert_extension_array_equal(result, expected)
+        result2 = right - left
+        tm.assert_extension_array_equal(result2, expected)
+
+
+class TestDatetimeArrayComparisons:
+    # TODO: merge this into tests/arithmetic/test_datetime64 once it is
+    #  sufficiently robust
+
+    def test_cmp_dt64_arraylike_tznaive(self, comparison_op):
+        # arbitrary tz-naive DatetimeIndex
+        op = comparison_op
+
+        dti = pd.date_range("2016-01-1", freq="MS", periods=9, tz=None)
+        arr = dti._data
+        assert arr.freq == dti.freq
+        assert arr.tz == dti.tz
+
+        right = dti
+
+        expected = np.ones(len(arr), dtype=bool)
+        if comparison_op.__name__ in ["ne", "gt", "lt"]:
+            # for these the comparisons should be all-False
+            expected = ~expected
+
+        result = op(arr, arr)
+        tm.assert_numpy_array_equal(result, expected)
+        for other in [
+            right,
+            np.array(right),
+            list(right),
+            tuple(right),
+            right.astype(object),
+        ]:
+            result = op(arr, other)
+            tm.assert_numpy_array_equal(result, expected)
+
+            result = op(other, arr)
+            tm.assert_numpy_array_equal(result, expected)
+
+
+class TestDatetimeArray:
+    def test_astype_ns_to_ms_near_bounds(self):
+        # GH#55979
+        ts = pd.Timestamp("1677-09-21 00:12:43.145225")
+        target = ts.as_unit("ms")
+
+        dta = DatetimeArray._from_sequence([ts], dtype="M8[ns]")
+        assert (dta.view("i8") == ts.as_unit("ns").value).all()
+
+        result = dta.astype("M8[ms]")
+        assert result[0] == target
+
+        expected = DatetimeArray._from_sequence([ts], dtype="M8[ms]")
+        assert (expected.view("i8") == target._value).all()
+
+        tm.assert_datetime_array_equal(result, expected)
+
+    def test_astype_non_nano_tznaive(self):
+        dti = pd.date_range("2016-01-01", periods=3)
+
+        res = dti.astype("M8[s]")
+        assert res.dtype == "M8[s]"
+
+        dta = dti._data
+        res = dta.astype("M8[s]")
+        assert res.dtype == "M8[s]"
+        assert isinstance(res, pd.core.arrays.DatetimeArray)  # used to be ndarray
+
+    def test_astype_non_nano_tzaware(self):
+        dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
+
+        res = dti.astype("M8[s, US/Pacific]")
+        assert res.dtype == "M8[s, US/Pacific]"
+
+        dta = dti._data
+        res = dta.astype("M8[s, US/Pacific]")
+        assert res.dtype == "M8[s, US/Pacific]"
+
+        # from non-nano to non-nano, preserving reso
+        res2 = res.astype("M8[s, UTC]")
+        assert res2.dtype == "M8[s, UTC]"
+        assert not tm.shares_memory(res2, res)
+
+        res3 = res.astype("M8[s, UTC]", copy=False)
+        assert res2.dtype == "M8[s, UTC]"
+        assert tm.shares_memory(res3, res)
+
+    def test_astype_to_same(self):
+        arr = DatetimeArray._from_sequence(
+            ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
+        )
+        result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False)
+        assert result is arr
+
+    @pytest.mark.parametrize("dtype", ["datetime64[ns]", "datetime64[ns, UTC]"])
+    @pytest.mark.parametrize(
+        "other", ["datetime64[ns]", "datetime64[ns, UTC]", "datetime64[ns, CET]"]
+    )
+    def test_astype_copies(self, dtype, other):
+        # https://github.com/pandas-dev/pandas/pull/32490
+        ser = pd.Series([1, 2], dtype=dtype)
+        orig = ser.copy()
+
+        err = False
+        if (dtype == "datetime64[ns]") ^ (other == "datetime64[ns]"):
+            # deprecated in favor of tz_localize
+            err = True
+
+        if err:
+            if dtype == "datetime64[ns]":
+                msg = "Use obj.tz_localize instead or series.dt.tz_localize instead"
+            else:
+                msg = "from timezone-aware dtype to timezone-naive dtype"
+            with pytest.raises(TypeError, match=msg):
+                ser.astype(other)
+        else:
+            t = ser.astype(other)
+            t[:] = pd.NaT
+            tm.assert_series_equal(ser, orig)
+
+    @pytest.mark.parametrize("dtype", [int, np.int32, np.int64, "uint32", "uint64"])
+    def test_astype_int(self, dtype):
+        arr = DatetimeArray._from_sequence(
+            [pd.Timestamp("2000"), pd.Timestamp("2001")], dtype="M8[ns]"
+        )
+
+        if np.dtype(dtype) != np.int64:
+            with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
+                arr.astype(dtype)
+            return
+
+        result = arr.astype(dtype)
+        expected = arr._ndarray.view("i8")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_astype_to_sparse_dt64(self):
+        # GH#50082
+        dti = pd.date_range("2016-01-01", periods=4)
+        dta = dti._data
+        result = dta.astype("Sparse[datetime64[ns]]")
+
+        assert result.dtype == "Sparse[datetime64[ns]]"
+        assert (result == dta).all()
+
+    def test_tz_setter_raises(self):
+        arr = DatetimeArray._from_sequence(
+            ["2000"], dtype=DatetimeTZDtype(tz="US/Central")
+        )
+        with pytest.raises(AttributeError, match="tz_localize"):
+            arr.tz = "UTC"
+
+    def test_setitem_str_impute_tz(self, tz_naive_fixture):
+        # Like for getitem, if we are passed a naive-like string, we impute
+        #  our own timezone.
+        tz = tz_naive_fixture
+
+        data = np.array([1, 2, 3], dtype="M8[ns]")
+        dtype = data.dtype if tz is None else DatetimeTZDtype(tz=tz)
+        arr = DatetimeArray._from_sequence(data, dtype=dtype)
+        expected = arr.copy()
+
+        ts = pd.Timestamp("2020-09-08 16:50").tz_localize(tz)
+        setter = str(ts.tz_localize(None))
+
+        # Setting a scalar tznaive string
+        expected[0] = ts
+        arr[0] = setter
+        tm.assert_equal(arr, expected)
+
+        # Setting a listlike of tznaive strings
+        expected[1] = ts
+        arr[:2] = [setter, setter]
+        tm.assert_equal(arr, expected)
+
+    def test_setitem_different_tz_raises(self):
+        # pre-2.0 we required exact tz match, in 2.0 we require only
+        #  tzawareness-match
+        data = np.array([1, 2, 3], dtype="M8[ns]")
+        arr = DatetimeArray._from_sequence(
+            data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")
+        )
+        with pytest.raises(TypeError, match="Cannot compare tz-naive and tz-aware"):
+            arr[0] = pd.Timestamp("2000")
+
+        ts = pd.Timestamp("2000", tz="US/Eastern")
+        arr[0] = ts
+        assert arr[0] == ts.tz_convert("US/Central")
+
+    def test_setitem_clears_freq(self):
+        a = pd.date_range("2000", periods=2, freq="D", tz="US/Central")._data
+        a[0] = pd.Timestamp("2000", tz="US/Central")
+        assert a.freq is None
+
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            pd.Timestamp("2021-01-01"),
+            pd.Timestamp("2021-01-01").to_datetime64(),
+            pd.Timestamp("2021-01-01").to_pydatetime(),
+        ],
+    )
+    def test_setitem_objects(self, obj):
+        # make sure we accept datetime64 and datetime in addition to Timestamp
+        dti = pd.date_range("2000", periods=2, freq="D")
+        arr = dti._data
+
+        arr[0] = obj
+        assert arr[0] == obj
+
+    def test_repeat_preserves_tz(self):
+        dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
+        arr = dti._data
+
+        repeated = arr.repeat([1, 1])
+
+        # preserves tz and values, but not freq
+        expected = DatetimeArray._from_sequence(arr.asi8, dtype=arr.dtype)
+        tm.assert_equal(repeated, expected)
+
+    def test_value_counts_preserves_tz(self):
+        dti = pd.date_range("2000", periods=2, freq="D", tz="US/Central")
+        arr = dti._data.repeat([4, 3])
+
+        result = arr.value_counts()
+
+        # Note: not tm.assert_index_equal, since `freq`s do not match
+        assert result.index.equals(dti)
+
+        arr[-2] = pd.NaT
+        result = arr.value_counts(dropna=False)
+        expected = pd.Series([4, 2, 1], index=[dti[0], dti[1], pd.NaT], name="count")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("method", ["pad", "backfill"])
+    def test_fillna_preserves_tz(self, method):
+        dti = pd.date_range(
+            "2000-01-01", periods=5, freq="D", tz="US/Central", unit="ns"
+        )
+        arr = DatetimeArray._from_sequence(dti, dtype=dti.dtype, copy=True)
+        arr[2] = pd.NaT
+
+        fill_val = dti[1] if method == "pad" else dti[3]
+        expected = DatetimeArray._from_sequence(
+            [dti[0], dti[1], fill_val, dti[3], dti[4]],
+            dtype=DatetimeTZDtype(tz="US/Central"),
+        )
+
+        result = arr._pad_or_backfill(method=method)
+        tm.assert_extension_array_equal(result, expected)
+
+        # assert that arr and dti were not modified in-place
+        assert arr[2] is pd.NaT
+        assert dti[2] == pd.Timestamp("2000-01-03", tz="US/Central")
+
+    def test_fillna_2d(self):
+        dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific")
+        dta = dti._data.reshape(3, 2).copy()
+        dta[0, 1] = pd.NaT
+        dta[1, 0] = pd.NaT
+
+        res1 = dta._pad_or_backfill(method="pad")
+        expected1 = dta.copy()
+        expected1[1, 0] = dta[0, 0]
+        tm.assert_extension_array_equal(res1, expected1)
+
+        res2 = dta._pad_or_backfill(method="backfill")
+        expected2 = dta.copy()
+        expected2 = dta.copy()
+        expected2[1, 0] = dta[2, 0]
+        expected2[0, 1] = dta[1, 1]
+        tm.assert_extension_array_equal(res2, expected2)
+
+        # with different ordering for underlying ndarray; behavior should
+        #  be unchanged
+        dta2 = dta._from_backing_data(dta._ndarray.copy(order="F"))
+        assert dta2._ndarray.flags["F_CONTIGUOUS"]
+        assert not dta2._ndarray.flags["C_CONTIGUOUS"]
+        tm.assert_extension_array_equal(dta, dta2)
+
+        res3 = dta2._pad_or_backfill(method="pad")
+        tm.assert_extension_array_equal(res3, expected1)
+
+        res4 = dta2._pad_or_backfill(method="backfill")
+        tm.assert_extension_array_equal(res4, expected2)
+
+        # test the DataFrame method while we're here
+        df = pd.DataFrame(dta)
+        res = df.ffill()
+        expected = pd.DataFrame(expected1)
+        tm.assert_frame_equal(res, expected)
+
+        res = df.bfill()
+        expected = pd.DataFrame(expected2)
+        tm.assert_frame_equal(res, expected)
+
+    def test_array_interface_tz(self):
+        tz = "US/Central"
+        data = pd.date_range("2017", periods=2, tz=tz, unit="ns")._data
+        result = np.asarray(data)
+
+        expected = np.array(
+            [
+                pd.Timestamp("2017-01-01T00:00:00", tz=tz),
+                pd.Timestamp("2017-01-02T00:00:00", tz=tz),
+            ],
+            dtype=object,
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.asarray(data, dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.asarray(data, dtype="M8[ns]")
+
+        expected = np.array(
+            ["2017-01-01T06:00:00", "2017-01-02T06:00:00"], dtype="M8[ns]"
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_array_interface(self):
+        data = pd.date_range("2017", periods=2, unit="ns")._data
+        expected = np.array(
+            ["2017-01-01T00:00:00", "2017-01-02T00:00:00"], dtype="datetime64[ns]"
+        )
+
+        result = np.asarray(data)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.asarray(data, dtype=object)
+        expected = np.array(
+            [pd.Timestamp("2017-01-01T00:00:00"), pd.Timestamp("2017-01-02T00:00:00")],
+            dtype=object,
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("index", [True, False])
+    def test_searchsorted_different_tz(self, index):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        arr = pd.DatetimeIndex(data, freq="D")._data.tz_localize("Asia/Tokyo")
+        if index:
+            arr = pd.Index(arr)
+
+        expected = arr.searchsorted(arr[2])
+        result = arr.searchsorted(arr[2].tz_convert("UTC"))
+        assert result == expected
+
+        expected = arr.searchsorted(arr[2:6])
+        result = arr.searchsorted(arr[2:6].tz_convert("UTC"))
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("index", [True, False])
+    def test_searchsorted_tzawareness_compat(self, index):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        arr = pd.DatetimeIndex(data, freq="D")._data
+        if index:
+            arr = pd.Index(arr)
+
+        mismatch = arr.tz_localize("Asia/Tokyo")
+
+        msg = "Cannot compare tz-naive and tz-aware datetime-like objects"
+        with pytest.raises(TypeError, match=msg):
+            arr.searchsorted(mismatch[0])
+        with pytest.raises(TypeError, match=msg):
+            arr.searchsorted(mismatch)
+
+        with pytest.raises(TypeError, match=msg):
+            mismatch.searchsorted(arr[0])
+        with pytest.raises(TypeError, match=msg):
+            mismatch.searchsorted(arr)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            1,
+            np.int64(1),
+            1.0,
+            np.timedelta64("NaT"),
+            pd.Timedelta(days=2),
+            "invalid",
+            np.arange(10, dtype="i8") * 24 * 3600 * 10**9,
+            np.arange(10).view("timedelta64[ns]") * 24 * 3600 * 10**9,
+            pd.Timestamp("2021-01-01").to_period("D"),
+        ],
+    )
+    @pytest.mark.parametrize("index", [True, False])
+    def test_searchsorted_invalid_types(self, other, index):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        arr = pd.DatetimeIndex(data, freq="D")._data
+        if index:
+            arr = pd.Index(arr)
+
+        msg = "|".join(
+            [
+                "searchsorted requires compatible dtype or scalar",
+                "value should be a 'Timestamp', 'NaT', or array of those. Got",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            arr.searchsorted(other)
+
+    def test_shift_fill_value(self):
+        dti = pd.date_range("2016-01-01", periods=3)
+
+        dta = dti._data
+        expected = DatetimeArray._from_sequence(
+            np.roll(dta._ndarray, 1), dtype=dti.dtype
+        )
+
+        fv = dta[-1]
+        for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]:
+            result = dta.shift(1, fill_value=fill_value)
+            tm.assert_datetime_array_equal(result, expected)
+
+        dta = dta.tz_localize("UTC")
+        expected = expected.tz_localize("UTC")
+        fv = dta[-1]
+        for fill_value in [fv, fv.to_pydatetime()]:
+            result = dta.shift(1, fill_value=fill_value)
+            tm.assert_datetime_array_equal(result, expected)
+
+    def test_shift_value_tzawareness_mismatch(self):
+        dti = pd.date_range("2016-01-01", periods=3)
+
+        dta = dti._data
+
+        fv = dta[-1].tz_localize("UTC")
+        for invalid in [fv, fv.to_pydatetime()]:
+            with pytest.raises(TypeError, match="Cannot compare"):
+                dta.shift(1, fill_value=invalid)
+
+        dta = dta.tz_localize("UTC")
+        fv = dta[-1].tz_localize(None)
+        for invalid in [fv, fv.to_pydatetime(), fv.to_datetime64()]:
+            with pytest.raises(TypeError, match="Cannot compare"):
+                dta.shift(1, fill_value=invalid)
+
+    def test_shift_requires_tzmatch(self):
+        # pre-2.0 we required exact tz match, in 2.0 we require just
+        #  matching tzawareness
+        dti = pd.date_range("2016-01-01", periods=3, tz="UTC")
+        dta = dti._data
+
+        fill_value = pd.Timestamp("2020-10-18 18:44", tz="US/Pacific")
+
+        result = dta.shift(1, fill_value=fill_value)
+        expected = dta.shift(1, fill_value=fill_value.tz_convert("UTC"))
+        tm.assert_equal(result, expected)
+
+    def test_tz_localize_t2d(self):
+        dti = pd.date_range("1994-05-12", periods=12, tz="US/Pacific")
+        dta = dti._data.reshape(3, 4)
+        result = dta.tz_localize(None)
+
+        expected = dta.ravel().tz_localize(None).reshape(dta.shape)
+        tm.assert_datetime_array_equal(result, expected)
+
+        roundtrip = expected.tz_localize("US/Pacific")
+        tm.assert_datetime_array_equal(roundtrip, dta)
+
+    @pytest.mark.parametrize(
+        "tz", ["US/Eastern", "dateutil/US/Eastern", "pytz/US/Eastern"]
+    )
+    def test_iter_zoneinfo_fold(self, tz):
+        # GH#49684
+        if tz.startswith("pytz/"):
+            pytz = pytest.importorskip("pytz")
+            tz = pytz.timezone(tz.removeprefix("pytz/"))
+        utc_vals = np.array(
+            [1320552000, 1320555600, 1320559200, 1320562800], dtype=np.int64
+        )
+        utc_vals *= 1_000_000_000
+
+        dta = (
+            DatetimeArray._from_sequence(utc_vals, dtype=np.dtype("M8[ns]"))
+            .tz_localize("UTC")
+            .tz_convert(tz)
+        )
+
+        left = dta[2]
+        right = list(dta)[2]
+        assert str(left) == str(right)
+        # previously there was a bug where with non-pytz right would be
+        #  Timestamp('2011-11-06 01:00:00-0400', tz='US/Eastern')
+        # while left would be
+        #  Timestamp('2011-11-06 01:00:00-0500', tz='US/Eastern')
+        # The .value's would match (so they would compare as equal),
+        #  but the folds would not
+        assert left.utcoffset() == right.utcoffset()
+
+        # The same bug in ints_to_pydatetime affected .astype, so we test
+        #  that here.
+        right2 = dta.astype(object)[2]
+        assert str(left) == str(right2)
+        assert left.utcoffset() == right2.utcoffset()
+
+    @pytest.mark.parametrize(
+        "freq",
+        ["2M", "2SM", "2sm", "2Q", "2Q-SEP", "1Y", "2Y-MAR", "2m", "2q-sep", "2y"],
+    )
+    def test_date_range_frequency_M_Q_Y_raises(self, freq):
+        msg = f"Invalid frequency: {freq}"
+
+        with pytest.raises(ValueError, match=msg):
+            pd.date_range("1/1/2000", periods=4, freq=freq)
+
+    @pytest.mark.parametrize("freq_depr", ["2MIN", "2nS", "2Us"])
+    def test_date_range_uppercase_frequency_deprecated(self, freq_depr):
+        # GH#9586, GH#54939
+        depr_msg = (
+            f"'{freq_depr[1:]}' is deprecated and will be removed in a "
+            f"future version, please use '{freq_depr.lower()[1:]}' instead."
+        )
+
+        expected = pd.date_range("1/1/2000", periods=4, freq=freq_depr.lower())
+        with tm.assert_produces_warning(Pandas4Warning, match=depr_msg):
+            result = pd.date_range("1/1/2000", periods=4, freq=freq_depr)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "freq",
+        [
+            "2ye-mar",
+            "2ys",
+            "2qe",
+            "2qs-feb",
+            "2bqs",
+            "2sms",
+            "2bms",
+            "2cbme",
+            "2me",
+        ],
+    )
+    def test_date_range_lowercase_frequency_raises(self, freq):
+        msg = f"Invalid frequency: {freq}"
+
+        with pytest.raises(ValueError, match=msg):
+            pd.date_range("1/1/2000", periods=4, freq=freq)
+
+    def test_date_range_lowercase_frequency_deprecated(self):
+        # GH#9586, GH#54939
+        depr_msg = "'w' is deprecated and will be removed in a future version"
+
+        expected = pd.date_range("1/1/2000", periods=4, freq="2W")
+        with tm.assert_produces_warning(Pandas4Warning, match=depr_msg):
+            result = pd.date_range("1/1/2000", periods=4, freq="2w")
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("freq", ["1A", "2A-MAR", "2a-mar"])
+    def test_date_range_frequency_A_raises(self, freq):
+        msg = f"Invalid frequency: {freq}"
+
+        with pytest.raises(ValueError, match=msg):
+            pd.date_range("1/1/2000", periods=4, freq=freq)
+
+    @pytest.mark.parametrize("freq", ["2H", "2CBH", "2S"])
+    def test_date_range_uppercase_frequency_raises(self, freq):
+        msg = f"Invalid frequency: {freq}"
+
+        with pytest.raises(ValueError, match=msg):
+            pd.date_range("1/1/2000", periods=4, freq=freq)
+
+
+def test_factorize_sort_without_freq():
+    dta = DatetimeArray._from_sequence([0, 2, 1], dtype="M8[ns]")
+
+    msg = r"call pd.factorize\(obj, sort=True\) instead"
+    with pytest.raises(NotImplementedError, match=msg):
+        dta.factorize(sort=True)
+
+    # Do TimedeltaArray while we're here
+    tda = dta - dta[0]
+    with pytest.raises(NotImplementedError, match=msg):
+        tda.factorize(sort=True)
diff --git a/pandas/tests/arrays/test_ndarray_backed.py b/pandas/tests/arrays/test_ndarray_backed.py
new file mode 100644
index 0000000000000000000000000000000000000000..2af59a03a5b3e774c1c0692399c285f0ec26a1dc
--- /dev/null
+++ b/pandas/tests/arrays/test_ndarray_backed.py
@@ -0,0 +1,76 @@
+"""
+Tests for subclasses of NDArrayBackedExtensionArray
+"""
+
+import numpy as np
+
+from pandas import (
+    CategoricalIndex,
+    date_range,
+)
+from pandas.core.arrays import (
+    Categorical,
+    DatetimeArray,
+    NumpyExtensionArray,
+    TimedeltaArray,
+)
+
+
+class TestEmpty:
+    def test_empty_categorical(self):
+        ci = CategoricalIndex(["a", "b", "c"], ordered=True)
+        dtype = ci.dtype
+
+        # case with int8 codes
+        shape = (4,)
+        result = Categorical._empty(shape, dtype=dtype)
+        assert isinstance(result, Categorical)
+        assert result.shape == shape
+        assert result._ndarray.dtype == np.int8
+
+        # case where repr would segfault if we didn't override base implementation
+        result = Categorical._empty((4096,), dtype=dtype)
+        assert isinstance(result, Categorical)
+        assert result.shape == (4096,)
+        assert result._ndarray.dtype == np.int8
+        repr(result)
+
+        # case with int16 codes
+        ci = CategoricalIndex(list(range(512)) * 4, ordered=False)
+        dtype = ci.dtype
+        result = Categorical._empty(shape, dtype=dtype)
+        assert isinstance(result, Categorical)
+        assert result.shape == shape
+        assert result._ndarray.dtype == np.int16
+
+    def test_empty_dt64tz(self):
+        dti = date_range("2016-01-01", periods=2, tz="Asia/Tokyo")
+        dtype = dti.dtype
+
+        shape = (0,)
+        result = DatetimeArray._empty(shape, dtype=dtype)
+        assert result.dtype == dtype
+        assert isinstance(result, DatetimeArray)
+        assert result.shape == shape
+
+    def test_empty_dt64(self):
+        shape = (3, 9)
+        result = DatetimeArray._empty(shape, dtype="datetime64[ns]")
+        assert isinstance(result, DatetimeArray)
+        assert result.shape == shape
+
+    def test_empty_td64(self):
+        shape = (3, 9)
+        result = TimedeltaArray._empty(shape, dtype="m8[ns]")
+        assert isinstance(result, TimedeltaArray)
+        assert result.shape == shape
+
+    def test_empty_pandas_array(self):
+        arr = NumpyExtensionArray(np.array([1, 2]))
+        dtype = arr.dtype
+
+        shape = (3, 9)
+        result = NumpyExtensionArray._empty(shape, dtype=dtype)
+        assert isinstance(result, NumpyExtensionArray)
+        assert result.dtype == dtype
+        assert result.shape == shape
diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py
new file mode 100644
index 0000000000000000000000000000000000000000..48453ba19e9a1f6971a2e56872ec42f1856d1dd0
--- /dev/null
+++ b/pandas/tests/arrays/test_period.py
@@ -0,0 +1,184 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import iNaT
+from pandas._libs.tslibs.period import IncompatibleFrequency
+
+from pandas.core.dtypes.base import _registry as registry
+from pandas.core.dtypes.dtypes import PeriodDtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays import PeriodArray
+
+# ----------------------------------------------------------------------------
+# Dtype
+
+
+def test_registered():
+    assert PeriodDtype in registry.dtypes
+    result = registry.find("Period[D]")
+    expected = PeriodDtype("D")
+    assert result == expected
+
+
+# ----------------------------------------------------------------------------
+# period_array
+
+
+def test_asi8():
+    result = PeriodArray._from_sequence(["2000", "2001", None], dtype="period[D]").asi8
+    expected = np.array([10957, 11323, iNaT])
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_take_raises():
+    arr = PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]")
+    with pytest.raises(IncompatibleFrequency, match="freq"):
+        arr.take([0, -1], allow_fill=True, fill_value=pd.Period("2000", freq="W"))
+
+    msg = "value should be a 'Period' or 'NaT'. Got 'str' instead"
+    with pytest.raises(TypeError, match=msg):
+        arr.take([0, -1], allow_fill=True, fill_value="foo")
+
+
+def test_fillna_raises():
+    arr = PeriodArray._from_sequence(["2000", "2001", "2002"], dtype="period[D]")
+    with pytest.raises(ValueError, match="Length"):
+        arr.fillna(arr[:2])
+
+
+def test_fillna_copies():
+    arr = PeriodArray._from_sequence(["2000", "2001", "2002"], dtype="period[D]")
+    result = arr.fillna(pd.Period("2000", "D"))
+    assert result is not arr
+
+
+# ----------------------------------------------------------------------------
+# setitem
+
+
+@pytest.mark.parametrize(
+    "key, value, expected",
+    [
+        ([0], pd.Period("2000", "D"), [10957, 1, 2]),
+        ([0], None, [iNaT, 1, 2]),
+        ([0], np.nan, [iNaT, 1, 2]),
+        ([0, 1, 2], pd.Period("2000", "D"), [10957] * 3),
+        (
+            [0, 1, 2],
+            [pd.Period("2000", "D"), pd.Period("2001", "D"), pd.Period("2002", "D")],
+            [10957, 11323, 11688],
+        ),
+    ],
+)
+def test_setitem(key, value, expected):
+    arr = PeriodArray(np.arange(3), dtype="period[D]")
+    expected = PeriodArray(expected, dtype="period[D]")
+    arr[key] = value
+    tm.assert_period_array_equal(arr, expected)
+
+
+def test_setitem_raises_incompatible_freq():
+    arr = PeriodArray(np.arange(3), dtype="period[D]")
+    with pytest.raises(IncompatibleFrequency, match="freq"):
+        arr[0] = pd.Period("2000", freq="Y")
+
+    other = PeriodArray._from_sequence(["2000", "2001"], dtype="period[Y]")
+    with pytest.raises(IncompatibleFrequency, match="freq"):
+        arr[[0, 1]] = other
+
+
+def test_setitem_raises_length():
+    arr = PeriodArray(np.arange(3), dtype="period[D]")
+    with pytest.raises(ValueError, match="length"):
+        arr[[0, 1]] = [pd.Period("2000", freq="D")]
+
+
+def test_setitem_raises_type():
+    arr = PeriodArray(np.arange(3), dtype="period[D]")
+    with pytest.raises(TypeError, match="int"):
+        arr[0] = 1
+
+
+# ----------------------------------------------------------------------------
+# Ops
+
+
+def test_sub_period():
+    arr = PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]")
+    other = pd.Period("2000", freq="M")
+    with pytest.raises(IncompatibleFrequency, match="freq"):
+        arr - other
+
+
+def test_sub_period_overflow():
+    # GH#47538
+    dti = pd.date_range("1677-09-22", periods=2, freq="D")
+    pi = dti.to_period("ns")
+
+    per = pd.Period._from_ordinal(10**14, pi.freq)
+
+    with pytest.raises(OverflowError, match="Overflow in int64 addition"):
+        pi - per
+
+    with pytest.raises(OverflowError, match="Overflow in int64 addition"):
+        per - pi
+
+
+# ----------------------------------------------------------------------------
+# Methods
+
+
+@pytest.mark.parametrize(
+    "other",
+    [
+        pd.Period("2000", freq="h"),
+        PeriodArray._from_sequence(["2000", "2001", "2000"], dtype="period[h]"),
+    ],
+)
+def test_where_different_freq_raises(other):
+    # GH#45768 The PeriodArray method raises, the Series method coerces
+    ser = pd.Series(
+        PeriodArray._from_sequence(["2000", "2001", "2002"], dtype="period[D]")
+    )
+    cond = np.array([True, False, True])
+
+    with pytest.raises(IncompatibleFrequency, match="freq"):
+        ser.array._where(cond, other)
+
+    res = ser.where(cond, other)
+    expected = ser.astype(object).where(cond, other)
+    tm.assert_series_equal(res, expected)
+
+
+# ----------------------------------------------------------------------------
+# Printing
+
+
+def test_repr_small():
+    arr = PeriodArray._from_sequence(["2000", "2001"], dtype="period[D]")
+    result = str(arr)
+    expected = (
+        "<PeriodArray>\n['2000-01-01', '2001-01-01']\nLength: 2, dtype: period[D]"
+    )
+    assert result == expected
+
+
+def test_repr_large():
+    arr = PeriodArray._from_sequence(["2000", "2001"] * 500, dtype="period[D]")
+    result = str(arr)
+    expected = (
+        "<PeriodArray>\n"
+        "['2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
+        "'2000-01-01',\n"
+        " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
+        "'2001-01-01',\n"
+        " ...\n"
+        " '2000-01-01', '2001-01-01', '2000-01-01', '2001-01-01', "
+        "'2000-01-01',\n"
+        " '2001-01-01', '2000-01-01', '2001-01-01', '2000-01-01', "
+        "'2001-01-01']\n"
+        "Length: 1000, dtype: period[D]"
+    )
+    assert result == expected
diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb7c7afdc6ff984c175c7a000e92b84b607c3b70
--- /dev/null
+++ b/pandas/tests/arrays/test_timedeltas.py
@@ -0,0 +1,312 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Timedelta
+import pandas._testing as tm
+from pandas.core.arrays import (
+    DatetimeArray,
+    TimedeltaArray,
+)
+
+
+class TestNonNano:
+    @pytest.fixture(params=["s", "ms", "us"])
+    def unit(self, request):
+        return request.param
+
+    @pytest.fixture
+    def tda(self, unit):
+        arr = np.arange(5, dtype=np.int64).view(f"m8[{unit}]")
+        return TimedeltaArray._simple_new(arr, dtype=arr.dtype)
+
+    def test_non_nano(self, unit):
+        arr = np.arange(5, dtype=np.int64).view(f"m8[{unit}]")
+        tda = TimedeltaArray._simple_new(arr, dtype=arr.dtype)
+
+        assert tda.dtype == arr.dtype
+        assert tda[0].unit == unit
+
+    def test_as_unit_raises(self, tda):
+        # GH#50616
+        with pytest.raises(ValueError, match="Supported units"):
+            tda.as_unit("D")
+
+        tdi = pd.Index(tda)
+        with pytest.raises(ValueError, match="Supported units"):
+            tdi.as_unit("D")
+
+    @pytest.mark.parametrize("field", TimedeltaArray._field_ops)
+    def test_fields(self, tda, field):
+        as_nano = tda._ndarray.astype("m8[ns]")
+        tda_nano = TimedeltaArray._simple_new(as_nano, dtype=as_nano.dtype)
+
+        result = getattr(tda, field)
+        expected = getattr(tda_nano, field)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_to_pytimedelta(self, tda):
+        as_nano = tda._ndarray.astype("m8[ns]")
+        tda_nano = TimedeltaArray._simple_new(as_nano, dtype=as_nano.dtype)
+
+        result = tda.to_pytimedelta()
+        expected = tda_nano.to_pytimedelta()
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_total_seconds(self, unit, tda):
+        as_nano = tda._ndarray.astype("m8[ns]")
+        tda_nano = TimedeltaArray._simple_new(as_nano, dtype=as_nano.dtype)
+
+        result = tda.total_seconds()
+        expected = tda_nano.total_seconds()
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_timedelta_array_total_seconds(self):
+        # GH34290
+        expected = Timedelta("2 min").total_seconds()
+
+        result = pd.array([Timedelta("2 min")]).total_seconds()[0]
+        assert result == expected
+
+    def test_total_seconds_nanoseconds(self):
+        # issue #48521
+        start_time = pd.Series(["2145-11-02 06:00:00"]).astype("datetime64[ns]")
+        end_time = pd.Series(["2145-11-02 07:06:00"]).astype("datetime64[ns]")
+        expected = (end_time - start_time).values / np.timedelta64(1, "s")
+        result = (end_time - start_time).dt.total_seconds().values
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "nat", [np.datetime64("NaT", "ns"), np.datetime64("NaT", "us")]
+    )
+    def test_add_nat_datetimelike_scalar(self, nat, tda):
+        result = tda + nat
+        assert isinstance(result, DatetimeArray)
+        assert result._creso == tda._creso
+        assert result.isna().all()
+
+        result = nat + tda
+        assert isinstance(result, DatetimeArray)
+        assert result._creso == tda._creso
+        assert result.isna().all()
+
+    def test_add_pdnat(self, tda):
+        result = tda + pd.NaT
+        assert isinstance(result, TimedeltaArray)
+        assert result._creso == tda._creso
+        assert result.isna().all()
+
+        result = pd.NaT + tda
+        assert isinstance(result, TimedeltaArray)
+        assert result._creso == tda._creso
+        assert result.isna().all()
+
+    # TODO: 2022-07-11 this is the only test that gets to DTA.tz_convert
+    #  or tz_localize with non-nano; implement tests specific to that.
+    def test_add_datetimelike_scalar(self, tda, tz_naive_fixture):
+        ts = pd.Timestamp("2016-01-01", tz=tz_naive_fixture).as_unit("ns")
+
+        expected = tda.as_unit("ns") + ts
+        res = tda + ts
+        tm.assert_extension_array_equal(res, expected)
+        res = ts + tda
+        tm.assert_extension_array_equal(res, expected)
+
+        ts += Timedelta(1)  # case where we can't cast losslessly
+
+        exp_values = tda._ndarray + ts.asm8
+        expected = (
+            DatetimeArray._simple_new(exp_values, dtype=exp_values.dtype)
+            .tz_localize("UTC")
+            .tz_convert(ts.tz)
+        )
+
+        result = tda + ts
+        tm.assert_extension_array_equal(result, expected)
+
+        result = ts + tda
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_mul_scalar(self, tda):
+        other = 2
+        result = tda * other
+        expected = TimedeltaArray._simple_new(tda._ndarray * other, dtype=tda.dtype)
+        tm.assert_extension_array_equal(result, expected)
+        assert result._creso == tda._creso
+
+    def test_mul_listlike(self, tda):
+        other = np.arange(len(tda))
+        result = tda * other
+        expected = TimedeltaArray._simple_new(tda._ndarray * other, dtype=tda.dtype)
+        tm.assert_extension_array_equal(result, expected)
+        assert result._creso == tda._creso
+
+    def test_mul_listlike_object(self, tda):
+        other = np.arange(len(tda))
+        result = tda * other.astype(object)
+        expected = TimedeltaArray._simple_new(tda._ndarray * other, dtype=tda.dtype)
+        tm.assert_extension_array_equal(result, expected)
+        assert result._creso == tda._creso
+
+    def test_div_numeric_scalar(self, tda):
+        other = 2
+        result = tda / other
+        expected = TimedeltaArray._simple_new(tda._ndarray / other, dtype=tda.dtype)
+        tm.assert_extension_array_equal(result, expected)
+        assert result._creso == tda._creso
+
+    def test_div_td_scalar(self, tda):
+        other = timedelta(seconds=1)
+        result = tda / other
+        expected = tda._ndarray / np.timedelta64(1, "s")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_div_numeric_array(self, tda):
+        other = np.arange(len(tda))
+        result = tda / other
+        expected = TimedeltaArray._simple_new(tda._ndarray / other, dtype=tda.dtype)
+        tm.assert_extension_array_equal(result, expected)
+        assert result._creso == tda._creso
+
+    def test_div_td_array(self, tda):
+        other = tda._ndarray + tda._ndarray[-1]
+        result = tda / other
+        expected = tda._ndarray / other
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_add_timedeltaarraylike(self, tda):
+        tda_nano = tda.astype("m8[ns]")
+
+        expected = tda_nano * 2
+        res = tda_nano + tda
+        tm.assert_extension_array_equal(res, expected)
+        res = tda + tda_nano
+        tm.assert_extension_array_equal(res, expected)
+
+        expected = tda_nano * 0
+        res = tda - tda_nano
+        tm.assert_extension_array_equal(res, expected)
+
+        res = tda_nano - tda
+        tm.assert_extension_array_equal(res, expected)
+
+
+class TestTimedeltaArray:
+    def test_astype_int(self, any_int_numpy_dtype):
+        arr = TimedeltaArray._from_sequence(
+            [Timedelta("1h"), Timedelta("2h")], dtype="m8[ns]"
+        )
+
+        if np.dtype(any_int_numpy_dtype) != np.int64:
+            with pytest.raises(TypeError, match=r"Do obj.astype\('int64'\)"):
+                arr.astype(any_int_numpy_dtype)
+            return
+
+        result = arr.astype(any_int_numpy_dtype)
+        expected = arr._ndarray.view("i8")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_setitem_clears_freq(self):
+        a = pd.timedelta_range("1h", periods=2, freq="h")._data
+        a[0] = Timedelta("1h")
+        assert a.freq is None
+
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            Timedelta(seconds=1),
+            Timedelta(seconds=1).to_timedelta64(),
+            Timedelta(seconds=1).to_pytimedelta(),
+        ],
+    )
+    def test_setitem_objects(self, obj):
+        # make sure we accept timedelta64 and timedelta in addition to Timedelta
+        tdi = pd.timedelta_range("2 Days", periods=4, freq="h")
+        arr = tdi._data
+
+        arr[0] = obj
+        assert arr[0] == Timedelta(seconds=1)
+
+    @pytest.mark.parametrize(
+        "other",
+        [
+            1,
+            np.int64(1),
+            1.0,
+            np.datetime64("NaT"),
+            pd.Timestamp("2021-01-01"),
+            "invalid",
+            np.arange(10, dtype="i8") * 24 * 3600 * 10**9,
+            (np.arange(10) * 24 * 3600 * 10**9).view("datetime64[ns]"),
+            pd.Timestamp("2021-01-01").to_period("D"),
+        ],
+    )
+    @pytest.mark.parametrize("index", [True, False])
+    def test_searchsorted_invalid_types(self, other, index):
+        data = np.arange(10, dtype="i8") * 24 * 3600 * 10**9
+        arr = pd.TimedeltaIndex(data, freq="D")._data
+        if index:
+            arr = pd.Index(arr)
+
+        msg = "|".join(
+            [
+                "searchsorted requires compatible dtype or scalar",
+                "value should be a 'Timedelta', 'NaT', or array of those. Got",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            arr.searchsorted(other)
+
+
+class TestUnaryOps:
+    def test_abs(self):
+        vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
+
+        evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
+        expected = TimedeltaArray._from_sequence(evals, dtype=evals.dtype)
+
+        result = abs(arr)
+        tm.assert_timedelta_array_equal(result, expected)
+
+        result2 = np.abs(arr)
+        tm.assert_timedelta_array_equal(result2, expected)
+
+    def test_pos(self):
+        vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
+
+        result = +arr
+        tm.assert_timedelta_array_equal(result, arr)
+        assert not tm.shares_memory(result, arr)
+
+        result2 = np.positive(arr)
+        tm.assert_timedelta_array_equal(result2, arr)
+        assert not tm.shares_memory(result2, arr)
+
+    def test_neg(self):
+        vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
+
+        evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]")
+        expected = TimedeltaArray._from_sequence(evals)
+
+        result = -arr
+        tm.assert_timedelta_array_equal(result, expected)
+
+        result2 = np.negative(arr)
+        tm.assert_timedelta_array_equal(result2, expected)
+
+    def test_neg_freq(self):
+        tdi = pd.timedelta_range("2 Days", periods=4, freq="h")
+        arr = tdi._data
+
+        expected = -tdi._data
+
+        result = -arr
+        tm.assert_timedelta_array_equal(result, expected)
+
+        result2 = np.negative(arr)
+        tm.assert_timedelta_array_equal(result2, expected)
diff --git a/pandas/tests/computation/__init__.py b/pandas/tests/computation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py
new file mode 100644
index 0000000000000000000000000000000000000000..856a5b3a22a95d35cc577050f52d762b065e3ddf
--- /dev/null
+++ b/pandas/tests/computation/test_compat.py
@@ -0,0 +1,32 @@
+import pytest
+
+from pandas.compat._optional import VERSIONS
+
+import pandas as pd
+from pandas.core.computation import expr
+from pandas.core.computation.engines import ENGINES
+from pandas.util.version import Version
+
+
+def test_compat():
+    # test we have compat with our version of numexpr
+
+    from pandas.core.computation.check import NUMEXPR_INSTALLED
+
+    ne = pytest.importorskip("numexpr")
+
+    ver = ne.__version__
+    if Version(ver) < Version(VERSIONS["numexpr"]):
+        assert not NUMEXPR_INSTALLED
+    else:
+        assert NUMEXPR_INSTALLED
+
+
+@pytest.mark.parametrize("engine", ENGINES)
+@pytest.mark.parametrize("parser", expr.PARSERS)
+def test_invalid_numexpr_version(engine, parser):
+    if engine == "numexpr":
+        pytest.importorskip("numexpr")
+    a, b = 1, 2  # noqa: F841
+    res = pd.eval("a + b", engine=engine, parser=parser)
+    assert res == 3
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..e14b997310baa321f85090e6c4dfc9068867e891
--- /dev/null
+++ b/pandas/tests/computation/test_eval.py
@@ -0,0 +1,2044 @@
+from __future__ import annotations
+
+from functools import reduce
+from itertools import product
+import operator
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    PY312,
+    PY314,
+)
+from pandas.compat._optional import import_optional_dependency
+from pandas.errors import (
+    NumExprClobberingError,
+    PerformanceWarning,
+    UndefinedVariableError,
+)
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_float,
+    is_list_like,
+    is_scalar,
+)
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    date_range,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.core.computation import (
+    expr,
+    pytables,
+)
+from pandas.core.computation.engines import ENGINES
+from pandas.core.computation.expr import (
+    BaseExprVisitor,
+    PandasExprVisitor,
+    PythonExprVisitor,
+)
+from pandas.core.computation.expressions import (
+    NUMEXPR_INSTALLED,
+    USE_NUMEXPR,
+)
+from pandas.core.computation.ops import (
+    ARITH_OPS_SYMS,
+    _binary_math_ops,
+    _binary_ops_dict,
+    _unary_math_ops,
+)
+from pandas.core.computation.scope import DEFAULT_GLOBALS
+from pandas.util.version import Version
+
+numexpr = import_optional_dependency("numexpr", errors="ignore")
+
+
+@pytest.fixture(
+    params=(
+        pytest.param(
+            engine,
+            marks=[
+                pytest.mark.skipif(
+                    engine == "numexpr" and not USE_NUMEXPR,
+                    reason=f"numexpr enabled->{USE_NUMEXPR}, "
+                    f"installed->{NUMEXPR_INSTALLED}",
+                ),
+                td.skip_if_no("numexpr"),
+            ],
+        )
+        for engine in ENGINES
+    )
+)
+def engine(request):
+    return request.param
+
+
+@pytest.fixture(params=expr.PARSERS)
+def parser(request):
+    return request.param
+
+
+def _eval_single_bin(lhs, cmp1, rhs, engine):
+    c = _binary_ops_dict[cmp1]
+    if ENGINES[engine].has_neg_frac:
+        try:
+            return c(lhs, rhs)
+        except ValueError as e:
+            if str(e).startswith(
+                "negative number cannot be raised to a fractional power"
+            ):
+                return np.nan
+            raise
+    return c(lhs, rhs)
+
+
+# TODO: using range(5) here is a kludge
+@pytest.fixture(
+    params=list(range(5)),
+    ids=["DataFrame", "Series", "SeriesNaN", "DataFrameNaN", "float"],
+)
+def lhs(request):
+    rng = np.random.default_rng(2)
+    if request.param == 0:
+        return DataFrame(rng.standard_normal((10, 5)))
+    elif request.param == 1:
+        return Series(rng.standard_normal(5))
+    elif request.param == 2:
+        return Series([1, 2, np.nan, np.nan, 5])
+    elif request.param == 3:
+        nan_df1 = DataFrame(rng.standard_normal((10, 5)))
+        nan_df1[nan_df1 > 0.5] = np.nan
+        return nan_df1
+    elif request.param == 4:
+        return rng.standard_normal()
+    else:
+        raise ValueError(f"{request.param}")
+
+
+rhs = lhs
+midhs = lhs
+
+
+@pytest.fixture
+def idx_func_dict():
+    return {
+        "i": lambda n: Index(np.arange(n), dtype=np.int64),
+        "f": lambda n: Index(np.arange(n), dtype=np.float64),
+        "s": lambda n: Index([f"{i}_{chr(i)}" for i in range(97, 97 + n)]),
+        "dt": lambda n: date_range("2020-01-01", periods=n),
+        "td": lambda n: timedelta_range("1 day", periods=n),
+        "p": lambda n: period_range("2020-01-01", periods=n, freq="D"),
+    }
+
+
+class TestEval:
+    @pytest.mark.parametrize(
+        "cmp1",
+        ["!=", "==", "<=", ">=", "<", ">"],
+        ids=["ne", "eq", "le", "ge", "lt", "gt"],
+    )
+    @pytest.mark.parametrize("cmp2", [">", "<"], ids=["gt", "lt"])
+    @pytest.mark.parametrize("binop", expr.BOOL_OPS_SYMS)
+    def test_complex_cmp_ops(self, cmp1, cmp2, binop, lhs, rhs, engine, parser):
+        if parser == "python" and binop in ["and", "or"]:
+            msg = "'BoolOp' nodes are not implemented"
+            ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(ex, engine=engine, parser=parser)
+            return
+
+        lhs_new = _eval_single_bin(lhs, cmp1, rhs, engine)
+        rhs_new = _eval_single_bin(lhs, cmp2, rhs, engine)
+        expected = _eval_single_bin(lhs_new, binop, rhs_new, engine)
+
+        ex = f"(lhs {cmp1} rhs) {binop} (lhs {cmp2} rhs)"
+        result = pd.eval(ex, engine=engine, parser=parser)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("cmp_op", expr.CMP_OPS_SYMS)
+    def test_simple_cmp_ops(self, cmp_op, lhs, rhs, engine, parser):
+        lhs = lhs < 0
+        rhs = rhs < 0
+
+        if parser == "python" and cmp_op in ["in", "not in"]:
+            msg = "'(In|NotIn)' nodes are not implemented"
+            ex = f"lhs {cmp_op} rhs"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(ex, engine=engine, parser=parser)
+            return
+
+        ex = f"lhs {cmp_op} rhs"
+        msg = "|".join(
+            [
+                r"only list-like( or dict-like)? objects are allowed to be "
+                r"passed to (DataFrame\.)?isin\(\), you passed a "
+                r"(`|')bool(`|')",
+                "argument of type 'bool' is not .*",
+            ]
+        )
+        if cmp_op in ("in", "not in") and not is_list_like(rhs):
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(
+                    ex,
+                    engine=engine,
+                    parser=parser,
+                    local_dict={"lhs": lhs, "rhs": rhs},
+                )
+        else:
+            expected = _eval_single_bin(lhs, cmp_op, rhs, engine)
+            result = pd.eval(ex, engine=engine, parser=parser)
+            tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("op", expr.CMP_OPS_SYMS)
+    def test_compound_invert_op(self, op, lhs, rhs, request, engine, parser):
+        if parser == "python" and op in ["in", "not in"]:
+            msg = "'(In|NotIn)' nodes are not implemented"
+            ex = f"~(lhs {op} rhs)"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(ex, engine=engine, parser=parser)
+            return
+
+        if (
+            is_float(lhs)
+            and not is_float(rhs)
+            and op in ["in", "not in"]
+            and engine == "python"
+            and parser == "pandas"
+        ):
+            mark = pytest.mark.xfail(
+                reason="Looks like expected is negative, unclear whether "
+                "expected is incorrect or result is incorrect"
+            )
+            request.applymarker(mark)
+        skip_these = ["in", "not in"]
+        ex = f"~(lhs {op} rhs)"
+
+        msg = "|".join(
+            [
+                r"only list-like( or dict-like)? objects are allowed to be "
+                r"passed to (DataFrame\.)?isin\(\), you passed a "
+                r"(`|')float(`|')",
+                "argument of type 'float' is not .*",
+            ]
+        )
+        if is_scalar(rhs) and op in skip_these:
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(
+                    ex,
+                    engine=engine,
+                    parser=parser,
+                    local_dict={"lhs": lhs, "rhs": rhs},
+                )
+        else:
+            # compound
+            if is_scalar(lhs) and is_scalar(rhs):
+                lhs, rhs = (np.array([x]) for x in (lhs, rhs))
+            expected = _eval_single_bin(lhs, op, rhs, engine)
+            if is_scalar(expected):
+                expected = not expected
+            else:
+                expected = ~expected
+            result = pd.eval(ex, engine=engine, parser=parser)
+            tm.assert_almost_equal(expected, result)
+
+    @pytest.mark.parametrize("cmp1", ["<", ">"])
+    @pytest.mark.parametrize("cmp2", ["<", ">"])
+    def test_chained_cmp_op(self, cmp1, cmp2, lhs, midhs, rhs, engine, parser):
+        mid = midhs
+        if parser == "python":
+            ex1 = f"lhs {cmp1} mid {cmp2} rhs"
+            msg = "'BoolOp' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(ex1, engine=engine, parser=parser)
+            return
+
+        lhs_new = _eval_single_bin(lhs, cmp1, mid, engine)
+        rhs_new = _eval_single_bin(mid, cmp2, rhs, engine)
+
+        if lhs_new is not None and rhs_new is not None:
+            ex1 = f"lhs {cmp1} mid {cmp2} rhs"
+            ex2 = f"lhs {cmp1} mid and mid {cmp2} rhs"
+            ex3 = f"(lhs {cmp1} mid) & (mid {cmp2} rhs)"
+            expected = _eval_single_bin(lhs_new, "&", rhs_new, engine)
+
+            for ex in (ex1, ex2, ex3):
+                result = pd.eval(ex, engine=engine, parser=parser)
+
+                tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arith1", sorted(set(ARITH_OPS_SYMS).difference({"**", "//", "%"}))
+    )
+    def test_binary_arith_ops(self, arith1, lhs, rhs, engine, parser):
+        ex = f"lhs {arith1} rhs"
+        result = pd.eval(ex, engine=engine, parser=parser)
+        expected = _eval_single_bin(lhs, arith1, rhs, engine)
+
+        tm.assert_almost_equal(result, expected)
+        ex = f"lhs {arith1} rhs {arith1} rhs"
+        result = pd.eval(ex, engine=engine, parser=parser)
+        nlhs = _eval_single_bin(lhs, arith1, rhs, engine)
+        try:
+            nlhs, ghs = nlhs.align(rhs)
+        except (ValueError, TypeError, AttributeError):
+            # ValueError: series frame or frame series align
+            # TypeError, AttributeError: series or frame with scalar align
+            return
+        else:
+            if engine == "numexpr":
+                import numexpr as ne
+
+                # direct numpy comparison
+                expected = ne.evaluate(f"nlhs {arith1} ghs")
+                # Update assert statement due to unreliable numerical
+                # precision component (GH37328)
+                # TODO: update testing code so that assert_almost_equal statement
+                #  can be replaced again by the assert_numpy_array_equal statement
+                tm.assert_almost_equal(result.values, expected)
+            else:
+                expected = eval(f"nlhs {arith1} ghs")
+                tm.assert_almost_equal(result, expected)
+
+    # modulus, pow, and floor division require special casing
+
+    def test_modulus(self, lhs, rhs, engine, parser):
+        ex = r"lhs % rhs"
+        result = pd.eval(ex, engine=engine, parser=parser)
+        expected = lhs % rhs
+        tm.assert_almost_equal(result, expected)
+
+        if engine == "numexpr":
+            import numexpr as ne
+
+            expected = ne.evaluate(r"expected % rhs")
+            if isinstance(result, (DataFrame, Series)):
+                tm.assert_almost_equal(result.values, expected)
+            else:
+                tm.assert_almost_equal(result, expected.item())
+        else:
+            expected = _eval_single_bin(expected, "%", rhs, engine)
+            tm.assert_almost_equal(result, expected)
+
+    def test_floor_division(self, lhs, rhs, engine, parser):
+        ex = "lhs // rhs"
+
+        if engine == "python" or (
+            engine == "numexpr" and Version(numexpr.__version__) >= Version("2.13.0")
+        ):
+            res = pd.eval(ex, engine=engine, parser=parser)
+            expected = lhs // rhs
+            tm.assert_equal(res, expected)
+        else:
+            msg = (
+                r"unsupported operand type\(s\) for //: 'VariableNode' and "
+                "'VariableNode'"
+            )
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(
+                    ex,
+                    local_dict={"lhs": lhs, "rhs": rhs},
+                    engine=engine,
+                    parser=parser,
+                )
+
+    @td.skip_if_windows
+    def test_pow(self, lhs, rhs, engine, parser):
+        # odd failure on win32 platform, so skip
+        ex = "lhs ** rhs"
+        expected = _eval_single_bin(lhs, "**", rhs, engine)
+        result = pd.eval(ex, engine=engine, parser=parser)
+
+        if (
+            is_scalar(lhs)
+            and is_scalar(rhs)
+            and isinstance(expected, (complex, np.complexfloating))
+            and np.isnan(result)
+        ):
+            msg = "(DataFrame.columns|numpy array) are different"
+            with pytest.raises(AssertionError, match=msg):
+                tm.assert_numpy_array_equal(result, expected)
+        else:
+            tm.assert_almost_equal(result, expected)
+
+            ex = "(lhs ** rhs) ** rhs"
+            result = pd.eval(ex, engine=engine, parser=parser)
+
+            middle = _eval_single_bin(lhs, "**", rhs, engine)
+            expected = _eval_single_bin(middle, "**", rhs, engine)
+            tm.assert_almost_equal(result, expected)
+
+    def test_check_single_invert_op(self, lhs, engine, parser):
+        # simple
+        try:
+            elb = lhs.astype(bool)
+        except AttributeError:
+            elb = np.array([bool(lhs)])
+        expected = ~elb
+        result = pd.eval("~elb", engine=engine, parser=parser)
+        tm.assert_almost_equal(expected, result)
+
+    def test_frame_invert(self, engine, parser):
+        expr = "~lhs"
+
+        # ~ ##
+        # frame
+        # float always raises
+        lhs = DataFrame(np.random.default_rng(2).standard_normal((5, 2)))
+        if engine == "numexpr":
+            msg = "couldn't find matching opcode for 'invert_dd'"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            msg = "ufunc 'invert' not supported for the input types"
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+
+        # int raises on numexpr
+        lhs = DataFrame(np.random.default_rng(2).integers(5, size=(5, 2)))
+        if engine == "numexpr" and Version(numexpr.__version__) < Version("2.13.0"):
+            msg = "couldn't find matching opcode for 'invert"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            expect = ~lhs
+            result = pd.eval(expr, engine=engine, parser=parser)
+            tm.assert_frame_equal(expect, result)
+
+        # bool always works
+        lhs = DataFrame(np.random.default_rng(2).standard_normal((5, 2)) > 0.5)
+        expect = ~lhs
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_frame_equal(expect, result)
+
+        # object raises
+        lhs = DataFrame(
+            {"b": ["a", 1, 2.0], "c": np.random.default_rng(2).standard_normal(3) > 0.5}
+        )
+        if engine == "numexpr":
+            with pytest.raises(ValueError, match="unknown type object"):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            msg = "bad operand type for unary ~: 'str'"
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+
+    def test_series_invert(self, engine, parser):
+        # ~ ####
+        expr = "~lhs"
+
+        # series
+        # float raises
+        lhs = Series(np.random.default_rng(2).standard_normal(5))
+        if engine == "numexpr":
+            msg = "couldn't find matching opcode for 'invert_dd'"
+            with pytest.raises(NotImplementedError, match=msg):
+                result = pd.eval(expr, engine=engine, parser=parser)
+        else:
+            msg = "ufunc 'invert' not supported for the input types"
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+
+        # int raises on numexpr
+        lhs = Series(np.random.default_rng(2).integers(5, size=5))
+        if engine == "numexpr" and Version(numexpr.__version__) < Version("2.13.0"):
+            msg = "couldn't find matching opcode for 'invert"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            expect = ~lhs
+            result = pd.eval(expr, engine=engine, parser=parser)
+            tm.assert_series_equal(expect, result)
+
+        # bool
+        lhs = Series(np.random.default_rng(2).standard_normal(5) > 0.5)
+        expect = ~lhs
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_series_equal(expect, result)
+
+        # float
+        # int
+        # bool
+
+        # object
+        lhs = Series(["a", 1, 2.0])
+        if engine == "numexpr":
+            with pytest.raises(ValueError, match="unknown type object"):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            msg = "bad operand type for unary ~: 'str'"
+            with pytest.raises(TypeError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+
+    def test_frame_negate(self, engine, parser):
+        expr = "-lhs"
+
+        # float
+        lhs = DataFrame(np.random.default_rng(2).standard_normal((5, 2)))
+        expect = -lhs
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_frame_equal(expect, result)
+
+        # int
+        lhs = DataFrame(np.random.default_rng(2).integers(5, size=(5, 2)))
+        expect = -lhs
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_frame_equal(expect, result)
+
+        # bool doesn't work with numexpr but works elsewhere
+        lhs = DataFrame(np.random.default_rng(2).standard_normal((5, 2)) > 0.5)
+        if engine == "numexpr":
+            msg = "couldn't find matching opcode for 'neg_bb'"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            expect = -lhs
+            result = pd.eval(expr, engine=engine, parser=parser)
+            tm.assert_frame_equal(expect, result)
+
+    def test_series_negate(self, engine, parser):
+        expr = "-lhs"
+
+        # float
+        lhs = Series(np.random.default_rng(2).standard_normal(5))
+        expect = -lhs
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_series_equal(expect, result)
+
+        # int
+        lhs = Series(np.random.default_rng(2).integers(5, size=5))
+        expect = -lhs
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_series_equal(expect, result)
+
+        # bool doesn't work with numexpr but works elsewhere
+        lhs = Series(np.random.default_rng(2).standard_normal(5) > 0.5)
+        if engine == "numexpr":
+            msg = "couldn't find matching opcode for 'neg_bb'"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(expr, engine=engine, parser=parser)
+        else:
+            expect = -lhs
+            result = pd.eval(expr, engine=engine, parser=parser)
+            tm.assert_series_equal(expect, result)
+
+    @pytest.mark.parametrize(
+        "lhs",
+        [
+            # Float
+            np.random.default_rng(2).standard_normal((5, 2)),
+            # Int
+            np.random.default_rng(2).integers(5, size=(5, 2)),
+            # bool doesn't work with numexpr but works elsewhere
+            np.array([True, False, True, False, True], dtype=np.bool_),
+        ],
+    )
+    def test_frame_pos(self, lhs, engine, parser):
+        lhs = DataFrame(lhs)
+        expr = "+lhs"
+        expect = lhs
+
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_frame_equal(expect, result)
+
+    @pytest.mark.parametrize(
+        "lhs",
+        [
+            # Float
+            np.random.default_rng(2).standard_normal(5),
+            # Int
+            np.random.default_rng(2).integers(5, size=5),
+            # bool doesn't work with numexpr but works elsewhere
+            np.array([True, False, True, False, True], dtype=np.bool_),
+        ],
+    )
+    def test_series_pos(self, lhs, engine, parser):
+        lhs = Series(lhs)
+        expr = "+lhs"
+        expect = lhs
+
+        result = pd.eval(expr, engine=engine, parser=parser)
+        tm.assert_series_equal(expect, result)
+
+    def test_scalar_unary(self, engine, parser):
+        msg = "bad operand type for unary ~: 'float'"
+        warn = None
+        if (PY314 and engine == "numexpr" and parser == "pandas") or (
+            PY312 and not (engine == "numexpr" and parser == "pandas")
+        ):
+            warn = DeprecationWarning
+        with pytest.raises(TypeError, match=msg):
+            pd.eval("~1.0", engine=engine, parser=parser)
+
+        assert pd.eval("-1.0", parser=parser, engine=engine) == -1.0
+        assert pd.eval("+1.0", parser=parser, engine=engine) == +1.0
+        assert pd.eval("~1", parser=parser, engine=engine) == ~1
+        assert pd.eval("-1", parser=parser, engine=engine) == -1
+        assert pd.eval("+1", parser=parser, engine=engine) == +1
+        with tm.assert_produces_warning(
+            warn, match="Bitwise inversion", check_stacklevel=False
+        ):
+            assert pd.eval("~True", parser=parser, engine=engine) == ~True
+        with tm.assert_produces_warning(
+            warn, match="Bitwise inversion", check_stacklevel=False
+        ):
+            assert pd.eval("~False", parser=parser, engine=engine) == ~False
+        assert pd.eval("-True", parser=parser, engine=engine) == -True
+        assert pd.eval("-False", parser=parser, engine=engine) == -False
+        assert pd.eval("+True", parser=parser, engine=engine) == +True
+        assert pd.eval("+False", parser=parser, engine=engine) == +False
+
+    def test_unary_in_array(self):
+        # GH 11235
+        # TODO: 2022-01-29: result return list with numexpr 2.7.3 in CI
+        # but cannot reproduce locally
+        result = np.array(
+            pd.eval("[-True, True, +True, -False, False, +False, -37, 37, ~37, +37]"),
+            dtype=np.object_,
+        )
+        expected = np.array(
+            [
+                -True,
+                True,
+                +True,
+                -False,
+                False,
+                +False,
+                -37,
+                37,
+                ~37,
+                +37,
+            ],
+            dtype=np.object_,
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("expr", ["x < -0.1", "-5 > x"])
+    def test_float_comparison_bin_op(self, float_numpy_dtype, expr):
+        # GH 16363
+        df = DataFrame({"x": np.array([0], dtype=float_numpy_dtype)})
+        res = df.eval(expr)
+        assert res.values == np.array([False])
+
+    def test_unary_in_function(self):
+        # GH 46471
+        df = DataFrame({"x": [0, 1, np.nan]})
+
+        result = df.eval("x.fillna(-1)")
+        expected = df.x.fillna(-1)
+        # column name becomes None if using numexpr
+        # only check names when the engine is not numexpr
+        tm.assert_series_equal(result, expected, check_names=not USE_NUMEXPR)
+
+        result = df.eval("x.shift(1, fill_value=-1)")
+        expected = df.x.shift(1, fill_value=-1)
+        tm.assert_series_equal(result, expected, check_names=not USE_NUMEXPR)
+
+    @pytest.mark.parametrize(
+        "ex",
+        (
+            "1 or 2",
+            "1 and 2",
+            "a and b",
+            "a or b",
+            "1 or 2 and (3 + 2) > 3",
+            "2 * x > 2 or 1 and 2",
+            "2 * df > 3 and 1 or a",
+        ),
+    )
+    def test_disallow_scalar_bool_ops(self, ex, engine, parser):
+        x, a, b = np.random.default_rng(2).standard_normal(3), 1, 2  # noqa: F841
+        df = DataFrame(np.random.default_rng(2).standard_normal((3, 2)))  # noqa: F841
+
+        msg = "cannot evaluate scalar only bool ops|'BoolOp' nodes are not"
+        with pytest.raises(NotImplementedError, match=msg):
+            pd.eval(ex, engine=engine, parser=parser)
+
+    def test_identical(self, engine, parser):
+        # see gh-10546
+        x = 1
+        result = pd.eval("x", engine=engine, parser=parser)
+        assert result == 1
+        assert is_scalar(result)
+
+        x = 1.5
+        result = pd.eval("x", engine=engine, parser=parser)
+        assert result == 1.5
+        assert is_scalar(result)
+
+        x = False
+        result = pd.eval("x", engine=engine, parser=parser)
+        assert not result
+        assert is_bool(result)
+        assert is_scalar(result)
+
+        x = np.array([1])
+        result = pd.eval("x", engine=engine, parser=parser)
+        tm.assert_numpy_array_equal(result, np.array([1]))
+        assert result.shape == (1,)
+
+        x = np.array([1.5])
+        result = pd.eval("x", engine=engine, parser=parser)
+        tm.assert_numpy_array_equal(result, np.array([1.5]))
+        assert result.shape == (1,)
+
+        x = np.array([False])  # noqa: F841
+        result = pd.eval("x", engine=engine, parser=parser)
+        tm.assert_numpy_array_equal(result, np.array([False]))
+        assert result.shape == (1,)
+
+    def test_line_continuation(self, engine, parser):
+        # GH 11149
+        exp = """1 + 2 * \
+        5 - 1 + 2 """
+        result = pd.eval(exp, engine=engine, parser=parser)
+        assert result == 12
+
+    def test_float_truncation(self, engine, parser):
+        # GH 14241
+        exp = "1000000000.006"
+        result = pd.eval(exp, engine=engine, parser=parser)
+        expected = np.float64(exp)
+        assert result == expected
+
+        df = DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})
+        cutoff = 1000000000.0006
+        result = df.query(f"A < {cutoff:.4f}")
+        assert result.empty
+
+        cutoff = 1000000000.0010
+        result = df.query(f"A > {cutoff:.4f}")
+        expected = df.loc[[1, 2], :]
+        tm.assert_frame_equal(expected, result)
+
+        exact = 1000000000.0011
+        result = df.query(f"A == {exact:.4f}")
+        expected = df.loc[[1], :]
+        tm.assert_frame_equal(expected, result)
+
+    def test_disallow_python_keywords(self):
+        # GH 18221
+        df = DataFrame([[0, 0, 0]], columns=["foo", "bar", "class"])
+        msg = "Python keyword not valid identifier in numexpr query"
+        with pytest.raises(SyntaxError, match=msg):
+            df.query("class == 0")
+
+        df = DataFrame()
+        df.index.name = "lambda"
+        with pytest.raises(SyntaxError, match=msg):
+            df.query("lambda == 0")
+
+    def test_true_false_logic(self):
+        # GH 25823
+        # This behavior is deprecated in Python 3.12
+        with tm.maybe_produces_warning(
+            DeprecationWarning, PY312, check_stacklevel=False
+        ):
+            assert pd.eval("not True") == -2
+            assert pd.eval("not False") == -1
+            assert pd.eval("True and not True") == 0
+
+    def test_and_logic_string_match(self):
+        # GH 25823
+        event = Series({"a": "hello"})
+        assert pd.eval(f"{event.str.match('hello').a}")
+        assert pd.eval(f"{event.str.match('hello').a and event.str.match('hello').a}")
+
+    def test_eval_keep_name(self, engine, parser):
+        df = Series([2, 15, 28], name="a").to_frame()
+        res = df.eval("a + a", engine=engine, parser=parser)
+        expected = Series([4, 30, 56], name="a")
+        tm.assert_series_equal(expected, res)
+
+    def test_eval_unmatching_names(self, engine, parser):
+        variable_name = Series([42], name="series_name")
+        res = pd.eval("variable_name + 0", engine=engine, parser=parser)
+        tm.assert_series_equal(variable_name, res)
+
+
+# -------------------------------------
+# gh-12388: Typecasting rules consistency with python
+
+
+class TestTypeCasting:
+    @pytest.mark.parametrize("op", ["+", "-", "*", "**", "/"])
+    # maybe someday... numexpr has too many upcasting rules now
+    # chain(*(np.core.sctypes[x] for x in ['uint', 'int', 'float']))
+    @pytest.mark.parametrize("left_right", [("df", "3"), ("3", "df")])
+    def test_binop_typecasting(
+        self, engine, parser, op, complex_or_float_dtype, left_right, request
+    ):
+        # GH#21374
+        dtype = complex_or_float_dtype
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)), dtype=dtype)
+        left, right = left_right
+        s = f"{left} {op} {right}"
+        res = pd.eval(s, engine=engine, parser=parser)
+        if dtype == "complex64" and engine == "numexpr":
+            mark = pytest.mark.xfail(
+                reason="numexpr issue with complex that are upcast "
+                "to complex 128 "
+                "https://github.com/pydata/numexpr/issues/492"
+            )
+            request.applymarker(mark)
+        assert df.values.dtype == dtype
+        assert res.values.dtype == dtype
+        tm.assert_frame_equal(res, eval(s), check_exact=False)
+
+
+# -------------------------------------
+# Basic and complex alignment
+
+
+def should_warn(*args):
+    not_mono = not any(map(operator.attrgetter("is_monotonic_increasing"), args))
+    only_one_dt = reduce(
+        operator.xor, (issubclass(x.dtype.type, np.datetime64) for x in args)
+    )
+    return not_mono and only_one_dt
+
+
+class TestAlignment:
+    index_types = ["i", "s", "dt"]
+    lhs_index_types = [*index_types, "s"]  # 'p'
+
+    def test_align_nested_unary_op(self, engine, parser):
+        s = "df * ~2"
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        res = pd.eval(s, engine=engine, parser=parser)
+        tm.assert_frame_equal(res, df * ~2)
+
+    @pytest.mark.filterwarnings("always::RuntimeWarning")
+    @pytest.mark.parametrize("lr_idx_type", lhs_index_types)
+    @pytest.mark.parametrize("rr_idx_type", index_types)
+    @pytest.mark.parametrize("c_idx_type", index_types)
+    def test_basic_frame_alignment(
+        self, engine, parser, lr_idx_type, rr_idx_type, c_idx_type, idx_func_dict
+    ):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 10)),
+            index=idx_func_dict[lr_idx_type](10),
+            columns=idx_func_dict[c_idx_type](10),
+        )
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((20, 10)),
+            index=idx_func_dict[rr_idx_type](20),
+            columns=idx_func_dict[c_idx_type](10),
+        )
+        # only warns if not monotonic and not sortable
+        if should_warn(df.index, df2.index):
+            with tm.assert_produces_warning(RuntimeWarning):
+                res = pd.eval("df + df2", engine=engine, parser=parser)
+        else:
+            res = pd.eval("df + df2", engine=engine, parser=parser)
+        tm.assert_frame_equal(res, df + df2)
+
+    @pytest.mark.parametrize("r_idx_type", lhs_index_types)
+    @pytest.mark.parametrize("c_idx_type", lhs_index_types)
+    def test_frame_comparison(
+        self, engine, parser, r_idx_type, c_idx_type, idx_func_dict
+    ):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 10)),
+            index=idx_func_dict[r_idx_type](10),
+            columns=idx_func_dict[c_idx_type](10),
+        )
+        res = pd.eval("df < 2", engine=engine, parser=parser)
+        tm.assert_frame_equal(res, df < 2)
+
+        df3 = DataFrame(
+            np.random.default_rng(2).standard_normal(df.shape),
+            index=df.index,
+            columns=df.columns,
+        )
+        res = pd.eval("df < df3", engine=engine, parser=parser)
+        tm.assert_frame_equal(res, df < df3)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize("r1", lhs_index_types)
+    @pytest.mark.parametrize("c1", index_types)
+    @pytest.mark.parametrize("r2", index_types)
+    @pytest.mark.parametrize("c2", index_types)
+    def test_medium_complex_frame_alignment(
+        self, engine, parser, r1, c1, r2, c2, idx_func_dict
+    ):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((3, 2)),
+            index=idx_func_dict[r1](3),
+            columns=idx_func_dict[c1](2),
+        )
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((4, 2)),
+            index=idx_func_dict[r2](4),
+            columns=idx_func_dict[c2](2),
+        )
+        df3 = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)),
+            index=idx_func_dict[r2](5),
+            columns=idx_func_dict[c2](2),
+        )
+        if should_warn(df.index, df2.index, df3.index):
+            with tm.assert_produces_warning(RuntimeWarning):
+                res = pd.eval("df + df2 + df3", engine=engine, parser=parser)
+        else:
+            res = pd.eval("df + df2 + df3", engine=engine, parser=parser)
+        tm.assert_frame_equal(res, df + df2 + df3)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize("index_name", ["index", "columns"])
+    @pytest.mark.parametrize("c_idx_type", index_types)
+    @pytest.mark.parametrize("r_idx_type", lhs_index_types)
+    def test_basic_frame_series_alignment(
+        self, engine, parser, index_name, r_idx_type, c_idx_type, idx_func_dict
+    ):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 10)),
+            index=idx_func_dict[r_idx_type](10),
+            columns=idx_func_dict[c_idx_type](10),
+        )
+        index = getattr(df, index_name)
+        s = Series(np.random.default_rng(2).standard_normal(5), index[:5])
+
+        if should_warn(df.index, s.index):
+            with tm.assert_produces_warning(RuntimeWarning):
+                res = pd.eval("df + s", engine=engine, parser=parser)
+        else:
+            res = pd.eval("df + s", engine=engine, parser=parser)
+
+        if r_idx_type == "dt" or c_idx_type == "dt":
+            expected = df.add(s) if engine == "numexpr" else df + s
+        else:
+            expected = df + s
+        tm.assert_frame_equal(res, expected)
+
+    @pytest.mark.parametrize("index_name", ["index", "columns"])
+    @pytest.mark.parametrize(
+        "r_idx_type, c_idx_type",
+        [*list(product(["i", "s"], ["i", "s"])), ("dt", "dt")],
+    )
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_basic_series_frame_alignment(
+        self, request, engine, parser, index_name, r_idx_type, c_idx_type, idx_func_dict
+    ):
+        if (
+            engine == "numexpr"
+            and parser in ("pandas", "python")
+            and index_name == "index"
+            and r_idx_type == "i"
+            and c_idx_type == "s"
+        ):
+            reason = (
+                f"Flaky column ordering when engine={engine}, "
+                f"parser={parser}, index_name={index_name}, "
+                f"r_idx_type={r_idx_type}, c_idx_type={c_idx_type}"
+            )
+            request.applymarker(pytest.mark.xfail(reason=reason, strict=False))
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 7)),
+            index=idx_func_dict[r_idx_type](10),
+            columns=idx_func_dict[c_idx_type](7),
+        )
+        index = getattr(df, index_name)
+        s = Series(np.random.default_rng(2).standard_normal(5), index[:5])
+        if should_warn(s.index, df.index):
+            with tm.assert_produces_warning(RuntimeWarning):
+                res = pd.eval("s + df", engine=engine, parser=parser)
+        else:
+            res = pd.eval("s + df", engine=engine, parser=parser)
+
+        if r_idx_type == "dt" or c_idx_type == "dt":
+            expected = df.add(s) if engine == "numexpr" else s + df
+        else:
+            expected = s + df
+        tm.assert_frame_equal(res, expected)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize("c_idx_type", index_types)
+    @pytest.mark.parametrize("r_idx_type", lhs_index_types)
+    @pytest.mark.parametrize("index_name", ["index", "columns"])
+    @pytest.mark.parametrize("op", ["+", "*"])
+    def test_series_frame_commutativity(
+        self, engine, parser, index_name, op, r_idx_type, c_idx_type, idx_func_dict
+    ):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 10)),
+            index=idx_func_dict[r_idx_type](10),
+            columns=idx_func_dict[c_idx_type](10),
+        )
+        index = getattr(df, index_name)
+        s = Series(np.random.default_rng(2).standard_normal(5), index[:5])
+
+        lhs = f"s {op} df"
+        rhs = f"df {op} s"
+        if should_warn(df.index, s.index):
+            with tm.assert_produces_warning(RuntimeWarning):
+                a = pd.eval(lhs, engine=engine, parser=parser)
+            with tm.assert_produces_warning(RuntimeWarning):
+                b = pd.eval(rhs, engine=engine, parser=parser)
+        else:
+            a = pd.eval(lhs, engine=engine, parser=parser)
+            b = pd.eval(rhs, engine=engine, parser=parser)
+
+        if r_idx_type != "dt" and c_idx_type != "dt":
+            if engine == "numexpr":
+                tm.assert_frame_equal(a, b)
+
+    @pytest.mark.filterwarnings("always::RuntimeWarning")
+    @pytest.mark.parametrize("r1", lhs_index_types)
+    @pytest.mark.parametrize("c1", index_types)
+    @pytest.mark.parametrize("r2", index_types)
+    @pytest.mark.parametrize("c2", index_types)
+    def test_complex_series_frame_alignment(
+        self, engine, parser, r1, c1, r2, c2, idx_func_dict
+    ):
+        n = 3
+        m1 = 5
+        m2 = 2 * m1
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((m1, n)),
+            index=idx_func_dict[r1](m1),
+            columns=idx_func_dict[c1](n),
+        )
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((m2, n)),
+            index=idx_func_dict[r2](m2),
+            columns=idx_func_dict[c2](n),
+        )
+        index = df2.columns
+        ser = Series(np.random.default_rng(2).standard_normal(n), index[:n])
+
+        if r2 == "dt" or c2 == "dt":
+            if engine == "numexpr":
+                expected2 = df2.add(ser)
+            else:
+                expected2 = df2 + ser
+        else:
+            expected2 = df2 + ser
+
+        if r1 == "dt" or c1 == "dt":
+            if engine == "numexpr":
+                expected = expected2.add(df)
+            else:
+                expected = expected2 + df
+        else:
+            expected = expected2 + df
+
+        if should_warn(df2.index, ser.index, df.index):
+            with tm.assert_produces_warning(RuntimeWarning):
+                res = pd.eval("df2 + ser + df", engine=engine, parser=parser)
+        else:
+            res = pd.eval("df2 + ser + df", engine=engine, parser=parser)
+        assert res.shape == expected.shape
+        tm.assert_frame_equal(res, expected)
+
+    def test_performance_warning_for_poor_alignment(
+        self, performance_warning, engine, parser
+    ):
+        df = DataFrame(np.random.default_rng(2).standard_normal((1000, 10)))
+        s = Series(np.random.default_rng(2).standard_normal(10000))
+        if engine == "numexpr" and performance_warning:
+            seen = PerformanceWarning
+        else:
+            seen = False
+
+        msg = "Alignment difference on axis 1 is larger than an order of magnitude"
+        with tm.assert_produces_warning(seen, match=msg):
+            pd.eval("df + s", engine=engine, parser=parser)
+
+        s = Series(np.random.default_rng(2).standard_normal(1000))
+        with tm.assert_produces_warning(False):
+            pd.eval("df + s", engine=engine, parser=parser)
+
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 10000)))
+        s = Series(np.random.default_rng(2).standard_normal(10000))
+        with tm.assert_produces_warning(False):
+            pd.eval("df + s", engine=engine, parser=parser)
+
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 10)))
+        s = Series(np.random.default_rng(2).standard_normal(10000))
+
+        is_python_engine = engine == "python"
+
+        if not is_python_engine and performance_warning:
+            wrn = PerformanceWarning
+        else:
+            wrn = False
+
+        with tm.assert_produces_warning(wrn, match=msg) as w:
+            pd.eval("df + s", engine=engine, parser=parser)
+
+            if not is_python_engine and performance_warning:
+                assert len(w) == 1
+                msg = str(w[0].message)
+                logged = np.log10(s.size - df.shape[1])
+                expected = (
+                    f"Alignment difference on axis 1 is larger "
+                    f"than an order of magnitude on term 'df', "
+                    f"by more than {logged:.4g}; performance may suffer."
+                )
+                assert msg == expected
+
+
+# ------------------------------------
+# Slightly more complex ops
+
+
+class TestOperations:
+    def eval(self, *args, **kwargs):
+        kwargs["level"] = kwargs.pop("level", 0) + 1
+        return pd.eval(*args, **kwargs)
+
+    def test_simple_arith_ops(self, engine, parser):
+        exclude_arith = []
+        if parser == "python":
+            exclude_arith = ["in", "not in"]
+
+        arith_ops = [
+            op
+            for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS
+            if op not in exclude_arith
+        ]
+
+        ops = (op for op in arith_ops if op != "//")
+
+        for op in ops:
+            ex = f"1 {op} 1"
+            ex2 = f"x {op} 1"
+            ex3 = f"1 {op} (x + 1)"
+
+            if op in ("in", "not in"):
+                msg = "argument of type 'int' is not .*"
+                with pytest.raises(TypeError, match=msg):
+                    pd.eval(ex, engine=engine, parser=parser)
+            else:
+                expec = _eval_single_bin(1, op, 1, engine)
+                x = self.eval(ex, engine=engine, parser=parser)
+                assert x == expec
+
+                expec = _eval_single_bin(x, op, 1, engine)
+                y = self.eval(ex2, local_dict={"x": x}, engine=engine, parser=parser)
+                assert y == expec
+
+                expec = _eval_single_bin(1, op, x + 1, engine)
+                y = self.eval(ex3, local_dict={"x": x}, engine=engine, parser=parser)
+                assert y == expec
+
+    @pytest.mark.parametrize("rhs", [True, False])
+    @pytest.mark.parametrize("lhs", [True, False])
+    @pytest.mark.parametrize("op", expr.BOOL_OPS_SYMS)
+    def test_simple_bool_ops(self, rhs, lhs, op):
+        ex = f"{lhs} {op} {rhs}"
+
+        if parser == "python" and op in ["and", "or"]:
+            msg = "'BoolOp' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                self.eval(ex)
+            return
+
+        res = self.eval(ex)
+        exp = eval(ex)
+        assert res == exp
+
+    @pytest.mark.parametrize("rhs", [True, False])
+    @pytest.mark.parametrize("lhs", [True, False])
+    @pytest.mark.parametrize("op", expr.BOOL_OPS_SYMS)
+    def test_bool_ops_with_constants(self, rhs, lhs, op):
+        ex = f"{lhs} {op} {rhs}"
+
+        if parser == "python" and op in ["and", "or"]:
+            msg = "'BoolOp' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                self.eval(ex)
+            return
+
+        res = self.eval(ex)
+        exp = eval(ex)
+        assert res == exp
+
+    def test_4d_ndarray_fails(self):
+        x = np.random.default_rng(2).standard_normal((3, 4, 5, 6))
+        y = Series(np.random.default_rng(2).standard_normal(10))
+        msg = "N-dimensional objects, where N > 2, are not supported with eval"
+        with pytest.raises(NotImplementedError, match=msg):
+            self.eval("x + y", local_dict={"x": x, "y": y})
+
+    def test_constant(self):
+        x = self.eval("1")
+        assert x == 1
+
+    def test_single_variable(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        df2 = self.eval("df", local_dict={"df": df})
+        tm.assert_frame_equal(df, df2)
+
+    def test_failing_subscript_with_name_error(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))  # noqa: F841
+        with pytest.raises(NameError, match="name 'x' is not defined"):
+            self.eval("df[x > 2] > 2")
+
+    def test_lhs_expression_subscript(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        result = self.eval("(df + 1)[df > 2]", local_dict={"df": df})
+        expected = (df + 1)[df > 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_attr_expression(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)), columns=list("abc")
+        )
+        expr1 = "df.a < df.b"
+        expec1 = df.a < df.b
+        expr2 = "df.a + df.b + df.c"
+        expec2 = df.a + df.b + df.c
+        expr3 = "df.a + df.b + df.c[df.b < 0]"
+        expec3 = df.a + df.b + df.c[df.b < 0]
+        exprs = expr1, expr2, expr3
+        expecs = expec1, expec2, expec3
+        for e, expec in zip(exprs, expecs, strict=True):
+            tm.assert_series_equal(expec, self.eval(e, local_dict={"df": df}))
+
+    def test_assignment_fails(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)), columns=list("abc")
+        )
+        df2 = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        expr1 = "df = df2"
+        msg = "cannot assign without a target object"
+        with pytest.raises(ValueError, match=msg):
+            self.eval(expr1, local_dict={"df": df, "df2": df2})
+
+    def test_assignment_column_multiple_raise(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        # multiple assignees
+        with pytest.raises(SyntaxError, match="invalid syntax"):
+            df.eval("d c = a + b")
+
+    def test_assignment_column_invalid_assign(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        # invalid assignees
+        msg = "left hand side of an assignment must be a single name"
+        with pytest.raises(SyntaxError, match=msg):
+            df.eval("d,c = a + b")
+
+    def test_assignment_column_invalid_assign_function_call(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        msg = "cannot assign to function call"
+        with pytest.raises(SyntaxError, match=msg):
+            df.eval('Timestamp("20131001") = a + b')
+
+    def test_assignment_single_assign_existing(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        # single assignment - existing variable
+        expected = df.copy()
+        expected["a"] = expected["a"] + expected["b"]
+        df.eval("a = a + b", inplace=True)
+        tm.assert_frame_equal(df, expected)
+
+    def test_assignment_single_assign_new(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        # single assignment - new variable
+        expected = df.copy()
+        expected["c"] = expected["a"] + expected["b"]
+        df.eval("c = a + b", inplace=True)
+        tm.assert_frame_equal(df, expected)
+
+    def test_assignment_single_assign_local_overlap(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        df = df.copy()
+        a = 1  # noqa: F841
+        df.eval("a = 1 + b", inplace=True)
+
+        expected = df.copy()
+        expected["a"] = 1 + expected["b"]
+        tm.assert_frame_equal(df, expected)
+
+    def test_assignment_single_assign_name(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+
+        a = 1  # noqa: F841
+        old_a = df.a.copy()
+        df.eval("a = a + b", inplace=True)
+        result = old_a + df.b
+        tm.assert_series_equal(result, df.a, check_names=False)
+        assert result.name is None
+
+    def test_assignment_multiple_raises(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        # multiple assignment
+        df.eval("c = a + b", inplace=True)
+        msg = "can only assign a single expression"
+        with pytest.raises(SyntaxError, match=msg):
+            df.eval("c = a = b")
+
+    def test_assignment_explicit(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+        # explicit targets
+        self.eval("c = df.a + df.b", local_dict={"df": df}, target=df, inplace=True)
+        expected = df.copy()
+        expected["c"] = expected["a"] + expected["b"]
+        tm.assert_frame_equal(df, expected)
+
+    def test_column_in(self, engine):
+        # GH 11235
+        df = DataFrame({"a": [11], "b": [-32]})
+        result = df.eval("a in [11, -32]", engine=engine)
+        expected = Series([True], name="a")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.xfail(reason="Unknown: Omitted test_ in name prior.")
+    def test_assignment_not_inplace(self):
+        # see gh-9297
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=list("ab")
+        )
+
+        actual = df.eval("c = a + b", inplace=False)
+        assert actual is not None
+
+        expected = df.copy()
+        expected["c"] = expected["a"] + expected["b"]
+        tm.assert_frame_equal(df, expected)
+
+    def test_multi_line_expression(self):
+        # GH 11149
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        expected = df.copy()
+
+        expected["c"] = expected["a"] + expected["b"]
+        expected["d"] = expected["c"] + expected["b"]
+        answer = df.eval(
+            """
+        c = a + b
+        d = c + b""",
+            inplace=True,
+        )
+        tm.assert_frame_equal(expected, df)
+        assert answer is None
+
+        expected["a"] = expected["a"] - 1
+        expected["e"] = expected["a"] + 2
+        answer = df.eval(
+            """
+        a = a - 1
+        e = a + 2""",
+            inplace=True,
+        )
+        tm.assert_frame_equal(expected, df)
+        assert answer is None
+
+        # multi-line not valid if not all assignments
+        msg = "Multi-line expressions are only valid if all expressions contain"
+        with pytest.raises(ValueError, match=msg):
+            df.eval(
+                """
+            a = b + 2
+            b - 2""",
+                inplace=False,
+            )
+
+    def test_multi_line_expression_not_inplace(self):
+        # GH 11149
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        expected = df.copy()
+
+        expected["c"] = expected["a"] + expected["b"]
+        expected["d"] = expected["c"] + expected["b"]
+        df = df.eval(
+            """
+        c = a + b
+        d = c + b""",
+            inplace=False,
+        )
+        tm.assert_frame_equal(expected, df)
+
+        expected["a"] = expected["a"] - 1
+        expected["e"] = expected["a"] + 2
+        df = df.eval(
+            """
+        a = a - 1
+        e = a + 2""",
+            inplace=False,
+        )
+        tm.assert_frame_equal(expected, df)
+
+    def test_multi_line_expression_local_variable(self):
+        # GH 15342
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        expected = df.copy()
+
+        local_var = 7
+        expected["c"] = expected["a"] * local_var
+        expected["d"] = expected["c"] + local_var
+        answer = df.eval(
+            """
+        c = a * @local_var
+        d = c + @local_var
+        """,
+            inplace=True,
+        )
+        tm.assert_frame_equal(expected, df)
+        assert answer is None
+
+    def test_multi_line_expression_callable_local_variable(self):
+        # 26426
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+        def local_func(a, b):
+            return b
+
+        expected = df.copy()
+        expected["c"] = expected["a"] * local_func(1, 7)
+        expected["d"] = expected["c"] + local_func(1, 7)
+        answer = df.eval(
+            """
+        c = a * @local_func(1, 7)
+        d = c + @local_func(1, 7)
+        """,
+            inplace=True,
+        )
+        tm.assert_frame_equal(expected, df)
+        assert answer is None
+
+    def test_multi_line_expression_callable_local_variable_with_kwargs(self):
+        # 26426
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+        def local_func(a, b):
+            return b
+
+        expected = df.copy()
+        expected["c"] = expected["a"] * local_func(b=7, a=1)
+        expected["d"] = expected["c"] + local_func(b=7, a=1)
+        answer = df.eval(
+            """
+        c = a * @local_func(b=7, a=1)
+        d = c + @local_func(b=7, a=1)
+        """,
+            inplace=True,
+        )
+        tm.assert_frame_equal(expected, df)
+        assert answer is None
+
+    def test_assignment_in_query(self):
+        # GH 8664
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        df_orig = df.copy()
+        msg = "cannot assign without a target object"
+        with pytest.raises(ValueError, match=msg):
+            df.query("a = 1")
+        tm.assert_frame_equal(df, df_orig)
+
+    def test_query_inplace(self):
+        # see gh-11149
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+        expected = df.copy()
+        expected = expected[expected["a"] == 2]
+        df.query("a == 2", inplace=True)
+        tm.assert_frame_equal(expected, df)
+
+        df = {}
+        expected = {"a": 3}
+
+        self.eval("a = 1 + 2", target=df, inplace=True)
+        tm.assert_dict_equal(df, expected)
+
+    @pytest.mark.parametrize("invalid_target", [1, "cat", [1, 2], np.array([]), (1, 3)])
+    def test_cannot_item_assign(self, invalid_target):
+        msg = "Cannot assign expression output to target"
+        expression = "a = 1 + 2"
+
+        with pytest.raises(ValueError, match=msg):
+            self.eval(expression, target=invalid_target, inplace=True)
+
+        if hasattr(invalid_target, "copy"):
+            with pytest.raises(ValueError, match=msg):
+                self.eval(expression, target=invalid_target, inplace=False)
+
+    @pytest.mark.parametrize("invalid_target", [1, "cat", (1, 3)])
+    def test_cannot_copy_item(self, invalid_target):
+        msg = "Cannot return a copy of the target"
+        expression = "a = 1 + 2"
+
+        with pytest.raises(ValueError, match=msg):
+            self.eval(expression, target=invalid_target, inplace=False)
+
+    @pytest.mark.parametrize("target", [1, "cat", [1, 2], np.array([]), (1, 3), {1: 2}])
+    def test_inplace_no_assignment(self, target):
+        expression = "1 + 2"
+
+        assert self.eval(expression, target=target, inplace=False) == 3
+
+        msg = "Cannot operate inplace if there is no assignment"
+        with pytest.raises(ValueError, match=msg):
+            self.eval(expression, target=target, inplace=True)
+
+    def test_basic_period_index_boolean_expression(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((2, 2)),
+            columns=period_range("2020-01-01", freq="D", periods=2),
+        )
+        e = df < 2
+        r = self.eval("df < 2", local_dict={"df": df})
+        x = df < 2
+
+        tm.assert_frame_equal(r, e)
+        tm.assert_frame_equal(x, e)
+
+    def test_basic_period_index_subscript_expression(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((2, 2)),
+            columns=period_range("2020-01-01", freq="D", periods=2),
+        )
+        r = self.eval("df[df < 2 + 3]", local_dict={"df": df})
+        e = df[df < 2 + 3]
+        tm.assert_frame_equal(r, e)
+
+    def test_nested_period_index_subscript_expression(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((2, 2)),
+            columns=period_range("2020-01-01", freq="D", periods=2),
+        )
+        r = self.eval("df[df[df < 2] < 2] + df * 2", local_dict={"df": df})
+        e = df[df[df < 2] < 2] + df * 2
+        tm.assert_frame_equal(r, e)
+
+    def test_date_boolean(self, engine, parser):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=5)
+        res = self.eval(
+            "df.dates1 < 20130101",
+            local_dict={"df": df},
+            engine=engine,
+            parser=parser,
+        )
+        expec = df.dates1 < "20130101"
+        tm.assert_series_equal(res, expec)
+
+    def test_simple_in_ops(self, engine, parser):
+        if parser != "python":
+            res = pd.eval("1 in [1, 2]", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("2 in (1, 2)", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("3 in (1, 2)", engine=engine, parser=parser)
+            assert not res
+
+            res = pd.eval("3 not in (1, 2)", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("[3] not in (1, 2)", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("[3] in ([3], 2)", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("[[3]] in [[[3]], 2]", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("(3,) in [(3,), 2]", engine=engine, parser=parser)
+            assert res
+
+            res = pd.eval("(3,) not in [(3,), 2]", engine=engine, parser=parser)
+            assert not res
+
+            res = pd.eval("[(3,)] in [[(3,)], 2]", engine=engine, parser=parser)
+            assert res
+        else:
+            msg = "'In' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval("1 in [1, 2]", engine=engine, parser=parser)
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval("2 in (1, 2)", engine=engine, parser=parser)
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval("3 in (1, 2)", engine=engine, parser=parser)
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval("[(3,)] in (1, 2, [(3,)])", engine=engine, parser=parser)
+            msg = "'NotIn' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval("3 not in (1, 2)", engine=engine, parser=parser)
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval("[3] not in (1, 2, [[3]])", engine=engine, parser=parser)
+
+    def test_check_many_exprs(self, engine, parser):
+        a = 1  # noqa: F841
+        expr = " * ".join("a" * 33)
+        expected = 1
+        res = pd.eval(expr, engine=engine, parser=parser)
+        assert res == expected
+
+    @pytest.mark.parametrize(
+        "expr",
+        [
+            "df > 2 and df > 3",
+            "df > 2 or df > 3",
+            "not df > 2",
+        ],
+    )
+    def test_fails_and_or_not(self, expr, engine, parser):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        if parser == "python":
+            msg = "'BoolOp' nodes are not implemented"
+            if "not" in expr:
+                msg = "'Not' nodes are not implemented"
+
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(
+                    expr,
+                    local_dict={"df": df},
+                    parser=parser,
+                    engine=engine,
+                )
+        else:
+            # smoke-test, should not raise
+            pd.eval(
+                expr,
+                local_dict={"df": df},
+                parser=parser,
+                engine=engine,
+            )
+
+    @pytest.mark.parametrize("char", ["|", "&"])
+    def test_fails_ampersand_pipe(self, char, engine, parser):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))  # noqa: F841
+        ex = f"(df + 2)[df > 1] > 0 {char} (df > 0)"
+        if parser == "python":
+            msg = "cannot evaluate scalar only bool ops"
+            with pytest.raises(NotImplementedError, match=msg):
+                pd.eval(ex, parser=parser, engine=engine)
+        else:
+            # smoke-test, should not raise
+            pd.eval(ex, parser=parser, engine=engine)
+
+
+class TestMath:
+    def eval(self, *args, **kwargs):
+        kwargs["level"] = kwargs.pop("level", 0) + 1
+        return pd.eval(*args, **kwargs)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize("fn", _unary_math_ops)
+    def test_unary_functions(self, fn, engine, parser):
+        df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)})
+        a = df.a
+
+        expr = f"{fn}(a)"
+        got = self.eval(expr, engine=engine, parser=parser)
+        with np.errstate(all="ignore"):
+            expect = getattr(np, fn)(a)
+        tm.assert_series_equal(got, expect)
+
+    @pytest.mark.parametrize("fn", _binary_math_ops)
+    def test_binary_functions(self, fn, engine, parser):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+            }
+        )
+        a = df.a
+        b = df.b
+
+        expr = f"{fn}(a, b)"
+        got = self.eval(expr, engine=engine, parser=parser)
+        with np.errstate(all="ignore"):
+            expect = getattr(np, fn)(a, b)
+        tm.assert_almost_equal(got, expect)
+
+    def test_df_use_case(self, engine, parser):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+            }
+        )
+        df.eval(
+            "e = arctan2(sin(a), b)",
+            engine=engine,
+            parser=parser,
+            inplace=True,
+        )
+        got = df.e
+        expect = np.arctan2(np.sin(df.a), df.b).rename("e")
+        tm.assert_series_equal(got, expect)
+
+    def test_df_arithmetic_subexpression(self, engine, parser):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+            }
+        )
+        df.eval("e = sin(a + b)", engine=engine, parser=parser, inplace=True)
+        got = df.e
+        expect = np.sin(df.a + df.b).rename("e")
+        tm.assert_series_equal(got, expect)
+
+    @pytest.mark.parametrize(
+        "dtype, expect_dtype",
+        [
+            (np.int32, np.float64),
+            (np.int64, np.float64),
+            (np.float32, np.float32),
+            (np.float64, np.float64),
+            pytest.param(np.complex128, np.complex128, marks=td.skip_if_windows),
+        ],
+    )
+    def test_result_types(self, dtype, expect_dtype, engine, parser):
+        # xref https://github.com/pandas-dev/pandas/issues/12293
+        #  this fails on Windows, apparently a floating point precision issue
+
+        # Did not test complex64 because DataFrame is converting it to
+        # complex128. Due to https://github.com/pandas-dev/pandas/issues/10952
+        df = DataFrame(
+            {"a": np.random.default_rng(2).standard_normal(10).astype(dtype)}
+        )
+        assert df.a.dtype == dtype
+        df.eval("b = sin(a)", engine=engine, parser=parser, inplace=True)
+        got = df.b
+        expect = np.sin(df.a).rename("b")
+        assert expect.dtype == got.dtype
+        assert expect_dtype == got.dtype
+        tm.assert_series_equal(got, expect)
+
+    def test_undefined_func(self, engine, parser):
+        df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)})
+        msg = '"mysin" is not a supported function'
+
+        with pytest.raises(ValueError, match=msg):
+            df.eval("mysin(a)", engine=engine, parser=parser)
+
+    def test_keyword_arg(self, engine, parser):
+        df = DataFrame({"a": np.random.default_rng(2).standard_normal(10)})
+        msg = 'Function "sin" does not support keyword arguments'
+
+        with pytest.raises(TypeError, match=msg):
+            df.eval("sin(x=a)", engine=engine, parser=parser)
+
+
+_var_s = np.random.default_rng(2).standard_normal(10)
+
+
+class TestScope:
+    def test_global_scope(self, engine, parser):
+        e = "_var_s * 2"
+        tm.assert_numpy_array_equal(
+            _var_s * 2, pd.eval(e, engine=engine, parser=parser)
+        )
+
+    def test_no_new_locals(self, engine, parser):
+        x = 1
+        lcls = locals().copy()
+        pd.eval("x + 1", local_dict=lcls, engine=engine, parser=parser)
+        lcls2 = locals().copy()
+        lcls2.pop("lcls")
+        assert lcls == lcls2
+
+    def test_no_new_globals(self, engine, parser):
+        x = 1  # noqa: F841
+        gbls = globals().copy()
+        pd.eval("x + 1", engine=engine, parser=parser)
+        gbls2 = globals().copy()
+        assert gbls == gbls2
+
+    def test_empty_locals(self, engine, parser):
+        # GH 47084
+        x = 1  # noqa: F841
+        msg = "name 'x' is not defined"
+        with pytest.raises(UndefinedVariableError, match=msg):
+            pd.eval("x + 1", engine=engine, parser=parser, local_dict={})
+
+    def test_empty_globals(self, engine, parser):
+        # GH 47084
+        msg = "name '_var_s' is not defined"
+        e = "_var_s * 2"
+        with pytest.raises(UndefinedVariableError, match=msg):
+            pd.eval(e, engine=engine, parser=parser, global_dict={})
+
+
+@td.skip_if_no("numexpr")
+def test_invalid_engine():
+    msg = "Invalid engine 'asdf' passed"
+    with pytest.raises(KeyError, match=msg):
+        pd.eval("x + y", local_dict={"x": 1, "y": 2}, engine="asdf")
+
+
+@td.skip_if_no("numexpr")
+@pytest.mark.parametrize(
+    ("use_numexpr", "expected"),
+    (
+        (True, "numexpr"),
+        (False, "python"),
+    ),
+)
+def test_numexpr_option_respected(use_numexpr, expected):
+    # GH 32556
+    from pandas.core.computation.eval import _check_engine
+
+    with pd.option_context("compute.use_numexpr", use_numexpr):
+        result = _check_engine(None)
+        assert result == expected
+
+
+@td.skip_if_no("numexpr")
+def test_numexpr_option_incompatible_op():
+    # GH 32556
+    with pd.option_context("compute.use_numexpr", False):
+        df = DataFrame(
+            {"A": [True, False, True, False, None, None], "B": [1, 2, 3, 4, 5, 6]}
+        )
+        result = df.query("A.isnull()")
+        expected = DataFrame({"A": [None, None], "B": [5, 6]}, index=range(4, 6))
+        tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("numexpr")
+def test_invalid_parser():
+    msg = "Invalid parser 'asdf' passed"
+    with pytest.raises(KeyError, match=msg):
+        pd.eval("x + y", local_dict={"x": 1, "y": 2}, parser="asdf")
+
+
+_parsers: dict[str, type[BaseExprVisitor]] = {
+    "python": PythonExprVisitor,
+    "pytables": pytables.PyTablesExprVisitor,
+    "pandas": PandasExprVisitor,
+}
+
+
+@pytest.mark.parametrize("engine", ENGINES)
+@pytest.mark.parametrize("parser", _parsers)
+def test_disallowed_nodes(engine, parser):
+    VisitorClass = _parsers[parser]
+    inst = VisitorClass("x + 1", engine, parser)
+
+    for ops in VisitorClass.unsupported_nodes:
+        msg = "nodes are not implemented"
+        with pytest.raises(NotImplementedError, match=msg):
+            getattr(inst, ops)()
+
+
+def test_syntax_error_exprs(engine, parser):
+    e = "s +"
+    with pytest.raises(SyntaxError, match="invalid syntax"):
+        pd.eval(e, engine=engine, parser=parser)
+
+
+def test_name_error_exprs(engine, parser):
+    e = "s + t"
+    msg = "name 's' is not defined"
+    with pytest.raises(NameError, match=msg):
+        pd.eval(e, engine=engine, parser=parser)
+
+
+@pytest.mark.parametrize("express", ["a + @b", "@a + b", "@a + @b"])
+def test_invalid_local_variable_reference(engine, parser, express):
+    a, b = 1, 2  # noqa: F841
+
+    if parser != "pandas":
+        with pytest.raises(SyntaxError, match="The '@' prefix is only"):
+            pd.eval(express, engine=engine, parser=parser)
+    else:
+        with pytest.raises(SyntaxError, match="The '@' prefix is not"):
+            pd.eval(express, engine=engine, parser=parser)
+
+
+def test_numexpr_builtin_raises(engine, parser):
+    sin, dotted_line = 1, 2
+    if engine == "numexpr":
+        msg = "Variables in expression .+"
+        with pytest.raises(NumExprClobberingError, match=msg):
+            pd.eval("sin + dotted_line", engine=engine, parser=parser)
+    else:
+        res = pd.eval("sin + dotted_line", engine=engine, parser=parser)
+        assert res == sin + dotted_line
+
+
+def test_bad_resolver_raises(engine, parser):
+    cannot_resolve = 42, 3.0
+    with pytest.raises(TypeError, match="Resolver of type .+"):
+        pd.eval("1 + 2", resolvers=cannot_resolve, engine=engine, parser=parser)
+
+
+def test_empty_string_raises(engine, parser):
+    # GH 13139
+    with pytest.raises(ValueError, match="expr cannot be an empty string"):
+        pd.eval("", engine=engine, parser=parser)
+
+
+def test_more_than_one_expression_raises(engine, parser):
+    with pytest.raises(SyntaxError, match="only a single expression is allowed"):
+        pd.eval("1 + 1; 2 + 2", engine=engine, parser=parser)
+
+
+@pytest.mark.parametrize("cmp", ("and", "or"))
+@pytest.mark.parametrize("lhs", (int, float))
+@pytest.mark.parametrize("rhs", (int, float))
+def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser):
+    gen = {
+        int: lambda: np.random.default_rng(2).integers(10),
+        float: np.random.default_rng(2).standard_normal,
+    }
+
+    mid = gen[lhs]()  # noqa: F841
+    lhs = gen[lhs]()
+    rhs = gen[rhs]()
+
+    ex1 = f"lhs {cmp} mid {cmp} rhs"
+    ex2 = f"lhs {cmp} mid and mid {cmp} rhs"
+    ex3 = f"(lhs {cmp} mid) & (mid {cmp} rhs)"
+    for ex in (ex1, ex2, ex3):
+        msg = "cannot evaluate scalar only bool ops|'BoolOp' nodes are not"
+        with pytest.raises(NotImplementedError, match=msg):
+            pd.eval(ex, engine=engine, parser=parser)
+
+
+@pytest.mark.parametrize(
+    "other",
+    [
+        "'x'",
+        "...",
+    ],
+)
+def test_equals_various(other):
+    df = DataFrame({"A": ["a", "b", "c"]}, dtype=object)
+    result = df.eval(f"A == {other}")
+    expected = Series([False, False, False], name="A")
+    tm.assert_series_equal(result, expected)
+
+
+def test_inf(engine, parser):
+    s = "inf + 1"
+    expected = np.inf
+    result = pd.eval(s, engine=engine, parser=parser)
+    assert result == expected
+
+
+@pytest.mark.parametrize("column", ["Temp(°C)", "Capacitance(μF)"])
+def test_query_token(engine, column):
+    # See: https://github.com/pandas-dev/pandas/pull/42826
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((5, 2)), columns=[column, "b"]
+    )
+    expected = df[df[column] > 5]
+    query_string = f"`{column}` > 5"
+    result = df.query(query_string, engine=engine)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_negate_lt_eq_le(engine, parser):
+    df = DataFrame([[0, 10], [1, 20]], columns=["cat", "count"])
+    expected = df[~(df.cat > 0)]
+
+    result = df.query("~(cat > 0)", engine=engine, parser=parser)
+    tm.assert_frame_equal(result, expected)
+
+    if parser == "python":
+        msg = "'Not' nodes are not implemented"
+        with pytest.raises(NotImplementedError, match=msg):
+            df.query("not (cat > 0)", engine=engine, parser=parser)
+    else:
+        result = df.query("not (cat > 0)", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "column",
+    DEFAULT_GLOBALS.keys(),
+)
+def test_eval_no_support_column_name(request, column):
+    # GH 44603
+    if column in ["True", "False", "inf", "Inf"]:
+        request.applymarker(
+            pytest.mark.xfail(
+                raises=KeyError,
+                reason=f"GH 47859 DataFrame eval not supported with {column}",
+            )
+        )
+
+    df = DataFrame(
+        np.random.default_rng(2).integers(0, 100, size=(10, 2)),
+        columns=[column, "col1"],
+    )
+    expected = df[df[column] > 6]
+    result = df.query(f"{column}>6")
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_set_inplace():
+    # https://github.com/pandas-dev/pandas/issues/47449
+    # Ensure we don't only update the DataFrame inplace, but also the actual
+    # column values, such that references to this column also get updated
+    df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+    result_view = df[:]
+    ser = df["A"]
+    df.eval("A = B + C", inplace=True)
+    expected = DataFrame({"A": [11, 13, 15], "B": [4, 5, 6], "C": [7, 8, 9]})
+    tm.assert_frame_equal(df, expected)
+    expected = Series([1, 2, 3], name="A")
+    tm.assert_series_equal(ser, expected)
+    tm.assert_series_equal(result_view["A"], expected)
+
+
+@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
+def test_validate_bool_args(value):
+    msg = 'For argument "inplace" expected type bool, received type'
+    with pytest.raises(ValueError, match=msg):
+        pd.eval("2+2", inplace=value)
+
+
+@td.skip_if_no("numexpr")
+def test_eval_float_div_numexpr():
+    # GH 59736
+    result = pd.eval("1 / 2", engine="numexpr")
+    expected = 0.5
+    assert result == expected
+
+
+def test_method_calls_on_binop():
+    # GH 61175
+    x = Series([1, 2, 3, 5])
+    y = Series([2, 3, 4])
+
+    # Method call on binary operation result
+    result = pd.eval("(x + y).dropna()")
+    expected = (x + y).dropna()
+    tm.assert_series_equal(result, expected)
+
+    # Test with other binary operations
+    result = pd.eval("(x * y).dropna()")
+    expected = (x * y).dropna()
+    tm.assert_series_equal(result, expected)
+
+    # Test with method chaining
+    result = pd.eval("(x + y).dropna().reset_index(drop=True)")
+    expected = (x + y).dropna().reset_index(drop=True)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/config/__init__.py b/pandas/tests/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/config/test_config.py b/pandas/tests/config/test_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2f526f0ca3249503b05b84ba08804236ba41a67
--- /dev/null
+++ b/pandas/tests/config/test_config.py
@@ -0,0 +1,499 @@
+import pytest
+
+from pandas._config import config as cf
+from pandas._config.config import OptionError
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestConfig:
+    @pytest.fixture(autouse=True)
+    def clean_config(self, monkeypatch):
+        with monkeypatch.context() as m:
+            m.setattr(cf, "_global_config", {})
+            m.setattr(cf, "options", cf.DictWrapper(cf._global_config))
+            m.setattr(cf, "_deprecated_options", {})
+            m.setattr(cf, "_registered_options", {})
+
+            # Our test fixture in conftest.py sets "chained_assignment"
+            # to "raise" only after all test methods have been setup.
+            # However, after this setup, there is no longer any
+            # "chained_assignment" option, so re-register it.
+            cf.register_option("chained_assignment", "raise")
+            yield
+
+    def test_api(self):
+        # the pandas object exposes the user API
+        assert hasattr(pd, "get_option")
+        assert hasattr(pd, "set_option")
+        assert hasattr(pd, "reset_option")
+        assert hasattr(pd, "describe_option")
+
+    def test_is_one_of_factory(self):
+        v = cf.is_one_of_factory([None, 12])
+
+        v(12)
+        v(None)
+        msg = r"Value must be one of None\|12"
+        with pytest.raises(ValueError, match=msg):
+            v(1.1)
+
+    def test_register_option(self):
+        cf.register_option("a", 1, "doc")
+
+        # can't register an already registered option
+        msg = "Option 'a' has already been registered"
+        with pytest.raises(OptionError, match=msg):
+            cf.register_option("a", 1, "doc")
+
+        # can't register an already registered option
+        msg = "Path prefix to option 'a' is already an option"
+        with pytest.raises(OptionError, match=msg):
+            cf.register_option("a.b.c.d1", 1, "doc")
+        with pytest.raises(OptionError, match=msg):
+            cf.register_option("a.b.c.d2", 1, "doc")
+
+        # no python keywords
+        msg = "for is a python keyword"
+        with pytest.raises(ValueError, match=msg):
+            cf.register_option("for", 0)
+        with pytest.raises(ValueError, match=msg):
+            cf.register_option("a.for.b", 0)
+        # must be valid identifier (ensure attribute access works)
+        msg = "oh my goddess! is not a valid identifier"
+        with pytest.raises(ValueError, match=msg):
+            cf.register_option("Oh my Goddess!", 0)
+
+        # we can register options several levels deep
+        # without predefining the intermediate steps
+        # and we can define differently named options
+        # in the same namespace
+        cf.register_option("k.b.c.d1", 1, "doc")
+        cf.register_option("k.b.c.d2", 1, "doc")
+
+    def test_describe_option(self):
+        cf.register_option("a", 1, "doc")
+        cf.register_option("b", 1, "doc2")
+        cf.deprecate_option("b", FutureWarning)
+
+        cf.register_option("c.d.e1", 1, "doc3")
+        cf.register_option("c.d.e2", 1, "doc4")
+        cf.register_option("f", 1)
+        cf.register_option("g.h", 1)
+        cf.register_option("k", 2)
+        cf.deprecate_option("g.h", FutureWarning, rkey="k")
+        cf.register_option("l", "foo")
+
+        # non-existent keys raise KeyError
+        msg = r"No such keys\(s\)"
+        with pytest.raises(OptionError, match=msg):
+            cf.describe_option("no.such.key")
+
+        # we can get the description for any key we registered
+        assert "doc" in cf.describe_option("a", _print_desc=False)
+        assert "doc2" in cf.describe_option("b", _print_desc=False)
+        assert "precated" in cf.describe_option("b", _print_desc=False)
+        assert "doc3" in cf.describe_option("c.d.e1", _print_desc=False)
+        assert "doc4" in cf.describe_option("c.d.e2", _print_desc=False)
+
+        # if no doc is specified we get a default message
+        # saying "description not available"
+        assert "available" in cf.describe_option("f", _print_desc=False)
+        assert "available" in cf.describe_option("g.h", _print_desc=False)
+        assert "precated" in cf.describe_option("g.h", _print_desc=False)
+        assert "k" in cf.describe_option("g.h", _print_desc=False)
+
+        # default is reported
+        assert "foo" in cf.describe_option("l", _print_desc=False)
+        # current value is reported
+        assert "bar" not in cf.describe_option("l", _print_desc=False)
+        cf.set_option("l", "bar")
+        assert "bar" in cf.describe_option("l", _print_desc=False)
+
+    @pytest.mark.parametrize("category", [DeprecationWarning, FutureWarning])
+    def test_case_insensitive(self, category):
+        cf.register_option("KanBAN", 1, "doc")
+
+        assert "doc" in cf.describe_option("kanbaN", _print_desc=False)
+        assert cf.get_option("kanBaN") == 1
+        cf.set_option("KanBan", 2)
+        assert cf.get_option("kAnBaN") == 2
+
+        # gets of non-existent keys fail
+        msg = r"No such keys\(s\): 'no_such_option'"
+        with pytest.raises(OptionError, match=msg):
+            cf.get_option("no_such_option")
+
+        cf.deprecate_option("KanBan", category)
+        msg = "'kanban' is deprecated, please refrain from using it."
+        with tm.assert_produces_warning(category, match=msg):
+            cf.get_option("kAnBaN")
+
+    def test_get_option(self):
+        cf.register_option("a", 1, "doc")
+        cf.register_option("b.c", "hullo", "doc2")
+        cf.register_option("b.b", None, "doc2")
+
+        # gets of existing keys succeed
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+        assert cf.get_option("b.b") is None
+
+        # gets of non-existent keys fail
+        msg = r"No such keys\(s\): 'no_such_option'"
+        with pytest.raises(OptionError, match=msg):
+            cf.get_option("no_such_option")
+
+    def test_set_option(self):
+        cf.register_option("a", 1, "doc")
+        cf.register_option("b.c", "hullo", "doc2")
+        cf.register_option("b.b", None, "doc2")
+
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+        assert cf.get_option("b.b") is None
+
+        cf.set_option("a", 2)
+        cf.set_option("b.c", "wurld")
+        cf.set_option("b.b", 1.1)
+
+        assert cf.get_option("a") == 2
+        assert cf.get_option("b.c") == "wurld"
+        assert cf.get_option("b.b") == 1.1
+
+        msg = r"No such keys\(s\): 'no.such.key'"
+        with pytest.raises(OptionError, match=msg):
+            cf.set_option("no.such.key", None)
+
+    def test_set_option_empty_args(self):
+        msg = "Must provide an even number of non-keyword arguments"
+        with pytest.raises(ValueError, match=msg):
+            cf.set_option()
+
+    def test_set_option_uneven_args(self):
+        msg = "Must provide an even number of non-keyword arguments"
+        with pytest.raises(ValueError, match=msg):
+            cf.set_option("a.b", 2, "b.c")
+
+    def test_set_option_invalid_single_argument_type(self):
+        msg = "Must provide an even number of non-keyword arguments"
+        with pytest.raises(ValueError, match=msg):
+            cf.set_option(2)
+
+    def test_set_option_multiple(self):
+        cf.register_option("a", 1, "doc")
+        cf.register_option("b.c", "hullo", "doc2")
+        cf.register_option("b.b", None, "doc2")
+
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+        assert cf.get_option("b.b") is None
+
+        cf.set_option("a", "2", "b.c", None, "b.b", 10.0)
+
+        assert cf.get_option("a") == "2"
+        assert cf.get_option("b.c") is None
+        assert cf.get_option("b.b") == 10.0
+
+    def test_set_option_dict(self):
+        # GH 61093
+
+        cf.register_option("a", 1, "doc")
+        cf.register_option("b.c", "hullo", "doc2")
+        cf.register_option("b.b", None, "doc2")
+
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+        assert cf.get_option("b.b") is None
+
+        options_dict = {"a": "2", "b.c": None, "b.b": 10.0}
+        cf.set_option(options_dict)
+
+        assert cf.get_option("a") == "2"
+        assert cf.get_option("b.c") is None
+        assert cf.get_option("b.b") == 10.0
+
+    def test_validation(self):
+        cf.register_option("a", 1, "doc", validator=cf.is_int)
+        cf.register_option("d", 1, "doc", validator=cf.is_nonnegative_int)
+        cf.register_option("b.c", "hullo", "doc2", validator=cf.is_text)
+
+        msg = "Value must have type '<class 'int'>'"
+        with pytest.raises(ValueError, match=msg):
+            cf.register_option("a.b.c.d2", "NO", "doc", validator=cf.is_int)
+
+        cf.set_option("a", 2)  # int is_int
+        cf.set_option("b.c", "wurld")  # str is_str
+        cf.set_option("d", 2)
+        cf.set_option("d", None)  # non-negative int can be None
+
+        # None not is_int
+        with pytest.raises(ValueError, match=msg):
+            cf.set_option("a", None)
+        with pytest.raises(ValueError, match=msg):
+            cf.set_option("a", "ab")
+
+        msg = "Value must be a nonnegative integer or None"
+        with pytest.raises(ValueError, match=msg):
+            cf.register_option("a.b.c.d3", "NO", "doc", validator=cf.is_nonnegative_int)
+        with pytest.raises(ValueError, match=msg):
+            cf.register_option("a.b.c.d3", -2, "doc", validator=cf.is_nonnegative_int)
+
+        msg = r"Value must be an instance of <class 'str'>\|<class 'bytes'>"
+        with pytest.raises(ValueError, match=msg):
+            cf.set_option("b.c", 1)
+
+        validator = cf.is_one_of_factory([None, cf.is_callable])
+        cf.register_option("b", lambda: None, "doc", validator=validator)
+        cf.set_option("b", "%.1f".format)  # Formatter is callable
+        cf.set_option("b", None)  # Formatter is none (default)
+        with pytest.raises(ValueError, match="Value must be a callable"):
+            cf.set_option("b", "%.1f")
+
+    def test_reset_option(self):
+        cf.register_option("a", 1, "doc", validator=cf.is_int)
+        cf.register_option("b.c", "hullo", "doc2", validator=cf.is_str)
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+
+        cf.set_option("a", 2)
+        cf.set_option("b.c", "wurld")
+        assert cf.get_option("a") == 2
+        assert cf.get_option("b.c") == "wurld"
+
+        cf.reset_option("a")
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "wurld"
+        cf.reset_option("b.c")
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+
+    def test_reset_option_all(self):
+        cf.register_option("a", 1, "doc", validator=cf.is_int)
+        cf.register_option("b.c", "hullo", "doc2", validator=cf.is_str)
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+
+        cf.set_option("a", 2)
+        cf.set_option("b.c", "wurld")
+        assert cf.get_option("a") == 2
+        assert cf.get_option("b.c") == "wurld"
+
+        cf.reset_option("all")
+        assert cf.get_option("a") == 1
+        assert cf.get_option("b.c") == "hullo"
+
+    def test_deprecate_option(self):
+        # we can deprecate non-existent options
+        cf.deprecate_option("foo", FutureWarning)
+
+        with tm.assert_produces_warning(FutureWarning, match="deprecated"):
+            with pytest.raises(KeyError, match="No such keys.s.: 'foo'"):
+                cf.get_option("foo")
+
+        cf.register_option("a", 1, "doc", validator=cf.is_int)
+        cf.register_option("b.c", "hullo", "doc2")
+        cf.register_option("foo", "hullo", "doc2")
+
+        cf.deprecate_option("a", FutureWarning, removal_ver="nifty_ver")
+        with tm.assert_produces_warning(FutureWarning, match="eprecated.*nifty_ver"):
+            cf.get_option("a")
+
+            msg = "Option 'a' has already been defined as deprecated"
+            with pytest.raises(OptionError, match=msg):
+                cf.deprecate_option("a", FutureWarning)
+
+        cf.deprecate_option("b.c", FutureWarning, "zounds!")
+        with tm.assert_produces_warning(FutureWarning, match="zounds!"):
+            cf.get_option("b.c")
+
+        # test rerouting keys
+        cf.register_option("d.a", "foo", "doc2")
+        cf.register_option("d.dep", "bar", "doc2")
+        assert cf.get_option("d.a") == "foo"
+        assert cf.get_option("d.dep") == "bar"
+
+        cf.deprecate_option("d.dep", FutureWarning, rkey="d.a")  # reroute d.dep to d.a
+        with tm.assert_produces_warning(FutureWarning, match="eprecated"):
+            assert cf.get_option("d.dep") == "foo"
+
+        with tm.assert_produces_warning(FutureWarning, match="eprecated"):
+            cf.set_option("d.dep", "baz")  # should overwrite "d.a"
+
+        with tm.assert_produces_warning(FutureWarning, match="eprecated"):
+            assert cf.get_option("d.dep") == "baz"
+
+    def test_config_prefix(self):
+        with cf.config_prefix("base"):
+            cf.register_option("a", 1, "doc1")
+            cf.register_option("b", 2, "doc2")
+            assert cf.get_option("a") == 1
+            assert cf.get_option("b") == 2
+
+            cf.set_option("a", 3)
+            cf.set_option("b", 4)
+            assert cf.get_option("a") == 3
+            assert cf.get_option("b") == 4
+
+        assert cf.get_option("base.a") == 3
+        assert cf.get_option("base.b") == 4
+        assert "doc1" in cf.describe_option("base.a", _print_desc=False)
+        assert "doc2" in cf.describe_option("base.b", _print_desc=False)
+
+        cf.reset_option("base.a")
+        cf.reset_option("base.b")
+
+        with cf.config_prefix("base"):
+            assert cf.get_option("a") == 1
+            assert cf.get_option("b") == 2
+
+    def test_callback(self):
+        k = [None]
+        v = [None]
+
+        def callback(key):
+            k.append(key)
+            v.append(cf.get_option(key))
+
+        cf.register_option("d.a", "foo", cb=callback)
+        cf.register_option("d.b", "foo", cb=callback)
+
+        del k[-1], v[-1]
+        cf.set_option("d.a", "fooz")
+        assert k[-1] == "d.a"
+        assert v[-1] == "fooz"
+
+        del k[-1], v[-1]
+        cf.set_option("d.b", "boo")
+        assert k[-1] == "d.b"
+        assert v[-1] == "boo"
+
+        del k[-1], v[-1]
+        cf.reset_option("d.b")
+        assert k[-1] == "d.b"
+
+    def test_set_ContextManager(self):
+        def eq(val):
+            assert cf.get_option("a") == val
+
+        cf.register_option("a", 0)
+        eq(0)
+        with cf.option_context("a", 15):
+            eq(15)
+            with cf.option_context("a", 25):
+                eq(25)
+            eq(15)
+        eq(0)
+
+        cf.set_option("a", 17)
+        eq(17)
+
+        # Test that option_context can be used as a decorator too (#34253).
+        @cf.option_context("a", 123)
+        def f():
+            eq(123)
+
+        f()
+
+    def test_set_ContextManager_dict(self):
+        def eq(val):
+            assert cf.get_option("a") == val
+            assert cf.get_option("b.c") == val
+
+        cf.register_option("a", 0)
+        cf.register_option("b.c", 0)
+
+        eq(0)
+        with cf.option_context({"a": 15, "b.c": 15}):
+            eq(15)
+            with cf.option_context({"a": 25, "b.c": 25}):
+                eq(25)
+            eq(15)
+        eq(0)
+
+        cf.set_option("a", 17)
+        cf.set_option("b.c", 17)
+        eq(17)
+
+        # Test that option_context can be used as a decorator too
+        @cf.option_context({"a": 123, "b.c": 123})
+        def f():
+            eq(123)
+
+        f()
+
+    def test_attribute_access(self):
+        holder = []
+
+        def f3(key):
+            holder.append(True)
+
+        cf.register_option("a", 0)
+        cf.register_option("c", 0, cb=f3)
+        options = cf.options
+
+        assert options.a == 0
+        with cf.option_context("a", 15):
+            assert options.a == 15
+
+        options.a = 500
+        assert cf.get_option("a") == 500
+
+        cf.reset_option("a")
+        assert options.a == cf.get_option("a")
+
+        msg = "You can only set the value of existing options"
+        with pytest.raises(OptionError, match=msg):
+            options.b = 1
+        with pytest.raises(OptionError, match=msg):
+            options.display = 1
+
+        # make sure callback kicks when using this form of setting
+        options.c = 1
+        assert len(holder) == 1
+
+    def test_option_context_scope(self):
+        # Ensure that creating a context does not affect the existing
+        # environment as it is supposed to be used with the `with` statement.
+        # See https://github.com/pandas-dev/pandas/issues/8514
+
+        original_value = 60
+        context_value = 10
+        option_name = "a"
+
+        cf.register_option(option_name, original_value)
+
+        # Ensure creating contexts didn't affect the current context.
+        ctx = cf.option_context(option_name, context_value)
+        assert cf.get_option(option_name) == original_value
+
+        # Ensure the correct value is available inside the context.
+        with ctx:
+            assert cf.get_option(option_name) == context_value
+
+        # Ensure the current context is reset
+        assert cf.get_option(option_name) == original_value
+
+    def test_dictwrapper_getattr(self):
+        options = cf.options
+        # GH 19789
+        with pytest.raises(OptionError, match="No such option"):
+            options.bananas
+        assert not hasattr(options, "bananas")
+
+
+def test_no_silent_downcasting_deprecated():
+    # GH#59502
+    with tm.assert_produces_warning(Pandas4Warning, match="is deprecated"):
+        cf.get_option("future.no_silent_downcasting")
+    with tm.assert_produces_warning(Pandas4Warning, match="is deprecated"):
+        cf.set_option("future.no_silent_downcasting", True)
+
+
+def test_option_context_invalid_option():
+    with pytest.raises(OptionError, match="No such keys"):
+        with cf.option_context("invalid", True):
+            pass
diff --git a/pandas/tests/config/test_localization.py b/pandas/tests/config/test_localization.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9a0a44bf8c89f10537bc2b7c64d3bb418a9a4a7
--- /dev/null
+++ b/pandas/tests/config/test_localization.py
@@ -0,0 +1,155 @@
+import codecs
+import locale
+import os
+
+import pytest
+
+from pandas._config.localization import (
+    can_set_locale,
+    get_locales,
+    set_locale,
+)
+
+from pandas.compat import ISMUSL
+
+import pandas as pd
+
+_all_locales = get_locales()
+
+# Don't run any of these tests if we have no locales.
+pytestmark = pytest.mark.skipif(not _all_locales, reason="Need locales")
+
+_skip_if_only_one_locale = pytest.mark.skipif(
+    len(_all_locales) <= 1, reason="Need multiple locales for meaningful test"
+)
+
+
+def _get_current_locale(lc_var: int = locale.LC_ALL) -> str:
+    # getlocale is not always compliant with setlocale, use setlocale. GH#46595
+    return locale.setlocale(lc_var)
+
+
+@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
+def test_can_set_current_locale(lc_var):
+    # Can set the current locale
+    before_locale = _get_current_locale(lc_var)
+    assert can_set_locale(before_locale, lc_var=lc_var)
+    after_locale = _get_current_locale(lc_var)
+    assert before_locale == after_locale
+
+
+@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
+def test_can_set_locale_valid_set(lc_var):
+    # Can set the default locale.
+    before_locale = _get_current_locale(lc_var)
+    assert can_set_locale("", lc_var=lc_var)
+    after_locale = _get_current_locale(lc_var)
+    assert before_locale == after_locale
+
+
+@pytest.mark.parametrize(
+    "lc_var",
+    (
+        locale.LC_ALL,
+        locale.LC_CTYPE,
+        pytest.param(
+            locale.LC_TIME,
+            marks=pytest.mark.skipif(
+                ISMUSL, reason="MUSL allows setting invalid LC_TIME."
+            ),
+        ),
+    ),
+)
+def test_can_set_locale_invalid_set(lc_var):
+    # Cannot set an invalid locale.
+    before_locale = _get_current_locale(lc_var)
+    assert not can_set_locale("non-existent_locale", lc_var=lc_var)
+    after_locale = _get_current_locale(lc_var)
+    assert before_locale == after_locale
+
+
+@pytest.mark.parametrize(
+    "lang,enc",
+    [
+        ("it_CH", "UTF-8"),
+        ("en_US", "ascii"),
+        ("zh_CN", "GB2312"),
+        ("it_IT", "ISO-8859-1"),
+    ],
+)
+@pytest.mark.parametrize("lc_var", (locale.LC_ALL, locale.LC_CTYPE, locale.LC_TIME))
+def test_can_set_locale_no_leak(lang, enc, lc_var):
+    # Test that can_set_locale does not leak even when returning False. See GH#46595
+    before_locale = _get_current_locale(lc_var)
+    can_set_locale((lang, enc), locale.LC_ALL)
+    after_locale = _get_current_locale(lc_var)
+    assert before_locale == after_locale
+
+
+def test_can_set_locale_invalid_get(monkeypatch):
+    # see GH#22129
+    # In some cases, an invalid locale can be set,
+    #  but a subsequent getlocale() raises a ValueError.
+
+    def mock_get_locale():
+        raise ValueError
+
+    with monkeypatch.context() as m:
+        m.setattr(locale, "getlocale", mock_get_locale)
+        assert not can_set_locale("")
+
+
+def test_get_locales_at_least_one():
+    # see GH#9744
+    assert len(_all_locales) > 0
+
+
+@_skip_if_only_one_locale
+def test_get_locales_prefix():
+    first_locale = _all_locales[0]
+    assert len(get_locales(prefix=first_locale[:2])) > 0
+
+
+@_skip_if_only_one_locale
+@pytest.mark.parametrize(
+    "lang,enc",
+    [
+        ("it_CH", "UTF-8"),
+        ("en_US", "ascii"),
+        ("zh_CN", "GB2312"),
+        ("it_IT", "ISO-8859-1"),
+    ],
+)
+def test_set_locale(lang, enc):
+    before_locale = _get_current_locale()
+
+    enc = codecs.lookup(enc).name
+    new_locale = lang, enc
+
+    if not can_set_locale(new_locale):
+        msg = "unsupported locale setting"
+
+        with pytest.raises(locale.Error, match=msg):
+            with set_locale(new_locale):
+                pass
+    else:
+        with set_locale(new_locale) as normalized_locale:
+            new_lang, new_enc = normalized_locale.split(".")
+            new_enc = codecs.lookup(enc).name
+
+            normalized_locale = new_lang, new_enc
+            assert normalized_locale == new_locale
+
+    # Once we exit the "with" statement, locale should be back to what it was.
+    after_locale = _get_current_locale()
+    assert before_locale == after_locale
+
+
+def test_encoding_detected():
+    system_locale = os.environ.get("LC_ALL")
+    system_encoding = system_locale.split(".")[-1] if system_locale else "utf-8"
+
+    assert (
+        codecs.lookup(pd.options.display.encoding).name
+        == codecs.lookup(system_encoding).name
+    )
diff --git a/pandas/tests/construction/__init__.py b/pandas/tests/construction/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/construction/test_extract_array.py b/pandas/tests/construction/test_extract_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dd3eda8c995ce022e9d46b907323e79bcd679f8
--- /dev/null
+++ b/pandas/tests/construction/test_extract_array.py
@@ -0,0 +1,18 @@
+from pandas import Index
+import pandas._testing as tm
+from pandas.core.construction import extract_array
+
+
+def test_extract_array_rangeindex():
+    ri = Index(range(5))
+
+    expected = ri._values
+    res = extract_array(ri, extract_numpy=True, extract_range=True)
+    tm.assert_numpy_array_equal(res, expected)
+    res = extract_array(ri, extract_numpy=False, extract_range=True)
+    tm.assert_numpy_array_equal(res, expected)
+
+    res = extract_array(ri, extract_numpy=True, extract_range=False)
+    tm.assert_index_equal(res, ri)
+    res = extract_array(ri, extract_numpy=False, extract_range=False)
+    tm.assert_index_equal(res, ri)
diff --git a/pandas/tests/copy_view/__init__.py b/pandas/tests/copy_view/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/copy_view/test_array.py b/pandas/tests/copy_view/test_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..22976f307cae6d8da9852c12aad9f85b14c5dd64
--- /dev/null
+++ b/pandas/tests/copy_view/test_array.py
@@ -0,0 +1,229 @@
+import numpy as np
+import pytest
+
+from pandas.compat.numpy import np_version_gt2
+
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+# -----------------------------------------------------------------------------
+# Copy/view behaviour for accessing underlying array of Series/DataFrame
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda ser: ser.values,
+        lambda ser: np.asarray(ser.array),
+        lambda ser: np.asarray(ser),
+        lambda ser: np.array(ser, copy=False),
+    ],
+    ids=["values", "array", "np.asarray", "np.array"],
+)
+def test_series_values(request, method):
+    ser = Series([1, 2, 3], name="name")
+    ser_orig = ser.copy()
+
+    arr = method(ser)
+
+    if request.node.callspec.id == "array":
+        # https://github.com/pandas-dev/pandas/issues/63099
+        # .array for now does not return a read-only view
+        assert arr.flags.writeable is True
+        # updating the array updates the series
+        arr[0] = 0
+        assert ser.iloc[0] == 0
+        return
+
+    # .values still gives a view but is read-only
+    assert np.shares_memory(arr, get_array(ser, "name"))
+    assert arr.flags.writeable is False
+
+    # mutating series through arr therefore doesn't work
+    with pytest.raises(ValueError, match="read-only"):
+        arr[0] = 0
+    tm.assert_series_equal(ser, ser_orig)
+
+    # mutating the series itself still works
+    ser.iloc[0] = 0
+    assert ser.values[0] == 0
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df: df.values,
+        lambda df: np.asarray(df),
+        lambda ser: np.array(ser, copy=False),
+    ],
+    ids=["values", "asarray", "array"],
+)
+def test_dataframe_values(method):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df_orig = df.copy()
+
+    arr = method(df)
+
+    # .values still gives a view but is read-only
+    assert np.shares_memory(arr, get_array(df, "a"))
+    assert arr.flags.writeable is False
+
+    # mutating series through arr therefore doesn't work
+    with pytest.raises(ValueError, match="read-only"):
+        arr[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating the series itself still works
+    df.iloc[0, 0] = 0
+    assert df.values[0, 0] == 0
+
+
+def test_series_to_numpy():
+    ser = Series([1, 2, 3], name="name")
+    ser_orig = ser.copy()
+
+    # default: copy=False, no dtype or NAs
+    arr = ser.to_numpy()
+    # to_numpy still gives a view but is read-only
+    assert np.shares_memory(arr, get_array(ser, "name"))
+    assert arr.flags.writeable is False
+
+    # mutating series through arr therefore doesn't work
+    with pytest.raises(ValueError, match="read-only"):
+        arr[0] = 0
+    tm.assert_series_equal(ser, ser_orig)
+
+    # mutating the series itself still works
+    ser.iloc[0] = 0
+    assert ser.values[0] == 0
+
+    # specify copy=True gives a writeable array
+    ser = Series([1, 2, 3], name="name")
+    arr = ser.to_numpy(copy=True)
+    assert not np.shares_memory(arr, get_array(ser, "name"))
+    assert arr.flags.writeable is True
+
+    # specifying a dtype that already causes a copy also gives a writeable array
+    ser = Series([1, 2, 3], name="name")
+    arr = ser.to_numpy(dtype="float64")
+    assert not np.shares_memory(arr, get_array(ser, "name"))
+    assert arr.flags.writeable is True
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda ser: np.asarray(ser.values),
+        lambda ser: np.asarray(ser.array),
+        lambda ser: np.asarray(ser),
+        lambda ser: np.asarray(ser, dtype="int64"),
+        lambda ser: np.array(ser, copy=False),
+    ],
+    ids=["values", "array", "np.asarray", "np.asarray-dtype", "np.array"],
+)
+def test_series_values_ea_dtypes(request, method):
+    ser = Series([1, 2, 3], dtype="Int64")
+    ser_orig = ser.copy()
+
+    arr = method(ser)
+
+    if request.node.callspec.id in ("values", "array"):
+        # https://github.com/pandas-dev/pandas/issues/63099
+        # .array/values for now does not return a read-only view
+        assert arr.flags.writeable is True
+        # updating the array updates the series
+        arr[0] = 0
+        assert ser.iloc[0] == 0
+        return
+
+    # conversion to ndarray gives a view but is read-only
+    assert np.shares_memory(arr, get_array(ser))
+    assert arr.flags.writeable is False
+
+    # mutating series through arr therefore doesn't work
+    with pytest.raises(ValueError, match="read-only"):
+        arr[0] = 0
+    tm.assert_series_equal(ser, ser_orig)
+
+    # mutating the series itself still works
+    ser.iloc[0] = 0
+    assert ser.values[0] == 0
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df: df.values,
+        lambda df: np.asarray(df),
+        lambda df: np.asarray(df, dtype="int64"),
+        lambda df: np.array(df, copy=False),
+    ],
+    ids=["values", "np.asarray", "np.asarray-dtype", "np.array"],
+)
+def test_dataframe_array_ea_dtypes(method):
+    df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    arr = method(df)
+
+    assert np.shares_memory(arr, get_array(df, "a"))
+    assert arr.flags.writeable is False
+
+
+def test_dataframe_array_string_dtype():
+    df = DataFrame({"a": ["a", "b"]}, dtype="string[python]")
+    arr = np.asarray(df)
+    assert np.shares_memory(arr, get_array(df, "a"))
+    assert arr.flags.writeable is False
+
+
+def test_series_array_string_dtype(any_string_dtype):
+    ser = Series(["a", "b"], dtype=any_string_dtype)
+    arr = np.asarray(ser)
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        # for pyarrow strings, the numpy arrays is not a view, so also does
+        # not need to be read-only (https://github.com/pandas-dev/pandas/pull/64035)
+        assert not np.shares_memory(arr, get_array(ser))
+        assert arr.flags.writeable is True
+    else:
+        assert np.shares_memory(arr, get_array(ser))
+        assert arr.flags.writeable is False
+
+
+def test_dataframe_multiple_numpy_dtypes():
+    df = DataFrame({"a": [1, 2, 3], "b": 1.5})
+    arr = np.asarray(df)
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    assert arr.flags.writeable is True
+
+    if np_version_gt2:
+        # copy=False semantics are only supported in NumPy>=2.
+
+        with pytest.raises(ValueError, match="Unable to avoid copy while creating"):
+            arr = np.array(df, copy=False)
+
+    arr = np.array(df, copy=True)
+    assert arr.flags.writeable is True
+
+
+def test_dataframe_single_block_copy_true():
+    # the copy=False/None cases are tested above in test_dataframe_values
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    arr = np.array(df, copy=True)
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    assert arr.flags.writeable is True
+
+
+def test_values_is_ea():
+    df = DataFrame({"a": date_range("2012-01-01", periods=3)})
+    arr = np.asarray(df)
+    assert arr.flags.writeable is False
+
+
+def test_empty_dataframe():
+    df = DataFrame()
+    arr = np.asarray(df)
+    assert arr.flags.writeable is True
diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py
new file mode 100644
index 0000000000000000000000000000000000000000..c436391739ab282ffd612186a1406422e3b0774a
--- /dev/null
+++ b/pandas/tests/copy_view/test_astype.py
@@ -0,0 +1,230 @@
+import pickle
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+def test_astype_single_dtype():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1.5})
+    df_orig = df.copy()
+    df2 = df.astype("float64")
+
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 2] = 5.5
+    assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating parent also doesn't update result
+    df2 = df.astype("float64")
+    df.iloc[0, 2] = 5.5
+    tm.assert_frame_equal(df2, df_orig.astype("float64"))
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+@pytest.mark.parametrize("new_dtype", ["int64", "Int64", "int64[pyarrow]"])
+def test_astype_avoids_copy(dtype, new_dtype):
+    if new_dtype == "int64[pyarrow]":
+        pytest.importorskip("pyarrow")
+    df = DataFrame({"a": [1, 2, 3]}, dtype=dtype)
+    df_orig = df.copy()
+    df2 = df.astype(new_dtype)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 0] = 10
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating parent also doesn't update result
+    df2 = df.astype(new_dtype)
+    df.iloc[0, 0] = 100
+    tm.assert_frame_equal(df2, df_orig.astype(new_dtype))
+
+
+@pytest.mark.parametrize("dtype", ["float64", "int32", "Int32", "int32[pyarrow]"])
+def test_astype_different_target_dtype(dtype):
+    if dtype == "int32[pyarrow]":
+        pytest.importorskip("pyarrow")
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+    df2 = df.astype(dtype)
+
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert df2._mgr._has_no_reference(0)
+
+    df2.iloc[0, 0] = 5
+    tm.assert_frame_equal(df, df_orig)
+
+    # mutating parent also doesn't update result
+    df2 = df.astype(dtype)
+    df.iloc[0, 0] = 100
+    tm.assert_frame_equal(df2, df_orig.astype(dtype))
+
+
+def test_astype_numpy_to_ea():
+    ser = Series([1, 2, 3])
+    result = ser.astype("Int64")
+    assert np.shares_memory(get_array(ser), get_array(result))
+
+
+@pytest.mark.parametrize(
+    "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
+)
+def test_astype_string_and_object(dtype, new_dtype):
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
+    df_orig = df.copy()
+    df2 = df.astype(new_dtype)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 0] = "x"
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype, new_dtype", [("object", "string[python]"), ("string[python]", "object")]
+)
+def test_astype_string_and_object_update_original(dtype, new_dtype):
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype=dtype)
+    df2 = df.astype(new_dtype)
+    df_orig = df2.copy()
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df.iloc[0, 0] = "x"
+    tm.assert_frame_equal(df2, df_orig)
+
+
+def test_astype_str_copy_on_pickle_roundrip():
+    # TODO(infer_string) this test can be removed after 3.0 (once str is the default)
+    # https://github.com/pandas-dev/pandas/issues/54654
+    # ensure_string_array may alter array inplace
+    base = Series(np.array([(1, 2), None, 1], dtype="object"))
+    base_copy = pickle.loads(pickle.dumps(base))
+    base_copy.astype(str)
+    tm.assert_series_equal(base, base_copy)
+
+
+def test_astype_string_copy_on_pickle_roundrip(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/54654
+    # ensure_string_array may alter array inplace
+    base = Series(np.array([(1, 2), None, 1], dtype="object"))
+    base_copy = pickle.loads(pickle.dumps(base))
+    base_copy.astype(any_string_dtype)
+    tm.assert_series_equal(base, base_copy)
+
+
+def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/54654
+    # ensure_string_array may alter read-only array inplace
+    base = Series(np.array([(1, 2), None, 1], dtype="object"))
+    base_copy = pickle.loads(pickle.dumps(base))
+    base_copy._values.flags.writeable = False
+    base_copy.astype(any_string_dtype)
+    tm.assert_series_equal(base, base_copy)
+
+
+def test_astype_dict_dtypes():
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": Series([1.5, 1.5, 1.5], dtype="float64")}
+    )
+    df_orig = df.copy()
+    df2 = df.astype({"a": "float64", "c": "float64"})
+
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 2] = 5.5
+    assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+
+    df2.iloc[0, 1] = 10
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_astype_different_datetime_resos():
+    df = DataFrame({"a": date_range("2019-12-31", periods=2, freq="D")})
+    result = df.astype("datetime64[ms]")
+
+    assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+    assert result._mgr._has_no_reference(0)
+
+
+def test_astype_different_timezones():
+    df = DataFrame(
+        {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific", unit="ns")}
+    )
+    result = df.astype("datetime64[ns, Europe/Berlin]")
+    assert not result._mgr._has_no_reference(0)
+    assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+
+
+def test_astype_different_timezones_different_reso():
+    df = DataFrame(
+        {"a": date_range("2019-12-31", periods=5, freq="D", tz="US/Pacific", unit="ns")}
+    )
+    result = df.astype("datetime64[ms, Europe/Berlin]")
+    assert result._mgr._has_no_reference(0)
+    assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+
+
+def test_astype_arrow_timestamp():
+    pytest.importorskip("pyarrow")
+    df = DataFrame(
+        {
+            "a": [
+                Timestamp("2020-01-01 01:01:01.000001"),
+                Timestamp("2020-01-01 01:01:01.000001"),
+            ]
+        },
+        dtype="M8[ns]",
+    )
+    result = df.astype("timestamp[ns][pyarrow]")
+    assert not result._mgr._has_no_reference(0)
+    assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._pa_array)
+
+
+def test_convert_dtypes_infer_objects():
+    ser = Series(["a", "b", "c"])
+    ser_orig = ser.copy()
+    result = ser.convert_dtypes(
+        convert_integer=False,
+        convert_boolean=False,
+        convert_floating=False,
+        convert_string=False,
+    )
+
+    assert tm.shares_memory(get_array(ser), get_array(result))
+    result.iloc[0] = "x"
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_convert_dtypes(using_infer_string):
+    df = DataFrame({"a": ["a", "b"], "b": [1, 2], "c": [1.5, 2.5], "d": [True, False]})
+    df_orig = df.copy()
+    df2 = df.convert_dtypes()
+
+    if using_infer_string:
+        # String column is already Arrow-backed, so memory is shared
+        assert tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        # String column converts from object to Arrow, no memory sharing
+        assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert tm.shares_memory(get_array(df2, "d"), get_array(df, "d"))
+    assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert tm.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    df2.iloc[0, 0] = "x"
+    df2.iloc[0, 1] = 10
+    tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_chained_assignment_deprecation.py b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8a75fcd380c446a6822f4c551af7e152053a56a
--- /dev/null
+++ b/pandas/tests/copy_view/test_chained_assignment_deprecation.py
@@ -0,0 +1,104 @@
+import numpy as np
+import pytest
+
+from pandas.compat import CHAINED_WARNING_DISABLED
+from pandas.errors import ChainedAssignmentError
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_series_setitem(indexer):
+    # ensure we only get a single warning for those typical cases of chained
+    # assignment
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    # using custom check instead of tm.assert_produces_warning because that doesn't
+    # fail if multiple warnings are raised
+    if CHAINED_WARNING_DISABLED:
+        return
+    with pytest.warns() as record:  # noqa: TID251
+        df["a"][indexer] = 0
+    assert len(record) == 1
+    assert record[0].category == ChainedAssignmentError
+
+
+@pytest.mark.parametrize(
+    "indexer", ["a", ["a", "b"], slice(0, 2), np.array([True, False, True])]
+)
+def test_frame_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df[0:3][indexer] = 10
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_series_iloc_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df["a"].iloc[indexer] = 0
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_frame_iloc_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df[0:3].iloc[indexer] = 10
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], slice(0, 2), np.array([True, False, True])]
+)
+def test_series_loc_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df["a"].loc[indexer] = 0
+
+
+@pytest.mark.parametrize(
+    "indexer", [0, [0, 1], (0, "a"), slice(0, 2), np.array([True, False, True])]
+)
+def test_frame_loc_setitem(indexer):
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df[0:3].loc[indexer] = 10
+
+
+def test_series_at_setitem():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df["a"].at[0] = 0
+
+
+def test_frame_at_setitem():
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df[0:3].at[0, "a"] = 10
+
+
+def test_series_iat_setitem():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df["a"].iat[0] = 0
+
+
+def test_frame_iat_setitem():
+    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": 1})
+
+    with tm.raises_chained_assignment_error():
+        df[0:3].iat[0, 0] = 10
diff --git a/pandas/tests/copy_view/test_clip.py b/pandas/tests/copy_view/test_clip.py
new file mode 100644
index 0000000000000000000000000000000000000000..56df33db6d416e6ae2307139b531f48a012f8d4c
--- /dev/null
+++ b/pandas/tests/copy_view/test_clip.py
@@ -0,0 +1,72 @@
+import numpy as np
+
+from pandas import DataFrame
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+def test_clip_inplace_reference():
+    df = DataFrame({"a": [1.5, 2, 3]})
+    df_copy = df.copy()
+    arr_a = get_array(df, "a")
+    view = df[:]
+    df.clip(lower=2, inplace=True)
+
+    assert not np.shares_memory(get_array(df, "a"), arr_a)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)
+    tm.assert_frame_equal(df_copy, view)
+
+
+def test_clip_inplace_reference_no_op():
+    df = DataFrame({"a": [1.5, 2, 3]})
+    df_copy = df.copy()
+    arr_a = get_array(df, "a")
+    view = df[:]
+    df.clip(lower=0, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+
+    assert not df._mgr._has_no_reference(0)
+    assert not view._mgr._has_no_reference(0)
+    tm.assert_frame_equal(df_copy, view)
+
+
+def test_clip_inplace():
+    df = DataFrame({"a": [1.5, 2, 3]})
+    arr_a = get_array(df, "a")
+    df.clip(lower=2, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+    assert df._mgr._has_no_reference(0)
+
+
+def test_clip():
+    df = DataFrame({"a": [1.5, 2, 3]})
+    df_orig = df.copy()
+    df2 = df.clip(lower=2)
+
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    assert df._mgr._has_no_reference(0)
+    tm.assert_frame_equal(df_orig, df)
+
+
+def test_clip_no_op():
+    df = DataFrame({"a": [1.5, 2, 3]})
+    df2 = df.clip(lower=0)
+
+    assert not df._mgr._has_no_reference(0)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+
+def test_clip_chained_inplace():
+    df = DataFrame({"a": [1, 4, 2], "b": 1})
+    df_orig = df.copy()
+    with tm.raises_chained_assignment_error():
+        df["a"].clip(1, 2, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+    with tm.raises_chained_assignment_error():
+        df[["a"]].clip(1, 2, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py
new file mode 100644
index 0000000000000000000000000000000000000000..7204aea950314f6b6e08f64291b42491c1fd415d
--- /dev/null
+++ b/pandas/tests/copy_view/test_constructors.py
@@ -0,0 +1,382 @@
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    Period,
+    PeriodIndex,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    Timestamp,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+# -----------------------------------------------------------------------------
+# Copy/view behaviour for Series / DataFrame constructors
+
+
+@pytest.mark.parametrize("dtype", [None, "int64"])
+def test_series_from_series(dtype):
+    # Case: constructing a Series from another Series object follows CoW rules:
+    # a new object is returned and thus mutations are not propagated
+    ser = Series([1, 2, 3], name="name")
+
+    # default is copy=False -> new Series is a shallow copy / view of original
+    result = Series(ser, dtype=dtype)
+
+    # the shallow copy still shares memory
+    assert np.shares_memory(get_array(ser), get_array(result))
+
+    assert result._mgr.blocks[0].refs.has_reference()
+
+    # mutating new series copy doesn't mutate original
+    result.iloc[0] = 0
+    assert ser.iloc[0] == 1
+    # mutating triggered a copy-on-write -> no longer shares memory
+    assert not np.shares_memory(get_array(ser), get_array(result))
+
+    # the same when modifying the parent
+    result = Series(ser, dtype=dtype)
+
+    # mutating original doesn't mutate new series
+    ser.iloc[0] = 0
+    assert result.iloc[0] == 1
+
+    # forcing copy=False still gives a CoW shallow copy
+    result = Series(ser, dtype=dtype, copy=False)
+    assert np.shares_memory(get_array(ser), get_array(result))
+    assert result._mgr.blocks[0].refs.has_reference()
+
+    # forcing copy=True still results in an actual hard copy up front
+    result = Series(ser, dtype=dtype, copy=True)
+    assert not np.shares_memory(get_array(ser), get_array(result))
+    assert ser._mgr._has_no_reference(0)
+
+
+def test_series_from_series_with_reindex():
+    # Case: constructing a Series from another Series with specifying an index
+    # that potentially requires a reindex of the values
+    ser = Series([1, 2, 3], name="name")
+
+    # passing an index that doesn't actually require a reindex of the values
+    # -> still getting a CoW shallow copy
+    for index in [
+        ser.index,
+        ser.index.copy(),
+        list(ser.index),
+        ser.index.rename("idx"),
+    ]:
+        result = Series(ser, index=index)
+        assert np.shares_memory(ser.values, result.values)
+        result.iloc[0] = 0
+        assert ser.iloc[0] == 1
+
+        # forcing copy=True still results in an actual hard copy up front
+        result = Series(ser, index=index, copy=True)
+        assert not np.shares_memory(ser.values, result.values)
+        assert not result._mgr.blocks[0].refs.has_reference()
+
+    # ensure that if an actual reindex is needed, we don't have any refs
+    # (mutating the result wouldn't trigger CoW)
+    result = Series(ser, index=[0, 1, 2, 3])
+    assert not np.shares_memory(ser.values, result.values)
+    assert not result._mgr.blocks[0].refs.has_reference()
+
+
+@pytest.mark.parametrize("dtype", [None, "int64"])
+@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
+@pytest.mark.parametrize(
+    "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
+)
+def test_series_from_array(idx, dtype, arr):
+    ser = Series(arr, dtype=dtype, index=idx)
+    ser_orig = ser.copy()
+    data = getattr(arr, "_data", arr)
+    assert not np.shares_memory(get_array(ser), data)
+
+    arr[0] = 100
+    tm.assert_series_equal(ser, ser_orig)
+
+    # if the user explicitly passes copy=False, we get an actual view
+    # not protected by CoW
+    ser = Series(arr, dtype=dtype, index=idx, copy=False)
+    assert np.shares_memory(get_array(ser), data)
+    arr[0] = 50
+    assert ser.iloc[0] == 50
+
+
+@pytest.mark.parametrize("copy", [True, False, None])
+def test_series_from_array_different_dtype(copy):
+    arr = np.array([1, 2, 3], dtype="int64")
+    ser = Series(arr, dtype="int32", copy=copy)
+    assert not np.shares_memory(get_array(ser), arr)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        Index([1, 2]),
+        DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
+        PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
+        TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
+    ],
+)
+def test_series_from_index(idx):
+    ser = Series(idx)
+    expected = idx.copy(deep=True)
+    assert np.shares_memory(get_array(ser), get_array(idx))
+    assert not ser._mgr._has_no_reference(0)
+    ser.iloc[0] = ser.iloc[1]
+    tm.assert_index_equal(idx, expected)
+
+    # forcing copy=False still gives a CoW shallow copy
+    ser = Series(idx, copy=False)
+    assert np.shares_memory(get_array(ser), get_array(idx))
+    assert not ser._mgr._has_no_reference(0)
+    ser.iloc[0] = ser.iloc[1]
+    tm.assert_index_equal(idx, expected)
+
+    # forcing copy=True still results in a copy
+    ser = Series(idx, copy=True)
+    assert not np.shares_memory(get_array(ser), get_array(idx))
+    assert ser._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("copy", [True, False, None])
+def test_series_from_index_different_dtypes(copy):
+    idx = Index([1, 2, 3], dtype="int64", copy=copy)
+    ser = Series(idx, dtype="int32")
+    assert not np.shares_memory(get_array(ser), get_array(idx))
+    assert ser._mgr._has_no_reference(0)
+
+
+def test_series_from_block_manager_different_dtype():
+    ser = Series([1, 2, 3], dtype="int64")
+    msg = "Passing a SingleBlockManager to Series"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        ser2 = Series(ser._mgr, dtype="int32")
+    assert not np.shares_memory(get_array(ser), get_array(ser2))
+    assert ser2._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("use_mgr", [True, False])
+@pytest.mark.parametrize("columns", [None, ["a"]])
+def test_dataframe_constructor_mgr_or_df(columns, use_mgr):
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+
+    if use_mgr:
+        data = df._mgr
+        warn = DeprecationWarning
+    else:
+        data = df
+        warn = None
+    msg = "Passing a BlockManager to DataFrame"
+    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
+        new_df = DataFrame(data)
+
+    assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
+    new_df.iloc[0] = 100
+
+    assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
+@pytest.mark.parametrize("index", [None, [0, 1, 2]])
+@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
+def test_dataframe_from_dict_of_series(columns, index, dtype):
+    # Case: constructing a DataFrame from Series objects with copy=False
+    # has to do a lazy following CoW rules
+    # (the default for DataFrame(dict) is still to copy to ensure consolidation)
+    s1 = Series([1, 2, 3])
+    s2 = Series([4, 5, 6])
+    s1_orig = s1.copy()
+    expected = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype
+    )
+
+    result = DataFrame(
+        {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
+    )
+
+    # the shallow copy still shares memory
+    assert np.shares_memory(get_array(result, "a"), get_array(s1))
+
+    # mutating the new dataframe doesn't mutate original
+    result.iloc[0, 0] = 10
+    assert not np.shares_memory(get_array(result, "a"), get_array(s1))
+    tm.assert_series_equal(s1, s1_orig)
+
+    # the same when modifying the parent series
+    s1 = Series([1, 2, 3])
+    s2 = Series([4, 5, 6])
+    result = DataFrame(
+        {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
+    )
+    s1.iloc[0] = 10
+    assert not np.shares_memory(get_array(result, "a"), get_array(s1))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [None, "int64"])
+def test_dataframe_from_dict_of_series_with_reindex(dtype):
+    # Case: constructing a DataFrame from Series objects with copy=False
+    # and passing an index that requires an actual (no-view) reindex -> need
+    # to ensure the result doesn't have refs set up to unnecessarily trigger
+    # a copy on write
+    s1 = Series([1, 2, 3])
+    s2 = Series([4, 5, 6])
+    df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False)
+
+    # df should own its memory, so mutating shouldn't trigger a copy
+    arr_before = get_array(df, "a")
+    assert not np.shares_memory(arr_before, get_array(s1))
+    df.iloc[0, 0] = 100
+    arr_after = get_array(df, "a")
+    assert np.shares_memory(arr_before, arr_after)
+
+
+@pytest.mark.parametrize(
+    "data, dtype",
+    [
+        ([1, 2], "int64"),
+        # 1D-only EA
+        ([1, 2], "Int64"),
+        pytest.param(
+            ["a", "b"],
+            "str",
+            marks=pytest.mark.xfail(
+                reason="TODO bug with infer_string=False and specifying dtype='str'"
+            )
+            if not using_string_dtype()
+            else [],
+        ),
+        (["a", "b"], object),
+        # 2D EA
+        (
+            [Timestamp("2020", tz="UTC"), Timestamp("2021", tz="UTC")],
+            "datetime64[ns, UTC]",
+        ),
+    ],
+    ids=["int", "int-ea", "str", "object", "datetime64tz"],
+)
+def test_dataframe_from_series_or_index(data, dtype, index_or_series):
+    obj = index_or_series(data, dtype=dtype)
+    obj_orig = obj.copy(deep=True)  # deep=True needed for Index
+
+    # default is copy=False -> DataFrame holds a shallow copy of original Index/Series
+    df = DataFrame(obj)
+    assert tm.shares_memory(get_array(obj), get_array(df, 0))
+    assert not df._mgr._has_no_reference(0)
+
+    df.iloc[0, 0] = data[-1]
+    tm.assert_equal(obj, obj_orig)
+
+    # with passing the (identical) dtype -> same
+    df = DataFrame(obj, dtype=dtype)
+    assert tm.shares_memory(get_array(obj), get_array(df, 0))
+    assert not df._mgr._has_no_reference(0)
+
+    df.iloc[0, 0] = data[-1]
+    tm.assert_equal(obj, obj_orig)
+
+    # forcing copy=True still results in an actual hard copy up front
+    df = DataFrame(obj, copy=True)
+    if not (obj.dtype == "str" and obj.dtype.storage == "pyarrow"):
+        # ArrowExtensionArray deep copy still points to the same underlying data
+        assert not tm.shares_memory(get_array(obj), get_array(df, 0))
+        assert df._mgr._has_no_reference(0)
+
+    df.iloc[0, 0] = data[-1]
+    tm.assert_equal(obj, obj_orig)
+
+
+def test_dataframe_from_series_or_index_different_dtype(index_or_series):
+    obj = index_or_series([1, 2], dtype="int64")
+    df = DataFrame(obj, dtype="int32")
+    assert not np.shares_memory(get_array(obj), get_array(df, 0))
+    assert df._mgr._has_no_reference(0)
+
+
+def test_dataframe_from_series_dont_infer_datetime():
+    ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
+    df = DataFrame(ser)
+    assert df.dtypes.iloc[0] == np.dtype(object)
+    assert np.shares_memory(get_array(ser), get_array(df, 0))
+    assert not df._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("index", [None, [0, 1, 2]])
+def test_dataframe_from_dict_of_series_with_dtype(index):
+    # Variant of above, but now passing a dtype that causes a copy
+    # -> need to ensure the result doesn't have refs set up to unnecessarily
+    # trigger a copy on write
+    s1 = Series([1.0, 2.0, 3.0])
+    s2 = Series([4, 5, 6])
+    df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False)
+
+    # df should own its memory, so mutating shouldn't trigger a copy
+    arr_before = get_array(df, "a")
+    assert not np.shares_memory(arr_before, get_array(s1))
+    df.iloc[0, 0] = 100
+    arr_after = get_array(df, "a")
+    assert np.shares_memory(arr_before, arr_after)
+
+
+@pytest.mark.parametrize("copy", [False, None, True])
+def test_dataframe_from_numpy_array(copy):
+    arr = np.array([[1, 2], [3, 4]])
+    df = DataFrame(arr, copy=copy)
+
+    if copy is not False or copy is True:
+        assert not np.shares_memory(get_array(df, 0), arr)
+    else:
+        assert np.shares_memory(get_array(df, 0), arr)
+
+
+@pytest.mark.parametrize(
+    "data, dtype",
+    [
+        # 1D-only EA
+        ([1, 2], "Int64"),
+        # 2D EA
+        (
+            [Timestamp("2020", tz="UTC"), Timestamp("2021", tz="UTC")],
+            "datetime64[ns, UTC]",
+        ),
+    ],
+    ids=["int-ea", "datetime64tz"],
+)
+@pytest.mark.parametrize("copy", [False, None, True])
+def test_dataframe_from_extension_array(copy, data, dtype):
+    arr = pd.array(data, dtype=dtype)
+    df = DataFrame(arr, copy=copy)
+
+    if arr.dtype == "Int64":
+        # to ensure tm.shares_memory works correctly
+        # TODO fix in tm.shares_memory or get_array?
+        arr = arr._data
+
+    if copy is None or copy is True:
+        assert not tm.shares_memory(get_array(df, 0), arr)
+    else:
+        assert tm.shares_memory(get_array(df, 0), arr)
+
+
+def test_frame_from_dict_of_index():
+    idx = Index([1, 2, 3])
+    expected = idx.copy(deep=True)
+    df = DataFrame({"a": idx}, copy=False)
+    assert np.shares_memory(get_array(df, "a"), idx._values)
+    assert not df._mgr._has_no_reference(0)
+
+    df.iloc[0, 0] = 100
+    tm.assert_index_equal(idx, expected)
diff --git a/pandas/tests/copy_view/test_copy_deprecation.py b/pandas/tests/copy_view/test_copy_deprecation.py
new file mode 100644
index 0000000000000000000000000000000000000000..acc87787dbe0a3b678bbdb347f775b59dab90d8b
--- /dev/null
+++ b/pandas/tests/copy_view/test_copy_deprecation.py
@@ -0,0 +1,100 @@
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import (
+    concat,
+    merge,
+)
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "meth, kwargs",
+    [
+        ("truncate", {}),
+        ("tz_convert", {"tz": "UTC"}),
+        ("tz_localize", {"tz": "UTC"}),
+        ("infer_objects", {}),
+        ("astype", {"dtype": "float64"}),
+        ("reindex", {"index": [2, 0, 1]}),
+        ("transpose", {}),
+        ("set_axis", {"labels": [1, 2, 3]}),
+        ("rename", {"index": {1: 2}}),
+        ("set_flags", {}),
+        ("to_period", {}),
+        ("to_timestamp", {}),
+        ("swaplevel", {"i": 0, "j": 1}),
+    ],
+)
+def test_copy_deprecation(meth, kwargs):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": 1})
+
+    if meth in ("tz_convert", "tz_localize", "to_period"):
+        tz = None if meth in ("tz_localize", "to_period") else "US/Eastern"
+        df.index = pd.date_range("2020-01-01", freq="D", periods=len(df), tz=tz)
+    elif meth == "to_timestamp":
+        df.index = pd.period_range("2020-01-01", freq="D", periods=len(df))
+    elif meth == "swaplevel":
+        df = df.set_index(["b", "c"])
+
+    if meth != "swaplevel":
+        with tm.assert_produces_warning(Pandas4Warning, match="copy"):
+            getattr(df, meth)(copy=False, **kwargs)
+
+    if meth != "transpose":
+        with tm.assert_produces_warning(Pandas4Warning, match="copy"):
+            getattr(df.a, meth)(copy=False, **kwargs)
+
+
+def test_copy_deprecation_reindex_like_align():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    # Somehow the stack level check is incorrect here
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        df.reindex_like(df, copy=False)
+
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        df.a.reindex_like(df.a, copy=False)
+
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        df.align(df, copy=False)
+
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        df.a.align(df.a, copy=False)
+
+
+def test_copy_deprecation_merge_concat():
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        df.merge(df, copy=False)
+
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        merge(df, df, copy=False)
+
+    with tm.assert_produces_warning(
+        Pandas4Warning, match="copy", check_stacklevel=False
+    ):
+        concat([df, df], copy=False)
+
+
+@pytest.mark.parametrize("value", [False, True, "warn"])
+def test_copy_on_write_deprecation_option(value):
+    msg = "Copy-on-Write can no longer be disabled"
+    # stacklevel points to contextlib due to use of context manager.
+    with tm.assert_produces_warning(Pandas4Warning, match=msg, check_stacklevel=False):
+        with pd.option_context("mode.copy_on_write", value):
+            pass
diff --git a/pandas/tests/copy_view/test_core_functionalities.py b/pandas/tests/copy_view/test_core_functionalities.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad16bafdf0ee431b6af53835fc8c8ccbee7cac97
--- /dev/null
+++ b/pandas/tests/copy_view/test_core_functionalities.py
@@ -0,0 +1,93 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+def test_assigning_to_same_variable_removes_references():
+    df = DataFrame({"a": [1, 2, 3]})
+    df = df.reset_index()
+    assert df._mgr._has_no_reference(1)
+    arr = get_array(df, "a")
+    df.iloc[0, 1] = 100  # Write into a
+
+    assert np.shares_memory(arr, get_array(df, "a"))
+
+
+def test_setitem_dont_track_unnecessary_references():
+    df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
+
+    df["b"] = 100
+    arr = get_array(df, "a")
+    # We split the block in setitem, if we are not careful the new blocks will
+    # reference each other triggering a copy
+    df.iloc[0, 0] = 100
+    assert np.shares_memory(arr, get_array(df, "a"))
+
+
+def test_setitem_with_view_copies():
+    df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
+    view = df[:]
+    expected = df.copy()
+
+    df["b"] = 100
+    arr = get_array(df, "a")
+    df.iloc[0, 0] = 100  # Check that we correctly track reference
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    tm.assert_frame_equal(view, expected)
+
+
+def test_setitem_with_view_invalidated_does_not_copy(request):
+    df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 1})
+    view = df[:]
+
+    df["b"] = 100
+    arr = get_array(df, "a")
+    view = None  # noqa: F841
+    # TODO(CoW) block gets split because of `df["b"] = 100`
+    # which introduces additional refs, even when those of `view` go out of scopes
+    df.iloc[0, 0] = 100
+    # Setitem split the block. Since the old block shared data with view
+    # all the new blocks are referencing view and each other. When view
+    # goes out of scope, they don't share data with any other block,
+    # so we should not trigger a copy
+    mark = pytest.mark.xfail(reason="blk.delete does not track references correctly")
+    request.applymarker(mark)
+    assert np.shares_memory(arr, get_array(df, "a"))
+
+
+def test_out_of_scope():
+    def func():
+        df = DataFrame({"a": [1, 2], "b": 1.5, "c": 1})
+        # create some subset
+        result = df[["a", "b"]]
+        return result
+
+    result = func()
+    assert not result._mgr.blocks[0].refs.has_reference()
+    assert not result._mgr.blocks[1].refs.has_reference()
+
+
+def test_delete():
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"]
+    )
+    del df["b"]
+    assert not df._mgr.blocks[0].refs.has_reference()
+    assert not df._mgr.blocks[1].refs.has_reference()
+
+    df = df[["a"]]
+    assert not df._mgr.blocks[0].refs.has_reference()
+
+
+def test_delete_reference():
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((4, 3)), columns=["a", "b", "c"]
+    )
+    x = df[:]
+    del df["b"]
+    assert df._mgr.blocks[0].refs.has_reference()
+    assert df._mgr.blocks[1].refs.has_reference()
+    assert x._mgr.blocks[0].refs.has_reference()
diff --git a/pandas/tests/copy_view/test_functions.py b/pandas/tests/copy_view/test_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..7e355ce1b5ed59cda09f0546ef4a76f40eb2f7d5
--- /dev/null
+++ b/pandas/tests/copy_view/test_functions.py
@@ -0,0 +1,332 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    concat,
+    merge,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+def test_concat_frames():
+    df = DataFrame({"b": ["a"] * 3}, dtype=object)
+    df2 = DataFrame({"a": ["a"] * 3}, dtype=object)
+    df_orig = df.copy()
+    result = concat([df, df2], axis=1)
+
+    assert np.shares_memory(get_array(result, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
+
+    result.iloc[0, 0] = "d"
+    assert not np.shares_memory(get_array(result, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
+
+    result.iloc[0, 1] = "d"
+    assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_concat_frames_updating_input():
+    df = DataFrame({"b": ["a"] * 3}, dtype=object)
+    df2 = DataFrame({"a": ["a"] * 3}, dtype=object)
+    result = concat([df, df2], axis=1)
+
+    assert np.shares_memory(get_array(result, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
+
+    expected = result.copy()
+    df.iloc[0, 0] = "d"
+    assert not np.shares_memory(get_array(result, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
+
+    df2.iloc[0, 0] = "d"
+    assert not np.shares_memory(get_array(result, "a"), get_array(df2, "a"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_concat_series():
+    ser = Series([1, 2], name="a")
+    ser2 = Series([3, 4], name="b")
+    ser_orig = ser.copy()
+    ser2_orig = ser2.copy()
+    result = concat([ser, ser2], axis=1)
+
+    assert np.shares_memory(get_array(result, "a"), ser.values)
+    assert np.shares_memory(get_array(result, "b"), ser2.values)
+
+    result.iloc[0, 0] = 100
+    assert not np.shares_memory(get_array(result, "a"), ser.values)
+    assert np.shares_memory(get_array(result, "b"), ser2.values)
+
+    result.iloc[0, 1] = 1000
+    assert not np.shares_memory(get_array(result, "b"), ser2.values)
+    tm.assert_series_equal(ser, ser_orig)
+    tm.assert_series_equal(ser2, ser2_orig)
+
+
+def test_concat_frames_chained():
+    df1 = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
+    df2 = DataFrame({"c": [4, 5, 6]})
+    df3 = DataFrame({"d": [4, 5, 6]})
+    result = concat([concat([df1, df2], axis=1), df3], axis=1)
+    expected = result.copy()
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "c"), get_array(df2, "c"))
+    assert np.shares_memory(get_array(result, "d"), get_array(df3, "d"))
+
+    df1.iloc[0, 0] = 100
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_concat_series_chained():
+    ser1 = Series([1, 2, 3], name="a")
+    ser2 = Series([4, 5, 6], name="c")
+    ser3 = Series([4, 5, 6], name="d")
+    result = concat([concat([ser1, ser2], axis=1), ser3], axis=1)
+    expected = result.copy()
+
+    assert np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
+    assert np.shares_memory(get_array(result, "c"), get_array(ser2, "c"))
+    assert np.shares_memory(get_array(result, "d"), get_array(ser3, "d"))
+
+    ser1.iloc[0] = 100
+    assert not np.shares_memory(get_array(result, "a"), get_array(ser1, "a"))
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_concat_series_updating_input():
+    ser = Series([1, 2], name="a")
+    ser2 = Series([3, 4], name="b")
+    expected = DataFrame({"a": [1, 2], "b": [3, 4]})
+    result = concat([ser, ser2], axis=1)
+
+    assert np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
+
+    ser.iloc[0] = 100
+    assert not np.shares_memory(get_array(result, "a"), get_array(ser, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
+    tm.assert_frame_equal(result, expected)
+
+    ser2.iloc[0] = 1000
+    assert not np.shares_memory(get_array(result, "b"), get_array(ser2, "b"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_concat_mixed_series_frame():
+    df = DataFrame({"a": [1, 2, 3], "c": 1})
+    ser = Series([4, 5, 6], name="d")
+    result = concat([df, ser], axis=1)
+    expected = result.copy()
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(result, "c"), get_array(df, "c"))
+    assert np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
+
+    ser.iloc[0] = 100
+    assert not np.shares_memory(get_array(result, "d"), get_array(ser, "d"))
+
+    df.iloc[0, 0] = 100
+    assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_concat_copy_keyword():
+    df = DataFrame({"a": [1, 2]})
+    df2 = DataFrame({"b": [1.5, 2.5]})
+
+    result = concat([df, df2], axis=1)
+
+    assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda df1, df2, **kwargs: df1.merge(df2, **kwargs),
+        lambda df1, df2, **kwargs: merge(df1, df2, **kwargs),
+    ],
+)
+def test_merge_on_key(func):
+    df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
+    df2 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "b": [4, 5, 6]})
+    df1_orig = df1.copy()
+    df2_orig = df2.copy()
+
+    result = func(df1, df2, on="key")
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(result, "key"), get_array(df1, "key"))
+    assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
+
+    result.iloc[0, 1] = 0
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+
+    result.iloc[0, 2] = 0
+    assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    tm.assert_frame_equal(df1, df1_orig)
+    tm.assert_frame_equal(df2, df2_orig)
+
+
+def test_merge_on_index():
+    df1 = DataFrame({"a": [1, 2, 3]})
+    df2 = DataFrame({"b": [4, 5, 6]})
+    df1_orig = df1.copy()
+    df2_orig = df2.copy()
+
+    result = merge(df1, df2, left_index=True, right_index=True)
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+
+    result.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+
+    result.iloc[0, 1] = 0
+    assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    tm.assert_frame_equal(df1, df1_orig)
+    tm.assert_frame_equal(df2, df2_orig)
+
+
+@pytest.mark.parametrize(
+    "func, how",
+    [
+        (lambda df1, df2, **kwargs: merge(df2, df1, on="key", **kwargs), "right"),
+        (lambda df1, df2, **kwargs: merge(df1, df2, on="key", **kwargs), "left"),
+    ],
+)
+def test_merge_on_key_enlarging_one(func, how):
+    df1 = DataFrame({"key": Series(["a", "b", "c"], dtype=object), "a": [1, 2, 3]})
+    df2 = DataFrame({"key": Series(["a", "b"], dtype=object), "b": [4, 5]})
+    df1_orig = df1.copy()
+    df2_orig = df2.copy()
+
+    result = func(df1, df2, how=how)
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert df2._mgr._has_no_reference(1)
+    assert df2._mgr._has_no_reference(0)
+    assert np.shares_memory(get_array(result, "key"), get_array(df1, "key")) is (
+        how == "left"
+    )
+    assert not np.shares_memory(get_array(result, "key"), get_array(df2, "key"))
+
+    if how == "left":
+        result.iloc[0, 1] = 0
+    else:
+        result.iloc[0, 2] = 0
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    tm.assert_frame_equal(df1, df1_orig)
+    tm.assert_frame_equal(df2, df2_orig)
+
+
+def test_merge_copy_keyword():
+    df = DataFrame({"a": [1, 2]})
+    df2 = DataFrame({"b": [3, 4.5]})
+
+    result = df.merge(df2, left_index=True, right_index=True)
+
+    assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
+
+
+def test_merge_upcasting_no_copy():
+    left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    left_copy = left.copy()
+    right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]}, dtype=object)
+    result = merge(left, right, on="a")
+    assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
+    assert not np.shares_memory(get_array(result, "a"), get_array(left, "a"))
+    tm.assert_frame_equal(left, left_copy)
+
+    result = merge(right, left, on="a")
+    assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
+    assert not np.shares_memory(get_array(result, "a"), get_array(left, "a"))
+    tm.assert_frame_equal(left, left_copy)
+
+
+def test_merge_indicator_no_deep_copy():
+    left = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    right = DataFrame({"a": [1, 2, 3], "c": [7, 8, 9]})
+    result = merge(left, right, on="a", indicator=True)
+    assert np.shares_memory(get_array(result, "b"), get_array(left, "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(right, "c"))
+
+
+@pytest.mark.parametrize("dtype", [object, "str"])
+def test_join_on_key(dtype):
+    df_index = Index(["a", "b", "c"], name="key", dtype=dtype)
+
+    df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
+    df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True))
+
+    df1_orig = df1.copy()
+    df2_orig = df2.copy()
+
+    result = df1.join(df2, on="key")
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+    assert tm.shares_memory(get_array(result.index), get_array(df1.index))
+    assert not np.shares_memory(get_array(result.index), get_array(df2.index))
+
+    result.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+
+    result.iloc[0, 1] = 0
+    assert not np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
+
+    tm.assert_frame_equal(df1, df1_orig)
+    tm.assert_frame_equal(df2, df2_orig)
+
+
+def test_join_multiple_dataframes_on_key():
+    df_index = Index(["a", "b", "c"], name="key", dtype=object)
+
+    df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
+    dfs_list = [
+        DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True)),
+        DataFrame({"c": [7, 8, 9]}, index=df_index.copy(deep=True)),
+    ]
+
+    df1_orig = df1.copy()
+    dfs_list_orig = [df.copy() for df in dfs_list]
+
+    result = df1.join(dfs_list)
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+    assert np.shares_memory(get_array(result.index), get_array(df1.index))
+    assert not np.shares_memory(get_array(result.index), get_array(dfs_list[0].index))
+    assert not np.shares_memory(get_array(result.index), get_array(dfs_list[1].index))
+
+    result.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
+    assert np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+
+    result.iloc[0, 1] = 0
+    assert not np.shares_memory(get_array(result, "b"), get_array(dfs_list[0], "b"))
+    assert np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+
+    result.iloc[0, 2] = 0
+    assert not np.shares_memory(get_array(result, "c"), get_array(dfs_list[1], "c"))
+
+    tm.assert_frame_equal(df1, df1_orig)
+    for df, df_orig in zip(dfs_list, dfs_list_orig, strict=True):
+        tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e32b88849e836502e303e73296dc1f80ae253b9
--- /dev/null
+++ b/pandas/tests/copy_view/test_indexing.py
@@ -0,0 +1,902 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_float_dtype
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+@pytest.fixture(params=["numpy", "nullable"])
+def backend(request):
+    if request.param == "numpy":
+
+        def make_dataframe(*args, **kwargs):
+            return DataFrame(*args, **kwargs)
+
+        def make_series(*args, **kwargs):
+            return Series(*args, **kwargs)
+
+    elif request.param == "nullable":
+
+        def make_dataframe(*args, **kwargs):
+            df = DataFrame(*args, **kwargs)
+            df_nullable = df.convert_dtypes()
+            # convert_dtypes will try to cast float to int if there is no loss in
+            # precision -> undo that change
+            for col in df.columns:
+                if is_float_dtype(df[col].dtype) and not is_float_dtype(
+                    df_nullable[col].dtype
+                ):
+                    df_nullable[col] = df_nullable[col].astype("Float64")
+            # copy final result to ensure we start with a fully self-owning DataFrame
+            return df_nullable.copy()
+
+        def make_series(*args, **kwargs):
+            ser = Series(*args, **kwargs)
+            return ser.convert_dtypes().copy()
+
+    return request.param, make_dataframe, make_series
+
+
+# -----------------------------------------------------------------------------
+# Indexing operations taking subset + modifying the subset/parent
+
+
+def test_subset_column_selection(backend):
+    # Case: taking a subset of the columns of a DataFrame
+    # + afterwards modifying the subset
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    subset = df[["a", "c"]]
+
+    assert subset.index is not df.index
+
+    # the subset shares memory ...
+    assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+    # ... but uses CoW when being modified
+    subset.iloc[0, 0] = 0
+
+    assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+
+    expected = DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]})
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_subset_column_selection_modify_parent(backend):
+    # Case: taking a subset of the columns of a DataFrame
+    # + afterwards modifying the parent
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+
+    subset = df[["a", "c"]]
+
+    # the subset shares memory ...
+    assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+    # ... but parent uses CoW parent when it is modified
+    df.iloc[0, 0] = 0
+
+    assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+    # different column/block still shares memory
+    assert np.shares_memory(get_array(subset, "c"), get_array(df, "c"))
+
+    expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]})
+    tm.assert_frame_equal(subset, expected)
+
+
+def test_subset_row_slice(backend):
+    # Case: taking a subset of the rows of a DataFrame using a slice
+    # + afterwards modifying the subset
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    subset = df[1:3]
+    subset._mgr._verify_integrity()
+
+    assert subset.columns is not df.columns
+    assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+
+    subset.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
+
+    subset._mgr._verify_integrity()
+
+    expected = DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3))
+    tm.assert_frame_equal(subset, expected)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_subset_column_slice(backend, dtype):
+    # Case: taking a subset of the columns of a DataFrame using a slice
+    # + afterwards modifying the subset
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+
+    subset = df.iloc[:, 1:]
+    subset._mgr._verify_integrity()
+
+    assert subset.index is not df.index
+    assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
+
+    subset.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
+
+    expected = DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)})
+    tm.assert_frame_equal(subset, expected)
+    # original parent dataframe is not modified (also not for BlockManager case,
+    # except for single block)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+@pytest.mark.parametrize(
+    "row_indexer",
+    [slice(1, 2), np.array([False, True, True]), np.array([1, 2])],
+    ids=["slice", "mask", "array"],
+)
+@pytest.mark.parametrize(
+    "column_indexer",
+    [slice("b", "c"), np.array([False, True, True]), ["b", "c"]],
+    ids=["slice", "mask", "array"],
+)
+def test_subset_loc_rows_columns(
+    backend,
+    dtype,
+    row_indexer,
+    column_indexer,
+):
+    # Case: taking a subset of the rows+columns of a DataFrame using .loc
+    # + afterwards modifying the subset
+    # Generic test for several combinations of row/column indexers, not all
+    # of those could actually return a view / need CoW (so this test is not
+    # checking memory sharing, only ensuring subsequent mutation doesn't
+    # affect the parent dataframe)
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+
+    subset = df.loc[row_indexer, column_indexer]
+
+    assert subset.index is not df.index
+    assert subset.columns is not df.columns
+
+    # modifying the subset never modifies the parent
+    subset.iloc[0, 0] = 0
+
+    expected = DataFrame(
+        {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
+    )
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+@pytest.mark.parametrize(
+    "row_indexer",
+    [slice(1, 3), np.array([False, True, True]), np.array([1, 2])],
+    ids=["slice", "mask", "array"],
+)
+@pytest.mark.parametrize(
+    "column_indexer",
+    [slice(1, 3), np.array([False, True, True]), [1, 2]],
+    ids=["slice", "mask", "array"],
+)
+def test_subset_iloc_rows_columns(
+    backend,
+    dtype,
+    row_indexer,
+    column_indexer,
+):
+    # Case: taking a subset of the rows+columns of a DataFrame using .iloc
+    # + afterwards modifying the subset
+    # Generic test for several combinations of row/column indexers, not all
+    # of those could actually return a view / need CoW (so this test is not
+    # checking memory sharing, only ensuring subsequent mutation doesn't
+    # affect the parent dataframe)
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+
+    subset = df.iloc[row_indexer, column_indexer]
+
+    assert subset.index is not df.index
+    assert subset.columns is not df.columns
+
+    # modifying the subset never modifies the parent
+    subset.iloc[0, 0] = 0
+
+    expected = DataFrame(
+        {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
+    )
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "indexer",
+    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
+    ids=["slice", "mask", "array"],
+)
+def test_subset_set_with_row_indexer(backend, indexer_si, indexer):
+    # Case: setting values with a row indexer on a viewing subset
+    # subset[indexer] = value and subset.iloc[indexer] = value
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]})
+    df_orig = df.copy()
+    subset = df[1:4]
+
+    if (
+        indexer_si is tm.setitem
+        and isinstance(indexer, np.ndarray)
+        and indexer.dtype == "int"
+    ):
+        pytest.skip("setitem with labels selects on columns")
+
+    indexer_si(subset)[indexer] = 0
+
+    expected = DataFrame(
+        {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4)
+    )
+    tm.assert_frame_equal(subset, expected)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_subset_set_with_mask(backend):
+    # Case: setting values with a mask on a viewing subset: subset[mask] = value
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]})
+    df_orig = df.copy()
+    subset = df[1:4]
+
+    mask = subset > 3
+
+    subset[mask] = 0
+
+    expected = DataFrame(
+        {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4)
+    )
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_subset_set_column(backend):
+    # Case: setting a single column on a viewing subset -> subset[col] = value
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    subset = df[1:3]
+
+    if dtype_backend == "numpy":
+        arr = np.array([10, 11], dtype="int64")
+    else:
+        arr = pd.array([10, 11], dtype="Int64")
+
+    subset["a"] = arr
+    subset._mgr._verify_integrity()
+    expected = DataFrame(
+        {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)
+    )
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_subset_set_column_with_loc(backend, dtype):
+    # Case: setting a single column with loc on a viewing subset
+    # -> subset.loc[:, col] = value
+    _, DataFrame, _ = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+    subset = df[1:3]
+
+    subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
+
+    subset._mgr._verify_integrity()
+    expected = DataFrame(
+        {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)},
+        index=range(1, 3),
+    )
+    tm.assert_frame_equal(subset, expected)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_subset_set_column_with_loc2(backend):
+    # Case: setting a single column with loc on a viewing subset
+    # -> subset.loc[:, col] = value
+    # separate test for case of DataFrame of a single column -> takes a separate
+    # code path
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+    subset = df[1:3]
+
+    subset.loc[:, "a"] = 0
+
+    subset._mgr._verify_integrity()
+    expected = DataFrame({"a": [0, 0]}, index=range(1, 3))
+    tm.assert_frame_equal(subset, expected)
+    # original parent dataframe is not modified (CoW)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_subset_set_columns(backend, dtype):
+    # Case: setting multiple columns on a viewing subset
+    # -> subset[[col1, col2]] = value
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+    subset = df[1:3]
+
+    subset[["a", "c"]] = 0
+
+    subset._mgr._verify_integrity()
+    # first and third column should certainly have no references anymore
+    assert all(subset._mgr._has_no_reference(i) for i in [0, 2])
+    expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3))
+    if dtype_backend == "nullable":
+        # there is not yet a global option, so overriding a column by setting a scalar
+        # defaults to numpy dtype even if original column was nullable
+        expected["a"] = expected["a"].astype("int64")
+        expected["c"] = expected["c"].astype("int64")
+
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "indexer",
+    [slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
+    ids=["slice", "mask", "array"],
+)
+def test_subset_set_with_column_indexer(backend, indexer):
+    # Case: setting multiple columns with a column indexer on a viewing subset
+    # -> subset.loc[:, [col1, col2]] = value
+    _, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]})
+    df_orig = df.copy()
+    subset = df[1:3]
+
+    subset.loc[:, indexer] = 0
+
+    subset._mgr._verify_integrity()
+    expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3))
+    tm.assert_frame_equal(subset, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df: df[["a", "b"]][0:2],
+        lambda df: df[0:2][["a", "b"]],
+        lambda df: df[["a", "b"]].iloc[0:2],
+        lambda df: df[["a", "b"]].loc[0:1],
+        lambda df: df[0:2].iloc[:, 0:2],
+        lambda df: df[0:2].loc[:, "a":"b"],  # type: ignore[misc]
+    ],
+    ids=[
+        "row-getitem-slice",
+        "column-getitem",
+        "row-iloc-slice",
+        "row-loc-slice",
+        "column-iloc-slice",
+        "column-loc-slice",
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_subset_chained_getitem(
+    request,
+    backend,
+    method,
+    dtype,
+):
+    # Case: creating a subset using multiple, chained getitem calls using views
+    # still needs to guarantee proper CoW behaviour
+    _, DataFrame, _ = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+
+    # modify subset -> don't modify parent
+    subset = method(df)
+
+    subset.iloc[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+    # modify parent -> don't modify subset
+    subset = method(df)
+    df.iloc[0, 0] = 0
+    expected = DataFrame({"a": [1, 2], "b": [4, 5]})
+    tm.assert_frame_equal(subset, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_subset_chained_getitem_column(backend, dtype):
+    # Case: creating a subset using multiple, chained getitem calls using views
+    # still needs to guarantee proper CoW behaviour
+    dtype_backend, DataFrame, Series = backend
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+
+    # modify subset -> don't modify parent
+    subset = df[:]["a"][0:2]
+    subset.iloc[0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+    # modify parent -> don't modify subset
+    subset = df[:]["a"][0:2]
+    df.iloc[0, 0] = 0
+    expected = Series([1, 2], name="a")
+    tm.assert_series_equal(subset, expected)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda s: s["a":"c"]["a":"b"],  # type: ignore[misc]
+        lambda s: s.iloc[0:3].iloc[0:2],
+        lambda s: s.loc["a":"c"].loc["a":"b"],  # type: ignore[misc]
+        lambda s: s.loc["a":"c"]  # type: ignore[misc]
+        .iloc[0:3]
+        .iloc[0:2]
+        .loc["a":"b"]  # type: ignore[misc]
+        .iloc[0:1],
+    ],
+    ids=["getitem", "iloc", "loc", "long-chain"],
+)
+def test_subset_chained_getitem_series(backend, method):
+    # Case: creating a subset using multiple, chained getitem calls using views
+    # still needs to guarantee proper CoW behaviour
+    _, _, Series = backend
+    s = Series([1, 2, 3], index=["a", "b", "c"])
+    s_orig = s.copy()
+
+    # modify subset -> don't modify parent
+    subset = method(s)
+    subset.iloc[0] = 0
+    tm.assert_series_equal(s, s_orig)
+
+    # modify parent -> don't modify subset
+    subset = s.iloc[0:3].iloc[0:2]
+    s.iloc[0] = 0
+    expected = Series([1, 2], index=["a", "b"])
+    tm.assert_series_equal(subset, expected)
+
+
+def test_subset_chained_single_block_row():
+    # not parametrizing this for dtype backend, since this explicitly tests single block
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    df_orig = df.copy()
+
+    # modify subset -> don't modify parent
+    subset = df[:].iloc[0].iloc[0:2]
+    subset.iloc[0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+    # modify parent -> don't modify subset
+    subset = df[:].iloc[0].iloc[0:2]
+    df.iloc[0, 0] = 0
+    expected = Series([1, 4], index=["a", "b"], name=0)
+    tm.assert_series_equal(subset, expected)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df: df[:],
+        lambda df: df.loc[:, :],
+        lambda df: df.loc[:],
+        lambda df: df.iloc[:, :],
+        lambda df: df.iloc[:],
+    ],
+    ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"],
+)
+def test_null_slice(backend, method):
+    # Case: also all variants of indexing with a null slice (:) should return
+    # new objects to ensure we correctly use CoW for the results
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    df_orig = df.copy()
+
+    df2 = method(df)
+
+    # we always return new objects (shallow copy), regardless of CoW or not
+    assert df2 is not df
+    assert df2.index is not df.index
+    assert df2.columns is not df.columns
+
+    # and those trigger CoW when mutated
+    df2.iloc[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda s: s[:],
+        lambda s: s.loc[:],
+        lambda s: s.iloc[:],
+    ],
+    ids=["getitem", "loc", "iloc"],
+)
+def test_null_slice_series(backend, method):
+    _, _, Series = backend
+    s = Series([1, 2, 3], index=["a", "b", "c"])
+    s_orig = s.copy()
+
+    s2 = method(s)
+
+    # we always return new objects, regardless of CoW or not
+    assert s2 is not s
+    assert s2.index is not s.index
+
+    # and those trigger CoW when mutated
+    s2.iloc[0] = 0
+    tm.assert_series_equal(s, s_orig)
+
+
+# TODO add more tests modifying the parent
+
+
+# -----------------------------------------------------------------------------
+# Series -- Indexing operations taking subset + modifying the subset/parent
+
+
+def test_series_getitem_slice(backend):
+    # Case: taking a slice of a Series + afterwards modifying the subset
+    _, _, Series = backend
+    s = Series([1, 2, 3], index=["a", "b", "c"])
+    s_orig = s.copy()
+
+    subset = s[:]
+    assert np.shares_memory(get_array(subset), get_array(s))
+    assert subset.index is not s.index
+
+    subset.iloc[0] = 0
+
+    assert not np.shares_memory(get_array(subset), get_array(s))
+
+    expected = Series([0, 2, 3], index=["a", "b", "c"])
+    tm.assert_series_equal(subset, expected)
+
+    # original parent series is not modified (CoW)
+    tm.assert_series_equal(s, s_orig)
+
+
+def test_series_getitem_ellipsis():
+    # Case: taking a view of a Series using Ellipsis + afterwards modifying the subset
+    s = Series([1, 2, 3])
+    s_orig = s.copy()
+
+    subset = s[...]
+    assert np.shares_memory(get_array(subset), get_array(s))
+    assert subset.index is not s.index
+
+    subset.iloc[0] = 0
+
+    assert not np.shares_memory(get_array(subset), get_array(s))
+
+    expected = Series([0, 2, 3])
+    tm.assert_series_equal(subset, expected)
+
+    # original parent series is not modified (CoW)
+    tm.assert_series_equal(s, s_orig)
+
+
+@pytest.mark.parametrize(
+    "indexer",
+    [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
+    ids=["slice", "mask", "array"],
+)
+def test_series_subset_set_with_indexer(backend, indexer_si, indexer):
+    # Case: setting values in a viewing Series with an indexer
+    _, _, Series = backend
+    s = Series([1, 2, 3], index=["a", "b", "c"])
+    s_orig = s.copy()
+    subset = s[:]
+
+    if (
+        indexer_si is tm.setitem
+        and isinstance(indexer, np.ndarray)
+        and indexer.dtype.kind == "i"
+    ):
+        # In 3.0 we treat integers as always-labels
+        with pytest.raises(KeyError):
+            indexer_si(subset)[indexer] = 0
+        return
+
+    indexer_si(subset)[indexer] = 0
+    expected = Series([0, 0, 3], index=["a", "b", "c"])
+    tm.assert_series_equal(subset, expected)
+
+    tm.assert_series_equal(s, s_orig)
+
+
+# -----------------------------------------------------------------------------
+# del operator
+
+
+def test_del_frame(backend):
+    # Case: deleting a column with `del` on a viewing child dataframe should
+    # not modify parent + update the references
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df[:]
+
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    del df2["b"]
+
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+    tm.assert_frame_equal(df, df_orig)
+    tm.assert_frame_equal(df2, df_orig[["a", "c"]])
+    df2._mgr._verify_integrity()
+
+    df.loc[0, "b"] = 200
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+    df_orig = df.copy()
+
+    df2.loc[0, "a"] = 100
+    # modifying child after deleting a column still doesn't update parent
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_del_series(backend):
+    _, _, Series = backend
+    s = Series([1, 2, 3], index=["a", "b", "c"])
+    s_orig = s.copy()
+    s2 = s[:]
+
+    assert np.shares_memory(get_array(s), get_array(s2))
+
+    del s2["a"]
+
+    assert not np.shares_memory(get_array(s), get_array(s2))
+    tm.assert_series_equal(s, s_orig)
+    tm.assert_series_equal(s2, s_orig[["b", "c"]])
+
+    # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array)
+    values = s2.values
+    s2.loc["b"] = 100
+    assert values[0] == 100
+
+
+# -----------------------------------------------------------------------------
+# Accessing column as Series
+
+
+def test_column_as_series(backend):
+    # Case: selecting a single column now also uses Copy-on-Write
+    dtype_backend, DataFrame, Series = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    s = df["a"]
+
+    assert s.index is not df.index
+    assert np.shares_memory(get_array(s, "a"), get_array(df, "a"))
+
+    s[0] = 0
+
+    expected = Series([0, 2, 3], name="a")
+    tm.assert_series_equal(s, expected)
+    # assert not np.shares_memory(s.values, get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+    # ensure cached series on getitem is not the changed series
+    tm.assert_series_equal(df["a"], df_orig["a"])
+
+
+def test_column_as_series_set_with_upcast(backend):
+    # Case: selecting a single column now also uses Copy-on-Write -> when
+    # setting a value causes an upcast, we don't need to update the parent
+    # DataFrame through the cache mechanism
+    dtype_backend, DataFrame, Series = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    s = df["a"]
+    if dtype_backend == "nullable":
+        with pytest.raises(TypeError, match="Invalid value"):
+            s[0] = "foo"
+        expected = Series([1, 2, 3], name="a")
+        tm.assert_series_equal(s, expected)
+        tm.assert_frame_equal(df, df_orig)
+        # ensure cached series on getitem is not the changed series
+        tm.assert_series_equal(df["a"], df_orig["a"])
+    else:
+        with pytest.raises(TypeError, match="Invalid value"):
+            s[0] = "foo"
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df: df["a"],
+        lambda df: df.loc[:, "a"],
+        lambda df: df.iloc[:, 0],
+    ],
+    ids=["getitem", "loc", "iloc"],
+)
+def test_column_as_series_no_item_cache(request, backend, method):
+    # Case: selecting a single column (which now also uses Copy-on-Write to protect
+    # the view) should always give a new object (i.e. not make use of a cache)
+    dtype_backend, DataFrame, _ = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    s1 = method(df)
+    s2 = method(df)
+
+    assert s1 is not s2
+    assert s1.index is not df.index
+    assert s1.index is not s2.index
+
+    s1.iloc[0] = 0
+
+    tm.assert_series_equal(s2, df_orig["a"])
+    tm.assert_frame_equal(df, df_orig)
+
+
+# TODO add tests for other indexing methods on the Series
+
+
+def test_dataframe_add_column_from_series(backend):
+    # Case: adding a new column to a DataFrame from an existing column/series
+    # -> delays copy under CoW
+    _, DataFrame, Series = backend
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
+
+    s = Series([10, 11, 12])
+    df["new"] = s
+    assert np.shares_memory(get_array(df, "new"), get_array(s))
+
+    # editing series -> doesn't modify column in frame
+    s[0] = 0
+    expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("val", [100, "a"])
+@pytest.mark.parametrize(
+    "indexer_func, indexer",
+    [
+        (tm.loc, (0, "a")),
+        (tm.iloc, (0, 0)),
+        (tm.loc, ([0], "a")),
+        (tm.iloc, ([0], 0)),
+        (tm.loc, (slice(None), "a")),
+        (tm.iloc, (slice(None), 0)),
+    ],
+)
+@pytest.mark.parametrize(
+    "col", [[0.1, 0.2, 0.3], [7, 8, 9]], ids=["mixed-block", "single-block"]
+)
+def test_set_value_copy_only_necessary_column(indexer_func, indexer, val, col):
+    # When setting inplace, only copy column that is modified instead of the whole
+    # block (by splitting the block)
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": col})
+    df_orig = df.copy()
+    view = df[:]
+
+    if val == "a":
+        with pytest.raises(TypeError, match="Invalid value"):
+            indexer_func(df)[indexer] = val
+    else:
+        indexer_func(df)[indexer] = val
+
+        assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
+        assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+        tm.assert_frame_equal(view, df_orig)
+
+
+def test_series_midx_slice():
+    ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]]))
+    ser_orig = ser.copy()
+    result = ser[1]
+    assert np.shares_memory(get_array(ser), get_array(result))
+    result.iloc[0] = 100
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_getitem_midx_slice():
+    df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2})
+    df_orig = df.copy()
+    new_df = df[("a",)]
+
+    assert not new_df._mgr._has_no_reference(0)
+
+    assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x"))
+    new_df.iloc[0, 0] = 100
+    tm.assert_frame_equal(df_orig, df)
+
+
+def test_series_midx_tuples_slice():
+    ser = Series(
+        [1, 2, 3],
+        index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
+    )
+    result = ser[(1, 2)]
+    assert np.shares_memory(get_array(ser), get_array(result))
+    result.iloc[0] = 100
+    expected = Series(
+        [1, 2, 3],
+        index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
+    )
+    tm.assert_series_equal(ser, expected)
+
+
+def test_midx_read_only_bool_indexer():
+    # GH#56635
+    def mklbl(prefix, n):
+        return [f"{prefix}{i}" for i in range(n)]
+
+    idx = pd.MultiIndex.from_product(
+        [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)]
+    )
+    cols = pd.MultiIndex.from_tuples(
+        [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"]
+    )
+    df = DataFrame(1, index=idx, columns=cols).sort_index().sort_index(axis=1)
+
+    mask = df[("a", "foo")] == 1
+    expected_mask = mask.copy()
+    result = df.loc[pd.IndexSlice[mask, :, ["C1", "C3"]], :]
+    expected = df.loc[pd.IndexSlice[:, :, ["C1", "C3"]], :]
+    tm.assert_frame_equal(result, expected)
+    tm.assert_series_equal(mask, expected_mask)
+
+
+def test_loc_enlarging_with_dataframe():
+    df = DataFrame({"a": [1, 2, 3]})
+    rhs = DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]})
+    rhs_orig = rhs.copy()
+    df.loc[:, ["b", "c"]] = rhs
+    assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b"))
+    assert np.shares_memory(get_array(df, "c"), get_array(rhs, "c"))
+    assert not df._mgr._has_no_reference(1)
+
+    df.iloc[0, 1] = 100
+    tm.assert_frame_equal(rhs, rhs_orig)
diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7baf01ecc36e5a3cfed4ba443d244ac669344fd
--- /dev/null
+++ b/pandas/tests/copy_view/test_internals.py
@@ -0,0 +1,112 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+def test_consolidate():
+    # create unconsolidated DataFrame
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
+    df["c"] = [4, 5, 6]
+
+    # take a viewing subset
+    subset = df[:]
+
+    # each block of subset references a block of df
+    assert all(blk.refs.has_reference() for blk in subset._mgr.blocks)
+
+    # consolidate the two int64 blocks
+    subset._consolidate_inplace()
+
+    # the float64 block still references the parent one because it still a view
+    assert subset._mgr.blocks[0].refs.has_reference()
+    # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values)
+    # but avoids caching df["b"]
+    assert np.shares_memory(get_array(df, "b"), get_array(subset, "b"))
+
+    # the new consolidated int64 block does not reference another
+    assert not subset._mgr.blocks[1].refs.has_reference()
+
+    # the parent dataframe now also only is linked for the float column
+    assert not df._mgr.blocks[0].refs.has_reference()
+    assert df._mgr.blocks[1].refs.has_reference()
+    assert not df._mgr.blocks[2].refs.has_reference()
+
+    # and modifying subset still doesn't modify parent
+    subset.iloc[0, 1] = 0.0
+    assert not df._mgr.blocks[1].refs.has_reference()
+    assert df.loc[0, "b"] == 0.1
+
+
+@pytest.mark.parametrize("dtype", [np.intp, np.int8])
+@pytest.mark.parametrize(
+    "locs, arr",
+    [
+        ([0], np.array([-1, -2, -3])),
+        ([1], np.array([-1, -2, -3])),
+        ([5], np.array([-1, -2, -3])),
+        ([0, 1], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([0, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([0, 1, 2], np.array([[-1, -2, -3], [-4, -5, -6], [-4, -5, -6]]).T),
+        ([1, 2], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+        ([1, 3], np.array([[-1, -2, -3], [-4, -5, -6]]).T),
+    ],
+)
+def test_iset_splits_blocks_inplace(locs, arr, dtype):
+    # Nothing currently calls iset with
+    # more than 1 loc with inplace=True (only happens with inplace=False)
+    # but ensure that it works
+    df = DataFrame(
+        {
+            "a": [1, 2, 3],
+            "b": [4, 5, 6],
+            "c": [7, 8, 9],
+            "d": [10, 11, 12],
+            "e": [13, 14, 15],
+            "f": Series(["a", "b", "c"], dtype=object),
+        },
+    )
+    arr = arr.astype(dtype)
+    df_orig = df.copy()
+    df2 = df.copy(deep=False)  # Trigger a CoW (if enabled, otherwise makes copy)
+    df2._mgr.iset(locs, arr, inplace=True)
+
+    tm.assert_frame_equal(df, df_orig)
+    for i, col in enumerate(df.columns):
+        if i not in locs:
+            assert np.shares_memory(get_array(df, col), get_array(df2, col))
+
+
+def test_exponential_backoff():
+    # GH#55518
+    df = DataFrame({"a": [1, 2, 3]})
+    for i in range(490):
+        df.copy(deep=False)
+
+    assert len(df._mgr.blocks[0].refs.referenced_blocks) == 491
+
+    df = DataFrame({"a": [1, 2, 3]})
+    dfs = [df.copy(deep=False) for i in range(510)]
+
+    for i in range(20):
+        df.copy(deep=False)
+    assert len(df._mgr.blocks[0].refs.referenced_blocks) == 531
+    assert df._mgr.blocks[0].refs.clear_counter == 1000
+
+    for i in range(500):
+        df.copy(deep=False)
+
+    # Don't reduce since we still have over 500 objects alive
+    assert df._mgr.blocks[0].refs.clear_counter == 1000
+
+    dfs = dfs[:300]
+    for i in range(500):
+        df.copy(deep=False)
+
+    # Reduce since there are less than 500 objects alive
+    assert df._mgr.blocks[0].refs.clear_counter == 500
diff --git a/pandas/tests/copy_view/test_interp_fillna.py b/pandas/tests/copy_view/test_interp_fillna.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5880e99df5d7122c1bb396189ae735dfe2e77ba
--- /dev/null
+++ b/pandas/tests/copy_view/test_interp_fillna.py
@@ -0,0 +1,307 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    NA,
+    DataFrame,
+    Interval,
+    NaT,
+    Series,
+    Timestamp,
+    interval_range,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+@pytest.mark.parametrize("method", ["pad", "nearest", "linear"])
+def test_interpolate_no_op(method):
+    df = DataFrame({"a": [1, 2]})
+    df_orig = df.copy()
+
+    if method == "pad":
+        msg = f"Can not interpolate with method={method}"
+        with pytest.raises(ValueError, match=msg):
+            df.interpolate(method=method)
+    else:
+        result = df.interpolate(method=method)
+        assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+        assert result.index is not df.index
+        assert result.columns is not df.columns
+
+        result.iloc[0, 0] = 100
+
+        assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+        tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_interp_fill_functions(func):
+    # Check that these takes the same code paths as interpolate
+    df = DataFrame({"a": [1, 2]})
+    df_orig = df.copy()
+
+    result = getattr(df, func)()
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    assert result.index is not df.index
+    assert result.columns is not df.columns
+
+    result.iloc[0, 0] = 100
+    assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+@pytest.mark.parametrize(
+    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
+)
+def test_interpolate_triggers_copy(vals, func):
+    df = DataFrame({"a": vals})
+    result = getattr(df, func)()
+
+    assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    # Check that we don't have references when triggering a copy
+    assert result._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize(
+    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
+)
+def test_interpolate_inplace_no_reference_no_copy(vals):
+    df = DataFrame({"a": vals})
+    arr = get_array(df, "a")
+    df.interpolate(method="linear", inplace=True)
+
+    assert np.shares_memory(arr, get_array(df, "a"))
+    # Check that we don't have references when triggering a copy
+    assert df._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize(
+    "vals", [[1, np.nan, 2], [Timestamp("2019-12-31"), NaT, Timestamp("2020-12-31")]]
+)
+def test_interpolate_inplace_with_refs(vals):
+    df = DataFrame({"a": [1, np.nan, 2]})
+    df_orig = df.copy()
+    arr = get_array(df, "a")
+    view = df[:]
+    df.interpolate(method="linear", inplace=True)
+    # Check that copy was triggered in interpolate and that we don't
+    # have any references left
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    tm.assert_frame_equal(df_orig, view)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+@pytest.mark.parametrize("dtype", ["float64", "Float64"])
+def test_interp_fill_functions_inplace(func, dtype):
+    # Check that these takes the same code paths as interpolate
+    df = DataFrame({"a": [1, np.nan, 2]}, dtype=dtype)
+    df_orig = df.copy()
+    arr = get_array(df, "a")
+    view = df[:]
+
+    getattr(df, func)(inplace=True)
+
+    # Check that copy was triggered in interpolate and that we don't
+    # have any references left
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    tm.assert_frame_equal(df_orig, view)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)
+
+
+def test_interpolate_cannot_with_object_dtype():
+    df = DataFrame({"a": ["a", np.nan, "c"], "b": 1})
+    df["a"] = df["a"].astype(object)
+
+    msg = "DataFrame cannot interpolate with object dtype"
+    with pytest.raises(TypeError, match=msg):
+        df.interpolate()
+
+
+def test_interpolate_object_convert_no_op():
+    df = DataFrame({"a": ["a", "b", "c"], "b": 1})
+    df["a"] = df["a"].astype(object)
+    arr_a = get_array(df, "a")
+
+    # Now CoW makes a copy, it should not!
+    assert df._mgr._has_no_reference(0)
+    assert np.shares_memory(arr_a, get_array(df, "a"))
+
+
+def test_interpolate_object_convert_copies():
+    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
+    arr_a = get_array(df, "a")
+    msg = "Can not interpolate with method=pad"
+    with pytest.raises(ValueError, match=msg):
+        df.interpolate(method="pad", inplace=True)
+
+    assert df._mgr._has_no_reference(0)
+    assert np.shares_memory(arr_a, get_array(df, "a"))
+
+
+def test_interpolate_downcast_reference_triggers_copy():
+    df = DataFrame({"a": [1, np.nan, 2.5], "b": 1})
+    df_orig = df.copy()
+    arr_a = get_array(df, "a")
+    view = df[:]
+
+    msg = "Can not interpolate with method=pad"
+    with pytest.raises(ValueError, match=msg):
+        df.interpolate(method="pad", inplace=True)
+        assert df._mgr._has_no_reference(0)
+        assert not np.shares_memory(arr_a, get_array(df, "a"))
+
+    tm.assert_frame_equal(df_orig, view)
+
+
+def test_fillna():
+    df = DataFrame({"a": [1.5, np.nan], "b": 1})
+    df_orig = df.copy()
+
+    df2 = df.fillna(5.5)
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert df2.index is not df.index
+    assert df2.columns is not df.columns
+
+    df2.iloc[0, 1] = 100
+    tm.assert_frame_equal(df_orig, df)
+
+
+def test_fillna_dict():
+    df = DataFrame({"a": [1.5, np.nan], "b": 1})
+    df_orig = df.copy()
+
+    df2 = df.fillna({"a": 100.5})
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.iloc[0, 1] = 100
+    tm.assert_frame_equal(df_orig, df)
+
+
+def test_fillna_inplace():
+    df = DataFrame({"a": [1.5, np.nan], "b": 1})
+    arr_a = get_array(df, "a")
+    arr_b = get_array(df, "b")
+
+    df.fillna(5.5, inplace=True)
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+    assert np.shares_memory(get_array(df, "b"), arr_b)
+    assert df._mgr._has_no_reference(0)
+    assert df._mgr._has_no_reference(1)
+
+
+def test_fillna_inplace_reference():
+    df = DataFrame({"a": [1.5, np.nan], "b": 1})
+    df_orig = df.copy()
+    arr_a = get_array(df, "a")
+    arr_b = get_array(df, "b")
+    view = df[:]
+
+    df.fillna(5.5, inplace=True)
+    assert not np.shares_memory(get_array(df, "a"), arr_a)
+    assert np.shares_memory(get_array(df, "b"), arr_b)
+    assert view._mgr._has_no_reference(0)
+    assert df._mgr._has_no_reference(0)
+    tm.assert_frame_equal(view, df_orig)
+    expected = DataFrame({"a": [1.5, 5.5], "b": 1})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_fillna_interval_inplace_reference():
+    # Set dtype explicitly to avoid implicit cast when setting nan
+    ser = Series(
+        interval_range(start=0, end=5), name="a", dtype="interval[float64, right]"
+    )
+    ser.iloc[1] = np.nan
+
+    ser_orig = ser.copy()
+    view = ser[:]
+    ser.fillna(value=Interval(left=0, right=5), inplace=True)
+
+    assert not np.shares_memory(
+        get_array(ser, "a").left.values, get_array(view, "a").left.values
+    )
+    tm.assert_series_equal(view, ser_orig)
+
+
+def test_fillna_series_empty_arg():
+    ser = Series([1, np.nan, 2])
+    ser_orig = ser.copy()
+    result = ser.fillna({})
+    assert np.shares_memory(get_array(ser), get_array(result))
+
+    ser.iloc[0] = 100.5
+    tm.assert_series_equal(ser_orig, result)
+
+
+def test_fillna_series_empty_arg_inplace():
+    ser = Series([1, np.nan, 2])
+    arr = get_array(ser)
+    ser.fillna({}, inplace=True)
+
+    assert np.shares_memory(get_array(ser), arr)
+    assert ser._mgr._has_no_reference(0)
+
+
+def test_fillna_ea_noop_shares_memory(any_numeric_ea_and_arrow_dtype):
+    df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
+    df_orig = df.copy()
+    df2 = df.fillna(100)
+
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert not df2._mgr._has_no_reference(1)
+    tm.assert_frame_equal(df_orig, df)
+
+    df2.iloc[0, 1] = 100
+    assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert df2._mgr._has_no_reference(1)
+    assert df._mgr._has_no_reference(1)
+    tm.assert_frame_equal(df_orig, df)
+
+
+def test_fillna_inplace_ea_noop_shares_memory(any_numeric_ea_and_arrow_dtype):
+    df = DataFrame({"a": [1, NA, 3], "b": 1}, dtype=any_numeric_ea_and_arrow_dtype)
+    df_orig = df.copy()
+    view = df[:]
+    df.fillna(100, inplace=True)
+    assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
+
+    assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
+    assert not df._mgr._has_no_reference(1)
+    assert not view._mgr._has_no_reference(1)
+
+    df.iloc[0, 1] = 100
+    tm.assert_frame_equal(df_orig, view)
+
+
+def test_fillna_chained_assignment():
+    df = DataFrame({"a": [1, np.nan, 2], "b": 1})
+    df_orig = df.copy()
+    with tm.raises_chained_assignment_error():
+        df["a"].fillna(100, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+    with tm.raises_chained_assignment_error():
+        df[["a"]].fillna(100, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("func", ["interpolate", "ffill", "bfill"])
+def test_interpolate_chained_assignment(func):
+    df = DataFrame({"a": [1, np.nan, 2], "b": 1})
+    df_orig = df.copy()
+    with tm.raises_chained_assignment_error():
+        getattr(df["a"], func)(inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+    with tm.raises_chained_assignment_error():
+        getattr(df[["a"]], func)(inplace=True)
+    tm.assert_frame_equal(df, df_orig)
diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb19132d16a072304faa16599711a7d4be4122b8
--- /dev/null
+++ b/pandas/tests/copy_view/test_methods.py
@@ -0,0 +1,1601 @@
+import numpy as np
+import pytest
+
+from pandas.compat import HAS_PYARROW
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Period,
+    Series,
+    Timestamp,
+    date_range,
+    period_range,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+from pandas.util.version import Version
+
+
+def test_copy():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_copy = df.copy()
+
+    # the deep copy by defaults takes a shallow copy of the Index
+    assert df_copy.index is not df.index
+    assert df_copy.columns is not df.columns
+    assert df_copy.index.is_(df.index)
+    assert df_copy.columns.is_(df.columns)
+
+    # the deep copy doesn't share memory
+    assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
+    assert not df_copy._mgr.blocks[0].refs.has_reference()
+    assert not df_copy._mgr.blocks[1].refs.has_reference()
+
+    assert df_copy.index is not df.index
+    assert df_copy.columns is not df.columns
+
+    # mutating copy doesn't mutate original
+    df_copy.iloc[0, 0] = 0
+    assert df.iloc[0, 0] == 1
+
+
+def test_copy_shallow():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_copy = df.copy(deep=False)
+
+    # the shallow copy also makes a shallow copy of the index
+    assert df_copy.index is not df.index
+    assert df_copy.columns is not df.columns
+    assert df_copy.index.is_(df.index)
+    assert df_copy.columns.is_(df.columns)
+
+    # the shallow copy still shares memory
+    assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
+    assert df_copy._mgr.blocks[0].refs.has_reference()
+    assert df_copy._mgr.blocks[1].refs.has_reference()
+
+    # mutating shallow copy doesn't mutate original
+    df_copy.iloc[0, 0] = 0
+    assert df.iloc[0, 0] == 1
+    # mutating triggered a copy-on-write -> no longer shares memory
+    assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
+    # but still shares memory for the other columns/blocks
+    assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c"))
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+@pytest.mark.parametrize("copy", [True, None, False])
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df, copy: df.rename(columns=str.lower, copy=copy),
+        lambda df, copy: df.reindex(columns=["a", "c"], copy=copy),
+        lambda df, copy: df.reindex_like(df, copy=copy),
+        lambda df, copy: df.align(df, copy=copy)[0],
+        lambda df, copy: df.set_axis(["a", "b", "c"], axis="index", copy=copy),
+        lambda df, copy: df.rename_axis(index="test", copy=copy),
+        lambda df, copy: df.rename_axis(columns="test", copy=copy),
+        lambda df, copy: df.astype({"b": "int64"}, copy=copy),
+        # lambda df, copy: df.swaplevel(0, 0, copy=copy),
+        lambda df, copy: df.truncate(0, 5, copy=copy),
+        lambda df, copy: df.infer_objects(copy=copy),
+        lambda df, copy: df.to_timestamp(copy=copy),
+        lambda df, copy: df.to_period(freq="D", copy=copy),
+        lambda df, copy: df.tz_localize("US/Central", copy=copy),
+        lambda df, copy: df.tz_convert("US/Central", copy=copy),
+        lambda df, copy: df.set_flags(allows_duplicate_labels=False, copy=copy),
+    ],
+    ids=[
+        "rename",
+        "reindex",
+        "reindex_like",
+        "align",
+        "set_axis",
+        "rename_axis0",
+        "rename_axis1",
+        "astype",
+        # "swaplevel",  # only series
+        "truncate",
+        "infer_objects",
+        "to_timestamp",
+        "to_period",
+        "tz_localize",
+        "tz_convert",
+        "set_flags",
+    ],
+)
+def test_methods_copy_keyword(request, method, copy):
+    index = None
+    if "to_timestamp" in request.node.callspec.id:
+        index = period_range("2012-01-01", freq="D", periods=3)
+    elif "to_period" in request.node.callspec.id:
+        index = date_range("2012-01-01", freq="D", periods=3)
+    elif "tz_localize" in request.node.callspec.id:
+        index = date_range("2012-01-01", freq="D", periods=3)
+    elif "tz_convert" in request.node.callspec.id:
+        index = date_range("2012-01-01", freq="D", periods=3, tz="Europe/Brussels")
+
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=index)
+    df2 = method(df, copy=copy)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+@pytest.mark.parametrize("copy", [True, None, False])
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda ser, copy: ser.rename(index={0: 100}, copy=copy),
+        lambda ser, copy: ser.rename(None, copy=copy),
+        lambda ser, copy: ser.reindex(index=ser.index, copy=copy),
+        lambda ser, copy: ser.reindex_like(ser, copy=copy),
+        lambda ser, copy: ser.align(ser, copy=copy)[0],
+        lambda ser, copy: ser.set_axis(["a", "b", "c"], axis="index", copy=copy),
+        lambda ser, copy: ser.rename_axis(index="test", copy=copy),
+        lambda ser, copy: ser.astype("int64", copy=copy),
+        lambda ser, copy: ser.swaplevel(0, 1, copy=copy),
+        lambda ser, copy: ser.truncate(0, 5, copy=copy),
+        lambda ser, copy: ser.infer_objects(copy=copy),
+        lambda ser, copy: ser.to_timestamp(copy=copy),
+        lambda ser, copy: ser.to_period(freq="D", copy=copy),
+        lambda ser, copy: ser.tz_localize("US/Central", copy=copy),
+        lambda ser, copy: ser.tz_convert("US/Central", copy=copy),
+        lambda ser, copy: ser.set_flags(allows_duplicate_labels=False, copy=copy),
+    ],
+    ids=[
+        "rename (dict)",
+        "rename",
+        "reindex",
+        "reindex_like",
+        "align",
+        "set_axis",
+        "rename_axis0",
+        "astype",
+        "swaplevel",
+        "truncate",
+        "infer_objects",
+        "to_timestamp",
+        "to_period",
+        "tz_localize",
+        "tz_convert",
+        "set_flags",
+    ],
+)
+def test_methods_series_copy_keyword(request, method, copy):
+    index = None
+    if "to_timestamp" in request.node.callspec.id:
+        index = period_range("2012-01-01", freq="D", periods=3)
+    elif "to_period" in request.node.callspec.id:
+        index = date_range("2012-01-01", freq="D", periods=3)
+    elif "tz_localize" in request.node.callspec.id:
+        index = date_range("2012-01-01", freq="D", periods=3)
+    elif "tz_convert" in request.node.callspec.id:
+        index = date_range("2012-01-01", freq="D", periods=3, tz="Europe/Brussels")
+    elif "swaplevel" in request.node.callspec.id:
+        index = MultiIndex.from_arrays([[1, 2, 3], [4, 5, 6]])
+
+    ser = Series([1, 2, 3], index=index)
+    ser2 = method(ser, copy=copy)
+    assert np.shares_memory(get_array(ser2), get_array(ser))
+
+
+# -----------------------------------------------------------------------------
+# DataFrame methods returning new DataFrame using shallow copy
+
+
+def test_reset_index():
+    # Case: resetting the index (i.e. adding a new column) + mutating the
+    # resulting dataframe
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12]
+    )
+    df_orig = df.copy()
+    df2 = df.reset_index()
+    df2._mgr._verify_integrity()
+
+    # still shares memory (df2 is a shallow copy)
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    # mutating df2 triggers a copy-on-write for that column / block
+    df2.iloc[0, 2] = 0
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("index", [pd.RangeIndex(0, 2), Index([1, 2])])
+def test_reset_index_series_drop(index):
+    ser = Series([1, 2], index=index)
+    ser_orig = ser.copy()
+    ser2 = ser.reset_index(drop=True)
+    assert np.shares_memory(get_array(ser), get_array(ser2))
+    assert not ser._mgr._has_no_reference(0)
+
+    ser2.iloc[0] = 100
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_groupby_column_index_in_references():
+    df = DataFrame(
+        {"A": ["a", "b", "c", "d"], "B": [1, 2, 3, 4], "C": ["a", "a", "b", "b"]}
+    )
+    df = df.set_index("A")
+    key = df["C"]
+    result = df.groupby(key, observed=True).sum()
+    expected = df.groupby("C", observed=True).sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_modify_series():
+    # https://github.com/pandas-dev/pandas/issues/63219
+    # Modifying a Series after using it to groupby should not impact
+    # the groupby operation.
+    ser = Series([1, 2, 1])
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    gb = df.groupby(ser)
+    ser.iloc[0] = 100
+    result = gb.sum()
+    expected = DataFrame({"a": [4, 2], "b": [10, 5]}, index=[1, 2])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rename_columns():
+    # Case: renaming columns returns a new dataframe
+    # + afterwards modifying the result
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.rename(columns=str.upper)
+
+    assert np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "C"), get_array(df, "c"))
+    expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]})
+    tm.assert_frame_equal(df2, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_rename_columns_modify_parent():
+    # Case: renaming columns returns a new dataframe
+    # + afterwards modifying the original (parent) dataframe
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df2 = df.rename(columns=str.upper)
+    df2_orig = df2.copy()
+
+    assert np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
+    df.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "C"), get_array(df, "c"))
+    expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    tm.assert_frame_equal(df, expected)
+    tm.assert_frame_equal(df2, df2_orig)
+
+
+def test_pipe():
+    df = DataFrame({"a": [1, 2, 3], "b": 1.5})
+    df_orig = df.copy()
+
+    def testfunc(df):
+        return df
+
+    df2 = df.pipe(testfunc)
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column
+    df2.iloc[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+
+def test_pipe_modify_df():
+    df = DataFrame({"a": [1, 2, 3], "b": 1.5})
+    df_orig = df.copy()
+
+    def testfunc(df):
+        df.iloc[0, 0] = 100
+        return df
+
+    df2 = df.pipe(testfunc)
+
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+    tm.assert_frame_equal(df, df_orig)
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+
+def test_reindex_columns():
+    # Case: reindexing the column returns a new dataframe
+    # + afterwards modifying the result
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.reindex(columns=["a", "c"])
+
+    # still shares memory (df2 is a shallow copy)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    # mutating df2 triggers a copy-on-write for that column
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        lambda idx: idx,
+        lambda idx: idx.view(),
+        lambda idx: idx.copy(),
+        lambda idx: list(idx),
+    ],
+    ids=["identical", "view", "copy", "values"],
+)
+def test_reindex_rows(index):
+    # Case: reindexing the rows with an index that matches the current index
+    # can use a shallow copy
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.reindex(index=index(df.index))
+
+    # still shares memory (df2 is a shallow copy)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    # mutating df2 triggers a copy-on-write for that column
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_drop_on_column():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.drop(columns="a")
+    df2._mgr._verify_integrity()
+
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_select_dtypes():
+    # Case: selecting columns using `select_dtypes()` returns a new dataframe
+    # + afterwards modifying the result
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.select_dtypes("int64")
+    df2._mgr._verify_integrity()
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "filter_kwargs", [{"items": ["a"]}, {"like": "a"}, {"regex": "a"}]
+)
+def test_filter(filter_kwargs):
+    # Case: selecting columns using `filter()` returns a new dataframe
+    # + afterwards modifying the result
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.filter(**filter_kwargs)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column/block
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_shift_no_op():
+    df = DataFrame(
+        [[1, 2], [3, 4], [5, 6]],
+        index=date_range("2020-01-01", "2020-01-03"),
+        columns=["a", "b"],
+    )
+    df_orig = df.copy()
+    df2 = df.shift(periods=0)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert df2.index is not df.index
+    assert df2.columns is not df.columns
+
+    df.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    tm.assert_frame_equal(df2, df_orig)
+
+
+def test_shift_index():
+    df = DataFrame(
+        [[1, 2], [3, 4], [5, 6]],
+        index=date_range("2020-01-01", "2020-01-03"),
+        columns=["a", "b"],
+    )
+    df2 = df.shift(periods=1, axis=0)
+
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert df2.index is not df.index
+    assert df2.columns is not df.columns
+
+
+def test_shift_rows_freq():
+    df = DataFrame(
+        [[1, 2], [3, 4], [5, 6]],
+        index=date_range("2020-01-01", "2020-01-03"),
+        columns=["a", "b"],
+    )
+    df_orig = df.copy()
+    df_orig.index = date_range("2020-01-02", "2020-01-04")
+    df2 = df.shift(periods=1, freq="1D")
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    df.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+    tm.assert_frame_equal(df2, df_orig)
+
+
+def test_shift_columns():
+    df = DataFrame(
+        [[1, 2], [3, 4], [5, 6]], columns=date_range("2020-01-01", "2020-01-02")
+    )
+    df2 = df.shift(periods=1, axis=1)
+
+    assert np.shares_memory(get_array(df2, "2020-01-02"), get_array(df, "2020-01-01"))
+    df.iloc[0, 0] = 0
+    assert not np.shares_memory(
+        get_array(df2, "2020-01-02"), get_array(df, "2020-01-01")
+    )
+    expected = DataFrame(
+        [[np.nan, 1], [np.nan, 3], [np.nan, 5]],
+        columns=date_range("2020-01-01", "2020-01-02"),
+    )
+    tm.assert_frame_equal(df2, expected)
+
+
+def test_pop():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    view_original = df[:]
+    result = df.pop("a")
+
+    assert np.shares_memory(result.values, get_array(view_original, "a"))
+    assert np.shares_memory(get_array(df, "b"), get_array(view_original, "b"))
+
+    result.iloc[0] = 0
+    assert not np.shares_memory(result.values, get_array(view_original, "a"))
+    df.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df, "b"), get_array(view_original, "b"))
+    tm.assert_frame_equal(view_original, df_orig)
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda x, y: x.align(y),
+        lambda x, y: x.align(y.a, axis=0),
+        lambda x, y: x.align(y.a.iloc[slice(0, 1)], axis=1),
+    ],
+)
+def test_align_frame(func):
+    df = DataFrame({"a": [1, 2, 3], "b": "a"})
+    df_orig = df.copy()
+    df_changed = df[["b", "a"]].copy()
+    df2, _ = func(df, df_changed)
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_align_series():
+    ser = Series([1, 2])
+    ser_orig = ser.copy()
+    ser_other = ser.copy()
+    ser2, ser_other_result = ser.align(ser_other)
+
+    assert np.shares_memory(ser2.values, ser.values)
+    assert np.shares_memory(ser_other_result.values, ser_other.values)
+    ser2.iloc[0] = 0
+    ser_other_result.iloc[0] = 0
+    assert not np.shares_memory(ser2.values, ser.values)
+    assert not np.shares_memory(ser_other_result.values, ser_other.values)
+    tm.assert_series_equal(ser, ser_orig)
+    tm.assert_series_equal(ser_other, ser_orig)
+
+
+def test_align_copy_false():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df_orig = df.copy()
+    df2, df3 = df.align(df)
+
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "a"] = 0
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+
+    df3.loc[0, "a"] = 0
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+
+
+def test_align_with_series_copy_false():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    ser = Series([1, 2, 3], name="x")
+    ser_orig = ser.copy()
+    df_orig = df.copy()
+    df2, ser2 = df.align(ser, axis=0)
+
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+    assert np.shares_memory(get_array(ser, "x"), get_array(ser2, "x"))
+
+    df2.loc[0, "a"] = 0
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+
+    ser2.loc[0] = 0
+    tm.assert_series_equal(ser, ser_orig)  # Original is unchanged
+
+
+def test_to_frame():
+    # Case: converting a Series to a DataFrame with to_frame
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+
+    df = ser[:].to_frame()
+
+    # currently this always returns a "view"
+    assert np.shares_memory(ser.values, get_array(df, 0))
+
+    df.iloc[0, 0] = 0
+
+    # mutating df triggers a copy-on-write for that column
+    assert not np.shares_memory(ser.values, get_array(df, 0))
+    tm.assert_series_equal(ser, ser_orig)
+
+    # modify original series -> don't modify dataframe
+    df = ser[:].to_frame()
+    ser.iloc[0] = 0
+
+    tm.assert_frame_equal(df, ser_orig.to_frame())
+
+    df = ser.to_frame()
+    assert df.index is not ser.index
+
+
+@pytest.mark.parametrize(
+    "method, idx",
+    [
+        (lambda df: df.copy(deep=False).copy(deep=False), 0),
+        (lambda df: df.reset_index().reset_index(), 2),
+        (lambda df: df.rename(columns=str.upper).rename(columns=str.lower), 0),
+        (lambda df: df.copy(deep=False).select_dtypes(include="number"), 0),
+    ],
+    ids=["shallow-copy", "reset_index", "rename", "select_dtypes"],
+)
+def test_chained_methods(method, idx):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    # modify df2 -> don't modify df
+    df2 = method(df)
+    df2.iloc[0, idx] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+    # modify df -> don't modify df2
+    df2 = method(df)
+    df.iloc[0, 0] = 0
+    tm.assert_frame_equal(df2.iloc[:, idx:], df_orig)
+
+
+@pytest.mark.parametrize("obj", [Series([1, 2], name="a"), DataFrame({"a": [1, 2]})])
+def test_to_timestamp(obj):
+    obj.index = Index([Period("2012-1-1", freq="D"), Period("2012-1-2", freq="D")])
+
+    obj_orig = obj.copy()
+    obj2 = obj.to_timestamp()
+
+    assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a"))
+
+    # mutating obj2 triggers a copy-on-write for that column / block
+    obj2.iloc[0] = 0
+    assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a"))
+    tm.assert_equal(obj, obj_orig)
+
+
+@pytest.mark.parametrize("obj", [Series([1, 2], name="a"), DataFrame({"a": [1, 2]})])
+def test_to_period(obj):
+    obj.index = Index([Timestamp("2019-12-31"), Timestamp("2020-12-31")])
+
+    obj_orig = obj.copy()
+    obj2 = obj.to_period(freq="Y")
+
+    assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a"))
+
+    # mutating obj2 triggers a copy-on-write for that column / block
+    obj2.iloc[0] = 0
+    assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a"))
+    tm.assert_equal(obj, obj_orig)
+
+
+def test_set_index():
+    # GH 49473
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.set_index("a")
+
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+    # mutating df2 triggers a copy-on-write for that column / block
+    df2.iloc[0, 1] = 0
+    assert not np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_set_index_mutating_parent_does_not_mutate_index():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+    result = df.set_index("a")
+    expected = result.copy()
+
+    df.iloc[0, 0] = 100
+    tm.assert_frame_equal(result, expected)
+
+
+def test_add_prefix():
+    # GH 49473
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.add_prefix("CoW_")
+
+    assert np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a"))
+    df2.iloc[0, 0] = 0
+
+    assert not np.shares_memory(get_array(df2, "CoW_a"), get_array(df, "a"))
+
+    assert np.shares_memory(get_array(df2, "CoW_c"), get_array(df, "c"))
+    expected = DataFrame(
+        {"CoW_a": [0, 2, 3], "CoW_b": [4, 5, 6], "CoW_c": [0.1, 0.2, 0.3]}
+    )
+    tm.assert_frame_equal(df2, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_add_suffix():
+    # GH 49473
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.add_suffix("_CoW")
+    assert np.shares_memory(get_array(df2, "a_CoW"), get_array(df, "a"))
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a_CoW"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "c_CoW"), get_array(df, "c"))
+    expected = DataFrame(
+        {"a_CoW": [0, 2, 3], "b_CoW": [4, 5, 6], "c_CoW": [0.1, 0.2, 0.3]}
+    )
+    tm.assert_frame_equal(df2, expected)
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("axis, val", [(0, 5.5), (1, np.nan)])
+def test_dropna(axis, val):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, val, 6], "c": "d"})
+    df_orig = df.copy()
+    df2 = df.dropna(axis=axis)
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("val", [5, 5.5])
+def test_dropna_series(val):
+    ser = Series([1, val, 4])
+    ser_orig = ser.copy()
+    ser2 = ser.dropna()
+    assert np.shares_memory(ser2.values, ser.values)
+
+    ser2.iloc[0] = 0
+    assert not np.shares_memory(ser2.values, ser.values)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda df: df.head(),
+        lambda df: df.head(2),
+        lambda df: df.tail(),
+        lambda df: df.tail(3),
+    ],
+)
+def test_head_tail(method):
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = method(df)
+    df2._mgr._verify_integrity()
+
+    # We are explicitly deviating for CoW here to make an eager copy (avoids
+    # tracking references for very cheap ops)
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+    # modify df2 to trigger CoW for that block
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_infer_objects(using_infer_string):
+    df = DataFrame(
+        {"a": [1, 2], "b": Series(["x", "y"], dtype=object), "c": 1, "d": "x"}
+    )
+    df_orig = df.copy()
+    df2 = df.infer_objects()
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    if using_infer_string and HAS_PYARROW:
+        assert not tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    else:
+        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+    df2.iloc[0, 0] = 0
+    df2.iloc[0, 1] = "d"
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_infer_objects_no_reference(using_infer_string):
+    df = DataFrame(
+        {
+            "a": [1, 2],
+            "b": Series(["x", "y"], dtype=object),
+            "c": 1,
+            "d": Series(
+                [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
+            ),
+            "e": Series(["z", "w"], dtype=object),
+        }
+    )
+    df = df.infer_objects()
+
+    arr_a = get_array(df, "a")
+    arr_b = get_array(df, "b")
+    arr_d = get_array(df, "d")
+
+    df.iloc[0, 0] = 0
+    df.iloc[0, 1] = "d"
+    df.iloc[0, 3] = Timestamp("2018-12-31")
+    assert np.shares_memory(arr_a, get_array(df, "a"))
+    if using_infer_string and HAS_PYARROW:
+        # note that the underlying memory of arr_b has been copied anyway
+        # because of the assignment, but the EA is updated inplace so still
+        # appears the share memory
+        assert tm.shares_memory(arr_b, get_array(df, "b"))
+    else:
+        # TODO(CoW): Block splitting causes references here
+        assert not np.shares_memory(arr_b, get_array(df, "b"))
+    assert np.shares_memory(arr_d, get_array(df, "d"))
+
+
+def test_infer_objects_reference():
+    df = DataFrame(
+        {
+            "a": [1, 2],
+            "b": Series(["x", "y"], dtype=object),
+            "c": 1,
+            "d": Series(
+                [Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype="object"
+            ),
+        }
+    )
+    view = df[:]  # noqa: F841
+    df = df.infer_objects()
+
+    arr_a = get_array(df, "a")
+    arr_b = get_array(df, "b")
+    arr_d = get_array(df, "d")
+
+    df.iloc[0, 0] = 0
+    df.iloc[0, 1] = "d"
+    df.iloc[0, 3] = Timestamp("2018-12-31")
+    assert not np.shares_memory(arr_a, get_array(df, "a"))
+    assert not np.shares_memory(arr_b, get_array(df, "b"))
+    assert np.shares_memory(arr_d, get_array(df, "d"))
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"before": "a", "after": "b", "axis": 1},
+        {"before": 0, "after": 1, "axis": 0},
+    ],
+)
+def test_truncate(kwargs):
+    df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2})
+    df_orig = df.copy()
+    df2 = df.truncate(**kwargs)
+    df2._mgr._verify_integrity()
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("method", ["assign", "drop_duplicates"])
+def test_assign_drop_duplicates(method):
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+    df2 = getattr(df, method)()
+    df2._mgr._verify_integrity()
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})])
+def test_take(obj):
+    # Check that no copy is made when we take all rows in original order
+    obj_orig = obj.copy()
+    obj2 = obj.take([0, 1])
+    assert np.shares_memory(obj2.values, obj.values)
+
+    obj2.iloc[0] = 0
+    assert not np.shares_memory(obj2.values, obj.values)
+    tm.assert_equal(obj, obj_orig)
+
+
+@pytest.mark.parametrize("obj", [Series([1, 2]), DataFrame({"a": [1, 2]})])
+def test_between_time(obj):
+    obj.index = date_range("2018-04-09", periods=2, freq="1D20min")
+    obj_orig = obj.copy()
+    obj2 = obj.between_time("0:00", "1:00")
+    assert np.shares_memory(obj2.values, obj.values)
+
+    obj2.iloc[0] = 0
+    assert not np.shares_memory(obj2.values, obj.values)
+    tm.assert_equal(obj, obj_orig)
+
+
+def test_reindex_like():
+    df = DataFrame({"a": [1, 2], "b": "a"})
+    other = DataFrame({"b": "a", "a": [1, 2]})
+
+    df_orig = df.copy()
+    df2 = df.reindex_like(other)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 1] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_sort_index():
+    # GH 49473
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+    ser2 = ser.sort_index()
+    assert np.shares_memory(ser.values, ser2.values)
+
+    # mutating ser triggers a copy-on-write for the column / block
+    ser2.iloc[0] = 0
+    assert not np.shares_memory(ser2.values, ser.values)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize(
+    "obj, kwargs",
+    [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})],
+)
+def test_sort_values(obj, kwargs):
+    obj_orig = obj.copy()
+    obj2 = obj.sort_values(**kwargs)
+    assert np.shares_memory(get_array(obj2, "a"), get_array(obj, "a"))
+
+    # mutating df triggers a copy-on-write for the column / block
+    obj2.iloc[0] = 0
+    assert not np.shares_memory(get_array(obj2, "a"), get_array(obj, "a"))
+    tm.assert_equal(obj, obj_orig)
+
+
+@pytest.mark.parametrize(
+    "obj, kwargs",
+    [(Series([1, 2, 3], name="a"), {}), (DataFrame({"a": [1, 2, 3]}), {"by": "a"})],
+)
+def test_sort_values_inplace(obj, kwargs):
+    obj_orig = obj.copy()
+    view = obj[:]
+    obj.sort_values(inplace=True, **kwargs)
+
+    assert np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
+
+    # mutating obj triggers a copy-on-write for the column / block
+    obj.iloc[0] = 0
+    assert not np.shares_memory(get_array(obj, "a"), get_array(view, "a"))
+    tm.assert_equal(view, obj_orig)
+
+
+@pytest.mark.parametrize("decimals", [-1, 0, 1])
+def test_round(decimals):
+    df = DataFrame({"a": [1, 2], "b": "c"})
+    df_orig = df.copy()
+    df2 = df.round(decimals=decimals)
+
+    assert tm.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    # TODO: Make inplace by using out parameter of ndarray.round?
+    if decimals >= 0 and Version(np.__version__) < Version("2.4.0.dev0"):
+        # Ensure lazy copy if no-op
+        # TODO: Cannot rely on Numpy returning view after version 2.3
+        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    else:
+        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert df2.index is not df.index
+    assert df2.columns is not df.columns
+
+    df2.iloc[0, 1] = "d"
+    df2.iloc[0, 0] = 4
+    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_reorder_levels():
+    index = MultiIndex.from_tuples(
+        [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"]
+    )
+    df = DataFrame({"a": [1, 2, 3, 4]}, index=index)
+    df_orig = df.copy()
+    df2 = df.reorder_levels(order=["two", "one"])
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_series_reorder_levels():
+    index = MultiIndex.from_tuples(
+        [(1, 1), (1, 2), (2, 1), (2, 2)], names=["one", "two"]
+    )
+    ser = Series([1, 2, 3, 4], index=index)
+    ser_orig = ser.copy()
+    ser2 = ser.reorder_levels(order=["two", "one"])
+    assert np.shares_memory(ser2.values, ser.values)
+
+    ser2.iloc[0] = 0
+    assert not np.shares_memory(ser2.values, ser.values)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize("obj", [Series([1, 2, 3]), DataFrame({"a": [1, 2, 3]})])
+def test_swaplevel(obj):
+    index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"])
+    obj.index = index
+    obj_orig = obj.copy()
+    obj2 = obj.swaplevel()
+    assert np.shares_memory(obj2.values, obj.values)
+
+    obj2.iloc[0] = 0
+    assert not np.shares_memory(obj2.values, obj.values)
+    tm.assert_equal(obj, obj_orig)
+
+
+def test_frame_set_axis():
+    # GH 49473
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+    df2 = df.set_axis(["a", "b", "c"], axis="index")
+
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column / block
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_series_set_axis():
+    # GH 49473
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+    ser2 = ser.set_axis(["a", "b", "c"], axis="index")
+    assert np.shares_memory(ser, ser2)
+
+    # mutating ser triggers a copy-on-write for the column / block
+    ser2.iloc[0] = 0
+    assert not np.shares_memory(ser2, ser)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_set_flags():
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+    ser2 = ser.set_flags(allows_duplicate_labels=False)
+
+    assert np.shares_memory(ser, ser2)
+
+    # mutating ser triggers a copy-on-write for the column / block
+    ser2.iloc[0] = 0
+    assert not np.shares_memory(ser2, ser)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize("kwargs", [{"mapper": "test"}, {"index": "test"}])
+def test_rename_axis(kwargs):
+    df = DataFrame({"a": [1, 2, 3, 4]}, index=Index([1, 2, 3, 4], name="a"))
+    df_orig = df.copy()
+    df2 = df.rename_axis(**kwargs)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    df2.iloc[0, 0] = 0
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize(
+    "func, tz", [("tz_convert", "Europe/Berlin"), ("tz_localize", None)]
+)
+def test_tz_convert_localize(func, tz):
+    # GH 49473
+    ser = Series(
+        [1, 2], index=date_range(start="2014-08-01 09:00", freq="h", periods=2, tz=tz)
+    )
+    ser_orig = ser.copy()
+    ser2 = getattr(ser, func)("US/Central")
+    assert np.shares_memory(ser.values, ser2.values)
+
+    # mutating ser triggers a copy-on-write for the column / block
+    ser2.iloc[0] = 0
+    assert not np.shares_memory(ser2.values, ser.values)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_droplevel():
+    # GH 49473
+    index = MultiIndex.from_tuples([(1, 1), (1, 2), (2, 1)], names=["one", "two"])
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, index=index)
+    df_orig = df.copy()
+    df2 = df.droplevel(0)
+
+    assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column / block
+    df2.iloc[0, 0] = 0
+
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
+
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_squeeze():
+    df = DataFrame({"a": [1, 2, 3]})
+    df_orig = df.copy()
+    series = df.squeeze()
+
+    # Should share memory regardless of CoW since squeeze is just an iloc
+    assert np.shares_memory(series.values, get_array(df, "a"))
+
+    # mutating squeezed df triggers a copy-on-write for that column/block
+    series.iloc[0] = 0
+    assert not np.shares_memory(series.values, get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_items():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    df_orig = df.copy()
+
+    # Test this twice, since the second time, the item cache will be
+    # triggered, and we want to make sure it still works then.
+    for i in range(2):
+        for name, ser in df.items():
+            assert np.shares_memory(get_array(ser, name), get_array(df, name))
+
+            # mutating df triggers a copy-on-write for that column / block
+            ser.iloc[0] = 0
+
+            assert not np.shares_memory(get_array(ser, name), get_array(df, name))
+            tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+def test_putmask(dtype):
+    df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
+    view = df[:]
+    df_orig = df.copy()
+    df[df == df] = 5
+
+    assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(view, df_orig)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+def test_putmask_no_reference(dtype):
+    df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
+    arr_a = get_array(df, "a")
+    df[df == df] = 5
+    assert np.shares_memory(arr_a, get_array(df, "a"))
+
+
+@pytest.mark.parametrize("dtype", ["float64", "Float64"])
+def test_putmask_aligns_rhs_no_reference(dtype):
+    df = DataFrame({"a": [1.5, 2], "b": 1.5}, dtype=dtype)
+    arr_a = get_array(df, "a")
+    df[df == df] = DataFrame({"a": [5.5, 5]})
+    assert np.shares_memory(arr_a, get_array(df, "a"))
+
+
+@pytest.mark.parametrize("val, exp, raises", [(5.5, True, True), (5, False, False)])
+def test_putmask_dont_copy_some_blocks(val, exp, raises: bool):
+    df = DataFrame({"a": [1, 2], "b": 1, "c": 1.5})
+    view = df[:]
+    df_orig = df.copy()
+    indexer = DataFrame(
+        [[True, False, False], [True, False, False]], columns=list("abc")
+    )
+    if raises:
+        with pytest.raises(TypeError, match="Invalid value"):
+            df[indexer] = val
+    else:
+        df[indexer] = val
+        assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
+        # TODO(CoW): Could split blocks to avoid copying the whole block
+        assert np.shares_memory(get_array(view, "b"), get_array(df, "b")) is exp
+        assert np.shares_memory(get_array(view, "c"), get_array(df, "c"))
+        assert df._mgr._has_no_reference(1) is not exp
+        assert not df._mgr._has_no_reference(2)
+        tm.assert_frame_equal(view, df_orig)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda ser: ser.where(ser > 0, 10),
+        lambda ser: ser.mask(ser <= 0, 10),
+    ],
+)
+def test_where_mask_noop(dtype, func):
+    ser = Series([1, 2, 3], dtype=dtype)
+    ser_orig = ser.copy()
+
+    result = func(ser)
+    assert np.shares_memory(get_array(ser), get_array(result))
+    assert result.index is not ser.index
+
+    result.iloc[0] = 10
+    assert not np.shares_memory(get_array(ser), get_array(result))
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "Int64"])
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda ser: ser.where(ser < 0, 10),
+        lambda ser: ser.mask(ser >= 0, 10),
+    ],
+)
+def test_where_mask(dtype, func):
+    ser = Series([1, 2, 3], dtype=dtype)
+    ser_orig = ser.copy()
+
+    result = func(ser)
+
+    assert not np.shares_memory(get_array(ser), get_array(result))
+    assert result.index is not ser.index
+    tm.assert_series_equal(ser, ser_orig)
+
+
+@pytest.mark.parametrize("dtype, val", [("int64", 10.5), ("Int64", 10)])
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda df, val: df.where(df < 0, val),
+        lambda df, val: df.mask(df >= 0, val),
+    ],
+)
+def test_where_mask_noop_on_single_column(dtype, val, func):
+    df = DataFrame({"a": [1, 2, 3], "b": [-4, -5, -6]}, dtype=dtype)
+    df_orig = df.copy()
+
+    result = func(df, val)
+    assert np.shares_memory(get_array(df, "b"), get_array(result, "b"))
+    assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+
+    result.iloc[0, 1] = 10
+    assert not np.shares_memory(get_array(df, "b"), get_array(result, "b"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("func", ["mask", "where"])
+def test_chained_where_mask(func):
+    df = DataFrame({"a": [1, 4, 2], "b": 1})
+    df_orig = df.copy()
+    with tm.raises_chained_assignment_error():
+        getattr(df["a"], func)(df["a"] > 2, 5, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+    with tm.raises_chained_assignment_error():
+        getattr(df[["a"]], func)(df["a"] > 2, 5, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_asfreq_noop():
+    df = DataFrame(
+        {"a": [0.0, None, 2.0, 3.0]},
+        index=date_range("1/1/2000", periods=4, freq="min"),
+    )
+    df_orig = df.copy()
+    df2 = df.asfreq(freq="min")
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+    # mutating df2 triggers a copy-on-write for that column / block
+    df2.iloc[0, 0] = 0
+
+    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_iterrows():
+    df = DataFrame({"a": 0, "b": 1}, index=[1, 2, 3])
+    df_orig = df.copy()
+
+    for _, sub in df.iterrows():
+        sub.iloc[0] = 100
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_interpolate_creates_copy():
+    # GH#51126
+    df = DataFrame({"a": [1.5, np.nan, 3]})
+    view = df[:]
+    expected = df.copy()
+
+    df.ffill(inplace=True)
+    df.iloc[0, 0] = 100.5
+    tm.assert_frame_equal(view, expected)
+
+
+def test_isetitem():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+    df_orig = df.copy()
+    df2 = df.copy(deep=False)  # Trigger a CoW
+    df2.isetitem(1, np.array([-1, -2, -3]))  # This is inplace
+    assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "a"] = 0
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+    assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_isetitem_series(dtype):
+    df = DataFrame({"a": [1, 2, 3], "b": np.array([4, 5, 6], dtype=dtype)})
+    ser = Series([7, 8, 9])
+    ser_orig = ser.copy()
+    df.isetitem(0, ser)
+
+    assert np.shares_memory(get_array(df, "a"), get_array(ser))
+    assert not df._mgr._has_no_reference(0)
+
+    # mutating dataframe doesn't update series
+    df.loc[0, "a"] = 0
+    tm.assert_series_equal(ser, ser_orig)
+
+    # mutating series doesn't update dataframe
+    df = DataFrame({"a": [1, 2, 3], "b": np.array([4, 5, 6], dtype=dtype)})
+    ser = Series([7, 8, 9])
+    df.isetitem(0, ser)
+
+    ser.loc[0] = 0
+    expected = DataFrame({"a": [7, 8, 9], "b": np.array([4, 5, 6], dtype=dtype)})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_isetitem_frame():
+    df = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2})
+    rhs = DataFrame({"a": [4, 5, 6], "b": 2})
+    df.isetitem([0, 1], rhs)
+    assert np.shares_memory(get_array(df, "a"), get_array(rhs, "a"))
+    assert np.shares_memory(get_array(df, "b"), get_array(rhs, "b"))
+    assert not df._mgr._has_no_reference(0)
+    expected = df.copy()
+    rhs.iloc[0, 0] = 100
+    rhs.iloc[0, 1] = 100
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("key", ["a", ["a"]])
+def test_get(key):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df_orig = df.copy()
+
+    result = df.get(key)
+
+    assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    result.iloc[0] = 0
+    assert not np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("axis, key", [(0, 0), (1, "a")])
+@pytest.mark.parametrize(
+    "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
+)
+def test_xs(axis, key, dtype):
+    single_block = dtype == "int64"
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
+    )
+    df_orig = df.copy()
+
+    result = df.xs(key, axis=axis)
+
+    if axis == 1 or single_block:
+        assert np.shares_memory(get_array(df, "a"), get_array(result))
+    else:
+        assert result._mgr._has_no_reference(0)
+    if axis == 0:
+        assert result.index is not df.columns
+    else:
+        assert result.index is not df.index
+
+    result.iloc[0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+
+@pytest.mark.parametrize("axis", [0, 1])
+@pytest.mark.parametrize("key, level", [("l1", 0), (2, 1)])
+def test_xs_multiindex(key, level, axis):
+    arr = np.arange(18).reshape(6, 3)
+    index = MultiIndex.from_product([["l1", "l2"], [1, 2, 3]], names=["lev1", "lev2"])
+    df = DataFrame(arr, index=index, columns=list("abc"))
+    if axis == 1:
+        df = df.transpose().copy()
+    df_orig = df.copy()
+
+    result = df.xs(key, level=level, axis=axis)
+
+    if level == 0:
+        assert np.shares_memory(
+            get_array(df, df.columns[0]), get_array(result, result.columns[0])
+        )
+    assert result.index is not df.index
+    assert result.columns is not df.columns
+
+    result.iloc[0, 0] = 0
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_update_frame():
+    df1 = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
+    df2 = DataFrame({"b": [100.0]}, index=[1])
+    df1_orig = df1.copy()
+    view = df1[:]
+    df1.update(df2)
+
+    expected = DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 100.0, 6.0]})
+    tm.assert_frame_equal(df1, expected)
+    # df1 is updated, but its view not
+    tm.assert_frame_equal(view, df1_orig)
+    assert np.shares_memory(get_array(df1, "a"), get_array(view, "a"))
+    assert not np.shares_memory(get_array(df1, "b"), get_array(view, "b"))
+
+
+def test_update_series():
+    ser1 = Series([1.0, 2.0, 3.0])
+    ser2 = Series([100.0], index=[1])
+    ser1_orig = ser1.copy()
+    view = ser1[:]
+
+    ser1.update(ser2)
+
+    expected = Series([1.0, 100.0, 3.0])
+    tm.assert_series_equal(ser1, expected)
+    # ser1 is updated, but its view not
+    tm.assert_series_equal(view, ser1_orig)
+
+
+def test_update_chained_assignment():
+    df = DataFrame({"a": [1, 2, 3]})
+    ser2 = Series([100.0], index=[1])
+    df_orig = df.copy()
+    with tm.raises_chained_assignment_error():
+        df["a"].update(ser2)
+    tm.assert_frame_equal(df, df_orig)
+
+    with tm.raises_chained_assignment_error():
+        df[["a"]].update(ser2.to_frame())
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_inplace_arithmetic_series():
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+    data = get_array(ser)
+    ser *= 2
+    # https://github.com/pandas-dev/pandas/pull/55745
+    # changed to NOT update inplace because there is no benefit (actual
+    # operation already done non-inplace). This was only for the optics
+    # of updating the backing array inplace, but we no longer want to make
+    # that guarantee
+    assert not np.shares_memory(get_array(ser), data)
+    tm.assert_numpy_array_equal(data, get_array(ser_orig))
+
+
+def test_inplace_arithmetic_series_with_reference():
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+    view = ser[:]
+    ser *= 2
+    assert not np.shares_memory(get_array(ser), get_array(view))
+    tm.assert_series_equal(ser_orig, view)
+
+
+def test_transpose():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+    df_orig = df.copy()
+    result = df.transpose()
+    assert np.shares_memory(get_array(df, "a"), get_array(result, 0))
+
+    result.iloc[0, 0] = 100
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_transpose_different_dtypes():
+    df = DataFrame({"a": [1, 2, 3], "b": 1.5})
+    df_orig = df.copy()
+    result = df.T
+
+    assert not np.shares_memory(get_array(df, "a"), get_array(result, 0))
+    result.iloc[0, 0] = 100
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_transpose_ea_single_column():
+    df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    result = df.T
+
+    assert not np.shares_memory(get_array(df, "a"), get_array(result, 0))
+
+
+def test_transform_frame():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+    df_orig = df.copy()
+
+    def func(ser):
+        ser.iloc[0] = 100
+        return ser
+
+    df.transform(func)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_transform_series():
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+
+    def func(ser):
+        ser.iloc[0] = 100
+        return ser
+
+    ser.transform(func)
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_count_read_only_array():
+    df = DataFrame({"a": [1, 2], "b": 3})
+    result = df.count()
+    result.iloc[0] = 100
+    expected = Series([100, 2], index=["a", "b"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_insert_series():
+    df = DataFrame({"a": [1, 2, 3]})
+    ser = Series([1, 2, 3])
+    ser_orig = ser.copy()
+    df.insert(loc=1, value=ser, column="b")
+    assert np.shares_memory(get_array(ser), get_array(df, "b"))
+    assert not df._mgr._has_no_reference(1)
+
+    df.iloc[0, 1] = 100
+    tm.assert_series_equal(ser, ser_orig)
+
+
+def test_eval():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+    df_orig = df.copy()
+
+    result = df.eval("c = a+b")
+    assert np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+
+    result.iloc[0, 0] = 100
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_eval_inplace():
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+    df_orig = df.copy()
+    df_view = df[:]
+
+    df.eval("c = a+b", inplace=True)
+    assert np.shares_memory(get_array(df, "a"), get_array(df_view, "a"))
+
+    df.iloc[0, 0] = 100
+    tm.assert_frame_equal(df_view, df_orig)
+
+
+def test_apply_modify_row():
+    # Case: applying a function on each row as a Series object, where the
+    # function mutates the row object (which needs to trigger CoW if row is a view)
+    df = DataFrame({"A": [1, 2], "B": [3, 4]})
+    df_orig = df.copy()
+
+    def transform(row):
+        row["B"] = 100
+        return row
+
+    df.apply(transform, axis=1)
+
+    tm.assert_frame_equal(df, df_orig)
+
+    # row Series is a copy
+    df = DataFrame({"A": [1, 2], "B": ["b", "c"]})
+    df_orig = df.copy()
+
+    with tm.assert_produces_warning(None):
+        df.apply(transform, axis=1)
+
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_reduce():
+    df = DataFrame({"a": [1, 2, 3], "b": 1.5})
+
+    result = df.sum()
+    assert result.index is not df.columns
+
+    result = df.groupby([0, 0, 1]).sum()
+    assert result.columns is not df.columns
+
+    result = df.quantile(0.5)
+    assert result.index is not df.columns
+    result = df.quantile([0.25, 0.5, 0.75])
+    assert result.columns is not df.columns
+
+
+def test_diff():
+    df = DataFrame({"a": [1, 2, 3], "b": 1.5})
+
+    result = df.diff()
+    assert result.index is not df.index
+    assert result.columns is not df.columns
+
+    ser = Series([1, 2, 3])
+    result = ser.diff()
+    assert result.index is not ser.index
diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4838a5e68ab8328d6263289bc73309424edf458
--- /dev/null
+++ b/pandas/tests/copy_view/test_replace.py
@@ -0,0 +1,356 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    Categorical,
+    DataFrame,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+
+@pytest.mark.parametrize(
+    "replace_kwargs",
+    [
+        {"to_replace": {"a": 1, "b": 4}, "value": -1},
+        # Test CoW splits blocks to avoid copying unchanged columns
+        {"to_replace": {"a": 1}, "value": -1},
+        {"to_replace": {"b": 4}, "value": -1},
+        {"to_replace": {"b": {4: 1}}},
+        # TODO: Add these in a further optimization
+        # We would need to see which columns got replaced in the mask
+        # which could be expensive
+        # {"to_replace": {"b": 1}},
+        # 1
+    ],
+)
+def test_replace(replace_kwargs):
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
+    df_orig = df.copy()
+
+    df_replaced = df.replace(**replace_kwargs)
+
+    if (df_replaced["b"] == df["b"]).all():
+        assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
+    assert tm.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+
+    # mutating squeezed df triggers a copy-on-write for that column/block
+    df_replaced.loc[0, "c"] = -1
+    assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
+
+    if "a" in replace_kwargs["to_replace"]:
+        arr = get_array(df_replaced, "a")
+        df_replaced.loc[0, "a"] = 100
+        assert np.shares_memory(get_array(df_replaced, "a"), arr)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_regex_inplace_refs():
+    df = DataFrame({"a": ["aaa", "bbb"]})
+    df_orig = df.copy()
+    view = df[:]
+    arr = get_array(df, "a")
+    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    assert df._mgr._has_no_reference(0)
+    tm.assert_frame_equal(view, df_orig)
+
+
+def test_replace_regex_inplace():
+    df = DataFrame({"a": ["aaa", "bbb"]})
+    arr = get_array(df, "a")
+    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
+    assert df._mgr._has_no_reference(0)
+    assert tm.shares_memory(arr, get_array(df, "a"))
+
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
+    tm.assert_frame_equal(df_orig, df)
+    assert not tm.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+
+def test_replace_regex_inplace_no_op():
+    df = DataFrame({"a": [1, 2]})
+    arr = get_array(df, "a")
+    df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)
+    assert df._mgr._has_no_reference(0)
+    assert np.shares_memory(arr, get_array(df, "a"))
+
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=r"^x.$", value="new", regex=True)
+    tm.assert_frame_equal(df_orig, df)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+
+
+def test_replace_mask_all_false_second_block():
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
+    df_orig = df.copy()
+
+    df2 = df.replace(to_replace=1.5, value=55.5)
+
+    # TODO: Block splitting would allow us to avoid copying b
+    assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "c"] = 1
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+
+    assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+    assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
+
+
+def test_replace_coerce_single_column():
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
+    df_orig = df.copy()
+
+    df2 = df.replace(to_replace=1.5, value="a")
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "b"] = 0.5
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+    assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+
+
+def test_replace_to_replace_wrong_dtype():
+    df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
+    df_orig = df.copy()
+
+    df2 = df.replace(to_replace="xxx", value=1.5)
+
+    assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+    assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    df2.loc[0, "b"] = 0.5
+    tm.assert_frame_equal(df, df_orig)  # Original is unchanged
+    assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
+
+
+def test_replace_list_categorical():
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
+    arr = get_array(df, "a")
+
+    df.replace(["c"], value="a", inplace=True)
+    assert np.shares_memory(arr.codes, get_array(df, "a").codes)
+    assert df._mgr._has_no_reference(0)
+
+    df_orig = df.copy()
+    df.replace(["b"], value="a")
+    df2 = df.apply(lambda x: x.cat.rename_categories({"b": "d"}))
+    assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
+
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_list_inplace_refs_categorical():
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
+    view = df[:]
+    df_orig = df.copy()
+    df.replace(["c"], value="a", inplace=True)
+    tm.assert_frame_equal(df_orig, view)
+
+
+@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
+def test_replace_inplace(to_replace):
+    df = DataFrame({"a": [1.5, 2, 3]})
+    arr_a = get_array(df, "a")
+    df.replace(to_replace=1.5, value=15.5, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+    assert df._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
+def test_replace_inplace_reference(to_replace):
+    df = DataFrame({"a": [1.5, 2, 3]})
+    arr_a = get_array(df, "a")
+    view = df[:]
+    df.replace(to_replace=to_replace, value=15.5, inplace=True)
+
+    assert not np.shares_memory(get_array(df, "a"), arr_a)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("to_replace", ["a", 100.5])
+def test_replace_inplace_reference_no_op(to_replace):
+    df = DataFrame({"a": [1.5, 2, 3]})
+    arr_a = get_array(df, "a")
+    view = df[:]
+    df.replace(to_replace=to_replace, value=15.5, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+    assert not df._mgr._has_no_reference(0)
+    assert not view._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("to_replace", [1, [1]])
+def test_replace_categorical_inplace_reference(to_replace):
+    df = DataFrame({"a": Categorical([1, 2, 3])})
+    df_orig = df.copy()
+    arr_a = get_array(df, "a")
+    view = df[:]
+    df.replace(to_replace=to_replace, value=1, inplace=True)
+    assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)
+    tm.assert_frame_equal(view, df_orig)
+
+
+def test_replace_categorical_inplace():
+    df = DataFrame({"a": Categorical([1, 2, 3])})
+    arr_a = get_array(df, "a")
+    df.replace(to_replace=1, value=1, inplace=True)
+
+    assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
+    assert df._mgr._has_no_reference(0)
+
+    expected = DataFrame({"a": Categorical([1, 2, 3])})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_replace_categorical():
+    df = DataFrame({"a": Categorical([1, 2, 3])})
+    df_orig = df.copy()
+    df2 = df.replace(to_replace=1, value=1)
+
+    assert df._mgr._has_no_reference(0)
+    assert df2._mgr._has_no_reference(0)
+    assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
+    tm.assert_frame_equal(df, df_orig)
+
+    arr_a = get_array(df2, "a").codes
+    df2.iloc[0, 0] = 2.0
+    assert np.shares_memory(get_array(df2, "a").codes, arr_a)
+
+
+@pytest.mark.parametrize("method", ["where", "mask"])
+def test_masking_inplace(method):
+    df = DataFrame({"a": [1.5, 2, 3]})
+    df_orig = df.copy()
+    arr_a = get_array(df, "a")
+    view = df[:]
+
+    method = getattr(df, method)
+    method(df["a"] > 1.6, -1, inplace=True)
+
+    assert not np.shares_memory(get_array(df, "a"), arr_a)
+    assert df._mgr._has_no_reference(0)
+    assert view._mgr._has_no_reference(0)
+    tm.assert_frame_equal(view, df_orig)
+
+
+def test_replace_empty_list():
+    df = DataFrame({"a": [1, 2]})
+
+    df2 = df.replace([], [])
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    assert not df._mgr._has_no_reference(0)
+    arr_a = get_array(df, "a")
+    df.replace([], [])
+    assert np.shares_memory(get_array(df, "a"), arr_a)
+    assert not df._mgr._has_no_reference(0)
+    assert not df2._mgr._has_no_reference(0)
+
+
+@pytest.mark.parametrize("value", ["d", None])
+def test_replace_object_list_inplace(value):
+    df = DataFrame({"a": ["a", "b", "c"]}, dtype=object)
+    arr = get_array(df, "a")
+    df.replace(["c"], value, inplace=True)
+    assert np.shares_memory(arr, get_array(df, "a"))
+    assert df._mgr._has_no_reference(0)
+
+
+def test_replace_list_multiple_elements_inplace():
+    df = DataFrame({"a": [1, 2, 3]})
+    arr = get_array(df, "a")
+    df.replace([1, 2], 4, inplace=True)
+    assert np.shares_memory(arr, get_array(df, "a"))
+    assert df._mgr._has_no_reference(0)
+
+
+def test_replace_list_none():
+    df = DataFrame({"a": ["a", "b", "c"]})
+
+    df_orig = df.copy()
+    df2 = df.replace(["b"], value=None)
+    tm.assert_frame_equal(df, df_orig)
+
+    assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
+
+    # replace multiple values that don't actually replace anything with None
+    # https://github.com/pandas-dev/pandas/issues/59770
+    df3 = df.replace(["d", "e", "f"], value=None)
+    tm.assert_frame_equal(df3, df_orig)
+    assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a"))
+
+
+def test_replace_list_none_inplace_refs():
+    df = DataFrame({"a": ["a", "b", "c"]})
+    arr = get_array(df, "a")
+    df_orig = df.copy()
+    view = df[:]
+    df.replace(["a"], value=None, inplace=True)
+    assert df._mgr._has_no_reference(0)
+    assert not np.shares_memory(arr, get_array(df, "a"))
+    tm.assert_frame_equal(df_orig, view)
+
+
+def test_replace_columnwise_no_op_inplace():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+    view = df[:]
+    df_orig = df.copy()
+    df.replace({"a": 10}, 100, inplace=True)
+    assert np.shares_memory(get_array(view, "a"), get_array(df, "a"))
+    df.iloc[0, 0] = 100
+    tm.assert_frame_equal(view, df_orig)
+
+
+def test_replace_columnwise_no_op():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+    df_orig = df.copy()
+    df2 = df.replace({"a": 10}, 100)
+    assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
+    df2.iloc[0, 0] = 100
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_chained_assignment():
+    df = DataFrame({"a": [1, np.nan, 2], "b": 1})
+    df_orig = df.copy()
+    with tm.raises_chained_assignment_error():
+        df["a"].replace(1, 100, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+    with tm.raises_chained_assignment_error():
+        df[["a"]].replace(1, 100, inplace=True)
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_listlike():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+    df_orig = df.copy()
+
+    result = df.replace([200, 201], [11, 11])
+    assert np.shares_memory(get_array(result, "a"), get_array(df, "a"))
+
+    result.iloc[0, 0] = 100
+    tm.assert_frame_equal(df, df)
+
+    result = df.replace([200, 2], [10, 10])
+    assert not np.shares_memory(get_array(df, "a"), get_array(result, "a"))
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_replace_listlike_inplace():
+    df = DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]})
+    arr = get_array(df, "a")
+    df.replace([200, 2], [10, 11], inplace=True)
+    assert np.shares_memory(get_array(df, "a"), arr)
+
+    view = df[:]
+    df_orig = df.copy()
+    df.replace([200, 3], [10, 11], inplace=True)
+    assert not np.shares_memory(get_array(df, "a"), arr)
+    tm.assert_frame_equal(view, df_orig)
diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f28e9826c7a1bb2b5379e005f0bd2fd57ef4067
--- /dev/null
+++ b/pandas/tests/copy_view/test_setitem.py
@@ -0,0 +1,142 @@
+import numpy as np
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    RangeIndex,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.copy_view.util import get_array
+
+# -----------------------------------------------------------------------------
+# Copy/view behaviour for the values that are set in a DataFrame
+
+
+def test_set_column_with_array():
+    # Case: setting an array as a new column (df[col] = arr) copies that data
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    arr = np.array([1, 2, 3], dtype="int64")
+
+    df["c"] = arr
+
+    # the array data is copied
+    assert not np.shares_memory(get_array(df, "c"), arr)
+    # and thus modifying the array does not modify the DataFrame
+    arr[0] = 0
+    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
+
+
+def test_set_column_with_series():
+    # Case: setting a series as a new column (df[col] = s) copies that data
+    # (with delayed copy with CoW)
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    ser = Series([1, 2, 3])
+
+    df["c"] = ser
+
+    assert np.shares_memory(get_array(df, "c"), get_array(ser))
+
+    # and modifying the series does not modify the DataFrame
+    ser.iloc[0] = 0
+    assert ser.iloc[0] == 0
+    tm.assert_series_equal(df["c"], Series([1, 2, 3], name="c"))
+
+
+def test_set_column_with_index():
+    # Case: setting an index as a new column (df[col] = idx) copies that data
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    idx = Index([1, 2, 3])
+
+    df["c"] = idx
+
+    # the index data is copied
+    assert not np.shares_memory(get_array(df, "c"), idx.values)
+
+    idx = RangeIndex(1, 4)
+    arr = idx.values
+
+    df["d"] = idx
+
+    assert not np.shares_memory(get_array(df, "d"), arr)
+
+
+def test_set_columns_with_dataframe():
+    # Case: setting a DataFrame as new columns copies that data
+    # (with delayed copy with CoW)
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df2 = DataFrame({"c": [7, 8, 9], "d": [10, 11, 12]})
+
+    df[["c", "d"]] = df2
+
+    assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
+    # and modifying the set DataFrame does not modify the original DataFrame
+    df2.iloc[0, 0] = 0
+    tm.assert_series_equal(df["c"], Series([7, 8, 9], name="c"))
+
+
+def test_setitem_series_no_copy():
+    # Case: setting a Series as column into a DataFrame can delay copying that data
+    df = DataFrame({"a": [1, 2, 3]})
+    rhs = Series([4, 5, 6])
+    rhs_orig = rhs.copy()
+
+    # adding a new column
+    df["b"] = rhs
+    assert np.shares_memory(get_array(rhs), get_array(df, "b"))
+
+    df.iloc[0, 1] = 100
+    tm.assert_series_equal(rhs, rhs_orig)
+
+
+def test_setitem_series_no_copy_single_block():
+    # Overwriting an existing column that is a single block
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
+    rhs = Series([4, 5, 6])
+    rhs_orig = rhs.copy()
+
+    df["a"] = rhs
+    assert np.shares_memory(get_array(rhs), get_array(df, "a"))
+
+    df.iloc[0, 0] = 100
+    tm.assert_series_equal(rhs, rhs_orig)
+
+
+def test_setitem_series_no_copy_split_block():
+    # Overwriting an existing column that is part of a larger block
+    df = DataFrame({"a": [1, 2, 3], "b": 1})
+    rhs = Series([4, 5, 6])
+    rhs_orig = rhs.copy()
+
+    df["b"] = rhs
+    assert np.shares_memory(get_array(rhs), get_array(df, "b"))
+
+    df.iloc[0, 1] = 100
+    tm.assert_series_equal(rhs, rhs_orig)
+
+
+def test_setitem_series_column_midx_broadcasting():
+    # Setting a Series to multiple columns will repeat the data
+    # (currently copying the data eagerly)
+    df = DataFrame(
+        [[1, 2, 3], [3, 4, 5]],
+        columns=MultiIndex.from_arrays([["a", "a", "b"], [1, 2, 3]]),
+    )
+    rhs = Series([10, 11])
+    df["a"] = rhs
+    assert not np.shares_memory(get_array(rhs), df._get_column_array(0))
+    assert df._mgr._has_no_reference(0)
+
+
+def test_set_column_with_inplace_operator():
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+    # this should not raise any warning
+    with tm.assert_produces_warning(None):
+        df["a"] += 1
+
+    # when it is not in a chain, then it should produce a warning
+    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    ser = df["a"]
+    ser += 1
diff --git a/pandas/tests/copy_view/test_util.py b/pandas/tests/copy_view/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff55330d70b28c5459a4c0915dd93c8640a91add
--- /dev/null
+++ b/pandas/tests/copy_view/test_util.py
@@ -0,0 +1,14 @@
+import numpy as np
+
+from pandas import DataFrame
+from pandas.tests.copy_view.util import get_array
+
+
+def test_get_array_numpy():
+    df = DataFrame({"a": [1, 2, 3]})
+    assert np.shares_memory(get_array(df, "a"), get_array(df, "a"))
+
+
+def test_get_array_masked():
+    df = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    assert np.shares_memory(get_array(df, "a"), get_array(df, "a"))
diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py
new file mode 100644
index 0000000000000000000000000000000000000000..969334424936559767b0bca87093acfec52f9763
--- /dev/null
+++ b/pandas/tests/copy_view/util.py
@@ -0,0 +1,30 @@
+from pandas import (
+    Categorical,
+    Index,
+    Series,
+)
+from pandas.core.arrays import BaseMaskedArray
+
+
+def get_array(obj, col=None):
+    """
+    Helper method to get array for a DataFrame column or a Series.
+
+    Equivalent of df[col].values, but without going through normal getitem,
+    which triggers tracking references / CoW (and we might be testing that
+    this is done by some other operation).
+    """
+    if isinstance(obj, Index):
+        arr = obj._values
+    elif isinstance(obj, Series) and (col is None or obj.name == col):
+        arr = obj._values
+    else:
+        assert col is not None
+        icol = obj.columns.get_loc(col)
+        assert isinstance(icol, int)
+        arr = obj._get_column_array(icol)
+    if isinstance(arr, BaseMaskedArray):
+        return arr._data
+    elif isinstance(arr, Categorical):
+        return arr
+    return getattr(arr, "_ndarray", arr)
diff --git a/pandas/tests/dtypes/__init__.py b/pandas/tests/dtypes/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d8195321140a0bc9e43473b561c5f2d09a1973
--- /dev/null
+++ b/pandas/tests/dtypes/test_common.py
@@ -0,0 +1,882 @@
+from __future__ import annotations
+
+import numpy as np
+import pytest
+
+from pandas.compat import HAS_PYARROW
+from pandas.errors import Pandas4Warning
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.astype import astype_array
+import pandas.core.dtypes.common as com
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    CategoricalDtypeType,
+    DatetimeTZDtype,
+    ExtensionDtype,
+    IntervalDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.missing import isna
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import pandas_dtype
+from pandas.arrays import SparseArray
+from pandas.util.version import Version
+
+
+# EA & Actual Dtypes
+def to_ea_dtypes(dtypes):
+    """convert list of string dtypes to EA dtype"""
+    return [getattr(pd, dt + "Dtype") for dt in dtypes]
+
+
+def to_numpy_dtypes(dtypes):
+    """convert list of string dtypes to numpy dtype"""
+    return [getattr(np, dt) for dt in dtypes if isinstance(dt, str)]
+
+
+class TestNumpyEADtype:
+    # Passing invalid dtype, both as a string or object, must raise TypeError
+    # Per issue GH15520
+    @pytest.mark.parametrize("box", [pd.Timestamp, "pd.Timestamp", list])
+    def test_invalid_dtype_error(self, box):
+        with pytest.raises(TypeError, match="not understood"):
+            com.pandas_dtype(box)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            object,
+            "float64",
+            np.object_,
+            np.dtype("object"),
+            "O",
+            np.float64,
+            float,
+            np.dtype("float64"),
+            "object_",
+        ],
+    )
+    def test_pandas_dtype_valid(self, dtype):
+        assert com.pandas_dtype(dtype) == dtype
+
+    @pytest.mark.parametrize(
+        "dtype", ["M8[ns]", "m8[ns]", "object", "float64", "int64"]
+    )
+    def test_numpy_dtype(self, dtype):
+        assert com.pandas_dtype(dtype) == np.dtype(dtype)
+
+    def test_numpy_string_dtype(self):
+        # do not parse freq-like string as period dtype
+        assert com.pandas_dtype("U") == np.dtype("U")
+        assert com.pandas_dtype("S") == np.dtype("S")
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "datetime64[ns, US/Eastern]",
+            "datetime64[ns, Asia/Tokyo]",
+            "datetime64[ns, UTC]",
+            # GH#33885 check that the M8 alias is understood
+            "M8[ns, US/Eastern]",
+            "M8[ns, Asia/Tokyo]",
+            "M8[ns, UTC]",
+        ],
+    )
+    def test_datetimetz_dtype(self, dtype):
+        assert com.pandas_dtype(dtype) == DatetimeTZDtype.construct_from_string(dtype)
+        assert com.pandas_dtype(dtype) == dtype
+
+    def test_categorical_dtype(self):
+        assert com.pandas_dtype("category") == CategoricalDtype()
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "period[D]",
+            "period[3M]",
+            "period[us]",
+            "Period[D]",
+            "Period[3M]",
+            "Period[us]",
+        ],
+    )
+    def test_period_dtype(self, dtype):
+        assert com.pandas_dtype(dtype) is not PeriodDtype(dtype)
+        assert com.pandas_dtype(dtype) == PeriodDtype(dtype)
+        assert com.pandas_dtype(dtype) == dtype
+
+
+dtypes = {
+    "datetime_tz": com.pandas_dtype("datetime64[ns, US/Eastern]"),
+    "datetime": com.pandas_dtype("datetime64[ns]"),
+    "timedelta": com.pandas_dtype("timedelta64[ns]"),
+    "period": PeriodDtype("D"),
+    "integer": np.dtype(np.int64),
+    "float": np.dtype(np.float64),
+    "object": np.dtype(object),
+    "category": com.pandas_dtype("category"),
+    "string": pd.StringDtype("python"),
+}
+
+
+@pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x))
+@pytest.mark.parametrize("name2,dtype2", list(dtypes.items()), ids=lambda x: str(x))
+def test_dtype_equal(name1, dtype1, name2, dtype2):
+    # match equal to self, but not equal to other
+    assert com.is_dtype_equal(dtype1, dtype1)
+    if name1 != name2:
+        assert not com.is_dtype_equal(dtype1, dtype2)
+
+
+@pytest.mark.parametrize("name,dtype", list(dtypes.items()), ids=lambda x: str(x))
+def test_pyarrow_string_import_error(name, dtype):
+    # GH-44276
+    assert not com.is_dtype_equal(dtype, "string[pyarrow]")
+
+
+@pytest.mark.parametrize(
+    "dtype1,dtype2",
+    [
+        (np.int8, np.int64),
+        (np.int16, np.int64),
+        (np.int32, np.int64),
+        (np.float32, np.float64),
+        (PeriodDtype("D"), PeriodDtype("2D")),  # PeriodType
+        (
+            com.pandas_dtype("datetime64[ns, US/Eastern]"),
+            com.pandas_dtype("datetime64[ns, CET]"),
+        ),  # Datetime
+        (None, None),  # gh-15941: no exception should be raised.
+    ],
+)
+def test_dtype_equal_strict(dtype1, dtype2):
+    assert not com.is_dtype_equal(dtype1, dtype2)
+
+
+def get_is_dtype_funcs():
+    """
+    Get all functions in pandas.core.dtypes.common that
+    begin with 'is_' and end with 'dtype'
+
+    """
+    fnames = [f for f in dir(com) if (f.startswith("is_") and f.endswith("dtype"))]
+    fnames.remove("is_string_or_object_np_dtype")  # fastpath requires np.dtype obj
+    return [getattr(com, fname) for fname in fnames]
+
+
+@pytest.mark.filterwarnings(
+    "ignore:is_categorical_dtype is deprecated:DeprecationWarning"
+)
+@pytest.mark.parametrize("func", get_is_dtype_funcs(), ids=lambda x: x.__name__)
+def test_get_dtype_error_catch(func):
+    # see gh-15941
+    #
+    # No exception should be raised.
+
+    msg = f"{func.__name__} is deprecated"
+    warn = None
+    if (
+        func is com.is_int64_dtype
+        or func is com.is_interval_dtype
+        or func is com.is_datetime64tz_dtype
+        or func is com.is_categorical_dtype
+        or func is com.is_period_dtype
+    ):
+        warn = Pandas4Warning
+
+    with tm.assert_produces_warning(warn, match=msg):
+        assert not func(None)
+
+
+def test_is_object():
+    assert com.is_object_dtype(object)
+    assert com.is_object_dtype(np.array([], dtype=object))
+
+    assert not com.is_object_dtype(int)
+    assert not com.is_object_dtype(np.array([], dtype=int))
+    assert not com.is_object_dtype([1, 2, 3])
+
+
+@pytest.mark.parametrize(
+    "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))]
+)
+def test_is_sparse(check_scipy):
+    msg = "is_sparse is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        assert com.is_sparse(SparseArray([1, 2, 3]))
+
+        assert not com.is_sparse(np.array([1, 2, 3]))
+
+        if check_scipy:
+            import scipy.sparse
+
+            assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3]))
+
+
+def test_is_scipy_sparse():
+    sp_sparse = pytest.importorskip("scipy.sparse")
+
+    assert com.is_scipy_sparse(sp_sparse.bsr_matrix([1, 2, 3]))
+
+    assert not com.is_scipy_sparse(SparseArray([1, 2, 3]))
+
+
+def test_is_datetime64_dtype():
+    assert not com.is_datetime64_dtype(object)
+    assert not com.is_datetime64_dtype([1, 2, 3])
+    assert not com.is_datetime64_dtype(np.array([], dtype=int))
+
+    assert com.is_datetime64_dtype(np.datetime64)
+    assert com.is_datetime64_dtype(np.array([], dtype=np.datetime64))
+
+
+def test_is_datetime64tz_dtype():
+    msg = "is_datetime64tz_dtype is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        assert not com.is_datetime64tz_dtype(object)
+        assert not com.is_datetime64tz_dtype([1, 2, 3])
+        assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3]))
+        assert com.is_datetime64tz_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern"))
+
+
+def test_custom_ea_kind_M_not_datetime64tz():
+    # GH 34986
+    class NotTZDtype(ExtensionDtype):
+        @property
+        def kind(self) -> str:
+            return "M"
+
+    not_tz_dtype = NotTZDtype()
+    msg = "is_datetime64tz_dtype is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        assert not com.is_datetime64tz_dtype(not_tz_dtype)
+        assert not com.needs_i8_conversion(not_tz_dtype)
+
+
+def test_is_timedelta64_dtype():
+    assert not com.is_timedelta64_dtype(object)
+    assert not com.is_timedelta64_dtype(None)
+    assert not com.is_timedelta64_dtype([1, 2, 3])
+    assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64))
+    assert not com.is_timedelta64_dtype("0 days")
+    assert not com.is_timedelta64_dtype("0 days 00:00:00")
+    assert not com.is_timedelta64_dtype(["0 days 00:00:00"])
+    assert not com.is_timedelta64_dtype("NO DATE")
+
+    assert com.is_timedelta64_dtype(np.timedelta64)
+    assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]"))
+    assert com.is_timedelta64_dtype(pd.to_timedelta(["0 days", "1 days"]))
+
+
+def test_is_period_dtype():
+    msg = "is_period_dtype is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        assert not com.is_period_dtype(object)
+        assert not com.is_period_dtype([1, 2, 3])
+        assert not com.is_period_dtype(pd.Period("2017-01-01"))
+
+        assert com.is_period_dtype(PeriodDtype(freq="D"))
+        assert com.is_period_dtype(pd.PeriodIndex([], freq="Y"))
+
+
+def test_is_interval_dtype():
+    msg = "is_interval_dtype is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        assert not com.is_interval_dtype(object)
+        assert not com.is_interval_dtype([1, 2, 3])
+
+        assert com.is_interval_dtype(IntervalDtype())
+
+        interval = pd.Interval(1, 2, closed="right")
+        assert not com.is_interval_dtype(interval)
+        assert com.is_interval_dtype(pd.IntervalIndex([interval]))
+
+
+def test_is_categorical_dtype():
+    msg = "is_categorical_dtype is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        assert not com.is_categorical_dtype(object)
+        assert not com.is_categorical_dtype([1, 2, 3])
+
+        assert com.is_categorical_dtype(CategoricalDtype())
+        assert com.is_categorical_dtype(pd.Categorical([1, 2, 3]))
+        assert com.is_categorical_dtype(pd.CategoricalIndex([1, 2, 3]))
+
+
+@pytest.mark.parametrize(
+    "dtype, expected",
+    [
+        (int, False),
+        (pd.Series([1, 2]), False),
+        (str, True),
+        (object, True),
+        (np.array(["a", "b"]), True),
+        (pd.StringDtype(), True),
+        (pd.Index([], dtype="O"), True),
+    ],
+)
+def test_is_string_dtype(dtype, expected):
+    # GH#54661
+
+    result = com.is_string_dtype(dtype)
+    assert result is expected
+
+
+@pytest.mark.parametrize(
+    "data",
+    [[(0, 1), (1, 1)], pd.Categorical([1, 2, 3]), np.array([1, 2], dtype=object)],
+)
+def test_is_string_dtype_arraylike_with_object_elements_not_strings(data):
+    # GH 15585
+    assert not com.is_string_dtype(pd.Series(data))
+
+
+def test_is_string_dtype_nullable(nullable_string_dtype):
+    assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))
+
+
+integer_dtypes: list = []
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        *integer_dtypes,
+        pd.Series([1, 2]),
+        *tm.ALL_INT_NUMPY_DTYPES,
+        *to_numpy_dtypes(tm.ALL_INT_NUMPY_DTYPES),
+        *tm.ALL_INT_EA_DTYPES,
+        *to_ea_dtypes(tm.ALL_INT_EA_DTYPES),
+    ],
+)
+def test_is_integer_dtype(dtype):
+    assert com.is_integer_dtype(dtype)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        str,
+        float,
+        np.datetime64,
+        np.timedelta64,
+        pd.Index([1, 2.0]),
+        np.array(["a", "b"]),
+        np.array([], dtype=np.timedelta64),
+    ],
+)
+def test_is_not_integer_dtype(dtype):
+    assert not com.is_integer_dtype(dtype)
+
+
+signed_integer_dtypes: list = []
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        *signed_integer_dtypes,
+        pd.Series([1, 2]),
+        *tm.SIGNED_INT_NUMPY_DTYPES,
+        *to_numpy_dtypes(tm.SIGNED_INT_NUMPY_DTYPES),
+        *tm.SIGNED_INT_EA_DTYPES,
+        *to_ea_dtypes(tm.SIGNED_INT_EA_DTYPES),
+    ],
+)
+def test_is_signed_integer_dtype(dtype):
+    assert com.is_integer_dtype(dtype)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        str,
+        float,
+        np.datetime64,
+        np.timedelta64,
+        pd.Index([1, 2.0]),
+        np.array(["a", "b"]),
+        np.array([], dtype=np.timedelta64),
+        *tm.UNSIGNED_INT_NUMPY_DTYPES,
+        *to_numpy_dtypes(tm.UNSIGNED_INT_NUMPY_DTYPES),
+        *tm.UNSIGNED_INT_EA_DTYPES,
+        *to_ea_dtypes(tm.UNSIGNED_INT_EA_DTYPES),
+    ],
+)
+def test_is_not_signed_integer_dtype(dtype):
+    assert not com.is_signed_integer_dtype(dtype)
+
+
+unsigned_integer_dtypes: list = []
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        *unsigned_integer_dtypes,
+        pd.Series([1, 2], dtype=np.uint32),
+        *tm.UNSIGNED_INT_NUMPY_DTYPES,
+        *to_numpy_dtypes(tm.UNSIGNED_INT_NUMPY_DTYPES),
+        *tm.UNSIGNED_INT_EA_DTYPES,
+        *to_ea_dtypes(tm.UNSIGNED_INT_EA_DTYPES),
+    ],
+)
+def test_is_unsigned_integer_dtype(dtype):
+    assert com.is_unsigned_integer_dtype(dtype)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        str,
+        float,
+        np.datetime64,
+        np.timedelta64,
+        pd.Index([1, 2.0]),
+        np.array(["a", "b"]),
+        np.array([], dtype=np.timedelta64),
+        *tm.SIGNED_INT_NUMPY_DTYPES,
+        *to_numpy_dtypes(tm.SIGNED_INT_NUMPY_DTYPES),
+        *tm.SIGNED_INT_EA_DTYPES,
+        *to_ea_dtypes(tm.SIGNED_INT_EA_DTYPES),
+    ],
+)
+def test_is_not_unsigned_integer_dtype(dtype):
+    assert not com.is_unsigned_integer_dtype(dtype)
+
+
+@pytest.mark.parametrize(
+    "dtype", [np.int64, np.array([1, 2], dtype=np.int64), "Int64", pd.Int64Dtype]
+)
+def test_is_int64_dtype(dtype):
+    msg = "is_int64_dtype is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        assert com.is_int64_dtype(dtype)
+
+
+def test_type_comparison_with_numeric_ea_dtype(any_numeric_ea_dtype):
+    # GH#43038
+    assert pandas_dtype(any_numeric_ea_dtype) == any_numeric_ea_dtype
+
+
+def test_type_comparison_with_real_numpy_dtype(any_real_numpy_dtype):
+    # GH#43038
+    assert pandas_dtype(any_real_numpy_dtype) == any_real_numpy_dtype
+
+
+def test_type_comparison_with_signed_int_ea_dtype_and_signed_int_numpy_dtype(
+    any_signed_int_ea_dtype, any_signed_int_numpy_dtype
+):
+    # GH#43038
+    assert not pandas_dtype(any_signed_int_ea_dtype) == any_signed_int_numpy_dtype
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        str,
+        float,
+        np.int32,
+        np.uint64,
+        pd.Index([1, 2.0]),
+        np.array(["a", "b"]),
+        np.array([1, 2], dtype=np.uint32),
+        "int8",
+        "Int8",
+        pd.Int8Dtype,
+    ],
+)
+def test_is_not_int64_dtype(dtype):
+    msg = "is_int64_dtype is deprecated"
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        assert not com.is_int64_dtype(dtype)
+
+
+def test_is_datetime64_any_dtype():
+    assert not com.is_datetime64_any_dtype(int)
+    assert not com.is_datetime64_any_dtype(str)
+    assert not com.is_datetime64_any_dtype(np.array([1, 2]))
+    assert not com.is_datetime64_any_dtype(np.array(["a", "b"]))
+
+    assert com.is_datetime64_any_dtype(np.datetime64)
+    assert com.is_datetime64_any_dtype(np.array([], dtype=np.datetime64))
+    assert com.is_datetime64_any_dtype(DatetimeTZDtype("ns", "US/Eastern"))
+    assert com.is_datetime64_any_dtype(
+        pd.DatetimeIndex([1, 2, 3], dtype="datetime64[ns]")
+    )
+
+
+def test_is_datetime64_ns_dtype():
+    assert not com.is_datetime64_ns_dtype(int)
+    assert not com.is_datetime64_ns_dtype(str)
+    assert not com.is_datetime64_ns_dtype(np.datetime64)
+    assert not com.is_datetime64_ns_dtype(np.array([1, 2]))
+    assert not com.is_datetime64_ns_dtype(np.array(["a", "b"]))
+    assert not com.is_datetime64_ns_dtype(np.array([], dtype=np.datetime64))
+
+    # This datetime array has the wrong unit (ps instead of ns)
+    assert not com.is_datetime64_ns_dtype(np.array([], dtype="datetime64[ps]"))
+
+    assert com.is_datetime64_ns_dtype(DatetimeTZDtype("ns", "US/Eastern"))
+    assert com.is_datetime64_ns_dtype(
+        pd.DatetimeIndex([1, 2, 3], dtype=np.dtype("datetime64[ns]"))
+    )
+
+    # non-nano dt64tz
+    assert not com.is_datetime64_ns_dtype(DatetimeTZDtype("us", "US/Eastern"))
+
+
+def test_is_timedelta64_ns_dtype():
+    assert not com.is_timedelta64_ns_dtype(np.dtype("m8[ps]"))
+    assert not com.is_timedelta64_ns_dtype(np.array([1, 2], dtype=np.timedelta64))
+
+    assert com.is_timedelta64_ns_dtype(np.dtype("m8[ns]"))
+    assert com.is_timedelta64_ns_dtype(np.array([1, 2], dtype="m8[ns]"))
+
+
+def test_is_numeric_v_string_like():
+    assert not com.is_numeric_v_string_like(np.array([1]), 1)
+    assert not com.is_numeric_v_string_like(np.array([1]), np.array([2]))
+    assert not com.is_numeric_v_string_like(np.array(["foo"]), np.array(["foo"]))
+
+    assert com.is_numeric_v_string_like(np.array([1]), "foo")
+    assert com.is_numeric_v_string_like(np.array([1, 2]), np.array(["foo"]))
+    assert com.is_numeric_v_string_like(np.array(["foo"]), np.array([1, 2]))
+
+
+def test_needs_i8_conversion():
+    assert not com.needs_i8_conversion(str)
+    assert not com.needs_i8_conversion(np.int64)
+    assert not com.needs_i8_conversion(pd.Series([1, 2]))
+    assert not com.needs_i8_conversion(np.array(["a", "b"]))
+
+    assert not com.needs_i8_conversion(np.datetime64)
+    assert com.needs_i8_conversion(np.dtype(np.datetime64))
+    assert not com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]"))
+    assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]").dtype)
+    assert not com.needs_i8_conversion(pd.DatetimeIndex(["2000"], tz="US/Eastern"))
+    assert com.needs_i8_conversion(pd.DatetimeIndex(["2000"], tz="US/Eastern").dtype)
+
+
+def test_is_numeric_dtype():
+    assert not com.is_numeric_dtype(str)
+    assert not com.is_numeric_dtype(np.datetime64)
+    assert not com.is_numeric_dtype(np.timedelta64)
+    assert not com.is_numeric_dtype(np.array(["a", "b"]))
+    assert not com.is_numeric_dtype(np.array([], dtype=np.timedelta64))
+
+    assert com.is_numeric_dtype(int)
+    assert com.is_numeric_dtype(float)
+    assert com.is_numeric_dtype(np.uint64)
+    assert com.is_numeric_dtype(pd.Series([1, 2]))
+    assert com.is_numeric_dtype(pd.Index([1, 2.0]))
+
+    class MyNumericDType(ExtensionDtype):
+        @property
+        def type(self):
+            return str
+
+        @property
+        def name(self):
+            raise NotImplementedError
+
+        def construct_array_type(self):
+            raise NotImplementedError
+
+        def _is_numeric(self) -> bool:
+            return True
+
+    assert com.is_numeric_dtype(MyNumericDType())
+
+
+def test_is_any_real_numeric_dtype():
+    assert not com.is_any_real_numeric_dtype(str)
+    assert not com.is_any_real_numeric_dtype(bool)
+    assert not com.is_any_real_numeric_dtype(complex)
+    assert not com.is_any_real_numeric_dtype(object)
+    assert not com.is_any_real_numeric_dtype(np.datetime64)
+    assert not com.is_any_real_numeric_dtype(np.array(["a", "b", complex(1, 2)]))
+    assert not com.is_any_real_numeric_dtype(pd.DataFrame([complex(1, 2), True]))
+
+    assert com.is_any_real_numeric_dtype(int)
+    assert com.is_any_real_numeric_dtype(float)
+    assert com.is_any_real_numeric_dtype(np.array([1, 2.5]))
+
+
+def test_is_float_dtype():
+    assert not com.is_float_dtype(str)
+    assert not com.is_float_dtype(int)
+    assert not com.is_float_dtype(pd.Series([1, 2]))
+    assert not com.is_float_dtype(np.array(["a", "b"]))
+
+    assert com.is_float_dtype(float)
+    assert com.is_float_dtype(pd.Index([1, 2.0]))
+
+
+def test_is_bool_dtype():
+    assert not com.is_bool_dtype(int)
+    assert not com.is_bool_dtype(str)
+    assert not com.is_bool_dtype(pd.Series([1, 2]))
+    assert not com.is_bool_dtype(pd.Series(["a", "b"], dtype="category"))
+    assert not com.is_bool_dtype(np.array(["a", "b"]))
+    assert not com.is_bool_dtype(pd.Index(["a", "b"]))
+    assert not com.is_bool_dtype("Int64")
+
+    assert com.is_bool_dtype(bool)
+    assert com.is_bool_dtype(np.bool_)
+    assert com.is_bool_dtype(pd.Series([True, False], dtype="category"))
+    assert com.is_bool_dtype(np.array([True, False]))
+    assert com.is_bool_dtype(pd.Index([True, False]))
+
+    assert com.is_bool_dtype(pd.BooleanDtype())
+    assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean"))
+    assert com.is_bool_dtype("boolean")
+
+
+def test_is_bool_dtype_numpy_error():
+    # GH39010
+    assert not com.is_bool_dtype("0 - Name")
+
+
+@pytest.mark.parametrize(
+    "check_scipy", [False, pytest.param(True, marks=td.skip_if_no("scipy"))]
+)
+def test_is_extension_array_dtype(check_scipy):
+    assert not com.is_extension_array_dtype([1, 2, 3])
+    assert not com.is_extension_array_dtype(np.array([1, 2, 3]))
+    assert not com.is_extension_array_dtype(pd.DatetimeIndex([1, 2, 3]))
+
+    cat = pd.Categorical([1, 2, 3])
+    assert com.is_extension_array_dtype(cat)
+    assert com.is_extension_array_dtype(pd.Series(cat))
+    assert com.is_extension_array_dtype(SparseArray([1, 2, 3]))
+    assert com.is_extension_array_dtype(pd.DatetimeIndex(["2000"], tz="US/Eastern"))
+
+    dtype = DatetimeTZDtype("ns", tz="US/Eastern")
+    s = pd.Series([], dtype=dtype)
+    assert com.is_extension_array_dtype(s)
+
+    if check_scipy:
+        import scipy.sparse
+
+        assert not com.is_extension_array_dtype(scipy.sparse.bsr_matrix([1, 2, 3]))
+
+
+def test_is_complex_dtype():
+    assert not com.is_complex_dtype(int)
+    assert not com.is_complex_dtype(str)
+    assert not com.is_complex_dtype(pd.Series([1, 2]))
+    assert not com.is_complex_dtype(np.array(["a", "b"]))
+
+    assert com.is_complex_dtype(np.complex128)
+    assert com.is_complex_dtype(complex)
+    assert com.is_complex_dtype(np.array([1 + 1j, 5]))
+
+
+@pytest.mark.parametrize(
+    "input_param,result",
+    [
+        (int, np.dtype(int)),
+        ("int32", np.dtype("int32")),
+        (float, np.dtype(float)),
+        ("float64", np.dtype("float64")),
+        (np.dtype("float64"), np.dtype("float64")),
+        (str, np.dtype(str)),
+        (pd.Series([1, 2], dtype=np.dtype("int16")), np.dtype("int16")),
+        (pd.Series(["a", "b"], dtype=object), np.dtype(object)),
+        (pd.Index([1, 2]), np.dtype("int64")),
+        (pd.Index(["a", "b"], dtype=object), np.dtype(object)),
+        ("category", "category"),
+        (pd.Categorical(["a", "b"]).dtype, CategoricalDtype(["a", "b"])),
+        (pd.Categorical(["a", "b"]), CategoricalDtype(["a", "b"])),
+        (pd.CategoricalIndex(["a", "b"]).dtype, CategoricalDtype(["a", "b"])),
+        (pd.CategoricalIndex(["a", "b"]), CategoricalDtype(["a", "b"])),
+        (CategoricalDtype(), CategoricalDtype()),
+        (pd.DatetimeIndex([1, 2]), np.dtype("=M8[ns]")),
+        (pd.DatetimeIndex([1, 2]).dtype, np.dtype("=M8[ns]")),
+        ("<M8[ns]", np.dtype("<M8[ns]")),
+        ("datetime64[ns, Europe/London]", DatetimeTZDtype("ns", "Europe/London")),
+        (PeriodDtype(freq="D"), PeriodDtype(freq="D")),
+        ("period[D]", PeriodDtype(freq="D")),
+        (IntervalDtype(), IntervalDtype()),
+    ],
+)
+def test_get_dtype(input_param, result):
+    assert com._get_dtype(input_param) == result
+
+
+@pytest.mark.parametrize(
+    "input_param,expected_error_message",
+    [
+        (None, "Cannot deduce dtype from null object"),
+        (1, "data type not understood"),
+        (1.2, "data type not understood"),
+        # numpy dev changed from double-quotes to single quotes
+        ("random string", "data type [\"']random string[\"'] not understood"),
+        (pd.DataFrame([1, 2]), "data type not understood"),
+        (
+            np.typing.NDArray[np.float32],
+            "data type not understood|Cannot interpret.*numpy.*as a data type",
+        ),
+    ],
+)
+def test_get_dtype_fails(input_param, expected_error_message):
+    # python objects
+    # 2020-02-02 npdev changed error message
+    expected_error_message += f"|Cannot interpret '{input_param}' as a data type"
+    with pytest.raises(TypeError, match=expected_error_message):
+        com._get_dtype(input_param)
+
+
+@pytest.mark.parametrize(
+    "input_param,result",
+    [
+        (int, np.dtype(int).type),
+        ("int32", np.int32),
+        (float, np.dtype(float).type),
+        ("float64", np.float64),
+        (np.dtype("float64"), np.float64),
+        (str, np.dtype(str).type),
+        (pd.Series([1, 2], dtype=np.dtype("int16")), np.int16),
+        (pd.Series(["a", "b"], dtype=object), np.object_),
+        (pd.Index([1, 2], dtype="int64"), np.int64),
+        (pd.Index(["a", "b"], dtype=object), np.object_),
+        ("category", CategoricalDtypeType),
+        (pd.Categorical(["a", "b"]).dtype, CategoricalDtypeType),
+        (pd.Categorical(["a", "b"]), CategoricalDtypeType),
+        (pd.CategoricalIndex(["a", "b"]).dtype, CategoricalDtypeType),
+        (pd.CategoricalIndex(["a", "b"]), CategoricalDtypeType),
+        (pd.DatetimeIndex([1, 2]), np.datetime64),
+        (pd.DatetimeIndex([1, 2]).dtype, np.datetime64),
+        ("<M8[ns]", np.datetime64),
+        (pd.DatetimeIndex(["2000"], tz="Europe/London"), pd.Timestamp),
+        (pd.DatetimeIndex(["2000"], tz="Europe/London").dtype, pd.Timestamp),
+        ("datetime64[ns, Europe/London]", pd.Timestamp),
+        (PeriodDtype(freq="D"), pd.Period),
+        ("period[D]", pd.Period),
+        (IntervalDtype(), pd.Interval),
+        (None, type(None)),
+        (1, type(None)),
+        (1.2, type(None)),
+        (pd.DataFrame([1, 2]), type(None)),  # composite dtype
+    ],
+)
+def test__is_dtype_type(input_param, result):
+    assert com._is_dtype_type(input_param, lambda tipo: tipo == result)
+
+
+def test_astype_nansafe_copy_false(any_int_numpy_dtype):
+    # GH#34457 use astype, not view
+    arr = np.array([1, 2, 3], dtype=any_int_numpy_dtype)
+
+    dtype = np.dtype("float64")
+    result = astype_array(arr, dtype, copy=False)
+
+    expected = np.array([1.0, 2.0, 3.0], dtype=dtype)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("from_type", [np.datetime64, np.timedelta64])
+def test_astype_object_preserves_datetime_na(from_type):
+    arr = np.array([from_type("NaT", "ns")])
+    result = astype_array(arr, dtype=np.dtype("object"))
+
+    assert isna(result)[0]
+
+
+def test_validate_allhashable():
+    assert com.validate_all_hashable(1, "a") is None
+
+    with pytest.raises(TypeError, match="All elements must be hashable"):
+        com.validate_all_hashable([])
+
+    with pytest.raises(TypeError, match="list must be a hashable type"):
+        com.validate_all_hashable([], error_name="list")
+
+
+def test_pandas_dtype_numpy_warning():
+    # GH#51523
+    if Version(np.__version__) < Version("2.3.0.dev0"):
+        ctx = tm.assert_produces_warning(
+            DeprecationWarning,
+            check_stacklevel=False,
+            match=(
+                "Converting `np.integer` or `np.signedinteger` to a dtype is deprecated"
+            ),
+        )
+    else:
+        ctx = tm.external_error_raised(TypeError)
+
+    with ctx:
+        pandas_dtype(np.integer)
+
+
+def test_pandas_dtype_ea_not_instance():
+    # GH 31356 GH 54592
+    with tm.assert_produces_warning(UserWarning, match="without any arguments"):
+        assert pandas_dtype(CategoricalDtype) == CategoricalDtype()
+
+
+def test_pandas_dtype_string_dtypes(string_storage):
+    with pd.option_context("future.infer_string", True):
+        # with the default string_storage setting
+        result = pandas_dtype("str")
+    assert result == pd.StringDtype(
+        "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+    )
+
+    with pd.option_context("future.infer_string", True):
+        # with the default string_storage setting
+        result = pandas_dtype(str)
+    assert result == pd.StringDtype(
+        "pyarrow" if HAS_PYARROW else "python", na_value=np.nan
+    )
+
+    with pd.option_context("future.infer_string", True):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype("str")
+    assert result == pd.StringDtype(string_storage, na_value=np.nan)
+
+    with pd.option_context("future.infer_string", True):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype(str)
+    assert result == pd.StringDtype(string_storage, na_value=np.nan)
+
+    with pd.option_context("future.infer_string", False):
+        with pd.option_context("string_storage", string_storage):
+            result = pandas_dtype("str")
+    assert result == np.dtype("U")
+
+    with pd.option_context("string_storage", string_storage):
+        result = pandas_dtype("string")
+    assert result == pd.StringDtype(string_storage, na_value=pd.NA)
+
+
+def test_pandas_dtype_string_dtype_alias_with_storage():
+    with pytest.raises(TypeError, match="not understood"):
+        pandas_dtype("str[python]")
+
+    with pytest.raises(TypeError, match="not understood"):
+        pandas_dtype("str[pyarrow]")
+
+    result = pandas_dtype("string[python]")
+    assert result == pd.StringDtype("python", na_value=pd.NA)
+
+    if HAS_PYARROW:
+        result = pandas_dtype("string[pyarrow]")
+        assert result == pd.StringDtype("pyarrow", na_value=pd.NA)
+    else:
+        with pytest.raises(
+            ImportError, match="required for PyArrow backed StringArray"
+        ):
+            pandas_dtype("string[pyarrow]")
+
+
+@td.skip_if_installed("pyarrow")
+def test_construct_from_string_without_pyarrow_installed():
+    # GH 57928
+    with pytest.raises(ImportError, match="pyarrow>=.* is required"):
+        pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]")
diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..571e12d0c3303aab0e97c5909664e5965a1c90ee
--- /dev/null
+++ b/pandas/tests/dtypes/test_concat.py
@@ -0,0 +1,66 @@
+import pytest
+
+import pandas.core.dtypes.concat as _concat
+
+import pandas as pd
+from pandas import Series
+import pandas._testing as tm
+
+
+def test_concat_mismatched_categoricals_with_empty():
+    # concat_compat behavior on series._values should match pd.concat on series
+    ser1 = Series(["a", "b", "c"], dtype="category")
+    ser2 = Series([], dtype="category")
+
+    result = _concat.concat_compat([ser1._values, ser2._values])
+    expected = pd.concat([ser1, ser2])._values
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_concat_single_dataframe_tz_aware():
+    # https://github.com/pandas-dev/pandas/issues/25257
+    df = pd.DataFrame(
+        {"timestamp": [pd.Timestamp("2020-04-08 09:00:00.709949+0000", tz="UTC")]}
+    )
+    expected = df.copy()
+    result = pd.concat([df])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_concat_periodarray_2d():
+    pi = pd.period_range("2016-01-01", periods=36, freq="D")
+    arr = pi._data.reshape(6, 6)
+
+    result = _concat.concat_compat([arr[:2], arr[2:]], axis=0)
+    tm.assert_period_array_equal(result, arr)
+
+    result = _concat.concat_compat([arr[:, :2], arr[:, 2:]], axis=1)
+    tm.assert_period_array_equal(result, arr)
+
+    msg = (
+        "all the input array dimensions.* for the concatenation axis must match exactly"
+    )
+    with pytest.raises(ValueError, match=msg):
+        _concat.concat_compat([arr[:, :2], arr[:, 2:]], axis=0)
+
+    with pytest.raises(ValueError, match=msg):
+        _concat.concat_compat([arr[:2], arr[2:]], axis=1)
+
+
+def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
+    tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
+    ser1 = Series(index=[tzaware_time], data=0, dtype=float)
+    ser2 = Series(dtype=float)
+
+    result = pd.concat([ser1, ser2], axis=1)
+    expected = pd.DataFrame(
+        data=[
+            (0.0, None),
+        ],
+        index=[tzaware_time]
+        if using_infer_string
+        else pd.Index([tzaware_time], dtype=object),
+        columns=[0, 1],
+        dtype=float,
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ab99458b98e1a737af8f01a584cfcec3bd8acb6
--- /dev/null
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -0,0 +1,1258 @@
+import re
+import warnings
+import weakref
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
+from pandas.errors import Pandas4Warning
+
+from pandas.core.dtypes.base import _registry as registry
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_categorical_dtype,
+    is_datetime64_any_dtype,
+    is_datetime64_dtype,
+    is_datetime64_ns_dtype,
+    is_datetime64tz_dtype,
+    is_dtype_equal,
+    is_interval_dtype,
+    is_period_dtype,
+    is_string_dtype,
+)
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+    IntervalDtype,
+    PeriodDtype,
+)
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    DatetimeIndex,
+    IntervalIndex,
+    Series,
+    SparseDtype,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.core.arrays.sparse import SparseArray
+
+
+class Base:
+    def test_hash(self, dtype):
+        hash(dtype)
+
+    def test_equality_invalid(self, dtype):
+        assert not dtype == "foo"
+        assert not is_dtype_equal(dtype, np.int64)
+
+    def test_numpy_informed(self, dtype):
+        # npdev 2020-02-02 changed from "data type not understood" to
+        #  "Cannot interpret 'foo' as a data type"
+        msg = "|".join(
+            ["data type not understood", "Cannot interpret '.*' as a data type"]
+        )
+        with pytest.raises(TypeError, match=msg):
+            np.dtype(dtype)
+
+        assert not dtype == np.str_
+        assert not np.str_ == dtype
+
+    def test_pickle(self, dtype, temp_file):
+        # make sure our cache is NOT pickled
+
+        # clear the cache
+        type(dtype).reset_cache()
+        assert not len(dtype._cache_dtypes)
+
+        # force back to the cache
+        result = tm.round_trip_pickle(dtype, temp_file)
+        if not isinstance(dtype, PeriodDtype):
+            # Because PeriodDtype has a cython class as a base class,
+            #  it has different pickle semantics, and its cache is re-populated
+            #  on un-pickling.
+            assert not len(dtype._cache_dtypes)
+        assert result == dtype
+
+
+class TestCategoricalDtype(Base):
+    @pytest.fixture
+    def dtype(self):
+        """
+        Class level fixture of dtype for TestCategoricalDtype
+        """
+        return CategoricalDtype()
+
+    def test_hash_vs_equality(self, dtype):
+        dtype2 = CategoricalDtype()
+        assert dtype == dtype2
+        assert dtype2 == dtype
+        assert hash(dtype) == hash(dtype2)
+
+    def test_equality(self, dtype):
+        assert dtype == "category"
+        assert is_dtype_equal(dtype, "category")
+        assert "category" == dtype
+        assert is_dtype_equal("category", dtype)
+
+        assert dtype == CategoricalDtype()
+        assert is_dtype_equal(dtype, CategoricalDtype())
+        assert CategoricalDtype() == dtype
+        assert is_dtype_equal(CategoricalDtype(), dtype)
+
+        assert dtype != "foo"
+        assert not is_dtype_equal(dtype, "foo")
+        assert "foo" != dtype
+        assert not is_dtype_equal("foo", dtype)
+
+    def test_construction_from_string(self, dtype):
+        result = CategoricalDtype.construct_from_string("category")
+        assert is_dtype_equal(dtype, result)
+        msg = "Cannot construct a 'CategoricalDtype' from 'foo'"
+        with pytest.raises(TypeError, match=msg):
+            CategoricalDtype.construct_from_string("foo")
+
+    def test_constructor_invalid(self):
+        msg = "Parameter 'categories' must be list-like"
+        with pytest.raises(TypeError, match=msg):
+            CategoricalDtype("category")
+
+    dtype1 = CategoricalDtype(["a", "b"], ordered=True)
+    dtype2 = CategoricalDtype(["x", "y"], ordered=False)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore")
+        c = Categorical([0, 1], dtype=dtype1)
+
+    @pytest.mark.parametrize(
+        "values, categories, ordered, dtype, expected",
+        [
+            [None, None, None, None, CategoricalDtype()],
+            [None, ["a", "b"], True, None, dtype1],
+            [c, None, None, dtype2, dtype2],
+            [c, ["x", "y"], False, None, dtype2],
+        ],
+    )
+    def test_from_values_or_dtype(self, values, categories, ordered, dtype, expected):
+        result = CategoricalDtype._from_values_or_dtype(
+            values, categories, ordered, dtype
+        )
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "values, categories, ordered, dtype",
+        [
+            [None, ["a", "b"], True, dtype2],
+            [None, ["a", "b"], None, dtype2],
+            [None, None, True, dtype2],
+        ],
+    )
+    def test_from_values_or_dtype_raises(self, values, categories, ordered, dtype):
+        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
+        with pytest.raises(ValueError, match=msg):
+            CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype)
+
+    def test_from_values_or_dtype_invalid_dtype(self):
+        msg = "Cannot not construct CategoricalDtype from <class 'object'>"
+        with pytest.raises(ValueError, match=msg):
+            CategoricalDtype._from_values_or_dtype(None, None, None, object)
+
+    def test_is_dtype(self, dtype):
+        assert CategoricalDtype.is_dtype(dtype)
+        assert CategoricalDtype.is_dtype("category")
+        assert CategoricalDtype.is_dtype(CategoricalDtype())
+        assert not CategoricalDtype.is_dtype("foo")
+        assert not CategoricalDtype.is_dtype(np.float64)
+
+    def test_basic(self, dtype):
+        msg = "is_categorical_dtype is deprecated"
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            assert is_categorical_dtype(dtype)
+
+            factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
+
+            s = Series(factor, name="A")
+
+            # dtypes
+            assert is_categorical_dtype(s.dtype)
+            assert is_categorical_dtype(s)
+            assert not is_categorical_dtype(np.dtype("float64"))
+
+    def test_tuple_categories(self):
+        categories = [(1, "a"), (2, "b"), (3, "c")]
+        result = CategoricalDtype(categories)
+        assert all(result.categories == categories)
+
+    @pytest.mark.parametrize(
+        "categories, expected",
+        [
+            ([True, False], True),
+            ([True, False, None], True),
+            ([True, False, "a", "b'"], False),
+            ([0, 1], False),
+        ],
+    )
+    def test_is_boolean(self, categories, expected):
+        cat = Categorical(categories)
+        assert cat.dtype._is_boolean is expected
+        assert is_bool_dtype(cat) is expected
+        assert is_bool_dtype(cat.dtype) is expected
+
+    def test_dtype_specific_categorical_dtype(self):
+        expected = "datetime64[ns]"
+        dti = DatetimeIndex([], dtype=expected)
+        result = str(Categorical(dti).categories.dtype)
+        assert result == expected
+
+    def test_not_string(self):
+        # though CategoricalDtype has object kind, it cannot be string
+        assert not is_string_dtype(CategoricalDtype())
+
+    def test_repr_range_categories(self):
+        rng = pd.Index(range(3))
+        dtype = CategoricalDtype(categories=rng, ordered=False)
+        result = repr(dtype)
+
+        expected = (
+            "CategoricalDtype(categories=range(0, 3), ordered=False, "
+            "categories_dtype=int64)"
+        )
+        assert result == expected
+
+    def test_update_dtype(self):
+        # GH 27338
+        result = CategoricalDtype(["a"]).update_dtype(Categorical(["b"], ordered=True))
+        expected = CategoricalDtype(["b"], ordered=True)
+        assert result == expected
+
+    def test_repr(self):
+        cat = Categorical(pd.Index([1, 2, 3], dtype="int32"))
+        result = cat.dtype.__repr__()
+        expected = (
+            "CategoricalDtype(categories=[1, 2, 3], ordered=False, "
+            "categories_dtype=int32)"
+        )
+        assert result == expected
+
+
+class TestDatetimeTZDtype(Base):
+    @pytest.fixture
+    def dtype(self):
+        """
+        Class level fixture of dtype for TestDatetimeTZDtype
+        """
+        return DatetimeTZDtype("ns", "US/Eastern")
+
+    def test_alias_to_unit_raises(self):
+        # 23990
+        with pytest.raises(ValueError, match="Passing a dtype alias"):
+            DatetimeTZDtype("datetime64[ns, US/Central]")
+
+    def test_alias_to_unit_bad_alias_raises(self):
+        # 23990
+        with pytest.raises(
+            TypeError, match="Cannot construct a 'DatetimeTZDtype' from"
+        ):
+            DatetimeTZDtype("this is a bad string")
+
+        with pytest.raises(
+            TypeError, match="Cannot construct a 'DatetimeTZDtype' from"
+        ):
+            DatetimeTZDtype("datetime64[ns, US/NotATZ]")
+
+    def test_hash_vs_equality(self, dtype):
+        # make sure that we satisfy is semantics
+        dtype2 = DatetimeTZDtype("ns", "US/Eastern")
+        dtype3 = DatetimeTZDtype(dtype2)
+        assert dtype == dtype2
+        assert dtype2 == dtype
+        assert dtype3 == dtype
+        assert hash(dtype) == hash(dtype2)
+        assert hash(dtype) == hash(dtype3)
+
+        dtype4 = DatetimeTZDtype("ns", "US/Central")
+        assert dtype2 != dtype4
+        assert hash(dtype2) != hash(dtype4)
+
+    def test_construction_non_nanosecond(self):
+        res = DatetimeTZDtype("ms", "US/Eastern")
+        assert res.unit == "ms"
+        assert res._creso == NpyDatetimeUnit.NPY_FR_ms.value
+        assert res.str == "|M8[ms]"
+        assert str(res) == "datetime64[ms, US/Eastern]"
+        assert res.base == np.dtype("M8[ms]")
+
+    def test_day_not_supported(self):
+        msg = "DatetimeTZDtype only supports s, ms, us, ns units"
+        with pytest.raises(ValueError, match=msg):
+            DatetimeTZDtype("D", "US/Eastern")
+
+    def test_subclass(self):
+        a = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]")
+        b = DatetimeTZDtype.construct_from_string("datetime64[ns, CET]")
+
+        assert issubclass(type(a), type(b))
+
+    def test_compat(self, dtype):
+        msg = "is_datetime64tz_dtype is deprecated"
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            assert is_datetime64tz_dtype(dtype)
+            assert is_datetime64tz_dtype("datetime64[ns, US/Eastern]")
+        assert is_datetime64_any_dtype(dtype)
+        assert is_datetime64_any_dtype("datetime64[ns, US/Eastern]")
+        assert is_datetime64_ns_dtype(dtype)
+        assert is_datetime64_ns_dtype("datetime64[ns, US/Eastern]")
+        assert not is_datetime64_dtype(dtype)
+        assert not is_datetime64_dtype("datetime64[ns, US/Eastern]")
+
+    def test_construction_from_string(self, dtype):
+        result = DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]")
+        assert is_dtype_equal(dtype, result)
+
+    @pytest.mark.parametrize(
+        "string",
+        [
+            "foo",
+            "datetime64[ns, notatz]",
+            # non-nano unit
+            "datetime64[ps, UTC]",
+            # dateutil str that returns None from gettz
+            "datetime64[ns, dateutil/invalid]",
+        ],
+    )
+    def test_construct_from_string_invalid_raises(self, string):
+        msg = f"Cannot construct a 'DatetimeTZDtype' from '{string}'"
+        with pytest.raises(TypeError, match=re.escape(msg)):
+            DatetimeTZDtype.construct_from_string(string)
+
+    def test_construct_from_string_wrong_type_raises(self):
+        msg = "'construct_from_string' expects a string, got <class 'list'>"
+        with pytest.raises(TypeError, match=msg):
+            DatetimeTZDtype.construct_from_string(["datetime64[ns, notatz]"])
+
+    def test_is_dtype(self, dtype):
+        assert not DatetimeTZDtype.is_dtype(None)
+        assert DatetimeTZDtype.is_dtype(dtype)
+        assert DatetimeTZDtype.is_dtype("datetime64[ns, US/Eastern]")
+        assert DatetimeTZDtype.is_dtype("M8[ns, US/Eastern]")
+        assert not DatetimeTZDtype.is_dtype("foo")
+        assert DatetimeTZDtype.is_dtype(DatetimeTZDtype("ns", "US/Pacific"))
+        assert not DatetimeTZDtype.is_dtype(np.float64)
+
+    def test_equality(self, dtype):
+        assert is_dtype_equal(dtype, "datetime64[ns, US/Eastern]")
+        assert is_dtype_equal(dtype, "M8[ns, US/Eastern]")
+        assert is_dtype_equal(dtype, DatetimeTZDtype("ns", "US/Eastern"))
+        assert not is_dtype_equal(dtype, "foo")
+        assert not is_dtype_equal(dtype, DatetimeTZDtype("ns", "CET"))
+        assert not is_dtype_equal(
+            DatetimeTZDtype("ns", "US/Eastern"), DatetimeTZDtype("ns", "US/Pacific")
+        )
+
+        # numpy compat
+        assert is_dtype_equal(np.dtype("M8[ns]"), "datetime64[ns]")
+
+        assert dtype == "M8[ns, US/Eastern]"
+
+    def test_basic(self, dtype):
+        msg = "is_datetime64tz_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_datetime64tz_dtype(dtype)
+
+        dr = date_range("20130101", periods=3, tz="US/Eastern")
+        s = Series(dr, name="A")
+
+        # dtypes
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_datetime64tz_dtype(s.dtype)
+            assert is_datetime64tz_dtype(s)
+            assert not is_datetime64tz_dtype(np.dtype("float64"))
+            assert not is_datetime64tz_dtype(1.0)
+
+    def test_dst(self):
+        dr1 = date_range("2013-01-01", periods=3, tz="US/Eastern")
+        s1 = Series(dr1, name="A")
+        assert isinstance(s1.dtype, DatetimeTZDtype)
+
+        dr2 = date_range("2013-08-01", periods=3, tz="US/Eastern")
+        s2 = Series(dr2, name="A")
+        assert isinstance(s2.dtype, DatetimeTZDtype)
+        assert s1.dtype == s2.dtype
+
+    @pytest.mark.parametrize("tz", ["UTC", "US/Eastern"])
+    @pytest.mark.parametrize("constructor", ["M8", "datetime64"])
+    def test_parser(self, tz, constructor):
+        # pr #11245
+        dtz_str = f"{constructor}[ns, {tz}]"
+        result = DatetimeTZDtype.construct_from_string(dtz_str)
+        expected = DatetimeTZDtype("ns", tz)
+        assert result == expected
+
+    def test_empty(self):
+        with pytest.raises(TypeError, match="A 'tz' is required."):
+            DatetimeTZDtype()
+
+    def test_tz_standardize(self):
+        # GH 24713
+        pytz = pytest.importorskip("pytz")
+        tz = pytz.timezone("US/Eastern")
+        dr = date_range("2013-01-01", periods=3, tz=tz)
+        dtype = DatetimeTZDtype("ns", dr.tz)
+        assert dtype.tz == tz
+        dtype = DatetimeTZDtype("ns", dr[0].tz)
+        assert dtype.tz == tz
+
+
+class TestPeriodDtype(Base):
+    @pytest.fixture
+    def dtype(self):
+        """
+        Class level fixture of dtype for TestPeriodDtype
+        """
+        return PeriodDtype("D")
+
+    def test_hash_vs_equality(self, dtype):
+        # make sure that we satisfy is semantics
+        dtype2 = PeriodDtype("D")
+        dtype3 = PeriodDtype(dtype2)
+        assert dtype == dtype2
+        assert dtype2 == dtype
+        assert dtype3 == dtype
+        assert dtype is not dtype2
+        assert dtype2 is not dtype
+        assert dtype3 is not dtype
+        assert hash(dtype) == hash(dtype2)
+        assert hash(dtype) == hash(dtype3)
+
+    def test_construction(self):
+        with pytest.raises(ValueError, match="Invalid frequency: xx"):
+            PeriodDtype("xx")
+
+        for s in ["period[D]", "Period[D]", "D"]:
+            dt = PeriodDtype(s)
+            assert dt.freq == pd.tseries.offsets.Day()
+
+        for s in ["period[3D]", "Period[3D]", "3D"]:
+            dt = PeriodDtype(s)
+            assert dt.freq == pd.tseries.offsets.Day(3)
+
+        for s in [
+            "period[26h]",
+            "Period[26h]",
+            "26h",
+            "period[1D2h]",
+            "Period[1D2h]",
+            "1D2h",
+        ]:
+            dt = PeriodDtype(s)
+            assert dt.freq == pd.tseries.offsets.Hour(26)
+
+    def test_cannot_use_custom_businessday(self):
+        # GH#52534
+        msg = "C is not supported as period frequency"
+        msg1 = "<CustomBusinessDay> is not supported as period frequency"
+        msg2 = r"PeriodDtype\[B\] is deprecated"
+        with pytest.raises(ValueError, match=msg):
+            PeriodDtype("C")
+        with pytest.raises(ValueError, match=msg1):
+            with tm.assert_produces_warning(FutureWarning, match=msg2):
+                PeriodDtype(pd.offsets.CustomBusinessDay())
+
+    def test_subclass(self):
+        a = PeriodDtype("period[D]")
+        b = PeriodDtype("period[3D]")
+
+        assert issubclass(type(a), type(b))
+
+    def test_identity(self):
+        assert PeriodDtype("period[D]") == PeriodDtype("period[D]")
+        assert PeriodDtype("period[D]") is not PeriodDtype("period[D]")
+
+        assert PeriodDtype("period[3D]") == PeriodDtype("period[3D]")
+        assert PeriodDtype("period[3D]") is not PeriodDtype("period[3D]")
+
+        assert PeriodDtype("period[1s1us]") == PeriodDtype("period[1000001us]")
+        assert PeriodDtype("period[1s1us]") is not PeriodDtype("period[1000001us]")
+
+    def test_compat(self, dtype):
+        assert not is_datetime64_ns_dtype(dtype)
+        assert not is_datetime64_ns_dtype("period[D]")
+        assert not is_datetime64_dtype(dtype)
+        assert not is_datetime64_dtype("period[D]")
+
+    def test_construction_from_string(self, dtype):
+        result = PeriodDtype("period[D]")
+        assert is_dtype_equal(dtype, result)
+        result = PeriodDtype.construct_from_string("period[D]")
+        assert is_dtype_equal(dtype, result)
+
+        with pytest.raises(TypeError, match="list"):
+            PeriodDtype.construct_from_string([1, 2, 3])
+
+    @pytest.mark.parametrize(
+        "string",
+        [
+            "foo",
+            "period[foo]",
+            "foo[D]",
+            "datetime64[ns]",
+            "datetime64[ns, US/Eastern]",
+        ],
+    )
+    def test_construct_dtype_from_string_invalid_raises(self, string):
+        msg = f"Cannot construct a 'PeriodDtype' from '{string}'"
+        with pytest.raises(TypeError, match=re.escape(msg)):
+            PeriodDtype.construct_from_string(string)
+
+    def test_is_dtype(self, dtype):
+        assert PeriodDtype.is_dtype(dtype)
+        assert PeriodDtype.is_dtype("period[D]")
+        assert PeriodDtype.is_dtype("period[3D]")
+        assert PeriodDtype.is_dtype(PeriodDtype("3D"))
+        assert PeriodDtype.is_dtype("period[us]")
+        assert PeriodDtype.is_dtype("period[s]")
+        assert PeriodDtype.is_dtype(PeriodDtype("us"))
+        assert PeriodDtype.is_dtype(PeriodDtype("s"))
+
+        assert not PeriodDtype.is_dtype("D")
+        assert not PeriodDtype.is_dtype("3D")
+        assert not PeriodDtype.is_dtype("U")
+        assert not PeriodDtype.is_dtype("s")
+        assert not PeriodDtype.is_dtype("foo")
+        assert not PeriodDtype.is_dtype(np.object_)
+        assert not PeriodDtype.is_dtype(np.int64)
+        assert not PeriodDtype.is_dtype(np.float64)
+
+    def test_equality(self, dtype):
+        assert is_dtype_equal(dtype, "period[D]")
+        assert is_dtype_equal(dtype, PeriodDtype("D"))
+        assert is_dtype_equal(dtype, PeriodDtype("D"))
+        assert is_dtype_equal(PeriodDtype("D"), PeriodDtype("D"))
+
+        assert not is_dtype_equal(dtype, "D")
+        assert not is_dtype_equal(PeriodDtype("D"), PeriodDtype("2D"))
+
+    def test_basic(self, dtype):
+        msg = "is_period_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_period_dtype(dtype)
+
+            pidx = pd.period_range("2013-01-01 09:00", periods=5, freq="h")
+
+            assert is_period_dtype(pidx.dtype)
+            assert is_period_dtype(pidx)
+
+            s = Series(pidx, name="A")
+
+            assert is_period_dtype(s.dtype)
+            assert is_period_dtype(s)
+
+            assert not is_period_dtype(np.dtype("float64"))
+            assert not is_period_dtype(1.0)
+
+    def test_freq_argument_required(self):
+        # GH#27388
+        msg = "missing 1 required positional argument: 'freq'"
+        with pytest.raises(TypeError, match=msg):
+            PeriodDtype()
+
+        msg = "PeriodDtype argument should be string or BaseOffset, got NoneType"
+        with pytest.raises(TypeError, match=msg):
+            # GH#51790
+            PeriodDtype(None)
+
+    def test_not_string(self):
+        # though PeriodDtype has object kind, it cannot be string
+        assert not is_string_dtype(PeriodDtype("D"))
+
+    def test_perioddtype_caching_dateoffset_normalize(self):
+        # GH 24121
+        per_d = PeriodDtype(pd.offsets.YearEnd(normalize=True))
+        assert per_d.freq.normalize
+
+        per_d2 = PeriodDtype(pd.offsets.YearEnd(normalize=False))
+        assert not per_d2.freq.normalize
+
+    def test_dont_keep_ref_after_del(self):
+        # GH 54184
+        dtype = PeriodDtype("D")
+        ref = weakref.ref(dtype)
+        del dtype
+        assert ref() is None
+
+
+class TestIntervalDtype(Base):
+    @pytest.fixture
+    def dtype(self):
+        """
+        Class level fixture of dtype for TestIntervalDtype
+        """
+        return IntervalDtype("int64", "right")
+
+    def test_hash_vs_equality(self, dtype):
+        # make sure that we satisfy is semantics
+        dtype2 = IntervalDtype("int64", "right")
+        dtype3 = IntervalDtype(dtype2)
+        assert dtype == dtype2
+        assert dtype2 == dtype
+        assert dtype3 == dtype
+        assert dtype is not dtype2
+        assert dtype2 is not dtype3
+        assert dtype3 is not dtype
+        assert hash(dtype) == hash(dtype2)
+        assert hash(dtype) == hash(dtype3)
+
+        dtype1 = IntervalDtype("interval")
+        dtype2 = IntervalDtype(dtype1)
+        dtype3 = IntervalDtype("interval")
+        assert dtype2 == dtype1
+        assert dtype2 == dtype2
+        assert dtype2 == dtype3
+        assert dtype2 is not dtype1
+        assert dtype2 is dtype2
+        assert dtype2 is not dtype3
+        assert hash(dtype2) == hash(dtype1)
+        assert hash(dtype2) == hash(dtype2)
+        assert hash(dtype2) == hash(dtype3)
+
+    @pytest.mark.parametrize(
+        "subtype", ["interval[int64]", "Interval[int64]", "int64", np.dtype("int64")]
+    )
+    def test_construction(self, subtype):
+        i = IntervalDtype(subtype, closed="right")
+        assert i.subtype == np.dtype("int64")
+        msg = "is_interval_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_interval_dtype(i)
+
+    @pytest.mark.parametrize(
+        "subtype", ["interval[int64]", "Interval[int64]", "int64", np.dtype("int64")]
+    )
+    def test_construction_allows_closed_none(self, subtype):
+        # GH#38394
+        dtype = IntervalDtype(subtype)
+
+        assert dtype.closed is None
+
+    def test_closed_mismatch(self):
+        msg = "'closed' keyword does not match value specified in dtype string"
+        with pytest.raises(ValueError, match=msg):
+            IntervalDtype("interval[int64, left]", "right")
+
+    @pytest.mark.parametrize("subtype", [None, "interval", "Interval"])
+    def test_construction_generic(self, subtype):
+        # generic
+        i = IntervalDtype(subtype)
+        assert i.subtype is None
+        msg = "is_interval_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_interval_dtype(i)
+
+    @pytest.mark.parametrize(
+        "subtype",
+        [
+            CategoricalDtype(list("abc"), False),
+            CategoricalDtype(list("wxyz"), True),
+            object,
+            str,
+            "<U10",
+            "interval[category]",
+            "interval[object]",
+        ],
+    )
+    def test_construction_not_supported(self, subtype):
+        # GH 19016
+        msg = (
+            "category, object, and string subtypes are not supported for IntervalDtype"
+        )
+        with pytest.raises(TypeError, match=msg):
+            IntervalDtype(subtype)
+
+    @pytest.mark.parametrize("subtype", ["xx", "IntervalA", "Interval[foo]"])
+    def test_construction_errors(self, subtype):
+        msg = "could not construct IntervalDtype"
+        with pytest.raises(TypeError, match=msg):
+            IntervalDtype(subtype)
+
+    def test_closed_must_match(self):
+        # GH#37933
+        dtype = IntervalDtype(np.float64, "left")
+
+        msg = "dtype.closed and 'closed' do not match"
+        with pytest.raises(ValueError, match=msg):
+            IntervalDtype(dtype, closed="both")
+
+    def test_closed_invalid(self):
+        with pytest.raises(ValueError, match="closed must be one of"):
+            IntervalDtype(np.float64, "foo")
+
+    def test_construction_from_string(self, dtype):
+        result = IntervalDtype("interval[int64, right]")
+        assert is_dtype_equal(dtype, result)
+        result = IntervalDtype.construct_from_string("interval[int64, right]")
+        assert is_dtype_equal(dtype, result)
+
+    @pytest.mark.parametrize("string", [0, 3.14, ("a", "b"), None])
+    def test_construction_from_string_errors(self, string):
+        # these are invalid entirely
+        msg = f"'construct_from_string' expects a string, got {type(string)}"
+
+        with pytest.raises(TypeError, match=re.escape(msg)):
+            IntervalDtype.construct_from_string(string)
+
+    @pytest.mark.parametrize("string", ["foo", "foo[int64]", "IntervalA"])
+    def test_construction_from_string_error_subtype(self, string):
+        # this is an invalid subtype
+        msg = (
+            "Incorrectly formatted string passed to constructor. "
+            r"Valid formats include Interval or Interval\[dtype\] "
+            "where dtype is numeric, datetime, or timedelta"
+        )
+
+        with pytest.raises(TypeError, match=msg):
+            IntervalDtype.construct_from_string(string)
+
+    def test_subclass(self):
+        a = IntervalDtype("interval[int64, right]")
+        b = IntervalDtype("interval[int64, right]")
+
+        assert issubclass(type(a), type(b))
+
+    def test_is_dtype(self, dtype):
+        assert IntervalDtype.is_dtype(dtype)
+        assert IntervalDtype.is_dtype("interval")
+        assert IntervalDtype.is_dtype(IntervalDtype("float64"))
+        assert IntervalDtype.is_dtype(IntervalDtype("int64"))
+        assert IntervalDtype.is_dtype(IntervalDtype(np.int64))
+        assert IntervalDtype.is_dtype(IntervalDtype("float64", "left"))
+        assert IntervalDtype.is_dtype(IntervalDtype("int64", "right"))
+        assert IntervalDtype.is_dtype(IntervalDtype(np.int64, "both"))
+
+        assert not IntervalDtype.is_dtype("D")
+        assert not IntervalDtype.is_dtype("3D")
+        assert not IntervalDtype.is_dtype("us")
+        assert not IntervalDtype.is_dtype("S")
+        assert not IntervalDtype.is_dtype("foo")
+        assert not IntervalDtype.is_dtype("IntervalA")
+        assert not IntervalDtype.is_dtype(np.object_)
+        assert not IntervalDtype.is_dtype(np.int64)
+        assert not IntervalDtype.is_dtype(np.float64)
+
+    def test_equality(self, dtype):
+        assert is_dtype_equal(dtype, "interval[int64, right]")
+        assert is_dtype_equal(dtype, IntervalDtype("int64", "right"))
+        assert is_dtype_equal(
+            IntervalDtype("int64", "right"), IntervalDtype("int64", "right")
+        )
+
+        assert not is_dtype_equal(dtype, "interval[int64]")
+        assert not is_dtype_equal(dtype, IntervalDtype("int64"))
+        assert not is_dtype_equal(
+            IntervalDtype("int64", "right"), IntervalDtype("int64")
+        )
+
+        assert not is_dtype_equal(dtype, "int64")
+        assert not is_dtype_equal(
+            IntervalDtype("int64", "neither"), IntervalDtype("float64", "right")
+        )
+        assert not is_dtype_equal(
+            IntervalDtype("int64", "both"), IntervalDtype("int64", "left")
+        )
+
+        # invalid subtype comparisons do not raise when directly compared
+        dtype1 = IntervalDtype("float64", "left")
+        dtype2 = IntervalDtype("datetime64[ns, US/Eastern]", "left")
+        assert dtype1 != dtype2
+        assert dtype2 != dtype1
+
+    @pytest.mark.parametrize(
+        "subtype",
+        [
+            None,
+            "interval",
+            "Interval",
+            "int64",
+            "uint64",
+            "float64",
+            "complex128",
+            "datetime64",
+            "timedelta64",
+            PeriodDtype("Q"),
+        ],
+    )
+    def test_equality_generic(self, subtype):
+        # GH 18980
+        closed = "right" if subtype is not None else None
+        dtype = IntervalDtype(subtype, closed=closed)
+        assert is_dtype_equal(dtype, "interval")
+        assert is_dtype_equal(dtype, IntervalDtype())
+
+    @pytest.mark.parametrize(
+        "subtype",
+        [
+            "int64",
+            "uint64",
+            "float64",
+            "complex128",
+            "datetime64",
+            "timedelta64",
+            PeriodDtype("Q"),
+        ],
+    )
+    def test_name_repr(self, subtype):
+        # GH 18980
+        closed = "right" if subtype is not None else None
+        dtype = IntervalDtype(subtype, closed=closed)
+        expected = f"interval[{subtype}, {closed}]"
+        assert str(dtype) == expected
+        assert dtype.name == "interval"
+
+    @pytest.mark.parametrize("subtype", [None, "interval", "Interval"])
+    def test_name_repr_generic(self, subtype):
+        # GH 18980
+        dtype = IntervalDtype(subtype)
+        assert str(dtype) == "interval"
+        assert dtype.name == "interval"
+
+    def test_basic(self, dtype):
+        msg = "is_interval_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_interval_dtype(dtype)
+
+            ii = IntervalIndex.from_breaks(range(3))
+
+            assert is_interval_dtype(ii.dtype)
+            assert is_interval_dtype(ii)
+
+            s = Series(ii, name="A")
+
+            assert is_interval_dtype(s.dtype)
+            assert is_interval_dtype(s)
+
+    def test_basic_dtype(self):
+        msg = "is_interval_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_interval_dtype("interval[int64, both]")
+            assert is_interval_dtype(IntervalIndex.from_tuples([(0, 1)]))
+            assert is_interval_dtype(IntervalIndex.from_breaks(np.arange(4)))
+            assert is_interval_dtype(
+                IntervalIndex.from_breaks(date_range("20130101", periods=3))
+            )
+            assert not is_interval_dtype("U")
+            assert not is_interval_dtype("S")
+            assert not is_interval_dtype("foo")
+            assert not is_interval_dtype(np.object_)
+            assert not is_interval_dtype(np.int64)
+            assert not is_interval_dtype(np.float64)
+
+    def test_caching(self, temp_file):
+        # GH 54184: Caching not shown to improve performance
+        IntervalDtype.reset_cache()
+        dtype = IntervalDtype("int64", "right")
+        assert len(IntervalDtype._cache_dtypes) == 0
+
+        IntervalDtype("interval")
+        assert len(IntervalDtype._cache_dtypes) == 0
+
+        IntervalDtype.reset_cache()
+        tm.round_trip_pickle(dtype, temp_file)
+        assert len(IntervalDtype._cache_dtypes) == 0
+
+    def test_not_string(self):
+        # GH30568: though IntervalDtype has object kind, it cannot be string
+        assert not is_string_dtype(IntervalDtype())
+
+    def test_unpickling_without_closed(self, temp_file):
+        # GH#38394
+        dtype = IntervalDtype("interval")
+
+        assert dtype._closed is None
+
+        tm.round_trip_pickle(dtype, temp_file)
+
+    def test_dont_keep_ref_after_del(self):
+        # GH 54184
+        dtype = IntervalDtype("int64", "right")
+        ref = weakref.ref(dtype)
+        del dtype
+        assert ref() is None
+
+
+class TestCategoricalDtypeParametrized:
+    @pytest.mark.parametrize(
+        "categories",
+        [
+            list("abcd"),
+            np.arange(1000),
+            ["a", "b", 10, 2, 1.3, True],
+            [True, False],
+            date_range("2017", periods=4),
+        ],
+    )
+    def test_basic(self, categories, ordered):
+        c1 = CategoricalDtype(categories, ordered=ordered)
+        tm.assert_index_equal(c1.categories, pd.Index(categories))
+        assert c1.ordered is ordered
+
+    def test_order_matters(self):
+        categories = ["a", "b"]
+        c1 = CategoricalDtype(categories, ordered=True)
+        c2 = CategoricalDtype(categories, ordered=False)
+        c3 = CategoricalDtype(categories, ordered=None)
+        assert c1 is not c2
+        assert c1 is not c3
+
+    @pytest.mark.parametrize("ordered", [False, None])
+    def test_unordered_same(self, ordered):
+        c1 = CategoricalDtype(["a", "b"], ordered=ordered)
+        c2 = CategoricalDtype(["b", "a"], ordered=ordered)
+        assert hash(c1) == hash(c2)
+
+    def test_categories(self):
+        result = CategoricalDtype(["a", "b", "c"])
+        tm.assert_index_equal(result.categories, pd.Index(["a", "b", "c"]))
+        assert result.ordered is False
+
+    def test_equal_but_different(self):
+        c1 = CategoricalDtype([1, 2, 3])
+        c2 = CategoricalDtype([1.0, 2.0, 3.0])
+        assert c1 is not c2
+        assert c1 != c2
+
+    def test_equal_but_different_mixed_dtypes(self):
+        c1 = CategoricalDtype([1, 2, "3"])
+        c2 = CategoricalDtype(["3", 1, 2])
+        assert c1 is not c2
+        assert c1 == c2
+
+    def test_equal_empty_ordered(self):
+        c1 = CategoricalDtype([], ordered=True)
+        c2 = CategoricalDtype([], ordered=True)
+        assert c1 is not c2
+        assert c1 == c2
+
+    def test_equal_empty_unordered(self):
+        c1 = CategoricalDtype([])
+        c2 = CategoricalDtype([])
+        assert c1 is not c2
+        assert c1 == c2
+
+    @pytest.mark.parametrize("v1, v2", [([1, 2, 3], [1, 2, 3]), ([1, 2, 3], [3, 2, 1])])
+    def test_order_hashes_different(self, v1, v2):
+        c1 = CategoricalDtype(v1, ordered=False)
+        c2 = CategoricalDtype(v2, ordered=True)
+        c3 = CategoricalDtype(v1, ordered=None)
+        assert c1 is not c2
+        assert c1 is not c3
+
+    def test_nan_invalid(self):
+        msg = "Categorical categories cannot be null"
+        with pytest.raises(ValueError, match=msg):
+            CategoricalDtype([1, 2, np.nan])
+
+    def test_non_unique_invalid(self):
+        msg = "Categorical categories must be unique"
+        with pytest.raises(ValueError, match=msg):
+            CategoricalDtype([1, 2, 1])
+
+    def test_same_categories_different_order(self):
+        c1 = CategoricalDtype(["a", "b"], ordered=True)
+        c2 = CategoricalDtype(["b", "a"], ordered=True)
+        assert c1 is not c2
+
+    @pytest.mark.parametrize("ordered2", [True, False, None])
+    def test_categorical_equality(self, ordered, ordered2):
+        # same categories, same order
+        # any combination of None/False are equal
+        # True/True is the only combination with True that are equal
+        c1 = CategoricalDtype(list("abc"), ordered)
+        c2 = CategoricalDtype(list("abc"), ordered2)
+        result = c1 == c2
+        expected = bool(ordered) is bool(ordered2)
+        assert result is expected
+
+        # same categories, different order
+        # any combination of None/False are equal (order doesn't matter)
+        # any combination with True are not equal (different order of cats)
+        c1 = CategoricalDtype(list("abc"), ordered)
+        c2 = CategoricalDtype(list("cab"), ordered2)
+        result = c1 == c2
+        expected = (bool(ordered) is False) and (bool(ordered2) is False)
+        assert result is expected
+
+        # different categories
+        c2 = CategoricalDtype([1, 2, 3], ordered2)
+        assert c1 != c2
+
+        # none categories
+        c1 = CategoricalDtype(list("abc"), ordered)
+        c2 = CategoricalDtype(None, ordered2)
+        c3 = CategoricalDtype(None, ordered)
+        assert c1 != c2
+        assert c2 != c1
+        assert c2 == c3
+
+    def test_categorical_dtype_equality_requires_categories(self):
+        # CategoricalDtype with categories=None is *not* equal to
+        #  any fully-initialized CategoricalDtype
+        first = CategoricalDtype(["a", "b"])
+        second = CategoricalDtype()
+        third = CategoricalDtype(ordered=True)
+
+        assert second == second
+        assert third == third
+
+        assert first != second
+        assert second != first
+        assert first != third
+        assert third != first
+        assert second == third
+        assert third == second
+
+    @pytest.mark.parametrize("categories", [list("abc"), None])
+    @pytest.mark.parametrize("other", ["category", "not a category"])
+    def test_categorical_equality_strings(self, categories, ordered, other):
+        c1 = CategoricalDtype(categories, ordered)
+        result = c1 == other
+        expected = other == "category"
+        assert result is expected
+
+    def test_invalid_raises(self):
+        with pytest.raises(TypeError, match="ordered"):
+            CategoricalDtype(["a", "b"], ordered="foo")
+
+        with pytest.raises(TypeError, match="'categories' must be list-like"):
+            CategoricalDtype("category")
+
+    def test_mixed(self):
+        a = CategoricalDtype(["a", "b", 1, 2])
+        b = CategoricalDtype(["a", "b", "1", "2"])
+        assert hash(a) != hash(b)
+
+    def test_from_categorical_dtype_identity(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # Identity test for no changes
+        c2 = CategoricalDtype._from_categorical_dtype(c1)
+        assert c2 is c1
+
+    def test_from_categorical_dtype_categories(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # override categories
+        result = CategoricalDtype._from_categorical_dtype(c1, categories=[2, 3])
+        assert result == CategoricalDtype([2, 3], ordered=True)
+
+    def test_from_categorical_dtype_ordered(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # override ordered
+        result = CategoricalDtype._from_categorical_dtype(c1, ordered=False)
+        assert result == CategoricalDtype([1, 2, 3], ordered=False)
+
+    def test_from_categorical_dtype_both(self):
+        c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
+        # override ordered
+        result = CategoricalDtype._from_categorical_dtype(
+            c1, categories=[1, 2], ordered=False
+        )
+        assert result == CategoricalDtype([1, 2], ordered=False)
+
+    def test_str_vs_repr(self, ordered, using_infer_string):
+        c1 = CategoricalDtype(["a", "b"], ordered=ordered)
+        assert str(c1) == "category"
+        # Py2 will have unicode prefixes
+        dtype = "str" if using_infer_string else "object"
+        pat = (
+            r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, "
+            rf"categories_dtype={dtype}\)"
+        )
+        assert re.match(pat.format(ordered=ordered), repr(c1))
+
+    def test_categorical_categories(self):
+        # GH17884
+        c1 = CategoricalDtype(Categorical(["a", "b"]))
+        tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
+        c1 = CategoricalDtype(CategoricalIndex(["a", "b"]))
+        tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
+
+    @pytest.mark.parametrize(
+        "new_categories", [list("abc"), list("cba"), list("wxyz"), None]
+    )
+    @pytest.mark.parametrize("new_ordered", [True, False, None])
+    def test_update_dtype(self, ordered, new_categories, new_ordered):
+        original_categories = list("abc")
+        dtype = CategoricalDtype(original_categories, ordered)
+        new_dtype = CategoricalDtype(new_categories, new_ordered)
+
+        result = dtype.update_dtype(new_dtype)
+        expected_categories = pd.Index(new_categories or original_categories)
+        expected_ordered = new_ordered if new_ordered is not None else dtype.ordered
+
+        tm.assert_index_equal(result.categories, expected_categories)
+        assert result.ordered is expected_ordered
+
+    def test_update_dtype_string(self, ordered):
+        dtype = CategoricalDtype(list("abc"), ordered)
+        expected_categories = dtype.categories
+        expected_ordered = dtype.ordered
+        result = dtype.update_dtype("category")
+        tm.assert_index_equal(result.categories, expected_categories)
+        assert result.ordered is expected_ordered
+
+    @pytest.mark.parametrize("bad_dtype", ["foo", object, np.int64, PeriodDtype("Q")])
+    def test_update_dtype_errors(self, bad_dtype):
+        dtype = CategoricalDtype(list("abc"), False)
+        msg = "a CategoricalDtype must be passed to perform an update, "
+        with pytest.raises(ValueError, match=msg):
+            dtype.update_dtype(bad_dtype)
+
+
+@pytest.mark.parametrize(
+    "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype]
+)
+def test_registry(dtype):
+    assert dtype in registry.dtypes
+
+
+@pytest.mark.parametrize(
+    "dtype, expected",
+    [
+        ("int64", None),
+        ("interval", IntervalDtype()),
+        ("interval[int64, neither]", IntervalDtype()),
+        ("interval[datetime64[ns], left]", IntervalDtype("datetime64[ns]", "left")),
+        ("period[D]", PeriodDtype("D")),
+        ("category", CategoricalDtype()),
+        ("datetime64[ns, US/Eastern]", DatetimeTZDtype("ns", "US/Eastern")),
+    ],
+)
+def test_registry_find(dtype, expected):
+    assert registry.find(dtype) == expected
+
+
+@pytest.mark.parametrize(
+    "dtype, expected",
+    [
+        (str, False),
+        (int, False),
+        (bool, True),
+        (np.bool_, True),
+        (np.array(["a", "b"]), False),
+        (Series([1, 2]), False),
+        (np.array([True, False]), True),
+        (Series([True, False]), True),
+        (SparseArray([True, False]), True),
+        (SparseDtype(bool), True),
+    ],
+)
+def test_is_bool_dtype(dtype, expected):
+    result = is_bool_dtype(dtype)
+    assert result is expected
+
+
+def test_is_bool_dtype_sparse():
+    result = is_bool_dtype(Series(SparseArray([True, False])))
+    assert result is True
+
+
+@pytest.mark.parametrize(
+    "check",
+    [
+        is_categorical_dtype,
+        is_datetime64tz_dtype,
+        is_period_dtype,
+        is_datetime64_ns_dtype,
+        is_datetime64_dtype,
+        is_interval_dtype,
+        is_datetime64_any_dtype,
+        is_string_dtype,
+        is_bool_dtype,
+    ],
+)
+def test_is_dtype_no_warning(check):
+    data = pd.DataFrame({"A": [1, 2]})
+
+    warn = None
+    msg = f"{check.__name__} is deprecated"
+    if (
+        check is is_categorical_dtype
+        or check is is_interval_dtype
+        or check is is_datetime64tz_dtype
+        or check is is_period_dtype
+    ):
+        warn = DeprecationWarning
+
+    with tm.assert_produces_warning(warn, match=msg):
+        check(data)
+
+    with tm.assert_produces_warning(warn, match=msg):
+        check(data["A"])
+
+
+def test_period_dtype_compare_to_string():
+    # https://github.com/pandas-dev/pandas/issues/37265
+    dtype = PeriodDtype(freq="M")
+    assert (dtype == "period[M]") is True
+    assert (dtype != "period[M]") is False
+
+
+def test_compare_complex_dtypes():
+    # GH 28050
+    df = pd.DataFrame(np.arange(5).astype(np.complex128))
+    msg = "'<' not supported between instances of 'complex' and 'complex'"
+
+    with pytest.raises(TypeError, match=msg):
+        df < df.astype(object)
+
+    with pytest.raises(TypeError, match=msg):
+        df.lt(df.astype(object))
+
+
+def test_cast_string_to_complex():
+    # GH 4895
+    expected = pd.DataFrame(["1.0+5j", "1.5-3j"], dtype=complex)
+    result = pd.DataFrame(["1.0+5j", "1.5-3j"]).astype(complex)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_complex():
+    result = Categorical([1, 2 + 2j])
+    expected = Categorical([1.0 + 0.0j, 2.0 + 2.0j])
+    tm.assert_categorical_equal(result, expected)
+    result = Categorical([1, 2, 2 + 2j])
+    expected = Categorical([1.0 + 0.0j, 2.0 + 0.0j, 2.0 + 2.0j])
+    tm.assert_categorical_equal(result, expected)
+
+
+def test_multi_column_dtype_assignment():
+    # GH #27583
+    df = pd.DataFrame({"a": [0.0], "b": 0.0})
+    expected = pd.DataFrame({"a": [0], "b": 0})
+
+    df[["a", "b"]] = 0
+    tm.assert_frame_equal(df, expected)
+
+    df["b"] = 0
+    tm.assert_frame_equal(df, expected)
+
+
+def test_loc_setitem_empty_labels_no_dtype_conversion():
+    # GH 29707
+
+    df = pd.DataFrame({"a": [2, 3]})
+    expected = df.copy()
+    assert df.a.dtype == "int64"
+    df.loc[[]] = 0.1
+
+    assert df.a.dtype == "int64"
+    tm.assert_frame_equal(df, expected)
+
+
+def test_categorical_nan_no_dtype_conversion():
+    # GH 43996
+
+    df = pd.DataFrame({"a": Categorical([np.nan], [1]), "b": [1]})
+    expected = pd.DataFrame({"a": Categorical([1], [1]), "b": [1]})
+    df.loc[0, "a"] = np.array([1])
+    tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b90886a8d0703773d67d31d9db38a7d59284d25
--- /dev/null
+++ b/pandas/tests/dtypes/test_generic.py
@@ -0,0 +1,130 @@
+import re
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes import generic as gt
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestABCClasses:
+    tuples = [[1, 2, 2], ["red", "blue", "red"]]
+    multi_index = pd.MultiIndex.from_arrays(tuples, names=("number", "color"))
+    datetime_index = pd.to_datetime(["2000/1/1", "2010/1/1"])
+    timedelta_index = pd.to_timedelta(np.arange(5), unit="s")
+    period_index = pd.period_range("2000/1/1", "2010/1/1/", freq="M")
+    categorical = pd.Categorical([1, 2, 3], categories=[2, 3, 1])
+    categorical_df = pd.DataFrame({"values": [1, 2, 3]}, index=categorical)
+    df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
+    sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10))
+
+    datetime_array = datetime_index.array
+    timedelta_array = timedelta_index.array
+
+    abc_pairs = [
+        ("ABCMultiIndex", multi_index),
+        ("ABCDatetimeIndex", datetime_index),
+        ("ABCRangeIndex", pd.RangeIndex(3)),
+        ("ABCTimedeltaIndex", timedelta_index),
+        ("ABCIntervalIndex", pd.interval_range(start=0, end=3)),
+        (
+            "ABCPeriodArray",
+            pd.arrays.PeriodArray([2000, 2001, 2002], dtype="period[D]"),
+        ),
+        ("ABCNumpyExtensionArray", pd.arrays.NumpyExtensionArray(np.array([0, 1, 2]))),
+        ("ABCPeriodIndex", period_index),
+        ("ABCCategoricalIndex", categorical_df.index),
+        ("ABCSeries", pd.Series([1, 2, 3])),
+        ("ABCDataFrame", df),
+        ("ABCCategorical", categorical),
+        ("ABCDatetimeArray", datetime_array),
+        ("ABCTimedeltaArray", timedelta_array),
+    ]
+
+    @pytest.mark.parametrize("abctype1, inst", abc_pairs)
+    @pytest.mark.parametrize("abctype2, _", abc_pairs)
+    def test_abc_pairs_instance_check(self, abctype1, abctype2, inst, _):
+        # GH 38588, 46719
+        if abctype1 == abctype2:
+            assert isinstance(inst, getattr(gt, abctype2))
+            assert not isinstance(type(inst), getattr(gt, abctype2))
+        else:
+            assert not isinstance(inst, getattr(gt, abctype2))
+
+    @pytest.mark.parametrize("abctype1, inst", abc_pairs)
+    @pytest.mark.parametrize("abctype2, _", abc_pairs)
+    def test_abc_pairs_subclass_check(self, abctype1, abctype2, inst, _):
+        # GH 38588, 46719
+        if abctype1 == abctype2:
+            assert issubclass(type(inst), getattr(gt, abctype2))
+
+            with pytest.raises(
+                TypeError, match=re.escape("issubclass() arg 1 must be a class")
+            ):
+                issubclass(inst, getattr(gt, abctype2))
+        else:
+            assert not issubclass(type(inst), getattr(gt, abctype2))
+
+    abc_subclasses = {
+        "ABCIndex": [
+            abctype
+            for abctype, _ in abc_pairs
+            if "Index" in abctype and abctype != "ABCIndex"
+        ],
+        "ABCNDFrame": ["ABCSeries", "ABCDataFrame"],
+        "ABCExtensionArray": [
+            "ABCCategorical",
+            "ABCDatetimeArray",
+            "ABCPeriodArray",
+            "ABCTimedeltaArray",
+        ],
+    }
+
+    @pytest.mark.parametrize("parent, subs", abc_subclasses.items())
+    @pytest.mark.parametrize("abctype, inst", abc_pairs)
+    def test_abc_hierarchy(self, parent, subs, abctype, inst):
+        # GH 38588
+        if abctype in subs:
+            assert isinstance(inst, getattr(gt, parent))
+        else:
+            assert not isinstance(inst, getattr(gt, parent))
+
+    @pytest.mark.parametrize("abctype", [e for e in gt.__dict__ if e.startswith("ABC")])
+    def test_abc_coverage(self, abctype):
+        # GH 38588
+        assert (
+            abctype in (e for e, _ in self.abc_pairs) or abctype in self.abc_subclasses
+        )
+
+
+def test_setattr_warnings():
+    # GH7175 - GOTCHA: You can't use dot notation to add a column...
+    d = {
+        "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]),
+        "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]),
+    }
+    df = pd.DataFrame(d)
+
+    with tm.assert_produces_warning(None):
+        #  successfully add new column
+        #  this should not raise a warning
+        df["three"] = df.two + 1
+        assert df.three.sum() > df.two.sum()
+
+    with tm.assert_produces_warning(None):
+        #  successfully modify column in place
+        #  this should not raise a warning
+        df.one += 1
+        assert df.one.iloc[0] == 2
+
+    with tm.assert_produces_warning(None):
+        #  successfully add an attribute to a series
+        #  this should not raise a warning
+        df.two.not_an_index = [1, 2]
+
+    with tm.assert_produces_warning(UserWarning, match="doesn't allow columns"):
+        #  warn when setting column to nonexistent name
+        df.four = df.two + 2
+        assert df.four.sum() > df.two.sum()
diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cea1276f7f4bd5d5077b29aa698484d324ccc7f
--- /dev/null
+++ b/pandas/tests/dtypes/test_inference.py
@@ -0,0 +1,2155 @@
+"""
+These the test the public routines exposed in types/common.py
+related to inference and not otherwise tested in types/test_common.py
+
+"""
+
+import collections
+from collections import namedtuple
+from collections.abc import Iterator
+from datetime import (
+    date,
+    datetime,
+    time,
+    timedelta,
+    timezone,
+)
+from decimal import Decimal
+from fractions import Fraction
+from io import StringIO
+import itertools
+from numbers import Number
+import re
+import sys
+from typing import (
+    Generic,
+    TypeVar,
+)
+
+import numpy as np
+import pytest
+
+from pandas._libs import (
+    lib,
+    missing as libmissing,
+    ops as libops,
+)
+from pandas.compat import PY312
+from pandas.compat.numpy import np_version_gt2
+from pandas.errors import Pandas4Warning
+
+from pandas.core.dtypes import inference
+from pandas.core.dtypes.cast import find_result_type
+from pandas.core.dtypes.common import (
+    ensure_int32,
+    is_bool,
+    is_complex,
+    is_datetime64_any_dtype,
+    is_datetime64_dtype,
+    is_datetime64_ns_dtype,
+    is_datetime64tz_dtype,
+    is_float,
+    is_integer,
+    is_number,
+    is_scalar,
+    is_scipy_sparse,
+    is_timedelta64_dtype,
+    is_timedelta64_ns_dtype,
+)
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    DateOffset,
+    DatetimeIndex,
+    Index,
+    Interval,
+    Period,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    Timestamp,
+)
+import pandas._testing as tm
+from pandas.core.arrays import (
+    BooleanArray,
+    FloatingArray,
+    IntegerArray,
+)
+
+
+@pytest.fixture(params=[True, False], ids=str)
+def coerce(request):
+    return request.param
+
+
+class MockNumpyLikeArray:
+    """
+    A class which is numpy-like (e.g. Pint's Quantity) but not actually numpy
+
+    The key is that it is not actually a numpy array so
+    ``util.is_array(mock_numpy_like_array_instance)`` returns ``False``. Other
+    important properties are that the class defines a :meth:`__iter__` method
+    (so that ``isinstance(abc.Iterable)`` returns ``True``) and has a
+    :meth:`ndim` property, as pandas special-cases 0-dimensional arrays in some
+    cases.
+
+    We expect pandas to behave with respect to such duck arrays exactly as
+    with real numpy arrays. In particular, a 0-dimensional duck array is *NOT*
+    a scalar (`is_scalar(np.array(1)) == False`), but it is not list-like either.
+    """
+
+    def __init__(self, values) -> None:
+        self._values = values
+
+    def __iter__(self) -> Iterator:
+        iter_values = iter(self._values)
+
+        def it_outer():
+            yield from iter_values
+
+        return it_outer()
+
+    def __len__(self) -> int:
+        return len(self._values)
+
+    def __array__(self, dtype=None, copy=None):
+        return np.asarray(self._values, dtype=dtype)
+
+    @property
+    def ndim(self):
+        return self._values.ndim
+
+    @property
+    def dtype(self):
+        return self._values.dtype
+
+    @property
+    def size(self):
+        return self._values.size
+
+    @property
+    def shape(self):
+        return self._values.shape
+
+
+# collect all objects to be tested for list-like-ness; use tuples of objects,
+# whether they are list-like or not (special casing for sets), and their ID
+ll_params = [
+    ([1], True, "list"),
+    ([], True, "list-empty"),
+    ((1,), True, "tuple"),
+    ((), True, "tuple-empty"),
+    ({"a": 1}, True, "dict"),
+    ({}, True, "dict-empty"),
+    ({"a", 1}, "set", "set"),
+    (set(), "set", "set-empty"),
+    (frozenset({"a", 1}), "set", "frozenset"),
+    (frozenset(), "set", "frozenset-empty"),
+    (iter([1, 2]), True, "iterator"),
+    (iter([]), True, "iterator-empty"),
+    ((x for x in [1, 2]), True, "generator"),
+    ((_ for _ in []), True, "generator-empty"),
+    (Series([1]), True, "Series"),
+    (Series([], dtype=object), True, "Series-empty"),
+    # Series.str will still raise a TypeError if iterated
+    (Series(["a"]).str, True, "StringMethods"),
+    (Series([], dtype="O").str, True, "StringMethods-empty"),
+    (Index([1]), True, "Index"),
+    (Index([]), True, "Index-empty"),
+    (DataFrame([[1]]), True, "DataFrame"),
+    (DataFrame(), True, "DataFrame-empty"),
+    (np.ndarray((2,) * 1), True, "ndarray-1d"),
+    (np.array([]), True, "ndarray-1d-empty"),
+    (np.ndarray((2,) * 2), True, "ndarray-2d"),
+    (np.array([[]]), True, "ndarray-2d-empty"),
+    (np.ndarray((2,) * 3), True, "ndarray-3d"),
+    (np.array([[[]]]), True, "ndarray-3d-empty"),
+    (np.ndarray((2,) * 4), True, "ndarray-4d"),
+    (np.array([[[[]]]]), True, "ndarray-4d-empty"),
+    (np.array(2), False, "ndarray-0d"),
+    (MockNumpyLikeArray(np.ndarray((2,) * 1)), True, "duck-ndarray-1d"),
+    (MockNumpyLikeArray(np.array([])), True, "duck-ndarray-1d-empty"),
+    (MockNumpyLikeArray(np.ndarray((2,) * 2)), True, "duck-ndarray-2d"),
+    (MockNumpyLikeArray(np.array([[]])), True, "duck-ndarray-2d-empty"),
+    (MockNumpyLikeArray(np.ndarray((2,) * 3)), True, "duck-ndarray-3d"),
+    (MockNumpyLikeArray(np.array([[[]]])), True, "duck-ndarray-3d-empty"),
+    (MockNumpyLikeArray(np.ndarray((2,) * 4)), True, "duck-ndarray-4d"),
+    (MockNumpyLikeArray(np.array([[[[]]]])), True, "duck-ndarray-4d-empty"),
+    (MockNumpyLikeArray(np.array(2)), False, "duck-ndarray-0d"),
+    (1, False, "int"),
+    (b"123", False, "bytes"),
+    (b"", False, "bytes-empty"),
+    ("123", False, "string"),
+    ("", False, "string-empty"),
+    (str, False, "string-type"),
+    (object(), False, "object"),
+    (np.nan, False, "NaN"),
+    (None, False, "None"),
+]
+objs, expected, ids = zip(*ll_params, strict=True)
+
+
+@pytest.fixture(params=zip(objs, expected, strict=True), ids=ids)
+def maybe_list_like(request):
+    return request.param
+
+
+def test_is_list_like(maybe_list_like):
+    obj, expected = maybe_list_like
+    expected = True if expected == "set" else expected
+    assert inference.is_list_like(obj) == expected
+
+
+def test_is_list_like_disallow_sets(maybe_list_like):
+    obj, expected = maybe_list_like
+    expected = False if expected == "set" else expected
+    assert inference.is_list_like(obj, allow_sets=False) == expected
+
+
+def test_is_list_like_recursion():
+    # GH 33721
+    # interpreter would crash with SIGABRT
+    def list_like():
+        inference.is_list_like([])
+        list_like()
+
+    rec_limit = sys.getrecursionlimit()
+    try:
+        # Limit to avoid stack overflow on Windows CI
+        sys.setrecursionlimit(100)
+        with tm.external_error_raised(RecursionError):
+            list_like()
+    finally:
+        sys.setrecursionlimit(rec_limit)
+
+
+def test_is_list_like_iter_is_none():
+    # GH 43373
+    # is_list_like was yielding false positives with __iter__ == None
+    class NotListLike:
+        def __getitem__(self, item):
+            return self
+
+        __iter__ = None
+
+    assert not inference.is_list_like(NotListLike())
+
+
+def test_is_list_like_generic():
+    # GH 49649
+    # is_list_like was yielding false positives for Generic classes in python 3.11
+    T = TypeVar("T")
+
+    class MyDataFrame(DataFrame, Generic[T]): ...
+
+    tstc = MyDataFrame[int]
+    tst = MyDataFrame[int]({"x": [1, 2, 3]})
+
+    assert not inference.is_list_like(tstc)
+    assert isinstance(tst, DataFrame)
+    assert inference.is_list_like(tst)
+
+
+def test_is_list_like_native_container_types():
+    # GH 61565
+    # is_list_like was yielding false positives for native container types
+    assert not inference.is_list_like(list[int])
+    assert not inference.is_list_like(list[str])
+    assert not inference.is_list_like(tuple[int])
+    assert not inference.is_list_like(tuple[str])
+
+
+def test_is_sequence():
+    is_seq = inference.is_sequence
+    assert is_seq((1, 2))
+    assert is_seq([1, 2])
+    assert not is_seq("abcd")
+    assert not is_seq(np.int64)
+
+    class A:
+        def __getitem__(self, item):
+            return 1
+
+    assert not is_seq(A())
+
+
+def test_is_array_like():
+    assert inference.is_array_like(Series([], dtype=object))
+    assert inference.is_array_like(Series([1, 2]))
+    assert inference.is_array_like(np.array(["a", "b"]))
+    assert inference.is_array_like(Index(["2016-01-01"]))
+    assert inference.is_array_like(np.array([2, 3]))
+    assert inference.is_array_like(MockNumpyLikeArray(np.array([2, 3])))
+
+    class DtypeList(list):
+        dtype = "special"
+
+    assert inference.is_array_like(DtypeList())
+
+    assert not inference.is_array_like([1, 2, 3])
+    assert not inference.is_array_like(())
+    assert not inference.is_array_like("foo")
+    assert not inference.is_array_like(123)
+
+
+@pytest.mark.parametrize(
+    "inner",
+    [
+        [],
+        [1],
+        (1,),
+        (1, 2),
+        {"a": 1},
+        {1, "a"},
+        Series([1]),
+        Series([], dtype=object),
+        Series(["a"]).str,
+        (x for x in range(5)),
+    ],
+)
+@pytest.mark.parametrize("outer", [list, Series, np.array, tuple])
+def test_is_nested_list_like_passes(inner, outer):
+    result = outer([inner for _ in range(5)])
+    assert inference.is_list_like(result)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        "abc",
+        [],
+        [1],
+        (1,),
+        ["a"],
+        "a",
+        {"a"},
+        [1, 2, 3],
+        Series([1]),
+        DataFrame({"A": [1]}),
+        ([1, 2] for _ in range(5)),
+    ],
+)
+def test_is_nested_list_like_fails(obj):
+    assert not inference.is_nested_list_like(obj)
+
+
+@pytest.mark.parametrize("ll", [{}, {"A": 1}, Series([1]), collections.defaultdict()])
+def test_is_dict_like_passes(ll):
+    assert inference.is_dict_like(ll)
+
+
+@pytest.mark.parametrize(
+    "ll",
+    [
+        "1",
+        1,
+        [1, 2],
+        (1, 2),
+        range(2),
+        Index([1]),
+        dict,
+        collections.defaultdict,
+        Series,
+    ],
+)
+def test_is_dict_like_fails(ll):
+    assert not inference.is_dict_like(ll)
+
+
+@pytest.mark.parametrize("has_keys", [True, False])
+@pytest.mark.parametrize("has_getitem", [True, False])
+@pytest.mark.parametrize("has_contains", [True, False])
+def test_is_dict_like_duck_type(has_keys, has_getitem, has_contains):
+    class DictLike:
+        def __init__(self, d) -> None:
+            self.d = d
+
+        if has_keys:
+
+            def keys(self):
+                return self.d.keys()
+
+        if has_getitem:
+
+            def __getitem__(self, key):
+                return self.d.__getitem__(key)
+
+        if has_contains:
+
+            def __contains__(self, key) -> bool:
+                return self.d.__contains__(key)
+
+    d = DictLike({1: 2})
+    result = inference.is_dict_like(d)
+    expected = has_keys and has_getitem and has_contains
+
+    assert result is expected
+
+
+def test_is_file_like():
+    class MockFile:
+        pass
+
+    is_file = inference.is_file_like
+
+    data = StringIO("data")
+    assert is_file(data)
+
+    # No read / write attributes
+    # No iterator attributes
+    m = MockFile()
+    assert not is_file(m)
+
+    MockFile.write = lambda self: 0
+
+    # Write attribute but not an iterator
+    m = MockFile()
+    assert not is_file(m)
+
+    # gh-16530: Valid iterator just means we have the
+    # __iter__ attribute for our purposes.
+    MockFile.__iter__ = lambda self: self
+
+    # Valid write-only file
+    m = MockFile()
+    assert is_file(m)
+
+    del MockFile.write
+    MockFile.read = lambda self: 0
+
+    # Valid read-only file
+    m = MockFile()
+    assert is_file(m)
+
+    # Iterator but no read / write attributes
+    data = [1, 2, 3]
+    assert not is_file(data)
+
+
+test_tuple = collections.namedtuple("test_tuple", ["a", "b", "c"])
+
+
+@pytest.mark.parametrize("ll", [test_tuple(1, 2, 3)])
+def test_is_names_tuple_passes(ll):
+    assert inference.is_named_tuple(ll)
+
+
+@pytest.mark.parametrize("ll", [(1, 2, 3), "a", Series({"pi": 3.14})])
+def test_is_names_tuple_fails(ll):
+    assert not inference.is_named_tuple(ll)
+
+
+def test_is_hashable():
+    # all new-style classes are hashable by default
+    class HashableClass:
+        pass
+
+    class UnhashableClass1:
+        __hash__ = None
+
+    class UnhashableClass2:
+        def __hash__(self):
+            raise TypeError("Not hashable")
+
+    # Temporary helper for Python 3.11 compatibility.
+    # This can be removed once support for Python 3.11 is dropped.
+    class HashableSlice:
+        def __init__(self, start, stop, step=None):
+            self.slice = slice(start, stop, step)
+
+        def __eq__(self, other):
+            return isinstance(other, HashableSlice) and self.slice == other.slice
+
+        def __hash__(self):
+            return hash((self.slice.start, self.slice.stop, self.slice.step))
+
+        def __repr__(self):
+            return (
+                f"HashableSlice({self.slice.start}, {self.slice.stop}, "
+                f"{self.slice.step})"
+            )
+
+    hashable = (1, 3.14, np.float64(3.14), "a", (), (1,), HashableClass())
+    not_hashable = ([], UnhashableClass1())
+    abc_hashable_not_really_hashable = (([],), UnhashableClass2())
+    hashable_slice = HashableSlice(1, 2)
+    tuple_with_slice = (slice(1, 2), 3)
+
+    for i in hashable:
+        assert inference.is_hashable(i)
+        assert inference.is_hashable(i, allow_slice=True)
+        assert inference.is_hashable(i, allow_slice=False)
+    for i in not_hashable:
+        assert not inference.is_hashable(i)
+        assert not inference.is_hashable(i, allow_slice=True)
+        assert not inference.is_hashable(i, allow_slice=False)
+    for i in abc_hashable_not_really_hashable:
+        assert not inference.is_hashable(i)
+        assert not inference.is_hashable(i, allow_slice=True)
+        assert not inference.is_hashable(i, allow_slice=False)
+
+    assert inference.is_hashable(hashable_slice)
+    assert inference.is_hashable(hashable_slice, allow_slice=True)
+    assert inference.is_hashable(hashable_slice, allow_slice=False)
+
+    if PY312:
+        for obj in [slice(1, 2), tuple_with_slice]:
+            assert inference.is_hashable(obj)
+            assert inference.is_hashable(obj, allow_slice=True)
+            assert not inference.is_hashable(obj, allow_slice=False)
+    else:
+        for obj in [slice(1, 2), tuple_with_slice]:
+            assert not inference.is_hashable(obj)
+            assert not inference.is_hashable(obj, allow_slice=True)
+            assert not inference.is_hashable(obj, allow_slice=False)
+
+    # numpy.array is no longer collections.abc.Hashable as of
+    # https://github.com/numpy/numpy/pull/5326, just test
+    # is_hashable()
+    assert not inference.is_hashable(np.array([]))
+
+
+@pytest.mark.parametrize("ll", [re.compile("ad")])
+def test_is_re_passes(ll):
+    assert inference.is_re(ll)
+
+
+@pytest.mark.parametrize("ll", ["x", 2, 3, object()])
+def test_is_re_fails(ll):
+    assert not inference.is_re(ll)
+
+
+@pytest.mark.parametrize(
+    "ll", [r"a", "x", r"asdf", re.compile("adsf"), r"\u2233\s*", re.compile(r"")]
+)
+def test_is_recompilable_passes(ll):
+    assert inference.is_re_compilable(ll)
+
+
+@pytest.mark.parametrize("ll", [1, [], object()])
+def test_is_recompilable_fails(ll):
+    assert not inference.is_re_compilable(ll)
+
+
+class TestInference:
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            np.array(list("abc"), dtype="S1"),
+            np.array(list("abc"), dtype="S1").astype(object),
+            [b"a", np.nan, b"c"],
+        ],
+    )
+    def test_infer_dtype_bytes(self, arr):
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "bytes"
+
+    @pytest.mark.parametrize(
+        "value, expected",
+        [
+            (float("inf"), True),
+            (np.inf, True),
+            (-np.inf, False),
+            (1, False),
+            ("a", False),
+        ],
+    )
+    def test_isposinf_scalar(self, value, expected):
+        # GH 11352
+        result = libmissing.isposinf_scalar(value)
+        assert result is expected
+
+    @pytest.mark.parametrize(
+        "value, expected",
+        [
+            (float("-inf"), True),
+            (-np.inf, True),
+            (np.inf, False),
+            (1, False),
+            ("a", False),
+        ],
+    )
+    def test_isneginf_scalar(self, value, expected):
+        result = libmissing.isneginf_scalar(value)
+        assert result is expected
+
+    @pytest.mark.parametrize(
+        "convert_to_masked_nullable, exp",
+        [
+            (
+                True,
+                BooleanArray(
+                    np.array([True, False], dtype="bool"), np.array([False, True])
+                ),
+            ),
+            (False, np.array([True, np.nan], dtype="object")),
+        ],
+    )
+    def test_maybe_convert_nullable_boolean(self, convert_to_masked_nullable, exp):
+        # GH 40687
+        arr = np.array([True, np.nan], dtype=object)
+        result = libops.maybe_convert_bool(
+            arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
+        )
+        if convert_to_masked_nullable:
+            tm.assert_extension_array_equal(BooleanArray(*result), exp)
+        else:
+            result = result[0]
+            tm.assert_numpy_array_equal(result, exp)
+
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    @pytest.mark.parametrize("coerce_numeric", [True, False])
+    @pytest.mark.parametrize(
+        "infinity", ["inf", "inF", "iNf", "Inf", "iNF", "InF", "INf", "INF"]
+    )
+    @pytest.mark.parametrize("prefix", ["", "-", "+"])
+    def test_maybe_convert_numeric_infinities(
+        self, coerce_numeric, infinity, prefix, convert_to_masked_nullable
+    ):
+        # see gh-13274
+        result, _ = lib.maybe_convert_numeric(
+            np.array([prefix + infinity], dtype=object),
+            na_values={"", "NULL", "nan"},
+            coerce_numeric=coerce_numeric,
+            convert_to_masked_nullable=convert_to_masked_nullable,
+        )
+        expected = np.array([np.inf if prefix in ["", "+"] else -np.inf])
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_maybe_convert_numeric_infinities_raises(self, convert_to_masked_nullable):
+        msg = "Unable to parse string"
+        with pytest.raises(ValueError, match=msg):
+            lib.maybe_convert_numeric(
+                np.array(["foo_inf"], dtype=object),
+                na_values={"", "NULL", "nan"},
+                coerce_numeric=False,
+                convert_to_masked_nullable=convert_to_masked_nullable,
+            )
+
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_maybe_convert_numeric_post_floatify_nan(
+        self, coerce, convert_to_masked_nullable
+    ):
+        # see gh-13314
+        data = np.array(["1.200", "-999.000", "4.500"], dtype=object)
+        expected = np.array([1.2, np.nan, 4.5], dtype=np.float64)
+        nan_values = {-999, -999.0}
+
+        out = lib.maybe_convert_numeric(
+            data,
+            nan_values,
+            coerce,
+            convert_to_masked_nullable=convert_to_masked_nullable,
+        )
+        if convert_to_masked_nullable:
+            expected = FloatingArray(expected, np.isnan(expected))
+            tm.assert_extension_array_equal(expected, FloatingArray(*out))
+        else:
+            out = out[0]
+            tm.assert_numpy_array_equal(out, expected)
+
+    def test_convert_infs(self):
+        arr = np.array(["inf", "inf", "inf"], dtype="O")
+        result, _ = lib.maybe_convert_numeric(arr, set(), False)
+        assert result.dtype == np.float64
+
+        arr = np.array(["-inf", "-inf", "-inf"], dtype="O")
+        result, _ = lib.maybe_convert_numeric(arr, set(), False)
+        assert result.dtype == np.float64
+
+    def test_scientific_no_exponent(self):
+        # See PR 12215
+        arr = np.array(["42E", "2E", "99e", "6e"], dtype="O")
+        result, _ = lib.maybe_convert_numeric(arr, set(), False, True)
+        assert np.all(np.isnan(result))
+
+    def test_convert_non_hashable(self):
+        # GH13324
+        # make sure that we are handing non-hashables
+        arr = np.array([[10.0, 2], 1.0, "apple"], dtype=object)
+        result, _ = lib.maybe_convert_numeric(arr, set(), False, True)
+        tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan]))
+
+    def test_convert_numeric_uint64(self):
+        arr = np.array([2**63], dtype=object)
+        exp = np.array([2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp)
+
+        arr = np.array([str(2**63)], dtype=object)
+        exp = np.array([2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp)
+
+        arr = np.array([np.uint64(2**63)], dtype=object)
+        exp = np.array([2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(lib.maybe_convert_numeric(arr, set())[0], exp)
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            np.array([2**63, np.nan], dtype=object),
+            np.array([str(2**63), np.nan], dtype=object),
+            np.array([np.nan, 2**63], dtype=object),
+            np.array([np.nan, str(2**63)], dtype=object),
+        ],
+    )
+    def test_convert_numeric_uint64_nan(self, coerce, arr):
+        expected = arr.astype(float) if coerce else arr.copy()
+        result, _ = lib.maybe_convert_numeric(arr, set(), coerce_numeric=coerce)
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_convert_numeric_uint64_nan_values(
+        self, coerce, convert_to_masked_nullable
+    ):
+        arr = np.array([2**63, 2**63 + 1], dtype=object)
+        na_values = {2**63}
+
+        expected = np.array([np.nan, 2**63 + 1], dtype=float) if coerce else arr.copy()
+        result = lib.maybe_convert_numeric(
+            arr,
+            na_values,
+            coerce_numeric=coerce,
+            convert_to_masked_nullable=convert_to_masked_nullable,
+        )
+        if convert_to_masked_nullable and coerce:
+            expected = IntegerArray(
+                np.array([0, 2**63 + 1], dtype="u8"),
+                np.array([True, False], dtype="bool"),
+            )
+            result = IntegerArray(*result)
+        else:
+            result = result[0]  # discard mask
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "case",
+        [
+            np.array([2**63, -1], dtype=object),
+            np.array([str(2**63), -1], dtype=object),
+            np.array([str(2**63), str(-1)], dtype=object),
+            np.array([-1, 2**63], dtype=object),
+            np.array([-1, str(2**63)], dtype=object),
+            np.array([str(-1), str(2**63)], dtype=object),
+        ],
+    )
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_convert_numeric_int64_uint64(
+        self, case, coerce, convert_to_masked_nullable
+    ):
+        expected = case.astype(float) if coerce else case.copy()
+        result, _ = lib.maybe_convert_numeric(
+            case,
+            set(),
+            coerce_numeric=coerce,
+            convert_to_masked_nullable=convert_to_masked_nullable,
+        )
+
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("convert_to_masked_nullable", [True, False])
+    def test_convert_numeric_string_uint64(self, convert_to_masked_nullable):
+        # GH32394
+        result = lib.maybe_convert_numeric(
+            np.array(["uint64"], dtype=object),
+            set(),
+            coerce_numeric=True,
+            convert_to_masked_nullable=convert_to_masked_nullable,
+        )
+        if convert_to_masked_nullable:
+            result = FloatingArray(*result)
+        else:
+            result = result[0]
+        assert np.isnan(result)
+
+    @pytest.mark.parametrize("value", [-(2**63) - 1, 2**64])
+    def test_convert_int_overflow(self, value):
+        # see gh-18584
+        arr = np.array([value], dtype=object)
+        result = lib.maybe_convert_objects(arr)
+        tm.assert_numpy_array_equal(arr, result)
+
+    @pytest.mark.parametrize(
+        "value, expected_value",
+        [
+            (-(1 << 65), -(1 << 65)),
+            (1 << 65, 1 << 65),
+            (str(1 << 65), 1 << 65),
+            (f"-{1 << 65}", -(1 << 65)),
+        ],
+    )
+    @pytest.mark.parametrize("coerce_numeric", [False, True])
+    def test_convert_numeric_overflow(self, value, expected_value, coerce_numeric):
+        arr = np.array([value], dtype=object)
+        expected = np.array([expected_value], dtype=float if coerce_numeric else object)
+        result, _ = lib.maybe_convert_numeric(
+            arr,
+            set(),
+            coerce_numeric=coerce_numeric,
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("val", [None, np.nan, float("nan")])
+    @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
+    def test_maybe_convert_objects_nat_inference(self, val, dtype):
+        dtype = np.dtype(dtype)
+        vals = np.array([pd.NaT, val], dtype=object)
+        result = lib.maybe_convert_objects(
+            vals,
+            convert_non_numeric=True,
+            dtype_if_all_nat=dtype,
+        )
+        assert result.dtype == dtype
+        assert np.isnat(result).all()
+
+        result = lib.maybe_convert_objects(
+            vals[::-1],
+            convert_non_numeric=True,
+            dtype_if_all_nat=dtype,
+        )
+        assert result.dtype == dtype
+        assert np.isnat(result).all()
+
+    @pytest.mark.parametrize(
+        "value, expected_dtype",
+        [
+            # see gh-4471
+            ([2**63], np.uint64),
+            # NumPy bug: can't compare uint64 to int64, as that
+            # results in both casting to float64, so we should
+            # make sure that this function is robust against it
+            ([np.uint64(2**63)], np.uint64),
+            ([2, -1], np.int64),
+            ([2**63, -1], object),
+            # GH#47294
+            ([np.uint8(1)], np.uint8),
+            ([np.uint16(1)], np.uint16),
+            ([np.uint32(1)], np.uint32),
+            ([np.uint64(1)], np.uint64),
+            ([np.uint8(2), np.uint16(1)], np.uint16),
+            ([np.uint32(2), np.uint16(1)], np.uint32),
+            ([np.uint32(2), -1], object),
+            ([np.uint32(2), 1], np.uint64),
+            ([np.uint32(2), np.int32(1)], object),
+        ],
+    )
+    def test_maybe_convert_objects_uint(self, value, expected_dtype):
+        arr = np.array(value, dtype=object)
+        exp = np.array(value, dtype=expected_dtype)
+        tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp)
+
+    def test_maybe_convert_objects_datetime(self):
+        # GH27438
+        arr = np.array(
+            [np.datetime64("2000-01-01"), np.timedelta64(1, "s")], dtype=object
+        )
+        exp = arr.copy()
+        out = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+        tm.assert_numpy_array_equal(out, exp)
+
+        arr = np.array([pd.NaT, np.timedelta64(1, "s")], dtype=object)
+        exp = np.array([np.timedelta64("NaT"), np.timedelta64(1, "s")], dtype="m8[s]")
+        out = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+        tm.assert_numpy_array_equal(out, exp)
+
+        # with convert_non_numeric=True, the nan is a valid NA value for td64
+        arr = np.array([np.timedelta64(1, "s"), np.nan], dtype=object)
+        exp = exp[::-1]
+        out = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+        tm.assert_numpy_array_equal(out, exp)
+
+    def test_maybe_convert_objects_dtype_if_all_nat(self):
+        arr = np.array([pd.NaT, pd.NaT], dtype=object)
+        out = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+        # no dtype_if_all_nat passed -> we dont guess
+        tm.assert_numpy_array_equal(out, arr)
+
+        out = lib.maybe_convert_objects(
+            arr,
+            convert_non_numeric=True,
+            dtype_if_all_nat=np.dtype("timedelta64[ns]"),
+        )
+        exp = np.array(["NaT", "NaT"], dtype="timedelta64[ns]")
+        tm.assert_numpy_array_equal(out, exp)
+
+        out = lib.maybe_convert_objects(
+            arr,
+            convert_non_numeric=True,
+            dtype_if_all_nat=np.dtype("datetime64[ns]"),
+        )
+        exp = np.array(["NaT", "NaT"], dtype="datetime64[ns]")
+        tm.assert_numpy_array_equal(out, exp)
+
+    def test_maybe_convert_objects_dtype_if_all_nat_invalid(self):
+        # we accept datetime64[ns], timedelta64[ns], and EADtype
+        arr = np.array([pd.NaT, pd.NaT], dtype=object)
+
+        with pytest.raises(ValueError, match="int64"):
+            lib.maybe_convert_objects(
+                arr,
+                convert_non_numeric=True,
+                dtype_if_all_nat=np.dtype("int64"),
+            )
+
+    @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
+    def test_maybe_convert_objects_datetime_overflow_safe(self, dtype):
+        stamp = datetime(2363, 10, 4)  # Enterprise-D launch date
+        if dtype == "timedelta64[ns]":
+            stamp = stamp - datetime(1970, 1, 1)
+        arr = np.array([stamp], dtype=object)
+
+        out = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+        # no OutOfBoundsDatetime/OutOfBoundsTimedeltas
+        if dtype == "datetime64[ns]":
+            expected = np.array(["2363-10-04"], dtype="M8[us]")
+        else:
+            expected = arr.astype("m8[us]")
+        tm.assert_numpy_array_equal(out, expected)
+
+    def test_maybe_convert_objects_mixed_datetimes(self):
+        ts = Timestamp("now")
+        vals = [ts, ts.to_pydatetime(), ts.to_datetime64(), pd.NaT, np.nan, None]
+
+        for data in itertools.permutations(vals):
+            data = np.array(list(data), dtype=object)
+            expected = DatetimeIndex(data)._data._ndarray
+            result = lib.maybe_convert_objects(data, convert_non_numeric=True)
+            tm.assert_numpy_array_equal(result, expected)
+
+    def test_maybe_convert_objects_timedelta64_nat(self):
+        obj = np.timedelta64("NaT", "ns")
+        arr = np.array([obj], dtype=object)
+        assert arr[0] is obj
+
+        result = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+
+        expected = np.array([obj], dtype="m8[ns]")
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "exp",
+        [
+            IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True])),
+            IntegerArray(np.array([2, 0], dtype="int64"), np.array([False, True])),
+        ],
+    )
+    def test_maybe_convert_objects_nullable_integer(self, exp):
+        # GH27335
+        arr = np.array([2, np.nan], dtype=object)
+        result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
+
+        tm.assert_extension_array_equal(result, exp)
+
+    @pytest.mark.parametrize(
+        "dtype, val", [("int64", 1), ("uint64", np.iinfo(np.int64).max + 1)]
+    )
+    def test_maybe_convert_objects_nullable_none(self, dtype, val):
+        # GH#50043
+        arr = np.array([val, None, 3], dtype="object")
+        result = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
+        expected = IntegerArray(
+            np.array([val, 0, 3], dtype=dtype), np.array([False, True, False])
+        )
+        tm.assert_extension_array_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "convert_to_masked_nullable, exp",
+        [
+            (True, IntegerArray(np.array([2, 0], dtype="i8"), np.array([False, True]))),
+            (False, np.array([2, np.nan], dtype="float64")),
+        ],
+    )
+    def test_maybe_convert_numeric_nullable_integer(
+        self, convert_to_masked_nullable, exp
+    ):
+        # GH 40687
+        arr = np.array([2, np.nan], dtype=object)
+        result = lib.maybe_convert_numeric(
+            arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
+        )
+        if convert_to_masked_nullable:
+            result = IntegerArray(*result)
+            tm.assert_extension_array_equal(result, exp)
+        else:
+            result = result[0]
+            tm.assert_numpy_array_equal(result, exp)
+
+    @pytest.mark.parametrize(
+        "convert_to_masked_nullable, exp",
+        [
+            (
+                True,
+                FloatingArray(
+                    np.array([2.0, 0.0], dtype="float64"), np.array([False, True])
+                ),
+            ),
+            (False, np.array([2.0, np.nan], dtype="float64")),
+        ],
+    )
+    def test_maybe_convert_numeric_floating_array(
+        self, convert_to_masked_nullable, exp
+    ):
+        # GH 40687
+        arr = np.array([2.0, np.nan], dtype=object)
+        result = lib.maybe_convert_numeric(
+            arr, set(), convert_to_masked_nullable=convert_to_masked_nullable
+        )
+        if convert_to_masked_nullable:
+            tm.assert_extension_array_equal(FloatingArray(*result), exp)
+        else:
+            result = result[0]
+            tm.assert_numpy_array_equal(result, exp)
+
+    def test_maybe_convert_objects_bool_nan(self):
+        # GH32146
+        ind = Index([True, False, np.nan], dtype=object)
+        exp = np.array([True, False, np.nan], dtype=object)
+        out = lib.maybe_convert_objects(ind.values, safe=1)
+        tm.assert_numpy_array_equal(out, exp)
+
+    def test_maybe_convert_objects_nullable_boolean(self):
+        # GH50047
+        arr = np.array([True, False], dtype=object)
+        exp = BooleanArray._from_sequence([True, False], dtype="boolean")
+        out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
+        tm.assert_extension_array_equal(out, exp)
+
+        arr = np.array([True, False, pd.NaT], dtype=object)
+        exp = np.array([True, False, pd.NaT], dtype=object)
+        out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
+        tm.assert_numpy_array_equal(out, exp)
+
+    @pytest.mark.parametrize("val", [None, np.nan])
+    def test_maybe_convert_objects_nullable_boolean_na(self, val):
+        # GH50047
+        arr = np.array([True, False, val], dtype=object)
+        exp = BooleanArray(
+            np.array([True, False, False]), np.array([False, False, True])
+        )
+        out = lib.maybe_convert_objects(arr, convert_to_nullable_dtype=True)
+        tm.assert_extension_array_equal(out, exp)
+
+    @pytest.mark.parametrize(
+        "data0",
+        [
+            True,
+            1,
+            1.0,
+            1.0 + 1.0j,
+            np.int8(1),
+            np.int16(1),
+            np.int32(1),
+            np.int64(1),
+            np.float16(1),
+            np.float32(1),
+            np.float64(1),
+            np.complex64(1),
+            np.complex128(1),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "data1",
+        [
+            True,
+            1,
+            1.0,
+            1.0 + 1.0j,
+            np.int8(1),
+            np.int16(1),
+            np.int32(1),
+            np.int64(1),
+            np.float16(1),
+            np.float32(1),
+            np.float64(1),
+            np.complex64(1),
+            np.complex128(1),
+        ],
+    )
+    def test_maybe_convert_objects_itemsize(self, data0, data1):
+        # GH 40908
+        data = [data0, data1]
+        arr = np.array(data, dtype="object")
+
+        common_kind = np.result_type(type(data0), type(data1)).kind
+        kind0 = "python" if not hasattr(data0, "dtype") else data0.dtype.kind
+        kind1 = "python" if not hasattr(data1, "dtype") else data1.dtype.kind
+        if kind0 != "python" and kind1 != "python":
+            kind = common_kind
+            itemsize = max(data0.dtype.itemsize, data1.dtype.itemsize)
+        elif is_bool(data0) or is_bool(data1):
+            kind = "bool" if (is_bool(data0) and is_bool(data1)) else "object"
+            itemsize = ""
+        elif is_complex(data0) or is_complex(data1):
+            kind = common_kind
+            itemsize = 16
+        else:
+            kind = common_kind
+            itemsize = 8
+
+        expected = np.array(data, dtype=f"{kind}{itemsize}")
+        result = lib.maybe_convert_objects(arr)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_mixed_dtypes_remain_object_array(self):
+        # GH14956
+        arr = np.array([datetime(2015, 1, 1, tzinfo=timezone.utc), 1], dtype=object)
+        result = lib.maybe_convert_objects(arr, convert_non_numeric=True)
+        tm.assert_numpy_array_equal(result, arr)
+
+    @pytest.mark.parametrize(
+        "idx",
+        [
+            pd.IntervalIndex.from_breaks(range(5), closed="both"),
+            pd.period_range("2016-01-01", periods=3, freq="D"),
+        ],
+    )
+    def test_maybe_convert_objects_ea(self, idx):
+        result = lib.maybe_convert_objects(
+            np.array(idx, dtype=object),
+            convert_non_numeric=True,
+        )
+        tm.assert_extension_array_equal(result, idx._data)
+
+
+class TestTypeInference:
+    # Dummy class used for testing with Python objects
+    class Dummy:
+        pass
+
+    def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
+        # see pandas/conftest.py
+        inferred_dtype, values = any_skipna_inferred_dtype
+
+        # make sure the inferred dtype of the fixture is as requested
+        assert inferred_dtype == lib.infer_dtype(values, skipna=True)
+
+    def test_length_zero(self, skipna):
+        result = lib.infer_dtype(np.array([], dtype="i4"), skipna=skipna)
+        assert result == "integer"
+
+        result = lib.infer_dtype([], skipna=skipna)
+        assert result == "empty"
+
+        # GH 18004
+        arr = np.array([np.array([], dtype=object), np.array([], dtype=object)])
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "empty"
+
+    def test_integers(self):
+        arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "integer"
+
+        arr = np.array([1, 2, 3, np.int64(4), np.int32(5), "foo"], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "mixed-integer"
+
+        arr = np.array([1, 2, 3, 4, 5], dtype="i4")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "integer"
+
+    @pytest.mark.parametrize(
+        "arr, skipna",
+        [
+            ([1, 2, np.nan, np.nan, 3], False),
+            ([1, 2, np.nan, np.nan, 3], True),
+            ([1, 2, 3, np.int64(4), np.int32(5), np.nan], False),
+            ([1, 2, 3, np.int64(4), np.int32(5), np.nan], True),
+        ],
+    )
+    def test_integer_na(self, arr, skipna):
+        # GH 27392
+        result = lib.infer_dtype(np.array(arr, dtype="O"), skipna=skipna)
+        expected = "integer" if skipna else "integer-na"
+        assert result == expected
+
+    def test_infer_dtype_skipna_default(self):
+        # infer_dtype `skipna` default deprecated in GH#24050,
+        #  changed to True in GH#29876
+        arr = np.array([1, 2, 3, np.nan], dtype=object)
+
+        result = lib.infer_dtype(arr)
+        assert result == "integer"
+
+    def test_bools(self):
+        arr = np.array([True, False, True, True, True], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "boolean"
+
+        arr = np.array([np.bool_(True), np.bool_(False)], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "boolean"
+
+        arr = np.array([True, False, True, "foo"], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "mixed"
+
+        arr = np.array([True, False, True], dtype=bool)
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "boolean"
+
+        arr = np.array([True, np.nan, False], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "boolean"
+
+        result = lib.infer_dtype(arr, skipna=False)
+        assert result == "mixed"
+
+    def test_floats(self):
+        arr = np.array([1.0, 2.0, 3.0, np.float64(4), np.float32(5)], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "floating"
+
+        arr = np.array([1, 2, 3, np.float64(4), np.float32(5), "foo"], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "mixed-integer"
+
+        arr = np.array([1, 2, 3, 4, 5], dtype="f4")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "floating"
+
+        arr = np.array([1, 2, 3, 4, 5], dtype="f8")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "floating"
+
+    def test_decimals(self):
+        # GH15690
+        arr = np.array([Decimal(1), Decimal(2), Decimal(3)])
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "decimal"
+
+        arr = np.array([1.0, 2.0, Decimal(3)])
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "mixed"
+
+        result = lib.infer_dtype(arr[::-1], skipna=True)
+        assert result == "mixed"
+
+        arr = np.array([Decimal(1), Decimal("NaN"), Decimal(3)])
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "decimal"
+
+        arr = np.array([Decimal(1), np.nan, Decimal(3)], dtype="O")
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "decimal"
+
+    # complex is compatible with nan, so skipna has no effect
+    def test_complex(self, skipna):
+        # gets cast to complex on array construction
+        arr = np.array([1.0, 2.0, 1 + 1j])
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "complex"
+
+        arr = np.array([1.0, 2.0, 1 + 1j], dtype="O")
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "mixed"
+
+        result = lib.infer_dtype(arr[::-1], skipna=skipna)
+        assert result == "mixed"
+
+        # gets cast to complex on array construction
+        arr = np.array([1, np.nan, 1 + 1j])
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "complex"
+
+        arr = np.array([1.0, np.nan, 1 + 1j], dtype="O")
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "mixed"
+
+        # complex with nans stays complex
+        arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype="O")
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "complex"
+
+        # test smaller complex dtype; will pass through _try_infer_map fastpath
+        arr = np.array([1 + 1j, np.nan, 3 + 3j], dtype=np.complex64)
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == "complex"
+
+    def test_string(self):
+        pass
+
+    def test_unicode(self):
+        arr = ["a", np.nan, "c"]
+        result = lib.infer_dtype(arr, skipna=False)
+        # This currently returns "mixed", but it's not clear that's optimal.
+        # This could also return "string" or "mixed-string"
+        assert result == "mixed"
+
+        # even though we use skipna, we are only skipping those NAs that are
+        #  considered matching by is_string_array
+        arr = ["a", np.nan, "c"]
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "string"
+
+        arr = ["a", pd.NA, "c"]
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "string"
+
+        arr = ["a", pd.NaT, "c"]
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "mixed"
+
+        arr = ["a", "c"]
+        result = lib.infer_dtype(arr, skipna=False)
+        assert result == "string"
+
+    @pytest.mark.parametrize(
+        "dtype, missing, skipna, expected",
+        [
+            (float, np.nan, False, "floating"),
+            (float, np.nan, True, "floating"),
+            (object, np.nan, False, "floating"),
+            (object, np.nan, True, "empty"),
+            (object, None, False, "mixed"),
+            (object, None, True, "empty"),
+        ],
+    )
+    @pytest.mark.parametrize("box", [Series, np.array])
+    def test_object_empty(self, box, missing, dtype, skipna, expected):
+        # GH 23421
+        arr = box([missing, missing], dtype=dtype)
+
+        result = lib.infer_dtype(arr, skipna=skipna)
+        assert result == expected
+
+    def test_datetime(self):
+        dates = [datetime(2012, 1, x) for x in range(1, 20)]
+        index = Index(dates)
+        assert index.inferred_type == "datetime64"
+
+    def test_infer_dtype_datetime64(self):
+        arr = np.array(
+            [np.datetime64("2011-01-01"), np.datetime64("2011-01-01")], dtype=object
+        )
+        assert lib.infer_dtype(arr, skipna=True) == "datetime64"
+
+    @pytest.mark.parametrize("na_value", [pd.NaT, np.nan])
+    def test_infer_dtype_datetime64_with_na(self, na_value):
+        # starts with nan
+        arr = np.array([na_value, np.datetime64("2011-01-02")])
+        assert lib.infer_dtype(arr, skipna=True) == "datetime64"
+
+        arr = np.array([na_value, np.datetime64("2011-01-02"), na_value])
+        assert lib.infer_dtype(arr, skipna=True) == "datetime64"
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            np.array(
+                [np.timedelta64("nat"), np.datetime64("2011-01-02")], dtype=object
+            ),
+            np.array(
+                [np.datetime64("2011-01-02"), np.timedelta64("nat")], dtype=object
+            ),
+            np.array([np.datetime64("2011-01-01"), Timestamp("2011-01-02")]),
+            np.array([Timestamp("2011-01-02"), np.datetime64("2011-01-01")]),
+            np.array([np.nan, Timestamp("2011-01-02"), 1.1]),
+            np.array([np.nan, "2011-01-01", Timestamp("2011-01-02")], dtype=object),
+            np.array([np.datetime64("nat"), np.timedelta64(1, "D")], dtype=object),
+            np.array([np.timedelta64(1, "D"), np.datetime64("nat")], dtype=object),
+        ],
+    )
+    def test_infer_datetimelike_dtype_mixed(self, arr):
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+    def test_infer_dtype_mixed_integer(self):
+        arr = np.array([np.nan, Timestamp("2011-01-02"), 1])
+        assert lib.infer_dtype(arr, skipna=True) == "mixed-integer"
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            [Timestamp("2011-01-01"), Timestamp("2011-01-02")],
+            [datetime(2011, 1, 1), datetime(2012, 2, 1)],
+            [datetime(2011, 1, 1), Timestamp("2011-01-02")],
+        ],
+    )
+    def test_infer_dtype_datetime(self, arr):
+        assert lib.infer_dtype(np.array(arr), skipna=True) == "datetime"
+
+    @pytest.mark.parametrize("na_value", [pd.NaT, np.nan])
+    @pytest.mark.parametrize(
+        "time_stamp", [Timestamp("2011-01-01"), datetime(2011, 1, 1)]
+    )
+    def test_infer_dtype_datetime_with_na(self, na_value, time_stamp):
+        # starts with nan
+        arr = np.array([na_value, time_stamp])
+        assert lib.infer_dtype(arr, skipna=True) == "datetime"
+
+        arr = np.array([na_value, time_stamp, na_value])
+        assert lib.infer_dtype(arr, skipna=True) == "datetime"
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            np.array([Timedelta("1 days"), Timedelta("2 days")]),
+            np.array([np.timedelta64(1, "D"), np.timedelta64(2, "D")], dtype=object),
+            np.array([timedelta(1), timedelta(2)]),
+        ],
+    )
+    def test_infer_dtype_timedelta(self, arr):
+        assert lib.infer_dtype(arr, skipna=True) == "timedelta"
+
+    @pytest.mark.parametrize("na_value", [pd.NaT, np.nan])
+    @pytest.mark.parametrize(
+        "delta", [Timedelta("1 days"), np.timedelta64(1, "D"), timedelta(1)]
+    )
+    def test_infer_dtype_timedelta_with_na(self, na_value, delta):
+        # starts with nan
+        arr = np.array([na_value, delta])
+        assert lib.infer_dtype(arr, skipna=True) == "timedelta"
+
+        arr = np.array([na_value, delta, na_value])
+        assert lib.infer_dtype(arr, skipna=True) == "timedelta"
+
+    def test_infer_dtype_period(self):
+        # GH 13664
+        arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="D")])
+        assert lib.infer_dtype(arr, skipna=True) == "period"
+
+        # non-homogeneous freqs -> mixed
+        arr = np.array([Period("2011-01", freq="D"), Period("2011-02", freq="M")])
+        assert lib.infer_dtype(arr, skipna=True) == "mixed"
+
+    def test_infer_dtype_period_array(self, index_or_series_or_array, skipna):
+        klass = index_or_series_or_array
+        # https://github.com/pandas-dev/pandas/issues/23553
+        values = klass(
+            [
+                Period("2011-01-01", freq="D"),
+                Period("2011-01-02", freq="D"),
+                pd.NaT,
+            ]
+        )
+        assert lib.infer_dtype(values, skipna=skipna) == "period"
+
+        # periods but mixed freq
+        values = klass(
+            [
+                Period("2011-01-01", freq="D"),
+                Period("2011-01-02", freq="M"),
+                pd.NaT,
+            ]
+        )
+        # with pd.array this becomes NumpyExtensionArray which ends up
+        #  as "unknown-array"
+        exp = "unknown-array" if klass is pd.array else "mixed"
+        assert lib.infer_dtype(values, skipna=skipna) == exp
+
+    def test_infer_dtype_period_mixed(self):
+        arr = np.array(
+            [Period("2011-01", freq="M"), np.datetime64("nat")], dtype=object
+        )
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        arr = np.array(
+            [np.datetime64("nat"), Period("2011-01", freq="M")], dtype=object
+        )
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+    @pytest.mark.parametrize("na_value", [pd.NaT, np.nan])
+    def test_infer_dtype_period_with_na(self, na_value):
+        # starts with nan
+        arr = np.array([na_value, Period("2011-01", freq="D")])
+        assert lib.infer_dtype(arr, skipna=True) == "period"
+
+        arr = np.array([na_value, Period("2011-01", freq="D"), na_value])
+        assert lib.infer_dtype(arr, skipna=True) == "period"
+
+    @pytest.mark.parametrize("na_value", [pd.NA, np.nan])
+    def test_infer_dtype_numeric_with_na(self, na_value):
+        # GH61621
+        ser = Series([1, 2, na_value], dtype=object)
+        assert lib.infer_dtype(ser, skipna=True) == "integer"
+
+        ser = Series([1.0, 2.0, na_value], dtype=object)
+        assert lib.infer_dtype(ser, skipna=True) == "floating"
+
+        # GH#61976
+        ser = Series([1 + 1j, na_value], dtype=object)
+        assert lib.infer_dtype(ser, skipna=True) == "complex"
+
+    def test_infer_dtype_all_nan_nat_like(self):
+        arr = np.array([np.nan, np.nan])
+        assert lib.infer_dtype(arr, skipna=True) == "floating"
+
+        # nan and None mix are result in mixed
+        arr = np.array([np.nan, np.nan, None])
+        assert lib.infer_dtype(arr, skipna=True) == "empty"
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        arr = np.array([None, np.nan, np.nan])
+        assert lib.infer_dtype(arr, skipna=True) == "empty"
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        # pd.NaT
+        arr = np.array([pd.NaT])
+        assert lib.infer_dtype(arr, skipna=False) == "datetime"
+
+        arr = np.array([pd.NaT, np.nan])
+        assert lib.infer_dtype(arr, skipna=False) == "datetime"
+
+        arr = np.array([np.nan, pd.NaT])
+        assert lib.infer_dtype(arr, skipna=False) == "datetime"
+
+        arr = np.array([np.nan, pd.NaT, np.nan])
+        assert lib.infer_dtype(arr, skipna=False) == "datetime"
+
+        arr = np.array([None, pd.NaT, None])
+        assert lib.infer_dtype(arr, skipna=False) == "datetime"
+
+        # np.datetime64(nat)
+        arr = np.array([np.datetime64("nat")])
+        assert lib.infer_dtype(arr, skipna=False) == "datetime64"
+
+        for n in [np.nan, pd.NaT, None]:
+            arr = np.array([n, np.datetime64("nat"), n])
+            assert lib.infer_dtype(arr, skipna=False) == "datetime64"
+
+            arr = np.array([pd.NaT, n, np.datetime64("nat"), n])
+            assert lib.infer_dtype(arr, skipna=False) == "datetime64"
+
+        arr = np.array([np.timedelta64("nat")], dtype=object)
+        assert lib.infer_dtype(arr, skipna=False) == "timedelta"
+
+        for n in [np.nan, pd.NaT, None]:
+            arr = np.array([n, np.timedelta64("nat"), n])
+            assert lib.infer_dtype(arr, skipna=False) == "timedelta"
+
+            arr = np.array([pd.NaT, n, np.timedelta64("nat"), n])
+            assert lib.infer_dtype(arr, skipna=False) == "timedelta"
+
+        # datetime / timedelta mixed
+        arr = np.array([pd.NaT, np.datetime64("nat"), np.timedelta64("nat"), np.nan])
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        arr = np.array([np.timedelta64("nat"), np.datetime64("nat")], dtype=object)
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+    def test_is_datetimelike_array_all_nan_nat_like(self):
+        arr = np.array([np.nan, pd.NaT, np.datetime64("nat")])
+        assert lib.is_datetime_array(arr)
+        assert lib.is_datetime64_array(arr)
+        assert not lib.is_timedelta_or_timedelta64_array(arr)
+
+        arr = np.array([np.nan, pd.NaT, np.timedelta64("nat")])
+        assert not lib.is_datetime_array(arr)
+        assert not lib.is_datetime64_array(arr)
+        assert lib.is_timedelta_or_timedelta64_array(arr)
+
+        arr = np.array([np.nan, pd.NaT, np.datetime64("nat"), np.timedelta64("nat")])
+        assert not lib.is_datetime_array(arr)
+        assert not lib.is_datetime64_array(arr)
+        assert not lib.is_timedelta_or_timedelta64_array(arr)
+
+        arr = np.array([np.nan, pd.NaT])
+        assert lib.is_datetime_array(arr)
+        assert lib.is_datetime64_array(arr)
+        assert lib.is_timedelta_or_timedelta64_array(arr)
+
+        arr = np.array([np.nan, np.nan], dtype=object)
+        assert not lib.is_datetime_array(arr)
+        assert not lib.is_datetime64_array(arr)
+        assert not lib.is_timedelta_or_timedelta64_array(arr)
+
+        assert lib.is_datetime_with_singletz_array(
+            np.array(
+                [
+                    Timestamp("20130101", tz="US/Eastern"),
+                    Timestamp("20130102", tz="US/Eastern"),
+                ],
+                dtype=object,
+            )
+        )
+        assert not lib.is_datetime_with_singletz_array(
+            np.array(
+                [
+                    Timestamp("20130101", tz="US/Eastern"),
+                    Timestamp("20130102", tz="CET"),
+                ],
+                dtype=object,
+            )
+        )
+
+    @pytest.mark.parametrize(
+        "func",
+        [
+            "is_datetime_array",
+            "is_datetime64_array",
+            "is_bool_array",
+            "is_timedelta_or_timedelta64_array",
+            "is_date_array",
+            "is_time_array",
+            "is_interval_array",
+        ],
+    )
+    def test_other_dtypes_for_array(self, func):
+        func = getattr(lib, func)
+        arr = np.array(["foo", "bar"])
+        assert not func(arr)
+        assert not func(arr.reshape(2, 1))
+
+        arr = np.array([1, 2])
+        assert not func(arr)
+        assert not func(arr.reshape(2, 1))
+
+    def test_date(self):
+        dates = [date(2012, 1, day) for day in range(1, 20)]
+        index = Index(dates)
+        assert index.inferred_type == "date"
+
+        dates = [date(2012, 1, day) for day in range(1, 20)] + [np.nan]
+        result = lib.infer_dtype(dates, skipna=False)
+        assert result == "mixed"
+
+        result = lib.infer_dtype(dates, skipna=True)
+        assert result == "date"
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            [date(2020, 1, 1), Timestamp("2020-01-01")],
+            [Timestamp("2020-01-01"), date(2020, 1, 1)],
+            [date(2020, 1, 1), pd.NaT],
+            [pd.NaT, date(2020, 1, 1)],
+        ],
+    )
+    def test_infer_dtype_date_order_invariant(self, values, skipna):
+        # https://github.com/pandas-dev/pandas/issues/33741
+        result = lib.infer_dtype(values, skipna=skipna)
+        assert result == "date"
+
+    def test_is_numeric_array(self):
+        assert lib.is_float_array(np.array([1, 2.0]))
+        assert lib.is_float_array(np.array([1, 2.0, np.nan]))
+        assert not lib.is_float_array(np.array([1, 2]))
+
+        assert lib.is_integer_array(np.array([1, 2]))
+        assert not lib.is_integer_array(np.array([1, 2.0]))
+
+    def test_is_string_array(self):
+        # We should only be accepting pd.NA, np.nan,
+        # other floating point nans e.g. float('nan')]
+        # when skipna is True.
+        assert lib.is_string_array(np.array(["foo", "bar"]))
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", pd.NA], dtype=object), skipna=False
+        )
+        assert lib.is_string_array(
+            np.array(["foo", "bar", pd.NA], dtype=object), skipna=True
+        )
+        # we allow NaN/None in the StringArray constructor, so its allowed here
+        assert lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=True
+        )
+        assert lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=True
+        )
+        # But not e.g. datetimelike or Decimal NAs
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", pd.NaT], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.datetime64("NaT")], dtype=object), skipna=True
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", Decimal("NaN")], dtype=object), skipna=True
+        )
+
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", None], dtype=object), skipna=False
+        )
+        assert not lib.is_string_array(
+            np.array(["foo", "bar", np.nan], dtype=object), skipna=False
+        )
+        assert not lib.is_string_array(np.array([1, 2]))
+
+    def test_is_interval_array_subclass(self):
+        # GH#46945
+
+        class TimestampsInterval(Interval):
+            def __init__(self, left: str, right: str, closed="both") -> None:
+                super().__init__(Timestamp(left), Timestamp(right), closed)
+
+            @property
+            def seconds(self) -> float:
+                return self.length.seconds
+
+        item = TimestampsInterval("1970-01-01 00:00:00", "1970-01-01 00:00:01")
+        arr = np.array([item], dtype=object)
+        assert not lib.is_interval_array(arr)
+        assert lib.infer_dtype(arr) != "interval"
+        out = Series([item])[0]
+        assert isinstance(out, TimestampsInterval)
+
+    @pytest.mark.parametrize(
+        "func",
+        [
+            "is_bool_array",
+            "is_date_array",
+            "is_datetime_array",
+            "is_datetime64_array",
+            "is_float_array",
+            "is_integer_array",
+            "is_interval_array",
+            "is_string_array",
+            "is_time_array",
+            "is_timedelta_or_timedelta64_array",
+        ],
+    )
+    def test_is_dtype_array_empty_obj(self, func):
+        # https://github.com/pandas-dev/pandas/pull/60796
+        func = getattr(lib, func)
+
+        arr = np.empty((2, 0), dtype=object)
+        assert not func(arr)
+
+        arr = np.empty((0, 2), dtype=object)
+        assert not func(arr)
+
+    def test_to_object_array_tuples(self):
+        r = (5, 6)
+        values = [r]
+        lib.to_object_array_tuples(values)
+
+        # make sure record array works
+        record = namedtuple("record", "x y")
+        r = record(5, 6)
+        values = [r]
+        lib.to_object_array_tuples(values)
+
+    def test_object(self):
+        # GH 7431
+        # cannot infer more than this as only a single element
+        arr = np.array([None], dtype="O")
+        result = lib.infer_dtype(arr, skipna=False)
+        assert result == "mixed"
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "empty"
+
+    def test_to_object_array_width(self):
+        # see gh-13320
+        rows = [[1, 2, 3], [4, 5, 6]]
+
+        expected = np.array(rows, dtype=object)
+        out = lib.to_object_array(rows)
+        tm.assert_numpy_array_equal(out, expected)
+
+        expected = np.array(rows, dtype=object)
+        out = lib.to_object_array(rows, min_width=1)
+        tm.assert_numpy_array_equal(out, expected)
+
+        expected = np.array(
+            [[1, 2, 3, None, None], [4, 5, 6, None, None]], dtype=object
+        )
+        out = lib.to_object_array(rows, min_width=5)
+        tm.assert_numpy_array_equal(out, expected)
+
+    def test_categorical(self):
+        # GH 8974
+        arr = Categorical(list("abc"))
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "categorical"
+
+        result = lib.infer_dtype(Series(arr), skipna=True)
+        assert result == "categorical"
+
+        arr = Categorical([None, None, None], categories=["cegfab"], ordered=True)
+        result = lib.infer_dtype(arr, skipna=True)
+        assert result == "categorical"
+
+        result = lib.infer_dtype(Series(arr), skipna=True)
+        assert result == "categorical"
+
+    @pytest.mark.parametrize("asobject", [True, False])
+    def test_interval(self, asobject):
+        idx = pd.IntervalIndex.from_breaks(range(5), closed="both")
+        if asobject:
+            idx = idx.astype(object)
+
+        inferred = lib.infer_dtype(idx, skipna=False)
+        assert inferred == "interval"
+
+        inferred = lib.infer_dtype(idx._data, skipna=False)
+        assert inferred == "interval"
+
+        inferred = lib.infer_dtype(Series(idx, dtype=idx.dtype), skipna=False)
+        assert inferred == "interval"
+
+    @pytest.mark.parametrize("value", [Timestamp(0), Timedelta(0), 0, 0.0])
+    def test_interval_mismatched_closed(self, value):
+        first = Interval(value, value, closed="left")
+        second = Interval(value, value, closed="right")
+
+        # if closed match, we should infer "interval"
+        arr = np.array([first, first], dtype=object)
+        assert lib.infer_dtype(arr, skipna=False) == "interval"
+
+        # if closed dont match, we should _not_ get "interval"
+        arr2 = np.array([first, second], dtype=object)
+        assert lib.infer_dtype(arr2, skipna=False) == "mixed"
+
+    def test_interval_mismatched_subtype(self):
+        first = Interval(0, 1, closed="left")
+        second = Interval(Timestamp(0), Timestamp(1), closed="left")
+        third = Interval(Timedelta(0), Timedelta(1), closed="left")
+
+        arr = np.array([first, second])
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        arr = np.array([second, third])
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        arr = np.array([first, third])
+        assert lib.infer_dtype(arr, skipna=False) == "mixed"
+
+        # float vs int subdtype are compatible
+        flt_interval = Interval(1.5, 2.5, closed="left")
+        arr = np.array([first, flt_interval], dtype=object)
+        assert lib.infer_dtype(arr, skipna=False) == "interval"
+
+    @pytest.mark.parametrize("data", [["a", "b", "c"], ["a", "b", pd.NA]])
+    def test_string_dtype(
+        self, data, skipna, index_or_series_or_array, nullable_string_dtype
+    ):
+        # StringArray
+        val = index_or_series_or_array(data, dtype=nullable_string_dtype)
+        inferred = lib.infer_dtype(val, skipna=skipna)
+        assert inferred == "string"
+
+    @pytest.mark.parametrize("data", [[True, False, True], [True, False, pd.NA]])
+    def test_boolean_dtype(self, data, skipna, index_or_series_or_array):
+        # BooleanArray
+        val = index_or_series_or_array(data, dtype="boolean")
+        inferred = lib.infer_dtype(val, skipna=skipna)
+        assert inferred == "boolean"
+
+
+class TestNumberScalar:
+    def test_is_number(self):
+        assert is_number(True)
+        assert is_number(1)
+        assert is_number(1.1)
+        assert is_number(1 + 3j)
+        assert is_number(np.int64(1))
+        assert is_number(np.float64(1.1))
+        assert is_number(np.complex128(1 + 3j))
+        assert is_number(np.nan)
+
+        assert not is_number(None)
+        assert not is_number("x")
+        assert not is_number(datetime(2011, 1, 1))
+        assert not is_number(np.datetime64("2011-01-01"))
+        assert not is_number(Timestamp("2011-01-01"))
+        assert not is_number(Timestamp("2011-01-01", tz="US/Eastern"))
+        assert not is_number(timedelta(1000))
+        assert not is_number(Timedelta("1 days"))
+
+        # questionable
+        assert not is_number(np.bool_(False))
+        assert is_number(np.timedelta64(1, "D"))
+
+    def test_is_bool(self):
+        assert is_bool(True)
+        assert is_bool(False)
+        assert is_bool(np.bool_(False))
+
+        assert not is_bool(1)
+        assert not is_bool(1.1)
+        assert not is_bool(1 + 3j)
+        assert not is_bool(np.int64(1))
+        assert not is_bool(np.float64(1.1))
+        assert not is_bool(np.complex128(1 + 3j))
+        assert not is_bool(np.nan)
+        assert not is_bool(None)
+        assert not is_bool("x")
+        assert not is_bool(datetime(2011, 1, 1))
+        assert not is_bool(np.datetime64("2011-01-01"))
+        assert not is_bool(Timestamp("2011-01-01"))
+        assert not is_bool(Timestamp("2011-01-01", tz="US/Eastern"))
+        assert not is_bool(timedelta(1000))
+        assert not is_bool(np.timedelta64(1, "D"))
+        assert not is_bool(Timedelta("1 days"))
+
+    def test_is_integer(self):
+        assert is_integer(1)
+        assert is_integer(np.int64(1))
+
+        assert not is_integer(True)
+        assert not is_integer(1.1)
+        assert not is_integer(1 + 3j)
+        assert not is_integer(False)
+        assert not is_integer(np.bool_(False))
+        assert not is_integer(np.float64(1.1))
+        assert not is_integer(np.complex128(1 + 3j))
+        assert not is_integer(np.nan)
+        assert not is_integer(None)
+        assert not is_integer("x")
+        assert not is_integer(datetime(2011, 1, 1))
+        assert not is_integer(np.datetime64("2011-01-01"))
+        assert not is_integer(Timestamp("2011-01-01"))
+        assert not is_integer(Timestamp("2011-01-01", tz="US/Eastern"))
+        assert not is_integer(timedelta(1000))
+        assert not is_integer(Timedelta("1 days"))
+        assert not is_integer(np.timedelta64(1, "D"))
+
+    def test_is_float(self):
+        assert is_float(1.1)
+        assert is_float(np.float64(1.1))
+        assert is_float(np.nan)
+
+        assert not is_float(True)
+        assert not is_float(1)
+        assert not is_float(1 + 3j)
+        assert not is_float(False)
+        assert not is_float(np.bool_(False))
+        assert not is_float(np.int64(1))
+        assert not is_float(np.complex128(1 + 3j))
+        assert not is_float(None)
+        assert not is_float("x")
+        assert not is_float(datetime(2011, 1, 1))
+        assert not is_float(np.datetime64("2011-01-01"))
+        assert not is_float(Timestamp("2011-01-01"))
+        assert not is_float(Timestamp("2011-01-01", tz="US/Eastern"))
+        assert not is_float(timedelta(1000))
+        assert not is_float(np.timedelta64(1, "D"))
+        assert not is_float(Timedelta("1 days"))
+
+    def test_is_datetime_dtypes(self):
+        ts = pd.date_range("20130101", periods=3, unit="ns")
+        tsa = pd.date_range("20130101", periods=3, tz="US/Eastern", unit="ns")
+
+        msg = "is_datetime64tz_dtype is deprecated"
+
+        assert is_datetime64_dtype("datetime64")
+        assert is_datetime64_dtype("datetime64[ns]")
+        assert is_datetime64_dtype(ts)
+        assert not is_datetime64_dtype(tsa)
+
+        assert not is_datetime64_ns_dtype("datetime64")
+        assert is_datetime64_ns_dtype("datetime64[ns]")
+        assert is_datetime64_ns_dtype(ts)
+        assert is_datetime64_ns_dtype(tsa)
+
+        assert is_datetime64_any_dtype("datetime64")
+        assert is_datetime64_any_dtype("datetime64[ns]")
+        assert is_datetime64_any_dtype(ts)
+        assert is_datetime64_any_dtype(tsa)
+
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            assert not is_datetime64tz_dtype("datetime64")
+            assert not is_datetime64tz_dtype("datetime64[ns]")
+            assert not is_datetime64tz_dtype(ts)
+            assert is_datetime64tz_dtype(tsa)
+
+    @pytest.mark.parametrize("tz", ["US/Eastern", "UTC"])
+    def test_is_datetime_dtypes_with_tz(self, tz):
+        dtype = f"datetime64[ns, {tz}]"
+        assert not is_datetime64_dtype(dtype)
+
+        msg = "is_datetime64tz_dtype is deprecated"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            assert is_datetime64tz_dtype(dtype)
+        assert is_datetime64_ns_dtype(dtype)
+        assert is_datetime64_any_dtype(dtype)
+
+    def test_is_timedelta(self):
+        assert is_timedelta64_dtype("timedelta64")
+        assert is_timedelta64_dtype("timedelta64[ns]")
+        assert not is_timedelta64_ns_dtype("timedelta64")
+        assert is_timedelta64_ns_dtype("timedelta64[ns]")
+
+        tdi = TimedeltaIndex([1e14, 2e14], dtype="timedelta64[ns]")
+        assert is_timedelta64_dtype(tdi)
+        assert is_timedelta64_ns_dtype(tdi)
+        assert is_timedelta64_ns_dtype(tdi.astype("timedelta64[ns]"))
+
+        assert not is_timedelta64_ns_dtype(Index([], dtype=np.float64))
+        assert not is_timedelta64_ns_dtype(Index([], dtype=np.int64))
+
+
+class TestIsScalar:
+    def test_is_scalar_builtin_scalars(self):
+        assert is_scalar(None)
+        assert is_scalar(True)
+        assert is_scalar(False)
+        assert is_scalar(Fraction())
+        assert is_scalar(0.0)
+        assert is_scalar(1)
+        assert is_scalar(complex(2))
+        assert is_scalar(float("NaN"))
+        assert is_scalar(np.nan)
+        assert is_scalar("foobar")
+        assert is_scalar(b"foobar")
+        assert is_scalar(datetime(2014, 1, 1))
+        assert is_scalar(date(2014, 1, 1))
+        assert is_scalar(time(12, 0))
+        assert is_scalar(timedelta(hours=1))
+        assert is_scalar(pd.NaT)
+        assert is_scalar(pd.NA)
+
+    def test_is_scalar_builtin_nonscalars(self):
+        assert not is_scalar({})
+        assert not is_scalar([])
+        assert not is_scalar([1])
+        assert not is_scalar(())
+        assert not is_scalar((1,))
+        assert not is_scalar(slice(None))
+        assert not is_scalar(Ellipsis)
+
+    def test_is_scalar_numpy_array_scalars(self):
+        assert is_scalar(np.int64(1))
+        assert is_scalar(np.float64(1.0))
+        assert is_scalar(np.int32(1))
+        assert is_scalar(np.complex64(2))
+        assert is_scalar(np.object_("foobar"))
+        assert is_scalar(np.str_("foobar"))
+        assert is_scalar(np.bytes_(b"foobar"))
+        assert is_scalar(np.datetime64("2014-01-01"))
+        assert is_scalar(np.timedelta64(1, "h"))
+
+    @pytest.mark.parametrize(
+        "zerodim",
+        [
+            1,
+            "foobar",
+            np.datetime64("2014-01-01"),
+            np.timedelta64(1, "h"),
+            np.datetime64("NaT"),
+        ],
+    )
+    def test_is_scalar_numpy_zerodim_arrays(self, zerodim):
+        zerodim = np.array(zerodim)
+        assert not is_scalar(zerodim)
+        assert is_scalar(lib.item_from_zerodim(zerodim))
+
+    @pytest.mark.parametrize("arr", [np.array([]), np.array([[]])])
+    def test_is_scalar_numpy_arrays(self, arr):
+        assert not is_scalar(arr)
+        assert not is_scalar(MockNumpyLikeArray(arr))
+
+    def test_is_scalar_pandas_scalars(self):
+        assert is_scalar(Timestamp("2014-01-01"))
+        assert is_scalar(Timedelta(hours=1))
+        assert is_scalar(Period("2014-01-01"))
+        assert is_scalar(Interval(left=0, right=1))
+        assert is_scalar(DateOffset(days=1))
+        assert is_scalar(pd.offsets.Minute(3))
+
+    def test_is_scalar_pandas_containers(self):
+        assert not is_scalar(Series(dtype=object))
+        assert not is_scalar(Series([1]))
+        assert not is_scalar(DataFrame())
+        assert not is_scalar(DataFrame([[1]]))
+        assert not is_scalar(Index([]))
+        assert not is_scalar(Index([1]))
+        assert not is_scalar(Categorical([]))
+        assert not is_scalar(DatetimeIndex([])._data)
+        assert not is_scalar(TimedeltaIndex([])._data)
+        assert not is_scalar(DatetimeIndex([])._data.to_period("D"))
+        assert not is_scalar(pd.array([1, 2, 3]))
+
+    def test_is_scalar_number(self):
+        # Number() is not recognized by PyNumber_Check, so by extension
+        #  is not recognized by is_scalar, but instances of non-abstract
+        #  subclasses are.
+
+        class Numeric(Number):
+            def __init__(self, value) -> None:
+                self.value = value
+
+            def __int__(self) -> int:
+                return self.value
+
+        num = Numeric(1)
+        assert is_scalar(num)
+
+
+@pytest.mark.parametrize("unit", ["ms", "us", "ns"])
+def test_datetimeindex_from_empty_datetime64_array(unit):
+    idx = DatetimeIndex(np.array([], dtype=f"datetime64[{unit}]"))
+    assert len(idx) == 0
+
+
+def test_nan_to_nat_conversions():
+    df = DataFrame(
+        {"A": np.asarray(range(10), dtype="float64"), "B": Timestamp("20010101")}
+    )
+    df.iloc[3:6, :] = np.nan
+    result = df.loc[4, "B"]
+    assert result is pd.NaT
+
+    s = df["B"].copy()
+    s[8:9] = np.nan
+    assert s[8] is pd.NaT
+
+
+@pytest.mark.filterwarnings("ignore::PendingDeprecationWarning")
+@pytest.mark.parametrize("spmatrix", ["bsr", "coo", "csc", "csr", "dia", "dok", "lil"])
+def test_is_scipy_sparse(spmatrix):
+    sparse = pytest.importorskip("scipy.sparse")
+
+    klass = getattr(sparse, spmatrix + "_matrix")
+    assert is_scipy_sparse(klass([[0, 1]]))
+    assert not is_scipy_sparse(np.array([1]))
+
+
+def test_ensure_int32():
+    values = np.arange(10, dtype=np.int32)
+    result = ensure_int32(values)
+    assert result.dtype == np.int32
+
+    values = np.arange(10, dtype=np.int64)
+    result = ensure_int32(values)
+    assert result.dtype == np.int32
+
+
+@pytest.mark.parametrize(
+    "right,result",
+    [
+        (0, np.uint8),
+        (-1, np.int16),
+        (300, np.uint16),
+        # For floats, we just upcast directly to float64 instead of trying to
+        # find a smaller floating dtype
+        (300.0, np.uint16),  # for integer floats, we convert them to ints
+        (300.1, np.float64),
+        (np.int16(300), np.int16 if np_version_gt2 else np.uint16),
+    ],
+)
+def test_find_result_type_uint_int(right, result):
+    left_dtype = np.dtype("uint8")
+    assert find_result_type(left_dtype, right) == result
+
+
+@pytest.mark.parametrize(
+    "right,result",
+    [
+        (0, np.int8),
+        (-1, np.int8),
+        (300, np.int16),
+        # For floats, we just upcast directly to float64 instead of trying to
+        # find a smaller floating dtype
+        (300.0, np.int16),  # for integer floats, we convert them to ints
+        (300.1, np.float64),
+        (np.int16(300), np.int16),
+    ],
+)
+def test_find_result_type_int_int(right, result):
+    left_dtype = np.dtype("int8")
+    assert find_result_type(left_dtype, right) == result
+
+
+@pytest.mark.parametrize(
+    "right,result",
+    [
+        (300.0, np.float64),
+        (np.float32(300), np.float32),
+    ],
+)
+def test_find_result_type_floats(right, result):
+    left_dtype = np.dtype("float16")
+    assert find_result_type(left_dtype, right) == result
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5b22ac30d820b4d1c2178bf3a532fafc9f917e3
--- /dev/null
+++ b/pandas/tests/dtypes/test_missing.py
@@ -0,0 +1,876 @@
+from datetime import datetime
+from decimal import Decimal
+
+import numpy as np
+import pytest
+
+from pandas._libs import missing as libmissing
+from pandas._libs.tslibs import iNaT
+
+from pandas.core.dtypes.common import (
+    is_float,
+    is_scalar,
+    pandas_dtype,
+)
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+    IntervalDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.missing import (
+    array_equivalent,
+    is_valid_na_for_dtype,
+    isna,
+    isnull,
+    na_value_for_dtype,
+    notna,
+    notnull,
+)
+
+import pandas as pd
+from pandas import (
+    DatetimeIndex,
+    Index,
+    NaT,
+    Series,
+    TimedeltaIndex,
+    date_range,
+    period_range,
+)
+import pandas._testing as tm
+
+fix_now = pd.Timestamp("2021-01-01")
+fix_utcnow = pd.Timestamp("2021-01-01", tz="UTC")
+
+
+@pytest.mark.parametrize("notna_f", [notna, notnull])
+def test_notna_notnull(notna_f):
+    assert notna_f(1.0)
+    assert not notna_f(None)
+    assert not notna_f(np.nan)
+
+
+@pytest.mark.parametrize("null_func", [notna, notnull, isna, isnull])
+@pytest.mark.parametrize(
+    "ser",
+    [
+        Series(
+            [str(i) for i in range(5)],
+            index=Index([str(i) for i in range(5)], dtype=object),
+            dtype=object,
+        ),
+        Series(range(5), date_range("2020-01-01", periods=5)),
+        Series(range(5), period_range("2020-01-01", periods=5)),
+    ],
+)
+def test_null_check_is_series(null_func, ser):
+    assert isinstance(null_func(ser), Series)
+
+
+class TestIsNA:
+    def test_0d_array(self):
+        assert isna(np.array(np.nan))
+        assert not isna(np.array(0.0))
+        assert not isna(np.array(0))
+        # test object dtype
+        assert isna(np.array(np.nan, dtype=object))
+        assert not isna(np.array(0.0, dtype=object))
+        assert not isna(np.array(0, dtype=object))
+
+    @pytest.mark.parametrize("shape", [(4, 0), (4,)])
+    def test_empty_object(self, shape):
+        arr = np.empty(shape=shape, dtype=object)
+        result = isna(arr)
+        expected = np.ones(shape=shape, dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("isna_f", [isna, isnull])
+    def test_isna_isnull(self, isna_f):
+        assert not isna_f(1.0)
+        assert isna_f(None)
+        assert isna_f(np.nan)
+        assert float("nan")
+        assert not isna_f(np.inf)
+        assert not isna_f(-np.inf)
+
+        # type
+        assert not isna_f(type(Series(dtype=object)))
+        assert not isna_f(type(Series(dtype=np.float64)))
+        assert not isna_f(type(pd.DataFrame()))
+
+    @pytest.mark.parametrize("isna_f", [isna, isnull])
+    @pytest.mark.parametrize(
+        "data",
+        [
+            np.arange(4, dtype=float),
+            [0.0, 1.0, 0.0, 1.0],
+            Series(list("abcd"), dtype=object),
+            date_range("2020-01-01", periods=4),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "index",
+        [
+            date_range("2020-01-01", periods=4),
+            range(4),
+            period_range("2020-01-01", periods=4),
+        ],
+    )
+    def test_isna_isnull_frame(self, isna_f, data, index):
+        # frame
+        df = pd.DataFrame(data, index=index)
+        result = isna_f(df)
+        expected = df.apply(isna_f)
+        tm.assert_frame_equal(result, expected)
+
+    def test_isna_lists(self):
+        result = isna([[False]])
+        exp = np.array([[False]])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = isna([[1], [2]])
+        exp = np.array([[False], [False]])
+        tm.assert_numpy_array_equal(result, exp)
+
+        # list of strings / unicode
+        result = isna(["foo", "bar"])
+        exp = np.array([False, False])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = isna(["foo", "bar"])
+        exp = np.array([False, False])
+        tm.assert_numpy_array_equal(result, exp)
+
+        # GH20675
+        result = isna([np.nan, "world"])
+        exp = np.array([True, False])
+        tm.assert_numpy_array_equal(result, exp)
+
+    def test_isna_nat(self):
+        result = isna([NaT])
+        exp = np.array([True])
+        tm.assert_numpy_array_equal(result, exp)
+
+        result = isna(np.array([NaT], dtype=object))
+        exp = np.array([True])
+        tm.assert_numpy_array_equal(result, exp)
+
+    def test_isna_numpy_nat(self):
+        arr = np.array(
+            [
+                NaT,
+                np.datetime64("NaT"),
+                np.timedelta64("NaT"),
+                np.datetime64("NaT", "s"),
+            ]
+        )
+        result = isna(arr)
+        expected = np.array([True] * 4)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_isna_datetime(self):
+        assert not isna(datetime.now())
+        assert notna(datetime.now())
+
+        idx = date_range("1/1/1990", periods=20)
+        exp = np.ones(len(idx), dtype=bool)
+        tm.assert_numpy_array_equal(notna(idx), exp)
+
+        idx = np.asarray(idx)
+        idx[0] = iNaT
+        idx = DatetimeIndex(idx)
+        mask = isna(idx)
+        assert mask[0]
+        exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
+        tm.assert_numpy_array_equal(mask, exp)
+
+        # GH 9129
+        pidx = idx.to_period(freq="M")
+        mask = isna(pidx)
+        assert mask[0]
+        exp = np.array([True] + [False] * (len(idx) - 1), dtype=bool)
+        tm.assert_numpy_array_equal(mask, exp)
+
+        mask = isna(pidx[1:])
+        exp = np.zeros(len(mask), dtype=bool)
+        tm.assert_numpy_array_equal(mask, exp)
+
+    def test_isna_old_datetimelike(self):
+        # isna_old should work for dt64tz, td64, and period, not just tznaive
+        dti = date_range("2016-01-01", periods=3)
+        dta = dti._data
+        dta[-1] = NaT
+        expected = np.array([False, False, True], dtype=bool)
+
+        objs = [dta, dta.tz_localize("US/Eastern"), dta - dta, dta.to_period("D")]
+
+        for obj in objs:
+            result = isna(obj)
+
+            tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "value, expected",
+        [
+            (np.complex128(np.nan), True),
+            (np.float64(1), False),
+            (np.array([1, 1 + 0j, np.nan, 3]), np.array([False, False, True, False])),
+            (
+                np.array([1, 1 + 0j, np.nan, 3], dtype=object),
+                np.array([False, False, True, False]),
+            ),
+            (
+                np.array([1, 1 + 0j, np.nan, 3]).astype(object),
+                np.array([False, False, True, False]),
+            ),
+        ],
+    )
+    def test_complex(self, value, expected):
+        result = isna(value)
+        if is_scalar(result):
+            assert result is expected
+        else:
+            tm.assert_numpy_array_equal(result, expected)
+
+    def test_datetime_other_units(self):
+        idx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"])
+        exp = np.array([False, True, False])
+        tm.assert_numpy_array_equal(isna(idx), exp)
+        tm.assert_numpy_array_equal(notna(idx), ~exp)
+        tm.assert_numpy_array_equal(isna(idx.values), exp)
+        tm.assert_numpy_array_equal(notna(idx.values), ~exp)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "datetime64[D]",
+            "datetime64[h]",
+            "datetime64[m]",
+            "datetime64[s]",
+            "datetime64[ms]",
+            "datetime64[us]",
+            "datetime64[ns]",
+        ],
+    )
+    def test_datetime_other_units_astype(self, dtype):
+        idx = DatetimeIndex(["2011-01-01", "NaT", "2011-01-02"])
+        values = idx.values.astype(dtype)
+
+        exp = np.array([False, True, False])
+        tm.assert_numpy_array_equal(isna(values), exp)
+        tm.assert_numpy_array_equal(notna(values), ~exp)
+
+        exp = Series([False, True, False])
+        s = Series(values)
+        tm.assert_series_equal(isna(s), exp)
+        tm.assert_series_equal(notna(s), ~exp)
+        s = Series(values, dtype=object)
+        tm.assert_series_equal(isna(s), exp)
+        tm.assert_series_equal(notna(s), ~exp)
+
+    def test_timedelta_other_units(self):
+        idx = TimedeltaIndex(["1 days", "NaT", "2 days"])
+        exp = np.array([False, True, False])
+        tm.assert_numpy_array_equal(isna(idx), exp)
+        tm.assert_numpy_array_equal(notna(idx), ~exp)
+        tm.assert_numpy_array_equal(isna(idx.values), exp)
+        tm.assert_numpy_array_equal(notna(idx.values), ~exp)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "timedelta64[D]",
+            "timedelta64[h]",
+            "timedelta64[m]",
+            "timedelta64[s]",
+            "timedelta64[ms]",
+            "timedelta64[us]",
+            "timedelta64[ns]",
+        ],
+    )
+    def test_timedelta_other_units_dtype(self, dtype):
+        idx = TimedeltaIndex(["1 days", "NaT", "2 days"])
+        values = idx.values.astype(dtype)
+
+        exp = np.array([False, True, False])
+        tm.assert_numpy_array_equal(isna(values), exp)
+        tm.assert_numpy_array_equal(notna(values), ~exp)
+
+        exp = Series([False, True, False])
+        s = Series(values)
+        tm.assert_series_equal(isna(s), exp)
+        tm.assert_series_equal(notna(s), ~exp)
+        s = Series(values, dtype=object)
+        tm.assert_series_equal(isna(s), exp)
+        tm.assert_series_equal(notna(s), ~exp)
+
+    def test_period(self):
+        idx = pd.PeriodIndex(["2011-01", "NaT", "2012-01"], freq="M")
+        exp = np.array([False, True, False])
+        tm.assert_numpy_array_equal(isna(idx), exp)
+        tm.assert_numpy_array_equal(notna(idx), ~exp)
+
+        exp = Series([False, True, False])
+        s = Series(idx)
+        tm.assert_series_equal(isna(s), exp)
+        tm.assert_series_equal(notna(s), ~exp)
+        s = Series(idx, dtype=object)
+        tm.assert_series_equal(isna(s), exp)
+        tm.assert_series_equal(notna(s), ~exp)
+
+    def test_decimal(self):
+        # scalars GH#23530
+        a = Decimal("1.0")
+        assert isna(a) is False
+        assert notna(a) is True
+
+        b = Decimal("NaN")
+        assert isna(b) is True
+        assert notna(b) is False
+
+        # array
+        arr = np.array([a, b])
+        expected = np.array([False, True])
+        result = isna(arr)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = notna(arr)
+        tm.assert_numpy_array_equal(result, ~expected)
+
+        # series
+        ser = Series(arr)
+        expected = Series(expected)
+        result = isna(ser)
+        tm.assert_series_equal(result, expected)
+
+        result = notna(ser)
+        tm.assert_series_equal(result, ~expected)
+
+        # index
+        idx = Index(arr)
+        expected = np.array([False, True])
+        result = isna(idx)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = notna(idx)
+        tm.assert_numpy_array_equal(result, ~expected)
+
+
+@pytest.mark.parametrize("dtype_equal", [True, False])
+def test_array_equivalent(dtype_equal):
+    assert array_equivalent(
+        np.array([np.nan, np.nan]), np.array([np.nan, np.nan]), dtype_equal=dtype_equal
+    )
+    assert array_equivalent(
+        np.array([np.nan, 1, np.nan]),
+        np.array([np.nan, 1, np.nan]),
+        dtype_equal=dtype_equal,
+    )
+    assert array_equivalent(
+        np.array([np.nan, None], dtype="object"),
+        np.array([np.nan, None], dtype="object"),
+        dtype_equal=dtype_equal,
+    )
+    # Check the handling of nested arrays in array_equivalent_object
+    assert array_equivalent(
+        np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"),
+        np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"),
+        dtype_equal=dtype_equal,
+    )
+    assert array_equivalent(
+        np.array([np.nan, 1 + 1j], dtype="complex"),
+        np.array([np.nan, 1 + 1j], dtype="complex"),
+        dtype_equal=dtype_equal,
+    )
+    assert not array_equivalent(
+        np.array([np.nan, 1 + 1j], dtype="complex"),
+        np.array([np.nan, 1 + 2j], dtype="complex"),
+        dtype_equal=dtype_equal,
+    )
+    assert not array_equivalent(
+        np.array([np.nan, 1, np.nan]),
+        np.array([np.nan, 2, np.nan]),
+        dtype_equal=dtype_equal,
+    )
+    assert not array_equivalent(
+        np.array(["a", "b", "c", "d"]), np.array(["e", "e"]), dtype_equal=dtype_equal
+    )
+    assert array_equivalent(
+        Index([0, np.nan]), Index([0, np.nan]), dtype_equal=dtype_equal
+    )
+    assert not array_equivalent(
+        Index([0, np.nan]), Index([1, np.nan]), dtype_equal=dtype_equal
+    )
+
+
+@pytest.mark.parametrize("dtype_equal", [True, False])
+def test_array_equivalent_tdi(dtype_equal):
+    assert array_equivalent(
+        TimedeltaIndex([0, np.nan]),
+        TimedeltaIndex([0, np.nan]),
+        dtype_equal=dtype_equal,
+    )
+    assert not array_equivalent(
+        TimedeltaIndex([0, np.nan]),
+        TimedeltaIndex([1, np.nan]),
+        dtype_equal=dtype_equal,
+    )
+
+
+@pytest.mark.parametrize("dtype_equal", [True, False])
+def test_array_equivalent_dti(dtype_equal):
+    assert array_equivalent(
+        DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal
+    )
+    assert not array_equivalent(
+        DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal
+    )
+
+    dti1 = DatetimeIndex([0, np.nan], tz="US/Eastern")
+    dti2 = DatetimeIndex([0, np.nan], tz="CET")
+    dti3 = DatetimeIndex([1, np.nan], tz="US/Eastern")
+
+    assert array_equivalent(
+        dti1,
+        dti1,
+        dtype_equal=dtype_equal,
+    )
+    assert not array_equivalent(
+        dti1,
+        dti3,
+        dtype_equal=dtype_equal,
+    )
+    # The rest are not dtype_equal
+    assert not array_equivalent(DatetimeIndex([0, np.nan]), dti1)
+    assert array_equivalent(
+        dti2,
+        dti1,
+    )
+
+    assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan]))
+
+
+@pytest.mark.parametrize(
+    "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None]
+)
+def test_array_equivalent_series(val):
+    arr = np.array([1, 2])
+    assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
+
+
+def test_array_equivalent_array_mismatched_shape():
+    # to trigger the motivating bug, the first N elements of the arrays need
+    #  to match
+    first = np.array([1, 2, 3])
+    second = np.array([1, 2])
+
+    left = Series([first, "a"], dtype=object)
+    right = Series([second, "a"], dtype=object)
+    assert not array_equivalent(left, right)
+
+
+def test_array_equivalent_array_mismatched_dtype():
+    # same shape, different dtype can still be equivalent
+    first = np.array([1, 2], dtype=np.float64)
+    second = np.array([1, 2])
+
+    left = Series([first, "a"], dtype=object)
+    right = Series([second, "a"], dtype=object)
+    assert array_equivalent(left, right)
+
+
+def test_array_equivalent_different_dtype_but_equal():
+    # Unclear if this is exposed anywhere in the public-facing API
+    assert array_equivalent(np.array([1, 2]), np.array([1.0, 2.0]))
+
+
+@pytest.mark.parametrize(
+    "lvalue, rvalue",
+    [
+        # There are 3 variants for each of lvalue and rvalue. We include all
+        #  three for the tz-naive `now` and exclude the datetim64 variant
+        #  for utcnow because it drops tzinfo.
+        (fix_now, fix_utcnow),
+        (fix_now.to_datetime64(), fix_utcnow),
+        (fix_now.to_pydatetime(), fix_utcnow),
+        (fix_now.to_datetime64(), fix_utcnow.to_pydatetime()),
+        (fix_now.to_pydatetime(), fix_utcnow.to_pydatetime()),
+    ],
+)
+def test_array_equivalent_tzawareness(lvalue, rvalue):
+    # we shouldn't raise if comparing tzaware and tznaive datetimes
+    left = np.array([lvalue], dtype=object)
+    right = np.array([rvalue], dtype=object)
+
+    assert not array_equivalent(left, right, strict_nan=True)
+    assert not array_equivalent(left, right, strict_nan=False)
+
+
+def test_array_equivalent_compat():
+    # see gh-13388
+    m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)])
+    n = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)])
+    assert array_equivalent(m, n, strict_nan=True)
+    assert array_equivalent(m, n, strict_nan=False)
+
+    m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)])
+    n = np.array([(1, 2), (4, 3)], dtype=[("a", int), ("b", float)])
+    assert not array_equivalent(m, n, strict_nan=True)
+    assert not array_equivalent(m, n, strict_nan=False)
+
+    m = np.array([(1, 2), (3, 4)], dtype=[("a", int), ("b", float)])
+    n = np.array([(1, 2), (3, 4)], dtype=[("b", int), ("a", float)])
+    assert not array_equivalent(m, n, strict_nan=True)
+    assert not array_equivalent(m, n, strict_nan=False)
+
+
+@pytest.mark.parametrize("dtype", ["O", "S", "U"])
+def test_array_equivalent_str(dtype):
+    assert array_equivalent(
+        np.array(["A", "B"], dtype=dtype), np.array(["A", "B"], dtype=dtype)
+    )
+    assert not array_equivalent(
+        np.array(["A", "B"], dtype=dtype), np.array(["A", "X"], dtype=dtype)
+    )
+
+
+@pytest.mark.parametrize("strict_nan", [True, False])
+def test_array_equivalent_nested(strict_nan):
+    # reached in groupby aggregations, make sure we use np.any when checking
+    #  if the comparison is truthy
+    left = np.array([np.array([50, 70, 90]), np.array([20, 30])], dtype=object)
+    right = np.array([np.array([50, 70, 90]), np.array([20, 30])], dtype=object)
+
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    left = np.empty(2, dtype=object)
+    left[:] = [np.array([50, 70, 90]), np.array([20, 30, 40])]
+    right = np.empty(2, dtype=object)
+    right[:] = [np.array([50, 70, 90]), np.array([20, 30, 40])]
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    left = np.array([np.array([50, 50, 50]), np.array([40, 40])], dtype=object)
+    right = np.array([50, 40])
+    assert not array_equivalent(left, right, strict_nan=strict_nan)
+
+
+@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning")
+@pytest.mark.parametrize("strict_nan", [True, False])
+def test_array_equivalent_nested2(strict_nan):
+    # more than one level of nesting
+    left = np.array(
+        [
+            np.array([np.array([50, 70]), np.array([90])], dtype=object),
+            np.array([np.array([20, 30])], dtype=object),
+        ],
+        dtype=object,
+    )
+    right = np.array(
+        [
+            np.array([np.array([50, 70]), np.array([90])], dtype=object),
+            np.array([np.array([20, 30])], dtype=object),
+        ],
+        dtype=object,
+    )
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    left = np.array([np.array([np.array([50, 50, 50])], dtype=object)], dtype=object)
+    right = np.array([50])
+    assert not array_equivalent(left, right, strict_nan=strict_nan)
+
+
+@pytest.mark.parametrize("strict_nan", [True, False])
+def test_array_equivalent_nested_list(strict_nan):
+    left = np.array([[50, 70, 90], [20, 30]], dtype=object)
+    right = np.array([[50, 70, 90], [20, 30]], dtype=object)
+
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    left = np.array([[50, 50, 50], [40, 40]], dtype=object)
+    right = np.array([50, 40])
+    assert not array_equivalent(left, right, strict_nan=strict_nan)
+
+
+@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning")
+@pytest.mark.xfail(reason="failing")
+@pytest.mark.parametrize("strict_nan", [True, False])
+def test_array_equivalent_nested_mixed_list(strict_nan):
+    # mixed arrays / lists in left and right
+    # https://github.com/pandas-dev/pandas/issues/50360
+    left = np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object)
+    right = np.array([[1, 2, 3], [4, 5]], dtype=object)
+
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    # multiple levels of nesting
+    left = np.array(
+        [
+            np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object),
+            np.array([np.array([6]), np.array([7, 8]), np.array([9])], dtype=object),
+        ],
+        dtype=object,
+    )
+    right = np.array([[[1, 2, 3], [4, 5]], [[6], [7, 8], [9]]], dtype=object)
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    # same-length lists
+    subarr = np.empty(2, dtype=object)
+    subarr[:] = [
+        np.array([None, "b"], dtype=object),
+        np.array(["c", "d"], dtype=object),
+    ]
+    left = np.array([subarr, None], dtype=object)
+    right = np.array([[[None, "b"], ["c", "d"]], None], dtype=object)
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+
+@pytest.mark.xfail(reason="failing")
+@pytest.mark.parametrize("strict_nan", [True, False])
+def test_array_equivalent_nested_dicts(strict_nan):
+    left = np.array([{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object)
+    right = np.array(
+        [{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object
+    )
+    assert array_equivalent(left, right, strict_nan=strict_nan)
+    assert not array_equivalent(left, right[::-1], strict_nan=strict_nan)
+
+    right2 = np.array([{"f1": 1, "f2": ["a", "b"]}], dtype=object)
+    assert array_equivalent(left, right2, strict_nan=strict_nan)
+    assert not array_equivalent(left, right2[::-1], strict_nan=strict_nan)
+
+
+def test_array_equivalent_index_with_tuples():
+    # GH#48446
+    idx1 = Index(np.array([(pd.NA, 4), (1, 1)], dtype="object"))
+    idx2 = Index(np.array([(1, 1), (pd.NA, 4)], dtype="object"))
+    assert not array_equivalent(idx1, idx2)
+    assert not idx1.equals(idx2)
+    assert not array_equivalent(idx2, idx1)
+    assert not idx2.equals(idx1)
+
+    idx1 = Index(np.array([(4, pd.NA), (1, 1)], dtype="object"))
+    idx2 = Index(np.array([(1, 1), (4, pd.NA)], dtype="object"))
+    assert not array_equivalent(idx1, idx2)
+    assert not idx1.equals(idx2)
+    assert not array_equivalent(idx2, idx1)
+    assert not idx2.equals(idx1)
+
+
+@pytest.mark.parametrize(
+    "dtype, na_value",
+    [
+        # Datetime-like
+        (np.dtype("M8[ns]"), np.datetime64("NaT", "ns")),
+        (np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")),
+        (DatetimeTZDtype.construct_from_string("datetime64[ns, US/Eastern]"), NaT),
+        (PeriodDtype("M"), NaT),
+        # Integer
+        ("u1", 0),
+        ("u2", 0),
+        ("u4", 0),
+        ("u8", 0),
+        ("i1", 0),
+        ("i2", 0),
+        ("i4", 0),
+        ("i8", 0),
+        # Bool
+        ("bool", False),
+        # Float
+        ("f2", np.nan),
+        ("f4", np.nan),
+        ("f8", np.nan),
+        # Complex
+        ("c8", np.nan),
+        ("c16", np.nan),
+        # Object
+        ("O", np.nan),
+        # Interval
+        (IntervalDtype(), np.nan),
+    ],
+)
+def test_na_value_for_dtype(dtype, na_value):
+    result = na_value_for_dtype(pandas_dtype(dtype))
+    # identify check doesn't work for datetime64/timedelta64("NaT") bc they
+    #  are not singletons
+    assert result is na_value or (
+        isna(result) and isna(na_value) and type(result) is type(na_value)
+    )
+
+
+class TestNAObj:
+    def _check_behavior(self, arr, expected):
+        result = libmissing.isnaobj(arr)
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = np.atleast_2d(arr)
+        expected = np.atleast_2d(expected)
+
+        result = libmissing.isnaobj(arr)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # Test fortran order
+        arr = arr.copy(order="F")
+        result = libmissing.isnaobj(arr)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_basic(self):
+        arr = np.array([1, None, "foo", -5.1, NaT, np.nan])
+        expected = np.array([False, True, False, False, True, True])
+
+        self._check_behavior(arr, expected)
+
+    def test_non_obj_dtype(self):
+        arr = np.array([1, 3, np.nan, 5], dtype=float)
+        expected = np.array([False, False, True, False])
+
+        self._check_behavior(arr, expected)
+
+    def test_empty_arr(self):
+        arr = np.array([])
+        expected = np.array([], dtype=bool)
+
+        self._check_behavior(arr, expected)
+
+    def test_empty_str_inp(self):
+        arr = np.array([""])  # empty but not na
+        expected = np.array([False])
+
+        self._check_behavior(arr, expected)
+
+    def test_empty_like(self):
+        # see gh-13717: no segfaults!
+        arr = np.empty_like([None])
+        expected = np.array([True])
+
+        self._check_behavior(arr, expected)
+
+
+m8_units = ["as", "ps", "ns", "us", "ms", "s", "m", "h", "D", "W", "M", "Y"]
+
+na_vals = (
+    [
+        None,
+        NaT,
+        float("NaN"),
+        complex("NaN"),
+        np.nan,
+        np.float64("NaN"),
+        np.float32("NaN"),
+        np.complex64(np.nan),
+        np.complex128(np.nan),
+        np.datetime64("NaT"),
+        np.timedelta64("NaT"),
+    ]
+    + [np.datetime64("NaT", unit) for unit in m8_units]  # type: ignore[call-overload]
+    + [np.timedelta64("NaT", unit) for unit in m8_units]  # type: ignore[call-overload]
+)
+
+inf_vals = [
+    float("inf"),
+    float("-inf"),
+    complex("inf"),
+    complex("-inf"),
+    np.inf,
+    -np.inf,
+]
+
+int_na_vals = [
+    # Values that match iNaT, which we treat as null in specific cases
+    np.int64(NaT._value),
+    int(NaT._value),
+]
+
+sometimes_na_vals = [Decimal("NaN")]
+
+never_na_vals = [
+    # float/complex values that when viewed as int64 match iNaT
+    -0.0,
+    np.float64("-0.0"),
+    -0j,
+    np.complex64(-0j),
+]
+
+
+class TestLibMissing:
+    @pytest.mark.parametrize("func", [libmissing.checknull, isna])
+    @pytest.mark.parametrize(
+        "value",
+        na_vals + sometimes_na_vals,  # type: ignore[operator]
+    )
+    def test_checknull_na_vals(self, func, value):
+        assert func(value)
+
+    @pytest.mark.parametrize("func", [libmissing.checknull, isna])
+    @pytest.mark.parametrize("value", inf_vals)
+    def test_checknull_inf_vals(self, func, value):
+        assert not func(value)
+
+    @pytest.mark.parametrize("func", [libmissing.checknull, isna])
+    @pytest.mark.parametrize("value", int_na_vals)
+    def test_checknull_intna_vals(self, func, value):
+        assert not func(value)
+
+    @pytest.mark.parametrize("func", [libmissing.checknull, isna])
+    @pytest.mark.parametrize("value", never_na_vals)
+    def test_checknull_never_na_vals(self, func, value):
+        assert not func(value)
+
+    @pytest.mark.parametrize(
+        "value",
+        na_vals + sometimes_na_vals,  # type: ignore[operator]
+    )
+    def test_checknull_old_na_vals(self, value):
+        assert libmissing.checknull(value)
+
+    @pytest.mark.parametrize("value", int_na_vals)
+    def test_checknull_old_intna_vals(self, value):
+        assert not libmissing.checknull(value)
+
+    def test_is_matching_na(self, nulls_fixture, nulls_fixture2):
+        left = nulls_fixture
+        right = nulls_fixture2
+
+        assert libmissing.is_matching_na(left, left)
+
+        if left is right:
+            assert libmissing.is_matching_na(left, right)
+        elif is_float(left) and is_float(right):
+            # np.nan vs float("NaN") we consider as matching
+            assert libmissing.is_matching_na(left, right)
+        elif type(left) is type(right):
+            # e.g. both Decimal("NaN")
+            assert libmissing.is_matching_na(left, right)
+        else:
+            assert not libmissing.is_matching_na(left, right)
+
+    def test_is_matching_na_nan_matches_none(self):
+        assert not libmissing.is_matching_na(None, np.nan)
+        assert not libmissing.is_matching_na(np.nan, None)
+
+        assert libmissing.is_matching_na(None, np.nan, nan_matches_none=True)
+        assert libmissing.is_matching_na(np.nan, None, nan_matches_none=True)
+
+
+class TestIsValidNAForDtype:
+    def test_is_valid_na_for_dtype_interval(self):
+        dtype = IntervalDtype("int64", "left")
+        assert not is_valid_na_for_dtype(NaT, dtype)
+
+        dtype = IntervalDtype("datetime64[ns]", "both")
+        assert not is_valid_na_for_dtype(NaT, dtype)
+
+    def test_is_valid_na_for_dtype_categorical(self):
+        dtype = CategoricalDtype(categories=[0, 1, 2])
+        assert is_valid_na_for_dtype(np.nan, dtype)
+
+        assert not is_valid_na_for_dtype(NaT, dtype)
+        assert not is_valid_na_for_dtype(np.datetime64("NaT", "ns"), dtype)
+        assert not is_valid_na_for_dtype(np.timedelta64("NaT", "ns"), dtype)
diff --git a/pandas/tests/extension/__init__.py b/pandas/tests/extension/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/extension/conftest.py b/pandas/tests/extension/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..1376af5e51a6b6255625c257031e74a33d7b7b77
--- /dev/null
+++ b/pandas/tests/extension/conftest.py
@@ -0,0 +1,214 @@
+import operator
+
+import pytest
+
+from pandas import Series
+
+
+@pytest.fixture
+def dtype():
+    """A fixture providing the ExtensionDtype to validate."""
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data():
+    """
+    Length-10 array for this type.
+
+    * data[0] and data[1] should both be non missing
+    * data[0] and data[1] should not be equal
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_for_twos(dtype):
+    """
+    Length-10 array in which all the elements are two.
+
+    Call pytest.skip in your fixture if the dtype does not support divmod.
+    """
+    if not (dtype._is_numeric or dtype.kind == "m"):
+        # Object-dtypes may want to allow this, but for the most part
+        #  only numeric and timedelta-like dtypes will need to implement this.
+        pytest.skip(f"{dtype} is not a numeric dtype")
+
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing():
+    """Length-2 array with [NA, Valid]"""
+    raise NotImplementedError
+
+
+@pytest.fixture(params=["data", "data_missing"])
+def all_data(request, data, data_missing):
+    """Parametrized fixture giving 'data' and 'data_missing'"""
+    if request.param == "data":
+        return data
+    elif request.param == "data_missing":
+        return data_missing
+
+
+@pytest.fixture
+def data_repeated(data):
+    """
+    Generate many datasets.
+
+    Parameters
+    ----------
+    data : fixture implementing `data`
+
+    Returns
+    -------
+    Callable[[int], Generator]:
+        A callable that takes a `count` argument and
+        returns a generator yielding `count` datasets.
+    """
+
+    def gen(count):
+        for _ in range(count):
+            yield data
+
+    return gen
+
+
+@pytest.fixture
+def data_for_sorting():
+    """
+    Length-3 array with a known sort order.
+
+    This should be three items [B, C, A] with
+    A < B < C
+
+    For boolean dtypes (for which there are only 2 values available),
+    set B=C=True
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    """
+    Length-3 array with a known sort order.
+
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture
+def na_cmp():
+    """
+    Binary operator for comparing NA values.
+
+    Should return a function of two arguments that returns
+    True if both arguments are (scalar) NA for your type.
+
+    By default, uses ``operator.is_``
+    """
+    return operator.is_
+
+
+@pytest.fixture
+def na_value(dtype):
+    """
+    The scalar missing value for this type. Default dtype.na_value.
+
+    TODO: can be removed in 3.x (see https://github.com/pandas-dev/pandas/pull/54930)
+    """
+    return dtype.na_value
+
+
+@pytest.fixture
+def data_for_grouping():
+    """
+    Data for factorization, grouping, and unique tests.
+
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+
+    Where A < B < C and NA is missing.
+
+    If a dtype has _is_boolean = True, i.e. only 2 unique non-NA entries,
+    then set C=B.
+    """
+    raise NotImplementedError
+
+
+@pytest.fixture(params=[True, False])
+def box_in_series(request):
+    """Whether to box the data in a Series"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        lambda x: 1,
+        lambda x: [1] * len(x),
+        lambda x: Series([1] * len(x)),
+        lambda x: x,
+    ],
+    ids=["scalar", "list", "series", "object"],
+)
+def groupby_apply_op(request):
+    """
+    Functions to test groupby.apply().
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_frame(request):
+    """
+    Boolean fixture to support Series and Series.to_frame() comparison testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_series(request):
+    """
+    Boolean fixture to support arr and Series(arr) comparison testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def use_numpy(request):
+    """
+    Boolean fixture to support comparison testing of ExtensionDtype array
+    and numpy array.
+    """
+    return request.param
+
+
+@pytest.fixture(params=["ffill", "bfill"])
+def fillna_method(request):
+    """
+    Parametrized fixture giving method parameters 'ffill' and 'bfill' for
+    Series.<method> testing.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def as_array(request):
+    """
+    Boolean fixture to support ExtensionDtype _from_sequence method testing.
+    """
+    return request.param
+
+
+@pytest.fixture
+def invalid_scalar(data):
+    """
+    A scalar that *cannot* be held by this ExtensionArray.
+
+    The default should work for most subclasses, but is not guaranteed.
+
+    If the array can hold any item (i.e. object dtype), then use pytest.skip.
+    """
+    return object.__new__(object)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3388d74447391dc3640c35b0cb12da2267c579c
--- /dev/null
+++ b/pandas/tests/extension/test_arrow.py
@@ -0,0 +1,3951 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+"""
+
+from __future__ import annotations
+
+from datetime import (
+    date,
+    datetime,
+    time,
+    timedelta,
+)
+from decimal import Decimal
+from io import (
+    BytesIO,
+    StringIO,
+)
+import operator
+import pickle
+import re
+import sys
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+from pandas._libs.tslibs import timezones
+from pandas.compat import (
+    PY312,
+    is_ci_environment,
+    is_platform_windows,
+    pa_version_under14p0,
+    pa_version_under19p0,
+    pa_version_under20p0,
+    pa_version_under21p0,
+)
+from pandas.compat.pyarrow import pa_version_under22p0
+from pandas.errors import Pandas4Warning
+
+from pandas.core.dtypes.common import pandas_dtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    CategoricalDtypeType,
+)
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.extensions import no_default
+from pandas.api.types import (
+    is_bool_dtype,
+    is_datetime64_any_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_numeric_dtype,
+    is_signed_integer_dtype,
+    is_string_dtype,
+    is_unsigned_integer_dtype,
+)
+from pandas.tests.extension import base
+
+pa = pytest.importorskip("pyarrow")
+
+from pandas.core.arrays.arrow.array import ArrowExtensionArray
+from pandas.core.arrays.arrow.extension_types import ArrowPeriodType
+
+
+def _require_timezone_database(request):
+    if is_platform_windows() and is_ci_environment() and pa_version_under22p0:
+        mark = pytest.mark.xfail(
+            raises=pa.ArrowInvalid,
+            reason=(
+                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
+                "on CI to path to the tzdata for pyarrow."
+            ),
+        )
+        request.applymarker(mark)
+
+
+@pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str)
+def dtype(request):
+    return ArrowDtype(pyarrow_dtype=request.param)
+
+
+@pytest.fixture
+def data(dtype):
+    pa_dtype = dtype.pyarrow_dtype
+    if pa.types.is_boolean(pa_dtype):
+        data = [True, False] * 2 + [None] + [True, False] + [None] + [True, False]
+    elif pa.types.is_floating(pa_dtype):
+        data = [1.0, 0.0] * 2 + [None] + [-2.0, -1.0] + [None] + [0.5, 99.5]
+    elif pa.types.is_signed_integer(pa_dtype):
+        data = [1, 0] * 2 + [None] + [-2, -1] + [None] + [1, 99]
+    elif pa.types.is_unsigned_integer(pa_dtype):
+        data = [1, 0] * 2 + [None] + [2, 1] + [None] + [1, 99]
+    elif pa.types.is_decimal(pa_dtype):
+        data = (
+            [Decimal("1"), Decimal("0.0")] * 2
+            + [None]
+            + [Decimal("-2.0"), Decimal("-1.0")]
+            + [None]
+            + [Decimal("0.5"), Decimal("33.123")]
+        )
+    elif pa.types.is_date(pa_dtype):
+        data = (
+            [date(2022, 1, 1), date(1999, 12, 31)] * 2
+            + [None]
+            + [date(2022, 1, 1), date(2022, 1, 1)]
+            + [None]
+            + [date(1999, 12, 31), date(1999, 12, 31)]
+        )
+    elif pa.types.is_timestamp(pa_dtype):
+        data = (
+            [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)] * 2
+            + [None]
+            + [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)]
+            + [None]
+            + [datetime(2020, 1, 1), datetime(1999, 1, 1)]
+        )
+    elif pa.types.is_duration(pa_dtype):
+        data = (
+            [timedelta(1), timedelta(1, 1)] * 2
+            + [None]
+            + [timedelta(-1), timedelta(0)]
+            + [None]
+            + [timedelta(-10), timedelta(10)]
+        )
+    elif pa.types.is_time(pa_dtype):
+        data = (
+            [time(12, 0), time(0, 12)] * 2
+            + [None]
+            + [time(0, 0), time(1, 1)]
+            + [None]
+            + [time(0, 5), time(5, 0)]
+        )
+    elif pa.types.is_string(pa_dtype):
+        data = ["a", "b"] * 2 + [None] + ["1", "2"] + [None] + ["!", ">"]
+    elif pa.types.is_binary(pa_dtype):
+        data = [b"a", b"b"] * 2 + [None] + [b"1", b"2"] + [None] + [b"!", b">"]
+    else:
+        raise NotImplementedError
+    return pd.array(data, dtype=dtype)
+
+
+@pytest.fixture
+def data_missing(data):
+    """Length-2 array with [NA, Valid]"""
+    return type(data)._from_sequence([None, data[0]], dtype=data.dtype)
+
+
+@pytest.fixture(params=["data", "data_missing"])
+def all_data(request, data, data_missing):
+    """Parametrized fixture returning 'data' or 'data_missing' integer arrays.
+
+    Used to test dtype conversion with and without missing values.
+    """
+    if request.param == "data":
+        return data
+    elif request.param == "data_missing":
+        return data_missing
+
+
+@pytest.fixture
+def data_for_grouping(dtype):
+    """
+    Data for factorization, grouping, and unique tests.
+
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+
+    Where A < B < C and NA is missing
+    """
+    pa_dtype = dtype.pyarrow_dtype
+    if pa.types.is_boolean(pa_dtype):
+        A = False
+        B = True
+        C = True
+    elif pa.types.is_floating(pa_dtype):
+        A = -1.1
+        B = 0.0
+        C = 1.1
+    elif pa.types.is_signed_integer(pa_dtype):
+        A = -1
+        B = 0
+        C = 1
+    elif pa.types.is_unsigned_integer(pa_dtype):
+        A = 0
+        B = 1
+        C = 10
+    elif pa.types.is_date(pa_dtype):
+        A = date(1999, 12, 31)
+        B = date(2010, 1, 1)
+        C = date(2022, 1, 1)
+    elif pa.types.is_timestamp(pa_dtype):
+        A = datetime(1999, 1, 1, 1, 1, 1, 1)
+        B = datetime(2020, 1, 1)
+        C = datetime(2020, 1, 1, 1)
+    elif pa.types.is_duration(pa_dtype):
+        A = timedelta(-1)
+        B = timedelta(0)
+        C = timedelta(1, 4)
+    elif pa.types.is_time(pa_dtype):
+        A = time(0, 0)
+        B = time(0, 12)
+        C = time(12, 12)
+    elif pa.types.is_string(pa_dtype):
+        A = "a"
+        B = "b"
+        C = "c"
+    elif pa.types.is_binary(pa_dtype):
+        A = b"a"
+        B = b"b"
+        C = b"c"
+    elif pa.types.is_decimal(pa_dtype):
+        A = Decimal("-1.1")
+        B = Decimal("0.0")
+        C = Decimal("1.1")
+    else:
+        raise NotImplementedError
+    return pd.array([B, B, None, None, A, A, B, C], dtype=dtype)
+
+
+@pytest.fixture
+def data_for_sorting(data_for_grouping):
+    """
+    Length-3 array with a known sort order.
+
+    This should be three items [B, C, A] with
+    A < B < C
+    """
+    return type(data_for_grouping)._from_sequence(
+        [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]],
+        dtype=data_for_grouping.dtype,
+    )
+
+
+@pytest.fixture
+def data_missing_for_sorting(data_for_grouping):
+    """
+    Length-3 array with a known sort order.
+
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    return type(data_for_grouping)._from_sequence(
+        [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]],
+        dtype=data_for_grouping.dtype,
+    )
+
+
+@pytest.fixture
+def data_for_twos(data):
+    """Length-100 array in which all the elements are two."""
+    pa_dtype = data.dtype.pyarrow_dtype
+    if (
+        pa.types.is_integer(pa_dtype)
+        or pa.types.is_floating(pa_dtype)
+        or pa.types.is_decimal(pa_dtype)
+        or pa.types.is_duration(pa_dtype)
+    ):
+        return pd.array([2] * 10, dtype=data.dtype)
+    # tests will be xfailed where 2 is not a valid scalar for pa_dtype
+    return data
+    # TODO: skip otherwise?
+
+
+class TestArrowArray(base.ExtensionTests):
+    def _construct_for_combine_add(self, left, right):
+        dtype = left.dtype
+
+        # in a couple cases, addition is not dtype-preserving
+        if dtype == "bool[pyarrow]":
+            dtype = pandas_dtype("int64[pyarrow]")
+        elif dtype == "int8[pyarrow]" and isinstance(right, type(left)):
+            dtype = pandas_dtype("int64[pyarrow]")
+
+        if isinstance(right, type(left)):
+            return left._from_sequence(
+                [a + b for (a, b) in zip(list(left), list(right), strict=True)],
+                dtype=dtype,
+            )
+        else:
+            return left._from_sequence(
+                [a + right for a in list(left)],
+                dtype=dtype,
+            )
+
+    def test_compare_scalar(self, data, comparison_op):
+        ser = pd.Series(data)
+        self._compare_other(ser, data, comparison_op, data[0])
+
+    def test_compare_range_len(self, data, comparison_op):
+        # GH#63429
+        ser = pd.Series(data)
+        range_test = range(len(ser))
+        self._compare_other(ser, range_test, comparison_op, range_test)
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data_missing, na_action, using_nan_is_na):
+        if data_missing.dtype.kind in "mM":
+            result = data_missing.map(lambda x: x, na_action=na_action)
+            expected = data_missing.to_numpy(dtype=object)
+            tm.assert_numpy_array_equal(result, expected)
+        else:
+            result = data_missing.map(lambda x: x, na_action=na_action)
+            if data_missing.dtype == "float32[pyarrow]" and using_nan_is_na:
+                # map roundtrips through objects, which converts to float64
+                expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
+            else:
+                expected = data_missing.to_numpy()
+            tm.assert_numpy_array_equal(result, expected)
+
+    def test_astype_str(self, data, request, using_infer_string):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_binary(pa_dtype):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=f"For {pa_dtype} .astype(str) decodes.",
+                )
+            )
+        elif not using_infer_string and (
+            (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+            or pa.types.is_duration(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="pd.Timestamp/pd.Timedelta repr different from numpy repr",
+                )
+            )
+        super().test_astype_str(data)
+
+    def test_from_dtype(self, data, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype):
+            if pa.types.is_string(pa_dtype):
+                reason = "ArrowDtype(pa.string()) != StringDtype('pyarrow')"
+            else:
+                reason = f"pyarrow.type_for_alias cannot infer {pa_dtype}"
+
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=reason,
+                )
+            )
+        super().test_from_dtype(data)
+
+    def test_from_sequence_pa_array(self, data):
+        # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
+        # data._pa_array = pa.ChunkedArray
+        result = type(data)._from_sequence(data._pa_array, dtype=data.dtype)
+        tm.assert_extension_array_equal(result, data)
+        assert isinstance(result._pa_array, pa.ChunkedArray)
+
+        result = type(data)._from_sequence(
+            data._pa_array.combine_chunks(), dtype=data.dtype
+        )
+        tm.assert_extension_array_equal(result, data)
+        assert isinstance(result._pa_array, pa.ChunkedArray)
+
+    def test_from_sequence_pa_array_notimplemented(self, request):
+        dtype = ArrowDtype(pa.month_day_nano_interval())
+        with pytest.raises(NotImplementedError, match="Converting strings to"):
+            ArrowExtensionArray._from_sequence_of_strings(["12-1"], dtype=dtype)
+
+    def test_from_sequence_of_strings_pa_array(self, data, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None:
+            _require_timezone_database(request)
+
+        pa_array = data._pa_array.cast(pa.string())
+        result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
+        tm.assert_extension_array_equal(result, data)
+
+        pa_array = pa_array.combine_chunks()
+        result = type(data)._from_sequence_of_strings(pa_array, dtype=data.dtype)
+        tm.assert_extension_array_equal(result, data)
+
+    def check_accumulate(self, ser, op_name, skipna):
+        result = getattr(ser, op_name)(skipna=skipna)
+
+        pa_type = ser.dtype.pyarrow_dtype
+        if pa.types.is_temporal(pa_type):
+            # Just check that we match the integer behavior.
+            if pa_type.bit_width == 32:
+                int_type = "int32[pyarrow]"
+            else:
+                int_type = "int64[pyarrow]"
+            ser = ser.astype(int_type)
+            result = result.astype(int_type)
+
+        result = result.astype("Float64")
+        expected = getattr(ser.astype("Float64"), op_name)(skipna=skipna)
+        tm.assert_series_equal(result, expected, check_dtype=False)
+
+    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
+        # attribute "pyarrow_dtype"
+        pa_type = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
+
+        if pa.types.is_binary(pa_type) or pa.types.is_decimal(pa_type):
+            if op_name in ["cumsum", "cumprod", "cummax", "cummin"]:
+                return False
+        elif pa.types.is_string(pa_type):
+            if op_name == "cumprod":
+                return False
+        elif pa.types.is_boolean(pa_type):
+            if op_name in ["cumprod", "cummax", "cummin"]:
+                return False
+        elif pa.types.is_temporal(pa_type):
+            if op_name == "cumsum" and not pa.types.is_duration(pa_type):
+                return False
+            elif op_name == "cumprod":
+                return False
+        return True
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
+        pa_type = data.dtype.pyarrow_dtype
+        op_name = all_numeric_accumulations
+
+        if pa.types.is_string(pa_type) and op_name in ["cumsum", "cummin", "cummax"]:
+            # https://github.com/pandas-dev/pandas/pull/60633
+            # Doesn't fit test structure, tested in series/test_cumulative.py instead.
+            return
+
+        ser = pd.Series(data)
+
+        if not self._supports_accumulation(ser, op_name):
+            # The base class test will check that we raise
+            return super().test_accumulate_series(
+                data, all_numeric_accumulations, skipna
+            )
+
+        if all_numeric_accumulations == "cumsum" and (
+            pa.types.is_boolean(pa_type) or pa.types.is_decimal(pa_type)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=f"{all_numeric_accumulations} not implemented for {pa_type}",
+                    raises=TypeError,
+                )
+            )
+
+        self.check_accumulate(ser, op_name, skipna)
+
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        if op_name == "kurt" or (pa_version_under20p0 and op_name == "skew"):
+            return False
+
+        dtype = ser.dtype
+        # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has
+        # no attribute "pyarrow_dtype"
+        pa_dtype = dtype.pyarrow_dtype  # type: ignore[union-attr]
+        if pa.types.is_temporal(pa_dtype) and op_name in ["sum", "var", "prod", "skew"]:
+            if pa.types.is_duration(pa_dtype) and op_name in ["sum"]:
+                # summing timedeltas is one case that *is* well-defined
+                pass
+            else:
+                return False
+        elif pa.types.is_binary(pa_dtype) and op_name in ["sum", "skew"]:
+            return False
+        elif (
+            pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
+        ) and op_name in ["mean", "median", "prod", "std", "sem", "var", "skew"]:
+            return False
+
+        if (
+            pa.types.is_temporal(pa_dtype)
+            and not pa.types.is_duration(pa_dtype)
+            and op_name in ["any", "all"]
+        ):
+            # xref GH#34479 we support this in our non-pyarrow datetime64 dtypes,
+            #  but it isn't obvious we _should_.  For now, we keep the pyarrow
+            #  behavior which does not support this.
+            return False
+
+        if pa.types.is_boolean(pa_dtype) and op_name in [
+            "median",
+            "std",
+            "var",
+            "skew",
+            "kurt",
+            "sem",
+        ]:
+            return False
+
+        return True
+
+    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
+        # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
+        # attribute "pyarrow_dtype"
+        pa_dtype = ser.dtype.pyarrow_dtype  # type: ignore[union-attr]
+        if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
+            alt = ser.astype("Float64")
+        else:
+            # TODO: in the opposite case, aren't we testing... nothing? For
+            # e.g. date/time dtypes trying to calculate 'expected' by converting
+            # to object will raise for mean, std etc
+            alt = ser
+
+        # TODO: in the opposite case, aren't we testing... nothing?
+        if op_name == "count":
+            result = getattr(ser, op_name)()
+            expected = getattr(alt, op_name)()
+        else:
+            result = getattr(ser, op_name)(skipna=skipna)
+            expected = getattr(alt, op_name)(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_boolean(
+        self, data, all_boolean_reductions, skipna, na_value, request
+    ):
+        pa_dtype = data.dtype.pyarrow_dtype
+        xfail_mark = pytest.mark.xfail(
+            raises=TypeError,
+            reason=(
+                f"{all_boolean_reductions} is not implemented in "
+                f"pyarrow={pa.__version__} for {pa_dtype}"
+            ),
+        )
+        if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype):
+            # We *might* want to make this behave like the non-pyarrow cases,
+            #  but have not yet decided.
+            request.applymarker(xfail_mark)
+
+        return super().test_reduce_series_boolean(data, all_boolean_reductions, skipna)
+
+    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
+        pa_type = arr._pa_array.type
+
+        if op_name in ["max", "min"]:
+            cmp_dtype = arr.dtype
+        elif pa.types.is_temporal(pa_type):
+            if op_name in ["std", "sem"]:
+                if pa.types.is_duration(pa_type):
+                    cmp_dtype = arr.dtype
+                elif pa.types.is_date(pa_type):
+                    cmp_dtype = ArrowDtype(pa.duration("s"))
+                elif pa.types.is_time(pa_type):
+                    cmp_dtype = ArrowDtype(pa.duration(pa_type.unit))
+                else:
+                    cmp_dtype = ArrowDtype(pa.duration(pa_type.unit))
+            else:
+                cmp_dtype = arr.dtype
+        elif arr.dtype.name == "decimal128(7, 3)[pyarrow]":
+            if op_name == "sum" and not pa_version_under21p0:
+                # https://github.com/apache/arrow/pull/44184
+                cmp_dtype = ArrowDtype(pa.decimal128(38, 3))
+            elif op_name not in ["median", "var", "std", "sem", "skew"]:
+                cmp_dtype = arr.dtype
+            else:
+                cmp_dtype = "float64[pyarrow]"
+        elif op_name in ["median", "var", "std", "mean", "skew", "sem"]:
+            cmp_dtype = "float64[pyarrow]"
+        elif op_name in ["sum", "prod"] and pa.types.is_boolean(pa_type):
+            cmp_dtype = "uint64[pyarrow]"
+        elif op_name == "sum" and pa.types.is_string(pa_type):
+            cmp_dtype = arr.dtype
+        else:
+            cmp_dtype = {
+                "i": "int64[pyarrow]",
+                "u": "uint64[pyarrow]",
+                "f": "float64[pyarrow]",
+            }[arr.dtype.kind]
+        return cmp_dtype
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
+        if (
+            not pa_version_under20p0
+            and skipna
+            and all_numeric_reductions == "skew"
+            and (
+                pa.types.is_integer(data.dtype.pyarrow_dtype)
+                or pa.types.is_floating(data.dtype.pyarrow_dtype)
+            )
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="https://github.com/apache/arrow/issues/45733",
+                )
+            )
+        return super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
+        op_name = all_numeric_reductions
+        if op_name == "skew" and pa_version_under20p0:
+            if data.dtype._is_numeric:
+                mark = pytest.mark.xfail(reason="skew not implemented")
+                request.applymarker(mark)
+        return super().test_reduce_frame(data, all_numeric_reductions, skipna)
+
+    @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
+    def test_median_not_approximate(self, typ):
+        # GH 52679
+        result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median()
+        assert result == 1.5
+
+    def test_construct_from_string_own_name(self, dtype, request):
+        pa_dtype = dtype.pyarrow_dtype
+        if pa.types.is_decimal(pa_dtype):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
+                )
+            )
+
+        if pa.types.is_string(pa_dtype):
+            # We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
+            msg = r"string\[pyarrow\] should be constructed by StringDtype"
+            with pytest.raises(TypeError, match=msg):
+                dtype.construct_from_string(dtype.name)
+
+            return
+
+        super().test_construct_from_string_own_name(dtype)
+
+    def test_is_dtype_from_name(self, dtype, request):
+        pa_dtype = dtype.pyarrow_dtype
+        if pa.types.is_string(pa_dtype):
+            # We still support StringDtype('pyarrow') over ArrowDtype(pa.string())
+            assert not type(dtype).is_dtype(dtype.name)
+        else:
+            if pa.types.is_decimal(pa_dtype):
+                request.applymarker(
+                    pytest.mark.xfail(
+                        raises=NotImplementedError,
+                        reason=f"pyarrow.type_for_alias cannot infer {pa_dtype}",
+                    )
+                )
+            super().test_is_dtype_from_name(dtype)
+
+    def test_construct_from_string_another_type_raises(self, dtype):
+        msg = r"'another_type' must end with '\[pyarrow\]'"
+        with pytest.raises(TypeError, match=msg):
+            type(dtype).construct_from_string("another_type")
+
+    def test_get_common_dtype(self, dtype, request):
+        pa_dtype = dtype.pyarrow_dtype
+        if (
+            pa.types.is_date(pa_dtype)
+            or pa.types.is_time(pa_dtype)
+            or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is not None)
+            or pa.types.is_binary(pa_dtype)
+            or pa.types.is_decimal(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=(
+                        f"{pa_dtype} does not have associated numpy "
+                        f"dtype findable by find_common_type"
+                    )
+                )
+            )
+        super().test_get_common_dtype(dtype)
+
+    def test_is_not_string_type(self, dtype):
+        pa_dtype = dtype.pyarrow_dtype
+        if pa.types.is_string(pa_dtype):
+            assert is_string_dtype(dtype)
+        else:
+            super().test_is_not_string_type(dtype)
+
+    @pytest.mark.xfail(
+        reason="GH 45419: pyarrow.ChunkedArray does not support views.", run=False
+    )
+    def test_view(self, data):
+        super().test_view(data)
+
+    def test_fillna_no_op_returns_copy(self, data):
+        data = data[~data.isna()]
+
+        valid = data[0]
+        result = data.fillna(valid)
+        assert result is not data
+        tm.assert_extension_array_equal(result, data)
+
+    def test_fillna_readonly(self, data_missing):
+        data = data_missing.copy()
+        data._readonly = True
+
+        # by default fillna(copy=True), then this works fine
+        result = data.fillna(data_missing[1])
+        assert result[0] == data_missing[1]
+        tm.assert_extension_array_equal(data, data_missing)
+
+        # fillna(copy=False) is generally not honored by Arrow-backed array,
+        # but always returns new data -> same result as above
+        result = data.fillna(data_missing[1])
+        assert result[0] == data_missing[1]
+        tm.assert_extension_array_equal(data, data_missing)
+
+    @pytest.mark.xfail(
+        reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False
+    )
+    def test_transpose(self, data):
+        super().test_transpose(data)
+
+    @pytest.mark.xfail(
+        reason="GH 45419: pyarrow.ChunkedArray does not support views", run=False
+    )
+    def test_setitem_preserves_views(self, data):
+        super().test_setitem_preserves_views(data)
+
+    @pytest.mark.parametrize("dtype_backend", ["pyarrow", no_default])
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, dtype_backend, request, using_nan_is_na):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_decimal(pa_dtype):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=NotImplementedError,
+                    reason=f"Parameterized types {pa_dtype} not supported.",
+                )
+            )
+        elif pa.types.is_timestamp(pa_dtype) and pa_dtype.unit in ("us", "ns"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=ValueError,
+                    reason="https://github.com/pandas-dev/pandas/issues/49767",
+                )
+            )
+        elif pa.types.is_binary(pa_dtype):
+            request.applymarker(
+                pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
+            )
+        df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
+        if not using_nan_is_na:
+            csv_output = df.to_csv(index=False, na_rep="NA")
+        else:
+            csv_output = df.to_csv(index=False, na_rep=np.nan)
+        if pa.types.is_binary(pa_dtype):
+            csv_output = BytesIO(csv_output)
+        else:
+            csv_output = StringIO(csv_output)
+        result = pd.read_csv(
+            csv_output,
+            dtype={"with_dtype": str(data.dtype)},
+            engine=engine,
+            dtype_backend=dtype_backend,
+        )
+        expected = df
+        tm.assert_frame_equal(result, expected)
+
+    def test_invert(self, data, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if not (
+            pa.types.is_boolean(pa_dtype)
+            or pa.types.is_integer(pa_dtype)
+            or pa.types.is_string(pa_dtype)
+        ):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"pyarrow.compute.invert does support {pa_dtype}",
+                )
+            )
+        if PY312 and pa.types.is_boolean(pa_dtype):
+            with tm.assert_produces_warning(
+                DeprecationWarning, match="Bitwise inversion", check_stacklevel=False
+            ):
+                super().test_invert(data)
+        else:
+            super().test_invert(data)
+
+    @pytest.mark.parametrize("periods", [1, -2])
+    def test_diff(self, data, periods, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_unsigned_integer(pa_dtype) and periods == 1:
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason=(
+                        f"diff with {pa_dtype} and periods={periods} will overflow"
+                    ),
+                )
+            )
+        super().test_diff(data, periods)
+
+    def test_value_counts_returns_pyarrow_int64(self, data):
+        # GH 51462
+        data = data[:10]
+        result = data.value_counts()
+        assert result.dtype == ArrowDtype(pa.int64())
+
+    _combine_le_expected_dtype = "bool[pyarrow]"
+
+    def get_op_from_name(self, op_name):
+        short_opname = op_name.strip("_")
+        if short_opname == "rtruediv":
+            # use the numpy version that won't raise on division by zero
+
+            def rtruediv(x, y):
+                return np.divide(y, x)
+
+            return rtruediv
+        elif short_opname == "rfloordiv":
+            return lambda x, y: np.floor_divide(y, x)
+
+        return tm.get_op_from_name(op_name)
+
+    # TODO: use EA._cast_pointwise_result, same with other test files that
+    #  override this
+    def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
+        # BaseOpsUtil._combine can upcast expected dtype
+        # (because it generates expected on python scalars)
+        # while ArrowExtensionArray maintains original type
+        expected = pointwise_result
+
+        if op_name in ["eq", "ne", "lt", "le", "gt", "ge"]:
+            return pointwise_result.astype("boolean[pyarrow]")
+
+        original_dtype = tm.get_dtype(expected)
+
+        was_frame = False
+        if isinstance(expected, pd.DataFrame):
+            was_frame = True
+            expected_data = expected.iloc[:, 0]
+        else:
+            expected_data = expected
+
+        # the pointwise method will have retained our original dtype, while
+        #  the op(ser, other) version will have cast to 64bit
+        if type(other) is int and op_name not in ["__floordiv__"]:
+            if original_dtype.kind == "f":
+                return expected.astype("float64[pyarrow]")
+            else:
+                return expected.astype("int64[pyarrow]")
+        elif type(other) is float:
+            return expected.astype("float64[pyarrow]")
+
+        # error: Item "ExtensionDtype" of "dtype[Any] | ExtensionDtype" has
+        #  no attribute "pyarrow_dtype"
+        orig_pa_type = original_dtype.pyarrow_dtype  # type: ignore[union-attr]
+        if not was_frame and isinstance(other, pd.Series):
+            # i.e. test_arith_series_with_array
+            if not (
+                pa.types.is_floating(orig_pa_type)
+                or (
+                    pa.types.is_integer(orig_pa_type)
+                    and op_name not in ["__truediv__", "__rtruediv__"]
+                )
+                or pa.types.is_duration(orig_pa_type)
+                or pa.types.is_timestamp(orig_pa_type)
+                or pa.types.is_date(orig_pa_type)
+                or pa.types.is_decimal(orig_pa_type)
+            ):
+                # base class _combine always returns int64, while
+                #  ArrowExtensionArray does not upcast
+                return expected
+        elif not (
+            (op_name == "__floordiv__" and pa.types.is_integer(orig_pa_type))
+            or pa.types.is_duration(orig_pa_type)
+            or pa.types.is_timestamp(orig_pa_type)
+            or pa.types.is_date(orig_pa_type)
+            or pa.types.is_decimal(orig_pa_type)
+        ):
+            # base class _combine always returns int64, while
+            #  ArrowExtensionArray does not upcast
+            return expected
+
+        pa_expected = pa.array(expected_data._values)
+
+        if pa.types.is_decimal(pa_expected.type) and pa.types.is_decimal(orig_pa_type):
+            # decimal precision can resize in the result type depending on data
+            # just compare the float values
+            alt = getattr(obj, op_name)(other)
+            alt_dtype = tm.get_dtype(alt)
+            assert isinstance(alt_dtype, ArrowDtype)
+            if op_name == "__pow__" and isinstance(other, Decimal):
+                # TODO: would it make more sense to retain Decimal here?
+                alt_dtype = ArrowDtype(pa.float64())
+            elif (
+                op_name == "__pow__"
+                and isinstance(other, pd.Series)
+                and other.dtype == original_dtype
+            ):
+                # TODO: would it make more sense to retain Decimal here?
+                alt_dtype = ArrowDtype(pa.float64())
+            else:
+                assert pa.types.is_decimal(alt_dtype.pyarrow_dtype)
+            return expected.astype(alt_dtype)
+
+        else:
+            pa_expected = pa_expected.cast(orig_pa_type)
+
+        pd_expected = type(expected_data._values)(pa_expected)
+        if was_frame:
+            expected = pd.DataFrame(
+                pd_expected, index=expected.index, columns=expected.columns
+            )
+        else:
+            expected = pd.Series(pd_expected)
+        return expected
+
+    def _is_temporal_supported(self, opname, pa_dtype):
+        return (
+            (
+                opname in ("__add__", "__radd__")
+                or (
+                    opname
+                    in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__")
+                    and not pa_version_under14p0
+                )
+            )
+            and pa.types.is_duration(pa_dtype)
+        ) or (opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype))
+
+    def _get_expected_exception(
+        self, op_name: str, obj, other
+    ) -> type[Exception] | tuple[type[Exception], ...] | None:
+        if op_name in ("__divmod__", "__rdivmod__"):
+            return (NotImplementedError, TypeError)
+
+        exc: type[Exception] | tuple[type[Exception], ...] | None
+        dtype = tm.get_dtype(obj)
+        # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
+        # attribute "pyarrow_dtype"
+        pa_dtype = dtype.pyarrow_dtype  # type: ignore[union-attr]
+
+        arrow_temporal_supported = self._is_temporal_supported(op_name, pa_dtype)
+        if op_name in {
+            "__mod__",
+            "__rmod__",
+        }:
+            exc = (NotImplementedError, TypeError)
+        elif arrow_temporal_supported:
+            exc = None
+        elif op_name in ["__add__", "__radd__"] and (
+            pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
+        ):
+            exc = None
+        elif not (
+            pa.types.is_floating(pa_dtype)
+            or pa.types.is_integer(pa_dtype)
+            or pa.types.is_decimal(pa_dtype)
+        ):
+            exc = TypeError
+        else:
+            exc = None
+        return exc
+
+    def _get_arith_xfail_marker(self, opname, pa_dtype):
+        mark = None
+
+        arrow_temporal_supported = self._is_temporal_supported(opname, pa_dtype)
+
+        if opname == "__rpow__" and (
+            pa.types.is_floating(pa_dtype)
+            or pa.types.is_integer(pa_dtype)
+            or pa.types.is_decimal(pa_dtype)
+        ):
+            mark = pytest.mark.xfail(
+                reason=(
+                    f"GH#29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
+                    f"for {pa_dtype}"
+                )
+            )
+        elif arrow_temporal_supported and (
+            pa.types.is_time(pa_dtype)
+            or (
+                opname
+                in ("__truediv__", "__rtruediv__", "__floordiv__", "__rfloordiv__")
+                and pa.types.is_duration(pa_dtype)
+            )
+        ):
+            mark = pytest.mark.xfail(
+                raises=TypeError,
+                reason=(
+                    f"{opname} not supported betweenpd.NA and {pa_dtype} Python scalar"
+                ),
+            )
+        elif opname == "__rfloordiv__" and (
+            pa.types.is_integer(pa_dtype) or pa.types.is_decimal(pa_dtype)
+        ):
+            mark = pytest.mark.xfail(
+                raises=pa.ArrowInvalid,
+                reason="divide by 0",
+            )
+        elif opname == "__rtruediv__" and pa.types.is_decimal(pa_dtype):
+            mark = pytest.mark.xfail(
+                raises=pa.ArrowInvalid,
+                reason="divide by 0",
+            )
+
+        return mark
+
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        if all_arithmetic_operators == "__rmod__" and pa.types.is_binary(pa_dtype):
+            pytest.skip("Skip testing Python string formatting")
+
+        mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
+        if mark is not None:
+            request.applymarker(mark)
+
+        super().test_arith_series_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        if all_arithmetic_operators == "__rmod__" and (
+            pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype)
+        ):
+            pytest.skip("Skip testing Python string formatting")
+
+        mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
+        if mark is not None:
+            request.applymarker(mark)
+
+        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_series_with_array(self, data, all_arithmetic_operators, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        if all_arithmetic_operators in (
+            "__sub__",
+            "__rsub__",
+        ) and pa.types.is_unsigned_integer(pa_dtype):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason=(
+                        f"Implemented pyarrow.compute.subtract_checked "
+                        f"which raises on overflow for {pa_dtype}"
+                    ),
+                )
+            )
+
+        mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
+        if mark is not None:
+            request.applymarker(mark)
+
+        op_name = all_arithmetic_operators
+        ser = pd.Series(data)
+        # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
+        # since ser.iloc[0] is a python scalar
+        other = pd.Series(pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype))
+
+        self.check_opname(ser, op_name, other)
+
+    def test_add_series_with_extension_array(self, data, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+
+        if pa_dtype.equals("int8"):
+            request.applymarker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowInvalid,
+                    reason=f"raises on overflow for {pa_dtype}",
+                )
+            )
+        super().test_add_series_with_extension_array(data)
+
+    def test_invalid_other_comp(self, data, comparison_op):
+        # GH 48833
+        with pytest.raises(
+            NotImplementedError, match=".* not implemented for <class 'object'>"
+        ):
+            comparison_op(data, object())
+
+    @pytest.mark.parametrize("masked_dtype", ["boolean", "Int64", "Float64"])
+    def test_comp_masked_numpy(self, masked_dtype, comparison_op):
+        # GH 52625
+        data = [1, 0, None]
+        ser_masked = pd.Series(data, dtype=masked_dtype)
+        ser_pa = pd.Series(data, dtype=f"{masked_dtype.lower()}[pyarrow]")
+        result = comparison_op(ser_pa, ser_masked)
+        if comparison_op in [operator.lt, operator.gt, operator.ne]:
+            exp = [False, False, None]
+        else:
+            exp = [True, True, None]
+        expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
+        tm.assert_series_equal(result, expected)
+
+    def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request):
+        pa_dtype = data.dtype.pyarrow_dtype
+        if pa.types.is_date(pa_dtype):
+            mark = pytest.mark.xfail(
+                reason="GH#62343 incorrectly casts to timestamp[ms][pyarrow]"
+            )
+            request.applymarker(mark)
+        super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)
+
+
+class TestLogicalOps:
+    """Various Series and DataFrame logical ops methods."""
+
+    def test_kleene_or(self):
+        a = pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]")
+        b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
+        result = a | b
+        expected = pd.Series(
+            [True, True, True, True, False, None, True, None, None],
+            dtype="boolean[pyarrow]",
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = b | a
+        tm.assert_series_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        tm.assert_series_equal(
+            a,
+            pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"),
+        )
+        tm.assert_series_equal(
+            b, pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
+        )
+
+    @pytest.mark.parametrize(
+        "other, expected",
+        [
+            (None, [True, None, None]),
+            (pd.NA, [True, None, None]),
+            (True, [True, True, True]),
+            (np.bool_(True), [True, True, True]),
+            (False, [True, False, None]),
+            (np.bool_(False), [True, False, None]),
+        ],
+    )
+    def test_kleene_or_scalar(self, other, expected):
+        a = pd.Series([True, False, None], dtype="boolean[pyarrow]")
+        result = a | other
+        expected = pd.Series(expected, dtype="boolean[pyarrow]")
+        tm.assert_series_equal(result, expected)
+
+        result = other | a
+        tm.assert_series_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        tm.assert_series_equal(
+            a, pd.Series([True, False, None], dtype="boolean[pyarrow]")
+        )
+
+    def test_kleene_and(self):
+        a = pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]")
+        b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
+        result = a & b
+        expected = pd.Series(
+            [True, False, None, False, False, False, None, False, None],
+            dtype="boolean[pyarrow]",
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = b & a
+        tm.assert_series_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        tm.assert_series_equal(
+            a,
+            pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"),
+        )
+        tm.assert_series_equal(
+            b, pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
+        )
+
+    @pytest.mark.parametrize(
+        "other, expected",
+        [
+            (None, [None, False, None]),
+            (pd.NA, [None, False, None]),
+            (True, [True, False, None]),
+            (False, [False, False, False]),
+            (np.bool_(True), [True, False, None]),
+            (np.bool_(False), [False, False, False]),
+        ],
+    )
+    def test_kleene_and_scalar(self, other, expected):
+        a = pd.Series([True, False, None], dtype="boolean[pyarrow]")
+        result = a & other
+        expected = pd.Series(expected, dtype="boolean[pyarrow]")
+        tm.assert_series_equal(result, expected)
+
+        result = other & a
+        tm.assert_series_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        tm.assert_series_equal(
+            a, pd.Series([True, False, None], dtype="boolean[pyarrow]")
+        )
+
+    def test_kleene_xor(self):
+        a = pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]")
+        b = pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
+        result = a ^ b
+        expected = pd.Series(
+            [False, True, None, True, False, None, None, None, None],
+            dtype="boolean[pyarrow]",
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = b ^ a
+        tm.assert_series_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        tm.assert_series_equal(
+            a,
+            pd.Series([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean[pyarrow]"),
+        )
+        tm.assert_series_equal(
+            b, pd.Series([True, False, None] * 3, dtype="boolean[pyarrow]")
+        )
+
+    @pytest.mark.parametrize(
+        "other, expected",
+        [
+            (None, [None, None, None]),
+            (pd.NA, [None, None, None]),
+            (True, [False, True, None]),
+            (np.bool_(True), [False, True, None]),
+            (np.bool_(False), [True, False, None]),
+        ],
+    )
+    def test_kleene_xor_scalar(self, other, expected):
+        a = pd.Series([True, False, None], dtype="boolean[pyarrow]")
+        result = a ^ other
+        expected = pd.Series(expected, dtype="boolean[pyarrow]")
+        tm.assert_series_equal(result, expected)
+
+        result = other ^ a
+        tm.assert_series_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        tm.assert_series_equal(
+            a, pd.Series([True, False, None], dtype="boolean[pyarrow]")
+        )
+
+    @pytest.mark.parametrize(
+        "op, exp",
+        [
+            ["__and__", True],
+            ["__or__", True],
+            ["__xor__", False],
+        ],
+    )
+    def test_logical_masked_numpy(self, op, exp):
+        # GH 52625
+        data = [True, False, None]
+        ser_masked = pd.Series(data, dtype="boolean")
+        ser_pa = pd.Series(data, dtype="boolean[pyarrow]")
+        result = getattr(ser_pa, op)(ser_masked)
+        expected = pd.Series([exp, False, None], dtype=ArrowDtype(pa.bool_()))
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
+def test_bitwise(pa_type):
+    # GH 54495
+    dtype = ArrowDtype(pa_type)
+    left = pd.Series([1, None, 3, 4], dtype=dtype)
+    right = pd.Series([None, 3, 5, 4], dtype=dtype)
+
+    result = left | right
+    expected = pd.Series([None, None, 3 | 5, 4 | 4], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = left & right
+    expected = pd.Series([None, None, 3 & 5, 4 & 4], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = left ^ right
+    expected = pd.Series([None, None, 3 ^ 5, 4 ^ 4], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ~left
+    expected = ~(left.fillna(0).to_numpy())
+    expected = pd.Series(expected, dtype=dtype).mask(left.isnull())
+    tm.assert_series_equal(result, expected)
+
+
+def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
+    with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
+        ArrowDtype.construct_from_string("not_a_real_dype[s, tz=UTC][pyarrow]")
+
+    with pytest.raises(NotImplementedError, match="Passing pyarrow type"):
+        ArrowDtype.construct_from_string("decimal(7, 2)[pyarrow]")
+
+
+def test_arrowdtype_construct_from_string_supports_dt64tz():
+    # as of GH#50689, timestamptz is supported
+    dtype = ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]")
+    expected = ArrowDtype(pa.timestamp("s", "UTC"))
+    assert dtype == expected
+
+
+def test_arrowdtype_construct_from_string_type_only_one_pyarrow():
+    # GH#51225
+    invalid = "int64[pyarrow]foobar[pyarrow]"
+    msg = (
+        r"Passing pyarrow type specific parameters \(\[pyarrow\]\) in the "
+        r"string is not supported\."
+    )
+    with pytest.raises(NotImplementedError, match=msg):
+        pd.Series(range(3), dtype=invalid)
+
+
+def test_arrow_string_multiplication():
+    # GH 56537
+    binary = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string()))
+    repeat = pd.Series([2, -2], dtype="int64[pyarrow]")
+    result = binary * repeat
+    expected = pd.Series(["abcabc", ""], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+    reflected_result = repeat * binary
+    tm.assert_series_equal(result, reflected_result)
+
+
+def test_arrow_string_multiplication_scalar_repeat():
+    binary = pd.Series(["abc", "defg"], dtype=ArrowDtype(pa.string()))
+    result = binary * 2
+    expected = pd.Series(["abcabc", "defgdefg"], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+    reflected_result = 2 * binary
+    tm.assert_series_equal(reflected_result, expected)
+
+
+@pytest.mark.parametrize(
+    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
+)
+@pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]])
+def test_quantile(data, interpolation, quantile, request):
+    pa_dtype = data.dtype.pyarrow_dtype
+
+    data = data.take([0, 0, 0])
+    ser = pd.Series(data)
+
+    if (
+        pa.types.is_string(pa_dtype)
+        or pa.types.is_binary(pa_dtype)
+        or pa.types.is_boolean(pa_dtype)
+    ):
+        # For string, bytes, and bool, we don't *expect* to have quantile work
+        # Note this matches the non-pyarrow behavior
+        msg = r"Function 'quantile' has no kernel matching input types \(.*\)"
+        with pytest.raises(pa.ArrowNotImplementedError, match=msg):
+            ser.quantile(q=quantile, interpolation=interpolation)
+        return
+
+    if (
+        pa.types.is_integer(pa_dtype)
+        or pa.types.is_floating(pa_dtype)
+        or pa.types.is_decimal(pa_dtype)
+    ):
+        pass
+    elif pa.types.is_temporal(data._pa_array.type):
+        pass
+    else:
+        request.applymarker(
+            pytest.mark.xfail(
+                raises=pa.ArrowNotImplementedError,
+                reason=f"quantile not supported by pyarrow for {pa_dtype}",
+            )
+        )
+    data = data.take([0, 0, 0])
+    ser = pd.Series(data)
+    result = ser.quantile(q=quantile, interpolation=interpolation)
+
+    if pa.types.is_timestamp(pa_dtype) and interpolation not in ["lower", "higher"]:
+        # rounding error will make the check below fail
+        #  (e.g. '2020-01-01 01:01:01.000001' vs '2020-01-01 01:01:01.000001024'),
+        #  so we'll check for now that we match the numpy analogue
+        if pa_dtype.tz:
+            pd_dtype = f"M8[{pa_dtype.unit}, {pa_dtype.tz}]"
+        else:
+            pd_dtype = f"M8[{pa_dtype.unit}]"
+        ser_np = ser.astype(pd_dtype)
+
+        expected = ser_np.quantile(q=quantile, interpolation=interpolation)
+        if quantile == 0.5:
+            if pa_dtype.unit == "us":
+                expected = expected.to_pydatetime(warn=False)
+            assert result == expected
+        else:
+            if pa_dtype.unit == "us":
+                expected = expected.dt.floor("us")
+            tm.assert_series_equal(result, expected.astype(data.dtype))
+        return
+
+    if quantile == 0.5:
+        assert result == data[0]
+    else:
+        # Just check the values
+        expected = pd.Series(data.take([0, 0]), index=[0.5, 0.5])
+        if (
+            pa.types.is_integer(pa_dtype)
+            or pa.types.is_floating(pa_dtype)
+            or pa.types.is_decimal(pa_dtype)
+        ):
+            expected = expected.astype("float64[pyarrow]")
+            result = result.astype("float64[pyarrow]")
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "take_idx, exp_idx",
+    [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
+    ids=["multi_mode", "single_mode"],
+)
+def test_mode_dropna_true(data_for_grouping, take_idx, exp_idx):
+    data = data_for_grouping.take(take_idx)
+    ser = pd.Series(data)
+    result = ser.mode(dropna=True)
+    expected = pd.Series(data_for_grouping.take(exp_idx))
+    tm.assert_series_equal(result, expected)
+
+
+def test_mode_dropna_false_mode_na(data):
+    # GH 50982
+    more_nans = pd.Series([None, None, data[0]], dtype=data.dtype)
+    result = more_nans.mode(dropna=False)
+    expected = pd.Series([None], dtype=data.dtype)
+    tm.assert_series_equal(result, expected)
+
+    expected = pd.Series([data[0], None], dtype=data.dtype)
+    result = expected.mode(dropna=False)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arrow_dtype, expected_type",
+    [
+        [pa.binary(), bytes],
+        [pa.binary(16), bytes],
+        [pa.large_binary(), bytes],
+        [pa.large_string(), str],
+        [pa.list_(pa.int64()), list],
+        [pa.large_list(pa.int64()), list],
+        [pa.map_(pa.string(), pa.int64()), list],
+        [pa.struct([("f1", pa.int8()), ("f2", pa.string())]), dict],
+        [pa.dictionary(pa.int64(), pa.int64()), CategoricalDtypeType],
+    ],
+)
+def test_arrow_dtype_type(arrow_dtype, expected_type):
+    # GH 51845
+    # TODO: Redundant with test_getitem_scalar once arrow_dtype exists in data fixture
+    assert ArrowDtype(arrow_dtype).type == expected_type
+
+
+def test_is_bool_dtype():
+    # GH 22667
+    data = ArrowExtensionArray(pa.array([True, False, True]))
+    assert is_bool_dtype(data)
+    assert pd.core.common.is_bool_indexer(data)
+    s = pd.Series(range(len(data)))
+    result = s[data]
+    expected = s[np.asarray(data)]
+    tm.assert_series_equal(result, expected)
+
+
+def test_is_numeric_dtype(data):
+    # GH 50563
+    pa_type = data.dtype.pyarrow_dtype
+    if (
+        pa.types.is_floating(pa_type)
+        or pa.types.is_integer(pa_type)
+        or pa.types.is_decimal(pa_type)
+    ):
+        assert is_numeric_dtype(data)
+    else:
+        assert not is_numeric_dtype(data)
+
+
+def test_is_integer_dtype(data):
+    # GH 50667
+    pa_type = data.dtype.pyarrow_dtype
+    if pa.types.is_integer(pa_type):
+        assert is_integer_dtype(data)
+    else:
+        assert not is_integer_dtype(data)
+
+
+def test_is_signed_integer_dtype(data):
+    pa_type = data.dtype.pyarrow_dtype
+    if pa.types.is_signed_integer(pa_type):
+        assert is_signed_integer_dtype(data)
+    else:
+        assert not is_signed_integer_dtype(data)
+
+
+def test_is_unsigned_integer_dtype(data):
+    pa_type = data.dtype.pyarrow_dtype
+    if pa.types.is_unsigned_integer(pa_type):
+        assert is_unsigned_integer_dtype(data)
+    else:
+        assert not is_unsigned_integer_dtype(data)
+
+
+def test_is_datetime64_any_dtype(data):
+    pa_type = data.dtype.pyarrow_dtype
+    if pa.types.is_timestamp(pa_type) or pa.types.is_date(pa_type):
+        assert is_datetime64_any_dtype(data)
+    else:
+        assert not is_datetime64_any_dtype(data)
+
+
+def test_is_float_dtype(data):
+    pa_type = data.dtype.pyarrow_dtype
+    if pa.types.is_floating(pa_type):
+        assert is_float_dtype(data)
+    else:
+        assert not is_float_dtype(data)
+
+
+def test_pickle_roundtrip(data):
+    # GH 42600
+    expected = pd.Series(data)
+    expected_sliced = expected.head(2)
+    full_pickled = pickle.dumps(expected)
+    sliced_pickled = pickle.dumps(expected_sliced)
+
+    assert len(full_pickled) > len(sliced_pickled)
+
+    result = pickle.loads(full_pickled)
+    tm.assert_series_equal(result, expected)
+
+    result_sliced = pickle.loads(sliced_pickled)
+    tm.assert_series_equal(result_sliced, expected_sliced)
+
+
+def test_astype_from_non_pyarrow(data):
+    # GH49795
+    np_arr = data.to_numpy()
+    pd_array = pd.array(np_arr, dtype=np_arr.dtype)
+    result = pd_array.astype(data.dtype)
+    assert not isinstance(pd_array.dtype, ArrowDtype)
+    assert isinstance(result.dtype, ArrowDtype)
+    tm.assert_extension_array_equal(result, data)
+
+
+def test_astype_float_from_non_pyarrow_str():
+    # GH50430
+    ser = pd.Series(["1.0"])
+    result = ser.astype("float64[pyarrow]")
+    expected = pd.Series([1.0], dtype="float64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_astype_errors_ignore():
+    # GH 55399
+    expected = pd.DataFrame({"col": [17000000]}, dtype="int32[pyarrow]")
+    result = expected.astype("float[pyarrow]", errors="ignore")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_to_numpy_with_defaults(data, using_nan_is_na):
+    # GH49973
+    result = data.to_numpy()
+
+    pa_type = data._pa_array.type
+    if pa.types.is_duration(pa_type) or pa.types.is_timestamp(pa_type):
+        pytest.skip("Tested in test_to_numpy_temporal")
+    elif pa.types.is_date(pa_type):
+        expected = np.array(list(data))
+    else:
+        expected = np.array(data._pa_array)
+
+    if data._hasna and (not is_numeric_dtype(data.dtype) or not using_nan_is_na):
+        expected = expected.astype(object)
+        expected[pd.isna(data)] = pd.NA
+
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_to_numpy_int_with_na(using_nan_is_na):
+    # GH51227: ensure to_numpy does not convert int to float
+    data = [1, None]
+    arr = pd.array(data, dtype="int64[pyarrow]")
+    result = arr.to_numpy()
+    if not using_nan_is_na:
+        expected = np.array([1, pd.NA], dtype=object)
+    else:
+        expected = np.array([1, np.nan])
+        assert isinstance(result[0], float)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("na_val, exp", [(lib.no_default, np.nan), (1, 1)])
+def test_to_numpy_null_array(na_val, exp):
+    # GH#52443
+    arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]")
+    result = arr.to_numpy(dtype="float64", na_value=na_val)
+    expected = np.array([exp] * 2, dtype="float64")
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_to_numpy_null_array_no_dtype():
+    # GH#52443
+    arr = pd.array([pd.NA, pd.NA], dtype="null[pyarrow]")
+    result = arr.to_numpy(dtype=None)
+    expected = np.array([pd.NA] * 2, dtype="object")
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_to_numpy_without_dtype():
+    # GH 54808
+    arr = pd.array([True, pd.NA], dtype="boolean[pyarrow]")
+    result = arr.to_numpy(na_value=False)
+    expected = np.array([True, False], dtype=np.bool_)
+    tm.assert_numpy_array_equal(result, expected)
+
+    arr = pd.array([1.0, pd.NA], dtype="float32[pyarrow]")
+    result = arr.to_numpy(na_value=0.0)
+    expected = np.array([1.0, 0.0], dtype=np.float32)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_setitem_null_slice(data):
+    # GH50248
+    orig = data.copy()
+
+    result = orig.copy()
+    result[:] = data[0]
+    expected = ArrowExtensionArray._from_sequence(
+        [data[0]] * len(data),
+        dtype=data.dtype,
+    )
+    tm.assert_extension_array_equal(result, expected)
+
+    result = orig.copy()
+    result[:] = data[::-1]
+    expected = data[::-1]
+    tm.assert_extension_array_equal(result, expected)
+
+    result = orig.copy()
+    result[:] = data.tolist()
+    expected = data
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_setitem_invalid_dtype(data):
+    # GH50248
+    pa_type = data._pa_array.type
+    if pa.types.is_string(pa_type) or pa.types.is_binary(pa_type):
+        fill_value = 123
+        err = TypeError
+        msg = "Invalid value '123' for dtype"
+    elif (
+        pa.types.is_integer(pa_type)
+        or pa.types.is_floating(pa_type)
+        or pa.types.is_boolean(pa_type)
+    ):
+        fill_value = "foo"
+        err = pa.ArrowInvalid
+        msg = "Could not convert"
+    else:
+        fill_value = "foo"
+        err = TypeError
+        msg = "Invalid value 'foo' for dtype"
+    with pytest.raises(err, match=msg):
+        data[:] = fill_value
+
+
+def test_from_arrow_respecting_given_dtype():
+    date_array = pa.array(
+        [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31")], type=pa.date32()
+    )
+    result = date_array.to_pandas(
+        types_mapper={pa.date32(): ArrowDtype(pa.date64())}.get
+    )
+    expected = pd.Series(
+        [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31")],
+        dtype=ArrowDtype(pa.date64()),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_from_arrow_respecting_given_dtype_unsafe():
+    array = pa.array([1.5, 2.5], type=pa.float64())
+    with tm.external_error_raised(pa.ArrowInvalid):
+        array.to_pandas(types_mapper={pa.float64(): ArrowDtype(pa.int64())}.get)
+
+
+def test_round():
+    dtype = "float64[pyarrow]"
+
+    ser = pd.Series([0.0, 1.23, 2.56, pd.NA], dtype=dtype)
+    result = ser.round(1)
+    expected = pd.Series([0.0, 1.2, 2.6, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+    ser = pd.Series([123.4, pd.NA, 56.78], dtype=dtype)
+    result = ser.round(-1)
+    expected = pd.Series([120.0, pd.NA, 60.0], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_searchsorted_with_na_raises(data_for_sorting, as_series):
+    # GH50447
+    b, c, a = data_for_sorting
+    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+    arr[-1] = pd.NA
+
+    if as_series:
+        arr = pd.Series(arr)
+
+    msg = (
+        "searchsorted requires array to be sorted, "
+        "which is impossible with NAs present."
+    )
+    with pytest.raises(ValueError, match=msg):
+        arr.searchsorted(b)
+
+
+def test_sort_values_dictionary():
+    df = pd.DataFrame(
+        {
+            "a": pd.Series(
+                ["x", "y"], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.string()))
+            ),
+            "b": [1, 2],
+        },
+    )
+    expected = df.copy()
+    result = df.sort_values(by=["a", "b"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("pat", ["abc", "a[a-z]{2}"])
+def test_str_count(pat):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.count(pat)
+    expected = pd.Series([1, None], dtype=ArrowDtype(pa.int32()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_count_flags_unsupported():
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    with pytest.raises(NotImplementedError, match="count not"):
+        ser.str.count("abc", flags=1)
+
+
+@pytest.mark.parametrize(
+    "side, str_func", [["left", "rjust"], ["right", "ljust"], ["both", "center"]]
+)
+def test_str_pad(side, str_func):
+    ser = pd.Series(["a", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.pad(width=3, side=side, fillchar="x")
+    expected = pd.Series(
+        [getattr("a", str_func)(3, "x"), None], dtype=ArrowDtype(pa.string())
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_pad_invalid_side():
+    ser = pd.Series(["a", None], dtype=ArrowDtype(pa.string()))
+    with pytest.raises(ValueError, match="Invalid side: foo"):
+        ser.str.pad(3, "foo", "x")
+
+
+@pytest.mark.parametrize(
+    "pat, case, na, regex, exp",
+    [
+        ["ab", False, None, False, [True, None]],
+        ["Ab", True, None, False, [False, None]],
+        ["ab", False, True, False, [True, True]],
+        ["a[a-z]{1}", False, None, True, [True, None]],
+        ["A[a-z]{1}", True, None, True, [False, None]],
+    ],
+)
+def test_str_contains(pat, case, na, regex, exp):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.contains(pat, case=case, na=na, regex=regex)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_contains_flags_unsupported():
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    with pytest.raises(NotImplementedError, match="contains not"):
+        ser.str.contains("a", flags=1)
+
+
+def test_str_contains_re2_unicode_escape():
+    # GH 63901
+    ser = pd.Series(["a", "\u0e01", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.contains(r"[\x{0e00}-\x{0e7f}]")
+    expected = pd.Series([False, True, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "side, pat, na, exp",
+    [
+        ["startswith", "ab", None, [True, None, False]],
+        ["startswith", "b", False, [False, False, False]],
+        ["endswith", "b", True, [False, True, False]],
+        ["endswith", "bc", None, [True, None, False]],
+        ["startswith", ("a", "e", "g"), None, [True, None, True]],
+        ["endswith", ("a", "c", "g"), None, [True, None, True]],
+        ["startswith", (), None, [False, None, False]],
+        ["endswith", (), None, [False, None, False]],
+    ],
+)
+def test_str_start_ends_with(side, pat, na, exp):
+    ser = pd.Series(["abc", None, "efg"], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, side)(pat, na=na)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("side", ("startswith", "endswith"))
+def test_str_starts_ends_with_all_nulls_empty_tuple(side):
+    ser = pd.Series([None, None], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, side)(())
+
+    # bool datatype preserved for all nulls.
+    expected = pd.Series([None, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arg_name, arg",
+    [["pat", re.compile("b")], ["repl", str], ["case", False], ["flags", 1]],
+)
+def test_str_replace_unsupported(arg_name, arg):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    kwargs = {"pat": "b", "repl": "x", "regex": True}
+    kwargs[arg_name] = arg
+    with pytest.raises(NotImplementedError, match="replace is not supported"):
+        ser.str.replace(**kwargs)
+
+
+@pytest.mark.parametrize(
+    "pat, repl, n, regex, exp",
+    [
+        ["a", "x", -1, False, ["xbxc", None]],
+        ["a", "x", 1, False, ["xbac", None]],
+        ["[a-b]", "x", -1, True, ["xxxc", None]],
+    ],
+)
+def test_str_replace(pat, repl, n, regex, exp):
+    ser = pd.Series(["abac", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.replace(pat, repl, n=n, regex=regex)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_replace_re2_unicode_property():
+    ser = pd.Series(["Jan", "Feb", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.replace(r"\p{Lu}", "U", regex=True)
+    expected = pd.Series(["Uan", "Ueb", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_replace_negative_n():
+    # GH 56404
+    ser = pd.Series(["abc", "aaaaaa"], dtype=ArrowDtype(pa.string()))
+    actual = ser.str.replace("a", "", -3, True)
+    expected = pd.Series(["bc", ""], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(expected, actual)
+
+    # Same bug for pyarrow-backed StringArray GH#59628
+    ser2 = ser.astype(pd.StringDtype(storage="pyarrow"))
+    actual2 = ser2.str.replace("a", "", -3, True)
+    expected2 = expected.astype(ser2.dtype)
+    tm.assert_series_equal(expected2, actual2)
+
+    ser3 = ser.astype(pd.StringDtype(storage="pyarrow", na_value=np.nan))
+    actual3 = ser3.str.replace("a", "", -3, True)
+    expected3 = expected.astype(ser3.dtype)
+    tm.assert_series_equal(expected3, actual3)
+
+
+def test_str_repeat_unsupported():
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    with pytest.raises(NotImplementedError, match="repeat is not"):
+        ser.str.repeat([1, 2])
+
+
+def test_str_repeat():
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.repeat(2)
+    expected = pd.Series(["abcabc", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, case, na, exp",
+    [
+        ["ab", False, None, [True, None]],
+        ["Ab", True, None, [False, None]],
+        ["bc", True, None, [False, None]],
+        ["ab", False, True, [True, True]],
+        ["a[a-z]{1}", False, None, [True, None]],
+        ["A[a-z]{1}", True, None, [False, None]],
+    ],
+)
+def test_str_match(pat, case, na, exp):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.match(pat, case=case, na=na)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, case, na, exp",
+    # Note: keep cases in sync with
+    # pandas/tests/strings/test_find_replace.py::test_str_fullmatch_extra_cases
+    [
+        ["abc", False, None, [True, False, False, None]],
+        ["Abc", True, None, [False, False, False, None]],
+        ["bc", True, None, [False, False, False, None]],
+        ["ab", False, None, [False, False, False, None]],
+        ["a[a-z]{2}", False, None, [True, False, False, None]],
+        ["A[a-z]{1}", True, None, [False, False, False, None]],
+        # GH Issue: #56652
+        ["abc$", False, None, [True, False, False, None]],
+        ["abc\\$", False, None, [False, True, False, None]],
+        ["Abc$", True, None, [False, False, False, None]],
+        ["Abc\\$", True, None, [False, False, False, None]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(abc)|(abx)", True, None, [True, False, False, None]],
+        ["((abc)|(abx))", True, None, [True, False, False, None]],
+    ],
+)
+def test_str_fullmatch(pat, case, na, exp):
+    ser = pd.Series(["abc", "abc$", "$abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.fullmatch(pat, case=case, na=na)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "sub, start, end, exp, exp_type",
+    [
+        ["ab", 0, None, [0, None], pa.int32()],
+        ["bc", 1, 3, [1, None], pa.int64()],
+        ["ab", 1, 3, [-1, None], pa.int64()],
+        ["ab", -3, -3, [-1, None], pa.int64()],
+    ],
+)
+def test_str_find(sub, start, end, exp, exp_type):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.find(sub, start=start, end=end)
+    expected = pd.Series(exp, dtype=ArrowDtype(exp_type))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_find_negative_start():
+    # GH 56411
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.find(sub="b", start=-1000, end=3)
+    expected = pd.Series([1, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_find_no_end():
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.find("ab", start=1)
+    expected = pd.Series([-1, None], dtype="int64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_find_negative_start_negative_end():
+    # GH 56791
+    ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.find(sub="d", start=-6, end=-3)
+    expected = pd.Series([3, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_find_large_start():
+    # GH 56791
+    ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.find(sub="d", start=16)
+    expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("start", [-15, -3, 0, 1, 15, None])
+@pytest.mark.parametrize("end", [-15, -1, 0, 3, 15, None])
+@pytest.mark.parametrize("sub", ["", "az", "abce", "a", "caa"])
+def test_str_find_e2e(start, end, sub):
+    s = pd.Series(
+        ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""],
+        dtype=ArrowDtype(pa.string()),
+    )
+    object_series = s.astype(pd.StringDtype(storage="python"))
+    result = s.str.find(sub, start, end)
+    expected = object_series.str.find(sub, start, end).astype(result.dtype)
+    tm.assert_series_equal(result, expected)
+
+    arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow"))
+    result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype)
+    tm.assert_series_equal(result2, expected)
+
+
+def test_str_find_negative_start_negative_end_no_match():
+    # GH 56791
+    ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.find(sub="d", start=-3, end=-6)
+    expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "i, exp",
+    [
+        [1, ["b", "e", None]],
+        [-1, ["c", "e", None]],
+        [2, ["c", None, None]],
+        [-3, ["a", None, None]],
+        [4, [None, None, None]],
+    ],
+)
+def test_str_get(i, exp):
+    ser = pd.Series(["abc", "de", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.get(i)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.xfail(
+    reason="TODO: StringMethods._validate should support Arrow list types",
+    raises=AttributeError,
+)
+def test_str_join():
+    ser = pd.Series(ArrowExtensionArray(pa.array([list("abc"), list("123"), None])))
+    result = ser.str.join("=")
+    expected = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_join_string_type():
+    ser = pd.Series(ArrowExtensionArray(pa.array(["abc", "123", None])))
+    result = ser.str.join("=")
+    expected = pd.Series(["a=b=c", "1=2=3", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start, stop, step, exp",
+    [
+        [None, 2, None, ["ab", None]],
+        [None, 2, 1, ["ab", None]],
+        [1, 3, 1, ["bc", None]],
+        (None, None, -1, ["dcba", None]),
+    ],
+)
+def test_str_slice(start, stop, step, exp):
+    ser = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.slice(start, stop, step)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start, stop, repl, exp",
+    [
+        [1, 2, "x", ["axcd", None]],
+        [None, 2, "x", ["xcd", None]],
+        [None, 2, None, ["cd", None]],
+    ],
+)
+def test_str_slice_replace(start, stop, repl, exp):
+    ser = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.slice_replace(start, stop, repl)
+    expected = pd.Series(exp, dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "value, method, exp",
+    [
+        ["a1c", "isalnum", True],
+        ["!|,", "isalnum", False],
+        ["aaa", "isalpha", True],
+        ["!!!", "isalpha", False],
+        ["٠", "isdecimal", True],  # noqa: RUF001
+        ["~!", "isdecimal", False],
+        ["2", "isdigit", True],
+        ["~", "isdigit", False],
+        ["aaa", "islower", True],
+        ["aaA", "islower", False],
+        ["123", "isnumeric", True],
+        ["11I", "isnumeric", False],
+        [" ", "isspace", True],
+        ["", "isspace", False],
+        ["The That", "istitle", True],
+        ["the That", "istitle", False],
+        ["AAA", "isupper", True],
+        ["AAc", "isupper", False],
+    ],
+)
+def test_str_is_functions(value, method, exp):
+    ser = pd.Series([value, None], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, method)()
+    expected = pd.Series([exp, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        ["capitalize", "Abc def"],
+        ["title", "Abc Def"],
+        ["swapcase", "AbC Def"],
+        ["lower", "abc def"],
+        ["upper", "ABC DEF"],
+        ["casefold", "abc def"],
+    ],
+)
+def test_str_transform_functions(method, exp):
+    ser = pd.Series(["aBc dEF", None], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, method)()
+    expected = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_len():
+    ser = pd.Series(["abcd", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.len()
+    expected = pd.Series([4, None], dtype=ArrowDtype(pa.int32()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, to_strip, val",
+    [
+        ["strip", None, " abc "],
+        ["strip", "x", "xabcx"],
+        ["lstrip", None, " abc"],
+        ["lstrip", "x", "xabc"],
+        ["rstrip", None, "abc "],
+        ["rstrip", "x", "abcx"],
+    ],
+)
+def test_str_strip(method, to_strip, val):
+    ser = pd.Series([val, None], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, method)(to_strip=to_strip)
+    expected = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", ["abc123", "abc"])
+def test_str_removesuffix(val):
+    ser = pd.Series([val, None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.removesuffix("123")
+    expected = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("val", ["123abc", "abc"])
+def test_str_removeprefix(val):
+    ser = pd.Series([val, None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.removeprefix("123")
+    expected = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("errors", ["ignore", "strict"])
+@pytest.mark.parametrize(
+    "encoding, exp",
+    [
+        ("utf8", {"little": b"abc", "big": "abc"}),
+        (
+            "utf32",
+            {
+                "little": b"\xff\xfe\x00\x00a\x00\x00\x00b\x00\x00\x00c\x00\x00\x00",
+                "big": b"\x00\x00\xfe\xff\x00\x00\x00a\x00\x00\x00b\x00\x00\x00c",
+            },
+        ),
+    ],
+    ids=["utf8", "utf32"],
+)
+def test_str_encode(errors, encoding, exp):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.encode(encoding, errors)
+    expected = pd.Series([exp[sys.byteorder], None], dtype=ArrowDtype(pa.binary()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("flags", [0, 2])
+def test_str_findall(flags):
+    ser = pd.Series(["abc", "efg", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.findall("b", flags=flags)
+    expected = pd.Series([["b"], [], None], dtype=ArrowDtype(pa.list_(pa.string())))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["index", "rindex"])
+@pytest.mark.parametrize(
+    "start, end",
+    [
+        [0, None],
+        [1, 4],
+    ],
+)
+def test_str_r_index(method, start, end):
+    ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, method)("c", start, end)
+    expected = pd.Series([2, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+    with pytest.raises(ValueError, match="substring not found"):
+        getattr(ser.str, method)("foo", start, end)
+
+
+@pytest.mark.parametrize("form", ["NFC", "NFKC"])
+def test_str_normalize(form):
+    ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.normalize(form)
+    expected = ser.copy()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start, end",
+    [
+        [0, None],
+        [1, 4],
+    ],
+)
+def test_str_rfind(start, end):
+    ser = pd.Series(["abcba", "foo", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.rfind("c", start, end)
+    expected = pd.Series([2, -1, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_translate():
+    ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.translate({97: "b"})
+    expected = pd.Series(["bbcbb", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_wrap():
+    ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.wrap(3)
+    expected = pd.Series(["abc\nba", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_get_dummies():
+    ser = pd.Series(["a|b", None, "a|c"], dtype=ArrowDtype(pa.string()))
+    result = ser.str.get_dummies()
+    expected = pd.DataFrame(
+        [[True, True, False], [False, False, False], [True, False, True]],
+        dtype=ArrowDtype(pa.bool_()),
+        columns=["a", "b", "c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_str_partition():
+    ser = pd.Series(["abcba", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.partition("b")
+    expected = pd.DataFrame(
+        [["a", "b", "cba"], [None, None, None]],
+        dtype=ArrowDtype(pa.string()),
+        columns=pd.RangeIndex(3),
+    )
+    tm.assert_frame_equal(result, expected, check_column_type=True)
+
+    result = ser.str.partition("b", expand=False)
+    expected = pd.Series(ArrowExtensionArray(pa.array([["a", "b", "cba"], None])))
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.rpartition("b")
+    expected = pd.DataFrame(
+        [["abc", "b", "a"], [None, None, None]],
+        dtype=ArrowDtype(pa.string()),
+        columns=pd.RangeIndex(3),
+    )
+    tm.assert_frame_equal(result, expected, check_column_type=True)
+
+    result = ser.str.rpartition("b", expand=False)
+    expected = pd.Series(ArrowExtensionArray(pa.array([["abc", "b", "a"], None])))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["rsplit", "split"])
+def test_str_split_pat_none(method):
+    # GH 56271
+    ser = pd.Series(["a1 cbc\nb", None], dtype=ArrowDtype(pa.string()))
+    result = getattr(ser.str, method)()
+    expected = pd.Series(ArrowExtensionArray(pa.array([["a1", "cbc", "b"], None])))
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_split():
+    # GH 52401
+    ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.split("c")
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([["a1", "b", "b"], ["a2", "b", "b"], None]))
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.split("c", n=1)
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([["a1", "bcb"], ["a2", "bcb"], None]))
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.split("[1-2]", regex=True)
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([["a", "cbcb"], ["a", "cbcb"], None]))
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.split("[1-2]", regex=True, expand=True)
+    expected = pd.DataFrame(
+        {
+            0: ArrowExtensionArray(pa.array(["a", "a", None])),
+            1: ArrowExtensionArray(pa.array(["cbcb", "cbcb", None])),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = ser.str.split("1", expand=True)
+    expected = pd.DataFrame(
+        {
+            0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])),
+            1: ArrowExtensionArray(pa.array(["cbcb", None, None])),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_str_rsplit():
+    # GH 52401
+    ser = pd.Series(["a1cbcb", "a2cbcb", None], dtype=ArrowDtype(pa.string()))
+    result = ser.str.rsplit("c")
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([["a1", "b", "b"], ["a2", "b", "b"], None]))
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.rsplit("c", n=1)
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([["a1cb", "b"], ["a2cb", "b"], None]))
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.rsplit("c", n=1, expand=True)
+    expected = pd.DataFrame(
+        {
+            0: ArrowExtensionArray(pa.array(["a1cb", "a2cb", None])),
+            1: ArrowExtensionArray(pa.array(["b", "b", None])),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = ser.str.rsplit("1", expand=True)
+    expected = pd.DataFrame(
+        {
+            0: ArrowExtensionArray(pa.array(["a", "a2cbcb", None])),
+            1: ArrowExtensionArray(pa.array(["cbcb", None, None])),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_str_extract_non_symbolic():
+    ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
+    with pytest.raises(ValueError, match="pat=.* must contain a symbolic group name."):
+        ser.str.extract(r"[ab](\d)")
+
+
+@pytest.mark.parametrize("expand", [True, False])
+def test_str_extract(expand):
+    ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
+    result = ser.str.extract(r"(?P<letter>[ab])(?P<digit>\d)", expand=expand)
+    expected = pd.DataFrame(
+        {
+            "letter": ArrowExtensionArray(pa.array(["a", "b", None])),
+            "digit": ArrowExtensionArray(pa.array(["1", "2", None])),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_str_extract_expand():
+    ser = pd.Series(["a1", "b2", "c3"], dtype=ArrowDtype(pa.string()))
+    result = ser.str.extract(r"[ab](?P<digit>\d)", expand=True)
+    expected = pd.DataFrame(
+        {
+            "digit": ArrowExtensionArray(pa.array(["1", "2", None])),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = ser.str.extract(r"[ab](?P<digit>\d)", expand=False)
+    expected = pd.Series(ArrowExtensionArray(pa.array(["1", "2", None])), name="digit")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("unit", ["ns", "us", "ms", "s"])
+def test_duration_from_strings_with_nat(unit):
+    # GH51175
+    strings = ["1000", "NaT"]
+    pa_type = pa.duration(unit)
+    dtype = ArrowDtype(pa_type)
+    result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype)
+    expected = ArrowExtensionArray(pa.array([1000, None], type=pa_type))
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_unsupported_dt(data):
+    pa_dtype = data.dtype.pyarrow_dtype
+    if not pa.types.is_temporal(pa_dtype):
+        with pytest.raises(
+            AttributeError, match="Can only use .dt accessor with datetimelike values"
+        ):
+            pd.Series(data).dt
+
+
+@pytest.mark.parametrize(
+    "prop, expected",
+    [
+        ["year", 2023],
+        ["day", 2],
+        ["day_of_week", 0],
+        ["dayofweek", 0],
+        ["weekday", 0],
+        ["day_of_year", 2],
+        ["dayofyear", 2],
+        ["hour", 3],
+        ["minute", 4],
+        ["is_leap_year", False],
+        ["microsecond", 2000],
+        ["month", 1],
+        ["nanosecond", 6],
+        ["quarter", 1],
+        ["second", 7],
+        ["date", date(2023, 1, 2)],
+        ["time", time(3, 4, 7, 2000)],
+    ],
+)
+def test_dt_properties(prop, expected):
+    ser = pd.Series(
+        [
+            pd.Timestamp(
+                year=2023,
+                month=1,
+                day=2,
+                hour=3,
+                minute=4,
+                second=7,
+                microsecond=2000,
+                nanosecond=6,
+            ),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    result = getattr(ser.dt, prop)
+    exp_type = None
+    if isinstance(expected, date):
+        exp_type = pa.date32()
+    elif isinstance(expected, time):
+        exp_type = pa.time64("ns")
+    expected = pd.Series(ArrowExtensionArray(pa.array([expected, None], type=exp_type)))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("microsecond", [2000, 5, 0])
+def test_dt_microsecond(microsecond):
+    # GH 59183
+    ser = pd.Series(
+        [
+            pd.Timestamp(
+                year=2024,
+                month=7,
+                day=7,
+                second=5,
+                microsecond=microsecond,
+                nanosecond=6,
+            ),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    result = ser.dt.microsecond
+    expected = pd.Series([microsecond, None], dtype="int64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_is_month_start_end():
+    ser = pd.Series(
+        [
+            datetime(year=2023, month=12, day=2, hour=3),
+            datetime(year=2023, month=1, day=1, hour=3),
+            datetime(year=2023, month=3, day=31, hour=3),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("us")),
+    )
+    result = ser.dt.is_month_start
+    expected = pd.Series([False, True, False, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+    result = ser.dt.is_month_end
+    expected = pd.Series([False, False, True, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_is_year_start_end():
+    ser = pd.Series(
+        [
+            datetime(year=2023, month=12, day=31, hour=3),
+            datetime(year=2023, month=1, day=1, hour=3),
+            datetime(year=2023, month=3, day=31, hour=3),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("us")),
+    )
+    result = ser.dt.is_year_start
+    expected = pd.Series([False, True, False, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+    result = ser.dt.is_year_end
+    expected = pd.Series([True, False, False, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_is_quarter_start_end():
+    ser = pd.Series(
+        [
+            datetime(year=2023, month=11, day=30, hour=3),
+            datetime(year=2023, month=1, day=1, hour=3),
+            datetime(year=2023, month=3, day=31, hour=3),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("us")),
+    )
+    result = ser.dt.is_quarter_start
+    expected = pd.Series([False, True, False, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+    result = ser.dt.is_quarter_end
+    expected = pd.Series([False, False, True, None], dtype=ArrowDtype(pa.bool_()))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["days_in_month", "daysinmonth"])
+def test_dt_days_in_month(method):
+    ser = pd.Series(
+        [
+            datetime(year=2023, month=3, day=30, hour=3),
+            datetime(year=2023, month=4, day=1, hour=3),
+            datetime(year=2023, month=2, day=3, hour=3),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("us")),
+    )
+    result = getattr(ser.dt, method)
+    expected = pd.Series([31, 30, 28, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_normalize():
+    ser = pd.Series(
+        [
+            datetime(year=2023, month=3, day=30),
+            datetime(year=2023, month=4, day=1, hour=3),
+            datetime(year=2023, month=2, day=3, hour=23, minute=59, second=59),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("us")),
+    )
+    result = ser.dt.normalize()
+    expected = pd.Series(
+        [
+            datetime(year=2023, month=3, day=30),
+            datetime(year=2023, month=4, day=1),
+            datetime(year=2023, month=2, day=3),
+            None,
+        ],
+        dtype=ArrowDtype(pa.timestamp("us")),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("unit", ["us", "ns"])
+def test_dt_time_preserve_unit(unit):
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp(unit)),
+    )
+    assert ser.dt.unit == unit
+
+    result = ser.dt.time
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([time(3, 0), None], type=pa.time64(unit)))
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
+def test_dt_tz(tz):
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns", tz=tz)),
+    )
+    result = ser.dt.tz
+    assert result == timezones.maybe_get_tz(tz)
+
+
+def test_dt_isocalendar():
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    result = ser.dt.isocalendar()
+    expected = pd.DataFrame(
+        [[2023, 1, 1], [0, 0, 0]],
+        columns=["year", "week", "day"],
+        dtype="int64[pyarrow]",
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp", [["day_name", "Sunday"], ["month_name", "January"]]
+)
+def test_dt_day_month_name(method, exp, request):
+    # GH 52388
+    _require_timezone_database(request)
+
+    ser = pd.Series([datetime(2023, 1, 1), None], dtype=ArrowDtype(pa.timestamp("ms")))
+    result = getattr(ser.dt, method)()
+    expected = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_strftime(request):
+    _require_timezone_database(request)
+
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    result = ser.dt.strftime("%Y-%m-%dT%H:%M:%S")
+    expected = pd.Series(
+        ["2023-01-02T03:00:00.000000000", None], dtype=ArrowDtype(pa.string())
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["ceil", "floor", "round"])
+def test_dt_roundlike_tz_options_not_supported(method):
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    with pytest.raises(NotImplementedError, match="ambiguous is not supported."):
+        getattr(ser.dt, method)("1h", ambiguous="NaT")
+
+    with pytest.raises(NotImplementedError, match="nonexistent is not supported."):
+        getattr(ser.dt, method)("1h", nonexistent="NaT")
+
+
+@pytest.mark.parametrize("method", ["ceil", "floor", "round"])
+def test_dt_roundlike_unsupported_freq(method):
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    with pytest.raises(ValueError, match="freq='1B' is not supported"):
+        getattr(ser.dt, method)("1B")
+
+    with pytest.raises(ValueError, match="Must specify a valid frequency: None"):
+        getattr(ser.dt, method)(None)
+
+
+@pytest.mark.parametrize("freq", ["D", "h", "min", "s", "ms", "us", "ns"])
+@pytest.mark.parametrize("method", ["ceil", "floor", "round"])
+def test_dt_ceil_year_floor(freq, method):
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=1), None],
+    )
+    pa_dtype = ArrowDtype(pa.timestamp("ns"))
+    expected = getattr(ser.dt, method)(f"1{freq}").astype(pa_dtype)
+    result = getattr(ser.astype(pa_dtype).dt, method)(f"1{freq}")
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_to_pydatetime():
+    # GH 51859
+    data = [datetime(2022, 1, 1), datetime(2023, 1, 1)]
+    ser = pd.Series(data, dtype=ArrowDtype(pa.timestamp("ns")))
+    result = ser.dt.to_pydatetime()
+    expected = pd.Series(data, dtype=object)
+    tm.assert_series_equal(result, expected)
+    assert all(type(expected.iloc[i]) is datetime for i in range(len(expected)))
+
+    expected = ser.astype("datetime64[ns]").dt.to_pydatetime()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("date_type", [32, 64])
+def test_dt_to_pydatetime_date_error(date_type):
+    # GH 52812
+    ser = pd.Series(
+        [date(2022, 12, 31)],
+        dtype=ArrowDtype(getattr(pa, f"date{date_type}")()),
+    )
+    with pytest.raises(ValueError, match="to_pydatetime cannot be called with"):
+        ser.dt.to_pydatetime()
+
+
+def test_dt_tz_localize_unsupported_tz_options():
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    with pytest.raises(NotImplementedError, match="ambiguous='NaT' is not supported"):
+        ser.dt.tz_localize("UTC", ambiguous="NaT")
+
+    with pytest.raises(NotImplementedError, match="nonexistent='NaT' is not supported"):
+        ser.dt.tz_localize("UTC", nonexistent="NaT")
+
+
+def test_dt_tz_localize_none(request):
+    _require_timezone_database(request)
+
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns", tz="US/Pacific")),
+    )
+    result = ser.dt.tz_localize(None)
+    expected = pd.Series(
+        [ser[0].tz_localize(None), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("unit", ["us", "ns"])
+def test_dt_tz_localize(unit, request):
+    _require_timezone_database(request)
+
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp(unit)),
+    )
+    result = ser.dt.tz_localize("US/Pacific")
+    exp_data = pa.array(
+        [datetime(year=2023, month=1, day=2, hour=3), None], type=pa.timestamp(unit)
+    )
+    exp_data = pa.compute.assume_timezone(exp_data, "US/Pacific")
+    expected = pd.Series(ArrowExtensionArray(exp_data))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "nonexistent, exp_date",
+    [
+        ["shift_forward", datetime(year=2023, month=3, day=12, hour=3)],
+        ["shift_backward", pd.Timestamp("2023-03-12 01:59:59.999999999")],
+    ],
+)
+def test_dt_tz_localize_nonexistent(nonexistent, exp_date, request):
+    _require_timezone_database(request)
+
+    ser = pd.Series(
+        [datetime(year=2023, month=3, day=12, hour=2, minute=30), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    result = ser.dt.tz_localize("US/Pacific", nonexistent=nonexistent)
+    exp_data = pa.array([exp_date, None], type=pa.timestamp("ns"))
+    exp_data = pa.compute.assume_timezone(exp_data, "US/Pacific")
+    expected = pd.Series(ArrowExtensionArray(exp_data))
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_tz_convert_not_tz_raises():
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    with pytest.raises(TypeError, match="Cannot convert tz-naive timestamps"):
+        ser.dt.tz_convert("UTC")
+
+
+def test_dt_tz_convert_none():
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp("ns", "US/Pacific")),
+    )
+    result = ser.dt.tz_convert(None)
+    expected = pd.Series(
+        [ser[0].tz_convert(None), None],
+        dtype=ArrowDtype(pa.timestamp("ns")),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("unit", ["us", "ns"])
+def test_dt_tz_convert(unit):
+    ser = pd.Series(
+        [datetime(year=2023, month=1, day=2, hour=3), None],
+        dtype=ArrowDtype(pa.timestamp(unit, "US/Pacific")),
+    )
+    result = ser.dt.tz_convert("US/Eastern")
+    expected = pd.Series(
+        [ser[0].tz_convert("US/Eastern"), None],
+        dtype=ArrowDtype(pa.timestamp(unit, "US/Eastern")),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["timestamp[ms][pyarrow]", "duration[ms][pyarrow]"])
+def test_as_unit(dtype):
+    # GH 52284
+    ser = pd.Series([1000, None], dtype=dtype)
+    result = ser.dt.as_unit("ns")
+    expected = ser.astype(dtype.replace("ms", "ns"))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "from_unit,to_unit",
+    [
+        ("ns", "us"),
+        ("ns", "ms"),
+        ("ns", "s"),
+        ("us", "ms"),
+        ("us", "s"),
+        ("ms", "s"),
+        ("s", "ms"),
+        ("s", "us"),
+        ("s", "ns"),
+        ("ms", "us"),
+        ("ms", "ns"),
+        ("us", "ns"),
+    ],
+)
+def test_as_unit_duration_truncation(from_unit, to_unit):
+    # Test that as_unit truncates correctly (matches NumPy behavior)
+    # Value with sub-unit precision to test truncation
+    ser_numpy = pd.Series(
+        pd.to_timedelta([93784567890123, None], unit="ns").as_unit(from_unit)
+    )
+    ser_arrow = ser_numpy.astype(f"duration[{from_unit}][pyarrow]")
+
+    result = ser_arrow.dt.as_unit(to_unit)
+    expected = ser_numpy.dt.as_unit(to_unit).astype(f"duration[{to_unit}][pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "from_unit,to_unit",
+    [
+        ("ns", "us"),
+        ("ns", "ms"),
+        ("ns", "s"),
+        ("s", "ns"),
+        ("ms", "ns"),
+        ("us", "ns"),
+    ],
+)
+def test_as_unit_timestamp(from_unit, to_unit):
+    # Test timestamp as_unit matches NumPy behavior
+    # Create Arrow series directly to preserve nulls correctly
+    ser_arrow = pd.Series(
+        [pd.Timestamp("2024-01-15 12:30:45.123456789"), None],
+        dtype=f"timestamp[{from_unit}][pyarrow]",
+    )
+    ser_numpy = ser_arrow.astype(f"datetime64[{from_unit}]")
+
+    result = ser_arrow.dt.as_unit(to_unit)
+    expected_numpy = ser_numpy.dt.as_unit(to_unit)
+    # Compare values (excluding null handling differences)
+    tm.assert_almost_equal(
+        result.dropna().to_numpy(dtype=f"datetime64[{to_unit}]"),
+        expected_numpy.dropna().to_numpy(),
+    )
+    # Verify nulls are preserved
+    assert result.isna().sum() == ser_arrow.isna().sum()
+
+
+@pytest.mark.parametrize("to_unit", ["s", "ms", "us", "ns"])
+def test_as_unit_timestamp_with_timezone(to_unit):
+    # Test that timezone is preserved
+    ser_numpy = pd.Series(
+        pd.to_datetime(["2024-01-15 12:30:45.123456789"])
+        .tz_localize("US/Eastern")
+        .as_unit("ns")
+    )
+    ser_arrow = ser_numpy.astype("timestamp[ns, US/Eastern][pyarrow]")
+
+    result = ser_arrow.dt.as_unit(to_unit)
+    expected = ser_numpy.dt.as_unit(to_unit).astype(
+        f"timestamp[{to_unit}, US/Eastern][pyarrow]"
+    )
+    tm.assert_series_equal(result, expected)
+    assert str(result.dtype) == f"timestamp[{to_unit}, tz=US/Eastern][pyarrow]"
+
+
+def test_as_unit_date_raises():
+    # as_unit should raise for date types
+    ser = pd.Series([1, 2], dtype=ArrowDtype(pa.date32()))
+    with pytest.raises(NotImplementedError, match="as_unit not implemented"):
+        ser.dt.as_unit("ns")
+
+
+@pytest.mark.parametrize(
+    "prop, expected",
+    [
+        ["days", 1],
+        ["seconds", 2],
+        ["microseconds", 3],
+        ["nanoseconds", 4],
+    ],
+)
+def test_dt_timedelta_properties(prop, expected):
+    # GH 52284
+    ser = pd.Series(
+        [
+            pd.Timedelta(
+                days=1,
+                seconds=2,
+                microseconds=3,
+                nanoseconds=4,
+            ),
+            None,
+        ],
+        dtype=ArrowDtype(pa.duration("ns")),
+    )
+    result = getattr(ser.dt, prop)
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([expected, None], type=pa.int32()))
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_timedelta_total_seconds():
+    # GH 52284
+    ser = pd.Series(
+        [
+            pd.Timedelta(
+                days=1,
+                seconds=2,
+                microseconds=3,
+                nanoseconds=4,
+            ),
+            None,
+        ],
+        dtype=ArrowDtype(pa.duration("ns")),
+    )
+    result = ser.dt.total_seconds()
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([86402.000003, None], type=pa.float64()))
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_dt_to_pytimedelta():
+    # GH 52284
+    data = [timedelta(1, 2, 3), timedelta(1, 2, 4)]
+    ser = pd.Series(data, dtype=ArrowDtype(pa.duration("ns")))
+
+    msg = "The behavior of ArrowTemporalProperties.to_pytimedelta is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        result = ser.dt.to_pytimedelta()
+    expected = np.array(data, dtype=object)
+    tm.assert_numpy_array_equal(result, expected)
+    assert all(type(res) is timedelta for res in result)
+
+    msg = "The behavior of TimedeltaProperties.to_pytimedelta is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        expected = ser.astype("timedelta64[ns]").dt.to_pytimedelta()
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_dt_components():
+    # GH 52284
+    ser = pd.Series(
+        [
+            pd.Timedelta(
+                days=1,
+                seconds=2,
+                microseconds=3,
+                nanoseconds=4,
+            ),
+            None,
+        ],
+        dtype=ArrowDtype(pa.duration("ns")),
+    )
+    result = ser.dt.components
+    expected = pd.DataFrame(
+        [[1, 0, 0, 2, 0, 3, 4], [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA]],
+        columns=[
+            "days",
+            "hours",
+            "minutes",
+            "seconds",
+            "milliseconds",
+            "microseconds",
+            "nanoseconds",
+        ],
+        dtype="int32[pyarrow]",
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dt_components_large_values():
+    ser = pd.Series(
+        [
+            pd.Timedelta("365 days 23:59:59.999000"),
+            None,
+        ],
+        dtype=ArrowDtype(pa.duration("ns")),
+    )
+    result = ser.dt.components
+    expected = pd.DataFrame(
+        [
+            [365, 23, 59, 59, 999, 0, 0],
+            [pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA, pd.NA],
+        ],
+        columns=[
+            "days",
+            "hours",
+            "minutes",
+            "seconds",
+            "milliseconds",
+            "microseconds",
+            "nanoseconds",
+        ],
+        dtype="int32[pyarrow]",
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("skipna", [True, False])
+def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
+    # GH51624
+    ser = pd.Series([None], dtype="float64[pyarrow]")
+    result = getattr(ser, all_boolean_reductions)(skipna=skipna)
+    if skipna:
+        expected = all_boolean_reductions == "all"
+    else:
+        expected = pd.NA
+    assert result is expected
+
+
+def test_from_sequence_of_strings_boolean():
+    true_strings = ["true", "TRUE", "True", "1", "1.0"]
+    false_strings = ["false", "FALSE", "False", "0", "0.0"]
+    nulls = [None]
+    strings = true_strings + false_strings + nulls
+    bools = (
+        [True] * len(true_strings) + [False] * len(false_strings) + [None] * len(nulls)
+    )
+
+    dtype = ArrowDtype(pa.bool_())
+    result = ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype)
+    expected = pd.array(bools, dtype="boolean[pyarrow]")
+    tm.assert_extension_array_equal(result, expected)
+
+    strings = ["True", "foo"]
+    with pytest.raises(pa.ArrowInvalid, match="Failed to parse"):
+        ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype)
+
+
+def test_concat_empty_arrow_backed_series(dtype):
+    # GH#51734
+    ser = pd.Series([], dtype=dtype)
+    expected = ser.copy()
+    result = pd.concat([ser[np.array([], dtype=np.bool_)]])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["string", "string[pyarrow]"])
+def test_series_from_string_array(dtype):
+    arr = pa.array("the quick brown fox".split())
+    ser = pd.Series(arr, dtype=dtype)
+    expected = pd.Series(ArrowExtensionArray(arr), dtype=dtype)
+    tm.assert_series_equal(ser, expected)
+
+
+# _data was renamed to _pa_data
+class OldArrowExtensionArray(ArrowExtensionArray):
+    def __getstate__(self):
+        state = super().__getstate__()
+        state["_data"] = state.pop("_pa_array")
+        return state
+
+
+def test_pickle_old_arrowextensionarray():
+    data = pa.array([1])
+    expected = OldArrowExtensionArray(data)
+    result = pickle.loads(pickle.dumps(expected))
+    tm.assert_extension_array_equal(result, expected)
+    assert result._pa_array == pa.chunked_array(data)
+    assert not hasattr(result, "_data")
+
+
+def test_setitem_boolean_replace_with_mask_segfault():
+    # GH#52059
+    N = 145_000
+    arr = ArrowExtensionArray(pa.chunked_array([np.ones((N,), dtype=np.bool_)]))
+    expected = arr.copy()
+    arr[np.zeros((N,), dtype=np.bool_)] = False
+    assert arr._pa_array == expected._pa_array
+
+
+@pytest.mark.parametrize(
+    "data, arrow_dtype",
+    [
+        ([b"a", b"b"], pa.large_binary()),
+        (["a", "b"], pa.large_string()),
+    ],
+)
+def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype):
+    dtype = ArrowDtype(arrow_dtype)
+    result = pd.array(np.array(data), dtype=dtype)
+    expected = pd.array(data, dtype=dtype)
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_concat_null_array():
+    df = pd.DataFrame({"a": [None, None]}, dtype=ArrowDtype(pa.null()))
+    df2 = pd.DataFrame({"a": [0, 1]}, dtype="int64[pyarrow]")
+
+    result = pd.concat([df, df2], ignore_index=True)
+    expected = pd.DataFrame({"a": [None, None, 0, 1]}, dtype="int64[pyarrow]")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES)
+def test_describe_numeric_data(pa_type):
+    # GH 52470
+    data = pd.Series([1, 2, 3], dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [3, 2, 1, 1, 1.5, 2.0, 2.5, 3],
+        dtype=ArrowDtype(pa.float64()),
+        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
+def test_describe_timedelta_data(pa_type):
+    # GH53001
+    data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [9, *pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist()],
+        dtype=object,
+        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
+def test_describe_datetime_data(pa_type):
+    # GH53001
+    data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [9]
+        + [
+            pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
+            for v in [5, 1, 3, 5, 7, 9]
+        ],
+        dtype=object,
+        index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_quantile_temporal(pa_type):
+    # GH52678
+    data = [1, 2, 3]
+    ser = pd.Series(data, dtype=ArrowDtype(pa_type))
+    result = ser.quantile(0.1)
+    expected = ser[0]
+    assert result == expected
+
+
+def test_date32_repr():
+    # GH48238
+    arrow_dt = pa.array([date.fromisoformat("2020-01-01")], type=pa.date32())
+    ser = pd.Series(arrow_dt, dtype=ArrowDtype(arrow_dt.type))
+    assert repr(ser) == "0    2020-01-01\ndtype: date32[day][pyarrow]"
+
+
+def test_duration_overflow_from_ndarray_containing_nat():
+    # GH52843
+    data_ts = pd.to_datetime([1, None])
+    data_td = pd.to_timedelta([1, None])
+    ser_ts = pd.Series(data_ts, dtype=ArrowDtype(pa.timestamp("ns")))
+    ser_td = pd.Series(data_td, dtype=ArrowDtype(pa.duration("ns")))
+    result = ser_ts + ser_td
+    expected = pd.Series([2, None], dtype=ArrowDtype(pa.timestamp("ns")))
+    tm.assert_series_equal(result, expected)
+
+
+def test_infer_dtype_pyarrow_dtype(data, request):
+    res = lib.infer_dtype(data)
+    assert res != "unknown-array"
+
+    if data._hasna and res in ["datetime64", "timedelta64"]:
+        mark = pytest.mark.xfail(
+            reason="in infer_dtype pd.NA is not ignored in these cases "
+            "even with skipna=True in the list(data) check below"
+        )
+        request.applymarker(mark)
+
+    assert res == lib.infer_dtype(list(data), skipna=True)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_from_sequence_temporal(pa_type):
+    # GH 53171
+    val = 3
+    unit = pa_type.unit
+    if pa.types.is_duration(pa_type):
+        seq = [pd.Timedelta(val, unit=unit).as_unit(unit)]
+    else:
+        seq = [pd.Timestamp(val, unit=unit, tz=pa_type.tz).as_unit(unit)]
+
+    result = ArrowExtensionArray._from_sequence(seq, dtype=pa_type)
+    expected = ArrowExtensionArray(pa.array([val], type=pa_type))
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_setitem_temporal(pa_type):
+    # GH 53171
+    unit = pa_type.unit
+    if pa.types.is_duration(pa_type):
+        val = pd.Timedelta(1, unit=unit).as_unit(unit)
+    else:
+        val = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)
+
+    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
+
+    result = arr.copy()
+    result[:] = val
+    expected = ArrowExtensionArray(pa.array([1, 1, 1], type=pa_type))
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_arithmetic_temporal(pa_type, request):
+    # GH 53171
+    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
+    unit = pa_type.unit
+    result = arr - pd.Timedelta(1, unit=unit).as_unit(unit)
+    expected = ArrowExtensionArray(pa.array([0, 1, 2], type=pa_type))
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_comparison_temporal(pa_type):
+    # GH 53171
+    unit = pa_type.unit
+    if pa.types.is_duration(pa_type):
+        val = pd.Timedelta(1, unit=unit).as_unit(unit)
+    else:
+        val = pd.Timestamp(1, unit=unit, tz=pa_type.tz).as_unit(unit)
+
+    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
+
+    result = arr > val
+    expected = ArrowExtensionArray(pa.array([False, True, True], type=pa.bool_()))
+    tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_getitem_temporal(pa_type):
+    # GH 53326
+    arr = ArrowExtensionArray(pa.array([1, 2, 3], type=pa_type))
+    result = arr[1]
+    if pa.types.is_duration(pa_type):
+        expected = pd.Timedelta(2, unit=pa_type.unit).as_unit(pa_type.unit)
+        assert isinstance(result, pd.Timedelta)
+    else:
+        expected = pd.Timestamp(2, unit=pa_type.unit, tz=pa_type.tz).as_unit(
+            pa_type.unit
+        )
+        assert isinstance(result, pd.Timestamp)
+    assert result.unit == expected.unit
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
+)
+def test_iter_temporal(pa_type):
+    # GH 53326
+    arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
+    result = list(arr)
+    if pa.types.is_duration(pa_type):
+        expected = [
+            pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit),
+            pd.NA,
+        ]
+        assert isinstance(result[0], pd.Timedelta)
+    else:
+        expected = [
+            pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit),
+            pd.NA,
+        ]
+        assert isinstance(result[0], pd.Timestamp)
+    assert result[0].unit == expected[0].unit
+    assert result == expected
+
+
+def test_groupby_series_size_returns_pa_int(data):
+    # GH 54132
+    ser = pd.Series(data[:3], index=["a", "a", "b"])
+    result = ser.groupby(level=0).size()
+    expected = pd.Series([2, 1], dtype="int64[pyarrow]", index=["a", "b"])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES, ids=repr
+)
+@pytest.mark.parametrize("dtype", [None, object])
+def test_to_numpy_temporal(pa_type, dtype):
+    # GH 53326
+    # GH 55997: Return datetime64/timedelta64 types with NaT if possible
+    arr = ArrowExtensionArray(pa.array([1, None], type=pa_type))
+    result = arr.to_numpy(dtype=dtype)
+    if pa.types.is_duration(pa_type):
+        value = pd.Timedelta(1, unit=pa_type.unit).as_unit(pa_type.unit)
+    else:
+        value = pd.Timestamp(1, unit=pa_type.unit, tz=pa_type.tz).as_unit(pa_type.unit)
+
+    if dtype == object or (pa.types.is_timestamp(pa_type) and pa_type.tz is not None):
+        if dtype == object:
+            na = pd.NA
+        else:
+            na = pd.NaT
+        expected = np.array([value, na], dtype=object)
+        assert result[0].unit == value.unit
+    else:
+        na = pa_type.to_pandas_dtype().type("nat", pa_type.unit)
+        value = value.to_numpy()
+        expected = np.array([value, na])
+        assert np.datetime_data(result[0])[0] == pa_type.unit
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_groupby_count_return_arrow_dtype(data_missing):
+    df = pd.DataFrame({"A": [1, 1], "B": data_missing, "C": data_missing})
+    result = df.groupby("A").count()
+    expected = pd.DataFrame(
+        [[1, 1]],
+        index=pd.Index([1], name="A"),
+        columns=["B", "C"],
+        dtype="int64[pyarrow]",
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_fixed_size_list():
+    # GH#55000
+    ser = pd.Series(
+        [[1, 2], [3, 4]], dtype=ArrowDtype(pa.list_(pa.int64(), list_size=2))
+    )
+    result = ser.dtype.type
+    assert result == list
+
+
+def test_arrowextensiondtype_dataframe_repr():
+    # GH 54062
+    df = pd.DataFrame(
+        pd.period_range("2012", periods=3),
+        columns=["col"],
+        dtype=ArrowDtype(ArrowPeriodType("D")),
+    )
+    result = repr(df)
+    # TODO: repr value may not be expected; address how
+    # pyarrow.ExtensionType values are displayed
+    expected = "     col\n0  15340\n1  15341\n2  15342"
+    assert result == expected
+
+
+def test_pow_missing_operand():
+    # GH 55512
+    k = pd.Series([2, None], dtype="int64[pyarrow]")
+    result = k.pow(None, fill_value=3)
+    expected = pd.Series([8, None], dtype="int64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_decimal_parse_raises():
+    # GH 56984
+    ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string()))
+    with pytest.raises(
+        pa.lib.ArrowInvalid, match="Rescaling Decimal(128)? value would cause data loss"
+    ):
+        ser.astype(ArrowDtype(pa.decimal128(1, 0)))
+
+
+def test_decimal_parse_succeeds():
+    # GH 56984
+    ser = pd.Series(["1.2345"], dtype=ArrowDtype(pa.string()))
+    dtype = ArrowDtype(pa.decimal128(5, 4))
+    result = ser.astype(dtype)
+    expected = pd.Series([Decimal("1.2345")], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
+def test_duration_fillna_numpy(pa_type):
+    # GH 54707
+    ser1 = pd.Series([None, 2], dtype=ArrowDtype(pa_type))
+    ser2 = pd.Series(np.array([1, 3], dtype=f"m8[{pa_type.unit}]"))
+    result = ser1.fillna(ser2)
+    expected = pd.Series([1, 2], dtype=ArrowDtype(pa_type))
+    tm.assert_series_equal(result, expected)
+
+
+def test_comparison_not_propagating_arrow_error():
+    # GH#54944
+    a = pd.Series([1 << 63], dtype="uint64[pyarrow]")
+    b = pd.Series([None], dtype="int64[pyarrow]")
+    with pytest.raises(pa.lib.ArrowInvalid, match="Integer value"):
+        a < b
+
+
+def test_factorize_chunked_dictionary():
+    # GH 54844
+    pa_array = pa.chunked_array(
+        [pa.array(["a"]).dictionary_encode(), pa.array(["b"]).dictionary_encode()]
+    )
+    ser = pd.Series(ArrowExtensionArray(pa_array))
+    res_indices, res_uniques = ser.factorize()
+    exp_indices = np.array([0, 1], dtype=np.intp)
+    exp_uniques = pd.Index(ArrowExtensionArray(pa_array.combine_chunks()))
+    tm.assert_numpy_array_equal(res_indices, exp_indices)
+    tm.assert_index_equal(res_uniques, exp_uniques)
+
+
+def test_factorize_dictionary_with_na():
+    # GH#60567
+    arr = pd.array(
+        ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
+    )
+    indices, uniques = arr.factorize(use_na_sentinel=False)
+    expected_indices = np.array([0, 1], dtype=np.intp)
+    expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_numpy_array_equal(indices, expected_indices)
+    tm.assert_extension_array_equal(uniques, expected_uniques)
+
+
+def test_dictionary_astype_categorical():
+    # GH#56672
+    arrs = [
+        pa.array(np.array(["a", "x", "c", "a"])).dictionary_encode(),
+        pa.array(np.array(["a", "d", "c"])).dictionary_encode(),
+    ]
+    ser = pd.Series(ArrowExtensionArray(pa.chunked_array(arrs)))
+    result = ser.astype("category")
+    categories = pd.Index(["a", "x", "c", "d"], dtype=ArrowDtype(pa.string()))
+    expected = pd.Series(
+        ["a", "x", "c", "a", "a", "d", "c"],
+        dtype=pd.CategoricalDtype(categories=categories),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_arrow_floordiv():
+    # GH 55561
+    a = pd.Series([-7], dtype="int64[pyarrow]")
+    b = pd.Series([4], dtype="int64[pyarrow]")
+    expected = pd.Series([-2], dtype="int64[pyarrow]")
+    result = a // b
+    tm.assert_series_equal(result, expected)
+
+
+def test_arrow_floordiv_large_values():
+    # GH 56645
+    a = pd.Series([1425801600000000000], dtype="int64[pyarrow]")
+    expected = pd.Series([1425801600000], dtype="int64[pyarrow]")
+    result = a // 1_000_000
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
+def test_arrow_floordiv_large_integral_result(dtype):
+    # GH 56676
+    a = pd.Series([18014398509481983], dtype=dtype)
+    result = a // 1
+    tm.assert_series_equal(result, a)
+
+
+@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
+def test_arrow_floordiv_larger_divisor(pa_type):
+    # GH 56676
+    dtype = ArrowDtype(pa_type)
+    a = pd.Series([-23], dtype=dtype)
+    result = a // 24
+    expected = pd.Series([-1], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.SIGNED_INT_PYARROW_DTYPES)
+def test_arrow_floordiv_integral_invalid(pa_type):
+    # GH 56676
+    min_value = np.iinfo(pa_type.to_pandas_dtype()).min
+    a = pd.Series([min_value], dtype=ArrowDtype(pa_type))
+    with pytest.raises(pa.lib.ArrowInvalid, match="overflow|not in range"):
+        a // -1
+    with pytest.raises(pa.lib.ArrowInvalid, match="divide by zero"):
+        a // 0
+
+
+@pytest.mark.parametrize("dtype", tm.FLOAT_PYARROW_DTYPES_STR_REPR)
+def test_arrow_floordiv_floating_0_divisor(dtype):
+    # GH 56676
+    a = pd.Series([2], dtype=dtype)
+    result = a // 0
+    expected = pd.Series([float("inf")], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["float64", "datetime64[ns]", "timedelta64[ns]"])
+def test_astype_int_with_null_to_numpy_dtype(dtype):
+    # GH 57093
+    ser = pd.Series([1, None], dtype="int64[pyarrow]")
+    result = ser.astype(dtype)
+    expected = pd.Series([1, None], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES)
+def test_arrow_integral_floordiv_large_values(pa_type):
+    # GH 56676
+    max_value = np.iinfo(pa_type.to_pandas_dtype()).max
+    dtype = ArrowDtype(pa_type)
+    a = pd.Series([max_value], dtype=dtype)
+    b = pd.Series([1], dtype=dtype)
+    result = a // b
+    tm.assert_series_equal(result, a)
+
+
+@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
+def test_arrow_true_division_large_divisor(dtype):
+    # GH 56706
+    a = pd.Series([0], dtype=dtype)
+    b = pd.Series([18014398509481983], dtype=dtype)
+    expected = pd.Series([0], dtype="float64[pyarrow]")
+    result = a / b
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "uint64[pyarrow]"])
+def test_arrow_floor_division_large_divisor(dtype):
+    # GH 56706
+    a = pd.Series([0], dtype=dtype)
+    b = pd.Series([18014398509481983], dtype=dtype)
+    expected = pd.Series([0], dtype=dtype)
+    result = a // b
+    tm.assert_series_equal(result, expected)
+
+
+def test_string_to_datetime_parsing_cast():
+    # GH 56266
+    string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"]
+    result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]")
+
+    pd_res = pd.to_datetime(string_dates).as_unit("s")
+    expected = pd.Series(ArrowExtensionArray(pa.array(pd_res, from_pandas=True)))
+    tm.assert_series_equal(result, expected)
+
+
+def test_interpolate_not_numeric(data):
+    if not data.dtype._is_numeric:
+        ser = pd.Series(data)
+        msg = re.escape(f"Cannot interpolate with {ser.dtype} dtype")
+        with pytest.raises(TypeError, match=msg):
+            pd.Series(data).interpolate()
+
+
+@pytest.mark.parametrize("dtype", ["int64[pyarrow]", "float64[pyarrow]"])
+def test_interpolate_linear(dtype):
+    ser = pd.Series([None, 1, 2, None, 4, None], dtype=dtype)
+    result = ser.interpolate()
+    expected = pd.Series([None, 1, 2, 3, 4, None], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_string_to_time_parsing_cast():
+    # GH 56463
+    string_times = ["11:41:43.076160"]
+    result = pd.Series(string_times, dtype="time64[us][pyarrow]")
+    expected = pd.Series(
+        ArrowExtensionArray(pa.array([time(11, 41, 43, 76160)], from_pandas=True))
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_to_numpy_float():
+    # GH#56267
+    ser = pd.Series([32, 40, None], dtype="float[pyarrow]")
+    result = ser.astype("float64")
+    expected = pd.Series([32, 40, np.nan], dtype="float64")
+    tm.assert_series_equal(result, expected)
+
+
+def test_to_numpy_timestamp_to_int():
+    # GH 55997
+    ser = pd.Series(["2020-01-01 04:30:00"], dtype="timestamp[ns][pyarrow]")
+    result = ser.to_numpy(dtype=np.int64)
+    expected = np.array([1577853000000000000])
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("arrow_type", [pa.large_string(), pa.string()])
+def test_cast_dictionary_different_value_dtype(arrow_type):
+    df = pd.DataFrame({"a": ["x", "y"]}, dtype="string[pyarrow]")
+    data_type = ArrowDtype(pa.dictionary(pa.int32(), arrow_type))
+    result = df.astype({"a": data_type})
+    assert result.dtypes.iloc[0] == data_type
+
+
+def test_map_numeric_na_action(using_nan_is_na):
+    ser = pd.Series([32, 40, None], dtype="int64[pyarrow]")
+    result = ser.map(lambda x: 42, na_action="ignore")
+    if not using_nan_is_na:
+        expected = pd.Series([42.0, 42.0, pd.NA], dtype="object")
+    else:
+        expected = pd.Series([42.0, 42.0, np.nan], dtype="float64")
+    tm.assert_series_equal(result, expected)
+
+
+def test_categorical_from_arrow_dictionary():
+    # GH 60563
+    df = pd.DataFrame(
+        {"A": ["a1", "a2"]}, dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
+    )
+    result = df.value_counts(dropna=False)
+    expected = pd.Series(
+        [1, 1],
+        index=pd.MultiIndex.from_arrays(
+            [pd.Index(["a1", "a2"], dtype=ArrowDtype(pa.string()), name="A")]
+        ),
+        name="count",
+        dtype="int64",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.skipif(
+    pa_version_under19p0, reason="pa.json_ was introduced in pyarrow v19.0"
+)
+def test_arrow_json_type():
+    # GH 60958
+    dtype = ArrowDtype(pa.json_(pa.string()))
+    result = dtype.type
+    assert result == str
+
+
+def test_timestamp_dtype_disallows_decimal():
+    # GH#61773 constructing with pyarrow timestamp dtype should disallow
+    #  Decimal NaN, just like pd.to_datetime
+    vals = [pd.Timestamp("2016-01-02 03:04:05"), Decimal("NaN")]
+
+    msg = "<class 'decimal.Decimal'> is not convertible to datetime"
+    with pytest.raises(TypeError, match=msg):
+        # Check that the non-pyarrow version raises as expected
+        pd.to_datetime(vals)
+
+    with pytest.raises(TypeError, match=msg):
+        pd.array(vals, dtype=ArrowDtype(pa.timestamp("us")))
+
+
+def test_timestamp_dtype_matches_to_datetime():
+    # GH#61775
+    dtype1 = "datetime64[ns, US/Eastern]"
+    dtype2 = "timestamp[ns, US/Eastern][pyarrow]"
+
+    ts = pd.Timestamp("2025-07-03 18:10")
+
+    result = pd.Series([ts], dtype=dtype2)
+    expected = pd.Series([ts], dtype=dtype1).convert_dtypes(dtype_backend="pyarrow")
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_timestamp_vs_dt64_comparison():
+    # GH#60937
+    left = pd.Series(["2016-01-01"], dtype="timestamp[ns][pyarrow]")
+    right = left.astype("datetime64[ns]")
+
+    result = left == right
+    expected = pd.Series([True], dtype="bool[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+    result = right == left
+    tm.assert_series_equal(result, expected)
+
+
+# TODO: reuse assert_invalid_comparison?
+def test_date_vs_timestamp_scalar_comparison():
+    # GH#62157 match non-pyarrow behavior
+    ser = pd.Series(["2016-01-01"], dtype="date32[pyarrow]")
+    ser2 = ser.astype("timestamp[ns][pyarrow]")
+
+    ts = ser2[0]
+    dt = ser[0]
+
+    # date dtype don't match a Timestamp object
+    assert not (ser == ts).any()
+    assert not (ts == ser).any()
+
+    # timestamp dtype doesn't match date object
+    assert not (ser2 == dt).any()
+    assert not (dt == ser2).any()
+
+
+# TODO: reuse assert_invalid_comparison?
+def test_date_vs_timestamp_array_comparison():
+    # GH#62157 match non-pyarrow behavior
+    # GH#
+    ser = pd.Series(["2016-01-01"], dtype="date32[pyarrow]")
+    ser2 = ser.astype("timestamp[ns][pyarrow]")
+    ser3 = ser.astype("datetime64[ns]")
+
+    assert not (ser == ser2).any()
+    assert not (ser2 == ser).any()
+    assert (ser != ser2).all()
+    assert (ser2 != ser).all()
+
+    assert not (ser == ser3).any()
+    assert not (ser3 == ser).any()
+    assert (ser != ser3).all()
+    assert (ser3 != ser).all()
+
+
+def test_ops_with_nan_is_na(using_nan_is_na):
+    # GH#61732
+    ser = pd.Series([-1, 0, 1], dtype="int64[pyarrow]")
+
+    result = ser - np.nan
+    if using_nan_is_na:
+        assert result.isna().all()
+    else:
+        assert not result.isna().any()
+
+    result = ser * np.nan
+    if using_nan_is_na:
+        assert result.isna().all()
+    else:
+        assert not result.isna().any()
+
+    result = ser / 0
+    if using_nan_is_na:
+        assert result.isna()[1]
+    else:
+        assert not result.isna()[1]
+
+
+def test_setitem_float_nan_is_na(using_nan_is_na):
+    # GH#61732
+    ser = pd.Series([-1, 0, 1], dtype="int64[pyarrow]")
+
+    if using_nan_is_na:
+        ser[1] = np.nan
+        assert ser.isna()[1]
+    else:
+        msg = "Could not convert nan with type float: tried to convert to int64"
+        with pytest.raises(pa.lib.ArrowInvalid, match=msg):
+            ser[1] = np.nan
+
+    ser = pd.Series([-1, np.nan, 1], dtype="float64[pyarrow]")
+    if using_nan_is_na:
+        assert ser.isna()[1]
+        assert ser[1] is pd.NA
+
+        ser[1] = np.nan
+        assert ser[1] is pd.NA
+
+    else:
+        assert not ser.isna()[1]
+        assert isinstance(ser[1], float)
+        assert np.isnan(ser[1])
+
+        ser[2] = np.nan
+        assert isinstance(ser[2], float)
+        assert np.isnan(ser[2])
+
+
+def test_pow_with_all_na_float():
+    # GH#62520
+
+    s = pd.Series([None, None], dtype="float64[pyarrow]")
+    result = s.pow(2)
+    expected = pd.Series([pd.NA, pd.NA], dtype="float64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_mul_numpy_nullable_with_pyarrow_float():
+    # GH#58602
+    left = pd.Series(range(5), dtype="Float64")
+    right = pd.Series(range(5), dtype="float64[pyarrow]")
+
+    expected = pd.Series([0, 1, 4, 9, 16], dtype="float64[pyarrow]")
+
+    result = left * right
+    tm.assert_series_equal(result, expected)
+
+    result2 = right * left
+    tm.assert_series_equal(result2, expected)
+
+    # while we're here, let's check __eq__
+    result3 = left == right
+    expected3 = pd.Series([True] * 5, dtype="bool[pyarrow]")
+    tm.assert_series_equal(result3, expected3)
+
+    result4 = right == left
+    tm.assert_series_equal(result4, expected3)
+
+
+@pytest.mark.parametrize(
+    "type_name, expected_size",
+    [
+        # Integer types
+        ("int8", 1),
+        ("int16", 2),
+        ("int32", 4),
+        ("int64", 8),
+        ("uint8", 1),
+        ("uint16", 2),
+        ("uint32", 4),
+        ("uint64", 8),
+        # Floating point types
+        ("float16", 2),
+        ("float32", 4),
+        ("float64", 8),
+        # Boolean
+        ("bool_", 1),
+        # Date and timestamp types
+        ("date32", 4),
+        ("date64", 8),
+        ("timestamp", 8),
+        # Time types
+        ("time32", 4),
+        ("time64", 8),
+        # Decimal types
+        ("decimal128", 16),
+        ("decimal256", 32),
+    ],
+)
+def test_arrow_dtype_itemsize_fixed_width(type_name, expected_size):
+    # GH 57948
+
+    parametric_type_map = {
+        "timestamp": pa.timestamp("ns"),
+        "time32": pa.time32("s"),
+        "time64": pa.time64("ns"),
+        "decimal128": pa.decimal128(38, 10),
+        "decimal256": pa.decimal256(76, 10),
+    }
+
+    if type_name in parametric_type_map:
+        arrow_type = parametric_type_map.get(type_name)
+    else:
+        arrow_type = getattr(pa, type_name)()
+    dtype = ArrowDtype(arrow_type)
+
+    if type_name == "bool_":
+        expected_size = dtype.numpy_dtype.itemsize
+
+    assert dtype.itemsize == expected_size, (
+        f"{type_name} expected {expected_size}, got {dtype.itemsize} "
+        f"(bit_width={getattr(dtype.pyarrow_dtype, 'bit_width', 'N/A')})"
+    )
+
+
+@pytest.mark.parametrize("type_name", ["string", "binary", "large_string"])
+def test_arrow_dtype_itemsize_variable_width(type_name):
+    # GH 57948
+
+    arrow_type = getattr(pa, type_name)()
+    dtype = ArrowDtype(arrow_type)
+
+    assert dtype.itemsize == dtype.numpy_dtype.itemsize
+
+
+def test_cast_pontwise_result_decimal_nan():
+    # GH#62522 we don't want to get back null[pyarrow] here
+    ser = pd.Series([], dtype="float64[pyarrow]")
+    arr = ser.array
+    item = Decimal("NaN")
+
+    result = arr._cast_pointwise_result([item])
+
+    pa_type = result.dtype.pyarrow_dtype
+    assert pa.types.is_decimal(pa_type)
+
+
+def test_ufunc_retains_missing():
+    # GH#62800
+    ser = pd.Series([0.1, pd.NA], dtype="float64[pyarrow]")
+
+    result = np.sin(ser)
+
+    expected = pd.Series([np.sin(0.1), pd.NA], dtype="float64[pyarrow]")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["sum", "min", "max", "mean", "median"])
+def test_duration_reduction_consistency(unit, method):
+    # GH#63170
+    dtype = f"duration[{unit}][pyarrow]"
+    ser = pd.Series([timedelta(seconds=1), timedelta(seconds=2)], dtype=dtype)
+    result = getattr(ser, method)()
+    assert isinstance(result, pd.Timedelta), (
+        f"{method} for {unit} returned {type(result)}"
+    )
+    assert result.unit == unit
+
+
+@pytest.mark.parametrize("method", ["min", "max", "median"])
+def test_timestamp_reduction_consistency(unit, method):
+    # GH#63170
+    dtype = f"timestamp[{unit}][pyarrow]"
+    ser = pd.Series([datetime(2024, 1, 1), datetime(2024, 1, 3)], dtype=dtype)
+    result = getattr(ser, method)()
+    assert isinstance(result, pd.Timestamp), (
+        f"{method} for {unit} returned {type(result)}"
+    )
+    assert result.unit == unit
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb671e74f4b25924121e83cd687530136d1fb54a
--- /dev/null
+++ b/pandas/tests/extension/test_categorical.py
@@ -0,0 +1,192 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+import string
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+import pandas as pd
+from pandas import Categorical
+import pandas._testing as tm
+from pandas.api.types import CategoricalDtype
+from pandas.tests.extension import base
+
+
+def make_data(n: int):
+    while True:
+        values = np.random.default_rng(2).choice(list(string.ascii_letters), size=n)
+        # ensure we meet the requirements
+        # 1. first two not null
+        # 2. first and second are different
+        if values[0] != values[1]:
+            break
+    return values
+
+
+@pytest.fixture
+def dtype():
+    return CategoricalDtype()
+
+
+@pytest.fixture
+def data():
+    """Length-100 array for this type.
+
+    * data[0] and data[1] should both be non missing
+    * data[0] and data[1] should not be equal
+    """
+    return Categorical(make_data(10))
+
+
+@pytest.fixture
+def data_missing():
+    """Length 2 array with [NA, Valid]"""
+    return Categorical([np.nan, "A"])
+
+
+@pytest.fixture
+def data_for_sorting():
+    return Categorical(["A", "B", "C"], categories=["C", "A", "B"], ordered=True)
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return Categorical(["A", None, "B"], categories=["B", "A"], ordered=True)
+
+
+@pytest.fixture
+def data_for_grouping():
+    return Categorical(["a", "a", None, None, "b", "b", "a", "c"])
+
+
+class TestCategorical(base.ExtensionTests):
+    def test_contains(self, data, data_missing):
+        # GH-37867
+        # na value handling in Categorical.__contains__ is deprecated.
+        # See base.BaseInterFaceTests.test_contains for more details.
+
+        na_value = data.dtype.na_value
+        # ensure data without missing values
+        data = data[~data.isna()]
+
+        # first elements are non-missing
+        assert data[0] in data
+        assert data_missing[0] in data_missing
+
+        # check the presence of na_value
+        assert na_value in data_missing
+        assert na_value not in data
+
+        # Categoricals can contain other nan-likes than na_value
+        for na_value_obj in tm.NULL_OBJECTS:
+            if na_value_obj is na_value:
+                continue
+            assert na_value_obj not in data
+            # this section suffers from super method
+            if not using_string_dtype():
+                assert na_value_obj in data_missing
+
+    def test_empty(self, dtype):
+        cls = dtype.construct_array_type()
+        result = cls._empty((4,), dtype=dtype)
+
+        assert isinstance(result, cls)
+        # the dtype we passed is not initialized, so will not match the
+        #  dtype on our result.
+        assert result.dtype == CategoricalDtype([])
+
+    @pytest.mark.skip(reason="Backwards compatibility")
+    def test_getitem_scalar(self, data):
+        # CategoricalDtype.type isn't "correct" since it should
+        # be a parent of the elements (object). But don't want
+        # to break things by changing.
+        super().test_getitem_scalar(data)
+
+    def test_combine_add(self, data_repeated):
+        # GH 20825
+        # When adding categoricals in combine, result is a string
+        orig_data1, orig_data2 = data_repeated(2)
+        s1 = pd.Series(orig_data1)
+        s2 = pd.Series(orig_data2)
+        result = s1.combine(s2, lambda x1, x2: x1 + x2)
+        expected = pd.Series(
+            [a + b for (a, b) in zip(list(orig_data1), list(orig_data2), strict=True)]
+        )
+        tm.assert_series_equal(result, expected)
+
+        val = s1.iloc[0]
+        result = s1.combine(val, lambda x1, x2: x1 + x2)
+        expected = pd.Series([a + val for a in list(orig_data1)])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        result = data.map(lambda x: x, na_action=na_action)
+        tm.assert_extension_array_equal(result, data)
+
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
+        # frame & scalar
+        op_name = all_arithmetic_operators
+        if op_name == "__rmod__":
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="rmod never called when string is first argument"
+                )
+            )
+        super().test_arith_frame_with_scalar(data, op_name)
+
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
+        op_name = all_arithmetic_operators
+        if op_name == "__rmod__":
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="rmod never called when string is first argument"
+                )
+            )
+        super().test_arith_series_with_scalar(data, op_name)
+
+    def _compare_other(self, ser: pd.Series, data, op, other):
+        op_name = f"__{op.__name__}__"
+        if op_name not in ["__eq__", "__ne__"]:
+            msg = "Unordered Categoricals can only compare equality or not"
+            with pytest.raises(TypeError, match=msg):
+                op(data, other)
+        else:
+            return super()._compare_other(ser, data, op, other)
+
+    @pytest.mark.xfail(reason="Categorical overrides __repr__")
+    @pytest.mark.parametrize("size", ["big", "small"])
+    def test_array_repr(self, data, size):
+        super().test_array_repr(data, size)
+
+    @pytest.mark.xfail(reason="TBD")
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        super().test_groupby_extension_agg(as_index, data_for_grouping)
+
+
+class Test2DCompat(base.NDArrayBacked2DTests):
+    def test_repr_2d(self, data):
+        # Categorical __repr__ doesn't include "Categorical", so we need
+        #  to special-case
+        res = repr(data.reshape(1, -1))
+        assert res.count("\nCategories") == 1
+
+        res = repr(data.reshape(-1, 1))
+        assert res.count("\nCategories") == 1
diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa5c91f478c45c363c34a7de1083154baa8f752b
--- /dev/null
+++ b/pandas/tests/extension/test_common.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes import dtypes
+from pandas.core.dtypes.common import is_extension_array_dtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays import ExtensionArray
+
+
+class DummyDtype(dtypes.ExtensionDtype):
+    pass
+
+
+class DummyArray(ExtensionArray):
+    def __init__(self, data) -> None:
+        self.data = data
+
+    def __array__(self, dtype=None, copy=None):
+        return self.data
+
+    @property
+    def dtype(self):
+        return DummyDtype()
+
+    def astype(self, dtype, copy=True):
+        # we don't support anything but a single dtype
+        if isinstance(dtype, DummyDtype):
+            if copy:
+                return type(self)(self.data)
+            return self
+        elif not copy:
+            return np.asarray(self, dtype=dtype)
+        else:
+            return np.array(self, dtype=dtype, copy=copy)
+
+
+class TestExtensionArrayDtype:
+    @pytest.mark.parametrize(
+        "values",
+        [
+            pd.Categorical([]),
+            pd.Categorical([]).dtype,
+            pd.Series(pd.Categorical([])),
+            DummyDtype(),
+            DummyArray(np.array([1, 2])),
+        ],
+    )
+    def test_is_extension_array_dtype(self, values):
+        assert is_extension_array_dtype(values)
+
+    @pytest.mark.parametrize("values", [np.array([]), pd.Series(np.array([]))])
+    def test_is_not_extension_array_dtype(self, values):
+        assert not is_extension_array_dtype(values)
+
+
+def test_astype():
+    arr = DummyArray(np.array([1, 2, 3]))
+    expected = np.array([1, 2, 3], dtype=object)
+
+    result = arr.astype(object)
+    tm.assert_numpy_array_equal(result, expected)
+
+    result = arr.astype("object")
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_astype_no_copy():
+    arr = DummyArray(np.array([1, 2, 3], dtype=np.int64))
+    result = arr.astype(arr.dtype, copy=False)
+
+    assert arr is result
+
+    result = arr.astype(arr.dtype)
+    assert arr is not result
+
+
+@pytest.mark.parametrize("dtype", [dtypes.CategoricalDtype(), dtypes.IntervalDtype()])
+def test_is_extension_array_dtype(dtype):
+    assert isinstance(dtype, dtypes.ExtensionDtype)
+    assert is_extension_array_dtype(dtype)
+
+
+class CapturingStringArray(pd.arrays.StringArray):
+    """Extend StringArray to capture arguments to __getitem__"""
+
+    def __getitem__(self, item):
+        self.last_item_arg = item
+        return super().__getitem__(item)
+
+
+def test_ellipsis_index():
+    # GH#42430 1D slices over extension types turn into N-dimensional slices
+    #  over ExtensionArrays
+    dtype = pd.StringDtype()
+    df = pd.DataFrame(
+        {
+            "col1": CapturingStringArray(
+                np.array(["hello", "world"], dtype=object), dtype=dtype
+            )
+        }
+    )
+    _ = df.iloc[:1]
+
+    # String comparison because there's no native way to compare slices.
+    # Before the fix for GH#42430, last_item_arg would get set to the 2D slice
+    # (Ellipsis, slice(None, 1, None))
+    out = df["col1"]._values.last_item_arg
+    assert str(out) == "slice(None, 1, None)"
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab9eff220914da0b9d33ff81bafc8e644d2290b8
--- /dev/null
+++ b/pandas/tests/extension/test_datetime.py
@@ -0,0 +1,148 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import DatetimeTZDtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays import DatetimeArray
+from pandas.tests.extension import base
+
+
+@pytest.fixture
+def dtype():
+    return DatetimeTZDtype(unit="ns", tz="US/Central")
+
+
+@pytest.fixture
+def data(dtype):
+    data = DatetimeArray._from_sequence(
+        pd.date_range("2000", periods=10, tz=dtype.tz), dtype=dtype
+    )
+    return data
+
+
+@pytest.fixture
+def data_missing(dtype):
+    return DatetimeArray._from_sequence(
+        np.array(["NaT", "2000-01-01"], dtype="datetime64[ns]"), dtype=dtype
+    )
+
+
+@pytest.fixture
+def data_for_sorting(dtype):
+    a = pd.Timestamp("2000-01-01")
+    b = pd.Timestamp("2000-01-02")
+    c = pd.Timestamp("2000-01-03")
+    return DatetimeArray._from_sequence(
+        np.array([b, c, a], dtype="datetime64[ns]"), dtype=dtype
+    )
+
+
+@pytest.fixture
+def data_missing_for_sorting(dtype):
+    a = pd.Timestamp("2000-01-01")
+    b = pd.Timestamp("2000-01-02")
+    return DatetimeArray._from_sequence(
+        np.array([b, "NaT", a], dtype="datetime64[ns]"), dtype=dtype
+    )
+
+
+@pytest.fixture
+def data_for_grouping(dtype):
+    """
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+
+    Where A < B < C and NA is missing
+    """
+    a = pd.Timestamp("2000-01-01")
+    b = pd.Timestamp("2000-01-02")
+    c = pd.Timestamp("2000-01-03")
+    na = "NaT"
+    return DatetimeArray._from_sequence(
+        np.array([b, b, na, na, a, a, b, c], dtype="datetime64[ns]"), dtype=dtype
+    )
+
+
+@pytest.fixture
+def na_cmp():
+    def cmp(a, b):
+        return a is pd.NaT and a is b
+
+    return cmp
+
+
+# ----------------------------------------------------------------------------
+class TestDatetimeArray(base.ExtensionTests):
+    def _get_expected_exception(self, op_name, obj, other):
+        if op_name in ["__sub__", "__rsub__"]:
+            return None
+        return super()._get_expected_exception(op_name, obj, other)
+
+    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
+        if op_name == "std":
+            return "timedelta64[ns]"
+        return arr.dtype
+
+    def _supports_accumulation(self, ser, op_name: str) -> bool:
+        return op_name in ["cummin", "cummax"]
+
+    def _supports_reduction(self, obj, op_name: str) -> bool:
+        return op_name in ["min", "max", "median", "mean", "std", "any", "all"]
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
+        meth = all_boolean_reductions
+        msg = f"datetime64 type does not support operation '{meth}'"
+        with pytest.raises(TypeError, match=msg):
+            super().test_reduce_series_boolean(data, all_boolean_reductions, skipna)
+
+    def test_series_constructor(self, data):
+        # Series construction drops any .freq attr
+        data = data._with_freq(None)
+        super().test_series_constructor(data)
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        result = data.map(lambda x: x, na_action=na_action)
+        tm.assert_extension_array_equal(result, data)
+
+    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
+        if op_name in ["median", "mean", "std"]:
+            alt = ser.astype("int64")
+
+            res_op = getattr(ser, op_name)
+            exp_op = getattr(alt, op_name)
+            result = res_op(skipna=skipna)
+            expected = exp_op(skipna=skipna)
+            if op_name in ["mean", "median"]:
+                # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype"
+                # has no attribute "tz"
+                tz = ser.dtype.tz  # type: ignore[union-attr]
+                expected = pd.Timestamp(expected, tz=tz)
+            else:
+                expected = pd.Timedelta(expected)
+            tm.assert_almost_equal(result, expected)
+
+        else:
+            return super().check_reduce(ser, op_name, skipna)
+
+
+class Test2DCompat(base.NDArrayBacked2DTests):
+    pass
diff --git a/pandas/tests/extension/test_extension.py b/pandas/tests/extension/test_extension.py
new file mode 100644
index 0000000000000000000000000000000000000000..456f4863b1c313c624ac42905a3b4117f6ba556d
--- /dev/null
+++ b/pandas/tests/extension/test_extension.py
@@ -0,0 +1,27 @@
+"""
+Tests for behavior if an author does *not* implement EA methods.
+"""
+
+import numpy as np
+import pytest
+
+from pandas.core.arrays import ExtensionArray
+
+
+class MyEA(ExtensionArray):
+    def __init__(self, values) -> None:
+        self._values = values
+
+
+@pytest.fixture
+def data():
+    arr = np.arange(10)
+    return MyEA(arr)
+
+
+class TestExtensionArray:
+    def test_errors(self, data, all_arithmetic_operators):
+        # invalid ops
+        op_name = all_arithmetic_operators
+        with pytest.raises(AttributeError):
+            getattr(data, op_name)
diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py
new file mode 100644
index 0000000000000000000000000000000000000000..47bc26ba4a7666b67fe774b4287a6e9660cd82c8
--- /dev/null
+++ b/pandas/tests/extension/test_interval.py
@@ -0,0 +1,147 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import IntervalDtype
+
+from pandas import Interval
+from pandas.core.arrays import IntervalArray
+from pandas.tests.extension import base
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+def make_data(n: int):
+    left_array = np.random.default_rng(2).uniform(size=n).cumsum()
+    right_array = left_array + np.random.default_rng(2).uniform(size=n)
+    return [
+        Interval(left, right)
+        for left, right in zip(left_array, right_array, strict=True)
+    ]
+
+
+@pytest.fixture
+def dtype():
+    return IntervalDtype()
+
+
+@pytest.fixture
+def data():
+    """Length-10 IntervalArray for semantics test."""
+    return IntervalArray(make_data(10))
+
+
+@pytest.fixture
+def data_missing():
+    """Length 2 array with [NA, Valid]"""
+    return IntervalArray.from_tuples([None, (0, 1)])
+
+
+@pytest.fixture
+def data_for_twos():
+    pytest.skip("Interval is not a numeric dtype")
+
+
+@pytest.fixture
+def data_for_sorting():
+    return IntervalArray.from_tuples([(1, 2), (2, 3), (0, 1)])
+
+
+@pytest.fixture
+def data_missing_for_sorting():
+    return IntervalArray.from_tuples([(1, 2), None, (0, 1)])
+
+
+@pytest.fixture
+def data_for_grouping():
+    a = (0, 1)
+    b = (1, 2)
+    c = (2, 3)
+    return IntervalArray.from_tuples([b, b, None, None, a, a, b, c])
+
+
+class TestIntervalArray(base.ExtensionTests):
+    divmod_exc = TypeError
+
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        return op_name in ["min", "max"]
+
+    def test_fillna_limit_frame(self, data_missing):
+        # GH#58001
+        with pytest.raises(ValueError, match="limit must be None"):
+            super().test_fillna_limit_frame(data_missing)
+
+    def test_fillna_limit_series(self, data_missing):
+        # GH#58001
+        with pytest.raises(ValueError, match="limit must be None"):
+            super().test_fillna_limit_frame(data_missing)
+
+    @pytest.mark.xfail(
+        reason="Raises with incorrect message bc it disallows *all* listlikes "
+        "instead of just wrong-length listlikes"
+    )
+    def test_fillna_length_mismatch(self, data_missing):
+        super().test_fillna_length_mismatch(data_missing)
+
+    @pytest.mark.xfail(reason="copy=False is not Implemented")
+    def test_fillna_readonly(self, data_missing):
+        super().test_fillna_readonly(data_missing)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_hash_pandas_object(self, data):
+        super().test_hash_pandas_object(data)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_hash_pandas_object_works(self, data, as_frame):
+        super().test_hash_pandas_object_works(data, as_frame)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        super().test_EA_types(engine, data, request)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_astype_str(self, data):
+        super().test_astype_str(data)
+
+    @pytest.mark.xfail(
+        reason="Test is invalid for IntervalDtype, needs to be adapted for "
+        "this dtype with an index with index._index_as_unique."
+    )
+    def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data):
+        super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)
+
+
+# TODO: either belongs in tests.arrays.interval or move into base tests.
+def test_fillna_non_scalar_raises(data_missing):
+    msg = "can only insert Interval objects and NA into an IntervalArray"
+    with pytest.raises(TypeError, match=msg):
+        data_missing.fillna([1, 1])
diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py
new file mode 100644
index 0000000000000000000000000000000000000000..fadc51ea714c0413133c1ced5b6666e52fd1cf51
--- /dev/null
+++ b/pandas/tests/extension/test_masked.py
@@ -0,0 +1,375 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
+from pandas.compat.numpy import np_version_gt2
+
+from pandas.core.dtypes.common import (
+    is_float_dtype,
+    is_signed_integer_dtype,
+    is_unsigned_integer_dtype,
+)
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.arrays.boolean import BooleanDtype
+from pandas.core.arrays.floating import (
+    Float32Dtype,
+    Float64Dtype,
+)
+from pandas.core.arrays.integer import (
+    Int8Dtype,
+    Int16Dtype,
+    Int32Dtype,
+    Int64Dtype,
+    UInt8Dtype,
+    UInt16Dtype,
+    UInt32Dtype,
+    UInt64Dtype,
+)
+from pandas.tests.extension import base
+
+is_windows_or_32bit = (is_platform_windows() and not np_version_gt2) or not IS64
+
+pytestmark = [
+    pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in divide:RuntimeWarning"
+    ),
+    pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning"),
+    # overflow only relevant for Floating dtype cases cases
+    pytest.mark.filterwarnings("ignore:overflow encountered in reduce:RuntimeWarning"),
+]
+
+
+def make_data():
+    return [1, 2, 3, 4, pd.NA, 10, 11, pd.NA, 99, 100]
+
+
+def make_float_data():
+    return [0.1, 0.2, 0.3, 0.4, pd.NA, 1.0, 1.1, pd.NA, 9.9, 10.0]
+
+
+def make_bool_data():
+    return [True, False] * 2 + [np.nan] + [True, False] + [np.nan] + [True, False]
+
+
+@pytest.fixture(
+    params=[
+        Int8Dtype,
+        Int16Dtype,
+        Int32Dtype,
+        Int64Dtype,
+        UInt8Dtype,
+        UInt16Dtype,
+        UInt32Dtype,
+        UInt64Dtype,
+        Float32Dtype,
+        Float64Dtype,
+        BooleanDtype,
+    ]
+)
+def dtype(request):
+    return request.param()
+
+
+@pytest.fixture
+def data(dtype):
+    if dtype.kind == "f":
+        data = make_float_data()
+    elif dtype.kind == "b":
+        data = make_bool_data()
+    else:
+        data = make_data()
+    return pd.array(data, dtype=dtype)
+
+
+@pytest.fixture
+def data_for_twos(dtype):
+    if dtype.kind == "b":
+        return pd.array(np.ones(10), dtype=dtype)
+    return pd.array(np.ones(10) * 2, dtype=dtype)
+
+
+@pytest.fixture
+def data_missing(dtype):
+    if dtype.kind == "f":
+        return pd.array([pd.NA, 0.1], dtype=dtype)
+    elif dtype.kind == "b":
+        return pd.array([np.nan, True], dtype=dtype)
+    return pd.array([pd.NA, 1], dtype=dtype)
+
+
+@pytest.fixture
+def data_for_sorting(dtype):
+    if dtype.kind == "f":
+        return pd.array([0.1, 0.2, 0.0], dtype=dtype)
+    elif dtype.kind == "b":
+        return pd.array([True, True, False], dtype=dtype)
+    return pd.array([1, 2, 0], dtype=dtype)
+
+
+@pytest.fixture
+def data_missing_for_sorting(dtype):
+    if dtype.kind == "f":
+        return pd.array([0.1, pd.NA, 0.0], dtype=dtype)
+    elif dtype.kind == "b":
+        return pd.array([True, np.nan, False], dtype=dtype)
+    return pd.array([1, pd.NA, 0], dtype=dtype)
+
+
+@pytest.fixture
+def na_cmp():
+    # we are pd.NA
+    return lambda x, y: x is pd.NA and y is pd.NA
+
+
+@pytest.fixture
+def data_for_grouping(dtype):
+    if dtype.kind == "f":
+        b = 0.1
+        a = 0.0
+        c = 0.2
+    elif dtype.kind == "b":
+        b = True
+        a = False
+        c = b
+    else:
+        b = 1
+        a = 0
+        c = 2
+
+    na = pd.NA
+    return pd.array([b, b, na, na, a, a, b, c], dtype=dtype)
+
+
+class TestMaskedArrays(base.ExtensionTests):
+    _combine_le_expected_dtype = "boolean"
+
+    @pytest.fixture(autouse=True)
+    def skip_if_doesnt_support_2d(self, dtype, request):
+        # Override the fixture so that we run these tests.
+        assert not dtype._supports_2d
+        # If dtype._supports_2d is ever changed to True, then this fixture
+        #  override becomes unnecessary.
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data_missing, na_action, using_nan_is_na):
+        result = data_missing.map(lambda x: x, na_action=na_action)
+        if data_missing.dtype == Float32Dtype() and using_nan_is_na:
+            # map roundtrips through objects, which converts to float64
+            expected = data_missing.to_numpy(dtype="float64", na_value=np.nan)
+        else:
+            expected = data_missing.to_numpy()
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_map_na_action_ignore(self, data_missing_for_sorting, using_nan_is_na):
+        zero = data_missing_for_sorting[2]
+        result = data_missing_for_sorting.map(lambda x: zero, na_action="ignore")
+        if data_missing_for_sorting.dtype.kind == "b":
+            expected = np.array([False, pd.NA, False], dtype=object)
+        elif not using_nan_is_na:
+            # TODO: would we prefer to get NaN in this case to get a non-object?
+            expected = np.array([zero, pd.NA, zero], dtype=object)
+        else:
+            expected = np.array([zero, np.nan, zero])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def _get_expected_exception(self, op_name, obj, other):
+        try:
+            dtype = tm.get_dtype(obj)
+        except AttributeError:
+            # passed arguments reversed
+            dtype = tm.get_dtype(other)
+
+        if dtype.kind == "b":
+            if op_name.strip("_").lstrip("r") in ["pow", "truediv", "floordiv"]:
+                # match behavior with non-masked bool dtype
+                return NotImplementedError
+            elif op_name in ["__sub__", "__rsub__"]:
+                # exception message would include "numpy boolean subtract""
+                return TypeError
+            return None
+        return None
+
+    def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
+        sdtype = tm.get_dtype(obj)
+        expected = pointwise_result
+
+        if sdtype.kind == "b":
+            if op_name in (
+                "__mod__",
+                "__rmod__",
+            ):
+                # combine keeps boolean type
+                expected = expected.astype("Int8")
+
+        return expected
+
+    def test_divmod_series_array(self, data, data_for_twos, request):
+        if data.dtype.kind == "b":
+            mark = pytest.mark.xfail(
+                reason="Inconsistency between floordiv and divmod; we raise for "
+                "floordiv but not for divmod. This matches what we do for "
+                "non-masked bool dtype."
+            )
+            request.applymarker(mark)
+        super().test_divmod_series_array(data, data_for_twos)
+
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        if op_name in ["any", "all"] and ser.dtype.kind != "b":
+            pytest.skip(reason="Tested in tests/reductions/test_reductions.py")
+        return True
+
+    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
+        # overwrite to ensure pd.NA is tested instead of np.nan
+        # https://github.com/pandas-dev/pandas/issues/30958
+
+        cmp_dtype = "int64"
+        if ser.dtype.kind == "f":
+            # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has
+            # no attribute "numpy_dtype"
+            cmp_dtype = ser.dtype.numpy_dtype  # type: ignore[union-attr]
+        elif ser.dtype.kind == "b":
+            if op_name in ["min", "max"]:
+                cmp_dtype = "bool"
+
+        # TODO: prod with integer dtypes does *not* match the result we would
+        #  get if we used object for cmp_dtype. In that cae the object result
+        #  is a large integer while the non-object case overflows and returns 0
+        alt = ser.dropna().astype(cmp_dtype)
+        if op_name == "count":
+            result = getattr(ser, op_name)()
+            expected = getattr(alt, op_name)()
+        else:
+            result = getattr(ser, op_name)(skipna=skipna)
+            expected = getattr(alt, op_name)(skipna=skipna)
+            if not skipna and ser.isna().any() and op_name not in ["any", "all"]:
+                expected = pd.NA
+        tm.assert_almost_equal(result, expected)
+
+    def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool):
+        if is_float_dtype(arr.dtype):
+            cmp_dtype = arr.dtype.name
+        elif op_name in ["mean", "median", "var", "std", "skew", "kurt", "sem"]:
+            cmp_dtype = "Float64"
+        elif op_name in ["max", "min"]:
+            cmp_dtype = arr.dtype.name
+        elif arr.dtype in ["Int64", "UInt64"]:
+            cmp_dtype = arr.dtype.name
+        elif is_signed_integer_dtype(arr.dtype):
+            # TODO: Why does Window Numpy 2.0 dtype depend on skipna?
+            cmp_dtype = (
+                "Int32"
+                if (is_platform_windows() and (not np_version_gt2 or not skipna))
+                or not IS64
+                else "Int64"
+            )
+        elif is_unsigned_integer_dtype(arr.dtype):
+            cmp_dtype = (
+                "UInt32"
+                if (is_platform_windows() and (not np_version_gt2 or not skipna))
+                or not IS64
+                else "UInt64"
+            )
+        elif arr.dtype.kind == "b":
+            if op_name in ["min", "max"]:
+                cmp_dtype = "boolean"
+            elif op_name in ["sum", "prod"]:
+                cmp_dtype = (
+                    "Int32"
+                    if (is_platform_windows() and (not np_version_gt2 or not skipna))
+                    or not IS64
+                    else "Int64"
+                )
+            else:
+                raise TypeError("not supposed to reach this")
+        else:
+            raise TypeError("not supposed to reach this")
+        return cmp_dtype
+
+    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        return True
+
+    def check_accumulate(self, ser: pd.Series, op_name: str, skipna: bool):
+        # overwrite to ensure pd.NA is tested instead of np.nan
+        # https://github.com/pandas-dev/pandas/issues/30958
+        length = 64
+        if is_windows_or_32bit:
+            # Item "ExtensionDtype" of "Union[dtype[Any], ExtensionDtype]" has
+            # no attribute "itemsize"
+            if not ser.dtype.itemsize == 8:  # type: ignore[union-attr]
+                length = 32
+
+        if ser.dtype.name.startswith("U"):
+            expected_dtype = f"UInt{length}"
+        elif ser.dtype.name.startswith("I"):
+            expected_dtype = f"Int{length}"
+        elif ser.dtype.name.startswith("F"):
+            # Incompatible types in assignment (expression has type
+            # "Union[dtype[Any], ExtensionDtype]", variable has type "str")
+            expected_dtype = ser.dtype  # type: ignore[assignment]
+        elif ser.dtype.kind == "b":
+            if op_name in ("cummin", "cummax"):
+                expected_dtype = "boolean"
+            else:
+                expected_dtype = f"Int{length}"
+
+        if expected_dtype == "Float32" and op_name == "cumprod" and skipna:
+            # TODO: xfail?
+            pytest.skip(
+                f"Float32 precision lead to large differences with op {op_name} "
+                f"and skipna={skipna}"
+            )
+
+        if op_name == "cumsum":
+            pass
+        elif op_name in ["cummax", "cummin"]:
+            expected_dtype = ser.dtype  # type: ignore[assignment]
+        elif op_name == "cumprod":
+            ser = ser[:12]
+        else:
+            raise NotImplementedError(f"{op_name} not supported")
+
+        result = getattr(ser, op_name)(skipna=skipna)
+        expected = pd.Series(
+            pd.array(
+                getattr(ser.astype("float64"), op_name)(skipna=skipna),
+                dtype="Float64",
+            )
+        )
+        expected[np.isnan(expected)] = pd.NA
+        expected = expected.astype(expected_dtype)
+        tm.assert_series_equal(result, expected)
+
+    def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request):
+        super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)
+
+
+@pytest.mark.parametrize(
+    "arr", [pd.array([True, False]), pd.array([1, 2]), pd.array([1.0, 2.0])]
+)
+def test_cast_pointwise_result_all_na_respects_original_dtype(arr):
+    # GH#62344
+    values = [pd.NA, pd.NA]
+    result = arr._cast_pointwise_result(values)
+    assert result.dtype == arr.dtype
+    assert all(x is pd.NA for x in result)
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3f619e4263df86162ef6d9dee9698aa7492d83d
--- /dev/null
+++ b/pandas/tests/extension/test_numpy.py
@@ -0,0 +1,439 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+Note: we do not bother with base.BaseIndexTests because NumpyExtensionArray
+will never be held in an Index.
+"""
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import NumpyEADtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import is_object_dtype
+from pandas.core.arrays.numpy_ import NumpyExtensionArray
+from pandas.tests.extension import base
+
+orig_assert_attr_equal = tm.assert_attr_equal
+
+
+def _assert_attr_equal(attr: str, left, right, obj: str = "Attributes"):
+    """
+    patch tm.assert_attr_equal so NumpyEADtype("object") is closed enough to
+    np.dtype("object")
+    """
+    if attr == "dtype":
+        lattr = getattr(left, "dtype", None)
+        rattr = getattr(right, "dtype", None)
+        if isinstance(lattr, NumpyEADtype) and not isinstance(rattr, NumpyEADtype):
+            left = left.astype(lattr.numpy_dtype)
+        elif isinstance(rattr, NumpyEADtype) and not isinstance(lattr, NumpyEADtype):
+            right = right.astype(rattr.numpy_dtype)
+
+    orig_assert_attr_equal(attr, left, right, obj)
+
+
+@pytest.fixture(params=["float", "object"])
+def dtype(request):
+    return NumpyEADtype(np.dtype(request.param))
+
+
+@pytest.fixture
+def allow_in_pandas(monkeypatch):
+    """
+    A monkeypatch to tells pandas to let us in.
+
+    By default, passing a NumpyExtensionArray to an index / series / frame
+    constructor will unbox that NumpyExtensionArray to an ndarray, and treat
+    it as a non-EA column. We don't want people using EAs without
+    reason.
+
+    The mechanism for this is a check against ABCNumpyExtensionArray
+    in each constructor.
+
+    But, for testing, we need to allow them in pandas. So we patch
+    the _typ of NumpyExtensionArray, so that we evade the ABCNumpyExtensionArray
+    check.
+    """
+    with monkeypatch.context() as m:
+        m.setattr(NumpyExtensionArray, "_typ", "extension")
+        m.setattr(tm.asserters, "assert_attr_equal", _assert_attr_equal)
+        yield
+
+
+@pytest.fixture
+def data(allow_in_pandas, dtype):
+    if dtype.numpy_dtype == "object":
+        arr = pd.Series([(i,) for i in range(10)])._values
+    else:
+        arr = np.arange(1, 11, dtype=dtype._dtype)
+    return NumpyExtensionArray(arr)
+
+
+@pytest.fixture
+def data_missing(allow_in_pandas, dtype):
+    if dtype.numpy_dtype == "object":
+        return NumpyExtensionArray(np.array([np.nan, (1,)], dtype=object))
+    return NumpyExtensionArray(np.array([np.nan, 1.0]))
+
+
+@pytest.fixture
+def na_cmp():
+    def cmp(a, b):
+        return np.isnan(a) and np.isnan(b)
+
+    return cmp
+
+
+@pytest.fixture
+def data_for_sorting(allow_in_pandas, dtype):
+    """Length-3 array with a known sort order.
+
+    This should be three items [B, C, A] with
+    A < B < C
+    """
+    if dtype.numpy_dtype == "object":
+        # Use an empty tuple for first element, then remove,
+        # to disable np.array's shape inference.
+        return NumpyExtensionArray(np.array([(), (2,), (3,), (1,)], dtype=object)[1:])
+    return NumpyExtensionArray(np.array([1, 2, 0]))
+
+
+@pytest.fixture
+def data_missing_for_sorting(allow_in_pandas, dtype):
+    """Length-3 array with a known sort order.
+
+    This should be three items [B, NA, A] with
+    A < B and NA missing.
+    """
+    if dtype.numpy_dtype == "object":
+        return NumpyExtensionArray(np.array([(1,), np.nan, (0,)], dtype=object))
+    return NumpyExtensionArray(np.array([1, np.nan, 0]))
+
+
+@pytest.fixture
+def data_for_grouping(allow_in_pandas, dtype):
+    """Data for factorization, grouping, and unique tests.
+
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+
+    Where A < B < C and NA is missing
+    """
+    if dtype.numpy_dtype == "object":
+        a, b, c = (1,), (2,), (3,)
+    else:
+        a, b, c = np.arange(3)
+    return NumpyExtensionArray(
+        np.array([b, b, np.nan, np.nan, a, a, b, c], dtype=dtype.numpy_dtype)
+    )
+
+
+@pytest.fixture
+def data_for_twos(dtype):
+    if dtype.kind == "O":
+        pytest.skip(f"{dtype} is not a numeric dtype")
+    arr = np.ones(10) * 2
+    return NumpyExtensionArray._from_sequence(arr, dtype=dtype)
+
+
+@pytest.fixture
+def skip_numpy_object(dtype, request):
+    """
+    Tests for NumpyExtensionArray with nested data. Users typically won't create
+    these objects via `pd.array`, but they can show up through `.array`
+    on a Series with nested data. Many of the base tests fail, as they aren't
+    appropriate for nested data.
+
+    This fixture allows these tests to be skipped when used as a usefixtures
+    marker to either an individual test or a test class.
+    """
+    if dtype == "object":
+        mark = pytest.mark.xfail(reason="Fails for object dtype")
+        request.applymarker(mark)
+
+
+skip_nested = pytest.mark.usefixtures("skip_numpy_object")
+
+
+class TestNumpyExtensionArray(base.ExtensionTests):
+    @pytest.mark.skip(reason="We don't register our dtype")
+    # We don't want to register. This test should probably be split in two.
+    def test_from_dtype(self, data):
+        pass
+
+    @skip_nested
+    def test_series_constructor_scalar_with_index(self, data, dtype):
+        # ValueError: Length of passed values is 1, index implies 3.
+        super().test_series_constructor_scalar_with_index(data, dtype)
+
+    def test_check_dtype(self, data, request, using_infer_string):
+        if data.dtype.numpy_dtype == "object":
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason=f"NumpyExtensionArray expectedly clashes with a "
+                    f"NumPy name: {data.dtype.numpy_dtype}"
+                )
+            )
+        super().test_check_dtype(data)
+
+    def test_is_not_object_type(self, dtype, request):
+        if dtype.numpy_dtype == "object":
+            # Different from BaseDtypeTests.test_is_not_object_type
+            # because NumpyEADtype(object) is an object type
+            assert is_object_dtype(dtype)
+        else:
+            super().test_is_not_object_type(dtype)
+
+    @skip_nested
+    def test_getitem_scalar(self, data):
+        # AssertionError
+        super().test_getitem_scalar(data)
+
+    @skip_nested
+    def test_shift_fill_value(self, data):
+        # np.array shape inference. Shift implementation fails.
+        super().test_shift_fill_value(data)
+
+    @skip_nested
+    def test_fillna_limit_frame(self, data_missing):
+        # GH#58001
+        # The "scalar" for this array isn't a scalar.
+        super().test_fillna_limit_frame(data_missing)
+
+    @skip_nested
+    def test_fillna_limit_series(self, data_missing):
+        # GH#58001
+        # The "scalar" for this array isn't a scalar.
+        super().test_fillna_limit_series(data_missing)
+
+    @skip_nested
+    def test_fillna_copy_frame(self, data_missing):
+        # The "scalar" for this array isn't a scalar.
+        super().test_fillna_copy_frame(data_missing)
+
+    @skip_nested
+    def test_fillna_copy_series(self, data_missing):
+        # The "scalar" for this array isn't a scalar.
+        super().test_fillna_copy_series(data_missing)
+
+    @skip_nested
+    def test_searchsorted(self, data_for_sorting, as_series):
+        # TODO: NumpyExtensionArray.searchsorted calls ndarray.searchsorted which
+        #  isn't quite what we want in nested data cases. Instead we need to
+        #  adapt something like libindex._bin_search.
+        super().test_searchsorted(data_for_sorting, as_series)
+
+    @pytest.mark.xfail(reason="NumpyExtensionArray.diff may fail on dtype")
+    def test_diff(self, data, periods):
+        return super().test_diff(data, periods)
+
+    def test_insert(self, data, request):
+        if data.dtype.numpy_dtype == object:
+            mark = pytest.mark.xfail(reason="Dimension mismatch in np.concatenate")
+            request.applymarker(mark)
+
+        super().test_insert(data)
+
+    @skip_nested
+    def test_insert_invalid(self, data, invalid_scalar):
+        # NumpyExtensionArray[object] can hold anything, so skip
+        super().test_insert_invalid(data, invalid_scalar)
+
+    divmod_exc = None
+    series_scalar_exc = None
+    frame_scalar_exc = None
+    series_array_exc = None
+
+    def test_divmod(self, data):
+        divmod_exc = None
+        if data.dtype.kind == "O":
+            divmod_exc = TypeError
+        self.divmod_exc = divmod_exc
+        super().test_divmod(data)
+
+    def test_divmod_series_array(self, data):
+        ser = pd.Series(data)
+        exc = None
+        if data.dtype.kind == "O":
+            exc = TypeError
+            self.divmod_exc = exc
+        self._check_divmod_op(ser, divmod, data)
+
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators, request):
+        opname = all_arithmetic_operators
+        series_scalar_exc = None
+        if data.dtype.numpy_dtype == object:
+            if opname in ["__mul__", "__rmul__"]:
+                mark = pytest.mark.xfail(
+                    reason="the Series.combine step raises but not the Series method."
+                )
+                request.node.add_marker(mark)
+            series_scalar_exc = TypeError
+        self.series_scalar_exc = series_scalar_exc
+        super().test_arith_series_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_series_with_array(self, data, all_arithmetic_operators):
+        opname = all_arithmetic_operators
+        series_array_exc = None
+        if data.dtype.numpy_dtype == object and opname not in ["__add__", "__radd__"]:
+            series_array_exc = TypeError
+        self.series_array_exc = series_array_exc
+        super().test_arith_series_with_array(data, all_arithmetic_operators)
+
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
+        opname = all_arithmetic_operators
+        frame_scalar_exc = None
+        if data.dtype.numpy_dtype == object:
+            if opname in ["__mul__", "__rmul__"]:
+                mark = pytest.mark.xfail(
+                    reason="the Series.combine step raises but not the Series method."
+                )
+                request.node.add_marker(mark)
+            frame_scalar_exc = TypeError
+        self.frame_scalar_exc = frame_scalar_exc
+        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
+
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        if ser.dtype.kind == "O":
+            return op_name in ["sum", "min", "max", "any", "all"]
+        return True
+
+    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
+        res_op = getattr(ser, op_name)
+        # avoid coercing int -> float. Just cast to the actual numpy type.
+        # error: Item "ExtensionDtype" of "dtype[Any] | ExtensionDtype" has
+        # no attribute "numpy_dtype"
+        cmp_dtype = ser.dtype.numpy_dtype  # type: ignore[union-attr]
+        alt = ser.astype(cmp_dtype)
+        exp_op = getattr(alt, op_name)
+        if op_name == "count":
+            result = res_op()
+            expected = exp_op()
+        else:
+            result = res_op(skipna=skipna)
+            expected = exp_op(skipna=skipna)
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.skip("TODO: tests not written yet")
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_frame(self, data, all_numeric_reductions, skipna):
+        pass
+
+    @skip_nested
+    def test_fillna_series(self, data_missing):
+        # Non-scalar "scalar" values.
+        super().test_fillna_series(data_missing)
+
+    @skip_nested
+    def test_fillna_frame(self, data_missing):
+        # Non-scalar "scalar" values.
+        super().test_fillna_frame(data_missing)
+
+    @skip_nested
+    def test_fillna_readonly(self, data_missing):
+        # Non-scalar "scalar" values.
+        super().test_fillna_readonly(data_missing)
+
+    @skip_nested
+    def test_setitem_invalid(self, data, invalid_scalar):
+        # object dtype can hold anything, so doesn't raise
+        super().test_setitem_invalid(data, invalid_scalar)
+
+    @skip_nested
+    def test_setitem_sequence_broadcasts(self, data, box_in_series):
+        # ValueError: cannot set using a list-like indexer with a different
+        # length than the value
+        super().test_setitem_sequence_broadcasts(data, box_in_series)
+
+    @skip_nested
+    @pytest.mark.parametrize("setter", ["loc", None])
+    def test_setitem_mask_broadcast(self, data, setter):
+        # ValueError: cannot set using a list-like indexer with a different
+        # length than the value
+        super().test_setitem_mask_broadcast(data, setter)
+
+    @skip_nested
+    def test_setitem_scalar_key_sequence_raise(self, data):
+        # Failed: DID NOT RAISE <class 'ValueError'>
+        super().test_setitem_scalar_key_sequence_raise(data)
+
+    # TODO: there is some issue with NumpyExtensionArray, therefore,
+    #   skip the setitem test for now, and fix it later (GH 31446)
+
+    @skip_nested
+    @pytest.mark.parametrize(
+        "mask",
+        [
+            np.array([True, True, True, False, False]),
+            pd.array([True, True, True, False, False], dtype="boolean"),
+        ],
+        ids=["numpy-array", "boolean-array"],
+    )
+    def test_setitem_mask(self, data, mask, box_in_series):
+        super().test_setitem_mask(data, mask, box_in_series)
+
+    @skip_nested
+    @pytest.mark.parametrize(
+        "idx",
+        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
+        ids=["list", "integer-array", "numpy-array"],
+    )
+    def test_setitem_integer_array(self, data, idx, box_in_series):
+        super().test_setitem_integer_array(data, idx, box_in_series)
+
+    @skip_nested
+    def test_setitem_slice(self, data, box_in_series):
+        super().test_setitem_slice(data, box_in_series)
+
+    @skip_nested
+    def test_setitem_loc_iloc_slice(self, data):
+        super().test_setitem_loc_iloc_slice(data)
+
+    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer):
+        # https://github.com/pandas-dev/pandas/issues/32395
+        df = expected = pd.DataFrame({"data": pd.Series(data)})
+        result = pd.DataFrame(index=df.index)
+
+        # because result has object dtype, the attempt to do setting inplace
+        #  is successful, and object dtype is retained
+        key = full_indexer(df)
+        result.loc[key, "data"] = df["data"]
+
+        # base class method has expected = df; NumpyExtensionArray behaves oddly because
+        #  we patch _typ for these tests.
+        if data.dtype.numpy_dtype != object:
+            if not isinstance(key, slice) or key != slice(None):
+                expected = pd.DataFrame({"data": data.to_numpy()})
+        tm.assert_frame_equal(result, expected, check_column_type=False)
+
+    @pytest.mark.xfail(reason="NumpyEADtype is unpacked")
+    def test_index_from_listlike_with_dtype(self, data):
+        super().test_index_from_listlike_with_dtype(data)
+
+    @skip_nested
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_EA_types(self, engine, data, request):
+        super().test_EA_types(engine, data, request)
+
+    def test_loc_setitem_with_expansion_preserves_ea_index_dtype(self, data, request):
+        if isinstance(data[-1], tuple):
+            mark = pytest.mark.xfail(reason="Unpacks tuple")
+            request.applymarker(mark)
+        super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)
+
+
+class Test2DCompat(base.NDArrayBacked2DTests):
+    pass
diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3be4e2b4420a569b2b5249c60a570897c847474
--- /dev/null
+++ b/pandas/tests/extension/test_period.py
@@ -0,0 +1,116 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+from pandas._libs import (
+    Period,
+    iNaT,
+)
+
+from pandas.core.dtypes.dtypes import PeriodDtype
+
+import pandas._testing as tm
+from pandas.core.arrays import PeriodArray
+from pandas.tests.extension import base
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+@pytest.fixture(params=["D", "2D"])
+def dtype(request):
+    return PeriodDtype(freq=request.param)
+
+
+@pytest.fixture
+def data(dtype):
+    return PeriodArray(np.arange(1970, 1980), dtype=dtype)
+
+
+@pytest.fixture
+def data_for_sorting(dtype):
+    return PeriodArray([2018, 2019, 2017], dtype=dtype)
+
+
+@pytest.fixture
+def data_missing(dtype):
+    return PeriodArray([iNaT, 2017], dtype=dtype)
+
+
+@pytest.fixture
+def data_missing_for_sorting(dtype):
+    return PeriodArray([2018, iNaT, 2017], dtype=dtype)
+
+
+@pytest.fixture
+def data_for_grouping(dtype):
+    B = 2018
+    NA = iNaT
+    A = 2017
+    C = 2019
+    return PeriodArray([B, B, NA, NA, A, A, B, C], dtype=dtype)
+
+
+class TestPeriodArray(base.ExtensionTests):
+    def _get_expected_exception(self, op_name, obj, other):
+        if op_name in ("__sub__", "__rsub__"):
+            return None
+        return super()._get_expected_exception(op_name, obj, other)
+
+    def _supports_accumulation(self, ser, op_name: str) -> bool:
+        return op_name in ["cummin", "cummax"]
+
+    def _supports_reduction(self, obj, op_name: str) -> bool:
+        return op_name in ["min", "max", "median"]
+
+    def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool):
+        if op_name == "median":
+            res_op = getattr(ser, op_name)
+
+            alt = ser.astype("int64")
+
+            exp_op = getattr(alt, op_name)
+            result = res_op(skipna=skipna)
+            expected = exp_op(skipna=skipna)
+            # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has no
+            # attribute "freq"
+            freq = ser.dtype.freq  # type: ignore[union-attr]
+            expected = Period._from_ordinal(int(expected), freq=freq)
+            tm.assert_almost_equal(result, expected)
+
+        else:
+            return super().check_reduce(ser, op_name, skipna)
+
+    @pytest.mark.parametrize("periods", [1, -2])
+    # NOTE: RuntimeWarning on Windows(non-ARM) platforms (in CI)
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_diff(self, request, data, periods):
+        super().test_diff(data, periods)
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        result = data.map(lambda x: x, na_action=na_action)
+        tm.assert_extension_array_equal(result, data)
+
+
+class Test2DCompat(base.NDArrayBacked2DTests):
+    pass
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
new file mode 100644
index 0000000000000000000000000000000000000000..59c752cd24163b1a698a9296974e1f595a359fbf
--- /dev/null
+++ b/pandas/tests/extension/test_sparse.py
@@ -0,0 +1,517 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import SparseDtype
+import pandas._testing as tm
+from pandas.arrays import SparseArray
+from pandas.tests.extension import base
+
+
+def make_data(fill_value, n: int):
+    rng = np.random.default_rng(2)
+    if np.isnan(fill_value):
+        data = rng.uniform(size=n)
+    else:
+        data = rng.integers(1, 100, size=n, dtype=int)
+        if data[0] == data[1]:
+            data[0] += 1
+
+    data[2::3] = fill_value
+    return data
+
+
+@pytest.fixture
+def dtype():
+    return SparseDtype()
+
+
+@pytest.fixture(params=[0, np.nan])
+def data(request):
+    """Length-10 SparseArray for semantics test."""
+    res = SparseArray(make_data(request.param, 10), fill_value=request.param)
+    return res
+
+
+@pytest.fixture
+def data_for_twos():
+    return SparseArray(np.ones(10) * 2)
+
+
+@pytest.fixture(params=[0, np.nan])
+def data_missing(request):
+    """Length 2 array with [NA, Valid]"""
+    return SparseArray([np.nan, 1], fill_value=request.param)
+
+
+@pytest.fixture(params=[0, np.nan])
+def data_repeated(request):
+    """Return different versions of data for count times"""
+
+    def gen(count):
+        for _ in range(count):
+            yield SparseArray(make_data(request.param, 10), fill_value=request.param)
+
+    return gen
+
+
+@pytest.fixture(params=[0, np.nan])
+def data_for_sorting(request):
+    return SparseArray([2, 3, 1], fill_value=request.param)
+
+
+@pytest.fixture(params=[0, np.nan])
+def data_missing_for_sorting(request):
+    return SparseArray([2, np.nan, 1], fill_value=request.param)
+
+
+@pytest.fixture
+def na_cmp():
+    return lambda left, right: pd.isna(left) and pd.isna(right)
+
+
+@pytest.fixture(params=[0, np.nan])
+def data_for_grouping(request):
+    return SparseArray([1, 1, np.nan, np.nan, 2, 2, 1, 3], fill_value=request.param)
+
+
+@pytest.fixture(params=[0, np.nan])
+def data_for_compare(request):
+    return SparseArray([0, 0, np.nan, -2, -1, 4, 2, 3, 0, 0], fill_value=request.param)
+
+
+class TestSparseArray(base.ExtensionTests):
+    def _supports_reduction(self, obj, op_name: str) -> bool:
+        return True
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna, request):
+        if all_numeric_reductions in [
+            "prod",
+            "median",
+            "var",
+            "std",
+            "sem",
+            "skew",
+            "kurt",
+        ]:
+            mark = pytest.mark.xfail(
+                reason="This should be viable but is not implemented"
+            )
+            request.node.add_marker(mark)
+        elif (
+            all_numeric_reductions in ["sum", "max", "min", "mean"]
+            and data.dtype.kind == "f"
+            and not skipna
+        ):
+            mark = pytest.mark.xfail(reason="getting a non-nan float")
+            request.node.add_marker(mark)
+
+        super().test_reduce_series_numeric(data, all_numeric_reductions, skipna)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_frame(self, data, all_numeric_reductions, skipna, request):
+        if all_numeric_reductions in [
+            "prod",
+            "median",
+            "var",
+            "std",
+            "sem",
+            "skew",
+            "kurt",
+        ]:
+            mark = pytest.mark.xfail(
+                reason="This should be viable but is not implemented"
+            )
+            request.node.add_marker(mark)
+        elif (
+            all_numeric_reductions in ["sum", "max", "min", "mean"]
+            and data.dtype.kind == "f"
+            and not skipna
+        ):
+            mark = pytest.mark.xfail(reason="ExtensionArray NA mask are different")
+            request.node.add_marker(mark)
+
+        super().test_reduce_frame(data, all_numeric_reductions, skipna)
+
+    def _check_unsupported(self, data):
+        if data.dtype == SparseDtype(int, 0):
+            pytest.skip("Can't store nan in int array.")
+
+    def test_concat_mixed_dtypes(self, data):
+        # https://github.com/pandas-dev/pandas/issues/20762
+        # This should be the same, aside from concat([sparse, float])
+        df1 = pd.DataFrame({"A": data[:3]})
+        df2 = pd.DataFrame({"A": [1, 2, 3]})
+        df3 = pd.DataFrame({"A": ["a", "b", "c"]}).astype("category")
+        dfs = [df1, df2, df3]
+
+        # dataframes
+        result = pd.concat(dfs)
+        expected = pd.concat(
+            [x.apply(lambda s: np.asarray(s).astype(object)) for x in dfs]
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            ["A", "B"],
+            pd.MultiIndex.from_tuples(
+                [("A", "a"), ("A", "b")], names=["outer", "inner"]
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("future_stack", [True, False])
+    def test_stack(self, data, columns, future_stack):
+        super().test_stack(data, columns, future_stack)
+
+    def test_concat_columns(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_concat_columns(data, na_value)
+
+    def test_concat_extension_arrays_copy_false(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_concat_extension_arrays_copy_false(data, na_value)
+
+    def test_align(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_align(data, na_value)
+
+    def test_align_frame(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_align_frame(data, na_value)
+
+    def test_align_series_frame(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_align_series_frame(data, na_value)
+
+    def test_merge(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_merge(data, na_value)
+
+    def test_get(self, data):
+        ser = pd.Series(data, index=[2 * i for i in range(len(data))])
+        if np.isnan(ser.values.fill_value):
+            assert np.isnan(ser.get(4)) and np.isnan(ser.iloc[2])
+        else:
+            assert ser.get(4) == ser.iloc[2]
+        assert ser.get(2) == ser.iloc[1]
+
+    def test_array_item_with_index(self, data, request):
+        # TODO https://github.com/pandas-dev/pandas/pull/64183
+        request.node.add_marker(pytest.mark.xfail(reason="SparseArray getitem buggy"))
+        super().test_array_item_with_index(data)
+
+    def test_reindex(self, data, na_value):
+        self._check_unsupported(data)
+        super().test_reindex(data, na_value)
+
+    def test_isna(self, data_missing):
+        sarr = SparseArray(data_missing)
+        expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
+        expected = SparseArray([True, False], dtype=expected_dtype)
+        result = sarr.isna()
+        tm.assert_sp_array_equal(result, expected)
+
+        # test isna for arr without na
+        sarr = sarr.fillna(0)
+        expected_dtype = SparseDtype(bool, pd.isna(data_missing.dtype.fill_value))
+        expected = SparseArray([False, False], fill_value=False, dtype=expected_dtype)
+        tm.assert_equal(sarr.isna(), expected)
+
+    def test_fillna_no_op_returns_copy(self, data, request):
+        super().test_fillna_no_op_returns_copy(data)
+
+    def test_fillna_readonly(self, data_missing):
+        # copy keyword is ignored by SparseArray.fillna
+        # -> copy=True vs False doesn't make a difference
+        data = data_missing.copy()
+        data._readonly = True
+
+        result = data.fillna(data_missing[1])
+        assert result[0] == data_missing[1]
+        tm.assert_extension_array_equal(data, data_missing)
+
+        # fillna(copy=False) is ignored -> so same result as above
+        result = data.fillna(data_missing[1], copy=False)
+        assert result[0] == data_missing[1]
+        tm.assert_extension_array_equal(data, data_missing)
+
+    @pytest.mark.xfail(reason="Unsupported")
+    def test_fillna_series(self, data_missing):
+        # this one looks doable.
+        # TODO: this fails bc we do not pass through data_missing. If we did,
+        #  the 0-fill case would xpass
+        super().test_fillna_series()
+
+    def test_fillna_frame(self, data_missing):
+        # Have to override to specify that fill_value will change.
+        fill_value = data_missing[1]
+
+        result = pd.DataFrame({"A": data_missing, "B": [1, 2]}).fillna(fill_value)
+
+        if pd.isna(data_missing.fill_value):
+            dtype = SparseDtype(data_missing.dtype, fill_value)
+        else:
+            dtype = data_missing.dtype
+
+        expected = pd.DataFrame(
+            {
+                "A": data_missing._from_sequence([fill_value, fill_value], dtype=dtype),
+                "B": [1, 2],
+            }
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_fillna_limit_frame(self, data_missing):
+        # GH#58001
+        with pytest.raises(ValueError, match="limit must be None"):
+            super().test_fillna_limit_frame(data_missing)
+
+    def test_fillna_limit_series(self, data_missing):
+        # GH#58001
+        with pytest.raises(ValueError, match="limit must be None"):
+            super().test_fillna_limit_frame(data_missing)
+
+    _combine_le_expected_dtype = "Sparse[bool]"
+
+    def test_fillna_copy_frame(self, data_missing):
+        arr = data_missing.take([1, 1])
+        df = pd.DataFrame({"A": arr}, copy=False)
+
+        filled_val = df.iloc[0, 0]
+        result = df.fillna(filled_val)
+
+        if hasattr(df._mgr, "blocks"):
+            assert df.values.base is result.values.base
+        assert df.A._values.to_dense() is arr.to_dense()
+
+    def test_fillna_copy_series(self, data_missing):
+        arr = data_missing.take([1, 1])
+        ser = pd.Series(arr, copy=False)
+
+        filled_val = ser[0]
+        result = ser.fillna(filled_val)
+
+        assert ser._values is result._values
+        assert ser._values.to_dense() is arr.to_dense()
+
+    @pytest.mark.xfail(reason="Not Applicable")
+    def test_fillna_length_mismatch(self, data_missing):
+        super().test_fillna_length_mismatch(data_missing)
+
+    def test_where_series(self, data, na_value):
+        assert data[0] != data[1]
+        cls = type(data)
+        a, b = data[:2]
+
+        ser = pd.Series(cls._from_sequence([a, a, b, b], dtype=data.dtype))
+
+        cond = np.array([True, True, False, False])
+        result = ser.where(cond)
+
+        new_dtype = SparseDtype("float", 0.0)
+        expected = pd.Series(
+            cls._from_sequence([a, a, na_value, na_value], dtype=new_dtype)
+        )
+        tm.assert_series_equal(result, expected)
+
+        other = cls._from_sequence([a, b, a, b], dtype=data.dtype)
+        cond = np.array([True, False, True, True])
+        result = ser.where(cond, other)
+        expected = pd.Series(cls._from_sequence([a, b, b, b], dtype=data.dtype))
+        tm.assert_series_equal(result, expected)
+
+    def test_searchsorted(self, performance_warning, data_for_sorting, as_series):
+        with tm.assert_produces_warning(performance_warning, check_stacklevel=False):
+            super().test_searchsorted(data_for_sorting, as_series)
+
+    def test_shift_0_periods(self, data):
+        # GH#33856 shifting with periods=0 should return a copy, not same obj
+        result = data.shift(0)
+
+        data._sparse_values[0] = data._sparse_values[1]
+        assert result._sparse_values[0] != result._sparse_values[1]
+
+    @pytest.mark.parametrize("method", ["argmax", "argmin"])
+    def test_argmin_argmax_all_na(self, method, data, na_value):
+        # overriding because Sparse[int64, 0] cannot handle na_value
+        self._check_unsupported(data)
+        super().test_argmin_argmax_all_na(method, data, na_value)
+
+    @pytest.mark.fails_arm_wheels
+    @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
+    def test_equals(self, data, na_value, as_series, box):
+        self._check_unsupported(data)
+        super().test_equals(data, na_value, as_series, box)
+
+    @pytest.mark.fails_arm_wheels
+    def test_equals_same_data_different_object(self, data):
+        super().test_equals_same_data_different_object(data)
+
+    @pytest.mark.parametrize(
+        "func, na_action, expected",
+        [
+            (lambda x: x, None, SparseArray([1.0, np.nan])),
+            (lambda x: x, "ignore", SparseArray([1.0, np.nan])),
+            (str, None, SparseArray(["1.0", "nan"], fill_value="nan")),
+            (str, "ignore", SparseArray(["1.0", np.nan])),
+        ],
+    )
+    def test_map(self, func, na_action, expected):
+        # GH52096
+        data = SparseArray([1, np.nan])
+        result = data.map(func, na_action=na_action)
+        tm.assert_extension_array_equal(result, expected)
+
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map_raises(self, data, na_action):
+        # GH52096
+        msg = "fill value in the sparse values not supported"
+        with pytest.raises(ValueError, match=msg):
+            data.map(lambda x: np.nan, na_action=na_action)
+
+    @pytest.mark.xfail(raises=TypeError, reason="no sparse StringDtype")
+    def test_astype_string(self, data, nullable_string_dtype):
+        # TODO: this fails bc we do not pass through nullable_string_dtype;
+        #  If we did, the 0-cases would xpass
+        super().test_astype_string(data)
+
+    series_scalar_exc = None
+    frame_scalar_exc = None
+    divmod_exc = None
+    series_array_exc = None
+
+    def _skip_if_different_combine(self, data):
+        if data.fill_value == 0:
+            # arith ops call on dtype.fill_value so that the sparsity
+            # is maintained. Combine can't be called on a dtype in
+            # general, so we can't make the expected. This is tested elsewhere
+            pytest.skip("Incorrected expected from Series.combine and tested elsewhere")
+
+    def test_arith_series_with_scalar(self, data, all_arithmetic_operators):
+        self._skip_if_different_combine(data)
+        super().test_arith_series_with_scalar(data, all_arithmetic_operators)
+
+    def test_arith_series_with_array(self, data, all_arithmetic_operators):
+        self._skip_if_different_combine(data)
+        super().test_arith_series_with_array(data, all_arithmetic_operators)
+
+    def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request):
+        if data.dtype.fill_value != 0:
+            pass
+        elif all_arithmetic_operators.strip("_") not in [
+            "mul",
+            "rmul",
+            "floordiv",
+            "rfloordiv",
+            "truediv",
+            "rtruediv",
+            "pow",
+            "mod",
+            "rmod",
+        ]:
+            mark = pytest.mark.xfail(reason="result dtype.fill_value mismatch")
+            request.applymarker(mark)
+        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)
+
+    def _compare_other(
+        self, ser: pd.Series, data_for_compare: SparseArray, comparison_op, other
+    ):
+        op = comparison_op
+
+        result = op(data_for_compare, other)
+        if isinstance(other, pd.Series):
+            assert isinstance(result, pd.Series)
+            assert isinstance(result.dtype, SparseDtype)
+        else:
+            assert isinstance(result, SparseArray)
+        assert result.dtype.subtype == np.bool_
+
+        if isinstance(other, pd.Series):
+            fill_value = op(data_for_compare.fill_value, other._values.fill_value)
+            expected = SparseArray(
+                op(data_for_compare.to_dense(), np.asarray(other)),
+                fill_value=fill_value,
+                dtype=np.bool_,
+            )
+
+        else:
+            fill_value = np.all(
+                op(np.asarray(data_for_compare.fill_value), np.asarray(other))
+            )
+
+            expected = SparseArray(
+                op(data_for_compare.to_dense(), np.asarray(other)),
+                fill_value=fill_value,
+                dtype=np.bool_,
+            )
+        if isinstance(other, pd.Series):
+            # error: Incompatible types in assignment
+            expected = pd.Series(expected)  # type: ignore[assignment]
+        tm.assert_equal(result, expected)
+
+    def test_scalar(self, data_for_compare: SparseArray, comparison_op):
+        ser = pd.Series(data_for_compare)
+        self._compare_other(ser, data_for_compare, comparison_op, 0)
+        self._compare_other(ser, data_for_compare, comparison_op, 1)
+        self._compare_other(ser, data_for_compare, comparison_op, -1)
+        self._compare_other(ser, data_for_compare, comparison_op, np.nan)
+
+    def test_array(self, data_for_compare: SparseArray, comparison_op, request):
+        if data_for_compare.dtype.fill_value == 0 and comparison_op.__name__ in [
+            "eq",
+            "ge",
+            "le",
+        ]:
+            mark = pytest.mark.xfail(reason="Wrong fill_value")
+            request.applymarker(mark)
+
+        arr = np.linspace(-4, 5, 10)
+        ser = pd.Series(data_for_compare)
+        self._compare_other(ser, data_for_compare, comparison_op, arr)
+
+    def test_sparse_array(self, data_for_compare: SparseArray, comparison_op, request):
+        if data_for_compare.dtype.fill_value == 0 and comparison_op.__name__ != "gt":
+            mark = pytest.mark.xfail(reason="Wrong fill_value")
+            request.applymarker(mark)
+
+        ser = pd.Series(data_for_compare)
+        arr = data_for_compare + 1
+        self._compare_other(ser, data_for_compare, comparison_op, arr)
+        arr = data_for_compare * 2
+        self._compare_other(ser, data_for_compare, comparison_op, arr)
+
+    @pytest.mark.xfail(reason="Different repr")
+    def test_array_repr(self, data, size):
+        super().test_array_repr(data, size)
+
+    @pytest.mark.xfail(reason="result does not match expected")
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping):
+        super().test_groupby_extension_agg(as_index, data_for_grouping)
+
+
+def test_array_type_with_arg(dtype):
+    assert dtype.construct_array_type() is SparseArray
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
new file mode 100644
index 0000000000000000000000000000000000000000..07c957beef6522267e393bbfc27e02fcfb487d3c
--- /dev/null
+++ b/pandas/tests/extension/test_string.py
@@ -0,0 +1,308 @@
+"""
+This file contains a minimal set of tests for compliance with the extension
+array interface test suite, and should contain no other tests.
+The test suite for the full functionality of the array is located in
+`pandas/tests/arrays/`.
+
+The tests in this file are inherited from the BaseExtensionTests, and only
+minimal tweaks should be applied to get the tests passing (by overwriting a
+parent method).
+
+Additional tests should either be added to one of the BaseExtensionTests
+classes (if they are relevant for the extension interface for all dtypes), or
+be added to the array-specific tests in `pandas/tests/arrays/`.
+
+"""
+
+from __future__ import annotations
+
+import string
+from typing import cast
+
+import numpy as np
+import pytest
+
+from pandas.compat import HAS_PYARROW
+
+from pandas.core.dtypes.base import StorageExtensionDtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import is_string_dtype
+from pandas.core.arrays import ArrowStringArray
+from pandas.core.arrays.string_ import StringDtype
+from pandas.tests.arithmetic.test_string import string_dtype_highest_priority
+from pandas.tests.extension import base
+
+
+def maybe_split_array(arr, chunked):
+    if not chunked:
+        return arr
+    elif arr.dtype.storage != "pyarrow":
+        return arr
+
+    pa = pytest.importorskip("pyarrow")
+
+    arrow_array = arr._pa_array
+    split = len(arrow_array) // 2
+    arrow_array = pa.chunked_array(
+        [*arrow_array[:split].chunks, *arrow_array[split:].chunks]
+    )
+    assert arrow_array.num_chunks == 2
+    return arr._from_pyarrow_array(arrow_array)
+
+
+@pytest.fixture(params=[True, False])
+def chunked(request):
+    return request.param
+
+
+@pytest.fixture
+def dtype(string_dtype_arguments):
+    storage, na_value = string_dtype_arguments
+    return StringDtype(storage=storage, na_value=na_value)
+
+
+@pytest.fixture
+def data(dtype, chunked):
+    strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=10)
+    while strings[0] == strings[1]:
+        strings = np.random.default_rng(2).choice(list(string.ascii_letters), size=10)
+
+    arr = dtype.construct_array_type()._from_sequence(strings, dtype=dtype)
+    return maybe_split_array(arr, chunked)
+
+
+@pytest.fixture
+def data_missing(dtype, chunked):
+    """Length 2 array with [NA, Valid]"""
+    arr = dtype.construct_array_type()._from_sequence([pd.NA, "A"], dtype=dtype)
+    return maybe_split_array(arr, chunked)
+
+
+@pytest.fixture
+def data_for_sorting(dtype, chunked):
+    arr = dtype.construct_array_type()._from_sequence(["B", "C", "A"], dtype=dtype)
+    return maybe_split_array(arr, chunked)
+
+
+@pytest.fixture
+def data_missing_for_sorting(dtype, chunked):
+    arr = dtype.construct_array_type()._from_sequence(["B", pd.NA, "A"], dtype=dtype)
+    return maybe_split_array(arr, chunked)
+
+
+@pytest.fixture
+def data_for_grouping(dtype, chunked):
+    arr = dtype.construct_array_type()._from_sequence(
+        ["B", "B", pd.NA, pd.NA, "A", "A", "B", "C"], dtype=dtype
+    )
+    return maybe_split_array(arr, chunked)
+
+
+class TestStringArray(base.ExtensionTests):
+    def test_combine_le(self, data_repeated):
+        dtype = next(iter(data_repeated(2))).dtype
+        if dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
+            self._combine_le_expected_dtype = "bool[pyarrow]"
+        else:
+            self._combine_le_expected_dtype = "bool"
+        return super().test_combine_le(data_repeated)
+
+    def test_eq_with_str(self, dtype):
+        super().test_eq_with_str(dtype)
+
+        if dtype.na_value is pd.NA:
+            # only the NA-variant supports parametrized string alias
+            assert dtype == f"string[{dtype.storage}]"
+        elif dtype.storage == "pyarrow":
+            assert dtype == "str"
+
+    def test_is_not_string_type(self, dtype):
+        # Different from BaseDtypeTests.test_is_not_string_type
+        # because StringDtype is a string type
+        assert is_string_dtype(dtype)
+
+    def test_is_dtype_from_name(self, dtype, using_infer_string):
+        if dtype.na_value is np.nan and not using_infer_string:
+            result = type(dtype).is_dtype(dtype.name)
+            assert result is False
+        else:
+            super().test_is_dtype_from_name(dtype)
+
+    def test_construct_from_string_own_name(self, dtype, using_infer_string):
+        if dtype.na_value is np.nan and not using_infer_string:
+            with pytest.raises(TypeError, match="Cannot construct a 'StringDtype'"):
+                dtype.construct_from_string(dtype.name)
+        else:
+            super().test_construct_from_string_own_name(dtype)
+
+    def test_view(self, data):
+        if data.dtype.storage == "pyarrow":
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
+        super().test_view(data)
+
+    def test_from_dtype(self, data):
+        # base test uses string representation of dtype
+        pass
+
+    def test_transpose(self, data):
+        if data.dtype.storage == "pyarrow":
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
+        super().test_transpose(data)
+
+    def test_setitem_preserves_views(self, data):
+        if data.dtype.storage == "pyarrow":
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
+        super().test_setitem_preserves_views(data)
+
+    def test_dropna_array(self, data_missing):
+        result = data_missing.dropna()
+        expected = data_missing[[1]]
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_fillna_no_op_returns_copy(self, data):
+        data = data[~data.isna()]
+
+        valid = data[0]
+        result = data.fillna(valid)
+        assert result is not data
+        tm.assert_extension_array_equal(result, data)
+
+    def test_fillna_readonly(self, data_missing):
+        data = data_missing.copy()
+        data._readonly = True
+
+        # by default fillna(copy=True), then this works fine
+        result = data.fillna(data_missing[1])
+        assert result[0] == data_missing[1]
+        tm.assert_extension_array_equal(data, data_missing)
+
+        # fillna(copy=False) is generally not honored by Arrow-backed array,
+        # but always returns new data -> same result as above
+        if data.dtype.storage == "pyarrow":
+            result = data.fillna(data_missing[1])
+            assert result[0] == data_missing[1]
+        else:
+            with pytest.raises(ValueError, match="Cannot modify read-only array"):
+                data.fillna(data_missing[1], copy=False)
+        tm.assert_extension_array_equal(data, data_missing)
+
+    def _get_expected_exception(
+        self, op_name: str, obj, other
+    ) -> type[Exception] | tuple[type[Exception], ...] | None:
+        if op_name in [
+            "__mod__",
+            "__rmod__",
+            "__divmod__",
+            "__rdivmod__",
+            "__pow__",
+            "__rpow__",
+        ]:
+            return TypeError
+        elif op_name in ["__mul__", "__rmul__"]:
+            # Can only multiply strings by integers
+            return TypeError
+        elif op_name in [
+            "__truediv__",
+            "__rtruediv__",
+            "__floordiv__",
+            "__rfloordiv__",
+            "__sub__",
+            "__rsub__",
+        ]:
+            return TypeError
+
+        return None
+
+    def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool:
+        return op_name in ["min", "max", "sum"] or (
+            ser.dtype.na_value is np.nan  # type: ignore[union-attr]
+            and op_name in ("any", "all")
+        )
+
+    def _supports_accumulation(self, ser: pd.Series, op_name: str) -> bool:
+        assert isinstance(ser.dtype, StorageExtensionDtype)
+        return op_name in ["cummin", "cummax", "cumsum"]
+
+    def _cast_pointwise_result(self, op_name: str, obj, other, pointwise_result):
+        dtype = cast(StringDtype, tm.get_dtype(obj))
+        if op_name in ["__add__", "__radd__"]:
+            cast_to = dtype
+            dtype_other = tm.get_dtype(other) if not isinstance(other, str) else None
+            if isinstance(dtype_other, StringDtype):
+                cast_to = string_dtype_highest_priority(dtype, dtype_other)
+        elif dtype.na_value is np.nan:
+            cast_to = np.bool_  # type: ignore[assignment]
+        elif dtype.storage == "pyarrow":
+            cast_to = "bool[pyarrow]"  # type: ignore[assignment]
+        else:
+            cast_to = "boolean"  # type: ignore[assignment]
+        return pointwise_result.astype(cast_to)
+
+    def test_compare_scalar(self, data, comparison_op):
+        ser = pd.Series(data)
+        self._compare_other(ser, data, comparison_op, "abc")
+
+    def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
+        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
+
+    def test_combine_add(self, data_repeated, using_infer_string, request):
+        dtype = next(data_repeated(1)).dtype
+        if not using_infer_string and dtype.storage == "python":
+            mark = pytest.mark.xfail(
+                reason="The pointwise operation result will be inferred to "
+                "string[nan, pyarrow], which does not match the input dtype"
+            )
+            request.applymarker(mark)
+        super().test_combine_add(data_repeated)
+
+    def test_arith_series_with_array(
+        self, data, all_arithmetic_operators, using_infer_string, request
+    ):
+        dtype = data.dtype
+        if (
+            using_infer_string
+            and all_arithmetic_operators == "__radd__"
+            and dtype.na_value is pd.NA
+            and (HAS_PYARROW or dtype.storage == "pyarrow")
+        ):
+            # TODO(infer_string)
+            mark = pytest.mark.xfail(
+                reason="The pointwise operation result will be inferred to "
+                "string[nan, pyarrow], which does not match the input dtype"
+            )
+            request.applymarker(mark)
+        super().test_arith_series_with_array(data, all_arithmetic_operators)
+
+    def test_loc_setitem_with_expansion_preserves_ea_index_dtype(
+        self, data, request, using_infer_string
+    ):
+        if not using_infer_string and data.dtype.storage == "python":
+            mark = pytest.mark.xfail(reason="Casts to object")
+            request.applymarker(mark)
+        super().test_loc_setitem_with_expansion_preserves_ea_index_dtype(data)
+
+
+class Test2DCompat(base.Dim2CompatTests):
+    @pytest.fixture(autouse=True)
+    def arrow_not_supported(self, data):
+        if isinstance(data, ArrowStringArray):
+            pytest.skip(reason="2D support not implemented for ArrowStringArray")
+
+
+def test_searchsorted_with_na_raises(data_for_sorting, as_series):
+    # GH50447
+    b, c, a = data_for_sorting
+    arr = data_for_sorting.take([2, 0, 1])  # to get [a, b, c]
+    arr[-1] = pd.NA
+
+    if as_series:
+        arr = pd.Series(arr)
+
+    msg = (
+        "searchsorted requires array to be sorted, "
+        "which is impossible with NAs present."
+    )
+    with pytest.raises(ValueError, match=msg):
+        arr.searchsorted(b)
diff --git a/pandas/tests/frame/__init__.py b/pandas/tests/frame/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc41d7907a240f0dd9dc19e0ae1296bee86be421
--- /dev/null
+++ b/pandas/tests/frame/common.py
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from pandas import (
+    DataFrame,
+    concat,
+)
+
+if TYPE_CHECKING:
+    from pandas._typing import AxisInt
+
+
+def _check_mixed_float(df, dtype=None):
+    # float16 are most likely to be upcasted to float32
+    dtypes = {"A": "float32", "B": "float32", "C": "float16", "D": "float64"}
+    if isinstance(dtype, str):
+        dtypes = {k: dtype for k, v in dtypes.items()}
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    if dtypes.get("A"):
+        assert df.dtypes["A"] == dtypes["A"]
+    if dtypes.get("B"):
+        assert df.dtypes["B"] == dtypes["B"]
+    if dtypes.get("C"):
+        assert df.dtypes["C"] == dtypes["C"]
+    if dtypes.get("D"):
+        assert df.dtypes["D"] == dtypes["D"]
+
+
+def _check_mixed_int(df, dtype=None):
+    dtypes = {"A": "int32", "B": "uint64", "C": "uint8", "D": "int64"}
+    if isinstance(dtype, str):
+        dtypes = {k: dtype for k, v in dtypes.items()}
+    elif isinstance(dtype, dict):
+        dtypes.update(dtype)
+    if dtypes.get("A"):
+        assert df.dtypes["A"] == dtypes["A"]
+    if dtypes.get("B"):
+        assert df.dtypes["B"] == dtypes["B"]
+    if dtypes.get("C"):
+        assert df.dtypes["C"] == dtypes["C"]
+    if dtypes.get("D"):
+        assert df.dtypes["D"] == dtypes["D"]
+
+
+def zip_frames(frames: list[DataFrame], axis: AxisInt = 1) -> DataFrame:
+    """
+    take a list of frames, zip them together under the
+    assumption that these all have the first frames' index/columns.
+
+    Returns
+    -------
+    new_frame : DataFrame
+    """
+    if axis == 1:
+        columns = frames[0].columns
+        zipped = [f.loc[:, c] for c in columns for f in frames]
+        return concat(zipped, axis=1)
+    else:
+        index = frames[0].index
+        zipped = [f.loc[i, :] for i in index for f in frames]
+        return DataFrame(zipped)
diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..73b8f08957687a8b4af0d582a93968c1514c96c3
--- /dev/null
+++ b/pandas/tests/frame/conftest.py
@@ -0,0 +1,100 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    NaT,
+    date_range,
+)
+
+
+@pytest.fixture
+def datetime_frame() -> DataFrame:
+    """
+    Fixture for DataFrame of floats with DatetimeIndex
+
+    Columns are ['A', 'B', 'C', 'D']
+    """
+    return DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD")),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+
+
+@pytest.fixture
+def float_string_frame():
+    """
+    Fixture for DataFrame of floats and strings with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D', 'foo'].
+    """
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((30, 4)),
+        index=Index([f"foo_{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD")),
+    )
+    df["foo"] = "bar"
+    return df
+
+
+@pytest.fixture
+def mixed_float_frame():
+    """
+    Fixture for DataFrame of different float types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    df = DataFrame(
+        {
+            col: np.random.default_rng(2).random(30, dtype=dtype)
+            for col, dtype in zip(
+                list("ABCD"), ["float32", "float32", "float32", "float64"]
+            )
+        },
+        index=Index([f"foo_{i}" for i in range(30)], dtype=object),
+    )
+    # not supported by numpy random
+    df["C"] = df["C"].astype("float16")
+    return df
+
+
+@pytest.fixture
+def mixed_int_frame():
+    """
+    Fixture for DataFrame of different int types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    return DataFrame(
+        {
+            col: np.ones(30, dtype=dtype)
+            for col, dtype in zip(list("ABCD"), ["int32", "uint64", "uint8", "int64"])
+        },
+        index=Index([f"foo_{i}" for i in range(30)], dtype=object),
+    )
+
+
+@pytest.fixture
+def timezone_frame():
+    """
+    Fixture for DataFrame of date_range Series with different time zones
+
+    Columns are ['A', 'B', 'C']; some entries are missing
+
+               A                         B                         C
+    0 2013-01-01 2013-01-01 00:00:00-05:00 2013-01-01 00:00:00+01:00
+    1 2013-01-02                       NaT                       NaT
+    2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00
+    """
+    df = DataFrame(
+        {
+            "A": date_range("20130101", periods=3, unit="ns"),
+            "B": date_range("20130101", periods=3, tz="US/Eastern", unit="ns"),
+            "C": date_range("20130101", periods=3, tz="CET", unit="ns"),
+        }
+    )
+    df.iloc[1, 1] = NaT
+    df.iloc[1, 2] = NaT
+    return df
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4c16b94fcf8b1ee918dce8a9084e56d00225e7b
--- /dev/null
+++ b/pandas/tests/frame/test_alter_axes.py
@@ -0,0 +1,31 @@
+from datetime import (
+    datetime,
+    timezone,
+)
+
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+class TestDataFrameAlterAxes:
+    # Tests for setting index/columns attributes directly (i.e. __setattr__)
+
+    def test_set_axis_setattr_index(self):
+        # GH 6785
+        # set the index manually
+
+        df = DataFrame([{"ts": datetime(2014, 4, 1, tzinfo=timezone.utc), "foo": 1}])
+        expected = df.set_index("ts")
+        df.index = df["ts"]
+        df.pop("ts")
+        tm.assert_frame_equal(df, expected)
+
+    # Renaming
+
+    def test_assign_columns(self, float_frame):
+        float_frame["hi"] = "there"
+
+        df = float_frame.copy()
+        df.columns = ["foo", "bar", "baz", "quux", "foo2"]
+        tm.assert_series_equal(float_frame["C"], df["baz"], check_names=False)
+        tm.assert_series_equal(float_frame["hi"], df["foo2"], check_names=False)
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..f54e7605528254fa18c9267057ee4eff0e2977c8
--- /dev/null
+++ b/pandas/tests/frame/test_api.py
@@ -0,0 +1,408 @@
+from copy import deepcopy
+import inspect
+import pydoc
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+from pandas._config.config import option_context
+
+from pandas.compat import HAS_PYARROW
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+
+
+class TestDataFrameMisc:
+    def test_getitem_pop_assign_name(self, float_frame):
+        s = float_frame["A"]
+        assert s.name == "A"
+
+        s = float_frame.pop("A")
+        assert s.name == "A"
+
+        s = float_frame.loc[:, "B"]
+        assert s.name == "B"
+
+        s2 = s.loc[:]
+        assert s2.name == "B"
+
+    def test_get_axis(self, float_frame):
+        f = float_frame
+        assert f._get_axis_number(0) == 0
+        assert f._get_axis_number(1) == 1
+        assert f._get_axis_number("index") == 0
+        assert f._get_axis_number("rows") == 0
+        assert f._get_axis_number("columns") == 1
+
+        assert f._get_axis_name(0) == "index"
+        assert f._get_axis_name(1) == "columns"
+        assert f._get_axis_name("index") == "index"
+        assert f._get_axis_name("rows") == "index"
+        assert f._get_axis_name("columns") == "columns"
+
+        assert f._get_axis(0) is f.index
+        assert f._get_axis(1) is f.columns
+
+        with pytest.raises(ValueError, match="No axis named"):
+            f._get_axis_number(2)
+
+        with pytest.raises(ValueError, match="No axis.*foo"):
+            f._get_axis_name("foo")
+
+        with pytest.raises(ValueError, match="No axis.*None"):
+            f._get_axis_name(None)
+
+        with pytest.raises(ValueError, match="No axis named"):
+            f._get_axis_number(None)
+
+    def test_column_contains_raises(self, float_frame):
+        with pytest.raises(TypeError, match="unhashable type: 'Index'"):
+            float_frame.columns in float_frame
+
+    def test_tab_completion(self):
+        # DataFrame whose columns are identifiers shall have them in __dir__.
+        df = DataFrame([list("abcd"), list("efgh")], columns=list("ABCD"))
+        for key in list("ABCD"):
+            assert key in dir(df)
+        assert isinstance(df.__getitem__("A"), Series)
+
+        # DataFrame whose first-level columns are identifiers shall have
+        # them in __dir__.
+        df = DataFrame(
+            [list("abcd"), list("efgh")],
+            columns=pd.MultiIndex.from_tuples(list(zip("ABCD", "EFGH"))),
+        )
+        for key in list("ABCD"):
+            assert key in dir(df)
+        for key in list("EFGH"):
+            assert key not in dir(df)
+        assert isinstance(df.__getitem__("A"), DataFrame)
+
+    def test_display_max_dir_items(self):
+        # display.max_dir_items increases the number of columns that are in __dir__.
+        columns = ["a" + str(i) for i in range(420)]
+        values = [range(420), range(420)]
+        df = DataFrame(values, columns=columns)
+
+        # The default value for display.max_dir_items is 100
+        assert "a99" in dir(df)
+        assert "a100" not in dir(df)
+
+        with option_context("display.max_dir_items", 300):
+            df = DataFrame(values, columns=columns)
+            assert "a299" in dir(df)
+            assert "a300" not in dir(df)
+
+        with option_context("display.max_dir_items", None):
+            df = DataFrame(values, columns=columns)
+            assert "a419" in dir(df)
+
+    def test_not_hashable(self):
+        empty_frame = DataFrame()
+
+        df = DataFrame([1])
+        msg = "unhashable type: 'DataFrame'"
+        with pytest.raises(TypeError, match=msg):
+            hash(df)
+        with pytest.raises(TypeError, match=msg):
+            hash(empty_frame)
+
+    @pytest.mark.xfail(
+        using_string_dtype() and HAS_PYARROW, reason="surrogates not allowed"
+    )
+    def test_column_name_contains_unicode_surrogate(self):
+        # GH 25509
+        colname = "\ud83d"
+        df = DataFrame({colname: []})
+        # this should not crash
+        assert colname not in dir(df)
+        assert df.columns[0] == colname
+
+    def test_new_empty_index(self):
+        df1 = DataFrame(np.random.default_rng(2).standard_normal((0, 3)))
+        df2 = DataFrame(np.random.default_rng(2).standard_normal((0, 3)))
+        df1.index.name = "foo"
+        assert df2.index.name is None
+
+    def test_get_agg_axis(self, float_frame):
+        cols = float_frame._get_agg_axis(0)
+        assert cols is float_frame.columns
+
+        idx = float_frame._get_agg_axis(1)
+        assert idx is float_frame.index
+
+        msg = r"Axis must be 0 or 1 \(got 2\)"
+        with pytest.raises(ValueError, match=msg):
+            float_frame._get_agg_axis(2)
+
+    def test_empty(self, float_frame, float_string_frame):
+        empty_frame = DataFrame()
+        assert empty_frame.empty
+
+        assert not float_frame.empty
+        assert not float_string_frame.empty
+
+        # corner case
+        df = DataFrame({"A": [1.0, 2.0, 3.0], "B": ["a", "b", "c"]}, index=np.arange(3))
+        del df["A"]
+        assert not df.empty
+
+    def test_len(self, float_frame):
+        assert len(float_frame) == len(float_frame.index)
+
+        # single block corner case
+        arr = float_frame[["A", "B"]].values
+        expected = float_frame.reindex(columns=["A", "B"]).values
+        tm.assert_almost_equal(arr, expected)
+
+    def test_axis_aliases(self, float_frame):
+        f = float_frame
+
+        # reg name
+        expected = f.sum(axis=0)
+        result = f.sum(axis="index")
+        tm.assert_series_equal(result, expected)
+
+        expected = f.sum(axis=1)
+        result = f.sum(axis="columns")
+        tm.assert_series_equal(result, expected)
+
+    def test_class_axis(self):
+        # GH 18147
+        # no exception and no empty docstring
+        assert pydoc.getdoc(DataFrame.index)
+        assert pydoc.getdoc(DataFrame.columns)
+
+    def test_series_put_names(self, float_string_frame):
+        series = float_string_frame._series
+        for k, v in series.items():
+            assert v.name == k
+
+    def test_empty_nonzero(self):
+        df = DataFrame([1, 2, 3])
+        assert not df.empty
+        df = DataFrame(index=[1], columns=[1])
+        assert not df.empty
+        df = DataFrame(index=["a", "b"], columns=["c", "d"]).dropna()
+        assert df.empty
+        assert df.T.empty
+
+    @pytest.mark.parametrize(
+        "df",
+        [
+            DataFrame(),
+            DataFrame(index=[1]),
+            DataFrame(columns=[1]),
+            DataFrame({1: []}),
+        ],
+    )
+    def test_empty_like(self, df):
+        assert df.empty
+        assert df.T.empty
+
+    def test_with_datetimelikes(self):
+        df = DataFrame(
+            {
+                "A": date_range("20130101", periods=10),
+                "B": timedelta_range("1 day", periods=10),
+            }
+        )
+        t = df.T
+
+        result = t.dtypes.value_counts()
+        expected = Series({np.dtype("object"): 10}, name="count")
+        tm.assert_series_equal(result, expected)
+
+    def test_deepcopy(self, float_frame):
+        cp = deepcopy(float_frame)
+        cp.loc[0, "A"] = 10
+        assert not float_frame.equals(cp)
+
+    def test_inplace_return_self(self):
+        # GH 1893
+
+        data = DataFrame(
+            {"a": ["foo", "bar", "baz", "qux"], "b": [0, 0, 1, 1], "c": [1, 2, 3, 4]}
+        )
+
+        def _check_none(base, f):
+            result = f(base)
+            assert result is None
+
+        def _check_return(base, f):
+            result = f(base)
+            assert result is base
+
+        # -----DataFrame-----
+
+        # set_index
+        f = lambda x: x.set_index("a", inplace=True)
+        _check_none(data.copy(), f)
+
+        # reset_index
+        f = lambda x: x.reset_index(inplace=True)
+        _check_none(data.set_index("a"), f)
+
+        # drop_duplicates
+        f = lambda x: x.drop_duplicates(inplace=True)
+        _check_none(data.copy(), f)
+
+        # sort
+        f = lambda x: x.sort_values("b", inplace=True)
+        _check_none(data.copy(), f)
+
+        # sort_index
+        f = lambda x: x.sort_index(inplace=True)
+        _check_none(data.copy(), f)
+
+        # fillna
+        f = lambda x: x.fillna(0, inplace=True)
+        _check_return(data.copy(), f)
+
+        # replace
+        f = lambda x: x.replace(1, 0, inplace=True)
+        _check_return(data.copy(), f)
+
+        # rename
+        f = lambda x: x.rename({1: "foo"}, inplace=True)
+        _check_none(data.copy(), f)
+
+        # -----Series-----
+        d = data.copy()["c"]
+
+        # reset_index
+        f = lambda x: x.reset_index(inplace=True, drop=True)
+        _check_none(data.set_index("a")["c"], f)
+
+        # fillna
+        f = lambda x: x.fillna(0, inplace=True)
+        _check_return(d.copy(), f)
+
+        # replace
+        f = lambda x: x.replace(1, 0, inplace=True)
+        _check_return(d.copy(), f)
+
+        # rename
+        f = lambda x: x.rename({1: "foo"}, inplace=True)
+        _check_none(d.copy(), f)
+
+    def test_tab_complete_warning(self, ip, frame_or_series):
+        # GH 16409
+        pytest.importorskip("IPython", minversion="6.0.0")
+        from IPython.core.completer import provisionalcompleter
+
+        if frame_or_series is DataFrame:
+            code = "from pandas import DataFrame; obj = DataFrame()"
+        else:
+            code = "from pandas import Series; obj = Series(dtype=object)"
+
+        ip.run_cell(code)
+        # GH 31324 newer jedi version raises Deprecation warning;
+        #  appears resolved 2021-02-02
+        with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
+            with provisionalcompleter("ignore"):
+                list(ip.Completer.completions("obj.", 1))
+
+    def test_attrs(self):
+        df = DataFrame({"A": [2, 3]})
+        assert df.attrs == {}
+        df.attrs["version"] = 1
+
+        result = df.rename(columns=str)
+        assert result.attrs == {"version": 1}
+
+    def test_attrs_is_deepcopy(self):
+        df = DataFrame({"A": [2, 3]})
+        assert df.attrs == {}
+        df.attrs["tags"] = {"spam", "ham"}
+
+        result = df.rename(columns=str)
+        assert result.attrs == df.attrs
+        assert result.attrs["tags"] is not df.attrs["tags"]
+
+    def test_attrs_concat(self):
+        # concat propagates attrs if all input attrs are equal
+        df1 = DataFrame({"A": [2, 3]})
+        df1.attrs = {"a": 1, "b": 2}
+        df2 = DataFrame({"A": [4, 5]})
+        df2.attrs = df1.attrs.copy()
+        df3 = DataFrame({"A": [6, 7]})
+        df3.attrs = df1.attrs.copy()
+        assert pd.concat([df1, df2, df3]).attrs == df1.attrs
+        # concat does not propagate attrs if input attrs are different
+        df2.attrs = {"c": 3}
+        assert pd.concat([df1, df2, df3]).attrs == {}
+
+    def test_attrs_merge(self):
+        # merge propagates attrs if all input attrs are equal
+        df1 = DataFrame({"key": ["a", "b"], "val1": [1, 2]})
+        df1.attrs = {"a": 1, "b": 2}
+        df2 = DataFrame({"key": ["a", "b"], "val2": [3, 4]})
+        df2.attrs = df1.attrs.copy()
+        assert pd.merge(df1, df2).attrs == df1.attrs
+        # merge does not propagate attrs if input attrs are different
+        df2.attrs = {"c": 3}
+        assert pd.merge(df1, df2).attrs == {}
+
+    @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
+    def test_set_flags(
+        self,
+        allows_duplicate_labels,
+        frame_or_series,
+    ):
+        obj = DataFrame({"A": [1, 2]})
+        key = (0, 0)
+        if frame_or_series is Series:
+            obj = obj["A"]
+            key = 0
+
+        result = obj.set_flags(allows_duplicate_labels=allows_duplicate_labels)
+
+        if allows_duplicate_labels is None:
+            # We don't update when it's not provided
+            assert result.flags.allows_duplicate_labels is True
+        else:
+            assert result.flags.allows_duplicate_labels is allows_duplicate_labels
+
+        # We made a copy
+        assert obj is not result
+
+        # We didn't mutate obj
+        assert obj.flags.allows_duplicate_labels is True
+
+        # But we didn't copy data
+        if frame_or_series is Series:
+            assert np.may_share_memory(obj.values, result.values)
+        else:
+            assert np.may_share_memory(obj["A"].values, result["A"].values)
+
+        result.iloc[key] = 0
+        assert obj.iloc[key] == 1
+
+        # Now we do copy.
+        result = obj.set_flags(allows_duplicate_labels=allows_duplicate_labels)
+        result.iloc[key] = 10
+        assert obj.iloc[key] == 1
+
+    def test_constructor_expanddim(self):
+        # GH#33628 accessing _constructor_expanddim should not raise NotImplementedError
+        # GH38782 pandas has no container higher than DataFrame (two-dim), so
+        # DataFrame._constructor_expand_dim, doesn't make sense, so is removed.
+        df = DataFrame()
+
+        msg = "'DataFrame' object has no attribute '_constructor_expanddim'"
+        with pytest.raises(AttributeError, match=msg):
+            df._constructor_expanddim(np.arange(27).reshape(3, 3, 3))
+
+    def test_inspect_getmembers(self):
+        # GH38740
+        df = DataFrame()
+        inspect.getmembers(df)
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
new file mode 100644
index 0000000000000000000000000000000000000000..388c28f4015f4e53c6ac42cea8945be60fc414fb
--- /dev/null
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -0,0 +1,2203 @@
+from collections import deque
+from datetime import (
+    datetime,
+    timezone,
+)
+from enum import Enum
+import functools
+import operator
+import re
+
+import numpy as np
+import pytest
+
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+)
+import pandas._testing as tm
+from pandas.core.computation import expressions as expr
+from pandas.tests.frame.common import (
+    _check_mixed_float,
+    _check_mixed_int,
+)
+from pandas.util.version import Version
+
+
+@pytest.fixture
+def simple_frame():
+    """
+    Fixture for simple 3x3 DataFrame
+
+    Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
+
+       one  two  three
+    a  1.0  2.0    3.0
+    b  4.0  5.0    6.0
+    c  7.0  8.0    9.0
+    """
+    arr = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
+
+    return DataFrame(arr, columns=["one", "two", "three"], index=["a", "b", "c"])
+
+
+@pytest.fixture(autouse=True, params=[0, 100], ids=["numexpr", "python"])
+def switch_numexpr_min_elements(request, monkeypatch):
+    with monkeypatch.context() as m:
+        m.setattr(expr, "_MIN_ELEMENTS", request.param)
+        yield request.param
+
+
+class DummyElement:
+    def __init__(self, value, dtype) -> None:
+        self.value = value
+        self.dtype = np.dtype(dtype)
+
+    def __array__(self, dtype=None, copy=None):
+        return np.array(self.value, dtype=self.dtype)
+
+    def __str__(self) -> str:
+        return f"DummyElement({self.value}, {self.dtype})"
+
+    def __repr__(self) -> str:
+        return str(self)
+
+    def astype(self, dtype, copy=False):
+        self.dtype = dtype
+        return self
+
+    def view(self, dtype):
+        return type(self)(self.value.view(dtype), dtype)
+
+    def any(self, axis=None):
+        return bool(self.value)
+
+
+# -------------------------------------------------------------------
+# Comparisons
+
+
+class TestFrameComparisons:
+    # Specifically _not_ flex-comparisons
+
+    def test_comparison_with_categorical_dtype(self):
+        # GH#12564
+
+        df = DataFrame({"A": ["foo", "bar", "baz"]})
+        exp = DataFrame({"A": [True, False, False]})
+
+        res = df == "foo"
+        tm.assert_frame_equal(res, exp)
+
+        # casting to categorical shouldn't affect the result
+        df["A"] = df["A"].astype("category")
+
+        res = df == "foo"
+        tm.assert_frame_equal(res, exp)
+
+    def test_frame_in_list(self):
+        # GH#12689 this should raise at the DataFrame level, not blocks
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((6, 4)), columns=list("ABCD")
+        )
+        msg = "The truth value of a DataFrame is ambiguous"
+        with pytest.raises(ValueError, match=msg):
+            df in [None]
+
+    @pytest.mark.parametrize(
+        "arg, arg2",
+        [
+            [
+                {
+                    "a": np.random.default_rng(2).integers(10, size=10),
+                    "b": pd.date_range("20010101", periods=10, unit="ns"),
+                },
+                {
+                    "a": np.random.default_rng(2).integers(10, size=10),
+                    "b": np.random.default_rng(2).integers(10, size=10),
+                },
+            ],
+            [
+                {
+                    "a": np.random.default_rng(2).integers(10, size=10),
+                    "b": np.random.default_rng(2).integers(10, size=10),
+                },
+                {
+                    "a": np.random.default_rng(2).integers(10, size=10),
+                    "b": pd.date_range("20010101", periods=10, unit="ns"),
+                },
+            ],
+            [
+                {
+                    "a": pd.date_range("20010101", periods=10, unit="ns"),
+                    "b": pd.date_range("20010101", periods=10, unit="ns"),
+                },
+                {
+                    "a": np.random.default_rng(2).integers(10, size=10),
+                    "b": np.random.default_rng(2).integers(10, size=10),
+                },
+            ],
+            [
+                {
+                    "a": np.random.default_rng(2).integers(10, size=10),
+                    "b": pd.date_range("20010101", periods=10, unit="ns"),
+                },
+                {
+                    "a": pd.date_range("20010101", periods=10, unit="ns"),
+                    "b": pd.date_range("20010101", periods=10, unit="ns"),
+                },
+            ],
+        ],
+    )
+    def test_comparison_invalid(self, arg, arg2):
+        # GH4968
+        # invalid date/int comparisons
+        x = DataFrame(arg)
+        y = DataFrame(arg2)
+        # we expect the result to match Series comparisons for
+        # == and !=, inequalities should raise
+        result = x == y
+        expected = DataFrame(
+            {col: x[col] == y[col] for col in x.columns},
+            index=x.index,
+            columns=x.columns,
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = x != y
+        expected = DataFrame(
+            {col: x[col] != y[col] for col in x.columns},
+            index=x.index,
+            columns=x.columns,
+        )
+        tm.assert_frame_equal(result, expected)
+
+        msgs = [
+            r"Invalid comparison between dtype=datetime64\[ns\] and ndarray",
+            "invalid type promotion",
+            (
+                # npdev 1.20.0
+                r"The DTypes <class 'numpy.dtype\[.*\]'> and "
+                r"<class 'numpy.dtype\[.*\]'> do not have a common DType."
+            ),
+        ]
+        msg = "|".join(msgs)
+        with pytest.raises(TypeError, match=msg):
+            x >= y
+        with pytest.raises(TypeError, match=msg):
+            x > y
+        with pytest.raises(TypeError, match=msg):
+            x < y
+        with pytest.raises(TypeError, match=msg):
+            x <= y
+
+    @pytest.mark.parametrize(
+        "left, right",
+        [
+            ("gt", "lt"),
+            ("lt", "gt"),
+            ("ge", "le"),
+            ("le", "ge"),
+            ("eq", "eq"),
+            ("ne", "ne"),
+        ],
+    )
+    def test_timestamp_compare(self, left, right):
+        # make sure we can compare Timestamps on the right AND left hand side
+        # GH#4982
+        df = DataFrame(
+            {
+                "dates1": pd.date_range("20010101", periods=10),
+                "dates2": pd.date_range("20010102", periods=10),
+                "intcol": np.random.default_rng(2).integers(1000000000, size=10),
+                "floatcol": np.random.default_rng(2).standard_normal(10),
+                "stringcol": [chr(100 + i) for i in range(10)],
+            }
+        )
+        df.loc[np.random.default_rng(2).random(len(df)) > 0.5, "dates2"] = pd.NaT
+        left_f = getattr(operator, left)
+        right_f = getattr(operator, right)
+
+        # no nats
+        if left in ["eq", "ne"]:
+            expected = left_f(df, pd.Timestamp("20010109"))
+            result = right_f(pd.Timestamp("20010109"), df)
+            tm.assert_frame_equal(result, expected)
+        else:
+            msg = (
+                "'(<|>)=?' not supported between "
+                "instances of 'numpy.ndarray' and 'Timestamp'"
+            )
+            with pytest.raises(TypeError, match=msg):
+                left_f(df, pd.Timestamp("20010109"))
+            with pytest.raises(TypeError, match=msg):
+                right_f(pd.Timestamp("20010109"), df)
+        # nats
+        if left in ["eq", "ne"]:
+            expected = left_f(df, pd.Timestamp("nat"))
+            result = right_f(pd.Timestamp("nat"), df)
+            tm.assert_frame_equal(result, expected)
+        else:
+            msg = (
+                "'(<|>)=?' not supported between "
+                "instances of 'numpy.ndarray' and 'NaTType'"
+            )
+            with pytest.raises(TypeError, match=msg):
+                left_f(df, pd.Timestamp("nat"))
+            with pytest.raises(TypeError, match=msg):
+                right_f(pd.Timestamp("nat"), df)
+
+    def test_mixed_comparison(self):
+        # GH#13128, GH#22163 != datetime64 vs non-dt64 should be False,
+        # not raise TypeError
+        # (this appears to be fixed before GH#22163, not sure when)
+        df = DataFrame([["1989-08-01", 1], ["1989-08-01", 2]])
+        other = DataFrame([["a", "b"], ["c", "d"]])
+
+        result = df == other
+        assert not result.any().any()
+
+        result = df != other
+        assert result.all().all()
+
+    def test_df_boolean_comparison_error(self):
+        # GH#4576, GH#22880
+        # comparing DataFrame against list/tuple with len(obj) matching
+        #  len(df.columns) is supported as of GH#22800
+        df = DataFrame(np.arange(6).reshape((3, 2)))
+
+        expected = DataFrame([[False, False], [True, False], [False, False]])
+
+        result = df == (2, 2)
+        tm.assert_frame_equal(result, expected)
+
+        result = df == [2, 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_float_none_comparison(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((8, 3)),
+            index=range(8),
+            columns=["A", "B", "C"],
+        )
+
+        result = df.__eq__(None)
+        assert not result.any().any()
+
+    def test_df_string_comparison(self):
+        df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}])
+        mask_a = df.a > 1
+        tm.assert_frame_equal(df[mask_a], df.loc[1:1, :])
+        tm.assert_frame_equal(df[-mask_a], df.loc[0:0, :])
+
+        mask_b = df.b == "foo"
+        tm.assert_frame_equal(df[mask_b], df.loc[0:0, :])
+        tm.assert_frame_equal(df[-mask_b], df.loc[1:1, :])
+
+
+class TestFrameFlexComparisons:
+    # TODO: test_bool_flex_frame needs a better name
+    def test_bool_flex_frame(self, comparison_op):
+        data = np.random.default_rng(2).standard_normal((5, 3))
+        other_data = np.random.default_rng(2).standard_normal((5, 3))
+        df = DataFrame(data)
+        other = DataFrame(other_data)
+        ndim_5 = np.ones((*df.shape, 1, 3))
+
+        # DataFrame
+        assert df.eq(df).values.all()
+        assert not df.ne(df).values.any()
+        f = getattr(df, comparison_op.__name__)
+        o = comparison_op
+        # No NAs
+        tm.assert_frame_equal(f(other), o(df, other))
+        # Unaligned
+        part_o = other.loc[3:, 1:].copy()
+        rs = f(part_o)
+        xp = o(df, part_o.reindex(index=df.index, columns=df.columns))
+        tm.assert_frame_equal(rs, xp)
+        # ndarray
+        tm.assert_frame_equal(f(other.values), o(df, other.values))
+        # scalar
+        tm.assert_frame_equal(f(0), o(df, 0))
+        # NAs
+        msg = "Unable to coerce to Series/DataFrame"
+        tm.assert_frame_equal(f(np.nan), o(df, np.nan))
+        with pytest.raises(ValueError, match=msg):
+            f(ndim_5)
+
+    @pytest.mark.parametrize("box", [np.array, Series])
+    def test_bool_flex_series(self, box):
+        # Series
+        # list/tuple
+        data = np.random.default_rng(2).standard_normal((5, 3))
+        df = DataFrame(data)
+        idx_ser = box(np.random.default_rng(2).standard_normal(5))
+        col_ser = box(np.random.default_rng(2).standard_normal(3))
+
+        idx_eq = df.eq(idx_ser, axis=0)
+        col_eq = df.eq(col_ser)
+        idx_ne = df.ne(idx_ser, axis=0)
+        col_ne = df.ne(col_ser)
+        tm.assert_frame_equal(col_eq, df == Series(col_ser))
+        tm.assert_frame_equal(col_eq, -col_ne)
+        tm.assert_frame_equal(idx_eq, -idx_ne)
+        tm.assert_frame_equal(idx_eq, df.T.eq(idx_ser).T)
+        tm.assert_frame_equal(col_eq, df.eq(list(col_ser)))
+        tm.assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0))
+        tm.assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0))
+
+        idx_gt = df.gt(idx_ser, axis=0)
+        col_gt = df.gt(col_ser)
+        idx_le = df.le(idx_ser, axis=0)
+        col_le = df.le(col_ser)
+
+        tm.assert_frame_equal(col_gt, df > Series(col_ser))
+        tm.assert_frame_equal(col_gt, -col_le)
+        tm.assert_frame_equal(idx_gt, -idx_le)
+        tm.assert_frame_equal(idx_gt, df.T.gt(idx_ser).T)
+
+        idx_ge = df.ge(idx_ser, axis=0)
+        col_ge = df.ge(col_ser)
+        idx_lt = df.lt(idx_ser, axis=0)
+        col_lt = df.lt(col_ser)
+        tm.assert_frame_equal(col_ge, df >= Series(col_ser))
+        tm.assert_frame_equal(col_ge, -col_lt)
+        tm.assert_frame_equal(idx_ge, -idx_lt)
+        tm.assert_frame_equal(idx_ge, df.T.ge(idx_ser).T)
+
+        idx_ser = Series(np.random.default_rng(2).standard_normal(5))
+        col_ser = Series(np.random.default_rng(2).standard_normal(3))
+
+    def test_bool_flex_frame_na(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        # NA
+        df.loc[0, 0] = np.nan
+        rs = df.eq(df)
+        assert not rs.loc[0, 0]
+        rs = df.ne(df)
+        assert rs.loc[0, 0]
+        rs = df.gt(df)
+        assert not rs.loc[0, 0]
+        rs = df.lt(df)
+        assert not rs.loc[0, 0]
+        rs = df.ge(df)
+        assert not rs.loc[0, 0]
+        rs = df.le(df)
+        assert not rs.loc[0, 0]
+
+    def test_bool_flex_frame_complex_dtype(self):
+        # complex
+        arr = np.array([np.nan, 1, 6, np.nan])
+        arr2 = np.array([2j, np.nan, 7, None])
+        df = DataFrame({"a": arr})
+        df2 = DataFrame({"a": arr2})
+
+        msg = "|".join(
+            [
+                "'>' not supported between instances of '.*' and 'complex'",
+                r"unorderable types: .*complex\(\)",  # PY35
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            # inequalities are not well-defined for complex numbers
+            df.gt(df2)
+        with pytest.raises(TypeError, match=msg):
+            # regression test that we get the same behavior for Series
+            df["a"].gt(df2["a"])
+        with pytest.raises(TypeError, match=msg):
+            # Check that we match numpy behavior here
+            df.values > df2.values
+
+        rs = df.ne(df2)
+        assert rs.values.all()
+
+        arr3 = np.array([2j, np.nan, None])
+        df3 = DataFrame({"a": arr3})
+
+        with pytest.raises(TypeError, match=msg):
+            # inequalities are not well-defined for complex numbers
+            df3.gt(2j)
+        with pytest.raises(TypeError, match=msg):
+            # regression test that we get the same behavior for Series
+            df3["a"].gt(2j)
+        with pytest.raises(TypeError, match=msg):
+            # Check that we match numpy behavior here
+            df3.values > 2j
+
+    def test_bool_flex_frame_object_dtype(self):
+        # corner, dtype=object
+        df1 = DataFrame({"col": ["foo", np.nan, "bar"]}, dtype=object)
+        df2 = DataFrame({"col": ["foo", datetime.now(), "bar"]}, dtype=object)
+        result = df1.ne(df2)
+        exp = DataFrame({"col": [False, True, False]})
+        tm.assert_frame_equal(result, exp)
+
+    def test_flex_comparison_nat(self):
+        # GH 15697, GH 22163 df.eq(pd.NaT) should behave like df == pd.NaT,
+        # and _definitely_ not be NaN
+        df = DataFrame([pd.NaT])
+
+        result = df == pd.NaT
+        # result.iloc[0, 0] is an np.bool_ object
+        assert result.iloc[0, 0].item() is False
+
+        result = df.eq(pd.NaT)
+        assert result.iloc[0, 0].item() is False
+
+        result = df != pd.NaT
+        assert result.iloc[0, 0].item() is True
+
+        result = df.ne(pd.NaT)
+        assert result.iloc[0, 0].item() is True
+
+    def test_df_flex_cmp_constant_return_types(self, comparison_op):
+        # GH 15077, non-empty DataFrame
+        df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
+        const = 2
+
+        result = getattr(df, comparison_op.__name__)(const).dtypes.value_counts()
+        tm.assert_series_equal(
+            result, Series([2], index=[np.dtype(bool)], name="count")
+        )
+
+    def test_df_flex_cmp_constant_return_types_empty(self, comparison_op):
+        # GH 15077 empty DataFrame
+        df = DataFrame({"x": [1, 2, 3], "y": [1.0, 2.0, 3.0]})
+        const = 2
+
+        empty = df.iloc[:0]
+        result = getattr(empty, comparison_op.__name__)(const).dtypes.value_counts()
+        tm.assert_series_equal(
+            result, Series([2], index=[np.dtype(bool)], name="count")
+        )
+
+    def test_df_flex_cmp_ea_dtype_with_ndarray_series(self):
+        ii = pd.IntervalIndex.from_breaks([1, 2, 3])
+        df = DataFrame({"A": ii, "B": ii})
+
+        ser = Series([0, 0])
+        res = df.eq(ser, axis=0)
+
+        expected = DataFrame({"A": [False, False], "B": [False, False]})
+        tm.assert_frame_equal(res, expected)
+
+        ser2 = Series([1, 2], index=["A", "B"])
+        res2 = df.eq(ser2, axis=1)
+        tm.assert_frame_equal(res2, expected)
+
+
+# -------------------------------------------------------------------
+# Arithmetic
+
+
+class TestFrameFlexArithmetic:
+    def test_floordiv_axis0(self):
+        # make sure we df.floordiv(ser, axis=0) matches column-wise result
+        arr = np.arange(3)
+        ser = Series(arr)
+        df = DataFrame({"A": ser, "B": ser})
+
+        result = df.floordiv(ser, axis=0)
+
+        expected = DataFrame({col: df[col] // ser for col in df.columns})
+
+        tm.assert_frame_equal(result, expected)
+
+        result2 = df.floordiv(ser.values, axis=0)
+        tm.assert_frame_equal(result2, expected)
+
+    def test_df_add_td64_columnwise(self):
+        # GH 22534 Check that column-wise addition broadcasts correctly
+        dti = pd.date_range("2016-01-01", periods=10)
+        tdi = pd.timedelta_range("1", periods=10)
+        tser = Series(tdi)
+        df = DataFrame({0: dti, 1: tdi})
+
+        result = df.add(tser, axis=0)
+        expected = DataFrame({0: dti + tdi, 1: tdi + tdi})
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_add_flex_filled_mixed_dtypes(self):
+        # GH 19611
+        dti = pd.date_range("2016-01-01", periods=3)
+        ser = Series(["1 Day", "NaT", "2 Days"], dtype="timedelta64[ns]")
+        df = DataFrame({"A": dti, "B": ser})
+        other = DataFrame({"A": ser, "B": ser})
+        fill = pd.Timedelta(days=1).to_timedelta64()
+        result = df.add(other, fill_value=fill)
+
+        expected = DataFrame(
+            {
+                "A": Series(
+                    ["2016-01-02", "2016-01-03", "2016-01-05"], dtype="datetime64[ns]"
+                ),
+                "B": ser * 2,
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_arith_flex_frame(
+        self, all_arithmetic_operators, float_frame, mixed_float_frame
+    ):
+        # one instance of parametrized fixture
+        op = all_arithmetic_operators
+
+        def f(x, y):
+            # r-versions not in operator-stdlib; get op without "r" and invert
+            if op.startswith("__r"):
+                return getattr(operator, op.replace("__r", "__"))(y, x)
+            return getattr(operator, op)(x, y)
+
+        result = getattr(float_frame, op)(2 * float_frame)
+        expected = f(float_frame, 2 * float_frame)
+        tm.assert_frame_equal(result, expected)
+
+        # vs mix float
+        result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
+        expected = f(mixed_float_frame, 2 * mixed_float_frame)
+        tm.assert_frame_equal(result, expected)
+        _check_mixed_float(result, dtype={"C": None})
+
+    @pytest.mark.parametrize("op", ["__add__", "__sub__", "__mul__"])
+    def test_arith_flex_frame_mixed(
+        self,
+        op,
+        int_frame,
+        mixed_int_frame,
+        mixed_float_frame,
+        switch_numexpr_min_elements,
+    ):
+        f = getattr(operator, op)
+
+        # vs mix int
+        result = getattr(mixed_int_frame, op)(2 + mixed_int_frame)
+        expected = f(mixed_int_frame, 2 + mixed_int_frame)
+
+        # no overflow in the uint
+        dtype = None
+        if op in ["__sub__"]:
+            dtype = {"B": "uint64", "C": None}
+        elif op in ["__add__", "__mul__"]:
+            dtype = {"C": None}
+        if expr.USE_NUMEXPR and switch_numexpr_min_elements == 0:
+            # when using numexpr, the casting rules are slightly different:
+            # in the `2 + mixed_int_frame` operation, int32 column becomes
+            # and int64 column (not preserving dtype in operation with Python
+            # scalar), and then the int32/int64 combo results in int64 result
+            dtype["A"] = (2 + mixed_int_frame)["A"].dtype
+        tm.assert_frame_equal(result, expected)
+        _check_mixed_int(result, dtype=dtype)
+
+        # vs mix float
+        result = getattr(mixed_float_frame, op)(2 * mixed_float_frame)
+        expected = f(mixed_float_frame, 2 * mixed_float_frame)
+        tm.assert_frame_equal(result, expected)
+        _check_mixed_float(result, dtype={"C": None})
+
+        # vs plain int
+        result = getattr(int_frame, op)(2 * int_frame)
+        expected = f(int_frame, 2 * int_frame)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dim", range(3, 6))
+    def test_arith_flex_frame_raise(self, all_arithmetic_operators, float_frame, dim):
+        # one instance of parametrized fixture
+        op = all_arithmetic_operators
+
+        # Check that arrays with dim >= 3 raise
+        arr = np.ones((1,) * dim)
+        msg = "Unable to coerce to Series/DataFrame"
+        with pytest.raises(ValueError, match=msg):
+            getattr(float_frame, op)(arr)
+
+    def test_arith_flex_frame_corner(self, float_frame):
+        const_add = float_frame.add(1)
+        tm.assert_frame_equal(const_add, float_frame + 1)
+
+        # corner cases
+        result = float_frame.add(float_frame[:0])
+        expected = float_frame.sort_index() * np.nan
+        tm.assert_frame_equal(result, expected)
+
+        result = float_frame[:0].add(float_frame)
+        expected = float_frame.sort_index() * np.nan
+        tm.assert_frame_equal(result, expected)
+
+        with pytest.raises(NotImplementedError, match="fill_value"):
+            float_frame.add(float_frame.iloc[0], fill_value=3)
+
+        with pytest.raises(NotImplementedError, match="fill_value"):
+            float_frame.add(float_frame.iloc[0], axis="index", fill_value=3)
+
+    @pytest.mark.parametrize("op", ["add", "sub", "mul", "mod"])
+    def test_arith_flex_series_ops(self, simple_frame, op):
+        # after arithmetic refactor, add truediv here
+        df = simple_frame
+
+        row = df.xs("a")
+        col = df["two"]
+        f = getattr(df, op)
+        op = getattr(operator, op)
+        tm.assert_frame_equal(f(row), op(df, row))
+        tm.assert_frame_equal(f(col, axis=0), op(df.T, col).T)
+
+    def test_arith_flex_series(self, simple_frame):
+        df = simple_frame
+
+        row = df.xs("a")
+        col = df["two"]
+        # special case for some reason
+        tm.assert_frame_equal(df.add(row, axis=None), df + row)
+
+        # cases which will be refactored after big arithmetic refactor
+        tm.assert_frame_equal(df.div(row), df / row)
+        tm.assert_frame_equal(df.div(col, axis=0), (df.T / col).T)
+
+    def test_arith_flex_series_broadcasting(self, any_real_numpy_dtype):
+        # broadcasting issue in GH 7325
+        df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype=any_real_numpy_dtype)
+        expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]])
+        if any_real_numpy_dtype == "float32":
+            expected = expected.astype(any_real_numpy_dtype)
+        result = df.div(df[0], axis="index")
+        tm.assert_frame_equal(result, expected)
+
+    def test_arith_flex_zero_len_raises(self):
+        # GH 19522 passing fill_value to frame flex arith methods should
+        # raise even in the zero-length special cases
+        ser_len0 = Series([], dtype=object)
+        df_len0 = DataFrame(columns=["A", "B"])
+        df = DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
+
+        with pytest.raises(NotImplementedError, match="fill_value"):
+            df.add(ser_len0, fill_value="E")
+
+        with pytest.raises(NotImplementedError, match="fill_value"):
+            df_len0.sub(df["A"], axis=None, fill_value=3)
+
+    def test_flex_add_scalar_fill_value(self):
+        # GH#12723
+        dat = np.array([0, 1, np.nan, 3, 4, 5], dtype="float")
+        df = DataFrame({"foo": dat}, index=range(6))
+
+        exp = df.fillna(0).add(2)
+        res = df.add(2, fill_value=0)
+        tm.assert_frame_equal(res, exp)
+
+    def test_sub_alignment_with_duplicate_index(self):
+        # GH#5185 dup aligning operations should work
+        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
+        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
+        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
+        result = df1.sub(df2)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("op", ["__add__", "__mul__", "__sub__", "__truediv__"])
+    def test_arithmetic_with_duplicate_columns(self, op):
+        # operations
+        df = DataFrame({"A": np.arange(10), "B": np.random.default_rng(2).random(10)})
+        expected = getattr(df, op)(df)
+        expected.columns = ["A", "A"]
+        df.columns = ["A", "A"]
+        result = getattr(df, op)(df)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("level", [0, None])
+    def test_broadcast_multiindex(self, level):
+        # GH34388
+        df1 = DataFrame({"A": [0, 1, 2], "B": [1, 2, 3]})
+        df1.columns = df1.columns.set_names("L1")
+
+        df2 = DataFrame({("A", "C"): [0, 0, 0], ("A", "D"): [0, 0, 0]})
+        df2.columns = df2.columns.set_names(["L1", "L2"])
+
+        result = df1.add(df2, level=level)
+        expected = DataFrame({("A", "C"): [0, 1, 2], ("A", "D"): [0, 1, 2]})
+        expected.columns = expected.columns.set_names(["L1", "L2"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_multiindex_operations(self):
+        # GH 43321
+        df = DataFrame(
+            {2010: [1, 2, 3], 2020: [3, 4, 5]},
+            index=MultiIndex.from_product(
+                [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
+            ),
+        )
+
+        series = Series(
+            [0.4],
+            index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
+        )
+
+        expected = DataFrame(
+            {2010: [1.4, 2.4, 3.4], 2020: [3.4, 4.4, 5.4]},
+            index=MultiIndex.from_product(
+                [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
+            ),
+        )
+        result = df.add(series, axis=0)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_multiindex_operations_series_index_to_frame_index(self):
+        # GH 43321
+        df = DataFrame(
+            {2010: [1], 2020: [3]},
+            index=MultiIndex.from_product([["a"], ["b"]], names=["scen", "mod"]),
+        )
+
+        series = Series(
+            [10.0, 20.0, 30.0],
+            index=MultiIndex.from_product(
+                [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
+            ),
+        )
+
+        expected = DataFrame(
+            {2010: [11.0, 21, 31.0], 2020: [13.0, 23.0, 33.0]},
+            index=MultiIndex.from_product(
+                [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
+            ),
+        )
+        result = df.add(series, axis=0)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_multiindex_operations_no_align(self):
+        df = DataFrame(
+            {2010: [1, 2, 3], 2020: [3, 4, 5]},
+            index=MultiIndex.from_product(
+                [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
+            ),
+        )
+
+        series = Series(
+            [0.4],
+            index=MultiIndex.from_product([["c"], ["a"]], names=["mod", "scen"]),
+        )
+
+        expected = DataFrame(
+            {2010: np.nan, 2020: np.nan},
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "b", 2),
+                    ("a", "c", np.nan),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+        result = df.add(series, axis=0)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_multiindex_operations_part_align(self):
+        df = DataFrame(
+            {2010: [1, 2, 3], 2020: [3, 4, 5]},
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "c", 2),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+
+        series = Series(
+            [0.4],
+            index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
+        )
+
+        expected = DataFrame(
+            {2010: [1.4, 2.4, np.nan], 2020: [3.4, 4.4, np.nan]},
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "c", 2),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+        result = df.add(series, axis=0)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_multiindex_operations_part_align_axis1(self):
+        # GH#61009 Test DataFrame-Series arithmetic operation
+        # with partly aligned MultiIndex and axis = 1
+        df = DataFrame(
+            [[1, 2, 3], [3, 4, 5]],
+            index=[2010, 2020],
+            columns=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "c", 2),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+
+        series = Series(
+            [0.4],
+            index=MultiIndex.from_product([["b"], ["a"]], names=["mod", "scen"]),
+        )
+
+        expected = DataFrame(
+            [[1.4, 2.4, np.nan], [3.4, 4.4, np.nan]],
+            index=[2010, 2020],
+            columns=MultiIndex.from_tuples(
+                [
+                    ("a", "b", 0),
+                    ("a", "b", 1),
+                    ("a", "c", 2),
+                ],
+                names=["scen", "mod", "id"],
+            ),
+        )
+        result = df.add(series, axis=1)
+
+        tm.assert_frame_equal(result, expected)
+
+
+class TestFrameArithmetic:
+    def test_td64_op_nat_casting(self):
+        # Make sure we don't accidentally treat timedelta64(NaT) as datetime64
+        #  when calling dispatch_to_series in DataFrame arithmetic
+        ser = Series(["NaT", "NaT"], dtype="timedelta64[ns]")
+        df = DataFrame([[1, 2], [3, 4]])
+
+        result = df * ser
+        expected = DataFrame({0: ser, 1: ser})
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_add_2d_array_rowlike_broadcasts(self):
+        # GH#23000
+        arr = np.arange(6).reshape(3, 2)
+        df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
+
+        rowlike = arr[[1], :]  # shape --> (1, ncols)
+        assert rowlike.shape == (1, df.shape[1])
+
+        expected = DataFrame(
+            [[2, 4], [4, 6], [6, 8]],
+            columns=df.columns,
+            index=df.index,
+            # specify dtype explicitly to avoid failing
+            # on 32bit builds
+            dtype=arr.dtype,
+        )
+        result = df + rowlike
+        tm.assert_frame_equal(result, expected)
+        result = rowlike + df
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_add_2d_array_collike_broadcasts(self):
+        # GH#23000
+        arr = np.arange(6).reshape(3, 2)
+        df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
+
+        collike = arr[:, [1]]  # shape --> (nrows, 1)
+        assert collike.shape == (df.shape[0], 1)
+
+        expected = DataFrame(
+            [[1, 2], [5, 6], [9, 10]],
+            columns=df.columns,
+            index=df.index,
+            # specify dtype explicitly to avoid failing
+            # on 32bit builds
+            dtype=arr.dtype,
+        )
+        result = df + collike
+        tm.assert_frame_equal(result, expected)
+        result = collike + df
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_arith_2d_array_rowlike_broadcasts(
+        self, request, all_arithmetic_operators
+    ):
+        # GH#23000
+        opname = all_arithmetic_operators
+        arr = np.arange(6).reshape(3, 2)
+        df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
+
+        rowlike = arr[[1], :]  # shape --> (1, ncols)
+        assert rowlike.shape == (1, df.shape[1])
+
+        exvals = [
+            getattr(df.loc["A"], opname)(rowlike.squeeze()),
+            getattr(df.loc["B"], opname)(rowlike.squeeze()),
+            getattr(df.loc["C"], opname)(rowlike.squeeze()),
+        ]
+
+        expected = DataFrame(exvals, columns=df.columns, index=df.index)
+
+        result = getattr(df, opname)(rowlike)
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_arith_2d_array_collike_broadcasts(
+        self, request, all_arithmetic_operators
+    ):
+        # GH#23000
+        opname = all_arithmetic_operators
+        arr = np.arange(6).reshape(3, 2)
+        df = DataFrame(arr, columns=[True, False], index=["A", "B", "C"])
+
+        collike = arr[:, [1]]  # shape --> (nrows, 1)
+        assert collike.shape == (df.shape[0], 1)
+
+        exvals = {
+            True: getattr(df[True], opname)(collike.squeeze()),
+            False: getattr(df[False], opname)(collike.squeeze()),
+        }
+
+        dtype = None
+        if opname in ["__rmod__", "__rfloordiv__"]:
+            # Series ops may return mixed int/float dtypes in cases where
+            #   DataFrame op will return all-float.  So we upcast `expected`
+            dtype = np.common_type(*(x.values for x in exvals.values()))
+
+        expected = DataFrame(exvals, columns=df.columns, index=df.index, dtype=dtype)
+
+        result = getattr(df, opname)(collike)
+        tm.assert_frame_equal(result, expected)
+
+    def test_df_bool_mul_int(self):
+        # GH 22047, GH 22163 multiplication by 1 should result in int dtype,
+        # not object dtype
+        df = DataFrame([[False, True], [False, False]])
+        result = df * 1
+
+        # On appveyor this comes back as np.int32 instead of np.int64,
+        # so we check dtype.kind instead of just dtype
+        kinds = result.dtypes.apply(lambda x: x.kind)
+        assert (kinds == "i").all()
+
+        result = 1 * df
+        kinds = result.dtypes.apply(lambda x: x.kind)
+        assert (kinds == "i").all()
+
+    def test_arith_mixed(self):
+        left = DataFrame({"A": ["a", "b", "c"], "B": [1, 2, 3]})
+
+        result = left + left
+        expected = DataFrame({"A": ["aa", "bb", "cc"], "B": [2, 4, 6]})
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("col", ["A", "B"])
+    def test_arith_getitem_commute(self, all_arithmetic_functions, col):
+        df = DataFrame({"A": [1.1, 3.3], "B": [2.5, -3.9]})
+        result = all_arithmetic_functions(df, 1)[col]
+        expected = all_arithmetic_functions(df[col], 1)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "values", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])]
+    )
+    def test_arith_alignment_non_pandas_object(self, values):
+        # GH#17901
+        df = DataFrame({"A": [1, 1], "B": [1, 1]})
+        expected = DataFrame({"A": [2, 2], "B": [3, 3]})
+        result = df + values
+        tm.assert_frame_equal(result, expected)
+
+    def test_arith_non_pandas_object(self):
+        df = DataFrame(
+            np.arange(1, 10, dtype="f8").reshape(3, 3),
+            columns=["one", "two", "three"],
+            index=["a", "b", "c"],
+        )
+
+        val1 = df.xs("a").values
+        added = DataFrame(df.values + val1, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(df + val1, added)
+
+        added = DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(df.add(val1, axis=0), added)
+
+        val2 = list(df["two"])
+
+        added = DataFrame(df.values + val2, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(df + val2, added)
+
+        added = DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(df.add(val2, axis="index"), added)
+
+        val3 = np.random.default_rng(2).random(df.shape)
+        added = DataFrame(df.values + val3, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(df.add(val3), added)
+
+    def test_operations_with_interval_categories_index(self, all_arithmetic_operators):
+        # GH#27415
+        op = all_arithmetic_operators
+        ind = pd.CategoricalIndex(pd.interval_range(start=0.0, end=2.0))
+        data = [1, 2]
+        df = DataFrame([data], columns=ind)
+        num = 10
+        result = getattr(df, op)(num)
+        expected = DataFrame([[getattr(n, op)(num) for n in data]], columns=ind)
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_with_frame_reindex(self):
+        # GH#31623
+        df = DataFrame(
+            {
+                "foo": [pd.Timestamp("2019"), pd.Timestamp("2020")],
+                "bar": [pd.Timestamp("2018"), pd.Timestamp("2021")],
+            },
+            columns=["foo", "bar"],
+            dtype="M8[ns]",
+        )
+        df2 = df[["foo"]]
+
+        result = df - df2
+
+        expected = DataFrame(
+            {"foo": [pd.Timedelta(0), pd.Timedelta(0)], "bar": [np.nan, np.nan]},
+            columns=["bar", "foo"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "value, dtype",
+        [
+            (1, "i8"),
+            (1.0, "f8"),
+            (2**63, "f8"),
+            (1j, "complex128"),
+            (2**63, "complex128"),
+            (True, "bool"),
+            (np.timedelta64(20, "ns"), "<m8[ns]"),
+            (np.datetime64(20, "ns"), "<M8[ns]"),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "op",
+        [
+            operator.add,
+            operator.sub,
+            operator.mul,
+            operator.truediv,
+            operator.mod,
+            operator.pow,
+        ],
+        ids=lambda x: x.__name__,
+    )
+    def test_binop_other(self, op, value, dtype, switch_numexpr_min_elements):
+        skip = {
+            (operator.truediv, "bool"),
+            (operator.pow, "bool"),
+            (operator.add, "bool"),
+            (operator.mul, "bool"),
+        }
+
+        elem = DummyElement(value, dtype)
+        df = DataFrame({"A": [elem.value, elem.value]}, dtype=elem.dtype)
+
+        invalid = {
+            (operator.pow, "<M8[ns]"),
+            (operator.mod, "<M8[ns]"),
+            (operator.truediv, "<M8[ns]"),
+            (operator.mul, "<M8[ns]"),
+            (operator.add, "<M8[ns]"),
+            (operator.pow, "<m8[ns]"),
+            (operator.mul, "<m8[ns]"),
+            (operator.sub, "bool"),
+            (operator.mod, "complex128"),
+        }
+
+        ne = import_optional_dependency("numexpr", errors="ignore")
+        ne_warns_on_op = ne is not None and Version(ne.__version__) < Version("2.13.1")
+        if (op, dtype) in invalid:
+            warn = None
+            if (dtype == "<M8[ns]" and op == operator.add) or (
+                dtype == "<m8[ns]" and op == operator.mul
+            ):
+                msg = None
+            elif dtype == "complex128":
+                msg = "ufunc 'remainder' not supported for the input types"
+            elif op is operator.sub:
+                msg = "numpy boolean subtract, the `-` operator, is "
+                if (
+                    dtype == "bool"
+                    and expr.USE_NUMEXPR
+                    and switch_numexpr_min_elements == 0
+                ):
+                    warn = UserWarning
+            else:
+                msg = (
+                    f"cannot perform __{op.__name__}__ with this "
+                    "index type: (DatetimeArray|TimedeltaArray)"
+                )
+
+            with pytest.raises(TypeError, match=msg):
+                with tm.assert_produces_warning(warn, match="evaluating in Python"):
+                    op(df, elem.value)
+
+        elif (op, dtype) in skip:
+            if op in [operator.add, operator.mul]:
+                if (
+                    expr.USE_NUMEXPR
+                    and switch_numexpr_min_elements == 0
+                    and ne_warns_on_op
+                ):
+                    warn = UserWarning
+                else:
+                    warn = None
+                with tm.assert_produces_warning(warn, match="evaluating in Python"):
+                    op(df, elem.value)
+
+            else:
+                msg = "operator '.*' not implemented for .* dtypes"
+                with pytest.raises(NotImplementedError, match=msg):
+                    op(df, elem.value)
+
+        else:
+            with tm.assert_produces_warning(None):
+                result = op(df, elem.value).dtypes
+                expected = op(df, value).dtypes
+            tm.assert_series_equal(result, expected)
+
+    def test_arithmetic_midx_cols_different_dtypes(self):
+        # GH#49769
+        midx = MultiIndex.from_arrays([Series([1, 2]), Series([3, 4])])
+        midx2 = MultiIndex.from_arrays([Series([1, 2], dtype="Int8"), Series([3, 4])])
+        left = DataFrame([[1, 2], [3, 4]], columns=midx)
+        right = DataFrame([[1, 2], [3, 4]], columns=midx2)
+        result = left - right
+        expected = DataFrame([[0, 0], [0, 0]], columns=midx)
+        tm.assert_frame_equal(result, expected)
+
+    def test_arithmetic_midx_cols_different_dtypes_different_order(self):
+        # GH#49769
+        midx = MultiIndex.from_arrays([Series([1, 2]), Series([3, 4])])
+        midx2 = MultiIndex.from_arrays([Series([2, 1], dtype="Int8"), Series([4, 3])])
+        left = DataFrame([[1, 2], [3, 4]], columns=midx)
+        right = DataFrame([[1, 2], [3, 4]], columns=midx2)
+        result = left - right
+        expected = DataFrame([[-1, 1], [-1, 1]], columns=midx)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_frame_with_zero_len_series_corner_cases():
+    # GH#28600
+    # easy all-float case
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal(6).reshape(3, 2), columns=["A", "B"]
+    )
+    ser = Series(dtype=np.float64)
+
+    result = df + ser
+    expected = DataFrame(df.values * np.nan, columns=df.columns)
+    tm.assert_frame_equal(result, expected)
+
+    with pytest.raises(ValueError, match="not aligned"):
+        # Automatic alignment for comparisons deprecated GH#36795, enforced 2.0
+        df == ser
+
+    # non-float case should not raise TypeError on comparison
+    df2 = DataFrame(df.values.view("M8[ns]"), columns=df.columns)
+    with pytest.raises(ValueError, match="not aligned"):
+        # Automatic alignment for comparisons deprecated
+        df2 == ser
+
+
+def test_zero_len_frame_with_series_corner_cases():
+    # GH#28600
+    df = DataFrame(columns=["A", "B"], dtype=np.float64)
+    ser = Series([1, 2], index=["A", "B"])
+
+    result = df + ser
+    expected = df
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_single_columns_object_sum_axis_1():
+    # GH 13758
+    data = {
+        "One": Series(["A", 1.2, np.nan]),
+    }
+    df = DataFrame(data)
+    result = df.sum(axis=1)
+    expected = Series(["A", 1.2, 0])
+    tm.assert_series_equal(result, expected)
+
+
+# -------------------------------------------------------------------
+# Unsorted
+#  These arithmetic tests were previously in other files, eventually
+#  should be parametrized and put into tests.arithmetic
+
+
+class TestFrameArithmeticUnsorted:
+    def test_frame_add_tz_mismatch_converts_to_utc(self):
+        rng = pd.date_range("1/1/2011", periods=10, freq="h", tz="US/Eastern")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal(len(rng)), index=rng, columns=["a"]
+        )
+
+        df_moscow = df.tz_convert("Europe/Moscow")
+        result = df + df_moscow
+        assert result.index.tz is timezone.utc
+
+        result = df_moscow + df
+        assert result.index.tz is timezone.utc
+
+    def test_align_frame(self):
+        rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y")
+        ts = DataFrame(
+            np.random.default_rng(2).standard_normal((len(rng), 3)), index=rng
+        )
+
+        result = ts + ts[::2]
+        expected = ts + ts
+        expected.iloc[1::2] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+        half = ts[::2]
+        result = ts + half.take(np.random.default_rng(2).permutation(len(half)))
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "op", [operator.add, operator.sub, operator.mul, operator.truediv]
+    )
+    def test_operators_none_as_na(self, op):
+        df = DataFrame(
+            {"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object
+        )
+
+        # since filling converts dtypes from object, changed expected to be
+        # object
+
+        filled = df.fillna(np.nan)
+        result = op(df, 3)
+        expected = op(filled, 3).astype(object)
+        expected[pd.isna(expected)] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+        result = op(df, df)
+        expected = op(filled, filled).astype(object)
+        expected[pd.isna(expected)] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+        result = op(df, df.fillna(7))
+        tm.assert_frame_equal(result, expected)
+
+        result = op(df.fillna(7), df)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("op,res", [("__eq__", False), ("__ne__", True)])
+    # TODO: not sure what's correct here.
+    @pytest.mark.filterwarnings("ignore:elementwise:FutureWarning")
+    def test_logical_typeerror_with_non_valid(self, op, res, float_frame):
+        # we are comparing floats vs a string
+        result = getattr(float_frame, op)("foo")
+        assert bool(result.all().all()) is res
+
+    @pytest.mark.parametrize("op", ["add", "sub", "mul", "div", "truediv"])
+    def test_binary_ops_align(self, op):
+        # test aligning binary ops
+
+        # GH 6681
+        index = MultiIndex.from_product(
+            [list("abc"), ["one", "two", "three"], [1, 2, 3]],
+            names=["first", "second", "third"],
+        )
+
+        df = DataFrame(
+            np.arange(27 * 3).reshape(27, 3),
+            index=index,
+            columns=["value1", "value2", "value3"],
+        ).sort_index()
+
+        idx = pd.IndexSlice
+        opa = getattr(operator, op, None)
+        if opa is None:
+            return
+
+        x = Series([1.0, 10.0, 100.0], [1, 2, 3])
+        result = getattr(df, op)(x, level="third", axis=0)
+
+        expected = pd.concat(
+            [opa(df.loc[idx[:, :, i], :], v) for i, v in x.items()]
+        ).sort_index()
+        tm.assert_frame_equal(result, expected)
+
+        x = Series([1.0, 10.0], ["two", "three"])
+        result = getattr(df, op)(x, level="second", axis=0)
+
+        expected = (
+            pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.items()])
+            .reindex_like(df)
+            .sort_index()
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_binary_ops_align_series_dataframe(self):
+        # GH9463 (alignment level of dataframe with series)
+
+        midx = MultiIndex.from_product([["A", "B"], ["a", "b"]])
+        df = DataFrame(np.ones((2, 4), dtype="int64"), columns=midx)
+        s = Series({"a": 1, "b": 2})
+
+        df2 = df.copy()
+        df2.columns.names = ["lvl0", "lvl1"]
+        s2 = s.copy()
+        s2.index.name = "lvl1"
+
+        # different cases of integer/string level names:
+        res1 = df.mul(s, axis=1, level=1)
+        res2 = df.mul(s2, axis=1, level=1)
+        res3 = df2.mul(s, axis=1, level=1)
+        res4 = df2.mul(s2, axis=1, level=1)
+        res5 = df2.mul(s, axis=1, level="lvl1")
+        res6 = df2.mul(s2, axis=1, level="lvl1")
+
+        exp = DataFrame(
+            np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype="int64"), columns=midx
+        )
+
+        for res in [res1, res2]:
+            tm.assert_frame_equal(res, exp)
+
+        exp.columns.names = ["lvl0", "lvl1"]
+        for res in [res3, res4, res5, res6]:
+            tm.assert_frame_equal(res, exp)
+
+    def test_add_with_dti_mismatched_tzs(self):
+        base = pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="UTC")
+        idx1 = base.tz_convert("Asia/Tokyo")[:2]
+        idx2 = base.tz_convert("US/Eastern")[1:]
+
+        df1 = DataFrame({"A": [1, 2]}, index=idx1)
+        df2 = DataFrame({"A": [1, 1]}, index=idx2)
+        exp = DataFrame({"A": [np.nan, 3, np.nan]}, index=base)
+        tm.assert_frame_equal(df1 + df2, exp)
+
+    def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame):
+        frame_copy = float_frame.reindex(float_frame.index[::2])
+
+        del frame_copy["D"]
+        # adding NAs to first 5 values of column "C"
+        frame_copy.loc[: frame_copy.index[4], "C"] = np.nan
+
+        added = float_frame + frame_copy
+
+        indexer = added["A"].dropna().index
+        exp = (float_frame["A"] * 2).copy()
+
+        tm.assert_series_equal(added["A"].dropna(), exp.loc[indexer])
+
+        exp.loc[~exp.index.isin(indexer)] = np.nan
+        tm.assert_series_equal(added["A"], exp.loc[added["A"].index])
+
+        assert np.isnan(added["C"].reindex(frame_copy.index)[:5]).all()
+
+        # assert(False)
+
+        assert np.isnan(added["D"]).all()
+
+        self_added = float_frame + float_frame
+        tm.assert_index_equal(self_added.index, float_frame.index)
+
+        added_rev = frame_copy + float_frame
+        assert np.isnan(added["D"]).all()
+        assert np.isnan(added_rev["D"]).all()
+
+        # corner cases
+
+        # empty
+        plus_empty = float_frame + DataFrame()
+        assert np.isnan(plus_empty.values).all()
+
+        empty_plus = DataFrame() + float_frame
+        assert np.isnan(empty_plus.values).all()
+
+        empty_empty = DataFrame() + DataFrame()
+        assert empty_empty.empty
+
+        # out of order
+        reverse = float_frame.reindex(columns=float_frame.columns[::-1])
+
+        tm.assert_frame_equal(reverse + float_frame, float_frame * 2)
+
+        # mix vs float64, upcast
+        added = float_frame + mixed_float_frame
+        _check_mixed_float(added, dtype="float64")
+        added = mixed_float_frame + float_frame
+        _check_mixed_float(added, dtype="float64")
+
+        # mix vs mix
+        added = mixed_float_frame + mixed_float_frame
+        _check_mixed_float(added, dtype={"C": None})
+
+        # with int
+        added = float_frame + mixed_int_frame
+        _check_mixed_float(added, dtype="float64")
+
+    def test_combine_series(self, float_frame, mixed_float_frame, mixed_int_frame):
+        # Series
+        series = float_frame.xs(float_frame.index[0])
+
+        added = float_frame + series
+
+        for key, s in added.items():
+            tm.assert_series_equal(s, float_frame[key] + series[key])
+
+        larger_series = series.to_dict()
+        larger_series["E"] = 1
+        larger_series = Series(larger_series)
+        larger_added = float_frame + larger_series
+
+        for key, s in float_frame.items():
+            tm.assert_series_equal(larger_added[key], s + series[key])
+        assert "E" in larger_added
+        assert np.isnan(larger_added["E"]).all()
+
+        # no upcast needed
+        added = mixed_float_frame + series
+        assert np.all(added.dtypes == series.dtype)
+
+        # vs mix (upcast) as needed
+        added = mixed_float_frame + series.astype("float32")
+        _check_mixed_float(added, dtype={"C": None})
+        added = mixed_float_frame + series.astype("float16")
+        _check_mixed_float(added, dtype={"C": None})
+
+        # these used to raise with numexpr as we are adding an int64 to an
+        #  uint64....weird vs int
+        added = mixed_int_frame + (100 * series).astype("int64")
+        _check_mixed_int(
+            added, dtype={"A": "int64", "B": "float64", "C": "int64", "D": "int64"}
+        )
+        added = mixed_int_frame + (100 * series).astype("int32")
+        _check_mixed_int(
+            added, dtype={"A": "int32", "B": "float64", "C": "int32", "D": "int64"}
+        )
+
+    def test_combine_timeseries(self, datetime_frame):
+        # TimeSeries
+        ts = datetime_frame["A"]
+
+        # 10890
+        # we no longer allow auto timeseries broadcasting
+        # and require explicit broadcasting
+        added = datetime_frame.add(ts, axis="index")
+
+        for key, col in datetime_frame.items():
+            result = col + ts
+            tm.assert_series_equal(added[key], result, check_names=False)
+            assert added[key].name == key
+            if col.name == ts.name:
+                assert result.name == "A"
+            else:
+                assert result.name is None
+
+        smaller_frame = datetime_frame[:-5]
+        smaller_added = smaller_frame.add(ts, axis="index")
+
+        tm.assert_index_equal(smaller_added.index, datetime_frame.index)
+
+        smaller_ts = ts[:-5]
+        smaller_added2 = datetime_frame.add(smaller_ts, axis="index")
+        tm.assert_frame_equal(smaller_added, smaller_added2)
+
+        # length 0, result is all-nan
+        result = datetime_frame.add(ts[:0], axis="index")
+        expected = DataFrame(
+            np.nan, index=datetime_frame.index, columns=datetime_frame.columns
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # Frame is all-nan
+        result = datetime_frame[:0].add(ts, axis="index")
+        expected = DataFrame(
+            np.nan, index=datetime_frame.index, columns=datetime_frame.columns
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # empty but with non-empty index
+        frame = datetime_frame[:1].reindex(columns=[])
+        result = frame.mul(ts, axis="index")
+        assert len(result) == len(ts)
+
+    def test_combineFunc(self, float_frame, mixed_float_frame):
+        result = float_frame * 2
+        tm.assert_numpy_array_equal(result.values, float_frame.values * 2)
+
+        # vs mix
+        result = mixed_float_frame * 2
+        for c, s in result.items():
+            tm.assert_numpy_array_equal(s.values, mixed_float_frame[c].values * 2)
+        _check_mixed_float(result, dtype={"C": None})
+
+        result = DataFrame() * 2
+        assert result.index.equals(DataFrame().index)
+        assert len(result.columns) == 0
+
+    @pytest.mark.parametrize(
+        "func",
+        [operator.eq, operator.ne, operator.lt, operator.gt, operator.ge, operator.le],
+    )
+    def test_comparisons(self, simple_frame, float_frame, func):
+        df1 = DataFrame(
+            np.random.default_rng(2).standard_normal((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=pd.date_range("2000-01-01", periods=30, freq="B"),
+        )
+        df2 = df1.copy()
+
+        row = simple_frame.xs("a")
+        ndim_5 = np.ones((*df1.shape, 1, 1, 1))
+
+        result = func(df1, df2)
+        tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values))
+
+        msg = (
+            "Unable to coerce to Series/DataFrame, "
+            "dimension must be <= 2: (30, 4, 1, 1, 1)"
+        )
+        with pytest.raises(ValueError, match=re.escape(msg)):
+            func(df1, ndim_5)
+
+        result2 = func(simple_frame, row)
+        tm.assert_numpy_array_equal(
+            result2.values, func(simple_frame.values, row.values)
+        )
+
+        result3 = func(float_frame, 0)
+        tm.assert_numpy_array_equal(result3.values, func(float_frame.values, 0))
+
+        msg = (
+            r"Can only compare identically-labeled \(both index and columns\) "
+            "DataFrame objects"
+        )
+        with pytest.raises(ValueError, match=msg):
+            func(simple_frame, simple_frame[:2])
+
+    def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne):
+        # GH 11565
+        df = DataFrame(
+            {x: {"x": "foo", "y": "bar", "z": "baz"} for x in ["a", "b", "c"]}
+        )
+
+        f = getattr(operator, compare_operators_no_eq_ne)
+        msg = "|".join(
+            [
+                "'[<>]=?' not supported between instances of 'str' and 'int'",
+                "Invalid comparison between dtype=str and int",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            f(df, 0)
+
+    def test_comparison_protected_from_errstate(self):
+        missing_df = DataFrame(
+            np.ones((10, 4), dtype=np.float64),
+            columns=Index(list("ABCD"), dtype=object),
+        )
+        missing_df.loc[missing_df.index[0], "A"] = np.nan
+        with np.errstate(invalid="ignore"):
+            expected = missing_df.values < 0
+        with np.errstate(invalid="raise"):
+            result = (missing_df < 0).values
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_boolean_comparison(self):
+        # GH 4576
+        # boolean comparisons with a tuple/list give unexpected results
+        df = DataFrame(np.arange(6).reshape((3, 2)))
+        b = np.array([2, 2])
+        b_r = np.atleast_2d([2, 2])
+        b_c = b_r.T
+        lst = [2, 2, 2]
+        tup = tuple(lst)
+
+        # gt
+        expected = DataFrame([[False, False], [False, True], [True, True]])
+        result = df > b
+        tm.assert_frame_equal(result, expected)
+
+        result = df.values > b
+        tm.assert_numpy_array_equal(result, expected.values)
+
+        msg1d = "Unable to coerce to Series, length must be 2: given 3"
+        msg2d = "Unable to coerce to DataFrame, shape must be"
+        msg2db = "operands could not be broadcast together with shapes"
+        with pytest.raises(ValueError, match=msg1d):
+            # wrong shape
+            df > lst
+
+        with pytest.raises(ValueError, match=msg1d):
+            # wrong shape
+            df > tup
+
+        # broadcasts like ndarray (GH#23000)
+        result = df > b_r
+        tm.assert_frame_equal(result, expected)
+
+        result = df.values > b_r
+        tm.assert_numpy_array_equal(result, expected.values)
+
+        with pytest.raises(ValueError, match=msg2d):
+            df > b_c
+
+        with pytest.raises(ValueError, match=msg2db):
+            df.values > b_c
+
+        # ==
+        expected = DataFrame([[False, False], [True, False], [False, False]])
+        result = df == b
+        tm.assert_frame_equal(result, expected)
+
+        with pytest.raises(ValueError, match=msg1d):
+            df == lst
+
+        with pytest.raises(ValueError, match=msg1d):
+            df == tup
+
+        # broadcasts like ndarray (GH#23000)
+        result = df == b_r
+        tm.assert_frame_equal(result, expected)
+
+        result = df.values == b_r
+        tm.assert_numpy_array_equal(result, expected.values)
+
+        with pytest.raises(ValueError, match=msg2d):
+            df == b_c
+
+        assert df.values.shape != b_c.shape
+
+        # with alignment
+        df = DataFrame(
+            np.arange(6).reshape((3, 2)), columns=list("AB"), index=list("abc")
+        )
+        expected.index = df.index
+        expected.columns = df.columns
+
+        with pytest.raises(ValueError, match=msg1d):
+            df == lst
+
+        with pytest.raises(ValueError, match=msg1d):
+            df == tup
+
+    def test_inplace_ops_alignment(self):
+        # inplace ops / ops alignment
+        # GH 8511
+
+        columns = list("abcdefg")
+        X_orig = DataFrame(
+            np.arange(10 * len(columns)).reshape(-1, len(columns)),
+            columns=columns,
+            index=range(10),
+        )
+        Z = 100 * X_orig.iloc[:, 1:-1].copy()
+        block1 = list("bedcf")
+        subs = list("bcdef")
+
+        # add
+        X = X_orig.copy()
+        result1 = (X[block1] + Z).reindex(columns=subs)
+
+        X[block1] += Z
+        result2 = X.reindex(columns=subs)
+
+        X = X_orig.copy()
+        result3 = (X[block1] + Z[block1]).reindex(columns=subs)
+
+        X[block1] += Z[block1]
+        result4 = X.reindex(columns=subs)
+
+        tm.assert_frame_equal(result1, result2)
+        tm.assert_frame_equal(result1, result3)
+        tm.assert_frame_equal(result1, result4)
+
+        # sub
+        X = X_orig.copy()
+        result1 = (X[block1] - Z).reindex(columns=subs)
+
+        X[block1] -= Z
+        result2 = X.reindex(columns=subs)
+
+        X = X_orig.copy()
+        result3 = (X[block1] - Z[block1]).reindex(columns=subs)
+
+        X[block1] -= Z[block1]
+        result4 = X.reindex(columns=subs)
+
+        tm.assert_frame_equal(result1, result2)
+        tm.assert_frame_equal(result1, result3)
+        tm.assert_frame_equal(result1, result4)
+
+    def test_inplace_ops_identity(self):
+        # GH 5104
+        # make sure that we are actually changing the object
+        s_orig = Series([1, 2, 3])
+        df_orig = DataFrame(
+            np.random.default_rng(2).integers(0, 5, size=10).reshape(-1, 5)
+        )
+
+        # no dtype change
+        s = s_orig.copy()
+        s2 = s
+        s += 1
+        tm.assert_series_equal(s, s2)
+        tm.assert_series_equal(s_orig + 1, s)
+        assert s is s2
+        assert s._mgr is s2._mgr
+
+        df = df_orig.copy()
+        df2 = df
+        df += 1
+        tm.assert_frame_equal(df, df2)
+        tm.assert_frame_equal(df_orig + 1, df)
+        assert df is df2
+        assert df._mgr is df2._mgr
+
+        # dtype change
+        s = s_orig.copy()
+        s2 = s
+        s += 1.5
+        tm.assert_series_equal(s, s2)
+        tm.assert_series_equal(s_orig + 1.5, s)
+
+        df = df_orig.copy()
+        df2 = df
+        df += 1.5
+        tm.assert_frame_equal(df, df2)
+        tm.assert_frame_equal(df_orig + 1.5, df)
+        assert df is df2
+        assert df._mgr is df2._mgr
+
+        # mixed dtype
+        arr = np.random.default_rng(2).integers(0, 10, size=5)
+        df_orig = DataFrame({"A": arr.copy(), "B": "foo"})
+        df = df_orig.copy()
+        df2 = df
+        df["A"] += 1
+        expected = DataFrame({"A": arr.copy() + 1, "B": "foo"})
+        tm.assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df2, expected)
+        assert df._mgr is df2._mgr
+
+        df = df_orig.copy()
+        df2 = df
+        df["A"] += 1.5
+        expected = DataFrame({"A": arr.copy() + 1.5, "B": "foo"})
+        tm.assert_frame_equal(df, expected)
+        tm.assert_frame_equal(df2, expected)
+        assert df._mgr is df2._mgr
+
+    @pytest.mark.parametrize(
+        "op",
+        [
+            "add",
+            "and",
+            pytest.param(
+                "div",
+                marks=pytest.mark.xfail(
+                    raises=AttributeError, reason="__idiv__ not implemented"
+                ),
+            ),
+            "floordiv",
+            "mod",
+            "mul",
+            "or",
+            "pow",
+            "sub",
+            "truediv",
+            "xor",
+        ],
+    )
+    def test_inplace_ops_identity2(self, op):
+        df = DataFrame({"a": [1.0, 2.0, 3.0], "b": [1, 2, 3]})
+
+        operand = 2
+        if op in ("and", "or", "xor"):
+            # cannot use floats for boolean ops
+            df["a"] = [True, False, True]
+
+        df_copy = df.copy()
+        iop = f"__i{op}__"
+        op = f"__{op}__"
+
+        # no id change and value is correct
+        getattr(df, iop)(operand)
+        expected = getattr(df_copy, op)(operand)
+        tm.assert_frame_equal(df, expected)
+        expected = id(df)
+        assert id(df) == expected
+
+    @pytest.mark.parametrize(
+        "val",
+        [
+            [1, 2, 3],
+            (1, 2, 3),
+            np.array([1, 2, 3], dtype=np.int64),
+            range(1, 4),
+        ],
+    )
+    def test_alignment_non_pandas(self, val):
+        index = ["A", "B", "C"]
+        columns = ["X", "Y", "Z"]
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((3, 3)),
+            index=index,
+            columns=columns,
+        )
+
+        align = DataFrame._align_for_op
+
+        expected = DataFrame({"X": val, "Y": val, "Z": val}, index=df.index)
+        tm.assert_frame_equal(align(df, val, axis=0)[1], expected)
+
+        expected = DataFrame(
+            {"X": [1, 1, 1], "Y": [2, 2, 2], "Z": [3, 3, 3]}, index=df.index
+        )
+        tm.assert_frame_equal(align(df, val, axis=1)[1], expected)
+
+    @pytest.mark.parametrize("val", [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)])
+    def test_alignment_non_pandas_length_mismatch(self, val):
+        index = ["A", "B", "C"]
+        columns = ["X", "Y", "Z"]
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((3, 3)),
+            index=index,
+            columns=columns,
+        )
+
+        align = DataFrame._align_for_op
+        # length mismatch
+        msg = "Unable to coerce to Series, length must be 3: given 2"
+        with pytest.raises(ValueError, match=msg):
+            align(df, val, axis=0)
+
+        with pytest.raises(ValueError, match=msg):
+            align(df, val, axis=1)
+
+    def test_alignment_non_pandas_index_columns(self):
+        index = ["A", "B", "C"]
+        columns = ["X", "Y", "Z"]
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((3, 3)),
+            index=index,
+            columns=columns,
+        )
+
+        align = DataFrame._align_for_op
+        val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+        tm.assert_frame_equal(
+            align(df, val, axis=0)[1],
+            DataFrame(val, index=df.index, columns=df.columns),
+        )
+        tm.assert_frame_equal(
+            align(df, val, axis=1)[1],
+            DataFrame(val, index=df.index, columns=df.columns),
+        )
+
+        # shape mismatch
+        msg = "Unable to coerce to DataFrame, shape must be"
+        val = np.array([[1, 2, 3], [4, 5, 6]])
+        with pytest.raises(ValueError, match=msg):
+            align(df, val, axis=0)
+
+        with pytest.raises(ValueError, match=msg):
+            align(df, val, axis=1)
+
+        val = np.zeros((3, 3, 3))
+        msg = re.escape(
+            "Unable to coerce to Series/DataFrame, dimension must be <= 2: (3, 3, 3)"
+        )
+        with pytest.raises(ValueError, match=msg):
+            align(df, val, axis=0)
+        with pytest.raises(ValueError, match=msg):
+            align(df, val, axis=1)
+
+    def test_no_warning(self, all_arithmetic_operators):
+        df = DataFrame({"A": [0.0, 0.0], "B": [0.0, None]})
+        b = df["B"]
+        with tm.assert_produces_warning(None):
+            getattr(df, all_arithmetic_operators)(b)
+
+    def test_dunder_methods_binary(self, all_arithmetic_operators):
+        # GH#??? frame.__foo__ should only accept one argument
+        df = DataFrame({"A": [0.0, 0.0], "B": [0.0, None]})
+        b = df["B"]
+        with pytest.raises(TypeError, match="takes 2 positional arguments"):
+            getattr(df, all_arithmetic_operators)(b, 0)
+
+    def test_align_int_fill_bug(self):
+        # GH#910
+        X = np.arange(10 * 10, dtype="float64").reshape(10, 10)
+        Y = np.ones((10, 1), dtype=int)
+
+        df1 = DataFrame(X)
+        df1["0.X"] = Y.squeeze()
+
+        df2 = df1.astype(float)
+
+        result = df1 - df1.mean()
+        expected = df2 - df2.mean()
+        tm.assert_frame_equal(result, expected)
+
+
+def test_pow_with_realignment():
+    # GH#32685 pow has special semantics for operating with null values
+    left = DataFrame({"A": [0, 1, 2]})
+    right = DataFrame(index=[0, 1, 2])
+
+    result = left**right
+    expected = DataFrame({"A": [np.nan, 1.0, np.nan]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dataframe_series_extension_dtypes():
+    # https://github.com/pandas-dev/pandas/issues/34311
+    df = DataFrame(
+        np.random.default_rng(2).integers(0, 100, (10, 3)), columns=["a", "b", "c"]
+    )
+    ser = Series([1, 2, 3], index=["a", "b", "c"])
+
+    expected = df.to_numpy("int64") + ser.to_numpy("int64").reshape(-1, 3)
+    expected = DataFrame(expected, columns=df.columns, dtype="Int64")
+
+    df_ea = df.astype("Int64")
+    result = df_ea + ser
+    tm.assert_frame_equal(result, expected)
+    result = df_ea + ser.astype("Int64")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dataframe_blockwise_slicelike():
+    # GH#34367
+    arr = np.random.default_rng(2).integers(0, 1000, (100, 10))
+    df1 = DataFrame(arr)
+    # Explicit cast to float to avoid implicit cast when setting nan
+    df2 = df1.copy().astype({1: "float", 3: "float", 7: "float"})
+    df2.iloc[0, [1, 3, 7]] = np.nan
+
+    # Explicit cast to float to avoid implicit cast when setting nan
+    df3 = df1.copy().astype({5: "float"})
+    df3.iloc[0, [5]] = np.nan
+
+    # Explicit cast to float to avoid implicit cast when setting nan
+    df4 = df1.copy().astype({2: "float", 3: "float", 4: "float"})
+    df4.iloc[0, np.arange(2, 5)] = np.nan
+    # Explicit cast to float to avoid implicit cast when setting nan
+    df5 = df1.copy().astype({4: "float", 5: "float", 6: "float"})
+    df5.iloc[0, np.arange(4, 7)] = np.nan
+
+    for left, right in [(df1, df2), (df2, df3), (df4, df5)]:
+        res = left + right
+
+        expected = DataFrame({i: left[i] + right[i] for i in left.columns})
+        tm.assert_frame_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "df, col_dtype",
+    [
+        (DataFrame([[1.0, 2.0], [4.0, 5.0]], columns=list("ab")), "float64"),
+        (
+            DataFrame([[1.0, "b"], [4.0, "b"]], columns=list("ab")).astype(
+                {"b": object}
+            ),
+            "object",
+        ),
+    ],
+)
+def test_dataframe_operation_with_non_numeric_types(df, col_dtype):
+    # GH #22663
+    expected = DataFrame([[0.0, np.nan], [3.0, np.nan]], columns=list("ab"))
+    expected = expected.astype({"b": col_dtype})
+    result = df + Series([-1.0], index=list("a"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_arith_reindex_with_duplicates():
+    # https://github.com/pandas-dev/pandas/issues/35194
+    df1 = DataFrame(data=[[0]], columns=["second"])
+    df2 = DataFrame(data=[[0, 0, 0]], columns=["first", "second", "second"])
+    result = df1 + df2
+    expected = DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("to_add", [[Series([1, 1])], [Series([1, 1]), Series([1, 1])]])
+def test_arith_list_of_arraylike_raise(to_add):
+    # GH 36702. Raise when trying to add list of array-like to DataFrame
+    df = DataFrame({"x": [1, 2], "y": [1, 2]})
+
+    msg = f"Unable to coerce list of {type(to_add[0])} to Series/DataFrame"
+    with pytest.raises(ValueError, match=msg):
+        df + to_add
+    with pytest.raises(ValueError, match=msg):
+        to_add + df
+
+
+def test_inplace_arithmetic_series_update():
+    # https://github.com/pandas-dev/pandas/issues/36373
+    df = DataFrame({"A": [1, 2, 3]})
+    df_orig = df.copy()
+    series = df["A"]
+    vals = series._values
+
+    series += 1
+    assert series._values is not vals
+    tm.assert_frame_equal(df, df_orig)
+
+
+def test_arithmetic_multiindex_align():
+    """
+    Regression test for: https://github.com/pandas-dev/pandas/issues/33765
+    """
+    df1 = DataFrame(
+        [[1]],
+        index=["a"],
+        columns=MultiIndex.from_product([[0], [1]], names=["a", "b"]),
+    )
+    df2 = DataFrame([[1]], index=["a"], columns=Index([0], name="a"))
+    expected = DataFrame(
+        [[0]],
+        index=["a"],
+        columns=MultiIndex.from_product([[0], [1]], names=["a", "b"]),
+    )
+    result = df1 - df2
+    tm.assert_frame_equal(result, expected)
+
+
+def test_arithmetic_multiindex_column_align():
+    # GH#60498
+    df1 = DataFrame(
+        data=100,
+        columns=MultiIndex.from_product(
+            [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"]
+        ),
+        index=["C1", "C2"],
+    )
+    df2 = DataFrame(
+        data=np.array([[0.1, 0.25], [0.2, 0.45]]),
+        columns=MultiIndex.from_product([["1A", "1B"]], names=["Lev1"]),
+        index=["C1", "C2"],
+    )
+    expected = DataFrame(
+        data=np.array([[10.0, 10.0, 25.0, 25.0], [20.0, 20.0, 45.0, 45.0]]),
+        columns=MultiIndex.from_product(
+            [["1A", "1B"], ["2A", "2B"]], names=["Lev1", "Lev2"]
+        ),
+        index=["C1", "C2"],
+    )
+    result = df1 * df2
+    tm.assert_frame_equal(result, expected)
+
+
+def test_arithmetic_multiindex_column_align_with_fillvalue():
+    # GH#60903
+    df1 = DataFrame(
+        data=[[1.0, 2.0]],
+        columns=MultiIndex.from_tuples([("A", "one"), ("A", "two")]),
+    )
+    df2 = DataFrame(
+        data=[[3.0, 4.0]],
+        columns=MultiIndex.from_tuples([("B", "one"), ("B", "two")]),
+    )
+    expected = DataFrame(
+        data=[[1.0, 2.0, 3.0, 4.0]],
+        columns=MultiIndex.from_tuples(
+            [("A", "one"), ("A", "two"), ("B", "one"), ("B", "two")]
+        ),
+    )
+    result = df1.add(df2, fill_value=0)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_bool_frame_mult_float():
+    # GH 18549
+    df = DataFrame(True, list("ab"), list("cd"))
+    result = df * 1.0
+    expected = DataFrame(np.ones((2, 2)), list("ab"), list("cd"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_sub_nullable_int(any_int_ea_dtype):
+    # GH 32822
+    series1 = Series([1, 2, None], dtype=any_int_ea_dtype)
+    series2 = Series([1, 2, 3], dtype=any_int_ea_dtype)
+    expected = DataFrame([0, 0, None], dtype=any_int_ea_dtype)
+    result = series1.to_frame() - series2.to_frame()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
+)
+def test_frame_op_subclass_nonclass_constructor():
+    # GH#43201 subclass._constructor is a function, not the subclass itself
+
+    class SubclassedSeries(Series):
+        @property
+        def _constructor(self):
+            return SubclassedSeries
+
+        @property
+        def _constructor_expanddim(self):
+            return SubclassedDataFrame
+
+    class SubclassedDataFrame(DataFrame):
+        _metadata = ["my_extra_data"]
+
+        def __init__(self, my_extra_data, *args, **kwargs) -> None:
+            self.my_extra_data = my_extra_data
+            super().__init__(*args, **kwargs)
+
+        @property
+        def _constructor(self):
+            return functools.partial(type(self), self.my_extra_data)
+
+        @property
+        def _constructor_sliced(self):
+            return SubclassedSeries
+
+    sdf = SubclassedDataFrame("some_data", {"A": [1, 2, 3], "B": [4, 5, 6]})
+    result = sdf * 2
+    expected = SubclassedDataFrame("some_data", {"A": [2, 4, 6], "B": [8, 10, 12]})
+    tm.assert_frame_equal(result, expected)
+
+    result = sdf + sdf
+    tm.assert_frame_equal(result, expected)
+
+
+def test_enum_column_equality():
+    Cols = Enum("Cols", "col1 col2")
+
+    q1 = DataFrame({Cols.col1: [1, 2, 3]})
+    q2 = DataFrame({Cols.col1: [1, 2, 3]})
+
+    result = q1[Cols.col1] == q2[Cols.col1]
+    expected = Series([True, True, True], name=Cols.col1)
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_mixed_col_index_dtype(string_dtype_no_object):
+    # GH 47382
+    df1 = DataFrame(columns=list("abc"), data=1.0, index=[0])
+    df2 = DataFrame(columns=list("abc"), data=0.0, index=[0])
+    df1.columns = df2.columns.astype(string_dtype_no_object)
+    result = df1 + df2
+    expected = DataFrame(columns=list("abc"), data=1.0, index=[0])
+
+    expected.columns = expected.columns.astype(string_dtype_no_object)
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_arrow_interface.py b/pandas/tests/frame/test_arrow_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcebabb434683027d40e1bfca2febe6e733f18f3
--- /dev/null
+++ b/pandas/tests/frame/test_arrow_interface.py
@@ -0,0 +1,94 @@
+import ctypes
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+pa = pytest.importorskip("pyarrow")
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_arrow_interface(using_infer_string):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    capsule = df.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    table = pa.table(df)
+    string_type = pa.large_string() if using_infer_string else pa.string()
+    expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.table(df, schema=schema)
+    expected = expected.cast(schema)
+    assert table.equals(expected)
+
+
+@td.skip_if_no("pyarrow", min_version="15.0")
+def test_dataframe_to_arrow(using_infer_string):
+    df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    table = pa.RecordBatchReader.from_stream(df).read_all()
+    string_type = pa.large_string() if using_infer_string else pa.string()
+    expected = pa.table({"a": [1, 2, 3], "b": pa.array(["a", "b", "c"], string_type)})
+    assert table.equals(expected)
+
+    schema = pa.schema([("a", pa.int8()), ("b", pa.string())])
+    table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
+    expected = expected.cast(schema)
+    assert table.equals(expected)
+
+
+class ArrowArrayWrapper:
+    def __init__(self, batch):
+        self.array = batch
+
+    def __arrow_c_array__(self, requested_schema=None):
+        return self.array.__arrow_c_array__(requested_schema)
+
+
+class ArrowStreamWrapper:
+    def __init__(self, table):
+        self.stream = table
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        return self.stream.__arrow_c_stream__(requested_schema)
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_from_arrow(using_infer_string):
+    # objects with __arrow_c_stream__
+    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    result = pd.DataFrame.from_arrow(table)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+    if not using_infer_string:
+        expected["b"] = expected["b"].astype(pd.StringDtype(na_value=np.nan))
+    tm.assert_frame_equal(result, expected)
+
+    # not only pyarrow object are supported
+    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
+    tm.assert_frame_equal(result, expected)
+
+    # objects with __arrow_c_array__
+    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])
+
+    result = pd.DataFrame.from_arrow(table)
+    tm.assert_frame_equal(result, expected)
+
+    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
+    tm.assert_frame_equal(result, expected)
+
+    # only accept actual Arrow objects
+    with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"):
+        pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]})
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac7438ecf492d950e660196f3bb43152addcde52
--- /dev/null
+++ b/pandas/tests/frame/test_block_internals.py
@@ -0,0 +1,455 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+from io import StringIO
+import itertools
+from textwrap import dedent
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    Series,
+    Timestamp,
+    date_range,
+    option_context,
+)
+import pandas._testing as tm
+from pandas.core.internals.blocks import NumpyBlock
+
+# Segregated collection of methods that require the BlockManager internal data
+# structure
+
+
+class TestDataFrameBlockInternals:
+    def test_setitem_invalidates_datetime_index_freq(self):
+        # GH#24096 altering a datetime64tz column inplace invalidates the
+        #  `freq` attribute on the underlying DatetimeIndex
+
+        dti = date_range("20130101", periods=3, tz="US/Eastern")
+        ts = dti[1]
+
+        df = DataFrame({"B": dti})
+        assert df["B"]._values.freq is None
+
+        df.iloc[1, 0] = pd.NaT
+        assert df["B"]._values.freq is None
+
+        # check that the DatetimeIndex was not altered in place
+        assert dti.freq == "D"
+        assert dti[1] == ts
+
+    def test_cast_internals(self, float_frame):
+        msg = "Passing a BlockManager to DataFrame"
+        with tm.assert_produces_warning(
+            Pandas4Warning, match=msg, check_stacklevel=False
+        ):
+            casted = DataFrame(float_frame._mgr, dtype=int)
+        expected = DataFrame(float_frame._series, dtype=int)
+        tm.assert_frame_equal(casted, expected)
+
+        with tm.assert_produces_warning(
+            Pandas4Warning, match=msg, check_stacklevel=False
+        ):
+            casted = DataFrame(float_frame._mgr, dtype=np.int32)
+        expected = DataFrame(float_frame._series, dtype=np.int32)
+        tm.assert_frame_equal(casted, expected)
+
+    def test_consolidate(self, float_frame):
+        float_frame["E"] = 7.0
+        consolidated = float_frame._consolidate()
+        assert len(consolidated._mgr.blocks) == 1
+
+        # Ensure copy, do I want this?
+        recons = consolidated._consolidate()
+        assert recons is not consolidated
+        tm.assert_frame_equal(recons, consolidated)
+
+        float_frame["F"] = 8.0
+        assert len(float_frame._mgr.blocks) == 3
+
+        return_value = float_frame._consolidate_inplace()
+        assert return_value is None
+        assert len(float_frame._mgr.blocks) == 1
+
+    def test_consolidate_inplace(self, float_frame):
+        # triggers in-place consolidation
+        for letter in range(ord("A"), ord("Z")):
+            float_frame[chr(letter)] = chr(letter)
+
+    def test_modify_values(self, float_frame):
+        with pytest.raises(ValueError, match="read-only"):
+            float_frame.values[5] = 5
+        assert (float_frame.values[5] != 5).all()
+
+    def test_boolean_set_uncons(self, float_frame):
+        float_frame["E"] = 7.0
+
+        expected = float_frame.values.copy()
+        expected[expected > 1] = 2
+
+        float_frame[float_frame > 1] = 2
+        tm.assert_almost_equal(expected, float_frame.values)
+
+    def test_constructor_with_convert(self):
+        # this is actually mostly a test of lib.maybe_convert_objects
+        # #2845
+        df = DataFrame({"A": [2**63 - 1]})
+        result = df["A"]
+        expected = Series(np.asarray([2**63 - 1], np.int64), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [2**63]})
+        result = df["A"]
+        expected = Series(np.asarray([2**63], np.uint64), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [datetime(2005, 1, 1), True]})
+        result = df["A"]
+        expected = Series(
+            np.asarray([datetime(2005, 1, 1), True], np.object_), name="A"
+        )
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [None, 1]})
+        result = df["A"]
+        expected = Series(np.asarray([np.nan, 1], np.float64), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [1.0, 2]})
+        result = df["A"]
+        expected = Series(np.asarray([1.0, 2], np.float64), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [1.0 + 2.0j, 3]})
+        result = df["A"]
+        expected = Series(np.asarray([1.0 + 2.0j, 3], np.complex128), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [1.0 + 2.0j, 3.0]})
+        result = df["A"]
+        expected = Series(np.asarray([1.0 + 2.0j, 3.0], np.complex128), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [1.0 + 2.0j, True]})
+        result = df["A"]
+        expected = Series(np.asarray([1.0 + 2.0j, True], np.object_), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [1.0, None]})
+        result = df["A"]
+        expected = Series(np.asarray([1.0, np.nan], np.float64), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [1.0 + 2.0j, None]})
+        result = df["A"]
+        expected = Series(np.asarray([1.0 + 2.0j, np.nan], np.complex128), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [2.0, 1, True, None]})
+        result = df["A"]
+        expected = Series(np.asarray([2.0, 1, True, None], np.object_), name="A")
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"A": [2.0, 1, datetime(2006, 1, 1), None]})
+        result = df["A"]
+        expected = Series(
+            np.asarray([2.0, 1, datetime(2006, 1, 1), None], np.object_), name="A"
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_construction_with_mixed(self, float_string_frame, using_infer_string):
+        # mixed-type frames
+        float_string_frame["datetime"] = datetime.now()
+        float_string_frame["timedelta"] = timedelta(days=1, seconds=1)
+        assert float_string_frame["datetime"].dtype == "M8[us]"
+        assert float_string_frame["timedelta"].dtype == "m8[us]"
+        result = float_string_frame.dtypes
+        expected = Series(
+            [np.dtype("float64")] * 4
+            + [
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan),
+                np.dtype("datetime64[us]"),
+                np.dtype("timedelta64[us]"),
+            ],
+            index=[*list("ABCD"), "foo", "datetime", "timedelta"],
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_construction_with_conversions(self):
+        # convert from a numpy array of non-ns timedelta64; as of 2.0 this does
+        #  *not* convert
+        arr = np.array([1, 2, 3], dtype="timedelta64[s]")
+        df = DataFrame({"A": arr})
+        expected = DataFrame(
+            {"A": pd.timedelta_range("00:00:01", periods=3, freq="s")}, index=range(3)
+        )
+        tm.assert_numpy_array_equal(df["A"].to_numpy(), arr)
+
+        expected = DataFrame(
+            {
+                "dt1": Timestamp("20130101").as_unit("s"),
+                "dt2": date_range("20130101", periods=3).astype("M8[s]"),
+                # 'dt3' : date_range('20130101 00:00:01',periods=3,freq='s'),
+                # FIXME: don't leave commented-out
+            },
+            index=range(3),
+        )
+        assert expected.dtypes["dt1"] == "M8[s]"
+        assert expected.dtypes["dt2"] == "M8[s]"
+
+        dt1 = np.datetime64("2013-01-01")
+        dt2 = np.array(
+            ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
+        )
+        df = DataFrame({"dt1": dt1, "dt2": dt2})
+
+        # df['dt3'] = np.array(['2013-01-01 00:00:01','2013-01-01
+        # 00:00:02','2013-01-01 00:00:03'],dtype='datetime64[s]')
+        # FIXME: don't leave commented-out
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_compound_dtypes(self):
+        # GH 5191
+        # compound dtypes should raise not-implementederror
+
+        def f(dtype):
+            data = list(itertools.repeat((datetime(2001, 1, 1), "aa", 20), 9))
+            return DataFrame(data=data, columns=["A", "B", "C"], dtype=dtype)
+
+        msg = "compound dtypes are not implemented in the DataFrame constructor"
+        with pytest.raises(NotImplementedError, match=msg):
+            f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
+
+        # pre-2.0 these used to work (though results may be unexpected)
+        with pytest.raises(TypeError, match="argument must be"):
+            f("int64")
+        with pytest.raises(TypeError, match="argument must be"):
+            f("float64")
+
+        # 10822
+        msg = "^Unknown datetime string format, unable to parse: aa$"
+        with pytest.raises(ValueError, match=msg):
+            f("M8[ns]")
+
+    def test_pickle_float_string_frame(self, float_string_frame, temp_file):
+        unpickled = tm.round_trip_pickle(float_string_frame, temp_file)
+        tm.assert_frame_equal(float_string_frame, unpickled)
+
+        # buglet
+        float_string_frame._mgr.ndim
+
+    def test_pickle_empty(self, temp_file):
+        empty_frame = DataFrame()
+        unpickled = tm.round_trip_pickle(empty_frame, temp_file)
+        repr(unpickled)
+
+    def test_pickle_empty_tz_frame(self, timezone_frame, temp_file):
+        unpickled = tm.round_trip_pickle(timezone_frame, temp_file)
+        tm.assert_frame_equal(timezone_frame, unpickled)
+
+    def test_consolidate_datetime64(self):
+        # numpy vstack bug
+
+        df = DataFrame(
+            {
+                "starting": pd.to_datetime(
+                    [
+                        "2012-06-21 00:00",
+                        "2012-06-23 07:00",
+                        "2012-06-23 16:30",
+                        "2012-06-25 08:00",
+                        "2012-06-26 12:00",
+                    ]
+                ),
+                "ending": pd.to_datetime(
+                    [
+                        "2012-06-23 07:00",
+                        "2012-06-23 16:30",
+                        "2012-06-25 08:00",
+                        "2012-06-26 12:00",
+                        "2012-06-27 08:00",
+                    ]
+                ),
+                "measure": [77, 65, 77, 0, 77],
+            }
+        )
+
+        ser_starting = df.starting
+        ser_starting.index = ser_starting.values
+        ser_starting = ser_starting.tz_localize("US/Eastern")
+        ser_starting = ser_starting.tz_convert("UTC")
+        ser_starting.index.name = "starting"
+
+        ser_ending = df.ending
+        ser_ending.index = ser_ending.values
+        ser_ending = ser_ending.tz_localize("US/Eastern")
+        ser_ending = ser_ending.tz_convert("UTC")
+        ser_ending.index.name = "ending"
+
+        df.starting = ser_starting.index
+        df.ending = ser_ending.index
+
+        tm.assert_index_equal(pd.DatetimeIndex(df.starting), ser_starting.index)
+        tm.assert_index_equal(pd.DatetimeIndex(df.ending), ser_ending.index)
+
+    def test_is_mixed_type(self, float_frame, float_string_frame):
+        assert not float_frame._is_mixed_type
+        assert float_string_frame._is_mixed_type
+
+    def test_stale_cached_series_bug_473(self):
+        # this is chained, but ok
+        with option_context("chained_assignment", None):
+            Y = DataFrame(
+                np.random.default_rng(2).random((4, 4)),
+                index=("a", "b", "c", "d"),
+                columns=("e", "f", "g", "h"),
+            )
+            repr(Y)
+            Y["e"] = Y["e"].astype("object")
+            with tm.raises_chained_assignment_error():
+                Y["g"]["c"] = np.nan
+            repr(Y)
+            Y.sum()
+            Y["g"].sum()
+            assert not pd.isna(Y["g"]["c"])
+
+    def test_strange_column_corruption_issue(self, performance_warning):
+        # TODO(wesm): Unclear how exactly this is related to internal matters
+        df = DataFrame(index=[0, 1])
+        df[0] = np.nan
+        wasCol = {}
+
+        with tm.assert_produces_warning(
+            performance_warning, raise_on_extra_warnings=False
+        ):
+            for i, dt in enumerate(df.index):
+                for col in range(100, 200):
+                    if col not in wasCol:
+                        wasCol[col] = 1
+                        df[col] = np.nan
+                    df.loc[dt, col] = i
+
+        myid = 100
+
+        first = len(df.loc[pd.isna(df[myid]), [myid]])
+        second = len(df.loc[pd.isna(df[myid]), [myid]])
+        assert first == second == 0
+
+    def test_constructor_no_pandas_array(self):
+        # Ensure that NumpyExtensionArray isn't allowed inside Series
+        # See https://github.com/pandas-dev/pandas/issues/23995 for more.
+        arr = Series([1, 2, 3]).array
+        result = DataFrame({"A": arr})
+        expected = DataFrame({"A": [1, 2, 3]})
+        tm.assert_frame_equal(result, expected)
+        assert isinstance(result._mgr.blocks[0], NumpyBlock)
+        assert result._mgr.blocks[0].is_numeric
+
+    def test_add_column_with_pandas_array(self):
+        # GH 26390
+        df = DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
+        df["c"] = pd.arrays.NumpyExtensionArray(np.array([1, 2, None, 3], dtype=object))
+        df2 = DataFrame(
+            {
+                "a": [1, 2, 3, 4],
+                "b": ["a", "b", "c", "d"],
+                "c": pd.arrays.NumpyExtensionArray(
+                    np.array([1, 2, None, 3], dtype=object)
+                ),
+            }
+        )
+        assert type(df["c"]._mgr.blocks[0]) == NumpyBlock
+        assert df["c"]._mgr.blocks[0].is_object
+        assert type(df2["c"]._mgr.blocks[0]) == NumpyBlock
+        assert df2["c"]._mgr.blocks[0].is_object
+        tm.assert_frame_equal(df, df2)
+
+
+def test_update_inplace_sets_valid_block_values():
+    # https://github.com/pandas-dev/pandas/issues/33457
+    df = DataFrame({"a": Series([1, 2, None], dtype="category")})
+
+    # inplace update of a single column
+    with tm.raises_chained_assignment_error():
+        df["a"].fillna(1, inplace=True)
+
+    # check we haven't put a Series into any block.values
+    assert isinstance(df._mgr.blocks[0].values, Categorical)
+
+
+def get_longley_data():
+    # From statsmodels.datasets.longley
+    # This specific dataset seems to trigger races in Pandas 3.0.0 more readily
+    # than data frames used elsewhere in the tests
+    longley_csv = StringIO(
+        dedent(
+            """"Obs","GNPDEFL","GNP","UNEMP","ARMED","POP","YEAR"
+            1,83,234289,2356,1590,107608,1947
+            2,88.5,259426,2325,1456,108632,1948
+            3,88.2,258054,3682,1616,109773,1949
+            4,89.5,284599,3351,1650,110929,1950
+            5,96.2,328975,2099,3099,112075,1951
+            6,98.1,346999,1932,3594,113270,1952
+            7,99,365385,1870,3547,115094,1953
+            8,100,363112,3578,3350,116219,1954
+            9,101.2,397469,2904,3048,117388,1955
+            10,104.6,419180,2822,2857,118734,1956
+            11,108.4,442769,2936,2798,120445,1957
+            12,110.8,444546,4681,2637,121950,1958
+            13,112.6,482704,3813,2552,123366,1959
+            14,114.2,502601,3931,2514,125368,1960
+            15,115.7,518173,4806,2572,127852,1961
+            16,116.9,554894,4007,2827,130081,1962
+            """
+        )
+    )
+
+    return pd.read_csv(longley_csv).iloc[:, [1, 2, 3, 4, 5, 6]].astype(float)
+
+
+# See gh-63685, comparisons and copying led to races in statsmodels tests
+#
+# This test spawns a thread pool, so it shouldn't run under xdist.
+# It generates warnings, so it needs warnings to be thread-safe as well
+@td.skip_if_thread_unsafe_warnings
+@pytest.mark.single_cpu
+def test_multithreaded_reading():
+    def numpy_assert(data, b):
+        b.wait()
+        tm.assert_almost_equal((data + 1) - 1, data.copy())
+
+    tm.run_multithreaded(
+        numpy_assert, max_workers=8, arguments=(get_longley_data(),), pass_barrier=True
+    )
+
+    def safe_is_const(s):
+        try:
+            return np.ptp(s) == 0.0 and np.any(s != 0.0)
+        except Exception:
+            return False
+
+    def concat(data, b):
+        b.wait()
+        x = data.copy()
+        nobs = len(x)
+        trendarr = np.fliplr(np.vander(np.arange(1, nobs + 1, dtype=np.float64), 1))
+        x.apply(safe_is_const, 0)
+        trendarr = DataFrame(trendarr, index=x.index, columns=["const"])
+        x = [trendarr, x]
+        x = pd.concat(x[::1], axis=1)
+        tm.assert_frame_equal(x, x)
+
+    tm.run_multithreaded(
+        concat, max_workers=8, arguments=(get_longley_data(),), pass_barrier=True
+    )
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
new file mode 100644
index 0000000000000000000000000000000000000000..2368f75ec06cd76658d6a18776bbda3355ab301d
--- /dev/null
+++ b/pandas/tests/frame/test_constructors.py
@@ -0,0 +1,3376 @@
+import array
+from collections import (
+    OrderedDict,
+    abc,
+    defaultdict,
+    namedtuple,
+)
+from collections.abc import Iterator
+from dataclasses import make_dataclass
+from datetime import (
+    date,
+    datetime,
+    timedelta,
+)
+import functools
+import re
+import zoneinfo
+
+import numpy as np
+from numpy import ma
+from numpy.ma import mrecords
+import pytest
+
+from pandas._libs import lib
+from pandas.compat.numpy import np_version_gt2
+from pandas.errors import IntCastingNaNError
+
+from pandas.core.dtypes.common import is_integer_dtype
+from pandas.core.dtypes.dtypes import (
+    DatetimeTZDtype,
+    IntervalDtype,
+    NumpyEADtype,
+    PeriodDtype,
+)
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    Interval,
+    MultiIndex,
+    Period,
+    RangeIndex,
+    Series,
+    Timedelta,
+    Timestamp,
+    cut,
+    date_range,
+    isna,
+)
+import pandas._testing as tm
+from pandas.arrays import (
+    DatetimeArray,
+    IntervalArray,
+    PeriodArray,
+    SparseArray,
+    TimedeltaArray,
+)
+
+MIXED_FLOAT_DTYPES = ["float16", "float32", "float64"]
+MIXED_INT_DTYPES = [
+    "uint8",
+    "uint16",
+    "uint32",
+    "uint64",
+    "int8",
+    "int16",
+    "int32",
+    "int64",
+]
+
+
+class TestDataFrameConstructors:
+    def test_constructor_from_ndarray_with_str_dtype(self):
+        # If we don't ravel/reshape around ensure_str_array, we end up
+        #  with an array of strings each of which is e.g. "[0 1 2]"
+        arr = np.arange(12).reshape(4, 3)
+        df = DataFrame(arr, dtype=str)
+        expected = DataFrame(arr.astype(str), dtype="str")
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_from_2d_datetimearray(self):
+        dti = date_range("2016-01-01", periods=6, tz="US/Pacific")
+        dta = dti._data.reshape(3, 2)
+
+        df = DataFrame(dta)
+        expected = DataFrame({0: dta[:, 0], 1: dta[:, 1]})
+        tm.assert_frame_equal(df, expected)
+        # GH#44724 big performance hit if we de-consolidate
+        assert len(df._mgr.blocks) == 1
+
+    def test_constructor_dict_with_tzaware_scalar(self):
+        # GH#42505
+        dt = Timestamp("2019-11-03 01:00:00-0700").tz_convert("America/Los_Angeles")
+        dt = dt.as_unit("ns")
+
+        df = DataFrame({"dt": dt}, index=[0])
+        expected = DataFrame({"dt": [dt]})
+        tm.assert_frame_equal(df, expected, check_index_type=False)
+
+        # Non-homogeneous
+        df = DataFrame({"dt": dt, "value": [1]})
+        expected = DataFrame({"dt": [dt], "value": [1]})
+        tm.assert_frame_equal(df, expected)
+
+    def test_construct_ndarray_with_nas_and_int_dtype(self):
+        # GH#26919 match Series by not casting np.nan to meaningless int
+        arr = np.array([[1, np.nan], [2, 3]])
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+        with pytest.raises(IntCastingNaNError, match=msg):
+            DataFrame(arr, dtype="i8")
+
+        # check this matches Series behavior
+        with pytest.raises(IntCastingNaNError, match=msg):
+            Series(arr[0], dtype="i8", name=0)
+
+    def test_construct_from_list_of_datetimes(self):
+        df = DataFrame([datetime.now(), datetime.now()])
+        assert df[0].dtype == np.dtype("M8[us]")
+
+    def test_constructor_from_tzaware_datetimeindex(self):
+        # don't cast a DatetimeIndex WITH a tz, leave as object
+        # GH#6032
+        naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B")
+        idx = naive.tz_localize("US/Pacific")
+
+        expected = Series(np.array(idx.tolist(), dtype="object"), name="B")
+        assert expected.dtype == idx.dtype
+
+        # convert index to series
+        result = Series(idx)
+        tm.assert_series_equal(result, expected)
+
+    def test_columns_with_leading_underscore_work_with_to_dict(self):
+        col_underscore = "_b"
+        df = DataFrame({"a": [1, 2], col_underscore: [3, 4]})
+        d = df.to_dict(orient="records")
+
+        ref_d = [{"a": 1, col_underscore: 3}, {"a": 2, col_underscore: 4}]
+
+        assert ref_d == d
+
+    def test_columns_with_leading_number_and_underscore_work_with_to_dict(self):
+        col_with_num = "1_b"
+        df = DataFrame({"a": [1, 2], col_with_num: [3, 4]})
+        d = df.to_dict(orient="records")
+
+        ref_d = [{"a": 1, col_with_num: 3}, {"a": 2, col_with_num: 4}]
+
+        assert ref_d == d
+
+    def test_array_of_dt64_nat_with_td64dtype_raises(self, frame_or_series):
+        # GH#39462
+        nat = np.datetime64("NaT", "ns")
+        arr = np.array([nat], dtype=object)
+        if frame_or_series is DataFrame:
+            arr = arr.reshape(1, 1)
+
+        msg = "Invalid type for timedelta scalar: <class 'numpy.datetime64'>"
+        with pytest.raises(TypeError, match=msg):
+            frame_or_series(arr, dtype="m8[ns]")
+
+    @pytest.mark.parametrize("kind", ["m", "M"])
+    def test_datetimelike_values_with_object_dtype(self, kind, frame_or_series):
+        # with dtype=object, we should cast dt64 values to Timestamps, not pydatetimes
+        if kind == "M":
+            dtype = "M8[ns]"
+            scalar_type = Timestamp
+        else:
+            dtype = "m8[ns]"
+            scalar_type = Timedelta
+
+        arr = np.arange(6, dtype="i8").view(dtype).reshape(3, 2)
+        if frame_or_series is Series:
+            arr = arr[:, 0]
+
+        obj = frame_or_series(arr, dtype=object)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
+
+        # go through a different path in internals.construction
+        obj = frame_or_series(frame_or_series(arr), dtype=object)
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
+
+        obj = frame_or_series(frame_or_series(arr), dtype=NumpyEADtype(object))
+        assert obj._mgr.blocks[0].values.dtype == object
+        assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
+
+        if frame_or_series is DataFrame:
+            # other paths through internals.construction
+            sers = [Series(x) for x in arr]
+            obj = frame_or_series(sers, dtype=object)
+            assert obj._mgr.blocks[0].values.dtype == object
+            assert isinstance(obj._mgr.blocks[0].values.ravel()[0], scalar_type)
+
+    def test_series_with_name_not_matching_column(self):
+        # GH#9232
+        x = Series(range(5), name=1)
+        y = Series(range(5), name=0)
+
+        result = DataFrame(x, columns=[0])
+        expected = DataFrame([], columns=[0])
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(y, columns=[1])
+        expected = DataFrame([], columns=[1])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "constructor",
+        [
+            lambda: DataFrame(),
+            lambda: DataFrame(None),
+            lambda: DataFrame(()),
+            lambda: DataFrame([]),
+            lambda: DataFrame(_ for _ in []),
+            lambda: DataFrame(range(0)),
+            lambda: DataFrame(data=None),
+            lambda: DataFrame(data=()),
+            lambda: DataFrame(data=[]),
+            lambda: DataFrame(data=(_ for _ in [])),
+            lambda: DataFrame(data=range(0)),
+        ],
+    )
+    def test_empty_constructor(self, constructor):
+        expected = DataFrame()
+        result = constructor()
+        assert len(result.index) == 0
+        assert len(result.columns) == 0
+        tm.assert_frame_equal(result, expected)
+
+    def test_empty_constructor_object_index(self):
+        expected = DataFrame(index=RangeIndex(0), columns=RangeIndex(0))
+        result = DataFrame({})
+        assert len(result.index) == 0
+        assert len(result.columns) == 0
+        tm.assert_frame_equal(result, expected, check_index_type=True)
+
+    @pytest.mark.parametrize(
+        "emptylike,expected_index,expected_columns",
+        [
+            ([[]], RangeIndex(1), RangeIndex(0)),
+            ([[], []], RangeIndex(2), RangeIndex(0)),
+            ([(_ for _ in [])], RangeIndex(1), RangeIndex(0)),
+        ],
+    )
+    def test_emptylike_constructor(self, emptylike, expected_index, expected_columns):
+        expected = DataFrame(index=expected_index, columns=expected_columns)
+        result = DataFrame(emptylike)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_mixed(self, float_string_frame, using_infer_string):
+        dtype = "str" if using_infer_string else np.object_
+        assert float_string_frame["foo"].dtype == dtype
+
+    def test_constructor_cast_failure(self):
+        # as of 2.0, we raise if we can't respect "dtype", previously we
+        #  silently ignored
+        msg = "could not convert string to float"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
+
+        # GH 3010, constructing with odd arrays
+        df = DataFrame(np.ones((4, 2)))
+
+        # this is ok
+        df["foo"] = np.ones((4, 2)).tolist()
+
+        # this is not ok
+        msg = "Expected a 1D array, got an array with shape \\(4, 2\\)"
+        with pytest.raises(ValueError, match=msg):
+            df["test"] = np.ones((4, 2))
+
+        # this is ok
+        df["foo2"] = np.ones((4, 2)).tolist()
+
+    def test_constructor_dtype_copy(self):
+        orig_df = DataFrame({"col1": [1.0], "col2": [2.0], "col3": [3.0]})
+
+        new_df = DataFrame(orig_df, dtype=float, copy=True)
+
+        new_df["col1"] = 200.0
+        assert orig_df["col1"][0] == 1.0
+
+    def test_constructor_dtype_nocast_view_dataframe(self):
+        df = DataFrame([[1, 2]])
+        should_be_view = DataFrame(df, dtype=df[0].dtype)
+        should_be_view.iloc[0, 0] = 99
+        assert df.values[0, 0] == 1
+
+    def test_constructor_dtype_nocast_view_2d_array(self):
+        df = DataFrame([[1, 2], [3, 4]], dtype="int64")
+        df2 = DataFrame(df.values, dtype=df[0].dtype)
+        assert df2._mgr.blocks[0].values.flags.c_contiguous
+
+    def test_1d_object_array_does_not_copy(self, using_infer_string):
+        # https://github.com/pandas-dev/pandas/issues/39272
+        arr = np.array(["a", "b"], dtype="object")
+        df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
+        assert np.shares_memory(df.values, arr)
+
+    def test_2d_object_array_does_not_copy(self, using_infer_string):
+        # https://github.com/pandas-dev/pandas/issues/39272
+        arr = np.array([["a", "b"], ["c", "d"]], dtype="object")
+        df = DataFrame(arr, copy=False)
+        if using_infer_string:
+            if df[0].dtype.storage == "pyarrow":
+                # object dtype strings are converted to arrow memory,
+                # no numpy arrays to compare
+                pass
+            else:
+                assert np.shares_memory(df[0].to_numpy(), arr)
+        else:
+            assert np.shares_memory(df.values, arr)
+
+        df = DataFrame(arr, dtype=object, copy=False)
+        assert np.shares_memory(df.values, arr)
+
+    def test_constructor_dtype_list_data(self):
+        df = DataFrame([[1, "2"], [None, "a"]], dtype=object)
+        assert df.loc[1, 0] is None
+        assert df.loc[0, 1] == "2"
+
+    def test_constructor_list_of_2d_raises(self):
+        # https://github.com/pandas-dev/pandas/issues/32289
+        a = DataFrame()
+        b = np.empty((0, 0))
+        with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"):
+            DataFrame([a])
+
+        with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"):
+            DataFrame([b])
+
+        a = DataFrame({"A": [1, 2]})
+        with pytest.raises(ValueError, match=r"shape=\(2, 2, 1\)"):
+            DataFrame([a, a])
+
+    @pytest.mark.parametrize(
+        "typ, ad",
+        [
+            # mixed floating and integer coexist in the same frame
+            ["float", {}],
+            # add lots of types
+            ["float", {"A": 1, "B": "foo", "C": "bar"}],
+            # GH 622
+            ["int", {}],
+        ],
+    )
+    def test_constructor_mixed_dtypes(self, typ, ad):
+        if typ == "int":
+            dtypes = MIXED_INT_DTYPES
+            arrays = [
+                np.array(np.random.default_rng(2).random(10), dtype=d) for d in dtypes
+            ]
+        elif typ == "float":
+            dtypes = MIXED_FLOAT_DTYPES
+            arrays = [
+                np.array(np.random.default_rng(2).integers(10, size=10), dtype=d)
+                for d in dtypes
+            ]
+
+        for d, a in zip(dtypes, arrays):
+            assert a.dtype == d
+        ad.update(dict(zip(dtypes, arrays)))
+        df = DataFrame(ad)
+
+        dtypes = MIXED_FLOAT_DTYPES + MIXED_INT_DTYPES
+        for d in dtypes:
+            if d in df:
+                assert df.dtypes[d] == d
+
+    def test_constructor_complex_dtypes(self):
+        # GH10952
+        a = np.random.default_rng(2).random(10).astype(np.complex64)
+        b = np.random.default_rng(2).random(10).astype(np.complex128)
+
+        df = DataFrame({"a": a, "b": b})
+        assert a.dtype == df.a.dtype
+        assert b.dtype == df.b.dtype
+
+    def test_constructor_dtype_str_na_values(self, string_dtype):
+        # https://github.com/pandas-dev/pandas/issues/21083
+        df = DataFrame({"A": ["x", None]}, dtype=string_dtype)
+        result = df.isna()
+        expected = DataFrame({"A": [False, True]})
+        tm.assert_frame_equal(result, expected)
+        assert df.iloc[1, 0] is None
+
+        df = DataFrame({"A": ["x", np.nan]}, dtype=string_dtype)
+        assert np.isnan(df.iloc[1, 0])
+
+    def test_constructor_rec(self, float_frame):
+        rec = float_frame.to_records(index=False)
+        rec.dtype.names = list(rec.dtype.names)[::-1]
+
+        index = float_frame.index
+
+        df = DataFrame(rec)
+        tm.assert_index_equal(df.columns, Index(rec.dtype.names))
+
+        df2 = DataFrame(rec, index=index)
+        tm.assert_index_equal(df2.columns, Index(rec.dtype.names))
+        tm.assert_index_equal(df2.index, index)
+
+        # case with columns != the ones we would infer from the data
+        rng = np.arange(len(rec))[::-1]
+        df3 = DataFrame(rec, index=rng, columns=["C", "B"])
+        expected = DataFrame(rec, index=rng).reindex(columns=["C", "B"])
+        tm.assert_frame_equal(df3, expected)
+
+    def test_constructor_bool(self):
+        df = DataFrame({0: np.ones(10, dtype=bool), 1: np.zeros(10, dtype=bool)})
+        assert df.values.dtype == np.bool_
+
+    def test_constructor_overflow_int64(self):
+        # see gh-14881
+        values = np.array([2**64 - i for i in range(1, 10)], dtype=np.uint64)
+
+        result = DataFrame({"a": values})
+        assert result["a"].dtype == np.uint64
+
+        # see gh-2355
+        data_scores = [
+            (6311132704823138710, 273),
+            (2685045978526272070, 23),
+            (8921811264899370420, 45),
+            (17019687244989530680, 270),
+            (9930107427299601010, 273),
+        ]
+        dtype = [("uid", "u8"), ("score", "u8")]
+        data = np.zeros((len(data_scores),), dtype=dtype)
+        data[:] = data_scores
+        df_crawls = DataFrame(data)
+        assert df_crawls["uid"].dtype == np.uint64
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([2**64], dtype=object),
+            np.array([2**65]),
+            [2**64 + 1],
+            np.array([-(2**63) - 4], dtype=object),
+            np.array([-(2**64) - 1]),
+            [-(2**65) - 2],
+        ],
+    )
+    def test_constructor_int_overflow(self, values):
+        # see gh-18584
+        value = values[0]
+        result = DataFrame(values)
+
+        assert result[0].dtype == object
+        assert result[0][0] == value
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([1], dtype=np.uint16),
+            np.array([1], dtype=np.uint32),
+            np.array([1], dtype=np.uint64),
+            [np.uint16(1)],
+            [np.uint32(1)],
+            [np.uint64(1)],
+        ],
+    )
+    def test_constructor_numpy_uints(self, values):
+        # GH#47294
+        value = values[0]
+        result = DataFrame(values)
+
+        assert result[0].dtype == value.dtype
+        assert result[0][0] == value
+
+    def test_constructor_ordereddict(self):
+        nitems = 100
+        nums = list(range(nitems))
+        np.random.default_rng(2).shuffle(nums)
+        expected = [f"A{i:d}" for i in nums]
+        df = DataFrame(OrderedDict(zip(expected, [[0]] * nitems)))
+        assert expected == list(df.columns)
+
+    def test_constructor_dict(self):
+        datetime_series = Series(
+            np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30)
+        )
+        # test expects index shifted by 5
+        datetime_series_short = datetime_series[5:]
+
+        frame = DataFrame({"col1": datetime_series, "col2": datetime_series_short})
+
+        # col2 is padded with NaN
+        assert len(datetime_series) == 30
+        assert len(datetime_series_short) == 25
+
+        tm.assert_series_equal(frame["col1"], datetime_series.rename("col1"))
+
+        exp = Series(
+            np.concatenate([[np.nan] * 5, datetime_series_short.values]),
+            index=datetime_series.index,
+            name="col2",
+        )
+        tm.assert_series_equal(exp, frame["col2"])
+
+        frame = DataFrame(
+            {"col1": datetime_series, "col2": datetime_series_short},
+            columns=["col2", "col3", "col4"],
+        )
+
+        assert len(frame) == len(datetime_series_short)
+        assert "col1" not in frame
+        assert isna(frame["col3"]).all()
+
+        # Corner cases
+        assert len(DataFrame()) == 0
+
+        # mix dict and array, wrong size - no spec for which error should raise
+        # first
+        msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]})
+
+    def test_constructor_dict_length1(self):
+        # Length-one dict micro-optimization
+        frame = DataFrame({"A": {"1": 1, "2": 2}})
+        tm.assert_index_equal(frame.index, Index(["1", "2"]))
+
+    def test_constructor_dict_with_index(self):
+        # empty dict plus index
+        idx = Index([0, 1, 2])
+        frame = DataFrame({}, index=idx)
+        assert frame.index is idx
+
+    def test_constructor_dict_with_index_and_columns(self):
+        # empty dict with index and columns
+        idx = Index([0, 1, 2])
+        frame = DataFrame({}, index=idx, columns=idx)
+        assert frame.index is idx
+        assert frame.columns is idx
+        assert len(frame._series) == 3
+
+    def test_constructor_dict_of_empty_lists(self):
+        # with dict of empty list and Series
+        frame = DataFrame({"A": [], "B": []}, columns=["A", "B"])
+        tm.assert_index_equal(frame.index, RangeIndex(0), exact=True)
+
+    def test_constructor_dict_with_none(self):
+        # GH 14381
+        # Dict with None value
+        frame_none = DataFrame({"a": None}, index=[0])
+        frame_none_list = DataFrame({"a": [None]}, index=[0])
+        assert frame_none._get_value(0, "a") is None
+        assert frame_none_list._get_value(0, "a") is None
+        tm.assert_frame_equal(frame_none, frame_none_list)
+
+    def test_constructor_dict_errors(self):
+        # GH10856
+        # dict with scalar values should raise error, even if columns passed
+        msg = "If using all scalar values, you must pass an index"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"a": 0.7})
+
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"a": 0.7}, columns=["a"])
+
+    @pytest.mark.parametrize("scalar", [2, np.nan, None, "D"])
+    def test_constructor_invalid_items_unused(self, scalar):
+        # No error if invalid (scalar) value is in fact not used:
+        result = DataFrame({"a": scalar}, columns=["b"])
+        expected = DataFrame(columns=["b"])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [4, np.nan, None, float("nan")])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18455
+        cols = [1, value, 3]
+        idx = ["a", value]
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = DataFrame(data).sort_values(1).sort_values("a", axis=1)
+        expected = DataFrame(
+            np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values("a", axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [np.nan, None, float("nan")])
+    def test_constructor_dict_nan_tuple_key(self, value):
+        # GH 18455
+        cols = Index([(11, 21), (value, 22), (13, value)])
+        idx = Index([("a", value), (value, 2)])
+        values = [[0, 3], [1, 4], [2, 5]]
+        data = {cols[c]: Series(values[c], index=idx) for c in range(3)}
+        result = DataFrame(data).sort_values((11, 21)).sort_values(("a", value), axis=1)
+        expected = DataFrame(
+            np.arange(6, dtype="int64").reshape(2, 3), index=idx, columns=cols
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx).sort_values(("a", value), axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(data, index=idx, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_dict_order_insertion(self):
+        datetime_series = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        )
+        datetime_series_short = datetime_series[:5]
+
+        # GH19018
+        # initialization ordering: by insertion order if python>= 3.6
+        d = {"b": datetime_series_short, "a": datetime_series}
+        frame = DataFrame(data=d)
+        expected = DataFrame(data=d, columns=list("ba"))
+        tm.assert_frame_equal(frame, expected)
+
+    def test_constructor_dict_nan_key_and_columns(self):
+        # GH 16894
+        result = DataFrame({np.nan: [1, 2], 2: [2, 3]}, columns=[np.nan, 2])
+        expected = DataFrame([[1, 2], [2, 3]], columns=[np.nan, 2])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("missing_value", [None, np.nan, pd.NA])
+    def test_constructor_list_of_dict_with_str_na_key(
+        self, missing_value, using_infer_string
+    ):
+        # https://github.com/pandas-dev/pandas/issues/63889
+        # preserve values when None key is converted to NaN column name
+        dict_data = [
+            {"colA": 1, missing_value: 2},
+            {"colA": 3, missing_value: 4},
+        ]
+        result = DataFrame(dict_data)
+        expected = DataFrame(
+            [[1, 2], [3, 4]],
+            columns=["colA", np.nan if using_infer_string else missing_value],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("missing_value", [None, np.nan, pd.NA])
+    def test_constructor_dict_of_dict_with_str_na_key(
+        self, missing_value, using_infer_string
+    ):
+        # https://github.com/pandas-dev/pandas/issues/63889
+        dict_data = {"col": {"row1": 1, missing_value: 2, "row3": 3}}
+        result = DataFrame(dict_data)
+        expected = DataFrame(
+            {"col": [1, 2, 3]},
+            index=Index(
+                ["row1", np.nan if using_infer_string else missing_value, "row3"]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_multi_index(self):
+        # GH 4078
+        # construction error with mi and all-nan frame
+        tuples = [(2, 3), (3, 3), (3, 3)]
+        mi = MultiIndex.from_tuples(tuples)
+        df = DataFrame(index=mi, columns=mi)
+        assert isna(df).values.ravel().all()
+
+        tuples = [(3, 3), (2, 3), (3, 3)]
+        mi = MultiIndex.from_tuples(tuples)
+        df = DataFrame(index=mi, columns=mi)
+        assert isna(df).values.ravel().all()
+
+    def test_constructor_2d_index(self):
+        # GH 25416
+        # handling of 2d index in construction
+        df = DataFrame([[1]], columns=[[1]], index=[1, 2])
+        expected = DataFrame(
+            [1, 1],
+            index=Index([1, 2], dtype="int64"),
+            columns=MultiIndex(levels=[[1]], codes=[[0]]),
+        )
+        tm.assert_frame_equal(df, expected)
+
+        df = DataFrame([[1]], columns=[[1]], index=[[1, 2]])
+        expected = DataFrame(
+            [1, 1],
+            index=MultiIndex(levels=[[1, 2]], codes=[[0, 1]]),
+            columns=MultiIndex(levels=[[1]], codes=[[0]]),
+        )
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_error_msgs(self):
+        msg = "Empty data passed with indices specified."
+        # passing an empty array with columns specified.
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(np.empty(0), index=[1])
+
+        msg = "Mixing dicts with non-Series may lead to ambiguous ordering."
+        # mix dict and array, wrong size
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"A": {"a": "a", "b": "b"}, "B": ["a", "b", "c"]})
+
+        # wrong size ndarray, GH 3105
+        msg = r"Shape of passed values is \(4, 3\), indices imply \(3, 3\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(
+                np.arange(12).reshape((4, 3)),
+                columns=["foo", "bar", "baz"],
+                index=date_range("2000-01-01", periods=3),
+            )
+
+        arr = np.array([[4, 5, 6]])
+        msg = r"Shape of passed values is \(1, 3\), indices imply \(1, 4\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(index=[0], columns=range(4), data=arr)
+
+        arr = np.array([4, 5, 6])
+        msg = r"Shape of passed values is \(3, 1\), indices imply \(1, 4\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(index=[0], columns=range(4), data=arr)
+
+        # higher dim raise exception
+        with pytest.raises(ValueError, match="Must pass 2-d input"):
+            DataFrame(np.zeros((3, 3, 3)), columns=["A", "B", "C"], index=[1])
+
+        # wrong size axis labels
+        msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(
+                np.random.default_rng(2).random((2, 3)),
+                columns=["A", "B", "C"],
+                index=[1],
+            )
+
+        msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(
+                np.random.default_rng(2).random((2, 3)),
+                columns=["A", "B"],
+                index=[1, 2],
+            )
+
+        # gh-26429
+        msg = "2 columns passed, passed data had 10 columns"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame((range(10), range(10, 20)), columns=("ones", "twos"))
+
+        msg = "If using all scalar values, you must pass an index"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"a": False, "b": True})
+
+    def test_constructor_subclass_dict(self, dict_subclass):
+        # Test for passing dict subclass to constructor
+        data = {
+            "col1": dict_subclass((x, 10.0 * x) for x in range(10)),
+            "col2": dict_subclass((x, 20.0 * x) for x in range(10)),
+        }
+        df = DataFrame(data)
+        refdf = DataFrame({col: dict(val.items()) for col, val in data.items()})
+        tm.assert_frame_equal(refdf, df)
+
+        data = dict_subclass(data.items())
+        df = DataFrame(data)
+        tm.assert_frame_equal(refdf, df)
+
+    def test_constructor_defaultdict(self, float_frame):
+        # try with defaultdict
+        data = {}
+        float_frame.loc[: float_frame.index[10], "B"] = np.nan
+
+        for k, v in float_frame.items():
+            dct = defaultdict(dict)
+            dct.update(v.to_dict())
+            data[k] = dct
+        frame = DataFrame(data)
+        expected = frame.reindex(index=float_frame.index)
+        tm.assert_frame_equal(float_frame, expected)
+
+    def test_constructor_dict_block(self):
+        expected = np.array([[4.0, 3.0, 2.0, 1.0]])
+        df = DataFrame(
+            {"d": [4.0], "c": [3.0], "b": [2.0], "a": [1.0]},
+            columns=["d", "c", "b", "a"],
+        )
+        tm.assert_numpy_array_equal(df.values, expected)
+
+    def test_constructor_dict_cast(self, using_infer_string):
+        # cast float tests
+        test_data = {"A": {"1": 1, "2": 2}, "B": {"1": "1", "2": "2", "3": "3"}}
+        frame = DataFrame(test_data, dtype=float)
+        assert len(frame) == 3
+        assert frame["B"].dtype == np.float64
+        assert frame["A"].dtype == np.float64
+
+        frame = DataFrame(test_data)
+        assert len(frame) == 3
+        assert frame["B"].dtype == np.object_ if not using_infer_string else "str"
+        assert frame["A"].dtype == np.float64
+
+    def test_constructor_dict_cast2(self):
+        # can't cast to float
+        test_data = {
+            "A": dict(zip(range(20), [f"word_{i}" for i in range(20)])),
+            "B": dict(zip(range(15), np.random.default_rng(2).standard_normal(15))),
+        }
+        with pytest.raises(ValueError, match="could not convert string"):
+            DataFrame(test_data, dtype=float)
+
+    def test_constructor_dict_dont_upcast(self):
+        d = {"Col1": {"Row1": "A String", "Row2": np.nan}}
+        df = DataFrame(d)
+        assert isinstance(df["Col1"]["Row2"], float)
+
+    def test_constructor_dict_dont_upcast2(self):
+        dm = DataFrame([[1, 2], ["a", "b"]], index=[1, 2], columns=[1, 2])
+        assert isinstance(dm[1][1], int)
+
+    def test_constructor_dict_of_tuples(self):
+        # GH #1491
+        data = {"a": (1, 2, 3), "b": (4, 5, 6)}
+
+        result = DataFrame(data)
+        expected = DataFrame({k: list(v) for k, v in data.items()})
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
+    def test_constructor_dict_of_ranges(self):
+        # GH 26356
+        data = {"a": range(3), "b": range(3, 6)}
+
+        result = DataFrame(data)
+        expected = DataFrame({"a": [0, 1, 2], "b": [3, 4, 5]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_dict_of_iterators(self):
+        # GH 26349
+        data = {"a": iter(range(3)), "b": reversed(range(3))}
+
+        result = DataFrame(data)
+        expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_dict_of_generators(self):
+        # GH 26349
+        data = {"a": (i for i in (range(3))), "b": (i for i in reversed(range(3)))}
+        result = DataFrame(data)
+        expected = DataFrame({"a": [0, 1, 2], "b": [2, 1, 0]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_dict_multiindex(self):
+        d = {
+            ("a", "a"): {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2},
+            ("b", "a"): {("i", "i"): 6, ("i", "j"): 5, ("j", "i"): 4},
+            ("b", "c"): {("i", "i"): 7, ("i", "j"): 8, ("j", "i"): 9},
+        }
+        _d = sorted(d.items())
+        df = DataFrame(d)
+        expected = DataFrame(
+            [x[1] for x in _d], index=MultiIndex.from_tuples([x[0] for x in _d])
+        ).T
+        expected.index = MultiIndex.from_tuples(expected.index)
+        tm.assert_frame_equal(
+            df,
+            expected,
+        )
+
+        d["z"] = {"y": 123.0, ("i", "i"): 111, ("i", "j"): 111, ("j", "i"): 111}
+        _d.insert(0, ("z", d["z"]))
+        expected = DataFrame(
+            [x[1] for x in _d], index=Index([x[0] for x in _d], tupleize_cols=False)
+        ).T
+        expected.index = Index(expected.index, tupleize_cols=False)
+        df = DataFrame(d)
+        df = df.reindex(columns=expected.columns, index=expected.index)
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_dict_datetime64_index(self):
+        # GH 10160
+        dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
+
+        def create_data(constructor):
+            return {i: {constructor(s): 2 * i} for i, s in enumerate(dates_as_str)}
+
+        data_datetime64 = create_data(np.datetime64)
+        data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d"))
+        data_Timestamp = create_data(Timestamp)
+
+        expected = DataFrame(
+            [
+                [0, None, None, None],
+                [None, 2, None, None],
+                [None, None, 4, None],
+                [None, None, None, 6],
+            ],
+            index=[Timestamp(dt) for dt in dates_as_str],
+        )
+
+        result_datetime64 = DataFrame(data_datetime64)
+        assert result_datetime64.index.unit == "s"
+        result_datetime64.index = result_datetime64.index.as_unit("us")
+        result_datetime = DataFrame(data_datetime)
+        assert result_datetime.index.unit == "us"
+        result_Timestamp = DataFrame(data_Timestamp)
+        tm.assert_frame_equal(result_datetime64, expected)
+        tm.assert_frame_equal(result_datetime, expected)
+        tm.assert_frame_equal(result_Timestamp, expected)
+
+    @pytest.mark.parametrize(
+        "klass,exp_dtype",
+        [
+            (lambda x: np.timedelta64(x, "D"), "m8[s]"),
+            (lambda x: timedelta(days=x), "m8[us]"),
+            (lambda x: Timedelta(x, "D"), "m8[s]"),
+            (lambda x: Timedelta(x, "D").as_unit("ms"), "m8[ms]"),
+        ],
+    )
+    def test_constructor_dict_timedelta64_index(self, klass, exp_dtype):
+        # GH 10160
+        td_as_int = [1, 2, 3, 4]
+
+        data = {i: {klass(s): 2 * i} for i, s in enumerate(td_as_int)}
+
+        expected = DataFrame(
+            [
+                {0: 0, 1: None, 2: None, 3: None},
+                {0: None, 1: 2, 2: None, 3: None},
+                {0: None, 1: None, 2: 4, 3: None},
+                {0: None, 1: None, 2: None, 3: 6},
+            ],
+            index=[Timedelta(td, "D") for td in td_as_int],
+        )
+        expected.index = expected.index.astype(exp_dtype)
+
+        result = DataFrame(data)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_period_dict(self):
+        # PeriodIndex
+        a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M")
+        b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D")
+        df = DataFrame({"a": a, "b": b})
+        assert df["a"].dtype == a.dtype
+        assert df["b"].dtype == b.dtype
+
+        # list of periods
+        df = DataFrame({"a": a.astype(object).tolist(), "b": b.astype(object).tolist()})
+        assert df["a"].dtype == a.dtype
+        assert df["b"].dtype == b.dtype
+
+    def test_constructor_dict_extension_scalar(self, ea_scalar_and_dtype):
+        ea_scalar, ea_dtype = ea_scalar_and_dtype
+        df = DataFrame({"a": ea_scalar}, index=[0])
+        assert df["a"].dtype == ea_dtype
+
+        expected = DataFrame(index=[0], columns=["a"], data=ea_scalar)
+
+        tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize(
+        "data,dtype",
+        [
+            (Period("2020-01"), PeriodDtype("M")),
+            (Interval(left=0, right=5), IntervalDtype("int64", "right")),
+            (
+                Timestamp("2011-01-01", tz="US/Eastern").as_unit("s"),
+                DatetimeTZDtype(unit="s", tz="US/Eastern"),
+            ),
+        ],
+    )
+    def test_constructor_extension_scalar_data(self, data, dtype):
+        # GH 34832
+        df = DataFrame(index=range(2), columns=["a", "b"], data=data)
+
+        assert df["a"].dtype == dtype
+        assert df["b"].dtype == dtype
+
+        arr = pd.array([data] * 2, dtype=dtype)
+        expected = DataFrame({"a": arr, "b": arr})
+
+        tm.assert_frame_equal(df, expected)
+
+    def test_nested_dict_frame_constructor(self):
+        rng = pd.period_range("1/1/2000", periods=5)
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)), columns=rng)
+
+        data = {}
+        for col in df.columns:
+            for row in df.index:
+                data.setdefault(col, {})[row] = df._get_value(row, col)
+
+        result = DataFrame(data, columns=rng)
+        tm.assert_frame_equal(result, df)
+
+        data = {}
+        for col in df.columns:
+            for row in df.index:
+                data.setdefault(row, {})[col] = df._get_value(row, col)
+
+        result = DataFrame(data, index=rng).T
+        tm.assert_frame_equal(result, df)
+
+    def _check_basic_constructor(self, empty):
+        # mat: 2d matrix with shape (3, 2) to input. empty - makes sized
+        # objects
+        mat = empty((2, 3), dtype=float)
+        # 2-D input
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
+
+        assert len(frame.index) == 2
+        assert len(frame.columns) == 3
+
+        # 1-D input
+        frame = DataFrame(empty((3,)), columns=["A"], index=[1, 2, 3])
+        assert len(frame.index) == 3
+        assert len(frame.columns) == 1
+
+        if empty is not np.ones:
+            msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+            with pytest.raises(IntCastingNaNError, match=msg):
+                DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
+            return
+        else:
+            frame = DataFrame(
+                mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64
+            )
+            assert frame.values.dtype == np.int64
+
+        # wrong size axis labels
+        msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(mat, columns=["A", "B", "C"], index=[1])
+        msg = r"Shape of passed values is \(2, 3\), indices imply \(2, 2\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(mat, columns=["A", "B"], index=[1, 2])
+
+        # higher dim raise exception
+        with pytest.raises(ValueError, match="Must pass 2-d input"):
+            DataFrame(empty((3, 3, 3)), columns=["A", "B", "C"], index=[1])
+
+        # automatic labeling
+        frame = DataFrame(mat)
+        tm.assert_index_equal(frame.index, Index(range(2)), exact=True)
+        tm.assert_index_equal(frame.columns, Index(range(3)), exact=True)
+
+        frame = DataFrame(mat, index=[1, 2])
+        tm.assert_index_equal(frame.columns, Index(range(3)), exact=True)
+
+        frame = DataFrame(mat, columns=["A", "B", "C"])
+        tm.assert_index_equal(frame.index, Index(range(2)), exact=True)
+
+        # 0-length axis
+        frame = DataFrame(empty((0, 3)))
+        assert len(frame.index) == 0
+
+        frame = DataFrame(empty((3, 0)))
+        assert len(frame.columns) == 0
+
+    def test_constructor_ndarray(self):
+        self._check_basic_constructor(np.ones)
+
+        frame = DataFrame(["foo", "bar"], index=[0, 1], columns=["A"])
+        assert len(frame) == 2
+
+    def test_constructor_maskedarray(self):
+        self._check_basic_constructor(ma.masked_all)
+
+        # Check non-masked values
+        mat = ma.masked_all((2, 3), dtype=float)
+        mat[0, 0] = 1.0
+        mat[1, 2] = 2.0
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
+        assert 1.0 == frame["A"][1]
+        assert 2.0 == frame["C"][2]
+
+        # what is this even checking??
+        mat = ma.masked_all((2, 3), dtype=float)
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
+        assert np.all(~np.asarray(frame == frame))
+
+    @pytest.mark.filterwarnings(
+        "ignore:elementwise comparison failed:DeprecationWarning"
+    )
+    def test_constructor_maskedarray_nonfloat(self):
+        # masked int promoted to float
+        mat = ma.masked_all((2, 3), dtype=int)
+        # 2-D input
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
+
+        assert len(frame.index) == 2
+        assert len(frame.columns) == 3
+        assert np.all(~np.asarray(frame == frame))
+
+        # cast type
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.float64)
+        assert frame.values.dtype == np.float64
+
+        # Check non-masked values
+        mat2 = ma.copy(mat)
+        mat2[0, 0] = 1
+        mat2[1, 2] = 2
+        frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2])
+        assert 1 == frame["A"][1]
+        assert 2 == frame["C"][2]
+
+        # masked np.datetime64 stays (use NaT as null)
+        mat = ma.masked_all((2, 3), dtype="M8[ns]")
+        # 2-D input
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
+
+        assert len(frame.index) == 2
+        assert len(frame.columns) == 3
+        assert isna(frame).values.all()
+
+        # cast type
+        msg = r"datetime64\[ns\] values and dtype=int64 is not supported"
+        with pytest.raises(TypeError, match=msg):
+            DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
+
+        # Check non-masked values
+        mat2 = ma.copy(mat)
+        mat2[0, 0] = 1
+        mat2[1, 2] = 2
+        frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2])
+        assert 1 == frame["A"].astype("i8")[1]
+        assert 2 == frame["C"].astype("i8")[2]
+
+        # masked bool promoted to object
+        mat = ma.masked_all((2, 3), dtype=bool)
+        # 2-D input
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2])
+
+        assert len(frame.index) == 2
+        assert len(frame.columns) == 3
+        assert np.all(~np.asarray(frame == frame))
+
+        # cast type
+        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=object)
+        assert frame.values.dtype == object
+
+        # Check non-masked values
+        mat2 = ma.copy(mat)
+        mat2[0, 0] = True
+        mat2[1, 2] = False
+        frame = DataFrame(mat2, columns=["A", "B", "C"], index=[1, 2])
+        assert frame["A"][1] is True
+        assert frame["C"][2] is False
+
+    def test_constructor_maskedarray_hardened(self):
+        # Check numpy masked arrays with hard masks -- from GH24574
+        mat_hard = ma.masked_all((2, 2), dtype=float).harden_mask()
+        result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2])
+        expected = DataFrame(
+            {"A": [np.nan, np.nan], "B": [np.nan, np.nan]},
+            columns=["A", "B"],
+            index=[1, 2],
+            dtype=float,
+        )
+        tm.assert_frame_equal(result, expected)
+        # Check case where mask is hard but no data are masked
+        mat_hard = ma.ones((2, 2), dtype=float).harden_mask()
+        result = DataFrame(mat_hard, columns=["A", "B"], index=[1, 2])
+        expected = DataFrame(
+            {"A": [1.0, 1.0], "B": [1.0, 1.0]},
+            columns=["A", "B"],
+            index=[1, 2],
+            dtype=float,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_maskedrecarray_dtype(self):
+        # Ensure constructor honors dtype
+        data = np.ma.array(
+            np.ma.zeros(5, dtype=[("date", "<f8"), ("price", "<f8")]), mask=[False] * 5
+        )
+        data = data.view(mrecords.mrecarray)
+        with pytest.raises(TypeError, match=r"Pass \{name: data\[name\]"):
+            # Support for MaskedRecords deprecated GH#40363
+            DataFrame(data, dtype=int)
+
+    def test_constructor_corner_shape(self):
+        df = DataFrame(index=[])
+        assert df.values.shape == (0, 0)
+
+    @pytest.mark.parametrize(
+        "data, index, columns, dtype, expected",
+        [
+            (None, list(range(10)), ["a", "b"], object, np.object_),
+            (None, None, ["a", "b"], "int64", np.dtype("int64")),
+            (None, list(range(10)), ["a", "b"], int, np.dtype("float64")),
+            ({}, None, ["foo", "bar"], None, np.object_),
+            ({"b": 1}, list(range(10)), list("abc"), int, np.dtype("float64")),
+        ],
+    )
+    def test_constructor_dtype(self, data, index, columns, dtype, expected):
+        df = DataFrame(data, index, columns, dtype)
+        assert df.values.dtype == expected
+
+    @pytest.mark.parametrize(
+        "data,input_dtype,expected_dtype",
+        (
+            ([True, False, None], "boolean", pd.BooleanDtype),
+            ([1.0, 2.0, None], "Float64", pd.Float64Dtype),
+            ([1, 2, None], "Int64", pd.Int64Dtype),
+            (["a", "b", "c"], "string", pd.StringDtype),
+        ),
+    )
+    def test_constructor_dtype_nullable_extension_arrays(
+        self, data, input_dtype, expected_dtype
+    ):
+        df = DataFrame({"a": data}, dtype=input_dtype)
+        assert df["a"].dtype == expected_dtype()
+
+    def test_constructor_scalar_inference(self, using_infer_string):
+        data = {"int": 1, "bool": True, "float": 3.0, "complex": 4j, "object": "foo"}
+        df = DataFrame(data, index=np.arange(10))
+
+        assert df["int"].dtype == np.int64
+        assert df["bool"].dtype == np.bool_
+        assert df["float"].dtype == np.float64
+        assert df["complex"].dtype == np.complex128
+        assert df["object"].dtype == np.object_ if not using_infer_string else "str"
+
+    def test_constructor_arrays_and_scalars(self):
+        df = DataFrame({"a": np.random.default_rng(2).standard_normal(10), "b": True})
+        exp = DataFrame({"a": df["a"].values, "b": [True] * 10})
+
+        tm.assert_frame_equal(df, exp)
+        with pytest.raises(ValueError, match="must pass an index"):
+            DataFrame({"a": False, "b": True})
+
+    def test_constructor_DataFrame(self, float_frame):
+        df = DataFrame(float_frame)
+        tm.assert_frame_equal(df, float_frame)
+
+        df_casted = DataFrame(float_frame, dtype=np.int64)
+        assert df_casted.values.dtype == np.int64
+
+    def test_constructor_empty_dataframe(self):
+        # GH 20624
+        actual = DataFrame(DataFrame(), dtype="object")
+        expected = DataFrame([], dtype="object")
+        tm.assert_frame_equal(actual, expected)
+
+    def test_constructor_more(self, float_frame):
+        # used to be in test_matrix.py
+        arr = np.random.default_rng(2).standard_normal(10)
+        dm = DataFrame(arr, columns=["A"], index=np.arange(10))
+        assert dm.values.ndim == 2
+
+        arr = np.random.default_rng(2).standard_normal(0)
+        dm = DataFrame(arr)
+        assert dm.values.ndim == 2
+        assert dm.values.ndim == 2
+
+        # no data specified
+        dm = DataFrame(columns=["A", "B"], index=np.arange(10))
+        assert dm.values.shape == (10, 2)
+
+        dm = DataFrame(columns=["A", "B"])
+        assert dm.values.shape == (0, 2)
+
+        dm = DataFrame(index=np.arange(10))
+        assert dm.values.shape == (10, 0)
+
+        # can't cast
+        mat = np.array(["foo", "bar"], dtype=object).reshape(2, 1)
+        msg = "could not convert string to float: 'foo'"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(mat, index=[0, 1], columns=[0], dtype=float)
+
+        dm = DataFrame(DataFrame(float_frame._series))
+        tm.assert_frame_equal(dm, float_frame)
+
+        # int cast
+        dm = DataFrame(
+            {"A": np.ones(10, dtype=int), "B": np.ones(10, dtype=np.float64)},
+            index=np.arange(10),
+        )
+
+        assert len(dm.columns) == 2
+        assert dm.values.dtype == np.float64
+
+    def test_constructor_empty_list(self):
+        df = DataFrame([], index=[])
+        expected = DataFrame(index=[])
+        tm.assert_frame_equal(df, expected)
+
+        # GH 9939
+        df = DataFrame([], columns=["A", "B"])
+        expected = DataFrame({}, columns=["A", "B"])
+        tm.assert_frame_equal(df, expected)
+
+        # Empty generator: list(empty_gen()) == []
+        def empty_gen():
+            yield from ()
+
+        df = DataFrame(empty_gen(), columns=["A", "B"])
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_list_of_lists(self, using_infer_string):
+        # GH #484
+        df = DataFrame(data=[[1, "a"], [2, "b"]], columns=["num", "str"])
+        assert is_integer_dtype(df["num"])
+        assert df["str"].dtype == np.object_ if not using_infer_string else "str"
+
+        # GH 4851
+        # list of 0-dim ndarrays
+        expected = DataFrame(np.arange(10))
+        data = [np.array(x) for x in range(10)]
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+    def test_nested_pandasarray_matches_nested_ndarray(self):
+        # GH#43986
+        ser = Series([1, 2])
+
+        arr = np.array([None, None], dtype=object)
+        arr[0] = ser
+        arr[1] = ser * 2
+
+        df = DataFrame(arr)
+        expected = DataFrame(pd.array(arr))
+        tm.assert_frame_equal(df, expected)
+        assert df.shape == (2, 1)
+        tm.assert_numpy_array_equal(df[0].values, arr)
+
+    def test_constructor_list_like_data_nested_list_column(self):
+        # GH 32173
+        arrays = [list("abcd"), list("cdef")]
+        result = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
+
+        mi = MultiIndex.from_arrays(arrays)
+        expected = DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=mi)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_wrong_length_nested_list_column(self):
+        # GH 32173
+        arrays = [list("abc"), list("cde")]
+
+        msg = "3 columns passed, passed data had 4"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
+
+    def test_constructor_unequal_length_nested_list_column(self):
+        # GH 32173
+        arrays = [list("abcd"), list("cde")]
+
+        # exception raised inside MultiIndex constructor
+        msg = "all arrays must be same length"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([[1, 2, 3, 4], [4, 5, 6, 7]], columns=arrays)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            [[Timestamp("2021-01-01")]],
+            [{"x": Timestamp("2021-01-01")}],
+            {"x": [Timestamp("2021-01-01")]},
+            {"x": Timestamp("2021-01-01")},
+        ],
+    )
+    def test_constructor_one_element_data_list(self, data):
+        # GH#42810
+        result = DataFrame(data, index=range(3), columns=["x"])
+        expected = DataFrame({"x": [Timestamp("2021-01-01")] * 3})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_sequence_like(self):
+        # GH 3783
+        # collections.Sequence like
+
+        class DummyContainer(abc.Sequence):
+            def __init__(self, lst) -> None:
+                self._lst = lst
+
+            def __getitem__(self, n):
+                return self._lst.__getitem__(n)
+
+            def __len__(self) -> int:
+                return self._lst.__len__()
+
+        lst_containers = [DummyContainer([1, "a"]), DummyContainer([2, "b"])]
+        columns = ["num", "str"]
+        result = DataFrame(lst_containers, columns=columns)
+        expected = DataFrame([[1, "a"], [2, "b"]], columns=columns)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
+    def test_constructor_stdlib_array(self):
+        # GH 4297
+        # support Array
+        result = DataFrame({"A": array.array("i", range(10))})
+        expected = DataFrame({"A": list(range(10))})
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
+        expected = DataFrame([list(range(10)), list(range(10))])
+        result = DataFrame([array.array("i", range(10)), array.array("i", range(10))])
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
+    def test_constructor_range(self):
+        # GH26342
+        result = DataFrame(range(10))
+        expected = DataFrame(list(range(10)))
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_ranges(self):
+        result = DataFrame([range(10), range(10)])
+        expected = DataFrame([list(range(10)), list(range(10))])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_iterable(self):
+        # GH 21987
+        class Iter:
+            def __iter__(self) -> Iterator:
+                for i in range(10):
+                    yield [1, 2, 3]
+
+        expected = DataFrame([[1, 2, 3]] * 10)
+        result = DataFrame(Iter())
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_iterator(self):
+        result = DataFrame(iter(range(10)))
+        expected = DataFrame(list(range(10)))
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_iterators(self):
+        result = DataFrame([iter(range(10)), iter(range(10))])
+        expected = DataFrame([list(range(10)), list(range(10))])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_generator(self):
+        # related #2305
+
+        gen1 = (i for i in range(10))
+        gen2 = (i for i in range(10))
+
+        expected = DataFrame([list(range(10)), list(range(10))])
+        result = DataFrame([gen1, gen2])
+        tm.assert_frame_equal(result, expected)
+
+        gen = ([i, "a"] for i in range(10))
+        result = DataFrame(gen)
+        expected = DataFrame({0: range(10), 1: "a"})
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
+    def test_constructor_list_of_dicts(self):
+        result = DataFrame([{}])
+        expected = DataFrame(index=RangeIndex(1), columns=[])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_ordered_dict_nested_preserve_order(self):
+        # see gh-18166
+        nested1 = OrderedDict([("b", 1), ("a", 2)])
+        nested2 = OrderedDict([("b", 2), ("a", 5)])
+        data = OrderedDict([("col2", nested1), ("col1", nested2)])
+        result = DataFrame(data)
+        data = {"col2": [1, 2], "col1": [2, 5]}
+        expected = DataFrame(data=data, index=["b", "a"])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dict_type", [dict, OrderedDict])
+    def test_constructor_ordered_dict_preserve_order(self, dict_type):
+        # see gh-13304
+        expected = DataFrame([[2, 1]], columns=["b", "a"])
+
+        data = dict_type()
+        data["b"] = [2]
+        data["a"] = [1]
+
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+        data = dict_type()
+        data["b"] = 2
+        data["a"] = 1
+
+        result = DataFrame([data])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dict_type", [dict, OrderedDict])
+    def test_constructor_ordered_dict_conflicting_orders(self, dict_type):
+        # the first dict element sets the ordering for the DataFrame,
+        # even if there are conflicting orders from subsequent ones
+        row_one = dict_type()
+        row_one["b"] = 2
+        row_one["a"] = 1
+
+        row_two = dict_type()
+        row_two["a"] = 1
+        row_two["b"] = 2
+
+        row_three = {"b": 2, "a": 1}
+
+        expected = DataFrame([[2, 1], [2, 1]], columns=["b", "a"])
+        result = DataFrame([row_one, row_two])
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame([[2, 1], [2, 1], [2, 1]], columns=["b", "a"])
+        result = DataFrame([row_one, row_two, row_three])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_series_aligned_index(self):
+        series = [Series(i, index=["b", "a", "c"], name=str(i)) for i in range(3)]
+        result = DataFrame(series)
+        expected = DataFrame(
+            {"b": [0, 1, 2], "a": [0, 1, 2], "c": [0, 1, 2]},
+            columns=["b", "a", "c"],
+            index=["0", "1", "2"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_derived_dicts(self):
+        class CustomDict(dict):
+            pass
+
+        d = {"a": 1.5, "b": 3}
+
+        data_custom = [CustomDict(d)]
+        data = [d]
+
+        result_custom = DataFrame(data_custom)
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, result_custom)
+
+    def test_constructor_ragged(self):
+        data = {
+            "A": np.random.default_rng(2).standard_normal(10),
+            "B": np.random.default_rng(2).standard_normal(8),
+        }
+        with pytest.raises(ValueError, match="All arrays must be of the same length"):
+            DataFrame(data)
+
+    def test_constructor_scalar(self):
+        idx = Index(range(3))
+        df = DataFrame({"a": 0}, index=idx)
+        expected = DataFrame({"a": [0, 0, 0]}, index=idx)
+        tm.assert_frame_equal(df, expected, check_dtype=False)
+
+    def test_constructor_Series_copy_bug(self, float_frame):
+        df = DataFrame(float_frame["A"], index=float_frame.index, columns=["A"])
+        df.copy()
+
+    def test_constructor_mixed_dict_and_Series(self):
+        data = {}
+        data["A"] = {"foo": 1, "bar": 2, "baz": 3}
+        data["B"] = Series([4, 3, 2, 1], index=["bar", "qux", "baz", "foo"])
+
+        result = DataFrame(data)
+        assert result.index.is_monotonic_increasing
+
+        # ordering ambiguous, raise exception
+        with pytest.raises(ValueError, match="ambiguous ordering"):
+            DataFrame({"A": ["a", "b"], "B": {"a": "a", "b": "b"}})
+
+        # this is OK though
+        result = DataFrame({"A": ["a", "b"], "B": Series(["a", "b"], index=["a", "b"])})
+        expected = DataFrame({"A": ["a", "b"], "B": ["a", "b"]}, index=["a", "b"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_mixed_type_rows(self):
+        # Issue 25075
+        data = [[1, 2], (3, 4)]
+        result = DataFrame(data)
+        expected = DataFrame([[1, 2], [3, 4]])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "tuples,lists",
+        [
+            ((), []),
+            (((),), [[]]),
+            (((), ()), [(), ()]),
+            (((), ()), [[], []]),
+            (([], []), [[], []]),
+            (([1], [2]), [[1], [2]]),  # GH 32776
+            (([1, 2, 3], [4, 5, 6]), [[1, 2, 3], [4, 5, 6]]),
+        ],
+    )
+    def test_constructor_tuple(self, tuples, lists):
+        # GH 25691
+        result = DataFrame(tuples)
+        expected = DataFrame(lists)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_tuples(self):
+        result = DataFrame({"A": [(1, 2), (3, 4)]})
+        expected = DataFrame({"A": Series([(1, 2), (3, 4)])})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_namedtuples(self):
+        # GH11181
+        named_tuple = namedtuple("Pandas", list("ab"))
+        tuples = [named_tuple(1, 3), named_tuple(2, 4)]
+        expected = DataFrame({"a": [1, 2], "b": [3, 4]})
+        result = DataFrame(tuples)
+        tm.assert_frame_equal(result, expected)
+
+        # with columns
+        expected = DataFrame({"y": [1, 2], "z": [3, 4]})
+        result = DataFrame(tuples, columns=["y", "z"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_dataclasses(self):
+        # GH21910
+        Point = make_dataclass("Point", [("x", int), ("y", int)])
+
+        data = [Point(0, 3), Point(1, 3)]
+        expected = DataFrame({"x": [0, 1], "y": [3, 3]})
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_dataclasses_with_varying_types(self):
+        # GH21910
+        # varying types
+        Point = make_dataclass("Point", [("x", int), ("y", int)])
+        HLine = make_dataclass("HLine", [("x0", int), ("x1", int), ("y", int)])
+
+        data = [Point(0, 3), HLine(1, 3, 3)]
+
+        expected = DataFrame(
+            {"x": [0, np.nan], "y": [3, 3], "x0": [np.nan, 1], "x1": [np.nan, 3]}
+        )
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_of_dataclasses_error_thrown(self):
+        # GH21910
+        Point = make_dataclass("Point", [("x", int), ("y", int)])
+
+        # expect TypeError
+        msg = "asdict() should be called on dataclass instances"
+        with pytest.raises(TypeError, match=re.escape(msg)):
+            DataFrame([Point(0, 0), {"x": 1, "y": 0}])
+
+    def test_constructor_list_of_dict_order(self):
+        # GH10056
+        data = [
+            {"First": 1, "Second": 4, "Third": 7, "Fourth": 10},
+            {"Second": 5, "First": 2, "Fourth": 11, "Third": 8},
+            {"Second": 6, "First": 3, "Fourth": 12, "Third": 9, "YYY": 14, "XXX": 13},
+        ]
+        expected = DataFrame(
+            {
+                "First": [1, 2, 3],
+                "Second": [4, 5, 6],
+                "Third": [7, 8, 9],
+                "Fourth": [10, 11, 12],
+                "YYY": [None, None, 14],
+                "XXX": [None, None, 13],
+            }
+        )
+        result = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_Series_named(self):
+        a = Series([1, 2, 3], index=["a", "b", "c"], name="x")
+        df = DataFrame(a)
+        assert df.columns[0] == "x"
+        tm.assert_index_equal(df.index, a.index)
+
+        # ndarray like
+        arr = np.random.default_rng(2).standard_normal(10)
+        s = Series(arr, name="x")
+        df = DataFrame(s)
+        expected = DataFrame({"x": s})
+        tm.assert_frame_equal(df, expected)
+
+        s = Series(arr, index=range(3, 13))
+        df = DataFrame(s)
+        expected = DataFrame({0: s})
+        tm.assert_frame_equal(df, expected, check_column_type=False)
+
+        msg = r"Shape of passed values is \(10, 1\), indices imply \(10, 2\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(s, columns=[1, 2])
+
+        # #2234
+        a = Series([], name="x", dtype=object)
+        df = DataFrame(a)
+        assert df.columns[0] == "x"
+
+        # series with name and w/o
+        s1 = Series(arr, name="x")
+        df = DataFrame([s1, arr]).T
+        expected = DataFrame({"x": s1, "Unnamed 0": arr}, columns=["x", "Unnamed 0"])
+        tm.assert_frame_equal(df, expected)
+
+        # this is a bit non-intuitive here; the series collapse down to arrays
+        df = DataFrame([arr, s1]).T
+        expected = DataFrame({1: s1, 0: arr}, columns=range(2))
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_Series_named_and_columns(self):
+        # GH 9232 validation
+
+        s0 = Series(range(5), name=0)
+        s1 = Series(range(5), name=1)
+
+        # matching name and column gives standard frame
+        tm.assert_frame_equal(DataFrame(s0, columns=[0]), s0.to_frame())
+        tm.assert_frame_equal(DataFrame(s1, columns=[1]), s1.to_frame())
+
+        # non-matching produces empty frame
+        assert DataFrame(s0, columns=[1]).empty
+        assert DataFrame(s1, columns=[0]).empty
+
+    def test_constructor_Series_differently_indexed(self):
+        # name
+        s1 = Series([1, 2, 3], index=["a", "b", "c"], name="x")
+
+        # no name
+        s2 = Series([1, 2, 3], index=["a", "b", "c"])
+
+        other_index = Index(["a", "b"])
+
+        df1 = DataFrame(s1, index=other_index)
+        exp1 = DataFrame(s1.reindex(other_index))
+        assert df1.columns[0] == "x"
+        tm.assert_frame_equal(df1, exp1)
+
+        df2 = DataFrame(s2, index=other_index)
+        exp2 = DataFrame(s2.reindex(other_index))
+        assert df2.columns[0] == 0
+        tm.assert_index_equal(df2.index, other_index)
+        tm.assert_frame_equal(df2, exp2)
+
+    @pytest.mark.parametrize(
+        "name_in1,name_in2,name_in3,name_out",
+        [
+            ("idx", "idx", "idx", "idx"),
+            ("idx", "idx", None, None),
+            ("idx", None, None, None),
+            ("idx1", "idx2", None, None),
+            ("idx1", "idx1", "idx2", None),
+            ("idx1", "idx2", "idx3", None),
+            (None, None, None, None),
+        ],
+    )
+    def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out):
+        # GH13475
+        indices = [
+            Index(["a", "b", "c"], name=name_in1),
+            Index(["b", "c", "d"], name=name_in2),
+            Index(["c", "d", "e"], name=name_in3),
+        ]
+        series = {
+            c: Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"])
+        }
+        result = DataFrame(series)
+
+        exp_ind = Index(["a", "b", "c", "d", "e"], name=name_out)
+        expected = DataFrame(
+            {
+                "x": [0, 1, 2, np.nan, np.nan],
+                "y": [np.nan, 0, 1, 2, np.nan],
+                "z": [np.nan, np.nan, 0, 1, 2],
+            },
+            index=exp_ind,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_manager_resize(self, float_frame):
+        index = list(float_frame.index[:5])
+        columns = list(float_frame.columns[:3])
+
+        msg = "Passing a BlockManager to DataFrame"
+        with tm.assert_produces_warning(
+            DeprecationWarning, match=msg, check_stacklevel=False
+        ):
+            result = DataFrame(float_frame._mgr, index=index, columns=columns)
+        tm.assert_index_equal(result.index, Index(index))
+        tm.assert_index_equal(result.columns, Index(columns))
+
+    def test_constructor_mix_series_nonseries(self, float_frame):
+        df = DataFrame(
+            {"A": float_frame["A"], "B": list(float_frame["B"])}, columns=["A", "B"]
+        )
+        tm.assert_frame_equal(df, float_frame.loc[:, ["A", "B"]])
+
+        msg = "does not match index length"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"A": float_frame["A"], "B": list(float_frame["B"])[:-2]})
+
+    def test_constructor_miscast_na_int_dtype(self):
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+
+        with pytest.raises(IntCastingNaNError, match=msg):
+            DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)
+
+    def test_constructor_column_duplicates(self):
+        # it works! #2079
+        df = DataFrame([[8, 5]], columns=["a", "a"])
+        edf = DataFrame([[8, 5]])
+        edf.columns = ["a", "a"]
+
+        tm.assert_frame_equal(df, edf)
+
+        idf = DataFrame.from_records([(8, 5)], columns=["a", "a"])
+
+        tm.assert_frame_equal(idf, edf)
+
+    def test_constructor_empty_with_string_dtype(self, using_infer_string):
+        # GH 9428
+        expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object)
+        expected_str = DataFrame(
+            index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan)
+        )
+
+        df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str)
+        if using_infer_string:
+            tm.assert_frame_equal(df, expected_str)
+        else:
+            tm.assert_frame_equal(df, expected)
+        df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_)
+        tm.assert_frame_equal(df, expected)
+        df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5")
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_empty_with_string_extension(self, nullable_string_dtype):
+        # GH 34915
+        expected = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
+        df = DataFrame(columns=["c1"], dtype=nullable_string_dtype)
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_single_value(self):
+        # expecting single value upcasting here
+        df = DataFrame(0.0, index=[1, 2, 3], columns=["a", "b", "c"])
+        tm.assert_frame_equal(
+            df, DataFrame(np.zeros(df.shape).astype("float64"), df.index, df.columns)
+        )
+
+        df = DataFrame(0, index=[1, 2, 3], columns=["a", "b", "c"])
+        tm.assert_frame_equal(
+            df, DataFrame(np.zeros(df.shape).astype("int64"), df.index, df.columns)
+        )
+
+        df = DataFrame("a", index=[1, 2], columns=["a", "c"])
+        tm.assert_frame_equal(
+            df,
+            DataFrame(
+                np.array([["a", "a"], ["a", "a"]], dtype=object),
+                index=[1, 2],
+                columns=["a", "c"],
+            ),
+        )
+
+        msg = "DataFrame constructor not properly called!"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame("a", [1, 2])
+        with pytest.raises(ValueError, match=msg):
+            DataFrame("a", columns=["a", "c"])
+
+        msg = "incompatible data and dtype"
+        with pytest.raises(TypeError, match=msg):
+            DataFrame("a", [1, 2], ["a", "c"], float)
+
+    def test_constructor_with_datetimes(self, using_infer_string):
+        intname = np.dtype(int).name
+        floatname = np.dtype(np.float64).name
+        objectname = np.dtype(np.object_).name
+
+        # single item
+        df = DataFrame(
+            {
+                "A": 1,
+                "B": "foo",
+                "C": "bar",
+                "D": Timestamp("20010101").as_unit("s"),
+                "E": datetime(2001, 1, 2, 0, 0),
+            },
+            index=np.arange(10),
+        )
+        result = df.dtypes
+        expected = Series(
+            [np.dtype("int64")]
+            + [
+                np.dtype(objectname)
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan)
+            ]
+            * 2
+            + [np.dtype("M8[s]"), np.dtype("M8[us]")],
+            index=list("ABCDE"),
+        )
+        tm.assert_series_equal(result, expected)
+
+        # check with ndarray construction ndim==0 (e.g. we are passing an ndim 0
+        # ndarray with a dtype specified)
+        df = DataFrame(
+            {
+                "a": 1.0,
+                "b": 2,
+                "c": "foo",
+                floatname: np.array(1.0, dtype=floatname),
+                intname: np.array(1, dtype=intname),
+            },
+            index=np.arange(10),
+        )
+        result = df.dtypes
+        expected = Series(
+            [
+                np.dtype("float64"),
+                np.dtype("int64"),
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan),
+                np.dtype("float64"),
+                np.dtype(intname),
+            ],
+            index=["a", "b", "c", floatname, intname],
+        )
+        tm.assert_series_equal(result, expected)
+
+        # check with ndarray construction ndim>0
+        df = DataFrame(
+            {
+                "a": 1.0,
+                "b": 2,
+                "c": "foo",
+                floatname: np.array([1.0] * 10, dtype=floatname),
+                intname: np.array([1] * 10, dtype=intname),
+            },
+            index=np.arange(10),
+        )
+        result = df.dtypes
+        expected = Series(
+            [
+                np.dtype("float64"),
+                np.dtype("int64"),
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan),
+                np.dtype("float64"),
+                np.dtype(intname),
+            ],
+            index=["a", "b", "c", floatname, intname],
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_with_datetimes1(self):
+        # GH 2809
+        ind = date_range(start="2000-01-01", freq="D", periods=10)
+        datetimes = [ts.to_pydatetime() for ts in ind]
+        datetime_s = Series(datetimes)
+        assert datetime_s.dtype == "M8[us]"
+
+    def test_constructor_with_datetimes2(self):
+        # GH 2810
+        ind = date_range(start="2000-01-01", freq="D", periods=10)
+        datetimes = [ts.to_pydatetime() for ts in ind]
+        dates = [ts.date() for ts in ind]
+        df = DataFrame(datetimes, columns=["datetimes"])
+        df["dates"] = dates
+        result = df.dtypes
+        expected = Series(
+            [np.dtype("datetime64[us]"), np.dtype("object")],
+            index=["datetimes", "dates"],
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_with_datetimes3(self):
+        # GH 7594
+        # don't coerce tz-aware
+        dt = datetime(2012, 1, 1, tzinfo=zoneinfo.ZoneInfo("US/Eastern"))
+
+        df = DataFrame({"End Date": dt}, index=[0])
+        assert df.iat[0, 0] == dt
+        tm.assert_series_equal(
+            df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object)
+        )
+
+        df = DataFrame([{"End Date": dt}])
+        assert df.iat[0, 0] == dt
+        tm.assert_series_equal(
+            df.dtypes, Series({"End Date": "datetime64[us, US/Eastern]"}, dtype=object)
+        )
+
+    def test_constructor_with_datetimes4(self):
+        # tz-aware (UTC and other tz's)
+        # GH 8411
+        dr = date_range("20130101", periods=3)
+        df = DataFrame({"value": dr})
+        assert df.iat[0, 0].tz is None
+        dr = date_range("20130101", periods=3, tz="UTC")
+        df = DataFrame({"value": dr})
+        assert str(df.iat[0, 0].tz) == "UTC"
+        dr = date_range("20130101", periods=3, tz="US/Eastern")
+        df = DataFrame({"value": dr})
+        assert str(df.iat[0, 0].tz) == "US/Eastern"
+
+    def test_constructor_with_datetimes5(self):
+        # GH 7822
+        # preserver an index with a tz on dict construction
+        i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern")
+
+        expected = DataFrame({"a": i.to_series().reset_index(drop=True)})
+        df = DataFrame()
+        df["a"] = i
+        tm.assert_frame_equal(df, expected)
+
+        df = DataFrame({"a": i})
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_with_datetimes6(self):
+        # multiples
+        i = date_range("1/1/2011", periods=5, freq="10s", tz="US/Eastern")
+        i_no_tz = date_range("1/1/2011", periods=5, freq="10s")
+        df = DataFrame({"a": i, "b": i_no_tz})
+        expected = DataFrame({"a": i.to_series().reset_index(drop=True), "b": i_no_tz})
+        tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            np.array([None, None, None, None, datetime.now(), None]),
+            np.array([None, None, datetime.now(), None]),
+            [[np.datetime64("NaT")], [None]],
+            [[np.datetime64("NaT")], [pd.NaT]],
+            [[None], [np.datetime64("NaT")]],
+            [[None], [pd.NaT]],
+            [[pd.NaT], [np.datetime64("NaT")]],
+            [[pd.NaT], [None]],
+        ],
+    )
+    def test_constructor_datetimes_with_nulls(self, arr):
+        # gh-15869, GH#11220
+        result = DataFrame(arr).dtypes
+        unit = "ns"
+        if isinstance(arr, np.ndarray):
+            # inferred from a pydatetime object
+            unit = "us"
+        elif not any(isinstance(x, np.datetime64) for y in arr for x in y):
+            # TODO: this condition is not clear about why we have different behavior
+            unit = "s"
+        expected = Series([np.dtype(f"datetime64[{unit}]")])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("order", ["K", "A", "C", "F"])
+    @pytest.mark.parametrize(
+        "unit",
+        ["M", "D", "h", "m", "s", "ms", "us", "ns"],
+    )
+    def test_constructor_datetimes_non_ns(self, order, unit):
+        dtype = f"datetime64[{unit}]"
+        na = np.array(
+            [
+                ["2015-01-01", "2015-01-02", "2015-01-03"],
+                ["2017-01-01", "2017-01-02", "2017-02-03"],
+            ],
+            dtype=dtype,
+            order=order,
+        )
+        df = DataFrame(na)
+        expected = DataFrame(na.astype("M8[ns]"))
+        if unit in ["M", "D", "h", "m"]:
+            with pytest.raises(TypeError, match="Cannot cast"):
+                expected.astype(dtype)
+
+            # instead the constructor casts to the closest supported reso, i.e. "s"
+            expected = expected.astype("datetime64[s]")
+        else:
+            expected = expected.astype(dtype=dtype)
+
+        tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize("order", ["K", "A", "C", "F"])
+    @pytest.mark.parametrize(
+        "unit",
+        [
+            "D",
+            "h",
+            "m",
+            "s",
+            "ms",
+            "us",
+            "ns",
+        ],
+    )
+    def test_constructor_timedelta_non_ns(self, order, unit):
+        dtype = f"timedelta64[{unit}]"
+        na = np.array(
+            [
+                [np.timedelta64(1, "D"), np.timedelta64(2, "D")],
+                [np.timedelta64(4, "D"), np.timedelta64(5, "D")],
+            ],
+            dtype=dtype,
+            order=order,
+        )
+        df = DataFrame(na)
+        if unit in ["D", "h", "m"]:
+            # we get the nearest supported unit, i.e. "s"
+            exp_unit = "s"
+        else:
+            exp_unit = unit
+        exp_dtype = np.dtype(f"m8[{exp_unit}]")
+        expected = DataFrame(
+            [
+                [Timedelta(1, "D"), Timedelta(2, "D")],
+                [Timedelta(4, "D"), Timedelta(5, "D")],
+            ],
+            dtype=exp_dtype,
+        )
+        # TODO(2.0): ideally we should get the same 'expected' without passing
+        #  dtype=exp_dtype.
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_for_list_with_dtypes(self, using_infer_string):
+        # test list of lists/ndarrays
+        df = DataFrame([np.arange(5) for x in range(5)])
+        result = df.dtypes
+        expected = Series([np.dtype("int")] * 5)
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame([np.array(np.arange(5), dtype="int32") for x in range(5)])
+        result = df.dtypes
+        expected = Series([np.dtype("int32")] * 5)
+        tm.assert_series_equal(result, expected)
+
+        # overflow issue? (we always expected int64 upcasting here)
+        df = DataFrame({"a": [2**31, 2**31 + 1]})
+        assert df.dtypes.iloc[0] == np.dtype("int64")
+
+        # GH #2751 (construction with no index specified), make sure we cast to
+        # platform values
+        df = DataFrame([1, 2])
+        assert df.dtypes.iloc[0] == np.dtype("int64")
+
+        df = DataFrame([1.0, 2.0])
+        assert df.dtypes.iloc[0] == np.dtype("float64")
+
+        df = DataFrame({"a": [1, 2]})
+        assert df.dtypes.iloc[0] == np.dtype("int64")
+
+        df = DataFrame({"a": [1.0, 2.0]})
+        assert df.dtypes.iloc[0] == np.dtype("float64")
+
+        df = DataFrame({"a": 1}, index=range(3))
+        assert df.dtypes.iloc[0] == np.dtype("int64")
+
+        df = DataFrame({"a": 1.0}, index=range(3))
+        assert df.dtypes.iloc[0] == np.dtype("float64")
+
+        # with object list
+        df = DataFrame(
+            {
+                "a": [1, 2, 4, 7],
+                "b": [1.2, 2.3, 5.1, 6.3],
+                "c": list("abcd"),
+                "d": [datetime(2000, 1, 1) for i in range(4)],
+                "e": [1.0, 2, 4.0, 7],
+            }
+        )
+        result = df.dtypes
+        expected = Series(
+            [
+                np.dtype("int64"),
+                np.dtype("float64"),
+                np.dtype("object")
+                if not using_infer_string
+                else pd.StringDtype(na_value=np.nan),
+                np.dtype("datetime64[us]"),
+                np.dtype("float64"),
+            ],
+            index=list("abcde"),
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_frame_copy(self, float_frame):
+        cop = DataFrame(float_frame, copy=True)
+        cop["A"] = 5
+        assert (cop["A"] == 5).all()
+        assert not (float_frame["A"] == 5).all()
+
+    def test_constructor_frame_shallow_copy(self, float_frame):
+        # constructing a DataFrame from DataFrame with copy=False should still
+        # give a "shallow" copy (share data, not attributes)
+        # https://github.com/pandas-dev/pandas/issues/49523
+        orig = float_frame.copy()
+        cop = DataFrame(float_frame)
+        assert cop._mgr is not float_frame._mgr
+        # Overwriting index of copy doesn't change original
+        cop.index = np.arange(len(cop))
+        tm.assert_frame_equal(float_frame, orig)
+
+    def test_constructor_ndarray_copy(self, float_frame):
+        arr = float_frame.values.copy()
+        df = DataFrame(arr)
+
+        arr[5] = 5
+        assert not (df.values[5] == 5).all()
+        df = DataFrame(arr, copy=True)
+        arr[6] = 6
+        assert not (df.values[6] == 6).all()
+
+    def test_constructor_series_copy(self, float_frame):
+        series = float_frame._series
+
+        df = DataFrame({"A": series["A"]}, copy=True)
+        # TODO can be replaced with `df.loc[:, "A"] = 5` after deprecation about
+        # inplace mutation is enforced
+        df.loc[df.index[0] : df.index[-1], "A"] = 5
+
+        assert not (series["A"] == 5).all()
+
+    @pytest.mark.parametrize(
+        "df",
+        [
+            DataFrame([[1, 2, 3], [4, 5, 6]], index=[1, np.nan]),
+            DataFrame([[1, 2, 3], [4, 5, 6]], columns=[1.1, 2.2, np.nan]),
+            DataFrame([[0, 1, 2, 3], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]),
+            DataFrame(
+                [[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1.1, 2.2, np.nan]
+            ),
+            DataFrame([[0.0, 1, 2, 3.0], [4, 5, 6, 7]], columns=[np.nan, 1, 2, 2]),
+        ],
+    )
+    def test_constructor_with_nas(self, df):
+        # GH 5016
+        # na's in indices
+        # GH 21428 (non-unique columns)
+
+        for i in range(len(df.columns)):
+            df.iloc[:, i]
+
+        indexer = np.arange(len(df.columns))[isna(df.columns)]
+
+        # No NaN found -> error
+        if len(indexer) == 0:
+            with pytest.raises(KeyError, match="^nan$"):
+                df.loc[:, np.nan]
+        # single nan should result in Series
+        elif len(indexer) == 1:
+            tm.assert_series_equal(df.iloc[:, indexer[0]], df.loc[:, np.nan])
+        # multiple nans should result in DataFrame
+        else:
+            tm.assert_frame_equal(df.iloc[:, indexer], df.loc[:, np.nan])
+
+    def test_constructor_lists_to_object_dtype(self):
+        # from #1074
+        d = DataFrame({"a": [np.nan, False]})
+        assert d["a"].dtype == np.object_
+        assert not d["a"][1]
+
+    def test_constructor_ndarray_categorical_dtype(self):
+        cat = Categorical(["A", "B", "C"])
+        arr = np.array(cat).reshape(-1, 1)
+        arr = np.broadcast_to(arr, (3, 4))
+
+        result = DataFrame(arr, dtype=cat.dtype)
+
+        expected = DataFrame({0: cat, 1: cat, 2: cat, 3: cat})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_categorical(self):
+        # GH8626
+
+        # dict creation
+        df = DataFrame({"A": list("abc")}, dtype="category")
+        expected = Series(list("abc"), dtype="category", name="A")
+        tm.assert_series_equal(df["A"], expected)
+
+        # to_frame
+        s = Series(list("abc"), dtype="category")
+        result = s.to_frame()
+        expected = Series(list("abc"), dtype="category", name=0)
+        tm.assert_series_equal(result[0], expected)
+        result = s.to_frame(name="foo")
+        expected = Series(list("abc"), dtype="category", name="foo")
+        tm.assert_series_equal(result["foo"], expected)
+
+        # list-like creation
+        df = DataFrame(list("abc"), dtype="category")
+        expected = Series(list("abc"), dtype="category", name=0)
+        tm.assert_series_equal(df[0], expected)
+
+    def test_construct_from_1item_list_of_categorical(self):
+        # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove
+        #  Categorical special case
+        # ndim != 1
+        cat = Categorical(list("abc"))
+        df = DataFrame([cat])
+        expected = DataFrame([cat.astype(object)])
+        tm.assert_frame_equal(df, expected)
+
+    def test_construct_from_list_of_categoricals(self):
+        # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove
+        #  Categorical special case
+
+        df = DataFrame([Categorical(list("abc")), Categorical(list("abd"))])
+        expected = DataFrame([["a", "b", "c"], ["a", "b", "d"]])
+        tm.assert_frame_equal(df, expected)
+
+    def test_from_nested_listlike_mixed_types(self):
+        # pre-2.0 this behaved as DataFrame({0: cat}), in 2.0 we remove
+        #  Categorical special case
+        # mixed
+        df = DataFrame([Categorical(list("abc")), list("def")])
+        expected = DataFrame([["a", "b", "c"], ["d", "e", "f"]])
+        tm.assert_frame_equal(df, expected)
+
+    def test_construct_from_listlikes_mismatched_lengths(self):
+        df = DataFrame([Categorical(list("abc")), Categorical(list("abdefg"))])
+        expected = DataFrame([list("abc"), list("abdefg")])
+        tm.assert_frame_equal(df, expected)
+
+    def test_constructor_categorical_series(self):
+        items = [1, 2, 3, 1]
+        exp = Series(items).astype("category")
+        res = Series(items, dtype="category")
+        tm.assert_series_equal(res, exp)
+
+        items = ["a", "b", "c", "a"]
+        exp = Series(items).astype("category")
+        res = Series(items, dtype="category")
+        tm.assert_series_equal(res, exp)
+
+        # insert into frame with different index
+        # GH 8076
+        index = date_range("20000101", periods=3)
+        expected = Series(
+            Categorical(values=[np.nan, np.nan, np.nan], categories=["a", "b", "c"])
+        )
+        expected.index = index
+
+        expected = DataFrame({"x": expected})
+        df = DataFrame({"x": Series(["a", "b", "c"], dtype="category")}, index=index)
+        tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        tm.ALL_NUMERIC_DTYPES
+        + tm.DATETIME64_DTYPES
+        + tm.TIMEDELTA64_DTYPES
+        + tm.BOOL_DTYPES,
+    )
+    def test_check_dtype_empty_numeric_column(self, dtype):
+        # GH24386: Ensure dtypes are set correctly for an empty DataFrame.
+        # Empty DataFrame is generated via dictionary data with non-overlapping columns.
+        data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype)
+
+        assert data.b.dtype == dtype
+
+    @pytest.mark.parametrize(
+        "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES
+    )
+    def test_check_dtype_empty_string_column(self, dtype):
+        # GH24386: Ensure dtypes are set correctly for an empty DataFrame.
+        # Empty DataFrame is generated via dictionary data with non-overlapping columns.
+        data = DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype)
+        assert data.b.dtype.name == "object"
+
+    def test_to_frame_with_falsey_names(self):
+        # GH 16114
+        result = Series(name=0, dtype=object).to_frame().dtypes
+        expected = Series({0: object})
+        tm.assert_series_equal(result, expected)
+
+        result = DataFrame(Series(name=0, dtype=object)).dtypes
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.arm_slow
+    @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
+    def test_constructor_range_dtype(self, dtype):
+        expected = DataFrame({"A": [0, 1, 2, 3, 4]}, dtype=dtype or "int64")
+
+        # GH 26342
+        result = DataFrame(range(5), columns=["A"], dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        # GH 16804
+        result = DataFrame({"A": range(5)}, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_from_list_subclass(self):
+        # GH21226
+        class List(list):
+            pass
+
+        expected = DataFrame([[1, 2, 3], [4, 5, 6]])
+        result = DataFrame(List([List([1, 2, 3]), List([4, 5, 6])]))
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "extension_arr",
+        [
+            Categorical(list("aabbc")),
+            SparseArray([1, np.nan, np.nan, np.nan]),
+            IntervalArray([Interval(0, 1), Interval(1, 5)]),
+            PeriodArray(pd.period_range(start="1/1/2017", end="1/1/2018", freq="M")),
+        ],
+    )
+    def test_constructor_with_extension_array(self, extension_arr):
+        # GH11363
+        expected = DataFrame(Series(extension_arr))
+        result = DataFrame(extension_arr)
+        tm.assert_frame_equal(result, expected)
+
+    def test_datetime_date_tuple_columns_from_dict(self):
+        # GH 10863
+        v = date.today()
+        tup = v, v
+        result = DataFrame({tup: Series(range(3), index=range(3))}, columns=[tup])
+        expected = DataFrame([0, 1, 2], columns=Index(Series([tup])))
+        tm.assert_frame_equal(result, expected)
+
+    def test_construct_with_two_categoricalindex_series(self):
+        # GH 14600
+        s1 = Series([39, 6, 4], index=CategoricalIndex(["female", "male", "unknown"]))
+        s2 = Series(
+            [2, 152, 2, 242, 150],
+            index=CategoricalIndex(["f", "female", "m", "male", "unknown"]),
+        )
+        result = DataFrame([s1, s2])
+        expected = DataFrame(
+            np.array([[39, 6, 4, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]]),
+            columns=["female", "male", "unknown", "f", "m"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in cast:RuntimeWarning"
+    )
+    def test_constructor_series_nonexact_categoricalindex(self):
+        # GH 42424
+        ser = Series(range(100))
+        ser1 = cut(ser, 10).value_counts().head(5)
+        ser2 = cut(ser, 10).value_counts().tail(5)
+        result = DataFrame({"1": ser1, "2": ser2})
+        index = CategoricalIndex(
+            [
+                Interval(-0.099, 9.9, closed="right"),
+                Interval(9.9, 19.8, closed="right"),
+                Interval(19.8, 29.7, closed="right"),
+                Interval(29.7, 39.6, closed="right"),
+                Interval(39.6, 49.5, closed="right"),
+                Interval(49.5, 59.4, closed="right"),
+                Interval(59.4, 69.3, closed="right"),
+                Interval(69.3, 79.2, closed="right"),
+                Interval(79.2, 89.1, closed="right"),
+                Interval(89.1, 99, closed="right"),
+            ],
+            ordered=True,
+        )
+        expected = DataFrame(
+            {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
+        )
+        tm.assert_frame_equal(expected, result)
+
+    def test_from_M8_structured(self):
+        dates = [(datetime(2012, 9, 9, 0, 0), datetime(2012, 9, 8, 15, 10))]
+        arr = np.array(dates, dtype=[("Date", "M8[us]"), ("Forecasting", "M8[us]")])
+        df = DataFrame(arr)
+
+        assert df["Date"][0] == dates[0][0]
+        assert df["Forecasting"][0] == dates[0][1]
+
+        s = Series(arr["Date"])
+        assert isinstance(s[0], Timestamp)
+        assert s[0] == dates[0][0]
+
+    def test_from_datetime_subclass(self):
+        # GH21142 Verify whether Datetime subclasses are also of dtype datetime
+        class DatetimeSubclass(datetime):
+            pass
+
+        data = DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]})
+        assert data.datetime.dtype == "datetime64[us]"
+
+    def test_with_mismatched_index_length_raises(self):
+        # GH#33437
+        dti = date_range("2016-01-01", periods=3, tz="US/Pacific")
+        msg = "Shape of passed values|Passed arrays should have the same length"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(dti, index=range(4))
+
+    def test_frame_ctor_datetime64_column(self):
+        rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
+        dates = np.asarray(rng)
+
+        df = DataFrame(
+            {"A": np.random.default_rng(2).standard_normal(len(rng)), "B": dates}
+        )
+        assert np.issubdtype(df["B"].dtype, np.dtype("M8[ns]"))
+
+    def test_dataframe_constructor_infer_multiindex(self):
+        index_lists = [["a", "a", "b", "b"], ["x", "y", "x", "y"]]
+
+        multi = DataFrame(
+            np.random.default_rng(2).standard_normal((4, 4)),
+            index=[np.array(x) for x in index_lists],
+        )
+        assert isinstance(multi.index, MultiIndex)
+        assert not isinstance(multi.columns, MultiIndex)
+
+        multi = DataFrame(
+            np.random.default_rng(2).standard_normal((4, 4)), columns=index_lists
+        )
+        assert isinstance(multi.columns, MultiIndex)
+
+    @pytest.mark.parametrize(
+        "input_vals",
+        [
+            ([1, 2]),
+            (["1", "2"]),
+            (list(date_range("1/1/2011", periods=2, freq="h"))),
+            (list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern"))),
+            ([Interval(left=0, right=5)]),
+        ],
+    )
+    def test_constructor_list_str(self, input_vals, string_dtype):
+        # GH#16605
+        # Ensure that data elements are converted to strings when
+        # dtype is str, 'str', or 'U'
+
+        result = DataFrame({"A": input_vals}, dtype=string_dtype)
+        expected = DataFrame({"A": input_vals}).astype({"A": string_dtype})
+        tm.assert_frame_equal(result, expected)
+
+    def test_constructor_list_str_na(self, string_dtype):
+        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
+        expected = DataFrame({"A": ["1.0", "2.0", None]}, dtype=object)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("copy", [False, True])
+    def test_dict_nocopy(
+        self,
+        copy,
+        any_numeric_ea_dtype,
+        any_numpy_dtype,
+    ):
+        a = np.array([1, 2], dtype=any_numpy_dtype)
+        b = np.array([3, 4], dtype=any_numpy_dtype)
+        if b.dtype.kind in ["S", "U"]:
+            # These get cast, making the checks below more cumbersome
+            pytest.skip(f"{b.dtype} get cast, making the checks below more cumbersome")
+
+        c = pd.array([1, 2], dtype=any_numeric_ea_dtype)
+        c_orig = c.copy()
+        df = DataFrame({"a": a, "b": b, "c": c}, copy=copy)
+
+        def get_base(obj):
+            if isinstance(obj, np.ndarray):
+                return obj.base
+            elif isinstance(obj.dtype, np.dtype):
+                # i.e. DatetimeArray, TimedeltaArray
+                return obj._ndarray.base
+            else:
+                raise TypeError
+
+        def check_views(c_only: bool = False):
+            # Check that the underlying data behind df["c"] is still `c`
+            #  after setting with iloc.  Since we don't know which entry in
+            #  df._mgr.blocks corresponds to df["c"], we just check that exactly
+            #  one of these arrays is `c`.  GH#38939
+            assert sum(x.values is c for x in df._mgr.blocks) == 1
+            if c_only:
+                # If we ever stop consolidating in setitem_with_indexer,
+                #  this will become unnecessary.
+                return
+
+            assert (
+                sum(
+                    get_base(x.values) is a
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
+                )
+                == 1
+            )
+            assert (
+                sum(
+                    get_base(x.values) is b
+                    for x in df._mgr.blocks
+                    if isinstance(x.values.dtype, np.dtype)
+                )
+                == 1
+            )
+
+        if not copy:
+            # constructor preserves views
+            check_views()
+
+        # TODO: most of the rest of this test belongs in indexing tests
+        should_raise = not lib.is_np_dtype(df.dtypes.iloc[0], "fciuO")
+        if should_raise:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[0, 0] = 0
+                df.iloc[0, 1] = 0
+            return
+        else:
+            df.iloc[0, 0] = 0
+            df.iloc[0, 1] = 0
+        if not copy:
+            check_views(True)
+
+        # FIXME(GH#35417): until GH#35417, iloc.setitem into EA values does not preserve
+        #  view, so we have to check in the other direction
+        df.iloc[:, 2] = pd.array([45, 46], dtype=c.dtype)
+        assert df.dtypes.iloc[2] == c.dtype
+        if copy:
+            if a.dtype.kind == "M":
+                assert a[0] == a.dtype.type(1, "ns")
+                assert b[0] == b.dtype.type(3, "ns")
+            else:
+                assert a[0] == a.dtype.type(1)
+                assert b[0] == b.dtype.type(3)
+            # FIXME(GH#35417): enable after GH#35417
+            assert c[0] == c_orig[0]  # i.e. df.iloc[0, 2]=45 did *not* update c
+
+    def test_construct_from_dict_ea_series(self):
+        # GH#53744 - default of copy=True should also apply for Series with
+        # extension dtype
+        ser = Series([1, 2, 3], dtype="Int64")
+        df = DataFrame({"a": ser})
+        assert not np.shares_memory(ser.values._data, df["a"].values._data)
+
+    def test_from_series_with_name_with_columns(self):
+        # GH 7893
+        result = DataFrame(Series(1, name="foo"), columns=["bar"])
+        expected = DataFrame(columns=["bar"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_nested_list_columns(self):
+        # GH 14467
+        result = DataFrame(
+            [[1, 2, 3], [4, 5, 6]], columns=[["A", "A", "A"], ["a", "b", "c"]]
+        )
+        expected = DataFrame(
+            [[1, 2, 3], [4, 5, 6]],
+            columns=MultiIndex.from_tuples([("A", "a"), ("A", "b"), ("A", "c")]),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_from_2d_object_array_of_periods_or_intervals(self):
+        # Period analogue to GH#26825
+        pi = pd.period_range("2016-04-05", periods=3)
+        data = pi._data.astype(object).reshape(1, -1)
+        df = DataFrame(data)
+        assert df.shape == (1, 3)
+        assert (df.dtypes == pi.dtype).all()
+        assert (df == pi).all().all()
+
+        ii = pd.IntervalIndex.from_breaks([3, 4, 5, 6])
+        data2 = ii._data.astype(object).reshape(1, -1)
+        df2 = DataFrame(data2)
+        assert df2.shape == (1, 3)
+        assert (df2.dtypes == ii.dtype).all()
+        assert (df2 == ii).all().all()
+
+        # mixed
+        data3 = np.r_[data, data2, data, data2].T
+        df3 = DataFrame(data3)
+        expected = DataFrame({0: pi, 1: ii, 2: pi, 3: ii})
+        tm.assert_frame_equal(df3, expected)
+
+    @pytest.mark.parametrize(
+        "col_a, col_b",
+        [
+            ([[1], [2]], np.array([[1], [2]])),
+            (np.array([[1], [2]]), [[1], [2]]),
+            (np.array([[1], [2]]), np.array([[1], [2]])),
+        ],
+    )
+    def test_error_from_2darray(self, col_a, col_b):
+        msg = "Per-column arrays must each be 1-dimensional"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({"a": col_a, "b": col_b})
+
+    def test_from_dict_with_missing_copy_false(self):
+        # GH#45369 filled columns should not be views of one another
+        df = DataFrame(index=[1, 2, 3], columns=["a", "b", "c"], copy=False)
+        assert not np.shares_memory(df["a"]._values, df["b"]._values)
+
+        df.iloc[0, 0] = 0
+        expected = DataFrame(
+            {
+                "a": [0, np.nan, np.nan],
+                "b": [np.nan, np.nan, np.nan],
+                "c": [np.nan, np.nan, np.nan],
+            },
+            index=[1, 2, 3],
+            dtype=object,
+        )
+        tm.assert_frame_equal(df, expected)
+
+    def test_construction_empty_array_multi_column_raises(self):
+        # GH#46822
+        msg = r"Shape of passed values is \(0, 1\), indices imply \(0, 2\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(data=np.array([]), columns=["a", "b"])
+
+    def test_construct_with_strings_and_none(self):
+        # GH#32218
+        df = DataFrame(["1", "2", None], columns=["a"], dtype="str")
+        expected = DataFrame({"a": ["1", "2", None]}, dtype="str")
+        tm.assert_frame_equal(df, expected)
+
+    def test_frame_string_inference(self):
+        # GH#54430
+        dtype = pd.StringDtype(na_value=np.nan)
+        expected = DataFrame(
+            {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", "b"]},
+            dtype=dtype,
+            columns=Index(["a"], dtype=dtype),
+            index=Index(["x", "y"], dtype=dtype),
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]}, index=["x", "y"])
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", 1]}, dtype="object", columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", 1]})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", "b"]}, dtype="object", columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": ["a", "b"]}, dtype="object")
+        tm.assert_frame_equal(df, expected)
+
+    def test_frame_string_inference_array_string_dtype(self):
+        # GH#54496
+        dtype = pd.StringDtype(na_value=np.nan)
+        expected = DataFrame(
+            {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame({"a": np.array(["a", "b"])})
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame({0: ["a", "b"], 1: ["c", "d"]}, dtype=dtype)
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame(np.array([["a", "c"], ["b", "d"]]))
+        tm.assert_frame_equal(df, expected)
+
+        expected = DataFrame(
+            {"a": ["a", "b"], "b": ["c", "d"]},
+            dtype=dtype,
+            columns=Index(["a", "b"], dtype=dtype),
+        )
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame(np.array([["a", "c"], ["b", "d"]]), columns=["a", "b"])
+        tm.assert_frame_equal(df, expected)
+
+    def test_frame_string_inference_block_dim(self):
+        # GH#55363
+        with pd.option_context("future.infer_string", True):
+            df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]]))
+        assert df._mgr.blocks[0].ndim == 2
+
+    @pytest.mark.parametrize("klass", [Series, Index])
+    def test_inference_on_pandas_objects(self, klass):
+        # GH#56012
+        obj = klass([Timestamp("2019-12-31")], dtype=object)
+        result = DataFrame(obj, columns=["a"])
+        assert result.dtypes.iloc[0] == np.object_
+
+        result = DataFrame({"a": obj})
+        assert result.dtypes.iloc[0] == np.object_
+
+    def test_dict_keys_returns_rangeindex(self):
+        result = DataFrame({0: [1], 1: [2]}).columns
+        expected = RangeIndex(2)
+        tm.assert_index_equal(result, expected, exact=True)
+
+    @pytest.mark.parametrize(
+        "cons", [Series, Index, DatetimeIndex, DataFrame, pd.array, pd.to_datetime]
+    )
+    def test_construction_datetime_resolution_inference(self, cons):
+        ts = Timestamp(2999, 1, 1)
+        ts2 = ts.tz_localize("US/Pacific")
+
+        obj = cons([ts])
+        res_dtype = tm.get_dtype(obj)
+        assert res_dtype == "M8[us]", res_dtype
+
+        obj2 = cons([ts2])
+        res_dtype2 = tm.get_dtype(obj2)
+        assert res_dtype2 == "M8[us, US/Pacific]", res_dtype2
+
+    def test_construction_nan_value_timedelta64_dtype(self):
+        # GH#60064
+        result = DataFrame([None, 1], dtype="timedelta64[ns]")
+        expected = DataFrame(
+            ["NaT", "0 days 00:00:00.000000001"], dtype="timedelta64[ns]"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_from_array_like_with_name_attribute(self):
+        # GH#61443
+        class DummyArray(np.ndarray):
+            def __new__(cls, input_array):
+                obj = np.asarray(input_array).view(cls)
+                obj.name = "foo"
+                return obj
+
+        dummy = DummyArray(np.eye(3))
+        df = DataFrame(dummy)
+        expected = DataFrame(np.eye(3))
+        tm.assert_frame_equal(df, expected)
+
+
+class TestDataFrameConstructorIndexInference:
+    def test_frame_from_dict_of_series_overlapping_monthly_period_indexes(self):
+        rng1 = pd.period_range("1/1/1999", "1/1/2012", freq="M")
+        s1 = Series(np.random.default_rng(2).standard_normal(len(rng1)), rng1)
+
+        rng2 = pd.period_range("1/1/1980", "12/1/2001", freq="M")
+        s2 = Series(np.random.default_rng(2).standard_normal(len(rng2)), rng2)
+        df = DataFrame({"s1": s1, "s2": s2})
+
+        exp = pd.period_range("1/1/1980", "1/1/2012", freq="M")
+        tm.assert_index_equal(df.index, exp)
+
+    def test_frame_from_dict_with_mixed_tzaware_indexes(self):
+        # GH#44091
+        dti = date_range("2016-01-01", periods=3)
+
+        ser1 = Series(range(3), index=dti)
+        ser2 = Series(range(3), index=dti.tz_localize("UTC"))
+        ser3 = Series(range(3), index=dti.tz_localize("US/Central"))
+        ser4 = Series(range(3))
+
+        # no tz-naive, but we do have mixed tzs and a non-DTI
+        df1 = DataFrame({"A": ser2, "B": ser3, "C": ser4})
+        exp_index = Index(
+            list(ser2.index) + list(ser3.index) + list(ser4.index), dtype=object
+        )
+        tm.assert_index_equal(df1.index, exp_index)
+
+        df2 = DataFrame({"A": ser2, "C": ser4, "B": ser3})
+        exp_index3 = Index(
+            list(ser2.index) + list(ser4.index) + list(ser3.index), dtype=object
+        )
+        tm.assert_index_equal(df2.index, exp_index3)
+
+        df3 = DataFrame({"B": ser3, "A": ser2, "C": ser4})
+        exp_index3 = Index(
+            list(ser3.index) + list(ser2.index) + list(ser4.index), dtype=object
+        )
+        tm.assert_index_equal(df3.index, exp_index3)
+
+        df4 = DataFrame({"C": ser4, "B": ser3, "A": ser2})
+        exp_index4 = Index(
+            list(ser4.index) + list(ser3.index) + list(ser2.index), dtype=object
+        )
+        tm.assert_index_equal(df4.index, exp_index4)
+
+        # TODO: not clear if these raising is desired (no extant tests),
+        #  but this is de facto behavior 2021-12-22
+        msg = "Cannot join tz-naive with tz-aware DatetimeIndex"
+        with pytest.raises(TypeError, match=msg):
+            DataFrame({"A": ser2, "B": ser3, "C": ser4, "D": ser1})
+        with pytest.raises(TypeError, match=msg):
+            DataFrame({"A": ser2, "B": ser3, "D": ser1})
+        with pytest.raises(TypeError, match=msg):
+            DataFrame({"D": ser1, "A": ser2, "B": ser3})
+
+    @pytest.mark.parametrize(
+        "key_val, col_vals, col_type",
+        [
+            ["3", ["3", "4"], "utf8"],
+            [3, [3, 4], "int8"],
+        ],
+    )
+    def test_dict_data_arrow_column_expansion(self, key_val, col_vals, col_type):
+        # GH 53617
+        pa = pytest.importorskip("pyarrow")
+        cols = pd.arrays.ArrowExtensionArray(
+            pa.array(col_vals, type=pa.dictionary(pa.int8(), getattr(pa, col_type)()))
+        )
+        result = DataFrame({key_val: [1, 2]}, columns=cols)
+        expected = DataFrame([[1, np.nan], [2, np.nan]], columns=cols)
+        expected.isetitem(1, expected.iloc[:, 1].astype(object))
+        tm.assert_frame_equal(result, expected)
+
+
+class TestDataFrameConstructorWithDtypeCoercion:
+    def test_floating_values_integer_dtype(self):
+        # GH#40110 make DataFrame behavior with arraylike floating data and
+        #  inty dtype match Series behavior
+
+        arr = np.random.default_rng(2).standard_normal((10, 5))
+
+        # GH#49599 in 2.0 we raise instead of either
+        #  a) silently ignoring dtype and returningfloat (the old Series behavior) or
+        #  b) rounding (the old DataFrame behavior)
+        msg = "Trying to coerce float values to integers"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(arr, dtype="i8")
+
+        df = DataFrame(arr.round(), dtype="i8")
+        assert (df.dtypes == "i8").all()
+
+        # with NaNs, we go through a different path with a different warning
+        arr[0, 0] = np.nan
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+        with pytest.raises(IntCastingNaNError, match=msg):
+            DataFrame(arr, dtype="i8")
+        with pytest.raises(IntCastingNaNError, match=msg):
+            Series(arr[0], dtype="i8")
+        # The future (raising) behavior matches what we would get via astype:
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+        with pytest.raises(IntCastingNaNError, match=msg):
+            DataFrame(arr).astype("i8")
+        with pytest.raises(IntCastingNaNError, match=msg):
+            Series(arr[0]).astype("i8")
+
+
+class TestDataFrameConstructorWithDatetimeTZ:
+    @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
+    def test_construction_preserves_tzaware_dtypes(self, tz):
+        # after GH#7822
+        # these retain the timezones on dict construction
+        dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI", unit="ns")
+        dr_tz = dr.tz_localize(tz)
+        df = DataFrame({"A": "foo", "B": dr_tz}, index=dr)
+        tz_expected = DatetimeTZDtype("ns", dr_tz.tzinfo)
+        assert df["B"].dtype == tz_expected
+
+        # GH#2810 (with timezones)
+        datetimes_naive = [ts.to_pydatetime() for ts in dr]
+        datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
+        df = DataFrame({"dr": dr})
+        df["dr_tz"] = dr_tz
+        df["datetimes_naive"] = datetimes_naive
+        df["datetimes_with_tz"] = datetimes_with_tz
+        result = df.dtypes
+        expected = Series(
+            [
+                np.dtype("datetime64[ns]"),
+                DatetimeTZDtype(tz=tz),
+                np.dtype("datetime64[us]"),
+                DatetimeTZDtype(tz=tz, unit="us"),
+            ],
+            index=["dr", "dr_tz", "datetimes_naive", "datetimes_with_tz"],
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("pydt", [True, False])
+    def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture, pydt):
+        # GH#25843, GH#41555, GH#33401
+        tz = tz_aware_fixture
+        ts = Timestamp("2019", tz=tz)
+        if pydt:
+            ts = ts.to_pydatetime()
+
+        msg = (
+            "Cannot convert timezone-aware data to timezone-naive dtype. "
+            r"Use pd.Series\(values\).dt.tz_localize\(None\) instead."
+        )
+        with pytest.raises(ValueError, match=msg):
+            DataFrame({0: [ts]}, dtype="datetime64[ns]")
+
+        msg2 = "Cannot unbox tzaware Timestamp to tznaive dtype"
+        with pytest.raises(TypeError, match=msg2):
+            DataFrame({0: ts}, index=[0], dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([ts], dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(np.array([ts], dtype=object), dtype="datetime64[ns]")
+
+        with pytest.raises(TypeError, match=msg2):
+            DataFrame(ts, index=[0], columns=[0], dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([Series([ts])], dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            DataFrame([[ts]], columns=[0], dtype="datetime64[ns]")
+
+    def test_from_dict(self):
+        # 8260
+        # support datetime64 with tz
+
+        idx = Index(date_range("20130101", periods=3, tz="US/Eastern"), name="foo")
+        dr = date_range("20130110", periods=3)
+
+        # construction
+        df = DataFrame({"A": idx, "B": dr})
+        assert df["A"].dtype, "M8[ns, US/Eastern"
+        assert df["A"].name == "A"
+        tm.assert_series_equal(df["A"], Series(idx, name="A"))
+        tm.assert_series_equal(df["B"], Series(dr, name="B"))
+
+    def test_from_index(self):
+        # from index
+        idx2 = date_range("20130101", periods=3, tz="US/Eastern", name="foo")
+        df2 = DataFrame(idx2)
+        tm.assert_series_equal(df2["foo"], Series(idx2, name="foo"))
+        df2 = DataFrame(Series(idx2))
+        tm.assert_series_equal(df2["foo"], Series(idx2, name="foo"))
+
+        idx2 = date_range("20130101", periods=3, tz="US/Eastern")
+        df2 = DataFrame(idx2)
+        tm.assert_series_equal(df2[0], Series(idx2, name=0))
+        df2 = DataFrame(Series(idx2))
+        tm.assert_series_equal(df2[0], Series(idx2, name=0))
+
+    def test_frame_dict_constructor_datetime64_1680(self):
+        dr = date_range("1/1/2012", periods=10)
+        s = Series(dr, index=dr)
+
+        # it works!
+        DataFrame({"a": "foo", "b": s}, index=dr)
+        DataFrame({"a": "foo", "b": s.values}, index=dr)
+
+    def test_frame_datetime64_mixed_index_ctor_1681(self):
+        dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
+        ts = Series(dr)
+
+        # it works!
+        d = DataFrame({"A": "foo", "B": ts}, index=dr)
+        assert d["B"].isna().all()
+
+    def test_frame_timeseries_column(self):
+        # GH19157
+        dr = date_range(
+            start="20130101T10:00:00", periods=3, freq="min", tz="US/Eastern", unit="ns"
+        )
+        result = DataFrame(dr, columns=["timestamps"])
+        expected = DataFrame(
+            {
+                "timestamps": [
+                    Timestamp("20130101T10:00:00", tz="US/Eastern"),
+                    Timestamp("20130101T10:01:00", tz="US/Eastern"),
+                    Timestamp("20130101T10:02:00", tz="US/Eastern"),
+                ]
+            },
+            dtype="M8[ns, US/Eastern]",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_nested_dict_construction(self):
+        # GH22227
+        columns = ["Nevada", "Ohio"]
+        pop = {
+            "Nevada": {2001: 2.4, 2002: 2.9},
+            "Ohio": {2000: 1.5, 2001: 1.7, 2002: 3.6},
+        }
+        result = DataFrame(pop, index=[2001, 2002, 2003], columns=columns)
+        expected = DataFrame(
+            [(2.4, 1.7), (2.9, 3.6), (np.nan, np.nan)],
+            columns=columns,
+            index=Index([2001, 2002, 2003]),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_from_tzaware_object_array(self):
+        # GH#26825 2D object array of tzaware timestamps should not raise
+        dti = date_range("2016-04-05 04:30", periods=3, tz="UTC")
+        data = dti._data.astype(object).reshape(1, -1)
+        df = DataFrame(data)
+        assert df.shape == (1, 3)
+        assert (df.dtypes == dti.dtype).all()
+        assert (df == dti).all().all()
+
+    def test_from_tzaware_mixed_object_array(self):
+        # GH#26825
+        arr = np.array(
+            [
+                [
+                    Timestamp("2013-01-01 00:00:00"),
+                    Timestamp("2013-01-02 00:00:00"),
+                    Timestamp("2013-01-03 00:00:00"),
+                ],
+                [
+                    Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
+                    pd.NaT,
+                    Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
+                ],
+                [
+                    Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
+                    pd.NaT,
+                    Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
+                ],
+            ],
+            dtype=object,
+        ).T
+        res = DataFrame(arr, columns=["A", "B", "C"])
+
+        expected_dtypes = [
+            "datetime64[us]",
+            "datetime64[us, US/Eastern]",
+            "datetime64[us, CET]",
+        ]
+        assert (res.dtypes == expected_dtypes).all()
+
+    def test_from_2d_ndarray_with_dtype(self):
+        # GH#12513
+        array_dim2 = np.arange(10).reshape((5, 2))
+        df = DataFrame(array_dim2, dtype="datetime64[ns, UTC]")
+
+        expected = DataFrame(array_dim2).astype("datetime64[ns, UTC]")
+        tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize("typ", [set, frozenset])
+    def test_construction_from_set_raises(self, typ):
+        # https://github.com/pandas-dev/pandas/issues/32582
+        values = typ({1, 2, 3})
+        msg = f"'{typ.__name__}' type is unordered"
+        with pytest.raises(TypeError, match=msg):
+            DataFrame({"a": values})
+
+        with pytest.raises(TypeError, match=msg):
+            Series(values)
+
+    def test_construction_from_ndarray_datetimelike(self):
+        # ensure the underlying arrays are properly wrapped as EA when
+        # constructed from 2D ndarray
+        arr = np.arange(0, 12, dtype="datetime64[ns]").reshape(4, 3)
+        df = DataFrame(arr)
+        assert all(isinstance(block.values, DatetimeArray) for block in df._mgr.blocks)
+
+    def test_construction_from_ndarray_with_eadtype_mismatched_columns(self):
+        arr = np.random.default_rng(2).standard_normal((10, 2))
+        dtype = pd.array([2.0]).dtype
+        msg = r"len\(arrays\) must match len\(columns\)"
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(arr, columns=["foo"], dtype=dtype)
+
+        arr2 = pd.array([2.0, 3.0, 4.0])
+        with pytest.raises(ValueError, match=msg):
+            DataFrame(arr2, columns=["foo", "bar"])
+
+    def test_columns_indexes_raise_on_sets(self):
+        # GH 47215
+        data = [[1, 2, 3], [4, 5, 6]]
+        with pytest.raises(ValueError, match="index cannot be a set"):
+            DataFrame(data, index={"a", "b"})
+        with pytest.raises(ValueError, match="columns cannot be a set"):
+            DataFrame(data, columns={"a", "b", "c"})
+
+    def test_from_dict_with_columns_na_scalar(self):
+        result = DataFrame({"a": pd.NaT}, columns=["a"], index=range(2))
+        expected = DataFrame({"a": Series([pd.NaT, pd.NaT])})
+        tm.assert_frame_equal(result, expected)
+
+    # TODO: make this not cast to object in pandas 3.0
+    @pytest.mark.skipif(
+        not np_version_gt2, reason="StringDType only available in numpy 2 and above"
+    )
+    @pytest.mark.parametrize(
+        "data",
+        [
+            {"a": ["a", "b", "c"], "b": [1.0, 2.0, 3.0], "c": ["d", "e", "f"]},
+        ],
+    )
+    def test_np_string_array_object_cast(self, data):
+        from numpy.dtypes import StringDType
+
+        data["a"] = np.array(data["a"], dtype=StringDType())
+        res = DataFrame(data)
+        assert res["a"].dtype == np.object_
+        assert (res["a"] == data["a"]).all()
+
+
+def get1(obj):  # TODO: make a helper in tm?
+    if isinstance(obj, Series):
+        return obj.iloc[0]
+    else:
+        return obj.iloc[0, 0]
+
+
+class TestFromScalar:
+    @pytest.fixture(params=[list, dict, None])
+    def box(self, request):
+        return request.param
+
+    @pytest.fixture
+    def constructor(self, frame_or_series, box):
+        extra = {"index": range(2)}
+        if frame_or_series is DataFrame:
+            extra["columns"] = ["A"]
+
+        if box is None:
+            return functools.partial(frame_or_series, **extra)
+
+        elif box is dict:
+            if frame_or_series is Series:
+                return lambda x, **kwargs: frame_or_series(
+                    {0: x, 1: x}, **extra, **kwargs
+                )
+            else:
+                return lambda x, **kwargs: frame_or_series({"A": x}, **extra, **kwargs)
+        elif frame_or_series is Series:
+            return lambda x, **kwargs: frame_or_series([x, x], **extra, **kwargs)
+        else:
+            return lambda x, **kwargs: frame_or_series({"A": [x, x]}, **extra, **kwargs)
+
+    @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
+    def test_from_nat_scalar(self, dtype, constructor):
+        obj = constructor(pd.NaT, dtype=dtype)
+        assert np.all(obj.dtypes == dtype)
+        assert np.all(obj.isna())
+
+    def test_from_timedelta_scalar_preserves_nanos(self, constructor):
+        td = Timedelta(1)
+
+        obj = constructor(td, dtype="m8[ns]")
+        assert get1(obj) == td
+
+    def test_from_timestamp_scalar_preserves_nanos(self, constructor, fixed_now_ts):
+        ts = fixed_now_ts + Timedelta(1)
+
+        obj = constructor(ts, dtype="M8[ns]")
+        assert get1(obj) == ts
+
+    def test_from_timedelta64_scalar_object(self, constructor):
+        td = Timedelta(1)
+        td64 = td.to_timedelta64()
+
+        obj = constructor(td64, dtype=object)
+        assert isinstance(get1(obj), np.timedelta64)
+
+    @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64])
+    def test_from_scalar_datetimelike_mismatched(self, constructor, cls):
+        scalar = cls("NaT", "ns")
+        dtype = {np.datetime64: "m8[ns]", np.timedelta64: "M8[ns]"}[cls]
+
+        if cls is np.datetime64:
+            msg1 = "Invalid type for timedelta scalar: <class 'numpy.datetime64'>"
+        else:
+            msg1 = "<class 'numpy.timedelta64'> is not convertible to datetime"
+        msg = "|".join(["Cannot cast", msg1])
+
+        with pytest.raises(TypeError, match=msg):
+            constructor(scalar, dtype=dtype)
+
+        scalar = cls(4, "ns")
+        with pytest.raises(TypeError, match=msg):
+            constructor(scalar, dtype=dtype)
+
+    @pytest.mark.parametrize("cls", [datetime, np.datetime64])
+    def test_from_out_of_bounds_ns_datetime(
+        self, constructor, cls, request, box, frame_or_series
+    ):
+        # scalar that won't fit in nanosecond dt64, but will fit in microsecond
+        scalar = datetime(9999, 1, 1)
+        exp_dtype = "M8[us]"  # pydatetime objects default to this reso
+
+        if cls is np.datetime64:
+            scalar = np.datetime64(scalar, "D")
+            exp_dtype = "M8[s]"  # closest reso to input
+        result = constructor(scalar)
+
+        item = get1(result)
+        dtype = tm.get_dtype(result)
+
+        assert type(item) is Timestamp
+        assert item.asm8.dtype == exp_dtype
+        assert dtype == exp_dtype
+
+    def test_out_of_s_bounds_datetime64(self, constructor):
+        scalar = np.datetime64(np.iinfo(np.int64).max, "D")
+        result = constructor(scalar)
+        item = get1(result)
+        assert type(item) is np.datetime64
+        dtype = tm.get_dtype(result)
+        assert dtype == object
+
+    @pytest.mark.parametrize("cls", [timedelta, np.timedelta64])
+    def test_from_out_of_bounds_ns_timedelta(
+        self, constructor, cls, box, frame_or_series
+    ):
+        scalar = datetime(9999, 1, 1) - datetime(1970, 1, 1)
+        exp_dtype = "m8[us]"  # smallest reso that fits
+        if cls is np.timedelta64:
+            scalar = np.timedelta64(scalar, "D")
+            exp_dtype = "m8[s]"  # closest reso to input
+        result = constructor(scalar)
+
+        item = get1(result)
+        dtype = tm.get_dtype(result)
+
+        assert type(item) is Timedelta
+        assert item.asm8.dtype == exp_dtype
+        assert dtype == exp_dtype
+
+    @pytest.mark.parametrize("cls", [np.datetime64, np.timedelta64])
+    def test_out_of_s_bounds_timedelta64(self, constructor, cls):
+        scalar = cls(np.iinfo(np.int64).max, "D")
+        result = constructor(scalar)
+        item = get1(result)
+        assert type(item) is cls
+        dtype = tm.get_dtype(result)
+        assert dtype == object
+
+    def test_tzaware_data_tznaive_dtype(self, constructor, box, frame_or_series):
+        tz = "US/Eastern"
+        ts = Timestamp("2019", tz=tz)
+
+        if box is None or (frame_or_series is DataFrame and box is dict):
+            msg = "Cannot unbox tzaware Timestamp to tznaive dtype"
+            err = TypeError
+        else:
+            msg = (
+                "Cannot convert timezone-aware data to timezone-naive dtype. "
+                r"Use pd.Series\(values\).dt.tz_localize\(None\) instead."
+            )
+            err = ValueError
+
+        with pytest.raises(err, match=msg):
+            constructor(ts, dtype="M8[ns]")
+
+
+# TODO: better location for this test?
+class TestAllowNonNano:
+    # Until 2.0, we do not preserve non-nano dt64/td64 when passed as ndarray,
+    #  but do preserve it when passed as DTA/TDA
+
+    @pytest.fixture(params=[True, False])
+    def as_td(self, request):
+        return request.param
+
+    @pytest.fixture
+    def arr(self, as_td):
+        values = np.arange(5).astype(np.int64).view("M8[s]")
+        if as_td:
+            values = values - values[0]
+            return TimedeltaArray._simple_new(values, dtype=values.dtype)
+        else:
+            return DatetimeArray._simple_new(values, dtype=values.dtype)
+
+    def test_index_allow_non_nano(self, arr):
+        idx = Index(arr)
+        assert idx.dtype == arr.dtype
+
+    def test_dti_tdi_allow_non_nano(self, arr, as_td):
+        if as_td:
+            idx = pd.TimedeltaIndex(arr)
+        else:
+            idx = DatetimeIndex(arr)
+        assert idx.dtype == arr.dtype
+
+    def test_series_allow_non_nano(self, arr):
+        ser = Series(arr)
+        assert ser.dtype == arr.dtype
+
+    def test_frame_allow_non_nano(self, arr):
+        df = DataFrame(arr)
+        assert df.dtypes[0] == arr.dtype
+
+    def test_frame_from_dict_allow_non_nano(self, arr):
+        df = DataFrame({0: arr})
+        assert df.dtypes[0] == arr.dtype
diff --git a/pandas/tests/frame/test_cumulative.py b/pandas/tests/frame/test_cumulative.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab217e1b1332a67d3d17c086232dcf18e91e2a6f
--- /dev/null
+++ b/pandas/tests/frame/test_cumulative.py
@@ -0,0 +1,107 @@
+"""
+Tests for DataFrame cumulative operations
+
+See also
+--------
+tests.series.test_cumulative
+"""
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+    Timestamp,
+)
+import pandas._testing as tm
+
+
+class TestDataFrameCumulativeOps:
+    # ---------------------------------------------------------------------
+    # Cumulative Operations - cumsum, cummax, ...
+
+    def test_cumulative_ops_smoke(self):
+        # it works
+        df = DataFrame({"A": np.arange(20)}, index=np.arange(20))
+        df.cummax()
+        df.cummin()
+        df.cumsum()
+
+        dm = DataFrame(np.arange(20).reshape(4, 5), index=range(4), columns=range(5))
+        # TODO(wesm): do something with this?
+        dm.cumsum()
+
+    def test_cumprod_smoke(self, datetime_frame):
+        datetime_frame.iloc[5:10, 0] = np.nan
+        datetime_frame.iloc[10:15, 1] = np.nan
+        datetime_frame.iloc[15:, 2] = np.nan
+
+        # ints
+        df = datetime_frame.fillna(0).astype(int)
+        df.cumprod(0)
+        df.cumprod(1)
+
+        # ints32
+        df = datetime_frame.fillna(0).astype(np.int32)
+        df.cumprod(0)
+        df.cumprod(1)
+
+    def test_cumulative_ops_match_series_apply(
+        self, datetime_frame, all_numeric_accumulations
+    ):
+        datetime_frame.iloc[5:10, 0] = np.nan
+        datetime_frame.iloc[10:15, 1] = np.nan
+        datetime_frame.iloc[15:, 2] = np.nan
+
+        # axis = 0
+        result = getattr(datetime_frame, all_numeric_accumulations)()
+        expected = datetime_frame.apply(getattr(Series, all_numeric_accumulations))
+        tm.assert_frame_equal(result, expected)
+
+        # axis = 1
+        result = getattr(datetime_frame, all_numeric_accumulations)(axis=1)
+        expected = datetime_frame.apply(
+            getattr(Series, all_numeric_accumulations), axis=1
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # fix issue TODO: GH ref?
+        assert np.shape(result) == np.shape(datetime_frame)
+
+    def test_cumsum_preserve_dtypes(self):
+        # GH#19296 dont incorrectly upcast to object
+        df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3.0], "C": [True, False, False]})
+
+        result = df.cumsum()
+
+        expected = DataFrame(
+            {
+                "A": Series([1, 3, 6], dtype=np.int64),
+                "B": Series([1, 3, 6], dtype=np.float64),
+                "C": df["C"].cumsum(),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("method", ["cumsum", "cumprod", "cummin", "cummax"])
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_numeric_only_flag(self, method, axis):
+        df = DataFrame(
+            {
+                "int": [1, 2, 3],
+                "bool": [True, False, False],
+                "string": ["a", "b", "c"],
+                "float": [1.0, 3.5, 4.0],
+                "datetime": [
+                    Timestamp(2018, 1, 1),
+                    Timestamp(2019, 1, 1),
+                    Timestamp(2020, 1, 1),
+                ],
+            }
+        )
+        df_numeric_only = df.drop(["string", "datetime"], axis=1)
+
+        result = getattr(df, method)(axis=axis, numeric_only=True)
+        expected = getattr(df_numeric_only, method)(axis)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_iteration.py b/pandas/tests/frame/test_iteration.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c23ff05f3e19aca490444216ec295453483e80
--- /dev/null
+++ b/pandas/tests/frame/test_iteration.py
@@ -0,0 +1,160 @@
+import datetime
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
+
+from pandas import (
+    Categorical,
+    DataFrame,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+class TestIteration:
+    def test_keys(self, float_frame):
+        assert float_frame.keys() is float_frame.columns
+
+    def test_iteritems(self):
+        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
+        for k, v in df.items():
+            assert isinstance(v, DataFrame._constructor_sliced)
+
+    def test_items(self):
+        # GH#17213, GH#13918
+        cols = ["a", "b", "c"]
+        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
+        for c, (k, v) in zip(cols, df.items()):
+            assert c == k
+            assert isinstance(v, Series)
+            assert (df[k] == v).all()
+
+    def test_items_names(self, float_string_frame):
+        for k, v in float_string_frame.items():
+            assert v.name == k
+
+    def test_iter(self, float_frame):
+        assert list(float_frame) == list(float_frame.columns)
+
+    def test_iterrows(self, float_frame, float_string_frame):
+        for k, v in float_frame.iterrows():
+            exp = float_frame.loc[k]
+            tm.assert_series_equal(v, exp)
+
+        for k, v in float_string_frame.iterrows():
+            exp = float_string_frame.loc[k]
+            tm.assert_series_equal(v, exp)
+
+    def test_iterrows_iso8601(self):
+        # GH#19671
+        s = DataFrame(
+            {
+                "non_iso8601": ["M1701", "M1802", "M1903", "M2004"],
+                "iso8601": date_range("2000-01-01", periods=4, freq="ME"),
+            }
+        )
+        for k, v in s.iterrows():
+            exp = s.loc[k]
+            tm.assert_series_equal(v, exp)
+
+    def test_iterrows_corner(self):
+        # GH#12222
+        df = DataFrame(
+            {
+                "a": [datetime.datetime(2015, 1, 1)],
+                "b": [None],
+                "c": [None],
+                "d": [""],
+                "e": [[]],
+                "f": [set()],
+                "g": [{}],
+            }
+        )
+        expected = Series(
+            [datetime.datetime(2015, 1, 1), None, None, "", [], set(), {}],
+            index=list("abcdefg"),
+            name=0,
+            dtype="object",
+        )
+        _, result = next(df.iterrows())
+        tm.assert_series_equal(result, expected)
+
+    def test_itertuples(self, float_frame):
+        for i, tup in enumerate(float_frame.itertuples()):
+            ser = DataFrame._constructor_sliced(tup[1:])
+            ser.name = tup[0]
+            expected = float_frame.iloc[i, :].reset_index(drop=True)
+            tm.assert_series_equal(ser, expected)
+
+    def test_itertuples_index_false(self):
+        df = DataFrame(
+            {"floats": np.random.default_rng(2).standard_normal(5), "ints": range(5)},
+            columns=["floats", "ints"],
+        )
+
+        for tup in df.itertuples(index=False):
+            assert isinstance(tup[1], int)
+
+    def test_itertuples_duplicate_cols(self):
+        df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
+        dfaa = df[["a", "a"]]
+
+        assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]
+
+        # repr with int on 32-bit/windows
+        if not (is_platform_windows() or not IS64):
+            assert (
+                repr(list(df.itertuples(name=None)))
+                == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
+            )
+
+    def test_itertuples_tuple_name(self):
+        df = DataFrame(data={"a": [1, 2, 3], "b": [4, 5, 6]})
+        tup = next(df.itertuples(name="TestName"))
+        assert tup._fields == ("Index", "a", "b")
+        assert (tup.Index, tup.a, tup.b) == tup
+        assert type(tup).__name__ == "TestName"
+
+    def test_itertuples_disallowed_col_labels(self):
+        df = DataFrame(data={"def": [1, 2, 3], "return": [4, 5, 6]})
+        tup2 = next(df.itertuples(name="TestName"))
+        assert tup2 == (0, 1, 4)
+        assert tup2._fields == ("Index", "_1", "_2")
+
+    @pytest.mark.parametrize("limit", [254, 255, 1024])
+    @pytest.mark.parametrize("index", [True, False])
+    def test_itertuples_py2_3_field_limit_namedtuple(self, limit, index):
+        # GH#28282
+        df = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(limit)}])
+        result = next(df.itertuples(index=index))
+        assert isinstance(result, tuple)
+        assert hasattr(result, "_fields")
+
+    def test_sequence_like_with_categorical(self):
+        # GH#7839
+        # make sure can iterate
+        df = DataFrame(
+            {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]}
+        )
+        df["grade"] = Categorical(df["raw_grade"])
+
+        # basic sequencing testing
+        result = list(df.grade.values)
+        expected = np.array(df.grade.values).tolist()
+        tm.assert_almost_equal(result, expected)
+
+        # iteration
+        for t in df.itertuples(index=False):
+            str(t)
+
+        for row, s in df.iterrows():
+            str(s)
+
+        for c, col in df.items():
+            str(col)
diff --git a/pandas/tests/frame/test_logical_ops.py b/pandas/tests/frame/test_logical_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..251a7407edcdc16877f74ab8024a1c4dc64a730f
--- /dev/null
+++ b/pandas/tests/frame/test_logical_ops.py
@@ -0,0 +1,211 @@
+import operator
+import re
+
+import numpy as np
+import pytest
+
+from pandas import (
+    CategoricalIndex,
+    DataFrame,
+    Interval,
+    Series,
+    isnull,
+)
+import pandas._testing as tm
+
+
+class TestDataFrameLogicalOperators:
+    # &, |, ^
+
+    @pytest.mark.parametrize(
+        "left, right, op, expected",
+        [
+            (
+                [True, False, np.nan],
+                [True, False, True],
+                operator.and_,
+                [True, False, False],
+            ),
+            (
+                [True, False, True],
+                [True, False, np.nan],
+                operator.and_,
+                [True, False, False],
+            ),
+            (
+                [True, False, np.nan],
+                [True, False, True],
+                operator.or_,
+                [True, False, False],
+            ),
+            (
+                [True, False, True],
+                [True, False, np.nan],
+                operator.or_,
+                [True, False, True],
+            ),
+        ],
+    )
+    def test_logical_operators_nans(self, left, right, op, expected, frame_or_series):
+        # GH#13896
+        result = op(frame_or_series(left), frame_or_series(right))
+        expected = frame_or_series(expected)
+
+        tm.assert_equal(result, expected)
+
+    def test_logical_ops_empty_frame(self):
+        # GH#5808
+        # empty frames, non-mixed dtype
+        df = DataFrame(index=[1])
+
+        result = df & df
+        tm.assert_frame_equal(result, df)
+
+        result = df | df
+        tm.assert_frame_equal(result, df)
+
+        df2 = DataFrame(index=[1, 2])
+        result = df & df2
+        tm.assert_frame_equal(result, df2)
+
+        dfa = DataFrame(index=[1], columns=["A"])
+
+        result = dfa & dfa
+        expected = DataFrame(False, index=[1], columns=["A"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_logical_ops_bool_frame(self):
+        # GH#5808
+        df1a_bool = DataFrame(True, index=[1], columns=["A"])
+
+        result = df1a_bool & df1a_bool
+        tm.assert_frame_equal(result, df1a_bool)
+
+        result = df1a_bool | df1a_bool
+        tm.assert_frame_equal(result, df1a_bool)
+
+    def test_logical_ops_int_frame(self):
+        # GH#5808
+        df1a_int = DataFrame(1, index=[1], columns=["A"])
+        df1a_bool = DataFrame(True, index=[1], columns=["A"])
+
+        result = df1a_int | df1a_bool
+        tm.assert_frame_equal(result, df1a_bool)
+
+        # Check that this matches Series behavior
+        res_ser = df1a_int["A"] | df1a_bool["A"]
+        tm.assert_series_equal(res_ser, df1a_bool["A"])
+
+    def test_logical_ops_invalid(self, using_infer_string):
+        # GH#5808
+
+        df1 = DataFrame(1.0, index=[1], columns=["A"])
+        df2 = DataFrame(True, index=[1], columns=["A"])
+        msg = re.escape("unsupported operand type(s) for |: 'float' and 'bool'")
+        with pytest.raises(TypeError, match=msg):
+            df1 | df2
+
+        df1 = DataFrame("foo", index=[1], columns=["A"])
+        df2 = DataFrame(True, index=[1], columns=["A"])
+        if using_infer_string and df1["A"].dtype.storage == "pyarrow":
+            msg = "operation 'or_' not supported for dtype 'str'"
+        else:
+            msg = re.escape("unsupported operand type(s) for |: 'str' and 'bool'")
+        with pytest.raises(TypeError, match=msg):
+            df1 | df2
+
+    def test_logical_operators(self):
+        def _check_bin_op(op):
+            result = op(df1, df2)
+            expected = DataFrame(
+                op(df1.values, df2.values), index=df1.index, columns=df1.columns
+            )
+            assert result.values.dtype == np.bool_
+            tm.assert_frame_equal(result, expected)
+
+        def _check_unary_op(op):
+            result = op(df1)
+            expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns)
+            assert result.values.dtype == np.bool_
+            tm.assert_frame_equal(result, expected)
+
+        df1 = {
+            "a": {"a": True, "b": False, "c": False, "d": True, "e": True},
+            "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
+            "c": {"a": False, "b": False, "c": True, "d": False, "e": False},
+            "d": {"a": True, "b": False, "c": False, "d": True, "e": True},
+            "e": {"a": True, "b": False, "c": False, "d": True, "e": True},
+        }
+
+        df2 = {
+            "a": {"a": True, "b": False, "c": True, "d": False, "e": False},
+            "b": {"a": False, "b": True, "c": False, "d": False, "e": False},
+            "c": {"a": True, "b": False, "c": True, "d": False, "e": False},
+            "d": {"a": False, "b": False, "c": False, "d": True, "e": False},
+            "e": {"a": False, "b": False, "c": False, "d": False, "e": True},
+        }
+
+        df1 = DataFrame(df1)
+        df2 = DataFrame(df2)
+
+        _check_bin_op(operator.and_)
+        _check_bin_op(operator.or_)
+        _check_bin_op(operator.xor)
+
+        _check_unary_op(operator.inv)  # TODO: belongs elsewhere
+
+    def test_logical_with_nas(self):
+        d = DataFrame({"a": [np.nan, False], "b": [True, True]})
+
+        # GH4947
+        # bool comparisons should return bool
+        result = d["a"] | d["b"]
+        expected = Series([False, True])
+        tm.assert_series_equal(result, expected)
+
+        # GH4604, automatic casting here
+        result = d["a"].fillna(False) | d["b"]
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+        result = d["a"].fillna(False) | d["b"]
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+    def test_logical_ops_categorical_columns(self):
+        # GH#38367
+        intervals = [Interval(1, 2), Interval(3, 4)]
+        data = DataFrame(
+            [[1, np.nan], [2, np.nan]],
+            columns=CategoricalIndex(
+                intervals, categories=[*intervals, Interval(5, 6)]
+            ),
+        )
+        mask = DataFrame(
+            [[False, False], [False, False]], columns=data.columns, dtype=bool
+        )
+        result = mask | isnull(data)
+        expected = DataFrame(
+            [[False, True], [False, True]],
+            columns=CategoricalIndex(
+                intervals, categories=[*intervals, Interval(5, 6)]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_int_dtype_different_index_not_bool(self):
+        # GH 52500
+        df1 = DataFrame([1, 2, 3], index=[10, 11, 23], columns=["a"])
+        df2 = DataFrame([10, 20, 30], index=[11, 10, 23], columns=["a"])
+        result = np.bitwise_xor(df1, df2)
+        expected = DataFrame([21, 8, 29], index=[10, 11, 23], columns=["a"])
+        tm.assert_frame_equal(result, expected)
+
+        result = df1 ^ df2
+        tm.assert_frame_equal(result, expected)
+
+    def test_different_dtypes_different_index_raises(self):
+        # GH 52538
+        df1 = DataFrame([1, 2], index=["a", "b"])
+        df2 = DataFrame([3, 4], index=["b", "c"])
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            df1 & df2
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e9aa2325e880d1f6ef651d24f31b87c35bba5f9
--- /dev/null
+++ b/pandas/tests/frame/test_nonunique_indexes.py
@@ -0,0 +1,336 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+class TestDataFrameNonuniqueIndexes:
+    def test_setattr_columns_vs_construct_with_columns(self):
+        # assignment
+        # GH 3687
+        arr = np.random.default_rng(2).standard_normal((3, 2))
+        idx = list(range(2))
+        df = DataFrame(arr, columns=["A", "A"])
+        df.columns = idx
+        expected = DataFrame(arr, columns=idx)
+        tm.assert_frame_equal(df, expected)
+
+    def test_setattr_columns_vs_construct_with_columns_datetimeindx(self):
+        idx = date_range("20130101", periods=4, freq="QE-NOV")
+        df = DataFrame(
+            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=["a", "a", "a", "a"]
+        )
+        df.columns = idx
+        expected = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
+        tm.assert_frame_equal(df, expected)
+
+    def test_insert_with_duplicate_columns(self):
+        # insert
+        df = DataFrame(
+            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
+            columns=["foo", "bar", "foo", "hello"],
+        )
+        df["string"] = "bah"
+        expected = DataFrame(
+            [[1, 1, 1, 5, "bah"], [1, 1, 2, 5, "bah"], [2, 1, 3, 5, "bah"]],
+            columns=["foo", "bar", "foo", "hello", "string"],
+        )
+        tm.assert_frame_equal(df, expected)
+        with pytest.raises(ValueError, match="Length of value"):
+            df.insert(0, "AnotherColumn", range(len(df.index) - 1))
+
+        # insert same dtype
+        df["foo2"] = 3
+        expected = DataFrame(
+            [[1, 1, 1, 5, "bah", 3], [1, 1, 2, 5, "bah", 3], [2, 1, 3, 5, "bah", 3]],
+            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        # set (non-dup)
+        df["foo2"] = 4
+        expected = DataFrame(
+            [[1, 1, 1, 5, "bah", 4], [1, 1, 2, 5, "bah", 4], [2, 1, 3, 5, "bah", 4]],
+            columns=["foo", "bar", "foo", "hello", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+        df["foo2"] = 3
+
+        # delete (non dup)
+        del df["bar"]
+        expected = DataFrame(
+            [[1, 1, 5, "bah", 3], [1, 2, 5, "bah", 3], [2, 3, 5, "bah", 3]],
+            columns=["foo", "foo", "hello", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        # try to delete again (its not consolidated)
+        del df["hello"]
+        expected = DataFrame(
+            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
+            columns=["foo", "foo", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        # consolidate
+        df = df._consolidate()
+        expected = DataFrame(
+            [[1, 1, "bah", 3], [1, 2, "bah", 3], [2, 3, "bah", 3]],
+            columns=["foo", "foo", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        # insert
+        df.insert(2, "new_col", 5.0)
+        expected = DataFrame(
+            [[1, 1, 5.0, "bah", 3], [1, 2, 5.0, "bah", 3], [2, 3, 5.0, "bah", 3]],
+            columns=["foo", "foo", "new_col", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        # insert a dup
+        with pytest.raises(ValueError, match="cannot insert"):
+            df.insert(2, "new_col", 4.0)
+
+        df.insert(2, "new_col", 4.0, allow_duplicates=True)
+        expected = DataFrame(
+            [
+                [1, 1, 4.0, 5.0, "bah", 3],
+                [1, 2, 4.0, 5.0, "bah", 3],
+                [2, 3, 4.0, 5.0, "bah", 3],
+            ],
+            columns=["foo", "foo", "new_col", "new_col", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        # delete (dup)
+        del df["foo"]
+        expected = DataFrame(
+            [[4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3], [4.0, 5.0, "bah", 3]],
+            columns=["new_col", "new_col", "string", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+    def test_dup_across_dtypes(self):
+        # dup across dtypes
+        df = DataFrame(
+            [[1, 1, 1.0, 5], [1, 1, 2.0, 5], [2, 1, 3.0, 5]],
+            columns=["foo", "bar", "foo", "hello"],
+        )
+
+        df["foo2"] = 7.0
+        expected = DataFrame(
+            [[1, 1, 1.0, 5, 7.0], [1, 1, 2.0, 5, 7.0], [2, 1, 3.0, 5, 7.0]],
+            columns=["foo", "bar", "foo", "hello", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        result = df["foo"]
+        expected = DataFrame([[1, 1.0], [1, 2.0], [2, 3.0]], columns=["foo", "foo"])
+        tm.assert_frame_equal(result, expected)
+
+        # multiple replacements
+        df["foo"] = "string"
+        expected = DataFrame(
+            [
+                ["string", 1, "string", 5, 7.0],
+                ["string", 1, "string", 5, 7.0],
+                ["string", 1, "string", 5, 7.0],
+            ],
+            columns=["foo", "bar", "foo", "hello", "foo2"],
+        )
+        tm.assert_frame_equal(df, expected)
+
+        del df["foo"]
+        expected = DataFrame(
+            [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "hello", "foo2"]
+        )
+        tm.assert_frame_equal(df, expected)
+
+    def test_column_dups_indexes(self):
+        # check column dups with index equal and not equal to df's index
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)),
+            index=["a", "b", "c", "d", "e"],
+            columns=["A", "B", "A"],
+        )
+        for index in [df.index, pd.Index(list("edcba"))]:
+            this_df = df.copy()
+            expected_ser = Series(index.values, index=this_df.index)
+            expected_df = DataFrame(
+                {"A": expected_ser, "B": this_df["B"]},
+                columns=["A", "B", "A"],
+            )
+            this_df["A"] = index
+            tm.assert_frame_equal(this_df, expected_df)
+
+    def test_changing_dtypes_with_duplicate_columns(self):
+        # multiple assignments that change dtypes
+        # the location indexer is a slice
+        # GH 6120
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=["that", "that"]
+        )
+        expected = DataFrame(1.0, index=range(5), columns=["that", "that"])
+
+        df["that"] = 1.0
+        tm.assert_frame_equal(df, expected)
+
+        df = DataFrame(
+            np.random.default_rng(2).random((5, 2)), columns=["that", "that"]
+        )
+        expected = DataFrame(1, index=range(5), columns=["that", "that"])
+
+        df["that"] = 1
+        tm.assert_frame_equal(df, expected)
+
+    def test_dup_columns_comparisons(self):
+        # equality
+        df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
+        df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
+
+        # not-comparing like-labelled
+        msg = (
+            r"Can only compare identically-labeled \(both index and columns\) "
+            "DataFrame objects"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df1 == df2
+
+        df1r = df1.reindex_like(df2)
+        result = df1r == df2
+        expected = DataFrame(
+            [[False, True], [True, False], [False, False], [True, False]],
+            columns=["A", "A"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_mixed_column_selection(self):
+        # mixed column selection
+        # GH 5639
+        dfbool = DataFrame(
+            {
+                "one": Series([True, True, False], index=["a", "b", "c"]),
+                "two": Series([False, False, True, False], index=["a", "b", "c", "d"]),
+                "three": Series([False, True, True, True], index=["a", "b", "c", "d"]),
+            }
+        )
+        expected = pd.concat([dfbool["one"], dfbool["three"], dfbool["one"]], axis=1)
+        result = dfbool[["one", "three", "one"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_multi_axis_dups(self):
+        # multi-axis dups
+        # GH 6121
+        df = DataFrame(
+            np.arange(25.0).reshape(5, 5),
+            index=["a", "b", "c", "d", "e"],
+            columns=["A", "B", "C", "D", "E"],
+        )
+        z = df[["A", "C", "A"]].copy()
+        expected = z.loc[["a", "c", "a"]]
+
+        df = DataFrame(
+            np.arange(25.0).reshape(5, 5),
+            index=["a", "b", "c", "d", "e"],
+            columns=["A", "B", "C", "D", "E"],
+        )
+        z = df[["A", "C", "A"]]
+        result = z.loc[["a", "c", "a"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_columns_with_dups(self):
+        # GH 3468 related
+
+        # basic
+        df = DataFrame([[1, 2]], columns=["a", "a"])
+        df.columns = ["a", "a.1"]
+        expected = DataFrame([[1, 2]], columns=["a", "a.1"])
+        tm.assert_frame_equal(df, expected)
+
+        df = DataFrame([[1, 2, 3]], columns=["b", "a", "a"])
+        df.columns = ["b", "a", "a.1"]
+        expected = DataFrame([[1, 2, 3]], columns=["b", "a", "a.1"])
+        tm.assert_frame_equal(df, expected)
+
+    def test_columns_with_dup_index(self):
+        # with a dup index
+        df = DataFrame([[1, 2]], columns=["a", "a"])
+        df.columns = ["b", "b"]
+        expected = DataFrame([[1, 2]], columns=["b", "b"])
+        tm.assert_frame_equal(df, expected)
+
+    def test_multi_dtype(self):
+        # multi-dtype
+        df = DataFrame(
+            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]],
+            columns=["a", "a", "b", "b", "d", "c", "c"],
+        )
+        df.columns = list("ABCDEFG")
+        expected = DataFrame(
+            [[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("ABCDEFG")
+        )
+        tm.assert_frame_equal(df, expected)
+
+    def test_multi_dtype2(self):
+        df = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a", "a", "a"])
+        df.columns = ["a", "a.1", "a.2", "a.3"]
+        expected = DataFrame([[1, 2, "foo", "bar"]], columns=["a", "a.1", "a.2", "a.3"])
+        tm.assert_frame_equal(df, expected)
+
+    def test_dups_across_blocks(self):
+        # dups across blocks
+        df_float = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)), dtype="float64"
+        )
+        df_int = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)).astype("int64")
+        )
+        df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
+        df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
+        df_dt = DataFrame(
+            pd.Timestamp("20010101"), index=df_float.index, columns=df_float.columns
+        )
+        df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
+
+        assert len(df._mgr.blknos) == len(df.columns)
+        assert len(df._mgr.blklocs) == len(df.columns)
+
+        # testing iloc
+        for i in range(len(df.columns)):
+            df.iloc[:, i]
+
+    def test_dup_columns_across_dtype(self):
+        # dup columns across dtype GH 2079/2194
+        vals = [[1, -1, 2.0], [2, -2, 3.0]]
+        rs = DataFrame(vals, columns=["A", "A", "B"])
+        xp = DataFrame(vals)
+        xp.columns = ["A", "A", "B"]
+        tm.assert_frame_equal(rs, xp)
+
+    def test_set_value_by_index(self):
+        # See gh-12344
+        warn = None
+        msg = "will attempt to set the values inplace"
+
+        df = DataFrame(np.arange(9).reshape(3, 3).T)
+        df.columns = list("AAA")
+        expected = df.iloc[:, 2].copy()
+
+        with tm.assert_produces_warning(warn, match=msg):
+            df.iloc[:, 0] = 3
+        tm.assert_series_equal(df.iloc[:, 2], expected)
+
+        df = DataFrame(np.arange(9).reshape(3, 3).T)
+        df.columns = [2, float(2), str(2)]
+        expected = df.iloc[:, 1].copy()
+
+        with tm.assert_produces_warning(warn, match=msg):
+            df.iloc[:, 0] = 3
+        tm.assert_series_equal(df.iloc[:, 1], expected)
diff --git a/pandas/tests/frame/test_npfuncs.py b/pandas/tests/frame/test_npfuncs.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9a241202d15696b0e91b6ad96546fa967471b29
--- /dev/null
+++ b/pandas/tests/frame/test_npfuncs.py
@@ -0,0 +1,84 @@
+"""
+Tests for np.foo applied to DataFrame, not necessarily ufuncs.
+"""
+
+import numpy as np
+
+from pandas import (
+    Categorical,
+    DataFrame,
+)
+import pandas._testing as tm
+
+
+class TestAsArray:
+    def test_asarray_homogeneous(self):
+        df = DataFrame({"A": Categorical([1, 2]), "B": Categorical([1, 2])})
+        result = np.asarray(df)
+        # may change from object in the future
+        expected = np.array([[1, 1], [2, 2]], dtype="object")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_np_sqrt(self, float_frame):
+        with np.errstate(all="ignore"):
+            result = np.sqrt(float_frame)
+        assert isinstance(result, type(float_frame))
+        assert result.index.is_(float_frame.index)
+        assert result.columns.is_(float_frame.columns)
+
+        tm.assert_frame_equal(result, float_frame.apply(np.sqrt))
+
+    def test_sum_axis_behavior(self):
+        # GH#52042 df.sum(axis=None) now reduces over both axes, which gets
+        #  called when we do np.sum(df)
+
+        arr = np.random.default_rng(2).standard_normal((4, 3))
+        df = DataFrame(arr)
+
+        res = np.sum(df)
+        expected = df.to_numpy().sum(axis=None)
+        assert res == expected
+
+    def test_np_ravel(self):
+        # GH26247
+        arr = np.array(
+            [
+                [0.11197053, 0.44361564, -0.92589452],
+                [0.05883648, -0.00948922, -0.26469934],
+            ]
+        )
+
+        result = np.ravel([DataFrame(batch.reshape(1, 3)) for batch in arr])
+        expected = np.array(
+            [
+                0.11197053,
+                0.44361564,
+                -0.92589452,
+                0.05883648,
+                -0.00948922,
+                -0.26469934,
+            ]
+        )
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.ravel(DataFrame(arr[0].reshape(1, 3), columns=["x1", "x2", "x3"]))
+        expected = np.array([0.11197053, 0.44361564, -0.92589452])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = np.ravel(
+            [
+                DataFrame(batch.reshape(1, 3), columns=["x1", "x2", "x3"])
+                for batch in arr
+            ]
+        )
+        expected = np.array(
+            [
+                0.11197053,
+                0.44361564,
+                -0.92589452,
+                0.05883648,
+                -0.00948922,
+                -0.26469934,
+            ]
+        )
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ccc3af674c09ccf82a75fbf3bf3e41f89fd0dea
--- /dev/null
+++ b/pandas/tests/frame/test_query_eval.py
@@ -0,0 +1,1609 @@
+import operator
+from tokenize import TokenError
+
+import numpy as np
+import pytest
+
+from pandas.errors import (
+    NumExprClobberingError,
+    UndefinedVariableError,
+)
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.core.computation.check import NUMEXPR_INSTALLED
+
+
+@pytest.fixture(params=["python", "pandas"], ids=lambda x: x)
+def parser(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=["python", pytest.param("numexpr", marks=td.skip_if_no("numexpr"))],
+    ids=lambda x: x,
+)
+def engine(request):
+    return request.param
+
+
+def skip_if_no_pandas_parser(parser):
+    if parser != "pandas":
+        pytest.skip(f"cannot evaluate with parser={parser}")
+
+
+class TestCompat:
+    @pytest.fixture
+    def df(self):
+        return DataFrame({"A": [1, 2, 3]})
+
+    @pytest.fixture
+    def expected1(self, df):
+        return df[df.A > 0]
+
+    @pytest.fixture
+    def expected2(self, df):
+        return df.A + 1
+
+    def test_query_default(self, df, expected1, expected2):
+        # GH 12749
+        # this should always work, whether NUMEXPR_INSTALLED or not
+        result = df.query("A>0")
+        tm.assert_frame_equal(result, expected1)
+        result = df.eval("A+1")
+        tm.assert_series_equal(result, expected2)
+
+    def test_query_None(self, df, expected1, expected2):
+        result = df.query("A>0", engine=None)
+        tm.assert_frame_equal(result, expected1)
+        result = df.eval("A+1", engine=None)
+        tm.assert_series_equal(result, expected2)
+
+    def test_query_python(self, df, expected1, expected2):
+        result = df.query("A>0", engine="python")
+        tm.assert_frame_equal(result, expected1)
+        result = df.eval("A+1", engine="python")
+        tm.assert_series_equal(result, expected2)
+
+    def test_query_numexpr(self, df, expected1, expected2):
+        if NUMEXPR_INSTALLED:
+            result = df.query("A>0", engine="numexpr")
+            tm.assert_frame_equal(result, expected1)
+            result = df.eval("A+1", engine="numexpr")
+            tm.assert_series_equal(result, expected2)
+        else:
+            msg = (
+                r"'numexpr' is not installed or an unsupported version. "
+                r"Cannot use engine='numexpr' for query/eval if 'numexpr' is "
+                r"not installed"
+            )
+            with pytest.raises(ImportError, match=msg):
+                df.query("A>0", engine="numexpr")
+            with pytest.raises(ImportError, match=msg):
+                df.eval("A+1", engine="numexpr")
+
+
+class TestDataFrameEval:
+    # smaller hits python, larger hits numexpr
+    @pytest.mark.parametrize("n", [4, 4000])
+    @pytest.mark.parametrize(
+        "op_str,op,rop",
+        [
+            ("+", "__add__", "__radd__"),
+            ("-", "__sub__", "__rsub__"),
+            ("*", "__mul__", "__rmul__"),
+            ("/", "__truediv__", "__rtruediv__"),
+        ],
+    )
+    def test_ops(self, op_str, op, rop, n):
+        # tst ops and reversed ops in evaluation
+        # GH7198
+
+        df = DataFrame(1, index=range(n), columns=list("abcd"))
+        df.iloc[0] = 2
+        m = df.mean()
+
+        base = DataFrame(  # noqa: F841
+            np.tile(m.values, n).reshape(n, -1), columns=list("abcd")
+        )
+
+        expected = eval(f"base {op_str} df")
+
+        # ops as strings
+        result = eval(f"m {op_str} df")
+        tm.assert_frame_equal(result, expected)
+
+        # these are commutative
+        if op in ["+", "*"]:
+            result = getattr(df, op)(m)
+            tm.assert_frame_equal(result, expected)
+
+        # these are not
+        elif op in ["-", "/"]:
+            result = getattr(df, rop)(m)
+            tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_sub_numexpr_path(self):
+        # GH7192: Note we need a large number of rows to ensure this
+        #  goes through the numexpr path
+        df = DataFrame({"A": np.random.default_rng(2).standard_normal(25000)})
+        df.iloc[0:5] = np.nan
+        expected = 1 - np.isnan(df.iloc[0:25])
+        result = (1 - np.isnan(df)).iloc[0:25]
+        tm.assert_frame_equal(result, expected)
+
+    def test_query_non_str(self):
+        # GH 11485
+        df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]})
+
+        msg = "expr must be a string to be evaluated"
+        with pytest.raises(ValueError, match=msg):
+            df.query(lambda x: x.B == "b")
+
+        with pytest.raises(ValueError, match=msg):
+            df.query(111)
+
+    def test_query_empty_string(self):
+        # GH 13139
+        df = DataFrame({"A": [1, 2, 3]})
+
+        msg = "expr cannot be an empty string"
+        with pytest.raises(ValueError, match=msg):
+            df.query("")
+
+    def test_query_duplicate_column_name(self, engine, parser):
+        df = DataFrame({"A": range(3), "B": range(3), "C": range(3)}).rename(
+            columns={"B": "A"}
+        )
+
+        res = df.query("C == 1", engine=engine, parser=parser)
+
+        expect = DataFrame([[1, 1, 1]], columns=["A", "A", "C"], index=[1])
+
+        tm.assert_frame_equal(res, expect)
+
+    def test_eval_resolvers_as_list(self):
+        # GH 14095
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 2)), columns=list("ab")
+        )
+        dict1 = {"a": 1}
+        dict2 = {"b": 2}
+        assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
+        assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
+
+    def test_eval_resolvers_combined(self):
+        # GH 34966
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 2)), columns=list("ab")
+        )
+        dict1 = {"c": 2}
+
+        # Both input and default index/column resolvers should be usable
+        result = df.eval("a + b * c", resolvers=[dict1])
+
+        expected = df["a"] + df["b"] * dict1["c"]
+        tm.assert_series_equal(result, expected)
+
+    def test_eval_object_dtype_binop(self):
+        # GH#24883
+        df = DataFrame({"a1": ["Y", "N"]})
+        res = df.eval("c = ((a1 == 'Y') & True)")
+        expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]})
+        tm.assert_frame_equal(res, expected)
+
+    def test_using_numpy(self, engine, parser):
+        # GH 58041
+        skip_if_no_pandas_parser(parser)
+        df = Series([0.2, 1.5, 2.8], name="a").to_frame()
+        res = df.eval("@np.floor(a)", engine=engine, parser=parser)
+        expected = np.floor(df["a"])
+        tm.assert_series_equal(expected, res)
+
+    def test_eval_simple(self, engine, parser):
+        df = Series([0.2, 1.5, 2.8], name="a").to_frame()
+        res = df.eval("a", engine=engine, parser=parser)
+        expected = df["a"]
+        tm.assert_series_equal(expected, res)
+
+    def test_extension_array_eval(self, engine, parser, request):
+        # GH#58748
+        if engine == "numexpr":
+            mark = pytest.mark.xfail(
+                reason="numexpr does not support extension array dtypes"
+            )
+            request.applymarker(mark)
+        df = DataFrame({"a": pd.array([1, 2, 3]), "b": pd.array([4, 5, 6])})
+        result = df.eval("a / b", engine=engine, parser=parser)
+        expected = Series(pd.array([0.25, 0.40, 0.50]))
+        tm.assert_series_equal(result, expected)
+
+    def test_complex_eval(self, engine, parser):
+        # GH#21374
+        df = DataFrame({"a": [1 + 2j], "b": [1 + 1j]})
+        result = df.eval("a/b", engine=engine, parser=parser)
+        expected = Series([1.5 + 0.5j])
+        tm.assert_series_equal(result, expected)
+
+
+class TestDataFrameQueryWithMultiIndex:
+    def test_query_with_named_multiindex(self, parser, engine):
+        skip_if_no_pandas_parser(parser)
+        a = np.random.default_rng(2).choice(["red", "green"], size=10)
+        b = np.random.default_rng(2).choice(["eggs", "ham"], size=10)
+        index = MultiIndex.from_arrays([a, b], names=["color", "food"])
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), index=index)
+        ind = Series(
+            df.index.get_level_values("color").values, index=index, name="color"
+        )
+
+        # equality
+        res1 = df.query('color == "red"', parser=parser, engine=engine)
+        res2 = df.query('"red" == color', parser=parser, engine=engine)
+        exp = df[ind == "red"]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # inequality
+        res1 = df.query('color != "red"', parser=parser, engine=engine)
+        res2 = df.query('"red" != color', parser=parser, engine=engine)
+        exp = df[ind != "red"]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # list equality (really just set membership)
+        res1 = df.query('color == ["red"]', parser=parser, engine=engine)
+        res2 = df.query('["red"] == color', parser=parser, engine=engine)
+        exp = df[ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        res1 = df.query('color != ["red"]', parser=parser, engine=engine)
+        res2 = df.query('["red"] != color', parser=parser, engine=engine)
+        exp = df[~ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # in/not in ops
+        res1 = df.query('["red"] in color', parser=parser, engine=engine)
+        res2 = df.query('"red" in color', parser=parser, engine=engine)
+        exp = df[ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        res1 = df.query('["red"] not in color', parser=parser, engine=engine)
+        res2 = df.query('"red" not in color', parser=parser, engine=engine)
+        exp = df[~ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+    def test_query_with_unnamed_multiindex(self, parser, engine):
+        skip_if_no_pandas_parser(parser)
+        a = np.random.default_rng(2).choice(["red", "green"], size=10)
+        b = np.random.default_rng(2).choice(["eggs", "ham"], size=10)
+        index = MultiIndex.from_arrays([a, b])
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), index=index)
+        ind = Series(df.index.get_level_values(0).values, index=index)
+
+        res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
+        res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
+        exp = df[ind == "red"]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # inequality
+        res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
+        res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
+        exp = df[ind != "red"]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # list equality (really just set membership)
+        res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
+        res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
+        exp = df[ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
+        res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
+        exp = df[~ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # in/not in ops
+        res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
+        res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
+        exp = df[ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine)
+        res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
+        exp = df[~ind.isin(["red"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # ## LEVEL 1
+        ind = Series(df.index.get_level_values(1).values, index=index)
+        res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
+        res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
+        exp = df[ind == "eggs"]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # inequality
+        res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
+        res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
+        exp = df[ind != "eggs"]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # list equality (really just set membership)
+        res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
+        res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
+        exp = df[ind.isin(["eggs"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
+        res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
+        exp = df[~ind.isin(["eggs"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        # in/not in ops
+        res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
+        res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
+        exp = df[ind.isin(["eggs"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+        res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine)
+        res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
+        exp = df[~ind.isin(["eggs"])]
+        tm.assert_frame_equal(res1, exp)
+        tm.assert_frame_equal(res2, exp)
+
+    def test_query_with_partially_named_multiindex(self, parser, engine):
+        skip_if_no_pandas_parser(parser)
+        a = np.random.default_rng(2).choice(["red", "green"], size=10)
+        b = np.arange(10)
+        index = MultiIndex.from_arrays([a, b])
+        index.names = [None, "rating"]
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)), index=index)
+        res = df.query("rating == 1", parser=parser, engine=engine)
+        ind = Series(
+            df.index.get_level_values("rating").values, index=index, name="rating"
+        )
+        exp = df[ind == 1]
+        tm.assert_frame_equal(res, exp)
+
+        res = df.query("rating != 1", parser=parser, engine=engine)
+        ind = Series(
+            df.index.get_level_values("rating").values, index=index, name="rating"
+        )
+        exp = df[ind != 1]
+        tm.assert_frame_equal(res, exp)
+
+        res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
+        ind = Series(df.index.get_level_values(0).values, index=index)
+        exp = df[ind == "red"]
+        tm.assert_frame_equal(res, exp)
+
+        res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
+        ind = Series(df.index.get_level_values(0).values, index=index)
+        exp = df[ind != "red"]
+        tm.assert_frame_equal(res, exp)
+
+    def test_query_multiindex_get_index_resolvers(self):
+        df = DataFrame(
+            np.ones((10, 3)),
+            index=MultiIndex.from_arrays(
+                [range(10) for _ in range(2)], names=["spam", "eggs"]
+            ),
+        )
+        resolvers = df._get_index_resolvers()
+
+        def to_series(mi, level):
+            level_values = mi.get_level_values(level)
+            s = level_values.to_series()
+            s.index = mi
+            return s
+
+        col_series = df.columns.to_series()
+        expected = {
+            "index": df.index,
+            "columns": col_series,
+            "spam": to_series(df.index, "spam"),
+            "eggs": to_series(df.index, "eggs"),
+            "clevel_0": col_series,
+        }
+        for k, v in resolvers.items():
+            if isinstance(v, Index):
+                assert v.is_(expected[k])
+            elif isinstance(v, Series):
+                tm.assert_series_equal(v, expected[k])
+            else:
+                raise AssertionError("object must be a Series or Index")
+
+
+@td.skip_if_no("numexpr")
+class TestDataFrameQueryNumExprPandas:
+    @pytest.fixture
+    def engine(self):
+        return "numexpr"
+
+    @pytest.fixture
+    def parser(self):
+        return "pandas"
+
+    def test_date_query_with_attribute_access(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=5)
+        df["dates2"] = date_range("1/1/2013", periods=5)
+        df["dates3"] = date_range("1/1/2014", periods=5)
+        res = df.query(
+            "@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser
+        )
+        expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_query_no_attribute_access(self, engine, parser):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=5)
+        df["dates2"] = date_range("1/1/2013", periods=5)
+        df["dates3"] = date_range("1/1/2014", periods=5)
+        res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
+        expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_query_with_NaT(self, engine, parser):
+        n = 10
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates2"] = date_range("1/1/2013", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
+        df.loc[np.random.default_rng(2).random(n) > 0.5, "dates3"] = pd.NaT
+        res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
+        expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_index_query(self, engine, parser):
+        n = 10
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        return_value = df.set_index("dates1", inplace=True, drop=True)
+        assert return_value is None
+        res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
+        expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT(self, engine, parser):
+        n = 10
+        # Cast to object to avoid implicit cast when setting entry to pd.NaT below
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3))).astype(
+            {0: object}
+        )
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        df.iloc[0, 0] = pd.NaT
+        return_value = df.set_index("dates1", inplace=True, drop=True)
+        assert return_value is None
+        res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
+        expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT_duplicates(self, engine, parser):
+        n = 10
+        d = {}
+        d["dates1"] = date_range("1/1/2012", periods=n)
+        d["dates3"] = date_range("1/1/2014", periods=n)
+        df = DataFrame(d)
+        df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
+        return_value = df.set_index("dates1", inplace=True, drop=True)
+        assert return_value is None
+        res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
+        expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_query_with_non_date(self, engine, parser):
+        n = 10
+        df = DataFrame(
+            {
+                "dates": date_range("1/1/2012", periods=n, unit="ns"),
+                "nondate": np.arange(n),
+            }
+        )
+
+        result = df.query("dates == nondate", parser=parser, engine=engine)
+        assert len(result) == 0
+
+        result = df.query("dates != nondate", parser=parser, engine=engine)
+        tm.assert_frame_equal(result, df)
+
+        msg = r"Invalid comparison between dtype=datetime64\[ns\] and ndarray"
+        for op in ["<", ">", "<=", ">="]:
+            with pytest.raises(TypeError, match=msg):
+                df.query(f"dates {op} nondate", parser=parser, engine=engine)
+
+    def test_query_syntax_error(self, engine, parser):
+        df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)})
+        msg = "invalid syntax"
+        with pytest.raises(SyntaxError, match=msg):
+            df.query("i - +", engine=engine, parser=parser)
+
+    def test_query_scope(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((20, 2)), columns=list("ab")
+        )
+
+        a, b = 1, 2  # noqa: F841
+        res = df.query("a > b", engine=engine, parser=parser)
+        expected = df[df.a > df.b]
+        tm.assert_frame_equal(res, expected)
+
+        res = df.query("@a > b", engine=engine, parser=parser)
+        expected = df[a > df.b]
+        tm.assert_frame_equal(res, expected)
+
+        # no local variable c
+        with pytest.raises(
+            UndefinedVariableError, match="local variable 'c' is not defined"
+        ):
+            df.query("@a > b > @c", engine=engine, parser=parser)
+
+        # no column named 'c'
+        with pytest.raises(UndefinedVariableError, match="name 'c' is not defined"):
+            df.query("@a > b > c", engine=engine, parser=parser)
+
+    def test_query_doesnt_pickup_local(self, engine, parser):
+        n = m = 10
+        df = DataFrame(
+            np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
+        )
+
+        # we don't pick up the local 'sin'
+        with pytest.raises(UndefinedVariableError, match="name 'sin' is not defined"):
+            df.query("sin > 5", engine=engine, parser=parser)
+
+    def test_query_builtin(self, engine, parser):
+        n = m = 10
+        df = DataFrame(
+            np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
+        )
+
+        df.index.name = "sin"
+        msg = "Variables in expression.+"
+        with pytest.raises(NumExprClobberingError, match=msg):
+            df.query("sin > 5", engine=engine, parser=parser)
+
+    def test_query(self, engine, parser):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)), columns=["a", "b", "c"]
+        )
+
+        tm.assert_frame_equal(
+            df.query("a < b", engine=engine, parser=parser), df[df.a < df.b]
+        )
+        tm.assert_frame_equal(
+            df.query("a + b > b * c", engine=engine, parser=parser),
+            df[df.a + df.b > df.b * df.c],
+        )
+
+    def test_query_index_with_name(self, engine, parser):
+        df = DataFrame(
+            np.random.default_rng(2).integers(10, size=(10, 3)),
+            index=Index(range(10), name="blob"),
+            columns=["a", "b", "c"],
+        )
+        res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser)
+        expec = df[(df.index < 5) & (df.a < df.b)]
+        tm.assert_frame_equal(res, expec)
+
+        res = df.query("blob < b", engine=engine, parser=parser)
+        expec = df[df.index < df.b]
+
+        tm.assert_frame_equal(res, expec)
+
+    def test_query_index_without_name(self, engine, parser):
+        df = DataFrame(
+            np.random.default_rng(2).integers(10, size=(10, 3)),
+            index=range(10),
+            columns=["a", "b", "c"],
+        )
+
+        # "index" should refer to the index
+        res = df.query("index < b", engine=engine, parser=parser)
+        expec = df[df.index < df.b]
+        tm.assert_frame_equal(res, expec)
+
+        # test against a scalar
+        res = df.query("index < 5", engine=engine, parser=parser)
+        expec = df[df.index < 5]
+        tm.assert_frame_equal(res, expec)
+
+    def test_nested_scope(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        df2 = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        expected = df[(df > 0) & (df2 > 0)]
+
+        result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.eval(
+            "df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser
+        )
+        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser)
+        expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+    def test_nested_raises_on_local_self_reference(self, engine, parser):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+
+        # can't reference ourself b/c we're a local so @ is necessary
+        with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"):
+            df.query("df > 0", engine=engine, parser=parser)
+
+    def test_local_syntax(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((100, 10)),
+            columns=list("abcdefghij"),
+        )
+        b = 1
+        expect = df[df.a < b]
+        result = df.query("a < @b", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expect)
+
+        expect = df[df.a < df.b]
+        result = df.query("a < b", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expect)
+
+    def test_chained_cmp_and_in(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+        cols = list("abc")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((100, len(cols))), columns=cols
+        )
+        res = df.query(
+            "a < b < c and a not in b not in c", engine=engine, parser=parser
+        )
+        ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b)
+        expec = df[ind]
+        tm.assert_frame_equal(res, expec)
+
+    def test_local_variable_with_in(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+        a = Series(np.random.default_rng(2).integers(3, size=15), name="a")
+        b = Series(np.random.default_rng(2).integers(10, size=15), name="b")
+        df = DataFrame({"a": a, "b": b})
+
+        expected = df.loc[(df.b - 1).isin(a)]
+        result = df.query("b - 1 in a", engine=engine, parser=parser)
+        tm.assert_frame_equal(expected, result)
+
+        b = Series(np.random.default_rng(2).integers(10, size=15), name="b")
+        expected = df.loc[(b - 1).isin(a)]
+        result = df.query("@b - 1 in a", engine=engine, parser=parser)
+        tm.assert_frame_equal(expected, result)
+
+    def test_at_inside_string(self, engine, parser):
+        skip_if_no_pandas_parser(parser)
+        c = 1  # noqa: F841
+        df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]})
+        result = df.query('a == "@c"', engine=engine, parser=parser)
+        expected = df[df.a == "@c"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_query_undefined_local(self):
+        engine, parser = self.engine, self.parser
+        skip_if_no_pandas_parser(parser)
+
+        df = DataFrame(np.random.default_rng(2).random((10, 2)), columns=list("ab"))
+        with pytest.raises(
+            UndefinedVariableError, match="local variable 'c' is not defined"
+        ):
+            df.query("a == @c", engine=engine, parser=parser)
+
+    def test_index_resolvers_come_after_columns_with_the_same_name(
+        self, engine, parser
+    ):
+        n = 1  # noqa: F841
+        a = np.r_[20:101:20]
+
+        df = DataFrame(
+            {"index": a, "b": np.random.default_rng(2).standard_normal(a.size)}
+        )
+        df.index.name = "index"
+        result = df.query("index > 5", engine=engine, parser=parser)
+        expected = df[df["index"] > 5]
+        tm.assert_frame_equal(result, expected)
+
+        df = DataFrame(
+            {"index": a, "b": np.random.default_rng(2).standard_normal(a.size)}
+        )
+        result = df.query("ilevel_0 > 5", engine=engine, parser=parser)
+        expected = df.loc[df.index[df.index > 5]]
+        tm.assert_frame_equal(result, expected)
+
+        df = DataFrame({"a": a, "b": np.random.default_rng(2).standard_normal(a.size)})
+        df.index.name = "a"
+        result = df.query("a > 5", engine=engine, parser=parser)
+        expected = df[df.a > 5]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.query("index > 5", engine=engine, parser=parser)
+        expected = df.loc[df.index[df.index > 5]]
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("op, f", [["==", operator.eq], ["!=", operator.ne]])
+    def test_inf(self, op, f, engine, parser):
+        n = 10
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).random(n),
+                "b": np.random.default_rng(2).random(n),
+            }
+        )
+        df.loc[::2, 0] = np.inf
+        q = f"a {op} inf"
+        expected = df[f(df.a, np.inf)]
+        result = df.query(q, engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+    def test_check_tz_aware_index_query(self, tz_aware_fixture):
+        # https://github.com/pandas-dev/pandas/issues/29463
+        tz = tz_aware_fixture
+        df_index = date_range(
+            start="2019-01-01", freq="1D", periods=10, tz=tz, name="time"
+        )
+        expected = DataFrame(index=df_index)
+        df = DataFrame(index=df_index)
+        result = df.query('"2018-01-03 00:00:00+00" < time')
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame(df_index)
+        result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
+        tm.assert_frame_equal(result, expected)
+
+    def test_method_calls_in_query(self, engine, parser):
+        # https://github.com/pandas-dev/pandas/issues/22435
+        n = 10
+        df = DataFrame(
+            {
+                "a": 2 * np.random.default_rng(2).random(n),
+                "b": np.random.default_rng(2).random(n),
+            }
+        )
+        expected = df[df["a"].astype("int") == 0]
+        result = df.query("a.astype('int') == 0", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+        df = DataFrame(
+            {
+                "a": np.where(
+                    np.random.default_rng(2).random(n) < 0.5,
+                    np.nan,
+                    np.random.default_rng(2).standard_normal(n),
+                ),
+                "b": np.random.default_rng(2).standard_normal(n),
+            }
+        )
+        expected = df[df["a"].notnull()]
+        result = df.query("a.notnull()", engine=engine, parser=parser)
+        tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("numexpr")
+class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
+    @pytest.fixture
+    def engine(self):
+        return "numexpr"
+
+    @pytest.fixture
+    def parser(self):
+        return "python"
+
+    def test_date_query_no_attribute_access(self, engine, parser):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=5)
+        df["dates2"] = date_range("1/1/2013", periods=5)
+        df["dates3"] = date_range("1/1/2014", periods=5)
+        res = df.query(
+            "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
+        )
+        expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_query_with_NaT(self, engine, parser):
+        n = 10
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates2"] = date_range("1/1/2013", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
+        df.loc[np.random.default_rng(2).random(n) > 0.5, "dates3"] = pd.NaT
+        res = df.query(
+            "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
+        )
+        expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_index_query(self, engine, parser):
+        n = 10
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        return_value = df.set_index("dates1", inplace=True, drop=True)
+        assert return_value is None
+        res = df.query(
+            "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
+        )
+        expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT(self, engine, parser):
+        n = 10
+        # Cast to object to avoid implicit cast when setting entry to pd.NaT below
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3))).astype(
+            {0: object}
+        )
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        df.iloc[0, 0] = pd.NaT
+        return_value = df.set_index("dates1", inplace=True, drop=True)
+        assert return_value is None
+        res = df.query(
+            "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
+        )
+        expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
+        tm.assert_frame_equal(res, expec)
+
+    def test_date_index_query_with_NaT_duplicates(self, engine, parser):
+        n = 10
+        df = DataFrame(np.random.default_rng(2).standard_normal((n, 3)))
+        df["dates1"] = date_range("1/1/2012", periods=n)
+        df["dates3"] = date_range("1/1/2014", periods=n)
+        df.loc[np.random.default_rng(2).random(n) > 0.5, "dates1"] = pd.NaT
+        return_value = df.set_index("dates1", inplace=True, drop=True)
+        assert return_value is None
+        msg = r"'BoolOp' nodes are not implemented"
+        with pytest.raises(NotImplementedError, match=msg):
+            df.query("index < 20130101 < dates3", engine=engine, parser=parser)
+
+    def test_nested_scope(self, engine, parser):
+        # smoke test
+        x = 1  # noqa: F841
+        result = pd.eval("x + 1", engine=engine, parser=parser)
+        assert result == 2
+
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+        df2 = DataFrame(np.random.default_rng(2).standard_normal((5, 3)))
+
+        # don't have the pandas parser
+        msg = r"The '@' prefix is only supported by the pandas parser"
+        with pytest.raises(SyntaxError, match=msg):
+            df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
+
+        with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"):
+            df.query("(df>0) & (df2>0)", engine=engine, parser=parser)
+
+        expected = df[(df > 0) & (df2 > 0)]
+        result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser)
+        tm.assert_frame_equal(expected, result)
+
+        expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
+        result = pd.eval(
+            "df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser
+        )
+        tm.assert_frame_equal(expected, result)
+
+    def test_query_numexpr_with_min_and_max_columns(self):
+        df = DataFrame({"min": [1, 2, 3], "max": [4, 5, 6]})
+        regex_to_match = (
+            r"Variables in expression \"\(min\) == \(1\)\" "
+            r"overlap with builtins: \('min'\)"
+        )
+        with pytest.raises(NumExprClobberingError, match=regex_to_match):
+            df.query("min == 1")
+
+        regex_to_match = (
+            r"Variables in expression \"\(max\) == \(1\)\" "
+            r"overlap with builtins: \('max'\)"
+        )
+        with pytest.raises(NumExprClobberingError, match=regex_to_match):
+            df.query("max == 1")
+
+
+class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
+    @pytest.fixture
+    def engine(self):
+        return "python"
+
+    @pytest.fixture
+    def parser(self):
+        return "pandas"
+
+    def test_query_builtin(self, engine, parser):
+        n = m = 10
+        df = DataFrame(
+            np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
+        )
+
+        df.index.name = "sin"
+        expected = df[df.index > 5]
+        result = df.query("sin > 5", engine=engine, parser=parser)
+        tm.assert_frame_equal(expected, result)
+
+
+class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
+    @pytest.fixture
+    def engine(self):
+        return "python"
+
+    @pytest.fixture
+    def parser(self):
+        return "python"
+
+    def test_query_builtin(self, engine, parser):
+        n = m = 10
+        df = DataFrame(
+            np.random.default_rng(2).integers(m, size=(n, 3)), columns=list("abc")
+        )
+
+        df.index.name = "sin"
+        expected = df[df.index > 5]
+        result = df.query("sin > 5", engine=engine, parser=parser)
+        tm.assert_frame_equal(expected, result)
+
+
+class TestDataFrameQueryStrings:
+    def test_str_query_method(self, parser, engine):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)), columns=["b"])
+        df["strings"] = Series(list("aabbccddee"))
+        expect = df[df.strings == "a"]
+
+        if parser != "pandas":
+            col = "strings"
+            lst = '"a"'
+
+            lhs = [col] * 2 + [lst] * 2
+            rhs = lhs[::-1]
+
+            eq, ne = "==", "!="
+            ops = 2 * ([eq, ne])
+            msg = r"'(Not)?In' nodes are not implemented"
+
+            for lh, op_, rh in zip(lhs, ops, rhs):
+                ex = f"{lh} {op_} {rh}"
+                with pytest.raises(NotImplementedError, match=msg):
+                    df.query(
+                        ex,
+                        engine=engine,
+                        parser=parser,
+                        local_dict={"strings": df.strings},
+                    )
+        else:
+            res = df.query('"a" == strings', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+
+            res = df.query('strings == "a"', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+            tm.assert_frame_equal(res, df[df.strings.isin(["a"])])
+
+            expect = df[df.strings != "a"]
+            res = df.query('strings != "a"', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+
+            res = df.query('"a" != strings', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+            tm.assert_frame_equal(res, df[~df.strings.isin(["a"])])
+
+    def test_str_list_query_method(self, parser, engine):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)), columns=["b"])
+        df["strings"] = Series(list("aabbccddee"))
+        expect = df[df.strings.isin(["a", "b"])]
+
+        if parser != "pandas":
+            col = "strings"
+            lst = '["a", "b"]'
+
+            lhs = [col] * 2 + [lst] * 2
+            rhs = lhs[::-1]
+
+            eq, ne = "==", "!="
+            ops = 2 * ([eq, ne])
+            msg = r"'(Not)?In' nodes are not implemented"
+
+            for lh, ops_, rh in zip(lhs, ops, rhs):
+                ex = f"{lh} {ops_} {rh}"
+                with pytest.raises(NotImplementedError, match=msg):
+                    df.query(ex, engine=engine, parser=parser)
+        else:
+            res = df.query('strings == ["a", "b"]', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+
+            res = df.query('["a", "b"] == strings', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+
+            expect = df[~df.strings.isin(["a", "b"])]
+
+            res = df.query('strings != ["a", "b"]', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+
+            res = df.query('["a", "b"] != strings', engine=engine, parser=parser)
+            tm.assert_frame_equal(res, expect)
+
+    def test_query_with_string_columns(self, parser, engine):
+        df = DataFrame(
+            {
+                "a": list("aaaabbbbcccc"),
+                "b": list("aabbccddeeff"),
+                "c": np.random.default_rng(2).integers(5, size=12),
+                "d": np.random.default_rng(2).integers(9, size=12),
+            }
+        )
+        if parser == "pandas":
+            res = df.query("a in b", parser=parser, engine=engine)
+            expec = df[df.a.isin(df.b)]
+            tm.assert_frame_equal(res, expec)
+
+            res = df.query("a in b and c < d", parser=parser, engine=engine)
+            expec = df[df.a.isin(df.b) & (df.c < df.d)]
+            tm.assert_frame_equal(res, expec)
+        else:
+            msg = r"'(Not)?In' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                df.query("a in b", parser=parser, engine=engine)
+
+            msg = r"'BoolOp' nodes are not implemented"
+            with pytest.raises(NotImplementedError, match=msg):
+                df.query("a in b and c < d", parser=parser, engine=engine)
+
+    def test_object_array_eq_ne(self, parser, engine):
+        df = DataFrame(
+            {
+                "a": list("aaaabbbbcccc"),
+                "b": list("aabbccddeeff"),
+                "c": np.random.default_rng(2).integers(5, size=12),
+                "d": np.random.default_rng(2).integers(9, size=12),
+            }
+        )
+        res = df.query("a == b", parser=parser, engine=engine)
+        exp = df[df.a == df.b]
+        tm.assert_frame_equal(res, exp)
+
+        res = df.query("a != b", parser=parser, engine=engine)
+        exp = df[df.a != df.b]
+        tm.assert_frame_equal(res, exp)
+
+    def test_query_with_nested_strings(self, parser, engine):
+        skip_if_no_pandas_parser(parser)
+        events = [
+            f"page {n} {act}" for n in range(1, 4) for act in ["load", "exit"]
+        ] * 2
+        stamps1 = date_range("2014-01-01 0:00:01", freq="30s", periods=6)
+        stamps2 = date_range("2014-02-01 1:00:01", freq="30s", periods=6)
+        df = DataFrame(
+            {
+                "id": np.arange(1, 7).repeat(2),
+                "event": events,
+                "timestamp": stamps1.append(stamps2),
+            }
+        )
+
+        expected = df[df.event == '"page 1 load"']
+        res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine)
+        tm.assert_frame_equal(expected, res)
+
+    def test_query_with_nested_special_character(self, parser, engine):
+        skip_if_no_pandas_parser(parser)
+        df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]})
+        res = df.query('a == "test & test"', parser=parser, engine=engine)
+        expec = df[df.a == "test & test"]
+        tm.assert_frame_equal(res, expec)
+
+    @pytest.mark.parametrize(
+        "op, func",
+        [
+            ["<", operator.lt],
+            [">", operator.gt],
+            ["<=", operator.le],
+            [">=", operator.ge],
+        ],
+    )
+    def test_query_lex_compare_strings(self, parser, engine, op, func):
+        a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
+        b = Series(np.arange(a.size))
+        df = DataFrame({"X": a, "Y": b})
+
+        res = df.query(f'X {op} "d"', engine=engine, parser=parser)
+        expected = df[func(df.X, "d")]
+        tm.assert_frame_equal(res, expected)
+
+    def test_query_single_element_booleans(self, parser, engine):
+        columns = "bid", "bidsize", "ask", "asksize"
+        data = np.random.default_rng(2).integers(2, size=(1, len(columns))).astype(bool)
+        df = DataFrame(data, columns=columns)
+        res = df.query("bid & ask", engine=engine, parser=parser)
+        expected = df[df.bid & df.ask]
+        tm.assert_frame_equal(res, expected)
+
+    def test_query_string_scalar_variable(self, parser, engine):
+        skip_if_no_pandas_parser(parser)
+        df = DataFrame(
+            {
+                "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"],
+                "Price": [109.70, 109.72, 183.30, 183.35],
+            }
+        )
+        e = df[df.Symbol == "BUD US"]
+        symb = "BUD US"  # noqa: F841
+        r = df.query("Symbol == @symb", parser=parser, engine=engine)
+        tm.assert_frame_equal(e, r)
+
+    @pytest.mark.parametrize(
+        "in_list",
+        [
+            [None, "asdf", "ghjk"],
+            ["asdf", None, "ghjk"],
+            ["asdf", "ghjk", None],
+            [None, None, "asdf"],
+            ["asdf", None, None],
+            [None, None, None],
+        ],
+    )
+    def test_query_string_null_elements(self, in_list):
+        # GITHUB ISSUE #31516
+        parser = "pandas"
+        engine = "python"
+        expected = {i: value for i, value in enumerate(in_list) if value == "asdf"}
+
+        df_expected = DataFrame({"a": expected}, dtype="string")
+        df_expected.index = df_expected.index.astype("int64")
+        df = DataFrame({"a": in_list}, dtype="string")
+        df.index = Index(list(df.index), dtype=df.index.dtype)
+        res1 = df.query("a == 'asdf'", parser=parser, engine=engine)
+        res2 = df[df["a"] == "asdf"]
+        res3 = df.query("a <= 'asdf'", parser=parser, engine=engine)
+        tm.assert_frame_equal(res1, df_expected)
+        tm.assert_frame_equal(res1, res2)
+        tm.assert_frame_equal(res1, res3)
+        tm.assert_frame_equal(res2, res3)
+
+
+class TestDataFrameEvalWithFrame:
+    @pytest.fixture
+    def frame(self):
+        return DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
+        )
+
+    def test_simple_expr(self, frame, parser, engine):
+        res = frame.eval("a + b", engine=engine, parser=parser)
+        expect = frame.a + frame.b
+        tm.assert_series_equal(res, expect)
+
+    def test_bool_arith_expr(self, frame, parser, engine):
+        res = frame.eval("a[a < 1] + b", engine=engine, parser=parser)
+        expect = frame.a[frame.a < 1] + frame.b
+        tm.assert_series_equal(res, expect)
+
+    @pytest.mark.parametrize("op", ["+", "-", "*", "/"])
+    def test_invalid_type_for_operator_raises(self, parser, engine, op):
+        df = DataFrame({"a": [1, 2], "b": ["c", "d"]})
+        msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'|Cannot"
+
+        with pytest.raises(TypeError, match=msg):
+            df.eval(f"a {op} b", engine=engine, parser=parser)
+
+
+class TestDataFrameQueryBacktickQuoting:
+    @pytest.fixture
+    def df(self):
+        """
+        Yields a dataframe with strings that may or may not need escaping
+        by backticks. The last two columns cannot be escaped by backticks
+        and should raise a ValueError.
+        """
+        return DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B B": [3, 2, 1],
+                "C C": [4, 5, 6],
+                "C  C": [7, 4, 3],
+                "C_C": [8, 9, 10],
+                "D_D D": [11, 1, 101],
+                "E.E": [6, 3, 5],
+                "F-F": [8, 1, 10],
+                "1e1": [2, 4, 8],
+                "def": [10, 11, 2],
+                "A (x)": [4, 1, 3],
+                "B(x)": [1, 1, 5],
+                "B (x)": [2, 7, 4],
+                "  &^ :!€$?(} >    <++*''  ": [2, 5, 6],
+                "": [10, 11, 1],
+                " A": [4, 7, 9],
+                "  ": [1, 2, 1],
+                "it's": [6, 3, 1],
+                "that's": [9, 1, 8],
+                "☺": [8, 7, 6],
+                "xy （z）": [1, 2, 3],  # noqa: RUF001
+                "xy （z\\uff09": [4, 5, 6],  # noqa: RUF001
+                "foo#bar": [2, 4, 5],
+                1: [5, 7, 9],
+            }
+        )
+
+    def test_single_backtick_variable_query(self, df):
+        res = df.query("1 < `B B`")
+        expect = df[1 < df["B B"]]
+        tm.assert_frame_equal(res, expect)
+
+    def test_two_backtick_variables_query(self, df):
+        res = df.query("1 < `B B` and 4 < `C C`")
+        expect = df[(1 < df["B B"]) & (4 < df["C C"])]
+        tm.assert_frame_equal(res, expect)
+
+    def test_single_backtick_variable_expr(self, df):
+        res = df.eval("A + `B B`")
+        expect = df["A"] + df["B B"]
+        tm.assert_series_equal(res, expect)
+
+    def test_two_backtick_variables_expr(self, df):
+        res = df.eval("`B B` + `C C`")
+        expect = df["B B"] + df["C C"]
+        tm.assert_series_equal(res, expect)
+
+    def test_already_underscore_variable(self, df):
+        res = df.eval("`C_C` + A")
+        expect = df["C_C"] + df["A"]
+        tm.assert_series_equal(res, expect)
+
+    def test_same_name_but_underscores(self, df):
+        res = df.eval("C_C + `C C`")
+        expect = df["C_C"] + df["C C"]
+        tm.assert_series_equal(res, expect)
+
+    def test_mixed_underscores_and_spaces(self, df):
+        res = df.eval("A + `D_D D`")
+        expect = df["A"] + df["D_D D"]
+        tm.assert_series_equal(res, expect)
+
+    def test_backtick_quote_name_with_no_spaces(self, df):
+        res = df.eval("A + `C_C`")
+        expect = df["A"] + df["C_C"]
+        tm.assert_series_equal(res, expect)
+
+    def test_special_characters(self, df):
+        res = df.eval("`E.E` + `F-F` - A")
+        expect = df["E.E"] + df["F-F"] - df["A"]
+        tm.assert_series_equal(res, expect)
+
+    def test_start_with_digit(self, df):
+        res = df.eval("A + `1e1`")
+        expect = df["A"] + df["1e1"]
+        tm.assert_series_equal(res, expect)
+
+    def test_keyword(self, df):
+        res = df.eval("A + `def`")
+        expect = df["A"] + df["def"]
+        tm.assert_series_equal(res, expect)
+
+    def test_unneeded_quoting(self, df):
+        res = df.query("`A` > 2")
+        expect = df[df["A"] > 2]
+        tm.assert_frame_equal(res, expect)
+
+    def test_parenthesis(self, df):
+        res = df.query("`A (x)` > 2")
+        expect = df[df["A (x)"] > 2]
+        tm.assert_frame_equal(res, expect)
+
+    def test_empty_string(self, df):
+        res = df.query("`` > 5")
+        expect = df[df[""] > 5]
+        tm.assert_frame_equal(res, expect)
+
+    def test_multiple_spaces(self, df):
+        res = df.query("`C  C` > 5")
+        expect = df[df["C  C"] > 5]
+        tm.assert_frame_equal(res, expect)
+
+    def test_start_with_spaces(self, df):
+        res = df.eval("` A` + `  `")
+        expect = df[" A"] + df["  "]
+        tm.assert_series_equal(res, expect)
+
+    def test_ints(self, df):
+        res = df.query("`1` == 7")
+        expect = df[df[1] == 7]
+        tm.assert_frame_equal(res, expect)
+
+    def test_lots_of_operators_string(self, df):
+        res = df.query("`  &^ :!€$?(} >    <++*''  ` > 4")
+        expect = df[df["  &^ :!€$?(} >    <++*''  "] > 4]
+        tm.assert_frame_equal(res, expect)
+
+    def test_missing_attribute(self, df):
+        message = "module 'pandas' has no attribute 'thing'"
+        with pytest.raises(AttributeError, match=message):
+            df.eval("@pd.thing")
+
+    def test_quote(self, df):
+        res = df.query("`it's` > `that's`")
+        expect = df[df["it's"] > df["that's"]]
+        tm.assert_frame_equal(res, expect)
+
+    def test_character_outside_range_smiley(self, df):
+        res = df.query("`☺` > 4")
+        expect = df[df["☺"] > 4]
+        tm.assert_frame_equal(res, expect)
+
+    def test_character_outside_range_2_byte_parens(self, df):
+        # GH 49633
+        res = df.query("`xy （z）` == 2")  # noqa: RUF001
+        expect = df[df["xy （z）"] == 2]  # noqa: RUF001
+        tm.assert_frame_equal(res, expect)
+
+    def test_character_outside_range_and_actual_backslash(self, df):
+        # GH 49633
+        res = df.query("`xy （z\\uff09` == 2")  # noqa: RUF001
+        expect = df[df["xy \uff08z\\uff09"] == 2]
+        tm.assert_frame_equal(res, expect)
+
+    def test_hashtag(self, df):
+        res = df.query("`foo#bar` > 4")
+        expect = df[df["foo#bar"] > 4]
+        tm.assert_frame_equal(res, expect)
+
+    def test_expr_with_column_name_with_hashtag_character(self):
+        # GH 59285
+        df = DataFrame((1, 2, 3), columns=["a#"])
+        result = df.query("`a#` < 2")
+        expected = df[df["a#"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_comment(self):
+        # GH 59285
+        df = DataFrame((1, 2, 3), columns=["a#"])
+        result = df.query("`a#` < 2  # This is a comment")
+        expected = df[df["a#"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_column_name_with_backtick_and_hash(self):
+        # GH 59285
+        df = DataFrame((1, 2, 3), columns=["a`#b"])
+        result = df.query("`a``#b` < 2")
+        expected = df[df["a`#b"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_column_name_with_backtick(self):
+        # GH 59285
+        df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)})
+        result = df.query("`a``b` < 2")
+        # Note: Formatting checks may wrongly consider the above ``inline code``.
+        expected = df[df["a`b"] < 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_string_with_backticks(self):
+        # GH 59285
+        df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"])
+        result = df.query("'```' < `#backticks`")
+        expected = df["```" < df["#backticks"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_string_with_backticked_substring_same_as_column_name(self):
+        # GH 59285
+        df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"])
+        result = df.query("'`#backticks`' < `#backticks`")
+        expected = df["`#backticks`" < df["#backticks"]]
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "col1,col2,expr",
+        [
+            ("it's", "that's", "`it's` < `that's`"),
+            ('it"s', 'that"s', '`it"s` < `that"s`'),
+            ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"),
+            ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"),
+        ],
+    )
+    def test_expr_with_column_names_with_special_characters(self, col1, col2, expr):
+        # GH 59285
+        df = DataFrame(
+            [
+                {col1: 1, col2: 2},
+                {col1: 3, col2: 4},
+                {col1: -1, col2: -2},
+                {col1: -3, col2: -4},
+            ]
+        )
+        result = df.query(expr)
+        expected = df[df[col1] < df[col2]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_no_backticks(self):
+        # GH 59285
+        df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"])
+        result = df.query("'value' < column_name")
+        expected = df["value" < df["column_name"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_no_quotes_and_backtick_is_unmatched(self):
+        # GH 59285
+        df = DataFrame((1, 5, 10), columns=["column-name"])
+        with pytest.raises((SyntaxError, TokenError), match="invalid syntax"):
+            df.query("5 < `column-name")
+
+    def test_expr_with_no_quotes_and_backtick_is_matched(self):
+        # GH 59285
+        df = DataFrame((1, 5, 10), columns=["column-name"])
+        result = df.query("5 < `column-name`")
+        expected = df[5 < df["column-name"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(self):
+        # GH 59285
+        df = DataFrame((1, 5, 10), columns=["It's"])
+        with pytest.raises(
+            (SyntaxError, TokenError), match="unterminated string literal"
+        ):
+            df.query("5 < `It's")
+
+    def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self):
+        # GH 59285
+        df = DataFrame((1, 5, 10), columns=["It's"])
+        result = df.query("5 < `It's`")
+        expected = df[5 < df["It's"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self):
+        # GH 59285
+        df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
+        with pytest.raises(
+            (SyntaxError, TokenError), match="unterminated string literal"
+        ):
+            df.query("`column-name` < 'It`s that\\'s \"quote\" #hash")
+
+    def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self):
+        # GH 59285
+        df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
+        result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'")
+        expected = df[df["column-name"] < 'It`s that\'s "quote" #hash']
+        tm.assert_frame_equal(result, expected)
+
+    def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self):
+        # GH 59285
+        df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
+        result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`")
+        expected = df['It`s that\'s "quote" #hash' < df["column-name"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_call_non_named_expression(self, df):
+        """
+        Only attributes and variables ('named functions') can be called.
+        .__call__() is not an allowed attribute because that would allow
+        calling anything.
+        https://github.com/pandas-dev/pandas/pull/32460
+        """
+
+        def func(*_):
+            return 1
+
+        funcs = [func]  # noqa: F841
+
+        df.eval("@func()")
+
+        with pytest.raises(TypeError, match="Only named functions are supported"):
+            df.eval("@funcs[0]()")
+
+        with pytest.raises(TypeError, match="Only named functions are supported"):
+            df.eval("@funcs[0].__call__()")
+
+    def test_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
+        # GH#29618
+        df = DataFrame(
+            [[1, 2], [3, 4]], columns=["a", "b"], dtype=any_numeric_ea_and_arrow_dtype
+        )
+        warning = RuntimeWarning if NUMEXPR_INSTALLED else None
+        with tm.assert_produces_warning(warning):
+            result = df.eval("c = b - a")
+        expected = DataFrame(
+            [[1, 2, 1], [3, 4, 1]],
+            columns=["a", "b", "c"],
+            dtype=any_numeric_ea_and_arrow_dtype,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_ea_dtypes_and_scalar(self):
+        # GH#29618
+        df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"], dtype="Float64")
+        warning = RuntimeWarning if NUMEXPR_INSTALLED else None
+        with tm.assert_produces_warning(warning):
+            result = df.eval("c = b - 1")
+        expected = DataFrame(
+            [[1, 2, 1], [3, 4, 3]], columns=["a", "b", "c"], dtype="Float64"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_ea_dtypes_and_scalar_operation(self, any_numeric_ea_and_arrow_dtype):
+        # GH#29618
+        df = DataFrame(
+            [[1, 2], [3, 4]], columns=["a", "b"], dtype=any_numeric_ea_and_arrow_dtype
+        )
+        result = df.eval("c = 2 - 1")
+        expected = DataFrame(
+            {
+                "a": Series([1, 3], dtype=any_numeric_ea_and_arrow_dtype),
+                "b": Series([2, 4], dtype=any_numeric_ea_and_arrow_dtype),
+                "c": Series([1, 1], dtype=result["c"].dtype),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"])
+    def test_query_ea_dtypes(self, dtype):
+        if dtype == "int64[pyarrow]":
+            pytest.importorskip("pyarrow")
+        # GH#50261
+        df = DataFrame({"a": [1, 2]}, dtype=dtype)
+        ref = {2}  # noqa: F841
+        warning = RuntimeWarning if dtype == "Int64" and NUMEXPR_INSTALLED else None
+        with tm.assert_produces_warning(warning):
+            result = df.query("a in @ref")
+        expected = DataFrame({"a": [2]}, index=range(1, 2), dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("engine", ["python", "numexpr"])
+    @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"])
+    def test_query_ea_equality_comparison(self, dtype, engine):
+        # GH#50261
+        warning = RuntimeWarning if engine == "numexpr" else None
+        if engine == "numexpr" and not NUMEXPR_INSTALLED:
+            pytest.skip("numexpr not installed")
+        if dtype == "int64[pyarrow]":
+            pytest.importorskip("pyarrow")
+        df = DataFrame(
+            {"A": Series([1, 1, 2], dtype="Int64"), "B": Series([1, 2, 2], dtype=dtype)}
+        )
+        with tm.assert_produces_warning(warning):
+            result = df.query("A == B", engine=engine)
+        expected = DataFrame(
+            {
+                "A": Series([1, 2], dtype="Int64", index=range(0, 4, 2)),
+                "B": Series([1, 2], dtype=dtype, index=range(0, 4, 2)),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_all_nat_in_object(self):
+        # GH#57068
+        now = pd.Timestamp.now("UTC")  # noqa: F841
+        df = DataFrame({"a": pd.to_datetime([None, None], utc=True)}, dtype=object)
+        result = df.query("a > @now")
+        expected = DataFrame({"a": []}, dtype=object)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c702525156d7e5ba6b4fea1d69565bf7baf719e
--- /dev/null
+++ b/pandas/tests/frame/test_reductions.py
@@ -0,0 +1,2234 @@
+from datetime import timedelta
+from decimal import Decimal
+import re
+
+from dateutil.tz import tzlocal
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    IS64,
+    is_platform_windows,
+)
+from pandas.compat.numpy import np_version_gt2
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalDtype,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    PeriodIndex,
+    RangeIndex,
+    Series,
+    Timestamp,
+    date_range,
+    isna,
+    notna,
+    to_datetime,
+    to_timedelta,
+)
+import pandas._testing as tm
+from pandas.core import (
+    algorithms,
+    nanops,
+)
+
+is_windows_np2_or_is32 = (is_platform_windows() and not np_version_gt2) or not IS64
+is_windows_or_is32 = is_platform_windows() or not IS64
+
+
+def make_skipna_wrapper(alternative, skipna_alternative=None):
+    """
+    Create a function for calling on an array.
+
+    Parameters
+    ----------
+    alternative : function
+        The function to be called on the array with no NaNs.
+        Only used when 'skipna_alternative' is None.
+    skipna_alternative : function
+        The function to be called on the original array
+
+    Returns
+    -------
+    function
+    """
+    if skipna_alternative:
+
+        def skipna_wrapper(x):
+            return skipna_alternative(x.values)
+
+    else:
+
+        def skipna_wrapper(x):
+            nona = x.dropna()
+            if len(nona) == 0:
+                return np.nan
+            return alternative(nona)
+
+    return skipna_wrapper
+
+
+def assert_stat_op_calc(
+    opname,
+    alternative,
+    frame,
+    has_skipna=True,
+    check_dtype=True,
+    check_dates=False,
+    rtol=1e-5,
+    atol=1e-8,
+    skipna_alternative=None,
+):
+    """
+    Check that operator opname works as advertised on frame
+
+    Parameters
+    ----------
+    opname : str
+        Name of the operator to test on frame
+    alternative : function
+        Function that opname is tested against; i.e. "frame.opname()" should
+        equal "alternative(frame)".
+    frame : DataFrame
+        The object that the tests are executed on
+    has_skipna : bool, default True
+        Whether the method "opname" has the kwarg "skip_na"
+    check_dtype : bool, default True
+        Whether the dtypes of the result of "frame.opname()" and
+        "alternative(frame)" should be checked.
+    check_dates : bool, default false
+        Whether opname should be tested on a Datetime Series
+    rtol : float, default 1e-5
+        Relative tolerance.
+    atol : float, default 1e-8
+        Absolute tolerance.
+    skipna_alternative : function, default None
+        NaN-safe version of alternative
+    """
+    f = getattr(frame, opname)
+
+    if check_dates:
+        df = DataFrame({"b": date_range("1/1/2001", periods=2)})
+        with tm.assert_produces_warning(None):
+            result = getattr(df, opname)()
+        assert isinstance(result, Series)
+
+        df["a"] = range(len(df))
+        with tm.assert_produces_warning(None):
+            result = getattr(df, opname)()
+        assert isinstance(result, Series)
+        assert len(result)
+
+    if has_skipna:
+
+        def wrapper(x):
+            return alternative(x.values)
+
+        skipna_wrapper = make_skipna_wrapper(alternative, skipna_alternative)
+        result0 = f(axis=0, skipna=False)
+        result1 = f(axis=1, skipna=False)
+        tm.assert_series_equal(
+            result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol
+        )
+        tm.assert_series_equal(
+            result1,
+            frame.apply(wrapper, axis=1),
+            rtol=rtol,
+            atol=atol,
+        )
+    else:
+        skipna_wrapper = alternative
+
+    result0 = f(axis=0)
+    result1 = f(axis=1)
+    tm.assert_series_equal(
+        result0,
+        frame.apply(skipna_wrapper),
+        check_dtype=check_dtype,
+        rtol=rtol,
+        atol=atol,
+    )
+
+    if opname in ["sum", "prod"]:
+        expected = frame.apply(skipna_wrapper, axis=1)
+        tm.assert_series_equal(
+            result1, expected, check_dtype=False, rtol=rtol, atol=atol
+        )
+
+    # check dtypes
+    if check_dtype:
+        lcd_dtype = frame.values.dtype
+        assert lcd_dtype == result0.dtype
+        assert lcd_dtype == result1.dtype
+
+    # bad axis
+    with pytest.raises(ValueError, match="No axis named 2"):
+        f(axis=2)
+
+    # all NA case
+    if has_skipna:
+        all_na = frame * np.nan
+        r0 = getattr(all_na, opname)(axis=0)
+        r1 = getattr(all_na, opname)(axis=1)
+        if opname in ["sum", "prod"]:
+            unit = 1 if opname == "prod" else 0  # result for empty sum/prod
+            expected = Series(unit, index=r0.index, dtype=r0.dtype)
+            tm.assert_series_equal(r0, expected)
+            expected = Series(unit, index=r1.index, dtype=r1.dtype)
+            tm.assert_series_equal(r1, expected)
+
+
+@pytest.fixture
+def bool_frame_with_na():
+    """
+    Fixture for DataFrame of booleans with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D']; some entries are missing
+    """
+    df = DataFrame(
+        np.concatenate(
+            [np.ones((15, 4), dtype=bool), np.zeros((15, 4), dtype=bool)], axis=0
+        ),
+        index=Index([f"foo_{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD"), dtype=object),
+        dtype=object,
+    )
+    # set some NAs
+    df.iloc[5:10] = np.nan
+    df.iloc[15:20, -2:] = np.nan
+    return df
+
+
+@pytest.fixture
+def float_frame_with_na():
+    """
+    Fixture for DataFrame of floats with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D']; some entries are missing
+    """
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((30, 4)),
+        index=Index([f"foo_{i}" for i in range(30)], dtype=object),
+        columns=Index(list("ABCD"), dtype=object),
+    )
+    # set some NAs
+    df.iloc[5:10] = np.nan
+    df.iloc[15:20, -2:] = np.nan
+    return df
+
+
+class TestDataFrameAnalytics:
+    # ---------------------------------------------------------------------
+    # Reductions
+    @pytest.mark.parametrize("axis", [0, 1])
+    @pytest.mark.parametrize(
+        "opname",
+        [
+            "count",
+            "sum",
+            "mean",
+            "product",
+            "median",
+            "min",
+            "max",
+            "nunique",
+            "var",
+            "std",
+            "sem",
+            pytest.param("skew", marks=td.skip_if_no("scipy")),
+            pytest.param("kurt", marks=td.skip_if_no("scipy")),
+        ],
+    )
+    def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
+        if (opname in ("sum", "min", "max") and axis == 0) or opname in (
+            "count",
+            "nunique",
+        ):
+            getattr(float_string_frame, opname)(axis=axis)
+        else:
+            if opname in ["var", "std", "sem", "skew", "kurt"]:
+                msg = "could not convert string to float: 'bar'"
+            elif opname == "product":
+                if axis == 1:
+                    msg = "can't multiply sequence by non-int of type 'float'"
+                else:
+                    msg = "can't multiply sequence by non-int of type 'str'"
+            elif opname == "sum":
+                msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
+            elif opname == "mean":
+                if axis == 0:
+                    # different message on different builds
+                    msg = "|".join(
+                        [
+                            r"Could not convert \['.*'\] to numeric",
+                            "Could not convert string '(bar){30}' to numeric",
+                        ]
+                    )
+                else:
+                    msg = r"unsupported operand type\(s\) for \+: 'float' and 'str'"
+            elif opname in ["min", "max"]:
+                msg = "'[><]=' not supported between instances of 'float' and 'str'"
+            elif opname == "median":
+                msg = re.compile(
+                    r"Cannot convert \[.*\] to numeric|does not support|Cannot perform",
+                    flags=re.S,
+                )
+            if not isinstance(msg, re.Pattern):
+                msg = msg + "|does not support|Cannot perform reduction"
+            with pytest.raises(TypeError, match=msg):
+                getattr(float_string_frame, opname)(axis=axis)
+        if opname != "nunique":
+            getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    @pytest.mark.parametrize(
+        "opname",
+        [
+            "count",
+            "sum",
+            "mean",
+            "product",
+            "median",
+            "min",
+            "max",
+            "var",
+            "std",
+            "sem",
+            pytest.param("skew", marks=td.skip_if_no("scipy")),
+            pytest.param("kurt", marks=td.skip_if_no("scipy")),
+        ],
+    )
+    def test_stat_op_api_float_frame(self, float_frame, axis, opname):
+        getattr(float_frame, opname)(axis=axis, numeric_only=False)
+
+    def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
+        def count(s):
+            return notna(s).sum()
+
+        def nunique(s):
+            return len(algorithms.unique1d(s.dropna()))
+
+        def var(x):
+            return np.var(x, ddof=1)
+
+        def std(x):
+            return np.std(x, ddof=1)
+
+        def sem(x):
+            return np.std(x, ddof=1) / np.sqrt(len(x))
+
+        assert_stat_op_calc(
+            "nunique",
+            nunique,
+            float_frame_with_na,
+            has_skipna=False,
+            check_dtype=False,
+            check_dates=True,
+        )
+
+        # GH#32571: rol needed for flaky CI builds
+        # mixed types (with upcasting happening)
+        assert_stat_op_calc(
+            "sum",
+            np.sum,
+            mixed_float_frame.astype("float32"),
+            check_dtype=False,
+            rtol=1e-3,
+        )
+
+        assert_stat_op_calc(
+            "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
+        )
+        assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
+        assert_stat_op_calc(
+            "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod
+        )
+
+        assert_stat_op_calc("var", var, float_frame_with_na)
+        assert_stat_op_calc("std", std, float_frame_with_na)
+        assert_stat_op_calc("sem", sem, float_frame_with_na)
+
+        assert_stat_op_calc(
+            "count",
+            count,
+            float_frame_with_na,
+            has_skipna=False,
+            check_dtype=False,
+            check_dates=True,
+        )
+
+    def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        def skewness(x):
+            if len(x) < 3:
+                return np.nan
+            return sp_stats.skew(x, bias=False)
+
+        def kurt(x):
+            if len(x) < 4:
+                return np.nan
+            return sp_stats.kurtosis(x, bias=False)
+
+        assert_stat_op_calc("skew", skewness, float_frame_with_na)
+        assert_stat_op_calc("kurt", kurt, float_frame_with_na)
+
+    def test_median(self, float_frame_with_na, int_frame):
+        def wrapper(x):
+            if isna(x).any():
+                return np.nan
+            return np.median(x)
+
+        assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True)
+        assert_stat_op_calc(
+            "median", wrapper, int_frame, check_dtype=False, check_dates=True
+        )
+
+    @pytest.mark.parametrize(
+        "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"]
+    )
+    @pytest.mark.parametrize(
+        "df",
+        [
+            DataFrame(
+                {
+                    "a": [
+                        -0.00049987540199591344,
+                        -0.0016467257772919831,
+                        0.00067695870775883013,
+                    ],
+                    "b": [-0, -0, 0.0],
+                    "c": [
+                        0.00031111847529610595,
+                        0.0014902627951905339,
+                        -0.00094099200035979691,
+                    ],
+                },
+                index=["foo", "bar", "baz"],
+                dtype="O",
+            ),
+            DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
+        ],
+    )
+    @pytest.mark.filterwarnings("ignore:Mismatched null-like values:FutureWarning")
+    def test_stat_operators_attempt_obj_array(self, method, df, axis):
+        # GH#676
+        assert df.values.dtype == np.object_
+        result = getattr(df, method)(axis=axis)
+        expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
+        if axis in [1, "columns"] and method in ["min", "max"]:
+            expected[expected.isna()] = None
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
+    def test_mixed_ops(self, op):
+        # GH#16116
+        df = DataFrame(
+            {
+                "int": [1, 2, 3, 4],
+                "float": [1.0, 2.0, 3.0, 4.0],
+                "str": ["a", "b", "c", "d"],
+            }
+        )
+        msg = "|".join(
+            [
+                "Could not convert",
+                "could not convert",
+                "can't multiply sequence by non-int",
+                "does not support",
+                "Cannot perform",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            getattr(df, op)()
+
+        with pd.option_context("use_bottleneck", False):
+            with pytest.raises(TypeError, match=msg):
+                getattr(df, op)()
+
+    def test_reduce_mixed_frame(self):
+        # GH 6806
+        df = DataFrame(
+            {
+                "bool_data": [True, True, False, False, False],
+                "int_data": [10, 20, 30, 40, 50],
+                "string_data": ["a", "b", "c", "d", "e"],
+            }
+        )
+        df.reindex(columns=["bool_data", "int_data", "string_data"])
+        test = df.sum(axis=0)
+        tm.assert_numpy_array_equal(
+            test.values, np.array([2, 150, "abcde"], dtype=object)
+        )
+        alt = df.T.sum(axis=1)
+        tm.assert_series_equal(test, alt)
+
+    def test_nunique(self):
+        df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]})
+        tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2}))
+        tm.assert_series_equal(
+            df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3})
+        )
+        tm.assert_series_equal(df.nunique(axis=1), Series([1, 2, 2]))
+        tm.assert_series_equal(df.nunique(axis=1, dropna=False), Series([1, 3, 2]))
+
+    @pytest.mark.parametrize("tz", [None, "UTC"])
+    def test_mean_mixed_datetime_numeric(self, tz):
+        # https://github.com/pandas-dev/pandas/issues/24752
+        df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2})
+        result = df.mean()
+        expected = Series([1.0, Timestamp("2000", tz=tz)], index=["A", "B"])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("tz", [None, "UTC"])
+    def test_mean_includes_datetimes(self, tz):
+        # https://github.com/pandas-dev/pandas/issues/24752
+        # Behavior in 0.24.0rc1 was buggy.
+        # As of 2.0 with numeric_only=None we do *not* drop datetime columns
+        df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2})
+        result = df.mean()
+
+        expected = Series([Timestamp("2000", tz=tz)], index=["A"])
+        tm.assert_series_equal(result, expected)
+
+    def test_mean_mixed_string_decimal(self):
+        # GH 11670
+        # possible bug when calculating mean of DataFrame?
+
+        d = [
+            {"A": 2, "B": None, "C": Decimal("628.00")},
+            {"A": 1, "B": None, "C": Decimal("383.00")},
+            {"A": 3, "B": None, "C": Decimal("651.00")},
+            {"A": 2, "B": None, "C": Decimal("575.00")},
+            {"A": 4, "B": None, "C": Decimal("1114.00")},
+            {"A": 1, "B": "TEST", "C": Decimal("241.00")},
+            {"A": 2, "B": None, "C": Decimal("572.00")},
+            {"A": 4, "B": None, "C": Decimal("609.00")},
+            {"A": 3, "B": None, "C": Decimal("820.00")},
+            {"A": 5, "B": None, "C": Decimal("1223.00")},
+        ]
+
+        df = DataFrame(d)
+
+        with pytest.raises(
+            TypeError, match="unsupported operand type|does not support|Cannot perform"
+        ):
+            df.mean()
+        result = df[["A", "C"]].mean()
+        expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
+        tm.assert_series_equal(result, expected)
+
+    def test_var_std(self, datetime_frame):
+        result = datetime_frame.std(ddof=4)
+        expected = datetime_frame.apply(lambda x: x.std(ddof=4))
+        tm.assert_almost_equal(result, expected)
+
+        result = datetime_frame.var(ddof=4)
+        expected = datetime_frame.apply(lambda x: x.var(ddof=4))
+        tm.assert_almost_equal(result, expected)
+
+        arr = np.repeat(np.random.default_rng(2).random((1, 1000)), 1000, 0)
+        result = nanops.nanvar(arr, axis=0)
+        assert not (result < 0).any()
+
+        with pd.option_context("use_bottleneck", False):
+            result = nanops.nanvar(arr, axis=0)
+            assert not (result < 0).any()
+
+    @pytest.mark.parametrize("meth", ["sem", "var", "std"])
+    def test_numeric_only_flag(self, meth):
+        # GH 9201
+        df1 = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)),
+            columns=["foo", "bar", "baz"],
+        )
+        # Cast to object to avoid implicit cast when setting entry to "100" below
+        df1 = df1.astype({"foo": object})
+        # set one entry to a number in str format
+        df1.loc[0, "foo"] = "100"
+
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)),
+            columns=["foo", "bar", "baz"],
+        )
+        # Cast to object to avoid implicit cast when setting entry to "a" below
+        df2 = df2.astype({"foo": object})
+        # set one entry to a non-number str
+        df2.loc[0, "foo"] = "a"
+
+        result = getattr(df1, meth)(axis=1, numeric_only=True)
+        expected = getattr(df1[["bar", "baz"]], meth)(axis=1)
+        tm.assert_series_equal(expected, result)
+
+        result = getattr(df2, meth)(axis=1, numeric_only=True)
+        expected = getattr(df2[["bar", "baz"]], meth)(axis=1)
+        tm.assert_series_equal(expected, result)
+
+        # df1 has all numbers, df2 has a letter inside
+        msg = r"unsupported operand type\(s\) for -: 'float' and 'str'"
+        with pytest.raises(TypeError, match=msg):
+            getattr(df1, meth)(axis=1, numeric_only=False)
+        msg = "could not convert string to float: 'a'"
+        with pytest.raises(TypeError, match=msg):
+            getattr(df2, meth)(axis=1, numeric_only=False)
+
+    def test_sem(self, datetime_frame):
+        result = datetime_frame.sem(ddof=4)
+        expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x)))
+        tm.assert_almost_equal(result, expected)
+
+        arr = np.repeat(np.random.default_rng(2).random((1, 1000)), 1000, 0)
+        result = nanops.nansem(arr, axis=0)
+        assert not (result < 0).any()
+
+        with pd.option_context("use_bottleneck", False):
+            result = nanops.nansem(arr, axis=0)
+            assert not (result < 0).any()
+
+    @pytest.mark.parametrize(
+        "dropna, expected",
+        [
+            (
+                True,
+                {
+                    "A": [12],
+                    "B": [10.0],
+                    "C": [1.0],
+                    "D": ["a"],
+                    "E": Categorical(["a"], categories=["a"]),
+                    "F": DatetimeIndex(["2000-01-02"], dtype="M8[ns]"),
+                    "G": to_timedelta(["1 days"]),
+                },
+            ),
+            (
+                False,
+                {
+                    "A": [12],
+                    "B": [10.0],
+                    "C": [np.nan],
+                    "D": Series([np.nan], dtype="str"),
+                    "E": Categorical([np.nan], categories=["a"]),
+                    "F": DatetimeIndex([pd.NaT], dtype="M8[ns]"),
+                    "G": to_timedelta([pd.NaT]).as_unit("us"),
+                },
+            ),
+            (
+                True,
+                {
+                    "H": [8, 9, np.nan, np.nan],
+                    "I": [8, 9, np.nan, np.nan],
+                    "J": [1, np.nan, np.nan, np.nan],
+                    "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
+                    "L": DatetimeIndex(
+                        ["2000-01-02", "NaT", "NaT", "NaT"], dtype="M8[ns]"
+                    ),
+                    "M": to_timedelta(["1 days", "nan", "nan", "nan"]),
+                    "N": [0, 1, 2, 3],
+                },
+            ),
+            (
+                False,
+                {
+                    "H": [8, 9, np.nan, np.nan],
+                    "I": [8, 9, np.nan, np.nan],
+                    "J": [1, np.nan, np.nan, np.nan],
+                    "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
+                    "L": DatetimeIndex(
+                        ["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
+                    ),
+                    "M": to_timedelta(["nan", "1 days", "nan", "nan"]),
+                    "N": [0, 1, 2, 3],
+                },
+            ),
+        ],
+    )
+    def test_mode_dropna(self, dropna, expected):
+        df = DataFrame(
+            {
+                "A": [12, 12, 19, 11],
+                "B": [10, 10, np.nan, 3],
+                "C": [1, np.nan, np.nan, np.nan],
+                "D": Series([np.nan, np.nan, "a", np.nan], dtype="str"),
+                "E": Categorical([np.nan, np.nan, "a", np.nan]),
+                "F": DatetimeIndex(["NaT", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"),
+                "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
+                "H": [8, 8, 9, 9],
+                "I": [9, 9, 8, 8],
+                "J": [1, 1, np.nan, np.nan],
+                "K": Categorical(["a", np.nan, "a", np.nan]),
+                "L": DatetimeIndex(
+                    ["2000-01-02", "2000-01-02", "NaT", "NaT"], dtype="M8[ns]"
+                ),
+                "M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
+                "N": np.arange(4, dtype="int64"),
+            }
+        )
+
+        result = df[sorted(expected.keys())].mode(dropna=dropna)
+        expected = DataFrame(expected)
+        tm.assert_frame_equal(result, expected)
+
+    def test_mode_sort_with_na(self, using_infer_string):
+        df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
+        expected = DataFrame({"A": ["a", np.nan]})
+        result = df.mode(dropna=False)
+        tm.assert_frame_equal(result, expected)
+
+    def test_mode_empty_df(self):
+        df = DataFrame([], columns=["a", "b"])
+        expected = df.copy()
+        result = df.mode()
+        tm.assert_frame_equal(result, expected)
+
+    def test_operators_timedelta64(self):
+        df = DataFrame(
+            {
+                "A": date_range("2012-1-1", periods=3, freq="D", unit="ns"),
+                "B": date_range("2012-1-2", periods=3, freq="D", unit="ns"),
+                "C": Timestamp("20120101") - timedelta(minutes=5, seconds=5),
+            }
+        )
+
+        diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]})
+
+        # min
+        result = diffs.min()
+        assert result.iloc[0] == diffs.loc[0, "A"]
+        assert result.iloc[1] == diffs.loc[0, "B"]
+
+        result = diffs.min(axis=1)
+        assert (result == diffs.loc[0, "B"]).all()
+
+        # max
+        result = diffs.max()
+        assert result.iloc[0] == diffs.loc[2, "A"]
+        assert result.iloc[1] == diffs.loc[2, "B"]
+
+        result = diffs.max(axis=1)
+        assert (result == diffs["A"]).all()
+
+        # abs
+        result = diffs.abs()
+        result2 = abs(diffs)
+        expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]})
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        # mixed frame
+        mixed = diffs.copy()
+        mixed["C"] = "foo"
+        mixed["D"] = 1
+        mixed["E"] = 1.0
+        mixed["F"] = Timestamp("20130101")
+
+        # results in an object array
+        result = mixed.min()
+        expected = Series(
+            [
+                pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
+                pd.Timedelta(timedelta(days=-1)),
+                "foo",
+                1,
+                1.0,
+                Timestamp("20130101"),
+            ],
+            index=mixed.columns,
+        )
+        tm.assert_series_equal(result, expected)
+
+        # excludes non-numeric
+        result = mixed.min(axis=1, numeric_only=True)
+        expected = Series([1, 1, 1.0])
+        tm.assert_series_equal(result, expected)
+
+        # works when only those columns are selected
+        result = mixed[["A", "B"]].min(axis=1)
+        expected = Series([timedelta(days=-1)] * 3, dtype="m8[ns]")
+        tm.assert_series_equal(result, expected)
+
+        result = mixed[["A", "B"]].min()
+        expected = Series(
+            [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)],
+            index=["A", "B"],
+            dtype="m8[ns]",
+        )
+        tm.assert_series_equal(result, expected)
+
+        # GH 3106
+        df = DataFrame(
+            {
+                "time": date_range("20130102", periods=5, unit="ns"),
+                "time2": date_range("20130105", periods=5, unit="ns"),
+            }
+        )
+        df["off1"] = df["time2"] - df["time"]
+        assert df["off1"].dtype == "timedelta64[ns]"
+
+        df["off2"] = df["time"] - df["time2"]
+        df._consolidate_inplace()
+        assert df["off1"].dtype == "timedelta64[ns]"
+        assert df["off2"].dtype == "timedelta64[ns]"
+
+    def test_std_timedelta64_skipna_false(self):
+        # GH#37392
+        tdi = pd.timedelta_range("1 Day", periods=10)
+        df = DataFrame({"A": tdi, "B": tdi}, copy=True)
+        df.iloc[-2, -1] = pd.NaT
+
+        result = df.std(skipna=False)
+        expected = Series(
+            [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[us]"
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = df.std(axis=1, skipna=False)
+        expected = Series(
+            [pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)], dtype="m8[us]"
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
+    )
+    def test_std_datetime64_with_nat(self, values, skipna, request, unit):
+        # GH#51335
+        dti = to_datetime(values).as_unit(unit)
+        df = DataFrame({"a": dti})
+        result = df.std(skipna=skipna)
+        if not skipna or all(value is pd.NaT for value in values):
+            expected = Series({"a": pd.NaT}, dtype=f"timedelta64[{unit}]")
+        else:
+            expected = Series({"a": "1 days"}, dtype=f"timedelta64[{unit}]")
+        tm.assert_series_equal(result, expected)
+
+    def test_sum_corner(self):
+        empty_frame = DataFrame()
+
+        axis0 = empty_frame.sum(axis=0)
+        axis1 = empty_frame.sum(axis=1)
+        assert isinstance(axis0, Series)
+        assert isinstance(axis1, Series)
+        assert len(axis0) == 0
+        assert len(axis1) == 0
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            RangeIndex(0),
+            DatetimeIndex([]),
+            Index([], dtype=np.int64),
+            Index([], dtype=np.float64),
+            DatetimeIndex([], freq="ME"),
+            PeriodIndex([], freq="D"),
+        ],
+    )
+    def test_axis_1_empty(self, all_reductions, index):
+        df = DataFrame(columns=["a"], index=index)
+        result = getattr(df, all_reductions)(axis=1)
+        if all_reductions in ("any", "all"):
+            expected_dtype = "bool"
+        elif all_reductions == "count":
+            expected_dtype = "int64"
+        else:
+            expected_dtype = "object"
+        expected = Series([], index=index, dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("min_count", [0, 1])
+    def test_axis_1_sum_na(self, string_dtype_no_object, skipna, min_count):
+        # https://github.com/pandas-dev/pandas/issues/60229
+        dtype = string_dtype_no_object
+        df = DataFrame({"a": [pd.NA]}, dtype=dtype)
+        result = df.sum(axis=1, skipna=skipna, min_count=min_count)
+        value = "" if skipna and min_count == 0 else pd.NA
+        expected = Series([value], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
+    @pytest.mark.parametrize("numeric_only", [None, True, False])
+    def test_sum_prod_nanops(self, method, unit, numeric_only):
+        idx = ["a", "b", "c"]
+        df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]})
+        # The default
+        result = getattr(df, method)(numeric_only=numeric_only)
+        expected = Series([unit, unit, unit], index=idx, dtype="float64")
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = getattr(df, method)(numeric_only=numeric_only, min_count=1)
+        expected = Series([unit, unit, np.nan], index=idx)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = getattr(df, method)(numeric_only=numeric_only, min_count=0)
+        expected = Series([unit, unit, unit], index=idx, dtype="float64")
+        tm.assert_series_equal(result, expected)
+
+        result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1)
+        expected = Series([unit, np.nan, np.nan], index=idx)
+        tm.assert_series_equal(result, expected)
+
+        # min_count > 1
+        df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
+        result = getattr(df, method)(numeric_only=numeric_only, min_count=5)
+        expected = Series(result, index=["A", "B"])
+        tm.assert_series_equal(result, expected)
+
+        result = getattr(df, method)(numeric_only=numeric_only, min_count=6)
+        expected = Series(result, index=["A", "B"])
+        tm.assert_series_equal(result, expected)
+
+    def test_sum_nanops_timedelta(self):
+        # prod isn't defined on timedeltas
+        idx = ["a", "b", "c"]
+        df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})
+
+        df2 = df.apply(to_timedelta)
+
+        # 0 by default
+        result = df2.sum()
+        expected = Series([0, 0, 0], dtype="m8[ns]", index=idx)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=0
+        result = df2.sum(min_count=0)
+        tm.assert_series_equal(result, expected)
+
+        # min_count=1
+        result = df2.sum(min_count=1)
+        expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
+        tm.assert_series_equal(result, expected)
+
+    def test_sum_nanops_min_count(self):
+        # https://github.com/pandas-dev/pandas/issues/39738
+        df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
+        result = df.sum(min_count=10)
+        expected = Series([np.nan, np.nan], index=["x", "y"])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
+    @pytest.mark.parametrize(
+        "kwargs, expected_result",
+        [
+            ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.nan]),
+            ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]),
+            ({"axis": 1, "skipna": False}, [3.2, 5.3, np.nan]),
+        ],
+    )
+    def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
+        # GH#46947
+        df = DataFrame({"a": [1.0, 2.3, 4.4], "b": [2.2, 3, np.nan]}, dtype=float_type)
+        result = df.sum(**kwargs)
+        expected = Series(expected_result).astype(float_type)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
+    @pytest.mark.parametrize(
+        "kwargs, expected_result",
+        [
+            ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.nan]),
+            ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]),
+            ({"axis": 1, "skipna": False}, [2.0, 4.0, np.nan]),
+        ],
+    )
+    def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
+        # GH#46947
+        df = DataFrame(
+            {"a": [1.0, 2.0, 4.4], "b": [2.0, 2.0, np.nan]}, dtype=float_type
+        )
+        result = df.prod(**kwargs)
+        expected = Series(expected_result).astype(float_type)
+        tm.assert_series_equal(result, expected)
+
+    def test_sum_object(self, float_frame):
+        values = float_frame.values.astype(int)
+        frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns)
+        deltas = frame * timedelta(1)
+        deltas.sum()
+
+    def test_sum_bool(self, float_frame):
+        # ensure this works, bug report
+        bools = np.isnan(float_frame)
+        bools.sum(axis=1)
+        bools.sum(axis=0)
+
+    def test_sum_mixed_datetime(self):
+        # GH#30886
+        df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex(
+            [2, 3, 4]
+        )
+        with pytest.raises(TypeError, match="does not support operation 'sum'"):
+            df.sum()
+
+    def test_mean_corner(self, float_frame, float_string_frame):
+        # unit test when have object data
+        msg = "Could not convert|does not support|Cannot perform"
+        with pytest.raises(TypeError, match=msg):
+            float_string_frame.mean(axis=0)
+
+        # xs sum mixed type, just want to know it works...
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            float_string_frame.mean(axis=1)
+
+        # take mean of boolean column
+        float_frame["bool"] = float_frame["A"] > 0
+        means = float_frame.mean(axis=0)
+        assert means["bool"] == float_frame["bool"].values.mean()
+
+    def test_mean_datetimelike(self):
+        # GH#24757 check that datetimelike are excluded by default, handled
+        #  correctly with numeric_only=True
+        #  As of 2.0, datetimelike are *not* excluded with numeric_only=None
+
+        df = DataFrame(
+            {
+                "A": np.arange(3),
+                "B": date_range("2016-01-01", periods=3),
+                "C": pd.timedelta_range("1D", periods=3),
+                "D": pd.period_range("2016", periods=3, freq="Y"),
+            }
+        )
+        result = df.mean(numeric_only=True)
+        expected = Series({"A": 1.0})
+        tm.assert_series_equal(result, expected)
+
+        with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"):
+            df.mean()
+
+    def test_mean_datetimelike_numeric_only_false(self):
+        df = DataFrame(
+            {
+                "A": np.arange(3),
+                "B": date_range("2016-01-01", periods=3),
+                "C": pd.timedelta_range("1D", periods=3),
+            }
+        )
+
+        # datetime(tz) and timedelta work
+        result = df.mean(numeric_only=False)
+        expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
+        tm.assert_series_equal(result, expected)
+
+        # mean of period is not allowed
+        df["D"] = pd.period_range("2016", periods=3, freq="Y")
+
+        with pytest.raises(TypeError, match="mean is not implemented for Period"):
+            df.mean(numeric_only=False)
+
+    def test_mean_extensionarray_numeric_only_true(self):
+        # https://github.com/pandas-dev/pandas/issues/33256
+        arr = np.random.default_rng(2).integers(1000, size=(10, 5))
+        df = DataFrame(arr, dtype="Int64")
+        result = df.mean(numeric_only=True)
+        expected = DataFrame(arr).mean().astype("Float64")
+        tm.assert_series_equal(result, expected)
+
+    def test_stats_mixed_type(self, float_string_frame):
+        with pytest.raises(TypeError, match="could not convert"):
+            float_string_frame.std(axis=1)
+        with pytest.raises(TypeError, match="could not convert"):
+            float_string_frame.var(axis=1)
+        with pytest.raises(TypeError, match="unsupported operand type"):
+            float_string_frame.mean(axis=1)
+        with pytest.raises(TypeError, match="could not convert"):
+            float_string_frame.skew(axis=1)
+
+    def test_sum_bools(self):
+        df = DataFrame(index=range(1), columns=range(10))
+        bools = isna(df)
+        assert bools.sum(axis=1)[0] == 10
+
+    @pytest.mark.parametrize(
+        "input_data, expected_data",
+        [
+            ({"a": ["483", "3"], "b": ["94", "759"]}, ["48394", "3759"]),
+            (
+                {"a": ["483.948", "3.0"], "b": ["94.2", "759.93"]},
+                ["483.94894.2", "3.0759.93"],
+            ),
+            ({"a": ["483", "3.0"], "b": ["94.2", "79"]}, ["48394.2", "3.079"]),
+        ],
+    )
+    def test_sum_string_dtype_coercion(self, input_data, expected_data):
+        # GH#22642
+        # Check that summing numeric strings results in concatenation
+        # and not conversion to dtype int64 or float64
+        df = DataFrame(input_data)
+        expected = Series(expected_data)
+        result = df.sum(axis=1)
+        tm.assert_series_equal(result, expected)
+
+    # ----------------------------------------------------------------------
+    # Index of max / min
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_idxmin(self, float_frame, int_frame, skipna, axis):
+        frame = float_frame
+        frame.iloc[5:10] = np.nan
+        frame.iloc[15:20, -2:] = np.nan
+        for df in [frame, int_frame]:
+            if (not skipna or axis == 1) and df is not int_frame:
+                if skipna:
+                    msg = "Encountered all NA values"
+                else:
+                    msg = "Encountered an NA value"
+                with pytest.raises(ValueError, match=msg):
+                    df.idxmin(axis=axis, skipna=skipna)
+                with pytest.raises(ValueError, match=msg):
+                    df.idxmin(axis=axis, skipna=skipna)
+            else:
+                result = df.idxmin(axis=axis, skipna=skipna)
+                expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
+                expected = expected.astype(df.index.dtype)
+                tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
+    def test_idxmin_empty(self, index, skipna, axis):
+        # GH53265
+        if axis == 0:
+            frame = DataFrame(index=index)
+        else:
+            frame = DataFrame(columns=index)
+
+        result = frame.idxmin(axis=axis, skipna=skipna)
+        expected = Series(dtype=index.dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_idxmin_numeric_only(self, numeric_only):
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
+        result = df.idxmin(numeric_only=numeric_only)
+        if numeric_only:
+            expected = Series([2, 1], index=["a", "b"])
+        else:
+            expected = Series([2, 1, 0], index=["a", "b", "c"])
+        tm.assert_series_equal(result, expected)
+
+    def test_idxmin_axis_2(self, float_frame):
+        frame = float_frame
+        msg = "No axis named 2 for object type DataFrame"
+        with pytest.raises(ValueError, match=msg):
+            frame.idxmin(axis=2)
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_idxmax(self, float_frame, int_frame, skipna, axis):
+        frame = float_frame
+        frame.iloc[5:10] = np.nan
+        frame.iloc[15:20, -2:] = np.nan
+        for df in [frame, int_frame]:
+            if (skipna is False or axis == 1) and df is frame:
+                if skipna:
+                    msg = "Encountered all NA values"
+                else:
+                    msg = "Encountered an NA value"
+                with pytest.raises(ValueError, match=msg):
+                    df.idxmax(axis=axis, skipna=skipna)
+                return
+
+            result = df.idxmax(axis=axis, skipna=skipna)
+            expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
+            expected = expected.astype(df.index.dtype)
+            tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
+    def test_idxmax_empty(self, index, skipna, axis):
+        # GH53265
+        if axis == 0:
+            frame = DataFrame(index=index)
+        else:
+            frame = DataFrame(columns=index)
+
+        result = frame.idxmax(axis=axis, skipna=skipna)
+        expected = Series(dtype=index.dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("numeric_only", [True, False])
+    def test_idxmax_numeric_only(self, numeric_only):
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
+        result = df.idxmax(numeric_only=numeric_only)
+        if numeric_only:
+            expected = Series([1, 0], index=["a", "b"])
+        else:
+            expected = Series([1, 0, 1], index=["a", "b", "c"])
+        tm.assert_series_equal(result, expected)
+
+    def test_idxmax_arrow_types(self):
+        # GH#55368
+        pytest.importorskip("pyarrow")
+
+        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1]}, dtype="int64[pyarrow]")
+        result = df.idxmax()
+        expected = Series([1, 0], index=["a", "b"])
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin()
+        expected = Series([2, 1], index=["a", "b"])
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"a": ["b", "c", "a"]}, dtype="string[pyarrow]")
+        result = df.idxmax(numeric_only=False)
+        expected = Series([1], index=["a"])
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin(numeric_only=False)
+        expected = Series([2], index=["a"])
+        tm.assert_series_equal(result, expected)
+
+    def test_idxmax_axis_2(self, float_frame):
+        frame = float_frame
+        msg = "No axis named 2 for object type DataFrame"
+        with pytest.raises(ValueError, match=msg):
+            frame.idxmax(axis=2)
+
+    def test_idxmax_mixed_dtype(self):
+        # don't cast to object, which would raise in nanops
+        dti = date_range("2016-01-01", periods=3)
+        df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti})
+
+        result = df.idxmax()
+        expected = Series([1, 0, 2], index=range(1, 4))
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin()
+        expected = Series([0, 2, 0], index=range(1, 4))
+        tm.assert_series_equal(result, expected)
+
+        # with NaTs
+        df.loc[0, 3] = pd.NaT
+        result = df.idxmax()
+        expected = Series([1, 0, 2], index=range(1, 4))
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin()
+        expected = Series([0, 2, 1], index=range(1, 4))
+        tm.assert_series_equal(result, expected)
+
+        # with multi-column dt64 block
+        df[4] = dti[::-1]
+        df._consolidate_inplace()
+
+        result = df.idxmax()
+        expected = Series([1, 0, 2, 0], index=range(1, 5))
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin()
+        expected = Series([0, 2, 1, 2], index=range(1, 5))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "op, expected_value",
+        [("idxmax", [0, 4]), ("idxmin", [0, 5])],
+    )
+    def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
+        # GH 40346
+        df = DataFrame(
+            {
+                "ID": [100, 100, 100, 200, 200, 200],
+                "value": [0, 0, 0, 1, 2, 0],
+            },
+            dtype="Int64",
+        )
+        df = df.groupby("ID")
+
+        result = getattr(df, op)()
+        expected = DataFrame(
+            {"value": expected_value},
+            index=Index([100, 200], name="ID", dtype="Int64"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_idxmax_dt64_multicolumn_axis1(self):
+        dti = date_range("2016-01-01", periods=3)
+        df = DataFrame({3: dti, 4: dti[::-1]}, copy=True)
+        df.iloc[0, 0] = pd.NaT
+
+        df._consolidate_inplace()
+
+        result = df.idxmax(axis=1)
+        expected = Series([4, 3, 3])
+        tm.assert_series_equal(result, expected)
+
+        result = df.idxmin(axis=1)
+        expected = Series([4, 3, 4])
+        tm.assert_series_equal(result, expected)
+
+    # ----------------------------------------------------------------------
+    # Logical reductions
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    @pytest.mark.parametrize("bool_only", [False, True])
+    def test_any_all_mixed_float(
+        self, all_boolean_reductions, axis, bool_only, float_string_frame
+    ):
+        # make sure op works on mixed-type frame
+        mixed = float_string_frame
+        mixed["_bool_"] = np.random.default_rng(2).standard_normal(len(mixed)) > 0.5
+
+        getattr(mixed, all_boolean_reductions)(axis=axis, bool_only=bool_only)
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_any_all_bool_with_na(
+        self, all_boolean_reductions, axis, bool_frame_with_na
+    ):
+        getattr(bool_frame_with_na, all_boolean_reductions)(axis=axis, bool_only=False)
+
+    def test_any_all_bool_frame(self, all_boolean_reductions, bool_frame_with_na):
+        # GH#12863: numpy gives back non-boolean data for object type
+        # so fill NaNs to compare with pandas behavior
+        frame = bool_frame_with_na.fillna(True)
+        alternative = getattr(np, all_boolean_reductions)
+        f = getattr(frame, all_boolean_reductions)
+
+        def skipna_wrapper(x):
+            nona = x.dropna().values
+            return alternative(nona)
+
+        def wrapper(x):
+            return alternative(x.values)
+
+        result0 = f(axis=0, skipna=False)
+        result1 = f(axis=1, skipna=False)
+
+        tm.assert_series_equal(result0, frame.apply(wrapper))
+        tm.assert_series_equal(result1, frame.apply(wrapper, axis=1))
+
+        result0 = f(axis=0)
+        result1 = f(axis=1)
+
+        tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
+        tm.assert_series_equal(
+            result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False
+        )
+
+        # bad axis
+        with pytest.raises(ValueError, match="No axis named 2"):
+            f(axis=2)
+
+        # all NA case
+        all_na = frame * np.nan
+        r0 = getattr(all_na, all_boolean_reductions)(axis=0)
+        r1 = getattr(all_na, all_boolean_reductions)(axis=1)
+        if all_boolean_reductions == "any":
+            assert not r0.any()
+            assert not r1.any()
+        else:
+            assert r0.all()
+            assert r1.all()
+
+    def test_any_all_extra(self, using_python_scalars):
+        df = DataFrame(
+            {
+                "A": [True, False, False],
+                "B": [True, True, False],
+                "C": [True, True, True],
+            },
+            index=["a", "b", "c"],
+        )
+        result = df[["A", "B"]].any(axis=1)
+        expected = Series([True, True, False], index=["a", "b", "c"])
+        tm.assert_series_equal(result, expected)
+
+        result = df[["A", "B"]].any(axis=1, bool_only=True)
+        tm.assert_series_equal(result, expected)
+
+        result = df.all(axis=1)
+        expected = Series([True, False, False], index=["a", "b", "c"])
+        tm.assert_series_equal(result, expected)
+
+        result = df.all(axis=1, bool_only=True)
+        tm.assert_series_equal(result, expected)
+
+        # Axis is None
+        result = df.all(axis=None)
+        if not using_python_scalars:
+            result = result.item()
+        assert result is False
+
+        result = df.any(axis=None)
+        if not using_python_scalars:
+            result = result.item()
+        assert result is True
+
+        result = df[["C"]].all(axis=None)
+        if not using_python_scalars:
+            result = result.item()
+        assert result is True
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_any_all_object_dtype(self, axis, all_boolean_reductions, skipna):
+        # GH#35450
+        df = DataFrame(
+            data=[
+                [1, np.nan, np.nan, True],
+                [np.nan, 2, np.nan, True],
+                [np.nan, np.nan, np.nan, True],
+                [np.nan, np.nan, "5", np.nan],
+            ]
+        )
+        result = getattr(df, all_boolean_reductions)(axis=axis, skipna=skipna)
+        expected = Series([True, True, True, True])
+        tm.assert_series_equal(result, expected)
+
+    def test_any_datetime(self):
+        # GH 23070
+        float_data = [1, np.nan, 3, np.nan]
+        datetime_data = [
+            Timestamp("1960-02-15"),
+            Timestamp("1960-02-16"),
+            pd.NaT,
+            pd.NaT,
+        ]
+        df = DataFrame({"A": float_data, "B": datetime_data})
+
+        msg = "datetime64 type does not support operation 'any'"
+        with pytest.raises(TypeError, match=msg):
+            df.any(axis=1)
+
+    def test_any_all_bool_only(self):
+        # GH 25101
+        df = DataFrame(
+            {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]},
+            columns=Index(["col1", "col2", "col3"], dtype=object),
+        )
+
+        result = df.all(bool_only=True)
+        expected = Series(dtype=np.bool_, index=[])
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame(
+            {
+                "col1": [1, 2, 3],
+                "col2": [4, 5, 6],
+                "col3": [None, None, None],
+                "col4": [False, False, True],
+            }
+        )
+
+        result = df.all(bool_only=True)
+        expected = Series({"col4": False})
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "func, data, expected",
+        [
+            (np.any, {}, False),
+            (np.all, {}, True),
+            (np.any, {"A": []}, False),
+            (np.all, {"A": []}, True),
+            (np.any, {"A": [False, False]}, False),
+            (np.all, {"A": [False, False]}, False),
+            (np.any, {"A": [True, False]}, True),
+            (np.all, {"A": [True, False]}, False),
+            (np.any, {"A": [True, True]}, True),
+            (np.all, {"A": [True, True]}, True),
+            (np.any, {"A": [False], "B": [False]}, False),
+            (np.all, {"A": [False], "B": [False]}, False),
+            (np.any, {"A": [False, False], "B": [False, True]}, True),
+            (np.all, {"A": [False, False], "B": [False, True]}, False),
+            # other types
+            (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False),
+            (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True),
+            (np.all, {"A": Series([0, 1], dtype=int)}, False),
+            (np.any, {"A": Series([0, 1], dtype=int)}, True),
+            pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False),
+            pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False),
+            pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True),
+            pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True),
+            pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True),
+            pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
+            pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True),
+            pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
+            pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False),
+            pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
+            pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
+            pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
+            # np.all on Categorical raises, so the reduction drops the
+            #  column, so all is being done on an empty Series, so is True
+            (np.all, {"A": Series([0, 1], dtype="category")}, True),
+            (np.any, {"A": Series([0, 1], dtype="category")}, False),
+            (np.all, {"A": Series([1, 2], dtype="category")}, True),
+            (np.any, {"A": Series([1, 2], dtype="category")}, False),
+            # Mix GH#21484
+            pytest.param(
+                np.all,
+                {
+                    "A": Series([10, 20], dtype="M8[ns]"),
+                    "B": Series([10, 20], dtype="m8[ns]"),
+                },
+                True,
+            ),
+        ],
+    )
+    def test_any_all_np_func(self, func, data, expected, using_python_scalars):
+        # GH 19976
+        data = DataFrame(data)
+
+        if any(isinstance(x, CategoricalDtype) for x in data.dtypes):
+            with pytest.raises(
+                TypeError, match=".* dtype category does not support operation"
+            ):
+                func(data)
+
+            # method version
+            with pytest.raises(
+                TypeError, match=".* dtype category does not support operation"
+            ):
+                getattr(DataFrame(data), func.__name__)(axis=None)
+        if data.dtypes.apply(lambda x: x.kind == "M").any():
+            # GH#34479
+            msg = "datetime64 type does not support operation '(any|all)'"
+            with pytest.raises(TypeError, match=msg):
+                func(data)
+
+            # method version
+            with pytest.raises(TypeError, match=msg):
+                getattr(DataFrame(data), func.__name__)(axis=None)
+
+        elif data.dtypes.apply(lambda x: x != "category").any():
+            result = func(data)
+            if using_python_scalars:
+                assert result is expected
+            else:
+                assert isinstance(result, np.bool_)
+                assert result.item() is expected
+
+            # method version
+            result = getattr(DataFrame(data), func.__name__)(axis=None)
+            if using_python_scalars:
+                assert result is expected
+            else:
+                assert isinstance(result, np.bool_)
+                assert result.item() is expected
+
+    def test_any_all_object(self, using_python_scalars):
+        # GH 19976
+        result = np.all(DataFrame(columns=["a", "b"]))
+        if not using_python_scalars:
+            result = result.item()
+        assert result is True
+
+        result = np.any(DataFrame(columns=["a", "b"]))
+        if not using_python_scalars:
+            result = result.item()
+        assert result is False
+
+    def test_any_all_object_bool_only(self):
+        df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
+        df._consolidate_inplace()
+        df["C"] = Series([True, True])
+
+        # Categorical of bools is _not_ considered booly
+        df["D"] = df["C"].astype("category")
+
+        # The underlying bug is in DataFrame._get_bool_data, so we check
+        #  that while we're here
+        res = df._get_bool_data()
+        expected = df[["C"]]
+        tm.assert_frame_equal(res, expected)
+
+        res = df.all(bool_only=True, axis=0)
+        expected = Series([True], index=["C"])
+        tm.assert_series_equal(res, expected)
+
+        # operating on a subset of columns should not produce a _larger_ Series
+        res = df[["B", "C"]].all(bool_only=True, axis=0)
+        tm.assert_series_equal(res, expected)
+
+        assert df.all(bool_only=True, axis=None)
+
+        res = df.any(bool_only=True, axis=0)
+        expected = Series([True], index=["C"])
+        tm.assert_series_equal(res, expected)
+
+        # operating on a subset of columns should not produce a _larger_ Series
+        res = df[["C"]].any(bool_only=True, axis=0)
+        tm.assert_series_equal(res, expected)
+
+        assert df.any(bool_only=True, axis=None)
+
+    # ---------------------------------------------------------------------
+    # Unsorted
+
+    def test_series_broadcasting(self):
+        # smoke test for numpy warnings
+        # GH 16378, GH 16306
+        df = DataFrame([1.0, 1.0, 1.0])
+        df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
+        s = Series([1, 1, 1])
+        s_nan = Series([np.nan, np.nan, 1])
+
+        with tm.assert_produces_warning(None):
+            df_nan.clip(lower=s, axis=0)
+            for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
+                getattr(df, op)(s_nan, axis=0)
+
+
+class TestDataFrameReductions:
+    def test_min_max_dt64_with_NaT(self):
+        # Both NaT and Timestamp are in DataFrame.
+        df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]})
+
+        res = df.min()
+        exp = Series([Timestamp("2012-05-01")], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+        res = df.max()
+        exp = Series([Timestamp("2012-05-01")], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+        # GH12941, only NaTs are in DataFrame.
+        df = DataFrame({"foo": [pd.NaT, pd.NaT]})
+
+        res = df.min()
+        exp = Series([pd.NaT], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+        res = df.max()
+        exp = Series([pd.NaT], index=["foo"])
+        tm.assert_series_equal(res, exp)
+
+    def test_min_max_dt64_with_NaT_precision(self):
+        # GH#60646 Make sure the reduction doesn't cast input timestamps to
+        # float and lose precision.
+        df = DataFrame(
+            {"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01 09:20:00.123456789")]},
+            dtype="datetime64[ns]",
+        )
+
+        res = df.min(axis=1)
+        exp = df.foo.rename(None)
+        tm.assert_series_equal(res, exp)
+
+        res = df.max(axis=1)
+        exp = df.foo.rename(None)
+        tm.assert_series_equal(res, exp)
+
+    def test_min_max_td64_with_NaT_precision(self):
+        # GH#60646 Make sure the reduction doesn't cast input timedeltas to
+        # float and lose precision.
+        df = DataFrame(
+            {
+                "foo": [
+                    pd.NaT,
+                    pd.NaT,
+                    to_timedelta("10000 days 06:05:01.123456789"),
+                ],
+            },
+            dtype="timedelta64[ns]",
+        )
+
+        res = df.min(axis=1)
+        exp = df.foo.rename(None)
+        tm.assert_series_equal(res, exp)
+
+        res = df.max(axis=1)
+        exp = df.foo.rename(None)
+        tm.assert_series_equal(res, exp)
+
+    def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture):
+        # GH#36907
+        tz = tz_naive_fixture
+        if isinstance(tz, tzlocal) and is_platform_windows():
+            pytest.skip(
+                "GH#37659 OSError raised within tzlocal bc Windows "
+                "chokes in times before 1970-01-01"
+            )
+
+        df = DataFrame(
+            {
+                "a": [
+                    Timestamp("2020-01-01 08:00:00", tz=tz),
+                    Timestamp("1920-02-01 09:00:00", tz=tz),
+                ],
+                "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
+            }
+        )
+        res = df.min(axis=1, skipna=False)
+        expected = Series([df.loc[0, "a"], pd.NaT])
+        assert expected.dtype == df["a"].dtype
+
+        tm.assert_series_equal(res, expected)
+
+        res = df.max(axis=1, skipna=False)
+        expected = Series([df.loc[0, "b"], pd.NaT])
+        assert expected.dtype == df["a"].dtype
+
+        tm.assert_series_equal(res, expected)
+
+    def test_min_max_dt64_api_consistency_with_NaT(self):
+        # Calling the following sum functions returned an error for dataframes but
+        # returned NaT for series. These tests check that the API is consistent in
+        # min/max calls on empty Series/DataFrames. See GH:33704 for more
+        # information
+        df = DataFrame({"x": to_datetime([])})
+        expected_dt_series = Series(to_datetime([]))
+        # check axis 0
+        assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
+        assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)
+
+        # check axis 1
+        tm.assert_series_equal(df.min(axis=1), expected_dt_series)
+        tm.assert_series_equal(df.max(axis=1), expected_dt_series)
+
+    def test_min_max_dt64_api_consistency_empty_df(self):
+        # check DataFrame/Series api consistency when calling min/max on an empty
+        # DataFrame/Series.
+        df = DataFrame({"x": []})
+        expected_float_series = Series([], dtype=float)
+        # check axis 0
+        assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
+        assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
+        # check axis 1
+        tm.assert_series_equal(df.min(axis=1), expected_float_series)
+        tm.assert_series_equal(df.min(axis=1), expected_float_series)
+
+    @pytest.mark.parametrize(
+        "initial",
+        ["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"],  # Non-UTC timezone
+    )
+    @pytest.mark.parametrize("method", ["min", "max"])
+    def test_preserve_timezone(self, initial: str, method):
+        # GH 28552
+        initial_dt = to_datetime(initial)
+        expected = Series([initial_dt])
+        df = DataFrame([expected])
+        result = getattr(df, method)(axis=1)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("method", ["min", "max"])
+    def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
+        # GH#51242
+        val = to_datetime("1900-01-01", utc=True)
+        df = DataFrame(
+            {"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
+        )
+        op = getattr(df, method)
+        result = op(axis=1, skipna=skipna)
+        if skipna:
+            expected = Series([pd.NaT, val, val])
+        else:
+            expected = Series([pd.NaT, pd.NaT, val])
+        tm.assert_series_equal(result, expected)
+
+    def test_frame_any_with_timedelta(self):
+        # GH#17667
+        df = DataFrame(
+            {
+                "a": Series([0, 0]),
+                "t": Series([to_timedelta(0, "s"), to_timedelta(1, "ms")]),
+            }
+        )
+
+        result = df.any(axis=0)
+        expected = Series(data=[False, True], index=["a", "t"])
+        tm.assert_series_equal(result, expected)
+
+        result = df.any(axis=1)
+        expected = Series(data=[False, True])
+        tm.assert_series_equal(result, expected)
+
+    def test_reductions_skipna_none_raises(
+        self, request, frame_or_series, all_reductions
+    ):
+        if all_reductions == "count":
+            request.applymarker(
+                pytest.mark.xfail(reason="Count does not accept skipna")
+            )
+        obj = frame_or_series([1, 2, 3])
+        msg = 'For argument "skipna" expected type bool, received type NoneType.'
+        with pytest.raises(ValueError, match=msg):
+            getattr(obj, all_reductions)(skipna=None)
+
+    def test_reduction_timestamp_smallest_unit(self):
+        # GH#52524
+        df = DataFrame(
+            {
+                "a": Series([Timestamp("2019-12-31")], dtype="datetime64[s]"),
+                "b": Series(
+                    [Timestamp("2019-12-31 00:00:00.123")], dtype="datetime64[ms]"
+                ),
+            }
+        )
+        result = df.max()
+        expected = Series(
+            [Timestamp("2019-12-31"), Timestamp("2019-12-31 00:00:00.123")],
+            dtype="datetime64[ms]",
+            index=["a", "b"],
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_reduction_timedelta_smallest_unit(self):
+        # GH#52524
+        df = DataFrame(
+            {
+                "a": Series([pd.Timedelta("1 days")], dtype="timedelta64[s]"),
+                "b": Series([pd.Timedelta("1 days")], dtype="timedelta64[ms]"),
+            }
+        )
+        result = df.max()
+        expected = Series(
+            [pd.Timedelta("1 days"), pd.Timedelta("1 days")],
+            dtype="timedelta64[ms]",
+            index=["a", "b"],
+        )
+        tm.assert_series_equal(result, expected)
+
+
+class TestNuisanceColumns:
+    def test_any_all_categorical_dtype_nuisance_column(self, all_boolean_reductions):
+        # GH#36076 DataFrame should match Series behavior
+        ser = Series([0, 1], dtype="category", name="A")
+        df = ser.to_frame()
+
+        # Double-check the Series behavior is to raise
+        with pytest.raises(TypeError, match="does not support operation"):
+            getattr(ser, all_boolean_reductions)()
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            getattr(np, all_boolean_reductions)(ser)
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            getattr(df, all_boolean_reductions)(bool_only=False)
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            getattr(df, all_boolean_reductions)(bool_only=None)
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            getattr(np, all_boolean_reductions)(df, axis=0)
+
+    def test_median_categorical_dtype_nuisance_column(self):
+        # GH#21020 DataFrame.median should match Series.median
+        df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
+        ser = df["A"]
+
+        # Double-check the Series behavior is to raise
+        with pytest.raises(TypeError, match="does not support operation"):
+            ser.median()
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            df.median(numeric_only=False)
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            df.median()
+
+        # same thing, but with an additional non-categorical column
+        df["B"] = df["A"].astype(int)
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            df.median(numeric_only=False)
+
+        with pytest.raises(TypeError, match="does not support operation"):
+            df.median()
+
+        # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
+        #  of expected.values
+
+    @pytest.mark.parametrize("method", ["min", "max"])
+    def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
+        # GH#28949 DataFrame.min should behave like Series.min
+        cat = Categorical(["a", "b", "c", "b"], ordered=False)
+        ser = Series(cat)
+        df = ser.to_frame("A")
+
+        # Double-check the Series behavior
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(ser, method)()
+
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(np, method)(ser)
+
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(df, method)(numeric_only=False)
+
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(df, method)()
+
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(np, method)(df, axis=0)
+
+        # same thing, but with an additional non-categorical column
+        df["B"] = df["A"].astype(object)
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(df, method)()
+
+        with pytest.raises(TypeError, match="is not ordered for operation"):
+            getattr(np, method)(df, axis=0)
+
+
+class TestEmptyDataFrameReductions:
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_value, exp_dtype",
+        [
+            ("sum", np.int8, 0, np.int64),
+            ("prod", np.int8, 1, np.int_),
+            ("sum", np.int64, 0, np.int64),
+            ("prod", np.int64, 1, np.int64),
+            ("sum", np.uint8, 0, np.uint64),
+            ("prod", np.uint8, 1, np.uint),
+            ("sum", np.uint64, 0, np.uint64),
+            ("prod", np.uint64, 1, np.uint64),
+            ("sum", np.float32, 0, np.float32),
+            ("prod", np.float32, 1, np.float32),
+            ("sum", np.float64, 0, np.float64),
+        ],
+    )
+    def test_df_empty_min_count_0(self, opname, dtype, exp_value, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=0)
+
+        expected = Series([exp_value, exp_value], dtype=exp_dtype, index=range(2))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_dtype",
+        [
+            ("sum", np.int8, np.float64),
+            ("prod", np.int8, np.float64),
+            ("sum", np.int64, np.float64),
+            ("prod", np.int64, np.float64),
+            ("sum", np.uint8, np.float64),
+            ("prod", np.uint8, np.float64),
+            ("sum", np.uint64, np.float64),
+            ("prod", np.uint64, np.float64),
+            ("sum", np.float32, np.float32),
+            ("prod", np.float32, np.float32),
+            ("sum", np.float64, np.float64),
+        ],
+    )
+    def test_df_empty_min_count_1(self, opname, dtype, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=1)
+
+        expected = Series([np.nan, np.nan], dtype=exp_dtype, index=Index([0, 1]))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_value, exp_dtype",
+        [
+            ("sum", "Int8", 0, ("Int32" if is_windows_np2_or_is32 else "Int64")),
+            ("prod", "Int8", 1, ("Int32" if is_windows_np2_or_is32 else "Int64")),
+            ("sum", "Int64", 0, "Int64"),
+            ("prod", "Int64", 1, "Int64"),
+            ("sum", "UInt8", 0, ("UInt32" if is_windows_np2_or_is32 else "UInt64")),
+            ("prod", "UInt8", 1, ("UInt32" if is_windows_np2_or_is32 else "UInt64")),
+            ("sum", "UInt64", 0, "UInt64"),
+            ("prod", "UInt64", 1, "UInt64"),
+            ("sum", "Float32", 0, "Float32"),
+            ("prod", "Float32", 1, "Float32"),
+            ("sum", "Float64", 0, "Float64"),
+        ],
+    )
+    def test_df_empty_nullable_min_count_0(self, opname, dtype, exp_value, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=0)
+
+        expected = Series([exp_value, exp_value], dtype=exp_dtype, index=Index([0, 1]))
+        tm.assert_series_equal(result, expected)
+
+    # TODO: why does min_count=1 impact the resulting Windows dtype
+    # differently than min_count=0?
+    @pytest.mark.parametrize(
+        "opname, dtype, exp_dtype",
+        [
+            ("sum", "Int8", ("Int32" if is_windows_or_is32 else "Int64")),
+            ("prod", "Int8", ("Int32" if is_windows_or_is32 else "Int64")),
+            ("sum", "Int64", "Int64"),
+            ("prod", "Int64", "Int64"),
+            ("sum", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")),
+            ("prod", "UInt8", ("UInt32" if is_windows_or_is32 else "UInt64")),
+            ("sum", "UInt64", "UInt64"),
+            ("prod", "UInt64", "UInt64"),
+            ("sum", "Float32", "Float32"),
+            ("prod", "Float32", "Float32"),
+            ("sum", "Float64", "Float64"),
+        ],
+    )
+    def test_df_empty_nullable_min_count_1(self, opname, dtype, exp_dtype):
+        df = DataFrame({0: [], 1: []}, dtype=dtype)
+        result = getattr(df, opname)(min_count=1)
+
+        expected = Series([pd.NA, pd.NA], dtype=exp_dtype, index=Index([0, 1]))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            {"a": [0, 1, 2], "b": [pd.NaT, pd.NaT, pd.NaT]},
+            {"a": [0, 1, 2], "b": [Timestamp("1990-01-01"), pd.NaT, pd.NaT]},
+            {
+                "a": [0, 1, 2],
+                "b": [
+                    Timestamp("1990-01-01"),
+                    Timestamp("1991-01-01"),
+                    Timestamp("1992-01-01"),
+                ],
+            },
+            {
+                "a": [0, 1, 2],
+                "b": [pd.Timedelta("1 days"), pd.Timedelta("2 days"), pd.NaT],
+            },
+            {
+                "a": [0, 1, 2],
+                "b": [
+                    pd.Timedelta("1 days"),
+                    pd.Timedelta("2 days"),
+                    pd.Timedelta("3 days"),
+                ],
+            },
+        ],
+    )
+    def test_df_cov_pd_nat(self, data):
+        # GH #53115
+        df = DataFrame(data)
+        with pytest.raises(TypeError, match="not supported for cov"):
+            df.cov()
+
+
+def test_sum_timedelta64_skipna_false():
+    # GH#17235
+    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
+    arr[-1, -1] = "Nat"
+
+    df = DataFrame(arr)
+    assert (df.dtypes == arr.dtype).all()
+
+    result = df.sum(skipna=False)
+    expected = Series([pd.Timedelta(seconds=12), pd.NaT], dtype="m8[s]")
+    tm.assert_series_equal(result, expected)
+
+    result = df.sum(axis=0, skipna=False)
+    tm.assert_series_equal(result, expected)
+
+    result = df.sum(axis=1, skipna=False)
+    expected = Series(
+        [
+            pd.Timedelta(seconds=1),
+            pd.Timedelta(seconds=5),
+            pd.Timedelta(seconds=9),
+            pd.NaT,
+        ],
+        dtype="m8[s]",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_mixed_frame_with_integer_sum():
+    # https://github.com/pandas-dev/pandas/issues/34520
+    df = DataFrame([["a", 1]], columns=list("ab"))
+    df = df.astype({"b": "Int64"})
+    result = df.sum()
+    expected = Series(["a", 1], index=["a", "b"])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("numeric_only", [True, False, None])
+@pytest.mark.parametrize("method", ["min", "max"])
+def test_minmax_extensionarray(method, numeric_only):
+    # https://github.com/pandas-dev/pandas/issues/32651
+    int64_info = np.iinfo("int64")
+    ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
+    df = DataFrame({"Int64": ser})
+    result = getattr(df, method)(numeric_only=numeric_only)
+    expected = Series(
+        [getattr(int64_info, method)],
+        dtype="Int64",
+        index=Index(["Int64"]),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
+def test_frame_mixed_numeric_object_with_timestamp(ts_value):
+    # GH 13912
+    df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]})
+    with pytest.raises(TypeError, match="does not support operation|Cannot perform"):
+        df.sum()
+
+
+def test_prod_sum_min_count_mixed_object():
+    # https://github.com/pandas-dev/pandas/issues/41074
+    df = DataFrame([1, "a", True])
+
+    result = df.prod(axis=0, min_count=1, numeric_only=False)
+    expected = Series(["a"], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
+    with pytest.raises(TypeError, match=msg):
+        df.sum(axis=0, min_count=1, numeric_only=False)
+
+
+@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
+@pytest.mark.parametrize("numeric_only", [True, False])
+@pytest.mark.parametrize("dtype", ["float64", "Float64"])
+def test_reduction_axis_none_returns_scalar(method, numeric_only, dtype):
+    # GH#21597 As of 2.0, axis=None reduces over all axes.
+
+    df = DataFrame(np.random.default_rng(2).standard_normal((4, 4)), dtype=dtype)
+
+    result = getattr(df, method)(axis=None, numeric_only=numeric_only)
+    np_arr = df.to_numpy(dtype=np.float64)
+    if method in {"skew", "kurt"}:
+        comp_mod = pytest.importorskip("scipy.stats")
+        if method == "kurt":
+            method = "kurtosis"
+        expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
+        tm.assert_almost_equal(result, expected)
+    else:
+        expected = getattr(np, method)(np_arr, axis=None)
+        assert result == expected
+
+
+@pytest.mark.parametrize(
+    "kernel",
+    [
+        "corr",
+        "corrwith",
+        "cov",
+        "idxmax",
+        "idxmin",
+        "kurt",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "prod",
+        "quantile",
+        "sem",
+        "skew",
+        "std",
+        "sum",
+        "var",
+    ],
+)
+def test_fails_on_non_numeric(kernel):
+    # GH#46852
+    df = DataFrame({"a": [1, 2, 3], "b": object})
+    args = (df,) if kernel == "corrwith" else ()
+    msg = "|".join(
+        [
+            "not allowed for this dtype",
+            "argument must be a string or a number",
+            "not supported between instances of",
+            "unsupported operand type",
+            "argument must be a string or a real number",
+        ]
+    )
+    if kernel == "median":
+        # slightly different message on different builds
+        msg1 = (
+            r"Cannot convert \[\[<class 'object'> <class 'object'> "
+            r"<class 'object'>\]\] to numeric"
+        )
+        msg2 = (
+            r"Cannot convert \[<class 'object'> <class 'object'> "
+            r"<class 'object'>\] to numeric"
+        )
+        msg = "|".join([msg1, msg2])
+    with pytest.raises(TypeError, match=msg):
+        getattr(df, kernel)(*args)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        "all",
+        "any",
+        "count",
+        "idxmax",
+        "idxmin",
+        "kurt",
+        "kurtosis",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "nunique",
+        "prod",
+        "product",
+        "sem",
+        "skew",
+        "std",
+        "sum",
+        "var",
+    ],
+)
+@pytest.mark.parametrize("min_count", [0, 2])
+def test_numeric_ea_axis_1(
+    method, skipna, min_count, any_numeric_ea_dtype, using_nan_is_na
+):
+    # GH 54341
+    df = DataFrame(
+        {
+            "a": Series([0, 1, 2, 3], dtype=any_numeric_ea_dtype),
+            "b": Series([0, 1, pd.NA, 3], dtype=any_numeric_ea_dtype),
+        },
+    )
+    expected_df = DataFrame(
+        {
+            "a": [0.0, 1.0, 2.0, 3.0],
+            "b": [0.0, 1.0, np.nan, 3.0],
+        },
+    )
+    if method in ("count", "nunique"):
+        expected_dtype = "int64"
+    elif method in ("all", "any"):
+        expected_dtype = "boolean"
+    elif method in (
+        "kurt",
+        "kurtosis",
+        "mean",
+        "median",
+        "sem",
+        "skew",
+        "std",
+        "var",
+    ) and not any_numeric_ea_dtype.startswith("Float"):
+        expected_dtype = "Float64"
+    else:
+        expected_dtype = any_numeric_ea_dtype
+
+    kwargs = {}
+    if method not in ("count", "nunique", "quantile"):
+        kwargs["skipna"] = skipna
+    if method in ("prod", "product", "sum"):
+        kwargs["min_count"] = min_count
+
+    if not skipna and method in ("idxmax", "idxmin"):
+        with pytest.raises(ValueError, match="encountered an NA value"):
+            getattr(df, method)(axis=1, **kwargs)
+        with pytest.raises(ValueError, match="Encountered an NA value"):
+            getattr(expected_df, method)(axis=1, **kwargs)
+        return
+    result = getattr(df, method)(axis=1, **kwargs)
+    expected = getattr(expected_df, method)(axis=1, **kwargs)
+    if method not in ("idxmax", "idxmin"):
+        if using_nan_is_na:
+            expected = expected.astype(expected_dtype)
+        else:
+            mask = np.isnan(expected)
+            expected[mask] = 0
+            expected = expected.astype(expected_dtype)
+            expected[mask] = pd.NA
+    tm.assert_series_equal(result, expected)
+
+
+def test_mean_nullable_int_axis_1():
+    # GH##36585
+    df = DataFrame(
+        {"a": [1, 2, 3, 4], "b": Series([1, 2, 4, None], dtype=pd.Int64Dtype())}
+    )
+
+    result = df.mean(axis=1, skipna=True)
+    expected = Series([1.0, 2.0, 3.5, 4.0], dtype="Float64")
+    tm.assert_series_equal(result, expected)
+
+    result = df.mean(axis=1, skipna=False)
+    expected = Series([1.0, 2.0, 3.5, pd.NA], dtype="Float64")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/test_repr.py b/pandas/tests/frame/test_repr.py
new file mode 100644
index 0000000000000000000000000000000000000000..73628424725e57fd5c639f0238d229a289d6ea8e
--- /dev/null
+++ b/pandas/tests/frame/test_repr.py
@@ -0,0 +1,498 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+from io import StringIO
+
+import numpy as np
+import pytest
+
+from pandas import (
+    NA,
+    Categorical,
+    CategoricalIndex,
+    DataFrame,
+    IntervalIndex,
+    MultiIndex,
+    NaT,
+    PeriodIndex,
+    Series,
+    Timestamp,
+    date_range,
+    option_context,
+    period_range,
+)
+import pandas._testing as tm
+
+
+class TestDataFrameRepr:
+    def test_repr_should_return_str(self):
+        # https://docs.python.org/3/reference/datamodel.html#object.__repr__
+        # "...The return value must be a string object."
+
+        # (str on py2.x, str (unicode) on py3)
+
+        data = [8, 5, 3, 5]
+        index1 = ["\u03c3", "\u03c4", "\u03c5", "\u03c6"]
+        cols = ["\u03c8"]
+        df = DataFrame(data, columns=cols, index=index1)
+        assert type(df.__repr__()) is str
+
+        ser = df[cols[0]]
+        assert type(ser.__repr__()) is str
+
+    def test_repr_bytes_61_lines(self):
+        # GH#12857
+        lets = list("ACDEFGHIJKLMNOP")
+        words = np.random.default_rng(2).choice(lets, (1000, 50))
+        df = DataFrame(words).astype("U1")
+        assert (df.dtypes == object).all()
+
+        # smoke tests; at one point this raised with 61 but not 60
+        repr(df)
+        repr(df.iloc[:60, :])
+        repr(df.iloc[:61, :])
+
+    def test_repr_unicode_level_names(self, frame_or_series):
+        index = MultiIndex.from_tuples([(0, 0), (1, 1)], names=["\u0394", "i1"])
+
+        obj = DataFrame(np.random.default_rng(2).standard_normal((2, 4)), index=index)
+        obj = tm.get_obj(obj, frame_or_series)
+        repr(obj)
+
+    def test_assign_index_sequences(self):
+        # GH#2200
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}).set_index(
+            ["a", "b"]
+        )
+        index = list(df.index)
+        index[0] = ("faz", "boo")
+        df.index = index
+        repr(df)
+
+        # this travels an improper code path
+        index[0] = ["faz", "boo"]
+        df.index = index
+        repr(df)
+
+    def test_repr_with_mi_nat(self):
+        df = DataFrame({"X": [1, 2]}, index=[[NaT, Timestamp("20130101")], ["a", "b"]])
+        result = repr(df)
+        expected = "              X\nNaT        a  1\n2013-01-01 b  2"
+        assert result == expected
+
+    def test_repr_with_different_nulls(self):
+        # GH45263
+        df = DataFrame([1, 2, 3, 4], [True, None, np.nan, NaT])
+        result = repr(df)
+        expected = """      0
+True  1
+None  2
+NaN   3
+NaT   4"""
+        assert result == expected
+
+    def test_repr_with_different_nulls_cols(self):
+        # GH45263
+        d = {np.nan: [1, 2], None: [3, 4], NaT: [6, 7], True: [8, 9]}
+        df = DataFrame(data=d)
+        result = repr(df)
+        expected = """   NaN  None  NaT  True
+0    1     3    6     8
+1    2     4    7     9"""
+        assert result == expected
+
+    def test_multiindex_na_repr(self):
+        # only an issue with long columns
+        df3 = DataFrame(
+            {
+                "A" * 30: {("A", "A0006000", "nuit"): "A0006000"},
+                "B" * 30: {("A", "A0006000", "nuit"): np.nan},
+                "C" * 30: {("A", "A0006000", "nuit"): np.nan},
+                "D" * 30: {("A", "A0006000", "nuit"): np.nan},
+                "E" * 30: {("A", "A0006000", "nuit"): "A"},
+                "F" * 30: {("A", "A0006000", "nuit"): np.nan},
+            }
+        )
+
+        idf = df3.set_index(["A" * 30, "C" * 30])
+        repr(idf)
+
+    def test_repr_name_coincide(self):
+        index = MultiIndex.from_tuples(
+            [("a", 0, "foo"), ("b", 1, "bar")], names=["a", "b", "c"]
+        )
+
+        df = DataFrame({"value": [0, 1]}, index=index)
+
+        lines = repr(df).split("\n")
+        assert lines[2].startswith("a 0 foo")
+
+    def test_repr_to_string(
+        self,
+        multiindex_year_month_day_dataframe_random_data,
+        multiindex_dataframe_random_data,
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+        frame = multiindex_dataframe_random_data
+
+        repr(frame)
+        repr(ymd)
+        repr(frame.T)
+        repr(ymd.T)
+
+        buf = StringIO()
+        frame.to_string(buf=buf)
+        ymd.to_string(buf=buf)
+        frame.T.to_string(buf=buf)
+        ymd.T.to_string(buf=buf)
+
+    def test_repr_empty(self):
+        # empty
+        repr(DataFrame())
+
+        # empty with index
+        frame = DataFrame(index=np.arange(1000))
+        repr(frame)
+
+    def test_repr_mixed(self, float_string_frame):
+        # mixed
+        repr(float_string_frame)
+
+    @pytest.mark.slow
+    def test_repr_mixed_big(self):
+        # big mixed
+        biggie = DataFrame(
+            {
+                "A": np.random.default_rng(2).standard_normal(200),
+                "B": [str(i) for i in range(200)],
+            },
+            index=range(200),
+        )
+        biggie.loc[:20, "A"] = np.nan
+        biggie.loc[:20, "B"] = np.nan
+
+        repr(biggie)
+
+    def test_repr(self):
+        # columns but no index
+        no_index = DataFrame(columns=[0, 1, 3])
+        repr(no_index)
+
+        df = DataFrame(["a\n\r\tb"], columns=["a\n\r\td"], index=["a\n\r\tf"])
+        assert "\t" not in repr(df)
+        assert "\r" not in repr(df)
+        assert "a\n" not in repr(df)
+
+    def test_repr_dimensions(self):
+        df = DataFrame([[1, 2], [3, 4]])
+        with option_context("display.show_dimensions", True):
+            assert "2 rows x 2 columns" in repr(df)
+
+        with option_context("display.show_dimensions", False):
+            assert "2 rows x 2 columns" not in repr(df)
+
+        with option_context("display.show_dimensions", "truncate"):
+            assert "2 rows x 2 columns" not in repr(df)
+
+    @pytest.mark.slow
+    def test_repr_big(self):
+        # big one
+        biggie = DataFrame(np.zeros((200, 4)), columns=range(4), index=range(200))
+        repr(biggie)
+
+    def test_repr_unsortable(self):
+        # columns are not sortable
+
+        unsortable = DataFrame(
+            {
+                "foo": [1] * 50,
+                datetime.today(): [1] * 50,
+                "bar": ["bar"] * 50,
+                datetime.today() + timedelta(1): ["bar"] * 50,
+            },
+            index=np.arange(50),
+        )
+        repr(unsortable)
+
+    def test_repr_float_frame_options(self, float_frame):
+        repr(float_frame)
+
+        with option_context("display.precision", 3):
+            repr(float_frame)
+
+        with option_context("display.max_rows", 10, "display.max_columns", 2):
+            repr(float_frame)
+
+        with option_context("display.max_rows", 1000, "display.max_columns", 1000):
+            repr(float_frame)
+
+    def test_repr_unicode(self):
+        uval = "\u03c3\u03c3\u03c3\u03c3"
+
+        df = DataFrame({"A": [uval, uval]})
+
+        result = repr(df)
+        ex_top = "      A"
+        assert result.split("\n")[0].rstrip() == ex_top
+
+        df = DataFrame({"A": [uval, uval]})
+        result = repr(df)
+        assert result.split("\n")[0].rstrip() == ex_top
+
+    def test_unicode_string_with_unicode(self):
+        df = DataFrame({"A": ["\u05d0"]})
+        str(df)
+
+    def test_repr_unicode_columns(self):
+        df = DataFrame({"\u05d0": [1, 2, 3], "\u05d1": [4, 5, 6], "c": [7, 8, 9]})
+        repr(df.columns)  # should not raise UnicodeDecodeError
+
+    def test_str_to_bytes_raises(self):
+        # GH 26447
+        df = DataFrame({"A": ["abc"]})
+        msg = "^'str' object cannot be interpreted as an integer$"
+        with pytest.raises(TypeError, match=msg):
+            bytes(df)
+
+    def test_very_wide_repr(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 20)),
+            columns=np.array(["a" * 10] * 20, dtype=object),
+        )
+        repr(df)
+
+    def test_repr_column_name_unicode_truncation_bug(self):
+        # #1906
+        df = DataFrame(
+            {
+                "Id": [7117434],
+                "StringCol": (
+                    "Is it possible to modify drop plot code"
+                    "so that the output graph is displayed "
+                    "in iphone simulator, Is it possible to "
+                    "modify drop plot code so that the "
+                    "output graph is \xe2\x80\xa8displayed "
+                    "in iphone simulator.Now we are adding "
+                    "the CSV file externally. I want to Call "
+                    "the File through the code.."
+                ),
+            }
+        )
+
+        with option_context("display.max_columns", 20):
+            assert "StringCol" in repr(df)
+
+    def test_latex_repr(self):
+        pytest.importorskip("jinja2")
+        expected = r"""\begin{tabular}{llll}
+\toprule
+ & 0 & 1 & 2 \\
+\midrule
+0 & $\alpha$ & b & c \\
+1 & 1 & 2 & 3 \\
+\bottomrule
+\end{tabular}
+"""
+        with option_context(
+            "styler.format.escape", None, "styler.render.repr", "latex"
+        ):
+            df = DataFrame([[r"$\alpha$", "b", "c"], [1, 2, 3]])
+            result = df._repr_latex_()
+            assert result == expected
+
+        # GH 12182
+        assert df._repr_latex_() is None
+
+    def test_repr_with_datetimeindex(self):
+        df = DataFrame({"A": [1, 2, 3]}, index=date_range("2000", periods=3))
+        result = repr(df)
+        expected = "            A\n2000-01-01  1\n2000-01-02  2\n2000-01-03  3"
+        assert result == expected
+
+    def test_repr_with_intervalindex(self):
+        # https://github.com/pandas-dev/pandas/pull/24134/files
+        df = DataFrame(
+            {"A": [1, 2, 3, 4]}, index=IntervalIndex.from_breaks([0, 1, 2, 3, 4])
+        )
+        result = repr(df)
+        expected = "        A\n(0, 1]  1\n(1, 2]  2\n(2, 3]  3\n(3, 4]  4"
+        assert result == expected
+
+    def test_repr_with_categorical_index(self):
+        df = DataFrame({"A": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"]))
+        result = repr(df)
+        expected = "   A\na  1\nb  2\nc  3"
+        assert result == expected
+
+    def test_repr_categorical_dates_periods(self):
+        # normal DataFrame
+        dt = date_range("2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern")
+        p = period_range("2011-01", freq="M", periods=5)
+        df = DataFrame({"dt": dt, "p": p})
+        exp = """                         dt        p
+0 2011-01-01 09:00:00-05:00  2011-01
+1 2011-01-01 10:00:00-05:00  2011-02
+2 2011-01-01 11:00:00-05:00  2011-03
+3 2011-01-01 12:00:00-05:00  2011-04
+4 2011-01-01 13:00:00-05:00  2011-05"""
+
+        assert repr(df) == exp
+
+        df2 = DataFrame({"dt": Categorical(dt), "p": Categorical(p)})
+        assert repr(df2) == exp
+
+    @pytest.mark.parametrize("arg", [np.datetime64, np.timedelta64])
+    @pytest.mark.parametrize(
+        "box, expected",
+        [[Series, "0    NaT\ndtype: object"], [DataFrame, "     0\n0  NaT"]],
+    )
+    def test_repr_np_nat_with_object(self, arg, box, expected):
+        # GH 25445
+        result = repr(box([arg("NaT")], dtype=object))
+        assert result == expected
+
+    def test_frame_datetime64_pre1900_repr(self):
+        df = DataFrame({"year": date_range("1/1/1700", periods=50, freq="YE-DEC")})
+        # it works!
+        repr(df)
+
+    def test_frame_to_string_with_periodindex(self):
+        index = PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M")
+        frame = DataFrame(np.random.default_rng(2).standard_normal((3, 4)), index=index)
+
+        # it works!
+        frame.to_string()
+
+    def test_to_string_ea_na_in_multiindex(self):
+        # GH#47986
+        df = DataFrame(
+            {"a": [1, 2]},
+            index=MultiIndex.from_arrays([Series([NA, 1], dtype="Int64")]),
+        )
+
+        result = df.to_string()
+        expected = """      a
+<NA>  1
+1     2"""
+        assert result == expected
+
+    def test_datetime64tz_slice_non_truncate(self):
+        # GH 30263
+        df = DataFrame({"x": date_range("2019", periods=10, tz="UTC")})
+        expected = repr(df)
+        df = df.iloc[:, :5]
+        result = repr(df)
+        assert result == expected
+
+    def test_to_records_no_typeerror_in_repr(self):
+        # GH 48526
+        df = DataFrame([["a", "b"], ["c", "d"], ["e", "f"]], columns=["left", "right"])
+        df["record"] = df[["left", "right"]].to_records()
+        expected = """  left right     record
+0    a     b  [0, a, b]
+1    c     d  [1, c, d]
+2    e     f  [2, e, f]"""
+        result = repr(df)
+        assert result == expected
+
+    def test_to_records_with_na_record_value(self):
+        # GH 48526
+        df = DataFrame(
+            [["a", np.nan], ["c", "d"], ["e", "f"]], columns=["left", "right"]
+        )
+        df["record"] = df[["left", "right"]].to_records()
+        expected = """  left right       record
+0    a   NaN  [0, a, nan]
+1    c     d    [1, c, d]
+2    e     f    [2, e, f]"""
+        result = repr(df)
+        assert result == expected
+
+    def test_to_records_with_na_record(self):
+        # GH 48526
+        df = DataFrame(
+            [["a", "b"], [np.nan, np.nan], ["e", "f"]], columns=[np.nan, "right"]
+        )
+        df["record"] = df[[np.nan, "right"]].to_records()
+        expected = """   NaN right         record
+0    a     b      [0, a, b]
+1  NaN   NaN  [1, nan, nan]
+2    e     f      [2, e, f]"""
+        result = repr(df)
+        assert result == expected
+
+    def test_to_records_with_inf_record(self):
+        # GH 48526
+        expected = """   NaN  inf         record
+0  inf    b    [0, inf, b]
+1  NaN  NaN  [1, nan, nan]
+2    e    f      [2, e, f]"""
+        df = DataFrame(
+            [[np.inf, "b"], [np.nan, np.nan], ["e", "f"]],
+            columns=[np.nan, np.inf],
+        )
+        df["record"] = df[[np.nan, np.inf]].to_records()
+        result = repr(df)
+        assert result == expected
+
+    def test_masked_ea_with_formatter(self):
+        # GH#39336
+        df = DataFrame(
+            {
+                "a": Series([0.123456789, 1.123456789], dtype="Float64"),
+                "b": Series([1, 2], dtype="Int64"),
+            }
+        )
+        result = df.to_string(formatters=["{:.2f}".format, "{:.2f}".format])
+        expected = """      a     b
+0  0.12  1.00
+1  1.12  2.00"""
+        assert result == expected
+
+    def test_repr_ea_columns(self, any_string_dtype):
+        # GH#54797
+        pytest.importorskip("pyarrow")
+        df = DataFrame({"long_column_name": [1, 2, 3], "col2": [4, 5, 6]})
+        df.columns = df.columns.astype(any_string_dtype)
+        expected = """   long_column_name  col2
+0                 1     4
+1                 2     5
+2                 3     6"""
+        assert repr(df) == expected
+
+
+@pytest.mark.parametrize(
+    "data,output",
+    [
+        ([2, complex("nan"), 1], [" 2.0+0.0j", " NaN+0.0j", " 1.0+0.0j"]),
+        ([2, complex("nan"), -1], [" 2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]),
+        ([-2, complex("nan"), -1], ["-2.0+0.0j", " NaN+0.0j", "-1.0+0.0j"]),
+        ([-1.23j, complex("nan"), -1], ["-0.00-1.23j", "  NaN+0.00j", "-1.00+0.00j"]),
+        ([1.23j, complex("nan"), 1.23], [" 0.00+1.23j", "  NaN+0.00j", " 1.23+0.00j"]),
+        (
+            [-1.23j, complex(np.nan, np.nan), 1],
+            ["-0.00-1.23j", "  NaN+ NaNj", " 1.00+0.00j"],
+        ),
+        (
+            [-1.23j, complex(1.2, np.nan), 1],
+            ["-0.00-1.23j", " 1.20+ NaNj", " 1.00+0.00j"],
+        ),
+        (
+            [-1.23j, complex(np.nan, -1.2), 1],
+            ["-0.00-1.23j", "  NaN-1.20j", " 1.00+0.00j"],
+        ),
+    ],
+)
+@pytest.mark.parametrize("as_frame", [True, False])
+def test_repr_with_complex_nans(data, output, as_frame):
+    # GH#53762, GH#53841
+    obj = Series(np.array(data))
+    if as_frame:
+        obj = obj.to_frame(name="val")
+        reprs = [f"{i} {val}" for i, val in enumerate(output)]
+        expected = f"{'val': >{len(reprs[0])}}\n" + "\n".join(reprs)
+    else:
+        reprs = [f"{i}   {val}" for i, val in enumerate(output)]
+        expected = "\n".join(reprs) + "\ndtype: complex128"
+    assert str(obj) == expected, f"\n{obj!s}\n\n{expected}"
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6587ff486d8a4eb016fad9fd5cf583441080829
--- /dev/null
+++ b/pandas/tests/frame/test_stack_unstack.py
@@ -0,0 +1,2781 @@
+from datetime import datetime
+import itertools
+import re
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Period,
+    Series,
+    Timedelta,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.core.reshape import reshape as reshape_lib
+
+
+@pytest.fixture(params=[True, False])
+def future_stack(request):
+    return request.param
+
+
+class TestDataFrameReshape:
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_unstack(self, float_frame, future_stack):
+        df = float_frame.copy()
+        df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
+
+        stacked = df.stack(future_stack=future_stack)
+        stacked_df = DataFrame({"foo": stacked, "bar": stacked})
+
+        unstacked = stacked.unstack()
+        unstacked_df = stacked_df.unstack()
+
+        tm.assert_frame_equal(unstacked, df)
+        tm.assert_frame_equal(unstacked_df["bar"], df)
+
+        unstacked_cols = stacked.unstack(0)
+        unstacked_cols_df = stacked_df.unstack(0)
+        tm.assert_frame_equal(unstacked_cols.T, df)
+        tm.assert_frame_equal(unstacked_cols_df["bar"].T, df)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_mixed_level(self, future_stack):
+        # GH 18310
+        levels = [range(3), [3, "a", "b"], [1, 2]]
+
+        # flat columns:
+        df = DataFrame(1, index=levels[0], columns=levels[1])
+        result = df.stack(future_stack=future_stack)
+        expected = Series(1, index=MultiIndex.from_product(levels[:2]))
+        tm.assert_series_equal(result, expected)
+
+        # MultiIndex columns:
+        df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:]))
+        result = df.stack(1, future_stack=future_stack)
+        expected = DataFrame(
+            1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # as above, but used labels in level are actually of homogeneous type
+        result = df[["a", "b"]].stack(1, future_stack=future_stack)
+        expected = expected[["a", "b"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_not_consolidated(self):
+        # Gh#34708
+        df = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]})
+        df2 = df[["x"]]
+        df2["y"] = df["y"]
+        assert len(df2._mgr.blocks) == 2
+
+        res = df2.unstack()
+        expected = df.unstack()
+        tm.assert_series_equal(res, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_fill(self, future_stack):
+        # GH #9746: fill_value keyword argument for Series
+        # and DataFrame unstack
+
+        # From a series
+        data = Series([1, 2, 4, 5], dtype=np.int16)
+        data.index = MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+        )
+
+        result = data.unstack(fill_value=-1)
+        expected = DataFrame(
+            {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
+        )
+        tm.assert_frame_equal(result, expected)
+
+        msg = (
+            "Using a fill_value that cannot be held in the existing dtype is deprecated"
+        )
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            # From a series with incorrect data type for fill_value
+            result = data.unstack(fill_value=0.5)
+        expected = DataFrame(
+            {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=float
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # GH #13971: fill_value when unstacking multiple levels:
+        df = DataFrame(
+            {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
+        ).set_index(["x", "y", "z"])
+        unstacked = df.unstack(["x", "y"], fill_value=0)
+        key = ("w", "b", "j")
+        expected = unstacked[key]
+        result = Series([0, 0, 2], index=unstacked.index, name=key)
+        tm.assert_series_equal(result, expected)
+
+        stacked = unstacked.stack(["x", "y"], future_stack=future_stack)
+        stacked.index = stacked.index.reorder_levels(df.index.names)
+        # Workaround for GH #17886 (unnecessarily casts to float):
+        stacked = stacked.astype(np.int64)
+        result = stacked.loc[df.index]
+        tm.assert_frame_equal(result, df)
+
+        # From a series
+        s = df["w"]
+        result = s.unstack(["x", "y"], fill_value=0)
+        expected = unstacked["w"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_fill_frame(self):
+        # From a dataframe
+        rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
+        df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
+        df.index = MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+        )
+
+        result = df.unstack(fill_value=-1)
+
+        rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
+        expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
+        expected.columns = MultiIndex.from_tuples(
+            [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # From a mixed type dataframe
+        df["A"] = df["A"].astype(np.int16)
+        df["B"] = df["B"].astype(np.float64)
+
+        result = df.unstack(fill_value=-1)
+        expected["A"] = expected["A"].astype(np.int16)
+        expected["B"] = expected["B"].astype(np.float64)
+        tm.assert_frame_equal(result, expected)
+
+        msg = (
+            "Using a fill_value that cannot be held in the existing dtype is deprecated"
+        )
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            # From a dataframe with incorrect data type for fill_value
+            result = df.unstack(fill_value=0.5)
+
+        rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
+        expected = DataFrame(rows, index=list("xyz"), dtype=float)
+        expected.columns = MultiIndex.from_tuples(
+            [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_fill_frame_datetime(self):
+        # Test unstacking with date times
+        dv = date_range("2012-01-01", periods=4).values
+        data = Series(dv)
+        data.index = MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+        )
+
+        result = data.unstack()
+        expected = DataFrame(
+            {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]},
+            index=["x", "y", "z"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = data.unstack(fill_value=dv[0])
+        expected = DataFrame(
+            {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]},
+            index=["x", "y", "z"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_fill_frame_timedelta(self):
+        # Test unstacking with time deltas
+        td = [Timedelta(days=i) for i in range(4)]
+        data = Series(td)
+        data.index = MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+        )
+
+        result = data.unstack()
+        expected = DataFrame(
+            {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]},
+            index=["x", "y", "z"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = data.unstack(fill_value=td[1])
+        expected = DataFrame(
+            {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]},
+            index=["x", "y", "z"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_fill_frame_period(self):
+        # Test unstacking with period
+        periods = [
+            Period("2012-01"),
+            Period("2012-02"),
+            Period("2012-03"),
+            Period("2012-04"),
+        ]
+        data = Series(periods)
+        data.index = MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+        )
+
+        result = data.unstack()
+        expected = DataFrame(
+            {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]},
+            index=["x", "y", "z"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = data.unstack(fill_value=periods[1])
+        expected = DataFrame(
+            {
+                "a": [periods[0], periods[1], periods[3]],
+                "b": [periods[1], periods[2], periods[1]],
+            },
+            index=["x", "y", "z"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_fill_frame_categorical(self):
+        # Test unstacking with categorical
+        data = Series(["a", "b", "c", "a"], dtype="category")
+        data.index = MultiIndex.from_tuples(
+            [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+        )
+
+        # By default missing values will be NaN
+        result = data.unstack()
+        expected = DataFrame(
+            {
+                "a": pd.Categorical(["a", None, "a"], categories=list("abc")),
+                "b": pd.Categorical(["b", "c", None], categories=list("abc")),
+            },
+            index=list("xyz"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # Fill with non-category results in a ValueError
+        msg = r"Cannot setitem on a Categorical with a new category \(d\)"
+        with pytest.raises(TypeError, match=msg):
+            data.unstack(fill_value="d")
+
+        # Fill with category value replaces missing values as expected
+        result = data.unstack(fill_value="c")
+        expected = DataFrame(
+            {
+                "a": pd.Categorical(list("aca"), categories=list("abc")),
+                "b": pd.Categorical(list("bcc"), categories=list("abc")),
+            },
+            index=list("xyz"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_tuplename_in_multiindex(self):
+        # GH 19966
+        idx = MultiIndex.from_product(
+            [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")]
+        )
+        df = DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
+        result = df.unstack(("A", "a"))
+
+        expected = DataFrame(
+            [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]],
+            columns=MultiIndex.from_tuples(
+                [
+                    ("d", "a"),
+                    ("d", "b"),
+                    ("d", "c"),
+                    ("e", "a"),
+                    ("e", "b"),
+                    ("e", "c"),
+                ],
+                names=[None, ("A", "a")],
+            ),
+            index=Index([1, 2, 3], name=("B", "b")),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "unstack_idx, expected_values, expected_index, expected_columns",
+        [
+            (
+                ("A", "a"),
+                [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
+                MultiIndex.from_tuples(
+                    [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"]
+                ),
+                MultiIndex.from_tuples(
+                    [("d", "a"), ("d", "b"), ("e", "a"), ("e", "b")],
+                    names=[None, ("A", "a")],
+                ),
+            ),
+            (
+                (("A", "a"), "B"),
+                [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]],
+                Index([3, 4], name="C"),
+                MultiIndex.from_tuples(
+                    [
+                        ("d", "a", 1),
+                        ("d", "a", 2),
+                        ("d", "b", 1),
+                        ("d", "b", 2),
+                        ("e", "a", 1),
+                        ("e", "a", 2),
+                        ("e", "b", 1),
+                        ("e", "b", 2),
+                    ],
+                    names=[None, ("A", "a"), "B"],
+                ),
+            ),
+        ],
+    )
+    def test_unstack_mixed_type_name_in_multiindex(
+        self, unstack_idx, expected_values, expected_index, expected_columns
+    ):
+        # GH 19966
+        idx = MultiIndex.from_product(
+            [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"]
+        )
+        df = DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)
+        result = df.unstack(unstack_idx)
+
+        expected = DataFrame(
+            expected_values, columns=expected_columns, index=expected_index
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_preserve_dtypes(self):
+        # Checks fix for #11847
+        df = DataFrame(
+            {
+                "state": ["IL", "MI", "NC"],
+                "index": ["a", "b", "c"],
+                "some_categories": Series(["a", "b", "c"]).astype("category"),
+                "A": np.random.default_rng(2).random(3),
+                "B": 1,
+                "C": "foo",
+                "D": pd.Timestamp("20010102"),
+                "E": Series([1.0, 50.0, 100.0]).astype("float32"),
+                "F": Series([3.0, 4.0, 5.0]).astype("float64"),
+                "G": False,
+                "H": Series([1, 200, 923442]).astype("int8"),
+            }
+        )
+
+        def unstack_and_compare(df, column_name):
+            unstacked1 = df.unstack([column_name])
+            unstacked2 = df.unstack(column_name)
+            tm.assert_frame_equal(unstacked1, unstacked2)
+
+        df1 = df.set_index(["state", "index"])
+        unstack_and_compare(df1, "index")
+
+        df1 = df.set_index(["state", "some_categories"])
+        unstack_and_compare(df1, "some_categories")
+
+        df1 = df.set_index(["F", "C"])
+        unstack_and_compare(df1, "F")
+
+        df1 = df.set_index(["G", "B", "state"])
+        unstack_and_compare(df1, "B")
+
+        df1 = df.set_index(["E", "A"])
+        unstack_and_compare(df1, "E")
+
+        df1 = df.set_index(["state", "index"])
+        s = df1["A"]
+        unstack_and_compare(s, "index")
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_ints(self, future_stack):
+        columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3)))
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((30, 27)), columns=columns
+        )
+
+        tm.assert_frame_equal(
+            df.stack(level=[1, 2], future_stack=future_stack),
+            df.stack(level=1, future_stack=future_stack).stack(
+                level=1, future_stack=future_stack
+            ),
+        )
+        tm.assert_frame_equal(
+            df.stack(level=[-2, -1], future_stack=future_stack),
+            df.stack(level=1, future_stack=future_stack).stack(
+                level=1, future_stack=future_stack
+            ),
+        )
+
+        df_named = df.copy()
+        return_value = df_named.columns.set_names(range(3), inplace=True)
+        assert return_value is None
+
+        tm.assert_frame_equal(
+            df_named.stack(level=[1, 2], future_stack=future_stack),
+            df_named.stack(level=1, future_stack=future_stack).stack(
+                level=1, future_stack=future_stack
+            ),
+        )
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_mixed_levels(self, future_stack):
+        columns = MultiIndex.from_tuples(
+            [
+                ("A", "cat", "long"),
+                ("B", "cat", "long"),
+                ("A", "dog", "short"),
+                ("B", "dog", "short"),
+            ],
+            names=["exp", "animal", "hair_length"],
+        )
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((4, 4)), columns=columns
+        )
+
+        animal_hair_stacked = df.stack(
+            level=["animal", "hair_length"], future_stack=future_stack
+        )
+        exp_hair_stacked = df.stack(
+            level=["exp", "hair_length"], future_stack=future_stack
+        )
+
+        # GH #8584: Need to check that stacking works when a number
+        # is passed that is both a level name and in the range of
+        # the level numbers
+        df2 = df.copy()
+        df2.columns.names = ["exp", "animal", 1]
+        tm.assert_frame_equal(
+            df2.stack(level=["animal", 1], future_stack=future_stack),
+            animal_hair_stacked,
+            check_names=False,
+        )
+        tm.assert_frame_equal(
+            df2.stack(level=["exp", 1], future_stack=future_stack),
+            exp_hair_stacked,
+            check_names=False,
+        )
+
+        # When mixed types are passed and the ints are not level
+        # names, raise
+        msg = (
+            "level should contain all level names or all level numbers, not "
+            "a mixture of the two"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df2.stack(level=["animal", 0], future_stack=future_stack)
+
+        # GH #8584: Having 0 in the level names could raise a
+        # strange error about lexsort depth
+        df3 = df.copy()
+        df3.columns.names = ["exp", "animal", 0]
+        tm.assert_frame_equal(
+            df3.stack(level=["animal", 0], future_stack=future_stack),
+            animal_hair_stacked,
+            check_names=False,
+        )
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_int_level_names(self, future_stack):
+        columns = MultiIndex.from_tuples(
+            [
+                ("A", "cat", "long"),
+                ("B", "cat", "long"),
+                ("A", "dog", "short"),
+                ("B", "dog", "short"),
+            ],
+            names=["exp", "animal", "hair_length"],
+        )
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((4, 4)), columns=columns
+        )
+
+        exp_animal_stacked = df.stack(
+            level=["exp", "animal"], future_stack=future_stack
+        )
+        animal_hair_stacked = df.stack(
+            level=["animal", "hair_length"], future_stack=future_stack
+        )
+        exp_hair_stacked = df.stack(
+            level=["exp", "hair_length"], future_stack=future_stack
+        )
+
+        df2 = df.copy()
+        df2.columns.names = [0, 1, 2]
+        tm.assert_frame_equal(
+            df2.stack(level=[1, 2], future_stack=future_stack),
+            animal_hair_stacked,
+            check_names=False,
+        )
+        tm.assert_frame_equal(
+            df2.stack(level=[0, 1], future_stack=future_stack),
+            exp_animal_stacked,
+            check_names=False,
+        )
+        tm.assert_frame_equal(
+            df2.stack(level=[0, 2], future_stack=future_stack),
+            exp_hair_stacked,
+            check_names=False,
+        )
+
+        # Out-of-order int column names
+        df3 = df.copy()
+        df3.columns.names = [2, 0, 1]
+        tm.assert_frame_equal(
+            df3.stack(level=[0, 1], future_stack=future_stack),
+            animal_hair_stacked,
+            check_names=False,
+        )
+        tm.assert_frame_equal(
+            df3.stack(level=[2, 0], future_stack=future_stack),
+            exp_animal_stacked,
+            check_names=False,
+        )
+        tm.assert_frame_equal(
+            df3.stack(level=[2, 1], future_stack=future_stack),
+            exp_hair_stacked,
+            check_names=False,
+        )
+
+    def test_unstack_bool(self):
+        df = DataFrame(
+            [False, False],
+            index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]),
+            columns=["col"],
+        )
+        rs = df.unstack()
+        xp = DataFrame(
+            np.array([[False, np.nan], [np.nan, False]], dtype=object),
+            index=["a", "b"],
+            columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
+        )
+        tm.assert_frame_equal(rs, xp)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_level_binding(self, future_stack):
+        # GH9856
+        mi = MultiIndex(
+            levels=[["foo", "bar"], ["one", "two"], ["a", "b"]],
+            codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
+            names=["first", "second", "third"],
+        )
+        s = Series(0, index=mi)
+        result = s.unstack([1, 2]).stack(0, future_stack=future_stack)
+
+        expected_mi = MultiIndex(
+            levels=[["foo", "bar"], ["one", "two"]],
+            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+            names=["first", "second"],
+        )
+
+        expected = DataFrame(
+            np.array(
+                [[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64
+            ),
+            index=expected_mi,
+            columns=Index(["b", "a"], name="third"),
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_to_series(self, float_frame):
+        # check reversibility
+        data = float_frame.unstack()
+
+        assert isinstance(data, Series)
+        undo = data.unstack().T
+        tm.assert_frame_equal(undo, float_frame)
+
+        # check NA handling
+        data = DataFrame({"x": [1, 2, np.nan], "y": [3.0, 4, np.nan]})
+        data.index = Index(["a", "b", "c"])
+        result = data.unstack()
+
+        midx = MultiIndex(
+            levels=[["x", "y"], ["a", "b", "c"]],
+            codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
+        )
+        expected = Series([1, 2, np.nan, 3, 4, np.nan], index=midx)
+
+        tm.assert_series_equal(result, expected)
+
+        # check composability of unstack
+        old_data = data.copy()
+        for _ in range(4):
+            data = data.unstack()
+        tm.assert_frame_equal(old_data, data)
+
+    def test_unstack_dtypes(self, using_infer_string):
+        # GH 2929
+        rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]]
+
+        df = DataFrame(rows, columns=list("ABCD"))
+        result = df.dtypes
+        expected = Series([np.dtype("int64")] * 4, index=list("ABCD"))
+        tm.assert_series_equal(result, expected)
+
+        # single dtype
+        df2 = df.set_index(["A", "B"])
+        df3 = df2.unstack("B")
+        result = df3.dtypes
+        expected = Series(
+            [np.dtype("int64")] * 4,
+            index=MultiIndex.from_arrays(
+                [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
+            ),
+        )
+        tm.assert_series_equal(result, expected)
+
+        # mixed
+        df2 = df.set_index(["A", "B"])
+        df2["C"] = 3.0
+        df3 = df2.unstack("B")
+        result = df3.dtypes
+        expected = Series(
+            [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2,
+            index=MultiIndex.from_arrays(
+                [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
+            ),
+        )
+        tm.assert_series_equal(result, expected)
+        df2["D"] = "foo"
+        df3 = df2.unstack("B")
+        result = df3.dtypes
+        dtype = (
+            pd.StringDtype(na_value=np.nan)
+            if using_infer_string
+            else np.dtype("object")
+        )
+        expected = Series(
+            [np.dtype("float64")] * 2 + [dtype] * 2,
+            index=MultiIndex.from_arrays(
+                [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
+            ),
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "c, d",
+        (
+            (np.zeros(5), np.zeros(5)),
+            (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")),
+        ),
+    )
+    def test_unstack_dtypes_mixed_date(self, c, d):
+        # GH7405
+        df = DataFrame(
+            {
+                "A": ["a"] * 5,
+                "C": c,
+                "D": d,
+                "B": date_range("2012-01-01", periods=5),
+            }
+        )
+
+        right = df.iloc[:3].copy(deep=True)
+
+        df = df.set_index(["A", "B"])
+        df["D"] = df["D"].astype("int64")
+
+        left = df.iloc[:3].unstack(0)
+        right = right.set_index(["A", "B"]).unstack(0)
+        right[("D", "a")] = right[("D", "a")].astype("int64")
+
+        assert left.shape == (3, 2)
+        tm.assert_frame_equal(left, right)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_non_unique_index_names(self, future_stack):
+        idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
+        df = DataFrame([1, 2], index=idx)
+        msg = "The name c1 occurs multiple times, use a level number"
+        with pytest.raises(ValueError, match=msg):
+            df.unstack("c1")
+
+        with pytest.raises(ValueError, match=msg):
+            df.T.stack("c1", future_stack=future_stack)
+
+    def test_unstack_unused_levels(self):
+        # GH 17845: unused codes in index make unstack() cast int to float
+        idx = MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1]
+        df = DataFrame([[1, 0]] * 3, index=idx)
+
+        result = df.unstack()
+        exp_col = MultiIndex.from_product([range(2), ["A", "B", "C"]])
+        expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col)
+        tm.assert_frame_equal(result, expected)
+        assert (result.columns.levels[1] == idx.levels[1]).all()
+
+        # Unused items on both levels
+        levels = [range(3), range(4)]
+        codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
+        idx = MultiIndex(levels, codes)
+        block = np.arange(4).reshape(2, 2)
+        df = DataFrame(np.concatenate([block, block + 4]), index=idx)
+        result = df.unstack()
+        expected = DataFrame(
+            np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx
+        )
+        tm.assert_frame_equal(result, expected)
+        assert (result.columns.levels[1] == idx.levels[1]).all()
+
+    @pytest.mark.parametrize(
+        "level, idces, col_level, idx_level",
+        (
+            (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]),
+            (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]),
+        ),
+    )
+    def test_unstack_unused_levels_mixed_with_nan(
+        self, level, idces, col_level, idx_level
+    ):
+        # With mixed dtype and NaN
+        levels = [["a", 2, "c"], [1, 3, 5, 7]]
+        codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
+        idx = MultiIndex(levels, codes)
+        data = np.arange(8)
+        df = DataFrame(data.reshape(4, 2), index=idx)
+
+        result = df.unstack(level=level)
+        exp_data = np.zeros(18) * np.nan
+        exp_data[idces] = data
+        cols = MultiIndex.from_product([range(2), col_level])
+        expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("cols", [["A", "C"], slice(None)])
+    def test_unstack_unused_level(self, cols):
+        # GH 18562 : unused codes on the unstacked level
+        df = DataFrame([[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"])
+
+        ind = df.set_index(["A", "B", "C"], drop=False)
+        selection = ind.loc[(slice(None), slice(None), "I"), cols]
+        result = selection.unstack()
+
+        expected = ind.iloc[[0]][cols]
+        expected.columns = MultiIndex.from_product(
+            [expected.columns, ["I"]], names=[None, "C"]
+        )
+        expected.index = expected.index.droplevel("C")
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_long_index(self):
+        # PH 32624: Error when using a lot of indices to unstack.
+        # The error occurred only, if a lot of indices are used.
+        df = DataFrame(
+            [[1]],
+            columns=MultiIndex.from_tuples([[0]], names=["c1"]),
+            index=MultiIndex.from_tuples(
+                [[0, 0, 1, 0, 0, 0, 1]],
+                names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
+            ),
+        )
+        result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
+        expected = DataFrame(
+            [[1]],
+            columns=MultiIndex.from_tuples(
+                [[0, 0, 1, 0, 0, 0, 1]],
+                names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
+            ),
+            index=Index([0], name="i1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_multi_level_cols(self):
+        # PH 24729: Unstack a df with multi level columns
+        df = DataFrame(
+            [[0.0, 0.0], [0.0, 0.0]],
+            columns=MultiIndex.from_tuples(
+                [["B", "C"], ["B", "D"]], names=["c1", "c2"]
+            ),
+            index=MultiIndex.from_tuples(
+                [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"]
+            ),
+        )
+        assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
+
+    def test_unstack_multi_level_rows_and_cols(self):
+        # PH 28306: Unstack df with multi level cols and rows
+        df = DataFrame(
+            [[1, 2], [3, 4], [-1, -2], [-3, -4]],
+            columns=MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
+            index=MultiIndex.from_tuples(
+                [
+                    ["m1", "P3", 222],
+                    ["m1", "A5", 111],
+                    ["m2", "P3", 222],
+                    ["m2", "A5", 111],
+                ],
+                names=["i1", "i2", "i3"],
+            ),
+        )
+        result = df.unstack(["i3", "i2"])
+        expected = df.unstack(["i3"]).unstack(["i2"])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("idx", [("jim", "joe"), ("joe", "jim")])
+    @pytest.mark.parametrize("lev", list(range(2)))
+    def test_unstack_nan_index1(self, idx, lev):
+        # GH7466
+        def cast(val):
+            val_str = "" if val != val else val
+            return f"{val_str:1}"
+
+        df = DataFrame(
+            {
+                "jim": ["a", "b", np.nan, "d"],
+                "joe": ["w", "x", "y", "z"],
+                "jolie": ["a.w", "b.x", " .y", "d.z"],
+            }
+        )
+
+        left = df.set_index(["jim", "joe"]).unstack()["jolie"]
+        right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
+        tm.assert_frame_equal(left, right)
+
+        mi = df.set_index(list(idx))
+        udf = mi.unstack(level=lev)
+        assert udf.notna().values.sum() == len(df)
+        mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
+        rows, cols = udf["jolie"].notna().values.nonzero()
+        for i, j in zip(rows, cols):
+            left = sorted(udf["jolie"].iloc[i, j].split("."))
+            right = mk_list(udf["jolie"].index[i]) + mk_list(udf["jolie"].columns[j])
+            right = sorted(map(cast, right))
+            assert left == right
+
+    @pytest.mark.parametrize("idx", itertools.permutations(["1st", "2nd", "3rd"]))
+    @pytest.mark.parametrize("lev", list(range(3)))
+    @pytest.mark.parametrize("col", ["4th", "5th"])
+    def test_unstack_nan_index_repeats(self, idx, lev, col):
+        def cast(val):
+            val_str = "" if val != val else val
+            return f"{val_str:1}"
+
+        df = DataFrame(
+            {
+                "1st": ["d"] * 3
+                + [np.nan] * 5
+                + ["a"] * 2
+                + ["c"] * 3
+                + ["e"] * 2
+                + ["b"] * 5,
+                "2nd": ["y"] * 2
+                + ["w"] * 3
+                + [np.nan] * 3
+                + ["z"] * 4
+                + [np.nan] * 3
+                + ["x"] * 3
+                + [np.nan] * 2,
+                "3rd": [
+                    67,
+                    39,
+                    53,
+                    72,
+                    57,
+                    80,
+                    31,
+                    18,
+                    11,
+                    30,
+                    59,
+                    50,
+                    62,
+                    59,
+                    76,
+                    52,
+                    14,
+                    53,
+                    60,
+                    51,
+                ],
+            }
+        )
+
+        df["4th"], df["5th"] = (
+            df.apply(lambda r: ".".join(map(cast, r)), axis=1),
+            df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
+        )
+
+        mi = df.set_index(list(idx))
+        udf = mi.unstack(level=lev)
+        assert udf.notna().values.sum() == 2 * len(df)
+        mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
+        rows, cols = udf[col].notna().values.nonzero()
+        for i, j in zip(rows, cols):
+            left = sorted(udf[col].iloc[i, j].split("."))
+            right = mk_list(udf[col].index[i]) + mk_list(udf[col].columns[j])
+            right = sorted(map(cast, right))
+            assert left == right
+
+    def test_unstack_nan_index2(self):
+        # GH7403
+        df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
+        # Explicit cast to avoid implicit cast when setting to np.nan
+        df = df.astype({"B": "float"})
+        df.iloc[3, 1] = np.nan
+        left = df.set_index(["A", "B"]).unstack(0)
+
+        vals = [
+            [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
+            [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
+        ]
+        vals = list(map(list, zip(*vals)))
+        idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
+        cols = MultiIndex(
+            levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
+        )
+
+        right = DataFrame(vals, columns=cols, index=idx)
+        tm.assert_frame_equal(left, right)
+
+        df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
+        # Explicit cast to avoid implicit cast when setting to np.nan
+        df = df.astype({"B": "float"})
+        df.iloc[2, 1] = np.nan
+        left = df.set_index(["A", "B"]).unstack(0)
+
+        vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
+        cols = MultiIndex(
+            levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
+        )
+        idx = Index([np.nan, 0, 1, 2, 3], name="B")
+        right = DataFrame(vals, columns=cols, index=idx)
+        tm.assert_frame_equal(left, right)
+
+        df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
+        # Explicit cast to avoid implicit cast when setting to np.nan
+        df = df.astype({"B": "float"})
+        df.iloc[3, 1] = np.nan
+        left = df.set_index(["A", "B"]).unstack(0)
+
+        vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
+        cols = MultiIndex(
+            levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
+        )
+        idx = Index([np.nan, 0, 1, 2, 3], name="B")
+        right = DataFrame(vals, columns=cols, index=idx)
+        tm.assert_frame_equal(left, right)
+
+    def test_unstack_nan_index3(self):
+        # GH7401
+        df = DataFrame(
+            {
+                "A": list("aaaaabbbbb"),
+                "B": (date_range("2012-01-01", periods=5).tolist() * 2),
+                "C": np.arange(10),
+            }
+        )
+
+        df.iloc[3, 1] = np.nan
+        left = df.set_index(["A", "B"]).unstack()
+
+        vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
+        idx = Index(["a", "b"], name="A")
+        cols = MultiIndex(
+            levels=[["C"], date_range("2012-01-01", periods=5)],
+            codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
+            names=[None, "B"],
+        )
+
+        right = DataFrame(vals, columns=cols, index=idx)
+        tm.assert_frame_equal(left, right)
+
+    def test_unstack_nan_index4(self):
+        # GH4862
+        vals = [
+            ["Hg", np.nan, np.nan, 680585148],
+            ["U", 0.0, np.nan, 680585148],
+            ["Pb", 7.07e-06, np.nan, 680585148],
+            ["Sn", 2.3614e-05, 0.0133, 680607017],
+            ["Ag", 0.0, 0.0133, 680607017],
+            ["Hg", -0.00015, 0.0133, 680607017],
+        ]
+        df = DataFrame(
+            vals,
+            columns=["agent", "change", "dosage", "s_id"],
+            index=[17263, 17264, 17265, 17266, 17267, 17268],
+        )
+
+        left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()
+
+        vals = [
+            [np.nan, np.nan, 7.07e-06, np.nan, 0.0],
+            [0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
+        ]
+
+        idx = MultiIndex(
+            levels=[[680585148, 680607017], [0.0133]],
+            codes=[[0, 1], [-1, 0]],
+            names=["s_id", "dosage"],
+        )
+
+        cols = MultiIndex(
+            levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
+            codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
+            names=[None, "agent"],
+        )
+
+        right = DataFrame(vals, columns=cols, index=idx)
+        tm.assert_frame_equal(left, right)
+
+        left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
+        tm.assert_frame_equal(left.unstack(), right)
+
+    def test_unstack_nan_index5(self):
+        # GH9497 - multiple unstack with nulls
+        df = DataFrame(
+            {
+                "1st": [1, 2, 1, 2, 1, 2],
+                "2nd": date_range("2014-02-01", periods=6, freq="D"),
+                "jim": 100 + np.arange(6),
+                "joe": (np.random.default_rng(2).standard_normal(6) * 10).round(2),
+            }
+        )
+
+        df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
+        df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
+        df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan
+
+        left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
+        assert left.notna().values.sum() == 2 * len(df)
+
+        for col in ["jim", "joe"]:
+            for _, r in df.iterrows():
+                key = r["1st"], (col, r["2nd"], r["3rd"])
+                assert r[col] == left.loc[key]
+
+    def test_stack_datetime_column_multiIndex(self, future_stack):
+        # GH 8039
+        t = datetime(2014, 1, 1)
+        df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
+        warn = None if future_stack else Pandas4Warning
+        msg = "The previous implementation of stack is deprecated"
+        with tm.assert_produces_warning(warn, match=msg):
+            result = df.stack(future_stack=future_stack)
+
+        eidx = MultiIndex.from_product([range(4), ("B",)])
+        ecols = MultiIndex.from_tuples([(t, "A")])
+        expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "multiindex_columns",
+        [
+            [0, 1, 2, 3, 4],
+            [0, 1, 2, 3],
+            [0, 1, 2, 4],
+            [0, 1, 2],
+            [1, 2, 3],
+            [2, 3, 4],
+            [0, 1],
+            [0, 2],
+            [0, 3],
+            [0],
+            [2],
+            [4],
+            [4, 3, 2, 1, 0],
+            [3, 2, 1, 0],
+            [4, 2, 1, 0],
+            [2, 1, 0],
+            [3, 2, 1],
+            [4, 3, 2],
+            [1, 0],
+            [2, 0],
+            [3, 0],
+        ],
+    )
+    @pytest.mark.parametrize("level", (-1, 0, 1, [0, 1], [1, 0]))
+    def test_stack_partial_multiIndex(self, multiindex_columns, level, future_stack):
+        # GH 8844
+        dropna = False if not future_stack else lib.no_default
+        full_multiindex = MultiIndex.from_tuples(
+            [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
+            names=["Upper", "Lower"],
+        )
+        multiindex = full_multiindex[multiindex_columns]
+        df = DataFrame(
+            np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
+            columns=multiindex,
+        )
+        result = df.stack(level=level, dropna=dropna, future_stack=future_stack)
+
+        if isinstance(level, int) and not future_stack:
+            # Stacking a single level should not make any all-NaN rows,
+            # so df.stack(level=level, dropna=False) should be the same
+            # as df.stack(level=level, dropna=True).
+            expected = df.stack(level=level, dropna=True, future_stack=future_stack)
+            if isinstance(expected, Series):
+                tm.assert_series_equal(result, expected)
+            else:
+                tm.assert_frame_equal(result, expected)
+
+        df.columns = MultiIndex.from_tuples(
+            df.columns.to_numpy(), names=df.columns.names
+        )
+        expected = df.stack(level=level, dropna=dropna, future_stack=future_stack)
+        if isinstance(expected, Series):
+            tm.assert_series_equal(result, expected)
+        else:
+            tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_full_multiIndex(self, future_stack):
+        # GH 8844
+        full_multiindex = MultiIndex.from_tuples(
+            [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
+            names=["Upper", "Lower"],
+        )
+        df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]])
+        dropna = False if not future_stack else lib.no_default
+        result = df.stack(dropna=dropna, future_stack=future_stack)
+        expected = DataFrame(
+            [[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
+            index=MultiIndex(
+                levels=[range(2), ["u", "x", "y", "z"]],
+                codes=[[0, 0, 1, 1], [1, 3, 1, 3]],
+                names=[None, "Lower"],
+            ),
+            columns=Index(["B", "C"], name="Upper"),
+        )
+        expected["B"] = expected["B"].astype(df.dtypes.iloc[0])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize("ordered", [False, True])
+    def test_stack_preserve_categorical_dtype(self, ordered, future_stack):
+        # GH13854
+        cidx = pd.CategoricalIndex(list("yxz"), categories=list("xyz"), ordered=ordered)
+        df = DataFrame([[10, 11, 12]], columns=cidx)
+        result = df.stack(future_stack=future_stack)
+
+        # `MultiIndex.from_product` preserves categorical dtype -
+        # it's tested elsewhere.
+        midx = MultiIndex.from_product([df.index, cidx])
+        expected = Series([10, 11, 12], index=midx)
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize("ordered", [False, True])
+    @pytest.mark.parametrize(
+        "labels,data",
+        [
+            (list("xyz"), [10, 11, 12, 13, 14, 15]),
+            (list("zyx"), [14, 15, 12, 13, 10, 11]),
+        ],
+    )
+    def test_stack_multi_preserve_categorical_dtype(
+        self, ordered, labels, data, future_stack
+    ):
+        # GH-36991
+        cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered)
+        cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered)
+        midx = MultiIndex.from_product([cidx, cidx2])
+        df = DataFrame([sorted(data)], columns=midx)
+        result = df.stack([0, 1], future_stack=future_stack)
+
+        labels = labels if future_stack else sorted(labels)
+        s_cidx = pd.CategoricalIndex(labels, ordered=ordered)
+        expected_data = sorted(data) if future_stack else data
+        expected = Series(
+            expected_data, index=MultiIndex.from_product([range(1), s_cidx, cidx2])
+        )
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_preserve_categorical_dtype_values(self, future_stack):
+        # GH-23077
+        cat = pd.Categorical(["a", "a", "b", "c"])
+        df = DataFrame({"A": cat, "B": cat})
+        result = df.stack(future_stack=future_stack)
+        index = MultiIndex.from_product([range(4), ["A", "B"]])
+        expected = Series(
+            pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "index",
+        [
+            [0, 0, 1, 1],
+            [0, 0, 2, 3],
+            [0, 1, 2, 3],
+        ],
+    )
+    def test_stack_multi_columns_non_unique_index(self, index, future_stack):
+        # GH-28301
+        columns = MultiIndex.from_product([[1, 2], ["a", "b"]])
+        df = DataFrame(index=index, columns=columns).fillna(1)
+        stacked = df.stack(future_stack=future_stack)
+        new_index = MultiIndex.from_tuples(stacked.index.to_numpy())
+        expected = DataFrame(
+            stacked.to_numpy(), index=new_index, columns=stacked.columns
+        )
+        tm.assert_frame_equal(stacked, expected)
+        stacked_codes = np.asarray(stacked.index.codes)
+        expected_codes = np.asarray(new_index.codes)
+        tm.assert_numpy_array_equal(stacked_codes, expected_codes)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "vals1, vals2, dtype1, dtype2, expected_dtype",
+        [
+            ([1, 2], [3.0, 4.0], "Int64", "Float64", "Float64"),
+            ([1, 2], ["foo", "bar"], "Int64", "string", "object"),
+        ],
+    )
+    def test_stack_multi_columns_mixed_extension_types(
+        self, vals1, vals2, dtype1, dtype2, expected_dtype, future_stack
+    ):
+        # GH45740
+        df = DataFrame(
+            {
+                ("A", 1): Series(vals1, dtype=dtype1),
+                ("A", 2): Series(vals2, dtype=dtype2),
+            }
+        )
+        result = df.stack(future_stack=future_stack)
+        expected = (
+            df.astype(object).stack(future_stack=future_stack).astype(expected_dtype)
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("level", [0, 1])
+    def test_unstack_mixed_extension_types(self, level):
+        index = MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 1)], names=["a", "b"])
+        df = DataFrame(
+            {
+                "A": pd.array([0, 1, None], dtype="Int64"),
+                "B": pd.Categorical(["a", "a", "b"]),
+            },
+            index=index,
+        )
+
+        result = df.unstack(level=level)
+        expected = df.astype(object).unstack(level=level)
+        if level == 0:
+            expected[("A", "B")] = expected[("A", "B")].fillna(pd.NA)
+        else:
+            expected[("A", 0)] = expected[("A", 0)].fillna(pd.NA)
+
+        expected_dtypes = Series(
+            [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns
+        )
+        tm.assert_series_equal(result.dtypes, expected_dtypes)
+        tm.assert_frame_equal(result.astype(object), expected)
+
+    @pytest.mark.parametrize("level", [0, "baz"])
+    def test_unstack_swaplevel_sortlevel(self, level):
+        # GH 20994
+        mi = MultiIndex.from_product([range(1), ["d", "c"]], names=["bar", "baz"])
+        df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"])
+        df.columns.name = "foo"
+
+        expected = DataFrame(
+            [[3, 1, 2, 0]],
+            columns=MultiIndex.from_tuples(
+                [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"]
+            ),
+        )
+        expected.index.name = "bar"
+
+        result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["float64", "Float64"])
+def test_unstack_sort_false(frame_or_series, dtype):
+    # GH 15105
+    index = MultiIndex.from_tuples(
+        [("two", "z", "b"), ("two", "y", "a"), ("one", "z", "b"), ("one", "y", "a")]
+    )
+    obj = frame_or_series(np.arange(1.0, 5.0), index=index, dtype=dtype)
+
+    result = obj.unstack(level=0, sort=False)
+
+    if frame_or_series is DataFrame:
+        expected_columns = MultiIndex.from_tuples([(0, "two"), (0, "one")])
+    else:
+        expected_columns = ["two", "one"]
+    expected = DataFrame(
+        [[1.0, 3.0], [2.0, 4.0]],
+        index=MultiIndex.from_tuples([("z", "b"), ("y", "a")]),
+        columns=expected_columns,
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = obj.unstack(level=-1, sort=False)
+
+    if frame_or_series is DataFrame:
+        expected_columns = MultiIndex(
+            levels=[range(1), ["b", "a"]], codes=[[0, 0], [0, 1]]
+        )
+    else:
+        expected_columns = ["b", "a"]
+
+    item = pd.NA if dtype == "Float64" else np.nan
+    expected = DataFrame(
+        [[1.0, item], [item, 2.0], [3.0, item], [item, 4.0]],
+        columns=expected_columns,
+        index=MultiIndex.from_tuples(
+            [("two", "z"), ("two", "y"), ("one", "z"), ("one", "y")]
+        ),
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = obj.unstack(level=[1, 2], sort=False)
+
+    if frame_or_series is DataFrame:
+        expected_columns = MultiIndex(
+            levels=[range(1), ["z", "y"], ["b", "a"]], codes=[[0, 0], [0, 1], [0, 1]]
+        )
+    else:
+        expected_columns = MultiIndex.from_tuples([("z", "b"), ("y", "a")])
+    expected = DataFrame(
+        [[1.0, 2.0], [3.0, 4.0]],
+        index=["two", "one"],
+        columns=expected_columns,
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "levels2, expected_columns",
+    [
+        (
+            [None, 1, 2, 3],
+            [("value", np.nan), ("value", 1), ("value", 2), ("value", 3)],
+        ),
+        (
+            [1, None, 2, 3],
+            [("value", 1), ("value", np.nan), ("value", 2), ("value", 3)],
+        ),
+        (
+            [1, 2, None, 3],
+            [("value", 1), ("value", 2), ("value", np.nan), ("value", 3)],
+        ),
+        (
+            [1, 2, 3, None],
+            [("value", 1), ("value", 2), ("value", 3), ("value", np.nan)],
+        ),
+    ],
+    ids=["nan=first", "nan=second", "nan=third", "nan=last"],
+)
+def test_unstack_sort_false_nan(levels2, expected_columns):
+    # GH#61221
+    levels1 = ["b", "a"]
+    index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
+    df = DataFrame({"value": [0, 1, 2, 3, 4, 5, 6, 7]}, index=index)
+    result = df.unstack(level="level2", sort=False)
+    expected_data = [[0, 4], [1, 5], [2, 6], [3, 7]]
+    expected = DataFrame(
+        dict(zip(expected_columns, expected_data)),
+        index=Index(["b", "a"], name="level1"),
+        columns=MultiIndex.from_tuples(expected_columns, names=[None, "level2"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_unstack_fill_frame_object():
+    # GH12815 Test unstacking with object.
+    data = Series(["a", "b", "c", "a"], dtype="object")
+    data.index = MultiIndex.from_tuples(
+        [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
+    )
+
+    # By default missing values will be NaN
+    result = data.unstack()
+    expected = DataFrame(
+        {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]},
+        index=list("xyz"),
+        dtype=object,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # Fill with any value replaces missing values as expected
+    result = data.unstack(fill_value="d")
+    expected = DataFrame(
+        {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz"), dtype=object
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_unstack_timezone_aware_values():
+    # GH 18338
+    df = DataFrame(
+        {
+            "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")],
+            "a": ["a"],
+            "b": ["b"],
+            "c": ["c"],
+        },
+        columns=["timestamp", "a", "b", "c"],
+    )
+    result = df.set_index(["a", "b"]).unstack()
+    expected = DataFrame(
+        [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]],
+        index=Index(["a"], name="a"),
+        columns=MultiIndex(
+            levels=[["timestamp", "c"], ["b"]],
+            codes=[[0, 1], [0, 0]],
+            names=[None, "b"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+def test_stack_timezone_aware_values(future_stack):
+    # GH 19420
+    ts = date_range(freq="D", start="20180101", end="20180103", tz="America/New_York")
+    df = DataFrame({"A": ts}, index=["a", "b", "c"])
+    result = df.stack(future_stack=future_stack)
+    expected = Series(
+        ts,
+        index=MultiIndex(levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+@pytest.mark.parametrize("dropna", [True, False, lib.no_default])
+def test_stack_empty_frame(dropna, future_stack):
+    # GH 36113
+    levels = [pd.RangeIndex(0), pd.RangeIndex(0)]
+    expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
+    if future_stack and dropna is not lib.no_default:
+        with pytest.raises(ValueError, match="dropna must be unspecified"):
+            DataFrame(dtype=np.float64).stack(dropna=dropna, future_stack=future_stack)
+    else:
+        result = DataFrame(dtype=np.float64).stack(
+            dropna=dropna, future_stack=future_stack
+        )
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+@pytest.mark.parametrize("dropna", [True, False, lib.no_default])
+def test_stack_empty_level(dropna, future_stack, int_frame):
+    # GH 60740
+    if future_stack and dropna is not lib.no_default:
+        with pytest.raises(ValueError, match="dropna must be unspecified"):
+            DataFrame(dtype=np.int64).stack(dropna=dropna, future_stack=future_stack)
+    else:
+        expected = int_frame
+        result = int_frame.copy().stack(
+            level=[], dropna=dropna, future_stack=future_stack
+        )
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame()
+        result = DataFrame().stack(level=[], dropna=dropna, future_stack=future_stack)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+@pytest.mark.parametrize("dropna", [True, False, lib.no_default])
+@pytest.mark.parametrize("fill_value", [None, 0])
+def test_stack_unstack_empty_frame(dropna, fill_value, future_stack):
+    # GH 36113
+    if future_stack and dropna is not lib.no_default:
+        with pytest.raises(ValueError, match="dropna must be unspecified"):
+            DataFrame(dtype=np.int64).stack(
+                dropna=dropna, future_stack=future_stack
+            ).unstack(fill_value=fill_value)
+    else:
+        result = (
+            DataFrame(dtype=np.int64)
+            .stack(dropna=dropna, future_stack=future_stack)
+            .unstack(fill_value=fill_value)
+        )
+        expected = DataFrame(dtype=np.int64)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_unstack_single_index_series():
+    # GH 36113
+    msg = r"index must be a MultiIndex to unstack.*"
+    with pytest.raises(ValueError, match=msg):
+        Series(dtype=np.int64).unstack()
+
+
+def test_unstacking_multi_index_df():
+    # see gh-30740
+    df = DataFrame(
+        {
+            "name": ["Alice", "Bob"],
+            "score": [9.5, 8],
+            "employed": [False, True],
+            "kids": [0, 0],
+            "gender": ["female", "male"],
+        }
+    )
+    df = df.set_index(["name", "employed", "kids", "gender"])
+    df = df.unstack(["gender"], fill_value=0)
+    expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0)
+    result = df.unstack(["employed", "kids"], fill_value=0)
+    expected = DataFrame(
+        [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]],
+        index=Index(["Alice", "Bob"], name="name"),
+        columns=MultiIndex.from_tuples(
+            [
+                ("score", "female", False, 0),
+                ("score", "female", True, 0),
+                ("score", "male", False, 0),
+                ("score", "male", True, 0),
+            ],
+            names=[None, "gender", "employed", "kids"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+def test_stack_positional_level_duplicate_column_names(future_stack):
+    # https://github.com/pandas-dev/pandas/issues/36353
+    columns = MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"])
+    df = DataFrame([[1, 1, 1, 1]], columns=columns)
+    result = df.stack(0, future_stack=future_stack)
+
+    new_columns = Index(["y", "z"], name="a")
+    new_index = MultiIndex(
+        levels=[range(1), ["x", "y"]], codes=[[0, 0], [0, 1]], names=[None, "a"]
+    )
+    expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_unstack_non_slice_like_blocks():
+    # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like
+
+    mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
+    df = DataFrame(
+        {
+            0: np.random.default_rng(2).standard_normal(15),
+            1: np.random.default_rng(2).standard_normal(15).astype(np.int64),
+            2: np.random.default_rng(2).standard_normal(15),
+            3: np.random.default_rng(2).standard_normal(15),
+        },
+        index=mi,
+    )
+    assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)
+
+    res = df.unstack()
+
+    expected = pd.concat([df[n].unstack() for n in range(4)], keys=range(4), axis=1)
+    tm.assert_frame_equal(res, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+def test_stack_sort_false(future_stack):
+    # GH 15105
+    data = [[1, 2, 3.0, 4.0], [2, 3, 4.0, 5.0], [3, 4, np.nan, np.nan]]
+    df = DataFrame(
+        data,
+        columns=MultiIndex(
+            levels=[["B", "A"], ["x", "y"]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]]
+        ),
+    )
+    kwargs = {} if future_stack else {"sort": False}
+    result = df.stack(level=0, future_stack=future_stack, **kwargs)
+    if future_stack:
+        expected = DataFrame(
+            {
+                "x": [1.0, 3.0, 2.0, 4.0, 3.0, np.nan],
+                "y": [2.0, 4.0, 3.0, 5.0, 4.0, np.nan],
+            },
+            index=MultiIndex.from_arrays(
+                [[0, 0, 1, 1, 2, 2], ["B", "A", "B", "A", "B", "A"]]
+            ),
+        )
+    else:
+        expected = DataFrame(
+            {"x": [1.0, 3.0, 2.0, 4.0, 3.0], "y": [2.0, 4.0, 3.0, 5.0, 4.0]},
+            index=MultiIndex.from_arrays([[0, 0, 1, 1, 2], ["B", "A", "B", "A", "B"]]),
+        )
+    tm.assert_frame_equal(result, expected)
+
+    # Codes sorted in this call
+    df = DataFrame(
+        data,
+        columns=MultiIndex.from_arrays([["B", "B", "A", "A"], ["x", "y", "x", "y"]]),
+    )
+    kwargs = {} if future_stack else {"sort": False}
+    result = df.stack(level=0, future_stack=future_stack, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+def test_stack_sort_false_multi_level(future_stack):
+    # GH 15105
+    idx = MultiIndex.from_tuples([("weight", "kg"), ("height", "m")])
+    df = DataFrame([[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=idx)
+    kwargs = {} if future_stack else {"sort": False}
+    result = df.stack([0, 1], future_stack=future_stack, **kwargs)
+    expected_index = MultiIndex.from_tuples(
+        [
+            ("cat", "weight", "kg"),
+            ("cat", "height", "m"),
+            ("dog", "weight", "kg"),
+            ("dog", "height", "m"),
+        ]
+    )
+    expected = Series([1.0, 2.0, 3.0, 4.0], index=expected_index)
+    tm.assert_series_equal(result, expected)
+
+
+class TestStackUnstackMultiLevel:
+    def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
+        # just check that it works for now
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack()
+        unstacked.unstack()
+
+        # test that ints work
+        ymd.astype(int).unstack()
+
+        # test that int32 work
+        ymd.astype(np.int32).unstack()
+
+    @pytest.mark.parametrize(
+        "result_rows,result_columns,index_product,expected_row",
+        [
+            (
+                [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]],
+                ["ix1", "ix2", "col1", "col2", "col3", "col4"],
+                2,
+                [None, None, 30.0, None],
+            ),
+            (
+                [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]],
+                ["ix1", "ix2", "col1", "col2", "col3"],
+                2,
+                [None, None, 30.0],
+            ),
+            (
+                [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
+                ["ix1", "ix2", "col1", "col2", "col3"],
+                None,
+                [None, None, 30.0],
+            ),
+        ],
+    )
+    def test_unstack_partial(
+        self, result_rows, result_columns, index_product, expected_row
+    ):
+        # check for regressions on this issue:
+        # https://github.com/pandas-dev/pandas/issues/19351
+        # make sure DataFrame.unstack() works when its run on a subset of the DataFrame
+        # and the Index levels contain values that are not present in the subset
+        result = DataFrame(result_rows, columns=result_columns).set_index(
+            ["ix1", "ix2"]
+        )
+        result = result.iloc[1:2].unstack("ix2")
+        expected = DataFrame(
+            [expected_row],
+            columns=MultiIndex.from_product(
+                [result_columns[2:], [index_product]], names=[None, "ix2"]
+            ),
+            index=Index([2], name="ix1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_multiple_no_empty_columns(self):
+        index = MultiIndex.from_tuples(
+            [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)]
+        )
+
+        s = Series(np.random.default_rng(2).standard_normal(4), index=index)
+
+        unstacked = s.unstack([1, 2])
+        expected = unstacked.dropna(axis=1, how="all")
+        tm.assert_frame_equal(unstacked, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack(self, multiindex_year_month_day_dataframe_random_data, future_stack):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        # regular roundtrip
+        unstacked = ymd.unstack()
+        restacked = unstacked.stack(future_stack=future_stack)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            restacked = restacked.dropna(how="all")
+        tm.assert_frame_equal(restacked, ymd)
+
+        unlexsorted = ymd.sort_index(level=2)
+
+        unstacked = unlexsorted.unstack(2)
+        restacked = unstacked.stack(future_stack=future_stack)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            restacked = restacked.dropna(how="all")
+        tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
+
+        unlexsorted = unlexsorted[::-1]
+        unstacked = unlexsorted.unstack(1)
+        restacked = unstacked.stack(future_stack=future_stack).swaplevel(1, 2)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            restacked = restacked.dropna(how="all")
+        tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
+
+        unlexsorted = unlexsorted.swaplevel(0, 1)
+        unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
+        restacked = unstacked.stack(0, future_stack=future_stack).swaplevel(1, 2)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            restacked = restacked.dropna(how="all")
+        tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
+
+        # columns unsorted
+        unstacked = ymd.unstack()
+        restacked = unstacked.stack(future_stack=future_stack)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            restacked = restacked.dropna(how="all")
+        tm.assert_frame_equal(restacked, ymd)
+
+        # more than 2 levels in the columns
+        unstacked = ymd.unstack(1).unstack(1)
+
+        result = unstacked.stack(1, future_stack=future_stack)
+        expected = ymd.unstack()
+        tm.assert_frame_equal(result, expected)
+
+        result = unstacked.stack(2, future_stack=future_stack)
+        expected = ymd.unstack(1)
+        tm.assert_frame_equal(result, expected)
+
+        result = unstacked.stack(0, future_stack=future_stack)
+        expected = ymd.stack(future_stack=future_stack).unstack(1).unstack(1)
+        tm.assert_frame_equal(result, expected)
+
+        # not all levels present in each echelon
+        unstacked = ymd.unstack(2).loc[:, ::3]
+        stacked = unstacked.stack(future_stack=future_stack).stack(
+            future_stack=future_stack
+        )
+        ymd_stacked = ymd.stack(future_stack=future_stack)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            stacked = stacked.dropna(how="all")
+            ymd_stacked = ymd_stacked.dropna(how="all")
+        tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
+
+        # stack with negative number
+        result = ymd.unstack(0).stack(-2, future_stack=future_stack)
+        expected = ymd.unstack(0).stack(0, future_stack=future_stack)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "idx, exp_idx",
+        [
+            [
+                list("abab"),
+                MultiIndex(
+                    levels=[["a", "b"], ["1st", "2nd"]],
+                    codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)],
+                ),
+            ],
+            [
+                MultiIndex.from_tuples((("a", 2), ("b", 1), ("a", 1), ("b", 2))),
+                MultiIndex(
+                    levels=[["a", "b"], [1, 2], ["1st", "2nd"]],
+                    codes=[
+                        np.tile(np.arange(2).repeat(3), 2),
+                        np.repeat([1, 0, 1], [3, 6, 3]),
+                        np.tile([0, 1, 0], 4),
+                    ],
+                ),
+            ],
+        ],
+    )
+    def test_stack_duplicate_index(self, idx, exp_idx, future_stack):
+        # GH10417
+        df = DataFrame(
+            np.arange(12).reshape(4, 3),
+            index=idx,
+            columns=["1st", "2nd", "1st"],
+        )
+        if future_stack:
+            msg = "Columns with duplicate values are not supported in stack"
+            with pytest.raises(ValueError, match=msg):
+                df.stack(future_stack=future_stack)
+        else:
+            result = df.stack(future_stack=future_stack)
+            expected = Series(np.arange(12), index=exp_idx)
+            tm.assert_series_equal(result, expected)
+            assert result.index.is_unique is False
+            li, ri = result.index, expected.index
+            tm.assert_index_equal(li, ri)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_odd_failure(self, future_stack):
+        mi = MultiIndex.from_arrays(
+            [
+                ["Fri"] * 4 + ["Sat"] * 2 + ["Sun"] * 2 + ["Thu"] * 3,
+                ["Dinner"] * 2 + ["Lunch"] * 2 + ["Dinner"] * 5 + ["Lunch"] * 2,
+                ["No", "Yes"] * 4 + ["No", "No", "Yes"],
+            ],
+            names=["day", "time", "smoker"],
+        )
+        df = DataFrame(
+            {
+                "sum": np.arange(11, dtype="float64"),
+                "len": np.arange(11, dtype="float64"),
+            },
+            index=mi,
+        )
+        # it works, #2100
+        result = df.unstack(2)
+
+        recons = result.stack(future_stack=future_stack)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            recons = recons.dropna(how="all")
+        tm.assert_frame_equal(recons, df)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_mixed_dtype(self, multiindex_dataframe_random_data, future_stack):
+        frame = multiindex_dataframe_random_data
+
+        df = frame.T
+        df["foo", "four"] = "foo"
+        df = df.sort_index(level=1, axis=1)
+
+        stacked = df.stack(future_stack=future_stack)
+        result = df["foo"].stack(future_stack=future_stack).sort_index()
+        tm.assert_series_equal(stacked["foo"], result, check_names=False)
+        assert result.name is None
+        assert stacked["bar"].dtype == np.float64
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_bug(self, future_stack):
+        df = DataFrame(
+            {
+                "state": ["naive", "naive", "naive", "active", "active", "active"],
+                "exp": ["a", "b", "b", "b", "a", "a"],
+                "barcode": [1, 2, 3, 4, 1, 3],
+                "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
+                "extra": np.arange(6.0),
+            }
+        )
+
+        result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
+        unstacked = result.unstack()
+        restacked = unstacked.stack(future_stack=future_stack)
+        tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float))
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_unstack_preserve_names(
+        self, multiindex_dataframe_random_data, future_stack
+    ):
+        frame = multiindex_dataframe_random_data
+
+        unstacked = frame.unstack()
+        assert unstacked.index.name == "first"
+        assert unstacked.columns.names == ["exp", "second"]
+
+        restacked = unstacked.stack(future_stack=future_stack)
+        assert restacked.index.names == frame.index.names
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize("method", ["stack", "unstack"])
+    def test_stack_unstack_wrong_level_name(
+        self, method, multiindex_dataframe_random_data, future_stack
+    ):
+        # GH 18303 - wrong level name should raise
+        frame = multiindex_dataframe_random_data
+
+        # A DataFrame with flat axes:
+        df = frame.loc["foo"]
+
+        kwargs = {"future_stack": future_stack} if method == "stack" else {}
+        with pytest.raises(KeyError, match="does not match index name"):
+            getattr(df, method)("mistake", **kwargs)
+
+        if method == "unstack":
+            # Same on a Series:
+            s = df.iloc[:, 0]
+            with pytest.raises(KeyError, match="does not match index name"):
+                getattr(s, method)("mistake", **kwargs)
+
+    def test_unstack_level_name(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        result = frame.unstack("second")
+        expected = frame.unstack(level=1)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_level_name(self, multiindex_dataframe_random_data, future_stack):
+        frame = multiindex_dataframe_random_data
+
+        unstacked = frame.unstack("second")
+        result = unstacked.stack("exp", future_stack=future_stack)
+        expected = frame.unstack().stack(0, future_stack=future_stack)
+        tm.assert_frame_equal(result, expected)
+
+        result = frame.stack("exp", future_stack=future_stack)
+        expected = frame.stack(future_stack=future_stack)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_unstack_multiple(
+        self, multiindex_year_month_day_dataframe_random_data, future_stack
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack(["year", "month"])
+        expected = ymd.unstack("year").unstack("month")
+        tm.assert_frame_equal(unstacked, expected)
+        assert unstacked.columns.names == expected.columns.names
+
+        # series
+        s = ymd["A"]
+        s_unstacked = s.unstack(["year", "month"])
+        tm.assert_frame_equal(s_unstacked, expected["A"])
+
+        restacked = unstacked.stack(["year", "month"], future_stack=future_stack)
+        if future_stack:
+            # NA values in unstacked persist to restacked in version 3
+            restacked = restacked.dropna(how="all")
+        restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
+        restacked = restacked.sort_index(level=0)
+
+        tm.assert_frame_equal(restacked, ymd)
+        assert restacked.index.names == ymd.index.names
+
+        # GH #451
+        unstacked = ymd.unstack([1, 2])
+        expected = ymd.unstack(1).unstack(1).dropna(axis=1, how="all")
+        tm.assert_frame_equal(unstacked, expected)
+
+        unstacked = ymd.unstack([2, 1])
+        expected = ymd.unstack(2).unstack(1).dropna(axis=1, how="all")
+        tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_names_and_numbers(
+        self, multiindex_year_month_day_dataframe_random_data, future_stack
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack(["year", "month"])
+
+        # Can't use mixture of names and numbers to stack
+        with pytest.raises(ValueError, match="level should contain"):
+            unstacked.stack([0, "month"], future_stack=future_stack)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_multiple_out_of_bounds(
+        self, multiindex_year_month_day_dataframe_random_data, future_stack
+    ):
+        # nlevels == 3
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        unstacked = ymd.unstack(["year", "month"])
+
+        with pytest.raises(IndexError, match="Too many levels"):
+            unstacked.stack([2, 3], future_stack=future_stack)
+        with pytest.raises(IndexError, match="not a valid level number"):
+            unstacked.stack([-4, -3], future_stack=future_stack)
+
+    def test_unstack_period_series(self):
+        # GH4342
+        idx1 = pd.PeriodIndex(
+            ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
+            freq="M",
+            name="period",
+        )
+        idx2 = Index(["A", "B"] * 3, name="str")
+        value = [1, 2, 3, 4, 5, 6]
+
+        idx = MultiIndex.from_arrays([idx1, idx2])
+        s = Series(value, index=idx)
+
+        result1 = s.unstack()
+        result2 = s.unstack(level=1)
+        result3 = s.unstack(level=0)
+
+        e_idx = pd.PeriodIndex(
+            ["2013-01", "2013-02", "2013-03"], freq="M", name="period"
+        )
+        expected = DataFrame(
+            {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"]
+        )
+        expected.columns.name = "str"
+
+        tm.assert_frame_equal(result1, expected)
+        tm.assert_frame_equal(result2, expected)
+        tm.assert_frame_equal(result3, expected.T)
+
+        idx1 = pd.PeriodIndex(
+            ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
+            freq="M",
+            name="period1",
+        )
+
+        idx2 = pd.PeriodIndex(
+            ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"],
+            freq="M",
+            name="period2",
+        )
+        idx = MultiIndex.from_arrays([idx1, idx2])
+        s = Series(value, index=idx)
+
+        result1 = s.unstack()
+        result2 = s.unstack(level=1)
+        result3 = s.unstack(level=0)
+
+        e_idx = pd.PeriodIndex(
+            ["2013-01", "2013-02", "2013-03"], freq="M", name="period1"
+        )
+        e_cols = pd.PeriodIndex(
+            ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"],
+            freq="M",
+            name="period2",
+        )
+        expected = DataFrame(
+            [
+                [np.nan, np.nan, np.nan, np.nan, 2, 1],
+                [np.nan, np.nan, 4, 3, np.nan, np.nan],
+                [6, 5, np.nan, np.nan, np.nan, np.nan],
+            ],
+            index=e_idx,
+            columns=e_cols,
+        )
+
+        tm.assert_frame_equal(result1, expected)
+        tm.assert_frame_equal(result2, expected)
+        tm.assert_frame_equal(result3, expected.T)
+
+    def test_unstack_period_frame(self):
+        # GH4342
+        idx1 = pd.PeriodIndex(
+            ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"],
+            freq="M",
+            name="period1",
+        )
+        idx2 = pd.PeriodIndex(
+            ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"],
+            freq="M",
+            name="period2",
+        )
+        value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]}
+        idx = MultiIndex.from_arrays([idx1, idx2])
+        df = DataFrame(value, index=idx)
+
+        result1 = df.unstack()
+        result2 = df.unstack(level=1)
+        result3 = df.unstack(level=0)
+
+        e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1")
+        e_2 = pd.PeriodIndex(
+            ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"],
+            freq="M",
+            name="period2",
+        )
+        e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2])
+        expected = DataFrame(
+            [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols
+        )
+
+        tm.assert_frame_equal(result1, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        e_1 = pd.PeriodIndex(
+            ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1"
+        )
+        e_2 = pd.PeriodIndex(
+            ["2013-10", "2013-12", "2014-02"], freq="M", name="period2"
+        )
+        e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1])
+        expected = DataFrame(
+            [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols
+        )
+
+        tm.assert_frame_equal(result3, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_multiple_bug(self, future_stack, using_infer_string):
+        # bug when some uniques are not present in the data GH#3170
+        id_col = ([1] * 3) + ([2] * 3)
+        name = (["a"] * 3) + (["b"] * 3)
+        date = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2)
+        var1 = np.random.default_rng(2).integers(0, 100, 6)
+        df = DataFrame({"ID": id_col, "NAME": name, "DATE": date, "VAR1": var1})
+
+        multi = df.set_index(["DATE", "ID"])
+        multi.columns.name = "Params"
+        unst = multi.unstack("ID")
+        msg = re.escape("agg function failed [how->mean,dtype->")
+        if using_infer_string:
+            msg = "dtype 'str' does not support operation 'mean'"
+        with pytest.raises(TypeError, match=msg):
+            unst.resample("W-THU").mean()
+        down = unst.resample("W-THU").mean(numeric_only=True)
+        rs = down.stack("ID", future_stack=future_stack)
+        xp = (
+            unst.loc[:, ["VAR1"]]
+            .resample("W-THU")
+            .mean()
+            .stack("ID", future_stack=future_stack)
+        )
+        xp.columns.name = "Params"
+        tm.assert_frame_equal(rs, xp)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_dropna(self, future_stack):
+        # GH#3997
+        df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]})
+        df = df.set_index(["A", "B"])
+
+        dropna = False if not future_stack else lib.no_default
+        stacked = df.unstack().stack(dropna=dropna, future_stack=future_stack)
+        assert len(stacked) > len(stacked.dropna())
+
+        if future_stack:
+            with pytest.raises(ValueError, match="dropna must be unspecified"):
+                df.unstack().stack(dropna=True, future_stack=future_stack)
+        else:
+            stacked = df.unstack().stack(dropna=True, future_stack=future_stack)
+            tm.assert_frame_equal(stacked, stacked.dropna())
+
+    def test_unstack_multiple_hierarchical(self, future_stack):
+        df = DataFrame(
+            index=[
+                [0, 0, 0, 0, 1, 1, 1, 1],
+                [0, 0, 1, 1, 0, 0, 1, 1],
+                [0, 1, 0, 1, 0, 1, 0, 1],
+            ],
+            columns=[[0, 0, 1, 1], [0, 1, 0, 1]],
+        )
+
+        df.index.names = ["a", "b", "c"]
+        df.columns.names = ["d", "e"]
+
+        # it works!
+        df.unstack(["b", "c"])
+
+    def test_unstack_sparse_keyspace(self):
+        # memory problems with naive impl GH#2278
+        # Generate Long File & Test Pivot
+        NUM_ROWS = 1000
+
+        df = DataFrame(
+            {
+                "A": np.random.default_rng(2).integers(100, size=NUM_ROWS),
+                "B": np.random.default_rng(3).integers(300, size=NUM_ROWS),
+                "C": np.random.default_rng(4).integers(-7, 7, size=NUM_ROWS),
+                "D": np.random.default_rng(5).integers(-19, 19, size=NUM_ROWS),
+                "E": np.random.default_rng(6).integers(3000, size=NUM_ROWS),
+                "F": np.random.default_rng(7).standard_normal(NUM_ROWS),
+            }
+        )
+
+        idf = df.set_index(["A", "B", "C", "D", "E"])
+
+        # it works! is sufficient
+        idf.unstack("E")
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_unobserved_keys(self, future_stack):
+        # related to GH#2278 refactoring
+        levels = [[0, 1], [0, 1, 2, 3]]
+        codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
+
+        index = MultiIndex(levels, codes)
+
+        df = DataFrame(np.random.default_rng(2).standard_normal((4, 2)), index=index)
+
+        result = df.unstack()
+        assert len(result.columns) == 4
+
+        recons = result.stack(future_stack=future_stack)
+        tm.assert_frame_equal(recons, df)
+
+    @pytest.mark.slow
+    def test_unstack_number_of_levels_larger_than_int32_warns(
+        self, performance_warning, monkeypatch
+    ):
+        # GH#20601
+        # GH 26314: Change ValueError to PerformanceWarning
+
+        class MockUnstacker(reshape_lib._Unstacker):
+            def __init__(self, *args, **kwargs) -> None:
+                # __init__ will raise the warning
+                super().__init__(*args, **kwargs)
+                raise Exception("Don't compute final result.")
+
+            def _make_selectors(self) -> None:
+                pass
+
+        with monkeypatch.context() as m:
+            m.setattr(reshape_lib, "_Unstacker", MockUnstacker)
+            df = DataFrame(
+                np.zeros((2**16, 2)),
+                index=[np.arange(2**16), np.arange(2**16)],
+            )
+            msg = "The following operation may generate"
+            with tm.assert_produces_warning(performance_warning, match=msg):
+                with pytest.raises(Exception, match="Don't compute final result."):
+                    df.unstack()
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    @pytest.mark.parametrize(
+        "levels",
+        itertools.chain.from_iterable(
+            itertools.product(itertools.permutations([0, 1, 2], width), repeat=2)
+            for width in [2, 3]
+        ),
+    )
+    @pytest.mark.parametrize("stack_lev", range(2))
+    def test_stack_order_with_unsorted_levels(
+        self, levels, stack_lev, sort, future_stack
+    ):
+        # GH#16323
+        # deep check for 1-row case
+        columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        df = DataFrame(columns=columns, data=[range(4)])
+        kwargs = {} if future_stack else {"sort": sort}
+        df_stacked = df.stack(stack_lev, future_stack=future_stack, **kwargs)
+        for row in df.index:
+            for col in df.columns:
+                expected = df.loc[row, col]
+                result_row = row, col[stack_lev]
+                result_col = col[1 - stack_lev]
+                result = df_stacked.loc[result_row, result_col]
+                assert result == expected
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_order_with_unsorted_levels_multi_row(self, future_stack):
+        # GH#16323
+
+        # check multi-row case
+        mi = MultiIndex(
+            levels=[["A", "C", "B"], ["B", "A", "C"]],
+            codes=[np.repeat(range(3), 3), np.tile(range(3), 3)],
+        )
+        df = DataFrame(
+            columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)
+        )
+        assert all(
+            df.loc[row, col]
+            == df.stack(0, future_stack=future_stack).loc[(row, col[0]), col[1]]
+            for row in df.index
+            for col in df.columns
+        )
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_order_with_unsorted_levels_multi_row_2(self, future_stack):
+        # GH#53636
+        levels = ((0, 1), (1, 0))
+        stack_lev = 1
+        columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
+        df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
+        kwargs = {} if future_stack else {"sort": True}
+        result = df.stack(stack_lev, future_stack=future_stack, **kwargs)
+        expected_index = MultiIndex(
+            levels=[[0, 1, 2, 3], [0, 1]],
+            codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],
+        )
+        expected = DataFrame(
+            {
+                0: [0, 1, 0, 1, 0, 1, 0, 1],
+                1: [2, 3, 2, 3, 2, 3, 2, 3],
+            },
+            index=expected_index,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_unstack_unordered_multiindex(self, future_stack):
+        # GH# 18265
+        values = np.arange(5)
+        data = np.vstack(
+            [
+                [f"b{x}" for x in values],  # b0, b1, ..
+                [f"a{x}" for x in values],  # a0, a1, ..
+            ]
+        )
+        df = DataFrame(data.T, columns=["b", "a"])
+        df.columns.name = "first"
+        second_level_dict = {"x": df}
+        multi_level_df = pd.concat(second_level_dict, axis=1)
+        multi_level_df.columns.names = ["second", "first"]
+        df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1)
+        result = df.stack(["first", "second"], future_stack=future_stack).unstack(
+            ["first", "second"]
+        )
+        expected = DataFrame(
+            [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]],
+            index=range(5),
+            columns=MultiIndex.from_tuples(
+                [("a", "x"), ("b", "x")], names=["first", "second"]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_preserve_types(
+        self, multiindex_year_month_day_dataframe_random_data, using_infer_string
+    ):
+        # GH#403
+        ymd = multiindex_year_month_day_dataframe_random_data
+        ymd["E"] = "foo"
+        ymd["F"] = 2
+
+        unstacked = ymd.unstack("month")
+        assert unstacked["A", 1].dtype == np.float64
+        assert (
+            unstacked["E", 1].dtype == np.object_
+            if not using_infer_string
+            else "string"
+        )
+        assert unstacked["F", 1].dtype == np.float64
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_unstack_group_index_overflow(self, future_stack):
+        codes = np.tile(np.arange(500), 2)
+        level = np.arange(500)
+
+        index = MultiIndex(
+            levels=[level] * 8 + [[0, 1]],
+            codes=[codes] * 8 + [np.arange(2).repeat(500)],
+        )
+
+        s = Series(np.arange(1000), index=index)
+        result = s.unstack()
+        assert result.shape == (500, 2)
+
+        # test roundtrip
+        stacked = result.stack(future_stack=future_stack)
+        tm.assert_series_equal(s, stacked.reindex(s.index))
+
+        # put it at beginning
+        index = MultiIndex(
+            levels=[[0, 1]] + [level] * 8,
+            codes=[np.arange(2).repeat(500)] + [codes] * 8,
+        )
+
+        s = Series(np.arange(1000), index=index)
+        result = s.unstack(0)
+        assert result.shape == (500, 2)
+
+        # put it in middle
+        index = MultiIndex(
+            levels=[level] * 4 + [[0, 1]] + [level] * 4,
+            codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4),
+        )
+
+        s = Series(np.arange(1000), index=index)
+        result = s.unstack(4)
+        assert result.shape == (500, 2)
+
+    def test_unstack_with_missing_int_cast_to_float(self):
+        # https://github.com/pandas-dev/pandas/issues/37115
+        df = DataFrame(
+            {
+                "a": ["A", "A", "B"],
+                "b": ["ca", "cb", "cb"],
+                "v": [10] * 3,
+            }
+        ).set_index(["a", "b"])
+
+        # add another int column to get 2 blocks
+        df["is_"] = 1
+        assert len(df._mgr.blocks) == 2
+
+        result = df.unstack("b")
+        result[("is_", "ca")] = result[("is_", "ca")].fillna(0)
+
+        expected = DataFrame(
+            [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]],
+            index=Index(["A", "B"], name="a"),
+            columns=MultiIndex.from_tuples(
+                [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")],
+                names=[None, "b"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_with_level_has_nan(self):
+        # GH 37510
+        df1 = DataFrame(
+            {
+                "L1": [1, 2, 3, 4],
+                "L2": [3, 4, 1, 2],
+                "L3": [1, 1, 1, 1],
+                "x": [1, 2, 3, 4],
+            }
+        )
+        df1 = df1.set_index(["L1", "L2", "L3"])
+        new_levels = ["n1", "n2", "n3", None]
+        df1.index = df1.index.set_levels(levels=new_levels, level="L1")
+        df1.index = df1.index.set_levels(levels=new_levels, level="L2")
+
+        result = df1.unstack("L3")[("x", 1)].sort_index().index
+        expected = MultiIndex(
+            levels=[["n1", "n2", "n3", None], ["n1", "n2", "n3", None]],
+            codes=[[0, 1, 2, 3], [2, 3, 0, 1]],
+            names=["L1", "L2"],
+        )
+
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_nan_in_multiindex_columns(self, future_stack):
+        # GH#39481
+        df = DataFrame(
+            np.zeros([1, 5]),
+            columns=MultiIndex.from_tuples(
+                [
+                    (0, None, None),
+                    (0, 2, 0),
+                    (0, 2, 1),
+                    (0, 3, 0),
+                    (0, 3, 1),
+                ],
+            ),
+        )
+        result = df.stack(2, future_stack=future_stack)
+        if future_stack:
+            index = MultiIndex(levels=[[0], [0.0, 1.0]], codes=[[0, 0, 0], [-1, 0, 1]])
+            columns = MultiIndex(levels=[[0], [2, 3]], codes=[[0, 0, 0], [-1, 0, 1]])
+        else:
+            index = Index([(0, None), (0, 0), (0, 1)])
+            columns = Index([(0, None), (0, 2), (0, 3)])
+        expected = DataFrame(
+            [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]],
+            index=index,
+            columns=columns,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_multi_level_stack_categorical(self, future_stack):
+        # GH 15239
+        midx = MultiIndex.from_arrays(
+            [
+                ["A"] * 2 + ["B"] * 2,
+                pd.Categorical(list("abab")),
+                pd.Categorical(list("ccdd")),
+            ]
+        )
+        df = DataFrame(np.arange(8).reshape(2, 4), columns=midx)
+        result = df.stack([1, 2], future_stack=future_stack)
+        if future_stack:
+            expected = DataFrame(
+                [
+                    [0, np.nan],
+                    [1, np.nan],
+                    [np.nan, 2],
+                    [np.nan, 3],
+                    [4, np.nan],
+                    [5, np.nan],
+                    [np.nan, 6],
+                    [np.nan, 7],
+                ],
+                columns=["A", "B"],
+                index=MultiIndex.from_arrays(
+                    [
+                        [0] * 4 + [1] * 4,
+                        pd.Categorical(list("abababab")),
+                        pd.Categorical(list("ccddccdd")),
+                    ]
+                ),
+            )
+        else:
+            expected = DataFrame(
+                [
+                    [0, np.nan],
+                    [np.nan, 2],
+                    [1, np.nan],
+                    [np.nan, 3],
+                    [4, np.nan],
+                    [np.nan, 6],
+                    [5, np.nan],
+                    [np.nan, 7],
+                ],
+                columns=["A", "B"],
+                index=MultiIndex.from_arrays(
+                    [
+                        [0] * 4 + [1] * 4,
+                        pd.Categorical(list("aabbaabb")),
+                        pd.Categorical(list("cdcdcdcd")),
+                    ]
+                ),
+            )
+        tm.assert_frame_equal(result, expected, check_index_type=False)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_nan_level(self, future_stack):
+        # GH 9406
+        df_nan = DataFrame(
+            np.arange(4).reshape(2, 2),
+            columns=MultiIndex.from_tuples(
+                [("A", np.nan), ("B", "b")], names=["Upper", "Lower"]
+            ),
+            index=Index([0, 1], name="Num"),
+            dtype=np.float64,
+        )
+        result = df_nan.stack(future_stack=future_stack)
+        if future_stack:
+            index = MultiIndex(
+                levels=[[0, 1], [np.nan, "b"]],
+                codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+                names=["Num", "Lower"],
+            )
+        else:
+            index = MultiIndex.from_tuples(
+                [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"]
+            )
+        expected = DataFrame(
+            [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]],
+            columns=Index(["A", "B"], name="Upper"),
+            index=index,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_categorical_columns(self):
+        # GH 14018
+        idx = MultiIndex.from_product([["A"], [0, 1]])
+        df = DataFrame({"cat": pd.Categorical(["a", "b"])}, index=idx)
+        result = df.unstack()
+        expected = DataFrame(
+            {
+                0: pd.Categorical(["a"], categories=["a", "b"]),
+                1: pd.Categorical(["b"], categories=["a", "b"]),
+            },
+            index=["A"],
+        )
+        expected.columns = MultiIndex.from_tuples([("cat", 0), ("cat", 1)])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_unsorted(self, future_stack):
+        # GH 16925
+        PAE = ["ITA", "FRA"]
+        VAR = ["A1", "A2"]
+        TYP = ["CRT", "DBT", "NET"]
+        MI = MultiIndex.from_product([PAE, VAR, TYP], names=["PAE", "VAR", "TYP"])
+
+        V = list(range(len(MI)))
+        DF = DataFrame(data=V, index=MI, columns=["VALUE"])
+
+        DF = DF.unstack(["VAR", "TYP"])
+        DF.columns = DF.columns.droplevel(0)
+        DF.loc[:, ("A0", "NET")] = 9999
+
+        result = DF.stack(["VAR", "TYP"], future_stack=future_stack).sort_index()
+        expected = (
+            DF.sort_index(axis=1)
+            .stack(["VAR", "TYP"], future_stack=future_stack)
+            .sort_index()
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:The previous implementation of stack is deprecated"
+    )
+    def test_stack_nullable_dtype(self, future_stack):
+        # GH#43561
+        columns = MultiIndex.from_product(
+            [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"]
+        )
+        index = Index([1, 2, 3], name="time")
+
+        arr = np.array([[50, 226, 10, 215], [10, 215, 9, 220], [305, 232, 111, 220]])
+        df = DataFrame(arr, columns=columns, index=index, dtype=pd.Int64Dtype())
+
+        result = df.stack("station", future_stack=future_stack)
+
+        expected = (
+            df.astype(np.int64)
+            .stack("station", future_stack=future_stack)
+            .astype(pd.Int64Dtype())
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # non-homogeneous case
+        df[df.columns[0]] = df[df.columns[0]].astype(pd.Float64Dtype())
+        result = df.stack("station", future_stack=future_stack)
+
+        expected = DataFrame(
+            {
+                "r": pd.array(
+                    [50.0, 10.0, 10.0, 9.0, 305.0, 111.0], dtype=pd.Float64Dtype()
+                ),
+                "t_mean": pd.array(
+                    [226, 215, 215, 220, 232, 220], dtype=pd.Int64Dtype()
+                ),
+            },
+            index=MultiIndex.from_product([index, columns.levels[0]]),
+        )
+        expected.columns.name = "element"
+        tm.assert_frame_equal(result, expected)
+
+    def test_unstack_mixed_level_names(self):
+        # GH#48763
+        arrays = [["a", "a"], [1, 2], ["red", "blue"]]
+        idx = MultiIndex.from_arrays(arrays, names=("x", 0, "y"))
+        df = DataFrame({"m": [1, 2]}, index=idx)
+        result = df.unstack("x")
+        expected = DataFrame(
+            [[1], [2]],
+            columns=MultiIndex.from_tuples([("m", "a")], names=[None, "x"]),
+            index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]),
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:The previous implementation of stack is deprecated")
+def test_stack_tuple_columns(future_stack):
+    # GH#54948 - test stack when the input has a non-MultiIndex with tuples
+    df = DataFrame(
+        [[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=[("a", 1), ("a", 2), ("b", 1)]
+    )
+    result = df.stack(future_stack=future_stack)
+    expected = Series(
+        [1, 2, 3, 4, 5, 6, 7, 8, 9],
+        index=MultiIndex(
+            levels=[range(3), [("a", 1), ("a", 2), ("b", 1)]],
+            codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
+        ),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype, na_value",
+    [
+        ("float64", np.nan),
+        ("Float64", np.nan),
+        ("Float64", pd.NA),
+        ("Int64", pd.NA),
+    ],
+)
+@pytest.mark.parametrize("test_multiindex", [True, False])
+def test_stack_preserves_na(dtype, na_value, test_multiindex):
+    # GH#56573
+    if test_multiindex:
+        index = MultiIndex.from_arrays(2 * [Index([na_value], dtype=dtype)])
+    else:
+        index = Index([na_value], dtype=dtype)
+    df = DataFrame({"a": [1]}, index=index)
+    result = df.stack()
+
+    if test_multiindex:
+        expected_index = MultiIndex.from_arrays(
+            [
+                Index([na_value], dtype=dtype),
+                Index([na_value], dtype=dtype),
+                Index(["a"]),
+            ]
+        )
+    else:
+        expected_index = MultiIndex.from_arrays(
+            [
+                Index([na_value], dtype=dtype),
+                Index(["a"]),
+            ]
+        )
+    expected = Series(1, index=expected_index)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1abbeea80ff3093948cb7c031eed304089a377c
--- /dev/null
+++ b/pandas/tests/frame/test_subclass.py
@@ -0,0 +1,817 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+)
+import pandas._testing as tm
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
+)
+
+
+class TestDataFrameSubclassing:
+    def test_no_warning_on_mgr(self):
+        # GH#57032
+        df = tm.SubclassedDataFrame(
+            {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"]
+        )
+        with tm.assert_produces_warning(None):
+            # df.isna() goes through _constructor_from_mgr, which we want to
+            #  *not* pass a Manager do __init__
+            df.isna()
+            df["X"].isna()
+
+    def test_frame_subclassing_and_slicing(self):
+        # Subclass frame and ensure it returns the right class on slicing it
+        # In reference to PR 9632
+
+        class CustomSeries(Series):
+            @property
+            def _constructor(self):
+                return CustomSeries
+
+            def custom_series_function(self):
+                return "OK"
+
+        class CustomDataFrame(DataFrame):
+            """
+            Subclasses pandas DF, fills DF with simulation results, adds some
+            custom plotting functions.
+            """
+
+            def __init__(self, *args, **kw) -> None:
+                super().__init__(*args, **kw)
+
+            @property
+            def _constructor(self):
+                return CustomDataFrame
+
+            _constructor_sliced = CustomSeries
+
+            def custom_frame_function(self):
+                return "OK"
+
+        data = {"col1": range(10), "col2": range(10)}
+        cdf = CustomDataFrame(data)
+
+        # Did we get back our own DF class?
+        assert isinstance(cdf, CustomDataFrame)
+
+        # Do we get back our own Series class after selecting a column?
+        cdf_series = cdf.col1
+        assert isinstance(cdf_series, CustomSeries)
+        assert cdf_series.custom_series_function() == "OK"
+
+        # Do we get back our own DF class after slicing row-wise?
+        cdf_rows = cdf[1:5]
+        assert isinstance(cdf_rows, CustomDataFrame)
+        assert cdf_rows.custom_frame_function() == "OK"
+
+        # Make sure sliced part of multi-index frame is custom class
+        mcol = MultiIndex.from_tuples([("A", "A"), ("A", "B")])
+        cdf_multi = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
+        assert isinstance(cdf_multi["A"], CustomDataFrame)
+
+        mcol = MultiIndex.from_tuples([("A", ""), ("B", "")])
+        cdf_multi2 = CustomDataFrame([[0, 1], [2, 3]], columns=mcol)
+        assert isinstance(cdf_multi2["A"], CustomSeries)
+
+    def test_dataframe_metadata(self, temp_file):
+        df = tm.SubclassedDataFrame(
+            {"X": [1, 2, 3], "Y": [1, 2, 3]}, index=["a", "b", "c"]
+        )
+        df.testattr = "XXX"
+
+        assert df.testattr == "XXX"
+        assert df[["X"]].testattr == "XXX"
+        assert df.loc[["a", "b"], :].testattr == "XXX"
+        assert df.iloc[[0, 1], :].testattr == "XXX"
+
+        # see gh-9776
+        assert df.iloc[0:1, :].testattr == "XXX"
+
+        # see gh-10553
+        unpickled = tm.round_trip_pickle(df, temp_file)
+        tm.assert_frame_equal(df, unpickled)
+        assert df._metadata == unpickled._metadata
+        assert df.testattr == unpickled.testattr
+
+    def test_indexing_sliced(self):
+        # GH 11559
+        df = tm.SubclassedDataFrame(
+            {"X": [1, 2, 3], "Y": [4, 5, 6], "Z": [7, 8, 9]}, index=["a", "b", "c"]
+        )
+        res = df.loc[:, "X"]
+        exp = tm.SubclassedSeries([1, 2, 3], index=list("abc"), name="X")
+        tm.assert_series_equal(res, exp)
+        assert isinstance(res, tm.SubclassedSeries)
+
+        res = df.iloc[:, 1]
+        exp = tm.SubclassedSeries([4, 5, 6], index=list("abc"), name="Y")
+        tm.assert_series_equal(res, exp)
+        assert isinstance(res, tm.SubclassedSeries)
+
+        res = df.loc[:, "Z"]
+        exp = tm.SubclassedSeries([7, 8, 9], index=list("abc"), name="Z")
+        tm.assert_series_equal(res, exp)
+        assert isinstance(res, tm.SubclassedSeries)
+
+        res = df.loc["a", :]
+        exp = tm.SubclassedSeries([1, 4, 7], index=list("XYZ"), name="a")
+        tm.assert_series_equal(res, exp)
+        assert isinstance(res, tm.SubclassedSeries)
+
+        res = df.iloc[1, :]
+        exp = tm.SubclassedSeries([2, 5, 8], index=list("XYZ"), name="b")
+        tm.assert_series_equal(res, exp)
+        assert isinstance(res, tm.SubclassedSeries)
+
+        res = df.loc["c", :]
+        exp = tm.SubclassedSeries([3, 6, 9], index=list("XYZ"), name="c")
+        tm.assert_series_equal(res, exp)
+        assert isinstance(res, tm.SubclassedSeries)
+
+    def test_subclass_attr_err_propagation(self):
+        # GH 11808
+        class A(DataFrame):
+            @property
+            def nonexistence(self):
+                return self.i_dont_exist
+
+        with pytest.raises(AttributeError, match=".*i_dont_exist.*"):
+            A().nonexistence
+
+    def test_subclass_align(self):
+        # GH 12983
+        df1 = tm.SubclassedDataFrame(
+            {"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE")
+        )
+        df2 = tm.SubclassedDataFrame(
+            {"c": [1, 2, 4], "d": [1, 2, 4]}, index=list("ABD")
+        )
+
+        res1, res2 = df1.align(df2, axis=0)
+        exp1 = tm.SubclassedDataFrame(
+            {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
+            index=list("ABCDE"),
+        )
+        exp2 = tm.SubclassedDataFrame(
+            {"c": [1, 2, np.nan, 4, np.nan], "d": [1, 2, np.nan, 4, np.nan]},
+            index=list("ABCDE"),
+        )
+        assert isinstance(res1, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(res1, exp1)
+        assert isinstance(res2, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(res2, exp2)
+
+        res1, res2 = df1.a.align(df2.c)
+        assert isinstance(res1, tm.SubclassedSeries)
+        tm.assert_series_equal(res1, exp1.a)
+        assert isinstance(res2, tm.SubclassedSeries)
+        tm.assert_series_equal(res2, exp2.c)
+
+    def test_subclass_align_combinations(self):
+        # GH 12983
+        df = tm.SubclassedDataFrame({"a": [1, 3, 5], "b": [1, 3, 5]}, index=list("ACE"))
+        s = tm.SubclassedSeries([1, 2, 4], index=list("ABD"), name="x")
+
+        # frame + series
+        res1, res2 = df.align(s, axis=0)
+        exp1 = tm.SubclassedDataFrame(
+            {"a": [1, np.nan, 3, np.nan, 5], "b": [1, np.nan, 3, np.nan, 5]},
+            index=list("ABCDE"),
+        )
+        # name is lost when
+        exp2 = tm.SubclassedSeries(
+            [1, 2, np.nan, 4, np.nan], index=list("ABCDE"), name="x"
+        )
+
+        assert isinstance(res1, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(res1, exp1)
+        assert isinstance(res2, tm.SubclassedSeries)
+        tm.assert_series_equal(res2, exp2)
+
+        # series + frame
+        res1, res2 = s.align(df)
+        assert isinstance(res1, tm.SubclassedSeries)
+        tm.assert_series_equal(res1, exp2)
+        assert isinstance(res2, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(res2, exp1)
+
+    def test_subclass_iterrows(self):
+        # GH 13977
+        df = tm.SubclassedDataFrame({"a": [1]})
+        for i, row in df.iterrows():
+            assert isinstance(row, tm.SubclassedSeries)
+            tm.assert_series_equal(row, df.loc[i])
+
+    def test_subclass_stack(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            index=["a", "b", "c"],
+            columns=["X", "Y", "Z"],
+        )
+
+        res = df.stack()
+        exp = tm.SubclassedSeries(
+            [1, 2, 3, 4, 5, 6, 7, 8, 9], index=[list("aaabbbccc"), list("XYZXYZXYZ")]
+        )
+
+        tm.assert_series_equal(res, exp)
+
+    def test_subclass_stack_multi(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
+            ),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
+            ),
+        )
+
+        exp = tm.SubclassedDataFrame(
+            [
+                [10, 12],
+                [11, 13],
+                [20, 22],
+                [21, 23],
+                [30, 32],
+                [31, 33],
+                [40, 42],
+                [41, 43],
+            ],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))),
+                names=["aaa", "ccc", "yyy"],
+            ),
+            columns=Index(["W", "X"], name="www"),
+        )
+
+        res = df.stack()
+        tm.assert_frame_equal(res, exp)
+
+        res = df.stack("yyy")
+        tm.assert_frame_equal(res, exp)
+
+        exp = tm.SubclassedDataFrame(
+            [
+                [10, 11],
+                [12, 13],
+                [20, 21],
+                [22, 23],
+                [30, 31],
+                [32, 33],
+                [40, 41],
+                [42, 43],
+            ],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))),
+                names=["aaa", "ccc", "www"],
+            ),
+            columns=Index(["y", "z"], name="yyy"),
+        )
+
+        res = df.stack("www")
+        tm.assert_frame_equal(res, exp)
+
+    def test_subclass_stack_multi_mixed(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            [
+                [10, 11, 12.0, 13.0],
+                [20, 21, 22.0, 23.0],
+                [30, 31, 32.0, 33.0],
+                [40, 41, 42.0, 43.0],
+            ],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
+            ),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
+            ),
+        )
+
+        exp = tm.SubclassedDataFrame(
+            [
+                [10, 12.0],
+                [11, 13.0],
+                [20, 22.0],
+                [21, 23.0],
+                [30, 32.0],
+                [31, 33.0],
+                [40, 42.0],
+                [41, 43.0],
+            ],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AAAABBBB"), list("ccddccdd"), list("yzyzyzyz"))),
+                names=["aaa", "ccc", "yyy"],
+            ),
+            columns=Index(["W", "X"], name="www"),
+        )
+
+        res = df.stack()
+        tm.assert_frame_equal(res, exp)
+
+        res = df.stack("yyy")
+        tm.assert_frame_equal(res, exp)
+
+        exp = tm.SubclassedDataFrame(
+            [
+                [10.0, 11.0],
+                [12.0, 13.0],
+                [20.0, 21.0],
+                [22.0, 23.0],
+                [30.0, 31.0],
+                [32.0, 33.0],
+                [40.0, 41.0],
+                [42.0, 43.0],
+            ],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AAAABBBB"), list("ccddccdd"), list("WXWXWXWX"))),
+                names=["aaa", "ccc", "www"],
+            ),
+            columns=Index(["y", "z"], name="yyy"),
+        )
+
+        res = df.stack("www")
+        tm.assert_frame_equal(res, exp)
+
+    def test_subclass_unstack(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
+            index=["a", "b", "c"],
+            columns=["X", "Y", "Z"],
+        )
+
+        res = df.unstack()
+        exp = tm.SubclassedSeries(
+            [1, 4, 7, 2, 5, 8, 3, 6, 9], index=[list("XXXYYYZZZ"), list("abcabcabc")]
+        )
+
+        tm.assert_series_equal(res, exp)
+
+    def test_subclass_unstack_multi(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
+            ),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
+            ),
+        )
+
+        exp = tm.SubclassedDataFrame(
+            [[10, 20, 11, 21, 12, 22, 13, 23], [30, 40, 31, 41, 32, 42, 33, 43]],
+            index=Index(["A", "B"], name="aaa"),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))),
+                names=["www", "yyy", "ccc"],
+            ),
+        )
+
+        res = df.unstack()
+        tm.assert_frame_equal(res, exp)
+
+        res = df.unstack("ccc")
+        tm.assert_frame_equal(res, exp)
+
+        exp = tm.SubclassedDataFrame(
+            [[10, 30, 11, 31, 12, 32, 13, 33], [20, 40, 21, 41, 22, 42, 23, 43]],
+            index=Index(["c", "d"], name="ccc"),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))),
+                names=["www", "yyy", "aaa"],
+            ),
+        )
+
+        res = df.unstack("aaa")
+        tm.assert_frame_equal(res, exp)
+
+    def test_subclass_unstack_multi_mixed(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            [
+                [10, 11, 12.0, 13.0],
+                [20, 21, 22.0, 23.0],
+                [30, 31, 32.0, 33.0],
+                [40, 41, 42.0, 43.0],
+            ],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
+            ),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
+            ),
+        )
+
+        exp = tm.SubclassedDataFrame(
+            [
+                [10, 20, 11, 21, 12.0, 22.0, 13.0, 23.0],
+                [30, 40, 31, 41, 32.0, 42.0, 33.0, 43.0],
+            ],
+            index=Index(["A", "B"], name="aaa"),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("cdcdcdcd"))),
+                names=["www", "yyy", "ccc"],
+            ),
+        )
+
+        res = df.unstack()
+        tm.assert_frame_equal(res, exp)
+
+        res = df.unstack("ccc")
+        tm.assert_frame_equal(res, exp)
+
+        exp = tm.SubclassedDataFrame(
+            [
+                [10, 30, 11, 31, 12.0, 32.0, 13.0, 33.0],
+                [20, 40, 21, 41, 22.0, 42.0, 23.0, 43.0],
+            ],
+            index=Index(["c", "d"], name="ccc"),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWWWXXXX"), list("yyzzyyzz"), list("ABABABAB"))),
+                names=["www", "yyy", "aaa"],
+            ),
+        )
+
+        res = df.unstack("aaa")
+        tm.assert_frame_equal(res, exp)
+
+    def test_subclass_pivot(self):
+        # GH 15564
+        df = tm.SubclassedDataFrame(
+            {
+                "index": ["A", "B", "C", "C", "B", "A"],
+                "columns": ["One", "One", "One", "Two", "Two", "Two"],
+                "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
+            }
+        )
+
+        pivoted = df.pivot(index="index", columns="columns", values="values")
+
+        expected = tm.SubclassedDataFrame(
+            {
+                "One": {"A": 1.0, "B": 2.0, "C": 3.0},
+                "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
+            }
+        )
+
+        expected.index.name, expected.columns.name = "index", "columns"
+
+        tm.assert_frame_equal(pivoted, expected)
+
+    def test_subclassed_melt(self):
+        # GH 15564
+        cheese = tm.SubclassedDataFrame(
+            {
+                "first": ["John", "Mary"],
+                "last": ["Doe", "Bo"],
+                "height": [5.5, 6.0],
+                "weight": [130, 150],
+            }
+        )
+
+        melted = pd.melt(cheese, id_vars=["first", "last"])
+
+        expected = tm.SubclassedDataFrame(
+            [
+                ["John", "Doe", "height", 5.5],
+                ["Mary", "Bo", "height", 6.0],
+                ["John", "Doe", "weight", 130],
+                ["Mary", "Bo", "weight", 150],
+            ],
+            columns=["first", "last", "variable", "value"],
+        )
+
+        tm.assert_frame_equal(melted, expected)
+
+    def test_subclassed_wide_to_long(self):
+        # GH 9762
+
+        x = np.random.default_rng(2).standard_normal(3)
+        df = tm.SubclassedDataFrame(
+            {
+                "A1970": {0: "a", 1: "b", 2: "c"},
+                "A1980": {0: "d", 1: "e", 2: "f"},
+                "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
+                "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
+                "X": dict(zip(range(3), x)),
+            }
+        )
+
+        df["id"] = df.index
+        exp_data = {
+            "X": x.tolist() + x.tolist(),
+            "A": ["a", "b", "c", "d", "e", "f"],
+            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+            "year": [1970, 1970, 1970, 1980, 1980, 1980],
+            "id": [0, 1, 2, 0, 1, 2],
+        }
+        expected = tm.SubclassedDataFrame(exp_data)
+        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
+        long_frame = pd.wide_to_long(df, ["A", "B"], i="id", j="year")
+
+        tm.assert_frame_equal(long_frame, expected)
+
+    def test_subclassed_apply(self):
+        # GH 19822
+
+        def check_row_subclass(row):
+            assert isinstance(row, tm.SubclassedSeries)
+
+        def stretch(row):
+            if row["variable"] == "height":
+                row["value"] += 0.5
+            return row
+
+        df = tm.SubclassedDataFrame(
+            [
+                ["John", "Doe", "height", 5.5],
+                ["Mary", "Bo", "height", 6.0],
+                ["John", "Doe", "weight", 130],
+                ["Mary", "Bo", "weight", 150],
+            ],
+            columns=["first", "last", "variable", "value"],
+        )
+
+        df.apply(lambda x: check_row_subclass(x))
+        df.apply(lambda x: check_row_subclass(x), axis=1)
+
+        expected = tm.SubclassedDataFrame(
+            [
+                ["John", "Doe", "height", 6.0],
+                ["Mary", "Bo", "height", 6.5],
+                ["John", "Doe", "weight", 130],
+                ["Mary", "Bo", "weight", 150],
+            ],
+            columns=["first", "last", "variable", "value"],
+        )
+
+        result = df.apply(lambda x: stretch(x), axis=1)
+        assert isinstance(result, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(result, expected)
+
+        expected = tm.SubclassedDataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
+
+        result = df.apply(lambda x: tm.SubclassedSeries([1, 2, 3]), axis=1)
+        assert isinstance(result, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(result, expected)
+
+        result = df.apply(lambda x: [1, 2, 3], axis=1, result_type="expand")
+        assert isinstance(result, tm.SubclassedDataFrame)
+        tm.assert_frame_equal(result, expected)
+
+        expected = tm.SubclassedSeries([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])
+
+        result = df.apply(lambda x: [1, 2, 3], axis=1)
+        assert not isinstance(result, tm.SubclassedDataFrame)
+        tm.assert_series_equal(result, expected)
+
+    def test_subclassed_reductions(self, all_reductions):
+        # GH 25596
+
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = getattr(df, all_reductions)()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_subclassed_count(self):
+        df = tm.SubclassedDataFrame(
+            {
+                "Person": ["John", "Myla", "Lewis", "John", "Myla"],
+                "Age": [24.0, np.nan, 21.0, 33, 26],
+                "Single": [False, True, True, True, False],
+            }
+        )
+        result = df.count()
+        assert isinstance(result, tm.SubclassedSeries)
+
+        df = tm.SubclassedDataFrame({"A": [1, 0, 3], "B": [0, 5, 6], "C": [7, 8, 0]})
+        result = df.count()
+        assert isinstance(result, tm.SubclassedSeries)
+
+        df = tm.SubclassedDataFrame(
+            [[10, 11, 12, 13], [20, 21, 22, 23], [30, 31, 32, 33], [40, 41, 42, 43]],
+            index=MultiIndex.from_tuples(
+                list(zip(list("AABB"), list("cdcd"))), names=["aaa", "ccc"]
+            ),
+            columns=MultiIndex.from_tuples(
+                list(zip(list("WWXX"), list("yzyz"))), names=["www", "yyy"]
+            ),
+        )
+        result = df.count()
+        assert isinstance(result, tm.SubclassedSeries)
+
+        df = tm.SubclassedDataFrame()
+        result = df.count()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_isin(self):
+        df = tm.SubclassedDataFrame(
+            {"num_legs": [2, 4], "num_wings": [2, 0]}, index=["falcon", "dog"]
+        )
+        result = df.isin([0, 2])
+        assert isinstance(result, tm.SubclassedDataFrame)
+
+    def test_duplicated(self):
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = df.duplicated()
+        assert isinstance(result, tm.SubclassedSeries)
+
+        df = tm.SubclassedDataFrame()
+        result = df.duplicated()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    @pytest.mark.parametrize("idx_method", ["idxmax", "idxmin"])
+    def test_idx(self, idx_method):
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = getattr(df, idx_method)()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_dot(self):
+        df = tm.SubclassedDataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+        s = tm.SubclassedSeries([1, 1, 2, 1])
+        result = df.dot(s)
+        assert isinstance(result, tm.SubclassedSeries)
+
+        df = tm.SubclassedDataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+        s = tm.SubclassedDataFrame([1, 1, 2, 1])
+        result = df.dot(s)
+        assert isinstance(result, tm.SubclassedDataFrame)
+
+    def test_memory_usage(self):
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = df.memory_usage()
+        assert isinstance(result, tm.SubclassedSeries)
+
+        result = df.memory_usage(index=False)
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_corrwith(self):
+        pytest.importorskip("scipy")
+        index = ["a", "b", "c", "d", "e"]
+        columns = ["one", "two", "three", "four"]
+        df1 = tm.SubclassedDataFrame(
+            np.random.default_rng(2).standard_normal((5, 4)),
+            index=index,
+            columns=columns,
+        )
+        df2 = tm.SubclassedDataFrame(
+            np.random.default_rng(2).standard_normal((4, 4)),
+            index=index[:4],
+            columns=columns,
+        )
+        correls = df1.corrwith(df2, axis=1, drop=True, method="kendall")
+
+        assert isinstance(correls, (tm.SubclassedSeries))
+
+    def test_asof(self):
+        N = 3
+        rng = pd.date_range("1/1/1990", periods=N, freq="53s")
+        df = tm.SubclassedDataFrame(
+            {
+                "A": [np.nan, np.nan, np.nan],
+                "B": [np.nan, np.nan, np.nan],
+                "C": [np.nan, np.nan, np.nan],
+            },
+            index=rng,
+        )
+
+        result = df.asof(rng[-2:])
+        assert isinstance(result, tm.SubclassedDataFrame)
+
+        result = df.asof(rng[-2])
+        assert isinstance(result, tm.SubclassedSeries)
+
+        result = df.asof("1989-12-31")
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_idxmin_preserves_subclass(self):
+        # GH 28330
+
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = df.idxmin()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_idxmax_preserves_subclass(self):
+        # GH 28330
+
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = df.idxmax()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_convert_dtypes_preserves_subclass(self):
+        # GH 43668
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+        result = df.convert_dtypes()
+        assert isinstance(result, tm.SubclassedDataFrame)
+
+    def test_convert_dtypes_preserves_subclass_with_constructor(self):
+        class SubclassedDataFrame(DataFrame):
+            @property
+            def _constructor(self):
+                return SubclassedDataFrame
+
+        df = SubclassedDataFrame({"a": [1, 2, 3]})
+        result = df.convert_dtypes()
+        assert isinstance(result, SubclassedDataFrame)
+
+    def test_astype_preserves_subclass(self):
+        # GH#40810
+        df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
+
+        result = df.astype({"A": np.int64, "B": np.int32, "C": np.float64})
+        assert isinstance(result, tm.SubclassedDataFrame)
+
+    def test_equals_subclass(self):
+        # https://github.com/pandas-dev/pandas/pull/34402
+        # allow subclass in both directions
+        df1 = DataFrame({"a": [1, 2, 3]})
+        df2 = tm.SubclassedDataFrame({"a": [1, 2, 3]})
+        assert df1.equals(df2)
+        assert df2.equals(df1)
+
+
+class MySubclassWithMetadata(DataFrame):
+    _metadata = ["my_metadata"]
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+        my_metadata = kwargs.pop("my_metadata", None)
+        if args and isinstance(args[0], MySubclassWithMetadata):
+            my_metadata = args[0].my_metadata  # type: ignore[has-type]
+        self.my_metadata = my_metadata
+
+    @property
+    def _constructor(self):
+        return MySubclassWithMetadata
+
+
+def test_constructor_with_metadata():
+    # https://github.com/pandas-dev/pandas/pull/54922
+    # https://github.com/pandas-dev/pandas/issues/55120
+    df = MySubclassWithMetadata(
+        np.random.default_rng(2).random((5, 3)), columns=["A", "B", "C"]
+    )
+    subset = df[["A", "B"]]
+    assert isinstance(subset, MySubclassWithMetadata)
+
+
+def test_constructor_with_metadata_from_records():
+    # GH#57008
+    df = MySubclassWithMetadata.from_records([{"a": 1, "b": 2}])
+    assert df.my_metadata is None
+    assert type(df) is MySubclassWithMetadata
+
+
+class SimpleDataFrameSubClass(DataFrame):
+    """A subclass of DataFrame that does not define a constructor."""
+
+
+class SimpleSeriesSubClass(Series):
+    """A subclass of Series that does not define a constructor."""
+
+
+class TestSubclassWithoutConstructor:
+    def test_copy_df(self):
+        expected = DataFrame({"a": [1, 2, 3]})
+        result = SimpleDataFrameSubClass(expected).copy()
+
+        assert (
+            type(result) is DataFrame
+        )  # assert_frame_equal only checks isinstance(lhs, type(rhs))
+        tm.assert_frame_equal(result, expected)
+
+    def test_copy_series(self):
+        expected = Series([1, 2, 3])
+        result = SimpleSeriesSubClass(expected).copy()
+
+        tm.assert_series_equal(result, expected)
+
+    def test_series_to_frame(self):
+        orig = Series([1, 2, 3])
+        expected = orig.to_frame()
+        result = SimpleSeriesSubClass(orig).to_frame()
+
+        assert (
+            type(result) is DataFrame
+        )  # assert_frame_equal only checks isinstance(lhs, type(rhs))
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby(self):
+        df = SimpleDataFrameSubClass(DataFrame({"a": [1, 2, 3]}))
+
+        for _, v in df.groupby("a"):
+            assert type(v) is DataFrame
diff --git a/pandas/tests/frame/test_ufunc.py b/pandas/tests/frame/test_ufunc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8d5a227652462e17025e1fcfae023bec296ad751
--- /dev/null
+++ b/pandas/tests/frame/test_ufunc.py
@@ -0,0 +1,312 @@
+from functools import partial
+import re
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.types import is_extension_array_dtype
+
+dtypes = [
+    "int64",
+    "Int64",
+    {"A": "int64", "B": "Int64"},
+]
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_unary_unary(dtype):
+    # unary input, unary output
+    values = np.array([[-1, -1], [1, 1]], dtype="int64")
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype)
+    result = np.positive(df)
+    expected = pd.DataFrame(
+        np.positive(values), index=df.index, columns=df.columns
+    ).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_unary_binary(request, dtype):
+    # unary input, binary output
+    if is_extension_array_dtype(dtype) or isinstance(dtype, dict):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Extension / mixed with multiple outputs not implemented."
+            )
+        )
+
+    values = np.array([[-1, -1], [1, 1]], dtype="int64")
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype)
+    result_pandas = np.modf(df)
+    assert isinstance(result_pandas, tuple)
+    assert len(result_pandas) == 2
+    expected_numpy = np.modf(values)
+
+    for result, b in zip(result_pandas, expected_numpy):
+        expected = pd.DataFrame(b, index=df.index, columns=df.columns)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_binary_input_dispatch_binop(dtype):
+    # binop ufuncs are dispatched to our dunder methods.
+    values = np.array([[-1, -1], [1, 1]], dtype="int64")
+    df = pd.DataFrame(values, columns=["A", "B"], index=["a", "b"]).astype(dtype=dtype)
+    result = np.add(df, df)
+    expected = pd.DataFrame(
+        np.add(values, values), index=df.index, columns=df.columns
+    ).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func,arg,expected",
+    [
+        (np.add, 1, [2, 3, 4, 5]),
+        (
+            partial(np.add, where=[[False, True], [True, False]]),
+            np.array([[1, 1], [1, 1]]),
+            [0, 3, 4, 0],
+        ),
+        (np.power, np.array([[1, 1], [2, 2]]), [1, 2, 9, 16]),
+        (np.subtract, 2, [-1, 0, 1, 2]),
+        (
+            partial(np.negative, where=np.array([[False, True], [True, False]])),
+            None,
+            [0, -2, -3, 0],
+        ),
+    ],
+)
+def test_ufunc_passes_args(func, arg, expected):
+    # GH#40662
+    arr = np.array([[1, 2], [3, 4]])
+    df = pd.DataFrame(arr)
+    result_inplace = np.zeros_like(arr)
+    # 1-argument ufunc
+    if arg is None:
+        result = func(df, out=result_inplace)
+    else:
+        result = func(df, arg, out=result_inplace)
+
+    expected = np.array(expected).reshape(2, 2)
+    tm.assert_numpy_array_equal(result_inplace, expected)
+
+    expected = pd.DataFrame(expected)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype_a", dtypes)
+@pytest.mark.parametrize("dtype_b", dtypes)
+def test_binary_input_aligns_columns(request, dtype_a, dtype_b):
+    if (
+        is_extension_array_dtype(dtype_a)
+        or isinstance(dtype_a, dict)
+        or is_extension_array_dtype(dtype_b)
+        or isinstance(dtype_b, dict)
+    ):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Extension / mixed with multiple inputs not implemented."
+            )
+        )
+
+    df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}).astype(dtype_a)
+
+    if isinstance(dtype_a, dict) and isinstance(dtype_b, dict):
+        dtype_b = dtype_b.copy()
+        dtype_b["C"] = dtype_b.pop("B")
+    df2 = pd.DataFrame({"A": [1, 2], "C": [3, 4]}).astype(dtype_b)
+    # As of 2.0, align first before applying the ufunc
+    result = np.heaviside(df1, df2)
+    expected = np.heaviside(
+        np.array([[1, 3, np.nan], [2, 4, np.nan]]),
+        np.array([[1, np.nan, 3], [2, np.nan, 4]]),
+    )
+    expected = pd.DataFrame(expected, index=[0, 1], columns=["A", "B", "C"])
+    tm.assert_frame_equal(result, expected)
+
+    result = np.heaviside(df1, df2.values)
+    expected = pd.DataFrame([[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", dtypes)
+def test_binary_input_aligns_index(request, dtype):
+    if is_extension_array_dtype(dtype) or isinstance(dtype, dict):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Extension / mixed with multiple inputs not implemented."
+            )
+        )
+    df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).astype(dtype)
+    df2 = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "c"]).astype(dtype)
+    result = np.heaviside(df1, df2)
+    expected = np.heaviside(
+        np.array([[1, 3], [3, 4], [np.nan, np.nan]]),
+        np.array([[1, 3], [np.nan, np.nan], [3, 4]]),
+    )
+    # TODO(FloatArray): this will be Float64Dtype.
+    expected = pd.DataFrame(expected, index=["a", "b", "c"], columns=["A", "B"])
+    tm.assert_frame_equal(result, expected)
+
+    result = np.heaviside(df1, df2.values)
+    expected = pd.DataFrame(
+        [[1.0, 1.0], [1.0, 1.0]], columns=["A", "B"], index=["a", "b"]
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_binary_frame_series_raises():
+    # We don't currently implement
+    df = pd.DataFrame({"A": [1, 2]})
+    with pytest.raises(NotImplementedError, match="logaddexp"):
+        np.logaddexp(df, df["A"])
+
+    with pytest.raises(NotImplementedError, match="logaddexp"):
+        np.logaddexp(df["A"], df)
+
+
+def test_unary_accumulate_axis():
+    # https://github.com/pandas-dev/pandas/issues/39259
+    df = pd.DataFrame({"a": [1, 3, 2, 4]})
+    result = np.maximum.accumulate(df)
+    expected = pd.DataFrame({"a": [1, 3, 3, 4]})
+    tm.assert_frame_equal(result, expected)
+
+    df = pd.DataFrame({"a": [1, 3, 2, 4], "b": [0.1, 4.0, 3.0, 2.0]})
+    result = np.maximum.accumulate(df)
+    # in theory could preserve int dtype for default axis=0
+    expected = pd.DataFrame({"a": [1.0, 3.0, 3.0, 4.0], "b": [0.1, 4.0, 4.0, 4.0]})
+    tm.assert_frame_equal(result, expected)
+
+    result = np.maximum.accumulate(df, axis=0)
+    tm.assert_frame_equal(result, expected)
+
+    result = np.maximum.accumulate(df, axis=1)
+    expected = pd.DataFrame({"a": [1.0, 3.0, 2.0, 4.0], "b": [1.0, 4.0, 3.0, 4.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_outer_disallowed():
+    df = pd.DataFrame({"A": [1, 2]})
+    with pytest.raises(NotImplementedError, match="^$"):
+        # deprecation enforced in 2.0
+        np.subtract.outer(df, df)
+
+
+def test_alignment_deprecation_enforced():
+    # Enforced in 2.0
+    # https://github.com/pandas-dev/pandas/issues/39184
+    df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df2 = pd.DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]})
+    s1 = pd.Series([1, 2], index=["a", "b"])
+    s2 = pd.Series([1, 2], index=["b", "c"])
+
+    # binary dataframe / dataframe
+    expected = pd.DataFrame({"a": [2, 4, 6], "b": [8, 10, 12]})
+
+    with tm.assert_produces_warning(None):
+        # aligned -> no warning!
+        result = np.add(df1, df1)
+    tm.assert_frame_equal(result, expected)
+
+    result = np.add(df1, df2.values)
+    tm.assert_frame_equal(result, expected)
+
+    result = np.add(df1, df2)
+    expected = pd.DataFrame({"a": [np.nan] * 3, "b": [5, 7, 9], "c": [np.nan] * 3})
+    tm.assert_frame_equal(result, expected)
+
+    result = np.add(df1.values, df2)
+    expected = pd.DataFrame({"b": [2, 4, 6], "c": [8, 10, 12]})
+    tm.assert_frame_equal(result, expected)
+
+    # binary dataframe / series
+    expected = pd.DataFrame({"a": [2, 3, 4], "b": [6, 7, 8]})
+
+    with tm.assert_produces_warning(None):
+        # aligned -> no warning!
+        result = np.add(df1, s1)
+    tm.assert_frame_equal(result, expected)
+
+    result = np.add(df1, s2.values)
+    tm.assert_frame_equal(result, expected)
+
+    expected = pd.DataFrame(
+        {"a": [np.nan] * 3, "b": [5.0, 6.0, 7.0], "c": [np.nan] * 3}
+    )
+    result = np.add(df1, s2)
+    tm.assert_frame_equal(result, expected)
+
+    msg = "Cannot apply ufunc <ufunc 'add'> to mixed DataFrame and Series inputs."
+    with pytest.raises(NotImplementedError, match=msg):
+        np.add(s2, df1)
+
+
+@pytest.mark.single_cpu
+def test_alignment_deprecation_many_inputs_enforced():
+    # Enforced in 2.0
+    # https://github.com/pandas-dev/pandas/issues/39184
+    # test that the deprecation also works with > 2 inputs -> using a numba
+    # written ufunc for this because numpy itself doesn't have such ufuncs
+    numba = pytest.importorskip("numba")
+
+    @numba.vectorize([numba.float64(numba.float64, numba.float64, numba.float64)])
+    def my_ufunc(x, y, z):
+        return x + y + z
+
+    df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    df2 = pd.DataFrame({"b": [1, 2, 3], "c": [4, 5, 6]})
+    df3 = pd.DataFrame({"a": [1, 2, 3], "c": [4, 5, 6]})
+
+    result = my_ufunc(df1, df2, df3)
+    expected = pd.DataFrame(np.full((3, 3), np.nan), columns=["a", "b", "c"])
+    tm.assert_frame_equal(result, expected)
+
+    # all aligned -> no warning
+    with tm.assert_produces_warning(None):
+        result = my_ufunc(df1, df1, df1)
+    expected = pd.DataFrame([[3.0, 12.0], [6.0, 15.0], [9.0, 18.0]], columns=["a", "b"])
+    tm.assert_frame_equal(result, expected)
+
+    # mixed frame / arrays
+    msg = (
+        r"operands could not be broadcast together with shapes \(3,3\) \(3,3\) \(3,2\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        my_ufunc(df1, df2, df3.values)
+
+    # single frame -> no warning
+    with tm.assert_produces_warning(None):
+        result = my_ufunc(df1, df2.values, df3.values)
+    tm.assert_frame_equal(result, expected)
+
+    # takes indices of first frame
+    msg = (
+        r"operands could not be broadcast together with shapes \(3,2\) \(3,3\) \(3,3\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        my_ufunc(df1.values, df2, df3)
+
+
+def test_array_ufuncs_for_many_arguments():
+    # GH39853
+    def add3(x, y, z):
+        return x + y + z
+
+    ufunc = np.frompyfunc(add3, 3, 1)
+    df = pd.DataFrame([[1, 2], [3, 4]])
+
+    result = ufunc(df, df, 1)
+    expected = pd.DataFrame([[3, 5], [7, 9]], dtype=object)
+    tm.assert_frame_equal(result, expected)
+
+    ser = pd.Series([1, 2])
+    msg = (
+        "Cannot apply ufunc <ufunc 'add3 (vectorized)'> "
+        "to mixed DataFrame and Series inputs."
+    )
+    with pytest.raises(NotImplementedError, match=re.escape(msg)):
+        ufunc(df, df, ser)
diff --git a/pandas/tests/frame/test_unary.py b/pandas/tests/frame/test_unary.py
new file mode 100644
index 0000000000000000000000000000000000000000..034a43ac40bbafee06eb6cc079d7b820ccedb65b
--- /dev/null
+++ b/pandas/tests/frame/test_unary.py
@@ -0,0 +1,180 @@
+from decimal import Decimal
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestDataFrameUnaryOperators:
+    # __pos__, __neg__, __invert__
+
+    @pytest.mark.parametrize(
+        "df_data,expected_data",
+        [
+            ([-1, 1], [1, -1]),
+            ([False, True], [True, False]),
+            (pd.to_timedelta([-1, 1]), pd.to_timedelta([1, -1])),
+        ],
+    )
+    def test_neg_numeric(self, df_data, expected_data):
+        df = pd.DataFrame({"a": df_data})
+        expected = pd.DataFrame({"a": expected_data})
+        tm.assert_frame_equal(-df, expected)
+        tm.assert_series_equal(-df["a"], expected["a"])
+
+    @pytest.mark.parametrize(
+        "df, expected",
+        [
+            (np.array([1, 2], dtype=object), np.array([-1, -2], dtype=object)),
+            ([Decimal("1.0"), Decimal("2.0")], [Decimal("-1.0"), Decimal("-2.0")]),
+        ],
+    )
+    def test_neg_object(self, df, expected):
+        # GH#21380
+        df = pd.DataFrame({"a": df})
+        expected = pd.DataFrame({"a": expected})
+        tm.assert_frame_equal(-df, expected)
+        tm.assert_series_equal(-df["a"], expected["a"])
+
+    @pytest.mark.parametrize(
+        "df_data",
+        [
+            ["a", "b"],
+            pd.to_datetime(["2017-01-22", "1970-01-01"]),
+        ],
+    )
+    def test_neg_raises(self, df_data, using_infer_string):
+        df = pd.DataFrame({"a": df_data})
+        msg = (
+            "bad operand type for unary -: 'str'|"
+            r"bad operand type for unary -: 'DatetimeArray'|"
+            "unary '-' not supported for dtype"
+        )
+        with pytest.raises(TypeError, match=msg):
+            (-df)
+        with pytest.raises(TypeError, match=msg):
+            (-df["a"])
+
+    def test_invert(self, float_frame):
+        df = float_frame
+
+        tm.assert_frame_equal(-(df < 0), ~(df < 0))
+
+    def test_invert_mixed(self):
+        shape = (10, 5)
+        df = pd.concat(
+            [
+                pd.DataFrame(np.zeros(shape, dtype="bool")),
+                pd.DataFrame(np.zeros(shape, dtype=int)),
+            ],
+            axis=1,
+            ignore_index=True,
+        )
+        result = ~df
+        expected = pd.concat(
+            [
+                pd.DataFrame(np.ones(shape, dtype="bool")),
+                pd.DataFrame(-np.ones(shape, dtype=int)),
+            ],
+            axis=1,
+            ignore_index=True,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_invert_empty_not_input(self):
+        # GH#51032
+        df = pd.DataFrame()
+        result = ~df
+        tm.assert_frame_equal(df, result)
+        assert df is not result
+
+    @pytest.mark.parametrize(
+        "df_data",
+        [
+            [-1, 1],
+            [False, True],
+            pd.to_timedelta([-1, 1]),
+        ],
+    )
+    def test_pos_numeric(self, df_data):
+        # GH#16073
+        df = pd.DataFrame({"a": df_data})
+        tm.assert_frame_equal(+df, df)
+        tm.assert_series_equal(+df["a"], df["a"])
+
+    @pytest.mark.parametrize(
+        "df_data",
+        [
+            np.array([-1, 2], dtype=object),
+            [Decimal("-1.0"), Decimal("2.0")],
+        ],
+    )
+    def test_pos_object(self, df_data):
+        # GH#21380
+        df = pd.DataFrame({"a": df_data})
+        tm.assert_frame_equal(+df, df)
+        tm.assert_series_equal(+df["a"], df["a"])
+
+    @pytest.mark.filterwarnings("ignore:Applying:DeprecationWarning")
+    def test_pos_object_raises(self):
+        # GH#21380
+        df = pd.DataFrame({"a": ["a", "b"]})
+        with pytest.raises(
+            TypeError, match=r"^bad operand type for unary \+: \'str\'$"
+        ):
+            tm.assert_frame_equal(+df, df)
+
+    def test_pos_raises(self):
+        df = pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})
+        msg = r"bad operand type for unary \+: 'DatetimeArray'"
+        with pytest.raises(TypeError, match=msg):
+            (+df)
+        with pytest.raises(TypeError, match=msg):
+            (+df["a"])
+
+    def test_unary_nullable(self):
+        df = pd.DataFrame(
+            {
+                "a": pd.array([1, -2, 3, pd.NA], dtype="Int64"),
+                "b": pd.array([4.0, -5.0, 6.0, pd.NA], dtype="Float32"),
+                "c": pd.array([True, False, False, pd.NA], dtype="boolean"),
+                # include numpy bool to make sure bool-vs-boolean behavior
+                #  is consistent in non-NA locations
+                "d": np.array([True, False, False, True]),
+            }
+        )
+
+        result = +df
+        res_ufunc = np.positive(df)
+        expected = df
+        # TODO: assert that we have copies?
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(res_ufunc, expected)
+
+        result = -df
+        res_ufunc = np.negative(df)
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([-1, 2, -3, pd.NA], dtype="Int64"),
+                "b": pd.array([-4.0, 5.0, -6.0, pd.NA], dtype="Float32"),
+                "c": pd.array([False, True, True, pd.NA], dtype="boolean"),
+                "d": np.array([False, True, True, False]),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(res_ufunc, expected)
+
+        result = abs(df)
+        res_ufunc = np.abs(df)
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, pd.NA], dtype="Int64"),
+                "b": pd.array([4.0, 5.0, 6.0, pd.NA], dtype="Float32"),
+                "c": pd.array([True, False, False, pd.NA], dtype="boolean"),
+                "d": np.array([True, False, False, True]),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(res_ufunc, expected)
diff --git a/pandas/tests/frame/test_validate.py b/pandas/tests/frame/test_validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdeecba29a6177444df6141487505e24d284c285
--- /dev/null
+++ b/pandas/tests/frame/test_validate.py
@@ -0,0 +1,37 @@
+import pytest
+
+from pandas.core.frame import DataFrame
+
+
+class TestDataFrameValidate:
+    """Tests for error handling related to data types of method arguments."""
+
+    @pytest.mark.parametrize(
+        "func",
+        [
+            "query",
+            "eval",
+            "set_index",
+            "reset_index",
+            "dropna",
+            "drop_duplicates",
+            "sort_values",
+        ],
+    )
+    @pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
+    def test_validate_bool_args(self, func, inplace):
+        dataframe = DataFrame({"a": [1, 2], "b": [3, 4]})
+        msg = 'For argument "inplace" expected type bool'
+        kwargs = {"inplace": inplace}
+
+        if func == "query":
+            kwargs["expr"] = "a > b"
+        elif func == "eval":
+            kwargs["expr"] = "a + b"
+        elif func == "set_index":
+            kwargs["keys"] = ["a"]
+        elif func == "sort_values":
+            kwargs["by"] = ["a"]
+
+        with pytest.raises(ValueError, match=msg):
+            getattr(dataframe, func)(**kwargs)
diff --git a/pandas/tests/generic/__init__.py b/pandas/tests/generic/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py
new file mode 100644
index 0000000000000000000000000000000000000000..4de8c8df852f475d0bbea99ec1f0b8bbdbae117a
--- /dev/null
+++ b/pandas/tests/generic/test_duplicate_labels.py
@@ -0,0 +1,390 @@
+"""Tests dealing with the NDFrame.allows_duplicates."""
+
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+not_implemented = pytest.mark.xfail(reason="Not implemented.")
+
+# ----------------------------------------------------------------------------
+# Preservation
+
+
+class TestPreserves:
+    @pytest.mark.parametrize(
+        "cls, data",
+        [
+            (pd.Series, np.array([])),
+            (pd.Series, [1, 2]),
+            (pd.DataFrame, {}),
+            (pd.DataFrame, {"A": [1, 2]}),
+        ],
+    )
+    def test_construction_ok(self, cls, data):
+        result = cls(data)
+        assert result.flags.allows_duplicate_labels is True
+
+        result = cls(data).set_flags(allows_duplicate_labels=False)
+        assert result.flags.allows_duplicate_labels is False
+
+    @pytest.mark.parametrize(
+        "func",
+        [
+            operator.itemgetter(["a"]),
+            operator.methodcaller("add", 1),
+            operator.methodcaller("rename", str.upper),
+            operator.methodcaller("rename", "name"),
+            operator.methodcaller("abs"),
+            np.abs,
+        ],
+    )
+    def test_preserved_series(self, func):
+        s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
+        assert func(s).flags.allows_duplicate_labels is False
+
+    @pytest.mark.parametrize("index", [["a", "b", "c"], ["a", "b"]])
+    # TODO: frame
+    @not_implemented
+    def test_align(self, index):
+        other = pd.Series(0, index=index)
+        s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
+        a, b = s.align(other)
+        assert a.flags.allows_duplicate_labels is False
+        assert b.flags.allows_duplicate_labels is False
+
+    def test_preserved_frame(self):
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
+            allows_duplicate_labels=False
+        )
+        assert df.loc[["a"]].flags.allows_duplicate_labels is False
+        assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False
+
+    def test_to_frame(self):
+        ser = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False)
+        assert ser.to_frame().flags.allows_duplicate_labels is False
+
+    @pytest.mark.parametrize("func", ["add", "sub"])
+    @pytest.mark.parametrize("frame", [False, True])
+    @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")])
+    def test_binops(self, func, other, frame):
+        df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags(
+            allows_duplicate_labels=False
+        )
+        if frame:
+            df = df.to_frame()
+        if isinstance(other, pd.Series) and frame:
+            other = other.to_frame()
+        func = operator.methodcaller(func, other)
+        assert df.flags.allows_duplicate_labels is False
+        assert func(df).flags.allows_duplicate_labels is False
+
+    def test_preserve_getitem(self):
+        df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
+        assert df[["A"]].flags.allows_duplicate_labels is False
+        assert df["A"].flags.allows_duplicate_labels is False
+        assert df.loc[0].flags.allows_duplicate_labels is False
+        assert df.loc[[0]].flags.allows_duplicate_labels is False
+        assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False
+
+    @pytest.mark.parametrize(
+        "objs, kwargs",
+        [
+            # Series
+            (
+                [
+                    pd.Series(1, index=["a", "b"]),
+                    pd.Series(2, index=["c", "d"]),
+                ],
+                {},
+            ),
+            (
+                [
+                    pd.Series(1, index=["a", "b"]),
+                    pd.Series(2, index=["a", "b"]),
+                ],
+                {"ignore_index": True},
+            ),
+            (
+                [
+                    pd.Series(1, index=["a", "b"]),
+                    pd.Series(2, index=["a", "b"]),
+                ],
+                {"axis": 1},
+            ),
+            # Frame
+            (
+                [
+                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
+                    pd.DataFrame({"A": [1, 2]}, index=["c", "d"]),
+                ],
+                {},
+            ),
+            (
+                [
+                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
+                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
+                ],
+                {"ignore_index": True},
+            ),
+            (
+                [
+                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
+                    pd.DataFrame({"B": [1, 2]}, index=["a", "b"]),
+                ],
+                {"axis": 1},
+            ),
+            # Series / Frame
+            (
+                [
+                    pd.DataFrame({"A": [1, 2]}, index=["a", "b"]),
+                    pd.Series([1, 2], index=["a", "b"], name="B"),
+                ],
+                {"axis": 1},
+            ),
+        ],
+    )
+    def test_concat(self, objs, kwargs):
+        objs = [x.set_flags(allows_duplicate_labels=False) for x in objs]
+        result = pd.concat(objs, **kwargs)
+        assert result.flags.allows_duplicate_labels is False
+
+    @pytest.mark.parametrize(
+        "left, right, expected",
+        [
+            # false false false
+            pytest.param(
+                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
+                    allows_duplicate_labels=False
+                ),
+                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
+                    allows_duplicate_labels=False
+                ),
+                False,
+            ),
+            # false true false
+            pytest.param(
+                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
+                    allows_duplicate_labels=False
+                ),
+                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
+                False,
+            ),
+            # true true true
+            (
+                pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
+                pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
+                True,
+            ),
+        ],
+    )
+    def test_merge(self, left, right, expected):
+        result = pd.merge(left, right, left_index=True, right_index=True)
+        assert result.flags.allows_duplicate_labels is expected
+
+    @not_implemented
+    def test_groupby(self):
+        # XXX: This is under tested
+        # TODO:
+        #  - apply
+        #  - transform
+        #  - Should passing a grouper that disallows duplicates propagate?
+        df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False)
+        result = df.groupby([0, 0, 1]).agg("count")
+        assert result.flags.allows_duplicate_labels is False
+
+    @pytest.mark.parametrize("frame", [True, False])
+    @not_implemented
+    def test_window(self, frame):
+        df = pd.Series(
+            1,
+            index=pd.date_range("2000", periods=12),
+            name="A",
+            allows_duplicate_labels=False,
+        )
+        if frame:
+            df = df.to_frame()
+        assert df.rolling(3).mean().flags.allows_duplicate_labels is False
+        assert df.ewm(3).mean().flags.allows_duplicate_labels is False
+        assert df.expanding(3).mean().flags.allows_duplicate_labels is False
+
+
+# ----------------------------------------------------------------------------
+# Raises
+
+
+class TestRaises:
+    @pytest.mark.parametrize(
+        "cls, axes",
+        [
+            (pd.Series, {"index": ["a", "a"], "dtype": float}),
+            (pd.DataFrame, {"index": ["a", "a"]}),
+            (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}),
+            (pd.DataFrame, {"columns": ["b", "b"]}),
+        ],
+    )
+    def test_set_flags_with_duplicates(self, cls, axes):
+        result = cls(**axes)
+        assert result.flags.allows_duplicate_labels is True
+
+        msg = "Index has duplicates."
+        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+            cls(**axes).set_flags(allows_duplicate_labels=False)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            pd.Series(index=[0, 0], dtype=float),
+            pd.DataFrame(index=[0, 0]),
+            pd.DataFrame(columns=[0, 0]),
+        ],
+    )
+    def test_setting_allows_duplicate_labels_raises(self, data):
+        msg = "Index has duplicates."
+        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+            data.flags.allows_duplicate_labels = False
+
+        assert data.flags.allows_duplicate_labels is True
+
+    def test_series_raises(self):
+        a = pd.Series(0, index=["a", "b"])
+        b = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
+        msg = "Index has duplicates."
+        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+            pd.concat([a, b])
+
+    @pytest.mark.parametrize(
+        "getter, target",
+        [
+            (operator.itemgetter(["A", "A"]), None),
+            # loc
+            (operator.itemgetter(["a", "a"]), "loc"),
+            pytest.param(operator.itemgetter(("a", ["A", "A"])), "loc"),
+            (operator.itemgetter((["a", "a"], "A")), "loc"),
+            # iloc
+            (operator.itemgetter([0, 0]), "iloc"),
+            pytest.param(operator.itemgetter((0, [0, 0])), "iloc"),
+            pytest.param(operator.itemgetter(([0, 0], 0)), "iloc"),
+        ],
+    )
+    def test_getitem_raises(self, getter, target):
+        df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
+            allows_duplicate_labels=False
+        )
+        if target:
+            # df, df.loc, or df.iloc
+            target = getattr(df, target)
+        else:
+            target = df
+
+        msg = "Index has duplicates."
+        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+            getter(target)
+
+    def test_concat_raises(self):
+        objs = [
+            pd.Series(1, index=[0, 1], name="a"),
+            pd.Series(2, index=[0, 1], name="a"),
+        ]
+        objs = [x.set_flags(allows_duplicate_labels=False) for x in objs]
+        msg = "Index has duplicates."
+        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+            pd.concat(objs, axis=1)
+
+    def test_merge_raises(self):
+        a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
+            allows_duplicate_labels=False
+        )
+        b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"])
+        msg = "Index has duplicates."
+        with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+            pd.merge(a, b, left_index=True, right_index=True)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.Index([1, 1]),
+        pd.Index(["a", "a"]),
+        pd.Index([1.1, 1.1]),
+        pd.PeriodIndex([pd.Period("2000", "D")] * 2),
+        pd.DatetimeIndex([pd.Timestamp("2000")] * 2),
+        pd.TimedeltaIndex([pd.Timedelta("1D")] * 2),
+        pd.CategoricalIndex(["a", "a"]),
+        pd.IntervalIndex([pd.Interval(0, 1)] * 2),
+        pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def test_raises_basic(idx):
+    msg = "Index has duplicates."
+    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+        pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False)
+
+    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+        pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False)
+
+    with pytest.raises(pd.errors.DuplicateLabelError, match=msg):
+        pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False)
+
+
+def test_format_duplicate_labels_message():
+    idx = pd.Index(["a", "b", "a", "b", "c"])
+    result = idx._format_duplicate_message()
+    expected = pd.DataFrame(
+        {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_format_duplicate_labels_message_multi():
+    idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]])
+    result = idx._format_duplicate_message()
+    expected = pd.DataFrame(
+        {"positions": [[0, 2], [1, 3]]},
+        index=pd.MultiIndex.from_product([["A"], ["a", "b"]]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dataframe_insert_raises():
+    df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
+    msg = "Cannot specify"
+    with pytest.raises(ValueError, match=msg):
+        df.insert(0, "A", [3, 4], allow_duplicates=True)
+
+
+@pytest.mark.parametrize(
+    "method, frame_only",
+    [
+        (operator.methodcaller("set_index", "A", inplace=True), True),
+        (operator.methodcaller("reset_index", inplace=True), True),
+        (operator.methodcaller("rename", lambda x: x, inplace=True), False),
+    ],
+)
+def test_inplace_raises(method, frame_only):
+    df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags(
+        allows_duplicate_labels=False
+    )
+    s = df["A"]
+    s.flags.allows_duplicate_labels = False
+    msg = "Cannot specify"
+
+    with pytest.raises(ValueError, match=msg):
+        method(df)
+    if not frame_only:
+        with pytest.raises(ValueError, match=msg):
+            method(s)
+
+
+def test_pickle(temp_file):
+    a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False)
+    b = tm.round_trip_pickle(a, temp_file)
+    tm.assert_series_equal(a, b)
+
+    a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False)
+    b = tm.round_trip_pickle(a, temp_file)
+    tm.assert_frame_equal(a, b)
diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py
new file mode 100644
index 0000000000000000000000000000000000000000..de972f2f2f9c7a4101d8e00a3db22e17fa464822
--- /dev/null
+++ b/pandas/tests/generic/test_finalize.py
@@ -0,0 +1,792 @@
+"""
+An exhaustive list of pandas methods exercising NDFrame.__finalize__.
+"""
+
+from copy import deepcopy
+from datetime import time
+import operator
+import re
+
+import numpy as np
+import pytest
+
+from pandas._typing import MergeHow
+
+import pandas as pd
+
+# TODO:
+# * Binary methods (mul, div, etc.)
+# * Binary outputs (align, etc.)
+# * top-level methods (concat, merge, get_dummies, etc.)
+# * window
+# * cumulative reductions
+
+not_implemented_mark = pytest.mark.xfail(reason="not implemented")
+
+mi = pd.MultiIndex.from_product([["a", "b"], [0, 1]], names=["A", "B"])
+
+frame_data = ({"A": [1]},)
+frame_mi_data = ({"A": [1, 2, 3, 4]}, mi)
+
+
+# Tuple of
+# - Callable: Constructor (Series, DataFrame)
+# - Tuple: Constructor args
+# - Callable: pass the constructed value with attrs set to this.
+
+_all_methods = [
+    (pd.Series, ([0],), operator.methodcaller("take", [])),
+    (pd.Series, ([0],), operator.methodcaller("__getitem__", [True])),
+    (pd.Series, ([0],), operator.methodcaller("repeat", 2)),
+    (pd.Series, ([0],), operator.methodcaller("reset_index")),
+    (pd.Series, ([0],), operator.methodcaller("reset_index", drop=True)),
+    (pd.Series, ([0],), operator.methodcaller("to_frame")),
+    (pd.Series, ([0, 0],), operator.methodcaller("drop_duplicates")),
+    (pd.Series, ([0, 0],), operator.methodcaller("duplicated")),
+    (pd.Series, ([0, 0],), operator.methodcaller("round")),
+    (pd.Series, ([0, 0],), operator.methodcaller("rename", lambda x: x + 1)),
+    (pd.Series, ([0, 0],), operator.methodcaller("rename", "name")),
+    (pd.Series, ([0, 0],), operator.methodcaller("set_axis", ["a", "b"])),
+    (pd.Series, ([0, 0],), operator.methodcaller("reindex", [1, 0])),
+    (pd.Series, ([0, 0],), operator.methodcaller("drop", [0])),
+    (pd.Series, (pd.array([0, pd.NA]),), operator.methodcaller("fillna", 0)),
+    (pd.Series, ([0, 0],), operator.methodcaller("replace", {0: 1})),
+    (pd.Series, ([0, 0],), operator.methodcaller("shift")),
+    (pd.Series, ([0, 0],), operator.methodcaller("isin", [0, 1])),
+    (pd.Series, ([0, 0],), operator.methodcaller("between", 0, 2)),
+    (pd.Series, ([0, 0],), operator.methodcaller("isna")),
+    (pd.Series, ([0, 0],), operator.methodcaller("isnull")),
+    (pd.Series, ([0, 0],), operator.methodcaller("notna")),
+    (pd.Series, ([0, 0],), operator.methodcaller("notnull")),
+    (pd.Series, ([1],), operator.methodcaller("add", pd.Series([1]))),
+    # TODO: mul, div, etc.
+    (
+        pd.Series,
+        ([0], pd.period_range("2000", periods=1)),
+        operator.methodcaller("to_timestamp"),
+    ),
+    (
+        pd.Series,
+        ([0], pd.date_range("2000", periods=1)),
+        operator.methodcaller("to_period"),
+    ),
+    pytest.param(
+        (
+            pd.DataFrame,
+            frame_data,
+            operator.methodcaller("dot", pd.DataFrame(index=["A"])),
+        ),
+        marks=pytest.mark.xfail(reason="Implement binary finalize"),
+    ),
+    (pd.DataFrame, frame_data, operator.methodcaller("transpose")),
+    (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", "A")),
+    (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", ["A"])),
+    (pd.DataFrame, frame_data, operator.methodcaller("__getitem__", np.array([True]))),
+    (pd.DataFrame, ({("A", "a"): [1]},), operator.methodcaller("__getitem__", ["A"])),
+    (pd.DataFrame, frame_data, operator.methodcaller("query", "A == 1")),
+    (pd.DataFrame, frame_data, operator.methodcaller("eval", "A + 1", engine="python")),
+    (pd.DataFrame, frame_data, operator.methodcaller("select_dtypes", include="int")),
+    (pd.DataFrame, frame_data, operator.methodcaller("assign", b=1)),
+    (pd.DataFrame, frame_data, operator.methodcaller("set_axis", ["A"])),
+    (pd.DataFrame, frame_data, operator.methodcaller("reindex", [0, 1])),
+    (pd.DataFrame, frame_data, operator.methodcaller("drop", columns=["A"])),
+    (pd.DataFrame, frame_data, operator.methodcaller("drop", index=[0])),
+    (pd.DataFrame, frame_data, operator.methodcaller("rename", columns={"A": "a"})),
+    (pd.DataFrame, frame_data, operator.methodcaller("rename", index=lambda x: x)),
+    (pd.DataFrame, frame_data, operator.methodcaller("fillna", "A")),
+    (pd.DataFrame, frame_data, operator.methodcaller("set_index", "A")),
+    (pd.DataFrame, frame_data, operator.methodcaller("reset_index")),
+    (pd.DataFrame, frame_data, operator.methodcaller("isna")),
+    (pd.DataFrame, frame_data, operator.methodcaller("isnull")),
+    (pd.DataFrame, frame_data, operator.methodcaller("notna")),
+    (pd.DataFrame, frame_data, operator.methodcaller("notnull")),
+    (pd.DataFrame, frame_data, operator.methodcaller("dropna")),
+    (pd.DataFrame, frame_data, operator.methodcaller("drop_duplicates")),
+    (pd.DataFrame, frame_data, operator.methodcaller("duplicated")),
+    (pd.DataFrame, frame_data, operator.methodcaller("sort_values", by="A")),
+    (pd.DataFrame, frame_data, operator.methodcaller("sort_index")),
+    (pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")),
+    (pd.DataFrame, frame_data, operator.methodcaller("nsmallest", 1, "A")),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel")),
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("add", pd.DataFrame(*frame_data)),
+    ),
+    # TODO: div, mul, etc.
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("combine", pd.DataFrame(*frame_data), operator.add),
+    ),
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("combine_first", pd.DataFrame(*frame_data)),
+    ),
+    pytest.param(
+        (
+            pd.DataFrame,
+            frame_data,
+            operator.methodcaller("update", pd.DataFrame(*frame_data)),
+        ),
+        marks=not_implemented_mark,
+    ),
+    (pd.DataFrame, frame_data, operator.methodcaller("pivot", columns="A")),
+    (
+        pd.DataFrame,
+        ({"A": [1], "B": [1]},),
+        operator.methodcaller("pivot_table", columns="A"),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1], "B": [1]},),
+        operator.methodcaller("pivot_table", columns="A", aggfunc=["mean", "sum"]),
+    ),
+    (pd.DataFrame, frame_data, operator.methodcaller("stack")),
+    (pd.DataFrame, frame_data, operator.methodcaller("explode", "A")),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack")),
+    (
+        pd.DataFrame,
+        ({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]},),
+        operator.methodcaller("melt", id_vars=["A"], value_vars=["B"]),
+    ),
+    (pd.DataFrame, frame_data, operator.methodcaller("map", lambda x: x)),
+    (pd.DataFrame, frame_data, operator.methodcaller("round", 2)),
+    (pd.DataFrame, frame_data, operator.methodcaller("corr")),
+    pytest.param(
+        (pd.DataFrame, frame_data, operator.methodcaller("cov")),
+        marks=[
+            pytest.mark.filterwarnings("ignore::RuntimeWarning"),
+        ],
+    ),
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("corrwith", pd.DataFrame(*frame_data)),
+    ),
+    (pd.DataFrame, frame_data, operator.methodcaller("count")),
+    (pd.DataFrame, frame_data, operator.methodcaller("nunique")),
+    (pd.DataFrame, frame_data, operator.methodcaller("idxmin")),
+    (pd.DataFrame, frame_data, operator.methodcaller("idxmax")),
+    (pd.DataFrame, frame_data, operator.methodcaller("mode")),
+    (pd.Series, [0], operator.methodcaller("mode")),
+    (pd.DataFrame, frame_data, operator.methodcaller("median")),
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("quantile", numeric_only=True),
+    ),
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("quantile", q=[0.25, 0.75], numeric_only=True),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [pd.Timedelta(days=1), pd.Timedelta(days=2)]},),
+        operator.methodcaller("quantile", numeric_only=False),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [np.datetime64("2022-01-01"), np.datetime64("2022-01-02")]},),
+        operator.methodcaller("quantile", numeric_only=True),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1]}, [pd.Period("2000", "D")]),
+        operator.methodcaller("to_timestamp"),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1]}, [pd.Timestamp("2000")]),
+        operator.methodcaller("to_period", freq="D"),
+    ),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", [1])),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("isin", pd.Series([1]))),
+    (
+        pd.DataFrame,
+        frame_mi_data,
+        operator.methodcaller("isin", pd.DataFrame({"A": [1]})),
+    ),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("droplevel", "A")),
+    (pd.DataFrame, frame_data, operator.methodcaller("pop", "A")),
+    # Squeeze on columns, otherwise we'll end up with a scalar
+    (pd.DataFrame, frame_data, operator.methodcaller("squeeze", axis="columns")),
+    (pd.Series, ([1, 2],), operator.methodcaller("squeeze")),
+    (pd.Series, ([1, 2],), operator.methodcaller("rename_axis", index="a")),
+    (pd.DataFrame, frame_data, operator.methodcaller("rename_axis", columns="a")),
+    # Unary ops
+    (pd.DataFrame, frame_data, operator.neg),
+    (pd.Series, [1], operator.neg),
+    (pd.DataFrame, frame_data, operator.pos),
+    (pd.Series, [1], operator.pos),
+    (pd.DataFrame, frame_data, operator.inv),
+    (pd.Series, [1], operator.inv),
+    (pd.DataFrame, frame_data, abs),
+    (pd.Series, [1], abs),
+    (pd.DataFrame, frame_data, round),
+    (pd.Series, [1], round),
+    (pd.DataFrame, frame_data, operator.methodcaller("take", [0, 0])),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("xs", "a")),
+    (pd.Series, (1, mi), operator.methodcaller("xs", "a")),
+    (pd.DataFrame, frame_data, operator.methodcaller("get", "A")),
+    (
+        pd.DataFrame,
+        frame_data,
+        operator.methodcaller("reindex_like", pd.DataFrame({"A": [1, 2, 3]})),
+    ),
+    (
+        pd.Series,
+        frame_data,
+        operator.methodcaller("reindex_like", pd.Series([0, 1, 2])),
+    ),
+    (pd.DataFrame, frame_data, operator.methodcaller("add_prefix", "_")),
+    (pd.DataFrame, frame_data, operator.methodcaller("add_suffix", "_")),
+    (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_prefix", "_")),
+    (pd.Series, (1, ["a", "b"]), operator.methodcaller("add_suffix", "_")),
+    (pd.Series, ([3, 2],), operator.methodcaller("sort_values")),
+    (pd.Series, ([1] * 10,), operator.methodcaller("head")),
+    (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("head")),
+    (pd.Series, ([1] * 10,), operator.methodcaller("tail")),
+    (pd.DataFrame, ({"A": [1] * 10},), operator.methodcaller("tail")),
+    (pd.Series, ([1, 2],), operator.methodcaller("sample", n=2, replace=True)),
+    (pd.DataFrame, (frame_data,), operator.methodcaller("sample", n=2, replace=True)),
+    (pd.Series, ([1, 2],), operator.methodcaller("astype", float)),
+    (pd.DataFrame, frame_data, operator.methodcaller("astype", float)),
+    (pd.Series, ([1, 2],), operator.methodcaller("copy")),
+    (pd.DataFrame, frame_data, operator.methodcaller("copy")),
+    (pd.Series, ([1, 2], None, object), operator.methodcaller("infer_objects")),
+    (
+        pd.DataFrame,
+        ({"A": np.array([1, 2], dtype=object)},),
+        operator.methodcaller("infer_objects"),
+    ),
+    (pd.Series, ([1, 2],), operator.methodcaller("convert_dtypes")),
+    (pd.DataFrame, frame_data, operator.methodcaller("convert_dtypes")),
+    (pd.Series, ([1, None, 3],), operator.methodcaller("interpolate")),
+    (pd.DataFrame, ({"A": [1, None, 3]},), operator.methodcaller("interpolate")),
+    (pd.Series, ([1, 2],), operator.methodcaller("clip", lower=1)),
+    (pd.DataFrame, frame_data, operator.methodcaller("clip", lower=1)),
+    (
+        pd.Series,
+        (1, pd.date_range("2000", periods=4)),
+        operator.methodcaller("asfreq", "h"),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)),
+        operator.methodcaller("asfreq", "h"),
+    ),
+    (
+        pd.Series,
+        (1, pd.date_range("2000", periods=4)),
+        operator.methodcaller("at_time", time(12)),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)),
+        operator.methodcaller("at_time", time(12)),
+    ),
+    (
+        pd.Series,
+        (1, pd.date_range("2000", periods=4)),
+        operator.methodcaller("between_time", "12:00", "13:00"),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)),
+        operator.methodcaller("between_time", "12:00", "13:00"),
+    ),
+    (pd.Series, ([1, 2],), operator.methodcaller("rank")),
+    (pd.DataFrame, frame_data, operator.methodcaller("rank")),
+    (pd.Series, ([1, 2],), operator.methodcaller("where", np.array([True, False]))),
+    (pd.DataFrame, frame_data, operator.methodcaller("where", np.array([[True]]))),
+    (pd.Series, ([1, 2],), operator.methodcaller("mask", np.array([True, False]))),
+    (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))),
+    (pd.Series, ([1, 2],), operator.methodcaller("truncate", before=0)),
+    (pd.DataFrame, frame_data, operator.methodcaller("truncate", before=0)),
+    (
+        pd.Series,
+        (1, pd.date_range("2000", periods=4, tz="UTC")),
+        operator.methodcaller("tz_convert", "CET"),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4, tz="UTC")),
+        operator.methodcaller("tz_convert", "CET"),
+    ),
+    (
+        pd.Series,
+        (1, pd.date_range("2000", periods=4)),
+        operator.methodcaller("tz_localize", "CET"),
+    ),
+    (
+        pd.DataFrame,
+        ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)),
+        operator.methodcaller("tz_localize", "CET"),
+    ),
+    (pd.Series, ([1, 2],), operator.methodcaller("describe")),
+    (pd.DataFrame, frame_data, operator.methodcaller("describe")),
+    (pd.Series, ([1, 2],), operator.methodcaller("pct_change")),
+    (pd.DataFrame, frame_data, operator.methodcaller("pct_change")),
+    (pd.Series, ([1],), operator.methodcaller("transform", lambda x: x - x.min())),
+    (
+        pd.DataFrame,
+        frame_mi_data,
+        operator.methodcaller("transform", lambda x: x - x.min()),
+    ),
+    (pd.Series, ([1],), operator.methodcaller("apply", lambda x: x)),
+    (pd.DataFrame, frame_mi_data, operator.methodcaller("apply", lambda x: x)),
+    # Cumulative reductions
+    (pd.Series, ([1],), operator.methodcaller("cumsum")),
+    (pd.DataFrame, frame_data, operator.methodcaller("cumsum")),
+    (pd.Series, ([1],), operator.methodcaller("cummin")),
+    (pd.DataFrame, frame_data, operator.methodcaller("cummin")),
+    (pd.Series, ([1],), operator.methodcaller("cummax")),
+    (pd.DataFrame, frame_data, operator.methodcaller("cummax")),
+    (pd.Series, ([1],), operator.methodcaller("cumprod")),
+    (pd.DataFrame, frame_data, operator.methodcaller("cumprod")),
+    # Reductions
+    (pd.DataFrame, frame_data, operator.methodcaller("any")),
+    (pd.DataFrame, frame_data, operator.methodcaller("all")),
+    (pd.DataFrame, frame_data, operator.methodcaller("min")),
+    (pd.DataFrame, frame_data, operator.methodcaller("max")),
+    (pd.DataFrame, frame_data, operator.methodcaller("sum")),
+    (pd.DataFrame, frame_data, operator.methodcaller("std")),
+    (pd.DataFrame, frame_data, operator.methodcaller("mean")),
+    (pd.DataFrame, frame_data, operator.methodcaller("prod")),
+    (pd.DataFrame, frame_data, operator.methodcaller("sem")),
+    (pd.DataFrame, frame_data, operator.methodcaller("skew")),
+    (pd.DataFrame, frame_data, operator.methodcaller("kurt")),
+]
+
+
+def idfn(x):
+    xpr = re.compile(r"'(.*)?'")
+    m = xpr.search(str(x))
+    if m:
+        return m.group(1)
+    else:
+        return str(x)
+
+
+@pytest.mark.parametrize("ndframe_method", _all_methods, ids=lambda x: idfn(x[-1]))
+def test_finalize_called(ndframe_method):
+    cls, init_args, method = ndframe_method
+    ndframe = cls(*init_args)
+
+    ndframe.attrs = {"a": 1}
+    result = method(ndframe)
+
+    assert result.attrs == {"a": 1}
+
+
+@not_implemented_mark
+def test_finalize_called_eval_numexpr():
+    pytest.importorskip("numexpr")
+    df = pd.DataFrame({"A": [1, 2]})
+    df.attrs["A"] = 1
+    result = df.eval("A + 1", engine="numexpr")
+    assert result.attrs == {"A": 1}
+
+
+# ----------------------------------------------------------------------------
+# Binary operations
+
+
+@pytest.mark.parametrize("annotate", ["left", "right", "both"])
+@pytest.mark.parametrize(
+    "args",
+    [
+        (1, pd.Series([1])),
+        (1, pd.DataFrame({"A": [1]})),
+        (pd.Series([1]), 1),
+        (pd.DataFrame({"A": [1]}), 1),
+        (pd.Series([1]), pd.Series([1])),
+        (pd.DataFrame({"A": [1]}), pd.DataFrame({"A": [1]})),
+        (pd.Series([1]), pd.DataFrame({"A": [1]})),
+        (pd.DataFrame({"A": [1]}), pd.Series([1])),
+    ],
+    ids=lambda x: f"({type(x[0]).__name__},{type(x[1]).__name__})",
+)
+def test_binops(request, args, annotate, all_binary_operators):
+    # This generates 624 tests... Is that needed?
+    left, right = args
+    if isinstance(left, (pd.DataFrame, pd.Series)):
+        left.attrs = {}
+    if isinstance(right, (pd.DataFrame, pd.Series)):
+        right.attrs = {}
+
+    if annotate == "left" and isinstance(left, int):
+        pytest.skip("left is an int and doesn't support .attrs")
+    if annotate == "right" and isinstance(right, int):
+        pytest.skip("right is an int and doesn't support .attrs")
+
+    if annotate in {"left", "both"} and not isinstance(left, int):
+        left.attrs = {"a": 1}
+    if annotate in {"right", "both"} and not isinstance(right, int):
+        right.attrs = {"a": 1}
+
+    is_cmp = all_binary_operators in [
+        operator.eq,
+        operator.ne,
+        operator.gt,
+        operator.ge,
+        operator.lt,
+        operator.le,
+    ]
+    if is_cmp and isinstance(left, pd.DataFrame) and isinstance(right, pd.Series):
+        # in 2.0 silent alignment on comparisons was removed xref GH#28759
+        left, right = left.align(right, axis=1)
+    elif is_cmp and isinstance(left, pd.Series) and isinstance(right, pd.DataFrame):
+        right, left = right.align(left, axis=1)
+
+    result = all_binary_operators(left, right)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize("left", [pd.Series, pd.DataFrame])
+@pytest.mark.parametrize("right", [pd.Series, pd.DataFrame])
+def test_attrs_binary_operations(all_binary_operators, left, right):
+    # GH 51607
+    attrs = {"a": 1}
+    left = left([1])
+    left.attrs = attrs
+    right = right([2])
+    assert all_binary_operators(left, right).attrs == attrs
+    assert all_binary_operators(right, left).attrs == attrs
+
+
+# ----------------------------------------------------------------------------
+# Accessors
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        operator.methodcaller("capitalize"),
+        operator.methodcaller("casefold"),
+        operator.methodcaller("cat", ["a"]),
+        operator.methodcaller("contains", "a"),
+        operator.methodcaller("count", "a"),
+        operator.methodcaller("encode", "utf-8"),
+        operator.methodcaller("endswith", "a"),
+        operator.methodcaller("extract", r"(\w)(\d)"),
+        operator.methodcaller("extract", r"(\w)(\d)", expand=False),
+        operator.methodcaller("find", "a"),
+        operator.methodcaller("findall", "a"),
+        operator.methodcaller("get", 0),
+        operator.methodcaller("index", "a"),
+        operator.methodcaller("len"),
+        operator.methodcaller("ljust", 4),
+        operator.methodcaller("lower"),
+        operator.methodcaller("lstrip"),
+        operator.methodcaller("match", r"\w"),
+        operator.methodcaller("normalize", "NFC"),
+        operator.methodcaller("pad", 4),
+        operator.methodcaller("partition", "a"),
+        operator.methodcaller("repeat", 2),
+        operator.methodcaller("replace", "a", "b"),
+        operator.methodcaller("rfind", "a"),
+        operator.methodcaller("rindex", "a"),
+        operator.methodcaller("rjust", 4),
+        operator.methodcaller("rpartition", "a"),
+        operator.methodcaller("rstrip"),
+        operator.methodcaller("slice", 4),
+        operator.methodcaller("slice_replace", 1, repl="a"),
+        operator.methodcaller("startswith", "a"),
+        operator.methodcaller("strip"),
+        operator.methodcaller("swapcase"),
+        operator.methodcaller("translate", {"a": "b"}),
+        operator.methodcaller("upper"),
+        operator.methodcaller("wrap", 4),
+        operator.methodcaller("zfill", 4),
+        operator.methodcaller("isalnum"),
+        operator.methodcaller("isalpha"),
+        operator.methodcaller("isdigit"),
+        operator.methodcaller("isspace"),
+        operator.methodcaller("islower"),
+        operator.methodcaller("isupper"),
+        operator.methodcaller("istitle"),
+        operator.methodcaller("isnumeric"),
+        operator.methodcaller("isdecimal"),
+        operator.methodcaller("get_dummies"),
+    ],
+    ids=idfn,
+)
+def test_string_method(method):
+    s = pd.Series(["a1"])
+    s.attrs = {"a": 1}
+    result = method(s.str)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        operator.methodcaller("to_period"),
+        operator.methodcaller("tz_localize", "CET"),
+        operator.methodcaller("normalize"),
+        operator.methodcaller("strftime", "%Y"),
+        operator.methodcaller("round", "h"),
+        operator.methodcaller("floor", "h"),
+        operator.methodcaller("ceil", "h"),
+        operator.methodcaller("month_name"),
+        operator.methodcaller("day_name"),
+    ],
+    ids=idfn,
+)
+def test_datetime_method(method):
+    s = pd.Series(pd.date_range("2000", periods=4))
+    s.attrs = {"a": 1}
+    result = method(s.dt)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        "date",
+        "time",
+        "timetz",
+        "year",
+        "month",
+        "day",
+        "hour",
+        "minute",
+        "second",
+        "microsecond",
+        "nanosecond",
+        "dayofweek",
+        "day_of_week",
+        "dayofyear",
+        "day_of_year",
+        "quarter",
+        "is_month_start",
+        "is_month_end",
+        "is_quarter_start",
+        "is_quarter_end",
+        "is_year_start",
+        "is_year_end",
+        "is_leap_year",
+        "daysinmonth",
+        "days_in_month",
+    ],
+)
+def test_datetime_property(attr):
+    s = pd.Series(pd.date_range("2000", periods=4))
+    s.attrs = {"a": 1}
+    result = getattr(s.dt, attr)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"]
+)
+def test_timedelta_property(attr):
+    s = pd.Series(pd.timedelta_range("2000", periods=4))
+    s.attrs = {"a": 1}
+    result = getattr(s.dt, attr)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize("method", [operator.methodcaller("total_seconds")])
+def test_timedelta_methods(method):
+    s = pd.Series(pd.timedelta_range("2000", periods=4))
+    s.attrs = {"a": 1}
+    result = method(s.dt)
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        operator.methodcaller("add_categories", ["c"]),
+        operator.methodcaller("as_ordered"),
+        operator.methodcaller("as_unordered"),
+        lambda x: x.codes,
+        operator.methodcaller("remove_categories", "a"),
+        operator.methodcaller("remove_unused_categories"),
+        operator.methodcaller("rename_categories", {"a": "A", "b": "B"}),
+        operator.methodcaller("reorder_categories", ["b", "a"]),
+        operator.methodcaller("set_categories", ["A", "B"]),
+    ],
+)
+@not_implemented_mark
+def test_categorical_accessor(method):
+    s = pd.Series(["a", "b"], dtype="category")
+    s.attrs = {"a": 1}
+    result = method(s.cat)
+    assert result.attrs == {"a": 1}
+
+
+# ----------------------------------------------------------------------------
+# Groupby
+
+
+@pytest.mark.parametrize(
+    "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})]
+)
+@pytest.mark.parametrize(
+    "method",
+    [
+        operator.methodcaller("sum"),
+        lambda x: x.apply(lambda y: y),
+        lambda x: x.agg("sum"),
+        lambda x: x.agg("mean"),
+        lambda x: x.agg("median"),
+    ],
+)
+def test_groupby_finalize(obj, method):
+    obj.attrs = {"a": 1}
+    result = method(obj.groupby([0, 0], group_keys=False))
+    assert result.attrs == {"a": 1}
+
+
+@pytest.mark.parametrize(
+    "obj", [pd.Series([0, 0]), pd.DataFrame({"A": [0, 1], "B": [1, 2]})]
+)
+@pytest.mark.parametrize(
+    "method",
+    [
+        lambda x: x.agg(["sum", "count"]),
+        lambda x: x.agg("std"),
+        lambda x: x.agg("var"),
+        lambda x: x.agg("sem"),
+        lambda x: x.agg("size"),
+        lambda x: x.agg("ohlc"),
+    ],
+)
+@not_implemented_mark
+def test_groupby_finalize_not_implemented(obj, method):
+    obj.attrs = {"a": 1}
+    result = method(obj.groupby([0, 0]))
+    assert result.attrs == {"a": 1}
+
+
+def test_finalize_frame_series_name():
+    # https://github.com/pandas-dev/pandas/pull/37186/files#r506978889
+    # ensure we don't copy the column `name` to the Series.
+    df = pd.DataFrame({"name": [1, 2]})
+    result = pd.Series([1, 2]).__finalize__(df)
+    assert result.name is None
+
+
+# ----------------------------------------------------------------------------
+# Merge
+
+
+@pytest.mark.parametrize(
+    ["allow_on_left", "allow_on_right"],
+    [(False, False), (False, True), (True, False), (True, True)],
+)
+@pytest.mark.parametrize(
+    "how",
+    [
+        "left",
+        "right",
+        "inner",
+        "outer",
+        "left_anti",
+        "right_anti",
+        "cross",
+    ],
+)
+def test_merge_correctly_sets_duplication_allowance_flag(
+    how: MergeHow,
+    allow_on_left: bool,
+    allow_on_right: bool,
+):
+    left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left)
+    right = pd.DataFrame({"test": [1]}).set_flags(
+        allows_duplicate_labels=allow_on_right,
+    )
+
+    if not how == "cross":
+        result = left.merge(right, how=how, on="test")
+    else:
+        result = left.merge(right, how=how)
+
+    expected_duplication_allowance = allow_on_left and allow_on_right
+    assert result.flags.allows_duplicate_labels == expected_duplication_allowance
+
+
+@pytest.mark.parametrize(
+    ["allow_on_left", "allow_on_right"],
+    [(False, False), (False, True), (True, False), (True, True)],
+)
+def test_merge_asof_correctly_sets_duplication_allowance_flag(
+    allow_on_left: bool,
+    allow_on_right: bool,
+):
+    left = pd.DataFrame({"test": [1]}).set_flags(allows_duplicate_labels=allow_on_left)
+    right = pd.DataFrame({"test": [1]}).set_flags(
+        allows_duplicate_labels=allow_on_right,
+    )
+
+    result = pd.merge_asof(left, right)
+
+    expected_duplication_allowance = allow_on_left and allow_on_right
+    assert result.flags.allows_duplicate_labels == expected_duplication_allowance
+
+
+def test_merge_propagates_metadata_from_equal_input_metadata():
+    metadata = {"a": [1, 2]}
+    left = pd.DataFrame({"test": [1]})
+    left.attrs = metadata
+    right = pd.DataFrame({"test": [1]})
+    right.attrs = deepcopy(metadata)
+
+    result = left.merge(right, how="inner", on="test")
+
+    assert result.attrs == metadata
+
+    # Verify that merge deep-copies the attr dictionary.
+    assert result.attrs is not left.attrs
+    assert result.attrs is not right.attrs
+    assert result.attrs["a"] is not left.attrs["a"]
+    assert result.attrs["a"] is not right.attrs["a"]
+
+
+def test_merge_does_not_propagate_metadata_from_unequal_input_metadata():
+    left = pd.DataFrame({"test": [1]})
+    left.attrs = {"a": 2}
+    right = pd.DataFrame({"test": [1]})
+    right.attrs = {"b": 3}
+
+    result = left.merge(right, how="inner", on="test")
+
+    assert result.attrs == {}
+
+
+@pytest.mark.parametrize(
+    ["left_has_metadata", "right_has_metadata", "expected"],
+    [
+        (False, True, {}),
+        (True, False, {}),
+        (False, False, {}),
+    ],
+    ids=["left-empty", "right-empty", "both-empty"],
+)
+def test_merge_does_not_propagate_metadata_if_one_input_has_no_metadata(
+    left_has_metadata: bool,
+    right_has_metadata: bool,
+    expected: dict,
+):
+    left = pd.DataFrame({"test": [1]})
+    right = pd.DataFrame({"test": [1]})
+
+    if left_has_metadata:
+        left.attrs = {"a": [1, 2]}
+    else:
+        left.attrs = {}
+
+    if right_has_metadata:
+        right.attrs = {"a": [1, 2]}
+    else:
+        right.attrs = {}
+
+    result = left.merge(right, how="inner", on="test")
+
+    assert result.attrs == expected
diff --git a/pandas/tests/generic/test_frame.py b/pandas/tests/generic/test_frame.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d24cceeab0c0a241338b9826bd420489e0dbba
--- /dev/null
+++ b/pandas/tests/generic/test_frame.py
@@ -0,0 +1,202 @@
+from copy import deepcopy
+from operator import methodcaller
+from typing import Literal
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+class TestDataFrame:
+    @pytest.mark.parametrize("func", ["_set_axis_name", "rename_axis"])
+    def test_set_axis_name(self, func):
+        df = DataFrame([[1, 2], [3, 4]])
+
+        result = methodcaller(func, "foo")(df)
+        assert df.index.name is None
+        assert result.index.name == "foo"
+
+        result = methodcaller(func, "cols", axis=1)(df)
+        assert df.columns.name is None
+        assert result.columns.name == "cols"
+
+    @pytest.mark.parametrize("func", ["_set_axis_name", "rename_axis"])
+    def test_set_axis_name_mi(self, func):
+        df = DataFrame(
+            np.empty((3, 3)),
+            index=MultiIndex.from_tuples([("A", x) for x in list("aBc")]),
+            columns=MultiIndex.from_tuples([("C", x) for x in list("xyz")]),
+        )
+
+        level_names = ["L1", "L2"]
+
+        result = methodcaller(func, level_names)(df)
+        assert result.index.names == level_names
+        assert result.columns.names == [None, None]
+
+        result = methodcaller(func, level_names, axis=1)(df)
+        assert result.columns.names == ["L1", "L2"]
+        assert result.index.names == [None, None]
+
+    def test_nonzero_single_element(self):
+        df = DataFrame([[False, False]])
+        msg_err = "The truth value of a DataFrame is ambiguous"
+        with pytest.raises(ValueError, match=msg_err):
+            bool(df)
+
+    def test_metadata_propagation_indiv_groupby(self):
+        # groupby
+        df = DataFrame(
+            {
+                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+                "C": np.random.default_rng(2).standard_normal(8),
+                "D": np.random.default_rng(2).standard_normal(8),
+            }
+        )
+        result = df.groupby("A").sum()
+        tm.assert_metadata_equivalent(df, result)
+
+    def test_metadata_propagation_indiv_resample(self):
+        # resample
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((1000, 2)),
+            index=date_range("20130101", periods=1000, freq="s"),
+        )
+        result = df.resample("1min")
+        tm.assert_metadata_equivalent(df, result)
+
+    def test_metadata_propagation_indiv(self, monkeypatch):
+        # merging with override
+        # GH 6923
+
+        def finalize(
+            self: DataFrame,
+            other: DataFrame,
+            method: Literal["merge", "concat"] | None = None,
+            **kwargs,
+        ):
+            for name in self._metadata:
+                if method == "merge":
+                    left, right = other.input_objs
+                    value = getattr(left, name, "") + "|" + getattr(right, name, "")
+                    object.__setattr__(self, name, value)
+                elif method == "concat":
+                    value = "+".join(
+                        [
+                            getattr(o, name)
+                            for o in other.input_objs
+                            if getattr(o, name, None)
+                        ]
+                    )
+                    object.__setattr__(self, name, value)
+                else:
+                    object.__setattr__(self, name, getattr(other, name, ""))
+
+            return self
+
+        with monkeypatch.context() as m:
+            m.setattr(DataFrame, "_metadata", ["filename"])
+            m.setattr(DataFrame, "__finalize__", finalize)
+
+            df1 = DataFrame(
+                np.random.default_rng(2).integers(0, 4, (3, 2)), columns=["a", "b"]
+            )
+            df2 = DataFrame(
+                np.random.default_rng(2).integers(0, 4, (3, 2)), columns=["c", "d"]
+            )
+            DataFrame._metadata = ["filename"]
+            df1.filename = "fname1.csv"
+            df2.filename = "fname2.csv"
+
+            result = df1.merge(df2, left_on=["a"], right_on=["c"], how="inner")
+            assert result.filename == "fname1.csv|fname2.csv"
+
+            # concat
+            # GH#6927
+            df1 = DataFrame(
+                np.random.default_rng(2).integers(0, 4, (3, 2)), columns=list("ab")
+            )
+            df1.filename = "foo"
+
+            result = pd.concat([df1, df1])
+            assert result.filename == "foo+foo"
+
+    def test_set_attribute(self):
+        # Test for consistent setattr behavior when an attribute and a column
+        # have the same name (Issue #8994)
+        df = DataFrame({"x": [1, 2, 3]})
+
+        df.y = 2
+        df["y"] = [2, 4, 6]
+        df.y = 5
+
+        assert df.y == 5
+        tm.assert_series_equal(df["y"], Series([2, 4, 6], name="y"))
+
+    def test_deepcopy_empty(self):
+        # This test covers empty frame copying with non-empty column sets
+        # as reported in issue GH15370
+        empty_frame = DataFrame(data=[], index=[], columns=["A"])
+        empty_frame_copy = deepcopy(empty_frame)
+
+        tm.assert_frame_equal(empty_frame_copy, empty_frame)
+
+
+# formerly in Generic but only test DataFrame
+class TestDataFrame2:
+    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
+    def test_validate_bool_args(self, value):
+        df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+
+        msg = 'For argument "inplace" expected type bool, received type'
+        with pytest.raises(ValueError, match=msg):
+            df.copy().rename_axis(mapper={"a": "x", "b": "y"}, axis=1, inplace=value)
+
+        with pytest.raises(ValueError, match=msg):
+            df.copy().drop("a", axis=1, inplace=value)
+
+        with pytest.raises(ValueError, match=msg):
+            df.copy().fillna(value=0, inplace=value)
+
+        with pytest.raises(ValueError, match=msg):
+            df.copy().replace(to_replace=1, value=7, inplace=value)
+
+        with pytest.raises(ValueError, match=msg):
+            df.copy().interpolate(inplace=value)
+
+        with pytest.raises(ValueError, match=msg):
+            df.copy()._where(cond=df.a > 2, inplace=value)
+
+        with pytest.raises(ValueError, match=msg):
+            df.copy().mask(cond=df.a > 2, inplace=value)
+
+    def test_unexpected_keyword(self):
+        # GH8597
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 2)), columns=["jim", "joe"]
+        )
+        ca = pd.Categorical([0, 0, 2, 2, 3, np.nan])
+        ts = df["joe"].copy()
+        ts[2] = np.nan
+
+        msg = "unexpected keyword"
+        with pytest.raises(TypeError, match=msg):
+            df.drop("joe", axis=1, in_place=True)
+
+        with pytest.raises(TypeError, match=msg):
+            df.reindex([1, 0], inplace=True)
+
+        with pytest.raises(TypeError, match=msg):
+            ca.fillna(0, inplace=True)
+
+        with pytest.raises(TypeError, match=msg):
+            ts.fillna(0, in_place=True)
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee6503b6929b615163e42cef167b34e87539c001
--- /dev/null
+++ b/pandas/tests/generic/test_generic.py
@@ -0,0 +1,494 @@
+from copy import (
+    copy,
+    deepcopy,
+)
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.common import is_scalar
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+# ----------------------------------------------------------------------
+# Generic types test cases
+
+
+def construct(box, shape, value=None, dtype=None, **kwargs):
+    """
+    construct an object for the given shape
+    if value is specified use that if its a scalar
+    if value is an array, repeat it as needed
+    """
+    if isinstance(shape, int):
+        shape = tuple([shape] * box._AXIS_LEN)
+    if value is not None:
+        if is_scalar(value):
+            if value == "empty":
+                arr = None
+                dtype = np.float64
+
+                # remove the info axis
+                kwargs.pop(box._info_axis_name, None)
+            else:
+                arr = np.empty(shape, dtype=dtype)
+                arr.fill(value)
+        else:
+            fshape = np.prod(shape)
+            arr = value.ravel()
+            new_shape = fshape / arr.shape[0]
+            if fshape % arr.shape[0] != 0:
+                raise Exception("invalid value passed in construct")
+
+            arr = np.repeat(arr, new_shape).reshape(shape)
+    else:
+        arr = np.random.default_rng(2).standard_normal(shape)
+    return box(arr, dtype=dtype, **kwargs)
+
+
+class TestGeneric:
+    @pytest.mark.parametrize(
+        "func",
+        [
+            str.lower,
+            {x: x.lower() for x in list("ABCD")},
+            Series({x: x.lower() for x in list("ABCD")}),
+        ],
+    )
+    def test_rename(self, frame_or_series, func):
+        # single axis
+        idx = list("ABCD")
+
+        for axis in frame_or_series._AXIS_ORDERS:
+            kwargs = {axis: idx}
+            obj = construct(frame_or_series, 4, **kwargs)
+
+            # rename a single axis
+            result = obj.rename(**{axis: func})
+            expected = obj.copy()
+            setattr(expected, axis, list("abcd"))
+            tm.assert_equal(result, expected)
+
+    def test_get_numeric_data(self, frame_or_series):
+        n = 4
+        kwargs = {
+            frame_or_series._get_axis_name(i): list(range(n))
+            for i in range(frame_or_series._AXIS_LEN)
+        }
+
+        # get the numeric data
+        o = construct(frame_or_series, n, **kwargs)
+        result = o._get_numeric_data()
+        tm.assert_equal(result, o)
+
+        # non-inclusion
+        result = o._get_bool_data()
+        expected = construct(frame_or_series, n, value="empty", **kwargs)
+        if isinstance(o, DataFrame):
+            # preserve columns dtype
+            expected.columns = o.columns[:0]
+        tm.assert_equal(result, expected)
+
+        # get the bool data
+        arr = np.array([True, True, False, True])
+        o = construct(frame_or_series, n, value=arr, **kwargs)
+        result = o._get_numeric_data()
+        tm.assert_equal(result, o)
+
+    def test_get_bool_data_empty_preserve_index(self):
+        expected = Series([], dtype="bool")
+        result = expected._get_bool_data()
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_nonzero(self, frame_or_series):
+        # GH 4633
+        # look at the boolean/nonzero behavior for objects
+        obj = construct(frame_or_series, shape=4)
+        msg = f"The truth value of a {frame_or_series.__name__} is ambiguous"
+        with pytest.raises(ValueError, match=msg):
+            bool(obj == 0)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj == 1)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj)
+
+        obj = construct(frame_or_series, shape=4, value=1)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj == 0)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj == 1)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj)
+
+        obj = construct(frame_or_series, shape=4, value=np.nan)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj == 0)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj == 1)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj)
+
+        # empty
+        obj = construct(frame_or_series, shape=0)
+        with pytest.raises(ValueError, match=msg):
+            bool(obj)
+
+        # invalid behaviors
+
+        obj1 = construct(frame_or_series, shape=4, value=1)
+        obj2 = construct(frame_or_series, shape=4, value=1)
+
+        with pytest.raises(ValueError, match=msg):
+            if obj1:
+                pass
+
+        with pytest.raises(ValueError, match=msg):
+            obj1 and obj2
+        with pytest.raises(ValueError, match=msg):
+            obj1 or obj2
+        with pytest.raises(ValueError, match=msg):
+            not obj1
+
+    def test_frame_or_series_compound_dtypes(self, frame_or_series):
+        # see gh-5191
+        # Compound dtypes should raise NotImplementedError.
+
+        def f(dtype):
+            return construct(frame_or_series, shape=3, value=1, dtype=dtype)
+
+        msg = (
+            "compound dtypes are not implemented "
+            f"in the {frame_or_series.__name__} constructor"
+        )
+
+        with pytest.raises(NotImplementedError, match=msg):
+            f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])
+
+        # these work (though results may be unexpected)
+        f("int64")
+        f("float64")
+        f("M8[ns]")
+
+    def test_metadata_propagation(self, frame_or_series):
+        # check that the metadata matches up on the resulting ops
+
+        o = construct(frame_or_series, shape=3)
+        o.name = "foo"
+        o2 = construct(frame_or_series, shape=3)
+        o2.name = "bar"
+
+        # ----------
+        # preserving
+        # ----------
+
+        # simple ops with scalars
+        for op in ["__add__", "__sub__", "__truediv__", "__mul__"]:
+            result = getattr(o, op)(1)
+            tm.assert_metadata_equivalent(o, result)
+
+        # ops with like
+        for op in ["__add__", "__sub__", "__truediv__", "__mul__"]:
+            result = getattr(o, op)(o)
+            tm.assert_metadata_equivalent(o, result)
+
+        # simple boolean
+        for op in ["__eq__", "__le__", "__ge__"]:
+            v1 = getattr(o, op)(o)
+            tm.assert_metadata_equivalent(o, v1)
+            tm.assert_metadata_equivalent(o, v1 & v1)
+            tm.assert_metadata_equivalent(o, v1 | v1)
+
+        # combine_first
+        result = o.combine_first(o2)
+        tm.assert_metadata_equivalent(o, result)
+
+        # ---------------------------
+        # non-preserving (by default)
+        # ---------------------------
+
+        # add non-like
+        result = o + o2
+        tm.assert_metadata_equivalent(result)
+
+        # simple boolean
+        for op in ["__eq__", "__le__", "__ge__"]:
+            # this is a name matching op
+            v1 = getattr(o, op)(o)
+            v2 = getattr(o, op)(o2)
+            tm.assert_metadata_equivalent(v2)
+            tm.assert_metadata_equivalent(v1 & v2)
+            tm.assert_metadata_equivalent(v1 | v2)
+
+    def test_size_compat(self, frame_or_series):
+        # GH8846
+        # size property should be defined
+
+        o = construct(frame_or_series, shape=10)
+        assert o.size == np.prod(o.shape)
+        assert o.size == 10 ** len(o.axes)
+
+    def test_split_compat(self, frame_or_series):
+        # xref GH8846
+        o = construct(frame_or_series, shape=10)
+        assert len(np.array_split(o, 5)) == 5
+        assert len(np.array_split(o, 2)) == 2
+
+    # See gh-12301
+    def test_stat_unexpected_keyword(self, frame_or_series):
+        obj = construct(frame_or_series, 5)
+        starwars = "Star Wars"
+        errmsg = "unexpected keyword"
+
+        with pytest.raises(TypeError, match=errmsg):
+            obj.max(epic=starwars)  # stat_function
+        with pytest.raises(TypeError, match=errmsg):
+            obj.var(epic=starwars)  # stat_function_ddof
+        with pytest.raises(TypeError, match=errmsg):
+            obj.sum(epic=starwars)  # cum_function
+        with pytest.raises(TypeError, match=errmsg):
+            obj.any(epic=starwars)  # logical_function
+
+    @pytest.mark.parametrize("func", ["sum", "cumsum", "any", "var"])
+    def test_api_compat(self, func, frame_or_series):
+        # GH 12021
+        # compat for __name__, __qualname__
+
+        obj = construct(frame_or_series, 5)
+        f = getattr(obj, func)
+        assert f.__name__ == func
+        assert f.__qualname__.endswith(func)
+
+    def test_stat_non_defaults_args(self, frame_or_series):
+        obj = construct(frame_or_series, 5)
+        out = np.array([0])
+        errmsg = "the 'out' parameter is not supported"
+
+        with pytest.raises(ValueError, match=errmsg):
+            obj.max(out=out)  # stat_function
+        with pytest.raises(ValueError, match=errmsg):
+            obj.var(out=out)  # stat_function_ddof
+        with pytest.raises(ValueError, match=errmsg):
+            obj.sum(out=out)  # cum_function
+        with pytest.raises(ValueError, match=errmsg):
+            obj.any(out=out)  # logical_function
+
+    def test_truncate_out_of_bounds(self, frame_or_series):
+        # GH11382
+
+        # small
+        shape = [2000] + ([1] * (frame_or_series._AXIS_LEN - 1))
+        small = construct(frame_or_series, shape, dtype="int8", value=1)
+        tm.assert_equal(small.truncate(), small)
+        tm.assert_equal(small.truncate(before=0, after=3e3), small)
+        tm.assert_equal(small.truncate(before=-1, after=2e3), small)
+
+        # big
+        shape = [2_000_000] + ([1] * (frame_or_series._AXIS_LEN - 1))
+        big = construct(frame_or_series, shape, dtype="int8", value=1)
+        tm.assert_equal(big.truncate(), big)
+        tm.assert_equal(big.truncate(before=0, after=3e6), big)
+        tm.assert_equal(big.truncate(before=-1, after=2e6), big)
+
+    @pytest.mark.parametrize(
+        "func",
+        [copy, deepcopy, lambda x: x.copy(deep=False), lambda x: x.copy(deep=True)],
+    )
+    @pytest.mark.parametrize("shape", [0, 1, 2])
+    def test_copy_and_deepcopy(self, frame_or_series, shape, func):
+        # GH 15444
+        obj = construct(frame_or_series, shape)
+        obj_copy = func(obj)
+        assert obj_copy is not obj
+        tm.assert_equal(obj_copy, obj)
+
+    def test_stdlib_copy_shallow_copies(self, frame_or_series):
+        obj = frame_or_series(range(3))
+        obj_copy = copy(obj)
+        assert tm.shares_memory(obj, obj_copy)
+
+
+class TestNDFrame:
+    # tests that don't fit elsewhere
+
+    @pytest.mark.parametrize(
+        "ser",
+        [
+            Series(range(10), dtype=np.float64),
+            Series([str(i) for i in range(10)], dtype=object),
+        ],
+    )
+    def test_squeeze_series_noop(self, ser):
+        # noop
+        tm.assert_series_equal(ser.squeeze(), ser)
+
+    def test_squeeze_frame_noop(self):
+        # noop
+        df = DataFrame(np.eye(2))
+        tm.assert_frame_equal(df.squeeze(), df)
+
+    def test_squeeze_frame_reindex(self):
+        # squeezing
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        ).reindex(columns=["A"])
+        tm.assert_series_equal(df.squeeze(), df["A"])
+
+    def test_squeeze_0_len_dim(self):
+        # don't fail with 0 length dimensions GH11229 & GH8999
+        empty_series = Series([], name="five", dtype=np.float64)
+        empty_frame = DataFrame([empty_series])
+        tm.assert_series_equal(empty_series, empty_series.squeeze())
+        tm.assert_series_equal(empty_series, empty_frame.squeeze())
+
+    def test_squeeze_axis(self):
+        # axis argument
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((1, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=1, freq="B"),
+        ).iloc[:, :1]
+        assert df.shape == (1, 1)
+        tm.assert_series_equal(df.squeeze(axis=0), df.iloc[0])
+        tm.assert_series_equal(df.squeeze(axis="index"), df.iloc[0])
+        tm.assert_series_equal(df.squeeze(axis=1), df.iloc[:, 0])
+        tm.assert_series_equal(df.squeeze(axis="columns"), df.iloc[:, 0])
+        assert df.squeeze() == df.iloc[0, 0]
+        msg = "No axis named 2 for object type DataFrame"
+        with pytest.raises(ValueError, match=msg):
+            df.squeeze(axis=2)
+        msg = "No axis named x for object type DataFrame"
+        with pytest.raises(ValueError, match=msg):
+            df.squeeze(axis="x")
+
+    def test_squeeze_axis_len_3(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((3, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=3, freq="B"),
+        )
+        tm.assert_frame_equal(df.squeeze(axis=0), df)
+
+    def test_numpy_squeeze(self):
+        s = Series(range(2), dtype=np.float64)
+        tm.assert_series_equal(np.squeeze(s), s)
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        ).reindex(columns=["A"])
+        tm.assert_series_equal(np.squeeze(df), df["A"])
+
+    @pytest.mark.parametrize(
+        "ser",
+        [
+            Series(range(10), dtype=np.float64),
+            Series([str(i) for i in range(10)], dtype=object),
+        ],
+    )
+    def test_transpose_series(self, ser):
+        # calls implementation in pandas/core/base.py
+        tm.assert_series_equal(ser.transpose(), ser)
+
+    def test_transpose_frame(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        tm.assert_frame_equal(df.transpose().transpose(), df)
+
+    def test_numpy_transpose(self, frame_or_series):
+        obj = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        obj = tm.get_obj(obj, frame_or_series)
+
+        if frame_or_series is Series:
+            # 1D -> np.transpose is no-op
+            tm.assert_series_equal(np.transpose(obj), obj)
+
+        # round-trip preserved
+        tm.assert_equal(np.transpose(np.transpose(obj)), obj)
+
+        msg = "the 'axes' parameter is not supported"
+        with pytest.raises(ValueError, match=msg):
+            np.transpose(obj, axes=1)
+
+    @pytest.mark.parametrize(
+        "ser",
+        [
+            Series(range(10), dtype=np.float64),
+            Series([str(i) for i in range(10)], dtype=object),
+        ],
+    )
+    def test_take_series(self, ser):
+        indices = [1, 5, -2, 6, 3, -1]
+        out = ser.take(indices)
+        expected = Series(
+            data=ser.values.take(indices),
+            index=ser.index.take(indices),
+            dtype=ser.dtype,
+        )
+        tm.assert_series_equal(out, expected)
+
+    def test_take_frame(self):
+        indices = [1, 5, -2, 6, 3, -1]
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        out = df.take(indices)
+        expected = DataFrame(
+            data=df.values.take(indices, axis=0),
+            index=df.index.take(indices),
+            columns=df.columns,
+        )
+        tm.assert_frame_equal(out, expected)
+
+    def test_take_invalid_kwargs(self, frame_or_series):
+        indices = [-3, 2, 0, 1]
+
+        obj = DataFrame(range(5))
+        obj = tm.get_obj(obj, frame_or_series)
+
+        msg = r"take\(\) got an unexpected keyword argument 'foo'"
+        with pytest.raises(TypeError, match=msg):
+            obj.take(indices, foo=2)
+
+        msg = "the 'out' parameter is not supported"
+        with pytest.raises(ValueError, match=msg):
+            obj.take(indices, out=indices)
+
+        msg = "the 'mode' parameter is not supported"
+        with pytest.raises(ValueError, match=msg):
+            obj.take(indices, mode="clip")
+
+    def test_axis_classmethods(self, frame_or_series):
+        box = frame_or_series
+        obj = box(dtype=object)
+        values = box._AXIS_TO_AXIS_NUMBER.keys()
+        for v in values:
+            assert obj._get_axis_number(v) == box._get_axis_number(v)
+            assert obj._get_axis_name(v) == box._get_axis_name(v)
+            assert obj._get_block_manager_axis(v) == box._get_block_manager_axis(v)
+
+    def test_flags_identity(self, frame_or_series):
+        obj = Series([1, 2])
+        if frame_or_series is DataFrame:
+            obj = obj.to_frame()
+
+        assert obj.flags is obj.flags
+        obj2 = obj.copy()
+        assert obj2.flags is not obj.flags
diff --git a/pandas/tests/generic/test_label_or_level_utils.py b/pandas/tests/generic/test_label_or_level_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..797f3d488ab18285fd2a829075f5bd06985b25a3
--- /dev/null
+++ b/pandas/tests/generic/test_label_or_level_utils.py
@@ -0,0 +1,329 @@
+import pytest
+
+from pandas.core.dtypes.missing import array_equivalent
+
+import pandas as pd
+
+
+# Fixtures
+# ========
+@pytest.fixture
+def df():
+    """DataFrame with columns 'L1', 'L2', and 'L3'"""
+    return pd.DataFrame({"L1": [1, 2, 3], "L2": [11, 12, 13], "L3": ["A", "B", "C"]})
+
+
+@pytest.fixture(params=[[], ["L1"], ["L1", "L2"], ["L1", "L2", "L3"]])
+def df_levels(request, df):
+    """DataFrame with columns or index levels 'L1', 'L2', and 'L3'"""
+    levels = request.param
+
+    if levels:
+        df = df.set_index(levels)
+
+    return df
+
+
+@pytest.fixture
+def df_ambig(df):
+    """DataFrame with levels 'L1' and 'L2' and labels 'L1' and 'L3'"""
+    df = df.set_index(["L1", "L2"])
+
+    df["L1"] = df["L3"]
+
+    return df
+
+
+# Test is label/level reference
+# =============================
+def get_labels_levels(df_levels):
+    expected_labels = list(df_levels.columns)
+    expected_levels = [name for name in df_levels.index.names if name is not None]
+    return expected_labels, expected_levels
+
+
+def assert_label_reference(frame, labels, axis):
+    for label in labels:
+        assert frame._is_label_reference(label, axis=axis)
+        assert not frame._is_level_reference(label, axis=axis)
+        assert frame._is_label_or_level_reference(label, axis=axis)
+
+
+def assert_level_reference(frame, levels, axis):
+    for level in levels:
+        assert frame._is_level_reference(level, axis=axis)
+        assert not frame._is_label_reference(level, axis=axis)
+        assert frame._is_label_or_level_reference(level, axis=axis)
+
+
+# DataFrame
+# ---------
+def test_is_level_or_label_reference_df_simple(df_levels, axis):
+    axis = df_levels._get_axis_number(axis)
+    # Compute expected labels and levels
+    expected_labels, expected_levels = get_labels_levels(df_levels)
+
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_levels = df_levels.T
+
+    # Perform checks
+    assert_level_reference(df_levels, expected_levels, axis=axis)
+    assert_label_reference(df_levels, expected_labels, axis=axis)
+
+
+def test_is_level_reference_df_ambig(df_ambig, axis):
+    axis = df_ambig._get_axis_number(axis)
+
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_ambig = df_ambig.T
+
+    # df has both an on-axis level and off-axis label named L1
+    # Therefore L1 should reference the label, not the level
+    assert_label_reference(df_ambig, ["L1"], axis=axis)
+
+    # df has an on-axis level named L2 and it is not ambiguous
+    # Therefore L2 is a level reference
+    assert_level_reference(df_ambig, ["L2"], axis=axis)
+
+    # df has a column named L3 and it is not a level reference
+    assert_label_reference(df_ambig, ["L3"], axis=axis)
+
+
+# Series
+# ------
+def test_is_level_reference_series_simple_axis0(df):
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+    assert_level_reference(s, ["L1"], axis=0)
+    assert not s._is_level_reference("L2")
+
+    # Make series with L1 and L2 as index
+    s = df.set_index(["L1", "L2"]).L3
+    assert_level_reference(s, ["L1", "L2"], axis=0)
+    assert not s._is_level_reference("L3")
+
+
+def test_is_level_reference_series_axis1_error(df):
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+
+    with pytest.raises(ValueError, match="No axis named 1"):
+        s._is_level_reference("L1", axis=1)
+
+
+# Test _check_label_or_level_ambiguity_df
+# =======================================
+
+
+# DataFrame
+# ---------
+def test_check_label_or_level_ambiguity_df(df_ambig, axis):
+    axis = df_ambig._get_axis_number(axis)
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_ambig = df_ambig.T
+        msg = "'L1' is both a column level and an index label"
+
+    else:
+        msg = "'L1' is both an index level and a column label"
+    # df_ambig has both an on-axis level and off-axis label named L1
+    # Therefore, L1 is ambiguous.
+    with pytest.raises(ValueError, match=msg):
+        df_ambig._check_label_or_level_ambiguity("L1", axis=axis)
+
+    # df_ambig has an on-axis level named L2,, and it is not ambiguous.
+    df_ambig._check_label_or_level_ambiguity("L2", axis=axis)
+
+    # df_ambig has an off-axis label named L3, and it is not ambiguous
+    assert not df_ambig._check_label_or_level_ambiguity("L3", axis=axis)
+
+
+# Series
+# ------
+def test_check_label_or_level_ambiguity_series(df):
+    # A series has no columns and therefore references are never ambiguous
+
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+    s._check_label_or_level_ambiguity("L1", axis=0)
+    s._check_label_or_level_ambiguity("L2", axis=0)
+
+    # Make series with L1 and L2 as index
+    s = df.set_index(["L1", "L2"]).L3
+    s._check_label_or_level_ambiguity("L1", axis=0)
+    s._check_label_or_level_ambiguity("L2", axis=0)
+    s._check_label_or_level_ambiguity("L3", axis=0)
+
+
+def test_check_label_or_level_ambiguity_series_axis1_error(df):
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+
+    with pytest.raises(ValueError, match="No axis named 1"):
+        s._check_label_or_level_ambiguity("L1", axis=1)
+
+
+# Test _get_label_or_level_values
+# ===============================
+def assert_label_values(frame, labels, axis):
+    axis = frame._get_axis_number(axis)
+    for label in labels:
+        if axis == 0:
+            expected = frame[label]._values
+        else:
+            expected = frame.loc[label]._values
+
+        result = frame._get_label_or_level_values(label, axis=axis)
+        assert array_equivalent(expected, result)
+
+
+def assert_level_values(frame, levels, axis):
+    axis = frame._get_axis_number(axis)
+    for level in levels:
+        if axis == 0:
+            expected = frame.index.get_level_values(level=level)._values
+        else:
+            expected = frame.columns.get_level_values(level=level)._values
+
+        result = frame._get_label_or_level_values(level, axis=axis)
+        assert array_equivalent(expected, result)
+
+
+# DataFrame
+# ---------
+def test_get_label_or_level_values_df_simple(df_levels, axis):
+    # Compute expected labels and levels
+    expected_labels, expected_levels = get_labels_levels(df_levels)
+
+    axis = df_levels._get_axis_number(axis)
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_levels = df_levels.T
+
+    # Perform checks
+    assert_label_values(df_levels, expected_labels, axis=axis)
+    assert_level_values(df_levels, expected_levels, axis=axis)
+
+
+def test_get_label_or_level_values_df_ambig(df_ambig, axis):
+    axis = df_ambig._get_axis_number(axis)
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_ambig = df_ambig.T
+
+    # df has an on-axis level named L2, and it is not ambiguous.
+    assert_level_values(df_ambig, ["L2"], axis=axis)
+
+    # df has an off-axis label named L3, and it is not ambiguous.
+    assert_label_values(df_ambig, ["L3"], axis=axis)
+
+
+def test_get_label_or_level_values_df_duplabels(df, axis):
+    df = df.set_index(["L1"])
+    df_duplabels = pd.concat([df, df["L2"]], axis=1)
+    axis = df_duplabels._get_axis_number(axis)
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_duplabels = df_duplabels.T
+
+    # df has unambiguous level 'L1'
+    assert_level_values(df_duplabels, ["L1"], axis=axis)
+
+    # df has unique label 'L3'
+    assert_label_values(df_duplabels, ["L3"], axis=axis)
+
+    # df has duplicate labels 'L2'
+    if axis == 0:
+        expected_msg = "The column label 'L2' is not unique"
+    else:
+        expected_msg = "The index label 'L2' is not unique"
+
+    with pytest.raises(ValueError, match=expected_msg):
+        assert_label_values(df_duplabels, ["L2"], axis=axis)
+
+
+# Series
+# ------
+def test_get_label_or_level_values_series_axis0(df):
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+    assert_level_values(s, ["L1"], axis=0)
+
+    # Make series with L1 and L2 as index
+    s = df.set_index(["L1", "L2"]).L3
+    assert_level_values(s, ["L1", "L2"], axis=0)
+
+
+def test_get_label_or_level_values_series_axis1_error(df):
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+
+    with pytest.raises(ValueError, match="No axis named 1"):
+        s._get_label_or_level_values("L1", axis=1)
+
+
+# Test _drop_labels_or_levels
+# ===========================
+def assert_labels_dropped(frame, labels, axis):
+    axis = frame._get_axis_number(axis)
+    for label in labels:
+        df_dropped = frame._drop_labels_or_levels(label, axis=axis)
+
+        if axis == 0:
+            assert label in frame.columns
+            assert label not in df_dropped.columns
+        else:
+            assert label in frame.index
+            assert label not in df_dropped.index
+
+
+def assert_levels_dropped(frame, levels, axis):
+    axis = frame._get_axis_number(axis)
+    for level in levels:
+        df_dropped = frame._drop_labels_or_levels(level, axis=axis)
+
+        if axis == 0:
+            assert level in frame.index.names
+            assert level not in df_dropped.index.names
+        else:
+            assert level in frame.columns.names
+            assert level not in df_dropped.columns.names
+
+
+# DataFrame
+# ---------
+def test_drop_labels_or_levels_df(df_levels, axis):
+    # Compute expected labels and levels
+    expected_labels, expected_levels = get_labels_levels(df_levels)
+
+    axis = df_levels._get_axis_number(axis)
+    # Transpose frame if axis == 1
+    if axis == 1:
+        df_levels = df_levels.T
+
+    # Perform checks
+    assert_labels_dropped(df_levels, expected_labels, axis=axis)
+    assert_levels_dropped(df_levels, expected_levels, axis=axis)
+
+    with pytest.raises(ValueError, match="not valid labels or levels"):
+        df_levels._drop_labels_or_levels("L4", axis=axis)
+
+
+# Series
+# ------
+def test_drop_labels_or_levels_series(df):
+    # Make series with L1 as index
+    s = df.set_index("L1").L2
+    assert_levels_dropped(s, ["L1"], axis=0)
+
+    with pytest.raises(ValueError, match="not valid labels or levels"):
+        s._drop_labels_or_levels("L4", axis=0)
+
+    # Make series with L1 and L2 as index
+    s = df.set_index(["L1", "L2"]).L3
+    assert_levels_dropped(s, ["L1", "L2"], axis=0)
+
+    with pytest.raises(ValueError, match="not valid labels or levels"):
+        s._drop_labels_or_levels("L4", axis=0)
diff --git a/pandas/tests/generic/test_series.py b/pandas/tests/generic/test_series.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ea4b8be5cf91731107a41b107c28c9e137ab994
--- /dev/null
+++ b/pandas/tests/generic/test_series.py
@@ -0,0 +1,119 @@
+from operator import methodcaller
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+class TestSeries:
+    @pytest.mark.parametrize("func", ["rename_axis", "_set_axis_name"])
+    def test_set_axis_name_mi(self, func):
+        ser = Series(
+            [11, 21, 31],
+            index=MultiIndex.from_tuples(
+                [("A", x) for x in ["a", "B", "c"]], names=["l1", "l2"]
+            ),
+        )
+
+        result = methodcaller(func, ["L1", "L2"])(ser)
+        assert ser.index.name is None
+        assert ser.index.names == ["l1", "l2"]
+        assert result.index.name is None
+        assert result.index.names, ["L1", "L2"]
+
+    def test_set_axis_name_raises(self):
+        ser = Series([1])
+        msg = "No axis named 1 for object type Series"
+        with pytest.raises(ValueError, match=msg):
+            ser._set_axis_name(name="a", axis=1)
+
+    def test_get_bool_data_preserve_dtype(self):
+        ser = Series([True, False, True])
+        result = ser._get_bool_data()
+        tm.assert_series_equal(result, ser)
+
+    @pytest.mark.parametrize("data", [np.nan, pd.NaT, True, False])
+    def test_nonzero_single_element_raise_1(self, data):
+        # single item nan to raise
+        series = Series([data])
+
+        msg = "The truth value of a Series is ambiguous"
+        with pytest.raises(ValueError, match=msg):
+            bool(series)
+
+    @pytest.mark.parametrize("data", [(True, True), (False, False)])
+    def test_nonzero_multiple_element_raise(self, data):
+        # multiple bool are still an error
+        msg_err = "The truth value of a Series is ambiguous"
+        series = Series([data])
+        with pytest.raises(ValueError, match=msg_err):
+            bool(series)
+
+    @pytest.mark.parametrize("data", [1, 0, "a", 0.0])
+    def test_nonbool_single_element_raise(self, data):
+        # single non-bool are an error
+        msg_err1 = "The truth value of a Series is ambiguous"
+        series = Series([data])
+        with pytest.raises(ValueError, match=msg_err1):
+            bool(series)
+
+    def test_metadata_propagation_indiv_resample(self):
+        # resample
+        ts = Series(
+            np.random.default_rng(2).random(1000),
+            index=date_range("20130101", periods=1000, freq="s"),
+            name="foo",
+        )
+        result = ts.resample("1min").mean()
+        tm.assert_metadata_equivalent(ts, result)
+
+        result = ts.resample("1min").min()
+        tm.assert_metadata_equivalent(ts, result)
+
+        result = ts.resample("1min").apply(lambda x: x.sum())
+        tm.assert_metadata_equivalent(ts, result)
+
+    def test_metadata_propagation_indiv(self, monkeypatch):
+        # check that the metadata matches up on the resulting ops
+
+        ser = Series(range(3), range(3))
+        ser.name = "foo"
+        ser2 = Series(range(3), range(3))
+        ser2.name = "bar"
+
+        result = ser.T
+        tm.assert_metadata_equivalent(ser, result)
+
+        def finalize(self, other, method=None, **kwargs):
+            for name in self._metadata:
+                if method == "concat" and name == "filename":
+                    value = "+".join(
+                        [
+                            getattr(obj, name)
+                            for obj in other.input_objs
+                            if getattr(obj, name, None)
+                        ]
+                    )
+                    object.__setattr__(self, name, value)
+                else:
+                    object.__setattr__(self, name, getattr(other, name, None))
+
+            return self
+
+        with monkeypatch.context() as m:
+            m.setattr(Series, "_metadata", ["name", "filename"])
+            m.setattr(Series, "__finalize__", finalize)
+
+            ser.filename = "foo"
+            ser2.filename = "bar"
+
+            result = pd.concat([ser, ser2])
+            assert result.filename == "foo+bar"
+            assert result.name is None
diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aabdb6d7869a3f007fa50ad3adb56544d575077
--- /dev/null
+++ b/pandas/tests/generic/test_to_xarray.py
@@ -0,0 +1,126 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    Categorical,
+    DataFrame,
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+
+xarray = pytest.importorskip("xarray")
+
+if xarray is not None and Version(xarray.__version__) < Version("2025.1.0"):
+    pytestmark = pytest.mark.filterwarnings(
+        "ignore:Converting non-nanosecond precision:UserWarning"
+    )
+
+
+class TestDataFrameToXArray:
+    @pytest.fixture
+    def df(self):
+        return DataFrame(
+            {
+                "a": list("abcd"),
+                "b": list(range(1, 5)),
+                "c": np.arange(3, 7).astype("u1"),
+                "d": np.arange(4.0, 8.0, dtype="float64"),
+                "e": [True, False, True, False],
+                "f": Categorical(list("abcd")),
+                "g": date_range("20130101", periods=4),
+                "h": date_range("20130101", periods=4, tz="US/Eastern"),
+            }
+        )
+
+    def test_to_xarray_index_types(self, index_flat, df, request):
+        index = index_flat
+        # MultiIndex is tested in test_to_xarray_with_multiindex
+        if len(index) == 0:
+            pytest.skip("Test doesn't make sense for empty index")
+        if Version(xarray.__version__) < Version("2025.9.0"):
+            pytest.skip("Xarray bug https://github.com/pydata/xarray/issues/9661")
+
+        df.index = index[:4]
+        df.index.name = "foo"
+        df.columns.name = "bar"
+        result = df.to_xarray()
+        assert result.sizes["foo"] == 4
+        assert len(result.coords) == 1
+        assert len(result.data_vars) == 8
+        tm.assert_almost_equal(list(result.coords.keys()), ["foo"])
+        assert isinstance(result, xarray.Dataset)
+
+        # idempotency
+        # datetimes w/tz are preserved
+        # column names are lost
+        expected = df.copy()
+        expected.columns.name = None
+        tm.assert_frame_equal(result.to_dataframe(), expected)
+
+    def test_to_xarray_empty(self, df):
+        df.index.name = "foo"
+        result = df[0:0].to_xarray()
+        assert result.sizes["foo"] == 0
+        assert isinstance(result, xarray.Dataset)
+
+    def test_to_xarray_with_multiindex(self, df, using_infer_string):
+        # MultiIndex
+        df.index = MultiIndex.from_product([["a"], range(4)], names=["one", "two"])
+        result = df.to_xarray()
+        assert result.sizes["one"] == 1
+        assert result.sizes["two"] == 4
+        assert len(result.coords) == 2
+        assert len(result.data_vars) == 8
+        tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"])
+        assert isinstance(result, xarray.Dataset)
+
+        result = result.to_dataframe()
+        expected = df.copy()
+        expected["f"] = expected["f"].astype(
+            object if not using_infer_string else "str"
+        )
+        if Version(xarray.__version__) < Version("2025.1.0"):
+            expected["g"] = expected["g"].astype("M8[ns]")
+            expected["h"] = expected["h"].astype("M8[ns, US/Eastern]")
+        expected.columns.name = None
+        tm.assert_frame_equal(result, expected)
+
+
+class TestSeriesToXArray:
+    def test_to_xarray_index_types(self, index_flat, request):
+        # MultiIndex is tested in test_to_xarray_with_multiindex
+        index = index_flat
+
+        ser = Series(range(len(index)), index=index, dtype="int64")
+        ser.index.name = "foo"
+        result = ser.to_xarray()
+        repr(result)
+        assert len(result) == len(index)
+        assert len(result.coords) == 1
+        tm.assert_almost_equal(list(result.coords.keys()), ["foo"])
+        assert isinstance(result, xarray.DataArray)
+
+        # idempotency
+        tm.assert_series_equal(result.to_series(), ser)
+
+    def test_to_xarray_empty(self):
+        ser = Series([], dtype=object)
+        ser.index.name = "foo"
+        result = ser.to_xarray()
+        assert len(result) == 0
+        assert len(result.coords) == 1
+        tm.assert_almost_equal(list(result.coords.keys()), ["foo"])
+        assert isinstance(result, xarray.DataArray)
+
+    def test_to_xarray_with_multiindex(self):
+        mi = MultiIndex.from_product([["a", "b"], range(3)], names=["one", "two"])
+        ser = Series(range(6), dtype="int64", index=mi)
+        result = ser.to_xarray()
+        assert len(result) == 2
+        tm.assert_almost_equal(list(result.coords.keys()), ["one", "two"])
+        assert isinstance(result, xarray.DataArray)
+        res = result.to_series()
+        tm.assert_series_equal(res, ser)
diff --git a/pandas/tests/groupby/__init__.py b/pandas/tests/groupby/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..79046cd7ed415166e0c81ff645174015fc48eaf6
--- /dev/null
+++ b/pandas/tests/groupby/__init__.py
@@ -0,0 +1,25 @@
+def get_groupby_method_args(name, obj):
+    """
+    Get required arguments for a groupby method.
+
+    When parametrizing a test over groupby methods (e.g. "sum", "mean"),
+    it is often the case that arguments are required for certain methods.
+
+    Parameters
+    ----------
+    name: str
+        Name of the method.
+    obj: Series or DataFrame
+        pandas object that is being grouped.
+
+    Returns
+    -------
+    A tuple of required arguments for the method.
+    """
+    if name in ("nth", "take"):
+        return (0,)
+    if name == "quantile":
+        return (0.5,)
+    if name == "corrwith":
+        return (obj,)
+    return ()
diff --git a/pandas/tests/groupby/conftest.py b/pandas/tests/groupby/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..2745f7c2b8d0faf0842d23f1584e9dd079a5ef30
--- /dev/null
+++ b/pandas/tests/groupby/conftest.py
@@ -0,0 +1,166 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    date_range,
+)
+from pandas.core.groupby.base import (
+    reduction_kernels,
+    transformation_kernels,
+)
+
+
+@pytest.fixture
+def df():
+    return DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+            "C": np.random.default_rng(2).standard_normal(8),
+            "D": np.random.default_rng(2).standard_normal(8),
+        }
+    )
+
+
+@pytest.fixture
+def ts():
+    return Series(
+        np.random.default_rng(2).standard_normal(30),
+        index=date_range("2000-01-01", periods=30, freq="B"),
+    )
+
+
+@pytest.fixture
+def tsframe():
+    return DataFrame(
+        np.random.default_rng(2).standard_normal((30, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=30, freq="B"),
+    )
+
+
+@pytest.fixture
+def three_group():
+    return DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+
+
+@pytest.fixture
+def slice_test_df():
+    data = [
+        [0, "a", "a0_at_0"],
+        [1, "b", "b0_at_1"],
+        [2, "a", "a1_at_2"],
+        [3, "b", "b1_at_3"],
+        [4, "c", "c0_at_4"],
+        [5, "a", "a2_at_5"],
+        [6, "a", "a3_at_6"],
+        [7, "a", "a4_at_7"],
+    ]
+    df = DataFrame(data, columns=["Index", "Group", "Value"])
+    return df.set_index("Index")
+
+
+@pytest.fixture
+def slice_test_grouped(slice_test_df):
+    return slice_test_df.groupby("Group", as_index=False)
+
+
+@pytest.fixture(params=sorted(reduction_kernels))
+def reduction_func(request):
+    """
+    yields the string names of all groupby reduction functions, one at a time.
+    """
+    return request.param
+
+
+@pytest.fixture(params=sorted(transformation_kernels))
+def transformation_func(request):
+    """yields the string names of all groupby transformation functions."""
+    return request.param
+
+
+@pytest.fixture(params=sorted(reduction_kernels) + sorted(transformation_kernels))
+def groupby_func(request):
+    """yields both aggregation and transformation functions."""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        ("mean", {}),
+        ("var", {"ddof": 1}),
+        ("var", {"ddof": 0}),
+        ("std", {"ddof": 1}),
+        ("std", {"ddof": 0}),
+        ("sum", {}),
+        ("min", {}),
+        ("max", {}),
+        ("sum", {"min_count": 2}),
+        ("min", {"min_count": 2}),
+        ("max", {"min_count": 2}),
+    ],
+    ids=[
+        "mean",
+        "var_1",
+        "var_0",
+        "std_1",
+        "std_0",
+        "sum",
+        "min",
+        "max",
+        "sum-min_count",
+        "min-min_count",
+        "max-min_count",
+    ],
+)
+def numba_supported_reductions(request):
+    """reductions supported with engine='numba'"""
+    return request.param
diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a012f5da4aa827dbea1350d7a874ca6e0d85233
--- /dev/null
+++ b/pandas/tests/groupby/test_all_methods.py
@@ -0,0 +1,105 @@
+"""
+Tests that apply to all groupby operation methods.
+
+The only tests that should appear here are those that use the `groupby_func` fixture.
+Even if it does use that fixture, prefer a more specific test file if it available
+such as:
+
+ - test_categorical
+ - test_groupby_dropna
+ - test_groupby_subclass
+ - test_raises
+"""
+
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import DataFrame
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+
+
+def test_multiindex_group_all_columns_when_empty(groupby_func):
+    # GH 32464
+    df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
+    gb = df.groupby(["a", "b", "c"], group_keys=True)
+    method = getattr(gb, groupby_func)
+    args = get_groupby_method_args(groupby_func, df)
+    if groupby_func == "corrwith":
+        warn = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        warn_msg = ""
+    with tm.assert_produces_warning(warn, match=warn_msg):
+        result = method(*args).index
+    expected = df.index
+    tm.assert_index_equal(result, expected)
+
+
+def test_duplicate_columns(request, groupby_func, as_index):
+    # GH#50806
+    if groupby_func == "corrwith":
+        msg = "GH#50845 - corrwith fails when there are duplicate columns"
+        request.applymarker(pytest.mark.xfail(reason=msg))
+    df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
+    args = get_groupby_method_args(groupby_func, df)
+    gb = df.groupby("a", as_index=as_index)
+    result = getattr(gb, groupby_func)(*args)
+
+    expected_df = df.set_axis(["a", "b", "c"], axis=1)
+    expected_args = get_groupby_method_args(groupby_func, expected_df)
+    expected_gb = expected_df.groupby("a", as_index=as_index)
+    expected = getattr(expected_gb, groupby_func)(*expected_args)
+    if groupby_func not in ("size", "ngroup", "cumcount"):
+        expected = expected.rename(columns={"c": "b"})
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "idx",
+    [
+        pd.Index(["a", "a"], name="foo"),
+        pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
+    ],
+)
+def test_dup_labels_output_shape(groupby_func, idx):
+    if groupby_func in {"size", "ngroup", "cumcount"}:
+        pytest.skip(f"Not applicable for {groupby_func}")
+
+    df = DataFrame([[1, 1]], columns=idx)
+    grp_by = df.groupby([0])
+
+    args = get_groupby_method_args(groupby_func, df)
+    if groupby_func == "corrwith":
+        warn = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        warn_msg = ""
+    with tm.assert_produces_warning(warn, match=warn_msg):
+        result = getattr(grp_by, groupby_func)(*args)
+
+    assert result.shape == (1, 2)
+    tm.assert_index_equal(result.columns, idx)
+
+
+def test_not_c_contiguous_mask(groupby_func):
+    # https://github.com/pandas-dev/pandas/issues/61031
+    if groupby_func == "corrwith":
+        # corrwith is deprecated
+        return
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64")
+    reversed = DataFrame(
+        {"a": [2, 1, 1], "b": [5, 4, 3]}, dtype="Int64", index=[2, 1, 0]
+    )[::-1]
+    assert not reversed["b"].array._mask.flags["C_CONTIGUOUS"]
+    args = get_groupby_method_args(groupby_func, df)
+
+    gb_reversed = reversed.groupby("a")
+    result = getattr(gb_reversed, groupby_func)(*args)
+    gb = df.groupby("a")
+    expected = getattr(gb, groupby_func)(*args)
+    tm.assert_equal(result, expected)
diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..215e627abb018f04bff337056dee9c4a60312d62
--- /dev/null
+++ b/pandas/tests/groupby/test_api.py
@@ -0,0 +1,274 @@
+"""
+Tests of the groupby API, including internal consistency and with other pandas objects.
+
+Tests in this file should only check the existence, names, and arguments of groupby
+methods. It should not test the results of any groupby operation.
+"""
+
+import inspect
+
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+from pandas.core.groupby.base import (
+    groupby_other_methods,
+    reduction_kernels,
+    transformation_kernels,
+)
+from pandas.core.groupby.generic import (
+    DataFrameGroupBy,
+    SeriesGroupBy,
+)
+
+
+def test_tab_completion(multiindex_dataframe_random_data):
+    grp = multiindex_dataframe_random_data.groupby(level="second")
+    results = {v for v in dir(grp) if not v.startswith("_")}
+    expected = {
+        "A",
+        "B",
+        "C",
+        "agg",
+        "aggregate",
+        "apply",
+        "boxplot",
+        "filter",
+        "first",
+        "get_group",
+        "groups",
+        "hist",
+        "indices",
+        "last",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "ngroups",
+        "nth",
+        "ohlc",
+        "plot",
+        "prod",
+        "size",
+        "std",
+        "sum",
+        "transform",
+        "var",
+        "sem",
+        "count",
+        "nunique",
+        "head",
+        "describe",
+        "cummax",
+        "quantile",
+        "rank",
+        "cumprod",
+        "tail",
+        "resample",
+        "cummin",
+        "cumsum",
+        "cumcount",
+        "ngroup",
+        "all",
+        "shift",
+        "skew",
+        "kurt",
+        "take",
+        "pct_change",
+        "any",
+        "corr",
+        "corrwith",
+        "cov",
+        "ndim",
+        "diff",
+        "idxmax",
+        "idxmin",
+        "ffill",
+        "bfill",
+        "rolling",
+        "expanding",
+        "pipe",
+        "sample",
+        "ewm",
+        "value_counts",
+    }
+    assert results == expected
+
+
+def test_all_methods_categorized(multiindex_dataframe_random_data):
+    grp = multiindex_dataframe_random_data.groupby(
+        multiindex_dataframe_random_data.iloc[:, 0]
+    )
+    names = {_ for _ in dir(grp) if not _.startswith("_")} - set(
+        multiindex_dataframe_random_data.columns
+    )
+    new_names = set(names)
+    new_names -= reduction_kernels
+    new_names -= transformation_kernels
+    new_names -= groupby_other_methods
+
+    assert not reduction_kernels & transformation_kernels
+    assert not reduction_kernels & groupby_other_methods
+    assert not transformation_kernels & groupby_other_methods
+
+    # new public method?
+    if new_names:
+        msg = f"""
+There are uncategorized methods defined on the Grouper class:
+{new_names}.
+
+Was a new method recently added?
+
+Every public method On Grouper must appear in exactly one the
+following three lists defined in pandas.core.groupby.base:
+- `reduction_kernels`
+- `transformation_kernels`
+- `groupby_other_methods`
+see the comments in pandas/core/groupby/base.py for guidance on
+how to fix this test.
+        """
+        raise AssertionError(msg)
+
+    # removed a public method?
+    all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
+    if names != all_categorized:
+        msg = f"""
+Some methods which are supposed to be on the Grouper class
+are missing:
+{all_categorized - names}.
+
+They're still defined in one of the lists that live in pandas/core/groupby/base.py.
+If you removed a method, you should update them
+"""
+        raise AssertionError(msg)
+
+
+def test_frame_consistency(groupby_func):
+    # GH#48028
+    if groupby_func in ("first", "last"):
+        msg = "first and last don't exist for DataFrame anymore"
+        pytest.skip(reason=msg)
+
+    if groupby_func in ("cumcount", "ngroup"):
+        assert not hasattr(DataFrame, groupby_func)
+        return
+
+    frame_method = getattr(DataFrame, groupby_func)
+    gb_method = getattr(DataFrameGroupBy, groupby_func)
+    result = set(inspect.signature(gb_method).parameters)
+    if groupby_func == "size":
+        # "size" is a method on GroupBy but property on DataFrame:
+        expected = {"self"}
+    else:
+        expected = set(inspect.signature(frame_method).parameters)
+
+    # Exclude certain arguments from result and expected depending on the operation
+    # Some of these may be purposeful inconsistencies between the APIs
+    exclude_expected, exclude_result = set(), set()
+    if groupby_func in ("any", "all"):
+        exclude_expected = {"kwargs", "bool_only", "axis"}
+    elif groupby_func in ("count",):
+        exclude_expected = {"numeric_only", "axis"}
+    elif groupby_func in ("nunique",):
+        exclude_expected = {"axis"}
+    elif groupby_func in ("max", "min"):
+        exclude_expected = {"axis", "kwargs"}
+        exclude_result = {"min_count", "engine", "engine_kwargs"}
+    elif groupby_func in ("sum", "mean", "std", "var"):
+        exclude_expected = {"axis", "kwargs"}
+        exclude_result = {"engine", "engine_kwargs"}
+    elif groupby_func in ("median", "prod", "sem"):
+        exclude_expected = {"axis", "kwargs"}
+    elif groupby_func in ("bfill", "ffill"):
+        exclude_expected = {"inplace", "axis", "limit_area"}
+    elif groupby_func in ("cummax", "cummin"):
+        exclude_expected = {"axis", "skipna", "args"}
+    elif groupby_func in ("cumprod", "cumsum"):
+        exclude_expected = {"axis", "skipna"}
+    elif groupby_func in ("pct_change",):
+        exclude_expected = {"kwargs"}
+    elif groupby_func in ("rank",):
+        exclude_expected = {"numeric_only"}
+    elif groupby_func in ("quantile",):
+        exclude_expected = {"method", "axis"}
+    elif groupby_func in ["corrwith"]:
+        exclude_expected = {"min_periods"}
+    if groupby_func not in ["pct_change", "size"]:
+        exclude_expected |= {"axis"}
+
+    # Ensure excluded arguments are actually in the signatures
+    assert result & exclude_result == exclude_result
+    assert expected & exclude_expected == exclude_expected
+
+    result -= exclude_result
+    expected -= exclude_expected
+    assert result == expected
+
+
+def test_series_consistency(request, groupby_func):
+    # GH#48028
+    if groupby_func in ("first", "last"):
+        msg = "first and last don't exist for Series anymore"
+        pytest.skip(msg)
+
+    if groupby_func in ("cumcount", "corrwith", "ngroup"):
+        assert not hasattr(Series, groupby_func)
+        return
+
+    series_method = getattr(Series, groupby_func)
+    gb_method = getattr(SeriesGroupBy, groupby_func)
+    result = set(inspect.signature(gb_method).parameters)
+    if groupby_func == "size":
+        # "size" is a method on GroupBy but property on Series
+        expected = {"self"}
+    else:
+        expected = set(inspect.signature(series_method).parameters)
+
+    # Exclude certain arguments from result and expected depending on the operation
+    # Some of these may be purposeful inconsistencies between the APIs
+    exclude_expected, exclude_result = set(), set()
+    if groupby_func in ("any", "all"):
+        exclude_expected = {"kwargs", "bool_only", "axis"}
+    elif groupby_func in ("max", "min"):
+        exclude_expected = {"axis", "kwargs"}
+        exclude_result = {"min_count", "engine", "engine_kwargs"}
+    elif groupby_func in ("sum", "mean", "std", "var"):
+        exclude_expected = {"axis", "kwargs"}
+        exclude_result = {"engine", "engine_kwargs"}
+    elif groupby_func in ("median", "prod", "sem"):
+        exclude_expected = {"axis", "kwargs"}
+    elif groupby_func in ("bfill", "ffill"):
+        exclude_expected = {"inplace", "axis", "limit_area"}
+    elif groupby_func in ("cummax", "cummin"):
+        exclude_expected = {"skipna", "args"}
+        exclude_result = {"numeric_only"}
+    elif groupby_func in ("cumprod", "cumsum"):
+        exclude_expected = {"skipna"}
+        exclude_result = {"numeric_only"}
+    elif groupby_func in ("pct_change",):
+        exclude_expected = {"kwargs"}
+    elif groupby_func in ("rank",):
+        exclude_expected = {"numeric_only"}
+    elif groupby_func in ("idxmin", "idxmax"):
+        exclude_expected = {"args", "kwargs"}
+    elif groupby_func in ("quantile",):
+        exclude_result = {"numeric_only"}
+    if groupby_func not in [
+        "diff",
+        "pct_change",
+        "count",
+        "nunique",
+        "quantile",
+        "size",
+    ]:
+        exclude_expected |= {"axis"}
+
+    # Ensure excluded arguments are actually in the signatures
+    assert result & exclude_result == exclude_result
+    assert expected & exclude_expected == exclude_expected
+
+    result -= exclude_result
+    expected -= exclude_expected
+    assert result == expected
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7c9168d6003e4ea964b1b41c7059e4afe72c8cb
--- /dev/null
+++ b/pandas/tests/groupby/test_apply.py
@@ -0,0 +1,1543 @@
+from datetime import (
+    date,
+    datetime,
+)
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    bdate_range,
+)
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+
+
+def test_apply_func_that_appends_group_to_list_without_copy():
+    # GH: 17718
+
+    df = DataFrame(1, index=list(range(10)) * 10, columns=[0]).reset_index()
+    groups = []
+
+    def store(group):
+        groups.append(group)
+
+    df.groupby("index").apply(store)
+    expected_value = DataFrame({0: [1] * 10}, index=pd.RangeIndex(0, 100, 10))
+    expected_value.columns = expected_value.columns.astype(object)
+
+    tm.assert_frame_equal(groups[0], expected_value)
+
+
+def test_apply_index_date(using_infer_string):
+    # GH 5788
+    ts = [
+        "2011-05-16 00:00",
+        "2011-05-16 01:00",
+        "2011-05-16 02:00",
+        "2011-05-16 03:00",
+        "2011-05-17 02:00",
+        "2011-05-17 03:00",
+        "2011-05-17 04:00",
+        "2011-05-17 05:00",
+        "2011-05-18 02:00",
+        "2011-05-18 03:00",
+        "2011-05-18 04:00",
+        "2011-05-18 05:00",
+    ]
+    df = DataFrame(
+        {
+            "value": [
+                1.40893,
+                1.40760,
+                1.40750,
+                1.40649,
+                1.40893,
+                1.40760,
+                1.40750,
+                1.40649,
+                1.40893,
+                1.40760,
+                1.40750,
+                1.40649,
+            ],
+        },
+        index=Index(pd.to_datetime(ts), name="date_time"),
+    )
+    expected = df.groupby(df.index.date).idxmax()
+    result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_index_date_object():
+    # GH 5789
+    # don't auto coerce dates
+    ts = [
+        "2011-05-16 00:00",
+        "2011-05-16 01:00",
+        "2011-05-16 02:00",
+        "2011-05-16 03:00",
+        "2011-05-17 02:00",
+        "2011-05-17 03:00",
+        "2011-05-17 04:00",
+        "2011-05-17 05:00",
+        "2011-05-18 02:00",
+        "2011-05-18 03:00",
+        "2011-05-18 04:00",
+        "2011-05-18 05:00",
+    ]
+    df = DataFrame([row.split() for row in ts], columns=["date", "time"])
+    df["value"] = [
+        1.40893,
+        1.40760,
+        1.40750,
+        1.40649,
+        1.40893,
+        1.40760,
+        1.40750,
+        1.40649,
+        1.40893,
+        1.40760,
+        1.40750,
+        1.40649,
+    ]
+    exp_idx = Index(["2011-05-16", "2011-05-17", "2011-05-18"], name="date")
+    expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
+    result = df.groupby("date").apply(lambda x: x["time"][x["value"].idxmax()])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "df, group_names",
+    [
+        (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]),
+        (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]),
+        (DataFrame({"a": [1]}), [1]),
+        (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]),
+        (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]),
+        (
+            DataFrame(
+                {
+                    "a": list("aaabbbcccc"),
+                    "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
+                    "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8],
+                }
+            ),
+            ["a", "b", "c"],
+        ),
+        (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]),
+    ],
+    ids=[
+        "GH2936",
+        "GH7739 & GH10519",
+        "GH10519",
+        "GH2656",
+        "GH12155",
+        "GH20084",
+        "GH21417",
+    ],
+)
+def test_group_apply_once_per_group(df, group_names):
+    # GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417
+
+    # This test should ensure that a function is only evaluated
+    # once per group. Previously the function has been evaluated twice
+    # on the first group to check if the Cython index slider is safe to use
+    # This test ensures that the side effect (append to list) is only triggered
+    # once per group
+
+    names = []
+    # cannot parameterize over the functions since they need external
+    # `names` to detect side effects
+
+    def f_copy(group):
+        # this takes the fast apply path
+        names.append(group.name)
+        return group.copy()
+
+    def f_nocopy(group):
+        # this takes the slow apply path
+        names.append(group.name)
+        return group
+
+    def f_scalar(group):
+        # GH7739, GH2656
+        names.append(group.name)
+        return 0
+
+    def f_none(group):
+        # GH10519, GH12155, GH21417
+        names.append(group.name)
+
+    def f_constant_df(group):
+        # GH2936, GH20084
+        names.append(group.name)
+        return DataFrame({"a": [1], "b": [1]})
+
+    for func in [f_copy, f_nocopy, f_scalar, f_none, f_constant_df]:
+        del names[:]
+
+        df.groupby("a").apply(func)
+        assert names == group_names
+
+
+def test_group_apply_once_per_group2(capsys):
+    # GH: 31111
+    # groupby-apply need to execute len(set(group_by_columns)) times
+
+    expected = 2  # Number of times `apply` should call a function for the current test
+
+    df = DataFrame(
+        {
+            "group_by_column": [0, 0, 0, 0, 1, 1, 1, 1],
+            "test_column": ["0", "2", "4", "6", "8", "10", "12", "14"],
+        },
+        index=["0", "2", "4", "6", "8", "10", "12", "14"],
+    )
+
+    df.groupby("group_by_column", group_keys=False).apply(
+        lambda df: print("function_called")
+    )
+
+    result = capsys.readouterr().out.count("function_called")
+    # If `groupby` behaves unexpectedly, this test will break
+    assert result == expected
+
+
+def test_apply_fast_slow_identical():
+    # GH 31613
+
+    df = DataFrame({"A": [0, 0, 1], "b": range(3)})
+
+    # For simple index structures we check for fast/slow apply using
+    # an identity check on in/output
+    def slow(group):
+        return group
+
+    def fast(group):
+        return group.copy()
+
+    fast_df = df.groupby("A", group_keys=False).apply(fast)
+    slow_df = df.groupby("A", group_keys=False).apply(slow)
+    tm.assert_frame_equal(fast_df, slow_df)
+
+
+def test_apply_fast_slow_identical_index():
+    # GH#44803
+    df = DataFrame(
+        {
+            "name": ["Alice", "Bob", "Carl"],
+            "age": [20, 21, 20],
+        }
+    ).set_index("name")
+
+    grp_by_same_value = df.groupby(["age"], group_keys=False).apply(lambda group: group)
+    grp_by_copy = df.groupby(["age"], group_keys=False).apply(
+        lambda group: group.copy()
+    )
+    tm.assert_frame_equal(grp_by_same_value, grp_by_copy)
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda x: x,
+        lambda x: x[:],
+        lambda x: x.copy(deep=False),
+        lambda x: x.copy(deep=True),
+    ],
+)
+def test_groupby_apply_identity_maybecopy_index_identical(func):
+    # GH 14927
+    # Whether the function returns a copy of the input data or not should not
+    # have an impact on the index structure of the result since this is not
+    # transparent to the user
+
+    df = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
+    result = df.groupby("g", group_keys=False).apply(func)
+    tm.assert_frame_equal(result, df[["a", "b"]])
+
+
+def test_apply_with_mixed_dtype():
+    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
+    df = DataFrame(
+        {
+            "foo1": np.random.default_rng(2).standard_normal(6),
+            "foo2": ["one", "two", "two", "three", "one", "two"],
+        }
+    )
+    result = df.apply(lambda x: x, axis=1)
+    expected = df
+    tm.assert_frame_equal(result, expected)
+
+    # GH 3610 incorrect dtype conversion with as_index=False
+    df = DataFrame({"c1": [1, 2, 6, 6, 8]})
+    df["c2"] = df.c1 / 2.0
+    result1 = df.groupby("c2").mean().reset_index()
+    result2 = df.groupby("c2", as_index=False).mean()
+    tm.assert_frame_equal(result1, result2)
+
+
+def test_groupby_as_index_apply(as_index):
+    # GH #4648 and #3417
+    df = DataFrame(
+        {
+            "item_id": ["b", "b", "a", "c", "a", "b"],
+            "user_id": [1, 2, 1, 1, 3, 1],
+            "time": range(6),
+        }
+    )
+    gb = df.groupby("user_id", as_index=as_index)
+
+    expected = DataFrame(
+        {
+            "item_id": ["b", "b", "a", "a"],
+            "user_id": [1, 2, 1, 3],
+            "time": [0, 1, 2, 4],
+        },
+        index=[0, 1, 2, 4],
+    )
+    result = gb.head(2)
+    tm.assert_frame_equal(result, expected)
+
+    # apply doesn't maintain the original ordering
+    # changed in GH5610 as the as_index=False returns a MI here
+    if as_index:
+        tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
+        index = MultiIndex.from_tuples(tp, names=["user_id", None])
+    else:
+        index = Index([0, 2, 1, 4])
+    expected = DataFrame(
+        {
+            "item_id": list("baba"),
+            "time": [0, 2, 1, 4],
+        },
+        index=index,
+    )
+    result = gb.apply(lambda x: x.head(2))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_as_index_apply_str():
+    ind = Index(list("abcde"))
+    df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
+    res = df.groupby(0, as_index=False, group_keys=False).apply(lambda x: x).index
+    tm.assert_index_equal(res, ind)
+
+
+def test_apply_concat_preserve_names(three_group):
+    grouped = three_group.groupby(["A", "B"])
+
+    def desc(group):
+        result = group.describe()
+        result.index.name = "stat"
+        return result
+
+    def desc2(group):
+        result = group.describe()
+        result.index.name = "stat"
+        result = result[: len(group)]
+        # weirdo
+        return result
+
+    def desc3(group):
+        result = group.describe()
+
+        # names are different
+        result.index.name = f"stat_{len(group):d}"
+
+        result = result[: len(group)]
+        # weirdo
+        return result
+
+    result = grouped.apply(desc)
+    assert result.index.names == ("A", "B", "stat")
+
+    result2 = grouped.apply(desc2)
+    assert result2.index.names == ("A", "B", "stat")
+
+    result3 = grouped.apply(desc3)
+    assert result3.index.names == ("A", "B", None)
+
+
+def test_apply_series_to_frame():
+    def f(piece):
+        with np.errstate(invalid="ignore"):
+            logged = np.log(piece)
+        return DataFrame(
+            {"value": piece, "demeaned": piece - piece.mean(), "logged": logged}
+        )
+
+    dr = bdate_range("1/1/2000", periods=10)
+    ts = Series(np.random.default_rng(2).standard_normal(10), index=dr)
+
+    grouped = ts.groupby(lambda x: x.month, group_keys=False)
+    result = grouped.apply(f)
+
+    assert isinstance(result, DataFrame)
+    assert not hasattr(result, "name")  # GH49907
+    tm.assert_index_equal(result.index, ts.index)
+
+
+def test_apply_series_yield_constant(df):
+    result = df.groupby(["A", "B"])["C"].apply(len)
+    assert result.index.names[:2] == ("A", "B")
+
+
+def test_apply_frame_yield_constant(df):
+    # GH13568
+    result = df.groupby(["A", "B"]).apply(len)
+    assert isinstance(result, Series)
+    assert result.name is None
+
+    result = df.groupby(["A", "B"])[["C", "D"]].apply(len)
+    assert isinstance(result, Series)
+    assert result.name is None
+
+
+def test_apply_frame_to_series(df):
+    grouped = df.groupby(["A", "B"])
+    result = grouped.apply(len)
+    expected = grouped.count()["C"]
+    tm.assert_index_equal(result.index, expected.index)
+    tm.assert_numpy_array_equal(result.values, expected.values)
+
+
+def test_apply_frame_not_as_index_column_name(df):
+    # GH 35964 - path within _wrap_applied_output not hit by a test
+    grouped = df.groupby(["A", "B"], as_index=False)
+    result = grouped.apply(len)
+    expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D")
+    # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan
+    tm.assert_index_equal(result.index, expected.index)
+    tm.assert_numpy_array_equal(result.values, expected.values)
+
+
+def test_apply_frame_concat_series():
+    def trans(group):
+        return group.groupby("B")["C"].sum().sort_values().iloc[:2]
+
+    def trans2(group):
+        grouped = group.groupby(df.reindex(group.index)["B"])
+        return grouped.sum().sort_values().iloc[:2]
+
+    df = DataFrame(
+        {
+            "A": np.random.default_rng(2).integers(0, 5, 1000),
+            "B": np.random.default_rng(2).integers(0, 5, 1000),
+            "C": np.random.default_rng(2).standard_normal(1000),
+        }
+    )
+
+    result = df.groupby("A").apply(trans)
+    exp = df.groupby("A")["C"].apply(trans2)
+    tm.assert_series_equal(result, exp, check_names=False)
+    assert result.name == "C"
+
+
+def test_apply_transform(ts):
+    grouped = ts.groupby(lambda x: x.month, group_keys=False)
+    result = grouped.apply(lambda x: x * 2)
+    expected = grouped.transform(lambda x: x * 2)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_multikey_corner(tsframe):
+    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
+
+    def f(group):
+        return group.sort_values("A")[-5:]
+
+    result = grouped.apply(f)
+    for key, group in grouped:
+        tm.assert_frame_equal(result.loc[key], f(group))
+
+
+@pytest.mark.parametrize("group_keys", [True, False])
+def test_apply_chunk_view(group_keys):
+    # Low level tinkering could be unsafe, make sure not
+    df = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})
+
+    result = df.groupby("key", group_keys=group_keys).apply(lambda x: x.iloc[:2])
+    expected = df[["value"]].take([0, 1, 3, 4, 6, 7])
+    if group_keys:
+        expected.index = MultiIndex.from_arrays(
+            [[1, 1, 2, 2, 3, 3], expected.index], names=["key", None]
+        )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_no_name_column_conflict():
+    df = DataFrame(
+        {
+            "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
+            "value": range(9, -1, -1),
+        }
+    )
+
+    # it works! #2605
+    grouped = df.groupby(["name", "name2"])
+    grouped.apply(lambda x: x.sort_values("value", inplace=True))
+
+
+def test_apply_typecast_fail():
+    df = DataFrame(
+        {
+            "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+            "c": np.tile(["a", "b", "c"], 2),
+            "v": np.arange(1.0, 7.0),
+        }
+    )
+
+    def f(group):
+        v = group["v"]
+        group["v2"] = (v - v.min()) / (v.max() - v.min())
+        return group
+
+    result = df.groupby("d", group_keys=False).apply(f)
+
+    expected = df[["c", "v"]]
+    expected["v2"] = np.tile([0.0, 0.5, 1], 2)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_multiindex_fail():
+    index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
+    df = DataFrame(
+        {
+            "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
+            "c": np.tile(["a", "b", "c"], 2),
+            "v": np.arange(1.0, 7.0),
+        },
+        index=index,
+    )
+
+    def f(group):
+        v = group["v"]
+        group["v2"] = (v - v.min()) / (v.max() - v.min())
+        return group
+
+    result = df.groupby("d", group_keys=False).apply(f)
+
+    expected = df[["c", "v"]]
+    expected["v2"] = np.tile([0.0, 0.5, 1], 2)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_corner(tsframe):
+    result = tsframe.groupby(lambda x: x.year, group_keys=False).apply(lambda x: x * 2)
+    expected = tsframe * 2
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_without_copy():
+    # GH 5545
+    # returning a non-copy in an applied function fails
+
+    data = DataFrame(
+        {
+            "id_field": [100, 100, 200, 300],
+            "category": ["a", "b", "c", "c"],
+            "value": [1, 2, 3, 4],
+        }
+    )
+
+    def filt1(x):
+        if x.shape[0] == 1:
+            return x.copy()
+        else:
+            return x[x.category == "c"]
+
+    def filt2(x):
+        if x.shape[0] == 1:
+            return x
+        else:
+            return x[x.category == "c"]
+
+    expected = data.groupby("id_field").apply(filt1)
+    result = data.groupby("id_field").apply(filt2)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+def test_apply_with_duplicated_non_sorted_axis(test_series):
+    # GH 30667
+    df = DataFrame(
+        [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2]
+    )
+    if test_series:
+        ser = df.set_index("Y")["X"]
+        result = ser.groupby(level=0, group_keys=False).apply(lambda x: x)
+        expected = ser
+        tm.assert_series_equal(result, expected)
+    else:
+        result = df.groupby("Y", group_keys=False).apply(lambda x: x)
+        expected = df[["X"]]
+        tm.assert_frame_equal(result, expected)
+
+
+def test_apply_reindex_values():
+    # GH: 26209
+    # reindexing from a single column of a groupby object with duplicate indices caused
+    # a ValueError (cannot reindex from duplicate axis) in 0.24.2, the problem was
+    # solved in #30679
+    values = [1, 2, 3, 4]
+    indices = [1, 1, 2, 2]
+    df = DataFrame({"group": ["Group1", "Group2"] * 2, "value": values}, index=indices)
+    expected = Series(values, index=indices, name="value")
+
+    def reindex_helper(x):
+        return x.reindex(np.arange(x.index.min(), x.index.max() + 1))
+
+    # the following group by raised a ValueError
+    result = df.groupby("group", group_keys=False).value.apply(reindex_helper)
+    tm.assert_series_equal(expected, result)
+
+
+def test_apply_corner_cases():
+    # #535, can't use sliding iterator
+
+    N = 10
+    labels = np.random.default_rng(2).integers(0, 100, size=N)
+    df = DataFrame(
+        {
+            "key": labels,
+            "value1": np.random.default_rng(2).standard_normal(N),
+            "value2": ["foo", "bar", "baz", "qux", "a"] * (N // 5),
+        }
+    )
+
+    grouped = df.groupby("key", group_keys=False)
+
+    def f(g):
+        g["value3"] = g["value1"] * 2
+        return g
+
+    result = grouped.apply(f)
+    assert "value3" in result
+
+
+def test_apply_numeric_coercion_when_datetime():
+    # In the past, group-by/apply operations have been over-eager
+    # in converting dtypes to numeric, in the presence of datetime
+    # columns.  Various GH issues were filed, the reproductions
+    # for which are here.
+
+    # GH 15670
+    df = DataFrame(
+        {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
+    )
+    expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
+    df.Date = pd.to_datetime(df.Date)
+    result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
+    tm.assert_series_equal(result["Str"], expected["Str"])
+
+
+def test_apply_numeric_coercion_when_datetime_getitem():
+    # GH 15421
+    df = DataFrame(
+        {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
+    )
+
+    def get_B(g):
+        return g.iloc[0][["B"]]
+
+    result = df.groupby("A").apply(get_B)["B"]
+    expected = df.B
+    expected.index = df.A
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_numeric_coercion_when_datetime_with_nat():
+    # GH 14423
+    def predictions(tool):
+        out = Series(index=["p1", "p2", "useTime"], dtype=object)
+        if "step1" in list(tool.State):
+            out["p1"] = str(tool[tool.State == "step1"].Machine.values[0])
+        if "step2" in list(tool.State):
+            out["p2"] = str(tool[tool.State == "step2"].Machine.values[0])
+            out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0])
+        return out
+
+    df1 = DataFrame(
+        {
+            "Key": ["B", "B", "A", "A"],
+            "State": ["step1", "step2", "step1", "step2"],
+            "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"],
+            "Machine": ["23", "36L", "36R", "36R"],
+        }
+    )
+    df2 = df1.copy()
+    df2.oTime = pd.to_datetime(df2.oTime)
+    expected = df1.groupby("Key").apply(predictions).p1
+    result = df2.groupby("Key").apply(predictions).p1
+    tm.assert_series_equal(expected, result)
+
+
+def test_apply_aggregating_timedelta_and_datetime():
+    # Regression test for GH 15562
+    # The following groupby caused ValueErrors and IndexErrors pre 0.20.0
+
+    df = DataFrame(
+        {
+            "clientid": ["A", "B", "C"],
+            "datetime": [np.datetime64("2017-02-01 00:00:00")] * 3,
+        }
+    )
+    df["time_delta_zero"] = df.datetime - df.datetime
+    result = df.groupby("clientid").apply(
+        lambda ddf: Series(
+            {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()}
+        )
+    )
+    expected = DataFrame(
+        {
+            "clientid": ["A", "B", "C"],
+            "clientid_age": [np.timedelta64(0, "D")] * 3,
+            "date": [np.datetime64("2017-02-01 00:00:00")] * 3,
+        }
+    ).set_index("clientid")
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_groupby_datetimeindex():
+    # GH 26182
+    # groupby apply failed on dataframe with DatetimeIndex
+
+    data = [["A", 10], ["B", 20], ["B", 30], ["C", 40], ["C", 50]]
+    df = DataFrame(
+        data, columns=["Name", "Value"], index=pd.date_range("2020-09-01", "2020-09-05")
+    )
+
+    result = df.groupby("Name").sum()
+
+    expected = DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]})
+    expected.set_index("Name", inplace=True)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_time_field_bug():
+    # Test a fix for the following error related to GH issue 11324 When
+    # non-key fields in a group-by dataframe contained time-based fields
+    # that were not returned by the apply function, an exception would be
+    # raised.
+
+    df = DataFrame({"a": 1, "b": [datetime.now() for nn in range(10)]})
+
+    def func_with_no_date(batch):
+        return Series({"c": 2})
+
+    def func_with_date(batch):
+        return Series({"b": datetime(2015, 1, 1), "c": 2})
+
+    dfg_no_conversion = df.groupby(by=["a"]).apply(func_with_no_date)
+    dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1])
+    dfg_no_conversion_expected.index.name = "a"
+
+    dfg_conversion = df.groupby(by=["a"]).apply(func_with_date)
+    dfg_conversion_expected = DataFrame(
+        {"b": pd.Timestamp(2015, 1, 1), "c": 2}, index=[1]
+    )
+    dfg_conversion_expected.index.name = "a"
+
+    tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
+    tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
+
+
+def test_gb_apply_list_of_unequal_len_arrays():
+    # GH1738
+    df = DataFrame(
+        {
+            "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"],
+            "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"],
+            "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
+            "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3],
+        }
+    )
+    df = df.set_index(["group1", "group2"])
+    df_grouped = df.groupby(level=["group1", "group2"], sort=True)
+
+    def noddy(value, weight):
+        out = np.array(value * weight).repeat(3)
+        return out
+
+    # the kernel function returns arrays of unequal length
+    # pandas sniffs the first one, sees it's an array and not
+    # a list, and assumed the rest are of equal length
+    # and so tries a vstack
+
+    # don't die
+    df_grouped.apply(lambda x: noddy(x.value, x.weight))
+
+
+def test_groupby_apply_all_none():
+    # Tests to make sure no errors if apply function returns all None
+    # values. Issue 9684.
+    test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]})
+
+    def test_func(x):
+        pass
+
+    result = test_df.groupby("groups").apply(test_func)
+    expected = DataFrame(columns=["random_vars"], dtype="int64")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "in_data, out_idx, out_data",
+    [
+        [
+            {"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]},
+            [[1, 1], [0, 2]],
+            {"vars": [0, 2]},
+        ],
+        [
+            {"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]},
+            [[2, 2], [1, 3]],
+            {"vars": [1, 3]},
+        ],
+    ],
+)
+def test_groupby_apply_none_first(in_data, out_idx, out_data):
+    # GH 12824. Tests if apply returns None first.
+    test_df1 = DataFrame(in_data)
+
+    def test_func(x):
+        if x.shape[0] < 2:
+            return None
+        return x.iloc[[0, -1]]
+
+    result1 = test_df1.groupby("groups").apply(test_func)
+    index1 = MultiIndex.from_arrays(out_idx, names=["groups", None])
+    expected1 = DataFrame(out_data, index=index1)
+    tm.assert_frame_equal(result1, expected1)
+
+
+def test_groupby_apply_return_empty_chunk():
+    # GH 22221: apply filter which returns some empty groups
+    df = DataFrame({"value": [0, 1], "group": ["filled", "empty"]})
+    groups = df.groupby("group")
+    result = groups.apply(lambda group: group[group.value != 1]["value"])
+    expected = Series(
+        [0],
+        name="value",
+        index=MultiIndex.from_product(
+            [["empty", "filled"], [0]], names=["group", None]
+        ).drop("empty"),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("meth", ["apply", "transform"])
+def test_apply_with_mixed_types(meth):
+    # gh-20949
+    df = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
+    g = df.groupby("A", group_keys=False)
+
+    result = getattr(g, meth)(lambda x: x / x.sum())
+    expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_func_returns_object():
+    # GH 28652
+    df = DataFrame({"a": [1, 2]}, index=Index([1, 2]))
+    result = df.groupby("a").apply(lambda g: g.index)
+    expected = Series([Index([1]), Index([2])], index=Index([1, 2], name="a"))
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "group_column_dtlike",
+    [datetime.today(), datetime.today().date(), datetime.today().time()],
+)
+def test_apply_datetime_issue(group_column_dtlike):
+    # GH-28247
+    # groupby-apply throws an error if one of the columns in the DataFrame
+    #   is a datetime object and the column labels are different from
+    #   standard int values in range(len(num_columns))
+
+    df = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})
+    result = df.groupby("a").apply(lambda x: Series(["spam"], index=[42]))
+
+    expected = DataFrame(["spam"], Index(["foo"], dtype="str", name="a"), columns=[42])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_series_return_dataframe_groups():
+    # GH 10078
+    tdf = DataFrame(
+        {
+            "day": {
+                0: pd.Timestamp("2015-02-24 00:00:00"),
+                1: pd.Timestamp("2015-02-24 00:00:00"),
+                2: pd.Timestamp("2015-02-24 00:00:00"),
+                3: pd.Timestamp("2015-02-24 00:00:00"),
+                4: pd.Timestamp("2015-02-24 00:00:00"),
+            },
+            "userAgent": {
+                0: "some UA string",
+                1: "some UA string",
+                2: "some UA string",
+                3: "another UA string",
+                4: "some UA string",
+            },
+            "userId": {
+                0: "17661101",
+                1: "17661101",
+                2: "17661101",
+                3: "17661101",
+                4: "17661101",
+            },
+        }
+    )
+
+    def most_common_values(df):
+        return Series({c: s.value_counts().index[0] for c, s in df.items()})
+
+    result = tdf.groupby("day").apply(most_common_values)["userId"]
+    expected = Series(
+        ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId"
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("category", [False, True])
+def test_apply_multi_level_name(category):
+    # https://github.com/pandas-dev/pandas/issues/31068
+    b = [1, 2] * 5
+    if category:
+        b = pd.Categorical(b, categories=[1, 2, 3])
+        expected_index = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3], name="B")
+        expected_values = [20, 25, 0]
+    else:
+        expected_index = Index([1, 2], name="B")
+        expected_values = [20, 25]
+    expected = DataFrame(
+        {"C": expected_values, "D": expected_values}, index=expected_index
+    )
+
+    df = DataFrame(
+        {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
+    ).set_index(["A", "B"])
+    result = df.groupby("B", observed=False).apply(lambda x: x.sum())
+    tm.assert_frame_equal(result, expected)
+    assert df.index.names == ["A", "B"]
+
+
+def test_groupby_apply_datetime_result_dtypes(using_infer_string):
+    # GH 14849
+    data = DataFrame.from_records(
+        [
+            (pd.Timestamp(2016, 1, 1), "red", "dark", 1, "8"),
+            (pd.Timestamp(2015, 1, 1), "green", "stormy", 2, "9"),
+            (pd.Timestamp(2014, 1, 1), "blue", "bright", 3, "10"),
+            (pd.Timestamp(2013, 1, 1), "blue", "calm", 4, "potato"),
+        ],
+        columns=["observation", "color", "mood", "intensity", "score"],
+    )
+    result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
+    dtype = pd.StringDtype(na_value=np.nan) if using_infer_string else object
+    expected = Series(
+        [np.dtype("datetime64[us]"), dtype, np.int64, dtype],
+        index=["observation", "mood", "intensity", "score"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_apply_index_has_complex_internals(index):
+    # GH 31248
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group", group_keys=False).apply(lambda x: x)
+    tm.assert_frame_equal(result, df[["value"]])
+
+
+@pytest.mark.parametrize(
+    "function, expected_values",
+    [
+        (lambda x: x.index.to_list(), [[0, 1], [2, 3]]),
+        (lambda x: set(x.index.to_list()), [{0, 1}, {2, 3}]),
+        (lambda x: tuple(x.index.to_list()), [(0, 1), (2, 3)]),
+        (
+            lambda x: dict(enumerate(x.index.to_list())),
+            [{0: 0, 1: 1}, {0: 2, 1: 3}],
+        ),
+        (
+            lambda x: [{n: i} for (n, i) in enumerate(x.index.to_list())],
+            [[{0: 0}, {1: 1}], [{0: 2}, {1: 3}]],
+        ),
+    ],
+)
+def test_apply_function_returns_non_pandas_non_scalar(function, expected_values):
+    # GH 31441
+    df = DataFrame(["A", "A", "B", "B"], columns=["groups"])
+    result = df.groupby("groups").apply(function)
+    expected = Series(expected_values, index=Index(["A", "B"], name="groups"))
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_function_returns_numpy_array():
+    # GH 31605
+    def fct(group):
+        return group["B"].values.flatten()
+
+    df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]})
+
+    result = df.groupby("A").apply(fct)
+    expected = Series(
+        [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A")
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1])
+def test_apply_function_index_return(function):
+    # GH: 22541
+    df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"])
+    result = df.groupby("id").apply(function)
+    expected = Series(
+        [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])],
+        index=Index([1, 2, 3], name="id"),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_function_with_indexing_return_column():
+    # GH#7002, GH#41480, GH#49256
+    df = DataFrame(
+        {
+            "foo1": ["one", "two", "two", "three", "one", "two"],
+            "foo2": [1, 2, 4, 4, 5, 6],
+        }
+    )
+    result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
+    expected = DataFrame(
+        {
+            "foo1": ["one", "three", "two"],
+            "foo2": [3.0, 4.0, 4.0],
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "udf",
+    [lambda x: x.copy(), lambda x: x.copy().rename(lambda y: y + 1)],
+)
+@pytest.mark.parametrize("group_keys", [True, False])
+def test_apply_result_type(group_keys, udf):
+    # https://github.com/pandas-dev/pandas/issues/34809
+    # We'd like to control whether the group keys end up in the index
+    # regardless of whether the UDF happens to be a transform.
+    df = DataFrame({"A": ["a", "b"], "B": [1, 2]})
+    df_result = df.groupby("A", group_keys=group_keys).apply(udf)
+    series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf)
+
+    if group_keys:
+        assert df_result.index.nlevels == 2
+        assert series_result.index.nlevels == 2
+    else:
+        assert df_result.index.nlevels == 1
+        assert series_result.index.nlevels == 1
+
+
+def test_result_order_group_keys_false():
+    # GH 34998
+    # apply result order should not depend on whether index is the same or just equal
+    df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]})
+    result = df.groupby("A", group_keys=False).apply(lambda x: x)
+    expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy())
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_with_timezones_aware():
+    # GH: 27212
+    dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2
+    index_no_tz = pd.DatetimeIndex(dates)
+    index_tz = pd.DatetimeIndex(dates, tz="UTC")
+    df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz})
+    df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz})
+
+    result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy())
+    result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["y"]].copy())
+
+    tm.assert_frame_equal(result1, result2)
+
+
+def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
+    # GH #34656
+    # GH #34271
+    df = DataFrame(
+        {
+            "a": [99, 99, 99, 88, 88, 88],
+            "b": [1, 2, 3, 4, 5, 6],
+            "c": [10, 20, 30, 40, 50, 60],
+        }
+    )
+
+    expected = DataFrame(
+        {"b": [15, 6], "c": [150, 60]},
+        index=Index([88, 99], name="a"),
+    )
+
+    # Check output when no other methods are called before .apply()
+    grp = df.groupby(by="a")
+    result = grp.apply(np.sum, axis=0)
+    tm.assert_frame_equal(result, expected)
+
+    # Check output when another method is called before .apply()
+    grp = df.groupby(by="a")
+    args = get_groupby_method_args(reduction_func, df)
+    if reduction_func == "corrwith":
+        warn = Pandas4Warning
+        msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        msg = ""
+    with tm.assert_produces_warning(warn, match=msg):
+        _ = getattr(grp, reduction_func)(*args)
+    result = grp.apply(np.sum, axis=0)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
+    # GH 29617
+
+    df = DataFrame(
+        {
+            "A": ["a", "a", "a", "b"],
+            "B": [
+                date(2020, 1, 10),
+                date(2020, 1, 10),
+                date(2020, 2, 10),
+                date(2020, 2, 10),
+            ],
+            "C": [1, 2, 3, 4],
+        },
+        index=Index([100, 101, 102, 103], name="idx"),
+    )
+
+    grp = df.groupby(["A", "B"])
+    result = grp.apply(lambda x: x.head(1))
+
+    expected = df.iloc[[0, 2, 3]]
+    expected = expected.reset_index()
+    expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
+    expected = expected.drop(columns=["A", "B", "idx"])
+
+    tm.assert_frame_equal(result, expected)
+    for val in result.index.levels[1]:
+        assert type(val) is date
+
+
+def test_apply_dropna_with_indexed_same(dropna):
+    # GH 38227
+    # GH#43205
+    df = DataFrame(
+        {
+            "col": [1, 2, 3, 4, 5],
+            "group": ["a", np.nan, np.nan, "b", "b"],
+        },
+        index=list("xxyxz"),
+    )
+    result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x)
+    expected = df.dropna()[["col"]] if dropna else df[["col"]].iloc[[0, 3, 1, 2, 4]]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "as_index, expected",
+    [
+        [
+            False,
+            DataFrame(
+                [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object)
+            ),
+        ],
+        [
+            True,
+            Series(
+                [1, 1], index=MultiIndex.from_tuples([(1, 1), (2, 2)], names=["a", "b"])
+            ),
+        ],
+    ],
+)
+def test_apply_as_index_constant_lambda(as_index, expected):
+    # GH 13217
+    df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]})
+    result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1)
+    tm.assert_equal(result, expected)
+
+
+def test_sort_index_groups():
+    # GH 20420
+    df = DataFrame(
+        {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]},
+        index=range(5),
+    )
+    result = df.groupby("C").apply(lambda x: x.A.sort_index())
+    expected = Series(
+        range(1, 6),
+        index=MultiIndex.from_tuples(
+            [(1, 0), (1, 1), (1, 2), (2, 3), (2, 4)], names=["C", None]
+        ),
+        name="A",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_positional_slice_groups_datetimelike():
+    # GH 21651
+    expected = DataFrame(
+        {
+            "date": pd.date_range("2010-01-01", freq="12h", periods=5),
+            "vals": range(5),
+            "let": list("abcde"),
+        }
+    )
+    result = expected.groupby(
+        [expected.let, expected.date.dt.date], group_keys=False
+    ).apply(lambda x: x.iloc[0:])
+    tm.assert_frame_equal(result, expected[["date", "vals"]])
+
+
+def test_groupby_apply_shape_cache_safety():
+    # GH#42702 this fails if we cache_readonly Block.shape
+    df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
+    gb = df.groupby("A")
+    result = gb[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
+
+    expected = DataFrame(
+        {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_apply_to_series_name():
+    # GH52444
+    df = DataFrame.from_dict(
+        {
+            "a": ["a", "b", "a", "b"],
+            "b1": ["aa", "ac", "ac", "ad"],
+            "b2": ["aa", "aa", "aa", "ac"],
+        }
+    )
+    grp = df.groupby("a")[["b1", "b2"]]
+    result = grp.apply(lambda x: x.unstack().value_counts())
+
+    expected_idx = MultiIndex.from_arrays(
+        arrays=[["a", "a", "b", "b", "b"], ["aa", "ac", "ac", "ad", "aa"]],
+        names=["a", None],
+    )
+    expected = Series([3, 1, 2, 1, 1], index=expected_idx, name="count")
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_na(dropna):
+    # GH#28984
+    df = DataFrame(
+        {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]}
+    )
+    dfgrp = df.groupby("grp", dropna=dropna)
+    result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
+    expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_empty_string_nan_coerce_bug():
+    # GH#24903
+    result = (
+        DataFrame(
+            {
+                "a": [1, 1, 2, 2],
+                "b": ["", "", "", ""],
+                "c": pd.to_datetime([1, 2, 3, 4], unit="s"),
+            }
+        )
+        .groupby(["a", "b"])
+        .apply(lambda df: df.iloc[-1])
+    )
+    expected = DataFrame(
+        [[pd.to_datetime(2, unit="s")], [pd.to_datetime(4, unit="s")]],
+        columns=["c"],
+        index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_values", [[1, 2, 3], [1.0, 2.0, 3.0]])
+def test_apply_index_key_error_bug(index_values):
+    # GH 44310
+    result = DataFrame(
+        {
+            "a": ["aa", "a2", "a3"],
+            "b": [1, 2, 3],
+        },
+        index=Index(index_values),
+    )
+    expected = DataFrame(
+        {
+            "b_mean": [2.0, 3.0, 1.0],
+        },
+        index=Index(["a2", "a3", "aa"], name="a"),
+    )
+    result = result.groupby("a").apply(
+        lambda df: Series([df["b"].mean()], index=["b_mean"])
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arg,idx",
+    [
+        [
+            [
+                1,
+                2,
+                3,
+            ],
+            [
+                0.1,
+                0.3,
+                0.2,
+            ],
+        ],
+        [
+            [
+                1,
+                2,
+                3,
+            ],
+            [
+                0.1,
+                0.2,
+                0.3,
+            ],
+        ],
+        [
+            [
+                1,
+                4,
+                3,
+            ],
+            [
+                0.1,
+                0.4,
+                0.2,
+            ],
+        ],
+    ],
+)
+def test_apply_nonmonotonic_float_index(arg, idx):
+    # GH 34455
+    df = DataFrame({"grp": arg, "col": arg}, index=idx)
+    result = df.groupby("grp", group_keys=False).apply(lambda x: x)
+    expected = df[["col"]]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("args, kwargs", [([True], {}), ([], {"numeric_only": True})])
+def test_apply_str_with_args(df, args, kwargs):
+    # GH#46479
+    gb = df.groupby("A")
+    result = gb.apply("sum", *args, **kwargs)
+    expected = gb.sum(numeric_only=True)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("name", ["some_name", None])
+def test_result_name_when_one_group(name):
+    # GH 46369
+    ser = Series([1, 2], name=name)
+    result = ser.groupby(["a", "a"], group_keys=False).apply(lambda x: x)
+    expected = Series([1, 2], name=name)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, op",
+    [
+        ("apply", lambda gb: gb.values[-1]),
+        ("apply", lambda gb: gb["b"].iloc[0]),
+        ("agg", "skew"),
+        ("agg", "kurt"),
+        ("agg", "prod"),
+        ("agg", "sum"),
+    ],
+)
+def test_empty_df(method, op):
+    # GH 47985
+    empty_df = DataFrame({"a": [], "b": []})
+    gb = empty_df.groupby("a", group_keys=True)
+    group = gb.b
+
+    result = getattr(group, method)(op)
+    expected = Series(
+        [], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_include_groups():
+    # GH#7155
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
+    gb = df.groupby("a")
+    with pytest.raises(ValueError, match="include_groups=True is no longer allowed"):
+        gb.apply(lambda x: x.sum(), include_groups=True)
+
+
+@pytest.mark.parametrize("func, value", [(max, 2), (min, 1), (sum, 3)])
+def test_builtins_apply(func, value):
+    # GH#8155, GH#53974
+    # Builtins act as e.g. sum(group), which sums the column labels of group
+    df = DataFrame({0: [1, 1, 2], 1: [3, 4, 5], 2: [3, 4, 5]})
+    gb = df.groupby(0)
+    result = gb.apply(func)
+
+    expected = Series([value, value], index=Index([1, 2], name=0))
+    tm.assert_series_equal(result, expected)
+
+
+def test_inconsistent_return_type():
+    # GH5592
+    # inconsistent return type
+    df = DataFrame(
+        {
+            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
+            "B": Series(np.arange(7), dtype="int64"),
+            "C": pd.date_range("20130101", periods=7),
+        }
+    )
+
+    def f_0(grp):
+        return grp.iloc[0]
+
+    expected = df.groupby("A").first()[["B"]]
+    result = df.groupby("A").apply(f_0)[["B"]]
+    tm.assert_frame_equal(result, expected)
+
+    def f_1(grp):
+        if grp.name == "Tiger":
+            return None
+        return grp.iloc[0]
+
+    result = df.groupby("A").apply(f_1)[["B"]]
+    e = expected.copy()
+    e.loc["Tiger"] = np.nan
+    tm.assert_frame_equal(result, e)
+
+    def f_2(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0]
+
+    result = df.groupby("A").apply(f_2)[["B"]]
+    e = expected.copy()
+    e.loc["Pony"] = np.nan
+    tm.assert_frame_equal(result, e)
+
+    # 5592 revisited, with datetimes
+    def f_3(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0]
+
+    result = df.groupby("A").apply(f_3)[["C"]]
+    e = df.groupby("A").first()[["C"]]
+    e.loc["Pony"] = pd.NaT
+    tm.assert_frame_equal(result, e)
+
+    # scalar outputs
+    def f_4(grp):
+        if grp.name == "Pony":
+            return None
+        return grp.iloc[0].loc["C"]
+
+    result = df.groupby("A").apply(f_4)
+    e = df.groupby("A").first()["C"].copy()
+    e.loc["Pony"] = np.nan
+    e.name = None
+    tm.assert_series_equal(result, e)
+
+
+def test_nonreducer_nonstransform():
+    # GH3380, GH60619
+    # Was originally testing mutating in a UDF; now kept as an example
+    # of using apply with a nonreducer and nontransformer.
+    df = DataFrame(
+        {
+            "cat1": ["a"] * 8 + ["b"] * 6,
+            "cat2": ["c"] * 2
+            + ["d"] * 2
+            + ["e"] * 2
+            + ["f"] * 2
+            + ["c"] * 2
+            + ["d"] * 2
+            + ["e"] * 2,
+            "val": np.random.default_rng(2).integers(100, size=14),
+        }
+    )
+
+    def f(x):
+        x = x.copy()
+        x["rank"] = x.val.rank(method="min")
+        return x.groupby("cat2")["rank"].min()
+
+    expected = DataFrame(
+        {
+            "cat1": list("aaaabbb"),
+            "cat2": list("cdefcde"),
+            "rank": [3.0, 2.0, 5.0, 1.0, 2.0, 4.0, 1.0],
+        }
+    ).set_index(["cat1", "cat2"])["rank"]
+    result = df.groupby("cat1").apply(f)
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_apply_store_copy():
+    # GH40673
+    rng = np.random.default_rng(seed=42)
+
+    df = DataFrame(
+        {
+            "A": rng.normal(10, 12, size=(4,)),
+            "B": [1, 2, 1, 2],
+        }
+    )
+
+    store = {}
+
+    def addstore(x):
+        store[len(store)] = x.copy()
+
+    df.groupby("B").apply(addstore)
+
+    expected_out_0 = df.iloc[[0, 2], [0]]
+    expected_out_1 = df.iloc[[1, 3], [0]]
+
+    tm.assert_frame_equal(store[0], expected_out_0)
+    tm.assert_frame_equal(store[1], expected_out_1)
diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py
new file mode 100644
index 0000000000000000000000000000000000000000..07d52308e308ad637fad8df5802b8e79985b8127
--- /dev/null
+++ b/pandas/tests/groupby/test_bin_groupby.py
@@ -0,0 +1,67 @@
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def assert_block_lengths(x):
+    assert len(x) == len(x._mgr.blocks[0].mgr_locs)
+    return 0
+
+
+def cumsum_max(x):
+    x.cumsum().max()
+    return 0
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        cumsum_max,
+        assert_block_lengths,
+    ],
+)
+def test_mgr_locs_updated(func):
+    # https://github.com/pandas-dev/pandas/issues/31802
+    # Some operations may require creating new blocks, which requires
+    # valid mgr_locs
+    df = pd.DataFrame({"A": ["a", "a", "a"], "B": ["a", "b", "b"], "C": [1, 1, 1]})
+    result = df.groupby(["A", "B"]).agg(func)
+    expected = pd.DataFrame(
+        {"C": [0, 0]},
+        index=pd.MultiIndex.from_product([["a"], ["a", "b"]], names=["A", "B"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "binner,closed,expected",
+    [
+        (
+            [0, 3, 6, 9],
+            "left",
+            [2, 5, 6],
+        ),
+        (
+            [0, 3, 6, 9],
+            "right",
+            [3, 6, 6],
+        ),
+        ([0, 3, 6], "left", [2, 5]),
+        (
+            [0, 3, 6],
+            "right",
+            [3, 6],
+        ),
+    ],
+)
+def test_generate_bins(binner, closed, expected):
+    values = np.array([1, 2, 3, 4, 5, 6], dtype=np.int64)
+    result = lib.generate_bins_dt64(
+        values, np.array(binner, dtype=np.int64), closed=closed
+    )
+    expected = np.array(expected, dtype=np.int64)
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
new file mode 100644
index 0000000000000000000000000000000000000000..e39052e64e3072ef31edb29bb14648c427e8247c
--- /dev/null
+++ b/pandas/tests/groupby/test_categorical.py
@@ -0,0 +1,2189 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    qcut,
+)
+import pandas._testing as tm
+from pandas.api.typing import SeriesGroupBy
+from pandas.tests.groupby import get_groupby_method_args
+
+
+def cartesian_product_for_groupers(result, args, names, fill_value=np.nan):
+    """Reindex to a cartesian production for the groupers,
+    preserving the nature (Categorical) of each grouper
+    """
+
+    def f(a):
+        if isinstance(a, (CategoricalIndex, Categorical)):
+            categories = a.categories
+            a = Categorical.from_codes(
+                np.arange(len(categories)), categories=categories, ordered=a.ordered
+            )
+        return a
+
+    index = MultiIndex.from_product(map(f, args), names=names)
+    if isinstance(fill_value, dict):
+        # fill_value is a dict mapping column names to fill values
+        # -> reindex column by column (reindex itself does not support this)
+        res = {}
+        for col in result.columns:
+            res[col] = result[col].reindex(index, fill_value=fill_value[col])
+        return DataFrame(res, index=index).sort_index()
+
+    return result.reindex(index, fill_value=fill_value).sort_index()
+
+
+_results_for_groupbys_with_missing_categories = {
+    # This maps the builtin groupby functions to their expected outputs for
+    # missing categories when they are called on a categorical grouper with
+    # observed=False. Some functions are expected to return NaN, some zero.
+    # These expected values can be used across several tests (i.e. they are
+    # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
+    # hardcoded in one place.
+    "all": True,
+    "any": False,
+    "count": 0,
+    "corrwith": np.nan,
+    "first": np.nan,
+    "idxmax": np.nan,
+    "idxmin": np.nan,
+    "last": np.nan,
+    "max": np.nan,
+    "mean": np.nan,
+    "median": np.nan,
+    "min": np.nan,
+    "nth": np.nan,
+    "nunique": 0,
+    "prod": 1,
+    "quantile": np.nan,
+    "sem": np.nan,
+    "size": 0,
+    "skew": np.nan,
+    "kurt": np.nan,
+    "std": np.nan,
+    "sum": 0,
+    "var": np.nan,
+}
+
+
+def test_apply_use_categorical_name(df):
+    cats = qcut(df.C, 4)
+
+    def get_stats(group):
+        return {
+            "min": group.min(),
+            "max": group.max(),
+            "count": group.count(),
+            "mean": group.mean(),
+        }
+
+    result = df.groupby(cats, observed=False).D.apply(get_stats)
+    assert result.index.names[0] == "C"
+
+
+def test_basic():
+    cats = Categorical(
+        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+        categories=["a", "b", "c", "d"],
+        ordered=True,
+    )
+    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+    exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
+    expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
+    result = data.groupby("b", observed=False).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_basic_single_grouper():
+    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
+    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
+    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+
+    gb = df.groupby("A", observed=False)
+    exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
+    expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
+    result = gb.sum(numeric_only=True)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_basic_string(using_infer_string):
+    # GH 8623
+    x = DataFrame(
+        [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
+        columns=["person_id", "person_name"],
+    )
+    x["person_name"] = Categorical(x.person_name)
+
+    g = x.groupby(["person_id"], observed=False)
+    result = g.transform(lambda x: x)
+    tm.assert_frame_equal(result, x[["person_name"]])
+
+    result = x.drop_duplicates("person_name")
+    expected = x.iloc[[0, 1]]
+    tm.assert_frame_equal(result, expected)
+
+    def f(x):
+        return x.drop_duplicates("person_name").iloc[0]
+
+    result = g.apply(f)
+    expected = x[["person_name"]].iloc[[0, 1]]
+    expected.index = Index([1, 2], name="person_id")
+    dtype = "str" if using_infer_string else object
+    expected["person_name"] = expected["person_name"].astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_basic_monotonic():
+    # GH 9921
+    df = DataFrame({"a": [5, 15, 25]})
+    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
+
+    result = df.a.groupby(c, observed=False).transform(sum)
+    tm.assert_series_equal(result, df["a"])
+
+    tm.assert_series_equal(
+        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
+    )
+    result = df.groupby(c, observed=False).transform(sum)
+    expected = df[["a"]]
+    tm.assert_frame_equal(result, expected)
+
+    gbc = df.groupby(c, observed=False)
+    result = gbc.transform(lambda xs: np.max(xs, axis=0))
+    tm.assert_frame_equal(result, df[["a"]])
+
+    result2 = gbc.transform(lambda xs: np.max(xs, axis=0))
+    result3 = gbc.transform(max)
+    result4 = gbc.transform(np.maximum.reduce)
+    result5 = gbc.transform(lambda xs: np.maximum.reduce(xs))
+    tm.assert_frame_equal(result2, df[["a"]], check_dtype=False)
+    tm.assert_frame_equal(result3, df[["a"]], check_dtype=False)
+    tm.assert_frame_equal(result4, df[["a"]])
+    tm.assert_frame_equal(result5, df[["a"]])
+
+    # Filter
+    tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
+    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)
+
+
+def test_basic_non_monotonic():
+    df = DataFrame({"a": [5, 15, 25, -5]})
+    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
+
+    result = df.a.groupby(c, observed=False).transform(sum)
+    tm.assert_series_equal(result, df["a"])
+
+    tm.assert_series_equal(
+        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
+    )
+    result = df.groupby(c, observed=False).transform(sum)
+    expected = df[["a"]]
+    tm.assert_frame_equal(result, expected)
+
+    tm.assert_frame_equal(
+        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
+    )
+
+
+def test_basic_cut_grouping():
+    # GH 9603
+    df = DataFrame({"a": [1, 0, 0, 0]})
+    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
+    result = df.groupby(c, observed=False).apply(len)
+
+    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
+    expected = Series([1, 0, 0, 0], index=exp_index)
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+
+def test_more_basic():
+    levels = ["foo", "bar", "baz", "qux"]
+    codes = np.random.default_rng(2).integers(0, 4, size=10)
+
+    cats = Categorical.from_codes(codes, levels, ordered=True)
+
+    data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
+
+    result = data.groupby(cats, observed=False).mean()
+
+    expected = data.groupby(np.asarray(cats), observed=False).mean()
+    exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True)
+    expected = expected.reindex(exp_idx)
+
+    tm.assert_frame_equal(result, expected)
+
+    grouped = data.groupby(cats, observed=False)
+    desc_result = grouped.describe()
+
+    idx = cats.codes.argsort()
+    ord_labels = np.asarray(cats).take(idx)
+    ord_data = data.take(idx)
+
+    exp_cats = Categorical(
+        ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]
+    )
+    expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe()
+    tm.assert_frame_equal(desc_result, expected)
+
+    # GH 10460
+    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
+    exp = CategoricalIndex(expc)
+    tm.assert_index_equal(desc_result.stack().index.get_level_values(0), exp)
+    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
+    tm.assert_index_equal(desc_result.stack().index.get_level_values(1), exp)
+
+
+def test_level_get_group(observed):
+    # GH15155
+    df = DataFrame(
+        data=np.arange(2, 22, 2),
+        index=MultiIndex(
+            levels=[CategoricalIndex(["a", "b"]), range(10)],
+            codes=[[0] * 5 + [1] * 5, range(10)],
+            names=["Index1", "Index2"],
+        ),
+    )
+    g = df.groupby(level=["Index1"], observed=observed)
+
+    # expected should equal test.loc[["a"]]
+    # GH15166
+    expected = DataFrame(
+        data=np.arange(2, 12, 2),
+        index=MultiIndex(
+            levels=[CategoricalIndex(["a", "b"]), range(5)],
+            codes=[[0] * 5, range(5)],
+            names=["Index1", "Index2"],
+        ),
+    )
+    result = g.get_group(("a",))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_sorting_with_different_categoricals():
+    # GH 24271
+    df = DataFrame(
+        {
+            "group": ["A"] * 6 + ["B"] * 6,
+            "dose": ["high", "med", "low"] * 4,
+            "outcomes": np.arange(12.0),
+        }
+    )
+
+    df.dose = Categorical(df.dose, categories=["low", "med", "high"], ordered=True)
+
+    result = df.groupby("group")["dose"].value_counts()
+    result = result.sort_index(level=0, sort_remaining=True)
+    index = ["low", "med", "high", "low", "med", "high"]
+    index = Categorical(index, categories=["low", "med", "high"], ordered=True)
+    index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
+    index = MultiIndex.from_arrays(index, names=["group", "dose"])
+    expected = Series([2] * 6, index=index, name="count")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_apply(ordered):
+    # GH 10138
+
+    dense = Categorical(list("abc"), ordered=ordered)
+
+    # 'b' is in the categories but not in the list
+    missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
+    values = np.arange(len(dense))
+    df = DataFrame({"missing": missing, "dense": dense, "values": values})
+    grouped = df.groupby(["missing", "dense"], observed=True)
+
+    # missing category 'b' should still exist in the output index
+    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
+    expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])
+
+    result = grouped.apply(lambda x: np.mean(x, axis=0))
+    tm.assert_frame_equal(result, expected)
+
+    result = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+
+    result = grouped.agg(np.mean)
+    tm.assert_frame_equal(result, expected)
+
+    # but for transform we should still get back the original index
+    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
+    expected = Series(1, index=idx)
+    result = grouped.apply(lambda x: 1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_observed(observed, using_infer_string):
+    # multiple groupers, don't re-expand the output space
+    # of the grouper
+    # gh-14942 (implement)
+    # gh-10132 (back-compat)
+    # gh-8138 (back-compat)
+    # gh-8869
+
+    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
+    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
+    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+    df["C"] = ["foo", "bar"] * 2
+
+    # multiple groupers with a non-cat
+    gb = df.groupby(["A", "B", "C"], observed=observed)
+    exp_index = MultiIndex.from_arrays(
+        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
+    )
+    expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
+    result = gb.sum()
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
+        )
+
+    tm.assert_frame_equal(result, expected)
+
+    gb = df.groupby(["A", "B"], observed=observed)
+    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
+    expected = DataFrame(
+        {"values": [1, 2, 3, 4], "C": ["foo", "bar", "foo", "bar"]}, index=exp_index
+    )
+    result = gb.sum()
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected,
+            [cat1, cat2],
+            list("AB"),
+            fill_value={"values": 0, "C": ""} if using_infer_string else 0,
+        )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_observed_single_column(observed):
+    # https://github.com/pandas-dev/pandas/issues/8138
+    d = {
+        "cat": Categorical(
+            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
+        ),
+        "ints": [1, 1, 2, 2],
+        "val": [10, 20, 30, 40],
+    }
+    df = DataFrame(d)
+
+    groups_single_key = df.groupby("cat", observed=observed)
+    result = groups_single_key.mean()
+
+    exp_index = CategoricalIndex(
+        list("ab"), name="cat", categories=list("abc"), ordered=True
+    )
+    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
+    if not observed:
+        index = CategoricalIndex(
+            list("abc"), name="cat", categories=list("abc"), ordered=True
+        )
+        expected = expected.reindex(index)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_observed_two_columns(observed):
+    # https://github.com/pandas-dev/pandas/issues/8138
+    d = {
+        "cat": Categorical(
+            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
+        ),
+        "ints": [1, 1, 2, 2],
+        "val": [10, 20, 30, 40],
+    }
+    df = DataFrame(d)
+    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
+    result = groups_double_key.agg("mean")
+    expected = DataFrame(
+        {
+            "val": [10.0, 30.0, 20.0, 40.0],
+            "cat": Categorical(
+                ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
+            ),
+            "ints": [1, 2, 1, 2],
+        }
+    ).set_index(["cat", "ints"])
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
+        )
+
+    tm.assert_frame_equal(result, expected)
+
+    # GH 10132
+    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
+        c, i = key
+        result = groups_double_key.get_group(key)
+        expected = df[(df.cat == c) & (df.ints == i)]
+        tm.assert_frame_equal(result, expected)
+
+
+def test_observed_with_as_index(observed):
+    # gh-8869
+    # with as_index
+    d = {
+        "foo": [10, 8, 4, 8, 4, 1, 1],
+        "bar": [10, 20, 30, 40, 50, 60, 70],
+        "baz": ["d", "c", "e", "a", "a", "d", "c"],
+    }
+    df = DataFrame(d)
+    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
+    df["range"] = cat
+    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
+    result = groups.agg("mean")
+
+    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
+    expected = groups2.agg("mean").reset_index()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_observed_codes_remap(observed):
+    d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]}
+    df = DataFrame(d)
+    values = pd.cut(df["C1"], [1, 2, 3, 6])
+    values.name = "cat"
+    groups_double_key = df.groupby([values, "C2"], observed=observed)
+
+    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
+    expected = DataFrame(
+        {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
+    )
+    if not observed:
+        expected = cartesian_product_for_groupers(
+            expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
+        )
+
+    result = groups_double_key.agg("mean")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_observed_perf():
+    # we create a cartesian product, so this is
+    # non-performant if we don't use observed values
+    # gh-14942
+    df = DataFrame(
+        {
+            "cat": np.random.default_rng(2).integers(0, 255, size=30000),
+            "int_id": np.random.default_rng(2).integers(0, 255, size=30000),
+            "other_id": np.random.default_rng(2).integers(0, 10000, size=30000),
+            "foo": 0,
+        }
+    )
+    df["cat"] = df.cat.astype(str).astype("category")
+
+    grouped = df.groupby(["cat", "int_id", "other_id"], observed=True)
+    result = grouped.count()
+    assert result.index.levels[0].nunique() == df.cat.nunique()
+    assert result.index.levels[1].nunique() == df.int_id.nunique()
+    assert result.index.levels[2].nunique() == df.other_id.nunique()
+
+
+def test_observed_groups(observed):
+    # gh-20583
+    # test that we have the appropriate groups
+
+    cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"])
+    df = DataFrame({"cat": cat, "vals": [1, 2, 3]})
+    g = df.groupby("cat", observed=observed)
+
+    result = g.groups
+    if observed:
+        expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")}
+    else:
+        expected = {
+            "a": Index([0, 2], dtype="int64"),
+            "b": Index([], dtype="int64"),
+            "c": Index([1], dtype="int64"),
+        }
+
+    tm.assert_dict_equal(result, expected)
+
+
+def test_groups_na_category(dropna, observed):
+    # https://github.com/pandas-dev/pandas/issues/61356
+    df = DataFrame(
+        {"cat": Categorical(["a", np.nan, "a"], categories=list("adb"))},
+        index=list("xyz"),
+    )
+    g = df.groupby("cat", observed=observed, dropna=dropna)
+
+    result = g.groups
+    expected = {"a": Index(["x", "z"])}
+    if not dropna:
+        expected |= {np.nan: Index(["y"])}
+    if not observed:
+        expected |= {"b": Index([]), "d": Index([])}
+    tm.assert_dict_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "keys, expected_values, expected_index_levels",
+    [
+        ("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
+        (
+            ["a", "b"],
+            [7, 8, 0, 0, 0, 9, 0, 0, 0],
+            [CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
+        ),
+        (
+            ["a", "a2"],
+            [15, 0, 0, 0, 9, 0, 0, 0, 0],
+            [
+                CategoricalIndex([1, 2, 3], name="a"),
+                CategoricalIndex([1, 2, 3], name="a"),
+            ],
+        ),
+    ],
+)
+@pytest.mark.parametrize("test_series", [True, False])
+def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
+    # GH#49354 - ensure unobserved cats occur when grouping by index levels
+    df = DataFrame(
+        {
+            "a": Categorical([1, 1, 2], categories=[1, 2, 3]),
+            "a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
+            "b": [4, 5, 6],
+            "c": [7, 8, 9],
+        }
+    ).set_index(["a", "a2"])
+    if "b" not in keys:
+        # Only keep b when it is used for grouping for consistent columns in the result
+        df = df.drop(columns="b")
+
+    gb = df.groupby(keys, observed=False)
+    if test_series:
+        gb = gb["c"]
+    result = gb.sum()
+
+    if len(keys) == 1:
+        index = expected_index_levels
+    else:
+        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
+        index = MultiIndex(
+            expected_index_levels,
+            codes=codes,
+            names=keys,
+        )
+    expected = DataFrame({"c": expected_values}, index=index)
+    if test_series:
+        expected = expected["c"]
+    tm.assert_equal(result, expected)
+
+
+def test_observed_groups_with_nan(observed):
+    # GH 24740
+    df = DataFrame(
+        {
+            "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]),
+            "vals": [1, 2, 3],
+        }
+    )
+    g = df.groupby("cat", observed=observed)
+    result = g.groups
+    if observed:
+        expected = {"a": Index([0, 2], dtype="int64")}
+    else:
+        expected = {
+            "a": Index([0, 2], dtype="int64"),
+            "b": Index([], dtype="int64"),
+            "d": Index([], dtype="int64"),
+        }
+    tm.assert_dict_equal(result, expected)
+
+
+def test_observed_nth():
+    # GH 26385
+    cat = Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"])
+    ser = Series([1, 2, 3])
+    df = DataFrame({"cat": cat, "ser": ser})
+
+    result = df.groupby("cat", observed=False)["ser"].nth(0)
+    expected = df["ser"].iloc[[0]]
+    tm.assert_series_equal(result, expected)
+
+
+def test_dataframe_categorical_with_nan(observed):
+    # GH 21151
+    s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
+    s2 = Series([1, 2, 3, 4])
+    df = DataFrame({"s1": s1, "s2": s2})
+    result = df.groupby("s1", observed=observed).first().reset_index()
+    if observed:
+        expected = DataFrame(
+            {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
+        )
+    else:
+        expected = DataFrame(
+            {
+                "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]),
+                "s2": [2, np.nan, np.nan],
+            }
+        )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
+    # GH 25871: Fix groupby sorting on ordered Categoricals
+    # GH 25167: Groupby with observed=True doesn't sort
+
+    # Build a dataframe with cat having one unobserved category ('missing'),
+    # and a Series with identical values
+    label = Categorical(
+        ["d", "a", "b", "a", "d", "b"],
+        categories=["a", "b", "missing", "d"],
+        ordered=ordered,
+    )
+    val = Series(["d", "a", "b", "a", "d", "b"])
+    df = DataFrame({"label": label, "val": val})
+
+    # aggregate on the Categorical
+    result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first")
+
+    # If ordering works, we expect index labels equal to aggregation results,
+    # except for 'observed=False': label 'missing' has aggregation None
+    label = Series(result.index.array, dtype="object")
+    aggr = Series(result.array)
+    if not observed:
+        aggr[aggr.isna()] = "missing"
+    if not all(label == aggr):
+        msg = (
+            "Labels and aggregation results not consistently sorted\n"
+            f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
+            f"Result:\n{result}"
+        )
+        pytest.fail(msg)
+
+
+def test_datetime():
+    # GH9049: ensure backward compatibility
+    levels = pd.date_range("2014-01-01", periods=4)
+    codes = np.random.default_rng(2).integers(0, 4, size=10)
+
+    cats = Categorical.from_codes(codes, levels, ordered=True)
+
+    data = DataFrame(np.random.default_rng(2).standard_normal((10, 4)))
+    result = data.groupby(cats, observed=False).mean()
+
+    expected = data.groupby(np.asarray(cats), observed=False).mean()
+    expected = expected.reindex(levels)
+    expected.index = CategoricalIndex(
+        expected.index, categories=expected.index, ordered=True
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+    grouped = data.groupby(cats, observed=False)
+    desc_result = grouped.describe()
+
+    idx = cats.codes.argsort()
+    ord_labels = cats.take(idx)
+    ord_data = data.take(idx)
+    expected = ord_data.groupby(ord_labels, observed=False).describe()
+    tm.assert_frame_equal(desc_result, expected)
+    tm.assert_index_equal(desc_result.index, expected.index)
+    tm.assert_index_equal(
+        desc_result.index.get_level_values(0), expected.index.get_level_values(0)
+    )
+
+    # GH 10460
+    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
+    exp = CategoricalIndex(expc)
+    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
+    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
+    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
+
+
+def test_categorical_index():
+    s = np.random.default_rng(2)
+    levels = ["foo", "bar", "baz", "qux"]
+    codes = s.integers(0, 4, size=20)
+    cats = Categorical.from_codes(codes, levels, ordered=True)
+    df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd"))
+    df["cats"] = cats
+
+    # with a cat index
+    result = df.set_index("cats").groupby(level=0, observed=False).sum()
+    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
+    expected.index = CategoricalIndex(
+        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # with a cat column, should produce a cat index
+    result = df.groupby("cats", observed=False).sum()
+    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
+    expected.index = CategoricalIndex(
+        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_describe_categorical_columns():
+    # GH 11558
+    cats = CategoricalIndex(
+        ["qux", "foo", "baz", "bar"],
+        categories=["foo", "bar", "baz", "qux"],
+        ordered=True,
+    )
+    df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)), columns=cats)
+    result = df.groupby([1, 2, 3, 4] * 5).describe()
+
+    tm.assert_index_equal(result.stack().columns, cats)
+    tm.assert_categorical_equal(result.stack().columns.values, cats.values)
+
+
+def test_unstack_categorical():
+    # GH11558 (example is taken from the original issue)
+    df = DataFrame(
+        {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2}
+    )
+    df["medium"] = df["medium"].astype("category")
+
+    gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack()
+    result = gcat.describe()
+
+    exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium")
+    tm.assert_index_equal(result.columns, exp_columns)
+    tm.assert_categorical_equal(result.columns.values, exp_columns.values)
+
+    result = gcat["A"] + gcat["B"]
+    expected = Series([6, 4], index=Index(["X", "Y"], name="artist"))
+    tm.assert_series_equal(result, expected)
+
+
+def test_bins_unequal_len():
+    # GH3011
+    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
+    bins = pd.cut(series.dropna().values, 4)
+
+    # len(bins) != len(series) here
+    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
+        series.groupby(bins).mean()
+
+
+@pytest.mark.parametrize(
+    ["series", "data"],
+    [
+        # Group a series with length and index equal to those of the grouper.
+        (Series(range(4)), {"A": [0, 3], "B": [1, 2]}),
+        # Group a series with length equal to that of the grouper and index unequal to
+        # that of the grouper.
+        (Series(range(4)).rename(lambda idx: idx + 1), {"A": [2], "B": [0, 1]}),
+        # GH44179: Group a series with length unequal to that of the grouper.
+        (Series(range(7)), {"A": [0, 3], "B": [1, 2]}),
+    ],
+)
+def test_categorical_series(series, data):
+    # Group the given series by a series with categorical data type such that group A
+    # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
+    # the given data.
+    groupby = series.groupby(Series(list("ABBA"), dtype="category"), observed=False)
+    result = groupby.aggregate(list)
+    expected = Series(data, index=CategoricalIndex(data.keys()))
+    tm.assert_series_equal(result, expected)
+
+
+def test_as_index():
+    # GH13204
+    df = DataFrame(
+        {
+            "cat": Categorical([1, 2, 2], [1, 2, 3]),
+            "A": [10, 11, 11],
+            "B": [101, 102, 103],
+        }
+    )
+    result = df.groupby(["cat", "A"], as_index=False, observed=True).sum()
+    expected = DataFrame(
+        {
+            "cat": Categorical([1, 2], categories=df.cat.cat.categories),
+            "A": [10, 11],
+            "B": [101, 205],
+        },
+        columns=["cat", "A", "B"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # function grouper
+    f = lambda r: df.loc[r, "A"]
+    result = df.groupby(["cat", f], as_index=False, observed=True).sum()
+    expected = DataFrame(
+        {
+            "cat": Categorical([1, 2], categories=df.cat.cat.categories),
+            "level_1": [10, 11],
+            "A": [10, 22],
+            "B": [101, 205],
+        },
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # another not in-axis grouper (conflicting names in index)
+    s = Series(["a", "b", "b"], name="cat")
+    result = df.groupby(["cat", s], as_index=False, observed=True).sum()
+    expected = DataFrame(
+        {
+            "cat": ["a", "b"],
+            "A": [10, 22],
+            "B": [101, 205],
+        },
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # is original index dropped?
+    group_columns = ["cat", "A"]
+    expected = DataFrame(
+        {
+            "cat": Categorical([1, 2], categories=df.cat.cat.categories),
+            "A": [10, 11],
+            "B": [101, 205],
+        },
+        columns=["cat", "A", "B"],
+    )
+
+    for name in [None, "X", "B"]:
+        df.index = Index(list("abc"), name=name)
+        result = df.groupby(group_columns, as_index=False, observed=True).sum()
+
+        tm.assert_frame_equal(result, expected)
+
+
+def test_preserve_categories():
+    # GH-13179
+    categories = list("abc")
+
+    # ordered=True
+    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
+    sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
+    nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
+    tm.assert_index_equal(
+        df.groupby("A", sort=True, observed=False).first().index, sort_index
+    )
+    # GH#42482 - don't sort result when sort=False, even when ordered=True
+    tm.assert_index_equal(
+        df.groupby("A", sort=False, observed=False).first().index, nosort_index
+    )
+
+
+def test_preserve_categories_ordered_false():
+    # GH-13179
+    categories = list("abc")
+    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
+    sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
+    # GH#48749 - don't change order of categories
+    # GH#42482 - don't sort result when sort=False, even when ordered=True
+    nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
+    tm.assert_index_equal(
+        df.groupby("A", sort=True, observed=False).first().index, sort_index
+    )
+    tm.assert_index_equal(
+        df.groupby("A", sort=False, observed=False).first().index, nosort_index
+    )
+
+
+@pytest.mark.parametrize("col", ["C1", "C2"])
+def test_preserve_categorical_dtype(col):
+    # GH13743, GH13854
+    df = DataFrame(
+        {
+            "A": [1, 2, 1, 1, 2],
+            "B": [10, 16, 22, 28, 34],
+            "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
+            "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
+        }
+    )
+    # single grouper
+    exp_full = DataFrame(
+        {
+            "A": [2.0, 1.0, np.nan],
+            "B": [25.0, 20.0, np.nan],
+            "C1": Categorical(list("bac"), categories=list("bac"), ordered=False),
+            "C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
+        }
+    )
+    result1 = df.groupby(by=col, as_index=False, observed=False).mean(numeric_only=True)
+    result2 = (
+        df.groupby(by=col, as_index=True, observed=False)
+        .mean(numeric_only=True)
+        .reset_index()
+    )
+    expected = exp_full.reindex(columns=result1.columns)
+    tm.assert_frame_equal(result1, expected)
+    tm.assert_frame_equal(result2, expected)
+
+
+@pytest.mark.parametrize(
+    "func, values",
+    [
+        ("first", ["second", "first"]),
+        ("last", ["fourth", "third"]),
+        ("min", ["fourth", "first"]),
+        ("max", ["second", "third"]),
+    ],
+)
+def test_preserve_on_ordered_ops(func, values):
+    # gh-18502
+    # preserve the categoricals on ops
+    c = Categorical(["first", "second", "third", "fourth"], ordered=True)
+    df = DataFrame({"payload": [-1, -2, -1, -2], "col": c})
+    g = df.groupby("payload")
+    result = getattr(g, func)()
+    expected = DataFrame(
+        {"payload": [-2, -1], "col": Series(values, dtype=c.dtype)}
+    ).set_index("payload")
+    tm.assert_frame_equal(result, expected)
+
+    # we should also preserve categorical for SeriesGroupBy
+    sgb = df.groupby("payload")["col"]
+    result = getattr(sgb, func)()
+    expected = expected["col"]
+    tm.assert_series_equal(result, expected)
+
+
+def test_categorical_no_compress():
+    data = Series(np.random.default_rng(2).standard_normal(9))
+
+    codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
+    cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
+
+    result = data.groupby(cats, observed=False).mean()
+    exp = data.groupby(codes, observed=False).mean()
+
+    exp.index = CategoricalIndex(
+        exp.index, categories=cats.categories, ordered=cats.ordered
+    )
+    tm.assert_series_equal(result, exp)
+
+    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
+    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
+
+    result = data.groupby(cats, observed=False).mean()
+    exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
+    exp.index = CategoricalIndex(
+        exp.index, categories=cats.categories, ordered=cats.ordered
+    )
+    tm.assert_series_equal(result, exp)
+
+
+def test_categorical_no_compress_string():
+    cats = Categorical(
+        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+        categories=["a", "b", "c", "d"],
+        ordered=True,
+    )
+    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
+
+    result = data.groupby("b", observed=False).mean()
+    result = result["a"].values
+    exp = np.array([1, 2, 4, np.nan])
+    tm.assert_numpy_array_equal(result, exp)
+
+
+def test_groupby_empty_with_category():
+    # GH-9614
+    # test fix for when group by on None resulted in
+    # coercion of dtype categorical -> float
+    df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])})
+    result = df.groupby("A").first()["B"]
+    expected = Series(
+        Categorical([], categories=["test", "train"]),
+        index=Series([], dtype="object", name="A"),
+        name="B",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_sort():
+    # https://stackoverflow.com/questions/23814368/sorting-pandas-
+    #        categorical-labels-after-groupby
+    # This should result in a properly sorted Series so that the plot
+    # has a sorted x axis
+    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
+
+    df = DataFrame({"value": np.random.default_rng(2).integers(0, 10000, 10)})
+    labels = [f"{i} - {i + 499}" for i in range(0, 10000, 500)]
+    cat_labels = Categorical(labels, labels)
+
+    df = df.sort_values(by=["value"], ascending=True)
+    df["value_group"] = pd.cut(
+        df.value, range(0, 10500, 500), right=False, labels=cat_labels
+    )
+
+    res = df.groupby(["value_group"], observed=False)["value_group"].count()
+    exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
+    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
+    tm.assert_series_equal(res, exp)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_sort2(sort, ordered):
+    # dataframe groupby sort was being ignored # GH 8868
+    # GH#48749 - don't change order of categories
+    # GH#42482 - don't sort result when sort=False, even when ordered=True
+    df = DataFrame(
+        [
+            ["(7.5, 10]", 10, 10],
+            ["(7.5, 10]", 8, 20],
+            ["(2.5, 5]", 5, 30],
+            ["(5, 7.5]", 6, 40],
+            ["(2.5, 5]", 4, 50],
+            ["(0, 2.5]", 1, 60],
+            ["(5, 7.5]", 7, 70],
+        ],
+        columns=["range", "foo", "bar"],
+    )
+    df["range"] = Categorical(df["range"], ordered=ordered)
+    result = df.groupby("range", sort=sort, observed=False).first()
+
+    if sort:
+        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
+        index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
+    else:
+        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
+        index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
+    expected = DataFrame(
+        data_values,
+        columns=["foo", "bar"],
+        index=CategoricalIndex(index_values, name="range", ordered=ordered),
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_sort_datetimelike(sort, ordered):
+    # GH10505
+    # GH#42482 - don't sort result when sort=False, even when ordered=True
+
+    # use same data as test_groupby_sort_categorical, which category is
+    # corresponding to datetime.month
+    df = DataFrame(
+        {
+            "dt": [
+                datetime(2011, 7, 1),
+                datetime(2011, 7, 1),
+                datetime(2011, 2, 1),
+                datetime(2011, 5, 1),
+                datetime(2011, 2, 1),
+                datetime(2011, 1, 1),
+                datetime(2011, 5, 1),
+            ],
+            "foo": [10, 8, 5, 6, 4, 1, 7],
+            "bar": [10, 20, 30, 40, 50, 60, 70],
+        },
+        columns=["dt", "foo", "bar"],
+    )
+
+    # ordered=True
+    df["dt"] = Categorical(df["dt"], ordered=ordered)
+    if sort:
+        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
+        index_values = [
+            datetime(2011, 1, 1),
+            datetime(2011, 2, 1),
+            datetime(2011, 5, 1),
+            datetime(2011, 7, 1),
+        ]
+    else:
+        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
+        index_values = [
+            datetime(2011, 7, 1),
+            datetime(2011, 2, 1),
+            datetime(2011, 5, 1),
+            datetime(2011, 1, 1),
+        ]
+    expected = DataFrame(
+        data_values,
+        columns=["foo", "bar"],
+        index=CategoricalIndex(index_values, name="dt", ordered=ordered),
+    )
+    result = df.groupby("dt", sort=sort, observed=False).first()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_empty_sum():
+    # https://github.com/pandas-dev/pandas/issues/18678
+    df = DataFrame(
+        {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
+    )
+    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")
+
+    # 0 by default
+    result = df.groupby("A", observed=False).B.sum()
+    expected = Series([3, 1, 0], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+    # min_count=0
+    result = df.groupby("A", observed=False).B.sum(min_count=0)
+    expected = Series([3, 1, 0], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+    # min_count=1
+    result = df.groupby("A", observed=False).B.sum(min_count=1)
+    expected = Series([3, 1, np.nan], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+    # min_count>1
+    result = df.groupby("A", observed=False).B.sum(min_count=2)
+    expected = Series([3, np.nan, np.nan], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+
+def test_empty_prod():
+    # https://github.com/pandas-dev/pandas/issues/18678
+    df = DataFrame(
+        {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
+    )
+
+    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")
+
+    # 1 by default
+    result = df.groupby("A", observed=False).B.prod()
+    expected = Series([2, 1, 1], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+    # min_count=0
+    result = df.groupby("A", observed=False).B.prod(min_count=0)
+    expected = Series([2, 1, 1], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+    # min_count=1
+    result = df.groupby("A", observed=False).B.prod(min_count=1)
+    expected = Series([2, 1, np.nan], expected_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_multiindex_categorical_datetime():
+    # https://github.com/pandas-dev/pandas/issues/21390
+
+    df = DataFrame(
+        {
+            "key1": Categorical(list("abcbabcba")),
+            "key2": Categorical(
+                list(pd.date_range("2018-06-01 00", freq="1min", periods=3)) * 3
+            ),
+            "values": np.arange(9),
+        }
+    )
+    result = df.groupby(["key1", "key2"], observed=False).mean()
+
+    idx = MultiIndex.from_product(
+        [
+            Categorical(["a", "b", "c"]),
+            Categorical(pd.date_range("2018-06-01 00", freq="1min", periods=3)),
+        ],
+        names=["key1", "key2"],
+    )
+    expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "as_index, expected",
+    [
+        (
+            True,
+            Series(
+                index=MultiIndex.from_arrays(
+                    [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"]
+                ),
+                data=[1, 2, 3],
+                name="x",
+            ),
+        ),
+        (
+            False,
+            DataFrame(
+                {
+                    "a": Series([1, 1, 2], dtype="category"),
+                    "b": [1, 2, 2],
+                    "x": [1, 2, 3],
+                }
+            ),
+        ),
+    ],
+)
+def test_groupby_agg_observed_true_single_column(as_index, expected):
+    # GH-23970
+    df = DataFrame(
+        {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]}
+    )
+
+    result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum()
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT])
+def test_shift(fill_value):
+    ct = Categorical(
+        ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False
+    )
+    expected = Categorical(
+        [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False
+    )
+    res = ct.shift(1, fill_value=fill_value)
+    tm.assert_equal(res, expected)
+
+
+@pytest.fixture
+def df_cat(df):
+    """
+    DataFrame with multiple categorical columns and a column of integers.
+    Shortened so as not to contain all possible combinations of categories.
+    Useful for testing `observed` kwarg functionality on GroupBy objects.
+
+    Parameters
+    ----------
+    df: DataFrame
+        Non-categorical, longer DataFrame from another fixture, used to derive
+        this one
+
+    Returns
+    -------
+    df_cat: DataFrame
+    """
+    df_cat = df.copy()[:4]  # leave out some groups
+    df_cat["A"] = df_cat["A"].astype("category")
+    df_cat["B"] = df_cat["B"].astype("category")
+    df_cat["C"] = Series([1, 2, 3, 4])
+    df_cat = df_cat.drop(["D"], axis=1)
+    return df_cat
+
+
+@pytest.mark.parametrize("operation", ["agg", "apply"])
+def test_seriesgroupby_observed_true(df_cat, operation):
+    # GH#24880
+    # GH#49223 - order of results was wrong when grouping by index levels
+    lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
+    lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
+    index = MultiIndex.from_arrays([lev_a, lev_b])
+    expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()
+
+    grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
+    result = getattr(grouped, operation)(sum)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("operation", ["agg", "apply"])
+@pytest.mark.parametrize("observed", [False, None])
+def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
+    # GH 24880
+    # GH#49223 - order of results was wrong when grouping by index levels
+    index, _ = MultiIndex.from_product(
+        [
+            CategoricalIndex(["bar", "foo"], ordered=False),
+            CategoricalIndex(["one", "three", "two"], ordered=False),
+        ],
+        names=["A", "B"],
+    ).sortlevel()
+
+    expected = Series(data=[2, 4, 0, 1, 0, 3], index=index, name="C")
+    grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
+    result = getattr(grouped, operation)(sum)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "observed, index, data",
+    [
+        (
+            True,
+            MultiIndex.from_arrays(
+                [
+                    Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
+                    Index(
+                        ["one", "one", "three", "three", "one", "one", "two", "two"],
+                        dtype="category",
+                        name="B",
+                    ),
+                    Index(["min", "max"] * 4),
+                ]
+            ),
+            [2, 2, 4, 4, 1, 1, 3, 3],
+        ),
+        (
+            False,
+            MultiIndex.from_product(
+                [
+                    CategoricalIndex(["bar", "foo"], ordered=False),
+                    CategoricalIndex(["one", "three", "two"], ordered=False),
+                    Index(["min", "max"]),
+                ],
+                names=["A", "B", None],
+            ),
+            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
+        ),
+        (
+            None,
+            MultiIndex.from_product(
+                [
+                    CategoricalIndex(["bar", "foo"], ordered=False),
+                    CategoricalIndex(["one", "three", "two"], ordered=False),
+                    Index(["min", "max"]),
+                ],
+                names=["A", "B", None],
+            ),
+            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
+        ),
+    ],
+)
+def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data):
+    # GH 24880
+    expected = Series(data=data, index=index, name="C")
+    result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply(
+        lambda x: {"min": x.min(), "max": x.max()}
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_categorical_series_dataframe_consistent(df_cat):
+    # GH 20416
+    expected = df_cat.groupby(["A", "B"], observed=False)["C"].mean()
+    result = df_cat.groupby(["A", "B"], observed=False).mean()["C"]
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_cat_preserves_structure(observed, ordered):
+    # GH 28787
+    df = DataFrame(
+        {"Name": Categorical(["Bob", "Greg"], ordered=ordered), "Item": [1, 2]},
+        columns=["Name", "Item"],
+    )
+    expected = df.copy()
+
+    result = (
+        df.groupby("Name", observed=observed)
+        .agg(DataFrame.sum, skipna=True)
+        .reset_index()
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_nonexistent_category():
+    # Accessing a Category that is not in the dataframe
+    df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
+    with pytest.raises(KeyError, match="'vau'"):
+        df.groupby("var").apply(lambda rows: DataFrame({"val": [rows.iloc[-1]["vau"]]}))
+
+
+def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed):
+    # GH 17605
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup is not truly a reduction")
+
+    df = DataFrame(
+        {
+            "cat_1": Categorical(list("AABB"), categories=list("ABCD")),
+            "cat_2": Categorical(list("AB") * 2, categories=list("ABCD")),
+            "value": [0.1] * 4,
+        }
+    )
+    args = get_groupby_method_args(reduction_func, df)
+
+    expected_length = 4 if observed else 16
+
+    series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
+
+    if reduction_func == "corrwith":
+        # TODO: implemented SeriesGroupBy.corrwith. See GH 32293
+        assert not hasattr(series_groupby, reduction_func)
+        return
+
+    agg = getattr(series_groupby, reduction_func)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            agg(*args)
+        return
+
+    result = agg(*args)
+
+    assert len(result) == expected_length
+
+
+def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
+    reduction_func, request
+):
+    # GH 17605
+    # Tests whether the unobserved categories in the result contain 0 or NaN
+
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup is not truly a reduction")
+
+    if reduction_func == "corrwith":  # GH 32293
+        mark = pytest.mark.xfail(
+            reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
+        )
+        request.applymarker(mark)
+
+    df = DataFrame(
+        {
+            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
+            "cat_2": Categorical(list("AB") * 2, categories=list("ABC")),
+            "value": [0.1] * 4,
+        }
+    )
+    unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
+    args = get_groupby_method_args(reduction_func, df)
+
+    series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
+    agg = getattr(series_groupby, reduction_func)
+
+    if reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            agg(*args)
+        return
+
+    result = agg(*args)
+
+    missing_fillin = _results_for_groupbys_with_missing_categories[reduction_func]
+
+    for idx in unobserved:
+        val = result.loc[idx]
+        assert (pd.isna(missing_fillin) and pd.isna(val)) or (val == missing_fillin)
+
+    # If we expect unobserved values to be zero, we also expect the dtype to be int.
+    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
+    # sums have decimals), then the zeros for the missing categories should also be
+    # floats.
+    if missing_fillin == 0:
+        if reduction_func in ["count", "nunique", "size"]:
+            assert np.issubdtype(result.dtype, np.integer)
+        else:
+            assert reduction_func in ["sum", "any"]
+
+
+def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func):
+    # GH 23865
+    # GH 27075
+    # Ensure that df.groupby, when 'by' is two Categorical variables,
+    # does not return the categories that are not in df when observed=True
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup does not return the Categories on the index")
+
+    df = DataFrame(
+        {
+            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
+            "cat_2": Categorical(list("1111"), categories=list("12")),
+            "value": [0.1, 0.1, 0.1, 0.1],
+        }
+    )
+    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
+
+    df_grp = df.groupby(["cat_1", "cat_2"], observed=True)
+
+    args = get_groupby_method_args(reduction_func, df)
+    if reduction_func == "corrwith":
+        warn = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        warn_msg = ""
+    with tm.assert_produces_warning(warn, match=warn_msg):
+        res = getattr(df_grp, reduction_func)(*args)
+
+    for cat in unobserved_cats:
+        assert cat not in res.index
+
+
+@pytest.mark.parametrize("observed", [False, None])
+def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
+    reduction_func, observed, using_python_scalars
+):
+    # GH 23865
+    # GH 27075
+    # Ensure that df.groupby, when 'by' is two Categorical variables,
+    # returns the categories that are not in df when observed=False/None
+
+    if reduction_func == "ngroup":
+        pytest.skip("ngroup does not return the Categories on the index")
+
+    df = DataFrame(
+        {
+            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
+            "cat_2": Categorical(list("1111"), categories=list("12")),
+            "value": [0.1, 0.1, 0.1, 0.1],
+        }
+    )
+    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
+
+    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)
+
+    args = get_groupby_method_args(reduction_func, df)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            getattr(df_grp, reduction_func)(*args)
+        return
+
+    if reduction_func == "corrwith":
+        warn = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        warn_msg = ""
+    with tm.assert_produces_warning(warn, match=warn_msg):
+        res = getattr(df_grp, reduction_func)(*args)
+
+    expected = _results_for_groupbys_with_missing_categories[reduction_func]
+
+    if using_python_scalars and reduction_func == "size":
+        assert (res.loc[unobserved_cats] == expected).all() is True
+    elif expected is np.nan:
+        assert res.loc[unobserved_cats].isnull().all().all()
+    else:
+        assert (res.loc[unobserved_cats] == expected).all().all()
+
+
+def test_series_groupby_categorical_aggregation_getitem():
+    # GH 8870
+    d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
+    df = DataFrame(d)
+    cat = pd.cut(df["foo"], np.linspace(0, 20, 5))
+    df["range"] = cat
+    groups = df.groupby(["range", "baz"], as_index=True, sort=True, observed=False)
+    result = groups["foo"].agg("mean")
+    expected = groups.agg("mean")["foo"]
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func, expected_values",
+    [(Series.nunique, [1, 1, 2]), (Series.count, [1, 2, 2])],
+)
+def test_groupby_agg_categorical_columns(func, expected_values):
+    # 31256
+    df = DataFrame(
+        {
+            "id": [0, 1, 2, 3, 4],
+            "groups": [0, 1, 1, 2, 2],
+            "value": Categorical([0, 0, 0, 0, 1]),
+        }
+    ).set_index("id")
+    result = df.groupby("groups").agg(func)
+
+    expected = DataFrame(
+        {"value": expected_values}, index=Index([0, 1, 2], name="groups")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_agg_non_numeric():
+    df = DataFrame({"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"])})
+    expected = DataFrame({"A": [2, 1]}, index=np.array([1, 2]))
+
+    result = df.groupby([1, 2, 1]).agg(Series.nunique)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby([1, 2, 1]).nunique()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_groupby_first_returned_categorical_instead_of_dataframe(func):
+    # GH 28641: groupby drops index, when grouping over categorical column with
+    # first/last. Renamed Categorical instead of DataFrame previously.
+    df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()})
+    df_grouped = df.groupby("A")["B"]
+    result = getattr(df_grouped, func)()
+
+    # ordered categorical dtype should be preserved
+    expected = Series(
+        ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_read_only_category_no_sort():
+    # GH33410
+    cats = np.array([1, 2])
+    cats.flags.writeable = False
+    df = DataFrame(
+        {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
+    )
+    expected = DataFrame(data={"a": [2.0, 6.0]}, index=CategoricalIndex(cats, name="b"))
+    result = df.groupby("b", sort=False, observed=False).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_sorted_missing_category_values():
+    # GH 28597
+    df = DataFrame(
+        {
+            "foo": [
+                "small",
+                "large",
+                "large",
+                "large",
+                "medium",
+                "large",
+                "large",
+                "medium",
+            ],
+            "bar": ["C", "A", "A", "C", "A", "C", "A", "C"],
+        }
+    )
+    df["foo"] = (
+        df["foo"]
+        .astype("category")
+        .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True)
+    )
+
+    expected = DataFrame(
+        {
+            "tiny": {"A": 0, "C": 0},
+            "small": {"A": 0, "C": 1},
+            "medium": {"A": 1, "C": 1},
+            "large": {"A": 3, "C": 2},
+        }
+    )
+    expected = expected.rename_axis("bar", axis="index")
+    expected.columns = CategoricalIndex(
+        ["tiny", "small", "medium", "large"],
+        categories=["tiny", "small", "medium", "large"],
+        ordered=True,
+        name="foo",
+        dtype="category",
+    )
+
+    result = df.groupby(["bar", "foo"], observed=False).size().unstack()
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_agg_cython_category_not_implemented_fallback():
+    # https://github.com/pandas-dev/pandas/issues/31450
+    df = DataFrame({"col_num": [1, 1, 2, 3]})
+    df["col_cat"] = df["col_num"].astype("category")
+
+    result = df.groupby("col_num").col_cat.first()
+
+    # ordered categorical dtype should definitely be preserved;
+    #  this is unordered, so is less-clear case (if anything, it should raise)
+    expected = Series(
+        [1, 2, 3],
+        index=Index([1, 2, 3], name="col_num"),
+        name="col_cat",
+        dtype=df["col_cat"].dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = df.groupby("col_num").agg({"col_cat": "first"})
+    expected = expected.to_frame()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_aggregate_categorical_with_isnan():
+    # GH 29837
+    df = DataFrame(
+        {
+            "A": [1, 1, 1, 1],
+            "B": [1, 2, 1, 2],
+            "numerical_col": [0.1, 0.2, np.nan, 0.3],
+            "object_col": ["foo", "bar", "foo", "fee"],
+            "categorical_col": ["foo", "bar", "foo", "fee"],
+        }
+    )
+
+    df = df.astype({"categorical_col": "category"})
+
+    result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum())
+    index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
+    expected = DataFrame(
+        data={
+            "numerical_col": [1, 0],
+            "object_col": [0, 0],
+            "categorical_col": [0, 0],
+        },
+        index=index,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_transform():
+    # GH 29037
+    df = DataFrame(
+        {
+            "package_id": [1, 1, 1, 2, 2, 3],
+            "status": [
+                "Waiting",
+                "OnTheWay",
+                "Delivered",
+                "Waiting",
+                "OnTheWay",
+                "Waiting",
+            ],
+        }
+    )
+
+    delivery_status_type = pd.CategoricalDtype(
+        categories=["Waiting", "OnTheWay", "Delivered"], ordered=True
+    )
+    df["status"] = df["status"].astype(delivery_status_type)
+    df["last_status"] = df.groupby("package_id")["status"].transform(max)
+    result = df.copy()
+
+    expected = DataFrame(
+        {
+            "package_id": [1, 1, 1, 2, 2, 3],
+            "status": [
+                "Waiting",
+                "OnTheWay",
+                "Delivered",
+                "Waiting",
+                "OnTheWay",
+                "Waiting",
+            ],
+            "last_status": [
+                "Waiting",
+                "Waiting",
+                "Waiting",
+                "Waiting",
+                "Waiting",
+                "Waiting",
+            ],
+        }
+    )
+
+    expected["status"] = expected["status"].astype(delivery_status_type)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
+    func: str, observed: bool
+):
+    # GH 34951
+    cat = Categorical([0, 0, 1, 1])
+    val = [0, 1, 1, 0]
+    df = DataFrame({"a": cat, "b": cat, "c": val})
+
+    cat2 = Categorical([0, 1])
+    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
+    expected_dict = {
+        "first": Series([0, np.nan, np.nan, 1], idx, name="c"),
+        "last": Series([1, np.nan, np.nan, 0], idx, name="c"),
+    }
+
+    expected = expected_dict[func]
+    if observed:
+        expected = expected.dropna().astype(np.int64)
+
+    srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
+    result = getattr(srs_grp, func)()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
+    func: str, observed: bool
+):
+    # GH 34951
+    cat = Categorical([0, 0, 1, 1])
+    val = [0, 1, 1, 0]
+    df = DataFrame({"a": cat, "b": cat, "c": val})
+
+    cat2 = Categorical([0, 1])
+    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
+    expected_dict = {
+        "first": Series([0, np.nan, np.nan, 1], idx, name="c"),
+        "last": Series([1, np.nan, np.nan, 0], idx, name="c"),
+    }
+
+    expected = expected_dict[func].to_frame()
+    if observed:
+        expected = expected.dropna().astype(np.int64)
+
+    df_grp = df.groupby(["a", "b"], observed=observed)
+    result = getattr(df_grp, func)()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_categorical_indices_unused_categories():
+    # GH#38642
+    df = DataFrame(
+        {
+            "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
+            "col": range(3),
+        }
+    )
+    grouped = df.groupby("key", sort=False, observed=False)
+    result = grouped.indices
+    expected = {
+        "b": np.array([0, 1], dtype="intp"),
+        "a": np.array([2], dtype="intp"),
+        "c": np.array([], dtype="intp"),
+    }
+    assert result.keys() == expected.keys()
+    for key in result.keys():
+        tm.assert_numpy_array_equal(result[key], expected[key])
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_groupby_last_first_preserve_categoricaldtype(func):
+    # GH#33090
+    df = DataFrame({"a": [1, 2, 3]})
+    df["b"] = df["a"].astype("category")
+    result = getattr(df.groupby("a")["b"], func)()
+    expected = Series(
+        Categorical([1, 2, 3]), name="b", index=Index([1, 2, 3], name="a")
+    )
+    tm.assert_series_equal(expected, result)
+
+
+def test_groupby_categorical_observed_nunique():
+    # GH#45128
+    df = DataFrame({"a": [1, 2], "b": [1, 2], "c": [10, 11]})
+    df = df.astype(dtype={"a": "category", "b": "category"})
+    result = df.groupby(["a", "b"], observed=True).nunique()["c"]
+    expected = Series(
+        [1, 1],
+        index=MultiIndex.from_arrays(
+            [CategoricalIndex([1, 2], name="a"), CategoricalIndex([1, 2], name="b")]
+        ),
+        name="c",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_categorical_aggregate_functions():
+    # GH#37275
+    dtype = pd.CategoricalDtype(categories=["small", "big"], ordered=True)
+    df = DataFrame(
+        [[1, "small"], [1, "big"], [2, "small"]], columns=["grp", "description"]
+    ).astype({"description": dtype})
+
+    result = df.groupby("grp")["description"].max()
+    expected = Series(
+        ["big", "small"],
+        index=Index([1, 2], name="grp"),
+        name="description",
+        dtype=pd.CategoricalDtype(categories=["small", "big"], ordered=True),
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_categorical_dropna(observed, dropna):
+    # GH#48645 - dropna should have no impact on the result when there are no NA values
+    cat = Categorical([1, 2], categories=[1, 2, 3])
+    df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]})
+    gb = df.groupby("x", observed=observed, dropna=dropna)
+    result = gb.sum()
+
+    if observed:
+        expected = DataFrame({"y": [3, 4]}, index=cat)
+    else:
+        index = CategoricalIndex([1, 2, 3], [1, 2, 3])
+        expected = DataFrame({"y": [3, 4, 0]}, index=index)
+    expected.index.name = "x"
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
+@pytest.mark.parametrize("ordered", [True, False])
+def test_category_order_reducer(
+    request, as_index, sort, observed, reduction_func, index_kind, ordered
+):
+    # GH#48749
+    if reduction_func == "corrwith" and not as_index and index_kind != "single":
+        msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
+        request.applymarker(pytest.mark.xfail(reason=msg))
+    elif index_kind != "range" and not as_index:
+        pytest.skip(reason="Result doesn't have categories, nothing to test")
+    df = DataFrame(
+        {
+            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
+            "b": range(4),
+        }
+    )
+    if index_kind == "range":
+        keys = ["a"]
+    elif index_kind == "single":
+        keys = ["a"]
+        df = df.set_index(keys)
+    elif index_kind == "multi":
+        keys = ["a", "a2"]
+        df["a2"] = df["a"]
+        df = df.set_index(keys)
+    args = get_groupby_method_args(reduction_func, df)
+    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        # idxmin and idxmax are designed to fail on empty inputs
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            getattr(gb, reduction_func)(*args)
+        return
+    if reduction_func == "corrwith":
+        warn = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        warn_msg = ""
+    with tm.assert_produces_warning(warn, match=warn_msg):
+        op_result = getattr(gb, reduction_func)(*args)
+    if as_index:
+        result = op_result.index.get_level_values("a").categories
+    else:
+        result = op_result["a"].cat.categories
+    expected = Index([1, 4, 3, 2])
+    tm.assert_index_equal(result, expected)
+
+    if index_kind == "multi":
+        result = op_result.index.get_level_values("a2").categories
+        tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["single", "multi"])
+@pytest.mark.parametrize("ordered", [True, False])
+def test_category_order_transformer(
+    as_index, sort, observed, transformation_func, index_kind, ordered
+):
+    # GH#48749
+    df = DataFrame(
+        {
+            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
+            "b": range(4),
+        }
+    )
+    if index_kind == "single":
+        keys = ["a"]
+        df = df.set_index(keys)
+    elif index_kind == "multi":
+        keys = ["a", "a2"]
+        df["a2"] = df["a"]
+        df = df.set_index(keys)
+    args = get_groupby_method_args(transformation_func, df)
+    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
+    op_result = getattr(gb, transformation_func)(*args)
+    result = op_result.index.get_level_values("a").categories
+    expected = Index([1, 4, 3, 2])
+    tm.assert_index_equal(result, expected)
+
+    if index_kind == "multi":
+        result = op_result.index.get_level_values("a2").categories
+        tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
+@pytest.mark.parametrize("method", ["head", "tail"])
+@pytest.mark.parametrize("ordered", [True, False])
+def test_category_order_head_tail(
+    as_index, sort, observed, method, index_kind, ordered
+):
+    # GH#48749
+    df = DataFrame(
+        {
+            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
+            "b": range(4),
+        }
+    )
+    if index_kind == "range":
+        keys = ["a"]
+    elif index_kind == "single":
+        keys = ["a"]
+        df = df.set_index(keys)
+    elif index_kind == "multi":
+        keys = ["a", "a2"]
+        df["a2"] = df["a"]
+        df = df.set_index(keys)
+    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
+    op_result = getattr(gb, method)()
+    if index_kind == "range":
+        result = op_result["a"].cat.categories
+    else:
+        result = op_result.index.get_level_values("a").categories
+    expected = Index([1, 4, 3, 2])
+    tm.assert_index_equal(result, expected)
+
+    if index_kind == "multi":
+        result = op_result.index.get_level_values("a2").categories
+        tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
+@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
+@pytest.mark.parametrize("ordered", [True, False])
+def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered):
+    # GH#48749
+    if (method == "transform" and index_kind == "range") or (
+        not as_index and index_kind != "range"
+    ):
+        pytest.skip("No categories in result, nothing to test")
+    df = DataFrame(
+        {
+            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
+            "b": range(4),
+        }
+    )
+    if index_kind == "range":
+        keys = ["a"]
+    elif index_kind == "single":
+        keys = ["a"]
+        df = df.set_index(keys)
+    elif index_kind == "multi":
+        keys = ["a", "a2"]
+        df["a2"] = df["a"]
+        df = df.set_index(keys)
+    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
+    op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))
+    if (method == "transform" or not as_index) and index_kind == "range":
+        result = op_result["a"].cat.categories
+    else:
+        result = op_result.index.get_level_values("a").categories
+    expected = Index([1, 4, 3, 2])
+    tm.assert_index_equal(result, expected)
+
+    if index_kind == "multi":
+        result = op_result.index.get_level_values("a2").categories
+        tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
+def test_many_categories(as_index, sort, index_kind, ordered):
+    # GH#48749 - Test when the grouper has many categories
+    if index_kind != "range" and not as_index:
+        pytest.skip(reason="Result doesn't have categories, nothing to test")
+    categories = np.arange(9999, -1, -1)
+    grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
+    df = DataFrame({"a": grouper, "b": range(4)})
+    if index_kind == "range":
+        keys = ["a"]
+    elif index_kind == "single":
+        keys = ["a"]
+        df = df.set_index(keys)
+    elif index_kind == "multi":
+        keys = ["a", "a2"]
+        df["a2"] = df["a"]
+        df = df.set_index(keys)
+    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=True)
+    result = gb.sum()
+
+    # Test is setup so that data and index are the same values
+    data = [3, 2, 1] if sort else [2, 1, 3]
+
+    index = CategoricalIndex(
+        data, categories=grouper.categories, ordered=ordered, name="a"
+    )
+    if as_index:
+        expected = DataFrame({"b": data})
+        if index_kind == "multi":
+            expected.index = MultiIndex.from_frame(DataFrame({"a": index, "a2": index}))
+        else:
+            expected.index = index
+    elif index_kind == "multi":
+        expected = DataFrame({"a": Series(index), "a2": Series(index), "b": data})
+    else:
+        expected = DataFrame({"a": Series(index), "b": data})
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
+def test_agg_list(request, as_index, observed, reduction_func, test_series, keys):
+    # GH#52760
+    if test_series and reduction_func == "corrwith":
+        assert not hasattr(SeriesGroupBy, "corrwith")
+        pytest.skip("corrwith not implemented for SeriesGroupBy")
+    elif reduction_func == "corrwith":
+        msg = "GH#32293: attempts to call SeriesGroupBy.corrwith"
+        request.applymarker(pytest.mark.xfail(reason=msg))
+
+    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
+    df = df.astype({"a1": "category", "a2": "category"})
+    if "a2" not in keys:
+        df = df.drop(columns="a2")
+    gb = df.groupby(by=keys, as_index=as_index, observed=observed)
+    if test_series:
+        gb = gb["b"]
+    args = get_groupby_method_args(reduction_func, df)
+
+    if not observed and reduction_func in ["idxmin", "idxmax"] and keys == ["a1", "a2"]:
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            gb.agg([reduction_func], *args)
+        return
+
+    result = gb.agg([reduction_func], *args)
+    expected = getattr(gb, reduction_func)(*args)
+
+    if as_index and (test_series or reduction_func == "size"):
+        expected = expected.to_frame(reduction_func)
+    if not test_series:
+        expected.columns = MultiIndex.from_tuples(
+            [(ind, "") for ind in expected.columns[:-1]] + [("b", reduction_func)]
+        )
+    elif not as_index:
+        expected.columns = [*keys, reduction_func]
+
+    tm.assert_equal(result, expected)
+
+
+def test_categorical_with_noncategorical_na(observed, sort):
+    # https://github.com/pandas-dev/pandas/issues/63920
+    df = DataFrame(
+        {
+            "dates": list("YXXYY"),
+            "sector": Categorical(
+                [2, 1, 2, 1, np.nan], categories=[1, 2, 3], ordered=True
+            ),
+            "metric": [1, 2, 3, 4, 5],
+        }
+    )
+    gb = df.groupby(["dates", "sector"], observed=observed, sort=sort)
+    # Only testing the ids/result_index, okay to just use one kernel
+    result = gb.sum()
+
+    if sort and observed:
+        taker = [0, 1, 2, 3]
+    elif not sort and observed:
+        taker = [3, 0, 1, 2]
+    elif sort and not observed:
+        taker = [0, 1, 4, 2, 3, 5]
+    elif not sort and not observed:
+        taker = [3, 0, 1, 2, 5, 4]
+    expected = (
+        DataFrame(
+            {
+                "dates": list("XXYYXY"),
+                "sector": Categorical(
+                    [1, 2, 1, 2, 3, 3], categories=[1, 2, 3], ordered=True
+                ),
+                "metric": [2, 3, 4, 1, 0, 0],
+            }
+        )
+        .set_index(["dates", "sector"])
+        .take(taker)
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_counting.py b/pandas/tests/groupby/test_counting.py
new file mode 100644
index 0000000000000000000000000000000000000000..679f7eb7f7f11d842dc36c9c6cb83d2096fb66b2
--- /dev/null
+++ b/pandas/tests/groupby/test_counting.py
@@ -0,0 +1,394 @@
+from itertools import product
+from string import ascii_lowercase
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Period,
+    Series,
+    Timedelta,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+
+
+class TestCounting:
+    def test_cumcount(self):
+        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3])
+
+        tm.assert_series_equal(expected, g.cumcount())
+        tm.assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_empty(self):
+        ge = DataFrame().groupby(level=0)
+        se = Series(dtype=object).groupby(level=0)
+
+        # edge case, as this is usually considered float
+        e = Series(dtype="int64")
+
+        tm.assert_series_equal(e, ge.cumcount())
+        tm.assert_series_equal(e, se.cumcount())
+
+    def test_cumcount_dupe_index(self):
+        df = DataFrame(
+            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
+        )
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+        tm.assert_series_equal(expected, g.cumcount())
+        tm.assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_mi(self):
+        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+        df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=mi)
+
+        tm.assert_series_equal(expected, g.cumcount())
+        tm.assert_series_equal(expected, sg.cumcount())
+
+    def test_cumcount_groupby_not_col(self):
+        df = DataFrame(
+            [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
+        )
+        g = df.groupby([0, 0, 0, 1, 0])
+        sg = g.A
+
+        expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
+
+        tm.assert_series_equal(expected, g.cumcount())
+        tm.assert_series_equal(expected, sg.cumcount())
+
+    def test_ngroup(self):
+        df = DataFrame({"A": list("aaaba")})
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0])
+
+        tm.assert_series_equal(expected, g.ngroup())
+        tm.assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_distinct(self):
+        df = DataFrame({"A": list("abcde")})
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series(range(5), dtype="int64")
+
+        tm.assert_series_equal(expected, g.ngroup())
+        tm.assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_one_group(self):
+        df = DataFrame({"A": [0] * 5})
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series([0] * 5)
+
+        tm.assert_series_equal(expected, g.ngroup())
+        tm.assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_empty(self):
+        ge = DataFrame().groupby(level=0)
+        se = Series(dtype=object).groupby(level=0)
+
+        # edge case, as this is usually considered float
+        e = Series(dtype="int64")
+
+        tm.assert_series_equal(e, ge.ngroup())
+        tm.assert_series_equal(e, se.ngroup())
+
+    def test_ngroup_series_matches_frame(self):
+        df = DataFrame({"A": list("aaaba")})
+        s = Series(list("aaaba"))
+
+        tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
+
+    def test_ngroup_dupe_index(self):
+        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
+        g = df.groupby("A")
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+        tm.assert_series_equal(expected, g.ngroup())
+        tm.assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_mi(self):
+        mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
+        df = DataFrame({"A": list("aaaba")}, index=mi)
+        g = df.groupby("A")
+        sg = g.A
+        expected = Series([0, 0, 0, 1, 0], index=mi)
+
+        tm.assert_series_equal(expected, g.ngroup())
+        tm.assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_groupby_not_col(self):
+        df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
+        g = df.groupby([0, 0, 0, 1, 0])
+        sg = g.A
+
+        expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
+
+        tm.assert_series_equal(expected, g.ngroup())
+        tm.assert_series_equal(expected, sg.ngroup())
+
+    def test_ngroup_descending(self):
+        df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
+        g = df.groupby(["A"])
+
+        ascending = Series([0, 0, 1, 0, 1])
+        descending = Series([1, 1, 0, 1, 0])
+
+        tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
+        tm.assert_series_equal(ascending, g.ngroup(ascending=True))
+        tm.assert_series_equal(descending, g.ngroup(ascending=False))
+
+    def test_ngroup_matches_cumcount(self):
+        # verify one manually-worked out case works
+        df = DataFrame(
+            [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
+            columns=["A", "X"],
+        )
+        g = df.groupby(["A", "X"])
+        g_ngroup = g.ngroup()
+        g_cumcount = g.cumcount()
+        expected_ngroup = Series([0, 1, 2, 0, 3])
+        expected_cumcount = Series([0, 0, 0, 1, 0])
+
+        tm.assert_series_equal(g_ngroup, expected_ngroup)
+        tm.assert_series_equal(g_cumcount, expected_cumcount)
+
+    def test_ngroup_cumcount_pair(self):
+        # brute force comparison for all small series
+        for p in product(range(3), repeat=4):
+            df = DataFrame({"a": p})
+            g = df.groupby(["a"])
+
+            order = sorted(set(p))
+            ngroupd = [order.index(val) for val in p]
+            cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
+
+            tm.assert_series_equal(g.ngroup(), Series(ngroupd))
+            tm.assert_series_equal(g.cumcount(), Series(cumcounted))
+
+    def test_ngroup_respects_groupby_order(self, sort):
+        df = DataFrame({"a": np.random.default_rng(2).choice(list("abcdef"), 100)})
+        g = df.groupby("a", sort=sort)
+        df["group_id"] = -1
+        df["group_index"] = -1
+
+        for i, (_, group) in enumerate(g):
+            df.loc[group.index, "group_id"] = i
+            for j, ind in enumerate(group.index):
+                df.loc[ind, "group_index"] = j
+
+        tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
+        tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())
+
+    @pytest.mark.parametrize(
+        "datetimelike",
+        [
+            [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
+            [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
+            [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
+            [Timedelta(x, unit="h") for x in range(1, 4)],
+            [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
+        ],
+    )
+    def test_count_with_datetimelike(self, datetimelike):
+        # test for #13393, where DataframeGroupBy.count() fails
+        # when counting a datetimelike column.
+
+        df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
+        res = df.groupby("x").count()
+        expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
+        expected.index.name = "x"
+        tm.assert_frame_equal(expected, res)
+
+    def test_count_with_only_nans_in_first_group(self):
+        # GH21956
+        df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
+        result = df.groupby(["A", "B"]).C.count()
+        mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
+        expected = Series([], index=mi, dtype=np.int64, name="C")
+        tm.assert_series_equal(result, expected, check_index_type=False)
+
+    def test_count_groupby_column_with_nan_in_groupby_column(self):
+        # https://github.com/pandas-dev/pandas/issues/32841
+        df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.nan, 3, 0]})
+        res = df.groupby(["B"]).count()
+        expected = DataFrame(
+            index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
+        )
+        tm.assert_frame_equal(expected, res)
+
+    def test_groupby_count_dateparseerror(self):
+        dr = date_range(start="1/1/2012", freq="5min", periods=10)
+
+        # BAD Example, datetimes first
+        ser = Series(np.arange(10), index=[dr, np.arange(10)])
+        grouped = ser.groupby(lambda x: x[1] % 2 == 0)
+        result = grouped.count()
+
+        ser = Series(np.arange(10), index=[np.arange(10), dr])
+        grouped = ser.groupby(lambda x: x[0] % 2 == 0)
+        expected = grouped.count()
+
+        tm.assert_series_equal(result, expected)
+
+
+def test_groupby_timedelta_cython_count():
+    df = DataFrame(
+        {"g": list("ab" * 2), "delta": np.arange(4).astype("timedelta64[ns]")}
+    )
+    expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delta")
+    result = df.groupby("g").delta.count()
+    tm.assert_series_equal(expected, result)
+
+
+def test_count():
+    n = 1 << 15
+    dr = date_range("2015-08-30", periods=n // 10, freq="min")
+
+    df = DataFrame(
+        {
+            "1st": np.random.default_rng(2).choice(list(ascii_lowercase), n),
+            "2nd": np.random.default_rng(2).integers(0, 5, n),
+            "3rd": np.random.default_rng(2).standard_normal(n).round(3),
+            "4th": np.random.default_rng(2).integers(-10, 10, n),
+            "5th": np.random.default_rng(2).choice(dr, n),
+            "6th": np.random.default_rng(2).standard_normal(n).round(3),
+            "7th": np.random.default_rng(2).standard_normal(n).round(3),
+            "8th": np.random.default_rng(2).choice(dr, n)
+            - np.random.default_rng(2).choice(dr, 1),
+            "9th": np.random.default_rng(2).choice(list(ascii_lowercase), n),
+        }
+    )
+
+    for col in df.columns.drop(["1st", "2nd", "4th"]):
+        df.loc[np.random.default_rng(2).choice(n, n // 10), col] = np.nan
+
+    df["9th"] = df["9th"].astype("category")
+
+    for key in ["1st", "2nd", ["1st", "2nd"]]:
+        left = df.groupby(key).count()
+        right = df.groupby(key).apply(DataFrame.count)
+        tm.assert_frame_equal(left, right)
+
+
+def test_count_non_nulls():
+    # GH#5610
+    # count counts non-nulls
+    df = DataFrame(
+        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
+        columns=["A", "B", "C"],
+    )
+
+    count_as = df.groupby("A").count()
+    count_not_as = df.groupby("A", as_index=False).count()
+
+    expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
+    expected.index.name = "A"
+    tm.assert_frame_equal(count_not_as, expected.reset_index())
+    tm.assert_frame_equal(count_as, expected)
+
+    count_B = df.groupby("A")["B"].count()
+    tm.assert_series_equal(count_B, expected["B"])
+
+
+def test_count_object():
+    df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
+    result = df.groupby("c").a.count()
+    expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
+    tm.assert_series_equal(result, expected)
+
+
+def test_count_object_nan():
+    df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
+    result = df.groupby("c").a.count()
+    expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("typ", ["object", "float32"])
+def test_count_cross_type(typ):
+    # GH8169
+    # Set float64 dtype to avoid upcast when setting nan below
+    vals = np.hstack(
+        (
+            np.random.default_rng(2).integers(0, 5, (10, 2)),
+            np.random.default_rng(2).integers(0, 2, (10, 2)),
+        )
+    ).astype("float64")
+
+    df = DataFrame(vals, columns=["a", "b", "c", "d"])
+    df[df == 2] = np.nan
+    expected = df.groupby(["c", "d"]).count()
+
+    df["a"] = df["a"].astype(typ)
+    df["b"] = df["b"].astype(typ)
+    result = df.groupby(["c", "d"]).count()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_lower_int_prec_count():
+    df = DataFrame(
+        {
+            "a": np.array([0, 1, 2, 100], np.int8),
+            "b": np.array([1, 2, 3, 6], np.uint32),
+            "c": np.array([4, 5, 6, 8], np.int16),
+            "grp": list("ab" * 2),
+        }
+    )
+    result = df.groupby("grp").count()
+    expected = DataFrame(
+        {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_count_uses_size_on_exception():
+    class RaisingObjectException(Exception):
+        pass
+
+    class RaisingObject:
+        def __init__(self, msg="I will raise inside Cython") -> None:
+            super().__init__()
+            self.msg = msg
+
+        def __eq__(self, other):
+            # gets called in Cython to check that raising calls the method
+            raise RaisingObjectException(self.msg)
+
+    df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
+    result = df.groupby("grp").count()
+    expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_count_arrow_string_array(any_string_dtype):
+    # GH#54751
+    pytest.importorskip("pyarrow")
+    df = DataFrame(
+        {"a": [1, 2, 3], "b": Series(["a", "b", "a"], dtype=any_string_dtype)}
+    )
+    result = df.groupby("a").count()
+    expected = DataFrame({"b": 1}, index=Index([1, 2, 3], name="a"))
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_cumulative.py b/pandas/tests/groupby/test_cumulative.py
new file mode 100644
index 0000000000000000000000000000000000000000..cca4971e930b416fb5cbbacd704d6f171ebdfeee
--- /dev/null
+++ b/pandas/tests/groupby/test_cumulative.py
@@ -0,0 +1,332 @@
+import numpy as np
+import pytest
+
+from pandas.errors import UnsupportedFunctionCall
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture(
+    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
+    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
+)
+def dtypes_for_minmax(request):
+    """
+    Fixture of dtypes with min and max values used for testing
+    cummin and cummax
+    """
+    dtype = request.param
+
+    np_type = dtype
+    if dtype == "Int64":
+        np_type = np.int64
+    elif dtype == "Float64":
+        np_type = np.float64
+
+    min_val = (
+        np.iinfo(np_type).min
+        if np.dtype(np_type).kind == "i"
+        else np.finfo(np_type).min
+    )
+    max_val = (
+        np.iinfo(np_type).max
+        if np.dtype(np_type).kind == "i"
+        else np.finfo(np_type).max
+    )
+
+    return (dtype, min_val, max_val)
+
+
+def test_groupby_cumprod():
+    # GH 4095
+    df = DataFrame({"key": ["b"] * 10, "value": 2})
+
+    actual = df.groupby("key")["value"].cumprod()
+    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
+    expected.name = "value"
+    tm.assert_series_equal(actual, expected)
+
+    df = DataFrame({"key": ["b"] * 100, "value": 2})
+    df["value"] = df["value"].astype(float)
+    actual = df.groupby("key")["value"].cumprod()
+    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
+    expected.name = "value"
+    tm.assert_series_equal(actual, expected)
+
+
+def test_groupby_cumprod_overflow():
+    # GH#37493 if we overflow we return garbage consistent with numpy
+    df = DataFrame({"key": ["b"] * 4, "value": 100_000})
+    actual = df.groupby("key")["value"].cumprod()
+    expected = Series(
+        [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
+        name="value",
+    )
+    tm.assert_series_equal(actual, expected)
+
+    numpy_result = df.groupby("key", group_keys=False)["value"].apply(
+        lambda x: x.cumprod()
+    )
+    numpy_result.name = "value"
+    tm.assert_series_equal(actual, numpy_result)
+
+
+def test_groupby_cumprod_nan_influences_other_columns():
+    # GH#48064
+    df = DataFrame(
+        {
+            "a": 1,
+            "b": [1, np.nan, 2],
+            "c": [1, 2, 3.0],
+        }
+    )
+    result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
+    expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cummin(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+
+    df = base_df.astype(dtype)
+    expected = DataFrame({"B": expected_mins}).astype(dtype)
+    result = df.groupby("A").cummin()
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cummin_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    min_val = dtypes_for_minmax[1]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
+    expected = DataFrame({"B": expected_mins}).astype(dtype)
+    df = base_df.astype(dtype)
+    df.loc[[2, 6], "B"] = min_val
+    df.loc[[1, 5], "B"] = min_val + 1
+    expected.loc[[2, 3, 6, 7], "B"] = min_val
+    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
+    result = df.groupby("A").cummin()
+    tm.assert_frame_equal(result, expected, check_exact=True)
+    expected = (
+        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
+    )
+    tm.assert_frame_equal(result, expected, check_exact=True)
+
+
+def test_cummin_nan_in_some_values(dtypes_for_minmax):
+    # Explicit cast to float to avoid implicit cast when setting nan
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    base_df = base_df.astype({"B": "float"})
+    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
+    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
+    result = base_df.groupby("A").cummin()
+    tm.assert_frame_equal(result, expected)
+    expected = (
+        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cummin_datetime():
+    # GH 15561
+    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
+    expected = Series(pd.to_datetime("2001"), index=[0], name="b")
+
+    result = df.groupby("a")["b"].cummin()
+    tm.assert_series_equal(expected, result)
+
+
+def test_cummin_getattr_series():
+    # GH 15635
+    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
+    result = df.groupby("a").b.cummin()
+    expected = Series([1, 2, 1], name="b")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["cummin", "cummax"])
+@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
+def test_cummin_max_all_nan_column(method, dtype):
+    item = np.nan if dtype == "float" else pd.NA
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [item] * 8})
+    base_df["B"] = base_df["B"].astype(dtype)
+    grouped = base_df.groupby("A")
+
+    expected = DataFrame({"B": [item] * 8}, dtype=dtype)
+    result = getattr(grouped, method)()
+    tm.assert_frame_equal(expected, result)
+
+    result = getattr(grouped["B"], method)().to_frame()
+    tm.assert_frame_equal(expected, result)
+
+
+def test_cummax(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
+
+    df = base_df.astype(dtype)
+
+    expected = DataFrame({"B": expected_maxs}).astype(dtype)
+    result = df.groupby("A").cummax()
+    tm.assert_frame_equal(result, expected)
+    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cummax_min_value_for_dtype(dtypes_for_minmax):
+    dtype = dtypes_for_minmax[0]
+    max_val = dtypes_for_minmax[2]
+
+    # GH 15048
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]
+
+    df = base_df.astype(dtype)
+    df.loc[[2, 6], "B"] = max_val
+    expected = DataFrame({"B": expected_maxs}).astype(dtype)
+    expected.loc[[2, 3, 6, 7], "B"] = max_val
+    result = df.groupby("A").cummax()
+    tm.assert_frame_equal(result, expected)
+    expected = (
+        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cummax_nan_in_some_values(dtypes_for_minmax):
+    # Test nan in some values
+    # Explicit cast to float to avoid implicit cast when setting nan
+    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
+    base_df = base_df.astype({"B": "float"})
+    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
+    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
+    result = base_df.groupby("A").cummax()
+    tm.assert_frame_equal(result, expected)
+    expected = (
+        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cummax_datetime():
+    # GH 15561
+    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
+    expected = Series(pd.to_datetime("2001"), index=[0], name="b")
+
+    result = df.groupby("a")["b"].cummax()
+    tm.assert_series_equal(expected, result)
+
+
+def test_cummax_getattr_series():
+    # GH 15635
+    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
+    result = df.groupby("a").b.cummax()
+    expected = Series([2, 1, 2], name="b")
+    tm.assert_series_equal(result, expected)
+
+
+def test_cummax_i8_at_implementation_bound():
+    # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
+    #  for int64 dtype GH#46382
+    ser = Series([pd.NaT._value + n for n in range(5)])
+    df = DataFrame({"A": 1, "B": ser, "C": ser._values.view("M8[ns]")})
+    gb = df.groupby("A")
+
+    res = gb.cummax()
+    exp = df[["B", "C"]]
+    tm.assert_frame_equal(res, exp)
+
+
+@pytest.mark.parametrize("method", ["cummin", "cummax"])
+@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
+@pytest.mark.parametrize(
+    "groups,expected_data",
+    [
+        ([1, 1, 1], [1, None, None]),
+        ([1, 2, 3], [1, None, 2]),
+        ([1, 3, 3], [1, None, None]),
+    ],
+)
+def test_cummin_max_skipna(method, dtype, groups, expected_data):
+    # GH-34047
+    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
+    orig = df.copy()
+    gb = df.groupby(groups)["a"]
+
+    result = getattr(gb, method)(skipna=False)
+    expected = Series(expected_data, dtype=dtype, name="a")
+
+    # check we didn't accidentally alter df
+    tm.assert_frame_equal(df, orig)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["cummin", "cummax"])
+def test_cummin_max_skipna_multiple_cols(method):
+    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
+    df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
+    gb = df.groupby([1, 1, 1])[["a", "b"]]
+
+    result = getattr(gb, method)(skipna=False)
+    expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
+def test_numpy_compat(func):
+    # see gh-12811
+    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
+    g = df.groupby("A")
+
+    msg = "numpy operations are not valid with groupby"
+
+    with pytest.raises(UnsupportedFunctionCall, match=msg):
+        getattr(g, func)(1, 2, 3)
+    with pytest.raises(UnsupportedFunctionCall, match=msg):
+        getattr(g, func)(foo=1)
+
+
+@td.skip_if_32bit
+@pytest.mark.parametrize("method", ["cummin", "cummax"])
+@pytest.mark.parametrize(
+    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
+)
+def test_nullable_int_not_cast_as_float(method, dtype, val):
+    data = [val, pd.NA]
+    df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
+    grouped = df.groupby("grp")
+
+    result = grouped.transform(method)
+    expected = DataFrame({"b": data}, dtype=dtype)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cython_api2(as_index):
+    # this takes the fast apply path
+
+    # cumsum (GH5614)
+    # GH 5755 - cumsum is a transformer and should ignore as_index
+    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
+    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
+    result = df.groupby("A", as_index=as_index).cumsum()
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py
new file mode 100644
index 0000000000000000000000000000000000000000..c20fc9e3d62e77949383e9fd47080d0e23f94875
--- /dev/null
+++ b/pandas/tests/groupby/test_filters.py
@@ -0,0 +1,638 @@
+from string import ascii_lowercase
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+    Timestamp,
+)
+import pandas._testing as tm
+
+
+def test_filter_series():
+    s = Series([1, 3, 20, 5, 22, 24, 7])
+    expected_odd = Series([1, 3, 5, 7], index=[0, 1, 3, 6])
+    expected_even = Series([20, 22, 24], index=[2, 4, 5])
+    grouper = s.apply(lambda x: x % 2)
+    grouped = s.groupby(grouper)
+    tm.assert_series_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
+    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
+    # Test dropna=False.
+    tm.assert_series_equal(
+        grouped.filter(lambda x: x.mean() < 10, dropna=False),
+        expected_odd.reindex(s.index),
+    )
+    tm.assert_series_equal(
+        grouped.filter(lambda x: x.mean() > 10, dropna=False),
+        expected_even.reindex(s.index),
+    )
+
+
+def test_filter_single_column_df():
+    df = DataFrame([1, 3, 20, 5, 22, 24, 7])
+    expected_odd = DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6])
+    expected_even = DataFrame([20, 22, 24], index=[2, 4, 5])
+    grouper = df[0].apply(lambda x: x % 2)
+    grouped = df.groupby(grouper)
+    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() < 10), expected_odd)
+    tm.assert_frame_equal(grouped.filter(lambda x: x.mean() > 10), expected_even)
+    # Test dropna=False.
+    tm.assert_frame_equal(
+        grouped.filter(lambda x: x.mean() < 10, dropna=False),
+        expected_odd.reindex(df.index),
+    )
+    tm.assert_frame_equal(
+        grouped.filter(lambda x: x.mean() > 10, dropna=False),
+        expected_even.reindex(df.index),
+    )
+
+
+def test_filter_multi_column_df():
+    df = DataFrame({"A": [1, 12, 12, 1], "B": [1, 1, 1, 1]})
+    grouper = df["A"].apply(lambda x: x % 2)
+    grouped = df.groupby(grouper)
+    expected = DataFrame({"A": [12, 12], "B": [1, 1]}, index=[1, 2])
+    tm.assert_frame_equal(
+        grouped.filter(lambda x: x["A"].sum() - x["B"].sum() > 10), expected
+    )
+
+
+def test_filter_mixed_df():
+    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
+    grouper = df["A"].apply(lambda x: x % 2)
+    grouped = df.groupby(grouper)
+    expected = DataFrame({"A": [12, 12], "B": ["b", "c"]}, index=[1, 2])
+    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 10), expected)
+
+
+def test_filter_out_all_groups():
+    s = Series([1, 3, 20, 5, 22, 24, 7])
+    grouper = s.apply(lambda x: x % 2)
+    grouped = s.groupby(grouper)
+    tm.assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]])
+    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
+    grouper = df["A"].apply(lambda x: x % 2)
+    grouped = df.groupby(grouper)
+    tm.assert_frame_equal(grouped.filter(lambda x: x["A"].sum() > 1000), df.loc[[]])
+
+
+def test_filter_out_no_groups():
+    s = Series([1, 3, 20, 5, 22, 24, 7])
+    grouper = s.apply(lambda x: x % 2)
+    grouped = s.groupby(grouper)
+    filtered = grouped.filter(lambda x: x.mean() > 0)
+    tm.assert_series_equal(filtered, s)
+
+
+def test_filter_out_no_groups_dataframe():
+    df = DataFrame({"A": [1, 12, 12, 1], "B": "a b c d".split()})
+    grouper = df["A"].apply(lambda x: x % 2)
+    grouped = df.groupby(grouper)
+    filtered = grouped.filter(lambda x: x["A"].mean() > 0)
+    tm.assert_frame_equal(filtered, df)
+
+
+def test_filter_out_all_groups_in_df():
+    # GH12768
+    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
+    res = df.groupby("a")
+    res = res.filter(lambda x: x["b"].sum() > 5, dropna=False)
+    expected = DataFrame({"a": [np.nan] * 3, "b": [np.nan] * 3})
+    tm.assert_frame_equal(expected, res)
+
+
+def test_filter_out_all_groups_in_df_dropna_true():
+    # GH12768
+    df = DataFrame({"a": [1, 1, 2], "b": [1, 2, 0]})
+    res = df.groupby("a")
+    res = res.filter(lambda x: x["b"].sum() > 5, dropna=True)
+    expected = DataFrame({"a": [], "b": []}, dtype="int64")
+    tm.assert_frame_equal(expected, res)
+
+
+def test_filter_condition_raises():
+    def raise_if_sum_is_zero(x):
+        if x.sum() == 0:
+            raise ValueError
+        return x.sum() > 0
+
+    s = Series([-1, 0, 1, 2])
+    grouper = s.apply(lambda x: x % 2)
+    grouped = s.groupby(grouper)
+    msg = "the filter must return a boolean result"
+    with pytest.raises(TypeError, match=msg):
+        grouped.filter(raise_if_sum_is_zero)
+
+
+def test_filter_bad_shapes():
+    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
+    s = df["B"]
+    g_df = df.groupby("B")
+    g_s = s.groupby(s)
+
+    f = lambda x: x
+    msg = "filter function returned a DataFrame, but expected a scalar bool"
+    with pytest.raises(TypeError, match=msg):
+        g_df.filter(f)
+    msg = "the filter must return a boolean result"
+    with pytest.raises(TypeError, match=msg):
+        g_s.filter(f)
+
+    f = lambda x: x == 1
+    msg = "filter function returned a DataFrame, but expected a scalar bool"
+    with pytest.raises(TypeError, match=msg):
+        g_df.filter(f)
+    msg = "the filter must return a boolean result"
+    with pytest.raises(TypeError, match=msg):
+        g_s.filter(f)
+
+    f = lambda x: np.outer(x, x)
+    msg = "can't multiply sequence by non-int of type 'str'"
+    with pytest.raises(TypeError, match=msg):
+        g_df.filter(f)
+    msg = "the filter must return a boolean result"
+    with pytest.raises(TypeError, match=msg):
+        g_s.filter(f)
+
+
+def test_filter_nan_is_false():
+    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
+    s = df["B"]
+    g_df = df.groupby(df["B"])
+    g_s = s.groupby(s)
+
+    f = lambda x: np.nan
+    tm.assert_frame_equal(g_df.filter(f), df.loc[[]])
+    tm.assert_series_equal(g_s.filter(f), s[[]])
+
+
+def test_filter_pdna_is_false():
+    # in particular, dont raise in filter trying to call bool(pd.NA)
+    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
+    ser = df["B"]
+    g_df = df.groupby(df["B"])
+    g_s = ser.groupby(ser)
+
+    func = lambda x: pd.NA
+    res = g_df.filter(func)
+    tm.assert_frame_equal(res, df.loc[[]])
+    res = g_s.filter(func)
+    tm.assert_series_equal(res, ser[[]])
+
+
+def test_filter_against_workaround_ints():
+    # Series of ints
+    s = Series(np.random.default_rng(2).integers(0, 100, 10))
+    grouper = s.apply(lambda x: np.round(x, -1))
+    grouped = s.groupby(grouper)
+    f = lambda x: x.mean() > 10
+
+    old_way = s[grouped.transform(f).astype("bool")]
+    new_way = grouped.filter(f)
+    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
+
+
+def test_filter_against_workaround_floats():
+    # Series of floats
+    s = 100 * Series(np.random.default_rng(2).random(10))
+    grouper = s.apply(lambda x: np.round(x, -1))
+    grouped = s.groupby(grouper)
+    f = lambda x: x.mean() > 10
+    old_way = s[grouped.transform(f).astype("bool")]
+    new_way = grouped.filter(f)
+    tm.assert_series_equal(new_way.sort_values(), old_way.sort_values())
+
+
+def test_filter_against_workaround_dataframe():
+    # Set up DataFrame of ints, floats, strings.
+    letters = np.array(list(ascii_lowercase))
+    N = 10
+    random_letters = letters.take(
+        np.random.default_rng(2).integers(0, 26, N, dtype=int)
+    )
+    df = DataFrame(
+        {
+            "ints": Series(np.random.default_rng(2).integers(0, 10, N)),
+            "floats": N / 10 * Series(np.random.default_rng(2).random(N)),
+            "letters": Series(random_letters),
+        }
+    )
+
+    # Group by ints; filter on floats.
+    grouped = df.groupby("ints")
+    old_way = df[grouped.floats.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["floats"].mean() > N / 2)
+    tm.assert_frame_equal(new_way, old_way)
+
+    # Group by floats (rounded); filter on strings.
+    grouper = df.floats.apply(lambda x: np.round(x, -1))
+    grouped = df.groupby(grouper)
+    old_way = df[grouped.letters.transform(lambda x: len(x) < N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: len(x.letters) < N / 2)
+    tm.assert_frame_equal(new_way, old_way)
+
+    # Group by strings; filter on ints.
+    grouped = df.groupby("letters")
+    old_way = df[grouped.ints.transform(lambda x: x.mean() > N / 2).astype("bool")]
+    new_way = grouped.filter(lambda x: x["ints"].mean() > N / 2)
+    tm.assert_frame_equal(new_way, old_way)
+
+
+def test_filter_using_len():
+    # GH 4447
+    df = DataFrame({"A": np.arange(8), "B": list("aabbbbcc"), "C": np.arange(8)})
+    grouped = df.groupby("B")
+    actual = grouped.filter(lambda x: len(x) > 2)
+    expected = DataFrame(
+        {"A": np.arange(2, 6), "B": list("bbbb"), "C": np.arange(2, 6)},
+        index=range(2, 6),
+    )
+    tm.assert_frame_equal(actual, expected)
+
+    actual = grouped.filter(lambda x: len(x) > 4)
+    expected = df.loc[[]]
+    tm.assert_frame_equal(actual, expected)
+
+
+def test_filter_using_len_series():
+    # GH 4447
+    s = Series(list("aabbbbcc"), name="B")
+    grouped = s.groupby(s)
+    actual = grouped.filter(lambda x: len(x) > 2)
+    expected = Series(4 * ["b"], index=range(2, 6), name="B")
+    tm.assert_series_equal(actual, expected)
+
+    actual = grouped.filter(lambda x: len(x) > 4)
+    expected = s[[]]
+    tm.assert_series_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "index", [range(8), range(7, -1, -1), [0, 2, 1, 3, 4, 6, 5, 7]]
+)
+def test_filter_maintains_ordering(index):
+    # GH 4621
+    df = DataFrame(
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
+    )
+    s = df["pid"]
+    grouped = df.groupby("tag")
+    actual = grouped.filter(lambda x: len(x) > 1)
+    expected = df.iloc[[1, 2, 4, 7]]
+    tm.assert_frame_equal(actual, expected)
+
+    grouped = s.groupby(df["tag"])
+    actual = grouped.filter(lambda x: len(x) > 1)
+    expected = s.iloc[[1, 2, 4, 7]]
+    tm.assert_series_equal(actual, expected)
+
+
+def test_filter_multiple_timestamp():
+    # GH 10114
+    df = DataFrame(
+        {
+            "A": np.arange(5, dtype="int64"),
+            "B": ["foo", "bar", "foo", "bar", "bar"],
+            "C": Timestamp("20130101"),
+        }
+    )
+
+    grouped = df.groupby(["B", "C"])
+
+    result = grouped["A"].filter(lambda x: True)
+    tm.assert_series_equal(df["A"], result)
+
+    result = grouped["A"].transform(len)
+    expected = Series([2, 3, 2, 3, 3], name="A")
+    tm.assert_series_equal(result, expected)
+
+    result = grouped.filter(lambda x: True)
+    tm.assert_frame_equal(df, result)
+
+    result = grouped.transform("sum")
+    expected = DataFrame({"A": [2, 8, 2, 8, 8]})
+    tm.assert_frame_equal(result, expected)
+
+    result = grouped.transform(len)
+    expected = DataFrame({"A": [2, 3, 2, 3, 3]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_filter_and_transform_with_non_unique_int_index():
+    # GH4620
+    index = [1, 1, 1, 2, 1, 1, 0, 1]
+    df = DataFrame(
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
+    )
+    grouped_df = df.groupby("tag")
+    ser = df["pid"]
+    grouped_ser = ser.groupby(df["tag"])
+    expected_indexes = [1, 2, 4, 7]
+
+    # Filter DataFrame
+    actual = grouped_df.filter(lambda x: len(x) > 1)
+    expected = df.iloc[expected_indexes]
+    tm.assert_frame_equal(actual, expected)
+
+    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+    # Cast to avoid upcast when setting nan below
+    expected = df.copy().astype("float64")
+    expected.iloc[[0, 3, 5, 6]] = np.nan
+    tm.assert_frame_equal(actual, expected)
+
+    # Filter Series
+    actual = grouped_ser.filter(lambda x: len(x) > 1)
+    expected = ser.take(expected_indexes)
+    tm.assert_series_equal(actual, expected)
+
+    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
+    # ^ made manually because this can get confusing!
+    tm.assert_series_equal(actual, expected)
+
+    # Transform Series
+    actual = grouped_ser.transform(len)
+    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
+    tm.assert_series_equal(actual, expected)
+
+    # Transform (a column from) DataFrameGroupBy
+    actual = grouped_df.pid.transform(len)
+    tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_multiple_non_unique_int_index():
+    # GH4620
+    index = [1, 1, 1, 2, 0, 0, 0, 1]
+    df = DataFrame(
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
+    )
+    grouped_df = df.groupby("tag")
+    ser = df["pid"]
+    grouped_ser = ser.groupby(df["tag"])
+    expected_indexes = [1, 2, 4, 7]
+
+    # Filter DataFrame
+    actual = grouped_df.filter(lambda x: len(x) > 1)
+    expected = df.iloc[expected_indexes]
+    tm.assert_frame_equal(actual, expected)
+
+    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+    # Cast to avoid upcast when setting nan below
+    expected = df.copy().astype("float64")
+    expected.iloc[[0, 3, 5, 6]] = np.nan
+    tm.assert_frame_equal(actual, expected)
+
+    # Filter Series
+    actual = grouped_ser.filter(lambda x: len(x) > 1)
+    expected = ser.take(expected_indexes)
+    tm.assert_series_equal(actual, expected)
+
+    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
+    # ^ made manually because this can get confusing!
+    tm.assert_series_equal(actual, expected)
+
+    # Transform Series
+    actual = grouped_ser.transform(len)
+    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
+    tm.assert_series_equal(actual, expected)
+
+    # Transform (a column from) DataFrameGroupBy
+    actual = grouped_df.pid.transform(len)
+    tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_float_index():
+    # GH4620
+    index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float)
+    df = DataFrame(
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
+    )
+    grouped_df = df.groupby("tag")
+    ser = df["pid"]
+    grouped_ser = ser.groupby(df["tag"])
+    expected_indexes = [1, 2, 4, 7]
+
+    # Filter DataFrame
+    actual = grouped_df.filter(lambda x: len(x) > 1)
+    expected = df.iloc[expected_indexes]
+    tm.assert_frame_equal(actual, expected)
+
+    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+    # Cast to avoid upcast when setting nan below
+    expected = df.copy().astype("float64")
+    expected.iloc[[0, 3, 5, 6]] = np.nan
+    tm.assert_frame_equal(actual, expected)
+
+    # Filter Series
+    actual = grouped_ser.filter(lambda x: len(x) > 1)
+    expected = ser.take(expected_indexes)
+    tm.assert_series_equal(actual, expected)
+
+    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
+    # ^ made manually because this can get confusing!
+    tm.assert_series_equal(actual, expected)
+
+    # Transform Series
+    actual = grouped_ser.transform(len)
+    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
+    tm.assert_series_equal(actual, expected)
+
+    # Transform (a column from) DataFrameGroupBy
+    actual = grouped_df.pid.transform(len)
+    tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_timestamp_index():
+    # GH4620
+    t0 = Timestamp("2013-09-30 00:05:00")
+    t1 = Timestamp("2013-10-30 00:05:00")
+    t2 = Timestamp("2013-11-30 00:05:00")
+    index = [t1, t1, t1, t2, t1, t1, t0, t1]
+    df = DataFrame(
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
+    )
+    grouped_df = df.groupby("tag")
+    ser = df["pid"]
+    grouped_ser = ser.groupby(df["tag"])
+    expected_indexes = [1, 2, 4, 7]
+
+    # Filter DataFrame
+    actual = grouped_df.filter(lambda x: len(x) > 1)
+    expected = df.iloc[expected_indexes]
+    tm.assert_frame_equal(actual, expected)
+
+    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+    # Cast to avoid upcast when setting nan below
+    expected = df.copy().astype("float64")
+    expected.iloc[[0, 3, 5, 6]] = np.nan
+    tm.assert_frame_equal(actual, expected)
+
+    # Filter Series
+    actual = grouped_ser.filter(lambda x: len(x) > 1)
+    expected = ser.take(expected_indexes)
+    tm.assert_series_equal(actual, expected)
+
+    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
+    # ^ made manually because this can get confusing!
+    tm.assert_series_equal(actual, expected)
+
+    # Transform Series
+    actual = grouped_ser.transform(len)
+    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
+    tm.assert_series_equal(actual, expected)
+
+    # Transform (a column from) DataFrameGroupBy
+    actual = grouped_df.pid.transform(len)
+    tm.assert_series_equal(actual, expected)
+
+
+def test_filter_and_transform_with_non_unique_string_index():
+    # GH4620
+    index = list("bbbcbbab")
+    df = DataFrame(
+        {"pid": [1, 1, 1, 2, 2, 3, 3, 3], "tag": [23, 45, 62, 24, 45, 34, 25, 62]},
+        index=index,
+    )
+    grouped_df = df.groupby("tag")
+    ser = df["pid"]
+    grouped_ser = ser.groupby(df["tag"])
+    expected_indexes = [1, 2, 4, 7]
+
+    # Filter DataFrame
+    actual = grouped_df.filter(lambda x: len(x) > 1)
+    expected = df.iloc[expected_indexes]
+    tm.assert_frame_equal(actual, expected)
+
+    actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False)
+    # Cast to avoid upcast when setting nan below
+    expected = df.copy().astype("float64")
+    expected.iloc[[0, 3, 5, 6]] = np.nan
+    tm.assert_frame_equal(actual, expected)
+
+    # Filter Series
+    actual = grouped_ser.filter(lambda x: len(x) > 1)
+    expected = ser.take(expected_indexes)
+    tm.assert_series_equal(actual, expected)
+
+    actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False)
+    expected = Series([np.nan, 1, 1, np.nan, 2, np.nan, np.nan, 3], index, name="pid")
+    # ^ made manually because this can get confusing!
+    tm.assert_series_equal(actual, expected)
+
+    # Transform Series
+    actual = grouped_ser.transform(len)
+    expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name="pid")
+    tm.assert_series_equal(actual, expected)
+
+    # Transform (a column from) DataFrameGroupBy
+    actual = grouped_df.pid.transform(len)
+    tm.assert_series_equal(actual, expected)
+
+
+def test_filter_has_access_to_grouped_cols():
+    df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=["A", "B"])
+    g = df.groupby("A")
+    # previously didn't have access to col A #????
+    filt = g.filter(lambda x: x["A"].sum() == 2)
+    tm.assert_frame_equal(filt, df.iloc[[0, 1]])
+
+
+def test_filter_enforces_scalarness():
+    df = DataFrame(
+        [
+            ["best", "a", "x"],
+            ["worst", "b", "y"],
+            ["best", "c", "x"],
+            ["best", "d", "y"],
+            ["worst", "d", "y"],
+            ["worst", "d", "y"],
+            ["best", "d", "z"],
+        ],
+        columns=["a", "b", "c"],
+    )
+    with pytest.raises(TypeError, match="filter function returned a.*"):
+        df.groupby("c").filter(lambda g: g["a"] == "best")
+
+
+def test_filter_non_bool_raises():
+    df = DataFrame(
+        [
+            ["best", "a", 1],
+            ["worst", "b", 1],
+            ["best", "c", 1],
+            ["best", "d", 1],
+            ["worst", "d", 1],
+            ["worst", "d", 1],
+            ["best", "d", 1],
+        ],
+        columns=["a", "b", "c"],
+    )
+    with pytest.raises(TypeError, match="filter function returned a.*"):
+        df.groupby("a").filter(lambda g: g.c.mean())
+
+
+def test_filter_dropna_with_empty_groups():
+    # GH 10780
+    data = Series(np.random.default_rng(2).random(9), index=np.repeat([1, 2, 3], 3))
+    grouped = data.groupby(level=0)
+    result_false = grouped.filter(lambda x: x.mean() > 1, dropna=False)
+    expected_false = Series([np.nan] * 9, index=np.repeat([1, 2, 3], 3))
+    tm.assert_series_equal(result_false, expected_false)
+
+    result_true = grouped.filter(lambda x: x.mean() > 1, dropna=True)
+    expected_true = Series(index=pd.Index([], dtype=int), dtype=np.float64)
+    tm.assert_series_equal(result_true, expected_true)
+
+
+def test_filter_consistent_result_before_after_agg_func():
+    # GH 17091
+    df = DataFrame({"data": range(6), "key": list("ABCABC")})
+    grouper = df.groupby("key")
+    result = grouper.filter(lambda x: True)
+    expected = DataFrame({"data": range(6), "key": list("ABCABC")})
+    tm.assert_frame_equal(result, expected)
+
+    grouper.sum()
+    result = grouper.filter(lambda x: True)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_filter_with_non_values():
+    # GH 62501
+    df = DataFrame(
+        [
+            [1],
+            [None],
+        ],
+        columns=["a"],
+    )
+
+    result = df.groupby("a", dropna=False).filter(lambda x: True)
+    tm.assert_frame_equal(result, df)
+
+
+def test_filter_with_non_values_multi_index():
+    # GH 62501
+    df = DataFrame(
+        [
+            [1, 2],
+            [3, None],
+            [None, 4],
+            [None, None],
+        ],
+        columns=["a", "b"],
+    )
+
+    result = df.groupby(["a", "b"], dropna=False).filter(lambda x: True)
+    tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
new file mode 100644
index 0000000000000000000000000000000000000000..54716bfff0fbafae2e354a2066ae6324d05f62bd
--- /dev/null
+++ b/pandas/tests/groupby/test_groupby.py
@@ -0,0 +1,3004 @@
+from datetime import datetime
+import decimal
+from decimal import Decimal
+import re
+
+import numpy as np
+import pytest
+
+from pandas.errors import SpecificationError
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    Grouper,
+    Index,
+    Interval,
+    MultiIndex,
+    RangeIndex,
+    Series,
+    Timedelta,
+    Timestamp,
+    date_range,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.core.arrays import BooleanArray
+import pandas.core.common as com
+
+pytestmark = pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning")
+
+
+def test_repr():
+    # GH18203
+    result = repr(Grouper(key="A", level="B"))
+    expected = "Grouper(key='A', level='B', sort=False, dropna=True)"
+    assert result == expected
+
+
+def test_groupby_nonobject_dtype(multiindex_dataframe_random_data):
+    key = multiindex_dataframe_random_data.index.codes[0]
+    grouped = multiindex_dataframe_random_data.groupby(key)
+    result = grouped.sum()
+
+    expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum()
+    assert result.index.dtype == np.int8
+    assert expected.index.dtype == np.int64
+    tm.assert_frame_equal(result, expected, check_index_type=False)
+
+
+def test_groupby_nonobject_dtype_mixed():
+    # GH 3911, mixed frame non-conversion
+    df = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+            "C": np.random.default_rng(2).standard_normal(8),
+            "D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"),
+        }
+    )
+    df["value"] = range(len(df))
+
+    def max_value(group):
+        return group.loc[group["value"].idxmax()]
+
+    applied = df.groupby("A").apply(max_value)
+    result = applied.dtypes
+    expected = df.drop(columns="A").dtypes
+    tm.assert_series_equal(result, expected)
+
+
+def test_pass_args_kwargs(ts):
+    def f(x, q=None, axis=0):
+        return np.percentile(x, q, axis=axis)
+
+    g = lambda x: np.percentile(x, 80, axis=0)
+
+    # Series
+    ts_grouped = ts.groupby(lambda x: x.month)
+    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
+    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
+    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)
+
+    agg_expected = ts_grouped.quantile(0.8)
+    trans_expected = ts_grouped.transform(g)
+
+    tm.assert_series_equal(apply_result, agg_expected)
+    tm.assert_series_equal(agg_result, agg_expected)
+    tm.assert_series_equal(trans_result, trans_expected)
+
+    agg_result = ts_grouped.agg(f, q=80)
+    apply_result = ts_grouped.apply(f, q=80)
+    trans_result = ts_grouped.transform(f, q=80)
+    tm.assert_series_equal(agg_result, agg_expected)
+    tm.assert_series_equal(apply_result, agg_expected)
+    tm.assert_series_equal(trans_result, trans_expected)
+
+
+def test_pass_args_kwargs_dataframe(tsframe, as_index):
+    def f(x, q=None, axis=0):
+        return np.percentile(x, q, axis=axis)
+
+    df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
+    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
+    apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
+    expected = df_grouped.quantile(0.8)
+    tm.assert_frame_equal(apply_result, expected, check_names=False)
+    tm.assert_frame_equal(agg_result, expected)
+
+    apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
+    expected_seq = df_grouped.quantile([0.4, 0.8])
+    if not as_index:
+        # apply treats the op as a transform; .quantile knows it's a reduction
+        apply_result.index = range(4)
+        apply_result.insert(loc=0, column="level_0", value=[1, 1, 2, 2])
+        apply_result.insert(loc=1, column="level_1", value=[0.4, 0.8, 0.4, 0.8])
+    tm.assert_frame_equal(apply_result, expected_seq, check_names=False)
+
+    agg_result = df_grouped.agg(f, q=80)
+    apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
+    tm.assert_frame_equal(agg_result, expected)
+    tm.assert_frame_equal(apply_result, expected, check_names=False)
+
+
+def test_len():
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
+    assert len(grouped) == len(df)
+
+    grouped = df.groupby([lambda x: x.year, lambda x: x.month])
+    expected = len({(x.year, x.month) for x in df.index})
+    assert len(grouped) == expected
+
+
+def test_len_nan_group():
+    # issue 11016
+    df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
+    assert len(df.groupby("a")) == 0
+    assert len(df.groupby("b")) == 3
+    assert len(df.groupby(["a", "b"])) == 0
+
+
+def test_groupby_timedelta_median():
+    # issue 57926
+    expected = Series(data=Timedelta("1D"), index=["foo"], dtype="m8[us]")
+    df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1D")]})
+    gb = df.groupby("label")["timedelta"]
+    actual = gb.median()
+    tm.assert_series_equal(actual, expected, check_names=False)
+
+
+@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
+def test_len_categorical(dropna, observed, keys):
+    # GH#57595
+    df = DataFrame(
+        {
+            "a": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]),
+            "b": Categorical([1, 1, 2, np.nan], categories=[1, 2, 3]),
+            "c": 1,
+        }
+    )
+    gb = df.groupby(keys, observed=observed, dropna=dropna)
+    result = len(gb)
+    if observed and dropna:
+        expected = 2
+    elif observed and not dropna:
+        expected = 3
+    elif len(keys) == 1:
+        expected = 3 if dropna else 4
+    else:
+        expected = 9 if dropna else 16
+    assert result == expected, f"{result} vs {expected}"
+
+
+def test_basic_regression():
+    # regression
+    result = Series([1.0 * x for x in list(range(1, 10)) * 10])
+
+    data = np.random.default_rng(2).random(1100) * 10.0
+    groupings = Series(data)
+
+    grouped = result.groupby(groupings)
+    grouped.mean()
+
+
+def test_indices_concatenation_order():
+    # GH 2808
+
+    def f1(x):
+        y = x[(x.b % 2) == 1] ** 2
+        if y.empty:
+            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
+            res = DataFrame(columns=["a"], index=multiindex)
+            return res
+        else:
+            y = y.set_index(["b", "c"])
+            return y
+
+    def f2(x):
+        y = x[(x.b % 2) == 1] ** 2
+        if y.empty:
+            return DataFrame()
+        else:
+            y = y.set_index(["b", "c"])
+            return y
+
+    def f3(x):
+        y = x[(x.b % 2) == 1] ** 2
+        if y.empty:
+            multiindex = MultiIndex(
+                levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
+            )
+            res = DataFrame(columns=["a", "b"], index=multiindex)
+            return res
+        else:
+            return y
+
+    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})
+
+    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})
+
+    # correct result
+    result1 = df.groupby("a").apply(f1)
+    result2 = df2.groupby("a").apply(f1)
+    tm.assert_frame_equal(result1, result2)
+
+    # should fail (not the same number of levels)
+    msg = "Cannot concat indices that do not have the same number of levels"
+    with pytest.raises(AssertionError, match=msg):
+        df.groupby("a").apply(f2)
+    with pytest.raises(AssertionError, match=msg):
+        df2.groupby("a").apply(f2)
+
+    # should fail (incorrect shape)
+    with pytest.raises(AssertionError, match=msg):
+        df.groupby("a").apply(f3)
+    with pytest.raises(AssertionError, match=msg):
+        df2.groupby("a").apply(f3)
+
+
+def test_attr_wrapper(ts):
+    grouped = ts.groupby(lambda x: x.weekday())
+
+    result = grouped.std()
+    expected = grouped.agg(lambda x: np.std(x, ddof=1))
+    tm.assert_series_equal(result, expected)
+
+    # this is pretty cool
+    result = grouped.describe()
+    expected = {name: gp.describe() for name, gp in grouped}
+    expected = DataFrame(expected).T
+    tm.assert_frame_equal(result, expected)
+
+    # get attribute
+    result = grouped.dtype
+    expected = grouped.agg(lambda x: x.dtype)
+    tm.assert_series_equal(result, expected)
+
+    # make sure raises error
+    msg = "'SeriesGroupBy' object has no attribute 'foo'"
+    with pytest.raises(AttributeError, match=msg):
+        grouped.foo
+
+
+def test_frame_groupby(tsframe):
+    grouped = tsframe.groupby(lambda x: x.weekday())
+
+    # aggregate
+    aggregated = grouped.aggregate("mean")
+    assert len(aggregated) == 5
+    assert len(aggregated.columns) == 4
+
+    # by string
+    tscopy = tsframe.copy()
+    tscopy["weekday"] = [x.weekday() for x in tscopy.index]
+    stragged = tscopy.groupby("weekday").aggregate("mean")
+    tm.assert_frame_equal(stragged, aggregated, check_names=False)
+
+    # transform
+    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
+    transformed = grouped.transform(lambda x: x - x.mean())
+    assert len(transformed) == 30
+    assert len(transformed.columns) == 4
+
+    # transform propagate
+    transformed = grouped.transform(lambda x: x.mean())
+    for name, group in grouped:
+        mean = group.mean()
+        for idx in group.index:
+            tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)
+
+    # iterate
+    for weekday, group in grouped:
+        assert group.index[0].weekday() == weekday
+
+    # groups / group_indices
+    groups = grouped.groups
+    indices = grouped.indices
+
+    for k, v in groups.items():
+        samething = tsframe.index.take(indices[k])
+        assert (samething == v).all()
+
+
+def test_frame_set_name_single(df):
+    grouped = df.groupby("A")
+
+    result = grouped.mean(numeric_only=True)
+    assert result.index.name == "A"
+
+    result = df.groupby("A", as_index=False).mean(numeric_only=True)
+    assert result.index.name != "A"
+
+    result = grouped[["C", "D"]].agg("mean")
+    assert result.index.name == "A"
+
+    result = grouped.agg({"C": "mean", "D": "std"})
+    assert result.index.name == "A"
+
+    result = grouped["C"].mean()
+    assert result.index.name == "A"
+    result = grouped["C"].agg("mean")
+    assert result.index.name == "A"
+    result = grouped["C"].agg(["mean", "std"])
+    assert result.index.name == "A"
+
+    msg = r"nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        grouped["C"].agg({"foo": "mean", "bar": "std"})
+
+
+def test_multi_func(df):
+    col1 = df["A"]
+    col2 = df["B"]
+
+    grouped = df.groupby([col1.get, col2.get])
+    agged = grouped.mean(numeric_only=True)
+    expected = df.groupby(["A", "B"]).mean()
+
+    # TODO groupby get drops names
+    tm.assert_frame_equal(
+        agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
+    )
+
+    # some "groups" with no data
+    df = DataFrame(
+        {
+            "v1": np.random.default_rng(2).standard_normal(6),
+            "v2": np.random.default_rng(2).standard_normal(6),
+            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
+            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
+        },
+        index=["one", "two", "three", "four", "five", "six"],
+    )
+    # only verify that it works for now
+    grouped = df.groupby(["k1", "k2"])
+    grouped.agg("sum")
+
+
+def test_multi_key_multiple_functions(df):
+    grouped = df.groupby(["A", "B"])["C"]
+
+    agged = grouped.agg(["mean", "std"])
+    expected = DataFrame({"mean": grouped.agg("mean"), "std": grouped.agg("std")})
+    tm.assert_frame_equal(agged, expected)
+
+
+def test_frame_multi_key_function_list():
+    data = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+
+    grouped = data.groupby(["A", "B"])
+    funcs = ["mean", "std"]
+    agged = grouped.agg(funcs)
+    expected = pd.concat(
+        [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
+        keys=["D", "E", "F"],
+        axis=1,
+    )
+    assert isinstance(agged.index, MultiIndex)
+    assert isinstance(expected.index, MultiIndex)
+    tm.assert_frame_equal(agged, expected)
+
+
+def test_frame_multi_key_function_list_partial_failure(using_infer_string):
+    data = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+
+    grouped = data.groupby(["A", "B"])
+    funcs = ["mean", "std"]
+    msg = re.escape("agg function failed [how->mean,dtype->")
+    if using_infer_string:
+        msg = "dtype 'str' does not support operation 'mean'"
+    with pytest.raises(TypeError, match=msg):
+        grouped.agg(funcs)
+
+
+@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
+def test_groupby_multiple_columns(df, op):
+    data = df
+    grouped = data.groupby(["A", "B"])
+
+    result1 = op(grouped)
+
+    keys = []
+    values = []
+    for n1, gp1 in data.groupby("A"):
+        for n2, gp2 in gp1.groupby("B"):
+            keys.append((n1, n2))
+            values.append(op(gp2.loc[:, ["C", "D"]]))
+
+    mi = MultiIndex.from_tuples(keys, names=["A", "B"])
+    expected = pd.concat(values, axis=1).T
+    expected.index = mi
+
+    # a little bit crude
+    for col in ["C", "D"]:
+        result_col = op(grouped[col])
+        pivoted = result1[col]
+        exp = expected[col]
+        tm.assert_series_equal(result_col, exp)
+        tm.assert_series_equal(pivoted, exp)
+
+    # test single series works the same
+    result = data["C"].groupby([data["A"], data["B"]]).mean()
+    expected = data.groupby(["A", "B"]).mean()["C"]
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_as_index_select_column():
+    # GH 5764
+    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
+    result = df.groupby("A", as_index=False)["B"].get_group(1)
+    expected = Series([2, 4], name="B")
+    tm.assert_series_equal(result, expected)
+
+    result = df.groupby("A", as_index=False, group_keys=True)["B"].apply(
+        lambda x: x.cumsum()
+    )
+    expected = Series([2, 6, 6], name="B", index=range(3))
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_as_index_select_column_sum_empty_df():
+    # GH 35246
+    df = DataFrame(columns=Index(["A", "B", "C"], name="alpha"))
+    left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
+
+    expected = DataFrame(columns=df.columns[:2], index=range(0))
+    # GH#50744 - Columns after selection shouldn't retain names
+    expected.columns.names = [None]
+    tm.assert_frame_equal(left, expected)
+
+
+def test_ops_not_as_index(reduction_func):
+    # GH 10355, 21090
+    # Using as_index=False should not modify grouped column
+
+    if reduction_func in ("corrwith", "nth", "ngroup"):
+        pytest.skip(f"GH 5755: Test not applicable for {reduction_func}")
+
+    df = DataFrame(
+        np.random.default_rng(2).integers(0, 5, size=(100, 2)), columns=["a", "b"]
+    )
+    expected = getattr(df.groupby("a"), reduction_func)()
+    if reduction_func == "size":
+        expected = expected.rename("size")
+    expected = expected.reset_index()
+
+    if reduction_func != "size":
+        # 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
+        expected["a"] = expected["a"].astype(df["a"].dtype)
+
+    g = df.groupby("a", as_index=False)
+
+    result = getattr(g, reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = g.agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+    result = getattr(g["b"], reduction_func)()
+    tm.assert_frame_equal(result, expected)
+
+    result = g["b"].agg(reduction_func)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_as_index_series_return_frame(df):
+    grouped = df.groupby("A", as_index=False)
+    grouped2 = df.groupby(["A", "B"], as_index=False)
+
+    result = grouped["C"].agg("sum")
+    expected = grouped.agg("sum").loc[:, ["A", "C"]]
+    assert isinstance(result, DataFrame)
+    tm.assert_frame_equal(result, expected)
+
+    result2 = grouped2["C"].agg("sum")
+    expected2 = grouped2.agg("sum").loc[:, ["A", "B", "C"]]
+    assert isinstance(result2, DataFrame)
+    tm.assert_frame_equal(result2, expected2)
+
+    result = grouped["C"].sum()
+    expected = grouped.sum().loc[:, ["A", "C"]]
+    assert isinstance(result, DataFrame)
+    tm.assert_frame_equal(result, expected)
+
+    result2 = grouped2["C"].sum()
+    expected2 = grouped2.sum().loc[:, ["A", "B", "C"]]
+    assert isinstance(result2, DataFrame)
+    tm.assert_frame_equal(result2, expected2)
+
+
+def test_as_index_series_column_slice_raises(df):
+    # GH15072
+    grouped = df.groupby("A", as_index=False)
+    msg = r"Column\(s\) C already selected"
+
+    with pytest.raises(IndexError, match=msg):
+        grouped["C"].__getitem__("D")
+
+
+def test_groupby_as_index_cython(df):
+    data = df
+
+    # single-key
+    grouped = data.groupby("A", as_index=False)
+    result = grouped.mean(numeric_only=True)
+    expected = data.groupby(["A"]).mean(numeric_only=True)
+    expected.insert(0, "A", expected.index)
+    expected.index = RangeIndex(len(expected))
+    tm.assert_frame_equal(result, expected)
+
+    # multi-key
+    grouped = data.groupby(["A", "B"], as_index=False)
+    result = grouped.mean()
+    expected = data.groupby(["A", "B"]).mean()
+
+    arrays = list(zip(*expected.index.values, strict=True))
+    expected.insert(0, "A", arrays[0])
+    expected.insert(1, "B", arrays[1])
+    expected.index = RangeIndex(len(expected))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_as_index_series_scalar(df):
+    grouped = df.groupby(["A", "B"], as_index=False)
+
+    # GH #421
+
+    result = grouped["C"].agg(len)
+    expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_multiple_key():
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
+    agged = grouped.sum()
+    tm.assert_almost_equal(df.values, agged.values)
+
+
+def test_groupby_multi_corner(df):
+    # test that having an all-NA column doesn't mess you up
+    df = df.copy()
+    df["bad"] = np.nan
+    agged = df.groupby(["A", "B"]).mean()
+
+    expected = df.groupby(["A", "B"]).mean()
+    expected["bad"] = np.nan
+
+    tm.assert_frame_equal(agged, expected)
+
+
+def test_raises_on_nuisance(df, using_infer_string):
+    grouped = df.groupby("A")
+    msg = re.escape("agg function failed [how->mean,dtype->")
+    if using_infer_string:
+        msg = "dtype 'str' does not support operation 'mean'"
+    with pytest.raises(TypeError, match=msg):
+        grouped.agg("mean")
+    with pytest.raises(TypeError, match=msg):
+        grouped.mean()
+
+    df = df.loc[:, ["A", "C", "D"]]
+    df["E"] = datetime.now()
+    grouped = df.groupby("A")
+    msg = "datetime64 type does not support operation 'sum'"
+    with pytest.raises(TypeError, match=msg):
+        grouped.agg("sum")
+    with pytest.raises(TypeError, match=msg):
+        grouped.sum()
+
+
+@pytest.mark.parametrize(
+    "agg_function",
+    ["max", "min"],
+)
+def test_keep_nuisance_agg(df, agg_function):
+    # GH 38815
+    grouped = df.groupby("A")
+    result = getattr(grouped, agg_function)()
+    expected = result.copy()
+    expected.loc["bar", "B"] = getattr(df.loc[df["A"] == "bar", "B"], agg_function)()
+    expected.loc["foo", "B"] = getattr(df.loc[df["A"] == "foo", "B"], agg_function)()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "agg_function",
+    ["sum", "mean", "prod", "std", "var", "sem", "median"],
+)
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_omit_nuisance_agg(df, agg_function, numeric_only, using_infer_string):
+    # GH 38774, GH 38815
+    grouped = df.groupby("A")
+
+    no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
+    if agg_function in no_drop_nuisance and not numeric_only:
+        # Added numeric_only as part of GH#46560; these do not drop nuisance
+        # columns when numeric_only is False
+        if using_infer_string:
+            msg = f"dtype 'str' does not support operation '{agg_function}'"
+            klass = TypeError
+        elif agg_function in ("std", "sem"):
+            klass = ValueError
+            msg = "could not convert string to float: 'one'"
+        else:
+            klass = TypeError
+            msg = re.escape(f"agg function failed [how->{agg_function},dtype->")
+        with pytest.raises(klass, match=msg):
+            getattr(grouped, agg_function)(numeric_only=numeric_only)
+    else:
+        result = getattr(grouped, agg_function)(numeric_only=numeric_only)
+        if not numeric_only and agg_function == "sum":
+            # sum is successful on column B
+            columns = ["A", "B", "C", "D"]
+        else:
+            columns = ["A", "C", "D"]
+        expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
+            numeric_only=numeric_only
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+def test_raise_on_nuisance_python_single(df, using_infer_string):
+    # GH 38815
+    grouped = df.groupby("A")
+
+    err = ValueError
+    msg = "could not convert"
+    if using_infer_string:
+        err = TypeError
+        msg = "dtype 'str' does not support operation 'skew'"
+    with pytest.raises(err, match=msg):
+        grouped.skew()
+
+
+def test_raise_on_nuisance_python_multiple(three_group, using_infer_string):
+    grouped = three_group.groupby(["A", "B"])
+    msg = re.escape("agg function failed [how->mean,dtype->")
+    if using_infer_string:
+        msg = "dtype 'str' does not support operation 'mean'"
+    with pytest.raises(TypeError, match=msg):
+        grouped.agg("mean")
+    with pytest.raises(TypeError, match=msg):
+        grouped.mean()
+
+
+def test_empty_groups_corner(multiindex_dataframe_random_data):
+    # handle empty groups
+    df = DataFrame(
+        {
+            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
+            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
+            "k3": ["foo", "bar"] * 3,
+            "v1": np.random.default_rng(2).standard_normal(6),
+            "v2": np.random.default_rng(2).standard_normal(6),
+        }
+    )
+
+    grouped = df.groupby(["k1", "k2"])
+    result = grouped[["v1", "v2"]].agg("mean")
+    expected = grouped.mean(numeric_only=True)
+    tm.assert_frame_equal(result, expected)
+
+    grouped = multiindex_dataframe_random_data[3:5].groupby(level=0)
+    agged = grouped.apply(lambda x: x.mean())
+    agged_A = grouped["A"].apply("mean")
+    tm.assert_series_equal(agged["A"], agged_A)
+    assert agged.index.name == "first"
+
+
+def test_nonsense_func():
+    df = DataFrame([0])
+    msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
+    with pytest.raises(TypeError, match=msg):
+        df.groupby(lambda x: x + "foo")
+
+
+def test_wrap_aggregated_output_multindex(
+    multiindex_dataframe_random_data, using_infer_string
+):
+    df = multiindex_dataframe_random_data.T
+    df["baz", "two"] = "peekaboo"
+
+    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
+    msg = re.escape("agg function failed [how->mean,dtype->")
+    if using_infer_string:
+        msg = "dtype 'str' does not support operation 'mean'"
+    with pytest.raises(TypeError, match=msg):
+        df.groupby(keys).agg("mean")
+    agged = df.drop(columns=("baz", "two")).groupby(keys).agg("mean")
+    assert isinstance(agged.columns, MultiIndex)
+
+    def aggfun(ser):
+        if ser.name == ("foo", "one"):
+            raise TypeError("Test error message")
+        return ser.sum()
+
+    with pytest.raises(TypeError, match="Test error message"):
+        df.groupby(keys).aggregate(aggfun)
+
+
+def test_groupby_level_apply(multiindex_dataframe_random_data):
+    result = multiindex_dataframe_random_data.groupby(level=0).count()
+    assert result.index.name == "first"
+    result = multiindex_dataframe_random_data.groupby(level=1).count()
+    assert result.index.name == "second"
+
+    result = multiindex_dataframe_random_data["A"].groupby(level=0).count()
+    assert result.index.name == "first"
+
+
+def test_groupby_level_mapper(multiindex_dataframe_random_data):
+    deleveled = multiindex_dataframe_random_data.reset_index()
+
+    mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
+    mapper1 = {"one": 0, "two": 0, "three": 1}
+
+    result0 = multiindex_dataframe_random_data.groupby(mapper0, level=0).sum()
+    result1 = multiindex_dataframe_random_data.groupby(mapper1, level=1).sum()
+
+    mapped_level0 = np.array(
+        [mapper0.get(x) for x in deleveled["first"]], dtype=np.int64
+    )
+    mapped_level1 = np.array(
+        [mapper1.get(x) for x in deleveled["second"]], dtype=np.int64
+    )
+    expected0 = multiindex_dataframe_random_data.groupby(mapped_level0).sum()
+    expected1 = multiindex_dataframe_random_data.groupby(mapped_level1).sum()
+    expected0.index.name, expected1.index.name = "first", "second"
+
+    tm.assert_frame_equal(result0, expected0)
+    tm.assert_frame_equal(result1, expected1)
+
+
+def test_groupby_level_nonmulti():
+    # GH 1313, GH 13901
+    s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
+    expected = Series([11, 22, 3, 4, 5, 6], Index(list(range(1, 7)), name="foo"))
+
+    result = s.groupby(level=0).sum()
+    tm.assert_series_equal(result, expected)
+    result = s.groupby(level=[0]).sum()
+    tm.assert_series_equal(result, expected)
+    result = s.groupby(level=-1).sum()
+    tm.assert_series_equal(result, expected)
+    result = s.groupby(level=[-1]).sum()
+    tm.assert_series_equal(result, expected)
+
+    msg = "level > 0 or level < -1 only valid with MultiIndex"
+    with pytest.raises(ValueError, match=msg):
+        s.groupby(level=1)
+    with pytest.raises(ValueError, match=msg):
+        s.groupby(level=-2)
+    msg = "No group keys passed!"
+    with pytest.raises(ValueError, match=msg):
+        s.groupby(level=[])
+    msg = "multiple levels only valid with MultiIndex"
+    with pytest.raises(ValueError, match=msg):
+        s.groupby(level=[0, 0])
+    with pytest.raises(ValueError, match=msg):
+        s.groupby(level=[0, 1])
+    msg = "level > 0 or level < -1 only valid with MultiIndex"
+    with pytest.raises(ValueError, match=msg):
+        s.groupby(level=[1])
+
+
+def test_groupby_complex():
+    # GH 12902
+    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
+    expected = Series((1 + 2j, 5 + 10j), index=Index([0, 1]))
+
+    result = a.groupby(level=0).sum()
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_complex_mean():
+    # GH 26475
+    df = DataFrame(
+        [
+            {"a": 2, "b": 1 + 2j},
+            {"a": 1, "b": 1 + 1j},
+            {"a": 1, "b": 1 + 2j},
+        ]
+    )
+    result = df.groupby("b").mean()
+    expected = DataFrame(
+        [[1.0], [1.5]],
+        index=Index([(1 + 1j), (1 + 2j)], name="b"),
+        columns=Index(["a"]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_complex_numbers():
+    # GH 17927
+    df = DataFrame(
+        [
+            {"a": 1, "b": 1 + 1j},
+            {"a": 1, "b": 1 + 2j},
+            {"a": 4, "b": 1},
+        ]
+    )
+    expected = DataFrame(
+        np.array([1, 1, 1], dtype=np.int64),
+        index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"),
+        columns=Index(["a"]),
+    )
+    result = df.groupby("b", sort=False).count()
+    tm.assert_frame_equal(result, expected)
+
+    # Sorted by the magnitude of the complex numbers
+    expected.index = Index([(1 + 0j), (1 + 1j), (1 + 2j)], name="b")
+    result = df.groupby("b", sort=True).count()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_series_indexed_differently():
+    s1 = Series(
+        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
+        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
+    )
+    s2 = Series(
+        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
+    )
+
+    grouped = s1.groupby(s2)
+    agged = grouped.mean()
+    exp = s1.groupby(s2.reindex(s1.index).get).mean()
+    tm.assert_series_equal(agged, exp)
+
+
+def test_groupby_with_hier_columns():
+    tuples = list(
+        zip(
+            *[
+                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+                ["one", "two", "one", "two", "one", "two", "one", "two"],
+            ],
+            strict=True,
+        )
+    )
+    index = MultiIndex.from_tuples(tuples)
+    columns = MultiIndex.from_tuples(
+        [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
+    )
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((8, 4)), index=index, columns=columns
+    )
+
+    result = df.groupby(level=0).mean()
+    tm.assert_index_equal(result.columns, columns)
+
+    result = df.groupby(level=0).agg("mean")
+    tm.assert_index_equal(result.columns, columns)
+
+    result = df.groupby(level=0).apply(lambda x: x.mean())
+    tm.assert_index_equal(result.columns, columns)
+
+    # add a nuisance column
+    sorted_columns, _ = columns.sortlevel(0)
+    df["A", "foo"] = "bar"
+    result = df.groupby(level=0).mean(numeric_only=True)
+    tm.assert_index_equal(result.columns, df.columns[:-1])
+
+
+def test_grouping_ndarray(df):
+    grouped = df.groupby(df["A"].values)
+    grouped2 = df.groupby(df["A"].rename(None))
+
+    result = grouped.sum()
+    expected = grouped2.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_wrong_multi_labels():
+    index = Index([0, 1, 2, 3, 4], name="index")
+    data = DataFrame(
+        {
+            "foo": ["foo1", "foo1", "foo2", "foo1", "foo3"],
+            "bar": ["bar1", "bar2", "bar2", "bar1", "bar1"],
+            "baz": ["baz1", "baz1", "baz1", "baz2", "baz2"],
+            "spam": ["spam2", "spam3", "spam2", "spam1", "spam1"],
+            "data": [20, 30, 40, 50, 60],
+        },
+        index=index,
+    )
+
+    grouped = data.groupby(["foo", "bar", "baz", "spam"])
+
+    result = grouped.agg("mean")
+    expected = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_series_with_name(df):
+    result = df.groupby(df["A"]).mean(numeric_only=True)
+    result2 = df.groupby(df["A"], as_index=False).mean(numeric_only=True)
+    assert result.index.name == "A"
+    assert "A" in result2
+
+    result = df.groupby([df["A"], df["B"]]).mean()
+    result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
+    assert result.index.names == ("A", "B")
+    assert "A" in result2
+    assert "B" in result2
+
+
+def test_seriesgroupby_name_attr(df):
+    # GH 6265
+    result = df.groupby("A")["C"]
+    assert result.count().name == "C"
+    assert result.mean().name == "C"
+
+    testFunc = lambda x: np.sum(x) * 2
+    assert result.agg(testFunc).name == "C"
+
+
+def test_consistency_name():
+    # GH 12363
+
+    df = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
+            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
+            "D": np.arange(8),
+        }
+    )
+
+    expected = df.groupby(["A"]).B.count()
+    result = df.B.groupby(df.A).count()
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_name_propagation(df):
+    # GH 6124
+    def summarize(df, name=None):
+        return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)
+
+    def summarize_random_name(df):
+        # Provide a different name for each Series.  In this case, groupby
+        # should not attempt to propagate the Series name since they are
+        # inconsistent.
+        return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["C"])
+
+    metrics = df.groupby("A").apply(summarize)
+    assert metrics.columns.name is None
+    metrics = df.groupby("A").apply(summarize, "metrics")
+    assert metrics.columns.name == "metrics"
+    metrics = df.groupby("A").apply(summarize_random_name)
+    assert metrics.columns.name is None
+
+
+def test_groupby_nonstring_columns():
+    df = DataFrame([np.arange(10) for x in range(10)])
+    grouped = df.groupby(0)
+    result = grouped.mean()
+    expected = df.groupby(df[0]).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_mixed_type_columns():
+    # GH 13432, unorderable types in py3
+    df = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
+    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))
+
+    result = df.groupby("A").first()
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("A").sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cython_grouper_series_bug_noncontig():
+    arr = np.empty((100, 100))
+    arr.fill(np.nan)
+    obj = Series(arr[:, 0])
+    inds = np.tile(range(10), 10)
+
+    result = obj.groupby(inds).agg(Series.median)
+    assert result.isna().all()
+
+
+def test_series_grouper_noncontig_index():
+    index = Index(["a" * 10] * 100)
+
+    values = Series(np.random.default_rng(2).standard_normal(50), index=index[::2])
+    labels = np.random.default_rng(2).integers(0, 5, 50)
+
+    # it works!
+    grouped = values.groupby(labels)
+
+    # accessing the index elements causes segfault
+    f = lambda x: len(set(map(id, x.index)))
+    grouped.agg(f)
+
+
+def test_convert_objects_leave_decimal_alone():
+    s = Series(range(5))
+    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")
+
+    def convert_fast(x):
+        return Decimal(str(x.mean()))
+
+    def convert_force_pure(x):
+        # base will be length 0
+        assert len(x.values.base) > 0
+        return Decimal(str(x.mean()))
+
+    grouped = s.groupby(labels)
+
+    result = grouped.agg(convert_fast)
+    assert result.dtype == np.object_
+    assert isinstance(result.iloc[0], Decimal)
+
+    result = grouped.agg(convert_force_pure)
+    assert result.dtype == np.object_
+    assert isinstance(result.iloc[0], Decimal)
+
+
+def test_groupby_dtype_inference_empty():
+    # GH 6733
+    df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
+    assert df["x"].dtype == np.float64
+
+    result = df.groupby("x").first()
+    exp_index = Index([], name="x", dtype=np.float64)
+    expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
+    tm.assert_frame_equal(result, expected, by_blocks=True)
+
+
+def test_groupby_unit64_float_conversion():
+    # GH: 30859 groupby converts unit64 to floats sometimes
+    df = DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})
+    result = df.groupby(["first", "second"])["value"].max()
+    expected = Series(
+        [16148277970000000000],
+        MultiIndex.from_product([[1], [1]], names=["first", "second"]),
+        name="value",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_list_infer_array_like(df):
+    result = df.groupby(list(df["A"])).mean(numeric_only=True)
+    expected = df.groupby(df["A"]).mean(numeric_only=True)
+    tm.assert_frame_equal(result, expected, check_names=False)
+
+    with pytest.raises(KeyError, match=r"^'foo'$"):
+        df.groupby(list(df["A"][:-1]))
+
+    # pathological case of ambiguity
+    df = DataFrame(
+        {
+            "foo": [0, 1],
+            "bar": [3, 4],
+            "val": np.random.default_rng(2).standard_normal(2),
+        }
+    )
+
+    result = df.groupby(["foo", "bar"]).mean()
+    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
+
+
+def test_groupby_keys_same_size_as_index():
+    # GH 11185
+    freq = "s"
+    index = date_range(
+        start=Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
+    )
+    df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
+    result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
+    expected = df.set_index([df.index, "metric"]).astype(float)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_one_row():
+    # GH 11741
+    msg = r"^'Z'$"
+    df1 = DataFrame(
+        np.random.default_rng(2).standard_normal((1, 4)), columns=list("ABCD")
+    )
+    with pytest.raises(KeyError, match=msg):
+        df1.groupby("Z")
+    df2 = DataFrame(
+        np.random.default_rng(2).standard_normal((2, 4)), columns=list("ABCD")
+    )
+    with pytest.raises(KeyError, match=msg):
+        df2.groupby("Z")
+
+
+def test_groupby_nat_exclude():
+    # GH 6992
+    df = DataFrame(
+        {
+            "values": np.random.default_rng(2).standard_normal(8),
+            "dt": [
+                np.nan,
+                Timestamp("2013-01-01"),
+                np.nan,
+                Timestamp("2013-02-01"),
+                np.nan,
+                Timestamp("2013-02-01"),
+                np.nan,
+                Timestamp("2013-01-01"),
+            ],
+            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
+        }
+    )
+    grouped = df.groupby("dt")
+
+    expected = [
+        RangeIndex(start=1, stop=13, step=6),
+        RangeIndex(start=3, stop=7, step=2),
+    ]
+    keys = sorted(grouped.groups.keys())
+    assert len(keys) == 2
+    for k, e in zip(keys, expected, strict=True):
+        # grouped.groups keys are np.datetime64 with system tz
+        # not to be affected by tz, only compare values
+        tm.assert_index_equal(grouped.groups[k], e)
+
+    # confirm obj is not filtered
+    tm.assert_frame_equal(grouped._grouper.groupings[0].obj, df)
+    assert grouped.ngroups == 2
+
+    expected = {
+        Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
+        Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
+    }
+
+    for k in grouped.indices:
+        tm.assert_numpy_array_equal(grouped.indices[k], expected[k])
+
+    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]])
+    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]])
+
+    with pytest.raises(KeyError, match=r"^NaT$"):
+        grouped.get_group(pd.NaT)
+
+    nan_df = DataFrame(
+        {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
+    )
+    assert nan_df["nan"].dtype == "float64"
+    assert nan_df["nat"].dtype == "datetime64[s]"
+
+    for key in ["nan", "nat"]:
+        grouped = nan_df.groupby(key)
+        assert grouped.groups == {}
+        assert grouped.ngroups == 0
+        assert grouped.indices == {}
+        with pytest.raises(KeyError, match=r"^nan$"):
+            grouped.get_group(np.nan)
+        with pytest.raises(KeyError, match=r"^NaT$"):
+            grouped.get_group(pd.NaT)
+
+
+def test_groupby_two_group_keys_all_nan():
+    # GH #36842: Grouping over two group keys shouldn't raise an error
+    df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
+    result = df.groupby(["a", "b"]).indices
+    assert result == {}
+
+
+def test_groupby_2d_malformed():
+    d = DataFrame(index=range(2))
+    d["group"] = ["g1", "g2"]
+    d["zeros"] = [0, 0]
+    d["ones"] = [1, 1]
+    d["label"] = ["l1", "l2"]
+    tmp = d.groupby(["group"]).mean(numeric_only=True)
+    res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
+    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
+    tm.assert_numpy_array_equal(tmp.values, res_values)
+
+
+def test_int32_overflow():
+    B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)))
+    A = np.arange(25000)
+    df = DataFrame(
+        {
+            "A": A,
+            "B": B,
+            "C": A,
+            "D": B,
+            "E": np.random.default_rng(2).standard_normal(25000),
+        }
+    )
+
+    left = df.groupby(["A", "B", "C", "D"]).sum()
+    right = df.groupby(["D", "C", "B", "A"]).sum()
+    assert len(left) == len(right)
+
+
+def test_groupby_sort_multi():
+    df = DataFrame(
+        {
+            "a": ["foo", "bar", "baz"],
+            "b": [3, 2, 1],
+            "c": [0, 1, 2],
+            "d": np.random.default_rng(2).standard_normal(3),
+        }
+    )
+
+    tups = [tuple(row) for row in df[["a", "b", "c"]].values]
+    tups = com.asarray_tuplesafe(tups)
+    result = df.groupby(["a", "b", "c"], sort=True).sum()
+    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])
+
+    tups = [tuple(row) for row in df[["c", "a", "b"]].values]
+    tups = com.asarray_tuplesafe(tups)
+    result = df.groupby(["c", "a", "b"], sort=True).sum()
+    tm.assert_numpy_array_equal(result.index.values, tups)
+
+    tups = [tuple(x) for x in df[["b", "c", "a"]].values]
+    tups = com.asarray_tuplesafe(tups)
+    result = df.groupby(["b", "c", "a"], sort=True).sum()
+    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])
+
+    df = DataFrame(
+        {
+            "a": [0, 1, 2, 0, 1, 2],
+            "b": [0, 0, 0, 1, 1, 1],
+            "d": np.random.default_rng(2).standard_normal(6),
+        }
+    )
+    grouped = df.groupby(["a", "b"])["d"]
+    result = grouped.sum()
+
+    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
+        tups = [tuple(row) for row in df[keys].values]
+        tups = com.asarray_tuplesafe(tups)
+        expected = f(df.groupby(tups)[field])
+        for k, v in expected.items():
+            assert result[k] == v
+
+    _check_groupby(df, result, ["a", "b"], "d")
+
+
+def test_dont_clobber_name_column():
+    df = DataFrame(
+        {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
+    )
+
+    result = df.groupby("key", group_keys=False).apply(lambda x: x)
+    tm.assert_frame_equal(result, df[["name"]])
+
+
+def test_skip_group_keys():
+    tsf = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+
+    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
+    result = grouped.apply(lambda x: x.sort_values(by="A")[:3])
+
+    pieces = [group.sort_values(by="A")[:3] for key, group in grouped]
+
+    expected = pd.concat(pieces)
+    tm.assert_frame_equal(result, expected)
+
+    grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False)
+    result = grouped.apply(lambda x: x.sort_values()[:3])
+
+    pieces = [group.sort_values()[:3] for key, group in grouped]
+
+    expected = pd.concat(pieces)
+    tm.assert_series_equal(result, expected)
+
+
+def test_no_nonsense_name(float_frame):
+    # GH #995
+    s = float_frame["C"].copy()
+    s.name = None
+
+    result = s.groupby(float_frame["A"]).agg("sum")
+    assert result.name is None
+
+
+def test_multifunc_sum_bug():
+    # GH #1065
+    x = DataFrame(np.arange(9).reshape(3, 3))
+    x["test"] = 0
+    x["fl"] = [1.3, 1.5, 1.6]
+
+    grouped = x.groupby("test")
+    result = grouped.agg({"fl": "sum", 2: "size"})
+    assert result["fl"].dtype == np.float64
+
+
+def test_handle_dict_return_value(df):
+    def f(group):
+        return {"max": group.max(), "min": group.min()}
+
+    def g(group):
+        return Series({"max": group.max(), "min": group.min()})
+
+    result = df.groupby("A")["C"].apply(f)
+    expected = df.groupby("A")["C"].apply(g)
+
+    assert isinstance(result, Series)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
+def test_set_group_name(df, grouper):
+    def f(group):
+        assert group.name is not None
+        return group
+
+    def freduce(group):
+        assert group.name is not None
+        return group.sum()
+
+    def freducex(x):
+        return freduce(x)
+
+    grouped = df.groupby(grouper, group_keys=False)
+
+    # make sure all these work
+    grouped.apply(f)
+    grouped.aggregate(freduce)
+    grouped.aggregate({"C": freduce, "D": freduce})
+    grouped.transform(f)
+
+    grouped["C"].apply(f)
+    grouped["C"].aggregate(freduce)
+    grouped["C"].aggregate([freduce, freducex])
+    grouped["C"].transform(f)
+
+
+def test_group_name_available_in_inference_pass():
+    # gh-15062
+    df = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})
+
+    names = []
+
+    def f(group):
+        names.append(group.name)
+        return group.copy()
+
+    df.groupby("a", sort=False, group_keys=False).apply(f)
+    expected_names = [0, 1, 2]
+    assert names == expected_names
+
+
+def test_no_dummy_key_names(df):
+    # see gh-1291
+    result = df.groupby(df["A"].values).sum()
+    assert result.index.name is None
+
+    result2 = df.groupby([df["A"].values, df["B"].values]).sum()
+    assert result2.index.names == (None, None)
+
+
+def test_groupby_sort_multiindex_series():
+    # series multiindex groupby sort argument was not being passed through
+    # _compress_group_index
+    # GH 9444
+    index = MultiIndex(
+        levels=[[1, 2], [1, 2]],
+        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
+        names=["a", "b"],
+    )
+    mseries = Series([0, 1, 2, 3, 4, 5], index=index)
+    index = MultiIndex(
+        levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"]
+    )
+    mseries_result = Series([0, 2, 4], index=index)
+
+    result = mseries.groupby(level=["a", "b"], sort=False).first()
+    tm.assert_series_equal(result, mseries_result)
+    result = mseries.groupby(level=["a", "b"], sort=True).first()
+    tm.assert_series_equal(result, mseries_result.sort_index())
+
+
+def test_groupby_reindex_inside_function():
+    periods = 1000
+    ind = date_range(start="2012/1/1", freq="5min", periods=periods)
+    df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind)
+
+    def agg_before(func, fix=False):
+        """
+        Run an aggregate func on the subset of data.
+        """
+
+        def _func(data):
+            d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna()
+            if fix:
+                data[data.index[0]]
+            if len(d) == 0:
+                return None
+            return func(d)
+
+        return _func
+
+    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
+    closure_bad = grouped.agg({"high": agg_before(np.max)})
+    closure_good = grouped.agg({"high": agg_before(np.max, True)})
+
+    tm.assert_frame_equal(closure_bad, closure_good)
+
+
+def test_groupby_multiindex_missing_pair():
+    # GH9049
+    df = DataFrame(
+        {
+            "group1": ["a", "a", "a", "b"],
+            "group2": ["c", "c", "d", "c"],
+            "value": [1, 1, 1, 5],
+        }
+    )
+    df = df.set_index(["group1", "group2"])
+    df_grouped = df.groupby(level=["group1", "group2"], sort=True)
+
+    res = df_grouped.agg("sum")
+    idx = MultiIndex.from_tuples(
+        [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
+    )
+    exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"])
+
+    tm.assert_frame_equal(res, exp)
+
+
+def test_groupby_multiindex_not_lexsorted(performance_warning):
+    # GH 11640
+
+    # define the lexsorted version
+    lexsorted_mi = MultiIndex.from_tuples(
+        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
+    )
+    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
+    assert lexsorted_df.columns._is_lexsorted()
+
+    # define the non-lexsorted version
+    not_lexsorted_df = DataFrame(
+        columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
+    )
+    not_lexsorted_df = not_lexsorted_df.pivot_table(
+        index="a", columns=["b", "c"], values="d"
+    )
+    not_lexsorted_df = not_lexsorted_df.reset_index()
+    assert not not_lexsorted_df.columns._is_lexsorted()
+
+    expected = lexsorted_df.groupby("a").mean()
+    with tm.assert_produces_warning(performance_warning):
+        result = not_lexsorted_df.groupby("a").mean()
+    tm.assert_frame_equal(expected, result)
+
+    # a transforming function should work regardless of sort
+    # GH 14776
+    df = DataFrame(
+        {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
+    ).set_index(["x", "y"])
+    assert not df.index._is_lexsorted()
+
+    for level in [0, 1, [0, 1]]:
+        for sort in [False, True]:
+            result = df.groupby(level=level, sort=sort, group_keys=False).apply(
+                DataFrame.drop_duplicates
+            )
+            expected = df
+            tm.assert_frame_equal(expected, result)
+
+            result = (
+                df.sort_index()
+                .groupby(level=level, sort=sort, group_keys=False)
+                .apply(DataFrame.drop_duplicates)
+            )
+            expected = df.sort_index()
+            tm.assert_frame_equal(expected, result)
+
+
+def test_index_label_overlaps_location():
+    # checking we don't have any label/location confusion in the
+    # wake of GH5375
+    df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
+    g = df.groupby(list("ababb"))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = df.iloc[[1, 3, 4]]
+    tm.assert_frame_equal(actual, expected)
+
+    ser = df[0]
+    g = ser.groupby(list("ababb"))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = ser.take([1, 3, 4])
+    tm.assert_series_equal(actual, expected)
+
+    #  and again, with a generic Index of floats
+    df.index = df.index.astype(float)
+    g = df.groupby(list("ababb"))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = df.iloc[[1, 3, 4]]
+    tm.assert_frame_equal(actual, expected)
+
+    ser = df[0]
+    g = ser.groupby(list("ababb"))
+    actual = g.filter(lambda x: len(x) > 2)
+    expected = ser.take([1, 3, 4])
+    tm.assert_series_equal(actual, expected)
+
+
+def test_transform_doesnt_clobber_ints():
+    # GH 7972
+    n = 6
+    x = np.arange(n)
+    df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x})
+    df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x})
+
+    gb = df.groupby("a")
+    result = gb.transform("mean")
+
+    gb2 = df2.groupby("a")
+    expected = gb2.transform("mean")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "sort_column",
+    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
+)
+@pytest.mark.parametrize(
+    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
+)
+def test_groupby_preserves_sort(sort_column, group_column):
+    # Test to ensure that groupby always preserves sort order of original
+    # object. Issue #8588 and #9651
+
+    df = DataFrame(
+        {
+            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
+            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
+            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
+            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
+            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
+        }
+    )
+
+    # Try sorting on different types and with different group types
+
+    df = df.sort_values(by=sort_column)
+    g = df.groupby(group_column)
+
+    def test_sort(x):
+        tm.assert_frame_equal(x, x.sort_values(by=sort_column))
+
+    g.apply(test_sort)
+
+
+def test_pivot_table_values_key_error():
+    # This test is designed to replicate the error in issue #14938
+    df = DataFrame(
+        {
+            "eventDate": date_range(datetime.today(), periods=20, freq="ME").tolist(),
+            "thename": range(20),
+        }
+    )
+
+    df["year"] = df.set_index("eventDate").index.year
+    df["month"] = df.set_index("eventDate").index.month
+
+    with pytest.raises(KeyError, match="'badname'"):
+        df.reset_index().pivot_table(
+            index="year", columns="month", values="badname", aggfunc="count"
+        )
+
+
+@pytest.mark.parametrize("columns", ["C", ["C"]])
+@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
+@pytest.mark.parametrize(
+    "values",
+    [
+        [True],
+        [0],
+        [0.0],
+        ["a"],
+        Categorical([0]),
+        [to_datetime(0)],
+        date_range(0, 1, 1, tz="US/Eastern"),
+        pd.period_range("2016-01-01", periods=3, freq="D"),
+        pd.array([0], dtype="Int64"),
+        pd.array([0], dtype="Float64"),
+        pd.array([False], dtype="boolean"),
+    ],
+    ids=[
+        "bool",
+        "int",
+        "float",
+        "str",
+        "cat",
+        "dt64",
+        "dt64tz",
+        "period",
+        "Int64",
+        "Float64",
+        "boolean",
+    ],
+)
+@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
+@pytest.mark.parametrize(
+    "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew", "kurt"]
+)
+def test_empty_groupby(columns, keys, values, method, op, dropna, using_infer_string):
+    # GH8093 & GH26411
+    override_dtype = None
+
+    if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
+        # We expect to get Int64 back for these
+        override_dtype = "Int64"
+
+    if isinstance(values[0], bool) and op in ("prod", "sum"):
+        # sum/product of bools is an integer
+        override_dtype = "int64"
+
+    df = DataFrame({"A": values, "B": values, "C": values}, columns=list("ABC"))
+
+    if hasattr(values, "dtype"):
+        # check that we did the construction right
+        assert (df.dtypes == values.dtype).all()
+
+    df = df.iloc[:0]
+
+    gb = df.groupby(keys, group_keys=False, dropna=dropna, observed=False)[columns]
+
+    def get_result(**kwargs):
+        if method == "attr":
+            return getattr(gb, op)(**kwargs)
+        else:
+            return getattr(gb, method)(op, **kwargs)
+
+    def get_categorical_invalid_expected():
+        # Categorical is special without 'observed=True', we get a NaN entry
+        #  corresponding to the unobserved group. If we passed observed=True
+        #  to groupby, expected would just be 'df.set_index(keys)[columns]'
+        #  as below
+        lev = Categorical([0], dtype=values.dtype)
+        if len(keys) != 1:
+            idx = MultiIndex.from_product([lev, lev], names=keys)
+        else:
+            # all columns are dropped, but we end up with one row
+            # Categorical is special without 'observed=True'
+            idx = Index(lev, name=keys[0])
+
+        if using_infer_string:
+            columns = Index([], dtype="str")
+        else:
+            columns = []
+        expected = DataFrame([], columns=columns, index=idx)
+        return expected
+
+    is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
+    is_dt64 = df.dtypes.iloc[0].kind == "M"
+    is_cat = isinstance(values, Categorical)
+    is_str = isinstance(df.dtypes.iloc[0], pd.StringDtype)
+
+    if (
+        isinstance(values, Categorical)
+        and not values.ordered
+        and op in ["min", "max", "idxmin", "idxmax"]
+    ):
+        if op in ["min", "max"]:
+            msg = f"Cannot perform {op} with non-ordered Categorical"
+            klass = TypeError
+        else:
+            msg = f"Can't get {op} of an empty group due to unobserved categories"
+            klass = ValueError
+        with pytest.raises(klass, match=msg):
+            get_result()
+
+        if op in ["min", "max", "idxmin", "idxmax"] and isinstance(columns, list):
+            # i.e. DataframeGroupBy, not SeriesGroupBy
+            result = get_result(numeric_only=True)
+            expected = get_categorical_invalid_expected()
+            tm.assert_equal(result, expected)
+        return
+
+    if op in ["prod", "sum", "skew", "kurt"]:
+        # ops that require more than just ordered-ness
+        if is_dt64 or is_cat or is_per or (is_str and op != "sum"):
+            # GH#41291
+            # datetime64 -> prod and sum are invalid
+            if is_dt64:
+                msg = "datetime64 type does not support"
+            elif is_per:
+                msg = "Period type does not support"
+            elif is_str:
+                msg = f"dtype 'str' does not support operation '{op}'"
+            else:
+                msg = "category type does not support"
+            if op in ["skew", "kurt"]:
+                msg = "|".join([msg, f"does not support operation '{op}'"])
+            with pytest.raises(TypeError, match=msg):
+                get_result()
+
+            if not isinstance(columns, list):
+                # i.e. SeriesGroupBy
+                return
+            elif op in ["skew", "kurt"]:
+                # TODO: test the numeric_only=True case
+                return
+            else:
+                # i.e. op in ["prod", "sum"]:
+                # i.e. DataFrameGroupBy
+                # ops that require more than just ordered-ness
+                # GH#41291
+                result = get_result(numeric_only=True)
+
+                # with numeric_only=True, these are dropped, and we get
+                # an empty DataFrame back
+                expected = df.set_index(keys)[[]]
+                if is_cat:
+                    expected = get_categorical_invalid_expected()
+                tm.assert_equal(result, expected)
+                return
+
+    result = get_result()
+    expected = df.set_index(keys)[columns]
+    if op in ["idxmax", "idxmin"]:
+        expected = expected.astype(df.index.dtype)
+    if override_dtype is not None:
+        expected = expected.astype(override_dtype)
+    if len(keys) == 1:
+        expected.index.name = keys[0]
+    tm.assert_equal(result, expected)
+
+
+def test_empty_groupby_apply_nonunique_columns():
+    # GH#44417
+    df = DataFrame(np.random.default_rng(2).standard_normal((0, 4)))
+    df[3] = df[3].astype(np.int64)
+    df.columns = [0, 1, 2, 0]
+    gb = df.groupby(df[1], group_keys=False)
+    res = gb.apply(lambda x: x)
+    assert (res.dtypes == df.drop(columns=1).dtypes).all()
+
+
+def test_tuple_as_grouping():
+    # https://github.com/pandas-dev/pandas/issues/18314
+    df = DataFrame(
+        {
+            ("a", "b"): [1, 1, 1, 1],
+            "a": [2, 2, 2, 2],
+            "b": [2, 2, 2, 2],
+            "c": [1, 1, 1, 1],
+        }
+    )
+
+    with pytest.raises(KeyError, match=r"('a', 'b')"):
+        df[["a", "b", "c"]].groupby(("a", "b"))
+
+    result = df.groupby(("a", "b"))["c"].sum()
+    expected = Series([4], name="c", index=Index([1], name=("a", "b")))
+    tm.assert_series_equal(result, expected)
+
+
+def test_tuple_correct_keyerror():
+    # https://github.com/pandas-dev/pandas/issues/18798
+    df = DataFrame(1, index=range(3), columns=MultiIndex.from_product([[1, 2], [3, 4]]))
+    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
+        df.groupby((7, 8)).mean()
+
+
+def test_groupby_agg_ohlc_non_first():
+    # GH 21716
+    df = DataFrame(
+        [[1], [1]],
+        columns=Index(["foo"], name="mycols"),
+        index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
+    )
+
+    expected = DataFrame(
+        [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]],
+        columns=MultiIndex.from_tuples(
+            (
+                ("foo", "sum", "foo"),
+                ("foo", "ohlc", "open"),
+                ("foo", "ohlc", "high"),
+                ("foo", "ohlc", "low"),
+                ("foo", "ohlc", "close"),
+            ),
+            names=["mycols", None, None],
+        ),
+        index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
+    )
+
+    result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_multiindex_nat():
+    # GH 9236
+    values = [
+        (pd.NaT, "a"),
+        (datetime(2012, 1, 2), "a"),
+        (datetime(2012, 1, 2), "b"),
+        (datetime(2012, 1, 3), "a"),
+    ]
+    mi = MultiIndex.from_tuples(values, names=["date", None])
+    ser = Series([3, 2, 2.5, 4], index=mi)
+
+    result = ser.groupby(level=1).mean()
+    expected = Series([3.0, 2.5], index=["a", "b"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_empty_list_raises():
+    # GH 5289
+    values = zip(range(10), range(10), strict=True)
+    df = DataFrame(values, columns=["apple", "b"])
+    msg = "Grouper and axis must be same length"
+    with pytest.raises(ValueError, match=msg):
+        df.groupby([[]])
+
+
+def test_groupby_multiindex_series_keys_len_equal_group_axis():
+    # GH 25704
+    index_array = [["x", "x"], ["a", "b"], ["k", "k"]]
+    index_names = ["first", "second", "third"]
+    ri = MultiIndex.from_arrays(index_array, names=index_names)
+    s = Series(data=[1, 2], index=ri)
+    result = s.groupby(["first", "third"]).sum()
+
+    index_array = [["x"], ["k"]]
+    index_names = ["first", "third"]
+    ei = MultiIndex.from_arrays(index_array, names=index_names)
+    expected = Series([3], index=ei)
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_groups_in_BaseGrouper():
+    # GH 26326
+    # Test if DataFrame grouped with a pandas.Grouper has correct groups
+    mi = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
+    df = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)
+    result = df.groupby([Grouper(level="alpha"), "beta"])
+    expected = df.groupby(["alpha", "beta"])
+    assert result.groups == expected.groups
+
+    result = df.groupby(["beta", Grouper(level="alpha")])
+    expected = df.groupby(["beta", "alpha"])
+    assert result.groups == expected.groups
+
+
+def test_groups_sort_dropna(sort, dropna):
+    # GH#56966, GH#56851
+    df = DataFrame([[2.0, 1.0], [np.nan, 4.0], [0.0, 3.0]])
+    keys = [(2.0, 1.0), (np.nan, 4.0), (0.0, 3.0)]
+    values = [
+        RangeIndex(0, 1),
+        RangeIndex(1, 2),
+        RangeIndex(2, 3),
+    ]
+    if sort:
+        taker = [2, 0] if dropna else [2, 0, 1]
+    else:
+        taker = [0, 2] if dropna else [0, 1, 2]
+    expected = {keys[idx]: values[idx] for idx in taker}
+
+    gb = df.groupby([0, 1], sort=sort, dropna=dropna)
+    result = gb.groups
+
+    for result_key, expected_key in zip(result.keys(), expected.keys(), strict=True):
+        # Compare as NumPy arrays to handle np.nan
+        result_key = np.array(result_key)
+        expected_key = np.array(expected_key)
+        tm.assert_numpy_array_equal(result_key, expected_key)
+    for result_value, expected_value in zip(
+        result.values(), expected.values(), strict=True
+    ):
+        tm.assert_index_equal(result_value, expected_value)
+
+
+@pytest.mark.parametrize(
+    "op, expected",
+    [
+        (
+            "shift",
+            {
+                "time": [
+                    None,
+                    None,
+                    Timestamp("2019-01-01 12:00:00"),
+                    Timestamp("2019-01-01 12:30:00"),
+                    None,
+                    None,
+                ]
+            },
+        ),
+        (
+            "bfill",
+            {
+                "time": [
+                    Timestamp("2019-01-01 12:00:00"),
+                    Timestamp("2019-01-01 12:30:00"),
+                    Timestamp("2019-01-01 14:00:00"),
+                    Timestamp("2019-01-01 14:30:00"),
+                    Timestamp("2019-01-01 14:00:00"),
+                    Timestamp("2019-01-01 14:30:00"),
+                ]
+            },
+        ),
+        (
+            "ffill",
+            {
+                "time": [
+                    Timestamp("2019-01-01 12:00:00"),
+                    Timestamp("2019-01-01 12:30:00"),
+                    Timestamp("2019-01-01 12:00:00"),
+                    Timestamp("2019-01-01 12:30:00"),
+                    Timestamp("2019-01-01 14:00:00"),
+                    Timestamp("2019-01-01 14:30:00"),
+                ]
+            },
+        ),
+    ],
+)
+def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
+    # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill
+    tz = tz_naive_fixture
+    data = {
+        "id": ["A", "B", "A", "B", "A", "B"],
+        "time": [
+            Timestamp("2019-01-01 12:00:00"),
+            Timestamp("2019-01-01 12:30:00"),
+            None,
+            None,
+            Timestamp("2019-01-01 14:00:00"),
+            Timestamp("2019-01-01 14:30:00"),
+        ],
+    }
+    df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))
+
+    grouped = df.groupby("id")
+    result = getattr(grouped, op)()
+    expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_only_none_group():
+    # see GH21624
+    # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
+    df = DataFrame({"g": [None], "x": 1})
+    actual = df.groupby("g")["x"].transform("sum")
+    expected = Series([np.nan], name="x")
+
+    tm.assert_series_equal(actual, expected)
+
+
+def test_groupby_duplicate_index():
+    # GH#29189 the groupby call here used to raise
+    ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])
+    gb = ser.groupby(level=0)
+
+    result = gb.mean()
+    expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
+    tm.assert_series_equal(result, expected)
+
+
+def test_group_on_empty_multiindex(transformation_func, request):
+    # GH 47787
+    # With one row, those are transforms so the schema should be the same
+    df = DataFrame(
+        data=[[1, Timestamp("today"), 3, 4]],
+        columns=["col_1", "col_2", "col_3", "col_4"],
+    )
+    df["col_3"] = df["col_3"].astype(int)
+    df["col_4"] = df["col_4"].astype(int)
+    df = df.set_index(["col_1", "col_2"])
+    result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func)
+    expected = df.groupby(["col_1"]).transform(transformation_func).iloc[:0]
+    if transformation_func in ("diff", "shift"):
+        expected = expected.astype(int)
+    tm.assert_equal(result, expected)
+
+    result = df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func)
+    expected = df["col_3"].groupby(["col_1"]).transform(transformation_func).iloc[:0]
+    if transformation_func in ("diff", "shift"):
+        expected = expected.astype(int)
+    tm.assert_equal(result, expected)
+
+
+def test_groupby_crash_on_nunique():
+    # Fix following 30253
+    dti = date_range("2016-01-01", periods=2, name="foo")
+    df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
+    df.columns.names = ("bar", "baz")
+    df.index = dti
+
+    df = df.T
+    gb = df.groupby(level=0)
+    result = gb.nunique()
+
+    expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
+    expected.columns.name = "bar"
+    expected = expected.T
+
+    tm.assert_frame_equal(result, expected)
+
+    # same thing, but empty columns
+    gb2 = df[[]].groupby(level=0)
+    exp = expected[[]]
+
+    res = gb2.nunique()
+    tm.assert_frame_equal(res, exp)
+
+
+def test_groupby_list_level():
+    # GH 9790
+    expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
+    result = expected.groupby(level=[0]).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "max_seq_items, expected",
+    [
+        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
+        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
+        (1, "{0: [0], ...}"),
+    ],
+)
+def test_groups_repr_truncates(max_seq_items, expected):
+    # GH 1135
+    df = DataFrame(np.random.default_rng(2).standard_normal((5, 1)))
+    df["a"] = df.index
+
+    with pd.option_context("display.max_seq_items", max_seq_items):
+        result = df.groupby("a").groups.__repr__()
+        assert result == expected
+
+        result = df.groupby(np.array(df.a)).groups.__repr__()
+        assert result == expected
+
+
+def test_group_on_two_row_multiindex_returns_one_tuple_key():
+    # GH 18451
+    df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
+    df = df.set_index(["a", "b"])
+
+    grp = df.groupby(["a", "b"])
+    result = grp.indices
+    expected = {(1, 2): np.array([0, 1], dtype=np.int64)}
+
+    assert len(result) == 1
+    key = (1, 2)
+    assert (result[key] == expected[key]).all()
+
+
+@pytest.mark.parametrize(
+    "klass, attr, value",
+    [
+        (DataFrame, "level", "a"),
+        (DataFrame, "as_index", False),
+        (DataFrame, "sort", False),
+        (DataFrame, "group_keys", False),
+        (DataFrame, "observed", True),
+        (DataFrame, "dropna", False),
+        (Series, "level", "a"),
+        (Series, "as_index", False),
+        (Series, "sort", False),
+        (Series, "group_keys", False),
+        (Series, "observed", True),
+        (Series, "dropna", False),
+    ],
+)
+def test_subsetting_columns_keeps_attrs(klass, attr, value):
+    # GH 9959 - When subsetting columns, don't drop attributes
+    df = DataFrame({"a": [1], "b": [2], "c": [3]})
+    if attr != "axis":
+        df = df.set_index("a")
+
+    expected = df.groupby("a", **{attr: value})
+    result = expected[["b"]] if klass is DataFrame else expected["b"]
+    assert getattr(result, attr) == getattr(expected, attr)
+
+
+@pytest.mark.parametrize("func", ["sum", "any", "shift"])
+def test_groupby_column_index_name_lost(func):
+    # GH: 29764 groupby loses index sometimes
+    expected = Index(["a"], name="idx")
+    df = DataFrame([[1]], columns=expected)
+    df_grouped = df.groupby([1])
+    result = getattr(df_grouped, func)().columns
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "infer_string",
+    [
+        False,
+        pytest.param(True, marks=td.skip_if_no("pyarrow")),
+    ],
+)
+def test_groupby_duplicate_columns(infer_string):
+    # GH: 31735
+    if infer_string:
+        pytest.importorskip("pyarrow")
+    df = DataFrame(
+        {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
+    ).astype(object)
+    df.columns = ["A", "B", "B"]
+    with pd.option_context("future.infer_string", infer_string):
+        result = df.groupby([0, 0, 0, 0]).min()
+    expected = DataFrame(
+        [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_series_with_tuple_name():
+    # GH 37755
+    ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=("a", "a"))
+    ser.index.name = ("b", "b")
+    result = ser.groupby(level=0).last()
+    expected = Series([2, 4], index=[1, 2], name=("a", "a"))
+    expected.index.name = ("b", "b")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
+)
+def test_groupby_numerical_stability_sum_mean(func, values):
+    # GH#38778
+    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    result = getattr(df.groupby("group"), func)()
+    expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_numerical_stability_cumsum():
+    # GH#38934
+    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
+    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
+    result = df.groupby("group").cumsum()
+    exp_data = (
+        [1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
+    )
+    expected = DataFrame({"a": exp_data, "b": exp_data})
+    tm.assert_frame_equal(result, expected, check_exact=True)
+
+
+def test_groupby_cumsum_skipna_false():
+    # GH#46216 don't propagate np.nan above the diagonal
+    arr = np.random.default_rng(2).standard_normal((5, 5))
+    df = DataFrame(arr)
+    for i in range(5):
+        df.iloc[i, i] = np.nan
+
+    df["A"] = 1
+    gb = df.groupby("A")
+
+    res = gb.cumsum(skipna=False)
+
+    expected = df[[0, 1, 2, 3, 4]].cumsum(skipna=False)
+    tm.assert_frame_equal(res, expected)
+
+
+def test_groupby_cumsum_timedelta64():
+    # GH#46216 don't ignore is_datetimelike in libgroupby.group_cumsum
+    dti = date_range("2016-01-01", periods=5, unit="ns")
+    ser = Series(dti) - dti[0]
+    ser[2] = pd.NaT
+
+    df = DataFrame({"A": 1, "B": ser})
+    gb = df.groupby("A")
+
+    res = gb.cumsum(numeric_only=False, skipna=True)
+    exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2]})
+    tm.assert_frame_equal(res, exp)
+
+    res = gb.cumsum(numeric_only=False, skipna=False)
+    exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT]})
+    tm.assert_frame_equal(res, exp)
+
+
+def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
+    dups = rand_series_with_duplicate_datetimeindex
+    result = dups.groupby(level=0).mean()
+    expected = dups.groupby(dups.index).mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_all_nan_groups_drop():
+    # GH 15036
+    s = Series([1, 2, 3], [np.nan, np.nan, np.nan])
+    result = s.groupby(s.index).sum()
+    expected = Series([], index=Index([], dtype=np.float64), dtype=np.int64)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_groupby_empty_multi_column(as_index, numeric_only):
+    # GH 15106 & GH 41998
+    df = DataFrame(data=[], columns=["A", "B", "C"])
+    gb = df.groupby(["A", "B"], as_index=as_index)
+    result = gb.sum(numeric_only=numeric_only)
+    if as_index:
+        index = MultiIndex([[], []], [[], []], names=["A", "B"])
+        columns = ["C"] if not numeric_only else Index([], dtype="str")
+    else:
+        index = RangeIndex(0)
+        columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
+    expected = DataFrame([], columns=columns, index=index)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregation_non_numeric_dtype():
+    # GH #43108
+    df = DataFrame(
+        [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"]
+    )
+
+    expected = DataFrame(
+        {
+            "v": [[1, 1], [10, 20]],
+        },
+        index=Index(["M", "W"], name="MW"),
+    )
+
+    gb = df.groupby(by=["MW"])
+    result = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregation_multi_non_numeric_dtype():
+    # GH #42395
+    df = DataFrame(
+        {
+            "x": [1, 0, 1, 1, 0],
+            "y": [Timedelta(i, "days") for i in range(1, 6)],
+            "z": [Timedelta(i * 10, "days") for i in range(1, 6)],
+        }
+    )
+
+    expected = DataFrame(
+        {
+            "y": [Timedelta(i, "days") for i in range(7, 9)],
+            "z": [Timedelta(i * 10, "days") for i in range(7, 9)],
+        },
+        index=Index([0, 1], dtype="int64", name="x"),
+    )
+
+    gb = df.groupby(by=["x"])
+    result = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_aggregation_numeric_with_non_numeric_dtype():
+    # GH #43108
+    df = DataFrame(
+        {
+            "x": [1, 0, 1, 1, 0],
+            "y": [Timedelta(i, "days") for i in range(1, 6)],
+            "z": list(range(1, 6)),
+        }
+    )
+
+    expected = DataFrame(
+        {"y": [Timedelta(7, "days"), Timedelta(8, "days")], "z": [7, 8]},
+        index=Index([0, 1], dtype="int64", name="x"),
+    )
+
+    gb = df.groupby(by=["x"])
+    result = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_filtered_df_std():
+    # GH 16174
+    dicts = [
+        {"filter_col": False, "groupby_col": True, "bool_col": True, "float_col": 10.5},
+        {"filter_col": True, "groupby_col": True, "bool_col": True, "float_col": 20.5},
+        {"filter_col": True, "groupby_col": True, "bool_col": True, "float_col": 30.5},
+    ]
+    df = DataFrame(dicts)
+
+    df_filter = df[df["filter_col"] == True]  # noqa: E712
+    dfgb = df_filter.groupby("groupby_col")
+    result = dfgb.std()
+    expected = DataFrame(
+        [[0.0, 0.0, 7.071068]],
+        columns=["filter_col", "bool_col", "float_col"],
+        index=Index([True], name="groupby_col"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_datetime_categorical_multikey_groupby_indices():
+    # GH 26859
+    df = DataFrame(
+        {
+            "a": Series(list("abc")),
+            "b": Series(
+                to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"]),
+                dtype="category",
+            ),
+            "c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]),
+        }
+    )
+    result = df.groupby(["a", "b"], observed=False).indices
+    expected = {
+        ("a", Timestamp("2018-01-01 00:00:00")): np.array([0]),
+        ("b", Timestamp("2018-02-01 00:00:00")): np.array([1]),
+        ("c", Timestamp("2018-03-01 00:00:00")): np.array([2]),
+    }
+    assert result == expected
+
+
+def test_rolling_wrong_param_min_period():
+    # GH34037
+    name_l = ["Alice"] * 5 + ["Bob"] * 5
+    val_l = [np.nan, np.nan, 1, 2, 3, np.nan, 1, 2, 3, 4]
+    test_df = DataFrame([name_l, val_l]).T
+    test_df.columns = ["name", "val"]
+
+    result_error_msg = (
+        r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'"
+    )
+    with pytest.raises(TypeError, match=result_error_msg):
+        test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
+
+
+def test_by_column_values_with_same_starting_value(any_string_dtype):
+    # GH29635
+    dtype = any_string_dtype
+    df = DataFrame(
+        {
+            "Name": ["Thomas", "Thomas", "Thomas John"],
+            "Credit": [1200, 1300, 900],
+            "Mood": Series(["sad", "happy", "happy"], dtype=dtype),
+        }
+    )
+    aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
+
+    result = df.groupby(["Name"]).agg(aggregate_details)
+    expected = DataFrame(
+        {
+            "Mood": [["happy", "sad"], "happy"],
+            "Credit": [2500, 900],
+            "Name": ["Thomas", "Thomas John"],
+        },
+    ).set_index("Name")
+    if getattr(dtype, "storage", None) == "pyarrow":
+        mood_values = pd.array(["happy", "sad"], dtype=dtype)
+        expected["Mood"] = [mood_values, "happy"]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_none_in_first_mi_level():
+    # GH#47348
+    arr = [[None, 1, 0, 1], [2, 3, 2, 3]]
+    ser = Series(1, index=MultiIndex.from_arrays(arr, names=["a", "b"]))
+    result = ser.groupby(level=[0, 1]).sum()
+    expected = Series(
+        [1, 2], MultiIndex.from_tuples([(0.0, 2), (1.0, 3)], names=["a", "b"])
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_none_column_name(using_infer_string):
+    # GH#47348
+    df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
+    by = [np.nan] if using_infer_string else [None]
+    gb = df.groupby(by=by)
+    result = gb.sum()
+    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=by[0]))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("selection", [None, "a", ["a"]])
+def test_single_element_list_grouping(selection):
+    # GH#42795, GH#53500
+    df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"])
+    grouped = df.groupby(["a"]) if selection is None else df.groupby(["a"])[selection]
+    result = [key for key, _ in grouped]
+
+    expected = [(1,), (2,)]
+    assert result == expected
+
+
+def test_groupby_string_dtype():
+    # GH 40148
+    df = DataFrame({"str_col": ["a", "b", "c", "a"], "num_col": [1, 2, 3, 2]})
+    df["str_col"] = df["str_col"].astype("string")
+    expected = DataFrame(
+        {
+            "str_col": [
+                "a",
+                "b",
+                "c",
+            ],
+            "num_col": [1.5, 2.0, 3.0],
+        }
+    )
+    expected["str_col"] = expected["str_col"].astype("string")
+    grouped = df.groupby("str_col", as_index=False)
+    result = grouped.mean()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "level_arg, multiindex", [([0], False), ((0,), False), ([0], True), ((0,), True)]
+)
+def test_single_element_listlike_level_grouping(level_arg, multiindex):
+    # GH 51583
+    df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
+    if multiindex:
+        df = df.set_index(["a", "b"])
+    result = [key for key, _ in df.groupby(level=level_arg)]
+    expected = [(1,), (2,)] if multiindex else [("x",), ("y",)]
+    assert result == expected
+
+
+@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
+def test_groupby_avoid_casting_to_float(func):
+    # GH#37493
+    val = 922337203685477580
+    df = DataFrame({"a": 1, "b": [val]})
+    result = getattr(df.groupby("a"), func)() - val
+    expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
+    if func in ["cumsum", "cumprod"]:
+        expected = expected.reset_index(drop=True)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)])
+def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val):
+    # GH#37493
+    df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype)
+    result = getattr(df.groupby("a"), func)()
+    expected = DataFrame(
+        {"b": [val]},
+        index=Index([1], name="a", dtype=any_numeric_ea_dtype),
+        dtype=any_numeric_ea_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
+def test_groupby_overflow(val, dtype):
+    # GH#37493
+    df = DataFrame({"a": 1, "b": [val, val]}, dtype=f"{dtype}8")
+    result = df.groupby("a").sum()
+    expected = DataFrame(
+        {"b": [val * 2]},
+        index=Index([1], name="a", dtype=f"{dtype}8"),
+        dtype=f"{dtype}64",
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a").cumsum()
+    expected = DataFrame({"b": [val, val * 2]}, dtype=f"{dtype}64")
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby("a").prod()
+    expected = DataFrame(
+        {"b": [val * val]},
+        index=Index([1], name="a", dtype=f"{dtype}8"),
+        dtype=f"{dtype}64",
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)])
+def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val):
+    # GH#37493
+    df = DataFrame({"a": 1, "b": [1, pd.NA, 2]}, dtype=any_numeric_ea_dtype)
+    result = df.groupby("a").cumsum(skipna=skipna)
+    expected = DataFrame(
+        {"b": [1, pd.NA, val]},
+        dtype=any_numeric_ea_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val_in, index, val_out",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0],
+            ["foo", "foo", "bar", "baz", "blah"],
+            [3.0, 4.0, 5.0, 3.0],
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+            ["foo", "foo", "bar", "baz", "blah", "blah"],
+            [3.0, 4.0, 11.0, 3.0],
+        ),
+    ],
+)
+def test_groupby_index_name_in_index_content(val_in, index, val_out):
+    # GH 48567
+    series = Series(data=val_in, name="values", index=Index(index, name="blah"))
+    result = series.groupby("blah").sum()
+    expected = Series(
+        data=val_out,
+        name="values",
+        index=Index(["bar", "baz", "blah", "foo"], name="blah"),
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = series.to_frame().groupby("blah").sum()
+    expected = expected.to_frame()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("n", [1, 10, 32, 100, 1000])
+def test_sum_of_booleans(n):
+    # GH 50347
+    df = DataFrame({"groupby_col": 1, "bool": [True] * n})
+    df["bool"] = df["bool"].eq(True)
+    result = df.groupby("groupby_col").sum()
+    expected = DataFrame({"bool": [n]}, index=Index([1], name="groupby_col"))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings(
+    "ignore:invalid value encountered in remainder:RuntimeWarning"
+)
+@pytest.mark.parametrize("method", ["head", "tail", "nth", "first", "last"])
+def test_groupby_method_drop_na(method):
+    # GH 21755
+    df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)})
+
+    if method == "nth":
+        result = getattr(df.groupby("A"), method)(n=0)
+    else:
+        result = getattr(df.groupby("A"), method)()
+
+    if method in ["first", "last"]:
+        expected = DataFrame({"B": [0, 2, 4]}).set_index(
+            Series(["a", "b", "c"], name="A")
+        )
+    else:
+        expected = DataFrame(
+            {"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=range(0, 6, 2)
+        )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_reduce_period():
+    # GH#51040
+    pi = pd.period_range("2016-01-01", periods=100, freq="D")
+    grps = list(range(10)) * 10
+    ser = pi.to_series()
+    gb = ser.groupby(grps)
+
+    with pytest.raises(TypeError, match="Period type does not support sum operations"):
+        gb.sum()
+    with pytest.raises(
+        TypeError, match="Period type does not support cumsum operations"
+    ):
+        gb.cumsum()
+    with pytest.raises(TypeError, match="Period type does not support prod operations"):
+        gb.prod()
+    with pytest.raises(
+        TypeError, match="Period type does not support cumprod operations"
+    ):
+        gb.cumprod()
+
+    res = gb.max()
+    expected = ser[-10:]
+    expected.index = Index(range(10), dtype=int)
+    tm.assert_series_equal(res, expected)
+
+    res = gb.min()
+    expected = ser[:10]
+    expected.index = Index(range(10), dtype=int)
+    tm.assert_series_equal(res, expected)
+
+
+def test_obj_with_exclusions_duplicate_columns():
+    # GH#50806
+    df = DataFrame([[0, 1, 2, 3]])
+    df.columns = [0, 1, 2, 0]
+    gb = df.groupby(df[1])
+    result = gb._obj_with_exclusions
+    expected = df.take([0, 2, 3], axis=1)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_groupby_numeric_only_std_no_result(numeric_only):
+    # GH 51080
+    dicts_non_numeric = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
+    df = DataFrame(dicts_non_numeric, dtype=object)
+    dfgb = df.groupby("a", as_index=False, sort=False)
+
+    if numeric_only:
+        result = dfgb.std(numeric_only=True)
+        expected_df = DataFrame(["foo", "car"], columns=["a"])
+        tm.assert_frame_equal(result, expected_df)
+    else:
+        with pytest.raises(
+            ValueError, match="could not convert string to float: 'bar'"
+        ):
+            dfgb.std(numeric_only=numeric_only)
+
+
+def test_grouping_with_categorical_interval_columns():
+    # GH#34164
+    df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]})
+    qq = pd.qcut(df["x"], q=np.linspace(0, 1, 5))
+    result = df.groupby([qq, "w"], observed=False)["x"].agg("mean")
+    categorical_index_level_1 = Categorical(
+        [
+            Interval(-0.401, 0.1, closed="right"),
+            Interval(0.1, 0.2, closed="right"),
+            Interval(0.2, 0.3, closed="right"),
+            Interval(0.3, 0.5, closed="right"),
+        ],
+        ordered=True,
+    )
+    index_level_2 = ["a", "b", "c"]
+    mi = MultiIndex.from_product(
+        [categorical_index_level_1, index_level_2], names=["x", "w"]
+    )
+    expected = Series(
+        np.array(
+            [
+                0.1,
+                np.nan,
+                -0.4,
+                np.nan,
+                0.2,
+                np.nan,
+                0.3,
+                np.nan,
+                np.nan,
+                0.5,
+                np.nan,
+                np.nan,
+            ]
+        ),
+        index=mi,
+        name="x",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("bug_var", [1, "a"])
+def test_groupby_sum_on_nan_should_return_nan(bug_var):
+    # GH 24196
+    df = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})
+    if isinstance(bug_var, str):
+        df = df.astype(object)
+    dfgb = df.groupby(lambda x: x)
+    result = dfgb.sum(min_count=1)
+
+    expected_df = DataFrame(
+        [bug_var, bug_var, bug_var, None], columns=["A"], dtype=df["A"].dtype
+    )
+    tm.assert_frame_equal(result, expected_df)
+
+
+@pytest.mark.parametrize(
+    "method",
+    [
+        "count",
+        "corr",
+        "cummax",
+        "cummin",
+        "cumprod",
+        "describe",
+        "rank",
+        "quantile",
+        "diff",
+        "shift",
+        "all",
+        "any",
+        "idxmin",
+        "idxmax",
+        "ffill",
+        "bfill",
+        "pct_change",
+    ],
+)
+def test_groupby_selection_with_methods(df, method):
+    # some methods which require DatetimeIndex
+    rng = date_range("2014", periods=len(df))
+    df.index = rng
+
+    g = df.groupby(["A"])[["C"]]
+    g_exp = df[["C"]].groupby(df["A"])
+    # TODO check groupby with > 1 col ?
+
+    res = getattr(g, method)()
+    exp = getattr(g_exp, method)()
+
+    # should always be frames!
+    tm.assert_frame_equal(res, exp)
+
+
+def test_groupby_selection_other_methods(df):
+    # some methods which require DatetimeIndex
+    rng = date_range("2014", periods=len(df))
+    df.columns.name = "foo"
+    df.index = rng
+
+    g = df.groupby(["A"])[["C"]]
+    g_exp = df[["C"]].groupby(df["A"])
+
+    # methods which aren't just .foo()
+    tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))
+
+    tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
+    tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())
+
+    tm.assert_frame_equal(
+        g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
+    )
+
+
+def test_groupby_with_Time_Grouper(unit):
+    idx2 = to_datetime(
+        [
+            "2016-08-31 22:08:12.000",
+            "2016-08-31 22:09:12.200",
+            "2016-08-31 22:20:12.400",
+        ]
+    ).as_unit(unit)
+
+    test_data = DataFrame(
+        {"quant": [1.0, 1.0, 3.0], "quant2": [1.0, 1.0, 3.0], "time2": idx2}
+    )
+
+    time2 = date_range("2016-08-31 22:08:00", periods=13, freq="1min", unit=unit)
+    expected_output = DataFrame(
+        {
+            "time2": time2,
+            "quant": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+            "quant2": [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
+        }
+    )
+
+    gb = test_data.groupby(Grouper(key="time2", freq="1min"))
+    result = gb.count().reset_index()
+
+    tm.assert_frame_equal(result, expected_output)
+
+
+def test_groupby_series_with_datetimeindex_month_name():
+    # GH 48509
+    s = Series([0, 1, 0], index=date_range("2022-01-01", periods=3), name="jan")
+    result = s.groupby(s).count()
+    expected = Series([2, 1], name="jan")
+    expected.index.name = "jan"
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize(
+    "kwarg, value, name, warn",
+    [
+        ("by", "a", 1, None),
+        ("by", ["a"], (1,), None),
+        ("level", 0, 1, None),
+        ("level", [0], (1,), None),
+    ],
+)
+def test_get_group_len_1_list_likes(test_series, kwarg, value, name, warn):
+    # GH#25971
+    obj = DataFrame({"b": [3, 4, 5]}, index=Index([1, 1, 2], name="a"))
+    if test_series:
+        obj = obj["b"]
+    gb = obj.groupby(**{kwarg: value})
+    result = gb.get_group(name)
+    if test_series:
+        expected = Series([3, 4], index=Index([1, 1], name="a"), name="b")
+    else:
+        expected = DataFrame({"b": [3, 4]}, index=Index([1, 1], name="a"))
+    tm.assert_equal(result, expected)
+
+
+def test_groupby_ngroup_with_nan():
+    # GH#50100
+    df = DataFrame({"a": Categorical([np.nan]), "b": [1]})
+    result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup()
+    expected = Series([0])
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_ffill_with_duplicated_index():
+    # GH#43412
+    df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2])
+
+    result = df.groupby(level=0).ffill()
+    expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
+    tm.assert_frame_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+def test_decimal_na_sort(test_series):
+    # GH#54847
+    # We catch both TypeError and decimal.InvalidOperation exceptions in safe_sort.
+    # If this next assert raises, we can just catch TypeError
+    assert not isinstance(decimal.InvalidOperation, TypeError)
+    df = DataFrame(
+        {
+            "key": [Decimal(1), Decimal(1), None, None],
+            "value": [Decimal(2), Decimal(3), Decimal(4), Decimal(5)],
+        }
+    )
+    gb = df.groupby("key", dropna=False)
+    if test_series:
+        gb = gb["value"]
+    result = gb._grouper.result_index
+    expected = Index([Decimal(1), None], name="key")
+    tm.assert_index_equal(result, expected)
+
+
+def test_groupby_dropna_with_nunique_unique():
+    # GH#42016
+    df = [[1, 1, 1, "A"], [1, None, 1, "A"], [1, None, 2, "A"], [1, None, 3, "A"]]
+    df_dropna = DataFrame(df, columns=["a", "b", "c", "partner"])
+    result = df_dropna.groupby(["a", "b", "c"], dropna=False).agg(
+        {"partner": ["nunique", "unique"]}
+    )
+
+    index = MultiIndex.from_tuples(
+        [(1, 1.0, 1), (1, np.nan, 1), (1, np.nan, 2), (1, np.nan, 3)],
+        names=["a", "b", "c"],
+    )
+    columns = MultiIndex.from_tuples([("partner", "nunique"), ("partner", "unique")])
+    expected = DataFrame(
+        [(1, ["A"]), (1, ["A"]), (1, ["A"]), (1, ["A"])], index=index, columns=columns
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_agg_namedagg_with_duplicate_columns():
+    # GH#58446
+    df = DataFrame(
+        {
+            "col1": [2, 1, 1, 0, 2, 0],
+            "col2": [4, 5, 36, 7, 4, 5],
+            "col3": [3.1, 8.0, 12, 10, 4, 1.1],
+            "col4": [17, 3, 16, 15, 5, 6],
+            "col5": [-1, 3, -1, 3, -2, -1],
+        }
+    )
+
+    result = df.groupby(by=["col1", "col1", "col2"], as_index=False).agg(
+        new_col=pd.NamedAgg(column="col1", aggfunc="min"),
+        new_col1=pd.NamedAgg(column="col1", aggfunc="max"),
+        new_col2=pd.NamedAgg(column="col2", aggfunc="count"),
+    )
+
+    expected = DataFrame(
+        {
+            "col1": [0, 0, 1, 1, 2],
+            "col2": [5, 7, 5, 36, 4],
+            "new_col": [0, 0, 1, 1, 2],
+            "new_col1": [0, 0, 1, 1, 2],
+            "new_col2": [1, 1, 1, 1, 2],
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_multi_index_codes():
+    # GH#54347
+    df = DataFrame(
+        {"A": [1, 2, 3, 4], "B": [1, float("nan"), 2, float("nan")], "C": [2, 4, 6, 8]}
+    )
+    df_grouped = df.groupby(["A", "B"], dropna=False).sum()
+
+    index = df_grouped.index
+    tm.assert_index_equal(index, MultiIndex.from_frame(index.to_frame()))
+
+
+def test_groupby_datetime_with_nat():
+    # GH##35202
+    df = DataFrame(
+        {
+            "a": [
+                to_datetime("2019-02-12"),
+                to_datetime("2019-02-12"),
+                to_datetime("2019-02-13"),
+                pd.NaT,
+            ],
+            "b": [1, 2, 3, 4],
+        }
+    )
+    grouped = df.groupby("a", dropna=False)
+    result = len(grouped)
+    assert result == 3
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ddeefedc217ff960b957d92a2f7ca169b2f9ba5
--- /dev/null
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -0,0 +1,692 @@
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.missing import na_value_for_dtype
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+
+
+@pytest.mark.parametrize(
+    "dropna, tuples, outputs",
+    [
+        (
+            True,
+            [["A", "B"], ["B", "A"]],
+            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
+        ),
+        (
+            False,
+            [["A", "B"], ["A", np.nan], ["B", "A"]],
+            {
+                "c": [13.0, 12.3, 123.23],
+                "d": [13.0, 233.0, 123.0],
+                "e": [13.0, 12.0, 1.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
+    dropna, tuples, outputs, nulls_fixture
+):
+    # GH 3729 this is to test that NA is in one group
+    df_list = [
+        ["A", "B", 12, 12, 12],
+        ["A", nulls_fixture, 12.3, 233.0, 12],
+        ["B", "A", 123.23, 123, 1],
+        ["A", "B", 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
+    grouped = df.groupby(["a", "b"], dropna=dropna).sum()
+
+    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
+
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna:
+        mi = mi.set_levels(["A", "B", np.nan], level="b")
+    expected = pd.DataFrame(outputs, index=mi)
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, tuples, outputs",
+    [
+        (
+            True,
+            [["A", "B"], ["B", "A"]],
+            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
+        ),
+        (
+            False,
+            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
+            {
+                "c": [12.0, 13.3, 123.23, 1.0],
+                "d": [12.0, 234.0, 123.0, 1.0],
+                "e": [12.0, 13.0, 1.0, 1.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
+    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
+):
+    # GH 3729 this is to test that NA in different groups with different representations
+    df_list = [
+        ["A", "B", 12, 12, 12],
+        ["A", nulls_fixture, 12.3, 233.0, 12],
+        ["B", "A", 123.23, 123, 1],
+        [nulls_fixture2, "B", 1, 1, 1.0],
+        ["A", nulls_fixture2, 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
+    grouped = df.groupby(["a", "b"], dropna=dropna).sum()
+
+    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
+
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna:
+        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
+    expected = pd.DataFrame(outputs, index=mi)
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, idx, outputs",
+    [
+        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
+        (
+            False,
+            ["A", "B", np.nan],
+            {
+                "b": [123.23, 13.0, 12.3],
+                "c": [123.0, 13.0, 233.0],
+                "d": [1.0, 13.0, 12.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
+    # GH 3729
+    df_list = [
+        ["B", 12, 12, 12],
+        [None, 12.3, 233.0, 12],
+        ["A", 123.23, 123, 1],
+        ["B", 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
+    grouped = df.groupby("a", dropna=dropna).sum()
+
+    expected = pd.DataFrame(outputs, index=pd.Index(idx, name="a"))
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, idx, expected",
+    [
+        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
+        (
+            False,
+            ["a", "a", "b", np.nan],
+            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
+        ),
+    ],
+)
+def test_groupby_dropna_series_level(dropna, idx, expected):
+    ser = pd.Series([1, 2, 3, 3], index=idx)
+
+    result = ser.groupby(level=0, dropna=dropna).sum()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, expected",
+    [
+        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
+        (
+            False,
+            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
+        ),
+    ],
+)
+def test_groupby_dropna_series_by(dropna, expected):
+    ser = pd.Series(
+        [390.0, 350.0, 30.0, 20.0],
+        index=["Falcon", "Falcon", "Parrot", "Parrot"],
+        name="Max Speed",
+    )
+
+    result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_grouper_dropna_propagation(dropna):
+    # GH 36604
+    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
+    gb = df.groupby("A", dropna=dropna)
+    assert gb._grouper.dropna == dropna
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.RangeIndex(0, 4),
+        list("abcd"),
+        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
+    ],
+)
+def test_groupby_dataframe_slice_then_transform(dropna, index):
+    # GH35014 & GH35612
+    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}
+
+    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
+    gb = df.groupby("A", dropna=dropna)
+
+    result = gb.transform(len)
+    expected = pd.DataFrame(expected_data, index=index)
+    tm.assert_frame_equal(result, expected)
+
+    result = gb[["B"]].transform(len)
+    expected = pd.DataFrame(expected_data, index=index)
+    tm.assert_frame_equal(result, expected)
+
+    result = gb["B"].transform(len)
+    expected = pd.Series(expected_data["B"], index=index, name="B")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, tuples, outputs",
+    [
+        (
+            True,
+            [["A", "B"], ["B", "A"]],
+            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
+        ),
+        (
+            False,
+            [["A", "B"], ["A", np.nan], ["B", "A"]],
+            {
+                "c": [13.0, 12.3, 123.23],
+                "d": [12.0, 233.0, 123.0],
+                "e": [1.0, 12.0, 1.0],
+            },
+        ),
+    ],
+)
+def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
+    # GH 3729
+    df_list = [
+        ["A", "B", 12, 12, 12],
+        ["A", None, 12.3, 233.0, 12],
+        ["B", "A", 123.23, 123, 1],
+        ["A", "B", 1, 1, 1.0],
+    ]
+    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
+    agg_dict = {"c": "sum", "d": "max", "e": "min"}
+    grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)
+
+    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))
+
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna:
+        mi = mi.set_levels(["A", "B", np.nan], level="b")
+    expected = pd.DataFrame(outputs, index=mi)
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.arm_slow
+@pytest.mark.parametrize(
+    "datetime1, datetime2",
+    [
+        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
+        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
+        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
+    ],
+)
+@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
+def test_groupby_dropna_datetime_like_data(
+    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
+):
+    # 3729
+    df = pd.DataFrame(
+        {
+            "values": [1, 2, 3, 4, 5, 6],
+            "dt": [
+                datetime1,
+                unique_nulls_fixture,
+                datetime2,
+                unique_nulls_fixture2,
+                datetime1,
+                datetime1,
+            ],
+        }
+    )
+
+    if dropna:
+        indexes = [datetime1, datetime2]
+    else:
+        indexes = [datetime1, datetime2, np.nan]
+
+    grouped = df.groupby("dt", dropna=dropna).agg({"values": "sum"})
+    expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))
+
+    tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+    "dropna, data, selected_data, levels",
+    [
+        pytest.param(
+            False,
+            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
+            {"values": [0, 1, 0, 0]},
+            ["a", "b", np.nan],
+            id="dropna_false_has_nan",
+        ),
+        pytest.param(
+            True,
+            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
+            {"values": [0, 1, 0]},
+            None,
+            id="dropna_true_has_nan",
+        ),
+        pytest.param(
+            # no nan in "groups"; dropna=True|False should be same.
+            False,
+            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
+            {"values": [0, 1, 0, 0]},
+            None,
+            id="dropna_false_no_nan",
+        ),
+        pytest.param(
+            # no nan in "groups"; dropna=True|False should be same.
+            True,
+            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
+            {"values": [0, 1, 0, 0]},
+            None,
+            id="dropna_true_no_nan",
+        ),
+    ],
+)
+def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
+    # GH 35889
+
+    df = pd.DataFrame(data)
+    gb = df.groupby("groups", dropna=dropna)
+    result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
+
+    mi_tuples = tuple(zip(data["groups"], selected_data["values"], strict=False))
+    mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
+    # Since right now, by default MI will drop NA from levels when we create MI
+    # via `from_*`, so we need to add NA for level manually afterwards.
+    if not dropna and levels:
+        mi = mi.set_levels(levels, level="groups")
+
+    expected = pd.DataFrame(selected_data, index=mi)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
+@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
+@pytest.mark.parametrize("series", [True, False])
+def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
+    # GH#46783
+    obj = pd.DataFrame(
+        {
+            "a": [1, np.nan],
+            "b": [1, 1],
+            "c": [2, 3],
+        }
+    )
+
+    expected = obj.set_index(keys)
+    if series:
+        expected = expected["c"]
+    elif input_index == ["a", "b"] and keys == ["a"]:
+        # Column b should not be aggregated
+        expected = expected[["c"]]
+
+    if input_index is not None:
+        obj = obj.set_index(input_index)
+    gb = obj.groupby(keys, dropna=False)
+    if series:
+        gb = gb["c"]
+    result = gb.sum()
+
+    tm.assert_equal(result, expected)
+
+
+def test_groupby_nan_included():
+    # GH 35646
+    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
+    df = pd.DataFrame(data)
+    grouped = df.groupby("group", dropna=False)
+    result = grouped.indices
+    dtype = np.intp
+    expected = {
+        "g1": np.array([0, 2], dtype=dtype),
+        "g2": np.array([3], dtype=dtype),
+        np.nan: np.array([1, 4], dtype=dtype),
+    }
+    for result_values, expected_values in zip(
+        result.values(), expected.values(), strict=True
+    ):
+        tm.assert_numpy_array_equal(result_values, expected_values)
+    assert np.isnan(list(result.keys())[2])
+    assert list(result.keys())[0:2] == ["g1", "g2"]
+
+
+def test_groupby_drop_nan_with_multi_index():
+    # GH 39895
+    df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
+    df = df.set_index(["a", "b"])
+    result = df.groupby(["a", "b"], dropna=False).first()
+    expected = df
+    tm.assert_frame_equal(result, expected)
+
+
+# y >x and z is the missing value
+@pytest.mark.parametrize(
+    "sequence",
+    [
+        "xyzy",
+        "xxyz",
+        "yzxz",
+        "zzzz",
+        "zyzx",
+        "yyyy",
+        "zzxy",
+        "xyxy",
+    ],
+)
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        None,
+        "UInt8",
+        "Int8",
+        "UInt16",
+        "Int16",
+        "UInt32",
+        "Int32",
+        "UInt64",
+        "Int64",
+        "Float32",
+        "Float64",
+        "category",
+        "string",
+        pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")),
+        "datetime64[ns]",
+        "period[D]",
+        "Sparse[float]",
+    ],
+)
+@pytest.mark.parametrize("test_series", [True, False])
+def test_no_sort_keep_na(sequence, dtype, test_series, as_index):
+    # GH#46584, GH#48794
+
+    # Unique values to use for grouper, depends on dtype
+    if dtype in ("string", "string[pyarrow]"):
+        uniques = {"x": "x", "y": "y", "z": pd.NA}
+    elif dtype in ("datetime64[ns]", "period[D]"):
+        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
+    elif dtype is not None and dtype.startswith(("I", "U", "F")):
+        uniques = {"x": 1, "y": 2, "z": pd.NA}
+    else:
+        uniques = {"x": 1, "y": 2, "z": np.nan}
+
+    df = pd.DataFrame(
+        {
+            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
+            "a": [0, 1, 2, 3],
+        }
+    )
+    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index, observed=False)
+    if test_series:
+        gb = gb["a"]
+    result = gb.sum()
+
+    # Manually compute the groupby sum, use the labels "x", "y", and "z" to avoid
+    # issues with hashing np.nan
+    summed = {}
+    for idx, label in enumerate(sequence):
+        summed[label] = summed.get(label, 0) + idx
+    if dtype == "category":
+        index = pd.CategoricalIndex(
+            [uniques[e] for e in summed],
+            df["key"].cat.categories,
+            name="key",
+        )
+    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
+        index = pd.Index(
+            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
+        )
+    else:
+        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
+    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
+    if not test_series:
+        expected = expected.to_frame()
+    if not as_index:
+        expected = expected.reset_index()
+        if dtype is not None and dtype.startswith("Sparse"):
+            expected["key"] = expected["key"].astype(dtype)
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("test_series", [True, False])
+@pytest.mark.parametrize("dtype", [object, None])
+def test_null_is_null_for_dtype(
+    sort, dtype, nulls_fixture, nulls_fixture2, test_series
+):
+    # GH#48506 - groups should always result in using the null for the dtype
+    df = pd.DataFrame({"a": [1, 2]})
+    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
+    obj = df["a"] if test_series else df
+    gb = obj.groupby(groups, dropna=False, sort=sort)
+    result = gb.sum()
+    index = pd.Index([na_value_for_dtype(groups.dtype)])
+    expected = pd.DataFrame({"a": [3]}, index=index)
+    if test_series:
+        tm.assert_series_equal(result, expected["a"])
+    else:
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
+def test_categorical_reducers(reduction_func, observed, sort, as_index, index_kind):
+    # Ensure there is at least one null value by appending to the end
+    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
+    )
+
+    # Strategy: Compare to dropna=True by filling null values with a new code
+    df_filled = df.copy()
+    df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)
+
+    if index_kind == "range":
+        keys = ["x"]
+    elif index_kind == "single":
+        keys = ["x"]
+        df = df.set_index("x")
+        df_filled = df_filled.set_index("x")
+    else:
+        keys = ["x", "x2"]
+        df["x2"] = df["x"]
+        df = df.set_index(["x", "x2"])
+        df_filled["x2"] = df_filled["x"]
+        df_filled = df_filled.set_index(["x", "x2"])
+    args = get_groupby_method_args(reduction_func, df)
+    args_filled = get_groupby_method_args(reduction_func, df_filled)
+    if reduction_func == "corrwith" and index_kind == "range":
+        # Don't include the grouping columns so we can call reset_index
+        args = (args[0].drop(columns=keys),)
+        args_filled = (args_filled[0].drop(columns=keys),)
+
+    gb_keepna = df.groupby(
+        keys, dropna=False, observed=observed, sort=sort, as_index=as_index
+    )
+
+    if not observed and reduction_func in ["idxmin", "idxmax"]:
+        with pytest.raises(
+            ValueError, match="empty group due to unobserved categories"
+        ):
+            getattr(gb_keepna, reduction_func)(*args)
+        return
+
+    gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
+    if reduction_func == "corrwith":
+        warn = Pandas4Warning
+        msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        msg = ""
+    with tm.assert_produces_warning(warn, match=msg):
+        expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
+    expected["x"] = expected["x"].cat.remove_categories([4])
+    if index_kind == "multi":
+        expected["x2"] = expected["x2"].cat.remove_categories([4])
+    if as_index:
+        if index_kind == "multi":
+            expected = expected.set_index(["x", "x2"])
+        else:
+            expected = expected.set_index("x")
+    if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
+        # expected was computed with a RangeIndex; need to translate to index values
+        values = expected["y"].values.tolist()
+        if index_kind == "single":
+            values = [np.nan if e == 4 else e for e in values]
+            expected["y"] = pd.Categorical(values, categories=[1, 2, 3])
+        else:
+            values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
+            expected["y"] = values
+    if reduction_func == "size":
+        # size, unlike other methods, has the desired behavior in GH#49519
+        expected = expected.rename(columns={0: "size"})
+        if as_index:
+            expected = expected["size"].rename(None)
+
+    if reduction_func == "corrwith":
+        warn = Pandas4Warning
+        msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn = None
+        msg = ""
+    with tm.assert_produces_warning(warn, match=msg):
+        result = getattr(gb_keepna, reduction_func)(*args)
+
+    # size will return a Series, others are DataFrame
+    tm.assert_equal(result, expected)
+
+
+def test_categorical_transformers(transformation_func, observed, sort, as_index):
+    # GH#36327
+    values = np.append(np.random.default_rng(2).choice([1, 2, None], size=19), None)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
+    )
+    args = get_groupby_method_args(transformation_func, df)
+
+    # Compute result for null group
+    null_group_values = df[df["x"].isnull()]["y"]
+    if transformation_func == "cumcount":
+        null_group_data = list(range(len(null_group_values)))
+    elif transformation_func == "ngroup":
+        if sort:
+            if observed:
+                na_group = df["x"].nunique(dropna=False) - 1
+            else:
+                # TODO: Should this be 3?
+                na_group = df["x"].nunique(dropna=False) - 1
+        else:
+            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
+        null_group_data = len(null_group_values) * [na_group]
+    else:
+        null_group_data = getattr(null_group_values, transformation_func)(*args)
+    null_group_result = pd.DataFrame({"y": null_group_data})
+
+    gb_keepna = df.groupby(
+        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
+    )
+    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)
+
+    result = getattr(gb_keepna, transformation_func)(*args)
+    expected = getattr(gb_dropna, transformation_func)(*args)
+
+    for iloc, value in zip(
+        df[df["x"].isnull()].index.tolist(),
+        null_group_result.values.ravel(),
+        strict=True,
+    ):
+        if expected.ndim == 1:
+            expected.iloc[iloc] = value
+        else:
+            expected.iloc[iloc, 0] = value
+    if transformation_func == "ngroup":
+        expected[df["x"].notnull() & expected.ge(na_group)] += 1
+    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
+        expected = expected.astype("int64")
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["head", "tail"])
+def test_categorical_head_tail(method, observed, sort, as_index):
+    # GH#36327
+    values = np.random.default_rng(2).choice([1, 2, None], 30)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
+    )
+    gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
+    result = getattr(gb, method)()
+
+    if method == "tail":
+        values = values[::-1]
+    # Take the top 5 values from each group
+    mask = (
+        ((values == 1) & ((values == 1).cumsum() <= 5))
+        | ((values == 2) & ((values == 2).cumsum() <= 5))
+        # flake8 doesn't like the vectorized check for None, thinks we should use `is`
+        | ((values == None) & ((values == None).cumsum() <= 5))  # noqa: E711
+    )
+    if method == "tail":
+        mask = mask[::-1]
+    expected = df[mask]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_agg():
+    # GH#36327
+    values = np.random.default_rng(2).choice([1, 2, None], 30)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
+    )
+    gb = df.groupby("x", dropna=False, observed=False)
+    result = gb.agg(lambda x: x.sum())
+    expected = gb.sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_transform():
+    # GH#36327
+    values = np.random.default_rng(2).choice([1, 2, None], 30)
+    df = pd.DataFrame(
+        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
+    )
+    gb = df.groupby("x", dropna=False, observed=False)
+    result = gb.transform(lambda x: x.sum())
+    expected = gb.transform("sum")
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1dfb3aabdaf03d120058793e0c9866be2255db6
--- /dev/null
+++ b/pandas/tests/groupby/test_groupby_subclass.py
@@ -0,0 +1,152 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
+)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        tm.SubclassedDataFrame({"A": np.arange(0, 10)}),
+        tm.SubclassedSeries(np.arange(0, 10), name="A"),
+    ],
+)
+def test_groupby_preserves_subclass(obj, groupby_func):
+    # GH28330 -- preserve subclass through groupby operations
+
+    if isinstance(obj, Series) and groupby_func in {"corrwith"}:
+        pytest.skip(f"Not applicable for Series and {groupby_func}")
+
+    grouped = obj.groupby(np.arange(0, 10))
+
+    # Groups should preserve subclass type
+    assert isinstance(grouped.get_group(0), type(obj))
+
+    args = get_groupby_method_args(groupby_func, obj)
+
+    warn = Pandas4Warning if groupby_func == "corrwith" else None
+    msg = f"{type(grouped).__name__}.corrwith is deprecated"
+    with tm.assert_produces_warning(warn, match=msg):
+        result1 = getattr(grouped, groupby_func)(*args)
+    with tm.assert_produces_warning(warn, match=msg):
+        result2 = grouped.agg(groupby_func, *args)
+
+    # Reduction or transformation kernels should preserve type
+    slices = {"ngroup", "cumcount", "size"}
+    if isinstance(obj, DataFrame) and groupby_func in slices:
+        assert isinstance(result1, tm.SubclassedSeries)
+    else:
+        assert isinstance(result1, type(obj))
+
+    # Confirm .agg() groupby operations return same results
+    if isinstance(result1, DataFrame):
+        tm.assert_frame_equal(result1, result2)
+    else:
+        tm.assert_series_equal(result1, result2)
+
+
+def test_groupby_preserves_metadata():
+    # GH-37343
+    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
+    assert "testattr" in custom_df._metadata
+    custom_df.testattr = "hello"
+    for _, group_df in custom_df.groupby("c"):
+        assert group_df.testattr == "hello"
+
+    # GH-45314
+    def func(group):
+        assert isinstance(group, tm.SubclassedDataFrame)
+        assert hasattr(group, "testattr")
+        assert group.testattr == "hello"
+        return group.testattr
+
+    result = custom_df.groupby("c").apply(func)
+    expected = tm.SubclassedSeries(["hello"] * 3, index=Index([7, 8, 9], name="c"))
+    tm.assert_series_equal(result, expected)
+
+    result = custom_df.groupby("c").apply(func)
+    tm.assert_series_equal(result, expected)
+
+    # https://github.com/pandas-dev/pandas/pull/56761
+    result = custom_df.groupby("c")[["a", "b"]].apply(func)
+    tm.assert_series_equal(result, expected)
+
+    def func2(group):
+        assert isinstance(group, tm.SubclassedSeries)
+        assert hasattr(group, "testattr")
+        return group.testattr
+
+    custom_series = tm.SubclassedSeries([1, 2, 3])
+    custom_series.testattr = "hello"
+    result = custom_series.groupby(custom_df["c"]).apply(func2)
+    tm.assert_series_equal(result, expected)
+    result = custom_series.groupby(custom_df["c"]).agg(func2)
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_apply_preserves_metadata():
+    # GH#62134 - Test that apply() preserves metadata when returning DataFrames/Series
+    custom_df = tm.SubclassedDataFrame({"a": [1, 2, 3], "b": [1, 1, 2], "c": [7, 8, 9]})
+    custom_df.testattr = "hello"
+
+    def sum_func(group):
+        assert isinstance(group, tm.SubclassedDataFrame)
+        assert hasattr(group, "testattr")
+        assert group.testattr == "hello"
+        return group.sum()
+
+    result = custom_df.groupby("c").apply(sum_func)
+    assert hasattr(result, "testattr"), "DataFrame apply() should preserve metadata"
+    assert result.testattr == "hello"
+
+    custom_series = tm.SubclassedSeries([1, 2, 3])
+    custom_series.testattr = "hello"
+
+    def sum_series_func(group):
+        assert isinstance(group, tm.SubclassedSeries)
+        assert hasattr(group, "testattr")
+        assert group.testattr == "hello"
+        return group.sum()
+
+    result = custom_series.groupby(custom_df["c"]).apply(sum_series_func)
+    assert hasattr(result, "testattr"), "Series apply() should preserve metadata"
+    assert result.testattr == "hello"
+
+
+@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
+def test_groupby_resample_preserves_subclass(obj):
+    # GH28330 -- preserve subclass through groupby.resample()
+
+    df = obj(
+        {
+            "Buyer": Series("Carl Carl Carl Carl Joe Carl".split(), dtype=object),
+            "Quantity": [18, 3, 5, 1, 9, 3],
+            "Date": [
+                datetime(2013, 9, 1, 13, 0),
+                datetime(2013, 9, 1, 13, 5),
+                datetime(2013, 10, 1, 20, 0),
+                datetime(2013, 10, 3, 10, 0),
+                datetime(2013, 12, 2, 12, 0),
+                datetime(2013, 9, 2, 14, 0),
+            ],
+        }
+    )
+    df = df.set_index("Date")
+
+    # Confirm groupby.resample() preserves dataframe type
+    result = df.groupby("Buyer").resample("5D").sum()
+    assert isinstance(result, obj)
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
new file mode 100644
index 0000000000000000000000000000000000000000..6450b1108d240b3eff5a26b6cb33fab8d0a07b90
--- /dev/null
+++ b/pandas/tests/groupby/test_grouping.py
@@ -0,0 +1,1216 @@
+"""
+test where we are determining what we are grouping, or getting groups
+"""
+
+from datetime import (
+    date,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+from pandas.errors import (
+    Pandas4Warning,
+    SpecificationError,
+)
+
+import pandas as pd
+from pandas import (
+    CategoricalIndex,
+    DataFrame,
+    Grouper,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+    period_range,
+)
+import pandas._testing as tm
+from pandas.core.groupby.grouper import Grouping
+
+# selection
+# --------------------------------
+
+
+class TestSelection:
+    def test_select_bad_cols(self):
+        df = DataFrame([[1, 2]], columns=["A", "B"])
+        g = df.groupby("A")
+        with pytest.raises(KeyError, match="\"Columns not found: 'C'\""):
+            g[["C"]]
+
+        with pytest.raises(KeyError, match="^[^A]+$"):
+            # A should not be referenced as a bad column...
+            # will have to rethink regex if you change message!
+            g[["A", "C"]]
+
+    def test_groupby_duplicated_column_errormsg(self):
+        # GH7511
+        df = DataFrame(
+            columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)]
+        )
+
+        msg = "Grouper for 'A' not 1-dimensional"
+        with pytest.raises(ValueError, match=msg):
+            df.groupby("A")
+        with pytest.raises(ValueError, match=msg):
+            df.groupby(["A", "B"])
+
+        grouped = df.groupby("B")
+        c = grouped.count()
+        assert c.columns.nlevels == 1
+        assert c.columns.size == 3
+
+    def test_column_select_via_attr(self, df):
+        result = df.groupby("A").C.sum()
+        expected = df.groupby("A")["C"].sum()
+        tm.assert_series_equal(result, expected)
+
+        df["mean"] = 1.5
+        result = df.groupby("A").mean(numeric_only=True)
+        expected = df.groupby("A")[["C", "D", "mean"]].agg("mean")
+        tm.assert_frame_equal(result, expected)
+
+    def test_getitem_list_of_columns(self):
+        df = DataFrame(
+            {
+                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+                "C": np.random.default_rng(2).standard_normal(8),
+                "D": np.random.default_rng(2).standard_normal(8),
+                "E": np.random.default_rng(2).standard_normal(8),
+            }
+        )
+
+        result = df.groupby("A")[["C", "D"]].mean()
+        result2 = df.groupby("A")[df.columns[2:4]].mean()
+
+        expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()
+
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+    def test_getitem_numeric_column_names(self):
+        # GH #13731
+        df = DataFrame(
+            {
+                0: list("abcd") * 2,
+                2: np.random.default_rng(2).standard_normal(8),
+                4: np.random.default_rng(2).standard_normal(8),
+                6: np.random.default_rng(2).standard_normal(8),
+            }
+        )
+        result = df.groupby(0)[df.columns[1:3]].mean()
+        result2 = df.groupby(0)[[2, 4]].mean()
+
+        expected = df.loc[:, [0, 2, 4]].groupby(0).mean()
+
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        # per GH 23566 enforced deprecation raises a ValueError
+        with pytest.raises(ValueError, match="Cannot subset columns with a tuple"):
+            df.groupby(0)[2, 4].mean()
+
+    def test_getitem_single_tuple_of_columns_raises(self, df):
+        # per GH 23566 enforced deprecation raises a ValueError
+        with pytest.raises(ValueError, match="Cannot subset columns with a tuple"):
+            df.groupby("A")["C", "D"].mean()
+
+    def test_getitem_single_column(self):
+        df = DataFrame(
+            {
+                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
+                "C": np.random.default_rng(2).standard_normal(8),
+                "D": np.random.default_rng(2).standard_normal(8),
+                "E": np.random.default_rng(2).standard_normal(8),
+            }
+        )
+
+        result = df.groupby("A")["C"].mean()
+
+        as_frame = df.loc[:, ["A", "C"]].groupby("A").mean()
+        as_series = as_frame.iloc[:, 0]
+        expected = as_series
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "func", [lambda x: x.sum(), lambda x: x.agg(lambda y: y.sum())]
+    )
+    def test_getitem_from_grouper(self, func):
+        # GH 50383
+        df = DataFrame({"a": [1, 1, 2], "b": 3, "c": 4, "d": 5})
+        gb = df.groupby(["a", "b"])[["a", "c"]]
+
+        idx = MultiIndex.from_tuples([(1, 3), (2, 3)], names=["a", "b"])
+        expected = DataFrame({"a": [2, 2], "c": [8, 4]}, index=idx)
+        result = func(gb)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_indices_grouped_by_tuple_with_lambda(self):
+        # GH 36158
+        df = DataFrame(
+            {
+                "Tuples": (
+                    (x, y)
+                    for x in [0, 1]
+                    for y in np.random.default_rng(2).integers(3, 5, 5)
+                )
+            }
+        )
+
+        gb = df.groupby("Tuples")
+        gb_lambda = df.groupby(lambda x: df.iloc[x, 0])
+
+        expected = gb.indices
+        result = gb_lambda.indices
+
+        tm.assert_dict_equal(result, expected)
+
+
+# grouping
+# --------------------------------
+
+
+class TestGrouping:
+    @pytest.mark.parametrize(
+        "index",
+        [
+            Index(list("abcde")),
+            Index(np.arange(5)),
+            Index(np.arange(5, dtype=float)),
+            date_range("2020-01-01", periods=5),
+            period_range("2020-01-01", periods=5),
+        ],
+    )
+    def test_grouper_index_types(self, index):
+        # related GH5375
+        # groupby misbehaving when using a Floatlike index
+        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"), index=index)
+
+        df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)
+
+        df.index = df.index[::-1]
+        df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)
+
+    def test_grouper_multilevel_freq(self):
+        # GH 7885
+        # with level and freq specified in a Grouper
+        d0 = date.today() - timedelta(days=14)
+        dates = date_range(d0, date.today())
+        date_index = MultiIndex.from_product([dates, dates], names=["foo", "bar"])
+        df = DataFrame(np.random.default_rng(2).integers(0, 100, 225), index=date_index)
+
+        # Check string level
+        expected = (
+            df.reset_index()
+            .groupby([Grouper(key="foo", freq="W"), Grouper(key="bar", freq="W")])
+            .sum()
+        )
+        # reset index changes columns dtype to object
+        expected.columns = Index([0], dtype="int64")
+
+        result = df.groupby(
+            [Grouper(level="foo", freq="W"), Grouper(level="bar", freq="W")]
+        ).sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Check integer level
+        result = df.groupby(
+            [Grouper(level=0, freq="W"), Grouper(level=1, freq="W")]
+        ).sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_creation_bug(self):
+        # GH 8795
+        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
+        g = df.groupby("A")
+        expected = g.sum()
+
+        g = df.groupby(Grouper(key="A"))
+        result = g.sum()
+        tm.assert_frame_equal(result, expected)
+
+        result = g.apply(lambda x: x.sum())
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_creation_bug2(self):
+        # GH14334
+        # Grouper(key=...) may be passed in a list
+        df = DataFrame(
+            {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
+        )
+        # Group by single column
+        expected = df.groupby("A").sum()
+        g = df.groupby([Grouper(key="A")])
+        result = g.sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Group by two columns
+        # using a combination of strings and Grouper objects
+        expected = df.groupby(["A", "B"]).sum()
+
+        # Group with two Grouper objects
+        g = df.groupby([Grouper(key="A"), Grouper(key="B")])
+        result = g.sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Group with a string and a Grouper object
+        g = df.groupby(["A", Grouper(key="B")])
+        result = g.sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Group with a Grouper object and a string
+        g = df.groupby([Grouper(key="A"), "B"])
+        result = g.sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_creation_bug3(self, unit):
+        # GH8866
+        dti = date_range("20130101", periods=2, unit=unit)
+        mi = MultiIndex.from_product(
+            [list("ab"), range(2), dti],
+            names=["one", "two", "three"],
+        )
+        ser = Series(
+            np.arange(8, dtype="int64"),
+            index=mi,
+        )
+        result = ser.groupby(Grouper(level="three", freq="ME")).sum()
+        exp_dti = pd.DatetimeIndex(
+            [Timestamp("2013-01-31")], freq="ME", name="three"
+        ).as_unit(unit)
+        expected = Series(
+            [28],
+            index=exp_dti,
+        )
+        tm.assert_series_equal(result, expected)
+
+        # just specifying a level breaks
+        result = ser.groupby(Grouper(level="one")).sum()
+        expected = ser.groupby(level="one").sum()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("func", [False, True])
+    def test_grouper_returning_tuples(self, func):
+        # GH 22257 , both with dict and with callable
+        df = DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]})
+        mapping = dict(zip(range(4), [("C", 5), ("D", 6)] * 2, strict=True))
+
+        if func:
+            gb = df.groupby(by=lambda idx: mapping[idx], sort=False)
+        else:
+            gb = df.groupby(by=mapping, sort=False)
+
+        name, expected = next(iter(gb))
+        assert name == ("C", 5)
+        result = gb.get_group(name)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_column_and_index(self):
+        # GH 14327
+
+        # Grouping a multi-index frame by a column and an index level should
+        # be equivalent to resetting the index and grouping by two columns
+        idx = MultiIndex.from_tuples(
+            [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
+        )
+        idx.names = ["outer", "inner"]
+        df_multi = DataFrame(
+            {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
+            index=idx,
+        )
+        result = df_multi.groupby(["B", Grouper(level="inner")]).mean(numeric_only=True)
+        expected = (
+            df_multi.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # Test the reverse grouping order
+        result = df_multi.groupby([Grouper(level="inner"), "B"]).mean(numeric_only=True)
+        expected = (
+            df_multi.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # Grouping a single-index frame by a column and the index should
+        # be equivalent to resetting the index and grouping by two columns
+        df_single = df_multi.reset_index("outer")
+        result = df_single.groupby(["B", Grouper(level="inner")]).mean(
+            numeric_only=True
+        )
+        expected = (
+            df_single.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # Test the reverse grouping order
+        result = df_single.groupby([Grouper(level="inner"), "B"]).mean(
+            numeric_only=True
+        )
+        expected = (
+            df_single.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_levels_and_columns(self):
+        # GH9344, GH9049
+        idx_names = ["x", "y"]
+        idx = MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
+        df = DataFrame(np.arange(12).reshape(-1, 3), index=idx)
+
+        by_levels = df.groupby(level=idx_names).mean()
+        # reset_index changes columns dtype to object
+        by_columns = df.reset_index().groupby(idx_names).mean()
+
+        # without casting, by_columns.columns is object-dtype
+        by_columns.columns = by_columns.columns.astype(np.int64)
+        tm.assert_frame_equal(by_levels, by_columns)
+
+    def test_groupby_categorical_index_and_columns(self, observed):
+        # GH18432, adapted for GH25871
+        columns = ["A", "B", "A", "B"]
+        categories = ["B", "A"]
+        data = np.array(
+            [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
+        )
+        cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
+        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
+        expected_columns = CategoricalIndex(
+            categories, categories=categories, ordered=True
+        )
+
+        # test transposed version
+        df = DataFrame(data.T, index=cat_columns)
+        result = df.groupby(level=0, observed=observed).sum()
+        expected = DataFrame(data=expected_data.T, index=expected_columns)
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_getting_correct_binner(self):
+        # GH 10063
+        # using a non-time-based grouper and a time-based grouper
+        # and specifying levels
+        df = DataFrame(
+            {"A": 1},
+            index=MultiIndex.from_product(
+                [list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
+            ),
+        )
+        result = df.groupby(
+            [Grouper(level="one"), Grouper(level="two", freq="ME")]
+        ).sum()
+        expected = DataFrame(
+            {"A": [31, 28, 21, 31, 28, 21]},
+            index=MultiIndex.from_product(
+                [list("ab"), date_range("20130101", freq="ME", periods=3)],
+                names=["one", "two"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_iter(self, df):
+        gb = df.groupby("A")
+        grouper = gb._grouper
+        result = sorted(grouper)
+        expected = ["bar", "foo"]
+        assert result == expected
+
+    def test_empty_groups(self, df):
+        # see gh-1048
+        with pytest.raises(ValueError, match="No group keys passed!"):
+            df.groupby([])
+
+    def test_groupby_grouper(self, df):
+        grouped = df.groupby("A")
+        grouper = grouped._grouper
+        result = df.groupby(grouper).mean(numeric_only=True)
+        expected = grouped.mean(numeric_only=True)
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_dict_mapping(self):
+        # GH #679
+        s = Series({"T1": 5})
+        result = s.groupby({"T1": "T2"}).agg("sum")
+        expected = s.groupby(["T2"]).agg("sum")
+        tm.assert_series_equal(result, expected)
+
+        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
+        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}
+
+        result = s.groupby(mapping).mean()
+        result2 = s.groupby(mapping).agg("mean")
+        exp_key = np.array([0, 0, 1, 1], dtype=np.int64)
+        expected = s.groupby(exp_key).mean()
+        expected2 = s.groupby(exp_key).mean()
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result, result2)
+        tm.assert_series_equal(result, expected2)
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            [0, 1, 2, 3],
+            ["a", "b", "c", "d"],
+            [Timestamp(2021, 7, 28 + i) for i in range(4)],
+        ],
+    )
+    def test_groupby_series_named_with_tuple(self, frame_or_series, index):
+        # GH 42731
+        obj = frame_or_series([1, 2, 3, 4], index=index)
+        groups = Series([1, 0, 1, 0], index=index, name=("a", "a"))
+        result = obj.groupby(groups).last()
+        expected = frame_or_series([4, 3])
+        expected.index.name = ("a", "a")
+        tm.assert_equal(result, expected)
+
+    def test_groupby_grouper_f_sanity_checked(self):
+        dates = date_range("01-Jan-2013", periods=12, freq="MS")
+        ts = Series(np.random.default_rng(2).standard_normal(12), index=dates)
+
+        # GH51979
+        # simple check that the passed function doesn't operates on the whole index
+        msg = "'Timestamp' object is not subscriptable"
+        with pytest.raises(TypeError, match=msg):
+            ts.groupby(lambda key: key[0:6])
+
+        result = ts.groupby(lambda x: x).sum()
+        expected = ts.groupby(ts.index).sum()
+        expected.index.freq = None
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_with_datetime_key(self):
+        # GH 51158
+        df = DataFrame(
+            {
+                "id": ["a", "b"] * 3,
+                "b": date_range("2000-01-01", "2000-01-03", freq="9h"),
+            }
+        )
+        grouper = Grouper(key="b", freq="D")
+        gb = df.groupby([grouper, "id"])
+
+        # test number of groups
+        expected = {
+            (Timestamp("2000-01-01"), "a"): [0, 2],
+            (Timestamp("2000-01-01"), "b"): [1],
+            (Timestamp("2000-01-02"), "a"): [4],
+            (Timestamp("2000-01-02"), "b"): [3, 5],
+        }
+        tm.assert_dict_equal(gb.groups, expected)
+
+        # test number of group keys
+        assert len(gb.groups.keys()) == 4
+
+    def test_grouping_error_on_multidim_input(self, df):
+        msg = "Grouper for '<class 'pandas.DataFrame'>' not 1-dimensional"
+        with pytest.raises(ValueError, match=msg):
+            Grouping(df.index, df[["A", "A"]])
+
+    def test_multiindex_negative_level(self, multiindex_dataframe_random_data):
+        # GH 13901
+        result = multiindex_dataframe_random_data.groupby(level=-1).sum()
+        expected = multiindex_dataframe_random_data.groupby(level="second").sum()
+        tm.assert_frame_equal(result, expected)
+
+        result = multiindex_dataframe_random_data.groupby(level=-2).sum()
+        expected = multiindex_dataframe_random_data.groupby(level="first").sum()
+        tm.assert_frame_equal(result, expected)
+
+        result = multiindex_dataframe_random_data.groupby(level=[-2, -1]).sum()
+        expected = multiindex_dataframe_random_data.sort_index()
+        tm.assert_frame_equal(result, expected)
+
+        result = multiindex_dataframe_random_data.groupby(level=[-1, "first"]).sum()
+        expected = multiindex_dataframe_random_data.groupby(
+            level=["second", "first"]
+        ).sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_agg_with_dict_raises(self, df):
+        df.columns = np.arange(len(df.columns))
+        msg = "nested renamer is not supported"
+        with pytest.raises(SpecificationError, match=msg):
+            df.groupby(1, as_index=False)[2].agg({"Q": np.mean})
+
+    def test_multiindex_columns_empty_level(self):
+        lst = [["count", "values"], ["to filter", ""]]
+        midx = MultiIndex.from_tuples(lst)
+
+        df = DataFrame([[1, "A"]], columns=midx)
+
+        msg = "In a future version, the keys"
+        grouped = df.groupby("to filter").groups
+        assert grouped["A"] == [0]
+
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            grouped = df.groupby([("to filter", "")]).groups
+        assert grouped["A"] == [0]
+
+        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)
+
+        expected = df.groupby("to filter").groups
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            result = df.groupby([("to filter", "")]).groups
+        assert result == expected
+
+        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)
+
+        expected = df.groupby("to filter").groups
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            result = df.groupby([("to filter", "")]).groups
+        tm.assert_dict_equal(result, expected)
+
+    def test_groupby_multiindex_tuple(self):
+        # GH 17979, GH#59179
+        df = DataFrame(
+            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
+            columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
+        )
+
+        msg = "In a future version, the keys"
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            expected = df.groupby([("b", 1)]).groups
+        result = df.groupby(("b", 1)).groups
+        tm.assert_dict_equal(expected, result)
+
+        df2 = DataFrame(
+            df.values,
+            columns=MultiIndex.from_arrays(
+                [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
+            ),
+        )
+
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            expected = df2.groupby([("b", "d")]).groups
+        result = df.groupby(("b", 1)).groups
+        tm.assert_dict_equal(expected, result)
+
+        df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
+
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            expected = df3.groupby([("b", "d")]).groups
+        result = df.groupby(("b", 1)).groups
+        tm.assert_dict_equal(expected, result)
+
+    def test_groupby_multiindex_partial_indexing_equivalence(self):
+        # GH 17977, GH#59179
+        df = DataFrame(
+            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
+            columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
+        )
+
+        expected_mean = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].mean()
+        result_mean = df.groupby([("a", 1)])["b"].mean()
+        tm.assert_frame_equal(expected_mean, result_mean)
+
+        expected_sum = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].sum()
+        result_sum = df.groupby([("a", 1)])["b"].sum()
+        tm.assert_frame_equal(expected_sum, result_sum)
+
+        expected_count = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].count()
+        result_count = df.groupby([("a", 1)])["b"].count()
+        tm.assert_frame_equal(expected_count, result_count)
+
+        expected_min = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].min()
+        result_min = df.groupby([("a", 1)])["b"].min()
+        tm.assert_frame_equal(expected_min, result_min)
+
+        expected_max = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].max()
+        result_max = df.groupby([("a", 1)])["b"].max()
+        tm.assert_frame_equal(expected_max, result_max)
+
+        msg = "In a future version, the keys"
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            expected_groups = df.groupby([("a", 1)])[[("b", 1), ("b", 2)]].groups
+            result_groups = df.groupby([("a", 1)])["b"].groups
+        tm.assert_dict_equal(expected_groups, result_groups)
+
+    def test_groupby_level(self, sort, multiindex_dataframe_random_data, df):
+        # GH 17537
+        frame = multiindex_dataframe_random_data
+        deleveled = frame.reset_index()
+
+        result0 = frame.groupby(level=0, sort=sort).sum()
+        result1 = frame.groupby(level=1, sort=sort).sum()
+
+        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
+        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()
+
+        expected0.index.name = "first"
+        expected1.index.name = "second"
+
+        assert result0.index.name == "first"
+        assert result1.index.name == "second"
+
+        tm.assert_frame_equal(result0, expected0)
+        tm.assert_frame_equal(result1, expected1)
+        assert result0.index.name == frame.index.names[0]
+        assert result1.index.name == frame.index.names[1]
+
+        # groupby level name
+        result0 = frame.groupby(level="first", sort=sort).sum()
+        result1 = frame.groupby(level="second", sort=sort).sum()
+        tm.assert_frame_equal(result0, expected0)
+        tm.assert_frame_equal(result1, expected1)
+
+        # raise exception for non-MultiIndex
+        msg = "level > 0 or level < -1 only valid with MultiIndex"
+        with pytest.raises(ValueError, match=msg):
+            df.groupby(level=1)
+
+    def test_groupby_level_index_names(self):
+        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
+        df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
+            "exp"
+        )
+        df.groupby(level="exp")
+        msg = "level name foo is not the name of the index"
+        with pytest.raises(ValueError, match=msg):
+            df.groupby(level="foo")
+
+    def test_groupby_level_with_nas(self, sort):
+        # GH 17537
+        index = MultiIndex(
+            levels=[[1, 0], [0, 1, 2, 3]],
+            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
+        )
+
+        # factorizing doesn't confuse things
+        s = Series(np.arange(8.0), index=index)
+        result = s.groupby(level=0, sort=sort).sum()
+        expected = Series([6.0, 22.0], index=[0, 1])
+        tm.assert_series_equal(result, expected)
+
+        index = MultiIndex(
+            levels=[[1, 0], [0, 1, 2, 3]],
+            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
+        )
+
+        # factorizing doesn't confuse things
+        s = Series(np.arange(8.0), index=index)
+        result = s.groupby(level=0, sort=sort).sum()
+        expected = Series([6.0, 18.0], index=[0.0, 1.0])
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_args(self, multiindex_dataframe_random_data):
+        # PR8618 and issue 8015
+        frame = multiindex_dataframe_random_data
+
+        msg = "You have to supply one of 'by' and 'level'"
+        with pytest.raises(TypeError, match=msg):
+            frame.groupby()
+
+        msg = "You have to supply one of 'by' and 'level'"
+        with pytest.raises(TypeError, match=msg):
+            frame.groupby(by=None, level=None)
+
+    @pytest.mark.parametrize(
+        "sort,labels",
+        [
+            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
+            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
+        ],
+    )
+    def test_level_preserve_order(self, sort, labels, multiindex_dataframe_random_data):
+        # GH 17537
+        grouped = multiindex_dataframe_random_data.groupby(level=0, sort=sort)
+        exp_labels = np.array(labels, np.intp)
+        tm.assert_almost_equal(grouped._grouper.ids, exp_labels)
+
+    def test_grouping_labels(self, multiindex_dataframe_random_data):
+        grouped = multiindex_dataframe_random_data.groupby(
+            multiindex_dataframe_random_data.index.get_level_values(0)
+        )
+        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
+        tm.assert_almost_equal(grouped._grouper.codes[0], exp_labels)
+
+    def test_list_grouper_with_nat(self):
+        # GH 14715, GH#59179
+        df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")})
+        df.iloc[-1] = pd.NaT
+        grouper = Grouper(key="date", freq="YS")
+        msg = "In a future version, the keys"
+
+        # Grouper in a list grouping
+        gb = df.groupby([grouper])
+        expected = {Timestamp("2011-01-01"): Index(list(range(364)))}
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            result = gb.groups
+        tm.assert_dict_equal(result, expected)
+
+        # Test case without a list
+        result = df.groupby(grouper)
+        expected = {Timestamp("2011-01-01"): 365}
+        tm.assert_dict_equal(result.groups, expected)
+
+    @pytest.mark.parametrize(
+        "func,expected",
+        [
+            (
+                "transform",
+                Series(name=2, dtype=np.float64),
+            ),
+            (
+                "agg",
+                Series(
+                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
+                ),
+            ),
+            (
+                "apply",
+                Series(
+                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
+                ),
+            ),
+        ],
+    )
+    def test_evaluate_with_empty_groups(self, func, expected):
+        # 26208
+        # test transform'ing empty groups
+        # (not testing other agg fns, because they return
+        # different index objects.
+        df = DataFrame({1: [], 2: []})
+        g = df.groupby(1, group_keys=True)
+        result = getattr(g[2], func)(lambda x: x)
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_apply_empty_with_group_keys_false(self):
+        # 60471
+        # test apply'ing empty groups with group_keys False
+        # (not testing other agg fns, because they return
+        # different index objects.
+        df = DataFrame({"A": [], "B": [], "C": []})
+        g = df.groupby("A", group_keys=False)
+        result = g.apply(lambda x: x / x.sum())
+        expected = DataFrame({"B": [], "C": []}, index=None)
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_empty(self):
+        # https://github.com/pandas-dev/pandas/issues/27190
+        s = Series([], name="name", dtype="float64")
+        gr = s.groupby([])
+
+        result = gr.mean()
+        expected = s.set_axis(Index([], dtype=np.intp))
+        tm.assert_series_equal(result, expected)
+
+        # check group properties
+        assert len(gr._grouper.groupings) == 1
+        tm.assert_numpy_array_equal(
+            gr._grouper.ids, np.array([], dtype=np.dtype(np.intp))
+        )
+
+        assert gr._grouper.ngroups == 0
+
+        # check name
+        gb = s.groupby(s)
+        grouper = gb._grouper
+        result = grouper.names
+        expected = ["name"]
+        assert result == expected
+
+    def test_groupby_level_index_value_all_na(self):
+        # issue 20519
+        df = DataFrame(
+            [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
+        ).set_index(["A", "B"])
+        result = df.groupby(level=["A", "B"]).sum()
+        expected = DataFrame(
+            data=[],
+            index=MultiIndex(
+                levels=[Index(["x"], dtype="str"), Index([], dtype="float64")],
+                codes=[[], []],
+                names=["A", "B"],
+            ),
+            columns=["C"],
+            dtype="int64",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_multiindex_level_empty(self):
+        # https://github.com/pandas-dev/pandas/issues/31670
+        df = DataFrame(
+            [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"]
+        )
+        df = df.set_index(["id", "category"])
+        empty = df[df.value < 0]
+        result = empty.groupby("id").sum()
+        expected = DataFrame(
+            dtype="float64",
+            columns=["value"],
+            index=Index([], dtype=np.int64, name="id"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_tuple_keys_handle_multiindex(self):
+        # https://github.com/pandas-dev/pandas/issues/21340
+        df = DataFrame(
+            {
+                "num1": [0, 8, 9, 4, 3, 3, 5, 9, 3, 6],
+                "num2": [3, 8, 6, 4, 9, 2, 1, 7, 0, 9],
+                "num3": [6, 5, 7, 8, 5, 1, 1, 10, 7, 8],
+                "category_tuple": [
+                    (0, 1),
+                    (0, 1),
+                    (0, 1),
+                    (0, 4),
+                    (2, 3),
+                    (2, 3),
+                    (2, 3),
+                    (2, 3),
+                    (5,),
+                    (6,),
+                ],
+                "category_string": list("aaabbbbcde"),
+            }
+        )
+        expected = df.sort_values(by=["category_tuple", "num1"])
+        result = df.groupby("category_tuple").apply(lambda x: x.sort_values(by="num1"))
+        expected = expected[result.columns]
+        tm.assert_frame_equal(result.reset_index(drop=True), expected)
+
+    def test_groupby_grouper_immutable_list_item(self):
+        # GH 26564 - prevent 'ValueError: all keys need to be the same shape'
+        # when reusing a list of groupers
+        df1 = DataFrame([["05/29/2019"], ["05/28/2019"]], columns=["date"]).assign(
+            date=lambda df: pd.to_datetime(df["date"])
+        )
+        df2 = DataFrame(columns=["date"]).assign(
+            date=lambda df: pd.to_datetime(df["date"])
+        )
+
+        groupers = [Grouper(key="date", freq="1D")]
+
+        df1.groupby(groupers).head()
+        # no error
+        df2.groupby(groupers).head()
+
+
+# get_group
+# --------------------------------
+
+
+class TestGetGroup:
+    def test_get_group(self):
+        # GH 5267
+        # be datelike friendly
+        df = DataFrame(
+            {
+                "DATE": pd.to_datetime(
+                    [
+                        "10-Oct-2013",
+                        "10-Oct-2013",
+                        "10-Oct-2013",
+                        "11-Oct-2013",
+                        "11-Oct-2013",
+                        "11-Oct-2013",
+                    ]
+                ),
+                "label": ["foo", "foo", "bar", "foo", "foo", "bar"],
+                "VAL": [1, 2, 3, 4, 5, 6],
+            }
+        )
+
+        g = df.groupby("DATE")
+        key = next(iter(g.groups))
+        result1 = g.get_group(key)
+        result2 = g.get_group(Timestamp(key).to_pydatetime())
+        result3 = g.get_group(str(Timestamp(key)))
+        tm.assert_frame_equal(result1, result2)
+        tm.assert_frame_equal(result1, result3)
+
+        g = df.groupby(["DATE", "label"])
+
+        key = next(iter(g.groups))
+        result1 = g.get_group(key)
+        result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
+        result3 = g.get_group((str(Timestamp(key[0])), key[1]))
+        tm.assert_frame_equal(result1, result2)
+        tm.assert_frame_equal(result1, result3)
+
+        # must pass a same-length tuple with multiple keys
+        msg = "must supply a tuple to get_group with multiple grouping keys"
+        with pytest.raises(ValueError, match=msg):
+            g.get_group("foo")
+        with pytest.raises(ValueError, match=msg):
+            g.get_group("foo")
+        msg = "must supply a same-length tuple to get_group with multiple grouping keys"
+        with pytest.raises(ValueError, match=msg):
+            g.get_group(("foo", "bar", "baz"))
+
+    def test_get_group_empty_bins(self, observed):
+        d = DataFrame([3, 1, 7, 6])
+        bins = [0, 5, 10, 15]
+        g = d.groupby(pd.cut(d[0], bins), observed=observed)
+
+        # TODO: should prob allow a str of Interval work as well
+        # IOW '(0, 5]'
+        result = g.get_group(pd.Interval(0, 5))
+        expected = DataFrame([3, 1], index=[0, 1])
+        tm.assert_frame_equal(result, expected)
+
+        msg = r"Interval\(10, 15, closed='right'\)"
+        with pytest.raises(KeyError, match=msg):
+            g.get_group(pd.Interval(10, 15))
+
+    def test_get_group_grouped_by_tuple(self):
+        # GH 8121
+        df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T
+        gr = df.groupby("ids")
+        expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2])
+        result = gr.get_group((1,))
+        tm.assert_frame_equal(result, expected)
+
+        dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"])
+        df = DataFrame({"ids": [(x,) for x in dt]})
+        gr = df.groupby("ids")
+        result = gr.get_group(("2010-01-01",))
+        expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2])
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_group_grouped_by_tuple_with_lambda(self):
+        # GH 36158
+        df = DataFrame(
+            {
+                "Tuples": (
+                    (x, y)
+                    for x in [0, 1]
+                    for y in np.random.default_rng(2).integers(3, 5, 5)
+                )
+            }
+        )
+
+        gb = df.groupby("Tuples")
+        gb_lambda = df.groupby(lambda x: df.iloc[x, 0])
+
+        expected = gb.get_group(next(iter(gb.groups.keys())))
+        result = gb_lambda.get_group(next(iter(gb_lambda.groups.keys())))
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_with_empty(self):
+        index = pd.DatetimeIndex(())
+        data = ()
+        series = Series(data, index, dtype=object)
+        grouper = Grouper(freq="D")
+        grouped = series.groupby(grouper)
+        assert next(iter(grouped), None) is None
+
+    def test_groupby_with_single_column(self):
+        df = DataFrame({"a": list("abssbab")})
+        tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
+        # GH 13530
+        exp = DataFrame(
+            index=Index(["a", "b", "s"], name="a"), columns=Index([], dtype="str")
+        )
+        tm.assert_frame_equal(df.groupby("a").count(), exp)
+        tm.assert_frame_equal(df.groupby("a").sum(), exp)
+
+        exp = df.iloc[[3, 4, 5]]
+        tm.assert_frame_equal(df.groupby("a").nth(1), exp)
+
+    def test_gb_key_len_equal_axis_len(self):
+        # GH16843
+        # test ensures that index and column keys are recognized correctly
+        # when number of keys equals axis length of groupby
+        df = DataFrame(
+            [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]],
+            columns=["first", "second", "third", "one"],
+        )
+        df = df.set_index(["first", "second"])
+        df = df.groupby(["first", "second", "third"]).size()
+        assert df.loc[("foo", "bar", "B")] == 2
+        assert df.loc[("foo", "baz", "C")] == 1
+
+
+# groups & iteration
+# --------------------------------
+
+
+class TestIteration:
+    def test_groups(self, df):
+        grouped = df.groupby(["A"])
+        msg = "In a future version, the keys"
+
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            groups = grouped.groups
+            assert groups is grouped.groups  # caching works
+
+        for k, v in groups.items():
+            assert (df.loc[v]["A"] == k).all()
+
+        grouped = df.groupby(["A", "B"])
+        groups = grouped.groups
+        assert groups is grouped.groups  # caching works
+
+        for k, v in groups.items():
+            assert (df.loc[v]["A"] == k[0]).all()
+            assert (df.loc[v]["B"] == k[1]).all()
+
+    def test_grouping_is_iterable(self, tsframe):
+        # this code path isn't used anywhere else
+        # not sure it's useful
+        grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
+
+        # test it works
+        for g in grouped._grouper.groupings[0]:
+            pass
+
+    def test_multi_iter(self):
+        s = Series(np.arange(6))
+        k1 = np.array(["a", "a", "a", "b", "b", "b"])
+        k2 = np.array(["1", "2", "1", "2", "1", "2"])
+
+        grouped = s.groupby([k1, k2])
+
+        iterated = list(grouped)
+        expected = [
+            ("a", "1", s[[0, 2]]),
+            ("a", "2", s[[1]]),
+            ("b", "1", s[[4]]),
+            ("b", "2", s[[3, 5]]),
+        ]
+        for i, ((one, two), three) in enumerate(iterated):
+            e1, e2, e3 = expected[i]
+            assert e1 == one
+            assert e2 == two
+            tm.assert_series_equal(three, e3)
+
+    def test_multi_iter_frame(self, three_group):
+        k1 = np.array(["b", "b", "b", "a", "a", "a"])
+        k2 = np.array(["1", "2", "1", "2", "1", "2"])
+        df = DataFrame(
+            {
+                "v1": np.random.default_rng(2).standard_normal(6),
+                "v2": np.random.default_rng(2).standard_normal(6),
+                "k1": k1,
+                "k2": k2,
+            },
+            index=["one", "two", "three", "four", "five", "six"],
+        )
+
+        grouped = df.groupby(["k1", "k2"])
+
+        # things get sorted!
+        iterated = list(grouped)
+        idx = df.index
+        expected = [
+            ("a", "1", df.loc[idx[[4]]]),
+            ("a", "2", df.loc[idx[[3, 5]]]),
+            ("b", "1", df.loc[idx[[0, 2]]]),
+            ("b", "2", df.loc[idx[[1]]]),
+        ]
+        for i, ((one, two), three) in enumerate(iterated):
+            e1, e2, e3 = expected[i]
+            assert e1 == one
+            assert e2 == two
+            tm.assert_frame_equal(three, e3)
+
+        # don't iterate through groups with no data
+        df["k1"] = np.array(["b", "b", "b", "a", "a", "a"])
+        df["k2"] = np.array(["1", "1", "1", "2", "2", "2"])
+        grouped = df.groupby(["k1", "k2"])
+        # calling `dict` on a DataFrameGroupBy leads to a TypeError,
+        # we need to use a dictionary comprehension here
+        groups = {key: gp for key, gp in grouped}  # noqa: C416
+        assert len(groups) == 2
+
+    def test_dictify(self, df):
+        dict(iter(df.groupby("A")))
+        dict(iter(df.groupby(["A", "B"])))
+        dict(iter(df["C"].groupby(df["A"])))
+        dict(iter(df["C"].groupby([df["A"], df["B"]])))
+        dict(iter(df.groupby("A")["C"]))
+        dict(iter(df.groupby(["A", "B"])["C"]))
+
+    def test_groupby_with_small_elem(self):
+        # GH 8542
+        # length=2
+        df = DataFrame(
+            {"event": ["start", "start"], "change": [1234, 5678]},
+            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
+        )
+        grouped = df.groupby([Grouper(freq="ME"), "event"])
+        assert len(grouped.groups) == 2
+        assert grouped.ngroups == 2
+        assert (Timestamp("2014-09-30"), "start") in grouped.groups
+        assert (Timestamp("2013-10-31"), "start") in grouped.groups
+
+        res = grouped.get_group((Timestamp("2014-09-30"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[0], :])
+        res = grouped.get_group((Timestamp("2013-10-31"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[1], :])
+
+        df = DataFrame(
+            {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
+            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
+        )
+        grouped = df.groupby([Grouper(freq="ME"), "event"])
+        assert len(grouped.groups) == 2
+        assert grouped.ngroups == 2
+        assert (Timestamp("2014-09-30"), "start") in grouped.groups
+        assert (Timestamp("2013-10-31"), "start") in grouped.groups
+
+        res = grouped.get_group((Timestamp("2014-09-30"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[0, 2], :])
+        res = grouped.get_group((Timestamp("2013-10-31"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[1], :])
+
+        # length=3
+        df = DataFrame(
+            {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
+            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
+        )
+        grouped = df.groupby([Grouper(freq="ME"), "event"])
+        assert len(grouped.groups) == 3
+        assert grouped.ngroups == 3
+        assert (Timestamp("2014-09-30"), "start") in grouped.groups
+        assert (Timestamp("2013-10-31"), "start") in grouped.groups
+        assert (Timestamp("2014-08-31"), "start") in grouped.groups
+
+        res = grouped.get_group((Timestamp("2014-09-30"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[0], :])
+        res = grouped.get_group((Timestamp("2013-10-31"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[1], :])
+        res = grouped.get_group((Timestamp("2014-08-31"), "start"))
+        tm.assert_frame_equal(res, df.iloc[[2], :])
+
+    def test_grouping_string_repr(self):
+        # GH 13394
+        mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
+        df = DataFrame([[1, 2, 3]], columns=mi)
+        gr = df.groupby(df[("A", "a")])
+
+        result = gr._grouper.groupings[0].__repr__()
+        expected = "Grouping(('A', 'a'))"
+        assert result == expected
+
+
+def test_grouping_by_key_is_in_axis():
+    # GH#50413 - Groupers specified by key are in-axis
+    df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a")
+    gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False)
+    assert not gb._grouper.groupings[0].in_axis
+    assert gb._grouper.groupings[1].in_axis
+
+    result = gb.sum()
+    expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_any_with_timedelta():
+    # GH#59712
+    df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]})
+
+    result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any()
+
+    expected = Series({0: True, 1: False}, name="value", dtype=bool)
+    expected.index = expected.index.astype(np.int64)
+
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_index_as_string.py b/pandas/tests/groupby/test_index_as_string.py
new file mode 100644
index 0000000000000000000000000000000000000000..743db7e70b14b7f8c2d047e403884bdbf1b878a4
--- /dev/null
+++ b/pandas/tests/groupby/test_index_as_string.py
@@ -0,0 +1,72 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "key_strs,groupers",
+    [
+        ("inner", pd.Grouper(level="inner")),  # Index name
+        (["inner"], [pd.Grouper(level="inner")]),  # List of index name
+        (["B", "inner"], ["B", pd.Grouper(level="inner")]),  # Column and index
+        (["inner", "B"], [pd.Grouper(level="inner"), "B"]),  # Index and column
+    ],
+)
+@pytest.mark.parametrize("levels", [["inner"], ["inner", "outer"]])
+def test_grouper_index_level_as_string(levels, key_strs, groupers):
+    frame = pd.DataFrame(
+        {
+            "outer": ["a", "a", "a", "b", "b", "b"],
+            "inner": [1, 2, 3, 1, 2, 3],
+            "A": np.arange(6),
+            "B": ["one", "one", "two", "two", "one", "one"],
+        }
+    )
+    frame = frame.set_index(levels)
+    if "B" not in key_strs or "outer" in frame.columns:
+        result = frame.groupby(key_strs).mean(numeric_only=True)
+        expected = frame.groupby(groupers).mean(numeric_only=True)
+    else:
+        result = frame.groupby(key_strs).mean()
+        expected = frame.groupby(groupers).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "levels",
+    [
+        "inner",
+        "outer",
+        "B",
+        ["inner"],
+        ["outer"],
+        ["B"],
+        ["inner", "outer"],
+        ["outer", "inner"],
+        ["inner", "outer", "B"],
+        ["B", "outer", "inner"],
+    ],
+)
+def test_grouper_index_level_as_string_series(levels):
+    # Compute expected result
+    df = pd.DataFrame(
+        {
+            "outer": ["a", "a", "a", "b", "b", "b"],
+            "inner": [1, 2, 3, 1, 2, 3],
+            "A": np.arange(6),
+            "B": ["one", "one", "two", "two", "one", "one"],
+        }
+    )
+    series = df.set_index(["outer", "inner", "B"])["A"]
+    if isinstance(levels, list):
+        groupers = [pd.Grouper(level=lv) for lv in levels]
+    else:
+        groupers = pd.Grouper(level=levels)
+
+    expected = series.groupby(groupers).mean()
+
+    # Compute and check result
+    result = series.groupby(levels).mean()
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_indexing.py b/pandas/tests/groupby/test_indexing.py
new file mode 100644
index 0000000000000000000000000000000000000000..a3d3f509e186ac8099dd5fd1c50e8265753fe16c
--- /dev/null
+++ b/pandas/tests/groupby/test_indexing.py
@@ -0,0 +1,310 @@
+# Test GroupBy._positional_selector positional grouped indexing GH#42864
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "arg, expected_rows",
+    [
+        [0, [0, 1, 4]],
+        [2, [5]],
+        [5, []],
+        [-1, [3, 4, 7]],
+        [-2, [1, 6]],
+        [-6, []],
+    ],
+)
+def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
+    # Test single integer
+    result = slice_test_grouped._positional_selector[arg]
+    expected = slice_test_df.iloc[expected_rows]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_slice(slice_test_df, slice_test_grouped):
+    # Test single slice
+    result = slice_test_grouped._positional_selector[0:3:2]
+    expected = slice_test_df.iloc[[0, 1, 4, 5]]
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arg, expected_rows",
+    [
+        [[0, 2], [0, 1, 4, 5]],
+        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
+        [range(0, 3, 2), [0, 1, 4, 5]],
+        [{0, 2}, [0, 1, 4, 5]],
+    ],
+    ids=[
+        "list",
+        "negative",
+        "range",
+        "set",
+    ],
+)
+def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
+    # Test lists of integers and integer valued iterables
+    result = slice_test_grouped._positional_selector[arg]
+    expected = slice_test_df.iloc[expected_rows]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_ints(slice_test_df, slice_test_grouped):
+    # Test tuple of ints
+    result = slice_test_grouped._positional_selector[0, 2, -1]
+    expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_slices(slice_test_df, slice_test_grouped):
+    # Test tuple of slices
+    result = slice_test_grouped._positional_selector[:2, -2:]
+    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_mix(slice_test_df, slice_test_grouped):
+    # Test mixed tuple of ints and slices
+    result = slice_test_grouped._positional_selector[0, 1, -2:]
+    expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arg, expected_rows",
+    [
+        [0, [0, 1, 4]],
+        [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
+        [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
+    ],
+)
+def test_as_index(slice_test_df, arg, expected_rows):
+    # Test the default as_index behaviour
+    result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
+    expected = slice_test_df.iloc[expected_rows]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_doc_examples():
+    # Test the examples in the documentation
+    df = pd.DataFrame(
+        [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
+    )
+
+    grouped = df.groupby("A", as_index=False)
+
+    result = grouped._positional_selector[1:2]
+    expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])
+
+    tm.assert_frame_equal(result, expected)
+
+    result = grouped._positional_selector[1, -1]
+    expected = pd.DataFrame(
+        [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_multiindex():
+    # Test the multiindex mentioned as the use-case in the documentation
+
+    def _make_df_from_data(data):
+        rows = {}
+        for date in data:
+            for level in data[date]:
+                rows[(date, level[0])] = {"A": level[1], "B": level[2]}
+
+        df = pd.DataFrame.from_dict(rows, orient="index")
+        df.index.names = ("Date", "Item")
+        return df
+
+    rng = np.random.default_rng(2)
+    ndates = 100
+    nitems = 20
+    dates = pd.date_range("20130101", periods=ndates, freq="D")
+    items = [f"item {i}" for i in range(nitems)]
+
+    multiindex_data = {}
+    for date in dates:
+        nitems_for_date = nitems - rng.integers(0, 12)
+        levels = [
+            (item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100)
+            for item in items[:nitems_for_date]
+        ]
+        levels.sort(key=lambda x: x[1])
+        multiindex_data[date] = levels
+
+    df = _make_df_from_data(multiindex_data)
+    result = df.groupby("Date", as_index=False).nth(slice(3, -3))
+
+    sliced = {date: values[3:-3] for date, values in multiindex_data.items()}
+    expected = _make_df_from_data(sliced)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
+@pytest.mark.parametrize("method", ["head", "tail"])
+@pytest.mark.parametrize("simulated", [True, False])
+def test_against_head_and_tail(arg, method, simulated):
+    # Test gives the same results as grouped head and tail
+    n_groups = 100
+    n_rows_per_group = 30
+
+    data = {
+        "group": [
+            f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
+        ],
+        "value": [
+            f"group {g} row {j}"
+            for j in range(n_rows_per_group)
+            for g in range(n_groups)
+        ],
+    }
+    df = pd.DataFrame(data)
+    grouped = df.groupby("group", as_index=False)
+    size = arg if arg >= 0 else n_rows_per_group + arg
+
+    if method == "head":
+        result = grouped._positional_selector[:arg]
+
+        if simulated:
+            indices = [
+                j * n_groups + i
+                for j in range(size)
+                for i in range(n_groups)
+                if j * n_groups + i < n_groups * n_rows_per_group
+            ]
+            expected = df.iloc[indices]
+
+        else:
+            expected = grouped.head(arg)
+
+    else:
+        result = grouped._positional_selector[-arg:]
+
+        if simulated:
+            indices = [
+                (n_rows_per_group + j - size) * n_groups + i
+                for j in range(size)
+                for i in range(n_groups)
+                if (n_rows_per_group + j - size) * n_groups + i >= 0
+            ]
+            expected = df.iloc[indices]
+
+        else:
+            expected = grouped.tail(arg)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
+@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
+@pytest.mark.parametrize("step", [None, 1, 5])
+def test_against_df_iloc(start, stop, step):
+    # Test that a single group gives the same results as DataFrame.iloc
+    n_rows = 30
+
+    data = {
+        "group": ["group 0"] * n_rows,
+        "value": list(range(n_rows)),
+    }
+    df = pd.DataFrame(data)
+    grouped = df.groupby("group", as_index=False)
+
+    result = grouped._positional_selector[start:stop:step]
+    expected = df.iloc[start:stop:step]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_series():
+    # Test grouped Series
+    ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
+    grouped = ser.groupby(level=0)
+    result = grouped._positional_selector[1:2]
+    expected = pd.Series([2, 5], index=["a", "b"])
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
+def test_step(step):
+    # Test slice with various step values
+    data = [["x", f"x{i}"] for i in range(5)]
+    data += [["y", f"y{i}"] for i in range(4)]
+    data += [["z", f"z{i}"] for i in range(3)]
+    df = pd.DataFrame(data, columns=["A", "B"])
+
+    grouped = df.groupby("A", as_index=False)
+
+    result = grouped._positional_selector[::step]
+
+    data = [["x", f"x{i}"] for i in range(0, 5, step)]
+    data += [["y", f"y{i}"] for i in range(0, 4, step)]
+    data += [["z", f"z{i}"] for i in range(0, 3, step)]
+
+    index = [0 + i for i in range(0, 5, step)]
+    index += [5 + i for i in range(0, 4, step)]
+    index += [9 + i for i in range(0, 3, step)]
+
+    expected = pd.DataFrame(data, columns=["A", "B"], index=index)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_columns_on_iter():
+    # GitHub issue #44821
+    df = pd.DataFrame({k: range(10) for k in "ABC"})
+
+    # Group-by and select columns
+    cols = ["A", "B"]
+    for _, dg in df.groupby(df.A < 4)[cols]:
+        tm.assert_index_equal(dg.columns, pd.Index(cols))
+        assert "C" not in dg.columns
+
+
+@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
+def test_groupby_duplicated_columns(func):
+    # GH#44924
+    df = pd.DataFrame(
+        {
+            "A": [1, 2],
+            "B": [3, 3],
+            "C": ["G", "G"],
+        }
+    )
+    result = df.groupby("C")[func(["A", "B", "A"])].mean()
+    expected = pd.DataFrame(
+        [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_get_nonexisting_groups():
+    # GH#32492
+    df = pd.DataFrame(
+        data={
+            "A": ["a1", "a2", None],
+            "B": ["b1", "b2", "b1"],
+            "val": [1, 2, 3],
+        }
+    )
+    grps = df.groupby(by=["A", "B"])
+
+    msg = "('a2', 'b1')"
+    with pytest.raises(KeyError, match=msg):
+        grps.get_group(("a2", "b1"))
diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py
new file mode 100644
index 0000000000000000000000000000000000000000..60095663b5a7d0c6fa2896f85b49373c1afff37a
--- /dev/null
+++ b/pandas/tests/groupby/test_libgroupby.py
@@ -0,0 +1,344 @@
+import numpy as np
+import pytest
+
+from pandas._libs import groupby as libgroupby
+from pandas._libs.groupby import (
+    group_cumprod,
+    group_cumsum,
+    group_mean,
+    group_sum,
+    group_var,
+)
+
+from pandas.core.dtypes.common import ensure_platform_int
+
+from pandas import isna
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("dtype, rtol", [("float32", 1e-2), ("float64", 1e-5)])
+class TestGroupVar:
+    def test_group_var_generic_1d(self, dtype, rtol):
+        prng = np.random.default_rng(2)
+
+        out = (np.nan * np.ones((5, 1))).astype(dtype)
+        counts = np.zeros(5, dtype="int64")
+        values = 10 * prng.random((15, 1)).astype(dtype)
+        labels = np.tile(np.arange(5), (3,)).astype("intp")
+
+        expected_out = (
+            np.squeeze(values).reshape((5, 3), order="F").std(axis=1, ddof=1) ** 2
+        )[:, np.newaxis]
+        expected_counts = counts + 3
+
+        group_var(out, counts, values, labels)
+        assert np.allclose(out, expected_out, rtol)
+        tm.assert_numpy_array_equal(counts, expected_counts)
+
+    def test_group_var_generic_1d_flat_labels(self, dtype, rtol):
+        prng = np.random.default_rng(2)
+
+        out = (np.nan * np.ones((1, 1))).astype(dtype)
+        counts = np.zeros(1, dtype="int64")
+        values = 10 * prng.random((5, 1)).astype(dtype)
+        labels = np.zeros(5, dtype="intp")
+
+        expected_out = np.array([[values.std(ddof=1) ** 2]])
+        expected_counts = counts + 5
+
+        group_var(out, counts, values, labels)
+
+        assert np.allclose(out, expected_out, rtol)
+        tm.assert_numpy_array_equal(counts, expected_counts)
+
+    def test_group_var_generic_2d_all_finite(self, dtype, rtol):
+        prng = np.random.default_rng(2)
+
+        out = (np.nan * np.ones((5, 2))).astype(dtype)
+        counts = np.zeros(5, dtype="int64")
+        values = 10 * prng.random((10, 2)).astype(dtype)
+        labels = np.tile(np.arange(5), (2,)).astype("intp")
+
+        expected_out = np.std(values.reshape(2, 5, 2), ddof=1, axis=0) ** 2
+        expected_counts = counts + 2
+
+        group_var(out, counts, values, labels)
+        assert np.allclose(out, expected_out, rtol)
+        tm.assert_numpy_array_equal(counts, expected_counts)
+
+    def test_group_var_generic_2d_some_nan(self, dtype, rtol):
+        prng = np.random.default_rng(2)
+
+        out = (np.nan * np.ones((5, 2))).astype(dtype)
+        counts = np.zeros(5, dtype="int64")
+        values = 10 * prng.random((10, 2)).astype(dtype)
+        values[:, 1] = np.nan
+        labels = np.tile(np.arange(5), (2,)).astype("intp")
+
+        expected_out = np.vstack(
+            [
+                values[:, 0].reshape(5, 2, order="F").std(ddof=1, axis=1) ** 2,
+                np.nan * np.ones(5),
+            ]
+        ).T.astype(dtype)
+        expected_counts = counts + 2
+
+        group_var(out, counts, values, labels)
+        tm.assert_almost_equal(out, expected_out, rtol=0.5e-06)
+        tm.assert_numpy_array_equal(counts, expected_counts)
+
+    def test_group_var_constant(self, dtype, rtol):
+        # Regression test from GH 10448.
+
+        out = np.array([[np.nan]], dtype=dtype)
+        counts = np.array([0], dtype="int64")
+        values = 0.832845131556193 * np.ones((3, 1), dtype=dtype)
+        labels = np.zeros(3, dtype="intp")
+
+        group_var(out, counts, values, labels)
+
+        assert counts[0] == 3
+        assert out[0, 0] >= 0
+        tm.assert_almost_equal(out[0, 0], 0.0)
+
+
+def test_group_var_large_inputs():
+    dtype = np.float64
+    prng = np.random.default_rng(2)
+
+    out = np.array([[np.nan]], dtype=dtype)
+    counts = np.array([0], dtype="int64")
+    values = (prng.random((10**6, 1)) + 10**12).astype(dtype)
+    labels = np.zeros(10**6, dtype="intp")
+
+    group_var(out, counts, values, labels)
+
+    assert counts[0] == 10**6
+    tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
+
+
+@pytest.mark.parametrize("dtype", ["float32", "float64"])
+def test_group_ohlc(dtype):
+    obj = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)
+
+    bins = np.array([6, 12, 20])
+    out = np.zeros((3, 4), dtype)
+    counts = np.zeros(len(out), dtype=np.int64)
+    labels = ensure_platform_int(np.repeat(np.arange(3), np.diff(np.r_[0, bins])))
+
+    func = libgroupby.group_ohlc
+    func(out, counts, obj[:, None], labels)
+
+    def _ohlc(group):
+        if isna(group).all():
+            return np.repeat(np.nan, 4)
+        return [group[0], group.max(), group.min(), group[-1]]
+
+    expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), _ohlc(obj[12:])])
+
+    tm.assert_almost_equal(out, expected)
+    tm.assert_numpy_array_equal(counts, np.array([6, 6, 8], dtype=np.int64))
+
+    obj[:6] = np.nan
+    func(out, counts, obj[:, None], labels)
+    expected[0] = np.nan
+    tm.assert_almost_equal(out, expected)
+
+
+@pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float32, np.float64])
+@pytest.mark.parametrize(
+    "pd_op, np_op",
+    [
+        (group_cumsum, np.cumsum),
+        (group_cumprod, np.cumprod),
+    ],
+)
+def test_cython_group_transform(dtype, pd_op, np_op):
+    # see gh-4095
+    is_datetimelike = False
+
+    data = np.array([[1], [2], [3], [4]], dtype=dtype)
+    answer = np.zeros_like(data)
+
+    labels = np.array([0, 0, 0, 0], dtype=np.intp)
+    ngroups = 1
+    pd_op(answer, data, labels, ngroups, is_datetimelike)
+
+    tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False)
+
+
+def test_cython_group_transform_algos():
+    # see gh-4095
+    is_datetimelike = False
+
+    # with nans
+    labels = np.array([0, 0, 0, 0, 0], dtype=np.intp)
+    ngroups = 1
+
+    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
+    actual = np.zeros_like(data)
+    actual.fill(np.nan)
+    group_cumprod(actual, data, labels, ngroups, is_datetimelike)
+    expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
+    tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+    actual = np.zeros_like(data)
+    actual.fill(np.nan)
+    group_cumsum(actual, data, labels, ngroups, is_datetimelike)
+    expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
+    tm.assert_numpy_array_equal(actual[:, 0], expected)
+
+    # timedelta
+    is_datetimelike = True
+    data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
+    actual = np.zeros_like(data, dtype="int64")
+    group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
+    expected = np.array(
+        [
+            np.timedelta64(1, "ns"),
+            np.timedelta64(2, "ns"),
+            np.timedelta64(3, "ns"),
+            np.timedelta64(4, "ns"),
+            np.timedelta64(5, "ns"),
+        ]
+    )
+    tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
+
+
+def test_cython_group_mean_datetimelike():
+    actual = np.zeros(shape=(1, 1), dtype="float64")
+    counts = np.array([0], dtype="int64")
+    data = (
+        np.array(
+            [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
+            dtype="m8[ns]",
+        )[:, None]
+        .view("int64")
+        .astype("float64")
+    )
+    labels = np.zeros(len(data), dtype=np.intp)
+
+    group_mean(actual, counts, data, labels, is_datetimelike=True)
+
+    tm.assert_numpy_array_equal(actual[:, 0], np.array([3], dtype="float64"))
+
+
+def test_cython_group_mean_wrong_min_count():
+    actual = np.zeros(shape=(1, 1), dtype="float64")
+    counts = np.zeros(1, dtype="int64")
+    data = np.zeros(1, dtype="float64")[:, None]
+    labels = np.zeros(1, dtype=np.intp)
+
+    with pytest.raises(AssertionError, match="min_count"):
+        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
+
+
+def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
+    actual = np.zeros(shape=(1, 1), dtype="float64")
+    counts = np.array([0], dtype="int64")
+    data = (
+        np.array(
+            [np.timedelta64("NaT"), np.timedelta64("NaT")],
+            dtype="m8[ns]",
+        )[:, None]
+        .view("int64")
+        .astype("float64")
+    )
+    labels = np.zeros(len(data), dtype=np.intp)
+
+    group_mean(actual, counts, data, labels, is_datetimelike=False)
+
+    tm.assert_numpy_array_equal(
+        actual[:, 0], np.array(np.divide(np.add(data[0], data[1]), 2), dtype="float64")
+    )
+
+
+def test_cython_group_mean_Inf_at_beginning_and_end():
+    # GH 50367
+    actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64")
+    counts = np.array([0, 0], dtype="int64")
+    data = np.array(
+        [[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]],
+        dtype="float64",
+    )
+    labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp)
+
+    group_mean(actual, counts, data, labels, is_datetimelike=False)
+
+    expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64")
+
+    tm.assert_numpy_array_equal(
+        actual,
+        expected,
+    )
+
+
+@pytest.mark.parametrize(
+    "values, out",
+    [
+        ([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]),
+        ([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]),
+        ([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]),
+        ([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]),
+    ],
+)
+def test_cython_group_sum_Inf_at_beginning_and_end(values, out):
+    # GH #53606
+    actual = np.array([[np.nan], [np.nan]], dtype="float64")
+    counts = np.array([0, 0], dtype="int64")
+    data = np.array(values, dtype="float64")
+    labels = np.array([0, 1, 1], dtype=np.intp)
+
+    group_sum(actual, counts, data, labels, None, is_datetimelike=False)
+
+    expected = np.array(out, dtype="float64")
+
+    tm.assert_numpy_array_equal(
+        actual,
+        expected,
+    )
+
+
+@pytest.mark.parametrize(
+    "values, expected_values",
+    [
+        (np.finfo(np.float64).max, [[np.inf]]),
+        (np.finfo(np.float64).min, [[-np.inf]]),
+        (
+            np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).max * 1j),
+            [[complex(-np.inf, np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).min * 1j),
+            [[complex(np.inf, -np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).max + np.finfo(np.float64).max * 1j),
+            [[complex(np.inf, np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).min + np.finfo(np.float64).min * 1j),
+            [[complex(-np.inf, -np.inf)]],
+        ),
+        (
+            np.complex128(3.0 + np.finfo(np.float64).min * 1j),
+            [[complex(9.0, -np.inf)]],
+        ),
+        (
+            np.complex128(np.finfo(np.float64).max + 3 * 1j),
+            [[complex(np.inf, 9.0)]],
+        ),
+    ],
+)
+def test_cython_group_sum_overflow(values, expected_values):
+    # GH-60303
+    data = np.array([[values] for _ in range(3)])
+    labels = np.array([0, 0, 0], dtype=np.intp)
+    counts = np.array([0], dtype="int64")
+
+    expected = np.array(expected_values, dtype=values.dtype)
+    actual = np.zeros_like(expected)
+
+    group_sum(actual, counts, data, labels, None, is_datetimelike=False)
+
+    tm.assert_numpy_array_equal(actual, expected)
diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b590c50371e9e6f22247c73e94785b16678a2d4
--- /dev/null
+++ b/pandas/tests/groupby/test_missing.py
@@ -0,0 +1,89 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+)
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_groupby_column_index_name_lost_fill_funcs(func):
+    # GH: 29764 groupby loses index sometimes
+    df = DataFrame(
+        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
+        columns=Index(["type", "a", "b"], name="idx"),
+    )
+    df_grouped = df.groupby(["type"])[["a", "b"]]
+    result = getattr(df_grouped, func)().columns
+    expected = Index(["a", "b"], name="idx")
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_groupby_fill_duplicate_column_names(func):
+    # GH: 25610 ValueError with duplicate column names
+    df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
+    df2 = DataFrame({"field1": [1, np.nan, 4]})
+    df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
+    expected = DataFrame(
+        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
+    )
+    result = getattr(df_grouped, func)()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["ffill", "bfill"])
+@pytest.mark.parametrize("has_nan_group", [True, False])
+def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
+    # GH 34725
+
+    df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])
+
+    ridx = [-1, 0, -1, -1, 1, -1]
+    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
+
+    group_b = np.nan if has_nan_group else "b"
+    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)
+
+    grouped = df.groupby(by="group_col", dropna=dropna)
+    result = getattr(grouped, method)(limit=None)
+
+    expected_rows = {
+        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
+        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
+        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
+        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
+        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
+        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
+        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
+        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
+    }
+
+    ridx = expected_rows.get((method, dropna, has_nan_group))
+    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
+    # columns are a 'take' on df.columns, which are object dtype
+    expected.columns = expected.columns.astype(object)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
+@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
+def test_min_count(func, min_count, value):
+    # GH#37821
+    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
+    result = getattr(df.groupby("a"), func)(min_count=min_count)
+    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_indices_with_missing():
+    # GH 9304
+    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
+    g = df.groupby(["a", "b"])
+    result = g.indices
+    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
+    assert result == expected
diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..082319d8479f02b9c61d08042269e77d56156898
--- /dev/null
+++ b/pandas/tests/groupby/test_numba.py
@@ -0,0 +1,82 @@
+import pytest
+
+from pandas.compat import is_platform_arm
+
+from pandas import (
+    DataFrame,
+    Series,
+    option_context,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
+
+
+@pytest.mark.filterwarnings("ignore")
+# Filter warnings when parallel=True and the function can't be parallelized by Numba
+class TestEngine:
+    def test_cython_vs_numba_frame(
+        self, sort, nogil, parallel, nopython, numba_supported_reductions
+    ):
+        func, kwargs = numba_supported_reductions
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        gb = df.groupby("a", sort=sort)
+        result = getattr(gb, func)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
+        )
+        expected = getattr(gb, func)(**kwargs)
+        tm.assert_frame_equal(result, expected)
+
+    def test_cython_vs_numba_getitem(
+        self, sort, nogil, parallel, nopython, numba_supported_reductions
+    ):
+        func, kwargs = numba_supported_reductions
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        gb = df.groupby("a", sort=sort)["c"]
+        result = getattr(gb, func)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
+        )
+        expected = getattr(gb, func)(**kwargs)
+        tm.assert_series_equal(result, expected)
+
+    def test_cython_vs_numba_series(
+        self, sort, nogil, parallel, nopython, numba_supported_reductions
+    ):
+        func, kwargs = numba_supported_reductions
+        ser = Series(range(3), index=[1, 2, 1], name="foo")
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        gb = ser.groupby(level=0, sort=sort)
+        result = getattr(gb, func)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
+        )
+        expected = getattr(gb, func)(**kwargs)
+        tm.assert_series_equal(result, expected)
+
+    def test_as_index_false_unsupported(self, numba_supported_reductions):
+        func, kwargs = numba_supported_reductions
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        gb = df.groupby("a", as_index=False)
+        with pytest.raises(NotImplementedError, match="as_index=False"):
+            getattr(gb, func)(engine="numba", **kwargs)
+
+    def test_no_engine_doesnt_raise(self):
+        # GH55520
+        df = DataFrame({"a": [3, 2, 3, 2], "b": range(4), "c": range(1, 5)})
+        gb = df.groupby("a")
+        # Make sure behavior of functions w/out engine argument don't raise
+        # when the global use_numba option is set
+        with option_context("compute.use_numba", True):
+            res = gb.agg({"b": "first"})
+        expected = gb.agg({"b": "first"})
+        tm.assert_frame_equal(res, expected)
diff --git a/pandas/tests/groupby/test_numeric_only.py b/pandas/tests/groupby/test_numeric_only.py
new file mode 100644
index 0000000000000000000000000000000000000000..b79ca8bf1ee3ae0e62b550d9c4322d7e10d7cf3a
--- /dev/null
+++ b/pandas/tests/groupby/test_numeric_only.py
@@ -0,0 +1,445 @@
+import re
+
+import pytest
+
+from pandas._libs import lib
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+
+
+class TestNumericOnly:
+    # make sure that we are passing thru kwargs to our agg functions
+
+    @pytest.fixture
+    def df(self):
+        # GH3668
+        # GH5724
+        df = DataFrame(
+            {
+                "group": [1, 1, 2],
+                "int": [1, 2, 3],
+                "float": [4.0, 5.0, 6.0],
+                "string": Series(["a", "b", "c"], dtype="str"),
+                "object": Series(["a", "b", "c"], dtype=object),
+                "category_string": Series(list("abc")).astype("category"),
+                "category_int": [7, 8, 9],
+                "datetime": date_range("20130101", periods=3),
+                "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
+                "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
+            },
+            columns=[
+                "group",
+                "int",
+                "float",
+                "string",
+                "object",
+                "category_string",
+                "category_int",
+                "datetime",
+                "datetimetz",
+                "timedelta",
+            ],
+        )
+        return df
+
+    @pytest.mark.parametrize("method", ["mean", "median"])
+    def test_averages(self, df, method):
+        # mean / median
+        expected_columns_numeric = Index(["int", "float", "category_int"])
+
+        gb = df.groupby("group")
+        expected = DataFrame(
+            {
+                "category_int": [7.5, 9],
+                "float": [4.5, 6.0],
+                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
+                "int": [1.5, 3],
+                "datetime": [
+                    Timestamp("2013-01-01 12:00:00"),
+                    Timestamp("2013-01-03 00:00:00"),
+                ],
+                "datetimetz": [
+                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
+                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
+                ],
+            },
+            index=Index([1, 2], name="group"),
+            columns=[
+                "int",
+                "float",
+                "category_int",
+            ],
+        )
+
+        result = getattr(gb, method)(numeric_only=True)
+        tm.assert_frame_equal(result.reindex_like(expected), expected)
+
+        expected_columns = expected.columns
+
+        self._check(df, method, expected_columns, expected_columns_numeric)
+
+    @pytest.mark.parametrize("method", ["min", "max"])
+    def test_extrema(self, df, method):
+        # TODO: min, max *should* handle
+        # categorical (ordered) dtype
+
+        expected_columns = Index(
+            [
+                "int",
+                "float",
+                "string",
+                "category_int",
+                "datetime",
+                "datetimetz",
+                "timedelta",
+            ]
+        )
+        expected_columns_numeric = expected_columns
+
+        self._check(df, method, expected_columns, expected_columns_numeric)
+
+    @pytest.mark.parametrize("method", ["first", "last"])
+    def test_first_last(self, df, method):
+        expected_columns = Index(
+            [
+                "int",
+                "float",
+                "string",
+                "object",
+                "category_string",
+                "category_int",
+                "datetime",
+                "datetimetz",
+                "timedelta",
+            ]
+        )
+        expected_columns_numeric = expected_columns
+
+        self._check(df, method, expected_columns, expected_columns_numeric)
+
+    @pytest.mark.parametrize("method", ["sum", "cumsum"])
+    def test_sum_cumsum(self, df, method):
+        expected_columns_numeric = Index(["int", "float", "category_int"])
+        expected_columns = Index(
+            ["int", "float", "string", "category_int", "timedelta"]
+        )
+        if method == "cumsum":
+            # cumsum loses string
+            expected_columns = Index(["int", "float", "category_int", "timedelta"])
+
+        self._check(df, method, expected_columns, expected_columns_numeric)
+
+    @pytest.mark.parametrize("method", ["prod", "cumprod"])
+    def test_prod_cumprod(self, df, method):
+        expected_columns = Index(["int", "float", "category_int"])
+        expected_columns_numeric = expected_columns
+
+        self._check(df, method, expected_columns, expected_columns_numeric)
+
+    @pytest.mark.parametrize("method", ["cummin", "cummax"])
+    def test_cummin_cummax(self, df, method):
+        # like min, max, but don't include strings
+        expected_columns = Index(
+            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
+        )
+
+        # GH#15561: numeric_only=False set by default like min/max
+        expected_columns_numeric = expected_columns
+
+        self._check(df, method, expected_columns, expected_columns_numeric)
+
+    def _check(self, df, method, expected_columns, expected_columns_numeric):
+        gb = df.groupby("group")
+
+        # object dtypes for transformations are not implemented in Cython and
+        # have no Python fallback
+        exception = (
+            (NotImplementedError, TypeError) if method.startswith("cum") else TypeError
+        )
+
+        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
+            # The methods default to numeric_only=False and raise TypeError
+            msg = "|".join(
+                [
+                    "Categorical is not ordered",
+                    f"Cannot perform {method} with non-ordered Categorical",
+                    re.escape(f"agg function failed [how->{method},dtype->object]"),
+                    # cumsum/cummin/cummax/cumprod
+                    "function is not implemented for this dtype",
+                    f"dtype 'str' does not support operation '{method}'",
+                ]
+            )
+            with pytest.raises(exception, match=msg):
+                getattr(gb, method)()
+        elif method in ("sum", "mean", "median", "prod"):
+            msg = "|".join(
+                [
+                    "category type does not support sum operations",
+                    re.escape(f"agg function failed [how->{method},dtype->object]"),
+                    re.escape(f"agg function failed [how->{method},dtype->string]"),
+                    f"dtype 'str' does not support operation '{method}'",
+                ]
+            )
+            with pytest.raises(exception, match=msg):
+                getattr(gb, method)()
+        else:
+            result = getattr(gb, method)()
+            tm.assert_index_equal(result.columns, expected_columns_numeric)
+
+        if method not in ("first", "last"):
+            msg = "|".join(
+                [
+                    "Categorical is not ordered",
+                    "category type does not support",
+                    "function is not implemented for this dtype",
+                    f"Cannot perform {method} with non-ordered Categorical",
+                    re.escape(f"agg function failed [how->{method},dtype->object]"),
+                    re.escape(f"agg function failed [how->{method},dtype->string]"),
+                    f"dtype 'str' does not support operation '{method}'",
+                ]
+            )
+            with pytest.raises(exception, match=msg):
+                getattr(gb, method)(numeric_only=False)
+        else:
+            result = getattr(gb, method)(numeric_only=False)
+            tm.assert_index_equal(result.columns, expected_columns)
+
+
+@pytest.mark.parametrize(
+    "kernel, has_arg",
+    [
+        ("all", False),
+        ("any", False),
+        ("bfill", False),
+        ("corr", True),
+        ("corrwith", True),
+        ("cov", True),
+        ("cummax", True),
+        ("cummin", True),
+        ("cumprod", True),
+        ("cumsum", True),
+        ("diff", False),
+        ("ffill", False),
+        ("first", True),
+        ("idxmax", True),
+        ("idxmin", True),
+        ("last", True),
+        ("max", True),
+        ("mean", True),
+        ("median", True),
+        ("min", True),
+        ("nth", False),
+        ("nunique", False),
+        ("pct_change", False),
+        ("prod", True),
+        ("quantile", True),
+        ("sem", True),
+        ("skew", True),
+        ("kurt", True),
+        ("std", True),
+        ("sum", True),
+        ("var", True),
+    ],
+)
+@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
+@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
+def test_numeric_only(kernel, has_arg, numeric_only, keys):
+    # GH#46072
+    # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False
+    # has_arg: Whether the op has a numeric_only arg
+    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})
+
+    args = get_groupby_method_args(kernel, df)
+    kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}
+
+    gb = df.groupby(keys)
+    method = getattr(gb, kernel)
+    if has_arg and numeric_only is True:
+        # Cases where b does not appear in the result
+        if kernel == "corrwith":
+            warn = Pandas4Warning
+            msg = "DataFrameGroupBy.corrwith is deprecated"
+        else:
+            warn = None
+            msg = ""
+        with tm.assert_produces_warning(warn, match=msg):
+            result = method(*args, **kwargs)
+        assert "b" not in result.columns
+    elif (
+        # kernels that work on any dtype and have numeric_only arg
+        kernel in ("first", "last")
+        or (
+            # kernels that work on any dtype and don't have numeric_only arg
+            kernel in ("any", "all", "bfill", "ffill", "nth", "nunique")
+            and numeric_only is lib.no_default
+        )
+    ):
+        result = method(*args, **kwargs)
+        assert "b" in result.columns
+    elif has_arg:
+        assert numeric_only is not True
+        # kernels that are successful on any dtype were above; this will fail
+
+        # object dtypes for transformations are not implemented in Cython and
+        # have no Python fallback
+        exception = NotImplementedError if kernel.startswith("cum") else TypeError
+
+        msg = "|".join(
+            [
+                "not allowed for this dtype",
+                "cannot be performed against 'object' dtypes",
+                "must be a string or a real number",
+                "unsupported operand type",
+                "function is not implemented for this dtype",
+                re.escape(f"agg function failed [how->{kernel},dtype->object]"),
+            ]
+        )
+        if kernel == "quantile":
+            msg = "dtype 'object' does not support operation 'quantile'"
+        elif kernel == "idxmin":
+            msg = "'<' not supported between instances of 'type' and 'type'"
+        elif kernel == "idxmax":
+            msg = "'>' not supported between instances of 'type' and 'type'"
+        with pytest.raises(exception, match=msg):
+            if kernel == "corrwith":
+                warn = Pandas4Warning
+                msg = "DataFrameGroupBy.corrwith is deprecated"
+            else:
+                warn = None
+                msg = ""
+            with tm.assert_produces_warning(warn, match=msg):
+                method(*args, **kwargs)
+    elif not has_arg and numeric_only is not lib.no_default:
+        with pytest.raises(
+            TypeError, match="got an unexpected keyword argument 'numeric_only'"
+        ):
+            method(*args, **kwargs)
+    else:
+        assert kernel in ("diff", "pct_change")
+        assert numeric_only is lib.no_default
+        # Doesn't have numeric_only argument and fails on nuisance columns
+        with pytest.raises(TypeError, match=r"unsupported operand type"):
+            method(*args, **kwargs)
+
+
+@pytest.mark.parametrize("dtype", [bool, int, float, object])
+def test_deprecate_numeric_only_series(dtype, groupby_func, request):
+    # GH#46560
+    grouper = [0, 0, 1]
+
+    ser = Series([1, 0, 0], dtype=dtype)
+    gb = ser.groupby(grouper)
+
+    if groupby_func == "corrwith":
+        # corrwith is not implemented on SeriesGroupBy
+        assert not hasattr(gb, groupby_func)
+        return
+
+    method = getattr(gb, groupby_func)
+
+    expected_ser = Series([1, 0, 0])
+    expected_gb = expected_ser.groupby(grouper)
+    expected_method = getattr(expected_gb, groupby_func)
+
+    args = get_groupby_method_args(groupby_func, ser)
+
+    fails_on_numeric_object = (
+        "corr",
+        "cov",
+        "cummax",
+        "cummin",
+        "cumprod",
+        "cumsum",
+        "quantile",
+    )
+    # ops that give an object result on object input
+    obj_result = (
+        "first",
+        "last",
+        "nth",
+        "bfill",
+        "ffill",
+        "shift",
+        "sum",
+        "diff",
+        "pct_change",
+        "var",
+        "mean",
+        "median",
+        "min",
+        "max",
+        "prod",
+        "skew",
+        "kurt",
+    )
+
+    # Test default behavior; kernels that fail may be enabled in the future but kernels
+    # that succeed should not be allowed to fail (without deprecation, at least)
+    if groupby_func in fails_on_numeric_object and dtype is object:
+        if groupby_func == "quantile":
+            msg = "dtype 'object' does not support operation 'quantile'"
+        else:
+            msg = "is not supported for object dtype"
+        with pytest.raises(TypeError, match=msg):
+            method(*args)
+    elif dtype is object:
+        result = method(*args)
+        expected = expected_method(*args)
+        if groupby_func in obj_result:
+            expected = expected.astype(object)
+        tm.assert_series_equal(result, expected)
+
+    has_numeric_only = (
+        "first",
+        "last",
+        "max",
+        "mean",
+        "median",
+        "min",
+        "prod",
+        "quantile",
+        "sem",
+        "skew",
+        "kurt",
+        "std",
+        "sum",
+        "var",
+        "cummax",
+        "cummin",
+        "cumprod",
+        "cumsum",
+    )
+    if groupby_func not in has_numeric_only:
+        msg = "got an unexpected keyword argument 'numeric_only'"
+        with pytest.raises(TypeError, match=msg):
+            method(*args, numeric_only=True)
+    elif dtype is object:
+        msg = "|".join(
+            [
+                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
+                "Series.skew does not allow numeric_only=True with non-numeric",
+                "cum(sum|prod|min|max) is not supported for object dtype",
+                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
+            ]
+        )
+        with pytest.raises(TypeError, match=msg):
+            method(*args, numeric_only=True)
+    elif dtype == bool and groupby_func == "quantile":
+        msg = "Cannot use quantile with bool dtype"
+        with pytest.raises(TypeError, match=msg):
+            # GH#51424
+            method(*args, numeric_only=False)
+    else:
+        result = method(*args, numeric_only=True)
+        expected = method(*args, numeric_only=False)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_pipe.py b/pandas/tests/groupby/test_pipe.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee59a93695bcf84bcfcd8f1add8120e2c04004f5
--- /dev/null
+++ b/pandas/tests/groupby/test_pipe.py
@@ -0,0 +1,80 @@
+import numpy as np
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+)
+import pandas._testing as tm
+
+
+def test_pipe():
+    # Test the pipe method of DataFrameGroupBy.
+    # Issue #17871
+
+    random_state = np.random.default_rng(2)
+
+    df = DataFrame(
+        {
+            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
+            "B": random_state.standard_normal(8),
+            "C": random_state.standard_normal(8),
+        }
+    )
+
+    def f(dfgb):
+        return dfgb.B.max() - dfgb.C.min().min()
+
+    def square(srs):
+        return srs**2
+
+    # Note that the transformations are
+    # GroupBy -> Series
+    # Series -> Series
+    # This then chains the GroupBy.pipe and the
+    # NDFrame.pipe methods
+    result = df.groupby("A").pipe(f).pipe(square)
+
+    index = Index(["bar", "foo"], name="A")
+    expected = pd.Series([3.749306591013693, 6.717707873081384], name="B", index=index)
+
+    tm.assert_series_equal(expected, result)
+
+
+def test_pipe_args():
+    # Test passing args to the pipe method of DataFrameGroupBy.
+    # Issue #17871
+
+    df = DataFrame(
+        {
+            "group": ["A", "A", "B", "B", "C"],
+            "x": [1.0, 2.0, 3.0, 2.0, 5.0],
+            "y": [10.0, 100.0, 1000.0, -100.0, -1000.0],
+        }
+    )
+
+    def f(dfgb, arg1):
+        filtered = dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False)
+        return filtered.groupby("group")
+
+    def g(dfgb, arg2):
+        return dfgb.sum() / dfgb.sum().sum() + arg2
+
+    def h(df, arg3):
+        return df.x + df.y - arg3
+
+    result = df.groupby("group").pipe(f, 0).pipe(g, 10).pipe(h, 100)
+
+    # Assert the results here
+    index = Index(["A", "B"], name="group")
+    expected = pd.Series([-79.5160891089, -78.4839108911], index=index)
+
+    tm.assert_series_equal(result, expected)
+
+    # test SeriesGroupby.pipe
+    ser = pd.Series([1, 1, 2, 2, 3, 3])
+    result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count())
+
+    expected = pd.Series([4, 8, 12], index=Index([1, 2, 3], dtype=np.int64))
+
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
new file mode 100644
index 0000000000000000000000000000000000000000..652c3808f5f1d13318a577533365ee552151f32d
--- /dev/null
+++ b/pandas/tests/groupby/test_raises.py
@@ -0,0 +1,741 @@
+# Only tests that raise an error and have no better location should go here.
+# Tests for specific groupby methods should go in their respective
+# test file.
+
+import datetime
+import re
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+from pandas import (
+    Categorical,
+    DataFrame,
+    Grouper,
+    Series,
+)
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+
+
+@pytest.fixture(
+    params=[
+        "a",
+        ["a"],
+        ["a", "b"],
+        Grouper(key="a"),
+        lambda x: x % 2,
+        [0, 0, 0, 1, 2, 2, 2, 3, 3],
+        np.array([0, 0, 0, 1, 2, 2, 2, 3, 3]),
+        dict(zip(range(9), [0, 0, 0, 1, 2, 2, 2, 3, 3], strict=True)),
+        Series([1, 1, 1, 1, 1, 2, 2, 2, 2]),
+        [Series([1, 1, 1, 1, 1, 2, 2, 2, 2]), Series([3, 3, 4, 4, 4, 4, 4, 3, 3])],
+    ]
+)
+def by(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def groupby_series(request):
+    return request.param
+
+
+@pytest.fixture
+def df_with_string_col():
+    df = DataFrame(
+        {
+            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
+            "c": range(9),
+            "d": list("xyzwtyuio"),
+        }
+    )
+    return df
+
+
+@pytest.fixture
+def df_with_datetime_col():
+    df = DataFrame(
+        {
+            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
+            "c": range(9),
+            "d": datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
+        }
+    )
+    return df
+
+
+@pytest.fixture
+def df_with_cat_col():
+    df = DataFrame(
+        {
+            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
+            "c": range(9),
+            "d": Categorical(
+                ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
+                categories=["a", "b", "c", "d"],
+                ordered=True,
+            ),
+        }
+    )
+    return df
+
+
+def _call_and_check(
+    klass, msg, how, gb, groupby_func, args, warn_category=None, warn_msg=""
+):
+    with tm.assert_produces_warning(
+        warn_category, match=warn_msg, check_stacklevel=False
+    ):
+        if klass is None:
+            if how == "method":
+                getattr(gb, groupby_func)(*args)
+            elif how == "agg":
+                gb.agg(groupby_func, *args)
+            else:
+                gb.transform(groupby_func, *args)
+        else:
+            with pytest.raises(klass, match=msg):
+                if how == "method":
+                    getattr(gb, groupby_func)(*args)
+                elif how == "agg":
+                    gb.agg(groupby_func, *args)
+                else:
+                    gb.transform(groupby_func, *args)
+
+
+@pytest.mark.parametrize("how", ["method", "agg", "transform"])
+def test_groupby_raises_string(
+    how, by, groupby_series, groupby_func, df_with_string_col, using_infer_string
+):
+    df = df_with_string_col
+    args = get_groupby_method_args(groupby_func, df)
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+        if groupby_func == "corrwith":
+            assert not hasattr(gb, "corrwith")
+            return
+
+    klass, msg = {
+        "all": (None, ""),
+        "any": (None, ""),
+        "bfill": (None, ""),
+        "corrwith": (TypeError, "Could not convert"),
+        "count": (None, ""),
+        "cumcount": (None, ""),
+        "cummax": (
+            (NotImplementedError, TypeError),
+            "(function|cummax) is not (implemented|supported) for (this|object) dtype",
+        ),
+        "cummin": (
+            (NotImplementedError, TypeError),
+            "(function|cummin) is not (implemented|supported) for (this|object) dtype",
+        ),
+        "cumprod": (
+            (NotImplementedError, TypeError),
+            "(function|cumprod) is not (implemented|supported) for (this|object) dtype",
+        ),
+        "cumsum": (
+            (NotImplementedError, TypeError),
+            "(function|cumsum) is not (implemented|supported) for (this|object) dtype",
+        ),
+        "diff": (TypeError, "unsupported operand type"),
+        "ffill": (None, ""),
+        "first": (None, ""),
+        "idxmax": (None, ""),
+        "idxmin": (None, ""),
+        "last": (None, ""),
+        "max": (None, ""),
+        "mean": (
+            TypeError,
+            re.escape("agg function failed [how->mean,dtype->object]"),
+        ),
+        "median": (
+            TypeError,
+            re.escape("agg function failed [how->median,dtype->object]"),
+        ),
+        "min": (None, ""),
+        "ngroup": (None, ""),
+        "nunique": (None, ""),
+        "pct_change": (TypeError, "unsupported operand type"),
+        "prod": (
+            TypeError,
+            re.escape("agg function failed [how->prod,dtype->object]"),
+        ),
+        "quantile": (TypeError, "dtype 'object' does not support operation 'quantile'"),
+        "rank": (None, ""),
+        "sem": (ValueError, "could not convert string to float"),
+        "shift": (None, ""),
+        "size": (None, ""),
+        "skew": (ValueError, "could not convert string to float"),
+        "kurt": (ValueError, "could not convert string to float"),
+        "std": (ValueError, "could not convert string to float"),
+        "sum": (None, ""),
+        "var": (
+            TypeError,
+            re.escape("agg function failed [how->var,dtype->"),
+        ),
+    }[groupby_func]
+
+    if using_infer_string:
+        if groupby_func in [
+            "prod",
+            "mean",
+            "median",
+            "cumsum",
+            "cumprod",
+            "std",
+            "sem",
+            "var",
+            "skew",
+            "kurt",
+            "quantile",
+        ]:
+            msg = f"dtype 'str' does not support operation '{groupby_func}'"
+            if groupby_func in ["sem", "std", "skew", "kurt"]:
+                # The object-dtype raises ValueError when trying to convert to numeric.
+                klass = TypeError
+        elif groupby_func == "pct_change" and df["d"].dtype.storage == "pyarrow":
+            # This doesn't go through EA._groupby_op so the message isn't controlled
+            #  there.
+            msg = "operation 'truediv' not supported for dtype 'str' with dtype 'str'"
+        elif groupby_func == "diff" and df["d"].dtype.storage == "pyarrow":
+            # This doesn't go through EA._groupby_op so the message isn't controlled
+            #  there.
+            msg = "operation 'sub' not supported for dtype 'str' with dtype 'str'"
+
+        elif groupby_func in ["cummin", "cummax"]:
+            msg = msg.replace("object", "str")
+        elif groupby_func == "corrwith":
+            msg = "Cannot perform reduction 'mean' with string dtype"
+
+    if groupby_func == "corrwith":
+        warn_category = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn_category = None
+        warn_msg = ""
+    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
+
+
+@pytest.mark.parametrize("how", ["agg", "transform"])
+def test_groupby_raises_string_udf(how, by, groupby_series, df_with_string_col):
+    df = df_with_string_col
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+    def func(x):
+        raise TypeError("Test error message")
+
+    with pytest.raises(TypeError, match="Test error message"):
+        getattr(gb, how)(func)
+
+
+@pytest.mark.parametrize("how", ["agg", "transform"])
+@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
+def test_groupby_raises_string_np(
+    how,
+    by,
+    groupby_series,
+    groupby_func_np,
+    df_with_string_col,
+    using_infer_string,
+):
+    # GH#50749
+    df = df_with_string_col
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+    klass, msg = {
+        np.sum: (None, ""),
+        np.mean: (
+            TypeError,
+            "Could not convert string .* to numeric|"
+            "Cannot perform reduction 'mean' with string dtype",
+        ),
+    }[groupby_func_np]
+
+    if using_infer_string:
+        if groupby_func_np is np.mean:
+            klass = TypeError
+        msg = f"Cannot perform reduction '{groupby_func_np.__name__}' with string dtype"
+
+    _call_and_check(klass, msg, how, gb, groupby_func_np, ())
+
+
+@pytest.mark.parametrize("how", ["method", "agg", "transform"])
+def test_groupby_raises_datetime(
+    how, by, groupby_series, groupby_func, df_with_datetime_col
+):
+    df = df_with_datetime_col
+    args = get_groupby_method_args(groupby_func, df)
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+        if groupby_func == "corrwith":
+            assert not hasattr(gb, "corrwith")
+            return
+
+    klass, msg = {
+        "all": (TypeError, "'all' with datetime64 dtypes is no longer supported"),
+        "any": (TypeError, "'any' with datetime64 dtypes is no longer supported"),
+        "bfill": (None, ""),
+        "corrwith": (TypeError, "cannot perform __mul__ with this index type"),
+        "count": (None, ""),
+        "cumcount": (None, ""),
+        "cummax": (None, ""),
+        "cummin": (None, ""),
+        "cumprod": (TypeError, "datetime64 type does not support operation 'cumprod'"),
+        "cumsum": (TypeError, "datetime64 type does not support operation 'cumsum'"),
+        "diff": (None, ""),
+        "ffill": (None, ""),
+        "first": (None, ""),
+        "idxmax": (None, ""),
+        "idxmin": (None, ""),
+        "last": (None, ""),
+        "max": (None, ""),
+        "mean": (None, ""),
+        "median": (None, ""),
+        "min": (None, ""),
+        "ngroup": (None, ""),
+        "nunique": (None, ""),
+        "pct_change": (TypeError, "cannot perform __truediv__ with this index type"),
+        "prod": (TypeError, "datetime64 type does not support operation 'prod'"),
+        "quantile": (None, ""),
+        "rank": (None, ""),
+        "sem": (None, ""),
+        "shift": (None, ""),
+        "size": (None, ""),
+        "skew": (
+            TypeError,
+            "|".join(
+                [
+                    r"dtype datetime64\[ns\] does not support operation",
+                    "datetime64 type does not support operation 'skew'",
+                ]
+            ),
+        ),
+        "kurt": (
+            TypeError,
+            "|".join(
+                [
+                    r"dtype datetime64\[ns\] does not support operation",
+                    "datetime64 type does not support operation 'kurt'",
+                ]
+            ),
+        ),
+        "std": (None, ""),
+        "sum": (TypeError, "datetime64 type does not support operation 'sum"),
+        "var": (TypeError, "datetime64 type does not support operation 'var'"),
+    }[groupby_func]
+
+    if groupby_func == "corrwith":
+        warn_category = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn_category = None
+        warn_msg = ""
+    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
+
+
+@pytest.mark.parametrize("how", ["agg", "transform"])
+def test_groupby_raises_datetime_udf(how, by, groupby_series, df_with_datetime_col):
+    df = df_with_datetime_col
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+    def func(x):
+        raise TypeError("Test error message")
+
+    with pytest.raises(TypeError, match="Test error message"):
+        getattr(gb, how)(func)
+
+
+@pytest.mark.parametrize("how", ["agg", "transform"])
+@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
+def test_groupby_raises_datetime_np(
+    how, by, groupby_series, groupby_func_np, df_with_datetime_col
+):
+    # GH#50749
+    df = df_with_datetime_col
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+    klass, msg = {
+        np.sum: (
+            TypeError,
+            re.escape("datetime64[us] does not support operation 'sum'"),
+        ),
+        np.mean: (None, ""),
+    }[groupby_func_np]
+    _call_and_check(klass, msg, how, gb, groupby_func_np, ())
+
+
+@pytest.mark.parametrize("func", ["prod", "cumprod", "skew", "kurt", "var"])
+def test_groupby_raises_timedelta(func):
+    df = DataFrame(
+        {
+            "a": [1, 1, 1, 1, 1, 2, 2, 2, 2],
+            "b": [3, 3, 4, 4, 4, 4, 4, 3, 3],
+            "c": range(9),
+            "d": datetime.timedelta(days=1),
+        }
+    )
+    gb = df.groupby(by="a")
+
+    _call_and_check(
+        TypeError,
+        "timedelta64 type does not support .* operations",
+        "method",
+        gb,
+        func,
+        [],
+    )
+
+
+@pytest.mark.parametrize("how", ["method", "agg", "transform"])
+def test_groupby_raises_category(
+    how, by, groupby_series, groupby_func, df_with_cat_col
+):
+    # GH#50749
+    df = df_with_cat_col
+    args = get_groupby_method_args(groupby_func, df)
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+        if groupby_func == "corrwith":
+            assert not hasattr(gb, "corrwith")
+            return
+
+    klass, msg = {
+        "all": (None, ""),
+        "any": (None, ""),
+        "bfill": (None, ""),
+        "corrwith": (
+            TypeError,
+            r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
+        ),
+        "count": (None, ""),
+        "cumcount": (None, ""),
+        "cummax": (
+            (NotImplementedError, TypeError),
+            "(category type does not support cummax operations|"
+            "category dtype not supported|"
+            "cummax is not supported for category dtype)",
+        ),
+        "cummin": (
+            (NotImplementedError, TypeError),
+            "(category type does not support cummin operations|"
+            "category dtype not supported|"
+            "cummin is not supported for category dtype)",
+        ),
+        "cumprod": (
+            (NotImplementedError, TypeError),
+            "(category type does not support cumprod operations|"
+            "category dtype not supported|"
+            "cumprod is not supported for category dtype)",
+        ),
+        "cumsum": (
+            (NotImplementedError, TypeError),
+            "(category type does not support cumsum operations|"
+            "category dtype not supported|"
+            "cumsum is not supported for category dtype)",
+        ),
+        "diff": (
+            TypeError,
+            r"unsupported operand type\(s\) for -: 'Categorical' and 'Categorical'",
+        ),
+        "ffill": (None, ""),
+        "first": (None, ""),
+        "idxmax": (None, ""),
+        "idxmin": (None, ""),
+        "last": (None, ""),
+        "max": (None, ""),
+        "mean": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'mean'",
+                    "category dtype does not support aggregation 'mean'",
+                ]
+            ),
+        ),
+        "median": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'median'",
+                    "category dtype does not support aggregation 'median'",
+                ]
+            ),
+        ),
+        "min": (None, ""),
+        "ngroup": (None, ""),
+        "nunique": (None, ""),
+        "pct_change": (
+            TypeError,
+            r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'",
+        ),
+        "prod": (TypeError, "category type does not support prod operations"),
+        "quantile": (TypeError, "No matching signature found"),
+        "rank": (None, ""),
+        "sem": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'sem'",
+                    "category dtype does not support aggregation 'sem'",
+                ]
+            ),
+        ),
+        "shift": (None, ""),
+        "size": (None, ""),
+        "skew": (
+            TypeError,
+            "|".join(
+                [
+                    "dtype category does not support operation 'skew'",
+                    "category type does not support skew operations",
+                ]
+            ),
+        ),
+        "kurt": (
+            TypeError,
+            "|".join(
+                [
+                    "dtype category does not support operation 'kurt'",
+                    "category type does not support kurt operations",
+                ]
+            ),
+        ),
+        "std": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'std'",
+                    "category dtype does not support aggregation 'std'",
+                ]
+            ),
+        ),
+        "sum": (TypeError, "category type does not support sum operations"),
+        "var": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'var'",
+                    "category dtype does not support aggregation 'var'",
+                ]
+            ),
+        ),
+    }[groupby_func]
+
+    if groupby_func == "corrwith":
+        warn_category = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn_category = None
+        warn_msg = ""
+    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
+
+
+@pytest.mark.parametrize("how", ["agg", "transform"])
+def test_groupby_raises_category_udf(how, by, groupby_series, df_with_cat_col):
+    # GH#50749
+    df = df_with_cat_col
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+    def func(x):
+        raise TypeError("Test error message")
+
+    with pytest.raises(TypeError, match="Test error message"):
+        getattr(gb, how)(func)
+
+
+@pytest.mark.parametrize("how", ["agg", "transform"])
+@pytest.mark.parametrize("groupby_func_np", [np.sum, np.mean])
+def test_groupby_raises_category_np(
+    how, by, groupby_series, groupby_func_np, df_with_cat_col
+):
+    # GH#50749
+    df = df_with_cat_col
+    gb = df.groupby(by=by)
+
+    if groupby_series:
+        gb = gb["d"]
+
+    klass, msg = {
+        np.sum: (TypeError, "dtype category does not support operation 'sum'"),
+        np.mean: (
+            TypeError,
+            "dtype category does not support operation 'mean'",
+        ),
+    }[groupby_func_np]
+    _call_and_check(klass, msg, how, gb, groupby_func_np, ())
+
+
+@pytest.mark.filterwarnings("ignore:In a future version, the keys")
+@pytest.mark.parametrize("how", ["method", "agg", "transform"])
+def test_groupby_raises_category_on_category(
+    how,
+    by,
+    groupby_series,
+    groupby_func,
+    observed,
+    df_with_cat_col,
+):
+    # GH#50749
+    df = df_with_cat_col
+    df["a"] = Categorical(
+        ["a", "a", "a", "a", "b", "b", "b", "b", "c"],
+        categories=["a", "b", "c", "d"],
+        ordered=True,
+    )
+    args = get_groupby_method_args(groupby_func, df)
+    gb = df.groupby(by=by, observed=observed)
+
+    if groupby_series:
+        gb = gb["d"]
+
+        if groupby_func == "corrwith":
+            assert not hasattr(gb, "corrwith")
+            return
+
+    empty_groups = not observed and any(group.empty for group in gb.groups.values())
+    if how == "transform":
+        # empty groups will be ignored
+        empty_groups = False
+
+    klass, msg = {
+        "all": (None, ""),
+        "any": (None, ""),
+        "bfill": (None, ""),
+        "corrwith": (
+            TypeError,
+            r"unsupported operand type\(s\) for \*: 'Categorical' and 'int'",
+        ),
+        "count": (None, ""),
+        "cumcount": (None, ""),
+        "cummax": (
+            (NotImplementedError, TypeError),
+            "(cummax is not supported for category dtype|"
+            "category dtype not supported|"
+            "category type does not support cummax operations)",
+        ),
+        "cummin": (
+            (NotImplementedError, TypeError),
+            "(cummin is not supported for category dtype|"
+            "category dtype not supported|"
+            "category type does not support cummin operations)",
+        ),
+        "cumprod": (
+            (NotImplementedError, TypeError),
+            "(cumprod is not supported for category dtype|"
+            "category dtype not supported|"
+            "category type does not support cumprod operations)",
+        ),
+        "cumsum": (
+            (NotImplementedError, TypeError),
+            "(cumsum is not supported for category dtype|"
+            "category dtype not supported|"
+            "category type does not support cumsum operations)",
+        ),
+        "diff": (TypeError, "unsupported operand type"),
+        "ffill": (None, ""),
+        "first": (None, ""),
+        "idxmax": (ValueError, "empty group due to unobserved categories")
+        if empty_groups
+        else (None, ""),
+        "idxmin": (ValueError, "empty group due to unobserved categories")
+        if empty_groups
+        else (None, ""),
+        "last": (None, ""),
+        "max": (None, ""),
+        "mean": (TypeError, "category dtype does not support aggregation 'mean'"),
+        "median": (TypeError, "category dtype does not support aggregation 'median'"),
+        "min": (None, ""),
+        "ngroup": (None, ""),
+        "nunique": (None, ""),
+        "pct_change": (TypeError, "unsupported operand type"),
+        "prod": (TypeError, "category type does not support prod operations"),
+        "quantile": (TypeError, "No matching signature found"),
+        "rank": (None, ""),
+        "sem": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'sem'",
+                    "category dtype does not support aggregation 'sem'",
+                ]
+            ),
+        ),
+        "shift": (None, ""),
+        "size": (None, ""),
+        "skew": (
+            TypeError,
+            "|".join(
+                [
+                    "category type does not support skew operations",
+                    "dtype category does not support operation 'skew'",
+                ]
+            ),
+        ),
+        "kurt": (
+            TypeError,
+            "|".join(
+                [
+                    "category type does not support kurt operations",
+                    "dtype category does not support operation 'kurt'",
+                ]
+            ),
+        ),
+        "std": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'std'",
+                    "category dtype does not support aggregation 'std'",
+                ]
+            ),
+        ),
+        "sum": (TypeError, "category type does not support sum operations"),
+        "var": (
+            TypeError,
+            "|".join(
+                [
+                    "'Categorical' .* does not support operation 'var'",
+                    "category dtype does not support aggregation 'var'",
+                ]
+            ),
+        ),
+    }[groupby_func]
+
+    if groupby_func == "corrwith":
+        warn_category = Pandas4Warning
+        warn_msg = "DataFrameGroupBy.corrwith is deprecated"
+    else:
+        warn_category = None
+        warn_msg = ""
+    _call_and_check(klass, msg, how, gb, groupby_func, args, warn_category, warn_msg)
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..890ce4e398f0104209799201a6c2a5ea4973edcb
--- /dev/null
+++ b/pandas/tests/groupby/test_reductions.py
@@ -0,0 +1,1538 @@
+import builtins
+import datetime as dt
+from string import ascii_lowercase
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import iNaT
+
+from pandas.core.dtypes.common import pandas_dtype
+from pandas.core.dtypes.missing import na_value_for_dtype
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+    isna,
+)
+import pandas._testing as tm
+from pandas.tests.groupby import get_groupby_method_args
+from pandas.util import _test_decorators as td
+
+
+@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
+def test_basic_aggregations(dtype):
+    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)
+
+    index = np.arange(9)
+    np.random.default_rng(2).shuffle(index)
+    data = data.reindex(index)
+
+    grouped = data.groupby(lambda x: x // 3, group_keys=False)
+
+    for k, v in grouped:
+        assert len(v) == 3
+
+    agged = grouped.aggregate(np.mean)
+    assert agged[1] == 1
+
+    expected = grouped.agg(np.mean)
+    tm.assert_series_equal(agged, expected)  # shorthand
+    tm.assert_series_equal(agged, grouped.mean())
+    result = grouped.sum()
+    expected = grouped.agg(np.sum)
+    if dtype == "int32":
+        # NumPy's sum returns int64
+        expected = expected.astype("int32")
+    tm.assert_series_equal(result, expected)
+
+    expected = grouped.apply(lambda x: x * x.sum())
+    transformed = grouped.transform(lambda x: x * x.sum())
+    assert transformed[7] == 12
+    tm.assert_series_equal(transformed, expected)
+
+    value_grouped = data.groupby(data)
+    result = value_grouped.aggregate(np.mean)
+    tm.assert_series_equal(result, agged, check_index_type=False)
+
+    # complex agg
+    agged = grouped.aggregate([np.mean, np.std])
+
+    msg = r"nested renamer is not supported"
+    with pytest.raises(pd.errors.SpecificationError, match=msg):
+        grouped.aggregate({"one": np.mean, "two": np.std})
+
+    # corner cases
+    msg = "Must produce aggregated value"
+    # exception raised is type Exception
+    with pytest.raises(Exception, match=msg):
+        grouped.aggregate(lambda x: x * 2)
+
+
+@pytest.mark.parametrize(
+    "vals",
+    [
+        ["foo", "bar", "baz"],
+        ["foo", "", ""],
+        ["", "", ""],
+        [1, 2, 3],
+        [1, 0, 0],
+        [0, 0, 0],
+        [1.0, 2.0, 3.0],
+        [1.0, 0.0, 0.0],
+        [0.0, 0.0, 0.0],
+        [True, True, True],
+        [True, False, False],
+        [False, False, False],
+        [np.nan, np.nan, np.nan],
+    ],
+)
+def test_groupby_bool_aggs(skipna, all_boolean_reductions, vals):
+    df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})
+
+    # Figure out expectation using Python builtin
+    exp = getattr(builtins, all_boolean_reductions)(vals)
+
+    # edge case for missing data with skipna and 'any'
+    if skipna and all(isna(vals)) and all_boolean_reductions == "any":
+        exp = False
+
+    expected = DataFrame(
+        [exp] * 2, columns=["val"], index=pd.Index(["a", "b"], name="key")
+    )
+    result = getattr(df.groupby("key"), all_boolean_reductions)(skipna=skipna)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_any():
+    df = DataFrame(
+        [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
+        columns=["A", "B", "C"],
+    )
+    expected = DataFrame(
+        [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
+    )
+    expected.index.name = "A"
+    result = df.groupby("A").any()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_bool_aggs_dup_column_labels(all_boolean_reductions):
+    # GH#21668
+    df = DataFrame([[True, True]], columns=["a", "a"])
+    grp_by = df.groupby([0])
+    result = getattr(grp_by, all_boolean_reductions)()
+
+    expected = df.set_axis(np.array([0]))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [False, False, False],
+        [True, True, True],
+        [pd.NA, pd.NA, pd.NA],
+        [False, pd.NA, False],
+        [True, pd.NA, True],
+        [True, pd.NA, False],
+    ],
+)
+def test_masked_kleene_logic(all_boolean_reductions, skipna, data):
+    # GH#37506
+    ser = Series(data, dtype="boolean")
+
+    # The result should match aggregating on the whole series. Correctness
+    # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic
+    expected_data = getattr(ser, all_boolean_reductions)(skipna=skipna)
+    expected = Series(expected_data, index=np.array([0]), dtype="boolean")
+
+    result = ser.groupby([0, 0, 0]).agg(all_boolean_reductions, skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype1,dtype2,exp_col1,exp_col2",
+    [
+        (
+            "float",
+            "Float64",
+            np.array([True], dtype=bool),
+            pd.array([pd.NA], dtype="boolean"),
+        ),
+        (
+            "Int64",
+            "float",
+            pd.array([pd.NA], dtype="boolean"),
+            np.array([True], dtype=bool),
+        ),
+        (
+            "Int64",
+            "Int64",
+            pd.array([pd.NA], dtype="boolean"),
+            pd.array([pd.NA], dtype="boolean"),
+        ),
+        (
+            "Float64",
+            "boolean",
+            pd.array([pd.NA], dtype="boolean"),
+            pd.array([pd.NA], dtype="boolean"),
+        ),
+    ],
+)
+def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
+    # GH#37506
+    data1 = [1.0, np.nan] if dtype1.startswith("f") else [1.0, pd.NA]
+    data2 = [1.0, np.nan] if dtype2.startswith("f") else [1.0, pd.NA]
+    df = DataFrame(
+        {"col1": pd.array(data1, dtype=dtype1), "col2": pd.array(data2, dtype=dtype2)}
+    )
+    result = df.groupby([1, 1]).agg("all", skipna=False)
+
+    expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1]))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
+def test_masked_bool_aggs_skipna(
+    all_boolean_reductions, dtype, skipna, frame_or_series
+):
+    # GH#40585
+    obj = frame_or_series([pd.NA, 1], dtype=dtype)
+    expected_res = True
+    if not skipna and all_boolean_reductions == "all":
+        expected_res = pd.NA
+    expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean")
+
+    result = obj.groupby([1, 1]).agg(all_boolean_reductions, skipna=skipna)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "bool_agg_func,data,expected_res",
+    [
+        ("any", [pd.NA, np.nan], False),
+        ("any", [pd.NA, 1, np.nan], True),
+        ("all", [pd.NA, pd.NaT], True),
+        ("all", [pd.NA, False, pd.NaT], False),
+    ],
+)
+def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series):
+    # GH#37501
+    obj = frame_or_series(data, dtype=object)
+    result = obj.groupby([1] * len(data)).agg(bool_agg_func)
+    expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool")
+    tm.assert_equal(result, expected)
+
+
+def test_object_NA_raises_with_skipna_false(all_boolean_reductions):
+    # GH#37501
+    ser = Series([pd.NA], dtype=object)
+    with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
+        ser.groupby([1]).agg(all_boolean_reductions, skipna=False)
+
+
+def test_empty(frame_or_series, all_boolean_reductions):
+    # GH 45231
+    kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"}
+    obj = frame_or_series(**kwargs, dtype=object)
+    result = getattr(obj.groupby(obj.index), all_boolean_reductions)()
+    expected = frame_or_series(**kwargs, dtype=bool)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
+def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
+    # GH#57040
+    if any_real_numpy_dtype is int or any_real_numpy_dtype is float:
+        # No need to test
+        return
+    info = np.iinfo if "int" in any_real_numpy_dtype else np.finfo
+    min_value = info(any_real_numpy_dtype).min
+    max_value = info(any_real_numpy_dtype).max
+    df = DataFrame(
+        {"a": [2, 1, 1, 2], "b": [min_value, max_value, max_value, min_value]},
+        dtype=any_real_numpy_dtype,
+    )
+    gb = df.groupby("a")
+    result = getattr(gb, how)()
+    expected = DataFrame(
+        {"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["idxmin", "idxmax"])
+def test_idxmin_idxmax_extremes_skipna(skipna, how, float_numpy_dtype):
+    # GH#57040
+    min_value = np.finfo(float_numpy_dtype).min
+    max_value = np.finfo(float_numpy_dtype).max
+    df = DataFrame(
+        {
+            "a": Series(np.repeat(range(1, 5), repeats=2), dtype="intp"),
+            "b": Series(
+                [
+                    np.nan,
+                    min_value,
+                    np.nan,
+                    max_value,
+                    min_value,
+                    np.nan,
+                    max_value,
+                    np.nan,
+                ],
+                dtype=float_numpy_dtype,
+            ),
+        },
+    )
+    gb = df.groupby("a")
+
+    if not skipna:
+        msg = f"{how} with skipna=False"
+        with pytest.raises(ValueError, match=msg):
+            getattr(gb, how)(skipna=skipna)
+        return
+    result = getattr(gb, how)(skipna=skipna)
+    expected = DataFrame(
+        {"b": [1, 3, 4, 6]}, index=pd.Index(range(1, 5), name="a", dtype="intp")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func, values",
+    [
+        ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
+        ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
+    ],
+)
+@pytest.mark.parametrize("numeric_only", [True, False])
+def test_idxmin_idxmax_returns_int_types(func, values, numeric_only):
+    # GH 25444
+    df = DataFrame(
+        {
+            "name": ["A", "A", "B", "B"],
+            "c_int": [1, 2, 3, 4],
+            "c_float": [4.02, 3.03, 2.04, 1.05],
+            "c_date": ["2019", "2018", "2016", "2017"],
+        }
+    )
+    df["c_date"] = pd.to_datetime(df["c_date"])
+    df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific")
+    df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0]
+    df["c_period"] = df["c_date"].dt.to_period("W")
+    df["c_Integer"] = df["c_int"].astype("Int64")
+    df["c_Floating"] = df["c_float"].astype("Float64")
+
+    result = getattr(df.groupby("name"), func)(numeric_only=numeric_only)
+
+    expected = DataFrame(values, index=pd.Index(["A", "B"], name="name"))
+    if numeric_only:
+        expected = expected.drop(columns=["c_date"])
+    else:
+        expected["c_date_tz"] = expected["c_date"]
+        expected["c_timedelta"] = expected["c_date"]
+        expected["c_period"] = expected["c_date"]
+    expected["c_Integer"] = expected["c_int"]
+    expected["c_Floating"] = expected["c_float"]
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        (
+            Timestamp("2011-01-15 12:50:28.502376"),
+            Timestamp("2011-01-20 12:50:28.593448"),
+        ),
+        (24650000000000001, 24650000000000002),
+    ],
+)
+@pytest.mark.parametrize("method", ["count", "min", "max", "first", "last"])
+def test_groupby_non_arithmetic_agg_int_like_precision(method, data):
+    # GH#6620, GH#9311
+    df = DataFrame({"a": [1, 1], "b": data})
+
+    grouped = df.groupby("a")
+    result = getattr(grouped, method)()
+    if method == "count":
+        expected_value = 2
+    elif method == "first":
+        expected_value = data[0]
+    elif method == "last":
+        expected_value = data[1]
+    else:
+        expected_value = getattr(df["b"], method)()
+    expected = DataFrame({"b": [expected_value]}, index=pd.Index([1], name="a"))
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["first", "last"])
+def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how):
+    # GH#57019
+    na_value = na_value_for_dtype(pandas_dtype(any_real_nullable_dtype))
+    df = DataFrame(
+        {
+            "a": [2, 1, 1, 2, 3, 3],
+            # TODO: test that has mixed na_value and NaN either working for
+            #  float or raising for int?
+            "b": [na_value, 3.0, na_value, 4.0, na_value, na_value],
+            "c": [na_value, 3.0, na_value, 4.0, na_value, na_value],
+        },
+        dtype=any_real_nullable_dtype,
+    )
+    gb = df.groupby("a", sort=sort)
+    method = getattr(gb, how)
+    result = method(skipna=skipna)
+
+    ilocs = {
+        ("first", True): [3, 1, 4],
+        ("first", False): [0, 1, 4],
+        ("last", True): [3, 1, 5],
+        ("last", False): [3, 2, 5],
+    }[how, skipna]
+    expected = df.iloc[ilocs].set_index("a")
+    if sort:
+        expected = expected.sort_index()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_mean_no_overflow():
+    # Regression test for (#22487)
+    df = DataFrame(
+        {
+            "user": ["A", "A", "A", "A", "A"],
+            "connections": [4970, 4749, 4719, 4704, 18446744073699999744],
+        }
+    )
+    assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840
+
+
+def test_mean_on_timedelta():
+    # GH 17382
+    df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5})
+    result = df.groupby("cat")["time"].mean()
+    expected = Series(
+        pd.to_timedelta([4, 5]), name="time", index=pd.Index(["A", "B"], name="cat")
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values, dtype, result_dtype",
+    [
+        ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64", "float64"),
+        ([0, 1, pd.NA, 3, 4, 5, 6, 7, 8, 9], "Float64", "Float64"),
+        ([0, 1, pd.NA, 3, 4, 5, 6, 7, 8, 9], "Int64", "Float64"),
+        ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]", "timedelta64[ns]"),
+        (
+            pd.to_datetime(
+                [
+                    "2019-05-09",
+                    pd.NaT,
+                    "2019-05-11",
+                    "2019-05-12",
+                    "2019-05-13",
+                    "2019-05-14",
+                    "2019-05-15",
+                    "2019-05-16",
+                    "2019-05-17",
+                    "2019-05-18",
+                ]
+            ),
+            "datetime64[ns]",
+            "datetime64[ns]",
+        ),
+    ],
+)
+def test_mean_skipna(values, dtype, result_dtype, skipna):
+    # GH#15675
+    df = DataFrame(
+        {
+            "val": values,
+            "cat": ["A", "B"] * 5,
+        }
+    ).astype({"val": dtype})
+    # We need to recast the expected values to the result_dtype because
+    # Series.mean() changes the dtype to float64/object depending on the input dtype
+    expected = (
+        df.groupby("cat")["val"]
+        .apply(lambda x: x.mean(skipna=skipna))
+        .astype(result_dtype)
+    )
+    result = df.groupby("cat")["val"].mean(skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values, dtype",
+    [
+        ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "float64"),
+        ([0, 1, pd.NA, 3, 4, 5, 6, 7, 8, 9], "Float64"),
+        ([0, 1, pd.NA, 3, 4, 5, 6, 7, 8, 9], "Int64"),
+        ([0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9], "timedelta64[ns]"),
+    ],
+)
+def test_sum_skipna(values, dtype, skipna):
+    # GH#15675
+    df = DataFrame(
+        {
+            "val": values,
+            "cat": ["A", "B"] * 5,
+        }
+    ).astype({"val": dtype})
+    # We need to recast the expected values to the original dtype because
+    # Series.sum() changes the dtype
+    expected = (
+        df.groupby("cat")["val"].apply(lambda x: x.sum(skipna=skipna)).astype(dtype)
+    )
+    result = df.groupby("cat")["val"].sum(skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
+def test_sum_skipna_object(skipna):
+    # GH#15675
+    df = DataFrame(
+        {
+            "val": ["a", "b", np.nan, "d", "e", "f", "g", "h", "i", "j"],
+            "cat": ["A", "B"] * 5,
+        }
+    ).astype({"val": object})
+    if skipna:
+        expected = Series(
+            ["aegi", "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val"
+        ).astype(object)
+    else:
+        expected = Series(
+            [np.nan, "bdfhj"], index=pd.Index(["A", "B"], name="cat"), name="val"
+        ).astype(object)
+    result = df.groupby("cat")["val"].sum(skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func, values, dtype, result_dtype",
+    [
+        ("prod", [0, 1, 3, np.nan, 4, 5, 6, 7, -8, 9], "float64", "float64"),
+        ("prod", [0, -1, 3, 4, 5, pd.NA, 6, 7, 8, 9], "Float64", "Float64"),
+        ("prod", [0, 1, 3, -4, 5, 6, 7, -8, pd.NA, 9], "Int64", "Int64"),
+        ("prod", [np.nan] * 10, "float64", "float64"),
+        ("prod", [pd.NA] * 10, "Float64", "Float64"),
+        ("prod", [pd.NA] * 10, "Int64", "Int64"),
+        ("var", [0, -1, 3, 4, np.nan, 5, 6, 7, 8, 9], "float64", "float64"),
+        ("var", [0, 1, 3, -4, 5, 6, 7, -8, 9, pd.NA], "Float64", "Float64"),
+        ("var", [0, -1, 3, 4, 5, -6, 7, pd.NA, 8, 9], "Int64", "Float64"),
+        ("var", [np.nan] * 10, "float64", "float64"),
+        ("var", [pd.NA] * 10, "Float64", "Float64"),
+        ("var", [pd.NA] * 10, "Int64", "Float64"),
+        ("std", [0, 1, 3, -4, 5, 6, 7, -8, np.nan, 9], "float64", "float64"),
+        ("std", [0, -1, 3, 4, 5, -6, 7, pd.NA, 8, 9], "Float64", "Float64"),
+        ("std", [0, 1, 3, -4, 5, 6, 7, -8, 9, pd.NA], "Int64", "Float64"),
+        ("std", [np.nan] * 10, "float64", "float64"),
+        ("std", [pd.NA] * 10, "Float64", "Float64"),
+        ("std", [pd.NA] * 10, "Int64", "Float64"),
+        ("sem", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("sem", [0, 1, 3, -4, 5, 6, 7, -8, pd.NA, 9], "Float64", "Float64"),
+        ("sem", [0, -1, 3, 4, 5, -6, 7, 8, 9, pd.NA], "Int64", "Float64"),
+        ("sem", [np.nan] * 10, "float64", "float64"),
+        ("sem", [pd.NA] * 10, "Float64", "Float64"),
+        ("sem", [pd.NA] * 10, "Int64", "Float64"),
+        ("min", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("min", [0, 1, 3, -4, 5, 6, 7, -8, pd.NA, 9], "Float64", "Float64"),
+        ("min", [0, -1, 3, 4, 5, -6, 7, 8, 9, pd.NA], "Int64", "Int64"),
+        (
+            "min",
+            [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9],
+            "timedelta64[ns]",
+            "timedelta64[ns]",
+        ),
+        (
+            "min",
+            pd.to_datetime(
+                [
+                    "2019-05-09",
+                    pd.NaT,
+                    "2019-05-11",
+                    "2019-05-12",
+                    "2019-05-13",
+                    "2019-05-14",
+                    "2019-05-15",
+                    "2019-05-16",
+                    "2019-05-17",
+                    "2019-05-18",
+                ]
+            ),
+            "datetime64[ns]",
+            "datetime64[ns]",
+        ),
+        ("min", [np.nan] * 10, "float64", "float64"),
+        ("min", [pd.NA] * 10, "Float64", "Float64"),
+        ("min", [pd.NA] * 10, "Int64", "Int64"),
+        ("max", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("max", [0, 1, 3, -4, 5, 6, 7, -8, pd.NA, 9], "Float64", "Float64"),
+        ("max", [0, -1, 3, 4, 5, -6, 7, 8, 9, pd.NA], "Int64", "Int64"),
+        (
+            "max",
+            [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9],
+            "timedelta64[ns]",
+            "timedelta64[ns]",
+        ),
+        (
+            "max",
+            pd.to_datetime(
+                [
+                    "2019-05-09",
+                    pd.NaT,
+                    "2019-05-11",
+                    "2019-05-12",
+                    "2019-05-13",
+                    "2019-05-14",
+                    "2019-05-15",
+                    "2019-05-16",
+                    "2019-05-17",
+                    "2019-05-18",
+                ]
+            ),
+            "datetime64[ns]",
+            "datetime64[ns]",
+        ),
+        ("max", [np.nan] * 10, "float64", "float64"),
+        ("max", [pd.NA] * 10, "Float64", "Float64"),
+        ("max", [pd.NA] * 10, "Int64", "Int64"),
+        ("median", [0, -1, 3, 4, 5, -6, 7, np.nan, 8, 9], "float64", "float64"),
+        ("median", [0, 1, 3, -4, 5, 6, 7, -8, pd.NA, 9], "Float64", "Float64"),
+        ("median", [0, -1, 3, 4, 5, -6, 7, 8, 9, pd.NA], "Int64", "Float64"),
+        (
+            "median",
+            [0, 1, np.nan, 3, 4, 5, 6, 7, 8, 9],
+            "timedelta64[ns]",
+            "timedelta64[ns]",
+        ),
+        (
+            "median",
+            pd.to_datetime(
+                [
+                    "2019-05-09",
+                    pd.NaT,
+                    "2019-05-11",
+                    "2019-05-12",
+                    "2019-05-13",
+                    "2019-05-14",
+                    "2019-05-15",
+                    "2019-05-16",
+                    "2019-05-17",
+                    "2019-05-18",
+                ]
+            ),
+            "datetime64[ns]",
+            "datetime64[ns]",
+        ),
+        ("median", [np.nan] * 10, "float64", "float64"),
+        ("median", [pd.NA] * 10, "Float64", "Float64"),
+        ("median", [pd.NA] * 10, "Int64", "Float64"),
+    ],
+)
+def test_multifunc_skipna(func, values, dtype, result_dtype, skipna):
+    # GH#15675
+    df = DataFrame(
+        {
+            "val": values,
+            "cat": ["A", "B"] * 5,
+        }
+    ).astype({"val": dtype})
+    # We need to recast the expected values to the result_dtype as some operations
+    # change the dtype
+    expected = (
+        df.groupby("cat")["val"]
+        .apply(lambda x: getattr(x, func)(skipna=skipna))
+        .astype(result_dtype)
+    )
+    result = getattr(df.groupby("cat")["val"], func)(skipna=skipna)
+    tm.assert_series_equal(result, expected)
+
+
+def test_cython_median():
+    arr = np.random.default_rng(2).standard_normal(1000)
+    arr[::2] = np.nan
+    df = DataFrame(arr)
+
+    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
+    labels[::17] = np.nan
+
+    result = df.groupby(labels).median()
+    exp = df.groupby(labels).agg(np.nanmedian)
+    tm.assert_frame_equal(result, exp)
+
+    df = DataFrame(np.random.default_rng(2).standard_normal((1000, 5)))
+    rs = df.groupby(labels).agg(np.median)
+    xp = df.groupby(labels).median()
+    tm.assert_frame_equal(rs, xp)
+
+
+def test_median_empty_bins(observed):
+    df = DataFrame(np.random.default_rng(2).integers(0, 44, 500))
+
+    grps = range(0, 55, 5)
+    bins = pd.cut(df[0], grps)
+
+    result = df.groupby(bins, observed=observed).median()
+    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
+    tm.assert_frame_equal(result, expected)
+
+
+def test_max_min_non_numeric():
+    # #2700
+    aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})
+
+    result = aa.groupby("nn").max()
+    assert "ss" in result
+
+    result = aa.groupby("nn").max(numeric_only=False)
+    assert "ss" in result
+
+    result = aa.groupby("nn").min()
+    assert "ss" in result
+
+    result = aa.groupby("nn").min(numeric_only=False)
+    assert "ss" in result
+
+
+def test_max_min_object_multiple_columns(using_infer_string):
+    # GH#41111 case where the aggregation is valid for some columns but not
+    # others; we split object blocks column-wise, consistent with
+    # DataFrame._reduce
+
+    df = DataFrame(
+        {
+            "A": [1, 1, 2, 2, 3],
+            "B": [1, "foo", 2, "bar", False],
+            "C": ["a", "b", "c", "d", "e"],
+        }
+    )
+    df._consolidate_inplace()  # should already be consolidate, but double-check
+    assert len(df._mgr.blocks) == 3 if using_infer_string else 2
+
+    gb = df.groupby("A")
+
+    result = gb[["C"]].max()
+    # "max" is valid for column "C" but not for "B"
+    ei = pd.Index([1, 2, 3], name="A")
+    expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
+    tm.assert_frame_equal(result, expected)
+
+    result = gb[["C"]].min()
+    # "min" is valid for column "C" but not for "B"
+    ei = pd.Index([1, 2, 3], name="A")
+    expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_min_date_with_nans():
+    # GH26321
+    dates = pd.to_datetime(
+        Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
+    ).dt.date
+    df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
+
+    result = df.groupby("b", as_index=False)["c"].min()["c"]
+    expected = pd.to_datetime(
+        Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
+    ).dt.date
+    tm.assert_series_equal(result, expected)
+
+    result = df.groupby("b")["c"].min()
+    expected.index.name = "b"
+    tm.assert_series_equal(result, expected)
+
+
+def test_max_inat():
+    # GH#40767 dont interpret iNaT as NaN
+    ser = Series([1, iNaT])
+    key = np.array([1, 1], dtype=np.int64)
+    gb = ser.groupby(key)
+
+    result = gb.max(min_count=2)
+    expected = Series({1: 1}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    result = gb.min(min_count=2)
+    expected = Series({1: iNaT}, dtype=np.int64)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+    # not enough entries -> gets masked to NaN
+    result = gb.min(min_count=3)
+    expected = Series({1: np.nan})
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
+def test_max_inat_not_all_na():
+    # GH#40767 dont interpret iNaT as NaN
+
+    # make sure we dont round iNaT+1 to iNaT
+    ser = Series([1, iNaT, 2, iNaT + 1])
+    gb = ser.groupby([1, 2, 3, 3])
+    result = gb.min(min_count=2)
+
+    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
+    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
+    expected.index = expected.index.astype(int)
+    tm.assert_series_equal(result, expected, check_exact=True)
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_groupby_aggregate_period_column(func):
+    # GH 31471
+    groups = [1, 2]
+    periods = pd.period_range("2020", periods=2, freq="Y")
+    df = DataFrame({"a": groups, "b": periods})
+
+    result = getattr(df.groupby("a")["b"], func)()
+    idx = pd.Index([1, 2], name="a")
+    expected = Series(periods, index=idx, name="b")
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_groupby_aggregate_period_frame(func):
+    # GH 31471
+    groups = [1, 2]
+    periods = pd.period_range("2020", periods=2, freq="Y")
+    df = DataFrame({"a": groups, "b": periods})
+
+    result = getattr(df.groupby("a"), func)()
+    idx = pd.Index([1, 2], name="a")
+    expected = DataFrame({"b": periods}, index=idx)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_aggregate_numeric_object_dtype():
+    # https://github.com/pandas-dev/pandas/issues/39329
+    # simplified case: multiple object columns where one is all-NaN
+    # -> gets split as the all-NaN is inferred as float
+    df = DataFrame(
+        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
+    ).astype(object)
+    result = df.groupby("key").min()
+    expected = (
+        DataFrame(
+            {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]},
+        )
+        .set_index("key")
+        .astype(object)
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # same but with numbers
+    df = DataFrame(
+        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
+    ).astype(object)
+    result = df.groupby("key").min()
+    expected = (
+        DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]})
+        .set_index("key")
+        .astype(object)
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_aggregate_categorical_lost_index(func: str):
+    # GH: 28641 groupby drops index, when grouping over categorical column with min/max
+    ds = Series(["b"], dtype="category").cat.as_ordered()
+    df = DataFrame({"A": [1997], "B": ds})
+    result = df.groupby("A").agg({"B": func})
+    expected = DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A"))
+
+    # ordered categorical dtype should be preserved
+    expected["B"] = expected["B"].astype(ds.dtype)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
+def test_groupby_min_max_nullable(dtype):
+    if dtype == "Int64":
+        # GH#41743 avoid precision loss
+        ts = 1618556707013635762
+    elif dtype == "boolean":
+        ts = 0
+    else:
+        ts = 4.0
+
+    df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
+    df["ts"] = df["ts"].astype(dtype)
+
+    gb = df.groupby("id")
+
+    result = gb.min()
+    expected = df.iloc[:1].set_index("id")
+    tm.assert_frame_equal(result, expected)
+
+    res_max = gb.max()
+    expected_max = df.iloc[1:].set_index("id")
+    tm.assert_frame_equal(res_max, expected_max)
+
+    result2 = gb.min(min_count=3)
+    expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
+    tm.assert_frame_equal(result2, expected2)
+
+    res_max2 = gb.max(min_count=3)
+    tm.assert_frame_equal(res_max2, expected2)
+
+    # Case with NA values
+    df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
+    df2["ts"] = df2["ts"].astype(dtype)
+    gb2 = df2.groupby("id")
+
+    result3 = gb2.min()
+    tm.assert_frame_equal(result3, expected)
+
+    res_max3 = gb2.max()
+    tm.assert_frame_equal(res_max3, expected_max)
+
+    result4 = gb2.min(min_count=100)
+    tm.assert_frame_equal(result4, expected2)
+
+    res_max4 = gb2.max(min_count=100)
+    tm.assert_frame_equal(res_max4, expected2)
+
+
+def test_min_max_nullable_uint64_empty_group():
+    # don't raise NotImplementedError from libgroupby
+    cat = pd.Categorical([0] * 10, categories=[0, 1])
+    df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))})
+    gb = df.groupby("A", observed=False)
+
+    res = gb.min()
+
+    idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A")
+    expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx)
+    tm.assert_frame_equal(res, expected)
+
+    res = gb.max()
+    expected.iloc[0, 0] = 9
+    tm.assert_frame_equal(res, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last", "min", "max"])
+def test_groupby_min_max_categorical(func):
+    # GH: 52151
+    df = DataFrame(
+        {
+            "col1": pd.Categorical(["A"], categories=list("AB"), ordered=True),
+            "col2": pd.Categorical([1], categories=[1, 2], ordered=True),
+            "value": 0.1,
+        }
+    )
+    result = getattr(df.groupby("col1", observed=False), func)()
+
+    idx = pd.CategoricalIndex(data=["A", "B"], name="col1", ordered=True)
+    expected = DataFrame(
+        {
+            "col2": pd.Categorical([1, None], categories=[1, 2], ordered=True),
+            "value": [0.1, None],
+        },
+        index=idx,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_min_empty_string_dtype(func, string_dtype_no_object):
+    # GH#55619
+    dtype = string_dtype_no_object
+    df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
+    result = getattr(df.groupby("a"), func)()
+    expected = DataFrame(
+        columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("min_count", [0, 1])
+@pytest.mark.parametrize("test_series", [True, False])
+def test_string_dtype_all_na(
+    string_dtype_no_object, reduction_func, skipna, min_count, test_series
+):
+    # https://github.com/pandas-dev/pandas/issues/60985
+    if reduction_func == "corrwith":
+        # corrwith is deprecated.
+        return
+
+    dtype = string_dtype_no_object
+
+    if reduction_func in [
+        "any",
+        "all",
+        "idxmin",
+        "idxmax",
+        "mean",
+        "median",
+        "std",
+        "var",
+    ]:
+        kwargs = {"skipna": skipna}
+    elif reduction_func in ["kurt"]:
+        kwargs = {"min_count": min_count}
+    elif reduction_func in ["count", "nunique", "quantile", "sem", "size"]:
+        kwargs = {}
+    else:
+        kwargs = {"skipna": skipna, "min_count": min_count}
+
+    expected_dtype, expected_value = dtype, pd.NA
+    if reduction_func in ["all", "any"]:
+        expected_dtype = "bool"
+        # TODO: For skipna=False, bool(pd.NA) raises; should groupby?
+        expected_value = not skipna if reduction_func == "any" else True
+    elif reduction_func in ["count", "nunique", "size"]:
+        # TODO: Should be more consistent - return Int64 when dtype.na_value is pd.NA?
+        if (
+            test_series
+            and reduction_func == "size"
+            and dtype.storage == "pyarrow"
+            and dtype.na_value is pd.NA
+        ):
+            expected_dtype = "Int64"
+        else:
+            expected_dtype = "int64"
+        expected_value = 1 if reduction_func == "size" else 0
+    elif not skipna or min_count > 0:
+        expected_value = pd.NA
+    elif reduction_func == "sum":
+        # https://github.com/pandas-dev/pandas/pull/60936
+        expected_value = ""
+
+    df = DataFrame({"a": ["x"], "b": [pd.NA]}, dtype=dtype)
+    obj = df["b"] if test_series else df
+    args = get_groupby_method_args(reduction_func, obj)
+    gb = obj.groupby(df["a"])
+    method = getattr(gb, reduction_func)
+
+    if reduction_func in [
+        "mean",
+        "median",
+        "kurt",
+        "prod",
+        "quantile",
+        "sem",
+        "skew",
+        "std",
+        "var",
+    ]:
+        msg = f"dtype '{dtype}' does not support operation '{reduction_func}'"
+        with pytest.raises(TypeError, match=msg):
+            method(*args, **kwargs)
+        return
+    elif reduction_func in ["idxmin", "idxmax"]:
+        if skipna:
+            msg = f"{reduction_func} with skipna=True encountered all NA values"
+        else:
+            msg = f"{reduction_func} with skipna=False encountered an NA value."
+        with pytest.raises(ValueError, match=msg):
+            method(*args, **kwargs)
+        return
+
+    result = method(*args, **kwargs)
+    index = pd.Index(["x"], name="a", dtype=dtype)
+    if test_series or reduction_func == "size":
+        name = None if not test_series and reduction_func == "size" else "b"
+        expected = Series(expected_value, index=index, dtype=expected_dtype, name=name)
+    else:
+        expected = DataFrame({"b": expected_value}, index=index, dtype=expected_dtype)
+    tm.assert_equal(result, expected)
+
+
+def test_max_nan_bug():
+    df = DataFrame(
+        {
+            "Unnamed: 0": ["-04-23", "-05-06", "-05-07"],
+            "Date": [
+                "2013-04-23 00:00:00",
+                "2013-05-06 00:00:00",
+                "2013-05-07 00:00:00",
+            ],
+            "app": Series([np.nan, np.nan, "OE"]),
+            "File": ["log080001.log", "log.log", "xlsx"],
+        }
+    )
+    gb = df.groupby("Date")
+    r = gb[["File"]].max()
+    e = gb["File"].max().to_frame()
+    tm.assert_frame_equal(r, e)
+    assert not r["File"].isna().any()
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("with_nan", [True, False])
+@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
+def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
+    n = 100
+    m = 10
+    days = date_range("2015-08-23", periods=10)
+    df = DataFrame(
+        {
+            "jim": np.random.default_rng(2).choice(list(ascii_lowercase), n),
+            "joe": np.random.default_rng(2).choice(days, n),
+            "julie": np.random.default_rng(2).integers(0, m, n),
+        }
+    )
+    if with_nan:
+        df = df.astype({"julie": float})  # Explicit cast to avoid implicit cast below
+        df.loc[1::17, "jim"] = None
+        df.loc[3::37, "joe"] = None
+        df.loc[7::19, "julie"] = None
+        df.loc[8::19, "julie"] = None
+        df.loc[9::19, "julie"] = None
+    original_df = df.copy()
+    gr = df.groupby(keys, as_index=as_index, sort=sort)
+    left = gr["julie"].nunique(dropna=dropna)
+
+    gr = df.groupby(keys, as_index=as_index, sort=sort)
+    right = gr["julie"].apply(Series.nunique, dropna=dropna)
+    if not as_index:
+        right = right.reset_index(drop=True)
+
+    if as_index:
+        tm.assert_series_equal(left, right, check_names=False)
+    else:
+        tm.assert_frame_equal(left, right, check_names=False)
+    tm.assert_frame_equal(df, original_df)
+
+
+def test_nunique():
+    df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
+
+    expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
+    result = df.groupby("A", as_index=False).nunique()
+    tm.assert_frame_equal(result, expected)
+
+    # as_index
+    expected.index = list("abc")
+    expected.index.name = "A"
+    expected = expected.drop(columns="A")
+    result = df.groupby("A").nunique()
+    tm.assert_frame_equal(result, expected)
+
+    # with na
+    result = df.replace({"x": None}).groupby("A").nunique(dropna=False)
+    tm.assert_frame_equal(result, expected)
+
+    # dropna
+    expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
+    expected.index.name = "A"
+    result = df.replace({"x": None}).groupby("A").nunique()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_nunique_with_object():
+    # GH 11077
+    data = DataFrame(
+        [
+            [100, 1, "Alice"],
+            [200, 2, "Bob"],
+            [300, 3, "Charlie"],
+            [-400, 4, "Dan"],
+            [500, 5, "Edith"],
+        ],
+        columns=["amount", "id", "name"],
+    )
+
+    result = data.groupby(["id", "amount"])["name"].nunique()
+    index = MultiIndex.from_arrays([data.id, data.amount])
+    expected = Series([1] * 5, name="name", index=index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_nunique_with_empty_series():
+    # GH 12553
+    data = Series(name="name", dtype=object)
+    result = data.groupby(level=0).nunique()
+    expected = Series(name="name", dtype="int64")
+    tm.assert_series_equal(result, expected)
+
+
+def test_nunique_with_timegrouper():
+    # GH 13453
+    test = DataFrame(
+        {
+            "time": [
+                Timestamp("2016-06-28 09:35:35"),
+                Timestamp("2016-06-28 16:09:30"),
+                Timestamp("2016-06-28 16:46:28"),
+            ],
+            "data": ["1", "2", "3"],
+        }
+    ).set_index("time")
+    result = test.groupby(pd.Grouper(freq="h"))["data"].nunique()
+    expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "key, data, dropna, expected",
+    [
+        (
+            ["x", "x", "x"],
+            [Timestamp("2019-01-01"), pd.NaT, Timestamp("2019-01-01")],
+            True,
+            Series([1], index=pd.Index(["x"], name="key"), name="data"),
+        ),
+        (
+            ["x", "x", "x"],
+            [dt.date(2019, 1, 1), pd.NaT, dt.date(2019, 1, 1)],
+            True,
+            Series([1], index=pd.Index(["x"], name="key"), name="data"),
+        ),
+        (
+            ["x", "x", "x", "y", "y"],
+            [
+                dt.date(2019, 1, 1),
+                pd.NaT,
+                dt.date(2019, 1, 1),
+                pd.NaT,
+                dt.date(2019, 1, 1),
+            ],
+            False,
+            Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
+        ),
+        (
+            ["x", "x", "x", "x", "y"],
+            [
+                dt.date(2019, 1, 1),
+                pd.NaT,
+                dt.date(2019, 1, 1),
+                pd.NaT,
+                dt.date(2019, 1, 1),
+            ],
+            False,
+            Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
+        ),
+    ],
+)
+def test_nunique_with_NaT(key, data, dropna, expected):
+    # GH 27951
+    df = DataFrame({"key": key, "data": data})
+    result = df.groupby(["key"])["data"].nunique(dropna=dropna)
+    tm.assert_series_equal(result, expected)
+
+
+def test_nunique_preserves_column_level_names():
+    # GH 23222
+    test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0"))
+    result = test.groupby([0, 0, 0]).nunique()
+    expected = DataFrame([2], index=np.array([0]), columns=test.columns)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_nunique_transform_with_datetime():
+    # GH 35109 - transform with nunique on datetimes results in integers
+    df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"])
+    result = df.groupby([0, 0, 1])["date"].transform("nunique")
+    expected = Series([2, 2, 1], name="date")
+    tm.assert_series_equal(result, expected)
+
+
+def test_empty_categorical(observed):
+    # GH#21334
+    cat = Series([1]).astype("category")
+    ser = cat[:0]
+    gb = ser.groupby(ser, observed=observed)
+    result = gb.nunique()
+    if observed:
+        expected = Series([], index=cat[:0], dtype="int64")
+    else:
+        expected = Series([0], index=cat, dtype="int64")
+    tm.assert_series_equal(result, expected)
+
+
+def test_intercept_builtin_sum():
+    s = Series([1.0, 2.0, np.nan, 3.0])
+    grouped = s.groupby([0, 1, 2, 2])
+
+    # GH#53425
+    result = grouped.agg(builtins.sum)
+    # GH#53425
+    result2 = grouped.apply(builtins.sum)
+    expected = Series([1.0, 2.0, np.nan], index=np.array([0, 1, 2]))
+    tm.assert_series_equal(result, expected)
+    tm.assert_series_equal(result2, expected)
+
+
+@pytest.mark.parametrize("min_count", [0, 10])
+def test_groupby_sum_mincount_boolean(min_count):
+    b = True
+    a = False
+    na = np.nan
+    dfg = pd.array([b, b, na, na, a, a, b], dtype="boolean")
+
+    df = DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": dfg})
+    result = df.groupby("A").sum(min_count=min_count)
+    if min_count == 0:
+        expected = DataFrame(
+            {"B": pd.array([3, 0, 0], dtype="Int64")},
+            index=pd.Index([1, 2, 3], name="A"),
+        )
+        tm.assert_frame_equal(result, expected)
+    else:
+        expected = DataFrame(
+            {"B": pd.array([pd.NA] * 3, dtype="Int64")},
+            index=pd.Index([1, 2, 3], name="A"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_sum_below_mincount_nullable_integer():
+    # https://github.com/pandas-dev/pandas/issues/32861
+    df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
+    grouped = df.groupby("a")
+    idx = pd.Index([0, 1, 2], name="a", dtype="Int64")
+
+    result = grouped["b"].sum(min_count=2)
+    expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")
+    tm.assert_series_equal(result, expected)
+
+    result = grouped.sum(min_count=2)
+    expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_sum_timedelta_with_nat():
+    # GH#42659
+    df = DataFrame(
+        {
+            "a": [1, 1, 2, 2],
+            "b": [pd.Timedelta("1D"), pd.Timedelta("2D"), pd.Timedelta("3D"), pd.NaT],
+        }
+    )
+    td3 = pd.Timedelta(days=3).as_unit("us")
+
+    gb = df.groupby("a")
+
+    res = gb.sum()
+    expected = DataFrame({"b": [td3, td3]}, index=pd.Index([1, 2], name="a"))
+    tm.assert_frame_equal(res, expected)
+
+    res = gb["b"].sum()
+    tm.assert_series_equal(res, expected["b"])
+
+    res = gb["b"].sum(min_count=2)
+    expected = Series([td3, pd.NaT], dtype="m8[us]", name="b", index=expected.index)
+    tm.assert_series_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
+)
+@pytest.mark.parametrize(
+    "method,data",
+    [
+        ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
+        ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
+        ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
+        ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
+        ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
+    ],
+)
+def test_groupby_non_arithmetic_agg_types(dtype, method, data):
+    # GH9311, GH6620
+    df = DataFrame(
+        [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
+    )
+
+    df["b"] = df.b.astype(dtype)
+
+    if "args" not in data:
+        data["args"] = []
+
+    if "out_type" in data:
+        out_type = data["out_type"]
+    else:
+        out_type = dtype
+
+    exp = data["df"]
+    df_out = DataFrame(exp)
+
+    df_out["b"] = df_out.b.astype(out_type)
+    df_out.set_index("a", inplace=True)
+
+    grpd = df.groupby("a")
+    t = getattr(grpd, method)(*data["args"])
+    tm.assert_frame_equal(t, df_out)
+
+
+def scipy_sem(*args, **kwargs):
+    from scipy.stats import sem
+
+    return sem(*args, ddof=1, **kwargs)
+
+
+@pytest.mark.parametrize(
+    "op,targop",
+    [
+        ("mean", np.mean),
+        ("median", np.median),
+        ("std", np.std),
+        ("var", np.var),
+        ("sum", np.sum),
+        ("prod", np.prod),
+        ("min", np.min),
+        ("max", np.max),
+        ("first", lambda x: x.iloc[0]),
+        ("last", lambda x: x.iloc[-1]),
+        ("count", np.size),
+        pytest.param("sem", scipy_sem, marks=td.skip_if_no("scipy")),
+    ],
+)
+def test_ops_general(op, targop):
+    df = DataFrame(np.random.default_rng(2).standard_normal(1000))
+    labels = np.random.default_rng(2).integers(0, 50, size=1000).astype(float)
+
+    result = getattr(df.groupby(labels), op)()
+    kwargs = {"ddof": 1, "axis": 0} if op in ["std", "var"] else {}
+    expected = df.groupby(labels).agg(targop, **kwargs)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values",
+    [
+        {
+            "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+            "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
+        },
+        {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
+    ],
+)
+@pytest.mark.parametrize("function", ["mean", "median", "var"])
+def test_apply_to_nullable_integer_returns_float(values, function):
+    # https://github.com/pandas-dev/pandas/issues/32219
+    output = 0.5 if function == "var" else 1.5
+    arr = np.array([output] * 3, dtype=float)
+    idx = pd.Index([1, 2, 3], name="a", dtype="Int64")
+    expected = DataFrame({"b": arr}, index=idx).astype("Float64")
+
+    groups = DataFrame(values, dtype="Int64").groupby("a")
+
+    result = getattr(groups, function)()
+    tm.assert_frame_equal(result, expected)
+
+    result = groups.agg(function)
+    tm.assert_frame_equal(result, expected)
+
+    result = groups.agg([function])
+    expected.columns = MultiIndex.from_tuples([("b", function)])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "sum",
+        "prod",
+        "min",
+        "max",
+        "median",
+        "mean",
+        "skew",
+        "kurt",
+        "std",
+        "var",
+        "sem",
+    ],
+)
+def test_regression_allowlist_methods(op, skipna, sort):
+    # GH6944
+    # GH 17537
+    # explicitly test the allowlist methods
+    frame = DataFrame([0])
+
+    grouped = frame.groupby(level=0, sort=sort)
+
+    if op in ["skew", "kurt", "sum", "mean"]:
+        # skew, kurt, sum, mean have skipna
+        result = getattr(grouped, op)(skipna=skipna)
+        expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(skipna=skipna))
+        if sort:
+            expected = expected.sort_index()
+        tm.assert_frame_equal(result, expected)
+    else:
+        result = getattr(grouped, op)()
+        expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)())
+        if sort:
+            expected = expected.sort_index()
+        tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_prod_with_int64_dtype():
+    # GH#46573
+    data = [
+        [1, 11],
+        [1, 41],
+        [1, 17],
+        [1, 37],
+        [1, 7],
+        [1, 29],
+        [1, 31],
+        [1, 2],
+        [1, 3],
+        [1, 43],
+        [1, 5],
+        [1, 47],
+        [1, 19],
+        [1, 88],
+    ]
+    df = DataFrame(data, columns=["A", "B"], dtype="int64")
+    result = df.groupby(["A"]).prod().reset_index()
+    expected = DataFrame({"A": [1], "B": [180970905912331920]}, dtype="int64")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_std_datetimelike():
+    # GH#48481
+    tdi = pd.timedelta_range("1 Day", periods=10000, unit="ns")
+    ser = Series(tdi)
+    ser[::5] *= 2  # get different std for different groups
+
+    df = ser.to_frame("A").copy()
+
+    df["B"] = ser + Timestamp(0)
+    df["C"] = ser + Timestamp(0, tz="UTC")
+    df.iloc[-1] = pd.NaT  # last group includes NaTs
+
+    gb = df.groupby(list(range(5)) * 2000)
+
+    result = gb.std()
+
+    # Note: this does not _exactly_ match what we would get if we did
+    # [gb.get_group(i).std() for i in gb.groups]
+    #  but it _does_ match the floating point error we get doing the
+    #  same operation on int64 data xref GH#51332
+    td1 = pd.Timedelta("2887 days 11:21:02.326710176")
+    td4 = pd.Timedelta("2886 days 00:42:34.664668096")
+    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
+    expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_mean_numeric_only_validates_bool():
+    # GH#62778
+
+    df = DataFrame({"A": range(5), "B": range(5)})
+
+    msg = "numeric_only accepts only Boolean values"
+    with pytest.raises(ValueError, match=msg):
+        df.groupby(["A"]).mean(["B"])
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby(["A"]).mean(numeric_only="True")
+
+    with pytest.raises(ValueError, match=msg):
+        df.groupby(["A"]).mean(numeric_only=1)
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
new file mode 100644
index 0000000000000000000000000000000000000000..b60947e61fb23acf3cd0c3314f0dbfe124c22188
--- /dev/null
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -0,0 +1,984 @@
+"""
+test with the TimeGrouper / grouping with datetimes
+"""
+
+from datetime import (
+    datetime,
+    timedelta,
+    timezone,
+)
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+    offsets,
+)
+import pandas._testing as tm
+from pandas.core.groupby.grouper import Grouper
+from pandas.core.groupby.ops import BinGrouper
+
+
+@pytest.fixture
+def frame_for_truncated_bingrouper():
+    """
+    DataFrame used by groupby_with_truncated_bingrouper, made into
+    a separate fixture for easier reuse in
+    test_groupby_apply_timegrouper_with_nat_apply_squeeze
+    """
+    df = DataFrame(
+        {
+            "Quantity": [18, 3, 5, 1, 9, 3],
+            "Date": [
+                Timestamp(2013, 9, 1, 13, 0),
+                Timestamp(2013, 9, 1, 13, 5),
+                Timestamp(2013, 10, 1, 20, 0),
+                Timestamp(2013, 10, 3, 10, 0),
+                pd.NaT,
+                Timestamp(2013, 9, 2, 14, 0),
+            ],
+        }
+    )
+    return df
+
+
+@pytest.fixture
+def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
+    """
+    GroupBy object such that gb._grouper is a BinGrouper and
+    len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq)
+
+    Aggregations on this groupby should have
+
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
+
+    As either the index or an index level.
+    """
+    df = frame_for_truncated_bingrouper
+
+    tdg = Grouper(key="Date", freq="5D")
+    gb = df.groupby(tdg)
+
+    # check we're testing the case we're interested in
+    assert len(gb._grouper.result_index) != len(gb._grouper.codes)
+
+    return gb
+
+
+class TestGroupBy:
+    def test_groupby_with_timegrouper(self, using_infer_string):
+        # GH 4161
+        # TimeGrouper requires a sorted index
+        # also verifies that the resultant index has the correct name
+        df_original = DataFrame(
+            {
+                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
+                "Quantity": [18, 3, 5, 1, 9, 3],
+                "Date": [
+                    datetime(2013, 9, 1, 13, 0),
+                    datetime(2013, 9, 1, 13, 5),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 3, 10, 0),
+                    datetime(2013, 12, 2, 12, 0),
+                    datetime(2013, 9, 2, 14, 0),
+                ],
+            }
+        )
+
+        # GH 6908 change target column's order
+        df_reordered = df_original.sort_values(by="Quantity")
+
+        for df in [df_original, df_reordered]:
+            df = df.set_index(["Date"])
+
+            exp_dti = date_range(
+                "20130901",
+                "20131205",
+                freq="5D",
+                name="Date",
+                inclusive="left",
+                unit=df.index.unit,
+            )
+            expected = DataFrame(
+                {"Buyer": "" if using_infer_string else 0, "Quantity": 0},
+                index=exp_dti,
+            )
+            # Cast to object/str to avoid implicit cast when setting
+            #  entry to "CarlCarlCarl"
+            expected = expected.astype({"Buyer": object})
+            if using_infer_string:
+                expected = expected.astype({"Buyer": "str"})
+            expected.iloc[0, 0] = "CarlCarlCarl"
+            expected.iloc[6, 0] = "CarlCarl"
+            expected.iloc[18, 0] = "Joe"
+            expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")
+
+            result1 = df.resample("5D").sum()
+            tm.assert_frame_equal(result1, expected)
+
+            df_sorted = df.sort_index()
+            result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
+            tm.assert_frame_equal(result2, expected)
+
+            result3 = df.groupby(Grouper(freq="5D")).sum()
+            tm.assert_frame_equal(result3, expected)
+
+    @pytest.mark.parametrize("should_sort", [True, False])
+    def test_groupby_with_timegrouper_methods(self, should_sort):
+        # GH 3881
+        # make sure API of timegrouper conforms
+
+        df = DataFrame(
+            {
+                "Branch": "A A A A A B".split(),
+                "Buyer": "Carl Mark Carl Joe Joe Carl".split(),
+                "Quantity": [1, 3, 5, 8, 9, 3],
+                "Date": [
+                    datetime(2013, 1, 1, 13, 0),
+                    datetime(2013, 1, 1, 13, 5),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 12, 2, 12, 0),
+                    datetime(2013, 12, 2, 14, 0),
+                ],
+            }
+        )
+
+        if should_sort:
+            df = df.sort_values(by="Quantity", ascending=False)
+
+        df = df.set_index("Date", drop=False)
+        g = df.groupby(Grouper(freq="6ME"))
+        assert g.group_keys
+
+        assert isinstance(g._grouper, BinGrouper)
+        groups = g.groups
+        assert isinstance(groups, dict)
+        assert len(groups) == 3
+
+    def test_timegrouper_with_reg_groups(self):
+        # GH 3794
+        # allow combination of timegrouper/reg groups
+
+        df_original = DataFrame(
+            {
+                "Branch": "A A A A A A A B".split(),
+                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
+                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
+                "Date": [
+                    datetime(2013, 1, 1, 13, 0),
+                    datetime(2013, 1, 1, 13, 5),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 12, 2, 12, 0),
+                    datetime(2013, 12, 2, 14, 0),
+                ],
+            }
+        ).set_index("Date")
+
+        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
+
+        for df in [df_original, df_sorted]:
+            expected = DataFrame(
+                {
+                    "Buyer": "Carl Joe Mark".split(),
+                    "Quantity": [10, 18, 3],
+                    "Date": [
+                        datetime(2013, 12, 31, 0, 0),
+                        datetime(2013, 12, 31, 0, 0),
+                        datetime(2013, 12, 31, 0, 0),
+                    ],
+                }
+            ).set_index(["Date", "Buyer"])
+
+            msg = "The default value of numeric_only"
+            result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
+            tm.assert_frame_equal(result, expected)
+
+            expected = DataFrame(
+                {
+                    "Buyer": "Carl Mark Carl Joe".split(),
+                    "Quantity": [1, 3, 9, 18],
+                    "Date": [
+                        datetime(2013, 1, 1, 0, 0),
+                        datetime(2013, 1, 1, 0, 0),
+                        datetime(2013, 7, 1, 0, 0),
+                        datetime(2013, 7, 1, 0, 0),
+                    ],
+                }
+            ).set_index(["Date", "Buyer"])
+            result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
+            tm.assert_frame_equal(result, expected)
+
+        df_original = DataFrame(
+            {
+                "Branch": "A A A A A A A B".split(),
+                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
+                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
+                "Date": [
+                    datetime(2013, 10, 1, 13, 0),
+                    datetime(2013, 10, 1, 13, 5),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 10, 2, 12, 0),
+                    datetime(2013, 10, 2, 14, 0),
+                ],
+            }
+        ).set_index("Date")
+
+        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
+        for df in [df_original, df_sorted]:
+            expected = DataFrame(
+                {
+                    "Buyer": "Carl Joe Mark Carl Joe".split(),
+                    "Quantity": [6, 8, 3, 4, 10],
+                    "Date": [
+                        datetime(2013, 10, 1, 0, 0),
+                        datetime(2013, 10, 1, 0, 0),
+                        datetime(2013, 10, 1, 0, 0),
+                        datetime(2013, 10, 2, 0, 0),
+                        datetime(2013, 10, 2, 0, 0),
+                    ],
+                }
+            ).set_index(["Date", "Buyer"])
+
+            result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
+            tm.assert_frame_equal(result, expected)
+
+            result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
+            expected = DataFrame(
+                {
+                    "Buyer": "Carl Joe Mark".split(),
+                    "Quantity": [10, 18, 3],
+                    "Date": [
+                        datetime(2013, 10, 31, 0, 0),
+                        datetime(2013, 10, 31, 0, 0),
+                        datetime(2013, 10, 31, 0, 0),
+                    ],
+                }
+            ).set_index(["Date", "Buyer"])
+            tm.assert_frame_equal(result, expected)
+
+            # passing the name
+            df = df.reset_index()
+            result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
+                numeric_only=True
+            )
+            tm.assert_frame_equal(result, expected)
+
+            with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
+                df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()
+
+            # passing the level
+            df = df.set_index("Date")
+            result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
+                numeric_only=True
+            )
+            tm.assert_frame_equal(result, expected)
+            result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
+                numeric_only=True
+            )
+            tm.assert_frame_equal(result, expected)
+
+            with pytest.raises(ValueError, match="The level foo is not valid"):
+                df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()
+
+            # multi names
+            df = df.copy()
+            df["Date"] = df.index + offsets.MonthEnd(2)
+            result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
+                numeric_only=True
+            )
+            expected = DataFrame(
+                {
+                    "Buyer": "Carl Joe Mark".split(),
+                    "Quantity": [10, 18, 3],
+                    "Date": [
+                        datetime(2013, 11, 30, 0, 0),
+                        datetime(2013, 11, 30, 0, 0),
+                        datetime(2013, 11, 30, 0, 0),
+                    ],
+                }
+            ).set_index(["Date", "Buyer"])
+            tm.assert_frame_equal(result, expected)
+
+            # error as we have both a level and a name!
+            msg = "The Grouper cannot specify both a key and a level!"
+            with pytest.raises(ValueError, match=msg):
+                df.groupby(
+                    [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
+                ).sum()
+
+            # single groupers
+            expected = DataFrame(
+                [[31]],
+                columns=["Quantity"],
+                index=DatetimeIndex(
+                    [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
+                ),
+            )
+            result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
+            tm.assert_frame_equal(result, expected)
+
+            result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
+            tm.assert_frame_equal(result, expected)
+
+            expected.index = expected.index.shift(1)
+            assert expected.index.freq == offsets.MonthEnd()
+            result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
+            tm.assert_frame_equal(result, expected)
+
+            result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
+                numeric_only=True
+            )
+            tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
+    def test_timegrouper_with_reg_groups_freq(self, freq):
+        # GH 6764 multiple grouping with/without sort
+        df = DataFrame(
+            {
+                "date": pd.to_datetime(
+                    [
+                        "20121002",
+                        "20121007",
+                        "20130130",
+                        "20130202",
+                        "20130305",
+                        "20121002",
+                        "20121207",
+                        "20130130",
+                        "20130202",
+                        "20130305",
+                        "20130202",
+                        "20130305",
+                    ]
+                ),
+                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
+                "whole_cost": [
+                    1790,
+                    364,
+                    280,
+                    259,
+                    201,
+                    623,
+                    90,
+                    312,
+                    359,
+                    301,
+                    359,
+                    801,
+                ],
+                "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
+            }
+        ).set_index("date")
+
+        expected = (
+            df.groupby("user_id")["whole_cost"]
+            .resample(freq)
+            .sum(min_count=1)  # XXX
+            .dropna()
+            .reorder_levels(["date", "user_id"])
+            .sort_index()
+            .astype("int64")
+        )
+        expected.name = "whole_cost"
+
+        result1 = (
+            df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
+        )
+        tm.assert_series_equal(result1, expected)
+
+        result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
+        tm.assert_series_equal(result2, expected)
+
+    def test_timegrouper_get_group(self):
+        # GH 6914
+
+        df_original = DataFrame(
+            {
+                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
+                "Quantity": [18, 3, 5, 1, 9, 3],
+                "Date": [
+                    datetime(2013, 9, 1, 13, 0),
+                    datetime(2013, 9, 1, 13, 5),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 3, 10, 0),
+                    datetime(2013, 12, 2, 12, 0),
+                    datetime(2013, 9, 2, 14, 0),
+                ],
+            }
+        )
+        df_reordered = df_original.sort_values(by="Quantity")
+
+        # single grouping
+        expected_list = [
+            df_original.iloc[[0, 1, 5]],
+            df_original.iloc[[2, 3]],
+            df_original.iloc[[4]],
+        ]
+        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]
+
+        for df in [df_original, df_reordered]:
+            grouped = df.groupby(Grouper(freq="ME", key="Date"))
+            for t, expected in zip(dt_list, expected_list, strict=True):
+                dt = Timestamp(t)
+                result = grouped.get_group(dt)
+                tm.assert_frame_equal(result, expected)
+
+        # multiple grouping
+        expected_list = [
+            df_original.iloc[[1]],
+            df_original.iloc[[3]],
+            df_original.iloc[[4]],
+        ]
+        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]
+
+        for df in [df_original, df_reordered]:
+            grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
+            for (b, t), expected in zip(g_list, expected_list, strict=True):
+                dt = Timestamp(t)
+                result = grouped.get_group((b, dt))
+                tm.assert_frame_equal(result, expected)
+
+        # with index
+        df_original = df_original.set_index("Date")
+        df_reordered = df_original.sort_values(by="Quantity")
+
+        expected_list = [
+            df_original.iloc[[0, 1, 5]],
+            df_original.iloc[[2, 3]],
+            df_original.iloc[[4]],
+        ]
+
+        for df in [df_original, df_reordered]:
+            grouped = df.groupby(Grouper(freq="ME"))
+            for t, expected in zip(dt_list, expected_list, strict=True):
+                dt = Timestamp(t)
+                result = grouped.get_group(dt)
+                tm.assert_frame_equal(result, expected)
+
+    def test_timegrouper_apply_return_type_series(self):
+        # Using `apply` with the `TimeGrouper` should give the
+        # same return type as an `apply` with a `Grouper`.
+        # Issue #11742
+        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
+        df_dt = df.copy()
+        df_dt["date"] = pd.to_datetime(df_dt["date"])
+
+        def sumfunc_series(x):
+            return Series([x["value"].sum()], ("sum",))
+
+        expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
+        result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
+        tm.assert_frame_equal(
+            result.reset_index(drop=True), expected.reset_index(drop=True)
+        )
+
+    def test_timegrouper_apply_return_type_value(self):
+        # Using `apply` with the `TimeGrouper` should give the
+        # same return type as an `apply` with a `Grouper`.
+        # Issue #11742
+        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
+        df_dt = df.copy()
+        df_dt["date"] = pd.to_datetime(df_dt["date"])
+
+        def sumfunc_value(x):
+            return x.value.sum()
+
+        expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
+        result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
+        tm.assert_series_equal(
+            result.reset_index(drop=True), expected.reset_index(drop=True)
+        )
+
+    def test_groupby_groups_datetimeindex(self):
+        # GH#1430
+        periods = 1000
+        ind = date_range(start="2012/1/1", freq="5min", periods=periods)
+        df = DataFrame(
+            {"high": np.arange(periods), "low": np.arange(periods)}, index=ind
+        )
+        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
+
+        # it works!
+        groups = grouped.groups
+        assert isinstance(next(iter(groups.keys())), datetime)
+
+    def test_groupby_groups_datetimeindex2(self):
+        # GH#11442
+        index = date_range("2015/01/01", periods=5, name="date")
+        df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
+        result = df.groupby(level="date").groups
+        dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
+        expected = {
+            Timestamp(date): DatetimeIndex([date], name="date") for date in dates
+        }
+        tm.assert_dict_equal(result, expected)
+
+        grouped = df.groupby(level="date")
+        for date in dates:
+            result = grouped.get_group(date)
+            data = [[df.loc[date, "A"], df.loc[date, "B"]]]
+            expected_index = DatetimeIndex(
+                [date], name="date", freq="D", dtype=index.dtype
+            )
+            expected = DataFrame(data, columns=list("AB"), index=expected_index)
+            tm.assert_frame_equal(result, expected)
+
+    def test_groupby_groups_datetimeindex_tz(self):
+        # GH 3950
+        dates = [
+            "2011-07-19 07:00:00",
+            "2011-07-19 08:00:00",
+            "2011-07-19 09:00:00",
+            "2011-07-19 07:00:00",
+            "2011-07-19 08:00:00",
+            "2011-07-19 09:00:00",
+        ]
+        df = DataFrame(
+            {
+                "label": ["a", "a", "a", "b", "b", "b"],
+                "datetime": dates,
+                "value1": np.arange(6, dtype="int64"),
+                "value2": [1, 2] * 3,
+            }
+        )
+        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))
+
+        exp_idx1 = DatetimeIndex(
+            [
+                "2011-07-19 07:00:00",
+                "2011-07-19 07:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 09:00:00",
+                "2011-07-19 09:00:00",
+            ],
+            tz="US/Pacific",
+            name="datetime",
+        )
+        exp_idx2 = Index(["a", "b"] * 3, name="label")
+        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
+        expected = DataFrame(
+            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
+            index=exp_idx,
+            columns=["value1", "value2"],
+        )
+
+        result = df.groupby(["datetime", "label"]).sum()
+        tm.assert_frame_equal(result, expected)
+
+        # by level
+        didx = DatetimeIndex(dates, tz="Asia/Tokyo")
+        df = DataFrame(
+            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
+            index=didx,
+        )
+
+        exp_idx = DatetimeIndex(
+            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
+            tz="Asia/Tokyo",
+        )
+        expected = DataFrame(
+            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
+            index=exp_idx,
+            columns=["value1", "value2"],
+        )
+
+        result = df.groupby(level=0).sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_datetime64_handling_groupby(self):
+        # it works!
+        df = DataFrame(
+            [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
+            columns=["a", "date"],
+        )
+        result = df.groupby("a").first()
+        assert result["date"][3] == Timestamp("2012-07-03")
+
+    def test_groupby_multi_timezone(self):
+        # combining multiple / different timezones yields UTC
+        df = DataFrame(
+            {
+                "value": range(5),
+                "date": [
+                    "2000-01-28 16:47:00",
+                    "2000-01-29 16:48:00",
+                    "2000-01-30 16:49:00",
+                    "2000-01-31 16:50:00",
+                    "2000-01-01 16:50:00",
+                ],
+                "tz": [
+                    "America/Chicago",
+                    "America/Chicago",
+                    "America/Los_Angeles",
+                    "America/Chicago",
+                    "America/New_York",
+                ],
+            }
+        )
+
+        result = df.groupby("tz", group_keys=False).date.apply(
+            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
+        )
+
+        expected = Series(
+            [
+                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
+                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
+                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
+                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
+                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
+            ],
+            name="date",
+            dtype=object,
+        )
+        tm.assert_series_equal(result, expected)
+
+        tz = "America/Chicago"
+        res_values = df.groupby("tz").date.get_group(tz)
+        result = pd.to_datetime(res_values).dt.tz_localize(tz)
+        exp_values = Series(
+            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
+            index=[0, 1, 3],
+            name="date",
+        )
+        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_groups_periods(self):
+        dates = [
+            "2011-07-19 07:00:00",
+            "2011-07-19 08:00:00",
+            "2011-07-19 09:00:00",
+            "2011-07-19 07:00:00",
+            "2011-07-19 08:00:00",
+            "2011-07-19 09:00:00",
+        ]
+        df = DataFrame(
+            {
+                "label": ["a", "a", "a", "b", "b", "b"],
+                "period": [pd.Period(d, freq="h") for d in dates],
+                "value1": np.arange(6, dtype="int64"),
+                "value2": [1, 2] * 3,
+            }
+        )
+
+        exp_idx1 = pd.PeriodIndex(
+            [
+                "2011-07-19 07:00:00",
+                "2011-07-19 07:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 09:00:00",
+                "2011-07-19 09:00:00",
+            ],
+            freq="h",
+            name="period",
+        )
+        exp_idx2 = Index(["a", "b"] * 3, name="label")
+        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
+        expected = DataFrame(
+            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
+            index=exp_idx,
+            columns=["value1", "value2"],
+        )
+
+        result = df.groupby(["period", "label"]).sum()
+        tm.assert_frame_equal(result, expected)
+
+        # by level
+        didx = pd.PeriodIndex(dates, freq="h")
+        df = DataFrame(
+            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
+            index=didx,
+        )
+
+        exp_idx = pd.PeriodIndex(
+            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
+            freq="h",
+        )
+        expected = DataFrame(
+            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
+            index=exp_idx,
+            columns=["value1", "value2"],
+        )
+
+        result = df.groupby(level=0).sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_first_datetime64(self):
+        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
+        df[1] = df[1].astype("M8[ns]")
+
+        assert issubclass(df[1].dtype.type, np.datetime64)
+
+        result = df.groupby(level=0).first()
+        got_dt = result[1].dtype
+        assert issubclass(got_dt.type, np.datetime64)
+
+        result = df[1].groupby(level=0).first()
+        got_dt = result.dtype
+        assert issubclass(got_dt.type, np.datetime64)
+
+    def test_groupby_max_datetime64(self):
+        # GH 5869
+        # datetimelike dtype conversion from int
+        df = DataFrame({"A": Timestamp("20130101").as_unit("s"), "B": np.arange(5)})
+        # TODO: can we retain second reso in .apply here?
+        expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
+        result = df.groupby("A")["A"].max()
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_datetime64_32_bit(self):
+        # GH 6410 / numpy 4328
+        # 32-bit under 1.9-dev indexing issue
+
+        df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
+        result = df.groupby("A")["B"].transform("min")
+        expected = Series([Timestamp("2000-01-1")] * 2, name="B")
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_with_timezone_selection(self):
+        # GH 11616
+        # Test that column selection returns output in correct timezone.
+
+        df = DataFrame(
+            {
+                "factor": np.random.default_rng(2).integers(0, 3, size=60),
+                "time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
+            }
+        )
+        df1 = df.groupby("factor").max()["time"]
+        df2 = df.groupby("factor")["time"].max()
+        tm.assert_series_equal(df1, df2)
+
+    def test_timezone_info(self):
+        # see gh-11682: Timezone info lost when broadcasting
+        # scalar datetime to DataFrame
+        utc = timezone.utc
+        df = DataFrame({"a": [1], "b": [datetime.now(utc)]})
+        assert df["b"][0].tzinfo == utc
+        df = DataFrame({"a": [1, 2, 3]})
+        df["b"] = datetime.now(utc)
+        assert df["b"][0].tzinfo == utc
+
+    def test_datetime_count(self):
+        df = DataFrame(
+            {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
+        )
+        result = df.groupby("a").dates.count()
+        expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
+        tm.assert_series_equal(result, expected)
+
+    def test_first_last_max_min_on_time_data(self):
+        # GH 10295
+        # Verify that NaT is not in the result of max, min, first and last on
+        # Dataframe with datetime or timedelta values.
+        df_test = DataFrame(
+            {
+                "dt": [
+                    np.nan,
+                    "2015-07-24 10:10",
+                    "2015-07-25 11:11",
+                    "2015-07-23 12:12",
+                    np.nan,
+                ],
+                "td": [
+                    np.nan,
+                    timedelta(days=1),
+                    timedelta(days=2),
+                    timedelta(days=3),
+                    np.nan,
+                ],
+            }
+        )
+        df_test.dt = pd.to_datetime(df_test.dt)
+        df_test["group"] = "A"
+        df_ref = df_test[df_test.dt.notna()]
+
+        grouped_test = df_test.groupby("group")
+        grouped_ref = df_ref.groupby("group")
+
+        tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
+        tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
+        tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
+        tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())
+
+    def test_nunique_with_timegrouper_and_nat(self):
+        # GH 17575
+        test = DataFrame(
+            {
+                "time": [
+                    Timestamp("2016-06-28 09:35:35"),
+                    pd.NaT,
+                    Timestamp("2016-06-28 16:46:28"),
+                ],
+                "data": ["1", "2", "3"],
+            }
+        )
+
+        grouper = Grouper(key="time", freq="h")
+        result = test.groupby(grouper)["data"].nunique()
+        expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
+        expected.index = expected.index._with_freq(None)
+        tm.assert_series_equal(result, expected)
+
+    def test_scalar_call_versus_list_call(self):
+        # Issue: 17530
+        data_frame = {
+            "location": ["shanghai", "beijing", "shanghai"],
+            "time": Series(
+                ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
+                dtype="datetime64[ns]",
+            ),
+            "value": [1, 2, 3],
+        }
+        data_frame = DataFrame(data_frame).set_index("time")
+        grouper = Grouper(freq="D")
+
+        grouped = data_frame.groupby(grouper)
+        result = grouped.count()
+        grouped = data_frame.groupby([grouper])
+        expected = grouped.count()
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_grouper_period_index(self):
+        # GH 32108
+        periods = 2
+        index = pd.period_range(
+            start="2018-01", periods=periods, freq="M", name="Month"
+        )
+        period_series = Series(range(periods), index=index)
+        result = period_series.groupby(period_series.index.month).sum()
+
+        expected = Series(
+            range(periods), index=Index(range(1, periods + 1), name=index.name)
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_apply_timegrouper_with_nat_dict_returns(
+        self, groupby_with_truncated_bingrouper
+    ):
+        # GH#43500 case where gb._grouper.result_index and gb._grouper.group_keys_seq
+        #  have different lengths that goes through the `isinstance(values[0], dict)`
+        #  path
+        gb = groupby_with_truncated_bingrouper
+
+        res = gb["Quantity"].apply(lambda x: {"foo": len(x)})
+
+        df = gb.obj
+        unit = df["Date"]._values.unit
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
+        mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
+        expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
+        tm.assert_series_equal(res, expected)
+
+    def test_groupby_apply_timegrouper_with_nat_scalar_returns(
+        self, groupby_with_truncated_bingrouper
+    ):
+        # GH#43500 Previously raised ValueError bc used index with incorrect
+        #  length in wrap_applied_result
+        gb = groupby_with_truncated_bingrouper
+
+        res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)
+
+        df = gb.obj
+        unit = df["Date"]._values.unit
+        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
+        expected = Series(
+            [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
+            index=dti._with_freq(None),
+            name="Quantity",
+        )
+
+        tm.assert_series_equal(res, expected)
+
+    def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
+        self, frame_for_truncated_bingrouper
+    ):
+        df = frame_for_truncated_bingrouper
+
+        # We need to create a GroupBy object with only one non-NaT group,
+        #  so use a huge freq so that all non-NaT dates will be grouped together
+        tdg = Grouper(key="Date", freq="100YE")
+        gb = df.groupby(tdg)
+
+        # check that we will go through the singular_series path
+        #  in _wrap_applied_output_series
+        assert gb.ngroups == 1
+        assert gb._selected_obj.index.nlevels == 1
+
+        # function that returns a Series
+        res = gb.apply(lambda x: x["Quantity"] * 2)
+
+        dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
+        expected = DataFrame(
+            [[36, 6, 6, 10, 2]],
+            index=dti,
+            columns=Index([0, 1, 5, 2, 3], name="Quantity"),
+        )
+        tm.assert_frame_equal(res, expected)
+
+    @pytest.mark.single_cpu
+    def test_groupby_agg_numba_timegrouper_with_nat(
+        self, groupby_with_truncated_bingrouper
+    ):
+        pytest.importorskip("numba")
+
+        # See discussion in GH#43487
+        gb = groupby_with_truncated_bingrouper
+
+        result = gb["Quantity"].aggregate(
+            lambda values, index: np.nanmean(values), engine="numba"
+        )
+
+        expected = gb["Quantity"].aggregate("mean")
+        tm.assert_series_equal(result, expected)
+
+        result_df = gb[["Quantity"]].aggregate(
+            lambda values, index: np.nanmean(values), engine="numba"
+        )
+        expected_df = gb[["Quantity"]].aggregate("mean")
+        tm.assert_frame_equal(result_df, expected_df)
+
+    @td.skip_if_no("pyarrow")
+    def test_pyarrow_index_retention(self):
+        # https://github.com/pandas-dev/pandas/issues/63518
+        df = DataFrame(
+            {
+                "a": [1, 2, 3],
+            },
+            index=Index(
+                [
+                    Timestamp("2013-01-01"),
+                    Timestamp("2013-01-01"),
+                    Timestamp("2013-01-02"),
+                ],
+                dtype="timestamp[ns, America/Denver][pyarrow]",
+            ),
+        )
+        gb = df.groupby(Grouper(freq="D"))
+        result = gb._grouper.result_index
+        expected = Index(
+            [Timestamp("2013-01-01"), Timestamp("2013-01-02")],
+            dtype="timestamp[ns, America/Denver][pyarrow]",
+        )
+        tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/interchange/__init__.py b/pandas/tests/interchange/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e6fa4e2c83e1e539a638044684c0f9f4060d2b5
--- /dev/null
+++ b/pandas/tests/interchange/test_impl.py
@@ -0,0 +1,699 @@
+from datetime import (
+    datetime,
+    timezone,
+)
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import iNaT
+from pandas.compat import (
+    is_ci_environment,
+    is_platform_windows,
+)
+from pandas.compat.pyarrow import pa_version_under22p0
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.interchange.column import PandasColumn
+from pandas.core.interchange.dataframe_protocol import (
+    ColumnNullType,
+    DtypeKind,
+)
+from pandas.core.interchange.from_dataframe import from_dataframe
+from pandas.core.interchange.utils import ArrowCTypes
+
+
+@pytest.mark.parametrize("data", [("ordered", True), ("unordered", False)])
+def test_categorical_dtype(data):
+    data_categorical = {
+        "ordered": pd.Categorical(list("testdata") * 30, ordered=True),
+        "unordered": pd.Categorical(list("testdata") * 30, ordered=False),
+    }
+    df = pd.DataFrame({"A": (data_categorical[data[0]])})
+
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
+    assert col.dtype[0] == DtypeKind.CATEGORICAL
+    assert col.null_count == 0
+    assert col.describe_null == (ColumnNullType.USE_SENTINEL, -1)
+    assert col.num_chunks() == 1
+    desc_cat = col.describe_categorical
+    assert desc_cat["is_ordered"] == data[1]
+    assert desc_cat["is_dictionary"] is True
+    assert isinstance(desc_cat["categories"], PandasColumn)
+    tm.assert_series_equal(
+        desc_cat["categories"]._col, pd.Series(["a", "d", "e", "s", "t"])
+    )
+
+    with tm.assert_produces_warning(match="Interchange"):
+        tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+
+
+def test_categorical_pyarrow():
+    # GH 49889
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]
+    table = pa.table({"weekday": pa.array(arr).dictionary_encode()})
+    exchange_df = table.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
+    weekday = pd.Categorical(
+        arr, categories=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
+    )
+    expected = pd.DataFrame({"weekday": weekday})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.filterwarnings(
+    "ignore:Constructing a Categorical with a dtype and values containing"
+)
+def test_empty_categorical_pyarrow():
+    # https://github.com/pandas-dev/pandas/issues/53077
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = [None]
+    table = pa.table({"arr": pa.array(arr, "float64").dictionary_encode()})
+    exchange_df = table.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pd.api.interchange.from_dataframe(exchange_df)
+    expected = pd.DataFrame({"arr": pd.Categorical([np.nan])})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_large_string_pyarrow():
+    # GH 52795
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = ["Mon", "Tue"]
+    table = pa.table({"weekday": pa.array(arr, "large_string")})
+    exchange_df = table.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
+    expected = pd.DataFrame({"weekday": ["Mon", "Tue"]})
+    tm.assert_frame_equal(result, expected)
+
+    # check round-trip
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+
+
+@pytest.mark.parametrize(
+    ("offset", "length", "expected_values"),
+    [
+        (0, None, [3.3, float("nan"), 2.1]),
+        (1, None, [float("nan"), 2.1]),
+        (2, None, [2.1]),
+        (0, 2, [3.3, float("nan")]),
+        (0, 1, [3.3]),
+        (1, 1, [float("nan")]),
+    ],
+)
+def test_bitmasks_pyarrow(offset, length, expected_values):
+    # GH 52795
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+
+    arr = [3.3, None, 2.1]
+    table = pa.table({"arr": arr}).slice(offset, length)
+    exchange_df = table.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
+    expected = pd.DataFrame({"arr": expected_values})
+    tm.assert_frame_equal(result, expected)
+
+    # check round-trip
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        assert pa.Table.equals(pa.interchange.from_dataframe(result), table)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        lambda: np.random.default_rng(2).integers(-100, 100),
+        lambda: np.random.default_rng(2).integers(1, 100),
+        lambda: np.random.default_rng(2).random(),
+        lambda: np.random.default_rng(2).choice([True, False]),
+        lambda: datetime(
+            year=np.random.default_rng(2).integers(1900, 2100),
+            month=np.random.default_rng(2).integers(1, 12),
+            day=np.random.default_rng(2).integers(1, 20),
+        ),
+    ],
+)
+def test_dataframe(data):
+    NCOLS, NROWS = 10, 20
+    data = {
+        f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [data() for _ in range(NROWS)]
+        for i in range(NCOLS)
+    }
+    df = pd.DataFrame(data)
+
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
+
+    assert df2.num_columns() == NCOLS
+    assert df2.num_rows() == NROWS
+
+    assert list(df2.column_names()) == list(data.keys())
+
+    indices = (0, 2)
+    names = tuple(list(data.keys())[idx] for idx in indices)
+
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(df2.select_columns(indices))
+        expected = from_dataframe(df2.select_columns_by_name(names))
+    tm.assert_frame_equal(result, expected)
+
+    assert isinstance(result.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
+    assert isinstance(expected.attrs["_INTERCHANGE_PROTOCOL_BUFFERS"], list)
+
+
+def test_missing_from_masked():
+    df = pd.DataFrame(
+        {
+            "x": np.array([1.0, 2.0, 3.0, 4.0, 0.0]),
+            "y": np.array([1.5, 2.5, 3.5, 4.5, 0]),
+            "z": np.array([1.0, 0.0, 1.0, 1.0, 1.0]),
+        }
+    )
+
+    rng = np.random.default_rng(2)
+    dict_null = {col: rng.integers(low=0, high=len(df)) for col in df.columns}
+    for col, num_nulls in dict_null.items():
+        null_idx = df.index[
+            rng.choice(np.arange(len(df)), size=num_nulls, replace=False)
+        ]
+        df.loc[null_idx, col] = None
+
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
+
+    assert df2.get_column_by_name("x").null_count == dict_null["x"]
+    assert df2.get_column_by_name("y").null_count == dict_null["y"]
+    assert df2.get_column_by_name("z").null_count == dict_null["z"]
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        {"x": [1.5, 2.5, 3.5], "y": [9.2, 10.5, 11.8]},
+        {"x": [1, 2, 0], "y": [9.2, 10.5, 11.8]},
+        {
+            "x": np.array([True, True, False]),
+            "y": np.array([1, 2, 0]),
+            "z": np.array([9.2, 10.5, 11.8]),
+        },
+    ],
+)
+def test_mixed_data(data):
+    df = pd.DataFrame(data)
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
+
+    for col_name in df.columns:
+        assert df2.get_column_by_name(col_name).null_count == 0
+
+
+def test_mixed_missing():
+    df = pd.DataFrame(
+        {
+            "x": np.array([True, None, False, None, True]),
+            "y": np.array([None, 2, None, 1, 2]),
+            "z": np.array([9.2, 10.5, None, 11.8, None]),
+        }
+    )
+
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
+
+    for col_name in df.columns:
+        assert df2.get_column_by_name(col_name).null_count == 2
+
+
+def test_string():
+    string_data = {
+        "separator data": [
+            "abC|DeF,Hik",
+            "234,3245.67",
+            "gSaf,qWer|Gre",
+            "asd3,4sad|",
+            np.nan,
+        ]
+    }
+    test_str_data = string_data["separator data"] + [""]
+    df = pd.DataFrame({"A": test_str_data})
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
+
+    assert col.size() == 6
+    assert col.null_count == 1
+    assert col.dtype[0] == DtypeKind.STRING
+    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
+
+    df_sliced = df[1:]
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df_sliced.__dataframe__().get_column_by_name("A")
+    assert col.size() == 5
+    assert col.null_count == 1
+    assert col.dtype[0] == DtypeKind.STRING
+    assert col.describe_null == (ColumnNullType.USE_BYTEMASK, 0)
+
+
+def test_nonstring_object():
+    df = pd.DataFrame({"A": ["a", 10, 1.0, ()]})
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
+    with pytest.raises(NotImplementedError, match="not supported yet"):
+        col.dtype
+
+
+def test_datetime():
+    df = pd.DataFrame({"A": [pd.Timestamp("2022-01-01"), pd.NaT]})
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
+
+    assert col.size() == 2
+    assert col.null_count == 1
+    assert col.dtype[0] == DtypeKind.DATETIME
+    assert col.describe_null == (ColumnNullType.USE_SENTINEL, iNaT)
+
+    with tm.assert_produces_warning(match="Interchange"):
+        tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+
+
+def test_categorical_to_numpy_dlpack():
+    # https://github.com/pandas-dev/pandas/issues/48393
+    df = pd.DataFrame({"A": pd.Categorical(["a", "b", "a"])})
+    with tm.assert_produces_warning(match="Interchange"):
+        col = df.__dataframe__().get_column_by_name("A")
+    result = np.from_dlpack(col.get_buffers()["data"][0])
+    expected = np.array([0, 1, 0], dtype="int8")
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [{}, {"a": []}])
+def test_empty_pyarrow(data):
+    # GH 53155
+    pytest.importorskip("pyarrow", "14.0.0")
+    from pyarrow.interchange import from_dataframe as pa_from_dataframe
+
+    expected = pd.DataFrame(data)
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        arrow_df = pa_from_dataframe(expected)
+    result = from_dataframe(arrow_df)
+    tm.assert_frame_equal(result, expected, check_column_type=False)
+
+
+def test_multi_chunk_pyarrow() -> None:
+    pa = pytest.importorskip("pyarrow", "14.0.0")
+    n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+    names = ["n_legs"]
+    table = pa.table([n_legs], names=names)
+    with pytest.raises(
+        RuntimeError,
+        match="Cannot do zero copy conversion into multi-column DataFrame block",
+    ):
+        pd.api.interchange.from_dataframe(table, allow_copy=False)
+
+
+def test_multi_chunk_column() -> None:
+    pytest.importorskip("pyarrow", "11.0.0")
+    ser = pd.Series([1, 2, None], dtype="Int64[pyarrow]")
+    df = pd.concat([ser, ser], ignore_index=True).to_frame("a")
+    df_orig = df.copy()
+
+    with tm.assert_produces_warning(match="Interchange"):
+        with pytest.raises(
+            RuntimeError,
+            match="Found multi-chunk pyarrow array, but `allow_copy` is False",
+        ):
+            pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=False))
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pd.api.interchange.from_dataframe(df.__dataframe__(allow_copy=True))
+    # Interchange protocol defaults to creating numpy-backed columns, so currently this
+    # is 'float64'.
+    expected = pd.DataFrame({"a": [1.0, 2.0, None, 1.0, 2.0, None]}, dtype="float64")
+    tm.assert_frame_equal(result, expected)
+
+    # Check that the rechunking we did didn't modify the original DataFrame.
+    tm.assert_frame_equal(df, df_orig)
+    assert len(df["a"].array._pa_array.chunks) == 2
+    assert len(df_orig["a"].array._pa_array.chunks) == 2
+
+
+def test_timestamp_ns_pyarrow():
+    # GH 56712
+    pytest.importorskip("pyarrow", "11.0.0")
+    timestamp_args = {
+        "year": 2000,
+        "month": 1,
+        "day": 1,
+        "hour": 1,
+        "minute": 1,
+        "second": 1,
+    }
+    df = pd.Series(
+        [datetime(**timestamp_args)],
+        dtype="timestamp[ns][pyarrow]",
+        name="col0",
+    ).to_frame()
+
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+        result = pd.api.interchange.from_dataframe(dfi)["col0"].item()
+
+    expected = pd.Timestamp(**timestamp_args)
+    assert result == expected
+
+
+@pytest.mark.parametrize("tz", ["UTC", "US/Pacific"])
+def test_datetimetzdtype(tz, unit):
+    # GH 54239
+    tz_data = (
+        pd.date_range("2018-01-01", periods=5, freq="D").tz_localize(tz).as_unit(unit)
+    )
+    df = pd.DataFrame({"ts_tz": tz_data})
+    with tm.assert_produces_warning(match="Interchange"):
+        tm.assert_frame_equal(df, from_dataframe(df.__dataframe__()))
+
+
+def test_interchange_from_non_pandas_tz_aware(request):
+    # GH 54239, 54287
+    pa = pytest.importorskip("pyarrow", "11.0.0")
+    import pyarrow.compute as pc
+
+    if is_platform_windows() and is_ci_environment() and pa_version_under22p0:
+        mark = pytest.mark.xfail(
+            raises=pa.ArrowInvalid,
+            reason=(
+                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
+                "on CI to path to the tzdata for pyarrow."
+            ),
+        )
+        request.applymarker(mark)
+
+    arr = pa.array([datetime(2020, 1, 1), None, datetime(2020, 1, 2)])
+    arr = pc.assume_timezone(arr, "Asia/Kathmandu")
+    table = pa.table({"arr": arr})
+    exchange_df = table.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        result = from_dataframe(exchange_df)
+
+    expected = pd.DataFrame(
+        ["2020-01-01 00:00:00+05:45", "NaT", "2020-01-02 00:00:00+05:45"],
+        columns=["arr"],
+        dtype="datetime64[us, Asia/Kathmandu]",
+    )
+    tm.assert_frame_equal(expected, result)
+
+
+def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None:
+    # https://github.com/pandas-dev/pandas/issues/54781
+    with tm.assert_produces_warning(match="Interchange"):
+        df = pd.DataFrame({"a": ["foo", "bar"]}).__dataframe__()
+        interchange = df.__dataframe__()
+    column = interchange.get_column_by_name("a")
+    buffers = column.get_buffers()
+    buffers_data = buffers["data"]
+    buffer_dtype = buffers_data[1]
+    buffer_dtype = (
+        DtypeKind.UINT,
+        8,
+        ArrowCTypes.UINT8,
+        buffer_dtype[3],
+    )
+    buffers["data"] = (buffers_data[0], buffer_dtype)
+    column.get_buffers = lambda: buffers
+    interchange.get_column_by_name = lambda _: column
+    monkeypatch.setattr(df, "__dataframe__", lambda allow_copy: interchange)
+    with tm.assert_produces_warning(match="Interchange"):
+        pd.api.interchange.from_dataframe(df)
+
+
+def test_empty_string_column():
+    # https://github.com/pandas-dev/pandas/issues/56703
+    df = pd.DataFrame({"a": []}, dtype=str)
+    with tm.assert_produces_warning(match="Interchange"):
+        df2 = df.__dataframe__()
+        result = pd.api.interchange.from_dataframe(df2)
+    tm.assert_frame_equal(df, result)
+
+
+def test_large_string():
+    # GH#56702
+    pytest.importorskip("pyarrow")
+    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
+    # Don't check stacklevel as PyArrow calls the deprecated `__dataframe__` method.
+    with tm.assert_produces_warning(match="Interchange", check_stacklevel=False):
+        result = pd.api.interchange.from_dataframe(df.__dataframe__())
+    expected = pd.DataFrame({"a": ["x"]}, dtype="str")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_non_str_names():
+    # https://github.com/pandas-dev/pandas/issues/56701
+    df = pd.Series([1, 2, 3], name=0).to_frame()
+    with tm.assert_produces_warning(match="Interchange"):
+        names = df.__dataframe__().column_names()
+    assert names == ["0"]
+
+
+def test_non_str_names_w_duplicates():
+    # https://github.com/pandas-dev/pandas/issues/56701
+    df = pd.DataFrame({"0": [1, 2, 3], 0: [4, 5, 6]})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+    with tm.assert_produces_warning(match="Interchange"):
+        with pytest.raises(
+            TypeError,
+            match=(
+                "Expected a Series, got a DataFrame. This likely happened because you "
+                "called __dataframe__ on a DataFrame which, after converting column "
+                r"names to string, resulted in duplicated names: Index\(\['0', '0'\], "
+                r"dtype='(str|object)'\). Please rename these columns before using the "
+                "interchange protocol."
+            ),
+        ):
+            pd.api.interchange.from_dataframe(dfi, allow_copy=False)
+
+
+@pytest.mark.parametrize(
+    ("data", "dtype", "expected_dtype"),
+    [
+        ([1, 2, None], "Int64", "int64"),
+        ([1, 2, None], "Int64[pyarrow]", "int64"),
+        ([1, 2, None], "Int8", "int8"),
+        ([1, 2, None], "Int8[pyarrow]", "int8"),
+        (
+            [1, 2, None],
+            "UInt64",
+            "uint64",
+        ),
+        (
+            [1, 2, None],
+            "UInt64[pyarrow]",
+            "uint64",
+        ),
+        ([1.0, 2.25, None], "Float32", "float32"),
+        ([1.0, 2.25, None], "Float32[pyarrow]", "float32"),
+        ([True, False, None], "boolean", "bool"),
+        ([True, False, None], "boolean[pyarrow]", "bool"),
+        (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"),
+        (["much ado", "about", None], "string[pyarrow]", "large_string"),
+        (
+            [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
+            "timestamp[ns][pyarrow]",
+            "timestamp[ns]",
+        ),
+        (
+            [datetime(2020, 1, 1), datetime(2020, 1, 2), None],
+            "timestamp[us][pyarrow]",
+            "timestamp[us]",
+        ),
+        (
+            [
+                datetime(2020, 1, 1, tzinfo=timezone.utc),
+                datetime(2020, 1, 2, tzinfo=timezone.utc),
+                None,
+            ],
+            "timestamp[us, Asia/Kathmandu][pyarrow]",
+            "timestamp[us, tz=Asia/Kathmandu]",
+        ),
+    ],
+)
+def test_pandas_nullable_with_missing_values(
+    data: list, dtype: str, expected_dtype: str
+) -> None:
+    # https://github.com/pandas-dev/pandas/issues/57643
+    # https://github.com/pandas-dev/pandas/issues/57664
+    pa = pytest.importorskip("pyarrow", "14.0.0")
+    import pyarrow.interchange as pai
+
+    if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
+        expected_dtype = pa.timestamp("us", "Asia/Kathmandu")
+
+    df = pd.DataFrame({"a": data}, dtype=dtype)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pai.from_dataframe(df.__dataframe__())["a"]
+    assert result.type == expected_dtype
+    assert result[0].as_py() == data[0]
+    assert result[1].as_py() == data[1]
+    assert result[2].as_py() is None
+
+
+@pytest.mark.parametrize(
+    ("data", "dtype", "expected_dtype"),
+    [
+        ([1, 2, 3], "Int64", "int64"),
+        ([1, 2, 3], "Int64[pyarrow]", "int64"),
+        ([1, 2, 3], "Int8", "int8"),
+        ([1, 2, 3], "Int8[pyarrow]", "int8"),
+        (
+            [1, 2, 3],
+            "UInt64",
+            "uint64",
+        ),
+        (
+            [1, 2, 3],
+            "UInt64[pyarrow]",
+            "uint64",
+        ),
+        ([1.0, 2.25, 5.0], "Float32", "float32"),
+        ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"),
+        ([True, False, False], "boolean", "bool"),
+        ([True, False, False], "boolean[pyarrow]", "bool"),
+        (
+            ["much ado", "about", "nothing"],
+            pd.StringDtype(na_value=np.nan),
+            "large_string",
+        ),
+        (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"),
+        (
+            [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
+            "timestamp[ns][pyarrow]",
+            "timestamp[ns]",
+        ),
+        (
+            [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)],
+            "timestamp[us][pyarrow]",
+            "timestamp[us]",
+        ),
+        (
+            [
+                datetime(2020, 1, 1, tzinfo=timezone.utc),
+                datetime(2020, 1, 2, tzinfo=timezone.utc),
+                datetime(2020, 1, 3, tzinfo=timezone.utc),
+            ],
+            "timestamp[us, Asia/Kathmandu][pyarrow]",
+            "timestamp[us, tz=Asia/Kathmandu]",
+        ),
+    ],
+)
+def test_pandas_nullable_without_missing_values(
+    data: list, dtype: str, expected_dtype: str
+) -> None:
+    # https://github.com/pandas-dev/pandas/issues/57643
+    pa = pytest.importorskip("pyarrow", "14.0.0")
+    import pyarrow.interchange as pai
+
+    if expected_dtype == "timestamp[us, tz=Asia/Kathmandu]":
+        expected_dtype = pa.timestamp("us", "Asia/Kathmandu")
+
+    df = pd.DataFrame({"a": data}, dtype=dtype)
+    with tm.assert_produces_warning(match="Interchange"):
+        result = pai.from_dataframe(df.__dataframe__())["a"]
+    assert result.type == expected_dtype
+    assert result[0].as_py() == data[0]
+    assert result[1].as_py() == data[1]
+    assert result[2].as_py() == data[2]
+
+
+def test_string_validity_buffer() -> None:
+    # https://github.com/pandas-dev/pandas/issues/57761
+    pytest.importorskip("pyarrow", "11.0.0")
+    df = pd.DataFrame({"a": ["x"]}, dtype="large_string[pyarrow]")
+    with tm.assert_produces_warning(match="Interchange"):
+        result = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
+    assert result is None
+
+
+def test_string_validity_buffer_no_missing() -> None:
+    # https://github.com/pandas-dev/pandas/issues/57762
+    pytest.importorskip("pyarrow", "11.0.0")
+    df = pd.DataFrame({"a": ["x", None]}, dtype="large_string[pyarrow]")
+    with tm.assert_produces_warning(match="Interchange"):
+        validity = df.__dataframe__().get_column_by_name("a").get_buffers()["validity"]
+    assert validity is not None
+    result = validity[1]
+    expected = (DtypeKind.BOOL, 1, ArrowCTypes.BOOL, "=")
+    assert result == expected
+
+
+def test_empty_dataframe():
+    # https://github.com/pandas-dev/pandas/issues/56700
+    df = pd.DataFrame({"a": []}, dtype="int8")
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+        result = pd.api.interchange.from_dataframe(dfi, allow_copy=False)
+    expected = pd.DataFrame({"a": []}, dtype="int8")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("data", "expected_dtype", "expected_buffer_dtype"),
+    [
+        (
+            pd.Series(["a", "b", "a"], dtype="category"),
+            (DtypeKind.CATEGORICAL, 8, "c", "="),
+            (DtypeKind.INT, 8, "c", "|"),
+        ),
+        (
+            pd.Series(
+                [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)],
+                dtype="M8[ns]",
+            ),
+            (DtypeKind.DATETIME, 64, "tsn:", "="),
+            (DtypeKind.INT, 64, ArrowCTypes.INT64, "="),
+        ),
+        (
+            pd.Series(["a", "bc", None]),
+            (DtypeKind.STRING, 8, ArrowCTypes.STRING, "="),
+            (DtypeKind.UINT, 8, ArrowCTypes.UINT8, "="),
+        ),
+        (
+            pd.Series([1, 2, 3]),
+            (DtypeKind.INT, 64, ArrowCTypes.INT64, "="),
+            (DtypeKind.INT, 64, ArrowCTypes.INT64, "="),
+        ),
+        (
+            pd.Series([1.5, 2, 3]),
+            (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, "="),
+            (DtypeKind.FLOAT, 64, ArrowCTypes.FLOAT64, "="),
+        ),
+    ],
+)
+def test_buffer_dtype_categorical(
+    data: pd.Series,
+    expected_dtype: tuple[DtypeKind, int, str, str],
+    expected_buffer_dtype: tuple[DtypeKind, int, str, str],
+) -> None:
+    # https://github.com/pandas-dev/pandas/issues/54781
+    df = pd.DataFrame({"data": data})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfi = df.__dataframe__()
+    col = dfi.get_column_by_name("data")
+    assert col.dtype == expected_dtype
+    assert col.get_buffers()["data"][1] == expected_buffer_dtype
+
+
+def test_from_dataframe_list_dtype():
+    pa = pytest.importorskip("pyarrow", "14.0.0")
+    data = {"a": [[1, 2], [4, 5, 6]]}
+    tbl = pa.table(data)
+    result = from_dataframe(tbl)
+    expected = pd.DataFrame(data)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/interchange/test_spec_conformance.py b/pandas/tests/interchange/test_spec_conformance.py
new file mode 100644
index 0000000000000000000000000000000000000000..04e19b290f886a4cd37e8c41ef2f2e65d7e678e8
--- /dev/null
+++ b/pandas/tests/interchange/test_spec_conformance.py
@@ -0,0 +1,187 @@
+"""
+A verbatim copy (vendored) of the spec tests.
+Taken from https://github.com/data-apis/dataframe-api
+"""
+
+import ctypes
+import math
+
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+@pytest.fixture
+def df_from_dict():
+    def maker(dct, is_categorical=False):
+        df = pd.DataFrame(dct)
+        return df.astype("category") if is_categorical else df
+
+    return maker
+
+
+@pytest.mark.parametrize(
+    "test_data",
+    [
+        {"a": ["foo", "bar"], "b": ["baz", "qux"]},
+        {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]},
+        {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]},
+    ],
+    ids=["str_data", "float_data", "int_data"],
+)
+def test_only_one_dtype(test_data, df_from_dict):
+    columns = list(test_data.keys())
+    df = df_from_dict(test_data)
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+
+    column_size = len(test_data[columns[0]])
+    for column in columns:
+        null_count = dfX.get_column_by_name(column).null_count
+        assert null_count == 0
+        assert isinstance(null_count, int)
+        assert dfX.get_column_by_name(column).size() == column_size
+        assert dfX.get_column_by_name(column).offset == 0
+
+
+def test_mixed_dtypes(df_from_dict):
+    df = df_from_dict(
+        {
+            "a": [1, 2, 3],  # dtype kind INT = 0
+            "b": [3, 4, 5],  # dtype kind INT = 0
+            "c": [1.5, 2.5, 3.5],  # dtype kind FLOAT = 2
+            "d": [9, 10, 11],  # dtype kind INT = 0
+            "e": [True, False, True],  # dtype kind BOOLEAN = 20
+            "f": ["a", "", "c"],  # dtype kind STRING = 21
+        }
+    )
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
+    # file is expected to be vendored *anywhere*;
+    # values for dtype[0] are explained above
+    columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21}
+
+    for column, kind in columns.items():
+        colX = dfX.get_column_by_name(column)
+        assert colX.null_count == 0
+        assert isinstance(colX.null_count, int)
+        assert colX.size() == 3
+        assert colX.offset == 0
+
+        assert colX.dtype[0] == kind
+
+    assert dfX.get_column_by_name("c").dtype[1] == 64
+
+
+def test_na_float(df_from_dict):
+    df = df_from_dict({"a": [1.0, math.nan, 2.0]})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    colX = dfX.get_column_by_name("a")
+    assert colX.null_count == 1
+    assert isinstance(colX.null_count, int)
+
+
+def test_noncategorical(df_from_dict):
+    df = df_from_dict({"a": [1, 2, 3]})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    colX = dfX.get_column_by_name("a")
+    with pytest.raises(TypeError, match=".*categorical.*"):
+        colX.describe_categorical
+
+
+def test_categorical(df_from_dict):
+    df = df_from_dict(
+        {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]},
+        is_categorical=True,
+    )
+
+    with tm.assert_produces_warning(match="Interchange"):
+        colX = df.__dataframe__().get_column_by_name("weekday")
+    categorical = colX.describe_categorical
+    assert isinstance(categorical["is_ordered"], bool)
+    assert isinstance(categorical["is_dictionary"], bool)
+
+
+def test_dataframe(df_from_dict):
+    df = df_from_dict(
+        {"x": [True, True, False], "y": [1, 2, 0], "z": [9.2, 10.5, 11.8]}
+    )
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+
+    assert dfX.num_columns() == 3
+    assert dfX.num_rows() == 3
+    assert dfX.num_chunks() == 1
+    assert list(dfX.column_names()) == ["x", "y", "z"]
+    assert list(dfX.select_columns((0, 2)).column_names()) == list(
+        dfX.select_columns_by_name(("x", "z")).column_names()
+    )
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_df_get_chunks(size, n_chunks, df_from_dict):
+    df = df_from_dict({"x": list(range(size))})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    chunks = list(dfX.get_chunks(n_chunks))
+    assert len(chunks) == n_chunks
+    assert sum(chunk.num_rows() for chunk in chunks) == size
+
+
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_column_get_chunks(size, n_chunks, df_from_dict):
+    df = df_from_dict({"x": list(range(size))})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    chunks = list(dfX.get_column(0).get_chunks(n_chunks))
+    assert len(chunks) == n_chunks
+    assert sum(chunk.size() for chunk in chunks) == size
+
+
+def test_get_columns(df_from_dict):
+    df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    for colX in dfX.get_columns():
+        assert colX.size() == 2
+        assert colX.num_chunks() == 1
+    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
+    # file is expected to be vendored *anywhere*
+    assert dfX.get_column(0).dtype[0] == 0  # INT
+    assert dfX.get_column(1).dtype[0] == 2  # FLOAT
+
+
+def test_buffer(df_from_dict):
+    arr = [0, 1, -1]
+    df = df_from_dict({"a": arr})
+    with tm.assert_produces_warning(match="Interchange"):
+        dfX = df.__dataframe__()
+    colX = dfX.get_column(0)
+    bufX = colX.get_buffers()
+
+    dataBuf, dataDtype = bufX["data"]
+
+    assert dataBuf.bufsize > 0
+    assert dataBuf.ptr != 0
+    device, _ = dataBuf.__dlpack_device__()
+
+    # for meanings of dtype[0] see the spec; we cannot import the spec here as this
+    # file is expected to be vendored *anywhere*
+    assert dataDtype[0] == 0  # INT
+
+    if device == 1:  # CPU-only as we're going to directly read memory here
+        bitwidth = dataDtype[1]
+        ctype = {
+            8: ctypes.c_int8,
+            16: ctypes.c_int16,
+            32: ctypes.c_int32,
+            64: ctypes.c_int64,
+        }[bitwidth]
+
+        for idx, truth in enumerate(arr):
+            val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
+            assert val == truth, f"Buffer at index {idx} mismatch"
diff --git a/pandas/tests/interchange/test_utils.py b/pandas/tests/interchange/test_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a47bc2752ff32f5eb7630a3960e7611242cb73e3
--- /dev/null
+++ b/pandas/tests/interchange/test_utils.py
@@ -0,0 +1,89 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas.core.interchange.utils import dtype_to_arrow_c_fmt
+
+# TODO: use ArrowSchema to get reference C-string.
+# At the time, there is no way to access ArrowSchema holding a type format string
+# from python. The only way to access it is to export the structure to a C-pointer,
+# see DataType._export_to_c() method defined in
+# https://github.com/apache/arrow/blob/master/python/pyarrow/types.pxi
+
+
+@pytest.mark.parametrize(
+    "pandas_dtype, c_string",
+    [
+        (np.dtype("bool"), "b"),
+        (np.dtype("int8"), "c"),
+        (np.dtype("uint8"), "C"),
+        (np.dtype("int16"), "s"),
+        (np.dtype("uint16"), "S"),
+        (np.dtype("int32"), "i"),
+        (np.dtype("uint32"), "I"),
+        (np.dtype("int64"), "l"),
+        (np.dtype("uint64"), "L"),
+        (np.dtype("float16"), "e"),
+        (np.dtype("float32"), "f"),
+        (np.dtype("float64"), "g"),
+        (pd.Series(["a"]).dtype, "u"),
+        (
+            pd.Series([0]).astype("datetime64[ns]").dtype,
+            "tsn:",
+        ),
+        (pd.CategoricalDtype(["a"]), "l"),
+        (np.dtype("O"), "u"),
+    ],
+)
+def test_dtype_to_arrow_c_fmt(pandas_dtype, c_string):  # PR01
+    """Test ``dtype_to_arrow_c_fmt`` utility function."""
+    assert dtype_to_arrow_c_fmt(pandas_dtype) == c_string
+
+
+@pytest.mark.parametrize(
+    "pa_dtype, args_kwargs, c_string",
+    [
+        ["null", {}, "n"],
+        ["bool_", {}, "b"],
+        ["uint8", {}, "C"],
+        ["uint16", {}, "S"],
+        ["uint32", {}, "I"],
+        ["uint64", {}, "L"],
+        ["int8", {}, "c"],
+        ["int16", {}, "S"],
+        ["int32", {}, "i"],
+        ["int64", {}, "l"],
+        ["float16", {}, "e"],
+        ["float32", {}, "f"],
+        ["float64", {}, "g"],
+        ["string", {}, "u"],
+        ["binary", {}, "z"],
+        ["time32", ("s",), "tts"],
+        ["time32", ("ms",), "ttm"],
+        ["time64", ("us",), "ttu"],
+        ["time64", ("ns",), "ttn"],
+        ["date32", {}, "tdD"],
+        ["date64", {}, "tdm"],
+        ["timestamp", {"unit": "s"}, "tss:"],
+        ["timestamp", {"unit": "ms"}, "tsm:"],
+        ["timestamp", {"unit": "us"}, "tsu:"],
+        ["timestamp", {"unit": "ns"}, "tsn:"],
+        ["timestamp", {"unit": "ns", "tz": "UTC"}, "tsn:UTC"],
+        ["duration", ("s",), "tDs"],
+        ["duration", ("ms",), "tDm"],
+        ["duration", ("us",), "tDu"],
+        ["duration", ("ns",), "tDn"],
+        ["decimal128", {"precision": 4, "scale": 2}, "d:4,2"],
+    ],
+)
+def test_dtype_to_arrow_c_fmt_arrowdtype(pa_dtype, args_kwargs, c_string):
+    # GH 52323
+    pa = pytest.importorskip("pyarrow")
+    if not args_kwargs:
+        pa_type = getattr(pa, pa_dtype)()
+    elif isinstance(args_kwargs, tuple):
+        pa_type = getattr(pa, pa_dtype)(*args_kwargs)
+    else:
+        pa_type = getattr(pa, pa_dtype)(**args_kwargs)
+    arrow_type = pd.ArrowDtype(pa_type)
+    assert dtype_to_arrow_c_fmt(arrow_type) == c_string
diff --git a/pandas/tests/internals/__init__.py b/pandas/tests/internals/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..9368105a1fa5b192e7b223529e527390b9d6d7bf
--- /dev/null
+++ b/pandas/tests/internals/test_api.py
@@ -0,0 +1,178 @@
+"""
+Tests for the pseudo-public API implemented in internals/api.py and exposed
+in core.internals
+"""
+
+import datetime
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.internals import create_dataframe_from_blocks
+from pandas.core import internals
+from pandas.core.internals import api
+
+
+def test_internals_api():
+    assert internals.make_block is api.make_block
+
+
+def test_namespace():
+    # SUBJECT TO CHANGE
+
+    modules = [
+        "blocks",
+        "concat",
+        "managers",
+        "construction",
+        "api",
+        "ops",
+    ]
+    expected = [
+        "make_block",
+        "BlockManager",
+        "SingleBlockManager",
+        "concatenate_managers",
+    ]
+
+    result = [x for x in dir(internals) if not x.startswith("__")]
+    assert set(result) == set(expected + modules), set(result) ^ set(expected + modules)
+
+
+@pytest.mark.parametrize(
+    "name",
+    [
+        "Block",
+        "ExtensionBlock",
+    ],
+)
+def test_deprecations(name):
+    # GH#55139
+    msg = f"{name} is deprecated.* Use public APIs instead"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        getattr(internals, name)
+
+
+def test_make_block_2d_with_dti():
+    # GH#41168
+    dti = pd.date_range("2012", periods=3, tz="UTC")
+
+    msg = "make_block is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        blk = api.make_block(dti, placement=[0])
+
+    assert blk.shape == (1, 3)
+    assert blk.values.shape == (1, 3)
+
+
+def test_create_block_manager_from_blocks_deprecated():
+    # GH#33892
+    # If they must, downstream packages should get this from internals.api,
+    #  not internals.
+    msg = (
+        "create_block_manager_from_blocks is deprecated and will be "
+        "removed in a future version. Use public APIs instead"
+    )
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        internals.create_block_manager_from_blocks
+
+
+def test_maybe_infer_ndim_deprecated():
+    # GH#40226
+    msg = "maybe_infer_ndim is deprecated and will be removed in a future version."
+    arr = np.arange(5)
+    bp = pd._libs.internals.BlockPlacement([1])
+    with tm.assert_produces_warning(DeprecationWarning, match=msg):
+        internals.api.maybe_infer_ndim(arr, bp, 1)
+
+
+def test_create_dataframe_from_blocks(float_frame):
+    block = float_frame._mgr.blocks[0]
+    index = float_frame.index.copy()
+    columns = float_frame.columns.copy()
+
+    result = create_dataframe_from_blocks(
+        [(block.values, block.mgr_locs.as_array)], index=index, columns=columns
+    )
+    tm.assert_frame_equal(result, float_frame)
+
+
+def test_create_dataframe_from_blocks_types():
+    df = pd.DataFrame(
+        {
+            "int": list(range(1, 4)),
+            "uint": np.arange(3, 6).astype("uint8"),
+            "float": [2.0, np.nan, 3.0],
+            "bool": np.array([True, False, True]),
+            "boolean": pd.array([True, False, None], dtype="boolean"),
+            "string": list("abc"),
+            "datetime": pd.date_range("20130101", periods=3),
+            "datetimetz": pd.date_range("20130101", periods=3).tz_localize(
+                "Europe/Brussels"
+            ),
+            "timedelta": pd.timedelta_range("1 day", periods=3),
+            "period": pd.period_range("2012-01-01", periods=3, freq="D"),
+            "categorical": pd.Categorical(["a", "b", "a"]),
+            "interval": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]),
+        }
+    )
+
+    result = create_dataframe_from_blocks(
+        [(block.values, block.mgr_locs.as_array) for block in df._mgr.blocks],
+        index=df.index,
+        columns=df.columns,
+    )
+    tm.assert_frame_equal(result, df)
+
+
+def test_create_dataframe_from_blocks_datetimelike():
+    # extension dtypes that have an exact matching numpy dtype can also be
+    # be passed as a numpy array
+    index, columns = pd.RangeIndex(3), pd.Index(["a", "b", "c", "d"])
+
+    block_array1 = np.arange(
+        datetime.datetime(2020, 1, 1),
+        datetime.datetime(2020, 1, 7),
+        step=datetime.timedelta(1),
+    ).reshape((2, 3))
+    block_array2 = np.arange(
+        datetime.timedelta(1), datetime.timedelta(7), step=datetime.timedelta(1)
+    ).reshape((2, 3))
+    result = create_dataframe_from_blocks(
+        [(block_array1, np.array([0, 2])), (block_array2, np.array([1, 3]))],
+        index=index,
+        columns=columns,
+    )
+    expected = pd.DataFrame(
+        {
+            "a": pd.date_range("2020-01-01", periods=3, unit="us"),
+            "b": pd.timedelta_range("1 days", periods=3, unit="us"),
+            "c": pd.date_range("2020-01-04", periods=3, unit="us"),
+            "d": pd.timedelta_range("4 days", periods=3, unit="us"),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "array",
+    [
+        pd.date_range("2020-01-01", periods=3),
+        pd.date_range("2020-01-01", periods=3, tz="UTC"),
+        pd.period_range("2012-01-01", periods=3, freq="D"),
+        pd.timedelta_range("1 day", periods=3),
+    ],
+)
+def test_create_dataframe_from_blocks_1dEA(array):
+    # ExtensionArrays can be passed as 1D even if stored under the hood as 2D
+    df = pd.DataFrame({"a": array})
+
+    block = df._mgr.blocks[0]
+    result = create_dataframe_from_blocks(
+        [(block.values[0], block.mgr_locs.as_array)], index=df.index, columns=df.columns
+    )
+    tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
new file mode 100644
index 0000000000000000000000000000000000000000..8852ae81bddf2032f3cf03105917dde9fb6637bf
--- /dev/null
+++ b/pandas/tests/internals/test_internals.py
@@ -0,0 +1,1421 @@
+from datetime import (
+    date,
+    datetime,
+)
+import itertools
+import re
+
+import numpy as np
+import pytest
+
+from pandas._libs.internals import BlockPlacement
+from pandas.compat import IS64
+from pandas.errors import Pandas4Warning
+
+from pandas.core.dtypes.common import is_scalar
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    IntervalIndex,
+    Series,
+    Timedelta,
+    Timestamp,
+    period_range,
+)
+import pandas._testing as tm
+import pandas.core.algorithms as algos
+from pandas.core.arrays import (
+    DatetimeArray,
+    SparseArray,
+    TimedeltaArray,
+)
+from pandas.core.internals import (
+    BlockManager,
+    SingleBlockManager,
+    make_block,
+)
+from pandas.core.internals.blocks import (
+    ensure_block_shape,
+    maybe_coerce_values,
+    new_block,
+)
+
+
+@pytest.fixture(params=[new_block, make_block])
+def block_maker(request):
+    """
+    Fixture to test both the internal new_block and pseudo-public make_block.
+    """
+    return request.param
+
+
+@pytest.fixture
+def mgr():
+    return create_mgr(
+        "a: f8; b: object; c: f8; d: object; e: f8;"
+        "f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;"
+        "k: M8[ns, US/Eastern]; l: M8[ns, CET];"
+    )
+
+
+def assert_block_equal(left, right):
+    tm.assert_numpy_array_equal(left.values, right.values)
+    assert left.dtype == right.dtype
+    assert isinstance(left.mgr_locs, BlockPlacement)
+    assert isinstance(right.mgr_locs, BlockPlacement)
+    tm.assert_numpy_array_equal(left.mgr_locs.as_array, right.mgr_locs.as_array)
+
+
+def get_numeric_mat(shape):
+    arr = np.arange(shape[0])
+    return np.lib.stride_tricks.as_strided(
+        x=arr, shape=shape, strides=(arr.itemsize,) + (0,) * (len(shape) - 1)
+    ).copy()
+
+
+N = 10
+
+
+def create_block(typestr, placement, item_shape=None, num_offset=0, maker=new_block):
+    """
+    Supported typestr:
+
+        * float, f8, f4, f2
+        * int, i8, i4, i2, i1
+        * uint, u8, u4, u2, u1
+        * complex, c16, c8
+        * bool
+        * object, string, O
+        * datetime, dt, M8[ns], M8[ns, tz]
+        * timedelta, td, m8[ns]
+        * sparse (SparseArray with fill_value=0.0)
+        * sparse_na (SparseArray with fill_value=np.nan)
+        * category, category2
+
+    """
+    placement = BlockPlacement(placement)
+    num_items = len(placement)
+
+    if item_shape is None:
+        item_shape = (N,)
+
+    shape = (num_items, *item_shape)
+
+    mat = get_numeric_mat(shape)
+
+    if typestr in (
+        "float",
+        "f8",
+        "f4",
+        "f2",
+        "int",
+        "i8",
+        "i4",
+        "i2",
+        "i1",
+        "uint",
+        "u8",
+        "u4",
+        "u2",
+        "u1",
+    ):
+        values = mat.astype(typestr) + num_offset
+    elif typestr in ("complex", "c16", "c8"):
+        values = 1.0j * (mat.astype(typestr) + num_offset)
+    elif typestr in ("object", "string", "O"):
+        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset], shape)
+    elif typestr in ("b", "bool"):
+        values = np.ones(shape, dtype=np.bool_)
+    elif typestr in ("datetime", "dt", "M8[ns]"):
+        values = (mat * 1e9).astype("M8[ns]")
+    elif typestr.startswith("M8[ns"):
+        # datetime with tz
+        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
+        assert m is not None, f"incompatible typestr -> {typestr}"
+        tz = m.groups()[0]
+        assert num_items == 1, "must have only 1 num items for a tz-aware"
+        values = DatetimeIndex(np.arange(N) * 10**9, tz=tz)._data
+        values = ensure_block_shape(values, ndim=len(shape))
+    elif typestr in ("timedelta", "td", "m8[ns]"):
+        values = (mat * 1).astype("m8[ns]")
+    elif typestr in ("category",):
+        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
+    elif typestr in ("category2",):
+        values = Categorical(["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
+    elif typestr in ("sparse", "sparse_na"):
+        if shape[-1] != 10:
+            # We also are implicitly assuming this in the category cases above
+            raise NotImplementedError
+
+        assert all(s == 1 for s in shape[:-1])
+        if typestr.endswith("_na"):
+            fill_value = np.nan
+        else:
+            fill_value = 0.0
+        values = SparseArray(
+            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
+            fill_value=fill_value,
+        )
+        arr = values.sp_values.view()
+        arr += num_offset - 1
+    else:
+        raise ValueError(f'Unsupported typestr: "{typestr}"')
+
+    values = maybe_coerce_values(values)
+    return maker(values, placement=placement, ndim=len(shape))
+
+
+def create_single_mgr(typestr, num_rows=None):
+    if num_rows is None:
+        num_rows = N
+
+    return SingleBlockManager(
+        create_block(typestr, placement=slice(0, num_rows), item_shape=()),
+        Index(np.arange(num_rows)),
+    )
+
+
+def create_mgr(descr, item_shape=None):
+    """
+    Construct BlockManager from string description.
+
+    String description syntax looks similar to np.matrix initializer.  It looks
+    like this::
+
+        a,b,c: f8; d,e,f: i8
+
+    Rules are rather simple:
+
+    * see list of supported datatypes in `create_block` method
+    * components are semicolon-separated
+    * each component is `NAME,NAME,NAME: DTYPE_ID`
+    * whitespace around colons & semicolons are removed
+    * components with same DTYPE_ID are combined into single block
+    * to force multiple blocks with same dtype, use '-SUFFIX'::
+
+        "a:f8-1; b:f8-2; c:f8-foobar"
+
+    """
+    if item_shape is None:
+        item_shape = (N,)
+
+    offset = 0
+    mgr_items = []
+    block_placements = {}
+    for d in descr.split(";"):
+        d = d.strip()
+        if not len(d):
+            continue
+        names, blockstr = d.partition(":")[::2]
+        blockstr = blockstr.strip()
+        names = names.strip().split(",")
+
+        mgr_items.extend(names)
+        placement = list(np.arange(len(names)) + offset)
+        try:
+            block_placements[blockstr].extend(placement)
+        except KeyError:
+            block_placements[blockstr] = placement
+        offset += len(names)
+
+    mgr_items = Index(mgr_items)
+
+    blocks = []
+    num_offset = 0
+    for blockstr, placement in block_placements.items():
+        typestr = blockstr.split("-")[0]
+        blocks.append(
+            create_block(
+                typestr, placement, item_shape=item_shape, num_offset=num_offset
+            )
+        )
+        num_offset += len(placement)
+
+    sblocks = sorted(blocks, key=lambda b: b.mgr_locs[0])
+    return BlockManager(
+        tuple(sblocks),
+        [mgr_items] + [Index(np.arange(n)) for n in item_shape],
+    )
+
+
+@pytest.fixture
+def fblock():
+    return create_block("float", [0, 2, 4])
+
+
+class TestBlock:
+    def test_constructor(self):
+        int32block = create_block("i4", [0])
+        assert int32block.dtype == np.int32
+
+    @pytest.mark.parametrize(
+        "typ, data",
+        [
+            ["float", [0, 2, 4]],
+            ["complex", [7]],
+            ["object", [1, 3]],
+            ["bool", [5]],
+        ],
+    )
+    def test_pickle(self, typ, data, temp_file):
+        blk = create_block(typ, data)
+        assert_block_equal(tm.round_trip_pickle(blk, temp_file), blk)
+
+    def test_mgr_locs(self, fblock):
+        assert isinstance(fblock.mgr_locs, BlockPlacement)
+        tm.assert_numpy_array_equal(
+            fblock.mgr_locs.as_array, np.array([0, 2, 4], dtype=np.intp)
+        )
+
+    def test_attrs(self, fblock):
+        assert fblock.shape == fblock.values.shape
+        assert fblock.dtype == fblock.values.dtype
+        assert len(fblock) == len(fblock.values)
+
+    def test_copy(self, fblock):
+        cop = fblock.copy(deep=True)
+        assert cop is not fblock
+        assert_block_equal(fblock, cop)
+
+    def test_delete(self, fblock):
+        newb = fblock.copy(deep=True)
+        locs = newb.mgr_locs
+        nb = newb.delete(0)[0]
+        assert newb.mgr_locs is locs
+
+        assert nb is not newb
+
+        tm.assert_numpy_array_equal(
+            nb.mgr_locs.as_array, np.array([2, 4], dtype=np.intp)
+        )
+        assert not (newb.values[0] == 1).all()
+        assert (nb.values[0] == 1).all()
+
+        newb = fblock.copy(deep=True)
+        locs = newb.mgr_locs
+        nb = newb.delete(1)
+        assert len(nb) == 2
+        assert newb.mgr_locs is locs
+
+        tm.assert_numpy_array_equal(
+            nb[0].mgr_locs.as_array, np.array([0], dtype=np.intp)
+        )
+        tm.assert_numpy_array_equal(
+            nb[1].mgr_locs.as_array, np.array([4], dtype=np.intp)
+        )
+        assert not (newb.values[1] == 2).all()
+        assert (nb[1].values[0] == 2).all()
+
+        newb = fblock.copy(deep=True)
+        nb = newb.delete(2)
+        assert len(nb) == 1
+        tm.assert_numpy_array_equal(
+            nb[0].mgr_locs.as_array, np.array([0, 2], dtype=np.intp)
+        )
+        assert (nb[0].values[1] == 1).all()
+
+        newb = fblock.copy(deep=True)
+
+        with pytest.raises(IndexError, match=None):
+            newb.delete(3)
+
+    def test_delete_datetimelike(self):
+        # dont use np.delete on values, as that will coerce from DTA/TDA to ndarray
+        arr = np.arange(20, dtype="i8").reshape(5, 4).view("m8[ns]")
+        df = DataFrame(arr)
+        blk = df._mgr.blocks[0]
+        assert isinstance(blk.values, TimedeltaArray)
+
+        nb = blk.delete(1)
+        assert len(nb) == 2
+        assert isinstance(nb[0].values, TimedeltaArray)
+        assert isinstance(nb[1].values, TimedeltaArray)
+
+        df = DataFrame(arr.view("M8[ns]"))
+        blk = df._mgr.blocks[0]
+        assert isinstance(blk.values, DatetimeArray)
+
+        nb = blk.delete([1, 3])
+        assert len(nb) == 2
+        assert isinstance(nb[0].values, DatetimeArray)
+        assert isinstance(nb[1].values, DatetimeArray)
+
+    def test_split(self):
+        # GH#37799
+        values = np.random.default_rng(2).standard_normal((3, 4))
+        blk = new_block(values, placement=BlockPlacement([3, 1, 6]), ndim=2)
+        result = list(blk._split())
+
+        # check that we get views, not copies
+        values[:] = -9999
+        assert (blk.values == -9999).all()
+
+        assert len(result) == 3
+        expected = [
+            new_block(values[[0]], placement=BlockPlacement([3]), ndim=2),
+            new_block(values[[1]], placement=BlockPlacement([1]), ndim=2),
+            new_block(values[[2]], placement=BlockPlacement([6]), ndim=2),
+        ]
+        for res, exp in zip(result, expected):
+            assert_block_equal(res, exp)
+
+
+class TestBlockManager:
+    def test_attrs(self):
+        mgr = create_mgr("a,b,c: f8-1; d,e,f: f8-2")
+        assert mgr.nblocks == 2
+        assert len(mgr) == 6
+
+    def test_duplicate_ref_loc_failure(self):
+        tmp_mgr = create_mgr("a:bool; a: f8")
+
+        axes, blocks = tmp_mgr.axes, tmp_mgr.blocks
+
+        blocks[0].mgr_locs = BlockPlacement(np.array([0]))
+        blocks[1].mgr_locs = BlockPlacement(np.array([0]))
+
+        # test trying to create block manager with overlapping ref locs
+
+        msg = "Gaps in blk ref_locs"
+
+        mgr = BlockManager(blocks, axes)
+        with pytest.raises(AssertionError, match=msg):
+            mgr._rebuild_blknos_and_blklocs()
+
+        blocks[0].mgr_locs = BlockPlacement(np.array([0]))
+        blocks[1].mgr_locs = BlockPlacement(np.array([1]))
+        mgr = BlockManager(blocks, axes)
+        mgr.iget(1)
+
+    def test_pickle(self, mgr, temp_file):
+        mgr2 = tm.round_trip_pickle(mgr, temp_file)
+        tm.assert_frame_equal(
+            DataFrame._from_mgr(mgr, axes=mgr.axes),
+            DataFrame._from_mgr(mgr2, axes=mgr2.axes),
+        )
+
+        # GH2431
+        assert hasattr(mgr2, "_is_consolidated")
+        assert hasattr(mgr2, "_known_consolidated")
+
+        # reset to False on load
+        assert not mgr2._is_consolidated
+        assert not mgr2._known_consolidated
+
+    @pytest.mark.parametrize("mgr_string", ["a,a,a:f8", "a: f8; a: i8"])
+    def test_non_unique_pickle(self, mgr_string, temp_file):
+        mgr = create_mgr(mgr_string)
+        mgr2 = tm.round_trip_pickle(mgr, temp_file)
+        tm.assert_frame_equal(
+            DataFrame._from_mgr(mgr, axes=mgr.axes),
+            DataFrame._from_mgr(mgr2, axes=mgr2.axes),
+        )
+
+    def test_categorical_block_pickle(self, temp_file):
+        mgr = create_mgr("a: category")
+        mgr2 = tm.round_trip_pickle(mgr, temp_file)
+        tm.assert_frame_equal(
+            DataFrame._from_mgr(mgr, axes=mgr.axes),
+            DataFrame._from_mgr(mgr2, axes=mgr2.axes),
+        )
+
+        smgr = create_single_mgr("category")
+        smgr2 = tm.round_trip_pickle(smgr, temp_file)
+        tm.assert_series_equal(
+            Series()._constructor_from_mgr(smgr, axes=smgr.axes),
+            Series()._constructor_from_mgr(smgr2, axes=smgr2.axes),
+        )
+
+    def test_iget(self):
+        cols = Index(list("abc"))
+        values = np.random.default_rng(2).random((3, 3))
+        block = new_block(
+            values=values.copy(),
+            placement=BlockPlacement(np.arange(3, dtype=np.intp)),
+            ndim=values.ndim,
+        )
+        mgr = BlockManager(blocks=(block,), axes=[cols, Index(np.arange(3))])
+
+        tm.assert_almost_equal(mgr.iget(0).internal_values(), values[0])
+        tm.assert_almost_equal(mgr.iget(1).internal_values(), values[1])
+        tm.assert_almost_equal(mgr.iget(2).internal_values(), values[2])
+
+    def test_set(self):
+        mgr = create_mgr("a,b,c: int", item_shape=(3,))
+
+        mgr.insert(len(mgr.items), "d", np.array(["foo"] * 3))
+        mgr.iset(1, np.array(["bar"] * 3))
+        tm.assert_numpy_array_equal(mgr.iget(0).internal_values(), np.array([0] * 3))
+        tm.assert_numpy_array_equal(
+            mgr.iget(1).internal_values(), np.array(["bar"] * 3, dtype=np.object_)
+        )
+        tm.assert_numpy_array_equal(mgr.iget(2).internal_values(), np.array([2] * 3))
+        tm.assert_numpy_array_equal(
+            mgr.iget(3).internal_values(), np.array(["foo"] * 3, dtype=np.object_)
+        )
+
+    def test_set_change_dtype(self, mgr):
+        mgr.insert(len(mgr.items), "baz", np.zeros(N, dtype=bool))
+
+        mgr.iset(mgr.items.get_loc("baz"), np.repeat("foo", N))
+        idx = mgr.items.get_loc("baz")
+        assert mgr.iget(idx).dtype == np.object_
+
+        mgr2 = mgr.consolidate()
+        mgr2.iset(mgr2.items.get_loc("baz"), np.repeat("foo", N))
+        idx = mgr2.items.get_loc("baz")
+        assert mgr2.iget(idx).dtype == np.object_
+
+        mgr2.insert(
+            len(mgr2.items),
+            "quux",
+            np.random.default_rng(2).standard_normal(N).astype(int),
+        )
+        idx = mgr2.items.get_loc("quux")
+        assert mgr2.iget(idx).dtype == np.dtype(int)
+
+        mgr2.iset(
+            mgr2.items.get_loc("quux"), np.random.default_rng(2).standard_normal(N)
+        )
+        assert mgr2.iget(idx).dtype == np.float64
+
+    def test_copy(self, mgr):
+        cp = mgr.copy(deep=False)
+        for blk, cp_blk in zip(mgr.blocks, cp.blocks):
+            # view assertion
+            tm.assert_equal(cp_blk.values, blk.values)
+            if isinstance(blk.values, np.ndarray):
+                assert cp_blk.values.base.base is blk.values.base
+            else:
+                # DatetimeTZBlock has DatetimeIndex values
+                assert cp_blk.values._ndarray.base is blk.values._ndarray.base
+
+        # copy(deep=True) consolidates, so the block-wise assertions will
+        #  fail is mgr is not consolidated
+        mgr._consolidate_inplace()
+        cp = mgr.copy(deep=True)
+        for blk, cp_blk in zip(mgr.blocks, cp.blocks):
+            bvals = blk.values
+            cpvals = cp_blk.values
+
+            tm.assert_equal(cpvals, bvals)
+
+            if isinstance(cpvals, np.ndarray):
+                lbase = cpvals.base
+                rbase = bvals.base
+            else:
+                lbase = cpvals._ndarray.base
+                rbase = bvals._ndarray.base
+
+            # copy assertion we either have a None for a base or in case of
+            # some blocks it is an array (e.g. datetimetz), but was copied
+            if isinstance(cpvals, DatetimeArray):
+                assert (lbase is None and rbase is None) or (lbase is not rbase)
+            elif not isinstance(cpvals, np.ndarray):
+                assert lbase is not rbase
+            else:
+                assert lbase is None and rbase is None
+
+    def test_sparse(self):
+        mgr = create_mgr("a: sparse-1; b: sparse-2")
+        assert mgr.as_array().dtype == np.float64
+
+    def test_sparse_mixed(self):
+        mgr = create_mgr("a: sparse-1; b: sparse-2; c: f8")
+        assert len(mgr.blocks) == 3
+        assert isinstance(mgr, BlockManager)
+
+    @pytest.mark.parametrize(
+        "mgr_string, dtype",
+        [("c: f4; d: f2", np.float32), ("c: f4; d: f2; e: f8", np.float64)],
+    )
+    def test_as_array_float(self, mgr_string, dtype):
+        mgr = create_mgr(mgr_string)
+        assert mgr.as_array().dtype == dtype
+
+    @pytest.mark.parametrize(
+        "mgr_string, dtype",
+        [
+            ("a: bool-1; b: bool-2", np.bool_),
+            ("a: i8-1; b: i8-2; c: i4; d: i2; e: u1", np.int64),
+            ("c: i4; d: i2; e: u1", np.int32),
+        ],
+    )
+    def test_as_array_int_bool(self, mgr_string, dtype):
+        mgr = create_mgr(mgr_string)
+        assert mgr.as_array().dtype == dtype
+
+    def test_as_array_datetime(self):
+        mgr = create_mgr("h: datetime-1; g: datetime-2")
+        assert mgr.as_array().dtype == "M8[ns]"
+
+    def test_as_array_datetime_tz(self):
+        mgr = create_mgr("h: M8[ns, US/Eastern]; g: M8[ns, CET]")
+        assert mgr.iget(0).dtype == "datetime64[ns, US/Eastern]"
+        assert mgr.iget(1).dtype == "datetime64[ns, CET]"
+        assert mgr.as_array().dtype == "object"
+
+    @pytest.mark.parametrize("t", ["float16", "float32", "float64", "int32", "int64"])
+    def test_astype(self, t):
+        # coerce all
+        mgr = create_mgr("c: f4; d: f2; e: f8")
+
+        t = np.dtype(t)
+        tmgr = mgr.astype(t)
+        assert tmgr.iget(0).dtype.type == t
+        assert tmgr.iget(1).dtype.type == t
+        assert tmgr.iget(2).dtype.type == t
+
+        # mixed
+        mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8")
+
+        t = np.dtype(t)
+        tmgr = mgr.astype(t, errors="ignore")
+        assert tmgr.iget(2).dtype.type == t
+        assert tmgr.iget(4).dtype.type == t
+        assert tmgr.iget(5).dtype.type == t
+        assert tmgr.iget(6).dtype.type == t
+
+        assert tmgr.iget(0).dtype.type == np.object_
+        assert tmgr.iget(1).dtype.type == np.object_
+        if t != np.int64:
+            assert tmgr.iget(3).dtype.type == np.datetime64
+        else:
+            assert tmgr.iget(3).dtype.type == t
+
+    def test_convert(self, using_infer_string):
+        def _compare(old_mgr, new_mgr):
+            """compare the blocks, numeric compare ==, object don't"""
+            old_blocks = set(old_mgr.blocks)
+            new_blocks = set(new_mgr.blocks)
+            assert len(old_blocks) == len(new_blocks)
+
+            # compare non-numeric
+            for b in old_blocks:
+                found = False
+                for nb in new_blocks:
+                    if (b.values == nb.values).all():
+                        found = True
+                        break
+                assert found
+
+            for b in new_blocks:
+                found = False
+                for ob in old_blocks:
+                    if (b.values == ob.values).all():
+                        found = True
+                        break
+                assert found
+
+        # noops
+        mgr = create_mgr("f: i8; g: f8")
+        new_mgr = mgr.convert()
+        _compare(mgr, new_mgr)
+
+        # convert
+        mgr = create_mgr("a,b,foo: object; f: i8; g: f8")
+        mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
+        mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
+        mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
+        new_mgr = mgr.convert()
+        dtype = "str" if using_infer_string else np.object_
+        assert new_mgr.iget(0).dtype == dtype
+        assert new_mgr.iget(1).dtype == dtype
+        assert new_mgr.iget(2).dtype == dtype
+        assert new_mgr.iget(3).dtype == np.int64
+        assert new_mgr.iget(4).dtype == np.float64
+
+        mgr = create_mgr(
+            "a,b,foo: object; f: i4; bool: bool; dt: datetime; i: i8; g: f8; h: f2"
+        )
+        mgr.iset(0, np.array(["1"] * N, dtype=np.object_))
+        mgr.iset(1, np.array(["2."] * N, dtype=np.object_))
+        mgr.iset(2, np.array(["foo."] * N, dtype=np.object_))
+        new_mgr = mgr.convert()
+        assert new_mgr.iget(0).dtype == dtype
+        assert new_mgr.iget(1).dtype == dtype
+        assert new_mgr.iget(2).dtype == dtype
+        assert new_mgr.iget(3).dtype == np.int32
+        assert new_mgr.iget(4).dtype == np.bool_
+        assert new_mgr.iget(5).dtype.type, np.datetime64
+        assert new_mgr.iget(6).dtype == np.int64
+        assert new_mgr.iget(7).dtype == np.float64
+        assert new_mgr.iget(8).dtype == np.float16
+
+    def test_interleave(self):
+        # self
+        for dtype in ["f8", "i8", "object", "bool", "complex", "M8[ns]", "m8[ns]"]:
+            mgr = create_mgr(f"a: {dtype}")
+            assert mgr.as_array().dtype == dtype
+            mgr = create_mgr(f"a: {dtype}; b: {dtype}")
+            assert mgr.as_array().dtype == dtype
+
+    @pytest.mark.parametrize(
+        "mgr_string, dtype",
+        [
+            ("a: category", "i8"),
+            ("a: category; b: category", "i8"),
+            ("a: category; b: category2", "object"),
+            ("a: category2", "object"),
+            ("a: category2; b: category2", "object"),
+            ("a: f8", "f8"),
+            ("a: f8; b: i8", "f8"),
+            ("a: f4; b: i8", "f8"),
+            ("a: f4; b: i8; d: object", "object"),
+            ("a: bool; b: i8", "object"),
+            ("a: complex", "complex"),
+            ("a: f8; b: category", "object"),
+            ("a: M8[ns]; b: category", "object"),
+            ("a: M8[ns]; b: bool", "object"),
+            ("a: M8[ns]; b: i8", "object"),
+            ("a: m8[ns]; b: bool", "object"),
+            ("a: m8[ns]; b: i8", "object"),
+            ("a: M8[ns]; b: m8[ns]", "object"),
+        ],
+    )
+    def test_interleave_dtype(self, mgr_string, dtype):
+        # will be converted according the actual dtype of the underlying
+        mgr = create_mgr("a: category")
+        assert mgr.as_array().dtype == "i8"
+        mgr = create_mgr("a: category; b: category2")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: category2")
+        assert mgr.as_array().dtype == "object"
+
+        # combinations
+        mgr = create_mgr("a: f8")
+        assert mgr.as_array().dtype == "f8"
+        mgr = create_mgr("a: f8; b: i8")
+        assert mgr.as_array().dtype == "f8"
+        mgr = create_mgr("a: f4; b: i8")
+        assert mgr.as_array().dtype == "f8"
+        mgr = create_mgr("a: f4; b: i8; d: object")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: bool; b: i8")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: complex")
+        assert mgr.as_array().dtype == "complex"
+        mgr = create_mgr("a: f8; b: category")
+        assert mgr.as_array().dtype == "f8"
+        mgr = create_mgr("a: M8[ns]; b: category")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: M8[ns]; b: bool")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: M8[ns]; b: i8")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: m8[ns]; b: bool")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: m8[ns]; b: i8")
+        assert mgr.as_array().dtype == "object"
+        mgr = create_mgr("a: M8[ns]; b: m8[ns]")
+        assert mgr.as_array().dtype == "object"
+
+    def test_consolidate_ordering_issues(self, mgr):
+        mgr.iset(mgr.items.get_loc("f"), np.random.default_rng(2).standard_normal(N))
+        mgr.iset(mgr.items.get_loc("d"), np.random.default_rng(2).standard_normal(N))
+        mgr.iset(mgr.items.get_loc("b"), np.random.default_rng(2).standard_normal(N))
+        mgr.iset(mgr.items.get_loc("g"), np.random.default_rng(2).standard_normal(N))
+        mgr.iset(mgr.items.get_loc("h"), np.random.default_rng(2).standard_normal(N))
+
+        # we have datetime/tz blocks in mgr
+        cons = mgr.consolidate()
+        assert cons.nblocks == 4
+        cons = mgr.consolidate().get_numeric_data()
+        assert cons.nblocks == 1
+        assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
+        tm.assert_numpy_array_equal(
+            cons.blocks[0].mgr_locs.as_array, np.arange(len(cons.items), dtype=np.intp)
+        )
+
+    def test_reindex_items(self):
+        # mgr is not consolidated, f8 & f8-2 blocks
+        mgr = create_mgr("a: f8; b: i8; c: f8; d: i8; e: f8; f: bool; g: f8-2")
+
+        reindexed = mgr.reindex_axis(["g", "c", "a", "d"], axis=0)
+        assert not reindexed.is_consolidated()
+
+        tm.assert_index_equal(reindexed.items, Index(["g", "c", "a", "d"]))
+        tm.assert_almost_equal(
+            mgr.iget(6).internal_values(), reindexed.iget(0).internal_values()
+        )
+        tm.assert_almost_equal(
+            mgr.iget(2).internal_values(), reindexed.iget(1).internal_values()
+        )
+        tm.assert_almost_equal(
+            mgr.iget(0).internal_values(), reindexed.iget(2).internal_values()
+        )
+        tm.assert_almost_equal(
+            mgr.iget(3).internal_values(), reindexed.iget(3).internal_values()
+        )
+
+    def test_get_numeric_data(self):
+        mgr = create_mgr(
+            "int: int; float: float; complex: complex;"
+            "str: object; bool: bool; obj: object; dt: datetime",
+            item_shape=(3,),
+        )
+        mgr.iset(5, np.array([1, 2, 3], dtype=np.object_))
+
+        numeric = mgr.get_numeric_data()
+        tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"]))
+        tm.assert_almost_equal(
+            mgr.iget(mgr.items.get_loc("float")).internal_values(),
+            numeric.iget(numeric.items.get_loc("float")).internal_values(),
+        )
+
+        # Check sharing
+        numeric.iset(
+            numeric.items.get_loc("float"),
+            np.array([100.0, 200.0, 300.0]),
+            inplace=True,
+        )
+        tm.assert_almost_equal(
+            mgr.iget(mgr.items.get_loc("float")).internal_values(),
+            np.array([1.0, 1.0, 1.0]),
+        )
+
+    def test_get_bool_data(self):
+        mgr = create_mgr(
+            "int: int; float: float; complex: complex;"
+            "str: object; bool: bool; obj: object; dt: datetime",
+            item_shape=(3,),
+        )
+        mgr.iset(6, np.array([True, False, True], dtype=np.object_))
+
+        bools = mgr.get_bool_data()
+        tm.assert_index_equal(bools.items, Index(["bool"]))
+        tm.assert_almost_equal(
+            mgr.iget(mgr.items.get_loc("bool")).internal_values(),
+            bools.iget(bools.items.get_loc("bool")).internal_values(),
+        )
+
+        bools.iset(0, np.array([True, False, True]), inplace=True)
+        tm.assert_numpy_array_equal(
+            mgr.iget(mgr.items.get_loc("bool")).internal_values(),
+            np.array([True, True, True]),
+        )
+
+    def test_unicode_repr_doesnt_raise(self):
+        repr(create_mgr("b,\u05d0: object"))
+
+    @pytest.mark.parametrize(
+        "mgr_string", ["a,b,c: i8-1; d,e,f: i8-2", "a,a,a: i8-1; b,b,b: i8-2"]
+    )
+    def test_equals(self, mgr_string):
+        # unique items
+        bm1 = create_mgr(mgr_string)
+        bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
+        assert bm1.equals(bm2)
+
+    @pytest.mark.parametrize(
+        "mgr_string",
+        [
+            "a:i8;b:f8",  # basic case
+            "a:i8;b:f8;c:c8;d:b",  # many types
+            "a:i8;e:dt;f:td;g:string",  # more types
+            "a:i8;b:category;c:category2",  # categories
+            "c:sparse;d:sparse_na;b:f8",  # sparse
+        ],
+    )
+    def test_equals_block_order_different_dtypes(self, mgr_string):
+        # GH 9330
+        bm = create_mgr(mgr_string)
+        block_perms = itertools.permutations(bm.blocks)
+        for bm_perm in block_perms:
+            bm_this = BlockManager(tuple(bm_perm), bm.axes)
+            assert bm.equals(bm_this)
+            assert bm_this.equals(bm)
+
+    def test_single_mgr_ctor(self):
+        mgr = create_single_mgr("f8", num_rows=5)
+        assert mgr.external_values().tolist() == [0.0, 1.0, 2.0, 3.0, 4.0]
+
+    @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
+    def test_validate_bool_args(self, value):
+        bm1 = create_mgr("a,b,c: i8-1; d,e,f: i8-2")
+
+        msg = (
+            'For argument "inplace" expected type bool, '
+            f"received type {type(value).__name__}."
+        )
+        with pytest.raises(ValueError, match=msg):
+            bm1.replace_list([1], [2], inplace=value)
+
+    def test_iset_split_block(self):
+        bm = create_mgr("a,b,c: i8; d: f8")
+        bm._iset_split_block(0, np.array([0]))
+        tm.assert_numpy_array_equal(
+            bm.blklocs, np.array([0, 0, 1, 0], dtype="int64" if IS64 else "int32")
+        )
+        # First indexer currently does not have a block associated with it in case
+        tm.assert_numpy_array_equal(
+            bm.blknos, np.array([0, 0, 0, 1], dtype="int64" if IS64 else "int32")
+        )
+        assert len(bm.blocks) == 2
+
+    def test_iset_split_block_values(self):
+        bm = create_mgr("a,b,c: i8; d: f8")
+        bm._iset_split_block(0, np.array([0]), np.array([list(range(10))]))
+        tm.assert_numpy_array_equal(
+            bm.blklocs, np.array([0, 0, 1, 0], dtype="int64" if IS64 else "int32")
+        )
+        # First indexer currently does not have a block associated with it in case
+        tm.assert_numpy_array_equal(
+            bm.blknos, np.array([0, 2, 2, 1], dtype="int64" if IS64 else "int32")
+        )
+        assert len(bm.blocks) == 3
+
+
+def _as_array(mgr):
+    if mgr.ndim == 1:
+        return mgr.external_values()
+    return mgr.as_array().T
+
+
+class TestIndexing:
+    # Nosetests-style data-driven tests.
+    #
+    # This test applies different indexing routines to block managers and
+    # compares the outcome to the result of same operations on np.ndarray.
+    #
+    # NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests
+    #       and are disabled.
+
+    MANAGERS = [
+        create_single_mgr("f8", N),
+        create_single_mgr("i8", N),
+        # 2-dim
+        create_mgr("a,b,c,d,e,f: f8", item_shape=(N,)),
+        create_mgr("a,b,c,d,e,f: i8", item_shape=(N,)),
+        create_mgr("a,b: f8; c,d: i8; e,f: string", item_shape=(N,)),
+        create_mgr("a,b: f8; c,d: i8; e,f: f8", item_shape=(N,)),
+    ]
+
+    @pytest.mark.parametrize("mgr", MANAGERS)
+    def test_get_slice(self, mgr):
+        def assert_slice_ok(mgr, axis, slobj):
+            mat = _as_array(mgr)
+
+            # we maybe using an ndarray to test slicing and
+            # might not be the full length of the axis
+            if isinstance(slobj, np.ndarray):
+                ax = mgr.axes[axis]
+                if len(ax) and len(slobj) and len(slobj) != len(ax):
+                    slobj = np.concatenate(
+                        [slobj, np.zeros(len(ax) - len(slobj), dtype=bool)]
+                    )
+
+            if isinstance(slobj, slice):
+                sliced = mgr.get_slice(slobj, axis=axis)
+            elif (
+                mgr.ndim == 1
+                and axis == 0
+                and isinstance(slobj, np.ndarray)
+                and slobj.dtype == bool
+            ):
+                sliced = mgr.get_rows_with_mask(slobj)
+            else:
+                # BlockManager doesn't support non-slice, SingleBlockManager
+                #  doesn't support axis > 0
+                raise TypeError(slobj)
+
+            mat_slobj = (slice(None),) * axis + (slobj,)
+            tm.assert_numpy_array_equal(
+                mat[mat_slobj], _as_array(sliced), check_dtype=False
+            )
+            tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])
+
+        assert mgr.ndim <= 2, mgr.ndim
+        for ax in range(mgr.ndim):
+            # slice
+            assert_slice_ok(mgr, ax, slice(None))
+            assert_slice_ok(mgr, ax, slice(3))
+            assert_slice_ok(mgr, ax, slice(100))
+            assert_slice_ok(mgr, ax, slice(1, 4))
+            assert_slice_ok(mgr, ax, slice(3, 0, -2))
+
+            if mgr.ndim < 2:
+                # 2D only support slice objects
+
+                # boolean mask
+                assert_slice_ok(mgr, ax, np.ones(mgr.shape[ax], dtype=np.bool_))
+                assert_slice_ok(mgr, ax, np.zeros(mgr.shape[ax], dtype=np.bool_))
+
+                if mgr.shape[ax] >= 3:
+                    assert_slice_ok(mgr, ax, np.arange(mgr.shape[ax]) % 3 == 0)
+                    assert_slice_ok(
+                        mgr, ax, np.array([True, True, False], dtype=np.bool_)
+                    )
+
+    @pytest.mark.parametrize("mgr", MANAGERS)
+    def test_take(self, mgr):
+        def assert_take_ok(mgr, axis, indexer):
+            mat = _as_array(mgr)
+            taken = mgr.take(indexer, axis)
+            tm.assert_numpy_array_equal(
+                np.take(mat, indexer, axis), _as_array(taken), check_dtype=False
+            )
+            tm.assert_index_equal(mgr.axes[axis].take(indexer), taken.axes[axis])
+
+        for ax in range(mgr.ndim):
+            # take/fancy indexer
+            assert_take_ok(mgr, ax, indexer=np.array([], dtype=np.intp))
+            assert_take_ok(mgr, ax, indexer=np.array([0, 0, 0], dtype=np.intp))
+            assert_take_ok(
+                mgr, ax, indexer=np.array(list(range(mgr.shape[ax])), dtype=np.intp)
+            )
+
+            if mgr.shape[ax] >= 3:
+                assert_take_ok(mgr, ax, indexer=np.array([0, 1, 2], dtype=np.intp))
+                assert_take_ok(mgr, ax, indexer=np.array([-1, -2, -3], dtype=np.intp))
+
+    @pytest.mark.parametrize("mgr", MANAGERS)
+    @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
+    def test_reindex_axis(self, fill_value, mgr):
+        def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
+            mat = _as_array(mgr)
+            indexer = mgr.axes[axis].get_indexer_for(new_labels)
+
+            reindexed = mgr.reindex_axis(new_labels, axis, fill_value=fill_value)
+            tm.assert_numpy_array_equal(
+                algos.take_nd(mat, indexer, axis, fill_value=fill_value),
+                _as_array(reindexed),
+                check_dtype=False,
+            )
+            tm.assert_index_equal(reindexed.axes[axis], new_labels)
+
+        for ax in range(mgr.ndim):
+            assert_reindex_axis_is_ok(mgr, ax, Index([]), fill_value)
+            assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax], fill_value)
+            assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][[0, 0, 0]], fill_value)
+            assert_reindex_axis_is_ok(mgr, ax, Index(["foo", "bar", "baz"]), fill_value)
+            assert_reindex_axis_is_ok(
+                mgr, ax, Index(["foo", mgr.axes[ax][0], "baz"]), fill_value
+            )
+
+            if mgr.shape[ax] >= 3:
+                assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][:-3], fill_value)
+                assert_reindex_axis_is_ok(mgr, ax, mgr.axes[ax][-3::-1], fill_value)
+                assert_reindex_axis_is_ok(
+                    mgr, ax, mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value
+                )
+
+    @pytest.mark.parametrize("mgr", MANAGERS)
+    @pytest.mark.parametrize("fill_value", [None, np.nan, 100.0])
+    def test_reindex_indexer(self, fill_value, mgr):
+        def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
+            mat = _as_array(mgr)
+            reindexed_mat = algos.take_nd(mat, indexer, axis, fill_value=fill_value)
+            reindexed = mgr.reindex_indexer(
+                new_labels, indexer, axis, fill_value=fill_value
+            )
+            tm.assert_numpy_array_equal(
+                reindexed_mat, _as_array(reindexed), check_dtype=False
+            )
+            tm.assert_index_equal(reindexed.axes[axis], new_labels)
+
+        for ax in range(mgr.ndim):
+            assert_reindex_indexer_is_ok(
+                mgr, ax, Index([]), np.array([], dtype=np.intp), fill_value
+            )
+            assert_reindex_indexer_is_ok(
+                mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value
+            )
+            assert_reindex_indexer_is_ok(
+                mgr,
+                ax,
+                Index(["foo"] * mgr.shape[ax]),
+                np.arange(mgr.shape[ax]),
+                fill_value,
+            )
+            assert_reindex_indexer_is_ok(
+                mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value
+            )
+            assert_reindex_indexer_is_ok(
+                mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value
+            )
+            assert_reindex_indexer_is_ok(
+                mgr, ax, Index(["foo", "bar", "baz"]), np.array([0, 0, 0]), fill_value
+            )
+            assert_reindex_indexer_is_ok(
+                mgr, ax, Index(["foo", "bar", "baz"]), np.array([-1, 0, -1]), fill_value
+            )
+            assert_reindex_indexer_is_ok(
+                mgr,
+                ax,
+                Index(["foo", mgr.axes[ax][0], "baz"]),
+                np.array([-1, -1, -1]),
+                fill_value,
+            )
+
+            if mgr.shape[ax] >= 3:
+                assert_reindex_indexer_is_ok(
+                    mgr,
+                    ax,
+                    Index(["foo", "bar", "baz"]),
+                    np.array([0, 1, 2]),
+                    fill_value,
+                )
+
+
+class TestBlockPlacement:
+    @pytest.mark.parametrize(
+        "slc, expected",
+        [
+            (slice(0, 4), 4),
+            (slice(0, 4, 2), 2),
+            (slice(0, 3, 2), 2),
+            (slice(0, 1, 2), 1),
+            (slice(1, 0, -1), 1),
+        ],
+    )
+    def test_slice_len(self, slc, expected):
+        assert len(BlockPlacement(slc)) == expected
+
+    @pytest.mark.parametrize("slc", [slice(1, 1, 0), slice(1, 2, 0)])
+    def test_zero_step_raises(self, slc):
+        msg = "slice step cannot be zero"
+        with pytest.raises(ValueError, match=msg):
+            BlockPlacement(slc)
+
+    def test_slice_canonize_negative_stop(self):
+        # GH#37524 negative stop is OK with negative step and positive start
+        slc = slice(3, -1, -2)
+
+        bp = BlockPlacement(slc)
+        assert bp.indexer == slice(3, None, -2)
+
+    @pytest.mark.parametrize(
+        "slc",
+        [
+            slice(None, None),
+            slice(10, None),
+            slice(None, None, -1),
+            slice(None, 10, -1),
+            # These are "unbounded" because negative index will
+            #  change depending on container shape.
+            slice(-1, None),
+            slice(None, -1),
+            slice(-1, -1),
+            slice(-1, None, -1),
+            slice(None, -1, -1),
+            slice(-1, -1, -1),
+        ],
+    )
+    def test_unbounded_slice_raises(self, slc):
+        msg = "unbounded slice"
+        with pytest.raises(ValueError, match=msg):
+            BlockPlacement(slc)
+
+    @pytest.mark.parametrize(
+        "slc",
+        [
+            slice(0, 0),
+            slice(100, 0),
+            slice(100, 100),
+            slice(100, 100, -1),
+            slice(0, 100, -1),
+        ],
+    )
+    def test_not_slice_like_slices(self, slc):
+        assert not BlockPlacement(slc).is_slice_like
+
+    @pytest.mark.parametrize(
+        "arr, slc",
+        [
+            ([0], slice(0, 1, 1)),
+            ([100], slice(100, 101, 1)),
+            ([0, 1, 2], slice(0, 3, 1)),
+            ([0, 5, 10], slice(0, 15, 5)),
+            ([0, 100], slice(0, 200, 100)),
+            ([2, 1], slice(2, 0, -1)),
+        ],
+    )
+    def test_array_to_slice_conversion(self, arr, slc):
+        assert BlockPlacement(arr).as_slice == slc
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            [],
+            [-1],
+            [-1, -2, -3],
+            [-10],
+            [-1, 0, 1, 2],
+            [-2, 0, 2, 4],
+            [1, 0, -1],
+            [1, 1, 1],
+        ],
+    )
+    def test_not_slice_like_arrays(self, arr):
+        assert not BlockPlacement(arr).is_slice_like
+
+    @pytest.mark.parametrize(
+        "slc, expected",
+        [(slice(0, 3), [0, 1, 2]), (slice(0, 0), []), (slice(3, 0), [])],
+    )
+    def test_slice_iter(self, slc, expected):
+        assert list(BlockPlacement(slc)) == expected
+
+    @pytest.mark.parametrize(
+        "slc, arr",
+        [
+            (slice(0, 3), [0, 1, 2]),
+            (slice(0, 0), []),
+            (slice(3, 0), []),
+            (slice(3, 0, -1), [3, 2, 1]),
+        ],
+    )
+    def test_slice_to_array_conversion(self, slc, arr):
+        tm.assert_numpy_array_equal(
+            BlockPlacement(slc).as_array, np.asarray(arr, dtype=np.intp)
+        )
+
+    def test_blockplacement_add(self):
+        bpl = BlockPlacement(slice(0, 5))
+        assert bpl.add(1).as_slice == slice(1, 6, 1)
+        assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
+        assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]
+
+    @pytest.mark.parametrize(
+        "val, inc, expected",
+        [
+            (slice(0, 0), 0, []),
+            (slice(1, 4), 0, [1, 2, 3]),
+            (slice(3, 0, -1), 0, [3, 2, 1]),
+            ([1, 2, 4], 0, [1, 2, 4]),
+            (slice(0, 0), 10, []),
+            (slice(1, 4), 10, [11, 12, 13]),
+            (slice(3, 0, -1), 10, [13, 12, 11]),
+            ([1, 2, 4], 10, [11, 12, 14]),
+            (slice(0, 0), -1, []),
+            (slice(1, 4), -1, [0, 1, 2]),
+            ([1, 2, 4], -1, [0, 1, 3]),
+        ],
+    )
+    def test_blockplacement_add_int(self, val, inc, expected):
+        assert list(BlockPlacement(val).add(inc)) == expected
+
+    @pytest.mark.parametrize("val", [slice(1, 4), [1, 2, 4]])
+    def test_blockplacement_add_int_raises(self, val):
+        msg = "iadd causes length change"
+        with pytest.raises(ValueError, match=msg):
+            BlockPlacement(val).add(-10)
+
+
+class TestCanHoldElement:
+    @pytest.fixture(
+        params=[
+            lambda x: x,
+            lambda x: x.to_series(),
+            lambda x: x._data,
+            lambda x: list(x),
+            lambda x: x.astype(object),
+            lambda x: np.asarray(x),
+            lambda x: x[0],
+            lambda x: x[:0],
+        ]
+    )
+    def element(self, request):
+        """
+        Functions that take an Index and return an element that should have
+        blk._can_hold_element(element) for a Block with this index's dtype.
+        """
+        return request.param
+
+    def test_datetime_block_can_hold_element(self):
+        block = create_block("datetime", [0])
+
+        assert block._can_hold_element([])
+
+        # We will check that block._can_hold_element iff arr.__setitem__ works
+        arr = pd.array(block.values.ravel())
+
+        # coerce None
+        assert block._can_hold_element(None)
+        arr[0] = None
+        assert arr[0] is pd.NaT
+
+        # coerce different types of datetime objects
+        vals = [np.datetime64("2010-10-10"), datetime(2010, 10, 10)]
+        for val in vals:
+            assert block._can_hold_element(val)
+            arr[0] = val
+
+        val = date(2010, 10, 10)
+        assert not block._can_hold_element(val)
+
+        msg = (
+            "value should be a 'Timestamp', 'NaT', "
+            "or array of those. Got 'date' instead."
+        )
+        with pytest.raises(TypeError, match=msg):
+            arr[0] = val
+
+    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
+    def test_interval_can_hold_element_emptylist(self, dtype, element):
+        arr = np.array([1, 3, 4], dtype=dtype)
+        ii = IntervalIndex.from_breaks(arr)
+        blk = new_block(ii._data, BlockPlacement([1]), ndim=2)
+
+        assert blk._can_hold_element([])
+        # TODO: check this holds for all blocks
+
+    @pytest.mark.parametrize("dtype", [np.int64, np.uint64, np.float64])
+    def test_interval_can_hold_element(self, dtype, element):
+        arr = np.array([1, 3, 4, 9], dtype=dtype)
+        ii = IntervalIndex.from_breaks(arr)
+        blk = new_block(ii._data, BlockPlacement([1]), ndim=2)
+
+        elem = element(ii)
+        self.check_series_setitem(elem, ii, True)
+        assert blk._can_hold_element(elem)
+
+        # Careful: to get the expected Series-inplace behavior we need
+        # `elem` to not have the same length as `arr`
+        ii2 = IntervalIndex.from_breaks(arr[:-1], closed="neither")
+        elem = element(ii2)
+        with pytest.raises(TypeError, match="Invalid value"):
+            self.check_series_setitem(elem, ii, False)
+        assert not blk._can_hold_element(elem)
+
+        ii3 = IntervalIndex.from_breaks([Timestamp(1), Timestamp(3), Timestamp(4)])
+        elem = element(ii3)
+        with pytest.raises(TypeError, match="Invalid value"):
+            self.check_series_setitem(elem, ii, False)
+        assert not blk._can_hold_element(elem)
+
+        ii4 = IntervalIndex.from_breaks([Timedelta(1), Timedelta(3), Timedelta(4)])
+        elem = element(ii4)
+        with pytest.raises(TypeError, match="Invalid value"):
+            self.check_series_setitem(elem, ii, False)
+        assert not blk._can_hold_element(elem)
+
+    def test_period_can_hold_element_emptylist(self):
+        pi = period_range("2016", periods=3, freq="Y")
+        blk = new_block(pi._data.reshape(1, 3), BlockPlacement([1]), ndim=2)
+
+        assert blk._can_hold_element([])
+
+    def test_period_can_hold_element(self, element):
+        pi = period_range("2016", periods=3, freq="Y")
+
+        elem = element(pi)
+        self.check_series_setitem(elem, pi, True)
+
+        # Careful: to get the expected Series-inplace behavior we need
+        # `elem` to not have the same length as `arr`
+        pi2 = pi.asfreq("D")[:-1]
+        elem = element(pi2)
+        with pytest.raises(TypeError, match="Invalid value"):
+            self.check_series_setitem(elem, pi, False)
+
+        dti = pi.to_timestamp("s")[:-1]
+        elem = element(dti)
+        with pytest.raises(TypeError, match="Invalid value"):
+            self.check_series_setitem(elem, pi, False)
+
+    def test_period_reindex_axis(self):
+        # GH#60273 Test reindexing of block with PeriodDtype
+        pi = period_range("2020", periods=5, freq="Y")
+        blk = new_block(pi._data.reshape(5, 1), BlockPlacement(slice(5)), ndim=2)
+        mgr = BlockManager(blocks=(blk,), axes=[Index(np.arange(5)), Index(["a"])])
+        reindexed = mgr.reindex_axis(Index([0, 2, 4]), axis=0)
+        result = DataFrame._from_mgr(reindexed, axes=reindexed.axes)
+        expected = DataFrame([[pi[0], pi[2], pi[4]]], columns=[0, 2, 4], index=["a"])
+        tm.assert_frame_equal(result, expected)
+
+    def check_can_hold_element(self, obj, elem, inplace: bool):
+        blk = obj._mgr.blocks[0]
+        if inplace:
+            assert blk._can_hold_element(elem)
+        else:
+            assert not blk._can_hold_element(elem)
+
+    def check_series_setitem(self, elem, index: Index, inplace: bool):
+        arr = index._data.copy()
+        ser = Series(arr, copy=False)
+
+        self.check_can_hold_element(ser, elem, inplace)
+
+        if is_scalar(elem):
+            ser[0] = elem
+        else:
+            ser[: len(elem)] = elem
+
+        if inplace:
+            assert ser._values is arr  # i.e. setting was done inplace
+        else:
+            assert ser.dtype == object
+
+
+class TestShouldStore:
+    def test_should_store_categorical(self):
+        cat = Categorical(["A", "B", "C"])
+        df = DataFrame(cat)
+        blk = df._mgr.blocks[0]
+
+        # matching dtype
+        assert blk.should_store(cat)
+        assert blk.should_store(cat[:-1])
+
+        # different dtype
+        assert not blk.should_store(cat.as_ordered())
+
+        # ndarray instead of Categorical
+        assert not blk.should_store(np.asarray(cat))
+
+
+def test_validate_ndim():
+    values = np.array([1.0, 2.0])
+    placement = BlockPlacement(slice(2))
+    msg = r"Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"
+
+    depr_msg = "make_block is deprecated"
+    with pytest.raises(ValueError, match=msg):
+        with tm.assert_produces_warning(Pandas4Warning, match=depr_msg):
+            make_block(values, placement, ndim=2)
+
+
+def test_block_shape():
+    idx = Index([0, 1, 2, 3, 4])
+    a = Series([1, 2, 3]).reindex(idx)
+    b = Series(Categorical([1, 2, 3])).reindex(idx)
+
+    assert a._mgr.blocks[0].mgr_locs.indexer == b._mgr.blocks[0].mgr_locs.indexer
+
+
+def test_make_block_no_pandas_array(block_maker):
+    # https://github.com/pandas-dev/pandas/pull/24866
+    arr = pd.arrays.NumpyExtensionArray(np.array([1, 2]))
+
+    depr_msg = "make_block is deprecated"
+    warn = DeprecationWarning if block_maker is make_block else None
+
+    # NumpyExtensionArray, no dtype
+    with tm.assert_produces_warning(warn, match=depr_msg):
+        result = block_maker(arr, BlockPlacement(slice(len(arr))), ndim=arr.ndim)
+    assert result.dtype.kind in ["i", "u"]
+
+    if block_maker is make_block:
+        # new_block requires caller to unwrap NumpyExtensionArray
+        assert result.is_extension is False
+
+        # NumpyExtensionArray, NumpyEADtype
+        with tm.assert_produces_warning(warn, match=depr_msg):
+            result = block_maker(arr, slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim)
+        assert result.dtype.kind in ["i", "u"]
+        assert result.is_extension is False
+
+        # new_block no longer accepts dtype keyword
+        # ndarray, NumpyEADtype
+        with tm.assert_produces_warning(warn, match=depr_msg):
+            result = block_maker(
+                arr.to_numpy(), slice(len(arr)), dtype=arr.dtype, ndim=arr.ndim
+            )
+        assert result.dtype.kind in ["i", "u"]
+        assert result.is_extension is False
diff --git a/pandas/tests/io/__init__.py b/pandas/tests/io/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ce44e87570a77aa31b2cc534ced2c2797e66c74
--- /dev/null
+++ b/pandas/tests/io/conftest.py
@@ -0,0 +1,197 @@
+import uuid
+
+import pytest
+
+from pandas.compat import (
+    is_ci_environment,
+    is_platform_arm,
+    is_platform_mac,
+    is_platform_windows,
+)
+import pandas.util._test_decorators as td
+
+import pandas.io.common as icom
+from pandas.io.parsers import read_csv
+
+
+@pytest.fixture
+def compression_to_extension():
+    return {value: key for key, value in icom.extension_to_compression.items()}
+
+
+@pytest.fixture
+def tips_file(datapath):
+    """Path to the tips dataset"""
+    return datapath("io", "data", "csv", "tips.csv")
+
+
+@pytest.fixture
+def jsonl_file(datapath):
+    """Path to a JSONL dataset"""
+    return datapath("io", "parser", "data", "items.jsonl")
+
+
+@pytest.fixture
+def salaries_table(datapath):
+    """DataFrame with the salaries dataset"""
+    return read_csv(datapath("io", "parser", "data", "salaries.csv"), sep="\t")
+
+
+@pytest.fixture
+def feather_file(datapath):
+    return datapath("io", "data", "feather", "feather-0_3_1.feather")
+
+
+@pytest.fixture
+def xml_file(datapath):
+    return datapath("io", "data", "xml", "books.xml")
+
+
+@pytest.fixture(scope="session")
+def aws_credentials(monkeysession):
+    """Mocked AWS Credentials for moto."""
+    monkeysession.setenv("AWS_ACCESS_KEY_ID", "testing")
+    monkeysession.setenv("AWS_SECRET_ACCESS_KEY", "testing")
+    monkeysession.setenv("AWS_SECURITY_TOKEN", "testing")
+    monkeysession.setenv("AWS_SESSION_AWS_SESSION_TOKEN", "testing")
+    monkeysession.setenv("AWS_DEFAULT_REGION", "us-east-1")
+
+
+@pytest.fixture(scope="session")
+def moto_server(aws_credentials):
+    # use service container for Linux on GitHub Actions
+    if is_ci_environment() and not (
+        is_platform_mac() or is_platform_arm() or is_platform_windows()
+    ):
+        yield "http://localhost:5000"
+    else:
+        moto_server = pytest.importorskip("moto.server")
+        server = moto_server.ThreadedMotoServer(port=0)
+        server.start()
+        host, port = server.get_host_and_port()
+        yield f"http://{host}:{port}"
+        server.stop()
+
+
+@pytest.fixture
+def moto_s3_resource(moto_server):
+    boto3 = pytest.importorskip("boto3")
+    s3 = boto3.resource("s3", endpoint_url=moto_server)
+    return s3
+
+
+@pytest.fixture(scope="session")
+def s3so(moto_server):
+    return {
+        "client_kwargs": {
+            "endpoint_url": moto_server,
+        }
+    }
+
+
+@pytest.fixture
+def s3_bucket_public(moto_s3_resource):
+    """
+    Create a public S3 bucket using moto.
+    """
+    bucket_name = f"pandas-test-{uuid.uuid4()}"
+    bucket = moto_s3_resource.Bucket(bucket_name)
+    bucket.create(ACL="public-read")
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
+
+@pytest.fixture
+def s3_bucket_private(moto_s3_resource):
+    """
+    Create a private S3 bucket using moto.
+    """
+    bucket_name = f"cant_get_it-{uuid.uuid4()}"
+    bucket = moto_s3_resource.Bucket(bucket_name)
+    bucket.create(ACL="private")
+    yield bucket
+    bucket.objects.delete()
+    bucket.delete()
+
+
+@pytest.fixture
+def s3_bucket_public_with_data(
+    s3_bucket_public, tips_file, jsonl_file, feather_file, xml_file
+):
+    """
+    The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+    """
+    test_s3_files = [
+        ("tips#1.csv", tips_file),
+        ("tips.csv", tips_file),
+        ("tips.csv.gz", tips_file + ".gz"),
+        ("tips.csv.bz2", tips_file + ".bz2"),
+        ("items.jsonl", jsonl_file),
+        ("simple_dataset.feather", feather_file),
+        ("books.xml", xml_file),
+    ]
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_bucket_public.put_object(Key=s3_key, Body=f)
+    return s3_bucket_public
+
+
+@pytest.fixture
+def s3_bucket_private_with_data(
+    s3_bucket_private, tips_file, jsonl_file, feather_file, xml_file
+):
+    """
+    The following datasets
+    are loaded.
+
+    - tips.csv
+    - tips.csv.gz
+    - tips.csv.bz2
+    - items.jsonl
+    """
+    test_s3_files = [
+        ("tips#1.csv", tips_file),
+        ("tips.csv", tips_file),
+        ("tips.csv.gz", tips_file + ".gz"),
+        ("tips.csv.bz2", tips_file + ".bz2"),
+        ("items.jsonl", jsonl_file),
+        ("simple_dataset.feather", feather_file),
+        ("books.xml", xml_file),
+    ]
+    for s3_key, file_name in test_s3_files:
+        with open(file_name, "rb") as f:
+            s3_bucket_private.put_object(Key=s3_key, Body=f)
+    return s3_bucket_private
+
+
+_compression_formats_params = [
+    (".no_compress", None),
+    ("", None),
+    (".gz", "gzip"),
+    (".GZ", "gzip"),
+    (".bz2", "bz2"),
+    (".BZ2", "bz2"),
+    (".zip", "zip"),
+    (".ZIP", "zip"),
+    (".xz", "xz"),
+    (".XZ", "xz"),
+    pytest.param((".zst", "zstd"), marks=td.skip_if_no("zstandard")),
+    pytest.param((".ZST", "zstd"), marks=td.skip_if_no("zstandard")),
+]
+
+
+@pytest.fixture(params=_compression_formats_params[1:])
+def compression_format(request):
+    return request.param
+
+
+@pytest.fixture(params=_compression_formats_params)
+def compression_ext(request):
+    return request.param[0]
diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..04f176a550edf0b927727788e958421b422c26d0
--- /dev/null
+++ b/pandas/tests/io/generate_legacy_storage_files.py
@@ -0,0 +1,421 @@
+"""
+self-contained to write legacy storage pickle files
+
+To use this script. Create an environment where you want
+generate pickles, say its for 0.20.3, with your pandas clone
+in ~/pandas
+
+. activate pandas_0.20.3
+cd ~/pandas/pandas
+
+$ python -m tests.io.generate_legacy_storage_files \
+    tests/io/data/legacy_pickle/0.20.3/ pickle
+
+This script generates a storage file for the current arch, system,
+and python version
+  pandas version: 0.20.3
+  output dir    : pandas/pandas/tests/io/data/legacy_pickle/0.20.3/
+  storage format: pickle
+created pickle file: 0.20.3_x86_64_darwin_3.5.2.pickle
+
+The idea here is you are using the *current* version of the
+generate_legacy_storage_files with an *older* version of pandas to
+generate a pickle file. We will then check this file into a current
+branch, and test using test_pickle.py. This will load the *older*
+pickles and test versus the current data that is generated
+(with main). These are then compared.
+
+If we have cases where we changed the signature (e.g. we renamed
+offset -> freq in Timestamp). Then we have to conditionally execute
+in the generate_legacy_storage_files.py to make it
+run under the older AND the newer version.
+
+"""
+
+from datetime import timedelta
+import os
+import pickle
+import platform as pl
+import sys
+
+# Remove script directory from path, otherwise Python will try to
+# import the JSON test directory as the json module
+sys.path.pop(0)
+
+import numpy as np
+
+import pandas
+from pandas import (
+    Categorical,
+    DataFrame,
+    Index,
+    MultiIndex,
+    NaT,
+    Period,
+    RangeIndex,
+    Series,
+    Timestamp,
+    bdate_range,
+    date_range,
+    interval_range,
+    period_range,
+    timedelta_range,
+)
+from pandas.arrays import SparseArray
+
+from pandas.tseries.offsets import (
+    FY5253,
+    BusinessDay,
+    BusinessHour,
+    CustomBusinessDay,
+    DateOffset,
+    Day,
+    Easter,
+    Hour,
+    LastWeekOfMonth,
+    Minute,
+    MonthBegin,
+    MonthEnd,
+    QuarterBegin,
+    QuarterEnd,
+    SemiMonthBegin,
+    SemiMonthEnd,
+    Week,
+    WeekOfMonth,
+    YearBegin,
+    YearEnd,
+)
+
+
+def _create_sp_series():
+    nan = np.nan
+
+    # nan-based
+    arr = np.arange(15, dtype=np.float64)
+    arr[7:12] = nan
+    arr[-1:] = nan
+
+    bseries = Series(SparseArray(arr, kind="block"))
+    bseries.name = "bseries"
+    return bseries
+
+
+def _create_sp_tsseries():
+    nan = np.nan
+
+    # nan-based
+    arr = np.arange(15, dtype=np.float64)
+    arr[7:12] = nan
+    arr[-1:] = nan
+
+    date_index = bdate_range("1/1/2011", periods=len(arr))
+    bseries = Series(SparseArray(arr, kind="block"), index=date_index)
+    bseries.name = "btsseries"
+    return bseries
+
+
+def _create_sp_frame():
+    nan = np.nan
+
+    data = {
+        "A": [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
+        "B": [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
+        "C": np.arange(10).astype(np.int64),
+        "D": [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
+    }
+
+    dates = bdate_range("1/1/2011", periods=10)
+    return DataFrame(data, index=dates).apply(SparseArray)
+
+
+def create_pickle_data(test: bool = True):
+    """create the pickle data"""
+    data = {
+        "A": [0.0, 1.0, 2.0, 3.0, np.nan],
+        "B": [0, 1, 0, 1, 0],
+        "C": ["foo1", "foo2", "foo3", "foo4", "foo5"],
+        "D": date_range("1/1/2009", periods=5),
+        "E": [0.0, 1, Timestamp("20100101"), "foo", 2.0],
+    }
+
+    scalars = {"timestamp": Timestamp("20130101"), "period": Period("2012", "M")}
+
+    index = {
+        "int": Index(np.arange(10)),
+        "date": date_range("20130101", periods=10),
+        "period": period_range("2013-01-01", freq="M", periods=10),
+        "float": Index(np.arange(10, dtype=np.float64)),
+        "uint": Index(np.arange(10, dtype=np.uint64)),
+        "timedelta": timedelta_range("00:00:00", freq="30min", periods=10),
+        "string": Index(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
+    }
+
+    index["range"] = RangeIndex(10)
+
+    index["interval"] = interval_range(0, periods=10)
+
+    mi = {
+        "reg2": MultiIndex.from_tuples(
+            tuple(
+                zip(
+                    *[
+                        ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+                        ["one", "two", "one", "two", "one", "two", "one", "two"],
+                    ]
+                )
+            ),
+            names=["first", "second"],
+        )
+    }
+
+    series = {
+        "float": Series(data["A"]),
+        "int": Series(data["B"]),
+        "mixed": Series(data["E"]),
+        "ts": Series(
+            np.arange(10).astype(np.int64), index=date_range("20130101", periods=10)
+        ),
+        "mi": Series(
+            np.arange(5).astype(np.float64),
+            index=MultiIndex.from_tuples(
+                tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), names=["one", "two"]
+            ),
+        ),
+        "dup": Series(np.arange(5).astype(np.float64), index=["A", "B", "C", "D", "A"]),
+        "cat": Series(Categorical(["foo", "bar", "baz"])),
+        "dt": Series(date_range("20130101", periods=5)),
+        "dt_tz": Series(date_range("20130101", periods=5, tz="US/Eastern")),
+        "period": Series([Period("2000Q1")] * 5),
+        "string": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
+    }
+
+    mixed_dup_df = DataFrame(data)
+    mixed_dup_df.columns = list("ABCDA")
+    frame = {
+        "float": DataFrame({"A": series["float"], "B": series["float"] + 1}),
+        "int": DataFrame({"A": series["int"], "B": series["int"] + 1}),
+        "mixed": DataFrame({k: data[k] for k in ["A", "B", "C", "D"]}),
+        "mi": DataFrame(
+            {"A": np.arange(5).astype(np.float64), "B": np.arange(5).astype(np.int64)},
+            index=MultiIndex.from_tuples(
+                tuple(
+                    zip(
+                        *[
+                            ["bar", "bar", "baz", "baz", "baz"],
+                            ["one", "two", "one", "two", "three"],
+                        ]
+                    )
+                ),
+                names=["first", "second"],
+            ),
+        ),
+        "dup": DataFrame(
+            np.arange(15).reshape(5, 3).astype(np.float64), columns=["A", "B", "A"]
+        ),
+        "cat_onecol": DataFrame({"A": Categorical(["foo", "bar"])}),
+        "cat_and_float": DataFrame(
+            {
+                "A": Categorical(["foo", "bar", "baz"]),
+                "B": np.arange(3).astype(np.int64),
+            }
+        ),
+        "mixed_dup": mixed_dup_df,
+        "dt_mixed_tzs": DataFrame(
+            {
+                "A": Timestamp("20130102", tz="US/Eastern"),
+                "B": Timestamp("20130603", tz="CET"),
+            },
+            index=range(5),
+        ),
+        "dt_mixed2_tzs": DataFrame(
+            {
+                "A": Timestamp("20130102", tz="US/Eastern"),
+                "B": Timestamp("20130603", tz="CET"),
+                "C": Timestamp("20130603", tz="UTC"),
+            },
+            index=range(5),
+        ),
+        "string": DataFrame(
+            {
+                "A": Series(["foo", "bar", "baz", "qux", "quux"], dtype="string"),
+                "B": Series(["one", "two", "one", "two", "three"], dtype="string"),
+            }
+        ),
+    }
+
+    cat = {
+        "int8": Categorical(list("abcdefg")),
+        "int16": Categorical(np.arange(1000)),
+        "int32": Categorical(np.arange(10000)),
+    }
+
+    timestamp = {
+        "normal": Timestamp("2011-01-01"),
+        "nat": NaT,
+        "tz": Timestamp("2011-01-01", tz="US/Eastern"),
+    }
+    if test:
+        # kept because those are present in the legacy pickles (<= 1.4)
+        timestamp["freq"] = Timestamp("2011-01-01")
+        timestamp["both"] = Timestamp("2011-01-01", tz="Asia/Tokyo")
+
+    off = {
+        "DateOffset": DateOffset(years=1),
+        "DateOffset_h_ns": DateOffset(hour=6, nanoseconds=5824),
+        "BusinessDay": BusinessDay(offset=timedelta(seconds=9)),
+        "BusinessHour": BusinessHour(normalize=True, n=6, end="15:14"),
+        "CustomBusinessDay": CustomBusinessDay(weekmask="Mon Fri"),
+        "SemiMonthBegin": SemiMonthBegin(day_of_month=9),
+        "SemiMonthEnd": SemiMonthEnd(day_of_month=24),
+        "MonthBegin": MonthBegin(1),
+        "MonthEnd": MonthEnd(1),
+        "QuarterBegin": QuarterBegin(1),
+        "QuarterEnd": QuarterEnd(1),
+        "Day": Day(1),
+        "YearBegin": YearBegin(1),
+        "YearEnd": YearEnd(1),
+        "Week": Week(1),
+        "Week_Tues": Week(2, normalize=False, weekday=1),
+        "WeekOfMonth": WeekOfMonth(week=3, weekday=4),
+        "LastWeekOfMonth": LastWeekOfMonth(n=1, weekday=3),
+        "FY5253": FY5253(n=2, weekday=6, startingMonth=7, variation="last"),
+        "Easter": Easter(),
+        "Hour": Hour(1),
+        "Minute": Minute(1),
+    }
+
+    return {
+        "series": series,
+        "frame": frame,
+        "index": index,
+        "scalars": scalars,
+        "mi": mi,
+        "sp_series": {"float": _create_sp_series(), "ts": _create_sp_tsseries()},
+        "sp_frame": {"float": _create_sp_frame()},
+        "cat": cat,
+        "timestamp": timestamp,
+        "offsets": off,
+    }
+
+
+def create_dataframe_all_types():
+    timestamps = Series(
+        [
+            Timestamp("2013-01-01"),
+            NaT,
+            Timestamp("2013-01-03"),
+            Timestamp("2013-01-04"),
+            Timestamp("2013-01-05"),
+        ]
+    )
+    timedeltas = timestamps - timestamps[0]
+
+    data = {
+        # "string": Series(
+        #     ["a", "b", "c", None, "e"], dtype=StringDtype(na_value=np.nan)
+        # ),
+        # "object": Series(["a", "b", "c", None, "e"], dtype=object),
+        # "object_nan": Series(["a", "b", "c", np.nan, "e"], dtype=object),
+        "int": list(range(1, 6)),
+        "uint64": np.arange(3, 8).astype("uint64"),
+        "float": [0.1, 0.2, 0.3, 0.4, np.nan],
+        "float32": Series([0.1, 0.2, 0.3, 0.4, np.nan], dtype="float32"),
+        "bool": [True, False, True, False, True],
+        "datetime_ns": timestamps.dt.as_unit("ns"),
+        "datetime_us": timestamps.dt.as_unit("us"),
+        "datetime_ms": timestamps.dt.as_unit("ms"),
+        "datetime_s": timestamps.dt.as_unit("s"),
+        "datetimetz_ns": timestamps.dt.tz_localize("US/Eastern").dt.as_unit("ns"),
+        "datetimetz_us": timestamps.dt.tz_localize("US/Eastern").dt.as_unit("us"),
+        "timedelta_ns": timedeltas.dt.as_unit("ns"),
+        "timedelta_us": timedeltas.dt.as_unit("us"),
+        "timedelta_ms": timedeltas.dt.as_unit("ms"),
+        "timedelta_s": timedeltas.dt.as_unit("s"),
+        # "categorical": Categorical(
+        #     Series(
+        #         ["foo", "bar", "baz",np.nan,"foo"],dtype=StringDtype(na_value=np.nan)
+        #     )
+        # ),
+        # "categorical_object": Categorical(
+        #     Series(["foo", "bar", "baz", np.nan, "foo"], dtype=object)
+        # ),
+        "categorical_int": Categorical([1, 2, 3, np.nan, 1]),
+    }
+    return DataFrame(data)
+
+
+def platform_name():
+    return "_".join(
+        [
+            str(pandas.__version__),
+            str(pl.machine()),
+            str(pl.system().lower()),
+            str(pl.python_version()),
+        ]
+    )
+
+
+def write_legacy_pickles(output_dir):
+    pth = f"{platform_name()}.pickle"
+
+    with open(os.path.join(output_dir, pth), "wb") as fh:
+        pickle.dump(create_pickle_data(test=False), fh, pickle.DEFAULT_PROTOCOL)
+
+    print(f"created pickle file: {pth}")
+
+
+def write_legacy_hdf(output_dir, format):
+    import tables
+
+    pth = f"{platform_name()}_pytables-{tables.__version__}_{format}.h5"
+
+    df = create_dataframe_all_types()
+    if format == "fixed":
+        # df = df.drop(columns=["categorical", "categorical_object", "categorical_int"])
+        df = df.drop(columns=["categorical_int"])
+    complevel = 9 if format == "table" else None
+    df.to_hdf(
+        os.path.join(output_dir, pth),
+        key="df_alltypes",
+        format=format,
+        complevel=complevel,
+    )
+
+    print(f"created hdf file: {pth}")
+
+
+def write_legacy_file():
+    # force our cwd to be the first searched
+    sys.path.insert(0, "")
+
+    if not 3 <= len(sys.argv) <= 4:
+        sys.exit(
+            "Specify output directory and storage type: generate_legacy_"
+            "storage_files.py <output_dir> <storage_type> "
+        )
+
+    output_dir = str(sys.argv[1])
+    storage_type = str(sys.argv[2])
+
+    print(
+        "This script generates a storage file for the current arch, system, "
+        "and python version"
+    )
+    print(f"  pandas version: {pandas.__version__}")
+    print(f"  output dir    : {output_dir}")
+    print(f"  storage format: {storage_type}")
+
+    if not os.path.exists(output_dir):
+        os.mkdir(output_dir)
+
+    if storage_type == "pickle":
+        write_legacy_pickles(output_dir=output_dir)
+    elif storage_type == "hdf":
+        write_legacy_hdf(output_dir=output_dir, format="fixed")
+        write_legacy_hdf(output_dir=output_dir, format="table")
+    else:
+        sys.exit("storage_type must be one of {'pickle', 'hdf'}")
+
+
+if __name__ == "__main__":
+    write_legacy_file()
diff --git a/pandas/tests/io/test_clipboard.py b/pandas/tests/io/test_clipboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..25834c47c09c67c3feadd7817a4902e6bdeff378
--- /dev/null
+++ b/pandas/tests/io/test_clipboard.py
@@ -0,0 +1,402 @@
+from textwrap import dedent
+
+import numpy as np
+import pytest
+
+from pandas.errors import (
+    PyperclipException,
+    PyperclipWindowsException,
+)
+
+import pandas as pd
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+    get_option,
+    read_clipboard,
+)
+import pandas._testing as tm
+
+from pandas.io.clipboard import (
+    CheckedCall,
+    _stringifyText,
+    init_qt_clipboard,
+)
+
+
+def build_kwargs(sep, excel):
+    kwargs = {}
+    if excel != "default":
+        kwargs["excel"] = excel
+    if sep != "default":
+        kwargs["sep"] = sep
+    return kwargs
+
+
+@pytest.fixture(
+    params=[
+        "delims",
+        "utf8",
+        "utf16",
+        "string",
+        "long",
+        "nonascii",
+        "colwidth",
+        "mixed",
+        "float",
+        "int",
+    ]
+)
+def df(request):
+    data_type = request.param
+
+    if data_type == "delims":
+        return DataFrame({"a": ['"a,\t"b|c', "d\tef`"], "b": ["hi'j", "k''lm"]})
+    elif data_type == "utf8":
+        return DataFrame({"a": ["µasd", "Ωœ∑`"], "b": ["øπ∆˚¬", "œ∑`®"]})
+    elif data_type == "utf16":
+        return DataFrame(
+            {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]}
+        )
+    elif data_type == "string":
+        return DataFrame(
+            np.array([f"i-{i}" for i in range(15)]).reshape(5, 3), columns=list("abc")
+        )
+    elif data_type == "long":
+        max_rows = get_option("display.max_rows")
+        return DataFrame(
+            np.random.default_rng(2).integers(0, 10, size=(max_rows + 1, 3)),
+            columns=list("abc"),
+        )
+    elif data_type == "nonascii":
+        return DataFrame({"en": "in English".split(), "es": "en español".split()})
+    elif data_type == "colwidth":
+        _cw = get_option("display.max_colwidth") + 1
+        return DataFrame(
+            np.array(["x" * _cw for _ in range(15)]).reshape(5, 3), columns=list("abc")
+        )
+    elif data_type == "mixed":
+        return DataFrame(
+            {
+                "a": np.arange(1.0, 6.0) + 0.01,
+                "b": np.arange(1, 6).astype(np.int64),
+                "c": list("abcde"),
+            }
+        )
+    elif data_type == "float":
+        return DataFrame(np.random.default_rng(2).random((5, 3)), columns=list("abc"))
+    elif data_type == "int":
+        return DataFrame(
+            np.random.default_rng(2).integers(0, 10, (5, 3)), columns=list("abc")
+        )
+    else:
+        raise ValueError
+
+
+@pytest.fixture
+def mock_ctypes(monkeypatch):
+    """
+    Mocks WinError to help with testing the clipboard.
+    """
+
+    def _mock_win_error():
+        return "Window Error"
+
+    # Set raising to False because WinError won't exist on non-windows platforms
+    with monkeypatch.context() as m:
+        m.setattr("ctypes.WinError", _mock_win_error, raising=False)
+        yield
+
+
+@pytest.mark.usefixtures("mock_ctypes")
+def test_checked_call_with_bad_call(monkeypatch):
+    """
+    Give CheckCall a function that returns a falsey value and
+    mock get_errno so it returns false so an exception is raised.
+    """
+
+    def _return_false():
+        return False
+
+    monkeypatch.setattr("pandas.io.clipboard.get_errno", lambda: True)
+    msg = f"Error calling {_return_false.__name__} \\(Window Error\\)"
+
+    with pytest.raises(PyperclipWindowsException, match=msg):
+        CheckedCall(_return_false)()
+
+
+@pytest.mark.usefixtures("mock_ctypes")
+def test_checked_call_with_valid_call(monkeypatch):
+    """
+    Give CheckCall a function that returns a truthy value and
+    mock get_errno so it returns true so an exception is not raised.
+    The function should return the results from _return_true.
+    """
+
+    def _return_true():
+        return True
+
+    monkeypatch.setattr("pandas.io.clipboard.get_errno", lambda: False)
+
+    # Give CheckedCall a callable that returns a truthy value s
+    checked_call = CheckedCall(_return_true)
+    assert checked_call() is True
+
+
+@pytest.mark.parametrize(
+    "text",
+    [
+        "String_test",
+        True,
+        1,
+        1.0,
+        1j,
+    ],
+)
+def test_stringify_text(text):
+    valid_types = (str, int, float, bool)
+
+    if isinstance(text, valid_types):
+        result = _stringifyText(text)
+        assert result == str(text)
+    else:
+        msg = (
+            "only str, int, float, and bool values "
+            f"can be copied to the clipboard, not {type(text).__name__}"
+        )
+        with pytest.raises(PyperclipException, match=msg):
+            _stringifyText(text)
+
+
+@pytest.fixture
+def set_pyqt_clipboard(monkeypatch):
+    qt_cut, qt_paste = init_qt_clipboard()
+    with monkeypatch.context() as m:
+        m.setattr(pd.io.clipboard, "clipboard_set", qt_cut)
+        m.setattr(pd.io.clipboard, "clipboard_get", qt_paste)
+        yield
+
+
+@pytest.fixture
+def clipboard(qapp):
+    clip = qapp.clipboard()
+    yield clip
+    clip.clear()
+
+
+@pytest.mark.single_cpu
+@pytest.mark.clipboard
+@pytest.mark.usefixtures("set_pyqt_clipboard")
+@pytest.mark.usefixtures("clipboard")
+class TestClipboard:
+    # Test that default arguments copy as tab delimited
+    # Test that explicit delimiters are respected
+    @pytest.mark.parametrize("sep", [None, "\t", ",", "|"])
+    @pytest.mark.parametrize("encoding", [None, "UTF-8", "utf-8", "utf8"])
+    def test_round_trip_frame_sep(self, df, sep, encoding):
+        df.to_clipboard(excel=None, sep=sep, encoding=encoding)
+        result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding)
+        tm.assert_frame_equal(df, result)
+
+    # Test white space separator
+    def test_round_trip_frame_string(self, df):
+        df.to_clipboard(excel=False, sep=None)
+        result = read_clipboard()
+        assert df.to_string() == result.to_string()
+        assert df.shape == result.shape
+
+    # Two character separator is not supported in to_clipboard
+    # Test that multi-character separators are not silently passed
+    def test_excel_sep_warning(self, df):
+        with tm.assert_produces_warning(
+            UserWarning,
+            match="to_clipboard in excel mode requires a single character separator.",
+            check_stacklevel=False,
+        ):
+            df.to_clipboard(excel=True, sep=r"\t")
+
+    # Separator is ignored when excel=False and should produce a warning
+    def test_copy_delim_warning(self, df):
+        with tm.assert_produces_warning(UserWarning, match="ignores the sep argument"):
+            df.to_clipboard(excel=False, sep="\t")
+
+    # Tests that the default behavior of to_clipboard is tab
+    # delimited and excel="True"
+    @pytest.mark.parametrize("sep", ["\t", None, "default"])
+    @pytest.mark.parametrize("excel", [True, None, "default"])
+    def test_clipboard_copy_tabs_default(self, sep, excel, df, clipboard):
+        kwargs = build_kwargs(sep, excel)
+        df.to_clipboard(**kwargs)
+        assert clipboard.text() == df.to_csv(sep="\t")
+
+    # Tests reading of white space separated tables
+    @pytest.mark.parametrize("sep", [None, "default"])
+    def test_clipboard_copy_strings(self, sep, df):
+        kwargs = build_kwargs(sep, False)
+        df.to_clipboard(**kwargs)
+        result = read_clipboard(sep=r"\s+")
+        assert result.to_string() == df.to_string()
+        assert df.shape == result.shape
+
+    def test_read_clipboard_infer_excel(self, clipboard):
+        # gh-19010: avoid warnings
+        clip_kwargs = {"engine": "python"}
+
+        text = dedent(
+            """
+            John James\tCharlie Mingus
+            1\t2
+            4\tHarry Carney
+            """.strip()
+        )
+        clipboard.setText(text)
+        df = read_clipboard(**clip_kwargs)
+
+        # excel data is parsed correctly
+        assert df.iloc[1, 1] == "Harry Carney"
+
+        # having diff tab counts doesn't trigger it
+        text = dedent(
+            """
+            a\t b
+            1  2
+            3  4
+            """.strip()
+        )
+        clipboard.setText(text)
+        res = read_clipboard(**clip_kwargs)
+
+        text = dedent(
+            """
+            a  b
+            1  2
+            3  4
+            """.strip()
+        )
+        clipboard.setText(text)
+        exp = read_clipboard(**clip_kwargs)
+
+        tm.assert_frame_equal(res, exp)
+
+    def test_infer_excel_with_nulls(self, clipboard):
+        # GH41108
+        text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen"
+
+        clipboard.setText(text)
+        df = read_clipboard()
+        df_expected = DataFrame(
+            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}
+        )
+
+        # excel data is parsed correctly
+        tm.assert_frame_equal(df, df_expected)
+
+    @pytest.mark.parametrize(
+        "multiindex",
+        [
+            (  # Can't use `dedent` here as it will remove the leading `\t`
+                "\n".join(
+                    [
+                        "\t\t\tcol1\tcol2",
+                        "A\t0\tTrue\t1\tred",
+                        "A\t1\tTrue\t\tblue",
+                        "B\t0\tFalse\t2\tgreen",
+                    ]
+                ),
+                [["A", "A", "B"], [0, 1, 0], [True, True, False]],
+            ),
+            (
+                "\n".join(
+                    ["\t\tcol1\tcol2", "A\t0\t1\tred", "A\t1\t\tblue", "B\t0\t2\tgreen"]
+                ),
+                [["A", "A", "B"], [0, 1, 0]],
+            ),
+        ],
+    )
+    def test_infer_excel_with_multiindex(self, clipboard, multiindex):
+        # GH41108
+
+        clipboard.setText(multiindex[0])
+        df = read_clipboard()
+        df_expected = DataFrame(
+            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]},
+            index=multiindex[1],
+        )
+
+        # excel data is parsed correctly
+        tm.assert_frame_equal(df, df_expected)
+
+    def test_invalid_encoding(self, df):
+        msg = "clipboard only supports utf-8 encoding"
+        # test case for testing invalid encoding
+        with pytest.raises(ValueError, match=msg):
+            df.to_clipboard(encoding="ascii")
+        with pytest.raises(NotImplementedError, match=msg):
+            read_clipboard(encoding="ascii")
+
+    @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑`...", "abcd..."])
+    def test_raw_roundtrip(self, data):
+        # PR #25040 wide unicode wasn't copied correctly on PY3 on windows
+        df = DataFrame({"data": [data]})
+        df.to_clipboard()
+        result = read_clipboard()
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("engine", ["c", "python"])
+    def test_read_clipboard_dtype_backend(
+        self, clipboard, string_storage, dtype_backend, engine, using_infer_string
+    ):
+        # GH#50502
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
+        text = """a,b,c,d,e,f,g,h,i
+x,1,4.0,x,2,4.0,,True,False
+y,2,5.0,,,,,False,"""
+        clipboard.setText(text)
+
+        with pd.option_context("mode.string_storage", string_storage):
+            result = read_clipboard(sep=",", dtype_backend=dtype_backend, engine=engine)
+
+        expected = DataFrame(
+            {
+                "a": Series(["x", "y"], dtype=string_dtype),
+                "b": Series([1, 2], dtype="Int64"),
+                "c": Series([4.0, 5.0], dtype="Float64"),
+                "d": Series(["x", None], dtype=string_dtype),
+                "e": Series([2, NA], dtype="Int64"),
+                "f": Series([4.0, NA], dtype="Float64"),
+                "g": Series([NA, NA], dtype="Int64"),
+                "h": Series([True, False], dtype="boolean"),
+                "i": Series([False, NA], dtype="boolean"),
+            }
+        )
+        if dtype_backend == "pyarrow":
+            from pandas.arrays import ArrowExtensionArray
+
+            expected = DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                    for col in expected.columns
+                }
+            )
+            expected["g"] = ArrowExtensionArray(pa.array([None, None]))
+
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_invalid_dtype_backend(self):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_clipboard(dtype_backend="numpy")
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5081109d2299799f9c982d8117f65157049d3ad
--- /dev/null
+++ b/pandas/tests/io/test_common.py
@@ -0,0 +1,688 @@
+"""
+Tests for the pandas.io.common functionalities
+"""
+
+import codecs
+import errno
+from functools import partial
+from io import (
+    BytesIO,
+    StringIO,
+    UnsupportedOperation,
+)
+import mmap
+import os
+from pathlib import Path
+import pickle
+import tempfile
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    WASM,
+    is_platform_windows,
+)
+from pandas.compat.pyarrow import pa_version_under19p0
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+import pandas.io.common as icom
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+class CustomFSPath:
+    """For testing fspath on unknown objects"""
+
+    def __init__(self, path) -> None:
+        self.path = path
+
+    def __fspath__(self):
+        return self.path
+
+
+HERE = os.path.abspath(os.path.dirname(__file__))
+
+
+# https://github.com/cython/cython/issues/1720
+class TestCommonIOCapabilities:
+    data1 = """index,A,B,C,D
+foo,2,3,4,5
+bar,7,8,9,10
+baz,12,13,14,15
+qux,12,13,14,15
+foo2,12,13,14,15
+bar2,12,13,14,15
+"""
+
+    def test_expand_user(self):
+        filename = "~/sometest"
+        expanded_name = icom._expand_user(filename)
+
+        assert expanded_name != filename
+        assert os.path.isabs(expanded_name)
+        assert os.path.expanduser(filename) == expanded_name
+
+    def test_expand_user_normal_path(self):
+        filename = "/somefolder/sometest"
+        expanded_name = icom._expand_user(filename)
+
+        assert expanded_name == filename
+        assert os.path.expanduser(filename) == expanded_name
+
+    def test_stringify_path_pathlib(self):
+        rel_path = icom.stringify_path(Path("."))
+        assert rel_path == "."
+        redundant_path = icom.stringify_path(Path("foo//bar"))
+        assert redundant_path == os.path.join("foo", "bar")
+
+    def test_stringify_path_fspath(self):
+        p = CustomFSPath("foo/bar.csv")
+        result = icom.stringify_path(p)
+        assert result == "foo/bar.csv"
+
+    def test_stringify_file_and_path_like(self, temp_file):
+        # GH 38125: do not stringify file objects that are also path-like
+        fsspec = pytest.importorskip("fsspec")
+        with fsspec.open(f"file://{temp_file}", mode="wb") as fsspec_obj:
+            assert fsspec_obj == icom.stringify_path(fsspec_obj)
+
+    @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path])
+    def test_infer_compression_from_path(self, compression_format, path_type):
+        extension, expected = compression_format
+        path = path_type("foo/bar.csv" + extension)
+        compression = icom.infer_compression(path, compression="infer")
+        assert compression == expected
+
+    @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path])
+    def test_get_handle_with_path(self, path_type):
+        with tempfile.TemporaryDirectory(dir=Path.home()) as tmp:
+            filename = path_type("~/" + Path(tmp).name + "/sometest")
+            with icom.get_handle(filename, "w") as handles:
+                assert Path(handles.handle.name).is_absolute()
+                assert os.path.expanduser(filename) == handles.handle.name
+
+    def test_get_handle_with_buffer(self):
+        with StringIO() as input_buffer:
+            with icom.get_handle(input_buffer, "r") as handles:
+                assert handles.handle == input_buffer
+            assert not input_buffer.closed
+        assert input_buffer.closed
+
+    # Test that BytesIOWrapper(get_handle) returns correct amount of bytes every time
+    def test_bytesiowrapper_returns_correct_bytes(self):
+        # Test latin1, ucs-2, and ucs-4 chars
+        data = """a,b,c
+1,2,3
+©,®,®
+Look,a snake,🐍"""
+        with icom.get_handle(StringIO(data), "rb", is_text=False) as handles:
+            result = b""
+            chunksize = 5
+            while True:
+                chunk = handles.handle.read(chunksize)
+                # Make sure each chunk is correct amount of bytes
+                assert len(chunk) <= chunksize
+                if len(chunk) < chunksize:
+                    # Can be less amount of bytes, but only at EOF
+                    # which happens when read returns empty
+                    assert len(handles.handle.read()) == 0
+                    result += chunk
+                    break
+                result += chunk
+            assert result == data.encode("utf-8")
+
+    # Test that pyarrow can handle a file opened with get_handle
+    def test_get_handle_pyarrow_compat(sel, using_infer_string):
+        pa_csv = pytest.importorskip("pyarrow.csv")
+
+        # Test latin1, ucs-2, and ucs-4 chars
+        data = """a,b,c
+1,2,3
+©,®,®
+Look,a snake,🐍"""
+        expected = pd.DataFrame(
+            {"a": ["1", "©", "Look"], "b": ["2", "®", "a snake"], "c": ["3", "®", "🐍"]}
+        )
+        s = StringIO(data)
+        with icom.get_handle(s, "rb", is_text=False) as handles:
+            df = pa_csv.read_csv(handles.handle).to_pandas()
+            if pa_version_under19p0:
+                expected = expected.astype("object")
+            elif not using_infer_string:
+                expected = expected.astype(pd.StringDtype(na_value=np.nan))
+            tm.assert_frame_equal(df, expected)
+            assert not s.closed
+
+    def test_iterator(self):
+        with pd.read_csv(StringIO(self.data1), chunksize=1) as reader:
+            result = pd.concat(reader, ignore_index=True)
+        expected = pd.read_csv(StringIO(self.data1))
+        tm.assert_frame_equal(result, expected)
+
+        # GH12153
+        with pd.read_csv(StringIO(self.data1), chunksize=1) as it:
+            first = next(it)
+            tm.assert_frame_equal(first, expected.iloc[[0]])
+            tm.assert_frame_equal(pd.concat(it), expected.iloc[1:])
+
+    @pytest.mark.skipif(WASM, reason="limited file system access on WASM")
+    @pytest.mark.parametrize(
+        "reader, module, error_class, fn_ext",
+        [
+            (pd.read_csv, "os", FileNotFoundError, "csv"),
+            (pd.read_fwf, "os", FileNotFoundError, "txt"),
+            (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"),
+            (pd.read_feather, "pyarrow", OSError, "feather"),
+            (pd.read_hdf, "tables", FileNotFoundError, "h5"),
+            (pd.read_stata, "os", FileNotFoundError, "dta"),
+            (pd.read_sas, "os", FileNotFoundError, "sas7bdat"),
+            (pd.read_json, "os", FileNotFoundError, "json"),
+            (pd.read_pickle, "os", FileNotFoundError, "pickle"),
+        ],
+    )
+    def test_read_non_existent(self, reader, module, error_class, fn_ext):
+        pytest.importorskip(module)
+
+        path = os.path.join(HERE, "data", "does_not_exist." + fn_ext)
+        msg1 = rf"File (b')?.+does_not_exist\.{fn_ext}'? does not exist"
+        msg2 = rf"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'"
+        msg3 = "Expected object or value"
+        msg4 = "path_or_buf needs to be a string file path or file-like"
+        msg5 = (
+            rf"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: "
+            rf"'.+does_not_exist\.{fn_ext}'"
+        )
+        msg6 = rf"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'"
+        msg7 = (
+            rf"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'"
+        )
+        msg8 = rf"Failed to open local file.+does_not_exist\.{fn_ext}"
+
+        with pytest.raises(
+            error_class,
+            match=rf"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7}|{msg8})",
+        ):
+            reader(path)
+
+    @pytest.mark.parametrize(
+        "method, module, error_class, fn_ext",
+        [
+            (pd.DataFrame.to_csv, "os", OSError, "csv"),
+            (pd.DataFrame.to_html, "os", OSError, "html"),
+            (pd.DataFrame.to_excel, "xlrd", OSError, "xlsx"),
+            (pd.DataFrame.to_feather, "pyarrow", OSError, "feather"),
+            (pd.DataFrame.to_parquet, "pyarrow", OSError, "parquet"),
+            (pd.DataFrame.to_stata, "os", OSError, "dta"),
+            (pd.DataFrame.to_json, "os", OSError, "json"),
+            (pd.DataFrame.to_pickle, "os", OSError, "pickle"),
+        ],
+    )
+    # NOTE: Missing parent directory for pd.DataFrame.to_hdf is handled by PyTables
+    def test_write_missing_parent_directory(self, method, module, error_class, fn_ext):
+        pytest.importorskip(module)
+
+        dummy_frame = pd.DataFrame({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
+
+        path = os.path.join(HERE, "data", "missing_folder", "does_not_exist." + fn_ext)
+
+        with pytest.raises(
+            error_class,
+            match=r"Cannot save file into a non-existent directory: .*missing_folder",
+        ):
+            method(dummy_frame, path)
+
+    @pytest.mark.skipif(WASM, reason="limited file system access on WASM")
+    @pytest.mark.parametrize(
+        "reader, module, error_class, fn_ext",
+        [
+            (pd.read_csv, "os", FileNotFoundError, "csv"),
+            (pd.read_table, "os", FileNotFoundError, "csv"),
+            (pd.read_fwf, "os", FileNotFoundError, "txt"),
+            (pd.read_excel, "xlrd", FileNotFoundError, "xlsx"),
+            (pd.read_feather, "pyarrow", OSError, "feather"),
+            (pd.read_hdf, "tables", FileNotFoundError, "h5"),
+            (pd.read_stata, "os", FileNotFoundError, "dta"),
+            (pd.read_sas, "os", FileNotFoundError, "sas7bdat"),
+            (pd.read_json, "os", FileNotFoundError, "json"),
+            (pd.read_pickle, "os", FileNotFoundError, "pickle"),
+        ],
+    )
+    def test_read_expands_user_home_dir(
+        self, reader, module, error_class, fn_ext, monkeypatch
+    ):
+        pytest.importorskip(module)
+
+        path = os.path.join("~", "does_not_exist." + fn_ext)
+        monkeypatch.setattr(icom, "_expand_user", lambda x: os.path.join("foo", x))
+
+        msg1 = rf"File (b')?.+does_not_exist\.{fn_ext}'? does not exist"
+        msg2 = rf"\[Errno 2\] No such file or directory: '.+does_not_exist\.{fn_ext}'"
+        msg3 = "Unexpected character found when decoding 'false'"
+        msg4 = "path_or_buf needs to be a string file path or file-like"
+        msg5 = (
+            rf"\[Errno 2\] File .+does_not_exist\.{fn_ext} does not exist: "
+            rf"'.+does_not_exist\.{fn_ext}'"
+        )
+        msg6 = rf"\[Errno 2\] 没有那个文件或目录: '.+does_not_exist\.{fn_ext}'"
+        msg7 = (
+            rf"\[Errno 2\] File o directory non esistente: '.+does_not_exist\.{fn_ext}'"
+        )
+        msg8 = rf"Failed to open local file.+does_not_exist\.{fn_ext}"
+
+        with pytest.raises(
+            error_class,
+            match=rf"({msg1}|{msg2}|{msg3}|{msg4}|{msg5}|{msg6}|{msg7}|{msg8})",
+        ):
+            reader(path)
+
+    @pytest.mark.parametrize(
+        "reader, module, path",
+        [
+            (pd.read_csv, "os", ("io", "data", "csv", "iris.csv")),
+            (pd.read_table, "os", ("io", "data", "csv", "iris.csv")),
+            (
+                pd.read_fwf,
+                "os",
+                ("io", "data", "fixed_width", "fixed_width_format.txt"),
+            ),
+            (pd.read_excel, "xlrd", ("io", "data", "excel", "test1.xlsx")),
+            (
+                pd.read_feather,
+                "pyarrow",
+                ("io", "data", "feather", "feather-0_3_1.feather"),
+            ),
+            (
+                pd.read_hdf,
+                "tables",
+                ("io", "data", "legacy_hdf", "pytables_native2.h5"),
+            ),
+            (pd.read_stata, "os", ("io", "data", "stata", "stata10_115.dta")),
+            (pd.read_sas, "os", ("io", "sas", "data", "test1.sas7bdat")),
+            (pd.read_json, "os", ("io", "json", "data", "tsframe_v012.json")),
+            (
+                pd.read_pickle,
+                "os",
+                ("io", "data", "pickle", "categorical.0.25.0.pickle"),
+            ),
+        ],
+    )
+    def test_read_fspath_all(self, reader, module, path, datapath):
+        pytest.importorskip(module)
+        path = datapath(*path)
+
+        mypath = CustomFSPath(path)
+        result = reader(mypath)
+        expected = reader(path)
+
+        if path.endswith(".pickle"):
+            # categorical
+            tm.assert_categorical_equal(result, expected)
+        else:
+            tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "writer_name, writer_kwargs, module",
+        [
+            ("to_csv", {}, "os"),
+            ("to_excel", {"engine": "openpyxl"}, "openpyxl"),
+            ("to_feather", {}, "pyarrow"),
+            ("to_html", {}, "os"),
+            ("to_json", {}, "os"),
+            ("to_latex", {}, "os"),
+            ("to_pickle", {}, "os"),
+            ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"),
+        ],
+    )
+    def test_write_fspath_all(self, writer_name, writer_kwargs, module, tmp_path):
+        if writer_name in ["to_latex"]:  # uses Styler implementation
+            pytest.importorskip("jinja2")
+        string = str(tmp_path / "string")
+        fspath = str(tmp_path / "fspath")
+        df = pd.DataFrame({"A": [1, 2]})
+
+        pytest.importorskip(module)
+        mypath = CustomFSPath(fspath)
+        writer = getattr(df, writer_name)
+
+        writer(string, **writer_kwargs)
+        writer(mypath, **writer_kwargs)
+        with open(string, "rb") as f_str, open(fspath, "rb") as f_path:
+            if writer_name == "to_excel":
+                # binary representation of excel contains time creation
+                # data that causes flaky CI failures
+                result = pd.read_excel(f_str, **writer_kwargs)
+                expected = pd.read_excel(f_path, **writer_kwargs)
+                tm.assert_frame_equal(result, expected)
+            else:
+                result = f_str.read()
+                expected = f_path.read()
+                assert result == expected
+
+    def test_write_fspath_hdf5(self, tmp_path):
+        # Same test as write_fspath_all, except HDF5 files aren't
+        # necessarily byte-for-byte identical for a given dataframe, so we'll
+        # have to read and compare equality
+        pytest.importorskip("tables")
+
+        df = pd.DataFrame({"A": [1, 2]})
+        string = str(tmp_path / "string")
+        fspath = str(tmp_path / "fspath")
+
+        mypath = CustomFSPath(fspath)
+        df.to_hdf(mypath, key="bar")
+        df.to_hdf(string, key="bar")
+
+        result = pd.read_hdf(fspath, key="bar")
+        expected = pd.read_hdf(string, key="bar")
+
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.fixture
+def mmap_file(datapath):
+    return datapath("io", "data", "csv", "test_mmap.csv")
+
+
+class TestMMapWrapper:
+    @pytest.mark.skipif(WASM, reason="limited file system access on WASM")
+    def test_constructor_bad_file(self, mmap_file):
+        non_file = StringIO("I am not a file")
+        non_file.fileno = lambda: -1
+
+        # the error raised is different on Windows
+        if is_platform_windows():
+            msg = "The parameter is incorrect"
+            err = OSError
+        else:
+            msg = "[Errno 22]"
+            err = mmap.error
+
+        with pytest.raises(err, match=msg):
+            icom._maybe_memory_map(non_file, True)
+
+        with open(mmap_file, encoding="utf-8") as target:
+            pass
+
+        msg = "I/O operation on closed file"
+        with pytest.raises(ValueError, match=msg):
+            icom._maybe_memory_map(target, True)
+
+    @pytest.mark.skipif(WASM, reason="limited file system access on WASM")
+    def test_next(self, mmap_file):
+        with open(mmap_file, encoding="utf-8") as target:
+            lines = target.readlines()
+
+            with icom.get_handle(
+                target, "r", is_text=True, memory_map=True
+            ) as wrappers:
+                wrapper = wrappers.handle
+                assert isinstance(wrapper.buffer.buffer, mmap.mmap)
+
+                for line in lines:
+                    next_line = next(wrapper)
+                    assert next_line.strip() == line.strip()
+
+                with pytest.raises(StopIteration, match=r"^$"):
+                    next(wrapper)
+
+    def test_unknown_engine(self, temp_file):
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.to_csv(temp_file)
+        with pytest.raises(ValueError, match="Unknown engine"):
+            pd.read_csv(temp_file, engine="pyt")
+
+    def test_binary_mode(self, temp_file):
+        """
+        'encoding' shouldn't be passed to 'open' in binary mode.
+
+        GH 35058
+        """
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.to_csv(temp_file, mode="w+b")
+        tm.assert_frame_equal(df, pd.read_csv(temp_file, index_col=0))
+
+    @pytest.mark.parametrize("encoding", ["utf-16", "utf-32"])
+    @pytest.mark.parametrize("compression_", ["bz2", "xz"])
+    def test_warning_missing_utf_bom(self, encoding, compression_, temp_file):
+        """
+        bz2 and xz do not write the byte order mark (BOM) for utf-16/32.
+
+        https://stackoverflow.com/questions/55171439
+
+        GH 35681
+        """
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        with tm.assert_produces_warning(UnicodeWarning, match="byte order mark"):
+            df.to_csv(temp_file, compression=compression_, encoding=encoding)
+
+        # reading should fail (otherwise we wouldn't need the warning)
+        msg = (
+            r"UTF-\d+ stream does not start with BOM|"
+            r"'utf-\d+' codec can't decode byte"
+        )
+        with pytest.raises(UnicodeError, match=msg):
+            pd.read_csv(temp_file, compression=compression_, encoding=encoding)
+
+
+def test_is_fsspec_url():
+    assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
+    assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
+    # the following is the only remote URL that is handled without fsspec
+    assert not icom.is_fsspec_url("http://pandas/somethingelse.com")
+    assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
+    assert not icom.is_fsspec_url("/local/path")
+    assert not icom.is_fsspec_url("relative/local/path")
+    # fsspec URL in string should not be recognized
+    assert not icom.is_fsspec_url("this is not fsspec://url")
+    assert not icom.is_fsspec_url("{'url': 'gs://pandas/somethingelse.com'}")
+    # accept everything that conforms to RFC 3986 schema
+    assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
+
+
+def test_is_fsspec_url_chained():
+    # GH#48978 Support chained fsspec URLs
+    # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining.
+    assert icom.is_fsspec_url("filecache::s3://pandas/test.csv")
+    assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip")
+    assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip")
+    assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv")
+    assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv")
+    assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv")
+    assert not icom.is_fsspec_url("filecache::://pandas/test.csv")
+
+
+@pytest.mark.parametrize("format", ["csv", "json"])
+def test_codecs_encoding(format, temp_file):
+    # GH39247
+    expected = pd.DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
+    )
+    with open(temp_file, mode="w", encoding="utf-8") as handle:
+        getattr(expected, f"to_{format}")(handle)
+    with open(temp_file, encoding="utf-8") as handle:
+        if format == "csv":
+            df = pd.read_csv(handle, index_col=0)
+        else:
+            df = pd.read_json(handle)
+    tm.assert_frame_equal(expected, df)
+
+
+def test_codecs_get_writer_reader(temp_file):
+    # GH39247
+    expected = pd.DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
+    )
+    with open(temp_file, "wb") as handle:
+        with codecs.getwriter("utf-8")(handle) as encoded:
+            expected.to_csv(encoded)
+    with open(temp_file, "rb") as handle:
+        with codecs.getreader("utf-8")(handle) as encoded:
+            df = pd.read_csv(encoded, index_col=0)
+    tm.assert_frame_equal(expected, df)
+
+
+@pytest.mark.parametrize(
+    "io_class,mode,msg",
+    [
+        (BytesIO, "t", "a bytes-like object is required, not 'str'"),
+        (StringIO, "b", "string argument expected, got 'bytes'"),
+    ],
+)
+def test_explicit_encoding(io_class, mode, msg):
+    # GH39247; this test makes sure that if a user provides mode="*t" or "*b",
+    # it is used. In the case of this test it leads to an error as intentionally the
+    # wrong mode is requested
+    expected = pd.DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
+    )
+    with io_class() as buffer:
+        with pytest.raises(TypeError, match=msg):
+            expected.to_csv(buffer, mode=f"w{mode}")
+
+
+@pytest.mark.parametrize("encoding_errors", ["strict", "replace"])
+@pytest.mark.parametrize("format", ["csv", "json"])
+def test_encoding_errors(encoding_errors, format, temp_file):
+    # GH39450
+    msg = "'utf-8' codec can't decode byte"
+    bad_encoding = b"\xe4"
+
+    if format == "csv":
+        content = b"," + bad_encoding + b"\n" + bad_encoding * 2 + b"," + bad_encoding
+        reader = partial(pd.read_csv, index_col=0)
+    else:
+        content = (
+            b'{"'
+            + bad_encoding * 2
+            + b'": {"'
+            + bad_encoding
+            + b'":"'
+            + bad_encoding
+            + b'"}}'
+        )
+        reader = partial(pd.read_json, orient="index")
+    file = temp_file
+    file.write_bytes(content)
+
+    if encoding_errors != "replace":
+        with pytest.raises(UnicodeDecodeError, match=msg):
+            reader(temp_file, encoding_errors=encoding_errors)
+    else:
+        df = reader(temp_file, encoding_errors=encoding_errors)
+        decoded = bad_encoding.decode(errors=encoding_errors)
+        expected = pd.DataFrame({decoded: [decoded]}, index=[decoded * 2])
+        tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("encoding_errors", [0, None])
+def test_encoding_errors_badtype(encoding_errors):
+    # GH 59075
+    content = StringIO("A,B\n1,2\n3,4\n")
+    reader = partial(pd.read_csv, encoding_errors=encoding_errors)
+    expected_error = "encoding_errors must be a string, got "
+    expected_error += f"{type(encoding_errors).__name__}"
+    with pytest.raises(ValueError, match=expected_error):
+        reader(content)
+
+
+def test_bad_encdoing_errors(temp_file):
+    # GH 39777
+    with pytest.raises(LookupError, match="unknown error handler name"):
+        icom.get_handle(temp_file, "w", errors="bad")
+
+
+@pytest.mark.skipif(WASM, reason="limited file system access on WASM")
+def test_errno_attribute():
+    # GH 13872
+    with pytest.raises(FileNotFoundError, match="\\[Errno 2\\]") as err:
+        pd.read_csv("doesnt_exist")
+        assert err.errno == errno.ENOENT
+
+
+def test_fail_mmap():
+    with pytest.raises(UnsupportedOperation, match="fileno"):
+        with BytesIO() as buffer:
+            icom.get_handle(buffer, "rb", memory_map=True)
+
+
+def test_close_on_error():
+    # GH 47136
+    class TestError:
+        def close(self):
+            raise OSError("test")
+
+    with pytest.raises(OSError, match="test"):
+        with BytesIO() as buffer:
+            with icom.get_handle(buffer, "rb") as handles:
+                handles.created_handles.append(TestError())
+
+
+@td.skip_if_no("fsspec")
+@pytest.mark.parametrize("compression", [None, "infer"])
+def test_read_csv_chained_url_no_error(datapath, compression):
+    # GH 60100
+    tar_file_path = datapath("io", "data", "tar", "test-csv.tar")
+    chained_file_url = f"tar://test.csv::file://{tar_file_path}"
+
+    result = pd.read_csv(chained_file_url, compression=compression, sep=";")
+    expected = pd.DataFrame({"1": {0: 3}, "2": {0: 4}})
+
+    tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+    "reader",
+    [
+        pd.read_csv,
+        pd.read_fwf,
+        pd.read_excel,
+        pd.read_feather,
+        pd.read_hdf,
+        pd.read_stata,
+        pd.read_sas,
+        pd.read_json,
+        pd.read_pickle,
+    ],
+)
+def test_pickle_reader(reader):
+    # GH 22265
+    with BytesIO() as buffer:
+        pickle.dump(reader, buffer)
+
+
+@td.skip_if_no("pyarrow")
+def test_pyarrow_read_csv_datetime_dtype():
+    # GH 59904
+    data = '"date"\n"20/12/2025"\n""\n"31/12/2020"'
+    result = pd.read_csv(
+        StringIO(data), parse_dates=["date"], dayfirst=True, dtype_backend="pyarrow"
+    )
+
+    expect_data = pd.to_datetime(["20/12/2025", pd.NaT, "31/12/2020"], dayfirst=True)
+    expect = pd.DataFrame({"date": expect_data})
+
+    tm.assert_frame_equal(expect, result)
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
new file mode 100644
index 0000000000000000000000000000000000000000..97b64a29a7f2cff7bfd5c1311fab9b40dcdf1a0a
--- /dev/null
+++ b/pandas/tests/io/test_compression.py
@@ -0,0 +1,382 @@
+import gzip
+import io
+import os
+import subprocess
+import sys
+import tarfile
+import textwrap
+import zipfile
+
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_windows
+
+import pandas as pd
+import pandas._testing as tm
+
+import pandas.io.common as icom
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_compression_size(obj, method, compression_only, temp_file):
+    if compression_only == "tar":
+        compression_only = {"method": "tar", "mode": "w:gz"}
+
+    path = temp_file
+    getattr(obj, method)(path, compression=compression_only)
+    compressed_size = os.path.getsize(path)
+    getattr(obj, method)(path, compression=None)
+    uncompressed_size = os.path.getsize(path)
+    assert uncompressed_size > compressed_size
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_csv", "to_json"])
+def test_compression_size_fh(obj, method, compression_only, temp_file):
+    path = temp_file
+    with icom.get_handle(
+        path,
+        "w:gz" if compression_only == "tar" else "w",
+        compression=compression_only,
+    ) as handles:
+        getattr(obj, method)(handles.handle)
+        assert not handles.handle.closed
+    compressed_size = os.path.getsize(path)
+
+    # Create a new temporary file for uncompressed comparison
+    path2 = temp_file.parent / f"{temp_file.stem}_uncompressed{temp_file.suffix}"
+    path2.touch()
+    with icom.get_handle(path2, "w", compression=None) as handles:
+        getattr(obj, method)(handles.handle)
+        assert not handles.handle.closed
+    uncompressed_size = os.path.getsize(path2)
+    assert uncompressed_size > compressed_size
+
+
+@pytest.mark.parametrize(
+    "write_method, write_kwargs, read_method",
+    [
+        ("to_csv", {"index": False}, pd.read_csv),
+        ("to_json", {}, pd.read_json),
+        ("to_pickle", {}, pd.read_pickle),
+    ],
+)
+def test_dataframe_compression_defaults_to_infer(
+    write_method,
+    write_kwargs,
+    read_method,
+    compression_only,
+    compression_to_extension,
+    temp_file,
+):
+    # GH22004
+    input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"])
+    extension = compression_to_extension[compression_only]
+    path = temp_file.parent / f"compressed{extension}"
+    getattr(input, write_method)(path, **write_kwargs)
+    output = read_method(path, compression=compression_only)
+    tm.assert_frame_equal(output, input)
+
+
+@pytest.mark.parametrize(
+    "write_method,write_kwargs,read_method,read_kwargs",
+    [
+        ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}),
+        ("to_json", {}, pd.read_json, {"typ": "series"}),
+        ("to_pickle", {}, pd.read_pickle, {}),
+    ],
+)
+def test_series_compression_defaults_to_infer(
+    write_method,
+    write_kwargs,
+    read_method,
+    read_kwargs,
+    compression_only,
+    compression_to_extension,
+    temp_file,
+):
+    # GH22004
+    input = pd.Series([0, 5, -2, 10], name="X")
+    extension = compression_to_extension[compression_only]
+    path = temp_file.parent / f"compressed{extension}"
+    getattr(input, write_method)(path, **write_kwargs)
+    if "squeeze" in read_kwargs:
+        kwargs = read_kwargs.copy()
+        del kwargs["squeeze"]
+        output = read_method(path, compression=compression_only, **kwargs).squeeze(
+            "columns"
+        )
+    else:
+        output = read_method(path, compression=compression_only, **read_kwargs)
+    tm.assert_series_equal(output, input, check_names=False)
+
+
+def test_compression_warning(compression_only, temp_file):
+    # Assert that passing a file object to to_csv while explicitly specifying a
+    # compression protocol triggers a RuntimeWarning, as per GH21227.
+    df = pd.DataFrame(
+        100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+        columns=["X", "Y", "Z"],
+    )
+    path = temp_file
+    with icom.get_handle(path, "w", compression=compression_only) as handles:
+        with tm.assert_produces_warning(RuntimeWarning, match="has no effect"):
+            df.to_csv(handles.handle, compression=compression_only)
+
+
+def test_compression_binary(compression_only, temp_file):
+    """
+    Binary file handles support compression.
+
+    GH22555
+    """
+    df = pd.DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
+    )
+
+    # with a file
+    path = temp_file
+    with open(path, mode="wb") as file:
+        df.to_csv(file, mode="wb", compression=compression_only)
+        file.seek(0)  # file shouldn't be closed
+    tm.assert_frame_equal(
+        df, pd.read_csv(path, index_col=0, compression=compression_only)
+    )
+
+    # with BytesIO
+    file = io.BytesIO()
+    df.to_csv(file, mode="wb", compression=compression_only)
+    file.seek(0)  # file shouldn't be closed
+    tm.assert_frame_equal(
+        df, pd.read_csv(file, index_col=0, compression=compression_only)
+    )
+
+
+def test_gzip_reproducibility_file_name(temp_file):
+    """
+    Gzip should create reproducible archives with mtime.
+
+    Note: Archives created with different filenames will still be different!
+
+    GH 28103
+    """
+    df = pd.DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
+    )
+    compression_options = {"method": "gzip", "mtime": 1}
+
+    # test for filename
+    path = temp_file
+    df.to_csv(path, compression=compression_options)
+    output = path.read_bytes()
+    df.to_csv(path, compression=compression_options)
+    assert output == path.read_bytes()
+
+
+def test_gzip_reproducibility_file_object():
+    """
+    Gzip should create reproducible archives with mtime.
+
+    GH 28103
+    """
+    df = pd.DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=pd.Index(list("ABCD")),
+        index=pd.Index([f"i-{i}" for i in range(30)]),
+    )
+    compression_options = {"method": "gzip", "mtime": 1}
+
+    # test for file object
+    buffer = io.BytesIO()
+    df.to_csv(buffer, compression=compression_options, mode="wb")
+    output = buffer.getvalue()
+    buffer = io.BytesIO()
+    df.to_csv(buffer, compression=compression_options, mode="wb")
+    assert output == buffer.getvalue()
+
+
+@pytest.mark.single_cpu
+def test_with_missing_lzma():
+    """Tests if import pandas works when lzma is not present."""
+    # https://github.com/pandas-dev/pandas/issues/27575
+    code = textwrap.dedent(
+        """\
+        import sys
+        sys.modules['lzma'] = None
+        import pandas
+        """
+    )
+    subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
+
+
+@pytest.mark.single_cpu
+def test_with_missing_lzma_runtime():
+    """Tests if ModuleNotFoundError is hit when calling lzma without
+    having the module available.
+    """
+    code = textwrap.dedent(
+        """
+        import sys
+        import pytest
+        sys.modules['lzma'] = None
+        import pandas as pd
+        df = pd.DataFrame()
+        with pytest.raises(ModuleNotFoundError, match='import of lzma'):
+            df.to_csv('foo.csv', compression='xz')
+        """
+    )
+    subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_gzip_compression_level(obj, method, temp_file):
+    # GH33196
+    path = temp_file
+    getattr(obj, method)(path, compression="gzip")
+    compressed_size_default = os.path.getsize(path)
+    getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
+    compressed_size_fast = os.path.getsize(path)
+    assert compressed_size_default < compressed_size_fast
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_xz_compression_level_read(obj, method, temp_file):
+    path = temp_file
+    getattr(obj, method)(path, compression="xz")
+    compressed_size_default = os.path.getsize(path)
+    getattr(obj, method)(path, compression={"method": "xz", "preset": 1})
+    compressed_size_fast = os.path.getsize(path)
+    assert compressed_size_default < compressed_size_fast
+    if method == "to_csv":
+        pd.read_csv(path, compression="xz")
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        pd.DataFrame(
+            100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+            columns=["X", "Y", "Z"],
+        ),
+        pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
+    ],
+)
+@pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
+def test_bzip_compression_level(obj, method, temp_file):
+    """GH33196 bzip needs file size > 100k to show a size difference between
+    compression levels, so here we just check if the call works when
+    compression is passed as a dict.
+    """
+    path = temp_file
+    getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})
+
+
+@pytest.mark.parametrize(
+    "suffix,archive",
+    [
+        (".zip", zipfile.ZipFile),
+        (".tar", tarfile.TarFile),
+    ],
+)
+def test_empty_archive_zip(suffix, archive, temp_file):
+    path = temp_file.parent / f"archive{suffix}"
+    with archive(path, "w"):
+        pass
+    with pytest.raises(ValueError, match="Zero files found"):
+        pd.read_csv(path)
+
+
+def test_ambiguous_archive_zip(temp_file):
+    path = temp_file.parent / "archive.zip"
+    with zipfile.ZipFile(path, "w") as file:
+        file.writestr("a.csv", "foo,bar")
+        file.writestr("b.csv", "foo,bar")
+    with pytest.raises(ValueError, match="Multiple files found in ZIP file"):
+        pd.read_csv(path)
+
+
+def test_ambiguous_archive_tar(tmp_path):
+    csvAPath = tmp_path / "a.csv"
+    with open(csvAPath, "w", encoding="utf-8") as a:
+        a.write("foo,bar\n")
+    csvBPath = tmp_path / "b.csv"
+    with open(csvBPath, "w", encoding="utf-8") as b:
+        b.write("foo,bar\n")
+
+    tarpath = tmp_path / "archive.tar"
+    with tarfile.TarFile(tarpath, "w") as tar:
+        tar.add(csvAPath, "a.csv")
+        tar.add(csvBPath, "b.csv")
+
+    with pytest.raises(ValueError, match="Multiple files found in TAR archive"):
+        pd.read_csv(tarpath)
+
+
+def test_tar_gz_to_different_filename(temp_file):
+    file = temp_file.parent / "archive.foo"
+    pd.DataFrame(
+        [["1", "2"]],
+        columns=["foo", "bar"],
+    ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False)
+    with gzip.open(file) as uncompressed:
+        with tarfile.TarFile(fileobj=uncompressed) as archive:
+            members = archive.getmembers()
+            assert len(members) == 1
+            content = archive.extractfile(members[0]).read().decode("utf8")
+
+            if is_platform_windows():
+                expected = "foo,bar\r\n1,2\r\n"
+            else:
+                expected = "foo,bar\n1,2\n"
+
+            assert content == expected
+
+
+def test_tar_no_error_on_close():
+    with io.BytesIO() as buffer:
+        with icom._BytesTarFile(fileobj=buffer, mode="w"):
+            pass
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
new file mode 100644
index 0000000000000000000000000000000000000000..6351a9760b773e2ffdd1547061b7f1918ce325e4
--- /dev/null
+++ b/pandas/tests/io/test_feather.py
@@ -0,0 +1,291 @@
+"""test feather-format compat"""
+
+from datetime import datetime
+import zoneinfo
+
+import numpy as np
+import pytest
+
+from pandas.compat.pyarrow import (
+    pa_version_under18p0,
+    pa_version_under19p0,
+)
+
+import pandas as pd
+import pandas._testing as tm
+
+from pandas.io.feather_format import read_feather, to_feather  # isort:skip
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+pa = pytest.importorskip("pyarrow")
+
+
+@pytest.mark.single_cpu
+class TestFeather:
+    def check_error_on_write(self, df, exc, err_msg, temp_file):
+        # check that we are raising the exception
+        # on writing
+
+        with pytest.raises(exc, match=err_msg):
+            to_feather(df, temp_file)
+
+    def check_external_error_on_write(self, df, temp_file):
+        # check that we are raising the exception
+        # on writing
+
+        with tm.external_error_raised(Exception):
+            to_feather(df, temp_file)
+
+    def check_round_trip(
+        self, df, temp_file, expected=None, write_kwargs=None, **read_kwargs
+    ):
+        if write_kwargs is None:
+            write_kwargs = {}
+        if expected is None:
+            expected = df.copy()
+
+        to_feather(df, temp_file, **write_kwargs)
+
+        result = read_feather(temp_file, **read_kwargs)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_error(self, temp_file):
+        msg = "feather only support IO with DataFrames"
+        for obj in [
+            pd.Series([1, 2, 3]),
+            1,
+            "foo",
+            pd.Timestamp("20130101"),
+            np.array([1, 2, 3]),
+        ]:
+            self.check_error_on_write(obj, ValueError, msg, temp_file)
+
+    def test_basic(self, temp_file):
+        tz = zoneinfo.ZoneInfo("US/Eastern")
+        df = pd.DataFrame(
+            {
+                "string": list("abc"),
+                "int": list(range(1, 4)),
+                "uint": np.arange(3, 6).astype("u1"),
+                "float": np.arange(4.0, 7.0, dtype="float64"),
+                "float_with_null": [1.0, np.nan, 3],
+                "bool": [True, False, True],
+                "bool_with_null": [True, np.nan, False],
+                "cat": pd.Categorical(list("abc")),
+                "dt": pd.DatetimeIndex(
+                    list(pd.date_range("20130101", periods=3)), freq=None
+                ),
+                "dttz": pd.DatetimeIndex(
+                    list(pd.date_range("20130101", periods=3, tz=tz)),
+                    freq=None,
+                ),
+                "dt_with_null": [
+                    pd.Timestamp("20130101"),
+                    pd.NaT,
+                    pd.Timestamp("20130103"),
+                ],
+                "dtns": pd.DatetimeIndex(
+                    list(pd.date_range("20130101", periods=3, freq="ns")), freq=None
+                ),
+            }
+        )
+        df["periods"] = pd.period_range("2013", freq="M", periods=3)
+        df["timedeltas"] = pd.timedelta_range("1 day", periods=3)
+        df["intervals"] = pd.interval_range(0, 3, 3)
+
+        assert df.dttz.dtype.tz.key == "US/Eastern"
+
+        expected = df.copy()
+        expected.loc[1, "bool_with_null"] = None
+        self.check_round_trip(df, temp_file, expected=expected)
+
+    def test_duplicate_columns(self, temp_file):
+        # https://github.com/wesm/feather/issues/53
+        # not currently able to handle duplicate columns
+        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
+        self.check_external_error_on_write(df, temp_file)
+
+    def test_read_columns(self, temp_file):
+        # GH 24025
+        df = pd.DataFrame(
+            {
+                "col1": list("abc"),
+                "col2": list(range(1, 4)),
+                "col3": list("xyz"),
+                "col4": list(range(4, 7)),
+            }
+        )
+        columns = ["col1", "col3"]
+        self.check_round_trip(df, temp_file, expected=df[columns], columns=columns)
+
+    def test_read_columns_different_order(self, temp_file):
+        # GH 33878
+        df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"], "C": [True, False]})
+        expected = df[["B", "A"]]
+        self.check_round_trip(df, temp_file, expected, columns=["B", "A"])
+
+    def test_unsupported_other(self, temp_file):
+        # mixed python objects
+        df = pd.DataFrame({"a": ["a", 1, 2.0]})
+        self.check_external_error_on_write(df, temp_file)
+
+    def test_rw_use_threads(self, temp_file):
+        df = pd.DataFrame({"A": np.arange(100000)})
+        self.check_round_trip(df, temp_file, use_threads=True)
+        self.check_round_trip(df, temp_file, use_threads=False)
+
+    def test_path_pathlib(self, temp_file):
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        ).reset_index()
+        result = tm.round_trip_pathlib(df.to_feather, read_feather, temp_file)
+        tm.assert_frame_equal(df, result)
+
+    def test_passthrough_keywords(self, temp_file):
+        df = pd.DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        ).reset_index()
+        self.check_round_trip(df, temp_file, write_kwargs={"version": 1})
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_http_path(self, feather_file, httpserver):
+        # GH 29055
+        expected = read_feather(feather_file)
+        with open(feather_file, "rb") as f:
+            httpserver.serve_content(content=f.read())
+            res = read_feather(httpserver.url)
+        tm.assert_frame_equal(expected, res)
+
+    def test_read_feather_dtype_backend(
+        self, string_storage, dtype_backend, using_infer_string, temp_file
+    ):
+        # GH#50765
+        df = pd.DataFrame(
+            {
+                "a": pd.Series([1, pd.NA, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="Int64"),
+                "c": pd.Series([1.5, pd.NA, 2.5], dtype="Float64"),
+                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": [True, False, None],
+                "f": [True, False, True],
+                "g": ["a", "b", "c"],
+                "h": ["a", "b", None],
+            }
+        )
+
+        to_feather(df, temp_file)
+        with pd.option_context("mode.string_storage", string_storage):
+            result = read_feather(temp_file, dtype_backend=dtype_backend)
+
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            if using_infer_string:
+                string_dtype = pd.ArrowDtype(pa.large_string())
+            else:
+                string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
+        expected = pd.DataFrame(
+            {
+                "a": pd.Series([1, pd.NA, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="Int64"),
+                "c": pd.Series([1.5, pd.NA, 2.5], dtype="Float64"),
+                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": pd.Series([True, False, pd.NA], dtype="boolean"),
+                "f": pd.Series([True, False, True], dtype="boolean"),
+                "g": pd.Series(["a", "b", "c"], dtype=string_dtype),
+                "h": pd.Series(["a", "b", None], dtype=string_dtype),
+            }
+        )
+
+        if dtype_backend == "pyarrow":
+            from pandas.arrays import ArrowExtensionArray
+
+            expected = pd.DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                    for col in expected.columns
+                }
+            )
+
+        if using_infer_string:
+            expected.columns = expected.columns.astype(
+                pd.StringDtype(string_storage, na_value=np.nan)
+            )
+        tm.assert_frame_equal(result, expected)
+
+    def test_int_columns_and_index(self, temp_file):
+        df = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index([3, 4, 5], name="test"))
+        self.check_round_trip(df, temp_file)
+
+    def test_invalid_dtype_backend(self, temp_file):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        df = pd.DataFrame({"int": list(range(1, 4))})
+        df.to_feather(temp_file)
+        with pytest.raises(ValueError, match=msg):
+            read_feather(temp_file, dtype_backend="numpy")
+
+    def test_string_inference(self, temp_file, using_infer_string):
+        # GH#54431
+        df = pd.DataFrame(data={"a": ["x", "y"]})
+        df.to_feather(temp_file)
+        with pd.option_context("future.infer_string", True):
+            result = read_feather(temp_file)
+        dtype = pd.StringDtype(na_value=np.nan)
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan)
+        )
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=dtype,
+            columns=pd.Index(
+                ["a"],
+                dtype=object
+                if pa_version_under19p0 and not using_infer_string
+                else dtype,
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
+    def test_string_inference_string_view_type(self, temp_file):
+        # GH#54798
+        import pyarrow as pa
+        from pyarrow import feather
+
+        table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
+        feather.write_feather(table, temp_file)
+
+        with pd.option_context("future.infer_string", True):
+            result = read_feather(temp_file)
+
+            expected = pd.DataFrame(
+                data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan)
+            )
+        tm.assert_frame_equal(result, expected)
+
+    def test_out_of_bounds_datetime_to_feather(self, temp_file):
+        # GH#47832
+        df = pd.DataFrame(
+            {
+                "date": [
+                    datetime.fromisoformat("1654-01-01"),
+                    datetime.fromisoformat("1920-01-01"),
+                ],
+            }
+        )
+        self.check_round_trip(df, temp_file)
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d76a622d29148caa47deb98c3e7687226ea5b1f
--- /dev/null
+++ b/pandas/tests/io/test_fsspec.py
@@ -0,0 +1,348 @@
+import io
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+from pandas.compat import HAS_PYARROW
+from pandas.compat.pyarrow import pa_version_under14p0
+
+from pandas import (
+    DataFrame,
+    date_range,
+    read_csv,
+    read_excel,
+    read_feather,
+    read_json,
+    read_parquet,
+    read_pickle,
+    read_stata,
+    read_table,
+)
+import pandas._testing as tm
+from pandas.util import _test_decorators as td
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+@pytest.fixture
+def fsspectest():
+    pytest.importorskip("fsspec")
+    from fsspec import register_implementation
+    from fsspec.implementations.memory import MemoryFileSystem
+    from fsspec.registry import _registry as registry
+
+    class TestMemoryFS(MemoryFileSystem):
+        protocol = "testmem"
+        test = [None]
+
+        def __init__(self, **kwargs) -> None:
+            self.test[0] = kwargs.pop("test", None)
+            super().__init__(**kwargs)
+
+    register_implementation("testmem", TestMemoryFS, clobber=True)
+    yield TestMemoryFS()
+    registry.pop("testmem", None)
+    TestMemoryFS.test[0] = None
+    TestMemoryFS.store.clear()
+
+
+@pytest.fixture
+def df1():
+    return DataFrame(
+        {
+            "int": [1, 3],
+            "float": [2.0, np.nan],
+            "str": ["t", "s"],
+            "dt": date_range("2018-06-18", periods=2),
+        }
+    )
+
+
+@pytest.fixture
+def cleared_fs():
+    fsspec = pytest.importorskip("fsspec")
+
+    memfs = fsspec.filesystem("memory")
+    yield memfs
+    memfs.store.clear()
+
+
+def test_read_csv(cleared_fs, df1):
+    text = str(df1.to_csv(index=False)).encode()
+    with cleared_fs.open("test/test.csv", "wb") as w:
+        w.write(text)
+    df2 = read_csv("memory://test/test.csv", parse_dates=["dt"])
+
+    expected = df1.copy()
+    expected["dt"] = expected["dt"].astype("M8[us]")
+    tm.assert_frame_equal(df2, expected)
+
+
+def test_reasonable_error(monkeypatch, cleared_fs):
+    from fsspec.registry import known_implementations
+
+    with pytest.raises(ValueError, match="nosuchprotocol"):
+        read_csv("nosuchprotocol://test/test.csv")
+    err_msg = "test error message"
+    monkeypatch.setitem(
+        known_implementations,
+        "couldexist",
+        {"class": "unimportable.CouldExist", "err": err_msg},
+    )
+    with pytest.raises(ImportError, match=err_msg):
+        read_csv("couldexist://test/test.csv")
+
+
+def test_to_csv(cleared_fs, df1):
+    df1.to_csv("memory://test/test.csv", index=True)
+
+    df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0)
+
+    expected = df1.copy()
+    expected["dt"] = expected["dt"].astype("M8[us]")
+    tm.assert_frame_equal(df2, expected)
+
+
+def test_to_excel(cleared_fs, df1):
+    pytest.importorskip("openpyxl")
+    ext = "xlsx"
+    path = f"memory://test/test.{ext}"
+    df1.to_excel(path, index=True)
+
+    df2 = read_excel(path, parse_dates=["dt"], index_col=0)
+
+    expected = df1.copy()
+    expected["dt"] = expected["dt"].astype("M8[us]")
+    tm.assert_frame_equal(df2, expected)
+
+
+@pytest.mark.parametrize("binary_mode", [False, True])
+def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1):
+    fsspec = pytest.importorskip("fsspec")
+
+    path = "memory://test/test.csv"
+    mode = "wb" if binary_mode else "w"
+    with fsspec.open(path, mode=mode).open() as fsspec_object:
+        df1.to_csv(fsspec_object, index=True)
+        assert not fsspec_object.closed
+
+    mode = mode.replace("w", "r")
+    with fsspec.open(path, mode=mode) as fsspec_object:
+        df2 = read_csv(
+            fsspec_object,
+            parse_dates=["dt"],
+            index_col=0,
+        )
+        assert not fsspec_object.closed
+
+    expected = df1.copy()
+    expected["dt"] = expected["dt"].astype("M8[us]")
+    tm.assert_frame_equal(df2, expected)
+
+
+def test_csv_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_csv(
+        "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False
+    )
+    assert fsspectest.test[0] == "csv_write"
+    read_csv("testmem://test/test.csv", storage_options={"test": "csv_read"})
+    assert fsspectest.test[0] == "csv_read"
+
+
+def test_read_table_options(fsspectest):
+    # GH #39167
+    df = DataFrame({"a": [0]})
+    df.to_csv(
+        "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False
+    )
+    assert fsspectest.test[0] == "csv_write"
+    read_table("testmem://test/test.csv", storage_options={"test": "csv_read"})
+    assert fsspectest.test[0] == "csv_read"
+
+
+def test_excel_options(fsspectest):
+    pytest.importorskip("openpyxl")
+    extension = "xlsx"
+
+    df = DataFrame({"a": [0]})
+
+    path = f"testmem://test/test.{extension}"
+
+    df.to_excel(path, storage_options={"test": "write"}, index=False)
+    assert fsspectest.test[0] == "write"
+    read_excel(path, storage_options={"test": "read"})
+    assert fsspectest.test[0] == "read"
+
+
+@pytest.mark.xfail(
+    using_string_dtype() and HAS_PYARROW and not pa_version_under14p0,
+    reason="TODO(infer_string) fastparquet",
+)
+def test_to_parquet_new_file(cleared_fs, df1):
+    """Regression test for writing to a not-yet-existent GCS Parquet file."""
+    pytest.importorskip("fastparquet")
+
+    df1.to_parquet(
+        "memory://test/test.csv", index=True, engine="fastparquet", compression=None
+    )
+
+
+def test_arrowparquet_options(fsspectest):
+    """Regression test for writing to a not-yet-existent GCS Parquet file."""
+    pytest.importorskip("pyarrow")
+    df = DataFrame({"a": [0]})
+    df.to_parquet(
+        "testmem://test/test.csv",
+        engine="pyarrow",
+        compression=None,
+        storage_options={"test": "parquet_write"},
+    )
+    assert fsspectest.test[0] == "parquet_write"
+    read_parquet(
+        "testmem://test/test.csv",
+        engine="pyarrow",
+        storage_options={"test": "parquet_read"},
+    )
+    assert fsspectest.test[0] == "parquet_read"
+
+
+def test_fastparquet_options(fsspectest):
+    """Regression test for writing to a not-yet-existent GCS Parquet file."""
+    pytest.importorskip("fastparquet")
+
+    df = DataFrame({"a": [0]})
+    df.to_parquet(
+        "testmem://test/test.csv",
+        engine="fastparquet",
+        compression=None,
+        storage_options={"test": "parquet_write"},
+    )
+    assert fsspectest.test[0] == "parquet_write"
+    read_parquet(
+        "testmem://test/test.csv",
+        engine="fastparquet",
+        storage_options={"test": "parquet_read"},
+    )
+    assert fsspectest.test[0] == "parquet_read"
+
+
+@pytest.mark.single_cpu
+@pytest.mark.parametrize("compression_suffix", ["", ".gz", ".bz2"])
+def test_from_s3_csv(s3_bucket_public_with_data, s3so, tips_file, compression_suffix):
+    pytest.importorskip("s3fs")
+    df_from_s3 = read_csv(
+        f"s3://{s3_bucket_public_with_data.name}/tips.csv{compression_suffix}",
+        storage_options=s3so,
+    )
+    df_from_local = read_csv(tips_file)
+    tm.assert_equal(df_from_s3, df_from_local)
+
+
+@pytest.mark.single_cpu
+@pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"])
+def test_s3_protocols(s3_bucket_public_with_data, s3so, tips_file, protocol):
+    pytest.importorskip("s3fs")
+    df_from_s3 = read_csv(
+        f"{protocol}://{s3_bucket_public_with_data.name}/tips.csv",
+        storage_options=s3so,
+    )
+    df_from_local = read_csv(tips_file)
+    tm.assert_equal(df_from_s3, df_from_local)
+
+
+@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
+@pytest.mark.single_cpu
+def test_s3_parquet(s3_bucket_public, s3so, df1):
+    pytest.importorskip("fastparquet")
+    pytest.importorskip("s3fs")
+
+    fn = f"s3://{s3_bucket_public.name}/test.parquet"
+    df1.to_parquet(
+        fn, index=False, engine="fastparquet", compression=None, storage_options=s3so
+    )
+    df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so)
+    tm.assert_equal(df1, df2)
+
+
+@td.skip_if_installed("fsspec")
+def test_not_present_exception():
+    msg = "`Import fsspec` failed.  Use pip or conda to install the fsspec package."
+    with pytest.raises(ImportError, match=msg):
+        read_csv("memory://test/test.csv")
+
+
+def test_feather_options(fsspectest):
+    pytest.importorskip("pyarrow")
+    df = DataFrame({"a": [0]})
+    df.to_feather("testmem://mockfile", storage_options={"test": "feather_write"})
+    assert fsspectest.test[0] == "feather_write"
+    out = read_feather("testmem://mockfile", storage_options={"test": "feather_read"})
+    assert fsspectest.test[0] == "feather_read"
+    tm.assert_frame_equal(df, out)
+
+
+def test_pickle_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_pickle("testmem://mockfile", storage_options={"test": "pickle_write"})
+    assert fsspectest.test[0] == "pickle_write"
+    out = read_pickle("testmem://mockfile", storage_options={"test": "pickle_read"})
+    assert fsspectest.test[0] == "pickle_read"
+    tm.assert_frame_equal(df, out)
+
+
+def test_json_options(fsspectest, compression):
+    df = DataFrame({"a": [0]})
+    df.to_json(
+        "testmem://mockfile",
+        compression=compression,
+        storage_options={"test": "json_write"},
+    )
+    assert fsspectest.test[0] == "json_write"
+    out = read_json(
+        "testmem://mockfile",
+        compression=compression,
+        storage_options={"test": "json_read"},
+    )
+    assert fsspectest.test[0] == "json_read"
+    tm.assert_frame_equal(df, out)
+
+
+def test_stata_options(fsspectest):
+    df = DataFrame({"a": [0]})
+    df.to_stata(
+        "testmem://mockfile", storage_options={"test": "stata_write"}, write_index=False
+    )
+    assert fsspectest.test[0] == "stata_write"
+    out = read_stata("testmem://mockfile", storage_options={"test": "stata_read"})
+    assert fsspectest.test[0] == "stata_read"
+    tm.assert_frame_equal(df, out.astype("int64"))
+
+
+def test_markdown_options(fsspectest):
+    pytest.importorskip("tabulate")
+    df = DataFrame({"a": [0]})
+    df.to_markdown("testmem://mockfile", storage_options={"test": "md_write"})
+    assert fsspectest.test[0] == "md_write"
+    assert fsspectest.cat("testmem://mockfile")
+
+
+def test_non_fsspec_options():
+    pytest.importorskip("pyarrow")
+    with pytest.raises(ValueError, match="storage_options"):
+        read_csv("localfile", storage_options={"a": True})
+    with pytest.raises(ValueError, match="storage_options"):
+        # separate test for parquet, which has a different code path
+        read_parquet("localfile", storage_options={"a": True})
+    by = io.BytesIO()
+
+    with pytest.raises(ValueError, match="storage_options"):
+        read_csv(by, storage_options={"a": True})
+
+    df = DataFrame({"a": [0]})
+    with pytest.raises(ValueError, match="storage_options"):
+        df.to_parquet("nonfsspecpath", storage_options={"a": True})
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..022fd89c1f555551a47d6361f0368dcbb3d28ab8
--- /dev/null
+++ b/pandas/tests/io/test_gcs.py
@@ -0,0 +1,233 @@
+from io import BytesIO
+import os
+import pathlib
+import tarfile
+import zipfile
+
+import numpy as np
+import pytest
+
+from pandas.compat.pyarrow import pa_version_under17p0
+
+from pandas import (
+    DataFrame,
+    Index,
+    date_range,
+    read_csv,
+    read_excel,
+    read_json,
+    read_parquet,
+)
+import pandas._testing as tm
+from pandas.util import _test_decorators as td
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+@pytest.fixture
+def gcs_buffer():
+    """Emulate GCS using a binary buffer."""
+    pytest.importorskip("gcsfs")
+    fsspec = pytest.importorskip("fsspec")
+
+    gcs_buffer = BytesIO()
+    gcs_buffer.close = lambda: True
+
+    class MockGCSFileSystem(fsspec.AbstractFileSystem):
+        @staticmethod
+        def open(*args, **kwargs):
+            gcs_buffer.seek(0)
+            return gcs_buffer
+
+        def ls(self, path, **kwargs):
+            # needed for pyarrow
+            return [{"name": path, "type": "file"}]
+
+    # Overwrites the default implementation from gcsfs to our mock class
+    fsspec.register_implementation("gs", MockGCSFileSystem, clobber=True)
+
+    return gcs_buffer
+
+
+# Patches pyarrow; other processes should not pick up change
+@pytest.mark.single_cpu
+@pytest.mark.parametrize("format", ["csv", "json", "parquet", "excel", "markdown"])
+def test_to_read_gcs(gcs_buffer, format, monkeypatch, capsys, request):
+    """
+    Test that many to/read functions support GCS.
+
+    GH 33987
+    """
+
+    df1 = DataFrame(
+        {
+            "int": [1, 3],
+            "float": [2.0, np.nan],
+            "str": ["t", "s"],
+            "dt": date_range("2018-06-18", periods=2, unit="ns"),
+        }
+    )
+
+    path = f"gs://test/test.{format}"
+
+    if format == "csv":
+        df1.to_csv(path, index=True)
+        df2 = read_csv(path, parse_dates=["dt"], index_col=0)
+    elif format == "excel":
+        path = "gs://test/test.xlsx"
+        df1.to_excel(path)
+        df2 = read_excel(path, parse_dates=["dt"], index_col=0)
+    elif format == "json":
+        df1.to_json(path, date_format="iso")
+        df2 = read_json(path, convert_dates=["dt"])
+    elif format == "parquet":
+        pytest.importorskip("pyarrow")
+        pa_fs = pytest.importorskip("pyarrow.fs")
+
+        class MockFileSystem(pa_fs.FileSystem):
+            @staticmethod
+            def from_uri(path):
+                print("Using pyarrow filesystem")
+                to_local = pathlib.Path(path.replace("gs://", "")).absolute().as_uri()
+                return pa_fs.LocalFileSystem(to_local)
+
+        request.applymarker(
+            pytest.mark.xfail(
+                not pa_version_under17p0,
+                raises=TypeError,
+                reason="pyarrow 17 broke the mocked filesystem",
+            )
+        )
+        with monkeypatch.context() as m:
+            m.setattr(pa_fs, "FileSystem", MockFileSystem)
+            df1.to_parquet(path)
+            df2 = read_parquet(path)
+        captured = capsys.readouterr()
+        assert captured.out == "Using pyarrow filesystem\nUsing pyarrow filesystem\n"
+    elif format == "markdown":
+        pytest.importorskip("tabulate")
+        df1.to_markdown(path)
+        df2 = df1
+
+    expected = df1[:]
+    if format in ["csv", "excel", "json"]:
+        expected["dt"] = expected["dt"].dt.as_unit("us")
+
+    tm.assert_frame_equal(df2, expected)
+
+
+def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str):
+    """
+    For zip compression, only compare the CRC-32 checksum of the file contents
+    to avoid checking the time-dependent last-modified timestamp which
+    in some CI builds is off-by-one
+
+    See https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers
+    """
+    if compression == "zip":
+        # Only compare the CRC checksum of the file contents
+        with (
+            zipfile.ZipFile(BytesIO(result)) as exp,
+            zipfile.ZipFile(BytesIO(expected)) as res,
+        ):
+            for res_info, exp_info in zip(res.infolist(), exp.infolist()):
+                assert res_info.CRC == exp_info.CRC
+    elif compression == "tar":
+        with (
+            tarfile.open(fileobj=BytesIO(result)) as tar_exp,
+            tarfile.open(fileobj=BytesIO(expected)) as tar_res,
+        ):
+            for tar_res_info, tar_exp_info in zip(
+                tar_res.getmembers(), tar_exp.getmembers()
+            ):
+                actual_file = tar_res.extractfile(tar_res_info)
+                expected_file = tar_exp.extractfile(tar_exp_info)
+                assert (actual_file is None) == (expected_file is None)
+                if actual_file is not None and expected_file is not None:
+                    assert actual_file.read() == expected_file.read()
+    else:
+        assert result == expected
+
+
+@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
+def test_to_csv_compression_encoding_gcs(
+    gcs_buffer, compression_only, encoding, compression_to_extension
+):
+    """
+    Compression and encoding should with GCS.
+
+    GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and
+    GH 32392 (read_csv, encoding)
+    """
+    df = DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=Index(list("ABCD")),
+        index=Index([f"i-{i}" for i in range(30)]),
+    )
+
+    # reference of compressed and encoded file
+    compression = {"method": compression_only}
+    if compression_only == "gzip":
+        compression["mtime"] = 1  # be reproducible
+    buffer = BytesIO()
+    df.to_csv(buffer, compression=compression, encoding=encoding, mode="wb")
+
+    # write compressed file with explicit compression
+    path_gcs = "gs://test/test.csv"
+    df.to_csv(path_gcs, compression=compression, encoding=encoding)
+    res = gcs_buffer.getvalue()
+    expected = buffer.getvalue()
+    assert_equal_zip_safe(res, expected, compression_only)
+
+    read_df = read_csv(
+        path_gcs, index_col=0, compression=compression_only, encoding=encoding
+    )
+    tm.assert_frame_equal(df, read_df)
+
+    # write compressed file with implicit compression
+    file_ext = compression_to_extension[compression_only]
+    compression["method"] = "infer"
+    path_gcs += f".{file_ext}"
+    df.to_csv(path_gcs, compression=compression, encoding=encoding)
+
+    res = gcs_buffer.getvalue()
+    expected = buffer.getvalue()
+    assert_equal_zip_safe(res, expected, compression_only)
+
+    read_df = read_csv(path_gcs, index_col=0, compression="infer", encoding=encoding)
+    tm.assert_frame_equal(df, read_df)
+
+
+def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
+    """Regression test for writing to a not-yet-existent GCS Parquet file."""
+    pytest.importorskip("fastparquet")
+    pytest.importorskip("gcsfs")
+
+    from fsspec import AbstractFileSystem
+
+    df1 = DataFrame(
+        {
+            "int": [1, 3],
+            "float": [2.0, np.nan],
+            "dt": date_range("2018-06-18", periods=2),
+        }
+    )
+
+    class MockGCSFileSystem(AbstractFileSystem):
+        def open(self, path, mode="r", *args):
+            if "w" not in mode:
+                raise FileNotFoundError
+            return open(os.path.join(tmpdir, "test.parquet"), mode, encoding="utf-8")
+
+    monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
+    df1.to_parquet(
+        "gs://test/test.csv", index=True, engine="fastparquet", compression=None
+    )
+
+
+@td.skip_if_installed("gcsfs")
+def test_gcs_not_present_exception():
+    with tm.external_error_raised(ImportError):
+        read_csv("gs://test/test.csv")
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
new file mode 100644
index 0000000000000000000000000000000000000000..abc0cfbb36332daeb3ea606450a1df75d455ed50
--- /dev/null
+++ b/pandas/tests/io/test_html.py
@@ -0,0 +1,1669 @@
+from collections.abc import Iterator
+from functools import partial
+from io import (
+    BytesIO,
+    StringIO,
+)
+import os
+from pathlib import Path
+import re
+import threading
+from urllib.error import URLError
+
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_windows
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    NA,
+    DataFrame,
+    MultiIndex,
+    Series,
+    Timestamp,
+    date_range,
+    read_csv,
+    read_html,
+    to_datetime,
+)
+import pandas._testing as tm
+
+from pandas.io.common import file_path_to_url
+
+
+@pytest.fixture(
+    params=[
+        "chinese_utf-16.html",
+        "chinese_utf-32.html",
+        "chinese_utf-8.html",
+        "letz_latin1.html",
+    ]
+)
+def html_encoding_file(request, datapath):
+    """Parametrized fixture for HTML encoding test filenames."""
+    return datapath("io", "data", "html_encoding", request.param)
+
+
+def assert_framelist_equal(list1, list2, *args, **kwargs):
+    assert len(list1) == len(list2), (
+        "lists are not of equal size "
+        f"len(list1) == {len(list1)}, "
+        f"len(list2) == {len(list2)}"
+    )
+    msg = "not all list elements are DataFrames"
+    both_frames = all(
+        map(
+            lambda x, y: isinstance(x, DataFrame) and isinstance(y, DataFrame),
+            list1,
+            list2,
+        )
+    )
+    assert both_frames, msg
+    for frame_i, frame_j in zip(list1, list2):
+        tm.assert_frame_equal(frame_i, frame_j, *args, **kwargs)
+        assert not frame_i.empty, "frames are both empty"
+
+
+def test_bs4_version_fails(monkeypatch, datapath):
+    bs4 = pytest.importorskip("bs4")
+    pytest.importorskip("html5lib")
+
+    monkeypatch.setattr(bs4, "__version__", "4.2")
+    with pytest.raises(ImportError, match="Pandas requires version"):
+        read_html(datapath("io", "data", "html", "spam.html"), flavor="bs4")
+
+
+def test_invalid_flavor():
+    url = "google.com"
+    flavor = "invalid flavor"
+    msg = r"\{" + flavor + r"\} is not a valid set of flavors"
+
+    with pytest.raises(ValueError, match=msg):
+        read_html(StringIO(url), match="google", flavor=flavor)
+
+
+def test_same_ordering(datapath):
+    pytest.importorskip("bs4")
+    pytest.importorskip("lxml")
+    pytest.importorskip("html5lib")
+
+    filename = datapath("io", "data", "html", "valid_markup.html")
+    dfs_lxml = read_html(filename, index_col=0, flavor=["lxml"])
+    dfs_bs4 = read_html(filename, index_col=0, flavor=["bs4"])
+    assert_framelist_equal(dfs_lxml, dfs_bs4)
+
+
+@pytest.fixture(
+    params=[
+        pytest.param("bs4", marks=[td.skip_if_no("bs4"), td.skip_if_no("html5lib")]),
+        pytest.param("lxml", marks=td.skip_if_no("lxml")),
+    ],
+)
+def flavor_read_html(request):
+    return partial(read_html, flavor=request.param)
+
+
+class TestReadHtml:
+    def test_literal_html_deprecation(self, flavor_read_html):
+        # GH 53785
+        msg = r"\[Errno 2\] No such file or director"
+
+        with pytest.raises(FileNotFoundError, match=msg):
+            flavor_read_html(
+                """<table>
+                <thead>
+                    <tr>
+                        <th>A</th>
+                        <th>B</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1</td>
+                        <td>2</td>
+                    </tr>
+                </tbody>
+                <tbody>
+                    <tr>
+                        <td>3</td>
+                        <td>4</td>
+                    </tr>
+                </tbody>
+            </table>"""
+            )
+
+    @pytest.fixture
+    def spam_data(self, datapath):
+        return datapath("io", "data", "html", "spam.html")
+
+    @pytest.fixture
+    def banklist_data(self, datapath):
+        return datapath("io", "data", "html", "banklist.html")
+
+    def test_to_html_compat(self, flavor_read_html):
+        df = (
+            DataFrame(
+                np.random.default_rng(2).random((4, 3)),
+                columns=pd.Index(list("abc")),
+            )
+            .map("{:.3f}".format)
+            .astype(float)
+        )
+        out = df.to_html()
+        res = flavor_read_html(
+            StringIO(out), attrs={"class": "dataframe"}, index_col=0
+        )[0]
+        tm.assert_frame_equal(res, df)
+
+    def test_dtype_backend(self, string_storage, dtype_backend, flavor_read_html):
+        # GH#50286
+        df = DataFrame(
+            {
+                "a": Series([1, NA, 3], dtype="Int64"),
+                "b": Series([1, 2, 3], dtype="Int64"),
+                "c": Series([1.5, NA, 2.5], dtype="Float64"),
+                "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": [True, False, None],
+                "f": [True, False, True],
+                "g": ["a", "b", "c"],
+                "h": ["a", "b", None],
+            }
+        )
+
+        out = df.to_html(index=False)
+        with pd.option_context("mode.string_storage", string_storage):
+            result = flavor_read_html(StringIO(out), dtype_backend=dtype_backend)[0]
+
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
+        expected = DataFrame(
+            {
+                "a": Series([1, NA, 3], dtype="Int64"),
+                "b": Series([1, 2, 3], dtype="Int64"),
+                "c": Series([1.5, NA, 2.5], dtype="Float64"),
+                "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": Series([True, False, NA], dtype="boolean"),
+                "f": Series([True, False, True], dtype="boolean"),
+                "g": Series(["a", "b", "c"], dtype=string_dtype),
+                "h": Series(["a", "b", None], dtype=string_dtype),
+            }
+        )
+
+        if dtype_backend == "pyarrow":
+            import pyarrow as pa
+
+            from pandas.arrays import ArrowExtensionArray
+
+            expected = DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                    for col in expected.columns
+                }
+            )
+
+        # the storage of the str columns' Index is also affected by the
+        # string_storage setting -> ignore that for checking the result
+        tm.assert_frame_equal(result, expected, check_column_type=False)
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_banklist_url(self, httpserver, banklist_data, flavor_read_html):
+        with open(banklist_data, encoding="utf-8") as f:
+            httpserver.serve_content(content=f.read())
+            df1 = flavor_read_html(
+                # lxml cannot find attrs leave out for now
+                httpserver.url,
+                match="First Federal Bank of Florida",  # attrs={"class": "dataTable"}
+            )
+            # lxml cannot find attrs leave out for now
+            df2 = flavor_read_html(
+                httpserver.url,
+                match="Metcalf Bank",
+            )  # attrs={"class": "dataTable"})
+
+        assert_framelist_equal(df1, df2)
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_spam_url(self, httpserver, spam_data, flavor_read_html):
+        with open(spam_data, encoding="utf-8") as f:
+            httpserver.serve_content(content=f.read())
+            df1 = flavor_read_html(httpserver.url, match=".*Water.*")
+            df2 = flavor_read_html(httpserver.url, match="Unit")
+
+        assert_framelist_equal(df1, df2)
+
+    @pytest.mark.slow
+    def test_banklist(self, banklist_data, flavor_read_html):
+        df1 = flavor_read_html(
+            banklist_data, match=".*Florida.*", attrs={"id": "table"}
+        )
+        df2 = flavor_read_html(
+            banklist_data, match="Metcalf Bank", attrs={"id": "table"}
+        )
+
+        assert_framelist_equal(df1, df2)
+
+    def test_spam(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*")
+        df2 = flavor_read_html(spam_data, match="Unit")
+        assert_framelist_equal(df1, df2)
+
+        assert df1[0].iloc[0, 0] == "Proximates"
+        assert df1[0].columns[0] == "Nutrient"
+
+    def test_spam_no_match(self, spam_data, flavor_read_html):
+        dfs = flavor_read_html(spam_data)
+        for df in dfs:
+            assert isinstance(df, DataFrame)
+
+    def test_banklist_no_match(self, banklist_data, flavor_read_html):
+        dfs = flavor_read_html(banklist_data, attrs={"id": "table"})
+        for df in dfs:
+            assert isinstance(df, DataFrame)
+
+    def test_spam_header(self, spam_data, flavor_read_html):
+        df = flavor_read_html(spam_data, match=".*Water.*", header=2)[0]
+        assert df.columns[0] == "Proximates"
+        assert not df.empty
+
+    def test_skiprows_int(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1)
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=1)
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_range(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=range(2))
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=range(2))
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_list(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=[1, 2])
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=[2, 1])
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_set(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows={1, 2})
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows={2, 1})
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_slice(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=1)
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=1)
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_slice_short(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2))
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(2))
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_slice_long(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=slice(2, 5))
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=slice(4, 1, -1))
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_ndarray(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", skiprows=np.arange(2))
+        df2 = flavor_read_html(spam_data, match="Unit", skiprows=np.arange(2))
+
+        assert_framelist_equal(df1, df2)
+
+    def test_skiprows_invalid(self, spam_data, flavor_read_html):
+        with pytest.raises(TypeError, match=("is not a valid type for skipping rows")):
+            flavor_read_html(spam_data, match=".*Water.*", skiprows="asdf")
+
+    def test_index(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0)
+        df2 = flavor_read_html(spam_data, match="Unit", index_col=0)
+        assert_framelist_equal(df1, df2)
+
+    def test_header_and_index_no_types(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0)
+        df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0)
+        assert_framelist_equal(df1, df2)
+
+    def test_header_and_index_with_types(self, spam_data, flavor_read_html):
+        df1 = flavor_read_html(spam_data, match=".*Water.*", header=1, index_col=0)
+        df2 = flavor_read_html(spam_data, match="Unit", header=1, index_col=0)
+        assert_framelist_equal(df1, df2)
+
+    def test_infer_types(self, spam_data, flavor_read_html):
+        # 10892 infer_types removed
+        df1 = flavor_read_html(spam_data, match=".*Water.*", index_col=0)
+        df2 = flavor_read_html(spam_data, match="Unit", index_col=0)
+        assert_framelist_equal(df1, df2)
+
+    def test_string_io(self, spam_data, flavor_read_html):
+        with open(spam_data, encoding="UTF-8") as f:
+            data1 = StringIO(f.read())
+
+        with open(spam_data, encoding="UTF-8") as f:
+            data2 = StringIO(f.read())
+
+        df1 = flavor_read_html(data1, match=".*Water.*")
+        df2 = flavor_read_html(data2, match="Unit")
+        assert_framelist_equal(df1, df2)
+
+    def test_string(self, spam_data, flavor_read_html):
+        with open(spam_data, encoding="UTF-8") as f:
+            data = f.read()
+
+        df1 = flavor_read_html(StringIO(data), match=".*Water.*")
+        df2 = flavor_read_html(StringIO(data), match="Unit")
+
+        assert_framelist_equal(df1, df2)
+
+    def test_file_like(self, spam_data, flavor_read_html):
+        with open(spam_data, encoding="UTF-8") as f:
+            df1 = flavor_read_html(f, match=".*Water.*")
+
+        with open(spam_data, encoding="UTF-8") as f:
+            df2 = flavor_read_html(f, match="Unit")
+
+        assert_framelist_equal(df1, df2)
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_bad_url_protocol(self, httpserver, flavor_read_html):
+        httpserver.serve_content("urlopen error unknown url type: git", code=404)
+        with pytest.raises(URLError, match="urlopen error unknown url type: git"):
+            flavor_read_html("git://github.com", match=".*Water.*")
+
+    @pytest.mark.slow
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_invalid_url(self, httpserver, flavor_read_html):
+        httpserver.serve_content("Name or service not known", code=404)
+        try:
+            with pytest.raises(
+                (URLError, ValueError), match="HTTP Error 404: NOT FOUND"
+            ) as err:
+                flavor_read_html(httpserver.url, match=".*Water.*")
+        finally:
+            if isinstance(err.value, URLError):
+                # Has a file-like handle that we can close
+                # https://docs.python.org/3/library/urllib.error.html#urllib.error.HTTPError
+                err.value.close()
+
+    @pytest.mark.slow
+    def test_file_url(self, banklist_data, flavor_read_html):
+        url = banklist_data
+        dfs = flavor_read_html(
+            file_path_to_url(os.path.abspath(url)), match="First", attrs={"id": "table"}
+        )
+        assert isinstance(dfs, list)
+        for df in dfs:
+            assert isinstance(df, DataFrame)
+
+    @pytest.mark.slow
+    def test_invalid_table_attrs(self, banklist_data, flavor_read_html):
+        url = banklist_data
+        with pytest.raises(ValueError, match="No tables found"):
+            flavor_read_html(
+                url, match="First Federal Bank of Florida", attrs={"id": "tasdfable"}
+            )
+
+    @pytest.mark.slow
+    def test_multiindex_header(self, banklist_data, flavor_read_html):
+        df = flavor_read_html(
+            banklist_data, match="Metcalf", attrs={"id": "table"}, header=[0, 1]
+        )[0]
+        assert isinstance(df.columns, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_index(self, banklist_data, flavor_read_html):
+        df = flavor_read_html(
+            banklist_data, match="Metcalf", attrs={"id": "table"}, index_col=[0, 1]
+        )[0]
+        assert isinstance(df.index, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_index(self, banklist_data, flavor_read_html):
+        df = flavor_read_html(
+            banklist_data,
+            match="Metcalf",
+            attrs={"id": "table"},
+            header=[0, 1],
+            index_col=[0, 1],
+        )[0]
+        assert isinstance(df.columns, MultiIndex)
+        assert isinstance(df.index, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_skiprows_tuples(self, banklist_data, flavor_read_html):
+        df = flavor_read_html(
+            banklist_data,
+            match="Metcalf",
+            attrs={"id": "table"},
+            header=[0, 1],
+            skiprows=1,
+        )[0]
+        assert isinstance(df.columns, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_skiprows(self, banklist_data, flavor_read_html):
+        df = flavor_read_html(
+            banklist_data,
+            match="Metcalf",
+            attrs={"id": "table"},
+            header=[0, 1],
+            skiprows=1,
+        )[0]
+        assert isinstance(df.columns, MultiIndex)
+
+    @pytest.mark.slow
+    def test_multiindex_header_index_skiprows(self, banklist_data, flavor_read_html):
+        df = flavor_read_html(
+            banklist_data,
+            match="Metcalf",
+            attrs={"id": "table"},
+            header=[0, 1],
+            index_col=[0, 1],
+            skiprows=1,
+        )[0]
+        assert isinstance(df.index, MultiIndex)
+        assert isinstance(df.columns, MultiIndex)
+
+    @pytest.mark.slow
+    def test_regex_idempotency(self, banklist_data, flavor_read_html):
+        url = banklist_data
+        dfs = flavor_read_html(
+            file_path_to_url(os.path.abspath(url)),
+            match=re.compile(re.compile("Florida")),
+            attrs={"id": "table"},
+        )
+        assert isinstance(dfs, list)
+        for df in dfs:
+            assert isinstance(df, DataFrame)
+
+    def test_negative_skiprows(self, spam_data, flavor_read_html):
+        msg = r"\(you passed a negative value\)"
+        with pytest.raises(ValueError, match=msg):
+            flavor_read_html(spam_data, match="Water", skiprows=-1)
+
+    @pytest.fixture
+    def python_docs(self):
+        return """
+          <table class="contentstable" align="center"><tr>
+            <td width="50%">
+            <p class="biglink"><a class="biglink" href="whatsnew/2.7.html">What's new in Python 2.7?</a><br/>
+                <span class="linkdescr">or <a href="whatsnew/index.html">all "What's new" documents</a> since 2.0</span></p>
+            <p class="biglink"><a class="biglink" href="tutorial/index.html">Tutorial</a><br/>
+                <span class="linkdescr">start here</span></p>
+            <p class="biglink"><a class="biglink" href="library/index.html">Library Reference</a><br/>
+                <span class="linkdescr">keep this under your pillow</span></p>
+            <p class="biglink"><a class="biglink" href="reference/index.html">Language Reference</a><br/>
+                <span class="linkdescr">describes syntax and language elements</span></p>
+            <p class="biglink"><a class="biglink" href="using/index.html">Python Setup and Usage</a><br/>
+                <span class="linkdescr">how to use Python on different platforms</span></p>
+            <p class="biglink"><a class="biglink" href="howto/index.html">Python HOWTOs</a><br/>
+                <span class="linkdescr">in-depth documents on specific topics</span></p>
+            </td><td width="50%">
+            <p class="biglink"><a class="biglink" href="installing/index.html">Installing Python Modules</a><br/>
+                <span class="linkdescr">installing from the Python Package Index &amp; other sources</span></p>
+            <p class="biglink"><a class="biglink" href="distributing/index.html">Distributing Python Modules</a><br/>
+                <span class="linkdescr">publishing modules for installation by others</span></p>
+            <p class="biglink"><a class="biglink" href="extending/index.html">Extending and Embedding</a><br/>
+                <span class="linkdescr">tutorial for C/C++ programmers</span></p>
+            <p class="biglink"><a class="biglink" href="c-api/index.html">Python/C API</a><br/>
+                <span class="linkdescr">reference for C/C++ programmers</span></p>
+            <p class="biglink"><a class="biglink" href="faq/index.html">FAQs</a><br/>
+                <span class="linkdescr">frequently asked questions (with answers!)</span></p>
+            </td></tr>
+        </table>
+
+        <p><strong>Indices and tables:</strong></p>
+        <table class="contentstable" align="center"><tr>
+            <td width="50%">
+            <p class="biglink"><a class="biglink" href="py-modindex.html">Python Global Module Index</a><br/>
+                <span class="linkdescr">quick access to all modules</span></p>
+            <p class="biglink"><a class="biglink" href="genindex.html">General Index</a><br/>
+                <span class="linkdescr">all functions, classes, terms</span></p>
+            <p class="biglink"><a class="biglink" href="glossary.html">Glossary</a><br/>
+                <span class="linkdescr">the most important terms explained</span></p>
+            </td><td width="50%">
+            <p class="biglink"><a class="biglink" href="search.html">Search page</a><br/>
+                <span class="linkdescr">search this documentation</span></p>
+            <p class="biglink"><a class="biglink" href="contents.html">Complete Table of Contents</a><br/>
+                <span class="linkdescr">lists all sections and subsections</span></p>
+            </td></tr>
+        </table>
+        """  # noqa: E501
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_multiple_matches(self, python_docs, httpserver, flavor_read_html):
+        httpserver.serve_content(content=python_docs)
+        dfs = flavor_read_html(httpserver.url, match="Python")
+        assert len(dfs) > 1
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_python_docs_table(self, python_docs, httpserver, flavor_read_html):
+        httpserver.serve_content(content=python_docs)
+        dfs = flavor_read_html(httpserver.url, match="Python")
+        zz = [df.iloc[0, 0][0:4] for df in dfs]
+        assert sorted(zz) == ["Pyth", "What"]
+
+    def test_empty_tables(self, flavor_read_html):
+        """
+        Make sure that read_html ignores empty tables.
+        """
+        html = """
+            <table>
+                <thead>
+                    <tr>
+                        <th>A</th>
+                        <th>B</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1</td>
+                        <td>2</td>
+                    </tr>
+                </tbody>
+            </table>
+            <table>
+                <tbody>
+                </tbody>
+            </table>
+        """
+        result = flavor_read_html(StringIO(html))
+        assert len(result) == 1
+
+    def test_multiple_tbody(self, flavor_read_html):
+        # GH-20690
+        # Read all tbody tags within a single table.
+        result = flavor_read_html(
+            StringIO(
+                """<table>
+            <thead>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                </tr>
+            </tbody>
+            <tbody>
+                <tr>
+                    <td>3</td>
+                    <td>4</td>
+                </tr>
+            </tbody>
+        </table>"""
+            )
+        )[0]
+
+        expected = DataFrame(data=[[1, 2], [3, 4]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_header_and_one_column(self, flavor_read_html):
+        """
+        Don't fail with bs4 when there is a header and only one column
+        as described in issue #9178
+        """
+        result = flavor_read_html(
+            StringIO(
+                """<table>
+                <thead>
+                    <tr>
+                        <th>Header</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>first</td>
+                    </tr>
+                </tbody>
+            </table>"""
+            )
+        )[0]
+
+        expected = DataFrame(data={"Header": "first"}, index=[0])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_thead_without_tr(self, flavor_read_html):
+        """
+        Ensure parser adds <tr> within <thead> on malformed HTML.
+        """
+        result = flavor_read_html(
+            StringIO(
+                """<table>
+            <thead>
+                <tr>
+                    <th>Country</th>
+                    <th>Municipality</th>
+                    <th>Year</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>Ukraine</td>
+                    <th>Odessa</th>
+                    <td>1944</td>
+                </tr>
+            </tbody>
+        </table>"""
+            )
+        )[0]
+
+        expected = DataFrame(
+            data=[["Ukraine", "Odessa", 1944]],
+            columns=["Country", "Municipality", "Year"],
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_tfoot_read(self, flavor_read_html):
+        """
+        Make sure that read_html reads tfoot, containing td or th.
+        Ignores empty tfoot
+        """
+        data_template = """<table>
+            <thead>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+            </thead>
+            <tbody>
+                <tr>
+                    <td>bodyA</td>
+                    <td>bodyB</td>
+                </tr>
+            </tbody>
+            <tfoot>
+                {footer}
+            </tfoot>
+        </table>"""
+
+        expected1 = DataFrame(data=[["bodyA", "bodyB"]], columns=["A", "B"])
+
+        expected2 = DataFrame(
+            data=[["bodyA", "bodyB"], ["footA", "footB"]], columns=["A", "B"]
+        )
+
+        data1 = data_template.format(footer="")
+        data2 = data_template.format(footer="<tr><td>footA</td><th>footB</th></tr>")
+
+        result1 = flavor_read_html(StringIO(data1))[0]
+        result2 = flavor_read_html(StringIO(data2))[0]
+
+        tm.assert_frame_equal(result1, expected1)
+        tm.assert_frame_equal(result2, expected2)
+
+    def test_parse_header_of_non_string_column(self, flavor_read_html):
+        # GH5048: if header is specified explicitly, an int column should be
+        # parsed as int while its header is parsed as str
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <td>S</td>
+                    <td>I</td>
+                </tr>
+                <tr>
+                    <td>text</td>
+                    <td>1944</td>
+                </tr>
+            </table>
+        """
+            ),
+            header=0,
+        )[0]
+
+        expected = DataFrame([["text", 1944]], columns=("S", "I"))
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.slow
+    def test_banklist_header(self, banklist_data, datapath, flavor_read_html):
+        from pandas.io.html import _remove_whitespace
+
+        def try_remove_ws(x):
+            try:
+                return _remove_whitespace(x)
+            except AttributeError:
+                return x
+
+        df = flavor_read_html(banklist_data, match="Metcalf", attrs={"id": "table"})[0]
+        ground_truth = read_csv(
+            datapath("io", "data", "csv", "banklist.csv"),
+            converters={"Updated Date": Timestamp, "Closing Date": Timestamp},
+        )
+        # html is a truncated version of banklist since bs4 is slow to parse it
+        assert df.shape == (len(df), ground_truth.shape[1])
+        old = [
+            "First Vietnamese American Bank In Vietnamese",
+            "Westernbank Puerto Rico En Espanol",
+            "R-G Premier Bank of Puerto Rico En Espanol",
+            "Eurobank En Espanol",
+            "Sanderson State Bank En Espanol",
+            "Washington Mutual Bank (Including its subsidiary Washington "
+            "Mutual Bank FSB)",
+            "Silver State Bank En Espanol",
+            "AmTrade International Bank En Espanol",
+            "Hamilton Bank, NA En Espanol",
+            "The Citizens Savings Bank Pioneer Community Bank, Inc.",
+        ]
+        new = [
+            "First Vietnamese American Bank",
+            "Westernbank Puerto Rico",
+            "R-G Premier Bank of Puerto Rico",
+            "Eurobank",
+            "Sanderson State Bank",
+            "Washington Mutual Bank",
+            "Silver State Bank",
+            "AmTrade International Bank",
+            "Hamilton Bank, NA",
+            "The Citizens Savings Bank",
+        ]
+        dfnew = df.map(try_remove_ws).replace(old, new)
+        gtnew = ground_truth.map(try_remove_ws)
+        converted = dfnew
+        date_cols = ["Closing Date", "Updated Date"]
+        converted[date_cols] = converted[date_cols].apply(to_datetime)
+        gtnew = gtnew[gtnew["Bank Name"].isin(converted["Bank Name"])].reset_index(
+            drop=True
+        )
+        tm.assert_frame_equal(converted, gtnew)
+
+    @pytest.mark.slow
+    def test_heartland_bank(self, banklist_data, flavor_read_html):
+        gc = "Heartland Bank"
+        with open(banklist_data, encoding="utf-8") as f:
+            raw_text = f.read()
+
+        assert gc in raw_text
+        df = flavor_read_html(banklist_data, match=gc, attrs={"id": "table"})[0]
+        assert gc in df.to_string()
+
+    def test_different_number_of_cols(self, flavor_read_html):
+        expected = flavor_read_html(
+            StringIO(
+                """<table>
+                        <thead>
+                            <tr style="text-align: right;">
+                            <th></th>
+                            <th>C_l0_g0</th>
+                            <th>C_l0_g1</th>
+                            <th>C_l0_g2</th>
+                            <th>C_l0_g3</th>
+                            <th>C_l0_g4</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <th>R_l0_g0</th>
+                            <td> 0.763</td>
+                            <td> 0.233</td>
+                            <td> nan</td>
+                            <td> nan</td>
+                            <td> nan</td>
+                            </tr>
+                            <tr>
+                            <th>R_l0_g1</th>
+                            <td> 0.244</td>
+                            <td> 0.285</td>
+                            <td> 0.392</td>
+                            <td> 0.137</td>
+                            <td> 0.222</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+            ),
+            index_col=0,
+        )[0]
+
+        result = flavor_read_html(
+            StringIO(
+                """<table>
+                    <thead>
+                        <tr style="text-align: right;">
+                        <th></th>
+                        <th>C_l0_g0</th>
+                        <th>C_l0_g1</th>
+                        <th>C_l0_g2</th>
+                        <th>C_l0_g3</th>
+                        <th>C_l0_g4</th>
+                        </tr>
+                    </thead>
+                    <tbody>
+                        <tr>
+                        <th>R_l0_g0</th>
+                        <td> 0.763</td>
+                        <td> 0.233</td>
+                        </tr>
+                        <tr>
+                        <th>R_l0_g1</th>
+                        <td> 0.244</td>
+                        <td> 0.285</td>
+                        <td> 0.392</td>
+                        <td> 0.137</td>
+                        <td> 0.222</td>
+                        </tr>
+                    </tbody>
+                 </table>"""
+            ),
+            index_col=0,
+        )[0]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_colspan_rowspan_1(self, flavor_read_html):
+        # GH17054
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th colspan="1">B</th>
+                    <th rowspan="1">C</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                    <td>c</td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        expected = DataFrame([["a", "b", "c"]], columns=["A", "B", "C"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_colspan_rowspan_copy_values(self, flavor_read_html):
+        # GH17054
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # X x Y Z W
+        # A B b z C
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <td colspan="2">X</td>
+                    <td>Y</td>
+                    <td rowspan="2">Z</td>
+                    <td>W</td>
+                </tr>
+                <tr>
+                    <td>A</td>
+                    <td colspan="2">B</td>
+                    <td>C</td>
+                </tr>
+            </table>
+        """
+            ),
+            header=0,
+        )[0]
+
+        expected = DataFrame(
+            data=[["A", "B", "B", "Z", "C"]], columns=["X", "X.1", "Y", "Z", "W"]
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_colspan_rowspan_both_not_1(self, flavor_read_html):
+        # GH17054
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # A B b b C
+        # a b b b D
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <td rowspan="2">A</td>
+                    <td rowspan="2" colspan="3">B</td>
+                    <td>C</td>
+                </tr>
+                <tr>
+                    <td>D</td>
+                </tr>
+            </table>
+        """
+            ),
+            header=0,
+        )[0]
+
+        expected = DataFrame(
+            data=[["A", "B", "B", "B", "D"]], columns=["A", "B", "B.1", "B.2", "C"]
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rowspan_at_end_of_row(self, flavor_read_html):
+        # GH17054
+
+        # In ASCII, with lowercase letters being copies:
+        #
+        # A B
+        # C b
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <td>A</td>
+                    <td rowspan="2">B</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                </tr>
+            </table>
+        """
+            ),
+            header=0,
+        )[0]
+
+        expected = DataFrame(data=[["C", "B"]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rowspan_only_rows(self, flavor_read_html):
+        # GH17054
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <td rowspan="3">A</td>
+                    <td rowspan="3">B</td>
+                </tr>
+            </table>
+        """
+            ),
+            header=0,
+        )[0]
+
+        expected = DataFrame(data=[["A", "B"], ["A", "B"]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rowspan_in_header_overflowing_to_body(self, flavor_read_html):
+        # GH60210
+
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th rowspan="2">A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                </tr>
+                <tr>
+                    <td>C</td>
+                    <td>2</td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        expected = DataFrame(data=[["A", 1], ["C", 2]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_header_inferred_from_rows_with_only_th(self, flavor_read_html):
+        # GH17054
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <th>a</th>
+                    <th>b</th>
+                </tr>
+                <tr>
+                    <td>1</td>
+                    <td>2</td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]])
+        expected = DataFrame(data=[[1, 2]], columns=columns)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_parse_dates_list(self, flavor_read_html):
+        df = DataFrame({"date": date_range("1/1/2001", periods=10)})
+
+        expected = df[:]
+        expected["date"] = expected["date"].dt.as_unit("us")
+
+        str_df = df.to_html()
+        res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0)
+        tm.assert_frame_equal(expected, res[0])
+        res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0)
+        tm.assert_frame_equal(expected, res[0])
+
+    def test_wikipedia_states_table(self, datapath, flavor_read_html):
+        data = datapath("io", "data", "html", "wikipedia_states.html")
+        assert os.path.isfile(data), f"{data!r} is not a file"
+        assert os.path.getsize(data), f"{data!r} is an empty file"
+        result = flavor_read_html(data, match="Arizona", header=1)[0]
+        assert result.shape == (60, 12)
+        assert "Unnamed" in result.columns[-1]
+        assert result["sq mi"].dtype == np.dtype("float64")
+        assert np.allclose(result.loc[0, "sq mi"], 665384.04)
+
+    def test_wikipedia_states_multiindex(self, datapath, flavor_read_html):
+        data = datapath("io", "data", "html", "wikipedia_states.html")
+        result = flavor_read_html(data, match="Arizona", index_col=0)[0]
+        assert result.shape == (60, 11)
+        assert "Unnamed" in result.columns[-1][1]
+        assert result.columns.nlevels == 2
+        assert np.allclose(result.loc["Alaska", ("Total area[2]", "sq mi")], 665384.04)
+
+    def test_parser_error_on_empty_header_row(self, flavor_read_html):
+        result = flavor_read_html(
+            StringIO(
+                """
+                <table>
+                    <thead>
+                        <tr><th></th><th></tr>
+                        <tr><th>A</th><th>B</th></tr>
+                    </thead>
+                    <tbody>
+                        <tr><td>a</td><td>b</td></tr>
+                    </tbody>
+                </table>
+            """
+            ),
+            header=[0, 1],
+        )
+        expected = DataFrame(
+            [["a", "b"]],
+            columns=MultiIndex.from_tuples(
+                [("Unnamed: 0_level_0", "A"), ("Unnamed: 1_level_0", "B")]
+            ),
+        )
+        tm.assert_frame_equal(result[0], expected)
+
+    def test_decimal_rows(self, flavor_read_html):
+        # GH 12907
+        result = flavor_read_html(
+            StringIO(
+                """<html>
+            <body>
+             <table>
+                <thead>
+                    <tr>
+                        <th>Header</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>1100#101</td>
+                    </tr>
+                </tbody>
+            </table>
+            </body>
+        </html>"""
+            ),
+            decimal="#",
+        )[0]
+
+        expected = DataFrame(data={"Header": 1100.101}, index=[0])
+
+        assert result["Header"].dtype == np.dtype("float64")
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("arg", [True, False])
+    def test_bool_header_arg(self, spam_data, arg, flavor_read_html):
+        # GH 6114
+        msg = re.escape(
+            "Passing a bool to header is invalid. Use header=None for no header or "
+            "header=int or list-like of ints to specify the row(s) making up the "
+            "column names"
+        )
+        with pytest.raises(TypeError, match=msg):
+            flavor_read_html(spam_data, header=arg)
+
+    def test_converters(self, flavor_read_html):
+        # GH 13461
+        result = flavor_read_html(
+            StringIO(
+                """<table>
+                 <thead>
+                   <tr>
+                     <th>a</th>
+                    </tr>
+                 </thead>
+                 <tbody>
+                   <tr>
+                     <td> 0.763</td>
+                   </tr>
+                   <tr>
+                     <td> 0.244</td>
+                   </tr>
+                 </tbody>
+               </table>"""
+            ),
+            converters={"a": str},
+        )[0]
+
+        expected = DataFrame({"a": ["0.763", "0.244"]})
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_na_values(self, flavor_read_html):
+        # GH 13461
+        result = flavor_read_html(
+            StringIO(
+                """<table>
+                 <thead>
+                   <tr>
+                     <th>a</th>
+                   </tr>
+                 </thead>
+                 <tbody>
+                   <tr>
+                     <td> 0.763</td>
+                   </tr>
+                   <tr>
+                     <td> 0.244</td>
+                   </tr>
+                 </tbody>
+               </table>"""
+            ),
+            na_values=[0.244],
+        )[0]
+
+        expected = DataFrame({"a": [0.763, np.nan]})
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_keep_default_na(self, flavor_read_html):
+        html_data = """<table>
+                        <thead>
+                            <tr>
+                            <th>a</th>
+                            </tr>
+                        </thead>
+                        <tbody>
+                            <tr>
+                            <td> N/A</td>
+                            </tr>
+                            <tr>
+                            <td> NA</td>
+                            </tr>
+                        </tbody>
+                    </table>"""
+
+        expected_df = DataFrame({"a": ["N/A", "NA"]})
+        html_df = flavor_read_html(StringIO(html_data), keep_default_na=False)[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+        expected_df = DataFrame({"a": [np.nan, np.nan]})
+        html_df = flavor_read_html(StringIO(html_data), keep_default_na=True)[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_preserve_empty_rows(self, flavor_read_html):
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th>A</th>
+                    <th>B</th>
+                </tr>
+                <tr>
+                    <td>a</td>
+                    <td>b</td>
+                </tr>
+                <tr>
+                    <td></td>
+                    <td></td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        expected = DataFrame(data=[["a", "b"], [np.nan, np.nan]], columns=["A", "B"])
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_ignore_empty_rows_when_inferring_header(self, flavor_read_html):
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <thead>
+                    <tr><th></th><th></tr>
+                    <tr><th>A</th><th>B</th></tr>
+                    <tr><th>a</th><th>b</th></tr>
+                </thead>
+                <tbody>
+                    <tr><td>1</td><td>2</td></tr>
+                </tbody>
+            </table>
+        """
+            )
+        )[0]
+
+        columns = MultiIndex(levels=[["A", "B"], ["a", "b"]], codes=[[0, 1], [0, 1]])
+        expected = DataFrame(data=[[1, 2]], columns=columns)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_multiple_header_rows(self, flavor_read_html):
+        # Issue #13434
+        expected_df = DataFrame(
+            data=[("Hillary", 68, "D"), ("Bernie", 74, "D"), ("Donald", 69, "R")]
+        )
+        expected_df.columns = [
+            ["Unnamed: 0_level_0", "Age", "Party"],
+            ["Name", "Unnamed: 1_level_1", "Unnamed: 2_level_1"],
+        ]
+        html = expected_df.to_html(index=False)
+        html_df = flavor_read_html(StringIO(html))[0]
+        tm.assert_frame_equal(expected_df, html_df)
+
+    def test_works_on_valid_markup(self, datapath, flavor_read_html):
+        filename = datapath("io", "data", "html", "valid_markup.html")
+        dfs = flavor_read_html(filename, index_col=0)
+        assert isinstance(dfs, list)
+        assert isinstance(dfs[0], DataFrame)
+
+    @pytest.mark.slow
+    def test_fallback_success(self, datapath, flavor_read_html):
+        banklist_data = datapath("io", "data", "html", "banklist.html")
+
+        flavor_read_html(banklist_data, match=".*Water.*", flavor=["lxml", "html5lib"])
+
+    def test_to_html_timestamp(self):
+        rng = date_range("2000-01-01", periods=10)
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 4)), index=rng)
+
+        result = df.to_html()
+        assert "2000-01-01" in result
+
+    def test_to_html_borderless(self):
+        df = DataFrame([{"A": 1, "B": 2}])
+        out_border_default = df.to_html()
+        out_border_true = df.to_html(border=True)
+        out_border_explicit_default = df.to_html(border=1)
+        out_border_nondefault = df.to_html(border=2)
+        out_border_zero = df.to_html(border=0)
+
+        out_border_false = df.to_html(border=False)
+
+        assert ' border="1"' in out_border_default
+        assert out_border_true == out_border_default
+        assert out_border_default == out_border_explicit_default
+        assert out_border_default != out_border_nondefault
+        assert ' border="2"' in out_border_nondefault
+        assert ' border="0"' not in out_border_zero
+        assert " border" not in out_border_false
+        assert out_border_zero == out_border_false
+
+    @pytest.mark.parametrize(
+        "displayed_only,exp0,exp1",
+        [
+            (True, ["foo"], None),
+            (False, ["foo  bar  baz  qux"], DataFrame(["foo"])),
+        ],
+    )
+    def test_displayed_only(self, displayed_only, exp0, exp1, flavor_read_html):
+        # GH 20027
+        data = """<html>
+          <body>
+            <table>
+              <tr>
+                <td>
+                  foo
+                  <span style="display:none;text-align:center">bar</span>
+                  <span style="display:none">baz</span>
+                  <span style="display: none">qux</span>
+                </td>
+              </tr>
+            </table>
+            <table style="display: none">
+              <tr>
+                <td>foo</td>
+              </tr>
+            </table>
+          </body>
+        </html>"""
+
+        exp0 = DataFrame(exp0)
+        dfs = flavor_read_html(StringIO(data), displayed_only=displayed_only)
+        tm.assert_frame_equal(dfs[0], exp0)
+
+        if exp1 is not None:
+            tm.assert_frame_equal(dfs[1], exp1)
+        else:
+            assert len(dfs) == 1  # Should not parse hidden table
+
+    @pytest.mark.parametrize("displayed_only", [True, False])
+    def test_displayed_only_with_many_elements(self, displayed_only, flavor_read_html):
+        html_table = """
+        <table>
+            <tr>
+                <th>A</th>
+                <th>B</th>
+            </tr>
+            <tr>
+                <td>1</td>
+                <td>2</td>
+            </tr>
+            <tr>
+                <td><span style="display:none"></span>4</td>
+                <td>5</td>
+            </tr>
+        </table>
+        """
+        result = flavor_read_html(StringIO(html_table), displayed_only=displayed_only)[
+            0
+        ]
+        expected = DataFrame({"A": [1, 4], "B": [2, 5]})
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:You provided Unicode markup but also provided a value for "
+        "from_encoding.*:UserWarning"
+    )
+    def test_encode(self, html_encoding_file, flavor_read_html):
+        base_path = os.path.basename(html_encoding_file)
+        root = os.path.splitext(base_path)[0]
+        _, encoding = root.split("_")
+
+        try:
+            with open(html_encoding_file, "rb") as fobj:
+                from_string = flavor_read_html(
+                    BytesIO(fobj.read()), encoding=encoding, index_col=0
+                ).pop()
+
+            with open(html_encoding_file, "rb") as fobj:
+                from_file_like = flavor_read_html(
+                    BytesIO(fobj.read()), encoding=encoding, index_col=0
+                ).pop()
+
+            from_filename = flavor_read_html(
+                html_encoding_file, encoding=encoding, index_col=0
+            ).pop()
+            tm.assert_frame_equal(from_string, from_file_like)
+            tm.assert_frame_equal(from_string, from_filename)
+        except Exception:
+            # seems utf-16/32 fail on windows
+            if is_platform_windows():
+                if "16" in encoding or "32" in encoding:
+                    pytest.skip()
+            raise
+
+    def test_parse_failure_unseekable(self, flavor_read_html):
+        # Issue #17975
+
+        if flavor_read_html.keywords.get("flavor") == "lxml":
+            pytest.skip("Not applicable for lxml")
+
+        class UnseekableStringIO(StringIO):
+            def seekable(self):
+                return False
+
+        bad = UnseekableStringIO(
+            """
+            <table><tr><td>spam<foobr />eggs</td></tr></table>"""
+        )
+
+        assert flavor_read_html(bad)
+
+        with pytest.raises(ValueError, match="passed a non-rewindable file object"):
+            flavor_read_html(bad)
+
+    def test_parse_failure_rewinds(self, flavor_read_html):
+        # Issue #17975
+
+        class MockFile:
+            def __init__(self, data) -> None:
+                self.data = data
+                self.at_end = False
+
+            def read(self, size=None):
+                data = "" if self.at_end else self.data
+                self.at_end = True
+                return data
+
+            def seek(self, offset):
+                self.at_end = False
+
+            def seekable(self):
+                return True
+
+            def __next__(self): ...
+
+            def __iter__(self) -> Iterator:
+                # `is_file_like` depends on the presence of
+                # the __iter__ attribute.
+                return self
+
+        good = MockFile("<table><tr><td>spam<br />eggs</td></tr></table>")
+        bad = MockFile("<table><tr><td>spam<foobr />eggs</td></tr></table>")
+
+        assert flavor_read_html(good)
+        assert flavor_read_html(bad)
+
+    @pytest.mark.slow
+    @pytest.mark.single_cpu
+    def test_importcheck_thread_safety(self, datapath, flavor_read_html):
+        # see gh-16928
+
+        class ErrorThread(threading.Thread):
+            def run(self):
+                try:
+                    super().run()
+                except Exception as err:
+                    self.err = err
+                else:
+                    self.err = None
+
+        filename = datapath("io", "data", "html", "valid_markup.html")
+        helper_thread1 = ErrorThread(target=flavor_read_html, args=(filename,))
+        helper_thread2 = ErrorThread(target=flavor_read_html, args=(filename,))
+
+        helper_thread1.start()
+        helper_thread2.start()
+
+        while helper_thread1.is_alive() or helper_thread2.is_alive():
+            pass
+        assert None is helper_thread1.err is helper_thread2.err
+
+    def test_parse_path_object(self, datapath, flavor_read_html):
+        # GH 37705
+        file_path_string = datapath("io", "data", "html", "spam.html")
+        file_path = Path(file_path_string)
+        df1 = flavor_read_html(file_path_string)[0]
+        df2 = flavor_read_html(file_path)[0]
+        tm.assert_frame_equal(df1, df2)
+
+    def test_parse_br_as_space(self, flavor_read_html):
+        # GH 29528: pd.read_html() convert <br> to space
+        result = flavor_read_html(
+            StringIO(
+                """
+            <table>
+                <tr>
+                    <th>A</th>
+                </tr>
+                <tr>
+                    <td>word1<br>word2</td>
+                </tr>
+            </table>
+        """
+            )
+        )[0]
+
+        expected = DataFrame(data=[["word1 word2"]], columns=["A"])
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("arg", ["all", "body", "header", "footer"])
+    def test_extract_links(self, arg, flavor_read_html):
+        gh_13141_data = """
+          <table>
+            <tr>
+              <th>HTTP</th>
+              <th>FTP</th>
+              <th><a href="https://en.wiktionary.org/wiki/linkless">Linkless</a></th>
+            </tr>
+            <tr>
+              <td><a href="https://en.wikipedia.org/">Wikipedia</a></td>
+              <td>SURROUNDING <a href="ftp://ftp.us.debian.org/">Debian</a> TEXT</td>
+              <td>Linkless</td>
+            </tr>
+            <tfoot>
+              <tr>
+                <td><a href="https://en.wikipedia.org/wiki/Page_footer">Footer</a></td>
+                <td>
+                  Multiple <a href="1">links:</a> <a href="2">Only first captured.</a>
+                </td>
+              </tr>
+            </tfoot>
+          </table>
+          """
+
+        gh_13141_expected = {
+            "head_ignore": ["HTTP", "FTP", "Linkless"],
+            "head_extract": [
+                ("HTTP", None),
+                ("FTP", None),
+                ("Linkless", "https://en.wiktionary.org/wiki/linkless"),
+            ],
+            "body_ignore": ["Wikipedia", "SURROUNDING Debian TEXT", "Linkless"],
+            "body_extract": [
+                ("Wikipedia", "https://en.wikipedia.org/"),
+                ("SURROUNDING Debian TEXT", "ftp://ftp.us.debian.org/"),
+                ("Linkless", None),
+            ],
+            "footer_ignore": [
+                "Footer",
+                "Multiple links: Only first captured.",
+                None,
+            ],
+            "footer_extract": [
+                ("Footer", "https://en.wikipedia.org/wiki/Page_footer"),
+                ("Multiple links: Only first captured.", "1"),
+                None,
+            ],
+        }
+
+        data_exp = gh_13141_expected["body_ignore"]
+        foot_exp = gh_13141_expected["footer_ignore"]
+        head_exp = gh_13141_expected["head_ignore"]
+        if arg == "all":
+            data_exp = gh_13141_expected["body_extract"]
+            foot_exp = gh_13141_expected["footer_extract"]
+            head_exp = gh_13141_expected["head_extract"]
+        elif arg == "body":
+            data_exp = gh_13141_expected["body_extract"]
+        elif arg == "footer":
+            foot_exp = gh_13141_expected["footer_extract"]
+        elif arg == "header":
+            head_exp = gh_13141_expected["head_extract"]
+
+        result = flavor_read_html(StringIO(gh_13141_data), extract_links=arg)[0]
+        expected = DataFrame([data_exp, foot_exp], columns=head_exp)
+        expected = expected.fillna(np.nan)
+        tm.assert_frame_equal(result, expected)
+
+    def test_extract_links_bad(self, spam_data):
+        msg = (
+            "`extract_links` must be one of "
+            '{None, "header", "footer", "body", "all"}, got "incorrect"'
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_html(spam_data, extract_links="incorrect")
+
+    def test_extract_links_all_no_header(self, flavor_read_html):
+        # GH 48316
+        data = """
+        <table>
+          <tr>
+            <td>
+              <a href='https://google.com'>Google.com</a>
+            </td>
+          </tr>
+        </table>
+        """
+        result = flavor_read_html(StringIO(data), extract_links="all")[0]
+        expected = DataFrame([[("Google.com", "https://google.com")]])
+        tm.assert_frame_equal(result, expected)
+
+    def test_invalid_dtype_backend(self):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        with pytest.raises(ValueError, match=msg):
+            read_html("test", dtype_backend="numpy")
+
+    def test_style_tag(self, flavor_read_html):
+        # GH 48316
+        data = """
+        <table>
+            <tr>
+                <th>
+                    <style>.style</style>
+                    A
+                    </th>
+                <th>B</th>
+            </tr>
+            <tr>
+                <td>A1</td>
+                <td>B1</td>
+            </tr>
+            <tr>
+                <td>A2</td>
+                <td>B2</td>
+            </tr>
+        </table>
+        """
+        result = flavor_read_html(StringIO(data))[0]
+        expected = DataFrame(data=[["A1", "B1"], ["A2", "B2"]], columns=["A", "B"])
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_http_headers.py b/pandas/tests/io/test_http_headers.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b9c8769ad9dc909f99d346395d5cd2113984992
--- /dev/null
+++ b/pandas/tests/io/test_http_headers.py
@@ -0,0 +1,174 @@
+"""
+Tests for the pandas custom headers in http(s) requests
+"""
+
+from functools import partial
+import gzip
+from io import BytesIO
+
+import pytest
+
+from pandas._config import using_string_dtype
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+pytestmark = [
+    pytest.mark.single_cpu,
+    pytest.mark.network,
+    pytest.mark.filterwarnings(
+        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+    ),
+]
+
+
+def gzip_bytes(response_bytes):
+    with BytesIO() as bio:
+        with gzip.GzipFile(fileobj=bio, mode="w") as zipper:
+            zipper.write(response_bytes)
+        return bio.getvalue()
+
+
+def csv_responder(df):
+    return df.to_csv(index=False).encode("utf-8")
+
+
+def gz_csv_responder(df):
+    return gzip_bytes(csv_responder(df))
+
+
+def json_responder(df):
+    return df.to_json().encode("utf-8")
+
+
+def gz_json_responder(df):
+    return gzip_bytes(json_responder(df))
+
+
+def html_responder(df):
+    return df.to_html(index=False).encode("utf-8")
+
+
+def parquetpyarrow_reponder(df):
+    return df.to_parquet(index=False, engine="pyarrow")
+
+
+def parquetfastparquet_responder(df):
+    # the fastparquet engine doesn't like to write to a buffer
+    # it can do it via the open_with function being set appropriately
+    # however it automatically calls the close method and wipes the buffer
+    # so just overwrite that attribute on this instance to not do that
+
+    # protected by an importorskip in the respective test
+    import fsspec
+
+    df.to_parquet(
+        "memory://fastparquet_user_agent.parquet",
+        index=False,
+        engine="fastparquet",
+        compression=None,
+    )
+    with fsspec.open("memory://fastparquet_user_agent.parquet", "rb") as f:
+        return f.read()
+
+
+def pickle_respnder(df):
+    with BytesIO() as bio:
+        df.to_pickle(bio)
+        return bio.getvalue()
+
+
+def stata_responder(df):
+    with BytesIO() as bio:
+        df.to_stata(bio, write_index=False)
+        return bio.getvalue()
+
+
+@pytest.mark.parametrize(
+    "responder, read_method",
+    [
+        (csv_responder, pd.read_csv),
+        (json_responder, pd.read_json),
+        (
+            html_responder,
+            lambda *args, **kwargs: pd.read_html(*args, **kwargs)[0],
+        ),
+        pytest.param(
+            parquetpyarrow_reponder,
+            partial(pd.read_parquet, engine="pyarrow"),
+            marks=td.skip_if_no("pyarrow"),
+        ),
+        pytest.param(
+            parquetfastparquet_responder,
+            partial(pd.read_parquet, engine="fastparquet"),
+            marks=[
+                td.skip_if_no("fastparquet"),
+                td.skip_if_no("fsspec"),
+                pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string"),
+            ],
+        ),
+        (pickle_respnder, pd.read_pickle),
+        (stata_responder, pd.read_stata),
+        (gz_csv_responder, pd.read_csv),
+        (gz_json_responder, pd.read_json),
+    ],
+)
+@pytest.mark.parametrize(
+    "storage_options",
+    [
+        None,
+        {"User-Agent": "foo"},
+        {"User-Agent": "foo", "Auth": "bar"},
+    ],
+)
+def test_request_headers(responder, read_method, httpserver, storage_options):
+    expected = pd.DataFrame({"a": ["b"]})
+    default_headers = ["Accept-Encoding", "Host", "Connection", "User-Agent"]
+    if "gz" in responder.__name__:
+        extra = {"Content-Encoding": "gzip"}
+        if storage_options is None:
+            storage_options = extra
+        else:
+            storage_options |= extra
+    else:
+        extra = None
+    expected_headers = set(default_headers).union(
+        storage_options.keys() if storage_options else []
+    )
+    httpserver.serve_content(content=responder(expected), headers=extra)
+    result = read_method(httpserver.url, storage_options=storage_options)
+    tm.assert_frame_equal(result, expected)
+
+    request_headers = dict(httpserver.requests[0].headers)
+    for header in expected_headers:
+        exp = request_headers.pop(header)
+        if storage_options and header in storage_options:
+            assert exp == storage_options[header]
+    # No extra headers added
+    assert not request_headers
+
+
+@pytest.mark.parametrize(
+    "engine",
+    [
+        "pyarrow",
+        "fastparquet",
+    ],
+)
+def test_to_parquet_to_disk_with_storage_options(engine):
+    headers = {
+        "User-Agent": "custom",
+        "Auth": "other_custom",
+    }
+
+    pytest.importorskip(engine)
+
+    true_df = pd.DataFrame({"column_name": ["column_value"]})
+    msg = (
+        "storage_options passed with file object or non-fsspec file path|"
+        "storage_options passed with buffer, or non-supported URL"
+    )
+    with pytest.raises(ValueError, match=msg):
+        true_df.to_parquet("/tmp/junk.parquet", storage_options=headers, engine=engine)
diff --git a/pandas/tests/io/test_iceberg.py b/pandas/tests/io/test_iceberg.py
new file mode 100644
index 0000000000000000000000000000000000000000..689eddb1985e6344d72c26fbb30a5237489cdadd
--- /dev/null
+++ b/pandas/tests/io/test_iceberg.py
@@ -0,0 +1,222 @@
+"""
+Tests for the Apache Iceberg format.
+
+Tests in this file use a simple Iceberg catalog based on SQLite, with the same
+data used for Parquet tests (``pandas/tests/io/data/parquet/simple.parquet``).
+"""
+
+import collections
+import importlib
+import pathlib
+
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+from pandas.io.iceberg import read_iceberg
+
+pytestmark = pytest.mark.single_cpu
+
+pyiceberg = pytest.importorskip("pyiceberg")
+pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog")
+pq = pytest.importorskip("pyarrow.parquet")
+
+Catalog = collections.namedtuple("Catalog", ["name", "uri", "warehouse"])
+
+
+@pytest.fixture
+def catalog(request, tmp_path):
+    # the catalog stores the full path of data files, so the catalog needs to be
+    # created dynamically, and not saved in pandas/tests/io/data as other formats
+    uri = f"sqlite:///{tmp_path}/catalog.sqlite"
+    warehouse = f"file://{tmp_path}"
+    catalog_name = request.param if hasattr(request, "param") else None
+    catalog = pyiceberg_catalog.load_catalog(
+        catalog_name or "default",
+        type="sql",
+        uri=uri,
+        warehouse=warehouse,
+    )
+    catalog.create_namespace("ns")
+
+    df = pq.read_table(
+        pathlib.Path(__file__).parent / "data" / "parquet" / "simple.parquet"
+    )
+    table = catalog.create_table("ns.my_table", schema=df.schema)
+    table.append(df)
+
+    if catalog_name is not None:
+        config_path = pathlib.Path.home() / ".pyiceberg.yaml"
+        with open(config_path, "w", encoding="utf-8") as f:
+            f.write(f"""\
+catalog:
+  {catalog_name}:
+    type: sql
+    uri: {uri}
+    warehouse: {warehouse}""")
+
+        importlib.reload(pyiceberg_catalog)  # needed to reload the config file
+
+    yield Catalog(name=catalog_name or "default", uri=uri, warehouse=warehouse)
+
+    if catalog_name is not None:
+        config_path.unlink()
+
+
+class TestIceberg:
+    def test_read(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
+    def test_read_by_catalog_name(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_name=catalog.name,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_with_row_filter(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [2, 3],
+                "B": ["foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            row_filter="A > 1",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_read_with_case_sensitive(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            columns=["a"],
+            case_sensitive=False,
+        )
+        tm.assert_frame_equal(result, expected)
+
+        with pytest.raises(ValueError, match="^Could not find column"):
+            read_iceberg(
+                "ns.my_table",
+                catalog_properties={"uri": catalog.uri},
+                columns=["a"],
+                case_sensitive=True,
+            )
+
+    def test_read_with_limit(self, catalog):
+        expected = pd.DataFrame(
+            {
+                "A": [1, 2],
+                "B": ["foo", "foo"],
+            }
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            limit=2,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_write(self, catalog):
+        df = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        df.to_iceberg(
+            "ns.new_table",
+            catalog_properties={"uri": catalog.uri},
+            location=catalog.warehouse,
+        )
+        result = read_iceberg(
+            "ns.new_table",
+            catalog_properties={"uri": catalog.uri},
+        )
+        tm.assert_frame_equal(result, df)
+
+    @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
+    def test_write_by_catalog_name(self, catalog):
+        df = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        df.to_iceberg(
+            "ns.new_table",
+            catalog_name=catalog.name,
+        )
+        result = read_iceberg(
+            "ns.new_table",
+            catalog_name=catalog.name,
+        )
+        tm.assert_frame_equal(result, df)
+
+    def test_write_existing_table_with_append_true(self, catalog):
+        original = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+        )
+        new = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        expected = pd.concat([original, new], ignore_index=True)
+        new.to_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            location=catalog.warehouse,
+            append=True,
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_write_existing_table_with_append_false(self, catalog):
+        df = pd.DataFrame(
+            {
+                "A": [1, 2, 3],
+                "B": ["foo", "foo", "foo"],
+            }
+        )
+        df.to_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+            location=catalog.warehouse,
+            append=False,
+        )
+        result = read_iceberg(
+            "ns.my_table",
+            catalog_properties={"uri": catalog.uri},
+        )
+        tm.assert_frame_equal(result, df)
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e61494103355c5836f3ad0e9187d712093610dc
--- /dev/null
+++ b/pandas/tests/io/test_orc.py
@@ -0,0 +1,432 @@
+"""test orc compat"""
+
+import datetime
+from decimal import Decimal
+from io import BytesIO
+import os
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import read_orc
+import pandas._testing as tm
+
+pytest.importorskip("pyarrow.orc")
+
+import pyarrow as pa
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+)
+
+
+@pytest.fixture
+def dirpath(datapath):
+    return datapath("io", "data", "orc")
+
+
+def test_orc_reader_empty(dirpath, using_infer_string):
+    columns = [
+        "boolean1",
+        "byte1",
+        "short1",
+        "int1",
+        "long1",
+        "float1",
+        "double1",
+        "bytes1",
+        "string1",
+    ]
+    dtypes = [
+        "bool",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "float32",
+        "float64",
+        "object",
+        "str" if using_infer_string else "object",
+    ]
+    expected = pd.DataFrame(index=pd.RangeIndex(0))
+    for colname, dtype in zip(columns, dtypes, strict=True):
+        expected[colname] = pd.Series(dtype=dtype)
+    expected.columns = expected.columns.astype("str")
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
+    got = read_orc(inputfile, columns=columns)
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_basic(dirpath):
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.test1.orc")
+    got = read_orc(inputfile, columns=data.keys())
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_decimal(dirpath):
+    # Only testing the first 10 rows of data
+    data = {
+        "_col0": np.array(
+            [
+                Decimal("-1000.50000"),
+                Decimal("-999.60000"),
+                Decimal("-998.70000"),
+                Decimal("-997.80000"),
+                Decimal("-996.90000"),
+                Decimal("-995.10000"),
+                Decimal("-994.11000"),
+                Decimal("-993.12000"),
+                Decimal("-992.13000"),
+                Decimal("-991.14000"),
+            ],
+            dtype="object",
+        )
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.decimal.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_date_low(dirpath):
+    data = {
+        "time": np.array(
+            [
+                "1900-05-05 12:34:56.100000",
+                "1900-05-05 12:34:56.100100",
+                "1900-05-05 12:34:56.100200",
+                "1900-05-05 12:34:56.100300",
+                "1900-05-05 12:34:56.100400",
+                "1900-05-05 12:34:56.100500",
+                "1900-05-05 12:34:56.100600",
+                "1900-05-05 12:34:56.100700",
+                "1900-05-05 12:34:56.100800",
+                "1900-05-05 12:34:56.100900",
+            ],
+            dtype="datetime64[ns]",
+        ),
+        "date": np.array(
+            [
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+                datetime.date(1900, 12, 25),
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testDate1900.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_date_high(dirpath):
+    data = {
+        "time": np.array(
+            [
+                "2038-05-05 12:34:56.100000",
+                "2038-05-05 12:34:56.100100",
+                "2038-05-05 12:34:56.100200",
+                "2038-05-05 12:34:56.100300",
+                "2038-05-05 12:34:56.100400",
+                "2038-05-05 12:34:56.100500",
+                "2038-05-05 12:34:56.100600",
+                "2038-05-05 12:34:56.100700",
+                "2038-05-05 12:34:56.100800",
+                "2038-05-05 12:34:56.100900",
+            ],
+            dtype="datetime64[ns]",
+        ),
+        "date": np.array(
+            [
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+                datetime.date(2038, 12, 25),
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testDate2038.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_reader_snappy_compressed(dirpath):
+    data = {
+        "int1": np.array(
+            [
+                -1160101563,
+                1181413113,
+                2065821249,
+                -267157795,
+                172111193,
+                1752363137,
+                1406072123,
+                1911809390,
+                -1308542224,
+                -467100286,
+            ],
+            dtype="int32",
+        ),
+        "string1": np.array(
+            [
+                "f50dcb8",
+                "382fdaaa",
+                "90758c6",
+                "9e8caf3f",
+                "ee97332b",
+                "d634da1",
+                "2bea4396",
+                "d67d89e8",
+                "ad71007e",
+                "e8c82066",
+            ],
+            dtype="object",
+        ),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    inputfile = os.path.join(dirpath, "TestOrcFile.testSnappy.orc")
+    got = read_orc(inputfile).iloc[:10]
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip_file(dirpath, temp_file):
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow")
+
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    expected.to_orc(temp_file)
+    got = read_orc(temp_file)
+
+    tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip_bytesio():
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow")
+
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    bytes = expected.to_orc()
+    got = read_orc(BytesIO(bytes))
+
+    tm.assert_equal(expected, got)
+
+
+@pytest.mark.parametrize(
+    "orc_writer_dtypes_not_supported",
+    [
+        np.array([1, 20], dtype="uint64"),
+        pd.Series(["a", "b", "a"], dtype="category"),
+        [pd.Interval(left=0, right=2), pd.Interval(left=0, right=5)],
+        [pd.Period("2022-01-03", freq="D"), pd.Period("2022-01-04", freq="D")],
+    ],
+)
+def test_orc_writer_dtypes_not_supported(orc_writer_dtypes_not_supported):
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow")
+
+    df = pd.DataFrame({"unimpl": orc_writer_dtypes_not_supported})
+    msg = "The dtype of one or more columns is not supported yet."
+    with pytest.raises(NotImplementedError, match=msg):
+        df.to_orc()
+
+
+def test_orc_dtype_backend_pyarrow(using_infer_string):
+    pytest.importorskip("pyarrow")
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "bytes": [b"foo", b"bar", None],
+            "int": list(range(1, 4)),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+            "datetime": pd.date_range("20130101", periods=3, unit="ns"),
+            "datetime_with_nat": [
+                pd.Timestamp("20130101"),
+                pd.NaT,
+                pd.Timestamp("20130103"),
+            ],
+        }
+    )
+    # FIXME: without casting to ns we do not round-trip correctly
+    df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]")
+
+    bytes_data = df.copy().to_orc()
+    result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow")
+
+    expected = pd.DataFrame(
+        {
+            col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
+            for col in df.columns
+        }
+    )
+    if using_infer_string:
+        # ORC does not preserve distinction between string and large string
+        # -> the default large string comes back as string
+        string_dtype = pd.ArrowDtype(pa.string())
+        expected["string"] = expected["string"].astype(string_dtype)
+        expected["string_with_nan"] = expected["string_with_nan"].astype(string_dtype)
+        expected["string_with_none"] = expected["string_with_none"].astype(string_dtype)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_orc_dtype_backend_numpy_nullable():
+    # GH#50503
+    pytest.importorskip("pyarrow")
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "int": list(range(1, 4)),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+        }
+    )
+
+    bytes_data = df.copy().to_orc()
+    result = read_orc(BytesIO(bytes_data), dtype_backend="numpy_nullable")
+
+    expected = pd.DataFrame(
+        {
+            "string": pd.array(["a", "b", "c"], dtype=pd.StringDtype()),
+            "string_with_nan": pd.array(["a", pd.NA, "c"], dtype=pd.StringDtype()),
+            "string_with_none": pd.array(["a", pd.NA, "c"], dtype=pd.StringDtype()),
+            "int": pd.Series([1, 2, 3], dtype="Int64"),
+            "int_with_nan": pd.Series([1, pd.NA, 3], dtype="Int64"),
+            "na_only": pd.Series([pd.NA, pd.NA, pd.NA], dtype="Int64"),
+            "float": pd.Series([4.0, 5.0, 6.0], dtype="Float64"),
+            "float_with_nan": pd.Series([2.0, pd.NA, 3.0], dtype="Float64"),
+            "bool": pd.Series([True, False, True], dtype="boolean"),
+            "bool_with_na": pd.Series([True, False, pd.NA], dtype="boolean"),
+        }
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_orc_uri_path(temp_file):
+    expected = pd.DataFrame({"int": list(range(1, 4))})
+    expected.to_orc(temp_file)
+    uri = temp_file.as_uri()
+    result = read_orc(uri)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.RangeIndex(start=2, stop=5, step=1),
+        pd.RangeIndex(start=0, stop=3, step=1, name="non-default"),
+        pd.Index([1, 2, 3]),
+    ],
+)
+def test_to_orc_non_default_index(index):
+    df = pd.DataFrame({"a": [1, 2, 3]}, index=index)
+    msg = (
+        "orc does not support serializing a non-default index|"
+        "orc does not serialize index meta-data"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.to_orc()
+
+
+def test_invalid_dtype_backend(temp_file):
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    df = pd.DataFrame({"int": list(range(1, 4))})
+    df.to_orc(temp_file)
+    with pytest.raises(ValueError, match=msg):
+        read_orc(temp_file, dtype_backend="numpy")
+
+
+def test_string_inference(temp_file):
+    # GH#54431
+    df = pd.DataFrame(data={"a": ["x", "y"]})
+    df.to_orc(temp_file)
+    with pd.option_context("future.infer_string", True):
+        result = read_orc(temp_file)
+    expected = pd.DataFrame(
+        data={"a": ["x", "y"]},
+        dtype=pd.StringDtype(na_value=np.nan),
+        columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b7f49e18f5490c0d74ba41e4581a8edc336ddc6
--- /dev/null
+++ b/pandas/tests/io/test_parquet.py
@@ -0,0 +1,1465 @@
+"""test parquet compat"""
+
+import datetime
+from decimal import Decimal
+from io import BytesIO
+import os
+import pathlib
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+from pandas.compat import is_platform_windows
+from pandas.compat.pyarrow import (
+    pa_version_under15p0,
+    pa_version_under17p0,
+    pa_version_under19p0,
+    pa_version_under20p0,
+)
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.util.version import Version
+
+from pandas.io.parquet import (
+    FastParquetImpl,
+    PyArrowImpl,
+    get_engine,
+    read_parquet,
+    to_parquet,
+)
+
+try:
+    import pyarrow
+
+    _HAVE_PYARROW = True
+except ImportError:
+    _HAVE_PYARROW = False
+
+try:
+    import fastparquet
+
+    _HAVE_FASTPARQUET = True
+except ImportError:
+    _HAVE_FASTPARQUET = False
+
+
+pytestmark = [
+    pytest.mark.filterwarnings("ignore:DataFrame._data is deprecated:FutureWarning"),
+    pytest.mark.filterwarnings(
+        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+    ),
+]
+
+
+# setup engines & skips
+@pytest.fixture(
+    params=[
+        pytest.param(
+            "fastparquet",
+            marks=[
+                pytest.mark.skipif(
+                    not _HAVE_FASTPARQUET,
+                    reason="fastparquet is not installed",
+                ),
+                pytest.mark.xfail(
+                    using_string_dtype(),
+                    reason="TODO(infer_string) fastparquet",
+                    strict=False,
+                ),
+            ],
+        ),
+        pytest.param(
+            "pyarrow",
+            marks=pytest.mark.skipif(
+                not _HAVE_PYARROW, reason="pyarrow is not installed"
+            ),
+        ),
+    ]
+)
+def engine(request):
+    return request.param
+
+
+@pytest.fixture
+def pa():
+    if not _HAVE_PYARROW:
+        pytest.skip("pyarrow is not installed")
+    return "pyarrow"
+
+
+@pytest.fixture
+def fp(request):
+    if not _HAVE_FASTPARQUET:
+        pytest.skip("fastparquet is not installed")
+    if using_string_dtype():
+        request.applymarker(
+            pytest.mark.xfail(reason="TODO(infer_string) fastparquet", strict=False)
+        )
+    return "fastparquet"
+
+
+@pytest.fixture
+def df_compat():
+    return pd.DataFrame({"A": [1, 2, 3], "B": "foo"}, columns=pd.Index(["A", "B"]))
+
+
+@pytest.fixture
+def df_cross_compat():
+    df = pd.DataFrame(
+        {
+            "a": list("abc"),
+            "b": list(range(1, 4)),
+            # 'c': np.arange(3, 6).astype('u1'),
+            "d": np.arange(4.0, 7.0, dtype="float64"),
+            "e": [True, False, True],
+            "f": pd.date_range("20130101", periods=3),
+            # 'g': pd.date_range('20130101', periods=3,
+            #                    tz='US/Eastern'),
+            # 'h': pd.date_range('20130101', periods=3, freq='ns')
+        }
+    )
+    return df
+
+
+@pytest.fixture
+def df_full():
+    return pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "bytes": [b"foo", b"bar", b"baz"],
+            "unicode": ["foo", "bar", "baz"],
+            "int": list(range(1, 4)),
+            "uint": np.arange(3, 6).astype("u1"),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "datetime": pd.date_range("20130101", periods=3, unit="ns"),
+            "datetime_with_nat": [
+                pd.Timestamp("20130101"),
+                pd.NaT,
+                pd.Timestamp("20130103"),
+            ],
+        }
+    )
+
+
+@pytest.fixture(
+    params=[
+        datetime.datetime.now(datetime.UTC),
+        datetime.datetime.now(datetime.timezone.min),
+        datetime.datetime.now(datetime.timezone.max),
+        datetime.datetime.strptime("2019-01-04T16:41:24+0200", "%Y-%m-%dT%H:%M:%S%z"),
+        datetime.datetime.strptime("2019-01-04T16:41:24+0215", "%Y-%m-%dT%H:%M:%S%z"),
+        datetime.datetime.strptime("2019-01-04T16:41:24-0200", "%Y-%m-%dT%H:%M:%S%z"),
+        datetime.datetime.strptime("2019-01-04T16:41:24-0215", "%Y-%m-%dT%H:%M:%S%z"),
+    ]
+)
+def timezone_aware_date_list(request):
+    return request.param
+
+
+def check_round_trip(
+    df,
+    temp_file,
+    engine=None,
+    path=None,
+    write_kwargs=None,
+    read_kwargs=None,
+    expected=None,
+    check_names=True,
+    check_like=False,
+    check_dtype=True,
+    repeat=2,
+):
+    """Verify parquet serializer and deserializer produce the same results.
+
+    Performs a pandas to disk and disk to pandas round trip,
+    then compares the 2 resulting DataFrames to verify equality.
+
+    Parameters
+    ----------
+    df: Dataframe
+    engine: str, optional
+        'pyarrow' or 'fastparquet'
+    path: str, optional
+    write_kwargs: dict of str:str, optional
+    read_kwargs: dict of str:str, optional
+    expected: DataFrame, optional
+        Expected deserialization result, otherwise will be equal to `df`
+    check_names: list of str, optional
+        Closed set of column names to be compared
+    check_like: bool, optional
+        If True, ignore the order of index & columns.
+    repeat: int, optional
+        How many times to repeat the test
+    """
+    if not isinstance(temp_file, pathlib.Path):
+        raise ValueError("temp_file must be a pathlib.Path")
+    write_kwargs = write_kwargs or {"compression": None}
+    read_kwargs = read_kwargs or {}
+
+    if expected is None:
+        expected = df
+
+    if engine:
+        write_kwargs["engine"] = engine
+        read_kwargs["engine"] = engine
+
+    def compare(repeat):
+        for _ in range(repeat):
+            df.to_parquet(path, **write_kwargs)
+            actual = read_parquet(path, **read_kwargs)
+
+            if "string_with_nan" in expected:
+                expected.loc[1, "string_with_nan"] = None
+            tm.assert_frame_equal(
+                expected,
+                actual,
+                check_names=check_names,
+                check_like=check_like,
+                check_dtype=check_dtype,
+            )
+
+    if path is None:
+        path = temp_file
+        compare(repeat)
+    else:
+        compare(repeat)
+
+
+def check_partition_names(path, expected):
+    """Check partitions of a parquet file are as expected.
+
+    Parameters
+    ----------
+    path: str
+        Path of the dataset.
+    expected: iterable of str
+        Expected partition names.
+    """
+    import pyarrow.dataset as ds
+
+    dataset = ds.dataset(path, partitioning="hive")
+    assert dataset.partitioning.schema.names == expected
+
+
+def test_invalid_engine(df_compat, temp_file):
+    msg = "engine must be one of 'pyarrow', 'fastparquet'"
+    with pytest.raises(ValueError, match=msg):
+        check_round_trip(df_compat, temp_file, "foo", "bar")
+
+
+def test_options_py(df_compat, pa, using_infer_string, temp_file):
+    # use the set option
+    if using_infer_string and not pa_version_under19p0:
+        df_compat.columns = df_compat.columns.astype("str")
+
+    with pd.option_context("io.parquet.engine", "pyarrow"):
+        check_round_trip(df_compat, temp_file)
+
+
+def test_options_fp(df_compat, fp, temp_file):
+    # use the set option
+
+    with pd.option_context("io.parquet.engine", "fastparquet"):
+        check_round_trip(df_compat, temp_file)
+
+
+def test_options_auto(df_compat, fp, pa, temp_file):
+    # use the set option
+
+    with pd.option_context("io.parquet.engine", "auto"):
+        check_round_trip(df_compat, temp_file)
+
+
+def test_options_get_engine(fp, pa):
+    assert isinstance(get_engine("pyarrow"), PyArrowImpl)
+    assert isinstance(get_engine("fastparquet"), FastParquetImpl)
+
+    with pd.option_context("io.parquet.engine", "pyarrow"):
+        assert isinstance(get_engine("auto"), PyArrowImpl)
+        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
+        assert isinstance(get_engine("fastparquet"), FastParquetImpl)
+
+    with pd.option_context("io.parquet.engine", "fastparquet"):
+        assert isinstance(get_engine("auto"), FastParquetImpl)
+        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
+        assert isinstance(get_engine("fastparquet"), FastParquetImpl)
+
+    with pd.option_context("io.parquet.engine", "auto"):
+        assert isinstance(get_engine("auto"), PyArrowImpl)
+        assert isinstance(get_engine("pyarrow"), PyArrowImpl)
+        assert isinstance(get_engine("fastparquet"), FastParquetImpl)
+
+
+def test_get_engine_auto_error_message():
+    # Expect different error messages from get_engine(engine="auto")
+    # if engines aren't installed vs. are installed but bad version
+    from pandas.compat._optional import VERSIONS
+
+    # Do we have engines installed, but a bad version of them?
+    pa_min_ver = VERSIONS.get("pyarrow")
+    fp_min_ver = VERSIONS.get("fastparquet")
+    have_pa_bad_version = (
+        False
+        if not _HAVE_PYARROW
+        else Version(pyarrow.__version__) < Version(pa_min_ver)
+    )
+    have_fp_bad_version = (
+        False
+        if not _HAVE_FASTPARQUET
+        else Version(fastparquet.__version__) < Version(fp_min_ver)
+    )
+    # Do we have usable engines installed?
+    have_usable_pa = _HAVE_PYARROW and not have_pa_bad_version
+    have_usable_fp = _HAVE_FASTPARQUET and not have_fp_bad_version
+
+    if not have_usable_pa and not have_usable_fp:
+        # No usable engines found.
+        if have_pa_bad_version:
+            match = f"Pandas requires version .{pa_min_ver}. or newer of .pyarrow."
+            with pytest.raises(ImportError, match=match):
+                get_engine("auto")
+        else:
+            match = "Unable to find a usable engine; tried using: 'pyarrow'"
+            with pytest.raises(ImportError, match=match):
+                get_engine("auto")
+
+        if have_fp_bad_version:
+            match = f"Pandas requires version .{fp_min_ver}. or newer of .fastparquet."
+            with pytest.raises(ImportError, match=match):
+                get_engine("auto")
+        else:
+            match = "Use pip or conda to install the fastparquet package"
+            with pytest.raises(ImportError, match=match):
+                get_engine("auto")
+
+
+def test_cross_engine_pa_fp(df_cross_compat, pa, fp, temp_file):
+    # cross-compat with differing reading/writing engines
+
+    df = df_cross_compat
+    df.to_parquet(temp_file, engine=pa, compression=None)
+
+    result = read_parquet(temp_file, engine=fp)
+    tm.assert_frame_equal(result, df)
+
+    result = read_parquet(temp_file, engine=fp, columns=["a", "d"])
+    tm.assert_frame_equal(result, df[["a", "d"]])
+
+
+def test_cross_engine_fp_pa(df_cross_compat, pa, fp, temp_file):
+    # cross-compat with differing reading/writing engines
+    df = df_cross_compat
+
+    df.to_parquet(temp_file, engine=fp, compression=None)
+
+    result = read_parquet(temp_file, engine=pa)
+    tm.assert_frame_equal(result, df)
+
+    result = read_parquet(temp_file, engine=pa, columns=["a", "d"])
+    tm.assert_frame_equal(result, df[["a", "d"]])
+
+
+class Base:
+    def check_error_on_write(self, df, engine, exc, err_msg, temp_file_path):
+        # check that we are raising the exception on writing
+        with pytest.raises(exc, match=err_msg):
+            to_parquet(df, temp_file_path, engine, compression=None)
+
+    def check_external_error_on_write(self, df, engine, exc, temp_file_path):
+        # check that an external library is raising the exception on writing
+        with tm.external_error_raised(exc):
+            to_parquet(df, temp_file_path, engine, compression=None)
+
+
+class TestBasic(Base):
+    def test_error(self, engine, temp_file):
+        for obj in [
+            pd.Series([1, 2, 3]),
+            1,
+            "foo",
+            pd.Timestamp("20130101"),
+            np.array([1, 2, 3]),
+        ]:
+            msg = "to_parquet only supports IO with DataFrames"
+            self.check_error_on_write(obj, engine, ValueError, msg, temp_file)
+
+    def test_columns_dtypes(self, engine, temp_file):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        # unicode
+        df.columns = ["foo", "bar"]
+        check_round_trip(df, temp_file, engine)
+
+    @pytest.mark.parametrize("compression", [None, "gzip", "snappy", "brotli"])
+    def test_compression(self, engine, compression, temp_file):
+        df = pd.DataFrame({"A": [1, 2, 3]})
+        check_round_trip(
+            df, temp_file, engine, write_kwargs={"compression": compression}
+        )
+
+    def test_read_columns(self, engine, temp_file):
+        # GH18154
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        expected = pd.DataFrame({"string": list("abc")})
+        check_round_trip(
+            df,
+            temp_file,
+            engine,
+            expected=expected,
+            read_kwargs={"columns": ["string"]},
+        )
+
+    def test_read_filters(self, engine, tmp_path):
+        df = pd.DataFrame(
+            {
+                "int": list(range(4)),
+                "part": list("aabb"),
+            }
+        )
+
+        expected = pd.DataFrame({"int": [0, 1]})
+        check_round_trip(
+            df,
+            tmp_path,
+            engine,
+            expected=expected,
+            write_kwargs={"partition_cols": ["part"]},
+            read_kwargs={"filters": [("part", "==", "a")], "columns": ["int"]},
+            repeat=1,
+        )
+
+    def test_write_index(self, temp_file):
+        pytest.importorskip("pyarrow")
+        df = pd.DataFrame({"A": [1, 2, 3]})
+        check_round_trip(df, temp_file, "pyarrow")
+
+        indexes = [
+            [2, 3, 4],
+            pd.date_range("20130101", periods=3, unit="ns"),
+            list("abc"),
+            [1, 3, 4],
+        ]
+        # non-default index
+        for index in indexes:
+            df.index = index
+            if isinstance(index, pd.DatetimeIndex):
+                df.index = df.index._with_freq(None)  # freq doesn't round-trip
+            check_round_trip(df, temp_file, "pyarrow")
+
+        # index with meta-data
+        df.index = [0, 1, 2]
+        df.index.name = "foo"
+        check_round_trip(df, temp_file, "pyarrow")
+
+    def test_write_multiindex(self, pa, temp_file):
+        # Not supported in fastparquet as of 0.1.3 or older pyarrow version
+        engine = pa
+
+        df = pd.DataFrame({"A": [1, 2, 3]})
+        index = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
+        df.index = index
+        check_round_trip(df, temp_file, engine)
+
+    def test_multiindex_with_columns(self, pa, temp_file):
+        engine = pa
+        dates = pd.date_range("01-Jan-2018", "01-Dec-2018", freq="MS", unit="ns")
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((2 * len(dates), 3)),
+            columns=list("ABC"),
+        )
+        index1 = pd.MultiIndex.from_product(
+            [["Level1", "Level2"], dates], names=["level", "date"]
+        )
+        index2 = index1.copy(names=None)
+        for index in [index1, index2]:
+            df.index = index
+
+            check_round_trip(df, temp_file, engine)
+            check_round_trip(
+                df,
+                temp_file,
+                engine,
+                read_kwargs={"columns": ["A", "B"]},
+                expected=df[["A", "B"]],
+            )
+
+    def test_write_ignoring_index(self, engine, temp_file):
+        # ENH 20768
+        # Ensure index=False omits the index from the written Parquet file.
+        df = pd.DataFrame({"a": [1, 2, 3], "b": ["q", "r", "s"]})
+
+        write_kwargs = {"compression": None, "index": False}
+
+        # Because we're dropping the index, we expect the loaded dataframe to
+        # have the default integer index.
+        expected = df.reset_index(drop=True)
+
+        check_round_trip(
+            df, temp_file, engine, write_kwargs=write_kwargs, expected=expected
+        )
+
+        # Ignore custom index
+        df = pd.DataFrame(
+            {"a": [1, 2, 3], "b": ["q", "r", "s"]}, index=["zyx", "wvu", "tsr"]
+        )
+
+        check_round_trip(
+            df, temp_file, engine, write_kwargs=write_kwargs, expected=expected
+        )
+
+        # Ignore multi-indexes as well.
+        arrays = [
+            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+            ["one", "two", "one", "two", "one", "two", "one", "two"],
+        ]
+        df = pd.DataFrame(
+            {"one": list(range(8)), "two": [-i for i in range(8)]}, index=arrays
+        )
+
+        expected = df.reset_index(drop=True)
+        check_round_trip(
+            df, temp_file, engine, write_kwargs=write_kwargs, expected=expected
+        )
+
+    def test_write_column_multiindex(self, engine, temp_file):
+        # Not able to write column multi-indexes with non-string column names.
+        mi_columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((4, 3)), columns=mi_columns
+        )
+
+        if engine == "fastparquet":
+            self.check_error_on_write(
+                df, engine, TypeError, "Column name must be a string", temp_file
+            )
+        elif engine == "pyarrow":
+            check_round_trip(df, temp_file, engine)
+
+    def test_write_column_multiindex_nonstring(self, engine, temp_file):
+        # GH #34777
+
+        # Not able to write column multi-indexes with non-string column names
+        arrays = [
+            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+            [1, 2, 1, 2, 1, 2, 1, 2],
+        ]
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((8, 8)), columns=arrays
+        )
+        df.columns.names = ["Level1", "Level2"]
+        if engine == "fastparquet":
+            self.check_error_on_write(df, engine, ValueError, "Column name", temp_file)
+        elif engine == "pyarrow":
+            check_round_trip(df, temp_file, engine)
+
+    def test_write_column_multiindex_string(self, pa, temp_file):
+        # GH #34777
+        # Not supported in fastparquet as of 0.1.3
+        engine = pa
+
+        # Write column multi-indexes with string column names
+        arrays = [
+            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+            ["one", "two", "one", "two", "one", "two", "one", "two"],
+        ]
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((8, 8)), columns=arrays
+        )
+        df.columns.names = ["ColLevel1", "ColLevel2"]
+
+        check_round_trip(df, temp_file, engine)
+
+    def test_write_column_index_string(self, pa, temp_file):
+        # GH #34777
+        # Not supported in fastparquet as of 0.1.3
+        engine = pa
+
+        # Write column indexes with string column names
+        arrays = ["bar", "baz", "foo", "qux"]
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((8, 4)), columns=arrays
+        )
+        df.columns.name = "StringCol"
+
+        check_round_trip(df, temp_file, engine)
+
+    def test_write_column_index_nonstring(self, engine, temp_file):
+        # GH #34777
+
+        # Write column indexes with string column names
+        arrays = [1, 2, 3, 4]
+        df = pd.DataFrame(
+            np.random.default_rng(2).standard_normal((8, 4)), columns=arrays
+        )
+        df.columns.name = "NonStringCol"
+        if engine == "fastparquet":
+            self.check_error_on_write(
+                df, engine, TypeError, "Column name must be a string", temp_file
+            )
+        else:
+            check_round_trip(df, temp_file, engine)
+
+    def test_dtype_backend(self, engine, request, temp_file):
+        pq = pytest.importorskip("pyarrow.parquet")
+
+        if engine == "fastparquet":
+            # We are manually disabling fastparquet's
+            # nullable dtype support pending discussion
+            mark = pytest.mark.xfail(
+                reason="Fastparquet nullable dtype support is disabled"
+            )
+            request.applymarker(mark)
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+                # Test that nullable dtypes used even in absence of nulls
+                "e": pyarrow.array([1, 2, 3, 4], "int64"),
+                # GH 45694
+                "f": pyarrow.array([1.0, 2.0, 3.0, None], "float32"),
+                "g": pyarrow.array([1.0, 2.0, 3.0, None], "float64"),
+            }
+        )
+        # write manually with pyarrow to write integers
+        pq.write_table(table, temp_file)
+        result1 = read_parquet(temp_file, engine=engine)
+        result2 = read_parquet(temp_file, engine=engine, dtype_backend="numpy_nullable")
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
+                "f": pd.array([1.0, 2.0, 3.0, None], dtype="Float32"),
+                "g": pd.array([1.0, 2.0, 3.0, None], dtype="Float64"),
+            }
+        )
+        if engine == "fastparquet":
+            # Fastparquet doesn't support string columns yet
+            # Only int and boolean
+            result2 = result2.drop("c", axis=1)
+            expected = expected.drop("c", axis=1)
+        tm.assert_frame_equal(result2, expected)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        [
+            "Int64",
+            "UInt8",
+            "boolean",
+            "object",
+            "datetime64[ns, UTC]",
+            "float",
+            "period[D]",
+            "Float64",
+            "string",
+        ],
+    )
+    def test_read_empty_array(self, pa, dtype, temp_file):
+        # GH #41241
+        df = pd.DataFrame(
+            {
+                "value": pd.array([], dtype=dtype),
+            }
+        )
+        pytest.importorskip("pyarrow", "11.0.0")
+        # GH 45694
+        expected = None
+        if dtype == "float":
+            expected = pd.DataFrame(
+                {
+                    "value": pd.array([], dtype="Float64"),
+                }
+            )
+        check_round_trip(
+            df,
+            temp_file,
+            pa,
+            read_kwargs={"dtype_backend": "numpy_nullable"},
+            expected=expected,
+        )
+
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_parquet_read_from_url(self, httpserver, datapath, df_compat, engine):
+        if engine != "auto":
+            pytest.importorskip(engine)
+        with open(datapath("io", "data", "parquet", "simple.parquet"), mode="rb") as f:
+            httpserver.serve_content(content=f.read())
+            df = read_parquet(httpserver.url, engine=engine)
+
+        expected = df_compat
+        if pa_version_under19p0:
+            expected.columns = expected.columns.astype(object)
+        tm.assert_frame_equal(df, expected)
+
+
+class TestParquetPyArrow(Base):
+    def test_basic(self, pa, df_full, temp_file):
+        df = df_full
+        pytest.importorskip("pyarrow", "11.0.0")
+
+        # additional supported types for pyarrow
+        dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
+        dti = dti._with_freq(None)  # freq doesn't round-trip
+        df["datetime_tz"] = dti
+        df["bool_with_none"] = [True, None, True]
+
+        check_round_trip(df, temp_file, pa)
+
+    def test_basic_subset_columns(self, pa, df_full, temp_file):
+        # GH18628
+
+        df = df_full
+        # additional supported types for pyarrow
+        df["datetime_tz"] = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
+
+        check_round_trip(
+            df,
+            temp_file,
+            pa,
+            expected=df[["string", "int"]],
+            read_kwargs={"columns": ["string", "int"]},
+        )
+
+    def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
+        # GH 37105
+        buf_bytes = df_full.to_parquet(engine=pa)
+        assert isinstance(buf_bytes, bytes)
+
+        buf_stream = BytesIO(buf_bytes)
+        res = read_parquet(buf_stream)
+
+        expected = df_full.copy()
+        expected.loc[1, "string_with_nan"] = None
+        expected["datetime_with_nat"] = expected["datetime_with_nat"].astype("M8[us]")
+        tm.assert_frame_equal(res, expected)
+
+    def test_duplicate_columns(self, pa, temp_file):
+        # not currently able to handle duplicate columns
+        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
+        self.check_error_on_write(
+            df, pa, ValueError, "Duplicate column names found", temp_file
+        )
+
+    def test_timedelta(self, pa, temp_file):
+        df = pd.DataFrame({"a": pd.timedelta_range("1 day", periods=3)})
+        check_round_trip(df, temp_file, pa)
+
+    def test_unsupported(self, pa, temp_file):
+        # mixed python objects
+        df = pd.DataFrame({"a": ["a", 1, 2.0]})
+        # pyarrow 0.11 raises ArrowTypeError
+        # older pyarrows raise ArrowInvalid
+        self.check_external_error_on_write(df, pa, pyarrow.ArrowException, temp_file)
+
+    def test_unsupported_float16(self, pa, temp_file):
+        # #44847, #44914
+        # Not able to write float 16 column using pyarrow.
+        data = np.arange(2, 10, dtype=np.float16)
+        df = pd.DataFrame(data=data, columns=["fp16"])
+        if pa_version_under15p0:
+            self.check_external_error_on_write(
+                df, pa, pyarrow.ArrowException, temp_file
+            )
+        else:
+            check_round_trip(df, temp_file, pa)
+
+    @pytest.mark.xfail(
+        is_platform_windows(),
+        reason=(
+            "PyArrow does not cleanup of partial files dumps when unsupported "
+            "dtypes are passed to_parquet function in windows"
+        ),
+    )
+    @pytest.mark.skipif(not pa_version_under15p0, reason="float16 works on 15")
+    @pytest.mark.parametrize("path_type", [str, pathlib.Path])
+    def test_unsupported_float16_cleanup(self, pa, path_type, temp_file):
+        # #44847, #44914
+        # Not able to write float 16 column using pyarrow.
+        # Tests cleanup by pyarrow in case of an error
+        data = np.arange(2, 10, dtype=np.float16)
+        df = pd.DataFrame(data=data, columns=["fp16"])
+
+        path = path_type(temp_file)
+        with tm.external_error_raised(pyarrow.ArrowException):
+            df.to_parquet(path=path, engine=pa)
+        assert not os.path.isfile(path)
+
+    def test_categorical(self, pa, temp_file):
+        # supported in >= 0.7.0
+        df = pd.DataFrame(
+            {
+                "a": pd.Categorical(list("abcdef")),
+                # test for null, out-of-order values, and unobserved category
+                "b": pd.Categorical(
+                    ["bar", "foo", "foo", "bar", None, "bar"],
+                    dtype=pd.CategoricalDtype(["foo", "bar", "baz"]),
+                ),
+                # test for ordered flag
+                "c": pd.Categorical(
+                    [None, "b", "c", None, "c", "b"],
+                    categories=["b", "c", "d"],
+                    ordered=True,
+                ),
+            }
+        )
+
+        check_round_trip(df, temp_file, pa)
+
+    @pytest.mark.single_cpu
+    def test_s3_roundtrip_explicit_fs(
+        self, df_compat, s3_bucket_public, s3so, pa, temp_file
+    ):
+        s3fs = pytest.importorskip("s3fs")
+        s3 = s3fs.S3FileSystem(**s3so)
+        kw = {"filesystem": s3}
+        check_round_trip(
+            df_compat,
+            temp_file,
+            pa,
+            path=f"{s3_bucket_public.name}/pyarrow.parquet",
+            read_kwargs=kw,
+            write_kwargs=kw,
+        )
+
+    @pytest.mark.single_cpu
+    def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, pa, temp_file):
+        # GH #19134
+        s3so = {"storage_options": s3so}
+        check_round_trip(
+            df_compat,
+            temp_file,
+            pa,
+            path=f"s3://{s3_bucket_public.name}/pyarrow.parquet",
+            read_kwargs=s3so,
+            write_kwargs=s3so,
+        )
+
+    @pytest.mark.single_cpu
+    @pytest.mark.parametrize("partition_col", [["A"], []])
+    def test_s3_roundtrip_for_dir(
+        self, df_compat, s3_bucket_public, pa, partition_col, s3so, temp_file
+    ):
+        pytest.importorskip("s3fs")
+        # GH #26388
+        expected_df = df_compat.copy()
+
+        # GH #35791
+        if partition_col:
+            expected_df = expected_df.astype(dict.fromkeys(partition_col, np.int32))
+            partition_col_type = "category"
+
+            expected_df[partition_col] = expected_df[partition_col].astype(
+                partition_col_type
+            )
+
+        check_round_trip(
+            df_compat,
+            temp_file,
+            pa,
+            expected=expected_df,
+            path=f"s3://{s3_bucket_public.name}/parquet_dir",
+            read_kwargs={"storage_options": s3so},
+            write_kwargs={
+                "partition_cols": partition_col,
+                "compression": None,
+                "storage_options": s3so,
+            },
+            check_like=True,
+            repeat=1,
+        )
+
+    def test_read_file_like_obj_support(self, df_compat, using_infer_string):
+        pytest.importorskip("pyarrow")
+        buffer = BytesIO()
+        df_compat.to_parquet(buffer)
+        df_from_buf = read_parquet(buffer)
+        if using_infer_string and not pa_version_under19p0:
+            df_compat.columns = df_compat.columns.astype("str")
+        tm.assert_frame_equal(df_compat, df_from_buf)
+
+    def test_expand_user(self, df_compat, monkeypatch):
+        pytest.importorskip("pyarrow")
+        monkeypatch.setenv("HOME", "TestingUser")
+        monkeypatch.setenv("USERPROFILE", "TestingUser")
+        with pytest.raises(OSError, match=r".*TestingUser.*"):
+            read_parquet("~/file.parquet")
+        with pytest.raises(OSError, match=r".*TestingUser.*"):
+            df_compat.to_parquet("~/file.parquet")
+
+    def test_partition_cols_supported(self, tmp_path, pa, df_full):
+        # GH #23283
+        partition_cols = ["bool", "int"]
+        df = df_full
+        df.to_parquet(tmp_path, partition_cols=partition_cols, compression=None)
+        check_partition_names(tmp_path, partition_cols)
+        assert read_parquet(tmp_path).shape == df.shape
+
+    def test_partition_cols_string(self, tmp_path, pa, df_full):
+        # GH #27117
+        partition_cols = "bool"
+        partition_cols_list = [partition_cols]
+        df = df_full
+        df.to_parquet(tmp_path, partition_cols=partition_cols, compression=None)
+        check_partition_names(tmp_path, partition_cols_list)
+        assert read_parquet(tmp_path).shape == df.shape
+
+    @pytest.mark.parametrize(
+        "path_type", [str, lambda x: x], ids=["string", "pathlib.Path"]
+    )
+    def test_partition_cols_pathlib(self, tmp_path, pa, df_compat, path_type):
+        # GH 35902
+
+        partition_cols = "B"
+        partition_cols_list = [partition_cols]
+        df = df_compat
+
+        path = path_type(tmp_path)
+        df.to_parquet(path, partition_cols=partition_cols_list)
+        assert read_parquet(path).shape == df.shape
+
+    def test_empty_dataframe(self, pa, temp_file):
+        # GH #27339
+        df = pd.DataFrame(index=[], columns=[])
+        check_round_trip(df, temp_file, pa)
+
+    def test_write_with_schema(self, pa, temp_file):
+        import pyarrow
+
+        df = pd.DataFrame({"x": [0, 1]})
+        schema = pyarrow.schema([pyarrow.field("x", type=pyarrow.bool_())])
+        out_df = df.astype(bool)
+        check_round_trip(
+            df, temp_file, pa, write_kwargs={"schema": schema}, expected=out_df
+        )
+
+    def test_additional_extension_arrays(self, pa, using_infer_string, temp_file):
+        # test additional ExtensionArrays that are supported through the
+        # __arrow_array__ protocol
+        pytest.importorskip("pyarrow")
+        df = pd.DataFrame(
+            {
+                "a": pd.Series([1, 2, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="UInt32"),
+                "c": pd.Series(["a", None, "c"], dtype="string"),
+            }
+        )
+        if using_infer_string and pa_version_under19p0:
+            check_round_trip(df, temp_file, pa, expected=df.astype({"c": "str"}))
+        else:
+            check_round_trip(df, temp_file, pa)
+
+        df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")})
+        check_round_trip(df, temp_file, pa)
+
+    def test_pyarrow_backed_string_array(
+        self, pa, string_storage, using_infer_string, temp_file
+    ):
+        # test ArrowStringArray supported through the __arrow_array__ protocol
+        pytest.importorskip("pyarrow")
+        df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="string[pyarrow]")})
+        with pd.option_context("string_storage", string_storage):
+            if using_infer_string:
+                if pa_version_under19p0:
+                    expected = df.astype("str")
+                else:
+                    expected = df.astype(f"string[{string_storage}]")
+                expected.columns = expected.columns.astype("str")
+            else:
+                expected = df.astype(f"string[{string_storage}]")
+            check_round_trip(df, temp_file, pa, expected=expected)
+
+    def test_additional_extension_types(self, pa, temp_file):
+        # test additional ExtensionArrays that are supported through the
+        # __arrow_array__ protocol + by defining a custom ExtensionType
+        pytest.importorskip("pyarrow")
+        df = pd.DataFrame(
+            {
+                "c": pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (3, 4)]),
+                "d": pd.period_range("2012-01-01", periods=3, freq="D"),
+                # GH-45881 issue with interval with datetime64[ns] subtype
+                "e": pd.IntervalIndex.from_breaks(
+                    pd.date_range("2012-01-01", periods=4, freq="D")
+                ),
+            }
+        )
+        check_round_trip(df, temp_file, pa)
+
+    def test_timestamp_nanoseconds(self, pa, temp_file):
+        # with version 2.6, pyarrow defaults to writing the nanoseconds, so
+        # this should work without error, even for pyarrow < 13
+        ver = "2.6"
+        df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1ns", periods=10)})
+        check_round_trip(df, temp_file, pa, write_kwargs={"version": ver})
+
+    def test_timezone_aware_index(self, pa, timezone_aware_date_list, temp_file):
+        idx = 5 * [timezone_aware_date_list]
+        df = pd.DataFrame(index=idx, data={"index_as_col": idx})
+
+        # see gh-36004
+        # compare time(zone) values only, skip their class:
+        # pyarrow always creates fixed offset timezones using pytz.FixedOffset()
+        # even if it was datetime.timezone() originally
+        #
+        # technically they are the same:
+        # they both implement datetime.tzinfo
+        # they both wrap datetime.timedelta()
+        # this use-case sets the resolution to 1 minute
+
+        expected = df[:]
+        if timezone_aware_date_list.tzinfo != datetime.UTC:
+            # pyarrow returns pytz.FixedOffset while pandas constructs datetime.timezone
+            # https://github.com/pandas-dev/pandas/issues/37286
+            try:
+                import pytz
+            except ImportError:
+                pass
+            else:
+                offset = df.index.tz.utcoffset(timezone_aware_date_list)
+                tz = pytz.FixedOffset(offset.total_seconds() / 60)
+                expected.index = expected.index.tz_convert(tz)
+                expected["index_as_col"] = expected["index_as_col"].dt.tz_convert(tz)
+        check_round_trip(df, temp_file, pa, check_dtype=False, expected=expected)
+
+    def test_filter_row_groups(self, pa, temp_file):
+        # https://github.com/pandas-dev/pandas/issues/26551
+        pytest.importorskip("pyarrow")
+        df = pd.DataFrame({"a": list(range(3))})
+        df.to_parquet(temp_file, engine=pa)
+        result = read_parquet(temp_file, pa, filters=[("a", "==", 0)])
+        assert len(result) == 1
+
+    @pytest.mark.filterwarnings("ignore:make_block is deprecated:DeprecationWarning")
+    def test_read_dtype_backend_pyarrow_config(self, pa, df_full, temp_file):
+        import pyarrow
+
+        df = df_full
+
+        # additional supported types for pyarrow
+        dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels", unit="ns")
+        dti = dti._with_freq(None)  # freq doesn't round-trip
+        df["datetime_tz"] = dti
+        df["bool_with_none"] = [True, None, True]
+
+        pa_table = pyarrow.Table.from_pandas(df)
+        expected = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
+        expected["datetime_with_nat"] = expected["datetime_with_nat"].astype(
+            "timestamp[us][pyarrow]"
+        )
+
+        check_round_trip(
+            df,
+            temp_file,
+            engine=pa,
+            read_kwargs={"dtype_backend": "pyarrow"},
+            expected=expected,
+        )
+
+    def test_read_dtype_backend_pyarrow_config_index(self, pa, temp_file):
+        df = pd.DataFrame(
+            {"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]"
+        )
+        expected = df.copy()
+
+        expected.index = expected.index.astype("int64[pyarrow]")
+        check_round_trip(
+            df,
+            temp_file,
+            engine=pa,
+            read_kwargs={"dtype_backend": "pyarrow"},
+            expected=expected,
+        )
+
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            [0, 1],
+            pytest.param(
+                [b"foo", b"bar"],
+                marks=pytest.mark.xfail(
+                    pa_version_under20p0,
+                    raises=NotImplementedError,
+                    reason="https://github.com/apache/arrow/pull/44171",
+                ),
+            ),
+            pytest.param(
+                [
+                    datetime.datetime(2011, 1, 1, 0, 0),
+                    datetime.datetime(2011, 1, 1, 1, 1),
+                ],
+                marks=pytest.mark.xfail(
+                    pa_version_under17p0,
+                    reason="pa.pandas_compat passes 'datetime64' to .astype",
+                ),
+            ),
+        ],
+    )
+    def test_columns_dtypes_not_invalid(self, pa, columns, temp_file):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        df.columns = columns
+        check_round_trip(df, temp_file, pa)
+
+    def test_empty_columns(self, pa, temp_file):
+        # GH 52034
+        df = pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name"))
+        check_round_trip(df, temp_file, pa)
+
+    def test_df_attrs_persistence(self, temp_file, pa):
+        df = pd.DataFrame(data={1: [1]})
+        df.attrs = {"test_attribute": 1}
+        df.to_parquet(temp_file, engine=pa)
+        new_df = read_parquet(temp_file, engine=pa)
+        assert new_df.attrs == df.attrs
+
+    def test_string_inference(self, temp_file, pa, using_infer_string):
+        # GH#54431
+        df = pd.DataFrame(data={"a": ["x", "y"]}, index=["a", "b"])
+        df.to_parquet(temp_file, engine=pa)
+        with pd.option_context("future.infer_string", True):
+            result = read_parquet(temp_file, engine=pa)
+        dtype = pd.StringDtype(na_value=np.nan)
+        expected = pd.DataFrame(
+            data={"a": ["x", "y"]},
+            dtype=dtype,
+            index=pd.Index(["a", "b"], dtype=dtype),
+            columns=pd.Index(
+                ["a"],
+                dtype=(
+                    object if pa_version_under19p0 and not using_infer_string else dtype
+                ),
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_roundtrip_decimal(self, temp_file, pa):
+        # GH#54768
+        import pyarrow as pa
+
+        df = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="string[pyarrow]")
+        df.to_parquet(temp_file, schema=pa.schema([("a", pa.decimal128(5))]))
+        result = read_parquet(temp_file)
+        if pa_version_under19p0:
+            expected = pd.DataFrame({"a": ["123"]}, dtype="string")
+        else:
+            expected = pd.DataFrame({"a": [Decimal("123.00")]}, dtype="object")
+        tm.assert_frame_equal(result, expected)
+
+    def test_infer_string_large_string_type(self, temp_file, pa):
+        # GH#54798
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        table = pa.table({"a": pa.array([None, "b", "c"], pa.large_string())})
+        pq.write_table(table, temp_file)
+
+        with pd.option_context("future.infer_string", True):
+            result = read_parquet(temp_file)
+        expected = pd.DataFrame(
+            data={"a": [None, "b", "c"]},
+            dtype=pd.StringDtype(na_value=np.nan),
+            columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    # NOTE: this test is not run by default, because it requires a lot of memory (>5GB)
+    # @pytest.mark.slow
+    # def test_string_column_above_2GB(self, tmp_path, pa):
+    #     # https://github.com/pandas-dev/pandas/issues/55606
+    #     # above 2GB of string data
+    #     v1 = b"x" * 100000000
+    #     v2 = b"x" * 147483646
+    #     df = pd.DataFrame({"strings": [v1] * 20 + [v2] + ["x"] * 20}, dtype="string")
+    #     df.to_parquet(tmp_path / "test.parquet")
+    #     result = read_parquet(tmp_path / "test.parquet")
+    #     assert result["strings"].dtype == "string"
+    # FIXME: don't leave commented-out
+
+    def test_non_nanosecond_timestamps(self, temp_file):
+        # GH#49236
+        pa = pytest.importorskip("pyarrow", "13.0.0")
+        pq = pytest.importorskip("pyarrow.parquet")
+
+        arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us"))
+        table = pa.table([arr], names=["timestamp"])
+        pq.write_table(table, temp_file)
+        result = read_parquet(temp_file)
+        expected = pd.DataFrame(
+            data={"timestamp": [datetime.datetime(1600, 1, 1)]},
+            dtype="datetime64[us]",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_maps_as_pydicts(self, pa, temp_file):
+        pyarrow = pytest.importorskip("pyarrow", "13.0.0")
+
+        schema = pyarrow.schema(
+            [("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))]
+        )
+        df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}])
+        check_round_trip(
+            df,
+            temp_file,
+            pa,
+            write_kwargs={"schema": schema},
+            read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}},
+        )
+
+
+class TestParquetFastParquet(Base):
+    def test_basic(self, fp, df_full, request, temp_file):
+        pytz = pytest.importorskip("pytz")
+
+        tz = pytz.timezone("US/Eastern")
+        df = df_full
+
+        dti = pd.date_range("20130101", periods=3, tz=tz)
+        dti = dti._with_freq(None)  # freq doesn't round-trip
+        df["datetime_tz"] = dti
+        df["timedelta"] = pd.timedelta_range("1 day", periods=3)
+        check_round_trip(df, temp_file, fp)
+
+    def test_columns_dtypes_invalid(self, fp, temp_file):
+        df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))})
+
+        err = TypeError
+        msg = "Column name must be a string"
+
+        # numeric
+        df.columns = [0, 1]
+        self.check_error_on_write(df, fp, err, msg, temp_file)
+
+        # bytes
+        df.columns = [b"foo", b"bar"]
+        self.check_error_on_write(df, fp, err, msg, temp_file)
+
+        # python object
+        df.columns = [
+            datetime.datetime(2011, 1, 1, 0, 0),
+            datetime.datetime(2011, 1, 1, 1, 1),
+        ]
+        self.check_error_on_write(df, fp, err, msg, temp_file)
+
+    def test_duplicate_columns(self, fp, temp_file):
+        # not currently able to handle duplicate columns
+        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
+        msg = "Cannot create parquet dataset with duplicate column names"
+        self.check_error_on_write(df, fp, ValueError, msg, temp_file)
+
+    def test_bool_with_none(self, fp, request, temp_file):
+        df = pd.DataFrame({"a": [True, None, False]})
+        expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
+        # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
+        # float64
+        check_round_trip(df, temp_file, fp, expected=expected, check_dtype=False)
+
+    def test_unsupported(self, fp, temp_file):
+        # period
+        df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
+        # error from fastparquet -> don't check exact error message
+        self.check_error_on_write(df, fp, ValueError, None, temp_file)
+
+        # mixed
+        df = pd.DataFrame({"a": ["a", 1, 2.0]})
+        msg = "Can't infer object conversion type"
+        self.check_error_on_write(df, fp, ValueError, msg, temp_file)
+
+    def test_categorical(self, fp, temp_file):
+        df = pd.DataFrame({"a": pd.Categorical(list("abc"))})
+        check_round_trip(df, temp_file, fp)
+
+    def test_filter_row_groups(self, fp, temp_file):
+        d = {"a": list(range(3))}
+        df = pd.DataFrame(d)
+        df.to_parquet(temp_file, engine=fp, compression=None, row_group_offsets=1)
+        result = read_parquet(temp_file, fp, filters=[("a", "==", 0)])
+        assert len(result) == 1
+
+    @pytest.mark.single_cpu
+    def test_s3_roundtrip(self, df_compat, s3_bucket_public, s3so, fp, temp_file):
+        # GH #19134
+        check_round_trip(
+            df_compat,
+            temp_file,
+            fp,
+            path=f"s3://{s3_bucket_public.name}/fastparquet.parquet",
+            read_kwargs={"storage_options": s3so},
+            write_kwargs={"compression": None, "storage_options": s3so},
+        )
+
+    def test_partition_cols_supported(self, tmp_path, fp, df_full):
+        # GH #23283
+        partition_cols = ["bool", "int"]
+        df = df_full
+        df.to_parquet(
+            tmp_path,
+            engine="fastparquet",
+            partition_cols=partition_cols,
+            compression=None,
+        )
+        assert os.path.exists(tmp_path)
+        import fastparquet
+
+        actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
+        assert len(actual_partition_cols) == 2
+
+    def test_partition_cols_string(self, tmp_path, fp, df_full):
+        # GH #27117
+        partition_cols = "bool"
+        df = df_full
+        df.to_parquet(
+            tmp_path,
+            engine="fastparquet",
+            partition_cols=partition_cols,
+            compression=None,
+        )
+        assert os.path.exists(tmp_path)
+        import fastparquet
+
+        actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
+        assert len(actual_partition_cols) == 1
+
+    def test_partition_on_supported(self, tmp_path, fp, df_full):
+        # GH #23283
+        partition_cols = ["bool", "int"]
+        df = df_full
+        df.to_parquet(
+            tmp_path,
+            engine="fastparquet",
+            compression=None,
+            partition_on=partition_cols,
+        )
+        assert os.path.exists(tmp_path)
+        import fastparquet
+
+        actual_partition_cols = fastparquet.ParquetFile(str(tmp_path), False).cats
+        assert len(actual_partition_cols) == 2
+
+    def test_error_on_using_partition_cols_and_partition_on(
+        self, tmp_path, fp, df_full
+    ):
+        # GH #23283
+        partition_cols = ["bool", "int"]
+        df = df_full
+        msg = (
+            "Cannot use both partition_on and partition_cols. Use partition_cols for "
+            "partitioning data"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.to_parquet(
+                tmp_path,
+                engine="fastparquet",
+                compression=None,
+                partition_on=partition_cols,
+                partition_cols=partition_cols,
+            )
+
+    def test_empty_dataframe(self, fp, temp_file):
+        # GH #27339
+        df = pd.DataFrame()
+        expected = df.copy()
+        check_round_trip(df, temp_file, fp, expected=expected)
+
+    def test_timezone_aware_index(
+        self, fp, timezone_aware_date_list, request, temp_file
+    ):
+        idx = 5 * [timezone_aware_date_list]
+
+        df = pd.DataFrame(index=idx, data={"index_as_col": idx})
+
+        expected = df.copy()
+        expected.index.name = "index"
+        check_round_trip(df, temp_file, fp, expected=expected)
+
+    def test_close_file_handle_on_read_error(self, temp_file):
+        pathlib.Path(temp_file).write_bytes(b"breakit")
+        with tm.external_error_raised(Exception):  # Not important which exception
+            read_parquet(temp_file, engine="fastparquet")
+        # The next line raises an error on Windows if the file is still open
+        pathlib.Path(temp_file).unlink(missing_ok=False)
+
+    def test_bytes_file_name(self, engine, temp_file):
+        # GH#48944
+        df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]})
+        with open(temp_file, "wb") as f:
+            df.to_parquet(f)
+
+        result = read_parquet(temp_file, engine=engine)
+        tm.assert_frame_equal(result, df)
+
+    def test_filesystem_notimplemented(self, temp_file):
+        pytest.importorskip("fastparquet")
+        df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]})
+        with pytest.raises(NotImplementedError, match="filesystem is not implemented"):
+            df.to_parquet(temp_file, engine="fastparquet", filesystem="foo")
+
+        pathlib.Path(temp_file).write_bytes(b"foo")
+        with pytest.raises(NotImplementedError, match="filesystem is not implemented"):
+            read_parquet(temp_file, engine="fastparquet", filesystem="foo")
+
+    def test_invalid_filesystem(self, temp_file):
+        pytest.importorskip("pyarrow")
+        df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]})
+
+        with pytest.raises(
+            ValueError, match="filesystem must be a pyarrow or fsspec FileSystem"
+        ):
+            df.to_parquet(temp_file, engine="pyarrow", filesystem="foo")
+
+        pathlib.Path(temp_file).write_bytes(b"foo")
+        with pytest.raises(
+            ValueError, match="filesystem must be a pyarrow or fsspec FileSystem"
+        ):
+            read_parquet(temp_file, engine="pyarrow", filesystem="foo")
+
+    def test_unsupported_pa_filesystem_storage_options(self, temp_file):
+        pa_fs = pytest.importorskip("pyarrow.fs")
+        df = pd.DataFrame(data={"A": [0, 1], "B": [1, 0]})
+
+        with pytest.raises(
+            NotImplementedError,
+            match="storage_options not supported with a pyarrow FileSystem.",
+        ):
+            df.to_parquet(
+                temp_file,
+                engine="pyarrow",
+                filesystem=pa_fs.LocalFileSystem(),
+                storage_options={"foo": "bar"},
+            )
+
+        pathlib.Path(temp_file).write_bytes(b"foo")
+        with pytest.raises(
+            NotImplementedError,
+            match="storage_options not supported with a pyarrow FileSystem.",
+        ):
+            read_parquet(
+                temp_file,
+                engine="pyarrow",
+                filesystem=pa_fs.LocalFileSystem(),
+                storage_options={"foo": "bar"},
+            )
+
+    def test_invalid_dtype_backend(self, engine, temp_file):
+        msg = (
+            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+            "'pyarrow' are allowed."
+        )
+        df = pd.DataFrame({"int": list(range(1, 4))})
+        df.to_parquet(temp_file)
+        with pytest.raises(ValueError, match=msg):
+            read_parquet(temp_file, dtype_backend="numpy")
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
new file mode 100644
index 0000000000000000000000000000000000000000..7754c58a88ef137825906a543a3783f2e21c71b0
--- /dev/null
+++ b/pandas/tests/io/test_pickle.py
@@ -0,0 +1,590 @@
+"""
+manage legacy pickle tests
+
+How to add pickle tests:
+
+1. Install pandas version intended to output the pickle.
+
+2. Execute "generate_legacy_storage_files.py" to create the pickle.
+$ python generate_legacy_storage_files.py <output_dir> pickle
+
+3. Move the created pickle to "data/legacy_pickle/<version>" directory.
+"""
+
+from __future__ import annotations
+
+import bz2
+import datetime
+import functools
+from functools import partial
+import gzip
+import io
+import os
+from pathlib import Path
+import pickle
+import shutil
+import tarfile
+from typing import Any
+import uuid
+import zipfile
+
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_little_endian
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    period_range,
+)
+import pandas._testing as tm
+from pandas.tests.io.generate_legacy_storage_files import create_pickle_data
+from pandas.util.version import Version
+
+import pandas.io.common as icom
+from pandas.tseries.offsets import (
+    Day,
+    MonthEnd,
+)
+
+
+# ---------------------
+# comparison functions
+# ---------------------
+def compare_element(result, expected, typ):
+    if isinstance(expected, Index):
+        tm.assert_index_equal(result, expected)
+        return
+
+    if typ.startswith("sp_"):
+        tm.assert_equal(result, expected)
+    elif typ == "timestamp":
+        if expected is pd.NaT:
+            assert result is pd.NaT
+        else:
+            assert result == expected
+    else:
+        comparator = getattr(tm, f"assert_{typ}_equal", tm.assert_almost_equal)
+        comparator(result, expected)
+
+
+# ---------------------
+# tests
+# ---------------------
+
+
+def test_pickles(datapath):
+    pytest.importorskip("pytz")
+    if not is_platform_little_endian():
+        pytest.skip("known failure on non-little endian")
+
+    current_data = create_pickle_data()
+
+    # For loop for compat with --strict-data-files
+    for legacy_pickle in Path(__file__).parent.glob("data/legacy_pickle/*/*.p*kl*"):
+        legacy_version = Version(legacy_pickle.parent.name)
+        legacy_pickle = datapath(legacy_pickle)
+
+        data = pd.read_pickle(legacy_pickle)
+
+        for typ, dv in data.items():
+            for dt, result in dv.items():
+                expected = current_data[typ][dt]
+
+                if (
+                    typ == "timestamp"
+                    and dt in ("tz", "both")
+                    and legacy_version < Version("1.3.0")
+                ):
+                    # convert to wall time
+                    # (bug since pandas 2.0 that tz gets dropped for older pickle files)
+                    expected = expected.tz_convert(None)
+
+                if legacy_version < Version("3.0.0.dev0"):
+                    # before 3.0, we had:
+                    # - object dtype instead of string
+                    # - ns instead of us as the default unit
+                    if typ in ("frame", "sp_frame"):
+                        expected.columns = expected.columns.astype("object")
+                        if dt in ("mixed", "mixed_dup"):
+                            expected["C"] = expected["C"].astype(object)
+                            expected["D"] = expected["D"].dt.as_unit("ns")
+                        elif dt in ("cat_onecol", "cat_and_float"):
+                            expected["A"] = expected["A"].astype(
+                                pd.CategoricalDtype(
+                                    expected["A"].cat.categories.astype(object)
+                                )
+                            )
+                        elif typ == "sp_frame" and dt == "float":
+                            expected.index = expected.index.as_unit("ns")
+                        elif dt == "mi":
+                            expected.index = expected.index.set_levels(
+                                [
+                                    level.astype("object")
+                                    for level in expected.index.levels
+                                ],
+                            )
+                    elif typ in ("series", "sp_series"):
+                        if dt == "ts":
+                            expected.index = expected.index.as_unit("ns")
+                        elif dt in ("dt", "dt_tz"):
+                            expected = expected.dt.as_unit("ns")
+                        elif dt == "cat":
+                            expected = expected.astype(
+                                pd.CategoricalDtype(
+                                    expected.cat.categories.astype(object)
+                                )
+                            )
+                        elif dt == "dup":
+                            expected.index = expected.index.astype(object)
+                    elif typ == "index" and dt in ("date", "timedelta"):
+                        expected = expected.as_unit("ns")
+                    elif typ == "mi":
+                        expected = expected.set_levels(
+                            [level.astype("object") for level in expected.levels],
+                        )
+                    if dt == "string":
+                        # we switched from python to pyarrow as default storage in 3.0
+                        expected = expected.astype(pd.StringDtype("python"))
+
+                if dt in ("dt_mixed_tzs", "dt_mixed2_tzs"):
+                    if legacy_version < Version("2.1"):
+                        # in pandas < 2.0, Timestamp() unit defaulted to 'ns'
+                        expected_unit = "ns"
+                    elif Version("2.1") <= legacy_version < Version("3.0.0.dev0"):
+                        # in pandas 2.x, Timestamp() unit depended on input
+                        expected_unit = "s"
+                    else:
+                        expected_unit = "us"
+                    for col in expected.columns:
+                        expected[col] = expected[col].dt.as_unit(expected_unit)
+                if typ == "index" and dt == "int" and "windows" in legacy_pickle:
+                    expected = expected.astype(np.int32)
+
+                if typ == "series" and dt == "ts":
+                    # GH 7748
+                    tm.assert_series_equal(result, expected)
+                    assert result.index.freq == expected.index.freq
+                    assert not result.index.freq.normalize
+                    tm.assert_series_equal(result > 0, expected > 0)
+
+                    # GH 9291
+                    freq = result.index.freq
+                    assert freq + Day(1) == Day(2)
+
+                    res = freq + pd.Timedelta(hours=1)
+                    assert isinstance(res, pd.Timedelta)
+                    assert res == pd.Timedelta(days=1, hours=1)
+
+                    res = freq + pd.Timedelta(nanoseconds=1)
+                    assert isinstance(res, pd.Timedelta)
+                    assert res == pd.Timedelta(days=1, nanoseconds=1)
+                elif typ == "index" and dt == "period":
+                    tm.assert_index_equal(result, expected)
+                    assert isinstance(result.freq, MonthEnd)
+                    assert result.freq == MonthEnd()
+                    assert result.freqstr == "M"
+                    tm.assert_index_equal(result.shift(2), expected.shift(2))
+                elif typ == "series" and dt in ("dt_tz", "cat"):
+                    tm.assert_series_equal(result, expected)
+                elif typ == "frame" and dt in (
+                    "dt_mixed_tzs",
+                    "cat_onecol",
+                    "cat_and_float",
+                ):
+                    tm.assert_frame_equal(result, expected)
+                else:
+                    compare_element(result, expected, typ)
+
+
+def python_pickler(obj, path):
+    with open(path, "wb") as fh:
+        pickle.dump(obj, fh, protocol=-1)
+
+
+def python_unpickler(path):
+    with open(path, "rb") as fh:
+        fh.seek(0)
+        return pickle.load(fh)
+
+
+def flatten(data: dict) -> list[tuple[str, Any]]:
+    """Flatten create_pickle_data"""
+    return [
+        (typ, example)
+        for typ, examples in data.items()
+        for example in examples.values()
+    ]
+
+
+@pytest.mark.parametrize(
+    "pickle_writer",
+    [
+        pytest.param(python_pickler, id="python"),
+        pytest.param(pd.to_pickle, id="pandas_proto_default"),
+        pytest.param(
+            functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL),
+            id="pandas_proto_highest",
+        ),
+        pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"),
+        pytest.param(
+            functools.partial(pd.to_pickle, protocol=5),
+            id="pandas_proto_5",
+        ),
+    ],
+)
+@pytest.mark.parametrize("writer", [pd.to_pickle, python_pickler])
+@pytest.mark.parametrize("typ, expected", flatten(create_pickle_data()))
+def test_round_trip_current(typ, expected, pickle_writer, writer, temp_file):
+    path = temp_file
+    # test writing with each pickler
+    pickle_writer(expected, path)
+
+    # test reading with each unpickler
+    result = pd.read_pickle(path)
+    compare_element(result, expected, typ)
+
+    result = python_unpickler(path)
+    compare_element(result, expected, typ)
+
+    # and the same for file objects (GH 35679)
+    with open(path, mode="wb") as handle:
+        writer(expected, path)
+        handle.seek(0)  # shouldn't close file handle
+    with open(path, mode="rb") as handle:
+        result = pd.read_pickle(handle)
+        handle.seek(0)  # shouldn't close file handle
+    compare_element(result, expected, typ)
+
+
+def test_pickle_path_pathlib(temp_file):
+    df = DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+    )
+    result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle, temp_file)
+    tm.assert_frame_equal(df, result)
+
+
+# ---------------------
+# test pickle compression
+# ---------------------
+
+
+@pytest.fixture
+def get_random_path():
+    return f"__{uuid.uuid4()}__.pickle"
+
+
+class TestCompression:
+    _extension_to_compression = icom.extension_to_compression
+
+    def compress_file(self, src_path, dest_path, compression):
+        if compression is None:
+            shutil.copyfile(src_path, dest_path)
+            return
+
+        if compression == "gzip":
+            f = gzip.open(dest_path, "w")
+        elif compression == "bz2":
+            f = bz2.BZ2File(dest_path, "w")
+        elif compression == "zip":
+            with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
+                f.write(src_path, os.path.basename(src_path))
+        elif compression == "tar":
+            with open(src_path, "rb") as fh:
+                with tarfile.open(dest_path, mode="w") as tar:
+                    tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path))
+                    tar.addfile(tarinfo, fh)
+        elif compression == "xz":
+            import lzma
+
+            f = lzma.LZMAFile(dest_path, "w")
+        elif compression == "zstd":
+            f = import_optional_dependency("zstandard").open(dest_path, "wb")
+        else:
+            msg = f"Unrecognized compression type: {compression}"
+            raise ValueError(msg)
+
+        if compression not in ["zip", "tar"]:
+            with open(src_path, "rb") as fh:
+                with f:
+                    f.write(fh.read())
+
+    def test_write_explicit(self, compression, get_random_path, temp_file):
+        p1 = temp_file.parent / f"{temp_file.stem}.compressed"
+        p2 = temp_file.parent / f"{temp_file.stem}.raw"
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+
+        # write to compressed file
+        df.to_pickle(p1, compression=compression)
+
+        # decompress
+        with tm.decompress_file(p1, compression=compression) as f:
+            with open(p2, "wb") as fh:
+                fh.write(f.read())
+
+        # read decompressed file
+        df2 = pd.read_pickle(p2, compression=None)
+
+        tm.assert_frame_equal(df, df2)
+
+    @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"])
+    def test_write_explicit_bad(self, compression, get_random_path, temp_file):
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+        path = temp_file
+        with pytest.raises(ValueError, match="Unrecognized compression type"):
+            df.to_pickle(path, compression=compression)
+
+    def test_write_infer(self, compression_ext, get_random_path, temp_file):
+        p1 = temp_file.parent / f"{temp_file.stem}{compression_ext}"
+        p2 = temp_file.parent / f"{temp_file.stem}.raw"
+        compression = self._extension_to_compression.get(compression_ext.lower())
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+
+        # write to compressed file by inferred compression method
+        df.to_pickle(p1)
+
+        # decompress
+        with tm.decompress_file(p1, compression=compression) as f:
+            with open(p2, "wb") as fh:
+                fh.write(f.read())
+
+        # read decompressed file
+        df2 = pd.read_pickle(p2, compression=None)
+
+        tm.assert_frame_equal(df, df2)
+
+    def test_read_explicit(self, compression, get_random_path, temp_file):
+        p1 = temp_file.parent / f"{temp_file.stem}.raw"
+        p2 = temp_file.parent / f"{temp_file.stem}.compressed"
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+
+        # write to uncompressed file
+        df.to_pickle(p1, compression=None)
+
+        # compress
+        self.compress_file(p1, p2, compression=compression)
+
+        # read compressed file
+        df2 = pd.read_pickle(p2, compression=compression)
+        tm.assert_frame_equal(df, df2)
+
+    def test_read_infer(self, compression_ext, get_random_path, temp_file):
+        p1 = temp_file.parent / f"{temp_file.stem}.raw"
+        p2 = temp_file.parent / f"{temp_file.stem}{compression_ext}"
+        compression = self._extension_to_compression.get(compression_ext.lower())
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+
+        # write to uncompressed file
+        df.to_pickle(p1, compression=None)
+
+        # compress
+        self.compress_file(p1, p2, compression=compression)
+
+        # read compressed file by inferred compression method
+        df2 = pd.read_pickle(p2)
+        tm.assert_frame_equal(df, df2)
+
+
+# ---------------------
+# test pickle compression
+# ---------------------
+
+
+class TestProtocol:
+    @pytest.mark.parametrize("protocol", [-1, 0, 1, 2])
+    def test_read(self, protocol, get_random_path, temp_file):
+        path = temp_file
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(30)], dtype=object),
+        )
+        df.to_pickle(path, protocol=protocol)
+        df2 = pd.read_pickle(path)
+        tm.assert_frame_equal(df, df2)
+
+
+def test_pickle_buffer_roundtrip(temp_file):
+    path = temp_file
+    df = DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+    )
+    with open(path, "wb") as fh:
+        df.to_pickle(fh)
+    with open(path, "rb") as fh:
+        result = pd.read_pickle(fh)
+    tm.assert_frame_equal(df, result)
+
+
+def test_pickle_fsspec_roundtrip(temp_file):
+    pytest.importorskip("fsspec")
+    # Using temp_file for context, but fsspec uses memory URL
+    mockurl = "memory://mockfile"
+    df = DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+    )
+    df.to_pickle(mockurl)
+    result = pd.read_pickle(mockurl)
+    tm.assert_frame_equal(df, result)
+
+
+class MyTz(datetime.tzinfo):
+    def __init__(self) -> None:
+        pass
+
+
+def test_read_pickle_with_subclass(temp_file):
+    # GH 12163
+    expected = Series(dtype=object), MyTz()
+    result = tm.round_trip_pickle(expected, temp_file)
+
+    tm.assert_series_equal(result[0], expected[0])
+    assert isinstance(result[1], MyTz)
+
+
+def test_pickle_binary_object_compression(compression, temp_file):
+    """
+    Read/write from binary file-objects w/wo compression.
+
+    GH 26237, GH 29054, and GH 29570
+    """
+    df = DataFrame(
+        1.1 * np.arange(120).reshape((30, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=Index([f"i-{i}" for i in range(30)], dtype=object),
+    )
+
+    # reference for compression
+    path = temp_file
+    df.to_pickle(path, compression=compression)
+    reference = path.read_bytes()
+
+    # write
+    buffer = io.BytesIO()
+    df.to_pickle(buffer, compression=compression)
+    buffer.seek(0)
+
+    # gzip  and zip safe the filename: cannot compare the compressed content
+    assert buffer.getvalue() == reference or compression in ("gzip", "zip", "tar")
+
+    # read
+    read_df = pd.read_pickle(buffer, compression=compression)
+    buffer.seek(0)
+    tm.assert_frame_equal(df, read_df)
+
+
+def test_pickle_dataframe_with_multilevel_index(
+    multiindex_year_month_day_dataframe_random_data,
+    multiindex_dataframe_random_data,
+    temp_file,
+):
+    ymd = multiindex_year_month_day_dataframe_random_data
+    frame = multiindex_dataframe_random_data
+
+    def _test_roundtrip(frame, temp_file):
+        unpickled = tm.round_trip_pickle(frame, temp_file)
+        tm.assert_frame_equal(frame, unpickled)
+
+    _test_roundtrip(frame, temp_file)
+    _test_roundtrip(frame.T, temp_file)
+    _test_roundtrip(ymd, temp_file)
+    _test_roundtrip(ymd.T, temp_file)
+
+
+def test_pickle_timeseries_periodindex(temp_file):
+    # GH#2891
+    prng = period_range("1/1/2011", "1/1/2012", freq="M")
+    ts = Series(np.random.default_rng(2).standard_normal(len(prng)), prng)
+    new_ts = tm.round_trip_pickle(ts, temp_file)
+    assert new_ts.index.freqstr == "M"
+
+
+@pytest.mark.parametrize(
+    "name", [777, 777.0, "name", datetime.datetime(2001, 11, 11), (1, 2)]
+)
+def test_pickle_preserve_name(name, temp_file):
+    unpickled = tm.round_trip_pickle(
+        Series(np.arange(10, dtype=np.float64), name=name), temp_file
+    )
+    assert unpickled.name == name
+
+
+def test_pickle_datetimes(datetime_series, temp_file):
+    unp_ts = tm.round_trip_pickle(datetime_series, temp_file)
+    tm.assert_series_equal(unp_ts, datetime_series)
+
+
+def test_pickle_strings(string_series, temp_file):
+    unp_series = tm.round_trip_pickle(string_series, temp_file)
+    tm.assert_series_equal(unp_series, string_series)
+
+
+def test_pickle_preserves_block_ndim(temp_file):
+    # GH#37631
+    ser = Series(list("abc")).astype("category").iloc[[0]]
+    res = tm.round_trip_pickle(ser, temp_file)
+
+    assert res._mgr.blocks[0].ndim == 1
+    assert res._mgr.blocks[0].shape == (1,)
+
+    # GH#37631 OP issue was about indexing, underlying problem was pickle
+    tm.assert_series_equal(res[[True]], ser)
+
+
+@pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL])
+def test_pickle_big_dataframe_compression(protocol, compression, temp_file):
+    # GH#39002
+    df = DataFrame(range(100000))
+    result = tm.round_trip_pathlib(
+        partial(df.to_pickle, protocol=protocol, compression=compression),
+        partial(pd.read_pickle, compression=compression),
+        temp_file,
+    )
+    tm.assert_frame_equal(df, result)
+
+
+def test_pickle_frame_v124_unpickle_130(datapath):
+    # GH#42345 DataFrame created in 1.2.x, unpickle in 1.3.x
+    path = datapath(
+        Path(__file__).parent,
+        "data",
+        "legacy_pickle",
+        "1.2.4",
+        "empty_frame_v1_2_4-GH#42345.pkl",
+    )
+    with open(path, "rb") as fd:
+        df = pickle.load(fd)
+
+    expected = DataFrame(index=[], columns=[])
+    tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py
new file mode 100644
index 0000000000000000000000000000000000000000..31d22223b0a33218db58120cea1db393827ff699
--- /dev/null
+++ b/pandas/tests/io/test_s3.py
@@ -0,0 +1,33 @@
+from io import BytesIO
+
+import pytest
+
+from pandas import read_csv
+
+
+@pytest.mark.parametrize("data", [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"])
+def test_streaming_s3_objects(data):
+    # GH 17135
+    # botocore gained iteration support in 1.10.47, can now be used in read_*
+    pytest.importorskip("botocore", minversion="1.10.47")
+    from botocore.response import StreamingBody
+
+    body = StreamingBody(BytesIO(data), content_length=len(data))
+    read_csv(body)
+
+
+@pytest.mark.single_cpu
+@pytest.mark.parametrize("header", ["infer", None])
+def test_read_with_and_without_creds_from_pub_bucket(
+    s3_bucket_public_with_data, s3so, header
+):
+    # GH 34626
+    pytest.importorskip("s3fs")
+    nrows = 5
+    df = read_csv(
+        f"s3://{s3_bucket_public_with_data.name}/tips.csv",
+        nrows=nrows,
+        header=header,
+        storage_options=s3so,
+    )
+    assert len(df) == nrows
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
new file mode 100644
index 0000000000000000000000000000000000000000..6210c0289a160e0ec853cb810dd80e404e11c37e
--- /dev/null
+++ b/pandas/tests/io/test_spss.py
@@ -0,0 +1,169 @@
+import datetime
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+pyreadstat = pytest.importorskip("pyreadstat")
+
+
+# TODO(CoW) - detection of chained assignment in cython
+# https://github.com/pandas-dev/pandas/issues/51315
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+@pytest.mark.parametrize("path_klass", [lambda p: p, Path])
+def test_spss_labelled_num(path_klass, datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = path_klass(datapath("io", "data", "spss", "labelled-num.sav"))
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0])
+    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"VAR00002": 1.0}, index=[0])
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_labelled_num_na(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = datapath("io", "data", "spss", "labelled-num-na.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"VAR00002": ["This is one", None]})
+    expected["VAR00002"] = pd.Categorical(expected["VAR00002"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"VAR00002": [1.0, np.nan]})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_labelled_str(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = datapath("io", "data", "spss", "labelled-str.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame({"gender": ["Male", "Female"]})
+    expected["gender"] = pd.Categorical(expected["gender"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"gender": ["M", "F"]})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_kwargs(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = datapath("io", "data", "spss", "labelled-str.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True, row_limit=1)
+    expected = pd.DataFrame({"gender": ["Male"]}, dtype="category")
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False, row_offset=1)
+    expected = pd.DataFrame({"gender": ["F"]})
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_umlauts(datapath):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = datapath("io", "data", "spss", "umlauts.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=True)
+    expected = pd.DataFrame(
+        {"var1": ["the ä umlaut", "the ü umlaut", "the ä umlaut", "the ö umlaut"]}
+    )
+    expected["var1"] = pd.Categorical(expected["var1"])
+    tm.assert_frame_equal(df, expected)
+
+    df = pd.read_spss(fname, convert_categoricals=False)
+    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_spss_usecols(datapath):
+    # usecols must be list-like
+    fname = datapath("io", "data", "spss", "labelled-num.sav")
+
+    with pytest.raises(TypeError, match="usecols must be list-like."):
+        pd.read_spss(fname, usecols="VAR00002")
+
+
+def test_spss_umlauts_dtype_backend(datapath, dtype_backend):
+    # test file from the Haven project (https://haven.tidyverse.org/)
+    # Licence at LICENSES/HAVEN_LICENSE, LICENSES/HAVEN_MIT
+    fname = datapath("io", "data", "spss", "umlauts.sav")
+
+    df = pd.read_spss(fname, convert_categoricals=False, dtype_backend=dtype_backend)
+    expected = pd.DataFrame({"var1": [1.0, 2.0, 1.0, 3.0]}, dtype="Int64")
+
+    if dtype_backend == "pyarrow":
+        pa = pytest.importorskip("pyarrow")
+
+        from pandas.arrays import ArrowExtensionArray
+
+        expected = pd.DataFrame(
+            {
+                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                for col in expected.columns
+            }
+        )
+
+    tm.assert_frame_equal(df, expected)
+
+
+def test_invalid_dtype_backend():
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    with pytest.raises(ValueError, match=msg):
+        pd.read_spss("test", dtype_backend="numpy")
+
+
+@pytest.mark.filterwarnings("ignore::pandas.errors.ChainedAssignmentError")
+def test_spss_metadata(datapath):
+    # GH 54264
+    fname = datapath("io", "data", "spss", "labelled-num.sav")
+
+    df = pd.read_spss(fname)
+    metadata = {
+        "column_names": ["VAR00002"],
+        "column_labels": [None],
+        "column_names_to_labels": {"VAR00002": None},
+        "file_encoding": "UTF-8",
+        "number_columns": 1,
+        "number_rows": 1,
+        "variable_value_labels": {"VAR00002": {1.0: "This is one"}},
+        "value_labels": {"labels0": {1.0: "This is one"}},
+        "variable_to_label": {"VAR00002": "labels0"},
+        "notes": [],
+        "original_variable_types": {"VAR00002": "F8.0"},
+        "readstat_variable_types": {"VAR00002": "double"},
+        "table_name": None,
+        "missing_ranges": {},
+        "missing_user_values": {},
+        "variable_storage_width": {"VAR00002": 8},
+        "variable_display_width": {"VAR00002": 8},
+        "variable_alignment": {"VAR00002": "unknown"},
+        "variable_measure": {"VAR00002": "unknown"},
+        "file_label": None,
+        "file_format": "sav/zsav",
+        "creation_time": datetime.datetime(2015, 2, 6, 14, 33, 36),
+        "modification_time": datetime.datetime(2015, 2, 6, 14, 33, 36),
+        "mr_sets": {},
+    }
+    tm.assert_dict_equal(df.attrs, metadata)
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0e06b204379986e0af0b5edcde4182a091e5fb9
--- /dev/null
+++ b/pandas/tests/io/test_sql.py
@@ -0,0 +1,4398 @@
+from __future__ import annotations
+
+import contextlib
+import csv
+from datetime import (
+    date,
+    datetime,
+    time,
+    timedelta,
+)
+from decimal import Decimal
+from io import StringIO
+from pathlib import Path
+import sqlite3
+from typing import TYPE_CHECKING
+import uuid
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+from pandas._libs import lib
+from pandas.compat import pa_version_under14p1
+from pandas.compat._optional import import_optional_dependency
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    concat,
+    date_range,
+    isna,
+    to_datetime,
+    to_timedelta,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+
+from pandas.io import sql
+from pandas.io.sql import (
+    SQLAlchemyEngine,
+    SQLDatabase,
+    SQLiteDatabase,
+    get_engine,
+    pandasSQL_builder,
+    read_sql_query,
+    read_sql_table,
+)
+
+if TYPE_CHECKING:
+    import sqlalchemy
+
+
+pytestmark = [
+    pytest.mark.filterwarnings(
+        "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
+    ),
+    pytest.mark.single_cpu,
+]
+
+
+@pytest.fixture
+def sql_strings():
+    return {
+        "read_parameters": {
+            "sqlite": "SELECT * FROM iris WHERE Name=? AND SepalLength=?",
+            "mysql": "SELECT * FROM iris WHERE `Name`=%s AND `SepalLength`=%s",
+            "postgresql": 'SELECT * FROM iris WHERE "Name"=%s AND "SepalLength"=%s',
+        },
+        "read_named_parameters": {
+            "sqlite": """
+                SELECT * FROM iris WHERE Name=:name AND SepalLength=:length
+                """,
+            "mysql": """
+                SELECT * FROM iris WHERE
+                `Name`=%(name)s AND `SepalLength`=%(length)s
+                """,
+            "postgresql": """
+                SELECT * FROM iris WHERE
+                "Name"=%(name)s AND "SepalLength"=%(length)s
+                """,
+        },
+        "read_no_parameters_with_percent": {
+            "sqlite": "SELECT * FROM iris WHERE Name LIKE '%'",
+            "mysql": "SELECT * FROM iris WHERE `Name` LIKE '%'",
+            "postgresql": "SELECT * FROM iris WHERE \"Name\" LIKE '%'",
+        },
+    }
+
+
+def iris_table_metadata():
+    import sqlalchemy
+    from sqlalchemy import (
+        Column,
+        Double,
+        Float,
+        MetaData,
+        String,
+        Table,
+    )
+
+    dtype = Double if Version(sqlalchemy.__version__) >= Version("2.0.0") else Float
+    metadata = MetaData()
+    iris = Table(
+        "iris",
+        metadata,
+        Column("SepalLength", dtype),
+        Column("SepalWidth", dtype),
+        Column("PetalLength", dtype),
+        Column("PetalWidth", dtype),
+        Column("Name", String(200)),
+    )
+    return iris
+
+
+def create_and_load_iris_sqlite3(conn, iris_file: Path):
+    stmt = """CREATE TABLE iris (
+            "SepalLength" REAL,
+            "SepalWidth" REAL,
+            "PetalLength" REAL,
+            "PetalWidth" REAL,
+            "Name" TEXT
+        )"""
+
+    cur = conn.cursor()
+    cur.execute(stmt)
+    with iris_file.open(newline=None, encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader)
+        stmt = "INSERT INTO iris VALUES(?, ?, ?, ?, ?)"
+        # ADBC requires explicit types - no implicit str -> float conversion
+        records = []
+        records = [
+            (
+                float(row[0]),
+                float(row[1]),
+                float(row[2]),
+                float(row[3]),
+                row[4],
+            )
+            for row in reader
+        ]
+
+        cur.executemany(stmt, records)
+    cur.close()
+
+    conn.commit()
+
+
+def create_and_load_iris_postgresql(conn, iris_file: Path):
+    stmt = """CREATE TABLE iris (
+            "SepalLength" DOUBLE PRECISION,
+            "SepalWidth" DOUBLE PRECISION,
+            "PetalLength" DOUBLE PRECISION,
+            "PetalWidth" DOUBLE PRECISION,
+            "Name" TEXT
+        )"""
+    with conn.cursor() as cur:
+        cur.execute(stmt)
+        with iris_file.open(newline=None, encoding="utf-8") as csvfile:
+            reader = csv.reader(csvfile)
+            next(reader)
+            stmt = "INSERT INTO iris VALUES($1, $2, $3, $4, $5)"
+            # ADBC requires explicit types - no implicit str -> float conversion
+            records = [
+                (
+                    float(row[0]),
+                    float(row[1]),
+                    float(row[2]),
+                    float(row[3]),
+                    row[4],
+                )
+                for row in reader
+            ]
+
+            cur.executemany(stmt, records)
+
+    conn.commit()
+
+
+def create_and_load_iris(conn, iris_file: Path):
+    from sqlalchemy import insert
+
+    iris = iris_table_metadata()
+
+    with iris_file.open(newline=None, encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        header = next(reader)
+        params = [dict(zip(header, row)) for row in reader]
+        stmt = insert(iris).values(params)
+        with conn.begin() as con:
+            iris.drop(con, checkfirst=True)
+            iris.create(bind=con)
+            con.execute(stmt)
+
+
+def create_and_load_iris_view(conn):
+    stmt = "CREATE VIEW iris_view AS SELECT * FROM iris"
+    if isinstance(conn, sqlite3.Connection):
+        cur = conn.cursor()
+        cur.execute(stmt)
+    else:
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+        if adbc and isinstance(conn, adbc.Connection):
+            with conn.cursor() as cur:
+                cur.execute(stmt)
+            conn.commit()
+        else:
+            from sqlalchemy import text
+
+            stmt = text(stmt)
+            with conn.begin() as con:
+                con.execute(stmt)
+
+
+def types_table_metadata(dialect: str):
+    from sqlalchemy import (
+        TEXT,
+        Boolean,
+        Column,
+        DateTime,
+        Float,
+        Integer,
+        MetaData,
+        Table,
+    )
+
+    date_type = TEXT if dialect == "sqlite" else DateTime
+    bool_type = Integer if dialect == "sqlite" else Boolean
+    metadata = MetaData()
+    types = Table(
+        "types",
+        metadata,
+        Column("TextCol", TEXT),
+        # error: Cannot infer type argument 1 of "Column"
+        Column("DateCol", date_type),  # type: ignore[misc]
+        Column("IntDateCol", Integer),
+        Column("IntDateOnlyCol", Integer),
+        Column("FloatCol", Float),
+        Column("IntCol", Integer),
+        # error: Cannot infer type argument 1 of "Column"
+        Column("BoolCol", bool_type),  # type: ignore[misc]
+        Column("IntColWithNull", Integer),
+        # error: Cannot infer type argument 1 of "Column"
+        Column("BoolColWithNull", bool_type),  # type: ignore[misc]
+    )
+    return types
+
+
+def create_and_load_types_sqlite3(conn, types_data: list[dict]):
+    stmt = """CREATE TABLE types (
+                    "TextCol" TEXT,
+                    "DateCol" TEXT,
+                    "IntDateCol" INTEGER,
+                    "IntDateOnlyCol" INTEGER,
+                    "FloatCol" REAL,
+                    "IntCol" INTEGER,
+                    "BoolCol" INTEGER,
+                    "IntColWithNull" INTEGER,
+                    "BoolColWithNull" INTEGER
+                )"""
+
+    ins_stmt = """
+                INSERT INTO types
+                VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """
+
+    if isinstance(conn, sqlite3.Connection):
+        cur = conn.cursor()
+        cur.execute(stmt)
+        cur.executemany(ins_stmt, types_data)
+    else:
+        with conn.cursor() as cur:
+            cur.execute(stmt)
+            cur.executemany(ins_stmt, types_data)
+
+        conn.commit()
+
+
+def create_and_load_types_postgresql(conn, types_data: list[dict]):
+    with conn.cursor() as cur:
+        stmt = """CREATE TABLE types (
+                        "TextCol" TEXT,
+                        "DateCol" TIMESTAMP,
+                        "IntDateCol" INTEGER,
+                        "IntDateOnlyCol" INTEGER,
+                        "FloatCol" DOUBLE PRECISION,
+                        "IntCol" INTEGER,
+                        "BoolCol" BOOLEAN,
+                        "IntColWithNull" INTEGER,
+                        "BoolColWithNull" BOOLEAN
+                    )"""
+        cur.execute(stmt)
+
+        stmt = """
+                INSERT INTO types
+                VALUES($1, $2::timestamp, $3, $4, $5, $6, $7, $8, $9)
+                """
+
+        cur.executemany(stmt, types_data)
+
+    conn.commit()
+
+
+def create_and_load_types(conn, types_data: list[dict], dialect: str):
+    from sqlalchemy import insert
+    from sqlalchemy.engine import Engine
+
+    types = types_table_metadata(dialect)
+
+    stmt = insert(types).values(types_data)
+    if isinstance(conn, Engine):
+        with conn.connect() as conn:
+            with conn.begin():
+                types.drop(conn, checkfirst=True)
+                types.create(bind=conn)
+                conn.execute(stmt)
+    else:
+        with conn.begin():
+            types.drop(conn, checkfirst=True)
+            types.create(bind=conn)
+            conn.execute(stmt)
+
+
+def create_and_load_postgres_datetz(conn):
+    from sqlalchemy import (
+        Column,
+        DateTime,
+        MetaData,
+        Table,
+        insert,
+    )
+    from sqlalchemy.engine import Engine
+
+    metadata = MetaData()
+    datetz = Table("datetz", metadata, Column("DateColWithTz", DateTime(timezone=True)))
+    datetz_data = [
+        {
+            "DateColWithTz": "2000-01-01 00:00:00-08:00",
+        },
+        {
+            "DateColWithTz": "2000-06-01 00:00:00-07:00",
+        },
+    ]
+    stmt = insert(datetz).values(datetz_data)
+    if isinstance(conn, Engine):
+        with conn.connect() as conn:
+            with conn.begin():
+                datetz.drop(conn, checkfirst=True)
+                datetz.create(bind=conn)
+                conn.execute(stmt)
+    else:
+        with conn.begin():
+            datetz.drop(conn, checkfirst=True)
+            datetz.create(bind=conn)
+            conn.execute(stmt)
+
+    # "2000-01-01 00:00:00-08:00" should convert to
+    # "2000-01-01 08:00:00"
+    # "2000-06-01 00:00:00-07:00" should convert to
+    # "2000-06-01 07:00:00"
+    # GH 6415
+    expected_data = [
+        Timestamp("2000-01-01 08:00:00", tz="UTC"),
+        Timestamp("2000-06-01 07:00:00", tz="UTC"),
+    ]
+    return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]")
+
+
+def check_iris_frame(frame: DataFrame):
+    pytype = frame.dtypes.iloc[0].type
+    row = frame.iloc[0]
+    assert issubclass(pytype, np.floating)
+    tm.assert_series_equal(
+        row, Series([5.1, 3.5, 1.4, 0.2, "Iris-setosa"], index=frame.columns, name=0)
+    )
+    assert frame.shape in ((150, 5), (8, 5))
+
+
+def count_rows(conn, table_name: str):
+    stmt = f"SELECT count(*) AS count_1 FROM {table_name}"
+    adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+    if isinstance(conn, sqlite3.Connection):
+        cur = conn.cursor()
+        return cur.execute(stmt).fetchone()[0]
+    elif adbc and isinstance(conn, adbc.Connection):
+        with conn.cursor() as cur:
+            cur.execute(stmt)
+            return cur.fetchone()[0]
+    else:
+        from sqlalchemy import create_engine
+        from sqlalchemy.engine import Engine
+
+        if isinstance(conn, str):
+            try:
+                engine = create_engine(conn)
+                with engine.connect() as conn:
+                    return conn.exec_driver_sql(stmt).scalar_one()
+            finally:
+                engine.dispose()
+        elif isinstance(conn, Engine):
+            with conn.connect() as conn:
+                return conn.exec_driver_sql(stmt).scalar_one()
+        else:
+            return conn.exec_driver_sql(stmt).scalar_one()
+
+
+@pytest.fixture
+def iris_path(datapath):
+    iris_path = datapath("io", "data", "csv", "iris.csv")
+    return Path(iris_path)
+
+
+@pytest.fixture
+def types_data():
+    return [
+        {
+            "TextCol": "first",
+            "DateCol": "2000-01-03 00:00:00",
+            "IntDateCol": 535852800,
+            "IntDateOnlyCol": 20101010,
+            "FloatCol": 10.10,
+            "IntCol": 1,
+            "BoolCol": False,
+            "IntColWithNull": 1,
+            "BoolColWithNull": False,
+        },
+        {
+            "TextCol": "first",
+            "DateCol": "2000-01-04 00:00:00",
+            "IntDateCol": 1356998400,
+            "IntDateOnlyCol": 20101212,
+            "FloatCol": 10.10,
+            "IntCol": 1,
+            "BoolCol": False,
+            "IntColWithNull": None,
+            "BoolColWithNull": None,
+        },
+    ]
+
+
+@pytest.fixture
+def types_data_frame(types_data):
+    dtypes = {
+        "TextCol": "str",
+        "DateCol": "str",
+        "IntDateCol": "int64",
+        "IntDateOnlyCol": "int64",
+        "FloatCol": "float",
+        "IntCol": "int64",
+        "BoolCol": "int64",
+        "IntColWithNull": "float",
+        "BoolColWithNull": "float",
+    }
+    df = DataFrame(types_data)
+    return df[dtypes.keys()].astype(dtypes)
+
+
+@pytest.fixture
+def test_frame1():
+    columns = ["index", "A", "B", "C", "D"]
+    data = [
+        (
+            "2000-01-03 00:00:00",
+            0.980268513777,
+            3.68573087906,
+            -0.364216805298,
+            -1.15973806169,
+        ),
+        (
+            "2000-01-04 00:00:00",
+            1.04791624281,
+            -0.0412318367011,
+            -0.16181208307,
+            0.212549316967,
+        ),
+        (
+            "2000-01-05 00:00:00",
+            0.498580885705,
+            0.731167677815,
+            -0.537677223318,
+            1.34627041952,
+        ),
+        (
+            "2000-01-06 00:00:00",
+            1.12020151869,
+            1.56762092543,
+            0.00364077397681,
+            0.67525259227,
+        ),
+    ]
+    return DataFrame(data, columns=columns)
+
+
+@pytest.fixture
+def test_frame3():
+    columns = ["index", "A", "B"]
+    data = [
+        ("2000-01-03 00:00:00", 2**31 - 1, -1.987670),
+        ("2000-01-04 00:00:00", -29, -0.0412318367011),
+        ("2000-01-05 00:00:00", 20000, 0.731167677815),
+        ("2000-01-06 00:00:00", -290867, 1.56762092543),
+    ]
+    return DataFrame(data, columns=columns)
+
+
+def get_all_views(conn):
+    if isinstance(conn, sqlite3.Connection):
+        c = conn.execute("SELECT name FROM sqlite_master WHERE type='view'")
+        return [view[0] for view in c.fetchall()]
+    else:
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+        if adbc and isinstance(conn, adbc.Connection):
+            results = []
+            info = conn.adbc_get_objects().read_all().to_pylist()
+            for catalog in info:
+                catalog["catalog_name"]
+                for schema in catalog["catalog_db_schemas"]:
+                    schema["db_schema_name"]
+                    for table in schema["db_schema_tables"]:
+                        if table["table_type"] == "view":
+                            view_name = table["table_name"]
+                            results.append(view_name)
+
+            return results
+        else:
+            from sqlalchemy import inspect
+
+            return inspect(conn).get_view_names()
+
+
+def get_all_tables(conn):
+    if isinstance(conn, sqlite3.Connection):
+        c = conn.execute("SELECT name FROM sqlite_master WHERE type='table'")
+        return [table[0] for table in c.fetchall()]
+    else:
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+
+        if adbc and isinstance(conn, adbc.Connection):
+            results = []
+            info = conn.adbc_get_objects().read_all().to_pylist()
+            for catalog in info:
+                for schema in catalog["catalog_db_schemas"]:
+                    for table in schema["db_schema_tables"]:
+                        if table["table_type"] == "table":
+                            table_name = table["table_name"]
+                            results.append(table_name)
+
+            return results
+        else:
+            from sqlalchemy import inspect
+
+            return inspect(conn).get_table_names()
+
+
+def drop_table(
+    table_name: str,
+    conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection,
+):
+    if isinstance(conn, sqlite3.Connection):
+        conn.execute(f"DROP TABLE IF EXISTS {sql._get_valid_sqlite_name(table_name)}")
+        conn.commit()
+
+    else:
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+        if adbc and isinstance(conn, adbc.Connection):
+            with conn.cursor() as cur:
+                cur.execute(f'DROP TABLE IF EXISTS "{table_name}"')
+        else:
+            with conn.begin() as con:
+                with sql.SQLDatabase(con) as db:
+                    db.drop_table(table_name)
+
+
+def drop_view(
+    view_name: str,
+    conn: sqlite3.Connection | sqlalchemy.engine.Engine | sqlalchemy.engine.Connection,
+):
+    import sqlalchemy
+
+    if isinstance(conn, sqlite3.Connection):
+        conn.execute(f"DROP VIEW IF EXISTS {sql._get_valid_sqlite_name(view_name)}")
+        conn.commit()
+    else:
+        adbc = import_optional_dependency("adbc_driver_manager.dbapi", errors="ignore")
+        if adbc and isinstance(conn, adbc.Connection):
+            with conn.cursor() as cur:
+                cur.execute(f'DROP VIEW IF EXISTS "{view_name}"')
+        else:
+            quoted_view = conn.engine.dialect.identifier_preparer.quote_identifier(
+                view_name
+            )
+            stmt = sqlalchemy.text(f"DROP VIEW IF EXISTS {quoted_view}")
+            with conn.begin() as con:
+                con.execute(stmt)  # type: ignore[union-attr]
+
+
+@pytest.fixture
+def mysql_pymysql_engine():
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+    pymysql = pytest.importorskip("pymysql")
+    engine = sqlalchemy.create_engine(
+        "mysql+pymysql://root@localhost:3306/pandas",
+        connect_args={"client_flag": pymysql.constants.CLIENT.MULTI_STATEMENTS},
+        poolclass=sqlalchemy.pool.NullPool,
+    )
+    yield engine
+    for view in get_all_views(engine):
+        drop_view(view, engine)
+    for tbl in get_all_tables(engine):
+        drop_table(tbl, engine)
+    engine.dispose()
+
+
+@pytest.fixture
+def mysql_pymysql_engine_iris(mysql_pymysql_engine, iris_path):
+    create_and_load_iris(mysql_pymysql_engine, iris_path)
+    create_and_load_iris_view(mysql_pymysql_engine)
+    return mysql_pymysql_engine
+
+
+@pytest.fixture
+def mysql_pymysql_engine_types(mysql_pymysql_engine, types_data):
+    create_and_load_types(mysql_pymysql_engine, types_data, "mysql")
+    return mysql_pymysql_engine
+
+
+@pytest.fixture
+def mysql_pymysql_conn(mysql_pymysql_engine):
+    with mysql_pymysql_engine.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def mysql_pymysql_conn_iris(mysql_pymysql_engine_iris):
+    with mysql_pymysql_engine_iris.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def mysql_pymysql_conn_types(mysql_pymysql_engine_types):
+    with mysql_pymysql_engine_types.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def postgresql_psycopg2_engine():
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+    pytest.importorskip("psycopg2")
+    engine = sqlalchemy.create_engine(
+        "postgresql+psycopg2://postgres:postgres@localhost:5432/pandas",
+        poolclass=sqlalchemy.pool.NullPool,
+    )
+    yield engine
+    for view in get_all_views(engine):
+        drop_view(view, engine)
+    for tbl in get_all_tables(engine):
+        drop_table(tbl, engine)
+    engine.dispose()
+
+
+@pytest.fixture
+def postgresql_psycopg2_engine_iris(postgresql_psycopg2_engine, iris_path):
+    create_and_load_iris(postgresql_psycopg2_engine, iris_path)
+    create_and_load_iris_view(postgresql_psycopg2_engine)
+    return postgresql_psycopg2_engine
+
+
+@pytest.fixture
+def postgresql_psycopg2_engine_types(postgresql_psycopg2_engine, types_data):
+    create_and_load_types(postgresql_psycopg2_engine, types_data, "postgres")
+    return postgresql_psycopg2_engine
+
+
+@pytest.fixture
+def postgresql_psycopg2_conn(postgresql_psycopg2_engine):
+    with postgresql_psycopg2_engine.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def postgresql_adbc_conn():
+    pytest.importorskip("pyarrow")
+    pytest.importorskip("adbc_driver_postgresql")
+    from adbc_driver_postgresql import dbapi
+
+    uri = "postgresql://postgres:postgres@localhost:5432/pandas"
+    with dbapi.connect(uri) as conn:
+        yield conn
+        for view in get_all_views(conn):
+            drop_view(view, conn)
+        for tbl in get_all_tables(conn):
+            drop_table(tbl, conn)
+        conn.commit()
+
+
+@pytest.fixture
+def postgresql_adbc_iris(postgresql_adbc_conn, iris_path):
+    import adbc_driver_manager as mgr
+
+    conn = postgresql_adbc_conn
+
+    try:
+        conn.adbc_get_table_schema("iris")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        create_and_load_iris_postgresql(conn, iris_path)
+    try:
+        conn.adbc_get_table_schema("iris_view")
+    except mgr.ProgrammingError:  # note arrow-adbc issue 1022
+        conn.rollback()
+        create_and_load_iris_view(conn)
+    return conn
+
+
+@pytest.fixture
+def postgresql_adbc_types(postgresql_adbc_conn, types_data):
+    import adbc_driver_manager as mgr
+
+    conn = postgresql_adbc_conn
+
+    try:
+        conn.adbc_get_table_schema("types")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        new_data = [tuple(entry.values()) for entry in types_data]
+
+        create_and_load_types_postgresql(conn, new_data)
+
+    return conn
+
+
+@pytest.fixture
+def postgresql_psycopg2_conn_iris(postgresql_psycopg2_engine_iris):
+    with postgresql_psycopg2_engine_iris.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def postgresql_psycopg2_conn_types(postgresql_psycopg2_engine_types):
+    with postgresql_psycopg2_engine_types.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def sqlite_str(temp_file):
+    pytest.importorskip("sqlalchemy")
+    return f"sqlite:///{temp_file}"
+
+
+@pytest.fixture
+def sqlite_engine(sqlite_str):
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+    engine = sqlalchemy.create_engine(sqlite_str, poolclass=sqlalchemy.pool.NullPool)
+    yield engine
+    for view in get_all_views(engine):
+        drop_view(view, engine)
+    for tbl in get_all_tables(engine):
+        drop_table(tbl, engine)
+    engine.dispose()
+
+
+@pytest.fixture
+def sqlite_conn(sqlite_engine):
+    with sqlite_engine.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def sqlite_str_iris(sqlite_str, iris_path):
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+    engine = sqlalchemy.create_engine(sqlite_str)
+    create_and_load_iris(engine, iris_path)
+    create_and_load_iris_view(engine)
+    engine.dispose()
+    return sqlite_str
+
+
+@pytest.fixture
+def sqlite_engine_iris(sqlite_engine, iris_path):
+    create_and_load_iris(sqlite_engine, iris_path)
+    create_and_load_iris_view(sqlite_engine)
+    return sqlite_engine
+
+
+@pytest.fixture
+def sqlite_conn_iris(sqlite_engine_iris):
+    with sqlite_engine_iris.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def sqlite_str_types(sqlite_str, types_data):
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+    engine = sqlalchemy.create_engine(sqlite_str)
+    create_and_load_types(engine, types_data, "sqlite")
+    engine.dispose()
+    return sqlite_str
+
+
+@pytest.fixture
+def sqlite_engine_types(sqlite_engine, types_data):
+    create_and_load_types(sqlite_engine, types_data, "sqlite")
+    return sqlite_engine
+
+
+@pytest.fixture
+def sqlite_conn_types(sqlite_engine_types):
+    with sqlite_engine_types.connect() as conn:
+        yield conn
+
+
+@pytest.fixture
+def sqlite_adbc_conn(temp_file):
+    pytest.importorskip("pyarrow")
+    pytest.importorskip("adbc_driver_sqlite")
+    from adbc_driver_sqlite import dbapi
+
+    uri = f"file:{temp_file}"
+    with dbapi.connect(uri) as conn:
+        yield conn
+        for view in get_all_views(conn):
+            drop_view(view, conn)
+        for tbl in get_all_tables(conn):
+            drop_table(tbl, conn)
+        conn.commit()
+
+
+@pytest.fixture
+def sqlite_adbc_iris(sqlite_adbc_conn, iris_path):
+    import adbc_driver_manager as mgr
+
+    conn = sqlite_adbc_conn
+    try:
+        conn.adbc_get_table_schema("iris")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        create_and_load_iris_sqlite3(conn, iris_path)
+    try:
+        conn.adbc_get_table_schema("iris_view")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        create_and_load_iris_view(conn)
+    return conn
+
+
+@pytest.fixture
+def sqlite_adbc_types(sqlite_adbc_conn, types_data):
+    import adbc_driver_manager as mgr
+
+    conn = sqlite_adbc_conn
+    try:
+        conn.adbc_get_table_schema("types")
+    except mgr.ProgrammingError:
+        conn.rollback()
+        new_data = []
+        for entry in types_data:
+            entry["BoolCol"] = int(entry["BoolCol"])
+            if entry["BoolColWithNull"] is not None:
+                entry["BoolColWithNull"] = int(entry["BoolColWithNull"])
+            new_data.append(tuple(entry.values()))
+
+        create_and_load_types_sqlite3(conn, new_data)
+        conn.commit()
+
+    return conn
+
+
+@pytest.fixture
+def sqlite_buildin():
+    with contextlib.closing(sqlite3.connect(":memory:")) as closing_conn:
+        with closing_conn as conn:
+            yield conn
+
+
+@pytest.fixture
+def sqlite_buildin_iris(sqlite_buildin, iris_path):
+    create_and_load_iris_sqlite3(sqlite_buildin, iris_path)
+    create_and_load_iris_view(sqlite_buildin)
+    return sqlite_buildin
+
+
+@pytest.fixture
+def sqlite_buildin_types(sqlite_buildin, types_data):
+    types_data = [tuple(entry.values()) for entry in types_data]
+    create_and_load_types_sqlite3(sqlite_buildin, types_data)
+    return sqlite_buildin
+
+
+mysql_connectable = [
+    pytest.param("mysql_pymysql_engine", marks=pytest.mark.db),
+    pytest.param("mysql_pymysql_conn", marks=pytest.mark.db),
+]
+
+mysql_connectable_iris = [
+    pytest.param("mysql_pymysql_engine_iris", marks=pytest.mark.db),
+    pytest.param("mysql_pymysql_conn_iris", marks=pytest.mark.db),
+]
+
+mysql_connectable_types = [
+    pytest.param("mysql_pymysql_engine_types", marks=pytest.mark.db),
+    pytest.param("mysql_pymysql_conn_types", marks=pytest.mark.db),
+]
+
+postgresql_connectable = [
+    pytest.param("postgresql_psycopg2_engine", marks=pytest.mark.db),
+    pytest.param("postgresql_psycopg2_conn", marks=pytest.mark.db),
+]
+
+postgresql_connectable_iris = [
+    pytest.param("postgresql_psycopg2_engine_iris", marks=pytest.mark.db),
+    pytest.param("postgresql_psycopg2_conn_iris", marks=pytest.mark.db),
+]
+
+postgresql_connectable_types = [
+    pytest.param("postgresql_psycopg2_engine_types", marks=pytest.mark.db),
+    pytest.param("postgresql_psycopg2_conn_types", marks=pytest.mark.db),
+]
+
+sqlite_connectable = [
+    "sqlite_engine",
+    "sqlite_conn",
+    "sqlite_str",
+]
+
+sqlite_connectable_iris = [
+    "sqlite_engine_iris",
+    "sqlite_conn_iris",
+    "sqlite_str_iris",
+]
+
+sqlite_connectable_types = [
+    "sqlite_engine_types",
+    "sqlite_conn_types",
+    "sqlite_str_types",
+]
+
+sqlalchemy_connectable = mysql_connectable + postgresql_connectable + sqlite_connectable
+
+sqlalchemy_connectable_iris = (
+    mysql_connectable_iris + postgresql_connectable_iris + sqlite_connectable_iris
+)
+
+sqlalchemy_connectable_types = (
+    mysql_connectable_types + postgresql_connectable_types + sqlite_connectable_types
+)
+
+adbc_connectable = [
+    "sqlite_adbc_conn",
+    pytest.param("postgresql_adbc_conn", marks=pytest.mark.db),
+]
+
+adbc_connectable_iris = [
+    pytest.param("postgresql_adbc_iris", marks=pytest.mark.db),
+    "sqlite_adbc_iris",
+]
+
+adbc_connectable_types = [
+    pytest.param("postgresql_adbc_types", marks=pytest.mark.db),
+    "sqlite_adbc_types",
+]
+
+
+all_connectable = [*sqlalchemy_connectable, "sqlite_buildin", *adbc_connectable]
+
+all_connectable_iris = [
+    *sqlalchemy_connectable_iris,
+    "sqlite_buildin_iris",
+    *adbc_connectable_iris,
+]
+
+all_connectable_types = [
+    *sqlalchemy_connectable_types,
+    "sqlite_buildin_types",
+    *adbc_connectable_types,
+]
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_dataframe_to_sql(conn, test_frame1, request):
+    # GH 51086 if conn is sqlite_engine
+    conn = request.getfixturevalue(conn)
+    test_frame1.to_sql(name="test", con=conn, if_exists="append", index=False)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_dataframe_to_sql_empty(conn, test_frame1, request):
+    if conn == "postgresql_adbc_conn" and not using_string_dtype():
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="postgres ADBC driver < 1.2 cannot insert index with null type",
+            )
+        )
+
+    # GH 51086 if conn is sqlite_engine
+    conn = request.getfixturevalue(conn)
+    empty_df = test_frame1.iloc[:0]
+    empty_df.to_sql(name="test", con=conn, if_exists="append", index=False)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_dataframe_to_sql_arrow_dtypes(conn, request):
+    # GH 52046
+    pytest.importorskip("pyarrow")
+    df = DataFrame(
+        {
+            "int": pd.array([1], dtype="int8[pyarrow]"),
+            "datetime": pd.array(
+                [datetime(2023, 1, 1)], dtype="timestamp[ns][pyarrow]"
+            ),
+            "date": pd.array([date(2023, 1, 1)], dtype="date32[day][pyarrow]"),
+            "timedelta": pd.array([timedelta(1)], dtype="duration[ns][pyarrow]"),
+            "string": pd.array(["a"], dtype="string[pyarrow]"),
+        }
+    )
+
+    if "adbc" in conn:
+        if conn == "sqlite_adbc_conn":
+            df = df.drop(columns=["timedelta"])
+        if pa_version_under14p1:
+            exp_warning = DeprecationWarning
+            msg = "is_sparse is deprecated"
+        else:
+            exp_warning = None
+            msg = ""
+    else:
+        exp_warning = UserWarning
+        msg = "the 'timedelta'"
+
+    conn = request.getfixturevalue(conn)
+    with tm.assert_produces_warning(exp_warning, match=msg, check_stacklevel=False):
+        df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture):
+    # GH 52046
+    pytest.importorskip("pyarrow")
+    if isinstance(nulls_fixture, Decimal):
+        pytest.skip(
+            # GH#61773
+            reason="Decimal('NaN') not supported in constructor for timestamp dtype"
+        )
+
+    df = DataFrame(
+        {
+            "datetime": pd.array(
+                [datetime(2023, 1, 1), nulls_fixture], dtype="timestamp[ns][pyarrow]"
+            ),
+        }
+    )
+    conn = request.getfixturevalue(conn)
+    df.to_sql(name="test_arrow", con=conn, if_exists="replace", index=False)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize("method", [None, "multi"])
+def test_to_sql(conn, method, test_frame1, request):
+    if method == "multi" and "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'method' not implemented for ADBC drivers", strict=True
+            )
+        )
+
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.to_sql(test_frame1, "test_frame", method=method)
+        assert pandasSQL.has_table("test_frame")
+    assert count_rows(conn, "test_frame") == len(test_frame1)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize(
+    "mode, num_row_coef", [("replace", 1), ("append", 2), ("delete_rows", 1)]
+)
+def test_to_sql_exist(conn, mode, num_row_coef, test_frame1, request):
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail")
+        pandasSQL.to_sql(test_frame1, "test_frame", if_exists=mode)
+        assert pandasSQL.has_table("test_frame")
+    assert count_rows(conn, "test_frame") == num_row_coef * len(test_frame1)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_to_sql_exist_fail(conn, test_frame1, request):
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail")
+        assert pandasSQL.has_table("test_frame")
+
+        msg = "Table 'test_frame' already exists"
+        with pytest.raises(ValueError, match=msg):
+            pandasSQL.to_sql(test_frame1, "test_frame", if_exists="fail")
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_iris_query(conn, request):
+    conn = request.getfixturevalue(conn)
+    iris_frame = read_sql_query("SELECT * FROM iris", conn)
+    check_iris_frame(iris_frame)
+    iris_frame = pd.read_sql("SELECT * FROM iris", conn)
+    check_iris_frame(iris_frame)
+    iris_frame = pd.read_sql("SELECT * FROM iris where 0=1", conn)
+    assert iris_frame.shape == (0, 5)
+    assert "SepalWidth" in iris_frame.columns
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_iris_query_chunksize(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'chunksize' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn = request.getfixturevalue(conn)
+    iris_frame = concat(read_sql_query("SELECT * FROM iris", conn, chunksize=7))
+    check_iris_frame(iris_frame)
+    iris_frame = concat(pd.read_sql("SELECT * FROM iris", conn, chunksize=7))
+    check_iris_frame(iris_frame)
+    iris_frame = concat(pd.read_sql("SELECT * FROM iris where 0=1", conn, chunksize=7))
+    assert iris_frame.shape == (0, 5)
+    assert "SepalWidth" in iris_frame.columns
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_read_iris_query_expression_with_parameter(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'chunksize' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn = request.getfixturevalue(conn)
+    from sqlalchemy import (
+        MetaData,
+        Table,
+        create_engine,
+        select,
+    )
+
+    metadata = MetaData()
+    autoload_con = create_engine(conn) if isinstance(conn, str) else conn
+    iris = Table("iris", metadata, autoload_with=autoload_con)
+    iris_frame = read_sql_query(
+        select(iris), conn, params={"name": "Iris-setosa", "length": 5.1}
+    )
+    check_iris_frame(iris_frame)
+    if isinstance(conn, str):
+        autoload_con.dispose()
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_iris_query_string_with_parameter(conn, request, sql_strings):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'chunksize' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+
+    for db, query in sql_strings["read_parameters"].items():
+        if db in conn:
+            break
+    else:
+        raise KeyError(f"No part of {conn} found in sql_strings['read_parameters']")
+    conn = request.getfixturevalue(conn)
+    iris_frame = read_sql_query(query, conn, params=("Iris-setosa", 5.1))
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_read_iris_table(conn, request):
+    # GH 51015 if conn = sqlite_iris_str
+    conn = request.getfixturevalue(conn)
+    iris_frame = read_sql_table("iris", conn)
+    check_iris_frame(iris_frame)
+    iris_frame = pd.read_sql("iris", conn)
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_read_iris_table_chunksize(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
+    conn = request.getfixturevalue(conn)
+    iris_frame = concat(read_sql_table("iris", conn, chunksize=7))
+    check_iris_frame(iris_frame)
+    iris_frame = concat(pd.read_sql("iris", conn, chunksize=7))
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_to_sql_callable(conn, test_frame1, request):
+    conn = request.getfixturevalue(conn)
+
+    check = []  # used to double check function below is really being used
+
+    def sample(pd_table, conn, keys, data_iter):
+        check.append(1)
+        data = [dict(zip(keys, row)) for row in data_iter]
+        conn.execute(pd_table.table.insert(), data)
+
+    with pandasSQL_builder(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.to_sql(test_frame1, "test_frame", method=sample)
+        assert pandasSQL.has_table("test_frame")
+    assert check == [1]
+    assert count_rows(conn, "test_frame") == len(test_frame1)
+
+
+@pytest.mark.parametrize("conn", all_connectable_types)
+def test_default_type_conversion(conn, request):
+    conn_name = conn
+    if conn_name == "sqlite_buildin_types":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="sqlite_buildin connection does not implement read_sql_table"
+            )
+        )
+
+    conn = request.getfixturevalue(conn)
+    df = sql.read_sql_table("types", conn)
+
+    assert issubclass(df.FloatCol.dtype.type, np.floating)
+    assert issubclass(df.IntCol.dtype.type, np.integer)
+
+    # MySQL/sqlite has no real BOOL type
+    if "postgresql" in conn_name:
+        assert issubclass(df.BoolCol.dtype.type, np.bool_)
+    else:
+        assert issubclass(df.BoolCol.dtype.type, np.integer)
+
+    # Int column with NA values stays as float
+    assert issubclass(df.IntColWithNull.dtype.type, np.floating)
+
+    # Bool column with NA = int column with NA values => becomes float
+    if "postgresql" in conn_name:
+        assert issubclass(df.BoolColWithNull.dtype.type, object)
+    else:
+        assert issubclass(df.BoolColWithNull.dtype.type, np.floating)
+
+
+@pytest.mark.parametrize("conn", mysql_connectable)
+def test_read_procedure(conn, request):
+    conn = request.getfixturevalue(conn)
+
+    # GH 7324
+    # Although it is more an api test, it is added to the
+    # mysql tests as sqlite does not have stored procedures
+    from sqlalchemy import text
+    from sqlalchemy.engine import Engine
+
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
+    df.to_sql(name="test_frame", con=conn, index=False)
+
+    proc = """DROP PROCEDURE IF EXISTS get_testdb;
+
+    CREATE PROCEDURE get_testdb ()
+
+    BEGIN
+        SELECT * FROM test_frame;
+    END"""
+    proc = text(proc)
+    if isinstance(conn, Engine):
+        with conn.connect() as engine_conn:
+            with engine_conn.begin():
+                engine_conn.execute(proc)
+    else:
+        with conn.begin():
+            conn.execute(proc)
+
+    res1 = sql.read_sql_query("CALL get_testdb();", conn)
+    tm.assert_frame_equal(df, res1)
+
+    # test delegation to read_sql_query
+    res2 = sql.read_sql("CALL get_testdb();", conn)
+    tm.assert_frame_equal(df, res2)
+
+
+@pytest.mark.parametrize("conn", postgresql_connectable)
+@pytest.mark.parametrize("expected_count", [2, "Success!"])
+def test_copy_from_callable_insertion_method(conn, expected_count, request):
+    # GH 8953
+    # Example in io.rst found under _io.sql.method
+    # not available in sqlite, mysql
+    def psql_insert_copy(table, conn, keys, data_iter):
+        # gets a DBAPI connection that can provide a cursor
+        dbapi_conn = conn.connection
+        with dbapi_conn.cursor() as cur:
+            s_buf = StringIO()
+            writer = csv.writer(s_buf)
+            writer.writerows(data_iter)
+            s_buf.seek(0)
+
+            columns = ", ".join([f'"{k}"' for k in keys])
+            if table.schema:
+                table_name = f"{table.schema}.{table.name}"
+            else:
+                table_name = table.name
+
+            sql_query = f"COPY {table_name} ({columns}) FROM STDIN WITH CSV"
+            cur.copy_expert(sql=sql_query, file=s_buf)
+        return expected_count
+
+    conn = request.getfixturevalue(conn)
+    expected = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]})
+    result_count = expected.to_sql(
+        name="test_frame", con=conn, index=False, method=psql_insert_copy
+    )
+    # GH 46891
+    if expected_count is None:
+        assert result_count is None
+    else:
+        assert result_count == expected_count
+    result = sql.read_sql_table("test_frame", conn)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", postgresql_connectable)
+def test_insertion_method_on_conflict_do_nothing(conn, request):
+    # GH 15988: Example in to_sql docstring
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy.dialects.postgresql import insert
+    from sqlalchemy.engine import Engine
+    from sqlalchemy.sql import text
+
+    def insert_on_conflict(table, conn, keys, data_iter):
+        data = [dict(zip(keys, row)) for row in data_iter]
+        stmt = (
+            insert(table.table)
+            .values(data)
+            .on_conflict_do_nothing(index_elements=["a"])
+        )
+        result = conn.execute(stmt)
+        return result.rowcount
+
+    create_sql = text(
+        """
+    CREATE TABLE test_insert_conflict (
+        a  integer PRIMARY KEY,
+        b  numeric,
+        c  text
+    );
+    """
+    )
+    if isinstance(conn, Engine):
+        with conn.connect() as con:
+            with con.begin():
+                con.execute(create_sql)
+    else:
+        with conn.begin():
+            conn.execute(create_sql)
+
+    expected = DataFrame([[1, 2.1, "a"]], columns=list("abc"))
+    expected.to_sql(
+        name="test_insert_conflict", con=conn, if_exists="append", index=False
+    )
+
+    df_insert = DataFrame([[1, 3.2, "b"]], columns=list("abc"))
+    inserted = df_insert.to_sql(
+        name="test_insert_conflict",
+        con=conn,
+        index=False,
+        if_exists="append",
+        method=insert_on_conflict,
+    )
+    result = sql.read_sql_table("test_insert_conflict", conn)
+    tm.assert_frame_equal(result, expected)
+    assert inserted == 0
+
+    # Cleanup
+    with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.drop_table("test_insert_conflict")
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_to_sql_on_public_schema(conn, request):
+    if "sqlite" in conn or "mysql" in conn:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="test for public schema only specific to postgresql"
+            )
+        )
+
+    conn = request.getfixturevalue(conn)
+
+    test_data = DataFrame([[1, 2.1, "a"], [2, 3.1, "b"]], columns=list("abc"))
+    test_data.to_sql(
+        name="test_public_schema",
+        con=conn,
+        if_exists="append",
+        index=False,
+        schema="public",
+    )
+
+    df_out = sql.read_sql_table("test_public_schema", conn, schema="public")
+    tm.assert_frame_equal(test_data, df_out)
+
+
+@pytest.mark.parametrize("conn", mysql_connectable)
+def test_insertion_method_on_conflict_update(conn, request):
+    # GH 14553: Example in to_sql docstring
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy.dialects.mysql import insert
+    from sqlalchemy.engine import Engine
+    from sqlalchemy.sql import text
+
+    def insert_on_conflict(table, conn, keys, data_iter):
+        data = [dict(zip(keys, row)) for row in data_iter]
+        stmt = insert(table.table).values(data)
+        stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c)
+        result = conn.execute(stmt)
+        return result.rowcount
+
+    create_sql = text(
+        """
+    CREATE TABLE test_insert_conflict (
+        a INT PRIMARY KEY,
+        b FLOAT,
+        c VARCHAR(10)
+    );
+    """
+    )
+    if isinstance(conn, Engine):
+        with conn.connect() as con:
+            with con.begin():
+                con.execute(create_sql)
+    else:
+        with conn.begin():
+            conn.execute(create_sql)
+
+    df = DataFrame([[1, 2.1, "a"]], columns=list("abc"))
+    df.to_sql(name="test_insert_conflict", con=conn, if_exists="append", index=False)
+
+    expected = DataFrame([[1, 3.2, "b"]], columns=list("abc"))
+    inserted = expected.to_sql(
+        name="test_insert_conflict",
+        con=conn,
+        index=False,
+        if_exists="append",
+        method=insert_on_conflict,
+    )
+    result = sql.read_sql_table("test_insert_conflict", conn)
+    tm.assert_frame_equal(result, expected)
+    assert inserted == 2
+
+    # Cleanup
+    with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.drop_table("test_insert_conflict")
+
+
+@pytest.mark.parametrize("conn", postgresql_connectable)
+def test_read_view_postgres(conn, request):
+    # GH 52969
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy.engine import Engine
+    from sqlalchemy.sql import text
+
+    table_name = f"group_{uuid.uuid4().hex}"
+    view_name = f"group_view_{uuid.uuid4().hex}"
+
+    sql_stmt = text(
+        f"""
+    CREATE TABLE {table_name} (
+        group_id INTEGER,
+        name TEXT
+    );
+    INSERT INTO {table_name} VALUES
+        (1, 'name');
+    CREATE VIEW {view_name}
+    AS
+    SELECT * FROM {table_name};
+    """
+    )
+    if isinstance(conn, Engine):
+        with conn.connect() as con:
+            with con.begin():
+                con.execute(sql_stmt)
+    else:
+        with conn.begin():
+            conn.execute(sql_stmt)
+    result = read_sql_table(view_name, conn)
+    expected = DataFrame({"group_id": [1], "name": "name"})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_read_view_sqlite(sqlite_buildin):
+    # GH 52969
+    create_table = """
+CREATE TABLE groups (
+   group_id INTEGER,
+   name TEXT
+);
+"""
+    insert_into = """
+INSERT INTO groups VALUES
+    (1, 'name');
+"""
+    create_view = """
+CREATE VIEW group_view
+AS
+SELECT * FROM groups;
+"""
+    sqlite_buildin.execute(create_table)
+    sqlite_buildin.execute(insert_into)
+    sqlite_buildin.execute(create_view)
+    result = pd.read_sql("SELECT * FROM group_view", sqlite_buildin)
+    expected = DataFrame({"group_id": [1], "name": "name"})
+    tm.assert_frame_equal(result, expected)
+
+
+def flavor(conn_name):
+    if "postgresql" in conn_name:
+        return "postgresql"
+    elif "sqlite" in conn_name:
+        return "sqlite"
+    elif "mysql" in conn_name:
+        return "mysql"
+
+    raise ValueError(f"unsupported connection: {conn_name}")
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_sql_iris_parameter(conn, request, sql_strings):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'params' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    query = sql_strings["read_parameters"][flavor(conn_name)]
+    params = ("Iris-setosa", 5.1)
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            iris_frame = pandasSQL.read_query(query, params=params)
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_sql_iris_named_parameter(conn, request, sql_strings):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'params' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    query = sql_strings["read_named_parameters"][flavor(conn_name)]
+    params = {"name": "Iris-setosa", "length": 5.1}
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            iris_frame = pandasSQL.read_query(query, params=params)
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_sql_iris_no_parameter_with_percent(conn, request, sql_strings):
+    if "mysql" in conn or ("postgresql" in conn and "adbc" not in conn):
+        request.applymarker(pytest.mark.xfail(reason="broken test"))
+
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+
+    query = sql_strings["read_no_parameters_with_percent"][flavor(conn_name)]
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            iris_frame = pandasSQL.read_query(query, params=None)
+    check_iris_frame(iris_frame)
+
+
+# -----------------------------------------------------------------------------
+# -- Testing the public API
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_api_read_sql_view(conn, request):
+    conn = request.getfixturevalue(conn)
+    iris_frame = sql.read_sql_query("SELECT * FROM iris_view", conn)
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_api_read_sql_with_chunksize_no_result(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
+    conn = request.getfixturevalue(conn)
+    query = 'SELECT * FROM iris_view WHERE "SepalLength" < 0.0'
+    with_batch = sql.read_sql_query(query, conn, chunksize=5)
+    without_batch = sql.read_sql_query(query, conn)
+    tm.assert_frame_equal(concat(with_batch), without_batch)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql(conn, request, test_frame1):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame1", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame1")
+
+    sql.to_sql(test_frame1, "test_frame1", conn)
+    assert sql.has_table("test_frame1", conn)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_fail(conn, request, test_frame1):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame2", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame2")
+
+    sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail")
+    assert sql.has_table("test_frame2", conn)
+
+    msg = "Table 'test_frame2' already exists"
+    with pytest.raises(ValueError, match=msg):
+        sql.to_sql(test_frame1, "test_frame2", conn, if_exists="fail")
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_replace(conn, request, test_frame1):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame3", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame3")
+
+    sql.to_sql(test_frame1, "test_frame3", conn, if_exists="fail")
+    # Add to table again
+    sql.to_sql(test_frame1, "test_frame3", conn, if_exists="replace")
+    assert sql.has_table("test_frame3", conn)
+
+    num_entries = len(test_frame1)
+    num_rows = count_rows(conn, "test_frame3")
+
+    assert num_rows == num_entries
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_append(conn, request, test_frame1):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame4", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame4")
+
+    assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="fail") == 4
+
+    # Add to table again
+    assert sql.to_sql(test_frame1, "test_frame4", conn, if_exists="append") == 4
+    assert sql.has_table("test_frame4", conn)
+
+    num_entries = 2 * len(test_frame1)
+    num_rows = count_rows(conn, "test_frame4")
+
+    assert num_rows == num_entries
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_type_mapping(conn, request, test_frame3):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame5", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame5")
+
+    sql.to_sql(test_frame3, "test_frame5", conn, index=False)
+    result = sql.read_sql("SELECT * FROM test_frame5", conn)
+
+    tm.assert_frame_equal(test_frame3, result)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_series(conn, request):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_series", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_series")
+
+    s = Series(np.arange(5, dtype="int64"), name="series")
+    sql.to_sql(s, "test_series", conn, index=False)
+    s2 = sql.read_sql_query("SELECT * FROM test_series", conn)
+    tm.assert_frame_equal(s.to_frame(), s2)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_roundtrip(conn, request, test_frame1):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame_roundtrip", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame_roundtrip")
+
+    sql.to_sql(test_frame1, "test_frame_roundtrip", con=conn)
+    result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn)
+
+    # HACK!
+    if "adbc" in conn_name:
+        result = result.drop(columns="__index_level_0__")
+    else:
+        result = result.drop(columns="level_0")
+    tm.assert_frame_equal(result, test_frame1)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_roundtrip_chunksize(conn, request, test_frame1):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_frame_roundtrip", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_frame_roundtrip")
+
+    sql.to_sql(
+        test_frame1,
+        "test_frame_roundtrip",
+        con=conn,
+        index=False,
+        chunksize=2,
+    )
+    result = sql.read_sql_query("SELECT * FROM test_frame_roundtrip", con=conn)
+    tm.assert_frame_equal(result, test_frame1)
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_api_execute_sql(conn, request):
+    # drop_sql = "DROP TABLE IF EXISTS test"  # should already be done
+    conn = request.getfixturevalue(conn)
+    with sql.pandasSQL_builder(conn) as pandas_sql:
+        iris_results = pandas_sql.execute("SELECT * FROM iris")
+        row = iris_results.fetchone()
+        iris_results.close()
+    assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
+
+
+@pytest.mark.parametrize("conn", all_connectable_types)
+def test_api_date_parsing(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    # Test date parsing in read_sql
+    # No Parsing
+    df = sql.read_sql_query("SELECT * FROM types", conn)
+    if not ("mysql" in conn_name or "postgres" in conn_name):
+        assert not issubclass(df.DateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["DateCol"])
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+    assert df.DateCol.tolist() == [
+        Timestamp(2000, 1, 3, 0, 0, 0),
+        Timestamp(2000, 1, 4, 0, 0, 0),
+    ]
+
+    df = sql.read_sql_query(
+        "SELECT * FROM types",
+        conn,
+        parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"},
+    )
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+    assert df.DateCol.tolist() == [
+        Timestamp(2000, 1, 3, 0, 0, 0),
+        Timestamp(2000, 1, 4, 0, 0, 0),
+    ]
+
+    df = sql.read_sql_query("SELECT * FROM types", conn, parse_dates=["IntDateCol"])
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+    assert df.IntDateCol.tolist() == [
+        Timestamp(1986, 12, 25, 0, 0, 0),
+        Timestamp(2013, 1, 1, 0, 0, 0),
+    ]
+
+    df = sql.read_sql_query(
+        "SELECT * FROM types", conn, parse_dates={"IntDateCol": "s"}
+    )
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+    assert df.IntDateCol.tolist() == [
+        Timestamp(1986, 12, 25, 0, 0, 0),
+        Timestamp(2013, 1, 1, 0, 0, 0),
+    ]
+
+    df = sql.read_sql_query(
+        "SELECT * FROM types",
+        conn,
+        parse_dates={"IntDateOnlyCol": "%Y%m%d"},
+    )
+    assert issubclass(df.IntDateOnlyCol.dtype.type, np.datetime64)
+    assert df.IntDateOnlyCol.tolist() == [
+        Timestamp("2010-10-10"),
+        Timestamp("2010-12-12"),
+    ]
+
+
+@pytest.mark.parametrize("conn", all_connectable_types)
+@pytest.mark.parametrize("error", ["raise", "coerce"])
+@pytest.mark.parametrize(
+    "read_sql, text, mode",
+    [
+        (sql.read_sql, "SELECT * FROM types", ("sqlalchemy", "fallback")),
+        (sql.read_sql, "types", ("sqlalchemy")),
+        (
+            sql.read_sql_query,
+            "SELECT * FROM types",
+            ("sqlalchemy", "fallback"),
+        ),
+        (sql.read_sql_table, "types", ("sqlalchemy")),
+    ],
+)
+def test_api_custom_dateparsing_error(
+    conn, request, read_sql, text, mode, error, types_data_frame
+):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    if text == "types" and conn_name == "sqlite_buildin_types":
+        request.applymarker(
+            pytest.mark.xfail(reason="failing combination of arguments")
+        )
+
+    expected = types_data_frame.astype({"DateCol": "datetime64[us]"})
+
+    result = read_sql(
+        text,
+        con=conn,
+        parse_dates={
+            "DateCol": {"errors": error},
+        },
+    )
+    if "postgres" in conn_name:
+        # TODO: clean up types_data_frame fixture
+        result["BoolCol"] = result["BoolCol"].astype(int)
+        result["BoolColWithNull"] = result["BoolColWithNull"].astype(float)
+
+    if conn_name == "postgresql_adbc_types":
+        expected = expected.astype(
+            {
+                "IntDateCol": "int32",
+                "IntDateOnlyCol": "int32",
+                "IntCol": "int32",
+            }
+        )
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable_types)
+def test_api_date_and_index(conn, request):
+    # Test case where same column appears in parse_date and index_col
+    conn = request.getfixturevalue(conn)
+    df = sql.read_sql_query(
+        "SELECT * FROM types",
+        conn,
+        index_col="DateCol",
+        parse_dates=["DateCol", "IntDateCol"],
+    )
+
+    assert issubclass(df.index.dtype.type, np.datetime64)
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_timedelta(conn, request):
+    # see #6921
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_timedelta", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_timedelta")
+
+    df = to_timedelta(Series(["00:00:01", "00:00:03"], name="foo")).to_frame()
+
+    if conn_name == "sqlite_adbc_conn":
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="sqlite ADBC driver doesn't implement timedelta",
+            )
+        )
+
+    if "adbc" in conn_name:
+        if pa_version_under14p1:
+            exp_warning = DeprecationWarning
+        else:
+            exp_warning = None
+    else:
+        exp_warning = UserWarning
+
+    with tm.assert_produces_warning(exp_warning, check_stacklevel=False):
+        result_count = df.to_sql(name="test_timedelta", con=conn)
+    assert result_count == 2
+    result = sql.read_sql_query("SELECT * FROM test_timedelta", conn)
+
+    if conn_name == "postgresql_adbc_conn":
+        # TODO: Postgres stores an INTERVAL, which ADBC reads as a Month-Day-Nano
+        # Interval; the default pandas type mapper maps this to a DateOffset
+        # but maybe we should try and restore the timedelta here?
+        expected = Series(
+            [
+                pd.DateOffset(months=0, days=0, microseconds=1000000, nanoseconds=0),
+                pd.DateOffset(months=0, days=0, microseconds=3000000, nanoseconds=0),
+            ],
+            name="foo",
+        )
+    else:
+        expected = df["foo"].astype("int64")
+    tm.assert_series_equal(result["foo"], expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_complex_raises(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"a": [1 + 1j, 2j]})
+
+    if "adbc" in conn_name:
+        msg = "datatypes not supported"
+    else:
+        msg = "Complex datatypes not supported"
+    with pytest.raises(ValueError, match=msg):
+        assert df.to_sql("test_complex", con=conn) is None
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize(
+    "index_name,index_label,expected",
+    [
+        # no index name, defaults to 'index'
+        (None, None, "index"),
+        # specifying index_label
+        (None, "other_label", "other_label"),
+        # using the index name
+        ("index_name", None, "index_name"),
+        # has index name, but specifying index_label
+        ("index_name", "other_label", "other_label"),
+        # index name is integer
+        (0, None, "0"),
+        # index name is None but index label is integer
+        (None, 0, "0"),
+    ],
+)
+def test_api_to_sql_index_label(conn, request, index_name, index_label, expected):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC")
+        )
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_index_label", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_index_label")
+
+    temp_frame = DataFrame({"col1": range(4)})
+    temp_frame.index.name = index_name
+    query = "SELECT * FROM test_index_label"
+    sql.to_sql(temp_frame, "test_index_label", conn, index_label=index_label)
+    frame = sql.read_sql_query(query, conn)
+    assert frame.columns[0] == expected
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_to_sql_index_label_multiindex(conn, request):
+    conn_name = conn
+    if "mysql" in conn_name:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="MySQL can fail using TEXT without length as key", strict=False
+            )
+        )
+    elif "adbc" in conn_name:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="index_label argument NotImplemented with ADBC")
+        )
+
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_index_label", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_index_label")
+
+    expected_row_count = 4
+    temp_frame = DataFrame(
+        {"col1": range(4)},
+        index=MultiIndex.from_product([("A0", "A1"), ("B0", "B1")]),
+    )
+
+    # no index name, defaults to 'level_0' and 'level_1'
+    result = sql.to_sql(temp_frame, "test_index_label", conn)
+    assert result == expected_row_count
+    frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+    assert frame.columns[0] == "level_0"
+    assert frame.columns[1] == "level_1"
+
+    # specifying index_label
+    result = sql.to_sql(
+        temp_frame,
+        "test_index_label",
+        conn,
+        if_exists="replace",
+        index_label=["A", "B"],
+    )
+    assert result == expected_row_count
+    frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+    assert frame.columns[:2].tolist() == ["A", "B"]
+
+    # using the index name
+    temp_frame.index.names = ["A", "B"]
+    result = sql.to_sql(temp_frame, "test_index_label", conn, if_exists="replace")
+    assert result == expected_row_count
+    frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+    assert frame.columns[:2].tolist() == ["A", "B"]
+
+    # has index name, but specifying index_label
+    result = sql.to_sql(
+        temp_frame,
+        "test_index_label",
+        conn,
+        if_exists="replace",
+        index_label=["C", "D"],
+    )
+    assert result == expected_row_count
+    frame = sql.read_sql_query("SELECT * FROM test_index_label", conn)
+    assert frame.columns[:2].tolist() == ["C", "D"]
+
+    msg = "Length of 'index_label' should match number of levels, which is 2"
+    with pytest.raises(ValueError, match=msg):
+        sql.to_sql(
+            temp_frame,
+            "test_index_label",
+            conn,
+            if_exists="replace",
+            index_label="C",
+        )
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_multiindex_roundtrip(conn, request):
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_multiindex_roundtrip", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_multiindex_roundtrip")
+
+    df = DataFrame.from_records(
+        [(1, 2.1, "line1"), (2, 1.5, "line2")],
+        columns=["A", "B", "C"],
+        index=["A", "B"],
+    )
+
+    df.to_sql(name="test_multiindex_roundtrip", con=conn)
+    result = sql.read_sql_query(
+        "SELECT * FROM test_multiindex_roundtrip", conn, index_col=["A", "B"]
+    )
+    tm.assert_frame_equal(df, result, check_index_type=True)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        None,
+        int,
+        float,
+        {"A": int, "B": float},
+    ],
+)
+def test_api_dtype_argument(conn, request, dtype):
+    # GH10285 Add dtype argument to read_sql_query
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_dtype_argument", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_dtype_argument")
+
+    df = DataFrame([[1.2, 3.4], [5.6, 7.8]], columns=["A", "B"])
+    assert df.to_sql(name="test_dtype_argument", con=conn) == 2
+
+    expected = df.astype(dtype)
+
+    if "postgres" in conn_name:
+        query = 'SELECT "A", "B" FROM test_dtype_argument'
+    else:
+        query = "SELECT A, B FROM test_dtype_argument"
+    result = sql.read_sql_query(query, con=conn, dtype=dtype)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_integer_col_names(conn, request):
+    conn = request.getfixturevalue(conn)
+    df = DataFrame([[1, 2], [3, 4]], columns=[0, 1])
+    sql.to_sql(df, "test_frame_integer_col_names", conn, if_exists="replace")
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema(conn, request, test_frame1):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn = request.getfixturevalue(conn)
+    create_sql = sql.get_schema(test_frame1, "test", con=conn)
+    assert "CREATE" in create_sql
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema_with_schema(conn, request, test_frame1):
+    # GH28486
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn = request.getfixturevalue(conn)
+    create_sql = sql.get_schema(test_frame1, "test", con=conn, schema="pypi")
+    assert "CREATE TABLE pypi." in create_sql
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema_dtypes(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    float_frame = DataFrame({"a": [1.1, 1.2], "b": [2.1, 2.2]})
+
+    if conn_name == "sqlite_buildin":
+        dtype = "INTEGER"
+    else:
+        from sqlalchemy import Integer
+
+        dtype = Integer
+    create_sql = sql.get_schema(float_frame, "test", con=conn, dtype={"b": dtype})
+    assert "CREATE" in create_sql
+    assert "INTEGER" in create_sql
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_get_schema_keys(conn, request, test_frame1):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="'get_schema' not implemented for ADBC drivers",
+                strict=True,
+            )
+        )
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    frame = DataFrame({"Col1": [1.1, 1.2], "Col2": [2.1, 2.2]})
+    create_sql = sql.get_schema(frame, "test", con=conn, keys="Col1")
+
+    if "mysql" in conn_name:
+        constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`Col1`)"
+    else:
+        constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")'
+    assert constraint_sentence in create_sql
+
+    # multiple columns as key (GH10385)
+    create_sql = sql.get_schema(test_frame1, "test", con=conn, keys=["A", "B"])
+    if "mysql" in conn_name:
+        constraint_sentence = "CONSTRAINT test_pk PRIMARY KEY (`A`, `B`)"
+    else:
+        constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("A", "B")'
+    assert constraint_sentence in create_sql
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_chunksize_read(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_chunksize", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_chunksize")
+
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((22, 5)), columns=list("abcde")
+    )
+    df.to_sql(name="test_chunksize", con=conn, index=False)
+
+    # reading the query in one time
+    res1 = sql.read_sql_query("select * from test_chunksize", conn)
+
+    # reading the query in chunks with read_sql_query
+    res2 = DataFrame()
+    i = 0
+    sizes = [5, 5, 5, 5, 2]
+
+    for chunk in sql.read_sql_query("select * from test_chunksize", conn, chunksize=5):
+        res2 = concat([res2, chunk], ignore_index=True)
+        assert len(chunk) == sizes[i]
+        i += 1
+
+    tm.assert_frame_equal(res1, res2)
+
+    # reading the query in chunks with read_sql_query
+    if conn_name == "sqlite_buildin":
+        with pytest.raises(NotImplementedError, match="^$"):
+            sql.read_sql_table("test_chunksize", conn, chunksize=5)
+    else:
+        res3 = DataFrame()
+        i = 0
+        sizes = [5, 5, 5, 5, 2]
+
+        for chunk in sql.read_sql_table("test_chunksize", conn, chunksize=5):
+            res3 = concat([res3, chunk], ignore_index=True)
+            assert len(chunk) == sizes[i]
+            i += 1
+
+        tm.assert_frame_equal(res1, res3)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_categorical(conn, request):
+    if conn == "postgresql_adbc_conn":
+        adbc = import_optional_dependency("adbc_driver_postgresql", errors="ignore")
+        if adbc is not None and Version(adbc.__version__) < Version("0.9.0"):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason="categorical dtype not implemented for ADBC postgres driver",
+                    strict=True,
+                )
+            )
+    # GH8624
+    # test that categorical gets written correctly as dense column
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_categorical", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_categorical")
+
+    df = DataFrame(
+        {
+            "person_id": [1, 2, 3],
+            "person_name": ["John P. Doe", "Jane Dove", "John P. Doe"],
+        }
+    )
+    df2 = df.copy()
+    df2["person_name"] = df2["person_name"].astype("category")
+
+    df2.to_sql(name="test_categorical", con=conn, index=False)
+    res = sql.read_sql_query("SELECT * FROM test_categorical", conn)
+
+    tm.assert_frame_equal(res, df)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_unicode_column_name(conn, request):
+    # GH 11431
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_unicode", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_unicode")
+
+    df = DataFrame([[1, 2], [3, 4]], columns=["\xe9", "b"])
+    df.to_sql(name="test_unicode", con=conn, index=False)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_escaped_table_name(conn, request):
+    # GH 13206
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("d1187b08-4943-4c8d-a7f6", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("d1187b08-4943-4c8d-a7f6")
+
+    df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]})
+    df.to_sql(name="d1187b08-4943-4c8d-a7f6", con=conn, index=False)
+
+    if "postgres" in conn_name:
+        query = 'SELECT * FROM "d1187b08-4943-4c8d-a7f6"'
+    else:
+        query = "SELECT * FROM `d1187b08-4943-4c8d-a7f6`"
+    res = sql.read_sql_query(query, conn)
+
+    tm.assert_frame_equal(res, df)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_api_read_sql_duplicate_columns(conn, request):
+    # GH#53117
+    if "adbc" in conn:
+        pa = pytest.importorskip("pyarrow")
+        if not (
+            Version(pa.__version__) >= Version("16.0")
+            and conn in ["sqlite_adbc_conn", "postgresql_adbc_conn"]
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason="pyarrow->pandas throws ValueError", strict=True
+                )
+            )
+    conn = request.getfixturevalue(conn)
+    if sql.has_table("test_table", conn):
+        with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+            pandasSQL.drop_table("test_table")
+
+    df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": 1})
+    df.to_sql(name="test_table", con=conn, index=False)
+
+    result = pd.read_sql("SELECT a, b, a +1 as a, c FROM test_table", conn)
+    expected = DataFrame(
+        [[1, 0.1, 2, 1], [2, 0.2, 3, 1], [3, 0.3, 4, 1]],
+        columns=["a", "b", "a", "c"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_read_table_columns(conn, request, test_frame1):
+    # test columns argument in read_table
+    conn_name = conn
+    if conn_name == "sqlite_buildin":
+        request.applymarker(pytest.mark.xfail(reason="Not Implemented"))
+
+    conn = request.getfixturevalue(conn)
+    sql.to_sql(test_frame1, "test_frame", conn)
+
+    cols = ["A", "B"]
+
+    result = sql.read_sql_table("test_frame", conn, columns=cols)
+    assert result.columns.tolist() == cols
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_read_table_index_col(conn, request, test_frame1):
+    # test columns argument in read_table
+    conn_name = conn
+    if conn_name == "sqlite_buildin":
+        request.applymarker(pytest.mark.xfail(reason="Not Implemented"))
+
+    conn = request.getfixturevalue(conn)
+    sql.to_sql(test_frame1, "test_frame", conn)
+
+    result = sql.read_sql_table("test_frame", conn, index_col="index")
+    assert result.index.names == ["index"]
+
+    result = sql.read_sql_table("test_frame", conn, index_col=["A", "B"])
+    assert result.index.names == ["A", "B"]
+
+    result = sql.read_sql_table(
+        "test_frame", conn, index_col=["A", "B"], columns=["C", "D"]
+    )
+    assert result.index.names == ["A", "B"]
+    assert result.columns.tolist() == ["C", "D"]
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_read_sql_delegate(conn, request):
+    if conn == "sqlite_buildin_iris":
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="sqlite_buildin connection does not implement read_sql_table"
+            )
+        )
+
+    conn = request.getfixturevalue(conn)
+    iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn)
+    iris_frame2 = sql.read_sql("SELECT * FROM iris", conn)
+    tm.assert_frame_equal(iris_frame1, iris_frame2)
+
+    iris_frame1 = sql.read_sql_table("iris", conn)
+    iris_frame2 = sql.read_sql("iris", conn)
+    tm.assert_frame_equal(iris_frame1, iris_frame2)
+
+
+def test_not_reflect_all_tables(sqlite_conn):
+    conn = sqlite_conn
+    from sqlalchemy import text
+    from sqlalchemy.engine import Engine
+
+    # create invalid table
+    query_list = [
+        text("CREATE TABLE invalid (x INTEGER, y UNKNOWN);"),
+        text("CREATE TABLE other_table (x INTEGER, y INTEGER);"),
+    ]
+
+    for query in query_list:
+        if isinstance(conn, Engine):
+            with conn.connect() as conn:
+                with conn.begin():
+                    conn.execute(query)
+        else:
+            with conn.begin():
+                conn.execute(query)
+
+    with tm.assert_produces_warning(None):
+        sql.read_sql_table("other_table", conn)
+        sql.read_sql_query("SELECT * FROM other_table", conn)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_warning_case_insensitive_table_name(conn, request, test_frame1):
+    conn_name = conn
+    if conn_name == "sqlite_buildin" or "adbc" in conn_name:
+        request.applymarker(pytest.mark.xfail(reason="Does not raise warning"))
+
+    conn = request.getfixturevalue(conn)
+    # see gh-7815
+    with tm.assert_produces_warning(
+        UserWarning,
+        match=(
+            r"The provided table name 'TABLE1' is not found exactly as such in "
+            r"the database after writing the table, possibly due to case "
+            r"sensitivity issues. Consider using lower case table names."
+        ),
+    ):
+        with sql.SQLDatabase(conn) as db:
+            db.check_case_sensitive("TABLE1", "")
+
+    # Test that the warning is certainly NOT triggered in a normal case.
+    with tm.assert_produces_warning(None):
+        test_frame1.to_sql(name="CaseSensitive", con=conn)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_sqlalchemy_type_mapping(conn, request):
+    conn = request.getfixturevalue(conn)
+    from sqlalchemy import TIMESTAMP
+
+    # Test Timestamp objects (no datetime64 because of timezone) (GH9085)
+    df = DataFrame(
+        {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)}
+    )
+    with sql.SQLDatabase(conn) as db:
+        table = sql.SQLTable("test_type", db, frame=df)
+        # GH 9086: TIMESTAMP is the suggested type for datetimes with timezones
+        assert isinstance(table.table.c["time"].type, TIMESTAMP)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+@pytest.mark.parametrize(
+    "integer, expected",
+    [
+        ("int8", "SMALLINT"),
+        ("Int8", "SMALLINT"),
+        ("uint8", "SMALLINT"),
+        ("UInt8", "SMALLINT"),
+        ("int16", "SMALLINT"),
+        ("Int16", "SMALLINT"),
+        ("uint16", "INTEGER"),
+        ("UInt16", "INTEGER"),
+        ("int32", "INTEGER"),
+        ("Int32", "INTEGER"),
+        ("uint32", "BIGINT"),
+        ("UInt32", "BIGINT"),
+        ("int64", "BIGINT"),
+        ("Int64", "BIGINT"),
+        (int, "BIGINT" if np.dtype(int).name == "int64" else "INTEGER"),
+    ],
+)
+def test_sqlalchemy_integer_mapping(conn, request, integer, expected):
+    # GH35076 Map pandas integer to optimal SQLAlchemy integer type
+    conn = request.getfixturevalue(conn)
+    df = DataFrame([0, 1], columns=["a"], dtype=integer)
+    with sql.SQLDatabase(conn) as db:
+        table = sql.SQLTable("test_type", db, frame=df)
+
+        result = str(table.table.c.a.type)
+    assert result == expected
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+@pytest.mark.parametrize("integer", ["uint64", "UInt64"])
+def test_sqlalchemy_integer_overload_mapping(conn, request, integer):
+    conn = request.getfixturevalue(conn)
+    # GH35076 Map pandas integer to optimal SQLAlchemy integer type
+    df = DataFrame([0, 1], columns=["a"], dtype=integer)
+    with sql.SQLDatabase(conn) as db:
+        with pytest.raises(
+            ValueError, match="Unsigned 64 bit integer datatype is not supported"
+        ):
+            sql.SQLTable("test_type", db, frame=df)
+
+
+def test_database_uri_string(temp_file, request, test_frame1):
+    pytest.importorskip("sqlalchemy")
+    # Test read_sql and .to_sql method with a database URI (GH10654)
+    # db_uri = 'sqlite:///:memory:' # raises
+    # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near
+    # "iris": syntax error [SQL: 'iris']
+    name = str(temp_file)
+    db_uri = "sqlite:///" + name
+    table = "iris"
+    test_frame1.to_sql(name=table, con=db_uri, if_exists="replace", index=False)
+    test_frame2 = sql.read_sql(table, db_uri)
+    test_frame3 = sql.read_sql_table(table, db_uri)
+    query = "SELECT * FROM iris"
+    test_frame4 = sql.read_sql_query(query, db_uri)
+    tm.assert_frame_equal(test_frame1, test_frame2)
+    tm.assert_frame_equal(test_frame1, test_frame3)
+    tm.assert_frame_equal(test_frame1, test_frame4)
+
+
+@td.skip_if_installed("pg8000")
+def test_pg8000_sqlalchemy_passthrough_error(request):
+    pytest.importorskip("sqlalchemy")
+    # using driver that will not be installed on CI to trigger error
+    # in sqlalchemy.create_engine -> test passing of this error to user
+    db_uri = "postgresql+pg8000://user:pass@host/dbname"
+    with pytest.raises(ImportError, match="pg8000"):
+        sql.read_sql("select * from table", db_uri)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_query_by_text_obj(conn, request):
+    # WIP : GH10846
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    from sqlalchemy import text
+
+    if "postgres" in conn_name:
+        name_text = text('select * from iris where "Name"=:name')
+    else:
+        name_text = text("select * from iris where name=:name")
+    iris_df = sql.read_sql(name_text, conn, params={"name": "Iris-versicolor"})
+    all_names = set(iris_df["Name"])
+    assert all_names == {"Iris-versicolor"}
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_query_by_select_obj(conn, request):
+    conn = request.getfixturevalue(conn)
+    # WIP : GH10846
+    from sqlalchemy import (
+        bindparam,
+        select,
+    )
+
+    iris = iris_table_metadata()
+    name_select = select(iris).where(iris.c.Name == bindparam("name"))
+    iris_df = sql.read_sql(name_select, conn, params={"name": "Iris-setosa"})
+    all_names = set(iris_df["Name"])
+    assert all_names == {"Iris-setosa"}
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_column_with_percentage(conn, request):
+    # GH 37157
+    conn_name = conn
+    if conn_name == "sqlite_buildin":
+        request.applymarker(pytest.mark.xfail(reason="Not Implemented"))
+
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "%_variation": [3, 4, 5]})
+    df.to_sql(name="test_column_percentage", con=conn, index=False)
+
+    res = sql.read_sql_table("test_column_percentage", conn)
+
+    tm.assert_frame_equal(res, df)
+
+
+def test_sql_open_close(temp_file, test_frame3):
+    # Test if the IO in the database still work if the connection closed
+    # between the writing and reading (as in many real situations).
+
+    with contextlib.closing(sqlite3.connect(temp_file)) as conn:
+        assert sql.to_sql(test_frame3, "test_frame3_legacy", conn, index=False) == 4
+
+    with contextlib.closing(sqlite3.connect(temp_file)) as conn:
+        result = sql.read_sql_query("SELECT * FROM test_frame3_legacy;", conn)
+
+    tm.assert_frame_equal(test_frame3, result)
+
+
+@td.skip_if_installed("sqlalchemy")
+def test_con_string_import_error():
+    conn = "mysql://root@localhost/pandas"
+    msg = "Using URI string without sqlalchemy installed"
+    with pytest.raises(ImportError, match=msg):
+        sql.read_sql("SELECT * FROM iris", conn)
+
+
+@td.skip_if_installed("sqlalchemy")
+def test_con_unknown_dbapi2_class_does_not_error_without_sql_alchemy_installed():
+    class MockSqliteConnection:
+        def __init__(self, *args, **kwargs) -> None:
+            self.conn = sqlite3.Connection(*args, **kwargs)
+
+        def __getattr__(self, name):
+            return getattr(self.conn, name)
+
+        def close(self):
+            self.conn.close()
+
+    with contextlib.closing(MockSqliteConnection(":memory:")) as conn:
+        with tm.assert_produces_warning(UserWarning, match="only supports SQLAlchemy"):
+            sql.read_sql("SELECT 1", conn)
+
+
+def test_sqlite_read_sql_delegate(sqlite_buildin_iris):
+    conn = sqlite_buildin_iris
+    iris_frame1 = sql.read_sql_query("SELECT * FROM iris", conn)
+    iris_frame2 = sql.read_sql("SELECT * FROM iris", conn)
+    tm.assert_frame_equal(iris_frame1, iris_frame2)
+
+    msg = "Execution failed on sql 'iris': near \"iris\": syntax error"
+    with pytest.raises(sql.DatabaseError, match=msg):
+        sql.read_sql("iris", conn)
+
+
+def test_get_schema2(test_frame1):
+    # without providing a connection object (available for backwards comp)
+    create_sql = sql.get_schema(test_frame1, "test")
+    assert "CREATE" in create_sql
+
+
+def test_sqlite_type_mapping(sqlite_buildin):
+    # Test Timestamp objects (no datetime64 because of timezone) (GH9085)
+    conn = sqlite_buildin
+    df = DataFrame(
+        {"time": to_datetime(["2014-12-12 01:54", "2014-12-11 02:54"], utc=True)}
+    )
+    db = sql.SQLiteDatabase(conn)
+    table = sql.SQLiteTable("test_type", db, frame=df)
+    schema = table.sql_schema()
+    for col in schema.split("\n"):
+        if col.split()[0].strip('"') == "time":
+            assert col.split()[1] == "TIMESTAMP"
+
+
+# -----------------------------------------------------------------------------
+# -- Database flavor specific tests
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_create_table(conn, request):
+    if conn == "sqlite_str":
+        pytest.skip("sqlite_str has no inspection system")
+
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import inspect
+
+    temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]})
+    with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+        assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4
+
+    insp = inspect(conn)
+    assert insp.has_table("temp_frame")
+
+    # Cleanup
+    with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.drop_table("temp_frame")
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_drop_table(conn, request):
+    if conn == "sqlite_str":
+        pytest.skip("sqlite_str has no inspection system")
+
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import inspect
+
+    temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]})
+    with sql.SQLDatabase(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(temp_frame, "temp_frame") == 4
+
+        insp = inspect(conn)
+        assert insp.has_table("temp_frame")
+
+        with pandasSQL.run_transaction():
+            pandasSQL.drop_table("temp_frame")
+        try:
+            insp.clear_cache()  # needed with SQLAlchemy 2.0, unavailable prior
+        except AttributeError:
+            pass
+        assert not insp.has_table("temp_frame")
+
+
+@pytest.mark.parametrize("conn_name", all_connectable)
+def test_delete_rows_success(conn_name, test_frame1, request):
+    table_name = "temp_delete_rows_frame"
+    conn = request.getfixturevalue(conn_name)
+
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(test_frame1, table_name) == test_frame1.shape[0]
+
+        with pandasSQL.run_transaction():
+            assert pandasSQL.delete_rows(table_name) is None
+
+        assert count_rows(conn, table_name) == 0
+        assert pandasSQL.has_table(table_name)
+
+
+@pytest.mark.parametrize("conn_name", all_connectable)
+def test_delete_rows_is_atomic(conn_name, request):
+    sqlalchemy = pytest.importorskip("sqlalchemy")
+
+    table_name = "temp_delete_rows_atomic_frame"
+    table_stmt = f"CREATE TABLE {table_name} (a INTEGER, b INTEGER UNIQUE NOT NULL)"
+
+    if conn_name != "sqlite_buildin" and "adbc" not in conn_name:
+        table_stmt = sqlalchemy.text(table_stmt)
+
+    # setting dtype is mandatory for adbc related tests
+    original_df = DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="int32")
+    replacing_df = DataFrame({"a": [5, 6, 7], "b": [8, 8, 8]}, dtype="int32")
+
+    conn = request.getfixturevalue(conn_name)
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction() as cur:
+            cur.execute(table_stmt)
+
+        with pandasSQL.run_transaction():
+            pandasSQL.to_sql(original_df, table_name, if_exists="append", index=False)
+
+        # inserting duplicated values in a UNIQUE constraint column
+        with pytest.raises(pd.errors.DatabaseError):
+            with pandasSQL.run_transaction():
+                pandasSQL.to_sql(
+                    replacing_df, table_name, if_exists="delete_rows", index=False
+                )
+
+        # failed "delete_rows" is rolled back preserving original data
+        with pandasSQL.run_transaction():
+            result_df = pandasSQL.read_query(
+                f"SELECT * FROM {table_name}", dtype="int32"
+            )
+            tm.assert_frame_equal(result_df, original_df)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_roundtrip(conn, request, test_frame1):
+    if conn == "sqlite_str":
+        pytest.skip("sqlite_str has no inspection system")
+
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(test_frame1, "test_frame_roundtrip") == 4
+            result = pandasSQL.read_query("SELECT * FROM test_frame_roundtrip")
+
+    if "adbc" in conn_name:
+        result = result.rename(columns={"__index_level_0__": "level_0"})
+    result.set_index("level_0", inplace=True)
+    # result.index.astype(int)
+
+    result.index.name = None
+
+    tm.assert_frame_equal(result, test_frame1)
+
+
+@pytest.mark.parametrize("conn", all_connectable_iris)
+def test_execute_sql(conn, request):
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            iris_results = pandasSQL.execute("SELECT * FROM iris")
+            row = iris_results.fetchone()
+            iris_results.close()
+    assert list(row) == [5.1, 3.5, 1.4, 0.2, "Iris-setosa"]
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_sqlalchemy_read_table(conn, request):
+    conn = request.getfixturevalue(conn)
+    iris_frame = sql.read_sql_table("iris", con=conn)
+    check_iris_frame(iris_frame)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_sqlalchemy_read_table_columns(conn, request):
+    conn = request.getfixturevalue(conn)
+    iris_frame = sql.read_sql_table(
+        "iris", con=conn, columns=["SepalLength", "SepalLength"]
+    )
+    tm.assert_index_equal(iris_frame.columns, Index(["SepalLength", "SepalLength__1"]))
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_iris)
+def test_read_table_absent_raises(conn, request):
+    conn = request.getfixturevalue(conn)
+    msg = "Table this_doesnt_exist not found"
+    with pytest.raises(ValueError, match=msg):
+        sql.read_sql_table("this_doesnt_exist", con=conn)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_types)
+def test_sqlalchemy_default_type_conversion(conn, request):
+    conn_name = conn
+    if conn_name == "sqlite_str":
+        pytest.skip("types tables not created in sqlite_str fixture")
+    elif "mysql" in conn_name or "sqlite" in conn_name:
+        request.applymarker(
+            pytest.mark.xfail(reason="boolean dtype not inferred properly")
+        )
+
+    conn = request.getfixturevalue(conn)
+    df = sql.read_sql_table("types", conn)
+
+    assert issubclass(df.FloatCol.dtype.type, np.floating)
+    assert issubclass(df.IntCol.dtype.type, np.integer)
+    assert issubclass(df.BoolCol.dtype.type, np.bool_)
+
+    # Int column with NA values stays as float
+    assert issubclass(df.IntColWithNull.dtype.type, np.floating)
+    # Bool column with NA values becomes object
+    assert issubclass(df.BoolColWithNull.dtype.type, object)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_bigint(conn, request):
+    # int64 should be converted to BigInteger, GH7433
+    conn = request.getfixturevalue(conn)
+    df = DataFrame(data={"i64": [2**62]})
+    assert df.to_sql(name="test_bigint", con=conn, index=False) == 1
+    result = sql.read_sql_table("test_bigint", conn)
+
+    tm.assert_frame_equal(df, result)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_types)
+def test_default_date_load(conn, request):
+    conn_name = conn
+    if conn_name == "sqlite_str":
+        pytest.skip("types tables not created in sqlite_str fixture")
+    elif "sqlite" in conn_name:
+        request.applymarker(
+            pytest.mark.xfail(reason="sqlite does not read date properly")
+        )
+
+    conn = request.getfixturevalue(conn)
+    df = sql.read_sql_table("types", conn)
+
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+
+@pytest.mark.parametrize("conn", postgresql_connectable)
+@pytest.mark.parametrize("parse_dates", [None, ["DateColWithTz"]])
+def test_datetime_with_timezone_query(conn, request, parse_dates):
+    # edge case that converts postgresql datetime with time zone types
+    # to datetime64[ns,psycopg2.tz.FixedOffsetTimezone..], which is ok
+    # but should be more natural, so coerce to datetime64[ns] for now
+    conn = request.getfixturevalue(conn)
+    expected = create_and_load_postgres_datetz(conn)
+
+    # GH11216
+    df = read_sql_query("select * from datetz", conn, parse_dates=parse_dates)
+    col = df.DateColWithTz
+    tm.assert_series_equal(col, expected)
+
+
+@pytest.mark.parametrize("conn", postgresql_connectable)
+def test_datetime_with_timezone_query_chunksize(conn, request):
+    conn = request.getfixturevalue(conn)
+    expected = create_and_load_postgres_datetz(conn)
+
+    df = concat(
+        list(read_sql_query("select * from datetz", conn, chunksize=1)),
+        ignore_index=True,
+    )
+    col = df.DateColWithTz
+    tm.assert_series_equal(col, expected)
+
+
+@pytest.mark.parametrize("conn", postgresql_connectable)
+def test_datetime_with_timezone_table(conn, request):
+    conn = request.getfixturevalue(conn)
+    expected = create_and_load_postgres_datetz(conn)
+    result = sql.read_sql_table("datetz", conn)
+
+    exp_frame = expected.to_frame()
+    tm.assert_frame_equal(result, exp_frame)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_datetime_with_timezone_roundtrip(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    # GH 9086
+    # Write datetimetz data to a db and read it back
+    # For dbs that support timestamps with timezones, should get back UTC
+    # otherwise naive data should be returned
+    expected = DataFrame(
+        {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")}
+    )
+    assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3
+
+    if "postgresql" in conn_name:
+        # SQLAlchemy "timezones" (i.e. offsets) are coerced to UTC
+        expected["A"] = expected["A"].dt.tz_convert("UTC")
+    else:
+        # Otherwise, timestamps are returned as local, naive
+        expected["A"] = expected["A"].dt.tz_localize(None)
+
+    result = sql.read_sql_table("test_datetime_tz", conn)
+    tm.assert_frame_equal(result, expected)
+
+    result = sql.read_sql_query("SELECT * FROM test_datetime_tz", conn)
+    if "sqlite" in conn_name:
+        # read_sql_query does not return datetime type like read_sql_table
+        assert isinstance(result.loc[0, "A"], str)
+        result["A"] = to_datetime(result["A"]).dt.as_unit("us")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_out_of_bounds_datetime(conn, request):
+    # GH 26761
+    conn = request.getfixturevalue(conn)
+    data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0])
+    assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1
+    result = sql.read_sql_table("test_datetime_obb", conn)
+    expected = DataFrame(
+        np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"]
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_naive_datetimeindex_roundtrip(conn, request):
+    # GH 23510
+    # Ensure that a naive DatetimeIndex isn't converted to UTC
+    conn = request.getfixturevalue(conn)
+    dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None)
+    expected = DataFrame({"nums": range(5)}, index=dates)
+    assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5
+    result = sql.read_sql_table("foo_table", conn, index_col="info_date")
+    # result index with gain a name from a set_index operation; expected
+    tm.assert_frame_equal(result, expected, check_names=False)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable_types)
+def test_date_parsing(conn, request):
+    # No Parsing
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = sql.read_sql_table("types", conn)
+    expected_type = object if "sqlite" in conn_name else np.datetime64
+    assert issubclass(df.DateCol.dtype.type, expected_type)
+
+    df = sql.read_sql_table("types", conn, parse_dates=["DateCol"])
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates={"DateCol": "%Y-%m-%d %H:%M:%S"})
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table(
+        "types",
+        conn,
+        parse_dates={"DateCol": {"format": "%Y-%m-%d %H:%M:%S"}},
+    )
+    assert issubclass(df.DateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates=["IntDateCol"])
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": "s"})
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+    df = sql.read_sql_table("types", conn, parse_dates={"IntDateCol": {"unit": "s"}})
+    assert issubclass(df.IntDateCol.dtype.type, np.datetime64)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_datetime(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame(
+        {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)}
+    )
+    assert df.to_sql(name="test_datetime", con=conn) == 3
+
+    # with read_table -> type information from schema used
+    result = sql.read_sql_table("test_datetime", conn)
+    result = result.drop("index", axis=1)
+
+    expected = df[:]
+    expected["A"] = expected["A"].astype("M8[us]")
+    tm.assert_frame_equal(result, expected)
+
+    # with read_sql -> no type information -> sqlite has no native
+    result = sql.read_sql_query("SELECT * FROM test_datetime", conn)
+    result = result.drop("index", axis=1)
+    if "sqlite" in conn_name:
+        assert isinstance(result.loc[0, "A"], str)
+        result["A"] = to_datetime(result["A"])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_datetime_NaT(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame(
+        {"A": date_range("2013-01-01 09:00:00", periods=3), "B": np.arange(3.0)}
+    )
+    df.loc[1, "A"] = np.nan
+    assert df.to_sql(name="test_datetime", con=conn, index=False) == 3
+
+    # with read_table -> type information from schema used
+    result = sql.read_sql_table("test_datetime", conn)
+    expected = df[:]
+    expected["A"] = expected["A"].astype("M8[us]")
+    tm.assert_frame_equal(result, expected)
+
+    # with read_sql -> no type information -> sqlite has no native
+    result = sql.read_sql_query("SELECT * FROM test_datetime", conn)
+    if "sqlite" in conn_name:
+        assert isinstance(result.loc[0, "A"], str)
+        result["A"] = to_datetime(result["A"], errors="coerce")
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_datetime_date(conn, request):
+    # test support for datetime.date
+    conn = request.getfixturevalue(conn)
+    df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"])
+    assert df.to_sql(name="test_date", con=conn, index=False) == 2
+    res = read_sql_table("test_date", conn)
+    result = res["a"]
+    expected = to_datetime(df["a"])
+    # comes back as datetime64
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_datetime_time(conn, request, sqlite_buildin):
+    # test support for datetime.time
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame([time(9, 0, 0), time(9, 1, 30)], columns=["a"])
+    assert df.to_sql(name="test_time", con=conn, index=False) == 2
+    res = read_sql_table("test_time", conn)
+    tm.assert_frame_equal(res, df)
+
+    # GH8341
+    # first, use the fallback to have the sqlite adapter put in place
+    sqlite_conn = sqlite_buildin
+    assert sql.to_sql(df, "test_time2", sqlite_conn, index=False) == 2
+    res = sql.read_sql_query("SELECT * FROM test_time2", sqlite_conn)
+    ref = df.map(lambda _: _.strftime("%H:%M:%S.%f"))
+    tm.assert_frame_equal(ref, res)  # check if adapter is in place
+    # then test if sqlalchemy is unaffected by the sqlite adapter
+    assert sql.to_sql(df, "test_time3", conn, index=False) == 2
+    if "sqlite" in conn_name:
+        res = sql.read_sql_query("SELECT * FROM test_time3", conn)
+        ref = df.map(lambda _: _.strftime("%H:%M:%S.%f"))
+        tm.assert_frame_equal(ref, res)
+    res = sql.read_sql_table("test_time3", conn)
+    tm.assert_frame_equal(df, res)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_mixed_dtype_insert(conn, request):
+    # see GH6509
+    conn = request.getfixturevalue(conn)
+    s1 = Series(2**25 + 1, dtype=np.int32)
+    s2 = Series(0.0, dtype=np.float32)
+    df = DataFrame({"s1": s1, "s2": s2})
+
+    # write and read again
+    assert df.to_sql(name="test_read_write", con=conn, index=False) == 1
+    df2 = sql.read_sql_table("test_read_write", conn)
+
+    tm.assert_frame_equal(df, df2, check_dtype=False, check_exact=True)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_nan_numeric(conn, request):
+    # NaNs in numeric float column
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "B": [0.2, np.nan, 5.6]})
+    assert df.to_sql(name="test_nan", con=conn, index=False) == 3
+
+    # with read_table
+    result = sql.read_sql_table("test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql
+    result = sql.read_sql_query("SELECT * FROM test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_nan_fullcolumn(conn, request):
+    # full NaN column (numeric float column)
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "B": [np.nan, np.nan, np.nan]})
+    assert df.to_sql(name="test_nan", con=conn, index=False) == 3
+
+    # with read_table
+    result = sql.read_sql_table("test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql -> not type info from table -> stays None
+    df["B"] = df["B"].astype("object")
+    df["B"] = None
+    result = sql.read_sql_query("SELECT * FROM test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_nan_string(conn, request):
+    # NaNs in string column
+    conn = request.getfixturevalue(conn)
+    df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", np.nan]})
+    assert df.to_sql(name="test_nan", con=conn, index=False) == 3
+
+    # NaNs are coming back as None
+    df.loc[2, "B"] = None
+
+    # with read_table
+    result = sql.read_sql_table("test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+    # with read_sql
+    result = sql.read_sql_query("SELECT * FROM test_nan", conn)
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_to_sql_save_index(conn, request):
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(
+                reason="ADBC implementation does not create index", strict=True
+            )
+        )
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    df = DataFrame.from_records(
+        [(1, 2.1, "line1"), (2, 1.5, "line2")], columns=["A", "B", "C"], index=["A"]
+    )
+
+    tbl_name = "test_to_sql_saves_index"
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(df, tbl_name) == 2
+
+    if conn_name in {"sqlite_buildin", "sqlite_str"}:
+        ixs = sql.read_sql_query(
+            "SELECT * FROM sqlite_master WHERE type = 'index' "
+            f"AND tbl_name = '{tbl_name}'",
+            conn,
+        )
+        ix_cols = []
+        for ix_name in ixs.name:
+            ix_info = sql.read_sql_query(f"PRAGMA index_info({ix_name})", conn)
+            ix_cols.append(ix_info.name.tolist())
+    else:
+        from sqlalchemy import inspect
+
+        insp = inspect(conn)
+
+        ixs = insp.get_indexes(tbl_name)
+        ix_cols = [i["column_names"] for i in ixs]
+
+    assert ix_cols == [["A"]]
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_transactions(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+
+    stmt = "CREATE TABLE test_trans (A INT, B TEXT)"
+    if conn_name != "sqlite_buildin" and "adbc" not in conn_name:
+        from sqlalchemy import text
+
+        stmt = text(stmt)
+
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction() as trans:
+            trans.execute(stmt)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_transaction_rollback(conn, request):
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction() as trans:
+            stmt = "CREATE TABLE test_trans (A INT, B TEXT)"
+            if "adbc" in conn_name or isinstance(pandasSQL, SQLiteDatabase):
+                trans.execute(stmt)
+            else:
+                from sqlalchemy import text
+
+                stmt = text(stmt)
+                trans.execute(stmt)
+
+        class DummyException(Exception):
+            pass
+
+        # Make sure when transaction is rolled back, no rows get inserted
+        ins_sql = "INSERT INTO test_trans (A,B) VALUES (1, 'blah')"
+        if isinstance(pandasSQL, SQLDatabase):
+            from sqlalchemy import text
+
+            ins_sql = text(ins_sql)
+        try:
+            with pandasSQL.run_transaction() as trans:
+                trans.execute(ins_sql)
+                raise DummyException("error")
+        except DummyException:
+            # ignore raised exception
+            pass
+        with pandasSQL.run_transaction():
+            res = pandasSQL.read_query("SELECT * FROM test_trans")
+        assert len(res) == 0
+
+        # Make sure when transaction is committed, rows do get inserted
+        with pandasSQL.run_transaction() as trans:
+            trans.execute(ins_sql)
+            res2 = pandasSQL.read_query("SELECT * FROM test_trans")
+        assert len(res2) == 1
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_get_schema_create_table(conn, request, test_frame3):
+    # Use a dataframe without a bool column, since MySQL converts bool to
+    # TINYINT (which read_sql_table returns as an int and causes a dtype
+    # mismatch)
+    if conn == "sqlite_str":
+        request.applymarker(
+            pytest.mark.xfail(reason="test does not support sqlite_str fixture")
+        )
+
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import text
+    from sqlalchemy.engine import Engine
+
+    tbl = "test_get_schema_create_table"
+    create_sql = sql.get_schema(test_frame3, tbl, con=conn)
+    blank_test_df = test_frame3.iloc[:0]
+
+    create_sql = text(create_sql)
+    if isinstance(conn, Engine):
+        with conn.connect() as newcon:
+            with newcon.begin():
+                newcon.execute(create_sql)
+    else:
+        conn.execute(create_sql)
+    returned_df = sql.read_sql_table(tbl, conn)
+    tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_dtype(conn, request):
+    if conn == "sqlite_str":
+        pytest.skip("sqlite_str has no inspection system")
+
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import (
+        TEXT,
+        String,
+    )
+    from sqlalchemy.schema import MetaData
+
+    cols = ["A", "B"]
+    data = [(0.8, True), (0.9, None)]
+    df = DataFrame(data, columns=cols)
+    assert df.to_sql(name="dtype_test", con=conn) == 2
+    assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": TEXT}) == 2
+    meta = MetaData()
+    meta.reflect(bind=conn)
+    sqltype = meta.tables["dtype_test2"].columns["B"].type
+    assert isinstance(sqltype, TEXT)
+    msg = "The type of B is not a SQLAlchemy type"
+    with pytest.raises(ValueError, match=msg):
+        df.to_sql(name="error", con=conn, dtype={"B": str})
+
+    # GH9083
+    assert df.to_sql(name="dtype_test3", con=conn, dtype={"B": String(10)}) == 2
+    meta.reflect(bind=conn)
+    sqltype = meta.tables["dtype_test3"].columns["B"].type
+    assert isinstance(sqltype, String)
+    assert sqltype.length == 10
+
+    # single dtype
+    assert df.to_sql(name="single_dtype_test", con=conn, dtype=TEXT) == 2
+    meta.reflect(bind=conn)
+    sqltypea = meta.tables["single_dtype_test"].columns["A"].type
+    sqltypeb = meta.tables["single_dtype_test"].columns["B"].type
+    assert isinstance(sqltypea, TEXT)
+    assert isinstance(sqltypeb, TEXT)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_notna_dtype(conn, request):
+    if conn == "sqlite_str":
+        pytest.skip("sqlite_str has no inspection system")
+
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import (
+        Boolean,
+        DateTime,
+        Float,
+        Integer,
+    )
+    from sqlalchemy.schema import MetaData
+
+    cols = {
+        "Bool": Series([True, None]),
+        "Date": Series([datetime(2012, 5, 1), None]),
+        "Int": Series([1, None], dtype="object"),
+        "Float": Series([1.1, None]),
+    }
+    df = DataFrame(cols)
+
+    tbl = "notna_dtype_test"
+    assert df.to_sql(name=tbl, con=conn) == 2
+    _ = sql.read_sql_table(tbl, conn)
+    meta = MetaData()
+    meta.reflect(bind=conn)
+    my_type = Integer if "mysql" in conn_name else Boolean
+    col_dict = meta.tables[tbl].columns
+    assert isinstance(col_dict["Bool"].type, my_type)
+    assert isinstance(col_dict["Date"].type, DateTime)
+    assert isinstance(col_dict["Int"].type, Integer)
+    assert isinstance(col_dict["Float"].type, Float)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_double_precision(conn, request):
+    if conn == "sqlite_str":
+        pytest.skip("sqlite_str has no inspection system")
+
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import (
+        BigInteger,
+        Float,
+        Integer,
+    )
+    from sqlalchemy.schema import MetaData
+
+    V = 1.23456789101112131415
+
+    df = DataFrame(
+        {
+            "f32": Series([V], dtype="float32"),
+            "f64": Series([V], dtype="float64"),
+            "f64_as_f32": Series([V], dtype="float64"),
+            "i32": Series([5], dtype="int32"),
+            "i64": Series([5], dtype="int64"),
+        }
+    )
+
+    assert (
+        df.to_sql(
+            name="test_dtypes",
+            con=conn,
+            index=False,
+            if_exists="replace",
+            dtype={"f64_as_f32": Float(precision=23)},
+        )
+        == 1
+    )
+    res = sql.read_sql_table("test_dtypes", conn)
+
+    # check precision of float64
+    assert np.round(df["f64"].iloc[0], 14) == np.round(res["f64"].iloc[0], 14)
+
+    # check sql types
+    meta = MetaData()
+    meta.reflect(bind=conn)
+    col_dict = meta.tables["test_dtypes"].columns
+    assert str(col_dict["f32"].type) == str(col_dict["f64_as_f32"].type)
+    assert isinstance(col_dict["f32"].type, Float)
+    assert isinstance(col_dict["f64"].type, Float)
+    assert isinstance(col_dict["i32"].type, Integer)
+    assert isinstance(col_dict["i64"].type, BigInteger)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_connectable_issue_example(conn, request):
+    conn = request.getfixturevalue(conn)
+
+    # This tests the example raised in issue
+    # https://github.com/pandas-dev/pandas/issues/10104
+    from sqlalchemy.engine import Engine
+
+    def test_select(connection):
+        query = "SELECT test_foo_data FROM test_foo_data"
+        return sql.read_sql_query(query, con=connection)
+
+    def test_append(connection, data):
+        data.to_sql(name="test_foo_data", con=connection, if_exists="append")
+
+    def test_connectable(conn):
+        # https://github.com/sqlalchemy/sqlalchemy/commit/
+        # 00b5c10846e800304caa86549ab9da373b42fa5d#r48323973
+        foo_data = test_select(conn)
+        test_append(conn, foo_data)
+
+    def main(connectable):
+        if isinstance(connectable, Engine):
+            with connectable.connect() as conn:
+                with conn.begin():
+                    test_connectable(conn)
+        else:
+            test_connectable(connectable)
+
+    assert (
+        DataFrame({"test_foo_data": [0, 1, 2]}).to_sql(name="test_foo_data", con=conn)
+        == 3
+    )
+    main(conn)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+@pytest.mark.parametrize(
+    "input",
+    [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}],
+)
+def test_to_sql_with_negative_npinf(conn, request, input):
+    # GH 34431
+
+    df = DataFrame(input)
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+
+    if "mysql" in conn_name:
+        # GH 36465
+        # The input {"foo": [-np.inf], "infe0": ["bar"]} does not raise any error
+        # for pymysql version >= 0.10
+        msg = "Execution failed on sql"
+        with pytest.raises(pd.errors.DatabaseError, match=msg):
+            df.to_sql(name="foobar", con=conn, index=False)
+    else:
+        assert df.to_sql(name="foobar", con=conn, index=False) == 1
+        res = sql.read_sql_table("foobar", conn)
+        tm.assert_equal(df, res)
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_temporary_table(conn, request):
+    if conn == "sqlite_str":
+        pytest.skip("test does not work with str connection")
+
+    conn = request.getfixturevalue(conn)
+
+    from sqlalchemy import (
+        Column,
+        Integer,
+        Unicode,
+        select,
+    )
+    from sqlalchemy.orm import (
+        Session,
+        declarative_base,
+    )
+
+    test_data = "Hello, World!"
+    expected = DataFrame({"spam": [test_data]})
+    Base = declarative_base()
+
+    class Temporary(Base):
+        __tablename__ = "temp_test"
+        __table_args__ = {"prefixes": ["TEMPORARY"]}
+        id = Column(Integer, primary_key=True)
+        spam = Column(Unicode(30), nullable=False)
+
+    with Session(conn) as session:
+        with session.begin():
+            conn = session.connection()
+            Temporary.__table__.create(conn)
+            session.add(Temporary(spam=test_data))
+            session.flush()
+            df = sql.read_sql_query(sql=select(Temporary.spam), con=conn)
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_invalid_engine(conn, request, test_frame1):
+    if conn == "sqlite_buildin" or "adbc" in conn:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="SQLiteDatabase/ADBCDatabase does not raise for bad engine"
+            )
+        )
+
+    conn = request.getfixturevalue(conn)
+    msg = "engine must be one of 'auto', 'sqlalchemy'"
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pytest.raises(ValueError, match=msg):
+            pandasSQL.to_sql(test_frame1, "test_frame1", engine="bad_engine")
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_to_sql_with_sql_engine(conn, request, test_frame1):
+    """`to_sql` with the `engine` param"""
+    # mostly copied from this class's `_to_sql()` method
+    conn = request.getfixturevalue(conn)
+    with pandasSQL_builder(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(test_frame1, "test_frame1", engine="auto") == 4
+            assert pandasSQL.has_table("test_frame1")
+
+    num_entries = len(test_frame1)
+    num_rows = count_rows(conn, "test_frame1")
+    assert num_rows == num_entries
+
+
+@pytest.mark.parametrize("conn", sqlalchemy_connectable)
+def test_options_sqlalchemy(conn, request, test_frame1):
+    # use the set option
+    conn = request.getfixturevalue(conn)
+    with pd.option_context("io.sql.engine", "sqlalchemy"):
+        with pandasSQL_builder(conn) as pandasSQL:
+            with pandasSQL.run_transaction():
+                assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4
+                assert pandasSQL.has_table("test_frame1")
+
+        num_entries = len(test_frame1)
+        num_rows = count_rows(conn, "test_frame1")
+        assert num_rows == num_entries
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_options_auto(conn, request, test_frame1):
+    # use the set option
+    conn = request.getfixturevalue(conn)
+    with pd.option_context("io.sql.engine", "auto"):
+        with pandasSQL_builder(conn) as pandasSQL:
+            with pandasSQL.run_transaction():
+                assert pandasSQL.to_sql(test_frame1, "test_frame1") == 4
+                assert pandasSQL.has_table("test_frame1")
+
+        num_entries = len(test_frame1)
+        num_rows = count_rows(conn, "test_frame1")
+        assert num_rows == num_entries
+
+
+def test_options_get_engine():
+    pytest.importorskip("sqlalchemy")
+    assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine)
+
+    with pd.option_context("io.sql.engine", "sqlalchemy"):
+        assert isinstance(get_engine("auto"), SQLAlchemyEngine)
+        assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine)
+
+    with pd.option_context("io.sql.engine", "auto"):
+        assert isinstance(get_engine("auto"), SQLAlchemyEngine)
+        assert isinstance(get_engine("sqlalchemy"), SQLAlchemyEngine)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"])
+def test_read_sql_dtype_backend(
+    conn,
+    request,
+    string_storage,
+    func,
+    dtype_backend,
+    dtype_backend_data,
+    dtype_backend_expected,
+):
+    # GH#50048
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    table = "test"
+    df = dtype_backend_data
+    df.to_sql(name=table, con=conn, index=False, if_exists="replace")
+
+    with pd.option_context("mode.string_storage", string_storage):
+        result = getattr(pd, func)(
+            f"Select * from {table}", conn, dtype_backend=dtype_backend
+        )
+        expected = dtype_backend_expected(string_storage, dtype_backend, conn_name)
+
+    tm.assert_frame_equal(result, expected)
+
+    if "adbc" in conn_name:
+        # adbc does not support chunksize argument
+        request.applymarker(
+            pytest.mark.xfail(reason="adbc does not support chunksize argument")
+        )
+
+    with pd.option_context("mode.string_storage", string_storage):
+        iterator = getattr(pd, func)(
+            f"Select * from {table}",
+            con=conn,
+            dtype_backend=dtype_backend,
+            chunksize=3,
+        )
+        expected = dtype_backend_expected(string_storage, dtype_backend, conn_name)
+        for result in iterator:
+            tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize("func", ["read_sql", "read_sql_table"])
+def test_read_sql_dtype_backend_table(
+    conn,
+    request,
+    string_storage,
+    func,
+    dtype_backend,
+    dtype_backend_data,
+    dtype_backend_expected,
+):
+    if "sqlite" in conn and "adbc" not in conn:
+        request.applymarker(
+            pytest.mark.xfail(
+                reason=(
+                    "SQLite actually returns proper boolean values via "
+                    "read_sql_table, but before pytest refactor was skipped"
+                )
+            )
+        )
+    # GH#50048
+    conn_name = conn
+    conn = request.getfixturevalue(conn)
+    table = "test"
+    df = dtype_backend_data
+    df.to_sql(name=table, con=conn, index=False, if_exists="replace")
+
+    with pd.option_context("mode.string_storage", string_storage):
+        result = getattr(pd, func)(table, conn, dtype_backend=dtype_backend)
+        expected = dtype_backend_expected(string_storage, dtype_backend, conn_name)
+    tm.assert_frame_equal(result, expected)
+
+    if "adbc" in conn_name:
+        # adbc does not support chunksize argument
+        return
+
+    with pd.option_context("mode.string_storage", string_storage):
+        iterator = getattr(pd, func)(
+            table,
+            conn,
+            dtype_backend=dtype_backend,
+            chunksize=3,
+        )
+        expected = dtype_backend_expected(string_storage, dtype_backend, conn_name)
+        for result in iterator:
+            tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize("func", ["read_sql", "read_sql_table", "read_sql_query"])
+def test_read_sql_invalid_dtype_backend_table(conn, request, func, dtype_backend_data):
+    conn = request.getfixturevalue(conn)
+    table = "test"
+    df = dtype_backend_data
+    df.to_sql(name=table, con=conn, index=False, if_exists="replace")
+
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    with pytest.raises(ValueError, match=msg):
+        getattr(pd, func)(table, conn, dtype_backend="numpy")
+
+
+@pytest.fixture
+def dtype_backend_data() -> DataFrame:
+    return DataFrame(
+        {
+            "a": Series([1, pd.NA, 3], dtype="Int64"),
+            "b": Series([1, 2, 3], dtype="Int64"),
+            "c": Series([1.5, pd.NA, 2.5], dtype="Float64"),
+            "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
+            "e": [True, False, None],
+            "f": [True, False, True],
+            "g": ["a", "b", "c"],
+            "h": ["a", "b", None],
+        }
+    )
+
+
+@pytest.fixture
+def dtype_backend_expected():
+    def func(string_storage, dtype_backend, conn_name) -> DataFrame:
+        string_dtype: pd.StringDtype | pd.ArrowDtype
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+            string_dtype = pd.ArrowDtype(pa.string())
+        else:
+            string_dtype = pd.StringDtype(string_storage)
+
+        df = DataFrame(
+            {
+                "a": Series([1, pd.NA, 3], dtype="Int64"),
+                "b": Series([1, 2, 3], dtype="Int64"),
+                "c": Series([1.5, pd.NA, 2.5], dtype="Float64"),
+                "d": Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": Series([True, False, pd.NA], dtype="boolean"),
+                "f": Series([True, False, True], dtype="boolean"),
+                "g": Series(["a", "b", "c"], dtype=string_dtype),
+                "h": Series(["a", "b", None], dtype=string_dtype),
+            }
+        )
+        if dtype_backend == "pyarrow":
+            pa = pytest.importorskip("pyarrow")
+
+            from pandas.arrays import ArrowExtensionArray
+
+            df = DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(df[col], from_pandas=True))
+                    for col in df.columns
+                }
+            )
+
+        if "mysql" in conn_name or "sqlite" in conn_name:
+            if dtype_backend == "numpy_nullable":
+                df = df.astype({"e": "Int64", "f": "Int64"})
+            else:
+                df = df.astype({"e": "int64[pyarrow]", "f": "int64[pyarrow]"})
+
+        return df
+
+    return func
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+def test_chunksize_empty_dtypes(conn, request):
+    # GH#50245
+    if "adbc" in conn:
+        request.node.add_marker(
+            pytest.mark.xfail(reason="chunksize argument NotImplemented with ADBC")
+        )
+    conn = request.getfixturevalue(conn)
+    dtypes = {"a": "int64", "b": "object"}
+    df = DataFrame(columns=["a", "b"]).astype(dtypes)
+    expected = df.copy()
+    df.to_sql(name="test", con=conn, index=False, if_exists="replace")
+
+    for result in read_sql_query(
+        "SELECT * FROM test",
+        conn,
+        dtype=dtypes,
+        chunksize=1,
+    ):
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("conn", all_connectable)
+@pytest.mark.parametrize("dtype_backend", [lib.no_default, "numpy_nullable"])
+@pytest.mark.parametrize("func", ["read_sql", "read_sql_query"])
+def test_read_sql_dtype(conn, request, func, dtype_backend):
+    # GH#50797
+    conn = request.getfixturevalue(conn)
+    table = "test"
+    df = DataFrame({"a": [1, 2, 3], "b": 5})
+    df.to_sql(name=table, con=conn, index=False, if_exists="replace")
+
+    result = getattr(pd, func)(
+        f"Select * from {table}",
+        conn,
+        dtype={"a": np.float64},
+        dtype_backend=dtype_backend,
+    )
+    expected = DataFrame(
+        {
+            "a": Series([1, 2, 3], dtype=np.float64),
+            "b": Series(
+                [5, 5, 5],
+                dtype="int64" if not dtype_backend == "numpy_nullable" else "Int64",
+            ),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_bigint_warning(sqlite_engine):
+    conn = sqlite_engine
+    # test no warning for BIGINT (to support int64) is raised (GH7433)
+    df = DataFrame({"a": [1, 2]}, dtype="int64")
+    assert df.to_sql(name="test_bigintwarning", con=conn, index=False) == 2
+
+    with tm.assert_produces_warning(None):
+        sql.read_sql_table("test_bigintwarning", conn)
+
+
+def test_valueerror_exception(sqlite_engine):
+    conn = sqlite_engine
+    df = DataFrame({"col1": [1, 2], "col2": [3, 4]})
+    with pytest.raises(ValueError, match="Empty table name specified"):
+        df.to_sql(name="", con=conn, if_exists="replace", index=False)
+
+
+def test_row_object_is_named_tuple(sqlite_engine):
+    conn = sqlite_engine
+    # GH 40682
+    # Test for the is_named_tuple() function
+    # Placed here due to its usage of sqlalchemy
+
+    from sqlalchemy import (
+        Column,
+        Integer,
+        String,
+    )
+    from sqlalchemy.orm import (
+        declarative_base,
+        sessionmaker,
+    )
+
+    BaseModel = declarative_base()
+
+    class Test(BaseModel):
+        __tablename__ = "test_frame"
+        id = Column(Integer, primary_key=True)
+        string_column = Column(String(50))
+
+    with conn.begin():
+        BaseModel.metadata.create_all(conn)
+    Session = sessionmaker(bind=conn)
+    with Session() as session:
+        df = DataFrame({"id": [0, 1], "string_column": ["hello", "world"]})
+        assert (
+            df.to_sql(name="test_frame", con=conn, index=False, if_exists="replace")
+            == 2
+        )
+        session.commit()
+        test_query = session.query(Test.id, Test.string_column)
+        df = DataFrame(test_query)
+
+    assert list(df.columns) == ["id", "string_column"]
+
+
+def test_read_sql_string_inference(sqlite_engine):
+    conn = sqlite_engine
+    # GH#54430
+    table = "test"
+    df = DataFrame({"a": ["x", "y"]})
+    df.to_sql(table, con=conn, index=False, if_exists="replace")
+
+    with pd.option_context("future.infer_string", True):
+        result = read_sql_table(table, conn)
+
+    dtype = pd.StringDtype(na_value=np.nan)
+    expected = DataFrame(
+        {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype)
+    )
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_roundtripping_datetimes(sqlite_engine):
+    conn = sqlite_engine
+    # GH#54877
+    df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]")
+    df.to_sql("test", conn, if_exists="replace", index=False)
+    result = pd.read_sql("select * from test", conn).iloc[0, 0]
+    assert result == "2020-12-31 12:00:00.000000"
+
+
+@pytest.fixture
+def sqlite_builtin_detect_types():
+    with contextlib.closing(
+        sqlite3.connect(":memory:", detect_types=sqlite3.PARSE_DECLTYPES)
+    ) as closing_conn:
+        with closing_conn as conn:
+            yield conn
+
+
+def test_roundtripping_datetimes_detect_types(sqlite_builtin_detect_types):
+    # https://github.com/pandas-dev/pandas/issues/55554
+    conn = sqlite_builtin_detect_types
+    df = DataFrame({"t": [datetime(2020, 12, 31, 12)]}, dtype="datetime64[ns]")
+    df.to_sql("test", conn, if_exists="replace", index=False)
+    result = pd.read_sql("select * from test", conn).iloc[0, 0]
+    assert result == Timestamp("2020-12-31 12:00:00.000000")
+
+
+@pytest.mark.db
+def test_psycopg2_schema_support(postgresql_psycopg2_engine):
+    conn = postgresql_psycopg2_engine
+
+    # only test this for postgresql (schema's not supported in
+    # mysql/sqlite)
+    df = DataFrame({"col1": [1, 2], "col2": [0.1, 0.2], "col3": ["a", "n"]})
+
+    # create a schema
+    with conn.connect() as con:
+        with con.begin():
+            con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;")
+            con.exec_driver_sql("CREATE SCHEMA other;")
+
+    # write dataframe to different schema's
+    assert df.to_sql(name="test_schema_public", con=conn, index=False) == 2
+    assert (
+        df.to_sql(
+            name="test_schema_public_explicit",
+            con=conn,
+            index=False,
+            schema="public",
+        )
+        == 2
+    )
+    assert (
+        df.to_sql(name="test_schema_other", con=conn, index=False, schema="other") == 2
+    )
+
+    # read dataframes back in
+    res1 = sql.read_sql_table("test_schema_public", conn)
+    tm.assert_frame_equal(df, res1)
+    res2 = sql.read_sql_table("test_schema_public_explicit", conn)
+    tm.assert_frame_equal(df, res2)
+    res3 = sql.read_sql_table("test_schema_public_explicit", conn, schema="public")
+    tm.assert_frame_equal(df, res3)
+    res4 = sql.read_sql_table("test_schema_other", conn, schema="other")
+    tm.assert_frame_equal(df, res4)
+    msg = "Table test_schema_other not found"
+    with pytest.raises(ValueError, match=msg):
+        sql.read_sql_table("test_schema_other", conn, schema="public")
+
+    # different if_exists options
+
+    # create a schema
+    with conn.connect() as con:
+        with con.begin():
+            con.exec_driver_sql("DROP SCHEMA IF EXISTS other CASCADE;")
+            con.exec_driver_sql("CREATE SCHEMA other;")
+
+    # write dataframe with different if_exists options
+    assert (
+        df.to_sql(name="test_schema_other", con=conn, schema="other", index=False) == 2
+    )
+    df.to_sql(
+        name="test_schema_other",
+        con=conn,
+        schema="other",
+        index=False,
+        if_exists="replace",
+    )
+    assert (
+        df.to_sql(
+            name="test_schema_other",
+            con=conn,
+            schema="other",
+            index=False,
+            if_exists="append",
+        )
+        == 2
+    )
+    res = sql.read_sql_table("test_schema_other", conn, schema="other")
+    tm.assert_frame_equal(concat([df, df], ignore_index=True), res)
+
+
+@pytest.mark.db
+def test_self_join_date_columns(postgresql_psycopg2_engine):
+    # GH 44421
+    conn = postgresql_psycopg2_engine
+    from sqlalchemy.sql import text
+
+    create_table = text(
+        """
+    CREATE TABLE person
+    (
+        id serial constraint person_pkey primary key,
+        created_dt timestamp with time zone
+    );
+
+    INSERT INTO person
+        VALUES (1, '2021-01-01T00:00:00Z');
+    """
+    )
+    with conn.connect() as con:
+        with con.begin():
+            con.execute(create_table)
+
+    sql_query = (
+        'SELECT * FROM "person" AS p1 INNER JOIN "person" AS p2 ON p1.id = p2.id;'
+    )
+    result = pd.read_sql(sql_query, conn)
+    expected = DataFrame(
+        [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2
+    )
+    expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]")
+    tm.assert_frame_equal(result, expected)
+
+    # Cleanup
+    with sql.SQLDatabase(conn, need_transaction=True) as pandasSQL:
+        pandasSQL.drop_table("person")
+
+
+def test_create_and_drop_table(sqlite_engine):
+    conn = sqlite_engine
+    temp_frame = DataFrame({"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]})
+    with sql.SQLDatabase(conn) as pandasSQL:
+        with pandasSQL.run_transaction():
+            assert pandasSQL.to_sql(temp_frame, "drop_test_frame") == 4
+
+        assert pandasSQL.has_table("drop_test_frame")
+
+        with pandasSQL.run_transaction():
+            pandasSQL.drop_table("drop_test_frame")
+
+        assert not pandasSQL.has_table("drop_test_frame")
+
+
+def test_sqlite_datetime_date(sqlite_buildin):
+    conn = sqlite_buildin
+    df = DataFrame([date(2014, 1, 1), date(2014, 1, 2)], columns=["a"])
+    assert df.to_sql(name="test_date", con=conn, index=False) == 2
+    res = read_sql_query("SELECT * FROM test_date", conn)
+    # comes back as strings
+    tm.assert_frame_equal(res, df.astype(str))
+
+
+@pytest.mark.parametrize("tz_aware", [False, True])
+def test_sqlite_datetime_time(tz_aware, sqlite_buildin):
+    conn = sqlite_buildin
+    # test support for datetime.time, GH #8341
+    if not tz_aware:
+        tz_times = [time(9, 0, 0), time(9, 1, 30)]
+    else:
+        tz_dt = date_range("2013-01-01 09:00:00", periods=2, tz="US/Pacific")
+        tz_times = Series(tz_dt.to_pydatetime()).map(lambda dt: dt.timetz())
+
+    df = DataFrame(tz_times, columns=["a"])
+
+    assert df.to_sql(name="test_time", con=conn, index=False) == 2
+    res = read_sql_query("SELECT * FROM test_time", conn)
+    # comes back as strings
+    expected = df.map(lambda _: _.strftime("%H:%M:%S.%f"))
+    tm.assert_frame_equal(res, expected)
+
+
+def get_sqlite_column_type(conn, table, column):
+    recs = conn.execute(f"PRAGMA table_info({table})")
+    for cid, name, ctype, not_null, default, pk in recs:
+        if name == column:
+            return ctype
+    raise ValueError(f"Table {table}, column {column} not found")
+
+
+def test_sqlite_test_dtype(sqlite_buildin):
+    conn = sqlite_buildin
+    cols = ["A", "B"]
+    data = [(0.8, True), (0.9, None)]
+    df = DataFrame(data, columns=cols)
+    assert df.to_sql(name="dtype_test", con=conn) == 2
+    assert df.to_sql(name="dtype_test2", con=conn, dtype={"B": "STRING"}) == 2
+
+    # sqlite stores Boolean values as INTEGER
+    assert get_sqlite_column_type(conn, "dtype_test", "B") == "INTEGER"
+
+    assert get_sqlite_column_type(conn, "dtype_test2", "B") == "STRING"
+    msg = r"B \(<class 'bool'>\) not a string"
+    with pytest.raises(ValueError, match=msg):
+        df.to_sql(name="error", con=conn, dtype={"B": bool})
+
+    # single dtype
+    assert df.to_sql(name="single_dtype_test", con=conn, dtype="STRING") == 2
+    assert get_sqlite_column_type(conn, "single_dtype_test", "A") == "STRING"
+    assert get_sqlite_column_type(conn, "single_dtype_test", "B") == "STRING"
+
+
+def test_sqlite_notna_dtype(sqlite_buildin):
+    conn = sqlite_buildin
+    cols = {
+        "Bool": Series([True, None]),
+        "Date": Series([datetime(2012, 5, 1), None]),
+        "Int": Series([1, None], dtype="object"),
+        "Float": Series([1.1, None]),
+    }
+    df = DataFrame(cols)
+
+    tbl = "notna_dtype_test"
+    assert df.to_sql(name=tbl, con=conn) == 2
+
+    assert get_sqlite_column_type(conn, tbl, "Bool") == "INTEGER"
+    assert get_sqlite_column_type(conn, tbl, "Date") == "TIMESTAMP"
+    assert get_sqlite_column_type(conn, tbl, "Int") == "INTEGER"
+    assert get_sqlite_column_type(conn, tbl, "Float") == "REAL"
+
+
+def test_sqlite_illegal_names(sqlite_buildin):
+    # For sqlite, these should work fine
+    conn = sqlite_buildin
+    df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
+
+    msg = "Empty table or column name specified"
+    with pytest.raises(ValueError, match=msg):
+        df.to_sql(name="", con=conn)
+
+    for ndx, weird_name in enumerate(
+        [
+            "test_weird_name]",
+            "test_weird_name[",
+            "test_weird_name`",
+            'test_weird_name"',
+            "test_weird_name'",
+            "_b.test_weird_name_01-30",
+            '"_b.test_weird_name_01-30"',
+            "99beginswithnumber",
+            "12345",
+            "\xe9",
+        ]
+    ):
+        assert df.to_sql(name=weird_name, con=conn) == 2
+        sql.table_exists(weird_name, conn)
+
+        df2 = DataFrame([[1, 2], [3, 4]], columns=["a", weird_name])
+        c_tbl = f"test_weird_col_name{ndx:d}"
+        assert df2.to_sql(name=c_tbl, con=conn) == 2
+        sql.table_exists(c_tbl, conn)
+
+
+def format_query(sql, *args):
+    _formatters = {
+        datetime: "'{}'".format,
+        str: "'{}'".format,
+        np.str_: "'{}'".format,
+        bytes: "'{}'".format,
+        float: "{:.8f}".format,
+        int: "{:d}".format,
+        type(None): lambda x: "NULL",
+        np.float64: "{:.10f}".format,
+        bool: "'{!s}'".format,
+    }
+    processed_args = []
+    for arg in args:
+        if isinstance(arg, float) and isna(arg):
+            arg = None
+
+        formatter = _formatters[type(arg)]
+        processed_args.append(formatter(arg))
+
+    return sql % tuple(processed_args)
+
+
+def tquery(query, con=None):
+    """Replace removed sql.tquery function"""
+    with sql.pandasSQL_builder(con) as pandas_sql:
+        res = pandas_sql.execute(query).fetchall()
+    return None if res is None else list(res)
+
+
+def test_xsqlite_basic(sqlite_buildin):
+    frame = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD")),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    assert sql.to_sql(frame, name="test_table", con=sqlite_buildin, index=False) == 10
+    result = sql.read_sql("select * from test_table", sqlite_buildin)
+
+    # HACK! Change this once indexes are handled properly.
+    result.index = frame.index
+
+    expected = frame
+    tm.assert_frame_equal(result, frame)
+
+    frame["txt"] = ["a"] * len(frame)
+    frame2 = frame.copy()
+    new_idx = Index(np.arange(len(frame2)), dtype=np.int64) + 10
+    frame2["Idx"] = new_idx.copy()
+    assert sql.to_sql(frame2, name="test_table2", con=sqlite_buildin, index=False) == 10
+    result = sql.read_sql("select * from test_table2", sqlite_buildin, index_col="Idx")
+    expected = frame.copy()
+    expected.index = new_idx
+    expected.index.name = "Idx"
+    tm.assert_frame_equal(expected, result)
+
+
+def test_xsqlite_write_row_by_row(sqlite_buildin):
+    frame = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD")),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    frame.iloc[0, 0] = np.nan
+    create_sql = sql.get_schema(frame, "test")
+    cur = sqlite_buildin.cursor()
+    cur.execute(create_sql)
+
+    ins = "INSERT INTO test VALUES (%s, %s, %s, %s)"
+    for _, row in frame.iterrows():
+        fmt_sql = format_query(ins, *row)
+        tquery(fmt_sql, con=sqlite_buildin)
+
+    sqlite_buildin.commit()
+
+    result = sql.read_sql("select * from test", con=sqlite_buildin)
+    result.index = frame.index
+    tm.assert_frame_equal(result, frame, rtol=1e-3)
+
+
+def test_xsqlite_execute(sqlite_buildin):
+    frame = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD")),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    create_sql = sql.get_schema(frame, "test")
+    cur = sqlite_buildin.cursor()
+    cur.execute(create_sql)
+    ins = "INSERT INTO test VALUES (?, ?, ?, ?)"
+
+    row = frame.iloc[0]
+    with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql:
+        pandas_sql.execute(ins, tuple(row))
+    sqlite_buildin.commit()
+
+    result = sql.read_sql("select * from test", sqlite_buildin)
+    result.index = frame.index[:1]
+    tm.assert_frame_equal(result, frame[:1])
+
+
+def test_xsqlite_schema(sqlite_buildin):
+    frame = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD")),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    create_sql = sql.get_schema(frame, "test")
+    lines = create_sql.splitlines()
+    for line in lines:
+        tokens = line.split(" ")
+        if len(tokens) == 2 and tokens[0] == "A":
+            assert tokens[1] == "DATETIME"
+
+    create_sql = sql.get_schema(frame, "test", keys=["A", "B"])
+    lines = create_sql.splitlines()
+    assert 'PRIMARY KEY ("A", "B")' in create_sql
+    cur = sqlite_buildin.cursor()
+    cur.execute(create_sql)
+
+
+def test_xsqlite_execute_fail(sqlite_buildin):
+    create_sql = """
+    CREATE TABLE test
+    (
+    a TEXT,
+    b TEXT,
+    c REAL,
+    PRIMARY KEY (a, b)
+    );
+    """
+    cur = sqlite_buildin.cursor()
+    cur.execute(create_sql)
+
+    with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql:
+        pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)")
+        pandas_sql.execute("INSERT INTO test VALUES('foo', 'baz', 2.567)")
+
+        with pytest.raises(sql.DatabaseError, match="Execution failed on sql"):
+            pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 7)")
+
+
+def test_xsqlite_execute_closed_connection():
+    create_sql = """
+    CREATE TABLE test
+    (
+    a TEXT,
+    b TEXT,
+    c REAL,
+    PRIMARY KEY (a, b)
+    );
+    """
+    with contextlib.closing(sqlite3.connect(":memory:")) as conn:
+        cur = conn.cursor()
+        cur.execute(create_sql)
+
+        with sql.pandasSQL_builder(conn) as pandas_sql:
+            pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)")
+
+    msg = "Cannot operate on a closed database."
+    with pytest.raises(sqlite3.ProgrammingError, match=msg):
+        tquery("select * from test", con=conn)
+
+
+def test_xsqlite_keyword_as_column_names(sqlite_buildin):
+    df = DataFrame({"From": np.ones(5)})
+    assert sql.to_sql(df, con=sqlite_buildin, name="testkeywords", index=False) == 5
+
+
+def test_xsqlite_onecolumn_of_integer(sqlite_buildin):
+    # GH 3628
+    # a column_of_integers dataframe should transfer well to sql
+
+    mono_df = DataFrame([1, 2], columns=["c0"])
+    assert sql.to_sql(mono_df, con=sqlite_buildin, name="mono_df", index=False) == 2
+    # computing the sum via sql
+    con_x = sqlite_buildin
+    the_sum = sum(my_c0[0] for my_c0 in con_x.execute("select * from mono_df"))
+    # it should not fail, and gives 3 ( Issue #3628 )
+    assert the_sum == 3
+
+    result = sql.read_sql("select * from mono_df", con_x)
+    tm.assert_frame_equal(result, mono_df)
+
+
+def test_xsqlite_if_exists(sqlite_buildin):
+    df_if_exists_1 = DataFrame({"col1": [1, 2], "col2": ["A", "B"]})
+    df_if_exists_2 = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"]})
+    table_name = "table_if_exists"
+    sql_select = f"SELECT * FROM {table_name}"
+
+    msg = "'notvalidvalue' is not valid for if_exists"
+    with pytest.raises(ValueError, match=msg):
+        sql.to_sql(
+            frame=df_if_exists_1,
+            con=sqlite_buildin,
+            name=table_name,
+            if_exists="notvalidvalue",
+        )
+    drop_table(table_name, sqlite_buildin)
+
+    # test if_exists='fail'
+    sql.to_sql(
+        frame=df_if_exists_1, con=sqlite_buildin, name=table_name, if_exists="fail"
+    )
+    msg = "Table 'table_if_exists' already exists"
+    with pytest.raises(ValueError, match=msg):
+        sql.to_sql(
+            frame=df_if_exists_1,
+            con=sqlite_buildin,
+            name=table_name,
+            if_exists="fail",
+        )
+    # test if_exists='replace'
+    sql.to_sql(
+        frame=df_if_exists_1,
+        con=sqlite_buildin,
+        name=table_name,
+        if_exists="replace",
+        index=False,
+    )
+    assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")]
+    assert (
+        sql.to_sql(
+            frame=df_if_exists_2,
+            con=sqlite_buildin,
+            name=table_name,
+            if_exists="replace",
+            index=False,
+        )
+        == 3
+    )
+    assert tquery(sql_select, con=sqlite_buildin) == [(3, "C"), (4, "D"), (5, "E")]
+    drop_table(table_name, sqlite_buildin)
+
+    # test if_exists='append'
+    assert (
+        sql.to_sql(
+            frame=df_if_exists_1,
+            con=sqlite_buildin,
+            name=table_name,
+            if_exists="fail",
+            index=False,
+        )
+        == 2
+    )
+    assert tquery(sql_select, con=sqlite_buildin) == [(1, "A"), (2, "B")]
+    assert (
+        sql.to_sql(
+            frame=df_if_exists_2,
+            con=sqlite_buildin,
+            name=table_name,
+            if_exists="append",
+            index=False,
+        )
+        == 3
+    )
+    assert tquery(sql_select, con=sqlite_buildin) == [
+        (1, "A"),
+        (2, "B"),
+        (3, "C"),
+        (4, "D"),
+        (5, "E"),
+    ]
+    drop_table(table_name, sqlite_buildin)
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
new file mode 100644
index 0000000000000000000000000000000000000000..f69ec1f6105605f80eadccd0072c7c0683158b49
--- /dev/null
+++ b/pandas/tests/io/test_stata.py
@@ -0,0 +1,2624 @@
+import bz2
+import datetime as dt
+from datetime import datetime
+import gzip
+import io
+import itertools
+import os
+import string
+import struct
+import tarfile
+import zipfile
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import CategoricalDtype
+import pandas._testing as tm
+from pandas.core.frame import (
+    DataFrame,
+    Series,
+)
+
+from pandas.io.parsers import read_csv
+from pandas.io.stata import (
+    CategoricalConversionWarning,
+    InvalidColumnName,
+    PossiblePrecisionLoss,
+    StataMissingValue,
+    StataReader,
+    StataWriter,
+    StataWriterUTF8,
+    ValueLabelTypeMismatch,
+    read_stata,
+)
+
+
+@pytest.fixture
+def mixed_frame():
+    return DataFrame(
+        {
+            "a": [1, 2, 3, 4],
+            "b": [1.0, 3.0, 27.0, 81.0],
+            "c": ["Atlanta", "Birmingham", "Cincinnati", "Detroit"],
+        }
+    )
+
+
+@pytest.fixture
+def parsed_114(datapath):
+    dta14_114 = datapath("io", "data", "stata", "stata5_114.dta")
+    parsed_114 = read_stata(dta14_114, convert_dates=True)
+    parsed_114.index.name = "index"
+    return parsed_114
+
+
+class TestStata:
+    def read_dta(self, file):
+        # Legacy default reader configuration
+        return read_stata(file, convert_dates=True)
+
+    def read_csv(self, file):
+        return read_csv(file, parse_dates=True)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_read_empty_dta(self, version, temp_file):
+        empty_ds = DataFrame(columns=["unit"])
+        # GH 7369, make sure can read a 0-obs dta file
+        path = temp_file
+        empty_ds.to_stata(path, write_index=False, version=version)
+        empty_ds2 = read_stata(path)
+        tm.assert_frame_equal(empty_ds, empty_ds2)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_read_empty_dta_with_dtypes(self, version, temp_file):
+        # GH 46240
+        # Fixing above bug revealed that types are not correctly preserved when
+        # writing empty DataFrames
+        empty_df_typed = DataFrame(
+            {
+                "i8": np.array([0], dtype=np.int8),
+                "i16": np.array([0], dtype=np.int16),
+                "i32": np.array([0], dtype=np.int32),
+                "i64": np.array([0], dtype=np.int64),
+                "u8": np.array([0], dtype=np.uint8),
+                "u16": np.array([0], dtype=np.uint16),
+                "u32": np.array([0], dtype=np.uint32),
+                "u64": np.array([0], dtype=np.uint64),
+                "f32": np.array([0], dtype=np.float32),
+                "f64": np.array([0], dtype=np.float64),
+            }
+        )
+        # GH 7369, make sure can read a 0-obs dta file
+        path = temp_file
+        empty_df_typed.to_stata(path, write_index=False, version=version)
+        empty_reread = read_stata(path)
+
+        expected = empty_df_typed
+        # No uint# support. Downcast since values in range for int#
+        expected["u8"] = expected["u8"].astype(np.int8)
+        expected["u16"] = expected["u16"].astype(np.int16)
+        expected["u32"] = expected["u32"].astype(np.int32)
+        # No int64 supported at all. Downcast since values in range for int32
+        expected["u64"] = expected["u64"].astype(np.int32)
+        expected["i64"] = expected["i64"].astype(np.int32)
+
+        tm.assert_frame_equal(expected, empty_reread)
+        tm.assert_series_equal(expected.dtypes, empty_reread.dtypes)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_read_index_col_none(self, version, temp_file):
+        df = DataFrame({"a": range(5), "b": ["b1", "b2", "b3", "b4", "b5"]})
+        # GH 7369, make sure can read a 0-obs dta file
+        path = temp_file
+        df.to_stata(path, write_index=False, version=version)
+        read_df = read_stata(path)
+
+        assert isinstance(read_df.index, pd.RangeIndex)
+        expected = df
+        expected["a"] = expected["a"].astype(np.int32)
+        tm.assert_frame_equal(read_df, expected, check_index_type=True)
+
+    @pytest.mark.parametrize(
+        "version", [102, 103, 104, 105, 108, 110, 111, 113, 114, 115, 117, 118, 119]
+    )
+    def test_read_dta1(self, version, datapath):
+        file = datapath("io", "data", "stata", f"stata1_{version}.dta")
+        parsed = self.read_dta(file)
+
+        # Pandas uses np.nan as missing value.
+        # Thus, all columns will be of type float, regardless of their name.
+        expected = DataFrame(
+            [(np.nan, np.nan, np.nan, np.nan, np.nan)],
+            columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"],
+        )
+
+        # this is an oddity as really the nan should be float64, but
+        # the casting doesn't fail so need to match stata here
+        expected["float_miss"] = expected["float_miss"].astype(np.float32)
+
+        # Column names too long for older Stata formats
+        if version <= 108:
+            expected = expected.rename(
+                columns={
+                    "float_miss": "f_miss",
+                    "double_miss": "d_miss",
+                    "byte_miss": "b_miss",
+                    "int_miss": "i_miss",
+                    "long_miss": "l_miss",
+                }
+            )
+
+        tm.assert_frame_equal(parsed, expected)
+
+    def test_read_dta2(self, datapath):
+        expected = DataFrame.from_records(
+            [
+                (
+                    datetime(2006, 11, 19, 23, 13, 20),
+                    1479596223000,
+                    datetime(2010, 1, 20),
+                    datetime(2010, 1, 8),
+                    datetime(2010, 1, 1),
+                    datetime(1974, 7, 1),
+                    datetime(2010, 1, 1),
+                    datetime(2010, 1, 1),
+                ),
+                (
+                    datetime(1959, 12, 31, 20, 3, 20),
+                    -1479590,
+                    datetime(1953, 10, 2),
+                    datetime(1948, 6, 10),
+                    datetime(1955, 1, 1),
+                    datetime(1955, 7, 1),
+                    datetime(1955, 1, 1),
+                    datetime(2, 1, 1),
+                ),
+                (pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT, pd.NaT),
+            ],
+            columns=[
+                "datetime_c",
+                "datetime_big_c",
+                "date",
+                "weekly_date",
+                "monthly_date",
+                "quarterly_date",
+                "half_yearly_date",
+                "yearly_date",
+            ],
+        )
+        # TODO(GH#55564): just pass M8[s] to the constructor
+        expected["datetime_c"] = expected["datetime_c"].astype("M8[ms]")
+        expected["date"] = expected["date"].astype("M8[s]")
+        expected["weekly_date"] = expected["weekly_date"].astype("M8[s]")
+        expected["monthly_date"] = expected["monthly_date"].astype("M8[s]")
+        expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]")
+        expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]")
+        expected["yearly_date"] = expected["yearly_date"].astype("M8[s]")
+
+        path1 = datapath("io", "data", "stata", "stata2_114.dta")
+        path2 = datapath("io", "data", "stata", "stata2_115.dta")
+        path3 = datapath("io", "data", "stata", "stata2_117.dta")
+
+        msg = "Leaving in Stata Internal Format"
+        with tm.assert_produces_warning(UserWarning, match=msg):
+            parsed_114 = self.read_dta(path1)
+        with tm.assert_produces_warning(UserWarning, match=msg):
+            parsed_115 = self.read_dta(path2)
+        with tm.assert_produces_warning(UserWarning, match=msg):
+            parsed_117 = self.read_dta(path3)
+            # FIXME: don't leave commented-out
+            # 113 is buggy due to limits of date format support in Stata
+            # parsed_113 = self.read_dta(
+            # datapath("io", "data", "stata", "stata2_113.dta")
+            # )
+
+        # FIXME: don't leave commented-out
+        # buggy test because of the NaT comparison on certain platforms
+        # Format 113 test fails since it does not support tc and tC formats
+        # tm.assert_frame_equal(parsed_113, expected)
+        tm.assert_frame_equal(parsed_114, expected)
+        tm.assert_frame_equal(parsed_115, expected)
+        tm.assert_frame_equal(parsed_117, expected)
+
+    @pytest.mark.parametrize(
+        "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"]
+    )
+    def test_read_dta3(self, file, datapath):
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = self.read_dta(file)
+
+        # match stata here
+        expected = self.read_csv(datapath("io", "data", "stata", "stata3.csv"))
+        expected = expected.astype(np.float32)
+        expected["year"] = expected["year"].astype(np.int16)
+        expected["quarter"] = expected["quarter"].astype(np.int8)
+
+        tm.assert_frame_equal(parsed, expected)
+
+    @pytest.mark.parametrize("version", [110, 111, 113, 114, 115, 117])
+    def test_read_dta4(self, version, datapath):
+        file = datapath("io", "data", "stata", f"stata4_{version}.dta")
+        parsed = self.read_dta(file)
+
+        expected = DataFrame.from_records(
+            [
+                ["one", "ten", "one", "one", "one"],
+                ["two", "nine", "two", "two", "two"],
+                ["three", "eight", "three", "three", "three"],
+                ["four", "seven", 4, "four", "four"],
+                ["five", "six", 5, np.nan, "five"],
+                ["six", "five", 6, np.nan, "six"],
+                ["seven", "four", 7, np.nan, "seven"],
+                ["eight", "three", 8, np.nan, "eight"],
+                ["nine", "two", 9, np.nan, "nine"],
+                ["ten", "one", "ten", np.nan, "ten"],
+            ],
+            columns=[
+                "fully_labeled",
+                "fully_labeled2",
+                "incompletely_labeled",
+                "labeled_with_missings",
+                "float_labelled",
+            ],
+        )
+
+        # these are all categoricals
+        for col in expected:
+            orig = expected[col].copy()
+
+            categories = np.asarray(expected["fully_labeled"][orig.notna()])
+            if col == "incompletely_labeled":
+                categories = orig
+
+            cat = orig.astype("category")._values
+            cat = cat.set_categories(categories, ordered=True)
+            cat.categories.rename(None, inplace=True)
+
+            expected[col] = cat
+
+        # stata doesn't save .category metadata
+        tm.assert_frame_equal(parsed, expected)
+
+    @pytest.mark.parametrize("version", [102, 103, 104, 105, 108])
+    def test_readold_dta4(self, version, datapath):
+        # This test is the same as test_read_dta4 above except that the columns
+        # had to be renamed to match the restrictions in older file format
+        file = datapath("io", "data", "stata", f"stata4_{version}.dta")
+        parsed = self.read_dta(file)
+
+        expected = DataFrame.from_records(
+            [
+                ["one", "ten", "one", "one", "one"],
+                ["two", "nine", "two", "two", "two"],
+                ["three", "eight", "three", "three", "three"],
+                ["four", "seven", 4, "four", "four"],
+                ["five", "six", 5, np.nan, "five"],
+                ["six", "five", 6, np.nan, "six"],
+                ["seven", "four", 7, np.nan, "seven"],
+                ["eight", "three", 8, np.nan, "eight"],
+                ["nine", "two", 9, np.nan, "nine"],
+                ["ten", "one", "ten", np.nan, "ten"],
+            ],
+            columns=[
+                "fulllab",
+                "fulllab2",
+                "incmplab",
+                "misslab",
+                "floatlab",
+            ],
+        )
+
+        # these are all categoricals
+        for col in expected:
+            orig = expected[col].copy()
+
+            categories = np.asarray(expected["fulllab"][orig.notna()])
+            if col == "incmplab":
+                categories = orig
+
+            cat = orig.astype("category")._values
+            cat = cat.set_categories(categories, ordered=True)
+            cat.categories.rename(None, inplace=True)
+
+            expected[col] = cat
+
+        # stata doesn't save .category metadata
+        tm.assert_frame_equal(parsed, expected)
+
+    # File containing strls
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata12_117",
+            "stata12_be_117",
+            "stata12_118",
+            "stata12_be_118",
+            "stata12_119",
+            "stata12_be_119",
+        ],
+    )
+    def test_read_dta_strl(self, file, datapath):
+        parsed = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
+        expected = DataFrame.from_records(
+            [
+                [1, "abc", "abcdefghi"],
+                [3, "cba", "qwertywertyqwerty"],
+                [93, "", "strl"],
+            ],
+            columns=["x", "y", "z"],
+        )
+
+        tm.assert_frame_equal(parsed, expected, check_dtype=False)
+
+    # 117 is not included in this list as it uses ASCII strings
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata14_118",
+            "stata14_be_118",
+            "stata14_119",
+            "stata14_be_119",
+        ],
+    )
+    def test_read_dta118_119(self, file, datapath):
+        parsed_118 = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
+        parsed_118["Bytes"] = parsed_118["Bytes"].astype("O")
+        expected = DataFrame.from_records(
+            [
+                ["Cat", "Bogota", "Bogotá", 1, 1.0, "option b Ünicode", 1.0],
+                ["Dog", "Boston", "Uzunköprü", np.nan, np.nan, np.nan, np.nan],
+                ["Plane", "Rome", "Tromsø", 0, 0.0, "option a", 0.0],
+                ["Potato", "Tokyo", "Elâzığ", -4, 4.0, 4, 4],  # noqa: RUF001
+                ["", "", "", 0, 0.3332999, "option a", 1 / 3.0],
+            ],
+            columns=[
+                "Things",
+                "Cities",
+                "Unicode_Cities_Strl",
+                "Ints",
+                "Floats",
+                "Bytes",
+                "Longs",
+            ],
+        )
+        expected["Floats"] = expected["Floats"].astype(np.float32)
+        for col in parsed_118.columns:
+            tm.assert_almost_equal(parsed_118[col], expected[col])
+
+        with StataReader(datapath("io", "data", "stata", f"{file}.dta")) as rdr:
+            vl = rdr.variable_labels()
+            vl_expected = {
+                "Unicode_Cities_Strl": "Here are some strls with Ünicode chars",
+                "Longs": "long data",
+                "Things": "Here are some things",
+                "Bytes": "byte data",
+                "Ints": "int data",
+                "Cities": "Here are some cities",
+                "Floats": "float data",
+            }
+            tm.assert_dict_equal(vl, vl_expected)
+
+            assert rdr.data_label == "This is a  Ünicode data label"
+
+    def test_read_write_dta5(self, temp_file):
+        original = DataFrame(
+            [(np.nan, np.nan, np.nan, np.nan, np.nan)],
+            columns=["float_miss", "double_miss", "byte_miss", "int_miss", "long_miss"],
+        )
+        original.index.name = "index"
+
+        path = temp_file
+        original.to_stata(path, convert_dates=None)
+        written_and_read_again = self.read_dta(path)
+
+        expected = original
+        expected.index = expected.index.astype(np.int32)
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    def test_write_dta6(self, datapath, temp_file):
+        original = self.read_csv(datapath("io", "data", "stata", "stata3.csv"))
+        original.index.name = "index"
+        original.index = original.index.astype(np.int32)
+        original["year"] = original["year"].astype(np.int32)
+        original["quarter"] = original["quarter"].astype(np.int32)
+
+        path = temp_file
+        original.to_stata(path, convert_dates=None)
+        written_and_read_again = self.read_dta(path)
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            original,
+            check_index_type=False,
+        )
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_read_write_dta10(self, version, temp_file, using_infer_string):
+        original = DataFrame(
+            data=[["string", "object", 1, 1.1, np.datetime64("2003-12-25")]],
+            columns=["string", "object", "integer", "floating", "datetime"],
+        )
+        original["object"] = Series(original["object"], dtype=object)
+        original.index.name = "index"
+        original.index = original.index.astype(np.int32)
+        original["integer"] = original["integer"].astype(np.int32)
+
+        path = temp_file
+        original.to_stata(path, convert_dates={"datetime": "tc"}, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        expected = original.copy()
+        # "tc" convert_dates means we store in ms
+        expected["datetime"] = expected["datetime"].astype("M8[ms]")
+        if using_infer_string:
+            expected["object"] = expected["object"].astype("str")
+
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            expected,
+        )
+
+    def test_stata_doc_examples(self, temp_file):
+        path = temp_file
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
+        )
+        df.to_stata(path)
+
+    def test_write_preserves_original(self, temp_file):
+        # 9795
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 4)), columns=list("abcd")
+        )
+        df.loc[2, "a":"c"] = np.nan
+        df_copy = df.copy()
+        path = temp_file
+        df.to_stata(path, write_index=False)
+        tm.assert_frame_equal(df, df_copy)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_encoding(self, version, datapath, temp_file):
+        # GH 4626, proper encoding handling
+        raw = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta"))
+        encoded = read_stata(datapath("io", "data", "stata", "stata1_encoding.dta"))
+        result = encoded.kreis1849[0]
+
+        expected = raw.kreis1849[0]
+        assert result == expected
+        assert isinstance(result, str)
+
+        path = temp_file
+        encoded.to_stata(path, write_index=False, version=version)
+        reread_encoded = read_stata(path)
+        tm.assert_frame_equal(encoded, reread_encoded)
+
+    def test_read_write_dta11(self, temp_file):
+        original = DataFrame(
+            [(1, 2, 3, 4)],
+            columns=[
+                "good",
+                "b\u00e4d",
+                "8number",
+                "astringwithmorethan32characters______",
+            ],
+        )
+        formatted = DataFrame(
+            [(1, 2, 3, 4)],
+            columns=["good", "b_d", "_8number", "astringwithmorethan32characters_"],
+        )
+        formatted.index.name = "index"
+        formatted = formatted.astype(np.int32)
+
+        path = temp_file
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path, convert_dates=None)
+
+        written_and_read_again = self.read_dta(path)
+
+        expected = formatted
+        expected.index = expected.index.astype(np.int32)
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_read_write_dta12(self, version, temp_file):
+        original = DataFrame(
+            [(1, 2, 3, 4, 5, 6)],
+            columns=[
+                "astringwithmorethan32characters_1",
+                "astringwithmorethan32characters_2",
+                "+",
+                "-",
+                "short",
+                "delete",
+            ],
+        )
+        formatted = DataFrame(
+            [(1, 2, 3, 4, 5, 6)],
+            columns=[
+                "astringwithmorethan32characters_",
+                "_0astringwithmorethan32character",
+                "_",
+                "_1_",
+                "_short",
+                "_delete",
+            ],
+        )
+        formatted.index.name = "index"
+        formatted = formatted.astype(np.int32)
+
+        path = temp_file
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path, convert_dates=None, version=version)
+            # should get a warning for that format.
+
+        written_and_read_again = self.read_dta(path)
+
+        expected = formatted
+        expected.index = expected.index.astype(np.int32)
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    def test_read_write_dta13(self, temp_file):
+        s1 = Series(2**9, dtype=np.int16)
+        s2 = Series(2**17, dtype=np.int32)
+        s3 = Series(2**33, dtype=np.int64)
+        original = DataFrame({"int16": s1, "int32": s2, "int64": s3})
+        original.index.name = "index"
+
+        formatted = original
+        formatted["int64"] = formatted["int64"].astype(np.float64)
+
+        path = temp_file
+        original.to_stata(path)
+        written_and_read_again = self.read_dta(path)
+
+        expected = formatted
+        expected.index = expected.index.astype(np.int32)
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    @pytest.mark.parametrize(
+        "file", ["stata5_113", "stata5_114", "stata5_115", "stata5_117"]
+    )
+    def test_read_write_reread_dta14(
+        self, file, parsed_114, version, datapath, temp_file
+    ):
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = self.read_dta(file)
+        parsed.index.name = "index"
+
+        tm.assert_frame_equal(parsed_114, parsed)
+
+        path = temp_file
+        parsed_114.to_stata(path, convert_dates={"date_td": "td"}, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        expected = parsed_114.copy()
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    @pytest.mark.parametrize(
+        "file", ["stata6_113", "stata6_114", "stata6_115", "stata6_117"]
+    )
+    def test_read_write_reread_dta15(self, file, datapath):
+        expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
+        expected["byte_"] = expected["byte_"].astype(np.int8)
+        expected["int_"] = expected["int_"].astype(np.int16)
+        expected["long_"] = expected["long_"].astype(np.int32)
+        expected["float_"] = expected["float_"].astype(np.float32)
+        expected["double_"] = expected["double_"].astype(np.float64)
+
+        # TODO(GH#55564): directly cast to M8[s]
+        arr = expected["date_td"].astype("Period[D]")._values.asfreq("s", how="S")
+        expected["date_td"] = arr.view("M8[s]")
+
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = self.read_dta(file)
+
+        tm.assert_frame_equal(expected, parsed)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_timestamp_and_label(self, version, temp_file):
+        original = DataFrame([(1,)], columns=["variable"])
+        time_stamp = datetime(2000, 2, 29, 14, 21)
+        data_label = "This is a data file."
+        path = temp_file
+        original.to_stata(
+            path, time_stamp=time_stamp, data_label=data_label, version=version
+        )
+
+        with StataReader(path) as reader:
+            assert reader.time_stamp == "29 Feb 2000 14:21"
+            assert reader.data_label == data_label
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_invalid_timestamp(self, version, temp_file):
+        original = DataFrame([(1,)], columns=["variable"])
+        time_stamp = "01 Jan 2000, 00:00:00"
+        path = temp_file
+        msg = "time_stamp should be datetime type"
+        with pytest.raises(ValueError, match=msg):
+            original.to_stata(path, time_stamp=time_stamp, version=version)
+        assert not os.path.isfile(path)
+
+    def test_numeric_column_names(self, temp_file):
+        original = DataFrame(np.reshape(np.arange(25.0), (5, 5)))
+        original.index.name = "index"
+        path = temp_file
+        # should get a warning for that format.
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path)
+
+        written_and_read_again = self.read_dta(path)
+
+        written_and_read_again = written_and_read_again.set_index("index")
+        columns = list(written_and_read_again.columns)
+        convert_col_name = lambda x: int(x[1])
+        written_and_read_again.columns = map(convert_col_name, columns)
+
+        expected = original
+        tm.assert_frame_equal(expected, written_and_read_again)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_nan_to_missing_value(self, version, temp_file):
+        s1 = Series(np.arange(4.0), dtype=np.float32)
+        s2 = Series(np.arange(4.0), dtype=np.float64)
+        s1[::2] = np.nan
+        s2[1::2] = np.nan
+        original = DataFrame({"s1": s1, "s2": s2})
+        original.index.name = "index"
+
+        path = temp_file
+        original.to_stata(path, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        written_and_read_again = written_and_read_again.set_index("index")
+        expected = original
+        tm.assert_frame_equal(written_and_read_again, expected)
+
+    def test_no_index(self, temp_file):
+        columns = ["x", "y"]
+        original = DataFrame(np.reshape(np.arange(10.0), (5, 2)), columns=columns)
+        original.index.name = "index_not_written"
+        path = temp_file
+        original.to_stata(path, write_index=False)
+        written_and_read_again = self.read_dta(path)
+        with pytest.raises(KeyError, match=original.index.name):
+            written_and_read_again["index_not_written"]
+
+    def test_string_no_dates(self, temp_file):
+        s1 = Series(["a", "A longer string"])
+        s2 = Series([1.0, 2.0], dtype=np.float64)
+        original = DataFrame({"s1": s1, "s2": s2})
+        original.index.name = "index"
+        path = temp_file
+        original.to_stata(path)
+        written_and_read_again = self.read_dta(path)
+
+        expected = original
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    def test_large_value_conversion(self, temp_file):
+        s0 = Series([1, 99], dtype=np.int8)
+        s1 = Series([1, 127], dtype=np.int8)
+        s2 = Series([1, 2**15 - 1], dtype=np.int16)
+        s3 = Series([1, 2**63 - 1], dtype=np.int64)
+        original = DataFrame({"s0": s0, "s1": s1, "s2": s2, "s3": s3})
+        original.index.name = "index"
+        path = temp_file
+        with tm.assert_produces_warning(PossiblePrecisionLoss, match="from int64 to"):
+            original.to_stata(path)
+
+        written_and_read_again = self.read_dta(path)
+
+        modified = original
+        modified["s1"] = Series(modified["s1"], dtype=np.int16)
+        modified["s2"] = Series(modified["s2"], dtype=np.int32)
+        modified["s3"] = Series(modified["s3"], dtype=np.float64)
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)
+
+    def test_dates_invalid_column(self, temp_file):
+        original = DataFrame([datetime(2006, 11, 19, 23, 13, 20)])
+        original.index.name = "index"
+        path = temp_file
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            original.to_stata(path, convert_dates={0: "tc"})
+
+        written_and_read_again = self.read_dta(path)
+
+        expected = original.copy()
+        expected.columns = ["_0"]
+        expected.index = original.index.astype(np.int32)
+        expected["_0"] = expected["_0"].astype("M8[ms]")
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    def test_105(self, datapath):
+        # Data obtained from:
+        # http://go.worldbank.org/ZXY29PVJ21
+        dpath = datapath("io", "data", "stata", "S4_EDUC1.dta")
+        df = read_stata(dpath)
+        df0 = [[1, 1, 3, -2], [2, 1, 2, -2], [4, 1, 1, -2]]
+        df0 = DataFrame(df0)
+        df0.columns = ["clustnum", "pri_schl", "psch_num", "psch_dis"]
+        df0["clustnum"] = df0["clustnum"].astype(np.int16)
+        df0["pri_schl"] = df0["pri_schl"].astype(np.int8)
+        df0["psch_num"] = df0["psch_num"].astype(np.int8)
+        df0["psch_dis"] = df0["psch_dis"].astype(np.float32)
+        tm.assert_frame_equal(df.head(3), df0)
+
+    def test_value_labels_old_format(self, datapath):
+        # GH 19417
+        #
+        # Test that value_labels() returns an empty dict if the file format
+        # predates supporting value labels.
+        dpath = datapath("io", "data", "stata", "S4_EDUC1.dta")
+        with StataReader(dpath) as reader:
+            assert reader.value_labels() == {}
+
+    def test_date_export_formats(self, temp_file):
+        columns = ["tc", "td", "tw", "tm", "tq", "th", "ty"]
+        conversions = {c: c for c in columns}
+        data = [datetime(2006, 11, 20, 23, 13, 20)] * len(columns)
+        original = DataFrame([data], columns=columns)
+        original.index.name = "index"
+        expected_values = [
+            datetime(2006, 11, 20, 23, 13, 20),  # Time
+            datetime(2006, 11, 20),  # Day
+            datetime(2006, 11, 19),  # Week
+            datetime(2006, 11, 1),  # Month
+            datetime(2006, 10, 1),  # Quarter year
+            datetime(2006, 7, 1),  # Half year
+            datetime(2006, 1, 1),
+        ]  # Year
+
+        expected = DataFrame(
+            [expected_values],
+            index=pd.Index([0], dtype=np.int32, name="index"),
+            columns=columns,
+            dtype="M8[s]",
+        )
+        expected["tc"] = expected["tc"].astype("M8[ms]")
+
+        path = temp_file
+        original.to_stata(path, convert_dates=conversions)
+        written_and_read_again = self.read_dta(path)
+
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    def test_write_missing_strings(self, temp_file):
+        original = DataFrame([["1"], [None]], columns=["foo"])
+
+        expected = DataFrame(
+            [["1"], [""]],
+            index=pd.RangeIndex(2, name="index"),
+            columns=["foo"],
+        )
+
+        path = temp_file
+        original.to_stata(path)
+        written_and_read_again = self.read_dta(path)
+
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    @pytest.mark.parametrize("byteorder", [">", "<"])
+    def test_bool_uint(self, byteorder, version, temp_file):
+        s0 = Series([0, 1, True], dtype=np.bool_)
+        s1 = Series([0, 1, 100], dtype=np.uint8)
+        s2 = Series([0, 1, 255], dtype=np.uint8)
+        s3 = Series([0, 1, 2**15 - 100], dtype=np.uint16)
+        s4 = Series([0, 1, 2**16 - 1], dtype=np.uint16)
+        s5 = Series([0, 1, 2**31 - 100], dtype=np.uint32)
+        s6 = Series([0, 1, 2**32 - 1], dtype=np.uint32)
+
+        original = DataFrame(
+            {"s0": s0, "s1": s1, "s2": s2, "s3": s3, "s4": s4, "s5": s5, "s6": s6}
+        )
+        original.index.name = "index"
+
+        path = temp_file
+        original.to_stata(path, byteorder=byteorder, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        written_and_read_again = written_and_read_again.set_index("index")
+
+        expected = original
+        expected_types = (
+            np.int8,
+            np.int8,
+            np.int16,
+            np.int16,
+            np.int32,
+            np.int32,
+            np.float64,
+        )
+        for c, t in zip(expected.columns, expected_types):
+            expected[c] = expected[c].astype(t)
+
+        tm.assert_frame_equal(written_and_read_again, expected)
+
+    def test_variable_labels(self, datapath):
+        with StataReader(datapath("io", "data", "stata", "stata7_115.dta")) as rdr:
+            sr_115 = rdr.variable_labels()
+        with StataReader(datapath("io", "data", "stata", "stata7_117.dta")) as rdr:
+            sr_117 = rdr.variable_labels()
+        keys = ("var1", "var2", "var3")
+        labels = ("label1", "label2", "label3")
+        for k, v in sr_115.items():
+            assert k in sr_117
+            assert v == sr_117[k]
+            assert k in keys
+            assert v in labels
+
+    def test_minimal_size_col(self, temp_file):
+        str_lens = (1, 100, 244)
+        s = {}
+        for str_len in str_lens:
+            s["s" + str(str_len)] = Series(
+                ["a" * str_len, "b" * str_len, "c" * str_len]
+            )
+        original = DataFrame(s)
+        path = temp_file
+        original.to_stata(path, write_index=False)
+
+        with StataReader(path) as sr:
+            sr._ensure_open()  # The `_*list` variables are initialized here
+            for variable, fmt, typ in zip(sr._varlist, sr._fmtlist, sr._typlist):
+                assert int(variable[1:]) == int(fmt[1:-1])
+                assert int(variable[1:]) == typ
+
+    def test_excessively_long_string(self, temp_file):
+        str_lens = (1, 244, 500)
+        s = {}
+        for str_len in str_lens:
+            s["s" + str(str_len)] = Series(
+                ["a" * str_len, "b" * str_len, "c" * str_len]
+            )
+        original = DataFrame(s)
+        msg = (
+            r"Fixed width strings in Stata \.dta files are limited to 244 "
+            r"\(or fewer\)\ncharacters\.  Column 's500' does not satisfy "
+            r"this restriction\. Use the\n'version=117' parameter to write "
+            r"the newer \(Stata 13 and later\) format\."
+        )
+        with pytest.raises(ValueError, match=msg):
+            path = temp_file
+            original.to_stata(path)
+
+    def test_missing_value_generator(self, temp_file):
+        types = ("b", "h", "l")
+        df = DataFrame([[0.0]], columns=["float_"])
+        path = temp_file
+        df.to_stata(path)
+        with StataReader(path) as rdr:
+            valid_range = rdr.VALID_RANGE
+        expected_values = ["." + chr(97 + i) for i in range(26)]
+        expected_values.insert(0, ".")
+        for t in types:
+            offset = valid_range[t][1]
+            for i in range(27):
+                val = StataMissingValue(offset + 1 + i)
+                assert val.string == expected_values[i]
+
+        # Test extremes for floats
+        val = StataMissingValue(struct.unpack("<f", b"\x00\x00\x00\x7f")[0])
+        assert val.string == "."
+        val = StataMissingValue(struct.unpack("<f", b"\x00\xd0\x00\x7f")[0])
+        assert val.string == ".z"
+
+        # Test extremes for floats
+        val = StataMissingValue(
+            struct.unpack("<d", b"\x00\x00\x00\x00\x00\x00\xe0\x7f")[0]
+        )
+        assert val.string == "."
+        val = StataMissingValue(
+            struct.unpack("<d", b"\x00\x00\x00\x00\x00\x1a\xe0\x7f")[0]
+        )
+        assert val.string == ".z"
+
+    @pytest.mark.parametrize("version", [113, 115, 117])
+    def test_missing_value_conversion(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        for i in range(27):
+            row = [StataMissingValue(keys[i + (j * 27)]) for j in range(5)]
+            data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    @pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
+    def test_missing_value_conversion_compat(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        row = [StataMissingValue(keys[j * 27]) for j in range(5)]
+        data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    # The byte type was not supported prior to the 104 format
+    @pytest.mark.parametrize("version", [102, 103])
+    def test_missing_value_conversion_compat_nobyte(self, version, datapath):
+        columns = ["int8_", "int16_", "int32_", "float32_", "float64_"]
+        smv = StataMissingValue(101)
+        keys = sorted(smv.MISSING_VALUES.keys())
+        data = []
+        row = [StataMissingValue(keys[j * 27]) for j in [1, 1, 2, 3, 4]]
+        data.append(row)
+        expected = DataFrame(data, columns=columns)
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata8_{version}.dta"),
+            convert_missing=True,
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    def test_big_dates(self, datapath, temp_file):
+        yr = [1960, 2000, 9999, 100, 2262, 1677]
+        mo = [1, 1, 12, 1, 4, 9]
+        dd = [1, 1, 31, 1, 22, 23]
+        hr = [0, 0, 23, 0, 0, 0]
+        mm = [0, 0, 59, 0, 0, 0]
+        ss = [0, 0, 59, 0, 0, 0]
+        expected = []
+        for year, month, day, hour, minute, second in zip(yr, mo, dd, hr, mm, ss):
+            row = []
+            for j in range(7):
+                if j == 0:
+                    row.append(datetime(year, month, day, hour, minute, second))
+                elif j == 6:
+                    row.append(datetime(year, 1, 1))
+                else:
+                    row.append(datetime(year, month, day))
+            expected.append(row)
+        expected.append([pd.NaT] * 7)
+        columns = [
+            "date_tc",
+            "date_td",
+            "date_tw",
+            "date_tm",
+            "date_tq",
+            "date_th",
+            "date_ty",
+        ]
+
+        # Fixes for weekly, quarterly,half,year
+        expected[2][2] = datetime(9999, 12, 24)
+        expected[2][3] = datetime(9999, 12, 1)
+        expected[2][4] = datetime(9999, 10, 1)
+        expected[2][5] = datetime(9999, 7, 1)
+        expected[4][2] = datetime(2262, 4, 16)
+        expected[4][3] = expected[4][4] = datetime(2262, 4, 1)
+        expected[4][5] = expected[4][6] = datetime(2262, 1, 1)
+        expected[5][2] = expected[5][3] = expected[5][4] = datetime(1677, 10, 1)
+        expected[5][5] = expected[5][6] = datetime(1678, 1, 1)
+
+        expected = DataFrame(expected, columns=columns, dtype=object)
+        expected["date_tc"] = expected["date_tc"].astype("M8[ms]")
+        expected["date_td"] = expected["date_td"].astype("M8[s]")
+        expected["date_tm"] = expected["date_tm"].astype("M8[s]")
+        expected["date_tw"] = expected["date_tw"].astype("M8[s]")
+        expected["date_tq"] = expected["date_tq"].astype("M8[s]")
+        expected["date_th"] = expected["date_th"].astype("M8[s]")
+        expected["date_ty"] = expected["date_ty"].astype("M8[s]")
+
+        parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta"))
+        parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta"))
+        tm.assert_frame_equal(expected, parsed_115)
+        tm.assert_frame_equal(expected, parsed_117)
+
+        date_conversion = {c: c[-2:] for c in columns}
+        # {c : c[-2:] for c in columns}
+        path = temp_file
+        expected.index.name = "index"
+        msg = (
+            "Converting object-dtype columns of datetimes to datetime64 "
+            "when writing to stata is deprecated"
+        )
+        exp_object = expected.astype(object)
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            exp_object.to_stata(path, convert_dates=date_conversion)
+        written_and_read_again = self.read_dta(path)
+
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            expected.set_index(expected.index.astype(np.int32)),
+        )
+
+    def test_dtype_conversion(self, datapath):
+        expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
+        expected["byte_"] = expected["byte_"].astype(np.int8)
+        expected["int_"] = expected["int_"].astype(np.int16)
+        expected["long_"] = expected["long_"].astype(np.int32)
+        expected["float_"] = expected["float_"].astype(np.float32)
+        expected["double_"] = expected["double_"].astype(np.float64)
+        expected["date_td"] = expected["date_td"].astype("M8[s]")
+
+        no_conversion = read_stata(
+            datapath("io", "data", "stata", "stata6_117.dta"), convert_dates=True
+        )
+        tm.assert_frame_equal(expected, no_conversion)
+
+        conversion = read_stata(
+            datapath("io", "data", "stata", "stata6_117.dta"),
+            convert_dates=True,
+            preserve_dtypes=False,
+        )
+
+        # read_csv types are the same
+        expected2 = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
+        expected2["date_td"] = expected["date_td"]
+
+        tm.assert_frame_equal(expected2, conversion)
+
+    def test_drop_column(self, datapath):
+        expected = self.read_csv(datapath("io", "data", "stata", "stata6.csv"))
+        expected["byte_"] = expected["byte_"].astype(np.int8)
+        expected["int_"] = expected["int_"].astype(np.int16)
+        expected["long_"] = expected["long_"].astype(np.int32)
+        expected["float_"] = expected["float_"].astype(np.float32)
+        expected["double_"] = expected["double_"].astype(np.float64)
+        expected["date_td"] = expected["date_td"].apply(
+            datetime.strptime, args=("%Y-%m-%d",)
+        )
+
+        columns = ["byte_", "int_", "long_"]
+        expected = expected[columns]
+        dropped = read_stata(
+            datapath("io", "data", "stata", "stata6_117.dta"),
+            convert_dates=True,
+            columns=columns,
+        )
+
+        tm.assert_frame_equal(expected, dropped)
+
+        # See PR 10757
+        columns = ["int_", "long_", "byte_"]
+        expected = expected[columns]
+        reordered = read_stata(
+            datapath("io", "data", "stata", "stata6_117.dta"),
+            convert_dates=True,
+            columns=columns,
+        )
+        tm.assert_frame_equal(expected, reordered)
+
+        msg = "columns contains duplicate entries"
+        with pytest.raises(ValueError, match=msg):
+            read_stata(
+                datapath("io", "data", "stata", "stata6_117.dta"),
+                convert_dates=True,
+                columns=["byte_", "byte_"],
+            )
+
+        msg = "The following columns were not found in the Stata data set: not_found"
+        with pytest.raises(ValueError, match=msg):
+            read_stata(
+                datapath("io", "data", "stata", "stata6_117.dta"),
+                convert_dates=True,
+                columns=["byte_", "int_", "long_", "not_found"],
+            )
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    @pytest.mark.filterwarnings(
+        "ignore:\\nStata value:pandas.io.stata.ValueLabelTypeMismatch"
+    )
+    def test_categorical_writing(self, version, temp_file):
+        original = DataFrame.from_records(
+            [
+                ["one", "ten", "one", "one", "one", 1],
+                ["two", "nine", "two", "two", "two", 2],
+                ["three", "eight", "three", "three", "three", 3],
+                ["four", "seven", 4, "four", "four", 4],
+                ["five", "six", 5, np.nan, "five", 5],
+                ["six", "five", 6, np.nan, "six", 6],
+                ["seven", "four", 7, np.nan, "seven", 7],
+                ["eight", "three", 8, np.nan, "eight", 8],
+                ["nine", "two", 9, np.nan, "nine", 9],
+                ["ten", "one", "ten", np.nan, "ten", 10],
+            ],
+            columns=[
+                "fully_labeled",
+                "fully_labeled2",
+                "incompletely_labeled",
+                "labeled_with_missings",
+                "float_labelled",
+                "unlabeled",
+            ],
+        )
+        path = temp_file
+        original.astype("category").to_stata(path, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        res = written_and_read_again.set_index("index")
+
+        expected = original
+        expected.index = expected.index.set_names("index")
+
+        expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str)
+        expected["unlabeled"] = expected["unlabeled"].apply(str)
+        for col in expected:
+            orig = expected[col]
+
+            cat = orig.astype("category")._values
+            cat = cat.as_ordered()
+            if col == "unlabeled":
+                cat = cat.set_categories(orig, ordered=True)
+
+            cat.categories.rename(None, inplace=True)
+
+            expected[col] = cat
+
+        tm.assert_frame_equal(res, expected)
+
+    def test_categorical_warnings_and_errors(self, temp_file):
+        # Warning for non-string labels
+        original = DataFrame.from_records(
+            [["a"], ["b"], ["c"], ["d"], [1]], columns=["Too_long"]
+        ).astype("category")
+
+        msg = "data file created has not lost information due to duplicate labels"
+        with tm.assert_produces_warning(ValueLabelTypeMismatch, match=msg):
+            original.to_stata(temp_file)
+            # should get a warning for mixed content
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_categorical_with_stata_missing_values(self, version, temp_file):
+        values = [["a" + str(i)] for i in range(120)]
+        values.append([np.nan])
+        original = DataFrame.from_records(values, columns=["many_labels"])
+        original = pd.concat(
+            [original[col].astype("category") for col in original], axis=1
+        )
+        original.index.name = "index"
+        path = temp_file
+        original.to_stata(path, version=version)
+        written_and_read_again = self.read_dta(path)
+
+        res = written_and_read_again.set_index("index")
+
+        expected = original
+        for col in expected:
+            cat = expected[col]._values
+            new_cats = cat.remove_unused_categories().categories
+            cat = cat.set_categories(new_cats, ordered=True)
+            expected[col] = cat
+        tm.assert_frame_equal(res, expected)
+
+    @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
+    def test_categorical_order(self, file, datapath):
+        # Directly construct using expected codes
+        # Format is is_cat, col_name, labels (in order), underlying data
+        expected = [
+            (True, "ordered", ["a", "b", "c", "d", "e"], np.arange(5)),
+            (True, "reverse", ["a", "b", "c", "d", "e"], np.arange(5)[::-1]),
+            (True, "noorder", ["a", "b", "c", "d", "e"], np.array([2, 1, 4, 0, 3])),
+            (True, "floating", ["a", "b", "c", "d", "e"], np.arange(0, 5)),
+            (True, "float_missing", ["a", "d", "e"], np.array([0, 1, 2, -1, -1])),
+            (False, "nolabel", [1.0, 2.0, 3.0, 4.0, 5.0], np.arange(5)),
+            (True, "int32_mixed", ["d", 2, "e", "b", "a"], np.arange(5)),
+        ]
+        cols = []
+        for is_cat, col, labels, codes in expected:
+            if is_cat:
+                cols.append(
+                    (col, pd.Categorical.from_codes(codes, labels, ordered=True))
+                )
+            else:
+                cols.append((col, Series(labels, dtype=np.float32)))
+        expected = DataFrame.from_dict(dict(cols))
+
+        # Read with and with out categoricals, ensure order is identical
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = read_stata(file)
+        tm.assert_frame_equal(expected, parsed)
+
+        # Check identity of codes
+        for col in expected:
+            if isinstance(expected[col].dtype, CategoricalDtype):
+                tm.assert_series_equal(expected[col].cat.codes, parsed[col].cat.codes)
+                tm.assert_index_equal(
+                    expected[col].cat.categories, parsed[col].cat.categories
+                )
+
+    @pytest.mark.parametrize("file", ["stata11_115", "stata11_117"])
+    def test_categorical_sorting(self, file, datapath):
+        parsed = read_stata(datapath("io", "data", "stata", f"{file}.dta"))
+
+        # Sort based on codes, not strings
+        parsed = parsed.sort_values("srh", na_position="first")
+
+        # Don't sort index
+        parsed.index = pd.RangeIndex(len(parsed))
+        codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4]
+        categories = ["Poor", "Fair", "Good", "Very good", "Excellent"]
+        cat = pd.Categorical.from_codes(
+            codes=codes, categories=categories, ordered=True
+        )
+        expected = Series(cat, name="srh")
+        tm.assert_series_equal(expected, parsed["srh"])
+
+    @pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
+    def test_categorical_ordering(self, file, datapath):
+        file = datapath("io", "data", "stata", f"{file}.dta")
+        parsed = read_stata(file)
+
+        parsed_unordered = read_stata(file, order_categoricals=False)
+        for col in parsed:
+            if not isinstance(parsed[col].dtype, CategoricalDtype):
+                continue
+            assert parsed[col].cat.ordered
+            assert not parsed_unordered[col].cat.ordered
+
+    @pytest.mark.filterwarnings("ignore::UserWarning")
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata1_117",
+            "stata2_117",
+            "stata3_117",
+            "stata4_117",
+            "stata5_117",
+            "stata6_117",
+            "stata7_117",
+            "stata8_117",
+            "stata9_117",
+            "stata10_117",
+            "stata11_117",
+        ],
+    )
+    @pytest.mark.parametrize("chunksize", [1, 2])
+    @pytest.mark.parametrize("convert_categoricals", [False, True])
+    @pytest.mark.parametrize("convert_dates", [False, True])
+    def test_read_chunks_117(
+        self, file, chunksize, convert_categoricals, convert_dates, datapath
+    ):
+        fname = datapath("io", "data", "stata", f"{file}.dta")
+
+        parsed = read_stata(
+            fname,
+            convert_categoricals=convert_categoricals,
+            convert_dates=convert_dates,
+        )
+        with read_stata(
+            fname,
+            iterator=True,
+            convert_categoricals=convert_categoricals,
+            convert_dates=convert_dates,
+        ) as itr:
+            pos = 0
+            for j in range(5):
+                try:
+                    chunk = itr.read(chunksize)
+                except StopIteration:
+                    break
+                from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
+                from_frame = self._convert_categorical(from_frame)
+                tm.assert_frame_equal(
+                    from_frame,
+                    chunk,
+                    check_dtype=False,
+                )
+                pos += chunksize
+
+    @staticmethod
+    def _convert_categorical(from_frame: DataFrame) -> DataFrame:
+        """
+        Emulate the categorical casting behavior we expect from roundtripping.
+        """
+        for col in from_frame:
+            ser = from_frame[col]
+            if isinstance(ser.dtype, CategoricalDtype):
+                cat = ser._values.remove_unused_categories()
+                if cat.categories.dtype == object:
+                    categories = pd.Index._with_infer(
+                        cat.categories._values, copy=False
+                    )
+                    cat = cat.set_categories(categories)
+                elif cat.categories.dtype == "string" and len(cat.categories) == 0:
+                    # if the read categories are empty, it comes back as object dtype
+                    categories = cat.categories.astype(object)
+                    cat = cat.set_categories(categories)
+                from_frame[col] = cat
+        return from_frame
+
+    def test_iterator(self, datapath):
+        fname = datapath("io", "data", "stata", "stata12_117.dta")
+
+        parsed = read_stata(fname)
+        expected = parsed.iloc[0:5, :]
+
+        with read_stata(fname, iterator=True) as itr:
+            chunk = itr.read(5)
+            tm.assert_frame_equal(expected, chunk)
+
+        with read_stata(fname, chunksize=5) as itr:
+            chunk = next(itr)
+            tm.assert_frame_equal(expected, chunk)
+
+        with read_stata(fname, iterator=True) as itr:
+            chunk = itr.get_chunk(5)
+            tm.assert_frame_equal(expected, chunk)
+
+        with read_stata(fname, chunksize=5) as itr:
+            chunk = itr.get_chunk()
+            tm.assert_frame_equal(expected, chunk)
+
+        # GH12153
+        with read_stata(fname, chunksize=4) as itr:
+            from_chunks = pd.concat(itr)
+        tm.assert_frame_equal(parsed, from_chunks)
+
+    @pytest.mark.filterwarnings("ignore::UserWarning")
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata2_115",
+            "stata3_115",
+            "stata4_115",
+            "stata5_115",
+            "stata6_115",
+            "stata7_115",
+            "stata8_115",
+            "stata9_115",
+            "stata10_115",
+            "stata11_115",
+        ],
+    )
+    @pytest.mark.parametrize("chunksize", [1, 2])
+    @pytest.mark.parametrize("convert_categoricals", [False, True])
+    @pytest.mark.parametrize("convert_dates", [False, True])
+    def test_read_chunks_115(
+        self, file, chunksize, convert_categoricals, convert_dates, datapath
+    ):
+        fname = datapath("io", "data", "stata", f"{file}.dta")
+
+        # Read the whole file
+        parsed = read_stata(
+            fname,
+            convert_categoricals=convert_categoricals,
+            convert_dates=convert_dates,
+        )
+
+        # Compare to what we get when reading by chunk
+        with read_stata(
+            fname,
+            iterator=True,
+            convert_dates=convert_dates,
+            convert_categoricals=convert_categoricals,
+        ) as itr:
+            pos = 0
+            for j in range(5):
+                try:
+                    chunk = itr.read(chunksize)
+                except StopIteration:
+                    break
+                from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
+                from_frame = self._convert_categorical(from_frame)
+                tm.assert_frame_equal(
+                    from_frame,
+                    chunk,
+                    check_dtype=False,
+                )
+                pos += chunksize
+
+    def test_read_chunks_columns(self, datapath):
+        fname = datapath("io", "data", "stata", "stata3_117.dta")
+        columns = ["quarter", "cpi", "m1"]
+        chunksize = 2
+
+        parsed = read_stata(fname, columns=columns)
+        with read_stata(fname, iterator=True) as itr:
+            pos = 0
+            for j in range(5):
+                chunk = itr.read(chunksize, columns=columns)
+                if chunk is None:
+                    break
+                from_frame = parsed.iloc[pos : pos + chunksize, :]
+                tm.assert_frame_equal(from_frame, chunk, check_dtype=False)
+                pos += chunksize
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_write_variable_labels(self, version, mixed_frame, temp_file):
+        # GH 13631, add support for writing variable labels
+        mixed_frame.index.name = "index"
+        variable_labels = {"a": "City Rank", "b": "City Exponent", "c": "City"}
+        path = temp_file
+        mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
+        with StataReader(path) as sr:
+            read_labels = sr.variable_labels()
+        expected_labels = {
+            "index": "",
+            "a": "City Rank",
+            "b": "City Exponent",
+            "c": "City",
+        }
+        assert read_labels == expected_labels
+
+        variable_labels["index"] = "The Index"
+        path = temp_file
+        mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
+        with StataReader(path) as sr:
+            read_labels = sr.variable_labels()
+        assert read_labels == variable_labels
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_invalid_variable_labels(self, version, mixed_frame, temp_file):
+        mixed_frame.index.name = "index"
+        variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"}
+        path = temp_file
+        msg = "Variable labels must be 80 characters or fewer"
+        with pytest.raises(ValueError, match=msg):
+            mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
+
+    @pytest.mark.parametrize("version", [114, 117])
+    def test_invalid_variable_label_encoding(self, version, mixed_frame, temp_file):
+        mixed_frame.index.name = "index"
+        variable_labels = {"a": "very long" * 10, "b": "City Exponent", "c": "City"}
+        variable_labels["a"] = "invalid character Œ"
+        path = temp_file
+        with pytest.raises(
+            ValueError, match="Variable labels must contain only characters"
+        ):
+            mixed_frame.to_stata(path, variable_labels=variable_labels, version=version)
+
+    def test_write_variable_label_errors(self, mixed_frame, temp_file):
+        values = ["\u03a1", "\u0391", "\u039d", "\u0394", "\u0391", "\u03a3"]
+
+        variable_labels_utf8 = {
+            "a": "City Rank",
+            "b": "City Exponent",
+            "c": "".join(values),
+        }
+
+        msg = (
+            "Variable labels must contain only characters that can be "
+            "encoded in Latin-1"
+        )
+        with pytest.raises(ValueError, match=msg):
+            path = temp_file
+            mixed_frame.to_stata(path, variable_labels=variable_labels_utf8)
+
+        variable_labels_long = {
+            "a": "City Rank",
+            "b": "City Exponent",
+            "c": "A very, very, very long variable label "
+            "that is too long for Stata which means "
+            "that it has more than 80 characters",
+        }
+
+        msg = "Variable labels must be 80 characters or fewer"
+        with pytest.raises(ValueError, match=msg):
+            path = temp_file
+            mixed_frame.to_stata(path, variable_labels=variable_labels_long)
+
+    def test_default_date_conversion(self, temp_file):
+        # GH 12259
+        dates = [
+            dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+            dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+            dt.datetime(1776, 7, 4, 7, 4, 7, 4000),
+        ]
+        original = DataFrame(
+            {
+                "nums": [1.0, 2.0, 3.0],
+                "strs": ["apple", "banana", "cherry"],
+                "dates": dates,
+            }
+        )
+
+        expected = original[:]
+        # "tc" for convert_dates below stores with "ms" resolution
+        expected["dates"] = expected["dates"].astype("M8[ms]")
+
+        path = temp_file
+        original.to_stata(path, write_index=False)
+        reread = read_stata(path, convert_dates=True)
+        tm.assert_frame_equal(expected, reread)
+
+        original.to_stata(path, write_index=False, convert_dates={"dates": "tc"})
+        direct = read_stata(path, convert_dates=True)
+        tm.assert_frame_equal(reread, direct)
+
+        dates_idx = original.columns.tolist().index("dates")
+        original.to_stata(path, write_index=False, convert_dates={dates_idx: "tc"})
+        direct = read_stata(path, convert_dates=True)
+        tm.assert_frame_equal(reread, direct)
+
+    def test_unsupported_type(self, temp_file):
+        original = DataFrame({"a": [1 + 2j, 2 + 4j]})
+
+        msg = "Data type complex128 not supported"
+        with pytest.raises(NotImplementedError, match=msg):
+            path = temp_file
+            original.to_stata(path)
+
+    def test_unsupported_datetype(self, temp_file):
+        dates = [
+            dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+            dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+            dt.datetime(1776, 7, 4, 7, 4, 7, 4000),
+        ]
+        original = DataFrame(
+            {
+                "nums": [1.0, 2.0, 3.0],
+                "strs": ["apple", "banana", "cherry"],
+                "dates": dates,
+            }
+        )
+
+        msg = "Format %tC not implemented"
+        with pytest.raises(NotImplementedError, match=msg):
+            path = temp_file
+            original.to_stata(path, convert_dates={"dates": "tC"})
+
+        dates = pd.date_range("1-1-1990", periods=3, tz="Asia/Hong_Kong")
+        original = DataFrame(
+            {
+                "nums": [1.0, 2.0, 3.0],
+                "strs": ["apple", "banana", "cherry"],
+                "dates": dates,
+            }
+        )
+        with pytest.raises(NotImplementedError, match="Data type datetime64"):
+            path = temp_file
+            original.to_stata(path)
+
+    def test_repeated_column_labels(self, datapath):
+        # GH 13923, 25772
+        msg = """
+Value labels for column ethnicsn are not unique. These cannot be converted to
+pandas categoricals.
+
+Either read the file with `convert_categoricals` set to False or use the
+low level interface in `StataReader` to separately read the values and the
+value_labels.
+
+The repeated labels are:\n-+\nwolof
+"""
+        with pytest.raises(ValueError, match=msg):
+            read_stata(
+                datapath("io", "data", "stata", "stata15.dta"),
+                convert_categoricals=True,
+            )
+
+    def test_stata_111(self, datapath):
+        # 111 is an old version but still used by current versions of
+        # SAS when exporting to Stata format. We do not know of any
+        # on-line documentation for this version.
+        df = read_stata(datapath("io", "data", "stata", "stata7_111.dta"))
+        original = DataFrame(
+            {
+                "y": [1, 1, 1, 1, 1, 0, 0, np.nan, 0, 0],
+                "x": [1, 2, 1, 3, np.nan, 4, 3, 5, 1, 6],
+                "w": [2, np.nan, 5, 2, 4, 4, 3, 1, 2, 3],
+                "z": ["a", "b", "c", "d", "e", "", "g", "h", "i", "j"],
+            }
+        )
+        original = original[["y", "x", "w", "z"]]
+        tm.assert_frame_equal(original, df)
+
+    def test_out_of_range_double(self, temp_file):
+        # GH 14618
+        df = DataFrame(
+            {
+                "ColumnOk": [0.0, np.finfo(np.double).eps, 4.49423283715579e307],
+                "ColumnTooBig": [0.0, np.finfo(np.double).eps, np.finfo(np.double).max],
+            }
+        )
+        msg = (
+            r"Column ColumnTooBig has a maximum value \(.+\) outside the range "
+            r"supported by Stata \(.+\)"
+        )
+        with pytest.raises(ValueError, match=msg):
+            path = temp_file
+            df.to_stata(path)
+
+    def test_out_of_range_float(self, temp_file):
+        original = DataFrame(
+            {
+                "ColumnOk": [
+                    0.0,
+                    np.finfo(np.float32).eps,
+                    np.finfo(np.float32).max / 10.0,
+                ],
+                "ColumnTooBig": [
+                    0.0,
+                    np.finfo(np.float32).eps,
+                    np.finfo(np.float32).max,
+                ],
+            }
+        )
+        original.index.name = "index"
+        for col in original:
+            original[col] = original[col].astype(np.float32)
+
+        path = temp_file
+        original.to_stata(path)
+        reread = read_stata(path)
+
+        original["ColumnTooBig"] = original["ColumnTooBig"].astype(np.float64)
+        expected = original
+        tm.assert_frame_equal(reread.set_index("index"), expected)
+
+    @pytest.mark.parametrize("infval", [np.inf, -np.inf])
+    def test_inf(self, infval, temp_file):
+        # GH 45350
+        df = DataFrame({"WithoutInf": [0.0, 1.0], "WithInf": [2.0, infval]})
+        msg = (
+            "Column WithInf contains infinity or -infinity"
+            "which is outside the range supported by Stata."
+        )
+        with pytest.raises(ValueError, match=msg):
+            path = temp_file
+            df.to_stata(path)
+
+    def test_path_pathlib(self, temp_file):
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.index.name = "index"
+        reader = lambda x: read_stata(x).set_index("index")
+        result = tm.round_trip_pathlib(df.to_stata, reader, temp_file)
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.parametrize("write_index", [True, False])
+    def test_value_labels_iterator(self, write_index, temp_file):
+        # GH 16923
+        d = {"A": ["B", "E", "C", "A", "E"]}
+        df = DataFrame(data=d)
+        df["A"] = df["A"].astype("category")
+        path = temp_file
+        df.to_stata(path, write_index=write_index)
+
+        with read_stata(path, iterator=True) as dta_iter:
+            value_labels = dta_iter.value_labels()
+        assert value_labels == {"A": {0: "A", 1: "B", 2: "C", 3: "E"}}
+
+    def test_set_index(self, temp_file):
+        # GH 17328
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.index.name = "index"
+        path = temp_file
+        df.to_stata(path)
+        reread = read_stata(path, index_col="index")
+        tm.assert_frame_equal(df, reread)
+
+    @pytest.mark.parametrize(
+        "column", ["ms", "day", "week", "month", "qtr", "half", "yr"]
+    )
+    def test_date_parsing_ignores_format_details(self, column, datapath):
+        # GH 17797
+        #
+        # Test that display formats are ignored when determining if a numeric
+        # column is a date value.
+        #
+        # All date types are stored as numbers and format associated with the
+        # column denotes both the type of the date and the display format.
+        #
+        # STATA supports 9 date types which each have distinct units. We test 7
+        # of the 9 types, ignoring %tC and %tb. %tC is a variant of %tc that
+        # accounts for leap seconds and %tb relies on STATAs business calendar.
+        df = read_stata(datapath("io", "data", "stata", "stata13_dates.dta"))
+        unformatted = df.loc[0, column]
+        formatted = df.loc[0, column + "_fmt"]
+        assert unformatted == formatted
+
+    @pytest.mark.parametrize("byteorder", ["little", "big"])
+    def test_writer_117(self, byteorder, temp_file, using_infer_string):
+        original = DataFrame(
+            data=[
+                [
+                    "string",
+                    "object",
+                    1,
+                    1,
+                    1,
+                    1.1,
+                    1.1,
+                    np.datetime64("2003-12-25"),
+                    "a",
+                    "a" * 2045,
+                    "a" * 5000,
+                    "a",
+                ],
+                [
+                    "string-1",
+                    "object-1",
+                    1,
+                    1,
+                    1,
+                    1.1,
+                    1.1,
+                    np.datetime64("2003-12-26"),
+                    "b",
+                    "b" * 2045,
+                    "",
+                    "",
+                ],
+            ],
+            columns=[
+                "string",
+                "object",
+                "int8",
+                "int16",
+                "int32",
+                "float32",
+                "float64",
+                "datetime",
+                "s1",
+                "s2045",
+                "srtl",
+                "forced_strl",
+            ],
+        )
+        original["object"] = Series(original["object"], dtype=object)
+        original["int8"] = Series(original["int8"], dtype=np.int8)
+        original["int16"] = Series(original["int16"], dtype=np.int16)
+        original["int32"] = original["int32"].astype(np.int32)
+        original["float32"] = Series(original["float32"], dtype=np.float32)
+        original.index.name = "index"
+        copy = original.copy()
+        path = temp_file
+        original.to_stata(
+            path,
+            convert_dates={"datetime": "tc"},
+            byteorder=byteorder,
+            convert_strl=["forced_strl"],
+            version=117,
+        )
+        written_and_read_again = self.read_dta(path)
+
+        expected = original[:]
+        # "tc" for convert_dates means we store with "ms" resolution
+        expected["datetime"] = expected["datetime"].astype("M8[ms]")
+        if using_infer_string:
+            # object dtype (with only strings/None) comes back as string dtype
+            expected["object"] = expected["object"].astype("str")
+
+        tm.assert_frame_equal(
+            written_and_read_again.set_index("index"),
+            expected,
+        )
+        tm.assert_frame_equal(original, copy)
+
+    def test_convert_strl_name_swap(self, temp_file):
+        original = DataFrame(
+            [["a" * 3000, "A", "apple"], ["b" * 1000, "B", "banana"]],
+            columns=["long1" * 10, "long", 1],
+        )
+        original.index.name = "index"
+
+        msg = "Not all pandas column names were valid Stata variable names"
+        with tm.assert_produces_warning(InvalidColumnName, match=msg):
+            path = temp_file
+            original.to_stata(path, convert_strl=["long", 1], version=117)
+            reread = self.read_dta(path)
+            reread = reread.set_index("index")
+            reread.columns = original.columns
+            tm.assert_frame_equal(reread, original, check_index_type=False)
+
+    def test_invalid_date_conversion(self, temp_file):
+        # GH 12259
+        dates = [
+            dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
+            dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
+            dt.datetime(1776, 7, 4, 7, 4, 7, 4000),
+        ]
+        original = DataFrame(
+            {
+                "nums": [1.0, 2.0, 3.0],
+                "strs": ["apple", "banana", "cherry"],
+                "dates": dates,
+            }
+        )
+
+        path = temp_file
+        msg = "convert_dates key must be a column or an integer"
+        with pytest.raises(ValueError, match=msg):
+            original.to_stata(path, convert_dates={"wrong_name": "tc"})
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_nonfile_writing(self, version, temp_file):
+        # GH 21041
+        bio = io.BytesIO()
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.index.name = "index"
+        path = temp_file
+        df.to_stata(bio, version=version)
+        bio.seek(0)
+        with open(path, "wb") as dta:
+            dta.write(bio.read())
+        reread = read_stata(path, index_col="index")
+        tm.assert_frame_equal(df, reread)
+
+    def test_gzip_writing(self, temp_file):
+        # writing version 117 requires seek and cannot be used with gzip
+        df = DataFrame(
+            1.1 * np.arange(120).reshape((30, 4)),
+            columns=pd.Index(list("ABCD")),
+            index=pd.Index([f"i-{i}" for i in range(30)]),
+        )
+        df.index.name = "index"
+        path = temp_file
+        with gzip.GzipFile(path, "wb") as gz:
+            df.to_stata(gz, version=114)
+        with gzip.GzipFile(path, "rb") as gz:
+            reread = read_stata(gz, index_col="index")
+        tm.assert_frame_equal(df, reread)
+
+    # 117 is not included in this list as it uses ASCII strings
+    @pytest.mark.parametrize(
+        "file",
+        [
+            "stata16_118",
+            "stata16_be_118",
+            "stata16_119",
+            "stata16_be_119",
+        ],
+    )
+    def test_unicode_dta_118_119(self, file, datapath):
+        unicode_df = self.read_dta(datapath("io", "data", "stata", f"{file}.dta"))
+
+        columns = ["utf8", "latin1", "ascii", "utf8_strl", "ascii_strl"]
+        values = [
+            ["ραηδας", "PÄNDÄS", "p", "ραηδας", "p"],
+            ["ƤĀńĐąŜ", "Ö", "a", "ƤĀńĐąŜ", "a"],
+            ["ᴘᴀᴎᴅᴀS", "Ü", "n", "ᴘᴀᴎᴅᴀS", "n"],
+            ["      ", "      ", "d", "      ", "d"],
+            [" ", "", "a", " ", "a"],
+            ["", "", "s", "", "s"],
+            ["", "", " ", "", " "],
+        ]
+        expected = DataFrame(values, columns=columns)
+
+        tm.assert_frame_equal(unicode_df, expected)
+
+    def test_mixed_string_strl(self, temp_file, using_infer_string):
+        # GH 23633
+        output = [{"mixed": "string" * 500, "number": 0}, {"mixed": None, "number": 1}]
+        output = DataFrame(output)
+        output.number = output.number.astype("int32")
+
+        path = temp_file
+        output.to_stata(path, write_index=False, version=117)
+        reread = read_stata(path)
+        expected = output.fillna("")
+        tm.assert_frame_equal(reread, expected)
+
+        # Check strl supports all None (null)
+        output["mixed"] = None
+        output.to_stata(path, write_index=False, convert_strl=["mixed"], version=117)
+        reread = read_stata(path)
+        expected = output.fillna("")
+        if using_infer_string:
+            expected["mixed"] = expected["mixed"].astype("str")
+        tm.assert_frame_equal(reread, expected)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_all_none_exception(self, version, temp_file):
+        output = [{"none": "none", "number": 0}, {"none": None, "number": 1}]
+        output = DataFrame(output)
+        output["none"] = None
+        with pytest.raises(ValueError, match="Column `none` cannot be exported"):
+            output.to_stata(temp_file, version=version)
+
+    @pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+    def test_invalid_file_not_written(self, version, temp_file):
+        content = "Here is one __�__ Another one __·__ Another one __½__"
+        df = DataFrame([content], columns=["invalid"])
+        msg1 = (
+            r"'latin-1' codec can't encode character '\\ufffd' "
+            r"in position 14: ordinal not in range\(256\)"
+        )
+        msg2 = (
+            "'ascii' codec can't decode byte 0xef in position 14: "
+            r"ordinal not in range\(128\)"
+        )
+        with pytest.raises(UnicodeEncodeError, match=f"{msg1}|{msg2}"):
+            df.to_stata(temp_file)
+
+    def test_strl_latin1(self, temp_file):
+        # GH 23573, correct GSO data to reflect correct size
+        output = DataFrame(
+            [["pandas"] * 2, ["þâÑÐÅ§"] * 2], columns=["var_str", "var_strl"]
+        )
+
+        output.to_stata(temp_file, version=117, convert_strl=["var_strl"])
+        with open(temp_file, "rb") as reread:
+            content = reread.read()
+            expected = "þâÑÐÅ§"
+            assert expected.encode("latin-1") in content
+            assert expected.encode("utf-8") in content
+            gsos = content.split(b"strls")[1][1:-2]
+            for gso in gsos.split(b"GSO")[1:]:
+                val = gso.split(b"\x00")[-2]
+                size = gso[gso.find(b"\x82") + 1]
+                assert len(val) == size - 1
+
+    def test_encoding_latin1_118(self, datapath):
+        # GH 25960
+        msg = """
+One or more strings in the dta file could not be decoded using utf-8, and
+so the fallback encoding of latin-1 is being used.  This can happen when a file
+has been incorrectly encoded by Stata or some other software. You should verify
+the string values returned are correct."""
+        # Move path outside of read_stata, or else assert_produces_warning
+        # will block pytests skip mechanism from triggering (failing the test)
+        # if the path is not present
+        path = datapath("io", "data", "stata", "stata1_encoding_118.dta")
+        with tm.assert_produces_warning(UnicodeWarning, filter_level="once") as w:
+            encoded = read_stata(path)
+            # with filter_level="always", produces 151 warnings which can be slow
+            assert len(w) == 1
+            assert w[0].message.args[0] == msg
+
+        expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
+        tm.assert_frame_equal(encoded, expected)
+
+    @pytest.mark.slow
+    def test_stata_119(self, datapath):
+        # Gzipped since contains 32,999 variables and uncompressed is 20MiB
+        # Just validate that the reader reports correct number of variables
+        # to avoid high peak memory
+        with gzip.open(
+            datapath("io", "data", "stata", "stata1_119.dta.gz"), "rb"
+        ) as gz:
+            with StataReader(gz) as reader:
+                reader._ensure_open()
+                assert reader._nvar == 32999
+
+    @pytest.mark.parametrize("version", [118, 119, None])
+    @pytest.mark.parametrize("byteorder", ["little", "big"])
+    def test_utf8_writer(self, version, byteorder, temp_file):
+        cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
+        data = DataFrame(
+            [
+                [1.0, 1, "ᴬ", "ᴀ relatively long ŝtring"],
+                [2.0, 2, "ᴮ", ""],
+                [3.0, 3, "ᴰ", None],
+            ],
+            columns=["Å", "β", "ĉ", "strls"],
+        )
+        data["ᴐᴬᵀ"] = cat
+        variable_labels = {
+            "Å": "apple",
+            "β": "ᵈᵉᵊ",
+            "ĉ": "ᴎტჄႲႳႴႶႺ",
+            "strls": "Long Strings",
+            "ᴐᴬᵀ": "",
+        }
+        data_label = "ᴅaᵀa-label"
+        value_labels = {"β": {1: "label", 2: "æøå", 3: "ŋot valid latin-1"}}
+        data["β"] = data["β"].astype(np.int32)
+        writer = StataWriterUTF8(
+            temp_file,
+            data,
+            data_label=data_label,
+            convert_strl=["strls"],
+            variable_labels=variable_labels,
+            write_index=False,
+            byteorder=byteorder,
+            version=version,
+            value_labels=value_labels,
+        )
+        writer.write_file()
+        reread_encoded = read_stata(temp_file)
+        # Missing is intentionally converted to empty strl
+        data["strls"] = data["strls"].fillna("")
+        # Variable with value labels is reread as categorical
+        data["β"] = (
+            data["β"].replace(value_labels["β"]).astype("category").cat.as_ordered()
+        )
+        tm.assert_frame_equal(data, reread_encoded)
+        with StataReader(temp_file) as reader:
+            assert reader.data_label == data_label
+            assert reader.variable_labels() == variable_labels
+
+        data.to_stata(temp_file, version=version, write_index=False)
+        reread_to_stata = read_stata(temp_file)
+        tm.assert_frame_equal(data, reread_to_stata)
+
+    def test_writer_118_exceptions(self, temp_file):
+        df = DataFrame(np.zeros((1, 33000), dtype=np.int8))
+        with pytest.raises(ValueError, match="version must be either 118 or 119."):
+            StataWriterUTF8(temp_file, df, version=117)
+        with pytest.raises(ValueError, match="You must use version 119"):
+            StataWriterUTF8(temp_file, df, version=118)
+
+    @pytest.mark.parametrize(
+        "dtype_backend",
+        ["numpy_nullable", pytest.param("pyarrow", marks=td.skip_if_no("pyarrow"))],
+    )
+    def test_read_write_ea_dtypes(self, dtype_backend, temp_file, tmp_path):
+        dtype = "Int64" if dtype_backend == "numpy_nullable" else "int64[pyarrow]"
+        df = DataFrame(
+            {
+                "a": pd.array([1, 2, None], dtype=dtype),
+                "b": ["a", "b", "c"],
+                "c": [True, False, None],
+                "d": [1.5, 2.5, 3.5],
+                "e": pd.date_range("2020-12-31", periods=3, freq="D"),
+            },
+            index=pd.Index([0, 1, 2], name="index"),
+        )
+        df = df.convert_dtypes(dtype_backend=dtype_backend)
+        stata_path = tmp_path / "test_stata.dta"
+        df.to_stata(stata_path, version=118)
+
+        df.to_stata(temp_file)
+        written_and_read_again = self.read_dta(temp_file)
+
+        expected = DataFrame(
+            {
+                "a": [1, 2, np.nan],
+                "b": ["a", "b", "c"],
+                "c": [1.0, 0, np.nan],
+                "d": [1.5, 2.5, 3.5],
+                # stata stores with ms unit, so unit does not round-trip exactly
+                "e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"),
+            },
+            index=pd.RangeIndex(range(3), name="index"),
+        )
+
+        tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
+
+    @pytest.mark.parametrize("version", [113, 114, 115, 117, 118, 119])
+    def test_read_data_int_validranges(self, version, datapath):
+        expected = DataFrame(
+            {
+                "byte": np.array([-127, 100], dtype=np.int8),
+                "int": np.array([-32767, 32740], dtype=np.int16),
+                "long": np.array([-2147483647, 2147483620], dtype=np.int32),
+            }
+        )
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    @pytest.mark.parametrize("version", [104, 105, 108, 110, 111])
+    def test_read_data_int_validranges_compat(self, version, datapath):
+        expected = DataFrame(
+            {
+                "byte": np.array([-128, 126], dtype=np.int8),
+                "int": np.array([-32768, 32766], dtype=np.int16),
+                "long": np.array([-2147483648, 2147483646], dtype=np.int32),
+            }
+        )
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+    # The byte type was not supported prior to the 104 format
+    @pytest.mark.parametrize("version", [102, 103])
+    def test_read_data_int_validranges_compat_nobyte(self, version, datapath):
+        expected = DataFrame(
+            {
+                "byte": np.array([-128, 126], dtype=np.int16),
+                "int": np.array([-32768, 32766], dtype=np.int16),
+                "long": np.array([-2147483648, 2147483646], dtype=np.int32),
+            }
+        )
+
+        parsed = read_stata(
+            datapath("io", "data", "stata", f"stata_int_validranges_{version}.dta")
+        )
+        tm.assert_frame_equal(parsed, expected)
+
+
+@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114])
+def test_backward_compat(version, datapath):
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref)
+    old_dta = read_stata(old)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("version", [103, 104])
+def test_backward_compat_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    data_base = datapath("io", "data", "stata")
+    ref = os.path.join(data_base, "stata-compat-118.dta")
+    old = os.path.join(data_base, f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("version", [102])
+def test_backward_compat_nostring(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    ref = datapath("io", "data", "stata", "stata-compat-118.dta")
+    old = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    # The Stata data format prior to 103 did not support string data
+    expected = expected.drop(columns=["s10"])
+    old_dta = read_stata(old, convert_dates=False)
+    tm.assert_frame_equal(old_dta, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("version", [105, 108, 110, 111, 113, 114, 118])
+def test_bigendian(version, datapath):
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref)
+    big_dta = read_stata(big)
+    tm.assert_frame_equal(big_dta, expected)
+
+
+# Note: 102 format does not support big-endian byte order
+@pytest.mark.parametrize("version", [103, 104])
+def test_bigendian_nodateconversion(version, datapath):
+    # The Stata data format prior to 105 did not support a date format
+    # so read the raw values for comparison
+    ref = datapath("io", "data", "stata", f"stata-compat-{version}.dta")
+    big = datapath("io", "data", "stata", f"stata-compat-be-{version}.dta")
+    expected = read_stata(ref, convert_dates=False)
+    big_dta = read_stata(big, convert_dates=False)
+    tm.assert_frame_equal(big_dta, expected)
+
+
+def test_direct_read(datapath, monkeypatch):
+    file_path = datapath("io", "data", "stata", "stata-compat-118.dta")
+
+    # Test that opening a file path doesn't buffer the file.
+    with StataReader(file_path) as reader:
+        # Must not have been buffered to memory
+        assert not reader.read().empty
+        assert not isinstance(reader._path_or_buf, io.BytesIO)
+
+    # Test that we use a given fp exactly, if possible.
+    with open(file_path, "rb") as fp:
+        with StataReader(fp) as reader:
+            assert not reader.read().empty
+            assert reader._path_or_buf is fp
+
+    # Test that we use a given BytesIO exactly, if possible.
+    with open(file_path, "rb") as fp:
+        with io.BytesIO(fp.read()) as bio:
+            with StataReader(bio) as reader:
+                assert not reader.read().empty
+                assert reader._path_or_buf is bio
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+@pytest.mark.parametrize("use_dict", [True, False])
+@pytest.mark.parametrize("infer", [True, False])
+def test_compression(
+    compression, version, use_dict, infer, compression_to_extension, tmp_path
+):
+    file_name = "dta_inferred_compression.dta"
+    if compression:
+        if use_dict:
+            file_ext = compression
+        else:
+            file_ext = compression_to_extension[compression]
+        file_name += f".{file_ext}"
+    compression_arg = compression
+    if infer:
+        compression_arg = "infer"
+    if use_dict:
+        compression_arg = {"method": compression}
+
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
+    )
+    df.index.name = "index"
+    path = tmp_path / file_name
+    path.touch()
+    df.to_stata(path, version=version, compression=compression_arg)
+    if compression == "gzip":
+        with gzip.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression == "zip":
+        with zipfile.ZipFile(path, "r") as comp:
+            fp = io.BytesIO(comp.read(comp.filelist[0]))
+    elif compression == "tar":
+        with tarfile.open(path) as tar:
+            fp = io.BytesIO(tar.extractfile(tar.getnames()[0]).read())
+    elif compression == "bz2":
+        with bz2.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression == "zstd":
+        zstd = pytest.importorskip("zstandard")
+        with zstd.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression == "xz":
+        lzma = pytest.importorskip("lzma")
+        with lzma.open(path, "rb") as comp:
+            fp = io.BytesIO(comp.read())
+    elif compression is None:
+        fp = path
+    reread = read_stata(fp, index_col="index")
+
+    expected = df
+    tm.assert_frame_equal(reread, expected)
+
+
+@pytest.mark.parametrize("method", ["zip", "infer"])
+@pytest.mark.parametrize("file_ext", [None, "dta", "zip"])
+def test_compression_dict(method, file_ext, tmp_path):
+    file_name = f"test.{file_ext}"
+    archive_name = "test.dta"
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 2)), columns=list("AB")
+    )
+    df.index.name = "index"
+    compression = {"method": method, "archive_name": archive_name}
+    path = tmp_path / file_name
+    path.touch()
+    df.to_stata(path, compression=compression)
+    if method == "zip" or file_ext == "zip":
+        with zipfile.ZipFile(path, "r") as zp:
+            assert len(zp.filelist) == 1
+            assert zp.filelist[0].filename == archive_name
+            fp = io.BytesIO(zp.read(zp.filelist[0]))
+    else:
+        fp = path
+    reread = read_stata(fp, index_col="index")
+
+    expected = df
+    tm.assert_frame_equal(reread, expected)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_chunked_categorical(version, temp_file):
+    df = DataFrame({"cats": Series(["a", "b", "a", "b", "c"], dtype="category")})
+    df.index.name = "index"
+
+    expected = df.copy()
+
+    df.to_stata(temp_file, version=version)
+    with StataReader(temp_file, chunksize=2, order_categoricals=False) as reader:
+        for i, block in enumerate(reader):
+            block = block.set_index("index")
+            assert "cats" in block
+            tm.assert_series_equal(
+                block.cats,
+                expected.cats.iloc[2 * i : 2 * (i + 1)],
+                check_index_type=len(block) > 1,
+            )
+
+
+def test_chunked_categorical_partial(datapath):
+    dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta")
+    values = ["a", "b", "a", "b", 3.0]
+    msg = "series with value labels are not fully labeled"
+    with StataReader(dta_file, chunksize=2) as reader:
+        with tm.assert_produces_warning(CategoricalConversionWarning, match=msg):
+            for i, block in enumerate(reader):
+                assert list(block.cats) == values[2 * i : 2 * (i + 1)]
+                if i < 2:
+                    idx = pd.Index(["a", "b"])
+                else:
+                    idx = pd.Index([3.0], dtype="float64")
+                tm.assert_index_equal(block.cats.cat.categories, idx)
+    with tm.assert_produces_warning(CategoricalConversionWarning, match=msg):
+        with StataReader(dta_file, chunksize=5) as reader:
+            large_chunk = reader.__next__()
+    direct = read_stata(dta_file)
+    tm.assert_frame_equal(direct, large_chunk)
+
+
+@pytest.mark.parametrize("chunksize", (-1, 0, "apple"))
+def test_iterator_errors(datapath, chunksize):
+    dta_file = datapath("io", "data", "stata", "stata-dta-partially-labeled.dta")
+    with pytest.raises(ValueError, match="chunksize must be a positive"):
+        with StataReader(dta_file, chunksize=chunksize):
+            pass
+
+
+def test_iterator_value_labels(temp_file):
+    # GH 31544
+    values = ["c_label", "b_label"] + ["a_label"] * 500
+    df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)})
+    df.to_stata(temp_file, write_index=False)
+    expected = pd.Index(["a_label", "b_label", "c_label"])
+    with read_stata(temp_file, chunksize=100) as reader:
+        for j, chunk in enumerate(reader):
+            for i in range(2):
+                tm.assert_index_equal(chunk.dtypes.iloc[i].categories, expected)
+            tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])
+
+
+def test_precision_loss(temp_file):
+    df = DataFrame(
+        [[sum(2**i for i in range(60)), sum(2**i for i in range(52))]],
+        columns=["big", "little"],
+    )
+    with tm.assert_produces_warning(
+        PossiblePrecisionLoss, match="Column converted from int64 to float64"
+    ):
+        df.to_stata(temp_file, write_index=False)
+    reread = read_stata(temp_file)
+    expected_dt = Series([np.float64, np.float64], index=["big", "little"])
+    tm.assert_series_equal(reread.dtypes, expected_dt)
+    assert reread.loc[0, "little"] == df.loc[0, "little"]
+    assert reread.loc[0, "big"] == float(df.loc[0, "big"])
+
+
+def test_compression_roundtrip(compression, temp_file):
+    df = DataFrame(
+        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+        index=["A", "B"],
+        columns=["X", "Y", "Z"],
+    )
+    df.index.name = "index"
+
+    df.to_stata(temp_file, compression=compression)
+    reread = read_stata(temp_file, compression=compression, index_col="index")
+    tm.assert_frame_equal(df, reread)
+
+    # explicitly ensure file was compressed.
+    with tm.decompress_file(temp_file, compression) as fh:
+        contents = io.BytesIO(fh.read())
+    reread = read_stata(contents, index_col="index")
+    tm.assert_frame_equal(df, reread)
+
+
+@pytest.mark.parametrize("to_infer", [True, False])
+@pytest.mark.parametrize("read_infer", [True, False])
+def test_stata_compression(
+    compression_only, read_infer, to_infer, compression_to_extension, tmp_path
+):
+    compression = compression_only
+
+    ext = compression_to_extension[compression]
+    filename = f"test.{ext}"
+
+    df = DataFrame(
+        [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
+        index=["A", "B"],
+        columns=["X", "Y", "Z"],
+    )
+    df.index.name = "index"
+
+    to_compression = "infer" if to_infer else compression
+    read_compression = "infer" if read_infer else compression
+
+    path = tmp_path / filename
+    path.touch()
+    df.to_stata(path, compression=to_compression)
+    result = read_stata(path, compression=read_compression, index_col="index")
+    tm.assert_frame_equal(result, df)
+
+
+def test_non_categorical_value_labels(temp_file):
+    data = DataFrame(
+        {
+            "fully_labelled": [1, 2, 3, 3, 1],
+            "partially_labelled": [1.0, 2.0, np.nan, 9.0, np.nan],
+            "Y": [7, 7, 9, 8, 10],
+            "Z": pd.Categorical(["j", "k", "l", "k", "j"]),
+        }
+    )
+
+    path = temp_file
+    value_labels = {
+        "fully_labelled": {1: "one", 2: "two", 3: "three"},
+        "partially_labelled": {1.0: "one", 2.0: "two"},
+    }
+    expected = {**value_labels, "Z": {0: "j", 1: "k", 2: "l"}}
+
+    writer = StataWriter(path, data, value_labels=value_labels)
+    writer.write_file()
+
+    with StataReader(path) as reader:
+        reader_value_labels = reader.value_labels()
+        assert reader_value_labels == expected
+
+    msg = "Can't create value labels for notY, it wasn't found in the dataset."
+    value_labels = {"notY": {7: "label1", 8: "label2"}}
+    with pytest.raises(KeyError, match=msg):
+        StataWriter(path, data, value_labels=value_labels)
+
+    msg = (
+        "Can't create value labels for Z, value labels "
+        "can only be applied to numeric columns."
+    )
+    value_labels = {"Z": {1: "a", 2: "k", 3: "j", 4: "i"}}
+    with pytest.raises(ValueError, match=msg):
+        StataWriter(path, data, value_labels=value_labels)
+
+
+def test_non_categorical_value_label_name_conversion(temp_file):
+    # Check conversion of invalid variable names
+    data = DataFrame(
+        {
+            "invalid~!": [1, 1, 2, 3, 5, 8],  # Only alphanumeric and _
+            "6_invalid": [1, 1, 2, 3, 5, 8],  # Must start with letter or _
+            "invalid_name_longer_than_32_characters": [8, 8, 9, 9, 8, 8],  # Too long
+            "aggregate": [2, 5, 5, 6, 6, 9],  # Reserved words
+            (1, 2): [1, 2, 3, 4, 5, 6],  # Hashable non-string
+        }
+    )
+
+    value_labels = {
+        "invalid~!": {1: "label1", 2: "label2"},
+        "6_invalid": {1: "label1", 2: "label2"},
+        "invalid_name_longer_than_32_characters": {8: "eight", 9: "nine"},
+        "aggregate": {5: "five"},
+        (1, 2): {3: "three"},
+    }
+
+    expected = {
+        "invalid__": {1: "label1", 2: "label2"},
+        "_6_invalid": {1: "label1", 2: "label2"},
+        "invalid_name_longer_than_32_char": {8: "eight", 9: "nine"},
+        "_aggregate": {5: "five"},
+        "_1__2_": {3: "three"},
+    }
+
+    msg = "Not all pandas column names were valid Stata variable names"
+    with tm.assert_produces_warning(InvalidColumnName, match=msg):
+        data.to_stata(temp_file, value_labels=value_labels)
+
+    with StataReader(temp_file) as reader:
+        reader_value_labels = reader.value_labels()
+        assert reader_value_labels == expected
+
+
+def test_non_categorical_value_label_convert_categoricals_error(temp_file):
+    # Mapping more than one value to the same label is valid for Stata
+    # labels, but can't be read with convert_categoricals=True
+    value_labels = {
+        "repeated_labels": {10: "Ten", 20: "More than ten", 40: "More than ten"}
+    }
+
+    data = DataFrame(
+        {
+            "repeated_labels": [10, 10, 20, 20, 40, 40],
+        }
+    )
+
+    data.to_stata(temp_file, value_labels=value_labels)
+
+    with StataReader(temp_file, convert_categoricals=False) as reader:
+        reader_value_labels = reader.value_labels()
+    assert reader_value_labels == value_labels
+
+    col = "repeated_labels"
+    repeats = "-" * 80 + "\n" + "\n".join(["More than ten"])
+
+    msg = f"""
+Value labels for column {col} are not unique. These cannot be converted to
+pandas categoricals.
+
+Either read the file with `convert_categoricals` set to False or use the
+low level interface in `StataReader` to separately read the values and the
+value_labels.
+
+The repeated labels are:
+{repeats}
+"""
+    with pytest.raises(ValueError, match=msg):
+        read_stata(temp_file, convert_categoricals=True)
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        pd.BooleanDtype,
+        pd.Int8Dtype,
+        pd.Int16Dtype,
+        pd.Int32Dtype,
+        pd.Int64Dtype,
+        pd.UInt8Dtype,
+        pd.UInt16Dtype,
+        pd.UInt32Dtype,
+        pd.UInt64Dtype,
+    ],
+)
+def test_nullable_support(dtype, version, temp_file):
+    df = DataFrame(
+        {
+            "a": Series([1.0, 2.0, 3.0]),
+            "b": Series([1, pd.NA, pd.NA], dtype=dtype.name),
+            "c": Series(["a", "b", None]),
+        }
+    )
+    dtype_name = df.b.dtype.numpy_dtype.name
+    # Only use supported names: no uint, bool or int64
+    dtype_name = dtype_name.replace("u", "")
+    if dtype_name == "int64":
+        dtype_name = "int32"
+    elif dtype_name == "bool":
+        dtype_name = "int8"
+    value = StataMissingValue.BASE_MISSING_VALUES[dtype_name]
+    smv = StataMissingValue(value)
+    expected_b = Series([1, smv, smv], dtype=object, name="b")
+    expected_c = Series(["a", "b", ""], name="c")
+    df.to_stata(temp_file, write_index=False, version=version)
+    reread = read_stata(temp_file, convert_missing=True)
+    tm.assert_series_equal(df.a, reread.a)
+    tm.assert_series_equal(reread.b, expected_b)
+    tm.assert_series_equal(reread.c, expected_c)
+
+
+def test_empty_frame(temp_file):
+    # GH 46240
+    # create an empty DataFrame with int64 and float64 dtypes
+    df = DataFrame(data={"a": range(3), "b": [1.0, 2.0, 3.0]}).head(0)
+    path = temp_file
+    df.to_stata(path, write_index=False, version=117)
+    # Read entire dataframe
+    df2 = read_stata(path)
+    assert "b" in df2
+    # Dtypes don't match since no support for int32
+    dtypes = Series({"a": np.dtype("int32"), "b": np.dtype("float64")})
+    tm.assert_series_equal(df2.dtypes, dtypes)
+    # read one column of empty .dta file
+    df3 = read_stata(path, columns=["a"])
+    assert "b" not in df3
+    tm.assert_series_equal(df3.dtypes, dtypes.loc[["a"]])
+
+
+@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
+def test_many_strl(temp_file, version):
+    n = 65534
+    df = DataFrame(np.arange(n), columns=["col"])
+    lbls = ["".join(v) for v in itertools.product(*([string.ascii_letters] * 3))]
+    value_labels = {"col": {i: lbls[i] for i in range(n)}}
+    df.to_stata(temp_file, value_labels=value_labels, version=version)
+
+
+@pytest.mark.parametrize("version", [117, 118, 119, None])
+def test_strl_missings(temp_file, version):
+    # GH 23633
+    # Check that strl supports None and pd.NA
+    df = DataFrame(
+        [
+            {"str1": "string" * 500, "number": 0},
+            {"str1": None, "number": 1},
+            {"str1": pd.NA, "number": 1},
+        ]
+    )
+    df.to_stata(temp_file, version=version)
+
+
+@pytest.mark.parametrize("version", [117, 118, 119, None])
+def test_ascii_error(temp_file, version):
+    # GH #61583
+    # Check that 2 byte long unicode characters doesn't cause export error
+    df = DataFrame({"doubleByteCol": ["§" * 1500]})
+    df.to_stata(temp_file, write_index=0, version=version)
+    df_input = read_stata(temp_file)
+    tm.assert_frame_equal(df, df_input)
diff --git a/pandas/tests/libs/__init__.py b/pandas/tests/libs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f24f87348595b7416e6717f929b92f0c9df22c4
--- /dev/null
+++ b/pandas/tests/libs/test_hashtable.py
@@ -0,0 +1,782 @@
+from collections import namedtuple
+from collections.abc import Generator
+from contextlib import contextmanager
+import re
+import struct
+import tracemalloc
+
+import numpy as np
+import pytest
+
+from pandas._libs import hashtable as ht
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core.algorithms import isin
+
+
+@contextmanager
+def activated_tracemalloc() -> Generator[None, None, None]:
+    tracemalloc.start()
+    try:
+        yield
+    finally:
+        tracemalloc.stop()
+
+
+def get_allocated_khash_memory():
+    snapshot = tracemalloc.take_snapshot()
+    snapshot = snapshot.filter_traces(
+        (tracemalloc.DomainFilter(True, ht.get_hashtable_trace_domain()),)
+    )
+    return sum(x.size for x in snapshot.traces)
+
+
+@pytest.mark.parametrize(
+    "table_type, dtype",
+    [
+        (ht.PyObjectHashTable, np.object_),
+        (ht.Complex128HashTable, np.complex128),
+        (ht.Int64HashTable, np.int64),
+        (ht.UInt64HashTable, np.uint64),
+        (ht.Float64HashTable, np.float64),
+        (ht.Complex64HashTable, np.complex64),
+        (ht.Int32HashTable, np.int32),
+        (ht.UInt32HashTable, np.uint32),
+        (ht.Float32HashTable, np.float32),
+        (ht.Int16HashTable, np.int16),
+        (ht.UInt16HashTable, np.uint16),
+        (ht.Int8HashTable, np.int8),
+        (ht.UInt8HashTable, np.uint8),
+        (ht.IntpHashTable, np.intp),
+    ],
+)
+class TestHashTable:
+    def test_get_set_contains_len(self, table_type, dtype):
+        index = 5
+        table = table_type(55)
+        assert len(table) == 0
+        assert index not in table
+
+        table.set_item(index, 42)
+        assert len(table) == 1
+        assert index in table
+        assert table.get_item(index) == 42
+
+        table.set_item(index + 1, 41)
+        assert index in table
+        assert index + 1 in table
+        assert len(table) == 2
+        assert table.get_item(index) == 42
+        assert table.get_item(index + 1) == 41
+
+        table.set_item(index, 21)
+        assert index in table
+        assert index + 1 in table
+        assert len(table) == 2
+        assert table.get_item(index) == 21
+        assert table.get_item(index + 1) == 41
+        assert index + 2 not in table
+
+        table.set_item(index + 1, 21)
+        assert index in table
+        assert index + 1 in table
+        assert len(table) == 2
+        assert table.get_item(index) == 21
+        assert table.get_item(index + 1) == 21
+
+        with pytest.raises(KeyError, match=str(index + 2)):
+            table.get_item(index + 2)
+
+    def test_get_set_contains_len_mask(self, table_type, dtype):
+        if table_type == ht.PyObjectHashTable:
+            pytest.skip("Mask not supported for object")
+        index = 5
+        table = table_type(55, uses_mask=True)
+        assert len(table) == 0
+        assert index not in table
+
+        table.set_item(index, 42)
+        assert len(table) == 1
+        assert index in table
+        assert table.get_item(index) == 42
+        with pytest.raises(KeyError, match="NA"):
+            table.get_na()
+
+        table.set_item(index + 1, 41)
+        table.set_na(41)
+        assert pd.NA in table
+        assert index in table
+        assert index + 1 in table
+        assert len(table) == 3
+        assert table.get_item(index) == 42
+        assert table.get_item(index + 1) == 41
+        assert table.get_na() == 41
+
+        table.set_na(21)
+        assert index in table
+        assert index + 1 in table
+        assert len(table) == 3
+        assert table.get_item(index + 1) == 41
+        assert table.get_na() == 21
+        assert index + 2 not in table
+
+        with pytest.raises(KeyError, match=str(index + 2)):
+            table.get_item(index + 2)
+
+    def test_map_keys_to_values(self, table_type, dtype, writable):
+        # only Int64HashTable has this method
+        if table_type == ht.Int64HashTable:
+            N = 77
+            table = table_type()
+            keys = np.arange(N).astype(dtype)
+            vals = np.arange(N).astype(np.int64) + N
+            keys.flags.writeable = writable
+            vals.flags.writeable = writable
+            table.map_keys_to_values(keys, vals)
+            for i in range(N):
+                assert table.get_item(keys[i]) == i + N
+
+    def test_map_locations(self, table_type, dtype, writable):
+        N = 8
+        table = table_type()
+        keys = (np.arange(N) + N).astype(dtype)
+        keys.flags.writeable = writable
+        table.map_locations(keys)
+        for i in range(N):
+            assert table.get_item(keys[i]) == i
+
+    def test_map_locations_mask(self, table_type, dtype, writable):
+        if table_type == ht.PyObjectHashTable:
+            pytest.skip("Mask not supported for object")
+        N = 129  # must be > 128 to test GH#58924
+        table = table_type(uses_mask=True)
+        keys = (np.arange(N) + N).astype(dtype)
+        keys.flags.writeable = writable
+        mask = np.concatenate([np.repeat(False, N - 1), [True]], axis=0)
+        table.map_locations(keys, mask)
+        for i in range(N - 1):
+            assert table.get_item(keys[i]) == i
+
+        with pytest.raises(KeyError, match=re.escape(str(keys[N - 1]))):
+            table.get_item(keys[N - 1])
+
+        assert table.get_na() == N - 1
+
+    def test_lookup(self, table_type, dtype, writable):
+        N = 3
+        table = table_type()
+        keys = (np.arange(N) + N).astype(dtype)
+        keys.flags.writeable = writable
+        table.map_locations(keys)
+        result = table.lookup(keys)
+        expected = np.arange(N)
+        tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64))
+
+    def test_lookup_wrong(self, table_type, dtype):
+        if dtype in (np.int8, np.uint8):
+            N = 100
+        else:
+            N = 512
+        table = table_type()
+        keys = (np.arange(N) + N).astype(dtype)
+        table.map_locations(keys)
+        wrong_keys = np.arange(N).astype(dtype)
+        result = table.lookup(wrong_keys)
+        assert np.all(result == -1)
+
+    def test_lookup_mask(self, table_type, dtype, writable):
+        if table_type == ht.PyObjectHashTable:
+            pytest.skip("Mask not supported for object")
+        N = 3
+        table = table_type(uses_mask=True)
+        keys = (np.arange(N) + N).astype(dtype)
+        mask = np.array([False, True, False])
+        keys.flags.writeable = writable
+        table.map_locations(keys, mask)
+        result = table.lookup(keys, mask)
+        expected = np.arange(N)
+        tm.assert_numpy_array_equal(result.astype(np.int64), expected.astype(np.int64))
+
+        result = table.lookup(np.array([1 + N]).astype(dtype), np.array([False]))
+        tm.assert_numpy_array_equal(
+            result.astype(np.int64), np.array([-1], dtype=np.int64)
+        )
+
+    def test_unique(self, table_type, dtype, writable):
+        if dtype in (np.int8, np.uint8):
+            N = 88
+        else:
+            N = 1000
+        table = table_type()
+        expected = (np.arange(N) + N).astype(dtype)
+        keys = np.repeat(expected, 5)
+        keys.flags.writeable = writable
+        unique = table.unique(keys)
+        tm.assert_numpy_array_equal(unique, expected)
+
+    def test_tracemalloc_works(self, table_type, dtype):
+        if dtype in (np.int8, np.uint8):
+            N = 256
+        else:
+            N = 30000
+        keys = np.arange(N).astype(dtype)
+        with activated_tracemalloc():
+            table = table_type()
+            table.map_locations(keys)
+            used = get_allocated_khash_memory()
+            my_size = table.sizeof()
+            assert used == my_size
+            del table
+            assert get_allocated_khash_memory() == 0
+
+    def test_tracemalloc_for_empty(self, table_type, dtype):
+        with activated_tracemalloc():
+            table = table_type()
+            used = get_allocated_khash_memory()
+            my_size = table.sizeof()
+            assert used == my_size
+            del table
+            assert get_allocated_khash_memory() == 0
+
+    def test_get_state(self, table_type, dtype):
+        table = table_type(1000)
+        state = table.get_state()
+        assert state["size"] == 0
+        assert state["n_occupied"] == 0
+        assert "n_buckets" in state
+        assert "upper_bound" in state
+
+    @pytest.mark.parametrize("N", range(1, 110, 4))
+    def test_no_reallocation(self, table_type, dtype, N):
+        keys = np.arange(N).astype(dtype)
+        preallocated_table = table_type(N)
+        n_buckets_start = preallocated_table.get_state()["n_buckets"]
+        preallocated_table.map_locations(keys)
+        n_buckets_end = preallocated_table.get_state()["n_buckets"]
+        # original number of buckets was enough:
+        assert n_buckets_start == n_buckets_end
+        # check with clean table (not too much preallocated)
+        clean_table = table_type()
+        clean_table.map_locations(keys)
+        assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
+
+class TestHashTableUnsorted:
+    # TODO: moved from test_algos; may be redundancies with other tests
+    def test_string_hashtable_set_item_signature(self):
+        # GH#30419 fix typing in StringHashTable.set_item to prevent segfault
+        tbl = ht.StringHashTable()
+
+        tbl.set_item("key", 1)
+        assert tbl.get_item("key") == 1
+
+        with pytest.raises(TypeError, match="'key' has incorrect type"):
+            # key arg typed as string, not object
+            tbl.set_item(4, 6)
+        with pytest.raises(TypeError, match="'val' has incorrect type"):
+            tbl.get_item(4)
+
+    def test_lookup_nan(self, writable):
+        # GH#21688 ensure we can deal with readonly memory views
+        xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3])
+        xs.setflags(write=writable)
+        m = ht.Float64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    def test_add_signed_zeros(self):
+        # GH#21866 inconsistent hash-function for float64
+        # default hash-function would lead to different hash-buckets
+        # for 0.0 and -0.0 if there are more than 2^30 hash-buckets
+        # but this would mean 16GB
+        N = 4  # 12 * 10**8 would trigger the error, if you have enough memory
+        m = ht.Float64HashTable(N)
+        m.set_item(0.0, 0)
+        m.set_item(-0.0, 0)
+        assert len(m) == 1  # 0.0 and -0.0 are equivalent
+
+    def test_add_different_nans(self):
+        # GH#21866 inconsistent hash-function for float64
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        # default hash function would lead to different hash-buckets
+        # for NAN1 and NAN2 even if there are only 4 buckets:
+        m = ht.Float64HashTable()
+        m.set_item(NAN1, 0)
+        m.set_item(NAN2, 0)
+        assert len(m) == 1  # NAN1 and NAN2 are equivalent
+
+    def test_lookup_overflow(self, writable):
+        xs = np.array([1, 2, 2**63], dtype=np.uint64)
+        # GH 21688 ensure we can deal with readonly memory views
+        xs.setflags(write=writable)
+        m = ht.UInt64HashTable()
+        m.map_locations(xs)
+        tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp))
+
+    @pytest.mark.parametrize("nvals", [0, 10])  # resizing to 0 is special case
+    @pytest.mark.parametrize(
+        "htable, uniques, dtype, safely_resizes",
+        [
+            (ht.PyObjectHashTable, ht.ObjectVector, "object", False),
+            (ht.StringHashTable, ht.ObjectVector, "object", True),
+            (ht.Float64HashTable, ht.Float64Vector, "float64", False),
+            (ht.Int64HashTable, ht.Int64Vector, "int64", False),
+            (ht.Int32HashTable, ht.Int32Vector, "int32", False),
+            (ht.UInt64HashTable, ht.UInt64Vector, "uint64", False),
+        ],
+    )
+    def test_vector_resize(
+        self, writable, htable, uniques, dtype, safely_resizes, nvals
+    ):
+        # Test for memory errors after internal vector
+        # reallocations (GH 7157)
+        # Changed from using np.random.default_rng(2).rand to range
+        # which could cause flaky CI failures when safely_resizes=False
+        vals = np.array(range(1000), dtype=dtype)
+
+        # GH 21688 ensures we can deal with read-only memory views
+        vals.setflags(write=writable)
+
+        # initialise instances; cannot initialise in parametrization,
+        # as otherwise external views would be held on the array (which is
+        # one of the things this test is checking)
+        htable = htable()
+        uniques = uniques()
+
+        # get_labels may append to uniques
+        htable.get_labels(vals[:nvals], uniques, 0, -1)
+        # to_array() sets an external_view_exists flag on uniques.
+        tmp = uniques.to_array()
+        oldshape = tmp.shape
+
+        # subsequent get_labels() calls can no longer append to it
+        # (except for StringHashTables + ObjectVector)
+        if safely_resizes:
+            htable.get_labels(vals, uniques, 0, -1)
+        else:
+            with pytest.raises(ValueError, match="external reference.*"):
+                htable.get_labels(vals, uniques, 0, -1)
+
+        uniques.to_array()  # should not raise here
+        assert tmp.shape == oldshape
+
+    @pytest.mark.parametrize(
+        "hashtable",
+        [
+            ht.PyObjectHashTable,
+            ht.StringHashTable,
+            ht.Float64HashTable,
+            ht.Int64HashTable,
+            ht.Int32HashTable,
+            ht.UInt64HashTable,
+        ],
+    )
+    def test_hashtable_large_sizehint(self, hashtable):
+        # GH#22729 smoketest for not raising when passing a large size_hint
+        size_hint = np.iinfo(np.uint32).max + 1
+        hashtable(size_hint=size_hint)
+
+
+class TestPyObjectHashTableWithNans:
+    def test_nan_float(self):
+        nan1 = float("nan")
+        nan2 = float("nan")
+        assert nan1 is not nan2
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+
+    def test_nan_complex_both(self):
+        nan1 = complex(float("nan"), float("nan"))
+        nan2 = complex(float("nan"), float("nan"))
+        assert nan1 is not nan2
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+
+    def test_nan_complex_real(self):
+        nan1 = complex(float("nan"), 1)
+        nan2 = complex(float("nan"), 1)
+        other = complex(float("nan"), 2)
+        assert nan1 is not nan2
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
+            table.get_item(other)
+
+    def test_nan_complex_imag(self):
+        nan1 = complex(1, float("nan"))
+        nan2 = complex(1, float("nan"))
+        other = complex(2, float("nan"))
+        assert nan1 is not nan2
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
+            table.get_item(other)
+
+    def test_nan_in_tuple(self):
+        nan1 = (float("nan"),)
+        nan2 = (float("nan"),)
+        assert nan1[0] is not nan2[0]
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+
+    def test_nan_in_nested_tuple(self):
+        nan1 = (1, (2, (float("nan"),)))
+        nan2 = (1, (2, (float("nan"),)))
+        other = (1, 2)
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
+            table.get_item(other)
+
+    def test_nan_in_namedtuple(self):
+        T = namedtuple("T", ["x"])
+        nan1 = T(float("nan"))
+        nan2 = T(float("nan"))
+        assert nan1.x is not nan2.x
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+
+    def test_nan_in_nested_namedtuple(self):
+        T = namedtuple("T", ["x", "y"])
+        nan1 = T(1, (2, (float("nan"),)))
+        nan2 = T(1, (2, (float("nan"),)))
+        other = T(1, 2)
+        table = ht.PyObjectHashTable()
+        table.set_item(nan1, 42)
+        assert table.get_item(nan2) == 42
+        with pytest.raises(KeyError, match=re.escape(repr(other))):
+            table.get_item(other)
+
+
+def test_hash_equal_tuple_with_nans():
+    a = (float("nan"), (float("nan"), float("nan")))
+    b = (float("nan"), (float("nan"), float("nan")))
+    assert ht.object_hash(a) == ht.object_hash(b)
+    assert ht.objects_are_equal(a, b)
+
+
+def test_hash_equal_namedtuple_with_nans():
+    T = namedtuple("T", ["x", "y"])
+    a = T(float("nan"), (float("nan"), float("nan")))
+    b = T(float("nan"), (float("nan"), float("nan")))
+    assert ht.object_hash(a) == ht.object_hash(b)
+    assert ht.objects_are_equal(a, b)
+
+
+def test_hash_equal_namedtuple_and_tuple():
+    T = namedtuple("T", ["x", "y"])
+    a = T(1, (2, 3))
+    b = (1, (2, 3))
+    assert ht.object_hash(a) == ht.object_hash(b)
+    assert ht.objects_are_equal(a, b)
+
+
+def test_get_labels_groupby_for_Int64(writable):
+    table = ht.Int64HashTable()
+    vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64)
+    vals.flags.writeable = writable
+    arr, unique = table.get_labels_groupby(vals)
+    expected_arr = np.array([0, 1, -1, 1, 0, -1], dtype=np.intp)
+    expected_unique = np.array([1, 2], dtype=np.int64)
+    tm.assert_numpy_array_equal(arr, expected_arr)
+    tm.assert_numpy_array_equal(unique, expected_unique)
+
+
+def test_tracemalloc_works_for_StringHashTable():
+    N = 1000
+    keys = np.arange(N).astype(np.str_).astype(np.object_)
+    with activated_tracemalloc():
+        table = ht.StringHashTable()
+        table.map_locations(keys)
+        used = get_allocated_khash_memory()
+        my_size = table.sizeof()
+        assert used == my_size
+        del table
+        assert get_allocated_khash_memory() == 0
+
+
+def test_tracemalloc_for_empty_StringHashTable():
+    with activated_tracemalloc():
+        table = ht.StringHashTable()
+        used = get_allocated_khash_memory()
+        my_size = table.sizeof()
+        assert used == my_size
+        del table
+        assert get_allocated_khash_memory() == 0
+
+
+@pytest.mark.parametrize("N", range(1, 110, 4))
+def test_no_reallocation_StringHashTable(N):
+    keys = np.arange(N).astype(np.str_).astype(np.object_)
+    preallocated_table = ht.StringHashTable(N)
+    n_buckets_start = preallocated_table.get_state()["n_buckets"]
+    preallocated_table.map_locations(keys)
+    n_buckets_end = preallocated_table.get_state()["n_buckets"]
+    # original number of buckets was enough:
+    assert n_buckets_start == n_buckets_end
+    # check with clean table (not too much preallocated)
+    clean_table = ht.StringHashTable()
+    clean_table.map_locations(keys)
+    assert n_buckets_start == clean_table.get_state()["n_buckets"]
+
+
+@pytest.mark.parametrize(
+    "table_type, dtype",
+    [
+        (ht.Float64HashTable, np.float64),
+        (ht.Float32HashTable, np.float32),
+        (ht.Complex128HashTable, np.complex128),
+        (ht.Complex64HashTable, np.complex64),
+    ],
+)
+class TestHashTableWithNans:
+    def test_get_set_contains_len(self, table_type, dtype):
+        index = float("nan")
+        table = table_type()
+        assert index not in table
+
+        table.set_item(index, 42)
+        assert len(table) == 1
+        assert index in table
+        assert table.get_item(index) == 42
+
+        table.set_item(index, 41)
+        assert len(table) == 1
+        assert index in table
+        assert table.get_item(index) == 41
+
+    def test_map_locations(self, table_type, dtype):
+        N = 10
+        table = table_type()
+        keys = np.full(N, np.nan, dtype=dtype)
+        table.map_locations(keys)
+        assert len(table) == 1
+        assert table.get_item(np.nan) == N - 1
+
+    def test_unique(self, table_type, dtype):
+        N = 1020
+        table = table_type()
+        keys = np.full(N, np.nan, dtype=dtype)
+        unique = table.unique(keys)
+        assert np.all(np.isnan(unique)) and len(unique) == 1
+
+
+def test_unique_for_nan_objects_floats():
+    table = ht.PyObjectHashTable()
+    keys = np.array([float("nan") for i in range(50)], dtype=np.object_)
+    unique = table.unique(keys)
+    assert len(unique) == 1
+
+
+def test_unique_for_nan_objects_complex():
+    table = ht.PyObjectHashTable()
+    keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_)
+    unique = table.unique(keys)
+    assert len(unique) == 1
+
+
+def test_unique_for_nan_objects_tuple():
+    table = ht.PyObjectHashTable()
+    keys = np.array(
+        [1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_
+    )
+    unique = table.unique(keys)
+    assert len(unique) == 2
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.object_,
+        np.complex128,
+        np.int64,
+        np.uint64,
+        np.float64,
+        np.complex64,
+        np.int32,
+        np.uint32,
+        np.float32,
+        np.int16,
+        np.uint16,
+        np.int8,
+        np.uint8,
+        np.intp,
+    ],
+)
+class TestHelpFunctions:
+    def test_value_count(self, dtype, writable):
+        N = 43
+        expected = (np.arange(N) + N).astype(dtype)
+        values = np.repeat(expected, 5)
+        values.flags.writeable = writable
+        keys, counts, _ = ht.value_count(values, False)
+        tm.assert_numpy_array_equal(np.sort(keys), expected)
+        assert np.all(counts == 5)
+
+    def test_value_count_mask(self, dtype):
+        if dtype == np.object_:
+            pytest.skip("mask not implemented for object dtype")
+        values = np.array([1] * 5, dtype=dtype)
+        mask = np.zeros((5,), dtype=np.bool_)
+        mask[1] = True
+        mask[4] = True
+        keys, counts, na_counter = ht.value_count(values, False, mask=mask)
+        assert len(keys) == 2
+        assert na_counter == 2
+
+    def test_value_count_stable(self, dtype, writable):
+        # GH12679
+        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
+        values.flags.writeable = writable
+        keys, counts, _ = ht.value_count(values, False)
+        tm.assert_numpy_array_equal(keys, values)
+        assert np.all(counts == 1)
+
+    def test_duplicated_first(self, dtype, writable):
+        N = 100
+        values = np.repeat(np.arange(N).astype(dtype), 5)
+        values.flags.writeable = writable
+        result = ht.duplicated(values)
+        expected = np.ones_like(values, dtype=np.bool_)
+        expected[::5] = False
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_ismember_yes(self, dtype, writable):
+        N = 127
+        arr = np.arange(N).astype(dtype)
+        values = np.arange(N).astype(dtype)
+        arr.flags.writeable = writable
+        values.flags.writeable = writable
+        result = ht.ismember(arr, values)
+        expected = np.ones_like(values, dtype=np.bool_)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_ismember_no(self, dtype):
+        N = 17
+        arr = np.arange(N).astype(dtype)
+        values = (np.arange(N) + N).astype(dtype)
+        result = ht.ismember(arr, values)
+        expected = np.zeros_like(values, dtype=np.bool_)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_mode(self, dtype, writable):
+        if dtype in (np.int8, np.uint8):
+            N = 53
+        else:
+            N = 11111
+        values = np.repeat(np.arange(N).astype(dtype), 5)
+        values[0] = 42
+        values.flags.writeable = writable
+        result = ht.mode(values, False)[0]
+        assert result == 42
+
+    def test_mode_stable(self, dtype, writable):
+        values = np.array([2, 1, 5, 22, 3, -1, 8]).astype(dtype)
+        values.flags.writeable = writable
+        keys = ht.mode(values, False)[0]
+        tm.assert_numpy_array_equal(keys, values)
+
+
+def test_modes_with_nans():
+    # GH42688, nans aren't mangled
+    nulls = [pd.NA, np.nan, pd.NaT, None]
+    values = np.array([True] + nulls * 2, dtype=np.object_)
+    modes = ht.mode(values, False)[0]
+    assert modes.size == len(nulls)
+
+
+def test_unique_label_indices_intp(writable):
+    keys = np.array([1, 2, 2, 2, 1, 3], dtype=np.intp)
+    keys.flags.writeable = writable
+    result = ht.unique_label_indices(keys)
+    expected = np.array([0, 1, 5], dtype=np.intp)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_unique_label_indices():
+    a = np.random.default_rng(2).integers(1, 1 << 10, 1 << 15).astype(np.intp)
+
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1]
+
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+    a[np.random.default_rng(2).choice(len(a), 10)] = -1
+    left = ht.unique_label_indices(a)
+    right = np.unique(a, return_index=True)[1][1:]
+    tm.assert_numpy_array_equal(left, right, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.float64,
+        np.float32,
+        np.complex128,
+        np.complex64,
+    ],
+)
+class TestHelpFunctionsWithNans:
+    def test_value_count(self, dtype):
+        values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
+        keys, counts, _ = ht.value_count(values, True)
+        assert len(keys) == 0
+        keys, counts, _ = ht.value_count(values, False)
+        assert len(keys) == 1 and np.all(np.isnan(keys))
+        assert counts[0] == 3
+
+    def test_duplicated_first(self, dtype):
+        values = np.array([np.nan, np.nan, np.nan], dtype=dtype)
+        result = ht.duplicated(values)
+        expected = np.array([False, True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_ismember_yes(self, dtype):
+        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
+        values = np.array([np.nan, np.nan], dtype=dtype)
+        result = ht.ismember(arr, values)
+        expected = np.array([True, True, True], dtype=np.bool_)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_ismember_no(self, dtype):
+        arr = np.array([np.nan, np.nan, np.nan], dtype=dtype)
+        values = np.array([1], dtype=dtype)
+        result = ht.ismember(arr, values)
+        expected = np.array([False, False, False], dtype=np.bool_)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_mode(self, dtype):
+        values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype)
+        assert ht.mode(values, True)[0] == 42
+        assert np.isnan(ht.mode(values, False)[0])
+
+
+def test_ismember_tuple_with_nans():
+    # GH-41836
+    values = np.empty(2, dtype=object)
+    values[:] = [("a", float("nan")), ("b", 1)]
+    comps = [("a", float("nan"))]
+
+    result = isin(values, comps)
+    expected = np.array([True, False], dtype=np.bool_)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_float_complex_int_are_equal_as_objects():
+    values = ["a", 5, 5.0, 5.0 + 0j]
+    comps = list(range(129))
+    result = isin(np.array(values, dtype=object), np.asarray(comps))
+    expected = np.array([False, True, True, True], dtype=np.bool_)
+    tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/libs/test_join.py b/pandas/tests/libs/test_join.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf8b4fabc54cbab338e9b4a3a061eb09b19714da
--- /dev/null
+++ b/pandas/tests/libs/test_join.py
@@ -0,0 +1,388 @@
+import numpy as np
+import pytest
+
+from pandas._libs import join as libjoin
+from pandas._libs.join import (
+    inner_join,
+    left_outer_join,
+)
+
+import pandas._testing as tm
+
+
+class TestIndexer:
+    @pytest.mark.parametrize(
+        "dtype", ["int32", "int64", "float32", "float64", "object"]
+    )
+    def test_outer_join_indexer(self, dtype):
+        indexer = libjoin.outer_join_indexer
+
+        left = np.arange(3, dtype=dtype)
+        right = np.arange(2, 5, dtype=dtype)
+        empty = np.array([], dtype=dtype)
+
+        result, lindexer, rindexer = indexer(left, right)
+        assert isinstance(result, np.ndarray)
+        assert isinstance(lindexer, np.ndarray)
+        assert isinstance(rindexer, np.ndarray)
+        tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
+        exp = np.array([0, 1, 2, -1, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(lindexer, exp)
+        exp = np.array([-1, -1, 0, 1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(rindexer, exp)
+
+        result, lindexer, rindexer = indexer(empty, right)
+        tm.assert_numpy_array_equal(result, right)
+        exp = np.array([-1, -1, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(lindexer, exp)
+        exp = np.array([0, 1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(rindexer, exp)
+
+        result, lindexer, rindexer = indexer(left, empty)
+        tm.assert_numpy_array_equal(result, left)
+        exp = np.array([0, 1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(lindexer, exp)
+        exp = np.array([-1, -1, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(rindexer, exp)
+
+    def test_cython_left_outer_join(self):
+        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
+        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
+        max_group = 5
+
+        ls, rs = left_outer_join(left, right, max_group)
+
+        exp_ls = left.argsort(kind="mergesort")
+        exp_rs = right.argsort(kind="mergesort")
+
+        exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
+        exp_ri = np.array(
+            [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]
+        )
+
+        exp_ls = exp_ls.take(exp_li)
+        exp_ls[exp_li == -1] = -1
+
+        exp_rs = exp_rs.take(exp_ri)
+        exp_rs[exp_ri == -1] = -1
+
+        tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
+        tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
+
+    def test_cython_right_outer_join(self):
+        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
+        right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
+        max_group = 5
+
+        rs, ls = left_outer_join(right, left, max_group)
+
+        exp_ls = left.argsort(kind="mergesort")
+        exp_rs = right.argsort(kind="mergesort")
+
+        #            0        1        1        1
+        exp_li = np.array(
+            [
+                0,
+                1,
+                2,
+                3,
+                4,
+                5,
+                3,
+                4,
+                5,
+                3,
+                4,
+                5,
+                #            2        2        4
+                6,
+                7,
+                8,
+                6,
+                7,
+                8,
+                -1,
+            ]
+        )
+        exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])
+
+        exp_ls = exp_ls.take(exp_li)
+        exp_ls[exp_li == -1] = -1
+
+        exp_rs = exp_rs.take(exp_ri)
+        exp_rs[exp_ri == -1] = -1
+
+        tm.assert_numpy_array_equal(ls, exp_ls)
+        tm.assert_numpy_array_equal(rs, exp_rs)
+
+    def test_cython_inner_join(self):
+        left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
+        right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
+        max_group = 5
+
+        ls, rs = inner_join(left, right, max_group)
+
+        exp_ls = left.argsort(kind="mergesort")
+        exp_rs = right.argsort(kind="mergesort")
+
+        exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
+        exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])
+
+        exp_ls = exp_ls.take(exp_li)
+        exp_ls[exp_li == -1] = -1
+
+        exp_rs = exp_rs.take(exp_ri)
+        exp_rs[exp_ri == -1] = -1
+
+        tm.assert_numpy_array_equal(ls, exp_ls)
+        tm.assert_numpy_array_equal(rs, exp_rs)
+
+
+def test_left_join_indexer_unique(writable):
+    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+    b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
+    # GH#37312, GH#37264
+    a.setflags(write=writable)
+    b.setflags(write=writable)
+
+    result = libjoin.left_join_indexer_unique(b, a)
+    expected = np.array([1, 1, 2, 3, 3], dtype=np.intp)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_left_outer_join_bug():
+    left = np.array(
+        [
+            0,
+            1,
+            0,
+            1,
+            1,
+            2,
+            3,
+            1,
+            0,
+            2,
+            1,
+            2,
+            0,
+            1,
+            1,
+            2,
+            3,
+            2,
+            3,
+            2,
+            1,
+            1,
+            3,
+            0,
+            3,
+            2,
+            3,
+            0,
+            0,
+            2,
+            3,
+            2,
+            0,
+            3,
+            1,
+            3,
+            0,
+            1,
+            3,
+            0,
+            0,
+            1,
+            0,
+            3,
+            1,
+            0,
+            1,
+            0,
+            1,
+            1,
+            0,
+            2,
+            2,
+            2,
+            2,
+            2,
+            0,
+            3,
+            1,
+            2,
+            0,
+            0,
+            3,
+            1,
+            3,
+            2,
+            2,
+            0,
+            1,
+            3,
+            0,
+            2,
+            3,
+            2,
+            3,
+            3,
+            2,
+            3,
+            3,
+            1,
+            3,
+            2,
+            0,
+            0,
+            3,
+            1,
+            1,
+            1,
+            0,
+            2,
+            3,
+            3,
+            1,
+            2,
+            0,
+            3,
+            1,
+            2,
+            0,
+            2,
+        ],
+        dtype=np.intp,
+    )
+
+    right = np.array([3, 1], dtype=np.intp)
+    max_groups = 4
+
+    lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)
+
+    exp_lidx = np.arange(len(left), dtype=np.intp)
+    exp_ridx = -np.ones(len(left), dtype=np.intp)
+
+    exp_ridx[left == 1] = 1
+    exp_ridx[left == 3] = 0
+
+    tm.assert_numpy_array_equal(lidx, exp_lidx)
+    tm.assert_numpy_array_equal(ridx, exp_ridx)
+
+
+def test_inner_join_indexer():
+    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
+
+    index, ares, bres = libjoin.inner_join_indexer(a, b)
+
+    index_exp = np.array([3, 5], dtype=np.int64)
+    tm.assert_almost_equal(index, index_exp)
+
+    aexp = np.array([2, 4], dtype=np.intp)
+    bexp = np.array([1, 2], dtype=np.intp)
+    tm.assert_almost_equal(ares, aexp)
+    tm.assert_almost_equal(bres, bexp)
+
+    a = np.array([5], dtype=np.int64)
+    b = np.array([5], dtype=np.int64)
+
+    index, ares, bres = libjoin.inner_join_indexer(a, b)
+    tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
+    tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
+    tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
+
+
+def test_outer_join_indexer():
+    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
+
+    index, ares, bres = libjoin.outer_join_indexer(a, b)
+
+    index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
+    tm.assert_almost_equal(index, index_exp)
+
+    aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.intp)
+    bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
+    tm.assert_almost_equal(ares, aexp)
+    tm.assert_almost_equal(bres, bexp)
+
+    a = np.array([5], dtype=np.int64)
+    b = np.array([5], dtype=np.int64)
+
+    index, ares, bres = libjoin.outer_join_indexer(a, b)
+    tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
+    tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
+    tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
+
+
+def test_left_join_indexer():
+    a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
+    b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
+
+    index, ares, bres = libjoin.left_join_indexer(a, b)
+
+    tm.assert_almost_equal(index, a)
+
+    aexp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
+    bexp = np.array([-1, -1, 1, -1, 2], dtype=np.intp)
+    tm.assert_almost_equal(ares, aexp)
+    tm.assert_almost_equal(bres, bexp)
+
+    a = np.array([5], dtype=np.int64)
+    b = np.array([5], dtype=np.int64)
+
+    index, ares, bres = libjoin.left_join_indexer(a, b)
+    tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
+    tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
+    tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
+
+
+def test_left_join_indexer2():
+    idx = np.array([1, 1, 2, 5], dtype=np.int64)
+    idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
+
+    res, lidx, ridx = libjoin.left_join_indexer(idx2, idx)
+
+    exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
+    tm.assert_almost_equal(res, exp_res)
+
+    exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
+    tm.assert_almost_equal(lidx, exp_lidx)
+
+    exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
+    tm.assert_almost_equal(ridx, exp_ridx)
+
+
+def test_outer_join_indexer2():
+    idx = np.array([1, 1, 2, 5], dtype=np.int64)
+    idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
+
+    res, lidx, ridx = libjoin.outer_join_indexer(idx2, idx)
+
+    exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
+    tm.assert_almost_equal(res, exp_res)
+
+    exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
+    tm.assert_almost_equal(lidx, exp_lidx)
+
+    exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
+    tm.assert_almost_equal(ridx, exp_ridx)
+
+
+def test_inner_join_indexer2():
+    idx = np.array([1, 1, 2, 5], dtype=np.int64)
+    idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
+
+    res, lidx, ridx = libjoin.inner_join_indexer(idx2, idx)
+
+    exp_res = np.array([1, 1, 2, 5], dtype=np.int64)
+    tm.assert_almost_equal(res, exp_res)
+
+    exp_lidx = np.array([0, 0, 1, 2], dtype=np.intp)
+    tm.assert_almost_equal(lidx, exp_lidx)
+
+    exp_ridx = np.array([0, 1, 2, 3], dtype=np.intp)
+    tm.assert_almost_equal(ridx, exp_ridx)
diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..792b2ef121cf2f0bd8266050217031b1c04f06c2
--- /dev/null
+++ b/pandas/tests/libs/test_lib.py
@@ -0,0 +1,358 @@
+import pickle
+
+import numpy as np
+import pytest
+
+from pandas._libs import (
+    Timedelta,
+    lib,
+    writers as libwriters,
+)
+from pandas.compat import IS64
+
+from pandas import Index
+import pandas._testing as tm
+
+
+class TestMisc:
+    def test_max_len_string_array(self):
+        arr = a = np.array(["foo", "b", np.nan], dtype="object")
+        assert libwriters.max_len_string_array(arr) == 3
+
+        # unicode
+        arr = a.astype("U").astype(object)
+        assert libwriters.max_len_string_array(arr) == 3
+
+        # bytes for python3
+        arr = a.astype("S").astype(object)
+        assert libwriters.max_len_string_array(arr) == 3
+
+        # raises
+        msg = "No matching signature found"
+        with pytest.raises(TypeError, match=msg):
+            libwriters.max_len_string_array(arr.astype("U"))
+
+    def test_fast_unique_multiple_list_gen_sort(self):
+        keys = [["p", "a"], ["n", "d"], ["a", "s"]]
+
+        gen = (key for key in keys)
+        expected = np.array(["a", "d", "n", "p", "s"])
+        out = lib.fast_unique_multiple_list_gen(gen, sort=True)
+        tm.assert_numpy_array_equal(np.array(out), expected)
+
+        gen = (key for key in keys)
+        expected = np.array(["p", "a", "n", "d", "s"])
+        out = lib.fast_unique_multiple_list_gen(gen, sort=False)
+        tm.assert_numpy_array_equal(np.array(out), expected)
+
+    def test_fast_multiget_timedelta_resos(self):
+        # This will become relevant for test_constructor_dict_timedelta64_index
+        #  once Timedelta constructor preserves reso when passed a
+        #  np.timedelta64 object
+        td = Timedelta(days=1)
+
+        mapping1 = {td: 1}
+        mapping2 = {td.as_unit("s"): 1}
+
+        oindex = Index([td * n for n in range(3)])._values.astype(object)
+
+        expected = lib.fast_multiget(mapping1, oindex)
+        result = lib.fast_multiget(mapping2, oindex)
+        tm.assert_numpy_array_equal(result, expected)
+
+        # case that can't be cast to td64ns
+        td = Timedelta(np.timedelta64(146000, "D"))
+        assert hash(td) == hash(td.as_unit("ms"))
+        assert hash(td) == hash(td.as_unit("us"))
+        mapping1 = {td: 1}
+        mapping2 = {td.as_unit("ms"): 1}
+
+        oindex = Index([td * n for n in range(3)])._values.astype(object)
+
+        expected = lib.fast_multiget(mapping1, oindex)
+        result = lib.fast_multiget(mapping2, oindex)
+        tm.assert_numpy_array_equal(result, expected)
+
+
+class TestIndexing:
+    def test_maybe_indices_to_slice_left_edge(self):
+        target = np.arange(100)
+
+        # slice
+        indices = np.array([], dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize("end", [1, 2, 5, 20, 99])
+    @pytest.mark.parametrize("step", [1, 2, 4])
+    def test_maybe_indices_to_slice_left_edge_not_slice_end_steps(self, end, step):
+        target = np.arange(100)
+        indices = np.arange(0, end, step, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+        # reverse
+        indices = indices[::-1]
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize(
+        "case", [[2, 1, 2, 0], [2, 2, 1, 0], [0, 1, 2, 1], [-2, 0, 2], [2, 0, -2]]
+    )
+    def test_maybe_indices_to_slice_left_edge_not_slice(self, case):
+        # not slice
+        target = np.arange(100)
+        indices = np.array(case, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert not isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(maybe_slice, indices)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize("start", [0, 2, 5, 20, 97, 98])
+    @pytest.mark.parametrize("step", [1, 2, 4])
+    def test_maybe_indices_to_slice_right_edge(self, start, step):
+        target = np.arange(100)
+
+        # slice
+        indices = np.arange(start, 99, step, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+        # reverse
+        indices = indices[::-1]
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    def test_maybe_indices_to_slice_right_edge_not_slice(self):
+        # not slice
+        target = np.arange(100)
+        indices = np.array([97, 98, 99, 100], dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert not isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(maybe_slice, indices)
+
+        msg = "index 100 is out of bounds for axis (0|1) with size 100"
+
+        with pytest.raises(IndexError, match=msg):
+            target[indices]
+        with pytest.raises(IndexError, match=msg):
+            target[maybe_slice]
+
+        indices = np.array([100, 99, 98, 97], dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert not isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(maybe_slice, indices)
+
+        with pytest.raises(IndexError, match=msg):
+            target[indices]
+        with pytest.raises(IndexError, match=msg):
+            target[maybe_slice]
+
+    @pytest.mark.parametrize(
+        "case", [[99, 97, 99, 96], [99, 99, 98, 97], [98, 98, 97, 96]]
+    )
+    def test_maybe_indices_to_slice_right_edge_cases(self, case):
+        target = np.arange(100)
+        indices = np.array(case, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert not isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(maybe_slice, indices)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize("step", [1, 2, 4, 5, 8, 9])
+    def test_maybe_indices_to_slice_both_edges(self, step):
+        target = np.arange(10)
+
+        # slice
+        indices = np.arange(0, 9, step, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+        # reverse
+        indices = indices[::-1]
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize("case", [[4, 2, 0, -2], [2, 2, 1, 0], [0, 1, 2, 1]])
+    def test_maybe_indices_to_slice_both_edges_not_slice(self, case):
+        # not slice
+        target = np.arange(10)
+        indices = np.array(case, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+        assert not isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(maybe_slice, indices)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize("start, end", [(2, 10), (5, 25), (65, 97)])
+    @pytest.mark.parametrize("step", [1, 2, 4, 20])
+    def test_maybe_indices_to_slice_middle(self, start, end, step):
+        target = np.arange(100)
+
+        # slice
+        indices = np.arange(start, end, step, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+        # reverse
+        indices = indices[::-1]
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    @pytest.mark.parametrize(
+        "case", [[14, 12, 10, 12], [12, 12, 11, 10], [10, 11, 12, 11]]
+    )
+    def test_maybe_indices_to_slice_middle_not_slice(self, case):
+        # not slice
+        target = np.arange(100)
+        indices = np.array(case, dtype=np.intp)
+        maybe_slice = lib.maybe_indices_to_slice(indices, len(target))
+
+        assert not isinstance(maybe_slice, slice)
+        tm.assert_numpy_array_equal(maybe_slice, indices)
+        tm.assert_numpy_array_equal(target[indices], target[maybe_slice])
+
+    def test_maybe_booleans_to_slice(self):
+        arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8)
+        result = lib.maybe_booleans_to_slice(arr)
+        assert result.dtype == np.bool_
+
+        result = lib.maybe_booleans_to_slice(arr[:0])
+        assert result == slice(0, 0)
+
+    def test_get_reverse_indexer(self):
+        indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.intp)
+        result = lib.get_reverse_indexer(indexer, 5)
+        expected = np.array([4, 2, 3, 6, 7], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_is_range_indexer(self, dtype):
+        # GH#50592
+        left = np.arange(0, 100, dtype=dtype)
+        assert lib.is_range_indexer(left, 100)
+
+    @pytest.mark.skipif(
+        not IS64,
+        reason="2**31 is too big for Py_ssize_t on 32-bit. "
+        "It doesn't matter though since you cannot create an array that long on 32-bit",
+    )
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_is_range_indexer_big_n(self, dtype):
+        # GH53616
+        left = np.arange(0, 100, dtype=dtype)
+
+        assert not lib.is_range_indexer(left, 2**31)
+
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_is_range_indexer_not_equal(self, dtype):
+        # GH#50592
+        left = np.array([1, 2], dtype=dtype)
+        assert not lib.is_range_indexer(left, 2)
+
+    @pytest.mark.parametrize("dtype", ["int64", "int32"])
+    def test_is_range_indexer_not_equal_shape(self, dtype):
+        # GH#50592
+        left = np.array([0, 1, 2], dtype=dtype)
+        assert not lib.is_range_indexer(left, 2)
+
+
+def test_cache_readonly_preserve_docstrings():
+    # GH18197
+    assert Index.hasnans.__doc__ is not None
+
+
+def test_no_default_pickle(temp_file):
+    # GH#40397
+    obj = tm.round_trip_pickle(lib.no_default, temp_file)
+    assert obj is lib.no_default
+
+
+def test_ensure_string_array_copy():
+    # ensure the original array is not modified in case of copy=False with
+    # pickle-roundtripped object dtype array
+    # https://github.com/pandas-dev/pandas/issues/54654
+    arr = np.array(["a", None], dtype=object)
+    arr = pickle.loads(pickle.dumps(arr))
+    result = lib.ensure_string_array(arr, copy=False)
+    assert not np.shares_memory(arr, result)
+    assert arr[1] is None
+    assert result[1] is np.nan
+
+
+def test_ensure_string_array_list_of_lists():
+    # GH#61155: ensure list of lists doesn't get converted to string
+    arr = [list("test"), list("word")]
+    result = lib.ensure_string_array(arr)
+
+    # Each item in result should still be a list, not a stringified version
+    expected = np.array(["['t', 'e', 's', 't']", "['w', 'o', 'r', 'd']"], dtype=object)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_item_from_zerodim_for_subclasses():
+    # GH#62981 Ensure item_from_zerodim preserves subclasses of ndarray
+    # Define a custom ndarray subclass
+    class TestArray(np.ndarray):
+        def __new__(cls, input_array):
+            return np.asarray(input_array).view(cls)
+
+        def __array_finalize__(self, obj) -> None:
+            self._is_test_array = True
+
+    # Define test data
+    val_0_dim = 1
+    val_1_dim = [1, 2, 3]
+
+    # 0-dim and 1-dim numpy arrays
+    arr_0_dim = np.array(val_0_dim)
+    arr_1_dim = np.array(val_1_dim)
+
+    # 0-dim and 1-dim TestArray arrays
+    test_arr_0_dim = TestArray(val_0_dim)
+    test_arr_1_dim = TestArray(val_1_dim)
+
+    # Check that behavior did not change for regular numpy arrays
+    # Test with regular numpy 0-dim array
+    result = lib.item_from_zerodim(arr_0_dim)
+    expected = val_0_dim
+    assert result == expected
+    assert np.isscalar(result)
+
+    # Test with regular numpy 1-dim array
+    result = lib.item_from_zerodim(arr_1_dim)
+    expected = arr_1_dim
+    tm.assert_numpy_array_equal(result, expected)
+    assert isinstance(result, np.ndarray)
+
+    # Check that behaviour for subclasses now is as expected
+    # Test with TestArray 0-dim array
+    result = lib.item_from_zerodim(test_arr_0_dim)
+    expected = test_arr_0_dim
+    assert result == expected
+    assert isinstance(result, TestArray)
+
+    # Test with TestArray 1-dim array
+    result = lib.item_from_zerodim(test_arr_1_dim)
+    expected = test_arr_1_dim
+    assert np.all(result == expected)
+    assert isinstance(result, TestArray)
diff --git a/pandas/tests/libs/test_libalgos.py b/pandas/tests/libs/test_libalgos.py
new file mode 100644
index 0000000000000000000000000000000000000000..42d09c72aab2baa9636093d172d864cbe0e41b12
--- /dev/null
+++ b/pandas/tests/libs/test_libalgos.py
@@ -0,0 +1,162 @@
+from datetime import datetime
+from itertools import permutations
+
+import numpy as np
+
+from pandas._libs import algos as libalgos
+
+import pandas._testing as tm
+
+
+def test_ensure_platform_int():
+    arr = np.arange(100, dtype=np.intp)
+
+    result = libalgos.ensure_platform_int(arr)
+    assert result is arr
+
+
+def test_is_lexsorted():
+    failure = [
+        np.array(
+            ([3] * 32) + ([2] * 32) + ([1] * 32) + ([0] * 32),
+            dtype="int64",
+        ),
+        np.array(
+            list(range(31))[::-1] * 4,
+            dtype="int64",
+        ),
+    ]
+
+    assert not libalgos.is_lexsorted(failure)
+
+
+def test_groupsort_indexer():
+    a = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp)
+    b = np.random.default_rng(2).integers(0, 1000, 100).astype(np.intp)
+
+    result = libalgos.groupsort_indexer(a, 1000)[0]
+
+    # need to use a stable sort
+    # np.argsort returns int, groupsort_indexer
+    # always returns intp
+    expected = np.argsort(a, kind="mergesort")
+    expected = expected.astype(np.intp)
+
+    tm.assert_numpy_array_equal(result, expected)
+
+    # compare with lexsort
+    # np.lexsort returns int, groupsort_indexer
+    # always returns intp
+    key = a * 1000 + b
+    result = libalgos.groupsort_indexer(key, 1000000)[0]
+    expected = np.lexsort((b, a))
+    expected = expected.astype(np.intp)
+
+    tm.assert_numpy_array_equal(result, expected)
+
+
+class TestPadBackfill:
+    def test_backfill(self):
+        old = np.array([1, 5, 10], dtype=np.int64)
+        new = np.array(list(range(12)), dtype=np.int64)
+
+        filler = libalgos.backfill["int64_t"](old, new)
+
+        expect_filler = np.array([0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(filler, expect_filler)
+
+        # corner case
+        old = np.array([1, 4], dtype=np.int64)
+        new = np.array(list(range(5, 10)), dtype=np.int64)
+        filler = libalgos.backfill["int64_t"](old, new)
+
+        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(filler, expect_filler)
+
+    def test_pad(self):
+        old = np.array([1, 5, 10], dtype=np.int64)
+        new = np.array(list(range(12)), dtype=np.int64)
+
+        filler = libalgos.pad["int64_t"](old, new)
+
+        expect_filler = np.array([-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(filler, expect_filler)
+
+        # corner case
+        old = np.array([5, 10], dtype=np.int64)
+        new = np.arange(5, dtype=np.int64)
+        filler = libalgos.pad["int64_t"](old, new)
+        expect_filler = np.array([-1, -1, -1, -1, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(filler, expect_filler)
+
+    def test_pad_backfill_object_segfault(self):
+        old = np.array([], dtype="O")
+        new = np.array([datetime(2010, 12, 31)], dtype="O")
+
+        result = libalgos.pad["object"](old, new)
+        expected = np.array([-1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = libalgos.pad["object"](new, old)
+        expected = np.array([], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = libalgos.backfill["object"](old, new)
+        expected = np.array([-1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = libalgos.backfill["object"](new, old)
+        expected = np.array([], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+
+
+class TestInfinity:
+    def test_infinity_sort(self):
+        # GH#13445
+        # numpy's argsort can be unhappy if something is less than
+        # itself.  Instead, let's give our infinities a self-consistent
+        # ordering, but outside the float extended real line.
+
+        Inf = libalgos.Infinity()
+        NegInf = libalgos.NegInfinity()
+
+        ref_nums = [NegInf, float("-inf"), -1e100, 0, 1e100, float("inf"), Inf]
+
+        assert all(Inf >= x for x in ref_nums)
+        assert all(Inf > x or x is Inf for x in ref_nums)
+        assert Inf >= Inf and Inf == Inf
+        assert not Inf < Inf and not Inf > Inf
+        assert libalgos.Infinity() == libalgos.Infinity()
+        assert not libalgos.Infinity() != libalgos.Infinity()
+
+        assert all(NegInf <= x for x in ref_nums)
+        assert all(NegInf < x or x is NegInf for x in ref_nums)
+        assert NegInf <= NegInf and NegInf == NegInf
+        assert not NegInf < NegInf and not NegInf > NegInf
+        assert libalgos.NegInfinity() == libalgos.NegInfinity()
+        assert not libalgos.NegInfinity() != libalgos.NegInfinity()
+
+        for perm in permutations(ref_nums):
+            assert sorted(perm) == ref_nums
+
+        # smoke tests
+        np.array([libalgos.Infinity()] * 32).argsort()
+        np.array([libalgos.NegInfinity()] * 32).argsort()
+
+    def test_infinity_against_nan(self):
+        Inf = libalgos.Infinity()
+        NegInf = libalgos.NegInfinity()
+
+        assert not Inf > np.nan
+        assert not Inf >= np.nan
+        assert not Inf < np.nan
+        assert not Inf <= np.nan
+        assert not Inf == np.nan
+        assert Inf != np.nan
+
+        assert not NegInf > np.nan
+        assert not NegInf >= np.nan
+        assert not NegInf < np.nan
+        assert not NegInf <= np.nan
+        assert not NegInf == np.nan
+        assert NegInf != np.nan
diff --git a/pandas/tests/plotting/__init__.py b/pandas/tests/plotting/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..588bbf88e856243ce539a5378b22468228bcf280
--- /dev/null
+++ b/pandas/tests/plotting/common.py
@@ -0,0 +1,579 @@
+"""
+Module consolidating common testing functions for checking plotting.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from pandas.core.dtypes.api import is_list_like
+
+import pandas as pd
+from pandas import Series
+import pandas._testing as tm
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from matplotlib.axes import Axes
+
+
+def _check_legend_labels(axes, labels=None, visible=True):
+    """
+    Check each axes has expected legend labels
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+    labels : list-like
+        expected legend labels
+    visible : bool
+        expected legend visibility. labels are checked only when visible is
+        True
+    """
+    if visible and (labels is None):
+        raise ValueError("labels must be specified when visible is True")
+    axes = _flatten_visible(axes)
+    for ax in axes:
+        if visible:
+            assert ax.get_legend() is not None
+            _check_text_labels(ax.get_legend().get_texts(), labels)
+        else:
+            assert ax.get_legend() is None
+
+
+def _check_legend_marker(ax, expected_markers=None, visible=True):
+    """
+    Check ax has expected legend markers
+
+    Parameters
+    ----------
+    ax : matplotlib Axes object
+    expected_markers : list-like
+        expected legend markers
+    visible : bool
+        expected legend visibility. labels are checked only when visible is
+        True
+    """
+    if visible and (expected_markers is None):
+        raise ValueError("Markers must be specified when visible is True")
+    if visible:
+        handles, _ = ax.get_legend_handles_labels()
+        markers = [handle.get_marker() for handle in handles]
+        assert markers == expected_markers
+    else:
+        assert ax.get_legend() is None
+
+
+def _check_data(xp, rs):
+    """
+    Check each axes has identical lines
+
+    Parameters
+    ----------
+    xp : matplotlib Axes object
+    rs : matplotlib Axes object
+    """
+    xp_lines = xp.get_lines()
+    rs_lines = rs.get_lines()
+
+    assert len(xp_lines) == len(rs_lines)
+    for xpl, rsl in zip(xp_lines, rs_lines, strict=True):
+        xpdata = xpl.get_xydata()
+        rsdata = rsl.get_xydata()
+        tm.assert_almost_equal(xpdata, rsdata)
+
+
+def _check_visible(collections, visible=True):
+    """
+    Check each artist is visible or not
+
+    Parameters
+    ----------
+    collections : matplotlib Artist or its list-like
+        target Artist or its list or collection
+    visible : bool
+        expected visibility
+    """
+    from matplotlib.collections import Collection
+
+    if not isinstance(collections, Collection) and not is_list_like(collections):
+        collections = [collections]
+
+    for patch in collections:
+        assert patch.get_visible() == visible
+
+
+def _check_patches_all_filled(axes: Axes | Sequence[Axes], filled: bool = True) -> None:
+    """
+    Check for each artist whether it is filled or not
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+    filled : bool
+        expected filling
+    """
+
+    axes = _flatten_visible(axes)
+    for ax in axes:
+        for patch in ax.patches:
+            assert patch.fill == filled
+
+
+def _get_colors_mapped(series, colors):
+    unique = series.unique()
+    # unique and colors length can be differed
+    # depending on slice value
+    mapped = dict(zip(unique, colors))
+    return [mapped[v] for v in series.values]
+
+
+def _check_colors(collections, linecolors=None, facecolors=None, mapping=None):
+    """
+    Check each artist has expected line colors and face colors
+
+    Parameters
+    ----------
+    collections : list-like
+        list or collection of target artist
+    linecolors : list-like which has the same length as collections
+        list of expected line colors
+    facecolors : list-like which has the same length as collections
+        list of expected face colors
+    mapping : Series
+        Series used for color grouping key
+        used for andrew_curves, parallel_coordinates, radviz test
+    """
+    from matplotlib import colors
+    from matplotlib.collections import (
+        Collection,
+        LineCollection,
+        PolyCollection,
+    )
+    from matplotlib.lines import Line2D
+
+    conv = colors.ColorConverter
+    if linecolors is not None:
+        if mapping is not None:
+            linecolors = _get_colors_mapped(mapping, linecolors)
+            linecolors = linecolors[: len(collections)]
+
+        assert len(collections) == len(linecolors)
+        for patch, color in zip(collections, linecolors, strict=True):
+            if isinstance(patch, Line2D):
+                result = patch.get_color()
+                # Line2D may contains string color expression
+                result = conv.to_rgba(result)
+            elif isinstance(patch, (PolyCollection, LineCollection)):
+                result = tuple(patch.get_edgecolor()[0])
+            else:
+                result = patch.get_edgecolor()
+
+            expected = conv.to_rgba(color)
+            assert result == expected
+
+    if facecolors is not None:
+        if mapping is not None:
+            facecolors = _get_colors_mapped(mapping, facecolors)
+            facecolors = facecolors[: len(collections)]
+
+        assert len(collections) == len(facecolors)
+        for patch, color in zip(collections, facecolors, strict=True):
+            if isinstance(patch, Collection):
+                # returned as list of np.array
+                result = patch.get_facecolor()[0]
+            else:
+                result = patch.get_facecolor()
+
+            if isinstance(result, np.ndarray):
+                result = tuple(result)
+
+            expected = conv.to_rgba(color)
+            assert result == expected
+
+
+def _check_text_labels(texts, expected):
+    """
+    Check each text has expected labels
+
+    Parameters
+    ----------
+    texts : matplotlib Text object, or its list-like
+        target text, or its list
+    expected : str or list-like which has the same length as texts
+        expected text label, or its list
+    """
+    if not is_list_like(texts):
+        assert texts.get_text() == expected
+    else:
+        labels = [t.get_text() for t in texts]
+        assert len(labels) == len(expected)
+        for label, e in zip(labels, expected, strict=True):
+            assert label == e
+
+
+def _check_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None):
+    """
+    Check each axes has expected tick properties
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+    xlabelsize : number
+        expected xticks font size
+    xrot : number
+        expected xticks rotation
+    ylabelsize : number
+        expected yticks font size
+    yrot : number
+        expected yticks rotation
+    """
+    from matplotlib.ticker import NullFormatter
+
+    axes = _flatten_visible(axes)
+    for ax in axes:
+        if xlabelsize is not None or xrot is not None:
+            if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter):
+                # If minor ticks has NullFormatter, rot / fontsize are not
+                # retained
+                labels = ax.get_xticklabels()
+            else:
+                labels = ax.get_xticklabels() + ax.get_xticklabels(minor=True)
+
+            for label in labels:
+                if xlabelsize is not None:
+                    tm.assert_almost_equal(label.get_fontsize(), xlabelsize)
+                if xrot is not None:
+                    tm.assert_almost_equal(label.get_rotation(), xrot)
+
+        if ylabelsize is not None or yrot is not None:
+            if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter):
+                labels = ax.get_yticklabels()
+            else:
+                labels = ax.get_yticklabels() + ax.get_yticklabels(minor=True)
+
+            for label in labels:
+                if ylabelsize is not None:
+                    tm.assert_almost_equal(label.get_fontsize(), ylabelsize)
+                if yrot is not None:
+                    tm.assert_almost_equal(label.get_rotation(), yrot)
+
+
+def _check_ax_scales(axes, xaxis="linear", yaxis="linear"):
+    """
+    Check each axes has expected scales
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+    xaxis : {'linear', 'log'}
+        expected xaxis scale
+    yaxis : {'linear', 'log'}
+        expected yaxis scale
+    """
+    axes = _flatten_visible(axes)
+    for ax in axes:
+        assert ax.xaxis.get_scale() == xaxis
+        assert ax.yaxis.get_scale() == yaxis
+
+
+def _check_axes_shape(axes, axes_num=None, layout=None, figsize=None):
+    """
+    Check expected number of axes is drawn in expected layout
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+    axes_num : number
+        expected number of axes. Unnecessary axes should be set to
+        invisible.
+    layout : tuple
+        expected layout, (expected number of rows , columns)
+    figsize : tuple
+        expected figsize. default is matplotlib default
+    """
+    from pandas.plotting._matplotlib.tools import flatten_axes
+
+    if figsize is None:
+        figsize = (6.4, 4.8)
+    visible_axes = _flatten_visible(axes)
+
+    if axes_num is not None:
+        assert len(visible_axes) == axes_num
+        for ax in visible_axes:
+            # check something drawn on visible axes
+            assert len(ax.get_children()) > 0
+
+    if layout is not None:
+        x_set = set()
+        y_set = set()
+        for ax in flatten_axes(axes):
+            # check axes coordinates to estimate layout
+            points = ax.get_position().get_points()
+            x_set.add(points[0][0])
+            y_set.add(points[0][1])
+        result = (len(y_set), len(x_set))
+        assert result == layout
+
+    tm.assert_numpy_array_equal(
+        visible_axes[0].figure.get_size_inches(),
+        np.array(figsize, dtype=np.float64),
+    )
+
+
+def _flatten_visible(axes: Axes | Sequence[Axes]) -> Sequence[Axes]:
+    """
+    Flatten axes, and filter only visible
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+
+    """
+    from pandas.plotting._matplotlib.tools import flatten_axes
+
+    axes_ndarray = flatten_axes(axes)
+    axes = [ax for ax in axes_ndarray if ax.get_visible()]
+    return axes
+
+
+def _check_has_errorbars(axes, xerr=0, yerr=0):
+    """
+    Check axes has expected number of errorbars
+
+    Parameters
+    ----------
+    axes : matplotlib Axes object, or its list-like
+    xerr : number
+        expected number of x errorbar
+    yerr : number
+        expected number of y errorbar
+    """
+    axes = _flatten_visible(axes)
+    for ax in axes:
+        containers = ax.containers
+        xerr_count = 0
+        yerr_count = 0
+        for c in containers:
+            has_xerr = getattr(c, "has_xerr", False)
+            has_yerr = getattr(c, "has_yerr", False)
+            if has_xerr:
+                xerr_count += 1
+            if has_yerr:
+                yerr_count += 1
+        assert xerr == xerr_count
+        assert yerr == yerr_count
+
+
+def _check_box_return_type(
+    returned, return_type, expected_keys=None, check_ax_title=True
+):
+    """
+    Check box returned type is correct
+
+    Parameters
+    ----------
+    returned : object to be tested, returned from boxplot
+    return_type : str
+        return_type passed to boxplot
+    expected_keys : list-like, optional
+        group labels in subplot case. If not passed,
+        the function checks assuming boxplot uses single ax
+    check_ax_title : bool
+        Whether to check the ax.title is the same as expected_key
+        Intended to be checked by calling from ``boxplot``.
+        Normal ``plot`` doesn't attach ``ax.title``, it must be disabled.
+    """
+    from matplotlib.axes import Axes
+
+    types = {"dict": dict, "axes": Axes, "both": tuple}
+    if expected_keys is None:
+        # should be fixed when the returning default is changed
+        if return_type is None:
+            return_type = "dict"
+
+        assert isinstance(returned, types[return_type])
+        if return_type == "both":
+            assert isinstance(returned.ax, Axes)
+            assert isinstance(returned.lines, dict)
+    else:
+        # should be fixed when the returning default is changed
+        if return_type is None:
+            for r in _flatten_visible(returned):
+                assert isinstance(r, Axes)
+            return
+
+        assert isinstance(returned, Series)
+
+        assert sorted(returned.keys()) == sorted(expected_keys)
+        for key, value in returned.items():
+            assert isinstance(value, types[return_type])
+            # check returned dict has correct mapping
+            if return_type == "axes":
+                if check_ax_title:
+                    assert value.get_title() == key
+            elif return_type == "both":
+                if check_ax_title:
+                    assert value.ax.get_title() == key
+                assert isinstance(value.ax, Axes)
+                assert isinstance(value.lines, dict)
+            elif return_type == "dict":
+                line = value["medians"][0]
+                axes = line.axes
+                if check_ax_title:
+                    assert axes.get_title() == key
+            else:
+                raise AssertionError
+
+
+def _check_grid_settings(obj, kinds, kws=None):
+    # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+
+    import matplotlib as mpl
+
+    def is_grid_on():
+        xticks = mpl.pyplot.gca().xaxis.get_major_ticks()
+        yticks = mpl.pyplot.gca().yaxis.get_major_ticks()
+        xoff = all(not g.gridline.get_visible() for g in xticks)
+        yoff = all(not g.gridline.get_visible() for g in yticks)
+
+        return not (xoff and yoff)
+
+    if kws is None:
+        kws = {}
+    spndx = 1
+    for kind in kinds:
+        mpl.pyplot.subplot(1, 4 * len(kinds), spndx)
+        spndx += 1
+        mpl.rc("axes", grid=False)
+        obj.plot(kind=kind, **kws)
+        assert not is_grid_on()
+        mpl.pyplot.clf()
+
+        mpl.pyplot.subplot(1, 4 * len(kinds), spndx)
+        spndx += 1
+        mpl.rc("axes", grid=True)
+        obj.plot(kind=kind, grid=False, **kws)
+        assert not is_grid_on()
+        mpl.pyplot.clf()
+
+        if kind not in ["pie", "hexbin", "scatter"]:
+            mpl.pyplot.subplot(1, 4 * len(kinds), spndx)
+            spndx += 1
+            mpl.rc("axes", grid=True)
+            obj.plot(kind=kind, **kws)
+            assert is_grid_on()
+            mpl.pyplot.clf()
+
+            mpl.pyplot.subplot(1, 4 * len(kinds), spndx)
+            spndx += 1
+            mpl.rc("axes", grid=False)
+            obj.plot(kind=kind, grid=True, **kws)
+            assert is_grid_on()
+            mpl.pyplot.clf()
+
+
+def _unpack_cycler(rcParams, field="color"):
+    """
+    Auxiliary function for correctly unpacking cycler after MPL >= 1.5
+    """
+    return [v[field] for v in rcParams["axes.prop_cycle"]]
+
+
+def get_x_axis(ax):
+    return ax._shared_axes["x"]
+
+
+def get_y_axis(ax):
+    return ax._shared_axes["y"]
+
+
+def assert_is_valid_plot_return_object(objs) -> None:
+    from matplotlib.artist import Artist
+    from matplotlib.axes import Axes
+
+    if isinstance(objs, (Series, np.ndarray)):
+        if isinstance(objs, Series):
+            objs = objs._values
+        for el in objs.reshape(-1):
+            msg = (
+                "one of 'objs' is not a matplotlib Axes instance, "
+                f"type encountered {type(el).__name__!r}"
+            )
+            assert isinstance(el, (Axes, dict)), msg
+    else:
+        msg = (
+            "objs is neither an ndarray of Artist instances nor a single "
+            "ArtistArtist instance, tuple, or dict, 'objs' is a "
+            f"{type(objs).__name__!r}"
+        )
+        assert isinstance(objs, (Artist, tuple, dict)), msg
+
+
+def _check_plot_works(f, default_axes=False, **kwargs):
+    """
+    Create plot and ensure that plot return object is valid.
+
+    Parameters
+    ----------
+    f : func
+        Plotting function.
+    default_axes : bool, optional
+        If False (default):
+            - If `ax` not in `kwargs`, then create subplot(211) and plot there
+            - Create new subplot(212) and plot there as well
+            - Mind special corner case for bootstrap_plot (see `_gen_two_subplots`)
+        If True:
+            - Simply run plotting function with kwargs provided
+            - All required axes instances will be created automatically
+            - It is recommended to use it when the plotting function
+            creates multiple axes itself. It helps avoid warnings like
+            'UserWarning: To output multiple subplots,
+            the figure containing the passed axes is being cleared'
+    **kwargs
+        Keyword arguments passed to the plotting function.
+
+    Returns
+    -------
+    Plot object returned by the last plotting.
+    """
+    import matplotlib.pyplot as plt
+
+    if default_axes:
+        gen_plots = _gen_default_plot
+    else:
+        gen_plots = _gen_two_subplots
+
+    ret = None
+    fig = kwargs.get("figure", plt.gcf())
+    fig.clf()
+
+    for ret in gen_plots(f, fig, **kwargs):
+        assert_is_valid_plot_return_object(ret)
+
+    return ret
+
+
+def _gen_default_plot(f, fig, **kwargs):
+    """
+    Create plot in a default way.
+    """
+    yield f(**kwargs)
+
+
+def _gen_two_subplots(f, fig, **kwargs):
+    """
+    Create plot on two subplots forcefully created.
+    """
+    if "ax" not in kwargs:
+        fig.add_subplot(211)
+    yield f(**kwargs)
+
+    if f is pd.plotting.bootstrap_plot:
+        assert "ax" not in kwargs
+    else:
+        kwargs["ax"] = fig.add_subplot(212)
+    yield f(**kwargs)
diff --git a/pandas/tests/plotting/conftest.py b/pandas/tests/plotting/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb5a1f1f6382e3fe48fd0fa4050a673160807a9a
--- /dev/null
+++ b/pandas/tests/plotting/conftest.py
@@ -0,0 +1,39 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    to_datetime,
+)
+
+
+@pytest.fixture(autouse=True)
+def autouse_mpl_cleanup(mpl_cleanup):
+    pass
+
+
+@pytest.fixture
+def hist_df():
+    n = 50
+    rng = np.random.default_rng(10)
+    gender = rng.choice(["Male", "Female"], size=n)
+    classroom = rng.choice(["A", "B", "C"], size=n)
+
+    hist_df = DataFrame(
+        {
+            "gender": gender,
+            "classroom": classroom,
+            "height": rng.normal(66, 4, size=n),
+            "weight": rng.normal(161, 32, size=n),
+            "category": rng.integers(4, size=n),
+            "datetime": to_datetime(
+                rng.integers(
+                    812419200000000000,
+                    819331200000000000,
+                    size=n,
+                    dtype=np.int64,
+                )
+            ),
+        }
+    )
+    return hist_df
diff --git a/pandas/tests/plotting/test_backend.py b/pandas/tests/plotting/test_backend.py
new file mode 100644
index 0000000000000000000000000000000000000000..683bfcfe54f3ed45567c77b636119973ac910f38
--- /dev/null
+++ b/pandas/tests/plotting/test_backend.py
@@ -0,0 +1,100 @@
+import sys
+import types
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas
+
+pytestmark = pytest.mark.single_cpu
+
+
+@pytest.fixture
+def dummy_backend():
+    db = types.ModuleType("pandas_dummy_backend")
+    setattr(db, "plot", lambda *args, **kwargs: "used_dummy")
+    return db
+
+
+@pytest.fixture
+def restore_backend():
+    """Restore the plotting backend to matplotlib"""
+    with pandas.option_context("plotting.backend", "matplotlib"):
+        yield
+
+
+def test_backend_is_not_module():
+    msg = "Could not find plotting backend 'not_an_existing_module'."
+    with pytest.raises(ValueError, match=msg):
+        pandas.set_option("plotting.backend", "not_an_existing_module")
+
+    assert pandas.options.plotting.backend == "matplotlib"
+
+
+def test_backend_is_correct(monkeypatch, restore_backend, dummy_backend):
+    monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend)
+
+    pandas.set_option("plotting.backend", "pandas_dummy_backend")
+    assert pandas.get_option("plotting.backend") == "pandas_dummy_backend"
+    assert (
+        pandas.plotting._core._get_plot_backend("pandas_dummy_backend") is dummy_backend
+    )
+
+
+def test_backend_can_be_set_in_plot_call(monkeypatch, restore_backend, dummy_backend):
+    monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend)
+    df = pandas.DataFrame([1, 2, 3])
+
+    assert pandas.get_option("plotting.backend") == "matplotlib"
+    assert df.plot(backend="pandas_dummy_backend") == "used_dummy"
+
+
+def test_register_entrypoint(restore_backend, tmp_path, monkeypatch, dummy_backend):
+    monkeypatch.syspath_prepend(tmp_path)
+    monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend)
+
+    dist_info = tmp_path / "my_backend-0.0.0.dist-info"
+    dist_info.mkdir()
+    # entry_point name should not match module name - otherwise pandas will
+    # fall back to backend lookup by module name
+    (dist_info / "entry_points.txt").write_bytes(
+        b"[pandas_plotting_backends]\nmy_ep_backend = pandas_dummy_backend\n"
+    )
+
+    assert pandas.plotting._core._get_plot_backend("my_ep_backend") is dummy_backend
+
+    with pandas.option_context("plotting.backend", "my_ep_backend"):
+        assert pandas.plotting._core._get_plot_backend() is dummy_backend
+
+
+def test_setting_backend_without_plot_raises(monkeypatch):
+    # GH-28163
+    module = types.ModuleType("pandas_plot_backend")
+    monkeypatch.setitem(sys.modules, "pandas_plot_backend", module)
+
+    assert pandas.options.plotting.backend == "matplotlib"
+    with pytest.raises(
+        ValueError, match="Could not find plotting backend 'pandas_plot_backend'."
+    ):
+        pandas.set_option("plotting.backend", "pandas_plot_backend")
+
+    assert pandas.options.plotting.backend == "matplotlib"
+
+
+@td.skip_if_installed("matplotlib")
+def test_no_matplotlib_ok():
+    msg = (
+        'matplotlib is required for plotting when the default backend "matplotlib" is '
+        "selected."
+    )
+    with pytest.raises(ImportError, match=msg):
+        pandas.plotting._core._get_plot_backend("matplotlib")
+
+
+def test_extra_kinds_ok(monkeypatch, restore_backend, dummy_backend):
+    # https://github.com/pandas-dev/pandas/pull/28647
+    monkeypatch.setitem(sys.modules, "pandas_dummy_backend", dummy_backend)
+    pandas.set_option("plotting.backend", "pandas_dummy_backend")
+    df = pandas.DataFrame({"A": [1, 2, 3]})
+    df.plot(kind="not a real kind")
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..3554f1549e4889660340eb0ea2c41149e55df9a0
--- /dev/null
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -0,0 +1,774 @@
+"""Test cases for .boxplot method"""
+
+from __future__ import annotations
+
+import itertools
+import string
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    date_range,
+    plotting,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.tests.plotting.common import (
+    _check_axes_shape,
+    _check_box_return_type,
+    _check_plot_works,
+    _check_ticks_props,
+    _check_visible,
+)
+from pandas.util.version import Version
+
+from pandas.io.formats.printing import pprint_thing
+
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")
+
+
+def _check_ax_limits(col, ax):
+    y_min, y_max = ax.get_ylim()
+    assert y_min <= col.min()
+    assert y_max >= col.max()
+
+
+if Version(mpl.__version__) < Version("3.10"):
+    verts: list[dict[str, bool | str]] = [{"vert": False}, {"vert": True}]
+else:
+    verts = [{"orientation": "horizontal"}, {"orientation": "vertical"}]
+
+
+@pytest.fixture(params=verts)
+def vert(request):
+    return request.param
+
+
+class TestDataFramePlots:
+    def test_stacked_boxplot_set_axis(self):
+        # GH2980
+        n = 30
+        df = DataFrame(
+            {
+                "Clinical": np.random.default_rng(2).choice([0, 1, 2, 3], n),
+                "Confirmed": np.random.default_rng(2).choice([0, 1, 2, 3], n),
+                "Discarded": np.random.default_rng(2).choice([0, 1, 2, 3], n),
+            },
+            index=np.arange(0, n),
+        )
+        ax = df.plot(kind="bar", stacked=True)
+        assert [int(x.get_text()) for x in ax.get_xticklabels()] == df.index.to_list()
+        ax.set_xticks(np.arange(0, n, 10))
+        plt.draw()  # Update changes
+        assert [int(x.get_text()) for x in ax.get_xticklabels()] == list(
+            np.arange(0, n, 10)
+        )
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "kwargs, warn",
+        [
+            [{"return_type": "dict"}, None],
+            [{"column": ["one", "two"]}, None],
+            [{"column": ["one", "two"], "by": "indic"}, UserWarning],
+            [{"column": ["one"], "by": ["indic", "indic2"]}, None],
+            [{"by": "indic"}, UserWarning],
+            [{"by": ["indic", "indic2"]}, UserWarning],
+            [{"notch": 1}, None],
+            [{"by": "indic", "notch": 1}, UserWarning],
+        ],
+    )
+    def test_boxplot_legacy1(self, kwargs, warn):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((6, 4)),
+            index=list(string.ascii_letters[:6]),
+            columns=["one", "two", "three", "four"],
+        )
+        df["indic"] = ["foo", "bar"] * 3
+        df["indic2"] = ["foo", "bar", "foo"] * 2
+
+        # _check_plot_works can add an ax so catch warning. see GH #13188
+        with tm.assert_produces_warning(warn, check_stacklevel=False):
+            _check_plot_works(df.boxplot, **kwargs)
+
+    def test_boxplot_legacy1_series(self):
+        ser = Series(np.random.default_rng(2).standard_normal(6))
+        _check_plot_works(plotting._core.boxplot, data=ser, return_type="dict")
+
+    def test_boxplot_legacy2(self):
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 2)), columns=["Col1", "Col2"]
+        )
+        df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        df["Y"] = Series(["A"] * 10)
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            _check_plot_works(df.boxplot, by="X")
+
+    def test_boxplot_legacy2_with_ax(self):
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 2)), columns=["Col1", "Col2"]
+        )
+        df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        df["Y"] = Series(["A"] * 10)
+        # When ax is supplied and required number of axes is 1,
+        # passed ax should be used:
+        _, ax = mpl.pyplot.subplots()
+        axes = df.boxplot("Col1", by="X", ax=ax)
+        ax_axes = ax.axes
+        assert ax_axes is axes
+
+    def test_boxplot_legacy2_with_ax_return_type(self):
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 2)), columns=["Col1", "Col2"]
+        )
+        df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        df["Y"] = Series(["A"] * 10)
+        fig, ax = mpl.pyplot.subplots()
+        axes = df.groupby("Y").boxplot(ax=ax, return_type="axes")
+        ax_axes = ax.axes
+        assert ax_axes is axes["A"]
+
+    def test_boxplot_legacy2_with_multi_col(self):
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 2)), columns=["Col1", "Col2"]
+        )
+        df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        df["Y"] = Series(["A"] * 10)
+        # Multiple columns with an ax argument should use same figure
+        fig, ax = mpl.pyplot.subplots()
+        msg = "the figure containing the passed axes is being cleared"
+        with tm.assert_produces_warning(UserWarning, match=msg):
+            axes = df.boxplot(
+                column=["Col1", "Col2"], by="X", ax=ax, return_type="axes"
+            )
+        assert axes["Col1"].get_figure() is fig
+
+    def test_boxplot_legacy2_by_none(self):
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 2)), columns=["Col1", "Col2"]
+        )
+        df["X"] = Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"])
+        df["Y"] = Series(["A"] * 10)
+        # When by is None, check that all relevant lines are present in the
+        # dict
+        _, ax = mpl.pyplot.subplots()
+        d = df.boxplot(ax=ax, return_type="dict")
+        lines = list(itertools.chain.from_iterable(d.values()))
+        assert len(ax.get_lines()) == len(lines)
+
+    def test_boxplot_return_type_none(self, hist_df):
+        # GH 12216; return_type=None & by=None -> axes
+        result = hist_df.boxplot()
+        assert isinstance(result, mpl.pyplot.Axes)
+
+    def test_boxplot_return_type_legacy(self):
+        # API change in https://github.com/pandas-dev/pandas/pull/7096
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((6, 4)),
+            index=list(string.ascii_letters[:6]),
+            columns=["one", "two", "three", "four"],
+        )
+        msg = "return_type must be {'axes', 'dict', 'both'}"
+        with pytest.raises(ValueError, match=msg):
+            df.boxplot(return_type="NOT_A_TYPE")
+
+        result = df.boxplot()
+        _check_box_return_type(result, "axes")
+
+    @pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
+    def test_boxplot_return_type_legacy_return_type(self, return_type):
+        # API change in https://github.com/pandas-dev/pandas/pull/7096
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((6, 4)),
+            index=list(string.ascii_letters[:6]),
+            columns=["one", "two", "three", "four"],
+        )
+        with tm.assert_produces_warning(False):
+            result = df.boxplot(return_type=return_type)
+        _check_box_return_type(result, return_type)
+
+    def test_boxplot_axis_limits(self, hist_df):
+        df = hist_df.copy()
+        df["age"] = np.random.default_rng(2).integers(1, 20, df.shape[0])
+        # One full row
+        height_ax, weight_ax = df.boxplot(["height", "weight"], by="category")
+        _check_ax_limits(df["height"], height_ax)
+        _check_ax_limits(df["weight"], weight_ax)
+        assert weight_ax._sharey == height_ax
+
+    def test_boxplot_axis_limits_two_rows(self, hist_df):
+        df = hist_df.copy()
+        df["age"] = np.random.default_rng(2).integers(1, 20, df.shape[0])
+        # Two rows, one partial
+        p = df.boxplot(["height", "weight", "age"], by="category")
+        height_ax, weight_ax, age_ax = p[0, 0], p[0, 1], p[1, 0]
+        dummy_ax = p[1, 1]
+
+        _check_ax_limits(df["height"], height_ax)
+        _check_ax_limits(df["weight"], weight_ax)
+        _check_ax_limits(df["age"], age_ax)
+        assert weight_ax._sharey == height_ax
+        assert age_ax._sharey == height_ax
+        assert dummy_ax._sharey is None
+
+    def test_boxplot_empty_column(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((20, 4)))
+        df.loc[:, 0] = np.nan
+        _check_plot_works(df.boxplot, return_type="axes")
+
+    def test_figsize(self):
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 5)), columns=["A", "B", "C", "D", "E"]
+        )
+        result = df.boxplot(return_type="axes", figsize=(12, 8))
+        assert result.figure.bbox_inches.width == 12
+        assert result.figure.bbox_inches.height == 8
+
+    def test_fontsize(self):
+        df = DataFrame({"a": [1, 2, 3, 4, 5, 6]})
+        _check_ticks_props(df.boxplot("a", fontsize=16), xlabelsize=16, ylabelsize=16)
+
+    def test_boxplot_numeric_data(self):
+        # GH 22799
+        df = DataFrame(
+            {
+                "a": date_range("2012-01-01", periods=10),
+                "b": np.random.default_rng(2).standard_normal(10),
+                "c": np.random.default_rng(2).standard_normal(10) + 2,
+                "d": date_range("2012-01-01", periods=10).astype(str),
+                "e": date_range("2012-01-01", periods=10, tz="UTC"),
+                "f": timedelta_range("1 days", periods=10),
+            }
+        )
+        ax = df.plot(kind="box")
+        assert [x.get_text() for x in ax.get_xticklabels()] == ["b", "c"]
+
+    @pytest.mark.parametrize(
+        "colors_kwd, expected",
+        [
+            (
+                {"boxes": "r", "whiskers": "b", "medians": "g", "caps": "c"},
+                {"boxes": "r", "whiskers": "b", "medians": "g", "caps": "c"},
+            ),
+            ({"boxes": "r"}, {"boxes": "r"}),
+            ("r", {"boxes": "r", "whiskers": "r", "medians": "r", "caps": "r"}),
+        ],
+    )
+    def test_color_kwd(self, colors_kwd, expected):
+        # GH: 26214
+        df = DataFrame(np.random.default_rng(2).random((10, 2)))
+        result = df.boxplot(color=colors_kwd, return_type="dict")
+        for k, v in expected.items():
+            assert result[k][0].get_color() == v
+
+    @pytest.mark.parametrize(
+        "scheme,expected",
+        [
+            (
+                "dark_background",
+                {
+                    "boxes": "#8dd3c7",
+                    "whiskers": "#8dd3c7",
+                    "medians": "#bfbbd9",
+                    "caps": "#8dd3c7",
+                },
+            ),
+            (
+                "default",
+                {
+                    "boxes": "#1f77b4",
+                    "whiskers": "#1f77b4",
+                    "medians": "#2ca02c",
+                    "caps": "#1f77b4",
+                },
+            ),
+        ],
+    )
+    def test_colors_in_theme(self, scheme, expected):
+        # GH: 40769
+        df = DataFrame(np.random.default_rng(2).random((10, 2)))
+        plt.style.use(scheme)
+        result = df.plot.box(return_type="dict")
+        for k, v in expected.items():
+            assert result[k][0].get_color() == v
+
+    @pytest.mark.parametrize(
+        "dict_colors, msg",
+        [({"boxes": "r", "invalid_key": "r"}, "invalid key 'invalid_key'")],
+    )
+    def test_color_kwd_errors(self, dict_colors, msg):
+        # GH: 26214
+        df = DataFrame(np.random.default_rng(2).random((10, 2)))
+        with pytest.raises(ValueError, match=msg):
+            df.boxplot(color=dict_colors, return_type="dict")
+
+    @pytest.mark.parametrize(
+        "props, expected",
+        [
+            ("boxprops", "boxes"),
+            ("whiskerprops", "whiskers"),
+            ("capprops", "caps"),
+            ("medianprops", "medians"),
+        ],
+    )
+    def test_specified_props_kwd(self, props, expected):
+        # GH 30346
+        df = DataFrame({k: np.random.default_rng(2).random(10) for k in "ABC"})
+        kwd = {props: {"color": "C1"}}
+        result = df.boxplot(return_type="dict", **kwd)
+
+        assert result[expected][0].get_color() == "C1"
+
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    def test_plot_xlabel_ylabel(self, vert):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+                "group": np.random.default_rng(2).choice(["group1", "group2"], 10),
+            }
+        )
+        xlabel, ylabel = "x", "y"
+        ax = df.plot(kind="box", xlabel=xlabel, ylabel=ylabel, **vert)
+        assert ax.get_xlabel() == xlabel
+        assert ax.get_ylabel() == ylabel
+
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    def test_plot_box(self, vert):
+        # GH 54941
+        rng = np.random.default_rng(2)
+        df1 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD"))
+        df2 = DataFrame(rng.integers(0, 100, size=(10, 4)), columns=list("ABCD"))
+
+        xlabel, ylabel = "x", "y"
+        _, axs = plt.subplots(ncols=2, figsize=(10, 7), sharey=True)
+        df1.plot.box(ax=axs[0], xlabel=xlabel, ylabel=ylabel, **vert)
+        df2.plot.box(ax=axs[1], xlabel=xlabel, ylabel=ylabel, **vert)
+        for ax in axs:
+            assert ax.get_xlabel() == xlabel
+            assert ax.get_ylabel() == ylabel
+
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    def test_boxplot_xlabel_ylabel(self, vert):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+                "group": np.random.default_rng(2).choice(["group1", "group2"], 10),
+            }
+        )
+        xlabel, ylabel = "x", "y"
+        ax = df.boxplot(xlabel=xlabel, ylabel=ylabel, **vert)
+        assert ax.get_xlabel() == xlabel
+        assert ax.get_ylabel() == ylabel
+
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    def test_boxplot_group_xlabel_ylabel(self, vert):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+                "group": np.random.default_rng(2).choice(["group1", "group2"], 10),
+            }
+        )
+        xlabel, ylabel = "x", "y"
+        ax = df.boxplot(by="group", xlabel=xlabel, ylabel=ylabel, **vert)
+        for subplot in ax:
+            assert subplot.get_xlabel() == xlabel
+            assert subplot.get_ylabel() == ylabel
+
+    @pytest.mark.filterwarnings("ignore:set_ticklabels:UserWarning")
+    def test_boxplot_group_no_xlabel_ylabel(self, vert, request):
+        if Version(mpl.__version__) >= Version("3.10") and vert == {
+            "orientation": "horizontal"
+        }:
+            request.applymarker(
+                pytest.mark.xfail(reason=f"{vert} fails starting with matplotlib 3.10")
+            )
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+                "group": np.random.default_rng(2).choice(["group1", "group2"], 10),
+            }
+        )
+        ax = df.boxplot(by="group", **vert)
+        for subplot in ax:
+            target_label = (
+                subplot.get_xlabel()
+                if vert in ({"vert": True}, {"orientation": "vertical"})
+                else subplot.get_ylabel()
+            )
+            assert target_label == pprint_thing(["group"])
+
+
+class TestDataFrameGroupByPlots:
+    def test_boxplot_legacy1(self, hist_df):
+        grouped = hist_df.groupby(by="gender")
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(grouped.boxplot, return_type="axes")
+        _check_axes_shape(list(axes.values), axes_num=2, layout=(1, 2))
+
+    def test_boxplot_legacy1_return_type(self, hist_df):
+        grouped = hist_df.groupby(by="gender")
+        axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes")
+        _check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+    @pytest.mark.slow
+    def test_boxplot_legacy2(self):
+        tuples = zip(string.ascii_letters[:10], range(10), strict=True)
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 3)),
+            index=MultiIndex.from_tuples(tuples),
+        )
+        grouped = df.groupby(level=1)
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(grouped.boxplot, return_type="axes")
+        _check_axes_shape(list(axes.values), axes_num=10, layout=(4, 3))
+
+    @pytest.mark.slow
+    def test_boxplot_legacy2_return_type(self):
+        tuples = zip(string.ascii_letters[:10], range(10), strict=True)
+        df = DataFrame(
+            np.random.default_rng(2).random((10, 3)),
+            index=MultiIndex.from_tuples(tuples),
+        )
+        grouped = df.groupby(level=1)
+        axes = _check_plot_works(grouped.boxplot, subplots=False, return_type="axes")
+        _check_axes_shape(axes, axes_num=1, layout=(1, 1))
+
+    def test_grouped_plot_fignums(self):
+        n = 10
+        weight = Series(np.random.default_rng(2).normal(166, 20, size=n))
+        height = Series(np.random.default_rng(2).normal(60, 10, size=n))
+        gender = np.random.default_rng(2).choice(["male", "female"], size=n)
+        df = DataFrame({"height": height, "weight": weight, "gender": gender})
+        gb = df.groupby("gender")
+
+        res = gb.plot()
+        assert len(mpl.pyplot.get_fignums()) == 2
+        assert len(res) == 2
+        plt.close("all")
+
+        res = gb.boxplot(return_type="axes")
+        assert len(mpl.pyplot.get_fignums()) == 1
+        assert len(res) == 2
+
+    def test_grouped_plot_fignums_excluded_col(self):
+        n = 10
+        weight = Series(np.random.default_rng(2).normal(166, 20, size=n))
+        height = Series(np.random.default_rng(2).normal(60, 10, size=n))
+        gender = np.random.default_rng(2).choice(["male", "female"], size=n)
+        df = DataFrame({"height": height, "weight": weight, "gender": gender})
+        # now works with GH 5610 as gender is excluded
+        df.groupby("gender").hist()
+
+    @pytest.mark.slow
+    def test_grouped_box_return_type(self, hist_df):
+        df = hist_df
+
+        # old style: return_type=None
+        result = df.boxplot(by="gender")
+        assert isinstance(result, np.ndarray)
+        _check_box_return_type(
+            result, None, expected_keys=["height", "weight", "category"]
+        )
+
+    @pytest.mark.slow
+    def test_grouped_box_return_type_groupby(self, hist_df):
+        df = hist_df
+        # now for groupby
+        result = df.groupby("gender").boxplot(return_type="dict")
+        _check_box_return_type(result, "dict", expected_keys=["Male", "Female"])
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
+    def test_grouped_box_return_type_arg(self, hist_df, return_type):
+        df = hist_df
+
+        returned = df.groupby("classroom").boxplot(return_type=return_type)
+        _check_box_return_type(returned, return_type, expected_keys=["A", "B", "C"])
+
+        returned = df.boxplot(by="classroom", return_type=return_type)
+        _check_box_return_type(
+            returned, return_type, expected_keys=["height", "weight", "category"]
+        )
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
+    def test_grouped_box_return_type_arg_duplcate_cats(self, return_type):
+        columns2 = "X B C D A".split()
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((6, 5)), columns=columns2
+        )
+        categories2 = "A B".split()
+        df2["category"] = categories2 * 3
+
+        returned = df2.groupby("category").boxplot(return_type=return_type)
+        _check_box_return_type(returned, return_type, expected_keys=categories2)
+
+        returned = df2.boxplot(by="category", return_type=return_type)
+        _check_box_return_type(returned, return_type, expected_keys=columns2)
+
+    @pytest.mark.slow
+    def test_grouped_box_layout_too_small(self, hist_df):
+        df = hist_df
+
+        msg = "Layout of 1x1 must be larger than required size 2"
+        with pytest.raises(ValueError, match=msg):
+            df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1))
+
+    @pytest.mark.slow
+    def test_grouped_box_layout_needs_by(self, hist_df):
+        df = hist_df
+        msg = "The 'layout' keyword is not supported when 'by' is None"
+        with pytest.raises(ValueError, match=msg):
+            df.boxplot(
+                column=["height", "weight", "category"],
+                layout=(2, 1),
+                return_type="dict",
+            )
+
+    @pytest.mark.slow
+    def test_grouped_box_layout_positive_layout(self, hist_df):
+        df = hist_df
+        msg = "At least one dimension of layout must be positive"
+        with pytest.raises(ValueError, match=msg):
+            df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "gb_key, axes_num, rows",
+        [["gender", 2, 1], ["category", 4, 2], ["classroom", 3, 2]],
+    )
+    def test_grouped_box_layout_positive_layout_axes(
+        self, hist_df, gb_key, axes_num, rows
+    ):
+        df = hist_df
+        # _check_plot_works adds an ax so catch warning. see GH #13188 GH 6769
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            _check_plot_works(
+                df.groupby(gb_key).boxplot, column="height", return_type="dict"
+            )
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=axes_num, layout=(rows, 2))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "col, visible", [["height", False], ["weight", True], ["category", True]]
+    )
+    def test_grouped_box_layout_visible(self, hist_df, col, visible):
+        df = hist_df
+        # GH 5897
+        axes = df.boxplot(
+            column=["height", "weight", "category"], by="gender", return_type="axes"
+        )
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
+        ax = axes[col]
+        _check_visible(ax.get_xticklabels(), visible=visible)
+        _check_visible([ax.xaxis.get_label()], visible=visible)
+
+    @pytest.mark.slow
+    def test_grouped_box_layout_shape(self, hist_df):
+        df = hist_df
+        df.groupby("classroom").boxplot(
+            column=["height", "weight", "category"], return_type="dict"
+        )
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("cols", [2, -1])
+    def test_grouped_box_layout_works(self, hist_df, cols):
+        df = hist_df
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            _check_plot_works(
+                df.groupby("category").boxplot,
+                column="height",
+                layout=(3, cols),
+                return_type="dict",
+            )
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("rows, res", [[4, 4], [-1, 3]])
+    def test_grouped_box_layout_axes_shape_rows(self, hist_df, rows, res):
+        df = hist_df
+        df.boxplot(
+            column=["height", "weight", "category"], by="gender", layout=(rows, 1)
+        )
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(res, 1))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("cols, res", [[4, 4], [-1, 3]])
+    def test_grouped_box_layout_axes_shape_cols_groupby(self, hist_df, cols, res):
+        df = hist_df
+        df.groupby("classroom").boxplot(
+            column=["height", "weight", "category"],
+            layout=(1, cols),
+            return_type="dict",
+        )
+        _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, res))
+
+    @pytest.mark.slow
+    def test_grouped_box_multiple_axes(self, hist_df):
+        # GH 6970, GH 7069
+        df = hist_df
+
+        # check warning to ignore sharex / sharey
+        # this check should be done in the first function which
+        # passes multiple axes to plot, hist or boxplot
+        # location should be changed if other test is added
+        # which has earlier alphabetical order
+        with tm.assert_produces_warning(UserWarning, match="sharex and sharey"):
+            _, axes = mpl.pyplot.subplots(2, 2)
+            df.groupby("category").boxplot(column="height", return_type="axes", ax=axes)
+            _check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2))
+
+    @pytest.mark.slow
+    def test_grouped_box_multiple_axes_on_fig(self, hist_df):
+        # GH 6970, GH 7069
+        df = hist_df
+        fig, axes = mpl.pyplot.subplots(2, 3)
+        with tm.assert_produces_warning(UserWarning, match="sharex and sharey"):
+            returned = df.boxplot(
+                column=["height", "weight", "category"],
+                by="gender",
+                return_type="axes",
+                ax=axes[0],
+            )
+        returned = np.array(list(returned.values))
+        _check_axes_shape(returned, axes_num=3, layout=(1, 3))
+        tm.assert_numpy_array_equal(returned, axes[0])
+        assert returned[0].figure is fig
+
+        # draw on second row
+        with tm.assert_produces_warning(UserWarning, match="sharex and sharey"):
+            returned = df.groupby("classroom").boxplot(
+                column=["height", "weight", "category"], return_type="axes", ax=axes[1]
+            )
+        returned = np.array(list(returned.values))
+        _check_axes_shape(returned, axes_num=3, layout=(1, 3))
+        tm.assert_numpy_array_equal(returned, axes[1])
+        assert returned[0].figure is fig
+
+    @pytest.mark.slow
+    def test_grouped_box_multiple_axes_ax_error(self, hist_df):
+        # GH 6970, GH 7069
+        df = hist_df
+        msg = "The number of passed axes must be 3, the same as the output plot"
+        _, axes = mpl.pyplot.subplots(2, 3)
+        with pytest.raises(ValueError, match=msg):
+            # pass different number of axes from required
+            with tm.assert_produces_warning(UserWarning, match="sharex and sharey"):
+                axes = df.groupby("classroom").boxplot(ax=axes)
+
+    def test_fontsize(self):
+        df = DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [0, 0, 0, 1, 1, 1]})
+        _check_ticks_props(
+            df.boxplot("a", by="b", fontsize=16), xlabelsize=16, ylabelsize=16
+        )
+
+    @pytest.mark.parametrize(
+        "col, expected_xticklabel",
+        [
+            ("v", ["(a, v)", "(b, v)", "(c, v)", "(d, v)", "(e, v)"]),
+            (["v"], ["(a, v)", "(b, v)", "(c, v)", "(d, v)", "(e, v)"]),
+            ("v1", ["(a, v1)", "(b, v1)", "(c, v1)", "(d, v1)", "(e, v1)"]),
+            (
+                ["v", "v1"],
+                [
+                    "(a, v)",
+                    "(a, v1)",
+                    "(b, v)",
+                    "(b, v1)",
+                    "(c, v)",
+                    "(c, v1)",
+                    "(d, v)",
+                    "(d, v1)",
+                    "(e, v)",
+                    "(e, v1)",
+                ],
+            ),
+            (
+                None,
+                [
+                    "(a, v)",
+                    "(a, v1)",
+                    "(b, v)",
+                    "(b, v1)",
+                    "(c, v)",
+                    "(c, v1)",
+                    "(d, v)",
+                    "(d, v1)",
+                    "(e, v)",
+                    "(e, v1)",
+                ],
+            ),
+        ],
+    )
+    def test_groupby_boxplot_subplots_false(self, col, expected_xticklabel):
+        # GH 16748
+        df = DataFrame(
+            {
+                "cat": np.random.default_rng(2).choice(list("abcde"), 100),
+                "v": np.random.default_rng(2).random(100),
+                "v1": np.random.default_rng(2).random(100),
+            }
+        )
+        grouped = df.groupby("cat")
+
+        axes = _check_plot_works(
+            grouped.boxplot, subplots=False, column=col, return_type="axes"
+        )
+
+        result_xticklabel = [x.get_text() for x in axes.get_xticklabels()]
+        assert expected_xticklabel == result_xticklabel
+
+    def test_groupby_boxplot_object(self, hist_df):
+        # GH 43480
+        df = hist_df.astype("object")
+        grouped = df.groupby("gender")
+        msg = "boxplot method requires numerical columns, nothing to plot"
+        with pytest.raises(ValueError, match=msg):
+            _check_plot_works(grouped.boxplot, subplots=False)
+
+    def test_boxplot_multiindex_column(self):
+        # GH 16748
+        arrays = [
+            ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
+            ["one", "two", "one", "two", "one", "two", "one", "two"],
+        ]
+        tuples = list(zip(*arrays, strict=True))
+        index = MultiIndex.from_tuples(tuples, names=["first", "second"])
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((3, 8)),
+            index=["A", "B", "C"],
+            columns=index,
+        )
+
+        col = [("bar", "one"), ("bar", "two")]
+        axes = _check_plot_works(df.boxplot, column=col, return_type="axes")
+
+        expected_xticklabel = ["(bar, one)", "(bar, two)"]
+        result_xticklabel = [x.get_text() for x in axes.get_xticklabels()]
+        assert expected_xticklabel == result_xticklabel
+
+    @pytest.mark.parametrize("group", ["X", ["X", "Y"]])
+    def test_boxplot_multi_groupby_groups(self, group):
+        # GH 14701
+        rows = 20
+        df = DataFrame(
+            np.random.default_rng(12).normal(size=(rows, 2)), columns=["Col1", "Col2"]
+        )
+        df["X"] = Series(np.repeat(["A", "B"], int(rows / 2)))
+        df["Y"] = Series(np.tile(["C", "D"], int(rows / 2)))
+        grouped = df.groupby(group)
+        _check_plot_works(df.boxplot, by=group, default_axes=True)
+        _check_plot_works(df.plot.box, by=group, default_axes=True)
+        _check_plot_works(grouped.boxplot, default_axes=True)
diff --git a/pandas/tests/plotting/test_common.py b/pandas/tests/plotting/test_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..20daf5935624843af3224f991497f84fa6639a0d
--- /dev/null
+++ b/pandas/tests/plotting/test_common.py
@@ -0,0 +1,60 @@
+import pytest
+
+from pandas import DataFrame
+from pandas.tests.plotting.common import (
+    _check_plot_works,
+    _check_ticks_props,
+    _gen_two_subplots,
+)
+
+plt = pytest.importorskip("matplotlib.pyplot")
+
+
+class TestCommon:
+    def test__check_ticks_props(self):
+        # GH 34768
+        df = DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]})
+        ax = _check_plot_works(df.plot, rot=30)
+        ax.yaxis.set_tick_params(rotation=30)
+        msg = "expected 0.00000 but got "
+        with pytest.raises(AssertionError, match=msg):
+            _check_ticks_props(ax, xrot=0)
+        with pytest.raises(AssertionError, match=msg):
+            _check_ticks_props(ax, xlabelsize=0)
+        with pytest.raises(AssertionError, match=msg):
+            _check_ticks_props(ax, yrot=0)
+        with pytest.raises(AssertionError, match=msg):
+            _check_ticks_props(ax, ylabelsize=0)
+
+    def test__gen_two_subplots_with_ax(self):
+        fig = plt.gcf()
+        gen = _gen_two_subplots(f=lambda **kwargs: None, fig=fig, ax="test")
+        # On the first yield, no subplot should be added since ax was passed
+        next(gen)
+        assert fig.get_axes() == []
+        # On the second, the one axis should match fig.subplot(2, 1, 2)
+        next(gen)
+        axes = fig.get_axes()
+        assert len(axes) == 1
+        subplot_geometry = list(axes[0].get_subplotspec().get_geometry()[:-1])
+        subplot_geometry[-1] += 1
+        assert subplot_geometry == [2, 1, 2]
+
+    def test_colorbar_layout(self):
+        fig = plt.figure()
+
+        axes = fig.subplot_mosaic(
+            """
+            AB
+            CC
+            """
+        )
+
+        x = [1, 2, 3]
+        y = [1, 2, 3]
+
+        cs0 = axes["A"].scatter(x, y)
+        axes["B"].scatter(x, y)
+
+        fig.colorbar(cs0, ax=[axes["A"], axes["B"]], location="right")
+        DataFrame(x).plot(ax=axes["C"])
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
new file mode 100644
index 0000000000000000000000000000000000000000..e33e91ccf6c6e2c508d4b528dbf1c194cd7918fa
--- /dev/null
+++ b/pandas/tests/plotting/test_converter.py
@@ -0,0 +1,391 @@
+from datetime import (
+    date,
+    datetime,
+)
+import subprocess
+import sys
+
+import numpy as np
+import pytest
+
+import pandas._config.config as cf
+
+from pandas._libs.tslibs import to_offset
+
+from pandas import (
+    Index,
+    Period,
+    PeriodIndex,
+    Series,
+    Timestamp,
+    arrays,
+    date_range,
+)
+import pandas._testing as tm
+
+from pandas.plotting import (
+    deregister_matplotlib_converters,
+    register_matplotlib_converters,
+)
+from pandas.tseries.offsets import (
+    Day,
+    Micro,
+    Milli,
+    Second,
+)
+
+plt = pytest.importorskip("matplotlib.pyplot")
+dates = pytest.importorskip("matplotlib.dates")
+units = pytest.importorskip("matplotlib.units")
+
+from pandas.plotting._matplotlib import converter
+
+
+@pytest.mark.single_cpu
+def test_registry_mpl_resets():
+    # Check that Matplotlib converters are properly reset (see issue #27481)
+    code = (
+        "import matplotlib.units as units; "
+        "import matplotlib.dates as mdates; "
+        "n_conv = len(units.registry); "
+        "import pandas as pd; "
+        "pd.plotting.register_matplotlib_converters(); "
+        "pd.plotting.deregister_matplotlib_converters(); "
+        "assert len(units.registry) == n_conv"
+    )
+    call = [sys.executable, "-c", code]
+    subprocess.check_output(call)
+
+
+def test_timtetonum_accepts_unicode():
+    assert converter.time2num("00:01") == converter.time2num("00:01")
+
+
+class TestRegistration:
+    @pytest.mark.single_cpu
+    def test_dont_register_by_default(self):
+        # Run in subprocess to ensure a clean state
+        code = (
+            "import matplotlib.units; "
+            "import pandas as pd; "
+            "units = dict(matplotlib.units.registry); "
+            "assert pd.Timestamp not in units"
+        )
+        call = [sys.executable, "-c", code]
+        assert subprocess.check_call(call) == 0
+
+    def test_registering_no_warning(self):
+        s = Series(range(12), index=date_range("2017", periods=12))
+        _, ax = plt.subplots()
+
+        # Set to the "warn" state, in case this isn't the first test run
+        register_matplotlib_converters()
+        ax.plot(s.index, s.values)
+
+    def test_pandas_plots_register(self):
+        s = Series(range(12), index=date_range("2017", periods=12))
+        # Set to the "warn" state, in case this isn't the first test run
+        with tm.assert_produces_warning(None) as w:
+            s.plot()
+
+        assert len(w) == 0
+
+    def test_matplotlib_formatters(self):
+        # Can't make any assertion about the start state.
+        # We we check that toggling converters off removes it, and toggling it
+        # on restores it.
+
+        with cf.option_context("plotting.matplotlib.register_converters", True):
+            with cf.option_context("plotting.matplotlib.register_converters", False):
+                assert Timestamp not in units.registry
+            assert Timestamp in units.registry
+
+    def test_option_no_warning(self):
+        s = Series(range(12), index=date_range("2017", periods=12))
+        _, ax = plt.subplots()
+
+        # Test without registering first, no warning
+        with cf.option_context("plotting.matplotlib.register_converters", False):
+            ax.plot(s.index, s.values)
+
+        # Now test with registering
+        register_matplotlib_converters()
+        with cf.option_context("plotting.matplotlib.register_converters", False):
+            ax.plot(s.index, s.values)
+
+    def test_registry_resets(self):
+        # make a copy, to reset to
+        original = dict(units.registry)
+
+        try:
+            # get to a known state
+            units.registry.clear()
+            date_converter = dates.DateConverter()
+            units.registry[datetime] = date_converter
+            units.registry[date] = date_converter
+
+            register_matplotlib_converters()
+            assert units.registry[date] is not date_converter
+            deregister_matplotlib_converters()
+            assert units.registry[date] is date_converter
+
+        finally:
+            # restore original stater
+            units.registry.clear()
+            for k, v in original.items():
+                units.registry[k] = v
+
+
+class TestDateTimeConverter:
+    @pytest.fixture
+    def dtc(self):
+        return converter.DatetimeConverter()
+
+    def test_convert_accepts_unicode(self, dtc):
+        r1 = dtc.convert("2000-01-01 12:22", None, None)
+        r2 = dtc.convert("2000-01-01 12:22", None, None)
+        assert r1 == r2, "DatetimeConverter.convert should accept unicode"
+
+    def test_conversion(self, dtc):
+        rs = dtc.convert(["2012-1-1"], None, None)[0]
+        xp = dates.date2num(datetime(2012, 1, 1))
+        assert rs == xp
+
+        rs = dtc.convert("2012-1-1", None, None)
+        assert rs == xp
+
+        rs = dtc.convert(date(2012, 1, 1), None, None)
+        assert rs == xp
+
+        rs = dtc.convert("2012-1-1", None, None)
+        assert rs == xp
+
+        rs = dtc.convert(Timestamp("2012-1-1"), None, None)
+        assert rs == xp
+
+        # also testing datetime64 dtype (GH8614)
+        rs = dtc.convert("2012-01-01", None, None)
+        assert rs == xp
+
+        rs = dtc.convert("2012-01-01 00:00:00+0000", None, None)
+        assert rs == xp
+
+        rs = dtc.convert(
+            np.array(["2012-01-01 00:00:00+0000", "2012-01-02 00:00:00+0000"]),
+            None,
+            None,
+        )
+        assert rs[0] == xp
+
+        # we have a tz-aware date (constructed to that when we turn to utc it
+        # is the same as our sample)
+        ts = Timestamp("2012-01-01").tz_localize("UTC").tz_convert("US/Eastern")
+        rs = dtc.convert(ts, None, None)
+        assert rs == xp
+
+        rs = dtc.convert(ts.to_pydatetime(), None, None)
+        assert rs == xp
+
+        rs = dtc.convert(Index([ts - Day(1), ts]), None, None)
+        assert rs[1] == xp
+
+        rs = dtc.convert(Index([ts - Day(1), ts]).to_pydatetime(), None, None)
+        assert rs[1] == xp
+
+    def test_conversion_float(self, dtc):
+        rtol = 0.5 * 10**-9
+
+        rs = dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None)
+        xp = dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC"))
+        tm.assert_almost_equal(rs, xp, rtol=rtol)
+
+        rs = dtc.convert(
+            Timestamp("2012-1-1 09:02:03", tz="Asia/Hong_Kong"), None, None
+        )
+        tm.assert_almost_equal(rs, xp, rtol=rtol)
+
+        rs = dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None)
+        tm.assert_almost_equal(rs, xp, rtol=rtol)
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            [date(1677, 1, 1), date(1677, 1, 2)],
+            [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)],
+        ],
+    )
+    def test_conversion_outofbounds_datetime(self, dtc, values):
+        # 2579
+        rs = dtc.convert(values, None, None)
+        xp = dates.date2num(values)
+        tm.assert_numpy_array_equal(rs, xp)
+        rs = dtc.convert(values[0], None, None)
+        xp = dates.date2num(values[0])
+        assert rs == xp
+
+    @pytest.mark.parametrize(
+        "time,format_expected",
+        [
+            (0, "00:00"),  # time2num(datetime.time.min)
+            (86399.999999, "23:59:59.999999"),  # time2num(datetime.time.max)
+            (90000, "01:00"),
+            (3723, "01:02:03"),
+            (39723.2, "11:02:03.200"),
+        ],
+    )
+    def test_time_formatter(self, time, format_expected):
+        # issue 18478
+        result = converter.TimeFormatter(None)(time)
+        assert result == format_expected
+
+    @pytest.mark.parametrize("freq", ("B", "ms", "s"))
+    def test_dateindex_conversion(self, freq, dtc):
+        rtol = 10**-9
+        dateindex = date_range("2020-01-01", periods=10, freq=freq)
+        rs = dtc.convert(dateindex, None, None)
+        xp = dates.date2num(dateindex._mpl_repr())
+        tm.assert_almost_equal(rs, xp, rtol=rtol)
+
+    @pytest.mark.parametrize("offset", [Second(), Milli(), Micro(50)])
+    def test_resolution(self, offset, dtc):
+        # Matplotlib's time representation using floats cannot distinguish
+        # intervals smaller than ~10 microsecond in the common range of years.
+        ts1 = Timestamp("2012-1-1")
+        ts2 = ts1 + offset
+        val1 = dtc.convert(ts1, None, None)
+        val2 = dtc.convert(ts2, None, None)
+        if not val1 < val2:
+            raise AssertionError(f"{val1} is not less than {val2}.")
+
+    def test_convert_nested(self, dtc):
+        inner = [Timestamp("2017-01-01"), Timestamp("2017-01-02")]
+        data = [inner, inner]
+        result = dtc.convert(data, None, None)
+        expected = [dtc.convert(x, None, None) for x in data]
+        assert (np.array(result) == expected).all()
+
+
+class TestPeriodConverter:
+    @pytest.fixture
+    def pc(self):
+        return converter.PeriodConverter()
+
+    @pytest.fixture
+    def axis(self):
+        class Axis:
+            pass
+
+        axis = Axis()
+        axis.freq = "D"
+        return axis
+
+    def test_convert_accepts_unicode(self, pc, axis):
+        r1 = pc.convert("2012-1-1", None, axis)
+        r2 = pc.convert("2012-1-1", None, axis)
+        assert r1 == r2
+
+    def test_conversion(self, pc, axis):
+        rs = pc.convert(["2012-1-1"], None, axis)[0]
+        xp = Period("2012-1-1").ordinal
+        assert rs == xp
+
+        rs = pc.convert("2012-1-1", None, axis)
+        assert rs == xp
+
+        rs = pc.convert([date(2012, 1, 1)], None, axis)[0]
+        assert rs == xp
+
+        rs = pc.convert(date(2012, 1, 1), None, axis)
+        assert rs == xp
+
+        rs = pc.convert([Timestamp("2012-1-1")], None, axis)[0]
+        assert rs == xp
+
+        rs = pc.convert(Timestamp("2012-1-1"), None, axis)
+        assert rs == xp
+
+        rs = pc.convert("2012-01-01", None, axis)
+        assert rs == xp
+
+        rs = pc.convert("2012-01-01 00:00:00+0000", None, axis)
+        assert rs == xp
+
+        rs = pc.convert(
+            np.array(
+                ["2012-01-01 00:00:00", "2012-01-02 00:00:00"],
+                dtype="datetime64[ns]",
+            ),
+            None,
+            axis,
+        )
+        assert rs[0] == xp
+
+    def test_integer_passthrough(self, pc, axis):
+        # GH9012
+        rs = pc.convert([0, 1], None, axis)
+        xp = [0, 1]
+        assert rs == xp
+
+    def test_convert_nested(self, pc, axis):
+        data = ["2012-1-1", "2012-1-2"]
+        r1 = pc.convert([data, data], None, axis)
+        r2 = [pc.convert(data, None, axis) for _ in range(2)]
+        assert r1 == r2
+
+
+class TestTimeDeltaConverter:
+    """Test timedelta converter"""
+
+    @pytest.mark.parametrize(
+        "x, decimal, format_expected",
+        [
+            (0.0, 0, "00:00:00"),
+            (3972320000000, 1, "01:06:12.3"),
+            (713233432000000, 2, "8 days 06:07:13.43"),
+            (32423432000000, 4, "09:00:23.4320"),
+        ],
+    )
+    def test_format_timedelta_ticks(self, x, decimal, format_expected):
+        tdc = converter.TimeSeries_TimedeltaFormatter
+        result = tdc.format_timedelta_ticks(x, pos=None, n_decimals=decimal, exp=9)
+        assert result == format_expected
+
+    @pytest.mark.parametrize("view_interval", [(1, 2), (2, 1)])
+    def test_call_w_different_view_intervals(self, view_interval, monkeypatch):
+        # previously broke on reversed xlmits; see GH37454
+        class mock_axis:
+            def get_view_interval(self):
+                return view_interval
+
+        tdc = converter.TimeSeries_TimedeltaFormatter()
+        monkeypatch.setattr(tdc, "axis", mock_axis())
+        tdc(0.0, 0)
+
+
+@pytest.mark.parametrize("year_span", [11.25, 30, 80, 150, 400, 800, 1500, 2500, 3500])
+# The range is limited to 11.25 at the bottom by if statements in
+# the _quarterly_finder() function
+def test_quarterly_finder(year_span):
+    vmin = -1000
+    vmax = vmin + year_span * 4
+    span = vmax - vmin + 1
+    if span < 45:
+        pytest.skip("the quarterly finder is only invoked if the span is >= 45")
+    nyears = span / 4
+    (min_anndef, maj_anndef) = converter._get_default_annual_spacing(nyears)
+    result = converter._quarterly_finder(vmin, vmax, to_offset("QE"))
+    quarters = PeriodIndex(
+        arrays.PeriodArray(np.array([x[0] for x in result]), dtype="period[Q]")
+    )
+    majors = np.array([x[1] for x in result])
+    minors = np.array([x[2] for x in result])
+    major_quarters = quarters[majors]
+    minor_quarters = quarters[minors]
+    check_major_years = major_quarters.year % maj_anndef == 0
+    check_minor_years = minor_quarters.year % min_anndef == 0
+    check_major_quarters = major_quarters.quarter == 1
+    check_minor_quarters = minor_quarters.quarter == 1
+    assert np.all(check_major_years)
+    assert np.all(check_minor_years)
+    assert np.all(check_major_quarters)
+    assert np.all(check_minor_quarters)
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb845c6e6d71d0895c4424b2b54eb333c7b320ce
--- /dev/null
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -0,0 +1,1721 @@
+"""Test cases for time series specific (freq conversion, etc)"""
+
+from datetime import (
+    date,
+    datetime,
+    time,
+    timedelta,
+)
+import pickle
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import (
+    BaseOffset,
+    to_offset,
+)
+
+from pandas.core.dtypes.dtypes import PeriodDtype
+
+from pandas import (
+    DataFrame,
+    Index,
+    NaT,
+    Series,
+    concat,
+    isna,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.core.indexes.datetimes import (
+    DatetimeIndex,
+    bdate_range,
+    date_range,
+)
+from pandas.core.indexes.period import (
+    Period,
+    PeriodIndex,
+    period_range,
+)
+from pandas.core.indexes.timedeltas import timedelta_range
+from pandas.tests.plotting.common import _check_ticks_props
+
+from pandas.tseries.offsets import WeekOfMonth
+
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")
+
+import pandas.plotting._matplotlib.converter as conv
+
+
+class TestTSPlot:
+    @pytest.mark.filterwarnings("ignore::UserWarning")
+    def test_ts_plot_with_tz(self, tz_aware_fixture):
+        # GH2877, GH17173, GH31205, GH31580
+        tz = tz_aware_fixture
+        index = date_range("1/1/2011", periods=2, freq="h", tz=tz)
+        ts = Series([188.5, 328.25], index=index)
+        _check_plot_works(ts.plot)
+        ax = ts.plot()
+        xdata = next(iter(ax.get_lines())).get_xdata()
+        # Check first and last points' labels are correct
+        assert (xdata[0].hour, xdata[0].minute) == (0, 0)
+        assert (xdata[-1].hour, xdata[-1].minute) == (1, 0)
+
+    def test_fontsize_set_correctly(self):
+        # For issue #8765
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 9)), index=range(10)
+        )
+        _, ax = mpl.pyplot.subplots()
+        df.plot(fontsize=2, ax=ax)
+        for label in ax.get_xticklabels() + ax.get_yticklabels():
+            assert label.get_fontsize() == 2
+
+    def test_frame_inferred(self):
+        # inferred freq
+        idx = date_range("1/1/1987", freq="MS", periods=10)
+        idx = DatetimeIndex(idx.values, freq=None)
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx
+        )
+        _check_plot_works(df.plot)
+
+        # axes freq
+        idx = idx[0:4].union(idx[6:])
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx
+        )
+        _check_plot_works(df2.plot)
+
+    def test_frame_inferred_n_gt_1(self):
+        # N > 1
+        idx = date_range("2008-1-1 00:15:00", freq="15min", periods=10)
+        idx = DatetimeIndex(idx.values, freq=None)
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)), index=idx
+        )
+        _check_plot_works(df.plot)
+
+    def test_is_error_nozeroindex(self):
+        # GH11858
+        i = np.array([1, 2, 3])
+        a = DataFrame(i, index=i)
+        _check_plot_works(a.plot, xerr=a)
+        _check_plot_works(a.plot, yerr=a)
+
+    def test_nonnumeric_exclude(self):
+        idx = date_range("1/1/1987", freq="YE", periods=3)
+        df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx)
+
+        fig, ax = mpl.pyplot.subplots()
+        df.plot(ax=ax)  # it works
+        assert len(ax.get_lines()) == 1  # B was plotted
+
+    def test_nonnumeric_exclude_error(self):
+        idx = date_range("1/1/1987", freq="YE", periods=3)
+        df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}, idx)
+        msg = "no numeric data to plot"
+        with pytest.raises(TypeError, match=msg):
+            df["A"].plot()
+
+    @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"])
+    def test_tsplot_period(self, freq):
+        idx = period_range("12/31/1999", freq=freq, periods=10)
+        ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        _, ax = mpl.pyplot.subplots()
+        _check_plot_works(ser.plot, ax=ax)
+
+    @pytest.mark.parametrize(
+        "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"]
+    )
+    def test_tsplot_datetime(self, freq):
+        idx = date_range("12/31/1999", freq=freq, periods=10)
+        ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        _, ax = mpl.pyplot.subplots()
+        _check_plot_works(ser.plot, ax=ax)
+
+    def test_tsplot(self):
+        ts = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        )
+        _, ax = mpl.pyplot.subplots()
+        ts.plot(style="k", ax=ax)
+        color = (0.0, 0.0, 0.0, 1)
+        assert color == ax.get_lines()[0].get_color()
+
+    @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=10)])
+    def test_both_style_and_color(self, index):
+        ts = Series(np.arange(10, dtype=np.float64), index=index)
+        msg = (
+            "Cannot pass 'style' string with a color symbol and 'color' "
+            "keyword argument. Please use one or the other or pass 'style' "
+            "without a color symbol"
+        )
+        with pytest.raises(ValueError, match=msg):
+            ts.plot(style="b-", color="#000099")
+
+    @pytest.mark.parametrize("freq", ["ms", "us"])
+    def test_high_freq(self, freq):
+        _, ax = mpl.pyplot.subplots()
+        rng = date_range("1/1/2012", periods=10, freq=freq)
+        ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+        _check_plot_works(ser.plot, ax=ax)
+
+    def test_get_datevalue(self):
+        assert conv._get_datevalue(None, "D") is None
+        assert conv._get_datevalue(1987, "Y") == 1987
+        assert (
+            conv._get_datevalue(Period(1987, "Y"), "M")
+            == Period("1987-12", "M").ordinal
+        )
+        assert conv._get_datevalue("1/1/1987", "D") == Period("1987-1-1", "D").ordinal
+
+    @pytest.mark.parametrize(
+        "freq, expected_string",
+        [["YE-DEC", "t = 2014  y = 1.000000"], ["D", "t = 2014-01-01  y = 1.000000"]],
+    )
+    def test_ts_plot_format_coord(self, freq, expected_string):
+        ser = Series(1, index=date_range("2014-01-01", periods=3, freq=freq))
+        _, ax = mpl.pyplot.subplots()
+        ser.plot(ax=ax)
+        first_line = ax.get_lines()[0]
+        first_x = first_line.get_xdata()[0].ordinal
+        first_y = first_line.get_ydata()[0]
+        assert expected_string == ax.format_coord(first_x, first_y)
+
+    @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "M", "Q", "Y"])
+    def test_line_plot_period_series(self, freq):
+        idx = period_range("12/31/1999", freq=freq, periods=10)
+        ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        _check_plot_works(ser.plot, ser.index.freq)
+
+    @pytest.mark.parametrize(
+        "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"]
+    )
+    def test_line_plot_period_mlt_series(self, frqncy):
+        # test period index line plot for series with multiples (`mlt`) of the
+        # frequency (`frqncy`) rule code. tests resolution of issue #14763
+        idx = period_range("12/31/1999", freq=frqncy, periods=10)
+        s = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        _check_plot_works(s.plot, s.index.freq.rule_code)
+
+    @pytest.mark.parametrize(
+        "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"]
+    )
+    def test_line_plot_datetime_series(self, freq):
+        idx = date_range("12/31/1999", freq=freq, periods=10)
+        ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        _check_plot_works(ser.plot, ser.index.freq.rule_code)
+
+    @pytest.mark.parametrize("freq", ["s", "min", "h", "D", "W", "ME", "QE", "YE"])
+    def test_line_plot_period_frame(self, freq):
+        idx = date_range("12/31/1999", freq=freq, periods=10)
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)),
+            index=idx,
+            columns=["A", "B", "C"],
+        )
+        _check_plot_works(df.plot, df.index.freq)
+
+    @pytest.mark.parametrize(
+        "frqncy", ["1s", "3s", "5min", "7h", "4D", "8W", "11M", "3Y"]
+    )
+    def test_line_plot_period_mlt_frame(self, frqncy):
+        # test period index line plot for DataFrames with multiples (`mlt`)
+        # of the frequency (`frqncy`) rule code. tests resolution of issue
+        # #14763
+        idx = period_range("12/31/1999", freq=frqncy, periods=10)
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)),
+            index=idx,
+            columns=["A", "B", "C"],
+        )
+        freq = df.index.freq.rule_code
+        _check_plot_works(df.plot, freq)
+
+    @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
+    @pytest.mark.parametrize(
+        "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"]
+    )
+    def test_line_plot_datetime_frame(self, freq):
+        idx = date_range("12/31/1999", freq=freq, periods=10)
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)),
+            index=idx,
+            columns=["A", "B", "C"],
+        )
+        freq = PeriodDtype(df.index.freq)._freqstr
+        freq = df.index.to_period(freq).freq
+        _check_plot_works(df.plot, freq)
+
+    @pytest.mark.parametrize(
+        "freq", ["s", "min", "h", "D", "W", "ME", "QE-DEC", "YE", "1B30Min"]
+    )
+    def test_line_plot_inferred_freq(self, freq):
+        idx = date_range("12/31/1999", freq=freq, periods=10)
+        ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        ser = Series(ser.values, Index(np.asarray(ser.index)))
+        _check_plot_works(ser.plot, ser.index.inferred_freq)
+
+        ser = ser.iloc[[0, 3, 5, 6]]
+        _check_plot_works(ser.plot)
+
+    def test_fake_inferred_business(self):
+        _, ax = mpl.pyplot.subplots()
+        rng = date_range("2001-1-1", "2001-1-10")
+        ts = Series(range(len(rng)), index=rng)
+        ts = concat([ts[:3], ts[5:]])
+        ts.plot(ax=ax)
+        assert not hasattr(ax, "freq")
+
+    def test_plot_offset_freq(self):
+        ser = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        )
+        _check_plot_works(ser.plot)
+
+    def test_plot_offset_freq_business(self):
+        dr = date_range("2023-01-01", freq="BQS", periods=10)
+        ser = Series(np.random.default_rng(2).standard_normal(len(dr)), index=dr)
+        _check_plot_works(ser.plot)
+
+    def test_plot_multiple_inferred_freq(self):
+        dr = Index([datetime(2000, 1, 1), datetime(2000, 1, 6), datetime(2000, 1, 11)])
+        ser = Series(np.random.default_rng(2).standard_normal(len(dr)), index=dr)
+        _check_plot_works(ser.plot)
+
+    def test_irreg_hf(self):
+        idx = date_range("2012-6-22 21:59:51", freq="s", periods=10)
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx
+        )
+
+        irreg = df.iloc[[0, 1, 3, 4]]
+        _, ax = mpl.pyplot.subplots()
+        irreg.plot(ax=ax)
+        diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff()
+
+        sec = 1.0 / 24 / 60 / 60
+        assert (np.fabs(diffs[1:] - [sec, sec * 2, sec]) < 1e-8).all()
+
+    def test_irreg_hf_object(self):
+        idx = date_range("2012-6-22 21:59:51", freq="s", periods=10)
+        df2 = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 2)), index=idx
+        )
+        _, ax = mpl.pyplot.subplots()
+        df2.index = df2.index.astype(object)
+        df2.plot(ax=ax)
+        diffs = Series(ax.get_lines()[0].get_xydata()[:, 0]).diff()
+        sec = 1.0 / 24 / 60 / 60
+        assert (np.fabs(diffs[1:] - sec) < 1e-8).all()
+
+    def test_irregular_datetime64_repr_bug(self):
+        ser = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        )
+        ser = ser.iloc[[0, 1, 2, 7]]
+
+        _, ax = mpl.pyplot.subplots()
+
+        ret = ser.plot(ax=ax)
+        assert ret is not None
+
+        for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index, strict=True):
+            assert rs == xp
+
+    def test_business_freq(self):
+        bts = Series(range(5), period_range("2020-01-01", periods=5))
+        msg = r"PeriodDtype\[B\] is deprecated"
+        dt = bts.index[0].to_timestamp()
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            bts.index = period_range(start=dt, periods=len(bts), freq="B")
+        _, ax = mpl.pyplot.subplots()
+        bts.plot(ax=ax)
+        assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].ordinal
+        idx = ax.get_lines()[0].get_xdata()
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            assert PeriodIndex(data=idx).freqstr == "B"
+
+    def test_business_freq_convert(self):
+        bts = Series(
+            np.arange(50, dtype=np.float64),
+            index=date_range("2020-01-01", periods=50, freq="B"),
+        ).asfreq("BME")
+        ts = bts.to_period("M")
+        _, ax = mpl.pyplot.subplots()
+        bts.plot(ax=ax)
+        assert ax.get_lines()[0].get_xydata()[0, 0] == ts.index[0].ordinal
+        idx = ax.get_lines()[0].get_xdata()
+        assert PeriodIndex(data=idx).freqstr == "M"
+
+    def test_freq_with_no_period_alias(self):
+        # GH34487
+        freq = WeekOfMonth()
+        bts = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        ).asfreq(freq)
+        _, ax = mpl.pyplot.subplots()
+        bts.plot(ax=ax)
+
+        idx = ax.get_lines()[0].get_xdata()
+        msg = "freq not specified and cannot be inferred"
+        with pytest.raises(ValueError, match=msg):
+            PeriodIndex(data=idx)
+
+    def test_nonzero_base(self):
+        # GH2571
+        idx = date_range("2012-12-20", periods=24, freq="h") + timedelta(minutes=30)
+        df = DataFrame(np.arange(24), index=idx)
+        _, ax = mpl.pyplot.subplots()
+        df.plot(ax=ax)
+        rs = ax.get_lines()[0].get_xdata()
+        assert not Index(rs).is_normalized
+
+    def test_dataframe(self):
+        bts = DataFrame(
+            {
+                "a": Series(
+                    np.arange(10, dtype=np.float64),
+                    index=date_range("2020-01-01", periods=10),
+                )
+            }
+        )
+        _, ax = mpl.pyplot.subplots()
+        bts.plot(ax=ax)
+        idx = ax.get_lines()[0].get_xdata()
+        tm.assert_index_equal(bts.index.to_period(), PeriodIndex(idx))
+
+    @pytest.mark.filterwarnings(
+        "ignore:Period with BDay freq is deprecated:FutureWarning"
+    )
+    @pytest.mark.parametrize(
+        "obj",
+        [
+            Series(
+                np.arange(10, dtype=np.float64),
+                index=date_range("2020-01-01", periods=10),
+            ),
+            DataFrame(
+                {
+                    "a": Series(
+                        np.arange(10, dtype=np.float64),
+                        index=date_range("2020-01-01", periods=10),
+                    ),
+                    "b": Series(
+                        np.arange(10, dtype=np.float64),
+                        index=date_range("2020-01-01", periods=10),
+                    )
+                    + 1,
+                }
+            ),
+        ],
+    )
+    def test_axis_limits(self, obj):
+        _, ax = mpl.pyplot.subplots()
+        obj.plot(ax=ax)
+        xlim = ax.get_xlim()
+        ax.set_xlim(xlim[0] - 5, xlim[1] + 10)
+        result = ax.get_xlim()
+        assert result[0] == xlim[0] - 5
+        assert result[1] == xlim[1] + 10
+
+        # string
+        expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq))
+        ax.set_xlim("1/1/2000", "4/1/2000")
+        result = ax.get_xlim()
+        assert int(result[0]) == expected[0].ordinal
+        assert int(result[1]) == expected[1].ordinal
+
+        # datetime
+        expected = (Period("1/1/2000", ax.freq), Period("4/1/2000", ax.freq))
+        ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1))
+        result = ax.get_xlim()
+        assert int(result[0]) == expected[0].ordinal
+        assert int(result[1]) == expected[1].ordinal
+
+    def test_get_finder(self):
+        assert conv.get_finder(to_offset("B")) == conv._daily_finder
+        assert conv.get_finder(to_offset("D")) == conv._daily_finder
+        assert conv.get_finder(to_offset("ME")) == conv._monthly_finder
+        assert conv.get_finder(to_offset("QE")) == conv._quarterly_finder
+        assert conv.get_finder(to_offset("YE")) == conv._annual_finder
+        assert conv.get_finder(to_offset("W")) == conv._daily_finder
+
+    def test_finder_daily(self):
+        day_lst = [10, 40, 252, 400, 950, 2750, 10000]
+
+        msg = "Period with BDay freq is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            xpl1 = xpl2 = [Period("1999-1-1", freq="B").ordinal] * len(day_lst)
+        rs1 = []
+        rs2 = []
+        for n in day_lst:
+            rng = bdate_range("1999-1-1", periods=n)
+            ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+            _, ax = mpl.pyplot.subplots()
+            ser.plot(ax=ax)
+            xaxis = ax.get_xaxis()
+            rs1.append(xaxis.get_majorticklocs()[0])
+
+            vmin, vmax = ax.get_xlim()
+            ax.set_xlim(vmin + 0.9, vmax)
+            rs2.append(xaxis.get_majorticklocs()[0])
+            mpl.pyplot.close(ax.get_figure())
+
+        assert rs1 == xpl1
+        assert rs2 == xpl2
+
+    def test_finder_quarterly(self):
+        yrs = [3.5, 11]
+
+        xpl1 = xpl2 = [Period("1988Q1").ordinal] * len(yrs)
+        rs1 = []
+        rs2 = []
+        for n in yrs:
+            rng = period_range("1987Q2", periods=int(n * 4), freq="Q")
+            ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+            _, ax = mpl.pyplot.subplots()
+            ser.plot(ax=ax)
+            xaxis = ax.get_xaxis()
+            rs1.append(xaxis.get_majorticklocs()[0])
+
+            (vmin, vmax) = ax.get_xlim()
+            ax.set_xlim(vmin + 0.9, vmax)
+            rs2.append(xaxis.get_majorticklocs()[0])
+            mpl.pyplot.close(ax.get_figure())
+
+        assert rs1 == xpl1
+        assert rs2 == xpl2
+
+    def test_finder_monthly(self):
+        yrs = [1.15, 2.5, 4, 11]
+
+        xpl1 = xpl2 = [Period("Jan 1988").ordinal] * len(yrs)
+        rs1 = []
+        rs2 = []
+        for n in yrs:
+            rng = period_range("1987Q2", periods=int(n * 12), freq="M")
+            ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+            _, ax = mpl.pyplot.subplots()
+            ser.plot(ax=ax)
+            xaxis = ax.get_xaxis()
+            rs1.append(xaxis.get_majorticklocs()[0])
+
+            vmin, vmax = ax.get_xlim()
+            ax.set_xlim(vmin + 0.9, vmax)
+            rs2.append(xaxis.get_majorticklocs()[0])
+            mpl.pyplot.close(ax.get_figure())
+
+        assert rs1 == xpl1
+        assert rs2 == xpl2
+
+    def test_finder_monthly_long(self):
+        rng = period_range("1988Q1", periods=24 * 12, freq="M")
+        ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+        _, ax = mpl.pyplot.subplots()
+        ser.plot(ax=ax)
+        xaxis = ax.get_xaxis()
+        rs = xaxis.get_majorticklocs()[0]
+        xp = Period("1989Q1", "M").ordinal
+        assert rs == xp
+
+    def test_finder_annual(self):
+        xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170]
+        xp = [Period(x, freq="Y").ordinal for x in xp]
+        rs = []
+        for nyears in [5, 10, 19, 49, 99, 199, 599, 1001]:
+            rng = period_range("1987", periods=nyears, freq="Y")
+            ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+            _, ax = mpl.pyplot.subplots()
+            ser.plot(ax=ax)
+            xaxis = ax.get_xaxis()
+            rs.append(xaxis.get_majorticklocs()[0])
+            mpl.pyplot.close(ax.get_figure())
+
+        assert rs == xp
+
+    @pytest.mark.slow
+    def test_finder_minutely(self):
+        nminutes = 1 * 24 * 60
+        rng = date_range("1/1/1999", freq="Min", periods=nminutes)
+        ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+        _, ax = mpl.pyplot.subplots()
+        ser.plot(ax=ax)
+        xaxis = ax.get_xaxis()
+        rs = xaxis.get_majorticklocs()[0]
+        xp = Period("1/1/1999", freq="Min").ordinal
+
+        assert rs == xp
+
+    def test_finder_hourly(self):
+        nhours = 23
+        rng = date_range("1/1/1999", freq="h", periods=nhours)
+        ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+        _, ax = mpl.pyplot.subplots()
+        ser.plot(ax=ax)
+        xaxis = ax.get_xaxis()
+        rs = xaxis.get_majorticklocs()[0]
+        xp = Period("1/1/1999", freq="h").ordinal
+
+        assert rs == xp
+
+    def test_gaps(self):
+        ts = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        )
+        ts.iloc[5:7] = np.nan
+        _, ax = mpl.pyplot.subplots()
+        ts.plot(ax=ax)
+        lines = ax.get_lines()
+        assert len(lines) == 1
+        line = lines[0]
+        data = line.get_xydata()
+
+        data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+        assert isinstance(data, np.ma.core.MaskedArray)
+        mask = data.mask
+        assert mask[5:7, 1].all()
+
+    def test_gaps_irregular(self):
+        # irregular
+        ts = Series(
+            np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30)
+        )
+        ts = ts.iloc[[0, 1, 2, 5, 7, 9, 12, 15, 20]]
+        ts.iloc[2:5] = np.nan
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot(ax=ax)
+        lines = ax.get_lines()
+        assert len(lines) == 1
+        line = lines[0]
+        data = line.get_xydata()
+
+        data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+        assert isinstance(data, np.ma.core.MaskedArray)
+        mask = data.mask
+        assert mask[2:5, 1].all()
+
+    def test_gaps_non_ts(self):
+        # non-ts
+        idx = [0, 1, 2, 5, 7, 9, 12, 15, 20]
+        ser = Series(np.random.default_rng(2).standard_normal(len(idx)), idx)
+        ser.iloc[2:5] = np.nan
+        _, ax = mpl.pyplot.subplots()
+        ser.plot(ax=ax)
+        lines = ax.get_lines()
+        assert len(lines) == 1
+        line = lines[0]
+        data = line.get_xydata()
+        data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+        assert isinstance(data, np.ma.core.MaskedArray)
+        mask = data.mask
+        assert mask[2:5, 1].all()
+
+    def test_gap_upsample(self):
+        low = Series(
+            np.arange(10, dtype=np.float64), index=date_range("2020-01-01", periods=10)
+        )
+        low.iloc[5:7] = np.nan
+        _, ax = mpl.pyplot.subplots()
+        low.plot(ax=ax)
+
+        idxh = date_range(low.index[0], low.index[-1], freq="12h")
+        s = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        s.plot(secondary_y=True)
+        lines = ax.get_lines()
+        assert len(lines) == 1
+        assert len(ax.right_ax.get_lines()) == 1
+
+        line = lines[0]
+        data = line.get_xydata()
+        data = np.ma.MaskedArray(data, mask=isna(data), fill_value=np.nan)
+
+        assert isinstance(data, np.ma.core.MaskedArray)
+        mask = data.mask
+        assert mask[5:7, 1].all()
+
+    def test_secondary_y(self):
+        ser = Series(np.random.default_rng(2).standard_normal(10))
+        fig, _ = mpl.pyplot.subplots()
+        ax = ser.plot(secondary_y=True)
+        assert hasattr(ax, "left_ax")
+        assert not hasattr(ax, "right_ax")
+        axes = fig.get_axes()
+        line = ax.get_lines()[0]
+        xp = Series(line.get_ydata(), line.get_xdata())
+        tm.assert_series_equal(ser, xp)
+        assert ax.get_yaxis().get_ticks_position() == "right"
+        assert not axes[0].get_yaxis().get_visible()
+
+    def test_secondary_y_yaxis(self):
+        Series(np.random.default_rng(2).standard_normal(10))
+        ser2 = Series(np.random.default_rng(2).standard_normal(10))
+        _, ax2 = mpl.pyplot.subplots()
+        ser2.plot(ax=ax2)
+        assert ax2.get_yaxis().get_ticks_position() == "left"
+
+    def test_secondary_both(self):
+        ser = Series(np.random.default_rng(2).standard_normal(10))
+        ser2 = Series(np.random.default_rng(2).standard_normal(10))
+        ax = ser2.plot()
+        ax2 = ser.plot(secondary_y=True)
+        assert ax.get_yaxis().get_visible()
+        assert not hasattr(ax, "left_ax")
+        assert hasattr(ax, "right_ax")
+        assert hasattr(ax2, "left_ax")
+        assert not hasattr(ax2, "right_ax")
+
+    def test_secondary_y_ts(self):
+        idx = date_range("1/1/2000", periods=10, unit="ns")
+        ser = Series(np.random.default_rng(2).standard_normal(10), idx)
+        fig, _ = mpl.pyplot.subplots()
+        ax = ser.plot(secondary_y=True)
+        assert hasattr(ax, "left_ax")
+        assert not hasattr(ax, "right_ax")
+        axes = fig.get_axes()
+        line = ax.get_lines()[0]
+        xp = Series(line.get_ydata(), line.get_xdata()).to_timestamp()
+        xp.index = xp.index.as_unit("ns")
+        tm.assert_series_equal(ser, xp)
+        assert ax.get_yaxis().get_ticks_position() == "right"
+        assert not axes[0].get_yaxis().get_visible()
+
+    def test_secondary_y_ts_yaxis(self):
+        idx = date_range("1/1/2000", periods=10)
+        ser2 = Series(np.random.default_rng(2).standard_normal(10), idx)
+        _, ax2 = mpl.pyplot.subplots()
+        ser2.plot(ax=ax2)
+        assert ax2.get_yaxis().get_ticks_position() == "left"
+
+    def test_secondary_y_ts_visible(self):
+        idx = date_range("1/1/2000", periods=10)
+        ser2 = Series(np.random.default_rng(2).standard_normal(10), idx)
+        ax = ser2.plot()
+        assert ax.get_yaxis().get_visible()
+
+    def test_secondary_kde(self):
+        pytest.importorskip("scipy")
+        ser = Series(np.random.default_rng(2).standard_normal(10))
+        fig, ax = mpl.pyplot.subplots()
+        ax = ser.plot(secondary_y=True, kind="density", ax=ax)
+        assert hasattr(ax, "left_ax")
+        assert not hasattr(ax, "right_ax")
+        axes = fig.get_axes()
+        assert axes[1].get_yaxis().get_ticks_position() == "right"
+
+    def test_secondary_bar(self):
+        ser = Series(np.random.default_rng(2).standard_normal(10))
+        fig, ax = mpl.pyplot.subplots()
+        ser.plot(secondary_y=True, kind="bar", ax=ax)
+        axes = fig.get_axes()
+        assert axes[1].get_yaxis().get_ticks_position() == "right"
+
+    def test_secondary_frame(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"]
+        )
+        axes = df.plot(secondary_y=["a", "c"], subplots=True)
+        assert axes[0].get_yaxis().get_ticks_position() == "right"
+        assert axes[1].get_yaxis().get_ticks_position() == "left"
+        assert axes[2].get_yaxis().get_ticks_position() == "right"
+
+    def test_secondary_bar_frame(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 3)), columns=["a", "b", "c"]
+        )
+        axes = df.plot(kind="bar", secondary_y=["a", "c"], subplots=True)
+        assert axes[0].get_yaxis().get_ticks_position() == "right"
+        assert axes[1].get_yaxis().get_ticks_position() == "left"
+        assert axes[2].get_yaxis().get_ticks_position() == "right"
+
+    def test_mixed_freq_regular_first(self):
+        # TODO
+        s1 = Series(
+            np.arange(20, dtype=np.float64),
+            index=date_range("2020-01-01", periods=20, freq="B"),
+        )
+        s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15]]
+
+        # it works!
+        _, ax = mpl.pyplot.subplots()
+        s1.plot(ax=ax)
+
+        ax2 = s2.plot(style="g", ax=ax)
+        lines = ax2.get_lines()
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            idx1 = PeriodIndex(lines[0].get_xdata())
+            idx2 = PeriodIndex(lines[1].get_xdata())
+
+            tm.assert_index_equal(idx1, s1.index.to_period("B"))
+            tm.assert_index_equal(idx2, s2.index.to_period("B"))
+
+            left, right = ax2.get_xlim()
+            pidx = s1.index.to_period()
+        assert left <= pidx[0].ordinal
+        assert right >= pidx[-1].ordinal
+
+    def test_mixed_freq_irregular_first(self):
+        s1 = Series(
+            np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
+        )
+        s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15]]
+        _, ax = mpl.pyplot.subplots()
+        s2.plot(style="g", ax=ax)
+        s1.plot(ax=ax)
+        assert not hasattr(ax, "freq")
+        lines = ax.get_lines()
+        x1 = lines[0].get_xdata()
+        tm.assert_numpy_array_equal(x1, s2.index.astype(object).values)
+        x2 = lines[1].get_xdata()
+        tm.assert_numpy_array_equal(x2, s1.index.astype(object).values)
+
+    def test_mixed_freq_regular_first_df(self):
+        # GH 9852
+        s1 = Series(
+            np.arange(20, dtype=np.float64),
+            index=date_range("2020-01-01", periods=20, freq="B"),
+        ).to_frame()
+        s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :]
+        _, ax = mpl.pyplot.subplots()
+        s1.plot(ax=ax)
+        ax2 = s2.plot(style="g", ax=ax)
+        lines = ax2.get_lines()
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            idx1 = PeriodIndex(lines[0].get_xdata())
+            idx2 = PeriodIndex(lines[1].get_xdata())
+            assert idx1.equals(s1.index.to_period("B"))
+            assert idx2.equals(s2.index.to_period("B"))
+            left, right = ax2.get_xlim()
+            pidx = s1.index.to_period()
+        assert left <= pidx[0].ordinal
+        assert right >= pidx[-1].ordinal
+
+    def test_mixed_freq_irregular_first_df(self):
+        # GH 9852
+        s1 = Series(
+            np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
+        ).to_frame()
+        s2 = s1.iloc[[0, 5, 10, 11, 12, 13, 14, 15], :]
+        _, ax = mpl.pyplot.subplots()
+        s2.plot(style="g", ax=ax)
+        s1.plot(ax=ax)
+        assert not hasattr(ax, "freq")
+        lines = ax.get_lines()
+        x1 = lines[0].get_xdata()
+        tm.assert_numpy_array_equal(x1, s2.index.astype(object).values)
+        x2 = lines[1].get_xdata()
+        tm.assert_numpy_array_equal(x2, s1.index.astype(object).values)
+
+    def test_mixed_freq_hf_first(self):
+        idxh = date_range("1/1/1999", periods=365, freq="D")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        high.plot(ax=ax)
+        low.plot(ax=ax)
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == "D"
+
+    def test_mixed_freq_alignment(self):
+        ts_ind = date_range("2012-01-01 13:00", "2012-01-02", freq="h")
+        ts_data = np.random.default_rng(2).standard_normal(12)
+
+        ts = Series(ts_data, index=ts_ind)
+        ts2 = ts.asfreq("min").interpolate()
+
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot(ax=ax)
+        ts2.plot(style="r", ax=ax)
+
+        assert ax.lines[0].get_xdata()[0] == ax.lines[1].get_xdata()[0]
+
+    def test_mixed_freq_lf_first(self):
+        idxh = date_range("1/1/1999", periods=365, freq="D")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        low.plot(legend=True, ax=ax)
+        high.plot(legend=True, ax=ax)
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == "D"
+        leg = ax.get_legend()
+        assert len(leg.texts) == 2
+        mpl.pyplot.close(ax.get_figure())
+
+    def test_mixed_freq_lf_first_hourly(self):
+        idxh = date_range("1/1/1999", periods=240, freq="min")
+        idxl = date_range("1/1/1999", periods=4, freq="h")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        low.plot(ax=ax)
+        high.plot(ax=ax)
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == "min"
+
+    @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
+    def test_mixed_freq_irreg_period(self):
+        ts = Series(
+            np.arange(30, dtype=np.float64), index=date_range("2020-01-01", periods=30)
+        )
+        irreg = ts.iloc[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]]
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            rng = period_range("1/3/2000", periods=30, freq="B")
+        ps = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+        _, ax = mpl.pyplot.subplots()
+        irreg.plot(ax=ax)
+        ps.plot(ax=ax)
+
+    def test_mixed_freq_shared_ax(self):
+        # GH13341, using sharex=True
+        idx1 = date_range("2015-01-01", periods=3, freq="ME")
+        idx2 = idx1[:1].union(idx1[2:])
+        s1 = Series(range(len(idx1)), idx1)
+        s2 = Series(range(len(idx2)), idx2)
+
+        _, (ax1, ax2) = mpl.pyplot.subplots(nrows=2, sharex=True)
+        s1.plot(ax=ax1)
+        s2.plot(ax=ax2)
+
+        assert ax1.freq == "M"
+        assert ax2.freq == "M"
+        assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0]
+
+    def test_mixed_freq_shared_ax_twin_x(self):
+        # GH13341, using sharex=True
+        idx1 = date_range("2015-01-01", periods=3, freq="ME")
+        idx2 = idx1[:1].union(idx1[2:])
+        s1 = Series(range(len(idx1)), idx1)
+        s2 = Series(range(len(idx2)), idx2)
+        # using twinx
+        _, ax1 = mpl.pyplot.subplots()
+        ax2 = ax1.twinx()
+        s1.plot(ax=ax1)
+        s2.plot(ax=ax2)
+
+        assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0]
+
+    @pytest.mark.xfail(reason="TODO (GH14330, GH14322)")
+    def test_mixed_freq_shared_ax_twin_x_irregular_first(self):
+        # GH13341, using sharex=True
+        idx1 = date_range("2015-01-01", periods=3, freq="ME")
+        idx2 = idx1[:1].union(idx1[2:])
+        s1 = Series(range(len(idx1)), idx1)
+        s2 = Series(range(len(idx2)), idx2)
+        _, ax1 = mpl.pyplot.subplots()
+        ax2 = ax1.twinx()
+        s2.plot(ax=ax1)
+        s1.plot(ax=ax2)
+        assert ax1.lines[0].get_xydata()[0, 0] == ax2.lines[0].get_xydata()[0, 0]
+
+    def test_nat_handling(self):
+        _, ax = mpl.pyplot.subplots()
+
+        dti = DatetimeIndex(["2015-01-01", NaT, "2015-01-03"])
+        s = Series(range(len(dti)), dti)
+        s.plot(ax=ax)
+        xdata = ax.get_lines()[0].get_xdata()
+        # plot x data is bounded by index values
+        assert s.index.min() <= Series(xdata).min()
+        assert Series(xdata).max() <= s.index.max()
+
+    def test_to_weekly_resampling_disallow_how_kwd(self):
+        idxh = date_range("1/1/1999", periods=52, freq="W")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        high.plot(ax=ax)
+
+        msg = (
+            "'how' is not a valid keyword for plotting functions. If plotting "
+            "multiple objects on shared axes, resample manually first."
+        )
+        with pytest.raises(ValueError, match=msg):
+            low.plot(ax=ax, how="foo")
+
+    def test_to_weekly_resampling(self):
+        idxh = date_range("1/1/1999", periods=52, freq="W")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        high.plot(ax=ax)
+        low.plot(ax=ax)
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+
+    def test_from_weekly_resampling(self):
+        idxh = date_range("1/1/1999", periods=52, freq="W")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        low.plot(ax=ax)
+        high.plot(ax=ax)
+
+        expected_h = idxh.to_period().asi8.astype(np.float64)
+        expected_l = np.array(
+            [1514, 1519, 1523, 1527, 1531, 1536, 1540, 1544, 1549, 1553, 1558, 1562],
+            dtype=np.float64,
+        )
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+            xdata = line.get_xdata(orig=False)
+            if len(xdata) == 12:  # idxl lines
+                tm.assert_numpy_array_equal(xdata, expected_l)
+            else:
+                tm.assert_numpy_array_equal(xdata, expected_h)
+
+    @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")])
+    def test_from_resampling_area_line_mixed(self, kind1, kind2):
+        idxh = date_range("1/1/1999", periods=52, freq="W")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = DataFrame(
+            np.random.default_rng(2).random((len(idxh), 3)),
+            index=idxh,
+            columns=[0, 1, 2],
+        )
+        low = DataFrame(
+            np.random.default_rng(2).random((len(idxl), 3)),
+            index=idxl,
+            columns=[0, 1, 2],
+        )
+
+        _, ax = mpl.pyplot.subplots()
+        low.plot(kind=kind1, stacked=True, ax=ax)
+        high.plot(kind=kind2, stacked=True, ax=ax)
+
+        # check low dataframe result
+        expected_x = np.array(
+            [
+                1514,
+                1519,
+                1523,
+                1527,
+                1531,
+                1536,
+                1540,
+                1544,
+                1549,
+                1553,
+                1558,
+                1562,
+            ],
+            dtype=np.float64,
+        )
+        expected_y = np.zeros(len(expected_x), dtype=np.float64)
+        for i in range(3):
+            line = ax.lines[i]
+            assert PeriodIndex(line.get_xdata()).freq == idxh.freq
+            tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x)
+            # check stacked values are correct
+            expected_y += low[i].values
+            tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y)
+
+        # check high dataframe result
+        expected_x = idxh.to_period().asi8.astype(np.float64)
+        expected_y = np.zeros(len(expected_x), dtype=np.float64)
+        for i in range(3):
+            line = ax.lines[3 + i]
+            assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+            tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x)
+            expected_y += high[i].values
+            tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y)
+
+    @pytest.mark.parametrize("kind1, kind2", [("line", "area"), ("area", "line")])
+    def test_from_resampling_area_line_mixed_high_to_low(self, kind1, kind2):
+        idxh = date_range("1/1/1999", periods=52, freq="W")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = DataFrame(
+            np.random.default_rng(2).random((len(idxh), 3)),
+            index=idxh,
+            columns=[0, 1, 2],
+        )
+        low = DataFrame(
+            np.random.default_rng(2).random((len(idxl), 3)),
+            index=idxl,
+            columns=[0, 1, 2],
+        )
+        _, ax = mpl.pyplot.subplots()
+        high.plot(kind=kind1, stacked=True, ax=ax)
+        low.plot(kind=kind2, stacked=True, ax=ax)
+
+        # check high dataframe result
+        expected_x = idxh.to_period().asi8.astype(np.float64)
+        expected_y = np.zeros(len(expected_x), dtype=np.float64)
+        for i in range(3):
+            line = ax.lines[i]
+            assert PeriodIndex(data=line.get_xdata()).freq == idxh.freq
+            tm.assert_numpy_array_equal(line.get_xdata(orig=False), expected_x)
+            expected_y += high[i].values
+            tm.assert_numpy_array_equal(line.get_ydata(orig=False), expected_y)
+
+        # check low dataframe result
+        expected_x = np.array(
+            [
+                1514,
+                1519,
+                1523,
+                1527,
+                1531,
+                1536,
+                1540,
+                1544,
+                1549,
+                1553,
+                1558,
+                1562,
+            ],
+            dtype=np.float64,
+        )
+        expected_y = np.zeros(len(expected_x), dtype=np.float64)
+        for i in range(3):
+            lines = ax.lines[3 + i]
+            assert PeriodIndex(data=lines.get_xdata()).freq == idxh.freq
+            tm.assert_numpy_array_equal(lines.get_xdata(orig=False), expected_x)
+            expected_y += low[i].values
+            tm.assert_numpy_array_equal(lines.get_ydata(orig=False), expected_y)
+
+    def test_mixed_freq_second_millisecond(self):
+        # GH 7772, GH 7760
+        idxh = date_range("2014-07-01 09:00", freq="s", periods=5)
+        idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50)
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        # high to low
+        _, ax = mpl.pyplot.subplots()
+        high.plot(ax=ax)
+        low.plot(ax=ax)
+        assert len(ax.get_lines()) == 2
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == "ms"
+
+    def test_mixed_freq_second_millisecond_low_to_high(self):
+        # GH 7772, GH 7760
+        idxh = date_range("2014-07-01 09:00", freq="s", periods=5)
+        idxl = date_range("2014-07-01 09:00", freq="100ms", periods=50)
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        # low to high
+        _, ax = mpl.pyplot.subplots()
+        low.plot(ax=ax)
+        high.plot(ax=ax)
+        assert len(ax.get_lines()) == 2
+        for line in ax.get_lines():
+            assert PeriodIndex(data=line.get_xdata()).freq == "ms"
+
+    def test_irreg_dtypes(self):
+        # date
+        idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)]
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((len(idx), 3)),
+            Index(idx, dtype=object),
+        )
+        _check_plot_works(df.plot)
+
+    def test_irreg_dtypes_dt64(self):
+        # np.datetime64
+        idx = date_range("1/1/2000", periods=10)
+        idx = idx[[0, 2, 5, 9]].astype(object)
+        df = DataFrame(np.random.default_rng(2).standard_normal((len(idx), 3)), idx)
+        _, ax = mpl.pyplot.subplots()
+        _check_plot_works(df.plot, ax=ax)
+
+    def test_time(self):
+        t = datetime(1, 1, 1, 3, 30, 0)
+        deltas = np.random.default_rng(2).integers(1, 20, 3).cumsum()
+        ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(len(ts)),
+                "b": np.random.default_rng(2).standard_normal(len(ts)),
+            },
+            index=ts,
+        )
+        _, ax = mpl.pyplot.subplots()
+        df.plot(ax=ax)
+
+        # verify tick labels
+        ticks = ax.get_xticks()
+        labels = ax.get_xticklabels()
+        for _tick, _label in zip(ticks, labels, strict=True):
+            m, s = divmod(int(_tick), 60)
+            h, m = divmod(m, 60)
+            rs = _label.get_text()
+            if len(rs) > 0:
+                if s != 0:
+                    xp = time(h, m, s).strftime("%H:%M:%S")
+                else:
+                    xp = time(h, m, s).strftime("%H:%M")
+                assert xp == rs
+
+    def test_time_change_xlim(self):
+        t = datetime(1, 1, 1, 3, 30, 0)
+        deltas = np.random.default_rng(2).integers(1, 20, 3).cumsum()
+        ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas])
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(len(ts)),
+                "b": np.random.default_rng(2).standard_normal(len(ts)),
+            },
+            index=ts,
+        )
+        _, ax = mpl.pyplot.subplots()
+        df.plot(ax=ax)
+
+        # verify tick labels
+        ticks = ax.get_xticks()
+        labels = ax.get_xticklabels()
+        for _tick, _label in zip(ticks, labels, strict=True):
+            m, s = divmod(int(_tick), 60)
+            h, m = divmod(m, 60)
+            rs = _label.get_text()
+            if len(rs) > 0:
+                if s != 0:
+                    xp = time(h, m, s).strftime("%H:%M:%S")
+                else:
+                    xp = time(h, m, s).strftime("%H:%M")
+                assert xp == rs
+
+        # change xlim
+        ax.set_xlim("1:30", "5:00")
+
+        # check tick labels again
+        ticks = ax.get_xticks()
+        labels = ax.get_xticklabels()
+        for _tick, _label in zip(ticks, labels, strict=True):
+            m, s = divmod(int(_tick), 60)
+            h, m = divmod(m, 60)
+            rs = _label.get_text()
+            if len(rs) > 0:
+                if s != 0:
+                    xp = time(h, m, s).strftime("%H:%M:%S")
+                else:
+                    xp = time(h, m, s).strftime("%H:%M")
+                assert xp == rs
+
+    def test_time_musec(self):
+        t = datetime(1, 1, 1, 3, 30, 0)
+        deltas = np.random.default_rng(2).integers(1, 20, 3).cumsum()
+        ts = np.array([(t + timedelta(microseconds=int(x))).time() for x in deltas])
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(len(ts)),
+                "b": np.random.default_rng(2).standard_normal(len(ts)),
+            },
+            index=ts,
+        )
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot(ax=ax)
+
+        # verify tick labels
+        ticks = ax.get_xticks()
+        labels = ax.get_xticklabels()
+        for _tick, _label in zip(ticks, labels, strict=True):
+            m, s = divmod(int(_tick), 60)
+
+            us = round((_tick - int(_tick)) * 1e6)
+
+            h, m = divmod(m, 60)
+            rs = _label.get_text()
+            if len(rs) > 0:
+                if (us % 1000) != 0:
+                    xp = time(h, m, s, us).strftime("%H:%M:%S.%f")
+                elif (us // 1000) != 0:
+                    xp = time(h, m, s, us).strftime("%H:%M:%S.%f")[:-3]
+                elif s != 0:
+                    xp = time(h, m, s, us).strftime("%H:%M:%S")
+                else:
+                    xp = time(h, m, s, us).strftime("%H:%M")
+                assert xp == rs
+
+    def test_secondary_upsample(self):
+        idxh = date_range("1/1/1999", periods=365, freq="D")
+        idxl = date_range("1/1/1999", periods=12, freq="ME")
+        high = Series(np.random.default_rng(2).standard_normal(len(idxh)), idxh)
+        low = Series(np.random.default_rng(2).standard_normal(len(idxl)), idxl)
+        _, ax = mpl.pyplot.subplots()
+        low.plot(ax=ax)
+        ax = high.plot(secondary_y=True, ax=ax)
+        for line in ax.get_lines():
+            assert PeriodIndex(line.get_xdata()).freq == "D"
+        assert hasattr(ax, "left_ax")
+        assert not hasattr(ax, "right_ax")
+        for line in ax.left_ax.get_lines():
+            assert PeriodIndex(line.get_xdata()).freq == "D"
+
+    def test_secondary_legend(self):
+        fig = mpl.pyplot.figure()
+        ax = fig.add_subplot(211)
+
+        # ts
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        df.plot(secondary_y=["A", "B"], ax=ax)
+        leg = ax.get_legend()
+        assert len(leg.get_lines()) == 4
+        assert leg.get_texts()[0].get_text() == "A (right)"
+        assert leg.get_texts()[1].get_text() == "B (right)"
+        assert leg.get_texts()[2].get_text() == "C"
+        assert leg.get_texts()[3].get_text() == "D"
+        assert ax.right_ax.get_legend() is None
+        colors = set()
+        for line in leg.get_lines():
+            colors.add(line.get_color())
+
+        # TODO: color cycle problems
+        assert len(colors) == 4
+
+    def test_secondary_legend_right(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        fig = mpl.pyplot.figure()
+        ax = fig.add_subplot(211)
+        df.plot(secondary_y=["A", "C"], mark_right=False, ax=ax)
+        leg = ax.get_legend()
+        assert len(leg.get_lines()) == 4
+        assert leg.get_texts()[0].get_text() == "A"
+        assert leg.get_texts()[1].get_text() == "B"
+        assert leg.get_texts()[2].get_text() == "C"
+        assert leg.get_texts()[3].get_text() == "D"
+
+    def test_secondary_legend_bar(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        fig, ax = mpl.pyplot.subplots()
+        df.plot(kind="bar", secondary_y=["A"], ax=ax)
+        leg = ax.get_legend()
+        assert leg.get_texts()[0].get_text() == "A (right)"
+        assert leg.get_texts()[1].get_text() == "B"
+
+    def test_secondary_legend_bar_right(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        fig, ax = mpl.pyplot.subplots()
+        df.plot(kind="bar", secondary_y=["A"], mark_right=False, ax=ax)
+        leg = ax.get_legend()
+        assert leg.get_texts()[0].get_text() == "A"
+        assert leg.get_texts()[1].get_text() == "B"
+
+    def test_secondary_legend_multi_col(self):
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        fig = mpl.pyplot.figure()
+        ax = fig.add_subplot(211)
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=date_range("2000-01-01", periods=10, freq="B"),
+        )
+        ax = df.plot(secondary_y=["C", "D"], ax=ax)
+        leg = ax.get_legend()
+        assert len(leg.get_lines()) == 4
+        assert ax.right_ax.get_legend() is None
+        colors = set()
+        for line in leg.get_lines():
+            colors.add(line.get_color())
+
+        # TODO: color cycle problems
+        assert len(colors) == 4
+
+    def test_secondary_legend_nonts(self):
+        # non-ts
+        df = DataFrame(
+            1.1 * np.arange(40).reshape((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(10)], dtype=object),
+        )
+        fig = mpl.pyplot.figure()
+        ax = fig.add_subplot(211)
+        ax = df.plot(secondary_y=["A", "B"], ax=ax)
+        leg = ax.get_legend()
+        assert len(leg.get_lines()) == 4
+        assert ax.right_ax.get_legend() is None
+        colors = set()
+        for line in leg.get_lines():
+            colors.add(line.get_color())
+
+        # TODO: color cycle problems
+        assert len(colors) == 4
+
+    def test_secondary_legend_nonts_multi_col(self):
+        # non-ts
+        df = DataFrame(
+            1.1 * np.arange(40).reshape((10, 4)),
+            columns=Index(list("ABCD"), dtype=object),
+            index=Index([f"i-{i}" for i in range(10)], dtype=object),
+        )
+        fig = mpl.pyplot.figure()
+        ax = fig.add_subplot(211)
+        ax = df.plot(secondary_y=["C", "D"], ax=ax)
+        leg = ax.get_legend()
+        assert len(leg.get_lines()) == 4
+        assert ax.right_ax.get_legend() is None
+        colors = set()
+        for line in leg.get_lines():
+            colors.add(line.get_color())
+
+        # TODO: color cycle problems
+        assert len(colors) == 4
+
+    @pytest.mark.xfail(reason="Api changed in 3.6.0")
+    def test_format_date_axis(self):
+        rng = date_range("1/1/2012", periods=12, freq="ME")
+        df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng)
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot(ax=ax)
+        xaxis = ax.get_xaxis()
+        for line in xaxis.get_ticklabels():
+            if len(line.get_text()) > 0:
+                assert line.get_rotation() == 30
+
+    def test_ax_plot(self):
+        x = date_range(start="2012-01-02", periods=10, freq="D")
+        y = list(range(len(x)))
+        _, ax = mpl.pyplot.subplots()
+        lines = ax.plot(x, y, label="Y")
+        tm.assert_index_equal(DatetimeIndex(lines[0].get_xdata()), x)
+
+    def test_mpl_nopandas(self):
+        dates = [date(2008, 12, 31), date(2009, 1, 31)]
+        values1 = np.arange(10.0, 11.0, 0.5)
+        values2 = np.arange(11.0, 12.0, 0.5)
+
+        _, ax = mpl.pyplot.subplots()
+        (
+            line1,
+            line2,
+        ) = ax.plot(
+            [x.toordinal() for x in dates],
+            values1,
+            "-",
+            [x.toordinal() for x in dates],
+            values2,
+            "-",
+            linewidth=4,
+        )
+
+        exp = np.array([x.toordinal() for x in dates], dtype=np.float64)
+        tm.assert_numpy_array_equal(line1.get_xydata()[:, 0], exp)
+        tm.assert_numpy_array_equal(line2.get_xydata()[:, 0], exp)
+
+    def test_irregular_ts_shared_ax_xlim(self):
+        # GH 2960
+        ts = Series(
+            np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
+        )
+        ts_irregular = ts.iloc[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]]
+
+        # plot the left section of the irregular series, then the right section
+        _, ax = mpl.pyplot.subplots()
+        ts_irregular[:5].plot(ax=ax)
+        ts_irregular[5:].plot(ax=ax)
+
+        # check that axis limits are correct
+        left, right = ax.get_xlim()
+        assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax)
+        assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax)
+
+    def test_secondary_y_non_ts_xlim(self):
+        # GH 3490 - non-timeseries with secondary y
+        index_1 = [1, 2, 3, 4]
+        index_2 = [5, 6, 7, 8]
+        s1 = Series(1, index=index_1)
+        s2 = Series(2, index=index_2)
+
+        _, ax = mpl.pyplot.subplots()
+        s1.plot(ax=ax)
+        left_before, right_before = ax.get_xlim()
+        s2.plot(secondary_y=True, ax=ax)
+        left_after, right_after = ax.get_xlim()
+
+        assert left_before >= left_after
+        assert right_before < right_after
+
+    def test_secondary_y_regular_ts_xlim(self):
+        # GH 3490 - regular-timeseries with secondary y
+        index_1 = date_range(start="2000-01-01", periods=4, freq="D")
+        index_2 = date_range(start="2000-01-05", periods=4, freq="D")
+        s1 = Series(1, index=index_1)
+        s2 = Series(2, index=index_2)
+
+        _, ax = mpl.pyplot.subplots()
+        s1.plot(ax=ax)
+        left_before, right_before = ax.get_xlim()
+        s2.plot(secondary_y=True, ax=ax)
+        left_after, right_after = ax.get_xlim()
+
+        assert left_before >= left_after
+        assert right_before < right_after
+
+    def test_secondary_y_mixed_freq_ts_xlim(self):
+        # GH 3490 - mixed frequency timeseries with secondary y
+        rng = date_range("2000-01-01", periods=10, freq="min")
+        ts = Series(1, index=rng)
+
+        _, ax = mpl.pyplot.subplots()
+        ts.plot(ax=ax)
+        left_before, right_before = ax.get_xlim()
+        ts.resample("D").mean().plot(secondary_y=True, ax=ax)
+        left_after, right_after = ax.get_xlim()
+
+        # a downsample should not have changed either limit
+        assert left_before == left_after
+        assert right_before == right_after
+
+    def test_secondary_y_irregular_ts_xlim(self):
+        # GH 3490 - irregular-timeseries with secondary y
+        ts = Series(
+            np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
+        )
+        ts_irregular = ts.iloc[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]]
+
+        _, ax = mpl.pyplot.subplots()
+        ts_irregular[:5].plot(ax=ax)
+        # plot higher-x values on secondary axis
+        ts_irregular[5:].plot(secondary_y=True, ax=ax)
+        # ensure secondary limits aren't overwritten by plot on primary
+        ts_irregular[:5].plot(ax=ax)
+
+        left, right = ax.get_xlim()
+        assert left <= conv.DatetimeConverter.convert(ts_irregular.index.min(), "", ax)
+        assert right >= conv.DatetimeConverter.convert(ts_irregular.index.max(), "", ax)
+
+    def test_plot_outofbounds_datetime(self):
+        # 2579 - checking this does not raise
+        values = [date(1677, 1, 1), date(1677, 1, 2)]
+        _, ax = mpl.pyplot.subplots()
+        ax.plot(values)
+
+        values = [datetime(1677, 1, 1, 12), datetime(1677, 1, 2, 12)]
+        ax.plot(values)
+
+    def test_format_timedelta_ticks_narrow(self):
+        expected_labels = [f"00:00:00.0000000{i:0>2d}" for i in np.arange(10)]
+
+        rng = timedelta_range("0", periods=10, freq="ns")
+        df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng)
+        _, ax = mpl.pyplot.subplots()
+        df.plot(fontsize=2, ax=ax)
+        mpl.pyplot.draw()
+        labels = ax.get_xticklabels()
+
+        result_labels = [x.get_text() for x in labels]
+        assert len(result_labels) == len(expected_labels)
+        assert result_labels == expected_labels
+
+    def test_format_timedelta_ticks_wide(self, unit):
+        expected_labels = [
+            "00:00:00",
+            "1 days 03:46:40",
+            "2 days 07:33:20",
+            "3 days 11:20:00",
+            "4 days 15:06:40",
+            "5 days 18:53:20",
+            "6 days 22:40:00",
+            "8 days 02:26:40",
+            "9 days 06:13:20",
+        ]
+
+        rng = timedelta_range("0", periods=10, freq="1 D", unit=unit)
+        df = DataFrame(np.random.default_rng(2).standard_normal((len(rng), 3)), rng)
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot(fontsize=2, ax=ax)
+        mpl.pyplot.draw()
+        labels = ax.get_xticklabels()
+
+        result_labels = [x.get_text() for x in labels]
+        assert len(result_labels) == len(expected_labels)
+        assert result_labels == expected_labels
+
+    def test_timedelta_plot(self):
+        # test issue #8711
+        s = Series(range(5), timedelta_range("1day", periods=5))
+        _, ax = mpl.pyplot.subplots()
+        _check_plot_works(s.plot, ax=ax)
+
+    def test_timedelta_long_period(self):
+        # test long period
+        index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 D")
+        s = Series(np.random.default_rng(2).standard_normal(len(index)), index)
+        _, ax = mpl.pyplot.subplots()
+        _check_plot_works(s.plot, ax=ax)
+
+    def test_timedelta_short_period(self):
+        # test short period
+        index = timedelta_range("1 day 2 hr 30 min 10 s", periods=10, freq="1 ns")
+        s = Series(np.random.default_rng(2).standard_normal(len(index)), index)
+        _, ax = mpl.pyplot.subplots()
+        _check_plot_works(s.plot, ax=ax)
+
+    def test_hist(self):
+        # https://github.com/matplotlib/matplotlib/issues/8459
+        rng = date_range("1/1/2011", periods=10, freq="h")
+        x = rng
+        w1 = np.arange(0, 1, 0.1)
+        w2 = np.arange(0, 1, 0.1)[::-1]
+        _, ax = mpl.pyplot.subplots()
+        ax.hist([x, x], weights=[w1, w2])
+
+    def test_overlapping_datetime(self):
+        # GB 6608
+        s1 = Series(
+            [1, 2, 3],
+            index=[
+                datetime(1995, 12, 31),
+                datetime(2000, 12, 31),
+                datetime(2005, 12, 31),
+            ],
+        )
+        s2 = Series(
+            [1, 2, 3],
+            index=[
+                datetime(1997, 12, 31),
+                datetime(2003, 12, 31),
+                datetime(2008, 12, 31),
+            ],
+        )
+
+        # plot first series, then add the second series to those axes,
+        # then try adding the first series again
+        _, ax = mpl.pyplot.subplots()
+        s1.plot(ax=ax)
+        s2.plot(ax=ax)
+        s1.plot(ax=ax)
+
+    @pytest.mark.xfail(reason="GH9053 matplotlib does not use ax.xaxis.converter")
+    def test_add_matplotlib_datetime64(self):
+        # GH9053 - ensure that a plot with PeriodConverter still understands
+        # datetime64 data. This still fails because matplotlib overrides the
+        # ax.xaxis.converter with a DatetimeConverter
+        s = Series(
+            np.random.default_rng(2).standard_normal(10),
+            index=date_range("1970-01-02", periods=10),
+        )
+        ax = s.plot()
+        with tm.assert_produces_warning(DeprecationWarning):
+            # multi-dimensional indexing
+            ax.plot(s.index, s.values, color="g")
+        l1, l2 = ax.lines
+        tm.assert_numpy_array_equal(l1.get_xydata(), l2.get_xydata())
+
+    def test_matplotlib_scatter_datetime64(self):
+        # https://github.com/matplotlib/matplotlib/issues/11391
+        df = DataFrame(np.random.default_rng(2).random((10, 2)), columns=["x", "y"])
+        df["time"] = date_range("2018-01-01", periods=10, freq="D")
+        _, ax = mpl.pyplot.subplots()
+        ax.scatter(x="time", y="y", data=df)
+        mpl.pyplot.draw()
+        label = ax.get_xticklabels()[0]
+        expected = "2018-01-01"
+        assert label.get_text() == expected
+
+    def test_check_xticks_rot(self):
+        # https://github.com/pandas-dev/pandas/issues/29460
+        # regular time series
+        x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-03"])
+        df = DataFrame({"x": x, "y": [1, 2, 3]})
+        axes = df.plot(x="x", y="y")
+        _check_ticks_props(axes, xrot=0)
+
+    def test_check_xticks_rot_irregular(self):
+        # irregular time series
+        x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"])
+        df = DataFrame({"x": x, "y": [1, 2, 3]})
+        axes = df.plot(x="x", y="y")
+        _check_ticks_props(axes, xrot=30)
+
+    def test_check_xticks_rot_use_idx(self):
+        # irregular time series
+        x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"])
+        df = DataFrame({"x": x, "y": [1, 2, 3]})
+        # use timeseries index or not
+        axes = df.set_index("x").plot(y="y", use_index=True)
+        _check_ticks_props(axes, xrot=30)
+        axes = df.set_index("x").plot(y="y", use_index=False)
+        _check_ticks_props(axes, xrot=0)
+
+    def test_check_xticks_rot_sharex(self):
+        # irregular time series
+        x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"])
+        df = DataFrame({"x": x, "y": [1, 2, 3]})
+        # separate subplots
+        axes = df.plot(x="x", y="y", subplots=True, sharex=True)
+        _check_ticks_props(axes, xrot=30)
+        axes = df.plot(x="x", y="y", subplots=True, sharex=False)
+        _check_ticks_props(axes, xrot=0)
+
+    @pytest.mark.parametrize(
+        "idx",
+        [
+            date_range("2020-01-01", periods=5),
+            date_range("2020-01-01", periods=5, tz="UTC"),
+            timedelta_range("1 day", periods=5, freq="D"),
+            period_range("2020-01-01", periods=5, freq="D"),
+            Index([date(2000, 1, i) for i in [1, 3, 6, 20, 22]], dtype=object),
+            range(5),
+        ],
+    )
+    def test_pickle_fig(self, temp_file, frame_or_series, idx):
+        # GH18439, GH#24088, statsmodels#4772
+        df = frame_or_series(range(5), index=idx)
+        fig, ax = plt.subplots(1, 1)
+        df.plot(ax=ax)
+        with temp_file.open(mode="wb") as path:
+            pickle.dump(fig, path)
+
+
+def _check_plot_works(f, freq=None, series=None, *args, **kwargs):
+    fig = plt.gcf()
+
+    fig.clf()
+    ax = fig.add_subplot(211)
+    orig_ax = kwargs.pop("ax", plt.gca())
+    orig_axfreq = getattr(orig_ax, "freq", None)
+
+    ret = f(*args, **kwargs)
+    assert ret is not None  # do something more intelligent
+
+    ax = kwargs.pop("ax", plt.gca())
+    if series is not None:
+        dfreq = series.index.freq
+        if isinstance(dfreq, BaseOffset):
+            dfreq = dfreq.rule_code
+        if orig_axfreq is None:
+            assert ax.freq == dfreq
+
+    if freq is not None and orig_axfreq is None:
+        assert to_offset(ax.freq, is_period=True) == freq
+
+    ax = fig.add_subplot(212)
+    kwargs["ax"] = ax
+    ret = f(*args, **kwargs)
+    assert ret is not None  # TODO: do something more intelligent
diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py
new file mode 100644
index 0000000000000000000000000000000000000000..e86c4e9838d2459f2ad87fcf97a4c46bad2e820d
--- /dev/null
+++ b/pandas/tests/plotting/test_groupby.py
@@ -0,0 +1,154 @@
+"""Test cases for GroupBy.plot"""
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+)
+from pandas.tests.plotting.common import (
+    _check_axes_shape,
+    _check_legend_labels,
+)
+
+pytest.importorskip("matplotlib")
+
+
+class TestDataFrameGroupByPlots:
+    def test_series_groupby_plotting_nominally_works(self):
+        n = 10
+        weight = Series(np.random.default_rng(2).normal(166, 20, size=n))
+        gender = np.random.default_rng(2).choice(["male", "female"], size=n)
+
+        weight.groupby(gender).plot()
+
+    def test_series_groupby_plotting_nominally_works_hist(self):
+        n = 10
+        height = Series(np.random.default_rng(2).normal(60, 10, size=n))
+        gender = np.random.default_rng(2).choice(["male", "female"], size=n)
+        height.groupby(gender).hist()
+
+    def test_series_groupby_plotting_nominally_works_alpha(self):
+        n = 10
+        height = Series(np.random.default_rng(2).normal(60, 10, size=n))
+        gender = np.random.default_rng(2).choice(["male", "female"], size=n)
+        # Regression test for GH8733
+        height.groupby(gender).plot(alpha=0.5)
+
+    def test_plotting_with_float_index_works(self):
+        # GH 7025
+        df = DataFrame(
+            {
+                "def": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+                "val": np.random.default_rng(2).standard_normal(9),
+            },
+            index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
+        )
+
+        df.groupby("def")["val"].plot()
+
+    def test_plotting_with_float_index_works_apply(self):
+        # GH 7025
+        df = DataFrame(
+            {
+                "def": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+                "val": np.random.default_rng(2).standard_normal(9),
+            },
+            index=[1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0],
+        )
+        df.groupby("def")["val"].apply(lambda x: x.plot())
+
+    def test_hist_single_row(self):
+        # GH10214
+        bins = np.arange(80, 100 + 2, 1)
+        df = DataFrame({"Name": ["AAA", "BBB"], "ByCol": [1, 2], "Mark": [85, 89]})
+        df["Mark"].hist(by=df["ByCol"], bins=bins)
+
+    def test_hist_single_row_single_bycol(self):
+        # GH10214
+        bins = np.arange(80, 100 + 2, 1)
+        df = DataFrame({"Name": ["AAA"], "ByCol": [1], "Mark": [85]})
+        df["Mark"].hist(by=df["ByCol"], bins=bins)
+
+    def test_plot_submethod_works(self):
+        df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")})
+        df.groupby("z").plot.scatter("x", "y")
+
+    def test_plot_submethod_works_line(self):
+        df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")})
+        df.groupby("z")["x"].plot.line()
+
+    def test_plot_kwargs(self):
+        df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")})
+
+        res = df.groupby("z").plot(kind="scatter", x="x", y="y")
+        # check that a scatter plot is effectively plotted: the axes should
+        # contain a PathCollection from the scatter plot (GH11805)
+        assert len(res["a"].collections) == 1
+
+    def test_plot_kwargs_scatter(self):
+        df = DataFrame({"x": [1, 2, 3, 4, 5], "y": [1, 2, 3, 2, 1], "z": list("ababa")})
+        res = df.groupby("z").plot.scatter(x="x", y="y")
+        assert len(res["a"].collections) == 1
+
+    @pytest.mark.parametrize("column, expected_axes_num", [(None, 2), ("b", 1)])
+    def test_groupby_hist_frame_with_legend(self, column, expected_axes_num):
+        # GH 6279 - DataFrameGroupBy histogram can have a legend
+        expected_layout = (1, expected_axes_num)
+        expected_labels = column or [["a"], ["b"]]
+
+        index = Index(15 * ["1"] + 15 * ["2"], name="c")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((30, 2)),
+            index=index,
+            columns=["a", "b"],
+        )
+        g = df.groupby("c")
+
+        for axes in g.hist(legend=True, column=column):
+            _check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
+            for ax, expected_label in zip(axes[0], expected_labels, strict=True):
+                _check_legend_labels(ax, expected_label)
+
+    @pytest.mark.parametrize("column", [None, "b"])
+    def test_groupby_hist_frame_with_legend_raises(self, column):
+        # GH 6279 - DataFrameGroupBy histogram with legend and label raises
+        index = Index(15 * ["1"] + 15 * ["2"], name="c")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((30, 2)),
+            index=index,
+            columns=["a", "b"],
+        )
+        g = df.groupby("c")
+
+        with pytest.raises(ValueError, match="Cannot use both legend and label"):
+            g.hist(legend=True, column=column, label="d")
+
+    def test_groupby_hist_series_with_legend(self):
+        # GH 6279 - SeriesGroupBy histogram can have a legend
+        index = Index(15 * ["1"] + 15 * ["2"], name="c")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((30, 2)),
+            index=index,
+            columns=["a", "b"],
+        )
+        g = df.groupby("c")
+
+        for ax in g["a"].hist(legend=True):
+            _check_axes_shape(ax, axes_num=1, layout=(1, 1))
+            _check_legend_labels(ax, ["1", "2"])
+
+    def test_groupby_hist_series_with_legend_raises(self):
+        # GH 6279 - SeriesGroupBy histogram with legend and label raises
+        index = Index(15 * ["1"] + 15 * ["2"], name="c")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((30, 2)),
+            index=index,
+            columns=["a", "b"],
+        )
+        g = df.groupby("c")
+
+        with pytest.raises(ValueError, match="Cannot use both legend and label"):
+            g.hist(legend=True, label="d")
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..e71d4ce5475a89dd686f831afcd1e4c4d726851d
--- /dev/null
+++ b/pandas/tests/plotting/test_hist_method.py
@@ -0,0 +1,957 @@
+"""Test cases for .hist method"""
+
+import re
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    date_range,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.tests.plotting.common import (
+    _check_ax_scales,
+    _check_axes_shape,
+    _check_colors,
+    _check_legend_labels,
+    _check_patches_all_filled,
+    _check_plot_works,
+    _check_text_labels,
+    _check_ticks_props,
+    get_x_axis,
+    get_y_axis,
+)
+
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")
+
+from pandas.plotting._matplotlib.hist import _grouped_hist
+
+
+@pytest.fixture
+def ts():
+    return Series(
+        np.arange(30, dtype=np.float64),
+        index=date_range("2020-01-01", periods=30, freq="B"),
+        name="ts",
+    )
+
+
+class TestSeriesPlots:
+    @pytest.mark.parametrize("kwargs", [{}, {"grid": False}, {"figsize": (8, 10)}])
+    def test_hist_legacy_kwargs(self, ts, kwargs):
+        _check_plot_works(ts.hist, **kwargs)
+
+    @pytest.mark.parametrize("kwargs", [{}, {"bins": 5}])
+    def test_hist_legacy_kwargs_warning(self, ts, kwargs):
+        # _check_plot_works adds an ax so catch warning. see GH #13188
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            _check_plot_works(ts.hist, by=ts.index.month, **kwargs)
+
+    def test_hist_legacy_ax(self, ts):
+        fig, ax = mpl.pyplot.subplots(1, 1)
+        _check_plot_works(ts.hist, ax=ax, default_axes=True)
+
+    def test_hist_legacy_ax_and_fig(self, ts):
+        fig, ax = mpl.pyplot.subplots(1, 1)
+        _check_plot_works(ts.hist, ax=ax, figure=fig, default_axes=True)
+
+    def test_hist_legacy_fig(self, ts):
+        fig, _ = mpl.pyplot.subplots(1, 1)
+        _check_plot_works(ts.hist, figure=fig, default_axes=True)
+
+    def test_hist_legacy_multi_ax(self, ts):
+        fig, (ax1, ax2) = mpl.pyplot.subplots(1, 2)
+        _check_plot_works(ts.hist, figure=fig, ax=ax1, default_axes=True)
+        _check_plot_works(ts.hist, figure=fig, ax=ax2, default_axes=True)
+
+    def test_hist_legacy_by_fig_error(self, ts):
+        fig, _ = mpl.pyplot.subplots(1, 1)
+        msg = (
+            "Cannot pass 'figure' when using the 'by' argument, since a new 'Figure' "
+            "instance will be created"
+        )
+        with pytest.raises(ValueError, match=msg):
+            ts.hist(by=ts.index, figure=fig)
+
+    def test_hist_bins_legacy(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        ax = df.hist(bins=2)[0][0]
+        assert len(ax.patches) == 2
+
+    def test_hist_layout(self, hist_df):
+        df = hist_df
+        msg = "The 'layout' keyword is not supported when 'by' is None"
+        with pytest.raises(ValueError, match=msg):
+            df.height.hist(layout=(1, 1))
+
+        with pytest.raises(ValueError, match=msg):
+            df.height.hist(layout=[1, 1])
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "by, layout, axes_num, res_layout",
+        [
+            ["gender", (2, 1), 2, (2, 1)],
+            ["gender", (3, -1), 2, (3, 1)],
+            ["category", (4, 1), 4, (4, 1)],
+            ["category", (2, -1), 4, (2, 2)],
+            ["category", (3, -1), 4, (3, 2)],
+            ["category", (-1, 4), 4, (1, 4)],
+            ["classroom", (2, 2), 3, (2, 2)],
+        ],
+    )
+    def test_hist_layout_with_by(self, hist_df, by, layout, axes_num, res_layout):
+        df = hist_df
+
+        # _check_plot_works adds an `ax` kwarg to the method call
+        # so we get a warning about an axis being cleared, even
+        # though we don't explicitly pass one, see GH #13188
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(df.height.hist, by=getattr(df, by), layout=layout)
+        _check_axes_shape(axes, axes_num=axes_num, layout=res_layout)
+
+    def test_hist_layout_with_by_shape(self, hist_df):
+        df = hist_df
+
+        axes = df.height.hist(by=df.category, layout=(4, 2), figsize=(12, 7))
+        _check_axes_shape(axes, axes_num=4, layout=(4, 2), figsize=(12, 7))
+
+    def test_hist_no_overlap(self):
+        x = Series(np.random.default_rng(2).standard_normal(2))
+        y = Series(np.random.default_rng(2).standard_normal(2))
+        plt.subplot(121)
+        x.hist()
+        plt.subplot(122)
+        y.hist()
+        fig = plt.gcf()
+        axes = fig.axes
+        assert len(axes) == 2
+
+    def test_hist_by_no_extra_plots(self, hist_df):
+        df = hist_df
+        df.height.hist(by=df.gender)
+        assert len(mpl.pyplot.get_fignums()) == 1
+
+    def test_plot_fails_when_ax_differs_from_figure(self, ts):
+        fig1 = plt.figure(1)
+        fig2 = plt.figure(2)
+        ax1 = fig1.add_subplot(111)
+        msg = "passed axis not bound to passed figure"
+        with pytest.raises(AssertionError, match=msg):
+            ts.hist(ax=ax1, figure=fig2)
+
+    @pytest.mark.parametrize(
+        "histtype, expected",
+        [
+            ("bar", True),
+            ("barstacked", True),
+            ("step", False),
+            ("stepfilled", True),
+        ],
+    )
+    def test_histtype_argument(self, histtype, expected):
+        # GH23992 Verify functioning of histtype argument
+        ser = Series(np.random.default_rng(2).integers(1, 10))
+        ax = ser.hist(histtype=histtype)
+        _check_patches_all_filled(ax, filled=expected)
+
+    @pytest.mark.parametrize(
+        "by, expected_axes_num, expected_layout", [(None, 1, (1, 1)), ("b", 2, (1, 2))]
+    )
+    def test_hist_with_legend(self, by, expected_axes_num, expected_layout):
+        # GH 6279 - Series histogram can have a legend
+        index = 5 * ["1"] + 5 * ["2"]
+        s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a")
+        s.index.name = "b"
+
+        # Use default_axes=True when plotting method generate subplots itself
+        axes = _check_plot_works(s.hist, default_axes=True, legend=True, by=by)
+        _check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
+        _check_legend_labels(axes, "a")
+
+    @pytest.mark.parametrize("by", [None, "b"])
+    def test_hist_with_legend_raises(self, by):
+        # GH 6279 - Series histogram with legend and label raises
+        index = 5 * ["1"] + 5 * ["2"]
+        s = Series(np.random.default_rng(2).standard_normal(10), index=index, name="a")
+        s.index.name = "b"
+
+        with pytest.raises(ValueError, match="Cannot use both legend and label"):
+            s.hist(legend=True, by=by, label="c")
+
+    def test_hist_kwargs(self, ts):
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.hist(bins=5, ax=ax)
+        assert len(ax.patches) == 5
+        _check_text_labels(ax.yaxis.get_label(), "Frequency")
+
+    def test_hist_kwargs_horizontal(self, ts):
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.hist(bins=5, ax=ax)
+        ax = ts.plot.hist(orientation="horizontal", ax=ax)
+        _check_text_labels(ax.xaxis.get_label(), "Frequency")
+
+    def test_hist_kwargs_align(self, ts):
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.hist(bins=5, ax=ax)
+        ax = ts.plot.hist(align="left", stacked=True, ax=ax)
+
+    @pytest.mark.xfail(reason="Api changed in 3.6.0")
+    def test_hist_kde(self, ts):
+        pytest.importorskip("scipy")
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.hist(logy=True, ax=ax)
+        _check_ax_scales(ax, yaxis="log")
+        xlabels = ax.get_xticklabels()
+        # ticks are values, thus ticklabels are blank
+        _check_text_labels(xlabels, [""] * len(xlabels))
+        ylabels = ax.get_yticklabels()
+        _check_text_labels(ylabels, [""] * len(ylabels))
+
+    def test_hist_kde_plot_works(self, ts):
+        pytest.importorskip("scipy")
+        _check_plot_works(ts.plot.kde)
+
+    def test_hist_kde_density_works(self, ts):
+        pytest.importorskip("scipy")
+        _check_plot_works(ts.plot.density)
+
+    @pytest.mark.xfail(reason="Api changed in 3.6.0")
+    def test_hist_kde_logy(self, ts):
+        pytest.importorskip("scipy")
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.kde(logy=True, ax=ax)
+        _check_ax_scales(ax, yaxis="log")
+        xlabels = ax.get_xticklabels()
+        _check_text_labels(xlabels, [""] * len(xlabels))
+        ylabels = ax.get_yticklabels()
+        _check_text_labels(ylabels, [""] * len(ylabels))
+
+    def test_hist_kde_color_bins(self, ts):
+        pytest.importorskip("scipy")
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.hist(logy=True, bins=10, color="b", ax=ax)
+        _check_ax_scales(ax, yaxis="log")
+        assert len(ax.patches) == 10
+        _check_colors(ax.patches, facecolors=["b"] * 10)
+
+    def test_hist_kde_color(self, ts):
+        pytest.importorskip("scipy")
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.kde(logy=True, color="r", ax=ax)
+        _check_ax_scales(ax, yaxis="log")
+        lines = ax.get_lines()
+        assert len(lines) == 1
+        _check_colors(lines, ["r"])
+
+
+class TestDataFramePlots:
+    @pytest.mark.slow
+    def test_hist_df_legacy(self, hist_df):
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            _check_plot_works(hist_df.hist)
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_layout(self):
+        # make sure layout is handled
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        df[2] = to_datetime(
+            np.random.default_rng(2).integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(df.hist, grid=False)
+        _check_axes_shape(axes, axes_num=3, layout=(2, 2))
+        assert not axes[1, 1].get_visible()
+
+        _check_plot_works(df[[2]].hist)
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_layout2(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 1)))
+        _check_plot_works(df.hist)
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_layout3(self):
+        # make sure layout is handled
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
+        df[5] = to_datetime(
+            np.random.default_rng(2).integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(df.hist, layout=(4, 2))
+        _check_axes_shape(axes, axes_num=6, layout=(4, 2))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "kwargs", [{"sharex": True, "sharey": True}, {"figsize": (8, 10)}, {"bins": 5}]
+    )
+    def test_hist_df_legacy_layout_kwargs(self, kwargs):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 5)))
+        df[5] = to_datetime(
+            np.random.default_rng(2).integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        # make sure sharex, sharey is handled
+        # handle figsize arg
+        # check bins argument
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            _check_plot_works(df.hist, **kwargs)
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_layout_labelsize_rot(self, frame_or_series):
+        # make sure xlabelsize and xrot are handled
+        obj = frame_or_series(range(10))
+        xf, yf = 20, 18
+        xrot, yrot = 30, 40
+        axes = obj.hist(xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
+        _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_rectangles(self):
+        ser = Series(range(10))
+        ax = ser.hist(cumulative=True, bins=4, density=True)
+        # height of last bin (index 5) must be 1.0
+        rects = [x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)]
+        tm.assert_almost_equal(rects[-1].get_height(), 1.0)
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_scale(self):
+        ser = Series(range(10))
+        ax = ser.hist(log=True)
+        # scale of y must be 'log'
+        _check_ax_scales(ax, yaxis="log")
+
+    @pytest.mark.slow
+    def test_hist_df_legacy_external_error(self):
+        ser = Series(range(10))
+        # propagate attr exception from matplotlib.Axes.hist
+        with tm.external_error_raised(AttributeError):
+            ser.hist(foo="bar")
+
+    def test_hist_non_numerical_or_datetime_raises(self):
+        # gh-10444, GH32590
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).random(10),
+                "b": np.random.default_rng(2).integers(0, 10, 10),
+                "c": to_datetime(
+                    np.random.default_rng(2).integers(
+                        1582800000000000000, 1583500000000000000, 10, dtype=np.int64
+                    )
+                ),
+                "d": to_datetime(
+                    np.random.default_rng(2).integers(
+                        1582800000000000000, 1583500000000000000, 10, dtype=np.int64
+                    ),
+                    utc=True,
+                ),
+            }
+        )
+        df_o = df.astype(object)
+
+        msg = "hist method requires numerical or datetime columns, nothing to plot."
+        with pytest.raises(ValueError, match=msg):
+            df_o.hist()
+
+    @pytest.mark.parametrize(
+        "layout_test",
+        (
+            {"layout": None, "expected_size": (2, 2)},  # default is 2x2
+            {"layout": (2, 2), "expected_size": (2, 2)},
+            {"layout": (4, 1), "expected_size": (4, 1)},
+            {"layout": (1, 4), "expected_size": (1, 4)},
+            {"layout": (3, 3), "expected_size": (3, 3)},
+            {"layout": (-1, 4), "expected_size": (1, 4)},
+            {"layout": (4, -1), "expected_size": (4, 1)},
+            {"layout": (-1, 2), "expected_size": (2, 2)},
+            {"layout": (2, -1), "expected_size": (2, 2)},
+        ),
+    )
+    def test_hist_layout(self, layout_test):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        df[2] = to_datetime(
+            np.random.default_rng(2).integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        axes = df.hist(layout=layout_test["layout"])
+        expected = layout_test["expected_size"]
+        _check_axes_shape(axes, axes_num=3, layout=expected)
+
+    def test_hist_layout_error(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        df[2] = to_datetime(
+            np.random.default_rng(2).integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        # layout too small for all 4 plots
+        msg = "Layout of 1x1 must be larger than required size 3"
+        with pytest.raises(ValueError, match=msg):
+            df.hist(layout=(1, 1))
+
+        # invalid format for layout
+        msg = re.escape("Layout must be a tuple of (rows, columns)")
+        with pytest.raises(ValueError, match=msg):
+            df.hist(layout=(1,))
+        msg = "At least one dimension of layout must be positive"
+        with pytest.raises(ValueError, match=msg):
+            df.hist(layout=(-1, -1))
+
+    # GH 9351
+    def test_tight_layout(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        df[2] = to_datetime(
+            np.random.default_rng(2).integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        # Use default_axes=True when plotting method generate subplots itself
+        _check_plot_works(df.hist, default_axes=True)
+        mpl.pyplot.tight_layout()
+
+    def test_hist_subplot_xrot(self):
+        # GH 30288
+        df = DataFrame(
+            {
+                "length": [1.5, 0.5, 1.2, 0.9, 3],
+                "animal": ["pig", "rabbit", "pig", "pig", "rabbit"],
+            }
+        )
+        # Use default_axes=True when plotting method generate subplots itself
+        axes = _check_plot_works(
+            df.hist,
+            default_axes=True,
+            column="length",
+            by="animal",
+            bins=5,
+            xrot=0,
+        )
+        _check_ticks_props(axes, xrot=0)
+
+    @pytest.mark.parametrize(
+        "column, expected",
+        [
+            (None, ["width", "length", "height"]),
+            (["length", "width", "height"], ["length", "width", "height"]),
+        ],
+    )
+    def test_hist_column_order_unchanged(self, column, expected):
+        # GH29235
+
+        df = DataFrame(
+            {
+                "width": [0.7, 0.2, 0.15, 0.2, 1.1],
+                "length": [1.5, 0.5, 1.2, 0.9, 3],
+                "height": [3, 0.5, 3.4, 2, 1],
+            },
+            index=["pig", "rabbit", "duck", "chicken", "horse"],
+        )
+
+        # Use default_axes=True when plotting method generate subplots itself
+        axes = _check_plot_works(
+            df.hist,
+            default_axes=True,
+            column=column,
+            layout=(1, 3),
+        )
+        result = [axes[0, i].get_title() for i in range(3)]
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "histtype, expected",
+        [
+            ("bar", True),
+            ("barstacked", True),
+            ("step", False),
+            ("stepfilled", True),
+        ],
+    )
+    def test_histtype_argument(self, histtype, expected):
+        # GH23992 Verify functioning of histtype argument
+        df = DataFrame(
+            np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"]
+        )
+        ax = df.hist(histtype=histtype)
+        _check_patches_all_filled(ax, filled=expected)
+
+    @pytest.mark.parametrize("by", [None, "c"])
+    @pytest.mark.parametrize("column", [None, "b"])
+    def test_hist_with_legend(self, by, column):
+        # GH 6279 - DataFrame histogram can have a legend
+        expected_axes_num = 1 if by is None and column is not None else 2
+        expected_layout = (1, expected_axes_num)
+        expected_labels = column or ["a", "b"]
+        if by is not None:
+            expected_labels = [expected_labels] * 2
+
+        index = Index(5 * ["1"] + 5 * ["2"], name="c")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 2)),
+            index=index,
+            columns=["a", "b"],
+        )
+
+        # Use default_axes=True when plotting method generate subplots itself
+        axes = _check_plot_works(
+            df.hist,
+            default_axes=True,
+            legend=True,
+            by=by,
+            column=column,
+        )
+
+        _check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout)
+        if by is None and column is None:
+            axes = axes[0]
+        for expected_label, ax in zip(expected_labels, axes, strict=True):
+            _check_legend_labels(ax, expected_label)
+
+    @pytest.mark.parametrize("by", [None, "c"])
+    @pytest.mark.parametrize("column", [None, "b"])
+    def test_hist_with_legend_raises(self, by, column):
+        # GH 6279 - DataFrame histogram with legend and label raises
+        index = Index(5 * ["1"] + 5 * ["2"], name="c")
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 2)),
+            index=index,
+            columns=["a", "b"],
+        )
+
+        with pytest.raises(ValueError, match="Cannot use both legend and label"):
+            df.hist(legend=True, by=by, column=column, label="d")
+
+    def test_hist_df_kwargs(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 2)))
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot.hist(bins=5, ax=ax)
+        assert len(ax.patches) == 10
+
+    def test_hist_df_with_nonnumerics(self):
+        # GH 9853
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=["A", "B", "C", "D"],
+        )
+        df["E"] = ["x", "y"] * 5
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot.hist(bins=5, ax=ax)
+        assert len(ax.patches) == 20
+
+    def test_hist_df_with_nonnumerics_no_bins(self):
+        # GH 9853
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)),
+            columns=["A", "B", "C", "D"],
+        )
+        df["E"] = ["x", "y"] * 5
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot.hist(ax=ax)  # bins=10
+        assert len(ax.patches) == 40
+
+    def test_hist_secondary_legend(self):
+        # GH 9610
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
+        )
+
+        # primary -> secondary
+        _, ax = mpl.pyplot.subplots()
+        ax = df["a"].plot.hist(legend=True, ax=ax)
+        df["b"].plot.hist(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left and right axis must be visible
+        _check_legend_labels(ax, labels=["a", "b (right)"])
+        assert ax.get_yaxis().get_visible()
+        assert ax.right_ax.get_yaxis().get_visible()
+
+    def test_hist_secondary_secondary(self):
+        # GH 9610
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
+        )
+        # secondary -> secondary
+        _, ax = mpl.pyplot.subplots()
+        ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax)
+        df["b"].plot.hist(ax=ax, legend=True, secondary_y=True)
+        # both legends are draw on left ax
+        # left axis must be invisible, right axis must be visible
+        _check_legend_labels(ax.left_ax, labels=["a (right)", "b (right)"])
+        assert not ax.left_ax.get_yaxis().get_visible()
+        assert ax.get_yaxis().get_visible()
+
+    def test_hist_secondary_primary(self):
+        # GH 9610
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 4)), columns=list("abcd")
+        )
+        # secondary -> primary
+        _, ax = mpl.pyplot.subplots()
+        ax = df["a"].plot.hist(legend=True, secondary_y=True, ax=ax)
+        # right axes is returned
+        df["b"].plot.hist(ax=ax, legend=True)
+        # both legends are draw on left ax
+        # left and right axis must be visible
+        _check_legend_labels(ax.left_ax, labels=["a (right)", "b"])
+        assert ax.left_ax.get_yaxis().get_visible()
+        assert ax.get_yaxis().get_visible()
+
+    def test_hist_with_nans_and_weights(self):
+        # GH 48884
+        df = DataFrame(
+            [[np.nan, 0.2, 0.3], [0.4, np.nan, np.nan], [0.7, 0.8, 0.9]],
+            columns=list("abc"),
+        )
+        weights = np.array([0.25, 0.3, 0.45])
+        no_nan_df = DataFrame([[0.4, 0.2, 0.3], [0.7, 0.8, 0.9]], columns=list("abc"))
+        no_nan_weights = np.array([[0.3, 0.25, 0.25], [0.45, 0.45, 0.45]])
+
+        _, ax0 = mpl.pyplot.subplots()
+        df.plot.hist(ax=ax0, weights=weights)
+        rects = [x for x in ax0.get_children() if isinstance(x, mpl.patches.Rectangle)]
+        heights = [rect.get_height() for rect in rects]
+        _, ax1 = mpl.pyplot.subplots()
+        no_nan_df.plot.hist(ax=ax1, weights=no_nan_weights)
+        no_nan_rects = [
+            x for x in ax1.get_children() if isinstance(x, mpl.patches.Rectangle)
+        ]
+        no_nan_heights = [rect.get_height() for rect in no_nan_rects]
+        assert all(h0 == h1 for h0, h1 in zip(heights, no_nan_heights, strict=True))
+
+        idxerror_weights = np.array([[0.3, 0.25], [0.45, 0.45]])
+
+        msg = "weights must have the same shape as data, or be a single column"
+        _, ax2 = mpl.pyplot.subplots()
+        with pytest.raises(ValueError, match=msg):
+            no_nan_df.plot.hist(ax=ax2, weights=idxerror_weights)
+
+
+class TestDataFrameGroupByPlots:
+    def test_grouped_hist_legacy(self):
+        rs = np.random.default_rng(10)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        df["D"] = ["X"] * 10
+
+        axes = _grouped_hist(df.A, by=df.C)
+        _check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+    def test_grouped_hist_legacy_axes_shape_no_col(self):
+        rs = np.random.default_rng(10)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        df["D"] = ["X"] * 10
+        axes = df.hist(by=df.C)
+        _check_axes_shape(axes, axes_num=4, layout=(2, 2))
+
+    def test_grouped_hist_legacy_single_key(self):
+        rs = np.random.default_rng(2)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        df["D"] = ["X"] * 10
+        # group by a key with single value
+        axes = df.hist(by="D", rot=30)
+        _check_axes_shape(axes, axes_num=1, layout=(1, 1))
+        _check_ticks_props(axes, xrot=30)
+
+    def test_grouped_hist_legacy_grouped_hist_kwargs(self):
+        rs = np.random.default_rng(2)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        # make sure kwargs to hist are handled
+        xf, yf = 20, 18
+        xrot, yrot = 30, 40
+
+        axes = _grouped_hist(
+            df.A,
+            by=df.C,
+            cumulative=True,
+            bins=4,
+            xlabelsize=xf,
+            xrot=xrot,
+            ylabelsize=yf,
+            yrot=yrot,
+            density=True,
+        )
+        # height of last bin (index 5) must be 1.0
+        for ax in axes.ravel():
+            rects = [
+                x for x in ax.get_children() if isinstance(x, mpl.patches.Rectangle)
+            ]
+            height = rects[-1].get_height()
+            tm.assert_almost_equal(height, 1.0)
+        _check_ticks_props(axes, xlabelsize=xf, xrot=xrot, ylabelsize=yf, yrot=yrot)
+
+    def test_grouped_hist_legacy_grouped_hist(self):
+        rs = np.random.default_rng(2)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        df["D"] = ["X"] * 10
+        axes = _grouped_hist(df.A, by=df.C, log=True)
+        # scale of y must be 'log'
+        _check_ax_scales(axes, yaxis="log")
+
+    def test_grouped_hist_legacy_external_err(self):
+        rs = np.random.default_rng(2)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        df["D"] = ["X"] * 10
+        # propagate attr exception from matplotlib.Axes.hist
+        with tm.external_error_raised(AttributeError):
+            _grouped_hist(df.A, by=df.C, foo="bar")
+
+    def test_grouped_hist_legacy_figsize_err(self):
+        rs = np.random.default_rng(2)
+        df = DataFrame(rs.standard_normal((10, 1)), columns=["A"])
+        df["B"] = to_datetime(
+            rs.integers(
+                812419200000000000,
+                819331200000000000,
+                size=10,
+                dtype=np.int64,
+            )
+        )
+        df["C"] = rs.integers(0, 4, 10)
+        df["D"] = ["X"] * 10
+        msg = "Specify figure size by tuple instead"
+        with pytest.raises(ValueError, match=msg):
+            df.hist(by="C", figsize="default")
+
+    def test_grouped_hist_legacy2(self):
+        n = 10
+        weight = Series(np.random.default_rng(2).normal(166, 20, size=n))
+        height = Series(np.random.default_rng(2).normal(60, 10, size=n))
+        gender_int = np.random.default_rng(2).choice([0, 1], size=n)
+        df_int = DataFrame({"height": height, "weight": weight, "gender": gender_int})
+        gb = df_int.groupby("gender")
+        axes = gb.hist()
+        assert len(axes) == 2
+        assert len(mpl.pyplot.get_fignums()) == 2
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "msg, plot_col, by_col, layout",
+        [
+            [
+                "Layout of 1x1 must be larger than required size 2",
+                "weight",
+                "gender",
+                (1, 1),
+            ],
+            [
+                "Layout of 1x3 must be larger than required size 4",
+                "height",
+                "category",
+                (1, 3),
+            ],
+            [
+                "At least one dimension of layout must be positive",
+                "height",
+                "category",
+                (-1, -1),
+            ],
+        ],
+    )
+    def test_grouped_hist_layout_error(self, hist_df, msg, plot_col, by_col, layout):
+        df = hist_df
+        with pytest.raises(ValueError, match=msg):
+            df.hist(column=plot_col, by=getattr(df, by_col), layout=layout)
+
+    @pytest.mark.slow
+    def test_grouped_hist_layout_warning(self, hist_df):
+        df = hist_df
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(
+                df.hist, column="height", by=df.gender, layout=(2, 1)
+            )
+        _check_axes_shape(axes, axes_num=2, layout=(2, 1))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "layout, check_layout, figsize",
+        [[(4, 1), (4, 1), None], [(-1, 1), (4, 1), None], [(4, 2), (4, 2), (12, 8)]],
+    )
+    def test_grouped_hist_layout_figsize(self, hist_df, layout, check_layout, figsize):
+        df = hist_df
+        axes = df.hist(column="height", by=df.category, layout=layout, figsize=figsize)
+        _check_axes_shape(axes, axes_num=4, layout=check_layout, figsize=figsize)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("kwargs", [{}, {"column": "height", "layout": (2, 2)}])
+    def test_grouped_hist_layout_by_warning(self, hist_df, kwargs):
+        df = hist_df
+        # GH 6769
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(df.hist, by="classroom", **kwargs)
+        _check_axes_shape(axes, axes_num=3, layout=(2, 2))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "kwargs, axes_num, layout",
+        [
+            [{"by": "gender", "layout": (3, 5)}, 2, (3, 5)],
+            [{"column": ["height", "weight", "category"]}, 3, (2, 2)],
+        ],
+    )
+    def test_grouped_hist_layout_axes(self, hist_df, kwargs, axes_num, layout):
+        df = hist_df
+        axes = df.hist(**kwargs)
+        _check_axes_shape(axes, axes_num=axes_num, layout=layout)
+
+    def test_grouped_hist_multiple_axes(self, hist_df):
+        # GH 6970, GH 7069
+        df = hist_df
+
+        fig, axes = mpl.pyplot.subplots(2, 3)
+        returned = df.hist(column=["height", "weight", "category"], ax=axes[0])
+        _check_axes_shape(returned, axes_num=3, layout=(1, 3))
+        tm.assert_numpy_array_equal(returned, axes[0])
+        assert returned[0].figure is fig
+
+    def test_grouped_hist_multiple_axes_no_cols(self, hist_df):
+        # GH 6970, GH 7069
+        df = hist_df
+
+        fig, axes = mpl.pyplot.subplots(2, 3)
+        returned = df.hist(by="classroom", ax=axes[1])
+        _check_axes_shape(returned, axes_num=3, layout=(1, 3))
+        tm.assert_numpy_array_equal(returned, axes[1])
+        assert returned[0].figure is fig
+
+    def test_grouped_hist_multiple_axes_error(self, hist_df):
+        # GH 6970, GH 7069
+        df = hist_df
+        fig, axes = mpl.pyplot.subplots(2, 3)
+        # pass different number of axes from required
+        msg = "The number of passed axes must be 1, the same as the output plot"
+        with pytest.raises(ValueError, match=msg):
+            axes = df.hist(column="height", ax=axes)
+
+    def test_axis_share_x(self, hist_df):
+        df = hist_df
+        # GH4089
+        ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True)
+
+        # share x
+        assert get_x_axis(ax1).joined(ax1, ax2)
+        assert get_x_axis(ax2).joined(ax1, ax2)
+
+        # don't share y
+        assert not get_y_axis(ax1).joined(ax1, ax2)
+        assert not get_y_axis(ax2).joined(ax1, ax2)
+
+    def test_axis_share_y(self, hist_df):
+        df = hist_df
+        ax1, ax2 = df.hist(column="height", by=df.gender, sharey=True)
+
+        # share y
+        assert get_y_axis(ax1).joined(ax1, ax2)
+        assert get_y_axis(ax2).joined(ax1, ax2)
+
+        # don't share x
+        assert not get_x_axis(ax1).joined(ax1, ax2)
+        assert not get_x_axis(ax2).joined(ax1, ax2)
+
+    def test_axis_share_xy(self, hist_df):
+        df = hist_df
+        ax1, ax2 = df.hist(column="height", by=df.gender, sharex=True, sharey=True)
+
+        # share both x and y
+        assert get_x_axis(ax1).joined(ax1, ax2)
+        assert get_x_axis(ax2).joined(ax1, ax2)
+
+        assert get_y_axis(ax1).joined(ax1, ax2)
+        assert get_y_axis(ax2).joined(ax1, ax2)
+
+    @pytest.mark.parametrize(
+        "histtype, expected",
+        [
+            ("bar", True),
+            ("barstacked", True),
+            ("step", False),
+            ("stepfilled", True),
+        ],
+    )
+    def test_histtype_argument(self, histtype, expected):
+        # GH23992 Verify functioning of histtype argument
+        df = DataFrame(
+            np.random.default_rng(2).integers(1, 10, size=(10, 2)), columns=["a", "b"]
+        )
+        ax = df.hist(by="a", histtype=histtype)
+        _check_patches_all_filled(ax, filled=expected)
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b6d2499787c02347b0f152fe0d4e0231568e544
--- /dev/null
+++ b/pandas/tests/plotting/test_misc.py
@@ -0,0 +1,866 @@
+"""Test cases for misc plot functions"""
+
+import os
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    Timestamp,
+    date_range,
+    interval_range,
+    period_range,
+    plotting,
+    read_csv,
+)
+import pandas._testing as tm
+from pandas.tests.plotting.common import (
+    _check_colors,
+    _check_legend_labels,
+    _check_plot_works,
+    _check_text_labels,
+    _check_ticks_props,
+)
+
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")
+cm = pytest.importorskip("matplotlib.cm")
+
+import re
+
+from pandas.plotting._matplotlib.style import get_standard_colors
+
+
+@pytest.fixture
+def iris(datapath) -> DataFrame:
+    """
+    The iris dataset as a DataFrame.
+    """
+    return read_csv(datapath("io", "data", "csv", "iris.csv"))
+
+
+@td.skip_if_installed("matplotlib")
+def test_import_error_message():
+    # GH-19810
+    df = DataFrame({"A": [1, 2]})
+
+    with pytest.raises(ImportError, match="matplotlib is required for plotting"):
+        df.plot()
+
+
+def test_get_accessor_args():
+    func = plotting._core.PlotAccessor._get_call_args
+
+    msg = "Called plot accessor for type list, expected Series or DataFrame"
+    with pytest.raises(TypeError, match=msg):
+        func(backend_name="", data=[], args=[], kwargs={})
+
+    msg = "should not be called with positional arguments"
+    with pytest.raises(TypeError, match=msg):
+        func(backend_name="", data=Series(dtype=object), args=["line", None], kwargs={})
+
+    x, y, kind, kwargs = func(
+        backend_name="",
+        data=DataFrame(),
+        args=["x"],
+        kwargs={"y": "y", "kind": "bar", "grid": False},
+    )
+    assert x == "x"
+    assert y == "y"
+    assert kind == "bar"
+    assert kwargs == {"grid": False}
+
+    x, y, kind, kwargs = func(
+        backend_name="pandas.plotting._matplotlib",
+        data=Series(dtype=object),
+        args=[],
+        kwargs={},
+    )
+    assert x is None
+    assert y is None
+    assert kind == "line"
+    assert len(kwargs) == 24
+
+
+@pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds)
+@pytest.mark.parametrize(
+    "data", [DataFrame(np.arange(15).reshape(5, 3)), Series(range(5))]
+)
+@pytest.mark.parametrize(
+    "index",
+    [
+        Index(range(5)),
+        date_range("2020-01-01", periods=5),
+        period_range("2020-01-01", periods=5),
+    ],
+)
+def test_savefig(kind, data, index):
+    fig, ax = plt.subplots()
+    data.index = index
+    kwargs = {}
+    if kind in ["hexbin", "scatter", "pie"]:
+        if isinstance(data, Series):
+            pytest.skip(f"{kind} not supported with Series")
+        kwargs = {"x": 0, "y": 1}
+    data.plot(kind=kind, ax=ax, **kwargs)
+    fig.savefig(os.devnull)
+
+
+class TestSeriesPlots:
+    def test_autocorrelation_plot(self):
+        ser = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        # Ensure no UserWarning when making plot
+        with tm.assert_produces_warning(None):
+            _check_plot_works(plotting.autocorrelation_plot, series=ser)
+            _check_plot_works(plotting.autocorrelation_plot, series=ser.values)
+
+            ax = plotting.autocorrelation_plot(ser, label="Test")
+        _check_legend_labels(ax, labels=["Test"])
+
+    @pytest.mark.parametrize("kwargs", [{}, {"lag": 5}])
+    def test_lag_plot(self, kwargs):
+        ser = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        _check_plot_works(plotting.lag_plot, series=ser, **kwargs)
+
+    def test_bootstrap_plot(self):
+        ser = Series(
+            np.arange(10, dtype=np.float64),
+            index=date_range("2020-01-01", periods=10),
+            name="ts",
+        )
+        _check_plot_works(plotting.bootstrap_plot, series=ser, size=10)
+
+
+class TestDataFramePlots:
+    @pytest.mark.parametrize("pass_axis", [False, True])
+    def test_scatter_matrix_axis(self, pass_axis):
+        pytest.importorskip("scipy")
+        scatter_matrix = plotting.scatter_matrix
+
+        ax = None
+        if pass_axis:
+            _, ax = mpl.pyplot.subplots(3, 3)
+
+        df = DataFrame(np.random.default_rng(2).standard_normal((10, 3)))
+
+        # we are plotting multiples on a sub-plot
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(
+                scatter_matrix,
+                frame=df,
+                range_padding=0.1,
+                ax=ax,
+            )
+        axes0_labels = axes[0][0].yaxis.get_majorticklabels()
+        # GH 5662
+        expected = ["-2", "-1", "0"]
+        _check_text_labels(axes0_labels, expected)
+        _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+
+    @pytest.mark.parametrize("pass_axis", [False, True])
+    def test_scatter_matrix_axis_smaller(self, pass_axis):
+        pytest.importorskip("scipy")
+        scatter_matrix = plotting.scatter_matrix
+
+        ax = None
+        if pass_axis:
+            _, ax = mpl.pyplot.subplots(3, 3)
+
+        df = DataFrame(np.random.default_rng(11).standard_normal((10, 3)))
+        df[0] = (df[0] - 2) / 3
+
+        # we are plotting multiples on a sub-plot
+        with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
+            axes = _check_plot_works(
+                scatter_matrix,
+                frame=df,
+                range_padding=0.1,
+                ax=ax,
+            )
+        axes0_labels = axes[0][0].yaxis.get_majorticklabels()
+        expected = ["-1.25", "-1.0", "-0.75", "-0.5"]
+        _check_text_labels(axes0_labels, expected)
+        _check_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+
+    @pytest.mark.slow
+    def test_andrews_curves_no_warning(self, iris):
+        # Ensure no UserWarning when making plot
+        with tm.assert_produces_warning(None):
+            _check_plot_works(plotting.andrews_curves, frame=iris, class_column="Name")
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "linecolors",
+        [
+            ("#556270", "#4ECDC4", "#C7F464"),
+            ["dodgerblue", "aquamarine", "seagreen"],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "df",
+        [
+            "iris",
+            DataFrame(
+                {
+                    "A": np.random.default_rng(2).standard_normal(10),
+                    "B": np.random.default_rng(2).standard_normal(10),
+                    "C": np.random.default_rng(2).standard_normal(10),
+                    "Name": ["A"] * 10,
+                }
+            ),
+        ],
+    )
+    def test_andrews_curves_linecolors(self, request, df, linecolors):
+        if isinstance(df, str):
+            df = request.getfixturevalue(df)
+        ax = _check_plot_works(
+            plotting.andrews_curves, frame=df, class_column="Name", color=linecolors
+        )
+        _check_colors(
+            ax.get_lines()[:10], linecolors=linecolors, mapping=df["Name"][:10]
+        )
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "df",
+        [
+            "iris",
+            DataFrame(
+                {
+                    "A": np.random.default_rng(2).standard_normal(10),
+                    "B": np.random.default_rng(2).standard_normal(10),
+                    "C": np.random.default_rng(2).standard_normal(10),
+                    "Name": ["A"] * 10,
+                }
+            ),
+        ],
+    )
+    def test_andrews_curves_cmap(self, request, df):
+        if isinstance(df, str):
+            df = request.getfixturevalue(df)
+        cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())]
+        ax = _check_plot_works(
+            plotting.andrews_curves, frame=df, class_column="Name", color=cmaps
+        )
+        _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10])
+
+    @pytest.mark.slow
+    def test_andrews_curves_handle(self):
+        colors = ["b", "g", "r"]
+        df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors})
+        ax = plotting.andrews_curves(df, "Name", color=colors)
+        handles, _ = ax.get_legend_handles_labels()
+        _check_colors(handles, linecolors=colors)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "color",
+        [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]],
+    )
+    def test_parallel_coordinates_colors(self, iris, color):
+        df = iris
+
+        ax = _check_plot_works(
+            plotting.parallel_coordinates, frame=df, class_column="Name", color=color
+        )
+        _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10])
+
+    @pytest.mark.slow
+    def test_parallel_coordinates_cmap(self, iris):
+        df = iris
+
+        ax = _check_plot_works(
+            plotting.parallel_coordinates,
+            frame=df,
+            class_column="Name",
+            colormap=cm.jet,
+        )
+        cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())]
+        _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10])
+
+    @pytest.mark.slow
+    def test_parallel_coordinates_line_diff(self, iris):
+        df = iris
+
+        ax = _check_plot_works(
+            plotting.parallel_coordinates, frame=df, class_column="Name"
+        )
+        nlines = len(ax.get_lines())
+        nxticks = len(ax.xaxis.get_ticklabels())
+
+        ax = _check_plot_works(
+            plotting.parallel_coordinates, frame=df, class_column="Name", axvlines=False
+        )
+        assert len(ax.get_lines()) == (nlines - nxticks)
+
+    @pytest.mark.slow
+    def test_parallel_coordinates_handles(self, iris):
+        df = iris
+        colors = ["b", "g", "r"]
+        df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors})
+        ax = plotting.parallel_coordinates(df, "Name", color=colors)
+        handles, _ = ax.get_legend_handles_labels()
+        _check_colors(handles, linecolors=colors)
+
+    # not sure if this is indicative of a problem
+    @pytest.mark.filterwarnings("ignore:Attempting to set:UserWarning")
+    def test_parallel_coordinates_with_sorted_labels(self):
+        # GH 15908
+        df = DataFrame(
+            {
+                "feat": list(range(30)),
+                "class": [2 for _ in range(10)]
+                + [3 for _ in range(10)]
+                + [1 for _ in range(10)],
+            }
+        )
+        ax = plotting.parallel_coordinates(df, "class", sort_labels=True)
+        polylines, labels = ax.get_legend_handles_labels()
+        color_label_tuples = zip(
+            [polyline.get_color() for polyline in polylines], labels, strict=True
+        )
+        ordered_color_label_tuples = sorted(color_label_tuples, key=lambda x: x[1])
+        prev_next_tupels = zip(
+            list(ordered_color_label_tuples[0:-1]),
+            list(ordered_color_label_tuples[1:]),
+            strict=True,
+        )
+        for prev, nxt in prev_next_tupels:
+            # labels and colors are ordered strictly increasing
+            assert prev[1] < nxt[1] and prev[0] < nxt[0]
+
+    def test_radviz_no_warning(self, iris):
+        # Ensure no UserWarning when making plot
+        with tm.assert_produces_warning(None):
+            _check_plot_works(plotting.radviz, frame=iris, class_column="Name")
+
+    @pytest.mark.parametrize(
+        "color",
+        [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]],
+    )
+    def test_radviz_color(self, iris, color):
+        df = iris
+        ax = _check_plot_works(
+            plotting.radviz, frame=df, class_column="Name", color=color
+        )
+        # skip Circle drawn as ticks
+        patches = [p for p in ax.patches[:20] if p.get_label() != ""]
+        _check_colors(patches[:10], facecolors=color, mapping=df["Name"][:10])
+
+    def test_radviz_color_cmap(self, iris):
+        df = iris
+        ax = _check_plot_works(
+            plotting.radviz, frame=df, class_column="Name", colormap=cm.jet
+        )
+        cmaps = [mpl.cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())]
+        patches = [p for p in ax.patches[:20] if p.get_label() != ""]
+        _check_colors(patches, facecolors=cmaps, mapping=df["Name"][:10])
+
+    def test_radviz_colors_handles(self):
+        colors = [[0.0, 0.0, 1.0, 1.0], [0.0, 0.5, 1.0, 1.0], [1.0, 0.0, 0.0, 1.0]]
+        df = DataFrame(
+            {"A": [1, 2, 3], "B": [2, 1, 3], "C": [3, 2, 1], "Name": ["b", "g", "r"]}
+        )
+        ax = plotting.radviz(df, "Name", color=colors)
+        handles, _ = ax.get_legend_handles_labels()
+        _check_colors(handles, facecolors=colors)
+
+    def test_subplot_titles(self, iris):
+        df = iris.drop("Name", axis=1).head()
+        # Use the column names as the subplot titles
+        title = list(df.columns)
+
+        # Case len(title) == len(df)
+        plot = df.plot(subplots=True, title=title)
+        assert [p.get_title() for p in plot] == title
+
+    def test_subplot_titles_too_much(self, iris):
+        df = iris.drop("Name", axis=1).head()
+        # Use the column names as the subplot titles
+        title = list(df.columns)
+        # Case len(title) > len(df)
+        msg = (
+            "The length of `title` must equal the number of columns if "
+            "using `title` of type `list` and `subplots=True`"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.plot(subplots=True, title=[*title, "kittens > puppies"])
+
+    def test_subplot_titles_too_little(self, iris):
+        df = iris.drop("Name", axis=1).head()
+        # Use the column names as the subplot titles
+        title = list(df.columns)
+        msg = (
+            "The length of `title` must equal the number of columns if "
+            "using `title` of type `list` and `subplots=True`"
+        )
+        # Case len(title) < len(df)
+        with pytest.raises(ValueError, match=msg):
+            df.plot(subplots=True, title=title[:2])
+
+    def test_subplot_titles_subplots_false(self, iris):
+        df = iris.drop("Name", axis=1).head()
+        # Use the column names as the subplot titles
+        title = list(df.columns)
+        # Case subplots=False and title is of type list
+        msg = (
+            "Using `title` of type `list` is not supported unless "
+            "`subplots=True` is passed"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.plot(subplots=False, title=title)
+
+    def test_subplot_titles_numeric_square_layout(self, iris):
+        df = iris.drop("Name", axis=1).head()
+        # Use the column names as the subplot titles
+        title = list(df.columns)
+        # Case df with 3 numeric columns but layout of (2,2)
+        plot = df.drop("SepalWidth", axis=1).plot(
+            subplots=True, layout=(2, 2), title=title[:-1]
+        )
+        title_list = [ax.get_title() for sublist in plot for ax in sublist]
+        assert title_list == [*title[:3], ""]
+
+    def test_get_standard_colors_random_seed(self):
+        # GH17525
+        df = DataFrame(np.zeros((10, 10)))
+
+        # Make sure that the random seed isn't reset by get_standard_colors
+        plotting.parallel_coordinates(df, 0)
+        rand1 = np.random.default_rng(None).random()
+        plotting.parallel_coordinates(df, 0)
+        rand2 = np.random.default_rng(None).random()
+        assert rand1 != rand2
+
+    def test_get_standard_colors_consistency(self):
+        # GH17525
+        # Make sure it produces the same colors every time it's called
+        color1 = get_standard_colors(1, color_type="random")
+        color2 = get_standard_colors(1, color_type="random")
+        assert color1 == color2
+
+    def test_get_standard_colors_default_num_colors(self):
+        # Make sure the default color_types returns the specified amount
+        color1 = get_standard_colors(1, color_type="default")
+        color2 = get_standard_colors(9, color_type="default")
+        color3 = get_standard_colors(20, color_type="default")
+        assert len(color1) == 1
+        assert len(color2) == 9
+        assert len(color3) == 20
+
+    def test_plot_single_color(self):
+        # Example from #20585. All 3 bars should have the same color
+        df = DataFrame(
+            {
+                "account-start": ["2017-02-03", "2017-03-03", "2017-01-01"],
+                "client": ["Alice Anders", "Bob Baker", "Charlie Chaplin"],
+                "balance": [-1432.32, 10.43, 30000.00],
+                "db-id": [1234, 2424, 251],
+                "proxy-id": [525, 1525, 2542],
+                "rank": [52, 525, 32],
+            }
+        )
+        ax = df.client.value_counts().plot.bar()
+        colors = [rect.get_facecolor() for rect in ax.get_children()[0:3]]
+        assert all(color == colors[0] for color in colors)
+
+    def test_get_standard_colors_no_appending(self):
+        # GH20726
+
+        # Make sure not to add more colors so that matplotlib can cycle
+        # correctly.
+        color_before = mpl.cm.gnuplot(range(5))
+        color_after = get_standard_colors(1, color=color_before)
+        assert len(color_after) == len(color_before)
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((48, 4)), columns=list("ABCD")
+        )
+
+        color_list = mpl.cm.gnuplot(np.linspace(0, 1, 16))
+        p = df.A.plot.bar(figsize=(16, 7), color=color_list)
+        assert p.patches[1].get_facecolor() == p.patches[17].get_facecolor()
+
+    @pytest.mark.parametrize("kind", ["bar", "line"])
+    def test_dictionary_color(self, kind):
+        # issue-8193
+        # Test plot color dictionary format
+        data_files = ["a", "b"]
+
+        expected = [(0.5, 0.24, 0.6), (0.3, 0.7, 0.7)]
+
+        df1 = DataFrame(np.random.default_rng(2).random((2, 2)), columns=data_files)
+        dic_color = {"b": (0.3, 0.7, 0.7), "a": (0.5, 0.24, 0.6)}
+
+        ax = df1.plot(kind=kind, color=dic_color)
+        if kind == "bar":
+            colors = [rect.get_facecolor()[0:-1] for rect in ax.get_children()[0:3:2]]
+        else:
+            colors = [rect.get_color() for rect in ax.get_lines()[0:2]]
+        assert all(color == expected[index] for index, color in enumerate(colors))
+
+    def test_bar_plot(self):
+        # GH38947
+        # Test bar plot with string and int index
+        expected = [mpl.text.Text(0, 0, "0"), mpl.text.Text(1, 0, "Total")]
+
+        df = DataFrame(
+            {
+                "a": [1, 2],
+            },
+            index=Index([0, "Total"]),
+        )
+        plot_bar = df.plot.bar()
+        assert all(
+            (a.get_text() == b.get_text())
+            for a, b in zip(plot_bar.get_xticklabels(), expected, strict=True)
+        )
+
+    def test_barh_plot_labels_mixed_integer_string(self):
+        # GH39126
+        # Test barh plot with string and integer at the same column
+        df = DataFrame([{"word": 1, "value": 0}, {"word": "knowledge", "value": 2}])
+        plot_barh = df.plot.barh(x="word", legend=None)
+        expected_yticklabels = [
+            mpl.text.Text(0, 0, "1"),
+            mpl.text.Text(0, 1, "knowledge"),
+        ]
+        assert all(
+            actual.get_text() == expected.get_text()
+            for actual, expected in zip(
+                plot_barh.get_yticklabels(), expected_yticklabels, strict=True
+            )
+        )
+
+    def test_has_externally_shared_axis_x_axis(self):
+        # GH33819
+        # Test _has_externally_shared_axis() works for x-axis
+        func = plotting._matplotlib.tools._has_externally_shared_axis
+
+        fig = mpl.pyplot.figure()
+        plots = fig.subplots(2, 4)
+
+        # Create *externally* shared axes for first and third columns
+        plots[0][0] = fig.add_subplot(231, sharex=plots[1][0])
+        plots[0][2] = fig.add_subplot(233, sharex=plots[1][2])
+
+        # Create *internally* shared axes for second and third columns
+        plots[0][1].twinx()
+        plots[0][2].twinx()
+
+        # First  column is only externally shared
+        # Second column is only internally shared
+        # Third  column is both
+        # Fourth column is neither
+        assert func(plots[0][0], "x")
+        assert not func(plots[0][1], "x")
+        assert func(plots[0][2], "x")
+        assert not func(plots[0][3], "x")
+
+    def test_has_externally_shared_axis_y_axis(self):
+        # GH33819
+        # Test _has_externally_shared_axis() works for y-axis
+        func = plotting._matplotlib.tools._has_externally_shared_axis
+
+        fig = mpl.pyplot.figure()
+        plots = fig.subplots(4, 2)
+
+        # Create *externally* shared axes for first and third rows
+        plots[0][0] = fig.add_subplot(321, sharey=plots[0][1])
+        plots[2][0] = fig.add_subplot(325, sharey=plots[2][1])
+
+        # Create *internally* shared axes for second and third rows
+        plots[1][0].twiny()
+        plots[2][0].twiny()
+
+        # First  row is only externally shared
+        # Second row is only internally shared
+        # Third  row is both
+        # Fourth row is neither
+        assert func(plots[0][0], "y")
+        assert not func(plots[1][0], "y")
+        assert func(plots[2][0], "y")
+        assert not func(plots[3][0], "y")
+
+    def test_has_externally_shared_axis_invalid_compare_axis(self):
+        # GH33819
+        # Test _has_externally_shared_axis() raises an exception when
+        # passed an invalid value as compare_axis parameter
+        func = plotting._matplotlib.tools._has_externally_shared_axis
+
+        fig = mpl.pyplot.figure()
+        plots = fig.subplots(4, 2)
+
+        # Create arbitrary axes
+        plots[0][0] = fig.add_subplot(321, sharey=plots[0][1])
+
+        # Check that an invalid compare_axis value triggers the expected exception
+        msg = "needs 'x' or 'y' as a second parameter"
+        with pytest.raises(ValueError, match=msg):
+            func(plots[0][0], "z")
+
+    def test_externally_shared_axes(self):
+        # Example from GH33819
+        # Create data
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).standard_normal(10),
+                "b": np.random.default_rng(2).standard_normal(10),
+            }
+        )
+
+        # Create figure
+        fig = mpl.pyplot.figure()
+        plots = fig.subplots(2, 3)
+
+        # Create *externally* shared axes
+        plots[0][0] = fig.add_subplot(231, sharex=plots[1][0])
+        # note: no plots[0][1] that's the twin only case
+        plots[0][2] = fig.add_subplot(233, sharex=plots[1][2])
+
+        # Create *internally* shared axes
+        # note: no plots[0][0] that's the external only case
+        twin_ax1 = plots[0][1].twinx()
+        twin_ax2 = plots[0][2].twinx()
+
+        # Plot data to primary axes
+        df["a"].plot(ax=plots[0][0], title="External share only").set_xlabel(
+            "this label should never be visible"
+        )
+        df["a"].plot(ax=plots[1][0])
+
+        df["a"].plot(ax=plots[0][1], title="Internal share (twin) only").set_xlabel(
+            "this label should always be visible"
+        )
+        df["a"].plot(ax=plots[1][1])
+
+        df["a"].plot(ax=plots[0][2], title="Both").set_xlabel(
+            "this label should never be visible"
+        )
+        df["a"].plot(ax=plots[1][2])
+
+        # Plot data to twinned axes
+        df["b"].plot(ax=twin_ax1, color="green")
+        df["b"].plot(ax=twin_ax2, color="yellow")
+
+        assert not plots[0][0].xaxis.get_label().get_visible()
+        assert plots[0][1].xaxis.get_label().get_visible()
+        assert not plots[0][2].xaxis.get_label().get_visible()
+
+    def test_plot_bar_axis_units_timestamp_conversion(self):
+        # GH 38736
+        # Ensure string x-axis from the second plot will not be converted to datetime
+        # due to axis data from first plot
+        df = DataFrame(
+            [1.0],
+            index=[Timestamp("2022-02-22 22:22:22")],
+        )
+        _check_plot_works(df.plot)
+        s = Series({"A": 1.0})
+        _check_plot_works(s.plot.bar)
+
+    def test_bar_plt_xaxis_intervalrange(self):
+        # GH 38969
+        # Ensure IntervalIndex x-axis produces a bar plot as expected
+        expected = [mpl.text.Text(0, 0, "([0, 1],)"), mpl.text.Text(1, 0, "([1, 2],)")]
+        s = Series(
+            [1, 2],
+            index=[interval_range(0, 2, closed="both")],
+        )
+        _check_plot_works(s.plot.bar)
+        assert all(
+            (a.get_text() == b.get_text())
+            for a, b in zip(s.plot.bar().get_xticklabels(), expected, strict=True)
+        )
+
+
+@pytest.fixture
+def df_bar_data():
+    return np.random.default_rng(3).integers(0, 100, 5)
+
+
+@pytest.fixture
+def df_bar_df(df_bar_data) -> DataFrame:
+    df_bar_df = DataFrame(
+        {
+            "A": df_bar_data,
+            "B": df_bar_data[::-1],
+            "C": df_bar_data[0],
+            "D": df_bar_data[-1],
+        }
+    )
+    return df_bar_df
+
+
+def _df_bar_xyheight_from_ax_helper(df_bar_data, ax, subplot_division):
+    subplot_data_df_list = []
+
+    # get xy and height of squares representing data, separated by subplots
+    for i in range(len(subplot_division)):
+        subplot_data = np.array(
+            [
+                (x.get_x(), x.get_y(), x.get_height())
+                for x in ax[i].findobj(plt.Rectangle)
+                if x.get_height() in df_bar_data
+            ]
+        )
+        subplot_data_df_list.append(
+            DataFrame(data=subplot_data, columns=["x_coord", "y_coord", "height"])
+        )
+
+    return subplot_data_df_list
+
+
+def _df_bar_subplot_checker(df_bar_data, df_bar_df, subplot_data_df, subplot_columns):
+    subplot_sliced_by_source = [
+        subplot_data_df.iloc[
+            len(df_bar_data) * i : len(df_bar_data) * (i + 1)
+        ].reset_index()
+        for i in range(len(subplot_columns))
+    ]
+
+    if len(subplot_columns) == 1:
+        expected_total_height = df_bar_df.loc[:, subplot_columns[0]]
+    else:
+        expected_total_height = df_bar_df.loc[:, subplot_columns].sum(axis=1)
+
+    for i in range(len(subplot_columns)):
+        sliced_df = subplot_sliced_by_source[i]
+        if i == 0:
+            # Checks that the bar chart starts y=0
+            assert (sliced_df["y_coord"] == 0).all()
+            height_iter = sliced_df["y_coord"].add(sliced_df["height"])
+        else:
+            height_iter = height_iter + sliced_df["height"]
+
+        if i + 1 == len(subplot_columns):
+            # Checks final height matches what is expected
+            tm.assert_series_equal(
+                height_iter, expected_total_height, check_names=False, check_dtype=False
+            )
+        else:
+            # Checks each preceding bar ends where the next one starts
+            next_start_coord = subplot_sliced_by_source[i + 1]["y_coord"]
+            tm.assert_series_equal(
+                height_iter, next_start_coord, check_names=False, check_dtype=False
+            )
+
+
+# GH Issue 61018
+@pytest.mark.parametrize("columns_used", [["A", "B"], ["C", "D"], ["D", "A"]])
+def test_bar_1_subplot_1_double_stacked(df_bar_data, df_bar_df, columns_used):
+    df_bar_df_trimmed = df_bar_df[columns_used]
+    subplot_division = [columns_used]
+    ax = df_bar_df_trimmed.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df_trimmed, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+@pytest.mark.parametrize(
+    "columns_used", [["A", "B", "C"], ["A", "C", "B"], ["D", "A", "C"]]
+)
+def test_bar_2_subplot_1_double_stacked(df_bar_data, df_bar_df, columns_used):
+    df_bar_df_trimmed = df_bar_df[columns_used]
+    subplot_division = [(columns_used[0], columns_used[1]), (columns_used[2],)]
+    ax = df_bar_df_trimmed.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df_trimmed, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+@pytest.mark.parametrize(
+    "subplot_division",
+    [
+        [("A", "B"), ("C", "D")],
+        [("A", "D"), ("C", "B")],
+        [("B", "C"), ("D", "A")],
+        [("B", "D"), ("C", "A")],
+    ],
+)
+def test_bar_2_subplot_2_double_stacked(df_bar_data, df_bar_df, subplot_division):
+    ax = df_bar_df.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+@pytest.mark.parametrize(
+    "subplot_division",
+    [[("A", "B", "C")], [("A", "D", "B")], [("C", "A", "D")], [("D", "C", "A")]],
+)
+def test_bar_2_subplots_1_triple_stacked(df_bar_data, df_bar_df, subplot_division):
+    ax = df_bar_df.plot(subplots=subplot_division, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+def test_bar_subplots_stacking_bool(df_bar_data, df_bar_df):
+    subplot_division = [("A"), ("B"), ("C"), ("D")]
+    ax = df_bar_df.plot(subplots=True, kind="bar", stacked=True)
+    subplot_data_df_list = _df_bar_xyheight_from_ax_helper(
+        df_bar_data, ax, subplot_division
+    )
+    for i in range(len(subplot_data_df_list)):
+        _df_bar_subplot_checker(
+            df_bar_data, df_bar_df, subplot_data_df_list[i], subplot_division[i]
+        )
+
+
+def test_plot_bar_label_count_default():
+    df = DataFrame(
+        [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD")
+    )
+    df.plot(subplots=True, kind="bar", title=["A", "B", "C", "D"])
+
+
+def test_plot_bar_label_count_expected_fail():
+    df = DataFrame(
+        [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD")
+    )
+    error_regex = re.escape(
+        "The number of titles (4) must equal the number of subplots (3)."
+    )
+    with pytest.raises(ValueError, match=error_regex):
+        df.plot(
+            subplots=[("A", "B")],
+            kind="bar",
+            title=["A&B", "C", "D", "Extra Title"],
+        )
+
+
+def test_plot_bar_label_count_expected_success():
+    df = DataFrame(
+        [(30, 10, 10, 10), (20, 20, 20, 20), (10, 30, 30, 10)], columns=list("ABCD")
+    )
+    df.plot(subplots=[("A", "B", "D")], kind="bar", title=["A&B&D", "C"])
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
new file mode 100644
index 0000000000000000000000000000000000000000..3ef1e660236c899ef19e9419e9c64402d81beace
--- /dev/null
+++ b/pandas/tests/plotting/test_series.py
@@ -0,0 +1,1005 @@
+"""Test cases for Series.plot"""
+
+from datetime import datetime
+from itertools import chain
+
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_linux
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+    date_range,
+    period_range,
+    plotting,
+)
+import pandas._testing as tm
+from pandas.tests.plotting.common import (
+    _check_ax_scales,
+    _check_axes_shape,
+    _check_colors,
+    _check_grid_settings,
+    _check_has_errorbars,
+    _check_legend_labels,
+    _check_plot_works,
+    _check_text_labels,
+    _check_ticks_props,
+    _unpack_cycler,
+    get_y_axis,
+)
+
+from pandas.tseries.offsets import CustomBusinessDay
+
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")
+
+from pandas.plotting._matplotlib.converter import DatetimeConverter
+from pandas.plotting._matplotlib.style import get_standard_colors
+
+pytestmark = [
+    pytest.mark.filterwarnings(
+        "ignore:divide by zero encountered in scalar divide:RuntimeWarning"
+    ),
+    pytest.mark.filterwarnings(
+        "ignore:invalid value encountered in scalar multiply:RuntimeWarning"
+    ),
+]
+
+
+@pytest.fixture
+def ts():
+    return Series(
+        np.arange(10, dtype=np.float64),
+        index=date_range("2020-01-01", periods=10),
+        name="ts",
+    )
+
+
+@pytest.fixture
+def series():
+    return Series(
+        range(10), dtype=np.float64, name="series", index=[f"i_{i}" for i in range(10)]
+    )
+
+
+class TestSeriesPlots:
+    @pytest.mark.slow
+    @pytest.mark.parametrize("kwargs", [{"label": "foo"}, {"use_index": False}])
+    def test_plot(self, ts, kwargs):
+        _check_plot_works(ts.plot, **kwargs)
+
+    @pytest.mark.slow
+    def test_plot_tick_props(self, ts):
+        axes = _check_plot_works(ts.plot, rot=0)
+        _check_ticks_props(axes, xrot=0)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "scale, exp_scale",
+        [
+            [{"logy": True}, {"yaxis": "log"}],
+            [{"logx": True}, {"xaxis": "log"}],
+            [{"loglog": True}, {"xaxis": "log", "yaxis": "log"}],
+        ],
+    )
+    def test_plot_scales(self, ts, scale, exp_scale):
+        ax = _check_plot_works(ts.plot, style=".", **scale)
+        _check_ax_scales(ax, **exp_scale)
+
+    @pytest.mark.slow
+    def test_plot_ts_bar(self, ts):
+        _check_plot_works(ts[:10].plot.bar)
+
+    @pytest.mark.slow
+    def test_plot_ts_area_stacked(self, ts):
+        _check_plot_works(ts.plot.area, stacked=False)
+
+    def test_plot_iseries(self):
+        ser = Series(range(5), period_range("2020-01-01", periods=5))
+        _check_plot_works(ser.plot)
+
+    @pytest.mark.parametrize(
+        "kind",
+        [
+            "line",
+            "bar",
+            "barh",
+            pytest.param("kde", marks=td.skip_if_no("scipy")),
+            "hist",
+            "box",
+        ],
+    )
+    def test_plot_series_kinds(self, series, kind):
+        _check_plot_works(series[:5].plot, kind=kind)
+
+    def test_plot_series_barh(self, series):
+        _check_plot_works(series[:10].plot.barh)
+
+    def test_plot_series_bar_ax(self):
+        ax = _check_plot_works(
+            Series(np.random.default_rng(2).standard_normal(10)).plot.bar, color="black"
+        )
+        _check_colors([ax.patches[0]], facecolors=["black"])
+
+    @pytest.mark.parametrize("kwargs", [{}, {"layout": (-1, 1)}, {"layout": (1, -1)}])
+    def test_plot_6951(self, ts, kwargs):
+        # GH 6951
+        ax = _check_plot_works(ts.plot, subplots=True, **kwargs)
+        _check_axes_shape(ax, axes_num=1, layout=(1, 1))
+
+    def test_plot_figsize_and_title(self, series):
+        # figsize and title
+        _, ax = mpl.pyplot.subplots()
+        ax = series.plot(title="Test", figsize=(16, 8), ax=ax)
+        _check_text_labels(ax.title, "Test")
+        _check_axes_shape(ax, axes_num=1, layout=(1, 1), figsize=(16, 8))
+
+    def test_dont_modify_rcParams(self):
+        # GH 8242
+        key = "axes.prop_cycle"
+        colors = mpl.pyplot.rcParams[key]
+        _, ax = mpl.pyplot.subplots()
+        Series([1, 2, 3]).plot(ax=ax)
+        assert colors == mpl.pyplot.rcParams[key]
+
+    @pytest.mark.parametrize("kwargs", [{}, {"secondary_y": True}])
+    def test_ts_line_lim(self, ts, kwargs):
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot(ax=ax, **kwargs)
+        xmin, xmax = ax.get_xlim()
+        lines = ax.get_lines()
+        assert xmin <= lines[0].get_data(orig=False)[0][0]
+        assert xmax >= lines[0].get_data(orig=False)[0][-1]
+
+    def test_ts_area_lim(self, ts):
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.area(stacked=False, ax=ax)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        assert xmin <= line[0]
+        assert xmax >= line[-1]
+        _check_ticks_props(ax, xrot=0)
+
+    def test_ts_area_lim_xcompat(self, ts):
+        # GH 7471
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.area(stacked=False, x_compat=True, ax=ax)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        assert xmin <= line[0]
+        assert xmax >= line[-1]
+        _check_ticks_props(ax, xrot=30)
+
+    def test_ts_tz_area_lim_xcompat(self, ts):
+        tz_ts = ts.copy()
+        tz_ts.index = tz_ts.tz_localize("GMT").tz_convert("CET")
+        _, ax = mpl.pyplot.subplots()
+        ax = tz_ts.plot.area(stacked=False, x_compat=True, ax=ax)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        assert xmin <= line[0]
+        assert xmax >= line[-1]
+        _check_ticks_props(ax, xrot=0)
+
+    def test_ts_tz_area_lim_xcompat_secondary_y(self, ts):
+        tz_ts = ts.copy()
+        tz_ts.index = tz_ts.tz_localize("GMT").tz_convert("CET")
+        _, ax = mpl.pyplot.subplots()
+        ax = tz_ts.plot.area(stacked=False, secondary_y=True, ax=ax)
+        xmin, xmax = ax.get_xlim()
+        line = ax.get_lines()[0].get_data(orig=False)[0]
+        assert xmin <= line[0]
+        assert xmax >= line[-1]
+        _check_ticks_props(ax, xrot=0)
+
+    def test_area_sharey_dont_overwrite(self, ts):
+        # GH37942
+        fig, (ax1, ax2) = mpl.pyplot.subplots(1, 2, sharey=True)
+
+        abs(ts).plot(ax=ax1, kind="area")
+        abs(ts).plot(ax=ax2, kind="area")
+
+        assert get_y_axis(ax1).joined(ax1, ax2)
+        assert get_y_axis(ax2).joined(ax1, ax2)
+
+    def test_label(self):
+        s = Series([1, 2])
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(label="LABEL", legend=True, ax=ax)
+        _check_legend_labels(ax, labels=["LABEL"])
+
+    def test_label_none(self):
+        s = Series([1, 2])
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(legend=True, ax=ax)
+        _check_legend_labels(ax, labels=[""])
+
+    def test_label_ser_name(self):
+        s = Series([1, 2], name="NAME")
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(legend=True, ax=ax)
+        _check_legend_labels(ax, labels=["NAME"])
+
+    def test_label_ser_name_override(self):
+        s = Series([1, 2], name="NAME")
+        # override the default
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(legend=True, label="LABEL", ax=ax)
+        _check_legend_labels(ax, labels=["LABEL"])
+
+    def test_label_ser_name_override_dont_draw(self):
+        s = Series([1, 2], name="NAME")
+        # Add lebel info, but don't draw
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(legend=False, label="LABEL", ax=ax)
+        assert ax.get_legend() is None  # Hasn't been drawn
+        ax.legend()  # draw it
+        _check_legend_labels(ax, labels=["LABEL"])
+
+    def test_boolean(self):
+        # GH 23719
+        s = Series([False, False, True])
+        _check_plot_works(s.plot, include_bool=True)
+
+        msg = "no numeric data to plot"
+        with pytest.raises(TypeError, match=msg):
+            _check_plot_works(s.plot)
+
+    @pytest.mark.parametrize("index", [None, date_range("2020-01-01", periods=4)])
+    def test_line_area_nan_series(self, index):
+        values = [1, 2, np.nan, 3]
+        d = Series(values, index=index)
+        ax = _check_plot_works(d.plot)
+        masked = ax.lines[0].get_ydata()
+        # remove nan for comparison purpose
+        exp = np.array([1, 2, 3], dtype=np.float64)
+        tm.assert_numpy_array_equal(np.delete(masked.data, 2), exp)
+        tm.assert_numpy_array_equal(masked.mask, np.array([False, False, True, False]))
+
+        expected = np.array([1, 2, 0, 3], dtype=np.float64)
+        ax = _check_plot_works(d.plot, stacked=True)
+        tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected)
+        ax = _check_plot_works(d.plot.area)
+        tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected)
+        ax = _check_plot_works(d.plot.area, stacked=False)
+        tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected)
+
+    def test_line_use_index_false(self):
+        s = Series([1, 2, 3], index=["a", "b", "c"])
+        s.index.name = "The Index"
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(use_index=False, ax=ax)
+        label = ax.get_xlabel()
+        assert label == ""
+
+    def test_line_use_index_false_diff_var(self):
+        s = Series([1, 2, 3], index=["a", "b", "c"])
+        s.index.name = "The Index"
+        _, ax = mpl.pyplot.subplots()
+        ax2 = s.plot.bar(use_index=False, ax=ax)
+        label2 = ax2.get_xlabel()
+        assert label2 == ""
+
+    @pytest.mark.xfail(
+        is_platform_linux(),
+        reason="Weird rounding problems",
+        strict=False,
+    )
+    @pytest.mark.parametrize("axis, meth", [("yaxis", "bar"), ("xaxis", "barh")])
+    def test_bar_log(self, axis, meth):
+        expected = np.array([1e-1, 1e0, 1e1, 1e2, 1e3, 1e4])
+
+        _, ax = mpl.pyplot.subplots()
+        ax = getattr(Series([200, 500]).plot, meth)(log=True, ax=ax)
+        tm.assert_numpy_array_equal(getattr(ax, axis).get_ticklocs(), expected)
+
+    @pytest.mark.xfail(
+        is_platform_linux(),
+        reason="Weird rounding problems",
+        strict=False,
+    )
+    @pytest.mark.parametrize(
+        "axis, kind, res_meth",
+        [["yaxis", "bar", "get_ylim"], ["xaxis", "barh", "get_xlim"]],
+    )
+    def test_bar_log_kind_bar(self, axis, kind, res_meth):
+        # GH 9905
+        expected = np.array([1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1])
+
+        _, ax = mpl.pyplot.subplots()
+        ax = Series([0.1, 0.01, 0.001]).plot(log=True, kind=kind, ax=ax)
+        ymin = 0.0007943282347242822
+        ymax = 0.12589254117941673
+        res = getattr(ax, res_meth)()
+        tm.assert_almost_equal(res[0], ymin)
+        tm.assert_almost_equal(res[1], ymax)
+        tm.assert_numpy_array_equal(getattr(ax, axis).get_ticklocs(), expected)
+
+    def test_bar_ignore_index(self):
+        df = Series([1, 2, 3, 4], index=["a", "b", "c", "d"])
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot.bar(use_index=False, ax=ax)
+        _check_text_labels(ax.get_xticklabels(), ["0", "1", "2", "3"])
+
+    def test_bar_user_colors(self):
+        s = Series([1, 2, 3, 4])
+        ax = s.plot.bar(color=["red", "blue", "blue", "red"])
+        result = [p.get_facecolor() for p in ax.patches]
+        expected = [
+            (1.0, 0.0, 0.0, 1.0),
+            (0.0, 0.0, 1.0, 1.0),
+            (0.0, 0.0, 1.0, 1.0),
+            (1.0, 0.0, 0.0, 1.0),
+        ]
+        assert result == expected
+
+    def test_rotation_default(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)))
+        # Default rot 0
+        _, ax = mpl.pyplot.subplots()
+        axes = df.plot(ax=ax)
+        _check_ticks_props(axes, xrot=0)
+
+    def test_rotation_30(self):
+        df = DataFrame(np.random.default_rng(2).standard_normal((5, 5)))
+        _, ax = mpl.pyplot.subplots()
+        axes = df.plot(rot=30, ax=ax)
+        _check_ticks_props(axes, xrot=30)
+
+    def test_irregular_datetime(self):
+        rng = date_range("1/1/2000", "1/15/2000")
+        rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]]
+        ser = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+        _, ax = mpl.pyplot.subplots()
+        ax = ser.plot(ax=ax)
+        xp = DatetimeConverter.convert(datetime(1999, 1, 1), "", ax)
+        ax.set_xlim("1/1/1999", "1/1/2001")
+        assert xp == ax.get_xlim()[0]
+        _check_ticks_props(ax, xrot=30)
+
+    def test_unsorted_index_xlim(self):
+        ser = Series(
+            [0.0, 1.0, np.nan, 3.0, 4.0, 5.0, 6.0],
+            index=[1.0, 0.0, 3.0, 2.0, np.nan, 3.0, 2.0],
+        )
+        _, ax = mpl.pyplot.subplots()
+        ax = ser.plot(ax=ax)
+        xmin, xmax = ax.get_xlim()
+        lines = ax.get_lines()
+        assert xmin <= np.nanmin(lines[0].get_data(orig=False)[0])
+        assert xmax >= np.nanmax(lines[0].get_data(orig=False)[0])
+
+    def test_pie_series(self):
+        # if sum of values is less than 1.0, pie handle them as rate and draw
+        # semicircle.
+        series = Series(
+            np.random.default_rng(2).integers(1, 5),
+            index=["a", "b", "c", "d", "e"],
+            name="YLABEL",
+        )
+        ax = _check_plot_works(series.plot.pie)
+        _check_text_labels(ax.texts, series.index)
+        assert ax.get_ylabel() == ""
+
+    def test_pie_arrow_type(self):
+        # GH 59192
+        pytest.importorskip("pyarrow")
+        ser = Series([1, 2, 3, 4], dtype="int32[pyarrow]")
+        _check_plot_works(ser.plot.pie)
+
+    def test_pie_series_no_label(self):
+        series = Series(
+            np.random.default_rng(2).integers(1, 5),
+            index=["a", "b", "c", "d", "e"],
+            name="YLABEL",
+        )
+        ax = _check_plot_works(series.plot.pie, labels=None)
+        _check_text_labels(ax.texts, [""] * 5)
+
+    def test_pie_series_less_colors_than_elements(self):
+        series = Series(
+            np.random.default_rng(2).integers(1, 5),
+            index=["a", "b", "c", "d", "e"],
+            name="YLABEL",
+        )
+        color_args = ["r", "g", "b"]
+        ax = _check_plot_works(series.plot.pie, colors=color_args)
+
+        color_expected = ["r", "g", "b", "r", "g"]
+        _check_colors(ax.patches, facecolors=color_expected)
+
+    def test_pie_series_labels_and_colors(self):
+        series = Series(
+            np.random.default_rng(2).integers(1, 5),
+            index=["a", "b", "c", "d", "e"],
+            name="YLABEL",
+        )
+        # with labels and colors
+        labels = ["A", "B", "C", "D", "E"]
+        color_args = ["r", "g", "b", "c", "m"]
+        ax = _check_plot_works(series.plot.pie, labels=labels, colors=color_args)
+        _check_text_labels(ax.texts, labels)
+        _check_colors(ax.patches, facecolors=color_args)
+
+    def test_pie_series_autopct_and_fontsize(self):
+        series = Series(
+            np.random.default_rng(2).integers(1, 5),
+            index=["a", "b", "c", "d", "e"],
+            name="YLABEL",
+        )
+        color_args = ["r", "g", "b", "c", "m"]
+        ax = _check_plot_works(
+            series.plot.pie, colors=color_args, autopct="%.2f", fontsize=7
+        )
+        pcts = [f"{s * 100:.2f}" for s in series.values / series.sum()]
+        expected_texts = list(chain.from_iterable(zip(series.index, pcts, strict=True)))
+        _check_text_labels(ax.texts, expected_texts)
+        for t in ax.texts:
+            assert t.get_fontsize() == 7
+
+    def test_pie_series_negative_raises(self):
+        # includes negative value
+        series = Series([1, 2, 0, 4, -1], index=["a", "b", "c", "d", "e"])
+        with pytest.raises(ValueError, match="pie plot doesn't allow negative values"):
+            series.plot.pie()
+
+    def test_pie_series_nan(self):
+        # includes nan
+        series = Series([1, 2, np.nan, 4], index=["a", "b", "c", "d"], name="YLABEL")
+        ax = _check_plot_works(series.plot.pie)
+        _check_text_labels(ax.texts, ["a", "b", "", "d"])
+
+    def test_pie_nan(self):
+        s = Series([1, np.nan, 1, 1])
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot.pie(legend=True, ax=ax)
+        expected = ["0", "", "2", "3"]
+        result = [x.get_text() for x in ax.texts]
+        assert result == expected
+
+    def test_df_series_secondary_legend(self):
+        # GH 9779
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
+        )
+        s = Series(np.random.default_rng(2).standard_normal(10), name="x")
+
+        # primary -> secondary (without passing ax)
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot(ax=ax)
+        s.plot(legend=True, secondary_y=True, ax=ax)
+        # both legends are drawn on left ax
+        # left and right axis must be visible
+        _check_legend_labels(ax, labels=["a", "b", "c", "x (right)"])
+        assert ax.get_yaxis().get_visible()
+        assert ax.right_ax.get_yaxis().get_visible()
+
+    def test_df_series_secondary_legend_both(self):
+        # GH 9779
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
+        )
+        s = Series(np.random.default_rng(2).standard_normal(10), name="x")
+        # secondary -> secondary (without passing ax)
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot(secondary_y=True, ax=ax)
+        s.plot(legend=True, secondary_y=True, ax=ax)
+        # both legends are drawn on left ax
+        # left axis must be invisible and right axis must be visible
+        expected = ["a (right)", "b (right)", "c (right)", "x (right)"]
+        _check_legend_labels(ax.left_ax, labels=expected)
+        assert not ax.left_ax.get_yaxis().get_visible()
+        assert ax.get_yaxis().get_visible()
+
+    def test_df_series_secondary_legend_both_with_axis_2(self):
+        # GH 9779
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((10, 3)), columns=list("abc")
+        )
+        s = Series(np.random.default_rng(2).standard_normal(10), name="x")
+        # secondary -> secondary (with passing ax)
+        _, ax = mpl.pyplot.subplots()
+        ax = df.plot(secondary_y=True, mark_right=False, ax=ax)
+        s.plot(ax=ax, legend=True, secondary_y=True)
+        # both legends are drawn on left ax
+        # left axis must be invisible and right axis must be visible
+        expected = ["a", "b", "c", "x (right)"]
+        _check_legend_labels(ax.left_ax, expected)
+        assert not ax.left_ax.get_yaxis().get_visible()
+        assert ax.get_yaxis().get_visible()
+
+    @pytest.mark.parametrize(
+        "input_logy, expected_scale", [(True, "log"), ("sym", "symlog")]
+    )
+    @pytest.mark.parametrize("secondary_kwarg", [{}, {"secondary_y": True}])
+    def test_secondary_logy(self, input_logy, expected_scale, secondary_kwarg):
+        # GH 25545, GH 24980
+        s1 = Series(np.random.default_rng(2).standard_normal(10))
+        ax1 = s1.plot(logy=input_logy, **secondary_kwarg)
+        assert ax1.get_yscale() == expected_scale
+
+    def test_plot_fails_with_dupe_color_and_style(self):
+        x = Series(np.random.default_rng(2).standard_normal(2))
+        _, ax = mpl.pyplot.subplots()
+        msg = (
+            "Cannot pass 'style' string with a color symbol and 'color' keyword "
+            "argument. Please use one or the other or pass 'style' without a color "
+            "symbol"
+        )
+        with pytest.raises(ValueError, match=msg):
+            x.plot(style="k--", color="k", ax=ax)
+
+    @pytest.mark.parametrize(
+        "bw_method, ind",
+        [
+            ["scott", 20],
+            [None, 20],
+            [None, np.int_(20)],
+            [0.5, np.linspace(-100, 100, 20)],
+        ],
+    )
+    def test_kde_kwargs(self, ts, bw_method, ind):
+        pytest.importorskip("scipy")
+        _check_plot_works(ts.plot.kde, bw_method=bw_method, ind=ind)
+
+    @pytest.mark.parametrize(
+        "bw_method, ind, weights",
+        [
+            ["scott", 20, None],
+            [None, 20, None],
+            [None, np.int_(20), None],
+            [0.5, np.linspace(-100, 100, 20), None],
+            ["scott", 40, np.linspace(0.0, 2.0, 50)],
+        ],
+    )
+    def test_kde_kwargs_weights(self, bw_method, ind, weights):
+        # GH59337
+        pytest.importorskip("scipy")
+        s = Series(np.random.default_rng(2).uniform(size=50))
+        _check_plot_works(s.plot.kde, bw_method=bw_method, ind=ind, weights=weights)
+
+    def test_density_kwargs(self, ts):
+        pytest.importorskip("scipy")
+        sample_points = np.linspace(-100, 100, 20)
+        _check_plot_works(ts.plot.density, bw_method=0.5, ind=sample_points)
+
+    def test_kde_kwargs_check_axes(self, ts):
+        pytest.importorskip("scipy")
+        _, ax = mpl.pyplot.subplots()
+        sample_points = np.linspace(-100, 100, 20)
+        ax = ts.plot.kde(logy=True, bw_method=0.5, ind=sample_points, ax=ax)
+        _check_ax_scales(ax, yaxis="log")
+        _check_text_labels(ax.yaxis.get_label(), "Density")
+
+    def test_kde_missing_vals(self):
+        pytest.importorskip("scipy")
+        s = Series(np.random.default_rng(2).uniform(size=50))
+        s[0] = np.nan
+        axes = _check_plot_works(s.plot.kde)
+
+        # gh-14821: check if the values have any missing values
+        assert any(~np.isnan(axes.lines[0].get_xdata()))
+
+    @pytest.mark.xfail(reason="Api changed in 3.6.0")
+    def test_boxplot_series(self, ts):
+        _, ax = mpl.pyplot.subplots()
+        ax = ts.plot.box(logy=True, ax=ax)
+        _check_ax_scales(ax, yaxis="log")
+        xlabels = ax.get_xticklabels()
+        _check_text_labels(xlabels, [ts.name])
+        ylabels = ax.get_yticklabels()
+        _check_text_labels(ylabels, [""] * len(ylabels))
+
+    @pytest.mark.parametrize(
+        "kind",
+        plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds,
+    )
+    def test_kind_kwarg(self, kind):
+        pytest.importorskip("scipy")
+        s = Series(range(3))
+        _, ax = mpl.pyplot.subplots()
+        s.plot(kind=kind, ax=ax)
+        mpl.pyplot.close()
+
+    @pytest.mark.parametrize(
+        "kind",
+        plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds,
+    )
+    def test_kind_attr(self, kind):
+        pytest.importorskip("scipy")
+        s = Series(range(3))
+        _, ax = mpl.pyplot.subplots()
+        getattr(s.plot, kind)()
+        mpl.pyplot.close()
+
+    @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds)
+    def test_invalid_plot_data(self, kind):
+        s = Series(list("abcd"))
+        _, ax = mpl.pyplot.subplots()
+        msg = "no numeric data to plot"
+        with pytest.raises(TypeError, match=msg):
+            s.plot(kind=kind, ax=ax)
+
+    @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds)
+    def test_valid_object_plot(self, kind):
+        pytest.importorskip("scipy")
+        s = Series(range(10), dtype=object)
+        _check_plot_works(s.plot, kind=kind)
+
+    @pytest.mark.parametrize("kind", plotting.PlotAccessor._common_kinds)
+    def test_partially_invalid_plot_data(self, kind):
+        s = Series(["a", "b", 1.0, 2])
+        _, ax = mpl.pyplot.subplots()
+        msg = "no numeric data to plot"
+        with pytest.raises(TypeError, match=msg):
+            s.plot(kind=kind, ax=ax)
+
+    def test_invalid_kind(self):
+        s = Series([1, 2])
+        with pytest.raises(ValueError, match="invalid_kind is not a valid plot kind"):
+            s.plot(kind="invalid_kind")
+
+    def test_dup_datetime_index_plot(self):
+        dr1 = date_range("1/1/2009", periods=4)
+        dr2 = date_range("1/2/2009", periods=4)
+        index = dr1.append(dr2)
+        values = np.random.default_rng(2).standard_normal(index.size)
+        s = Series(values, index=index)
+        _check_plot_works(s.plot)
+
+    def test_errorbar_asymmetrical(self):
+        # GH9536
+        s = Series(np.arange(10), name="x")
+        err = np.random.default_rng(2).random((2, 10))
+
+        ax = s.plot(yerr=err, xerr=err)
+
+        result = np.vstack([i.vertices[:, 1] for i in ax.collections[1].get_paths()])
+        expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_errorbar_asymmetrical_error(self):
+        # GH9536
+        s = Series(np.arange(10), name="x")
+        msg = (
+            "Asymmetrical error bars should be provided "
+            f"with the shape \\(2, {len(s)}\\)"
+        )
+        with pytest.raises(ValueError, match=msg):
+            s.plot(yerr=np.random.default_rng(2).random((2, 11)))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("kind", ["line", "bar"])
+    @pytest.mark.parametrize(
+        "yerr",
+        [
+            Series(np.abs(np.random.default_rng(2).standard_normal(10))),
+            np.abs(np.random.default_rng(2).standard_normal(10)),
+            list(np.abs(np.random.default_rng(2).standard_normal(10))),
+            DataFrame(
+                np.abs(np.random.default_rng(2).standard_normal((10, 2))),
+                columns=["x", "y"],
+            ),
+        ],
+    )
+    def test_errorbar_plot(self, kind, yerr):
+        s = Series(np.arange(10), name="x")
+        ax = _check_plot_works(s.plot, yerr=yerr, kind=kind)
+        _check_has_errorbars(ax, xerr=0, yerr=1)
+
+    @pytest.mark.slow
+    def test_errorbar_plot_yerr_0(self):
+        s = Series(np.arange(10), name="x")
+        s_err = np.abs(np.random.default_rng(2).standard_normal(10))
+        ax = _check_plot_works(s.plot, xerr=s_err)
+        _check_has_errorbars(ax, xerr=1, yerr=0)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "yerr",
+        [
+            Series(np.abs(np.random.default_rng(2).standard_normal(12))),
+            DataFrame(
+                np.abs(np.random.default_rng(2).standard_normal((12, 2))),
+                columns=["x", "y"],
+            ),
+        ],
+    )
+    def test_errorbar_plot_ts(self, yerr):
+        # test time series plotting
+        ix = date_range("1/1/2000", "1/1/2001", freq="ME")
+        ts = Series(np.arange(12), index=ix, name="x")
+        yerr.index = ix
+
+        ax = _check_plot_works(ts.plot, yerr=yerr)
+        _check_has_errorbars(ax, xerr=0, yerr=1)
+
+    @pytest.mark.slow
+    def test_errorbar_plot_invalid_yerr_shape(self):
+        s = Series(np.arange(10), name="x")
+        # check incorrect lengths and types
+        with tm.external_error_raised(ValueError):
+            s.plot(yerr=np.arange(11))
+
+    @pytest.mark.slow
+    def test_errorbar_plot_invalid_yerr(self):
+        s = Series(np.arange(10), name="x")
+        s_err = ["zzz"] * 10
+        with tm.external_error_raised(TypeError):
+            s.plot(yerr=s_err)
+
+    @pytest.mark.slow
+    def test_table_true(self, series):
+        _check_plot_works(series.plot, table=True)
+
+    @pytest.mark.slow
+    def test_table_self(self, series):
+        _check_plot_works(series.plot, table=series)
+
+    @pytest.mark.slow
+    def test_series_grid_settings(self):
+        # Make sure plot defaults to rcParams['axes.grid'] setting, GH 9792
+        pytest.importorskip("scipy")
+        _check_grid_settings(
+            Series([1, 2, 3]),
+            plotting.PlotAccessor._series_kinds + plotting.PlotAccessor._common_kinds,
+        )
+
+    @pytest.mark.parametrize("c", ["r", "red", "green", "#FF0000"])
+    def test_standard_colors(self, c):
+        result = get_standard_colors(1, color=c)
+        assert result == [c]
+
+        result = get_standard_colors(1, color=[c])
+        assert result == [c]
+
+        result = get_standard_colors(3, color=c)
+        assert result == [c] * 3
+
+        result = get_standard_colors(3, color=[c])
+        assert result == [c] * 3
+
+    def test_standard_colors_all(self):
+        # multiple colors like mediumaquamarine
+        for c in mpl.colors.cnames:
+            result = get_standard_colors(num_colors=1, color=c)
+            assert result == [c]
+
+            result = get_standard_colors(num_colors=1, color=[c])
+            assert result == [c]
+
+            result = get_standard_colors(num_colors=3, color=c)
+            assert result == [c] * 3
+
+            result = get_standard_colors(num_colors=3, color=[c])
+            assert result == [c] * 3
+
+        # single letter colors like k
+        for c in mpl.colors.ColorConverter.colors:
+            result = get_standard_colors(num_colors=1, color=c)
+            assert result == [c]
+
+            result = get_standard_colors(num_colors=1, color=[c])
+            assert result == [c]
+
+            result = get_standard_colors(num_colors=3, color=c)
+            assert result == [c] * 3
+
+            result = get_standard_colors(num_colors=3, color=[c])
+            assert result == [c] * 3
+
+    def test_series_plot_color_kwargs(self):
+        # GH1890
+        _, ax = mpl.pyplot.subplots()
+        ax = Series(np.arange(12) + 1).plot(color="green", ax=ax)
+        _check_colors(ax.get_lines(), linecolors=["green"])
+
+    def test_time_series_plot_color_kwargs(self):
+        # #1890
+        _, ax = mpl.pyplot.subplots()
+        ax = Series(np.arange(12) + 1, index=date_range("1/1/2000", periods=12)).plot(
+            color="green", ax=ax
+        )
+        _check_colors(ax.get_lines(), linecolors=["green"])
+
+    def test_time_series_plot_color_with_empty_kwargs(self):
+        def_colors = _unpack_cycler(mpl.rcParams)
+        index = date_range("1/1/2000", periods=12)
+        s = Series(np.arange(1, 13), index=index)
+
+        ncolors = 3
+
+        _, ax = mpl.pyplot.subplots()
+        for i in range(ncolors):
+            ax = s.plot(ax=ax)
+        _check_colors(ax.get_lines(), linecolors=def_colors[:ncolors])
+
+    def test_xticklabels(self):
+        # GH11529
+        s = Series(np.arange(10), index=[f"P{i:02d}" for i in range(10)])
+        _, ax = mpl.pyplot.subplots()
+        ax = s.plot(xticks=[0, 3, 5, 9], ax=ax)
+        exp = [f"P{i:02d}" for i in [0, 3, 5, 9]]
+        _check_text_labels(ax.get_xticklabels(), exp)
+
+    def test_xtick_barPlot(self):
+        # GH28172
+        s = Series(range(10), index=[f"P{i:02d}" for i in range(10)])
+        ax = s.plot.bar(xticks=range(0, 11, 2))
+        exp = np.array(list(range(0, 11, 2)))
+        tm.assert_numpy_array_equal(exp, ax.get_xticks())
+
+    def test_custom_business_day_freq(self):
+        # GH7222
+        s = Series(
+            range(100, 121),
+            index=pd.bdate_range(
+                start="2014-05-01",
+                end="2014-06-01",
+                freq=CustomBusinessDay(holidays=["2014-05-26"]),
+            ),
+        )
+
+        _check_plot_works(s.plot)
+
+    @pytest.mark.xfail(
+        reason="GH#24426, see also "
+        "github.com/pandas-dev/pandas/commit/"
+        "ef1bd69fa42bbed5d09dd17f08c44fc8bfc2b685#r61470674"
+    )
+    def test_plot_accessor_updates_on_inplace(self):
+        ser = Series([1, 2, 3, 4])
+        _, ax = mpl.pyplot.subplots()
+        ax = ser.plot(ax=ax)
+        before = ax.xaxis.get_ticklocs()
+
+        ser.drop([0, 1], inplace=True)
+        _, ax = mpl.pyplot.subplots()
+        after = ax.xaxis.get_ticklocs()
+        tm.assert_numpy_array_equal(before, after)
+
+    @pytest.mark.parametrize("kind", ["line", "area"])
+    def test_plot_xlim_for_series(self, kind):
+        # test if xlim is also correctly plotted in Series for line and area
+        # GH 27686
+        s = Series([2, 3])
+        _, ax = mpl.pyplot.subplots()
+        s.plot(kind=kind, ax=ax)
+        xlims = ax.get_xlim()
+
+        assert xlims[0] < 0
+        assert xlims[1] > 1
+
+    def test_plot_no_rows(self):
+        # GH 27758
+        df = Series(dtype=int)
+        assert df.empty
+        ax = df.plot()
+        assert len(ax.get_lines()) == 1
+        line = ax.get_lines()[0]
+        assert len(line.get_xdata()) == 0
+        assert len(line.get_ydata()) == 0
+
+    def test_plot_no_numeric_data(self):
+        df = Series(["a", "b", "c"])
+        with pytest.raises(TypeError, match="no numeric data to plot"):
+            df.plot()
+
+    @pytest.mark.parametrize(
+        "data, index",
+        [
+            ([1, 2, 3, 4], [3, 2, 1, 0]),
+            ([10, 50, 20, 30], [1910, 1920, 1980, 1950]),
+        ],
+    )
+    def test_plot_order(self, data, index):
+        # GH38865 Verify plot order of a Series
+        ser = Series(data=data, index=index)
+        ax = ser.plot(kind="bar")
+
+        expected = ser.tolist()
+        result = [
+            patch.get_bbox().ymax
+            for patch in sorted(ax.patches, key=lambda patch: patch.get_bbox().xmax)
+        ]
+        assert expected == result
+
+    def test_style_single_ok(self):
+        s = Series([1, 2])
+        ax = s.plot(style="s", color="C3")
+        assert ax.lines[0].get_color() == "C3"
+
+    @pytest.mark.parametrize(
+        "index_name, old_label, new_label",
+        [(None, "", "new"), ("old", "old", "new"), (None, "", "")],
+    )
+    @pytest.mark.parametrize("kind", ["line", "area", "bar", "barh", "hist"])
+    def test_xlabel_ylabel_series(self, kind, index_name, old_label, new_label):
+        # GH 9093
+        ser = Series([1, 2, 3, 4])
+        ser.index.name = index_name
+
+        # default is the ylabel is not shown and xlabel is index name (reverse for barh)
+        ax = ser.plot(kind=kind)
+        if kind == "barh":
+            assert ax.get_xlabel() == ""
+            assert ax.get_ylabel() == old_label
+        elif kind == "hist":
+            assert ax.get_xlabel() == ""
+            assert ax.get_ylabel() == "Frequency"
+        else:
+            assert ax.get_ylabel() == ""
+            assert ax.get_xlabel() == old_label
+
+        # old xlabel will be overridden and assigned ylabel will be used as ylabel
+        ax = ser.plot(kind=kind, ylabel=new_label, xlabel=new_label)
+        assert ax.get_ylabel() == new_label
+        assert ax.get_xlabel() == new_label
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            pd.timedelta_range(start=0, periods=2, freq="D"),
+            [pd.Timedelta(days=1), pd.Timedelta(days=2)],
+        ],
+    )
+    def test_timedelta_index(self, index):
+        # GH37454
+        xlims = (3, 1)
+        ax = Series([1, 2], index=index).plot(xlim=(xlims))
+        assert ax.get_xlim() == (3, 1)
+
+    def test_series_none_color(self):
+        # GH51953
+        series = Series([1, 2, 3])
+        ax = series.plot(color=None)
+        expected = _unpack_cycler(mpl.pyplot.rcParams)[:1]
+        _check_colors(ax.get_lines(), linecolors=expected)
+
+    @pytest.mark.slow
+    def test_plot_no_warning(self, ts):
+        # GH 55138
+        # TODO(3.0): this can be removed once Period[B] deprecation is enforced
+        with tm.assert_produces_warning(False):
+            _ = ts.plot()
+
+    def test_secondary_y_subplot_axis_labels(self):
+        # GH#14102
+        s1 = Series([5, 7, 6, 8, 7], index=[1, 2, 3, 4, 5])
+        s2 = Series([6, 4, 5, 3, 4], index=[1, 2, 3, 4, 5])
+
+        ax = plt.subplot(2, 1, 1)
+        s1.plot(ax=ax)
+        s2.plot(ax=ax, secondary_y=True)
+        ax2 = plt.subplot(2, 1, 2)
+        s1.plot(ax=ax2)
+        assert len(ax.xaxis.get_minor_ticks()) == 0
+        assert len(ax.get_xticklabels()) > 0
+
+    def test_bar_line_plot(self):
+        """
+        Test that bar and line plots with the same x values are superposed
+        and that the x limits are set such that the plots are visible.
+        """
+        # GH61161
+        index = period_range("2023", periods=3, freq="Y")
+        years = set(index.year.astype(str))
+        s = Series([1, 2, 3], index=index)
+        ax = plt.subplot()
+        s.plot(kind="bar", ax=ax)
+        bar_xticks = [
+            label for label in ax.get_xticklabels() if label.get_text() in years
+        ]
+        s.plot(kind="line", ax=ax, color="r")
+        line_xticks = [
+            label for label in ax.get_xticklabels() if label.get_text() in years
+        ]
+        assert len(bar_xticks) == len(index)
+        assert bar_xticks == line_xticks
+        x_limits = ax.get_xlim()
+        assert x_limits[0] <= bar_xticks[0].get_position()[0]
+        assert x_limits[1] >= bar_xticks[-1].get_position()[0]
diff --git a/pandas/tests/plotting/test_style.py b/pandas/tests/plotting/test_style.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9c89e0a7893f501b2f8f7f0916ca1a008201b7d
--- /dev/null
+++ b/pandas/tests/plotting/test_style.py
@@ -0,0 +1,149 @@
+import pytest
+
+from pandas import Series
+
+mpl = pytest.importorskip("matplotlib")
+plt = pytest.importorskip("matplotlib.pyplot")
+from pandas.plotting._matplotlib.style import get_standard_colors
+
+
+class TestGetStandardColors:
+    @pytest.mark.parametrize(
+        "num_colors, expected",
+        [
+            (3, ["red", "green", "blue"]),
+            (5, ["red", "green", "blue", "red", "green"]),
+            (7, ["red", "green", "blue", "red", "green", "blue", "red"]),
+            (2, ["red", "green"]),
+            (1, ["red"]),
+        ],
+    )
+    def test_default_colors_named_from_prop_cycle(self, num_colors, expected):
+        mpl_params = {
+            "axes.prop_cycle": plt.cycler(color=["red", "green", "blue"]),
+        }
+        with mpl.rc_context(rc=mpl_params):
+            result = get_standard_colors(num_colors=num_colors)
+            assert result == expected
+
+    @pytest.mark.parametrize(
+        "num_colors, expected",
+        [
+            (1, ["b"]),
+            (3, ["b", "g", "r"]),
+            (4, ["b", "g", "r", "y"]),
+            (5, ["b", "g", "r", "y", "b"]),
+            (7, ["b", "g", "r", "y", "b", "g", "r"]),
+        ],
+    )
+    def test_default_colors_named_from_prop_cycle_string(self, num_colors, expected):
+        mpl_params = {
+            "axes.prop_cycle": plt.cycler(color="bgry"),
+        }
+        with mpl.rc_context(rc=mpl_params):
+            result = get_standard_colors(num_colors=num_colors)
+            assert result == expected
+
+    @pytest.mark.parametrize(
+        "num_colors, expected_name",
+        [
+            (1, ["C0"]),
+            (3, ["C0", "C1", "C2"]),
+            (
+                12,
+                [
+                    "C0",
+                    "C1",
+                    "C2",
+                    "C3",
+                    "C4",
+                    "C5",
+                    "C6",
+                    "C7",
+                    "C8",
+                    "C9",
+                    "C0",
+                    "C1",
+                ],
+            ),
+        ],
+    )
+    def test_default_colors_named_undefined_prop_cycle(self, num_colors, expected_name):
+        with mpl.rc_context(rc={}):
+            expected = [mpl.colors.to_hex(x) for x in expected_name]
+            result = get_standard_colors(num_colors=num_colors)
+            assert result == expected
+
+    @pytest.mark.parametrize(
+        "num_colors, expected",
+        [
+            (1, ["red", "green", (0.1, 0.2, 0.3)]),
+            (2, ["red", "green", (0.1, 0.2, 0.3)]),
+            (3, ["red", "green", (0.1, 0.2, 0.3)]),
+            (4, ["red", "green", (0.1, 0.2, 0.3), "red"]),
+        ],
+    )
+    def test_user_input_color_sequence(self, num_colors, expected):
+        color = ["red", "green", (0.1, 0.2, 0.3)]
+        result = get_standard_colors(color=color, num_colors=num_colors)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "num_colors, expected",
+        [
+            (1, ["r", "g", "b", "k"]),
+            (2, ["r", "g", "b", "k"]),
+            (3, ["r", "g", "b", "k"]),
+            (4, ["r", "g", "b", "k"]),
+            (5, ["r", "g", "b", "k", "r"]),
+            (6, ["r", "g", "b", "k", "r", "g"]),
+        ],
+    )
+    def test_user_input_color_string(self, num_colors, expected):
+        color = "rgbk"
+        result = get_standard_colors(color=color, num_colors=num_colors)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "num_colors, expected",
+        [
+            (1, [(0.1, 0.2, 0.3)]),
+            (2, [(0.1, 0.2, 0.3), (0.1, 0.2, 0.3)]),
+            (3, [(0.1, 0.2, 0.3), (0.1, 0.2, 0.3), (0.1, 0.2, 0.3)]),
+        ],
+    )
+    def test_user_input_color_floats(self, num_colors, expected):
+        color = (0.1, 0.2, 0.3)
+        result = get_standard_colors(color=color, num_colors=num_colors)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "color, num_colors, expected",
+        [
+            ("Crimson", 1, ["Crimson"]),
+            ("DodgerBlue", 2, ["DodgerBlue", "DodgerBlue"]),
+            ("firebrick", 3, ["firebrick", "firebrick", "firebrick"]),
+        ],
+    )
+    def test_user_input_named_color_string(self, color, num_colors, expected):
+        result = get_standard_colors(color=color, num_colors=num_colors)
+        assert result == expected
+
+    @pytest.mark.parametrize("color", ["", [], (), Series([], dtype="object")])
+    def test_empty_color_raises(self, color):
+        with pytest.raises(ValueError, match="Invalid color argument"):
+            get_standard_colors(color=color, num_colors=1)
+
+    @pytest.mark.parametrize(
+        "color",
+        [
+            "bad_color",
+            ("red", "green", "bad_color"),
+            (0.1,),
+            (0.1, 0.2),
+            (0.1, 0.2, 0.3, 0.4, 0.5),  # must be either 3 or 4 floats
+        ],
+    )
+    def test_bad_color_raises(self, color):
+        with pytest.raises(ValueError, match="Invalid color"):
+            get_standard_colors(color=color, num_colors=5)
diff --git a/pandas/tests/resample/__init__.py b/pandas/tests/resample/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c45ece5d8fb963d32945877b4af6521ea7b9450
--- /dev/null
+++ b/pandas/tests/resample/conftest.py
@@ -0,0 +1,33 @@
+import pytest
+
+# The various methods we support
+downsample_methods = [
+    "min",
+    "max",
+    "first",
+    "last",
+    "sum",
+    "mean",
+    "sem",
+    "median",
+    "prod",
+    "var",
+    "std",
+    "ohlc",
+    "quantile",
+]
+upsample_methods = ["count", "size"]
+series_methods = ["nunique"]
+resample_methods = downsample_methods + upsample_methods + series_methods
+
+
+@pytest.fixture(params=downsample_methods)
+def downsample_method(request):
+    """Fixture for parametrization of Grouper downsample methods."""
+    return request.param
+
+
+@pytest.fixture(params=resample_methods)
+def resample_method(request):
+    """Fixture for parametrization of Grouper resample methods."""
+    return request.param
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..359ad72bd67f32d472d502550dfb28285b33a4af
--- /dev/null
+++ b/pandas/tests/resample/test_base.py
@@ -0,0 +1,554 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+from pandas.core.dtypes.common import is_extension_array_dtype
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    NaT,
+    PeriodIndex,
+    Series,
+    TimedeltaIndex,
+)
+import pandas._testing as tm
+from pandas.core.groupby.groupby import DataError
+from pandas.core.groupby.grouper import Grouper
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import period_range
+from pandas.core.indexes.timedeltas import timedelta_range
+from pandas.core.resample import _asfreq_compat
+
+
+@pytest.fixture(
+    params=[
+        "linear",
+        "time",
+        "index",
+        "values",
+        "nearest",
+        "zero",
+        "slinear",
+        "quadratic",
+        "cubic",
+        "barycentric",
+        "krogh",
+        "from_derivatives",
+        "piecewise_polynomial",
+        "pchip",
+        "akima",
+    ],
+)
+def all_1d_no_arg_interpolation_methods(request):
+    return request.param
+
+
+@pytest.mark.parametrize("freq", ["2D", "1h"])
+@pytest.mark.parametrize(
+    "index",
+    [
+        timedelta_range("1 day", "10 day", freq="D"),
+        date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"),
+    ],
+)
+def test_asfreq(frame_or_series, index, freq):
+    obj = frame_or_series(range(len(index)), index=index)
+    idx_range = date_range if isinstance(index, DatetimeIndex) else timedelta_range
+
+    result = obj.resample(freq).asfreq()
+    new_index = idx_range(obj.index[0], obj.index[-1], freq=freq)
+    expected = obj.reindex(new_index)
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        timedelta_range("1 day", "10 day", freq="D"),
+        date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"),
+    ],
+)
+def test_asfreq_fill_value(index):
+    # test for fill value during resampling, issue 3715
+
+    ser = Series(range(len(index)), index=index, name="a")
+    idx_range = date_range if isinstance(index, DatetimeIndex) else timedelta_range
+
+    result = ser.resample("1h").asfreq()
+    new_index = idx_range(ser.index[0], ser.index[-1], freq="1h")
+    expected = ser.reindex(new_index)
+    tm.assert_series_equal(result, expected)
+
+    # Explicit cast to float to avoid implicit cast when setting None
+    frame = ser.astype("float").to_frame("value")
+    frame.iloc[1] = None
+    result = frame.resample("1h").asfreq(fill_value=4.0)
+    new_index = idx_range(frame.index[0], frame.index[-1], freq="1h")
+    expected = frame.reindex(new_index, fill_value=4.0)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        timedelta_range("1 day", "3 day", freq="D"),
+        date_range(datetime(2005, 1, 1), datetime(2005, 1, 3), freq="D"),
+        period_range(datetime(2005, 1, 1), datetime(2005, 1, 3), freq="D"),
+    ],
+)
+def test_resample_interpolate(index):
+    # GH#12925
+    df = DataFrame(range(len(index)), index=index)
+    result = df.resample("1min").asfreq().interpolate()
+    expected = df.resample("1min").interpolate()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_interpolate_inplace_deprecated():
+    # GH#58690
+    dti = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+
+    df = DataFrame(range(len(dti)), index=dti)
+    rs = df.resample("1min")
+    msg = "The 'inplace' keyword in DatetimeIndexResampler.interpolate"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        rs.interpolate(inplace=False)
+
+    msg2 = "Cannot interpolate inplace on a resampled object"
+    with pytest.raises(ValueError, match=msg2):
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            rs.interpolate(inplace=True)
+
+
+def test_resample_interpolate_regular_sampling_off_grid(
+    all_1d_no_arg_interpolation_methods,
+):
+    pytest.importorskip("scipy")
+    # GH#21351
+    index = date_range("2000-01-01 00:01:00", periods=5, freq="2h")
+    ser = Series(np.arange(5.0), index)
+
+    method = all_1d_no_arg_interpolation_methods
+    result = ser.resample("1h").interpolate(method)
+
+    if method == "linear":
+        values = np.repeat(np.arange(0.0, 4.0), 2) + np.tile([1 / 3, 2 / 3], 4)
+    elif method == "nearest":
+        values = np.repeat(np.arange(0.0, 5.0), 2)[1:-1]
+    elif method == "zero":
+        values = np.repeat(np.arange(0.0, 4.0), 2)
+    else:
+        values = 0.491667 + np.arange(0.0, 4.0, 0.5)
+    values = np.insert(values, 0, np.nan)
+    index = date_range("2000-01-01 00:00:00", periods=9, freq="1h")
+    expected = Series(values, index=index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods):
+    pytest.importorskip("scipy")
+    # GH#21351
+    ser = Series(
+        np.linspace(0.0, 1.0, 5),
+        index=DatetimeIndex(
+            [
+                "2000-01-01 00:00:03",
+                "2000-01-01 00:00:22",
+                "2000-01-01 00:00:24",
+                "2000-01-01 00:00:31",
+                "2000-01-01 00:00:39",
+            ]
+        ),
+    )
+
+    # Resample to 5 second sampling and interpolate with the given method
+    ser_resampled = ser.resample("5s").interpolate(all_1d_no_arg_interpolation_methods)
+
+    # Check that none of the resampled values are NaN, except the first one
+    # which lies 3 seconds before the first actual data point
+    assert np.isnan(ser_resampled.iloc[0])
+    assert not ser_resampled.iloc[1:].isna().any()
+
+
+def test_raises_on_non_datetimelike_index():
+    # this is a non datetimelike index
+    xp = DataFrame()
+    msg = (
+        "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, "
+        "but got an instance of 'RangeIndex'"
+    )
+    with pytest.raises(TypeError, match=msg):
+        xp.resample("YE")
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        PeriodIndex([], freq="D", name="a"),
+        DatetimeIndex([], name="a"),
+        TimedeltaIndex([], name="a"),
+    ],
+)
+@pytest.mark.parametrize("freq", ["ME", "D", "h"])
+def test_resample_empty_series(freq, index, resample_method):
+    # GH12771 & GH12868
+
+    ser = Series(index=index, dtype=float)
+    if freq == "ME" and isinstance(ser.index, TimedeltaIndex):
+        msg = (
+            "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+            "e.g. '24h' or '3D', not <MonthEnd>"
+        )
+        with pytest.raises(ValueError, match=msg):
+            ser.resample(freq)
+        return
+    elif freq == "ME" and isinstance(ser.index, PeriodIndex):
+        # index is PeriodIndex, so convert to corresponding Period freq
+        freq = "M"
+    rs = ser.resample(freq)
+    result = getattr(rs, resample_method)()
+
+    if resample_method == "ohlc":
+        expected = DataFrame(
+            [], index=ser.index[:0], columns=["open", "high", "low", "close"]
+        )
+        expected.index = _asfreq_compat(ser.index, freq)
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+    else:
+        expected = ser.copy()
+        expected.index = _asfreq_compat(ser.index, freq)
+        tm.assert_series_equal(result, expected, check_dtype=False)
+
+    tm.assert_index_equal(result.index, expected.index)
+    assert result.index.freq == expected.index.freq
+
+
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_resample_empty_sum_string(string_dtype_no_object, min_count):
+    # https://github.com/pandas-dev/pandas/issues/60229
+    dtype = string_dtype_no_object
+    ser = Series(
+        pd.NA,
+        index=DatetimeIndex(
+            [
+                "2000-01-01 00:00:00",
+                "2000-01-01 00:00:10",
+                "2000-01-01 00:00:20",
+                "2000-01-01 00:00:30",
+            ]
+        ),
+        dtype=dtype,
+    )
+    rs = ser.resample("20s")
+    result = rs.sum(min_count=min_count)
+
+    value = "" if min_count == 0 else pd.NA
+    index = date_range(start="2000-01-01", freq="20s", periods=2, unit="us")
+    expected = Series(value, index=index, dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "freq",
+    [
+        pytest.param("ME", marks=pytest.mark.xfail(reason="Don't know why this fails")),
+        "D",
+        "h",
+    ],
+)
+def test_resample_nat_index_series(freq, resample_method):
+    # GH39227
+
+    ser = Series(range(5), index=PeriodIndex([NaT] * 5, freq=freq))
+
+    rs = ser.resample(freq)
+    result = getattr(rs, resample_method)()
+
+    if resample_method == "ohlc":
+        expected = DataFrame(
+            [], index=ser.index[:0], columns=["open", "high", "low", "close"]
+        )
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+    else:
+        expected = ser[:0].copy()
+        tm.assert_series_equal(result, expected, check_dtype=False)
+    tm.assert_index_equal(result.index, expected.index)
+    assert result.index.freq == expected.index.freq
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        PeriodIndex([], freq="D", name="a"),
+        DatetimeIndex([], name="a"),
+        TimedeltaIndex([], name="a"),
+    ],
+)
+@pytest.mark.parametrize("freq", ["ME", "D", "h"])
+@pytest.mark.parametrize("resample_method", ["count", "size"])
+def test_resample_count_empty_series(freq, index, resample_method):
+    # GH28427
+    ser = Series(index=index)
+    if freq == "ME" and isinstance(ser.index, TimedeltaIndex):
+        msg = (
+            "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+            "e.g. '24h' or '3D', not <MonthEnd>"
+        )
+        with pytest.raises(ValueError, match=msg):
+            ser.resample(freq)
+        return
+    elif freq == "ME" and isinstance(ser.index, PeriodIndex):
+        # index is PeriodIndex, so convert to corresponding Period freq
+        freq = "M"
+    rs = ser.resample(freq)
+
+    result = getattr(rs, resample_method)()
+
+    index = _asfreq_compat(ser.index, freq)
+
+    expected = Series([], dtype="int64", index=index, name=ser.name)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index", [DatetimeIndex([]), TimedeltaIndex([]), PeriodIndex([], freq="D")]
+)
+@pytest.mark.parametrize("freq", ["ME", "D", "h"])
+def test_resample_empty_dataframe(index, freq, resample_method):
+    # GH13212
+    df = DataFrame(index=index)
+    # count retains dimensions too
+    if freq == "ME" and isinstance(df.index, TimedeltaIndex):
+        msg = (
+            "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+            "e.g. '24h' or '3D', not <MonthEnd>"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.resample(freq, group_keys=False)
+        return
+    elif freq == "ME" and isinstance(df.index, PeriodIndex):
+        # index is PeriodIndex, so convert to corresponding Period freq
+        freq = "M"
+    rs = df.resample(freq, group_keys=False)
+    result = getattr(rs, resample_method)()
+    if resample_method == "ohlc":
+        # TODO: no tests with len(df.columns) > 0
+        mi = MultiIndex.from_product([df.columns, ["open", "high", "low", "close"]])
+        expected = DataFrame([], index=df.index[:0], columns=mi, dtype=np.float64)
+        expected.index = _asfreq_compat(df.index, freq)
+
+    elif resample_method != "size":
+        expected = df.copy()
+    else:
+        # GH14962
+        expected = Series([], dtype=np.int64)
+
+    expected.index = _asfreq_compat(df.index, freq)
+
+    tm.assert_index_equal(result.index, expected.index)
+    assert result.index.freq == expected.index.freq
+    tm.assert_almost_equal(result, expected)
+
+    # test size for GH13212 (currently stays as df)
+
+
+@pytest.mark.parametrize(
+    "index", [DatetimeIndex([]), TimedeltaIndex([]), PeriodIndex([], freq="D")]
+)
+@pytest.mark.parametrize("freq", ["ME", "D", "h"])
+def test_resample_count_empty_dataframe(freq, index):
+    # GH28427
+    empty_frame_dti = DataFrame(index=index, columns=Index(["a"], dtype=object))
+
+    if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex):
+        msg = (
+            "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+            "e.g. '24h' or '3D', not <MonthEnd>"
+        )
+        with pytest.raises(ValueError, match=msg):
+            empty_frame_dti.resample(freq)
+        return
+    elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex):
+        # index is PeriodIndex, so convert to corresponding Period freq
+        freq = "M"
+    result = empty_frame_dti.resample(freq).count()
+
+    index = _asfreq_compat(empty_frame_dti.index, freq)
+
+    expected = DataFrame(dtype="int64", index=index, columns=Index(["a"], dtype=object))
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index", [DatetimeIndex([]), TimedeltaIndex([]), PeriodIndex([], freq="D")]
+)
+@pytest.mark.parametrize("freq", ["ME", "D", "h"])
+def test_resample_size_empty_dataframe(freq, index):
+    # GH28427
+
+    empty_frame_dti = DataFrame(index=index, columns=Index(["a"], dtype=object))
+
+    if freq == "ME" and isinstance(empty_frame_dti.index, TimedeltaIndex):
+        msg = (
+            "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+            "e.g. '24h' or '3D', not <MonthEnd>"
+        )
+        with pytest.raises(ValueError, match=msg):
+            empty_frame_dti.resample(freq)
+        return
+    elif freq == "ME" and isinstance(empty_frame_dti.index, PeriodIndex):
+        # index is PeriodIndex, so convert to corresponding Period freq
+        freq = "M"
+    result = empty_frame_dti.resample(freq).size()
+
+    index = _asfreq_compat(empty_frame_dti.index, freq)
+
+    expected = Series([], dtype="int64", index=index)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("index", [DatetimeIndex([]), TimedeltaIndex([])])
+@pytest.mark.parametrize("freq", ["D", "h"])
+@pytest.mark.parametrize(
+    "method", ["ffill", "bfill", "nearest", "asfreq", "interpolate", "mean"]
+)
+def test_resample_apply_empty_dataframe(index, freq, method):
+    # GH#55572
+    empty_frame_dti = DataFrame(index=index)
+
+    rs = empty_frame_dti.resample(freq)
+    result = rs.apply(getattr(rs, method))
+
+    expected_index = _asfreq_compat(empty_frame_dti.index, freq)
+    expected = DataFrame([], index=expected_index)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        PeriodIndex([], freq="M", name="a"),
+        DatetimeIndex([], name="a"),
+        TimedeltaIndex([], name="a"),
+    ],
+)
+@pytest.mark.parametrize("dtype", [float, int, object, "datetime64[ns]"])
+def test_resample_empty_dtypes(index, dtype, resample_method):
+    # Empty series were sometimes causing a segfault (for the functions
+    # with Cython bounds-checking disabled) or an IndexError.  We just run
+    # them to ensure they no longer do.  (GH #10228)
+    empty_series_dti = Series([], index, dtype)
+    rs = empty_series_dti.resample("D", group_keys=False)
+    try:
+        getattr(rs, resample_method)()
+    except DataError:
+        # Ignore these since some combinations are invalid
+        # (ex: doing mean with dtype of np.object_)
+        pass
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        PeriodIndex([], freq="D", name="a"),
+        DatetimeIndex([], name="a"),
+        TimedeltaIndex([], name="a"),
+    ],
+)
+@pytest.mark.parametrize("freq", ["ME", "D", "h"])
+def test_apply_to_empty_series(index, freq):
+    # GH 14313
+    ser = Series(index=index)
+
+    if freq == "ME" and isinstance(ser.index, TimedeltaIndex):
+        msg = (
+            "Resampling on a TimedeltaIndex requires fixed-duration `freq`, "
+            "e.g. '24h' or '3D', not <MonthEnd>"
+        )
+        with pytest.raises(ValueError, match=msg):
+            ser.resample(freq)
+        return
+    elif freq == "ME" and isinstance(ser.index, PeriodIndex):
+        # index is PeriodIndex, so convert to corresponding Period freq
+        freq = "M"
+    result = ser.resample(freq, group_keys=False).apply(lambda x: 1)
+    expected = ser.resample(freq).apply("sum")
+
+    tm.assert_series_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        timedelta_range("1 day", "10 day", freq="D"),
+        date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"),
+        period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"),
+    ],
+)
+def test_resampler_is_iterable(index):
+    # GH 15314
+    series = Series(range(len(index)), index=index)
+    freq = "h"
+    tg = Grouper(freq=freq, convention="start")
+    grouped = series.groupby(tg)
+    resampled = series.resample(freq)
+    for (rk, rv), (gk, gv) in zip(resampled, grouped):
+        assert rk == gk
+        tm.assert_series_equal(rv, gv)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        timedelta_range("1 day", "10 day", freq="D"),
+        date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"),
+        period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D"),
+    ],
+)
+def test_resample_quantile(index):
+    # GH 15023
+    ser = Series(range(len(index)), index=index)
+    q = 0.75
+    freq = "h"
+
+    result = ser.resample(freq).quantile(q)
+    expected = ser.resample(freq).agg(lambda x: x.quantile(q)).rename(ser.name)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("how", ["first", "last"])
+def test_first_last_skipna(any_real_nullable_dtype, skipna, how):
+    # GH#57019
+    if is_extension_array_dtype(any_real_nullable_dtype):
+        na_value = Series(dtype=any_real_nullable_dtype).dtype.na_value
+    else:
+        na_value = np.nan
+    df = DataFrame(
+        {
+            "a": [2, 1, 1, 2],
+            "b": [na_value, 3.0, na_value, 4.0],
+            "c": [na_value, 3.0, na_value, 4.0],
+        },
+        index=date_range("2020-01-01", periods=4, freq="D", unit="ns"),
+        dtype=any_real_nullable_dtype,
+    )
+    rs = df.resample("ME")
+    method = getattr(rs, how)
+    result = method(skipna=skipna)
+
+    ts = pd.to_datetime("2020-01-31").as_unit("ns")
+    gb = df.groupby(df.shape[0] * [ts])
+    expected = getattr(gb, how)(skipna=skipna)
+    expected.index.freq = "ME"
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..6867b6cf9927142888a066717a15278fbfee9013
--- /dev/null
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -0,0 +1,2190 @@
+from datetime import datetime
+from functools import partial
+import zoneinfo
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+from pandas._libs.tslibs import Day
+from pandas._typing import DatetimeNaTType
+from pandas.compat import is_platform_windows
+from pandas.compat.pyarrow import pa_version_under22p0
+from pandas.errors import Pandas4Warning
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    Timedelta,
+    Timestamp,
+    isna,
+    notna,
+)
+import pandas._testing as tm
+from pandas.core.groupby.grouper import Grouper
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import (
+    Period,
+    period_range,
+)
+from pandas.core.resample import (
+    DatetimeIndex,
+    _get_timestamp_range_edges,
+)
+
+from pandas.tseries import offsets
+from pandas.tseries.frequencies import to_offset
+from pandas.tseries.offsets import Minute
+
+
+@pytest.fixture
+def simple_date_range_series():
+    """
+    Series with date range index and random data for test purposes.
+    """
+
+    def _simple_date_range_series(start, end, freq="D"):
+        rng = date_range(start, end, freq=freq)
+        return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    return _simple_date_range_series
+
+
+def test_custom_grouper(unit):
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="Min")
+    dti = index.as_unit(unit)
+    s = Series(np.array([1] * len(dti)), index=dti, dtype="int64")
+
+    b = Grouper(freq=Minute(5))
+    g = s.groupby(b)
+
+    # check all cython functions work
+    g.ohlc()  # doesn't use _cython_agg_general
+    funcs = ["sum", "mean", "prod", "min", "max", "var"]
+    for f in funcs:
+        g._cython_agg_general(f, alt=None, numeric_only=True)
+
+    b = Grouper(freq=Minute(5), closed="right", label="right")
+    g = s.groupby(b)
+    # check all cython functions work
+    g.ohlc()  # doesn't use _cython_agg_general
+    funcs = ["sum", "mean", "prod", "min", "max", "var"]
+    for f in funcs:
+        g._cython_agg_general(f, alt=None, numeric_only=True)
+
+    assert g.ngroups == 2593
+    assert notna(g.mean()).all()
+
+    # construct expected val
+    arr = [1] + [5] * 2592
+    idx = dti[0:-1:5]
+    idx = idx.append(dti[-1:])
+    idx = DatetimeIndex(idx, freq="5min").as_unit(unit)
+    expect = Series(arr, index=idx)
+
+    # GH2763 - return input dtype if we can
+    result = g.agg("sum")
+    tm.assert_series_equal(result, expect)
+
+
+def test_custom_grouper_df(unit):
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+    b = Grouper(freq=Minute(5), closed="right", label="right")
+    dti = index.as_unit(unit)
+    df = DataFrame(
+        np.random.default_rng(2).random((len(dti), 10)), index=dti, dtype="float64"
+    )
+    r = df.groupby(b).agg("sum")
+
+    assert len(r.columns) == 10
+    assert len(r.index) == 2593
+
+
+@pytest.mark.parametrize(
+    "closed, expected",
+    [
+        (
+            "right",
+            lambda s: Series(
+                [s.iloc[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
+                index=date_range("1/1/2000", periods=4, freq="5min", name="index"),
+            ),
+        ),
+        (
+            "left",
+            lambda s: Series(
+                [s[:5].mean(), s[5:10].mean(), s[10:].mean()],
+                index=date_range(
+                    "1/1/2000 00:05", periods=3, freq="5min", name="index"
+                ),
+            ),
+        ),
+    ],
+)
+def test_resample_basic(closed, expected, unit):
+    index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min")
+    s = Series(range(len(index)), index=index)
+    s.index.name = "index"
+    s.index = s.index.as_unit(unit)
+    expected = expected(s)
+    expected.index = expected.index.as_unit(unit)
+    result = s.resample("5min", closed=closed, label="right").mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_integerarray(unit):
+    # GH 25580, resample on IntegerArray
+    ts = Series(
+        range(9),
+        index=date_range("1/1/2000", periods=9, freq="min").as_unit(unit),
+        dtype="Int64",
+    )
+    result = ts.resample("3min").sum()
+    expected = Series(
+        [3, 12, 21],
+        index=date_range("1/1/2000", periods=3, freq="3min").as_unit(unit),
+        dtype="Int64",
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ts.resample("3min").mean()
+    expected = Series(
+        [1, 4, 7],
+        index=date_range("1/1/2000", periods=3, freq="3min").as_unit(unit),
+        dtype="Float64",
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_basic_grouper(unit):
+    index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min")
+    s = Series(range(len(index)), index=index)
+    s.index.name = "index"
+    s.index = s.index.as_unit(unit)
+    result = s.resample("5Min").last()
+    grouper = Grouper(freq=Minute(5), closed="left", label="left")
+    expected = s.groupby(grouper).agg(lambda x: x.iloc[-1])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "keyword,value",
+    [("label", "righttt"), ("closed", "righttt"), ("convention", "starttt")],
+)
+def test_resample_string_kwargs(keyword, value, unit):
+    # see gh-19303
+    # Check that wrong keyword argument strings raise an error
+    index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min")
+    series = Series(range(len(index)), index=index)
+    series.index.name = "index"
+    series.index = series.index.as_unit(unit)
+    msg = f"Unsupported value {value} for `{keyword}`"
+    with pytest.raises(ValueError, match=msg):
+        series.resample("5min", **({keyword: value}))
+
+
+def test_resample_how(downsample_method, unit):
+    if downsample_method == "ohlc":
+        pytest.skip("covered by test_resample_how_ohlc")
+    index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min")
+    s = Series(range(len(index)), index=index)
+    s.index.name = "index"
+    s.index = s.index.as_unit(unit)
+    grouplist = np.ones_like(s)
+    grouplist[0] = 0
+    grouplist[1:6] = 1
+    grouplist[6:11] = 2
+    grouplist[11:] = 3
+    expected = s.groupby(grouplist).agg(downsample_method)
+    expected.index = date_range(
+        "1/1/2000", periods=4, freq="5min", name="index"
+    ).as_unit(unit)
+
+    result = getattr(
+        s.resample("5min", closed="right", label="right"), downsample_method
+    )()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_how_ohlc(unit):
+    index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min")
+    s = Series(range(len(index)), index=index)
+    s.index.name = "index"
+    s.index = s.index.as_unit(unit)
+    grouplist = np.ones_like(s)
+    grouplist[0] = 0
+    grouplist[1:6] = 1
+    grouplist[6:11] = 2
+    grouplist[11:] = 3
+
+    def _ohlc(group):
+        if isna(group).all():
+            return np.repeat(np.nan, 4)
+        return [group.iloc[0], group.max(), group.min(), group.iloc[-1]]
+
+    expected = DataFrame(
+        s.groupby(grouplist).agg(_ohlc).values.tolist(),
+        index=date_range("1/1/2000", periods=4, freq="5min", name="index").as_unit(
+            unit
+        ),
+        columns=["open", "high", "low", "close"],
+    )
+
+    result = s.resample("5min", closed="right", label="right").ohlc()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_how_callables(unit):
+    # GH#7929
+    data = np.arange(5, dtype=np.int64)
+    msg = "'d' is deprecated and will be removed in a future version."
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        ind = date_range(start="2014-01-01", periods=len(data), freq="d").as_unit(unit)
+    df = DataFrame({"A": data, "B": data}, index=ind)
+
+    def fn(x, a=1):
+        return str(type(x))
+
+    class FnClass:
+        def __call__(self, x):
+            return str(type(x))
+
+    df_standard = df.resample("ME").apply(fn)
+    df_lambda = df.resample("ME").apply(lambda x: str(type(x)))
+    df_partial = df.resample("ME").apply(partial(fn))
+    df_partial2 = df.resample("ME").apply(partial(fn, a=2))
+    df_class = df.resample("ME").apply(FnClass())
+
+    tm.assert_frame_equal(df_standard, df_lambda)
+    tm.assert_frame_equal(df_standard, df_partial)
+    tm.assert_frame_equal(df_standard, df_partial2)
+    tm.assert_frame_equal(df_standard, df_class)
+
+
+def test_resample_rounding(unit):
+    # GH 8371
+    # odd results when rounding is needed
+
+    ts = [
+        "2014-11-08 00:00:01",
+        "2014-11-08 00:00:02",
+        "2014-11-08 00:00:02",
+        "2014-11-08 00:00:03",
+        "2014-11-08 00:00:07",
+        "2014-11-08 00:00:07",
+        "2014-11-08 00:00:08",
+        "2014-11-08 00:00:08",
+        "2014-11-08 00:00:08",
+        "2014-11-08 00:00:09",
+        "2014-11-08 00:00:10",
+        "2014-11-08 00:00:11",
+        "2014-11-08 00:00:11",
+        "2014-11-08 00:00:13",
+        "2014-11-08 00:00:14",
+        "2014-11-08 00:00:15",
+        "2014-11-08 00:00:17",
+        "2014-11-08 00:00:20",
+        "2014-11-08 00:00:21",
+    ]
+    df = DataFrame({"value": [1] * 19}, index=pd.to_datetime(ts))
+    df.index = df.index.as_unit(unit)
+
+    result = df.resample("6s").sum()
+    expected = DataFrame(
+        {"value": [4, 9, 4, 2]},
+        index=date_range("2014-11-08", freq="6s", periods=4).as_unit(unit),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.resample("7s").sum()
+    expected = DataFrame(
+        {"value": [4, 10, 4, 1]},
+        index=date_range("2014-11-08", freq="7s", periods=4).as_unit(unit),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.resample("11s").sum()
+    expected = DataFrame(
+        {"value": [11, 8]},
+        index=date_range("2014-11-08", freq="11s", periods=2).as_unit(unit),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.resample("13s").sum()
+    expected = DataFrame(
+        {"value": [13, 6]},
+        index=date_range("2014-11-08", freq="13s", periods=2).as_unit(unit),
+    )
+    tm.assert_frame_equal(result, expected)
+
+    result = df.resample("17s").sum()
+    expected = DataFrame(
+        {"value": [16, 3]},
+        index=date_range("2014-11-08", freq="17s", periods=2).as_unit(unit),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_basic_from_daily(unit):
+    # from daily
+    dti = date_range(
+        start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index"
+    ).as_unit(unit)
+
+    s = Series(np.random.default_rng(2).random(len(dti)), dti)
+
+    # to weekly
+    msg = "'w-sun' is deprecated and will be removed in a future version."
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        result = s.resample("w-sun").last()
+
+    assert len(result) == 3
+    assert (result.index.dayofweek == [6, 6, 6]).all()
+    assert result.iloc[0] == s["1/2/2005"]
+    assert result.iloc[1] == s["1/9/2005"]
+    assert result.iloc[2] == s.iloc[-1]
+
+    result = s.resample("W-MON").last()
+    assert len(result) == 2
+    assert (result.index.dayofweek == [0, 0]).all()
+    assert result.iloc[0] == s["1/3/2005"]
+    assert result.iloc[1] == s["1/10/2005"]
+
+    result = s.resample("W-TUE").last()
+    assert len(result) == 2
+    assert (result.index.dayofweek == [1, 1]).all()
+    assert result.iloc[0] == s["1/4/2005"]
+    assert result.iloc[1] == s["1/10/2005"]
+
+    result = s.resample("W-WED").last()
+    assert len(result) == 2
+    assert (result.index.dayofweek == [2, 2]).all()
+    assert result.iloc[0] == s["1/5/2005"]
+    assert result.iloc[1] == s["1/10/2005"]
+
+    result = s.resample("W-THU").last()
+    assert len(result) == 2
+    assert (result.index.dayofweek == [3, 3]).all()
+    assert result.iloc[0] == s["1/6/2005"]
+    assert result.iloc[1] == s["1/10/2005"]
+
+    result = s.resample("W-FRI").last()
+    assert len(result) == 2
+    assert (result.index.dayofweek == [4, 4]).all()
+    assert result.iloc[0] == s["1/7/2005"]
+    assert result.iloc[1] == s["1/10/2005"]
+
+    # to biz day
+    result = s.resample("B").last()
+    assert len(result) == 7
+    assert (result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all()
+
+    assert result.iloc[0] == s["1/2/2005"]
+    assert result.iloc[1] == s["1/3/2005"]
+    assert result.iloc[5] == s["1/9/2005"]
+    assert result.index.name == "index"
+
+
+def test_resample_upsampling_picked_but_not_correct(unit):
+    # Test for issue #3020
+    dates = date_range("01-Jan-2014", "05-Jan-2014", freq="D").as_unit(unit)
+    series = Series(1, index=dates)
+
+    result = series.resample("D").mean()
+    assert result.index[0] == dates[0]
+
+    # GH 5955
+    # incorrect deciding to upsample when the axis frequency matches the
+    # resample frequency
+
+    s = Series(
+        np.arange(1.0, 6), index=[datetime(1975, 1, i, 12, 0) for i in range(1, 6)]
+    )
+    s.index = s.index.as_unit(unit)
+    expected = Series(
+        np.arange(1.0, 6),
+        index=date_range("19750101", periods=5, freq="D").as_unit(unit),
+    )
+
+    result = s.resample("D").count()
+    tm.assert_series_equal(result, Series(1, index=expected.index))
+
+    result1 = s.resample("D").sum()
+    result2 = s.resample("D").mean()
+    tm.assert_series_equal(result1, expected)
+    tm.assert_series_equal(result2, expected)
+
+
+@pytest.mark.parametrize("f", ["sum", "mean", "prod", "min", "max", "var"])
+def test_resample_frame_basic_cy_funcs(f, unit):
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((50, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=50, freq="B"),
+    )
+    df.index = df.index.as_unit(unit)
+
+    b = Grouper(freq="ME")
+    g = df.groupby(b)
+
+    # check all cython functions work
+    g._cython_agg_general(f, alt=None, numeric_only=True)
+
+
+@pytest.mark.parametrize("freq", ["YE", "ME"])
+def test_resample_frame_basic_M_A(freq, unit):
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((50, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=50, freq="B"),
+    )
+    df.index = df.index.as_unit(unit)
+    result = df.resample(freq).mean()
+    tm.assert_series_equal(result["A"], df["A"].resample(freq).mean())
+
+
+def test_resample_upsample(unit):
+    # from daily
+    dti = date_range(
+        start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D", name="index"
+    ).as_unit(unit)
+
+    s = Series(np.random.default_rng(2).random(len(dti)), dti)
+
+    # to minutely, by padding
+    result = s.resample("Min").ffill()
+    assert len(result) == 12961
+    assert result.iloc[0] == s.iloc[0]
+    assert result.iloc[-1] == s.iloc[-1]
+
+    assert result.index.name == "index"
+
+
+def test_resample_how_method(unit):
+    # GH9915
+    s = Series(
+        [11, 22],
+        index=[
+            Timestamp("2015-03-31 21:48:52.672000"),
+            Timestamp("2015-03-31 21:49:52.739000"),
+        ],
+    )
+    s.index = s.index.as_unit(unit)
+    expected = Series(
+        [11, np.nan, np.nan, np.nan, np.nan, np.nan, 22],
+        index=DatetimeIndex(
+            [
+                Timestamp("2015-03-31 21:48:50"),
+                Timestamp("2015-03-31 21:49:00"),
+                Timestamp("2015-03-31 21:49:10"),
+                Timestamp("2015-03-31 21:49:20"),
+                Timestamp("2015-03-31 21:49:30"),
+                Timestamp("2015-03-31 21:49:40"),
+                Timestamp("2015-03-31 21:49:50"),
+            ],
+            freq="10s",
+        ),
+    )
+    expected.index = expected.index.as_unit(unit)
+    tm.assert_series_equal(s.resample("10s").mean(), expected)
+
+
+def test_resample_extra_index_point(unit):
+    # GH#9756
+    index = date_range(start="20150101", end="20150331", freq="BME").as_unit(unit)
+    expected = DataFrame({"A": Series([21, 41, 63], index=index)})
+
+    index = date_range(start="20150101", end="20150331", freq="B").as_unit(unit)
+    df = DataFrame({"A": Series(range(len(index)), index=index)}, dtype="int64")
+    result = df.resample("BME").last()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_upsample_with_limit(unit):
+    rng = date_range("1/1/2000", periods=3, freq="5min").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+
+    result = ts.resample("min").ffill(limit=2)
+    expected = ts.reindex(result.index, method="ffill", limit=2)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("freq", ["1D", "10h", "5Min", "10s"])
+@pytest.mark.parametrize("rule", ["YE", "3ME", "15D", "30h", "15Min", "30s"])
+def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule, unit):
+    # GH 33939
+    rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture).as_unit(
+        unit
+    )
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+
+    result = ts.resample(rule).nearest(limit=2)
+    expected = ts.reindex(result.index, method="nearest", limit=2)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_ohlc(unit):
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 2), freq="Min")
+    s = Series(range(len(index)), index=index)
+    s.index.name = "index"
+    s.index = s.index.as_unit(unit)
+
+    grouper = Grouper(freq=Minute(5))
+    expect = s.groupby(grouper).agg(lambda x: x.iloc[-1])
+    result = s.resample("5Min").ohlc()
+
+    assert len(result) == len(expect)
+    assert len(result.columns) == 4
+
+    xs = result.iloc[-2]
+    assert xs["open"] == s.iloc[-6]
+    assert xs["high"] == s[-6:-1].max()
+    assert xs["low"] == s[-6:-1].min()
+    assert xs["close"] == s.iloc[-2]
+
+    xs = result.iloc[0]
+    assert xs["open"] == s.iloc[0]
+    assert xs["high"] == s[:5].max()
+    assert xs["low"] == s[:5].min()
+    assert xs["close"] == s.iloc[4]
+
+
+def test_resample_ohlc_result(unit):
+    # GH 12332
+    index = date_range("1-1-2000", "2-15-2000", freq="h").as_unit(unit)
+    index = index.union(date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit))
+    s = Series(range(len(index)), index=index)
+
+    a = s.loc[:"4-15-2000"].resample("30min").ohlc()
+    assert isinstance(a, DataFrame)
+
+    b = s.loc[:"4-14-2000"].resample("30min").ohlc()
+    assert isinstance(b, DataFrame)
+
+
+def test_resample_ohlc_result_odd_period(unit):
+    # GH12348
+    # raising on odd period
+    rng = date_range("2013-12-30", "2014-01-07").as_unit(unit)
+    index = rng.drop(
+        [
+            Timestamp("2014-01-01"),
+            Timestamp("2013-12-31"),
+            Timestamp("2014-01-04"),
+            Timestamp("2014-01-05"),
+        ]
+    )
+    df = DataFrame(data=np.arange(len(index)), index=index)
+    result = df.resample("B").mean()
+    expected = df.reindex(index=date_range(rng[0], rng[-1], freq="B").as_unit(unit))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_ohlc_dataframe(unit):
+    df = (
+        DataFrame(
+            {
+                "PRICE": {
+                    Timestamp("2011-01-06 10:59:05", tz=None): 24990,
+                    Timestamp("2011-01-06 12:43:33", tz=None): 25499,
+                    Timestamp("2011-01-06 12:54:09", tz=None): 25499,
+                },
+                "VOLUME": {
+                    Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
+                    Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
+                    Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
+                },
+            }
+        )
+    ).reindex(["VOLUME", "PRICE"], axis=1)
+    df.index = df.index.as_unit(unit)
+    df.columns.name = "Cols"
+    res = df.resample("h").ohlc()
+    exp = pd.concat(
+        [df["VOLUME"].resample("h").ohlc(), df["PRICE"].resample("h").ohlc()],
+        axis=1,
+        keys=df.columns,
+    )
+    assert exp.columns.names[0] == "Cols"
+    tm.assert_frame_equal(exp, res)
+
+    df.columns = [["a", "b"], ["c", "d"]]
+    res = df.resample("h").ohlc()
+    exp.columns = pd.MultiIndex.from_tuples(
+        [
+            ("a", "c", "open"),
+            ("a", "c", "high"),
+            ("a", "c", "low"),
+            ("a", "c", "close"),
+            ("b", "d", "open"),
+            ("b", "d", "high"),
+            ("b", "d", "low"),
+            ("b", "d", "close"),
+        ]
+    )
+    tm.assert_frame_equal(exp, res)
+
+    # dupe columns fail atm
+    # df.columns = ['PRICE', 'PRICE']
+
+
+def test_resample_reresample(unit):
+    dti = date_range(
+        start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="D"
+    ).as_unit(unit)
+    s = Series(np.random.default_rng(2).random(len(dti)), dti)
+    bs = s.resample("B", closed="right", label="right").mean()
+    result = bs.resample("8h").mean()
+    assert len(result) == 25
+    assert isinstance(result.index.freq, offsets.DateOffset)
+    assert result.index.freq == offsets.Hour(8)
+
+
+@pytest.mark.parametrize(
+    "freq, expected_kwargs",
+    [
+        ["YE-DEC", {"start": "1990", "end": "2000", "freq": "Y-DEC"}],
+        ["YE-JUN", {"start": "1990", "end": "2000", "freq": "Y-JUN"}],
+        ["ME", {"start": "1990-01", "end": "2000-01", "freq": "M"}],
+    ],
+)
+def test_resample_timestamp_to_period(
+    simple_date_range_series, freq, expected_kwargs, unit
+):
+    ts = simple_date_range_series("1/1/1990", "1/1/2000")
+    ts.index = ts.index.as_unit(unit)
+
+    result = ts.resample(freq).mean().to_period()
+    expected = ts.resample(freq).mean()
+    expected.index = period_range(**expected_kwargs)
+    tm.assert_series_equal(result, expected)
+
+
+def test_ohlc_5min(unit):
+    def _ohlc(group):
+        if isna(group).all():
+            return np.repeat(np.nan, 4)
+        return [group.iloc[0], group.max(), group.min(), group.iloc[-1]]
+
+    rng = date_range("1/1/2000 00:00:00", "1/1/2000 5:59:50", freq="10s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    resampled = ts.resample("5min", closed="right", label="right").ohlc()
+
+    assert (resampled.loc["1/1/2000 00:00"] == ts.iloc[0]).all()
+
+    exp = _ohlc(ts[1:31])
+    assert (resampled.loc["1/1/2000 00:05"] == exp).all()
+
+    exp = _ohlc(ts["1/1/2000 5:55:01":])
+    assert (resampled.loc["1/1/2000 6:00:00"] == exp).all()
+
+
+def test_downsample_non_unique(unit):
+    rng = date_range("1/1/2000", "2/29/2000").as_unit(unit)
+    rng2 = rng.repeat(5).values
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng2)), index=rng2)
+
+    result = ts.resample("ME").mean()
+
+    expected = ts.groupby(lambda x: x.month).mean()
+    assert len(result) == 2
+    tm.assert_almost_equal(result.iloc[0], expected[1])
+    tm.assert_almost_equal(result.iloc[1], expected[2])
+
+
+def test_asfreq_non_unique(unit):
+    # GH #1077
+    rng = date_range("1/1/2000", "2/29/2000").as_unit(unit)
+    rng2 = rng.repeat(2).values
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng2)), index=rng2)
+
+    msg = "cannot reindex on an axis with duplicate labels"
+    with pytest.raises(ValueError, match=msg):
+        ts.asfreq("B")
+
+
+@pytest.mark.parametrize("freq", ["min", "5min", "15min", "30min", "4h", "12h"])
+def test_resample_anchored_ticks(freq, unit):
+    # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should
+    # "anchor" the origin at midnight so we get regular intervals rather
+    # than starting from the first timestamp which might start in the
+    # middle of a desired interval
+
+    rng = date_range("1/1/2000 04:00:00", periods=86400, freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    ts[:2] = np.nan  # so results are the same
+    result = ts[2:].resample(freq, closed="left", label="left").mean()
+    expected = ts.resample(freq, closed="left", label="left").mean()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("end", [1, 2])
+def test_resample_single_group(end, unit):
+    mysum = lambda x: x.sum()
+
+    rng = date_range("2000-1-1", f"2000-{end}-10", freq="D").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    tm.assert_series_equal(ts.resample("ME").sum(), ts.resample("ME").apply(mysum))
+
+
+def test_resample_single_group_std(unit):
+    # GH 3849
+    s = Series(
+        [30.1, 31.6],
+        index=[Timestamp("20070915 15:30:00"), Timestamp("20070915 15:40:00")],
+    )
+    s.index = s.index.as_unit(unit)
+    expected = Series(
+        [0.75], index=DatetimeIndex([Timestamp("20070915")], freq="D").as_unit(unit)
+    )
+    result = s.resample("D").apply(lambda x: np.std(x))
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_offset(unit):
+    # GH 31809
+
+    rng = date_range("1/1/2000 00:00:00", "1/1/2000 02:00", freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    resampled = ts.resample("5min", offset="2min").mean()
+    exp_rng = date_range("12/31/1999 23:57:00", "1/1/2000 01:57", freq="5min").as_unit(
+        unit
+    )
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"origin": "1999-12-31 23:57:00"},
+        {"origin": Timestamp("1970-01-01 00:02:00")},
+        {"origin": "epoch", "offset": "2m"},
+        # origin of '1999-31-12 12:02:00' should be equivalent for this case
+        {"origin": "1999-12-31 12:02:00"},
+        {"offset": "-3m"},
+    ],
+)
+def test_resample_origin(kwargs, unit):
+    # GH 31809
+    rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    exp_rng = date_range(
+        "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min"
+    ).as_unit(unit)
+
+    resampled = ts.resample("5min", **kwargs).mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+
+@pytest.mark.parametrize(
+    "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()]
+)
+def test_resample_bad_origin(origin, unit):
+    rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    msg = (
+        "'origin' should be equal to 'epoch', 'start', 'start_day', "
+        "'end', 'end_day' or should be a Timestamp convertible type. Got "
+        f"'{origin}' instead."
+    )
+    with pytest.raises(ValueError, match=msg):
+        ts.resample("5min", origin=origin)
+
+
+@pytest.mark.parametrize("offset", ["invalid_value", "12dayys", "2000-30-30", object()])
+def test_resample_bad_offset(offset, unit):
+    rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    msg = f"'offset' should be a Timedelta convertible type. Got '{offset}' instead."
+    with pytest.raises(ValueError, match=msg):
+        ts.resample("5min", offset=offset)
+
+
+def test_resample_origin_prime_freq(unit):
+    # GH 31809
+    start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00"
+    rng = date_range(start, end, freq="7min").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    exp_rng = date_range(
+        "2000-10-01 23:14:00", "2000-10-02 00:22:00", freq="17min"
+    ).as_unit(unit)
+    resampled = ts.resample("17min").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+    resampled = ts.resample("17min", origin="start_day").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    exp_rng = date_range(
+        "2000-10-01 23:30:00", "2000-10-02 00:21:00", freq="17min"
+    ).as_unit(unit)
+    resampled = ts.resample("17min", origin="start").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+    resampled = ts.resample("17min", offset="23h30min").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+    resampled = ts.resample("17min", origin="start_day", offset="23h30min").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    exp_rng = date_range(
+        "2000-10-01 23:18:00", "2000-10-02 00:26:00", freq="17min"
+    ).as_unit(unit)
+    resampled = ts.resample("17min", origin="epoch").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    exp_rng = date_range(
+        "2000-10-01 23:24:00", "2000-10-02 00:15:00", freq="17min"
+    ).as_unit(unit)
+    resampled = ts.resample("17min", origin="2000-01-01").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+
+def test_resample_origin_with_tz(unit):
+    # GH 31809
+    msg = "The origin must have the same timezone as the index."
+
+    tz = "Europe/Paris"
+    rng = date_range(
+        "2000-01-01 00:00:00", "2000-01-01 02:00", freq="s", tz=tz
+    ).as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    exp_rng = date_range(
+        "1999-12-31 23:57:00", "2000-01-01 01:57", freq="5min", tz=tz
+    ).as_unit(unit)
+    resampled = ts.resample("5min", origin="1999-12-31 23:57:00+00:00").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    # origin of '1999-31-12 12:02:00+03:00' should be equivalent for this case
+    resampled = ts.resample("5min", origin="1999-12-31 12:02:00+03:00").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    resampled = ts.resample("5min", origin="epoch", offset="2m").mean()
+    tm.assert_index_equal(resampled.index, exp_rng)
+
+    with pytest.raises(ValueError, match=msg):
+        ts.resample("5min", origin="12/31/1999 23:57:00").mean()
+
+    # if the series is not tz aware, origin should not be tz aware
+    rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    with pytest.raises(ValueError, match=msg):
+        ts.resample("5min", origin="12/31/1999 23:57:00+03:00").mean()
+
+
+def test_resample_origin_epoch_with_tz_day_vs_24h(unit):
+    # GH 34474
+    start, end = "2000-10-01 23:30:00+0500", "2000-12-02 00:30:00+0500"
+    rng = date_range(start, end, freq="7min").as_unit(unit)
+    random_values = np.random.default_rng(2).standard_normal(len(rng))
+    ts_1 = Series(random_values, index=rng)
+
+    result_1 = ts_1.resample("D").mean()
+    result_2 = ts_1.resample("24h", origin="epoch").mean()
+    tm.assert_series_equal(result_1, result_2, check_freq=False)
+    # GH#41943 check_freq=False bc Day and Hour(24) no longer compare as equal
+
+    # check that we have the same behavior with epoch even if we are not timezone aware
+    ts_no_tz = ts_1.tz_localize(None)
+    result_3 = ts_no_tz.resample("D").mean()
+    result_4 = ts_no_tz.resample("24h", origin="epoch").mean()
+    tm.assert_series_equal(result_1, result_3.tz_localize(rng.tz), check_freq=False)
+    tm.assert_series_equal(result_1, result_4.tz_localize(rng.tz), check_freq=False)
+
+    # check that we have the similar results with two different timezones (+2H and +5H)
+    start, end = "2000-10-01 23:30:00+0200", "2000-12-02 00:30:00+0200"
+    rng = date_range(start, end, freq="7min").as_unit(unit)
+    ts_2 = Series(random_values, index=rng)
+    result_5 = ts_2.resample("D").mean()
+    result_6 = ts_2.resample("24h", origin="epoch").mean()
+    tm.assert_series_equal(result_1.tz_localize(None), result_5.tz_localize(None))
+    tm.assert_series_equal(result_1.tz_localize(None), result_6.tz_localize(None))
+
+
+def test_resample_origin_with_day_freq_on_dst(unit):
+    # GH 31809
+    tz = "America/Chicago"
+    msg = "The '(origin|offset)' keyword does not take effect"
+
+    def _create_series(values, timestamps, freq="D"):
+        return Series(
+            values,
+            index=DatetimeIndex(
+                [Timestamp(t, tz=tz) for t in timestamps], freq=freq, ambiguous=True
+            ).as_unit(unit),
+        )
+
+    # test classical behavior of origin in a DST context
+    start = Timestamp("2013-11-02", tz=tz)
+    end = Timestamp("2013-11-03 23:59", tz=tz)
+    rng = date_range(start, end, freq="1h").as_unit(unit)
+    ts = Series(np.ones(len(rng)), index=rng)
+
+    expected = _create_series([24.0, 25.0], ["2013-11-02", "2013-11-03"])
+    for origin in ["epoch", "start", "start_day", start, None]:
+        warn = RuntimeWarning if origin != "start_day" else None
+        with tm.assert_produces_warning(warn, match=msg):
+            result = ts.resample("D", origin=origin).sum()
+        tm.assert_series_equal(result, expected)
+
+    # test complex behavior of origin/offset in a DST context
+    start = Timestamp("2013-11-03", tz=tz)
+    end = Timestamp("2013-11-03 23:59", tz=tz)
+    rng = date_range(start, end, freq="1h").as_unit(unit)
+    ts = Series(np.ones(len(rng)), index=rng)
+
+    # GH#61985 changed this to behave like "B" rather than "24h"
+    expected_ts = ["2013-11-03 00:00-05:00"]
+    expected = _create_series([25.0], expected_ts)
+    with tm.assert_produces_warning(RuntimeWarning, match=msg):
+        result = ts.resample("D", origin="start", offset="-2h").sum()
+    tm.assert_series_equal(result, expected)
+
+    expected_ts = ["2013-11-02 22:00-05:00", "2013-11-03 21:00-06:00"]
+    expected = _create_series([22.0, 3.0], expected_ts, freq="24h")
+    result = ts.resample("24h", origin="start", offset="-2h").sum()
+    tm.assert_series_equal(result, expected)
+
+    # GH#61985 changed this to behave like "B" rather than "24h"
+    expected_ts = ["2013-11-03 00:00-05:00"]
+    expected = _create_series([25.0], expected_ts)
+    with tm.assert_produces_warning(RuntimeWarning, match=msg):
+        result = ts.resample("D", origin="start", offset="2h").sum()
+    tm.assert_series_equal(result, expected)
+
+    expected_ts = ["2013-11-03 00:00-05:00"]
+    expected = _create_series([25.0], expected_ts)
+    with tm.assert_produces_warning(RuntimeWarning, match=msg):
+        result = ts.resample("D", origin="start", offset="-1h").sum()
+    tm.assert_series_equal(result, expected)
+
+    expected_ts = ["2013-11-03 00:00-05:00"]
+    expected = _create_series([25.0], expected_ts)
+    with tm.assert_produces_warning(RuntimeWarning, match=msg):
+        result = ts.resample("D", origin="start", offset="1h").sum()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_dst_midnight_last_nonexistent():
+    # GH 58380
+    ts = Series(
+        1,
+        date_range("2024-04-19", "2024-04-20", tz="Africa/Cairo", freq="15min"),
+    )
+
+    expected = Series([len(ts)], index=DatetimeIndex([ts.index[0]], freq="7D"))
+
+    result = ts.resample("7D").sum()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_daily_anchored(unit):
+    rng = date_range("1/1/2000 0:00:00", periods=10000, freq="min").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    ts[:2] = np.nan  # so results are the same
+
+    result = ts[2:].resample("D", closed="left", label="left").mean()
+    expected = ts.resample("D", closed="left", label="left").mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_to_period_monthly_buglet(unit):
+    # GH #1259
+
+    rng = date_range("1/1/2000", "12/31/2000").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    result = ts.resample("ME").mean().to_period()
+    exp_index = period_range("Jan-2000", "Dec-2000", freq="M")
+    tm.assert_index_equal(result.index, exp_index)
+
+
+def test_period_with_agg():
+    # aggregate a period resampler with a lambda
+    s2 = Series(
+        np.random.default_rng(2).integers(0, 5, 50),
+        index=period_range("2012-01-01", freq="h", periods=50),
+        dtype="float64",
+    )
+
+    expected = s2.to_timestamp().resample("D").mean().to_period()
+    result = s2.resample("D").agg(lambda x: x.mean())
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_segfault(unit):
+    # GH 8573
+    # segfaulting in older versions
+    all_wins_and_wagers = [
+        (1, datetime(2013, 10, 1, 16, 20), 1, 0),
+        (2, datetime(2013, 10, 1, 16, 10), 1, 0),
+        (2, datetime(2013, 10, 1, 18, 15), 1, 0),
+        (2, datetime(2013, 10, 1, 16, 10, 31), 1, 0),
+    ]
+
+    df = DataFrame.from_records(
+        all_wins_and_wagers, columns=("ID", "timestamp", "A", "B")
+    ).set_index("timestamp")
+    df.index = df.index.as_unit(unit)
+    result = df.groupby("ID").resample("5min").sum()
+    expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_dtype_preservation(unit):
+    # GH 12202
+    # validation tests for dtype preservation
+
+    df = DataFrame(
+        {
+            "date": date_range(start="2016-01-01", periods=4, freq="W").as_unit(unit),
+            "group": [1, 1, 2, 2],
+            "val": Series([5, 6, 7, 8], dtype="int32"),
+        }
+    ).set_index("date")
+
+    result = df.resample("1D").ffill()
+    assert result.val.dtype == np.int32
+
+    result = df.groupby("group").resample("1D").ffill()
+    assert result.val.dtype == np.int32
+
+
+def test_resample_dtype_coercion(unit):
+    pytest.importorskip("scipy.interpolate")
+
+    # GH 16361
+    df = {"a": [1, 3, 1, 4]}
+    df = DataFrame(df, index=date_range("2017-01-01", "2017-01-04").as_unit(unit))
+
+    expected = df.astype("float64").resample("h").mean()["a"].interpolate("cubic")
+
+    result = df.resample("h")["a"].mean().interpolate("cubic")
+    tm.assert_series_equal(result, expected)
+
+    result = df.resample("h").mean()["a"].interpolate("cubic")
+    tm.assert_series_equal(result, expected)
+
+
+def test_weekly_resample_buglet(unit):
+    # #1327
+    rng = date_range("1/1/2000", freq="B", periods=20).as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    resampled = ts.resample("W").mean()
+    expected = ts.resample("W-SUN").mean()
+    tm.assert_series_equal(resampled, expected)
+
+
+def test_monthly_resample_error(unit):
+    # #1451
+    dates = date_range("4/16/2012 20:00", periods=5000, freq="h").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates)
+    # it works!
+    ts.resample("ME")
+
+
+def test_nanosecond_resample_error():
+    # GH 12307 - Values falls after last bin when
+    # Resampling using pd.tseries.offsets.Nano as period
+    start = 1443707890427
+    exp_start = 1443707890400
+    indx = date_range(start=pd.to_datetime(start), periods=10, freq="100ns")
+    ts = Series(range(len(indx)), index=indx)
+    r = ts.resample(pd.tseries.offsets.Nano(100))
+    result = r.agg("mean")
+
+    exp_indx = date_range(start=pd.to_datetime(exp_start), periods=10, freq="100ns")
+    exp = Series(range(len(exp_indx)), index=exp_indx, dtype=float)
+
+    tm.assert_series_equal(result, exp)
+
+
+def test_resample_anchored_intraday(unit):
+    # #1471, #1458
+
+    rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit)
+    df = DataFrame(rng.month, index=rng)
+
+    result = df.resample("ME").mean()
+    expected = df.resample("ME").mean().to_period()
+    expected = expected.to_timestamp(how="end")
+    expected.index += Timedelta(1, unit="us") - Timedelta(1, unit="D")
+    expected.index = expected.index.as_unit(unit)._with_freq("infer")
+    assert expected.index.freq == "ME"
+    tm.assert_frame_equal(result, expected)
+
+    result = df.resample("ME", closed="left").mean()
+    exp = df.shift(1, freq="D").resample("ME").mean().to_period()
+    exp = exp.to_timestamp(how="end")
+
+    exp.index = exp.index + Timedelta(1, unit="us") - Timedelta(1, unit="D")
+    exp.index = exp.index.as_unit(unit)._with_freq("infer")
+    assert exp.index.freq == "ME"
+    tm.assert_frame_equal(result, exp)
+
+
+def test_resample_anchored_intraday2(unit):
+    rng = date_range("1/1/2012", "4/1/2012", freq="100min").as_unit(unit)
+    df = DataFrame(rng.month, index=rng)
+
+    result = df.resample("QE").mean()
+    expected = df.resample("QE").mean().to_period()
+    expected = expected.to_timestamp(how="end")
+    expected.index += Timedelta(1, unit="us") - Timedelta(1, unit="D")
+    expected.index._data.freq = "QE"
+    expected.index._freq = lib.no_default
+    expected.index = expected.index.as_unit(unit)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.resample("QE", closed="left").mean()
+    expected = df.shift(1, freq="D").resample("QE").mean()
+    expected = expected.to_period()
+    expected = expected.to_timestamp(how="end")
+    expected.index += Timedelta(1, unit="us") - Timedelta(1, unit="D")
+    expected.index._data.freq = "QE"
+    expected.index._freq = lib.no_default
+    expected.index = expected.index.as_unit(unit)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_anchored_intraday3(simple_date_range_series, unit):
+    ts = simple_date_range_series("2012-04-29 23:00", "2012-04-30 5:00", freq="h")
+    ts.index = ts.index.as_unit(unit)
+    resampled = ts.resample("ME").mean()
+    assert len(resampled) == 1
+
+
+@pytest.mark.parametrize("freq", ["MS", "BMS", "QS-MAR", "YS-DEC", "YS-JUN"])
+def test_resample_anchored_monthstart(simple_date_range_series, freq, unit):
+    ts = simple_date_range_series("1/1/2000", "12/31/2002")
+    ts.index = ts.index.as_unit(unit)
+    ts.resample(freq).mean()
+
+
+@pytest.mark.parametrize("label, sec", [[None, 2.0], ["right", "4.2"]])
+def test_resample_anchored_multiday(label, sec):
+    # When resampling a range spanning multiple days, ensure that the
+    # start date gets used to determine the offset.  Fixes issue where
+    # a one day period is not a multiple of the frequency.
+    #
+    # See: https://github.com/pandas-dev/pandas/issues/8683
+
+    index1 = date_range("2014-10-14 23:06:23.206", periods=3, freq="400ms")
+    index2 = date_range("2014-10-15 23:00:00", periods=2, freq="2200ms")
+    index = index1.union(index2)
+
+    s = Series(np.random.default_rng(2).standard_normal(5), index=index)
+
+    # Ensure left closing works
+    result = s.resample("2200ms", label=label).mean()
+    assert result.index[-1] == Timestamp(f"2014-10-15 23:00:{sec}00")
+
+
+def test_corner_cases(unit):
+    # miscellaneous test coverage
+
+    rng = date_range("1/1/2000", periods=12, freq="min").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    result = ts.resample("5min", closed="right", label="left").mean()
+    ex_index = date_range("1999-12-31 23:55", periods=4, freq="5min").as_unit(unit)
+    tm.assert_index_equal(result.index, ex_index)
+
+
+def test_corner_cases_date(simple_date_range_series, unit):
+    # resample to periods
+    ts = simple_date_range_series("2000-04-28", "2000-04-30 11:00", freq="h")
+    ts.index = ts.index.as_unit(unit)
+    result = ts.resample("ME").mean().to_period()
+    assert len(result) == 1
+    assert result.index[0] == Period("2000-04", freq="M")
+
+
+def test_anchored_lowercase_buglet(unit):
+    dates = date_range("4/16/2012 20:00", periods=50000, freq="s").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(dates)), index=dates)
+    # it works!
+    msg = "'d' is deprecated and will be removed in a future version."
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        ts.resample("d").mean()
+
+
+def test_upsample_apply_functions(unit):
+    # #1596
+    rng = date_range("2012-06-12", periods=4, freq="h").as_unit(unit)
+
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    result = ts.resample("20min").aggregate(["mean", "sum"])
+    assert isinstance(result, DataFrame)
+
+
+def test_resample_not_monotonic(unit):
+    rng = date_range("2012-06-12", periods=200, freq="h").as_unit(unit)
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    ts = ts.take(np.random.default_rng(2).permutation(len(ts)))
+
+    result = ts.resample("D").sum()
+    exp = ts.sort_index().resample("D").sum()
+    tm.assert_series_equal(result, exp)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "int64",
+        "int32",
+        "float64",
+        "float32",
+    ],
+)
+def test_resample_median_bug_1688(dtype, unit):
+    # GH#55958
+    dti = DatetimeIndex(
+        [datetime(2012, 1, 1, 0, 0, 0), datetime(2012, 1, 1, 0, 5, 0)]
+    ).as_unit(unit)
+    df = DataFrame(
+        [1, 2],
+        index=dti,
+        dtype=dtype,
+    )
+
+    result = df.resample("min").apply(lambda x: x.mean())
+    exp = df.asfreq("min")
+    tm.assert_frame_equal(result, exp)
+
+    result = df.resample("min").median()
+    exp = df.asfreq("min")
+    tm.assert_frame_equal(result, exp)
+
+
+def test_how_lambda_functions(simple_date_range_series, unit):
+    ts = simple_date_range_series("1/1/2000", "4/1/2000")
+    ts.index = ts.index.as_unit(unit)
+
+    result = ts.resample("ME").apply(lambda x: x.mean())
+    exp = ts.resample("ME").mean()
+    tm.assert_series_equal(result, exp)
+
+    foo_exp = ts.resample("ME").mean()
+    foo_exp.name = "foo"
+    bar_exp = ts.resample("ME").std()
+    bar_exp.name = "bar"
+
+    result = ts.resample("ME").apply([lambda x: x.mean(), lambda x: x.std(ddof=1)])
+    result.columns = ["foo", "bar"]
+    tm.assert_series_equal(result["foo"], foo_exp)
+    tm.assert_series_equal(result["bar"], bar_exp)
+
+    # this is a MI Series, so comparing the names of the results
+    # doesn't make sense
+    result = ts.resample("ME").aggregate(
+        {"foo": lambda x: x.mean(), "bar": lambda x: x.std(ddof=1)}
+    )
+    tm.assert_series_equal(result["foo"], foo_exp, check_names=False)
+    tm.assert_series_equal(result["bar"], bar_exp, check_names=False)
+
+
+def test_resample_unequal_times(unit):
+    # #1772
+    start = datetime(1999, 3, 1, 5)
+    # end hour is less than start
+    end = datetime(2012, 7, 31, 4)
+    bad_ind = date_range(start, end, freq="30min").as_unit(unit)
+    df = DataFrame({"close": 1}, index=bad_ind)
+
+    # it works!
+    df.resample("YS").sum()
+
+
+def test_resample_consistency(unit):
+    # GH 6418
+    # resample with bfill / limit / reindex consistency
+
+    i30 = date_range("2002-02-02", periods=4, freq="30min").as_unit(unit)
+    s = Series(np.arange(4.0), index=i30)
+    s.iloc[2] = np.nan
+
+    # Upsample by factor 3 with reindex() and resample() methods:
+    i10 = date_range(i30[0], i30[-1], freq="10min").as_unit(unit)
+
+    s10 = s.reindex(index=i10, method="bfill")
+    s10_2 = s.reindex(index=i10, method="bfill", limit=2)
+    with tm.assert_produces_warning(Pandas4Warning):
+        rl = s.reindex_like(s10, method="bfill", limit=2)
+    r10_2 = s.resample("10Min").bfill(limit=2)
+    r10 = s.resample("10Min").bfill()
+
+    # s10_2, r10, r10_2, rl should all be equal
+    tm.assert_series_equal(s10_2, r10)
+    tm.assert_series_equal(s10_2, r10_2)
+    tm.assert_series_equal(s10_2, rl)
+
+
+dates1: list[DatetimeNaTType] = [
+    datetime(2014, 10, 1),
+    datetime(2014, 9, 3),
+    datetime(2014, 11, 5),
+    datetime(2014, 9, 5),
+    datetime(2014, 10, 8),
+    datetime(2014, 7, 15),
+]
+
+dates2: list[DatetimeNaTType] = [*dates1[:2], pd.NaT, *dates1[2:4], pd.NaT, *dates1[4:]]
+dates3 = [pd.NaT, *dates1, pd.NaT]
+
+
+@pytest.mark.parametrize("dates", [dates1, dates2, dates3])
+def test_resample_timegrouper(dates, unit):
+    # GH 7227
+    dates = DatetimeIndex(dates).as_unit(unit)
+    df = DataFrame({"A": dates, "B": np.arange(len(dates))})
+    result = df.set_index("A").resample("ME").count()
+    exp_idx = DatetimeIndex(
+        ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"],
+        freq="ME",
+        name="A",
+    ).as_unit(unit)
+    expected = DataFrame({"B": [1, 0, 2, 2, 1]}, index=exp_idx)
+    if df["A"].isna().any():
+        expected.index = expected.index._with_freq(None)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby(Grouper(freq="ME", key="A")).count()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dates", [dates1, dates2, dates3])
+def test_resample_timegrouper2(dates, unit):
+    dates = DatetimeIndex(dates).as_unit(unit)
+
+    df = DataFrame({"A": dates, "B": np.arange(len(dates)), "C": np.arange(len(dates))})
+    result = df.set_index("A").resample("ME").count()
+
+    exp_idx = DatetimeIndex(
+        ["2014-07-31", "2014-08-31", "2014-09-30", "2014-10-31", "2014-11-30"],
+        freq="ME",
+        name="A",
+    ).as_unit(unit)
+    expected = DataFrame(
+        {"B": [1, 0, 2, 2, 1], "C": [1, 0, 2, 2, 1]},
+        index=exp_idx,
+        columns=["B", "C"],
+    )
+    if df["A"].isna().any():
+        expected.index = expected.index._with_freq(None)
+    tm.assert_frame_equal(result, expected)
+
+    result = df.groupby(Grouper(freq="ME", key="A")).count()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_nunique(unit):
+    # GH 12352
+    df = DataFrame(
+        {
+            "ID": {
+                Timestamp("2015-06-05 00:00:00"): "0010100903",
+                Timestamp("2015-06-08 00:00:00"): "0010150847",
+            },
+            "DATE": {
+                Timestamp("2015-06-05 00:00:00"): "2015-06-05",
+                Timestamp("2015-06-08 00:00:00"): "2015-06-08",
+            },
+        }
+    )
+    df.index = df.index.as_unit(unit)
+    r = df.resample("D")
+    g = df.groupby(Grouper(freq="D"))
+    expected = df.groupby(Grouper(freq="D")).ID.apply(lambda x: x.nunique())
+    assert expected.name == "ID"
+
+    for t in [r, g]:
+        result = t.ID.nunique()
+        tm.assert_series_equal(result, expected)
+
+    result = df.ID.resample("D").nunique()
+    tm.assert_series_equal(result, expected)
+
+    result = df.ID.groupby(Grouper(freq="D")).nunique()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_nunique_preserves_column_level_names(unit):
+    # see gh-23222
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((5, 4)),
+        columns=Index(list("ABCD"), dtype=object),
+        index=date_range("2000-01-01", periods=5, freq="D"),
+    ).abs()
+    df.index = df.index.as_unit(unit)
+    df.columns = pd.MultiIndex.from_arrays(
+        [df.columns.tolist()] * 2, names=["lev0", "lev1"]
+    )
+    result = df.resample("1h").nunique()
+    tm.assert_index_equal(df.columns, result.columns)
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda x: x.nunique(),
+        lambda x: x.agg(Series.nunique),
+        lambda x: x.agg("nunique"),
+    ],
+    ids=["nunique", "series_nunique", "nunique_str"],
+)
+def test_resample_nunique_with_date_gap(func, unit):
+    # GH 13453
+    # Since all elements are unique, these should all be the same
+    index = date_range("1-1-2000", "2-15-2000", freq="h").as_unit(unit)
+    index2 = date_range("4-15-2000", "5-15-2000", freq="h").as_unit(unit)
+    index3 = index.append(index2)
+    s = Series(range(len(index3)), index=index3, dtype="int64")
+    r = s.resample("ME")
+    result = r.count()
+    expected = func(r)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_group_info(unit):
+    # GH10914
+
+    # use a fixed seed to always have the same uniques
+    n = 100
+    k = 10
+    prng = np.random.default_rng(2)
+
+    dr = date_range(start="2015-08-27", periods=n // 10, freq="min").as_unit(unit)
+    ts = Series(prng.integers(0, n // k, n).astype("int64"), index=prng.choice(dr, n))
+
+    left = ts.resample("30min").nunique()
+    ix = date_range(start=ts.index.min(), end=ts.index.max(), freq="30min").as_unit(
+        unit
+    )
+
+    vals = ts.values
+    bins = np.searchsorted(ix.values, ts.index, side="right")
+
+    sorter = np.lexsort((vals, bins))
+    vals, bins = vals[sorter], bins[sorter]
+
+    mask = np.r_[True, vals[1:] != vals[:-1]]
+    mask |= np.r_[True, bins[1:] != bins[:-1]]
+
+    arr = np.bincount(bins[mask] - 1, minlength=len(ix)).astype("int64", copy=False)
+    right = Series(arr, index=ix)
+
+    tm.assert_series_equal(left, right)
+
+
+def test_resample_size(unit):
+    n = 10000
+    dr = date_range("2015-09-19", periods=n, freq="min").as_unit(unit)
+    ts = Series(
+        np.random.default_rng(2).standard_normal(n),
+        index=np.random.default_rng(2).choice(dr, n),
+    )
+
+    left = ts.resample("7min").size()
+    ix = date_range(start=left.index.min(), end=ts.index.max(), freq="7min").as_unit(
+        unit
+    )
+
+    bins = np.searchsorted(ix.values, ts.index.values, side="right")
+    val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype("int64", copy=False)
+
+    right = Series(val, index=ix)
+    tm.assert_series_equal(left, right)
+
+
+def test_resample_across_dst():
+    # The test resamples a DatetimeIndex with values before and after a
+    # DST change
+    # Issue: 14682
+
+    # The DatetimeIndex we will start with
+    # (note that DST happens at 03:00+02:00 -> 02:00+01:00)
+    # 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00
+    df1 = DataFrame([1477786980, 1477790580], columns=["ts"])
+    dti1 = DatetimeIndex(
+        pd.to_datetime(df1.ts, unit="s")
+        .dt.tz_localize("UTC")
+        .dt.tz_convert("Europe/Madrid")
+    )
+
+    # The expected DatetimeIndex after resampling.
+    # 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00
+    df2 = DataFrame([1477785600, 1477789200], columns=["ts"])
+    dti2 = DatetimeIndex(
+        pd.to_datetime(df2.ts, unit="s")
+        .dt.tz_localize("UTC")
+        .dt.tz_convert("Europe/Madrid"),
+        freq="h",
+    )
+    df = DataFrame([5, 5], index=dti1)
+
+    result = df.resample(rule="h").sum()
+    expected = DataFrame([5, 5], index=dti2)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_with_dst_time_change(unit):
+    # GH 24972
+    index = (
+        DatetimeIndex([1478064900001000000, 1480037118776792000], tz="UTC")
+        .tz_convert("America/Chicago")
+        .as_unit(unit)
+    )
+
+    df = DataFrame([1, 2], index=index)
+    result = df.groupby(Grouper(freq="1D")).last()
+    expected_index_values = date_range(
+        "2016-11-02", "2016-11-24", freq="D", tz="America/Chicago"
+    ).as_unit(unit)
+
+    index = DatetimeIndex(expected_index_values)
+    expected = DataFrame([1.0] + ([np.nan] * 21) + [2.0], index=index)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_dst_anchor(unit):
+    # 5172
+    dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz="US/Eastern").as_unit(unit)
+    df = DataFrame([5], index=dti)
+
+    dti = DatetimeIndex(df.index.normalize(), freq="D").as_unit(unit)
+    expected = DataFrame([5], index=dti)
+    tm.assert_frame_equal(df.resample(rule="D").sum(), expected)
+    df.resample(rule="MS").sum()
+    tm.assert_frame_equal(
+        df.resample(rule="MS").sum(),
+        DataFrame(
+            [5],
+            index=DatetimeIndex(
+                [datetime(2012, 11, 1)], tz="US/Eastern", freq="MS"
+            ).as_unit(unit),
+        ),
+    )
+
+
+def test_resample_dst_anchor2(unit):
+    dti = date_range(
+        "2013-09-30", "2013-11-02", freq="30Min", tz="Europe/Paris"
+    ).as_unit(unit)
+    values = range(dti.size)
+    df = DataFrame({"a": values, "b": values, "c": values}, index=dti, dtype="int64")
+    how = {"a": "min", "b": "max", "c": "count"}
+
+    rs = df.resample("W-MON")
+    result = rs.agg(how)[["a", "b", "c"]]
+    expected = DataFrame(
+        {
+            "a": [0, 48, 384, 720, 1056, 1394],
+            "b": [47, 383, 719, 1055, 1393, 1586],
+            "c": [48, 336, 336, 336, 338, 193],
+        },
+        index=date_range(
+            "9/30/2013", "11/4/2013", freq="W-MON", tz="Europe/Paris"
+        ).as_unit(unit),
+    )
+    tm.assert_frame_equal(
+        result,
+        expected,
+        "W-MON Frequency",
+    )
+
+    rs2 = df.resample("2W-MON")
+    result2 = rs2.agg(how)[["a", "b", "c"]]
+    expected2 = DataFrame(
+        {
+            "a": [0, 48, 720, 1394],
+            "b": [47, 719, 1393, 1586],
+            "c": [48, 672, 674, 193],
+        },
+        index=date_range(
+            "9/30/2013", "11/11/2013", freq="2W-MON", tz="Europe/Paris"
+        ).as_unit(unit),
+    )
+    tm.assert_frame_equal(
+        result2,
+        expected2,
+        "2W-MON Frequency",
+    )
+
+    rs3 = df.resample("MS")
+    result3 = rs3.agg(how)[["a", "b", "c"]]
+    expected3 = DataFrame(
+        {"a": [0, 48, 1538], "b": [47, 1537, 1586], "c": [48, 1490, 49]},
+        index=date_range("9/1/2013", "11/1/2013", freq="MS", tz="Europe/Paris").as_unit(
+            unit
+        ),
+    )
+    tm.assert_frame_equal(
+        result3,
+        expected3,
+        "MS Frequency",
+    )
+
+    rs4 = df.resample("2MS")
+    result4 = rs4.agg(how)[["a", "b", "c"]]
+    expected4 = DataFrame(
+        {"a": [0, 1538], "b": [1537, 1586], "c": [1538, 49]},
+        index=date_range(
+            "9/1/2013", "11/1/2013", freq="2MS", tz="Europe/Paris"
+        ).as_unit(unit),
+    )
+    tm.assert_frame_equal(
+        result4,
+        expected4,
+        "2MS Frequency",
+    )
+
+    df_daily = df["10/26/2013":"10/29/2013"]
+    rs_d = df_daily.resample("D")
+    result_d = rs_d.agg({"a": "min", "b": "max", "c": "count"})[["a", "b", "c"]]
+    expected_d = DataFrame(
+        {
+            "a": [1248, 1296, 1346, 1394],
+            "b": [1295, 1345, 1393, 1441],
+            "c": [48, 50, 48, 48],
+        },
+        index=date_range(
+            "10/26/2013", "10/29/2013", freq="D", tz="Europe/Paris"
+        ).as_unit(unit),
+    )
+    tm.assert_frame_equal(
+        result_d,
+        expected_d,
+        "D Frequency",
+    )
+
+
+def test_downsample_across_dst(unit):
+    # GH 8531
+    tz = zoneinfo.ZoneInfo("Europe/Berlin")
+    dt = datetime(2014, 10, 26)
+    dates = date_range(dt.astimezone(tz), periods=4, freq="2h").as_unit(unit)
+    result = Series(5, index=dates).resample("h").mean()
+    expected = Series(
+        [5.0, np.nan] * 3 + [5.0],
+        index=date_range(dt.astimezone(tz), periods=7, freq="h").as_unit(unit),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_downsample_across_dst_weekly(unit):
+    # GH 9119, GH 21459
+    df = DataFrame(
+        index=DatetimeIndex(
+            ["2017-03-25", "2017-03-26", "2017-03-27", "2017-03-28", "2017-03-29"],
+            tz="Europe/Amsterdam",
+        ).as_unit(unit),
+        data=[11, 12, 13, 14, 15],
+    )
+    result = df.resample("1W").sum()
+    expected = DataFrame(
+        [23, 42],
+        index=DatetimeIndex(
+            ["2017-03-26", "2017-04-02"], tz="Europe/Amsterdam", freq="W"
+        ).as_unit(unit),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_downsample_across_dst_weekly_2(unit):
+    # GH 9119, GH 21459
+    idx = date_range("2013-04-01", "2013-05-01", tz="Europe/London", freq="h").as_unit(
+        unit
+    )
+    s = Series(index=idx, dtype=np.float64)
+    result = s.resample("W").mean()
+    expected = Series(
+        index=date_range("2013-04-07", freq="W", periods=5, tz="Europe/London").as_unit(
+            unit
+        ),
+        dtype=np.float64,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_downsample_dst_at_midnight(unit):
+    # GH 25758
+    start = datetime(2018, 11, 3, 12)
+    end = datetime(2018, 11, 5, 12)
+    index = date_range(start, end, freq="1h").as_unit(unit)
+    index = index.tz_localize("UTC").tz_convert("America/Havana")
+    data = list(range(len(index)))
+    dataframe = DataFrame(data, index=index)
+    result = dataframe.groupby(Grouper(freq="1D")).mean()
+
+    dti = date_range("2018-11-03", periods=3).tz_localize(
+        "America/Havana", ambiguous=True
+    )
+    dti = DatetimeIndex(dti, freq="D").as_unit(unit)
+    expected = DataFrame([7.5, 28.0, 44.5], index=dti)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_with_nat(unit):
+    # GH 13020
+    index = DatetimeIndex(
+        [
+            pd.NaT,
+            "1970-01-01 00:00:00",
+            pd.NaT,
+            "1970-01-01 00:00:01",
+            "1970-01-01 00:00:02",
+        ]
+    ).as_unit(unit)
+    frame = DataFrame([2, 3, 5, 7, 11], index=index)
+
+    index_1s = DatetimeIndex(
+        ["1970-01-01 00:00:00", "1970-01-01 00:00:01", "1970-01-01 00:00:02"]
+    ).as_unit(unit)
+    frame_1s = DataFrame([3.0, 7.0, 11.0], index=index_1s)
+    tm.assert_frame_equal(frame.resample("1s").mean(), frame_1s)
+
+    index_2s = DatetimeIndex(["1970-01-01 00:00:00", "1970-01-01 00:00:02"]).as_unit(
+        unit
+    )
+    frame_2s = DataFrame([5.0, 11.0], index=index_2s)
+    tm.assert_frame_equal(frame.resample("2s").mean(), frame_2s)
+
+    index_3s = DatetimeIndex(["1970-01-01 00:00:00"]).as_unit(unit)
+    frame_3s = DataFrame([7.0], index=index_3s)
+    tm.assert_frame_equal(frame.resample("3s").mean(), frame_3s)
+
+    tm.assert_frame_equal(frame.resample("60s").mean(), frame_3s)
+
+
+def test_resample_datetime_values(unit):
+    # GH 13119
+    # check that datetime dtype is preserved when NaT values are
+    # introduced by the resampling
+
+    dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)]
+    df = DataFrame({"timestamp": dates}, index=dates)
+    df.index = df.index.as_unit(unit)
+
+    exp = Series(
+        [datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)],
+        index=date_range("2016-01-15", periods=3, freq="2D").as_unit(unit),
+        name="timestamp",
+    )
+
+    res = df.resample("2D").first()["timestamp"]
+    tm.assert_series_equal(res, exp)
+    res = df["timestamp"].resample("2D").first()
+    tm.assert_series_equal(res, exp)
+
+
+def test_resample_apply_with_additional_args(unit):
+    # GH 14615
+    index = date_range("1/1/2000 00:00:00", "1/1/2000 00:13:00", freq="Min")
+    series = Series(range(len(index)), index=index)
+    series.index.name = "index"
+
+    def f(data, add_arg):
+        return np.mean(data) * add_arg
+
+    series.index = series.index.as_unit(unit)
+
+    multiplier = 10
+    result = series.resample("D").apply(f, multiplier)
+    expected = series.resample("D").mean().multiply(multiplier)
+    tm.assert_series_equal(result, expected)
+
+    # Testing as kwarg
+    result = series.resample("D").apply(f, add_arg=multiplier)
+    expected = series.resample("D").mean().multiply(multiplier)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_apply_with_additional_args2():
+    # Testing dataframe
+    def f(data, add_arg):
+        return np.mean(data) * add_arg
+
+    multiplier = 10
+
+    df = DataFrame({"A": 1, "B": 2}, index=date_range("2017", periods=10))
+    result = df.groupby("A").resample("D").agg(f, multiplier).astype(float)
+    expected = df.groupby("A").resample("D").mean().multiply(multiplier)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("k", [1, 2, 3])
+@pytest.mark.parametrize(
+    "n1, freq1, n2, freq2",
+    [
+        (30, "s", 0.5, "Min"),
+        (60, "s", 1, "Min"),
+        (3600, "s", 1, "h"),
+        (60, "Min", 1, "h"),
+        (21600, "s", 0.25, "D"),
+        (86400, "s", 1, "D"),
+        (43200, "s", 0.5, "D"),
+        (1440, "Min", 1, "D"),
+        (12, "h", 0.5, "D"),
+        (24, "h", 1, "D"),
+    ],
+)
+def test_resample_equivalent_offsets(n1, freq1, n2, freq2, k, unit):
+    # GH 24127
+    n1_ = n1 * k
+    n2_ = n2 * k
+    dti = date_range("1991-09-05", "1991-09-06", freq=freq1).as_unit(unit)
+    ser = Series(range(len(dti)), index=dti)
+
+    result1 = ser.resample(str(n1_) + freq1).mean()
+    result2 = ser.resample(str(n2_) + freq2).mean()
+    if freq2 == "D" and isinstance(result2.index.freq, Day):
+        # GH#55502 Day is no longer a Tick so no longer compares as equivalent,
+        #  but the actual values we expect should still match
+        result2.index.freq = to_offset(Timedelta(days=result2.index.freq.n))
+    tm.assert_series_equal(result1, result2)
+
+
+@pytest.mark.parametrize(
+    "first,last,freq,exp_first,exp_last",
+    [
+        ("19910905", "19920406", "D", "19910905", "19920407"),
+        ("19910905 00:00", "19920406 06:00", "D", "19910905", "19920407"),
+        ("19910905 06:00", "19920406 06:00", "h", "19910905 06:00", "19920406 07:00"),
+        ("19910906", "19920406", "ME", "19910831", "19920430"),
+        ("19910831", "19920430", "ME", "19910831", "19920531"),
+        ("1991-08", "1992-04", "ME", "19910831", "19920531"),
+    ],
+)
+def test_get_timestamp_range_edges(first, last, freq, exp_first, exp_last, unit):
+    first = Period(first)
+    first = first.to_timestamp(first.freq).as_unit(unit)
+    last = Period(last)
+    last = last.to_timestamp(last.freq).as_unit(unit)
+
+    exp_first = Timestamp(exp_first)
+    exp_last = Timestamp(exp_last)
+
+    freq = pd.tseries.frequencies.to_offset(freq)
+    result = _get_timestamp_range_edges(first, last, freq, unit="ns")
+    expected = (exp_first, exp_last)
+    assert result == expected
+
+
+@pytest.mark.parametrize("duplicates", [True, False])
+def test_resample_apply_product(duplicates, unit):
+    # GH 5586
+    index = date_range(start="2012-01-31", freq="ME", periods=12).as_unit(unit)
+
+    ts = Series(range(12), index=index)
+    df = DataFrame({"A": ts, "B": ts + 2})
+    if duplicates:
+        df.columns = ["A", "A"]
+
+    result = df.resample("QE").apply(np.prod)
+    expected = DataFrame(
+        np.array([[0, 24], [60, 210], [336, 720], [990, 1716]], dtype=np.int64),
+        index=DatetimeIndex(
+            ["2012-03-31", "2012-06-30", "2012-09-30", "2012-12-31"], freq="QE-DEC"
+        ).as_unit(unit),
+        columns=df.columns,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "first,last,freq_in,freq_out,exp_last",
+    [
+        (
+            "2020-03-28",
+            "2020-03-31",
+            "D",
+            "24h",
+            "2020-03-30 01:00",
+        ),  # includes transition into DST
+        (
+            "2020-03-28",
+            "2020-10-27",
+            "D",
+            "24h",
+            "2020-10-27 00:00",
+        ),  # includes transition into and out of DST
+        (
+            "2020-10-25",
+            "2020-10-27",
+            "D",
+            "24h",
+            "2020-10-26 23:00",
+        ),  # includes transition out of DST
+        (
+            "2020-03-28",
+            "2020-03-31",
+            "24h",
+            "D",
+            "2020-03-30 00:00",
+        ),  # same as above, but from 24H to D
+        ("2020-03-28", "2020-10-27", "24h", "D", "2020-10-27 00:00"),
+        ("2020-10-25", "2020-10-27", "24h", "D", "2020-10-26 00:00"),
+    ],
+)
+def test_resample_calendar_day_with_dst(
+    first: str, last: str, freq_in: str, freq_out: str, exp_last: str, unit
+):
+    # GH 35219
+    ts = Series(
+        1.0, date_range(first, last, freq=freq_in, tz="Europe/Amsterdam").as_unit(unit)
+    )
+    result = ts.resample(freq_out).ffill()
+    expected = Series(
+        1.0,
+        date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam").as_unit(unit),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max", "first", "last"])
+def test_resample_aggregate_functions_min_count(func, unit):
+    # GH#37768
+    index = date_range(start="2020", freq="ME", periods=3).as_unit(unit)
+    ser = Series([1, np.nan, np.nan], index)
+    result = getattr(ser.resample("QE"), func)(min_count=2)
+    expected = Series(
+        [np.nan],
+        index=DatetimeIndex(["2020-03-31"], freq="QE-DEC").as_unit(unit),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_unsigned_int(any_unsigned_int_numpy_dtype, unit):
+    # gh-43329
+    df = DataFrame(
+        index=date_range(start="2000-01-01", end="2000-01-03 23", freq="12h").as_unit(
+            unit
+        ),
+        columns=["x"],
+        data=[0, 1, 0] * 2,
+        dtype=any_unsigned_int_numpy_dtype,
+    )
+    df = df.loc[(df.index < "2000-01-02") | (df.index > "2000-01-03"), :]
+
+    result = df.resample("D").max()
+
+    expected = DataFrame(
+        [1, np.nan, 0],
+        columns=["x"],
+        index=date_range(start="2000-01-01", end="2000-01-03 23", freq="D").as_unit(
+            unit
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_long_rule_non_nano():
+    # https://github.com/pandas-dev/pandas/issues/51024
+    idx = date_range("0300-01-01", "2000-01-01", unit="s", freq="100YE")
+    ser = Series([1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5, 7, 1, 4, 2, 8, 5], index=idx)
+    result = ser.resample("200YE").mean()
+    expected_idx = DatetimeIndex(
+        np.array(
+            [
+                "0300-12-31",
+                "0500-12-31",
+                "0700-12-31",
+                "0900-12-31",
+                "1100-12-31",
+                "1300-12-31",
+                "1500-12-31",
+                "1700-12-31",
+                "1900-12-31",
+            ]
+        ).astype("datetime64[s]"),
+        freq="200YE-DEC",
+    )
+    expected = Series([1.0, 3.0, 6.5, 4.0, 3.0, 6.5, 4.0, 3.0, 6.5], index=expected_idx)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_empty_series_with_tz():
+    # GH#53664
+    df = DataFrame({"ts": [], "values": []}).astype(
+        {"ts": "datetime64[ns, Atlantic/Faroe]"}
+    )
+    rs = df.resample("2MS", on="ts", closed="left", label="left")
+    result = rs["values"].sum()
+
+    expected_idx = DatetimeIndex(
+        [], freq="2MS", name="ts", dtype="datetime64[ns, Atlantic/Faroe]"
+    )
+    expected = Series([], index=expected_idx, name="values", dtype="float64")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("freq", ["2M", "2m", "2Q", "2Q-SEP", "2q-sep", "1Y", "2Y-MAR"])
+def test_resample_M_Q_Y_raises(freq):
+    msg = f"Invalid frequency: {freq}"
+
+    s = Series(range(10), index=date_range("20130101", freq="D", periods=10))
+    with pytest.raises(ValueError, match=msg):
+        s.resample(freq).mean()
+
+
+@pytest.mark.parametrize("freq", ["2BM", "1bm", "1BQ", "2BQ-MAR", "2bq=-mar"])
+def test_resample_BM_BQ_raises(freq):
+    msg = f"Invalid frequency: {freq}"
+
+    s = Series(range(10), index=date_range("20130101", freq="D", periods=10))
+    with pytest.raises(ValueError, match=msg):
+        s.resample(freq).mean()
+
+
+@pytest.mark.parametrize(
+    "freq,freq_depr,data",
+    [
+        ("1W-SUN", "1w-sun", ["2013-01-06"]),
+        ("1D", "1d", ["2013-01-01"]),
+        ("1B", "1b", ["2013-01-01"]),
+        ("1C", "1c", ["2013-01-01"]),
+    ],
+)
+def test_resample_depr_lowercase_frequency(freq, freq_depr, data):
+    msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a future version."
+
+    s = Series(range(5), index=date_range("20130101", freq="h", periods=5, unit="ns"))
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        result = s.resample(freq_depr).mean()
+
+    exp_dti = DatetimeIndex(data=data, dtype="datetime64[ns]", freq=freq)
+    expected = Series(2.0, index=exp_dti)
+    tm.assert_series_equal(result, expected, check_freq=False)
+    # GH#41943 check_freq=False bc 24H and D no longer compare as equal
+
+
+def test_resample_ms_closed_right(unit):
+    # https://github.com/pandas-dev/pandas/issues/55271
+    dti = date_range(start="2020-01-31", freq="1min", periods=6000, unit=unit)
+    df = DataFrame({"ts": dti}, index=dti)
+    grouped = df.resample("MS", closed="right")
+    result = grouped.last()
+    exp_dti = DatetimeIndex(
+        [datetime(2020, 1, 1), datetime(2020, 2, 1)], freq="MS"
+    ).as_unit(unit)
+    expected = DataFrame(
+        {"ts": [datetime(2020, 2, 1), datetime(2020, 2, 4, 3, 59)]},
+        index=exp_dti,
+    ).astype(f"M8[{unit}]")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("freq", ["B", "C"])
+def test_resample_c_b_closed_right(freq: str, unit):
+    # https://github.com/pandas-dev/pandas/issues/55281
+    dti = date_range(start="2020-01-31", freq="1min", periods=6000, unit=unit)
+    df = DataFrame({"ts": dti}, index=dti)
+    grouped = df.resample(freq, closed="right")
+    result = grouped.last()
+
+    exp_dti = DatetimeIndex(
+        [
+            datetime(2020, 1, 30),
+            datetime(2020, 1, 31),
+            datetime(2020, 2, 3),
+            datetime(2020, 2, 4),
+        ],
+        freq=freq,
+    ).as_unit(unit)
+    expected = DataFrame(
+        {
+            "ts": [
+                datetime(2020, 1, 31),
+                datetime(2020, 2, 3),
+                datetime(2020, 2, 4),
+                datetime(2020, 2, 4, 3, 59),
+            ]
+        },
+        index=exp_dti,
+    ).astype(f"M8[{unit}]")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_b_55282(unit):
+    # https://github.com/pandas-dev/pandas/issues/55282
+    dti = date_range("2023-09-26", periods=6, freq="12h", unit=unit)
+    ser = Series([1, 2, 3, 4, 5, 6], index=dti)
+    result = ser.resample("B", closed="right", label="right").mean()
+
+    exp_dti = DatetimeIndex(
+        [
+            datetime(2023, 9, 26),
+            datetime(2023, 9, 27),
+            datetime(2023, 9, 28),
+            datetime(2023, 9, 29),
+        ],
+        freq="B",
+    ).as_unit(unit)
+    expected = Series(
+        [1.0, 2.5, 4.5, 6.0],
+        index=exp_dti,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+@pytest.mark.parametrize(
+    "tz",
+    [
+        None,
+        pytest.param(
+            "UTC",
+            marks=pytest.mark.xfail(
+                condition=is_platform_windows() and pa_version_under22p0,
+                reason="TODO: Set ARROW_TIMEZONE_DATABASE env var in CI",
+            ),
+        ),
+    ],
+)
+def test_arrow_timestamp_resample(tz):
+    # GH 56371
+    idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]")
+    if tz is not None:
+        idx = idx.dt.tz_localize(tz)
+    expected = Series(np.arange(5, dtype=np.float64), index=idx)
+    result = expected.resample("1D").mean()
+    tm.assert_series_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_arrow_timestamp_resample_keep_index_name():
+    # https://github.com/pandas-dev/pandas/issues/61222
+    idx = Series(date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]")
+    expected = Series(np.arange(5, dtype=np.float64), index=idx)
+    expected.index.name = "index_name"
+    result = expected.resample("1D").mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_unit_second_large_years():
+    # GH#57427
+    index = DatetimeIndex(
+        date_range(start=Timestamp("1950-01-01"), periods=10, freq="1000YS", unit="s")
+    )
+    ser = Series(1, index=index)
+    result = ser.resample("2000YS").sum()
+    expected = Series(2, index=index[::2])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("freq", ["1A", "2A-MAR"])
+def test_resample_A_raises(freq):
+    msg = f"Invalid frequency: {freq[1:]}"
+
+    s = Series(range(10), index=date_range("20130101", freq="D", periods=10))
+    with pytest.raises(ValueError, match=msg):
+        s.resample(freq).mean()
diff --git a/pandas/tests/resample/test_period_index.py b/pandas/tests/resample/test_period_index.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7e2613b823f75f83bd1fdb840d2e25df26c8123
--- /dev/null
+++ b/pandas/tests/resample/test_period_index.py
@@ -0,0 +1,1032 @@
+from datetime import (
+    datetime,
+    timezone,
+)
+import re
+import warnings
+import zoneinfo
+
+import dateutil
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.ccalendar import (
+    DAYS,
+    MONTHS,
+)
+from pandas._libs.tslibs.period import IncompatibleFrequency
+from pandas.errors import InvalidIndexError
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+    Timestamp,
+)
+import pandas._testing as tm
+from pandas.core.indexes.datetimes import date_range
+from pandas.core.indexes.period import (
+    Period,
+    PeriodIndex,
+    period_range,
+)
+from pandas.core.resample import _get_period_range_edges
+
+from pandas.tseries import offsets
+
+
+@pytest.fixture
+def simple_period_range_series():
+    """
+    Series with period range index and random data for test purposes.
+    """
+
+    def _simple_period_range_series(start, end, freq="D"):
+        with warnings.catch_warnings():
+            # suppress Period[B] deprecation warning
+            msg = "|".join(["Period with BDay freq", r"PeriodDtype\[B\] is deprecated"])
+            warnings.filterwarnings(
+                "ignore",
+                msg,
+                category=FutureWarning,
+            )
+            rng = period_range(start, end, freq=freq)
+        return Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    return _simple_period_range_series
+
+
+class TestPeriodIndex:
+    @pytest.mark.parametrize("freq", ["2D", "1h", "2h"])
+    def test_asfreq(self, frame_or_series, freq):
+        # GH 12884, 15944
+
+        obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5))
+
+        expected = obj.to_timestamp().resample(freq).asfreq()
+        result = obj.to_timestamp().resample(freq).asfreq()
+        tm.assert_almost_equal(result, expected)
+
+        start = obj.index[0].to_timestamp(how="start")
+        end = (obj.index[-1] + obj.index.freq).to_timestamp(how="start")
+        new_index = date_range(start=start, end=end, freq=freq, inclusive="left")
+        expected = obj.to_timestamp().reindex(new_index).to_period(freq)
+
+        result = obj.resample(freq).asfreq()
+        tm.assert_almost_equal(result, expected)
+
+        result = obj.resample(freq).asfreq().to_timestamp().to_period()
+        tm.assert_almost_equal(result, expected)
+
+    def test_asfreq_fill_value(self):
+        # test for fill value during resampling, issue 3715
+
+        index = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+        s = Series(range(len(index)), index=index)
+        new_index = date_range(
+            s.index[0].to_timestamp(how="start"),
+            (s.index[-1]).to_timestamp(how="start"),
+            freq="1h",
+        )
+        expected = s.to_timestamp().reindex(new_index, fill_value=4.0)
+        result = s.to_timestamp().resample("1h").asfreq(fill_value=4.0)
+        tm.assert_series_equal(result, expected)
+
+        frame = s.to_frame("value")
+        new_index = date_range(
+            frame.index[0].to_timestamp(how="start"),
+            (frame.index[-1]).to_timestamp(how="start"),
+            freq="1h",
+        )
+        expected = frame.to_timestamp().reindex(new_index, fill_value=3.0)
+        result = frame.to_timestamp().resample("1h").asfreq(fill_value=3.0)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("freq", ["h", "12h", "2D", "W"])
+    @pytest.mark.parametrize("kwargs", [{"on": "date"}, {"level": "d"}])
+    def test_selection(self, freq, kwargs):
+        # This is a bug, these should be implemented
+        # GH 14008
+        index = period_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+        rng = np.arange(len(index), dtype=np.int64)
+        df = DataFrame(
+            {"date": index, "a": rng},
+            index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]),
+        )
+        msg = (
+            "Resampling from level= or on= selection with a PeriodIndex is "
+            r"not currently supported, use \.set_index\(\.\.\.\) to "
+            "explicitly set index"
+        )
+        with pytest.raises(NotImplementedError, match=msg):
+            df.resample(freq, **kwargs)
+
+    @pytest.mark.parametrize("month", MONTHS)
+    @pytest.mark.parametrize("meth", ["ffill", "bfill"])
+    @pytest.mark.parametrize("conv", ["start", "end"])
+    @pytest.mark.parametrize(
+        ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M"), ("QE", "Q")]
+    )
+    def test_annual_upsample_cases(
+        self, offset, period, conv, meth, month, simple_period_range_series
+    ):
+        ts = simple_period_range_series("1/1/1990", "12/31/1990", freq=f"Y-{month}")
+        warn = FutureWarning if period == "B" else None
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(warn, match=msg):
+            result = getattr(ts.resample(period, convention=conv), meth)()
+            expected = result.to_timestamp(period, how=conv)
+            expected = expected.asfreq(offset, meth).to_period()
+        tm.assert_series_equal(result, expected)
+
+    def test_basic_downsample(self, simple_period_range_series):
+        ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M")
+        result = ts.resample("Y-DEC").mean()
+
+        expected = ts.groupby(ts.index.year).mean()
+        expected.index = period_range("1/1/1990", "6/30/1995", freq="Y-DEC")
+        tm.assert_series_equal(result, expected)
+
+        # this is ok
+        tm.assert_series_equal(ts.resample("Y-DEC").mean(), result)
+        tm.assert_series_equal(ts.resample("Y").mean(), result)
+
+    @pytest.mark.parametrize(
+        "rule,expected_error_msg",
+        [
+            ("Y-DEC", "<YearEnd: month=12>"),
+            ("Q-MAR", "<QuarterEnd: startingMonth=3>"),
+            ("M", "<MonthEnd>"),
+            ("W-THU", "<Week: weekday=3>"),
+        ],
+    )
+    def test_not_subperiod(self, simple_period_range_series, rule, expected_error_msg):
+        # These are incompatible period rules for resampling
+        ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="W-WED")
+        msg = (
+            "Frequency <Week: weekday=2> cannot be resampled to "
+            f"{expected_error_msg}, as they are not sub or super periods"
+        )
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            ts.resample(rule).mean()
+
+    @pytest.mark.parametrize("freq", ["D", "2D"])
+    def test_basic_upsample(self, freq, simple_period_range_series):
+        ts = simple_period_range_series("1/1/1990", "6/30/1995", freq="M")
+        result = ts.resample("Y-DEC").mean()
+
+        resampled = result.resample(freq, convention="end").ffill()
+        expected = result.to_timestamp(freq, how="end")
+        expected = expected.asfreq(freq, "ffill").to_period(freq)
+        tm.assert_series_equal(resampled, expected)
+
+    def test_upsample_with_limit(self):
+        rng = period_range("1/1/2000", periods=5, freq="Y")
+        ts = Series(np.random.default_rng(2).standard_normal(len(rng)), rng)
+
+        result = ts.resample("M", convention="end").ffill(limit=2)
+        expected = ts.asfreq("M").reindex(result.index, method="ffill", limit=2)
+        tm.assert_series_equal(result, expected)
+
+    def test_annual_upsample(self, simple_period_range_series):
+        ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="Y-DEC")
+        df = DataFrame({"a": ts})
+        rdf = df.resample("D").ffill()
+        exp = df["a"].resample("D").ffill()
+        tm.assert_series_equal(rdf["a"], exp)
+
+    def test_annual_upsample2(self):
+        rng = period_range("2000", "2003", freq="Y-DEC")
+        ts = Series([1, 2, 3, 4], index=rng)
+
+        result = ts.resample("M").ffill()
+        ex_index = period_range("2000-01", "2003-12", freq="M")
+
+        expected = ts.asfreq("M", how="start").reindex(ex_index, method="ffill")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("month", MONTHS)
+    @pytest.mark.parametrize("convention", ["start", "end"])
+    @pytest.mark.parametrize(
+        ("offset", "period"), [("D", "D"), ("B", "B"), ("ME", "M")]
+    )
+    def test_quarterly_upsample(
+        self, month, offset, period, convention, simple_period_range_series
+    ):
+        freq = f"Q-{month}"
+        ts = simple_period_range_series("1/1/1990", "12/31/1991", freq=freq)
+        warn = FutureWarning if period == "B" else None
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(warn, match=msg):
+            result = ts.resample(period, convention=convention).ffill()
+            expected = result.to_timestamp(period, how=convention)
+            expected = expected.asfreq(offset, "ffill").to_period()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("target", ["D", "B"])
+    @pytest.mark.parametrize("convention", ["start", "end"])
+    def test_monthly_upsample(self, target, convention, simple_period_range_series):
+        ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M")
+
+        warn = None if target == "D" else FutureWarning
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(warn, match=msg):
+            result = ts.resample(target, convention=convention).ffill()
+            expected = result.to_timestamp(target, how=convention)
+            expected = expected.asfreq(target, "ffill").to_period()
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_basic(self):
+        # GH3609
+        s = Series(
+            range(100),
+            index=date_range("20130101", freq="s", periods=100, name="idx"),
+            dtype="float",
+        )
+        s[10:30] = np.nan
+        index = PeriodIndex(
+            [Period("2013-01-01 00:00", "min"), Period("2013-01-01 00:01", "min")],
+            name="idx",
+        )
+        expected = Series([34.5, 79.5], index=index)
+        result = s.to_period().resample("min").mean()
+        tm.assert_series_equal(result, expected)
+        result2 = s.resample("min").mean().to_period()
+        tm.assert_series_equal(result2, expected)
+
+    @pytest.mark.parametrize(
+        "freq,expected_vals", [("M", [31, 29, 31, 9]), ("2M", [31 + 29, 31 + 9])]
+    )
+    def test_resample_count(self, freq, expected_vals):
+        # GH12774
+        series = Series(1, index=period_range(start="2000", periods=100))
+        result = series.resample(freq).count()
+        expected_index = period_range(
+            start="2000", freq=freq, periods=len(expected_vals)
+        )
+        expected = Series(expected_vals, index=expected_index)
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_same_freq(self, resample_method):
+        # GH12770
+        series = Series(range(3), index=period_range(start="2000", periods=3, freq="M"))
+        expected = series
+
+        result = getattr(series.resample("M"), resample_method)()
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_incompat_freq(self):
+        msg = (
+            "Frequency <MonthEnd> cannot be resampled to <Week: weekday=6>, "
+            "as they are not sub or super periods"
+        )
+        pi = period_range(start="2000", periods=3, freq="M")
+        ser = Series(range(3), index=pi)
+        rs = ser.resample("W")
+        with pytest.raises(IncompatibleFrequency, match=msg):
+            # TODO: should this raise at the resample call instead of at the mean call?
+            rs.mean()
+
+    @pytest.mark.parametrize(
+        "tz",
+        [
+            zoneinfo.ZoneInfo("America/Los_Angeles"),
+            dateutil.tz.gettz("America/Los_Angeles"),
+        ],
+    )
+    def test_with_local_timezone(self, tz):
+        # see gh-5430
+        local_timezone = tz
+
+        start = datetime(
+            year=2013, month=11, day=1, hour=0, minute=0, tzinfo=timezone.utc
+        )
+        # 1 day later
+        end = datetime(
+            year=2013, month=11, day=2, hour=0, minute=0, tzinfo=timezone.utc
+        )
+
+        index = date_range(start, end, freq="h", name="idx")
+
+        series = Series(1, index=index)
+        series = series.tz_convert(local_timezone)
+        msg = "Converting to PeriodArray/Index representation will drop timezone"
+        with tm.assert_produces_warning(UserWarning, match=msg):
+            result = series.resample("D").mean().to_period()
+
+        # Create the expected series
+        # Index is moved back a day with the timezone conversion from UTC to
+        # Pacific
+        expected_index = (
+            period_range(start=start, end=end, freq="D", name="idx") - offsets.Day()
+        )
+        expected = Series(1.0, index=expected_index)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "tz",
+        [
+            zoneinfo.ZoneInfo("America/Los_Angeles"),
+            dateutil.tz.gettz("America/Los_Angeles"),
+        ],
+    )
+    def test_resample_with_tz(self, tz, unit):
+        # GH 13238
+        dti = date_range("2017-01-01", periods=48, freq="h", tz=tz, unit=unit)
+        ser = Series(2, index=dti)
+        result = ser.resample("D").mean()
+        exp_dti = pd.DatetimeIndex(
+            ["2017-01-01", "2017-01-02"], tz=tz, freq="D"
+        ).as_unit(unit)
+        expected = Series(
+            2.0,
+            index=exp_dti,
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_nonexistent_time_bin_edge(self):
+        # GH 19375
+        index = date_range("2017-03-12", "2017-03-12 1:45:00", freq="15min")
+        s = Series(np.zeros(len(index)), index=index)
+        expected = s.tz_localize("US/Pacific")
+        expected.index = pd.DatetimeIndex(expected.index, freq="900s")
+        result = expected.resample("900s").mean()
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_nonexistent_time_bin_edge2(self):
+        # GH 23742
+        index = date_range(start="2017-10-10", end="2017-10-20", freq="1h")
+        index = index.tz_localize("UTC").tz_convert("America/Sao_Paulo")
+        df = DataFrame(data=list(range(len(index))), index=index)
+        result = df.groupby(pd.Grouper(freq="1D")).count()
+        expected = date_range(
+            start="2017-10-09",
+            end="2017-10-20",
+            freq="D",
+            tz="America/Sao_Paulo",
+            nonexistent="shift_forward",
+            inclusive="left",
+        )
+        tm.assert_index_equal(result.index, expected)
+
+    def test_resample_ambiguous_time_bin_edge(self):
+        # GH 10117
+        idx = date_range(
+            "2014-10-25 22:00:00",
+            "2014-10-26 00:30:00",
+            freq="30min",
+            tz="Europe/London",
+        )
+        expected = Series(np.zeros(len(idx)), index=idx)
+        result = expected.resample("30min").mean()
+        tm.assert_series_equal(result, expected)
+
+    def test_fill_method_and_how_upsample(self):
+        # GH2073
+        s = Series(
+            np.arange(9, dtype="int64"),
+            index=date_range("2010-01-01", periods=9, freq="QE"),
+        )
+        last = s.resample("ME").ffill()
+        both = s.resample("ME").ffill().resample("ME").last().astype("int64")
+        tm.assert_series_equal(last, both)
+
+    @pytest.mark.parametrize("day", DAYS)
+    @pytest.mark.parametrize("target", ["D", "B"])
+    @pytest.mark.parametrize("convention", ["start", "end"])
+    def test_weekly_upsample(self, day, target, convention, simple_period_range_series):
+        freq = f"W-{day}"
+        ts = simple_period_range_series("1/1/1990", "07/31/1990", freq=freq)
+        warn = None if target == "D" else FutureWarning
+        msg = r"PeriodDtype\[B\] is deprecated"
+        with tm.assert_produces_warning(warn, match=msg):
+            result = ts.resample(target, convention=convention).ffill()
+            expected = result.to_timestamp(target, how=convention)
+            expected = expected.asfreq(target, "ffill").to_period()
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_to_timestamps(self, simple_period_range_series):
+        ts = simple_period_range_series("1/1/1990", "12/31/1995", freq="M")
+
+        result = ts.resample("Y-DEC").mean().to_timestamp()
+        expected = ts.resample("Y-DEC").mean().to_timestamp(how="start")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("month", MONTHS)
+    def test_resample_to_quarterly(self, simple_period_range_series, month):
+        ts = simple_period_range_series("1990", "1992", freq=f"Y-{month}")
+        quar_ts = ts.resample(f"Q-{month}").ffill()
+
+        stamps = ts.to_timestamp("D", how="start")
+        qdates = period_range(
+            ts.index[0].asfreq("D", "start"),
+            ts.index[-1].asfreq("D", "end"),
+            freq=f"Q-{month}",
+        )
+
+        expected = stamps.reindex(qdates.to_timestamp("D", "s"), method="ffill")
+        expected.index = qdates
+
+        tm.assert_series_equal(quar_ts, expected)
+
+    @pytest.mark.parametrize("how", ["start", "end"])
+    def test_resample_to_quarterly_start_end(self, simple_period_range_series, how):
+        # conforms, but different month
+        ts = simple_period_range_series("1990", "1992", freq="Y-JUN")
+        result = ts.resample("Q-MAR", convention=how).ffill()
+        expected = ts.asfreq("Q-MAR", how=how)
+        expected = expected.reindex(result.index, method="ffill")
+
+        # FIXME: don't leave commented-out
+        # .to_timestamp('D')
+        # expected = expected.resample('Q-MAR').ffill()
+
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_fill_missing(self):
+        rng = PeriodIndex([2000, 2005, 2007, 2009], freq="Y")
+
+        s = Series(np.random.default_rng(2).standard_normal(4), index=rng)
+
+        stamps = s.to_timestamp()
+        filled = s.resample("Y").ffill()
+        expected = stamps.resample("YE").ffill().to_period("Y")
+        tm.assert_series_equal(filled, expected)
+
+    def test_cant_fill_missing_dups(self):
+        rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq="Y")
+        s = Series(np.random.default_rng(2).standard_normal(5), index=rng)
+        msg = "Reindexing only valid with uniquely valued Index objects"
+        with pytest.raises(InvalidIndexError, match=msg):
+            s.resample("Y").ffill()
+
+    def test_resample_5minute(self):
+        rng = period_range("1/1/2000", "1/5/2000", freq="min")
+        ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+        expected = ts.to_timestamp().resample("5min").mean()
+        result = ts.resample("5min").mean().to_timestamp()
+        tm.assert_series_equal(result, expected)
+
+        expected = expected.to_period("5min")
+        result = ts.resample("5min").mean()
+        tm.assert_series_equal(result, expected)
+        result = ts.resample("5min").mean().to_timestamp().to_period()
+        tm.assert_series_equal(result, expected)
+
+    def test_upsample_daily_business_daily(self, simple_period_range_series):
+        ts = simple_period_range_series("1/1/2000", "2/1/2000", freq="B")
+
+        result = ts.resample("D").asfreq()
+        expected = ts.asfreq("D").reindex(period_range("1/3/2000", "2/1/2000"))
+        tm.assert_series_equal(result, expected)
+
+        ts = simple_period_range_series("1/1/2000", "2/1/2000")
+        result = ts.resample("h", convention="s").asfreq()
+        exp_rng = period_range("1/1/2000", "2/1/2000 23:00", freq="h")
+        expected = ts.asfreq("h", how="s").reindex(exp_rng)
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_irregular_sparse(self):
+        dr = date_range(start="1/1/2012", freq="5min", periods=1000)
+        s = Series(np.array(100), index=dr)
+        # subset the data.
+        subset = s[:"2012-01-04 06:55"]
+
+        result = subset.resample("10min").apply(len)
+        expected = s.resample("10min").apply(len).loc[result.index]
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_weekly_all_na(self):
+        rng = date_range("1/1/2000", periods=10, freq="W-WED")
+        ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+        result = ts.resample("W-THU").asfreq()
+
+        assert result.isna().all()
+
+        result = ts.resample("W-THU").asfreq().ffill()[:-1]
+        expected = ts.asfreq("W-THU").ffill()
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_tz_localized(self, unit):
+        dr = date_range(start="2012-4-13", end="2012-5-1", unit=unit)
+        ts = Series(range(len(dr)), index=dr)
+
+        ts_utc = ts.tz_localize("UTC")
+        ts_local = ts_utc.tz_convert("America/Los_Angeles")
+
+        result = ts_local.resample("W").mean()
+
+        ts_local_naive = ts_local.copy()
+        ts_local_naive.index = ts_local_naive.index.tz_localize(None)
+
+        exp = ts_local_naive.resample("W").mean().tz_localize("America/Los_Angeles")
+        exp.index = pd.DatetimeIndex(exp.index, freq="W")
+
+        tm.assert_series_equal(result, exp)
+
+        # it works
+        result = ts_local.resample("D").mean()
+
+    def test_resample_tz_localized2(self):
+        # #2245
+        idx = date_range(
+            "2001-09-20 15:59", "2001-09-20 16:00", freq="min", tz="Australia/Sydney"
+        )
+        s = Series([1, 2], index=idx)
+
+        # GH#61985 changed this to behave like "B" rather than "24h"
+        result = s.resample("D", closed="right", label="right").mean()
+        ex_index = date_range("2001-09-20", periods=2, freq="D", tz="Australia/Sydney")
+        expected = Series([np.nan, 1.5], index=ex_index)
+
+        tm.assert_series_equal(result, expected)
+
+        # for good measure
+        msg = "Converting to PeriodArray/Index representation will drop timezone "
+        with tm.assert_produces_warning(UserWarning, match=msg):
+            result = s.resample("D").mean().to_period()
+        ex_index = period_range("2001-09-20", periods=1, freq="D")
+        expected = Series([1.5], index=ex_index)
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_tz_localized3(self):
+        # GH 6397
+        # comparing an offset that doesn't propagate tz's
+        rng = date_range("1/1/2011", periods=20000, freq="h")
+        rng = rng.tz_localize("EST")
+        ts = DataFrame(index=rng)
+        ts["first"] = np.random.default_rng(2).standard_normal(len(rng))
+        ts["second"] = np.cumsum(np.random.default_rng(2).standard_normal(len(rng)))
+        expected = DataFrame(
+            {
+                "first": ts.resample("YE").sum()["first"],
+                "second": ts.resample("YE").mean()["second"],
+            },
+            columns=["first", "second"],
+        )
+        result = (
+            ts.resample("YE")
+            .agg({"first": "sum", "second": "mean"})
+            .reindex(columns=["first", "second"])
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_closed_left_corner(self):
+        # #1465
+        s = Series(
+            np.random.default_rng(2).standard_normal(21),
+            index=date_range(start="1/1/2012 9:30", freq="1min", periods=21),
+        )
+        s.iloc[0] = np.nan
+
+        result = s.resample("10min", closed="left", label="right").mean()
+        exp = s[1:].resample("10min", closed="left", label="right").mean()
+        tm.assert_series_equal(result, exp)
+
+        result = s.resample("10min", closed="left", label="left").mean()
+        exp = s[1:].resample("10min", closed="left", label="left").mean()
+
+        ex_index = date_range(start="1/1/2012 9:30", freq="10min", periods=3)
+
+        tm.assert_index_equal(result.index, ex_index)
+        tm.assert_series_equal(result, exp)
+
+    def test_quarterly_resampling(self):
+        rng = period_range("2000Q1", periods=10, freq="Q-DEC")
+        ts = Series(np.arange(10), index=rng)
+
+        result = ts.resample("Y").mean()
+        exp = ts.to_timestamp().resample("YE").mean().to_period()
+        tm.assert_series_equal(result, exp)
+
+    def test_resample_weekly_bug_1726(self):
+        # 8/6/12 is a Monday
+        ind = date_range(start="8/6/2012", end="8/26/2012", freq="D")
+        n = len(ind)
+        data = [[x] * 5 for x in range(n)]
+        df = DataFrame(data, columns=["open", "high", "low", "close", "vol"], index=ind)
+
+        # it works!
+        df.resample("W-MON", closed="left", label="left").first()
+
+    def test_resample_with_dst_time_change(self):
+        # GH 15549
+        index = (
+            pd.DatetimeIndex([1457537600000000000, 1458059600000000000])
+            .tz_localize("UTC")
+            .tz_convert("America/Chicago")
+        )
+        df = DataFrame([1, 2], index=index)
+        result = df.resample("12h", closed="right", label="right").last().ffill()
+
+        expected_index_values = [
+            "2016-03-09 12:00:00-06:00",
+            "2016-03-10 00:00:00-06:00",
+            "2016-03-10 12:00:00-06:00",
+            "2016-03-11 00:00:00-06:00",
+            "2016-03-11 12:00:00-06:00",
+            "2016-03-12 00:00:00-06:00",
+            "2016-03-12 12:00:00-06:00",
+            "2016-03-13 00:00:00-06:00",
+            "2016-03-13 13:00:00-05:00",
+            "2016-03-14 01:00:00-05:00",
+            "2016-03-14 13:00:00-05:00",
+            "2016-03-15 01:00:00-05:00",
+            "2016-03-15 13:00:00-05:00",
+        ]
+        index = (
+            pd.to_datetime(expected_index_values, utc=True)
+            .tz_convert("America/Chicago")
+            .as_unit(index.unit)
+        )
+        index = pd.DatetimeIndex(index, freq="12h")
+        expected = DataFrame(
+            [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0],
+            index=index,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_resample_bms_2752(self):
+        # GH2753
+        timeseries = Series(
+            index=pd.bdate_range("20000101", "20000201"), dtype=np.float64
+        )
+        res1 = timeseries.resample("BMS").mean()
+        res2 = timeseries.resample("BMS").mean().resample("B").mean()
+        assert res1.index[0] == Timestamp("20000103")
+        assert res1.index[0] == res2.index[0]
+
+    @pytest.mark.xfail(reason="Commented out for more than 3 years. Should this work?")
+    def test_monthly_convention_span(self):
+        rng = period_range("2000-01", periods=3, freq="ME")
+        ts = Series(np.arange(3), index=rng)
+
+        # hacky way to get same thing
+        exp_index = period_range("2000-01-01", "2000-03-31", freq="D")
+        expected = ts.asfreq("D", how="end").reindex(exp_index)
+        expected = expected.fillna(method="bfill")
+
+        result = ts.resample("D").mean()
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "from_freq, to_freq", [("D", "ME"), ("QE", "YE"), ("ME", "QE"), ("D", "W")]
+    )
+    def test_default_right_closed_label(self, from_freq, to_freq):
+        idx = date_range(start="8/15/2012", periods=100, freq=from_freq)
+        df = DataFrame(np.random.default_rng(2).standard_normal((len(idx), 2)), idx)
+
+        resampled = df.resample(to_freq).mean()
+        tm.assert_frame_equal(
+            resampled, df.resample(to_freq, closed="right", label="right").mean()
+        )
+
+    @pytest.mark.parametrize(
+        "from_freq, to_freq",
+        [("D", "MS"), ("QE", "YS"), ("ME", "QS"), ("h", "D"), ("min", "h")],
+    )
+    def test_default_left_closed_label(self, from_freq, to_freq):
+        idx = date_range(start="8/15/2012", periods=100, freq=from_freq)
+        df = DataFrame(np.random.default_rng(2).standard_normal((len(idx), 2)), idx)
+
+        resampled = df.resample(to_freq).mean()
+        tm.assert_frame_equal(
+            resampled, df.resample(to_freq, closed="left", label="left").mean()
+        )
+
+    def test_all_values_single_bin(self):
+        # GH#2070
+        index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
+        ser = Series(np.random.default_rng(2).standard_normal(len(index)), index=index)
+
+        result = ser.resample("Y").mean()
+        tm.assert_almost_equal(result.iloc[0], ser.mean())
+
+    def test_evenly_divisible_with_no_extra_bins(self):
+        # GH#4076
+        # when the frequency is evenly divisible, sometimes extra bins
+
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((9, 3)),
+            index=date_range("2000-1-1", periods=9, unit="ns"),
+        )
+        result = df.resample("5D").mean()
+        expected = pd.concat([df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T
+        expected.index = pd.DatetimeIndex(
+            [Timestamp("2000-1-1"), Timestamp("2000-1-6")], dtype="M8[ns]", freq="5D"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_evenly_divisible_with_no_extra_bins2(self):
+        index = date_range(start="2001-5-4", periods=28)
+        df = DataFrame(
+            [
+                {
+                    "REST_KEY": 1,
+                    "DLY_TRN_QT": 80,
+                    "DLY_SLS_AMT": 90,
+                    "COOP_DLY_TRN_QT": 30,
+                    "COOP_DLY_SLS_AMT": 20,
+                }
+            ]
+            * 28
+            + [
+                {
+                    "REST_KEY": 2,
+                    "DLY_TRN_QT": 70,
+                    "DLY_SLS_AMT": 10,
+                    "COOP_DLY_TRN_QT": 50,
+                    "COOP_DLY_SLS_AMT": 20,
+                }
+            ]
+            * 28,
+            index=index.append(index),
+        ).sort_index()
+
+        index = date_range("2001-5-4", periods=4, freq="7D")
+        expected = DataFrame(
+            [
+                {
+                    "REST_KEY": 14,
+                    "DLY_TRN_QT": 14,
+                    "DLY_SLS_AMT": 14,
+                    "COOP_DLY_TRN_QT": 14,
+                    "COOP_DLY_SLS_AMT": 14,
+                }
+            ]
+            * 4,
+            index=index,
+        )
+        result = df.resample("7D").count()
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame(
+            [
+                {
+                    "REST_KEY": 21,
+                    "DLY_TRN_QT": 1050,
+                    "DLY_SLS_AMT": 700,
+                    "COOP_DLY_TRN_QT": 560,
+                    "COOP_DLY_SLS_AMT": 280,
+                }
+            ]
+            * 4,
+            index=index,
+        )
+        result = df.resample("7D").sum()
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("freq, period_mult", [("h", 24), ("12h", 2)])
+    def test_upsampling_ohlc(self, freq, period_mult):
+        # GH 13083
+        pi = period_range(start="2000", freq="D", periods=10)
+        s = Series(range(len(pi)), index=pi)
+        expected = s.to_timestamp().resample(freq).ohlc().to_period(freq)
+
+        # timestamp-based resampling doesn't include all sub-periods
+        # of the last original period, so extend accordingly:
+        new_index = period_range(start="2000", freq=freq, periods=period_mult * len(pi))
+        expected = expected.reindex(new_index)
+        result = s.resample(freq).ohlc()
+        tm.assert_frame_equal(result, expected)
+
+        result = s.resample(freq).ohlc().to_timestamp().to_period()
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "periods, values",
+        [
+            (
+                [
+                    pd.NaT,
+                    "1970-01-01 00:00:00",
+                    pd.NaT,
+                    "1970-01-01 00:00:02",
+                    "1970-01-01 00:00:03",
+                ],
+                [2, 3, 5, 7, 11],
+            ),
+            (
+                [
+                    pd.NaT,
+                    pd.NaT,
+                    "1970-01-01 00:00:00",
+                    pd.NaT,
+                    pd.NaT,
+                    pd.NaT,
+                    "1970-01-01 00:00:02",
+                    "1970-01-01 00:00:03",
+                    pd.NaT,
+                    pd.NaT,
+                ],
+                [1, 2, 3, 5, 6, 8, 7, 11, 12, 13],
+            ),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "freq, expected_values",
+        [
+            ("1s", [3, np.nan, 7, 11]),
+            ("2s", [3, (7 + 11) / 2]),
+            ("3s", [(3 + 7) / 2, 11]),
+        ],
+    )
+    def test_resample_with_nat(self, periods, values, freq, expected_values):
+        # GH 13224
+        index = PeriodIndex(periods, freq="s")
+        frame = DataFrame(values, index=index)
+
+        expected_index = period_range(
+            "1970-01-01 00:00:00", periods=len(expected_values), freq=freq
+        )
+        expected = DataFrame(expected_values, index=expected_index)
+        result = frame.resample(freq).mean()
+        tm.assert_frame_equal(result, expected)
+
+    def test_resample_with_only_nat(self):
+        # GH 13224
+        pi = PeriodIndex([pd.NaT] * 3, freq="s")
+        frame = DataFrame([2, 3, 5], index=pi, columns=["a"])
+        expected_index = PeriodIndex(data=[], freq=pi.freq)
+        expected = DataFrame(index=expected_index, columns=["a"], dtype="float64")
+        result = frame.resample("1s").mean()
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "start,end,start_freq,end_freq,offset",
+        [
+            ("19910905", "19910909 03:00", "h", "24h", "10h"),
+            ("19910905", "19910909 12:00", "h", "24h", "10h"),
+            ("19910905", "19910909 23:00", "h", "24h", "10h"),
+            ("19910905 10:00", "19910909", "h", "24h", "10h"),
+            ("19910905 10:00", "19910909 10:00", "h", "24h", "10h"),
+            ("19910905", "19910909 10:00", "h", "24h", "10h"),
+            ("19910905 12:00", "19910909", "h", "24h", "10h"),
+            ("19910905 12:00", "19910909 03:00", "h", "24h", "10h"),
+            ("19910905 12:00", "19910909 12:00", "h", "24h", "10h"),
+            ("19910905 12:00", "19910909 12:00", "h", "24h", "34h"),
+            ("19910905 12:00", "19910909 12:00", "h", "17h", "10h"),
+            ("19910905 12:00", "19910909 12:00", "h", "17h", "3h"),
+            ("19910905", "19910913 06:00", "2h", "24h", "10h"),
+            ("19910905", "19910905 01:39", "Min", "5Min", "3Min"),
+            ("19910905", "19910905 03:18", "2Min", "5Min", "3Min"),
+        ],
+    )
+    def test_resample_with_offset(self, start, end, start_freq, end_freq, offset):
+        # GH 23882 & 31809
+        pi = period_range(start, end, freq=start_freq)
+        ser = Series(np.arange(len(pi)), index=pi)
+        result = ser.resample(end_freq, offset=offset).mean()
+        result = result.to_timestamp(end_freq)
+
+        expected = ser.to_timestamp().resample(end_freq, offset=offset).mean()
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_with_offset_month(self):
+        # GH 23882 & 31809
+        pi = period_range("19910905 12:00", "19910909 1:00", freq="h")
+        ser = Series(np.arange(len(pi)), index=pi)
+        result = ser.resample("M").mean()
+        result = result.to_timestamp("M")
+        expected = ser.to_timestamp().resample("ME").mean()
+        # TODO: is non-tick the relevant characteristic? (GH 33815)
+        expected.index = expected.index._with_freq(None)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "first,last,freq,freq_to_offset,exp_first,exp_last",
+        [
+            ("19910905", "19920406", "D", "D", "19910905", "19920406"),
+            ("19910905 00:00", "19920406 06:00", "D", "D", "19910905", "19920406"),
+            (
+                "19910905 06:00",
+                "19920406 06:00",
+                "h",
+                "h",
+                "19910905 06:00",
+                "19920406 06:00",
+            ),
+            ("19910906", "19920406", "M", "ME", "1991-09", "1992-04"),
+            ("19910831", "19920430", "M", "ME", "1991-08", "1992-04"),
+            ("1991-08", "1992-04", "M", "ME", "1991-08", "1992-04"),
+        ],
+    )
+    def test_get_period_range_edges(
+        self, first, last, freq, freq_to_offset, exp_first, exp_last
+    ):
+        first = Period(first)
+        last = Period(last)
+
+        exp_first = Period(exp_first, freq=freq)
+        exp_last = Period(exp_last, freq=freq)
+
+        freq = pd.tseries.frequencies.to_offset(freq_to_offset)
+        result = _get_period_range_edges(first, last, freq)
+        expected = (exp_first, exp_last)
+        assert result == expected
+
+    def test_sum_min_count(self):
+        # GH 19974
+        index = date_range(start="2018", freq="ME", periods=6)
+        data = np.ones(6)
+        data[3:6] = np.nan
+        s = Series(data, index).to_period()
+        result = s.resample("Q").sum(min_count=1)
+        expected = Series(
+            [3.0, np.nan], index=PeriodIndex(["2018Q1", "2018Q2"], freq="Q-DEC")
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_resample_t_l_deprecated(self):
+        # GH#52536
+        msg_t = "Invalid frequency: T"
+        msg_l = "Invalid frequency: L"
+
+        with pytest.raises(ValueError, match=msg_l):
+            period_range(
+                "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="L"
+            )
+        rng_l = period_range(
+            "2020-01-01 00:00:00 00:00", "2020-01-01 00:00:00 00:01", freq="ms"
+        )
+        ser = Series(np.arange(len(rng_l)), index=rng_l)
+
+        with pytest.raises(ValueError, match=msg_t):
+            ser.resample("T").mean()
+
+    @pytest.mark.parametrize(
+        "freq, freq_depr, freq_depr_res",
+        [
+            ("2Q", "2q", "2y"),
+            ("2M", "2m", "2q"),
+        ],
+    )
+    def test_resample_lowercase_frequency_raises(self, freq, freq_depr, freq_depr_res):
+        msg = f"Invalid frequency: {freq_depr}"
+        with pytest.raises(ValueError, match=msg):
+            period_range("2020-01-01", "2020-08-01", freq=freq_depr)
+
+        msg = f"Invalid frequency: {freq_depr_res}"
+        rng = period_range("2020-01-01", "2020-08-01", freq=freq)
+        ser = Series(np.arange(len(rng)), index=rng)
+        with pytest.raises(ValueError, match=msg):
+            ser.resample(freq_depr_res).mean()
+
+    @pytest.mark.parametrize(
+        "offset",
+        [
+            offsets.MonthBegin(),
+            offsets.BYearBegin(2),
+            offsets.BusinessHour(2),
+        ],
+    )
+    def test_asfreq_invalid_period_offset(self, offset, frame_or_series):
+        # GH#55785
+        msg = re.escape(f"{offset} is not supported as period frequency")
+
+        obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5))
+        with pytest.raises(ValueError, match=msg):
+            obj.asfreq(freq=offset)
+
+
+@pytest.mark.parametrize(
+    "freq",
+    [
+        ("2ME"),
+        ("2QE"),
+        ("2QE-FEB"),
+        ("2YE"),
+        ("2YE-MAR"),
+        ("2me"),
+        ("2qe"),
+        ("2ye-mar"),
+    ],
+)
+def test_resample_frequency_ME_QE_YE_raises(frame_or_series, freq):
+    # GH#9586
+    msg = f"{freq[1:]} is not supported as period frequency"
+
+    obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5))
+    msg = f"Invalid frequency: {freq}"
+    with pytest.raises(ValueError, match=msg):
+        obj.resample(freq)
+
+
+def test_corner_cases_period(simple_period_range_series):
+    # miscellaneous test coverage
+    len0pts = simple_period_range_series("2007-01", "2010-05", freq="M")[:0]
+    # it works
+    result = len0pts.resample("Y-DEC").mean()
+    assert len(result) == 0
+
+
+@pytest.mark.parametrize("freq", ["2BME", "2CBME", "2SME", "2BQE-FEB", "2BYE-MAR"])
+def test_resample_frequency_invalid_freq(frame_or_series, freq):
+    # GH#9586
+    msg = f"Invalid frequency: {freq}"
+
+    obj = frame_or_series(range(5), index=period_range("2020-01-01", periods=5))
+    with pytest.raises(ValueError, match=msg):
+        obj.resample(freq)
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..36ef01178b3bc905bc4099fe6cc3e271f3872b5f
--- /dev/null
+++ b/pandas/tests/resample/test_resample_api.py
@@ -0,0 +1,1018 @@
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+from pandas._libs.tslibs import Day
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    NamedAgg,
+    Series,
+)
+import pandas._testing as tm
+from pandas.core.indexes.datetimes import date_range
+
+
+@pytest.fixture
+def dti():
+    return date_range(start=datetime(2005, 1, 1), end=datetime(2005, 1, 10), freq="Min")
+
+
+@pytest.fixture
+def _test_series(dti):
+    return Series(np.random.default_rng(2).random(len(dti)), dti)
+
+
+@pytest.fixture
+def test_frame(dti, _test_series):
+    return DataFrame({"A": _test_series, "B": _test_series, "C": np.arange(len(dti))})
+
+
+def test_str(_test_series):
+    r = _test_series.resample("h")
+    assert (
+        "DatetimeIndexResampler [freq=<Hour>, closed=left, "
+        "label=left, convention=start, origin=start_day]" in str(r)
+    )
+
+    r = _test_series.resample("h", origin="2000-01-01")
+    assert (
+        "DatetimeIndexResampler [freq=<Hour>, closed=left, "
+        "label=left, convention=start, origin=2000-01-01 00:00:00]" in str(r)
+    )
+
+
+def test_api(_test_series):
+    r = _test_series.resample("h")
+    result = r.mean()
+    assert isinstance(result, Series)
+    assert len(result) == 217
+
+    r = _test_series.to_frame().resample("h")
+    result = r.mean()
+    assert isinstance(result, DataFrame)
+    assert len(result) == 217
+
+
+def test_groupby_resample_api():
+    # GH 12448
+    # .groupby(...).resample(...) hitting warnings
+    # when appropriate
+    df = DataFrame(
+        {
+            "date": date_range(start="2016-01-01", periods=4, freq="W"),
+            "group": [1, 1, 2, 2],
+            "val": [5, 6, 7, 8],
+        }
+    ).set_index("date")
+
+    # replication step
+    i = (
+        date_range("2016-01-03", periods=8).tolist()
+        + date_range("2016-01-17", periods=8).tolist()
+    )
+    index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i], names=["group", "date"])
+    expected = DataFrame({"val": [5] * 7 + [6] + [7] * 7 + [8]}, index=index)
+    result = df.groupby("group").apply(lambda x: x.resample("1D").ffill())[["val"]]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_on_api():
+    # GH 15021
+    # .groupby(...).resample(on=...) results in an unexpected
+    # keyword warning.
+    df = DataFrame(
+        {
+            "key": ["A", "B"] * 5,
+            "dates": date_range("2016-01-01", periods=10),
+            "values": np.random.default_rng(2).standard_normal(10),
+        }
+    )
+
+    expected = df.set_index("dates").groupby("key").resample("D").mean()
+    result = df.groupby("key").resample("D", on="dates").mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_group_keys():
+    df = DataFrame({"A": 1, "B": 2}, index=date_range("2000", periods=10, unit="ns"))
+    expected = df.copy()
+
+    # group_keys=False
+    g = df.resample("5D", group_keys=False)
+    result = g.apply(lambda x: x)
+    tm.assert_frame_equal(result, expected)
+
+    # group_keys defaults to False
+    g = df.resample("5D")
+    result = g.apply(lambda x: x)
+    tm.assert_frame_equal(result, expected)
+
+    # group_keys=True
+    expected.index = pd.MultiIndex.from_arrays(
+        [
+            pd.to_datetime(["2000-01-01", "2000-01-06"]).as_unit("ns").repeat(5),
+            expected.index,
+        ]
+    )
+    g = df.resample("5D", group_keys=True)
+    result = g.apply(lambda x: x)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_pipe(test_frame, _test_series):
+    # GH17905
+
+    # series
+    r = _test_series.resample("h")
+    expected = r.max() - r.mean()
+    result = r.pipe(lambda x: x.max() - x.mean())
+    tm.assert_series_equal(result, expected)
+
+    # dataframe
+    r = test_frame.resample("h")
+    expected = r.max() - r.mean()
+    result = r.pipe(lambda x: x.max() - x.mean())
+    tm.assert_frame_equal(result, expected)
+
+
+def test_getitem(test_frame):
+    r = test_frame.resample("h")
+    tm.assert_index_equal(r._selected_obj.columns, test_frame.columns)
+
+    r = test_frame.resample("h")["B"]
+    assert r._selected_obj.name == test_frame.columns[1]
+
+    # technically this is allowed
+    r = test_frame.resample("h")["A", "B"]
+    tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]])
+
+    r = test_frame.resample("h")["A", "B"]
+    tm.assert_index_equal(r._selected_obj.columns, test_frame.columns[[0, 1]])
+
+
+@pytest.mark.parametrize("key", [["D"], ["A", "D"]])
+def test_select_bad_cols(key, test_frame):
+    g = test_frame.resample("h")
+    # 'A' should not be referenced as a bad column...
+    # will have to rethink regex if you change message!
+    msg = r"^\"Columns not found: 'D'\"$"
+    with pytest.raises(KeyError, match=msg):
+        g[key]
+
+
+def test_attribute_access(test_frame):
+    r = test_frame.resample("h")
+    tm.assert_series_equal(r.A.sum(), r["A"].sum())
+
+
+@pytest.mark.parametrize("attr", ["groups", "ngroups", "indices"])
+def test_api_compat_before_use(attr):
+    # make sure that we are setting the binner
+    # on these attributes
+    rng = date_range("1/1/2012", periods=100, freq="s")
+    ts = Series(np.arange(len(rng)), index=rng)
+    rs = ts.resample("30s")
+
+    # before use
+    getattr(rs, attr)
+
+    # after grouper is initialized is ok
+    rs.mean()
+    getattr(rs, attr)
+
+
+def tests_raises_on_nuisance(test_frame, using_infer_string):
+    df = test_frame
+    df["D"] = "foo"
+    r = df.resample("h")
+    result = r[["A", "B"]].mean()
+    expected = pd.concat([r.A.mean(), r.B.mean()], axis=1)
+    tm.assert_frame_equal(result, expected)
+
+    expected = r[["A", "B", "C"]].mean()
+    msg = re.escape("agg function failed [how->mean,dtype->")
+    if using_infer_string:
+        msg = "dtype 'str' does not support operation 'mean'"
+    with pytest.raises(TypeError, match=msg):
+        r.mean()
+    result = r.mean(numeric_only=True)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_downsample_but_actually_upsampling():
+    # this is reindex / asfreq
+    rng = date_range("1/1/2012", periods=100, freq="s")
+    ts = Series(np.arange(len(rng), dtype="int64"), index=rng)
+    result = ts.resample("20s").asfreq()
+    expected = Series(
+        [0, 20, 40, 60, 80],
+        index=date_range("2012-01-01 00:00:00", freq="20s", periods=5),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_combined_up_downsampling_of_irregular():
+    # since we are really doing an operation like this
+    # ts2.resample('2s').mean().ffill()
+    # preserve these semantics
+
+    rng = date_range("1/1/2012", periods=100, freq="s", unit="ns")
+    ts = Series(np.arange(len(rng)), index=rng)
+    ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]]
+
+    result = ts2.resample("2s").mean().ffill()
+    expected = Series(
+        [
+            0.5,
+            2.5,
+            5.0,
+            7.0,
+            7.0,
+            11.0,
+            11.0,
+            15.0,
+            16.0,
+            16.0,
+            16.0,
+            16.0,
+            25.0,
+            25.0,
+            25.0,
+            30.0,
+        ],
+        index=pd.DatetimeIndex(
+            [
+                "2012-01-01 00:00:00",
+                "2012-01-01 00:00:02",
+                "2012-01-01 00:00:04",
+                "2012-01-01 00:00:06",
+                "2012-01-01 00:00:08",
+                "2012-01-01 00:00:10",
+                "2012-01-01 00:00:12",
+                "2012-01-01 00:00:14",
+                "2012-01-01 00:00:16",
+                "2012-01-01 00:00:18",
+                "2012-01-01 00:00:20",
+                "2012-01-01 00:00:22",
+                "2012-01-01 00:00:24",
+                "2012-01-01 00:00:26",
+                "2012-01-01 00:00:28",
+                "2012-01-01 00:00:30",
+            ],
+            dtype="datetime64[ns]",
+            freq="2s",
+        ),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_transform_series(_test_series):
+    r = _test_series.resample("20min")
+    expected = _test_series.groupby(pd.Grouper(freq="20min")).transform("mean")
+    result = r.transform("mean")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("on", [None, "date"])
+def test_transform_frame(on):
+    # GH#47079
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+    index.name = "date"
+    df = DataFrame(
+        np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index
+    )
+    expected = df.groupby(pd.Grouper(freq="20min")).transform("mean")
+    if on == "date":
+        # Move date to being a column; result will then have a RangeIndex
+        expected = expected.reset_index(drop=True)
+        df = df.reset_index()
+
+    r = df.resample("20min", on=on)
+    result = r.transform("mean")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda x: x.resample("20min", group_keys=False),
+        lambda x: x.groupby(pd.Grouper(freq="20min"), group_keys=False),
+    ],
+    ids=["resample", "groupby"],
+)
+def test_apply_without_aggregation(func, _test_series):
+    # both resample and groupby should work w/o aggregation
+    t = func(_test_series)
+    result = t.apply(lambda x: x)
+    tm.assert_series_equal(result, _test_series)
+
+
+def test_apply_without_aggregation2(_test_series):
+    grouped = _test_series.to_frame(name="foo").resample("20min", group_keys=False)
+    result = grouped["foo"].apply(lambda x: x)
+    tm.assert_series_equal(result, _test_series.rename("foo"))
+
+
+def test_agg_consistency():
+    # make sure that we are consistent across
+    # similar aggregations with and w/o selection list
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((1000, 3)),
+        index=date_range("1/1/2012", freq="s", periods=1000),
+        columns=["A", "B", "C"],
+    )
+
+    r = df.resample("3min")
+
+    msg = r"Label\(s\) \['r1', 'r2'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        r.agg({"r1": "mean", "r2": "sum"})
+
+
+def test_agg_consistency_int_str_column_mix():
+    # GH#39025
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((1000, 2)),
+        index=date_range("1/1/2012", freq="s", periods=1000),
+        columns=[1, "a"],
+    )
+
+    r = df.resample("3min")
+
+    msg = r"Label\(s\) \[2, 'b'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        r.agg({2: "mean", "b": "sum"})
+
+
+# TODO(GH#14008): once GH 14008 is fixed, move these tests into
+# `Base` test class
+
+
+@pytest.fixture
+def index():
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D", unit="ns")
+    index.name = "date"
+    return index
+
+
+@pytest.fixture
+def df(index):
+    frame = DataFrame(
+        np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index
+    )
+    return frame
+
+
+@pytest.fixture
+def df_col(df):
+    return df.reset_index()
+
+
+@pytest.fixture
+def df_mult(df_col, index):
+    df_mult = df_col.copy()
+    df_mult.index = pd.MultiIndex.from_arrays(
+        [range(10), index], names=["index", "date"]
+    )
+    return df_mult
+
+
+@pytest.fixture
+def a_mean(df):
+    return df.resample("2D")["A"].mean()
+
+
+@pytest.fixture
+def a_std(df):
+    return df.resample("2D")["A"].std()
+
+
+@pytest.fixture
+def a_sum(df):
+    return df.resample("2D")["A"].sum()
+
+
+@pytest.fixture
+def b_mean(df):
+    return df.resample("2D")["B"].mean()
+
+
+@pytest.fixture
+def b_std(df):
+    return df.resample("2D")["B"].std()
+
+
+@pytest.fixture
+def b_sum(df):
+    return df.resample("2D")["B"].sum()
+
+
+@pytest.fixture
+def df_resample(df):
+    return df.resample("2D")
+
+
+@pytest.fixture
+def df_col_resample(df_col):
+    return df_col.resample("2D", on="date")
+
+
+@pytest.fixture
+def df_mult_resample(df_mult):
+    return df_mult.resample("2D", level="date")
+
+
+@pytest.fixture
+def df_grouper_resample(df):
+    return df.groupby(pd.Grouper(freq="2D"))
+
+
+@pytest.fixture(
+    params=["df_resample", "df_col_resample", "df_mult_resample", "df_grouper_resample"]
+)
+def cases(request):
+    return request.getfixturevalue(request.param)
+
+
+def test_agg_mixed_column_aggregation(cases, a_mean, a_std, b_mean, b_std, request):
+    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "<lambda_0>"]])
+    # "date" is an index and a column, so get included in the agg
+    if "df_mult" in request.node.callspec.id:
+        date_mean = cases["date"].mean()
+        date_std = cases["date"].std()
+        expected = pd.concat([date_mean, date_std, expected], axis=1)
+        expected.columns = pd.MultiIndex.from_product(
+            [["date", "A", "B"], ["mean", "<lambda_0>"]]
+        )
+    result = cases.aggregate([np.mean, lambda x: np.std(x, ddof=1)])
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"func": {"A": np.mean, "B": lambda x: np.std(x, ddof=1)}},
+        {"A": ("A", np.mean), "B": ("B", lambda x: np.std(x, ddof=1))},
+        {"A": NamedAgg("A", np.mean), "B": NamedAgg("B", lambda x: np.std(x, ddof=1))},
+    ],
+)
+def test_agg_both_mean_std_named_result(cases, a_mean, b_std, agg):
+    expected = pd.concat([a_mean, b_std], axis=1)
+    result = cases.aggregate(**agg)
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def test_agg_both_mean_std_dict_of_list(cases, a_mean, a_std):
+    expected = pd.concat([a_mean, a_std], axis=1)
+    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
+    result = cases.aggregate({"A": ["mean", "std"]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "agg", [{"func": ["mean", "sum"]}, {"mean": "mean", "sum": "sum"}]
+)
+def test_agg_both_mean_sum(cases, a_mean, a_sum, agg):
+    expected = pd.concat([a_mean, a_sum], axis=1)
+    expected.columns = ["mean", "sum"]
+    result = cases["A"].aggregate(**agg)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"A": {"mean": "mean", "sum": "sum"}},
+        {
+            "A": {"mean": "mean", "sum": "sum"},
+            "B": {"mean2": "mean", "sum2": "sum"},
+        },
+    ],
+)
+def test_agg_dict_of_dict_specificationerror(cases, agg):
+    msg = "nested renamer is not supported"
+    with pytest.raises(pd.errors.SpecificationError, match=msg):
+        cases.aggregate(agg)
+
+
+def test_agg_dict_of_lists(cases, a_mean, a_std, b_mean, b_std):
+    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected.columns = pd.MultiIndex.from_tuples(
+        [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
+    )
+    result = cases.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"func": {"A": np.sum, "B": lambda x: np.std(x, ddof=1)}},
+        {"A": ("A", np.sum), "B": ("B", lambda x: np.std(x, ddof=1))},
+        {"A": NamedAgg("A", np.sum), "B": NamedAgg("B", lambda x: np.std(x, ddof=1))},
+    ],
+)
+def test_agg_with_lambda(cases, agg):
+    # passed lambda
+    rcustom = cases["B"].apply(lambda x: np.std(x, ddof=1))
+    expected = pd.concat([cases["A"].sum(), rcustom], axis=1)
+    result = cases.agg(**agg)
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+@pytest.mark.parametrize(
+    "agg",
+    [
+        {"func": {"result1": np.sum, "result2": np.mean}},
+        {"A": ("result1", np.sum), "B": ("result2", np.mean)},
+        {"A": NamedAgg("result1", np.sum), "B": NamedAgg("result2", np.mean)},
+    ],
+)
+def test_agg_no_column(cases, agg):
+    msg = r"Label\(s\) \['result1', 'result2'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        cases[["A", "B"]].agg(**agg)
+
+
+@pytest.mark.parametrize(
+    "cols, agg",
+    [
+        [None, {"A": ["sum", "std"], "B": ["mean", "std"]}],
+        [
+            [
+                "A",
+                "B",
+            ],
+            {"A": ["sum", "std"], "B": ["mean", "std"]},
+        ],
+    ],
+)
+def test_agg_specificationerror_nested(cases, cols, agg, a_sum, a_std, b_mean, b_std):
+    # agg with different hows
+    # equivalent of using a selection list / or not
+    expected = pd.concat([a_sum, a_std, b_mean, b_std], axis=1)
+    expected.columns = pd.MultiIndex.from_tuples(
+        [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")]
+    )
+    if cols is not None:
+        obj = cases[cols]
+    else:
+        obj = cases
+
+    result = obj.agg(agg)
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+@pytest.mark.parametrize(
+    "agg", [{"A": ["sum", "std"]}, {"A": ["sum", "std"], "B": ["mean", "std"]}]
+)
+def test_agg_specificationerror_series(cases, agg):
+    msg = "nested renamer is not supported"
+
+    # series like aggs
+    with pytest.raises(pd.errors.SpecificationError, match=msg):
+        cases["A"].agg(agg)
+
+
+def test_agg_specificationerror_invalid_names(cases):
+    # errors
+    # invalid names in the agg specification
+    msg = r"Label\(s\) \['B'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        cases[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
+
+
+def test_agg_nested_dicts():
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+    index.name = "date"
+    df = DataFrame(
+        np.random.default_rng(2).random((10, 2)), columns=list("AB"), index=index
+    )
+    df_col = df.reset_index()
+    df_mult = df_col.copy()
+    df_mult.index = pd.MultiIndex.from_arrays(
+        [range(10), df.index], names=["index", "date"]
+    )
+    r = df.resample("2D")
+    cases = [
+        r,
+        df_col.resample("2D", on="date"),
+        df_mult.resample("2D", level="date"),
+        df.groupby(pd.Grouper(freq="2D")),
+    ]
+
+    msg = "nested renamer is not supported"
+    for t in cases:
+        with pytest.raises(pd.errors.SpecificationError, match=msg):
+            t.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}})
+
+    for t in cases:
+        with pytest.raises(pd.errors.SpecificationError, match=msg):
+            t[["A", "B"]].agg(
+                {"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}}
+            )
+
+        with pytest.raises(pd.errors.SpecificationError, match=msg):
+            t.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}})
+
+
+def test_try_aggregate_non_existing_column():
+    # GH 16766
+    data = [
+        {"dt": datetime(2017, 6, 1, 0), "x": 1.0, "y": 2.0},
+        {"dt": datetime(2017, 6, 1, 1), "x": 2.0, "y": 2.0},
+        {"dt": datetime(2017, 6, 1, 2), "x": 3.0, "y": 1.5},
+    ]
+    df = DataFrame(data).set_index("dt")
+
+    # Error as we don't have 'z' column
+    msg = r"Label\(s\) \['z'\] do not exist"
+    with pytest.raises(KeyError, match=msg):
+        df.resample("30min").agg({"x": ["mean"], "y": ["median"], "z": ["sum"]})
+
+
+def test_agg_list_like_func_with_args():
+    # 50624
+    df = DataFrame(
+        {"x": [1, 2, 3]}, index=date_range("2020-01-01", periods=3, freq="D")
+    )
+
+    def foo1(x, a=1, c=0):
+        return x + a + c
+
+    def foo2(x, b=2, c=0):
+        return x + b + c
+
+    msg = r"foo1\(\) got an unexpected keyword argument 'b'"
+    with pytest.raises(TypeError, match=msg):
+        df.resample("D").agg([foo1, foo2], 3, b=3, c=4)
+
+    result = df.resample("D").agg([foo1, foo2], 3, c=4)
+    expected = DataFrame(
+        [[8, 8], [9, 9], [10, 10]],
+        index=date_range("2020-01-01", periods=3, freq="D"),
+        columns=pd.MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_selection_api_validation():
+    # GH 13500
+    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
+
+    rng = np.arange(len(index), dtype=np.int64)
+    df = DataFrame(
+        {"date": index, "a": rng},
+        index=pd.MultiIndex.from_arrays([rng, index], names=["v", "d"]),
+    )
+    df_exp = DataFrame({"a": rng}, index=index)
+
+    # non DatetimeIndex
+    msg = (
+        "Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, "
+        "but got an instance of 'Index'"
+    )
+    with pytest.raises(TypeError, match=msg):
+        df.resample("2D", level="v")
+
+    msg = "The Grouper cannot specify both a key and a level!"
+    with pytest.raises(ValueError, match=msg):
+        df.resample("2D", on="date", level="d")
+
+    msg = "unhashable type: 'list'"
+    with pytest.raises(TypeError, match=msg):
+        df.resample("2D", on=["a", "date"])
+
+    msg = r"\"Level \['a', 'date'\] not found\""
+    with pytest.raises(KeyError, match=msg):
+        df.resample("2D", level=["a", "date"])
+
+    # upsampling not allowed
+    msg = (
+        "Upsampling from level= or on= selection is not supported, use "
+        r"\.set_index\(\.\.\.\) to explicitly set index to datetime-like"
+    )
+    with pytest.raises(ValueError, match=msg):
+        df.resample("2D", level="d").asfreq()
+    with pytest.raises(ValueError, match=msg):
+        df.resample("2D", on="date").asfreq()
+
+    exp = df_exp.resample("2D").sum()
+    exp.index.name = "date"
+    result = df.resample("2D", on="date").sum()
+    tm.assert_frame_equal(exp, result)
+
+    exp.index.name = "d"
+    with pytest.raises(
+        TypeError, match="datetime64 type does not support operation 'sum'"
+    ):
+        df.resample("2D", level="d").sum()
+    result = df.resample("2D", level="d").sum(numeric_only=True)
+    tm.assert_frame_equal(exp, result)
+
+
+@pytest.mark.parametrize(
+    "col_name", ["t2", "t2x", "t2q", "T_2M", "t2p", "t2m", "t2m1", "T2M"]
+)
+def test_agg_with_datetime_index_list_agg_func(col_name):
+    # GH 22660
+    # The parametrized column names would get converted to dates by our
+    # date parser. Some would result in OutOfBoundsError (ValueError) while
+    # others would result in OverflowError when passed into Timestamp.
+    # We catch these errors and move on to the correct branch.
+    df = DataFrame(
+        list(range(200)),
+        index=date_range(
+            start="2017-01-01", freq="15min", periods=200, tz="Europe/Berlin"
+        ),
+        columns=[col_name],
+    )
+    result = df.resample("1D").aggregate(["mean"])
+    expected = DataFrame(
+        [47.5, 143.5, 195.5],
+        index=date_range(start="2017-01-01", freq="D", periods=3, tz="Europe/Berlin"),
+        columns=pd.MultiIndex(levels=[[col_name], ["mean"]], codes=[[0], [0]]),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_agg_readonly():
+    # GH#31710 cython needs to allow readonly data
+    index = date_range("2020-01-01", "2020-01-02", freq="1h", unit="ns")
+    arr = np.zeros_like(index)
+    arr.setflags(write=False)
+
+    ser = Series(arr, index=index)
+    rs = ser.resample("1D")
+
+    expected = Series([pd.Timestamp(0), pd.Timestamp(0)], index=index[::24])
+    expected.index.freq = Day(1)  # GH#41943 no longer equivalent to 24h
+
+    result = rs.agg("last")
+    tm.assert_series_equal(result, expected)
+
+    result = rs.agg("first")
+    tm.assert_series_equal(result, expected)
+
+    result = rs.agg("max")
+    tm.assert_series_equal(result, expected)
+
+    result = rs.agg("min")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start,end,freq,data,resample_freq,origin,closed,exp_data,exp_end,exp_periods",
+    [
+        (
+            "2000-10-01 23:30:00",
+            "2000-10-02 00:26:00",
+            "7min",
+            [0, 3, 6, 9, 12, 15, 18, 21, 24],
+            "17min",
+            "end",
+            None,
+            [0, 18, 27, 63],
+            "20001002 00:26:00",
+            4,
+        ),
+        (
+            "20200101 8:26:35",
+            "20200101 9:31:58",
+            "77s",
+            [1] * 51,
+            "7min",
+            "end",
+            "right",
+            [1, 6, 5, 6, 5, 6, 5, 6, 5, 6],
+            "2020-01-01 09:30:45",
+            10,
+        ),
+        (
+            "2000-10-01 23:30:00",
+            "2000-10-02 00:26:00",
+            "7min",
+            [0, 3, 6, 9, 12, 15, 18, 21, 24],
+            "17min",
+            "end",
+            "left",
+            [0, 18, 27, 39, 24],
+            "20001002 00:43:00",
+            5,
+        ),
+        (
+            "2000-10-01 23:30:00",
+            "2000-10-02 00:26:00",
+            "7min",
+            [0, 3, 6, 9, 12, 15, 18, 21, 24],
+            "17min",
+            "end_day",
+            None,
+            [3, 15, 45, 45],
+            "2000-10-02 00:29:00",
+            4,
+        ),
+    ],
+)
+def test_end_and_end_day_origin(
+    start,
+    end,
+    freq,
+    data,
+    resample_freq,
+    origin,
+    closed,
+    exp_data,
+    exp_end,
+    exp_periods,
+):
+    rng = date_range(start, end, freq=freq)
+    ts = Series(data, index=rng)
+
+    res = ts.resample(resample_freq, origin=origin, closed=closed).sum()
+    expected = Series(
+        exp_data,
+        index=date_range(end=exp_end, freq=resample_freq, periods=exp_periods),
+    )
+
+    tm.assert_series_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    # expected_data is a string when op raises a ValueError
+    "method, numeric_only, expected_data",
+    [
+        ("sum", True, {"num": [25]}),
+        ("sum", False, {"cat": ["cat_1cat_2"], "num": [25]}),
+        ("sum", lib.no_default, {"cat": ["cat_1cat_2"], "num": [25]}),
+        ("prod", True, {"num": [100]}),
+        ("prod", False, "can't multiply sequence"),
+        ("prod", lib.no_default, "can't multiply sequence"),
+        ("min", True, {"num": [5]}),
+        ("min", False, {"cat": ["cat_1"], "num": [5]}),
+        ("min", lib.no_default, {"cat": ["cat_1"], "num": [5]}),
+        ("max", True, {"num": [20]}),
+        ("max", False, {"cat": ["cat_2"], "num": [20]}),
+        ("max", lib.no_default, {"cat": ["cat_2"], "num": [20]}),
+        ("first", True, {"num": [5]}),
+        ("first", False, {"cat": ["cat_1"], "num": [5]}),
+        ("first", lib.no_default, {"cat": ["cat_1"], "num": [5]}),
+        ("last", True, {"num": [20]}),
+        ("last", False, {"cat": ["cat_2"], "num": [20]}),
+        ("last", lib.no_default, {"cat": ["cat_2"], "num": [20]}),
+        ("mean", True, {"num": [12.5]}),
+        ("mean", False, "Could not convert"),
+        ("mean", lib.no_default, "Could not convert"),
+        ("median", True, {"num": [12.5]}),
+        ("median", False, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
+        ("median", lib.no_default, r"Cannot convert \['cat_1' 'cat_2'\] to numeric"),
+        ("std", True, {"num": [10.606601717798213]}),
+        ("std", False, "could not convert string to float"),
+        ("std", lib.no_default, "could not convert string to float"),
+        ("var", True, {"num": [112.5]}),
+        ("var", False, "could not convert string to float"),
+        ("var", lib.no_default, "could not convert string to float"),
+        ("sem", True, {"num": [7.5]}),
+        ("sem", False, "could not convert string to float"),
+        ("sem", lib.no_default, "could not convert string to float"),
+    ],
+)
+def test_frame_downsample_method(
+    method, numeric_only, expected_data, using_infer_string
+):
+    # GH#46442 test if `numeric_only` behave as expected for DataFrameGroupBy
+
+    index = date_range("2018-01-01", periods=2, freq="D")
+    expected_index = date_range("2018-12-31", periods=1, freq="YE")
+    df = DataFrame({"cat": ["cat_1", "cat_2"], "num": [5, 20]}, index=index)
+    resampled = df.resample("YE")
+    if numeric_only is lib.no_default:
+        kwargs = {}
+    else:
+        kwargs = {"numeric_only": numeric_only}
+
+    func = getattr(resampled, method)
+    if isinstance(expected_data, str):
+        if method in ("var", "mean", "median", "prod"):
+            klass = TypeError
+            msg = re.escape(f"agg function failed [how->{method},dtype->")
+            if using_infer_string:
+                msg = f"dtype 'str' does not support operation '{method}'"
+        elif method in ["sum", "std", "sem"] and using_infer_string:
+            klass = TypeError
+            msg = f"dtype 'str' does not support operation '{method}'"
+        else:
+            klass = ValueError
+            msg = expected_data
+        with pytest.raises(klass, match=msg):
+            _ = func(**kwargs)
+    else:
+        result = func(**kwargs)
+        expected = DataFrame(expected_data, index=expected_index)
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, numeric_only, expected_data",
+    [
+        ("sum", True, ()),
+        ("sum", False, ["cat_1cat_2"]),
+        ("sum", lib.no_default, ["cat_1cat_2"]),
+        ("prod", True, ()),
+        ("prod", False, ()),
+        ("prod", lib.no_default, ()),
+        ("min", True, ()),
+        ("min", False, ["cat_1"]),
+        ("min", lib.no_default, ["cat_1"]),
+        ("max", True, ()),
+        ("max", False, ["cat_2"]),
+        ("max", lib.no_default, ["cat_2"]),
+        ("first", True, ()),
+        ("first", False, ["cat_1"]),
+        ("first", lib.no_default, ["cat_1"]),
+        ("last", True, ()),
+        ("last", False, ["cat_2"]),
+        ("last", lib.no_default, ["cat_2"]),
+    ],
+)
+def test_series_downsample_method(
+    method, numeric_only, expected_data, using_infer_string
+):
+    # GH#46442 test if `numeric_only` behave as expected for SeriesGroupBy
+
+    index = date_range("2018-01-01", periods=2, freq="D")
+    expected_index = date_range("2018-12-31", periods=1, freq="YE")
+    df = Series(["cat_1", "cat_2"], index=index)
+    resampled = df.resample("YE")
+    kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}
+
+    func = getattr(resampled, method)
+    if numeric_only and numeric_only is not lib.no_default:
+        msg = rf"Cannot use numeric_only=True with SeriesGroupBy\.{method}"
+        with pytest.raises(TypeError, match=msg):
+            func(**kwargs)
+    elif method == "prod":
+        msg = re.escape("agg function failed [how->prod,dtype->")
+        if using_infer_string:
+            msg = "dtype 'str' does not support operation 'prod'"
+        with pytest.raises(TypeError, match=msg):
+            func(**kwargs)
+
+    else:
+        result = func(**kwargs)
+        expected = Series(expected_data, index=expected_index)
+        tm.assert_series_equal(result, expected)
+
+
+def test_resample_empty():
+    # GH#52484
+    df = DataFrame(
+        index=pd.to_datetime(
+            ["2018-01-01 00:00:00", "2018-01-01 12:00:00", "2018-01-02 00:00:00"]
+        )
+    )
+    expected = DataFrame(
+        index=pd.to_datetime(
+            [
+                "2018-01-01 00:00:00",
+                "2018-01-01 08:00:00",
+                "2018-01-01 16:00:00",
+                "2018-01-02 00:00:00",
+            ]
+        )
+    )
+    result = df.resample("8h").mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_asfreq_respects_origin_with_fixed_freq_all_seconds_equal():
+    # GH#62725: Ensure Resampler.asfreq respects origin="start_day"
+    # when all datetimes share identical seconds values.
+    idx = [
+        datetime(2025, 10, 17, 17, 15, 10),
+        datetime(2025, 10, 17, 17, 16, 10),
+        datetime(2025, 10, 17, 17, 17, 10),
+    ]
+    df = DataFrame({"value": [0, 1, 2]}, index=idx)
+
+    result = df.resample("1min", origin="start_day").asfreq()
+
+    # Expected index: list of Timestamps, matching dtype
+    exp_idx = pd.DatetimeIndex(
+        [
+            pd.Timestamp("2025-10-17 17:15:00"),
+            pd.Timestamp("2025-10-17 17:16:00"),
+            pd.Timestamp("2025-10-17 17:17:00"),
+        ],
+        dtype=result.index.dtype,
+        freq="min",
+    )
+
+    exp = DataFrame({"value": [np.nan, np.nan, np.nan]}, index=exp_idx)
+    tm.assert_frame_equal(result, exp)
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
new file mode 100644
index 0000000000000000000000000000000000000000..862578decb782af4e056f0f076b1f4cba894c200
--- /dev/null
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -0,0 +1,671 @@
+from textwrap import dedent
+
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_windows
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    TimedeltaIndex,
+    Timestamp,
+)
+import pandas._testing as tm
+from pandas.core.indexes.datetimes import date_range
+
+
+@pytest.fixture
+def test_frame():
+    return DataFrame(
+        {"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)},
+        index=date_range("1/1/2000", freq="s", periods=40, unit="ns"),
+    )
+
+
+def test_tab_complete_ipython6_warning(ip):
+    from IPython.core.completer import provisionalcompleter
+
+    code = dedent(
+        """\
+    import numpy as np
+    from pandas import Series, date_range
+    data = np.arange(10, dtype=np.float64)
+    index = date_range("2020-01-01", periods=len(data))
+    s = Series(data, index=index)
+    rs = s.resample("D")
+    """
+    )
+    ip.run_cell(code)
+
+    # GH 31324 newer jedi version raises Deprecation warning;
+    #  appears resolved 2021-02-02
+    with tm.assert_produces_warning(None, raise_on_extra_warnings=False):
+        with provisionalcompleter("ignore"):
+            list(ip.Completer.completions("rs.", 1))
+
+
+def test_deferred_with_groupby():
+    # GH 12486
+    # support deferred resample ops with groupby
+    data = [
+        ["2010-01-01", "A", 2],
+        ["2010-01-02", "A", 3],
+        ["2010-01-05", "A", 8],
+        ["2010-01-10", "A", 7],
+        ["2010-01-13", "A", 3],
+        ["2010-01-01", "B", 5],
+        ["2010-01-03", "B", 2],
+        ["2010-01-04", "B", 1],
+        ["2010-01-11", "B", 7],
+        ["2010-01-14", "B", 3],
+    ]
+
+    df = DataFrame(data, columns=["date", "id", "score"])
+    df.date = pd.to_datetime(df.date)
+
+    def f_0(x):
+        return x.set_index("date").resample("D").asfreq()
+
+    expected = df.groupby("id").apply(f_0)
+    result = df.set_index("date").groupby("id").resample("D").asfreq()
+    tm.assert_frame_equal(result, expected)
+
+    df = DataFrame(
+        {
+            "date": date_range(start="2016-01-01", periods=4, freq="W"),
+            "group": [1, 1, 2, 2],
+            "val": [5, 6, 7, 8],
+        }
+    ).set_index("date")
+
+    def f_1(x):
+        return x.resample("1D").ffill()
+
+    expected = df.groupby("group").apply(f_1)
+    result = df.groupby("group").resample("1D").ffill()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_getitem(test_frame):
+    g = test_frame.groupby("A")
+
+    expected = g.B.apply(lambda x: x.resample("2s").mean())
+
+    result = g.resample("2s").B.mean()
+    tm.assert_series_equal(result, expected)
+
+    result = g.B.resample("2s").mean()
+    tm.assert_series_equal(result, expected)
+
+    result = g.resample("2s").mean().B
+    tm.assert_series_equal(result, expected)
+
+
+def test_getitem_multiple():
+    # GH 13174
+    # multiple calls after selection causing an issue with aliasing
+    data = [{"id": 1, "buyer": "A"}, {"id": 2, "buyer": "B"}]
+    df = DataFrame(data, index=date_range("2016-01-01", periods=2))
+    r = df.groupby("id").resample("1D")
+    result = r["buyer"].count()
+
+    exp_mi = pd.MultiIndex.from_arrays([[1, 2], df.index], names=("id", None))
+    expected = Series(
+        [1, 1],
+        index=exp_mi,
+        name="buyer",
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = r["buyer"].count()
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_resample_on_api_with_getitem():
+    # GH 17813
+    df = DataFrame(
+        {"id": list("aabbb"), "date": date_range("1-1-2016", periods=5), "data": 1}
+    )
+    exp = df.set_index("date").groupby("id").resample("2D")["data"].sum()
+    result = df.groupby("id").resample("2D", on="date")["data"].sum()
+    tm.assert_series_equal(result, exp)
+
+
+def test_groupby_with_origin():
+    # GH 31809
+
+    freq = "1399min"  # prime number that is smaller than 24h
+    start, end = "1/1/2000 00:00:00", "1/31/2000 00:00"
+    middle = "1/15/2000 00:00:00"
+
+    rng = date_range(start, end, freq="1231min")  # prime number
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+    ts2 = ts[middle:end]
+
+    # proves that grouper without a fixed origin does not work
+    # when dealing with unusual frequencies
+    simple_grouper = pd.Grouper(freq=freq)
+    count_ts = ts.groupby(simple_grouper).agg("count")
+    count_ts = count_ts[middle:end]
+    count_ts2 = ts2.groupby(simple_grouper).agg("count")
+    with pytest.raises(AssertionError, match="Index are different"):
+        tm.assert_index_equal(count_ts.index, count_ts2.index)
+
+    # test origin on 1970-01-01 00:00:00
+    origin = Timestamp(0)
+    adjusted_grouper = pd.Grouper(freq=freq, origin=origin)
+    adjusted_count_ts = ts.groupby(adjusted_grouper).agg("count")
+    adjusted_count_ts = adjusted_count_ts[middle:end]
+    adjusted_count_ts2 = ts2.groupby(adjusted_grouper).agg("count")
+    tm.assert_series_equal(adjusted_count_ts, adjusted_count_ts2)
+
+    # test origin on 2049-10-18 20:00:00
+    origin_future = Timestamp(0) + pd.Timedelta("1399min") * 30_000
+    adjusted_grouper2 = pd.Grouper(freq=freq, origin=origin_future)
+    adjusted2_count_ts = ts.groupby(adjusted_grouper2).agg("count")
+    adjusted2_count_ts = adjusted2_count_ts[middle:end]
+    adjusted2_count_ts2 = ts2.groupby(adjusted_grouper2).agg("count")
+    tm.assert_series_equal(adjusted2_count_ts, adjusted2_count_ts2)
+
+    # both grouper use an adjusted timestamp that is a multiple of 1399 min
+    # they should be equals even if the adjusted_timestamp is in the future
+    tm.assert_series_equal(adjusted_count_ts, adjusted2_count_ts2)
+
+
+def test_nearest():
+    # GH 17496
+    # Resample nearest
+    index = date_range("1/1/2000", periods=3, freq="min", unit="ns")
+    result = Series(range(3), index=index).resample("20s").nearest()
+
+    expected = Series(
+        [0, 0, 1, 1, 1, 2, 2],
+        index=pd.DatetimeIndex(
+            [
+                "2000-01-01 00:00:00",
+                "2000-01-01 00:00:20",
+                "2000-01-01 00:00:40",
+                "2000-01-01 00:01:00",
+                "2000-01-01 00:01:20",
+                "2000-01-01 00:01:40",
+                "2000-01-01 00:02:00",
+            ],
+            dtype="datetime64[ns]",
+            freq="20s",
+        ),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        "first",
+        "last",
+        "median",
+        "sem",
+        "sum",
+        "mean",
+        "min",
+        "max",
+        "size",
+        "count",
+        "nearest",
+        "bfill",
+        "ffill",
+        "asfreq",
+        "ohlc",
+    ],
+)
+def test_methods(f, test_frame):
+    g = test_frame.groupby("A")
+    r = g.resample("2s")
+
+    result = getattr(r, f)()
+    expected = g.apply(lambda x: getattr(x.resample("2s"), f)())
+    tm.assert_equal(result, expected)
+
+
+def test_methods_nunique(test_frame):
+    # series only
+    g = test_frame.groupby("A")
+    r = g.resample("2s")
+    result = r.B.nunique()
+    expected = g.B.apply(lambda x: x.resample("2s").nunique())
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("f", ["std", "var"])
+def test_methods_std_var(f, test_frame):
+    g = test_frame.groupby("A")
+    r = g.resample("2s")
+    result = getattr(r, f)(ddof=1)
+    expected = g.apply(lambda x: getattr(x.resample("2s"), f)(ddof=1))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply(test_frame):
+    g = test_frame.groupby("A")
+    r = g.resample("2s")
+
+    # reduction
+    expected = g.resample("2s").sum()
+
+    def f_0(x):
+        return x.resample("2s").sum()
+
+    result = r.apply(f_0)
+    tm.assert_frame_equal(result, expected)
+
+    def f_1(x):
+        return x.resample("2s").apply(lambda y: y.sum())
+
+    result = g.apply(f_1)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_with_mutated_index():
+    # GH 15169
+    index = date_range("1-1-2015", "12-31-15", freq="D")
+    df = DataFrame(
+        data={"col1": np.random.default_rng(2).random(len(index))}, index=index
+    )
+
+    def f(x):
+        s = Series([1, 2], index=["a", "b"])
+        return s
+
+    expected = df.groupby(pd.Grouper(freq="ME")).apply(f)
+
+    result = df.resample("ME").apply(f)
+    tm.assert_frame_equal(result, expected)
+
+    # A case for series
+    expected = df["col1"].groupby(pd.Grouper(freq="ME"), group_keys=False).apply(f)
+    result = df["col1"].resample("ME").apply(f)
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_columns_multilevel():
+    # GH 16231
+    cols = pd.MultiIndex.from_tuples([("A", "a", "", "one"), ("B", "b", "i", "two")])
+    ind = date_range(start="2017-01-01", freq="15Min", periods=8)
+    df = DataFrame(
+        np.array([0] * 16, dtype=np.int64).reshape(8, 2), index=ind, columns=cols
+    )
+    agg_dict = {col: (np.sum if col[3] == "one" else np.mean) for col in df.columns}
+    result = df.resample("h").apply(lambda x: agg_dict[x.name](x))
+    expected = DataFrame(
+        2 * [[0, 0.0]],
+        index=date_range(start="2017-01-01", freq="1h", periods=2),
+        columns=pd.MultiIndex.from_tuples(
+            [("A", "a", "", "one"), ("B", "b", "i", "two")]
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_non_naive_index():
+    def weighted_quantile(series, weights, q):
+        series = series.sort_values()
+        cumsum = weights.reindex(series.index).fillna(0).cumsum()
+        cutoff = cumsum.iloc[-1] * q
+        return series[cumsum >= cutoff].iloc[0]
+
+    times = date_range("2017-6-23 18:00", periods=8, freq="15min", tz="UTC")
+    data = Series([1.0, 1, 1, 1, 1, 2, 2, 0], index=times)
+    weights = Series([160.0, 91, 65, 43, 24, 10, 1, 0], index=times)
+
+    result = data.resample("D").apply(weighted_quantile, weights=weights, q=0.5)
+    ind = date_range(
+        "2017-06-23 00:00:00+00:00", "2017-06-23 00:00:00+00:00", freq="D", tz="UTC"
+    )
+    expected = Series([1.0], index=ind)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_groupby_with_label(unit):
+    # GH 13235
+    index = date_range("2000-01-01", freq="2D", periods=5, unit=unit)
+    df = DataFrame(index=index, data={"col0": [0, 0, 1, 1, 2], "col1": [1, 1, 1, 1, 1]})
+    result = df.groupby("col0").resample("1W", label="left").sum()
+
+    mi = [
+        np.array([0, 0, 1, 2], dtype=np.int64),
+        np.array(
+            ["1999-12-26", "2000-01-02", "2000-01-02", "2000-01-02"],
+            dtype=f"M8[{unit}]",
+        ),
+    ]
+    mindex = pd.MultiIndex.from_arrays(mi, names=["col0", None])
+    expected = DataFrame(data={"col1": [1, 1, 2, 1]}, index=mindex)
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_consistency_with_window(test_frame):
+    # consistent return values with window
+    df = test_frame
+    expected = Index([1, 2, 3], name="A")
+    result = df.groupby("A").resample("2s").mean()
+    assert result.index.nlevels == 2
+    tm.assert_index_equal(result.index.levels[0], expected)
+
+    result = df.groupby("A").rolling(20).mean()
+    assert result.index.nlevels == 2
+    tm.assert_index_equal(result.index.levels[0], expected)
+
+
+def test_median_duplicate_columns():
+    # GH 14233
+
+    df = DataFrame(
+        np.random.default_rng(2).standard_normal((20, 3)),
+        columns=list("aaa"),
+        index=date_range("2012-01-01", periods=20, freq="s"),
+    )
+    result = df.resample("5s").median()
+    df.columns = ["a", "b", "c"]
+    expected = df.resample("5s").median()
+    expected.columns = result.columns
+    tm.assert_frame_equal(result, expected)
+
+
+def test_apply_to_one_column_of_df():
+    # GH: 36951
+    df = DataFrame(
+        {"col": range(10), "col1": range(10, 20)},
+        index=date_range("2012-01-01", periods=10, freq="20min"),
+    )
+
+    # access "col" via getattr -> make sure we handle AttributeError
+    result = df.resample("h").apply(lambda group: group.col.sum())
+    expected = Series(
+        [3, 12, 21, 9], index=date_range("2012-01-01", periods=4, freq="h")
+    )
+    tm.assert_series_equal(result, expected)
+
+    # access "col" via _getitem__ -> make sure we handle KeyErrpr
+    result = df.resample("h").apply(lambda group: group["col"].sum())
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_groupby_agg():
+    # GH: 33548
+    df = DataFrame(
+        {
+            "cat": [
+                "cat_1",
+                "cat_1",
+                "cat_2",
+                "cat_1",
+                "cat_2",
+                "cat_1",
+                "cat_2",
+                "cat_1",
+            ],
+            "num": [5, 20, 22, 3, 4, 30, 10, 50],
+            "date": [
+                "2019-2-1",
+                "2018-02-03",
+                "2020-3-11",
+                "2019-2-2",
+                "2019-2-2",
+                "2018-12-4",
+                "2020-3-11",
+                "2020-12-12",
+            ],
+        }
+    )
+    df["date"] = pd.to_datetime(df["date"])
+
+    resampled = df.groupby("cat").resample("YE", on="date")
+    expected = resampled[["num"]].sum()
+    result = resampled.agg({"num": "sum"})
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_groupby_agg_listlike():
+    # GH 42905
+    ts = Timestamp("2021-02-28 00:00:00")
+    df = DataFrame({"class": ["beta"], "value": [69]}, index=Index([ts], name="date"))
+    resampled = df.groupby("class").resample("ME")["value"]
+    result = resampled.agg(["sum", "size"])
+    expected = DataFrame(
+        [[69, 1]],
+        index=pd.MultiIndex.from_tuples([("beta", ts)], names=["class", "date"]),
+        columns=["sum", "size"],
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
+def test_empty(keys):
+    # GH 26411
+    df = DataFrame([], columns=["a", "b"], index=TimedeltaIndex([]))
+    result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+    expected_columns = ["b"] if keys == ["a"] else []
+    expected = (
+        DataFrame(columns=["a", "b"])
+        .set_index(keys, drop=False)
+        .set_index(TimedeltaIndex([]), append=True)[expected_columns]
+    )
+    if len(keys) == 1:
+        expected.index.name = keys[0]
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("consolidate", [True, False])
+def test_resample_groupby_agg_object_dtype_all_nan(consolidate):
+    # https://github.com/pandas-dev/pandas/issues/39329
+
+    dates = date_range("2020-01-01", periods=15, freq="D", unit="ns")
+    df1 = DataFrame({"key": "A", "date": dates, "col1": range(15), "col_object": "val"})
+    df2 = DataFrame({"key": "B", "date": dates, "col1": range(15)})
+    df = pd.concat([df1, df2], ignore_index=True)
+    if consolidate:
+        df = df._consolidate()
+
+    result = df.groupby(["key"]).resample("W", on="date").min()
+    idx = pd.MultiIndex.from_arrays(
+        [
+            ["A"] * 3 + ["B"] * 3,
+            pd.to_datetime(["2020-01-05", "2020-01-12", "2020-01-19"] * 2).as_unit(
+                "ns"
+            ),
+        ],
+        names=["key", "date"],
+    )
+    expected = DataFrame(
+        {
+            "col1": [0, 5, 12] * 2,
+            "col_object": ["val"] * 3 + [np.nan] * 3,
+        },
+        index=idx,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("min_count", [0, 1])
+def test_groupby_resample_empty_sum_string(
+    string_dtype_no_object, test_frame, min_count
+):
+    # https://github.com/pandas-dev/pandas/issues/60229
+    dtype = string_dtype_no_object
+    test_frame = test_frame.assign(B=pd.array([pd.NA] * len(test_frame), dtype=dtype))
+    gbrs = test_frame.groupby("A").resample("40s")
+    result = gbrs.sum(min_count=min_count)
+
+    index = pd.MultiIndex(
+        levels=[[1, 2, 3], [pd.to_datetime("2000-01-01", unit="ns").as_unit("ns")]],
+        codes=[[0, 1, 2], [0, 0, 0]],
+        names=["A", None],
+    )
+    value = "" if min_count == 0 else pd.NA
+    expected = DataFrame({"B": value}, index=index, dtype=dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_with_list_of_keys():
+    # GH 47362
+    df = DataFrame(
+        data={
+            "date": date_range(start="2016-01-01", periods=8),
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "val": [1, 7, 5, 2, 3, 10, 5, 1],
+        }
+    )
+    result = df.groupby("group").resample("2D", on="date")[["val"]].mean()
+
+    mi_exp = pd.MultiIndex.from_arrays(
+        [[0, 0, 1, 1], df["date"]._values[::2]], names=["group", "date"]
+    )
+    expected = DataFrame(
+        data={
+            "val": [4.0, 3.5, 6.5, 3.0],
+        },
+        index=mi_exp,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
+def test_resample_no_index(keys):
+    # GH 47705
+    df = DataFrame([], columns=["a", "b", "date"])
+    df["date"] = pd.to_datetime(df["date"])
+    df = df.set_index("date")
+    result = df.groupby(keys).resample(rule=pd.to_timedelta("00:00:01")).mean()
+    expected_columns = ["b"] if keys == ["a"] else []
+    expected = DataFrame(columns=["a", "b", "date"]).set_index(keys, drop=False)
+    expected["date"] = pd.to_datetime(expected["date"])
+    expected = expected.set_index("date", append=True, drop=True)[expected_columns]
+    if len(keys) == 1:
+        expected.index.name = keys[0]
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_no_columns():
+    # GH#52484
+    df = DataFrame(
+        index=Index(
+            pd.to_datetime(
+                ["2018-01-01 00:00:00", "2018-01-01 12:00:00", "2018-01-02 00:00:00"]
+            ),
+            name="date",
+        )
+    )
+    result = df.groupby([0, 0, 1]).resample(rule=pd.to_timedelta("06:00:00")).mean()
+    index = pd.to_datetime(
+        [
+            "2018-01-01 00:00:00",
+            "2018-01-01 06:00:00",
+            "2018-01-01 12:00:00",
+            "2018-01-02 00:00:00",
+        ]
+    )
+    expected = DataFrame(
+        index=pd.MultiIndex(
+            levels=[np.array([0, 1], dtype=np.intp), index],
+            codes=[[0, 0, 0, 1], [0, 1, 2, 3]],
+            names=[None, "date"],
+        )
+    )
+
+    # GH#52710 - Index comes out as 32-bit on 64-bit Windows
+    tm.assert_frame_equal(result, expected, check_index_type=not is_platform_windows())
+
+
+def test_groupby_resample_size_all_index_same():
+    # GH 46826
+    df = DataFrame(
+        {"A": [1] * 3 + [2] * 3 + [1] * 3 + [2] * 3, "B": np.arange(12)},
+        index=date_range("31/12/2000 18:00", freq="h", periods=12, unit="ns"),
+    )
+    result = df.groupby("A").resample("D").size()
+
+    mi_exp = pd.MultiIndex.from_arrays(
+        [
+            [1, 1, 2, 2],
+            pd.DatetimeIndex(["2000-12-31", "2001-01-01"] * 2, dtype="M8[ns]"),
+        ],
+        names=["A", None],
+    )
+    expected = Series(
+        3,
+        index=mi_exp,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_groupby_resample_on_index_with_list_of_keys():
+    # GH 50840
+    df = DataFrame(
+        data={
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "val": [3, 1, 4, 1, 5, 9, 2, 6],
+        },
+        index=date_range(start="2016-01-01", periods=8, name="date"),
+    )
+    result = df.groupby("group").resample("2D")[["val"]].mean()
+
+    mi_exp = pd.MultiIndex.from_arrays(
+        [[0, 0, 1, 1], df.index[::2]], names=["group", "date"]
+    )
+    expected = DataFrame(
+        data={
+            "val": [2.0, 2.5, 7.0, 4.0],
+        },
+        index=mi_exp,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_on_index_with_list_of_keys_multi_columns():
+    # GH 50876
+    df = DataFrame(
+        data={
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "first_val": [3, 1, 4, 1, 5, 9, 2, 6],
+            "second_val": [2, 7, 1, 8, 2, 8, 1, 8],
+            "third_val": [1, 4, 1, 4, 2, 1, 3, 5],
+        },
+        index=date_range(start="2016-01-01", periods=8, name="date"),
+    )
+    result = df.groupby("group").resample("2D")[["first_val", "second_val"]].mean()
+
+    mi_exp = pd.MultiIndex.from_arrays(
+        [[0, 0, 1, 1], df.index[::2]], names=["group", "date"]
+    )
+    expected = DataFrame(
+        data={
+            "first_val": [2.0, 2.5, 7.0, 4.0],
+            "second_val": [4.5, 4.5, 5.0, 4.5],
+        },
+        index=mi_exp,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_on_index_with_list_of_keys_missing_column():
+    # GH 50876
+    df = DataFrame(
+        data={
+            "group": [0, 0, 0, 0, 1, 1, 1, 1],
+            "val": [3, 1, 4, 1, 5, 9, 2, 6],
+        },
+        index=Series(
+            date_range(start="2016-01-01", periods=8),
+            name="date",
+        ),
+    )
+    gb = df.groupby("group")
+    rs = gb.resample("2D")
+    with pytest.raises(KeyError, match="Columns not found"):
+        rs[["val_not_in_dataframe"]]
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
new file mode 100644
index 0000000000000000000000000000000000000000..e214a9f17824dbfc4dc00896b651d9113fdac472
--- /dev/null
+++ b/pandas/tests/resample/test_time_grouper.py
@@ -0,0 +1,439 @@
+from datetime import datetime
+from operator import methodcaller
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    Timestamp,
+)
+import pandas._testing as tm
+from pandas.core.groupby.grouper import Grouper
+from pandas.core.indexes.datetimes import date_range
+
+
+@pytest.fixture
+def test_series():
+    return Series(
+        np.random.default_rng(2).standard_normal(1000),
+        index=date_range("1/1/2000", periods=1000),
+    )
+
+
+def test_apply(test_series):
+    grouper = Grouper(freq="YE", label="right", closed="right")
+
+    grouped = test_series.groupby(grouper)
+
+    def f(x):
+        return x.sort_values()[-3:]
+
+    applied = grouped.apply(f)
+    expected = test_series.groupby(lambda x: x.year).apply(f)
+
+    applied.index = applied.index.droplevel(0)
+    expected.index = expected.index.droplevel(0)
+    tm.assert_series_equal(applied, expected)
+
+
+def test_count(test_series):
+    test_series[::3] = np.nan
+
+    expected = test_series.groupby(lambda x: x.year).count()
+
+    grouper = Grouper(freq="YE", label="right", closed="right")
+    result = test_series.groupby(grouper).count()
+    expected.index = result.index
+    tm.assert_series_equal(result, expected)
+
+    result = test_series.resample("YE").count()
+    expected.index = result.index
+    tm.assert_series_equal(result, expected)
+
+
+def test_numpy_reduction(test_series):
+    result = test_series.resample("YE", closed="right").prod()
+    expected = test_series.groupby(lambda x: x.year).agg(np.prod)
+    expected.index = result.index
+    tm.assert_series_equal(result, expected)
+
+
+def test_apply_iteration():
+    # #2300
+    N = 1000
+    ind = date_range(start="2000-01-01", freq="D", periods=N)
+    df = DataFrame({"open": 1, "close": 2}, index=ind)
+    tg = Grouper(freq="ME")
+
+    grouper, _ = tg._get_grouper(df)
+
+    # Errors
+    grouped = df.groupby(grouper, group_keys=False)
+
+    def f(df):
+        return df["close"] / df["open"]
+
+    # it works!
+    result = grouped.apply(f)
+    tm.assert_index_equal(result.index, df.index)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        Index([1, 2]),
+        Index(["a", "b"]),
+        Index([1.1, 2.2]),
+        pd.MultiIndex.from_arrays([[1, 2], ["a", "b"]]),
+    ],
+)
+def test_fails_on_no_datetime_index(index):
+    name = type(index).__name__
+    df = DataFrame({"a": range(len(index))}, index=index)
+
+    msg = (
+        "Only valid with DatetimeIndex, TimedeltaIndex "
+        f"or PeriodIndex, but got an instance of '{name}'"
+    )
+    with pytest.raises(TypeError, match=msg):
+        df.groupby(Grouper(freq="D"))
+
+
+def test_aaa_group_order():
+    # GH 12840
+    # check TimeGrouper perform stable sorts
+    n = 20
+    data = np.random.default_rng(2).standard_normal((n, 4))
+    df = DataFrame(data, columns=["A", "B", "C", "D"])
+    df["key"] = [
+        datetime(2013, 1, 1),
+        datetime(2013, 1, 2),
+        datetime(2013, 1, 3),
+        datetime(2013, 1, 4),
+        datetime(2013, 1, 5),
+    ] * 4
+    grouped = df.groupby(Grouper(key="key", freq="D"))
+
+    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5])
+    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), df[1::5])
+    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), df[2::5])
+    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), df[3::5])
+    tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5])
+
+
+def test_aggregate_normal(resample_method):
+    """Check TimeGrouper's aggregation is identical as normal groupby."""
+
+    data = np.random.default_rng(2).standard_normal((20, 4))
+    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    normal_df["key"] = [1, 2, 3, 4, 5] * 4
+
+    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    dt_df["key"] = Index(
+        [
+            datetime(2013, 1, 1),
+            datetime(2013, 1, 2),
+            datetime(2013, 1, 3),
+            datetime(2013, 1, 4),
+            datetime(2013, 1, 5),
+        ]
+        * 4,
+        dtype="M8[ns]",
+    )
+
+    normal_grouped = normal_df.groupby("key")
+    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
+
+    expected = getattr(normal_grouped, resample_method)()
+    dt_result = getattr(dt_grouped, resample_method)()
+    expected.index = date_range(
+        start="2013-01-01", freq="D", periods=5, unit="ns", name="key"
+    )
+    tm.assert_equal(expected, dt_result)
+
+
+@pytest.mark.xfail(reason="if TimeGrouper is used included, 'nth' doesn't work yet")
+def test_aggregate_nth():
+    """Check TimeGrouper's aggregation is identical as normal groupby."""
+
+    data = np.random.default_rng(2).standard_normal((20, 4))
+    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    normal_df["key"] = [1, 2, 3, 4, 5] * 4
+
+    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    dt_df["key"] = [
+        datetime(2013, 1, 1),
+        datetime(2013, 1, 2),
+        datetime(2013, 1, 3),
+        datetime(2013, 1, 4),
+        datetime(2013, 1, 5),
+    ] * 4
+
+    normal_grouped = normal_df.groupby("key")
+    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
+
+    expected = normal_grouped.nth(3)
+    expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key")
+    dt_result = dt_grouped.nth(3)
+    tm.assert_frame_equal(expected, dt_result)
+
+
+@pytest.mark.parametrize(
+    "method, method_args, unit",
+    [
+        ("sum", {}, 0),
+        ("sum", {"min_count": 0}, 0),
+        ("sum", {"min_count": 1}, np.nan),
+        ("prod", {}, 1),
+        ("prod", {"min_count": 0}, 1),
+        ("prod", {"min_count": 1}, np.nan),
+    ],
+)
+def test_resample_entirely_nat_window(method, method_args, unit):
+    ser = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4, unit="ns"))
+    result = methodcaller(method, **method_args)(ser.resample("2D"))
+
+    exp_dti = pd.DatetimeIndex(["2017-01-01", "2017-01-03"], dtype="M8[ns]", freq="2D")
+    expected = Series([0.0, unit], index=exp_dti)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func, fill_value",
+    [("min", np.nan), ("max", np.nan), ("sum", 0), ("prod", 1), ("count", 0)],
+)
+def test_aggregate_with_nat(func, fill_value):
+    # check TimeGrouper's aggregation is identical as normal groupby
+    # if NaT is included, 'var', 'std', 'mean', 'first','last'
+    # and 'nth' doesn't work yet
+
+    n = 20
+    data = np.random.default_rng(2).standard_normal((n, 4)).astype("int64")
+    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4
+
+    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    dt_df["key"] = Index(
+        [
+            datetime(2013, 1, 1),
+            datetime(2013, 1, 2),
+            pd.NaT,
+            datetime(2013, 1, 4),
+            datetime(2013, 1, 5),
+        ]
+        * 4,
+        dtype="M8[ns]",
+    )
+
+    normal_grouped = normal_df.groupby("key")
+    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
+
+    normal_result = getattr(normal_grouped, func)()
+    dt_result = getattr(dt_grouped, func)()
+
+    pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"])
+    expected = pd.concat([normal_result, pad])
+    expected = expected.sort_index()
+    dti = date_range(
+        start="2013-01-01",
+        freq="D",
+        periods=5,
+        name="key",
+        unit=dt_df["key"]._values.unit,
+    )
+    expected.index = dti._with_freq(None)  # TODO: is this desired?
+    tm.assert_frame_equal(expected, dt_result)
+    assert dt_result.index.name == "key"
+
+
+def test_aggregate_with_nat_size():
+    # GH 9925
+    n = 20
+    data = np.random.default_rng(2).standard_normal((n, 4)).astype("int64")
+    normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    normal_df["key"] = [1, 2, np.nan, 4, 5] * 4
+
+    dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
+    dt_df["key"] = Index(
+        [
+            datetime(2013, 1, 1),
+            datetime(2013, 1, 2),
+            pd.NaT,
+            datetime(2013, 1, 4),
+            datetime(2013, 1, 5),
+        ]
+        * 4,
+        dtype="M8[ns]",
+    )
+
+    normal_grouped = normal_df.groupby("key")
+    dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
+
+    normal_result = normal_grouped.size()
+    dt_result = dt_grouped.size()
+
+    pad = Series([0], index=[3])
+    expected = pd.concat([normal_result, pad])
+    expected = expected.sort_index()
+    expected.index = date_range(
+        start="2013-01-01",
+        freq="D",
+        periods=5,
+        name="key",
+        unit=dt_df["key"]._values.unit,
+    )._with_freq(None)
+    tm.assert_series_equal(expected, dt_result)
+    assert dt_result.index.name == "key"
+
+
+def test_repr():
+    # GH18203
+    result = repr(Grouper(key="A", freq="h"))
+    expected = (
+        "TimeGrouper(key='A', freq=<Hour>, sort=True, dropna=True, "
+        "closed='left', label='left', how='mean', "
+        "convention='e', origin='start_day')"
+    )
+    assert result == expected
+
+    result = repr(Grouper(key="A", freq="h", origin="2000-01-01"))
+    expected = (
+        "TimeGrouper(key='A', freq=<Hour>, sort=True, dropna=True, "
+        "closed='left', label='left', how='mean', "
+        "convention='e', origin=Timestamp('2000-01-01 00:00:00'))"
+    )
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "method, method_args, expected_values",
+    [
+        ("sum", {}, [1, 0, 1]),
+        ("sum", {"min_count": 0}, [1, 0, 1]),
+        ("sum", {"min_count": 1}, [1, np.nan, 1]),
+        ("sum", {"min_count": 2}, [np.nan, np.nan, np.nan]),
+        ("prod", {}, [1, 1, 1]),
+        ("prod", {"min_count": 0}, [1, 1, 1]),
+        ("prod", {"min_count": 1}, [1, np.nan, 1]),
+        ("prod", {"min_count": 2}, [np.nan, np.nan, np.nan]),
+    ],
+)
+def test_upsample_sum(method, method_args, expected_values):
+    ser = Series(1, index=date_range("2017", periods=2, freq="h", unit="ns"))
+    resampled = ser.resample("30min")
+    index = pd.DatetimeIndex(
+        ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"],
+        dtype="M8[ns]",
+        freq="30min",
+    )
+    result = methodcaller(method, **method_args)(resampled)
+    expected = Series(expected_values, index=index)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.fixture
+def groupy_test_df():
+    return DataFrame(
+        {"price": [10, 11, 9], "volume": [50, 60, 50]},
+        index=date_range("01/01/2018", periods=3, freq="W", unit="ns"),
+    )
+
+
+def test_groupby_resample_interpolate_raises(groupy_test_df):
+    # GH 35325
+
+    # Make a copy of the test data frame that has index.name=None
+    groupy_test_df_without_index_name = groupy_test_df.copy()
+    groupy_test_df_without_index_name.index.name = None
+
+    dfs = [groupy_test_df, groupy_test_df_without_index_name]
+
+    for df in dfs:
+        with pytest.raises(
+            NotImplementedError,
+            match="Direct interpolation of MultiIndex data frames is not supported",
+        ):
+            df.groupby("volume").resample("1D").interpolate(method="linear")
+
+
+def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df):
+    # GH 35325
+
+    # Make a copy of the test data frame that has index.name=None
+    groupy_test_df_without_index_name = groupy_test_df.copy()
+    groupy_test_df_without_index_name.index.name = None
+
+    dfs = [groupy_test_df, groupy_test_df_without_index_name]
+
+    for df in dfs:
+        result = df.groupby("volume").apply(
+            lambda x: x.resample("1D").interpolate(method="linear"),
+        )
+
+        volume = [50] * 15 + [60]
+        week_starting = [
+            *list(date_range("2018-01-07", "2018-01-21", unit="ns")),
+            Timestamp("2018-01-14"),
+        ]
+        expected_ind = pd.MultiIndex.from_arrays(
+            [volume, week_starting],
+            names=["volume", df.index.name],
+        )
+
+        expected = DataFrame(
+            data={
+                "price": [
+                    10.0,
+                    9.928571428571429,
+                    9.857142857142858,
+                    9.785714285714286,
+                    9.714285714285714,
+                    9.642857142857142,
+                    9.571428571428571,
+                    9.5,
+                    9.428571428571429,
+                    9.357142857142858,
+                    9.285714285714286,
+                    9.214285714285714,
+                    9.142857142857142,
+                    9.071428571428571,
+                    9.0,
+                    11.0,
+                ]
+            },
+            index=expected_ind,
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df):
+    """Similar test as test_groupby_resample_interpolate_with_apply_syntax but
+    with resampling that results in missing anchor points when interpolating.
+    See GH#21351."""
+    # GH#21351
+    result = groupy_test_df.groupby("volume").apply(
+        lambda x: x.resample("265h").interpolate(method="linear")
+    )
+
+    volume = [50, 50, 60]
+    week_starting = pd.DatetimeIndex(
+        [
+            Timestamp("2018-01-07"),
+            Timestamp("2018-01-18 01:00:00"),
+            Timestamp("2018-01-14"),
+        ]
+    ).as_unit("ns")
+    expected_ind = pd.MultiIndex.from_arrays(
+        [volume, week_starting],
+        names=["volume", "week_starting"],
+    )
+
+    expected = DataFrame(
+        data={"price": [10.0, 9.5, 11.0]},
+        index=expected_ind,
+    )
+    tm.assert_frame_equal(result, expected, check_names=False)
diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bec66e3a1aa2c06bbcdac1be6a982a89f25d814
--- /dev/null
+++ b/pandas/tests/resample/test_timedelta.py
@@ -0,0 +1,218 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.core.indexes.timedeltas import timedelta_range
+
+
+def test_asfreq_bug():
+    df = DataFrame(data=[1, 3], index=[timedelta(), timedelta(minutes=3)])
+    result = df.resample("1min").asfreq()
+    expected = DataFrame(
+        data=[1, np.nan, np.nan, 3],
+        index=timedelta_range("0 day", periods=4, freq="1min", unit="us"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_with_nat():
+    # GH 13223
+    index = pd.to_timedelta(["0s", pd.NaT, "2s"])
+    result = DataFrame({"value": [2, 3, 5]}, index).resample("1s").mean()
+    expected = DataFrame(
+        {"value": [2.5, np.nan, 5.0]},
+        index=timedelta_range("0 day", periods=3, freq="1s"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_as_freq_with_subperiod():
+    # GH 13022
+    index = timedelta_range("00:00:00", "00:10:00", freq="5min")
+    df = DataFrame(data={"value": [1, 5, 10]}, index=index)
+    result = df.resample("2min").asfreq()
+    expected_data = {"value": [1, np.nan, np.nan, np.nan, np.nan, 10]}
+    expected = DataFrame(
+        data=expected_data, index=timedelta_range("00:00:00", "00:10:00", freq="2min")
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_with_timedeltas():
+    expected = DataFrame({"A": np.arange(1480)})
+    expected = expected.groupby(expected.index // 30).sum()
+    expected.index = timedelta_range("0 days", freq="30min", periods=50)
+
+    df = DataFrame(
+        {"A": np.arange(1480)},
+        index=pd.to_timedelta(np.arange(1480), unit="min").as_unit("us"),
+    )
+    result = df.resample("30min").sum()
+
+    tm.assert_frame_equal(result, expected)
+
+    s = df["A"]
+    result = s.resample("30min").sum()
+    tm.assert_series_equal(result, expected["A"])
+
+
+def test_resample_single_period_timedelta():
+    s = Series(list(range(5)), index=timedelta_range("1 day", freq="s", periods=5))
+    result = s.resample("2s").sum()
+    expected = Series([1, 5, 4], index=timedelta_range("1 day", freq="2s", periods=3))
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_timedelta_idempotency():
+    # GH 12072
+    index = timedelta_range("0", periods=9, freq="10ms")
+    series = Series(range(9), index=index)
+    result = series.resample("10ms").mean()
+    expected = series.astype(float)
+    tm.assert_series_equal(result, expected)
+
+
+def test_resample_offset_with_timedeltaindex():
+    # GH 10530 & 31809
+    rng = timedelta_range(start="0s", periods=25, freq="s")
+    ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+    with_base = ts.resample("2s", offset="5s").mean()
+    without_base = ts.resample("2s").mean()
+
+    exp_without_base = timedelta_range(start="0s", end="25s", freq="2s")
+    exp_with_base = timedelta_range(start="5s", end="29s", freq="2s")
+
+    tm.assert_index_equal(without_base.index, exp_without_base)
+    tm.assert_index_equal(with_base.index, exp_with_base)
+
+
+def test_resample_categorical_data_with_timedeltaindex():
+    # GH #12169
+    df = DataFrame({"Group_obj": "A"}, index=pd.to_timedelta(list(range(20)), unit="s"))
+    df["Group"] = df["Group_obj"].astype("category")
+    result = df.resample("10s").agg(lambda x: (x.value_counts().index[0]))
+    exp_tdi = pd.TimedeltaIndex(np.array([0, 10], dtype="m8[s]"), freq="10s")
+    expected = DataFrame(
+        {"Group_obj": ["A", "A"], "Group": ["A", "A"]},
+        index=exp_tdi,
+    )
+    expected = expected.reindex(["Group_obj", "Group"], axis=1)
+    expected["Group"] = expected["Group_obj"].astype("category")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_timedelta_values():
+    # GH 13119
+    # check that timedelta dtype is preserved when NaT values are
+    # introduced by the resampling
+
+    times = timedelta_range("1 day", "6 day", freq="4D")
+    df = DataFrame({"time": times}, index=times)
+
+    times2 = timedelta_range("1 day", "6 day", freq="2D")
+    exp = Series(times2, index=times2, name="time")
+    exp.iloc[1] = pd.NaT
+
+    res = df.resample("2D").first()["time"]
+    tm.assert_series_equal(res, exp)
+    res = df["time"].resample("2D").first()
+    tm.assert_series_equal(res, exp)
+
+
+@pytest.mark.parametrize(
+    "start, end, freq, resample_freq",
+    [
+        ("8h", "21h59min50s", "10s", "3h"),  # GH 30353 example
+        ("3h", "22h", "1h", "5h"),
+        ("527D", "5006D", "3D", "10D"),
+        ("1D", "10D", "1D", "2D"),  # GH 13022 example
+        # tests that worked before GH 33498:
+        ("8h", "21h59min50s", "10s", "2h"),
+        ("0h", "21h59min50s", "10s", "3h"),
+        ("10D", "85D", "D", "2D"),
+    ],
+)
+def test_resample_timedelta_edge_case(start, end, freq, resample_freq):
+    # GH 33498
+    # check that the timedelta bins does not contains an extra bin
+    idx = timedelta_range(start=start, end=end, freq=freq)
+    s = Series(np.arange(len(idx)), index=idx)
+    result = s.resample(resample_freq).min()
+    expected_index = timedelta_range(freq=resample_freq, start=start, end=end)
+    tm.assert_index_equal(result.index, expected_index)
+    assert result.index.freq == expected_index.freq
+    assert not np.isnan(result.iloc[-1])
+
+
+@pytest.mark.parametrize("duplicates", [True, False])
+def test_resample_with_timedelta_yields_no_empty_groups(duplicates):
+    # GH 10603
+    df = DataFrame(
+        np.random.default_rng(2).normal(size=(10000, 4)),
+        index=timedelta_range(start="0s", periods=10000, freq="3906250ns"),
+    )
+    if duplicates:
+        # case with non-unique columns
+        df.columns = ["A", "B", "A", "C"]
+
+    result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x))
+
+    expected = DataFrame(
+        [[768] * 4] * 12 + [[528] * 4],
+        index=timedelta_range(start="1s", periods=13, freq="3s", unit="ns"),
+    )
+    expected.columns = df.columns
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_quantile_timedelta(unit):
+    # GH: 29485
+    dtype = np.dtype(f"m8[{unit}]")
+    df = DataFrame(
+        {"value": pd.to_timedelta(np.arange(4), unit="s").astype(dtype)},
+        index=pd.date_range("20200101", periods=4, tz="UTC"),
+    )
+    result = df.resample("2D").quantile(0.99)
+    expected = DataFrame(
+        {
+            "value": [
+                pd.Timedelta("0 days 00:00:00.990000"),
+                pd.Timedelta("0 days 00:00:02.990000"),
+            ]
+        },
+        index=pd.date_range("20200101", periods=2, tz="UTC", freq="2D"),
+    ).astype(dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_resample_closed_right():
+    # GH#45414
+    idx = pd.Index([pd.Timedelta(seconds=120 + i * 30) for i in range(10)])
+    ser = Series(range(10), index=idx)
+    result = ser.resample("min", closed="right", label="right").sum()
+    expected = Series(
+        [0, 3, 7, 11, 15, 9],
+        index=pd.TimedeltaIndex(
+            [pd.Timedelta(seconds=120 + i * 60) for i in range(6)], freq="min"
+        ),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_arrow_duration_resample():
+    # GH 56371
+    idx = pd.Index(timedelta_range("1 day", periods=5), dtype="duration[ns][pyarrow]")
+    expected = Series(np.arange(5, dtype=np.float64), index=idx)
+    result = expected.resample("1D").mean()
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/reshape/__init__.py b/pandas/tests/reshape/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py
new file mode 100644
index 0000000000000000000000000000000000000000..1482da8a074eb41b64d276683ffc7258b4e9d0bb
--- /dev/null
+++ b/pandas/tests/reshape/test_crosstab.py
@@ -0,0 +1,879 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    CategoricalDtype,
+    CategoricalIndex,
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    crosstab,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture
+def df():
+    df = DataFrame(
+        {
+            "A": [
+                "foo",
+                "foo",
+                "foo",
+                "foo",
+                "bar",
+                "bar",
+                "bar",
+                "bar",
+                "foo",
+                "foo",
+                "foo",
+            ],
+            "B": [
+                "one",
+                "one",
+                "one",
+                "two",
+                "one",
+                "one",
+                "one",
+                "two",
+                "two",
+                "two",
+                "one",
+            ],
+            "C": [
+                "dull",
+                "dull",
+                "shiny",
+                "dull",
+                "dull",
+                "shiny",
+                "shiny",
+                "dull",
+                "shiny",
+                "shiny",
+                "shiny",
+            ],
+            "D": np.random.default_rng(2).standard_normal(11),
+            "E": np.random.default_rng(2).standard_normal(11),
+            "F": np.random.default_rng(2).standard_normal(11),
+        }
+    )
+
+    return pd.concat([df, df], ignore_index=True)
+
+
+class TestCrosstab:
+    def test_crosstab_single(self, df):
+        result = crosstab(df["A"], df["C"])
+        expected = df.groupby(["A", "C"]).size().unstack()
+        tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
+
+    def test_crosstab_multiple(self, df):
+        result = crosstab(df["A"], [df["B"], df["C"]])
+        expected = df.groupby(["A", "B", "C"]).size()
+        expected = expected.unstack("B").unstack("C").fillna(0).astype(np.int64)
+        tm.assert_frame_equal(result, expected)
+
+        result = crosstab([df["B"], df["C"]], df["A"])
+        expected = df.groupby(["B", "C", "A"]).size()
+        expected = expected.unstack("A").fillna(0).astype(np.int64)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("box", [np.array, list, tuple])
+    def test_crosstab_ndarray(self, box):
+        # GH 44076
+        a = box(np.random.default_rng(2).integers(0, 5, size=100))
+        b = box(np.random.default_rng(2).integers(0, 3, size=100))
+        c = box(np.random.default_rng(2).integers(0, 10, size=100))
+
+        df = DataFrame({"a": a, "b": b, "c": c})
+
+        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"))
+        expected = crosstab(df["a"], [df["b"], df["c"]])
+        tm.assert_frame_equal(result, expected)
+
+        result = crosstab([b, c], a, colnames=["a"], rownames=("b", "c"))
+        expected = crosstab([df["b"], df["c"]], df["a"])
+        tm.assert_frame_equal(result, expected)
+
+        # assign arbitrary names
+        result = crosstab(a, c)
+        expected = crosstab(df["a"], df["c"])
+        expected.index.names = ["row_0"]
+        expected.columns.names = ["col_0"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_non_aligned(self):
+        # GH 17005
+        a = Series([0, 1, 1], index=["a", "b", "c"])
+        b = Series([3, 4, 3, 4, 3], index=["a", "b", "c", "d", "f"])
+        c = np.array([3, 4, 3], dtype=np.int64)
+
+        expected = DataFrame(
+            [[1, 0], [1, 1]],
+            index=Index([0, 1], name="row_0"),
+            columns=Index([3, 4], name="col_0"),
+        )
+
+        result = crosstab(a, b)
+        tm.assert_frame_equal(result, expected)
+
+        result = crosstab(a, c)
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_margins(self):
+        a = np.random.default_rng(2).integers(0, 7, size=100)
+        b = np.random.default_rng(2).integers(0, 3, size=100)
+        c = np.random.default_rng(2).integers(0, 5, size=100)
+
+        df = DataFrame({"a": a, "b": b, "c": c})
+
+        result = crosstab(a, [b, c], rownames=["a"], colnames=("b", "c"), margins=True)
+
+        assert result.index.names == ("a",)
+        assert result.columns.names == ["b", "c"]
+
+        all_cols = result["All", ""]
+        exp_cols = df.groupby(["a"]).size().astype("i8")
+        # to keep index.name
+        exp_margin = Series([len(df)], index=Index(["All"], name="a"))
+        exp_cols = pd.concat([exp_cols, exp_margin])
+        exp_cols.name = ("All", "")
+
+        tm.assert_series_equal(all_cols, exp_cols)
+
+        all_rows = result.loc["All"]
+        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
+        exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("All", "")])])
+        exp_rows.name = "All"
+
+        exp_rows = exp_rows.reindex(all_rows.index)
+        exp_rows = exp_rows.fillna(0).astype(np.int64)
+        tm.assert_series_equal(all_rows, exp_rows)
+
+    def test_crosstab_margins_set_margin_name(self):
+        # GH 15972
+        a = np.random.default_rng(2).integers(0, 7, size=100)
+        b = np.random.default_rng(2).integers(0, 3, size=100)
+        c = np.random.default_rng(2).integers(0, 5, size=100)
+
+        df = DataFrame({"a": a, "b": b, "c": c})
+
+        result = crosstab(
+            a,
+            [b, c],
+            rownames=["a"],
+            colnames=("b", "c"),
+            margins=True,
+            margins_name="TOTAL",
+        )
+
+        assert result.index.names == ("a",)
+        assert result.columns.names == ["b", "c"]
+
+        all_cols = result["TOTAL", ""]
+        exp_cols = df.groupby(["a"]).size().astype("i8")
+        # to keep index.name
+        exp_margin = Series([len(df)], index=Index(["TOTAL"], name="a"))
+        exp_cols = pd.concat([exp_cols, exp_margin])
+        exp_cols.name = ("TOTAL", "")
+
+        tm.assert_series_equal(all_cols, exp_cols)
+
+        all_rows = result.loc["TOTAL"]
+        exp_rows = df.groupby(["b", "c"]).size().astype("i8")
+        exp_rows = pd.concat([exp_rows, Series([len(df)], index=[("TOTAL", "")])])
+        exp_rows.name = "TOTAL"
+
+        exp_rows = exp_rows.reindex(all_rows.index)
+        exp_rows = exp_rows.fillna(0).astype(np.int64)
+        tm.assert_series_equal(all_rows, exp_rows)
+
+        msg = "margins_name argument must be a string"
+        for margins_name in [666, None, ["a", "b"]]:
+            with pytest.raises(ValueError, match=msg):
+                crosstab(
+                    a,
+                    [b, c],
+                    rownames=["a"],
+                    colnames=("b", "c"),
+                    margins=True,
+                    margins_name=margins_name,
+                )
+
+    def test_crosstab_pass_values(self):
+        a = np.random.default_rng(2).integers(0, 7, size=100)
+        b = np.random.default_rng(2).integers(0, 3, size=100)
+        c = np.random.default_rng(2).integers(0, 5, size=100)
+        values = np.random.default_rng(2).standard_normal(100)
+
+        table = crosstab(
+            [a, b], c, values, aggfunc="sum", rownames=["foo", "bar"], colnames=["baz"]
+        )
+
+        df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})
+
+        expected = df.pivot_table(
+            "values", index=["foo", "bar"], columns="baz", aggfunc="sum"
+        )
+        tm.assert_frame_equal(table, expected)
+
+    def test_crosstab_dropna(self):
+        # GH 3820
+        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
+        b = np.array(["one", "one", "two", "one", "two", "two", "two"], dtype=object)
+        c = np.array(
+            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
+        )
+        res = crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"], dropna=False)
+        m = MultiIndex.from_tuples(
+            [("one", "dull"), ("one", "shiny"), ("two", "dull"), ("two", "shiny")],
+            names=["b", "c"],
+        )
+        tm.assert_index_equal(res.columns, m)
+
+    def test_crosstab_no_overlap(self):
+        # GS 10291
+
+        s1 = Series([1, 2, 3], index=[1, 2, 3])
+        s2 = Series([4, 5, 6], index=[4, 5, 6])
+
+        actual = crosstab(s1, s2)
+        expected = DataFrame(
+            index=Index([], dtype="int64", name="row_0"),
+            columns=Index([], dtype="int64", name="col_0"),
+        )
+
+        tm.assert_frame_equal(actual, expected)
+
+    def test_margin_dropna(self):
+        # GH 12577
+        # pivot_table counts null into margin ('All')
+        # when margins=true and dropna=true
+
+        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
+        actual = crosstab(df.a, df.b, margins=True, dropna=True)
+        expected = DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
+        expected.index = Index([1.0, 2.0, "All"], name="a")
+        expected.columns = Index([3, 4, "All"], name="b")
+        tm.assert_frame_equal(actual, expected)
+
+    def test_margin_dropna2(self):
+        df = DataFrame(
+            {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
+        )
+        actual = crosstab(df.a, df.b, margins=True, dropna=True)
+        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
+        expected.index = Index([1.0, 2.0, "All"], name="a")
+        expected.columns = Index([3.0, 4.0, "All"], name="b")
+        tm.assert_frame_equal(actual, expected)
+
+    def test_margin_dropna3(self):
+        df = DataFrame(
+            {"a": [1, np.nan, np.nan, np.nan, np.nan, 2], "b": [3, 3, 4, 4, 4, 4]}
+        )
+        actual = crosstab(df.a, df.b, margins=True, dropna=True)
+        expected = DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
+        expected.index = Index([1.0, 2.0, "All"], name="a")
+        expected.columns = Index([3, 4, "All"], name="b")
+        tm.assert_frame_equal(actual, expected)
+
+    def test_margin_dropna4(self):
+        # GH 12642
+        # _add_margins raises KeyError: Level None not found
+        # when margins=True and dropna=False
+        # GH: 10772: Keep np.nan in result with dropna=False
+        df = DataFrame({"a": [1, 2, 2, 2, 2, np.nan], "b": [3, 3, 4, 4, 4, 4]})
+        actual = crosstab(df.a, df.b, margins=True, dropna=False)
+        expected = DataFrame([[1, 0, 1], [1, 3, 4], [0, 1, 1], [2, 4, 6]])
+        expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
+        expected.columns = Index([3, 4, "All"], name="b")
+        tm.assert_frame_equal(actual, expected)
+
+    def test_margin_dropna5(self):
+        # GH: 10772: Keep np.nan in result with dropna=False
+        df = DataFrame(
+            {"a": [1, np.nan, np.nan, np.nan, 2, np.nan], "b": [3, np.nan, 4, 4, 4, 4]}
+        )
+        actual = crosstab(df.a, df.b, margins=True, dropna=False)
+        expected = DataFrame(
+            [[1, 0, 0, 1.0], [0, 1, 0, 1.0], [0, 3, 1, 4.0], [1, 4, 1, 6.0]]
+        )
+        expected.index = Index([1.0, 2.0, np.nan, "All"], name="a")
+        expected.columns = Index([3.0, 4.0, np.nan, "All"], name="b")
+        tm.assert_frame_equal(actual, expected, check_dtype=False)
+
+    def test_margin_dropna6(self):
+        # GH: 10772: Keep np.nan in result with dropna=False
+        a = np.array(["foo", "foo", "foo", "bar", "bar", "foo", "foo"], dtype=object)
+        b = np.array(["one", "one", "two", "one", "two", np.nan, "two"], dtype=object)
+        c = np.array(
+            ["dull", "dull", "dull", "dull", "dull", "shiny", "shiny"], dtype=object
+        )
+
+        actual = crosstab(
+            a, [b, c], rownames=["a"], colnames=["b", "c"], margins=True, dropna=False
+        )
+        m = MultiIndex.from_arrays(
+            [
+                ["one", "one", "two", "two", np.nan, np.nan, "All"],
+                ["dull", "shiny", "dull", "shiny", "dull", "shiny", ""],
+            ],
+            names=["b", "c"],
+        )
+        expected = DataFrame(
+            [[1, 0, 1, 0, 0, 0, 2], [2, 0, 1, 1, 0, 1, 5], [3, 0, 2, 1, 0, 1, 7]],
+            columns=m,
+        )
+        expected.index = Index(["bar", "foo", "All"], name="a")
+        tm.assert_frame_equal(actual, expected)
+
+        actual = crosstab(
+            [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=False
+        )
+        m = MultiIndex.from_arrays(
+            [
+                ["bar", "bar", "bar", "foo", "foo", "foo", "All"],
+                ["one", "two", np.nan, "one", "two", np.nan, ""],
+            ],
+            names=["a", "b"],
+        )
+        expected = DataFrame(
+            [
+                [1, 0, 1.0],
+                [1, 0, 1.0],
+                [0, 0, np.nan],
+                [2, 0, 2.0],
+                [1, 1, 2.0],
+                [0, 1, 1.0],
+                [5, 2, 7.0],
+            ],
+            index=m,
+        )
+        expected.columns = Index(["dull", "shiny", "All"], name="c")
+        tm.assert_frame_equal(actual, expected)
+
+        actual = crosstab(
+            [a, b], c, rownames=["a", "b"], colnames=["c"], margins=True, dropna=True
+        )
+        m = MultiIndex.from_arrays(
+            [["bar", "bar", "foo", "foo", "All"], ["one", "two", "one", "two", ""]],
+            names=["a", "b"],
+        )
+        expected = DataFrame(
+            [[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2], [5, 1, 6]], index=m
+        )
+        expected.columns = Index(["dull", "shiny", "All"], name="c")
+        tm.assert_frame_equal(actual, expected)
+
+    def test_crosstab_normalize(self):
+        # Issue 12578
+        df = DataFrame(
+            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
+        )
+
+        rindex = Index([1, 2], name="a")
+        cindex = Index([3, 4], name="b")
+        full_normal = DataFrame([[0.2, 0], [0.2, 0.6]], index=rindex, columns=cindex)
+        row_normal = DataFrame([[1.0, 0], [0.25, 0.75]], index=rindex, columns=cindex)
+        col_normal = DataFrame([[0.5, 0], [0.5, 1.0]], index=rindex, columns=cindex)
+
+        # Check all normalize args
+        tm.assert_frame_equal(crosstab(df.a, df.b, normalize="all"), full_normal)
+        tm.assert_frame_equal(crosstab(df.a, df.b, normalize=True), full_normal)
+        tm.assert_frame_equal(crosstab(df.a, df.b, normalize="index"), row_normal)
+        tm.assert_frame_equal(crosstab(df.a, df.b, normalize="columns"), col_normal)
+        tm.assert_frame_equal(
+            crosstab(df.a, df.b, normalize=1),
+            crosstab(df.a, df.b, normalize="columns"),
+        )
+        tm.assert_frame_equal(
+            crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
+        )
+
+        row_normal_margins = DataFrame(
+            [[1.0, 0], [0.25, 0.75], [0.4, 0.6]],
+            index=Index([1, 2, "All"], name="a", dtype="object"),
+            columns=Index([3, 4], name="b", dtype="object"),
+        )
+        col_normal_margins = DataFrame(
+            [[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
+            index=Index([1, 2], name="a", dtype="object"),
+            columns=Index([3, 4, "All"], name="b", dtype="object"),
+        )
+
+        all_normal_margins = DataFrame(
+            [[0.2, 0, 0.2], [0.2, 0.6, 0.8], [0.4, 0.6, 1]],
+            index=Index([1, 2, "All"], name="a", dtype="object"),
+            columns=Index([3, 4, "All"], name="b", dtype="object"),
+        )
+        tm.assert_frame_equal(
+            crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
+        )
+        tm.assert_frame_equal(
+            crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
+        )
+        tm.assert_frame_equal(
+            crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
+        )
+
+    def test_crosstab_normalize_arrays(self):
+        # GH#12578
+        df = DataFrame(
+            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
+        )
+
+        # Test arrays
+        crosstab(
+            [np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], np.array([1, 2, 1, 2])
+        )
+
+        # Test with aggfunc
+        norm_counts = DataFrame(
+            [[0.25, 0, 0.25], [0.25, 0.5, 0.75], [0.5, 0.5, 1]],
+            index=Index([1, 2, "All"], name="a", dtype="object"),
+            columns=Index([3, 4, "All"], name="b"),
+        )
+        test_case = crosstab(
+            df.a, df.b, df.c, aggfunc="count", normalize="all", margins=True
+        )
+        tm.assert_frame_equal(test_case, norm_counts)
+
+        df = DataFrame(
+            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [0, 4, np.nan, 3, 3]}
+        )
+
+        norm_sum = DataFrame(
+            [[0, 0, 0.0], [0.4, 0.6, 1], [0.4, 0.6, 1]],
+            index=Index([1, 2, "All"], name="a", dtype="object"),
+            columns=Index([3, 4, "All"], name="b", dtype="object"),
+        )
+        test_case = crosstab(
+            df.a, df.b, df.c, aggfunc=np.sum, normalize="all", margins=True
+        )
+        tm.assert_frame_equal(test_case, norm_sum)
+
+    def test_crosstab_with_empties(self):
+        # Check handling of empties
+        df = DataFrame(
+            {
+                "a": [1, 2, 2, 2, 2],
+                "b": [3, 3, 4, 4, 4],
+                "c": [np.nan, np.nan, np.nan, np.nan, np.nan],
+            }
+        )
+
+        empty = DataFrame(
+            [[0.0, 0.0], [0.0, 0.0]],
+            index=Index([1, 2], name="a", dtype="int64"),
+            columns=Index([3, 4], name="b"),
+        )
+
+        for i in [True, "index", "columns"]:
+            calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=i)
+            tm.assert_frame_equal(empty, calculated)
+
+        nans = DataFrame(
+            [[0.0, np.nan], [0.0, 0.0]],
+            index=Index([1, 2], name="a", dtype="int64"),
+            columns=Index([3, 4], name="b"),
+        )
+
+        calculated = crosstab(df.a, df.b, values=df.c, aggfunc="count", normalize=False)
+        tm.assert_frame_equal(nans, calculated)
+
+    def test_crosstab_errors(self):
+        # Issue 12578
+
+        df = DataFrame(
+            {"a": [1, 2, 2, 2, 2], "b": [3, 3, 4, 4, 4], "c": [1, 1, np.nan, 1, 1]}
+        )
+
+        error = "values cannot be used without an aggfunc."
+        with pytest.raises(ValueError, match=error):
+            crosstab(df.a, df.b, values=df.c)
+
+        error = "aggfunc cannot be used without values"
+        with pytest.raises(ValueError, match=error):
+            crosstab(df.a, df.b, aggfunc=np.mean)
+
+        error = "Not a valid normalize argument"
+        with pytest.raises(ValueError, match=error):
+            crosstab(df.a, df.b, normalize="42")
+
+        with pytest.raises(ValueError, match=error):
+            crosstab(df.a, df.b, normalize=42)
+
+        error = "Not a valid margins argument"
+        with pytest.raises(ValueError, match=error):
+            crosstab(df.a, df.b, normalize="all", margins=42)
+
+    def test_crosstab_with_categorial_columns(self):
+        # GH 8860
+        df = DataFrame(
+            {
+                "MAKE": ["Honda", "Acura", "Tesla", "Honda", "Honda", "Acura"],
+                "MODEL": ["Sedan", "Sedan", "Electric", "Pickup", "Sedan", "Sedan"],
+            }
+        )
+        categories = ["Sedan", "Electric", "Pickup"]
+        df["MODEL"] = df["MODEL"].astype("category").cat.set_categories(categories)
+        result = crosstab(df["MAKE"], df["MODEL"])
+
+        expected_index = Index(["Acura", "Honda", "Tesla"], name="MAKE")
+        expected_columns = CategoricalIndex(
+            categories, categories=categories, ordered=False, name="MODEL"
+        )
+        expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
+        expected = DataFrame(
+            expected_data, index=expected_index, columns=expected_columns
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_with_numpy_size(self):
+        # GH 4003
+        df = DataFrame(
+            {
+                "A": ["one", "one", "two", "three"] * 6,
+                "B": ["A", "B", "C"] * 8,
+                "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
+                "D": np.random.default_rng(2).standard_normal(24),
+                "E": np.random.default_rng(2).standard_normal(24),
+            }
+        )
+        result = crosstab(
+            index=[df["A"], df["B"]],
+            columns=[df["C"]],
+            margins=True,
+            aggfunc=np.size,
+            values=df["D"],
+        )
+        expected_index = MultiIndex(
+            levels=[["All", "one", "three", "two"], ["", "A", "B", "C"]],
+            codes=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0], [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
+            names=["A", "B"],
+        )
+        expected_column = Index(["bar", "foo", "All"], name="C")
+        expected_data = np.array(
+            [
+                [2.0, 2.0, 4.0],
+                [2.0, 2.0, 4.0],
+                [2.0, 2.0, 4.0],
+                [2.0, np.nan, 2.0],
+                [np.nan, 2.0, 2.0],
+                [2.0, np.nan, 2.0],
+                [np.nan, 2.0, 2.0],
+                [2.0, np.nan, 2.0],
+                [np.nan, 2.0, 2.0],
+                [12.0, 12.0, 24.0],
+            ]
+        )
+        expected = DataFrame(
+            expected_data, index=expected_index, columns=expected_column
+        )
+        # aggfunc is np.size, resulting in integers
+        expected["All"] = expected["All"].astype("int64")
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_duplicate_names(self):
+        # GH 13279 / 22529
+
+        s1 = Series(range(3), name="foo")
+        s2_foo = Series(range(1, 4), name="foo")
+        s2_bar = Series(range(1, 4), name="bar")
+        s3 = Series(range(3), name="waldo")
+
+        # check result computed with duplicate labels against
+        # result computed with unique labels, then relabelled
+        mapper = {"bar": "foo"}
+
+        # duplicate row, column labels
+        result = crosstab(s1, s2_foo)
+        expected = crosstab(s1, s2_bar).rename_axis(columns=mapper, axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        # duplicate row, unique column labels
+        result = crosstab([s1, s2_foo], s3)
+        expected = crosstab([s1, s2_bar], s3).rename_axis(index=mapper, axis=0)
+        tm.assert_frame_equal(result, expected)
+
+        # unique row, duplicate column labels
+        result = crosstab(s3, [s1, s2_foo])
+        expected = crosstab(s3, [s1, s2_bar]).rename_axis(columns=mapper, axis=1)
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("names", [["a", ("b", "c")], [("a", "b"), "c"]])
+    def test_crosstab_tuple_name(self, names):
+        s1 = Series(range(3), name=names[0])
+        s2 = Series(range(1, 4), name=names[1])
+
+        mi = MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
+        expected = Series(1, index=mi).unstack(1, fill_value=0)
+
+        result = crosstab(s1, s2)
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_both_tuple_names(self):
+        # GH 18321
+        s1 = Series(range(3), name=("a", "b"))
+        s2 = Series(range(3), name=("c", "d"))
+
+        expected = DataFrame(
+            np.eye(3, dtype="int64"),
+            index=Index(range(3), name=("a", "b")),
+            columns=Index(range(3), name=("c", "d")),
+        )
+        result = crosstab(s1, s2)
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_unsorted_order(self):
+        df = DataFrame({"b": [3, 1, 2], "a": [5, 4, 6]}, index=["C", "A", "B"])
+        result = crosstab(df.index, [df.b, df.a])
+        e_idx = Index(["A", "B", "C"], name="row_0")
+        e_columns = MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)], names=["b", "a"])
+        expected = DataFrame(
+            [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_crosstab_normalize_multiple_columns(self):
+        # GH 15150
+        df = DataFrame(
+            {
+                "A": ["one", "one", "two", "three"] * 6,
+                "B": ["A", "B", "C"] * 8,
+                "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4,
+                "D": [0] * 24,
+                "E": [0] * 24,
+            }
+        )
+
+        result = crosstab(
+            [df.A, df.B],
+            df.C,
+            values=df.D,
+            aggfunc=np.sum,
+            normalize=True,
+            margins=True,
+        )
+        expected = DataFrame(
+            np.array([0] * 29 + [1], dtype=float).reshape(10, 3),
+            columns=Index(["bar", "foo", "All"], name="C"),
+            index=MultiIndex.from_tuples(
+                [
+                    ("one", "A"),
+                    ("one", "B"),
+                    ("one", "C"),
+                    ("three", "A"),
+                    ("three", "B"),
+                    ("three", "C"),
+                    ("two", "A"),
+                    ("two", "B"),
+                    ("two", "C"),
+                    ("All", ""),
+                ],
+                names=["A", "B"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_margin_normalize(self):
+        # GH 27500
+        df = DataFrame(
+            {
+                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+                "C": [
+                    "small",
+                    "large",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                ],
+                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+            }
+        )
+        # normalize on index
+        result = crosstab(
+            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
+        )
+        expected = DataFrame(
+            [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
+        )
+        expected.index = MultiIndex(
+            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
+            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
+            names=["A", "B"],
+        )
+        expected.columns = Index(["large", "small"], name="C")
+        tm.assert_frame_equal(result, expected)
+
+        # normalize on columns
+        result = crosstab(
+            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
+        )
+        expected = DataFrame(
+            [
+                [0.25, 0.2, 0.222222],
+                [0.25, 0.2, 0.222222],
+                [0.5, 0.2, 0.333333],
+                [0, 0.4, 0.222222],
+            ]
+        )
+        expected.columns = Index(["large", "small", "Sub-Total"], name="C")
+        expected.index = MultiIndex(
+            levels=[["bar", "foo"], ["one", "two"]],
+            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+            names=["A", "B"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        # normalize on both index and column
+        result = crosstab(
+            [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
+        )
+        expected = DataFrame(
+            [
+                [0.111111, 0.111111, 0.222222],
+                [0.111111, 0.111111, 0.222222],
+                [0.222222, 0.111111, 0.333333],
+                [0.000000, 0.222222, 0.222222],
+                [0.444444, 0.555555, 1],
+            ]
+        )
+        expected.columns = Index(["large", "small", "Sub-Total"], name="C")
+        expected.index = MultiIndex(
+            levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
+            codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
+            names=["A", "B"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_margin_normalize_multiple_columns(self):
+        # GH 35144
+        # use multiple columns with margins and normalization
+        df = DataFrame(
+            {
+                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+                "C": [
+                    "small",
+                    "large",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                ],
+                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+            }
+        )
+        result = crosstab(
+            index=df.C,
+            columns=[df.A, df.B],
+            margins=True,
+            margins_name="margin",
+            normalize=True,
+        )
+        expected = DataFrame(
+            [
+                [0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
+                [0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
+                [0.222222, 0.222222, 0.333333, 0.222222, 1.0],
+            ],
+            index=["large", "small", "margin"],
+        )
+        expected.columns = MultiIndex(
+            levels=[["bar", "foo", "margin"], ["", "one", "two"]],
+            codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
+            names=["A", "B"],
+        )
+        expected.index.name = "C"
+        tm.assert_frame_equal(result, expected)
+
+    def test_margin_support_Float(self):
+        # GH 50313
+        # use Float64 formats and function aggfunc with margins
+        df = DataFrame(
+            {"A": [1, 2, 2, 1], "B": [3, 3, 4, 5], "C": [-1.0, 10.0, 1.0, 10.0]},
+            dtype="Float64",
+        )
+        result = crosstab(
+            df["A"],
+            df["B"],
+            values=df["C"],
+            aggfunc="sum",
+            margins=True,
+        )
+        expected = DataFrame(
+            [
+                [-1.0, pd.NA, 10.0, 9.0],
+                [10.0, 1.0, pd.NA, 11.0],
+                [9.0, 1.0, 10.0, 20.0],
+            ],
+            index=Index([1.0, 2.0, "All"], dtype="object", name="A"),
+            columns=Index([3.0, 4.0, 5.0, "All"], dtype="object", name="B"),
+            dtype="Float64",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_margin_with_ordered_categorical_column(self):
+        # GH 25278
+        df = DataFrame(
+            {
+                "First": ["B", "B", "C", "A", "B", "C"],
+                "Second": ["C", "B", "B", "B", "C", "A"],
+            }
+        )
+        df["First"] = df["First"].astype(CategoricalDtype(ordered=True))
+        customized_categories_order = ["C", "A", "B"]
+        df["First"] = df["First"].cat.reorder_categories(customized_categories_order)
+        result = crosstab(df["First"], df["Second"], margins=True)
+
+        expected_index = Index(["C", "A", "B", "All"], name="First")
+        expected_columns = Index(["A", "B", "C", "All"], name="Second")
+        expected_data = [[1, 1, 0, 2], [0, 1, 0, 1], [0, 1, 2, 3], [1, 3, 2, 6]]
+        expected = DataFrame(
+            expected_data, index=expected_index, columns=expected_columns
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("a_dtype", ["category", "int64"])
+@pytest.mark.parametrize("b_dtype", ["category", "int64"])
+def test_categoricals(a_dtype, b_dtype):
+    # https://github.com/pandas-dev/pandas/issues/37465
+    g = np.random.default_rng(2)
+    a = Series(g.integers(0, 3, size=100)).astype(a_dtype)
+    b = Series(g.integers(0, 2, size=100)).astype(b_dtype)
+    result = crosstab(a, b, margins=True, dropna=False)
+    columns = Index([0, 1, "All"], dtype="object", name="col_0")
+    index = Index([0, 1, 2, "All"], dtype="object", name="row_0")
+    values = [[10, 18, 28], [23, 16, 39], [17, 16, 33], [50, 50, 100]]
+    expected = DataFrame(values, index, columns)
+    tm.assert_frame_equal(result, expected)
+
+    # Verify when categorical does not have all values present
+    a.loc[a == 1] = 2
+    a_is_cat = isinstance(a.dtype, CategoricalDtype)
+    assert not a_is_cat or a.value_counts().loc[1] == 0
+    result = crosstab(a, b, margins=True, dropna=False)
+    values = [[10, 18, 28], [0, 0, 0], [40, 32, 72], [50, 50, 100]]
+    expected = DataFrame(values, index, columns)
+    if not a_is_cat:
+        expected = expected.loc[[0, 2, "All"]]
+        expected["All"] = expected["All"].astype("int64")
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py
new file mode 100644
index 0000000000000000000000000000000000000000..909c10d3f73b20bf55ceba1663e89fd0c692cb30
--- /dev/null
+++ b/pandas/tests/reshape/test_cut.py
@@ -0,0 +1,828 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    Interval,
+    IntervalIndex,
+    Series,
+    TimedeltaIndex,
+    Timestamp,
+    cut,
+    date_range,
+    interval_range,
+    isna,
+    qcut,
+    timedelta_range,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.api.types import CategoricalDtype
+import pandas.core.reshape.tile as tmod
+
+
+def test_simple():
+    data = np.ones(5, dtype="int64")
+    result = cut(data, 4, labels=False)
+
+    expected = np.array([1, 1, 1, 1, 1])
+    tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize("func", [list, np.array])
+def test_bins(func):
+    data = func([0.2, 1.4, 2.5, 6.2, 9.7, 2.1])
+    result, bins = cut(data, 3, retbins=True)
+
+    intervals = IntervalIndex.from_breaks(bins.round(3))
+    intervals = intervals.take([0, 0, 0, 1, 2, 0])
+    expected = Categorical(intervals, ordered=True)
+
+    tm.assert_categorical_equal(result, expected)
+    tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
+
+
+def test_right():
+    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
+    result, bins = cut(data, 4, right=True, retbins=True)
+
+    intervals = IntervalIndex.from_breaks(bins.round(3))
+    expected = Categorical(intervals, ordered=True)
+    expected = expected.take([0, 0, 0, 2, 3, 0, 0])
+
+    tm.assert_categorical_equal(result, expected)
+    tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7]))
+
+
+def test_no_right():
+    data = np.array([0.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
+    result, bins = cut(data, 4, right=False, retbins=True)
+
+    intervals = IntervalIndex.from_breaks(bins.round(3), closed="left")
+    intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
+    expected = Categorical(intervals, ordered=True)
+
+    tm.assert_categorical_equal(result, expected)
+    tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
+
+
+def test_bins_from_interval_index():
+    c = cut(range(5), 3)
+    expected = c
+    result = cut(range(5), bins=expected.categories)
+    tm.assert_categorical_equal(result, expected)
+
+    expected = Categorical.from_codes(
+        np.append(c.codes, -1), categories=c.categories, ordered=True
+    )
+    result = cut(range(6), bins=expected.categories)
+    tm.assert_categorical_equal(result, expected)
+
+
+def test_bins_from_interval_index_doc_example():
+    # Make sure we preserve the bins.
+    ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
+    c = cut(ages, bins=[0, 18, 35, 70])
+    expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
+    tm.assert_index_equal(c.categories, expected)
+
+    result = cut([25, 20, 50], bins=c.categories)
+    tm.assert_index_equal(result.categories, expected)
+    tm.assert_numpy_array_equal(result.codes, np.array([1, 1, 2], dtype="int8"))
+
+
+def test_bins_not_overlapping_from_interval_index():
+    # see gh-23980
+    msg = "Overlapping IntervalIndex is not accepted"
+    ii = IntervalIndex.from_tuples([(0, 10), (2, 12), (4, 14)])
+
+    with pytest.raises(ValueError, match=msg):
+        cut([5, 6], bins=ii)
+
+
+def test_bins_not_monotonic():
+    msg = "bins must increase monotonically"
+    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
+
+    with pytest.raises(ValueError, match=msg):
+        cut(data, [0.1, 1.5, 1, 10])
+
+
+@pytest.mark.parametrize(
+    "x, bins, expected",
+    [
+        (
+            date_range("2017-12-31", periods=3),
+            [Timestamp.min, Timestamp("2018-01-01"), Timestamp.max],
+            IntervalIndex.from_tuples(
+                [
+                    (Timestamp.min, Timestamp("2018-01-01")),
+                    (Timestamp("2018-01-01"), Timestamp.max),
+                ]
+            ),
+        ),
+        (
+            [-1, 0, 1],
+            np.array(
+                [np.iinfo(np.int64).min, 0, np.iinfo(np.int64).max], dtype="int64"
+            ),
+            IntervalIndex.from_tuples(
+                [(np.iinfo(np.int64).min, 0), (0, np.iinfo(np.int64).max)]
+            ),
+        ),
+        (
+            [
+                np.timedelta64(-1, "ns"),
+                np.timedelta64(0, "ns"),
+                np.timedelta64(1, "ns"),
+            ],
+            np.array(
+                [
+                    np.timedelta64(-np.iinfo(np.int64).max, "ns"),
+                    np.timedelta64(0, "ns"),
+                    np.timedelta64(np.iinfo(np.int64).max, "ns"),
+                ]
+            ),
+            IntervalIndex.from_tuples(
+                [
+                    (
+                        np.timedelta64(-np.iinfo(np.int64).max, "ns"),
+                        np.timedelta64(0, "ns"),
+                    ),
+                    (
+                        np.timedelta64(0, "ns"),
+                        np.timedelta64(np.iinfo(np.int64).max, "ns"),
+                    ),
+                ]
+            ),
+        ),
+    ],
+)
+def test_bins_monotonic_not_overflowing(x, bins, expected):
+    # GH 26045
+    result = cut(x, bins)
+    tm.assert_index_equal(result.categories, expected)
+
+
+def test_wrong_num_labels():
+    msg = "Bin labels must be one fewer than the number of bin edges"
+    data = [0.2, 1.4, 2.5, 6.2, 9.7, 2.1]
+
+    with pytest.raises(ValueError, match=msg):
+        cut(data, [0, 1, 10], labels=["foo", "bar", "baz"])
+
+
+@pytest.mark.parametrize(
+    "x,bins,msg",
+    [
+        ([], 2, "Cannot cut empty array"),
+        ([1, 2, 3], 0.5, "`bins` should be a positive integer"),
+    ],
+)
+def test_cut_corner(x, bins, msg):
+    with pytest.raises(ValueError, match=msg):
+        cut(x, bins)
+
+
+@pytest.mark.parametrize("arg", [2, np.eye(2), DataFrame(np.eye(2))])
+@pytest.mark.parametrize("cut_func", [cut, qcut])
+def test_cut_not_1d_arg(arg, cut_func):
+    msg = "Input array must be 1 dimensional"
+    with pytest.raises(ValueError, match=msg):
+        cut_func(arg, 2)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [0, 1, 2, 3, 4, np.inf],
+        [-np.inf, 0, 1, 2, 3, 4],
+        [-np.inf, 0, 1, 2, 3, 4, np.inf],
+    ],
+)
+def test_int_bins_with_inf(data):
+    # GH 24314
+    msg = "cannot specify integer `bins` when input data contains infinity"
+    with pytest.raises(ValueError, match=msg):
+        cut(data, bins=3)
+
+
+def test_cut_out_of_range_more():
+    # see gh-1511
+    name = "x"
+
+    ser = Series([0, -1, 0, 1, -3], name=name)
+    ind = cut(ser, [0, 1], labels=False)
+
+    exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name=name)
+    tm.assert_series_equal(ind, exp)
+
+
+@pytest.mark.parametrize(
+    "right,breaks,closed",
+    [
+        (True, [-1e-3, 0.25, 0.5, 0.75, 1], "right"),
+        (False, [0, 0.25, 0.5, 0.75, 1 + 1e-3], "left"),
+    ],
+)
+def test_labels(right, breaks, closed):
+    arr = np.tile(np.arange(0, 1.01, 0.1), 4)
+
+    result, bins = cut(arr, 4, retbins=True, right=right)
+    ex_levels = IntervalIndex.from_breaks(breaks, closed=closed)
+    tm.assert_index_equal(result.categories, ex_levels)
+
+
+def test_cut_pass_series_name_to_factor():
+    name = "foo"
+    ser = Series(np.random.default_rng(2).standard_normal(100), name=name)
+
+    factor = cut(ser, 4)
+    assert factor.name == name
+
+
+def test_label_precision():
+    arr = np.arange(0, 0.73, 0.01)
+    result = cut(arr, 4, precision=2)
+
+    ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36, 0.54, 0.72])
+    tm.assert_index_equal(result.categories, ex_levels)
+
+
+@pytest.mark.parametrize("labels", [None, False])
+def test_na_handling(labels):
+    arr = np.arange(0, 0.75, 0.01)
+    arr[::3] = np.nan
+
+    result = cut(arr, 4, labels=labels)
+    result = np.asarray(result)
+
+    expected = np.where(isna(arr), np.nan, result)
+    tm.assert_almost_equal(result, expected)
+
+
+def test_inf_handling():
+    data = np.arange(6)
+    data_ser = Series(data, dtype="int64")
+
+    bins = [-np.inf, 2, 4, np.inf]
+    result = cut(data, bins)
+    result_ser = cut(data_ser, bins)
+
+    ex_uniques = IntervalIndex.from_breaks(bins)
+    tm.assert_index_equal(result.categories, ex_uniques)
+
+    assert result[5] == Interval(4, np.inf)
+    assert result[0] == Interval(-np.inf, 2)
+    assert result_ser[5] == Interval(4, np.inf)
+    assert result_ser[0] == Interval(-np.inf, 2)
+
+
+def test_cut_out_of_bounds():
+    arr = np.random.default_rng(2).standard_normal(100)
+    result = cut(arr, [-1, 0, 1])
+
+    mask = isna(result)
+    ex_mask = (arr < -1) | (arr > 1)
+    tm.assert_numpy_array_equal(mask, ex_mask)
+
+
+@pytest.mark.parametrize(
+    "get_labels,get_expected",
+    [
+        (
+            lambda labels: labels,
+            lambda labels: Categorical(
+                ["Medium"] + 4 * ["Small"] + ["Medium", "Large"],
+                categories=labels,
+                ordered=True,
+            ),
+        ),
+        (
+            lambda labels: Categorical.from_codes([0, 1, 2], labels),
+            lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels),
+        ),
+    ],
+)
+def test_cut_pass_labels(get_labels, get_expected):
+    bins = [0, 25, 50, 100]
+    arr = [50, 5, 10, 15, 20, 30, 70]
+    labels = ["Small", "Medium", "Large"]
+
+    result = cut(arr, bins, labels=get_labels(labels))
+    tm.assert_categorical_equal(result, get_expected(labels))
+
+
+def test_cut_pass_labels_compat():
+    # see gh-16459
+    arr = [50, 5, 10, 15, 20, 30, 70]
+    labels = ["Good", "Medium", "Bad"]
+
+    result = cut(arr, 3, labels=labels)
+    exp = cut(arr, 3, labels=Categorical(labels, categories=labels, ordered=True))
+    tm.assert_categorical_equal(result, exp)
+
+
+@pytest.mark.parametrize("x", [np.arange(11.0), np.arange(11.0) / 1e10])
+def test_round_frac_just_works(x):
+    # It works.
+    cut(x, 2)
+
+
+@pytest.mark.parametrize(
+    "val,precision,expected",
+    [
+        (-117.9998, 3, -118),
+        (117.9998, 3, 118),
+        (117.9998, 2, 118),
+        (0.000123456, 2, 0.00012),
+    ],
+)
+def test_round_frac(val, precision, expected):
+    # see gh-1979
+    result = tmod._round_frac(val, precision=precision)
+    assert result == expected
+
+
+def test_cut_return_intervals():
+    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
+    result = cut(ser, 3)
+
+    exp_bins = np.linspace(0, 8, num=4).round(3)
+    exp_bins[0] -= 0.008
+
+    expected = Series(
+        IntervalIndex.from_breaks(exp_bins, closed="right").take(
+            [0, 0, 0, 1, 1, 1, 2, 2, 2]
+        )
+    ).astype(CategoricalDtype(ordered=True))
+    tm.assert_series_equal(result, expected)
+
+
+def test_series_ret_bins():
+    # see gh-8589
+    ser = Series(np.arange(4))
+    result, bins = cut(ser, 2, retbins=True)
+
+    expected = Series(
+        IntervalIndex.from_breaks([-0.003, 1.5, 3], closed="right").repeat(2)
+    ).astype(CategoricalDtype(ordered=True))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "kwargs,msg",
+    [
+        ({"duplicates": "drop"}, None),
+        ({}, "Bin edges must be unique"),
+        ({"duplicates": "raise"}, "Bin edges must be unique"),
+        ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
+    ],
+)
+def test_cut_duplicates_bin(kwargs, msg):
+    # see gh-20947
+    bins = [0, 2, 4, 6, 10, 10]
+    values = Series(np.array([1, 3, 5, 7, 9]), index=["a", "b", "c", "d", "e"])
+
+    if msg is not None:
+        with pytest.raises(ValueError, match=msg):
+            cut(values, bins, **kwargs)
+    else:
+        result = cut(values, bins, **kwargs)
+        expected = cut(values, pd.unique(np.asarray(bins)))
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [9.0, -9.0, 0.0])
+@pytest.mark.parametrize("length", [1, 2])
+def test_single_bin(data, length):
+    # see gh-14652, gh-15428
+    ser = Series([data] * length)
+    result = cut(ser, 1, labels=False)
+
+    expected = Series([0] * length, dtype=np.intp)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,threshold",
+    [
+        ([0.1, 0.1, 0.1], 0.001),  # small positive values
+        ([-0.1, -0.1, -0.1], 0.001),  # negative values
+        ([0.01, 0.01, 0.01], 0.0001),  # very small values
+    ],
+)
+def test_single_bin_edge_adjustment(values, threshold):
+    # gh-58517 - edge adjustment mutation when all values are same
+    result, bins = cut(values, 3, retbins=True)
+
+    bin_range = bins[-1] - bins[0]
+    assert bin_range < threshold
+
+
+@pytest.mark.parametrize(
+    "array_1_writeable,array_2_writeable", [(True, True), (True, False), (False, False)]
+)
+def test_cut_read_only(array_1_writeable, array_2_writeable):
+    # issue 18773
+    array_1 = np.arange(0, 100, 10)
+    array_1.flags.writeable = array_1_writeable
+
+    array_2 = np.arange(0, 100, 10)
+    array_2.flags.writeable = array_2_writeable
+
+    hundred_elements = np.arange(100)
+    tm.assert_categorical_equal(
+        cut(hundred_elements, array_1), cut(hundred_elements, array_2)
+    )
+
+
+@pytest.mark.parametrize(
+    "conv",
+    [
+        lambda v: Timestamp(v),
+        lambda v: to_datetime(v),
+        lambda v: np.datetime64(v),
+        lambda v: Timestamp(v).to_pydatetime(),
+    ],
+)
+def test_datetime_bin(conv):
+    data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")]
+    bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"]
+
+    expected = Series(
+        IntervalIndex(
+            [
+                Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
+                Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])),
+            ]
+        )
+    )
+
+    bins = [conv(v) for v in bin_data]
+    result = Series(cut(data, bins=bins))
+
+    if type(bins[0]) is np.datetime64:
+        # The bins have microsecond dtype -> so does result
+        expected = expected.astype("interval[datetime64[s]]")
+
+    expected = expected.astype(CategoricalDtype(ordered=True))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("box", [Series, Index, np.array, list])
+def test_datetime_cut(unit, box):
+    # see gh-14714
+    #
+    # Testing time data when it comes in various collection types.
+    data = to_datetime(["2013-01-01", "2013-01-02", "2013-01-03"]).astype(f"M8[{unit}]")
+    data = box(data)
+    result, _ = cut(data, 3, retbins=True)
+
+    if unit == "s":
+        # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
+        # for why we round to 8 seconds instead of 7
+        left = DatetimeIndex(
+            ["2012-12-31 23:57:08", "2013-01-01 16:00:00", "2013-01-02 08:00:00"],
+            dtype=f"M8[{unit}]",
+        )
+    else:
+        left = DatetimeIndex(
+            [
+                "2012-12-31 23:57:07.200000",
+                "2013-01-01 16:00:00",
+                "2013-01-02 08:00:00",
+            ],
+            dtype=f"M8[{unit}]",
+        )
+    right = DatetimeIndex(
+        ["2013-01-01 16:00:00", "2013-01-02 08:00:00", "2013-01-03 00:00:00"],
+        dtype=f"M8[{unit}]",
+    )
+
+    exp_intervals = IntervalIndex.from_arrays(left, right)
+    expected = Series(exp_intervals).astype(CategoricalDtype(ordered=True))
+    tm.assert_series_equal(Series(result), expected)
+
+
+@pytest.mark.parametrize("box", [list, np.array, Index, Series])
+def test_datetime_tz_cut_mismatched_tzawareness(box):
+    # GH#54964
+    bins = box(
+        [
+            Timestamp("2013-01-01 04:57:07.200000"),
+            Timestamp("2013-01-01 21:00:00"),
+            Timestamp("2013-01-02 13:00:00"),
+            Timestamp("2013-01-03 05:00:00"),
+        ]
+    )
+    ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
+
+    msg = "Cannot use timezone-naive bins with timezone-aware values"
+    with pytest.raises(ValueError, match=msg):
+        cut(ser, bins)
+
+
+@pytest.mark.parametrize(
+    "bins",
+    [
+        3,
+        [
+            Timestamp("2013-01-01 04:57:07.200000", tz="UTC").tz_convert("US/Eastern"),
+            Timestamp("2013-01-01 21:00:00", tz="UTC").tz_convert("US/Eastern"),
+            Timestamp("2013-01-02 13:00:00", tz="UTC").tz_convert("US/Eastern"),
+            Timestamp("2013-01-03 05:00:00", tz="UTC").tz_convert("US/Eastern"),
+        ],
+    ],
+)
+@pytest.mark.parametrize("box", [list, np.array, Index, Series])
+def test_datetime_tz_cut(bins, box):
+    # see gh-19872
+    tz = "US/Eastern"
+    ser = Series(date_range("20130101", periods=3, tz=tz, unit="ns"))
+
+    if not isinstance(bins, int):
+        bins = box(bins)
+
+    result = cut(ser, bins)
+    ii = IntervalIndex(
+        [
+            Interval(
+                Timestamp("2012-12-31 23:57:07.200000", tz=tz),
+                Timestamp("2013-01-01 16:00:00", tz=tz),
+            ),
+            Interval(
+                Timestamp("2013-01-01 16:00:00", tz=tz),
+                Timestamp("2013-01-02 08:00:00", tz=tz),
+            ),
+            Interval(
+                Timestamp("2013-01-02 08:00:00", tz=tz),
+                Timestamp("2013-01-03 00:00:00", tz=tz),
+            ),
+        ]
+    )
+    if isinstance(bins, int):
+        # the dtype is inferred from ser, which has nanosecond unit
+        ii = ii.astype("interval[datetime64[ns, US/Eastern]]")
+    expected = Series(ii).astype(CategoricalDtype(ordered=True))
+    tm.assert_series_equal(result, expected)
+
+
+def test_datetime_nan_error():
+    msg = "bins must be of datetime64 dtype"
+
+    with pytest.raises(ValueError, match=msg):
+        cut(date_range("20130101", periods=3), bins=[0, 2, 4])
+
+
+def test_datetime_nan_mask():
+    result = cut(
+        date_range("20130102", periods=5), bins=date_range("20130101", periods=2)
+    )
+
+    mask = result.categories.isna()
+    tm.assert_numpy_array_equal(mask, np.array([False]))
+
+    mask = result.isna()
+    tm.assert_numpy_array_equal(mask, np.array([False, True, True, True, True]))
+
+
+@pytest.mark.parametrize("tz", [None, "UTC", "US/Pacific"])
+def test_datetime_cut_roundtrip(tz, unit):
+    # see gh-19891
+    ser = Series(date_range("20180101", periods=3, tz=tz, unit=unit))
+    result, result_bins = cut(ser, 2, retbins=True)
+
+    expected = cut(ser, result_bins)
+    tm.assert_series_equal(result, expected)
+
+    if unit == "s":
+        # TODO: constructing DatetimeIndex with dtype="M8[s]" without truncating
+        #  the first entry here raises in array_to_datetime. Should truncate
+        #  instead of raising?
+        # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425
+        # for why we round to 8 seconds instead of 7
+        expected_bins = DatetimeIndex(
+            ["2017-12-31 23:57:08", "2018-01-02 00:00:00", "2018-01-03 00:00:00"],
+            dtype=f"M8[{unit}]",
+        )
+    else:
+        expected_bins = DatetimeIndex(
+            [
+                "2017-12-31 23:57:07.200000",
+                "2018-01-02 00:00:00",
+                "2018-01-03 00:00:00",
+            ],
+            dtype=f"M8[{unit}]",
+        )
+    expected_bins = expected_bins.tz_localize(tz)
+    tm.assert_index_equal(result_bins, expected_bins)
+
+
+def test_timedelta_cut_roundtrip():
+    # see gh-19891
+    ser = Series(timedelta_range("1day", periods=3))
+    result, result_bins = cut(ser, 2, retbins=True)
+
+    expected = cut(ser, result_bins)
+    tm.assert_series_equal(result, expected)
+
+    expected_bins = TimedeltaIndex(
+        ["0 days 23:57:07.200000", "2 days 00:00:00", "3 days 00:00:00"]
+    )
+    tm.assert_index_equal(result_bins, expected_bins)
+
+
+@pytest.mark.parametrize("bins", [6, 7])
+@pytest.mark.parametrize(
+    "box, compare",
+    [
+        (Series, tm.assert_series_equal),
+        (np.array, tm.assert_categorical_equal),
+        (list, tm.assert_equal),
+    ],
+)
+def test_cut_bool_coercion_to_int(bins, box, compare):
+    # issue 20303
+    data_expected = box([0, 1, 1, 0, 1] * 10)
+    data_result = box([False, True, True, False, True] * 10)
+    expected = cut(data_expected, bins, duplicates="drop")
+    result = cut(data_result, bins, duplicates="drop")
+    compare(result, expected)
+
+
+@pytest.mark.parametrize("labels", ["foo", 1, True])
+def test_cut_incorrect_labels(labels):
+    # GH 13318
+    values = range(5)
+    msg = "Bin labels must either be False, None or passed in as a list-like argument"
+    with pytest.raises(ValueError, match=msg):
+        cut(values, 4, labels=labels)
+
+
+@pytest.mark.parametrize("bins", [3, [0, 5, 15]])
+@pytest.mark.parametrize("right", [True, False])
+@pytest.mark.parametrize("include_lowest", [True, False])
+def test_cut_nullable_integer(bins, right, include_lowest):
+    a = np.random.default_rng(2).integers(0, 10, size=50).astype(float)
+    a[::2] = np.nan
+    b = a.astype(object)
+    b[::2] = pd.NA
+    result = cut(
+        pd.array(b, dtype="Int64"), bins, right=right, include_lowest=include_lowest
+    )
+    expected = cut(a, bins, right=right, include_lowest=include_lowest)
+    tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, bins, labels, expected_codes, expected_labels",
+    [
+        ([15, 17, 19], [14, 16, 18, 20], ["A", "B", "A"], [0, 1, 0], ["A", "B"]),
+        ([1, 3, 5], [0, 2, 4, 6, 8], [2, 0, 1, 2], [2, 0, 1], [0, 1, 2]),
+    ],
+)
+def test_cut_non_unique_labels(data, bins, labels, expected_codes, expected_labels):
+    # GH 33141
+    result = cut(data, bins=bins, labels=labels, ordered=False)
+    expected = Categorical.from_codes(
+        expected_codes, categories=expected_labels, ordered=False
+    )
+    tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, bins, labels, expected_codes, expected_labels",
+    [
+        ([15, 17, 19], [14, 16, 18, 20], ["C", "B", "A"], [0, 1, 2], ["C", "B", "A"]),
+        ([1, 3, 5], [0, 2, 4, 6, 8], [3, 0, 1, 2], [0, 1, 2], [3, 0, 1, 2]),
+    ],
+)
+def test_cut_unordered_labels(data, bins, labels, expected_codes, expected_labels):
+    # GH 33141
+    result = cut(data, bins=bins, labels=labels, ordered=False)
+    expected = Categorical.from_codes(
+        expected_codes, categories=expected_labels, ordered=False
+    )
+    tm.assert_categorical_equal(result, expected)
+
+
+def test_cut_unordered_with_missing_labels_raises_error():
+    # GH 33141
+    msg = "'labels' must be provided if 'ordered = False'"
+    with pytest.raises(ValueError, match=msg):
+        cut([0.5, 3], bins=[0, 1, 2], ordered=False)
+
+
+def test_cut_unordered_with_series_labels():
+    # https://github.com/pandas-dev/pandas/issues/36603
+    ser = Series([1, 2, 3, 4, 5])
+    bins = Series([0, 2, 4, 6])
+    labels = Series(["a", "b", "c"])
+    result = cut(ser, bins=bins, labels=labels, ordered=False)
+    expected = Series(["a", "a", "b", "b", "c"], dtype="category")
+    tm.assert_series_equal(result, expected)
+
+
+def test_cut_no_warnings():
+    df = DataFrame({"value": np.random.default_rng(2).integers(0, 100, 20)})
+    labels = [f"{i} - {i + 9}" for i in range(0, 100, 10)]
+    with tm.assert_produces_warning(False):
+        df["group"] = cut(df.value, range(0, 105, 10), right=False, labels=labels)
+
+
+def test_cut_with_duplicated_index_lowest_included():
+    # GH 42185
+    expected = Series(
+        [Interval(-0.001, 2, closed="right")] * 3
+        + [Interval(2, 4, closed="right"), Interval(-0.001, 2, closed="right")],
+        index=[0, 1, 2, 3, 0],
+        dtype="category",
+    ).cat.as_ordered()
+
+    ser = Series([0, 1, 2, 3, 0], index=[0, 1, 2, 3, 0])
+    result = cut(ser, bins=[0, 2, 4], include_lowest=True)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.filterwarnings("ignore:invalid value encountered in cast:RuntimeWarning")
+def test_cut_with_nonexact_categorical_indices():
+    # GH 42424
+
+    ser = Series(range(100))
+    ser1 = cut(ser, 10).value_counts().head(5)
+    ser2 = cut(ser, 10).value_counts().tail(5)
+    result = DataFrame({"1": ser1, "2": ser2})
+
+    index = pd.CategoricalIndex(
+        [
+            Interval(-0.099, 9.9, closed="right"),
+            Interval(9.9, 19.8, closed="right"),
+            Interval(19.8, 29.7, closed="right"),
+            Interval(29.7, 39.6, closed="right"),
+            Interval(39.6, 49.5, closed="right"),
+            Interval(49.5, 59.4, closed="right"),
+            Interval(59.4, 69.3, closed="right"),
+            Interval(69.3, 79.2, closed="right"),
+            Interval(79.2, 89.1, closed="right"),
+            Interval(89.1, 99, closed="right"),
+        ],
+        ordered=True,
+    )
+
+    expected = DataFrame(
+        {"1": [10] * 5 + [np.nan] * 5, "2": [np.nan] * 5 + [10] * 5}, index=index
+    )
+
+    tm.assert_frame_equal(expected, result)
+
+
+def test_cut_with_timestamp_tuple_labels():
+    # GH 40661
+    labels = [(Timestamp(10),), (Timestamp(20),), (Timestamp(30),)]
+    result = cut([2, 4, 6], bins=[1, 3, 5, 7], labels=labels)
+
+    expected = Categorical.from_codes([0, 1, 2], labels, ordered=True)
+    tm.assert_categorical_equal(result, expected)
+
+
+def test_cut_bins_datetime_intervalindex():
+    # https://github.com/pandas-dev/pandas/issues/46218
+    bins = interval_range(Timestamp("2022-02-25"), Timestamp("2022-02-27"), freq="1D")
+    # passing Series instead of list is important to trigger bug
+    result = cut(Series([Timestamp("2022-02-26")]), bins=bins)
+    expected = Categorical.from_codes([0], bins, ordered=True)
+    tm.assert_categorical_equal(result.array, expected)
+
+
+def test_cut_with_nullable_int64():
+    # GH 30787
+    series = Series([0, 1, 2, 3, 4, pd.NA, 6, 7], dtype="Int64")
+    bins = [0, 2, 4, 6, 8]
+    intervals = IntervalIndex.from_breaks(bins)
+
+    expected = Series(
+        Categorical.from_codes([-1, 0, 0, 1, 1, -1, 2, 3], intervals, ordered=True)
+    )
+
+    result = cut(series, bins=bins)
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_cut_datetime_array_no_attributeerror():
+    # GH 55431
+    ser = Series(to_datetime(["2023-10-06 12:00:00+0000", "2023-10-07 12:00:00+0000"]))
+
+    result = cut(ser.array, bins=2)
+
+    categories = result.categories
+    expected = Categorical.from_codes([0, 1], categories=categories, ordered=True)
+
+    tm.assert_categorical_equal(
+        result, expected, check_dtype=True, check_category_order=True
+    )
diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py
new file mode 100644
index 0000000000000000000000000000000000000000..0997baf7c3f74bf6b73e62e02283cea9eb1bd696
--- /dev/null
+++ b/pandas/tests/reshape/test_from_dummies.py
@@ -0,0 +1,477 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+    from_dummies,
+    get_dummies,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture
+def dummies_basic():
+    return DataFrame(
+        {
+            "col1_a": [1, 0, 1],
+            "col1_b": [0, 1, 0],
+            "col2_a": [0, 1, 0],
+            "col2_b": [1, 0, 0],
+            "col2_c": [0, 0, 1],
+        },
+    )
+
+
+@pytest.fixture
+def dummies_with_unassigned():
+    return DataFrame(
+        {
+            "col1_a": [1, 0, 0],
+            "col1_b": [0, 1, 0],
+            "col2_a": [0, 1, 0],
+            "col2_b": [0, 0, 0],
+            "col2_c": [0, 0, 1],
+        },
+    )
+
+
+def test_error_wrong_data_type():
+    dummies = [0, 1, 0]
+    with pytest.raises(
+        TypeError,
+        match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list",
+    ):
+        from_dummies(dummies)
+
+
+def test_error_no_prefix_contains_unassigned():
+    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
+    with pytest.raises(
+        ValueError,
+        match=(
+            r"Dummy DataFrame contains unassigned value\(s\); "
+            r"First instance in row: 2"
+        ),
+    ):
+        from_dummies(dummies)
+
+
+def test_error_no_prefix_wrong_default_category_type():
+    dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
+    with pytest.raises(
+        TypeError,
+        match=(
+            r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
+            r"Received 'default_category' of type: list"
+        ),
+    ):
+        from_dummies(dummies, default_category=["c", "d"])
+
+
+def test_error_no_prefix_multi_assignment():
+    dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
+    with pytest.raises(
+        ValueError,
+        match=(
+            r"Dummy DataFrame contains multi-assignment\(s\); "
+            r"First instance in row: 2"
+        ),
+    ):
+        from_dummies(dummies)
+
+
+def test_error_no_prefix_contains_nan():
+    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
+    with pytest.raises(
+        ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'"
+    ):
+        from_dummies(dummies)
+
+
+def test_error_contains_non_dummies():
+    dummies = DataFrame(
+        {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]}
+    )
+    with pytest.raises(
+        TypeError,
+        match=r"Passed DataFrame contains non-dummy data",
+    ):
+        from_dummies(dummies)
+
+
+def test_error_with_prefix_multiple_separators():
+    dummies = DataFrame(
+        {
+            "col1_a": [1, 0, 1],
+            "col1_b": [0, 1, 0],
+            "col2-a": [0, 1, 0],
+            "col2-b": [1, 0, 1],
+        },
+    )
+    with pytest.raises(
+        ValueError,
+        match=(r"Separator not specified for column: col2-a"),
+    ):
+        from_dummies(dummies, sep="_")
+
+
+def test_error_with_prefix_sep_wrong_type(dummies_basic):
+    with pytest.raises(
+        TypeError,
+        match=(
+            r"Expected 'sep' to be of type 'str' or 'None'; "
+            r"Received 'sep' of type: list"
+        ),
+    ):
+        from_dummies(dummies_basic, sep=["_"])
+
+
+def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
+    with pytest.raises(
+        ValueError,
+        match=(
+            r"Dummy DataFrame contains unassigned value\(s\); "
+            r"First instance in row: 2"
+        ),
+    ):
+        from_dummies(dummies_with_unassigned, sep="_")
+
+
+def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
+    with pytest.raises(
+        TypeError,
+        match=(
+            r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
+            r"Received 'default_category' of type: list"
+        ),
+    ):
+        from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])
+
+
+def test_error_with_prefix_default_category_dict_not_complete(
+    dummies_with_unassigned,
+):
+    with pytest.raises(
+        ValueError,
+        match=(
+            r"Length of 'default_category' \(1\) did not match "
+            r"the length of the columns being encoded \(2\)"
+        ),
+    ):
+        from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"})
+
+
+def test_error_with_prefix_contains_nan(dummies_basic):
+    # Set float64 dtype to avoid upcast when setting np.nan
+    dummies_basic["col2_c"] = dummies_basic["col2_c"].astype("float64")
+    dummies_basic.loc[2, "col2_c"] = np.nan
+    with pytest.raises(
+        ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'"
+    ):
+        from_dummies(dummies_basic, sep="_")
+
+
+def test_error_with_prefix_contains_non_dummies(dummies_basic):
+    # Set object dtype to avoid upcast when setting "str"
+    dummies_basic["col2_c"] = dummies_basic["col2_c"].astype(object)
+    dummies_basic.loc[2, "col2_c"] = "str"
+    with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"):
+        from_dummies(dummies_basic, sep="_")
+
+
+def test_error_with_prefix_double_assignment():
+    dummies = DataFrame(
+        {
+            "col1_a": [1, 0, 1],
+            "col1_b": [1, 1, 0],
+            "col2_a": [0, 1, 0],
+            "col2_b": [1, 0, 0],
+            "col2_c": [0, 0, 1],
+        },
+    )
+    with pytest.raises(
+        ValueError,
+        match=(
+            r"Dummy DataFrame contains multi-assignment\(s\); "
+            r"First instance in row: 0"
+        ),
+    ):
+        from_dummies(dummies, sep="_")
+
+
+def test_roundtrip_series_to_dataframe():
+    categories = Series(["a", "b", "c", "a"])
+    dummies = get_dummies(categories)
+    result = from_dummies(dummies)
+    expected = DataFrame({"": ["a", "b", "c", "a"]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_roundtrip_single_column_dataframe():
+    categories = DataFrame({"": ["a", "b", "c", "a"]})
+    dummies = get_dummies(categories)
+    result = from_dummies(dummies, sep="_")
+    expected = categories
+    tm.assert_frame_equal(result, expected)
+
+
+def test_roundtrip_with_prefixes():
+    categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
+    dummies = get_dummies(categories)
+    result = from_dummies(dummies, sep="_")
+    expected = categories
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_string_cats_basic():
+    dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
+    expected = DataFrame({"": ["a", "b", "c", "a"]})
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_string_cats_basic_bool_values():
+    dummies = DataFrame(
+        {
+            "a": [True, False, False, True],
+            "b": [False, True, False, False],
+            "c": [False, False, True, False],
+        }
+    )
+    expected = DataFrame({"": ["a", "b", "c", "a"]})
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_string_cats_basic_mixed_bool_values():
+    dummies = DataFrame(
+        {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]}
+    )
+    expected = DataFrame({"": ["a", "b", "c", "a"]})
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_int_cats_basic():
+    dummies = DataFrame(
+        {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]}
+    )
+    expected = DataFrame({"": [1, 25, 2, 5]})
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_float_cats_basic():
+    dummies = DataFrame(
+        {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]}
+    )
+    expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]})
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_mixed_cats_basic():
+    dummies = DataFrame(
+        {
+            1.23: [1, 0, 0, 0, 0],
+            "c": [0, 1, 0, 0, 0],
+            2: [0, 0, 1, 0, 0],
+            False: [0, 0, 0, 1, 0],
+            None: [0, 0, 0, 0, 1],
+        }
+    )
+    expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
+    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]})
+    expected = DataFrame({"": ["a", "b", "NaN"]})
+    result = from_dummies(dummies)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "default_category, expected",
+    [
+        pytest.param(
+            "c",
+            {"": ["a", "b", "c"]},
+            id="default_category is a str",
+        ),
+        pytest.param(
+            1,
+            {"": ["a", "b", 1]},
+            id="default_category is an int",
+        ),
+        pytest.param(
+            1.25,
+            {"": ["a", "b", 1.25]},
+            id="default_category is a float",
+        ),
+        pytest.param(
+            0,
+            {"": ["a", "b", 0]},
+            id="default_category is a 0",
+        ),
+        pytest.param(
+            False,
+            {"": ["a", "b", False]},
+            id="default_category is a bool",
+        ),
+        pytest.param(
+            (1, 2),
+            {"": ["a", "b", (1, 2)]},
+            id="default_category is a tuple",
+        ),
+    ],
+)
+def test_no_prefix_string_cats_default_category(
+    default_category, expected, using_infer_string
+):
+    dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
+    result = from_dummies(dummies, default_category=default_category)
+    expected = DataFrame(expected, dtype=dummies.columns.dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_with_prefix_basic(dummies_basic):
+    expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
+    result = from_dummies(dummies_basic, sep="_")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_with_prefix_contains_get_dummies_NaN_column():
+    dummies = DataFrame(
+        {
+            "col1_a": [1, 0, 0],
+            "col1_b": [0, 1, 0],
+            "col1_NaN": [0, 0, 1],
+            "col2_a": [0, 1, 0],
+            "col2_b": [0, 0, 0],
+            "col2_c": [0, 0, 1],
+            "col2_NaN": [1, 0, 0],
+        },
+    )
+    expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]})
+    result = from_dummies(dummies, sep="_")
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "default_category, expected",
+    [
+        pytest.param(
+            "x",
+            {"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]},
+            id="default_category is a str",
+        ),
+        pytest.param(
+            0,
+            {"col1": ["a", "b", 0], "col2": [0, "a", "c"]},
+            id="default_category is a 0",
+        ),
+        pytest.param(
+            False,
+            {"col1": ["a", "b", False], "col2": [False, "a", "c"]},
+            id="default_category is a False",
+        ),
+        pytest.param(
+            {"col2": 1, "col1": 2.5},
+            {"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]},
+            id="default_category is a dict with int and float values",
+        ),
+        pytest.param(
+            {"col2": None, "col1": False},
+            {"col1": ["a", "b", False], "col2": [None, "a", "c"]},
+            id="default_category is a dict with bool and None values",
+        ),
+        pytest.param(
+            {"col2": (1, 2), "col1": [1.25, False]},
+            {"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]},
+            id="default_category is a dict with list and tuple values",
+        ),
+    ],
+)
+def test_with_prefix_default_category(
+    dummies_with_unassigned, default_category, expected, using_infer_string
+):
+    result = from_dummies(
+        dummies_with_unassigned, sep="_", default_category=default_category
+    )
+    expected = DataFrame(expected)
+    if using_infer_string:
+        expected = expected.astype("str")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_ea_categories():
+    # GH 54300
+    df = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
+    df.columns = df.columns.astype("string[python]")
+    result = from_dummies(df)
+    expected = DataFrame({"": Series(list("abca"), dtype="string[python]")})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_ea_categories_with_sep():
+    # GH 54300
+    df = DataFrame(
+        {
+            "col1_a": [1, 0, 1],
+            "col1_b": [0, 1, 0],
+            "col2_a": [0, 1, 0],
+            "col2_b": [1, 0, 0],
+            "col2_c": [0, 0, 1],
+        }
+    )
+    df.columns = df.columns.astype("string[python]")
+    result = from_dummies(df, sep="_")
+    expected = DataFrame(
+        {
+            "col1": Series(list("aba"), dtype="string[python]"),
+            "col2": Series(list("bac"), dtype="string[python]"),
+        }
+    )
+    expected.columns = expected.columns.astype("string[python]")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_maintain_original_index():
+    # GH 54300
+    df = DataFrame(
+        {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]}, index=list("abcd")
+    )
+    result = from_dummies(df)
+    expected = DataFrame({"": list("abca")}, index=list("abcd"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_int_columns_with_float_default():
+    # https://github.com/pandas-dev/pandas/pull/60694
+    df = DataFrame(
+        {
+            3: [1, 0, 0],
+            4: [0, 1, 0],
+        },
+    )
+    with pytest.raises(ValueError, match="Trying to coerce float values to integers"):
+        from_dummies(df, default_category=0.5)
+
+
+def test_object_dtype_preserved():
+    # https://github.com/pandas-dev/pandas/pull/60694
+    # When the input has object dtype, the result should as
+    # well even when infer_string is True.
+    df = DataFrame(
+        {
+            "x": [1, 0, 0],
+            "y": [0, 1, 0],
+        },
+    )
+    df.columns = df.columns.astype("object")
+    result = from_dummies(df, default_category="z")
+    expected = DataFrame({"": ["x", "y", "z"]}, dtype="object")
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
new file mode 100644
index 0000000000000000000000000000000000000000..c776e7b2e3d9a4f6ab99042cf95a6996d1ba5b60
--- /dev/null
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -0,0 +1,741 @@
+import re
+import unicodedata
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_integer_dtype
+
+import pandas as pd
+from pandas import (
+    ArrowDtype,
+    Categorical,
+    CategoricalDtype,
+    CategoricalIndex,
+    DataFrame,
+    Index,
+    RangeIndex,
+    Series,
+    SparseDtype,
+    get_dummies,
+)
+import pandas._testing as tm
+from pandas.core.arrays.sparse import SparseArray
+
+try:
+    import pyarrow as pa
+except ImportError:
+    pa = None
+
+
+class TestGetDummies:
+    @pytest.fixture
+    def df(self):
+        return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})
+
+    @pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
+    def dtype(self, request):
+        return np.dtype(request.param)
+
+    @pytest.fixture(params=["dense", "sparse"])
+    def sparse(self, request):
+        # params are strings to simplify reading test results,
+        # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
+        return request.param == "sparse"
+
+    def effective_dtype(self, dtype):
+        if dtype is None:
+            return np.uint8
+        return dtype
+
+    def test_get_dummies_raises_on_dtype_object(self, df):
+        msg = "dtype=object is not a valid dtype for get_dummies"
+        with pytest.raises(ValueError, match=msg):
+            get_dummies(df, dtype="object")
+
+    def test_get_dummies_basic(self, sparse, dtype):
+        s_list = list("abc")
+        s_series = Series(s_list)
+        s_series_index = Series(s_list, list("ABC"))
+
+        expected = DataFrame(
+            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
+            dtype=self.effective_dtype(dtype),
+        )
+        if sparse:
+            if dtype.kind == "b":
+                expected = expected.apply(SparseArray, fill_value=False)
+            else:
+                expected = expected.apply(SparseArray, fill_value=0.0)
+        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        expected.index = list("ABC")
+        result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_basic_types(self, sparse, dtype, using_infer_string):
+        # GH 10531
+        s_list = list("abc")
+        s_series = Series(s_list)
+        s_df = DataFrame(
+            {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
+        )
+
+        expected = DataFrame(
+            {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
+            dtype=self.effective_dtype(dtype),
+            columns=list("abc"),
+        )
+        if sparse:
+            if is_integer_dtype(dtype):
+                fill_value = 0
+            elif dtype == bool:
+                fill_value = False
+            else:
+                fill_value = 0.0
+
+            expected = expected.apply(SparseArray, fill_value=fill_value)
+        result = get_dummies(s_list, sparse=sparse, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(s_series, sparse=sparse, dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
+        if sparse:
+            dtype_name = f"Sparse[{self.effective_dtype(dtype).name}, {fill_value}]"
+        else:
+            dtype_name = self.effective_dtype(dtype).name
+
+        expected = Series({dtype_name: 8}, name="count")
+        result = result.dtypes.value_counts()
+        result.index = [str(i) for i in result.index]
+        tm.assert_series_equal(result, expected)
+
+        result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)
+
+        key = "str" if using_infer_string else "object"
+        expected_counts = {"int64": 1, key: 1}
+        expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
+
+        expected = Series(expected_counts, name="count").sort_index()
+        result = result.dtypes.value_counts()
+        result.index = [str(i) for i in result.index]
+        result = result.sort_index()
+        tm.assert_series_equal(result, expected)
+
+    def test_get_dummies_just_na(self, sparse):
+        just_na_list = [np.nan]
+        just_na_series = Series(just_na_list)
+        just_na_series_index = Series(just_na_list, index=["A"])
+
+        res_list = get_dummies(just_na_list, sparse=sparse)
+        res_series = get_dummies(just_na_series, sparse=sparse)
+        res_series_index = get_dummies(just_na_series_index, sparse=sparse)
+
+        assert res_list.empty
+        assert res_series.empty
+        assert res_series_index.empty
+
+        assert res_list.index.tolist() == [0]
+        assert res_series.index.tolist() == [0]
+        assert res_series_index.index.tolist() == ["A"]
+
+    def test_get_dummies_include_na(self, sparse, dtype):
+        s = ["a", "b", np.nan]
+        res = get_dummies(s, sparse=sparse, dtype=dtype)
+        exp = DataFrame(
+            {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
+        )
+        if sparse:
+            if dtype.kind == "b":
+                exp = exp.apply(SparseArray, fill_value=False)
+            else:
+                exp = exp.apply(SparseArray, fill_value=0.0)
+        tm.assert_frame_equal(res, exp)
+
+        # Sparse dataframes do not allow nan labelled columns, see #GH8822
+        res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
+        exp_na = DataFrame(
+            {np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
+            dtype=self.effective_dtype(dtype),
+        )
+        exp_na = exp_na.reindex(["a", "b", np.nan], axis=1)
+        # hack (NaN handling in assert_index_equal)
+        exp_na.columns = res_na.columns
+        if sparse:
+            if dtype.kind == "b":
+                exp_na = exp_na.apply(SparseArray, fill_value=False)
+            else:
+                exp_na = exp_na.apply(SparseArray, fill_value=0.0)
+        tm.assert_frame_equal(res_na, exp_na)
+
+        res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
+        exp_just_na = DataFrame(
+            Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype)
+        )
+        tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
+
+    def test_get_dummies_unicode(self, sparse):
+        # See GH 6885 - get_dummies chokes on unicode values
+        e = "e"
+        eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
+        s = [e, eacute, eacute]
+        res = get_dummies(s, prefix="letter", sparse=sparse)
+        exp = DataFrame(
+            {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
+        )
+        if sparse:
+            exp = exp.apply(SparseArray, fill_value=False)
+        tm.assert_frame_equal(res, exp)
+
+    def test_dataframe_dummies_all_obj(self, df, sparse):
+        df = df[["A", "B"]]
+        result = get_dummies(df, sparse=sparse)
+        expected = DataFrame(
+            {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
+            dtype=bool,
+        )
+        if sparse:
+            expected = DataFrame(
+                {
+                    "A_a": SparseArray([1, 0, 1], dtype="bool"),
+                    "A_b": SparseArray([0, 1, 0], dtype="bool"),
+                    "B_b": SparseArray([1, 1, 0], dtype="bool"),
+                    "B_c": SparseArray([0, 0, 1], dtype="bool"),
+                }
+            )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
+        # GH44965
+        df = df[["A", "B"]]
+        df = df.astype({"A": "str", "B": any_string_dtype})
+        result = get_dummies(df)
+        expected = DataFrame(
+            {
+                "A_a": [1, 0, 1],
+                "A_b": [0, 1, 0],
+                "B_b": [1, 1, 0],
+                "B_c": [0, 0, 1],
+            },
+            dtype=bool,
+        )
+        if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
+            expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
+        result = get_dummies(df, sparse=sparse, dtype=dtype)
+        if sparse:
+            arr = SparseArray
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
+        else:
+            arr = np.array
+            typ = dtype
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3],
+                "A_a": arr([1, 0, 1], dtype=typ),
+                "A_b": arr([0, 1, 0], dtype=typ),
+                "B_b": arr([1, 1, 0], dtype=typ),
+                "B_c": arr([0, 0, 1], dtype=typ),
+            }
+        )
+        expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_prefix_list(self, df, sparse):
+        prefixes = ["from_A", "from_B"]
+        result = get_dummies(df, prefix=prefixes, sparse=sparse)
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3],
+                "from_A_a": [True, False, True],
+                "from_A_b": [False, True, False],
+                "from_B_b": [True, True, False],
+                "from_B_c": [False, False, True],
+            },
+        )
+        expected[["C"]] = df[["C"]]
+        cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
+        expected = expected[["C", *cols]]
+
+        typ = SparseArray if sparse else Series
+        expected[cols] = expected[cols].apply(lambda x: typ(x))
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_prefix_str(self, df, sparse):
+        # not that you should do this...
+        result = get_dummies(df, prefix="bad", sparse=sparse)
+        bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
+        expected = DataFrame(
+            [
+                [1, True, False, True, False],
+                [2, False, True, True, False],
+                [3, True, False, False, True],
+            ],
+            columns=["C", *bad_columns],
+        )
+        expected = expected.astype({"C": np.int64})
+        if sparse:
+            # work around astyping & assigning with duplicate columns
+            # https://github.com/pandas-dev/pandas/issues/14427
+            expected = pd.concat(
+                [
+                    Series([1, 2, 3], name="C"),
+                    Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
+                    Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
+                    Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
+                    Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
+                ],
+                axis=1,
+            )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_subset(self, df, sparse):
+        result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
+        expected = DataFrame(
+            {
+                "B": ["b", "b", "c"],
+                "C": [1, 2, 3],
+                "from_A_a": [1, 0, 1],
+                "from_A_b": [0, 1, 0],
+            },
+        )
+        cols = expected.columns
+        expected[cols[1:]] = expected[cols[1:]].astype(bool)
+        expected[["C"]] = df[["C"]]
+        if sparse:
+            cols = ["from_A_a", "from_A_b"]
+            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_prefix_sep(self, df, sparse):
+        result = get_dummies(df, prefix_sep="..", sparse=sparse)
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3],
+                "A..a": [True, False, True],
+                "A..b": [False, True, False],
+                "B..b": [True, True, False],
+                "B..c": [False, False, True],
+            },
+        )
+        expected[["C"]] = df[["C"]]
+        expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
+        if sparse:
+            cols = ["A..a", "A..b", "B..b", "B..c"]
+            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
+
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
+        expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
+        msg = re.escape(
+            "Length of 'prefix' (1) did not match the length of the columns being "
+            "encoded (2)"
+        )
+        with pytest.raises(ValueError, match=msg):
+            get_dummies(df, prefix=["too few"], sparse=sparse)
+
+    def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
+        msg = re.escape(
+            "Length of 'prefix_sep' (1) did not match the length of the columns being "
+            "encoded (2)"
+        )
+        with pytest.raises(ValueError, match=msg):
+            get_dummies(df, prefix_sep=["bad"], sparse=sparse)
+
+    def test_dataframe_dummies_prefix_dict(self, sparse):
+        prefixes = {"A": "from_A", "B": "from_B"}
+        df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
+        result = get_dummies(df, prefix=prefixes, sparse=sparse)
+
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3],
+                "from_A_a": [1, 0, 1],
+                "from_A_b": [0, 1, 0],
+                "from_B_b": [1, 1, 0],
+                "from_B_c": [0, 0, 1],
+            }
+        )
+
+        columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
+        expected[columns] = expected[columns].astype(bool)
+        if sparse:
+            expected[columns] = expected[columns].astype(SparseDtype("bool", False))
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_with_na(self, df, sparse, dtype):
+        df.loc[3, :] = [np.nan, np.nan, np.nan]
+        result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
+            axis=1
+        )
+
+        if sparse:
+            arr = SparseArray
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
+        else:
+            arr = np.array
+            typ = dtype
+
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3, np.nan],
+                "A_a": arr([1, 0, 1, 0], dtype=typ),
+                "A_b": arr([0, 1, 0, 0], dtype=typ),
+                "A_nan": arr([0, 0, 0, 1], dtype=typ),
+                "B_b": arr([1, 1, 0, 0], dtype=typ),
+                "B_c": arr([0, 0, 1, 0], dtype=typ),
+                "B_nan": arr([0, 0, 0, 1], dtype=typ),
+            }
+        ).sort_index(axis=1)
+
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
+        expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
+        df["cat"] = Categorical(["x", "y", "y"])
+        result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
+        if sparse:
+            arr = SparseArray
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
+        else:
+            arr = np.array
+            typ = dtype
+
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3],
+                "A_a": arr([1, 0, 1], dtype=typ),
+                "A_b": arr([0, 1, 0], dtype=typ),
+                "B_b": arr([1, 1, 0], dtype=typ),
+                "B_c": arr([0, 0, 1], dtype=typ),
+                "cat_x": arr([1, 0, 0], dtype=typ),
+                "cat_y": arr([0, 1, 1], dtype=typ),
+            }
+        ).sort_index(axis=1)
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "get_dummies_kwargs,expected",
+        [
+            (
+                {"data": DataFrame({"ä": ["a"]})},
+                "ä_a",
+            ),
+            (
+                {"data": DataFrame({"x": ["ä"]})},
+                "x_ä",
+            ),
+            (
+                {"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
+                "ä_a",
+            ),
+            (
+                {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
+                "xäa",
+            ),
+        ],
+    )
+    def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
+        # GH22084 get_dummies incorrectly encodes unicode characters
+        # in dataframe column names
+        result = get_dummies(**get_dummies_kwargs)
+        expected = DataFrame({expected: [True]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_basic_drop_first(self, sparse):
+        # GH12402 Add a new parameter `drop_first` to avoid collinearity
+        # Basic case
+        s_list = list("abc")
+        s_series = Series(s_list)
+        s_series_index = Series(s_list, list("ABC"))
+
+        expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
+
+        result = get_dummies(s_list, drop_first=True, sparse=sparse)
+        if sparse:
+            expected = expected.apply(SparseArray, fill_value=False)
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(s_series, drop_first=True, sparse=sparse)
+        tm.assert_frame_equal(result, expected)
+
+        expected.index = list("ABC")
+        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_basic_drop_first_one_level(self, sparse):
+        # Test the case that categorical variable only has one level.
+        s_list = list("aaa")
+        s_series = Series(s_list)
+        s_series_index = Series(s_list, list("ABC"))
+
+        expected = DataFrame(index=RangeIndex(3))
+
+        result = get_dummies(s_list, drop_first=True, sparse=sparse)
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(s_series, drop_first=True, sparse=sparse)
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame(index=list("ABC"))
+        result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_basic_drop_first_NA(self, sparse):
+        # Test NA handling together with drop_first
+        s_NA = ["a", "b", np.nan]
+        res = get_dummies(s_NA, drop_first=True, sparse=sparse)
+        exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
+        if sparse:
+            exp = exp.apply(SparseArray, fill_value=False)
+
+        tm.assert_frame_equal(res, exp)
+
+        res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
+        exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
+            ["b", np.nan], axis=1
+        )
+        if sparse:
+            exp_na = exp_na.apply(SparseArray, fill_value=False)
+        tm.assert_frame_equal(res_na, exp_na)
+
+        res_just_na = get_dummies(
+            [np.nan], dummy_na=True, drop_first=True, sparse=sparse
+        )
+        exp_just_na = DataFrame(index=RangeIndex(1))
+        tm.assert_frame_equal(res_just_na, exp_just_na)
+
+    def test_dataframe_dummies_drop_first(self, df, sparse):
+        df = df[["A", "B"]]
+        result = get_dummies(df, drop_first=True, sparse=sparse)
+        expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
+        if sparse:
+            expected = expected.apply(SparseArray, fill_value=False)
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
+        df["cat"] = Categorical(["x", "y", "y"])
+        result = get_dummies(df, drop_first=True, sparse=sparse)
+        expected = DataFrame(
+            {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
+        )
+        cols = ["A_b", "B_c", "cat_y"]
+        expected[cols] = expected[cols].astype(bool)
+        expected = expected[["C", "A_b", "B_c", "cat_y"]]
+        if sparse:
+            for col in cols:
+                expected[col] = SparseArray(expected[col])
+        tm.assert_frame_equal(result, expected)
+
+    def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
+        df.loc[3, :] = [np.nan, np.nan, np.nan]
+        result = get_dummies(
+            df, dummy_na=True, drop_first=True, sparse=sparse
+        ).sort_index(axis=1)
+        expected = DataFrame(
+            {
+                "C": [1, 2, 3, np.nan],
+                "A_b": [0, 1, 0, 0],
+                "A_nan": [0, 0, 0, 1],
+                "B_c": [0, 0, 1, 0],
+                "B_nan": [0, 0, 0, 1],
+            }
+        )
+        cols = ["A_b", "A_nan", "B_c", "B_nan"]
+        expected[cols] = expected[cols].astype(bool)
+        expected = expected.sort_index(axis=1)
+        if sparse:
+            for col in cols:
+                expected[col] = SparseArray(expected[col])
+
+        tm.assert_frame_equal(result, expected)
+
+        result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
+        expected = expected[["C", "A_b", "B_c"]]
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_int_int(self):
+        data = Series([1, 2, 1])
+        result = get_dummies(data)
+        expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
+        tm.assert_frame_equal(result, expected)
+
+        data = Series(Categorical(["a", "b", "a"]))
+        result = get_dummies(data)
+        expected = DataFrame(
+            [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_int_df(self, dtype):
+        data = DataFrame(
+            {
+                "A": [1, 2, 1],
+                "B": Categorical(["a", "b", "a"]),
+                "C": [1, 2, 1],
+                "D": [1.0, 2.0, 1.0],
+            }
+        )
+        columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
+        expected = DataFrame(
+            [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
+            columns=columns,
+        )
+        expected[columns[2:]] = expected[columns[2:]].astype(dtype)
+        result = get_dummies(data, columns=["A", "B"], dtype=dtype)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("ordered", [True, False])
+    def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
+        # GH13854
+        cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
+        result = get_dummies(cat, dtype=dtype)
+
+        data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
+        cols = CategoricalIndex(
+            cat.categories, categories=cat.categories, ordered=ordered
+        )
+        expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("sparse", [True, False])
+    def test_get_dummies_dont_sparsify_all_columns(self, sparse):
+        # GH18914
+        df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
+        df = get_dummies(df, columns=["Nation"], sparse=sparse)
+        df2 = df.reindex(columns=["GDP"])
+
+        tm.assert_frame_equal(df[["GDP"]], df2)
+
+    def test_get_dummies_duplicate_columns(self, df):
+        # GH20839
+        df.columns = ["A", "A", "A"]
+        result = get_dummies(df).sort_index(axis=1)
+
+        expected = DataFrame(
+            [
+                [1, True, False, True, False],
+                [2, False, True, True, False],
+                [3, True, False, False, True],
+            ],
+            columns=["A", "A_a", "A_b", "A_b", "A_c"],
+        ).sort_index(axis=1)
+
+        expected = expected.astype({"A": np.int64})
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_all_sparse(self):
+        df = DataFrame({"A": [1, 2]})
+        result = get_dummies(df, columns=["A"], sparse=True)
+        dtype = SparseDtype("bool", False)
+        expected = DataFrame(
+            {
+                "A_1": SparseArray([1, 0], dtype=dtype),
+                "A_2": SparseArray([0, 1], dtype=dtype),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("values", ["baz"])
+    def test_get_dummies_with_string_values(self, values):
+        # issue #28383
+        df = DataFrame(
+            {
+                "bar": [1, 2, 3, 4, 5, 6],
+                "foo": ["one", "one", "one", "two", "two", "two"],
+                "baz": ["A", "B", "C", "A", "B", "C"],
+                "zoo": ["x", "y", "z", "q", "w", "t"],
+            }
+        )
+
+        msg = "Input must be a list-like for parameter `columns`"
+
+        with pytest.raises(TypeError, match=msg):
+            get_dummies(df, columns=values)
+
+    def test_get_dummies_ea_dtype_series(self, any_numeric_ea_and_arrow_dtype):
+        # GH#32430
+        ser = Series(list("abca"))
+        result = get_dummies(ser, dtype=any_numeric_ea_and_arrow_dtype)
+        expected = DataFrame(
+            {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
+            dtype=any_numeric_ea_and_arrow_dtype,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
+        # GH#32430
+        df = DataFrame({"x": list("abca")})
+        result = get_dummies(df, dtype=any_numeric_ea_and_arrow_dtype)
+        expected = DataFrame(
+            {"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
+            dtype=any_numeric_ea_and_arrow_dtype,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype_type", ["string", "category"])
+    def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object):
+        # GH#56273
+        dtype = string_dtype_no_object
+        exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool"
+        if dtype_type == "category":
+            dtype = CategoricalDtype(Index(["a"], dtype))
+        df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1})
+        result = get_dummies(df)
+        expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)})
+        tm.assert_frame_equal(result, expected)
+
+    @td.skip_if_no("pyarrow")
+    def test_get_dummies_arrow_dtype(self):
+        # GH#56273
+        df = DataFrame({"name": Series(["a"], dtype=ArrowDtype(pa.string())), "x": 1})
+        result = get_dummies(df)
+        expected = DataFrame({"x": 1, "name_a": Series([True], dtype="bool[pyarrow]")})
+        tm.assert_frame_equal(result, expected)
+
+        df = DataFrame(
+            {
+                "name": Series(
+                    ["a"],
+                    dtype=CategoricalDtype(Index(["a"], dtype=ArrowDtype(pa.string()))),
+                ),
+                "x": 1,
+            }
+        )
+        result = get_dummies(df)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba9c28282e9491ce5f237e4d92068a4320953ca
--- /dev/null
+++ b/pandas/tests/reshape/test_melt.py
@@ -0,0 +1,1280 @@
+import re
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    date_range,
+    lreshape,
+    melt,
+    wide_to_long,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture
+def df():
+    res = DataFrame(
+        np.random.default_rng(2).standard_normal((10, 4)),
+        columns=Index(list("ABCD")),
+        index=date_range("2000-01-01", periods=10, freq="B"),
+    )
+    res["id1"] = (res["A"] > 0).astype(np.int64)
+    res["id2"] = (res["B"] > 0).astype(np.int64)
+    return res
+
+
+@pytest.fixture
+def df1():
+    res = DataFrame(
+        [
+            [1.067683, -1.110463, 0.20867],
+            [-1.321405, 0.368915, -1.055342],
+            [-0.807333, 0.08298, -0.873361],
+        ]
+    )
+    res.columns = [list("ABC"), list("abc")]
+    res.columns.names = ["CAP", "low"]
+    return res
+
+
+@pytest.fixture
+def var_name():
+    return "var"
+
+
+@pytest.fixture
+def value_name():
+    return "val"
+
+
+class TestMelt:
+    def test_top_level_method(self, df):
+        result = melt(df)
+        assert result.columns.tolist() == ["variable", "value"]
+
+    def test_method_signatures(self, df, df1, var_name, value_name):
+        tm.assert_frame_equal(df.melt(), melt(df))
+
+        tm.assert_frame_equal(
+            df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]),
+            melt(df, id_vars=["id1", "id2"], value_vars=["A", "B"]),
+        )
+
+        tm.assert_frame_equal(
+            df.melt(var_name=var_name, value_name=value_name),
+            melt(df, var_name=var_name, value_name=value_name),
+        )
+
+        tm.assert_frame_equal(df1.melt(col_level=0), melt(df1, col_level=0))
+
+    def test_default_col_names(self, df):
+        result = df.melt()
+        assert result.columns.tolist() == ["variable", "value"]
+
+        result1 = df.melt(id_vars=["id1"])
+        assert result1.columns.tolist() == ["id1", "variable", "value"]
+
+        result2 = df.melt(id_vars=["id1", "id2"])
+        assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
+
+    def test_value_vars(self, df):
+        result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
+        assert len(result3) == 10
+
+        result4 = df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"])
+        expected4 = DataFrame(
+            {
+                "id1": df["id1"].tolist() * 2,
+                "id2": df["id2"].tolist() * 2,
+                "variable": ["A"] * 10 + ["B"] * 10,
+                "value": (df["A"].tolist() + df["B"].tolist()),
+            },
+            columns=["id1", "id2", "variable", "value"],
+        )
+        tm.assert_frame_equal(result4, expected4)
+
+    @pytest.mark.parametrize("type_", (tuple, list, np.array))
+    def test_value_vars_types(self, type_, df):
+        # GH 15348
+        expected = DataFrame(
+            {
+                "id1": df["id1"].tolist() * 2,
+                "id2": df["id2"].tolist() * 2,
+                "variable": ["A"] * 10 + ["B"] * 10,
+                "value": (df["A"].tolist() + df["B"].tolist()),
+            },
+            columns=["id1", "id2", "variable", "value"],
+        )
+        result = df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B")))
+        tm.assert_frame_equal(result, expected)
+
+    def test_vars_work_with_multiindex(self, df1):
+        expected = DataFrame(
+            {
+                ("A", "a"): df1[("A", "a")],
+                "CAP": ["B"] * len(df1),
+                "low": ["b"] * len(df1),
+                "value": df1[("B", "b")],
+            },
+            columns=[("A", "a"), "CAP", "low", "value"],
+        )
+
+        result = df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "id_vars, value_vars, col_level, expected",
+        [
+            (
+                ["A"],
+                ["B"],
+                0,
+                {
+                    "A": {0: 1.067683, 1: -1.321405, 2: -0.807333},
+                    "CAP": {0: "B", 1: "B", 2: "B"},
+                    "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
+                },
+            ),
+            (
+                ["a"],
+                ["b"],
+                1,
+                {
+                    "a": {0: 1.067683, 1: -1.321405, 2: -0.807333},
+                    "low": {0: "b", 1: "b", 2: "b"},
+                    "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
+                },
+            ),
+        ],
+    )
+    def test_single_vars_work_with_multiindex(
+        self, id_vars, value_vars, col_level, expected, df1
+    ):
+        result = df1.melt(id_vars, value_vars, col_level=col_level)
+        expected = DataFrame(expected)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "id_vars, value_vars",
+        [
+            [("A", "a"), [("B", "b")]],
+            [[("A", "a")], ("B", "b")],
+            [("A", "a"), ("B", "b")],
+        ],
+    )
+    def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1):
+        # melt should fail with an informative error message if
+        # the columns have a MultiIndex and a tuple is passed
+        # for id_vars or value_vars.
+        msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex"
+        with pytest.raises(ValueError, match=msg):
+            df1.melt(id_vars=id_vars, value_vars=value_vars)
+
+    def test_custom_var_name(self, df, var_name):
+        result5 = df.melt(var_name=var_name)
+        assert result5.columns.tolist() == ["var", "value"]
+
+        result6 = df.melt(id_vars=["id1"], var_name=var_name)
+        assert result6.columns.tolist() == ["id1", "var", "value"]
+
+        result7 = df.melt(id_vars=["id1", "id2"], var_name=var_name)
+        assert result7.columns.tolist() == ["id1", "id2", "var", "value"]
+
+        result8 = df.melt(id_vars=["id1", "id2"], value_vars="A", var_name=var_name)
+        assert result8.columns.tolist() == ["id1", "id2", "var", "value"]
+
+        result9 = df.melt(
+            id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=var_name
+        )
+        expected9 = DataFrame(
+            {
+                "id1": df["id1"].tolist() * 2,
+                "id2": df["id2"].tolist() * 2,
+                var_name: ["A"] * 10 + ["B"] * 10,
+                "value": (df["A"].tolist() + df["B"].tolist()),
+            },
+            columns=["id1", "id2", var_name, "value"],
+        )
+        tm.assert_frame_equal(result9, expected9)
+
+    def test_custom_value_name(self, df, value_name):
+        result10 = df.melt(value_name=value_name)
+        assert result10.columns.tolist() == ["variable", "val"]
+
+        result11 = df.melt(id_vars=["id1"], value_name=value_name)
+        assert result11.columns.tolist() == ["id1", "variable", "val"]
+
+        result12 = df.melt(id_vars=["id1", "id2"], value_name=value_name)
+        assert result12.columns.tolist() == ["id1", "id2", "variable", "val"]
+
+        result13 = df.melt(
+            id_vars=["id1", "id2"], value_vars="A", value_name=value_name
+        )
+        assert result13.columns.tolist() == ["id1", "id2", "variable", "val"]
+
+        result14 = df.melt(
+            id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=value_name
+        )
+        expected14 = DataFrame(
+            {
+                "id1": df["id1"].tolist() * 2,
+                "id2": df["id2"].tolist() * 2,
+                "variable": ["A"] * 10 + ["B"] * 10,
+                value_name: (df["A"].tolist() + df["B"].tolist()),
+            },
+            columns=["id1", "id2", "variable", value_name],
+        )
+        tm.assert_frame_equal(result14, expected14)
+
+    def test_custom_var_and_value_name(self, df, value_name, var_name):
+        result15 = df.melt(var_name=var_name, value_name=value_name)
+        assert result15.columns.tolist() == ["var", "val"]
+
+        result16 = df.melt(id_vars=["id1"], var_name=var_name, value_name=value_name)
+        assert result16.columns.tolist() == ["id1", "var", "val"]
+
+        result17 = df.melt(
+            id_vars=["id1", "id2"], var_name=var_name, value_name=value_name
+        )
+        assert result17.columns.tolist() == ["id1", "id2", "var", "val"]
+
+        result18 = df.melt(
+            id_vars=["id1", "id2"],
+            value_vars="A",
+            var_name=var_name,
+            value_name=value_name,
+        )
+        assert result18.columns.tolist() == ["id1", "id2", "var", "val"]
+
+        result19 = df.melt(
+            id_vars=["id1", "id2"],
+            value_vars=["A", "B"],
+            var_name=var_name,
+            value_name=value_name,
+        )
+        expected19 = DataFrame(
+            {
+                "id1": df["id1"].tolist() * 2,
+                "id2": df["id2"].tolist() * 2,
+                var_name: ["A"] * 10 + ["B"] * 10,
+                value_name: (df["A"].tolist() + df["B"].tolist()),
+            },
+            columns=["id1", "id2", var_name, value_name],
+        )
+        tm.assert_frame_equal(result19, expected19)
+
+        df20 = df.copy()
+        df20.columns.name = "foo"
+        result20 = df20.melt()
+        assert result20.columns.tolist() == ["foo", "value"]
+
+    @pytest.mark.parametrize("col_level", [0, "CAP"])
+    def test_col_level(self, col_level, df1):
+        res = df1.melt(col_level=col_level)
+        assert res.columns.tolist() == ["CAP", "value"]
+
+    def test_multiindex(self, df1):
+        res = df1.melt()
+        assert res.columns.tolist() == ["CAP", "low", "value"]
+
+    @pytest.mark.parametrize(
+        "col",
+        [
+            date_range("2010", periods=5, tz="US/Pacific"),
+            pd.Categorical(["a", "b", "c", "a", "d"]),
+            [0, 1, 0, 0, 0],
+        ],
+    )
+    def test_pandas_dtypes(self, col):
+        # GH 15785
+        col = pd.Series(col)
+        df = DataFrame(
+            {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col}
+        )
+        expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True)
+        result = melt(
+            df, id_vars=["klass", "col"], var_name="attribute", value_name="value"
+        )
+        expected = DataFrame(
+            {
+                0: list(range(5)) * 2,
+                1: pd.concat([col] * 2, ignore_index=True),
+                2: ["attr1"] * 5 + ["attr2"] * 5,
+                3: expected_value,
+            }
+        )
+        expected.columns = ["klass", "col", "attribute", "value"]
+        tm.assert_frame_equal(result, expected)
+
+    def test_preserve_category(self):
+        # GH 15853
+        data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])})
+        result = melt(data, ["B"], ["A"])
+        expected = DataFrame(
+            {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]}
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_missing_columns_raises(self):
+        # GH-23575
+        # This test is to ensure that pandas raises an error if melting is
+        # attempted with column names absent from the dataframe
+
+        # Generate data
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((5, 4)), columns=list("abcd")
+        )
+
+        # Try to melt with missing `value_vars` column name
+        msg = "The following id_vars or value_vars are not present in the DataFrame:"
+        with pytest.raises(KeyError, match=msg):
+            df.melt(["a", "b"], ["C", "d"])
+
+        # Try to melt with missing `id_vars` column name
+        with pytest.raises(KeyError, match=msg):
+            df.melt(["A", "b"], ["c", "d"])
+
+        # Multiple missing
+        with pytest.raises(
+            KeyError,
+            match=msg,
+        ):
+            df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])
+
+        # Multiindex melt fails if column is missing from multilevel melt
+        df.columns = [list("ABCD"), list("abcd")]
+        with pytest.raises(KeyError, match=msg):
+            df.melt([("E", "a")], [("B", "b")])
+        # Multiindex fails if column is missing from single level melt
+        with pytest.raises(KeyError, match=msg):
+            df.melt(["A"], ["F"], col_level=0)
+
+    def test_melt_mixed_int_str_id_vars(self):
+        # GH 29718
+        df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
+        result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"])
+        expected = DataFrame(
+            {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]}
+        )
+        # the df's columns are mixed type and thus object -> preserves object dtype
+        expected["variable"] = expected["variable"].astype(object)
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_mixed_int_str_value_vars(self):
+        # GH 29718
+        df = DataFrame({0: ["foo"], "a": ["bar"]})
+        result = melt(df, value_vars=[0, "a"])
+        expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_ignore_index(self):
+        # GH 17440
+        df = DataFrame({"foo": [0], "bar": [1]}, index=["first"])
+        result = melt(df, ignore_index=False)
+        expected = DataFrame(
+            {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"]
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_ignore_multiindex(self):
+        # GH 17440
+        index = pd.MultiIndex.from_tuples(
+            [("first", "second"), ("first", "third")], names=["baz", "foobar"]
+        )
+        df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index)
+        result = melt(df, ignore_index=False)
+
+        expected_index = pd.MultiIndex.from_tuples(
+            [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"]
+        )
+        expected = DataFrame(
+            {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]},
+            index=expected_index,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_ignore_index_name_and_type(self):
+        # GH 17440
+        index = Index(["foo", "bar"], dtype="category", name="baz")
+        df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index)
+        result = melt(df, ignore_index=False)
+
+        expected_index = Index(["foo", "bar"] * 2, dtype="category", name="baz")
+        expected = DataFrame(
+            {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]},
+            index=expected_index,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_with_duplicate_columns(self):
+        # GH#41951
+        df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"])
+        result = df.melt(id_vars=["a"], value_vars=["b"])
+        expected = DataFrame(
+            [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"]
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["Int8", "Int64"])
+    def test_melt_ea_dtype(self, dtype):
+        # GH#41570
+        df = DataFrame(
+            {
+                "a": pd.Series([1, 2], dtype="Int8"),
+                "b": pd.Series([3, 4], dtype=dtype),
+            }
+        )
+        result = df.melt()
+        expected = DataFrame(
+            {
+                "variable": ["a", "a", "b", "b"],
+                "value": pd.Series([1, 2, 3, 4], dtype=dtype),
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_ea_columns(self):
+        # GH 54297
+        df = DataFrame(
+            {
+                "A": {0: "a", 1: "b", 2: "c"},
+                "B": {0: 1, 1: 3, 2: 5},
+                "C": {0: 2, 1: 4, 2: 6},
+            }
+        )
+        df.columns = df.columns.astype("string[python]")
+        result = df.melt(id_vars=["A"], value_vars=["B"])
+        expected = DataFrame(
+            {
+                "A": list("abc"),
+                "variable": pd.Series(["B"] * 3, dtype="string[python]"),
+                "value": [1, 3, 5],
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_preserves_datetime(self):
+        df = DataFrame(
+            data=[
+                {
+                    "type": "A0",
+                    "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
+                    "end_date": pd.Timestamp("2023/03/10", tz="Asia/Tokyo"),
+                },
+                {
+                    "type": "A1",
+                    "start_date": pd.Timestamp("2023/03/01", tz="Asia/Tokyo"),
+                    "end_date": pd.Timestamp("2023/03/11", tz="Asia/Tokyo"),
+                },
+            ],
+            index=["aaaa", "bbbb"],
+        )
+        result = df.melt(
+            id_vars=["type"],
+            value_vars=["start_date", "end_date"],
+            var_name="start/end",
+            value_name="date",
+        )
+        expected = DataFrame(
+            {
+                "type": {0: "A0", 1: "A1", 2: "A0", 3: "A1"},
+                "start/end": {
+                    0: "start_date",
+                    1: "start_date",
+                    2: "end_date",
+                    3: "end_date",
+                },
+                "date": {
+                    0: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
+                    1: pd.Timestamp("2023-03-01 00:00:00+0900", tz="Asia/Tokyo"),
+                    2: pd.Timestamp("2023-03-10 00:00:00+0900", tz="Asia/Tokyo"),
+                    3: pd.Timestamp("2023-03-11 00:00:00+0900", tz="Asia/Tokyo"),
+                },
+            }
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_allows_non_scalar_id_vars(self):
+        df = DataFrame(
+            data={"a": [1, 2, 3], "b": [4, 5, 6]},
+            index=["11", "22", "33"],
+        )
+        result = df.melt(
+            id_vars="a",
+            var_name=0,
+            value_name=1,
+        )
+        expected = DataFrame({"a": [1, 2, 3], 0: ["b"] * 3, 1: [4, 5, 6]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_allows_non_string_var_name(self):
+        df = DataFrame(
+            data={"a": [1, 2, 3], "b": [4, 5, 6]},
+            index=["11", "22", "33"],
+        )
+        result = df.melt(
+            id_vars=["a"],
+            var_name=0,
+            value_name=1,
+        )
+        expected = DataFrame({"a": [1, 2, 3], 0: ["b"] * 3, 1: [4, 5, 6]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_melt_non_scalar_var_name_raises(self):
+        df = DataFrame(
+            data={"a": [1, 2, 3], "b": [4, 5, 6]},
+            index=["11", "22", "33"],
+        )
+        with pytest.raises(ValueError, match=r".* must be a scalar."):
+            df.melt(id_vars=["a"], var_name=[1, 2])
+
+    def test_melt_multiindex_columns_var_name(self):
+        # GH 58033
+        df = DataFrame({("A", "a"): [1], ("A", "b"): [2]})
+
+        expected = DataFrame(
+            [("A", "a", 1), ("A", "b", 2)], columns=["first", "second", "value"]
+        )
+
+        tm.assert_frame_equal(df.melt(var_name=["first", "second"]), expected)
+        tm.assert_frame_equal(df.melt(var_name=["first"]), expected[["first", "value"]])
+
+    def test_melt_multiindex_columns_var_name_too_many(self):
+        # GH 58033
+        df = DataFrame({("A", "a"): [1], ("A", "b"): [2]})
+
+        with pytest.raises(
+            ValueError, match="but the dataframe columns only have 2 levels"
+        ):
+            df.melt(var_name=["first", "second", "third"])
+
+    def test_melt_duplicate_column_header_raises(self):
+        # GH61475
+        df = DataFrame([[1, 2, 3], [3, 4, 5]], columns=["A", "A", "B"])
+        msg = "id_vars cannot contain duplicate columns."
+
+        with pytest.raises(ValueError, match=msg):
+            df.melt(id_vars=["A"], value_vars=["B"])
+
+
+class TestLreshape:
+    def test_pairs(self):
+        data = {
+            "birthdt": [
+                "08jan2009",
+                "20dec2008",
+                "30dec2008",
+                "21dec2008",
+                "11jan2009",
+            ],
+            "birthwt": [1766, 3301, 1454, 3139, 4133],
+            "id": [101, 102, 103, 104, 105],
+            "sex": ["Male", "Female", "Female", "Female", "Female"],
+            "visitdt1": [
+                "11jan2009",
+                "22dec2008",
+                "04jan2009",
+                "29dec2008",
+                "20jan2009",
+            ],
+            "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"],
+            "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
+            "wt1": [1823, 3338, 1549, 3298, 4306],
+            "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0],
+            "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0],
+        }
+
+        df = DataFrame(data)
+
+        spec = {
+            "visitdt": [f"visitdt{i:d}" for i in range(1, 4)],
+            "wt": [f"wt{i:d}" for i in range(1, 4)],
+        }
+        result = lreshape(df, spec)
+
+        exp_data = {
+            "birthdt": [
+                "08jan2009",
+                "20dec2008",
+                "30dec2008",
+                "21dec2008",
+                "11jan2009",
+                "08jan2009",
+                "30dec2008",
+                "21dec2008",
+                "11jan2009",
+                "08jan2009",
+                "21dec2008",
+                "11jan2009",
+            ],
+            "birthwt": [
+                1766,
+                3301,
+                1454,
+                3139,
+                4133,
+                1766,
+                1454,
+                3139,
+                4133,
+                1766,
+                3139,
+                4133,
+            ],
+            "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
+            "sex": [
+                "Male",
+                "Female",
+                "Female",
+                "Female",
+                "Female",
+                "Male",
+                "Female",
+                "Female",
+                "Female",
+                "Male",
+                "Female",
+                "Female",
+            ],
+            "visitdt": [
+                "11jan2009",
+                "22dec2008",
+                "04jan2009",
+                "29dec2008",
+                "20jan2009",
+                "21jan2009",
+                "22jan2009",
+                "31dec2008",
+                "03feb2009",
+                "05feb2009",
+                "02jan2009",
+                "15feb2009",
+            ],
+            "wt": [
+                1823.0,
+                3338.0,
+                1549.0,
+                3298.0,
+                4306.0,
+                2011.0,
+                1892.0,
+                3338.0,
+                4575.0,
+                2293.0,
+                3377.0,
+                4805.0,
+            ],
+        }
+        exp = DataFrame(exp_data, columns=result.columns)
+        tm.assert_frame_equal(result, exp)
+
+        result = lreshape(df, spec, dropna=False)
+        exp_data = {
+            "birthdt": [
+                "08jan2009",
+                "20dec2008",
+                "30dec2008",
+                "21dec2008",
+                "11jan2009",
+                "08jan2009",
+                "20dec2008",
+                "30dec2008",
+                "21dec2008",
+                "11jan2009",
+                "08jan2009",
+                "20dec2008",
+                "30dec2008",
+                "21dec2008",
+                "11jan2009",
+            ],
+            "birthwt": [
+                1766,
+                3301,
+                1454,
+                3139,
+                4133,
+                1766,
+                3301,
+                1454,
+                3139,
+                4133,
+                1766,
+                3301,
+                1454,
+                3139,
+                4133,
+            ],
+            "id": [
+                101,
+                102,
+                103,
+                104,
+                105,
+                101,
+                102,
+                103,
+                104,
+                105,
+                101,
+                102,
+                103,
+                104,
+                105,
+            ],
+            "sex": [
+                "Male",
+                "Female",
+                "Female",
+                "Female",
+                "Female",
+                "Male",
+                "Female",
+                "Female",
+                "Female",
+                "Female",
+                "Male",
+                "Female",
+                "Female",
+                "Female",
+                "Female",
+            ],
+            "visitdt": [
+                "11jan2009",
+                "22dec2008",
+                "04jan2009",
+                "29dec2008",
+                "20jan2009",
+                "21jan2009",
+                np.nan,
+                "22jan2009",
+                "31dec2008",
+                "03feb2009",
+                "05feb2009",
+                np.nan,
+                np.nan,
+                "02jan2009",
+                "15feb2009",
+            ],
+            "wt": [
+                1823.0,
+                3338.0,
+                1549.0,
+                3298.0,
+                4306.0,
+                2011.0,
+                np.nan,
+                1892.0,
+                3338.0,
+                4575.0,
+                2293.0,
+                np.nan,
+                np.nan,
+                3377.0,
+                4805.0,
+            ],
+        }
+        exp = DataFrame(exp_data, columns=result.columns)
+        tm.assert_frame_equal(result, exp)
+
+        spec = {
+            "visitdt": [f"visitdt{i:d}" for i in range(1, 3)],
+            "wt": [f"wt{i:d}" for i in range(1, 4)],
+        }
+        msg = "All column lists must be same length"
+        with pytest.raises(ValueError, match=msg):
+            lreshape(df, spec)
+
+
+class TestWideToLong:
+    def test_simple(self):
+        x = np.random.default_rng(2).standard_normal(3)
+        df = DataFrame(
+            {
+                "A1970": {0: "a", 1: "b", 2: "c"},
+                "A1980": {0: "d", 1: "e", 2: "f"},
+                "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
+                "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
+                "X": dict(zip(range(3), x)),
+            }
+        )
+        df["id"] = df.index
+        exp_data = {
+            "X": x.tolist() + x.tolist(),
+            "A": ["a", "b", "c", "d", "e", "f"],
+            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+            "year": [1970, 1970, 1970, 1980, 1980, 1980],
+            "id": [0, 1, 2, 0, 1, 2],
+        }
+        expected = DataFrame(exp_data)
+        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
+        result = wide_to_long(df, ["A", "B"], i="id", j="year")
+        tm.assert_frame_equal(result, expected)
+
+    def test_stubs(self):
+        # GH9204 wide_to_long call should not modify 'stubs' list
+        df = DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
+        df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
+        stubs = ["inc", "edu"]
+
+        wide_to_long(df, stubs, i="id", j="age")
+
+        assert stubs == ["inc", "edu"]
+
+    def test_separating_character(self):
+        # GH14779
+
+        x = np.random.default_rng(2).standard_normal(3)
+        df = DataFrame(
+            {
+                "A.1970": {0: "a", 1: "b", 2: "c"},
+                "A.1980": {0: "d", 1: "e", 2: "f"},
+                "B.1970": {0: 2.5, 1: 1.2, 2: 0.7},
+                "B.1980": {0: 3.2, 1: 1.3, 2: 0.1},
+                "X": dict(zip(range(3), x)),
+            }
+        )
+        df["id"] = df.index
+        exp_data = {
+            "X": x.tolist() + x.tolist(),
+            "A": ["a", "b", "c", "d", "e", "f"],
+            "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+            "year": [1970, 1970, 1970, 1980, 1980, 1980],
+            "id": [0, 1, 2, 0, 1, 2],
+        }
+        expected = DataFrame(exp_data)
+        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
+        result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
+        tm.assert_frame_equal(result, expected)
+
+    def test_escapable_characters(self):
+        x = np.random.default_rng(2).standard_normal(3)
+        df = DataFrame(
+            {
+                "A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
+                "A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
+                "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7},
+                "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1},
+                "X": dict(zip(range(3), x)),
+            }
+        )
+        df["id"] = df.index
+        exp_data = {
+            "X": x.tolist() + x.tolist(),
+            "A(quarterly)": ["a", "b", "c", "d", "e", "f"],
+            "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
+            "year": [1970, 1970, 1970, 1980, 1980, 1980],
+            "id": [0, 1, 2, 0, 1, 2],
+        }
+        expected = DataFrame(exp_data)
+        expected = expected.set_index(["id", "year"])[
+            ["X", "A(quarterly)", "B(quarterly)"]
+        ]
+        result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year")
+        tm.assert_frame_equal(result, expected)
+
+    def test_unbalanced(self):
+        # test that we can have a varying amount of time variables
+        df = DataFrame(
+            {
+                "A2010": [1.0, 2.0],
+                "A2011": [3.0, 4.0],
+                "B2010": [5.0, 6.0],
+                "X": ["X1", "X2"],
+            }
+        )
+        df["id"] = df.index
+        exp_data = {
+            "X": ["X1", "X2", "X1", "X2"],
+            "A": [1.0, 2.0, 3.0, 4.0],
+            "B": [5.0, 6.0, np.nan, np.nan],
+            "id": [0, 1, 0, 1],
+            "year": [2010, 2010, 2011, 2011],
+        }
+        expected = DataFrame(exp_data)
+        expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
+        result = wide_to_long(df, ["A", "B"], i="id", j="year")
+        tm.assert_frame_equal(result, expected)
+
+    def test_character_overlap(self):
+        # Test we handle overlapping characters in both id_vars and value_vars
+        df = DataFrame(
+            {
+                "A11": ["a11", "a22", "a33"],
+                "A12": ["a21", "a22", "a23"],
+                "B11": ["b11", "b12", "b13"],
+                "B12": ["b21", "b22", "b23"],
+                "BB11": [1, 2, 3],
+                "BB12": [4, 5, 6],
+                "BBBX": [91, 92, 93],
+                "BBBZ": [91, 92, 93],
+            }
+        )
+        df["id"] = df.index
+        expected = DataFrame(
+            {
+                "BBBX": [91, 92, 93, 91, 92, 93],
+                "BBBZ": [91, 92, 93, 91, 92, 93],
+                "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
+                "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
+                "BB": [1, 2, 3, 4, 5, 6],
+                "id": [0, 1, 2, 0, 1, 2],
+                "year": [11, 11, 11, 12, 12, 12],
+            }
+        )
+        expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]]
+        result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
+        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
+
+    def test_invalid_separator(self):
+        # if an invalid separator is supplied an empty data frame is returned
+        sep = "nope!"
+        df = DataFrame(
+            {
+                "A2010": [1.0, 2.0],
+                "A2011": [3.0, 4.0],
+                "B2010": [5.0, 6.0],
+                "X": ["X1", "X2"],
+            }
+        )
+        df["id"] = df.index
+        exp_data = {
+            "X": "",
+            "A2010": [],
+            "A2011": [],
+            "B2010": [],
+            "id": [],
+            "year": [],
+            "A": [],
+            "B": [],
+        }
+        expected = DataFrame(exp_data).astype({"year": np.int64})
+        expected = expected.set_index(["id", "year"])[
+            ["X", "A2010", "A2011", "B2010", "A", "B"]
+        ]
+        expected.index = expected.index.set_levels([0, 1], level=0)
+        result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
+        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
+
+    def test_num_string_disambiguation(self):
+        # Test that we can disambiguate number value_vars from
+        # string value_vars
+        df = DataFrame(
+            {
+                "A11": ["a11", "a22", "a33"],
+                "A12": ["a21", "a22", "a23"],
+                "B11": ["b11", "b12", "b13"],
+                "B12": ["b21", "b22", "b23"],
+                "BB11": [1, 2, 3],
+                "BB12": [4, 5, 6],
+                "Arating": [91, 92, 93],
+                "Arating_old": [91, 92, 93],
+            }
+        )
+        df["id"] = df.index
+        expected = DataFrame(
+            {
+                "Arating": [91, 92, 93, 91, 92, 93],
+                "Arating_old": [91, 92, 93, 91, 92, 93],
+                "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
+                "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
+                "BB": [1, 2, 3, 4, 5, 6],
+                "id": [0, 1, 2, 0, 1, 2],
+                "year": [11, 11, 11, 12, 12, 12],
+            }
+        )
+        expected = expected.set_index(["id", "year"])[
+            ["Arating", "Arating_old", "A", "B", "BB"]
+        ]
+        result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
+        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
+
+    def test_invalid_suffixtype(self):
+        # If all stubs names end with a string, but a numeric suffix is
+        # assumed,  an empty data frame is returned
+        df = DataFrame(
+            {
+                "Aone": [1.0, 2.0],
+                "Atwo": [3.0, 4.0],
+                "Bone": [5.0, 6.0],
+                "X": ["X1", "X2"],
+            }
+        )
+        df["id"] = df.index
+        exp_data = {
+            "X": "",
+            "Aone": [],
+            "Atwo": [],
+            "Bone": [],
+            "id": [],
+            "year": [],
+            "A": [],
+            "B": [],
+        }
+        expected = DataFrame(exp_data).astype({"year": np.int64})
+
+        expected = expected.set_index(["id", "year"])
+        expected.index = expected.index.set_levels([0, 1], level=0)
+        result = wide_to_long(df, ["A", "B"], i="id", j="year")
+        tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
+
+    def test_multiple_id_columns(self):
+        # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
+        df = DataFrame(
+            {
+                "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3],
+                "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3],
+                "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
+                "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
+            }
+        )
+        expected = DataFrame(
+            {
+                "ht": [
+                    2.8,
+                    3.4,
+                    2.9,
+                    3.8,
+                    2.2,
+                    2.9,
+                    2.0,
+                    3.2,
+                    1.8,
+                    2.8,
+                    1.9,
+                    2.4,
+                    2.2,
+                    3.3,
+                    2.3,
+                    3.4,
+                    2.1,
+                    2.9,
+                ],
+                "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
+                "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
+                "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
+            }
+        )
+        expected = expected.set_index(["famid", "birth", "age"])[["ht"]]
+        result = wide_to_long(df, "ht", i=["famid", "birth"], j="age")
+        tm.assert_frame_equal(result, expected)
+
+    def test_non_unique_idvars(self):
+        # GH16382
+        # Raise an error message if non unique id vars (i) are passed
+        df = DataFrame(
+            {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]}
+        )
+        msg = "the id variables need to uniquely identify each row"
+        with pytest.raises(ValueError, match=msg):
+            wide_to_long(df, ["A_A", "B_B"], i="x", j="colname")
+
+    def test_cast_j_int(self):
+        df = DataFrame(
+            {
+                "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"],
+                "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"],
+                "actor_fb_likes_1": [1000.0, 40000.0, 11000.0],
+                "actor_fb_likes_2": [936.0, 5000.0, 393.0],
+                "title": ["Avatar", "Pirates of the Caribbean", "Spectre"],
+            }
+        )
+
+        expected = DataFrame(
+            {
+                "actor": [
+                    "CCH Pounder",
+                    "Johnny Depp",
+                    "Christoph Waltz",
+                    "Joel David Moore",
+                    "Orlando Bloom",
+                    "Rory Kinnear",
+                ],
+                "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
+                "num": [1, 1, 1, 2, 2, 2],
+                "title": [
+                    "Avatar",
+                    "Pirates of the Caribbean",
+                    "Spectre",
+                    "Avatar",
+                    "Pirates of the Caribbean",
+                    "Spectre",
+                ],
+            }
+        ).set_index(["title", "num"])
+        result = wide_to_long(
+            df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_"
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_identical_stubnames(self):
+        df = DataFrame(
+            {
+                "A2010": [1.0, 2.0],
+                "A2011": [3.0, 4.0],
+                "B2010": [5.0, 6.0],
+                "A": ["X1", "X2"],
+            }
+        )
+        msg = "stubname can't be identical to a column name"
+        with pytest.raises(ValueError, match=msg):
+            wide_to_long(df, ["A", "B"], i="A", j="colname")
+
+    def test_nonnumeric_suffix(self):
+        df = DataFrame(
+            {
+                "treatment_placebo": [1.0, 2.0],
+                "treatment_test": [3.0, 4.0],
+                "result_placebo": [5.0, 6.0],
+                "A": ["X1", "X2"],
+            }
+        )
+        expected = DataFrame(
+            {
+                "A": ["X1", "X2", "X1", "X2"],
+                "colname": ["placebo", "placebo", "test", "test"],
+                "result": [5.0, 6.0, np.nan, np.nan],
+                "treatment": [1.0, 2.0, 3.0, 4.0],
+            }
+        )
+        expected = expected.set_index(["A", "colname"])
+        result = wide_to_long(
+            df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_mixed_type_suffix(self):
+        df = DataFrame(
+            {
+                "A": ["X1", "X2"],
+                "result_1": [0, 9],
+                "result_foo": [5.0, 6.0],
+                "treatment_1": [1.0, 2.0],
+                "treatment_foo": [3.0, 4.0],
+            }
+        )
+        expected = DataFrame(
+            {
+                "A": ["X1", "X2", "X1", "X2"],
+                "colname": ["1", "1", "foo", "foo"],
+                "result": [0.0, 9.0, 5.0, 6.0],
+                "treatment": [1.0, 2.0, 3.0, 4.0],
+            }
+        ).set_index(["A", "colname"])
+        result = wide_to_long(
+            df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_float_suffix(self):
+        df = DataFrame(
+            {
+                "treatment_1.1": [1.0, 2.0],
+                "treatment_2.1": [3.0, 4.0],
+                "result_1.2": [5.0, 6.0],
+                "result_1": [0, 9],
+                "A": ["X1", "X2"],
+            }
+        )
+        expected = DataFrame(
+            {
+                "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"],
+                "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1],
+                "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan],
+                "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0],
+            }
+        )
+        expected = expected.set_index(["A", "colname"])
+        result = wide_to_long(
+            df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_col_substring_of_stubname(self):
+        # GH22468
+        # Don't raise ValueError when a column name is a substring
+        # of a stubname that's been passed as a string
+        wide_data = {
+            "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
+            "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
+            "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
+            "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
+            "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67},
+        }
+        wide_df = DataFrame.from_dict(wide_data)
+        expected = wide_to_long(wide_df, stubnames=["PA"], i=["node_id", "A"], j="time")
+        result = wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
+        tm.assert_frame_equal(result, expected)
+
+    def test_raise_of_column_name_value(self):
+        # GH34731, enforced in 2.0
+        # raise a ValueError if the resultant value column name matches
+        # a name in the dataframe already (default name is "value")
+        df = DataFrame({"col": list("ABC"), "value": range(10, 16, 2)})
+
+        with pytest.raises(
+            ValueError, match=re.escape("value_name (value) cannot match")
+        ):
+            df.melt(id_vars="value", value_name="value")
+
+    def test_missing_stubname(self, any_string_dtype):
+        # GH46044
+        df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]})
+        df = df.astype({"id": any_string_dtype})
+        result = wide_to_long(
+            df,
+            stubnames=["a", "b"],
+            i="id",
+            j="num",
+            sep="-",
+        )
+        index = Index(
+            [("1", 1), ("2", 1), ("1", 2), ("2", 2)],
+            name=("id", "num"),
+        )
+        expected = DataFrame(
+            {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
+            index=index,
+        )
+        new_level = expected.index.levels[0].astype(any_string_dtype)
+        if any_string_dtype == "object":
+            new_level = expected.index.levels[0].astype("str")
+        expected.index = expected.index.set_levels(new_level, level=0)
+        tm.assert_frame_equal(result, expected)
+
+
+def test_wide_to_long_string_columns(string_storage):
+    # GH 57066
+    string_dtype = pd.StringDtype(string_storage, na_value=np.nan)
+    df = DataFrame(
+        {
+            "ID": {0: 1},
+            "R_test1": {0: 1},
+            "R_test2": {0: 1},
+            "R_test3": {0: 2},
+            "D": {0: 1},
+        }
+    )
+    df.columns = df.columns.astype(string_dtype)
+    result = wide_to_long(
+        df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*"
+    )
+    expected = DataFrame(
+        [[1, 1], [1, 1], [1, 2]],
+        columns=Index(["D", "R"]),
+        index=pd.MultiIndex.from_arrays(
+            [
+                [1, 1, 1],
+                Index(["test1", "test2", "test3"], dtype=string_dtype),
+            ],
+            names=["ID", "UNPIVOTED"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
new file mode 100644
index 0000000000000000000000000000000000000000..6745ba0bac765a17b583724cc57dc207bb2e81a6
--- /dev/null
+++ b/pandas/tests/reshape/test_pivot.py
@@ -0,0 +1,2961 @@
+from datetime import (
+    date,
+    datetime,
+    timedelta,
+)
+from itertools import product
+import re
+
+import numpy as np
+import pytest
+
+from pandas._config import using_string_dtype
+
+import pandas as pd
+from pandas import (
+    ArrowDtype,
+    Categorical,
+    DataFrame,
+    Grouper,
+    Index,
+    MultiIndex,
+    Series,
+    concat,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.api.types import CategoricalDtype
+from pandas.core.reshape import reshape as reshape_lib
+from pandas.core.reshape.pivot import pivot_table
+
+
+class TestPivotTable:
+    @pytest.fixture
+    def data(self):
+        return DataFrame(
+            {
+                "A": [
+                    "foo",
+                    "foo",
+                    "foo",
+                    "foo",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "foo",
+                    "foo",
+                    "foo",
+                ],
+                "B": [
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "two",
+                    "two",
+                    "one",
+                ],
+                "C": [
+                    "dull",
+                    "dull",
+                    "shiny",
+                    "dull",
+                    "dull",
+                    "shiny",
+                    "shiny",
+                    "dull",
+                    "shiny",
+                    "shiny",
+                    "shiny",
+                ],
+                "D": np.random.default_rng(2).standard_normal(11),
+                "E": np.random.default_rng(2).standard_normal(11),
+                "F": np.random.default_rng(2).standard_normal(11),
+            }
+        )
+
+    def test_pivot_table(self, observed, data):
+        index = ["A", "B"]
+        columns = "C"
+        table = pivot_table(
+            data, values="D", index=index, columns=columns, observed=observed
+        )
+
+        table2 = data.pivot_table(
+            values="D", index=index, columns=columns, observed=observed
+        )
+        tm.assert_frame_equal(table, table2)
+
+        # this works
+        pivot_table(data, values="D", index=index, observed=observed)
+
+        if len(index) > 1:
+            assert table.index.names == tuple(index)
+        else:
+            assert table.index.name == index[0]
+
+        if len(columns) > 1:
+            assert table.columns.names == columns
+        else:
+            assert table.columns.name == columns[0]
+
+        expected = data.groupby([*index, columns])["D"].agg("mean").unstack()
+        tm.assert_frame_equal(table, expected)
+
+    def test_pivot_table_categorical_observed_equal(self, observed):
+        # issue #24923
+        df = DataFrame(
+            {"col1": list("abcde"), "col2": list("fghij"), "col3": [1, 2, 3, 4, 5]}
+        )
+
+        expected = df.pivot_table(
+            index="col1", values="col3", columns="col2", aggfunc="sum", fill_value=0
+        )
+
+        expected.index = expected.index.astype("category")
+        expected.columns = expected.columns.astype("category")
+
+        df.col1 = df.col1.astype("category")
+        df.col2 = df.col2.astype("category")
+
+        result = df.pivot_table(
+            index="col1",
+            values="col3",
+            columns="col2",
+            aggfunc="sum",
+            fill_value=0,
+            observed=observed,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_nocols(self):
+        df = DataFrame(
+            {"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]}
+        )
+        rs = df.pivot_table(columns="cols", aggfunc="sum")
+        xp = df.pivot_table(index="cols", aggfunc="sum").T
+        tm.assert_frame_equal(rs, xp)
+
+        rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"})
+        xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T
+        tm.assert_frame_equal(rs, xp)
+
+    def test_pivot_table_dropna(self):
+        df = DataFrame(
+            {
+                "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000},
+                "customer": {0: "A", 1: "A", 2: "B", 3: "C"},
+                "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310},
+                "product": {0: "a", 1: "b", 2: "c", 3: "d"},
+                "quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
+            }
+        )
+        pv_col = df.pivot_table(
+            "quantity", "month", ["customer", "product"], dropna=False
+        )
+        pv_ind = df.pivot_table(
+            "quantity", ["customer", "product"], "month", dropna=False
+        )
+
+        m = MultiIndex.from_tuples(
+            [
+                ("A", "a"),
+                ("A", "b"),
+                ("A", "c"),
+                ("A", "d"),
+                ("B", "a"),
+                ("B", "b"),
+                ("B", "c"),
+                ("B", "d"),
+                ("C", "a"),
+                ("C", "b"),
+                ("C", "c"),
+                ("C", "d"),
+            ],
+            names=["customer", "product"],
+        )
+        tm.assert_index_equal(pv_col.columns, m)
+        tm.assert_index_equal(pv_ind.index, m)
+
+    def test_pivot_table_categorical(self):
+        cat1 = Categorical(
+            ["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True
+        )
+        cat2 = Categorical(
+            ["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True
+        )
+        df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
+        result = pivot_table(
+            df, values="values", index=["A", "B"], dropna=True, observed=False
+        )
+
+        exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
+        expected = DataFrame({"values": [1.0, 2.0, 3.0, 4.0]}, index=exp_index)
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_dropna_categoricals(self, dropna):
+        # GH 15193
+        categories = ["a", "b", "c", "d"]
+
+        df = DataFrame(
+            {
+                "A": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+                "B": [1, 2, 3, 1, 2, 3, 1, 2, 3],
+                "C": range(9),
+            }
+        )
+
+        df["A"] = df["A"].astype(CategoricalDtype(categories, ordered=False))
+        result = df.pivot_table(
+            index="B", columns="A", values="C", dropna=dropna, observed=False
+        )
+        expected_columns = Series(["a", "b", "c"], name="A")
+        expected_columns = expected_columns.astype(
+            CategoricalDtype(categories, ordered=False)
+        )
+        expected_index = Series([1, 2, 3], name="B")
+        expected = DataFrame(
+            [[0.0, 3.0, 6.0], [1.0, 4.0, 7.0], [2.0, 5.0, 8.0]],
+            index=expected_index,
+            columns=expected_columns,
+        )
+        if not dropna:
+            # add back the non observed to compare
+            expected = expected.reindex(columns=Categorical(categories)).astype("float")
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_with_non_observable_dropna(self, dropna):
+        # gh-21133
+        df = DataFrame(
+            {
+                "A": Categorical(
+                    [np.nan, "low", "high", "low", "high"],
+                    categories=["low", "high"],
+                    ordered=True,
+                ),
+                "B": [0.0, 1.0, 2.0, 3.0, 4.0],
+            }
+        )
+
+        result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False)
+        if dropna:
+            values = [2.0, 3.0]
+            codes = [0, 1]
+        else:
+            # GH: 10772
+            values = [2.0, 3.0, 0.0]
+            codes = [0, 1, -1]
+        expected = DataFrame(
+            {"B": values},
+            index=Index(
+                Categorical.from_codes(codes, categories=["low", "high"], ordered=True),
+                name="A",
+            ),
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_with_non_observable_dropna_multi_cat(self, dropna):
+        # gh-21378
+        df = DataFrame(
+            {
+                "A": Categorical(
+                    ["left", "low", "high", "low", "high"],
+                    categories=["low", "high", "left"],
+                    ordered=True,
+                ),
+                "B": range(5),
+            }
+        )
+
+        result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False)
+        expected = DataFrame(
+            {"B": [2.0, 3.0, 0.0]},
+            index=Index(
+                Categorical.from_codes(
+                    [0, 1, 2], categories=["low", "high", "left"], ordered=True
+                ),
+                name="A",
+            ),
+        )
+        if not dropna:
+            expected["B"] = expected["B"].astype(float)
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "left_right", [([0] * 4, [1] * 4), (range(3), range(1, 4))]
+    )
+    def test_pivot_with_interval_index(self, left_right, dropna, closed):
+        # GH 25814
+        left, right = left_right
+        interval_values = Categorical(pd.IntervalIndex.from_arrays(left, right, closed))
+        df = DataFrame({"A": interval_values, "B": 1})
+
+        result = df.pivot_table(index="A", values="B", dropna=dropna, observed=False)
+        expected = DataFrame(
+            {"B": 1.0}, index=Index(interval_values.unique(), name="A")
+        )
+        if not dropna:
+            expected = expected.astype(float)
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_with_interval_index_margins(self):
+        # GH 25815
+        ordered_cat = pd.IntervalIndex.from_arrays([0, 0, 1, 1], [1, 1, 2, 2])
+        df = DataFrame(
+            {
+                "A": np.arange(4, 0, -1, dtype=np.intp),
+                "B": ["a", "b", "a", "b"],
+                "C": Categorical(ordered_cat, ordered=True).sort_values(
+                    ascending=False
+                ),
+            }
+        )
+
+        pivot_tab = pivot_table(
+            df,
+            index="C",
+            columns="B",
+            values="A",
+            aggfunc="sum",
+            margins=True,
+            observed=False,
+        )
+
+        result = pivot_tab["All"]
+        expected = Series(
+            [3, 7, 10],
+            index=Index([pd.Interval(0, 1), pd.Interval(1, 2), "All"], name="C"),
+            name="All",
+            dtype=np.intp,
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_pass_array(self, data):
+        result = data.pivot_table("D", index=data.A, columns=data.C)
+        expected = data.pivot_table("D", index="A", columns="C")
+        tm.assert_frame_equal(result, expected)
+
+    def test_pass_function(self, data):
+        result = data.pivot_table("D", index=lambda x: x // 5, columns=data.C)
+        expected = data.pivot_table("D", index=data.index // 5, columns="C")
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_multiple(self, data):
+        index = ["A", "B"]
+        columns = "C"
+        table = pivot_table(data, index=index, columns=columns)
+        expected = data.groupby([*index, columns]).agg("mean").unstack()
+        tm.assert_frame_equal(table, expected)
+
+    def test_pivot_dtypes(self):
+        # can convert dtypes
+        f = DataFrame(
+            {
+                "a": ["cat", "bat", "cat", "bat"],
+                "v": [1, 2, 3, 4],
+                "i": ["a", "b", "a", "b"],
+            }
+        )
+        assert f.dtypes["v"] == "int64"
+
+        z = pivot_table(
+            f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc="sum"
+        )
+        result = z.dtypes
+        expected = Series([np.dtype("int64")] * 2, index=Index(list("ab"), name="i"))
+        tm.assert_series_equal(result, expected)
+
+        # cannot convert dtypes
+        f = DataFrame(
+            {
+                "a": ["cat", "bat", "cat", "bat"],
+                "v": [1.5, 2.5, 3.5, 4.5],
+                "i": ["a", "b", "a", "b"],
+            }
+        )
+        assert f.dtypes["v"] == "float64"
+
+        z = pivot_table(
+            f, values="v", index=["a"], columns=["i"], fill_value=0, aggfunc="mean"
+        )
+        result = z.dtypes
+        expected = Series([np.dtype("float64")] * 2, index=Index(list("ab"), name="i"))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "columns,values",
+        [
+            ("bool1", ["float1", "float2"]),
+            ("bool1", ["float1", "float2", "bool1"]),
+            ("bool2", ["float1", "float2", "bool1"]),
+        ],
+    )
+    def test_pivot_preserve_dtypes(self, columns, values):
+        # GH 7142 regression test
+        v = np.arange(5, dtype=np.float64)
+        df = DataFrame(
+            {"float1": v, "float2": v + 2.0, "bool1": v <= 2, "bool2": v <= 3}
+        )
+
+        df_res = df.reset_index().pivot_table(
+            index="index", columns=columns, values=values
+        )
+
+        result = dict(df_res.dtypes)
+        expected = {col: np.dtype("float64") for col in df_res}
+        assert result == expected
+
+    def test_pivot_no_values(self):
+        # GH 14380
+        idx = pd.DatetimeIndex(
+            ["2011-01-01", "2011-02-01", "2011-01-02", "2011-01-01", "2011-01-02"]
+        )
+        df = DataFrame({"A": [1, 2, 3, 4, 5]}, index=idx)
+        res = df.pivot_table(index=df.index.month, columns=df.index.day)
+
+        exp_columns = MultiIndex.from_tuples([("A", 1), ("A", 2)])
+        exp_columns = exp_columns.set_levels(
+            exp_columns.levels[1].astype(np.int32), level=1
+        )
+        exp = DataFrame(
+            [[2.5, 4.0], [2.0, np.nan]],
+            index=Index([1, 2], dtype=np.int32),
+            columns=exp_columns,
+        )
+        tm.assert_frame_equal(res, exp)
+
+        df = DataFrame(
+            {
+                "A": [1, 2, 3, 4, 5],
+                "dt": date_range("2011-01-01", freq="D", periods=5, unit="ns"),
+            },
+            index=idx,
+        )
+        res = df.pivot_table(index=df.index.month, columns=Grouper(key="dt", freq="ME"))
+        exp_columns = MultiIndex.from_arrays(
+            [["A"], pd.DatetimeIndex(["2011-01-31"], dtype="M8[ns]")],
+            names=[None, "dt"],
+        )
+        exp = DataFrame(
+            [3.25, 2.0], index=Index([1, 2], dtype=np.int32), columns=exp_columns
+        )
+        tm.assert_frame_equal(res, exp)
+
+        res = df.pivot_table(
+            index=Grouper(freq="YE"), columns=Grouper(key="dt", freq="ME")
+        )
+        exp = DataFrame(
+            [3.0],
+            index=pd.DatetimeIndex(["2011-12-31"], freq="YE"),
+            columns=exp_columns,
+        )
+        tm.assert_frame_equal(res, exp)
+
+    def test_pivot_multi_values(self, data):
+        result = pivot_table(
+            data, values=["D", "E"], index="A", columns=["B", "C"], fill_value=0
+        )
+        expected = pivot_table(
+            data.drop(["F"], axis=1), index="A", columns=["B", "C"], fill_value=0
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_multi_functions(self, data):
+        f = lambda func: pivot_table(
+            data, values=["D", "E"], index=["A", "B"], columns="C", aggfunc=func
+        )
+        result = f(["mean", "std"])
+        means = f("mean")
+        stds = f("std")
+        expected = concat([means, stds], keys=["mean", "std"], axis=1)
+        tm.assert_frame_equal(result, expected)
+
+        # margins not supported??
+        f = lambda func: pivot_table(
+            data,
+            values=["D", "E"],
+            index=["A", "B"],
+            columns="C",
+            aggfunc=func,
+            margins=True,
+        )
+        result = f(["mean", "std"])
+        means = f("mean")
+        stds = f("std")
+        expected = concat([means, stds], keys=["mean", "std"], axis=1)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_index_with_nan(self, method):
+        # GH 3588
+        nan = np.nan
+        df = DataFrame(
+            {
+                "a": ["R1", "R2", nan, "R4"],
+                "b": ["C1", "C2", "C3", "C4"],
+                "c": [10, 15, 17, 20],
+            }
+        )
+        if method:
+            result = df.pivot(index="a", columns="b", values="c")
+        else:
+            result = pd.pivot(df, index="a", columns="b", values="c")
+        expected = DataFrame(
+            [
+                [nan, nan, 17, nan],
+                [10, nan, nan, nan],
+                [nan, 15, nan, nan],
+                [nan, nan, nan, 20],
+            ],
+            index=Index([nan, "R1", "R2", "R4"], name="a"),
+            columns=Index(["C1", "C2", "C3", "C4"], name="b"),
+        )
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(df.pivot(index="b", columns="a", values="c"), expected.T)
+
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_index_with_nan_dates(self, method):
+        # GH9491
+        df = DataFrame(
+            {
+                "a": date_range("2014-02-01", periods=6, freq="D"),
+                "c": 100 + np.arange(6),
+            }
+        )
+        df["b"] = df["a"] - pd.Timestamp("2014-02-02")
+        df.loc[1, "a"] = df.loc[3, "a"] = np.nan
+        df.loc[1, "b"] = df.loc[4, "b"] = np.nan
+
+        if method:
+            pv = df.pivot(index="a", columns="b", values="c")
+        else:
+            pv = pd.pivot(df, index="a", columns="b", values="c")
+        assert pv.notna().values.sum() == len(df)
+
+        for _, row in df.iterrows():
+            assert pv.loc[row["a"], row["b"]] == row["c"]
+
+        if method:
+            result = df.pivot(index="b", columns="a", values="c")
+        else:
+            result = pd.pivot(df, index="b", columns="a", values="c")
+        tm.assert_frame_equal(result, pv.T)
+
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_with_tz(self, method, unit):
+        # GH 5878
+        df = DataFrame(
+            {
+                "dt1": pd.DatetimeIndex(
+                    [
+                        datetime(2013, 1, 1, 9, 0),
+                        datetime(2013, 1, 2, 9, 0),
+                        datetime(2013, 1, 1, 9, 0),
+                        datetime(2013, 1, 2, 9, 0),
+                    ],
+                    dtype=f"M8[{unit}, US/Pacific]",
+                ),
+                "dt2": pd.DatetimeIndex(
+                    [
+                        datetime(2014, 1, 1, 9, 0),
+                        datetime(2014, 1, 1, 9, 0),
+                        datetime(2014, 1, 2, 9, 0),
+                        datetime(2014, 1, 2, 9, 0),
+                    ],
+                    dtype=f"M8[{unit}, Asia/Tokyo]",
+                ),
+                "data1": np.arange(4, dtype="int64"),
+                "data2": np.arange(4, dtype="int64"),
+            }
+        )
+
+        exp_col1 = Index(["data1", "data1", "data2", "data2"])
+        exp_col2 = pd.DatetimeIndex(
+            ["2014/01/01 09:00", "2014/01/02 09:00"] * 2,
+            name="dt2",
+            dtype=f"M8[{unit}, Asia/Tokyo]",
+        )
+        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
+        exp_idx = pd.DatetimeIndex(
+            ["2013/01/01 09:00", "2013/01/02 09:00"],
+            name="dt1",
+            dtype=f"M8[{unit}, US/Pacific]",
+        )
+        expected = DataFrame(
+            [[0, 2, 0, 2], [1, 3, 1, 3]],
+            index=exp_idx,
+            columns=exp_col,
+        )
+
+        if method:
+            pv = df.pivot(index="dt1", columns="dt2")
+        else:
+            pv = pd.pivot(df, index="dt1", columns="dt2")
+        tm.assert_frame_equal(pv, expected)
+
+        expected = DataFrame(
+            [[0, 2], [1, 3]],
+            index=exp_idx,
+            columns=exp_col2[:2],
+        )
+
+        if method:
+            pv = df.pivot(index="dt1", columns="dt2", values="data1")
+        else:
+            pv = pd.pivot(df, index="dt1", columns="dt2", values="data1")
+        tm.assert_frame_equal(pv, expected)
+
+    def test_pivot_tz_in_values(self):
+        # GH 14948
+        df = DataFrame(
+            [
+                {
+                    "uid": "aa",
+                    "ts": pd.Timestamp("2016-08-12 13:00:00-0700", tz="US/Pacific"),
+                },
+                {
+                    "uid": "aa",
+                    "ts": pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"),
+                },
+                {
+                    "uid": "aa",
+                    "ts": pd.Timestamp("2016-08-12 14:00:00-0700", tz="US/Pacific"),
+                },
+                {
+                    "uid": "aa",
+                    "ts": pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"),
+                },
+                {
+                    "uid": "aa",
+                    "ts": pd.Timestamp("2016-08-25 13:00:00-0700", tz="US/Pacific"),
+                },
+            ]
+        )
+
+        df = df.set_index("ts").reset_index()
+        mins = df.ts.map(lambda x: x.replace(hour=0, minute=0, second=0))
+
+        result = pivot_table(
+            df.set_index("ts").reset_index(),
+            values="ts",
+            index=["uid"],
+            columns=[mins],
+            aggfunc="min",
+        )
+        expected = DataFrame(
+            [
+                [
+                    pd.Timestamp("2016-08-12 08:00:00-0700", tz="US/Pacific"),
+                    pd.Timestamp("2016-08-25 11:00:00-0700", tz="US/Pacific"),
+                ]
+            ],
+            index=Index(["aa"], name="uid"),
+            columns=pd.DatetimeIndex(
+                [
+                    pd.Timestamp("2016-08-12 00:00:00", tz="US/Pacific"),
+                    pd.Timestamp("2016-08-25 00:00:00", tz="US/Pacific"),
+                ],
+                name="ts",
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_periods(self, method):
+        df = DataFrame(
+            {
+                "p1": [
+                    pd.Period("2013-01-01", "D"),
+                    pd.Period("2013-01-02", "D"),
+                    pd.Period("2013-01-01", "D"),
+                    pd.Period("2013-01-02", "D"),
+                ],
+                "p2": [
+                    pd.Period("2013-01", "M"),
+                    pd.Period("2013-01", "M"),
+                    pd.Period("2013-02", "M"),
+                    pd.Period("2013-02", "M"),
+                ],
+                "data1": np.arange(4, dtype="int64"),
+                "data2": np.arange(4, dtype="int64"),
+            }
+        )
+
+        exp_col1 = Index(["data1", "data1", "data2", "data2"])
+        exp_col2 = pd.PeriodIndex(["2013-01", "2013-02"] * 2, name="p2", freq="M")
+        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
+        expected = DataFrame(
+            [[0, 2, 0, 2], [1, 3, 1, 3]],
+            index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"),
+            columns=exp_col,
+        )
+        if method:
+            pv = df.pivot(index="p1", columns="p2")
+        else:
+            pv = pd.pivot(df, index="p1", columns="p2")
+        tm.assert_frame_equal(pv, expected)
+
+        expected = DataFrame(
+            [[0, 2], [1, 3]],
+            index=pd.PeriodIndex(["2013-01-01", "2013-01-02"], name="p1", freq="D"),
+            columns=pd.PeriodIndex(["2013-01", "2013-02"], name="p2", freq="M"),
+        )
+        if method:
+            pv = df.pivot(index="p1", columns="p2", values="data1")
+        else:
+            pv = pd.pivot(df, index="p1", columns="p2", values="data1")
+        tm.assert_frame_equal(pv, expected)
+
+    def test_pivot_periods_with_margins(self):
+        # GH 28323
+        df = DataFrame(
+            {
+                "a": [1, 1, 2, 2],
+                "b": [
+                    pd.Period("2019Q1"),
+                    pd.Period("2019Q2"),
+                    pd.Period("2019Q1"),
+                    pd.Period("2019Q2"),
+                ],
+                "x": 1.0,
+            }
+        )
+
+        expected = DataFrame(
+            data=1.0,
+            index=Index([1, 2, "All"], name="a"),
+            columns=Index([pd.Period("2019Q1"), pd.Period("2019Q2"), "All"], name="b"),
+        )
+
+        result = df.pivot_table(index="a", columns="b", values="x", margins=True)
+        tm.assert_frame_equal(expected, result)
+
+    @pytest.mark.parametrize("box", [list, np.array, Series, Index])
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_with_list_like_values(self, box, method):
+        # issue #17160
+        values = box(["baz", "zoo"])
+        df = DataFrame(
+            {
+                "foo": ["one", "one", "one", "two", "two", "two"],
+                "bar": ["A", "B", "C", "A", "B", "C"],
+                "baz": [1, 2, 3, 4, 5, 6],
+                "zoo": ["x", "y", "z", "q", "w", "t"],
+            }
+        )
+
+        if method:
+            result = df.pivot(index="foo", columns="bar", values=values)
+        else:
+            result = pd.pivot(df, index="foo", columns="bar", values=values)
+
+        data = [[1, 2, 3, "x", "y", "z"], [4, 5, 6, "q", "w", "t"]]
+        index = Index(data=["one", "two"], name="foo")
+        columns = MultiIndex(
+            levels=[["baz", "zoo"], ["A", "B", "C"]],
+            codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
+            names=[None, "bar"],
+        )
+        expected = DataFrame(data=data, index=index, columns=columns)
+        expected["baz"] = expected["baz"].astype(object)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            ["bar", "baz"],
+            np.array(["bar", "baz"]),
+            Series(["bar", "baz"]),
+            Index(["bar", "baz"]),
+        ],
+    )
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_with_list_like_values_nans(self, values, method):
+        # issue #17160
+        df = DataFrame(
+            {
+                "foo": ["one", "one", "one", "two", "two", "two"],
+                "bar": ["A", "B", "C", "A", "B", "C"],
+                "baz": [1, 2, 3, 4, 5, 6],
+                "zoo": ["x", "y", "z", "q", "w", "t"],
+            }
+        )
+
+        if method:
+            result = df.pivot(index="zoo", columns="foo", values=values)
+        else:
+            result = pd.pivot(df, index="zoo", columns="foo", values=values)
+
+        data = [
+            [np.nan, "A", np.nan, 4],
+            [np.nan, "C", np.nan, 6],
+            [np.nan, "B", np.nan, 5],
+            ["A", np.nan, 1, np.nan],
+            ["B", np.nan, 2, np.nan],
+            ["C", np.nan, 3, np.nan],
+        ]
+        index = Index(data=["q", "t", "w", "x", "y", "z"], name="zoo")
+        columns = MultiIndex(
+            levels=[["bar", "baz"], ["one", "two"]],
+            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+            names=[None, "foo"],
+        )
+        expected = DataFrame(data=data, index=index, columns=columns)
+        expected["baz"] = expected["baz"].astype(object)
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_columns_none_raise_error(self):
+        # GH 30924
+        df = DataFrame({"col1": ["a", "b", "c"], "col2": [1, 2, 3], "col3": [1, 2, 3]})
+        msg = r"pivot\(\) missing 1 required keyword-only argument: 'columns'"
+        with pytest.raises(TypeError, match=msg):
+            df.pivot(index="col1", values="col3")
+
+    @pytest.mark.xfail(
+        reason="MultiIndexed unstack with tuple names fails with KeyError GH#19966"
+    )
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_with_multiindex(self, method):
+        # issue #17160
+        index = Index(data=[0, 1, 2, 3, 4, 5])
+        data = [
+            ["one", "A", 1, "x"],
+            ["one", "B", 2, "y"],
+            ["one", "C", 3, "z"],
+            ["two", "A", 4, "q"],
+            ["two", "B", 5, "w"],
+            ["two", "C", 6, "t"],
+        ]
+        columns = MultiIndex(
+            levels=[["bar", "baz"], ["first", "second"]],
+            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
+        )
+        df = DataFrame(data=data, index=index, columns=columns, dtype="object")
+        if method:
+            result = df.pivot(
+                index=("bar", "first"),
+                columns=("bar", "second"),
+                values=("baz", "first"),
+            )
+        else:
+            result = pd.pivot(
+                df,
+                index=("bar", "first"),
+                columns=("bar", "second"),
+                values=("baz", "first"),
+            )
+
+        data = {
+            "A": Series([1, 4], index=["one", "two"]),
+            "B": Series([2, 5], index=["one", "two"]),
+            "C": Series([3, 6], index=["one", "two"]),
+        }
+        expected = DataFrame(data)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("method", [True, False])
+    def test_pivot_with_tuple_of_values(self, method):
+        # issue #17160
+        df = DataFrame(
+            {
+                "foo": ["one", "one", "one", "two", "two", "two"],
+                "bar": ["A", "B", "C", "A", "B", "C"],
+                "baz": [1, 2, 3, 4, 5, 6],
+                "zoo": ["x", "y", "z", "q", "w", "t"],
+            }
+        )
+        with pytest.raises(KeyError, match=r"^\('bar', 'baz'\)$"):
+            # tuple is seen as a single column name
+            if method:
+                df.pivot(index="zoo", columns="foo", values=("bar", "baz"))
+            else:
+                pd.pivot(df, index="zoo", columns="foo", values=("bar", "baz"))
+
+    def _check_output(
+        self,
+        result,
+        values_col,
+        data,
+        index=None,
+        columns=None,
+        margins_col="All",
+    ):
+        if index is None:
+            index = ["A", "B"]
+        if columns is None:
+            columns = ["C"]
+        col_margins = result.loc[result.index[:-1], margins_col]
+        expected_col_margins = data.groupby(index)[values_col].mean()
+        tm.assert_series_equal(col_margins, expected_col_margins, check_names=False)
+        assert col_margins.name == margins_col
+
+        result = result.sort_index()
+        index_margins = result.loc[(margins_col, "")].iloc[:-1]
+
+        expected_ix_margins = data.groupby(columns)[values_col].mean()
+        tm.assert_series_equal(index_margins, expected_ix_margins, check_names=False)
+        assert index_margins.name == (margins_col, "")
+
+        grand_total_margins = result.loc[(margins_col, ""), margins_col]
+        expected_total_margins = data[values_col].mean()
+        assert grand_total_margins == expected_total_margins
+
+    def test_margins(self, data):
+        # column specified
+        result = data.pivot_table(
+            values="D", index=["A", "B"], columns="C", margins=True, aggfunc="mean"
+        )
+        self._check_output(result, "D", data)
+
+        # Set a different margins_name (not 'All')
+        result = data.pivot_table(
+            values="D",
+            index=["A", "B"],
+            columns="C",
+            margins=True,
+            aggfunc="mean",
+            margins_name="Totals",
+        )
+        self._check_output(result, "D", data, margins_col="Totals")
+
+        # no column specified
+        table = data.pivot_table(
+            index=["A", "B"], columns="C", margins=True, aggfunc="mean"
+        )
+        for value_col in table.columns.levels[0]:
+            self._check_output(table[value_col], value_col, data)
+
+    def test_no_col(self, data, using_infer_string):
+        # no col
+
+        # to help with a buglet
+        data.columns = [k * 2 for k in data.columns]
+        msg = re.escape("agg function failed [how->mean,dtype->")
+        if using_infer_string:
+            msg = "dtype 'str' does not support operation 'mean'"
+        with pytest.raises(TypeError, match=msg):
+            data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean")
+        table = data.drop(columns="CC").pivot_table(
+            index=["AA", "BB"], margins=True, aggfunc="mean"
+        )
+        for value_col in table.columns:
+            totals = table.loc[("All", ""), value_col]
+            assert totals == data[value_col].mean()
+
+        with pytest.raises(TypeError, match=msg):
+            data.pivot_table(index=["AA", "BB"], margins=True, aggfunc="mean")
+        table = data.drop(columns="CC").pivot_table(
+            index=["AA", "BB"], margins=True, aggfunc="mean"
+        )
+        for item in ["DD", "EE", "FF"]:
+            totals = table.loc[("All", ""), item]
+            assert totals == data[item].mean()
+
+    @pytest.mark.parametrize(
+        "columns, aggfunc, values, expected_columns",
+        [
+            (
+                "A",
+                "mean",
+                [[5.5, 5.5, 2.2, 2.2], [8.0, 8.0, 4.4, 4.4]],
+                Index(["bar", "All", "foo", "All"], name="A"),
+            ),
+            (
+                ["A", "B"],
+                "sum",
+                [
+                    [9, 13, 22, 5, 6, 11],
+                    [14, 18, 32, 11, 11, 22],
+                ],
+                MultiIndex.from_tuples(
+                    [
+                        ("bar", "one"),
+                        ("bar", "two"),
+                        ("bar", "All"),
+                        ("foo", "one"),
+                        ("foo", "two"),
+                        ("foo", "All"),
+                    ],
+                    names=["A", "B"],
+                ),
+            ),
+        ],
+    )
+    def test_margin_with_only_columns_defined(
+        self, columns, aggfunc, values, expected_columns, using_infer_string
+    ):
+        # GH 31016
+        df = DataFrame(
+            {
+                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+                "C": [
+                    "small",
+                    "large",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                ],
+                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+            }
+        )
+        if aggfunc != "sum":
+            msg = re.escape("agg function failed [how->mean,dtype->")
+            if using_infer_string:
+                msg = "dtype 'str' does not support operation 'mean'"
+            with pytest.raises(TypeError, match=msg):
+                df.pivot_table(columns=columns, margins=True, aggfunc=aggfunc)
+        if "B" not in columns:
+            df = df.drop(columns="B")
+        result = df.drop(columns="C").pivot_table(
+            columns=columns, margins=True, aggfunc=aggfunc
+        )
+        expected = DataFrame(values, index=Index(["D", "E"]), columns=expected_columns)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_margins_dtype(self, data):
+        # GH 17013
+
+        df = data.copy()
+        df[["D", "E", "F"]] = np.arange(len(df) * 3).reshape(len(df), 3).astype("i8")
+
+        mi_val = [*list(product(["bar", "foo"], ["one", "two"])), ("All", "")]
+        mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))
+        expected = DataFrame(
+            {"dull": [12, 21, 3, 9, 45], "shiny": [33, 0, 36, 51, 120]}, index=mi
+        ).rename_axis("C", axis=1)
+        expected["All"] = expected["dull"] + expected["shiny"]
+
+        result = df.pivot_table(
+            values="D",
+            index=["A", "B"],
+            columns="C",
+            margins=True,
+            aggfunc="sum",
+            fill_value=0,
+        )
+
+        tm.assert_frame_equal(expected, result)
+
+    def test_margins_dtype_len(self, data):
+        mi_val = [*list(product(["bar", "foo"], ["one", "two"])), ("All", "")]
+        mi = MultiIndex.from_tuples(mi_val, names=("A", "B"))
+        expected = DataFrame(
+            {"dull": [1, 1, 2, 1, 5], "shiny": [2, 0, 2, 2, 6]}, index=mi
+        ).rename_axis("C", axis=1)
+        expected["All"] = expected["dull"] + expected["shiny"]
+
+        result = data.pivot_table(
+            values="D",
+            index=["A", "B"],
+            columns="C",
+            margins=True,
+            aggfunc=len,
+            fill_value=0,
+        )
+
+        tm.assert_frame_equal(expected, result)
+
+    @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)])
+    def test_pivot_table_multiindex_only(self, cols):
+        # GH 17038
+        df2 = DataFrame({cols[0]: [1, 2, 3], cols[1]: [1, 2, 3], "v": [4, 5, 6]})
+
+        result = df2.pivot_table(values="v", columns=cols)
+        expected = DataFrame(
+            [[4.0, 5.0, 6.0]],
+            columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols),
+            index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"),
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_retains_tz(self):
+        dti = date_range("2016-01-01", periods=3, tz="Europe/Amsterdam")
+        df = DataFrame(
+            {
+                "A": np.random.default_rng(2).standard_normal(3),
+                "B": np.random.default_rng(2).standard_normal(3),
+                "C": dti,
+            }
+        )
+        result = df.pivot_table(index=["B", "C"], dropna=False)
+
+        # check tz retention
+        assert result.index.levels[1].equals(dti)
+
+    def test_pivot_integer_columns(self):
+        # caused by upstream bug in unstack
+
+        d = date.min
+        data = list(
+            product(
+                ["foo", "bar"],
+                ["A", "B", "C"],
+                ["x1", "x2"],
+                [d + timedelta(i) for i in range(20)],
+                [1.0],
+            )
+        )
+        df = DataFrame(data)
+        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])
+
+        df2 = df.rename(columns=str)
+        table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"])
+
+        tm.assert_frame_equal(table, table2, check_names=False)
+
+    def test_pivot_no_level_overlap(self):
+        # GH #1181
+
+        data = DataFrame(
+            {
+                "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2,
+                "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2,
+                "c": (["foo"] * 4 + ["bar"] * 4) * 2,
+                "value": np.random.default_rng(2).standard_normal(16),
+            }
+        )
+
+        table = data.pivot_table("value", index="a", columns=["b", "c"])
+
+        grouped = data.groupby(["a", "b", "c"])["value"].mean()
+        expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all")
+        tm.assert_frame_equal(table, expected)
+
+    def test_pivot_columns_lexsorted(self):
+        n = 10000
+
+        dtype = np.dtype(
+            [
+                ("Index", object),
+                ("Symbol", object),
+                ("Year", int),
+                ("Month", int),
+                ("Day", int),
+                ("Quantity", int),
+                ("Price", float),
+            ]
+        )
+
+        products = np.array(
+            [
+                ("SP500", "ADBE"),
+                ("SP500", "NVDA"),
+                ("SP500", "ORCL"),
+                ("NDQ100", "AAPL"),
+                ("NDQ100", "MSFT"),
+                ("NDQ100", "GOOG"),
+                ("FTSE", "DGE.L"),
+                ("FTSE", "TSCO.L"),
+                ("FTSE", "GSK.L"),
+            ],
+            dtype=[("Index", object), ("Symbol", object)],
+        )
+        items = np.empty(n, dtype=dtype)
+        iproduct = np.random.default_rng(2).integers(0, len(products), n)
+        items["Index"] = products["Index"][iproduct]
+        items["Symbol"] = products["Symbol"][iproduct]
+        dr = date_range(date(2000, 1, 1), date(2010, 12, 31))
+        dates = dr[np.random.default_rng(2).integers(0, len(dr), n)]
+        items["Year"] = dates.year
+        items["Month"] = dates.month
+        items["Day"] = dates.day
+        items["Price"] = np.random.default_rng(2).lognormal(4.0, 2.0, n)
+
+        df = DataFrame(items)
+
+        pivoted = df.pivot_table(
+            "Price",
+            index=["Month", "Day"],
+            columns=["Index", "Symbol", "Year"],
+            aggfunc="mean",
+        )
+
+        assert pivoted.columns.is_monotonic_increasing
+
+    def test_pivot_complex_aggfunc(self, data):
+        f = {"D": ["std"], "E": ["sum"]}
+        expected = data.groupby(["A", "B"]).agg(f).unstack("B")
+        result = data.pivot_table(index="A", columns="B", aggfunc=f)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_margins_no_values_no_cols(self, data):
+        # Regression test on pivot table: no values or cols passed.
+        result = data[["A", "B"]].pivot_table(
+            index=["A", "B"], aggfunc=len, margins=True
+        )
+        result_list = result.tolist()
+        assert sum(result_list[:-1]) == result_list[-1]
+
+    def test_margins_no_values_two_rows(self, data):
+        # Regression test on pivot table: no values passed but rows are a
+        # multi-index
+        result = data[["A", "B", "C"]].pivot_table(
+            index=["A", "B"], columns="C", aggfunc=len, margins=True
+        )
+        assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
+
+    def test_margins_no_values_one_row_one_col(self, data):
+        # Regression test on pivot table: no values passed but row and col
+        # defined
+        result = data[["A", "B"]].pivot_table(
+            index="A", columns="B", aggfunc=len, margins=True
+        )
+        assert result.All.tolist() == [4.0, 7.0, 11.0]
+
+    def test_margins_no_values_two_row_two_cols(self, data):
+        # Regression test on pivot table: no values passed but rows and cols
+        # are multi-indexed
+        data["D"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"]
+        result = data[["A", "B", "C", "D"]].pivot_table(
+            index=["A", "B"], columns=["C", "D"], aggfunc=len, margins=True
+        )
+        assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
+
+    @pytest.mark.parametrize("margin_name", ["foo", "one", 666, None, ["a", "b"]])
+    def test_pivot_table_with_margins_set_margin_name(self, margin_name, data):
+        # see gh-3335
+        msg = (
+            f'Conflicting name "{margin_name}" in margins|'
+            "margins_name argument must be a string"
+        )
+        with pytest.raises(ValueError, match=msg):
+            # multi-index index
+            pivot_table(
+                data,
+                values="D",
+                index=["A", "B"],
+                columns=["C"],
+                margins=True,
+                margins_name=margin_name,
+            )
+        with pytest.raises(ValueError, match=msg):
+            # multi-index column
+            pivot_table(
+                data,
+                values="D",
+                index=["C"],
+                columns=["A", "B"],
+                margins=True,
+                margins_name=margin_name,
+            )
+        with pytest.raises(ValueError, match=msg):
+            # non-multi-index index/column
+            pivot_table(
+                data,
+                values="D",
+                index=["A"],
+                columns=["B"],
+                margins=True,
+                margins_name=margin_name,
+            )
+
+    def test_pivot_timegrouper(self):
+        df = DataFrame(
+            {
+                "Branch": "A A A A A A A B".split(),
+                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
+                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
+                "Date": [
+                    datetime(2013, 1, 1),
+                    datetime(2013, 1, 1),
+                    datetime(2013, 10, 1),
+                    datetime(2013, 10, 2),
+                    datetime(2013, 10, 1),
+                    datetime(2013, 10, 2),
+                    datetime(2013, 12, 2),
+                    datetime(2013, 12, 2),
+                ],
+            }
+        ).set_index("Date")
+
+        expected = DataFrame(
+            np.array([10, 18, 3], dtype="int64").reshape(1, 3),
+            index=pd.DatetimeIndex([datetime(2013, 12, 31)], freq="YE"),
+            columns="Carl Joe Mark".split(),
+        )
+        expected.index.name = "Date"
+        expected.columns.name = "Buyer"
+
+        result = pivot_table(
+            df,
+            index=Grouper(freq="YE"),
+            columns="Buyer",
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index="Buyer",
+            columns=Grouper(freq="YE"),
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected.T)
+
+        expected = DataFrame(
+            np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
+            index=pd.DatetimeIndex(
+                [datetime(2013, 1, 1), datetime(2013, 7, 1)], freq="6MS"
+            ),
+            columns="Carl Joe Mark".split(),
+        )
+        expected.index.name = "Date"
+        expected.columns.name = "Buyer"
+
+        result = pivot_table(
+            df,
+            index=Grouper(freq="6MS"),
+            columns="Buyer",
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index="Buyer",
+            columns=Grouper(freq="6MS"),
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected.T)
+
+        # passing the name
+        df = df.reset_index()
+        result = pivot_table(
+            df,
+            index=Grouper(freq="6MS", key="Date"),
+            columns="Buyer",
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index="Buyer",
+            columns=Grouper(freq="6MS", key="Date"),
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected.T)
+
+        msg = "'The grouper name foo is not found'"
+        with pytest.raises(KeyError, match=msg):
+            pivot_table(
+                df,
+                index=Grouper(freq="6MS", key="foo"),
+                columns="Buyer",
+                values="Quantity",
+                aggfunc="sum",
+            )
+        with pytest.raises(KeyError, match=msg):
+            pivot_table(
+                df,
+                index="Buyer",
+                columns=Grouper(freq="6MS", key="foo"),
+                values="Quantity",
+                aggfunc="sum",
+            )
+
+        # passing the level
+        df = df.set_index("Date")
+        result = pivot_table(
+            df,
+            index=Grouper(freq="6MS", level="Date"),
+            columns="Buyer",
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index="Buyer",
+            columns=Grouper(freq="6MS", level="Date"),
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected.T)
+
+        msg = "The level foo is not valid"
+        with pytest.raises(ValueError, match=msg):
+            pivot_table(
+                df,
+                index=Grouper(freq="6MS", level="foo"),
+                columns="Buyer",
+                values="Quantity",
+                aggfunc="sum",
+            )
+        with pytest.raises(ValueError, match=msg):
+            pivot_table(
+                df,
+                index="Buyer",
+                columns=Grouper(freq="6MS", level="foo"),
+                values="Quantity",
+                aggfunc="sum",
+            )
+
+    def test_pivot_timegrouper_double(self):
+        # double grouper
+        df = DataFrame(
+            {
+                "Branch": "A A A A A A A B".split(),
+                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
+                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
+                "Date": [
+                    datetime(2013, 11, 1, 13, 0),
+                    datetime(2013, 9, 1, 13, 5),
+                    datetime(2013, 10, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 11, 1, 20, 0),
+                    datetime(2013, 10, 2, 10, 0),
+                    datetime(2013, 10, 2, 12, 0),
+                    datetime(2013, 12, 5, 14, 0),
+                ],
+                "PayDay": [
+                    datetime(2013, 10, 4, 0, 0),
+                    datetime(2013, 10, 15, 13, 5),
+                    datetime(2013, 9, 5, 20, 0),
+                    datetime(2013, 11, 2, 10, 0),
+                    datetime(2013, 10, 7, 20, 0),
+                    datetime(2013, 9, 5, 10, 0),
+                    datetime(2013, 12, 30, 12, 0),
+                    datetime(2013, 11, 20, 14, 0),
+                ],
+            }
+        )
+
+        result = pivot_table(
+            df,
+            index=Grouper(freq="ME", key="Date"),
+            columns=Grouper(freq="ME", key="PayDay"),
+            values="Quantity",
+            aggfunc="sum",
+        )
+        expected = DataFrame(
+            np.array(
+                [
+                    np.nan,
+                    3,
+                    np.nan,
+                    np.nan,
+                    6,
+                    np.nan,
+                    1,
+                    9,
+                    np.nan,
+                    9,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                    np.nan,
+                    3,
+                    np.nan,
+                ]
+            ).reshape(4, 4),
+            index=pd.DatetimeIndex(
+                [
+                    datetime(2013, 9, 30),
+                    datetime(2013, 10, 31),
+                    datetime(2013, 11, 30),
+                    datetime(2013, 12, 31),
+                ],
+                freq="ME",
+            ),
+            columns=pd.DatetimeIndex(
+                [
+                    datetime(2013, 9, 30),
+                    datetime(2013, 10, 31),
+                    datetime(2013, 11, 30),
+                    datetime(2013, 12, 31),
+                ],
+                freq="ME",
+            ),
+        )
+        expected.index.name = "Date"
+        expected.columns.name = "PayDay"
+
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index=Grouper(freq="ME", key="PayDay"),
+            columns=Grouper(freq="ME", key="Date"),
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected.T)
+
+        tuples = [
+            (datetime(2013, 9, 30), datetime(2013, 10, 31)),
+            (datetime(2013, 10, 31), datetime(2013, 9, 30)),
+            (datetime(2013, 10, 31), datetime(2013, 11, 30)),
+            (datetime(2013, 10, 31), datetime(2013, 12, 31)),
+            (datetime(2013, 11, 30), datetime(2013, 10, 31)),
+            (datetime(2013, 12, 31), datetime(2013, 11, 30)),
+        ]
+        idx = MultiIndex.from_tuples(tuples, names=["Date", "PayDay"])
+        expected = DataFrame(
+            np.array(
+                [3, np.nan, 6, np.nan, 1, np.nan, 9, np.nan, 9, np.nan, np.nan, 3]
+            ).reshape(6, 2),
+            index=idx,
+            columns=["A", "B"],
+        )
+        expected.columns.name = "Branch"
+
+        result = pivot_table(
+            df,
+            index=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")],
+            columns=["Branch"],
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index=["Branch"],
+            columns=[Grouper(freq="ME", key="Date"), Grouper(freq="ME", key="PayDay")],
+            values="Quantity",
+            aggfunc="sum",
+        )
+        tm.assert_frame_equal(result, expected.T)
+
+    def test_pivot_datetime_tz(self):
+        dates1 = pd.DatetimeIndex(
+            [
+                "2011-07-19 07:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 09:00:00",
+                "2011-07-19 07:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 09:00:00",
+            ],
+            dtype="M8[ns, US/Pacific]",
+            name="dt1",
+        )
+        dates2 = pd.DatetimeIndex(
+            [
+                "2013-01-01 15:00:00",
+                "2013-01-01 15:00:00",
+                "2013-01-01 15:00:00",
+                "2013-02-01 15:00:00",
+                "2013-02-01 15:00:00",
+                "2013-02-01 15:00:00",
+            ],
+            dtype="M8[ns, Asia/Tokyo]",
+        )
+        df = DataFrame(
+            {
+                "label": ["a", "a", "a", "b", "b", "b"],
+                "dt1": dates1,
+                "dt2": dates2,
+                "value1": np.arange(6, dtype="int64"),
+                "value2": [1, 2] * 3,
+            }
+        )
+
+        exp_idx = dates1[:3]
+        exp_col1 = Index(["value1", "value1"])
+        exp_col2 = Index(["a", "b"], name="label")
+        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
+        expected = DataFrame(
+            [[0.0, 3.0], [1.0, 4.0], [2.0, 5.0]], index=exp_idx, columns=exp_col
+        )
+        result = pivot_table(df, index=["dt1"], columns=["label"], values=["value1"])
+        tm.assert_frame_equal(result, expected)
+
+        exp_col1 = Index(["sum", "sum", "sum", "sum", "mean", "mean", "mean", "mean"])
+        exp_col2 = Index(["value1", "value1", "value2", "value2"] * 2)
+        exp_col3 = pd.DatetimeIndex(
+            ["2013-01-01 15:00:00", "2013-02-01 15:00:00"] * 4,
+            dtype="M8[ns, Asia/Tokyo]",
+            name="dt2",
+        )
+        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
+        expected1 = DataFrame(
+            np.array(
+                [
+                    [
+                        0,
+                        3,
+                        1,
+                        2,
+                    ],
+                    [1, 4, 2, 1],
+                    [2, 5, 1, 2],
+                ],
+                dtype="int64",
+            ),
+            index=exp_idx,
+            columns=exp_col[:4],
+        )
+        expected2 = DataFrame(
+            np.array(
+                [
+                    [0.0, 3.0, 1.0, 2.0],
+                    [1.0, 4.0, 2.0, 1.0],
+                    [2.0, 5.0, 1.0, 2.0],
+                ],
+            ),
+            index=exp_idx,
+            columns=exp_col[4:],
+        )
+        expected = concat([expected1, expected2], axis=1)
+
+        result = pivot_table(
+            df,
+            index=["dt1"],
+            columns=["dt2"],
+            values=["value1", "value2"],
+            aggfunc=["sum", "mean"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_dtaccessor(self):
+        # GH 8103
+        dates1 = pd.DatetimeIndex(
+            [
+                "2011-07-19 07:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 09:00:00",
+                "2011-07-19 07:00:00",
+                "2011-07-19 08:00:00",
+                "2011-07-19 09:00:00",
+            ]
+        )
+        dates2 = pd.DatetimeIndex(
+            [
+                "2013-01-01 15:00:00",
+                "2013-01-01 15:00:00",
+                "2013-01-01 15:00:00",
+                "2013-02-01 15:00:00",
+                "2013-02-01 15:00:00",
+                "2013-02-01 15:00:00",
+            ]
+        )
+        df = DataFrame(
+            {
+                "label": ["a", "a", "a", "b", "b", "b"],
+                "dt1": dates1,
+                "dt2": dates2,
+                "value1": np.arange(6, dtype="int64"),
+                "value2": [1, 2] * 3,
+            }
+        )
+
+        result = pivot_table(
+            df, index="label", columns=df["dt1"].dt.hour, values="value1"
+        )
+
+        exp_idx = Index(["a", "b"], name="label")
+        expected = DataFrame(
+            {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]},
+            index=exp_idx,
+            columns=Index([7, 8, 9], dtype=np.int32, name="dt1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df, index=df["dt2"].dt.month, columns=df["dt1"].dt.hour, values="value1"
+        )
+
+        expected = DataFrame(
+            {7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]},
+            index=Index([1, 2], dtype=np.int32, name="dt2"),
+            columns=Index([7, 8, 9], dtype=np.int32, name="dt1"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index=df["dt2"].dt.year.values,
+            columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
+            values="value1",
+        )
+
+        exp_col = MultiIndex.from_arrays(
+            [
+                np.array([7, 7, 8, 8, 9, 9], dtype=np.int32),
+                np.array([1, 2] * 3, dtype=np.int32),
+            ],
+            names=["dt1", "dt2"],
+        )
+        expected = DataFrame(
+            np.array([[0.0, 3.0, 1.0, 4.0, 2.0, 5.0]]),
+            index=Index([2013], dtype=np.int32),
+            columns=exp_col,
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(
+            df,
+            index=np.array(["X", "X", "X", "X", "Y", "Y"]),
+            columns=[df["dt1"].dt.hour, df["dt2"].dt.month],
+            values="value1",
+        )
+        expected = DataFrame(
+            np.array(
+                [[0, 3, 1, np.nan, 2, np.nan], [np.nan, np.nan, np.nan, 4, np.nan, 5]]
+            ),
+            index=["X", "Y"],
+            columns=exp_col,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_daily(self):
+        rng = date_range("1/1/2000", "12/31/2004", freq="D")
+        ts = Series(np.arange(len(rng)), index=rng)
+
+        result = pivot_table(
+            DataFrame(ts), index=ts.index.year, columns=ts.index.dayofyear
+        )
+        result.columns = result.columns.droplevel(0)
+
+        doy = np.asarray(ts.index.dayofyear)
+
+        expected = {}
+        for y in ts.index.year.unique().values:
+            mask = ts.index.year == y
+            expected[y] = Series(ts.values[mask], index=doy[mask])
+        expected = DataFrame(expected, dtype=float).T
+        expected.index = expected.index.astype(np.int32)
+        tm.assert_frame_equal(result, expected)
+
+    def test_monthly(self):
+        rng = date_range("1/1/2000", "12/31/2004", freq="ME")
+        ts = Series(np.arange(len(rng)), index=rng)
+
+        result = pivot_table(DataFrame(ts), index=ts.index.year, columns=ts.index.month)
+        result.columns = result.columns.droplevel(0)
+
+        month = np.asarray(ts.index.month)
+        expected = {}
+        for y in ts.index.year.unique().values:
+            mask = ts.index.year == y
+            expected[y] = Series(ts.values[mask], index=month[mask])
+        expected = DataFrame(expected, dtype=float).T
+        expected.index = expected.index.astype(np.int32)
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_with_iterator_values(self, data):
+        # GH 12017
+        aggs = {"D": "sum", "E": "mean"}
+
+        pivot_values_list = pivot_table(
+            data, index=["A"], values=list(aggs.keys()), aggfunc=aggs
+        )
+
+        pivot_values_keys = pivot_table(
+            data, index=["A"], values=aggs.keys(), aggfunc=aggs
+        )
+        tm.assert_frame_equal(pivot_values_keys, pivot_values_list)
+
+        agg_values_gen = (value for value in aggs)
+        pivot_values_gen = pivot_table(
+            data, index=["A"], values=agg_values_gen, aggfunc=aggs
+        )
+        tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
+
+    def test_pivot_table_margins_name_with_aggfunc_list(self):
+        # GH 13354
+        margins_name = "Weekly"
+        costs = DataFrame(
+            {
+                "item": ["bacon", "cheese", "bacon", "cheese"],
+                "cost": [2.5, 4.5, 3.2, 3.3],
+                "day": ["ME", "ME", "T", "T"],
+            }
+        )
+        table = costs.pivot_table(
+            index="item",
+            columns="day",
+            margins=True,
+            margins_name=margins_name,
+            aggfunc=["mean", "max"],
+        )
+        ix = Index(["bacon", "cheese", margins_name], name="item")
+        tups = [
+            ("mean", "cost", "ME"),
+            ("mean", "cost", "T"),
+            ("mean", "cost", margins_name),
+            ("max", "cost", "ME"),
+            ("max", "cost", "T"),
+            ("max", "cost", margins_name),
+        ]
+        cols = MultiIndex.from_tuples(tups, names=[None, None, "day"])
+        expected = DataFrame(table.values, index=ix, columns=cols)
+        tm.assert_frame_equal(table, expected)
+
+    def test_categorical_margins(self, observed):
+        # GH 10989
+        df = DataFrame(
+            {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
+        )
+
+        expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
+        expected.index = Index([0, 1, "All"], name="y")
+        expected.columns = Index([0, 1, "All"], name="z")
+
+        table = df.pivot_table("x", "y", "z", dropna=observed, margins=True)
+        tm.assert_frame_equal(table, expected)
+
+    def test_categorical_margins_category(self, observed):
+        df = DataFrame(
+            {"x": np.arange(8), "y": np.arange(8) // 4, "z": np.arange(8) % 2}
+        )
+
+        expected = DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
+        expected.index = Index([0, 1, "All"], name="y")
+        expected.columns = Index([0, 1, "All"], name="z")
+
+        df.y = df.y.astype("category")
+        df.z = df.z.astype("category")
+        table = df.pivot_table(
+            "x", "y", "z", dropna=observed, margins=True, observed=False
+        )
+        tm.assert_frame_equal(table, expected)
+
+    def test_margins_casted_to_float(self):
+        # GH 24893
+        df = DataFrame(
+            {
+                "A": [2, 4, 6, 8],
+                "B": [1, 4, 5, 8],
+                "C": [1, 3, 4, 6],
+                "D": ["X", "X", "Y", "Y"],
+            }
+        )
+
+        result = pivot_table(df, index="D", margins=True)
+        expected = DataFrame(
+            {"A": [3.0, 7.0, 5], "B": [2.5, 6.5, 4.5], "C": [2.0, 5.0, 3.5]},
+            index=Index(["X", "Y", "All"], name="D"),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_with_categorical(self, observed, ordered):
+        # gh-21370
+        idx = [np.nan, "low", "high", "low", np.nan]
+        col = [np.nan, "A", "B", np.nan, "A"]
+        df = DataFrame(
+            {
+                "In": Categorical(idx, categories=["low", "high"], ordered=ordered),
+                "Col": Categorical(col, categories=["A", "B"], ordered=ordered),
+                "Val": range(1, 6),
+            }
+        )
+        # case with index/columns/value
+        result = df.pivot_table(
+            index="In", columns="Col", values="Val", observed=observed
+        )
+
+        expected_cols = pd.CategoricalIndex(["A", "B"], ordered=ordered, name="Col")
+
+        expected = DataFrame(data=[[2.0, np.nan], [np.nan, 3.0]], columns=expected_cols)
+        expected.index = Index(
+            Categorical(["low", "high"], categories=["low", "high"], ordered=ordered),
+            name="In",
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+        # case with columns/value
+        result = df.pivot_table(columns="Col", values="Val", observed=observed)
+
+        expected = DataFrame(
+            data=[[3.5, 3.0]], columns=expected_cols, index=Index(["Val"])
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_aggfunc(self, observed):
+        # GH 9534
+        df = DataFrame(
+            {"C1": ["A", "B", "C", "C"], "C2": ["a", "a", "b", "b"], "V": [1, 2, 3, 4]}
+        )
+        df["C1"] = df["C1"].astype("category")
+        result = df.pivot_table(
+            "V",
+            index="C1",
+            columns="C2",
+            dropna=observed,
+            aggfunc="count",
+            observed=False,
+        )
+
+        expected_index = pd.CategoricalIndex(
+            ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
+        )
+        expected_columns = Index(["a", "b"], name="C2")
+        expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
+        expected = DataFrame(
+            expected_data, index=expected_index, columns=expected_columns
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_categorical_pivot_index_ordering(self, observed):
+        # GH 8731
+        df = DataFrame(
+            {
+                "Sales": [100, 120, 220],
+                "Month": ["January", "January", "January"],
+                "Year": [2013, 2014, 2013],
+            }
+        )
+        months = [
+            "January",
+            "February",
+            "March",
+            "April",
+            "May",
+            "June",
+            "July",
+            "August",
+            "September",
+            "October",
+            "November",
+            "December",
+        ]
+        df["Month"] = df["Month"].astype("category").cat.set_categories(months)
+        result = df.pivot_table(
+            values="Sales",
+            index="Month",
+            columns="Year",
+            observed=observed,
+            aggfunc="sum",
+        )
+        expected_columns = Index([2013, 2014], name="Year", dtype="int64")
+        expected_index = pd.CategoricalIndex(
+            months, categories=months, ordered=False, name="Month"
+        )
+        expected_data = [[320, 120]] + [[0, 0]] * 11
+        expected = DataFrame(
+            expected_data, index=expected_index, columns=expected_columns
+        )
+        if observed:
+            expected = expected.loc[["January"]]
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_not_series(self):
+        # GH 4386
+        # pivot_table always returns a DataFrame
+        # when values is not list like and columns is None
+        # and aggfunc is not instance of list
+        df = DataFrame({"col1": [3, 4, 5], "col2": ["C", "D", "E"], "col3": [1, 3, 9]})
+
+        result = df.pivot_table("col1", index=["col3", "col2"], aggfunc="sum")
+        m = MultiIndex.from_arrays([[1, 3, 9], ["C", "D", "E"]], names=["col3", "col2"])
+        expected = DataFrame([3, 4, 5], index=m, columns=["col1"])
+
+        tm.assert_frame_equal(result, expected)
+
+        result = df.pivot_table("col1", index="col3", columns="col2", aggfunc="sum")
+        expected = DataFrame(
+            [[3, np.nan, np.nan], [np.nan, 4, np.nan], [np.nan, np.nan, 5]],
+            index=Index([1, 3, 9], name="col3"),
+            columns=Index(["C", "D", "E"], name="col2"),
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+        result = df.pivot_table("col1", index="col3", aggfunc=["sum"])
+        m = MultiIndex.from_arrays([["sum"], ["col1"]])
+        expected = DataFrame([3, 4, 5], index=Index([1, 3, 9], name="col3"), columns=m)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_margins_name_unicode(self):
+        # issue #13292
+        greek = "\u0394\u03bf\u03ba\u03b9\u03bc\u03ae"
+        frame = DataFrame({"foo": [1, 2, 3]}, columns=Index(["foo"], dtype=object))
+        table = pivot_table(
+            frame, index=["foo"], aggfunc=len, margins=True, margins_name=greek
+        )
+        index = Index([1, 2, 3, greek], dtype="object", name="foo")
+        expected = DataFrame(index=index, columns=[])
+        tm.assert_frame_equal(table, expected)
+
+    def test_pivot_string_as_func(self):
+        # GH #18713
+        # for correctness purposes
+        data = DataFrame(
+            {
+                "A": [
+                    "foo",
+                    "foo",
+                    "foo",
+                    "foo",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "foo",
+                    "foo",
+                    "foo",
+                ],
+                "B": [
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "two",
+                    "two",
+                    "one",
+                ],
+                "C": range(11),
+            }
+        )
+
+        result = pivot_table(data, index="A", columns="B", aggfunc="sum")
+        mi = MultiIndex(
+            levels=[["C"], ["one", "two"]], codes=[[0, 0], [0, 1]], names=[None, "B"]
+        )
+        expected = DataFrame(
+            {("C", "one"): {"bar": 15, "foo": 13}, ("C", "two"): {"bar": 7, "foo": 20}},
+            columns=mi,
+        ).rename_axis("A")
+        tm.assert_frame_equal(result, expected)
+
+        result = pivot_table(data, index="A", columns="B", aggfunc=["sum", "mean"])
+        mi = MultiIndex(
+            levels=[["sum", "mean"], ["C"], ["one", "two"]],
+            codes=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]],
+            names=[None, None, "B"],
+        )
+        expected = DataFrame(
+            {
+                ("mean", "C", "one"): {"bar": 5.0, "foo": 3.25},
+                ("mean", "C", "two"): {"bar": 7.0, "foo": 6.666666666666667},
+                ("sum", "C", "one"): {"bar": 15, "foo": 13},
+                ("sum", "C", "two"): {"bar": 7, "foo": 20},
+            },
+            columns=mi,
+        ).rename_axis("A")
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("kwargs", [{"a": 2}, {"a": 2, "b": 3}, {"b": 3, "a": 2}])
+    def test_pivot_table_kwargs(self, kwargs):
+        # GH#57884
+        def f(x, a, b=3):
+            return x.sum() * a + b
+
+        def g(x):
+            return f(x, **kwargs)
+
+        df = DataFrame(
+            {
+                "A": ["good", "bad", "good", "bad", "good"],
+                "B": ["one", "two", "one", "three", "two"],
+                "X": [2, 5, 4, 20, 10],
+            }
+        )
+        result = pivot_table(
+            df, index="A", columns="B", values="X", aggfunc=f, **kwargs
+        )
+        expected = pivot_table(df, index="A", columns="B", values="X", aggfunc=g)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "kwargs", [{}, {"b": 10}, {"a": 3}, {"a": 3, "b": 10}, {"b": 10, "a": 3}]
+    )
+    def test_pivot_table_kwargs_margin(self, data, kwargs):
+        # GH#57884
+        def f(x, a=5, b=7):
+            return (x.sum() + b) * a
+
+        def g(x):
+            return f(x, **kwargs)
+
+        result = data.pivot_table(
+            values="D",
+            index=["A", "B"],
+            columns="C",
+            aggfunc=f,
+            margins=True,
+            fill_value=0,
+            **kwargs,
+        )
+
+        expected = data.pivot_table(
+            values="D",
+            index=["A", "B"],
+            columns="C",
+            aggfunc=g,
+            margins=True,
+            fill_value=0,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "f, f_numpy",
+        [
+            ("sum", np.sum),
+            ("mean", np.mean),
+            ("min", np.min),
+            (["sum", "mean"], [np.sum, np.mean]),
+            (["sum", "min"], [np.sum, np.min]),
+            (["max", "mean"], [np.max, np.mean]),
+        ],
+    )
+    def test_pivot_string_func_vs_func(self, f, f_numpy, data):
+        # GH #18713
+        # for consistency purposes
+        data = data.drop(columns="C")
+        result = pivot_table(data, index="A", columns="B", aggfunc=f)
+        expected = pivot_table(data, index="A", columns="B", aggfunc=f_numpy)
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.slow
+    def test_pivot_number_of_levels_larger_than_int32_warns(
+        self, performance_warning, monkeypatch
+    ):
+        # GH 20601
+        # GH 26314: Change ValueError to PerformanceWarning
+        class MockUnstacker(reshape_lib._Unstacker):
+            def __init__(self, *args, **kwargs) -> None:
+                # __init__ will raise the warning
+                super().__init__(*args, **kwargs)
+                raise Exception("Don't compute final result.")
+
+            def _make_selectors(self) -> None:
+                pass
+
+        with monkeypatch.context() as m:
+            m.setattr(reshape_lib, "_Unstacker", MockUnstacker)
+            df = DataFrame(
+                {"ind1": np.arange(2**16), "ind2": np.arange(2**16), "count": 0}
+            )
+
+            msg = "The following operation may generate"
+            with tm.assert_produces_warning(performance_warning, match=msg):
+                with pytest.raises(Exception, match="Don't compute final result."):
+                    df.pivot_table(
+                        index="ind1", columns="ind2", values="count", aggfunc="count"
+                    )
+
+    def test_pivot_table_aggfunc_dropna(self, dropna):
+        # GH 22159
+        df = DataFrame(
+            {
+                "fruit": ["apple", "peach", "apple"],
+                "size": [1, 1, 2],
+                "taste": [7, 6, 6],
+            }
+        )
+
+        def ret_one(x):
+            return 1
+
+        def ret_sum(x):
+            return sum(x)
+
+        def ret_none(x):
+            return np.nan
+
+        result = pivot_table(
+            df, columns="fruit", aggfunc=[ret_sum, ret_none, ret_one], dropna=dropna
+        )
+
+        data = [[3, 1, np.nan, np.nan, 1, 1], [13, 6, np.nan, np.nan, 1, 1]]
+        col = MultiIndex.from_product(
+            [["ret_sum", "ret_none", "ret_one"], ["apple", "peach"]],
+            names=[None, "fruit"],
+        )
+        expected = DataFrame(data, index=["size", "taste"], columns=col)
+
+        if dropna:
+            expected = expected.dropna(axis="columns")
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_aggfunc_scalar_dropna(self, dropna):
+        # GH 22159
+        df = DataFrame(
+            {"A": ["one", "two", "one"], "x": [3, np.nan, 2], "y": [1, np.nan, np.nan]}
+        )
+
+        result = pivot_table(df, columns="A", aggfunc="mean", dropna=dropna)
+
+        data = [[2.5, np.nan], [1, np.nan]]
+        col = Index(["one", "two"], name="A")
+        expected = DataFrame(data, index=["x", "y"], columns=col)
+
+        if dropna:
+            expected = expected.dropna(axis="columns")
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("margins", [True, False])
+    def test_pivot_table_empty_aggfunc(self, margins):
+        # GH 9186 & GH 13483 & GH 49240
+        df = DataFrame(
+            {
+                "A": [2, 2, 3, 3, 2],
+                "id": [5, 6, 7, 8, 9],
+                "C": ["p", "q", "q", "p", "q"],
+                "D": [None, None, None, None, None],
+            }
+        )
+        result = df.pivot_table(
+            index="A", columns="D", values="id", aggfunc=np.size, margins=margins
+        )
+        exp_cols = Index([], name="D")
+        expected = DataFrame(index=Index([], dtype="int64", name="A"), columns=exp_cols)
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_no_column_raises(self):
+        # GH 10326
+        def agg(arr):
+            return np.mean(arr)
+
+        df = DataFrame({"X": [0, 0, 1, 1], "Y": [0, 1, 0, 1], "Z": [10, 20, 30, 40]})
+        with pytest.raises(KeyError, match="notpresent"):
+            df.pivot_table("notpresent", "X", "Y", aggfunc=agg)
+
+    def test_pivot_table_multiindex_columns_doctest_case(self):
+        # The relevant characteristic is that the call
+        #  to maybe_downcast_to_dtype(agged[v], data[v].dtype) in
+        #  __internal_pivot_table has `agged[v]` a DataFrame instead of Series,
+        #  In this case this is because agged.columns is a MultiIndex and 'v'
+        #  is only indexing on its first level.
+        df = DataFrame(
+            {
+                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+                "C": [
+                    "small",
+                    "large",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                ],
+                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+            }
+        )
+
+        table = pivot_table(
+            df,
+            values=["D", "E"],
+            index=["A", "C"],
+            aggfunc={"D": "mean", "E": ["min", "max", "mean"]},
+        )
+        cols = MultiIndex.from_tuples(
+            [("D", "mean"), ("E", "max"), ("E", "mean"), ("E", "min")]
+        )
+        index = MultiIndex.from_tuples(
+            [("bar", "large"), ("bar", "small"), ("foo", "large"), ("foo", "small")],
+            names=["A", "C"],
+        )
+        vals = np.array(
+            [
+                [5.5, 9.0, 7.5, 6.0],
+                [5.5, 9.0, 8.5, 8.0],
+                [2.0, 5.0, 4.5, 4.0],
+                [2.33333333, 6.0, 4.33333333, 2.0],
+            ]
+        )
+        expected = DataFrame(vals, columns=cols, index=index)
+        expected[("E", "min")] = expected[("E", "min")].astype(np.int64)
+        expected[("E", "max")] = expected[("E", "max")].astype(np.int64)
+        tm.assert_frame_equal(table, expected)
+
+    def test_pivot_table_sort_false(self):
+        # GH#39143
+        df = DataFrame(
+            {
+                "a": ["d1", "d4", "d3"],
+                "col": ["a", "b", "c"],
+                "num": [23, 21, 34],
+                "year": ["2018", "2018", "2019"],
+            }
+        )
+        result = df.pivot_table(
+            index=["a", "col"], columns="year", values="num", aggfunc="sum", sort=False
+        )
+        expected = DataFrame(
+            [[23, np.nan], [21, np.nan], [np.nan, 34]],
+            columns=Index(["2018", "2019"], name="year"),
+            index=MultiIndex.from_arrays(
+                [["d1", "d4", "d3"], ["a", "b", "c"]], names=["a", "col"]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_nullable_margins(self):
+        # GH#48681
+        df = DataFrame(
+            {"a": "A", "b": [1, 2], "sales": Series([10, 11], dtype="Int64")}
+        )
+
+        result = df.pivot_table(index="b", columns="a", margins=True, aggfunc="sum")
+        expected = DataFrame(
+            [[10, 10], [11, 11], [21, 21]],
+            index=Index([1, 2, "All"], name="b"),
+            columns=MultiIndex.from_tuples(
+                [("sales", "A"), ("sales", "All")], names=[None, "a"]
+            ),
+            dtype="Int64",
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_sort_false_with_multiple_values(self):
+        df = DataFrame(
+            {
+                "firstname": ["John", "Michael"],
+                "lastname": ["Foo", "Bar"],
+                "height": [173, 182],
+                "age": [47, 33],
+            }
+        )
+        result = df.pivot_table(
+            index=["lastname", "firstname"], values=["height", "age"], sort=False
+        )
+        expected = DataFrame(
+            [[173.0, 47.0], [182.0, 33.0]],
+            columns=["height", "age"],
+            index=MultiIndex.from_tuples(
+                [("Foo", "John"), ("Bar", "Michael")],
+                names=["lastname", "firstname"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_with_margins_and_numeric_columns(self):
+        # GH 26568
+        df = DataFrame([["a", "x", 1], ["a", "y", 2], ["b", "y", 3], ["b", "z", 4]])
+        df.columns = [10, 20, 30]
+
+        result = df.pivot_table(
+            index=10, columns=20, values=30, aggfunc="sum", fill_value=0, margins=True
+        )
+
+        expected = DataFrame([[1, 2, 0, 3], [0, 3, 4, 7], [1, 5, 4, 10]])
+        expected.columns = ["x", "y", "z", "All"]
+        expected.index = ["a", "b", "All"]
+        expected.columns.name = 20
+        expected.index.name = 10
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype,expected_dtype", [("Int64", "Float64"), ("int64", "float64")]
+    )
+    def test_pivot_ea_dtype_dropna(self, dropna, dtype, expected_dtype):
+        # GH#47477
+        # GH#47971
+        df = DataFrame({"x": "a", "y": "b", "age": Series([20, 40], dtype=dtype)})
+        result = df.pivot_table(
+            index="x", columns="y", values="age", aggfunc="mean", dropna=dropna
+        )
+        expected = DataFrame(
+            [[30]],
+            index=Index(["a"], name="x"),
+            columns=Index(["b"], name="y"),
+            dtype=expected_dtype,
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_datetime_warning(self):
+        # GH#48683
+        df = DataFrame(
+            {
+                "a": "A",
+                "b": [1, 2],
+                "date": pd.Timestamp("2019-12-31"),
+                "sales": [10.0, 11],
+            }
+        )
+        with tm.assert_produces_warning(None):
+            result = df.pivot_table(
+                index=["b", "date"], columns="a", margins=True, aggfunc="sum"
+            )
+        expected = DataFrame(
+            [[10.0, 10.0], [11.0, 11.0], [21.0, 21.0]],
+            index=MultiIndex.from_arrays(
+                [
+                    Index([1, 2, "All"], name="b"),
+                    Index(
+                        [pd.Timestamp("2019-12-31"), pd.Timestamp("2019-12-31"), ""],
+                        dtype=object,
+                        name="date",
+                    ),
+                ]
+            ),
+            columns=MultiIndex.from_tuples(
+                [("sales", "A"), ("sales", "All")], names=[None, "a"]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_with_mixed_nested_tuples(self):
+        # GH 50342
+        df = DataFrame(
+            {
+                "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+                "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+                "C": [
+                    "small",
+                    "large",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                ],
+                "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+                "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+                ("col5",): [
+                    "foo",
+                    "foo",
+                    "foo",
+                    "foo",
+                    "foo",
+                    "bar",
+                    "bar",
+                    "bar",
+                    "bar",
+                ],
+                ("col6", 6): [
+                    "one",
+                    "one",
+                    "one",
+                    "two",
+                    "two",
+                    "one",
+                    "one",
+                    "two",
+                    "two",
+                ],
+                (7, "seven"): [
+                    "small",
+                    "large",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                    "small",
+                    "small",
+                    "large",
+                ],
+            }
+        )
+        result = pivot_table(
+            df, values="D", index=["A", "B"], columns=[(7, "seven")], aggfunc="sum"
+        )
+        expected = DataFrame(
+            [[4.0, 5.0], [7.0, 6.0], [4.0, 1.0], [np.nan, 6.0]],
+            columns=Index(["large", "small"], name=(7, "seven")),
+            index=MultiIndex.from_arrays(
+                [["bar", "bar", "foo", "foo"], ["one", "two"] * 2], names=["A", "B"]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_aggfunc_nunique_with_different_values(self):
+        test = DataFrame(
+            {
+                "a": range(10),
+                "b": range(10),
+                "c": range(10),
+                "d": range(10),
+            }
+        )
+
+        columnval = MultiIndex.from_arrays(
+            [
+                ["nunique" for i in range(10)],
+                ["c" for i in range(10)],
+                range(10),
+            ],
+            names=(None, None, "b"),
+        )
+        nparr = np.full((10, 10), np.nan)
+        np.fill_diagonal(nparr, 1.0)
+
+        expected = DataFrame(nparr, index=Index(range(10), name="a"), columns=columnval)
+        result = test.pivot_table(
+            index=[
+                "a",
+            ],
+            columns=[
+                "b",
+            ],
+            values=[
+                "c",
+            ],
+            aggfunc=["nunique"],
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_index_and_column_keys_with_nan(self, dropna):
+        # GH#61113
+        data = {"row": [None, *range(4)], "col": [*range(4), None], "val": range(5)}
+        df = DataFrame(data)
+        result = df.pivot_table(values="val", index="row", columns="col", dropna=dropna)
+        e_axis = [*range(4), None]
+        nan = np.nan
+        e_data = [
+            [nan, 1.0, nan, nan, nan],
+            [nan, nan, 2.0, nan, nan],
+            [nan, nan, nan, 3.0, nan],
+            [nan, nan, nan, nan, 4.0],
+            [0.0, nan, nan, nan, nan],
+        ]
+        expected = DataFrame(
+            data=e_data,
+            index=Index(data=e_axis, name="row"),
+            columns=Index(data=e_axis, name="col"),
+        )
+        if dropna:
+            expected = expected.loc[[0, 1, 2], [1, 2, 3]]
+
+        tm.assert_frame_equal(left=result, right=expected)
+
+    @pytest.mark.parametrize(
+        "index, columns, e_data, e_index, e_cols",
+        [
+            (
+                "Category",
+                "Value",
+                [
+                    [1.0, np.nan, 1.0, np.nan],
+                    [np.nan, 1.0, np.nan, 1.0],
+                ],
+                Index(data=["A", "B"], name="Category"),
+                Index(data=[10, 20, 40, 50], name="Value"),
+            ),
+            (
+                "Value",
+                "Category",
+                [
+                    [1.0, np.nan],
+                    [np.nan, 1.0],
+                    [1.0, np.nan],
+                    [np.nan, 1.0],
+                ],
+                Index(data=[10, 20, 40, 50], name="Value"),
+                Index(data=["A", "B"], name="Category"),
+            ),
+        ],
+        ids=["values-and-columns", "values-and-index"],
+    )
+    def test_pivot_table_values_as_two_params(
+        self, index, columns, e_data, e_index, e_cols
+    ):
+        # GH#57876
+        data = {"Category": ["A", "B", "A", "B"], "Value": [10, 20, 40, 50]}
+        df = DataFrame(data)
+        result = df.pivot_table(
+            index=index, columns=columns, values="Value", aggfunc="count"
+        )
+        expected = DataFrame(data=e_data, index=e_index, columns=e_cols)
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_margins_include_nan_groups(self):
+        # GH#61509
+        df = DataFrame(
+            {
+                "i": [1, 2, 3],
+                "g1": ["a", "b", "b"],
+                "g2": ["x", None, None],
+            }
+        )
+
+        result = df.pivot_table(
+            index="g1",
+            columns="g2",
+            values="i",
+            aggfunc="count",
+            dropna=False,
+            margins=True,
+        )
+
+        expected = DataFrame(
+            {
+                "x": {"a": 1.0, "b": np.nan, "All": 1.0},
+                np.nan: {"a": np.nan, "b": 2.0, "All": 2.0},
+                "All": {"a": 1.0, "b": 2.0, "All": 3.0},
+            }
+        )
+        expected.index.name = "g1"
+        expected.columns.name = "g2"
+        tm.assert_frame_equal(result, expected, check_dtype=False)
+
+
+class TestPivot:
+    def test_pivot(self):
+        data = {
+            "index": ["A", "B", "C", "C", "B", "A"],
+            "columns": ["One", "One", "One", "Two", "Two", "Two"],
+            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
+        }
+
+        frame = DataFrame(data)
+        pivoted = frame.pivot(index="index", columns="columns", values="values")
+
+        expected = DataFrame(
+            {
+                "One": {"A": 1.0, "B": 2.0, "C": 3.0},
+                "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
+            }
+        )
+
+        expected.index.name, expected.columns.name = "index", "columns"
+        tm.assert_frame_equal(pivoted, expected)
+
+        # name tracking
+        assert pivoted.index.name == "index"
+        assert pivoted.columns.name == "columns"
+
+        # don't specify values
+        pivoted = frame.pivot(index="index", columns="columns")
+        assert pivoted.index.name == "index"
+        assert pivoted.columns.names == (None, "columns")
+
+    def test_pivot_duplicates(self):
+        data = DataFrame(
+            {
+                "a": ["bar", "bar", "foo", "foo", "foo"],
+                "b": ["one", "two", "one", "one", "two"],
+                "c": [1.0, 2.0, 3.0, 3.0, 4.0],
+            }
+        )
+        with pytest.raises(ValueError, match="duplicate entries"):
+            data.pivot(index="a", columns="b", values="c")
+
+    def test_pivot_empty(self):
+        df = DataFrame(columns=["a", "b", "c"])
+        result = df.pivot(index="a", columns="b", values="c")
+        expected = DataFrame(index=[], columns=[])
+        tm.assert_frame_equal(result, expected, check_names=False)
+
+    def test_pivot_integer_bug(self, any_string_dtype):
+        df = DataFrame(
+            data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype
+        )
+
+        result = df.pivot(index=1, columns=0, values=2)
+        expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype)
+        tm.assert_index_equal(result.columns, expected_columns)
+
+    def test_pivot_index_none(self):
+        # GH#3962
+        data = {
+            "index": ["A", "B", "C", "C", "B", "A"],
+            "columns": ["One", "One", "One", "Two", "Two", "Two"],
+            "values": [1.0, 2.0, 3.0, 3.0, 2.0, 1.0],
+        }
+
+        frame = DataFrame(data).set_index("index")
+        result = frame.pivot(columns="columns", values="values")
+        expected = DataFrame(
+            {
+                "One": {"A": 1.0, "B": 2.0, "C": 3.0},
+                "Two": {"A": 1.0, "B": 2.0, "C": 3.0},
+            }
+        )
+
+        expected.index.name, expected.columns.name = "index", "columns"
+        tm.assert_frame_equal(result, expected)
+
+        # omit values
+        result = frame.pivot(columns="columns")
+
+        expected.columns = MultiIndex.from_tuples(
+            [("values", "One"), ("values", "Two")], names=[None, "columns"]
+        )
+        expected.index.name = "index"
+        tm.assert_frame_equal(result, expected, check_names=False)
+        assert result.index.name == "index"
+        assert result.columns.names == (None, "columns")
+        expected.columns = expected.columns.droplevel(0)
+        result = frame.pivot(columns="columns", values="values")
+
+        expected.columns.name = "columns"
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_index_list_values_none_immutable_args(self):
+        # GH37635
+        df = DataFrame(
+            {
+                "lev1": [1, 1, 1, 2, 2, 2],
+                "lev2": [1, 1, 2, 1, 1, 2],
+                "lev3": [1, 2, 1, 2, 1, 2],
+                "lev4": [1, 2, 3, 4, 5, 6],
+                "values": [0, 1, 2, 3, 4, 5],
+            }
+        )
+        index = ["lev1", "lev2"]
+        columns = ["lev3"]
+        result = df.pivot(index=index, columns=columns)
+
+        expected = DataFrame(
+            np.array(
+                [
+                    [1.0, 2.0, 0.0, 1.0],
+                    [3.0, np.nan, 2.0, np.nan],
+                    [5.0, 4.0, 4.0, 3.0],
+                    [np.nan, 6.0, np.nan, 5.0],
+                ]
+            ),
+            index=MultiIndex.from_arrays(
+                [(1, 1, 2, 2), (1, 2, 1, 2)], names=["lev1", "lev2"]
+            ),
+            columns=MultiIndex.from_arrays(
+                [("lev4", "lev4", "values", "values"), (1, 2, 1, 2)],
+                names=[None, "lev3"],
+            ),
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+        assert index == ["lev1", "lev2"]
+        assert columns == ["lev3"]
+
+    def test_pivot_columns_not_given(self):
+        # GH#48293
+        df = DataFrame({"a": [1], "b": 1})
+        with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
+            df.pivot()
+
+    # this still fails because columns=None gets passed down to unstack as level=None
+    # while at that point None was converted to NaN
+    @pytest.mark.xfail(
+        using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
+    )
+    def test_pivot_columns_is_none(self):
+        # GH#48293
+        df = DataFrame({None: [1], "b": 2, "c": 3})
+        result = df.pivot(columns=None)
+        expected = DataFrame({("b", 1): [2], ("c", 1): 3})
+        tm.assert_frame_equal(result, expected)
+
+        result = df.pivot(columns=None, index="b")
+        expected = DataFrame({("c", 1): 3}, index=Index([2], name="b"))
+        tm.assert_frame_equal(result, expected)
+
+        result = df.pivot(columns=None, index="b", values="c")
+        expected = DataFrame({1: 3}, index=Index([2], name="b"))
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_index_is_none(self, using_infer_string):
+        # GH#48293
+        df = DataFrame({None: [1], "b": 2, "c": 3})
+
+        result = df.pivot(columns="b", index=None)
+        expected = DataFrame({("c", 2): 3}, index=[1])
+        expected.columns.names = [None, "b"]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.pivot(columns="b", index=None, values="c")
+        expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
+        if using_infer_string:
+            expected.index.name = np.nan
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_values_is_none(self):
+        # GH#48293
+        df = DataFrame({None: [1], "b": 2, "c": 3})
+
+        result = df.pivot(columns="b", index="c", values=None)
+        expected = DataFrame(
+            1, index=Index([3], name="c"), columns=Index([2], name="b")
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = df.pivot(columns="b", values=None)
+        expected = DataFrame(1, index=[0], columns=Index([2], name="b"))
+        tm.assert_frame_equal(result, expected)
+
+    def test_pivot_not_changing_index_name(self):
+        # GH#52692
+        df = DataFrame({"one": ["a"], "two": 0, "three": 1})
+        expected = df.copy(deep=True)
+        df.pivot(index="one", columns="two", values="three")
+        tm.assert_frame_equal(df, expected)
+
+    def test_pivot_table_empty_dataframe_correct_index(self):
+        # GH 21932
+        df = DataFrame([], columns=["a", "b", "value"])
+        pivot = df.pivot_table(index="a", columns="b", values="value", aggfunc="count")
+
+        expected = Index([], dtype="object", name="b")
+        tm.assert_index_equal(pivot.columns, expected)
+
+    def test_pivot_table_handles_explicit_datetime_types(self):
+        # GH#43574
+        df = DataFrame(
+            [
+                {"a": "x", "date_str": "2023-01-01", "amount": 1},
+                {"a": "y", "date_str": "2023-01-02", "amount": 2},
+                {"a": "z", "date_str": "2023-01-03", "amount": 3},
+            ]
+        )
+        df["date"] = pd.to_datetime(df["date_str"])
+
+        with tm.assert_produces_warning(False):
+            pivot = df.pivot_table(
+                index=["a", "date"], values=["amount"], aggfunc="sum", margins=True
+            )
+
+        expected = MultiIndex.from_tuples(
+            [
+                ("x", datetime.strptime("2023-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")),
+                ("y", datetime.strptime("2023-01-02 00:00:00", "%Y-%m-%d %H:%M:%S")),
+                ("z", datetime.strptime("2023-01-03 00:00:00", "%Y-%m-%d %H:%M:%S")),
+                ("All", ""),
+            ],
+            names=["a", "date"],
+        )
+        tm.assert_index_equal(pivot.index, expected)
+
+    def test_pivot_table_with_margins_and_numeric_column_names(self):
+        # GH#26568
+        df = DataFrame([["a", "x", 1], ["a", "y", 2], ["b", "y", 3], ["b", "z", 4]])
+
+        result = df.pivot_table(
+            index=0, columns=1, values=2, aggfunc="sum", fill_value=0, margins=True
+        )
+
+        expected = DataFrame(
+            [[1, 2, 0, 3], [0, 3, 4, 7], [1, 5, 4, 10]],
+            columns=Index(["x", "y", "z", "All"], name=1),
+            index=Index(["a", "b", "All"], name=0),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("m", [1, 10])
+    def test_unstack_copy(self, m):
+        # GH#56633
+        levels = np.arange(m)
+        index = MultiIndex.from_product([levels] * 2)
+        values = np.arange(m * m * 100).reshape(m * m, 100)
+        df = DataFrame(values, index, np.arange(100))
+        df_orig = df.copy()
+        result = df.unstack(sort=False)
+        result.iloc[0, 0] = -1
+        tm.assert_frame_equal(df, df_orig)
+
+    def test_pivot_empty_with_datetime(self):
+        # GH#59126
+        df = DataFrame(
+            {
+                "timestamp": Series([], dtype=pd.DatetimeTZDtype(tz="UTC")),
+                "category": Series([], dtype=str),
+                "value": Series([], dtype=str),
+            }
+        )
+        df_pivoted = df.pivot_table(
+            index="category", columns="value", values="timestamp"
+        )
+        assert df_pivoted.empty
+
+    def test_pivot_margins_with_none_index(self):
+        # GH#58722
+        df = DataFrame(
+            {
+                "x": [1, 1, 2],
+                "y": [3, 3, 4],
+                "z": [5, 5, 6],
+                "w": [7, 8, 9],
+            }
+        )
+        result = df.pivot_table(
+            index=None,
+            columns=["y", "z"],
+            values="w",
+            margins=True,
+            aggfunc="count",
+        )
+        expected = DataFrame(
+            [[2, 2, 1, 1]],
+            index=["w"],
+            columns=MultiIndex(
+                levels=[[3, 4], [5, 6, "All"]],
+                codes=[[0, 0, 1, 1], [0, 2, 1, 2]],
+                names=["y", "z"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+    def test_pivot_with_pyarrow_categorical(self):
+        # GH#53051
+        pa = pytest.importorskip("pyarrow")
+
+        df = DataFrame(
+            {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
+        ).astype(
+            {
+                "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
+                "number_column": "float[pyarrow]",
+            }
+        )
+
+        df = df.pivot(columns=["string_column"], values=["number_column"])
+
+        multi_index = MultiIndex.from_arrays(
+            [["number_column", "number_column", "number_column"], ["A", "B", "C"]],
+            names=(None, "string_column"),
+        )
+        df_expected = DataFrame(
+            [[1.0, np.nan, np.nan], [np.nan, 2.0, np.nan], [np.nan, np.nan, 3.0]],
+            columns=multi_index,
+        )
+        tm.assert_frame_equal(
+            df, df_expected, check_dtype=False, check_column_type=False
+        )
+
+    @pytest.mark.parametrize("freq", ["D", "M", "Q", "Y"])
+    def test_pivot_empty_dataframe_period_dtype(self, freq):
+        # GH#62705
+
+        dtype = pd.PeriodDtype(freq=freq)
+        df = DataFrame({"index": [], "columns": [], "values": []})
+        df = df.astype({"values": dtype})
+        result = df.pivot(index="index", columns="columns", values="values")
+
+        expected_index = Index([], name="index", dtype="float64")
+        expected_columns = Index([], name="columns", dtype="float64")
+        expected = DataFrame(
+            index=expected_index, columns=expected_columns, dtype=dtype
+        )
+
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_pivot_multilevel.py b/pandas/tests/reshape/test_pivot_multilevel.py
new file mode 100644
index 0000000000000000000000000000000000000000..af70210b37f3c0396b1dd972f64822001c04dc50
--- /dev/null
+++ b/pandas/tests/reshape/test_pivot_multilevel.py
@@ -0,0 +1,301 @@
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+
+import pandas as pd
+from pandas import (
+    Index,
+    MultiIndex,
+)
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "input_index, input_columns, input_values, "
+    "expected_values, expected_columns, expected_index",
+    [
+        (
+            ["lev4"],
+            "lev3",
+            "values",
+            [
+                [0.0, np.nan],
+                [np.nan, 1.0],
+                [2.0, np.nan],
+                [np.nan, 3.0],
+                [4.0, np.nan],
+                [np.nan, 5.0],
+                [6.0, np.nan],
+                [np.nan, 7.0],
+            ],
+            Index([1, 2], name="lev3"),
+            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
+        ),
+        (
+            ["lev4"],
+            "lev3",
+            lib.no_default,
+            [
+                [1.0, np.nan, 1.0, np.nan, 0.0, np.nan],
+                [np.nan, 1.0, np.nan, 1.0, np.nan, 1.0],
+                [1.0, np.nan, 2.0, np.nan, 2.0, np.nan],
+                [np.nan, 1.0, np.nan, 2.0, np.nan, 3.0],
+                [2.0, np.nan, 1.0, np.nan, 4.0, np.nan],
+                [np.nan, 2.0, np.nan, 1.0, np.nan, 5.0],
+                [2.0, np.nan, 2.0, np.nan, 6.0, np.nan],
+                [np.nan, 2.0, np.nan, 2.0, np.nan, 7.0],
+            ],
+            MultiIndex.from_tuples(
+                [
+                    ("lev1", 1),
+                    ("lev1", 2),
+                    ("lev2", 1),
+                    ("lev2", 2),
+                    ("values", 1),
+                    ("values", 2),
+                ],
+                names=[None, "lev3"],
+            ),
+            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
+        ),
+        (
+            ["lev1", "lev2"],
+            "lev3",
+            "values",
+            [[0, 1], [2, 3], [4, 5], [6, 7]],
+            Index([1, 2], name="lev3"),
+            MultiIndex.from_tuples(
+                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
+            ),
+        ),
+        (
+            ["lev1", "lev2"],
+            "lev3",
+            lib.no_default,
+            [[1, 2, 0, 1], [3, 4, 2, 3], [5, 6, 4, 5], [7, 8, 6, 7]],
+            MultiIndex.from_tuples(
+                [("lev4", 1), ("lev4", 2), ("values", 1), ("values", 2)],
+                names=[None, "lev3"],
+            ),
+            MultiIndex.from_tuples(
+                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
+            ),
+        ),
+    ],
+)
+def test_pivot_list_like_index(
+    input_index,
+    input_columns,
+    input_values,
+    expected_values,
+    expected_columns,
+    expected_index,
+):
+    # GH 21425, test when index is given a list
+    df = pd.DataFrame(
+        {
+            "lev1": [1, 1, 1, 1, 2, 2, 2, 2],
+            "lev2": [1, 1, 2, 2, 1, 1, 2, 2],
+            "lev3": [1, 2, 1, 2, 1, 2, 1, 2],
+            "lev4": [1, 2, 3, 4, 5, 6, 7, 8],
+            "values": [0, 1, 2, 3, 4, 5, 6, 7],
+        }
+    )
+
+    result = df.pivot(index=input_index, columns=input_columns, values=input_values)
+    expected = pd.DataFrame(
+        expected_values, columns=expected_columns, index=expected_index
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "input_index, input_columns, input_values, "
+    "expected_values, expected_columns, expected_index",
+    [
+        (
+            "lev4",
+            ["lev3"],
+            "values",
+            [
+                [0.0, np.nan],
+                [np.nan, 1.0],
+                [2.0, np.nan],
+                [np.nan, 3.0],
+                [4.0, np.nan],
+                [np.nan, 5.0],
+                [6.0, np.nan],
+                [np.nan, 7.0],
+            ],
+            Index([1, 2], name="lev3"),
+            Index([1, 2, 3, 4, 5, 6, 7, 8], name="lev4"),
+        ),
+        (
+            ["lev1", "lev2"],
+            ["lev3"],
+            "values",
+            [[0, 1], [2, 3], [4, 5], [6, 7]],
+            Index([1, 2], name="lev3"),
+            MultiIndex.from_tuples(
+                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
+            ),
+        ),
+        (
+            ["lev1"],
+            ["lev2", "lev3"],
+            "values",
+            [[0, 1, 2, 3], [4, 5, 6, 7]],
+            MultiIndex.from_tuples(
+                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev2", "lev3"]
+            ),
+            Index([1, 2], name="lev1"),
+        ),
+        (
+            ["lev1", "lev2"],
+            ["lev3", "lev4"],
+            "values",
+            [
+                [0.0, 1.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
+                [np.nan, np.nan, 2.0, 3.0, np.nan, np.nan, np.nan, np.nan],
+                [np.nan, np.nan, np.nan, np.nan, 4.0, 5.0, np.nan, np.nan],
+                [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 6.0, 7.0],
+            ],
+            MultiIndex.from_tuples(
+                [(1, 1), (2, 2), (1, 3), (2, 4), (1, 5), (2, 6), (1, 7), (2, 8)],
+                names=["lev3", "lev4"],
+            ),
+            MultiIndex.from_tuples(
+                [(1, 1), (1, 2), (2, 1), (2, 2)], names=["lev1", "lev2"]
+            ),
+        ),
+    ],
+)
+def test_pivot_list_like_columns(
+    input_index,
+    input_columns,
+    input_values,
+    expected_values,
+    expected_columns,
+    expected_index,
+):
+    # GH 21425, test when columns is given a list
+    df = pd.DataFrame(
+        {
+            "lev1": [1, 1, 1, 1, 2, 2, 2, 2],
+            "lev2": [1, 1, 2, 2, 1, 1, 2, 2],
+            "lev3": [1, 2, 1, 2, 1, 2, 1, 2],
+            "lev4": [1, 2, 3, 4, 5, 6, 7, 8],
+            "values": [0, 1, 2, 3, 4, 5, 6, 7],
+        }
+    )
+
+    result = df.pivot(index=input_index, columns=input_columns, values=input_values)
+    expected = pd.DataFrame(
+        expected_values, columns=expected_columns, index=expected_index
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_pivot_multiindexed_rows_and_cols():
+    # GH 36360
+
+    df = pd.DataFrame(
+        data=np.arange(12).reshape(4, 3),
+        columns=MultiIndex.from_tuples(
+            [(0, 0), (0, 1), (0, 2)], names=["col_L0", "col_L1"]
+        ),
+        index=MultiIndex.from_tuples(
+            [(0, 0, 0), (0, 0, 1), (1, 1, 1), (1, 0, 0)],
+            names=["idx_L0", "idx_L1", "idx_L2"],
+        ),
+    )
+
+    res = df.pivot_table(
+        index=["idx_L0"],
+        columns=["idx_L1"],
+        values=[(0, 1)],
+        aggfunc=lambda col: col.values.sum(),
+    )
+
+    expected = pd.DataFrame(
+        data=[[5, np.nan], [10, 7.0]],
+        columns=MultiIndex.from_tuples(
+            [(0, 1, 0), (0, 1, 1)], names=["col_L0", "col_L1", "idx_L1"]
+        ),
+        index=Index([0, 1], dtype="int64", name="idx_L0"),
+    )
+    expected = expected.astype("float64")
+
+    tm.assert_frame_equal(res, expected)
+
+
+def test_pivot_df_multiindex_index_none():
+    # GH 23955
+    df = pd.DataFrame(
+        [
+            ["A", "A1", "label1", 1],
+            ["A", "A2", "label2", 2],
+            ["B", "A1", "label1", 3],
+            ["B", "A2", "label2", 4],
+        ],
+        columns=["index_1", "index_2", "label", "value"],
+    )
+    df = df.set_index(["index_1", "index_2"])
+
+    result = df.pivot(columns="label", values="value")
+    expected = pd.DataFrame(
+        [[1.0, np.nan], [np.nan, 2.0], [3.0, np.nan], [np.nan, 4.0]],
+        index=df.index,
+        columns=Index(["label1", "label2"], name="label"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index, columns, e_data, e_index, e_cols",
+    [
+        (
+            "index",
+            ["col", "value"],
+            [
+                [50.0, np.nan, 100.0, np.nan],
+                [np.nan, 100.0, np.nan, 200.0],
+            ],
+            Index(data=["A", "B"], name="index"),
+            MultiIndex.from_arrays(
+                arrays=[[1, 1, 2, 2], [50, 100, 100, 200]], names=["col", "value"]
+            ),
+        ),
+        (
+            ["index", "value"],
+            "col",
+            [
+                [50.0, np.nan],
+                [np.nan, 100.0],
+                [100.0, np.nan],
+                [np.nan, 200.0],
+            ],
+            MultiIndex.from_arrays(
+                arrays=[["A", "A", "B", "B"], [50, 100, 100, 200]],
+                names=["index", "value"],
+            ),
+            Index(data=[1, 2], name="col"),
+        ),
+    ],
+    ids=["values-and-columns", "values-and-index"],
+)
+def test_pivot_table_multiindex_values_as_two_params(
+    index, columns, e_data, e_index, e_cols
+):
+    # GH#61292
+    data = [
+        ["A", 1, 50, -1],
+        ["B", 1, 100, -2],
+        ["A", 2, 100, -2],
+        ["B", 2, 200, -4],
+    ]
+    df = pd.DataFrame(data=data, columns=["index", "col", "value", "extra"])
+    result = df.pivot_table(values="value", index=index, columns=columns)
+    expected = pd.DataFrame(data=e_data, index=e_index, columns=e_cols)
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
new file mode 100644
index 0000000000000000000000000000000000000000..51617bc3536807fca79f06c8476dba0694f5d445
--- /dev/null
+++ b/pandas/tests/reshape/test_qcut.py
@@ -0,0 +1,308 @@
+import os
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DatetimeIndex,
+    Interval,
+    IntervalIndex,
+    NaT,
+    Series,
+    Timedelta,
+    TimedeltaIndex,
+    Timestamp,
+    cut,
+    date_range,
+    isna,
+    qcut,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.api.types import CategoricalDtype
+
+from pandas.tseries.offsets import Day
+
+
+def test_qcut():
+    arr = np.random.default_rng(2).standard_normal(1000)
+
+    # We store the bins as Index that have been
+    # rounded to comparisons are a bit tricky.
+    labels, _ = qcut(arr, 4, retbins=True)
+    ex_bins = np.quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
+
+    result = labels.categories.left.values
+    assert np.allclose(result, ex_bins[:-1], atol=1e-2)
+
+    result = labels.categories.right.values
+    assert np.allclose(result, ex_bins[1:], atol=1e-2)
+
+    ex_levels = cut(arr, ex_bins, include_lowest=True)
+    tm.assert_categorical_equal(labels, ex_levels)
+
+
+def test_qcut_bounds():
+    arr = np.random.default_rng(2).standard_normal(1000)
+
+    factor = qcut(arr, 10, labels=False)
+    assert len(np.unique(factor)) == 10
+
+
+def test_qcut_specify_quantiles():
+    arr = np.random.default_rng(2).standard_normal(100)
+    factor = qcut(arr, [0, 0.25, 0.5, 0.75, 1.0])
+
+    expected = qcut(arr, 4)
+    tm.assert_categorical_equal(factor, expected)
+
+
+def test_qcut_all_bins_same():
+    with pytest.raises(ValueError, match="edges.*unique"):
+        qcut([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
+
+
+def test_qcut_include_lowest():
+    values = np.arange(10)
+    ii = qcut(values, 4)
+
+    ex_levels = IntervalIndex(
+        [
+            Interval(-0.001, 2.25),
+            Interval(2.25, 4.5),
+            Interval(4.5, 6.75),
+            Interval(6.75, 9),
+        ]
+    )
+    tm.assert_index_equal(ii.categories, ex_levels)
+
+
+def test_qcut_nas():
+    arr = np.random.default_rng(2).standard_normal(100)
+    arr[:20] = np.nan
+
+    result = qcut(arr, 4)
+    assert isna(result[:20]).all()
+
+
+def test_qcut_index():
+    result = qcut([0, 2], 2)
+    intervals = [Interval(-0.001, 1), Interval(1, 2)]
+
+    expected = Categorical(intervals, ordered=True)
+    tm.assert_categorical_equal(result, expected)
+
+
+def test_qcut_binning_issues(datapath):
+    # see gh-1978, gh-1979
+    cut_file = datapath(os.path.join("reshape", "data", "cut_data.csv"))
+    arr = np.loadtxt(cut_file)
+    result = qcut(arr, 20)
+
+    starts = result.categories.left
+    ends = result.categories.right
+    assert (starts < ends).all()
+    assert (starts[1:] <= ends[:-1]).all()
+
+
+def test_qcut_return_intervals():
+    ser = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
+    res = qcut(ser, [0, 0.333, 0.666, 1])
+
+    exp_levels = np.array(
+        [Interval(-0.001, 2.664), Interval(2.664, 5.328), Interval(5.328, 8)]
+    )
+    exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
+        CategoricalDtype(ordered=True)
+    )
+    tm.assert_series_equal(res, exp)
+
+
+@pytest.mark.parametrize("labels", ["foo", 1, True])
+def test_qcut_incorrect_labels(labels):
+    # GH 13318
+    values = range(5)
+    msg = "Bin labels must either be False, None or passed in as a list-like argument"
+    with pytest.raises(ValueError, match=msg):
+        qcut(values, 4, labels=labels)
+
+
+@pytest.mark.parametrize("labels", [["a", "b", "c"], list(range(3))])
+def test_qcut_wrong_length_labels(labels):
+    # GH 13318
+    values = range(10)
+    msg = "Bin labels must be one fewer than the number of bin edges"
+    with pytest.raises(ValueError, match=msg):
+        qcut(values, 4, labels=labels)
+
+
+@pytest.mark.parametrize(
+    "labels, expected",
+    [
+        (["a", "b", "c"], ["a", "b", "c"]),
+        (list(range(3)), [0, 1, 2]),
+    ],
+)
+def test_qcut_list_like_labels(labels, expected):
+    # GH 13318
+    values = range(3)
+    result = qcut(values, 3, labels=labels)
+    expected = Categorical(expected, ordered=True)
+    tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "kwargs,msg",
+    [
+        ({"duplicates": "drop"}, None),
+        ({}, "Bin edges must be unique"),
+        ({"duplicates": "raise"}, "Bin edges must be unique"),
+        ({"duplicates": "foo"}, "invalid value for 'duplicates' parameter"),
+    ],
+)
+def test_qcut_duplicates_bin(kwargs, msg):
+    # see gh-7751
+    values = [0, 0, 0, 0, 1, 2, 3]
+
+    if msg is not None:
+        with pytest.raises(ValueError, match=msg):
+            qcut(values, 3, **kwargs)
+    else:
+        result = qcut(values, 3, **kwargs)
+        expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
+        tm.assert_index_equal(result.categories, expected)
+
+
+@pytest.mark.parametrize(
+    "data,start,end", [(9.0, 8.999, 9.0), (0.0, -0.001, 0.0), (-9.0, -9.001, -9.0)]
+)
+@pytest.mark.parametrize("length", [1, 2])
+@pytest.mark.parametrize("labels", [None, False])
+def test_single_quantile(data, start, end, length, labels):
+    # see gh-15431
+    ser = Series([data] * length)
+    result = qcut(ser, 1, labels=labels)
+
+    if labels is None:
+        intervals = IntervalIndex([Interval(start, end)] * length, closed="right")
+        expected = Series(intervals).astype(CategoricalDtype(ordered=True))
+    else:
+        expected = Series([0] * length, dtype=np.intp)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ser",
+    [
+        DatetimeIndex(["20180101", NaT, "20180103"]),
+        TimedeltaIndex(["0 days", NaT, "2 days"]),
+    ],
+    ids=lambda x: str(x.dtype),
+)
+def test_qcut_nat(ser, unit):
+    # see gh-19768
+    ser = Series(ser)
+    ser = ser.dt.as_unit(unit)
+    td = Timedelta(1, unit=unit).as_unit(unit)
+
+    left = Series([ser[0] - td, np.nan, ser[2] - Day()], dtype=ser.dtype)
+    right = Series([ser[2] - Day(), np.nan, ser[2]], dtype=ser.dtype)
+    intervals = IntervalIndex.from_arrays(left, right)
+    expected = Series(Categorical(intervals, ordered=True))
+
+    result = qcut(ser, 2)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("bins", [3, np.linspace(0, 1, 4)])
+def test_datetime_tz_qcut(bins):
+    # see gh-19872
+    tz = "US/Eastern"
+    ser = Series(date_range("20130101", periods=3, tz=tz, unit="ns"))
+
+    result = qcut(ser, bins)
+    expected = Series(
+        IntervalIndex(
+            [
+                Interval(
+                    Timestamp("2012-12-31 23:59:59.999999999", tz=tz),
+                    Timestamp("2013-01-01 16:00:00", tz=tz),
+                ),
+                Interval(
+                    Timestamp("2013-01-01 16:00:00", tz=tz),
+                    Timestamp("2013-01-02 08:00:00", tz=tz),
+                ),
+                Interval(
+                    Timestamp("2013-01-02 08:00:00", tz=tz),
+                    Timestamp("2013-01-03 00:00:00", tz=tz),
+                ),
+            ]
+        )
+    ).astype(CategoricalDtype(ordered=True))
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "arg,expected_bins",
+    [
+        [
+            timedelta_range("1day", periods=3),
+            TimedeltaIndex(["1 days", "2 days", "3 days"]),
+        ],
+        [
+            date_range("20180101", periods=3),
+            DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"]),
+        ],
+    ],
+)
+def test_date_like_qcut_bins(arg, expected_bins, unit):
+    # see gh-19891
+    arg = arg.as_unit(unit)
+    expected_bins = expected_bins.as_unit(unit)
+    ser = Series(arg)
+    result, result_bins = qcut(ser, 2, retbins=True)
+    tm.assert_index_equal(result_bins, expected_bins)
+
+
+@pytest.mark.parametrize("bins", [6, 7])
+@pytest.mark.parametrize(
+    "box, compare",
+    [
+        (Series, tm.assert_series_equal),
+        (np.array, tm.assert_categorical_equal),
+        (list, tm.assert_equal),
+    ],
+)
+def test_qcut_bool_coercion_to_int(bins, box, compare):
+    # issue 20303
+    data_expected = box([0, 1, 1, 0, 1] * 10)
+    data_result = box([False, True, True, False, True] * 10)
+    expected = qcut(data_expected, bins, duplicates="drop")
+    result = qcut(data_result, bins, duplicates="drop")
+    compare(result, expected)
+
+
+@pytest.mark.parametrize("q", [2, 5, 10])
+def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
+    arr = pd.array(np.arange(100), dtype=any_numeric_ea_dtype)
+    arr[::2] = pd.NA
+
+    result = qcut(arr, q)
+    expected = qcut(arr.astype(float), q)
+
+    tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0])
+@pytest.mark.parametrize("q", [3, 7, 9])
+@pytest.mark.parametrize("precision", [1, 3, 16])
+def test_qcut_contains(scale, q, precision):
+    # GH-59355
+    arr = (scale * np.arange(q + 1)).round(precision)
+    result = qcut(arr, q, precision=precision)
+
+    for value, bucket in zip(arr, result):
+        assert value in bucket
diff --git a/pandas/tests/reshape/test_union_categoricals.py b/pandas/tests/reshape/test_union_categoricals.py
new file mode 100644
index 0000000000000000000000000000000000000000..081feae6fc43fef590d546ad040d4dc0f412028d
--- /dev/null
+++ b/pandas/tests/reshape/test_union_categoricals.py
@@ -0,0 +1,369 @@
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.concat import union_categoricals
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    Series,
+)
+import pandas._testing as tm
+
+
+class TestUnionCategoricals:
+    @pytest.mark.parametrize(
+        "a, b, combined",
+        [
+            (list("abc"), list("abd"), list("abcabd")),
+            ([0, 1, 2], [2, 3, 4], [0, 1, 2, 2, 3, 4]),
+            ([0, 1.2, 2], [2, 3.4, 4], [0, 1.2, 2, 2, 3.4, 4]),
+            (
+                ["b", "b", np.nan, "a"],
+                ["a", np.nan, "c"],
+                ["b", "b", np.nan, "a", "a", np.nan, "c"],
+            ),
+            (
+                pd.date_range("2014-01-01", "2014-01-05"),
+                pd.date_range("2014-01-06", "2014-01-07"),
+                pd.date_range("2014-01-01", "2014-01-07"),
+            ),
+            (
+                pd.date_range("2014-01-01", "2014-01-05", tz="US/Central"),
+                pd.date_range("2014-01-06", "2014-01-07", tz="US/Central"),
+                pd.date_range("2014-01-01", "2014-01-07", tz="US/Central"),
+            ),
+            (
+                pd.period_range("2014-01-01", "2014-01-05"),
+                pd.period_range("2014-01-06", "2014-01-07"),
+                pd.period_range("2014-01-01", "2014-01-07"),
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("box", [Categorical, CategoricalIndex, Series])
+    def test_union_categorical(self, a, b, combined, box):
+        # GH 13361
+        result = union_categoricals([box(Categorical(a)), box(Categorical(b))])
+        expected = Categorical(combined)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categorical_ordered_appearance(self):
+        # new categories ordered by appearance
+        s = Categorical(["x", "y", "z"])
+        s2 = Categorical(["a", "b", "c"])
+        result = union_categoricals([s, s2])
+        expected = Categorical(
+            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
+        )
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categorical_ordered_true(self):
+        s = Categorical([0, 1.2, 2], ordered=True)
+        s2 = Categorical([0, 1.2, 2], ordered=True)
+        result = union_categoricals([s, s2])
+        expected = Categorical([0, 1.2, 2, 0, 1.2, 2], ordered=True)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categorical_match_types(self):
+        # must exactly match types
+        s = Categorical([0, 1.2, 2])
+        s2 = Categorical([2, 3, 4])
+        msg = "dtype of categories must be the same"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([s, s2])
+
+    def test_union_categorical_empty(self):
+        msg = "No Categoricals to union"
+        with pytest.raises(ValueError, match=msg):
+            union_categoricals([])
+
+    def test_union_categoricals_nan(self):
+        # GH 13759
+        res = union_categoricals(
+            [Categorical([1, 2, np.nan]), Categorical([3, 2, np.nan])]
+        )
+        exp = Categorical([1, 2, np.nan, 3, 2, np.nan])
+        tm.assert_categorical_equal(res, exp)
+
+        res = union_categoricals(
+            [Categorical(["A", "B"]), Categorical(["B", "B", np.nan])]
+        )
+        exp = Categorical(["A", "B", "B", "B", np.nan])
+        tm.assert_categorical_equal(res, exp)
+
+        val1 = [pd.Timestamp("2011-01-01"), pd.Timestamp("2011-03-01"), pd.NaT]
+        val2 = [pd.NaT, pd.Timestamp("2011-01-01"), pd.Timestamp("2011-02-01")]
+
+        res = union_categoricals([Categorical(val1), Categorical(val2)])
+        exp = Categorical(
+            val1 + val2,
+            categories=[
+                pd.Timestamp("2011-01-01"),
+                pd.Timestamp("2011-03-01"),
+                pd.Timestamp("2011-02-01"),
+            ],
+        )
+        tm.assert_categorical_equal(res, exp)
+
+        # all NaN
+        res = union_categoricals(
+            [
+                Categorical(np.array([np.nan, np.nan], dtype=object)),
+                Categorical(["X"], categories=pd.Index(["X"], dtype=object)),
+            ]
+        )
+        exp = Categorical([np.nan, np.nan, "X"])
+        tm.assert_categorical_equal(res, exp)
+
+        res = union_categoricals(
+            [Categorical([np.nan, np.nan]), Categorical([np.nan, np.nan])]
+        )
+        exp = Categorical([np.nan, np.nan, np.nan, np.nan])
+        tm.assert_categorical_equal(res, exp)
+
+    @pytest.mark.parametrize("val", [[], ["1"]])
+    def test_union_categoricals_empty(self, val, request, using_infer_string):
+        # GH 13759
+        if using_infer_string and val == ["1"]:
+            request.applymarker(
+                pytest.mark.xfail(
+                    reason="TDOD(infer_string) object and strings dont match"
+                )
+            )
+        res = union_categoricals([Categorical([]), Categorical(val)])
+        exp = Categorical(val)
+        tm.assert_categorical_equal(res, exp)
+
+    def test_union_categorical_same_category(self):
+        # check fastpath
+        c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
+        c2 = Categorical([3, 2, 1, np.nan], categories=[1, 2, 3, 4])
+        res = union_categoricals([c1, c2])
+        exp = Categorical([1, 2, 3, 4, 3, 2, 1, np.nan], categories=[1, 2, 3, 4])
+        tm.assert_categorical_equal(res, exp)
+
+    def test_union_categorical_same_category_str(self):
+        c1 = Categorical(["z", "z", "z"], categories=["x", "y", "z"])
+        c2 = Categorical(["x", "x", "x"], categories=["x", "y", "z"])
+        res = union_categoricals([c1, c2])
+        exp = Categorical(["z", "z", "z", "x", "x", "x"], categories=["x", "y", "z"])
+        tm.assert_categorical_equal(res, exp)
+
+    def test_union_categorical_same_categories_different_order(self):
+        # https://github.com/pandas-dev/pandas/issues/19096
+        c1 = Categorical(["a", "b", "c"], categories=["a", "b", "c"])
+        c2 = Categorical(["a", "b", "c"], categories=["b", "a", "c"])
+        result = union_categoricals([c1, c2])
+        expected = Categorical(
+            ["a", "b", "c", "a", "b", "c"], categories=["a", "b", "c"]
+        )
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_ordered(self):
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([1, 2, 3], ordered=False)
+
+        msg = "Categorical.ordered must be the same"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, c2])
+
+        res = union_categoricals([c1, c1])
+        exp = Categorical([1, 2, 3, 1, 2, 3], ordered=True)
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
+        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
+
+        res = union_categoricals([c1, c2])
+        exp = Categorical([1, 2, 3, np.nan, 3, 2], ordered=True)
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
+
+        msg = "to union ordered Categoricals, all categories must be the same"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, c2])
+
+    def test_union_categoricals_ignore_order(self):
+        # GH 15219
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([1, 2, 3], ordered=False)
+
+        res = union_categoricals([c1, c2], ignore_order=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        msg = "Categorical.ordered must be the same"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, c2], ignore_order=False)
+
+        res = union_categoricals([c1, c1], ignore_order=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        res = union_categoricals([c1, c1], ignore_order=False)
+        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3], ordered=True)
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3, np.nan], ordered=True)
+        c2 = Categorical([3, 2], categories=[1, 2, 3], ordered=True)
+
+        res = union_categoricals([c1, c2], ignore_order=True)
+        exp = Categorical([1, 2, 3, np.nan, 3, 2])
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([1, 2, 3], categories=[3, 2, 1], ordered=True)
+
+        res = union_categoricals([c1, c2], ignore_order=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        res = union_categoricals([c2, c1], ignore_order=True, sort_categories=True)
+        exp = Categorical([1, 2, 3, 1, 2, 3], categories=[1, 2, 3])
+        tm.assert_categorical_equal(res, exp)
+
+        c1 = Categorical([1, 2, 3], ordered=True)
+        c2 = Categorical([4, 5, 6], ordered=True)
+        result = union_categoricals([c1, c2], ignore_order=True)
+        expected = Categorical([1, 2, 3, 4, 5, 6])
+        tm.assert_categorical_equal(result, expected)
+
+        msg = "to union ordered Categoricals, all categories must be the same"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, c2], ignore_order=False)
+
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, c2])
+
+    def test_union_categoricals_sort(self):
+        # GH 13846
+        c1 = Categorical(["x", "y", "z"])
+        c2 = Categorical(["a", "b", "c"])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(
+            ["x", "y", "z", "a", "b", "c"], categories=["a", "b", "c", "x", "y", "z"]
+        )
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath
+        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
+        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(["a", "b"], categories=["c", "a", "b"])
+        c2 = Categorical(["b", "c"], categories=["c", "a", "b"])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
+        tm.assert_categorical_equal(result, expected)
+
+        # fastpath - skip resort
+        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
+        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(["x", np.nan])
+        c2 = Categorical([np.nan, "b"])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["b", "x"])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([np.nan])
+        c2 = Categorical([np.nan])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical([np.nan, np.nan])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical([])
+        c2 = Categorical([])
+        result = union_categoricals([c1, c2], sort_categories=True)
+        expected = Categorical([])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
+        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
+        msg = "Cannot use sort_categories=True with ordered Categoricals"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, c2], sort_categories=True)
+
+    def test_union_categoricals_sort_false(self):
+        # GH 13846
+        c1 = Categorical(["x", "y", "z"])
+        c2 = Categorical(["a", "b", "c"])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(
+            ["x", "y", "z", "a", "b", "c"], categories=["x", "y", "z", "a", "b", "c"]
+        )
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_sort_false_fastpath(self):
+        # fastpath
+        c1 = Categorical(["a", "b"], categories=["b", "a", "c"])
+        c2 = Categorical(["b", "c"], categories=["b", "a", "c"])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(["a", "b", "b", "c"], categories=["b", "a", "c"])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_sort_false_skipresort(self):
+        # fastpath - skip resort
+        c1 = Categorical(["a", "b"], categories=["a", "b", "c"])
+        c2 = Categorical(["b", "c"], categories=["a", "b", "c"])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(["a", "b", "b", "c"], categories=["a", "b", "c"])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_sort_false_one_nan(self):
+        c1 = Categorical(["x", np.nan])
+        c2 = Categorical([np.nan, "b"])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(["x", np.nan, np.nan, "b"], categories=["x", "b"])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_sort_false_only_nan(self):
+        c1 = Categorical([np.nan])
+        c2 = Categorical([np.nan])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical([np.nan, np.nan])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_sort_false_empty(self):
+        c1 = Categorical([])
+        c2 = Categorical([])
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical([])
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categoricals_sort_false_ordered_true(self):
+        c1 = Categorical(["b", "a"], categories=["b", "a", "c"], ordered=True)
+        c2 = Categorical(["a", "c"], categories=["b", "a", "c"], ordered=True)
+        result = union_categoricals([c1, c2], sort_categories=False)
+        expected = Categorical(
+            ["b", "a", "a", "c"], categories=["b", "a", "c"], ordered=True
+        )
+        tm.assert_categorical_equal(result, expected)
+
+    def test_union_categorical_unwrap(self):
+        # GH 14173
+        c1 = Categorical(["a", "b"])
+        c2 = Series(["b", "c"], dtype="category")
+        result = union_categoricals([c1, c2])
+        expected = Categorical(["a", "b", "b", "c"])
+        tm.assert_categorical_equal(result, expected)
+
+        c2 = CategoricalIndex(c2)
+        result = union_categoricals([c1, c2])
+        tm.assert_categorical_equal(result, expected)
+
+        c1 = Series(c1)
+        result = union_categoricals([c1, c2])
+        tm.assert_categorical_equal(result, expected)
+
+        msg = "all components to combine must be Categorical"
+        with pytest.raises(TypeError, match=msg):
+            union_categoricals([c1, ["a", "b", "c"]])
diff --git a/pandas/tests/series/__init__.py b/pandas/tests/series/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b369bb0bc86935ac94113a858cfb5d99082e34f
--- /dev/null
+++ b/pandas/tests/series/test_api.py
@@ -0,0 +1,278 @@
+import inspect
+import pydoc
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    date_range,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+
+
+class TestSeriesMisc:
+    def test_tab_completion(self):
+        # GH 9910
+        s = Series(list("abcd"))
+        # Series of str values should have .str but not .dt/.cat in __dir__
+        assert "str" in dir(s)
+        assert "dt" not in dir(s)
+        assert "cat" not in dir(s)
+
+    def test_tab_completion_dt(self):
+        # similarly for .dt
+        s = Series(date_range("1/1/2015", periods=5))
+        assert "dt" in dir(s)
+        assert "str" not in dir(s)
+        assert "cat" not in dir(s)
+
+    def test_tab_completion_cat(self):
+        # Similarly for .cat, but with the twist that str and dt should be
+        # there if the categories are of that type first cat and str.
+        s = Series(list("abbcd"), dtype="category")
+        assert "cat" in dir(s)
+        assert "str" in dir(s)  # as it is a string categorical
+        assert "dt" not in dir(s)
+
+    def test_tab_completion_cat_str(self):
+        # similar to cat and str
+        s = Series(date_range("1/1/2015", periods=5)).astype("category")
+        assert "cat" in dir(s)
+        assert "str" not in dir(s)
+        assert "dt" in dir(s)  # as it is a datetime categorical
+
+    def test_tab_completion_with_categorical(self):
+        # test the tab completion display
+        ok_for_cat = [
+            "categories",
+            "codes",
+            "ordered",
+            "set_categories",
+            "add_categories",
+            "remove_categories",
+            "rename_categories",
+            "reorder_categories",
+            "remove_unused_categories",
+            "as_ordered",
+            "as_unordered",
+        ]
+
+        s = Series(list("aabbcde")).astype("category")
+        results = sorted({r for r in s.cat.__dir__() if not r.startswith("_")})
+        tm.assert_almost_equal(results, sorted(set(ok_for_cat)))
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            Index(list("ab") * 5, dtype="category"),
+            Index([str(i) for i in range(10)]),
+            Index(["foo", "bar", "baz"] * 2),
+            date_range("2020-01-01", periods=10),
+            period_range("2020-01-01", periods=10, freq="D"),
+            timedelta_range("1 day", periods=10),
+            Index(np.arange(10), dtype=np.uint64),
+            Index(np.arange(10), dtype=np.int64),
+            Index(np.arange(10), dtype=np.float64),
+            Index([True, False]),
+            Index([f"a{i}" for i in range(101)]),
+            pd.MultiIndex.from_tuples(zip("ABCD", "EFGH")),
+            pd.MultiIndex.from_tuples(zip([0, 1, 2, 3], "EFGH")),
+        ],
+    )
+    def test_index_tab_completion(self, index):
+        # dir contains string-like values of the Index.
+        s = Series(index=index, dtype=object)
+        dir_s = dir(s)
+        for i, x in enumerate(s.index.unique(level=0)):
+            if i < 100:
+                assert not isinstance(x, str) or not x.isidentifier() or x in dir_s
+            else:
+                assert x not in dir_s
+
+    @pytest.mark.parametrize("ser", [Series(dtype=object), Series([1])])
+    def test_not_hashable(self, ser):
+        msg = "unhashable type: 'Series'"
+        with pytest.raises(TypeError, match=msg):
+            hash(ser)
+
+    def test_contains(self, datetime_series):
+        tm.assert_contains_all(datetime_series.index, datetime_series)
+
+    def test_axis_alias(self):
+        s = Series([1, 2, np.nan])
+        tm.assert_series_equal(s.dropna(axis="rows"), s.dropna(axis="index"))
+        assert s.dropna().sum(axis="rows") == 3
+        assert s._get_axis_number("rows") == 0
+        assert s._get_axis_name("rows") == "index"
+
+    def test_class_axis(self):
+        # https://github.com/pandas-dev/pandas/issues/18147
+        # no exception and no empty docstring
+        assert pydoc.getdoc(Series.index)
+
+    def test_ndarray_compat(self):
+        # test numpy compat with Series as sub-class of NDFrame
+        tsdf = DataFrame(
+            np.random.default_rng(2).standard_normal((1000, 3)),
+            columns=["A", "B", "C"],
+            index=date_range("1/1/2000", periods=1000),
+        )
+
+        def f(x):
+            return x[x.idxmax()]
+
+        result = tsdf.apply(f)
+        expected = tsdf.max()
+        tm.assert_series_equal(result, expected)
+
+    def test_ndarray_compat_like_func(self):
+        # using an ndarray like function
+        s = Series(np.random.default_rng(2).standard_normal(10))
+        result = Series(np.ones_like(s))
+        expected = Series(1, index=range(10), dtype="float64")
+        tm.assert_series_equal(result, expected)
+
+    def test_empty_method(self):
+        s_empty = Series(dtype=object)
+        assert s_empty.empty
+
+    @pytest.mark.parametrize("dtype", ["int64", object])
+    def test_empty_method_full_series(self, dtype):
+        full_series = Series(index=[1], dtype=dtype)
+        assert not full_series.empty
+
+    @pytest.mark.parametrize("dtype", [None, "Int64"])
+    def test_integer_series_size(self, dtype):
+        # GH 25580
+        s = Series(range(9), dtype=dtype)
+        assert s.size == 9
+
+    def test_attrs(self):
+        s = Series([0, 1], name="abc")
+        assert s.attrs == {}
+        s.attrs["version"] = 1
+        result = s + 1
+        assert result.attrs == {"version": 1}
+
+    def test_inspect_getmembers(self):
+        # GH38782
+        ser = Series(dtype=object)
+        inspect.getmembers(ser)
+
+    def test_unknown_attribute(self):
+        # GH#9680
+        tdi = timedelta_range(start=0, periods=10, freq="1s")
+        ser = Series(np.random.default_rng(2).normal(size=10), index=tdi)
+        assert "foo" not in ser.__dict__
+        msg = "'Series' object has no attribute 'foo'"
+        with pytest.raises(AttributeError, match=msg):
+            ser.foo
+
+    @pytest.mark.parametrize("op", ["year", "day", "second", "weekday"])
+    def test_datetime_series_no_datelike_attrs(self, op, datetime_series):
+        # GH#7206
+        msg = f"'Series' object has no attribute '{op}'"
+        with pytest.raises(AttributeError, match=msg):
+            getattr(datetime_series, op)
+
+    def test_series_datetimelike_attribute_access(self):
+        # attribute access should still work!
+        ser = Series({"year": 2000, "month": 1, "day": 10})
+        assert ser.year == 2000
+        assert ser.month == 1
+        assert ser.day == 10
+
+    def test_series_datetimelike_attribute_access_invalid(self):
+        ser = Series({"year": 2000, "month": 1, "day": 10})
+        msg = "'Series' object has no attribute 'weekday'"
+        with pytest.raises(AttributeError, match=msg):
+            ser.weekday
+
+    @pytest.mark.parametrize(
+        "kernel, has_numeric_only",
+        [
+            ("skew", True),
+            ("var", True),
+            ("all", False),
+            ("prod", True),
+            ("any", False),
+            ("idxmin", False),
+            ("quantile", False),
+            ("idxmax", False),
+            ("min", True),
+            ("sem", True),
+            ("mean", True),
+            ("nunique", False),
+            ("max", True),
+            ("sum", True),
+            ("count", False),
+            ("median", True),
+            ("std", True),
+            ("rank", True),
+            ("pct_change", False),
+            ("cummax", False),
+            ("shift", False),
+            ("diff", False),
+            ("cumsum", False),
+            ("cummin", False),
+            ("cumprod", False),
+            ("fillna", False),
+            ("ffill", False),
+            ("bfill", False),
+            ("sample", False),
+            ("tail", False),
+            ("take", False),
+            ("head", False),
+            ("cov", False),
+            ("corr", False),
+        ],
+    )
+    @pytest.mark.parametrize("dtype", [bool, int, float, object])
+    def test_numeric_only(self, kernel, has_numeric_only, dtype):
+        # GH#47500
+        ser = Series([0, 1, 1], dtype=dtype)
+        if kernel == "corrwith":
+            args = (ser,)
+        elif kernel == "corr":
+            args = (ser,)
+        elif kernel == "cov":
+            args = (ser,)
+        elif kernel == "nth":
+            args = (0,)
+        elif kernel == "fillna":
+            args = (True,)
+        elif kernel == "fillna":
+            args = ("ffill",)
+        elif kernel == "take":
+            args = ([0],)
+        elif kernel == "quantile":
+            args = (0.5,)
+        else:
+            args = ()
+        method = getattr(ser, kernel)
+        if not has_numeric_only:
+            msg = (
+                "(got an unexpected keyword argument 'numeric_only'"
+                "|too many arguments passed in)"
+            )
+            with pytest.raises(TypeError, match=msg):
+                method(*args, numeric_only=True)
+        elif dtype is object:
+            msg = f"Series.{kernel} does not allow numeric_only=True with non-numeric"
+            with pytest.raises(TypeError, match=msg):
+                method(*args, numeric_only=True)
+        else:
+            result = method(*args, numeric_only=True)
+            expected = method(*args, numeric_only=False)
+            if isinstance(expected, Series):
+                # transformer
+                tm.assert_series_equal(result, expected)
+            else:
+                # reducer
+                assert result == expected
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c2bba16b86cd64353203dcdf473bbe155c13b13
--- /dev/null
+++ b/pandas/tests/series/test_arithmetic.py
@@ -0,0 +1,1085 @@
+from datetime import (
+    date,
+    timedelta,
+    timezone,
+)
+from decimal import Decimal
+from enum import (
+    Enum,
+    auto,
+)
+import operator
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DatetimeTZDtype,
+    Index,
+    Series,
+    Timedelta,
+    bdate_range,
+    date_range,
+    isna,
+)
+import pandas._testing as tm
+from pandas.core import ops
+from pandas.core.computation import expressions as expr
+
+
+@pytest.fixture(autouse=True, params=[0, 1000000], ids=["numexpr", "python"])
+def switch_numexpr_min_elements(request, monkeypatch):
+    with monkeypatch.context() as m:
+        m.setattr(expr, "_MIN_ELEMENTS", request.param)
+        yield
+
+
+def _permute(obj):
+    return obj.take(np.random.default_rng(2).permutation(len(obj)))
+
+
+class TestSeriesFlexArithmetic:
+    @pytest.mark.parametrize(
+        "ts",
+        [
+            (lambda x: x, lambda x: x * 2, False),
+            (lambda x: x, lambda x: x[::2], False),
+            (lambda x: x, lambda x: 5, True),
+            (
+                lambda x: Series(range(10), dtype=np.float64),
+                lambda x: Series(range(10), dtype=np.float64),
+                True,
+            ),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "opname", ["add", "sub", "mul", "floordiv", "truediv", "pow"]
+    )
+    def test_flex_method_equivalence(self, opname, ts):
+        # check that Series.{opname} behaves like Series.__{opname}__,
+        tser = Series(
+            np.arange(20, dtype=np.float64),
+            index=date_range("2020-01-01", periods=20),
+            name="ts",
+        )
+
+        series = ts[0](tser)
+        other = ts[1](tser)
+        check_reverse = ts[2]
+
+        op = getattr(Series, opname)
+        alt = getattr(operator, opname)
+
+        result = op(series, other)
+        expected = alt(series, other)
+        tm.assert_almost_equal(result, expected)
+        if check_reverse:
+            rop = getattr(Series, "r" + opname)
+            result = rop(series, other)
+            expected = alt(other, series)
+            tm.assert_almost_equal(result, expected)
+
+    def test_flex_method_subclass_metadata_preservation(self, all_arithmetic_operators):
+        # GH 13208
+        class MySeries(Series):
+            _metadata = ["x"]
+
+            @property
+            def _constructor(self):
+                return MySeries
+
+        opname = all_arithmetic_operators
+        op = getattr(Series, opname)
+        m = MySeries([1, 2, 3], name="test")
+        m.x = 42
+        result = op(m, 1)
+        assert result.x == 42
+
+    def test_flex_add_scalar_fill_value(self):
+        # GH12723
+        ser = Series([0, 1, np.nan, 3, 4, 5])
+
+        exp = ser.fillna(0).add(2)
+        res = ser.add(2, fill_value=0)
+        tm.assert_series_equal(res, exp)
+
+    pairings = [(Series.div, operator.truediv, 1), (Series.rdiv, ops.rtruediv, 1)]
+    for op in ["add", "sub", "mul", "pow", "truediv", "floordiv"]:
+        fv = 0
+        lop = getattr(Series, op)
+        lequiv = getattr(operator, op)
+        rop = getattr(Series, "r" + op)
+        # bind op at definition time...
+        requiv = lambda x, y, op=op: getattr(operator, op)(y, x)
+        pairings.append((lop, lequiv, fv))
+        pairings.append((rop, requiv, fv))
+
+    @pytest.mark.parametrize("op, equiv_op, fv", pairings)
+    def test_operators_combine(self, op, equiv_op, fv):
+        def _check_fill(meth, op, a, b, fill_value=0):
+            exp_index = a.index.union(b.index)
+            a = a.reindex(exp_index)
+            b = b.reindex(exp_index)
+
+            amask = isna(a)
+            bmask = isna(b)
+
+            exp_values = []
+            for i in range(len(exp_index)):
+                with np.errstate(all="ignore"):
+                    if amask[i]:
+                        if bmask[i]:
+                            exp_values.append(np.nan)
+                            continue
+                        exp_values.append(op(fill_value, b[i]))
+                    elif bmask[i]:
+                        if amask[i]:
+                            exp_values.append(np.nan)
+                            continue
+                        exp_values.append(op(a[i], fill_value))
+                    else:
+                        exp_values.append(op(a[i], b[i]))
+
+            result = meth(a, b, fill_value=fill_value)
+            expected = Series(exp_values, exp_index)
+            tm.assert_series_equal(result, expected)
+
+        a = Series([np.nan, 1.0, 2.0, 3.0, np.nan], index=np.arange(5))
+        b = Series([np.nan, 1, np.nan, 3, np.nan, 4.0], index=np.arange(6))
+
+        result = op(a, b)
+        exp = equiv_op(a, b)
+        tm.assert_series_equal(result, exp)
+        _check_fill(op, equiv_op, a, b, fill_value=fv)
+        # should accept axis=0 or axis='rows'
+        op(a, b, axis=0)
+
+    @pytest.mark.parametrize("kind", ["datetime", "timedelta"])
+    def test_rhs_extension_array_sub_with_fill_value(self, kind):
+        # GH:62467
+        if kind == "datetime":
+            left = Series(
+                [pd.Timestamp("2025-08-20"), pd.Timestamp("2025-08-21")],
+                dtype=np.dtype("datetime64[ns]"),
+            )
+        else:
+            left = Series(
+                [Timedelta(days=1), Timedelta(days=2)],
+                dtype=np.dtype("timedelta64[ns]"),
+            )
+
+        right = (
+            left._values
+        )  # DatetimeArray or TimedeltaArray which is an ExtensionArray
+
+        result = left.sub(right, fill_value=left.iloc[0])
+        expected = Series(np.zeros(len(left), dtype=np.dtype("timedelta64[ns]")))
+        tm.assert_series_equal(result, expected)
+
+    def test_flex_disallows_dataframe(self):
+        # GH#46179
+        df = pd.DataFrame(
+            {2010: [1], 2020: [3]},
+            index=pd.MultiIndex.from_product([["a"], ["b"]], names=["scen", "mod"]),
+        )
+
+        ser = Series(
+            [10.0, 20.0, 30.0],
+            index=pd.MultiIndex.from_product(
+                [["a"], ["b"], [0, 1, 2]], names=["scen", "mod", "id"]
+            ),
+        )
+
+        msg = "Series.add does not support a DataFrame `other`"
+        with pytest.raises(TypeError, match=msg):
+            ser.add(df, axis=0)
+
+
+class TestSeriesArithmetic:
+    # Some of these may end up in tests/arithmetic, but are not yet sorted
+
+    def test_add_series_with_period_index(self):
+        rng = pd.period_range("1/1/2000", "1/1/2010", freq="Y")
+        ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+        result = ts + ts[::2]
+        expected = ts + ts
+        expected.iloc[1::2] = np.nan
+        tm.assert_series_equal(result, expected)
+
+        result = ts + _permute(ts[::2])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "target_add,input_value,expected_value",
+        [
+            ("!", ["hello", "world"], ["hello!", "world!"]),
+            ("m", ["hello", "world"], ["hellom", "worldm"]),
+        ],
+    )
+    def test_string_addition(self, target_add, input_value, expected_value):
+        # GH28658 - ensure adding 'm' does not raise an error
+        a = Series(input_value)
+
+        result = a + target_add
+        expected = Series(expected_value)
+        tm.assert_series_equal(result, expected)
+
+    def test_divmod(self):
+        # GH#25557
+        a = Series([1, 1, 1, np.nan], index=["a", "b", "c", "d"])
+        b = Series([2, np.nan, 1, np.nan], index=["a", "b", "d", "e"])
+
+        result = a.divmod(b)
+        expected = divmod(a, b)
+        tm.assert_series_equal(result[0], expected[0])
+        tm.assert_series_equal(result[1], expected[1])
+
+        result = a.rdivmod(b)
+        expected = divmod(b, a)
+        tm.assert_series_equal(result[0], expected[0])
+        tm.assert_series_equal(result[1], expected[1])
+
+    @pytest.mark.parametrize("index", [None, range(9)])
+    def test_series_integer_mod(self, index):
+        # GH#24396
+        s1 = Series(range(1, 10))
+        s2 = Series("foo", index=index)
+
+        msg = "not all arguments converted during string formatting|'mod' not supported"
+
+        with pytest.raises(TypeError, match=msg):
+            s2 % s1
+
+    def test_add_with_duplicate_index(self):
+        # GH14227
+        s1 = Series([1, 2], index=[1, 1])
+        s2 = Series([10, 10], index=[1, 2])
+        result = s1 + s2
+        expected = Series([11, 12, np.nan], index=[1, 1, 2])
+        tm.assert_series_equal(result, expected)
+
+    def test_add_na_handling(self):
+        ser = Series(
+            [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)]
+        )
+
+        result = ser + ser.shift(1)
+        result2 = ser.shift(1) + ser
+        assert isna(result.iloc[0])
+        assert isna(result2.iloc[0])
+
+    def test_add_corner_cases(self, datetime_series):
+        empty = Series([], index=Index([]), dtype=np.float64)
+
+        result = datetime_series + empty
+        assert np.isnan(result).all()
+
+        result = empty + empty
+        assert len(result) == 0
+
+    def test_add_float_plus_int(self, datetime_series):
+        # float + int
+        int_ts = datetime_series.astype(int)[:-5]
+        added = datetime_series + int_ts
+        expected = Series(
+            datetime_series.values[:-5] + int_ts.values,
+            index=datetime_series.index[:-5],
+            name="ts",
+        )
+        tm.assert_series_equal(added[:-5], expected)
+
+    def test_mul_empty_int_corner_case(self):
+        s1 = Series([], [], dtype=np.int32)
+        s2 = Series({"x": 0.0})
+        tm.assert_series_equal(s1 * s2, Series([np.nan], index=["x"]))
+
+    def test_sub_datetimelike_align(self):
+        # GH#7500
+        # datetimelike ops need to align
+        dt = Series(date_range("2012-1-1", periods=3, freq="D", unit="ns"))
+        dt.iloc[2] = np.nan
+        dt2 = dt[::-1]
+
+        expected = Series([timedelta(0), timedelta(0), pd.NaT], dtype="m8[ns]")
+        # name is reset
+        result = dt2 - dt
+        tm.assert_series_equal(result, expected)
+
+        expected = Series(expected, name=0)
+        result = (dt2.to_frame() - dt.to_frame())[0]
+        tm.assert_series_equal(result, expected)
+
+    def test_alignment_doesnt_change_tz(self):
+        # GH#33671
+        dti = date_range("2016-01-01", periods=10, tz="CET")
+        dti_utc = dti.tz_convert("UTC")
+        ser = Series(10, index=dti)
+        ser_utc = Series(10, index=dti_utc)
+
+        # we don't care about the result, just that original indexes are unchanged
+        ser * ser_utc
+
+        assert ser.index is dti
+        assert ser_utc.index is dti_utc
+
+    def test_alignment_categorical(self):
+        # GH13365
+        cat = Categorical(["3z53", "3z53", "LoJG", "LoJG", "LoJG", "N503"])
+        ser1 = Series(2, index=cat)
+        ser2 = Series(2, index=cat[:-1])
+        result = ser1 * ser2
+
+        exp_index = ["3z53"] * 4 + ["LoJG"] * 9 + ["N503"]
+        exp_index = pd.CategoricalIndex(exp_index, categories=cat.categories)
+        exp_values = [4.0] * 13 + [np.nan]
+        expected = Series(exp_values, exp_index)
+
+        tm.assert_series_equal(result, expected)
+
+    def test_arithmetic_with_duplicate_index(self):
+        # GH#8363
+        # integer ops with a non-unique index
+        index = [2, 2, 3, 3, 4]
+        ser = Series(np.arange(1, 6, dtype="int64"), index=index)
+        other = Series(np.arange(5, dtype="int64"), index=index)
+        result = ser - other
+        expected = Series(1, index=[2, 2, 3, 3, 4])
+        tm.assert_series_equal(result, expected)
+
+        # GH#8363
+        # datetime ops with a non-unique index
+        ser = Series(date_range("20130101 09:00:00", periods=5, unit="ns"), index=index)
+        other = Series(date_range("20130101", periods=5, unit="ns"), index=index)
+        result = ser - other
+        expected = Series(Timedelta("9 hours"), index=[2, 2, 3, 3, 4], dtype="m8[ns]")
+        tm.assert_series_equal(result, expected)
+
+    def test_masked_and_non_masked_propagate_na(self):
+        # GH#45810
+        ser1 = Series([0, np.nan], dtype="float")
+        ser2 = Series([0, 1], dtype="Int64")
+        result = ser1 * ser2
+        expected = Series([0, pd.NA], dtype="Float64")
+        tm.assert_series_equal(result, expected)
+
+    def test_mask_div_propagate_na_for_non_na_dtype(self):
+        # GH#42630
+        ser1 = Series([15, pd.NA, 5, 4], dtype="Int64")
+        ser2 = Series([15, 5, np.nan, 4])
+        result = ser1 / ser2
+        expected = Series([1.0, pd.NA, pd.NA, 1.0], dtype="Float64")
+        tm.assert_series_equal(result, expected)
+
+        result = ser2 / ser1
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("val", [3, 3.5])
+    def test_add_list_to_masked_array(self, val):
+        # GH#22962, behavior changed by GH#62552
+        ser = Series([1, None, 3], dtype="Int64")
+        result = ser + [1, None, val]  # noqa: RUF005
+        expected = Series([2, pd.NA, 3 + val], dtype="Float64")
+        tm.assert_series_equal(result, expected)
+
+        result = [1, None, val] + ser  # noqa: RUF005
+        tm.assert_series_equal(result, expected)
+
+    def test_add_list_to_masked_array_boolean(self):
+        # GH#22962
+        ser = Series([True, None, False], dtype="boolean")
+        result = ser + [True, None, True]  # noqa: RUF005
+        expected = Series([2, pd.NA, 1], dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        result = [True, None, True] + ser  # noqa: RUF005
+        tm.assert_series_equal(result, expected)
+
+
+# ------------------------------------------------------------------
+# Comparisons
+
+
+class TestSeriesFlexComparison:
+    @pytest.mark.parametrize("axis", [0, None, "index"])
+    def test_comparison_flex_basic(self, axis, comparison_op):
+        left = Series(np.random.default_rng(2).standard_normal(10))
+        right = Series(np.random.default_rng(2).standard_normal(10))
+        result = getattr(left, comparison_op.__name__)(right, axis=axis)
+        expected = comparison_op(left, right)
+        tm.assert_series_equal(result, expected)
+
+    def test_comparison_bad_axis(self, comparison_op):
+        left = Series(np.random.default_rng(2).standard_normal(10))
+        right = Series(np.random.default_rng(2).standard_normal(10))
+
+        msg = "No axis named 1 for object type"
+        with pytest.raises(ValueError, match=msg):
+            getattr(left, comparison_op.__name__)(right, axis=1)
+
+    @pytest.mark.parametrize(
+        "values, op",
+        [
+            ([False, False, True, False], "eq"),
+            ([True, True, False, True], "ne"),
+            ([False, False, True, False], "le"),
+            ([False, False, False, False], "lt"),
+            ([False, True, True, False], "ge"),
+            ([False, True, False, False], "gt"),
+        ],
+    )
+    def test_comparison_flex_alignment(self, values, op):
+        left = Series([1, 3, 2], index=list("abc"))
+        right = Series([2, 2, 2], index=list("bcd"))
+        result = getattr(left, op)(right)
+        expected = Series(values, index=list("abcd"))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "left",
+        [
+            Series(Categorical(["a", "b", "a"])),
+            Series(pd.period_range("2020Q1", periods=3, freq="Q")),
+        ],
+        ids=["categorical", "period"],
+    )
+    def test_rhs_extension_array_eq_with_fill_value(self, left):
+        # GH:#62467
+        right = left._values  # this is an ExtensionArray
+
+        result = left.eq(right, fill_value=left.iloc[0])
+        expected = Series([True, True, True])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "values, op, fill_value",
+        [
+            ([False, False, True, True], "eq", 2),
+            ([True, True, False, False], "ne", 2),
+            ([False, False, True, True], "le", 0),
+            ([False, False, False, True], "lt", 0),
+            ([True, True, True, False], "ge", 0),
+            ([True, True, False, False], "gt", 0),
+        ],
+    )
+    def test_comparison_flex_alignment_fill(self, values, op, fill_value):
+        left = Series([1, 3, 2], index=list("abc"))
+        right = Series([2, 2, 2], index=list("bcd"))
+        result = getattr(left, op)(right, fill_value=fill_value)
+        expected = Series(values, index=list("abcd"))
+        tm.assert_series_equal(result, expected)
+
+    def test_eq_objects(self) -> None:
+        # GH#62191 Test eq with Enum and List elements
+
+        class Thing(Enum):
+            FIRST = auto()
+            SECOND = auto()
+
+        left = Series([Thing.FIRST, Thing.SECOND])
+        py_l = [Thing.FIRST, Thing.SECOND]
+
+        result = left.eq(Thing.FIRST)
+        expected = Series([True, False])
+        tm.assert_series_equal(result, expected)
+
+        result = left.eq(py_l)
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+        result = left.eq(np.asarray(py_l))
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+        result = left.eq(Series(py_l))
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+        result = Series([[1, 2], [3, 4]]).eq([1, 2])
+        expected = Series([True, False])
+        with pytest.raises(AssertionError):
+            tm.assert_series_equal(result, expected)
+        expected = Series([False, False])
+        tm.assert_series_equal(result, expected)
+
+    def test_eq_with_index(self) -> None:
+        # GH#62191 Test eq with non-trivial indices
+        left = Series([1, 2], index=[1, 0])
+        py_l = [1, 2]
+
+        # assuming Python list has the same index as the Series
+        result = left.eq(py_l)
+        expected = Series([True, True], index=[1, 0])
+        tm.assert_series_equal(result, expected)
+
+        # assuming np.ndarray has the same index as the Series
+        result = left.eq(np.asarray(py_l))
+        expected = Series([True, True], index=[1, 0])
+        tm.assert_series_equal(result, expected)
+
+        result = left.eq(Series(py_l))
+        expected = Series([False, False])
+        tm.assert_series_equal(result, expected)
+
+        result = left.eq(Series([2, 1]))
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+
+class TestSeriesComparison:
+    def test_comparison_different_length(self):
+        a = Series(["a", "b", "c"])
+        b = Series(["b", "a"])
+        msg = "only compare identically-labeled Series"
+        with pytest.raises(ValueError, match=msg):
+            a < b
+
+        a = Series([1, 2])
+        b = Series([2, 3, 4])
+        with pytest.raises(ValueError, match=msg):
+            a == b
+
+    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
+    def test_ser_flex_cmp_return_dtypes(self, opname):
+        # GH#15115
+        ser = Series([1, 3, 2], index=range(3))
+        const = 2
+        result = getattr(ser, opname)(const).dtypes
+        expected = np.dtype("bool")
+        assert result == expected
+
+    @pytest.mark.parametrize("opname", ["eq", "ne", "gt", "lt", "ge", "le"])
+    def test_ser_flex_cmp_return_dtypes_empty(self, opname):
+        # GH#15115 empty Series case
+        ser = Series([1, 3, 2], index=range(3))
+        empty = ser.iloc[:0]
+        const = 2
+        result = getattr(empty, opname)(const).dtypes
+        expected = np.dtype("bool")
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "names", [(None, None, None), ("foo", "bar", None), ("baz", "baz", "baz")]
+    )
+    def test_ser_cmp_result_names(self, names, comparison_op):
+        # datetime64 dtype
+        op = comparison_op
+        dti = date_range("1949-06-07 03:00:00", freq="h", periods=5, name=names[0])
+        ser = Series(dti).rename(names[1])
+        result = op(ser, dti)
+        assert result.name == names[2]
+
+        # datetime64tz dtype
+        dti = dti.tz_localize("US/Central")
+        dti = pd.DatetimeIndex(dti, freq="infer")  # freq not preserved by tz_localize
+        ser = Series(dti).rename(names[1])
+        result = op(ser, dti)
+        assert result.name == names[2]
+
+        # timedelta64 dtype
+        tdi = dti - dti.shift(1)
+        ser = Series(tdi).rename(names[1])
+        result = op(ser, tdi)
+        assert result.name == names[2]
+
+        # interval dtype
+        if op in [operator.eq, operator.ne]:
+            # interval dtype comparisons not yet implemented
+            ii = pd.interval_range(start=0, periods=5, name=names[0])
+            ser = Series(ii).rename(names[1])
+            result = op(ser, ii)
+            assert result.name == names[2]
+
+        # categorical
+        if op in [operator.eq, operator.ne]:
+            # categorical dtype comparisons raise for inequalities
+            cidx = tdi.astype("category")
+            ser = Series(cidx).rename(names[1])
+            result = op(ser, cidx)
+            assert result.name == names[2]
+
+    def test_comparisons(self):
+        s = Series(["a", "b", "c"])
+        s2 = Series([False, True, False])
+
+        # it works!
+        exp = Series([False, False, False])
+        tm.assert_series_equal(s == s2, exp)
+        tm.assert_series_equal(s2 == s, exp)
+
+    # -----------------------------------------------------------------
+    # Categorical Dtype Comparisons
+
+    def test_categorical_comparisons(self):
+        # GH#8938
+        # allow equality comparisons
+        a = Series(list("abc"), dtype="category")
+        b = Series(list("abc"), dtype="object")
+        c = Series(["a", "b", "cc"], dtype="object")
+        d = Series(list("acb"), dtype="object")
+        e = Categorical(list("abc"))
+        f = Categorical(list("acb"))
+
+        # vs scalar
+        assert not (a == "a").all()
+        assert ((a != "a") == ~(a == "a")).all()
+
+        assert not ("a" == a).all()
+        assert (a == "a")[0]
+        assert ("a" == a)[0]
+        assert not ("a" != a)[0]
+
+        # vs list-like
+        assert (a == a).all()
+        assert not (a != a).all()
+
+        assert (a == list(a)).all()
+        assert (a == b).all()
+        assert (b == a).all()
+        assert ((~(a == b)) == (a != b)).all()
+        assert ((~(b == a)) == (b != a)).all()
+
+        assert not (a == c).all()
+        assert not (c == a).all()
+        assert not (a == d).all()
+        assert not (d == a).all()
+
+        # vs a cat-like
+        assert (a == e).all()
+        assert (e == a).all()
+        assert not (a == f).all()
+        assert not (f == a).all()
+
+        assert (~(a == e) == (a != e)).all()
+        assert (~(e == a) == (e != a)).all()
+        assert (~(a == f) == (a != f)).all()
+        assert (~(f == a) == (f != a)).all()
+
+        # non-equality is not comparable
+        msg = "can only compare equality or not"
+        with pytest.raises(TypeError, match=msg):
+            a < b
+        with pytest.raises(TypeError, match=msg):
+            b < a
+        with pytest.raises(TypeError, match=msg):
+            a > b
+        with pytest.raises(TypeError, match=msg):
+            b > a
+
+    def test_unequal_categorical_comparison_raises_type_error(self):
+        # unequal comparison should raise for unordered cats
+        cat = Series(Categorical(list("abc")))
+        msg = "can only compare equality or not"
+        with pytest.raises(TypeError, match=msg):
+            cat > "b"
+
+        cat = Series(Categorical(list("abc"), ordered=False))
+        with pytest.raises(TypeError, match=msg):
+            cat > "b"
+
+        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+        # and following comparisons with scalars not in categories should raise
+        # for unequal comps, but not for equal/not equal
+        cat = Series(Categorical(list("abc"), ordered=True))
+
+        msg = "Invalid comparison between dtype=category and str"
+        with pytest.raises(TypeError, match=msg):
+            cat < "d"
+        with pytest.raises(TypeError, match=msg):
+            cat > "d"
+        with pytest.raises(TypeError, match=msg):
+            "d" < cat
+        with pytest.raises(TypeError, match=msg):
+            "d" > cat
+
+        tm.assert_series_equal(cat == "d", Series([False, False, False]))
+        tm.assert_series_equal(cat != "d", Series([True, True, True]))
+
+    # -----------------------------------------------------------------
+
+    def test_comparison_tuples(self):
+        # GH#11339
+        # comparisons vs tuple
+        s = Series([(1, 1), (1, 2)])
+
+        result = s == (1, 2)
+        expected = Series([False, True])
+        tm.assert_series_equal(result, expected)
+
+        result = s != (1, 2)
+        expected = Series([True, False])
+        tm.assert_series_equal(result, expected)
+
+        result = s == (0, 0)
+        expected = Series([False, False])
+        tm.assert_series_equal(result, expected)
+
+        result = s != (0, 0)
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+        s = Series([(1, 1), (1, 1)])
+
+        result = s == (1, 1)
+        expected = Series([True, True])
+        tm.assert_series_equal(result, expected)
+
+        result = s != (1, 1)
+        expected = Series([False, False])
+        tm.assert_series_equal(result, expected)
+
+    def test_comparison_frozenset(self):
+        ser = Series([frozenset([1]), frozenset([1, 2])])
+
+        result = ser == frozenset([1])
+        expected = Series([True, False])
+        tm.assert_series_equal(result, expected)
+
+    def test_comparison_operators_with_nas(self, comparison_op):
+        ser = Series(bdate_range("1/1/2000", periods=10), dtype=object)
+        ser[::2] = np.nan
+
+        # test that comparisons work
+        val = ser[5]
+
+        result = comparison_op(ser, val)
+        expected = comparison_op(ser.dropna(), val).reindex(ser.index)
+
+        if comparison_op is operator.ne:
+            expected = expected.fillna(True).astype(bool)
+        else:
+            expected = expected.fillna(False).astype(bool)
+
+        tm.assert_series_equal(result, expected)
+
+    def test_ne(self):
+        ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float)
+        expected = np.array([True, True, False, True, True])
+        tm.assert_numpy_array_equal(ts.index != 5, expected)
+        tm.assert_numpy_array_equal(~(ts.index == 5), expected)
+
+    @pytest.mark.parametrize("right_data", [[2, 2, 2], [2, 2, 2, 2]])
+    def test_comp_ops_df_compat(self, right_data, frame_or_series):
+        # GH 1134
+        # GH 50083 to clarify that index and columns must be identically labeled
+        left = Series([1, 2, 3], index=list("ABC"), name="x")
+        right = Series(right_data, index=list("ABDC")[: len(right_data)], name="x")
+        if frame_or_series is not Series:
+            msg = (
+                rf"Can only compare identically-labeled \(both index and columns\) "
+                f"{frame_or_series.__name__} objects"
+            )
+            left = left.to_frame()
+            right = right.to_frame()
+        else:
+            msg = (
+                f"Can only compare identically-labeled {frame_or_series.__name__} "
+                f"objects"
+            )
+
+        with pytest.raises(ValueError, match=msg):
+            left == right
+        with pytest.raises(ValueError, match=msg):
+            right == left
+
+        with pytest.raises(ValueError, match=msg):
+            left != right
+        with pytest.raises(ValueError, match=msg):
+            right != left
+
+        with pytest.raises(ValueError, match=msg):
+            left < right
+        with pytest.raises(ValueError, match=msg):
+            right < left
+
+    def test_compare_series_interval_keyword(self):
+        # GH#25338
+        ser = Series(["IntervalA", "IntervalB", "IntervalC"])
+        result = ser == "IntervalA"
+        expected = Series([True, False, False])
+        tm.assert_series_equal(result, expected)
+
+
+# ------------------------------------------------------------------
+# Unsorted
+#  These arithmetic tests were previously in other files, eventually
+#  should be parametrized and put into tests.arithmetic
+
+
+class TestTimeSeriesArithmetic:
+    def test_series_add_tz_mismatch_converts_to_utc(self):
+        rng = date_range("1/1/2011", periods=100, freq="h", tz="utc")
+
+        perm = np.random.default_rng(2).permutation(100)[:90]
+        ser1 = Series(
+            np.random.default_rng(2).standard_normal(90),
+            index=rng.take(perm).tz_convert("US/Eastern"),
+        )
+
+        perm = np.random.default_rng(2).permutation(100)[:90]
+        ser2 = Series(
+            np.random.default_rng(2).standard_normal(90),
+            index=rng.take(perm).tz_convert("Europe/Berlin"),
+        )
+
+        result = ser1 + ser2
+
+        uts1 = ser1.tz_convert("utc")
+        uts2 = ser2.tz_convert("utc")
+        expected = uts1 + uts2
+
+        # sort since input indexes are not equal
+        expected = expected.sort_index()
+
+        assert result.index.tz is timezone.utc
+        tm.assert_series_equal(result, expected)
+
+    def test_series_add_aware_naive_raises(self):
+        rng = date_range("1/1/2011", periods=10, freq="h")
+        ser = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)
+
+        ser_utc = ser.tz_localize("utc")
+
+        msg = "Cannot join tz-naive with tz-aware DatetimeIndex"
+        with pytest.raises(Exception, match=msg):
+            ser + ser_utc
+
+        with pytest.raises(Exception, match=msg):
+            ser_utc + ser
+
+    # TODO: belongs in tests/arithmetic?
+    def test_datetime_understood(self, unit):
+        # Ensures it doesn't fail to create the right series
+        # reported in issue#16726
+        series = Series(date_range("2012-01-01", periods=3, unit=unit))
+        offset = pd.offsets.DateOffset(days=6)
+        result = series - offset
+        exp_dti = pd.to_datetime(["2011-12-26", "2011-12-27", "2011-12-28"]).as_unit(
+            unit
+        )
+        expected = Series(exp_dti)
+        tm.assert_series_equal(result, expected)
+
+    def test_align_date_objects_with_datetimeindex(self):
+        rng = date_range("1/1/2000", periods=20)
+        ts = Series(np.random.default_rng(2).standard_normal(20), index=rng)
+
+        ts_slice = ts[5:]
+        ts2 = ts_slice.copy()
+        ts2.index = [x.date() for x in ts2.index]
+
+        result = ts + ts2
+        result2 = ts2 + ts
+        expected = ts + ts[5:]
+        expected.index = expected.index._with_freq(None)
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result2, expected)
+
+
+class TestNamePreservation:
+    @pytest.mark.parametrize("box", [list, tuple, np.array, Index, Series, pd.array])
+    @pytest.mark.parametrize("flex", [True, False])
+    def test_series_ops_name_retention(self, flex, box, names, all_binary_operators):
+        # GH#33930 consistent name-retention
+        op = all_binary_operators
+
+        left = Series(range(10), name=names[0])
+        right = Series(range(10), name=names[1])
+
+        name = op.__name__.strip("_")
+        is_logical = name in ["and", "rand", "xor", "rxor", "or", "ror"]
+
+        msg = (
+            r"Logical ops \(and, or, xor\) between Pandas objects and "
+            "dtype-less sequences"
+        )
+
+        right = box(right)
+        if flex:
+            if is_logical:
+                # Series doesn't have these as flex methods
+                return
+            result = getattr(left, name)(right)
+        else:
+            if is_logical and box in [list, tuple]:
+                with pytest.raises(TypeError, match=msg):
+                    # GH#52264 logical ops with dtype-less sequences deprecated
+                    op(left, right)
+                return
+            result = op(left, right)
+
+        assert isinstance(result, Series)
+        if box in [Index, Series]:
+            assert result.name is names[2] or result.name == names[2]
+        else:
+            assert result.name is names[0] or result.name == names[0]
+
+    def test_binop_maybe_preserve_name(self, datetime_series):
+        # names match, preserve
+        result = datetime_series * datetime_series
+        assert result.name == datetime_series.name
+        result = datetime_series.mul(datetime_series)
+        assert result.name == datetime_series.name
+
+        result = datetime_series * datetime_series[:-2]
+        assert result.name == datetime_series.name
+
+        # names don't match, don't preserve
+        cp = datetime_series.copy()
+        cp.name = "something else"
+        result = datetime_series + cp
+        assert result.name is None
+        result = datetime_series.add(cp)
+        assert result.name is None
+
+        ops = ["add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow"]
+        ops = ops + ["r" + op for op in ops]
+        for op in ops:
+            # names match, preserve
+            ser = datetime_series.copy()
+            result = getattr(ser, op)(ser)
+            assert result.name == datetime_series.name
+
+            # names don't match, don't preserve
+            cp = datetime_series.copy()
+            cp.name = "changed"
+            result = getattr(ser, op)(cp)
+            assert result.name is None
+
+    def test_scalarop_preserve_name(self, datetime_series):
+        result = datetime_series * 2
+        assert result.name == datetime_series.name
+
+
+class TestInplaceOperations:
+    @pytest.mark.parametrize(
+        "dtype1, dtype2, dtype_expected, dtype_mul",
+        (
+            ("Int64", "Int64", "Int64", "Int64"),
+            ("float", "float", "float", "float"),
+            ("Int64", "float", "Float64", "Float64"),
+            ("Int64", "Float64", "Float64", "Float64"),
+        ),
+    )
+    def test_series_inplace_ops(self, dtype1, dtype2, dtype_expected, dtype_mul):
+        # GH 37910
+
+        ser1 = Series([1], dtype=dtype1)
+        ser2 = Series([2], dtype=dtype2)
+        ser1 += ser2
+        expected = Series([3], dtype=dtype_expected)
+        tm.assert_series_equal(ser1, expected)
+
+        ser1 -= ser2
+        expected = Series([1], dtype=dtype_expected)
+        tm.assert_series_equal(ser1, expected)
+
+        ser1 *= ser2
+        expected = Series([2], dtype=dtype_mul)
+        tm.assert_series_equal(ser1, expected)
+
+
+def test_none_comparison(request, series_with_simple_index):
+    series = series_with_simple_index
+
+    if len(series) < 1:
+        request.applymarker(
+            pytest.mark.xfail(reason="Test doesn't make sense on empty data")
+        )
+
+    # bug brought up by #1079
+    # changed from TypeError in 0.17.0
+    series.iloc[0] = np.nan
+
+    # noinspection PyComparisonWithNone
+    result = series == None  # noqa: E711
+    assert not result.iat[0]
+    assert not result.iat[1]
+
+    # noinspection PyComparisonWithNone
+    result = series != None  # noqa: E711
+    assert result.iat[0]
+    assert result.iat[1]
+
+    result = None == series  # noqa: E711
+    assert not result.iat[0]
+    assert not result.iat[1]
+
+    result = None != series  # noqa: E711
+    assert result.iat[0]
+    assert result.iat[1]
+
+    if lib.is_np_dtype(series.dtype, "M") or isinstance(series.dtype, DatetimeTZDtype):
+        # Following DatetimeIndex (and Timestamp) convention,
+        # inequality comparisons with Series[datetime64] raise
+        msg = "Invalid comparison"
+        with pytest.raises(TypeError, match=msg):
+            None > series
+        with pytest.raises(TypeError, match=msg):
+            series > None
+    else:
+        result = None > series
+        assert not result.iat[0]
+        assert not result.iat[1]
+
+        result = series < None
+        assert not result.iat[0]
+        assert not result.iat[1]
+
+
+def test_series_varied_multiindex_alignment():
+    # GH 20414
+    s1 = Series(
+        range(8),
+        index=pd.MultiIndex.from_product(
+            [list("ab"), list("xy"), [1, 2]], names=["ab", "xy", "num"]
+        ),
+    )
+    s2 = Series(
+        [1000 * i for i in range(1, 5)],
+        index=pd.MultiIndex.from_product([list("xy"), [1, 2]], names=["xy", "num"]),
+    )
+    result = s1.loc[pd.IndexSlice[["a"], :, :]] + s2
+    expected = Series(
+        [1000, 2001, 3002, 4003],
+        index=pd.MultiIndex.from_tuples(
+            [("a", "x", 1), ("a", "x", 2), ("a", "y", 1), ("a", "y", 2)],
+            names=["ab", "xy", "num"],
+        ),
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_rmod_consistent_large_series():
+    # GH 29602
+    result = Series([2] * 10001).rmod(-1)
+    expected = Series([1] * 10001)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        date_range("2016-01-01", periods=3),
+        date_range("2016-01-01", tz="US/Pacific", periods=3),
+        pd.timedelta_range("1 Day", periods=3),
+    ],
+)
+def test_comparison_mismatched_datetime_units(index):
+    # GH#63459
+
+    ser = Series(1, index=index)
+    ser2 = Series(1, index=index.as_unit("ns"))
+
+    result = ser == ser2
+    expected = Series([True, True, True], index=ser.index)
+    tm.assert_series_equal(result, expected)
+
+    result2 = ser2 < ser
+    expected2 = Series([False, False, False], index=ser2.index)
+    tm.assert_series_equal(result2, expected2)
diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b4340064ea1bc4827c858d4deff602538240eb2
--- /dev/null
+++ b/pandas/tests/series/test_arrow_interface.py
@@ -0,0 +1,117 @@
+import ctypes
+
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+pa = pytest.importorskip("pyarrow", minversion="16.0")
+
+
+def test_series_arrow_interface():
+    s = pd.Series([1, 4, 2])
+
+    capsule = s.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    ca = pa.chunked_array(s)
+    expected = pa.chunked_array([[1, 4, 2]])
+    assert ca.equals(expected)
+    ca = pa.chunked_array(s, type=pa.int32())
+    expected = pa.chunked_array([[1, 4, 2]], type=pa.int32())
+    assert ca.equals(expected)
+
+
+def test_series_arrow_interface_arrow_dtypes():
+    s = pd.Series([1, 4, 2], dtype="Int64[pyarrow]")
+
+    capsule = s.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    ca = pa.chunked_array(s)
+    expected = pa.chunked_array([[1, 4, 2]])
+    assert ca.equals(expected)
+    ca = pa.chunked_array(s, type=pa.int32())
+    expected = pa.chunked_array([[1, 4, 2]], type=pa.int32())
+    assert ca.equals(expected)
+
+
+def test_series_arrow_interface_stringdtype():
+    s = pd.Series(["foo", "bar"], dtype="string[pyarrow]")
+
+    capsule = s.__arrow_c_stream__()
+    assert (
+        ctypes.pythonapi.PyCapsule_IsValid(
+            ctypes.py_object(capsule), b"arrow_array_stream"
+        )
+        == 1
+    )
+
+    ca = pa.chunked_array(s)
+    expected = pa.chunked_array([["foo", "bar"]], type=pa.large_string())
+    assert ca.equals(expected)
+
+
+class ArrowArrayWrapper:
+    def __init__(self, array):
+        self.array = array
+
+    def __arrow_c_array__(self, requested_schema=None):
+        return self.array.__arrow_c_array__(requested_schema)
+
+
+class ArrowStreamWrapper:
+    def __init__(self, chunked_array):
+        self.stream = chunked_array
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        return self.stream.__arrow_c_stream__(requested_schema)
+
+
+@td.skip_if_no("pyarrow", min_version="14.0")
+def test_dataframe_from_arrow():
+    # objects with __arrow_c_stream__
+    arr = pa.chunked_array([[1, 2, 3], [4, 5]])
+
+    result = pd.Series.from_arrow(arr)
+    expected = pd.Series([1, 2, 3, 4, 5])
+    tm.assert_series_equal(result, expected)
+
+    # not only pyarrow object are supported
+    result = pd.Series.from_arrow(ArrowStreamWrapper(arr))
+    tm.assert_series_equal(result, expected)
+
+    # table works as well, but will be seen as a StructArray
+    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})
+
+    result = pd.Series.from_arrow(table)
+    expected = pd.Series([{"a": 1, "b": "a"}, {"a": 2, "b": "b"}, {"a": 3, "b": "c"}])
+    tm.assert_series_equal(result, expected)
+
+    # objects with __arrow_c_array__
+    arr = pa.array([1, 2, 3])
+
+    expected = pd.Series([1, 2, 3])
+    result = pd.Series.from_arrow(arr)
+    tm.assert_series_equal(result, expected)
+
+    result = pd.Series.from_arrow(ArrowArrayWrapper(arr))
+    tm.assert_series_equal(result, expected)
+
+    # only accept actual Arrow objects
+    with pytest.raises(
+        TypeError, match="Expected an Arrow-compatible array-like object"
+    ):
+        pd.Series.from_arrow([1, 2, 3])
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b0dcb9ec5facdbd08ba5608c64c4a454b6419db
--- /dev/null
+++ b/pandas/tests/series/test_constructors.py
@@ -0,0 +1,2294 @@
+from collections import OrderedDict
+from collections.abc import Iterator
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+from dateutil.tz import tzoffset
+import numpy as np
+from numpy import ma
+import pytest
+
+from pandas._libs import (
+    iNaT,
+    lib,
+)
+from pandas.compat import HAS_PYARROW
+from pandas.compat.numpy import np_version_gt2
+from pandas.errors import (
+    IntCastingNaNError,
+    Pandas4Warning,
+)
+
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    DatetimeIndex,
+    DatetimeTZDtype,
+    Index,
+    Interval,
+    IntervalIndex,
+    MultiIndex,
+    NaT,
+    Period,
+    RangeIndex,
+    Series,
+    Timestamp,
+    date_range,
+    isna,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.core.arrays import (
+    IntegerArray,
+    IntervalArray,
+    period_array,
+)
+from pandas.core.internals.blocks import NumpyBlock
+
+
+class TestSeriesConstructors:
+    def test_from_ints_with_non_nano_dt64_dtype(self, index_or_series):
+        values = np.arange(10)
+
+        res = index_or_series(values, dtype="M8[s]")
+        expected = index_or_series(values.astype("M8[s]"))
+        tm.assert_equal(res, expected)
+
+        res = index_or_series(list(values), dtype="M8[s]")
+        tm.assert_equal(res, expected)
+
+    def test_from_na_value_and_interval_of_datetime_dtype(self):
+        # GH#41805
+        ser = Series([None], dtype="interval[datetime64[ns]]")
+        assert ser.isna().all()
+        assert ser.dtype == "interval[datetime64[ns], right]"
+
+    def test_infer_with_date_and_datetime(self):
+        # GH#49341 pre-2.0 we inferred datetime-and-date to datetime64, which
+        #  was inconsistent with Index behavior
+        ts = Timestamp(2016, 1, 1)
+        vals = [ts.to_pydatetime(), ts.date()]
+
+        ser = Series(vals)
+        expected = Series(vals, dtype=object)
+        tm.assert_series_equal(ser, expected)
+
+        idx = Index(vals)
+        expected = Index(vals, dtype=object)
+        tm.assert_index_equal(idx, expected)
+
+    def test_unparsable_strings_with_dt64_dtype(self):
+        # pre-2.0 these would be silently ignored and come back with object dtype
+        vals = ["aa"]
+        msg = "^Unknown datetime string format, unable to parse: aa$"
+        with pytest.raises(ValueError, match=msg):
+            Series(vals, dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            Series(np.array(vals, dtype=object), dtype="datetime64[ns]")
+
+    def test_invalid_dtype_conversion_datetime_to_timedelta(self):
+        # GH#60728
+        vals = Series([NaT, Timestamp(2025, 1, 1)], dtype="datetime64[ns]")
+        msg = r"^Cannot cast DatetimeArray to dtype timedelta64\[ns\]$"
+        with pytest.raises(TypeError, match=msg):
+            Series(vals, dtype="timedelta64[ns]")
+
+    @pytest.mark.parametrize(
+        "constructor",
+        [
+            # NOTE: some overlap with test_constructor_empty but that test does not
+            # test for None or an empty generator.
+            # test_constructor_pass_none tests None but only with the index also
+            # passed.
+            (lambda idx: Series(index=idx)),
+            (lambda idx: Series(None, index=idx)),
+            (lambda idx: Series({}, index=idx)),
+            (lambda idx: Series((), index=idx)),
+            (lambda idx: Series([], index=idx)),
+            (lambda idx: Series((_ for _ in []), index=idx)),
+            (lambda idx: Series(data=None, index=idx)),
+            (lambda idx: Series(data={}, index=idx)),
+            (lambda idx: Series(data=(), index=idx)),
+            (lambda idx: Series(data=[], index=idx)),
+            (lambda idx: Series(data=(_ for _ in []), index=idx)),
+        ],
+    )
+    @pytest.mark.parametrize("empty_index", [None, []])
+    def test_empty_constructor(self, constructor, empty_index):
+        # GH 49573 (addition of empty_index parameter)
+        expected = Series(index=empty_index)
+        result = constructor(empty_index)
+
+        assert result.dtype == object
+        assert len(result.index) == 0
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_invalid_dtype(self):
+        # GH15520
+        msg = "not understood"
+        invalid_list = [Timestamp, "Timestamp", list]
+        for dtype in invalid_list:
+            with pytest.raises(TypeError, match=msg):
+                Series([], name="time", dtype=dtype)
+
+    def test_invalid_compound_dtype(self):
+        # GH#13296
+        c_dtype = np.dtype([("a", "i8"), ("b", "f4")])
+        cdt_arr = np.array([(1, 0.4), (256, -13)], dtype=c_dtype)
+
+        with pytest.raises(ValueError, match="Use DataFrame instead"):
+            Series(cdt_arr, index=["A", "B"])
+
+    def test_scalar_conversion(self):
+        # Pass in scalar is disabled
+        scalar = Series(0.5)
+        assert not isinstance(scalar, float)
+
+    def test_scalar_extension_dtype(self, ea_scalar_and_dtype):
+        # GH 28401
+
+        ea_scalar, ea_dtype = ea_scalar_and_dtype
+
+        ser = Series(ea_scalar, index=range(3))
+        expected = Series([ea_scalar] * 3, dtype=ea_dtype)
+
+        assert ser.dtype == ea_dtype
+        tm.assert_series_equal(ser, expected)
+
+    def test_constructor(self, datetime_series, using_infer_string):
+        empty_series = Series()
+        assert datetime_series.index._is_all_dates
+
+        # Pass in Series
+        derived = Series(datetime_series)
+        assert derived.index._is_all_dates
+
+        tm.assert_index_equal(derived.index, datetime_series.index)
+        # Ensure new index is not created
+        assert id(datetime_series.index) == id(derived.index)
+
+        # Mixed type Series
+        mixed = Series(["hello", np.nan], index=[0, 1])
+        assert mixed.dtype == np.object_ if not using_infer_string else "str"
+        assert np.isnan(mixed[1])
+
+        assert not empty_series.index._is_all_dates
+        assert not Series().index._is_all_dates
+
+        # exception raised is of type ValueError GH35744
+        with pytest.raises(
+            ValueError,
+            match=r"Data must be 1-dimensional, got ndarray of shape \(3, 3\) instead",
+        ):
+            Series(np.random.default_rng(2).standard_normal((3, 3)), index=np.arange(3))
+
+        mixed.name = "Series"
+        rs = Series(mixed).name
+        xp = "Series"
+        assert rs == xp
+
+        # raise on MultiIndex GH4187
+        m = MultiIndex.from_arrays([[1, 2], [3, 4]])
+        msg = "initializing a Series from a MultiIndex is not supported"
+        with pytest.raises(NotImplementedError, match=msg):
+            Series(m)
+
+    def test_constructor_index_ndim_gt_1_raises(self):
+        # GH#18579
+        df = DataFrame([[1, 2], [3, 4], [5, 6]], index=[3, 6, 9])
+        with pytest.raises(ValueError, match="Index data must be 1-dimensional"):
+            Series([1, 3, 2], index=df)
+
+    @pytest.mark.parametrize("input_class", [list, dict, OrderedDict])
+    def test_constructor_empty(self, input_class, using_infer_string):
+        empty = Series()
+        empty2 = Series(input_class())
+
+        # these are Index() and RangeIndex() which don't compare type equal
+        # but are just .equals
+        tm.assert_series_equal(empty, empty2, check_index_type=False)
+
+        # With explicit dtype:
+        empty = Series(dtype="float64")
+        empty2 = Series(input_class(), dtype="float64")
+        tm.assert_series_equal(empty, empty2, check_index_type=False)
+
+        # GH 18515 : with dtype=category:
+        empty = Series(dtype="category")
+        empty2 = Series(input_class(), dtype="category")
+        tm.assert_series_equal(empty, empty2, check_index_type=False)
+
+        if input_class is not list:
+            # With index:
+            empty = Series(index=range(10))
+            empty2 = Series(input_class(), index=range(10))
+            tm.assert_series_equal(empty, empty2)
+
+            # With index and dtype float64:
+            empty = Series(np.nan, index=range(10))
+            empty2 = Series(input_class(), index=range(10), dtype="float64")
+            tm.assert_series_equal(empty, empty2)
+
+            # GH 19853 : with empty string, index and dtype str
+            empty = Series("", dtype=str, index=range(3))
+            if using_infer_string:
+                empty2 = Series("", index=range(3), dtype="str")
+            else:
+                empty2 = Series("", index=range(3))
+            tm.assert_series_equal(empty, empty2)
+
+    @pytest.mark.parametrize("input_arg", [np.nan, float("nan")])
+    def test_constructor_nan(self, input_arg):
+        empty = Series(dtype="float64", index=range(10))
+        empty2 = Series(input_arg, index=range(10))
+
+        tm.assert_series_equal(empty, empty2, check_index_type=False)
+
+    @pytest.mark.parametrize(
+        "dtype",
+        ["f8", "i8", "M8[ns]", "m8[ns]", "category", "object", "datetime64[ns, UTC]"],
+    )
+    @pytest.mark.parametrize("index", [None, Index([])])
+    def test_constructor_dtype_only(self, dtype, index):
+        # GH-20865
+        result = Series(dtype=dtype, index=index)
+        assert result.dtype == dtype
+        assert len(result) == 0
+
+    def test_constructor_no_data_index_order(self):
+        result = Series(index=["b", "a", "c"])
+        assert result.index.tolist() == ["b", "a", "c"]
+
+    def test_constructor_no_data_string_type(self):
+        # GH 22477
+        result = Series(index=[1], dtype=str)
+        assert np.isnan(result.iloc[0])
+
+    @pytest.mark.parametrize("item", ["entry", "ѐ", 13])
+    def test_constructor_string_element_string_type(self, item):
+        # GH 22477
+        result = Series(item, index=[1], dtype=str)
+        assert result.iloc[0] == str(item)
+
+    def test_constructor_dtype_str_na_values(self, string_dtype):
+        # https://github.com/pandas-dev/pandas/issues/21083
+        ser = Series(["x", None], dtype=string_dtype)
+        result = ser.isna()
+        expected = Series([False, True])
+        tm.assert_series_equal(result, expected)
+        assert ser.iloc[1] is None
+
+        ser = Series(["x", np.nan], dtype=string_dtype)
+        assert np.isnan(ser.iloc[1])
+
+    def test_constructor_series(self):
+        index1 = ["d", "b", "a", "c"]
+        index2 = sorted(index1)
+        s1 = Series([4, 7, -5, 3], index=index1)
+        s2 = Series(s1, index=index2)
+
+        tm.assert_series_equal(s2, s1.sort_index())
+
+    def test_constructor_iterable(self):
+        # GH 21987
+        class Iter:
+            def __iter__(self) -> Iterator:
+                yield from range(10)
+
+        expected = Series(list(range(10)), dtype="int64")
+        result = Series(Iter(), dtype="int64")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_sequence(self):
+        # GH 21987
+        expected = Series(list(range(10)), dtype="int64")
+        result = Series(range(10), dtype="int64")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_single_str(self):
+        # GH 21987
+        expected = Series(["abc"])
+        result = Series("abc")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_list_like(self):
+        # make sure that we are coercing different
+        # list-likes to standard dtypes and not
+        # platform specific
+        expected = Series([1, 2, 3], dtype="int64")
+        for obj in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype="int64")]:
+            result = Series(obj, index=[0, 1, 2])
+            tm.assert_series_equal(result, expected)
+
+    def test_constructor_boolean_index(self):
+        # GH#18579
+        s1 = Series([1, 2, 3], index=[4, 5, 6])
+
+        index = s1 == 2
+        result = Series([1, 3, 2], index=index)
+        expected = Series([1, 3, 2], index=[False, True, False])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["bool", "int32", "int64", "float64"])
+    def test_constructor_index_dtype(self, dtype):
+        # GH 17088
+
+        s = Series(Index([0, 2, 4]), dtype=dtype)
+        assert s.dtype == dtype
+
+    @pytest.mark.parametrize(
+        "input_vals",
+        [
+            [1, 2],
+            ["1", "2"],
+            list(date_range("1/1/2011", periods=2, freq="h")),
+            list(date_range("1/1/2011", periods=2, freq="h", tz="US/Eastern")),
+            [Interval(left=0, right=5)],
+        ],
+    )
+    def test_constructor_list_str(self, input_vals, string_dtype):
+        # GH 16605
+        # Ensure that data elements from a list are converted to strings
+        # when dtype is str, 'str', or 'U'
+        result = Series(input_vals, dtype=string_dtype)
+        expected = Series(input_vals).astype(string_dtype)
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_list_str_na(self, string_dtype):
+        result = Series([1.0, 2.0, np.nan], dtype=string_dtype)
+        expected = Series(["1.0", "2.0", np.nan], dtype=object)
+        tm.assert_series_equal(result, expected)
+        assert np.isnan(result[2])
+
+    def test_constructor_generator(self):
+        gen = (i for i in range(10))
+
+        result = Series(gen)
+        exp = Series(range(10))
+        tm.assert_series_equal(result, exp)
+
+        # same but with non-default index
+        gen = (i for i in range(10))
+        result = Series(gen, index=range(10, 20))
+        exp.index = range(10, 20)
+        tm.assert_series_equal(result, exp)
+
+    def test_constructor_map(self):
+        # GH8909
+        m = (x for x in range(10))
+
+        result = Series(m)
+        exp = Series(range(10))
+        tm.assert_series_equal(result, exp)
+
+        # same but with non-default index
+        m = (x for x in range(10))
+        result = Series(m, index=range(10, 20))
+        exp.index = range(10, 20)
+        tm.assert_series_equal(result, exp)
+
+    def test_constructor_categorical(self):
+        msg = "Constructing a Categorical with a dtype and values containing"
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            cat = Categorical([0, 1, 2, 0, 1, 2], ["a", "b", "c"])
+        res = Series(cat)
+        tm.assert_categorical_equal(res.values, cat)
+
+        # can cast to a new dtype
+        result = Series(Categorical([1, 2, 3]), dtype="int64")
+        expected = Series([1, 2, 3], dtype="int64")
+        tm.assert_series_equal(result, expected)
+
+    def test_construct_from_categorical_with_dtype(self):
+        # GH12574
+        ser = Series(Categorical([1, 2, 3]), dtype="category")
+        assert isinstance(ser.dtype, CategoricalDtype)
+
+    def test_construct_intlist_values_category_dtype(self):
+        ser = Series([1, 2, 3], dtype="category")
+        assert isinstance(ser.dtype, CategoricalDtype)
+
+    def test_constructor_categorical_with_coercion(self):
+        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])
+        # test basic creation / coercion of categoricals
+        s = Series(factor, name="A")
+        assert s.dtype == "category"
+        assert len(s) == len(factor)
+
+        # in a frame
+        df = DataFrame({"A": factor})
+        result = df["A"]
+        tm.assert_series_equal(result, s)
+        result = df.iloc[:, 0]
+        tm.assert_series_equal(result, s)
+        assert len(df) == len(factor)
+
+        df = DataFrame({"A": s})
+        result = df["A"]
+        tm.assert_series_equal(result, s)
+        assert len(df) == len(factor)
+
+        # multiples
+        df = DataFrame({"A": s, "B": s, "C": 1})
+        result1 = df["A"]
+        result2 = df["B"]
+        tm.assert_series_equal(result1, s)
+        tm.assert_series_equal(result2, s, check_names=False)
+        assert result2.name == "B"
+        assert len(df) == len(factor)
+
+    def test_constructor_categorical_with_coercion2(self):
+        # GH8623
+        x = DataFrame(
+            [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
+            columns=["person_id", "person_name"],
+        )
+        x["person_name"] = Categorical(x.person_name)  # doing this breaks transform
+
+        expected = x.iloc[0].person_name
+        result = x.person_name.iloc[0]
+        assert result == expected
+
+        result = x.person_name[0]
+        assert result == expected
+
+        result = x.person_name.loc[0]
+        assert result == expected
+
+    def test_constructor_series_to_categorical(self):
+        # see GH#16524: test conversion of Series to Categorical
+        series = Series(["a", "b", "c"])
+
+        result = Series(series, dtype="category")
+        expected = Series(["a", "b", "c"], dtype="category")
+
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_categorical_dtype(self):
+        result = Series(
+            ["a", "b"], dtype=CategoricalDtype(["a", "b", "c"], ordered=True)
+        )
+        assert isinstance(result.dtype, CategoricalDtype)
+        tm.assert_index_equal(result.cat.categories, Index(["a", "b", "c"]))
+        assert result.cat.ordered
+
+        result = Series(["a", "b"], dtype=CategoricalDtype(["b", "a"]))
+        assert isinstance(result.dtype, CategoricalDtype)
+        tm.assert_index_equal(result.cat.categories, Index(["b", "a"]))
+        assert result.cat.ordered is False
+
+        # GH 19565 - Check broadcasting of scalar with Categorical dtype
+        result = Series(
+            "a", index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True)
+        )
+        expected = Series(
+            ["a", "a"], index=[0, 1], dtype=CategoricalDtype(["a", "b"], ordered=True)
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_categorical_string(self):
+        # GH 26336: the string 'category' maintains existing CategoricalDtype
+        cdt = CategoricalDtype(categories=list("dabc"), ordered=True)
+        expected = Series(list("abcabc"), dtype=cdt)
+
+        # Series(Categorical, dtype='category') keeps existing dtype
+        cat = Categorical(list("abcabc"), dtype=cdt)
+        result = Series(cat, dtype="category")
+        tm.assert_series_equal(result, expected)
+
+        # Series(Series[Categorical], dtype='category') keeps existing dtype
+        result = Series(result, dtype="category")
+        tm.assert_series_equal(result, expected)
+
+    def test_categorical_sideeffects_free(self):
+        # Passing a categorical to a Series and then changing values in either
+        # the series or the categorical should not change the values in the
+        # other one, IF you specify copy!
+        cat = Categorical(["a", "b", "c", "a"])
+        s = Series(cat, copy=True)
+        assert s.cat is not cat
+        s = s.cat.rename_categories([1, 2, 3])
+        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
+        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
+        tm.assert_numpy_array_equal(s.__array__(), exp_s)
+        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+        # setting
+        s[0] = 2
+        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
+        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
+        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
+
+        # however, copy is False by default
+        # so this WILL change values
+        cat = Categorical(["a", "b", "c", "a"])
+        s = Series(cat, copy=False)
+        assert s._values is cat
+        s = s.cat.rename_categories([1, 2, 3])
+        assert s._values is not cat
+        exp_s = np.array([1, 2, 3, 1], dtype=np.int64)
+        tm.assert_numpy_array_equal(s.__array__(), exp_s)
+
+        s[0] = 2
+        exp_s2 = np.array([2, 2, 3, 1], dtype=np.int64)
+        tm.assert_numpy_array_equal(s.__array__(), exp_s2)
+
+    def test_unordered_compare_equal(self):
+        left = Series(["a", "b", None], dtype=CategoricalDtype(["a", "b"]))
+        right = Series(Categorical(["a", "b", np.nan], categories=["a", "b"]))
+        tm.assert_series_equal(left, right)
+
+    def test_constructor_maskedarray(self):
+        data = ma.masked_all((3,), dtype=float)
+        result = Series(data)
+        expected = Series([np.nan, np.nan, np.nan])
+        tm.assert_series_equal(result, expected)
+
+        data[0] = 0.0
+        data[2] = 2.0
+        index = ["a", "b", "c"]
+        result = Series(data, index=index)
+        expected = Series([0.0, np.nan, 2.0], index=index)
+        tm.assert_series_equal(result, expected)
+
+        data[1] = 1.0
+        result = Series(data, index=index)
+        expected = Series([0.0, 1.0, 2.0], index=index)
+        tm.assert_series_equal(result, expected)
+
+        data = ma.masked_all((3,), dtype=int)
+        result = Series(data)
+        expected = Series([np.nan, np.nan, np.nan], dtype=float)
+        tm.assert_series_equal(result, expected)
+
+        data[0] = 0
+        data[2] = 2
+        index = ["a", "b", "c"]
+        result = Series(data, index=index)
+        expected = Series([0, np.nan, 2], index=index, dtype=float)
+        tm.assert_series_equal(result, expected)
+
+        data[1] = 1
+        result = Series(data, index=index)
+        expected = Series([0, 1, 2], index=index, dtype=int)
+        with pytest.raises(AssertionError, match="Series classes are different"):
+            # TODO should this be raising at all?
+            # https://github.com/pandas-dev/pandas/issues/56131
+            tm.assert_series_equal(result, expected)
+
+        data = ma.masked_all((3,), dtype=bool)
+        result = Series(data)
+        expected = Series([np.nan, np.nan, np.nan], dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        data[0] = True
+        data[2] = False
+        index = ["a", "b", "c"]
+        result = Series(data, index=index)
+        expected = Series([True, np.nan, False], index=index, dtype=object)
+        tm.assert_series_equal(result, expected)
+
+        data[1] = True
+        result = Series(data, index=index)
+        expected = Series([True, True, False], index=index, dtype=bool)
+        with pytest.raises(AssertionError, match="Series classes are different"):
+            # TODO should this be raising at all?
+            # https://github.com/pandas-dev/pandas/issues/56131
+            tm.assert_series_equal(result, expected)
+
+        data = ma.masked_all((3,), dtype="M8[ns]")
+        result = Series(data)
+        expected = Series([iNaT, iNaT, iNaT], dtype="M8[ns]")
+        tm.assert_series_equal(result, expected)
+
+        data[0] = datetime(2001, 1, 1)
+        data[2] = datetime(2001, 1, 3)
+        index = ["a", "b", "c"]
+        result = Series(data, index=index)
+        expected = Series(
+            [datetime(2001, 1, 1), iNaT, datetime(2001, 1, 3)],
+            index=index,
+            dtype="M8[ns]",
+        )
+        tm.assert_series_equal(result, expected)
+
+        data[1] = datetime(2001, 1, 2)
+        result = Series(data, index=index)
+        expected = Series(
+            [datetime(2001, 1, 1), datetime(2001, 1, 2), datetime(2001, 1, 3)],
+            index=index,
+            dtype="M8[ns]",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_maskedarray_hardened(self):
+        # Check numpy masked arrays with hard masks -- from GH24574
+        data = ma.masked_all((3,), dtype=float).harden_mask()
+        result = Series(data)
+        expected = Series([np.nan, np.nan, np.nan])
+        tm.assert_series_equal(result, expected)
+
+    def test_series_ctor_plus_datetimeindex(self):
+        rng = date_range("20090415", "20090519", freq="B")
+        data = dict.fromkeys(rng, 1)
+
+        result = Series(data, index=rng)
+        assert result.index.is_(rng)
+
+    def test_constructor_default_index(self):
+        s = Series([0, 1, 2])
+        tm.assert_index_equal(s.index, Index(range(3)), exact=True)
+
+    @pytest.mark.parametrize(
+        "input",
+        [
+            [1, 2, 3],
+            (1, 2, 3),
+            list(range(3)),
+            Categorical(["a", "b", "a"]),
+            (i for i in range(3)),
+            (x for x in range(3)),
+        ],
+    )
+    def test_constructor_index_mismatch(self, input):
+        # GH 19342
+        # test that construction of a Series with an index of different length
+        # raises an error
+        msg = r"Length of values \(3\) does not match length of index \(4\)"
+        with pytest.raises(ValueError, match=msg):
+            Series(input, index=np.arange(4))
+
+    def test_constructor_numpy_scalar(self):
+        # GH 19342
+        # construction with a numpy scalar
+        # should not raise
+        result = Series(np.array(100), index=np.arange(4), dtype="int64")
+        expected = Series(100, index=np.arange(4), dtype="int64")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_broadcast_list(self):
+        # GH 19342
+        # construction with single-element container and index
+        # should raise
+        msg = r"Length of values \(1\) does not match length of index \(3\)"
+        with pytest.raises(ValueError, match=msg):
+            Series(["foo"], index=["a", "b", "c"])
+
+    def test_constructor_corner(self):
+        df = DataFrame(range(5), index=date_range("2020-01-01", periods=5))
+        objs = [df, df]
+        s = Series(objs, index=[0, 1])
+        assert isinstance(s, Series)
+
+    def test_constructor_sanitize(self):
+        s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8")
+        assert s.dtype == np.dtype("i8")
+
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+        with pytest.raises(IntCastingNaNError, match=msg):
+            Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8")
+
+    def test_constructor_copy(self):
+        # GH15125
+        # test dtype parameter has no side effects on copy=True
+        for data in [[1.0], np.array([1.0])]:
+            x = Series(data)
+            y = Series(x, copy=True, dtype=float)
+
+            # copy=True maintains original data in Series
+            tm.assert_series_equal(x, y)
+
+            # changes to origin of copy does not affect the copy
+            x[0] = 2.0
+            assert not x.equals(y)
+            assert x[0] == 2.0
+            assert y[0] == 1.0
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            date_range("20170101", periods=3, tz="US/Eastern"),
+            date_range("20170101", periods=3),
+            timedelta_range("1 day", periods=3),
+            period_range("2012Q1", periods=3, freq="Q"),
+            Index(list("abc")),
+            Index([1, 2, 3]),
+            RangeIndex(0, 3),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_constructor_limit_copies(self, index):
+        # GH 17449
+        # limit copies of input
+        s = Series(index)
+
+        # we make 1 copy; this is just a smoke test here
+        assert s._mgr.blocks[0].values is not index
+
+    def test_constructor_shallow_copy(self):
+        # constructing a Series from Series with copy=False should still
+        # give a "shallow" copy (share data, not attributes)
+        # https://github.com/pandas-dev/pandas/issues/49523
+        s = Series([1, 2, 3])
+        s_orig = s.copy()
+        s2 = Series(s)
+        assert s2._mgr is not s._mgr
+        # Overwriting index of s2 doesn't change s
+        s2.index = ["a", "b", "c"]
+        tm.assert_series_equal(s, s_orig)
+
+    def test_constructor_pass_none(self):
+        s = Series(None, index=range(5))
+        assert s.dtype == np.float64
+
+        s = Series(None, index=range(5), dtype=object)
+        assert s.dtype == np.object_
+
+        # GH 7431
+        # inference on the index
+        s = Series(index=np.array([None]))
+        expected = Series(index=Index([None]))
+        tm.assert_series_equal(s, expected)
+
+    def test_constructor_pass_nan_nat(self):
+        # GH 13467
+        exp = Series([np.nan, np.nan], dtype=np.float64)
+        assert exp.dtype == np.float64
+        tm.assert_series_equal(Series([np.nan, np.nan]), exp)
+        tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp)
+
+        exp = Series([NaT, NaT])
+        assert exp.dtype == "datetime64[s]"
+        tm.assert_series_equal(Series([NaT, NaT]), exp)
+        tm.assert_series_equal(Series(np.array([NaT, NaT])), exp)
+
+        tm.assert_series_equal(Series([NaT, np.nan]), exp)
+        tm.assert_series_equal(Series(np.array([NaT, np.nan])), exp)
+
+        tm.assert_series_equal(Series([np.nan, NaT]), exp)
+        tm.assert_series_equal(Series(np.array([np.nan, NaT])), exp)
+
+    def test_constructor_cast(self):
+        msg = "could not convert string to float"
+        with pytest.raises(ValueError, match=msg):
+            Series(["a", "b", "c"], dtype=float)
+
+    def test_constructor_signed_int_overflow_raises(self):
+        # GH#41734 disallow silent overflow, enforced in 2.0
+        if np_version_gt2:
+            msg = "The elements provided in the data cannot all be casted to the dtype"
+            err = OverflowError
+        else:
+            msg = "Values are too large to be losslessly converted"
+            err = ValueError
+        with pytest.raises(err, match=msg):
+            Series([1, 200, 923442], dtype="int8")
+
+        with pytest.raises(err, match=msg):
+            Series([1, 200, 923442], dtype="uint8")
+
+    @pytest.mark.parametrize(
+        "values",
+        [
+            np.array([1], dtype=np.uint16),
+            np.array([1], dtype=np.uint32),
+            np.array([1], dtype=np.uint64),
+            [np.uint16(1)],
+            [np.uint32(1)],
+            [np.uint64(1)],
+        ],
+    )
+    def test_constructor_numpy_uints(self, values):
+        # GH#47294
+        value = values[0]
+        result = Series(values)
+
+        assert result[0].dtype == value.dtype
+        assert result[0] == value
+
+    def test_constructor_unsigned_dtype_overflow(self, any_unsigned_int_numpy_dtype):
+        # see gh-15832
+        if np_version_gt2:
+            msg = (
+                f"The elements provided in the data cannot "
+                f"all be casted to the dtype {any_unsigned_int_numpy_dtype}"
+            )
+        else:
+            msg = "Trying to coerce negative values to unsigned integers"
+        with pytest.raises(OverflowError, match=msg):
+            Series([-1], dtype=any_unsigned_int_numpy_dtype)
+
+    def test_constructor_floating_data_int_dtype(self, frame_or_series):
+        # GH#40110
+        arr = np.random.default_rng(2).standard_normal(2)
+
+        # Long-standing behavior (for Series, new in 2.0 for DataFrame)
+        #  has been to ignore the dtype on these;
+        #  not clear if this is what we want long-term
+        # expected = frame_or_series(arr)
+
+        # GH#49599 as of 2.0 we raise instead of silently retaining float dtype
+        msg = "Trying to coerce float values to integer"
+        with pytest.raises(ValueError, match=msg):
+            frame_or_series(arr, dtype="i8")
+
+        with pytest.raises(ValueError, match=msg):
+            frame_or_series(list(arr), dtype="i8")
+
+        # pre-2.0, when we had NaNs, we silently ignored the integer dtype
+        arr[0] = np.nan
+        # expected = frame_or_series(arr)
+
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+        with pytest.raises(IntCastingNaNError, match=msg):
+            frame_or_series(arr, dtype="i8")
+
+        exc = IntCastingNaNError
+        if frame_or_series is Series:
+            # TODO: try to align these
+            exc = ValueError
+            msg = "cannot convert float NaN to integer"
+        with pytest.raises(exc, match=msg):
+            # same behavior if we pass list instead of the ndarray
+            frame_or_series(list(arr), dtype="i8")
+
+        # float array that can be losslessly cast to integers
+        arr = np.array([1.0, 2.0], dtype="float64")
+        expected = frame_or_series(arr.astype("i8"))
+
+        obj = frame_or_series(arr, dtype="i8")
+        tm.assert_equal(obj, expected)
+
+        obj = frame_or_series(list(arr), dtype="i8")
+        tm.assert_equal(obj, expected)
+
+    def test_constructor_coerce_float_fail(self, any_int_numpy_dtype):
+        # see gh-15832
+        # Updated: make sure we treat this list the same as we would treat
+        #  the equivalent ndarray
+        # GH#49599 pre-2.0 we silently retained float dtype, in 2.0 we raise
+        vals = [1, 2, 3.5]
+
+        msg = "Trying to coerce float values to integer"
+        with pytest.raises(ValueError, match=msg):
+            Series(vals, dtype=any_int_numpy_dtype)
+        with pytest.raises(ValueError, match=msg):
+            Series(np.array(vals), dtype=any_int_numpy_dtype)
+
+    def test_constructor_coerce_float_valid(self, float_numpy_dtype):
+        s = Series([1, 2, 3.5], dtype=float_numpy_dtype)
+        expected = Series([1, 2, 3.5]).astype(float_numpy_dtype)
+        tm.assert_series_equal(s, expected)
+
+    def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtype):
+        # GH 22585
+        # Updated: make sure we treat this list the same as we would treat the
+        # equivalent ndarray
+        vals = [1, 2, np.nan]
+        # pre-2.0 this would return with a float dtype, in 2.0 we raise
+
+        msg = "cannot convert float NaN to integer"
+        with pytest.raises(ValueError, match=msg):
+            Series(vals, dtype=any_int_numpy_dtype)
+        msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+        with pytest.raises(IntCastingNaNError, match=msg):
+            Series(np.array(vals), dtype=any_int_numpy_dtype)
+
+    def test_constructor_dtype_no_cast(self):
+        # see gh-1572
+        s = Series([1, 2, 3])
+        s2 = Series(s, dtype=np.int64)
+
+        s2[1] = 5
+        assert s[1] == 2
+
+    def test_constructor_datelike_coercion(self):
+        # GH 9477
+        # incorrectly inferring on dateimelike looking when object dtype is
+        # specified
+        s = Series([Timestamp("20130101"), "NOV"], dtype=object)
+        assert s.iloc[0] == Timestamp("20130101")
+        assert s.iloc[1] == "NOV"
+        assert s.dtype == object
+
+    def test_constructor_datelike_coercion2(self):
+        # the dtype was being reset on the slicing and re-inferred to datetime
+        # even thought the blocks are mixed
+        belly = "216 3T19".split()
+        wing1 = "2T15 4H19".split()
+        wing2 = "416 4T20".split()
+        mat = pd.to_datetime("2016-01-22 2019-09-07".split())
+        df = DataFrame({"wing1": wing1, "wing2": wing2, "mat": mat}, index=belly)
+
+        result = df.loc["3T19"]
+        assert result.dtype == object
+        result = df.loc["216"]
+        assert result.dtype == object
+
+    def test_constructor_mixed_int_and_timestamp(self, frame_or_series):
+        # specifically Timestamp with nanos, not datetimes
+        objs = [Timestamp(9), 10, NaT._value]
+        result = frame_or_series(objs, dtype="M8[ns]")
+
+        expected = frame_or_series([Timestamp(9), Timestamp(10), NaT])
+        tm.assert_equal(result, expected)
+
+    def test_constructor_datetimes_with_nulls(self):
+        # gh-15869
+        for arr in [
+            np.array([None, None, None, None, datetime.now(), None]),
+            np.array([None, None, datetime.now(), None]),
+        ]:
+            result = Series(arr)
+            assert result.dtype == "M8[us]"
+
+    def test_constructor_dtype_datetime64(self):
+        s = Series(iNaT, dtype="M8[ns]", index=range(5))
+        assert isna(s).all()
+
+        # in theory this should be all nulls, but since
+        # we are not specifying a dtype is ambiguous
+        s = Series(iNaT, index=range(5))
+        assert not isna(s).all()
+
+        s = Series(np.nan, dtype="M8[ns]", index=range(5))
+        assert isna(s).all()
+
+        s = Series([datetime(2001, 1, 2, 0, 0), iNaT], dtype="M8[ns]")
+        assert isna(s[1])
+        assert s.dtype == "M8[ns]"
+
+        s = Series([datetime(2001, 1, 2, 0, 0), np.nan], dtype="M8[ns]")
+        assert isna(s[1])
+        assert s.dtype == "M8[ns]"
+
+    def test_constructor_dtype_datetime64_10(self):
+        # GH3416
+        pydates = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)]
+        dates = [np.datetime64(x) for x in pydates]
+
+        ser = Series(dates)
+        assert ser.dtype == "M8[us]"
+
+        ser.iloc[0] = np.nan
+        assert ser.dtype == "M8[us]"
+
+        # GH3414 related
+        expected = Series(pydates, dtype="datetime64[ms]")
+
+        result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]")
+        tm.assert_series_equal(result, expected)
+
+        result = Series(dates, dtype="datetime64[ms]")
+        tm.assert_series_equal(result, expected)
+
+        expected = Series(
+            [NaT, datetime(2013, 1, 2), datetime(2013, 1, 3)], dtype="datetime64[ns]"
+        )
+        result = Series([np.nan, *dates[1:]], dtype="datetime64[ns]")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dtype_datetime64_11(self):
+        pydates = [datetime(2013, 1, 1), datetime(2013, 1, 2), datetime(2013, 1, 3)]
+        dates = [np.datetime64(x) for x in pydates]
+
+        dts = Series(dates, dtype="datetime64[ns]")
+
+        # valid astype
+        dts.astype("int64")
+
+        # invalid casting
+        msg = r"Converting from datetime64\[ns\] to int32 is not supported"
+        with pytest.raises(TypeError, match=msg):
+            dts.astype("int32")
+
+        # ints are ok
+        # we test with np.int64 to get similar results on
+        # windows / 32-bit platforms
+        result = Series(dts, dtype=np.int64)
+        expected = Series(dts.astype(np.int64))
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dtype_datetime64_9(self):
+        # invalid dates can be help as object
+        result = Series([datetime(2, 1, 1)])
+        assert result[0] == datetime(2, 1, 1, 0, 0)
+
+        result = Series([datetime(3000, 1, 1)])
+        assert result[0] == datetime(3000, 1, 1, 0, 0)
+
+    def test_constructor_dtype_datetime64_8(self):
+        # don't mix types
+        result = Series([Timestamp("20130101"), 1], index=["a", "b"])
+        assert result["a"] == Timestamp("20130101")
+        assert result["b"] == 1
+
+    def test_constructor_dtype_datetime64_7(self):
+        # GH6529
+        # coerce datetime64 non-ns properly
+        dates = date_range("01-Jan-2015", "01-Dec-2015", freq="ME")
+        values2 = dates.view(np.ndarray).astype("datetime64[ns]")
+        expected = Series(values2, index=dates)
+
+        for unit in ["s", "D", "ms", "us", "ns"]:
+            dtype = np.dtype(f"M8[{unit}]")
+            values1 = dates.view(np.ndarray).astype(dtype)
+            result = Series(values1, dates)
+            if unit == "D":
+                # for unit="D" we cast to nearest-supported reso, i.e. "s"
+                dtype = np.dtype("M8[s]")
+            assert result.dtype == dtype
+            tm.assert_series_equal(result, expected.astype(dtype))
+
+        # GH 13876
+        # coerce to non-ns to object properly
+        expected = Series(values2, index=dates, dtype=object)
+        for dtype in ["s", "D", "ms", "us", "ns"]:
+            values1 = dates.view(np.ndarray).astype(f"M8[{dtype}]")
+            result = Series(values1, index=dates, dtype=object)
+            tm.assert_series_equal(result, expected)
+
+        # leave datetime.date alone
+        dates2 = np.array([d.date() for d in dates.to_pydatetime()], dtype=object)
+        series1 = Series(dates2, dates)
+        tm.assert_numpy_array_equal(series1.values, dates2)
+        assert series1.dtype == object
+
+    def test_constructor_dtype_datetime64_6(self):
+        # as of 2.0, these no longer infer datetime64 based on the strings,
+        #  matching the Index behavior
+
+        ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
+
+        ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
+
+        ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
+
+        ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
+        assert ser.dtype == object
+
+    def test_constructor_dtype_datetime64_5(self):
+        # tz-aware (UTC and other tz's)
+        # GH 8411
+        dr = date_range("20130101", periods=3)
+        assert Series(dr).iloc[0].tz is None
+        dr = date_range("20130101", periods=3, tz="UTC")
+        assert str(Series(dr).iloc[0].tz) == "UTC"
+        dr = date_range("20130101", periods=3, tz="US/Eastern")
+        assert str(Series(dr).iloc[0].tz) == "US/Eastern"
+
+    def test_constructor_dtype_datetime64_4(self):
+        # non-convertible
+        ser = Series([1479596223000, -1479590, NaT])
+        assert ser.dtype == "object"
+        assert ser[2] is NaT
+        assert "NaT" in str(ser)
+
+    def test_constructor_dtype_datetime64_3(self):
+        # if we passed a NaT it remains
+        ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT])
+        assert ser.dtype == "M8[us]"
+        assert ser[2] is NaT
+        assert "NaT" in str(ser)
+
+    def test_constructor_dtype_datetime64_2(self):
+        # if we passed a nan it remains
+        ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan])
+        assert ser.dtype == "M8[us]"
+        assert ser[2] is NaT
+        assert "NaT" in str(ser)
+
+    def test_constructor_with_datetime_tz(self):
+        # 8260
+        # support datetime64 with tz
+
+        dr = date_range("20130101", periods=3, tz="US/Eastern", unit="ns")
+        s = Series(dr)
+        assert s.dtype.name == "datetime64[ns, US/Eastern]"
+        assert s.dtype == "datetime64[ns, US/Eastern]"
+        assert isinstance(s.dtype, DatetimeTZDtype)
+        assert "datetime64[ns, US/Eastern]" in str(s)
+
+        # export
+        result = s.values
+        assert isinstance(result, np.ndarray)
+        assert result.dtype == "datetime64[ns]"
+
+        exp = DatetimeIndex(result)
+        exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz)
+        tm.assert_index_equal(dr, exp)
+
+        # indexing
+        result = s.iloc[0]
+        assert result == Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern")
+        result = s[0]
+        assert result == Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern")
+
+        result = s[Series([True, True, False], index=s.index)]
+        tm.assert_series_equal(result, s[0:2])
+
+        result = s.iloc[0:1]
+        tm.assert_series_equal(result, Series(dr[0:1]))
+
+        # concat
+        result = pd.concat([s.iloc[0:1], s.iloc[1:]])
+        tm.assert_series_equal(result, s)
+
+        # short str
+        assert "datetime64[ns, US/Eastern]" in str(s)
+
+        # formatting with NaT
+        result = s.shift()
+        assert "datetime64[ns, US/Eastern]" in str(result)
+        assert "NaT" in str(result)
+
+        result = DatetimeIndex(s, freq="infer")
+        tm.assert_index_equal(result, dr)
+
+    def test_constructor_with_datetime_tz5(self):
+        # long str
+        ser = Series(date_range("20130101", periods=1000, tz="US/Eastern", unit="ns"))
+        assert "datetime64[ns, US/Eastern]" in str(ser)
+
+    def test_constructor_with_datetime_tz4(self):
+        # inference
+        ser = Series(
+            [
+                Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific").as_unit("s"),
+                Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific").as_unit("s"),
+            ]
+        )
+        assert ser.dtype == "datetime64[s, US/Pacific]"
+        assert lib.infer_dtype(ser, skipna=True) == "datetime64"
+
+    def test_constructor_with_datetime_tz3(self):
+        ser = Series(
+            [
+                Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
+                Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"),
+            ]
+        )
+        assert ser.dtype == "object"
+        assert lib.infer_dtype(ser, skipna=True) == "datetime"
+
+    def test_constructor_with_datetime_tz2(self):
+        # with all NaT
+        ser = Series(NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]")
+        dti = DatetimeIndex(["NaT", "NaT"], tz="US/Eastern").as_unit("ns")
+        expected = Series(dti)
+        tm.assert_series_equal(ser, expected)
+
+    def test_constructor_no_partial_datetime_casting(self):
+        # GH#40111
+        vals = [
+            "nan",
+            Timestamp("1990-01-01"),
+            "2015-03-14T16:15:14.123-08:00",
+            "2019-03-04T21:56:32.620-07:00",
+            None,
+        ]
+        ser = Series(vals)
+        assert all(ser[i] is vals[i] for i in range(len(vals)))
+
+    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
+    @pytest.mark.parametrize("kind", ["M", "m"])
+    @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"])
+    def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit):
+        # tests all units
+        # gh-19223
+        # TODO: GH#19223 was about .astype, doesn't belong here
+        dtype = f"{kind}8[{unit}]"
+        arr = np.array([1, 2, 3], dtype=arr_dtype)
+        ser = Series(arr)
+        result = ser.astype(dtype)
+
+        expected = Series(arr.astype(dtype))
+
+        if unit in ["ns", "us", "ms", "s"]:
+            assert result.dtype == dtype
+            assert expected.dtype == dtype
+        else:
+            # Otherwise we cast to nearest-supported unit, i.e. seconds
+            assert result.dtype == f"{kind}8[s]"
+            assert expected.dtype == f"{kind}8[s]"
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("arg", ["2013-01-01 00:00:00", NaT, np.nan, None])
+    def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg):
+        # GH 17415: With naive string
+        result = Series([arg], dtype="datetime64[ns, CET]")
+        expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_datetime64_bigendian(self):
+        # GH#30976
+        ms = np.datetime64(1, "ms")
+        arr = np.array([np.datetime64(1, "ms")], dtype=">M8[ms]")
+
+        result = Series(arr)
+        expected = Series([Timestamp(ms)]).astype("M8[ms]")
+        assert expected.dtype == "M8[ms]"
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("interval_constructor", [IntervalIndex, IntervalArray])
+    def test_construction_interval(self, interval_constructor):
+        # construction from interval & array of intervals
+        intervals = interval_constructor.from_breaks(np.arange(3), closed="right")
+        result = Series(intervals)
+        assert result.dtype == "interval[int64, right]"
+        tm.assert_index_equal(Index(result.values), Index(intervals))
+
+    @pytest.mark.parametrize(
+        "data_constructor", [list, np.array], ids=["list", "ndarray[object]"]
+    )
+    def test_constructor_infer_interval(self, data_constructor):
+        # GH 23563: consistent closed results in interval dtype
+        data = [Interval(0, 1), Interval(0, 2), None]
+        result = Series(data_constructor(data))
+        expected = Series(IntervalArray(data))
+        assert result.dtype == "interval[float64, right]"
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data_constructor", [list, np.array], ids=["list", "ndarray[object]"]
+    )
+    def test_constructor_interval_mixed_closed(self, data_constructor):
+        # GH 23563: mixed closed results in object dtype (not interval dtype)
+        data = [Interval(0, 1, closed="both"), Interval(0, 2, closed="neither")]
+        result = Series(data_constructor(data))
+        assert result.dtype == object
+        assert result.tolist() == data
+
+    def test_construction_consistency(self):
+        # make sure that we are not re-localizing upon construction
+        # GH 14928
+        ser = Series(date_range("20130101", periods=3, tz="US/Eastern"))
+
+        result = Series(ser, dtype=ser.dtype)
+        tm.assert_series_equal(result, ser)
+
+        result = Series(ser.dt.tz_convert("UTC"), dtype=ser.dtype)
+        tm.assert_series_equal(result, ser)
+
+        # Pre-2.0 dt64 values were treated as utc, which was inconsistent
+        #  with DatetimeIndex, which treats them as wall times, see GH#33401
+        result = Series(ser.values, dtype=ser.dtype)
+        expected = Series(ser.values).dt.tz_localize(ser.dtype.tz)
+        tm.assert_series_equal(result, expected)
+
+        with tm.assert_produces_warning(None):
+            # one suggested alternative to the deprecated (changed in 2.0) usage
+            middle = Series(ser.values).dt.tz_localize("UTC")
+            result = middle.dt.tz_convert(ser.dtype.tz)
+        tm.assert_series_equal(result, ser)
+
+        with tm.assert_produces_warning(None):
+            # the other suggested alternative to the deprecated usage
+            result = Series(ser.values.view("int64"), dtype=ser.dtype)
+        tm.assert_series_equal(result, ser)
+
+    @pytest.mark.parametrize(
+        "data_constructor", [list, np.array], ids=["list", "ndarray[object]"]
+    )
+    def test_constructor_infer_period(self, data_constructor):
+        data = [Period("2000", "D"), Period("2001", "D"), None]
+        result = Series(data_constructor(data))
+        expected = Series(period_array(data))
+        tm.assert_series_equal(result, expected)
+        assert result.dtype == "Period[D]"
+
+    @pytest.mark.xfail(reason="PeriodDtype Series not supported yet")
+    def test_construct_from_ints_including_iNaT_scalar_period_dtype(self):
+        series = Series([0, 1000, 2000, pd._libs.iNaT], dtype="period[D]")
+
+        val = series[3]
+        assert isna(val)
+
+        series[2] = val
+        assert isna(series[2])
+
+    def test_constructor_period_incompatible_frequency(self):
+        data = [Period("2000", "D"), Period("2001", "Y")]
+        result = Series(data)
+        assert result.dtype == object
+        assert result.tolist() == data
+
+    def test_constructor_periodindex(self):
+        # GH7932
+        # converting a PeriodIndex when put in a Series
+
+        pi = period_range("20130101", periods=5, freq="D")
+        s = Series(pi)
+        assert s.dtype == "Period[D]"
+        expected = Series(pi.astype(object))
+        assert expected.dtype == object
+
+    def test_constructor_dict(self):
+        d = {"a": 0.0, "b": 1.0, "c": 2.0}
+
+        result = Series(d)
+        expected = Series(d, index=sorted(d.keys()))
+        tm.assert_series_equal(result, expected)
+
+        result = Series(d, index=["b", "c", "d", "a"])
+        expected = Series([1, 2, np.nan, 0], index=["b", "c", "d", "a"])
+        tm.assert_series_equal(result, expected)
+
+        pidx = period_range("2020-01-01", periods=10, freq="D")
+        d = {pidx[0]: 0, pidx[1]: 1}
+        result = Series(d, index=pidx)
+        expected = Series(np.nan, pidx, dtype=np.float64)
+        expected.iloc[0] = 0
+        expected.iloc[1] = 1
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dict_list_value_explicit_dtype(self):
+        # GH 18625
+        d = {"a": [[2], [3], [4]]}
+        result = Series(d, index=["a"], dtype="object")
+        expected = Series(d, index=["a"])
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dict_order(self):
+        # GH19018
+        # initialization ordering: by insertion order
+        d = {"b": 1, "a": 0, "c": 2}
+        result = Series(d)
+        expected = Series([1, 0, 2], index=list("bac"))
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dict_extension(self, ea_scalar_and_dtype):
+        ea_scalar, ea_dtype = ea_scalar_and_dtype
+        d = {"a": ea_scalar}
+        result = Series(d, index=["a"])
+        expected = Series(ea_scalar, index=["a"], dtype=ea_dtype)
+
+        assert result.dtype == ea_dtype
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
+    def test_constructor_dict_nan_key(self, value):
+        # GH 18480
+        d = {1: "a", value: "b", float("nan"): "c", 4: "d"}
+        result = Series(d).sort_values()
+        expected = Series(["a", "b", "c", "d"], index=[1, value, np.nan, 4])
+        tm.assert_series_equal(result, expected)
+
+        # MultiIndex:
+        d = {(1, 1): "a", (2, np.nan): "b", (3, value): "c"}
+        result = Series(d).sort_values()
+        expected = Series(
+            ["a", "b", "c"], index=Index([(1, 1), (2, np.nan), (3, value)])
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dict_datetime64_index(self):
+        # GH 9456
+
+        dates_as_str = ["1984-02-19", "1988-11-06", "1989-12-03", "1990-03-15"]
+        values = [42544017.198965244, 1234565, 40512335.181958228, -1]
+
+        def create_data(constructor):
+            return dict(zip((constructor(x) for x in dates_as_str), values))
+
+        data_datetime64 = create_data(np.datetime64)
+        data_datetime = create_data(lambda x: datetime.strptime(x, "%Y-%m-%d"))
+        data_Timestamp = create_data(Timestamp)
+
+        expected = Series(values, (Timestamp(x) for x in dates_as_str))
+
+        result_datetime64 = Series(data_datetime64)
+        result_datetime = Series(data_datetime)
+        result_Timestamp = Series(data_Timestamp)
+
+        tm.assert_series_equal(
+            result_datetime64, expected.set_axis(expected.index.as_unit("s"))
+        )
+        tm.assert_series_equal(result_datetime, expected)
+        tm.assert_series_equal(result_Timestamp, expected)
+
+    def test_constructor_dict_tuple_indexer(self):
+        # GH 12948
+        data = {(1, 1, None): -1.0}
+        result = Series(data)
+        expected = Series(
+            -1.0, index=MultiIndex(levels=[[1], [1], [np.nan]], codes=[[0], [0], [-1]])
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_mapping(self, non_dict_mapping_subclass):
+        # GH 29788
+        ndm = non_dict_mapping_subclass({3: "three"})
+        result = Series(ndm)
+        expected = Series(["three"], index=[3])
+
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_list_of_tuples(self):
+        data = [(1, 1), (2, 2), (2, 3)]
+        s = Series(data)
+        assert list(s) == data
+
+    def test_constructor_tuple_of_tuples(self):
+        data = ((1, 1), (2, 2), (2, 3))
+        s = Series(data)
+        assert tuple(s) == data
+
+    @pytest.mark.parametrize(
+        "data, expected_values, expected_index",
+        [
+            ({(1, 2): 3, (None, 5): 6}, [3, 6], [(1, 2), (None, 5)]),
+            ({(1,): 3, (4, 5): 6}, [3, 6], [(1, None), (4, 5)]),
+        ],
+    )
+    def test_constructor_dict_of_tuples(self, data, expected_values, expected_index):
+        # GH 60695
+        result = Series(data).sort_values()
+        expected = Series(expected_values, index=MultiIndex.from_tuples(expected_index))
+        tm.assert_series_equal(result, expected)
+
+    # https://github.com/pandas-dev/pandas/issues/22698
+    @pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning")
+    def test_fromDict(self, using_infer_string):
+        data = {"a": 0, "b": 1, "c": 2, "d": 3}
+
+        series = Series(data)
+        tm.assert_is_sorted(series.index)
+
+        data = {"a": 0, "b": "1", "c": "2", "d": datetime.now()}
+        series = Series(data)
+        assert series.dtype == np.object_
+
+        data = {"a": 0, "b": "1", "c": "2", "d": "3"}
+        series = Series(data)
+        assert series.dtype == np.object_ if not using_infer_string else "str"
+
+        data = {"a": "0", "b": "1"}
+        series = Series(data, dtype=float)
+        assert series.dtype == np.float64
+
+    def test_fromValue(self, datetime_series, using_infer_string):
+        nans = Series(np.nan, index=datetime_series.index, dtype=np.float64)
+        assert nans.dtype == np.float64
+        assert len(nans) == len(datetime_series)
+
+        strings = Series("foo", index=datetime_series.index)
+        assert strings.dtype == np.object_ if not using_infer_string else "str"
+        assert len(strings) == len(datetime_series)
+
+        d = datetime.now()
+        dates = Series(d, index=datetime_series.index)
+        assert dates.dtype == "M8[us]"
+        assert len(dates) == len(datetime_series)
+
+        # GH12336
+        # Test construction of categorical series from value
+        categorical = Series(0, index=datetime_series.index, dtype="category")
+        expected = Series(0, index=datetime_series.index).astype("category")
+        assert categorical.dtype == "category"
+        assert len(categorical) == len(datetime_series)
+        tm.assert_series_equal(categorical, expected)
+
+    def test_constructor_dtype_timedelta64(self):
+        # basic
+        td = Series([timedelta(days=i) for i in range(3)])
+        assert td.dtype == "timedelta64[us]"
+
+        td = Series([timedelta(days=1)])
+        assert td.dtype == "timedelta64[us]"
+
+        td = Series([timedelta(days=1), timedelta(days=2), np.timedelta64(1, "s")])
+
+        assert td.dtype == "timedelta64[us]"
+
+        # mixed with NaT
+        td = Series([timedelta(days=1), NaT], dtype="m8[ns]")
+        assert td.dtype == "timedelta64[ns]"
+
+        td = Series([timedelta(days=1), np.nan], dtype="m8[ns]")
+        assert td.dtype == "timedelta64[ns]"
+
+        td = Series([np.timedelta64(300000000), NaT], dtype="m8[ns]")
+        assert td.dtype == "timedelta64[ns]"
+
+        # improved inference
+        # GH5689
+        td = Series([np.timedelta64(300000000), NaT])
+        assert td.dtype == "timedelta64[ns]"
+
+        # because iNaT is int, not coerced to timedelta
+        td = Series([np.timedelta64(300000000), iNaT])
+        assert td.dtype == "object"
+
+        td = Series([np.timedelta64(300000000), np.nan])
+        assert td.dtype == "timedelta64[ns]"
+
+        td = Series([NaT, np.timedelta64(300000000)])
+        assert td.dtype == "timedelta64[ns]"
+
+        td = Series([np.timedelta64(1, "s")])
+        assert td.dtype == "timedelta64[s]"
+
+        # valid astype
+        td.astype("int64")
+
+        # invalid casting
+        msg = r"Converting from timedelta64\[s\] to int32 is not supported"
+        with pytest.raises(TypeError, match=msg):
+            td.astype("int32")
+
+        # this is an invalid casting
+        msg = "|".join(
+            [
+                "Could not convert object to NumPy timedelta",
+                "Could not convert 'foo' to NumPy timedelta",
+            ]
+        )
+        with pytest.raises(ValueError, match=msg):
+            Series([timedelta(days=1), "foo"], dtype="m8[ns]")
+
+        # leave as object here
+        td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
+        assert td.dtype == "object"
+
+        # as of 2.0, these no longer infer timedelta64 based on the strings,
+        #  matching Index behavior
+        ser = Series([None, NaT, "1 Day"])
+        assert ser.dtype == object
+
+        ser = Series([np.nan, NaT, "1 Day"])
+        assert ser.dtype == object
+
+        ser = Series([NaT, None, "1 Day"])
+        assert ser.dtype == object
+
+        ser = Series([NaT, np.nan, "1 Day"])
+        assert ser.dtype == object
+
+    # GH 16406
+    def test_constructor_mixed_tz(self):
+        s = Series([Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")])
+        expected = Series(
+            [Timestamp("20130101"), Timestamp("20130101", tz="US/Eastern")],
+            dtype="object",
+        )
+        tm.assert_series_equal(s, expected)
+
+    def test_NaT_scalar(self):
+        series = Series([0, 1000, 2000, iNaT], dtype="M8[ns]")
+
+        val = series[3]
+        assert isna(val)
+
+        series[2] = val
+        assert isna(series[2])
+
+    def test_NaT_cast(self):
+        # GH10747
+        result = Series([np.nan]).astype("M8[ns]")
+        expected = Series([NaT], dtype="M8[ns]")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_name_hashable(self):
+        for n in [777, 777.0, "name", datetime(2001, 11, 11), (1,), "\u05d0"]:
+            for data in [[1, 2, 3], np.ones(3), {"a": 0, "b": 1}]:
+                s = Series(data, name=n)
+                assert s.name == n
+
+    def test_constructor_name_unhashable(self):
+        msg = r"Series\.name must be a hashable type"
+        for n in [["name_list"], np.ones(2), {1: 2}]:
+            for data in [["name_list"], np.ones(2), {1: 2}]:
+                with pytest.raises(TypeError, match=msg):
+                    Series(data, name=n)
+
+    def test_auto_conversion(self):
+        series = Series(list(date_range("1/1/2000", periods=10, unit="ns")))
+        assert series.dtype == "M8[ns]"
+
+    def test_convert_non_ns(self):
+        # convert from a numpy array of non-ns timedelta64
+        arr = np.array([1, 2, 3], dtype="timedelta64[s]")
+        ser = Series(arr)
+        assert ser.dtype == arr.dtype
+
+        tdi = timedelta_range("00:00:01", periods=3, freq="s").as_unit("s")
+        expected = Series(tdi)
+        assert expected.dtype == arr.dtype
+        tm.assert_series_equal(ser, expected)
+
+        # convert from a numpy array of non-ns datetime64
+        arr = np.array(
+            ["2013-01-01", "2013-01-02", "2013-01-03"], dtype="datetime64[D]"
+        )
+        ser = Series(arr)
+        expected = Series(date_range("20130101", periods=3, freq="D"), dtype="M8[s]")
+        assert expected.dtype == "M8[s]"
+        tm.assert_series_equal(ser, expected)
+
+        arr = np.array(
+            ["2013-01-01 00:00:01", "2013-01-01 00:00:02", "2013-01-01 00:00:03"],
+            dtype="datetime64[s]",
+        )
+        ser = Series(arr)
+        expected = Series(
+            date_range("20130101 00:00:01", periods=3, freq="s"), dtype="M8[s]"
+        )
+        assert expected.dtype == "M8[s]"
+        tm.assert_series_equal(ser, expected)
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            date_range("1/1/2000", periods=10),
+            timedelta_range("1 day", periods=10),
+            period_range("2000-Q1", periods=10, freq="Q"),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_constructor_cant_cast_datetimelike(self, index):
+        # floats are not ok
+        # strip Index to convert PeriodIndex -> Period
+        # We don't care whether the error message says
+        # PeriodIndex or PeriodArray
+        msg = f"Cannot cast {type(index).__name__.rstrip('Index')}.*? to "
+
+        with pytest.raises(TypeError, match=msg):
+            Series(index, dtype=float)
+
+        # ints are ok
+        # we test with np.int64 to get similar results on
+        # windows / 32-bit platforms
+        result = Series(index, dtype=np.int64)
+        expected = Series(index.astype(np.int64))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "index",
+        [
+            date_range("1/1/2000", periods=10),
+            timedelta_range("1 day", periods=10),
+            period_range("2000-Q1", periods=10, freq="Q"),
+        ],
+        ids=lambda x: type(x).__name__,
+    )
+    def test_constructor_cast_object(self, index):
+        s = Series(index, dtype=object)
+        exp = Series(index).astype(object)
+        tm.assert_series_equal(s, exp)
+
+        s = Series(Index(index, dtype=object), dtype=object)
+        exp = Series(index).astype(object)
+        tm.assert_series_equal(s, exp)
+
+        s = Series(index.astype(object), dtype=object)
+        exp = Series(index).astype(object)
+        tm.assert_series_equal(s, exp)
+
+    @pytest.mark.parametrize("dtype", [np.datetime64, np.timedelta64])
+    def test_constructor_generic_timestamp_no_frequency(self, dtype, request):
+        # see gh-15524, gh-15987
+        msg = "dtype has no unit. Please pass in"
+
+        if np.dtype(dtype).name not in ["timedelta64", "datetime64"]:
+            mark = pytest.mark.xfail(reason="GH#33890 Is assigned ns unit")
+            request.applymarker(mark)
+
+        with pytest.raises(ValueError, match=msg):
+            Series([], dtype=dtype)
+
+    @pytest.mark.parametrize("unit", ["ps", "as", "fs", "Y", "M", "W", "D", "h", "m"])
+    @pytest.mark.parametrize("kind", ["m", "M"])
+    def test_constructor_generic_timestamp_bad_frequency(self, kind, unit):
+        # see gh-15524, gh-15987
+        # as of 2.0 we raise on any non-supported unit rather than silently
+        #  cast to nanos; previously we only raised for frequencies higher
+        #  than ns
+        dtype = f"{kind}8[{unit}]"
+
+        msg = "dtype=.* is not supported. Supported resolutions are"
+        with pytest.raises(TypeError, match=msg):
+            Series([], dtype=dtype)
+
+        with pytest.raises(TypeError, match=msg):
+            # pre-2.0 the DataFrame cast raised but the Series case did not
+            DataFrame([[0]], dtype=dtype)
+
+    @pytest.mark.parametrize("dtype", [None, "uint8", "category"])
+    def test_constructor_range_dtype(self, dtype):
+        # GH 16804
+        expected = Series([0, 1, 2, 3, 4], dtype=dtype or "int64")
+        result = Series(range(5), dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_range_overflows(self):
+        # GH#30173 range objects that overflow int64
+        rng = range(2**63, 2**63 + 4)
+        ser = Series(rng)
+        expected = Series(list(rng))
+        tm.assert_series_equal(ser, expected)
+        assert list(ser) == list(rng)
+        assert ser.dtype == np.uint64
+
+        rng2 = range(2**63 + 4, 2**63, -1)
+        ser2 = Series(rng2)
+        expected2 = Series(list(rng2))
+        tm.assert_series_equal(ser2, expected2)
+        assert list(ser2) == list(rng2)
+        assert ser2.dtype == np.uint64
+
+        rng3 = range(-(2**63), -(2**63) - 4, -1)
+        ser3 = Series(rng3)
+        expected3 = Series(list(rng3))
+        tm.assert_series_equal(ser3, expected3)
+        assert list(ser3) == list(rng3)
+        assert ser3.dtype == object
+
+        rng4 = range(2**73, 2**73 + 4)
+        ser4 = Series(rng4)
+        expected4 = Series(list(rng4))
+        tm.assert_series_equal(ser4, expected4)
+        assert list(ser4) == list(rng4)
+        assert ser4.dtype == object
+
+    def test_constructor_tz_mixed_data(self):
+        # GH 13051
+        dt_list = [
+            Timestamp("2016-05-01 02:03:37"),
+            Timestamp("2016-04-30 19:03:37-0700", tz="US/Pacific"),
+        ]
+        result = Series(dt_list)
+        expected = Series(dt_list, dtype=object)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("pydt", [True, False])
+    def test_constructor_data_aware_dtype_naive(self, tz_aware_fixture, pydt):
+        # GH#25843, GH#41555, GH#33401
+        tz = tz_aware_fixture
+        ts = Timestamp("2019", tz=tz)
+        if pydt:
+            ts = ts.to_pydatetime()
+
+        msg = (
+            "Cannot convert timezone-aware data to timezone-naive dtype. "
+            r"Use pd.Series\(values\).dt.tz_localize\(None\) instead."
+        )
+        with pytest.raises(ValueError, match=msg):
+            Series([ts], dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            Series(np.array([ts], dtype=object), dtype="datetime64[ns]")
+
+        with pytest.raises(ValueError, match=msg):
+            Series({0: ts}, dtype="datetime64[ns]")
+
+        msg = "Cannot unbox tzaware Timestamp to tznaive dtype"
+        with pytest.raises(TypeError, match=msg):
+            Series(ts, index=[0], dtype="datetime64[ns]")
+
+    def test_constructor_datetime64(self):
+        rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
+        dates = np.asarray(rng)
+
+        series = Series(dates)
+        assert np.issubdtype(series.dtype, np.dtype("M8[ns]"))
+
+    def test_constructor_datetimelike_scalar_to_string_dtype(
+        self, nullable_string_dtype
+    ):
+        # https://github.com/pandas-dev/pandas/pull/33846
+        result = Series("M", index=[1, 2, 3], dtype=nullable_string_dtype)
+        expected = Series(["M", "M", "M"], index=[1, 2, 3], dtype=nullable_string_dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("box", [lambda x: x, np.datetime64])
+    def test_constructor_sparse_datetime64(self, box):
+        # https://github.com/pandas-dev/pandas/issues/35762
+        values = [box("2012-01-01"), box("2013-01-01")]
+        dtype = pd.SparseDtype("datetime64[ns]")
+        result = Series(values, dtype=dtype)
+        arr = pd.arrays.SparseArray(values, dtype=dtype)
+        expected = Series(arr)
+        tm.assert_series_equal(result, expected)
+
+    def test_construction_from_ordered_collection(self):
+        # https://github.com/pandas-dev/pandas/issues/36044
+        result = Series({"a": 1, "b": 2}.keys())
+        expected = Series(["a", "b"])
+        tm.assert_series_equal(result, expected)
+
+        result = Series({"a": 1, "b": 2}.values())
+        expected = Series([1, 2])
+        tm.assert_series_equal(result, expected)
+
+    def test_construction_from_large_int_scalar_no_overflow(self):
+        # https://github.com/pandas-dev/pandas/issues/36291
+        n = 1_000_000_000_000_000_000_000
+        result = Series(n, index=[0])
+        expected = Series(n)
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_list_of_periods_infers_period_dtype(self):
+        series = Series(list(period_range("2000-01-01", periods=10, freq="D")))
+        assert series.dtype == "Period[D]"
+
+        series = Series(
+            [Period("2011-01-01", freq="D"), Period("2011-02-01", freq="D")]
+        )
+        assert series.dtype == "Period[D]"
+
+    def test_constructor_subclass_dict(self, dict_subclass):
+        data = dict_subclass((x, 10.0 * x) for x in range(10))
+        series = Series(data)
+        expected = Series(dict(data.items()))
+        tm.assert_series_equal(series, expected)
+
+    def test_constructor_ordereddict(self):
+        # GH3283
+        data = OrderedDict(
+            (f"col{i}", np.random.default_rng(2).random()) for i in range(12)
+        )
+
+        series = Series(data)
+        expected = Series(list(data.values()), list(data.keys()))
+        tm.assert_series_equal(series, expected)
+
+        # Test with subclass
+        class A(OrderedDict):
+            pass
+
+        series = Series(A(data))
+        tm.assert_series_equal(series, expected)
+
+    @pytest.mark.parametrize(
+        "data, expected_index_multi",
+        [
+            ({("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, True),
+            ({("a",): 0.0, ("a", "b"): 1.0}, True),
+            ({"z": 111.0, ("a", "a"): 0.0, ("b", "a"): 1.0, ("b", "c"): 2.0}, False),
+        ],
+    )
+    def test_constructor_dict_multiindex(self, data, expected_index_multi):
+        # GH#60695
+        result = Series(data)
+
+        if expected_index_multi:
+            expected = Series(
+                list(data.values()),
+                index=MultiIndex.from_tuples(list(data.keys())),
+            )
+            tm.assert_series_equal(result, expected)
+        else:
+            expected = Series(
+                list(data.values()),
+                index=Index(list(data.keys())),
+            )
+            tm.assert_series_equal(result, expected)
+
+    def test_constructor_dict_multiindex_reindex_flat(self):
+        # construction involves reindexing with a MultiIndex corner case
+        data = {("i", "i"): 0, ("i", "j"): 1, ("j", "i"): 2, "j": np.nan}
+        expected = Series(data)
+
+        result = Series(expected[:-1].to_dict(), index=expected.index)
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dict_timedelta_index(self):
+        # GH #12169 : Resample category data with timedelta index
+        # construct Series from dict as data and TimedeltaIndex as index
+        # will result NaN in result Series data
+        expected = Series(
+            data=["A", "B", "C"], index=pd.to_timedelta([0, 10, 20], unit="s")
+        )
+
+        result = Series(
+            data={
+                pd.to_timedelta(0, unit="s"): "A",
+                pd.to_timedelta(10, unit="s"): "B",
+                pd.to_timedelta(20, unit="s"): "C",
+            },
+            index=pd.to_timedelta([0, 10, 20], unit="s"),
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_infer_index_tz(self):
+        values = [188.5, 328.25]
+        tzinfo = tzoffset(None, 7200)
+        index = [
+            datetime(2012, 5, 11, 11, tzinfo=tzinfo),
+            datetime(2012, 5, 11, 12, tzinfo=tzinfo),
+        ]
+        series = Series(data=values, index=index)
+
+        assert series.index.tz == tzinfo
+
+        # it works! GH#2443
+        repr(series.index[0])
+
+    def test_constructor_with_pandas_dtype(self):
+        # going through 2D->1D path
+        vals = [(1,), (2,), (3,)]
+        ser = Series(vals)
+        dtype = ser.array.dtype  # NumpyEADtype
+        ser2 = Series(vals, dtype=dtype)
+        tm.assert_series_equal(ser, ser2)
+
+    def test_constructor_int_dtype_missing_values(self):
+        # GH#43017
+        result = Series(index=[0], dtype="int64")
+        expected = Series(np.nan, index=[0], dtype="float64")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_bool_dtype_missing_values(self):
+        # GH#43018
+        result = Series(index=[0], dtype="bool")
+        expected = Series(True, index=[0], dtype="bool")
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_int64_dtype(self, any_int_dtype):
+        # GH#44923
+        result = Series(["0", "1", "2"], dtype=any_int_dtype)
+        expected = Series([0, 1, 2], dtype=any_int_dtype)
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_raise_on_lossy_conversion_of_strings(self):
+        # GH#44923
+        if not np_version_gt2:
+            raises = pytest.raises(
+                ValueError, match="string values cannot be losslessly cast to int8"
+            )
+        else:
+            raises = pytest.raises(
+                OverflowError, match="The elements provided in the data"
+            )
+        with raises:
+            Series(["128"], dtype="int8")
+
+    def test_constructor_dtype_timedelta_alternative_construct(self):
+        # GH#35465
+        result = Series([1000000, 200000, 3000000], dtype="timedelta64[us]")
+        expected = Series(pd.to_timedelta([1000000, 200000, 3000000], unit="us"))
+        tm.assert_series_equal(result, expected)
+
+    def test_constructor_dtype_timedelta_ns_s_astype_int64(self):
+        # GH#35465
+        result = Series([1000000, 200000, 3000000], dtype="timedelta64[ns]").astype(
+            "int64"
+        )
+        expected = Series([1000000, 200000, 3000000], dtype="timedelta64[s]").astype(
+            "int64"
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings(
+        "ignore:elementwise comparison failed:DeprecationWarning"
+    )
+    @pytest.mark.parametrize("func", [Series, DataFrame, Index, pd.array])
+    def test_constructor_mismatched_null_nullable_dtype(
+        self, func, any_numeric_ea_dtype
+    ):
+        # GH#44514
+        msg = "|".join(
+            [
+                "cannot safely cast non-equivalent object",
+                r"int\(\) argument must be a string, a bytes-like object "
+                "or a (real )?number",
+                r"Cannot cast array data from dtype\('O'\) to dtype\('float64'\) "
+                "according to the rule 'safe'",
+                "object cannot be converted to a FloatingDtype",
+                "'values' contains non-numeric NA",
+            ]
+        )
+
+        for null in [*tm.NP_NAT_OBJECTS, NaT]:
+            with pytest.raises(TypeError, match=msg):
+                func([null, 1.0, 3.0], dtype=any_numeric_ea_dtype)
+
+    def test_series_constructor_ea_int_from_bool(self):
+        # GH#42137
+        result = Series([True, False, True, pd.NA], dtype="Int64")
+        expected = Series([1, 0, 1, pd.NA], dtype="Int64")
+        tm.assert_series_equal(result, expected)
+
+        result = Series([True, False, True], dtype="Int64")
+        expected = Series([1, 0, 1], dtype="Int64")
+        tm.assert_series_equal(result, expected)
+
+    def test_series_constructor_ea_int_from_string_bool(self):
+        # GH#42137
+        with pytest.raises(ValueError, match="invalid literal"):
+            Series(["True", "False", "True", pd.NA], dtype="Int64")
+
+    @pytest.mark.parametrize("val", [1, 1.0])
+    def test_series_constructor_overflow_uint_ea(self, val):
+        # GH#38798
+        max_val = np.iinfo(np.uint64).max - 1
+        result = Series([max_val, val], dtype="UInt64")
+        expected = Series(np.array([max_val, 1], dtype="uint64"), dtype="UInt64")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("val", [1, 1.0])
+    def test_series_constructor_overflow_uint_ea_with_na(self, val):
+        # GH#38798
+        max_val = np.iinfo(np.uint64).max - 1
+        result = Series([max_val, val, pd.NA], dtype="UInt64")
+        expected = Series(
+            IntegerArray(
+                np.array([max_val, 1, 0], dtype="uint64"),
+                np.array([0, 0, 1], dtype=np.bool_),
+            )
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_series_constructor_overflow_uint_with_nan(self):
+        # GH#38798
+        max_val = np.iinfo(np.uint64).max - 1
+        result = Series([max_val, pd.NA], dtype="UInt64")
+        expected = Series(
+            IntegerArray(
+                np.array([max_val, 1], dtype="uint64"),
+                np.array([0, 1], dtype=np.bool_),
+            )
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_series_constructor_ea_all_na(self):
+        # GH#38798
+        result = Series([pd.NA, pd.NA], dtype="UInt64")
+        expected = Series(
+            IntegerArray(
+                np.array([1, 1], dtype="uint64"),
+                np.array([1, 1], dtype=np.bool_),
+            )
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_series_from_index_dtype_equal_does_not_copy(self):
+        # GH#52008
+        idx = Index([1, 2, 3])
+        expected = idx.copy(deep=True)
+        ser = Series(idx, dtype="int64")
+        ser.iloc[0] = 100
+        tm.assert_index_equal(idx, expected)
+
+    def test_series_string_inference(self):
+        # GH#54430
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", "b"])
+        dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan)
+        expected = Series(["a", "b"], dtype=dtype)
+        tm.assert_series_equal(ser, expected)
+
+        expected = Series(["a", 1], dtype="object")
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", 1])
+        tm.assert_series_equal(ser, expected)
+
+    @pytest.mark.parametrize("na_value", [None, np.nan, pd.NA])
+    def test_series_string_with_na_inference(self, na_value):
+        # GH#54430
+        with pd.option_context("future.infer_string", True):
+            ser = Series(["a", na_value])
+        dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan)
+        expected = Series(["a", None], dtype=dtype)
+        tm.assert_series_equal(ser, expected)
+
+    def test_series_string_inference_scalar(self):
+        # GH#54430
+        with pd.option_context("future.infer_string", True):
+            ser = Series("a", index=[1])
+        dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan)
+        expected = Series("a", index=[1], dtype=dtype)
+        tm.assert_series_equal(ser, expected)
+
+    def test_series_string_inference_array_string_dtype(self):
+        # GH#54496
+        with pd.option_context("future.infer_string", True):
+            ser = Series(np.array(["a", "b"]))
+        dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan)
+        expected = Series(["a", "b"], dtype=dtype)
+        tm.assert_series_equal(ser, expected)
+
+    def test_series_string_inference_storage_definition(self):
+        # https://github.com/pandas-dev/pandas/issues/54793
+        # but after PDEP-14 (string dtype), it was decided to keep dtype="string"
+        # returning the NA string dtype, so expected is changed from
+        # "string[pyarrow_numpy]" to "string[python]"
+        expected = Series(
+            ["a", "b"], dtype="string[pyarrow]" if HAS_PYARROW else "string[python]"
+        )
+        with pd.option_context("future.infer_string", True):
+            result = Series(["a", "b"], dtype="string")
+        tm.assert_series_equal(result, expected)
+
+        expected = Series(["a", "b"], dtype=pd.StringDtype(na_value=np.nan))
+        with pd.option_context("future.infer_string", True):
+            result = Series(["a", "b"], dtype="str")
+        tm.assert_series_equal(result, expected)
+
+    def test_series_constructor_infer_string_scalar(self):
+        # GH#55537
+        with pd.option_context("future.infer_string", True):
+            ser = Series("a", index=[1, 2], dtype="string[python]")
+        expected = Series(["a", "a"], index=[1, 2], dtype="string[python]")
+        tm.assert_series_equal(ser, expected)
+        assert ser.dtype.storage == "python"
+
+    def test_series_string_inference_na_first(self):
+        # GH#55655
+        with pd.option_context("future.infer_string", True):
+            result = Series([pd.NA, "b"])
+        dtype = pd.StringDtype("pyarrow" if HAS_PYARROW else "python", na_value=np.nan)
+        expected = Series([None, "b"], dtype=dtype)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("klass", [Series, Index])
+    def test_inference_on_pandas_objects(self, klass):
+        # GH#56012
+        obj = klass([Timestamp("2019-12-31")], dtype=object)
+        # This doesn't do inference
+        result = Series(obj)
+        assert result.dtype == np.object_
+
+
+class TestSeriesConstructorIndexCoercion:
+    def test_series_constructor_datetimelike_index_coercion(self):
+        idx = date_range("2020-01-01", periods=5)
+        ser = Series(
+            np.random.default_rng(2).standard_normal(len(idx)), idx.astype(object)
+        )
+        # as of 2.0, we no longer silently cast the object-dtype index
+        #  to DatetimeIndex GH#39307, GH#23598
+        assert not isinstance(ser.index, DatetimeIndex)
+
+    @pytest.mark.parametrize("container", [None, np.array, Series, Index])
+    @pytest.mark.parametrize("data", [1.0, range(4)])
+    def test_series_constructor_infer_multiindex(self, container, data):
+        indexes = [["a", "a", "b", "b"], ["x", "y", "x", "y"]]
+        if container is not None:
+            indexes = [container(ind) for ind in indexes]
+
+        multi = Series(data, index=indexes)
+        assert isinstance(multi.index, MultiIndex)
+
+    # TODO: make this not cast to object in pandas 3.0
+    @pytest.mark.skipif(
+        not np_version_gt2, reason="StringDType only available in numpy 2 and above"
+    )
+    @pytest.mark.parametrize(
+        "data",
+        [
+            ["a", "b", "c"],
+            ["a", "b", np.nan],
+        ],
+    )
+    def test_np_string_array_object_cast(self, data):
+        from numpy.dtypes import StringDType
+
+        arr = np.array(data, dtype=StringDType())
+        res = Series(arr)
+        assert res.dtype == np.object_
+
+        if data[-1] is np.nan:
+            # as of GH#62522 the comparison op for `res==data` casts data
+            #  using sanitize_array, which casts to 'str' dtype, which does not
+            #  consider string 'nan' to be equal to np.nan,
+            #  (which apparently numpy does?  weird.)
+            assert (res.iloc[:-1] == data[:-1]).all()
+            assert res.iloc[-1] == "nan"
+        else:
+            assert (res == data).all()
+
+
+class TestSeriesConstructorInternals:
+    def test_constructor_no_pandas_array(self):
+        ser = Series([1, 2, 3])
+        result = Series(ser.array)
+        tm.assert_series_equal(ser, result)
+        assert isinstance(result._mgr.blocks[0], NumpyBlock)
+        assert result._mgr.blocks[0].is_numeric
+
+    def test_from_array(self):
+        result = Series(pd.array(["1h", "2h"], dtype="timedelta64[ns]"))
+        assert result._mgr.blocks[0].is_extension is False
+
+        result = Series(pd.array(["2015"], dtype="datetime64[ns]"))
+        assert result._mgr.blocks[0].is_extension is False
+
+    def test_from_list_dtype(self):
+        result = Series(["1h", "2h"], dtype="timedelta64[ns]")
+        assert result._mgr.blocks[0].is_extension is False
+
+        result = Series(["2015"], dtype="datetime64[ns]")
+        assert result._mgr.blocks[0].is_extension is False
+
+
+def test_constructor(rand_series_with_duplicate_datetimeindex):
+    dups = rand_series_with_duplicate_datetimeindex
+    assert isinstance(dups, Series)
+    assert isinstance(dups.index, DatetimeIndex)
+
+
+@pytest.mark.parametrize(
+    "input_dict,expected",
+    [
+        ({0: 0}, np.array([[0]], dtype=np.int64)),
+        ({"a": "a"}, np.array([["a"]], dtype=object)),
+        ({1: 1}, np.array([[1]], dtype=np.int64)),
+    ],
+)
+def test_numpy_array(input_dict, expected):
+    result = np.array([Series(input_dict)])
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_index_ordered_dict_keys():
+    # GH 22077
+
+    param_index = OrderedDict(
+        [
+            ((("a", "b"), ("c", "d")), 1),
+            ((("a", None), ("c", "d")), 2),
+        ]
+    )
+    series = Series([1, 2], index=param_index.keys())
+    expected = Series(
+        [1, 2],
+        index=MultiIndex.from_tuples(
+            [(("a", "b"), ("c", "d")), (("a", None), ("c", "d"))]
+        ),
+    )
+    tm.assert_series_equal(series, expected)
+
+
+@pytest.mark.parametrize(
+    "input_list",
+    [
+        [1, complex("nan"), 2],
+        [1 + 1j, complex("nan"), 2 + 2j],
+    ],
+)
+def test_series_with_complex_nan(input_list):
+    # GH#53627
+    ser = Series(input_list)
+    result = Series(ser.array)
+    assert ser.dtype == "complex128"
+    tm.assert_series_equal(ser, result)
+
+
+def test_dict_keys_rangeindex():
+    result = Series({0: 1, 1: 2})
+    expected = Series([1, 2], index=RangeIndex(2))
+    tm.assert_series_equal(result, expected, check_index_type=True)
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
new file mode 100644
index 0000000000000000000000000000000000000000..db83cf1112e7452df3328dd52b2ebe8a6232161c
--- /dev/null
+++ b/pandas/tests/series/test_cumulative.py
@@ -0,0 +1,284 @@
+"""
+Tests for Series cumulative operations.
+
+See also
+--------
+tests.frame.test_cumulative
+"""
+
+import re
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+methods = {
+    "cumsum": np.cumsum,
+    "cumprod": np.cumprod,
+    "cummin": np.minimum.accumulate,
+    "cummax": np.maximum.accumulate,
+}
+
+
+class TestSeriesCumulativeOps:
+    @pytest.mark.parametrize("func", [np.cumsum, np.cumprod])
+    def test_datetime_series(self, datetime_series, func):
+        tm.assert_numpy_array_equal(
+            func(datetime_series).values,
+            func(np.array(datetime_series)),
+            check_dtype=True,
+        )
+
+        # with missing values
+        ts = datetime_series.copy()
+        ts[::2] = np.nan
+
+        result = func(ts)[1::2]
+        expected = func(np.array(ts.dropna()))
+
+        tm.assert_numpy_array_equal(result.values, expected, check_dtype=False)
+
+    @pytest.mark.parametrize("method", ["cummin", "cummax"])
+    def test_cummin_cummax(self, datetime_series, method):
+        ufunc = methods[method]
+
+        result = getattr(datetime_series, method)().values
+        expected = ufunc(np.array(datetime_series))
+
+        tm.assert_numpy_array_equal(result, expected)
+        ts = datetime_series.copy()
+        ts[::2] = np.nan
+        result = getattr(ts, method)()[1::2]
+        expected = ufunc(ts.dropna())
+
+        result.index = result.index._with_freq(None)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "ts",
+        [
+            pd.Timedelta(0),
+            pd.Timestamp("1999-12-31"),
+            pd.Timestamp("1999-12-31").tz_localize("US/Pacific"),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "method, skipna, exp_tdi",
+        [
+            ["cummax", True, ["NaT", "2 days", "NaT", "2 days", "NaT", "3 days"]],
+            ["cummin", True, ["NaT", "2 days", "NaT", "1 days", "NaT", "1 days"]],
+            [
+                "cummax",
+                False,
+                ["NaT", "NaT", "NaT", "NaT", "NaT", "NaT"],
+            ],
+            [
+                "cummin",
+                False,
+                ["NaT", "NaT", "NaT", "NaT", "NaT", "NaT"],
+            ],
+        ],
+    )
+    def test_cummin_cummax_datetimelike(self, ts, method, skipna, exp_tdi):
+        # with ts==pd.Timedelta(0), we are testing td64; with naive Timestamp
+        #  we are testing datetime64[ns]; with Timestamp[US/Pacific]
+        #  we are testing dt64tz
+        tdi = pd.to_timedelta(["NaT", "2 days", "NaT", "1 days", "NaT", "3 days"])
+        ser = pd.Series(tdi + ts)
+
+        exp_tdi = pd.to_timedelta(exp_tdi)
+        expected = pd.Series(exp_tdi + ts)
+        result = getattr(ser, method)(skipna=skipna)
+        tm.assert_series_equal(expected, result)
+
+    def test_cumsum_datetimelike(self):
+        # GH#57956
+        df = pd.DataFrame(
+            [
+                [pd.Timedelta(0), pd.Timedelta(days=1)],
+                [pd.Timedelta(days=2), pd.NaT],
+                [pd.Timedelta(hours=-6), pd.Timedelta(hours=12)],
+            ]
+        )
+        result = df.cumsum()
+        expected = pd.DataFrame(
+            [
+                [pd.Timedelta(0), pd.Timedelta(days=1)],
+                [pd.Timedelta(days=2), pd.NaT],
+                [pd.Timedelta(days=1, hours=18), pd.Timedelta(days=1, hours=12)],
+            ]
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "func, exp",
+        [
+            ("cummin", "2012-1-1"),
+            ("cummax", "2012-1-2"),
+        ],
+    )
+    def test_cummin_cummax_period(self, func, exp):
+        # GH#28385
+        ser = pd.Series(
+            [pd.Period("2012-1-1", freq="D"), pd.NaT, pd.Period("2012-1-2", freq="D")]
+        )
+        result = getattr(ser, func)(skipna=False)
+        expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, pd.NaT])
+        tm.assert_series_equal(result, expected)
+
+        result = getattr(ser, func)(skipna=True)
+        exp = pd.Period(exp, freq="D")
+        expected = pd.Series([pd.Period("2012-1-1", freq="D"), pd.NaT, exp])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arg",
+        [
+            [False, False, False, True, True, False, False],
+            [False, False, False, False, False, False, False],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "func", [lambda x: x, lambda x: ~x], ids=["identity", "inverse"]
+    )
+    @pytest.mark.parametrize("method", methods.keys())
+    def test_cummethods_bool(self, arg, func, method):
+        # GH#6270
+        # checking Series method vs the ufunc applied to the values
+
+        ser = func(pd.Series(arg))
+        ufunc = methods[method]
+
+        exp_vals = ufunc(ser.values)
+        expected = pd.Series(exp_vals)
+
+        result = getattr(ser, method)()
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "method, expected",
+        [
+            ["cumsum", pd.Series([0, 1, np.nan, 1], dtype=object)],
+            ["cumprod", pd.Series([False, 0, np.nan, 0])],
+            ["cummin", pd.Series([False, False, np.nan, False])],
+            ["cummax", pd.Series([False, True, np.nan, True])],
+        ],
+    )
+    def test_cummethods_bool_in_object_dtype(self, method, expected):
+        ser = pd.Series([False, True, np.nan, False])
+        result = getattr(ser, method)()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "method, order",
+        [
+            ["cummax", "abc"],
+            ["cummin", "cba"],
+        ],
+    )
+    def test_cummax_cummin_on_ordered_categorical(self, method, order):
+        # GH#52335
+        cat = pd.CategoricalDtype(list(order), ordered=True)
+        ser = pd.Series(
+            list("ababcab"),
+            dtype=cat,
+        )
+        result = getattr(ser, method)()
+        expected = pd.Series(
+            list("abbbccc"),
+            dtype=cat,
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "skip, exp",
+        [
+            [True, ["a", np.nan, "b", "b", "c"]],
+            [False, ["a", np.nan, np.nan, np.nan, np.nan]],
+        ],
+    )
+    @pytest.mark.parametrize(
+        "method, order",
+        [
+            ["cummax", "abc"],
+            ["cummin", "cba"],
+        ],
+    )
+    def test_cummax_cummin_ordered_categorical_nan(self, skip, exp, method, order):
+        # GH#52335
+        cat = pd.CategoricalDtype(list(order), ordered=True)
+        ser = pd.Series(
+            ["a", np.nan, "b", "a", "c"],
+            dtype=cat,
+        )
+        result = getattr(ser, method)(skipna=skip)
+        expected = pd.Series(
+            exp,
+            dtype=cat,
+        )
+        tm.assert_series_equal(
+            result,
+            expected,
+        )
+
+    def test_cumprod_timedelta(self):
+        # GH#48111
+        ser = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=3)])
+        with pytest.raises(TypeError, match="cumprod not supported for Timedelta"):
+            ser.cumprod()
+
+    @pytest.mark.parametrize(
+        "data, op, skipna, expected_data",
+        [
+            ([], "cumsum", True, []),
+            ([], "cumsum", False, []),
+            (["x", "z", "y"], "cumsum", True, ["x", "xz", "xzy"]),
+            (["x", "z", "y"], "cumsum", False, ["x", "xz", "xzy"]),
+            (["x", pd.NA, "y"], "cumsum", True, ["x", pd.NA, "xy"]),
+            (["x", pd.NA, "y"], "cumsum", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cumsum", True, [pd.NA, "x", "xy"]),
+            ([pd.NA, "x", "y"], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", True, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cumsum", False, [pd.NA, pd.NA, pd.NA]),
+            ([], "cummin", True, []),
+            ([], "cummin", False, []),
+            (["y", "z", "x"], "cummin", True, ["y", "y", "x"]),
+            (["y", "z", "x"], "cummin", False, ["y", "y", "x"]),
+            (["y", pd.NA, "x"], "cummin", True, ["y", pd.NA, "x"]),
+            (["y", pd.NA, "x"], "cummin", False, ["y", pd.NA, pd.NA]),
+            ([pd.NA, "y", "x"], "cummin", True, [pd.NA, "y", "x"]),
+            ([pd.NA, "y", "x"], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", True, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummin", False, [pd.NA, pd.NA, pd.NA]),
+            ([], "cummax", True, []),
+            ([], "cummax", False, []),
+            (["x", "z", "y"], "cummax", True, ["x", "z", "z"]),
+            (["x", "z", "y"], "cummax", False, ["x", "z", "z"]),
+            (["x", pd.NA, "y"], "cummax", True, ["x", pd.NA, "y"]),
+            (["x", pd.NA, "y"], "cummax", False, ["x", pd.NA, pd.NA]),
+            ([pd.NA, "x", "y"], "cummax", True, [pd.NA, "x", "y"]),
+            ([pd.NA, "x", "y"], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", True, [pd.NA, pd.NA, pd.NA]),
+            ([pd.NA, pd.NA, pd.NA], "cummax", False, [pd.NA, pd.NA, pd.NA]),
+        ],
+    )
+    def test_cum_methods_ea_strings(
+        self, string_dtype_no_object, data, op, skipna, expected_data
+    ):
+        # https://github.com/pandas-dev/pandas/pull/60633 - pyarrow
+        # https://github.com/pandas-dev/pandas/pull/60938 - Python
+        ser = pd.Series(data, dtype=string_dtype_no_object)
+        method = getattr(ser, op)
+        expected = pd.Series(expected_data, dtype=string_dtype_no_object)
+        result = method(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
+    def test_cumprod_pyarrow_strings(self, pyarrow_string_dtype, skipna):
+        # https://github.com/pandas-dev/pandas/pull/60633
+        ser = pd.Series(list("xyz"), dtype=pyarrow_string_dtype)
+        msg = re.escape(f"operation 'cumprod' not supported for dtype '{ser.dtype}'")
+        with pytest.raises(TypeError, match=msg):
+            ser.cumprod(skipna=skipna)
diff --git a/pandas/tests/series/test_formats.py b/pandas/tests/series/test_formats.py
new file mode 100644
index 0000000000000000000000000000000000000000..76c8914e60b76c71a39604c9fde1f8e731fc7e8e
--- /dev/null
+++ b/pandas/tests/series/test_formats.py
@@ -0,0 +1,592 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    Index,
+    Series,
+    date_range,
+    option_context,
+    period_range,
+    timedelta_range,
+)
+
+
+class TestSeriesRepr:
+    def test_multilevel_name_print_0(self):
+        # GH#55415 None does not get printed, but 0 does
+        # (matching DataFrame and flat index behavior)
+        mi = pd.MultiIndex.from_product([range(2, 3), range(3, 4)], names=[0, None])
+        ser = Series(1.5, index=mi)
+
+        res = repr(ser)
+        expected = "0   \n2  3    1.5\ndtype: float64"
+        assert res == expected
+
+    def test_multilevel_name_print(self, lexsorted_two_level_string_multiindex):
+        index = lexsorted_two_level_string_multiindex
+        ser = Series(range(len(index)), index=index, name="sth")
+        expected = [
+            "first  second",
+            "foo    one       0",
+            "       two       1",
+            "       three     2",
+            "bar    one       3",
+            "       two       4",
+            "baz    two       5",
+            "       three     6",
+            "qux    one       7",
+            "       two       8",
+            "       three     9",
+            "Name: sth, dtype: int64",
+        ]
+        expected = "\n".join(expected)
+        assert repr(ser) == expected
+
+    def test_small_name_printing(self):
+        # Test small Series.
+        s = Series([0, 1, 2])
+
+        s.name = "test"
+        assert "Name: test" in repr(s)
+
+        s.name = None
+        assert "Name:" not in repr(s)
+
+    def test_big_name_printing(self):
+        # Test big Series (diff code path).
+        s = Series(range(1000))
+
+        s.name = "test"
+        assert "Name: test" in repr(s)
+
+        s.name = None
+        assert "Name:" not in repr(s)
+
+    def test_empty_name_printing(self):
+        s = Series(index=date_range("20010101", "20020101"), name="test", dtype=object)
+        assert "Name: test" in repr(s)
+
+    @pytest.mark.parametrize("args", [(), (0, -1)])
+    def test_float_range(self, args):
+        str(
+            Series(
+                np.random.default_rng(2).standard_normal(1000),
+                index=np.arange(1000, *args),
+            )
+        )
+
+    def test_empty_object(self):
+        # empty
+        str(Series(dtype=object))
+
+    def test_string(self, string_series):
+        str(string_series)
+        str(string_series.astype(int))
+
+        # with NaNs
+        string_series[5:7] = np.nan
+        str(string_series)
+
+    def test_object(self, object_series):
+        str(object_series)
+
+    def test_datetime(self, datetime_series):
+        str(datetime_series)
+        # with Nones
+        ots = datetime_series.astype("O")
+        ots[::2] = None
+        repr(ots)
+
+    @pytest.mark.parametrize(
+        "name",
+        [
+            "",
+            1,
+            1.2,
+            "foo",
+            "\u03b1\u03b2\u03b3",
+            "loooooooooooooooooooooooooooooooooooooooooooooooooooong",
+            ("foo", "bar", "baz"),
+            (1, 2),
+            ("foo", 1, 2.3),
+            ("\u03b1", "\u03b2", "\u03b3"),
+            ("\u03b1", "bar"),
+        ],
+    )
+    def test_various_names(self, name, string_series):
+        # various names
+        string_series.name = name
+        repr(string_series)
+
+    def test_tuple_name(self):
+        biggie = Series(
+            np.random.default_rng(2).standard_normal(1000),
+            index=np.arange(1000),
+            name=("foo", "bar", "baz"),
+        )
+        repr(biggie)
+
+    @pytest.mark.parametrize("arg", [100, 1001])
+    def test_tidy_repr_name_0(self, arg):
+        # tidy repr
+        ser = Series(np.random.default_rng(2).standard_normal(arg), name=0)
+        rep_str = repr(ser)
+        assert "Name: 0" in rep_str
+
+    def test_newline(self, any_string_dtype):
+        ser = Series(
+            ["a\n\r\tb"],
+            name="a\n\r\td",
+            index=Index(["a\n\r\tf"], dtype=any_string_dtype),
+            dtype=any_string_dtype,
+        )
+        assert "\t" not in repr(ser)
+        assert "\r" not in repr(ser)
+        assert "a\n" not in repr(ser)
+
+    @pytest.mark.parametrize(
+        "name, expected",
+        [
+            ["foo", "Series([], Name: foo, dtype: int64)"],
+            [None, "Series([], dtype: int64)"],
+        ],
+    )
+    def test_empty_int64(self, name, expected):
+        # with empty series (#4651)
+        s = Series([], dtype=np.int64, name=name)
+        assert repr(s) == expected
+
+    def test_repr_bool_fails(self, capsys):
+        s = Series(
+            [
+                DataFrame(np.random.default_rng(2).standard_normal((2, 2)))
+                for i in range(5)
+            ]
+        )
+
+        # It works (with no Cython exception barf)!
+        repr(s)
+
+        captured = capsys.readouterr()
+        assert captured.err == ""
+
+    def test_repr_name_iterable_indexable(self):
+        s = Series([1, 2, 3], name=np.int64(3))
+
+        # it works!
+        repr(s)
+
+        s.name = ("\u05d0",) * 2
+        repr(s)
+
+    def test_repr_max_rows(self):
+        # GH 6863
+        with option_context("display.max_rows", None):
+            str(Series(range(1001)))  # should not raise exception
+
+    def test_unicode_string_with_unicode(self):
+        df = Series(["\u05d0"], name="\u05d1")
+        str(df)
+
+        ser = Series(["\u03c3"] * 10)
+        repr(ser)
+
+        ser2 = Series(["\u05d0"] * 1000)
+        ser2.name = "title1"
+        repr(ser2)
+
+    def test_str_to_bytes_raises(self):
+        # GH 26447
+        df = Series(["abc"], name="abc")
+        msg = "^'str' object cannot be interpreted as an integer$"
+        with pytest.raises(TypeError, match=msg):
+            bytes(df)
+
+    def test_timeseries_repr_object_dtype(self):
+        index = Index(
+            [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object
+        )
+        ts = Series(np.random.default_rng(2).standard_normal(len(index)), index)
+        repr(ts)
+
+        ts = Series(
+            np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
+        )
+        assert repr(ts).splitlines()[-1].startswith("Freq:")
+
+        ts2 = ts.iloc[np.random.default_rng(2).integers(0, len(ts) - 1, 400)]
+        repr(ts2).splitlines()[-1]
+
+    def test_latex_repr(self):
+        pytest.importorskip("jinja2")  # uses Styler implementation
+        result = r"""\begin{tabular}{ll}
+\toprule
+ & 0 \\
+\midrule
+0 & $\alpha$ \\
+1 & b \\
+2 & c \\
+\bottomrule
+\end{tabular}
+"""
+        with option_context(
+            "styler.format.escape", None, "styler.render.repr", "latex"
+        ):
+            s = Series([r"$\alpha$", "b", "c"])
+            assert result == s._repr_latex_()
+
+        assert s._repr_latex_() is None
+
+    def test_index_repr_in_frame_with_nan(self):
+        # see gh-25061
+        i = Index([1, np.nan])
+        s = Series([1, 2], index=i)
+        exp = """1.0    1\nNaN    2\ndtype: int64"""
+
+        assert repr(s) == exp
+
+    def test_series_repr_nat(self):
+        series = Series([0, 1000, 2000, pd.NaT._value], dtype="M8[ns]")
+
+        result = repr(series)
+        expected = (
+            "0   1970-01-01 00:00:00.000000\n"
+            "1   1970-01-01 00:00:00.000001\n"
+            "2   1970-01-01 00:00:00.000002\n"
+            "3                          NaT\n"
+            "dtype: datetime64[ns]"
+        )
+        assert result == expected
+
+    def test_float_repr(self):
+        # GH#35603
+        # check float format when cast to object
+        ser = Series([1.0]).astype(object)
+        expected = "0    1.0\ndtype: object"
+        assert repr(ser) == expected
+
+    def test_different_null_objects(self):
+        # GH#45263
+        ser = Series([1, 2, 3, 4], [True, None, np.nan, pd.NaT])
+        result = repr(ser)
+        expected = "True    1\nNone    2\nNaN     3\nNaT     4\ndtype: int64"
+        assert result == expected
+
+    def test_2d_extension_type(self):
+        # GH#33770
+
+        # Define a stub extension type with just enough code to run Series.__repr__()
+        class DtypeStub(pd.api.extensions.ExtensionDtype):
+            @property
+            def type(self):
+                return np.ndarray
+
+            @property
+            def name(self):
+                return "DtypeStub"
+
+        class ExtTypeStub(pd.api.extensions.ExtensionArray):
+            def __len__(self) -> int:
+                return 2
+
+            def __getitem__(self, ix):
+                return [ix == 1, ix == 0]
+
+            @property
+            def dtype(self):
+                return DtypeStub()
+
+        series = Series(ExtTypeStub(), copy=False)
+        res = repr(series)  # This line crashed before GH#33770 was fixed.
+        expected = "\n".join(
+            ["0    [False True]", "1    [True False]", "dtype: DtypeStub"]
+        )
+        assert res == expected
+
+
+class TestCategoricalRepr:
+    def test_categorical_repr_unicode(self):
+        # see gh-21002
+
+        class County:
+            name = "San Sebastián"
+            state = "PR"
+
+            def __repr__(self) -> str:
+                return self.name + ", " + self.state
+
+        cat = Categorical([County() for _ in range(61)])
+        idx = Index(cat)
+        ser = idx.to_series()
+
+        repr(ser)
+        str(ser)
+
+    def test_categorical_repr(self, using_infer_string):
+        a = Series(Categorical([1, 2, 3, 4]))
+        exp = (
+            "0    1\n1    2\n2    3\n3    4\n"
+            "dtype: category\nCategories (4, int64): [1, 2, 3, 4]"
+        )
+
+        assert exp == a.__str__()
+
+        a = Series(Categorical(["a", "b"] * 25))
+        exp = (
+            "0     a\n1     b\n"
+            "     ..\n"
+            "48    a\n49    b\n"
+            "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']"
+        )
+        if using_infer_string:
+            exp = exp.replace("object", "str")
+        with option_context("display.max_rows", 5):
+            assert exp == repr(a)
+
+        levs = list("abcdefghijklmnopqrstuvwxyz")
+        a = Series(Categorical(["a", "b"], categories=levs, ordered=True))
+        exp = (
+            "0    a\n1    b\n"
+            "dtype: category\n"
+            "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... "
+            "'w' < 'x' < 'y' < 'z']"
+        )
+        if using_infer_string:
+            exp = exp.replace("object", "str")
+        assert exp == a.__str__()
+
+    def test_categorical_series_repr(self):
+        s = Series(Categorical([1, 2, 3]))
+        exp = """0    1
+1    2
+2    3
+dtype: category
+Categories (3, int64): [1, 2, 3]"""
+
+        assert repr(s) == exp
+
+        s = Series(Categorical(np.arange(10)))
+        exp = f"""0    0
+1    1
+2    2
+3    3
+4    4
+5    5
+6    6
+7    7
+8    8
+9    9
+dtype: category
+Categories (10, {np.dtype(int)}): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_ordered(self):
+        s = Series(Categorical([1, 2, 3], ordered=True))
+        exp = """0    1
+1    2
+2    3
+dtype: category
+Categories (3, int64): [1 < 2 < 3]"""
+
+        assert repr(s) == exp
+
+        s = Series(Categorical(np.arange(10), ordered=True))
+        exp = f"""0    0
+1    1
+2    2
+3    3
+4    4
+5    5
+6    6
+7    7
+8    8
+9    9
+dtype: category
+Categories (10, {np.dtype(int)}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_datetime(self):
+        idx = date_range("2011-01-01 09:00", freq="h", periods=5, unit="ns")
+        s = Series(Categorical(idx))
+        exp = """0   2011-01-01 09:00:00
+1   2011-01-01 10:00:00
+2   2011-01-01 11:00:00
+3   2011-01-01 12:00:00
+4   2011-01-01 13:00:00
+dtype: category
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00,
+                                 2011-01-01 12:00:00, 2011-01-01 13:00:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+        idx = date_range(
+            "2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern", unit="ns"
+        )
+        s = Series(Categorical(idx))
+        exp = """0   2011-01-01 09:00:00-05:00
+1   2011-01-01 10:00:00-05:00
+2   2011-01-01 11:00:00-05:00
+3   2011-01-01 12:00:00-05:00
+4   2011-01-01 13:00:00-05:00
+dtype: category
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00,
+                                             2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00,
+                                             2011-01-01 13:00:00-05:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_datetime_ordered(self):
+        idx = date_range("2011-01-01 09:00", freq="h", periods=5, unit="ns")
+        s = Series(Categorical(idx, ordered=True))
+        exp = """0   2011-01-01 09:00:00
+1   2011-01-01 10:00:00
+2   2011-01-01 11:00:00
+3   2011-01-01 12:00:00
+4   2011-01-01 13:00:00
+dtype: category
+Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 <
+                                 2011-01-01 12:00:00 < 2011-01-01 13:00:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+        idx = date_range(
+            "2011-01-01 09:00", freq="h", periods=5, tz="US/Eastern", unit="ns"
+        )
+        s = Series(Categorical(idx, ordered=True))
+        exp = """0   2011-01-01 09:00:00-05:00
+1   2011-01-01 10:00:00-05:00
+2   2011-01-01 11:00:00-05:00
+3   2011-01-01 12:00:00-05:00
+4   2011-01-01 13:00:00-05:00
+dtype: category
+Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 <
+                                             2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 <
+                                             2011-01-01 13:00:00-05:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_period(self):
+        idx = period_range("2011-01-01 09:00", freq="h", periods=5)
+        s = Series(Categorical(idx))
+        exp = """0    2011-01-01 09:00
+1    2011-01-01 10:00
+2    2011-01-01 11:00
+3    2011-01-01 12:00
+4    2011-01-01 13:00
+dtype: category
+Categories (5, period[h]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00,
+                            2011-01-01 13:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+        idx = period_range("2011-01", freq="M", periods=5)
+        s = Series(Categorical(idx))
+        exp = """0    2011-01
+1    2011-02
+2    2011-03
+3    2011-04
+4    2011-05
+dtype: category
+Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]"""
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_period_ordered(self):
+        idx = period_range("2011-01-01 09:00", freq="h", periods=5)
+        s = Series(Categorical(idx, ordered=True))
+        exp = """0    2011-01-01 09:00
+1    2011-01-01 10:00
+2    2011-01-01 11:00
+3    2011-01-01 12:00
+4    2011-01-01 13:00
+dtype: category
+Categories (5, period[h]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
+                            2011-01-01 13:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+        idx = period_range("2011-01", freq="M", periods=5)
+        s = Series(Categorical(idx, ordered=True))
+        exp = """0    2011-01
+1    2011-02
+2    2011-03
+3    2011-04
+4    2011-05
+dtype: category
+Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_timedelta(self):
+        idx = timedelta_range("1 days", periods=5)
+        s = Series(Categorical(idx))
+        exp = """0   1 days
+1   2 days
+2   3 days
+3   4 days
+4   5 days
+dtype: category
+Categories (5, timedelta64[us]): [1 days, 2 days, 3 days, 4 days, 5 days]"""
+
+        assert repr(s) == exp
+
+        idx = timedelta_range("1 hours", periods=10)
+        s = Series(Categorical(idx))
+        exp = """0   0 days 01:00:00
+1   1 days 01:00:00
+2   2 days 01:00:00
+3   3 days 01:00:00
+4   4 days 01:00:00
+5   5 days 01:00:00
+6   6 days 01:00:00
+7   7 days 01:00:00
+8   8 days 01:00:00
+9   9 days 01:00:00
+dtype: category
+Categories (10, timedelta64[us]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00,
+                                   3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00,
+                                   8 days 01:00:00, 9 days 01:00:00]"""  # noqa: E501
+
+        assert repr(s) == exp
+
+    def test_categorical_series_repr_timedelta_ordered(self):
+        idx = timedelta_range("1 days", periods=5)
+        s = Series(Categorical(idx, ordered=True))
+        exp = """0   1 days
+1   2 days
+2   3 days
+3   4 days
+4   5 days
+dtype: category
+Categories (5, timedelta64[us]): [1 days < 2 days < 3 days < 4 days < 5 days]"""
+
+        assert repr(s) == exp
+
+        idx = timedelta_range("1 hours", periods=10)
+        s = Series(Categorical(idx, ordered=True))
+        exp = """0   0 days 01:00:00
+1   1 days 01:00:00
+2   2 days 01:00:00
+3   3 days 01:00:00
+4   4 days 01:00:00
+5   5 days 01:00:00
+6   6 days 01:00:00
+7   7 days 01:00:00
+8   8 days 01:00:00
+9   9 days 01:00:00
+dtype: category
+Categories (10, timedelta64[us]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 <
+                                   3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 <
+                                   8 days 01:00:00 < 9 days 01:00:00]"""  # noqa: E501
+
+        assert repr(s) == exp
diff --git a/pandas/tests/series/test_iteration.py b/pandas/tests/series/test_iteration.py
new file mode 100644
index 0000000000000000000000000000000000000000..db5d80b3798b91b6c3ba212c2dbe7c5c5c56a5a6
--- /dev/null
+++ b/pandas/tests/series/test_iteration.py
@@ -0,0 +1,33 @@
+class TestIteration:
+    def test_keys(self, datetime_series):
+        assert datetime_series.keys() is datetime_series.index
+
+    def test_iter_datetimes(self, datetime_series):
+        for i, val in enumerate(datetime_series):
+            assert val == datetime_series.iloc[i]
+
+    def test_iter_strings(self, string_series):
+        for i, val in enumerate(string_series):
+            assert val == string_series.iloc[i]
+
+    def test_iteritems_datetimes(self, datetime_series):
+        for idx, val in datetime_series.items():
+            assert val == datetime_series[idx]  # noqa: PLR1733
+
+    def test_iteritems_strings(self, string_series):
+        for idx, val in string_series.items():
+            assert val == string_series[idx]  # noqa: PLR1733
+
+        # assert is lazy (generators don't define reverse, lists do)
+        assert not hasattr(string_series.items(), "reverse")
+
+    def test_items_datetimes(self, datetime_series):
+        for idx, val in datetime_series.items():
+            assert val == datetime_series[idx]  # noqa: PLR1733
+
+    def test_items_strings(self, string_series):
+        for idx, val in string_series.items():
+            assert val == string_series[idx]  # noqa: PLR1733
+
+        # assert is lazy (generators don't define reverse, lists do)
+        assert not hasattr(string_series.items(), "reverse")
diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a32a88b77c3a1bd864e819e5041a1844025bc4b5
--- /dev/null
+++ b/pandas/tests/series/test_logical_ops.py
@@ -0,0 +1,510 @@
+from datetime import datetime
+import operator
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    Series,
+    bdate_range,
+)
+import pandas._testing as tm
+from pandas.core import ops
+
+
+class TestSeriesLogicalOps:
+    @pytest.mark.parametrize("bool_op", [operator.and_, operator.or_, operator.xor])
+    def test_bool_operators_with_nas(self, bool_op):
+        # boolean &, |, ^ should work with object arrays and propagate NAs
+        ser = Series(bdate_range("1/1/2000", periods=10), dtype=object)
+        ser[::2] = np.nan
+
+        mask = ser.isna()
+        filled = ser.fillna(ser[0])
+
+        result = bool_op(ser < ser[9], ser > ser[3])
+
+        expected = bool_op(filled < filled[9], filled > filled[3])
+        expected[mask] = False
+        tm.assert_series_equal(result, expected)
+
+    def test_logical_operators_bool_dtype_with_empty(self):
+        # GH#9016: support bitwise op for integer types
+        index = list("bca")
+
+        s_tft = Series([True, False, True], index=index)
+        s_fff = Series([False, False, False], index=index)
+        s_empty = Series([], dtype=object)
+
+        res = s_tft & s_empty
+        expected = s_fff.sort_index()
+        tm.assert_series_equal(res, expected)
+
+        res = s_tft | s_empty
+        expected = s_tft.sort_index()
+        tm.assert_series_equal(res, expected)
+
+    def test_logical_operators_int_dtype_with_int_dtype(self):
+        # GH#9016: support bitwise op for integer types
+
+        s_0123 = Series(range(4), dtype="int64")
+        s_3333 = Series([3] * 4)
+        s_4444 = Series([4] * 4)
+
+        res = s_0123 & s_3333
+        expected = Series(range(4), dtype="int64")
+        tm.assert_series_equal(res, expected)
+
+        res = s_0123 | s_4444
+        expected = Series(range(4, 8), dtype="int64")
+        tm.assert_series_equal(res, expected)
+
+        s_1111 = Series([1] * 4, dtype="int8")
+        res = s_0123 & s_1111
+        expected = Series([0, 1, 0, 1], dtype="int64")
+        tm.assert_series_equal(res, expected)
+
+        res = s_0123.astype(np.int16) | s_1111.astype(np.int32)
+        expected = Series([1, 1, 3, 3], dtype="int32")
+        tm.assert_series_equal(res, expected)
+
+    def test_logical_operators_int_dtype_with_int_scalar(self):
+        # GH#9016: support bitwise op for integer types
+        s_0123 = Series(range(4), dtype="int64")
+
+        res = s_0123 & 0
+        expected = Series([0] * 4)
+        tm.assert_series_equal(res, expected)
+
+        res = s_0123 & 1
+        expected = Series([0, 1, 0, 1])
+        tm.assert_series_equal(res, expected)
+
+    def test_logical_operators_int_dtype_with_float(self):
+        # GH#9016: support bitwise op for integer types
+        s_0123 = Series(range(4), dtype="int64")
+
+        err_msg = (
+            r"Logical ops \(and, or, xor\) between Pandas objects and "
+            "dtype-less sequences"
+        )
+
+        msg = "Cannot perform.+with a dtyped.+array and scalar of type"
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & np.nan
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & 3.14
+        msg = "unsupported operand type.+for &:"
+        with pytest.raises(TypeError, match=err_msg):
+            s_0123 & [0.1, 4, 3.14, 2]
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & np.array([0.1, 4, 3.14, 2])
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & Series([0.1, 4, -3.14, 2])
+
+    def test_logical_operators_int_dtype_with_str(self):
+        s_1111 = Series([1] * 4, dtype="int8")
+
+        err_msg = (
+            r"Logical ops \(and, or, xor\) between Pandas objects and "
+            "dtype-less sequences"
+        )
+
+        msg = "Cannot perform 'and_' with a dtyped.+array and scalar of type"
+        with pytest.raises(TypeError, match=msg):
+            s_1111 & "a"
+        with pytest.raises(TypeError, match=err_msg):
+            s_1111 & ["a", "b", "c", "d"]
+
+    def test_logical_operators_int_dtype_with_bool(self):
+        # GH#9016: support bitwise op for integer types
+        s_0123 = Series(range(4), dtype="int64")
+
+        expected = Series([False] * 4)
+
+        result = s_0123 & False
+        tm.assert_series_equal(result, expected)
+
+        msg = (
+            r"Logical ops \(and, or, xor\) between Pandas objects and "
+            "dtype-less sequences"
+        )
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & [False]
+
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & (False,)
+
+        result = s_0123 ^ False
+        expected = Series([False, True, True, True])
+        tm.assert_series_equal(result, expected)
+
+    def test_logical_operators_int_dtype_with_object(self):
+        # GH#9016: support bitwise op for integer types
+        s_0123 = Series(range(4), dtype="int64")
+
+        result = s_0123 & Series([False, np.nan, False, False])
+        expected = Series([False] * 4)
+        tm.assert_series_equal(result, expected)
+
+        s_abNd = Series(["a", "b", np.nan, "d"])
+        with pytest.raises(
+            TypeError, match="unsupported.* 'int' and 'str'|'rand_' not supported"
+        ):
+            s_0123 & s_abNd
+
+    def test_logical_operators_bool_dtype_with_int(self):
+        index = list("bca")
+
+        s_tft = Series([True, False, True], index=index)
+        s_fff = Series([False, False, False], index=index)
+
+        res = s_tft & 0
+        expected = s_fff
+        tm.assert_series_equal(res, expected)
+
+        res = s_tft & 1
+        expected = s_tft
+        tm.assert_series_equal(res, expected)
+
+    def test_logical_ops_bool_dtype_with_ndarray(self):
+        # make sure we operate on ndarray the same as Series
+        left = Series([True, True, True, False, True])
+        right = [True, False, None, True, np.nan]
+
+        msg = (
+            r"Logical ops \(and, or, xor\) between Pandas objects and "
+            "dtype-less sequences"
+        )
+
+        expected = Series([True, False, False, False, False])
+        with pytest.raises(TypeError, match=msg):
+            left & right
+        result = left & np.array(right)
+        tm.assert_series_equal(result, expected)
+        result = left & Index(right)
+        tm.assert_series_equal(result, expected)
+        result = left & Series(right)
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([True, True, True, True, True])
+        with pytest.raises(TypeError, match=msg):
+            left | right
+        result = left | np.array(right)
+        tm.assert_series_equal(result, expected)
+        result = left | Index(right)
+        tm.assert_series_equal(result, expected)
+        result = left | Series(right)
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([False, True, True, True, True])
+        with pytest.raises(TypeError, match=msg):
+            left ^ right
+        result = left ^ np.array(right)
+        tm.assert_series_equal(result, expected)
+        result = left ^ Index(right)
+        tm.assert_series_equal(result, expected)
+        result = left ^ Series(right)
+        tm.assert_series_equal(result, expected)
+
+    def test_logical_operators_int_dtype_with_bool_dtype_and_reindex(self):
+        # GH#9016: support bitwise op for integer types
+
+        index = list("bca")
+
+        s_tft = Series([True, False, True], index=index)
+        s_tft = Series([True, False, True], index=index)
+        s_tff = Series([True, False, False], index=index)
+
+        s_0123 = Series(range(4), dtype="int64")
+
+        # s_0123 will be all false now because of reindexing like s_tft
+        expected = Series([False] * 7, index=[0, 1, 2, 3, "a", "b", "c"])
+        result = s_tft & s_0123
+        tm.assert_series_equal(result, expected)
+
+        # GH#52538: no longer to object type when reindex is needed;
+        # matches DataFrame behavior
+        msg = r"unsupported operand type\(s\) for &: 'float' and 'bool'"
+        with pytest.raises(TypeError, match=msg):
+            s_0123 & s_tft
+
+        s_a0b1c0 = Series([1], list("b"))
+
+        res = s_tft & s_a0b1c0
+        expected = s_tff.reindex(list("abc"))
+        tm.assert_series_equal(res, expected)
+
+        res = s_tft | s_a0b1c0
+        expected = s_tft.reindex(list("abc"))
+        tm.assert_series_equal(res, expected)
+
+    def test_scalar_na_logical_ops_corners(self):
+        s = Series([2, 3, 4, 5, 6, 7, 8, 9, 10])
+
+        msg = "Cannot perform.+with a dtyped.+array and scalar of type"
+        with pytest.raises(TypeError, match=msg):
+            s & datetime(2005, 1, 1)
+
+        s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)])
+        s[::2] = np.nan
+
+        expected = Series(True, index=s.index)
+        expected[::2] = False
+
+        msg = (
+            r"Logical ops \(and, or, xor\) between Pandas objects and "
+            "dtype-less sequences"
+        )
+        with pytest.raises(TypeError, match=msg):
+            s & list(s)
+
+    def test_scalar_na_logical_ops_corners_aligns(self):
+        s = Series([2, 3, 4, 5, 6, 7, 8, 9, datetime(2005, 1, 1)])
+        s[::2] = np.nan
+        d = DataFrame({"A": s})
+
+        expected = DataFrame(False, index=range(9), columns=["A", *list(range(9))])
+
+        result = s & d
+        tm.assert_frame_equal(result, expected)
+
+        result = d & s
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("op", [operator.and_, operator.or_, operator.xor])
+    def test_logical_ops_with_index(self, op):
+        # GH#22092, GH#19792
+        ser = Series([True, True, False, False])
+        idx1 = Index([True, False, True, False])
+        idx2 = Index([1, 0, 1, 0])
+
+        expected = Series([op(ser[n], idx1[n]) for n in range(len(ser))])
+
+        result = op(ser, idx1)
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([op(ser[n], idx2[n]) for n in range(len(ser))], dtype=bool)
+
+        result = op(ser, idx2)
+        tm.assert_series_equal(result, expected)
+
+    def test_reversed_xor_with_index_returns_series(self):
+        # GH#22092, GH#19792 pre-2.0 these were aliased to setops
+        ser = Series([True, True, False, False])
+        idx1 = Index([True, False, True, False], dtype=bool)
+        idx2 = Index([1, 0, 1, 0])
+
+        expected = Series([False, True, True, False])
+        result = idx1 ^ ser
+        tm.assert_series_equal(result, expected)
+
+        result = idx2 ^ ser
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "op",
+        [
+            ops.rand_,
+            ops.ror_,
+        ],
+    )
+    def test_reversed_logical_op_with_index_returns_series(self, op):
+        # GH#22092, GH#19792
+        ser = Series([True, True, False, False])
+        idx1 = Index([True, False, True, False])
+        idx2 = Index([1, 0, 1, 0])
+
+        expected = Series(op(idx1.values, ser.values))
+        result = op(ser, idx1)
+        tm.assert_series_equal(result, expected)
+
+        expected = op(ser, Series(idx2))
+        result = op(ser, idx2)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "op, expected",
+        [
+            (ops.rand_, [False, False]),
+            (ops.ror_, [True, True]),
+            (ops.rxor, [True, True]),
+        ],
+    )
+    def test_reverse_ops_with_index(self, op, expected):
+        # https://github.com/pandas-dev/pandas/pull/23628
+        # multi-set Index ops are buggy, so let's avoid duplicates...
+        # GH#49503
+        ser = Series([True, False])
+        idx = Index([False, True])
+
+        result = op(ser, idx)
+        expected = Series(expected)
+        tm.assert_series_equal(result, expected)
+
+    def test_logical_ops_label_based(self, using_infer_string):
+        # GH#4947
+        # logical ops should be label based
+
+        a = Series([True, False, True], list("bca"))
+        b = Series([False, True, False], list("abc"))
+
+        expected = Series([False, True, False], list("abc"))
+        result = a & b
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([True, True, False], list("abc"))
+        result = a | b
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([True, False, False], list("abc"))
+        result = a ^ b
+        tm.assert_series_equal(result, expected)
+
+        # rhs is bigger
+        a = Series([True, False, True], list("bca"))
+        b = Series([False, True, False, True], list("abcd"))
+
+        expected = Series([False, True, False, False], list("abcd"))
+        result = a & b
+        tm.assert_series_equal(result, expected)
+
+        expected = Series([True, True, False, False], list("abcd"))
+        result = a | b
+        tm.assert_series_equal(result, expected)
+
+        # filling
+
+        # vs empty
+        empty = Series([], dtype=object)
+
+        result = a & empty
+        expected = Series([False, False, False], list("abc"))
+        tm.assert_series_equal(result, expected)
+
+        result = a | empty
+        expected = Series([True, True, False], list("abc"))
+        tm.assert_series_equal(result, expected)
+
+        # vs non-matching
+        result = a & Series([1], ["z"])
+        expected = Series([False, False, False, False], list("abcz"))
+        tm.assert_series_equal(result, expected)
+
+        result = a | Series([1], ["z"])
+        expected = Series([True, True, False, False], list("abcz"))
+        tm.assert_series_equal(result, expected)
+
+        # identity
+        # we would like s[s|e] == s to hold for any e, whether empty or not
+        for e in [
+            empty.copy(),
+            Series([1], ["z"]),
+            Series(np.nan, b.index),
+            Series(np.nan, a.index),
+        ]:
+            result = a[a | e]
+            tm.assert_series_equal(result, a[a])
+
+        for e in [Series(["z"])]:
+            if using_infer_string:
+                # TODO(infer_string) should this behave differently?
+                # -> https://github.com/pandas-dev/pandas/issues/60234
+                with pytest.raises(
+                    TypeError, match="not supported for dtype|unsupported operand type"
+                ):
+                    result = a[a | e]
+            else:
+                result = a[a | e]
+            tm.assert_series_equal(result, a[a])
+
+        # vs scalars
+        index = list("bca")
+        t = Series([True, False, True])
+
+        for v in [True, 1, 2]:
+            result = Series([True, False, True], index=index) | v
+            expected = Series([True, True, True], index=index)
+            tm.assert_series_equal(result, expected)
+
+        msg = "Cannot perform.+with a dtyped.+array and scalar of type"
+        for v in [np.nan, "foo"]:
+            with pytest.raises(TypeError, match=msg):
+                t | v
+
+        for v in [False, 0]:
+            result = Series([True, False, True], index=index) | v
+            expected = Series([True, False, True], index=index)
+            tm.assert_series_equal(result, expected)
+
+        for v in [True, 1]:
+            result = Series([True, False, True], index=index) & v
+            expected = Series([True, False, True], index=index)
+            tm.assert_series_equal(result, expected)
+
+        for v in [False, 0]:
+            result = Series([True, False, True], index=index) & v
+            expected = Series([False, False, False], index=index)
+            tm.assert_series_equal(result, expected)
+        msg = "Cannot perform.+with a dtyped.+array and scalar of type"
+        for v in [np.nan]:
+            with pytest.raises(TypeError, match=msg):
+                t & v
+
+    def test_logical_ops_df_compat(self):
+        # GH#1134
+        s1 = Series([True, False, True], index=list("ABC"), name="x")
+        s2 = Series([True, True, False], index=list("ABD"), name="x")
+
+        exp = Series([True, False, False, False], index=list("ABCD"), name="x")
+        tm.assert_series_equal(s1 & s2, exp)
+        tm.assert_series_equal(s2 & s1, exp)
+
+        # True | np.nan => True
+        exp_or1 = Series([True, True, True, False], index=list("ABCD"), name="x")
+        tm.assert_series_equal(s1 | s2, exp_or1)
+        # np.nan | True => np.nan, filled with False
+        exp_or = Series([True, True, False, False], index=list("ABCD"), name="x")
+        tm.assert_series_equal(s2 | s1, exp_or)
+
+        # DataFrame doesn't fill nan with False
+        tm.assert_frame_equal(s1.to_frame() & s2.to_frame(), exp.to_frame())
+        tm.assert_frame_equal(s2.to_frame() & s1.to_frame(), exp.to_frame())
+
+        exp = DataFrame({"x": [True, True, np.nan, np.nan]}, index=list("ABCD"))
+        tm.assert_frame_equal(s1.to_frame() | s2.to_frame(), exp_or1.to_frame())
+        tm.assert_frame_equal(s2.to_frame() | s1.to_frame(), exp_or.to_frame())
+
+        # different length
+        s3 = Series([True, False, True], index=list("ABC"), name="x")
+        s4 = Series([True, True, True, True], index=list("ABCD"), name="x")
+
+        exp = Series([True, False, True, False], index=list("ABCD"), name="x")
+        tm.assert_series_equal(s3 & s4, exp)
+        tm.assert_series_equal(s4 & s3, exp)
+
+        # np.nan | True => np.nan, filled with False
+        exp_or1 = Series([True, True, True, False], index=list("ABCD"), name="x")
+        tm.assert_series_equal(s3 | s4, exp_or1)
+        # True | np.nan => True
+        exp_or = Series([True, True, True, True], index=list("ABCD"), name="x")
+        tm.assert_series_equal(s4 | s3, exp_or)
+
+        tm.assert_frame_equal(s3.to_frame() & s4.to_frame(), exp.to_frame())
+        tm.assert_frame_equal(s4.to_frame() & s3.to_frame(), exp.to_frame())
+
+        tm.assert_frame_equal(s3.to_frame() | s4.to_frame(), exp_or1.to_frame())
+        tm.assert_frame_equal(s4.to_frame() | s3.to_frame(), exp_or.to_frame())
+
+    def test_int_dtype_different_index_not_bool(self):
+        # GH 52500
+        ser1 = Series([1, 2, 3], index=[10, 11, 23], name="a")
+        ser2 = Series([10, 20, 30], index=[11, 10, 23], name="a")
+        result = np.bitwise_xor(ser1, ser2)
+        expected = Series([21, 8, 29], index=[10, 11, 23], name="a")
+        tm.assert_series_equal(result, expected)
+
+        result = ser1 ^ ser2
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c88329a83b0ef5f67468b5b6db8e595ea55d2a4
--- /dev/null
+++ b/pandas/tests/series/test_missing.py
@@ -0,0 +1,88 @@
+from datetime import timedelta
+
+import numpy as np
+import pytest
+
+from pandas._libs import iNaT
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    Index,
+    NaT,
+    Series,
+    isna,
+)
+import pandas._testing as tm
+
+
+class TestSeriesMissingData:
+    def test_categorical_nan_handling(self):
+        # NaNs are represented as -1 in labels
+        s = Series(Categorical(["a", "b", np.nan, "a"]))
+        tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
+        tm.assert_numpy_array_equal(
+            s.values.codes, np.array([0, 1, -1, 0], dtype=np.int8)
+        )
+
+    def test_timedelta64_nan(self):
+        td = Series([timedelta(days=i) for i in range(10)])
+
+        # nan ops on timedeltas
+        td1 = td.copy()
+        td1[0] = np.nan
+        assert isna(td1[0])
+        assert td1[0]._value == iNaT
+        td1[0] = td[0]
+        assert not isna(td1[0])
+
+        # GH#16674 iNaT is treated as an integer when given by the user
+        with pytest.raises(TypeError, match="Invalid value"):
+            td1[1] = iNaT
+
+        td1[2] = NaT
+        assert isna(td1[2])
+        assert td1[2]._value == iNaT
+        td1[2] = td[2]
+        assert not isna(td1[2])
+
+        # boolean setting
+        # GH#2899 boolean setting
+        td3 = np.timedelta64(timedelta(days=3))
+        td7 = np.timedelta64(timedelta(days=7))
+        td[(td > td3) & (td < td7)] = np.nan
+        assert isna(td).sum() == 3
+
+    @pytest.mark.xfail(
+        reason="Chained inequality raises when trying to define 'selector'"
+    )
+    def test_logical_range_select(self, datetime_series):
+        # NumPy limitation =(
+        # https://github.com/pandas-dev/pandas/commit/9030dc021f07c76809848925cb34828f6c8484f3
+
+        selector = -0.5 <= datetime_series <= 0.5
+        expected = (datetime_series >= -0.5) & (datetime_series <= 0.5)
+        tm.assert_series_equal(selector, expected)
+
+    def test_valid(self, datetime_series):
+        ts = datetime_series.copy()
+        ts.index = ts.index._with_freq(None)
+        ts[::2] = np.nan
+
+        result = ts.dropna()
+        assert len(result) == ts.count()
+        tm.assert_series_equal(result, ts[1::2])
+        tm.assert_series_equal(result, ts[pd.notna(ts)])
+
+
+def test_hasnans_uncached_for_series():
+    # GH#19700
+    # set float64 dtype to avoid upcast when setting nan
+    idx = Index([0, 1], dtype="float64")
+    assert idx.hasnans is False
+    assert "hasnans" in idx._cache
+    ser = idx.to_series()
+    assert ser.hasnans is False
+    assert not hasattr(ser, "_cache")
+    ser.iloc[-1] = np.nan
+    assert ser.hasnans is True
diff --git a/pandas/tests/series/test_npfuncs.py b/pandas/tests/series/test_npfuncs.py
new file mode 100644
index 0000000000000000000000000000000000000000..f30c01b49639935a8ace4ed1d91baa84f899d3b6
--- /dev/null
+++ b/pandas/tests/series/test_npfuncs.py
@@ -0,0 +1,52 @@
+"""
+Tests for np.foo applied to Series, not necessarily ufuncs.
+"""
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import Series
+import pandas._testing as tm
+
+
+class TestPtp:
+    def test_ptp(self):
+        # GH#21614
+        N = 1000
+        arr = np.random.default_rng(2).standard_normal(N)
+        ser = Series(arr)
+        assert np.ptp(ser) == np.ptp(arr)
+
+
+def test_numpy_unique(datetime_series):
+    # it works!
+    np.unique(datetime_series)
+
+
+@pytest.mark.parametrize("index", [["a", "b", "c", "d", "e"], None])
+def test_numpy_argwhere(index):
+    # GH#35331
+
+    s = Series(range(5), index=index, dtype=np.int64)
+
+    result = np.argwhere(s > 2).astype(np.int64)
+    expected = np.array([[3], [4]], dtype=np.int64)
+
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_log_arrow_backed_missing_value(using_nan_is_na):
+    # GH#56285
+    ser = Series([1, 2, None], dtype="float64[pyarrow]")
+    if using_nan_is_na:
+        result = np.log(ser)
+        expected = np.log(Series([1, 2, None], dtype="float64[pyarrow]"))
+        tm.assert_series_equal(result, expected)
+    else:
+        # we get cast to object which raises
+        msg = "loop of ufunc does not support argument"
+        with pytest.raises(TypeError, match=msg):
+            np.log(ser)
diff --git a/pandas/tests/series/test_reductions.py b/pandas/tests/series/test_reductions.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0be93324dde56e6d393873d3c62fc3c750dee8e
--- /dev/null
+++ b/pandas/tests/series/test_reductions.py
@@ -0,0 +1,233 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Series
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("operation, expected", [("min", "a"), ("max", "b")])
+def test_reductions_series_strings(operation, expected):
+    # GH#31746
+    ser = Series(["a", "b"], dtype="string")
+    res_operation_serie = getattr(ser, operation)()
+    assert res_operation_serie == expected
+
+
+@pytest.mark.parametrize("as_period", [True, False])
+def test_mode_extension_dtype(as_period):
+    # GH#41927 preserve dt64tz dtype
+    ser = Series([pd.Timestamp(1979, 4, n) for n in range(1, 5)])
+
+    if as_period:
+        ser = ser.dt.to_period("D")
+    else:
+        ser = ser.dt.tz_localize("US/Central")
+
+    res = ser.mode()
+    assert res.dtype == ser.dtype
+    tm.assert_series_equal(res, ser)
+
+
+def test_mode_nullable_dtype(any_numeric_ea_dtype):
+    # GH#55340
+    ser = Series([1, 3, 2, pd.NA, 3, 2, pd.NA], dtype=any_numeric_ea_dtype)
+    result = ser.mode(dropna=False)
+    expected = Series([2, 3, pd.NA], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.mode(dropna=True)
+    expected = Series([2, 3], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+    ser[-1] = pd.NA
+
+    result = ser.mode(dropna=True)
+    expected = Series([2, 3], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.mode(dropna=False)
+    expected = Series([pd.NA], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_mode_nullable_dtype_edge_case(any_numeric_ea_dtype):
+    # GH##58926
+    ser = Series([1, 2, 3, 1], dtype=any_numeric_ea_dtype)
+    result = ser.mode(dropna=False)
+    expected = Series([1], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+    ser2 = Series([1, 1, 2, 3, pd.NA], dtype=any_numeric_ea_dtype)
+    result = ser2.mode(dropna=False)
+    expected = Series([1], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+    ser3 = Series([1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype)
+    result = ser3.mode(dropna=False)
+    expected = Series([pd.NA], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+    ser4 = Series([1, 1, pd.NA, pd.NA], dtype=any_numeric_ea_dtype)
+    result = ser4.mode(dropna=False)
+    expected = Series([1, pd.NA], dtype=any_numeric_ea_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_mode_infer_string():
+    # GH#56183
+    pytest.importorskip("pyarrow")
+    ser = Series(["a", "b"], dtype=object)
+    with pd.option_context("future.infer_string", True):
+        result = ser.mode()
+    expected = Series(["a", "b"], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_reductions_td64_with_nat():
+    # GH#8617
+    ser = Series([0, pd.NaT], dtype="m8[ns]")
+    exp = ser[0]
+    assert ser.median() == exp
+    assert ser.min() == exp
+    assert ser.max() == exp
+
+
+def test_td64_sum_empty(skipna):
+    # GH#37151
+    ser = Series([], dtype="timedelta64[ns]")
+
+    result = ser.sum(skipna=skipna)
+    assert isinstance(result, pd.Timedelta)
+    assert result == pd.Timedelta(0)
+
+
+def test_td64_summation_overflow():
+    # GH#9442
+    ser = Series(pd.date_range("20130101", periods=100000, freq="h", unit="ns"))
+    ser[0] += pd.Timedelta("1s 1ms")
+
+    # mean
+    result = (ser - ser.min()).mean()
+    expected = pd.Timedelta((pd.TimedeltaIndex(ser - ser.min()).asi8 / len(ser)).sum())
+
+    # the computation is converted to float so
+    # might be some loss of precision
+    assert np.allclose(result._value / 1000, expected._value / 1000)
+
+    # sum
+    msg = "overflow in timedelta operation"
+    with pytest.raises(ValueError, match=msg):
+        (ser - ser.min()).sum()
+
+    s1 = ser[0:10000]
+    with pytest.raises(ValueError, match=msg):
+        (s1 - s1.min()).sum()
+    s2 = ser[0:1000]
+    (s2 - s2.min()).sum()
+
+
+def test_prod_numpy16_bug():
+    ser = Series([1.0, 1.0, 1.0], index=range(3))
+    result = ser.prod()
+
+    assert not isinstance(result, Series)
+
+
+@pytest.mark.parametrize("func", [np.any, np.all])
+@pytest.mark.parametrize("kwargs", [{"keepdims": True}, {"out": object()}])
+def test_validate_any_all_out_keepdims_raises(kwargs, func):
+    ser = Series([1, 2])
+    param = next(iter(kwargs))
+    name = func.__name__
+
+    msg = (
+        f"the '{param}' parameter is not "
+        "supported in the pandas "
+        rf"implementation of {name}\(\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        func(ser, **kwargs)
+
+
+def test_validate_sum_initial():
+    ser = Series([1, 2])
+    msg = (
+        r"the 'initial' parameter is not "
+        r"supported in the pandas "
+        r"implementation of sum\(\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        np.sum(ser, initial=10)
+
+
+def test_validate_median_initial():
+    ser = Series([1, 2])
+    msg = (
+        r"the 'overwrite_input' parameter is not "
+        r"supported in the pandas "
+        r"implementation of median\(\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        # It seems like np.median doesn't dispatch, so we use the
+        # method instead of the ufunc.
+        ser.median(overwrite_input=True)
+
+
+def test_validate_stat_keepdims():
+    ser = Series([1, 2])
+    msg = (
+        r"the 'keepdims' parameter is not "
+        r"supported in the pandas "
+        r"implementation of sum\(\)"
+    )
+    with pytest.raises(ValueError, match=msg):
+        np.sum(ser, keepdims=True)
+
+
+def test_mean_with_convertible_string_raises():
+    # GH#44008
+    ser = Series(["1", "2"])
+    assert ser.sum() == "12"
+
+    msg = "Could not convert string '12' to numeric|does not support|Cannot perform"
+    with pytest.raises(TypeError, match=msg):
+        ser.mean()
+
+    df = ser.to_frame()
+    msg = r"Could not convert \['12'\] to numeric|does not support|Cannot perform"
+    with pytest.raises(TypeError, match=msg):
+        df.mean()
+
+
+def test_mean_dont_convert_j_to_complex():
+    # GH#36703
+    df = pd.DataFrame([{"db": "J", "numeric": 123}])
+    msg = r"Could not convert \['J'\] to numeric|does not support|Cannot perform"
+    with pytest.raises(TypeError, match=msg):
+        df.mean()
+
+    with pytest.raises(TypeError, match=msg):
+        df.agg("mean")
+
+    msg = "Could not convert string 'J' to numeric|does not support|Cannot perform"
+    with pytest.raises(TypeError, match=msg):
+        df["db"].mean()
+    msg = "Could not convert string 'J' to numeric|ufunc 'divide'|Cannot perform"
+    with pytest.raises(TypeError, match=msg):
+        np.mean(df["db"].astype("string").array)
+
+
+def test_median_with_convertible_string_raises():
+    # GH#34671 this _could_ return a string "2", but definitely not float 2.0
+    msg = r"Cannot convert \['1' '2' '3'\] to numeric|does not support|Cannot perform"
+    ser = Series(["1", "2", "3"])
+    with pytest.raises(TypeError, match=msg):
+        ser.median()
+
+    msg = (
+        r"Cannot convert \[\['1' '2' '3'\]\] to numeric|does not support|Cannot perform"
+    )
+    df = ser.to_frame()
+    with pytest.raises(TypeError, match=msg):
+        df.median()
diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py
new file mode 100644
index 0000000000000000000000000000000000000000..c2d5afcf884b12b3007905061b7c503359e71a5d
--- /dev/null
+++ b/pandas/tests/series/test_subclass.py
@@ -0,0 +1,82 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:Passing a BlockManager|Passing a SingleBlockManager:DeprecationWarning"
+)
+
+
+class TestSeriesSubclassing:
+    @pytest.mark.parametrize(
+        "idx_method, indexer, exp_data, exp_idx",
+        [
+            ["loc", ["a", "b"], [1, 2], "ab"],
+            ["iloc", [2, 3], [3, 4], "cd"],
+        ],
+    )
+    def test_indexing_sliced(self, idx_method, indexer, exp_data, exp_idx):
+        s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd"))
+        res = getattr(s, idx_method)[indexer]
+        exp = tm.SubclassedSeries(exp_data, index=list(exp_idx))
+        tm.assert_series_equal(res, exp)
+
+    def test_to_frame(self):
+        s = tm.SubclassedSeries([1, 2, 3, 4], index=list("abcd"), name="xxx")
+        res = s.to_frame()
+        exp = tm.SubclassedDataFrame({"xxx": [1, 2, 3, 4]}, index=list("abcd"))
+        tm.assert_frame_equal(res, exp)
+
+    def test_subclass_unstack(self):
+        # GH 15564
+        s = tm.SubclassedSeries([1, 2, 3, 4], index=[list("aabb"), list("xyxy")])
+
+        res = s.unstack()
+        exp = tm.SubclassedDataFrame({"x": [1, 3], "y": [2, 4]}, index=["a", "b"])
+
+        tm.assert_frame_equal(res, exp)
+
+    def test_subclass_empty_repr(self):
+        sub_series = tm.SubclassedSeries()
+        assert "SubclassedSeries" in repr(sub_series)
+
+    def test_asof(self):
+        N = 3
+        rng = pd.date_range("1/1/1990", periods=N, freq="53s")
+        s = tm.SubclassedSeries({"A": [np.nan, np.nan, np.nan]}, index=rng)
+
+        result = s.asof(rng[-2:])
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_explode(self):
+        s = tm.SubclassedSeries([[1, 2, 3], "foo", [], [3, 4]])
+        result = s.explode()
+        assert isinstance(result, tm.SubclassedSeries)
+
+    def test_equals(self):
+        # https://github.com/pandas-dev/pandas/pull/34402
+        # allow subclass in both directions
+        s1 = pd.Series([1, 2, 3])
+        s2 = tm.SubclassedSeries([1, 2, 3])
+        assert s1.equals(s2)
+        assert s2.equals(s1)
+
+
+class SubclassedSeries(pd.Series):
+    @property
+    def _constructor(self):
+        def _new(*args, **kwargs):
+            # some constructor logic that accesses the Series' name
+            if self.name == "test":
+                return pd.Series(*args, **kwargs)
+            return SubclassedSeries(*args, **kwargs)
+
+        return _new
+
+
+def test_constructor_from_dict():
+    # https://github.com/pandas-dev/pandas/issues/52445
+    result = SubclassedSeries({"a": 1, "b": 2, "c": 3})
+    assert isinstance(result, SubclassedSeries)
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eaf1632528d87be4d79b69631151842e9050cdc
--- /dev/null
+++ b/pandas/tests/series/test_ufunc.py
@@ -0,0 +1,475 @@
+from collections import deque
+import re
+import string
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.arrays import SparseArray
+
+
+@pytest.fixture(params=[np.add, np.logaddexp])
+def ufunc(request):
+    # dunder op
+    return request.param
+
+
+@pytest.fixture(
+    params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False],
+    ids=["sparse", "dense"],
+)
+def sparse(request):
+    return request.param
+
+
+@pytest.fixture
+def arrays_for_binary_ufunc():
+    """
+    A pair of random, length-100 integer-dtype arrays, that are mostly 0.
+    """
+    a1 = np.random.default_rng(2).integers(0, 10, 100, dtype="int64")
+    a2 = np.random.default_rng(2).integers(0, 10, 100, dtype="int64")
+    a1[::3] = 0
+    a2[::4] = 0
+    return a1, a2
+
+
+@pytest.mark.parametrize("ufunc", [np.positive, np.floor, np.exp])
+def test_unary_ufunc(ufunc, sparse):
+    # Test that ufunc(pd.Series) == pd.Series(ufunc)
+    arr = np.random.default_rng(2).integers(0, 10, 10, dtype="int64")
+    arr[::2] = 0
+    if sparse:
+        arr = SparseArray(arr, dtype=pd.SparseDtype("int64", 0))
+
+    index = list(string.ascii_letters[:10])
+    name = "name"
+    series = pd.Series(arr, index=index, name=name)
+
+    result = ufunc(series)
+    expected = pd.Series(ufunc(arr), index=index, name=name)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"])
+def test_binary_ufunc_with_array(flip, sparse, ufunc, arrays_for_binary_ufunc):
+    # Test that ufunc(pd.Series(a), array) == pd.Series(ufunc(a, b))
+    a1, a2 = arrays_for_binary_ufunc
+    if sparse:
+        a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0))
+        a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0))
+
+    name = "name"  # op(pd.Series, array) preserves the name.
+    series = pd.Series(a1, name=name)
+    other = a2
+
+    array_args = (a1, a2)
+    series_args = (series, other)  # ufunc(series, array)
+
+    if flip:
+        array_args = reversed(array_args)
+        series_args = reversed(series_args)  # ufunc(array, series)
+
+    expected = pd.Series(ufunc(*array_args), name=name)
+    result = ufunc(*series_args)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"])
+def test_binary_ufunc_with_index(flip, sparse, ufunc, arrays_for_binary_ufunc):
+    # Test that
+    #   * func(pd.Series(a), pd.Series(b)) == pd.Series(ufunc(a, b))
+    #   * ufunc(Index, pd.Series) dispatches to pd.Series (returns a pd.Series)
+    a1, a2 = arrays_for_binary_ufunc
+    if sparse:
+        a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0))
+        a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0))
+
+    name = "name"  # op(pd.Series, array) preserves the name.
+    series = pd.Series(a1, name=name)
+
+    other = pd.Index(a2, name=name).astype("int64")
+
+    array_args = (a1, a2)
+    series_args = (series, other)  # ufunc(series, array)
+
+    if flip:
+        array_args = reversed(array_args)
+        series_args = reversed(series_args)  # ufunc(array, series)
+
+    expected = pd.Series(ufunc(*array_args), name=name)
+    result = ufunc(*series_args)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("shuffle", [True, False], ids=["unaligned", "aligned"])
+@pytest.mark.parametrize("flip", [True, False], ids=["flipped", "straight"])
+def test_binary_ufunc_with_series(
+    flip, shuffle, sparse, ufunc, arrays_for_binary_ufunc
+):
+    # Test that
+    #   * func(pd.Series(a), pd.Series(b)) == pd.Series(ufunc(a, b))
+    #   with alignment between the indices
+    a1, a2 = arrays_for_binary_ufunc
+    if sparse:
+        a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0))
+        a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0))
+
+    name = "name"  # op(pd.Series, array) preserves the name.
+    series = pd.Series(a1, name=name)
+    other = pd.Series(a2, name=name)
+
+    idx = np.random.default_rng(2).permutation(len(a1))
+
+    if shuffle:
+        other = other.take(idx)
+        if flip:
+            index = other.align(series)[0].index
+        else:
+            index = series.align(other)[0].index
+    else:
+        index = series.index
+
+    array_args = (a1, a2)
+    series_args = (series, other)  # ufunc(series, array)
+
+    if flip:
+        array_args = tuple(reversed(array_args))
+        series_args = tuple(reversed(series_args))  # ufunc(array, series)
+
+    expected = pd.Series(ufunc(*array_args), index=index, name=name)
+    result = ufunc(*series_args)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("flip", [True, False])
+def test_binary_ufunc_scalar(ufunc, sparse, flip, arrays_for_binary_ufunc):
+    # Test that
+    #   * ufunc(pd.Series, scalar) == pd.Series(ufunc(array, scalar))
+    #   * ufunc(pd.Series, scalar) == ufunc(scalar, pd.Series)
+    arr, _ = arrays_for_binary_ufunc
+    if sparse:
+        arr = SparseArray(arr)
+    other = 2
+    series = pd.Series(arr, name="name")
+
+    series_args = (series, other)
+    array_args = (arr, other)
+
+    if flip:
+        series_args = tuple(reversed(series_args))
+        array_args = tuple(reversed(array_args))
+
+    expected = pd.Series(ufunc(*array_args), name="name")
+    result = ufunc(*series_args)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("ufunc", [np.divmod])  # TODO: np.modf, np.frexp
+@pytest.mark.parametrize("shuffle", [True, False])
+@pytest.mark.filterwarnings("ignore:divide by zero:RuntimeWarning")
+def test_multiple_output_binary_ufuncs(ufunc, sparse, shuffle, arrays_for_binary_ufunc):
+    # Test that
+    #  the same conditions from binary_ufunc_scalar apply to
+    #  ufuncs with multiple outputs.
+
+    a1, a2 = arrays_for_binary_ufunc
+    # work around https://github.com/pandas-dev/pandas/issues/26987
+    a1[a1 == 0] = 1
+    a2[a2 == 0] = 1
+
+    if sparse:
+        a1 = SparseArray(a1, dtype=pd.SparseDtype("int64", 0))
+        a2 = SparseArray(a2, dtype=pd.SparseDtype("int64", 0))
+
+    s1 = pd.Series(a1)
+    s2 = pd.Series(a2)
+
+    if shuffle:
+        # ensure we align before applying the ufunc
+        s2 = s2.sample(frac=1)
+
+    expected = ufunc(a1, a2)
+    assert isinstance(expected, tuple)
+
+    result = ufunc(s1, s2)
+    assert isinstance(result, tuple)
+    tm.assert_series_equal(result[0], pd.Series(expected[0]))
+    tm.assert_series_equal(result[1], pd.Series(expected[1]))
+
+
+def test_multiple_output_ufunc(sparse, arrays_for_binary_ufunc):
+    # Test that the same conditions from unary input apply to multi-output
+    # ufuncs
+    arr, _ = arrays_for_binary_ufunc
+
+    if sparse:
+        arr = SparseArray(arr)
+
+    series = pd.Series(arr, name="name")
+    result = np.modf(series)
+    expected = np.modf(arr)
+
+    assert isinstance(result, tuple)
+    assert isinstance(expected, tuple)
+
+    tm.assert_series_equal(result[0], pd.Series(expected[0], name="name"))
+    tm.assert_series_equal(result[1], pd.Series(expected[1], name="name"))
+
+
+def test_binary_ufunc_drops_series_name(ufunc, sparse, arrays_for_binary_ufunc):
+    # Drop the names when they differ.
+    a1, a2 = arrays_for_binary_ufunc
+    s1 = pd.Series(a1, name="a")
+    s2 = pd.Series(a2, name="b")
+
+    result = ufunc(s1, s2)
+    assert result.name is None
+
+
+def test_object_series_ok():
+    class Dummy:
+        def __init__(self, value) -> None:
+            self.value = value
+
+        def __add__(self, other):
+            return self.value + other.value
+
+    arr = np.array([Dummy(0), Dummy(1)])
+    ser = pd.Series(arr)
+    tm.assert_series_equal(np.add(ser, ser), pd.Series(np.add(ser, arr)))
+    tm.assert_series_equal(np.add(ser, Dummy(1)), pd.Series(np.add(ser, Dummy(1))))
+
+
+@pytest.fixture(
+    params=[
+        pd.array([1, 3, 2], dtype=np.int64),
+        pd.array([1, 3, 2], dtype="Int64"),
+        pd.array([1, 3, 2], dtype="Float32"),
+        pd.array([1, 10, 2], dtype="Sparse[int]"),
+        pd.to_datetime(["2000", "2010", "2001"]),
+        pd.to_datetime(["2000", "2010", "2001"]).tz_localize("CET"),
+        pd.to_datetime(["2000", "2010", "2001"]).to_period(freq="D"),
+        pd.to_timedelta(["1 Day", "3 Days", "2 Days"]),
+        pd.IntervalIndex([pd.Interval(0, 1), pd.Interval(2, 3), pd.Interval(1, 2)]),
+    ],
+    ids=lambda x: str(x.dtype),
+)
+def values_for_np_reduce(request):
+    # min/max tests assume that these are monotonic increasing
+    return request.param
+
+
+class TestNumpyReductions:
+    # TODO: cases with NAs, axis kwarg for DataFrame
+
+    def test_multiply(self, values_for_np_reduce, box_with_array, request):
+        box = box_with_array
+        values = values_for_np_reduce
+
+        with tm.assert_produces_warning(None):
+            obj = box(values)
+
+        if isinstance(values, pd.core.arrays.SparseArray):
+            mark = pytest.mark.xfail(reason="SparseArray has no 'prod'")
+            request.applymarker(mark)
+
+        if values.dtype.kind in "iuf":
+            result = np.multiply.reduce(obj)
+            if box is pd.DataFrame:
+                expected = obj.prod(numeric_only=False)
+                tm.assert_series_equal(result, expected)
+            elif box is pd.Index:
+                # Index has no 'prod'
+                expected = obj._values.prod()
+                assert result == expected
+            else:
+                expected = obj.prod()
+                assert result == expected
+        else:
+            msg = "|".join(
+                [
+                    "does not support operation",
+                    "unsupported operand type",
+                    "ufunc 'multiply' cannot use operands",
+                ]
+            )
+            with pytest.raises(TypeError, match=msg):
+                np.multiply.reduce(obj)
+
+    def test_add(self, values_for_np_reduce, box_with_array):
+        box = box_with_array
+        values = values_for_np_reduce
+
+        with tm.assert_produces_warning(None):
+            obj = box(values)
+
+        if values.dtype.kind in "miuf":
+            result = np.add.reduce(obj)
+            if box is pd.DataFrame:
+                expected = obj.sum(numeric_only=False)
+                tm.assert_series_equal(result, expected)
+            elif box is pd.Index:
+                # Index has no 'sum'
+                expected = obj._values.sum()
+                assert result == expected
+            else:
+                expected = obj.sum()
+                assert result == expected
+        else:
+            msg = "|".join(
+                [
+                    "does not support operation",
+                    "unsupported operand type",
+                    "ufunc 'add' cannot use operands",
+                ]
+            )
+            with pytest.raises(TypeError, match=msg):
+                np.add.reduce(obj)
+
+    def test_max(self, values_for_np_reduce, box_with_array, using_python_scalars):
+        box = box_with_array
+        values = values_for_np_reduce
+
+        same_type = True
+        if box is pd.Index and values.dtype.kind in "if":
+            # ATM Index casts to object, so we get python ints/floats
+            same_type = False
+
+        with tm.assert_produces_warning(None):
+            obj = box(values)
+
+        result = np.maximum.reduce(obj)
+        if box is pd.DataFrame:
+            # TODO: cases with axis kwarg
+            expected = obj.max(numeric_only=False)
+            tm.assert_series_equal(result, expected)
+        else:
+            expected = values[1]
+            if using_python_scalars and values.dtype.kind in "if":
+                expected = expected.item()
+            assert result == expected
+            if same_type:
+                # check we have e.g. Timestamp instead of dt64
+                assert type(result) == type(expected)
+
+    def test_min(self, values_for_np_reduce, box_with_array, using_python_scalars):
+        box = box_with_array
+        values = values_for_np_reduce
+
+        same_type = True
+        if box is pd.Index and values.dtype.kind in "if":
+            # ATM Index casts to object, so we get python ints/floats
+            same_type = False
+
+        with tm.assert_produces_warning(None):
+            obj = box(values)
+
+        result = np.minimum.reduce(obj)
+        if box is pd.DataFrame:
+            expected = obj.min(numeric_only=False)
+            tm.assert_series_equal(result, expected)
+        else:
+            expected = values[0]
+            if using_python_scalars and values.dtype.kind in ["i", "f"]:
+                expected = expected.item()
+            assert result == expected
+            if same_type:
+                # check we have e.g. Timestamp instead of dt64
+                assert type(result) == type(expected)
+
+
+@pytest.mark.parametrize("type_", [list, deque, tuple])
+def test_binary_ufunc_other_types(type_):
+    a = pd.Series([1, 2, 3], name="name")
+    b = type_([3, 4, 5])
+
+    result = np.add(a, b)
+    expected = pd.Series(np.add(a.to_numpy(), b), name="name")
+    tm.assert_series_equal(result, expected)
+
+
+def test_object_dtype_ok():
+    class Thing:
+        def __init__(self, value) -> None:
+            self.value = value
+
+        def __add__(self, other):
+            other = getattr(other, "value", other)
+            return type(self)(self.value + other)
+
+        def __eq__(self, other) -> bool:
+            return type(other) is Thing and self.value == other.value
+
+        def __repr__(self) -> str:
+            return f"Thing({self.value})"
+
+    s = pd.Series([Thing(1), Thing(2)])
+    result = np.add(s, Thing(1))
+    expected = pd.Series([Thing(2), Thing(3)])
+    tm.assert_series_equal(result, expected)
+
+
+def test_outer():
+    # https://github.com/pandas-dev/pandas/issues/27186
+    ser = pd.Series([1, 2, 3])
+    obj = np.array([1, 2, 3])
+
+    with pytest.raises(NotImplementedError, match="^$"):
+        np.subtract.outer(ser, obj)
+
+
+def test_np_matmul():
+    # GH26650
+    df1 = pd.DataFrame(data=[[-1, 1, 10]])
+    df2 = pd.DataFrame(data=[-1, 1, 10])
+    expected = pd.DataFrame(data=[102])
+
+    result = np.matmul(df1, df2)
+    tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize("box", [pd.Index, pd.Series])
+def test_np_matmul_1D(box, using_python_scalars):
+    result = np.matmul(box([1, 2]), box([2, 3]))
+    assert result == 8
+    if using_python_scalars:
+        assert type(result) == int, type(result)
+    else:
+        assert type(result) == np.int64, type(result)
+
+
+def test_array_ufuncs_for_many_arguments():
+    # GH39853
+    def add3(x, y, z):
+        return x + y + z
+
+    ufunc = np.frompyfunc(add3, 3, 1)
+    ser = pd.Series([1, 2])
+
+    result = ufunc(ser, ser, 1)
+    expected = pd.Series([3, 5], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    df = pd.DataFrame([[1, 2]])
+
+    msg = (
+        "Cannot apply ufunc <ufunc 'add3 (vectorized)'> "
+        "to mixed DataFrame and Series inputs."
+    )
+    with pytest.raises(NotImplementedError, match=re.escape(msg)):
+        ufunc(ser, ser, df)
+
+
+def test_np_trunc():
+    # This used to test np.fix, which is not a ufunc but is composed of
+    # several ufunc calls under the hood with `out` and `where` keywords. But numpy
+    # is deprecating that (or at least discussing deprecating) in favor of np.trunc,
+    # which _is_ a ufunc without the out keyword usage.
+    ser = pd.Series([-1.5, -0.5, 0.5, 1.5])
+    result = np.trunc(ser)
+    expected = pd.Series([-1.0, -0.0, 0.0, 1.0])
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_unary.py b/pandas/tests/series/test_unary.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f153788e413c5e9198dc35867bc628823555dbf
--- /dev/null
+++ b/pandas/tests/series/test_unary.py
@@ -0,0 +1,50 @@
+import pytest
+
+from pandas import Series
+import pandas._testing as tm
+
+
+class TestSeriesUnaryOps:
+    # __neg__, __pos__, __invert__
+
+    def test_neg(self):
+        ser = Series(range(5), dtype="float64", name="series")
+        tm.assert_series_equal(-ser, -1 * ser)
+
+    def test_invert(self):
+        ser = Series(range(5), dtype="float64", name="series")
+        tm.assert_series_equal(-(ser < 0), ~(ser < 0))
+
+    @pytest.mark.parametrize(
+        "source, neg_target, abs_target",
+        [
+            ([1, 2, 3], [-1, -2, -3], [1, 2, 3]),
+            ([1, 2, None], [-1, -2, None], [1, 2, None]),
+        ],
+    )
+    def test_all_numeric_unary_operators(
+        self, any_numeric_ea_dtype, source, neg_target, abs_target
+    ):
+        # GH38794
+        dtype = any_numeric_ea_dtype
+        ser = Series(source, dtype=dtype)
+        neg_result, pos_result, abs_result = -ser, +ser, abs(ser)
+        if dtype.startswith("U"):
+            neg_target = -Series(source, dtype=dtype)
+        else:
+            neg_target = Series(neg_target, dtype=dtype)
+
+        abs_target = Series(abs_target, dtype=dtype)
+
+        tm.assert_series_equal(neg_result, neg_target)
+        tm.assert_series_equal(pos_result, ser)
+        tm.assert_series_equal(abs_result, abs_target)
+
+    @pytest.mark.parametrize("op", ["__neg__", "__abs__"])
+    def test_unary_float_op_mask(self, float_ea_dtype, op):
+        dtype = float_ea_dtype
+        ser = Series([1.1, 2.2, 3.3], dtype=dtype)
+        result = getattr(ser, op)()
+        target = result.copy(deep=True)
+        ser[0] = None
+        tm.assert_series_equal(result, target)
diff --git a/pandas/tests/series/test_validate.py b/pandas/tests/series/test_validate.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c867f7582b7d3250bf5e009ffbf7545da404712
--- /dev/null
+++ b/pandas/tests/series/test_validate.py
@@ -0,0 +1,26 @@
+import pytest
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        "reset_index",
+        "_set_name",
+        "sort_values",
+        "sort_index",
+        "rename",
+        "dropna",
+        "drop_duplicates",
+    ],
+)
+@pytest.mark.parametrize("inplace", [1, "True", [1, 2, 3], 5.0])
+def test_validate_bool_args(string_series, func, inplace):
+    """Tests for error handling related to data types of method arguments."""
+    msg = 'For argument "inplace" expected type bool'
+    kwargs = {"inplace": inplace}
+
+    if func == "_set_name":
+        kwargs["name"] = "hello"
+
+    with pytest.raises(ValueError, match=msg):
+        getattr(string_series, func)(**kwargs)
diff --git a/pandas/tests/strings/__init__.py b/pandas/tests/strings/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c4bec6a2378932b9d49302a04b4f0f80a9e3e3b
--- /dev/null
+++ b/pandas/tests/strings/__init__.py
@@ -0,0 +1,23 @@
+import numpy as np
+
+import pandas as pd
+
+
+def is_object_or_nan_string_dtype(dtype):
+    """
+    Check if string-like dtype is following NaN semantics, i.e. is object
+    dtype or a NaN-variant of the StringDtype.
+    """
+    return (isinstance(dtype, np.dtype) and dtype == "object") or (
+        dtype.na_value is np.nan
+    )
+
+
+def _convert_na_value(ser, expected):
+    if ser.dtype != object:
+        if ser.dtype.na_value is np.nan:
+            expected = expected.fillna(np.nan)
+        else:
+            # GH#18463
+            expected = expected.fillna(pd.NA)
+    return expected
diff --git a/pandas/tests/strings/conftest.py b/pandas/tests/strings/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..d84d0db2c019df9206c33b0edf166f11df8eef57
--- /dev/null
+++ b/pandas/tests/strings/conftest.py
@@ -0,0 +1,132 @@
+import pytest
+
+from pandas import Series
+from pandas.core.strings.accessor import StringMethods
+
+_any_string_method = [
+    ("cat", (), {"sep": ","}),
+    ("cat", (Series(list("zyx")),), {"sep": ",", "join": "left"}),
+    ("center", (10,), {}),
+    ("contains", ("a",), {}),
+    ("count", ("a",), {}),
+    ("decode", ("UTF-8",), {}),
+    ("encode", ("UTF-8",), {}),
+    ("endswith", ("a",), {}),
+    ("endswith", ((),), {}),
+    ("endswith", (("a",),), {}),
+    ("endswith", (("a", "b"),), {}),
+    ("endswith", (("a", "MISSING"),), {}),
+    ("endswith", ("a",), {"na": True}),
+    ("endswith", ("a",), {"na": False}),
+    ("extract", ("([a-z]*)",), {"expand": False}),
+    ("extract", ("([a-z]*)",), {"expand": True}),
+    ("extractall", ("([a-z]*)",), {}),
+    ("find", ("a",), {}),
+    ("findall", ("a",), {}),
+    ("get", (0,), {}),
+    # because "index" (and "rindex") fail intentionally
+    # if the string is not found, search only for empty string
+    ("index", ("",), {}),
+    ("join", (",",), {}),
+    ("ljust", (10,), {}),
+    ("match", ("a",), {}),
+    ("fullmatch", ("a",), {}),
+    ("normalize", ("NFC",), {}),
+    ("pad", (10,), {}),
+    ("partition", (" ",), {"expand": False}),
+    ("partition", (" ",), {"expand": True}),
+    ("repeat", (3,), {}),
+    ("replace", ("a", "z"), {}),
+    ("rfind", ("a",), {}),
+    ("rindex", ("",), {}),
+    ("rjust", (10,), {}),
+    ("rpartition", (" ",), {"expand": False}),
+    ("rpartition", (" ",), {"expand": True}),
+    ("slice", (0, 1), {}),
+    ("slice_replace", (0, 1, "z"), {}),
+    ("split", (" ",), {"expand": False}),
+    ("split", (" ",), {"expand": True}),
+    ("startswith", ("a",), {}),
+    ("startswith", (("a",),), {}),
+    ("startswith", (("a", "b"),), {}),
+    ("startswith", (("a", "MISSING"),), {}),
+    ("startswith", ((),), {}),
+    ("startswith", ("a",), {"na": True}),
+    ("startswith", ("a",), {"na": False}),
+    ("removeprefix", ("a",), {}),
+    ("removesuffix", ("a",), {}),
+    # translating unicode points of "a" to "d"
+    ("translate", ({97: 100},), {}),
+    ("wrap", (2,), {}),
+    ("zfill", (10,), {}),
+    # methods without positional arguments: zip with empty tuple and empty dict
+    *zip(
+        [
+            "capitalize",
+            "cat",
+            "get_dummies",
+            "isalnum",
+            "isalpha",
+            "isascii",
+            "isdecimal",
+            "isdigit",
+            "islower",
+            "isnumeric",
+            "isspace",
+            "istitle",
+            "isupper",
+            "len",
+            "lower",
+            "lstrip",
+            "partition",
+            "rpartition",
+            "rsplit",
+            "rstrip",
+            "slice",
+            "slice_replace",
+            "split",
+            "strip",
+            "swapcase",
+            "title",
+            "upper",
+            "casefold",
+        ],
+        [()] * 100,
+        [{}] * 100,
+    ),
+]
+ids, _, _ = zip(*_any_string_method)  # use method name as fixture-id
+missing_methods = {f for f in dir(StringMethods) if not f.startswith("_")} - set(ids)
+
+# test that the above list captures all methods of StringMethods
+assert not missing_methods
+
+
+@pytest.fixture(params=_any_string_method, ids=ids)
+def any_string_method(request):
+    """
+    Fixture for all public methods of `StringMethods`
+
+    This fixture returns a tuple of the method name and sample arguments
+    necessary to call the method.
+
+    Returns
+    -------
+    method_name : str
+        The name of the method in `StringMethods`
+    args : tuple
+        Sample values for the positional arguments
+    kwargs : dict
+        Sample values for the keyword arguments
+
+    Examples
+    --------
+    >>> def test_something(any_string_method):
+    ...     s = Series(["a", "b", np.nan, "d"])
+    ...
+    ...     method_name, args, kwargs = any_string_method
+    ...     method = getattr(s.str, method_name)
+    ...     # will not raise
+    ...     method(*args, **kwargs)
+    """
+    return request.param
diff --git a/pandas/tests/strings/test_api.py b/pandas/tests/strings/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbb663cdca4ad6ceadce4f13176d306f5b388ebe
--- /dev/null
+++ b/pandas/tests/strings/test_api.py
@@ -0,0 +1,216 @@
+import weakref
+
+import numpy as np
+import pytest
+
+from pandas import (
+    CategoricalDtype,
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    _testing as tm,
+)
+from pandas.core.strings.accessor import StringMethods
+
+# subset of the full set from pandas/conftest.py
+_any_allowed_skipna_inferred_dtype = [
+    ("string", ["a", np.nan, "c"]),
+    ("bytes", [b"a", np.nan, b"c"]),
+    ("empty", [np.nan, np.nan, np.nan]),
+    ("empty", []),
+    ("mixed-integer", ["a", np.nan, 2]),
+]
+ids, _ = zip(
+    *_any_allowed_skipna_inferred_dtype, strict=True
+)  # use inferred type as id
+
+
+@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
+def any_allowed_skipna_inferred_dtype(request):
+    """
+    Fixture for all (inferred) dtypes allowed in StringMethods.__init__
+
+    The covered (inferred) types are:
+    * 'string'
+    * 'empty'
+    * 'bytes'
+    * 'mixed'
+    * 'mixed-integer'
+
+    Returns
+    -------
+    inferred_dtype : str
+        The string for the inferred dtype from _libs.lib.infer_dtype
+    values : np.ndarray
+        An array of object dtype that will be inferred to have
+        `inferred_dtype`
+
+    Examples
+    --------
+    >>> from pandas._libs import lib
+    >>>
+    >>> def test_something(any_allowed_skipna_inferred_dtype):
+    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
+    ...     # will pass
+    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
+    ...
+    ...     # constructor for .str-accessor will also pass
+    ...     Series(values).str
+    """
+    inferred_dtype, values = request.param
+    values = np.array(values, dtype=object)  # object dtype to avoid casting
+
+    # correctness of inference tested in tests/dtypes/test_inference.py
+    return inferred_dtype, values
+
+
+def test_api(any_string_dtype):
+    # GH 6106, GH 9322
+    assert Series.str is StringMethods
+    assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods)
+
+
+def test_no_circular_reference(any_string_dtype):
+    # GH 47667
+    ser = Series([""], dtype=any_string_dtype)
+    ref = weakref.ref(ser)
+    ser.str  # Used to cache and cause circular reference
+    del ser
+    assert ref() is None
+
+
+def test_api_mi_raises():
+    # GH 23679
+    mi = MultiIndex.from_arrays([["a", "b", "c"]])
+    msg = "Can only use .str accessor with Index, not MultiIndex"
+    with pytest.raises(AttributeError, match=msg):
+        mi.str
+    assert not hasattr(mi, "str")
+
+
+@pytest.mark.parametrize("dtype", [object, "category"])
+def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
+    # one instance of parametrized fixture
+    box = index_or_series
+    inferred_dtype, values = any_skipna_inferred_dtype
+
+    t = box(values, dtype=dtype)  # explicit dtype to avoid casting
+
+    types_passing_constructor = [
+        "string",
+        "unicode",
+        "empty",
+        "bytes",
+        "mixed",
+        "mixed-integer",
+    ]
+    if inferred_dtype in types_passing_constructor:
+        # GH 6106
+        assert isinstance(t.str, StringMethods)
+    else:
+        # GH 9184, GH 23011, GH 23163
+        msg = "Can only use .str accessor with string values.*"
+        with pytest.raises(AttributeError, match=msg):
+            t.str
+        assert not hasattr(t, "str")
+
+
+@pytest.mark.parametrize("dtype", [object, "category"])
+def test_api_per_method(
+    index_or_series,
+    dtype,
+    any_allowed_skipna_inferred_dtype,
+    any_string_method,
+    request,
+    using_infer_string,
+):
+    # this test does not check correctness of the different methods,
+    # just that the methods work on the specified (inferred) dtypes,
+    # and raise on all others
+    box = index_or_series
+
+    # one instance of each parametrized fixture
+    inferred_dtype, values = any_allowed_skipna_inferred_dtype
+    method_name, args, kwargs = any_string_method
+
+    reason = None
+    if box is Index and values.size == 0:
+        if method_name in ["partition", "rpartition"] and kwargs.get("expand", True):
+            raises = TypeError
+            reason = "Method cannot deal with empty Index"
+        elif method_name == "split" and kwargs.get("expand", None):
+            raises = TypeError
+            reason = "Split fails on empty Series when expand=True"
+        elif method_name == "get_dummies":
+            raises = ValueError
+            reason = "Need to fortify get_dummies corner cases"
+
+    elif (
+        box is Index
+        and inferred_dtype == "empty"
+        and dtype == object
+        and method_name == "get_dummies"
+    ):
+        raises = ValueError
+        reason = "Need to fortify get_dummies corner cases"
+
+    if reason is not None:
+        mark = pytest.mark.xfail(raises=raises, reason=reason)
+        request.applymarker(mark)
+
+    t = box(values, dtype=dtype)  # explicit dtype to avoid casting
+    method = getattr(t.str, method_name)
+
+    if using_infer_string and dtype == "category":
+        string_allowed = method_name not in ["decode"]
+    else:
+        string_allowed = True
+    bytes_allowed = method_name in ["decode", "get", "len", "slice"]
+    # as of v0.23.4, all methods except 'cat' are very lenient with the
+    # allowed data types, just returning NaN for entries that error.
+    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
+    # see discussion in GH 13877
+    mixed_allowed = method_name not in ["cat"]
+
+    allowed_types = (
+        ["empty"]
+        + ["string", "unicode"] * string_allowed
+        + ["bytes"] * bytes_allowed
+        + ["mixed", "mixed-integer"] * mixed_allowed
+    )
+
+    if inferred_dtype in allowed_types:
+        # xref GH 23555, GH 23556
+        method(*args, **kwargs)  # works!
+    else:
+        # GH 23011, GH 23163
+        msg = (
+            f"Cannot use .str.{method_name} with values of "
+            f"inferred dtype {inferred_dtype!r}."
+            "|a bytes-like object is required, not 'str'"
+        )
+        with pytest.raises(TypeError, match=msg):
+            method(*args, **kwargs)
+
+
+def test_api_for_categorical(any_string_method, any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/10661
+    s = Series(list("aabb"), dtype=any_string_dtype)
+    s = s + " " + s
+    c = s.astype("category")
+    c = c.astype(CategoricalDtype(c.dtype.categories.astype("object")))
+    assert isinstance(c.str, StringMethods)
+
+    method_name, args, kwargs = any_string_method
+
+    result = getattr(c.str, method_name)(*args, **kwargs)
+    expected = getattr(s.astype("object").str, method_name)(*args, **kwargs)
+
+    if isinstance(result, DataFrame):
+        tm.assert_frame_equal(result, expected)
+    elif isinstance(result, Series):
+        tm.assert_series_equal(result, expected)
+    else:
+        # str.cat(others=None) returns string, for example
+        assert result == expected
diff --git a/pandas/tests/strings/test_case_justify.py b/pandas/tests/strings/test_case_justify.py
new file mode 100644
index 0000000000000000000000000000000000000000..819556f961fa39fa2e93388fd12d37b0f9aefa4d
--- /dev/null
+++ b/pandas/tests/strings/test_case_justify.py
@@ -0,0 +1,423 @@
+from datetime import datetime
+import operator
+
+import numpy as np
+import pytest
+
+from pandas import (
+    Series,
+    _testing as tm,
+)
+
+
+def test_title(any_string_dtype):
+    s = Series(["FOO", "BAR", np.nan, "Blah", "blurg"], dtype=any_string_dtype)
+    result = s.str.title()
+    expected = Series(["Foo", "Bar", np.nan, "Blah", "Blurg"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_title_mixed_object():
+    s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0])
+    result = s.str.title()
+    expected = Series(
+        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_almost_equal(result, expected)
+
+
+def test_lower_upper(any_string_dtype):
+    s = Series(["om", np.nan, "nom", "nom"], dtype=any_string_dtype)
+
+    result = s.str.upper()
+    expected = Series(["OM", np.nan, "NOM", "NOM"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = result.str.lower()
+    tm.assert_series_equal(result, s)
+
+
+def test_lower_upper_mixed_object():
+    s = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
+
+    result = s.str.upper()
+    expected = Series(
+        ["A", np.nan, "B", np.nan, np.nan, "FOO", None, np.nan, np.nan], dtype=object
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.lower()
+    expected = Series(
+        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, expected",
+    [
+        (
+            ["FOO", "BAR", np.nan, "Blah", "blurg"],
+            ["Foo", "Bar", np.nan, "Blah", "Blurg"],
+        ),
+        (["a", "b", "c"], ["A", "B", "C"]),
+        (["a b", "a bc. de"], ["A b", "A bc. de"]),
+    ],
+)
+def test_capitalize(data, expected, any_string_dtype):
+    s = Series(data, dtype=any_string_dtype)
+    result = s.str.capitalize()
+    expected = Series(expected, dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_capitalize_mixed_object():
+    s = Series(["FOO", np.nan, "bar", True, datetime.today(), "blah", None, 1, 2.0])
+    result = s.str.capitalize()
+    expected = Series(
+        ["Foo", np.nan, "Bar", np.nan, np.nan, "Blah", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_swapcase(any_string_dtype):
+    s = Series(["FOO", "BAR", np.nan, "Blah", "blurg"], dtype=any_string_dtype)
+    result = s.str.swapcase()
+    expected = Series(["foo", "bar", np.nan, "bLAH", "BLURG"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_swapcase_mixed_object():
+    s = Series(["FOO", np.nan, "bar", True, datetime.today(), "Blah", None, 1, 2.0])
+    result = s.str.swapcase()
+    expected = Series(
+        ["foo", np.nan, "BAR", np.nan, np.nan, "bLAH", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_casefold():
+    # GH25405
+    expected = Series(["ss", np.nan, "case", "ssd"])
+    s = Series(["ß", np.nan, "case", "ßd"])
+    result = s.str.casefold()
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_casemethods(any_string_dtype):
+    values = ["aaa", "bbb", "CCC", "Dddd", "eEEE"]
+    s = Series(values, dtype=any_string_dtype)
+    assert s.str.lower().tolist() == [v.lower() for v in values]
+    assert s.str.upper().tolist() == [v.upper() for v in values]
+    assert s.str.title().tolist() == [v.title() for v in values]
+    assert s.str.capitalize().tolist() == [v.capitalize() for v in values]
+    assert s.str.swapcase().tolist() == [v.swapcase() for v in values]
+
+
+def test_pad(any_string_dtype):
+    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
+
+    result = s.str.pad(5, side="left")
+    expected = Series(
+        ["    a", "    b", np.nan, "    c", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.pad(5, side="right")
+    expected = Series(
+        ["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.pad(5, side="both")
+    expected = Series(
+        ["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_pad_mixed_object():
+    s = Series(["a", np.nan, "b", True, datetime.today(), "ee", None, 1, 2.0])
+
+    result = s.str.pad(5, side="left")
+    expected = Series(
+        ["    a", np.nan, "    b", np.nan, np.nan, "   ee", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.pad(5, side="right")
+    expected = Series(
+        ["a    ", np.nan, "b    ", np.nan, np.nan, "ee   ", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.pad(5, side="both")
+    expected = Series(
+        ["  a  ", np.nan, "  b  ", np.nan, np.nan, "  ee ", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_pad_fillchar(any_string_dtype):
+    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
+
+    result = s.str.pad(5, side="left", fillchar="X")
+    expected = Series(
+        ["XXXXa", "XXXXb", np.nan, "XXXXc", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.pad(5, side="right", fillchar="X")
+    expected = Series(
+        ["aXXXX", "bXXXX", np.nan, "cXXXX", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.pad(5, side="both", fillchar="X")
+    expected = Series(
+        ["XXaXX", "XXbXX", np.nan, "XXcXX", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_pad_fillchar_bad_arg_raises(any_string_dtype):
+    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
+
+    msg = "fillchar must be a character, not str"
+    with pytest.raises(TypeError, match=msg):
+        s.str.pad(5, fillchar="XY")
+
+    msg = "fillchar must be a character, not int"
+    with pytest.raises(TypeError, match=msg):
+        s.str.pad(5, fillchar=5)
+
+
+@pytest.mark.parametrize("method_name", ["center", "ljust", "rjust", "zfill", "pad"])
+def test_pad_width_bad_arg_raises(method_name, any_string_dtype):
+    # see gh-13598
+    s = Series(["1", "22", "a", "bb"], dtype=any_string_dtype)
+    op = operator.methodcaller(method_name, "f")
+
+    msg = "width must be of integer type, not str"
+    with pytest.raises(TypeError, match=msg):
+        op(s.str)
+
+
+def test_center_ljust_rjust(any_string_dtype):
+    s = Series(["a", "b", np.nan, "c", np.nan, "eeeeee"], dtype=any_string_dtype)
+
+    result = s.str.center(5)
+    expected = Series(
+        ["  a  ", "  b  ", np.nan, "  c  ", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.ljust(5)
+    expected = Series(
+        ["a    ", "b    ", np.nan, "c    ", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.rjust(5)
+    expected = Series(
+        ["    a", "    b", np.nan, "    c", np.nan, "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_center_ljust_rjust_mixed_object():
+    s = Series(["a", np.nan, "b", True, datetime.today(), "c", "eee", None, 1, 2.0])
+
+    result = s.str.center(5)
+    expected = Series(
+        [
+            "  a  ",
+            np.nan,
+            "  b  ",
+            np.nan,
+            np.nan,
+            "  c  ",
+            " eee ",
+            None,
+            np.nan,
+            np.nan,
+        ],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.ljust(5)
+    expected = Series(
+        [
+            "a    ",
+            np.nan,
+            "b    ",
+            np.nan,
+            np.nan,
+            "c    ",
+            "eee  ",
+            None,
+            np.nan,
+            np.nan,
+        ],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.rjust(5)
+    expected = Series(
+        [
+            "    a",
+            np.nan,
+            "    b",
+            np.nan,
+            np.nan,
+            "    c",
+            "  eee",
+            None,
+            np.nan,
+            np.nan,
+        ],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_center_ljust_rjust_fillchar(any_string_dtype):
+    # GH#54533, GH#54792
+    s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)
+
+    result = s.str.center(5, fillchar="X")
+    expected = Series(
+        ["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.center(5, "X") for v in np.array(s)], dtype=np.object_)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
+
+    result = s.str.ljust(5, fillchar="X")
+    expected = Series(
+        ["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.ljust(5, "X") for v in np.array(s)], dtype=np.object_)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
+
+    result = s.str.rjust(5, fillchar="X")
+    expected = Series(
+        ["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.rjust(5, "X") for v in np.array(s)], dtype=np.object_)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
+
+
+def test_center_ljust_rjust_fillchar_bad_arg_raises(any_string_dtype):
+    s = Series(["a", "bb", "cccc", "ddddd", "eeeeee"], dtype=any_string_dtype)
+
+    # If fillchar is not a character, normal str raises TypeError
+    # 'aaa'.ljust(5, 'XY')
+    # TypeError: must be char, not str
+    template = "fillchar must be a character, not {dtype}"
+
+    with pytest.raises(TypeError, match=template.format(dtype="str")):
+        s.str.center(5, fillchar="XY")
+
+    with pytest.raises(TypeError, match=template.format(dtype="str")):
+        s.str.ljust(5, fillchar="XY")
+
+    with pytest.raises(TypeError, match=template.format(dtype="str")):
+        s.str.rjust(5, fillchar="XY")
+
+    with pytest.raises(TypeError, match=template.format(dtype="int")):
+        s.str.center(5, fillchar=1)
+
+    with pytest.raises(TypeError, match=template.format(dtype="int")):
+        s.str.ljust(5, fillchar=1)
+
+    with pytest.raises(TypeError, match=template.format(dtype="int")):
+        s.str.rjust(5, fillchar=1)
+
+
+def test_zfill(any_string_dtype):
+    s = Series(["1", "22", "aaa", "333", "45678"], dtype=any_string_dtype)
+
+    result = s.str.zfill(5)
+    expected = Series(
+        ["00001", "00022", "00aaa", "00333", "45678"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.zfill(5) for v in np.array(s)], dtype=np.object_)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
+
+    result = s.str.zfill(3)
+    expected = Series(["001", "022", "aaa", "333", "45678"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.zfill(3) for v in np.array(s)], dtype=np.object_)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.object_), expected)
+
+    s = Series(["1", np.nan, "aaa", np.nan, "45678"], dtype=any_string_dtype)
+    result = s.str.zfill(5)
+    expected = Series(
+        ["00001", np.nan, "00aaa", np.nan, "45678"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_wrap(any_string_dtype):
+    # test values are: two words less than width, two words equal to width,
+    # two words greater than width, one word less than width, one word
+    # equal to width, one word greater than width, multiple tokens with
+    # trailing whitespace equal to width
+    s = Series(
+        [
+            "hello world",
+            "hello world!",
+            "hello world!!",
+            "abcdefabcde",
+            "abcdefabcdef",
+            "abcdefabcdefa",
+            "ab ab ab ab ",
+            "ab ab ab ab a",
+            "\t",
+        ],
+        dtype=any_string_dtype,
+    )
+
+    # expected values
+    expected = Series(
+        [
+            "hello world",
+            "hello world!",
+            "hello\nworld!!",
+            "abcdefabcde",
+            "abcdefabcdef",
+            "abcdefabcdef\na",
+            "ab ab ab ab",
+            "ab ab ab ab\na",
+            "",
+        ],
+        dtype=any_string_dtype,
+    )
+
+    result = s.str.wrap(12, break_long_words=True)
+    tm.assert_series_equal(result, expected)
+
+
+def test_wrap_unicode(any_string_dtype):
+    # test with pre and post whitespace (non-unicode), NaN, and non-ascii Unicode
+    s = Series(
+        ["  pre  ", np.nan, "\xac\u20ac\U00008000 abadcafe"], dtype=any_string_dtype
+    )
+    expected = Series(
+        ["  pre", np.nan, "\xac\u20ac\U00008000 ab\nadcafe"], dtype=any_string_dtype
+    )
+    result = s.str.wrap(6)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/strings/test_cat.py b/pandas/tests/strings/test_cat.py
new file mode 100644
index 0000000000000000000000000000000000000000..40883fd9c756f4cad31758495b22590cd8607f4c
--- /dev/null
+++ b/pandas/tests/strings/test_cat.py
@@ -0,0 +1,444 @@
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    _testing as tm,
+    concat,
+    option_context,
+)
+
+
+@pytest.fixture
+def index_or_series2(index_or_series):
+    return index_or_series
+
+
+@pytest.mark.parametrize("other", [None, Series, Index])
+def test_str_cat_name(index_or_series, other):
+    # GH 21053
+    box = index_or_series
+    values = ["a", "b"]
+    if other:
+        other = other(values)
+    else:
+        other = values
+    result = box(values, name="name").str.cat(other, sep=",")
+    assert result.name == "name"
+
+
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
+def test_str_cat(index_or_series, infer_string):
+    with option_context("future.infer_string", infer_string):
+        box = index_or_series
+        # test_cat above tests "str_cat" from ndarray;
+        # here testing "str.cat" from Series/Index to ndarray/list
+        s = box(["a", "a", "b", "b", "c", np.nan])
+
+        # single array
+        result = s.str.cat()
+        expected = "aabbc"
+        assert result == expected
+
+        result = s.str.cat(na_rep="-")
+        expected = "aabbc-"
+        assert result == expected
+
+        result = s.str.cat(sep="_", na_rep="NA")
+        expected = "a_a_b_b_c_NA"
+        assert result == expected
+
+        t = np.array(["a", np.nan, "b", "d", "foo", np.nan], dtype=object)
+        expected = box(["aa", "a-", "bb", "bd", "cfoo", "--"])
+
+        # Series/Index with array
+        result = s.str.cat(t, na_rep="-")
+        tm.assert_equal(result, expected)
+
+        # Series/Index with list
+        result = s.str.cat(list(t), na_rep="-")
+        tm.assert_equal(result, expected)
+
+        # errors for incorrect lengths
+        rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
+        z = Series(["1", "2", "3"])
+
+        with pytest.raises(ValueError, match=rgx):
+            s.str.cat(z.values)
+
+        with pytest.raises(ValueError, match=rgx):
+            s.str.cat(list(z))
+
+
+def test_str_cat_raises_intuitive_error(index_or_series):
+    # GH 11334
+    box = index_or_series
+    s = box(["a", "b", "c", "d"])
+    message = "Did you mean to supply a `sep` keyword?"
+    with pytest.raises(ValueError, match=message):
+        s.str.cat("|")
+    with pytest.raises(ValueError, match=message):
+        s.str.cat("    ")
+
+
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
+@pytest.mark.parametrize("sep", ["", None])
+@pytest.mark.parametrize("dtype_target", ["object", "category"])
+@pytest.mark.parametrize("dtype_caller", ["object", "category"])
+def test_str_cat_categorical(
+    index_or_series, dtype_caller, dtype_target, sep, infer_string
+):
+    box = index_or_series
+
+    with option_context("future.infer_string", infer_string):
+        s = Index(["a", "a", "b", "a"], dtype=dtype_caller)
+        s = s if box == Index else Series(s, index=s, dtype=s.dtype)
+        t = Index(["b", "a", "b", "c"], dtype=dtype_target)
+
+        expected = Index(
+            ["ab", "aa", "bb", "ac"], dtype=object if dtype_caller == "object" else None
+        )
+        expected = (
+            expected
+            if box == Index
+            else Series(
+                expected, index=Index(s, dtype=dtype_caller), dtype=expected.dtype
+            )
+        )
+
+        # Series/Index with unaligned Index -> t.values
+        result = s.str.cat(t.values, sep=sep)
+        tm.assert_equal(result, expected)
+
+        # Series/Index with Series having matching Index
+        t = Series(t.values, index=Index(s, dtype=dtype_caller))
+        result = s.str.cat(t, sep=sep)
+        tm.assert_equal(result, expected)
+
+        # Series/Index with Series.values
+        result = s.str.cat(t.values, sep=sep)
+        tm.assert_equal(result, expected)
+
+        # Series/Index with Series having different Index
+        t = Series(t.values, index=t.values)
+        expected = Index(
+            ["aa", "aa", "bb", "bb", "aa"],
+            dtype=object if dtype_caller == "object" else None,
+        )
+        dtype = object if dtype_caller == "object" else s.dtype.categories.dtype
+        expected = (
+            expected
+            if box == Index
+            else Series(
+                expected,
+                index=Index(expected.str[:1], dtype=dtype),
+                dtype=expected.dtype,
+            )
+        )
+
+        result = s.str.cat(t, sep=sep)
+        tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [[1, 2, 3], [0.1, 0.2, 0.3], [1, 2, "b"]],
+    ids=["integers", "floats", "mixed"],
+)
+# without dtype=object, np.array would cast [1, 2, 'b'] to ['1', '2', 'b']
+@pytest.mark.parametrize(
+    "box",
+    [Series, Index, list, lambda x: np.array(x, dtype=object)],
+    ids=["Series", "Index", "list", "np.array"],
+)
+def test_str_cat_wrong_dtype_raises(box, data):
+    # GH 22722
+    s = Series(["a", "b", "c"])
+    t = box(data)
+
+    msg = "Concatenation requires list-likes containing only strings.*"
+    with pytest.raises(TypeError, match=msg):
+        # need to use outer and na_rep, as otherwise Index would not raise
+        s.str.cat(t, join="outer", na_rep="-")
+
+
+def test_str_cat_mixed_inputs(index_or_series):
+    box = index_or_series
+    s = Index(["a", "b", "c", "d"])
+    s = s if box == Index else Series(s, index=s)
+
+    t = Series(["A", "B", "C", "D"], index=s.values)
+    d = concat([t, Series(s, index=s)], axis=1)
+
+    expected = Index(["aAa", "bBb", "cCc", "dDd"])
+    expected = expected if box == Index else Series(expected.values, index=s.values)
+
+    # Series/Index with DataFrame
+    result = s.str.cat(d)
+    tm.assert_equal(result, expected)
+
+    # Series/Index with two-dimensional ndarray
+    result = s.str.cat(d.values)
+    tm.assert_equal(result, expected)
+
+    # Series/Index with list of Series
+    result = s.str.cat([t, s])
+    tm.assert_equal(result, expected)
+
+    # Series/Index with mixed list of Series/array
+    result = s.str.cat([t, s.values])
+    tm.assert_equal(result, expected)
+
+    # Series/Index with list of Series; different indexes
+    t.index = ["b", "c", "d", "a"]
+    expected = box(["aDa", "bAb", "cBc", "dCd"])
+    expected = expected if box == Index else Series(expected.values, index=s.values)
+    result = s.str.cat([t, s])
+    tm.assert_equal(result, expected)
+
+    # Series/Index with mixed list; different index
+    result = s.str.cat([t, s.values])
+    tm.assert_equal(result, expected)
+
+    # Series/Index with DataFrame; different indexes
+    d.index = ["b", "c", "d", "a"]
+    expected = box(["aDd", "bAa", "cBb", "dCc"])
+    expected = expected if box == Index else Series(expected.values, index=s.values)
+    result = s.str.cat(d)
+    tm.assert_equal(result, expected)
+
+    # errors for incorrect lengths
+    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
+    z = Series(["1", "2", "3"])
+    e = concat([z, z], axis=1)
+
+    # two-dimensional ndarray
+    with pytest.raises(ValueError, match=rgx):
+        s.str.cat(e.values)
+
+    # list of list-likes
+    with pytest.raises(ValueError, match=rgx):
+        s.str.cat([z.values, s.values])
+
+    # mixed list of Series/list-like
+    with pytest.raises(ValueError, match=rgx):
+        s.str.cat([z.values, s])
+
+    # errors for incorrect arguments in list-like
+    rgx = "others must be Series, Index, DataFrame,.*"
+    # make sure None/NaN do not crash checks in _get_series_list
+    u = Series(["a", np.nan, "c", None])
+
+    # mix of string and Series
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat([u, "u"])
+
+    # DataFrame in list
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat([u, d])
+
+    # 2-dim ndarray in list
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat([u, d.values])
+
+    # nested lists
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat([u, [u, d]])
+
+    # forbidden input type: set
+    # GH 23009
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat(set(u))
+
+    # forbidden input type: set in list
+    # GH 23009
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat([u, set(u)])
+
+    # other forbidden input type, e.g. int
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat(1)
+
+    # nested list-likes
+    with pytest.raises(TypeError, match=rgx):
+        s.str.cat(iter([t.values, list(s)]))
+
+
+def test_str_cat_align_indexed(index_or_series, join_type):
+    # https://github.com/pandas-dev/pandas/issues/18657
+    box = index_or_series
+
+    s = Series(["a", "b", "c", "d"], index=["a", "b", "c", "d"])
+    t = Series(["D", "A", "E", "B"], index=["d", "a", "e", "b"])
+    sa, ta = s.align(t, join=join_type)
+    # result after manual alignment of inputs
+    expected = sa.str.cat(ta, na_rep="-")
+
+    if box == Index:
+        s = Index(s)
+        sa = Index(sa)
+        expected = Index(expected)
+
+    result = s.str.cat(t, join=join_type, na_rep="-")
+    tm.assert_equal(result, expected)
+
+
+def test_str_cat_align_mixed_inputs(join_type):
+    s = Series(["a", "b", "c", "d"])
+    t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
+    d = concat([t, t], axis=1)
+
+    expected_outer = Series(["aaa", "bbb", "c--", "ddd", "-ee"])
+    expected = expected_outer.loc[s.index.join(t.index, how=join_type)]
+
+    # list of Series
+    result = s.str.cat([t, t], join=join_type, na_rep="-")
+    tm.assert_series_equal(result, expected)
+
+    # DataFrame
+    result = s.str.cat(d, join=join_type, na_rep="-")
+    tm.assert_series_equal(result, expected)
+
+    # mixed list of indexed/unindexed
+    u = np.array(["A", "B", "C", "D"])
+    expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"])
+    # joint index of rhs [t, u]; u will be forced have index of s
+    rhs_idx = (
+        t.index.intersection(s.index)
+        if join_type == "inner"
+        else t.index.union(s.index)
+        if join_type == "outer"
+        else t.index.append(s.index.difference(t.index))
+    )
+
+    expected = expected_outer.loc[s.index.join(rhs_idx, how=join_type)]
+    result = s.str.cat([t, u], join=join_type, na_rep="-")
+    tm.assert_series_equal(result, expected)
+
+    with pytest.raises(TypeError, match="others must be Series,.*"):
+        # nested lists are forbidden
+        s.str.cat([t, list(u)], join=join_type)
+
+    # errors for incorrect lengths
+    rgx = r"If `others` contains arrays or lists \(or other list-likes.*"
+    z = Series(["1", "2", "3"]).values
+
+    # unindexed object of wrong length
+    with pytest.raises(ValueError, match=rgx):
+        s.str.cat(z, join=join_type)
+
+    # unindexed object of wrong length in list
+    with pytest.raises(ValueError, match=rgx):
+        s.str.cat([t, z], join=join_type)
+
+
+def test_str_cat_datetime_index_unsorted(join_type):
+    # https://github.com/pandas-dev/pandas/pull/62843
+    values = [datetime(2024, 1, 1), datetime(2024, 1, 2)]
+    s = Series(["a", "b"], index=[values[1], values[0]])
+    others = Series(["c", "d"], index=[values[0], values[1]])
+    result = s.str.cat(others, join=join_type)
+    if join_type in {"outer", "right"}:
+        expected = Series(["bc", "ad"], index=[values[0], values[1]])
+    else:
+        expected = Series(["ad", "bc"], index=[values[1], values[0]])
+    tm.assert_series_equal(result, expected)
+
+
+def test_str_cat_all_na(index_or_series, index_or_series2):
+    # GH 24044
+    box = index_or_series
+    other = index_or_series2
+
+    # check that all NaNs in caller / target work
+    s = Index(["a", "b", "c", "d"])
+    s = s if box == Index else Series(s, index=s)
+    t = other([np.nan] * 4, dtype=object)
+    # add index of s for alignment
+    t = t if other == Index else Series(t, index=s)
+
+    # all-NA target
+    if box == Series:
+        expected = Series([np.nan] * 4, index=s.index, dtype=s.dtype)
+    else:  # box == Index
+        # TODO: Strimg option, this should return string dtype
+        expected = Index([np.nan] * 4, dtype=object)
+    result = s.str.cat(t, join="left")
+    tm.assert_equal(result, expected)
+
+    # all-NA caller (only for Series)
+    if other == Series:
+        expected = Series([np.nan] * 4, dtype=object, index=t.index)
+        result = t.str.cat(s, join="left")
+        tm.assert_series_equal(result, expected)
+
+
+def test_str_cat_special_cases():
+    s = Series(["a", "b", "c", "d"])
+    t = Series(["d", "a", "e", "b"], index=[3, 0, 4, 1])
+
+    # iterator of elements with different types
+    expected = Series(["aaa", "bbb", "c-c", "ddd", "-e-"])
+    result = s.str.cat(iter([t, s.values]), join="outer", na_rep="-")
+    tm.assert_series_equal(result, expected)
+
+    # right-align with different indexes in others
+    expected = Series(["aa-", "d-d"], index=[0, 3])
+    result = s.str.cat([t.loc[[0]], t.loc[[3]]], join="right", na_rep="-")
+    tm.assert_series_equal(result, expected)
+
+
+def test_cat_on_filtered_index():
+    df = DataFrame(
+        index=MultiIndex.from_product(
+            [[2011, 2012], [1, 2, 3]], names=["year", "month"]
+        )
+    )
+
+    df = df.reset_index()
+    df = df[df.month > 1]
+
+    str_year = df.year.astype("str")
+    str_month = df.month.astype("str")
+    str_both = str_year.str.cat(str_month, sep=" ")
+
+    assert str_both.loc[1] == "2011 2"
+
+    str_multiple = str_year.str.cat([str_month, str_month], sep=" ")
+
+    assert str_multiple.loc[1] == "2011 2 2"
+
+
+@pytest.mark.parametrize("klass", [tuple, list, np.array, Series, Index])
+def test_cat_different_classes(klass):
+    # https://github.com/pandas-dev/pandas/issues/33425
+    s = Series(["a", "b", "c"])
+    result = s.str.cat(klass(["x", "y", "z"]))
+    expected = Series(["ax", "by", "cz"])
+    tm.assert_series_equal(result, expected)
+
+
+def test_cat_on_series_dot_str():
+    # GH 28277
+    ps = Series(["AbC", "de", "FGHI", "j", "kLLLm"])
+
+    message = re.escape(
+        "others must be Series, Index, DataFrame, np.ndarray "
+        "or list-like (either containing only strings or "
+        "containing only objects of type Series/Index/"
+        "np.ndarray[1-dim])"
+    )
+    with pytest.raises(TypeError, match=message):
+        ps.str.cat(others=ps.str)
diff --git a/pandas/tests/strings/test_extract.py b/pandas/tests/strings/test_extract.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a77ce618a88d8325d2bf47bbcc55083141c1b0e
--- /dev/null
+++ b/pandas/tests/strings/test_extract.py
@@ -0,0 +1,784 @@
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+from pandas.core.dtypes.dtypes import ArrowDtype
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    _testing as tm,
+)
+
+
+def test_extract_expand_kwarg_wrong_type_raises(any_string_dtype):
+    # TODO: should this raise TypeError
+    values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
+    with pytest.raises(ValueError, match="expand must be True or False"):
+        values.str.extract(".*(BAD[_]+).*(BAD)", expand=None)
+
+
+def test_extract_expand_kwarg(any_string_dtype):
+    s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
+    expected = DataFrame(["BAD__", np.nan, np.nan], dtype=any_string_dtype)
+
+    result = s.str.extract(".*(BAD[_]+).*")
+    tm.assert_frame_equal(result, expected)
+
+    result = s.str.extract(".*(BAD[_]+).*", expand=True)
+    tm.assert_frame_equal(result, expected)
+
+    expected = DataFrame(
+        [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_expand_False_mixed_object():
+    ser = Series(
+        ["aBAD_BAD", np.nan, "BAD_b_BAD", True, datetime.today(), "foo", None, 1, 2.0]
+    )
+
+    # two groups
+    result = ser.str.extract(".*(BAD[_]+).*(BAD)", expand=False)
+    er = [np.nan, np.nan]  # empty row
+    expected = DataFrame(
+        [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # single group
+    result = ser.str.extract(".*(BAD[_]+).*BAD", expand=False)
+    expected = Series(
+        ["BAD_", np.nan, "BAD_", np.nan, np.nan, np.nan, None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_extract_expand_index_raises():
+    # GH9980
+    # Index only works with one regex group since
+    # multi-group would expand to a frame
+    idx = Index(["A1", "A2", "A3", "A4", "B5"])
+    msg = "only one regex group is supported with Index"
+    with pytest.raises(ValueError, match=msg):
+        idx.str.extract("([AB])([123])", expand=False)
+
+
+def test_extract_expand_no_capture_groups_raises(index_or_series, any_string_dtype):
+    s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
+    msg = "pattern contains no capture groups"
+
+    # no groups
+    with pytest.raises(ValueError, match=msg):
+        s_or_idx.str.extract("[ABC][123]", expand=False)
+
+    # only non-capturing groups
+    with pytest.raises(ValueError, match=msg):
+        s_or_idx.str.extract("(?:[AB]).*", expand=False)
+
+
+def test_extract_expand_single_capture_group(index_or_series, any_string_dtype):
+    # single group renames series/index properly
+    s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
+    result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=False)
+
+    expected = index_or_series(["A", "A"], name="uno", dtype=any_string_dtype)
+    if index_or_series == Series:
+        tm.assert_series_equal(result, expected)
+    else:
+        tm.assert_index_equal(result, expected)
+
+
+def test_extract_expand_capture_groups(any_string_dtype):
+    s = Series(["A1", "B2", "C3"], dtype=any_string_dtype)
+    # one group, no matches
+    result = s.str.extract("(_)", expand=False)
+    expected = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # two groups, no matches
+    result = s.str.extract("(_)(_)", expand=False)
+    expected = DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one group, some matches
+    result = s.str.extract("([AB])[123]", expand=False)
+    expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # two groups, some matches
+    result = s.str.extract("([AB])([123])", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one named group
+    result = s.str.extract("(?P<letter>[AB])", expand=False)
+    expected = Series(["A", "B", np.nan], name="letter", dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # two named groups
+    result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]],
+        columns=["letter", "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # mix named and unnamed groups
+    result = s.str.extract("([AB])(?P<number>[123])", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]],
+        columns=[0, "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one normal group, one non-capturing group
+    result = s.str.extract("([AB])(?:[123])", expand=False)
+    expected = Series(["A", "B", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # two normal groups, one non-capturing group
+    s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
+    result = s.str.extract("([AB])([123])(?:[123])", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one optional group followed by one normal group
+    s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
+    result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, "3"]],
+        columns=["letter", "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one normal group followed by one optional group
+    s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
+    result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], ["C", np.nan]],
+        columns=["letter", "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_expand_capture_groups_index(index, any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/6348
+    # not passing index to the extractor
+    data = ["A1", "B2", "C"]
+
+    if len(index) == 0:
+        pytest.skip("Test requires len(index) > 0")
+    while len(index) < len(data):
+        index = index.repeat(2)
+
+    index = index[: len(data)]
+    ser = Series(data, index=index, dtype=any_string_dtype)
+
+    result = ser.str.extract(r"(\d)", expand=False)
+    expected = Series(["1", "2", np.nan], index=index, dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=False)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], ["C", np.nan]],
+        columns=["letter", "number"],
+        index=index,
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_single_series_name_is_preserved(any_string_dtype):
+    s = Series(["a3", "b3", "c2"], name="bob", dtype=any_string_dtype)
+    result = s.str.extract(r"(?P<sue>[a-z])", expand=False)
+    expected = Series(["a", "b", "c"], name="sue", dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_extract_expand_True(any_string_dtype):
+    # Contains tests like those in test_match and some others.
+    s = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
+
+    result = s.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
+    expected = DataFrame(
+        [["BAD__", "BAD"], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_expand_True_mixed_object():
+    er = [np.nan, np.nan]  # empty row
+    mixed = Series(
+        [
+            "aBAD_BAD",
+            np.nan,
+            "BAD_b_BAD",
+            True,
+            datetime.today(),
+            "foo",
+            None,
+            1,
+            2.0,
+        ]
+    )
+
+    result = mixed.str.extract(".*(BAD[_]+).*(BAD)", expand=True)
+    expected = DataFrame(
+        [["BAD_", "BAD"], er, ["BAD_", "BAD"], er, er, er, er, er, er], dtype=object
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_expand_True_single_capture_group_raises(
+    index_or_series, any_string_dtype
+):
+    # these should work for both Series and Index
+    # no groups
+    s_or_idx = index_or_series(["A1", "B2", "C3"], dtype=any_string_dtype)
+    msg = "pattern contains no capture groups"
+    with pytest.raises(ValueError, match=msg):
+        s_or_idx.str.extract("[ABC][123]", expand=True)
+
+    # only non-capturing groups
+    with pytest.raises(ValueError, match=msg):
+        s_or_idx.str.extract("(?:[AB]).*", expand=True)
+
+
+def test_extract_expand_True_single_capture_group(index_or_series, any_string_dtype):
+    # single group renames series/index properly
+    s_or_idx = index_or_series(["A1", "A2"], dtype=any_string_dtype)
+    result = s_or_idx.str.extract(r"(?P<uno>A)\d", expand=True)
+    expected = DataFrame({"uno": ["A", "A"]}, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("name", [None, "series_name"])
+def test_extract_series(name, any_string_dtype):
+    # extract should give the same result whether or not the series has a name.
+    s = Series(["A1", "B2", "C3"], name=name, dtype=any_string_dtype)
+
+    # one group, no matches
+    result = s.str.extract("(_)", expand=True)
+    expected = DataFrame([np.nan, np.nan, np.nan], dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+    # two groups, no matches
+    result = s.str.extract("(_)(_)", expand=True)
+    expected = DataFrame(
+        [[np.nan, np.nan], [np.nan, np.nan], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one group, some matches
+    result = s.str.extract("([AB])[123]", expand=True)
+    expected = DataFrame(["A", "B", np.nan], dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+    # two groups, some matches
+    result = s.str.extract("([AB])([123])", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one named group
+    result = s.str.extract("(?P<letter>[AB])", expand=True)
+    expected = DataFrame({"letter": ["A", "B", np.nan]}, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+    # two named groups
+    result = s.str.extract("(?P<letter>[AB])(?P<number>[123])", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]],
+        columns=["letter", "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # mix named and unnamed groups
+    result = s.str.extract("([AB])(?P<number>[123])", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]],
+        columns=[0, "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one normal group, one non-capturing group
+    result = s.str.extract("([AB])(?:[123])", expand=True)
+    expected = DataFrame(["A", "B", np.nan], dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_optional_groups(any_string_dtype):
+    # two normal groups, one non-capturing group
+    s = Series(["A11", "B22", "C33"], dtype=any_string_dtype)
+    result = s.str.extract("([AB])([123])(?:[123])", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one optional group followed by one normal group
+    s = Series(["A1", "B2", "3"], dtype=any_string_dtype)
+    result = s.str.extract("(?P<letter>[AB])?(?P<number>[123])", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], [np.nan, "3"]],
+        columns=["letter", "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one normal group followed by one optional group
+    s = Series(["A1", "B2", "C"], dtype=any_string_dtype)
+    result = s.str.extract("(?P<letter>[ABC])(?P<number>[123])?", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], ["C", np.nan]],
+        columns=["letter", "number"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_dataframe_capture_groups_index(index, any_string_dtype):
+    # GH6348
+    # not passing index to the extractor
+
+    data = ["A1", "B2", "C"]
+
+    if len(index) < len(data):
+        pytest.skip(f"Index needs more than {len(data)} values")
+
+    index = index[: len(data)]
+    s = Series(data, index=index, dtype=any_string_dtype)
+
+    result = s.str.extract(r"(\d)", expand=True)
+    expected = DataFrame(["1", "2", np.nan], index=index, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+    result = s.str.extract(r"(?P<letter>\D)(?P<number>\d)?", expand=True)
+    expected = DataFrame(
+        [["A", "1"], ["B", "2"], ["C", np.nan]],
+        columns=["letter", "number"],
+        index=index,
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_single_group_returns_frame(any_string_dtype):
+    # GH11386 extract should always return DataFrame, even when
+    # there is only one group. Prior to v0.18.0, extract returned
+    # Series when there was only one group in the regex.
+    s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
+    result = s.str.extract(r"(?P<letter>[a-z])", expand=True)
+    expected = DataFrame({"letter": ["a", "b", "c"]}, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extractall(any_string_dtype):
+    data = [
+        "dave@google.com",
+        "tdhock5@gmail.com",
+        "maudelaperriere@gmail.com",
+        "rob@gmail.com some text steve@gmail.com",
+        "a@b.com some text c@d.com and e@f.com",
+        np.nan,
+        "",
+    ]
+    expected_tuples = [
+        ("dave", "google", "com"),
+        ("tdhock5", "gmail", "com"),
+        ("maudelaperriere", "gmail", "com"),
+        ("rob", "gmail", "com"),
+        ("steve", "gmail", "com"),
+        ("a", "b", "com"),
+        ("c", "d", "com"),
+        ("e", "f", "com"),
+    ]
+    pat = r"""
+    (?P<user>[a-z0-9]+)
+    @
+    (?P<domain>[a-z]+)
+    \.
+    (?P<tld>[a-z]{2,4})
+    """
+    expected_columns = ["user", "domain", "tld"]
+    s = Series(data, dtype=any_string_dtype)
+    # extractall should return a DataFrame with one row for each match, indexed by the
+    # subject from which the match came.
+    expected_index = MultiIndex.from_tuples(
+        [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (4, 1), (4, 2)],
+        names=(None, "match"),
+    )
+    expected = DataFrame(
+        expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
+    )
+    result = s.str.extractall(pat, flags=re.VERBOSE)
+    tm.assert_frame_equal(result, expected)
+
+    # The index of the input Series should be used to construct the index of the output
+    # DataFrame:
+    mi = MultiIndex.from_tuples(
+        [
+            ("single", "Dave"),
+            ("single", "Toby"),
+            ("single", "Maude"),
+            ("multiple", "robAndSteve"),
+            ("multiple", "abcdef"),
+            ("none", "missing"),
+            ("none", "empty"),
+        ]
+    )
+    s = Series(data, index=mi, dtype=any_string_dtype)
+    expected_index = MultiIndex.from_tuples(
+        [
+            ("single", "Dave", 0),
+            ("single", "Toby", 0),
+            ("single", "Maude", 0),
+            ("multiple", "robAndSteve", 0),
+            ("multiple", "robAndSteve", 1),
+            ("multiple", "abcdef", 0),
+            ("multiple", "abcdef", 1),
+            ("multiple", "abcdef", 2),
+        ],
+        names=(None, None, "match"),
+    )
+    expected = DataFrame(
+        expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
+    )
+    result = s.str.extractall(pat, flags=re.VERBOSE)
+    tm.assert_frame_equal(result, expected)
+
+    # MultiIndexed subject with names.
+    s = Series(data, index=mi, dtype=any_string_dtype)
+    s.index.names = ("matches", "description")
+    expected_index.names = ("matches", "description", "match")
+    expected = DataFrame(
+        expected_tuples, expected_index, expected_columns, dtype=any_string_dtype
+    )
+    result = s.str.extractall(pat, flags=re.VERBOSE)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat,expected_names",
+    [
+        # optional groups.
+        ("(?P<letter>[AB])?(?P<number>[123])", ["letter", "number"]),
+        # only one of two groups has a name.
+        ("([AB])?(?P<number>[123])", [0, "number"]),
+    ],
+)
+def test_extractall_column_names(pat, expected_names, any_string_dtype):
+    s = Series(["", "A1", "32"], dtype=any_string_dtype)
+
+    result = s.str.extractall(pat)
+    expected = DataFrame(
+        [("A", "1"), (np.nan, "3"), (np.nan, "2")],
+        index=MultiIndex.from_tuples([(1, 0), (2, 0), (2, 1)], names=(None, "match")),
+        columns=expected_names,
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extractall_single_group(any_string_dtype):
+    s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
+    expected_index = MultiIndex.from_tuples(
+        [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
+    )
+
+    # extractall(one named group) returns DataFrame with one named column.
+    result = s.str.extractall(r"(?P<letter>[a-z])")
+    expected = DataFrame(
+        {"letter": ["a", "b", "d", "c"]}, index=expected_index, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # extractall(one un-named group) returns DataFrame with one un-named column.
+    result = s.str.extractall(r"([a-z])")
+    expected = DataFrame(
+        ["a", "b", "d", "c"], index=expected_index, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extractall_single_group_with_quantifier(any_string_dtype):
+    # GH#13382
+    # extractall(one un-named group with quantifier) returns DataFrame with one un-named
+    # column.
+    s = Series(["ab3", "abc3", "d4cd2"], name="series_name", dtype=any_string_dtype)
+    result = s.str.extractall(r"([a-z]+)")
+    expected = DataFrame(
+        ["ab", "abc", "d", "cd"],
+        index=MultiIndex.from_tuples(
+            [(0, 0), (1, 0), (2, 0), (2, 1)], names=(None, "match")
+        ),
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, names",
+    [
+        ([], (None,)),
+        ([], ("i1",)),
+        ([], (None, "i2")),
+        ([], ("i1", "i2")),
+        (["a3", "b3", "d4c2"], (None,)),
+        (["a3", "b3", "d4c2"], ("i1", "i2")),
+        (["a3", "b3", "d4c2"], (None, "i2")),
+    ],
+)
+def test_extractall_no_matches(data, names, any_string_dtype):
+    # GH19075 extractall with no matches should return a valid MultiIndex
+    n = len(data)
+    if len(names) == 1:
+        index = Index(range(n), name=names[0])
+    else:
+        tuples = (tuple([i] * (n - 1)) for i in range(n))
+        index = MultiIndex.from_tuples(tuples, names=names)
+    s = Series(data, name="series_name", index=index, dtype=any_string_dtype)
+    expected_index = MultiIndex.from_tuples([], names=((*names, "match")))
+
+    # one un-named group.
+    result = s.str.extractall("(z)")
+    expected = DataFrame(columns=range(1), index=expected_index, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected, check_column_type=True)
+
+    # two un-named groups.
+    result = s.str.extractall("(z)(z)")
+    expected = DataFrame(columns=range(2), index=expected_index, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, expected, check_column_type=True)
+
+    # one named group.
+    result = s.str.extractall("(?P<first>z)")
+    expected = DataFrame(
+        columns=["first"], index=expected_index, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # two named groups.
+    result = s.str.extractall("(?P<first>z)(?P<second>z)")
+    expected = DataFrame(
+        columns=["first", "second"], index=expected_index, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # one named, one un-named.
+    result = s.str.extractall("(z)(?P<second>z)")
+    expected = DataFrame(
+        columns=[0, "second"], index=expected_index, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extractall_stringindex(any_string_dtype):
+    s = Series(["a1a2", "b1", "c1"], name="xxx", dtype=any_string_dtype)
+    result = s.str.extractall(r"[ab](?P<digit>\d)")
+    expected = DataFrame(
+        {"digit": ["1", "2", "1"]},
+        index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)], names=[None, "match"]),
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+    # index should return the same result as the default index without name thus
+    # index.name doesn't affect to the result
+    if any_string_dtype == "object":
+        for idx in [
+            Index(["a1a2", "b1", "c1"], dtype=object),
+            Index(["a1a2", "b1", "c1"], name="xxx", dtype=object),
+        ]:
+            result = idx.str.extractall(r"[ab](?P<digit>\d)")
+            tm.assert_frame_equal(result, expected)
+
+    s = Series(
+        ["a1a2", "b1", "c1"],
+        name="s_name",
+        index=Index(["XX", "yy", "zz"], name="idx_name"),
+        dtype=any_string_dtype,
+    )
+    result = s.str.extractall(r"[ab](?P<digit>\d)")
+    expected = DataFrame(
+        {"digit": ["1", "2", "1"]},
+        index=MultiIndex.from_tuples(
+            [("XX", 0), ("XX", 1), ("yy", 0)], names=["idx_name", "match"]
+        ),
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extractall_no_capture_groups_raises(any_string_dtype):
+    # Does not make sense to use extractall with a regex that has no capture groups.
+    # (it returns DataFrame with one column for each capture group)
+    s = Series(["a3", "b3", "d4c2"], name="series_name", dtype=any_string_dtype)
+    with pytest.raises(ValueError, match="no capture groups"):
+        s.str.extractall(r"[a-z]")
+
+
+def test_extract_index_one_two_groups():
+    s = Series(["a3", "b3", "d4c2"], index=["A3", "B3", "D4"], name="series_name")
+    r = s.index.str.extract(r"([A-Z])", expand=True)
+    e = DataFrame(["A", "B", "D"])
+    tm.assert_frame_equal(r, e)
+
+    # Prior to v0.18.0, index.str.extract(regex with one group)
+    # returned Index. With more than one group, extract raised an
+    # error (GH9980). Now extract always returns DataFrame.
+    r = s.index.str.extract(r"(?P<letter>[A-Z])(?P<digit>[0-9])", expand=True)
+    e_list = [("A", "3"), ("B", "3"), ("D", "4")]
+    e = DataFrame(e_list, columns=["letter", "digit"])
+    tm.assert_frame_equal(r, e)
+
+
+def test_extractall_same_as_extract(any_string_dtype):
+    s = Series(["a3", "b3", "c2"], name="series_name", dtype=any_string_dtype)
+
+    pattern_two_noname = r"([a-z])([0-9])"
+    extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
+    has_multi_index = s.str.extractall(pattern_two_noname)
+    no_multi_index = has_multi_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_two_noname, no_multi_index)
+
+    pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
+    extract_two_named = s.str.extract(pattern_two_named, expand=True)
+    has_multi_index = s.str.extractall(pattern_two_named)
+    no_multi_index = has_multi_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_two_named, no_multi_index)
+
+    pattern_one_named = r"(?P<group_name>[a-z])"
+    extract_one_named = s.str.extract(pattern_one_named, expand=True)
+    has_multi_index = s.str.extractall(pattern_one_named)
+    no_multi_index = has_multi_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_one_named, no_multi_index)
+
+    pattern_one_noname = r"([a-z])"
+    extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
+    has_multi_index = s.str.extractall(pattern_one_noname)
+    no_multi_index = has_multi_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_one_noname, no_multi_index)
+
+
+def test_extractall_same_as_extract_subject_index(any_string_dtype):
+    # same as above tests, but s has a MultiIndex.
+    mi = MultiIndex.from_tuples(
+        [("A", "first"), ("B", "second"), ("C", "third")],
+        names=("capital", "ordinal"),
+    )
+    s = Series(["a3", "b3", "c2"], index=mi, name="series_name", dtype=any_string_dtype)
+
+    pattern_two_noname = r"([a-z])([0-9])"
+    extract_two_noname = s.str.extract(pattern_two_noname, expand=True)
+    has_match_index = s.str.extractall(pattern_two_noname)
+    no_match_index = has_match_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_two_noname, no_match_index)
+
+    pattern_two_named = r"(?P<letter>[a-z])(?P<digit>[0-9])"
+    extract_two_named = s.str.extract(pattern_two_named, expand=True)
+    has_match_index = s.str.extractall(pattern_two_named)
+    no_match_index = has_match_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_two_named, no_match_index)
+
+    pattern_one_named = r"(?P<group_name>[a-z])"
+    extract_one_named = s.str.extract(pattern_one_named, expand=True)
+    has_match_index = s.str.extractall(pattern_one_named)
+    no_match_index = has_match_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_one_named, no_match_index)
+
+    pattern_one_noname = r"([a-z])"
+    extract_one_noname = s.str.extract(pattern_one_noname, expand=True)
+    has_match_index = s.str.extractall(pattern_one_noname)
+    no_match_index = has_match_index.xs(0, level="match")
+    tm.assert_frame_equal(extract_one_noname, no_match_index)
+
+
+def test_extractall_preserves_dtype():
+    # Ensure that when extractall is called on a series with specific dtypes set, that
+    # the dtype is preserved in the resulting DataFrame's column.
+    pa = pytest.importorskip("pyarrow")
+
+    result = Series(["abc", "ab"], dtype=ArrowDtype(pa.string())).str.extractall("(ab)")
+    assert result.dtypes[0] == "string[pyarrow]"
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"(a(?=b))", [None, "a", None, None]),
+        (r"((?<=a)b)", [None, "b", None, None]),
+        (r"(a(?!b))", ["a", None, "a", None]),
+        (r"((?<!b)a)", ["a", "a", None, None]),
+        ("(ab)", [None, "ab", None, None]),
+    ],
+)
+def test_extract_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.extract(pat, expand=False)
+    if any_string_dtype == "object":
+        # object input will preserve None but any result with no matches gets NaN
+        expected_data = [np.nan if e is None else e for e in expected_data]
+    expected = Series([*expected_data, None], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"(a(?=b))", {(1, 0): "a"}),
+        (r"((?<=a)b)", {(1, 0): "b"}),
+        (r"(a(?!b))", {(0, 0): "a", (0, 1): "a", (2, 0): "a"}),
+        (r"((?<!b)a)", {(0, 0): "a", (0, 1): "a", (1, 0): "a"}),
+        ("(ab)", {(1, 0): "ab"}),
+    ],
+)
+def test_extractall_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.extractall(pat)
+    expected = Series(expected_data, dtype=any_string_dtype).to_frame()
+    expected.index.names = [None, "match"]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_extract_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    ser = Series(["aa", "abc", "bb\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.extract("([ab]+)$")
+    expected = Series(["aa", np.nan, "bb"], dtype=any_string_dtype).to_frame()
+    tm.assert_frame_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.extract(r"([ab]+)\Z")
+    expected = Series(["aa", np.nan, np.nan], dtype=any_string_dtype).to_frame()
+    tm.assert_frame_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"aa\Z", "abc", "bb\\Z\n"], dtype=any_string_dtype)
+    result = ser.str.extract(r"([ab]+)\\Z")
+    expected = Series(["aa", np.nan, "bb"], dtype=any_string_dtype).to_frame()
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
new file mode 100644
index 0000000000000000000000000000000000000000..5709391c5e992e473c0213a5a70907d75a91f2e2
--- /dev/null
+++ b/pandas/tests/strings/test_find_replace.py
@@ -0,0 +1,1682 @@
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    Series,
+    StringDtype,
+    _testing as tm,
+)
+from pandas.tests.strings import (
+    _convert_na_value,
+    is_object_or_nan_string_dtype,
+)
+
+# --------------------------------------------------------------------------------------
+# str.contains
+# --------------------------------------------------------------------------------------
+
+
+def test_contains(any_string_dtype):
+    values = np.array(
+        ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_
+    )
+    values = Series(values, dtype=any_string_dtype)
+    pat = "mmm[_]+"
+
+    result = values.str.contains(pat)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([False, False, True, True, False], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series(
+            np.array([False, np.nan, True, True, False], dtype=np.object_),
+            dtype=expected_dtype,
+        )
+
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.contains(pat, regex=False)
+    if any_string_dtype == "str":
+        expected = Series([False, False, False, False, True], dtype=bool)
+    else:
+        expected = Series(
+            np.array([False, np.nan, False, False, True], dtype=np.object_),
+            dtype=expected_dtype,
+        )
+    tm.assert_series_equal(result, expected)
+
+    values = Series(
+        np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object),
+        dtype=any_string_dtype,
+    )
+    result = values.str.contains(pat)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # case insensitive using regex
+    values = Series(
+        np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object),
+        dtype=any_string_dtype,
+    )
+
+    result = values.str.contains("FOO|mmm", case=False)
+    expected = Series(np.array([True, False, True, True]), dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # case insensitive without regex
+    result = values.str.contains("foo", regex=False, case=False)
+    expected = Series(np.array([True, False, True, False]), dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # unicode
+    values = Series(
+        np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_),
+        dtype=any_string_dtype,
+    )
+    pat = "mmm[_]+"
+
+    result = values.str.contains(pat)
+    if any_string_dtype == "str":
+        expected = Series([False, False, True, True], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series(
+            np.array([False, np.nan, True, True], dtype=np.object_),
+            dtype=expected_dtype,
+        )
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.contains(pat, na=False)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    values = Series(
+        np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_),
+        dtype=any_string_dtype,
+    )
+    result = values.str.contains(pat)
+    expected = Series(np.array([False, False, True, True]), dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_contains_object_mixed():
+    mixed = Series(
+        np.array(
+            ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+            dtype=object,
+        )
+    )
+    result = mixed.str.contains("o")
+    expected = Series(
+        np.array(
+            [False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan],
+            dtype=np.object_,
+        )
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_contains_na_kwarg_for_object_category():
+    # gh 22158
+
+    # na for category
+    values = Series(["a", "b", "c", "a", np.nan], dtype="category")
+    result = values.str.contains("a", na=True)
+    expected = Series([True, False, False, True, True])
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.contains("a", na=False)
+    expected = Series([True, False, False, True, False])
+    tm.assert_series_equal(result, expected)
+
+    # na for objects
+    values = Series(["a", "b", "c", "a", np.nan])
+    result = values.str.contains("a", na=True)
+    expected = Series([True, False, False, True, True])
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.contains("a", na=False)
+    expected = Series([True, False, False, True, False])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "na, expected",
+    [
+        (None, pd.NA),
+        (True, True),
+        (False, False),
+        (0, False),
+        (3, True),
+        (np.nan, pd.NA),
+    ],
+)
+@pytest.mark.parametrize("regex", [True, False])
+def test_contains_na_kwarg_for_nullable_string_dtype(
+    nullable_string_dtype, na, expected, regex
+):
+    # https://github.com/pandas-dev/pandas/pull/41025#issuecomment-824062416
+
+    values = Series(["a", "b", "c", "a", np.nan], dtype=nullable_string_dtype)
+
+    if na in [0, 3] and na is not False:
+        msg = f"na must be None, pd.NA, np.nan, True, or False; got {na}"
+        with pytest.raises(ValueError, match=msg):
+            values.str.contains("a", na=na, regex=regex)
+    else:
+        result = values.str.contains("a", na=na, regex=regex)
+        expected = Series([True, False, False, True, expected], dtype="boolean")
+        tm.assert_series_equal(result, expected)
+
+
+def test_contains_moar(any_string_dtype):
+    # PR #1179
+    s = Series(
+        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
+        dtype=any_string_dtype,
+    )
+
+    result = s.str.contains("a")
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        na_value = False
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        na_value = np.nan
+    expected = Series(
+        [False, False, False, True, True, False, na_value, False, False, True],
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.contains("a", case=False)
+    expected = Series(
+        [True, False, False, True, True, False, na_value, True, False, True],
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.contains("Aa")
+    expected = Series(
+        [False, False, False, True, False, False, na_value, False, False, False],
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.contains("ba")
+    expected = Series(
+        [False, False, False, True, False, False, na_value, False, False, False],
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.contains("ba", case=False)
+    expected = Series(
+        [False, False, False, True, True, False, na_value, True, False, False],
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_contains_nan(any_string_dtype):
+    # PR #14171
+    s = Series([np.nan, np.nan, np.nan], dtype=any_string_dtype)
+
+    result = s.str.contains("foo", na=False)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series([False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.contains("foo", na=True)
+    expected = Series([True, True, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    msg = "na must be None, pd.NA, np.nan, True, or False; got foo"
+    with pytest.raises(ValueError, match=msg):
+        s.str.contains("foo", na="foo")
+
+    result = s.str.contains("foo")
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([False, False, False], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series([np.nan, np.nan, np.nan], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_contains_compiled_regex(any_string_dtype):
+    # GH#61942
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    ser = Series(["foo", "bar", "Baz"], dtype=any_string_dtype)
+
+    pat = re.compile("ba.")
+    result = ser.str.contains(pat)
+    expected = Series([False, True, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO this currently works for pyarrow-backed dtypes but raises for python
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        result = ser.str.contains(pat, case=False)
+        expected = Series([False, True, True], dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+    else:
+        with pytest.raises(
+            ValueError, match="cannot process flags argument with a compiled pattern"
+        ):
+            ser.str.contains(pat, case=False)
+
+    pat = re.compile("ba.", flags=re.IGNORECASE)
+    result = ser.str.contains(pat)
+    expected = Series([False, True, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO should this be supported?
+    with pytest.raises(
+        ValueError, match="cannot process flags argument with a compiled pattern"
+    ):
+        ser.str.contains(pat, flags=re.IGNORECASE)
+
+
+def test_contains_compiled_regex_flags(any_string_dtype):
+    # ensure other (than ignorecase) flags are respected
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    ser = Series(["foobar", "foo\nbar", "Baz"], dtype=any_string_dtype)
+
+    pat = re.compile("^ba")
+    result = ser.str.contains(pat)
+    expected = Series([False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    pat = re.compile("^ba", flags=re.MULTILINE)
+    result = ser.str.contains(pat)
+    expected = Series([False, True, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    pat = re.compile("^ba", flags=re.MULTILINE | re.IGNORECASE)
+    result = ser.str.contains(pat)
+    expected = Series([False, True, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"a(?=b)", [False, True, False, False]),
+        (r"(?<=a)b", [False, True, False, False]),
+        (r"a(?!b)", [True, False, True, False]),
+        (r"(?<!b)a", [True, True, False, False]),
+        ("ab", [False, True, False, False]),
+    ],
+)
+@pytest.mark.parametrize("na", [lib.no_default, True, False, np.nan, None, pd.NA])
+def test_contains_lookarounds(any_string_dtype, pat, expected_data, na):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    if any_string_dtype == "object" and not isinstance(na, bool):
+        expected_dtype = "object"
+    else:
+        expected_dtype = (
+            np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+    if any_string_dtype == "object":
+        # The behavior here for `na=pd.NA` looks wrong.
+        if (na is lib.no_default or pd.isna(na)) and na is not pd.NA:
+            na_result = None
+        else:
+            na_result = na
+    elif na is lib.no_default or pd.isna(na):
+        if any_string_dtype == "str":
+            na_result = False
+        elif any_string_dtype == "string":
+            na_result = pd.NA
+        else:
+            raise ValueError(f"Unrecognized string dtype {any_string_dtype}")
+    else:
+        na_result = na
+    expected_data = expected_data.copy()
+    expected_data.append(na_result)
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.contains(pat, regex=True, na=na)
+    expected = Series(expected_data, dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_contains_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.contains("bar$")
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        # pyarrow (RE2) only matches $ at the very end of the line
+        expected = Series([False, True, False, False], dtype=expected_dtype)
+    else:
+        # python matches $ before or after an ending newline
+        expected = Series([False, True, False, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.contains(r"bar\Z")
+    expected = Series([False, True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series(
+        ["bar", r"bar{}".format("\\"), r"bar\Z", r"bar\\Z", "bars", "bar\n"],
+        dtype=any_string_dtype,
+    )
+
+    result = ser.str.contains(r"bar\\Z")
+    expected = Series([False, False, True, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.contains(r"bar\\\Z")
+    expected = Series([False, True, False, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.contains(r"bar\\\\Z")
+    expected = Series([False, False, False, True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+# str.startswith
+# --------------------------------------------------------------------------------------
+
+
+def test_startswith_endswith_validate_na(any_string_dtype):
+    # GH#59615
+    ser = Series(
+        ["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"],
+        dtype=any_string_dtype,
+    )
+    msg = "na must be None, pd.NA, np.nan, True, or False; got baz"
+    with pytest.raises(ValueError, match=msg):
+        ser.str.startswith("kapow", na="baz")
+    with pytest.raises(ValueError, match=msg):
+        ser.str.endswith("bar", na="baz")
+
+
+@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
+@pytest.mark.parametrize("dtype", ["object", "category"])
+@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
+@pytest.mark.parametrize("na", [True, False])
+def test_startswith(pat, dtype, null_value, na, using_infer_string):
+    # add category dtype parametrizations for GH-36241
+    values = Series(
+        ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
+        dtype=dtype,
+    )
+
+    result = values.str.startswith(pat)
+    exp = Series([False, np.nan, True, False, False, np.nan, True])
+    if dtype == "object" and null_value is pd.NA:
+        # GH#18463
+        exp = exp.fillna(null_value)
+    elif dtype == "object" and null_value is None:
+        exp[exp.isna()] = None
+    elif using_infer_string and dtype == "category":
+        exp = exp.fillna(False).astype(bool)
+    tm.assert_series_equal(result, exp)
+
+    result = values.str.startswith(pat, na=na)
+    exp = Series([False, na, True, False, False, na, True])
+    tm.assert_series_equal(result, exp)
+
+    # mixed
+    mixed = np.array(
+        ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+        dtype=np.object_,
+    )
+    rs = Series(mixed).str.startswith("f")
+    xp = Series([False, np.nan, False, np.nan, np.nan, True, None, np.nan, np.nan])
+    tm.assert_series_equal(rs, xp)
+
+
+@pytest.mark.parametrize("na", [None, True, False])
+def test_startswith_string_dtype(any_string_dtype, na):
+    values = Series(
+        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
+        dtype=any_string_dtype,
+    )
+    result = values.str.startswith("foo", na=na)
+
+    expected_dtype = (
+        (object if na is None else bool)
+        if is_object_or_nan_string_dtype(any_string_dtype)
+        else "boolean"
+    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        if na is None:
+            na = False
+    exp = Series(
+        [False, na, True, False, False, na, True, False, False], dtype=expected_dtype
+    )
+    tm.assert_series_equal(result, exp)
+
+    result = values.str.startswith("rege.", na=na)
+    exp = Series(
+        [False, na, False, False, False, na, False, False, True], dtype=expected_dtype
+    )
+    tm.assert_series_equal(result, exp)
+
+
+# --------------------------------------------------------------------------------------
+# str.endswith
+# --------------------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("pat", ["foo", ("foo", "baz")])
+@pytest.mark.parametrize("dtype", ["object", "category"])
+@pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
+@pytest.mark.parametrize("na", [True, False])
+def test_endswith(pat, dtype, null_value, na, using_infer_string):
+    # add category dtype parametrizations for GH-36241
+    values = Series(
+        ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
+        dtype=dtype,
+    )
+
+    result = values.str.endswith(pat)
+    exp = Series([False, np.nan, False, False, True, np.nan, True])
+    if dtype == "object" and null_value is pd.NA:
+        # GH#18463
+        exp = exp.fillna(null_value)
+    elif dtype == "object" and null_value is None:
+        exp[exp.isna()] = None
+    elif using_infer_string and dtype == "category":
+        exp = exp.fillna(False).astype(bool)
+    tm.assert_series_equal(result, exp)
+
+    result = values.str.endswith(pat, na=na)
+    exp = Series([False, na, False, False, True, na, True])
+    tm.assert_series_equal(result, exp)
+
+    # mixed
+    mixed = np.array(
+        ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+        dtype=object,
+    )
+    rs = Series(mixed).str.endswith("f")
+    xp = Series([False, np.nan, False, np.nan, np.nan, False, None, np.nan, np.nan])
+    tm.assert_series_equal(rs, xp)
+
+
+@pytest.mark.parametrize("na", [None, True, False])
+def test_endswith_string_dtype(any_string_dtype, na):
+    values = Series(
+        ["om", None, "foo_nom", "nom", "bar_foo", None, "foo", "regex", "rege."],
+        dtype=any_string_dtype,
+    )
+    result = values.str.endswith("foo", na=na)
+    expected_dtype = (
+        (object if na is None else bool)
+        if is_object_or_nan_string_dtype(any_string_dtype)
+        else "boolean"
+    )
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        if na is None:
+            na = False
+    exp = Series(
+        [False, na, False, False, True, na, True, False, False], dtype=expected_dtype
+    )
+    tm.assert_series_equal(result, exp)
+
+    result = values.str.endswith("rege.", na=na)
+    exp = Series(
+        [False, na, False, False, False, na, False, False, True], dtype=expected_dtype
+    )
+    tm.assert_series_equal(result, exp)
+
+
+# --------------------------------------------------------------------------------------
+# str.replace
+# --------------------------------------------------------------------------------------
+def test_replace_dict_invalid(any_string_dtype):
+    # GH 51914
+    series = Series(data=["A", "B_junk", "C_gunk"], name="my_messy_col")
+    msg = "repl cannot be used when pat is a dictionary"
+
+    with pytest.raises(ValueError, match=msg):
+        series.str.replace(pat={"A": "a", "B": "b"}, repl="A")
+
+
+def test_replace_dict(any_string_dtype):
+    # GH 51914
+    series = Series(data=["A", "B", "C"], name="my_messy_col")
+    new_series = series.str.replace(pat={"A": "a", "B": "b"})
+    expected = Series(data=["a", "b", "C"], name="my_messy_col")
+    tm.assert_series_equal(new_series, expected)
+
+
+def test_replace(any_string_dtype):
+    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
+
+    result = ser.str.replace("BAD[_]*", "", regex=True)
+    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_max_replacements(any_string_dtype):
+    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
+
+    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
+    result = ser.str.replace("BAD[_]*", "", n=1, regex=True)
+    tm.assert_series_equal(result, expected)
+
+    expected = Series(["foo__barBAD", np.nan], dtype=any_string_dtype)
+    result = ser.str.replace("BAD", "", n=1, regex=False)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_mixed_object():
+    ser = Series(
+        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
+    )
+    result = Series(ser).str.replace("BAD[_]*", "", regex=True)
+    expected = Series(
+        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_unicode(any_string_dtype):
+    ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
+    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
+    result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("repl", [None, 3, {"a": "b"}])
+@pytest.mark.parametrize("data", [["a", "b", None], ["a", "b", "c", "ad"]])
+def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, data):
+    # https://github.com/pandas-dev/pandas/issues/13438
+    msg = "repl must be a string or callable"
+    obj = index_or_series(data, dtype=any_string_dtype)
+    with pytest.raises(TypeError, match=msg):
+        obj.str.replace("a", repl)
+
+
+def test_replace_callable(any_string_dtype):
+    # GH 15055
+    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
+
+    # test with callable
+    repl = lambda m: m.group(0).swapcase()
+    result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True)
+    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None]
+)
+def test_replace_callable_raises(any_string_dtype, repl):
+    # GH 15055
+    values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
+
+    # test with wrong number of arguments, raising an error
+    msg = (
+        r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
+        r"(?(3)required )positional arguments?"
+    )
+    with pytest.raises(TypeError, match=msg):
+        values.str.replace("a", repl, regex=True)
+
+
+@pytest.mark.parametrize(
+    "repl, expected_list",
+    [
+        (
+            r"\g<three> \g<two> \g<one>",
+            ["Three Two One", "Baz Bar Foo"],
+        ),
+        (
+            r"\3 \2 \1",
+            ["Three Two One", "Baz Bar Foo"],
+        ),
+        (
+            r"\g<3> \g<2> \g<1>",
+            ["Three Two One", "Baz Bar Foo"],
+        ),
+        (
+            r"\g<2>0",
+            ["Two0", "Bar0"],
+        ),
+        (
+            r"\g<2>0 \1",
+            ["Two0 One", "Bar0 Foo"],
+        ),
+    ],
+    ids=[
+        "named_groups_full_swap",
+        "numbered_groups_no_g_full_swap",
+        "numbered_groups_full_swap",
+        "single_group_with_literal",
+        "mixed_group_reference_with_literal",
+    ],
+)
+@pytest.mark.parametrize("use_compile", [True, False])
+def test_replace_named_groups_regex_swap(
+    any_string_dtype, use_compile, repl, expected_list
+):
+    # GH#57636
+    ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
+    pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
+    if use_compile:
+        pattern = re.compile(pattern)
+    result = ser.str.replace(pattern, repl, regex=True)
+    expected = Series(expected_list, dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "repl",
+    [
+        r"\g<20>",
+        r"\20",
+        r"\40",
+        r"\4",
+    ],
+)
+@pytest.mark.parametrize("use_compile", [True, False])
+def test_replace_named_groups_regex_swap_expected_fail(
+    any_string_dtype, repl, use_compile, request
+):
+    # GH#57636
+    if (
+        not use_compile
+        and r"\g" not in repl
+        and isinstance(any_string_dtype, StringDtype)
+        and any_string_dtype.storage == "pyarrow"
+    ):
+        # calls pyarrow method directly
+        if repl == r"\20":
+            mark = pytest.mark.xfail(reason="PyArrow interprets as group + literal")
+            request.applymarker(mark)
+
+        pa = pytest.importorskip("pyarrow")
+        error_type = pa.ArrowInvalid
+        error_msg = r"only has \d parenthesized subexpressions"
+    else:
+        error_type = re.error
+        error_msg = "invalid group reference"
+
+    pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
+    if use_compile:
+        pattern = re.compile(pattern)
+    ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype)
+
+    with pytest.raises(error_type, match=error_msg):
+        ser.str.replace(pattern, repl, regex=True)
+
+
+@pytest.mark.parametrize(
+    "pattern, repl",
+    [
+        (r"(\w+) (\w+) (\w+)", r"\20"),
+        (r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)", r"\20"),
+    ],
+)
+def test_pyarrow_ambiguous_group_references(pyarrow_string_dtype, pattern, repl):
+    # GH#62653
+    ser = Series(["One Two Three", "Foo Bar Baz"], dtype=pyarrow_string_dtype)
+
+    result = ser.str.replace(pattern, repl, regex=True)
+    expected = Series(["Two0", "Bar0"], dtype=pyarrow_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pattern, repl, expected_list",
+    [
+        (
+            r"\[(?P<one>\d+)\]",
+            r"(\1)",
+            ["var.one(0)", "var.two(1)", "var.three(2)"],
+        ),
+        (
+            r"\[(\d+)\]",
+            r"(\1)",
+            ["var.one(0)", "var.two(1)", "var.three(2)"],
+        ),
+    ],
+)
+@td.skip_if_no("pyarrow")
+def test_pyarrow_backend_group_replacement(pattern, repl, expected_list):
+    ser = Series(["var.one[0]", "var.two[1]", "var.three[2]"]).convert_dtypes(
+        dtype_backend="pyarrow"
+    )
+    result = ser.str.replace(pattern, repl, regex=True)
+    expected = Series(expected_list).convert_dtypes(dtype_backend="pyarrow")
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_callable_named_groups(any_string_dtype):
+    # test regex named groups
+    ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)
+    pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
+    repl = lambda m: m.group("middle").swapcase()
+    result = ser.str.replace(pat, repl, regex=True)
+    expected = Series(["bAR", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_compiled_regex(any_string_dtype):
+    # GH 15446
+    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
+
+    # test with compiled regex
+    pat = re.compile(r"BAD_*")
+    result = ser.str.replace(pat, "", regex=True)
+    expected = Series(["foobar", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.replace(pat, "", n=1, regex=True)
+    expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_compiled_regex_mixed_object():
+    pat = re.compile(r"BAD_*")
+    ser = Series(
+        ["aBAD", np.nan, "bBAD", True, datetime.today(), "fooBAD", None, 1, 2.0]
+    )
+    result = Series(ser).str.replace(pat, "", regex=True)
+    expected = Series(
+        ["a", np.nan, "b", np.nan, np.nan, "foo", None, np.nan, np.nan], dtype=object
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_compiled_regex_unicode(any_string_dtype):
+    ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
+    expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype)
+    pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
+    result = ser.str.replace(pat, ", ", regex=True)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_compiled_regex_raises(any_string_dtype):
+    # case and flags provided to str.replace will have no effect
+    # and will produce warnings
+    ser = Series(["fooBAD__barBAD__bad", np.nan], dtype=any_string_dtype)
+    pat = re.compile(r"BAD_*")
+
+    msg = "case and flags cannot be set when pat is a compiled regex"
+
+    with pytest.raises(ValueError, match=msg):
+        ser.str.replace(pat, "", flags=re.IGNORECASE, regex=True)
+
+    with pytest.raises(ValueError, match=msg):
+        ser.str.replace(pat, "", case=False, regex=True)
+
+    with pytest.raises(ValueError, match=msg):
+        ser.str.replace(pat, "", case=True, regex=True)
+
+
+def test_replace_compiled_regex_callable(any_string_dtype):
+    # test with callable
+    ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype)
+    repl = lambda m: m.group(0).swapcase()
+    pat = re.compile("[a-z][A-Z]{2}")
+    result = ser.str.replace(pat, repl, n=2, regex=True)
+    expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("regex,expected_val", [(True, "bao"), (False, "foo")])
+def test_replace_literal(regex, expected_val, any_string_dtype):
+    # GH16808 literal replace (regex=False vs regex=True)
+    ser = Series(["f.o", "foo", np.nan], dtype=any_string_dtype)
+    expected = Series(["bao", expected_val, np.nan], dtype=any_string_dtype)
+    result = ser.str.replace("f.", "ba", regex=regex)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_literal_callable_raises(any_string_dtype):
+    ser = Series([], dtype=any_string_dtype)
+    repl = lambda m: m.group(0).swapcase()
+
+    msg = "Cannot use a callable replacement when regex=False"
+    with pytest.raises(ValueError, match=msg):
+        ser.str.replace("abc", repl, regex=False)
+
+
+def test_replace_literal_compiled_raises(any_string_dtype):
+    ser = Series([], dtype=any_string_dtype)
+    pat = re.compile("[a-z][A-Z]{2}")
+
+    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
+    with pytest.raises(ValueError, match=msg):
+        ser.str.replace(pat, "", regex=False)
+
+
+def test_replace_moar(any_string_dtype):
+    # PR #1179
+    ser = Series(
+        ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"],
+        dtype=any_string_dtype,
+    )
+
+    result = ser.str.replace("A", "YYY")
+    expected = Series(
+        ["YYY", "B", "C", "YYYaba", "Baca", "", np.nan, "CYYYBYYY", "dog", "cat"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.replace("A", "YYY", case=False)
+    expected = Series(
+        [
+            "YYY",
+            "B",
+            "C",
+            "YYYYYYbYYY",
+            "BYYYcYYY",
+            "",
+            np.nan,
+            "CYYYBYYY",
+            "dog",
+            "cYYYt",
+        ],
+        dtype=any_string_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True)
+    expected = Series(
+        [
+            "A",
+            "B",
+            "C",
+            "XX-XX ba",
+            "XX-XX ca",
+            "",
+            np.nan,
+            "XX-XX BA",
+            "XX-XX ",
+            "XX-XX t",
+        ],
+        dtype=any_string_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_not_case_sensitive_not_regex(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/41602
+    ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype)
+
+    result = ser.str.replace("a", "c", case=False, regex=False)
+    expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.replace("a.", "c.", case=False, regex=False)
+    expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_regex(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/24809
+    s = Series(["a", "b", "ac", np.nan, ""], dtype=any_string_dtype)
+    result = s.str.replace("^.$", "a", regex=True)
+    expected = Series(["a", "a", "ac", np.nan, ""], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("regex", [True, False])
+def test_replace_regex_single_character(regex, any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/24809, enforced in 2.0
+    # GH 24804
+    s = Series(["a.b", ".", "b", np.nan, ""], dtype=any_string_dtype)
+
+    result = s.str.replace(".", "a", regex=regex)
+    if regex:
+        expected = Series(["aaa", "a", "a", np.nan, ""], dtype=any_string_dtype)
+    else:
+        expected = Series(["aab", "a", "b", np.nan, ""], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"a(?=b)", ["aa", "xb", "ba", "bb"]),
+        (r"(?<=a)b", ["aa", "ax", "ba", "bb"]),
+        (r"a(?!b)", ["xx", "ab", "bx", "bb"]),
+        (r"(?<!b)a", ["xx", "xb", "ba", "bb"]),
+        ("ab", ["aa", "x", "ba", "bb"]),
+    ],
+)
+def test_replace_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.replace(pat, "x", regex=True)
+    if any_string_dtype == "object":
+        null_result = None
+    elif any_string_dtype == "str":
+        null_result = np.nan
+    elif any_string_dtype == "string":
+        null_result = pd.NA
+    else:
+        raise ValueError(f"Unrecognized dtype: {any_string_dtype}")
+    expected = Series([*expected_data, null_result], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_replace_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.replace("bar$", "x", regex=True)
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        # pyarrow (RE2) only matches $ at the very end of the line
+        expected = Series(["baz", "x", "bars", "bar\n"], dtype=any_string_dtype)
+    else:
+        # python matches $ before or after an ending newline
+        expected = Series(["baz", "x", "bars", "x\n"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.replace(r"bar\Z", "x", regex=True)
+    expected = Series(["baz", "x", "bars", "bar\n"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"bar\Z", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+    result = ser.str.replace(r"bar\\Z", "x", regex=True)
+    expected = Series(["x", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+# str.match
+# --------------------------------------------------------------------------------------
+
+
+def test_match(any_string_dtype):
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        na_value = False
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        na_value = np.nan
+
+    values = Series(["fooBAD__barBAD", np.nan, "foo"], dtype=any_string_dtype)
+    result = values.str.match(".*(BAD[_]+).*(BAD)")
+    expected = Series([True, na_value, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    values = Series(
+        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
+    )
+    result = values.str.match(".*BAD[_]+.*BAD")
+    expected = Series([True, True, na_value, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.match("BAD[_]+.*BAD")
+    expected = Series([False, True, na_value, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    values = Series(
+        ["fooBAD__barBAD", "^BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
+    )
+    result = values.str.match("^BAD[_]+.*BAD")
+    expected = Series([False, False, na_value, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = values.str.match("\\^BAD[_]+.*BAD")
+    expected = Series([False, True, na_value, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_mixed_object():
+    mixed = Series(
+        [
+            "aBAD_BAD",
+            np.nan,
+            "BAD_b_BAD",
+            True,
+            datetime.today(),
+            "foo",
+            None,
+            1,
+            2.0,
+        ]
+    )
+    result = Series(mixed).str.match(".*(BAD[_]+).*(BAD)")
+    expected = Series([True, np.nan, True, np.nan, np.nan, False, None, np.nan, np.nan])
+    assert isinstance(result, Series)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_na_kwarg(any_string_dtype):
+    # GH #6609
+    s = Series(["a", "b", np.nan], dtype=any_string_dtype)
+
+    result = s.str.match("a", na=False)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series([True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = s.str.match("a")
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected_dtype = bool
+        na_value = False
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        na_value = np.nan
+
+    expected = Series([True, False, na_value], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_case_kwarg(any_string_dtype):
+    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
+    result = values.str.match("ab", case=False)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series([True, True, True, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_compiled_regex(any_string_dtype):
+    # GH#61952
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
+
+    result = values.str.match(re.compile("ab"))
+    expected = Series([True, False, True, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    msg = (
+        "Cannot both specify 'case' and pass a compiled "
+        "regexp object with conflicting case-sensitivity"
+    )
+    with pytest.raises(ValueError, match=msg):
+        values.str.match(re.compile("ab"), case=False)
+
+    result = values.str.match(re.compile("ab", flags=re.IGNORECASE))
+    expected = Series([True, True, True, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    msg = (
+        "Cannot both specify 'flags' and pass a compiled "
+        "regexp object with conflicting flags"
+    )
+    with pytest.raises(ValueError, match=msg):
+        values.str.match(re.compile("ab"), flags=re.IGNORECASE)
+
+    # But if the flags match you're OK
+    values.str.match(re.compile("ab", flags=re.IGNORECASE), flags=re.IGNORECASE)
+
+
+@pytest.mark.parametrize(
+    "pat, case, exp",
+    [
+        ["ab", False, [True, False]],
+        ["Ab", True, [False, False]],
+        ["bc", True, [False, False]],
+        ["a[a-z]{1}", False, [True, False]],
+        ["A[a-z]{1}", True, [False, False]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(bc)|(ab)", True, [True, False]],
+        ["((bc)|(ab))", True, [True, False]],
+    ],
+)
+def test_str_match_extra_cases(any_string_dtype, pat, case, exp):
+    ser = Series(["abc", "Xab"], dtype=any_string_dtype)
+    result = ser.str.match(pat, case=case)
+
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series(exp, dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"a(?=b)", [False, True, False, False]),
+        (r"(?<=a)b", [False, False, False, False]),
+        (r"a(?!b)", [True, False, False, False]),
+        (r"(?<!b)a", [True, True, False, False]),
+        ("ab", [False, True, False, False]),
+    ],
+)
+def test_match_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    if any_string_dtype == "object":
+        expected_dtype, null_result = "object", None
+    elif any_string_dtype == "str":
+        expected_dtype, null_result = "bool", False
+    elif any_string_dtype == "string":
+        expected_dtype, null_result = "boolean", pd.NA
+    else:
+        raise ValueError(f"Unrecognized dtype: {any_string_dtype}")
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.match(pat)
+    expected = Series([*expected_data, null_result], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_match_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.match("bar$")
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        # pyarrow (RE2) only matches $ at the very end of the line
+        expected = Series([False, True, False, False], dtype=expected_dtype)
+    else:
+        # python matches $ before or after an ending newline
+        expected = Series([False, True, False, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.match(r"bar\Z")
+    expected = Series([False, True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"bar\Z", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+    result = ser.str.match(r"bar\\Z")
+    expected = Series([True, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+# str.fullmatch
+# --------------------------------------------------------------------------------------
+
+
+def test_fullmatch(any_string_dtype):
+    # GH 32806
+    ser = Series(
+        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
+    )
+    result = ser.str.fullmatch(".*BAD[_]+.*BAD")
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([True, False, False, False], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series([True, False, np.nan, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_fullmatch_dollar_literal(any_string_dtype):
+    # GH 56652
+    ser = Series(["foo", "foo$foo", np.nan, "foo$"], dtype=any_string_dtype)
+    result = ser.str.fullmatch("foo\\$")
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series([False, False, False, True], dtype=bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series([False, False, np.nan, True], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_fullmatch_na_kwarg(any_string_dtype):
+    ser = Series(
+        ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype=any_string_dtype
+    )
+    result = ser.str.fullmatch(".*BAD[_]+.*BAD", na=False)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series([True, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_fullmatch_case_kwarg(any_string_dtype):
+    ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    expected = Series([True, False, False, False], dtype=expected_dtype)
+
+    result = ser.str.fullmatch("ab", case=True)
+    tm.assert_series_equal(result, expected)
+
+    expected = Series([True, True, False, False], dtype=expected_dtype)
+
+    result = ser.str.fullmatch("ab", case=False)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.fullmatch("ab", flags=re.IGNORECASE)
+    tm.assert_series_equal(result, expected)
+
+
+def test_fullmatch_compiled_regex(any_string_dtype):
+    # GH#61952
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    values = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype)
+
+    result = values.str.fullmatch(re.compile("ab"))
+    expected = Series([True, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # TODO this currently works for pyarrow-backed dtypes but raises for python
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        result = values.str.fullmatch(re.compile("ab"), case=False)
+        expected = Series([True, True, False, False], dtype=expected_dtype)
+        tm.assert_series_equal(result, expected)
+    else:
+        with pytest.raises(
+            ValueError, match="cannot process flags argument with a compiled pattern"
+        ):
+            values.str.fullmatch(re.compile("ab"), case=False)
+
+    result = values.str.fullmatch(re.compile("ab", flags=re.IGNORECASE))
+    expected = Series([True, True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    with pytest.raises(
+        ValueError, match="cannot process flags argument with a compiled pattern"
+    ):
+        values.str.fullmatch(re.compile("ab"), flags=re.IGNORECASE)
+
+
+@pytest.mark.parametrize(
+    "pat, case, na, exp",
+    # Note: keep cases in sync with
+    # pandas/tests/extension/test_arrow.py::test_str_fullmatch
+    [
+        ["abc", False, None, [True, False, False, None]],
+        ["Abc", True, None, [False, False, False, None]],
+        ["bc", True, None, [False, False, False, None]],
+        ["ab", False, None, [False, False, False, None]],
+        ["a[a-z]{2}", False, None, [True, False, False, None]],
+        ["A[a-z]{1}", True, None, [False, False, False, None]],
+        # GH Issue: #56652
+        ["abc$", False, None, [True, False, False, None]],
+        ["abc\\$", False, None, [False, True, False, None]],
+        ["Abc$", True, None, [False, False, False, None]],
+        ["Abc\\$", True, None, [False, False, False, None]],
+        # https://github.com/pandas-dev/pandas/issues/61072
+        ["(abc)|(abx)", True, None, [True, False, False, None]],
+        ["((abc)|(abx))", True, None, [True, False, False, None]],
+    ],
+)
+def test_str_fullmatch_extra_cases(any_string_dtype, pat, case, na, exp):
+    ser = Series(["abc", "abc$", "$abc", None], dtype=any_string_dtype)
+    result = ser.str.fullmatch(pat, case=case, na=na)
+
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        exp[-1] = False
+        expected_dtype = bool
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+    expected = Series(exp, dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat",
+    [(r"a(?=b)"), (r"(?<=a)b"), (r"a(?!b)"), (r"(?<!b)a"), ("ab")],
+)
+def test_fullmatch_lookarounds(any_string_dtype, pat):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    # Note: By definition, any match with a lookaround is not a full match.
+    if any_string_dtype == "object":
+        expected_dtype, null_result = "object", None
+    elif any_string_dtype == "str":
+        expected_dtype, null_result = "bool", False
+    elif any_string_dtype == "string":
+        expected_dtype, null_result = "boolean", pd.NA
+    else:
+        raise ValueError(f"Unrecognized dtype: {any_string_dtype}")
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.fullmatch(pat)
+    expected = Series(
+        [False, True if pat == "ab" else False, False, False, null_result],
+        dtype=expected_dtype,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_fullmatch_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    expected_dtype = (
+        np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign (for fullmatch, no difference between python and pyarrow)
+    result = ser.str.fullmatch("bar$")
+    expected = Series([False, True, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.fullmatch(r"bar\Z")
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"bar\Z", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+    result = ser.str.fullmatch(r"bar\\Z")
+    expected = Series([True, False, False, False], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+# str.findall
+# --------------------------------------------------------------------------------------
+
+
+def test_findall(any_string_dtype):
+    ser = Series(["fooBAD__barBAD", np.nan, "foo", "BAD"], dtype=any_string_dtype)
+    result = ser.str.findall("BAD[_]*")
+    expected = Series([["BAD__", "BAD"], np.nan, [], ["BAD"]])
+    expected = _convert_na_value(ser, expected)
+    tm.assert_series_equal(result, expected)
+
+
+def test_findall_mixed_object():
+    ser = Series(
+        [
+            "fooBAD__barBAD",
+            np.nan,
+            "foo",
+            True,
+            datetime.today(),
+            "BAD",
+            None,
+            1,
+            2.0,
+        ]
+    )
+
+    result = ser.str.findall("BAD[_]*")
+    expected = Series(
+        [
+            ["BAD__", "BAD"],
+            np.nan,
+            [],
+            np.nan,
+            np.nan,
+            ["BAD"],
+            None,
+            np.nan,
+            np.nan,
+        ]
+    )
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"a(?=b)", [[], ["a"], [], []]),
+        (r"(?<=a)b", [[], ["b"], [], []]),
+        (r"a(?!b)", [["a", "a"], [], ["a"], []]),
+        (r"(?<!b)a", [["a", "a"], ["a"], [], []]),
+        ("ab", [[], ["ab"], [], []]),
+    ],
+)
+def test_findall_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.findall(pat)
+    if any_string_dtype == "object":
+        null_result = None
+    elif any_string_dtype == "str":
+        null_result = np.nan
+    elif any_string_dtype == "string":
+        null_result = pd.NA
+    else:
+        raise ValueError(f"Unrecognized dtype: {any_string_dtype}")
+    expected = Series([*expected_data, null_result])
+    tm.assert_series_equal(result, expected)
+
+
+def test_findall_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.findall("bar$")
+    expected = Series([[], ["bar"], [], ["bar"]], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.findall(r"bar\Z")
+    expected = Series([[], ["bar"], [], []], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"bar\Z", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+    result = ser.str.findall(r"bar\\Z")
+    expected = Series([["bar\\Z"], [], [], []], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+# str.find
+# --------------------------------------------------------------------------------------
+
+
+def test_find(any_string_dtype):
+    ser = Series(
+        ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXXX"], dtype=any_string_dtype
+    )
+    expected_dtype = (
+        np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
+    )
+
+    result = ser.str.find("EF")
+    expected = Series([4, 3, 1, 0, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.find("EF") for v in np.array(ser)], dtype=np.int64)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
+
+    result = ser.str.rfind("EF")
+    expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.rfind("EF") for v in np.array(ser)], dtype=np.int64)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
+
+    result = ser.str.find("EF", 3)
+    expected = Series([4, 3, 7, 4, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.find("EF", 3) for v in np.array(ser)], dtype=np.int64)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
+
+    result = ser.str.rfind("EF", 3)
+    expected = Series([4, 5, 7, 4, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.rfind("EF", 3) for v in np.array(ser)], dtype=np.int64)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
+
+    result = ser.str.find("EF", 3, 6)
+    expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.find("EF", 3, 6) for v in np.array(ser)], dtype=np.int64)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
+
+    result = ser.str.rfind("EF", 3, 6)
+    expected = Series([4, 3, -1, 4, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+    expected = np.array([v.rfind("EF", 3, 6) for v in np.array(ser)], dtype=np.int64)
+    tm.assert_numpy_array_equal(np.array(result, dtype=np.int64), expected)
+
+
+def test_find_bad_arg_raises(any_string_dtype):
+    ser = Series([], dtype=any_string_dtype)
+    with pytest.raises(TypeError, match="expected a string object, not int"):
+        ser.str.find(0)
+
+    with pytest.raises(TypeError, match="expected a string object, not int"):
+        ser.str.rfind(0)
+
+
+def test_find_nan(any_string_dtype):
+    ser = Series(
+        ["ABCDEFG", np.nan, "DEFGHIJEF", np.nan, "XXXX"], dtype=any_string_dtype
+    )
+    if is_object_or_nan_string_dtype(any_string_dtype):
+        expected_dtype = np.float64
+        item = np.nan
+    else:
+        expected_dtype = "Int64"
+        item = pd.NA
+
+    result = ser.str.find("EF")
+    expected = Series([4, item, 1, item, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.rfind("EF")
+    expected = Series([4, item, 7, item, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.find("EF", 3)
+    expected = Series([4, item, 7, item, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.rfind("EF", 3)
+    expected = Series([4, item, 7, item, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.find("EF", 3, 6)
+    expected = Series([4, item, -1, item, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.rfind("EF", 3, 6)
+    expected = Series([4, item, -1, item, -1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+# str.translate
+# --------------------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
+def test_translate(index_or_series, any_string_dtype, infer_string):
+    obj = index_or_series(
+        ["abcdefg", "abcc", "cdddfg", "cdefggg"], dtype=any_string_dtype
+    )
+    table = str.maketrans("abc", "cde")
+    result = obj.str.translate(table)
+    expected = index_or_series(
+        ["cdedefg", "cdee", "edddfg", "edefggg"], dtype=any_string_dtype
+    )
+    tm.assert_equal(result, expected)
+
+
+def test_translate_mixed_object():
+    # Series with non-string values
+    s = Series(["a", "b", "c", 1.2])
+    table = str.maketrans("abc", "cde")
+    expected = Series(["c", "d", "e", np.nan], dtype=object)
+    result = s.str.translate(table)
+    tm.assert_series_equal(result, expected)
+
+
+# --------------------------------------------------------------------------------------
+
+
+def test_flags_kwarg(any_string_dtype):
+    data = {
+        "Dave": "dave@google.com",
+        "Steve": "steve@gmail.com",
+        "Rob": "rob@gmail.com",
+        "Wes": np.nan,
+    }
+    data = Series(data, dtype=any_string_dtype)
+
+    pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
+
+    result = data.str.extract(pat, flags=re.IGNORECASE, expand=True)
+    assert result.iloc[0].tolist() == ["dave", "google", "com"]
+
+    result = data.str.match(pat, flags=re.IGNORECASE)
+    assert result.iloc[0]
+
+    result = data.str.fullmatch(pat, flags=re.IGNORECASE)
+    assert result.iloc[0]
+
+    result = data.str.findall(pat, flags=re.IGNORECASE)
+    assert result.iloc[0][0] == ("dave", "google", "com")
+
+    result = data.str.count(pat, flags=re.IGNORECASE)
+    assert result.iloc[0] == 1
+
+    msg = "has match groups"
+    with tm.assert_produces_warning(UserWarning, match=msg):
+        result = data.str.contains(pat, flags=re.IGNORECASE)
+    assert result.iloc[0]
diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e10c6fcdccdf8ffad7743068c437187b1a435e
--- /dev/null
+++ b/pandas/tests/strings/test_get_dummies.py
@@ -0,0 +1,102 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    _testing as tm,
+)
+
+
+def test_get_dummies(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|")
+    expected = DataFrame([[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"))
+    tm.assert_frame_equal(result, expected)
+
+    s = Series(["a;b", "a", 7], dtype=any_string_dtype)
+    result = s.str.get_dummies(";")
+    expected = DataFrame([[0, 1, 1], [0, 1, 0], [1, 0, 0]], columns=list("7ab"))
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get_dummies_index():
+    # GH9980, GH8028
+    idx = Index(["a|b", "a|c", "b|c"])
+    result = idx.str.get_dummies("|")
+
+    expected = MultiIndex.from_tuples(
+        [(1, 1, 0), (1, 0, 1), (0, 1, 1)], names=("a", "b", "c")
+    )
+    tm.assert_index_equal(result, expected)
+
+
+# GH#47872
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.uint8,
+        np.int16,
+        np.uint16,
+        np.int32,
+        np.uint32,
+        np.int64,
+        np.uint64,
+        bool,
+        "Int8",
+        "Int16",
+        "Int32",
+        "Int64",
+        "boolean",
+    ],
+)
+def test_get_dummies_with_dtype(any_string_dtype, dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=dtype)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+# GH#47872
+@td.skip_if_no("pyarrow")
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "int8[pyarrow]",
+        "uint8[pyarrow]",
+        "int16[pyarrow]",
+        "uint16[pyarrow]",
+        "int32[pyarrow]",
+        "uint32[pyarrow]",
+        "int64[pyarrow]",
+        "uint64[pyarrow]",
+        "bool[pyarrow]",
+    ],
+)
+def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+    result = s.str.get_dummies("|", dtype=dtype)
+    expected = DataFrame(
+        [[1, 1, 0], [1, 0, 1], [0, 0, 0]],
+        columns=list("abc"),
+        dtype=dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+# GH#47872
+def test_get_dummies_with_str_dtype(any_string_dtype):
+    s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype)
+
+    msg = "Only numeric or boolean dtypes are supported for 'dtype'"
+    with pytest.raises(ValueError, match=msg):
+        s.str.get_dummies("|", dtype=str)
+
+    with pytest.raises(ValueError, match=msg):
+        s.str.get_dummies("|", dtype="datetime64[ns]")
diff --git a/pandas/tests/strings/test_split_partition.py b/pandas/tests/strings/test_split_partition.py
new file mode 100644
index 0000000000000000000000000000000000000000..df16e70d85471a23f9f404c48dbfe1c44e94b1a9
--- /dev/null
+++ b/pandas/tests/strings/test_split_partition.py
@@ -0,0 +1,772 @@
+from datetime import datetime
+import re
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    _testing as tm,
+)
+from pandas.tests.strings import (
+    _convert_na_value,
+    is_object_or_nan_string_dtype,
+)
+
+
+@pytest.mark.parametrize("method", ["split", "rsplit"])
+def test_split(any_string_dtype, method):
+    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
+
+    result = getattr(values.str, method)("_")
+    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
+    exp = _convert_na_value(values, exp)
+    tm.assert_series_equal(result, exp)
+
+
+@pytest.mark.parametrize("method", ["split", "rsplit"])
+def test_split_more_than_one_char(any_string_dtype, method):
+    # more than one char
+    values = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
+    result = getattr(values.str, method)("__")
+    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
+    exp = _convert_na_value(values, exp)
+    tm.assert_series_equal(result, exp)
+
+    result = getattr(values.str, method)("__", expand=False)
+    tm.assert_series_equal(result, exp)
+
+
+def test_split_more_regex_split(any_string_dtype):
+    # regex split
+    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
+    result = values.str.split("[,_]")
+    exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
+    exp = _convert_na_value(values, exp)
+    tm.assert_series_equal(result, exp)
+
+
+def test_split_regex(any_string_dtype):
+    # GH 43563
+    # explicit regex = True split
+    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
+    result = values.str.split(r"\.jpg", regex=True)
+    exp = Series([["xxxjpgzzz", ""]])
+    tm.assert_series_equal(result, exp)
+
+
+def test_split_regex_explicit(any_string_dtype):
+    # explicit regex = True split with compiled regex
+    regex_pat = re.compile(r".jpg")
+    values = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)
+    result = values.str.split(regex_pat)
+    exp = Series([["xx", "zzz", ""]])
+    tm.assert_series_equal(result, exp)
+
+    # explicit regex = False split
+    result = values.str.split(r"\.jpg", regex=False)
+    exp = Series([["xxxjpgzzz.jpg"]])
+    tm.assert_series_equal(result, exp)
+
+    # non explicit regex split, pattern length == 1
+    result = values.str.split(r".")
+    exp = Series([["xxxjpgzzz", "jpg"]])
+    tm.assert_series_equal(result, exp)
+
+    # non explicit regex split, pattern length != 1
+    result = values.str.split(r".jpg")
+    exp = Series([["xx", "zzz", ""]])
+    tm.assert_series_equal(result, exp)
+
+    # regex=False with pattern compiled regex raises error
+    with pytest.raises(
+        ValueError,
+        match="Cannot use a compiled regex as replacement pattern with regex=False",
+    ):
+        values.str.split(regex_pat, regex=False)
+
+
+@pytest.mark.parametrize("expand", [None, False])
+@pytest.mark.parametrize("method", ["split", "rsplit"])
+def test_split_object_mixed(expand, method):
+    mixed = Series(["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0])
+    result = getattr(mixed.str, method)("_", expand=expand)
+    exp = Series(
+        [
+            ["a", "b", "c"],
+            np.nan,
+            ["d", "e", "f"],
+            np.nan,
+            np.nan,
+            None,
+            np.nan,
+            np.nan,
+        ]
+    )
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result, exp)
+
+
+@pytest.mark.parametrize("method", ["split", "rsplit"])
+@pytest.mark.parametrize("n", [None, 0])
+def test_split_n(any_string_dtype, method, n):
+    s = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
+    expected = Series([["a", "b"], pd.NA, ["b", "c"]])
+    result = getattr(s.str, method)(" ", n=n)
+    expected = _convert_na_value(s, expected)
+    tm.assert_series_equal(result, expected)
+
+
+def test_rsplit(any_string_dtype):
+    # regex split is not supported by rsplit
+    values = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)
+    result = values.str.rsplit("[,_]")
+    exp = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
+    exp = _convert_na_value(values, exp)
+    tm.assert_series_equal(result, exp)
+
+
+def test_rsplit_max_number(any_string_dtype):
+    # setting max number of splits, make sure it's from reverse
+    values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
+    result = values.str.rsplit("_", n=1)
+    exp = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
+    exp = _convert_na_value(values, exp)
+    tm.assert_series_equal(result, exp)
+
+
+def test_split_blank_string(any_string_dtype):
+    # expand blank split GH 20067
+    values = Series([""], name="test", dtype=any_string_dtype)
+    result = values.str.split(expand=True)
+    exp = DataFrame([[]], dtype=any_string_dtype)  # NOTE: this is NOT an empty df
+    tm.assert_frame_equal(result, exp)
+
+
+def test_split_blank_string_with_non_empty(any_string_dtype):
+    values = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)
+    result = values.str.split(expand=True)
+    exp = DataFrame(
+        [
+            ["a", "b", "c"],
+            ["a", "b", None],
+            [None, None, None],
+            [None, None, None],
+        ],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+
+@pytest.mark.parametrize("method", ["split", "rsplit"])
+def test_split_noargs(any_string_dtype, method):
+    # #1859
+    s = Series(["Wes McKinney", "Travis  Oliphant"], dtype=any_string_dtype)
+    result = getattr(s.str, method)()
+    expected = ["Travis", "Oliphant"]
+    assert result[1] == expected
+
+
+@pytest.mark.parametrize(
+    "data, pat",
+    [
+        (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
+        (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
+        (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
+    ],
+)
+@pytest.mark.parametrize("n", [-1, 0])
+def test_split_maxsplit(data, pat, any_string_dtype, n):
+    # re.split 0, str.split -1
+    s = Series(data, dtype=any_string_dtype)
+
+    result = s.str.split(pat=pat, n=n)
+    xp = s.str.split(pat=pat)
+    tm.assert_series_equal(result, xp)
+
+
+@pytest.mark.parametrize(
+    "data, pat, expected_val",
+    [
+        (
+            ["split once", "split once too!"],
+            None,
+            "once too!",
+        ),
+        (
+            ["split_once", "split_once_too!"],
+            "_",
+            "once_too!",
+        ),
+    ],
+)
+def test_split_no_pat_with_nonzero_n(data, pat, expected_val, any_string_dtype):
+    s = Series(data, dtype=any_string_dtype)
+    result = s.str.split(pat=pat, n=1)
+    expected = Series({0: ["split", "once"], 1: ["split", expected_val]})
+    tm.assert_series_equal(expected, result, check_index_type=False)
+
+
+def test_split_to_dataframe_no_splits(any_string_dtype):
+    s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
+    result = s.str.split("_", expand=True)
+    exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)})
+    tm.assert_frame_equal(result, exp)
+
+
+def test_split_to_dataframe(any_string_dtype):
+    s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
+    result = s.str.split("_", expand=True)
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+
+def test_split_to_dataframe_unequal_splits(any_string_dtype):
+    s = Series(
+        ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype
+    )
+    result = s.str.split("_", expand=True)
+    exp = DataFrame(
+        {
+            0: ["some", "one"],
+            1: ["unequal", "of"],
+            2: ["splits", "these"],
+            3: [None, "things"],
+            4: [None, "is"],
+            5: [None, "not"],
+        },
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+
+def test_split_to_dataframe_with_index(any_string_dtype):
+    s = Series(
+        ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
+    )
+    result = s.str.split("_", expand=True)
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["splits", "index"]},
+        index=["preserve", "me"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+    with pytest.raises(ValueError, match="expand must be"):
+        s.str.split("_", expand="not_a_boolean")
+
+
+def test_split_to_multiindex_expand_no_splits():
+    # https://github.com/pandas-dev/pandas/issues/23677
+
+    idx = Index(["nosplit", "alsonosplit", np.nan])
+    result = idx.str.split("_", expand=True)
+    exp = idx
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 1
+
+
+def test_split_to_multiindex_expand():
+    idx = Index(["some_equal_splits", "with_no_nans", np.nan, None])
+    result = idx.str.split("_", expand=True)
+    exp = MultiIndex.from_tuples(
+        [
+            ("some", "equal", "splits"),
+            ("with", "no", "nans"),
+            [np.nan, np.nan, np.nan],
+            [None, None, None],
+        ]
+    )
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 3
+
+
+def test_split_to_multiindex_expand_unequal_splits():
+    idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None])
+    result = idx.str.split("_", expand=True)
+    exp = MultiIndex.from_tuples(
+        [
+            ("some", "unequal", "splits", np.nan, np.nan, np.nan),
+            ("one", "of", "these", "things", "is", "not"),
+            (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan),
+            (None, None, None, None, None, None),
+        ]
+    )
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 6
+
+    with pytest.raises(ValueError, match="expand must be"):
+        idx.str.split("_", expand="not_a_boolean")
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"a(?=b)", [["aa"], ["", "b"], ["ba"], ["bb"]]),
+        (r"(?<=a)b", [["aa"], ["a", ""], ["ba"], ["bb"]]),
+        (r"a(?!b)", [["", "", ""], ["ab"], ["b", ""], ["bb"]]),
+        (r"(?<!b)a", [["", "", ""], ["", "b"], ["ba"], ["bb"]]),
+        ("ab", [["aa"], ["", ""], ["ba"], ["bb"]]),
+    ],
+)
+def test_split_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.split(pat, regex=True)
+    if any_string_dtype == "object":
+        null_result = None
+    elif any_string_dtype == "str":
+        null_result = np.nan
+    elif any_string_dtype == "string":
+        null_result = pd.NA
+    else:
+        raise ValueError(f"Unrecognized dtype: {any_string_dtype}")
+    expected = Series([*expected_data, null_result])
+    tm.assert_series_equal(result, expected)
+
+
+def test_split_regex_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.split("r$", regex=True)
+    expected = Series([["baz"], ["ba", ""], ["bars"], ["ba", "\n"]], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.split(r"r\Z", regex=True)
+    expected = Series([["baz"], ["ba", ""], ["bars"], ["bar\n"]], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"bar\Z", "bar", r"bar\Zs", "bar\n"], dtype=any_string_dtype)
+    result = ser.str.split(r"r\\Z", regex=True)
+    expected = Series([["ba", ""], ["bar"], ["ba", "s"], ["bar\n"]], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_rsplit_to_dataframe_expand_no_splits(any_string_dtype):
+    s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)
+    result = s.str.rsplit("_", expand=True)
+    exp = DataFrame({0: Series(["nosplit", "alsonosplit"])}, dtype=any_string_dtype)
+    tm.assert_frame_equal(result, exp)
+
+
+def test_rsplit_to_dataframe_expand(any_string_dtype):
+    s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype)
+    result = s.str.rsplit("_", expand=True)
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+    result = s.str.rsplit("_", expand=True, n=2)
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]},
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+    result = s.str.rsplit("_", expand=True, n=1)
+    exp = DataFrame(
+        {0: ["some_equal", "with_no"], 1: ["splits", "nans"]}, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, exp)
+
+
+def test_rsplit_to_dataframe_expand_with_index(any_string_dtype):
+    s = Series(
+        ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype
+    )
+    result = s.str.rsplit("_", expand=True)
+    exp = DataFrame(
+        {0: ["some", "with"], 1: ["splits", "index"]},
+        index=["preserve", "me"],
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, exp)
+
+
+def test_rsplit_to_multiindex_expand_no_split():
+    idx = Index(["nosplit", "alsonosplit"])
+    result = idx.str.rsplit("_", expand=True)
+    exp = idx
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 1
+
+
+def test_rsplit_to_multiindex_expand():
+    idx = Index(["some_equal_splits", "with_no_nans"])
+    result = idx.str.rsplit("_", expand=True)
+    exp = MultiIndex.from_tuples([("some", "equal", "splits"), ("with", "no", "nans")])
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 3
+
+
+def test_rsplit_to_multiindex_expand_n():
+    idx = Index(["some_equal_splits", "with_no_nans"])
+    result = idx.str.rsplit("_", expand=True, n=1)
+    exp = MultiIndex.from_tuples([("some_equal", "splits"), ("with_no", "nans")])
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 2
+
+
+def test_split_nan_expand(any_string_dtype):
+    # gh-18450
+    s = Series(["foo,bar,baz", np.nan], dtype=any_string_dtype)
+    result = s.str.split(",", expand=True)
+    exp = DataFrame(
+        [["foo", "bar", "baz"], [np.nan, np.nan, np.nan]], dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, exp)
+
+    # check that these are actually np.nan/pd.NA and not None
+    # TODO see GH 18463
+    # tm.assert_frame_equal does not differentiate
+    if is_object_or_nan_string_dtype(any_string_dtype):
+        assert all(np.isnan(x) for x in result.iloc[1])
+    else:
+        assert all(x is pd.NA for x in result.iloc[1])
+
+
+def test_split_with_name_series(any_string_dtype):
+    # GH 12617
+
+    # should preserve name
+    s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
+    res = s.str.split(",")
+    exp = Series([["a", "b"], ["c", "d"]], name="xxx")
+    tm.assert_series_equal(res, exp)
+
+    res = s.str.split(",", expand=True)
+    exp = DataFrame([["a", "b"], ["c", "d"]], dtype=any_string_dtype)
+    tm.assert_frame_equal(res, exp)
+
+
+def test_split_with_name_index():
+    # GH 12617
+    idx = Index(["a,b", "c,d"], name="xxx")
+    res = idx.str.split(",")
+    exp = Index([["a", "b"], ["c", "d"]], name="xxx")
+    assert res.nlevels == 1
+    tm.assert_index_equal(res, exp)
+
+    res = idx.str.split(",", expand=True)
+    exp = MultiIndex.from_tuples([("a", "b"), ("c", "d")])
+    assert res.nlevels == 2
+    tm.assert_index_equal(res, exp)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            [
+                ("a", "__", "b__c"),
+                ("c", "__", "d__e"),
+                np.nan,
+                ("f", "__", "g__h"),
+                None,
+            ],
+        ],
+        [
+            "rpartition",
+            [
+                ("a__b", "__", "c"),
+                ("c__d", "__", "e"),
+                np.nan,
+                ("f__g", "__", "h"),
+                None,
+            ],
+        ],
+    ],
+)
+def test_partition_series_more_than_one_char(method, exp, any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/23558
+    # more than one char
+    s = Series(["a__b__c", "c__d__e", np.nan, "f__g__h", None], dtype=any_string_dtype)
+    result = getattr(s.str, method)("__", expand=False)
+    expected = Series(exp)
+    expected = _convert_na_value(s, expected)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            [("a", " ", "b c"), ("c", " ", "d e"), np.nan, ("f", " ", "g h"), None],
+        ],
+        [
+            "rpartition",
+            [("a b", " ", "c"), ("c d", " ", "e"), np.nan, ("f g", " ", "h"), None],
+        ],
+    ],
+)
+def test_partition_series_none(any_string_dtype, method, exp):
+    # https://github.com/pandas-dev/pandas/issues/23558
+    # None
+    s = Series(["a b c", "c d e", np.nan, "f g h", None], dtype=any_string_dtype)
+    result = getattr(s.str, method)(expand=False)
+    expected = Series(exp)
+    expected = _convert_na_value(s, expected)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            [("abc", "", ""), ("cde", "", ""), np.nan, ("fgh", "", ""), None],
+        ],
+        [
+            "rpartition",
+            [("", "", "abc"), ("", "", "cde"), np.nan, ("", "", "fgh"), None],
+        ],
+    ],
+)
+def test_partition_series_not_split(any_string_dtype, method, exp):
+    # https://github.com/pandas-dev/pandas/issues/23558
+    # Not split
+    s = Series(["abc", "cde", np.nan, "fgh", None], dtype=any_string_dtype)
+    result = getattr(s.str, method)("_", expand=False)
+    expected = Series(exp)
+    expected = _convert_na_value(s, expected)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            [("a", "_", "b_c"), ("c", "_", "d_e"), np.nan, ("f", "_", "g_h")],
+        ],
+        [
+            "rpartition",
+            [("a_b", "_", "c"), ("c_d", "_", "e"), np.nan, ("f_g", "_", "h")],
+        ],
+    ],
+)
+def test_partition_series_unicode(any_string_dtype, method, exp):
+    # https://github.com/pandas-dev/pandas/issues/23558
+    # unicode
+    s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
+
+    result = getattr(s.str, method)("_", expand=False)
+    expected = Series(exp)
+    expected = _convert_na_value(s, expected)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["partition", "rpartition"])
+def test_partition_series_stdlib(any_string_dtype, method):
+    # https://github.com/pandas-dev/pandas/issues/23558
+    # compare to standard lib
+    s = Series(["A_B_C", "B_C_D", "E_F_G", "EFGHEF"], dtype=any_string_dtype)
+    result = getattr(s.str, method)("_", expand=False).tolist()
+    assert result == [getattr(v, method)("_") for v in s]
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            [("a", "_", "b_c"), ("c", "_", "d_e"), ("f", "_", "g_h"), np.nan, None],
+        ],
+        [
+            "rpartition",
+            [("a_b", "_", "c"), ("c_d", "_", "e"), ("f_g", "_", "h"), np.nan, None],
+        ],
+    ],
+)
+def test_partition_index(method, exp):
+    # https://github.com/pandas-dev/pandas/issues/23558
+
+    values = Index(["a_b_c", "c_d_e", "f_g_h", np.nan, None])
+
+    result = getattr(values.str, method)("_", expand=False)
+    exp = Index(np.array(exp, dtype=object), dtype=object)
+    tm.assert_index_equal(result, exp)
+    assert result.nlevels == 1
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            {
+                0: ["a", "c", np.nan, "f", None],
+                1: ["_", "_", np.nan, "_", None],
+                2: ["b_c", "d_e", np.nan, "g_h", None],
+            },
+        ],
+        [
+            "rpartition",
+            {
+                0: ["a_b", "c_d", np.nan, "f_g", None],
+                1: ["_", "_", np.nan, "_", None],
+                2: ["c", "e", np.nan, "h", None],
+            },
+        ],
+    ],
+)
+def test_partition_to_dataframe(any_string_dtype, method, exp):
+    # https://github.com/pandas-dev/pandas/issues/23558
+
+    s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
+    result = getattr(s.str, method)("_")
+    expected = DataFrame(
+        exp,
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        [
+            "partition",
+            {
+                0: ["a", "c", np.nan, "f", None],
+                1: ["_", "_", np.nan, "_", None],
+                2: ["b_c", "d_e", np.nan, "g_h", None],
+            },
+        ],
+        [
+            "rpartition",
+            {
+                0: ["a_b", "c_d", np.nan, "f_g", None],
+                1: ["_", "_", np.nan, "_", None],
+                2: ["c", "e", np.nan, "h", None],
+            },
+        ],
+    ],
+)
+def test_partition_to_dataframe_from_series(any_string_dtype, method, exp):
+    # https://github.com/pandas-dev/pandas/issues/23558
+    s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h", None], dtype=any_string_dtype)
+    result = getattr(s.str, method)("_", expand=True)
+    expected = DataFrame(
+        exp,
+        dtype=any_string_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_partition_with_name(any_string_dtype):
+    # GH 12617
+
+    s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
+    result = s.str.partition(",")
+    expected = DataFrame(
+        {0: ["a", "c"], 1: [",", ","], 2: ["b", "d"]}, dtype=any_string_dtype
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_partition_with_name_expand(any_string_dtype):
+    # GH 12617
+    # should preserve name
+    s = Series(["a,b", "c,d"], name="xxx", dtype=any_string_dtype)
+    result = s.str.partition(",", expand=False)
+    expected = Series([("a", ",", "b"), ("c", ",", "d")], name="xxx")
+    tm.assert_series_equal(result, expected)
+
+
+def test_partition_index_with_name():
+    idx = Index(["a,b", "c,d"], name="xxx")
+    result = idx.str.partition(",")
+    expected = MultiIndex.from_tuples([("a", ",", "b"), ("c", ",", "d")])
+    assert result.nlevels == 3
+    tm.assert_index_equal(result, expected)
+
+
+def test_partition_index_with_name_expand_false():
+    idx = Index(["a,b", "c,d"], name="xxx")
+    # should preserve name
+    result = idx.str.partition(",", expand=False)
+    expected = Index(np.array([("a", ",", "b"), ("c", ",", "d")]), name="xxx")
+    assert result.nlevels == 1
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["partition", "rpartition"])
+def test_partition_sep_kwarg(any_string_dtype, method):
+    # GH 22676; depr kwarg "pat" in favor of "sep"
+    s = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
+
+    expected = getattr(s.str, method)(sep="_")
+    result = getattr(s.str, method)("_")
+    tm.assert_frame_equal(result, expected)
+
+
+def test_get():
+    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"])
+    result = ser.str.split("_").str.get(1)
+    expected = Series(["b", "d", np.nan, "g"], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_get_mixed_object():
+    ser = Series(["a_b_c", np.nan, "c_d_e", True, datetime.today(), None, 1, 2.0])
+    result = ser.str.split("_").str.get(1)
+    expected = Series(
+        ["b", np.nan, "d", np.nan, np.nan, None, np.nan, np.nan], dtype=object
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("idx", [2, -3])
+def test_get_bounds(idx):
+    ser = Series(["1_2_3_4_5", "6_7_8_9_10", "11_12"])
+    result = ser.str.split("_").str.get(idx)
+    expected = Series(["3", "8", np.nan], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "idx, exp", [[2, [3, 3, np.nan, "b"]], [-1, [3, 3, np.nan, np.nan]]]
+)
+def test_get_complex(idx, exp):
+    # GH 20671, getting value not in dict raising `KeyError`
+    ser = Series([(1, 2, 3), [1, 2, 3], {1, 2, 3}, {1: "a", 2: "b", 3: "c"}])
+
+    result = ser.str.get(idx)
+    expected = Series(exp)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("to_type", [tuple, list, np.array])
+def test_get_complex_nested(to_type):
+    ser = Series([to_type([to_type([1, 2])])])
+
+    result = ser.str.get(0)
+    expected = Series([to_type([1, 2])])
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.get(1)
+    expected = Series([np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+def test_get_strings(any_string_dtype):
+    ser = Series(["a", "ab", np.nan, "abc"], dtype=any_string_dtype)
+    result = ser.str.get(2)
+    expected = Series([np.nan, np.nan, np.nan, "c"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..c9824973e480ab958f7ebada1b98272e32c91720
--- /dev/null
+++ b/pandas/tests/strings/test_string_array.py
@@ -0,0 +1,126 @@
+import numpy as np
+import pytest
+
+from pandas._libs import lib
+
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+    _testing as tm,
+)
+
+
+def test_string_array(nullable_string_dtype, any_string_method):
+    method_name, args, kwargs = any_string_method
+
+    data = ["a", "bb", np.nan, "ccc"]
+    a = Series(data, dtype=object)
+    b = Series(data, dtype=nullable_string_dtype)
+
+    if method_name == "decode":
+        with pytest.raises(TypeError, match="a bytes-like object is required"):
+            getattr(b.str, method_name)(*args, **kwargs)
+        return
+
+    expected = getattr(a.str, method_name)(*args, **kwargs)
+    result = getattr(b.str, method_name)(*args, **kwargs)
+
+    if isinstance(expected, Series):
+        if expected.dtype == "object" and lib.is_string_array(
+            expected.dropna().values,
+        ):
+            assert result.dtype == nullable_string_dtype
+            result = result.astype(object)
+
+        elif expected.dtype == "object" and lib.is_bool_array(
+            expected.values, skipna=True
+        ):
+            assert result.dtype == "boolean"
+            expected = expected.astype("boolean")
+
+        elif expected.dtype == "bool":
+            assert result.dtype == "boolean"
+            result = result.astype("bool")
+
+        elif expected.dtype == "float" and expected.isna().any():
+            assert result.dtype == "Int64"
+            result = result.astype("float")
+
+        if expected.dtype == object:
+            # GH#18463
+            expected[expected.isna()] = NA
+
+    elif isinstance(expected, DataFrame):
+        columns = expected.select_dtypes(include="object").columns
+        assert all(result[columns].dtypes == nullable_string_dtype)
+        result[columns] = result[columns].astype(object)
+        expected[columns] = expected[columns].fillna(NA)  # GH#18463
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,expected",
+    [
+        ("count", [2, None]),
+        ("find", [0, None]),
+        ("index", [0, None]),
+        ("rindex", [2, None]),
+    ],
+)
+def test_string_array_numeric_integer_array(nullable_string_dtype, method, expected):
+    s = Series(["aba", None], dtype=nullable_string_dtype)
+    result = getattr(s.str, method)("a")
+    expected = Series(expected, dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,expected",
+    [
+        ("isdigit", [False, None, True]),
+        ("isalpha", [True, None, False]),
+        ("isascii", [True, None, True]),
+        ("isalnum", [True, None, True]),
+        ("isnumeric", [False, None, True]),
+    ],
+)
+def test_string_array_boolean_array(nullable_string_dtype, method, expected):
+    s = Series(["a", None, "1"], dtype=nullable_string_dtype)
+    result = getattr(s.str, method)()
+    expected = Series(expected, dtype="boolean")
+    tm.assert_series_equal(result, expected)
+
+
+def test_string_array_extract(nullable_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/30969
+    # Only expand=False & multiple groups was failing
+
+    a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype)
+    b = Series(["a1", "b2", "cc"], dtype="object")
+    pat = r"(\w)(\d)"
+
+    result = a.str.extract(pat, expand=False)
+    expected = b.str.extract(pat, expand=False)
+    expected = expected.fillna(NA)  # GH#18463
+    assert all(result.dtypes == nullable_string_dtype)
+
+    result = result.astype(object)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values, width, expected",
+    [
+        (["a", "ab", "abc", None], 4, ["000a", "00ab", "0abc", None]),
+        (["1", "-1", "+1", None], 4, ["0001", "-001", "+001", None]),
+        (["1234", "-1234"], 3, ["1234", "-1234"]),
+    ],
+)
+def test_string_array_zfill(nullable_string_dtype, values, width, expected):
+    # GH #61485
+    s = Series(values, dtype=nullable_string_dtype)
+    result = s.str.zfill(width)
+    expected = Series(expected, dtype=nullable_string_dtype)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py
new file mode 100644
index 0000000000000000000000000000000000000000..37b970578a6d743d7972e8b5a0909060c592b25d
--- /dev/null
+++ b/pandas/tests/strings/test_strings.py
@@ -0,0 +1,952 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+from pandas.compat import pa_version_under21p0
+
+from pandas import (
+    NA,
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    StringDtype,
+    option_context,
+)
+import pandas._testing as tm
+from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
+from pandas.core.strings.accessor import StringMethods
+from pandas.tests.strings import is_object_or_nan_string_dtype
+
+
+@pytest.mark.parametrize("pattern", [0, True, Series(["foo", "bar"])])
+def test_startswith_endswith_non_str_patterns(pattern):
+    # GH3485
+    ser = Series(["foo", "bar"])
+    msg = f"expected a string or tuple, not {type(pattern).__name__}"
+    with pytest.raises(TypeError, match=msg):
+        ser.str.startswith(pattern)
+    with pytest.raises(TypeError, match=msg):
+        ser.str.endswith(pattern)
+
+
+def test_iter_raises():
+    # GH 54173
+    ser = Series(["foo", "bar"])
+    with pytest.raises(TypeError, match="'StringMethods' object is not iterable"):
+        iter(ser.str)
+
+
+# test integer/float dtypes (inferred by constructor) and mixed
+
+
+def test_count(any_string_dtype):
+    ser = Series(["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=any_string_dtype)
+    result = ser.str.count("f[o]+")
+    if is_object_or_nan_string_dtype(any_string_dtype):
+        expected_dtype = np.float64
+        item = np.nan
+    else:
+        expected_dtype = "Int64"
+        item = NA
+
+    expected = Series([1, 2, item, 4], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_count_mixed_object():
+    ser = Series(
+        ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0],
+        dtype=object,
+    )
+    result = ser.str.count("a")
+    expected = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected_data",
+    [
+        (r"a(?=b)", [0, 1, 0, 0, None]),
+        (r"(?<=a)b", [0, 1, 0, 0, None]),
+        (r"a(?!b)", [2, 0, 1, 0, None]),
+        (r"(?<!b)a", [2, 1, 0, 0, None]),
+        ("ab", [0, 1, 0, 0, None]),
+    ],
+)
+def test_count_lookarounds(any_string_dtype, pat, expected_data):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    expected_dtype = (
+        "float64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
+    )
+    ser = Series(["aa", "ab", "ba", "bb", None], dtype=any_string_dtype)
+    result = ser.str.count(pat)
+    expected = Series(expected_data, dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_count_end_of_string(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/63613
+    expected_dtype = (
+        "int64" if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
+    )
+
+    ser = Series(["baz", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+
+    # with dollar sign
+    result = ser.str.count("bar$")
+    if any_string_dtype == "string" and any_string_dtype.storage == "pyarrow":
+        # pyarrow (RE2) only matches $ at the very end of the line
+        expected = Series([0, 1, 0, 0], dtype=expected_dtype)
+    else:
+        # python matches $ before or after an ending newline
+        expected = Series([0, 1, 0, 1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # with \Z (ensure this is translated to \z for pyarrow)
+    result = ser.str.count(r"bar\Z")
+    expected = Series([0, 1, 0, 0], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+    # ensure finding a literal \Z still works
+    ser = Series([r"bar\Z", "bar", "bars", "bar\n"], dtype=any_string_dtype)
+    result = ser.str.count(r"bar\\Z")
+    expected = Series([1, 0, 0, 0], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_repeat(any_string_dtype):
+    ser = Series(["a", "b", np.nan, "c", np.nan, "d"], dtype=any_string_dtype)
+
+    result = ser.str.repeat(3)
+    expected = Series(
+        ["aaa", "bbb", np.nan, "ccc", np.nan, "ddd"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.repeat([1, 2, 3, 4, 5, 6])
+    expected = Series(
+        ["a", "bb", np.nan, "cccc", np.nan, "dddddd"], dtype=any_string_dtype
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_repeat_mixed_object():
+    ser = Series(["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0])
+    result = ser.str.repeat(3)
+    expected = Series(
+        ["aaa", np.nan, "bbb", np.nan, np.nan, "foofoofoo", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("arg, repeat", [[None, 4], ["b", None]])
+def test_repeat_with_null(any_string_dtype, arg, repeat):
+    # GH: 31632
+    ser = Series(["a", arg], dtype=any_string_dtype)
+    result = ser.str.repeat([3, repeat])
+    expected = Series(["aaa", None], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_empty_str_methods(any_string_dtype):
+    empty_str = empty = Series(dtype=any_string_dtype)
+    empty_inferred_str = Series(dtype="str")
+    if is_object_or_nan_string_dtype(any_string_dtype):
+        empty_int = Series(dtype="int64")
+        empty_bool = Series(dtype=bool)
+    else:
+        empty_int = Series(dtype="Int64")
+        empty_bool = Series(dtype="boolean")
+    empty_object = Series(dtype=object)
+    empty_bytes = Series(dtype=object)
+    empty_df = DataFrame()
+
+    # GH7241
+    # (extract) on empty series
+
+    tm.assert_series_equal(empty_str, empty.str.cat(empty))
+    assert "" == empty.str.cat()
+    tm.assert_series_equal(empty_str, empty.str.title())
+    tm.assert_series_equal(empty_int, empty.str.count("a"))
+    tm.assert_series_equal(empty_bool, empty.str.contains("a"))
+    tm.assert_series_equal(empty_bool, empty.str.startswith("a"))
+    tm.assert_series_equal(empty_bool, empty.str.endswith("a"))
+    tm.assert_series_equal(empty_str, empty.str.lower())
+    tm.assert_series_equal(empty_str, empty.str.upper())
+    tm.assert_series_equal(empty_str, empty.str.replace("a", "b"))
+    tm.assert_series_equal(empty_str, empty.str.repeat(3))
+    tm.assert_series_equal(empty_bool, empty.str.match("^a"))
+    tm.assert_frame_equal(
+        DataFrame(columns=range(1), dtype=any_string_dtype),
+        empty.str.extract("()", expand=True),
+    )
+    tm.assert_frame_equal(
+        DataFrame(columns=range(2), dtype=any_string_dtype),
+        empty.str.extract("()()", expand=True),
+    )
+    tm.assert_series_equal(empty_str, empty.str.extract("()", expand=False))
+    tm.assert_frame_equal(
+        DataFrame(columns=range(2), dtype=any_string_dtype),
+        empty.str.extract("()()", expand=False),
+    )
+    tm.assert_frame_equal(empty_df.set_axis([], axis=1), empty.str.get_dummies())
+    tm.assert_series_equal(empty_str, empty_str.str.join(""))
+    tm.assert_series_equal(empty_int, empty.str.len())
+    tm.assert_series_equal(empty_object, empty_str.str.findall("a"))
+    tm.assert_series_equal(empty_int, empty.str.find("a"))
+    tm.assert_series_equal(empty_int, empty.str.rfind("a"))
+    tm.assert_series_equal(empty_str, empty.str.pad(42))
+    tm.assert_series_equal(empty_str, empty.str.center(42))
+    tm.assert_series_equal(empty_object, empty.str.split("a"))
+    tm.assert_series_equal(empty_object, empty.str.rsplit("a"))
+    tm.assert_series_equal(empty_object, empty.str.partition("a", expand=False))
+    tm.assert_frame_equal(empty_df, empty.str.partition("a"))
+    tm.assert_series_equal(empty_object, empty.str.rpartition("a", expand=False))
+    tm.assert_frame_equal(empty_df, empty.str.rpartition("a"))
+    tm.assert_series_equal(empty_str, empty.str.slice(stop=1))
+    tm.assert_series_equal(empty_str, empty.str.slice(step=1))
+    tm.assert_series_equal(empty_str, empty.str.strip())
+    tm.assert_series_equal(empty_str, empty.str.lstrip())
+    tm.assert_series_equal(empty_str, empty.str.rstrip())
+    tm.assert_series_equal(empty_str, empty.str.wrap(42))
+    tm.assert_series_equal(empty_str, empty.str.get(0))
+    tm.assert_series_equal(empty_inferred_str, empty_bytes.str.decode("ascii"))
+    tm.assert_series_equal(empty_bytes, empty.str.encode("ascii"))
+    # ismethods should always return boolean (GH 29624)
+    tm.assert_series_equal(empty_bool, empty.str.isalnum())
+    tm.assert_series_equal(empty_bool, empty.str.isalpha())
+    tm.assert_series_equal(empty_bool, empty.str.isascii())
+    tm.assert_series_equal(empty_bool, empty.str.isdigit())
+    tm.assert_series_equal(empty_bool, empty.str.isspace())
+    tm.assert_series_equal(empty_bool, empty.str.islower())
+    tm.assert_series_equal(empty_bool, empty.str.isupper())
+    tm.assert_series_equal(empty_bool, empty.str.istitle())
+    tm.assert_series_equal(empty_bool, empty.str.isnumeric())
+    tm.assert_series_equal(empty_bool, empty.str.isdecimal())
+    tm.assert_series_equal(empty_str, empty.str.capitalize())
+    tm.assert_series_equal(empty_str, empty.str.swapcase())
+    tm.assert_series_equal(empty_str, empty.str.normalize("NFC"))
+
+    table = str.maketrans("a", "b")
+    tm.assert_series_equal(empty_str, empty.str.translate(table))
+
+
+@pytest.mark.parametrize(
+    "method, expected",
+    [
+        ("isascii", [True, True, True, True, True, True, True, True, True, True]),
+        ("isalnum", [True, True, True, True, True, False, True, True, False, False]),
+        ("isalpha", [True, True, True, False, False, False, True, False, False, False]),
+        (
+            "isdigit",
+            [False, False, False, True, False, False, False, True, False, False],
+        ),
+        (
+            "isnumeric",
+            [False, False, False, True, False, False, False, True, False, False],
+        ),
+        (
+            "isspace",
+            [False, False, False, False, False, False, False, False, False, True],
+        ),
+        (
+            "islower",
+            [False, True, False, False, False, False, False, False, False, False],
+        ),
+        (
+            "isupper",
+            [True, False, False, False, True, False, True, False, False, False],
+        ),
+        (
+            "istitle",
+            [True, False, True, False, True, False, False, False, False, False],
+        ),
+    ],
+)
+def test_ismethods(method, expected, any_string_dtype):
+    ser = Series(
+        ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", "  "], dtype=any_string_dtype
+    )
+    expected_dtype = (
+        "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series(expected, dtype=expected_dtype)
+    result = getattr(ser.str, method)()
+    tm.assert_series_equal(result, expected)
+
+    # compare with standard library
+    expected_stdlib = [getattr(item, method)() for item in ser]
+    assert list(result) == expected_stdlib
+
+    # with missing value
+    ser.iloc[[1, 2, 3, 4]] = np.nan
+    result = getattr(ser.str, method)()
+    if ser.dtype == "object":
+        expected = expected.astype(object)
+        expected.iloc[[1, 2, 3, 4]] = np.nan
+    elif ser.dtype == "str":
+        # NaN propagates as False
+        expected.iloc[[1, 2, 3, 4]] = False
+    else:
+        # nullable dtypes propagate NaN
+        expected.iloc[[1, 2, 3, 4]] = np.nan
+
+
+@pytest.mark.parametrize(
+    "method, expected",
+    [
+        ("isnumeric", [False, True, True, True, False, True, True, False]),
+        ("isdecimal", [False, True, False, False, False, False, True, False]),
+        ("isdigit", [False, True, True, False, False, False, True, False]),
+    ],
+)
+def test_isnumeric_unicode(method, expected, any_string_dtype):
+    # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
+    # 0x2605: ★ not number
+    # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
+    # 0xFF13: ３ Em 3  # noqa: RUF003
+    ser = Series(
+        ["A", "3", "³", "¼", "★", "፸", "３", "four"],  # noqa: RUF001
+        dtype=any_string_dtype,
+    )
+    expected_dtype = (
+        "bool" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+    )
+    expected = Series(expected, dtype=expected_dtype)
+    if (
+        method == "isdigit"
+        and isinstance(ser.dtype, StringDtype)
+        and ser.dtype.storage == "pyarrow"
+        and not pa_version_under21p0
+    ):
+        # known difference in behavior between python and pyarrow unicode handling
+        # pyarrow 21+ considers ¼ and ፸ as a digit, while python does not
+        expected.iloc[3] = True
+        expected.iloc[5] = True
+
+    result = getattr(ser.str, method)()
+    tm.assert_series_equal(result, expected)
+
+    # compare with standard library
+    # (only for non-pyarrow storage given the above differences)
+    if any_string_dtype == "object" or (
+        isinstance(any_string_dtype, StringDtype)
+        and any_string_dtype.storage == "python"
+    ):
+        expected = [getattr(item, method)() for item in ser]
+        assert list(result) == expected
+
+
+@pytest.mark.parametrize(
+    "method, expected",
+    [
+        ("isnumeric", [False, np.nan, True, False, np.nan, True, False]),
+        ("isdecimal", [False, np.nan, False, False, np.nan, True, False]),
+    ],
+)
+def test_isnumeric_unicode_missing(method, expected, any_string_dtype):
+    values = ["A", np.nan, "¼", "★", np.nan, "３", "four"]  # noqa: RUF001
+    ser = Series(values, dtype=any_string_dtype)
+    if any_string_dtype == "str":
+        # NaN propagates as False
+        expected = Series(expected, dtype=object).fillna(False).astype(bool)
+    else:
+        expected_dtype = (
+            "object" if is_object_or_nan_string_dtype(any_string_dtype) else "boolean"
+        )
+        expected = Series(expected, dtype=expected_dtype)
+    result = getattr(ser.str, method)()
+    tm.assert_series_equal(result, expected)
+
+
+def test_split_join_roundtrip(any_string_dtype):
+    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
+    result = ser.str.split("_").str.join("_")
+    expected = ser.astype(object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_split_join_roundtrip_mixed_object():
+    ser = Series(
+        ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
+    )
+    result = ser.str.split("_").str.join("_")
+    expected = Series(
+        ["a_b", np.nan, "asdf_cas_asdf", np.nan, np.nan, "foo", None, np.nan, np.nan],
+        dtype=object,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_len(any_string_dtype):
+    ser = Series(
+        ["foo", "fooo", "fooooo", np.nan, "fooooooo", "foo\n", "あ"],
+        dtype=any_string_dtype,
+    )
+    result = ser.str.len()
+    if is_object_or_nan_string_dtype(any_string_dtype):
+        expected_dtype = "float64"
+        item = np.nan
+    else:
+        expected_dtype = "Int64"
+        item = NA
+    expected = Series([3, 4, 6, item, 8, 4, 1], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_len_mixed():
+    ser = Series(
+        ["a_b", np.nan, "asdf_cas_asdf", True, datetime.today(), "foo", None, 1, 2.0]
+    )
+    result = ser.str.len()
+    expected = Series([3, np.nan, 13, np.nan, np.nan, 3, np.nan, np.nan, np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,sub,start,end,expected",
+    [
+        ("index", "EF", None, None, [4, 3, 1, 0]),
+        ("rindex", "EF", None, None, [4, 5, 7, 4]),
+        ("index", "EF", 3, None, [4, 3, 7, 4]),
+        ("rindex", "EF", 3, None, [4, 5, 7, 4]),
+        ("index", "E", 4, 8, [4, 5, 7, 4]),
+        ("rindex", "E", 0, 5, [4, 3, 1, 4]),
+    ],
+)
+def test_index(method, sub, start, end, index_or_series, any_string_dtype, expected):
+    obj = index_or_series(
+        ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
+    )
+    expected_dtype = (
+        np.int64 if is_object_or_nan_string_dtype(any_string_dtype) else "Int64"
+    )
+    expected = index_or_series(expected, dtype=expected_dtype)
+
+    result = getattr(obj.str, method)(sub, start, end)
+
+    if index_or_series is Series:
+        tm.assert_series_equal(result, expected)
+    else:
+        tm.assert_index_equal(result, expected)
+
+    # compare with standard library
+    expected = [getattr(item, method)(sub, start, end) for item in obj]
+    assert list(result) == expected
+
+
+def test_index_not_found_raises(index_or_series, any_string_dtype):
+    obj = index_or_series(
+        ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"], dtype=any_string_dtype
+    )
+    with pytest.raises(ValueError, match="substring not found"):
+        obj.str.index("DE")
+
+
+@pytest.mark.parametrize("method", ["index", "rindex"])
+def test_index_wrong_type_raises(index_or_series, any_string_dtype, method):
+    obj = index_or_series([], dtype=any_string_dtype)
+    msg = "expected a string object, not int"
+
+    with pytest.raises(TypeError, match=msg):
+        getattr(obj.str, method)(0)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        ["index", [1, 1, 0]],
+        ["rindex", [3, 1, 2]],
+    ],
+)
+def test_index_missing(any_string_dtype, method, exp):
+    ser = Series(["abcb", "ab", "bcbe", np.nan], dtype=any_string_dtype)
+    if is_object_or_nan_string_dtype(any_string_dtype):
+        expected_dtype = np.float64
+        item = np.nan
+    else:
+        expected_dtype = "Int64"
+        item = NA
+
+    result = getattr(ser.str, method)("b")
+    expected = Series([*exp, item], dtype=expected_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_pipe_failures(any_string_dtype):
+    # #2119
+    ser = Series(["A|B|C"], dtype=any_string_dtype)
+
+    result = ser.str.split("|")
+    expected = Series([["A", "B", "C"]], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str.replace("|", " ", regex=False)
+    expected = Series(["A B C"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start, stop, step, expected",
+    [
+        (2, 5, None, ["foo", "bar", np.nan, "baz"]),
+        (0, 3, -1, ["", "", np.nan, ""]),
+        (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]),
+        (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]),
+        (3, 10, 2, ["oto", "ato", np.nan, "aqx"]),
+        (3, 0, -1, ["ofa", "aba", np.nan, "aba"]),
+    ],
+)
+def test_slice(start, stop, step, expected, any_string_dtype):
+    ser = Series(["aafootwo", "aabartwo", np.nan, "aabazqux"], dtype=any_string_dtype)
+    result = ser.str.slice(start, stop, step)
+    expected = Series(expected, dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start, stop, step, expected",
+    [
+        (2, 5, None, ["foo", np.nan, "bar", np.nan, np.nan, None, np.nan, np.nan]),
+        (4, 1, -1, ["oof", np.nan, "rab", np.nan, np.nan, None, np.nan, np.nan]),
+    ],
+)
+def test_slice_mixed_object(start, stop, step, expected):
+    ser = Series(["aafootwo", np.nan, "aabartwo", True, datetime.today(), None, 1, 2.0])
+    result = ser.str.slice(start, stop, step)
+    expected = Series(expected, dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "start,stop,repl,expected",
+    [
+        (2, 3, None, ["shrt", "a it longer", "evnlongerthanthat", "", np.nan]),
+        (2, 3, "z", ["shzrt", "a zit longer", "evznlongerthanthat", "z", np.nan]),
+        (2, 2, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
+        (2, 1, "z", ["shzort", "a zbit longer", "evzenlongerthanthat", "z", np.nan]),
+        (-1, None, "z", ["shorz", "a bit longez", "evenlongerthanthaz", "z", np.nan]),
+        (None, -2, "z", ["zrt", "zer", "zat", "z", np.nan]),
+        (6, 8, "z", ["shortz", "a bit znger", "evenlozerthanthat", "z", np.nan]),
+        (-10, 3, "z", ["zrt", "a zit longer", "evenlongzerthanthat", "z", np.nan]),
+    ],
+)
+def test_slice_replace(start, stop, repl, expected, any_string_dtype):
+    ser = Series(
+        ["short", "a bit longer", "evenlongerthanthat", "", np.nan],
+        dtype=any_string_dtype,
+    )
+    expected = Series(expected, dtype=any_string_dtype)
+    result = ser.str.slice_replace(start, stop, repl)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        ["strip", ["aa", "bb", np.nan, "cc"]],
+        ["lstrip", ["aa   ", "bb \n", np.nan, "cc  "]],
+        ["rstrip", ["  aa", " bb", np.nan, "cc"]],
+    ],
+)
+def test_strip_lstrip_rstrip(any_string_dtype, method, exp):
+    ser = Series(["  aa   ", " bb \n", np.nan, "cc  "], dtype=any_string_dtype)
+
+    result = getattr(ser.str, method)()
+    expected = Series(exp, dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        ["strip", ["aa", np.nan, "bb"]],
+        ["lstrip", ["aa  ", np.nan, "bb \t\n"]],
+        ["rstrip", ["  aa", np.nan, " bb"]],
+    ],
+)
+def test_strip_lstrip_rstrip_mixed_object(method, exp):
+    ser = Series(["  aa  ", np.nan, " bb \t\n", True, datetime.today(), None, 1, 2.0])
+
+    result = getattr(ser.str, method)()
+    expected = Series([*exp, np.nan, np.nan, None, np.nan, np.nan], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, exp",
+    [
+        ["strip", ["ABC", " BNSD", "LDFJH "]],
+        ["lstrip", ["ABCxx", " BNSD", "LDFJH xx"]],
+        ["rstrip", ["xxABC", "xx BNSD", "LDFJH "]],
+    ],
+)
+def test_strip_lstrip_rstrip_args(any_string_dtype, method, exp):
+    ser = Series(["xxABCxx", "xx BNSD", "LDFJH xx"], dtype=any_string_dtype)
+
+    result = getattr(ser.str, method)("x")
+    expected = Series(exp, dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "prefix, expected",
+    [
+        ("a", ["b", " b c", "bc"]),
+        ("ab", ["", "a b c", "bc"]),
+        ("", ["ab", "a b c", "bc"]),
+    ],
+)
+def test_removeprefix(any_string_dtype, prefix, expected):
+    ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
+    result = ser.str.removeprefix(prefix)
+    ser_expected = Series(expected, dtype=any_string_dtype)
+    tm.assert_series_equal(result, ser_expected)
+
+
+@pytest.mark.parametrize(
+    "suffix, expected",
+    [
+        ("c", ["ab", "a b ", "b"]),
+        ("bc", ["ab", "a b c", ""]),
+        ("", ["ab", "a b c", "bc"]),
+    ],
+)
+def test_removesuffix(any_string_dtype, suffix, expected):
+    ser = Series(["ab", "a b c", "bc"], dtype=any_string_dtype)
+    result = ser.str.removesuffix(suffix)
+    ser_expected = Series(expected, dtype=any_string_dtype)
+    tm.assert_series_equal(result, ser_expected)
+
+
+def test_string_slice_get_syntax(any_string_dtype):
+    ser = Series(
+        ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", np.nan, "CYYYBYYY", "dog", "cYYYt"],
+        dtype=any_string_dtype,
+    )
+
+    result = ser.str[0]
+    expected = ser.str.get(0)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str[:3]
+    expected = ser.str.slice(stop=3)
+    tm.assert_series_equal(result, expected)
+
+    result = ser.str[2::-1]
+    expected = ser.str.slice(start=2, step=-1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_string_slice_out_of_bounds_nested():
+    ser = Series([(1, 2), (1,), (3, 4, 5)])
+    result = ser.str[1]
+    expected = Series([2, np.nan, 4])
+    tm.assert_series_equal(result, expected)
+
+
+def test_string_slice_out_of_bounds(any_string_dtype):
+    ser = Series(["foo", "b", "ba"], dtype=any_string_dtype)
+    result = ser.str[1]
+    expected = Series(["o", np.nan, "a"], dtype=any_string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_encode_decode(any_string_dtype):
+    ser = Series(["a", "b", "a\xe4"], dtype=any_string_dtype).str.encode("utf-8")
+    result = ser.str.decode("utf-8")
+    expected = Series(["a", "b", "a\xe4"], dtype="str")
+    tm.assert_series_equal(result, expected)
+
+
+def test_encode_errors_kwarg(any_string_dtype):
+    ser = Series(["a", "b", "a\x9d"], dtype=any_string_dtype)
+
+    msg = (
+        r"'charmap' codec can't encode character '\\x9d' in position 1: "
+        "character maps to <undefined>"
+    )
+    with pytest.raises(UnicodeEncodeError, match=msg):
+        ser.str.encode("cp1252")
+
+    result = ser.str.encode("cp1252", "ignore")
+    expected = ser.map(lambda x: x.encode("cp1252", "ignore"))
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_errors_kwarg():
+    ser = Series([b"a", b"b", b"a\x9d"])
+
+    msg = (
+        "'charmap' codec can't decode byte 0x9d in position 1: "
+        "character maps to <undefined>"
+    )
+    with pytest.raises(UnicodeDecodeError, match=msg):
+        ser.str.decode("cp1252")
+
+    result = ser.str.decode("cp1252", "ignore")
+    expected = ser.map(lambda x: x.decode("cp1252", "ignore")).astype("str")
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_string_dtype(string_dtype):
+    # https://github.com/pandas-dev/pandas/pull/60940
+    ser = Series([b"a", b"b"])
+    result = ser.str.decode("utf-8", dtype=string_dtype)
+    expected = Series(["a", "b"], dtype=string_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_object_dtype(object_dtype):
+    # https://github.com/pandas-dev/pandas/pull/60940
+    ser = Series([b"a", rb"\ud800"])
+    result = ser.str.decode("utf-8", dtype=object_dtype)
+    expected = Series(["a", r"\ud800"], dtype=object_dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_bad_dtype():
+    # https://github.com/pandas-dev/pandas/pull/60940
+    ser = Series([b"a", b"b"])
+    msg = "dtype must be string or object, got dtype='int64'"
+    with pytest.raises(ValueError, match=msg):
+        ser.str.decode("utf-8", dtype="int64")
+
+
+@pytest.mark.parametrize(
+    "form, expected",
+    [
+        ("NFKC", ["ABC", "ABC", "123", np.nan, "アイエ"]),
+        ("NFC", ["ABC", "ＡＢＣ", "１２３", np.nan, "ｱｲｴ"]),  # noqa: RUF001
+    ],
+)
+def test_normalize(form, expected, any_string_dtype):
+    ser = Series(
+        ["ABC", "ＡＢＣ", "１２３", np.nan, "ｱｲｴ"],  # noqa: RUF001
+        index=["a", "b", "c", "d", "e"],
+        dtype=any_string_dtype,
+    )
+    expected = Series(expected, index=["a", "b", "c", "d", "e"], dtype=any_string_dtype)
+    result = ser.str.normalize(form)
+    tm.assert_series_equal(result, expected)
+
+
+def test_normalize_bad_arg_raises(any_string_dtype):
+    ser = Series(
+        ["ABC", "ＡＢＣ", "１２３", np.nan, "ｱｲｴ"],  # noqa: RUF001
+        index=["a", "b", "c", "d", "e"],
+        dtype=any_string_dtype,
+    )
+    with pytest.raises(ValueError, match="invalid normalization form"):
+        ser.str.normalize("xxx")
+
+
+def test_normalize_index():
+    idx = Index(["ＡＢＣ", "１２３", "ｱｲｴ"])  # noqa: RUF001
+    expected = Index(["ABC", "123", "アイエ"])
+    result = idx.str.normalize("NFKC")
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,inferred_type",
+    [
+        (["a", "b"], "string"),
+        (["a", "b", 1], "mixed-integer"),
+        (["a", "b", 1.3], "mixed"),
+        (["a", "b", 1.3, 1], "mixed-integer"),
+        (["aa", datetime(2011, 1, 1)], "mixed"),
+    ],
+)
+def test_index_str_accessor_visibility(values, inferred_type, index_or_series):
+    obj = index_or_series(values)
+    if index_or_series is Index:
+        assert obj.inferred_type == inferred_type
+
+    assert isinstance(obj.str, StringMethods)
+
+
+@pytest.mark.parametrize(
+    "values,inferred_type",
+    [
+        ([1, np.nan], "floating"),
+        ([datetime(2011, 1, 1)], "datetime64"),
+        ([timedelta(1)], "timedelta64"),
+    ],
+)
+def test_index_str_accessor_non_string_values_raises(
+    values, inferred_type, index_or_series
+):
+    obj = index_or_series(values)
+    if index_or_series is Index:
+        assert obj.inferred_type == inferred_type
+
+    msg = "Can only use .str accessor with string values"
+    with pytest.raises(AttributeError, match=msg):
+        obj.str
+
+
+def test_index_str_accessor_multiindex_raises():
+    # MultiIndex has mixed dtype, but not allow to use accessor
+    idx = MultiIndex.from_tuples([("a", "b"), ("a", "b")])
+    assert idx.inferred_type == "mixed"
+
+    msg = "Can only use .str accessor with Index, not MultiIndex"
+    with pytest.raises(AttributeError, match=msg):
+        idx.str
+
+
+def test_str_accessor_no_new_attributes(any_string_dtype):
+    # https://github.com/pandas-dev/pandas/issues/10673
+    ser = Series(list("aabbcde"), dtype=any_string_dtype)
+    with pytest.raises(AttributeError, match="You cannot add any new attribute"):
+        ser.str.xlabel = "a"
+
+
+def test_cat_on_bytes_raises():
+    lhs = Series(np.array(list("abc"), "S1").astype(object))
+    rhs = Series(np.array(list("def"), "S1").astype(object))
+    msg = "Cannot use .str.cat with values of inferred dtype 'bytes'"
+    with pytest.raises(TypeError, match=msg):
+        lhs.str.cat(rhs)
+
+
+def test_str_accessor_in_apply_func():
+    # https://github.com/pandas-dev/pandas/issues/38979
+    df = DataFrame(zip("abc", "def", strict=True))
+    expected = Series(["A/D", "B/E", "C/F"])
+    result = df.apply(lambda f: "/".join(f.str.upper()), axis=1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_zfill():
+    # https://github.com/pandas-dev/pandas/issues/20868
+    value = Series(["-1", "1", "1000", 10, np.nan])
+    expected = Series(["-01", "001", "1000", np.nan, np.nan], dtype=object)
+    tm.assert_series_equal(value.str.zfill(3), expected)
+
+    value = Series(["-2", "+5"])
+    expected = Series(["-0002", "+0005"])
+    tm.assert_series_equal(value.str.zfill(5), expected)
+
+
+def test_zfill_with_non_integer_argument():
+    value = Series(["-2", "+5"])
+    wid = "a"
+    msg = f"width must be of integer type, not {type(wid).__name__}"
+    with pytest.raises(TypeError, match=msg):
+        value.str.zfill(wid)
+
+
+def test_zfill_with_leading_sign():
+    value = Series(["-cat", "-1", "+dog"])
+    expected = Series(["-0cat", "-0001", "+0dog"])
+    tm.assert_series_equal(value.str.zfill(5), expected)
+
+
+def test_get_with_dict_label():
+    # GH47911
+    s = Series(
+        [
+            {"name": "Hello", "value": "World"},
+            {"name": "Goodbye", "value": "Planet"},
+            {"value": "Sea"},
+        ]
+    )
+    result = s.str.get("name")
+    expected = Series(["Hello", "Goodbye", None], dtype=object)
+    tm.assert_series_equal(result, expected)
+    result = s.str.get("value")
+    expected = Series(["World", "Planet", "Sea"], dtype=object)
+    tm.assert_series_equal(result, expected)
+
+
+def test_series_str_decode():
+    # GH 22613
+    result = Series([b"x", b"y"]).str.decode(encoding="UTF-8", errors="strict")
+    expected = Series(["x", "y"], dtype="str")
+    tm.assert_series_equal(result, expected)
+
+
+def test_decode_with_dtype_none():
+    with option_context("future.infer_string", True):
+        ser = Series([b"a", b"b", b"c"])
+        result = ser.str.decode("utf-8", dtype=None)
+        expected = Series(["a", "b", "c"], dtype="str")
+        tm.assert_series_equal(result, expected)
+
+
+def test_setitem_with_different_string_storage():
+    # GH#52987
+    # Test setitem with values from different string storage type
+    pytest.importorskip("pyarrow")
+
+    # Test Series[string[python]].__setitem__(Series[string[pyarrow]])
+    ser_python = Series(range(5), dtype="string[python]")
+    ser_pyarrow = ser_python.astype("string[pyarrow]")
+
+    ser_python[:2] = ser_pyarrow[:2]
+    expected = Series(["0", "1", "2", "3", "4"], dtype="string[python]")
+    tm.assert_series_equal(ser_python, expected)
+
+    # Test Series[string[pyarrow]].__setitem__(Series[string[python]])
+    ser_pyarrow = Series(range(5), dtype="string[pyarrow]")
+    ser_python = ser_pyarrow.astype("string[python]")
+
+    ser_pyarrow[:2] = ser_python[:2]
+    expected = Series(["0", "1", "2", "3", "4"], dtype="string[pyarrow]")
+    tm.assert_series_equal(ser_pyarrow, expected)
+
+    # Test with slice and missing values
+    ser_python = Series(["a", "b", None, "d", "e"], dtype="string[python]")
+    ser_pyarrow = Series(["X", "Y", None], dtype="string[pyarrow]")
+
+    ser_python[1:4] = ser_pyarrow
+    expected = Series(["a", "X", "Y", NA, "e"], dtype="string[python]")
+    tm.assert_series_equal(ser_python, expected)
+
+
+@pytest.mark.parametrize(
+    "pat, expected",
+    [
+        # lookaround assertions
+        (r"(?=abc)", True),
+        (r"(?<=123)", True),
+        (r"(?!xyz)", True),
+        (r"(?<!\d)", True),
+        (r"(?=a|b)(?<=c)", True),
+        (r"abc", False),
+        (r"\d+", False),
+        (r"(abc)", False),
+        (r"a|b", False),
+        (r"a*", False),
+        (r"", False),
+        (r"\\(?=abc)", True),
+        (r"(?=.*[A-Z])", True),
+        (r"a(?=)", True),
+        (r"(?![0-9])", True),
+        (r"(?=(?!nested))", True),
+        (r"test\(\?\=ing\)", False),
+        (r"[(?=)]", False),
+        (r"(?#(?=comment)", False),
+        (r"(test # (?=comment))", True),
+        (r"(?=test)+", False),
+        (r"(?=test)*", False),
+        (r"(?=test)?", False),
+        (r"abc|(?=test)", True),
+        (r"^(?=test)$", True),
+        # backreferences
+        (r"(abc)\1", True),
+        (r"\b(\w+)\s+\1\b", True),
+        (r"\b(?P<word>\w+)\s+(?P=word)\b", True),
+    ],
+)
+def test_has_regex_unsupported_code(pat, expected):
+    # https://github.com/pandas-dev/pandas/issues/60833
+    assert ArrowStringArrayMixin._has_unsupported_regex(pat) == expected
diff --git a/pandas/tests/test_aggregation.py b/pandas/tests/test_aggregation.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a01805cc2365c3f5024064465341d1fe664eeed
--- /dev/null
+++ b/pandas/tests/test_aggregation.py
@@ -0,0 +1,93 @@
+import numpy as np
+import pytest
+
+from pandas.core.apply import (
+    _make_unique_kwarg_list,
+    maybe_mangle_lambdas,
+)
+
+
+def test_maybe_mangle_lambdas_passthrough():
+    assert maybe_mangle_lambdas("mean") == "mean"
+    assert maybe_mangle_lambdas(lambda x: x).__name__ == "<lambda>"
+    # don't mangle single lambda.
+    assert maybe_mangle_lambdas([lambda x: x])[0].__name__ == "<lambda>"
+
+
+def test_maybe_mangle_lambdas_listlike():
+    aggfuncs = [lambda x: 1, lambda x: 2]
+    result = maybe_mangle_lambdas(aggfuncs)
+    assert result[0].__name__ == "<lambda_0>"
+    assert result[1].__name__ == "<lambda_1>"
+    assert aggfuncs[0](None) == result[0](None)
+    assert aggfuncs[1](None) == result[1](None)
+
+
+def test_maybe_mangle_lambdas():
+    func = {"A": [lambda x: 0, lambda x: 1]}
+    result = maybe_mangle_lambdas(func)
+    assert result["A"][0].__name__ == "<lambda_0>"
+    assert result["A"][1].__name__ == "<lambda_1>"
+
+
+def test_maybe_mangle_lambdas_args():
+    func = {"A": [lambda x, a, b=1: (0, a, b), lambda x: 1]}
+    result = maybe_mangle_lambdas(func)
+    assert result["A"][0].__name__ == "<lambda_0>"
+    assert result["A"][1].__name__ == "<lambda_1>"
+
+    assert func["A"][0](0, 1) == (0, 1, 1)
+    assert func["A"][0](0, 1, 2) == (0, 1, 2)
+    assert func["A"][0](0, 2, b=3) == (0, 2, 3)
+
+
+def test_maybe_mangle_lambdas_named():
+    func = {"C": np.mean, "D": {"foo": np.mean, "bar": np.mean}}
+    result = maybe_mangle_lambdas(func)
+    assert result == func
+
+
+@pytest.mark.parametrize(
+    "order, expected_reorder",
+    [
+        (
+            [
+                ("height", "<lambda>"),
+                ("height", "max"),
+                ("weight", "max"),
+                ("height", "<lambda>"),
+                ("weight", "<lambda>"),
+            ],
+            [
+                ("height", "<lambda>_0"),
+                ("height", "max"),
+                ("weight", "max"),
+                ("height", "<lambda>_1"),
+                ("weight", "<lambda>"),
+            ],
+        ),
+        (
+            [
+                ("col2", "min"),
+                ("col1", "<lambda>"),
+                ("col1", "<lambda>"),
+                ("col1", "<lambda>"),
+            ],
+            [
+                ("col2", "min"),
+                ("col1", "<lambda>_0"),
+                ("col1", "<lambda>_1"),
+                ("col1", "<lambda>_2"),
+            ],
+        ),
+        (
+            [("col", "<lambda>"), ("col", "<lambda>"), ("col", "<lambda>")],
+            [("col", "<lambda>_0"), ("col", "<lambda>_1"), ("col", "<lambda>_2")],
+        ),
+    ],
+)
+def test_make_unique(order, expected_reorder):
+    # GH 27519, test if make_unique function reorders correctly
+    result = _make_unique_kwarg_list(order)
+
+    assert result == expected_reorder
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee34dff8446955cb4363db28d6f8d3115f2cb768
--- /dev/null
+++ b/pandas/tests/test_algos.py
@@ -0,0 +1,2083 @@
+from datetime import datetime
+import struct
+
+import numpy as np
+import pytest
+
+from pandas._libs import (
+    algos as libalgos,
+    hashtable as ht,
+)
+
+from pandas.core.dtypes.common import (
+    is_bool_dtype,
+    is_complex_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+)
+from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
+    DatetimeTZDtype,
+)
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    CategoricalIndex,
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    IntervalIndex,
+    MultiIndex,
+    NaT,
+    Period,
+    PeriodIndex,
+    Series,
+    Timedelta,
+    Timestamp,
+    cut,
+    date_range,
+    timedelta_range,
+    to_datetime,
+    to_timedelta,
+)
+import pandas._testing as tm
+import pandas.core.algorithms as algos
+from pandas.core.arrays import (
+    DatetimeArray,
+    TimedeltaArray,
+)
+import pandas.core.common as com
+
+
+class TestFactorize:
+    def test_factorize_complex(self):
+        # GH#17927
+        array = np.array([1, 2, 2 + 1j], dtype=complex)
+        labels, uniques = algos.factorize(array)
+
+        expected_labels = np.array([0, 1, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(labels, expected_labels)
+
+        expected_uniques = np.array([(1 + 0j), (2 + 0j), (2 + 1j)], dtype=complex)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_factorize(self, index_or_series_obj, sort):
+        obj = index_or_series_obj
+        result_codes, result_uniques = obj.factorize(sort=sort)
+
+        constructor = Index
+        if isinstance(obj, MultiIndex):
+            constructor = MultiIndex.from_tuples
+        expected_arr = obj.unique()
+        if expected_arr.dtype == np.float16:
+            expected_arr = expected_arr.astype(np.float32)
+        expected_uniques = constructor(expected_arr)
+        if (
+            isinstance(obj, Index)
+            and expected_uniques.dtype == bool
+            and obj.dtype == object
+        ):
+            expected_uniques = expected_uniques.astype(object)
+
+        if sort:
+            expected_uniques = expected_uniques.sort_values()
+
+        # construct an integer ndarray so that
+        # `expected_uniques.take(expected_codes)` is equal to `obj`
+        expected_uniques_list = list(expected_uniques)
+        expected_codes = [expected_uniques_list.index(val) for val in obj]
+        expected_codes = np.asarray(expected_codes, dtype=np.intp)
+
+        tm.assert_numpy_array_equal(result_codes, expected_codes)
+        tm.assert_index_equal(result_uniques, expected_uniques, exact=True)
+
+    def test_series_factorize_use_na_sentinel_false(self):
+        # GH#35667
+        values = np.array([1, 2, 1, np.nan])
+        ser = Series(values)
+        codes, uniques = ser.factorize(use_na_sentinel=False)
+
+        expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
+        expected_uniques = Index([1.0, 2.0, np.nan])
+
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_index_equal(uniques, expected_uniques)
+
+    def test_basic(self):
+        items = np.array(["a", "b", "b", "a", "a", "c", "c", "c"], dtype=object)
+        codes, uniques = algos.factorize(items)
+        tm.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object))
+
+        codes, uniques = algos.factorize(items, sort=True)
+        exp = np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = np.array(["a", "b", "c"], dtype=object)
+        tm.assert_numpy_array_equal(uniques, exp)
+
+        arr = np.arange(5, dtype=np.intp)[::-1]
+
+        codes, uniques = algos.factorize(arr)
+        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = np.array([4, 3, 2, 1, 0], dtype=arr.dtype)
+        tm.assert_numpy_array_equal(uniques, exp)
+
+        codes, uniques = algos.factorize(arr, sort=True)
+        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = np.array([0, 1, 2, 3, 4], dtype=arr.dtype)
+        tm.assert_numpy_array_equal(uniques, exp)
+
+        arr = np.arange(5.0)[::-1]
+
+        codes, uniques = algos.factorize(arr)
+        exp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = np.array([4.0, 3.0, 2.0, 1.0, 0.0], dtype=arr.dtype)
+        tm.assert_numpy_array_equal(uniques, exp)
+
+        codes, uniques = algos.factorize(arr, sort=True)
+        exp = np.array([4, 3, 2, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=arr.dtype)
+        tm.assert_numpy_array_equal(uniques, exp)
+
+    def test_mixed(self):
+        # doc example reshaping.rst
+        x = Series(["A", "A", np.nan, "B", 3.14, np.inf])
+        codes, uniques = algos.factorize(x)
+
+        exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = Index(["A", "B", 3.14, np.inf])
+        tm.assert_index_equal(uniques, exp)
+
+        codes, uniques = algos.factorize(x, sort=True)
+        exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = Index([3.14, np.inf, "A", "B"])
+        tm.assert_index_equal(uniques, exp)
+
+    def test_factorize_datetime64(self):
+        # M8
+        v1 = Timestamp("20130101 09:00:00.00004")
+        v2 = Timestamp("20130101")
+        x = Series([v1, v1, v1, v2, v2, v1])
+        codes, uniques = algos.factorize(x)
+
+        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = DatetimeIndex([v1, v2])
+        tm.assert_index_equal(uniques, exp)
+
+        codes, uniques = algos.factorize(x, sort=True)
+        exp = np.array([1, 1, 1, 0, 0, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        exp = DatetimeIndex([v2, v1])
+        tm.assert_index_equal(uniques, exp)
+
+    def test_factorize_period(self):
+        # period
+        v1 = Period("201302", freq="M")
+        v2 = Period("201303", freq="M")
+        x = Series([v1, v1, v1, v2, v2, v1])
+
+        # periods are not 'sorted' as they are converted back into an index
+        codes, uniques = algos.factorize(x)
+        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
+
+        codes, uniques = algos.factorize(x, sort=True)
+        exp = np.array([0, 0, 0, 1, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        tm.assert_index_equal(uniques, PeriodIndex([v1, v2]))
+
+    def test_factorize_timedelta(self):
+        # GH 5986
+        v1 = to_timedelta("1 day 1 min")
+        v2 = to_timedelta("1 day")
+        x = Series([v1, v2, v1, v1, v2, v2, v1])
+        codes, uniques = algos.factorize(x)
+        exp = np.array([0, 1, 0, 0, 1, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        tm.assert_index_equal(uniques, to_timedelta([v1, v2]))
+
+        codes, uniques = algos.factorize(x, sort=True)
+        exp = np.array([1, 0, 1, 1, 0, 0, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp)
+        tm.assert_index_equal(uniques, to_timedelta([v2, v1]))
+
+    def test_factorize_nan(self):
+        # nan should map to na_sentinel, not reverse_indexer[na_sentinel]
+        # rizer.factorize should not raise an exception if na_sentinel indexes
+        # outside of reverse_indexer
+        key = np.array([1, 2, 1, np.nan], dtype="O")
+        rizer = ht.ObjectFactorizer(len(key))
+        for na_sentinel in (-1, 20):
+            ids = rizer.factorize(key, na_sentinel=na_sentinel)
+            expected = np.array([0, 1, 0, na_sentinel], dtype=np.intp)
+            assert len(set(key)) == len(set(expected))
+            tm.assert_numpy_array_equal(pd.isna(key), expected == na_sentinel)
+            tm.assert_numpy_array_equal(ids, expected)
+
+    def test_factorizer_with_mask(self):
+        # GH#49549
+        data = np.array([1, 2, 3, 1, 1, 0], dtype="int64")
+        mask = np.array([False, False, False, False, False, True])
+        rizer = ht.Int64Factorizer(len(data))
+        result = rizer.factorize(data, mask=mask)
+        expected = np.array([0, 1, 2, 0, 0, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        expected_uniques = np.array([1, 2, 3], dtype="int64")
+        tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
+
+    def test_factorizer_object_with_nan(self):
+        # GH#49549
+        data = np.array([1, 2, 3, 1, np.nan])
+        rizer = ht.ObjectFactorizer(len(data))
+        result = rizer.factorize(data.astype(object))
+        expected = np.array([0, 1, 2, 0, -1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        expected_uniques = np.array([1, 2, 3], dtype=object)
+        tm.assert_numpy_array_equal(rizer.uniques.to_array(), expected_uniques)
+
+    @pytest.mark.parametrize(
+        "data, expected_codes, expected_uniques",
+        [
+            (
+                [(1, 1), (1, 2), (0, 0), (1, 2), "nonsense"],
+                [0, 1, 2, 1, 3],
+                [(1, 1), (1, 2), (0, 0), "nonsense"],
+            ),
+            (
+                [(1, 1), (1, 2), (0, 0), (1, 2), (1, 2, 3)],
+                [0, 1, 2, 1, 3],
+                [(1, 1), (1, 2), (0, 0), (1, 2, 3)],
+            ),
+            ([(1, 1), (1, 2), (0, 0), (1, 2)], [0, 1, 2, 1], [(1, 1), (1, 2), (0, 0)]),
+        ],
+    )
+    def test_factorize_tuple_list(self, data, expected_codes, expected_uniques):
+        # GH9454
+        data = com.asarray_tuplesafe(data, dtype=object)
+        codes, uniques = pd.factorize(data)
+
+        tm.assert_numpy_array_equal(codes, np.array(expected_codes, dtype=np.intp))
+
+        expected_uniques_array = com.asarray_tuplesafe(expected_uniques, dtype=object)
+        tm.assert_numpy_array_equal(uniques, expected_uniques_array)
+
+    def test_complex_sorting(self):
+        # gh 12666 - check no segfault
+        x17 = np.array([complex(i) for i in range(17)], dtype=object)
+
+        msg = "'[<>]' not supported between instances of .*"
+        with pytest.raises(TypeError, match=msg):
+            algos.factorize(x17[::-1], sort=True)
+
+    def test_numeric_dtype_factorize(self, any_real_numpy_dtype):
+        # GH41132
+        dtype = any_real_numpy_dtype
+        data = np.array([1, 2, 2, 1], dtype=dtype)
+        expected_codes = np.array([0, 1, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([1, 2], dtype=dtype)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_float64_factorize(self, writable):
+        data = np.array([1.0, 1e8, 1.0, 1e-8, 1e8, 1.0], dtype=np.float64)
+        data.setflags(write=writable)
+        expected_codes = np.array([0, 1, 0, 2, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([1.0, 1e8, 1e-8], dtype=np.float64)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_uint64_factorize(self, writable):
+        data = np.array([2**64 - 1, 1, 2**64 - 1], dtype=np.uint64)
+        data.setflags(write=writable)
+        expected_codes = np.array([0, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([2**64 - 1, 1], dtype=np.uint64)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_int64_factorize(self, writable):
+        data = np.array([2**63 - 1, -(2**63), 2**63 - 1], dtype=np.int64)
+        data.setflags(write=writable)
+        expected_codes = np.array([0, 1, 0], dtype=np.intp)
+        expected_uniques = np.array([2**63 - 1, -(2**63)], dtype=np.int64)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_string_factorize(self, writable):
+        data = np.array(["a", "c", "a", "b", "c"], dtype=object)
+        data.setflags(write=writable)
+        expected_codes = np.array([0, 1, 0, 2, 1], dtype=np.intp)
+        expected_uniques = np.array(["a", "c", "b"], dtype=object)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_object_factorize(self, writable):
+        data = np.array(["a", "c", None, np.nan, "a", "b", NaT, "c"], dtype=object)
+        data.setflags(write=writable)
+        expected_codes = np.array([0, 1, -1, -1, 0, 2, -1, 1], dtype=np.intp)
+        expected_uniques = np.array(["a", "c", "b"], dtype=object)
+
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_datetime64_factorize(self, writable):
+        # GH35650 Verify whether read-only datetime64 array can be factorized
+        data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
+        data.setflags(write=writable)
+        expected_codes = np.array([0], dtype=np.intp)
+        expected_uniques = np.array(
+            ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]"
+        )
+
+        codes, uniques = pd.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    def test_factorize_rangeindex(self, sort):
+        # increasing -> sort doesn't matter
+        ri = pd.RangeIndex.from_range(range(10))
+        expected = np.arange(10, dtype=np.intp), ri
+
+        result = algos.factorize(ri, sort=sort)
+        tm.assert_numpy_array_equal(result[0], expected[0])
+        tm.assert_index_equal(result[1], expected[1], exact=True)
+
+        result = ri.factorize(sort=sort)
+        tm.assert_numpy_array_equal(result[0], expected[0])
+        tm.assert_index_equal(result[1], expected[1], exact=True)
+
+    def test_factorize_rangeindex_decreasing(self, sort):
+        # decreasing -> sort matters
+        ri = pd.RangeIndex.from_range(range(10))
+        expected = np.arange(10, dtype=np.intp), ri
+
+        ri2 = ri[::-1]
+        expected = expected[0], ri2
+        if sort:
+            expected = expected[0][::-1], expected[1][::-1]
+
+        result = algos.factorize(ri2, sort=sort)
+        tm.assert_numpy_array_equal(result[0], expected[0])
+        tm.assert_index_equal(result[1], expected[1], exact=True)
+
+        result = ri2.factorize(sort=sort)
+        tm.assert_numpy_array_equal(result[0], expected[0])
+        tm.assert_index_equal(result[1], expected[1], exact=True)
+
+    def test_deprecate_order(self):
+        # gh 19727 - check warning is raised for deprecated keyword, order.
+        # Test not valid once order keyword is removed.
+        data = np.array([2**63, 1, 2**63], dtype=np.uint64)
+        with pytest.raises(TypeError, match="got an unexpected keyword"):
+            algos.factorize(data, order=True)
+        with tm.assert_produces_warning(False):
+            algos.factorize(data)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            np.array([0, 1, 0], dtype="u8"),
+            np.array([-(2**63), 1, -(2**63)], dtype="i8"),
+            np.array(["__nan__", "foo", "__nan__"], dtype="object"),
+        ],
+    )
+    def test_parametrized_factorize_na_value_default(self, data):
+        # arrays that include the NA default for that type, but isn't used.
+        codes, uniques = algos.factorize(data)
+        expected_uniques = data[[0, 1]]
+        expected_codes = np.array([0, 1, 0], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    @pytest.mark.parametrize(
+        "data, na_value",
+        [
+            (np.array([0, 1, 0, 2], dtype="u8"), 0),
+            (np.array([1, 0, 1, 2], dtype="u8"), 1),
+            (np.array([-(2**63), 1, -(2**63), 0], dtype="i8"), -(2**63)),
+            (np.array([1, -(2**63), 1, 0], dtype="i8"), 1),
+            (np.array(["a", "", "a", "b"], dtype=object), "a"),
+            (np.array([(), ("a", 1), (), ("a", 2)], dtype=object), ()),
+            (np.array([("a", 1), (), ("a", 1), ("a", 2)], dtype=object), ("a", 1)),
+        ],
+    )
+    def test_parametrized_factorize_na_value(self, data, na_value):
+        codes, uniques = algos.factorize_array(data, na_value=na_value)
+        expected_uniques = data[[1, 3]]
+        expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_numpy_array_equal(uniques, expected_uniques)
+
+    @pytest.mark.parametrize(
+        "data, uniques",
+        [
+            (
+                np.array(["b", "a", None, "b"], dtype=object),
+                np.array(["b", "a"], dtype=object),
+            ),
+            (
+                pd.array([2, 1, pd.NA, 2], dtype="Int64"),
+                pd.array([2, 1], dtype="Int64"),
+            ),
+        ],
+        ids=["numpy_array", "extension_array"],
+    )
+    def test_factorize_use_na_sentinel(self, sort, data, uniques):
+        codes, uniques = algos.factorize(data, sort=sort, use_na_sentinel=True)
+        if sort:
+            expected_codes = np.array([1, 0, -1, 1], dtype=np.intp)
+            expected_uniques = algos.safe_sort(uniques)
+        else:
+            expected_codes = np.array([0, 1, -1, 0], dtype=np.intp)
+            expected_uniques = uniques
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        if isinstance(data, np.ndarray):
+            tm.assert_numpy_array_equal(uniques, expected_uniques)
+        else:
+            tm.assert_extension_array_equal(uniques, expected_uniques)
+
+    @pytest.mark.parametrize(
+        "data, expected_codes, expected_uniques",
+        [
+            (
+                ["a", None, "b", "a"],
+                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
+                np.array(["a", np.nan, "b"], dtype=object),
+            ),
+            (
+                ["a", np.nan, "b", "a"],
+                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
+                np.array(["a", np.nan, "b"], dtype=object),
+            ),
+        ],
+    )
+    def test_object_factorize_use_na_sentinel_false(
+        self, data, expected_codes, expected_uniques
+    ):
+        codes, uniques = algos.factorize(
+            np.array(data, dtype=object), use_na_sentinel=False
+        )
+
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
+
+    @pytest.mark.parametrize(
+        "data, expected_codes, expected_uniques",
+        [
+            (
+                np.array([1, None, 1, 2], dtype=object),
+                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
+                np.array([1, np.nan, 2], dtype="O"),
+            ),
+            (
+                np.array([1, np.nan, 1, 2], dtype=np.float64),
+                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
+                np.array([1, np.nan, 2], dtype=np.float64),
+            ),
+        ],
+    )
+    def test_int_factorize_use_na_sentinel_false(
+        self, data, expected_codes, expected_uniques
+    ):
+        codes, uniques = algos.factorize(data, use_na_sentinel=False)
+
+        tm.assert_numpy_array_equal(uniques, expected_uniques, strict_nan=True)
+        tm.assert_numpy_array_equal(codes, expected_codes, strict_nan=True)
+
+    @pytest.mark.parametrize(
+        "data, expected_codes, expected_uniques",
+        [
+            (
+                Index(Categorical(["a", "a", "b"])),
+                np.array([0, 0, 1], dtype=np.intp),
+                CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
+            ),
+            (
+                Series(Categorical(["a", "a", "b"])),
+                np.array([0, 0, 1], dtype=np.intp),
+                CategoricalIndex(["a", "b"], categories=["a", "b"], dtype="category"),
+            ),
+            (
+                Series(DatetimeIndex(["2017", "2017"], tz="US/Eastern")),
+                np.array([0, 0], dtype=np.intp),
+                DatetimeIndex(["2017"], tz="US/Eastern"),
+            ),
+        ],
+    )
+    def test_factorize_mixed_values(self, data, expected_codes, expected_uniques):
+        # GH 19721
+        codes, uniques = algos.factorize(data)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+        tm.assert_index_equal(uniques, expected_uniques)
+
+    def test_factorize_interval_non_nano(self, unit):
+        # GH#56099
+        left = DatetimeIndex(["2016-01-01", np.nan, "2015-10-11"]).as_unit(unit)
+        right = DatetimeIndex(["2016-01-02", np.nan, "2015-10-15"]).as_unit(unit)
+        idx = IntervalIndex.from_arrays(left, right)
+        codes, cats = idx.factorize()
+        assert cats.dtype == f"interval[datetime64[{unit}], right]"
+
+        ts = Timestamp(0).as_unit(unit)
+        idx2 = IntervalIndex.from_arrays(left - ts, right - ts)
+        codes2, cats2 = idx2.factorize()
+        assert cats2.dtype == f"interval[timedelta64[{unit}], right]"
+
+        idx3 = IntervalIndex.from_arrays(
+            left.tz_localize("US/Pacific"), right.tz_localize("US/Pacific")
+        )
+        codes3, cats3 = idx3.factorize()
+        assert cats3.dtype == f"interval[datetime64[{unit}, US/Pacific], right]"
+
+
+class TestUnique:
+    def test_ints(self):
+        arr = np.random.default_rng(2).integers(0, 100, size=50)
+
+        result = algos.unique(arr)
+        assert isinstance(result, np.ndarray)
+
+    def test_objects(self):
+        arr = np.random.default_rng(2).integers(0, 100, size=50).astype("O")
+
+        result = algos.unique(arr)
+        assert isinstance(result, np.ndarray)
+
+    def test_object_refcount_bug(self):
+        lst = np.array(["A", "B", "C", "D", "E"], dtype=object)
+        for i in range(1000):
+            len(algos.unique(lst))
+
+    def test_index_returned(self, index):
+        # GH#57043
+        index = index.repeat(2)
+        result = algos.unique(index)
+
+        # dict.fromkeys preserves the order
+        unique_values = list(dict.fromkeys(index.values))
+        if isinstance(index, MultiIndex):
+            expected = MultiIndex.from_tuples(unique_values, names=index.names)
+        else:
+            expected = Index(unique_values, dtype=index.dtype)
+            if isinstance(index.dtype, DatetimeTZDtype):
+                expected = expected.normalize()
+        tm.assert_index_equal(result, expected, exact=True)
+
+    def test_factorize_multiindex_empty(self):
+        # GH#57517
+        mi = MultiIndex.from_product(
+            [Index([], name="a", dtype=object), Index([], name="i", dtype="f4")]
+        )
+        codes, uniques = mi.factorize()
+        exp_codes = np.array([], dtype=np.intp)
+        tm.assert_numpy_array_equal(codes, exp_codes)
+        tm.assert_index_equal(uniques, mi[:0])
+
+    def test_dtype_preservation(self, any_numpy_dtype):
+        # GH 15442
+        if any_numpy_dtype in (tm.BYTES_DTYPES + tm.STRING_DTYPES):
+            data = [1, 2, 2]
+            uniques = [1, 2]
+        elif is_integer_dtype(any_numpy_dtype):
+            data = [1, 2, 2]
+            uniques = [1, 2]
+        elif is_float_dtype(any_numpy_dtype):
+            data = [1, 2, 2]
+            uniques = [1.0, 2.0]
+        elif is_complex_dtype(any_numpy_dtype):
+            data = [complex(1, 0), complex(2, 0), complex(2, 0)]
+            uniques = [complex(1, 0), complex(2, 0)]
+        elif is_bool_dtype(any_numpy_dtype):
+            data = [True, True, False]
+            uniques = [True, False]
+        elif is_object_dtype(any_numpy_dtype):
+            data = ["A", "B", "B"]
+            uniques = ["A", "B"]
+        else:
+            # datetime64[ns]/M8[ns]/timedelta64[ns]/m8[ns] tested elsewhere
+            data = [1, 2, 2]
+            uniques = [1, 2]
+
+        result = Series(data, dtype=any_numpy_dtype).unique()
+        expected = np.array(uniques, dtype=any_numpy_dtype)
+
+        if any_numpy_dtype in tm.STRING_DTYPES:
+            expected = expected.astype(object)
+
+        if expected.dtype.kind in ["m", "M"]:
+            # We get TimedeltaArray/DatetimeArray
+            assert isinstance(result, (DatetimeArray, TimedeltaArray))
+            result = np.array(result)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_datetime64_dtype_array_returned(self):
+        # GH 9431
+        dt_arr = np.array(
+            [
+                "2015-01-03T00:00:00.000000000",
+                "2015-01-01T00:00:00.000000000",
+            ],
+            dtype="M8[ns]",
+        )
+
+        dt_index = to_datetime(
+            [
+                "2015-01-03T00:00:00.000000000",
+                "2015-01-01T00:00:00.000000000",
+                "2015-01-01T00:00:00.000000000",
+            ]
+        )
+        result = algos.unique(dt_index)
+        expected = to_datetime(dt_arr)
+        tm.assert_index_equal(result, expected, exact=True)
+
+        s = Series(dt_index)
+        result = algos.unique(s)
+        tm.assert_numpy_array_equal(result, dt_arr)
+        assert result.dtype == dt_arr.dtype
+
+        arr = s.values
+        result = algos.unique(arr)
+        tm.assert_numpy_array_equal(result, dt_arr)
+        assert result.dtype == dt_arr.dtype
+
+    def test_datetime_non_ns(self):
+        a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
+        result = pd.unique(a)
+        expected = np.array(["2000", "2001"], dtype="datetime64[s]")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_timedelta_non_ns(self):
+        a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
+        result = pd.unique(a)
+        expected = np.array([2000, 2001], dtype="timedelta64[s]")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_timedelta64_dtype_array_returned(self):
+        # GH 9431
+        td_arr = np.array([31200, 45678, 10000], dtype="m8[ns]")
+
+        td_index = to_timedelta([31200, 45678, 31200, 10000, 45678])
+        result = algos.unique(td_index)
+        expected = to_timedelta(td_arr)
+        tm.assert_index_equal(result, expected)
+        assert result.dtype == expected.dtype
+
+        s = Series(td_index)
+        result = algos.unique(s)
+        tm.assert_numpy_array_equal(result, td_arr)
+        assert result.dtype == td_arr.dtype
+
+        arr = s.values
+        result = algos.unique(arr)
+        tm.assert_numpy_array_equal(result, td_arr)
+        assert result.dtype == td_arr.dtype
+
+    def test_uint64_overflow(self):
+        s = Series([1, 2, 2**63, 2**63], dtype=np.uint64)
+        exp = np.array([1, 2, 2**63], dtype=np.uint64)
+        tm.assert_numpy_array_equal(algos.unique(s), exp)
+
+    def test_nan_in_object_array(self):
+        duplicated_items = ["a", np.nan, "c", "c"]
+        result = pd.unique(np.array(duplicated_items, dtype=object))
+        expected = np.array(["a", np.nan, "c"], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_categorical(self):
+        # we are expecting to return in the order
+        # of appearance
+        expected = Categorical(list("bac"))
+
+        # we are expecting to return in the order
+        # of the categories
+        expected_o = Categorical(list("bac"), categories=list("abc"), ordered=True)
+
+        # GH 15939
+        c = Categorical(list("baabc"))
+        result = c.unique()
+        tm.assert_categorical_equal(result, expected)
+
+        result = algos.unique(c)
+        tm.assert_categorical_equal(result, expected)
+
+        c = Categorical(list("baabc"), ordered=True)
+        result = c.unique()
+        tm.assert_categorical_equal(result, expected_o)
+
+        result = algos.unique(c)
+        tm.assert_categorical_equal(result, expected_o)
+
+        # Series of categorical dtype
+        s = Series(Categorical(list("baabc")), name="foo")
+        result = s.unique()
+        tm.assert_categorical_equal(result, expected)
+
+        result = pd.unique(s)
+        tm.assert_categorical_equal(result, expected)
+
+        # CI -> return CI
+        ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc")))
+        expected = CategoricalIndex(expected)
+        result = ci.unique()
+        tm.assert_index_equal(result, expected)
+
+        result = pd.unique(ci)
+        tm.assert_index_equal(result, expected)
+
+    def test_datetime64tz_aware(self, unit):
+        # GH 15939
+
+        dti = Index(
+            [
+                Timestamp("20160101", tz="US/Eastern"),
+                Timestamp("20160101", tz="US/Eastern"),
+            ]
+        ).as_unit(unit)
+        ser = Series(dti)
+
+        result = ser.unique()
+        expected = dti[:1]._data
+        tm.assert_extension_array_equal(result, expected)
+
+        result = dti.unique()
+        expected = dti[:1]
+        tm.assert_index_equal(result, expected)
+
+        result = pd.unique(ser)
+        expected = dti[:1]._data
+        tm.assert_extension_array_equal(result, expected)
+
+        result = pd.unique(dti)
+        expected = dti[:1]
+        tm.assert_index_equal(result, expected)
+
+    def test_order_of_appearance(self):
+        # 9346
+        # light testing of guarantee of order of appearance
+        # these also are the doc-examples
+        result = pd.unique(Series([2, 1, 3, 3]))
+        tm.assert_numpy_array_equal(result, np.array([2, 1, 3], dtype="int64"))
+
+        result = pd.unique(Series([2] + [1] * 5))
+        tm.assert_numpy_array_equal(result, np.array([2, 1], dtype="int64"))
+
+        data = np.array(["a", "a", "b", "c"], dtype=object)
+        result = pd.unique(data)
+        expected = np.array(["a", "b", "c"], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = pd.unique(Series(Categorical(list("aabc"))))
+        expected = Categorical(list("abc"))
+        tm.assert_categorical_equal(result, expected)
+
+    def test_order_of_appearance_dt64(self, unit):
+        ser = Series([Timestamp("20160101"), Timestamp("20160101")]).dt.as_unit(unit)
+        result = pd.unique(ser)
+        expected = np.array(["2016-01-01T00:00:00.000000000"], dtype=f"M8[{unit}]")
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_order_of_appearance_dt64tz(self, unit):
+        dti = DatetimeIndex(
+            [
+                Timestamp("20160101", tz="US/Eastern"),
+                Timestamp("20160101", tz="US/Eastern"),
+            ]
+        ).as_unit(unit)
+        result = pd.unique(dti)
+        expected = DatetimeIndex(
+            ["2016-01-01 00:00:00"], dtype=f"datetime64[{unit}, US/Eastern]", freq=None
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arg ,expected",
+        [
+            (("1", "1", "2"), np.array(["1", "2"], dtype=object)),
+            (("foo",), np.array(["foo"], dtype=object)),
+        ],
+    )
+    def test_tuple_with_strings(self, arg, expected):
+        # see GH 17108
+        arg = com.asarray_tuplesafe(arg, dtype=object)
+        result = pd.unique(arg)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_obj_none_preservation(self):
+        # GH 20866
+        arr = np.array(["foo", None], dtype=object)
+        result = pd.unique(arr)
+        expected = np.array(["foo", None], dtype=object)
+
+        tm.assert_numpy_array_equal(result, expected, strict_nan=True)
+
+    def test_signed_zero(self):
+        # GH 21866
+        a = np.array([-0.0, 0.0])
+        result = pd.unique(a)
+        expected = np.array([-0.0])  # 0.0 and -0.0 are equivalent
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_different_nans(self):
+        # GH 21866
+        # create different nans from bit-patterns:
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        a = np.array([NAN1, NAN2])  # NAN1 and NAN2 are equivalent
+        result = pd.unique(a)
+        expected = np.array([np.nan])
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("el_type", [np.float64, object])
+    def test_first_nan_kept(self, el_type):
+        # GH 22295
+        # create different nans from bit-patterns:
+        bits_for_nan1 = 0xFFF8000000000001
+        bits_for_nan2 = 0x7FF8000000000001
+        NAN1 = struct.unpack("d", struct.pack("=Q", bits_for_nan1))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+        a = np.array([NAN1, NAN2], dtype=el_type)
+        result = pd.unique(a)
+        assert result.size == 1
+        # use bit patterns to identify which nan was kept:
+        result_nan_bits = struct.unpack("=Q", struct.pack("d", result[0]))[0]
+        assert result_nan_bits == bits_for_nan1
+
+    def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixture2):
+        # GH 22295
+        if unique_nulls_fixture is unique_nulls_fixture2:
+            return  # skip it, values not unique
+        a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object)
+        result = pd.unique(a)
+        assert result.size == 2
+        assert a[0] is unique_nulls_fixture
+        assert a[1] is unique_nulls_fixture2
+
+    def test_unique_masked(self, any_numeric_ea_dtype):
+        # GH#48019
+        ser = Series([1, pd.NA, 2] * 3, dtype=any_numeric_ea_dtype)
+        result = pd.unique(ser)
+        expected = pd.array([1, pd.NA, 2], dtype=any_numeric_ea_dtype)
+        tm.assert_extension_array_equal(result, expected)
+
+    def test_unique_NumpyExtensionArray(self):
+        arr_complex = pd.array(
+            [1 + 1j, 2, 3]
+        )  # NumpyEADtype('complex128') => NumpyExtensionArray
+        result = pd.unique(arr_complex)
+        expected = pd.array([1 + 1j, 2 + 0j, 3 + 0j])
+        tm.assert_extension_array_equal(result, expected)
+
+
+def test_nunique_ints(index_or_series_or_array):
+    # GH#36327
+    values = index_or_series_or_array(np.random.default_rng(2).integers(0, 20, 30))
+    result = algos.nunique_ints(values)
+    expected = len(algos.unique(values))
+    assert result == expected
+
+
+class TestIsin:
+    def test_invalid(self):
+        msg = (
+            r"only list-like objects are allowed to be passed to isin\(\), "
+            r"you passed a `int`"
+        )
+        with pytest.raises(TypeError, match=msg):
+            algos.isin(1, 1)
+        with pytest.raises(TypeError, match=msg):
+            algos.isin(1, [1])
+        with pytest.raises(TypeError, match=msg):
+            algos.isin([1], 1)
+
+    def test_basic(self):
+        result = algos.isin(np.array([1, 2]), [1])
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(Series([1, 2]), [1])
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(Series([1, 2]), Series([1]))
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(Series([1, 2]), {1})
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        arg = np.array(["a", "b"], dtype=object)
+        result = algos.isin(arg, ["a"])
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(Series(arg), Series(["a"]))
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(Series(arg), {"a"})
+        expected = np.array([True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(arg, [1])
+        expected = np.array([False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_i8(self):
+        arr = date_range("20130101", periods=3).values
+        result = algos.isin(arr, [arr[0]])
+        expected = np.array([True, False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(arr, arr[0:2])
+        expected = np.array([True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(arr, set(arr[0:2]))
+        expected = np.array([True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        arr = timedelta_range("1 day", periods=3).values
+        result = algos.isin(arr, [arr[0]])
+        expected = np.array([True, False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(arr, arr[0:2])
+        expected = np.array([True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.isin(arr, set(arr[0:2]))
+        expected = np.array([True, True, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype1", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
+    @pytest.mark.parametrize("dtype", ["i8", "f8", "u8"])
+    def test_isin_datetimelike_values_numeric_comps(self, dtype, dtype1):
+        # Anything but object and we get all-False shortcut
+
+        dta = date_range("2013-01-01", periods=3)._values
+        arr = Series(dta.view("i8")).array.view(dtype1)
+
+        comps = arr.view("i8").astype(dtype)
+
+        result = algos.isin(comps, arr)
+        expected = np.zeros(comps.shape, dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_large(self):
+        s = date_range("20000101", periods=2000000, freq="s").values
+        result = algos.isin(s, s[0:2])
+        expected = np.zeros(len(s), dtype=bool)
+        expected[0] = True
+        expected[1] = True
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]", "period[D]"])
+    def test_isin_datetimelike_all_nat(self, dtype):
+        # GH#56427
+        dta = date_range("2013-01-01", periods=3)._values
+        arr = Series(dta.view("i8")).array.view(dtype)
+
+        arr[0] = NaT
+        result = algos.isin(arr, [NaT])
+        expected = np.array([True, False, False], dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", ["m8[ns]", "M8[ns]", "M8[ns, UTC]"])
+    def test_isin_datetimelike_strings_returns_false(self, dtype):
+        # GH#53111
+        dta = date_range("2013-01-01", periods=3)._values
+        arr = Series(dta.view("i8")).array.view(dtype)
+
+        vals = [str(x) for x in arr]
+        res = algos.isin(arr, vals)
+        assert not res.any()
+
+        vals2 = np.array(vals, dtype=str)
+        res2 = algos.isin(arr, vals2)
+        assert not res2.any()
+
+    def test_isin_dt64tz_with_nat(self):
+        # the all-NaT values used to get inferred to tznaive, which was evaluated
+        #  as non-matching GH#56427
+        dti = date_range("2016-01-01", periods=3, tz="UTC")
+        ser = Series(dti)
+        ser[0] = NaT
+
+        res = algos.isin(ser._values, [NaT])
+        exp = np.array([True, False, False], dtype=bool)
+        tm.assert_numpy_array_equal(res, exp)
+
+    def test_categorical_from_codes(self):
+        # GH 16639
+        vals = np.array([0, 1, 2, 0])
+        cats = ["a", "b", "c"]
+        Sd = Series(Categorical([1]).from_codes(vals, cats))
+        St = Series(Categorical([1]).from_codes(np.array([0, 1]), cats))
+        expected = np.array([True, True, False, True])
+        result = algos.isin(Sd, St)
+        tm.assert_numpy_array_equal(expected, result)
+
+    def test_categorical_isin(self):
+        vals = np.array([0, 1, 2, 0])
+        cats = ["a", "b", "c"]
+        cat = Categorical([1]).from_codes(vals, cats)
+        other = Categorical([1]).from_codes(np.array([0, 1]), cats)
+
+        expected = np.array([True, True, False, True])
+        result = algos.isin(cat, other)
+        tm.assert_numpy_array_equal(expected, result)
+
+    def test_same_nan_is_in(self):
+        # GH 22160
+        # nan is special, because from " a is b" doesn't follow "a == b"
+        # at least, isin() should follow python's "np.nan in [nan] == True"
+        # casting to -> np.float64 -> another float-object somewhere on
+        # the way could lead jeopardize this behavior
+        comps = np.array([np.nan], dtype=object)  # could be casted to float64
+        values = [np.nan]
+        expected = np.array([True])
+        result = algos.isin(comps, values)
+        tm.assert_numpy_array_equal(expected, result)
+
+    def test_same_nan_is_in_large(self):
+        # https://github.com/pandas-dev/pandas/issues/22205
+        s = np.tile(1.0, 1_000_001)
+        s[0] = np.nan
+        result = algos.isin(s, np.array([np.nan, 1]))
+        expected = np.ones(len(s), dtype=bool)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_same_nan_is_in_large_series(self):
+        # https://github.com/pandas-dev/pandas/issues/22205
+        s = np.tile(1.0, 1_000_001)
+        series = Series(s)
+        s[0] = np.nan
+        result = series.isin(np.array([np.nan, 1]))
+        expected = Series(np.ones(len(s), dtype=bool))
+        tm.assert_series_equal(result, expected)
+
+    def test_same_object_is_in(self):
+        # GH 22160
+        # there could be special treatment for nans
+        # the user however could define a custom class
+        # with similar behavior, then we at least should
+        # fall back to usual python's behavior: "a in [a] == True"
+        class LikeNan:
+            def __eq__(self, other) -> bool:
+                return False
+
+            def __hash__(self):
+                return 0
+
+        a, b = LikeNan(), LikeNan()
+
+        arg = np.array([a], dtype=object)
+
+        # same object -> True
+        tm.assert_numpy_array_equal(algos.isin(arg, [a]), np.array([True]))
+        # different objects -> False
+        tm.assert_numpy_array_equal(algos.isin(arg, [b]), np.array([False]))
+
+    def test_different_nans(self):
+        # GH 22160
+        # all nans are handled as equivalent
+
+        comps = [float("nan")]
+        values = [float("nan")]
+        assert comps[0] is not values[0]  # different nan-objects
+
+        # as list of python-objects:
+        result = algos.isin(np.array(comps), values)
+        tm.assert_numpy_array_equal(np.array([True]), result)
+
+        # as object-array:
+        result = algos.isin(
+            np.asarray(comps, dtype=object), np.asarray(values, dtype=object)
+        )
+        tm.assert_numpy_array_equal(np.array([True]), result)
+
+        # as float64-array:
+        result = algos.isin(
+            np.asarray(comps, dtype=np.float64), np.asarray(values, dtype=np.float64)
+        )
+        tm.assert_numpy_array_equal(np.array([True]), result)
+
+    def test_no_cast(self):
+        # GH 22160
+        # ensure 42 is not casted to a string
+        comps = np.array(["ss", 42], dtype=object)
+        values = ["42"]
+        expected = np.array([False, False])
+
+        result = algos.isin(comps, values)
+        tm.assert_numpy_array_equal(expected, result)
+
+    @pytest.mark.parametrize("empty", [[], Series(dtype=object), np.array([])])
+    def test_empty(self, empty):
+        # see gh-16991
+        vals = Index(["a", "b"])
+        expected = np.array([False, False])
+
+        result = algos.isin(vals, empty)
+        tm.assert_numpy_array_equal(expected, result)
+
+    def test_different_nan_objects(self):
+        # GH 22119
+        comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object)
+        vals = np.array([float("nan")], dtype=object)
+        expected = np.array([False, False, True])
+        result = algos.isin(comps, vals)
+        tm.assert_numpy_array_equal(expected, result)
+
+    def test_different_nans_as_float64(self):
+        # GH 21866
+        # create different nans from bit-patterns,
+        # these nans will land in different buckets in the hash-table
+        # if no special care is taken
+        NAN1 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000000))[0]
+        NAN2 = struct.unpack("d", struct.pack("=Q", 0x7FF8000000000001))[0]
+        assert NAN1 != NAN1
+        assert NAN2 != NAN2
+
+        # check that NAN1 and NAN2 are equivalent:
+        arr = np.array([NAN1, NAN2], dtype=np.float64)
+        lookup1 = np.array([NAN1], dtype=np.float64)
+        result = algos.isin(arr, lookup1)
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        lookup2 = np.array([NAN2], dtype=np.float64)
+        result = algos.isin(arr, lookup2)
+        expected = np.array([True, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_isin_int_df_string_search(self):
+        """Comparing df with int`s (1,2) with a string at isin() ("1")
+        -> should not match values because int 1 is not equal str 1"""
+        df = DataFrame({"values": [1, 2]})
+        result = df.isin(["1"])
+        expected_false = DataFrame({"values": [False, False]})
+        tm.assert_frame_equal(result, expected_false)
+
+    def test_isin_nan_df_string_search(self):
+        """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
+        -> should not match values because np.nan is not equal str NaN"""
+        df = DataFrame({"values": [np.nan, 2]})
+        result = df.isin(np.array(["NaN"], dtype=object))
+        expected_false = DataFrame({"values": [False, False]})
+        tm.assert_frame_equal(result, expected_false)
+
+    def test_isin_float_df_string_search(self):
+        """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245")
+        -> should not match values because float 1.4245 is not equal str 1.4245"""
+        df = DataFrame({"values": [1.4245, 2.32441]})
+        result = df.isin(np.array(["1.4245"], dtype=object))
+        expected_false = DataFrame({"values": [False, False]})
+        tm.assert_frame_equal(result, expected_false)
+
+    def test_isin_unsigned_dtype(self):
+        # GH#46485
+        ser = Series([1378774140726870442], dtype=np.uint64)
+        result = ser.isin([1378774140726870528])
+        expected = Series(False)
+        tm.assert_series_equal(result, expected)
+
+
+class TestValueCounts:
+    def test_value_counts(self):
+        arr = np.random.default_rng(1234).standard_normal(4)
+        factor = cut(arr, 4)
+
+        # assert isinstance(factor, n)
+        result = algos.value_counts_internal(factor)
+        breaks = [-1.606, -1.018, -0.431, 0.155, 0.741]
+        index = IntervalIndex.from_breaks(breaks).astype(CategoricalDtype(ordered=True))
+        expected = Series([1, 0, 2, 1], index=index, name="count")
+        tm.assert_series_equal(result.sort_index(), expected.sort_index())
+
+    def test_value_counts_bins(self):
+        s = [1, 2, 3, 4]
+        result = algos.value_counts_internal(s, bins=1)
+        expected = Series(
+            [4], index=IntervalIndex.from_tuples([(0.996, 4.0)]), name="count"
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = algos.value_counts_internal(s, bins=2, sort=False)
+        expected = Series(
+            [2, 2],
+            index=IntervalIndex.from_tuples([(0.996, 2.5), (2.5, 4.0)]),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_value_counts_dtypes(self):
+        result = algos.value_counts_internal(np.array([1, 1.0]))
+        assert len(result) == 1
+
+        result = algos.value_counts_internal(np.array([1, 1.0]), bins=1)
+        assert len(result) == 1
+
+        result = algos.value_counts_internal(Series([1, 1.0, "1"]))  # object
+        assert len(result) == 2
+
+        msg = "bins argument only works with numeric data"
+        with pytest.raises(TypeError, match=msg):
+            algos.value_counts_internal(np.array(["1", 1], dtype=object), bins=1)
+
+    def test_value_counts_nat(self):
+        td = Series([np.timedelta64(10000), NaT], dtype="timedelta64[ns]")
+        dt = to_datetime(["NaT", "2014-01-01"])
+
+        for ser in [td, dt]:
+            vc = algos.value_counts_internal(ser)
+            vc_with_na = algos.value_counts_internal(ser, dropna=False)
+            assert len(vc) == 1
+            assert len(vc_with_na) == 2
+
+        exp_dt = Series({Timestamp("2014-01-01 00:00:00"): 1}, name="count")
+        result_dt = algos.value_counts_internal(dt)
+        tm.assert_series_equal(result_dt, exp_dt)
+
+        exp_td = Series([1], index=[np.timedelta64(10000)], name="count")
+        result_td = algos.value_counts_internal(td)
+        tm.assert_series_equal(result_td, exp_td)
+
+    @pytest.mark.parametrize("dtype", [object, "M8[us]"])
+    def test_value_counts_datetime_outofbounds(self, dtype):
+        # GH 13663
+        ser = Series(
+            [
+                datetime(3000, 1, 1),
+                datetime(5000, 1, 1),
+                datetime(5000, 1, 1),
+                datetime(6000, 1, 1),
+                datetime(3000, 1, 1),
+                datetime(3000, 1, 1),
+            ],
+            dtype=dtype,
+        )
+
+        res = ser.value_counts()
+
+        exp_index = Index(
+            [datetime(3000, 1, 1), datetime(5000, 1, 1), datetime(6000, 1, 1)],
+            dtype=dtype,
+        )
+        exp = Series([3, 2, 1], index=exp_index, name="count")
+        tm.assert_series_equal(res, exp)
+
+    def test_categorical(self):
+        s = Series(Categorical(list("aaabbc")))
+        result = s.value_counts()
+        expected = Series(
+            [3, 2, 1], index=CategoricalIndex(["a", "b", "c"]), name="count"
+        )
+
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+        # preserve order?
+        s = s.cat.as_ordered()
+        result = s.value_counts()
+        expected.index = expected.index.as_ordered()
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_categorical_nans(self):
+        s = Series(Categorical(list("aaaaabbbcc")))  # 4,3,2,1 (nan)
+        s.iloc[1] = np.nan
+        result = s.value_counts()
+        expected = Series(
+            [4, 3, 2],
+            index=CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c"]),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected, check_index_type=True)
+        result = s.value_counts(dropna=False)
+        expected = Series(
+            [4, 3, 2, 1], index=CategoricalIndex(["a", "b", "c", np.nan]), name="count"
+        )
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+        # out of order
+        s = Series(
+            Categorical(list("aaaaabbbcc"), ordered=True, categories=["b", "a", "c"])
+        )
+        s.iloc[1] = np.nan
+        result = s.value_counts()
+        expected = Series(
+            [4, 3, 2],
+            index=CategoricalIndex(
+                ["a", "b", "c"],
+                categories=["b", "a", "c"],
+                ordered=True,
+            ),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+        result = s.value_counts(dropna=False)
+        expected = Series(
+            [4, 3, 2, 1],
+            index=CategoricalIndex(
+                ["a", "b", "c", np.nan], categories=["b", "a", "c"], ordered=True
+            ),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_categorical_zeroes(self):
+        # keep the `d` category with 0
+        s = Series(Categorical(list("bbbaac"), categories=list("abcd"), ordered=True))
+        result = s.value_counts()
+        expected = Series(
+            [3, 2, 1, 0],
+            index=Categorical(
+                ["b", "a", "c", "d"], categories=list("abcd"), ordered=True
+            ),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_value_counts_dropna(self):
+        # https://github.com/pandas-dev/pandas/issues/9443#issuecomment-73719328
+
+        tm.assert_series_equal(
+            Series([True, True, False]).value_counts(dropna=True),
+            Series([2, 1], index=[True, False], name="count"),
+        )
+        tm.assert_series_equal(
+            Series([True, True, False]).value_counts(dropna=False),
+            Series([2, 1], index=[True, False], name="count"),
+        )
+
+        tm.assert_series_equal(
+            Series([True] * 3 + [False] * 2 + [None] * 5).value_counts(dropna=True),
+            Series([3, 2], index=Index([True, False], dtype=object), name="count"),
+        )
+        tm.assert_series_equal(
+            Series([True] * 5 + [False] * 3 + [None] * 2).value_counts(dropna=False),
+            Series([5, 3, 2], index=[True, False, None], name="count"),
+        )
+        tm.assert_series_equal(
+            Series([10.3, 5.0, 5.0]).value_counts(dropna=True),
+            Series([2, 1], index=[5.0, 10.3], name="count"),
+        )
+        tm.assert_series_equal(
+            Series([10.3, 5.0, 5.0]).value_counts(dropna=False),
+            Series([2, 1], index=[5.0, 10.3], name="count"),
+        )
+
+        tm.assert_series_equal(
+            Series([10.3, 5.0, 5.0, None]).value_counts(dropna=True),
+            Series([2, 1], index=[5.0, 10.3], name="count"),
+        )
+
+        result = Series([10.3, 10.3, 5.0, 5.0, 5.0, None]).value_counts(dropna=False)
+        expected = Series([3, 2, 1], index=[5.0, 10.3, None], name="count")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("dtype", (np.float64, object, "M8[ns]"))
+    def test_value_counts_normalized(self, dtype):
+        # GH12558
+        s = Series([1] * 2 + [2] * 3 + [np.nan] * 5)
+        s_typed = s.astype(dtype)
+        result = s_typed.value_counts(normalize=True, dropna=False)
+        expected = Series(
+            [0.5, 0.3, 0.2],
+            index=Series([np.nan, 2.0, 1.0], dtype=dtype),
+            name="proportion",
+        )
+        tm.assert_series_equal(result, expected)
+
+        result = s_typed.value_counts(normalize=True, dropna=True)
+        expected = Series(
+            [0.6, 0.4], index=Series([2.0, 1.0], dtype=dtype), name="proportion"
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_value_counts_uint64(self):
+        arr = np.array([2**63], dtype=np.uint64)
+        expected = Series([1], index=[2**63], name="count")
+        result = algos.value_counts_internal(arr)
+
+        tm.assert_series_equal(result, expected)
+
+        arr = np.array([-1, 2**63], dtype=object)
+        expected = Series([1, 1], index=[-1, 2**63], name="count")
+        result = algos.value_counts_internal(arr)
+
+        tm.assert_series_equal(result, expected)
+
+    def test_value_counts_series(self):
+        # GH#54857
+        values = np.array([3, 1, 2, 3, 4, np.nan])
+        result = Series(values).value_counts(bins=3)
+        expected = Series(
+            [2, 2, 1],
+            index=IntervalIndex.from_tuples(
+                [(0.996, 2.0), (2.0, 3.0), (3.0, 4.0)], dtype="interval[float64, right]"
+            ),
+            name="count",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_value_counts_stability(self):
+        # GH 63155
+        arr = np.random.default_rng(2).integers(0, 32, 64)
+        result = algos.value_counts_internal(arr, sort=True)
+
+        value_counts = Series(arr).value_counts(sort=False)
+        expected = value_counts.sort_values(ascending=False, kind="stable")
+        tm.assert_series_equal(result, expected)
+
+        unstable_sorted = value_counts.sort_values(ascending=False, kind="quicksort")
+        with pytest.raises(AssertionError):
+            tm.assert_series_equal(result, unstable_sorted)
+
+
+class TestDuplicated:
+    def test_duplicated_with_nas(self):
+        keys = np.array([0, 1, np.nan, 0, 2, np.nan], dtype=object)
+
+        result = algos.duplicated(keys)
+        expected = np.array([False, False, False, True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep="first")
+        expected = np.array([False, False, False, True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep="last")
+        expected = np.array([True, False, True, False, False, False])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep=False)
+        expected = np.array([True, False, True, True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+        keys = np.empty(8, dtype=object)
+        for i, t in enumerate(
+            zip([0, 0, np.nan, np.nan] * 2, [0, np.nan, 0, np.nan] * 2, strict=True)
+        ):
+            keys[i] = t
+
+        result = algos.duplicated(keys)
+        falses = [False] * 4
+        trues = [True] * 4
+        expected = np.array(falses + trues)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep="last")
+        expected = np.array(trues + falses)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.duplicated(keys, keep=False)
+        expected = np.array(trues + trues)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "case",
+        [
+            np.array([1, 2, 1, 5, 3, 2, 4, 1, 5, 6]),
+            np.array([1.1, 2.2, 1.1, np.nan, 3.3, 2.2, 4.4, 1.1, np.nan, 6.6]),
+            np.array(
+                [
+                    1 + 1j,
+                    2 + 2j,
+                    1 + 1j,
+                    5 + 5j,
+                    3 + 3j,
+                    2 + 2j,
+                    4 + 4j,
+                    1 + 1j,
+                    5 + 5j,
+                    6 + 6j,
+                ]
+            ),
+            np.array(["a", "b", "a", "e", "c", "b", "d", "a", "e", "f"], dtype=object),
+            np.array([1, 2**63, 1, 3**5, 10, 2**63, 39, 1, 3**5, 7], dtype=np.uint64),
+        ],
+    )
+    def test_numeric_object_likes(self, case):
+        exp_first = np.array(
+            [False, False, True, False, False, True, False, True, True, False]
+        )
+        exp_last = np.array(
+            [True, True, True, True, False, False, False, False, False, False]
+        )
+        exp_false = exp_first | exp_last
+
+        res_first = algos.duplicated(case, keep="first")
+        tm.assert_numpy_array_equal(res_first, exp_first)
+
+        res_last = algos.duplicated(case, keep="last")
+        tm.assert_numpy_array_equal(res_last, exp_last)
+
+        res_false = algos.duplicated(case, keep=False)
+        tm.assert_numpy_array_equal(res_false, exp_false)
+
+        # index
+        for idx in [Index(case), Index(case, dtype="category")]:
+            res_first = idx.duplicated(keep="first")
+            tm.assert_numpy_array_equal(res_first, exp_first)
+
+            res_last = idx.duplicated(keep="last")
+            tm.assert_numpy_array_equal(res_last, exp_last)
+
+            res_false = idx.duplicated(keep=False)
+            tm.assert_numpy_array_equal(res_false, exp_false)
+
+        # series
+        for s in [Series(case), Series(case, dtype="category")]:
+            res_first = s.duplicated(keep="first")
+            tm.assert_series_equal(res_first, Series(exp_first))
+
+            res_last = s.duplicated(keep="last")
+            tm.assert_series_equal(res_last, Series(exp_last))
+
+            res_false = s.duplicated(keep=False)
+            tm.assert_series_equal(res_false, Series(exp_false))
+
+    def test_datetime_likes(self):
+        dt = [
+            "2011-01-01",
+            "2011-01-02",
+            "2011-01-01",
+            "NaT",
+            "2011-01-03",
+            "2011-01-02",
+            "2011-01-04",
+            "2011-01-01",
+            "NaT",
+            "2011-01-06",
+        ]
+        td = [
+            "1 days",
+            "2 days",
+            "1 days",
+            "NaT",
+            "3 days",
+            "2 days",
+            "4 days",
+            "1 days",
+            "NaT",
+            "6 days",
+        ]
+
+        cases = [
+            np.array([Timestamp(d) for d in dt]),
+            np.array([Timestamp(d, tz="US/Eastern") for d in dt]),
+            np.array([Period(d, freq="D") for d in dt]),
+            np.array([np.datetime64(d) for d in dt]),
+            np.array([Timedelta(d) for d in td]),
+        ]
+
+        exp_first = np.array(
+            [False, False, True, False, False, True, False, True, True, False]
+        )
+        exp_last = np.array(
+            [True, True, True, True, False, False, False, False, False, False]
+        )
+        exp_false = exp_first | exp_last
+
+        for case in cases:
+            res_first = algos.duplicated(case, keep="first")
+            tm.assert_numpy_array_equal(res_first, exp_first)
+
+            res_last = algos.duplicated(case, keep="last")
+            tm.assert_numpy_array_equal(res_last, exp_last)
+
+            res_false = algos.duplicated(case, keep=False)
+            tm.assert_numpy_array_equal(res_false, exp_false)
+
+            # index
+            for idx in [
+                Index(case),
+                Index(case, dtype="category"),
+                Index(case, dtype=object),
+            ]:
+                res_first = idx.duplicated(keep="first")
+                tm.assert_numpy_array_equal(res_first, exp_first)
+
+                res_last = idx.duplicated(keep="last")
+                tm.assert_numpy_array_equal(res_last, exp_last)
+
+                res_false = idx.duplicated(keep=False)
+                tm.assert_numpy_array_equal(res_false, exp_false)
+
+            # series
+            for s in [
+                Series(case),
+                Series(case, dtype="category"),
+                Series(case, dtype=object),
+            ]:
+                res_first = s.duplicated(keep="first")
+                tm.assert_series_equal(res_first, Series(exp_first))
+
+                res_last = s.duplicated(keep="last")
+                tm.assert_series_equal(res_last, Series(exp_last))
+
+                res_false = s.duplicated(keep=False)
+                tm.assert_series_equal(res_false, Series(exp_false))
+
+    @pytest.mark.parametrize("case", [Index([1, 2, 3]), pd.RangeIndex(0, 3)])
+    def test_unique_index(self, case):
+        assert case.is_unique is True
+        tm.assert_numpy_array_equal(case.duplicated(), np.array([False, False, False]))
+
+    @pytest.mark.parametrize(
+        "arr, uniques",
+        [
+            (
+                [(0, 0), (0, 1), (1, 0), (1, 1), (0, 0), (0, 1), (1, 0), (1, 1)],
+                [(0, 0), (0, 1), (1, 0), (1, 1)],
+            ),
+            (
+                [("b", "c"), ("a", "b"), ("a", "b"), ("b", "c")],
+                [("b", "c"), ("a", "b")],
+            ),
+            ([("a", 1), ("b", 2), ("a", 3), ("a", 1)], [("a", 1), ("b", 2), ("a", 3)]),
+        ],
+    )
+    def test_unique_tuples(self, arr, uniques):
+        # https://github.com/pandas-dev/pandas/issues/16519
+        expected = np.empty(len(uniques), dtype=object)
+        expected[:] = uniques
+
+        msg = (
+            r"unique requires a Series, Index, ExtensionArray, np.ndarray "
+            r"or NumpyExtensionArray got list"
+        )
+        with pytest.raises(TypeError, match=msg):
+            # GH#52986
+            pd.unique(arr)
+
+        res = pd.unique(com.asarray_tuplesafe(arr, dtype=object))
+        tm.assert_numpy_array_equal(res, expected)
+
+    @pytest.mark.parametrize(
+        "array,expected",
+        [
+            (
+                [1 + 1j, 0, 1, 1j, 1 + 2j, 1 + 2j],
+                np.array([(1 + 1j), 0j, (1 + 0j), 1j, (1 + 2j)], dtype=complex),
+            )
+        ],
+    )
+    def test_unique_complex_numbers(self, array, expected):
+        # GH 17927
+        msg = (
+            r"unique requires a Series, Index, ExtensionArray, np.ndarray "
+            r"or NumpyExtensionArray got list"
+        )
+
+        with pytest.raises(TypeError, match=msg):
+            # GH#52986
+            pd.unique(array)
+
+        res = pd.unique(np.array(array))
+        tm.assert_numpy_array_equal(res, expected)
+
+
+class TestHashTable:
+    @pytest.mark.parametrize(
+        "htable, data",
+        [
+            (
+                ht.PyObjectHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
+            (
+                ht.StringHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
+            (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
+            (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
+            (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
+        ],
+    )
+    def test_hashtable_unique(self, htable, data, writable):
+        # output of maker has guaranteed unique elements
+        s = Series(data, dtype=data.dtype)
+        if htable == ht.Float64HashTable:
+            # add NaN for float column
+            s.loc[500] = np.nan
+        elif htable == ht.PyObjectHashTable:
+            # use different NaN types for object column
+            s.loc[500:502] = [np.nan, None, NaT]
+
+        # create duplicated selection
+        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
+        s_duplicated.values.setflags(write=writable)
+
+        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
+        # and is tested separately; keeps first occurrence like ht.unique()
+        expected_unique = s_duplicated.drop_duplicates(keep="first").values
+        result_unique = htable().unique(s_duplicated.values)
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+
+        # test return_inverse=True
+        # reconstruction can only succeed if the inverse is correct
+        result_unique, result_inverse = htable().unique(
+            s_duplicated.values, return_inverse=True
+        )
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+        reconstr = result_unique[result_inverse]
+        tm.assert_numpy_array_equal(reconstr, s_duplicated.values)
+
+    @pytest.mark.parametrize(
+        "htable, data",
+        [
+            (
+                ht.PyObjectHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
+            (
+                ht.StringHashTable,
+                np.array([f"foo_{i}" for i in range(1000)], dtype=object),
+            ),
+            (ht.Float64HashTable, np.arange(1000, dtype=np.float64)),
+            (ht.Int64HashTable, np.arange(1000, dtype=np.int64)),
+            (ht.UInt64HashTable, np.arange(1000, dtype=np.uint64)),
+        ],
+    )
+    def test_hashtable_factorize(self, htable, writable, data):
+        # output of maker has guaranteed unique elements
+        s = Series(data, dtype=data.dtype)
+        if htable == ht.Float64HashTable:
+            # add NaN for float column
+            s.loc[500] = np.nan
+        elif htable == ht.PyObjectHashTable:
+            # use different NaN types for object column
+            s.loc[500:502] = [np.nan, None, NaT]
+
+        # create duplicated selection
+        s_duplicated = s.sample(frac=3, replace=True).reset_index(drop=True)
+        s_duplicated.values.setflags(write=writable)
+        na_mask = s_duplicated.isna().values
+
+        result_unique, result_inverse = htable().factorize(s_duplicated.values)
+
+        # drop_duplicates has own cython code (hash_table_func_helper.pxi)
+        # and is tested separately; keeps first occurrence like ht.factorize()
+        # since factorize removes all NaNs, we do the same here
+        expected_unique = s_duplicated.dropna().drop_duplicates().values
+        tm.assert_numpy_array_equal(result_unique, expected_unique)
+
+        # reconstruction can only succeed if the inverse is correct. Since
+        # factorize removes the NaNs, those have to be excluded here as well
+        result_reconstruct = result_unique[result_inverse[~na_mask]]
+        expected_reconstruct = s_duplicated.dropna().values
+        tm.assert_numpy_array_equal(result_reconstruct, expected_reconstruct)
+
+
+class TestRank:
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            [np.nan, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 3, np.nan],
+            [4.0, np.nan, 5.0, 5.0, 5.0, np.nan, 1, 2, 4.0, np.nan],
+        ],
+    )
+    def test_scipy_compat(self, arr):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        arr = np.array(arr)
+
+        mask = ~np.isfinite(arr)
+        result = libalgos.rank_1d(arr)
+        arr[mask] = np.inf
+        exp = sp_stats.rankdata(arr)
+        exp[mask] = np.nan
+        tm.assert_almost_equal(result, exp)
+
+    def test_basic(self, writable, any_int_numpy_dtype):
+        exp = np.array([1, 2], dtype=np.float64)
+
+        data = np.array([1, 100], dtype=any_int_numpy_dtype)
+        data.setflags(write=writable)
+        ser = Series(data)
+        result = algos.rank(ser)
+        tm.assert_numpy_array_equal(result, exp)
+
+    @pytest.mark.parametrize("dtype", [np.float64, np.uint64])
+    def test_uint64_overflow(self, dtype):
+        exp = np.array([1, 2], dtype=np.float64)
+
+        s = Series([1, 2**63], dtype=dtype)
+        tm.assert_numpy_array_equal(algos.rank(s), exp)
+
+    @pytest.mark.parametrize("method", ["average", "min", "max"])
+    def test_rank_tiny_values(self, method):
+        # GH62036: regression test for ranking with tiny float values
+        exp = np.array([4.0, 1.0, 3.0, np.nan, 2.0], dtype=np.float64)
+        s = Series(
+            [5.4954145e29, -9.791984e-21, 9.3715776e-26, pd.NA, 1.8790257e-28],
+            dtype="Float64",
+        )
+        s = s.astype(object)
+        result = algos.rank(s, method=method)
+        tm.assert_numpy_array_equal(result, exp)
+
+    def test_too_many_ndims(self):
+        arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
+        msg = "Array with ndim > 2 are not supported"
+
+        with pytest.raises(TypeError, match=msg):
+            algos.rank(arr)
+
+    @pytest.mark.single_cpu
+    def test_pct_max_many_rows(self):
+        # GH 18271
+        values = np.arange(2**24 + 1)
+        result = algos.rank(values, pct=True).max()
+        assert result == 1
+
+        values = np.arange(2**25 + 2).reshape(2**24 + 1, 2)
+        result = algos.rank(values, pct=True).max()
+        assert result == 1
+
+
+class TestMode:
+    def test_no_mode(self):
+        exp = Series([], dtype=np.float64, index=Index([], dtype=int))
+        result, _ = algos.mode(np.array([]))
+        tm.assert_numpy_array_equal(result, exp.values)
+
+    def test_mode_single(self, any_real_numpy_dtype):
+        # GH 15714
+        exp_single = [1]
+        data_single = [1]
+
+        exp_multi = [1]
+        data_multi = [1, 1]
+
+        ser = Series(data_single, dtype=any_real_numpy_dtype)
+        exp = Series(exp_single, dtype=any_real_numpy_dtype)
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+        ser = Series(data_multi, dtype=any_real_numpy_dtype)
+        exp = Series(exp_multi, dtype=any_real_numpy_dtype)
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_mode_obj_int(self):
+        exp = Series([1], dtype=int)
+        result, _ = algos.mode(exp.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+
+        exp = Series(["a", "b", "c"], dtype=object)
+        result, _ = algos.mode(exp.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+
+    def test_number_mode(self, any_real_numpy_dtype):
+        exp_single = [1]
+        data_single = [1] * 5 + [2] * 3
+
+        exp_multi = [1, 3]
+        data_multi = [1] * 5 + [2] * 3 + [3] * 5
+
+        ser = Series(data_single, dtype=any_real_numpy_dtype)
+        exp = Series(exp_single, dtype=any_real_numpy_dtype)
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+        ser = Series(data_multi, dtype=any_real_numpy_dtype)
+        exp = Series(exp_multi, dtype=any_real_numpy_dtype)
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_strobj_mode(self):
+        exp = ["b"]
+        data = ["a"] * 2 + ["b"] * 3
+
+        ser = Series(data, dtype="c")
+        exp = Series(exp, dtype="c")
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    @pytest.mark.parametrize("dt", [str, object])
+    def test_strobj_multi_char(self, dt, using_infer_string):
+        exp = ["bar"]
+        data = ["foo"] * 2 + ["bar"] * 3
+
+        ser = Series(data, dtype=dt)
+        exp = Series(exp, dtype=dt)
+        result, _ = algos.mode(ser.values)
+        if using_infer_string and dt is str:
+            tm.assert_extension_array_equal(result, exp.values)
+        else:
+            tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_datelike_mode(self):
+        exp = Series(["1900-05-03", "2011-01-03", "2013-01-02"], dtype="M8[ns]")
+        ser = Series(["2011-01-03", "2013-01-02", "1900-05-03"], dtype="M8[ns]")
+        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+        exp = Series(["2011-01-03", "2013-01-02"], dtype="M8[ns]")
+        ser = Series(
+            ["2011-01-03", "2013-01-02", "1900-05-03", "2011-01-03", "2013-01-02"],
+            dtype="M8[ns]",
+        )
+        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_timedelta_mode(self):
+        exp = Series(["-1 days", "0 days", "1 days"], dtype="timedelta64[ns]")
+        ser = Series(["1 days", "-1 days", "0 days"], dtype="timedelta64[ns]")
+        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+        exp = Series(["2 min", "1 day"], dtype="timedelta64[ns]")
+        ser = Series(
+            ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
+            dtype="timedelta64[ns]",
+        )
+        tm.assert_extension_array_equal(algos.mode(ser.values), exp._values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_mixed_dtype(self):
+        exp = Series(["foo"], dtype=object)
+        ser = Series([1, "foo", "foo"])
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_uint64_overflow(self):
+        exp = Series([2**63], dtype=np.uint64)
+        ser = Series([1, 2**63, 2**63], dtype=np.uint64)
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+        exp = Series([1, 2**63], dtype=np.uint64)
+        ser = Series([1, 2**63], dtype=np.uint64)
+        result, _ = algos.mode(ser.values)
+        tm.assert_numpy_array_equal(result, exp.values)
+        tm.assert_series_equal(ser.mode(), exp)
+
+    def test_categorical(self):
+        c = Categorical([1, 2])
+        exp = c
+        res = Series(c).mode()._values
+        tm.assert_categorical_equal(res, exp)
+
+        c = Categorical([1, "a", "a"])
+        exp = Categorical(["a"], categories=[1, "a"])
+        res = Series(c).mode()._values
+        tm.assert_categorical_equal(res, exp)
+
+        c = Categorical([1, 1, 2, 3, 3])
+        exp = Categorical([1, 3], categories=[1, 2, 3])
+        res = Series(c).mode()._values
+        tm.assert_categorical_equal(res, exp)
+
+    def test_index(self):
+        idx = Index([1, 2, 3])
+        exp = Series([1, 2, 3], dtype=np.int64)
+        result, _ = algos.mode(idx)
+        tm.assert_numpy_array_equal(result, exp.values)
+
+        idx = Index([1, "a", "a"])
+        exp = Series(["a"], dtype=object)
+        result, _ = algos.mode(idx)
+        tm.assert_numpy_array_equal(result, exp.values)
+
+        idx = Index([1, 1, 2, 3, 3])
+        exp = Series([1, 3], dtype=np.int64)
+        result, _ = algos.mode(idx)
+        tm.assert_numpy_array_equal(result, exp.values)
+
+        idx = Index(
+            ["1 day", "1 day", "-1 day", "-1 day 2 min", "2 min", "2 min"],
+            dtype="timedelta64[ns]",
+        )
+        with pytest.raises(AttributeError, match="TimedeltaIndex"):
+            # algos.mode expects Arraylike, does *not* unwrap TimedeltaIndex
+            algos.mode(idx)
+
+    def test_ser_mode_with_name(self):
+        # GH 46737
+        ser = Series([1, 1, 3], name="foo")
+        result = ser.mode()
+        expected = Series([1], name="foo")
+        tm.assert_series_equal(result, expected)
+
+
+class TestDiff:
+    @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
+    def test_diff_datetimelike_nat(self, dtype):
+        # NaT - NaT is NaT, not 0
+        arr = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4)
+        arr[:, 2] = arr.dtype.type("NaT", "ns")
+        result = algos.diff(arr, 1, axis=0)
+
+        expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4
+        expected[:, 2] = np.timedelta64("NaT", "ns")
+        expected[0, :] = np.timedelta64("NaT", "ns")
+
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.diff(arr.T, 1, axis=1)
+        tm.assert_numpy_array_equal(result, expected.T)
+
+    def test_diff_ea_axis(self):
+        dta = date_range("2016-01-01", periods=3, tz="US/Pacific")._data
+
+        msg = "cannot diff DatetimeArray on axis=1"
+        with pytest.raises(ValueError, match=msg):
+            algos.diff(dta, 1, axis=1)
+
+    @pytest.mark.parametrize("dtype", ["int8", "int16"])
+    def test_diff_low_precision_int(self, dtype):
+        arr = np.array([0, 1, 1, 0, 0], dtype=dtype)
+        result = algos.diff(arr, 1)
+        expected = np.array([np.nan, 1, 0, -1, 0], dtype="float32")
+        tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", [np.array, pd.array])
+def test_union_with_duplicates(op):
+    # GH#36289
+    lvals = op([3, 1, 3, 4])
+    rvals = op([2, 3, 1, 1])
+    expected = op([3, 3, 1, 1, 4, 2])
+    if isinstance(expected, np.ndarray):
+        result = algos.union_with_duplicates(lvals, rvals)
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        result = algos.union_with_duplicates(lvals, rvals)
+        tm.assert_extension_array_equal(result, expected)
diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py
new file mode 100644
index 0000000000000000000000000000000000000000..74cac1b8d1c1e0b105dc33270fb48e55708622c8
--- /dev/null
+++ b/pandas/tests/test_col.py
@@ -0,0 +1,293 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas._libs.properties import cache_readonly
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.api.typing import Expression
+from pandas.tests.test_register_accessor import ensure_removed
+
+
+@pytest.mark.parametrize(
+    ("expr", "expected_values", "expected_str"),
+    [
+        (pd.col("a"), [1, 2], "col('a')"),
+        (pd.col("a") * 2, [2, 4], "col('a') * 2"),
+        (pd.col("a").sum(), [3, 3], "col('a').sum()"),
+        (pd.col("a") + 1, [2, 3], "col('a') + 1"),
+        (1 + pd.col("a"), [2, 3], "1 + col('a')"),
+        (pd.col("a") - 1, [0, 1], "col('a') - 1"),
+        (1 - pd.col("a"), [0, -1], "1 - col('a')"),
+        (pd.col("a") * 1, [1, 2], "col('a') * 1"),
+        (1 * pd.col("a"), [1, 2], "1 * col('a')"),
+        (pd.col("a") / 1, [1.0, 2.0], "col('a') / 1"),
+        (1 / pd.col("a"), [1.0, 0.5], "1 / col('a')"),
+        (pd.col("a") // 1, [1, 2], "col('a') // 1"),
+        (1 // pd.col("a"), [1, 0], "1 // col('a')"),
+        (pd.col("a") % 1, [0, 0], "col('a') % 1"),
+        (1 % pd.col("a"), [0, 1], "1 % col('a')"),
+        (pd.col("a") > 1, [False, True], "col('a') > 1"),
+        (pd.col("a") >= 1, [True, True], "col('a') >= 1"),
+        (pd.col("a") < 1, [False, False], "col('a') < 1"),
+        (pd.col("a") <= 1, [True, False], "col('a') <= 1"),
+        (pd.col("a") == 1, [True, False], "col('a') == 1"),
+        (np.power(pd.col("a"), 2), [1, 4], "power(col('a'), 2)"),
+        (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"),
+        (
+            (pd.col("a") + 1) * (pd.col("b") + 2),
+            [10, 18],
+            "(col('a') + 1) * (col('b') + 2)",
+        ),
+        (
+            (pd.col("a") - 1).astype("bool"),
+            [False, True],
+            "(col('a') - 1).astype('bool')",
+        ),
+        # Unary operators
+        (-pd.col("a"), [-1, -2], "-col('a')"),
+        (+pd.col("a"), [1, 2], "+col('a')"),
+        (-(pd.col("a") + 1), [-2, -3], "-(col('a') + 1)"),
+        (-pd.col("a") * 2, [-2, -4], "(-col('a')) * 2"),
+        (abs(pd.col("a")), [1, 2], "abs(col('a'))"),
+        (abs(pd.col("a") - 2), [1, 0], "abs(col('a') - 2)"),
+    ],
+)
+def test_col_simple(
+    expr: Expression, expected_values: list[object], expected_str: str
+) -> None:
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    result = df.assign(c=expr)
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": expected_values})
+    tm.assert_frame_equal(result, expected)
+    assert str(expr) == expected_str
+
+
+def test_frame_getitem() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+    result = df[expr]
+    expected = df.iloc[[1]]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_setitem() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+
+    result = df.copy()
+    result[expr] = 100
+    expected = pd.DataFrame({"a": [1, 100], "b": [3, 100]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_loc() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+    result = df.copy()
+    result.loc[expr, "b"] = 100
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 100]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_frame_iloc() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    expr = pd.col("a") == 2
+    result = df.copy()
+    result.iloc[expr, 1] = 100
+    expected = pd.DataFrame({"a": [1, 2], "b": [3, 100]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("expr", "expected_values", "expected_str"),
+    [
+        (pd.col("a").dt.year, [2020], "col('a').dt.year"),
+        (pd.col("a").dt.strftime("%B"), ["January"], "col('a').dt.strftime('%B')"),
+        (pd.col("b").str.upper(), ["FOO"], "col('b').str.upper()"),
+    ],
+)
+def test_namespaces(
+    expr: Expression, expected_values: list[object], expected_str: str
+) -> None:
+    df = pd.DataFrame({"a": [datetime(2020, 1, 1)], "b": ["foo"]})
+    result = df.assign(c=expr)
+    expected = pd.DataFrame(
+        {"a": [datetime(2020, 1, 1)], "b": ["foo"], "c": expected_values}
+    )
+    tm.assert_frame_equal(result, expected, check_dtype=False)
+    assert str(expr) == expected_str
+
+
+def test_invalid() -> None:
+    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
+    with pytest.raises(ValueError, match=r"did you mean one of \['a', 'b'\] instead"):
+        df.assign(c=pd.col("c").mean())
+    df = pd.DataFrame({f"col_{i}": [0] for i in range(11)})
+    msg = (
+        "did you mean one of "
+        r"\['col_0', 'col_1', 'col_2', 'col_3', "
+        "'col_4', 'col_5', 'col_6', 'col_7', "
+        r"'col_8', 'col_9',\.\.\.\] instead"
+    )
+    ""
+    with pytest.raises(ValueError, match=msg):
+        df.assign(c=pd.col("c").mean())
+
+
+def test_custom_accessor() -> None:
+    df = pd.DataFrame({"a": [1, 2, 3]})
+
+    class XYZAccessor:
+        def __init__(self, pandas_obj):
+            self._obj = pandas_obj
+
+        def mean(self):
+            return self._obj.mean()
+
+    with ensure_removed(pd.Series, "xyz"):
+        pd.api.extensions.register_series_accessor("xyz")(XYZAccessor)
+        result = df.assign(b=pd.col("a").xyz.mean())
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [2.0, 2.0, 2.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("expr", "expected_values", "expected_str"),
+    [
+        (
+            pd.col("a") & pd.col("b"),
+            [False, False, True, False],
+            "col('a') & col('b')",
+        ),
+        (
+            pd.col("a") & True,
+            [True, False, True, False],
+            "col('a') & True",
+        ),
+        (
+            pd.col("a") | pd.col("b"),
+            [True, True, True, True],
+            "col('a') | col('b')",
+        ),
+        (
+            pd.col("a") | False,
+            [True, False, True, False],
+            "col('a') | False",
+        ),
+        (
+            pd.col("a") ^ pd.col("b"),
+            [True, True, False, True],
+            "col('a') ^ col('b')",
+        ),
+        (
+            pd.col("a") ^ True,
+            [False, True, False, True],
+            "col('a') ^ True",
+        ),
+        (
+            ~pd.col("a"),
+            [False, True, False, True],
+            "~col('a')",
+        ),
+    ],
+)
+def test_col_logical_ops(
+    expr: Expression, expected_values: list[bool], expected_str: str
+) -> None:
+    # https://github.com/pandas-dev/pandas/issues/63322
+    df = pd.DataFrame({"a": [True, False, True, False], "b": [False, True, True, True]})
+    result = df.assign(c=expr)
+    expected = pd.DataFrame(
+        {
+            "a": [True, False, True, False],
+            "b": [False, True, True, True],
+            "c": expected_values,
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+    assert str(expr) == expected_str
+
+    # Test that the expression works with .loc
+    result = df.loc[expr]
+    expected = df[expected_values]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_expression_getitem() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.col("a")[1]
+    expected_str = "col('a')[1]"
+
+    assert str(expr) == expected_str
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [2, 2, 2]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_property() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.col("a").index
+    expected_str = "col('a').index"
+
+    assert str(expr) == expected_str
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [0, 1, 2]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_cached_property() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    # Ensure test is valid
+    assert isinstance(pd.Index.dtype, cache_readonly)
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.col("a").index.dtype
+    expected_str = "col('a').index.dtype"
+    assert str(expr) == expected_str
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": np.int64})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_qcut() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    expr = pd.qcut(pd.col("a"), 3)
+    expected_str = "qcut(x=col('a'), q=3, labels=None, retbins=False, precision=3)"
+    assert str(expr) == expected_str, str(expr)
+
+    result = df.assign(b=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": pd.qcut(df["a"], 3)})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_where() -> None:
+    # https://github.com/pandas-dev/pandas/pull/63439
+    df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    expr = pd.col("a").where(pd.col("b") == 5, 100)
+    expected_str = "col('a').where(col('b') == 5, 100)"
+    assert str(expr) == expected_str, str(expr)
+
+    result = df.assign(c=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [100, 2, 100]})
+    tm.assert_frame_equal(result, expected)
+
+    expr = pd.col("a").where(pd.col("b") == 5, pd.col("a") + 1)
+    expected_str = "col('a').where(col('b') == 5, col('a') + 1)"
+    assert str(expr) == expected_str, str(expr)
+
+    result = df.assign(c=expr)
+    expected = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [2, 2, 4]})
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
new file mode 100644
index 0000000000000000000000000000000000000000..eab2ce6a2ea943a96471cdfe919dfb8909ce1e8e
--- /dev/null
+++ b/pandas/tests/test_common.py
@@ -0,0 +1,273 @@
+import collections
+from functools import partial
+import string
+import subprocess
+import sys
+
+import numpy as np
+import pytest
+
+from pandas.compat import WASM
+
+import pandas as pd
+from pandas import Series
+import pandas._testing as tm
+from pandas.core import ops
+import pandas.core.common as com
+from pandas.util.version import Version
+
+
+class TestGetCallableName:
+    def fn(self, x):
+        return x
+
+    partial1 = partial(fn)
+    partial2 = partial(partial1)
+    lambda_ = lambda x: x
+
+    class SomeCall:
+        def __call__(self):
+            # This shouldn't actually get called below; SomeCall.__init__
+            #  should.
+            raise NotImplementedError
+
+    @pytest.mark.parametrize(
+        "func, expected",
+        [
+            (fn, "fn"),
+            (partial1, "fn"),
+            (partial2, "fn"),
+            (lambda_, "<lambda>"),
+            (SomeCall(), "SomeCall"),
+            (1, None),
+        ],
+    )
+    def test_get_callable_name(self, func, expected):
+        assert com.get_callable_name(func) == expected
+
+
+class TestRandomState:
+    def test_seed(self):
+        seed = 5
+        assert com.random_state(seed).uniform() == np.random.RandomState(seed).uniform()
+
+    def test_object(self):
+        seed = 10
+        state_obj = np.random.RandomState(seed)
+        assert (
+            com.random_state(state_obj).uniform()
+            == np.random.RandomState(seed).uniform()
+        )
+
+    def test_default(self):
+        assert com.random_state() is np.random
+
+    def test_array_like(self):
+        state = np.random.default_rng(None).integers(0, 2**31, size=624, dtype="uint32")
+        assert (
+            com.random_state(state).uniform() == np.random.RandomState(state).uniform()
+        )
+
+    def test_bit_generators(self):
+        seed = 3
+        assert (
+            com.random_state(np.random.MT19937(seed)).uniform()
+            == np.random.RandomState(np.random.MT19937(seed)).uniform()
+        )
+
+        seed = 11
+        assert (
+            com.random_state(np.random.PCG64(seed)).uniform()
+            == np.random.RandomState(np.random.PCG64(seed)).uniform()
+        )
+
+    @pytest.mark.parametrize("state", ["test", 5.5])
+    def test_error(self, state):
+        msg = (
+            "random_state must be an integer, array-like, a BitGenerator, Generator, "
+            "a numpy RandomState, or None"
+        )
+        with pytest.raises(ValueError, match=msg):
+            com.random_state(state)
+
+
+@pytest.mark.parametrize("args, expected", [((1, 2, None), True), ((1, 2, 3), False)])
+def test_any_none(args, expected):
+    assert com.any_none(*args) is expected
+
+
+@pytest.mark.parametrize(
+    "args, expected",
+    [((1, 2, 3), True), ((1, 2, None), False), ((None, None, None), False)],
+)
+def test_all_not_none(args, expected):
+    assert com.all_not_none(*args) is expected
+
+
+@pytest.mark.parametrize(
+    "left, right, expected",
+    [
+        (Series([1], name="x"), Series([2], name="x"), "x"),
+        (Series([1], name="x"), Series([2], name="y"), None),
+        (Series([1]), Series([2], name="x"), None),
+        (Series([1], name="x"), Series([2]), None),
+        (Series([1], name="x"), [2], "x"),
+        ([1], Series([2], name="y"), "y"),
+        # matching NAs
+        (Series([1], name=np.nan), pd.Index([], name=np.nan), np.nan),
+        (Series([1], name=np.nan), pd.Index([], name=pd.NaT), None),
+        (Series([1], name=pd.NA), pd.Index([], name=pd.NA), pd.NA),
+        # tuple name GH#39757
+        (
+            Series([1], name=np.int64(1)),
+            pd.Index([], name=(np.int64(1), np.int64(2))),
+            None,
+        ),
+        (
+            Series([1], name=(np.int64(1), np.int64(2))),
+            pd.Index([], name=(np.int64(1), np.int64(2))),
+            (np.int64(1), np.int64(2)),
+        ),
+        pytest.param(
+            Series([1], name=(np.float64("nan"), np.int64(2))),
+            pd.Index([], name=(np.float64("nan"), np.int64(2))),
+            (np.float64("nan"), np.int64(2)),
+            marks=pytest.mark.xfail(
+                reason="Not checking for matching NAs inside tuples."
+            ),
+        ),
+    ],
+)
+def test_maybe_match_name(left, right, expected):
+    res = ops.common._maybe_match_name(left, right)
+    assert res is expected or res == expected
+
+
+@pytest.mark.parametrize(
+    "into, msg",
+    [
+        (
+            # uninitialized defaultdict
+            collections.defaultdict,
+            r"to_dict\(\) only accepts initialized defaultdicts",
+        ),
+        (
+            # non-mapping subtypes,, instance
+            [],
+            "unsupported type: <class 'list'>",
+        ),
+        (
+            # non-mapping subtypes, class
+            list,
+            "unsupported type: <class 'list'>",
+        ),
+    ],
+)
+def test_standardize_mapping_type_error(into, msg):
+    with pytest.raises(TypeError, match=msg):
+        com.standardize_mapping(into)
+
+
+def test_standardize_mapping():
+    fill = {"bad": "data"}
+    assert com.standardize_mapping(fill) == dict
+
+    # Convert instance to type
+    assert com.standardize_mapping({}) == dict
+
+    dd = collections.defaultdict(list)
+    assert isinstance(com.standardize_mapping(dd), partial)
+
+
+def test_git_version():
+    # GH 21295
+    git_version = pd.__git_version__
+    assert len(git_version) == 40
+    assert all(c in string.hexdigits for c in git_version)
+
+
+def test_version_tag():
+    version = Version(pd.__version__)
+    try:
+        version > Version("0.0.1")
+    except TypeError as err:
+        raise ValueError(
+            "No git tags exist, please sync tags between upstream and your repo"
+        ) from err
+
+
+@pytest.mark.parametrize("obj", [obj for obj in pd.__dict__.values() if callable(obj)])
+def test_serializable(obj, temp_file):
+    # GH 35611
+    unpickled = tm.round_trip_pickle(obj, temp_file)
+    assert type(obj) == type(unpickled)
+
+
+class TestIsBoolIndexer:
+    def test_non_bool_array_with_na(self):
+        # in particular, this should not raise
+        arr = np.array(["A", "B", np.nan], dtype=object)
+        assert not com.is_bool_indexer(arr)
+
+    def test_list_subclass(self):
+        # GH#42433
+
+        class MyList(list):
+            pass
+
+        val = MyList(["a"])
+
+        assert not com.is_bool_indexer(val)
+
+        val = MyList([True])
+        assert com.is_bool_indexer(val)
+
+    def test_frozenlist(self):
+        # GH#42461
+        data = {"col1": [1, 2], "col2": [3, 4]}
+        df = pd.DataFrame(data=data)
+
+        frozen = df.index.names[1:]
+        assert not com.is_bool_indexer(frozen)
+
+        result = df[frozen]
+        expected = df[[]]
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("scalar", [1, True])
+    def test_numpyextensionarray(self, scalar):
+        # GH 63391
+        arr = pd.arrays.NumpyExtensionArray(np.array([scalar]))
+        assert com.is_bool_indexer(arr) is isinstance(scalar, bool)
+
+
+@pytest.mark.parametrize("with_exception", [True, False])
+def test_temp_setattr(with_exception):
+    # GH#45954
+    ser = Series(dtype=object)
+    ser.name = "first"
+    # Raise a ValueError in either case to satisfy pytest.raises
+    match = "Inside exception raised" if with_exception else "Outside exception raised"
+    with pytest.raises(ValueError, match=match):
+        with com.temp_setattr(ser, "name", "second"):
+            assert ser.name == "second"
+            if with_exception:
+                raise ValueError("Inside exception raised")
+        raise ValueError("Outside exception raised")
+    assert ser.name == "first"
+
+
+@pytest.mark.skipif(WASM, reason="Can't start subprocesses in WASM")
+@pytest.mark.single_cpu
+def test_str_size():
+    # GH#21758
+    a = "a"
+    expected = sys.getsizeof(a)
+    pyexe = sys.executable.replace("\\", "/")
+    call = [
+        pyexe,
+        "-c",
+        "a='a';import sys;sys.getsizeof(a);import pandas;print(sys.getsizeof(a));",
+    ]
+    result = subprocess.check_output(call).decode()[-4:-1].strip("\n")
+    assert int(result) == int(expected)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
new file mode 100644
index 0000000000000000000000000000000000000000..60b6537e11bac0ae201dd52577ae59542b4e81f2
--- /dev/null
+++ b/pandas/tests/test_downstream.py
@@ -0,0 +1,302 @@
+"""
+Testing that we work in the downstream packages
+"""
+
+import array
+from functools import partial
+import importlib
+import subprocess
+import sys
+
+import numpy as np
+import pytest
+
+from pandas.errors import IntCastingNaNError
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Series,
+    TimedeltaIndex,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+
+
+@pytest.fixture
+def df():
+    return DataFrame({"A": [1, 2, 3]})
+
+
+def test_dask(df):
+    # dask sets "compute.use_numexpr" to False, so catch the current value
+    # and ensure to reset it afterwards to avoid impacting other tests
+    olduse = pd.get_option("compute.use_numexpr")
+
+    try:
+        pytest.importorskip("toolz")
+        dd = pytest.importorskip("dask.dataframe")
+
+        ddf = dd.from_pandas(df, npartitions=3)
+        assert ddf.A is not None
+        assert ddf.compute() is not None
+    finally:
+        pd.set_option("compute.use_numexpr", olduse)
+
+
+# TODO(CoW) see https://github.com/pandas-dev/pandas/pull/51082
+@pytest.mark.skip(reason="not implemented with CoW")
+def test_dask_ufunc():
+    # dask sets "compute.use_numexpr" to False, so catch the current value
+    # and ensure to reset it afterwards to avoid impacting other tests
+    olduse = pd.get_option("compute.use_numexpr")
+
+    try:
+        da = pytest.importorskip("dask.array")
+        dd = pytest.importorskip("dask.dataframe")
+
+        s = Series([1.5, 2.3, 3.7, 4.0])
+        ds = dd.from_pandas(s, npartitions=2)
+
+        result = da.log(ds).compute()
+        expected = np.log(s)
+        tm.assert_series_equal(result, expected)
+    finally:
+        pd.set_option("compute.use_numexpr", olduse)
+
+
+def test_construct_dask_float_array_int_dtype_match_ndarray():
+    # GH#40110 make sure we treat a float-dtype dask array with the same
+    #  rules we would for an ndarray
+    dd = pytest.importorskip("dask.dataframe")
+
+    arr = np.array([1, 2.5, 3])
+    darr = dd.from_array(arr)
+
+    res = Series(darr)
+    expected = Series(arr)
+    tm.assert_series_equal(res, expected)
+
+    # GH#49599 in 2.0 we raise instead of silently ignoring the dtype
+    msg = "Trying to coerce float values to integers"
+    with pytest.raises(ValueError, match=msg):
+        Series(darr, dtype="i8")
+
+    msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
+    arr[2] = np.nan
+    with pytest.raises(IntCastingNaNError, match=msg):
+        Series(darr, dtype="i8")
+    # which is the same as we get with a numpy input
+    with pytest.raises(IntCastingNaNError, match=msg):
+        Series(arr, dtype="i8")
+
+
+def test_xarray(df):
+    pytest.importorskip("xarray")
+
+    assert df.to_xarray() is not None
+
+
+def test_xarray_cftimeindex_nearest():
+    # https://github.com/pydata/xarray/issues/3751
+    cftime = pytest.importorskip("cftime")
+    xarray = pytest.importorskip("xarray")
+
+    times = xarray.date_range("0001", periods=2, use_cftime=True)
+    key = cftime.DatetimeGregorian(2000, 1, 1)
+    result = times.get_indexer([key], method="nearest")
+    expected = 1
+    assert result == expected
+
+
+@pytest.mark.single_cpu
+def test_oo_optimizable():
+    # GH 21071
+    subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"])
+
+
+@pytest.mark.single_cpu
+def test_oo_optimized_datetime_index_unpickle():
+    # GH 42866
+    subprocess.check_call(
+        [
+            sys.executable,
+            "-OO",
+            "-c",
+            (
+                "import pandas as pd, pickle; "
+                "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))"
+            ),
+        ]
+    )
+
+
+def test_statsmodels():
+    smf = pytest.importorskip("statsmodels.formula.api")
+
+    df = DataFrame(
+        {"Lottery": range(5), "Literacy": range(5), "Pop1831": range(100, 105)}
+    )
+    smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=df).fit()
+
+
+def test_scikit_learn():
+    pytest.importorskip("sklearn")
+    from sklearn import (
+        datasets,
+        svm,
+    )
+
+    digits = datasets.load_digits()
+    clf = svm.SVC(gamma=0.001, C=100.0)
+    clf.fit(digits.data[:-1], digits.target[:-1])
+    clf.predict(digits.data[-1:])
+
+
+def test_seaborn(mpl_cleanup):
+    seaborn = pytest.importorskip("seaborn")
+    tips = DataFrame(
+        {"day": pd.date_range("2023", freq="D", periods=5), "total_bill": range(5)}
+    )
+    seaborn.stripplot(x="day", y="total_bill", data=tips)
+
+
+@pytest.mark.xfail(reason="pandas_datareader uses old variant of deprecate_kwarg")
+def test_pandas_datareader():
+    # https://github.com/pandas-dev/pandas/pull/61468
+    # https://github.com/pydata/pandas-datareader/issues/1005
+    pytest.importorskip("pandas_datareader")
+
+
+@pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+def test_pyarrow(df):
+    pyarrow = pytest.importorskip("pyarrow")
+    table = pyarrow.Table.from_pandas(df)
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+def test_yaml_dump(df):
+    # GH#42748
+    yaml = pytest.importorskip("yaml")
+
+    dumped = yaml.dump(df)
+
+    loaded = yaml.load(dumped, Loader=yaml.Loader)
+    tm.assert_frame_equal(df, loaded)
+
+    loaded2 = yaml.load(dumped, Loader=yaml.UnsafeLoader)
+    tm.assert_frame_equal(df, loaded2)
+
+
+@pytest.mark.parametrize("dependency", ["numpy", "dateutil"])
+def test_missing_required_dependency(monkeypatch, dependency):
+    # GH#61030
+    original_import = __import__
+    mock_error = ImportError(f"Mock error for {dependency}")
+
+    def mock_import(name, *args, **kwargs):
+        if name == dependency:
+            raise mock_error
+        return original_import(name, *args, **kwargs)
+
+    monkeypatch.setattr("builtins.__import__", mock_import)
+
+    with pytest.raises(ImportError, match=dependency):
+        importlib.reload(importlib.import_module("pandas"))
+
+
+def test_frame_setitem_dask_array_into_new_col(request):
+    # GH#47128
+
+    # dask sets "compute.use_numexpr" to False, so catch the current value
+    # and ensure to reset it afterwards to avoid impacting other tests
+    olduse = pd.get_option("compute.use_numexpr")
+
+    try:
+        dask = pytest.importorskip("dask")
+        da = pytest.importorskip("dask.array")
+        if Version(dask.__version__) <= Version("2025.1.0") and Version(
+            np.__version__
+        ) >= Version("2.1"):
+            request.applymarker(
+                pytest.mark.xfail(reason="loc.__setitem__ incorrectly mutated column c")
+            )
+
+        dda = da.array([1, 2])
+        df = DataFrame({"a": ["a", "b"]})
+        df["b"] = dda
+        df["c"] = dda
+        df.loc[[False, True], "b"] = 100
+        result = df.loc[[1], :]
+        expected = DataFrame({"a": ["b"], "b": [100], "c": [2]}, index=[1])
+        tm.assert_frame_equal(result, expected)
+    finally:
+        pd.set_option("compute.use_numexpr", olduse)
+
+
+def test_pandas_priority():
+    # GH#48347
+
+    class MyClass:
+        __pandas_priority__ = 5000
+
+        def __radd__(self, other):
+            return self
+
+    left = MyClass()
+    right = Series(range(3))
+
+    assert right.__add__(left) is NotImplemented
+    assert right + left is left
+
+
+@pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"])
+@pytest.mark.parametrize(
+    "box", [memoryview, partial(array.array, "i"), "dask", "xarray"]
+)
+def test_from_obscure_array(dtype, box):
+    # GH#24539 recognize e.g xarray, dask, ...
+    # Note: we dont do this for PeriodArray bc _from_sequence won't accept
+    #  an array of integers
+    # TODO: could check with arraylike of Period objects
+    # GH#24539 recognize e.g xarray, dask, ...
+    arr = np.array([1, 2, 3], dtype=np.int64)
+    if box == "dask":
+        da = pytest.importorskip("dask.array")
+        data = da.array(arr)
+    elif box == "xarray":
+        xr = pytest.importorskip("xarray")
+        data = xr.DataArray(arr)
+    else:
+        data = box(arr)
+
+    func = {"M8[ns]": pd.to_datetime, "m8[ns]": pd.to_timedelta}[dtype]
+    result = func(arr).array
+    expected = func(data).array
+    tm.assert_equal(result, expected)
+
+    # Let's check the Indexes while we're here
+    idx_cls = {"M8[ns]": DatetimeIndex, "m8[ns]": TimedeltaIndex}[dtype]
+    result = idx_cls(arr)
+    expected = idx_cls(data)
+    tm.assert_index_equal(result, expected)
+
+
+def test_xarray_coerce_unit():
+    # GH44053
+    xr = pytest.importorskip("xarray")
+
+    arr = xr.DataArray([1, 2, 3])
+    result = pd.to_datetime(arr, unit="ns")
+    expected = DatetimeIndex(
+        [
+            "1970-01-01 00:00:00.000000001",
+            "1970-01-01 00:00:00.000000002",
+            "1970-01-01 00:00:00.000000003",
+        ],
+        dtype="datetime64[ns]",
+        freq=None,
+    )
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9895e89cbf24c893f4a8801da715e0b491a6964
--- /dev/null
+++ b/pandas/tests/test_errors.py
@@ -0,0 +1,144 @@
+import warnings
+
+import pytest
+
+from pandas.errors import (
+    AbstractMethodError,
+    Pandas4Warning,
+    Pandas5Warning,
+    PandasChangeWarning,
+    PandasDeprecationWarning,
+    PandasPendingDeprecationWarning,
+    UndefinedVariableError,
+)
+
+import pandas as pd
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "exc",
+    [
+        "AttributeConflictWarning",
+        "CSSWarning",
+        "CategoricalConversionWarning",
+        "ClosedFileError",
+        "DataError",
+        "DatabaseError",
+        "DtypeWarning",
+        "EmptyDataError",
+        "IncompatibilityWarning",
+        "IndexingError",
+        "InvalidColumnName",
+        "InvalidComparison",
+        "InvalidVersion",
+        "LossySetitemError",
+        "MergeError",
+        "NoBufferPresent",
+        "NumExprClobberingError",
+        "NumbaUtilError",
+        "OptionError",
+        "OutOfBoundsDatetime",
+        "ParserError",
+        "ParserWarning",
+        "PerformanceWarning",
+        "PossibleDataLossError",
+        "PossiblePrecisionLoss",
+        "PyperclipException",
+        "SpecificationError",
+        "UnsortedIndexError",
+        "UnsupportedFunctionCall",
+        "ValueLabelTypeMismatch",
+    ],
+)
+def test_exception_importable(exc):
+    from pandas import errors
+
+    err = getattr(errors, exc)
+    assert err is not None
+
+    # check that we can raise on them
+
+    msg = "^$"
+
+    with pytest.raises(err, match=msg):
+        raise err()
+
+
+def test_catch_oob():
+    from pandas import errors
+
+    msg = "Cannot cast 1500-01-01 00:00:00 to unit='ns' without overflow"
+    with pytest.raises(errors.OutOfBoundsDatetime, match=msg):
+        pd.Timestamp("15000101").as_unit("ns")
+
+
+@pytest.mark.parametrize("is_local", [True, False])
+def test_catch_undefined_variable_error(is_local):
+    variable_name = "x"
+    if is_local:
+        msg = f"local variable '{variable_name}' is not defined"
+    else:
+        msg = f"name '{variable_name}' is not defined"
+
+    with pytest.raises(UndefinedVariableError, match=msg):
+        raise UndefinedVariableError(variable_name, is_local)
+
+
+class Foo:
+    @classmethod
+    def classmethod(cls):
+        raise AbstractMethodError(cls, methodtype="classmethod")
+
+    @property
+    def property(self):
+        raise AbstractMethodError(self, methodtype="property")
+
+    def method(self):
+        raise AbstractMethodError(self)
+
+
+def test_AbstractMethodError_classmethod():
+    xpr = "This classmethod must be defined in the concrete class Foo"
+    with pytest.raises(AbstractMethodError, match=xpr):
+        Foo.classmethod()
+
+    xpr = "This property must be defined in the concrete class Foo"
+    with pytest.raises(AbstractMethodError, match=xpr):
+        Foo().property
+
+    xpr = "This method must be defined in the concrete class Foo"
+    with pytest.raises(AbstractMethodError, match=xpr):
+        Foo().method()
+
+
+@pytest.mark.parametrize(
+    "warn_category, catch_category",
+    [
+        (Pandas4Warning, PandasChangeWarning),
+        (Pandas4Warning, PandasDeprecationWarning),
+        (Pandas5Warning, PandasChangeWarning),
+        (Pandas5Warning, PandasPendingDeprecationWarning),
+    ],
+)
+def test_pandas_warnings(warn_category, catch_category):
+    # https://github.com/pandas-dev/pandas/pull/61468
+    with tm.assert_produces_warning(catch_category):
+        warnings.warn("test", category=warn_category)
+
+
+@pytest.mark.parametrize(
+    "warn_category, filter_category",
+    [
+        (Pandas4Warning, PandasChangeWarning),
+        (Pandas4Warning, PandasDeprecationWarning),
+        (Pandas5Warning, PandasChangeWarning),
+        (Pandas5Warning, PandasPendingDeprecationWarning),
+    ],
+)
+def test_pandas_warnings_filter(warn_category, filter_category):
+    # https://github.com/pandas-dev/pandas/pull/61468
+    # Ensure users can suppress warnings.
+    with tm.assert_produces_warning(None), warnings.catch_warnings():
+        warnings.filterwarnings(category=filter_category, action="ignore")
+        warnings.warn("test", category=warn_category)
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
new file mode 100644
index 0000000000000000000000000000000000000000..11a87f5e353c64bfca6b4642bcc60d42ae3b36d8
--- /dev/null
+++ b/pandas/tests/test_expressions.py
@@ -0,0 +1,475 @@
+import operator
+import re
+
+import numpy as np
+import pytest
+
+from pandas.compat._optional import import_optional_dependency
+
+from pandas import option_context
+import pandas._testing as tm
+from pandas.core.api import DataFrame
+from pandas.core.computation import expressions as expr
+from pandas.util.version import Version
+
+
+@pytest.fixture
+def _frame():
+    return DataFrame(
+        np.random.default_rng(2).standard_normal((10001, 4)),
+        columns=list("ABCD"),
+        dtype="float64",
+    )
+
+
+@pytest.fixture
+def _frame2():
+    return DataFrame(
+        np.random.default_rng(2).standard_normal((100, 4)),
+        columns=list("ABCD"),
+        dtype="float64",
+    )
+
+
+@pytest.fixture
+def _mixed(_frame):
+    return DataFrame(
+        {
+            "A": _frame["A"],
+            "B": _frame["B"].astype("float32"),
+            "C": _frame["C"].astype("int64"),
+            "D": _frame["D"].astype("int32"),
+        }
+    )
+
+
+@pytest.fixture
+def _mixed2(_frame2):
+    return DataFrame(
+        {
+            "A": _frame2["A"],
+            "B": _frame2["B"].astype("float32"),
+            "C": _frame2["C"].astype("int64"),
+            "D": _frame2["D"].astype("int32"),
+        }
+    )
+
+
+@pytest.fixture
+def _integer():
+    return DataFrame(
+        np.random.default_rng(2).integers(1, 100, size=(10001, 4)),
+        columns=list("ABCD"),
+        dtype="int64",
+    )
+
+
+@pytest.fixture
+def _integer_integers(_integer):
+    # integers to get a case with zeros
+    return _integer * np.random.default_rng(2).integers(0, 2, size=np.shape(_integer))
+
+
+@pytest.fixture
+def _integer2():
+    return DataFrame(
+        np.random.default_rng(2).integers(1, 100, size=(101, 4)),
+        columns=list("ABCD"),
+        dtype="int64",
+    )
+
+
+@pytest.fixture
+def _array(_frame):
+    return _frame["A"].to_numpy()
+
+
+@pytest.fixture
+def _array2(_frame2):
+    return _frame2["A"].to_numpy()
+
+
+@pytest.fixture
+def _array_mixed(_mixed):
+    return _mixed["D"].to_numpy()
+
+
+@pytest.fixture
+def _array_mixed2(_mixed2):
+    return _mixed2["D"].to_numpy()
+
+
+@pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr")
+class TestExpressions:
+    @staticmethod
+    def call_op(df, other, flex: bool, opname: str):
+        if flex:
+            op = lambda x, y: getattr(x, opname)(y)
+            op.__name__ = opname
+        else:
+            op = getattr(operator, opname)
+
+        with option_context("compute.use_numexpr", False):
+            expected = op(df, other)
+
+        expr.get_test_result()
+
+        result = op(df, other)
+        return result, expected
+
+    @pytest.mark.parametrize(
+        "fixture",
+        [
+            "_integer",
+            "_integer2",
+            "_integer_integers",
+            "_frame",
+            "_frame2",
+            "_mixed",
+            "_mixed2",
+        ],
+    )
+    @pytest.mark.parametrize("flex", [True, False])
+    @pytest.mark.parametrize(
+        "arith", ["add", "sub", "mul", "mod", "truediv", "floordiv"]
+    )
+    def test_run_arithmetic(self, request, fixture, flex, arith, monkeypatch):
+        df = request.getfixturevalue(fixture)
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 0)
+            result, expected = self.call_op(df, df, flex, arith)
+
+            if arith == "truediv":
+                assert all(x.kind == "f" for x in expected.dtypes.values)
+            tm.assert_equal(expected, result)
+
+            for i in range(len(df.columns)):
+                result, expected = self.call_op(
+                    df.iloc[:, i], df.iloc[:, i], flex, arith
+                )
+                if arith == "truediv":
+                    assert expected.dtype.kind == "f"
+                tm.assert_equal(expected, result)
+
+    @pytest.mark.parametrize(
+        "fixture",
+        [
+            "_integer",
+            "_integer2",
+            "_integer_integers",
+            "_frame",
+            "_frame2",
+            "_mixed",
+            "_mixed2",
+        ],
+    )
+    @pytest.mark.parametrize("flex", [True, False])
+    def test_run_binary(self, request, fixture, flex, comparison_op, monkeypatch):
+        """
+        tests solely that the result is the same whether or not numexpr is
+        enabled.  Need to test whether the function does the correct thing
+        elsewhere.
+        """
+        df = request.getfixturevalue(fixture)
+        arith = comparison_op.__name__
+        with option_context("compute.use_numexpr", False):
+            other = df + 1
+
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 0)
+            expr.set_test_mode(True)
+
+            result, expected = self.call_op(df, other, flex, arith)
+
+            used_numexpr = expr.get_test_result()
+            assert used_numexpr, "Did not use numexpr as expected."
+            tm.assert_equal(expected, result)
+
+            for i in range(len(df.columns)):
+                binary_comp = other.iloc[:, i] + 1
+                self.call_op(df.iloc[:, i], binary_comp, flex, "add")
+
+    def test_invalid(self):
+        array = np.random.default_rng(2).standard_normal(1_000_001)
+        array2 = np.random.default_rng(2).standard_normal(100)
+
+        # no op
+        result = expr._can_use_numexpr(operator.add, None, array, array, "evaluate")
+        assert not result
+
+        # min elements
+        result = expr._can_use_numexpr(operator.add, "+", array2, array2, "evaluate")
+        assert not result
+
+        # ok, we only check on first part of expression
+        result = expr._can_use_numexpr(operator.add, "+", array, array2, "evaluate")
+        assert result
+
+    @pytest.mark.filterwarnings("ignore:invalid value encountered in:RuntimeWarning")
+    @pytest.mark.parametrize(
+        "opname,op_str",
+        [("add", "+"), ("sub", "-"), ("mul", "*"), ("truediv", "/"), ("pow", "**")],
+    )
+    @pytest.mark.parametrize(
+        "left_fix,right_fix", [("_array", "_array2"), ("_array_mixed", "_array_mixed2")]
+    )
+    def test_binary_ops(self, request, opname, op_str, left_fix, right_fix):
+        left = request.getfixturevalue(left_fix)
+        right = request.getfixturevalue(right_fix)
+
+        def testit(left, right, opname, op_str):
+            if opname == "pow":
+                left = np.abs(left)
+
+            op = getattr(operator, opname)
+
+            # array has 0s
+            result = expr.evaluate(op, left, left, use_numexpr=True)
+            expected = expr.evaluate(op, left, left, use_numexpr=False)
+            tm.assert_numpy_array_equal(result, expected)
+
+            result = expr._can_use_numexpr(op, op_str, right, right, "evaluate")
+            assert not result
+
+        with option_context("compute.use_numexpr", False):
+            testit(left, right, opname, op_str)
+
+        expr.set_numexpr_threads(1)
+        testit(left, right, opname, op_str)
+        expr.set_numexpr_threads()
+        testit(left, right, opname, op_str)
+
+    @pytest.mark.parametrize(
+        "left_fix,right_fix", [("_array", "_array2"), ("_array_mixed", "_array_mixed2")]
+    )
+    def test_comparison_ops(self, request, comparison_op, left_fix, right_fix):
+        left = request.getfixturevalue(left_fix)
+        right = request.getfixturevalue(right_fix)
+
+        def testit():
+            f12 = left + 1
+            f22 = right + 1
+
+            op = comparison_op
+
+            result = expr.evaluate(op, left, f12, use_numexpr=True)
+            expected = expr.evaluate(op, left, f12, use_numexpr=False)
+            tm.assert_numpy_array_equal(result, expected)
+
+            result = expr._can_use_numexpr(op, op, right, f22, "evaluate")
+            assert not result
+
+        with option_context("compute.use_numexpr", False):
+            testit()
+
+        expr.set_numexpr_threads(1)
+        testit()
+        expr.set_numexpr_threads()
+        testit()
+
+    @pytest.mark.parametrize("cond", [True, False])
+    @pytest.mark.parametrize("fixture", ["_frame", "_frame2", "_mixed", "_mixed2"])
+    def test_where(self, request, cond, fixture):
+        df = request.getfixturevalue(fixture)
+
+        def testit():
+            c = np.empty(df.shape, dtype=np.bool_)
+            c.fill(cond)
+            result = expr.where(c, df.values, df.values + 1)
+            expected = np.where(c, df.values, df.values + 1)
+            tm.assert_numpy_array_equal(result, expected)
+
+        with option_context("compute.use_numexpr", False):
+            testit()
+
+        expr.set_numexpr_threads(1)
+        testit()
+        expr.set_numexpr_threads()
+        testit()
+
+    @pytest.mark.parametrize(
+        "op_str,opname", [("/", "truediv"), ("//", "floordiv"), ("**", "pow")]
+    )
+    def test_bool_ops_raise_on_arithmetic(self, op_str, opname):
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).random(10) > 0.5,
+                "b": np.random.default_rng(2).random(10) > 0.5,
+            }
+        )
+
+        msg = f"operator '{opname}' not implemented for bool dtypes"
+        f = getattr(operator, opname)
+        err_msg = re.escape(msg)
+
+        with pytest.raises(NotImplementedError, match=err_msg):
+            f(df, df)
+
+        with pytest.raises(NotImplementedError, match=err_msg):
+            f(df.a, df.b)
+
+        with pytest.raises(NotImplementedError, match=err_msg):
+            f(df.a, True)
+
+        with pytest.raises(NotImplementedError, match=err_msg):
+            f(False, df.a)
+
+        with pytest.raises(NotImplementedError, match=err_msg):
+            f(False, df)
+
+        with pytest.raises(NotImplementedError, match=err_msg):
+            f(df, True)
+
+    @pytest.mark.parametrize(
+        "op_str,opname", [("+", "add"), ("*", "mul"), ("-", "sub")]
+    )
+    def test_bool_ops_warn_on_arithmetic(self, op_str, opname, monkeypatch):
+        n = 10
+        df = DataFrame(
+            {
+                "a": np.random.default_rng(2).random(n) > 0.5,
+                "b": np.random.default_rng(2).random(n) > 0.5,
+            }
+        )
+
+        subs = {"+": "|", "*": "&", "-": "^"}
+        sub_funcs = {"|": "or_", "&": "and_", "^": "xor"}
+
+        f = getattr(operator, opname)
+        fe = getattr(operator, sub_funcs[subs[op_str]])
+
+        if op_str == "-":
+            # raises TypeError
+            return
+
+        msg = "operator is not supported by numexpr"
+        ne = import_optional_dependency("numexpr", errors="ignore")
+        warning = (
+            UserWarning
+            if ne
+            and op_str in {"+", "*"}
+            and Version(ne.__version__) < Version("2.13.1")
+            else None
+        )
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 5)
+            with option_context("compute.use_numexpr", True):
+                with tm.assert_produces_warning(warning, match=msg):
+                    r = f(df, df)
+                    e = fe(df, df)
+                    tm.assert_frame_equal(r, e)
+
+                with tm.assert_produces_warning(warning, match=msg):
+                    r = f(df.a, df.b)
+                    e = fe(df.a, df.b)
+                    tm.assert_series_equal(r, e)
+
+                with tm.assert_produces_warning(warning, match=msg):
+                    r = f(df.a, True)
+                    e = fe(df.a, True)
+                    tm.assert_series_equal(r, e)
+
+                with tm.assert_produces_warning(warning, match=msg):
+                    r = f(False, df.a)
+                    e = fe(False, df.a)
+                    tm.assert_series_equal(r, e)
+
+                with tm.assert_produces_warning(warning, match=msg):
+                    r = f(False, df)
+                    e = fe(False, df)
+                    tm.assert_frame_equal(r, e)
+
+                with tm.assert_produces_warning(warning, match=msg):
+                    r = f(df, True)
+                    e = fe(df, True)
+                    tm.assert_frame_equal(r, e)
+
+    @pytest.mark.parametrize(
+        "test_input,expected",
+        [
+            (
+                DataFrame(
+                    [[0, 1, 2, "aa"], [0, 1, 2, "aa"]], columns=["a", "b", "c", "dtype"]
+                ),
+                DataFrame([[False, False], [False, False]], columns=["a", "dtype"]),
+            ),
+            (
+                DataFrame(
+                    [[0, 3, 2, "aa"], [0, 4, 2, "aa"], [0, 1, 1, "bb"]],
+                    columns=["a", "b", "c", "dtype"],
+                ),
+                DataFrame(
+                    [[False, False], [False, False], [False, False]],
+                    columns=["a", "dtype"],
+                ),
+            ),
+        ],
+    )
+    def test_bool_ops_column_name_dtype(self, test_input, expected):
+        # GH 22383 - .ne fails if columns containing column name 'dtype'
+        result = test_input.loc[:, ["a", "dtype"]].ne(test_input.loc[:, ["a", "dtype"]])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arith", ("add", "sub", "mul", "mod", "truediv", "floordiv")
+    )
+    @pytest.mark.parametrize("axis", (0, 1))
+    def test_frame_series_axis(self, axis, arith, _frame, monkeypatch):
+        # GH#26736 Dataframe.floordiv(Series, axis=1) fails
+
+        df = _frame
+        if axis == 1:
+            other = df.iloc[0, :]
+        else:
+            other = df.iloc[:, 0]
+
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 0)
+
+            op_func = getattr(df, arith)
+
+            with option_context("compute.use_numexpr", False):
+                expected = op_func(other, axis=axis)
+
+            result = op_func(other, axis=axis)
+            tm.assert_frame_equal(expected, result)
+
+    @pytest.mark.parametrize(
+        "op",
+        [
+            "__mod__",
+            "__rmod__",
+            "__floordiv__",
+            "__rfloordiv__",
+        ],
+    )
+    @pytest.mark.parametrize("scalar", [-5, 5])
+    def test_python_semantics_with_numexpr_installed(
+        self, op, box_with_array, scalar, monkeypatch
+    ):
+        # https://github.com/pandas-dev/pandas/issues/36047
+        with monkeypatch.context() as m:
+            m.setattr(expr, "_MIN_ELEMENTS", 0)
+            data = np.arange(-50, 50)
+            obj = box_with_array(data)
+            method = getattr(obj, op)
+            result = method(scalar)
+
+            # compare result with numpy
+            with option_context("compute.use_numexpr", False):
+                expected = method(scalar)
+
+            tm.assert_equal(result, expected)
+
+            # compare result element-wise with Python
+            for i, elem in enumerate(data):
+                if box_with_array == DataFrame:
+                    scalar_result = result.iloc[i, 0]
+                else:
+                    scalar_result = result[i]
+                try:
+                    expected = getattr(int(elem), op)(scalar)
+                except ZeroDivisionError:
+                    pass
+                else:
+                    assert scalar_result == expected
diff --git a/pandas/tests/test_flags.py b/pandas/tests/test_flags.py
new file mode 100644
index 0000000000000000000000000000000000000000..9294b3fc3319b78b59d5637acdf3fd75737cd836
--- /dev/null
+++ b/pandas/tests/test_flags.py
@@ -0,0 +1,48 @@
+import pytest
+
+import pandas as pd
+
+
+class TestFlags:
+    def test_equality(self):
+        a = pd.DataFrame().set_flags(allows_duplicate_labels=True).flags
+        b = pd.DataFrame().set_flags(allows_duplicate_labels=False).flags
+
+        assert a == a
+        assert b == b
+        assert a != b
+        assert a != 2
+
+    def test_set(self):
+        df = pd.DataFrame().set_flags(allows_duplicate_labels=True)
+        a = df.flags
+        a.allows_duplicate_labels = False
+        assert a.allows_duplicate_labels is False
+        a["allows_duplicate_labels"] = True
+        assert a.allows_duplicate_labels is True
+
+    def test_repr(self):
+        a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=True).flags)
+        assert a == "<Flags(allows_duplicate_labels=True)>"
+        a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=False).flags)
+        assert a == "<Flags(allows_duplicate_labels=False)>"
+
+    def test_obj_ref(self):
+        df = pd.DataFrame()
+        flags = df.flags
+        del df
+        with pytest.raises(ValueError, match="object has been deleted"):
+            flags.allows_duplicate_labels = True
+
+    def test_getitem(self):
+        df = pd.DataFrame()
+        flags = df.flags
+        assert flags["allows_duplicate_labels"] is True
+        flags["allows_duplicate_labels"] = False
+        assert flags["allows_duplicate_labels"] is False
+
+        with pytest.raises(KeyError, match="a"):
+            flags["a"]
+
+        with pytest.raises(ValueError, match="a"):
+            flags["a"] = 10
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff7ab22c197d8467bf19ed7a42cdbd9305cc88b5
--- /dev/null
+++ b/pandas/tests/test_multilevel.py
@@ -0,0 +1,376 @@
+import datetime
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    ArrowDtype,
+    DataFrame,
+    MultiIndex,
+    Series,
+)
+import pandas._testing as tm
+
+
+class TestMultiLevel:
+    def test_reindex_level(self, multiindex_year_month_day_dataframe_random_data):
+        # axis=0
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        month_sums = ymd.groupby("month").sum()
+        result = month_sums.reindex(ymd.index, level=1)
+        expected = ymd.groupby(level="month").transform("sum")
+
+        tm.assert_frame_equal(result, expected)
+
+        # Series
+        result = month_sums["A"].reindex(ymd.index, level=1)
+        expected = ymd["A"].groupby(level="month").transform("sum")
+        tm.assert_series_equal(result, expected, check_names=False)
+
+    def test_reindex(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        expected = frame.iloc[[0, 3]]
+        reindexed = frame.loc[[("foo", "one"), ("bar", "one")]]
+        tm.assert_frame_equal(reindexed, expected)
+
+    def test_reindex_preserve_levels(
+        self, multiindex_year_month_day_dataframe_random_data
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        new_index = ymd.index[::10]
+        chunk = ymd.reindex(new_index)
+        assert chunk.index.is_(new_index)
+
+        chunk = ymd.loc[new_index]
+        assert chunk.index.equals(new_index)
+
+        ymdT = ymd.T
+        chunk = ymdT.reindex(columns=new_index)
+        assert chunk.columns.is_(new_index)
+
+        chunk = ymdT.loc[:, new_index]
+        assert chunk.columns.equals(new_index)
+
+    def test_groupby_transform(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        s = frame["A"]
+        grouper = s.index.get_level_values(0)
+
+        grouped = s.groupby(grouper, group_keys=False)
+
+        applied = grouped.apply(lambda x: x * 2)
+        expected = grouped.transform(lambda x: x * 2)
+        result = applied.reindex(expected.index)
+        tm.assert_series_equal(result, expected, check_names=False)
+
+    def test_groupby_corner(self):
+        midx = MultiIndex(
+            levels=[["foo"], ["bar"], ["baz"]],
+            codes=[[0], [0], [0]],
+            names=["one", "two", "three"],
+        )
+        df = DataFrame(
+            [np.random.default_rng(2).random(4)],
+            columns=["a", "b", "c", "d"],
+            index=midx,
+        )
+        # should work
+        df.groupby(level="three")
+
+    def test_setitem_with_expansion_multiindex_columns(
+        self, multiindex_year_month_day_dataframe_random_data
+    ):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        df = ymd[:5].T
+        df[2000, 1, 10] = df[2000, 1, 7]
+        assert isinstance(df.columns, MultiIndex)
+        assert (df[2000, 1, 10] == df[2000, 1, 7]).all()
+
+    def test_alignment(self):
+        x = Series(
+            data=[1, 2, 3], index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)])
+        )
+
+        y = Series(
+            data=[4, 5, 6], index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)])
+        )
+
+        res = x - y
+        exp_index = x.index.union(y.index)
+        exp = x.reindex(exp_index) - y.reindex(exp_index)
+        tm.assert_series_equal(res, exp)
+
+        # hit non-monotonic code path
+        res = x[::-1] - y[::-1]
+        exp_index = x.index.union(y.index)
+        exp = x.reindex(exp_index) - y.reindex(exp_index)
+        tm.assert_series_equal(res, exp)
+
+    def test_groupby_multilevel(self, multiindex_year_month_day_dataframe_random_data):
+        ymd = multiindex_year_month_day_dataframe_random_data
+
+        result = ymd.groupby(level=[0, 1]).mean()
+
+        k1 = ymd.index.get_level_values(0)
+        k2 = ymd.index.get_level_values(1)
+
+        expected = ymd.groupby([k1, k2]).mean()
+
+        tm.assert_frame_equal(result, expected)
+        assert result.index.names == ymd.index.names[:2]
+
+        result2 = ymd.groupby(level=ymd.index.names[:2]).mean()
+        tm.assert_frame_equal(result, result2)
+
+    def test_multilevel_consolidate(self):
+        index = MultiIndex.from_tuples(
+            [("foo", "one"), ("foo", "two"), ("bar", "one"), ("bar", "two")]
+        )
+        df = DataFrame(
+            np.random.default_rng(2).standard_normal((4, 4)), index=index, columns=index
+        )
+        df["Totals", ""] = df.sum(axis=1)
+        df = df._consolidate()
+
+    def test_level_with_tuples(self):
+        index = MultiIndex(
+            levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
+            codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+        )
+
+        series = Series(np.random.default_rng(2).standard_normal(6), index=index)
+        frame = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index)
+
+        result = series[("foo", "bar", 0)]
+        result2 = series.loc[("foo", "bar", 0)]
+        expected = series[:2]
+        expected.index = expected.index.droplevel(0)
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result2, expected)
+
+        with pytest.raises(KeyError, match=r"^\(\('foo', 'bar', 0\), 2\)$"):
+            series[("foo", "bar", 0), 2]
+
+        result = frame.loc[("foo", "bar", 0)]
+        result2 = frame.xs(("foo", "bar", 0))
+        expected = frame[:2]
+        expected.index = expected.index.droplevel(0)
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+        index = MultiIndex(
+            levels=[[("foo", "bar"), ("foo", "baz"), ("foo", "qux")], [0, 1]],
+            codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+        )
+
+        series = Series(np.random.default_rng(2).standard_normal(6), index=index)
+        frame = DataFrame(np.random.default_rng(2).standard_normal((6, 4)), index=index)
+
+        result = series[("foo", "bar")]
+        result2 = series.loc[("foo", "bar")]
+        expected = series[:2]
+        expected.index = expected.index.droplevel(0)
+        tm.assert_series_equal(result, expected)
+        tm.assert_series_equal(result2, expected)
+
+        result = frame.loc[("foo", "bar")]
+        result2 = frame.xs(("foo", "bar"))
+        expected = frame[:2]
+        expected.index = expected.index.droplevel(0)
+        tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result2, expected)
+
+    def test_reindex_level_partial_selection(self, multiindex_dataframe_random_data):
+        frame = multiindex_dataframe_random_data
+
+        result = frame.reindex(["foo", "qux"], level=0)
+        expected = frame.iloc[[0, 1, 2, 7, 8, 9]]
+        tm.assert_frame_equal(result, expected)
+
+        result = frame.T.reindex(["foo", "qux"], axis=1, level=0)
+        tm.assert_frame_equal(result, expected.T)
+
+        result = frame.loc[["foo", "qux"]]
+        tm.assert_frame_equal(result, expected)
+
+        result = frame["A"].loc[["foo", "qux"]]
+        tm.assert_series_equal(result, expected["A"])
+
+        result = frame.T.loc[:, ["foo", "qux"]]
+        tm.assert_frame_equal(result, expected.T)
+
+    @pytest.mark.parametrize("d", [4, "d"])
+    def test_empty_frame_groupby_dtypes_consistency(self, d):
+        # GH 20888
+        group_keys = ["a", "b", "c"]
+        df = DataFrame({"a": [1], "b": [2], "c": [3], "d": [d]})
+
+        g = df[df.a == 2].groupby(group_keys)
+        result = g.first().index
+        expected = MultiIndex(
+            levels=[[1], [2], [3]], codes=[[], [], []], names=["a", "b", "c"]
+        )
+
+        tm.assert_index_equal(result, expected)
+
+    def test_duplicate_groupby_issues(self):
+        idx_tp = [
+            ("600809", "20061231"),
+            ("600809", "20070331"),
+            ("600809", "20070630"),
+            ("600809", "20070331"),
+        ]
+        dt = ["demo", "demo", "demo", "demo"]
+
+        idx = MultiIndex.from_tuples(idx_tp, names=["STK_ID", "RPT_Date"])
+        s = Series(dt, index=idx)
+
+        result = s.groupby(s.index).first()
+        assert len(result) == 3
+
+    def test_subsets_multiindex_dtype(self):
+        # GH 20757
+        data = [["x", 1]]
+        columns = [("a", "b", np.nan), ("a", "c", 0.0)]
+        df = DataFrame(data, columns=MultiIndex.from_tuples(columns))
+        expected = df.dtypes.a.b
+        result = df.a.b.dtypes
+        tm.assert_series_equal(result, expected)
+
+    def test_datetime_object_multiindex(self):
+        data_dic = {
+            (0, datetime.date(2018, 3, 3)): {"A": 1, "B": 10},
+            (0, datetime.date(2018, 3, 4)): {"A": 2, "B": 11},
+            (1, datetime.date(2018, 3, 3)): {"A": 3, "B": 12},
+            (1, datetime.date(2018, 3, 4)): {"A": 4, "B": 13},
+        }
+        result = DataFrame.from_dict(data_dic, orient="index")
+        data = {"A": [1, 2, 3, 4], "B": [10, 11, 12, 13]}
+        index = [
+            [0, 0, 1, 1],
+            [
+                datetime.date(2018, 3, 3),
+                datetime.date(2018, 3, 4),
+                datetime.date(2018, 3, 3),
+                datetime.date(2018, 3, 4),
+            ],
+        ]
+        expected = DataFrame(data=data, index=index)
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_multiindex_with_na(self):
+        df = DataFrame(
+            [
+                ["A", np.nan, 1.23, 4.56],
+                ["A", "G", 1.23, 4.56],
+                ["A", "D", 9.87, 10.54],
+            ],
+            columns=["pivot_0", "pivot_1", "col_1", "col_2"],
+        ).set_index(["pivot_0", "pivot_1"])
+
+        df.at[("A", "F"), "col_2"] = 0.0
+
+        expected = DataFrame(
+            [
+                ["A", np.nan, 1.23, 4.56],
+                ["A", "G", 1.23, 4.56],
+                ["A", "D", 9.87, 10.54],
+                ["A", "F", np.nan, 0.0],
+            ],
+            columns=["pivot_0", "pivot_1", "col_1", "col_2"],
+        ).set_index(["pivot_0", "pivot_1"])
+
+        tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize("na", [None, np.nan])
+    def test_multiindex_insert_level_with_na(self, na):
+        # GH 59003
+        df = DataFrame([0], columns=[["A"], ["B"]])
+        df[na, "B"] = 1
+        tm.assert_frame_equal(df[na], DataFrame([1], columns=["B"]))
+
+    def test_multiindex_dt_with_nan(self):
+        # GH#60388
+        df = DataFrame(
+            [
+                [1, np.nan, 5, np.nan],
+                [2, np.nan, 6, np.nan],
+                [np.nan, 3, np.nan, 7],
+                [np.nan, 4, np.nan, 8],
+            ],
+            index=Series(["a", "b", "c", "d"], dtype=object, name="sub"),
+            columns=MultiIndex.from_product(
+                [
+                    ["value1", "value2"],
+                    [datetime.datetime(2024, 11, 1), datetime.datetime(2024, 11, 2)],
+                ],
+                names=[None, "Date"],
+            ),
+        )
+        df = df.reset_index()
+        result = df[df.columns[0]]
+        expected = Series(["a", "b", "c", "d"], name=("sub", np.nan))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.filterwarnings("ignore:Passing a BlockManager:DeprecationWarning")
+    def test_multiindex_with_pyarrow_categorical(self):
+        # GH#53051
+        pa = pytest.importorskip("pyarrow")
+
+        df = DataFrame(
+            {"string_column": ["A", "B", "C"], "number_column": [1, 2, 3]}
+        ).astype(
+            {
+                "string_column": ArrowDtype(pa.dictionary(pa.int32(), pa.string())),
+                "number_column": "float[pyarrow]",
+            }
+        )
+
+        df = df.set_index(["string_column", "number_column"])
+
+        df_expected = DataFrame(
+            index=MultiIndex.from_arrays(
+                [["A", "B", "C"], [1, 2, 3]], names=["string_column", "number_column"]
+            )
+        )
+        tm.assert_frame_equal(
+            df,
+            df_expected,
+            check_index_type=False,
+            check_column_type=False,
+        )
+
+
+class TestSorted:
+    """everything you wanted to test about sorting"""
+
+    def test_sort_non_lexsorted(self):
+        # degenerate case where we sort but don't
+        # have a satisfying result :<
+        # GH 15797
+        idx = MultiIndex(
+            [["A", "B", "C"], ["c", "b", "a"]], [[0, 1, 2, 0, 1, 2], [0, 2, 1, 1, 0, 2]]
+        )
+
+        df = DataFrame({"col": range(len(idx))}, index=idx, dtype="int64")
+        assert df.index.is_monotonic_increasing is False
+
+        sorted = df.sort_index()
+        assert sorted.index.is_monotonic_increasing is True
+
+        expected = DataFrame(
+            {"col": [1, 4, 5, 2]},
+            index=MultiIndex.from_tuples(
+                [("B", "a"), ("B", "c"), ("C", "a"), ("C", "b")]
+            ),
+            dtype="int64",
+        )
+        result = sorted.loc[pd.IndexSlice["B":"C", "a":"c"], :]
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
new file mode 100644
index 0000000000000000000000000000000000000000..531019e7222c75df5874de0175519c9416f940eb
--- /dev/null
+++ b/pandas/tests/test_nanops.py
@@ -0,0 +1,1319 @@
+from functools import partial
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_integer_dtype
+
+import pandas as pd
+from pandas import (
+    Series,
+    isna,
+)
+import pandas._testing as tm
+from pandas.core import nanops
+
+use_bn = nanops._USE_BOTTLENECK
+
+
+@pytest.fixture
+def disable_bottleneck(monkeypatch):
+    with monkeypatch.context() as m:
+        m.setattr(nanops, "_USE_BOTTLENECK", False)
+        yield
+
+
+@pytest.fixture
+def arr_shape():
+    return 11, 7
+
+
+@pytest.fixture
+def arr_float(arr_shape):
+    return np.random.default_rng(2).standard_normal(arr_shape)
+
+
+@pytest.fixture
+def arr_complex(arr_float):
+    return arr_float + arr_float * 1j
+
+
+@pytest.fixture
+def arr_int(arr_shape):
+    return np.random.default_rng(2).integers(-10, 10, arr_shape)
+
+
+@pytest.fixture
+def arr_bool(arr_shape):
+    return np.random.default_rng(2).integers(0, 2, arr_shape) == 0
+
+
+@pytest.fixture
+def arr_str(arr_float):
+    return np.abs(arr_float).astype("S")
+
+
+@pytest.fixture
+def arr_utf(arr_float):
+    return np.abs(arr_float).astype("U")
+
+
+@pytest.fixture
+def arr_date(arr_shape):
+    return np.random.default_rng(2).integers(0, 20000, arr_shape).astype("M8[ns]")
+
+
+@pytest.fixture
+def arr_tdelta(arr_shape):
+    return np.random.default_rng(2).integers(0, 20000, arr_shape).astype("m8[ns]")
+
+
+@pytest.fixture
+def arr_nan(arr_shape):
+    return np.tile(np.nan, arr_shape)
+
+
+@pytest.fixture
+def arr_float_nan(arr_float, arr_nan):
+    return np.vstack([arr_float, arr_nan])
+
+
+@pytest.fixture
+def arr_nan_float1(arr_nan, arr_float):
+    return np.vstack([arr_nan, arr_float])
+
+
+@pytest.fixture
+def arr_nan_nan(arr_nan):
+    return np.vstack([arr_nan, arr_nan])
+
+
+@pytest.fixture
+def arr_inf(arr_float):
+    return arr_float * np.inf
+
+
+@pytest.fixture
+def arr_float_inf(arr_float, arr_inf):
+    return np.vstack([arr_float, arr_inf])
+
+
+@pytest.fixture
+def arr_nan_inf(arr_nan, arr_inf):
+    return np.vstack([arr_nan, arr_inf])
+
+
+@pytest.fixture
+def arr_float_nan_inf(arr_float, arr_nan, arr_inf):
+    return np.vstack([arr_float, arr_nan, arr_inf])
+
+
+@pytest.fixture
+def arr_nan_nan_inf(arr_nan, arr_inf):
+    return np.vstack([arr_nan, arr_nan, arr_inf])
+
+
+@pytest.fixture
+def arr_obj(
+    arr_float, arr_int, arr_bool, arr_complex, arr_str, arr_utf, arr_date, arr_tdelta
+):
+    return np.vstack(
+        [
+            arr_float.astype("O"),
+            arr_int.astype("O"),
+            arr_bool.astype("O"),
+            arr_complex.astype("O"),
+            arr_str.astype("O"),
+            arr_utf.astype("O"),
+            arr_date.astype("O"),
+            arr_tdelta.astype("O"),
+        ]
+    )
+
+
+@pytest.fixture
+def arr_nan_nanj(arr_nan):
+    with np.errstate(invalid="ignore"):
+        return arr_nan + arr_nan * 1j
+
+
+@pytest.fixture
+def arr_complex_nan(arr_complex, arr_nan_nanj):
+    with np.errstate(invalid="ignore"):
+        return np.vstack([arr_complex, arr_nan_nanj])
+
+
+@pytest.fixture
+def arr_nan_infj(arr_inf):
+    with np.errstate(invalid="ignore"):
+        return arr_inf * 1j
+
+
+@pytest.fixture
+def arr_complex_nan_infj(arr_complex, arr_nan_infj):
+    with np.errstate(invalid="ignore"):
+        return np.vstack([arr_complex, arr_nan_infj])
+
+
+@pytest.fixture
+def arr_float_1d(arr_float):
+    return arr_float[:, 0]
+
+
+@pytest.fixture
+def arr_nan_1d(arr_nan):
+    return arr_nan[:, 0]
+
+
+@pytest.fixture
+def arr_float_nan_1d(arr_float_nan):
+    return arr_float_nan[:, 0]
+
+
+@pytest.fixture
+def arr_float1_nan_1d(arr_float1_nan):
+    return arr_float1_nan[:, 0]
+
+
+@pytest.fixture
+def arr_nan_float1_1d(arr_nan_float1):
+    return arr_nan_float1[:, 0]
+
+
+class TestnanopsDataFrame:
+    def setup_method(self):
+        nanops._USE_BOTTLENECK = False
+
+        arr_shape = (11, 7)
+
+        self.arr_float = np.random.default_rng(2).standard_normal(arr_shape)
+        self.arr_float1 = np.random.default_rng(2).standard_normal(arr_shape)
+        self.arr_complex = self.arr_float + self.arr_float1 * 1j
+        self.arr_int = np.random.default_rng(2).integers(-10, 10, arr_shape)
+        self.arr_bool = np.random.default_rng(2).integers(0, 2, arr_shape) == 0
+        self.arr_str = np.abs(self.arr_float).astype("S")
+        self.arr_utf = np.abs(self.arr_float).astype("U")
+        self.arr_date = (
+            np.random.default_rng(2).integers(0, 20000, arr_shape).astype("M8[ns]")
+        )
+        self.arr_tdelta = (
+            np.random.default_rng(2).integers(0, 20000, arr_shape).astype("m8[ns]")
+        )
+
+        self.arr_nan = np.tile(np.nan, arr_shape)
+        self.arr_float_nan = np.vstack([self.arr_float, self.arr_nan])
+        self.arr_float1_nan = np.vstack([self.arr_float1, self.arr_nan])
+        self.arr_nan_float1 = np.vstack([self.arr_nan, self.arr_float1])
+        self.arr_nan_nan = np.vstack([self.arr_nan, self.arr_nan])
+
+        self.arr_inf = self.arr_float * np.inf
+        self.arr_float_inf = np.vstack([self.arr_float, self.arr_inf])
+
+        self.arr_nan_inf = np.vstack([self.arr_nan, self.arr_inf])
+        self.arr_float_nan_inf = np.vstack([self.arr_float, self.arr_nan, self.arr_inf])
+        self.arr_nan_nan_inf = np.vstack([self.arr_nan, self.arr_nan, self.arr_inf])
+        self.arr_obj = np.vstack(
+            [
+                self.arr_float.astype("O"),
+                self.arr_int.astype("O"),
+                self.arr_bool.astype("O"),
+                self.arr_complex.astype("O"),
+                self.arr_str.astype("O"),
+                self.arr_utf.astype("O"),
+                self.arr_date.astype("O"),
+                self.arr_tdelta.astype("O"),
+            ]
+        )
+
+        with np.errstate(invalid="ignore"):
+            self.arr_nan_nanj = self.arr_nan + self.arr_nan * 1j
+            self.arr_complex_nan = np.vstack([self.arr_complex, self.arr_nan_nanj])
+
+            self.arr_nan_infj = self.arr_inf * 1j
+            self.arr_complex_nan_infj = np.vstack([self.arr_complex, self.arr_nan_infj])
+
+        self.arr_float_2d = self.arr_float
+        self.arr_float1_2d = self.arr_float1
+
+        self.arr_nan_2d = self.arr_nan
+        self.arr_float_nan_2d = self.arr_float_nan
+        self.arr_float1_nan_2d = self.arr_float1_nan
+        self.arr_nan_float1_2d = self.arr_nan_float1
+
+        self.arr_float_1d = self.arr_float[:, 0]
+        self.arr_float1_1d = self.arr_float1[:, 0]
+
+        self.arr_nan_1d = self.arr_nan[:, 0]
+        self.arr_float_nan_1d = self.arr_float_nan[:, 0]
+        self.arr_float1_nan_1d = self.arr_float1_nan[:, 0]
+        self.arr_nan_float1_1d = self.arr_nan_float1[:, 0]
+
+    def teardown_method(self):
+        nanops._USE_BOTTLENECK = use_bn
+
+    def check_results(self, targ, res, axis, check_dtype=True):
+        res = getattr(res, "asm8", res)
+
+        if (
+            axis != 0
+            and hasattr(targ, "shape")
+            and targ.ndim
+            and targ.shape != res.shape
+        ):
+            res = np.split(res, [targ.shape[0]], axis=0)[0]
+
+        try:
+            tm.assert_almost_equal(targ, res, check_dtype=check_dtype)
+        except AssertionError:
+            # handle timedelta dtypes
+            if hasattr(targ, "dtype") and targ.dtype == "m8[ns]":
+                raise
+
+            # There are sometimes rounding errors with
+            # complex and object dtypes.
+            # If it isn't one of those, re-raise the error.
+            if not hasattr(res, "dtype") or res.dtype.kind not in ["c", "O"]:
+                raise
+            # convert object dtypes to something that can be split into
+            # real and imaginary parts
+            if res.dtype.kind == "O":
+                if targ.dtype.kind != "O":
+                    res = res.astype(targ.dtype)
+                else:
+                    cast_dtype = "c16" if hasattr(np, "complex128") else "f8"
+                    res = res.astype(cast_dtype)
+                    targ = targ.astype(cast_dtype)
+            # there should never be a case where numpy returns an object
+            # but nanops doesn't, so make that an exception
+            elif targ.dtype.kind == "O":
+                raise
+            tm.assert_almost_equal(np.real(targ), np.real(res), check_dtype=check_dtype)
+            tm.assert_almost_equal(np.imag(targ), np.imag(res), check_dtype=check_dtype)
+
+    def check_fun_data(
+        self,
+        testfunc,
+        targfunc,
+        testar,
+        testarval,
+        targarval,
+        skipna,
+        check_dtype=True,
+        empty_targfunc=None,
+        **kwargs,
+    ):
+        for axis in [*list(range(targarval.ndim)), None]:
+            targartempval = targarval if skipna else testarval
+            if skipna and empty_targfunc and isna(targartempval).all():
+                targ = empty_targfunc(targartempval, axis=axis, **kwargs)
+            else:
+                targ = targfunc(targartempval, axis=axis, **kwargs)
+
+            if targartempval.dtype == object and (
+                targfunc is np.any or targfunc is np.all
+            ):
+                # GH#12863 the numpy functions will retain e.g. floatiness
+                if isinstance(targ, np.ndarray):
+                    targ = targ.astype(bool)
+                else:
+                    targ = bool(targ)
+
+            if testfunc.__name__ in ["nanargmax", "nanargmin"] and (
+                testar.startswith("arr_nan")
+                or (testar.endswith("nan") and (not skipna or axis == 1))
+            ):
+                with pytest.raises(ValueError, match="Encountered .* NA value"):
+                    testfunc(testarval, axis=axis, skipna=skipna, **kwargs)
+                return
+            res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs)
+
+            if (
+                isinstance(targ, np.complex128)
+                and isinstance(res, float)
+                and np.isnan(targ)
+                and np.isnan(res)
+            ):
+                # GH#18463
+                targ = res
+
+            self.check_results(targ, res, axis, check_dtype=check_dtype)
+            if skipna:
+                res = testfunc(testarval, axis=axis, **kwargs)
+                self.check_results(targ, res, axis, check_dtype=check_dtype)
+            if axis is None:
+                res = testfunc(testarval, skipna=skipna, **kwargs)
+                self.check_results(targ, res, axis, check_dtype=check_dtype)
+            if skipna and axis is None:
+                res = testfunc(testarval, **kwargs)
+                self.check_results(targ, res, axis, check_dtype=check_dtype)
+
+        if testarval.ndim <= 1:
+            return
+
+        # Recurse on lower-dimension
+        testarval2 = np.take(testarval, 0, axis=-1)
+        targarval2 = np.take(targarval, 0, axis=-1)
+        self.check_fun_data(
+            testfunc,
+            targfunc,
+            testar,
+            testarval2,
+            targarval2,
+            skipna=skipna,
+            check_dtype=check_dtype,
+            empty_targfunc=empty_targfunc,
+            **kwargs,
+        )
+
+    def check_fun(
+        self, testfunc, targfunc, testar, skipna, empty_targfunc=None, **kwargs
+    ):
+        targar = testar
+        if testar.endswith("_nan") and hasattr(self, testar[:-4]):
+            targar = testar[:-4]
+
+        testarval = getattr(self, testar)
+        targarval = getattr(self, targar)
+        self.check_fun_data(
+            testfunc,
+            targfunc,
+            testar,
+            testarval,
+            targarval,
+            skipna=skipna,
+            empty_targfunc=empty_targfunc,
+            **kwargs,
+        )
+
+    def check_funs(
+        self,
+        testfunc,
+        targfunc,
+        skipna,
+        allow_complex=True,
+        allow_all_nan=True,
+        allow_date=True,
+        allow_tdelta=True,
+        allow_obj=True,
+        **kwargs,
+    ):
+        self.check_fun(testfunc, targfunc, "arr_float", skipna, **kwargs)
+        self.check_fun(testfunc, targfunc, "arr_float_nan", skipna, **kwargs)
+        self.check_fun(testfunc, targfunc, "arr_int", skipna, **kwargs)
+        self.check_fun(testfunc, targfunc, "arr_bool", skipna, **kwargs)
+        objs = [
+            self.arr_float.astype("O"),
+            self.arr_int.astype("O"),
+            self.arr_bool.astype("O"),
+        ]
+
+        if allow_all_nan:
+            self.check_fun(testfunc, targfunc, "arr_nan", skipna, **kwargs)
+
+        if allow_complex:
+            self.check_fun(testfunc, targfunc, "arr_complex", skipna, **kwargs)
+            self.check_fun(testfunc, targfunc, "arr_complex_nan", skipna, **kwargs)
+            if allow_all_nan:
+                self.check_fun(testfunc, targfunc, "arr_nan_nanj", skipna, **kwargs)
+            objs += [self.arr_complex.astype("O")]
+
+        if allow_date:
+            targfunc(self.arr_date)
+            self.check_fun(testfunc, targfunc, "arr_date", skipna, **kwargs)
+            objs += [self.arr_date.astype("O")]
+
+        if allow_tdelta:
+            try:
+                targfunc(self.arr_tdelta)
+            except TypeError:
+                pass
+            else:
+                self.check_fun(testfunc, targfunc, "arr_tdelta", skipna, **kwargs)
+                objs += [self.arr_tdelta.astype("O")]
+
+        if allow_obj:
+            self.arr_obj = np.vstack(objs)
+            # some nanops handle object dtypes better than their numpy
+            # counterparts, so the numpy functions need to be given something
+            # else
+            if allow_obj == "convert":
+                targfunc = partial(
+                    self._badobj_wrap, func=targfunc, allow_complex=allow_complex
+                )
+            self.check_fun(testfunc, targfunc, "arr_obj", skipna, **kwargs)
+
+    def _badobj_wrap(self, value, func, allow_complex=True, **kwargs):
+        if value.dtype.kind == "O":
+            if allow_complex:
+                value = value.astype("c16")
+            else:
+                value = value.astype("f8")
+        return func(value, **kwargs)
+
+    @pytest.mark.parametrize(
+        "nan_op,np_op", [(nanops.nanany, np.any), (nanops.nanall, np.all)]
+    )
+    def test_nan_funcs(self, nan_op, np_op, skipna):
+        self.check_funs(nan_op, np_op, skipna, allow_all_nan=False, allow_date=False)
+
+    def test_nansum(self, skipna):
+        self.check_funs(
+            nanops.nansum,
+            np.sum,
+            skipna,
+            allow_date=False,
+            check_dtype=False,
+            empty_targfunc=np.nansum,
+        )
+
+    def test_nanmean(self, skipna):
+        self.check_funs(
+            nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False
+        )
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_nanmedian(self, skipna):
+        self.check_funs(
+            nanops.nanmedian,
+            np.median,
+            skipna,
+            allow_complex=False,
+            allow_date=False,
+            allow_obj="convert",
+        )
+
+    @pytest.mark.parametrize("ddof", range(3))
+    def test_nanvar(self, ddof, skipna):
+        self.check_funs(
+            nanops.nanvar,
+            np.var,
+            skipna,
+            allow_complex=False,
+            allow_date=False,
+            allow_obj="convert",
+            ddof=ddof,
+        )
+
+    @pytest.mark.parametrize("ddof", range(3))
+    def test_nanstd(self, ddof, skipna):
+        self.check_funs(
+            nanops.nanstd,
+            np.std,
+            skipna,
+            allow_complex=False,
+            allow_date=False,
+            allow_obj="convert",
+            ddof=ddof,
+        )
+
+    @pytest.mark.parametrize("ddof", range(3))
+    def test_nansem(self, ddof, skipna):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        with np.errstate(invalid="ignore"):
+            self.check_funs(
+                nanops.nansem,
+                sp_stats.sem,
+                skipna,
+                allow_complex=False,
+                allow_date=False,
+                allow_tdelta=False,
+                allow_obj="convert",
+                ddof=ddof,
+            )
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    @pytest.mark.parametrize(
+        "nan_op,np_op", [(nanops.nanmin, np.min), (nanops.nanmax, np.max)]
+    )
+    def test_nanops_with_warnings(self, nan_op, np_op, skipna):
+        self.check_funs(nan_op, np_op, skipna, allow_obj=False)
+
+    def _argminmax_wrap(self, value, axis=None, func=None):
+        res = func(value, axis)
+        nans = np.min(value, axis)
+        nullnan = isna(nans)
+        if res.ndim:
+            res[nullnan] = -1
+        elif (hasattr(nullnan, "all") and nullnan.all()) or (
+            not hasattr(nullnan, "all") and nullnan
+        ):
+            res = -1
+        return res
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_nanargmax(self, skipna):
+        func = partial(self._argminmax_wrap, func=np.argmax)
+        self.check_funs(nanops.nanargmax, func, skipna, allow_obj=False)
+
+    @pytest.mark.filterwarnings("ignore::RuntimeWarning")
+    def test_nanargmin(self, skipna):
+        func = partial(self._argminmax_wrap, func=np.argmin)
+        self.check_funs(nanops.nanargmin, func, skipna, allow_obj=False)
+
+    def _skew_kurt_wrap(self, values, axis=None, func=None):
+        if not isinstance(values.dtype.type, np.floating):
+            values = values.astype("f8")
+        result = func(values, axis=axis, bias=False)
+        # fix for handling cases where all elements in an axis are the same
+        if isinstance(result, np.ndarray):
+            result[np.max(values, axis=axis) == np.min(values, axis=axis)] = 0
+            return result
+        elif np.max(values) == np.min(values):
+            return 0.0
+        return result
+
+    def test_nanskew(self, skipna):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        func = partial(self._skew_kurt_wrap, func=sp_stats.skew)
+        with np.errstate(invalid="ignore"):
+            self.check_funs(
+                nanops.nanskew,
+                func,
+                skipna,
+                allow_complex=False,
+                allow_date=False,
+                allow_tdelta=False,
+            )
+
+    def test_nankurt(self, skipna):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        func1 = partial(sp_stats.kurtosis, fisher=True)
+        func = partial(self._skew_kurt_wrap, func=func1)
+        with np.errstate(invalid="ignore"):
+            self.check_funs(
+                nanops.nankurt,
+                func,
+                skipna,
+                allow_complex=False,
+                allow_date=False,
+                allow_tdelta=False,
+            )
+
+    def test_nanprod(self, skipna):
+        self.check_funs(
+            nanops.nanprod,
+            np.prod,
+            skipna,
+            allow_date=False,
+            allow_tdelta=False,
+            empty_targfunc=np.nanprod,
+        )
+
+    def check_nancorr_nancov_2d(self, checkfun, targ0, targ1, **kwargs):
+        res00 = checkfun(self.arr_float_2d, self.arr_float1_2d, **kwargs)
+        res01 = checkfun(
+            self.arr_float_2d,
+            self.arr_float1_2d,
+            min_periods=len(self.arr_float_2d) - 1,
+            **kwargs,
+        )
+        tm.assert_almost_equal(targ0, res00)
+        tm.assert_almost_equal(targ0, res01)
+
+        res10 = checkfun(self.arr_float_nan_2d, self.arr_float1_nan_2d, **kwargs)
+        res11 = checkfun(
+            self.arr_float_nan_2d,
+            self.arr_float1_nan_2d,
+            min_periods=len(self.arr_float_2d) - 1,
+            **kwargs,
+        )
+        tm.assert_almost_equal(targ1, res10)
+        tm.assert_almost_equal(targ1, res11)
+
+        targ2 = np.nan
+        res20 = checkfun(self.arr_nan_2d, self.arr_float1_2d, **kwargs)
+        res21 = checkfun(self.arr_float_2d, self.arr_nan_2d, **kwargs)
+        res22 = checkfun(self.arr_nan_2d, self.arr_nan_2d, **kwargs)
+        res23 = checkfun(self.arr_float_nan_2d, self.arr_nan_float1_2d, **kwargs)
+        res24 = checkfun(
+            self.arr_float_nan_2d,
+            self.arr_nan_float1_2d,
+            min_periods=len(self.arr_float_2d) - 1,
+            **kwargs,
+        )
+        res25 = checkfun(
+            self.arr_float_2d,
+            self.arr_float1_2d,
+            min_periods=len(self.arr_float_2d) + 1,
+            **kwargs,
+        )
+        tm.assert_almost_equal(targ2, res20)
+        tm.assert_almost_equal(targ2, res21)
+        tm.assert_almost_equal(targ2, res22)
+        tm.assert_almost_equal(targ2, res23)
+        tm.assert_almost_equal(targ2, res24)
+        tm.assert_almost_equal(targ2, res25)
+
+    def check_nancorr_nancov_1d(self, checkfun, targ0, targ1, **kwargs):
+        res00 = checkfun(self.arr_float_1d, self.arr_float1_1d, **kwargs)
+        res01 = checkfun(
+            self.arr_float_1d,
+            self.arr_float1_1d,
+            min_periods=len(self.arr_float_1d) - 1,
+            **kwargs,
+        )
+        tm.assert_almost_equal(targ0, res00)
+        tm.assert_almost_equal(targ0, res01)
+
+        res10 = checkfun(self.arr_float_nan_1d, self.arr_float1_nan_1d, **kwargs)
+        res11 = checkfun(
+            self.arr_float_nan_1d,
+            self.arr_float1_nan_1d,
+            min_periods=len(self.arr_float_1d) - 1,
+            **kwargs,
+        )
+        tm.assert_almost_equal(targ1, res10)
+        tm.assert_almost_equal(targ1, res11)
+
+        targ2 = np.nan
+        res20 = checkfun(self.arr_nan_1d, self.arr_float1_1d, **kwargs)
+        res21 = checkfun(self.arr_float_1d, self.arr_nan_1d, **kwargs)
+        res22 = checkfun(self.arr_nan_1d, self.arr_nan_1d, **kwargs)
+        res23 = checkfun(self.arr_float_nan_1d, self.arr_nan_float1_1d, **kwargs)
+        res24 = checkfun(
+            self.arr_float_nan_1d,
+            self.arr_nan_float1_1d,
+            min_periods=len(self.arr_float_1d) - 1,
+            **kwargs,
+        )
+        res25 = checkfun(
+            self.arr_float_1d,
+            self.arr_float1_1d,
+            min_periods=len(self.arr_float_1d) + 1,
+            **kwargs,
+        )
+        tm.assert_almost_equal(targ2, res20)
+        tm.assert_almost_equal(targ2, res21)
+        tm.assert_almost_equal(targ2, res22)
+        tm.assert_almost_equal(targ2, res23)
+        tm.assert_almost_equal(targ2, res24)
+        tm.assert_almost_equal(targ2, res25)
+
+    def test_nancorr(self):
+        targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+        targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
+        self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1)
+        targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
+        targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
+        self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
+
+    def test_nancorr_pearson(self):
+        targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+        targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
+        self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson")
+        targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1]
+        targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
+        self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
+
+    def test_nancorr_kendall(self):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        targ0 = sp_stats.kendalltau(self.arr_float_2d, self.arr_float1_2d)[0]
+        targ1 = sp_stats.kendalltau(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
+        self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="kendall")
+        targ0 = sp_stats.kendalltau(self.arr_float_1d, self.arr_float1_1d)[0]
+        targ1 = sp_stats.kendalltau(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
+        self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="kendall")
+
+    def test_nancorr_spearman(self):
+        sp_stats = pytest.importorskip("scipy.stats")
+
+        targ0 = sp_stats.spearmanr(self.arr_float_2d, self.arr_float1_2d)[0]
+        targ1 = sp_stats.spearmanr(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0]
+        self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="spearman")
+        targ0 = sp_stats.spearmanr(self.arr_float_1d, self.arr_float1_1d)[0]
+        targ1 = sp_stats.spearmanr(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0]
+        self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="spearman")
+
+    def test_invalid_method(self):
+        pytest.importorskip("scipy")
+        targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+        targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
+        msg = "Unknown method 'foo', expected one of 'kendall', 'spearman'"
+        with pytest.raises(ValueError, match=msg):
+            self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="foo")
+
+    def test_nancov(self):
+        targ0 = np.cov(self.arr_float_2d, self.arr_float1_2d)[0, 1]
+        targ1 = np.cov(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1]
+        self.check_nancorr_nancov_2d(nanops.nancov, targ0, targ1)
+        targ0 = np.cov(self.arr_float_1d, self.arr_float1_1d)[0, 1]
+        targ1 = np.cov(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1]
+        self.check_nancorr_nancov_1d(nanops.nancov, targ0, targ1)
+
+
+@pytest.mark.parametrize(
+    "arr, correct",
+    [
+        ("arr_complex", False),
+        ("arr_int", False),
+        ("arr_bool", False),
+        ("arr_str", False),
+        ("arr_utf", False),
+        ("arr_complex_nan", False),
+        ("arr_nan_nanj", False),
+        ("arr_nan_infj", True),
+        ("arr_complex_nan_infj", True),
+    ],
+)
+def test_has_infs_non_float(request, arr, correct, disable_bottleneck):
+    val = request.getfixturevalue(arr)
+    while getattr(val, "ndim", True):
+        res0 = nanops._has_infs(val)
+        if correct:
+            assert res0
+        else:
+            assert not res0
+
+        if not hasattr(val, "ndim"):
+            break
+
+        # Reduce dimension for next step in the loop
+        val = np.take(val, 0, axis=-1)
+
+
+@pytest.mark.parametrize(
+    "arr, correct",
+    [
+        ("arr_float", False),
+        ("arr_nan", False),
+        ("arr_float_nan", False),
+        ("arr_nan_nan", False),
+        ("arr_float_inf", True),
+        ("arr_inf", True),
+        ("arr_nan_inf", True),
+        ("arr_float_nan_inf", True),
+        ("arr_nan_nan_inf", True),
+    ],
+)
+@pytest.mark.parametrize("astype", [None, "f4", "f2"])
+def test_has_infs_floats(request, arr, correct, astype, disable_bottleneck):
+    val = request.getfixturevalue(arr)
+    if astype is not None:
+        val = val.astype(astype)
+    while getattr(val, "ndim", True):
+        res0 = nanops._has_infs(val)
+        if correct:
+            assert res0
+        else:
+            assert not res0
+
+        if not hasattr(val, "ndim"):
+            break
+
+        # Reduce dimension for next step in the loop
+        val = np.take(val, 0, axis=-1)
+
+
+@pytest.mark.parametrize(
+    "fixture", ["arr_float", "arr_complex", "arr_int", "arr_bool", "arr_str", "arr_utf"]
+)
+def test_bn_ok_dtype(fixture, request, disable_bottleneck):
+    obj = request.getfixturevalue(fixture)
+    assert nanops._bn_ok_dtype(obj.dtype, "test")
+
+
+@pytest.mark.parametrize(
+    "fixture",
+    [
+        "arr_date",
+        "arr_tdelta",
+        "arr_obj",
+    ],
+)
+def test_bn_not_ok_dtype(fixture, request, disable_bottleneck):
+    obj = request.getfixturevalue(fixture)
+    assert not nanops._bn_ok_dtype(obj.dtype, "test")
+
+
+class TestEnsureNumeric:
+    def test_numeric_values(self):
+        # Test integer
+        assert nanops._ensure_numeric(1) == 1
+
+        # Test float
+        assert nanops._ensure_numeric(1.1) == 1.1
+
+        # Test complex
+        assert nanops._ensure_numeric(1 + 2j) == 1 + 2j
+
+    def test_ndarray(self):
+        # Test numeric ndarray
+        values = np.array([1, 2, 3])
+        assert np.allclose(nanops._ensure_numeric(values), values)
+
+        # Test object ndarray
+        o_values = values.astype(object)
+        assert np.allclose(nanops._ensure_numeric(o_values), values)
+
+        # Test convertible string ndarray
+        s_values = np.array(["1", "2", "3"], dtype=object)
+        msg = r"Could not convert \['1' '2' '3'\] to numeric"
+        with pytest.raises(TypeError, match=msg):
+            nanops._ensure_numeric(s_values)
+
+        # Test non-convertible string ndarray
+        s_values = np.array(["foo", "bar", "baz"], dtype=object)
+        msg = r"Could not convert .* to numeric"
+        with pytest.raises(TypeError, match=msg):
+            nanops._ensure_numeric(s_values)
+
+    def test_convertable_values(self):
+        with pytest.raises(TypeError, match="Could not convert string '1' to numeric"):
+            nanops._ensure_numeric("1")
+        with pytest.raises(
+            TypeError, match="Could not convert string '1.1' to numeric"
+        ):
+            nanops._ensure_numeric("1.1")
+        with pytest.raises(
+            TypeError, match=r"Could not convert string '1\+1j' to numeric"
+        ):
+            nanops._ensure_numeric("1+1j")
+
+    def test_non_convertable_values(self):
+        msg = "Could not convert string 'foo' to numeric"
+        with pytest.raises(TypeError, match=msg):
+            nanops._ensure_numeric("foo")
+
+        # with the wrong type, python raises TypeError for us
+        msg = "argument must be a string or a number"
+        with pytest.raises(TypeError, match=msg):
+            nanops._ensure_numeric({})
+        with pytest.raises(TypeError, match=msg):
+            nanops._ensure_numeric([])
+
+
+class TestNanvarFixedValues:
+    # xref GH10242
+    # Samples from a normal distribution.
+    @pytest.fixture
+    def variance(self):
+        return 3.0
+
+    @pytest.fixture
+    def samples(self, variance):
+        return self.prng.normal(scale=variance**0.5, size=100000)
+
+    def test_nanvar_all_finite(self, samples, variance):
+        actual_variance = nanops.nanvar(samples)
+        tm.assert_almost_equal(actual_variance, variance, rtol=1e-2)
+
+    def test_nanvar_nans(self, samples, variance):
+        samples_test = np.nan * np.ones(2 * samples.shape[0])
+        samples_test[::2] = samples
+
+        actual_variance = nanops.nanvar(samples_test, skipna=True)
+        tm.assert_almost_equal(actual_variance, variance, rtol=1e-2)
+
+        actual_variance = nanops.nanvar(samples_test, skipna=False)
+        tm.assert_almost_equal(actual_variance, np.nan, rtol=1e-2)
+
+    def test_nanstd_nans(self, samples, variance):
+        samples_test = np.nan * np.ones(2 * samples.shape[0])
+        samples_test[::2] = samples
+
+        actual_std = nanops.nanstd(samples_test, skipna=True)
+        tm.assert_almost_equal(actual_std, variance**0.5, rtol=1e-2)
+
+        actual_std = nanops.nanvar(samples_test, skipna=False)
+        tm.assert_almost_equal(actual_std, np.nan, rtol=1e-2)
+
+    def test_nanvar_axis(self, samples, variance):
+        # Generate some sample data.
+        samples_unif = self.prng.uniform(size=samples.shape[0])
+        samples = np.vstack([samples, samples_unif])
+
+        actual_variance = nanops.nanvar(samples, axis=1)
+        tm.assert_almost_equal(
+            actual_variance, np.array([variance, 1.0 / 12]), rtol=1e-2
+        )
+
+    def test_nanvar_ddof(self):
+        n = 5
+        samples = self.prng.uniform(size=(10000, n + 1))
+        samples[:, -1] = np.nan  # Force use of our own algorithm.
+
+        variance_0 = nanops.nanvar(samples, axis=1, skipna=True, ddof=0).mean()
+        variance_1 = nanops.nanvar(samples, axis=1, skipna=True, ddof=1).mean()
+        variance_2 = nanops.nanvar(samples, axis=1, skipna=True, ddof=2).mean()
+
+        # The unbiased estimate.
+        var = 1.0 / 12
+        tm.assert_almost_equal(variance_1, var, rtol=1e-2)
+
+        # The underestimated variance.
+        tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, rtol=1e-2)
+
+        # The overestimated variance.
+        tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, rtol=1e-2)
+
+    @pytest.mark.parametrize("axis", range(2))
+    @pytest.mark.parametrize("ddof", range(3))
+    def test_ground_truth(self, axis, ddof):
+        # Test against values that were precomputed with Numpy.
+        samples = np.empty((4, 4))
+        samples[:3, :3] = np.array(
+            [
+                [0.97303362, 0.21869576, 0.55560287],
+                [0.72980153, 0.03109364, 0.99155171],
+                [0.09317602, 0.60078248, 0.15871292],
+            ]
+        )
+        samples[3] = samples[:, 3] = np.nan
+
+        # Actual variances along axis=0, 1 for ddof=0, 1, 2
+        variance = np.array(
+            [
+                [
+                    [0.13762259, 0.05619224, 0.11568816],
+                    [0.20643388, 0.08428837, 0.17353224],
+                    [0.41286776, 0.16857673, 0.34706449],
+                ],
+                [
+                    [0.09519783, 0.16435395, 0.05082054],
+                    [0.14279674, 0.24653093, 0.07623082],
+                    [0.28559348, 0.49306186, 0.15246163],
+                ],
+            ]
+        )
+
+        # Test nanvar.
+        var = nanops.nanvar(samples, skipna=True, axis=axis, ddof=ddof)
+        tm.assert_almost_equal(var[:3], variance[axis, ddof])
+        assert np.isnan(var[3])
+
+        # Test nanstd.
+        std = nanops.nanstd(samples, skipna=True, axis=axis, ddof=ddof)
+        tm.assert_almost_equal(std[:3], variance[axis, ddof] ** 0.5)
+        assert np.isnan(std[3])
+
+    @pytest.mark.parametrize("ddof", range(3))
+    def test_nanstd_roundoff(self, ddof):
+        # Regression test for GH 10242 (test data taken from GH 10489). Ensure
+        # that variance is stable.
+        data = Series(766897346 * np.ones(10))
+        result = data.std(ddof=ddof)
+        assert result == 0.0
+
+    @property
+    def prng(self):
+        return np.random.default_rng(2)
+
+
+class TestNanskewFixedValues:
+    # xref GH 11974
+    # Test data + skewness value (computed with scipy.stats.skew)
+    @pytest.fixture
+    def samples(self):
+        return np.sin(np.linspace(0, 1, 200))
+
+    @pytest.fixture
+    def actual_skew(self):
+        return -0.1875895205961754
+
+    @pytest.mark.parametrize("val", [3075.2, 3075.3, 3075.5])
+    def test_constant_series(self, val):
+        # xref GH 11974
+        data = val * np.ones(300)
+        skew = nanops.nanskew(data)
+        assert skew == 0.0
+
+    def test_all_finite(self):
+        alpha, beta = 0.3, 0.1
+        left_tailed = self.prng.beta(alpha, beta, size=100)
+        assert nanops.nanskew(left_tailed) < 0
+
+        alpha, beta = 0.1, 0.3
+        right_tailed = self.prng.beta(alpha, beta, size=100)
+        assert nanops.nanskew(right_tailed) > 0
+
+    def test_ground_truth(self, samples, actual_skew):
+        skew = nanops.nanskew(samples)
+        tm.assert_almost_equal(skew, actual_skew)
+
+    def test_axis(self, samples, actual_skew):
+        samples = np.vstack([samples, np.nan * np.ones(len(samples))])
+        skew = nanops.nanskew(samples, axis=1)
+        tm.assert_almost_equal(skew, np.array([actual_skew, np.nan]))
+
+    def test_nans(self, samples):
+        samples = np.hstack([samples, np.nan])
+        skew = nanops.nanskew(samples, skipna=False)
+        assert np.isnan(skew)
+
+    def test_nans_skipna(self, samples, actual_skew):
+        samples = np.hstack([samples, np.nan])
+        skew = nanops.nanskew(samples, skipna=True)
+        tm.assert_almost_equal(skew, actual_skew)
+
+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        skew = nanops.nanskew(data)
+        expected = st.skew(data, bias=False)
+        tm.assert_almost_equal(skew, expected)
+
+    @property
+    def prng(self):
+        return np.random.default_rng(2)
+
+
+class TestNankurtFixedValues:
+    # xref GH 11974
+    # Test data + kurtosis value (computed with scipy.stats.kurtosis)
+    @pytest.fixture
+    def samples(self):
+        return np.sin(np.linspace(0, 1, 200))
+
+    @pytest.fixture
+    def actual_kurt(self):
+        return -1.2058303433799713
+
+    @pytest.mark.parametrize("val", [3075.2, 3075.3, 3075.5])
+    def test_constant_series(self, val):
+        # xref GH 11974
+        data = val * np.ones(300)
+        kurt = nanops.nankurt(data)
+        tm.assert_equal(kurt, 0.0)
+
+    def test_all_finite(self):
+        alpha, beta = 0.3, 0.1
+        left_tailed = self.prng.beta(alpha, beta, size=100)
+        assert nanops.nankurt(left_tailed) < 2
+
+        alpha, beta = 0.1, 0.3
+        right_tailed = self.prng.beta(alpha, beta, size=100)
+        assert nanops.nankurt(right_tailed) < 0
+
+    def test_ground_truth(self, samples, actual_kurt):
+        kurt = nanops.nankurt(samples)
+        tm.assert_almost_equal(kurt, actual_kurt)
+
+    def test_axis(self, samples, actual_kurt):
+        samples = np.vstack([samples, np.nan * np.ones(len(samples))])
+        kurt = nanops.nankurt(samples, axis=1)
+        tm.assert_almost_equal(kurt, np.array([actual_kurt, np.nan]))
+
+    def test_nans(self, samples):
+        samples = np.hstack([samples, np.nan])
+        kurt = nanops.nankurt(samples, skipna=False)
+        assert np.isnan(kurt)
+
+    def test_nans_skipna(self, samples, actual_kurt):
+        samples = np.hstack([samples, np.nan])
+        kurt = nanops.nankurt(samples, skipna=True)
+        tm.assert_almost_equal(kurt, actual_kurt)
+
+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        # GH#57972
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        kurt = nanops.nankurt(data)
+        expected = st.kurtosis(data, bias=False)
+        tm.assert_almost_equal(kurt, expected)
+
+    @property
+    def prng(self):
+        return np.random.default_rng(2)
+
+
+class TestDatetime64NaNOps:
+    # Enabling mean changes the behavior of DataFrame.mean
+    # See https://github.com/pandas-dev/pandas/issues/24752
+    def test_nanmean(self, unit):
+        dti = pd.date_range("2016-01-01", periods=3).as_unit(unit)
+        expected = dti[1]
+
+        for obj in [dti, dti._data]:
+            result = nanops.nanmean(obj)
+            assert result == expected
+
+        dti2 = dti.insert(1, pd.NaT)
+
+        for obj in [dti2, dti2._data]:
+            result = nanops.nanmean(obj)
+            assert result == expected
+
+    @pytest.mark.parametrize("constructor", ["M8", "m8"])
+    def test_nanmean_skipna_false(self, constructor, unit):
+        dtype = f"{constructor}[{unit}]"
+        arr = np.arange(12).astype(np.int64).view(dtype).reshape(4, 3)
+
+        arr[-1, -1] = "NaT"
+
+        result = nanops.nanmean(arr, skipna=False)
+        assert np.isnat(result)
+        assert result.dtype == dtype
+
+        result = nanops.nanmean(arr, axis=0, skipna=False)
+        expected = np.array([4, 5, "NaT"], dtype=arr.dtype)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = nanops.nanmean(arr, axis=1, skipna=False)
+        expected = np.array([arr[0, 1], arr[1, 1], arr[2, 1], arr[-1, -1]])
+        tm.assert_numpy_array_equal(result, expected)
+
+
+def test_use_bottleneck():
+    if nanops._BOTTLENECK_INSTALLED:
+        with pd.option_context("use_bottleneck", True):
+            assert pd.get_option("use_bottleneck")
+
+        with pd.option_context("use_bottleneck", False):
+            assert not pd.get_option("use_bottleneck")
+
+
+@pytest.mark.parametrize(
+    "numpy_op, expected",
+    [
+        (np.sum, 10),
+        (np.nansum, 10),
+        (np.mean, 2.5),
+        (np.nanmean, 2.5),
+        (np.median, 2.5),
+        (np.nanmedian, 2.5),
+        (np.min, 1),
+        (np.max, 4),
+        (np.nanmin, 1),
+        (np.nanmax, 4),
+    ],
+)
+def test_numpy_ops(numpy_op, expected):
+    # GH8383
+    result = numpy_op(Series([1, 2, 3, 4]))
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "operation",
+    [
+        nanops.nanany,
+        nanops.nanall,
+        nanops.nansum,
+        nanops.nanmean,
+        nanops.nanmedian,
+        nanops.nanstd,
+        nanops.nanvar,
+        nanops.nansem,
+        nanops.nanargmax,
+        nanops.nanargmin,
+        nanops.nanmax,
+        nanops.nanmin,
+        nanops.nanskew,
+        nanops.nankurt,
+        nanops.nanprod,
+    ],
+)
+def test_nanops_independent_of_mask_param(operation):
+    # GH22764
+    ser = Series([1, 2, np.nan, 3, np.nan, 4])
+    mask = ser.isna()
+    median_expected = operation(ser._values)
+    median_result = operation(ser._values, mask=mask)
+    assert median_expected == median_result
+
+
+@pytest.mark.parametrize("min_count", [-1, 0])
+def test_check_below_min_count_negative_or_zero_min_count(min_count):
+    # GH35227
+    result = nanops.check_below_min_count((21, 37), None, min_count)
+    expected_result = False
+    assert result == expected_result
+
+
+@pytest.mark.parametrize(
+    "mask", [None, np.array([False, False, True]), np.array([True] + 9 * [False])]
+)
+@pytest.mark.parametrize("min_count, expected_result", [(1, False), (101, True)])
+def test_check_below_min_count_positive_min_count(mask, min_count, expected_result):
+    # GH35227
+    shape = (10, 10)
+    result = nanops.check_below_min_count(shape, mask, min_count)
+    assert result == expected_result
+
+
+@td.skip_if_windows
+@td.skip_if_32bit
+@pytest.mark.parametrize("min_count, expected_result", [(1, False), (2812191852, True)])
+def test_check_below_min_count_large_shape(min_count, expected_result):
+    # GH35227 large shape used to show that the issue is fixed
+    shape = (2244367, 1253)
+    result = nanops.check_below_min_count(shape, mask=None, min_count=min_count)
+    assert result == expected_result
+
+
+@pytest.mark.parametrize("func", ["nanmean", "nansum"])
+def test_check_bottleneck_disallow(any_real_numpy_dtype, func):
+    # GH 42878 bottleneck sometimes produces unreliable results for mean and sum
+    assert not nanops._bn_ok_dtype(np.dtype(any_real_numpy_dtype).type, func)
+
+
+@pytest.mark.parametrize("val", [2**55, -(2**55), 20150515061816532])
+def test_nanmean_overflow(disable_bottleneck, val, using_python_scalars):
+    # GH 10155
+    # In the previous implementation mean can overflow for int dtypes, it
+    # is now consistent with numpy
+
+    ser = Series(val, index=range(500), dtype=np.int64)
+    result = ser.mean()
+    assert result == val
+    if using_python_scalars:
+        assert type(result) == float
+    else:
+        np_result = ser.values.mean()
+        assert result == np_result
+        assert result.dtype == np.float64
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        np.int16,
+        np.int32,
+        np.int64,
+        np.float32,
+        np.float64,
+        getattr(np, "float128", None),
+    ],
+)
+@pytest.mark.parametrize("method", ["mean", "std", "var", "skew", "kurt", "min", "max"])
+def test_returned_dtype(disable_bottleneck, dtype, method, using_python_scalars):
+    if dtype is None:
+        pytest.skip("np.float128 not available")
+
+    ser = Series(range(10), dtype=dtype)
+    result = getattr(ser, method)()
+    if using_python_scalars:
+        if is_integer_dtype(dtype) and method in ["min", "max"]:
+            assert isinstance(result, int)
+        else:
+            assert type(result) == float
+    elif is_integer_dtype(dtype) and method not in ["min", "max"]:
+        assert result.dtype == np.float64
+    else:
+        assert result.dtype == dtype
diff --git a/pandas/tests/test_optional_dependency.py b/pandas/tests/test_optional_dependency.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd276914bfb21217c602c40f2aef07d8eff2255d
--- /dev/null
+++ b/pandas/tests/test_optional_dependency.py
@@ -0,0 +1,100 @@
+import sys
+import types
+
+import pytest
+
+from pandas.compat._optional import (
+    VERSIONS,
+    import_optional_dependency,
+)
+
+import pandas._testing as tm
+
+
+def test_import_optional():
+    match = "Import .*notapackage.* pip .* conda .* notapackage"
+    with pytest.raises(ImportError, match=match) as exc_info:
+        import_optional_dependency("notapackage")
+    # The original exception should be there as context:
+    assert isinstance(exc_info.value.__context__, ImportError)
+
+    result = import_optional_dependency("notapackage", errors="ignore")
+    assert result is None
+
+
+def test_xlrd_version_fallback():
+    pytest.importorskip("xlrd")
+    import_optional_dependency("xlrd")
+
+
+def test_bad_version(monkeypatch):
+    name = "fakemodule"
+    module = types.ModuleType(name)
+    module.__version__ = "0.9.0"
+    sys.modules[name] = module
+    monkeypatch.setitem(VERSIONS, name, "1.0.0")
+
+    match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'"
+    with pytest.raises(ImportError, match=match):
+        import_optional_dependency("fakemodule")
+
+    # Test min_version parameter
+    result = import_optional_dependency("fakemodule", min_version="0.8")
+    assert result is module
+
+    with tm.assert_produces_warning(UserWarning, match=match):
+        result = import_optional_dependency("fakemodule", errors="warn")
+    assert result is None
+
+    module.__version__ = "1.0.0"  # exact match is OK
+    result = import_optional_dependency("fakemodule")
+    assert result is module
+
+    with pytest.raises(ImportError, match="Pandas requires version '1.1.0'"):
+        import_optional_dependency("fakemodule", min_version="1.1.0")
+
+    with tm.assert_produces_warning(UserWarning, match="Pandas requires version"):
+        result = import_optional_dependency(
+            "fakemodule", errors="warn", min_version="1.1.0"
+        )
+    assert result is None
+
+    result = import_optional_dependency(
+        "fakemodule", errors="ignore", min_version="1.1.0"
+    )
+    assert result is None
+
+
+def test_submodule(monkeypatch):
+    # Create a fake module with a submodule
+    name = "fakemodule"
+    module = types.ModuleType(name)
+    module.__version__ = "0.9.0"
+    sys.modules[name] = module
+    sub_name = "submodule"
+    submodule = types.ModuleType(sub_name)
+    setattr(module, sub_name, submodule)
+    sys.modules[f"{name}.{sub_name}"] = submodule
+    monkeypatch.setitem(VERSIONS, name, "1.0.0")
+
+    match = "Pandas requires .*1.0.0.* of .fakemodule.*'0.9.0'"
+    with pytest.raises(ImportError, match=match):
+        import_optional_dependency("fakemodule.submodule")
+
+    with tm.assert_produces_warning(UserWarning, match=match):
+        result = import_optional_dependency("fakemodule.submodule", errors="warn")
+    assert result is None
+
+    module.__version__ = "1.0.0"  # exact match is OK
+    result = import_optional_dependency("fakemodule.submodule")
+    assert result is submodule
+
+
+def test_no_version_raises(monkeypatch):
+    name = "fakemodule"
+    module = types.ModuleType(name)
+    sys.modules[name] = module
+    monkeypatch.setitem(VERSIONS, name, "1.0.0")
+
+    with pytest.raises(ImportError, match="Can't determine .* fakemodule"):
+        import_optional_dependency(name)
diff --git a/pandas/tests/test_register_accessor.py b/pandas/tests/test_register_accessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..9deff5613939412f0e51a8784f3e0958e9989140
--- /dev/null
+++ b/pandas/tests/test_register_accessor.py
@@ -0,0 +1,123 @@
+from collections.abc import Generator
+import contextlib
+import weakref
+
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.core import accessor
+
+
+def test_dirname_mixin() -> None:
+    # GH37173
+
+    class X(accessor.DirNamesMixin):
+        x = 1
+        y: int
+
+        def __init__(self) -> None:
+            self.z = 3
+
+    result = [attr_name for attr_name in dir(X()) if not attr_name.startswith("_")]
+
+    assert result == ["x", "z"]
+
+
+@contextlib.contextmanager
+def ensure_removed(obj, attr) -> Generator[None, None, None]:
+    """Ensure that an attribute added to 'obj' during the test is
+    removed when we're done
+    """
+    try:
+        yield
+    finally:
+        try:
+            delattr(obj, attr)
+        except AttributeError:
+            pass
+        obj._accessors.discard(attr)
+
+
+class MyAccessor:
+    def __init__(self, obj) -> None:
+        self.obj = obj
+        self.item = "item"
+
+    @property
+    def prop(self):
+        return self.item
+
+    def method(self):
+        return self.item
+
+
+@pytest.mark.parametrize(
+    "obj, registrar",
+    [
+        (pd.Series, pd.api.extensions.register_series_accessor),
+        (pd.DataFrame, pd.api.extensions.register_dataframe_accessor),
+        (pd.Index, pd.api.extensions.register_index_accessor),
+    ],
+)
+def test_register(obj, registrar):
+    with ensure_removed(obj, "mine"):
+        before = set(dir(obj))
+        registrar("mine")(MyAccessor)
+        o = obj([]) if obj is not pd.Series else obj([], dtype=object)
+        assert o.mine.prop == "item"
+        after = set(dir(obj))
+        assert (before ^ after) == {"mine"}
+        assert "mine" in obj._accessors
+
+
+def test_accessor_works():
+    with ensure_removed(pd.Series, "mine"):
+        pd.api.extensions.register_series_accessor("mine")(MyAccessor)
+
+        s = pd.Series([1, 2])
+        assert s.mine.obj is s
+
+        assert s.mine.prop == "item"
+        assert s.mine.method() == "item"
+
+
+def test_overwrite_warns():
+    match = r".*MyAccessor.*fake.*Series.*"
+    with tm.assert_produces_warning(UserWarning, match=match):
+        with ensure_removed(pd.Series, "fake"):
+            setattr(pd.Series, "fake", 123)
+            pd.api.extensions.register_series_accessor("fake")(MyAccessor)
+            s = pd.Series([1, 2])
+            assert s.fake.prop == "item"
+
+
+def test_raises_attribute_error():
+    with ensure_removed(pd.Series, "bad"):
+
+        @pd.api.extensions.register_series_accessor("bad")
+        class Bad:
+            def __init__(self, data) -> None:
+                raise AttributeError("whoops")
+
+        with pytest.raises(AttributeError, match="whoops"):
+            pd.Series([], dtype=object).bad
+
+
+@pytest.mark.parametrize(
+    "klass, registrar",
+    [
+        (pd.Series, pd.api.extensions.register_series_accessor),
+        (pd.DataFrame, pd.api.extensions.register_dataframe_accessor),
+        (pd.Index, pd.api.extensions.register_index_accessor),
+    ],
+)
+def test_no_circular_reference(klass, registrar):
+    # GH 41357
+    with ensure_removed(klass, "access"):
+        registrar("access")(MyAccessor)
+        obj = klass([0])
+        ref = weakref.ref(obj)
+        assert obj.access.obj is obj
+        del obj
+        assert ref() is None
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
new file mode 100644
index 0000000000000000000000000000000000000000..4596238946c62fd788ffb67aba8394ba41745dd4
--- /dev/null
+++ b/pandas/tests/test_sorting.py
@@ -0,0 +1,475 @@
+from collections import defaultdict
+from datetime import datetime
+from itertools import product
+
+import numpy as np
+import pytest
+
+from pandas import (
+    NA,
+    DataFrame,
+    MultiIndex,
+    Series,
+    array,
+    concat,
+    merge,
+)
+import pandas._testing as tm
+from pandas.core.algorithms import safe_sort
+import pandas.core.common as com
+from pandas.core.sorting import (
+    _decons_group_index,
+    get_group_index,
+    is_int64_overflow_possible,
+    lexsort_indexer,
+    nargsort,
+)
+
+
+@pytest.fixture
+def left_right():
+    low, high, n = -1 << 10, 1 << 10, 1 << 20
+    left = DataFrame(
+        np.random.default_rng(2).integers(low, high, (n, 7)), columns=list("ABCDEFG")
+    )
+    left["left"] = left.sum(axis=1)
+    right = left.sample(
+        frac=1, random_state=np.random.default_rng(2), ignore_index=True
+    )
+    right.columns = [*right.columns[:-1].tolist(), "right"]
+    right["right"] *= -1
+    return left, right
+
+
+class TestSorting:
+    @pytest.mark.slow
+    def test_int64_overflow(self):
+        B = np.concatenate((np.arange(1000), np.arange(1000), np.arange(500)))
+        A = np.arange(2500)
+        df = DataFrame(
+            {
+                "A": A,
+                "B": B,
+                "C": A,
+                "D": B,
+                "E": A,
+                "F": B,
+                "G": A,
+                "H": B,
+                "values": np.random.default_rng(2).standard_normal(2500),
+            }
+        )
+
+        lg = df.groupby(["A", "B", "C", "D", "E", "F", "G", "H"])
+        rg = df.groupby(["H", "G", "F", "E", "D", "C", "B", "A"])
+
+        left = lg.sum()["values"]
+        right = rg.sum()["values"]
+
+        exp_index, _ = left.index.sortlevel()
+        tm.assert_index_equal(left.index, exp_index)
+
+        exp_index, _ = right.index.sortlevel(0)
+        tm.assert_index_equal(right.index, exp_index)
+
+        tups = list(map(tuple, df[["A", "B", "C", "D", "E", "F", "G", "H"]].values))
+        tups = com.asarray_tuplesafe(tups)
+
+        expected = df.groupby(tups).sum()["values"]
+
+        for k, v in expected.items():
+            assert left[k] == right[k[::-1]]
+            assert left[k] == v
+        assert len(left) == len(right)
+
+    def test_int64_overflow_groupby_large_range(self):
+        # GH9096
+        values = range(55109)
+        data = DataFrame.from_dict({"a": values, "b": values, "c": values, "d": values})
+        grouped = data.groupby(["a", "b", "c", "d"])
+        assert len(grouped) == len(values)
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize("agg", ["mean", "median"])
+    def test_int64_overflow_groupby_large_df_shuffled(self, agg):
+        rs = np.random.default_rng(2)
+        arr = rs.integers(-1 << 12, 1 << 12, (1 << 15, 5))
+        i = rs.choice(len(arr), len(arr) * 4)
+        arr = np.vstack((arr, arr[i]))  # add some duplicate rows
+
+        i = rs.permutation(len(arr))
+        arr = arr[i]  # shuffle rows
+
+        df = DataFrame(arr, columns=list("abcde"))
+        df["jim"], df["joe"] = np.zeros((2, len(df)))
+        gr = df.groupby(list("abcde"))
+
+        # verify this is testing what it is supposed to test!
+        assert is_int64_overflow_possible(
+            tuple(ping.ngroups for ping in gr._grouper.groupings)
+        )
+
+        mi = MultiIndex.from_arrays(
+            [ar.ravel() for ar in np.array_split(np.unique(arr, axis=0), 5, axis=1)],
+            names=list("abcde"),
+        )
+
+        res = DataFrame(
+            np.zeros((len(mi), 2)), columns=["jim", "joe"], index=mi
+        ).sort_index()
+
+        tm.assert_frame_equal(getattr(gr, agg)(), res)
+
+    @pytest.mark.parametrize(
+        "order, na_position, exp",
+        [
+            [
+                True,
+                "last",
+                list(range(5, 105)) + list(range(5)) + list(range(105, 110)),
+            ],
+            [
+                True,
+                "first",
+                list(range(5)) + list(range(105, 110)) + list(range(5, 105)),
+            ],
+            [
+                False,
+                "last",
+                list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)),
+            ],
+            [
+                False,
+                "first",
+                list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)),
+            ],
+        ],
+    )
+    def test_lexsort_indexer(self, order, na_position, exp):
+        keys = [[np.nan] * 5 + list(range(100)) + [np.nan] * 5]
+        result = lexsort_indexer(keys, orders=order, na_position=na_position)
+        tm.assert_numpy_array_equal(result, np.array(exp, dtype=np.intp))
+
+    @pytest.mark.parametrize(
+        "ascending, na_position, exp",
+        [
+            [
+                True,
+                "last",
+                list(range(5, 105)) + list(range(5)) + list(range(105, 110)),
+            ],
+            [
+                True,
+                "first",
+                list(range(5)) + list(range(105, 110)) + list(range(5, 105)),
+            ],
+            [
+                False,
+                "last",
+                list(range(104, 4, -1)) + list(range(5)) + list(range(105, 110)),
+            ],
+            [
+                False,
+                "first",
+                list(range(5)) + list(range(105, 110)) + list(range(104, 4, -1)),
+            ],
+        ],
+    )
+    def test_nargsort(self, ascending, na_position, exp):
+        # list places NaNs last, np.array(..., dtype="O") may not place NaNs first
+        items = np.array([np.nan] * 5 + list(range(100)) + [np.nan] * 5, dtype="O")
+
+        # mergesort is the most difficult to get right because we want it to be
+        # stable.
+
+        # According to numpy/core/tests/test_multiarray, """The number of
+        # sorted items must be greater than ~50 to check the actual algorithm
+        # because quick and merge sort fall over to insertion sort for small
+        # arrays."""
+
+        result = nargsort(
+            items, kind="mergesort", ascending=ascending, na_position=na_position
+        )
+        tm.assert_numpy_array_equal(result, np.array(exp), check_dtype=False)
+
+
+class TestMerge:
+    def test_int64_overflow_outer_merge(self):
+        # #2690, combinatorial explosion
+        df1 = DataFrame(
+            np.random.default_rng(2).standard_normal((1000, 7)),
+            columns=[*list("ABCDEF"), "G1"],
+        )
+        df2 = DataFrame(
+            np.random.default_rng(3).standard_normal((1000, 7)),
+            columns=[*list("ABCDEF"), "G2"],
+        )
+        result = merge(df1, df2, how="outer")
+        assert len(result) == 2000
+
+    @pytest.mark.slow
+    def test_int64_overflow_check_sum_col(self, left_right):
+        left, right = left_right
+
+        out = merge(left, right, how="outer")
+        assert len(out) == len(left)
+        tm.assert_series_equal(out["left"], -out["right"], check_names=False)
+        result = out.iloc[:, :-2].sum(axis=1)
+        tm.assert_series_equal(out["left"], result, check_names=False)
+        assert result.name is None
+
+    @pytest.mark.slow
+    def test_int64_overflow_how_merge(self, left_right, join_type):
+        left, right = left_right
+
+        out = merge(left, right, how="outer")
+        out.sort_values(out.columns.tolist(), inplace=True)
+        tm.assert_frame_equal(out, merge(left, right, how=join_type, sort=True))
+
+    @pytest.mark.slow
+    def test_int64_overflow_sort_false_order(self, left_right):
+        left, right = left_right
+
+        # check that left merge w/ sort=False maintains left frame order
+        out = merge(left, right, how="left", sort=False)
+        tm.assert_frame_equal(left, out[left.columns.tolist()])
+
+        out = merge(right, left, how="left", sort=False)
+        tm.assert_frame_equal(right, out[right.columns.tolist()])
+
+    @pytest.mark.slow
+    def test_int64_overflow_one_to_many_none_match(self, join_type, sort):
+        # one-2-many/none match
+        how = join_type
+        low, high, n = -1 << 10, 1 << 10, 1 << 11
+        left = DataFrame(
+            np.random.default_rng(2).integers(low, high, (n, 7)).astype("int64"),
+            columns=list("ABCDEFG"),
+        )
+
+        # confirm that this is checking what it is supposed to check
+        shape = left.apply(Series.nunique).values
+        assert is_int64_overflow_possible(shape)
+
+        # add duplicates to left frame
+        left = concat([left, left], ignore_index=True)
+
+        right = DataFrame(
+            np.random.default_rng(3).integers(low, high, (n // 2, 7)).astype("int64"),
+            columns=list("ABCDEFG"),
+        )
+
+        # add duplicates & overlap with left to the right frame
+        i = np.random.default_rng(4).choice(len(left), n)
+        right = concat([right, right, left.iloc[i]], ignore_index=True)
+
+        left["left"] = np.random.default_rng(2).standard_normal(len(left))
+        right["right"] = np.random.default_rng(2).standard_normal(len(right))
+
+        # shuffle left & right frames
+        left = left.sample(
+            frac=1, ignore_index=True, random_state=np.random.default_rng(5)
+        )
+        right = right.sample(
+            frac=1, ignore_index=True, random_state=np.random.default_rng(6)
+        )
+
+        # manually compute outer merge
+        ldict, rdict = defaultdict(list), defaultdict(list)
+
+        for idx, row in left.set_index(list("ABCDEFG")).iterrows():
+            ldict[idx].append(row["left"])
+
+        for idx, row in right.set_index(list("ABCDEFG")).iterrows():
+            rdict[idx].append(row["right"])
+
+        vals = []
+        for k, lval in ldict.items():
+            rval = rdict.get(k, [np.nan])
+            for lv, rv in product(lval, rval):
+                vals.append((*k, lv, rv))
+
+        for k, rval in rdict.items():
+            if k not in ldict:
+                vals.extend((*k, np.nan, rv) for rv in rval)
+
+        out = DataFrame(vals, columns=[*list("ABCDEFG"), "left", "right"])
+        out = out.sort_values(out.columns.to_list(), ignore_index=True)
+
+        jmask = {
+            "left": out["left"].notna(),
+            "right": out["right"].notna(),
+            "inner": out["left"].notna() & out["right"].notna(),
+            "outer": np.ones(len(out), dtype="bool"),
+        }
+
+        mask = jmask[how]
+        frame = out[mask].sort_values(out.columns.to_list(), ignore_index=True)
+        assert mask.all() ^ mask.any() or how == "outer"
+
+        res = merge(left, right, how=how, sort=sort)
+        if sort:
+            kcols = list("ABCDEFG")
+            tm.assert_frame_equal(
+                res[kcols], res[kcols].sort_values(kcols, kind="mergesort")
+            )
+
+        # as in GH9092 dtypes break with outer/right join
+        # 2021-12-18: dtype does not break anymore
+        tm.assert_frame_equal(
+            frame, res.sort_values(res.columns.to_list(), ignore_index=True)
+        )
+
+
+@pytest.mark.parametrize(
+    "codes_list, shape",
+    [
+        [
+            [
+                np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100).astype(np.int64),
+                np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100).astype(np.int64),
+                np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100).astype(np.int64),
+            ],
+            (4, 5, 6),
+        ],
+        [
+            [
+                np.tile(np.arange(10000, dtype=np.int64), 5),
+                np.tile(np.arange(10000, dtype=np.int64), 5),
+            ],
+            (10000, 10000),
+        ],
+    ],
+)
+def test_decons(codes_list, shape):
+    group_index = get_group_index(codes_list, shape, sort=True, xnull=True)
+    codes_list2 = _decons_group_index(group_index, shape)
+
+    for a, b in zip(codes_list, codes_list2, strict=True):
+        tm.assert_numpy_array_equal(a, b)
+
+
+class TestSafeSort:
+    @pytest.mark.parametrize(
+        "arg, exp",
+        [
+            [[3, 1, 2, 0, 4], [0, 1, 2, 3, 4]],
+            [
+                np.array(list("baaacb"), dtype=object),
+                np.array(list("aaabbc"), dtype=object),
+            ],
+            [[], []],
+        ],
+    )
+    def test_basic_sort(self, arg, exp):
+        result = safe_sort(np.array(arg))
+        expected = np.array(exp)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("verify", [True, False])
+    @pytest.mark.parametrize(
+        "codes, exp_codes",
+        [
+            [[0, 1, 1, 2, 3, 0, -1, 4], [3, 1, 1, 2, 0, 3, -1, 4]],
+            [[], []],
+        ],
+    )
+    def test_codes(self, verify, codes, exp_codes):
+        values = np.array([3, 1, 2, 0, 4])
+        expected = np.array([0, 1, 2, 3, 4])
+
+        result, result_codes = safe_sort(
+            values, codes, use_na_sentinel=True, verify=verify
+        )
+        expected_codes = np.array(exp_codes, dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result_codes, expected_codes)
+
+    def test_codes_out_of_bound(self):
+        values = np.array([3, 1, 2, 0, 4])
+        expected = np.array([0, 1, 2, 3, 4])
+
+        # out of bound indices
+        codes = [0, 101, 102, 2, 3, 0, 99, 4]
+        result, result_codes = safe_sort(values, codes, use_na_sentinel=True)
+        expected_codes = np.array([3, -1, -1, 2, 0, 3, -1, 4], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result_codes, expected_codes)
+
+    @pytest.mark.parametrize("codes", [[-1, -1], [2, -1], [2, 2]])
+    def test_codes_empty_array_out_of_bound(self, codes):
+        empty_values = np.array([])
+        expected_codes = -np.ones_like(codes, dtype=np.intp)
+        _, result_codes = safe_sort(empty_values, codes)
+        tm.assert_numpy_array_equal(result_codes, expected_codes)
+
+    def test_mixed_integer(self):
+        values = np.array(["b", 1, 0, "a", 0, "b"], dtype=object)
+        result = safe_sort(values)
+        expected = np.array([0, 0, 1, "a", "b", "b"], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_mixed_integer_with_codes(self):
+        values = np.array(["b", 1, 0, "a"], dtype=object)
+        codes = [0, 1, 2, 3, 0, -1, 1]
+        result, result_codes = safe_sort(values, codes)
+        expected = np.array([0, 1, "a", "b"], dtype=object)
+        expected_codes = np.array([3, 1, 0, 2, 3, -1, 1], dtype=np.intp)
+        tm.assert_numpy_array_equal(result, expected)
+        tm.assert_numpy_array_equal(result_codes, expected_codes)
+
+    def test_unsortable(self):
+        # GH 13714
+        arr = np.array([1, 2, datetime.now(), 0, 3], dtype=object)
+        msg = "'[<>]' not supported between instances of .*"
+        with pytest.raises(TypeError, match=msg):
+            safe_sort(arr)
+
+    @pytest.mark.parametrize(
+        "arg, codes, err, msg",
+        [
+            [1, None, TypeError, "Only np.ndarray, ExtensionArray, and Index"],
+            [np.array([0, 1, 2]), 1, TypeError, "Only list-like objects or None"],
+            [np.array([0, 1, 2, 1]), [0, 1], ValueError, "values should be unique"],
+        ],
+    )
+    def test_exceptions(self, arg, codes, err, msg):
+        with pytest.raises(err, match=msg):
+            safe_sort(values=arg, codes=codes)
+
+    @pytest.mark.parametrize(
+        "arg, exp", [[[1, 3, 2], [1, 2, 3]], [[1, 3, NA, 2], [1, 2, 3, NA]]]
+    )
+    def test_extension_array(self, arg, exp):
+        a = array(arg, dtype="Int64")
+        result = safe_sort(a)
+        expected = array(exp, dtype="Int64")
+        tm.assert_extension_array_equal(result, expected)
+
+    @pytest.mark.parametrize("verify", [True, False])
+    def test_extension_array_codes(self, verify):
+        a = array([1, 3, 2], dtype="Int64")
+        result, codes = safe_sort(a, [0, 1, -1, 2], use_na_sentinel=True, verify=verify)
+        expected_values = array([1, 2, 3], dtype="Int64")
+        expected_codes = np.array([0, 2, -1, 1], dtype=np.intp)
+        tm.assert_extension_array_equal(result, expected_values)
+        tm.assert_numpy_array_equal(codes, expected_codes)
+
+
+def test_mixed_str_null(nulls_fixture):
+    values = np.array(["b", nulls_fixture, "a", "b"], dtype=object)
+    result = safe_sort(values)
+    expected = np.array(["a", "b", "b", nulls_fixture], dtype=object)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_safe_sort_multiindex():
+    # GH#48412
+    arr1 = Series([2, 1, NA, NA], dtype="Int64")
+    arr2 = [2, 1, 3, 3]
+    midx = MultiIndex.from_arrays([arr1, arr2])
+    result = safe_sort(midx)
+    expected = MultiIndex.from_arrays(
+        [Series([1, 2, NA, NA], dtype="Int64"), [1, 2, 3, 3]]
+    )
+    tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py
new file mode 100644
index 0000000000000000000000000000000000000000..451ef42fff3d170682f5f0a6440df5ba9cb85a08
--- /dev/null
+++ b/pandas/tests/test_take.py
@@ -0,0 +1,317 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+from pandas._libs import iNaT
+
+from pandas import array
+import pandas._testing as tm
+import pandas.core.algorithms as algos
+
+
+@pytest.fixture(
+    params=[
+        (np.int8, np.int16(127), np.int8),
+        (np.int8, np.int16(128), np.int16),
+        (np.int32, 1, np.int32),
+        (np.int32, 2.0, np.float64),
+        (np.int32, 3.0 + 4.0j, np.complex128),
+        (np.int32, True, np.object_),
+        (np.int32, "", np.object_),
+        (np.float64, 1, np.float64),
+        (np.float64, 2.0, np.float64),
+        (np.float64, 3.0 + 4.0j, np.complex128),
+        (np.float64, True, np.object_),
+        (np.float64, "", np.object_),
+        (np.complex128, 1, np.complex128),
+        (np.complex128, 2.0, np.complex128),
+        (np.complex128, 3.0 + 4.0j, np.complex128),
+        (np.complex128, True, np.object_),
+        (np.complex128, "", np.object_),
+        (np.bool_, 1, np.object_),
+        (np.bool_, 2.0, np.object_),
+        (np.bool_, 3.0 + 4.0j, np.object_),
+        (np.bool_, True, np.bool_),
+        (np.bool_, "", np.object_),
+    ]
+)
+def dtype_fill_out_dtype(request):
+    return request.param
+
+
+class TestTake:
+    def test_1d_fill_nonna(self, dtype_fill_out_dtype):
+        dtype, fill_value, out_dtype = dtype_fill_out_dtype
+        data = np.random.default_rng(2).integers(0, 2, 4).astype(dtype)
+        indexer = [2, 1, 0, -1]
+
+        result = algos.take_nd(data, indexer, fill_value=fill_value)
+        assert (result[[0, 1, 2]] == data[[2, 1, 0]]).all()
+        assert result[3] == fill_value
+        assert result.dtype == out_dtype
+
+        indexer = [2, 1, 0, 1]
+
+        result = algos.take_nd(data, indexer, fill_value=fill_value)
+        assert (result[[0, 1, 2, 3]] == data[indexer]).all()
+        assert result.dtype == dtype
+
+    def test_2d_fill_nonna(self, dtype_fill_out_dtype):
+        dtype, fill_value, out_dtype = dtype_fill_out_dtype
+        data = np.random.default_rng(2).integers(0, 2, (5, 3)).astype(dtype)
+        indexer = [2, 1, 0, -1]
+
+        result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value)
+        assert (result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()
+        assert (result[3, :] == fill_value).all()
+        assert result.dtype == out_dtype
+
+        result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value)
+        assert (result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()
+        assert (result[:, 3] == fill_value).all()
+        assert result.dtype == out_dtype
+
+        indexer = [2, 1, 0, 1]
+        result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value)
+        assert (result[[0, 1, 2, 3], :] == data[indexer, :]).all()
+        assert result.dtype == dtype
+
+        result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value)
+        assert (result[:, [0, 1, 2, 3]] == data[:, indexer]).all()
+        assert result.dtype == dtype
+
+    def test_3d_fill_nonna(self, dtype_fill_out_dtype):
+        dtype, fill_value, out_dtype = dtype_fill_out_dtype
+
+        data = np.random.default_rng(2).integers(0, 2, (5, 4, 3)).astype(dtype)
+        indexer = [2, 1, 0, -1]
+
+        result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value)
+        assert (result[[0, 1, 2], :, :] == data[[2, 1, 0], :, :]).all()
+        assert (result[3, :, :] == fill_value).all()
+        assert result.dtype == out_dtype
+
+        result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value)
+        assert (result[:, [0, 1, 2], :] == data[:, [2, 1, 0], :]).all()
+        assert (result[:, 3, :] == fill_value).all()
+        assert result.dtype == out_dtype
+
+        result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value)
+        assert (result[:, :, [0, 1, 2]] == data[:, :, [2, 1, 0]]).all()
+        assert (result[:, :, 3] == fill_value).all()
+        assert result.dtype == out_dtype
+
+        indexer = [2, 1, 0, 1]
+        result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value)
+        assert (result[[0, 1, 2, 3], :, :] == data[indexer, :, :]).all()
+        assert result.dtype == dtype
+
+        result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value)
+        assert (result[:, [0, 1, 2, 3], :] == data[:, indexer, :]).all()
+        assert result.dtype == dtype
+
+        result = algos.take_nd(data, indexer, axis=2, fill_value=fill_value)
+        assert (result[:, :, [0, 1, 2, 3]] == data[:, :, indexer]).all()
+        assert result.dtype == dtype
+
+    def test_1d_other_dtypes(self):
+        arr = np.random.default_rng(2).standard_normal(10).astype(np.float32)
+
+        indexer = [1, 2, 3, -1]
+        result = algos.take_nd(arr, indexer)
+        expected = arr.take(indexer)
+        expected[-1] = np.nan
+        tm.assert_almost_equal(result, expected)
+
+    def test_2d_other_dtypes(self):
+        arr = np.random.default_rng(2).standard_normal((10, 5)).astype(np.float32)
+
+        indexer = [1, 2, 3, -1]
+
+        # axis=0
+        result = algos.take_nd(arr, indexer, axis=0)
+        expected = arr.take(indexer, axis=0)
+        expected[-1] = np.nan
+        tm.assert_almost_equal(result, expected)
+
+        # axis=1
+        result = algos.take_nd(arr, indexer, axis=1)
+        expected = arr.take(indexer, axis=1)
+        expected[:, -1] = np.nan
+        tm.assert_almost_equal(result, expected)
+
+    def test_1d_bool(self):
+        arr = np.array([0, 1, 0], dtype=bool)
+
+        result = algos.take_nd(arr, [0, 2, 2, 1])
+        expected = arr.take([0, 2, 2, 1])
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.take_nd(arr, [0, 2, -1])
+        assert result.dtype == np.object_
+
+    def test_2d_bool(self):
+        arr = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 1]], dtype=bool)
+
+        result = algos.take_nd(arr, [0, 2, 2, 1])
+        expected = arr.take([0, 2, 2, 1], axis=0)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.take_nd(arr, [0, 2, 2, 1], axis=1)
+        expected = arr.take([0, 2, 2, 1], axis=1)
+        tm.assert_numpy_array_equal(result, expected)
+
+        result = algos.take_nd(arr, [0, 2, -1])
+        assert result.dtype == np.object_
+
+    def test_2d_float32(self):
+        arr = np.random.default_rng(2).standard_normal((4, 3)).astype(np.float32)
+        indexer = [0, 2, -1, 1, -1]
+
+        # axis=0
+        result = algos.take_nd(arr, indexer, axis=0)
+
+        expected = arr.take(indexer, axis=0)
+        expected[[2, 4], :] = np.nan
+        tm.assert_almost_equal(result, expected)
+
+        # axis=1
+        result = algos.take_nd(arr, indexer, axis=1)
+        expected = arr.take(indexer, axis=1)
+        expected[:, [2, 4]] = np.nan
+        tm.assert_almost_equal(result, expected)
+
+    def test_2d_datetime64(self):
+        # 2005/01/01 - 2006/01/01
+        arr = (
+            np.random.default_rng(2).integers(11_045_376, 11_360_736, (5, 3))
+            * 100_000_000_000
+        )
+        arr = arr.view(dtype="datetime64[ns]")
+        indexer = [0, 2, -1, 1, -1]
+
+        # axis=0
+        result = algos.take_nd(arr, indexer, axis=0)
+        expected = arr.take(indexer, axis=0)
+        expected.view(np.int64)[[2, 4], :] = iNaT
+        tm.assert_almost_equal(result, expected)
+
+        result = algos.take_nd(arr, indexer, axis=0, fill_value=datetime(2007, 1, 1))
+        expected = arr.take(indexer, axis=0)
+        expected[[2, 4], :] = datetime(2007, 1, 1)
+        tm.assert_almost_equal(result, expected)
+
+        # axis=1
+        result = algos.take_nd(arr, indexer, axis=1)
+        expected = arr.take(indexer, axis=1)
+        expected.view(np.int64)[:, [2, 4]] = iNaT
+        tm.assert_almost_equal(result, expected)
+
+        result = algos.take_nd(arr, indexer, axis=1, fill_value=datetime(2007, 1, 1))
+        expected = arr.take(indexer, axis=1)
+        expected[:, [2, 4]] = datetime(2007, 1, 1)
+        tm.assert_almost_equal(result, expected)
+
+    def test_take_axis_0(self):
+        arr = np.arange(12).reshape(4, 3)
+        result = algos.take(arr, [0, -1])
+        expected = np.array([[0, 1, 2], [9, 10, 11]])
+        tm.assert_numpy_array_equal(result, expected)
+
+        # allow_fill=True
+        result = algos.take(arr, [0, -1], allow_fill=True, fill_value=0)
+        expected = np.array([[0, 1, 2], [0, 0, 0]])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_take_axis_1(self):
+        arr = np.arange(12).reshape(4, 3)
+        result = algos.take(arr, [0, -1], axis=1)
+        expected = np.array([[0, 2], [3, 5], [6, 8], [9, 11]])
+        tm.assert_numpy_array_equal(result, expected)
+
+        # allow_fill=True
+        result = algos.take(arr, [0, -1], axis=1, allow_fill=True, fill_value=0)
+        expected = np.array([[0, 0], [3, 0], [6, 0], [9, 0]])
+        tm.assert_numpy_array_equal(result, expected)
+
+        # GH#26976 make sure we validate along the correct axis
+        with pytest.raises(IndexError, match="indices are out-of-bounds"):
+            algos.take(arr, [0, 3], axis=1, allow_fill=True, fill_value=0)
+
+    def test_take_non_hashable_fill_value(self):
+        arr = np.array([1, 2, 3])
+        indexer = np.array([1, -1])
+        with pytest.raises(ValueError, match="fill_value must be a scalar"):
+            algos.take(arr, indexer, allow_fill=True, fill_value=[1])
+
+        # with object dtype it is allowed
+        arr = np.array([1, 2, 3], dtype=object)
+        result = algos.take(arr, indexer, allow_fill=True, fill_value=[1])
+        expected = np.array([2, [1]], dtype=object)
+        tm.assert_numpy_array_equal(result, expected)
+
+
+class TestExtensionTake:
+    # The take method found in pd.api.extensions
+
+    def test_bounds_check_large(self):
+        arr = np.array([1, 2])
+
+        msg = "indices are out-of-bounds"
+        with pytest.raises(IndexError, match=msg):
+            algos.take(arr, [2, 3], allow_fill=True)
+
+        msg = "index 2 is out of bounds for( axis 0 with)? size 2"
+        with pytest.raises(IndexError, match=msg):
+            algos.take(arr, [2, 3], allow_fill=False)
+
+    def test_bounds_check_small(self):
+        arr = np.array([1, 2, 3], dtype=np.int64)
+        indexer = [0, -1, -2]
+
+        msg = r"'indices' contains values less than allowed \(-2 < -1\)"
+        with pytest.raises(ValueError, match=msg):
+            algos.take(arr, indexer, allow_fill=True)
+
+        result = algos.take(arr, indexer)
+        expected = np.array([1, 3, 2], dtype=np.int64)
+        tm.assert_numpy_array_equal(result, expected)
+
+    @pytest.mark.parametrize("allow_fill", [True, False])
+    def test_take_empty(self, allow_fill):
+        arr = np.array([], dtype=np.int64)
+        # empty take is ok
+        result = algos.take(arr, [], allow_fill=allow_fill)
+        tm.assert_numpy_array_equal(arr, result)
+
+        msg = "|".join(
+            [
+                "cannot do a non-empty take from an empty axes.",
+                "indices are out-of-bounds",
+            ]
+        )
+        with pytest.raises(IndexError, match=msg):
+            algos.take(arr, [0], allow_fill=allow_fill)
+
+    def test_take_na_empty(self):
+        result = algos.take(np.array([]), [-1, -1], allow_fill=True, fill_value=0.0)
+        expected = np.array([0.0, 0.0])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_take_coerces_list(self):
+        # GH#52981 coercing is deprecated, disabled in 3.0
+        arr = [1, 2, 3]
+        msg = (
+            "pd.api.extensions.take requires a numpy.ndarray, ExtensionArray, "
+            "Index, Series, or NumpyExtensionArray got list"
+        )
+        with pytest.raises(TypeError, match=msg):
+            algos.take(arr, [0, 0])
+
+    def test_take_NumpyExtensionArray(self):
+        # GH#59177
+        arr = array([1 + 1j, 2, 3])  # NumpyEADtype('complex128') (NumpyExtensionArray)
+        assert algos.take(arr, [2]) == 2
+        arr = array([1, 2, 3])  # Int64Dtype() (ExtensionArray)
+        assert algos.take(arr, [2]) == 2
diff --git a/pandas/tests/tools/__init__.py b/pandas/tests/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec97985b496fd6367517a2449e793d5140949138
--- /dev/null
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -0,0 +1,3829 @@
+"""test to_datetime"""
+
+import calendar
+from collections import deque
+from datetime import (
+    date,
+    datetime,
+    timedelta,
+    timezone,
+)
+from decimal import Decimal
+import locale
+import zoneinfo
+
+from dateutil.parser import parse
+import numpy as np
+import pytest
+
+from pandas._libs import tslib
+from pandas._libs.tslibs import (
+    iNaT,
+    parsing,
+)
+from pandas.compat import (
+    PY314,
+    WASM,
+)
+from pandas.errors import (
+    OutOfBoundsDatetime,
+    OutOfBoundsTimedelta,
+)
+import pandas.util._test_decorators as td
+
+from pandas.core.dtypes.common import is_datetime64_ns_dtype
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    NaT,
+    Series,
+    Timestamp,
+    date_range,
+    isna,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.core.arrays import DatetimeArray
+from pandas.core.tools import datetimes as tools
+from pandas.core.tools.datetimes import start_caching_at
+
+PARSING_ERR_MSG = (
+    r"You might want to try:\n"
+    r"    - passing `format` if your strings have a consistent format;\n"
+    r"    - passing `format=\'ISO8601\'` if your strings are all ISO8601 "
+    r"but not necessarily in exactly the same format;\n"
+    r"    - passing `format=\'mixed\'`, and the format will be inferred "
+    r"for each element individually. You might want to use `dayfirst` "
+    r"alongside this."
+)
+
+if PY314:
+    NOT_99 = ", not 99"
+    DAY_IS_OUT_OF_RANGE = (
+        r"day \d{1,2} must be in range 1\.\.\d{1,2} for "
+        r"month \d{1,2} in year \d{4}"
+    )
+else:
+    NOT_99 = ""
+    DAY_IS_OUT_OF_RANGE = "day is out of range for month"
+
+
+class TestTimeConversionFormats:
+    def test_to_datetime_readonly(self, writable):
+        # GH#34857
+        arr = np.array([], dtype=object)
+        arr.setflags(write=writable)
+        result = to_datetime(arr)
+        expected = to_datetime([])
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "format, expected",
+        [
+            [
+                "%d/%m/%Y",
+                [Timestamp("20000101"), Timestamp("20000201"), Timestamp("20000301")],
+            ],
+            [
+                "%m/%d/%Y",
+                [Timestamp("20000101"), Timestamp("20000102"), Timestamp("20000103")],
+            ],
+        ],
+    )
+    def test_to_datetime_format(self, cache, index_or_series, format, expected):
+        values = index_or_series(["1/1/2000", "1/2/2000", "1/3/2000"])
+        result = to_datetime(values, format=format, cache=cache)
+        expected = index_or_series(expected)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arg, expected, format",
+        [
+            ["1/1/2000", "20000101", "%d/%m/%Y"],
+            ["1/1/2000", "20000101", "%m/%d/%Y"],
+            ["1/2/2000", "20000201", "%d/%m/%Y"],
+            ["1/2/2000", "20000102", "%m/%d/%Y"],
+            ["1/3/2000", "20000301", "%d/%m/%Y"],
+            ["1/3/2000", "20000103", "%m/%d/%Y"],
+        ],
+    )
+    def test_to_datetime_format_scalar(self, cache, arg, expected, format):
+        result = to_datetime(arg, format=format, cache=cache)
+        expected = Timestamp(expected)
+        assert result == expected
+
+    def test_to_datetime_format_YYYYMMDD(self, cache):
+        ser = Series([19801222, 19801222] + [19810105] * 5)
+        expected = Series([Timestamp(x) for x in ser.apply(str)])
+
+        result = to_datetime(ser, format="%Y%m%d", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+        result = to_datetime(ser.apply(str), format="%Y%m%d", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_format_YYYYMMDD_with_nat(self, cache):
+        # Explicit cast to float to explicit cast when setting np.nan
+        ser = Series([19801222, 19801222] + [19810105] * 5, dtype="float")
+        # with NaT
+        expected = Series(
+            [Timestamp("19801222"), Timestamp("19801222")]
+            + [Timestamp("19810105")] * 5,
+            dtype="M8[us]",
+        )
+        expected[2] = np.nan
+        ser[2] = np.nan
+
+        result = to_datetime(ser, format="%Y%m%d", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+        # string with NaT
+        ser2 = ser.apply(str)
+        ser2[2] = "nat"
+        with pytest.raises(
+            ValueError,
+            match=(
+                'unconverted data remains when parsing with format "%Y%m%d": ".0". '
+            ),
+        ):
+            # https://github.com/pandas-dev/pandas/issues/50051
+            to_datetime(ser2, format="%Y%m%d", cache=cache)
+
+    def test_to_datetime_format_YYYYMM_with_nat(self, cache):
+        # https://github.com/pandas-dev/pandas/issues/50237
+        # Explicit cast to float to explicit cast when setting np.nan
+        ser = Series([198012, 198012] + [198101] * 5, dtype="float")
+        expected = Series(
+            [Timestamp("19801201"), Timestamp("19801201")]
+            + [Timestamp("19810101")] * 5,
+            dtype="M8[us]",
+        )
+        expected[2] = np.nan
+        ser[2] = np.nan
+        result = to_datetime(ser, format="%Y%m", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache):
+        # coercion
+        # GH 7930, GH 14487
+        ser = Series([20121231, 20141231, 99991231])
+        result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache)
+        expected = Series(
+            np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"),
+            dtype="M8[us]",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_format_YYYYMMDD_coercion(self, cache):
+        # coercion
+        # GH 7930
+        ser = Series([20121231, 20141231, 999999999999999999999999999991231])
+        result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache)
+        expected = Series(["20121231", "20141231", "NaT"], dtype="M8[us]")
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "input_s",
+        [
+            # Null values with Strings
+            ["19801222", "20010112", None],
+            ["19801222", "20010112", np.nan],
+            ["19801222", "20010112", NaT],
+            ["19801222", "20010112", "NaT"],
+            # Null values with Integers
+            [19801222, 20010112, None],
+            [19801222, 20010112, np.nan],
+            [19801222, 20010112, NaT],
+            [19801222, 20010112, "NaT"],
+        ],
+    )
+    def test_to_datetime_format_YYYYMMDD_with_none(self, input_s):
+        # GH 30011
+        # format='%Y%m%d'
+        # with None
+        expected = Series([Timestamp("19801222"), Timestamp("20010112"), NaT])
+        result = Series(to_datetime(input_s, format="%Y%m%d"))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "input_s, expected",
+        [
+            # NaN before strings with invalid date values
+            [
+                ["19801222", np.nan, "20010012", "10019999"],
+                [Timestamp("19801222"), np.nan, np.nan, np.nan],
+            ],
+            # NaN after strings with invalid date values
+            [
+                ["19801222", "20010012", "10019999", np.nan],
+                [Timestamp("19801222"), np.nan, np.nan, np.nan],
+            ],
+            # NaN before integers with invalid date values
+            [
+                [20190813, np.nan, 20010012, 20019999],
+                [Timestamp("20190813"), np.nan, np.nan, np.nan],
+            ],
+            # NaN after integers with invalid date values
+            [
+                [20190813, 20010012, np.nan, 20019999],
+                [Timestamp("20190813"), np.nan, np.nan, np.nan],
+            ],
+        ],
+    )
+    def test_to_datetime_format_YYYYMMDD_overflow(self, input_s, expected):
+        # GH 25512
+        # format='%Y%m%d', errors='coerce'
+        input_s = Series(input_s)
+        result = to_datetime(input_s, format="%Y%m%d", errors="coerce")
+        expected = Series(expected)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data, format, expected",
+        [
+            ([pd.NA], "%Y%m%d%H%M%S", ["NaT"]),
+            ([pd.NA], None, ["NaT"]),
+            (
+                [pd.NA, "20210202202020"],
+                "%Y%m%d%H%M%S",
+                ["NaT", "2021-02-02 20:20:20"],
+            ),
+            (["201010", pd.NA], "%y%m%d", ["2020-10-10", "NaT"]),
+            (["201010", pd.NA], "%d%m%y", ["2010-10-20", "NaT"]),
+            ([None, np.nan, pd.NA], None, ["NaT", "NaT", "NaT"]),
+            ([None, np.nan, pd.NA], "%Y%m%d", ["NaT", "NaT", "NaT"]),
+        ],
+    )
+    def test_to_datetime_with_NA(self, data, format, expected):
+        # GH#42957
+        result = to_datetime(data, format=format)
+        expected = DatetimeIndex(expected)
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_with_NA_with_warning(self):
+        # GH#42957
+        result = to_datetime(["201010", pd.NA])
+        expected = DatetimeIndex(["2010-10-20", "NaT"])
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_format_integer(self, cache):
+        # GH 10178
+        ser = Series([2000, 2001, 2002])
+        expected = Series([Timestamp(x) for x in ser.apply(str)])
+
+        result = to_datetime(ser, format="%Y", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+        ser = Series([200001, 200105, 200206])
+        expected = Series([Timestamp(x[:4] + "-" + x[4:]) for x in ser.apply(str)])
+
+        result = to_datetime(ser, format="%Y%m", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_format_microsecond(self, cache):
+        month_abbr = calendar.month_abbr[4]
+        val = f"01-{month_abbr}-2011 00:00:01.978"
+
+        format = "%d-%b-%Y %H:%M:%S.%f"
+        result = to_datetime(val, format=format, cache=cache)
+        exp = datetime.strptime(val, format)
+        assert result == exp
+
+    @pytest.mark.parametrize(
+        "value, format, dt",
+        [
+            ["01/10/2010 15:20", "%m/%d/%Y %H:%M", Timestamp("2010-01-10 15:20")],
+            ["01/10/2010 05:43", "%m/%d/%Y %I:%M", Timestamp("2010-01-10 05:43")],
+            [
+                "01/10/2010 13:56:01",
+                "%m/%d/%Y %H:%M:%S",
+                Timestamp("2010-01-10 13:56:01"),
+            ],
+            # The 3 tests below are locale-dependent.
+            # They pass, except when the machine locale is zh_CN or it_IT .
+            pytest.param(
+                "01/10/2010 08:14 PM",
+                "%m/%d/%Y %I:%M %p",
+                Timestamp("2010-01-10 20:14"),
+                marks=pytest.mark.xfail(
+                    locale.getlocale()[0] in ("zh_CN", "it_IT"),
+                    reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
+                    strict=False,
+                ),
+            ),
+            pytest.param(
+                "01/10/2010 07:40 AM",
+                "%m/%d/%Y %I:%M %p",
+                Timestamp("2010-01-10 07:40"),
+                marks=pytest.mark.xfail(
+                    locale.getlocale()[0] in ("zh_CN", "it_IT"),
+                    reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
+                    strict=False,
+                ),
+            ),
+            pytest.param(
+                "01/10/2010 09:12:56 AM",
+                "%m/%d/%Y %I:%M:%S %p",
+                Timestamp("2010-01-10 09:12:56"),
+                marks=pytest.mark.xfail(
+                    locale.getlocale()[0] in ("zh_CN", "it_IT"),
+                    reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
+                    strict=False,
+                ),
+            ),
+        ],
+    )
+    def test_to_datetime_format_time(self, cache, value, format, dt):
+        assert to_datetime(value, format=format, cache=cache) == dt
+
+    @td.skip_if_not_us_locale
+    def test_to_datetime_with_non_exact(self, cache):
+        # GH 10834
+        # 8904
+        # exact kw
+        ser = Series(
+            ["19MAY11", "foobar19MAY11", "19MAY11:00:00:00", "19MAY11 00:00:00Z"]
+        )
+        result = to_datetime(ser, format="%d%b%y", exact=False, cache=cache)
+        expected = to_datetime(
+            ser.str.extract(r"(\d+\w+\d+)", expand=False), format="%d%b%y", cache=cache
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "format, expected",
+        [
+            ("%Y-%m-%d", Timestamp(2000, 1, 3)),
+            ("%Y-%d-%m", Timestamp(2000, 3, 1)),
+            ("%Y-%m-%d %H", Timestamp(2000, 1, 3, 12)),
+            ("%Y-%d-%m %H", Timestamp(2000, 3, 1, 12)),
+            ("%Y-%m-%d %H:%M", Timestamp(2000, 1, 3, 12, 34)),
+            ("%Y-%d-%m %H:%M", Timestamp(2000, 3, 1, 12, 34)),
+            ("%Y-%m-%d %H:%M:%S", Timestamp(2000, 1, 3, 12, 34, 56)),
+            ("%Y-%d-%m %H:%M:%S", Timestamp(2000, 3, 1, 12, 34, 56)),
+            ("%Y-%m-%d %H:%M:%S.%f", Timestamp(2000, 1, 3, 12, 34, 56, 123456)),
+            ("%Y-%d-%m %H:%M:%S.%f", Timestamp(2000, 3, 1, 12, 34, 56, 123456)),
+            (
+                "%Y-%m-%d %H:%M:%S.%f%z",
+                Timestamp(2000, 1, 3, 12, 34, 56, 123456, tz="UTC+01:00"),
+            ),
+            (
+                "%Y-%d-%m %H:%M:%S.%f%z",
+                Timestamp(2000, 3, 1, 12, 34, 56, 123456, tz="UTC+01:00"),
+            ),
+        ],
+    )
+    def test_non_exact_doesnt_parse_whole_string(self, cache, format, expected):
+        # https://github.com/pandas-dev/pandas/issues/50412
+        # the formats alternate between ISO8601 and non-ISO8601 to check both paths
+        result = to_datetime(
+            "2000-01-03 12:34:56.123456+01:00", format=format, exact=False
+        )
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "arg",
+        [
+            "2012-01-01 09:00:00.000000001",
+            "2012-01-01 09:00:00.000001",
+            "2012-01-01 09:00:00.001",
+            "2012-01-01 09:00:00.001000",
+            "2012-01-01 09:00:00.001000000",
+        ],
+    )
+    def test_parse_nanoseconds_with_formula(self, cache, arg):
+        # GH8989
+        # truncating the nanoseconds when a format was provided
+        expected = to_datetime(arg, cache=cache)
+        result = to_datetime(arg, format="%Y-%m-%d %H:%M:%S.%f", cache=cache)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "value,fmt,expected",
+        [
+            ["2009324", "%Y%W%w", "2009-08-13"],
+            ["2013020", "%Y%U%w", "2013-01-13"],
+        ],
+    )
+    def test_to_datetime_format_weeks(self, value, fmt, expected, cache):
+        assert to_datetime(value, format=fmt, cache=cache) == Timestamp(expected)
+
+    @pytest.mark.parametrize(
+        "fmt,dates,expected_dates",
+        [
+            [
+                "%Y-%m-%d %H:%M:%S %Z",
+                ["2010-01-01 12:00:00 UTC"] * 2,
+                [Timestamp("2010-01-01 12:00:00", tz="UTC")] * 2,
+            ],
+            [
+                "%Y-%m-%d %H:%M:%S%z",
+                ["2010-01-01 12:00:00+0100"] * 2,
+                [
+                    Timestamp(
+                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=60))
+                    )
+                ]
+                * 2,
+            ],
+            [
+                "%Y-%m-%d %H:%M:%S %z",
+                ["2010-01-01 12:00:00 +0100"] * 2,
+                [
+                    Timestamp(
+                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=60))
+                    )
+                ]
+                * 2,
+            ],
+            [
+                "%Y-%m-%d %H:%M:%S %z",
+                ["2010-01-01 12:00:00 Z", "2010-01-01 12:00:00 Z"],
+                [
+                    Timestamp(
+                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0))
+                    ),
+                    Timestamp(
+                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=0))
+                    ),
+                ],
+            ],
+        ],
+    )
+    def test_to_datetime_parse_tzname_or_tzoffset(self, fmt, dates, expected_dates):
+        # GH 13486
+        result = to_datetime(dates, format=fmt)
+        expected = Index(expected_dates)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "fmt,dates,expected_dates",
+        [
+            [
+                "%Y-%m-%d %H:%M:%S %Z",
+                [
+                    "2010-01-01 12:00:00 UTC",
+                    "2010-01-01 12:00:00 GMT",
+                    "2010-01-01 12:00:00 US/Pacific",
+                ],
+                [
+                    Timestamp("2010-01-01 12:00:00", tz="UTC"),
+                    Timestamp("2010-01-01 12:00:00", tz="GMT"),
+                    Timestamp("2010-01-01 12:00:00", tz="US/Pacific"),
+                ],
+            ],
+            [
+                "%Y-%m-%d %H:%M:%S %z",
+                ["2010-01-01 12:00:00 +0100", "2010-01-01 12:00:00 -0100"],
+                [
+                    Timestamp(
+                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=60))
+                    ),
+                    Timestamp(
+                        "2010-01-01 12:00:00", tzinfo=timezone(timedelta(minutes=-60))
+                    ),
+                ],
+            ],
+        ],
+    )
+    def test_to_datetime_parse_tzname_or_tzoffset_utc_false_removed(
+        self, fmt, dates, expected_dates
+    ):
+        # GH#13486, GH#50887, GH#57275
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(dates, format=fmt)
+
+    def test_to_datetime_parse_tzname_or_tzoffset_different_tz_to_utc(self):
+        # GH 32792
+        dates = [
+            "2010-01-01 12:00:00 +0100",
+            "2010-01-01 12:00:00 -0100",
+            "2010-01-01 12:00:00 +0300",
+            "2010-01-01 12:00:00 +0400",
+        ]
+        expected_dates = [
+            "2010-01-01 11:00:00+00:00",
+            "2010-01-01 13:00:00+00:00",
+            "2010-01-01 09:00:00+00:00",
+            "2010-01-01 08:00:00+00:00",
+        ]
+        fmt = "%Y-%m-%d %H:%M:%S %z"
+
+        result = to_datetime(dates, format=fmt, utc=True)
+        expected = DatetimeIndex(expected_dates)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "offset", ["+0", "-1foo", "UTCbar", ":10", "+01:000:01", ""]
+    )
+    def test_to_datetime_parse_timezone_malformed(self, offset):
+        fmt = "%Y-%m-%d %H:%M:%S %z"
+        date = "2010-01-01 12:00:00 " + offset
+
+        msg = "|".join(
+            [
+                r'^time data ".*" doesn\'t match format ".*". ' f"{PARSING_ERR_MSG}$",
+                r'^unconverted data remains when parsing with format ".*": ".*". '
+                f"{PARSING_ERR_MSG}$",
+            ]
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime([date], format=fmt)
+
+    def test_to_datetime_parse_timezone_keeps_name(self):
+        # GH 21697
+        fmt = "%Y-%m-%d %H:%M:%S %z"
+        arg = Index(["2010-01-01 12:00:00 Z"], name="foo")
+        result = to_datetime(arg, format=fmt)
+        expected = DatetimeIndex(["2010-01-01 12:00:00"], tz="UTC", name="foo")
+        tm.assert_index_equal(result, expected)
+
+
+class TestToDatetime:
+    def test_to_datetime_mixed_string_resos(self):
+        # GH#62801
+        vals = [
+            "2016-01-01 01:02:03",
+            "2016-01-01 01:02:03.001",
+            "2016-01-01 01:02:03.001002",
+            "2016-01-01 01:02:03.001002003",
+        ]
+        expected = DatetimeIndex([Timestamp(x).as_unit("ns") for x in vals])
+
+        result1 = DatetimeIndex(vals)
+        tm.assert_index_equal(result1, expected)
+
+        result2 = to_datetime(vals, format="ISO8601")
+        tm.assert_index_equal(result2, expected)
+
+        result3 = to_datetime(vals, format="mixed")
+        tm.assert_index_equal(result3, expected)
+
+    def test_to_datetime_none(self):
+        # GH#23055
+        assert to_datetime(None) is NaT
+
+    @pytest.mark.filterwarnings("ignore:Could not infer format")
+    def test_to_datetime_overflow(self):
+        # we should get an OutOfBoundsDatetime, NOT OverflowError
+        # TODO: Timestamp raises ValueError("could not convert string to Timestamp")
+        #  can we make these more consistent?
+        arg = "08335394550"
+        msg = 'Parsing "08335394550" to datetime overflows'
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(arg)
+
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime([arg])
+
+        res = to_datetime(arg, errors="coerce")
+        assert res is NaT
+        res = to_datetime([arg], errors="coerce")
+        exp = Index([NaT], dtype="M8[s]")
+        tm.assert_index_equal(res, exp)
+
+    def test_to_datetime_mixed_datetime_and_string(self):
+        # GH#47018 adapted old doctest with new behavior
+        d1 = datetime(2020, 1, 1, 17, tzinfo=timezone(-timedelta(hours=1)))
+        d2 = datetime(2020, 1, 1, 18, tzinfo=timezone(-timedelta(hours=1)))
+        res = to_datetime(["2020-01-01 17:00 -0100", d2])
+        expected = to_datetime([d1, d2]).tz_convert(timezone(timedelta(minutes=-60)))
+        tm.assert_index_equal(res, expected)
+
+    def test_to_datetime_mixed_string_and_numeric(self):
+        # GH#55780 np.array(vals) would incorrectly cast the number to str
+        vals = ["2016-01-01", 0]
+        expected = DatetimeIndex([Timestamp(x) for x in vals])
+        result = to_datetime(vals, format="mixed")
+        result2 = to_datetime(vals[::-1], format="mixed")[::-1]
+        result3 = DatetimeIndex(vals)
+        result4 = DatetimeIndex(vals[::-1])[::-1]
+
+        tm.assert_index_equal(result, expected)
+        tm.assert_index_equal(result2, expected)
+        tm.assert_index_equal(result3, expected)
+        tm.assert_index_equal(result4, expected)
+
+    @pytest.mark.parametrize(
+        "format", ["%Y-%m-%d", "%Y-%d-%m"], ids=["ISO8601", "non-ISO8601"]
+    )
+    def test_to_datetime_mixed_date_and_string(self, format):
+        # https://github.com/pandas-dev/pandas/issues/50108
+        d1 = date(2020, 1, 2)
+        res = to_datetime(["2020-01-01", d1], format=format)
+        expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[us]")
+        tm.assert_index_equal(res, expected)
+
+    @pytest.mark.parametrize(
+        "fmt",
+        ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"],
+        ids=["non-ISO8601 format", "ISO8601 format"],
+    )
+    @pytest.mark.parametrize(
+        "utc, args, expected",
+        [
+            pytest.param(
+                True,
+                ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"],
+                DatetimeIndex(
+                    ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"],
+                    dtype="datetime64[us, UTC]",
+                ),
+                id="all tz-aware, with utc",
+            ),
+            pytest.param(
+                False,
+                ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
+                DatetimeIndex(
+                    ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
+                ).as_unit("us"),
+                id="all tz-aware, without utc",
+            ),
+            pytest.param(
+                True,
+                ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"],
+                DatetimeIndex(
+                    ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"],
+                    dtype="datetime64[us, UTC]",
+                ),
+                id="all tz-aware, mixed offsets, with utc",
+            ),
+            pytest.param(
+                True,
+                ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"],
+                DatetimeIndex(
+                    ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"],
+                    dtype="datetime64[us, UTC]",
+                ),
+                id="tz-aware string, naive pydatetime, with utc",
+            ),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "constructor",
+        [Timestamp, lambda x: Timestamp(x).to_pydatetime()],
+    )
+    def test_to_datetime_mixed_datetime_and_string_with_format(
+        self, fmt, utc, args, expected, constructor
+    ):
+        # https://github.com/pandas-dev/pandas/issues/49298
+        # https://github.com/pandas-dev/pandas/issues/50254
+        # note: ISO8601 formats go down a fastpath, so we need to check both
+        # an ISO8601 format and a non-ISO8601 one
+        ts1 = constructor(args[0])
+        ts2 = args[1]
+        result = to_datetime([ts1, ts2], format=fmt, utc=utc)
+        if constructor is Timestamp:
+            expected = expected.as_unit("us")
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "fmt",
+        ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"],
+        ids=["non-ISO8601 format", "ISO8601 format"],
+    )
+    @pytest.mark.parametrize(
+        "constructor",
+        [Timestamp, lambda x: Timestamp(x).to_pydatetime()],
+    )
+    def test_to_datetime_mixed_dt_and_str_with_format_mixed_offsets_utc_false_removed(
+        self, fmt, constructor
+    ):
+        # https://github.com/pandas-dev/pandas/issues/49298
+        # https://github.com/pandas-dev/pandas/issues/50254
+        # GH#57275
+        # note: ISO8601 formats go down a fastpath, so we need to check both
+        # an ISO8601 format and a non-ISO8601 one
+        args = ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"]
+        ts1 = constructor(args[0])
+        ts2 = args[1]
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+
+        with pytest.raises(ValueError, match=msg):
+            to_datetime([ts1, ts2], format=fmt, utc=False)
+
+    @pytest.mark.parametrize(
+        "fmt, expected",
+        [
+            pytest.param(
+                "%Y-%m-%d %H:%M:%S%z",
+                [
+                    Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"),
+                    Timestamp("2000-01-02 02:00:00+0200", tz="UTC+02:00"),
+                    NaT,
+                ],
+                id="ISO8601, non-UTC",
+            ),
+            pytest.param(
+                "%Y-%d-%m %H:%M:%S%z",
+                [
+                    Timestamp("2000-01-01 09:00:00+0100", tz="UTC+01:00"),
+                    Timestamp("2000-02-01 02:00:00+0200", tz="UTC+02:00"),
+                    NaT,
+                ],
+                id="non-ISO8601, non-UTC",
+            ),
+        ],
+    )
+    def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed(
+        self, fmt, expected
+    ):
+        # https://github.com/pandas-dev/pandas/issues/50071
+        # GH#57275
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(
+                ["2000-01-01 09:00:00+01:00", "2000-01-02 02:00:00+02:00", None],
+                format=fmt,
+                utc=False,
+            )
+
+    @pytest.mark.parametrize(
+        "fmt, expected",
+        [
+            pytest.param(
+                "%Y-%m-%d %H:%M:%S%z",
+                DatetimeIndex(
+                    ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"],
+                    dtype="datetime64[us, UTC]",
+                ),
+                id="ISO8601, UTC",
+            ),
+            pytest.param(
+                "%Y-%d-%m %H:%M:%S%z",
+                DatetimeIndex(
+                    ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"],
+                    dtype="datetime64[us, UTC]",
+                ),
+                id="non-ISO8601, UTC",
+            ),
+        ],
+    )
+    def test_to_datetime_mixed_offsets_with_none(self, fmt, expected):
+        # https://github.com/pandas-dev/pandas/issues/50071
+        result = to_datetime(
+            ["2000-01-01 09:00:00+01:00", "2000-01-02 02:00:00+02:00", None],
+            format=fmt,
+            utc=True,
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "fmt",
+        ["%Y-%d-%m %H:%M:%S%z", "%Y-%m-%d %H:%M:%S%z"],
+        ids=["non-ISO8601 format", "ISO8601 format"],
+    )
+    @pytest.mark.parametrize(
+        "args",
+        [
+            pytest.param(
+                ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-07:00"],
+                id="all tz-aware, mixed timezones, without utc",
+            ),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "constructor",
+        [Timestamp, lambda x: Timestamp(x).to_pydatetime()],
+    )
+    def test_to_datetime_mixed_datetime_and_string_with_format_raises(
+        self, fmt, args, constructor
+    ):
+        # https://github.com/pandas-dev/pandas/issues/49298
+        # note: ISO8601 formats go down a fastpath, so we need to check both
+        # an ISO8601 format and a non-ISO8601 one
+        ts1 = constructor(args[0])
+        ts2 = constructor(args[1])
+        with pytest.raises(
+            ValueError, match="cannot be converted to datetime64 unless utc=True"
+        ):
+            to_datetime([ts1, ts2], format=fmt, utc=False)
+
+    def test_to_datetime_np_str(self):
+        # GH#32264
+        # GH#48969
+        value = np.str_("2019-02-04 10:18:46.297000+0000")
+
+        ser = Series([value])
+
+        exp = Timestamp("2019-02-04 10:18:46.297000", tz="UTC")
+
+        assert to_datetime(value) == exp
+        assert to_datetime(ser.iloc[0]) == exp
+
+        res = to_datetime([value])
+        expected = Index([exp])
+        tm.assert_index_equal(res, expected)
+
+        res = to_datetime(ser)
+        expected = Series(expected)
+        tm.assert_series_equal(res, expected)
+
+    @pytest.mark.parametrize(
+        "s, _format, dt",
+        [
+            ["2015-1-1", "%G-%V-%u", datetime(2014, 12, 29, 0, 0)],
+            ["2015-1-4", "%G-%V-%u", datetime(2015, 1, 1, 0, 0)],
+            ["2015-1-7", "%G-%V-%u", datetime(2015, 1, 4, 0, 0)],
+            ["2024-52-1", "%G-%V-%u", datetime(2024, 12, 23, 0, 0)],
+            ["2024-52-7", "%G-%V-%u", datetime(2024, 12, 29, 0, 0)],
+            ["2025-1-1", "%G-%V-%u", datetime(2024, 12, 30, 0, 0)],
+            ["2020-53-1", "%G-%V-%u", datetime(2020, 12, 28, 0, 0)],
+        ],
+    )
+    def test_to_datetime_iso_week_year_format(self, s, _format, dt):
+        # See GH#16607
+        assert to_datetime(s, format=_format) == dt
+
+    @pytest.mark.parametrize(
+        "msg, s, _format",
+        [
+            [
+                "Week 53 does not exist in ISO year 2024",
+                "2024 53 1",
+                "%G %V %u",
+            ],
+            [
+                "Week 53 does not exist in ISO year 2023",
+                "2023 53 1",
+                "%G %V %u",
+            ],
+        ],
+    )
+    def test_invalid_iso_week_53(self, msg, s, _format):
+        # See GH#60885
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(s, format=_format)
+
+    @pytest.mark.parametrize(
+        "msg, s, _format",
+        [
+            [
+                "ISO week directive '%V' is incompatible with the year directive "
+                "'%Y'. Use the ISO year '%G' instead.",
+                "1999 50",
+                "%Y %V",
+            ],
+            [
+                "ISO year directive '%G' must be used with the ISO week directive "
+                "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 51",
+                "%G %V",
+            ],
+            [
+                "ISO year directive '%G' must be used with the ISO week directive "
+                "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 Monday",
+                "%G %A",
+            ],
+            [
+                "ISO year directive '%G' must be used with the ISO week directive "
+                "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 Mon",
+                "%G %a",
+            ],
+            [
+                "ISO year directive '%G' must be used with the ISO week directive "
+                "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 6",
+                "%G %w",
+            ],
+            [
+                "ISO year directive '%G' must be used with the ISO week directive "
+                "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 6",
+                "%G %u",
+            ],
+            [
+                "ISO year directive '%G' must be used with the ISO week directive "
+                "'%V' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "2051",
+                "%G",
+            ],
+            [
+                "Day of the year directive '%j' is not compatible with ISO year "
+                "directive '%G'. Use '%Y' instead.",
+                "1999 51 6 256",
+                "%G %V %u %j",
+            ],
+            [
+                "ISO week directive '%V' is incompatible with the year directive "
+                "'%Y'. Use the ISO year '%G' instead.",
+                "1999 51 Sunday",
+                "%Y %V %A",
+            ],
+            [
+                "ISO week directive '%V' is incompatible with the year directive "
+                "'%Y'. Use the ISO year '%G' instead.",
+                "1999 51 Sun",
+                "%Y %V %a",
+            ],
+            [
+                "ISO week directive '%V' is incompatible with the year directive "
+                "'%Y'. Use the ISO year '%G' instead.",
+                "1999 51 1",
+                "%Y %V %w",
+            ],
+            [
+                "ISO week directive '%V' is incompatible with the year directive "
+                "'%Y'. Use the ISO year '%G' instead.",
+                "1999 51 1",
+                "%Y %V %u",
+            ],
+            [
+                "ISO week directive '%V' must be used with the ISO year directive "
+                "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "20",
+                "%V",
+            ],
+            [
+                "ISO week directive '%V' must be used with the ISO year directive "
+                "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 51 Sunday",
+                "%V %A",
+            ],
+            [
+                "ISO week directive '%V' must be used with the ISO year directive "
+                "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 51 Sun",
+                "%V %a",
+            ],
+            [
+                "ISO week directive '%V' must be used with the ISO year directive "
+                "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 51 1",
+                "%V %w",
+            ],
+            [
+                "ISO week directive '%V' must be used with the ISO year directive "
+                "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "1999 51 1",
+                "%V %u",
+            ],
+            [
+                "Day of the year directive '%j' is not compatible with ISO year "
+                "directive '%G'. Use '%Y' instead.",
+                "1999 50",
+                "%G %j",
+            ],
+            [
+                "ISO week directive '%V' must be used with the ISO year directive "
+                "'%G' and a weekday directive '%A', '%a', '%w', or '%u'.",
+                "20 Monday",
+                "%V %A",
+            ],
+        ],
+    )
+    @pytest.mark.parametrize("errors", ["raise", "coerce"])
+    def test_error_iso_week_year(self, msg, s, _format, errors):
+        # See GH#16607, GH#50308
+        # This test checks for errors thrown when giving the wrong format
+        # However, as discussed on PR#25541, overriding the locale
+        # causes a different error to be thrown due to the format being
+        # locale specific, but the test data is in english.
+        # Therefore, the tests only run when locale is not overwritten,
+        # as a sort of solution to this problem.
+        if locale.getlocale() != ("zh_CN", "UTF-8") and locale.getlocale() != (
+            "it_IT",
+            "UTF-8",
+        ):
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(s, format=_format, errors=errors)
+
+    @pytest.mark.parametrize("tz", [None, "US/Central"])
+    def test_to_datetime_dtarr(self, tz):
+        # DatetimeArray
+        dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz)
+        arr = dti._data
+
+        result = to_datetime(arr)
+        assert result is arr
+
+    # Doesn't work on Windows since tzpath not set correctly
+    @td.skip_if_windows
+    @pytest.mark.parametrize("utc", [True, False])
+    @pytest.mark.parametrize("tz", [None, "US/Central"])
+    def test_to_datetime_arrow(self, tz, utc, index_or_series):
+        pa = pytest.importorskip("pyarrow")
+
+        dti = date_range("1965-04-03", periods=19, freq="2W", tz=tz)
+        dti = index_or_series(dti)
+
+        dti_arrow = dti.astype(pd.ArrowDtype(pa.timestamp(unit="ns", tz=tz)))
+
+        result = to_datetime(dti_arrow, utc=utc)
+        expected = to_datetime(dti, utc=utc).astype(
+            pd.ArrowDtype(pa.timestamp(unit="ns", tz=tz if not utc else "UTC"))
+        )
+        if not utc and index_or_series is not Series:
+            # Doesn't hold for utc=True, since that will astype
+            # to_datetime also returns a new object for series
+            assert result is dti_arrow
+        if index_or_series is Series:
+            tm.assert_series_equal(result, expected)
+        else:
+            tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_pydatetime(self):
+        actual = to_datetime(datetime(2008, 1, 15))
+        assert actual == datetime(2008, 1, 15)
+
+    def test_to_datetime_YYYYMMDD(self):
+        actual = to_datetime("20080115")
+        assert actual == datetime(2008, 1, 15)
+
+    @td.skip_if_windows  # `tm.set_timezone` does not work in windows
+    @pytest.mark.skipif(WASM, reason="tzset is not available on WASM")
+    def test_to_datetime_now(self):
+        # See GH#18666
+        with tm.set_timezone("US/Eastern"):
+            # GH#18705
+            now = Timestamp("now")
+            pdnow = to_datetime("now")
+            pdnow2 = to_datetime(["now"])[0]
+
+            # These should all be equal with infinite perf; this gives
+            # a generous margin of 10 seconds
+            assert abs(pdnow._value - now._value) < 1e10
+            assert abs(pdnow2._value - now._value) < 1e10
+
+            assert pdnow.tzinfo is None
+            assert pdnow2.tzinfo is None
+
+    @td.skip_if_windows  # `tm.set_timezone` does not work on Windows
+    @pytest.mark.skipif(WASM, reason="tzset is not available on WASM")
+    @pytest.mark.parametrize("tz", ["Pacific/Auckland", "US/Samoa"])
+    def test_to_datetime_today(self, tz):
+        # See GH#18666
+        # Test with one timezone far ahead of UTC and another far behind, so
+        # one of these will _almost_ always be in a different day from UTC.
+        # Unfortunately this test between 12 and 1 AM Samoa time
+        # this both of these timezones _and_ UTC will all be in the same day,
+        # so this test will not detect the regression introduced in #18666.
+        with tm.set_timezone(tz):
+            nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64)
+            pdtoday = to_datetime("today")
+            pdtoday2 = to_datetime(["today"])[0]
+
+            tstoday = Timestamp("today")
+            tstoday2 = Timestamp.today()
+
+            # These should all be equal with infinite perf; this gives
+            # a generous margin of 10 seconds
+            assert abs(pdtoday.normalize()._value - nptoday) < 1e10
+            assert abs(pdtoday2.normalize()._value - nptoday) < 1e10
+            assert abs(pdtoday._value - tstoday._value) < 1e10
+            assert abs(pdtoday._value - tstoday2._value) < 1e10
+
+            assert pdtoday.tzinfo is None
+            assert pdtoday2.tzinfo is None
+
+    @pytest.mark.parametrize("arg", ["now", "today"])
+    def test_to_datetime_today_now_unicode_bytes(self, arg):
+        to_datetime([arg])
+
+    @pytest.mark.filterwarnings(
+        "ignore:Timestamp.utcnow is deprecated:DeprecationWarning"
+    )
+    @pytest.mark.skipif(WASM, reason="tzset is not available on WASM")
+    @pytest.mark.parametrize(
+        "format, expected_ds",
+        [
+            ("%Y-%m-%d %H:%M:%S%z", "2020-01-03"),
+            ("%Y-%d-%m %H:%M:%S%z", "2020-03-01"),
+            (None, "2020-01-03"),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "string, attribute",
+        [
+            ("now", "utcnow"),
+            ("today", "today"),
+        ],
+    )
+    def test_to_datetime_now_with_format(self, format, expected_ds, string, attribute):
+        # https://github.com/pandas-dev/pandas/issues/50359
+        result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True)
+        expected = DatetimeIndex(
+            [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]"
+        )
+        assert (expected - result).max().total_seconds() < 1
+
+    @pytest.mark.parametrize(
+        "dt", [np.datetime64("2000-01-01"), np.datetime64("2000-01-02")]
+    )
+    def test_to_datetime_dt64s(self, cache, dt):
+        assert to_datetime(dt, cache=cache) == Timestamp(dt)
+
+    @pytest.mark.parametrize(
+        "arg, format",
+        [
+            ("2001-01-01", "%Y-%m-%d"),
+            ("01-01-2001", "%d-%m-%Y"),
+        ],
+    )
+    def test_to_datetime_dt64s_and_str(self, arg, format):
+        # https://github.com/pandas-dev/pandas/issues/50036
+        result = to_datetime([arg, np.datetime64("2020-01-01")], format=format)
+        expected = DatetimeIndex(["2001-01-01", "2020-01-01"])
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "dt", [np.datetime64("1000-01-01"), np.datetime64("5000-01-02")]
+    )
+    @pytest.mark.parametrize("errors", ["raise", "coerce"])
+    def test_to_datetime_dt64s_out_of_ns_bounds(self, cache, dt, errors):
+        # GH#50369 We cast to the nearest supported reso, i.e. "s"
+        ts = to_datetime(dt, errors=errors, cache=cache)
+        assert isinstance(ts, Timestamp)
+        assert ts.unit == "s"
+        assert ts.asm8 == dt
+
+        ts = Timestamp(dt)
+        assert ts.unit == "s"
+        assert ts.asm8 == dt
+
+    def test_to_datetime_dt64d_out_of_bounds(self, cache):
+        dt64 = np.datetime64(np.iinfo(np.int64).max, "D")
+
+        msg = "Out of bounds second timestamp: 25252734927768524-07-27"
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            Timestamp(dt64)
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(dt64, errors="raise", cache=cache)
+
+        assert to_datetime(dt64, errors="coerce", cache=cache) is NaT
+
+    @pytest.mark.parametrize("unit", ["s", "D"])
+    def test_to_datetime_array_of_dt64s(self, cache, unit):
+        # https://github.com/pandas-dev/pandas/issues/31491
+        # Need at least 50 to ensure cache is used.
+        dts = [
+            np.datetime64("2000-01-01", unit),
+            np.datetime64("2000-01-02", unit),
+        ] * 30
+        # Assuming all datetimes are in bounds, to_datetime() returns
+        # an array that is equal to Timestamp() parsing
+        result = to_datetime(dts, cache=cache)
+        expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]")
+
+        tm.assert_index_equal(result, expected)
+
+        # A list of datetimes where the last one is out of bounds
+        dts_with_oob = [*dts, np.datetime64("9999-01-01")]
+
+        # As of GH#51978 we do not raise in this case
+        to_datetime(dts_with_oob, errors="raise")
+
+        result = to_datetime(dts_with_oob, errors="coerce", cache=cache)
+        expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]"))
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_tz(self, cache):
+        # xref 8260
+        # uniform returns a DatetimeIndex
+        arr = [
+            Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"),
+            Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"),
+        ]
+        result = to_datetime(arr, cache=cache)
+        expected = DatetimeIndex(
+            ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific"
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_tz_mixed(self, cache):
+        # mixed tzs will raise if errors='raise'
+        # https://github.com/pandas-dev/pandas/issues/50585
+        arr = [
+            Timestamp("2013-01-01 13:00:00", tz="US/Pacific"),
+            Timestamp("2013-01-02 14:00:00", tz="US/Eastern"),
+        ]
+        msg = (
+            "Tz-aware datetime.datetime cannot be "
+            "converted to datetime64 unless utc=True"
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(arr, cache=cache)
+
+        result = to_datetime(arr, cache=cache, errors="coerce")
+        expected = DatetimeIndex(
+            ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[us, US/Pacific]"
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_different_offsets_removed(self, cache):
+        # inspired by asv timeseries.ToDatetimeNONISO8601 benchmark
+        # see GH-26097 for more
+        # GH#57275
+        ts_string_1 = "March 1, 2018 12:00:00+0400"
+        ts_string_2 = "March 1, 2018 12:00:00+0500"
+        arr = [ts_string_1] * 5 + [ts_string_2] * 5
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(arr, cache=cache)
+
+    def test_to_datetime_tz_pytz(self, cache):
+        # see gh-8260
+        pytz = pytest.importorskip("pytz")
+        us_eastern = pytz.timezone("US/Eastern")
+        arr = np.array(
+            [
+                us_eastern.localize(
+                    datetime(year=2000, month=1, day=1, hour=3, minute=0)
+                ),
+                us_eastern.localize(
+                    datetime(year=2000, month=6, day=1, hour=3, minute=0)
+                ),
+            ],
+            dtype=object,
+        )
+        result = to_datetime(arr, utc=True, cache=cache)
+        expected = DatetimeIndex(
+            ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"],
+            dtype="datetime64[us, UTC]",
+            freq=None,
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "init_constructor, end_constructor",
+        [
+            (Index, DatetimeIndex),
+            (list, DatetimeIndex),
+            (np.array, DatetimeIndex),
+            (Series, Series),
+        ],
+    )
+    def test_to_datetime_utc_true(self, cache, init_constructor, end_constructor):
+        # See gh-11934 & gh-6415
+        data = ["20100102 121314", "20100102 121315"]
+        expected_data = [
+            Timestamp("2010-01-02 12:13:14", tz="utc"),
+            Timestamp("2010-01-02 12:13:15", tz="utc"),
+        ]
+
+        result = to_datetime(
+            init_constructor(data), format="%Y%m%d %H%M%S", utc=True, cache=cache
+        )
+        expected = end_constructor(expected_data)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "scalar, expected",
+        [
+            ["20100102 121314", Timestamp("2010-01-02 12:13:14", tz="utc")],
+            ["20100102 121315", Timestamp("2010-01-02 12:13:15", tz="utc")],
+        ],
+    )
+    def test_to_datetime_utc_true_scalar(self, cache, scalar, expected):
+        # Test scalar case as well
+        result = to_datetime(scalar, format="%Y%m%d %H%M%S", utc=True, cache=cache)
+        assert result == expected
+
+    def test_to_datetime_utc_true_with_series_single_value(self, cache):
+        # GH 15760 UTC=True with Series
+        ts = 1.5e18
+        result = to_datetime(Series([ts]), utc=True, cache=cache)
+        expected = Series([Timestamp(ts, tz="utc")])
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_utc_true_with_series_tzaware_string(self, cache):
+        ts = "2013-01-01 00:00:00-01:00"
+        expected_ts = "2013-01-01 01:00:00"
+        data = Series([ts] * 3)
+        result = to_datetime(data, utc=True, cache=cache)
+        expected = Series([Timestamp(expected_ts, tz="utc")] * 3)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "date, dtype",
+        [
+            ("2013-01-01 01:00:00", "datetime64[ns]"),
+            ("2013-01-01 01:00:00", "datetime64[ns, UTC]"),
+        ],
+    )
+    def test_to_datetime_utc_true_with_series_datetime_ns(self, cache, date, dtype):
+        expected = Series(
+            [Timestamp("2013-01-01 01:00:00", tz="UTC")], dtype="M8[ns, UTC]"
+        )
+        result = to_datetime(Series([date], dtype=dtype), utc=True, cache=cache)
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_tz_psycopg2(self, request, cache):
+        # xref 8260
+        psycopg2_tz = pytest.importorskip("psycopg2.tz")
+
+        # misc cases
+        tz1 = psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None)
+        tz2 = psycopg2_tz.FixedOffsetTimezone(offset=-240, name=None)
+        arr = np.array(
+            [
+                datetime(2000, 1, 1, 3, 0, tzinfo=tz1),
+                datetime(2000, 6, 1, 3, 0, tzinfo=tz2),
+            ],
+            dtype=object,
+        )
+
+        result = to_datetime(arr, errors="coerce", utc=True, cache=cache)
+        expected = DatetimeIndex(
+            ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"],
+            dtype="datetime64[us, UTC]",
+            freq=None,
+        )
+        tm.assert_index_equal(result, expected)
+
+        # dtype coercion
+        i = DatetimeIndex(
+            ["2000-01-01 08:00:00"],
+            tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None),
+        ).as_unit("us")
+        assert not is_datetime64_ns_dtype(i)
+
+        # tz coercion
+        result = to_datetime(i, errors="coerce", cache=cache)
+        tm.assert_index_equal(result, i)
+
+        result = to_datetime(i, errors="coerce", utc=True, cache=cache)
+        expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]")
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("arg", [True, False])
+    def test_datetime_bool(self, cache, arg):
+        # GH13176
+        msg = r"dtype bool cannot be converted to datetime64\[ns\]"
+        with pytest.raises(TypeError, match=msg):
+            to_datetime(arg)
+        assert to_datetime(arg, errors="coerce", cache=cache) is NaT
+
+    def test_datetime_bool_arrays_mixed(self, cache):
+        msg = f"{type(cache)} is not convertible to datetime"
+        with pytest.raises(TypeError, match=msg):
+            to_datetime([False, datetime.today()], cache=cache)
+        with pytest.raises(
+            ValueError,
+            match=(
+                r'^time data "True" doesn\'t match format "%Y%m%d". '
+                f"{PARSING_ERR_MSG}$"
+            ),
+        ):
+            to_datetime(["20130101", True], cache=cache)
+        tm.assert_index_equal(
+            to_datetime([0, False, NaT, 0.0], errors="coerce", cache=cache),
+            DatetimeIndex(
+                [to_datetime(0, cache=cache), NaT, NaT, to_datetime(0, cache=cache)]
+            ),
+        )
+
+    @pytest.mark.parametrize("arg", [bool, to_datetime])
+    def test_datetime_invalid_datatype(self, arg):
+        # GH13176
+        msg = "is not convertible to datetime"
+        with pytest.raises(TypeError, match=msg):
+            to_datetime(arg)
+
+    @pytest.mark.parametrize("errors", ["coerce", "raise"])
+    def test_invalid_format_raises(self, errors):
+        # https://github.com/pandas-dev/pandas/issues/50255
+        with pytest.raises(
+            ValueError, match="':' is a bad directive in format 'H%:M%:S%"
+        ):
+            to_datetime(["00:00:00"], format="H%:M%:S%", errors=errors)
+
+    @pytest.mark.parametrize("value", ["a", "00:01:99"])
+    @pytest.mark.parametrize("format", [None, "%H:%M:%S"])
+    def test_datetime_invalid_scalar(self, value, format):
+        # GH24763
+        res = to_datetime(value, errors="coerce", format=format)
+        assert res is NaT
+
+        msg = "|".join(
+            [
+                r'^time data "a" doesn\'t match format "%H:%M:%S". '
+                f"{PARSING_ERR_MSG}$",
+                r'^Given date string "a" not likely a datetime$',
+                r'^unconverted data remains when parsing with format "%H:%M:%S": "9". '
+                f"{PARSING_ERR_MSG}$",
+                rf"^second must be in 0..59{NOT_99}: 00:01:99$",
+            ]
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(value, errors="raise", format=format)
+
+    @pytest.mark.parametrize("value", ["3000/12/11 00:00:00"])
+    @pytest.mark.parametrize("format", [None, "%H:%M:%S"])
+    def test_datetime_outofbounds_scalar(self, value, format):
+        # GH24763
+        res = to_datetime(value, errors="coerce", format=format)
+        if format is None:
+            assert isinstance(res, Timestamp)
+            assert res == Timestamp(value)
+        else:
+            assert res is NaT
+
+        if format is not None:
+            msg = r'^time data ".*" doesn\'t match format ".*"'
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(value, errors="raise", format=format)
+        else:
+            res = to_datetime(value, errors="raise", format=format)
+            assert isinstance(res, Timestamp)
+            assert res == Timestamp(value)
+
+    @pytest.mark.parametrize(
+        ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])]
+    )
+    @pytest.mark.parametrize("format", [(None), ("%H:%M:%S")])
+    def test_datetime_invalid_index(self, values, format):
+        # GH24763
+        # Not great to have logic in tests, but this one's hard to
+        # parametrise over
+        if format is None and len(values) > 1:
+            warn = UserWarning
+        else:
+            warn = None
+
+        with tm.assert_produces_warning(
+            warn, match="Could not infer format", raise_on_extra_warnings=False
+        ):
+            res = to_datetime(values, errors="coerce", format=format)
+        tm.assert_index_equal(res, DatetimeIndex([NaT] * len(values)))
+
+        msg = "|".join(
+            [
+                r'^Given date string "a" not likely a datetime$',
+                r'^time data "a" doesn\'t match format "%H:%M:%S". '
+                f"{PARSING_ERR_MSG}$",
+                r'^unconverted data remains when parsing with format "%H:%M:%S": "9". '
+                f"{PARSING_ERR_MSG}$",
+                rf"^second must be in 0..59{NOT_99}: 00:01:99$",
+            ]
+        )
+        with pytest.raises(ValueError, match=msg):
+            with tm.assert_produces_warning(
+                warn, match="Could not infer format", raise_on_extra_warnings=False
+            ):
+                to_datetime(values, errors="raise", format=format)
+
+    @pytest.mark.parametrize("utc", [True, None])
+    @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None])
+    @pytest.mark.parametrize("constructor", [list, tuple, np.array, Index, deque])
+    def test_to_datetime_cache(self, utc, format, constructor):
+        date = "20130101 00:00:00"
+        test_dates = [date] * 10**5
+        data = constructor(test_dates)
+
+        result = to_datetime(data, utc=utc, format=format, cache=True)
+        expected = to_datetime(data, utc=utc, format=format, cache=False)
+
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_from_deque(self):
+        # GH 29403
+        result = to_datetime(deque([Timestamp("2010-06-02 09:30:00")] * 51))
+        expected = to_datetime([Timestamp("2010-06-02 09:30:00")] * 51)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("utc", [True, None])
+    @pytest.mark.parametrize("format", ["%Y%m%d %H:%M:%S", None])
+    def test_to_datetime_cache_series(self, utc, format):
+        date = "20130101 00:00:00"
+        test_dates = [date] * 10**5
+        data = Series(test_dates)
+        result = to_datetime(data, utc=utc, format=format, cache=True)
+        expected = to_datetime(data, utc=utc, format=format, cache=False)
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_cache_scalar(self):
+        date = "20130101 00:00:00"
+        result = to_datetime(date, cache=True)
+        expected = Timestamp("20130101 00:00:00")
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "datetimelikes,expected_values,exp_unit",
+        (
+            (
+                (None, np.nan) + (NaT,) * start_caching_at,
+                (NaT,) * (start_caching_at + 2),
+                "s",
+            ),
+            (
+                (None, Timestamp("2012-07-26").as_unit("s"))
+                + (NaT,) * start_caching_at,
+                (NaT, Timestamp("2012-07-26").as_unit("s")) + (NaT,) * start_caching_at,
+                "s",
+            ),
+            (
+                (None,)
+                + (NaT,) * start_caching_at
+                + ("2012 July 26", Timestamp("2012-07-26")),
+                (NaT,) * (start_caching_at + 1)
+                + (Timestamp("2012-07-26"), Timestamp("2012-07-26")),
+                "us",
+            ),
+        ),
+    )
+    def test_convert_object_to_datetime_with_cache(
+        self, datetimelikes, expected_values, exp_unit
+    ):
+        # GH#39882
+        ser = Series(
+            datetimelikes,
+            dtype="object",
+        )
+        result_series = to_datetime(ser, errors="coerce")
+        expected_series = Series(
+            expected_values,
+            dtype=f"datetime64[{exp_unit}]",
+        )
+        tm.assert_series_equal(result_series, expected_series)
+
+    @pytest.mark.parametrize(
+        "input",
+        [
+            Series([NaT] * 20 + [None] * 20, dtype="object"),
+            Series([NaT] * 60 + [None] * 60, dtype="object"),
+            Series([None] * 20),
+            Series([None] * 60),
+            Series([""] * 20),
+            Series([""] * 60),
+            Series([pd.NA] * 20),
+            Series([pd.NA] * 60),
+            Series([np.nan] * 20),
+            Series([np.nan] * 60),
+        ],
+    )
+    def test_to_datetime_converts_null_like_to_nat(self, cache, input):
+        # GH35888
+        expected = Series([NaT] * len(input), dtype="M8[s]")
+        result = to_datetime(input, cache=cache)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "date, format",
+        [
+            ("2017-20", "%Y-%W"),
+            ("20 Sunday", "%W %A"),
+            ("20 Sun", "%W %a"),
+            ("2017-21", "%Y-%U"),
+            ("20 Sunday", "%U %A"),
+            ("20 Sun", "%U %a"),
+        ],
+    )
+    def test_week_without_day_and_calendar_year(self, date, format):
+        # GH16774
+
+        msg = "Cannot use '%W' or '%U' without day and year"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(date, format=format)
+
+    def test_to_datetime_coerce(self):
+        # GH#26122, GH#57275
+        ts_strings = [
+            "March 1, 2018 12:00:00+0400",
+            "March 1, 2018 12:00:00+0500",
+            "20100240",
+        ]
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(ts_strings, errors="coerce")
+
+    @pytest.mark.parametrize(
+        "string_arg, format",
+        [("March 1, 2018", "%B %d, %Y"), ("2018-03-01", "%Y-%m-%d")],
+    )
+    @pytest.mark.parametrize(
+        "outofbounds",
+        [
+            datetime(9999, 1, 1),
+            date(9999, 1, 1),
+            np.datetime64("9999-01-01"),
+            "January 1, 9999",
+            "9999-01-01",
+        ],
+    )
+    def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds):
+        # https://github.com/pandas-dev/pandas/issues/50255
+        ts_strings = [string_arg, outofbounds]
+        result = to_datetime(ts_strings, errors="coerce", format=format)
+        if isinstance(outofbounds, str) and (
+            format.startswith("%B") ^ outofbounds.startswith("J")
+        ):
+            # the strings don't match the given format, so they raise and we coerce
+            expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[us]")
+        elif isinstance(outofbounds, datetime):
+            expected = DatetimeIndex(
+                [datetime(2018, 3, 1), outofbounds], dtype="M8[us]"
+            )
+        else:
+            expected = DatetimeIndex(
+                [datetime(2018, 3, 1), outofbounds], dtype="M8[us]"
+            )
+        tm.assert_index_equal(result, expected)
+
+    def test_to_datetime_malformed_no_raise(self):
+        # GH 28299
+        # GH 48633
+        ts_strings = ["200622-12-31", "111111-24-11"]
+        with tm.assert_produces_warning(
+            UserWarning, match="Could not infer format", raise_on_extra_warnings=False
+        ):
+            result = to_datetime(ts_strings, errors="coerce")
+        # TODO: should Index get "s" by default here?
+        exp = Index([NaT, NaT], dtype="M8[s]")
+        tm.assert_index_equal(result, exp)
+
+    def test_to_datetime_malformed_raise(self):
+        # GH 48633
+        ts_strings = ["200622-12-31", "111111-24-11"]
+        msg = (
+            'Parsed string "200622-12-31" gives an invalid tzoffset, which must '
+            r"be between -timedelta\(hours=24\) and timedelta\(hours=24\)"
+        )
+        with pytest.raises(
+            ValueError,
+            match=msg,
+        ):
+            with tm.assert_produces_warning(
+                UserWarning, match="Could not infer format"
+            ):
+                to_datetime(
+                    ts_strings,
+                    errors="raise",
+                )
+
+    def test_iso_8601_strings_with_same_offset(self):
+        # GH 17697, 11736
+        ts_str = "2015-11-18 15:30:00+05:30"
+        result = to_datetime(ts_str)
+        expected = Timestamp(ts_str)
+        assert result == expected
+
+        expected = DatetimeIndex([Timestamp(ts_str)] * 2)
+        result = to_datetime([ts_str] * 2)
+        tm.assert_index_equal(result, expected)
+
+        result = DatetimeIndex([ts_str] * 2)
+        tm.assert_index_equal(result, expected)
+
+    def test_iso_8601_strings_with_different_offsets_removed(self):
+        # GH#17697, GH#11736, GH#50887, GH#57275
+        ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT]
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(ts_strings)
+
+    def test_iso_8601_strings_with_different_offsets_utc(self):
+        ts_strings = ["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30", NaT]
+        result = to_datetime(ts_strings, utc=True)
+        expected = DatetimeIndex(
+            [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC"
+        )
+        tm.assert_index_equal(result, expected)
+
+    def test_mixed_offsets_with_native_datetime_utc_false_raises(self):
+        # GH#25978, GH#57275
+
+        vals = [
+            "nan",
+            Timestamp("1990-01-01"),
+            "2015-03-14T16:15:14.123-08:00",
+            "2019-03-04T21:56:32.620-07:00",
+            None,
+            "today",
+            "now",
+        ]
+        ser = Series(vals)
+        assert all(ser[i] is vals[i] for i in range(len(vals)))  # GH#40111
+
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(ser)
+
+    def test_non_iso_strings_with_tz_offset(self):
+        result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2)
+        expected = DatetimeIndex(
+            [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "ts, expected",
+        [
+            (Timestamp("2018-01-01"), Timestamp("2018-01-01", tz="UTC")),
+            (
+                Timestamp("2018-01-01", tz="US/Pacific"),
+                Timestamp("2018-01-01 08:00", tz="UTC"),
+            ),
+        ],
+    )
+    def test_timestamp_utc_true(self, ts, expected):
+        # GH 24415
+        result = to_datetime(ts, utc=True)
+        assert result == expected
+
+    @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"])
+    def test_to_datetime_with_format_out_of_bounds(self, dt_str):
+        # GH 9107
+        res = to_datetime(dt_str, format="%Y%m%d")
+        dtobj = datetime.strptime(dt_str, "%Y%m%d")
+        expected = Timestamp(dtobj)
+        assert res == expected
+        assert res.unit == expected.unit
+
+    def test_to_datetime_utc(self):
+        arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object)
+
+        result = to_datetime(arr, utc=True)
+        assert result.tz is timezone.utc
+
+    def test_to_datetime_fixed_offset(self):
+        from pandas.tests.indexes.datetimes.test_timezones import FixedOffset
+
+        fixed_off = FixedOffset(-420, "-07:00")
+
+        dates = [
+            datetime(2000, 1, 1, tzinfo=fixed_off),
+            datetime(2000, 1, 2, tzinfo=fixed_off),
+            datetime(2000, 1, 3, tzinfo=fixed_off),
+        ]
+        result = to_datetime(dates)
+        assert result.tz == fixed_off
+
+    @pytest.mark.parametrize(
+        "date",
+        [
+            ["2020-10-26 00:00:00+06:00", "2020-10-26 00:00:00+01:00"],
+            ["2020-10-26 00:00:00+06:00", Timestamp("2018-01-01", tz="US/Pacific")],
+            [
+                "2020-10-26 00:00:00+06:00",
+                datetime(2020, 1, 1, 18).astimezone(
+                    zoneinfo.ZoneInfo("Australia/Melbourne")
+                ),
+            ],
+        ],
+    )
+    def test_to_datetime_mixed_offsets_with_utc_false_removed(self, date):
+        # GH#50887, GH#57275
+        msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(date, utc=False)
+
+
+class TestToDatetimeUnit:
+    @pytest.mark.parametrize("unit", ["Y", "M"])
+    @pytest.mark.parametrize("item", [150, float(150)])
+    def test_to_datetime_month_or_year_unit_int(self, cache, unit, item, request):
+        # GH#50870 Note we have separate tests that pd.Timestamp gets these right
+        ts = Timestamp(item, unit=unit)
+        dtype = "M8[s]"
+        expected = DatetimeIndex([ts], dtype=dtype)
+
+        result = to_datetime([item], unit=unit, cache=cache)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(np.array([item], dtype=object), unit=unit, cache=cache)
+        tm.assert_index_equal(result, expected)
+
+        result = to_datetime(np.array([item]), unit=unit, cache=cache)
+        tm.assert_index_equal(result, expected)
+
+        # with a nan!
+        result = to_datetime(np.array([item, np.nan]), unit=unit, cache=cache)
+        assert result.isna()[1]
+        tm.assert_index_equal(result[:1], expected.astype("M8[s]"))
+
+    @pytest.mark.parametrize("unit", ["Y", "M"])
+    def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit):
+        # GH#50301
+        # Match Timestamp behavior in disallowing non-round floats with
+        #  Y or M unit
+        msg = f"Conversion of non-round float with unit={unit} is ambiguous"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime([1.5], unit=unit, errors="raise")
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(np.array([1.5]), unit=unit, errors="raise")
+
+        msg = r"Given date string \"1.5\" not likely a datetime"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(["1.5"], unit=unit, errors="raise")
+
+        res = to_datetime([1.5], unit=unit, errors="coerce")
+        expected = Index([NaT], dtype="M8[ns]")
+        tm.assert_index_equal(res, expected)
+
+        # In 3.0, the string "1.5" is parsed as as it would be without unit,
+        #  which fails. With errors="coerce" this becomes NaT.
+        res = to_datetime(["1.5"], unit=unit, errors="coerce")
+        expected = to_datetime([NaT])
+        tm.assert_index_equal(res, expected)
+
+        # round floats are OK; treated like integers to give
+        #  closest-to-supported unit
+        res = to_datetime([1.0], unit=unit)
+        expected = to_datetime([1], unit=unit).as_unit("s")
+        tm.assert_index_equal(res, expected)
+
+    def test_unit(self, cache):
+        # GH 11758
+        # test proper behavior with errors
+        msg = "cannot specify both format and unit"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime([1], unit="D", format="%Y%m%d", cache=cache)
+
+    def test_unit_array_mixed_nans(self, cache):
+        values = [11111111111111111, 1, 1.0, iNaT, NaT, np.nan, "NaT", ""]
+
+        result = to_datetime(values, unit="D", errors="coerce", cache=cache)
+        expected = DatetimeIndex(
+            ["NaT", "1970-01-02", "1970-01-02", "NaT", "NaT", "NaT", "NaT", "NaT"],
+            dtype="M8[s]",
+        )
+        tm.assert_index_equal(result, expected)
+
+        msg = "cannot convert input 11111111111111111 with the unit 'D'"
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(values, unit="D", errors="raise", cache=cache)
+
+    def test_unit_array_mixed_nans_large_int(self, cache):
+        values = [1420043460000000000000000, iNaT, NaT, np.nan, "NaT"]
+
+        result = to_datetime(values, errors="coerce", unit="s", cache=cache)
+        expected = DatetimeIndex(["NaT", "NaT", "NaT", "NaT", "NaT"], dtype="M8[s]")
+        tm.assert_index_equal(result, expected)
+
+        msg = "cannot convert input 1420043460000000000000000 with the unit 's'"
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(values, errors="raise", unit="s", cache=cache)
+
+    def test_to_datetime_invalid_str_not_out_of_bounds_valuerror(self, cache):
+        # if we have a string, then we raise a ValueError
+        # and NOT an OutOfBoundsDatetime
+        msg = "Unknown datetime string format, unable to parse: foo"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime("foo", errors="raise", unit="s", cache=cache)
+
+    @pytest.mark.parametrize("error", ["raise", "coerce"])
+    def test_unit_consistency(self, cache, error):
+        # consistency of conversions
+        expected = Timestamp("1970-05-09 14:25:11")
+        result = to_datetime(11111111, unit="s", errors=error, cache=cache)
+        assert result == expected
+        assert isinstance(result, Timestamp)
+
+    @pytest.mark.parametrize("errors", ["raise", "coerce"])
+    @pytest.mark.parametrize("dtype", ["float64", "int64"])
+    def test_unit_with_numeric(self, cache, errors, dtype):
+        # GH 13180
+        # coercions from floats/ints are ok
+        expected = DatetimeIndex(
+            ["2015-06-19 05:33:20", "2015-05-27 22:33:20"], dtype="M8[ns]"
+        )
+        arr = np.array([1.434692e18, 1.432766e18]).astype(dtype)
+        result = to_datetime(arr, errors=errors, cache=cache)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "exp, arr, warning",
+        [
+            [
+                ["NaT", "2015-06-19 05:33:20", "2015-05-27 22:33:20"],
+                ["foo", 1.434692e18, 1.432766e18],
+                UserWarning,
+            ],
+            [
+                ["2015-06-19 05:33:20", "2015-05-27 22:33:20", "NaT", "NaT"],
+                [1.434692e18, 1.432766e18, "foo", "NaT"],
+                None,
+            ],
+        ],
+    )
+    def test_unit_with_numeric_coerce(self, cache, exp, arr, warning):
+        # but we want to make sure that we are coercing
+        # if we have ints/strings
+        expected = DatetimeIndex(exp, dtype="M8[ns]")
+        with tm.assert_produces_warning(warning, match="Could not infer format"):
+            result = to_datetime(arr, errors="coerce", cache=cache)
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "arr",
+        [
+            [Timestamp("20130101"), 1.434692e18, 1.432766e18],
+            [1.434692e18, 1.432766e18, Timestamp("20130101")],
+        ],
+    )
+    def test_unit_mixed(self, cache, arr):
+        # GH#50453 pre-2.0 with mixed numeric/datetimes and errors="coerce"
+        #  the numeric entries would be coerced to NaT, was never clear exactly
+        #  why.
+        # mixed integers/datetimes
+        expected = Index([Timestamp(x) for x in arr], dtype="M8[ns]")
+        result = to_datetime(arr, errors="coerce", cache=cache)
+        tm.assert_index_equal(result, expected)
+
+        # GH#49037 pre-2.0 this raised, but it always worked with Series,
+        #  was never clear why it was disallowed
+        result = to_datetime(arr, errors="raise", cache=cache)
+        tm.assert_index_equal(result, expected)
+
+        result = DatetimeIndex(arr)
+        tm.assert_index_equal(result, expected)
+
+    def test_unit_rounding(self, cache):
+        # GH 14156 & GH 20445: argument will incur floating point errors
+        # but no premature rounding
+        value = 1434743731.8770001
+        result = to_datetime(value, unit="s", cache=cache)
+        expected = Timestamp("2015-06-19 19:55:31.877000093")
+        assert result == expected
+
+        alt = Timestamp(value, unit="s")
+        assert alt == result
+
+    @pytest.mark.parametrize("dtype", [int, float])
+    def test_to_datetime_unit(self, dtype):
+        epoch = 1370745748
+        ser = Series([epoch + t for t in range(20)]).astype(dtype)
+        result = to_datetime(ser, unit="s")
+        unit = "s"
+        expected = Series(
+            [
+                Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t)
+                for t in range(20)
+            ],
+            dtype=f"M8[{unit}]",
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("null", [iNaT, np.nan])
+    def test_to_datetime_unit_with_nulls(self, null):
+        epoch = 1370745748
+        ser = Series([epoch + t for t in range(20)] + [null])
+        result = to_datetime(ser, unit="s")
+        # With np.nan, the list gets cast to a float64 array, which always
+        #  gets ns unit.
+        unit = "s"
+        expected = Series(
+            [Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t) for t in range(20)]
+            + [NaT],
+            dtype=f"M8[{unit}]",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_unit_fractional_seconds(self):
+        # GH13834
+        epoch = 1370745748
+        ser = Series([epoch + t for t in np.arange(0, 2, 0.25)] + [iNaT]).astype(float)
+        result = to_datetime(ser, unit="s")
+        expected = Series(
+            [
+                Timestamp("2013-06-09 02:42:28") + timedelta(seconds=t)
+                for t in np.arange(0, 2, 0.25)
+            ]
+            + [NaT],
+            dtype="M8[ns]",
+        )
+        # GH20455 argument will incur floating point errors but no premature rounding
+        result = result.dt.round("ms")
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_unit_na_values(self):
+        result = to_datetime([1, 2, "NaT", NaT, np.nan], unit="D")
+        expected = DatetimeIndex(
+            [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 3,
+            dtype="M8[s]",
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize("bad_val", ["foo", 111111111111111])
+    def test_to_datetime_unit_invalid(self, bad_val):
+        if bad_val == "foo":
+            msg = f"Unknown datetime string format, unable to parse: {bad_val}"
+        else:
+            msg = "cannot convert input 111111111111111 with the unit 'D'"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime([1, 2, bad_val], unit="D")
+
+    @pytest.mark.parametrize("bad_val", ["foo", 111111111111111])
+    def test_to_timestamp_unit_coerce(self, bad_val):
+        # coerce we can process
+        expected = DatetimeIndex(
+            [Timestamp("1970-01-02"), Timestamp("1970-01-03")] + ["NaT"] * 1,
+            dtype="M8[s]",
+        )
+        result = to_datetime([1, 2, bad_val], unit="D", errors="coerce")
+        tm.assert_index_equal(result, expected)
+
+    def test_float_to_datetime_raise_near_bounds(self):
+        # GH50183
+        msg = "cannot convert input with unit 'D'"
+        oneday_in_ns = 1e9 * 60 * 60 * 24
+        tsmax_in_days = 2**63 / oneday_in_ns  # 2**63 ns, in days
+        # just in bounds
+        should_succeed = Series(
+            [0, tsmax_in_days - 0.005, -tsmax_in_days + 0.005], dtype=float
+        )
+        expected = (should_succeed * oneday_in_ns).astype(np.int64)
+        for error_mode in ["raise", "coerce"]:
+            result1 = to_datetime(should_succeed, unit="D", errors=error_mode)
+            # Cast to `np.float64` so that `rtol` and inexact checking kick in
+            # (`check_exact` doesn't take place for integer dtypes)
+            tm.assert_almost_equal(
+                result1.astype(np.int64).astype(np.float64),
+                expected.astype(np.float64),
+                rtol=1e-10,
+            )
+        # just out of bounds
+        should_fail1 = Series([0, tsmax_in_days + 0.005], dtype=float)
+        should_fail2 = Series([0, -tsmax_in_days - 0.005], dtype=float)
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(should_fail1, unit="D", errors="raise")
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(should_fail2, unit="D", errors="raise")
+
+
+class TestToDatetimeDataFrame:
+    @pytest.fixture
+    def df(self):
+        return DataFrame(
+            {
+                "year": [2015, 2016],
+                "month": [2, 3],
+                "day": [4, 5],
+                "hour": [6, 7],
+                "minute": [58, 59],
+                "second": [10, 11],
+                "ms": [1, 1],
+                "us": [2, 2],
+                "ns": [3, 3],
+            }
+        )
+
+    def test_dataframe(self, df, cache):
+        result = to_datetime(
+            {"year": df["year"], "month": df["month"], "day": df["day"]}, cache=cache
+        )
+        expected = Series(
+            [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:0:00")]
+        )
+        tm.assert_series_equal(result, expected)
+
+        # dict-like
+        result = to_datetime(df[["year", "month", "day"]].to_dict(), cache=cache)
+        expected.index = Index([0, 1])
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_dict_with_constructable(self, df, cache):
+        # dict but with constructable
+        df2 = df[["year", "month", "day"]].to_dict()
+        df2["month"] = 2
+        result = to_datetime(df2, cache=cache)
+        expected2 = Series(
+            [Timestamp("20150204 00:00:00"), Timestamp("20160205 00:0:00")],
+            index=Index([0, 1]),
+        )
+        tm.assert_series_equal(result, expected2)
+
+    @pytest.mark.parametrize(
+        "unit",
+        [
+            {
+                "year": "years",
+                "month": "months",
+                "day": "days",
+                "hour": "hours",
+                "minute": "minutes",
+                "second": "seconds",
+            },
+            {
+                "year": "year",
+                "month": "month",
+                "day": "day",
+                "hour": "hour",
+                "minute": "minute",
+                "second": "second",
+            },
+        ],
+    )
+    def test_dataframe_field_aliases_column_subset(self, df, cache, unit):
+        # unit mappings
+        result = to_datetime(df[list(unit.keys())].rename(columns=unit), cache=cache)
+        expected = Series(
+            [Timestamp("20150204 06:58:10"), Timestamp("20160305 07:59:11")],
+            dtype="M8[us]",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_field_aliases(self, df, cache):
+        d = {
+            "year": "year",
+            "month": "month",
+            "day": "day",
+            "hour": "hour",
+            "minute": "minute",
+            "second": "second",
+            "ms": "ms",
+            "us": "us",
+            "ns": "ns",
+        }
+
+        result = to_datetime(df.rename(columns=d), cache=cache)
+        expected = Series(
+            [
+                Timestamp("20150204 06:58:10.001002003"),
+                Timestamp("20160305 07:59:11.001002003"),
+            ]
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_str_dtype(self, df, cache):
+        # coerce back to int
+        result = to_datetime(df.astype(str), cache=cache)
+        expected = Series(
+            [
+                Timestamp("20150204 06:58:10.001002003"),
+                Timestamp("20160305 07:59:11.001002003"),
+            ]
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_float32_dtype(self, df, cache):
+        # GH#60506
+        # coerce to float64
+        result = to_datetime(df.astype(np.float32), cache=cache)
+        expected = Series(
+            [
+                Timestamp("20150204 06:58:10.001002003"),
+                Timestamp("20160305 07:59:11.001002003"),
+            ]
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_coerce(self, cache):
+        # passing coerce
+        df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})
+
+        msg = (
+            r'^cannot assemble the datetimes: time data ".+" doesn\'t '
+            r'match format "%Y%m%d"\.'
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df2, cache=cache)
+
+        result = to_datetime(df2, errors="coerce", cache=cache)
+        expected = Series([Timestamp("20150204 00:00:00"), NaT])
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_extra_keys_raises(self, df, cache):
+        # extra columns
+        msg = r"extra keys have been passed to the datetime assemblage: \[foo\]"
+        df2 = df.copy()
+        df2["foo"] = 1
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df2, cache=cache)
+
+    @pytest.mark.parametrize(
+        "cols",
+        [
+            ["year"],
+            ["year", "month"],
+            ["year", "month", "second"],
+            ["month", "day"],
+            ["year", "day", "second"],
+        ],
+    )
+    def test_dataframe_missing_keys_raises(self, df, cache, cols):
+        # not enough
+        msg = (
+            r"to assemble mappings requires at least that \[year, month, "
+            r"day\] be specified: \[.+\] is missing"
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df[cols], cache=cache)
+
+    def test_dataframe_duplicate_columns_raises(self, cache):
+        # duplicates
+        msg = "cannot assemble with duplicate keys"
+        df2 = DataFrame({"year": [2015, 2016], "month": [2, 20], "day": [4, 5]})
+        df2.columns = ["year", "year", "day"]
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df2, cache=cache)
+
+        df2 = DataFrame(
+            {"year": [2015, 2016], "month": [2, 20], "day": [4, 5], "hour": [4, 5]}
+        )
+        df2.columns = ["year", "month", "day", "day"]
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df2, cache=cache)
+
+    def test_dataframe_int16(self, cache):
+        # GH#13451
+        df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
+
+        # int16
+        result = to_datetime(df.astype("int16"), cache=cache)
+        expected = Series(
+            [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")]
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_mixed(self, cache):
+        # mixed dtypes
+        df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
+        df["month"] = df["month"].astype("int8")
+        df["day"] = df["day"].astype("int8")
+        result = to_datetime(df, cache=cache)
+        expected = Series(
+            [Timestamp("20150204 00:00:00"), Timestamp("20160305 00:00:00")]
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_dataframe_float(self, cache):
+        # float
+        df = DataFrame({"year": [2000, 2001], "month": [1.5, 1], "day": [1, 1]})
+        msg = (
+            r"^cannot assemble the datetimes: unconverted data remains when parsing "
+            r'with format ".*": "1".'
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(df, cache=cache)
+
+    def test_dataframe_utc_true(self):
+        # GH#23760
+        df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
+        result = to_datetime(df, utc=True)
+        expected = Series(
+            np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[us]")
+        ).dt.tz_localize("UTC")
+        tm.assert_series_equal(result, expected)
+
+
+class TestToDatetimeMisc:
+    def test_to_datetime_barely_out_of_bounds(self):
+        # GH#19529
+        # GH#19382 close enough to bounds that dropping nanos would result
+        # in an in-bounds datetime
+        arr = np.array(["2262-04-11 23:47:16.854775808"], dtype=object)
+
+        msg = "^Out of bounds nanosecond timestamp: .*"
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            to_datetime(arr)
+
+    @pytest.mark.parametrize(
+        "arg, exp_str",
+        [
+            ["2012-01-01 00:00:00", "2012-01-01 00:00:00"],
+            ["20121001", "2012-10-01"],  # bad iso 8601
+        ],
+    )
+    def test_to_datetime_iso8601(self, cache, arg, exp_str):
+        result = to_datetime([arg], cache=cache)
+        exp = Timestamp(exp_str)
+        assert result[0] == exp
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2012", "%Y-%m"),
+            ("2012-01", "%Y-%m-%d"),
+            ("2012-01-01", "%Y-%m-%d %H"),
+            ("2012-01-01 10", "%Y-%m-%d %H:%M"),
+            ("2012-01-01 10:00", "%Y-%m-%d %H:%M:%S"),
+            ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M:%S.%f"),
+            ("2012-01-01 10:00:00.123", "%Y-%m-%d %H:%M:%S.%f%z"),
+            (0, "%Y-%m-%d"),
+        ],
+    )
+    @pytest.mark.parametrize("exact", [True, False])
+    def test_to_datetime_iso8601_fails(self, input, format, exact):
+        # https://github.com/pandas-dev/pandas/issues/12649
+        # `format` is longer than the string, so this fails regardless of `exact`
+        with pytest.raises(
+            ValueError,
+            match=(rf"time data \"{input}\" doesn't match format " rf"\"{format}\""),
+        ):
+            to_datetime(input, format=format, exact=exact)
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2012-01-01", "%Y-%m"),
+            ("2012-01-01 10", "%Y-%m-%d"),
+            ("2012-01-01 10:00", "%Y-%m-%d %H"),
+            ("2012-01-01 10:00:00", "%Y-%m-%d %H:%M"),
+            (0, "%Y-%m-%d"),
+        ],
+    )
+    def test_to_datetime_iso8601_exact_fails(self, input, format):
+        # https://github.com/pandas-dev/pandas/issues/12649
+        # `format` is shorter than the date string, so only fails with `exact=True`
+        msg = "|".join(
+            [
+                '^unconverted data remains when parsing with format ".*": ".*". '
+                f"{PARSING_ERR_MSG}$",
+                f'^time data ".*" doesn\'t match format ".*". {PARSING_ERR_MSG}$',
+            ]
+        )
+        with pytest.raises(
+            ValueError,
+            match=(msg),
+        ):
+            to_datetime(input, format=format)
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2012-01-01", "%Y-%m"),
+            ("2012-01-01 00", "%Y-%m-%d"),
+            ("2012-01-01 00:00", "%Y-%m-%d %H"),
+            ("2012-01-01 00:00:00", "%Y-%m-%d %H:%M"),
+        ],
+    )
+    def test_to_datetime_iso8601_non_exact(self, input, format):
+        # https://github.com/pandas-dev/pandas/issues/12649
+        expected = Timestamp(2012, 1, 1)
+        result = to_datetime(input, format=format, exact=False)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2020-01", "%Y/%m"),
+            ("2020-01-01", "%Y/%m/%d"),
+            ("2020-01-01 00", "%Y/%m/%dT%H"),
+            ("2020-01-01T00", "%Y/%m/%d %H"),
+            ("2020-01-01 00:00", "%Y/%m/%dT%H:%M"),
+            ("2020-01-01T00:00", "%Y/%m/%d %H:%M"),
+            ("2020-01-01 00:00:00", "%Y/%m/%dT%H:%M:%S"),
+            ("2020-01-01T00:00:00", "%Y/%m/%d %H:%M:%S"),
+        ],
+    )
+    def test_to_datetime_iso8601_separator(self, input, format):
+        # https://github.com/pandas-dev/pandas/issues/12649
+        with pytest.raises(
+            ValueError,
+            match=(rf"time data \"{input}\" doesn\'t match format " rf"\"{format}\""),
+        ):
+            to_datetime(input, format=format)
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2020-01", "%Y-%m"),
+            ("2020-01-01", "%Y-%m-%d"),
+            ("2020-01-01 00", "%Y-%m-%d %H"),
+            ("2020-01-01T00", "%Y-%m-%dT%H"),
+            ("2020-01-01 00:00", "%Y-%m-%d %H:%M"),
+            ("2020-01-01T00:00", "%Y-%m-%dT%H:%M"),
+            ("2020-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"),
+            ("2020-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S"),
+            ("2020-01-01T00:00:00.000", "%Y-%m-%dT%H:%M:%S.%f"),
+            ("2020-01-01T00:00:00.000000", "%Y-%m-%dT%H:%M:%S.%f"),
+            ("2020-01-01T00:00:00.000000000", "%Y-%m-%dT%H:%M:%S.%f"),
+        ],
+    )
+    def test_to_datetime_iso8601_valid(self, input, format):
+        # https://github.com/pandas-dev/pandas/issues/12649
+        expected = Timestamp(2020, 1, 1)
+        result = to_datetime(input, format=format)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2020-1", "%Y-%m"),
+            ("2020-1-1", "%Y-%m-%d"),
+            ("2020-1-1 0", "%Y-%m-%d %H"),
+            ("2020-1-1T0", "%Y-%m-%dT%H"),
+            ("2020-1-1 0:0", "%Y-%m-%d %H:%M"),
+            ("2020-1-1T0:0", "%Y-%m-%dT%H:%M"),
+            ("2020-1-1 0:0:0", "%Y-%m-%d %H:%M:%S"),
+            ("2020-1-1T0:0:0", "%Y-%m-%dT%H:%M:%S"),
+            ("2020-1-1T0:0:0.000", "%Y-%m-%dT%H:%M:%S.%f"),
+            ("2020-1-1T0:0:0.000000", "%Y-%m-%dT%H:%M:%S.%f"),
+            ("2020-1-1T0:0:0.000000000", "%Y-%m-%dT%H:%M:%S.%f"),
+        ],
+    )
+    def test_to_datetime_iso8601_non_padded(self, input, format):
+        # https://github.com/pandas-dev/pandas/issues/21422
+        expected = Timestamp(2020, 1, 1)
+        result = to_datetime(input, format=format)
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "input, format",
+        [
+            ("2020-01-01T00:00:00.000000000+00:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
+            ("2020-01-01T00:00:00+00:00", "%Y-%m-%dT%H:%M:%S%z"),
+            ("2020-01-01T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
+        ],
+    )
+    def test_to_datetime_iso8601_with_timezone_valid(self, input, format):
+        # https://github.com/pandas-dev/pandas/issues/12649
+        expected = Timestamp(2020, 1, 1, tzinfo=timezone.utc)
+        result = to_datetime(input, format=format)
+        assert result == expected
+
+    def test_to_datetime_default(self, cache):
+        rs = to_datetime("2001", cache=cache)
+        xp = datetime(2001, 1, 1)
+        assert rs == xp
+
+    @pytest.mark.xfail(reason="fails to enforce dayfirst=True, which would raise")
+    def test_to_datetime_respects_dayfirst(self, cache):
+        # dayfirst is essentially broken
+
+        # The msg here is not important since it isn't actually raised yet.
+        msg = "Invalid date specified"
+        with pytest.raises(ValueError, match=msg):
+            # if dayfirst is respected, then this would parse as month=13, which
+            #  would raise
+            with tm.assert_produces_warning(UserWarning, match="Provide format"):
+                to_datetime("01-13-2012", dayfirst=True, cache=cache)
+
+    def test_to_datetime_on_datetime64_series(self, cache):
+        # #2699
+        ser = Series(date_range("1/1/2000", periods=10))
+
+        result = to_datetime(ser, cache=cache)
+        assert result[0] == ser[0]
+
+    def test_to_datetime_with_space_in_series(self, cache):
+        # GH 6428
+        ser = Series(["10/18/2006", "10/18/2008", " "])
+        msg = (
+            r'^time data " " doesn\'t match format "%m/%d/%Y". ' rf"{PARSING_ERR_MSG}$"
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(ser, errors="raise", cache=cache)
+        result_coerce = to_datetime(ser, errors="coerce", cache=cache)
+        expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT])
+        tm.assert_series_equal(result_coerce, expected_coerce)
+
+    @td.skip_if_not_us_locale
+    def test_to_datetime_with_apply(self, cache):
+        # this is only locale tested with US/None locales
+        # GH 5195
+        # with a format and coerce a single item to_datetime fails
+        td = Series(["May 04", "Jun 02", "Dec 11"], index=[1, 2, 3])
+        expected = to_datetime(td, format="%b %y", cache=cache)
+        result = td.apply(to_datetime, format="%b %y", cache=cache)
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_timezone_name(self):
+        # https://github.com/pandas-dev/pandas/issues/49748
+        result = to_datetime("2020-01-01 00:00:00UTC", format="%Y-%m-%d %H:%M:%S%Z")
+        expected = Timestamp(2020, 1, 1).tz_localize("UTC")
+        assert result == expected
+
+    @td.skip_if_not_us_locale
+    @pytest.mark.parametrize("errors", ["raise", "coerce"])
+    def test_to_datetime_with_apply_with_empty_str(self, cache, errors):
+        # this is only locale tested with US/None locales
+        # GH 5195, GH50251
+        # with a format and coerce a single item to_datetime fails
+        td = Series(["May 04", "Jun 02", ""], index=[1, 2, 3])
+        expected = to_datetime(td, format="%b %y", errors=errors, cache=cache)
+
+        result = td.apply(
+            lambda x: to_datetime(x, format="%b %y", errors="coerce", cache=cache)
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_empty_stt(self, cache):
+        # empty string
+        result = to_datetime("", cache=cache)
+        assert result is NaT
+
+    def test_to_datetime_empty_str_list(self, cache):
+        result = to_datetime(["", ""], cache=cache)
+        assert isna(result).all()
+
+    def test_to_datetime_zero(self, cache):
+        # ints
+        result = Timestamp(0)
+        expected = to_datetime(0, cache=cache)
+        assert result == expected
+
+    def test_to_datetime_strings(self, cache):
+        # GH 3888 (strings)
+        expected = to_datetime(["2012"], cache=cache)[0]
+        result = to_datetime("2012", cache=cache)
+        assert result == expected
+
+    def test_to_datetime_strings_variation(self, cache):
+        array = ["2012", "20120101", "20120101 12:01:01"]
+        expected = [to_datetime(dt_str, cache=cache) for dt_str in array]
+        result = [Timestamp(date_str) for date_str in array]
+        tm.assert_almost_equal(result, expected)
+
+    @pytest.mark.parametrize("result", [Timestamp("2012"), to_datetime("2012")])
+    def test_to_datetime_strings_vs_constructor(self, result):
+        expected = Timestamp(2012, 1, 1)
+        assert result == expected
+
+    def test_to_datetime_unprocessable_input(self, cache):
+        # GH 4928
+        # GH 21864
+        msg = '^Given date string "1" not likely a datetime$'
+        with pytest.raises(ValueError, match=msg):
+            to_datetime([1, "1"], errors="raise", cache=cache)
+
+    def test_to_datetime_other_datetime64_units(self):
+        # 5/25/2012
+        scalar = np.int64(1337904000000000).view("M8[us]")
+        as_obj = scalar.astype("O")
+
+        index = DatetimeIndex([scalar])
+        assert index[0] == scalar.astype("O")
+
+        value = Timestamp(scalar)
+        assert value == as_obj
+
+    def test_to_datetime_list_of_integers(self):
+        rng = date_range("1/1/2000", periods=20, unit="ns")
+        rng = DatetimeIndex(rng.values)
+
+        ints = list(rng.asi8)
+
+        result = DatetimeIndex(ints)
+
+        tm.assert_index_equal(rng, result)
+
+    def test_to_datetime_overflow(self):
+        # gh-17637
+        # we are overflowing Timedelta range here
+        msg = "Cannot cast 139999 days 00:00:00 to unit='ns' without overflow"
+        with pytest.raises(OutOfBoundsTimedelta, match=msg):
+            date_range(start="1/1/1700", freq="B", periods=100000, unit="ns")
+
+    def test_to_datetime_float_with_nans_floating_point_error(self):
+        # GH#58419
+        ser = Series([np.nan] * 1000 + [1712219033.0], dtype=np.float64)
+        result = to_datetime(ser, unit="s", errors="coerce")
+        expected = Series(
+            [NaT] * 1000 + [Timestamp("2024-04-04 08:23:53")], dtype="datetime64[s]"
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_string_invalid_operation(self, cache):
+        invalid = np.array(["87156549591102612381000001219H5"], dtype=object)
+        # GH #51084
+
+        with pytest.raises(ValueError, match="Unknown datetime string format"):
+            to_datetime(invalid, errors="raise", cache=cache)
+
+    def test_string_na_nat_conversion(self, cache):
+        # GH #999, #858
+
+        strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object)
+
+        expected = np.empty(4, dtype="M8[us]")
+        for i, val in enumerate(strings):
+            if isna(val):
+                expected[i] = iNaT
+            else:
+                expected[i] = parse(val)
+
+        result = tslib.array_to_datetime(strings)[0]
+        tm.assert_almost_equal(result, expected)
+
+        result2 = to_datetime(strings, cache=cache)
+        assert isinstance(result2, DatetimeIndex)
+        tm.assert_numpy_array_equal(result, result2.values)
+
+    def test_string_na_nat_conversion_malformed(self, cache):
+        malformed = np.array(["1/100/2000", np.nan], dtype=object)
+
+        # GH 10636, default is now 'raise'
+        msg = r"Unknown datetime string format"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(malformed, errors="raise", cache=cache)
+
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(malformed, errors="raise", cache=cache)
+
+    def test_string_na_nat_conversion_with_name(self, cache):
+        idx = ["a", "b", "c", "d", "e"]
+        series = Series(
+            ["1/1/2000", np.nan, "1/3/2000", np.nan, "1/5/2000"], index=idx, name="foo"
+        )
+        dseries = Series(
+            [
+                to_datetime("1/1/2000", cache=cache),
+                np.nan,
+                to_datetime("1/3/2000", cache=cache),
+                np.nan,
+                to_datetime("1/5/2000", cache=cache),
+            ],
+            index=idx,
+            name="foo",
+        )
+
+        result = to_datetime(series, cache=cache)
+        dresult = to_datetime(dseries, cache=cache)
+
+        expected = Series(np.empty(5, dtype="M8[us]"), index=idx)
+        for i in range(5):
+            x = series.iloc[i]
+            if isna(x):
+                expected.iloc[i] = NaT
+            else:
+                expected.iloc[i] = to_datetime(x, cache=cache)
+
+        tm.assert_series_equal(result, expected, check_names=False)
+        assert result.name == "foo"
+
+        tm.assert_series_equal(dresult, expected, check_names=False)
+        assert dresult.name == "foo"
+
+    @pytest.mark.parametrize(
+        "unit",
+        ["h", "m", "s", "ms", "us", "ns"],
+    )
+    def test_dti_constructor_numpy_timeunits(self, cache, unit):
+        # GH 9114
+        dtype = np.dtype(f"M8[{unit}]")
+        base = to_datetime(["2000-01-01T00:00", "2000-01-02T00:00", "NaT"], cache=cache)
+
+        values = base.values.astype(dtype)
+
+        if unit in ["h", "m"]:
+            # we cast to closest supported unit
+            unit = "s"
+        exp_dtype = np.dtype(f"M8[{unit}]")
+        expected = DatetimeIndex(base.astype(exp_dtype))
+        assert expected.dtype == exp_dtype
+
+        tm.assert_index_equal(DatetimeIndex(values), expected)
+        tm.assert_index_equal(to_datetime(values, cache=cache), expected)
+
+    def test_dayfirst(self, cache):
+        # GH 5917
+        arr = ["10/02/2014", "11/02/2014", "12/02/2014"]
+        expected = DatetimeIndex(
+            [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)]
+        )
+        idx1 = DatetimeIndex(arr, dayfirst=True)
+        idx2 = DatetimeIndex(np.array(arr), dayfirst=True)
+        idx3 = to_datetime(arr, dayfirst=True, cache=cache)
+        idx4 = to_datetime(np.array(arr), dayfirst=True, cache=cache)
+        idx5 = DatetimeIndex(Index(arr), dayfirst=True)
+        idx6 = DatetimeIndex(Series(arr), dayfirst=True)
+        tm.assert_index_equal(expected, idx1)
+        tm.assert_index_equal(expected, idx2)
+        tm.assert_index_equal(expected, idx3)
+        tm.assert_index_equal(expected, idx4)
+        tm.assert_index_equal(expected, idx5)
+        tm.assert_index_equal(expected, idx6)
+
+    def test_dayfirst_warnings_valid_input(self):
+        # GH 12585
+        warning_msg = (
+            "Parsing dates in .* format when dayfirst=.* was specified. "
+            "Pass `dayfirst=.*` or specify a format to silence this warning."
+        )
+
+        # CASE 1: valid input
+        arr = ["31/12/2014", "10/03/2011"]
+        expected = DatetimeIndex(
+            ["2014-12-31", "2011-03-10"], dtype="datetime64[us]", freq=None
+        )
+
+        # A. dayfirst arg correct, no warning
+        res1 = to_datetime(arr, dayfirst=True)
+        tm.assert_index_equal(expected, res1)
+
+        # B. dayfirst arg incorrect, warning
+        with tm.assert_produces_warning(UserWarning, match=warning_msg):
+            res2 = to_datetime(arr, dayfirst=False)
+        tm.assert_index_equal(expected, res2)
+
+    def test_dayfirst_warnings_invalid_input(self):
+        # CASE 2: invalid input
+        # cannot consistently process with single format
+        # ValueError *always* raised
+
+        # first in DD/MM/YYYY, second in MM/DD/YYYY
+        arr = ["31/12/2014", "03/30/2011"]
+
+        with pytest.raises(
+            ValueError,
+            match=(
+                r'^time data "03/30/2011" doesn\'t match format '
+                rf'"%d/%m/%Y". {PARSING_ERR_MSG}$'
+            ),
+        ):
+            to_datetime(arr, dayfirst=True)
+
+    @pytest.mark.parametrize("klass", [DatetimeIndex, DatetimeArray._from_sequence])
+    def test_to_datetime_dta_tz(self, klass):
+        # GH#27733
+        dti = date_range("2015-04-05", periods=3).rename("foo")
+        expected = dti.tz_localize("UTC")
+
+        obj = klass(dti)
+        expected = klass(expected)
+
+        result = to_datetime(obj, utc=True)
+        tm.assert_equal(result, expected)
+
+
+class TestGuessDatetimeFormat:
+    @pytest.mark.parametrize(
+        "test_list",
+        [
+            [
+                "2011-12-30 00:00:00.000000",
+                "2011-12-30 00:00:00.000000",
+                "2011-12-30 00:00:00.000000",
+            ],
+            [np.nan, np.nan, "2011-12-30 00:00:00.000000"],
+            ["", "2011-12-30 00:00:00.000000"],
+            ["NaT", "2011-12-30 00:00:00.000000"],
+            ["2011-12-30 00:00:00.000000", "random_string"],
+            ["now", "2011-12-30 00:00:00.000000"],
+            ["today", "2011-12-30 00:00:00.000000"],
+        ],
+    )
+    def test_guess_datetime_format_for_array(self, test_list):
+        expected_format = "%Y-%m-%d %H:%M:%S.%f"
+        test_array = np.array(test_list, dtype=object)
+        assert tools._guess_datetime_format_for_array(test_array) == expected_format
+
+    @td.skip_if_not_us_locale
+    def test_guess_datetime_format_for_array_all_nans(self):
+        format_for_string_of_nans = tools._guess_datetime_format_for_array(
+            np.array([np.nan, np.nan, np.nan], dtype="O")
+        )
+        assert format_for_string_of_nans is None
+
+
+class TestToDatetimeInferFormat:
+    @pytest.mark.parametrize(
+        "test_format", ["%m-%d-%Y", "%m/%d/%Y %H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S.%f"]
+    )
+    def test_to_datetime_infer_datetime_format_consistent_format(
+        self, cache, test_format
+    ):
+        ser = Series(date_range("20000101", periods=50, freq="h"))
+
+        s_as_dt_strings = ser.apply(lambda x: x.strftime(test_format))
+
+        with_format = to_datetime(s_as_dt_strings, format=test_format, cache=cache)
+        without_format = to_datetime(s_as_dt_strings, cache=cache)
+
+        # Whether the format is explicitly passed, or
+        # it is inferred, the results should all be the same
+        tm.assert_series_equal(with_format, without_format)
+
+    def test_to_datetime_inconsistent_format(self, cache):
+        data = ["01/01/2011 00:00:00", "01-02-2011 00:00:00", "2011-01-03T00:00:00"]
+        ser = Series(np.array(data))
+        msg = (
+            r'^time data "01-02-2011 00:00:00" doesn\'t match format '
+            rf'"%m/%d/%Y %H:%M:%S". {PARSING_ERR_MSG}$'
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(ser, cache=cache)
+
+    def test_to_datetime_consistent_format(self, cache):
+        data = ["Jan/01/2011", "Feb/01/2011", "Mar/01/2011"]
+        ser = Series(np.array(data))
+        result = to_datetime(ser, cache=cache)
+        expected = Series(
+            ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[us]"
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_series_with_nans(self, cache):
+        ser = Series(
+            np.array(
+                ["01/01/2011 00:00:00", np.nan, "01/03/2011 00:00:00", np.nan],
+                dtype=object,
+            )
+        )
+        result = to_datetime(ser, cache=cache)
+        expected = Series(
+            ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[us]"
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_to_datetime_series_start_with_nans(self, cache):
+        ser = Series(
+            np.array(
+                [
+                    np.nan,
+                    np.nan,
+                    "01/01/2011 00:00:00",
+                    "01/02/2011 00:00:00",
+                    "01/03/2011 00:00:00",
+                ],
+                dtype=object,
+            )
+        )
+
+        result = to_datetime(ser, cache=cache)
+        expected = Series(
+            [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[us]"
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "tz_name, offset",
+        [("UTC", 0), ("UTC-3", 180), ("UTC+3", -180)],
+    )
+    def test_infer_datetime_format_tz_name(self, tz_name, offset):
+        # GH 33133
+        ser = Series([f"2019-02-02 08:07:13 {tz_name}"])
+        result = to_datetime(ser)
+        tz = timezone(timedelta(minutes=offset))
+        expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "ts,zero_tz",
+        [
+            ("2019-02-02 08:07:13", "Z"),
+            ("2019-02-02 08:07:13", ""),
+            ("2019-02-02 08:07:13.012345", "Z"),
+            ("2019-02-02 08:07:13.012345", ""),
+        ],
+    )
+    def test_infer_datetime_format_zero_tz(self, ts, zero_tz):
+        # GH 41047
+        ser = Series([ts + zero_tz])
+        result = to_datetime(ser)
+        tz = timezone.utc if zero_tz == "Z" else None
+        expected = Series([Timestamp(ts, tz=tz)])
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("format", [None, "%Y-%m-%d"])
+    def test_to_datetime_iso8601_noleading_0s(self, cache, format):
+        # GH 11871
+        ser = Series(["2014-1-1", "2014-2-2", "2015-3-3"])
+        expected = Series(
+            [
+                Timestamp("2014-01-01"),
+                Timestamp("2014-02-02"),
+                Timestamp("2015-03-03"),
+            ]
+        )
+        result = to_datetime(ser, format=format, cache=cache)
+        tm.assert_series_equal(result, expected)
+
+
+class TestDaysInMonth:
+    # tests for issue #10154
+
+    @pytest.mark.parametrize(
+        "arg, format",
+        [
+            ["2015-02-29", None],
+            ["2015-02-29", "%Y-%m-%d"],
+            ["2015-02-32", "%Y-%m-%d"],
+            ["2015-04-31", "%Y-%m-%d"],
+        ],
+    )
+    def test_day_not_in_month_coerce(self, cache, arg, format):
+        assert isna(to_datetime(arg, errors="coerce", format=format, cache=cache))
+
+    def test_day_not_in_month_raise(self, cache):
+        if PY314:
+            msg = "day 29 must be in range 1..28 for month 2 in year 2015: 2015-02-29"
+        else:
+            msg = "day is out of range for month: 2015-02-29"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime("2015-02-29", errors="raise", cache=cache)
+
+    @pytest.mark.parametrize(
+        "arg, format, msg",
+        [
+            (
+                "2015-02-29",
+                "%Y-%m-%d",
+                f"^{DAY_IS_OUT_OF_RANGE}. {PARSING_ERR_MSG}$",
+            ),
+            (
+                "2015-29-02",
+                "%Y-%d-%m",
+                f"^{DAY_IS_OUT_OF_RANGE}. {PARSING_ERR_MSG}$",
+            ),
+            (
+                "2015-02-32",
+                "%Y-%m-%d",
+                '^unconverted data remains when parsing with format "%Y-%m-%d": "2". '
+                f"{PARSING_ERR_MSG}$",
+            ),
+            (
+                "2015-32-02",
+                "%Y-%d-%m",
+                '^time data "2015-32-02" doesn\'t match format "%Y-%d-%m". '
+                f"{PARSING_ERR_MSG}$",
+            ),
+            (
+                "2015-04-31",
+                "%Y-%m-%d",
+                f"^{DAY_IS_OUT_OF_RANGE}. {PARSING_ERR_MSG}$",
+            ),
+            (
+                "2015-31-04",
+                "%Y-%d-%m",
+                f"^{DAY_IS_OUT_OF_RANGE}. {PARSING_ERR_MSG}$",
+            ),
+        ],
+    )
+    def test_day_not_in_month_raise_value(self, cache, arg, format, msg):
+        # https://github.com/pandas-dev/pandas/issues/50462
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(arg, errors="raise", format=format, cache=cache)
+
+
+class TestDatetimeParsingWrappers:
+    @pytest.mark.parametrize(
+        "date_str, expected",
+        [
+            ("2011-01-01", datetime(2011, 1, 1)),
+            ("2Q2005", datetime(2005, 4, 1)),
+            ("2Q05", datetime(2005, 4, 1)),
+            ("2005Q1", datetime(2005, 1, 1)),
+            ("05Q1", datetime(2005, 1, 1)),
+            ("2011Q3", datetime(2011, 7, 1)),
+            ("11Q3", datetime(2011, 7, 1)),
+            ("3Q2011", datetime(2011, 7, 1)),
+            ("3Q11", datetime(2011, 7, 1)),
+            # quarterly without space
+            ("2000Q4", datetime(2000, 10, 1)),
+            ("00Q4", datetime(2000, 10, 1)),
+            ("4Q2000", datetime(2000, 10, 1)),
+            ("4Q00", datetime(2000, 10, 1)),
+            ("2000q4", datetime(2000, 10, 1)),
+            ("2000-Q4", datetime(2000, 10, 1)),
+            ("00-Q4", datetime(2000, 10, 1)),
+            ("4Q-2000", datetime(2000, 10, 1)),
+            ("4Q-00", datetime(2000, 10, 1)),
+            ("00q4", datetime(2000, 10, 1)),
+            ("2005", datetime(2005, 1, 1)),
+            ("2005-11", datetime(2005, 11, 1)),
+            ("2005 11", datetime(2005, 11, 1)),
+            ("11-2005", datetime(2005, 11, 1)),
+            ("11 2005", datetime(2005, 11, 1)),
+            ("200511", datetime(2020, 5, 11)),
+            ("20051109", datetime(2005, 11, 9)),
+            ("20051109 10:15", datetime(2005, 11, 9, 10, 15)),
+            ("20051109 08H", datetime(2005, 11, 9, 8, 0)),
+            ("2005-11-09 10:15", datetime(2005, 11, 9, 10, 15)),
+            ("2005-11-09 08H", datetime(2005, 11, 9, 8, 0)),
+            ("2005/11/09 10:15", datetime(2005, 11, 9, 10, 15)),
+            ("2005/11/09 10:15:32", datetime(2005, 11, 9, 10, 15, 32)),
+            ("2005/11/09 10:15:32 AM", datetime(2005, 11, 9, 10, 15, 32)),
+            ("2005/11/09 10:15:32 PM", datetime(2005, 11, 9, 22, 15, 32)),
+            ("2005/11/09 08H", datetime(2005, 11, 9, 8, 0)),
+            ("Thu Sep 25 10:36:28 2003", datetime(2003, 9, 25, 10, 36, 28)),
+            ("Thu Sep 25 2003", datetime(2003, 9, 25)),
+            ("Sep 25 2003", datetime(2003, 9, 25)),
+            ("January 1 2014", datetime(2014, 1, 1)),
+            # GH#10537
+            ("2014-06", datetime(2014, 6, 1)),
+            ("06-2014", datetime(2014, 6, 1)),
+            ("2014-6", datetime(2014, 6, 1)),
+            ("6-2014", datetime(2014, 6, 1)),
+            ("20010101 12", datetime(2001, 1, 1, 12)),
+            ("20010101 1234", datetime(2001, 1, 1, 12, 34)),
+            ("20010101 123456", datetime(2001, 1, 1, 12, 34, 56)),
+        ],
+    )
+    def test_parsers(self, date_str, expected, cache):
+        # dateutil >= 2.5.0 defaults to yearfirst=True
+        # https://github.com/dateutil/dateutil/issues/217
+        yearfirst = True
+
+        result1, reso_attrname = parsing.parse_datetime_string_with_reso(
+            date_str, yearfirst=yearfirst
+        )
+
+        reso = {
+            "nanosecond": "ns",
+        }.get(reso_attrname, "us")
+        result2 = to_datetime(date_str, yearfirst=yearfirst)
+        result3 = to_datetime([date_str], yearfirst=yearfirst)
+        # result5 is used below
+        result4 = to_datetime(
+            np.array([date_str], dtype=object), yearfirst=yearfirst, cache=cache
+        )
+        result6 = DatetimeIndex([date_str], yearfirst=yearfirst)
+        # result7 is used below
+        result8 = DatetimeIndex(Index([date_str]), yearfirst=yearfirst)
+        result9 = DatetimeIndex(Series([date_str]), yearfirst=yearfirst)
+
+        for res in [result1, result2]:
+            assert res == expected
+        for res in [result3, result4, result6, result8, result9]:
+            exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso)
+            tm.assert_index_equal(res, exp)
+
+        # these really need to have yearfirst, but we don't support
+        if not yearfirst:
+            result5 = Timestamp(date_str)
+            assert result5 == expected
+            result7 = date_range(date_str, freq="S", periods=1, yearfirst=yearfirst)
+            assert result7 == expected
+
+    def test_na_values_with_cache(
+        self, cache, unique_nulls_fixture, unique_nulls_fixture2
+    ):
+        # GH22305
+        expected = Index([NaT, NaT], dtype="datetime64[s]")
+        result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache)
+        tm.assert_index_equal(result, expected)
+
+    def test_parsers_nat(self):
+        # Test that each of several string-accepting methods return pd.NaT
+        result1, _ = parsing.parse_datetime_string_with_reso("NaT")
+        result2 = to_datetime("NaT")
+        result3 = Timestamp("NaT")
+        result4 = DatetimeIndex(["NaT"])[0]
+        assert result1 is NaT
+        assert result2 is NaT
+        assert result3 is NaT
+        assert result4 is NaT
+
+    @pytest.mark.parametrize(
+        "date_str, dayfirst, yearfirst, expected",
+        [
+            ("10-11-12", False, False, datetime(2012, 10, 11)),
+            ("10-11-12", True, False, datetime(2012, 11, 10)),
+            ("10-11-12", False, True, datetime(2010, 11, 12)),
+            ("10-11-12", True, True, datetime(2010, 12, 11)),
+            ("20/12/21", False, False, datetime(2021, 12, 20)),
+            ("20/12/21", True, False, datetime(2021, 12, 20)),
+            ("20/12/21", False, True, datetime(2020, 12, 21)),
+            ("20/12/21", True, True, datetime(2020, 12, 21)),
+            # GH 58859
+            ("20201012", True, False, datetime(2020, 12, 10)),
+        ],
+    )
+    def test_parsers_dayfirst_yearfirst(
+        self, cache, date_str, dayfirst, yearfirst, expected
+    ):
+        # OK
+        # 2.5.1 10-11-12   [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+        # 2.5.2 10-11-12   [dayfirst=0, yearfirst=1] -> 2012-10-11 00:00:00
+        # 2.5.3 10-11-12   [dayfirst=0, yearfirst=0] -> 2012-10-11 00:00:00
+
+        # OK
+        # 2.5.1 10-11-12   [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+        # 2.5.2 10-11-12   [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+        # 2.5.3 10-11-12   [dayfirst=0, yearfirst=1] -> 2010-11-12 00:00:00
+
+        # bug fix in 2.5.2
+        # 2.5.1 10-11-12   [dayfirst=1, yearfirst=1] -> 2010-11-12 00:00:00
+        # 2.5.2 10-11-12   [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00
+        # 2.5.3 10-11-12   [dayfirst=1, yearfirst=1] -> 2010-12-11 00:00:00
+
+        # OK
+        # 2.5.1 10-11-12   [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+        # 2.5.2 10-11-12   [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+        # 2.5.3 10-11-12   [dayfirst=1, yearfirst=0] -> 2012-11-10 00:00:00
+
+        # OK
+        # 2.5.1 20/12/21   [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.2 20/12/21   [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.3 20/12/21   [dayfirst=0, yearfirst=0] -> 2021-12-20 00:00:00
+
+        # OK
+        # 2.5.1 20/12/21   [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+        # 2.5.2 20/12/21   [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+        # 2.5.3 20/12/21   [dayfirst=0, yearfirst=1] -> 2020-12-21 00:00:00
+
+        # revert of bug in 2.5.2
+        # 2.5.1 20/12/21   [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00
+        # 2.5.2 20/12/21   [dayfirst=1, yearfirst=1] -> month must be in 1..12
+        # 2.5.3 20/12/21   [dayfirst=1, yearfirst=1] -> 2020-12-21 00:00:00
+
+        # OK
+        # 2.5.1 20/12/21   [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.2 20/12/21   [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+        # 2.5.3 20/12/21   [dayfirst=1, yearfirst=0] -> 2021-12-20 00:00:00
+
+        # str : dayfirst, yearfirst, expected
+
+        # compare with dateutil result
+        dateutil_result = parse(date_str, dayfirst=dayfirst, yearfirst=yearfirst)
+        assert dateutil_result == expected
+
+        result1, _ = parsing.parse_datetime_string_with_reso(
+            date_str, dayfirst=dayfirst, yearfirst=yearfirst
+        )
+
+        # we don't support dayfirst/yearfirst here:
+        if not dayfirst and not yearfirst:
+            result2 = Timestamp(date_str)
+            assert result2 == expected
+
+        result3 = to_datetime(
+            date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache
+        )
+
+        result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0]
+
+        assert result1 == expected
+        assert result3 == expected
+        assert result4 == expected
+
+    @pytest.mark.parametrize(
+        "date_str, exp_def",
+        [["10:15", datetime(1, 1, 1, 10, 15)], ["9:05", datetime(1, 1, 1, 9, 5)]],
+    )
+    def test_parsers_timestring(self, date_str, exp_def):
+        # must be the same as dateutil result
+        exp_now = parse(date_str)
+
+        result1, _ = parsing.parse_datetime_string_with_reso(date_str)
+        result2 = to_datetime(date_str)
+        result3 = to_datetime([date_str])
+        result4 = Timestamp(date_str)
+        result5 = DatetimeIndex([date_str])[0]
+        # parse time string return time string based on default date
+        # others are not, and can't be changed because it is used in
+        # time series plot
+        assert result1 == exp_def
+        assert result2 == exp_now
+        assert result3 == exp_now
+        assert result4 == exp_now
+        assert result5 == exp_now
+
+    @pytest.mark.parametrize(
+        "dt_string, tz, dt_string_repr",
+        [
+            (
+                "2013-01-01 05:45+0545",
+                timezone(timedelta(minutes=345)),
+                "Timestamp('2013-01-01 05:45:00+0545', tz='UTC+05:45')",
+            ),
+            (
+                "2013-01-01 05:30+0530",
+                timezone(timedelta(minutes=330)),
+                "Timestamp('2013-01-01 05:30:00+0530', tz='UTC+05:30')",
+            ),
+        ],
+    )
+    def test_parsers_timezone_minute_offsets_roundtrip(
+        self, cache, dt_string, tz, dt_string_repr
+    ):
+        # GH11708
+        base = to_datetime("2013-01-01 00:00:00", cache=cache)
+        base = base.tz_localize("UTC").tz_convert(tz)
+        dt_time = to_datetime(dt_string, cache=cache)
+        assert base == dt_time
+        assert dt_string_repr == repr(dt_time)
+
+
+@pytest.fixture(params=["D", "s", "ms", "us", "ns"])
+def units(request):
+    """Day and some time units.
+
+    * D
+    * s
+    * ms
+    * us
+    * ns
+    """
+    return request.param
+
+
+@pytest.fixture
+def julian_dates():
+    return date_range("2014-1-1", periods=10).to_julian_date().values
+
+
+class TestOrigin:
+    def test_origin_and_unit(self):
+        # GH#42624
+        ts = to_datetime(1, unit="s", origin=1)
+        expected = Timestamp("1970-01-01 00:00:02")
+        assert ts == expected
+
+        ts = to_datetime(1, unit="s", origin=1_000_000_000)
+        expected = Timestamp("2001-09-09 01:46:41")
+        assert ts == expected
+
+    def test_julian(self, julian_dates):
+        # gh-11276, gh-11745
+        # for origin as julian
+
+        result = Series(to_datetime(julian_dates, unit="D", origin="julian"))
+        expected = Series(
+            to_datetime(julian_dates - Timestamp(0).to_julian_date(), unit="D")
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_unix(self):
+        result = Series(to_datetime([0, 1, 2], unit="D", origin="unix"))
+        expected = Series(
+            [Timestamp("1970-01-01"), Timestamp("1970-01-02"), Timestamp("1970-01-03")],
+            dtype="M8[s]",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_julian_round_trip(self):
+        result = to_datetime(2456658, origin="julian", unit="D")
+        assert result.to_julian_date() == 2456658
+
+        # out-of-bounds
+        msg = "1 is Out of Bounds for origin='julian'"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(1, origin="julian", unit="D")
+
+    def test_invalid_unit(self, units, julian_dates):
+        # checking for invalid combination of origin='julian' and unit != D
+        if units != "D":
+            msg = "unit must be 'D' for origin='julian'"
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(julian_dates, unit=units, origin="julian")
+
+    @pytest.mark.parametrize("unit", ["ns", "D"])
+    def test_invalid_origin(self, unit):
+        # need to have a numeric specified
+        msg = "it must be numeric with a unit specified"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime("2005-01-01", origin="1960-01-01", unit=unit)
+
+    @pytest.mark.parametrize(
+        "epochs",
+        [
+            Timestamp(1960, 1, 1),
+            datetime(1960, 1, 1),
+            "1960-01-01",
+            np.datetime64("1960-01-01"),
+        ],
+    )
+    def test_epoch(self, units, epochs):
+        epoch_1960 = Timestamp(1960, 1, 1)
+        units_from_epochs = np.arange(5, dtype=np.int64)
+        exp_unit = "s" if units == "D" else units
+        expected = Series(
+            [pd.Timedelta(x, unit=units) + epoch_1960 for x in units_from_epochs],
+            dtype=f"M8[{exp_unit}]",
+        )
+
+        result = Series(to_datetime(units_from_epochs, unit=units, origin=epochs))
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "origin, exc",
+        [
+            ("random_string", ValueError),
+            ("epoch", ValueError),
+            ("13-24-1990", ValueError),
+            (datetime(1, 1, 1), OutOfBoundsDatetime),
+        ],
+    )
+    def test_invalid_origins(self, origin, exc, units):
+        msg = "|".join(
+            [
+                f"origin {origin} is Out of Bounds",
+                f"origin {origin} cannot be converted to a Timestamp",
+                "Cannot cast .* to unit='ns' without overflow",
+            ]
+        )
+        with pytest.raises(exc, match=msg):
+            to_datetime(list(range(5)), unit=units, origin=origin)
+
+    def test_invalid_origins_tzinfo(self):
+        # GH16842
+        with pytest.raises(ValueError, match="must be tz-naive"):
+            to_datetime(1, unit="D", origin=datetime(2000, 1, 1, tzinfo=timezone.utc))
+
+    def test_incorrect_value_exception(self):
+        # GH47495
+        msg = "Unknown datetime string format, unable to parse: yesterday"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(["today", "yesterday"])
+
+    @pytest.mark.parametrize(
+        "format, warning",
+        [
+            (None, UserWarning),
+            ("%Y-%m-%d %H:%M:%S", None),
+            ("%Y-%d-%m %H:%M:%S", None),
+        ],
+    )
+    def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning):
+        # see gh-23830
+        if format is None:
+            res = to_datetime("2417-10-10 00:00:00.00", format=format)
+            assert isinstance(res, Timestamp)
+            assert res.year == 2417
+            assert res.month == 10
+            assert res.day == 10
+        else:
+            msg = "unconverted data remains when parsing with format.*"
+            with pytest.raises(ValueError, match=msg):
+                to_datetime("2417-10-10 00:00:00.00", format=format)
+
+    @pytest.mark.parametrize(
+        "arg, origin, expected_str",
+        [
+            [200 * 365, "unix", "2169-11-13 00:00:00"],
+            [200 * 365, "1870-01-01", "2069-11-13 00:00:00"],
+            [300 * 365, "1870-01-01", "2169-10-20 00:00:00"],
+        ],
+    )
+    def test_processing_order(self, arg, origin, expected_str):
+        # make sure we handle out-of-bounds *before*
+        # constructing the dates
+
+        result = to_datetime(arg, unit="D", origin=origin)
+        expected = Timestamp(expected_str)
+        assert result == expected
+
+        result = to_datetime(200 * 365, unit="D", origin="1870-01-01")
+        expected = Timestamp("2069-11-13 00:00:00")
+        assert result == expected
+
+        result = to_datetime(300 * 365, unit="D", origin="1870-01-01")
+        expected = Timestamp("2169-10-20 00:00:00")
+        assert result == expected
+
+    @pytest.mark.parametrize(
+        "offset,utc,exp",
+        [
+            ["Z", True, "2019-01-01T00:00:00.000Z"],
+            ["Z", None, "2019-01-01T00:00:00.000Z"],
+            ["-01:00", True, "2019-01-01T01:00:00.000Z"],
+            ["-01:00", None, "2019-01-01T00:00:00.000-01:00"],
+        ],
+    )
+    def test_arg_tz_ns_unit(self, offset, utc, exp):
+        # GH 25546
+        arg = "2019-01-01T00:00:00.000" + offset
+        result = to_datetime([arg], unit="ns", utc=utc)
+        expected = to_datetime([exp]).as_unit("us")
+        tm.assert_index_equal(result, expected)
+
+
+class TestShouldCache:
+    @pytest.mark.parametrize(
+        "listlike,do_caching",
+        [
+            ([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], False),
+            ([1, 1, 1, 1, 4, 5, 6, 7, 8, 9], True),
+        ],
+    )
+    def test_should_cache(self, listlike, do_caching):
+        assert (
+            tools.should_cache(listlike, check_count=len(listlike), unique_share=0.7)
+            == do_caching
+        )
+
+    @pytest.mark.parametrize(
+        "unique_share,check_count, err_message",
+        [
+            (0.5, 11, r"check_count must be in next bounds: \[0; len\(arg\)\]"),
+            (10, 2, r"unique_share must be in next bounds: \(0; 1\)"),
+        ],
+    )
+    def test_should_cache_errors(self, unique_share, check_count, err_message):
+        arg = [5] * 10
+
+        with pytest.raises(AssertionError, match=err_message):
+            tools.should_cache(arg, unique_share, check_count)
+
+    @pytest.mark.parametrize(
+        "listlike",
+        [
+            (deque([Timestamp("2010-06-02 09:30:00")] * 51)),
+            ([Timestamp("2010-06-02 09:30:00")] * 51),
+            (tuple([Timestamp("2010-06-02 09:30:00")] * 51)),
+        ],
+    )
+    def test_no_slicing_errors_in_should_cache(self, listlike):
+        # GH#29403
+        assert tools.should_cache(listlike) is True
+
+
+def test_nullable_integer_to_datetime():
+    # Test for #30050
+    ser = Series([1, 2, None, 2**61, None], dtype="Int64")
+    ser_copy = ser.copy()
+
+    res = to_datetime(ser, unit="ns")
+
+    expected = Series(
+        [
+            np.datetime64("1970-01-01 00:00:00.000000001"),
+            np.datetime64("1970-01-01 00:00:00.000000002"),
+            np.datetime64("NaT"),
+            np.datetime64("2043-01-25 23:56:49.213693952"),
+            np.datetime64("NaT"),
+        ]
+    )
+    tm.assert_series_equal(res, expected)
+    # Check that ser isn't mutated
+    tm.assert_series_equal(ser, ser_copy)
+
+
+@pytest.mark.parametrize("klass", [np.array, list])
+def test_na_to_datetime(nulls_fixture, klass):
+    if isinstance(nulls_fixture, Decimal):
+        with pytest.raises(TypeError, match="not convertible to datetime"):
+            to_datetime(klass([nulls_fixture]))
+
+    else:
+        result = to_datetime(klass([nulls_fixture]))
+
+        assert result[0] is NaT
+
+
+@pytest.mark.parametrize("errors", ["raise", "coerce"])
+@pytest.mark.parametrize(
+    "args, format",
+    [
+        (["03/24/2016", "03/25/2016", ""], "%m/%d/%Y"),
+        (["2016-03-24", "2016-03-25", ""], "%Y-%m-%d"),
+    ],
+    ids=["non-ISO8601", "ISO8601"],
+)
+def test_empty_string_datetime(errors, args, format):
+    # GH13044, GH50251
+    td = Series(args)
+
+    # coerce empty string to pd.NaT
+    result = to_datetime(td, format=format, errors=errors)
+    expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[us]")
+    tm.assert_series_equal(expected, result)
+
+
+def test_empty_string_datetime_coerce__unit():
+    # GH13044
+    # coerce empty string to pd.NaT
+    result = to_datetime([1, ""], unit="s", errors="coerce")
+    expected = DatetimeIndex(["1970-01-01 00:00:01", "NaT"], dtype="datetime64[s]")
+    tm.assert_index_equal(expected, result)
+
+    # verify that no exception is raised even when errors='raise' is set
+    result = to_datetime([1, ""], unit="s", errors="raise")
+    tm.assert_index_equal(expected, result)
+
+
+def test_to_datetime_monotonic_increasing_index(cache):
+    # GH28238
+    cstart = start_caching_at
+    times = date_range(Timestamp("1980"), periods=cstart, freq="YS")
+    times = times.to_frame(index=False, name="DT").sample(n=cstart, random_state=1)
+    times.index = times.index.to_series().astype(float) / 1000
+    result = to_datetime(times.iloc[:, 0], cache=cache)
+    expected = times.iloc[:, 0]
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "series_length",
+    [40, start_caching_at, (start_caching_at + 1), (start_caching_at + 5)],
+)
+def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length):
+    # GH#45319
+    ser = Series(
+        [datetime.fromisoformat("1446-04-12 00:00:00+00:00")]
+        + ([datetime.fromisoformat("1991-10-20 00:00:00+00:00")] * series_length),
+        dtype=object,
+    )
+    result1 = to_datetime(ser, errors="coerce", utc=True)
+
+    expected1 = Series([Timestamp(x) for x in ser])
+    assert expected1.dtype == "M8[us, UTC]"
+    tm.assert_series_equal(result1, expected1)
+
+    result3 = to_datetime(ser, errors="raise", utc=True)
+    tm.assert_series_equal(result3, expected1)
+
+
+def test_to_datetime_format_f_parse_nanos():
+    # GH 48767
+    timestamp = "15/02/2020 02:03:04.123456789"
+    timestamp_format = "%d/%m/%Y %H:%M:%S.%f"
+    result = to_datetime(timestamp, format=timestamp_format)
+    expected = Timestamp(
+        year=2020,
+        month=2,
+        day=15,
+        hour=2,
+        minute=3,
+        second=4,
+        microsecond=123456,
+        nanosecond=789,
+    )
+    assert result == expected
+
+
+def test_to_datetime_mixed_iso8601():
+    # https://github.com/pandas-dev/pandas/issues/50411
+    result = to_datetime(["2020-01-01", "2020-01-01 05:00:00"], format="ISO8601")
+    expected = DatetimeIndex(["2020-01-01 00:00:00", "2020-01-01 05:00:00"])
+    tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_mixed_other():
+    # https://github.com/pandas-dev/pandas/issues/50411
+    result = to_datetime(["01/11/2000", "12 January 2000"], format="mixed")
+    expected = DatetimeIndex(["2000-01-11", "2000-01-12"])
+    tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("exact", [True, False])
+@pytest.mark.parametrize("format", ["ISO8601", "mixed"])
+def test_to_datetime_mixed_or_iso_exact(exact, format):
+    msg = "Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'"
+    with pytest.raises(ValueError, match=msg):
+        to_datetime(["2020-01-01"], exact=exact, format=format)
+
+
+def test_to_datetime_mixed_not_necessarily_iso8601_raise():
+    # https://github.com/pandas-dev/pandas/issues/50411
+    with pytest.raises(ValueError, match="Time data 01-01-2000 is not ISO8601 format"):
+        to_datetime(["2020-01-01", "01-01-2000"], format="ISO8601")
+
+
+def test_to_datetime_mixed_not_necessarily_iso8601_coerce():
+    # https://github.com/pandas-dev/pandas/issues/50411
+    result = to_datetime(
+        ["2020-01-01", "01-01-2000"], format="ISO8601", errors="coerce"
+    )
+    tm.assert_index_equal(result, DatetimeIndex(["2020-01-01 00:00:00", NaT]))
+
+
+def test_to_datetime_iso8601_utc_single_naive():
+    # GH#61389
+    result = to_datetime("2023-10-15T14:30:00", utc=True, format="ISO8601")
+    expected = Timestamp("2023-10-15 14:30:00+00:00")
+    assert result == expected
+
+
+def test_to_datetime_iso8601_utc_mixed_negative_offset():
+    # GH#61389
+    data = ["2023-10-15T10:30:00-12:00", "2023-10-15T14:30:00"]
+    result = to_datetime(data, utc=True, format="ISO8601")
+
+    expected = DatetimeIndex(
+        [Timestamp("2023-10-15 22:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")]
+    )
+    tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_iso8601_utc_mixed_positive_offset():
+    # GH#61389
+    data = ["2023-10-15T10:30:00+08:00", "2023-10-15T14:30:00"]
+    result = to_datetime(data, utc=True, format="ISO8601")
+
+    expected = DatetimeIndex(
+        [Timestamp("2023-10-15 02:30:00+00:00"), Timestamp("2023-10-15 14:30:00+00:00")]
+    )
+    tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_iso8601_utc_mixed_both_offsets():
+    # GH#61389
+    data = [
+        "2023-10-15T10:30:00+08:00",
+        "2023-10-15T12:30:00-05:00",
+        "2023-10-15T14:30:00",
+    ]
+    result = to_datetime(data, utc=True, format="ISO8601")
+
+    expected = DatetimeIndex(
+        [
+            Timestamp("2023-10-15 02:30:00+00:00"),
+            Timestamp("2023-10-15 17:30:00+00:00"),
+            Timestamp("2023-10-15 14:30:00+00:00"),
+        ]
+    )
+    tm.assert_index_equal(result, expected)
+
+
+def test_unknown_tz_raises():
+    # GH#18702, GH#51476
+    dtstr = "2014 Jan 9 05:15 FAKE"
+    msg = '.*un-recognized timezone "FAKE".'
+    with pytest.raises(ValueError, match=msg):
+        Timestamp(dtstr)
+
+    with pytest.raises(ValueError, match=msg):
+        to_datetime(dtstr)
+    with pytest.raises(ValueError, match=msg):
+        to_datetime([dtstr])
+
+
+def test_unformatted_input_raises():
+    valid, invalid = "2024-01-01", "N"
+    ser = Series([valid] * start_caching_at + [invalid])
+    msg = 'time data "N" doesn\'t match format "%Y-%m-%d"'
+
+    with pytest.raises(ValueError, match=msg):
+        to_datetime(ser, format="%Y-%m-%d", exact=True, cache=True)
+
+
+def test_from_numeric_arrow_dtype(any_numeric_ea_dtype):
+    # GH 52425
+    pytest.importorskip("pyarrow")
+    ser = Series([1, 2], dtype=f"{any_numeric_ea_dtype.lower()}[pyarrow]")
+    result = to_datetime(ser)
+    expected = Series([1, 2], dtype="datetime64[ns]")
+    tm.assert_series_equal(result, expected)
+
+
+def test_to_datetime_with_empty_str_utc_false_format_mixed():
+    # GH 50887
+    vals = ["2020-01-01 00:00+00:00", ""]
+    result = to_datetime(vals, format="mixed")
+    expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[us, UTC]")
+    tm.assert_index_equal(result, expected)
+
+    # Check that a couple of other similar paths work the same way
+    alt = to_datetime(vals)
+    tm.assert_index_equal(alt, expected)
+    alt2 = DatetimeIndex(vals)
+    tm.assert_index_equal(alt2, expected)
+
+
+def test_to_datetime_with_empty_str_utc_false_offsets_and_format_mixed():
+    # GH#50887, GH#57275
+    msg = "Mixed timezones detected. Pass utc=True in to_datetime"
+
+    with pytest.raises(ValueError, match=msg):
+        to_datetime(
+            ["2020-01-01 00:00+00:00", "2020-01-01 00:00+02:00", ""], format="mixed"
+        )
+
+
+def test_to_datetime_mixed_tzs_mixed_types():
+    # GH#55793, GH#55693 mismatched tzs but one is str and other is
+    #  datetime object
+    ts = Timestamp("2016-01-02 03:04:05", tz="US/Pacific")
+    dtstr = "2023-10-30 15:06+01"
+    arr = [ts, dtstr]
+
+    msg = (
+        "Mixed timezones detected. Pass utc=True in to_datetime or tz='UTC' "
+        "in DatetimeIndex to convert to a common timezone"
+    )
+    with pytest.raises(ValueError, match=msg):
+        to_datetime(arr)
+    with pytest.raises(ValueError, match=msg):
+        to_datetime(arr, format="mixed")
+    with pytest.raises(ValueError, match=msg):
+        DatetimeIndex(arr)
+
+
+def test_to_datetime_mixed_types_matching_tzs():
+    # GH#55793
+    dtstr = "2023-11-01 09:22:03-07:00"
+    ts = Timestamp(dtstr)
+    arr = [ts, dtstr]
+    res1 = to_datetime(arr)
+    res2 = to_datetime(arr[::-1])[::-1]
+    res3 = to_datetime(arr, format="mixed")
+    res4 = DatetimeIndex(arr)
+
+    expected = DatetimeIndex([ts, ts])
+    tm.assert_index_equal(res1, expected)
+    tm.assert_index_equal(res2, expected)
+    tm.assert_index_equal(res3, expected)
+    tm.assert_index_equal(res4, expected)
+
+
+dtstr = "2020-01-01 00:00+00:00"
+ts = Timestamp(dtstr)
+
+
+@pytest.mark.filterwarnings("ignore:Could not infer format:UserWarning")
+@pytest.mark.parametrize(
+    "aware_val",
+    [dtstr, Timestamp(dtstr)],
+    ids=lambda x: type(x).__name__,
+)
+@pytest.mark.parametrize(
+    "naive_val",
+    [dtstr[:-6], ts.tz_localize(None), ts.date(), ts.asm8, ts.value, float(ts.value)],
+    ids=lambda x: type(x).__name__,
+)
+@pytest.mark.parametrize("naive_first", [True, False])
+def test_to_datetime_mixed_awareness_mixed_types(aware_val, naive_val, naive_first):
+    # GH#55793, GH#55693, GH#57275
+    # Empty string parses to NaT
+    vals = [aware_val, naive_val, ""]
+
+    vec = vals
+    if naive_first:
+        # alas, the behavior is order-dependent, so we test both ways
+        vec = [naive_val, aware_val, ""]
+
+    # both_strs-> paths that were previously already deprecated with warning
+    #  issued in _array_to_datetime_object
+    both_strs = isinstance(aware_val, str) and isinstance(naive_val, str)
+    has_numeric = isinstance(naive_val, (int, float))
+    both_datetime = isinstance(naive_val, datetime) and isinstance(aware_val, datetime)
+
+    mixed_msg = (
+        "Mixed timezones detected. Pass utc=True in to_datetime or tz='UTC' "
+        "in DatetimeIndex to convert to a common timezone"
+    )
+
+    first_non_null = next(x for x in vec if x != "")
+    # if first_non_null is a not a string, _guess_datetime_format_for_array
+    #  doesn't guess a format so we don't go through array_strptime
+    if not isinstance(first_non_null, str):
+        # that case goes through array_strptime which has different behavior
+        msg = mixed_msg
+        if naive_first and isinstance(aware_val, Timestamp):
+            if isinstance(naive_val, Timestamp):
+                msg = "Tz-aware datetime.datetime cannot be converted to datetime64"
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(vec)
+        else:
+            if not naive_first and both_datetime:
+                msg = "Cannot mix tz-aware with tz-naive values"
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(vec)
+
+        # No warning/error with utc=True
+        to_datetime(vec, utc=True)
+
+    elif has_numeric and vec.index(aware_val) < vec.index(naive_val):
+        msg = "time data .* doesn't match format"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec)
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec, utc=True)
+
+    elif both_strs and vec.index(aware_val) < vec.index(naive_val):
+        msg = r"time data \"2020-01-01 00:00\" doesn't match format"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec)
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec, utc=True)
+
+    elif both_strs and vec.index(naive_val) < vec.index(aware_val):
+        msg = "unconverted data remains when parsing with format"
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec)
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec, utc=True)
+
+    else:
+        msg = mixed_msg
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec)
+
+        # No warning/error with utc=True
+        to_datetime(vec, utc=True)
+
+    if both_strs:
+        msg = mixed_msg
+        with pytest.raises(ValueError, match=msg):
+            to_datetime(vec, format="mixed")
+        with pytest.raises(ValueError, match=msg):
+            DatetimeIndex(vec)
+    else:
+        msg = mixed_msg
+        if naive_first and isinstance(aware_val, Timestamp):
+            if isinstance(naive_val, Timestamp):
+                msg = "Tz-aware datetime.datetime cannot be converted to datetime64"
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(vec, format="mixed")
+            with pytest.raises(ValueError, match=msg):
+                DatetimeIndex(vec)
+        else:
+            if not naive_first and both_datetime:
+                msg = "Cannot mix tz-aware with tz-naive values"
+            with pytest.raises(ValueError, match=msg):
+                to_datetime(vec, format="mixed")
+            with pytest.raises(ValueError, match=msg):
+                DatetimeIndex(vec)
+
+
+def test_to_datetime_wrapped_datetime64_ps():
+    # GH#60341
+    result = to_datetime([np.datetime64(1901901901901, "ps")])
+    expected = DatetimeIndex(
+        ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None
+    )
+    tm.assert_index_equal(result, expected)
+
+
+def test_to_datetime_lxml_elementunicoderesult_with_format(cache):
+    etree = pytest.importorskip("lxml.etree")
+
+    s = "2025-02-05 16:59:57"
+    node = etree.XML(f"<date>{s}</date>")
+    val = node.xpath("/date/node()")[0]  # _ElementUnicodeResult
+
+    out = to_datetime(Series([val]), format="%Y-%m-%d %H:%M:%S", cache=cache)
+    assert out.iloc[0] == Timestamp(s)
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcbc91d4c632f76f064f6389279834c74a458019
--- /dev/null
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -0,0 +1,904 @@
+import decimal
+
+import numpy as np
+from numpy import iinfo
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+from pandas import (
+    ArrowDtype,
+    DataFrame,
+    Index,
+    Series,
+    option_context,
+    to_numeric,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture(params=[None, "raise", "coerce"])
+def errors(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def signed(request):
+    return request.param
+
+
+@pytest.fixture(params=[lambda x: x, str], ids=["identity", "str"])
+def transform(request):
+    return request.param
+
+
+@pytest.fixture(params=[47393996303418497800, 100000000000000000000])
+def large_val(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def multiple_elts(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        (lambda x: Index(x, name="idx"), tm.assert_index_equal),
+        (lambda x: Series(x, name="ser"), tm.assert_series_equal),
+        (lambda x: np.array(Index(x).values), tm.assert_numpy_array_equal),
+    ]
+)
+def transform_assert_equal(request):
+    return request.param
+
+
+@pytest.mark.parametrize(
+    "input_kwargs,result_kwargs",
+    [
+        ({}, {"dtype": np.int64}),
+        ({"errors": "coerce", "downcast": "integer"}, {"dtype": np.int8}),
+    ],
+)
+def test_empty(input_kwargs, result_kwargs):
+    # see gh-16302
+    ser = Series([], dtype=object)
+    result = to_numeric(ser, **input_kwargs)
+
+    expected = Series([], **result_kwargs)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "infer_string", [False, pytest.param(True, marks=td.skip_if_no("pyarrow"))]
+)
+@pytest.mark.parametrize("last_val", ["7", 7])
+def test_series(last_val, infer_string):
+    with option_context("future.infer_string", infer_string):
+        ser = Series(["1", "-3.14", last_val])
+        result = to_numeric(ser)
+
+    expected = Series([1, -3.14, 7])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1, 3, 4, 5],
+        [1.0, 3.0, 4.0, 5.0],
+        # Bool is regarded as numeric.
+        [True, False, True, True],
+    ],
+)
+def test_series_numeric(data):
+    ser = Series(data, index=list("ABCD"), name="EFG")
+
+    result = to_numeric(ser)
+    tm.assert_series_equal(result, ser)
+
+
+@pytest.mark.parametrize(
+    "data,msg",
+    [
+        ([1, -3.14, "apple"], 'Unable to parse string "apple" at position 2'),
+        (
+            ["orange", 1, -3.14, "apple"],
+            'Unable to parse string "orange" at position 0',
+        ),
+    ],
+)
+def test_error(data, msg):
+    ser = Series(data)
+
+    with pytest.raises(ValueError, match=msg):
+        to_numeric(ser, errors="raise")
+
+
+def test_ignore_error():
+    ser = Series([1, -3.14, "apple"])
+    result = to_numeric(ser, errors="coerce")
+
+    expected = Series([1, -3.14, np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "errors,exp",
+    [
+        ("raise", 'Unable to parse string "apple" at position 2'),
+        # Coerces to float.
+        ("coerce", [1.0, 0.0, np.nan]),
+    ],
+)
+def test_bool_handling(errors, exp):
+    ser = Series([True, False, "apple"])
+
+    if isinstance(exp, str):
+        with pytest.raises(ValueError, match=exp):
+            to_numeric(ser, errors=errors)
+    else:
+        result = to_numeric(ser, errors=errors)
+        expected = Series(exp)
+
+        tm.assert_series_equal(result, expected)
+
+
+def test_list():
+    ser = ["1", "-3.14", "7"]
+    res = to_numeric(ser)
+
+    expected = np.array([1, -3.14, 7])
+    tm.assert_numpy_array_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "data,arr_kwargs",
+    [
+        ([1, 3, 4, 5], {"dtype": np.int64}),
+        ([1.0, 3.0, 4.0, 5.0], {}),
+        # Boolean is regarded as numeric.
+        ([True, False, True, True], {}),
+    ],
+)
+def test_list_numeric(data, arr_kwargs):
+    result = to_numeric(data)
+    expected = np.array(data, **arr_kwargs)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("kwargs", [{"dtype": "O"}, {}])
+def test_numeric(kwargs):
+    data = [1, -3.14, 7]
+
+    ser = Series(data, **kwargs)
+    result = to_numeric(ser)
+
+    expected = Series(data)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "columns",
+    [
+        # One column.
+        "a",
+        # Multiple columns.
+        ["a", "b"],
+    ],
+)
+def test_numeric_df_columns(columns):
+    # see gh-14827
+    df = DataFrame(
+        {
+            "a": [1.2, decimal.Decimal("3.14"), decimal.Decimal("infinity"), "0.1"],
+            "b": [1.0, 2.0, 3.0, 4.0],
+        }
+    )
+
+    expected = DataFrame({"a": [1.2, 3.14, np.inf, 0.1], "b": [1.0, 2.0, 3.0, 4.0]})
+    df[columns] = df[columns].apply(to_numeric)
+
+    tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize(
+    "data,exp_data",
+    [
+        (
+            [[decimal.Decimal("3.14"), 1.0], decimal.Decimal("1.6"), 0.1],
+            [[3.14, 1.0], 1.6, 0.1],
+        ),
+        ([np.array([decimal.Decimal("3.14"), 1.0]), 0.1], [[3.14, 1.0], 0.1]),
+    ],
+)
+def test_numeric_embedded_arr_likes(data, exp_data):
+    # Test to_numeric with embedded lists and arrays
+    df = DataFrame({"a": data})
+    df["a"] = df["a"].apply(to_numeric)
+
+    expected = DataFrame({"a": exp_data})
+    tm.assert_frame_equal(df, expected)
+
+
+def test_all_nan():
+    ser = Series(["a", "b", "c"])
+    result = to_numeric(ser, errors="coerce")
+
+    expected = Series([np.nan, np.nan, np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+def test_type_check(errors):
+    # see gh-11776
+    df = DataFrame({"a": [1, -3.14, 7], "b": ["4", "5", "6"]})
+    kwargs = {"errors": errors} if errors is not None else {}
+    with pytest.raises(TypeError, match="1-d array"):
+        to_numeric(df, **kwargs)
+
+
+@pytest.mark.parametrize("val", [1, 1.1, 20001])
+def test_scalar(val, signed, transform):
+    val = -val if signed else val
+    assert to_numeric(transform(val)) == float(val)
+
+
+def test_really_large_scalar(large_val, signed, transform, errors):
+    # see gh-24910
+    kwargs = {"errors": errors} if errors is not None else {}
+    val = -large_val if signed else large_val
+
+    val = transform(val)
+
+    expected = float(val) if errors == "coerce" else int(val)
+    tm.assert_almost_equal(to_numeric(val, **kwargs), expected)
+
+
+def test_really_large_in_arr(large_val, signed, transform, multiple_elts, errors):
+    # see gh-24910
+    kwargs = {"errors": errors} if errors is not None else {}
+    val = -large_val if signed else large_val
+    val = transform(val)
+
+    extra_elt = "string"
+    arr = [val] + multiple_elts * [extra_elt]
+
+    coercing = errors == "coerce"
+
+    if errors in (None, "raise") and multiple_elts:
+        msg = 'Unable to parse string "string" at position 1'
+
+        with pytest.raises(ValueError, match=msg):
+            to_numeric(arr, **kwargs)
+    else:
+        result = to_numeric(arr, **kwargs)
+
+        exp_val = float(val) if (coercing) else int(val)
+        expected = [exp_val]
+
+        if multiple_elts:
+            if coercing:
+                expected.append(np.nan)
+                exp_dtype = float
+            else:
+                expected.append(extra_elt)
+                exp_dtype = object
+        else:
+            exp_dtype = float if isinstance(exp_val, float) else object
+
+        tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
+
+
+def test_really_large_in_arr_consistent(large_val, signed, multiple_elts, errors):
+    # see gh-24910
+    #
+    # Even if we discover that we have to hold float, does not mean
+    # we should be lenient on subsequent elements that fail to be integer.
+    kwargs = {"errors": errors} if errors is not None else {}
+    arr = [str(-large_val if signed else large_val)]
+
+    if multiple_elts:
+        arr.insert(0, large_val)
+
+    result = to_numeric(arr, **kwargs)
+    expected = [float(i) if errors == "coerce" else int(i) for i in arr]
+    exp_dtype = float if errors == "coerce" else object
+
+    tm.assert_almost_equal(result, np.array(expected, dtype=exp_dtype))
+
+
+@pytest.mark.parametrize(
+    "errors,checker",
+    [
+        ("raise", 'Unable to parse string "fail" at position 0'),
+        ("coerce", lambda x: np.isnan(x)),
+    ],
+)
+def test_scalar_fail(errors, checker):
+    scalar = "fail"
+
+    if isinstance(checker, str):
+        with pytest.raises(ValueError, match=checker):
+            to_numeric(scalar, errors=errors)
+    else:
+        assert checker(to_numeric(scalar, errors=errors))
+
+
+@pytest.mark.parametrize("data", [[1, 2, 3], [1.0, np.nan, 3, np.nan]])
+def test_numeric_dtypes(data, transform_assert_equal):
+    transform, assert_equal = transform_assert_equal
+    data = transform(data)
+
+    result = to_numeric(data)
+    assert_equal(result, data)
+
+
+@pytest.mark.parametrize(
+    "data,exp",
+    [
+        (["1", "2", "3"], np.array([1, 2, 3], dtype="int64")),
+        (["1.5", "2.7", "3.4"], np.array([1.5, 2.7, 3.4])),
+    ],
+)
+def test_str(data, exp, transform_assert_equal):
+    transform, assert_equal = transform_assert_equal
+    result = to_numeric(transform(data))
+
+    expected = transform(exp)
+    assert_equal(result, expected)
+
+
+def test_datetime_like(tz_naive_fixture, transform_assert_equal):
+    transform, assert_equal = transform_assert_equal
+    idx = pd.date_range("20130101", periods=3, tz=tz_naive_fixture)
+
+    result = to_numeric(transform(idx))
+    expected = transform(idx.asi8)
+    assert_equal(result, expected)
+
+
+def test_timedelta(transform_assert_equal):
+    transform, assert_equal = transform_assert_equal
+    idx = pd.timedelta_range("1 days", periods=3, freq="D")
+
+    result = to_numeric(transform(idx))
+    expected = transform(idx.asi8)
+    assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        pd.Timedelta(1, "D"),
+        pd.Timestamp("2017-01-01T12"),
+        pd.Timestamp("2017-01-01T12", tz="US/Pacific"),
+    ],
+)
+def test_timedelta_timestamp_scalar(scalar):
+    # GH#59944
+    result = to_numeric(scalar)
+    expected = to_numeric(Series(scalar))[0]
+    assert result == expected
+
+
+def test_period(request, transform_assert_equal):
+    transform, assert_equal = transform_assert_equal
+
+    idx = pd.period_range("2011-01", periods=3, freq="M", name="")
+    inp = transform(idx)
+
+    if not isinstance(inp, Index):
+        request.applymarker(
+            pytest.mark.xfail(reason="Missing PeriodDtype support in to_numeric")
+        )
+    result = to_numeric(inp)
+    expected = transform(idx.asi8)
+    assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "errors,expected",
+    [
+        ("raise", "Invalid object type at position 0"),
+        ("coerce", Series([np.nan, 1.0, np.nan])),
+    ],
+)
+def test_non_hashable(errors, expected):
+    # see gh-13324
+    ser = Series([[10.0, 2], 1.0, "apple"])
+
+    if isinstance(expected, str):
+        with pytest.raises(TypeError, match=expected):
+            to_numeric(ser, errors=errors)
+    else:
+        result = to_numeric(ser, errors=errors)
+        tm.assert_series_equal(result, expected)
+
+
+def test_downcast_invalid_cast():
+    # see gh-13352
+    data = ["1", 2, 3]
+    invalid_downcast = "unsigned-integer"
+    msg = "invalid downcasting method provided"
+
+    with pytest.raises(ValueError, match=msg):
+        to_numeric(data, downcast=invalid_downcast)
+
+
+def test_errors_invalid_value():
+    # see gh-26466
+    data = ["1", 2, 3]
+    invalid_error_value = "invalid"
+    msg = "invalid error value specified"
+
+    with pytest.raises(ValueError, match=msg):
+        to_numeric(data, errors=invalid_error_value)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        ["1", 2, 3],
+        [1, 2, 3],
+        np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
+    ],
+)
+@pytest.mark.parametrize(
+    "kwargs,exp_dtype",
+    [
+        # Basic function tests.
+        ({}, np.int64),
+        ({"downcast": None}, np.int64),
+        # Support below np.float32 is rare and far between.
+        ({"downcast": "float"}, np.dtype(np.float32).char),
+        # Basic dtype support.
+        ({"downcast": "unsigned"}, np.dtype(np.typecodes["UnsignedInteger"][0])),
+    ],
+)
+def test_downcast_basic(data, kwargs, exp_dtype):
+    # see gh-13352
+    result = to_numeric(data, **kwargs)
+    expected = np.array([1, 2, 3], dtype=exp_dtype)
+    tm.assert_numpy_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("signed_downcast", ["integer", "signed"])
+@pytest.mark.parametrize(
+    "data",
+    [
+        ["1", 2, 3],
+        [1, 2, 3],
+        np.array(["1970-01-02", "1970-01-03", "1970-01-04"], dtype="datetime64[D]"),
+    ],
+)
+def test_signed_downcast(data, signed_downcast):
+    # see gh-13352
+    smallest_int_dtype = np.dtype(np.typecodes["Integer"][0])
+    expected = np.array([1, 2, 3], dtype=smallest_int_dtype)
+
+    res = to_numeric(data, downcast=signed_downcast)
+    tm.assert_numpy_array_equal(res, expected)
+
+
+def test_ignore_downcast_neg_to_unsigned():
+    # Cannot cast to an unsigned integer
+    # because we have a negative number.
+    data = ["-1", 2, 3]
+    expected = np.array([-1, 2, 3], dtype=np.int64)
+
+    res = to_numeric(data, downcast="unsigned")
+    tm.assert_numpy_array_equal(res, expected)
+
+
+# Warning in 32 bit platforms
+@pytest.mark.parametrize("downcast", ["integer", "signed", "unsigned"])
+@pytest.mark.parametrize(
+    "data,expected",
+    [
+        (["1.1", 2, 3], np.array([1.1, 2, 3], dtype=np.float64)),
+        (
+            [10000.0, 20000, 3000, 40000.36, 50000, 50000.00],
+            np.array(
+                [10000.0, 20000, 3000, 40000.36, 50000, 50000.00], dtype=np.float64
+            ),
+        ),
+    ],
+)
+def test_ignore_downcast_cannot_convert_float(data, expected, downcast):
+    # Cannot cast to an integer (signed or unsigned)
+    # because we have a float number.
+    res = to_numeric(data, downcast=downcast)
+    tm.assert_numpy_array_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "downcast,expected_dtype",
+    [("integer", np.int16), ("signed", np.int16), ("unsigned", np.uint16)],
+)
+def test_downcast_not8bit(downcast, expected_dtype):
+    # the smallest integer dtype need not be np.(u)int8
+    data = ["256", 257, 258]
+
+    expected = np.array([256, 257, 258], dtype=expected_dtype)
+    res = to_numeric(data, downcast=downcast)
+    tm.assert_numpy_array_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype,downcast,min_max",
+    [
+        ("int8", "integer", [iinfo(np.int8).min, iinfo(np.int8).max]),
+        ("int16", "integer", [iinfo(np.int16).min, iinfo(np.int16).max]),
+        ("int32", "integer", [iinfo(np.int32).min, iinfo(np.int32).max]),
+        ("int64", "integer", [iinfo(np.int64).min, iinfo(np.int64).max]),
+        ("uint8", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max]),
+        ("uint16", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max]),
+        ("uint32", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max]),
+        ("uint64", "unsigned", [iinfo(np.uint64).min, iinfo(np.uint64).max]),
+        ("int16", "integer", [iinfo(np.int8).min, iinfo(np.int8).max + 1]),
+        ("int32", "integer", [iinfo(np.int16).min, iinfo(np.int16).max + 1]),
+        ("int64", "integer", [iinfo(np.int32).min, iinfo(np.int32).max + 1]),
+        ("int16", "integer", [iinfo(np.int8).min - 1, iinfo(np.int16).max]),
+        ("int32", "integer", [iinfo(np.int16).min - 1, iinfo(np.int32).max]),
+        ("int64", "integer", [iinfo(np.int32).min - 1, iinfo(np.int64).max]),
+        ("uint16", "unsigned", [iinfo(np.uint8).min, iinfo(np.uint8).max + 1]),
+        ("uint32", "unsigned", [iinfo(np.uint16).min, iinfo(np.uint16).max + 1]),
+        ("uint64", "unsigned", [iinfo(np.uint32).min, iinfo(np.uint32).max + 1]),
+    ],
+)
+def test_downcast_limits(dtype, downcast, min_max):
+    # see gh-14404: test the limits of each downcast.
+    series = to_numeric(Series(min_max), downcast=downcast)
+    assert series.dtype == dtype
+
+
+def test_downcast_float64_to_float32():
+    # GH-43693: Check float64 preservation when >= 16,777,217
+    series = Series([16777217.0, np.finfo(np.float64).max, np.nan], dtype=np.float64)
+    result = to_numeric(series, downcast="float")
+
+    assert series.dtype == result.dtype
+
+
+def test_downcast_uint64():
+    # see gh-14422:
+    # BUG: to_numeric doesn't work uint64 numbers
+    ser = Series([0, 9223372036854775808])
+    result = to_numeric(ser, downcast="unsigned")
+    expected = Series([0, 9223372036854775808], dtype=np.uint64)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data,exp_data",
+    [
+        (
+            [200, 300, "", "NaN", 30000000000000000000],
+            [200, 300, np.nan, np.nan, 30000000000000000000],
+        ),
+        (
+            ["12345678901234567890", "1234567890", "ITEM"],
+            [12345678901234567890, 1234567890, np.nan],
+        ),
+    ],
+)
+def test_coerce_uint64_conflict(data, exp_data):
+    # see gh-17007 and gh-17125
+    #
+    # Still returns float despite the uint64-nan conflict,
+    # which would normally force the casting to object.
+    result = to_numeric(Series(data), errors="coerce")
+    expected = Series(exp_data, dtype=float)
+    tm.assert_series_equal(result, expected)
+
+
+def test_non_coerce_uint64_conflict():
+    # see gh-17007 and gh-17125
+    #
+    # For completeness.
+    ser = Series(["12345678901234567890", "1234567890", "ITEM"])
+
+    with pytest.raises(ValueError, match="Unable to parse string"):
+        to_numeric(ser, errors="raise")
+
+
+@pytest.mark.parametrize("dc1", ["integer", "float", "unsigned"])
+@pytest.mark.parametrize("dc2", ["integer", "float", "unsigned"])
+def test_downcast_empty(dc1, dc2):
+    # GH32493
+
+    tm.assert_numpy_array_equal(
+        to_numeric([], downcast=dc1),
+        to_numeric([], downcast=dc2),
+        check_dtype=False,
+    )
+
+
+def test_failure_to_convert_uint64_string_to_NaN():
+    # GH 32394
+    result = to_numeric("uint64", errors="coerce")
+    assert np.isnan(result)
+
+    ser = Series([32, 64, np.nan])
+    result = to_numeric(Series(["32", "64", "uint64"]), errors="coerce")
+    tm.assert_series_equal(result, ser)
+
+
+@pytest.mark.parametrize(
+    "strrep",
+    [
+        "243.164",
+        "245.968",
+        "249.585",
+        "259.745",
+        "265.742",
+        "272.567",
+        "279.196",
+        "280.366",
+        "275.034",
+        "271.351",
+        "272.889",
+        "270.627",
+        "280.828",
+        "290.383",
+        "308.153",
+        "319.945",
+        "336.0",
+        "344.09",
+        "351.385",
+        "356.178",
+        "359.82",
+        "361.03",
+        "367.701",
+        "380.812",
+        "387.98",
+        "391.749",
+        "391.171",
+        "385.97",
+        "385.345",
+        "386.121",
+        "390.996",
+        "399.734",
+        "413.073",
+        "421.532",
+        "430.221",
+        "437.092",
+        "439.746",
+        "446.01",
+        "451.191",
+        "460.463",
+        "469.779",
+        "472.025",
+        "479.49",
+        "474.864",
+        "467.54",
+        "471.978",
+    ],
+)
+def test_precision_float_conversion(strrep):
+    # GH 31364
+    result = to_numeric(strrep)
+
+    assert result == float(strrep)
+
+
+@pytest.mark.parametrize(
+    "values, expected",
+    [
+        (["1", "2", None], Series([1, 2, pd.NA], dtype="Int64")),
+        (["1", "2", "3"], Series([1, 2, 3], dtype="Int64")),
+        (["1", "2", 3], Series([1, 2, 3], dtype="Int64")),
+        (["1", "2", 3.5], Series([1, 2, 3.5], dtype="Float64")),
+        (["1", None, 3.5], Series([1, pd.NA, 3.5], dtype="Float64")),
+        (["1", "2", "3.5"], Series([1, 2, 3.5], dtype="Float64")),
+    ],
+)
+def test_to_numeric_from_nullable_string(values, nullable_string_dtype, expected):
+    # https://github.com/pandas-dev/pandas/issues/37262
+    s = Series(values, dtype=nullable_string_dtype)
+    result = to_numeric(s)
+    tm.assert_series_equal(result, expected)
+
+
+def test_to_numeric_from_nullable_string_coerce(nullable_string_dtype):
+    # GH#52146
+    values = ["a", "1"]
+    ser = Series(values, dtype=nullable_string_dtype)
+    result = to_numeric(ser, errors="coerce")
+    expected = Series([pd.NA, 1], dtype="Int64")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "data, input_dtype, downcast, expected_dtype",
+    (
+        ([1, 1], "Int64", "integer", "Int8"),
+        ([1.0, pd.NA], "Float64", "integer", "Int8"),
+        ([1.0, 1.1], "Float64", "integer", "Float64"),
+        ([1, pd.NA], "Int64", "integer", "Int8"),
+        ([450, 300], "Int64", "integer", "Int16"),
+        ([1, 1], "Float64", "integer", "Int8"),
+        ([np.iinfo(np.int64).max - 1, 1], "Int64", "integer", "Int64"),
+        ([1, 1], "Int64", "signed", "Int8"),
+        ([1.0, 1.0], "Float32", "signed", "Int8"),
+        ([1.0, 1.1], "Float64", "signed", "Float64"),
+        ([1, pd.NA], "Int64", "signed", "Int8"),
+        ([450, -300], "Int64", "signed", "Int16"),
+        ([np.iinfo(np.uint64).max - 1, 1], "UInt64", "signed", "UInt64"),
+        ([1, 1], "Int64", "unsigned", "UInt8"),
+        ([1.0, 1.0], "Float32", "unsigned", "UInt8"),
+        ([1.0, 1.1], "Float64", "unsigned", "Float64"),
+        ([1, pd.NA], "Int64", "unsigned", "UInt8"),
+        ([450, -300], "Int64", "unsigned", "Int64"),
+        ([-1, -1], "Int32", "unsigned", "Int32"),
+        ([1, 1], "Float64", "float", "Float32"),
+        ([1, 1.1], "Float64", "float", "Float32"),
+        ([1, 1], "Float32", "float", "Float32"),
+        ([1, 1.1], "Float32", "float", "Float32"),
+    ),
+)
+def test_downcast_nullable_numeric(data, input_dtype, downcast, expected_dtype):
+    arr = pd.array(data, dtype=input_dtype)
+    result = to_numeric(arr, downcast=downcast)
+    expected = pd.array(data, dtype=expected_dtype)
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_downcast_nullable_mask_is_copied():
+    # GH38974
+
+    arr = pd.array([1, 2, pd.NA], dtype="Int64")
+
+    result = to_numeric(arr, downcast="integer")
+    expected = pd.array([1, 2, pd.NA], dtype="Int8")
+    tm.assert_extension_array_equal(result, expected)
+
+    arr[1] = pd.NA  # should not modify result
+    tm.assert_extension_array_equal(result, expected)
+
+
+def test_to_numeric_scientific_notation():
+    # GH 15898
+    result = to_numeric("1.7e+308")
+    expected = np.float64(1.7e308)
+    assert result == expected
+
+
+@pytest.mark.parametrize("val", [9876543210.0, 2.0**128])
+def test_to_numeric_large_float_not_downcast_to_float_32(val):
+    # GH 19729
+    expected = Series([val])
+    result = to_numeric(expected, downcast="float")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val, dtype", [(1, "Int64"), (1.5, "Float64"), (True, "boolean")]
+)
+def test_to_numeric_dtype_backend(val, dtype):
+    # GH#50505
+    ser = Series([val], dtype=object)
+    result = to_numeric(ser, dtype_backend="numpy_nullable")
+    expected = Series([val], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val, dtype",
+    [
+        (1, "Int64"),
+        (1.5, "Float64"),
+        (True, "boolean"),
+        (1, "int64[pyarrow]"),
+        (1.5, "float64[pyarrow]"),
+        (True, "bool[pyarrow]"),
+    ],
+)
+def test_to_numeric_dtype_backend_na(val, dtype):
+    # GH#50505
+    if "pyarrow" in dtype:
+        pytest.importorskip("pyarrow")
+        dtype_backend = "pyarrow"
+    else:
+        dtype_backend = "numpy_nullable"
+    ser = Series([val, None], dtype=object)
+    result = to_numeric(ser, dtype_backend=dtype_backend)
+    expected = Series([val, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "val, dtype, downcast",
+    [
+        (1, "Int8", "integer"),
+        (1.5, "Float32", "float"),
+        (1, "Int8", "signed"),
+        (1, "int8[pyarrow]", "integer"),
+        (1.5, "float[pyarrow]", "float"),
+        (1, "int8[pyarrow]", "signed"),
+    ],
+)
+def test_to_numeric_dtype_backend_downcasting(val, dtype, downcast):
+    # GH#50505
+    if "pyarrow" in dtype:
+        pytest.importorskip("pyarrow")
+        dtype_backend = "pyarrow"
+    else:
+        dtype_backend = "numpy_nullable"
+    ser = Series([val, None], dtype=object)
+    result = to_numeric(ser, dtype_backend=dtype_backend, downcast=downcast)
+    expected = Series([val, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "smaller, dtype_backend",
+    [["UInt8", "numpy_nullable"], ["uint8[pyarrow]", "pyarrow"]],
+)
+def test_to_numeric_dtype_backend_downcasting_uint(smaller, dtype_backend):
+    # GH#50505
+    if dtype_backend == "pyarrow":
+        pytest.importorskip("pyarrow")
+    ser = Series([1, pd.NA], dtype="UInt64")
+    result = to_numeric(ser, dtype_backend=dtype_backend, downcast="unsigned")
+    expected = Series([1, pd.NA], dtype=smaller)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        "Int64",
+        "UInt64",
+        "Float64",
+        "boolean",
+        "int64[pyarrow]",
+        "uint64[pyarrow]",
+        "float64[pyarrow]",
+        "bool[pyarrow]",
+    ],
+)
+def test_to_numeric_dtype_backend_already_nullable(dtype):
+    # GH#50505
+    if "pyarrow" in dtype:
+        pytest.importorskip("pyarrow")
+    ser = Series([1, pd.NA], dtype=dtype)
+    result = to_numeric(ser, dtype_backend="numpy_nullable")
+    expected = Series([1, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_to_numeric_dtype_backend_error(dtype_backend):
+    # GH#50505
+    ser = Series(["a", "b", ""])
+    expected = ser.copy()
+    with pytest.raises(ValueError, match="Unable to parse string"):
+        to_numeric(ser, dtype_backend=dtype_backend)
+
+    result = to_numeric(ser, dtype_backend=dtype_backend, errors="coerce")
+    if dtype_backend == "pyarrow":
+        dtype = "double[pyarrow]"
+    else:
+        dtype = "Float64"
+    expected = Series([pd.NA, pd.NA, pd.NA], dtype=dtype)
+    tm.assert_series_equal(result, expected)
+
+
+def test_invalid_dtype_backend():
+    ser = Series([1, 2, 3])
+    msg = (
+        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
+        "'pyarrow' are allowed."
+    )
+    with pytest.raises(ValueError, match=msg):
+        to_numeric(ser, dtype_backend="numpy")
+
+
+def test_coerce_pyarrow_backend():
+    # GH 52588
+    pa = pytest.importorskip("pyarrow")
+    ser = Series(list("12x"), dtype=ArrowDtype(pa.string()))
+    result = to_numeric(ser, errors="coerce", dtype_backend="pyarrow")
+    expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64()))
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/tools/test_to_time.py b/pandas/tests/tools/test_to_time.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4f48c9e0721e51dbc4c003e61f145d28f5ccae6
--- /dev/null
+++ b/pandas/tests/tools/test_to_time.py
@@ -0,0 +1,64 @@
+from datetime import time
+import locale
+
+import numpy as np
+import pytest
+
+from pandas import Series
+import pandas._testing as tm
+from pandas.core.tools.times import to_time
+
+# The tests marked with this are locale-dependent.
+# They pass, except when the machine locale is zh_CN or it_IT.
+fails_on_non_english = pytest.mark.xfail(
+    locale.getlocale()[0] in ("zh_CN", "it_IT"),
+    reason="fail on a CI build with LC_ALL=zh_CN.utf8/it_IT.utf8",
+    strict=False,
+)
+
+
+class TestToTime:
+    @pytest.mark.parametrize(
+        "time_string",
+        [
+            "14:15",
+            "1415",
+            pytest.param("2:15pm", marks=fails_on_non_english),
+            pytest.param("0215pm", marks=fails_on_non_english),
+            "14:15:00",
+            "141500",
+            pytest.param("2:15:00pm", marks=fails_on_non_english),
+            pytest.param("021500pm", marks=fails_on_non_english),
+            time(14, 15),
+        ],
+    )
+    def test_parsers_time(self, time_string):
+        # GH#11818
+        assert to_time(time_string) == time(14, 15)
+
+    def test_odd_format(self):
+        new_string = "14.15"
+        assert to_time(new_string, format="%H.%M") == time(14, 15)
+
+    def test_arraylike(self):
+        arg = ["14:15", "20:20"]
+        expected_arr = [time(14, 15), time(20, 20)]
+        assert to_time(arg) == expected_arr
+        assert to_time(arg, format="%H:%M") == expected_arr
+        assert to_time(arg, infer_time_format=True) == expected_arr
+        assert to_time(arg, format="%I:%M%p", errors="coerce") == [None, None]
+
+        with pytest.raises(ValueError, match="errors must be"):
+            to_time(arg, format="%I:%M%p", errors="ignore")
+
+        msg = "Cannot convert.+to a time with given format"
+        with pytest.raises(ValueError, match=msg):
+            to_time(arg, format="%I:%M%p", errors="raise")
+
+        tm.assert_series_equal(
+            to_time(Series(arg, name="test")), Series(expected_arr, name="test")
+        )
+
+        res = to_time(np.array(arg))
+        assert isinstance(res, list)
+        assert res == expected_arr
diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..25c89401c6b3426ab6a866ee454d2b242e370f46
--- /dev/null
+++ b/pandas/tests/tools/test_to_timedelta.py
@@ -0,0 +1,377 @@
+from datetime import (
+    time,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    IS64,
+    WASM,
+)
+from pandas.errors import (
+    OutOfBoundsTimedelta,
+    Pandas4Warning,
+)
+
+import pandas as pd
+from pandas import (
+    Series,
+    TimedeltaIndex,
+    isna,
+    to_timedelta,
+)
+import pandas._testing as tm
+from pandas.core.arrays import TimedeltaArray
+
+
+class TestTimedeltas:
+    def test_to_timedelta_mixed_unit_strings(self):
+        # https://github.com/pandas-dev/pandas/pull/63196#issuecomment-3595743721
+        result = to_timedelta(["1 days 06:05:01.00003", "15.5us"])
+
+        expected = TimedeltaIndex([108_301_000_030_000, 15_500], dtype="m8[ns]")
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timedelta_all_nat_unit(self):
+        # With all-NaT entries, we get "s" unit
+        result = to_timedelta([None])
+        assert result.unit == "s"
+
+        result = TimedeltaIndex([None])
+        assert result.unit == "s"
+
+    def test_to_timedelta_month_raises(self):
+        obj = np.timedelta64(1, "M")
+
+        msg = "Unit M is not supported."
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta(obj)
+        with pytest.raises(ValueError, match=msg):
+            pd.Timedelta(obj)
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta([obj])
+        with pytest.raises(ValueError, match=msg):
+            TimedeltaIndex([obj])
+
+    def test_to_timedelta_none(self):
+        # GH#23055
+        assert to_timedelta(None) is pd.NaT
+
+    def test_to_timedelta_dt64_raises(self):
+        # Passing datetime64-dtype data to TimedeltaIndex is no longer
+        #  supported GH#29794
+        msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]"
+
+        ser = Series([pd.NaT], dtype="M8[ns]")
+        with pytest.raises(TypeError, match=msg):
+            to_timedelta(ser)
+        with pytest.raises(TypeError, match=msg):
+            ser.to_frame().apply(to_timedelta)
+
+    def test_to_timedelta_readonly(self, writable):
+        # GH#34857
+        arr = np.array([], dtype=object)
+        arr.setflags(write=writable)
+        result = to_timedelta(arr)
+        expected = to_timedelta([])
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timedelta_null(self):
+        result = to_timedelta(["", ""])
+        assert isna(result).all()
+
+    def test_to_timedelta_same_np_timedelta64(self):
+        # pass thru
+        result = to_timedelta(np.array([np.timedelta64(1, "s")]))
+        expected = pd.Index(np.array([np.timedelta64(1, "s")]))
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timedelta_series(self):
+        # Series
+        expected = Series(
+            [timedelta(days=1), timedelta(days=1, seconds=1)], dtype="m8[us]"
+        )
+
+        msg = "'d' is deprecated and will be removed in a future version."
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            result = to_timedelta(Series(["1d", "1days 00:00:01"]))
+        tm.assert_series_equal(result, expected)
+
+    def test_to_timedelta_units(self):
+        # with units
+        result = TimedeltaIndex(
+            [np.timedelta64(0, "ns"), np.timedelta64(10, "s").astype("m8[ns]")]
+        )
+        expected = to_timedelta([0, 10], unit="s").as_unit("ns")
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timedelta_mixed_dtype(self):
+        # https://github.com/pandas-dev/pandas/issues/64044
+        result = to_timedelta(np.array([0.5, 2]), unit="m")
+        expected = TimedeltaIndex(
+            ["0 days 00:00:30", "0 days 00:02:00"], dtype="timedelta64[ns]", freq=None
+        )
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype, unit",
+        [
+            ["int64", "s"],
+            ["int64", "m"],
+            ["int64", "h"],
+            ["timedelta64[s]", "s"],
+            ["timedelta64[D]", "D"],
+        ],
+    )
+    def test_to_timedelta_units_dtypes(self, dtype, unit):
+        # arrays of various dtypes
+        arr = np.array([1] * 5, dtype=dtype)
+        result = to_timedelta(arr, unit=unit)
+        exp_dtype = "m8[s]"
+        expected = TimedeltaIndex([np.timedelta64(1, unit)] * 5, dtype=exp_dtype)
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timedelta_oob_non_nano(self):
+        arr = np.array([pd.NaT._value + 1], dtype="timedelta64[m]")
+
+        msg = (
+            "Cannot convert -9223372036854775807 minutes to "
+            r"timedelta64\[s\] without overflow"
+        )
+        with pytest.raises(OutOfBoundsTimedelta, match=msg):
+            to_timedelta(arr)
+
+        with pytest.raises(OutOfBoundsTimedelta, match=msg):
+            TimedeltaIndex(arr)
+
+        with pytest.raises(OutOfBoundsTimedelta, match=msg):
+            TimedeltaArray._from_sequence(arr, dtype="m8[s]")
+
+    @pytest.mark.parametrize("box", [lambda x: x, pd.DataFrame])
+    @pytest.mark.parametrize("errors", ["raise", "coerce"])
+    def test_to_timedelta_dataframe(self, box, errors):
+        # GH 11776
+        arg = box(np.arange(10).reshape(2, 5))
+        with pytest.raises(TypeError, match="1-d array"):
+            to_timedelta(arg, errors=errors)
+
+    def test_to_timedelta_invalid_errors(self):
+        # bad value for errors parameter
+        msg = "errors must be one of"
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta(["foo"], errors="never")
+
+    @pytest.mark.parametrize("arg", [[1, 2], 1])
+    def test_to_timedelta_invalid_unit(self, arg):
+        # these will error
+        msg = "invalid unit abbreviation: foo"
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta(arg, unit="foo")
+
+    def test_to_timedelta_time(self):
+        # time not supported ATM
+        msg = (
+            "Value must be Timedelta, string, integer, float, timedelta or convertible"
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta(time(second=1))
+        assert to_timedelta(time(second=1), errors="coerce") is pd.NaT
+
+    def test_to_timedelta_bad_value(self):
+        msg = "Could not convert 'foo' to NumPy timedelta"
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta(["foo", "bar"])
+
+    def test_to_timedelta_bad_value_coerce(self):
+        tm.assert_index_equal(
+            TimedeltaIndex([pd.NaT, pd.NaT]),
+            to_timedelta(["foo", "bar"], errors="coerce"),
+        )
+
+        tm.assert_index_equal(
+            TimedeltaIndex(["1 day", pd.NaT, "1 min"]),
+            to_timedelta(["1 day", "bar", "1 min"], errors="coerce"),
+        )
+
+    @pytest.mark.parametrize(
+        "val, errors",
+        [
+            ("1M", True),
+            ("1 M", True),
+            ("1Y", True),
+            ("1 Y", True),
+            ("1y", True),
+            ("1 y", True),
+            ("1m", False),
+            ("1 m", False),
+            ("1 day", False),
+            ("2day", False),
+        ],
+    )
+    def test_unambiguous_timedelta_values(self, val, errors):
+        # GH36666 Deprecate use of strings denoting units with 'M', 'Y', 'm' or 'y'
+        # in pd.to_timedelta
+        msg = "Units 'M', 'Y' and 'y' do not represent unambiguous timedelta"
+        if errors:
+            with pytest.raises(ValueError, match=msg):
+                to_timedelta(val)
+        else:
+            # check it doesn't raise
+            to_timedelta(val)
+
+    def test_to_timedelta_via_apply(self):
+        # GH 5458
+        expected = Series([np.timedelta64(1, "s")], dtype="m8[us]")
+        result = Series(["00:00:01"]).apply(to_timedelta)
+        tm.assert_series_equal(result, expected)
+
+        result = Series([to_timedelta("00:00:01")])
+        tm.assert_series_equal(result, expected)
+
+    def test_to_timedelta_inference_without_warning(self):
+        # GH#41731 inference produces a warning in the Series constructor,
+        #  but _not_ in to_timedelta
+        vals = ["00:00:01", pd.NaT]
+        with tm.assert_produces_warning(None):
+            result = to_timedelta(vals)
+
+        expected = TimedeltaIndex([pd.Timedelta(seconds=1), pd.NaT], dtype="m8[us]")
+        tm.assert_index_equal(result, expected)
+
+    def test_to_timedelta_on_missing_values(self):
+        # GH5438
+        timedelta_NaT = np.timedelta64("NaT")
+
+        actual = to_timedelta(Series(["00:00:01", np.nan]))
+        expected = Series(
+            [np.timedelta64(1000000000, "ns"), timedelta_NaT],
+            dtype=f"{tm.ENDIAN}m8[us]",
+        )
+        tm.assert_series_equal(actual, expected)
+
+        ser = Series(["00:00:01", pd.NaT], dtype="m8[us]")
+        actual = to_timedelta(ser)
+        tm.assert_series_equal(actual, expected)
+
+    @pytest.mark.parametrize("val", [np.nan, pd.NaT, pd.NA])
+    def test_to_timedelta_on_missing_values_scalar(self, val):
+        actual = to_timedelta(val)
+        assert actual._value == np.timedelta64("NaT").astype("int64")
+
+    @pytest.mark.parametrize("val", [np.nan, pd.NaT, pd.NA])
+    def test_to_timedelta_on_missing_values_list(self, val):
+        actual = to_timedelta([val])
+        assert actual[0]._value == np.timedelta64("NaT").astype("int64")
+
+    @pytest.mark.skipif(WASM, reason="No fp exception support in WASM")
+    @pytest.mark.xfail(not IS64, reason="Floating point error")
+    def test_to_timedelta_float(self):
+        # https://github.com/pandas-dev/pandas/issues/25077
+        arr = np.arange(0, 1, 1e-6)[-10:]
+        result = to_timedelta(arr, unit="s")
+        expected_asi8 = np.arange(999990000, 10**9, 1000, dtype="int64")
+        tm.assert_numpy_array_equal(result.asi8, expected_asi8)
+
+    def test_to_timedelta_coerce_strings_unit(self):
+        arr = np.array([1, 2, "error"], dtype=object)
+        result = to_timedelta(arr, unit="ns", errors="coerce")
+        expected = to_timedelta([1, 2, pd.NaT], unit="ns")
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "expected_val, result_val", [[timedelta(days=2), 2], [None, None]]
+    )
+    def test_to_timedelta_nullable_int64_dtype(self, expected_val, result_val):
+        # GH 35574
+        expected = Series([timedelta(days=1), expected_val], dtype="m8[s]")
+        result = to_timedelta(Series([1, result_val], dtype="Int64"), unit="days")
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        ("input", "expected"),
+        [
+            ("8:53:08.71800000001", "8:53:08.718"),
+            ("8:53:08.718001", "8:53:08.718001"),
+            ("8:53:08.7180000001", "8:53:08.7180000001"),
+            ("-8:53:08.71800000001", "-8:53:08.718"),
+            ("8:53:08.7180000089", "8:53:08.718000008"),
+        ],
+    )
+    @pytest.mark.parametrize("func", [pd.Timedelta, to_timedelta])
+    def test_to_timedelta_precision_over_nanos(self, input, expected, func):
+        # GH: 36738
+        expected = pd.Timedelta(expected)
+        result = func(input)
+        assert result == expected
+
+    def test_to_timedelta_zerodim(self, fixed_now_ts):
+        # ndarray.item() incorrectly returns int for dt64[ns] and td64[ns]
+        dt64 = fixed_now_ts.to_datetime64()
+        arg = np.array(dt64)
+
+        msg = (
+            "Value must be Timedelta, string, integer, float, timedelta "
+            "or convertible, not datetime64"
+        )
+        with pytest.raises(ValueError, match=msg):
+            to_timedelta(arg)
+
+        arg2 = arg.view("m8[ns]")
+        result = to_timedelta(arg2)
+        assert isinstance(result, pd.Timedelta)
+        assert result._value == dt64.view("i8")
+
+    def test_to_timedelta_numeric_ea(self, any_numeric_ea_dtype):
+        # GH#48796
+        ser = Series([1, pd.NA], dtype=any_numeric_ea_dtype)
+        result = to_timedelta(ser)
+        expected = Series([pd.Timedelta(1, unit="ns"), pd.NaT])
+        tm.assert_series_equal(result, expected)
+
+    def test_to_timedelta_fraction(self):
+        result = to_timedelta(1.0 / 3, unit="h")
+        expected = pd.Timedelta("0 days 00:19:59.999999998")
+        assert result == expected
+
+    def test_to_timedelta_unit_round_floats(self):
+        # When the float is round, we give the requested unit
+        #  (or nearest-supported) like we do with integers
+        arr = np.array([45.0], dtype=object)
+        result = to_timedelta(arr, unit="s")
+        expected = to_timedelta([45], unit="s")
+        tm.assert_index_equal(result, expected)
+
+        arr2 = arr.astype(np.float64)
+        result2 = to_timedelta(arr2, unit="s")
+        tm.assert_index_equal(result2, expected)
+
+    def test_to_timedelta_unit_non_round_floats(self):
+        # With non-round floats, we have to give nanosecond
+        arr = np.array([45.5], dtype=object)
+        result = to_timedelta(arr, unit="s")
+        assert result.unit == "ns"
+
+        arr2 = arr.astype(np.float64)
+        result2 = to_timedelta(arr2, unit="s")
+        assert result2.unit == "ns"
+
+
+def test_from_numeric_arrow_dtype(any_numeric_ea_dtype):
+    # GH 52425
+    pytest.importorskip("pyarrow")
+    ser = Series([1, 2], dtype=f"{any_numeric_ea_dtype.lower()}[pyarrow]")
+    result = to_timedelta(ser)
+    expected = Series([1, 2], dtype="timedelta64[ns]")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("unit", ["ns", "ms"])
+def test_from_timedelta_arrow_dtype(unit):
+    # GH 54298
+    pytest.importorskip("pyarrow")
+    expected = Series([timedelta(1)], dtype=f"duration[{unit}][pyarrow]")
+    result = to_timedelta(expected)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/tseries/__init__.py b/pandas/tests/tseries/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/tslibs/__init__.py b/pandas/tests/tslibs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py
new file mode 100644
index 0000000000000000000000000000000000000000..aeb9d3387c91a84baa83148021503b445b4793ce
--- /dev/null
+++ b/pandas/tests/tslibs/test_ccalendar.py
@@ -0,0 +1,64 @@
+from datetime import (
+    date,
+    datetime,
+)
+
+from hypothesis import given
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import ccalendar
+
+from pandas._testing._hypothesis import DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ
+
+
+@pytest.mark.parametrize(
+    "date_tuple,expected",
+    [
+        ((2001, 3, 1), 60),
+        ((2004, 3, 1), 61),
+        ((1907, 12, 31), 365),  # End-of-year, non-leap year.
+        ((2004, 12, 31), 366),  # End-of-year, leap year.
+    ],
+)
+def test_get_day_of_year_numeric(date_tuple, expected):
+    assert ccalendar.get_day_of_year(*date_tuple) == expected
+
+
+def test_get_day_of_year_dt():
+    dt = datetime.fromordinal(1 + np.random.default_rng(2).integers(365 * 4000))
+    result = ccalendar.get_day_of_year(dt.year, dt.month, dt.day)
+
+    expected = (dt - dt.replace(month=1, day=1)).days + 1
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "input_date_tuple, expected_iso_tuple",
+    [
+        [(2020, 1, 1), (2020, 1, 3)],
+        [(2019, 12, 31), (2020, 1, 2)],
+        [(2019, 12, 30), (2020, 1, 1)],
+        [(2009, 12, 31), (2009, 53, 4)],
+        [(2010, 1, 1), (2009, 53, 5)],
+        [(2010, 1, 3), (2009, 53, 7)],
+        [(2010, 1, 4), (2010, 1, 1)],
+        [(2006, 1, 1), (2005, 52, 7)],
+        [(2005, 12, 31), (2005, 52, 6)],
+        [(2008, 12, 28), (2008, 52, 7)],
+        [(2008, 12, 29), (2009, 1, 1)],
+    ],
+)
+def test_dt_correct_iso_8601_year_week_and_day(input_date_tuple, expected_iso_tuple):
+    result = ccalendar.get_iso_calendar(*input_date_tuple)
+    expected_from_date_isocalendar = date(*input_date_tuple).isocalendar()
+    assert result == expected_from_date_isocalendar
+    assert result == expected_iso_tuple
+
+
+@pytest.mark.slow
+@given(DATETIME_IN_PD_TIMESTAMP_RANGE_NO_TZ)
+def test_isocalendar(dt):
+    expected = dt.isocalendar()
+    result = ccalendar.get_iso_calendar(dt.year, dt.month, dt.day)
+    assert result == expected
diff --git a/pandas/tests/tslibs/test_np_datetime.py b/pandas/tests/tslibs/test_np_datetime.py
new file mode 100644
index 0000000000000000000000000000000000000000..02edf1a09387766d71097ea0baedc2640cfb824b
--- /dev/null
+++ b/pandas/tests/tslibs/test_np_datetime.py
@@ -0,0 +1,222 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
+from pandas._libs.tslibs.np_datetime import (
+    OutOfBoundsDatetime,
+    OutOfBoundsTimedelta,
+    astype_overflowsafe,
+    is_unitless,
+    py_get_unit_from_dtype,
+    py_td64_to_tdstruct,
+)
+
+import pandas._testing as tm
+
+
+def test_is_unitless():
+    dtype = np.dtype("M8[ns]")
+    assert not is_unitless(dtype)
+
+    dtype = np.dtype("datetime64")
+    assert is_unitless(dtype)
+
+    dtype = np.dtype("m8[ns]")
+    assert not is_unitless(dtype)
+
+    dtype = np.dtype("timedelta64")
+    assert is_unitless(dtype)
+
+    msg = "dtype must be datetime64 or timedelta64"
+    with pytest.raises(ValueError, match=msg):
+        is_unitless(np.dtype(np.int64))
+
+    msg = "Argument 'dtype' has incorrect type"
+    with pytest.raises(TypeError, match=msg):
+        is_unitless("foo")
+
+
+def test_get_unit_from_dtype():
+    # datetime64
+    assert py_get_unit_from_dtype(np.dtype("M8[Y]")) == NpyDatetimeUnit.NPY_FR_Y.value
+    assert py_get_unit_from_dtype(np.dtype("M8[M]")) == NpyDatetimeUnit.NPY_FR_M.value
+    assert py_get_unit_from_dtype(np.dtype("M8[W]")) == NpyDatetimeUnit.NPY_FR_W.value
+    # B has been deprecated and removed -> no 3
+    assert py_get_unit_from_dtype(np.dtype("M8[D]")) == NpyDatetimeUnit.NPY_FR_D.value
+    assert py_get_unit_from_dtype(np.dtype("M8[h]")) == NpyDatetimeUnit.NPY_FR_h.value
+    assert py_get_unit_from_dtype(np.dtype("M8[m]")) == NpyDatetimeUnit.NPY_FR_m.value
+    assert py_get_unit_from_dtype(np.dtype("M8[s]")) == NpyDatetimeUnit.NPY_FR_s.value
+    assert py_get_unit_from_dtype(np.dtype("M8[ms]")) == NpyDatetimeUnit.NPY_FR_ms.value
+    assert py_get_unit_from_dtype(np.dtype("M8[us]")) == NpyDatetimeUnit.NPY_FR_us.value
+    assert py_get_unit_from_dtype(np.dtype("M8[ns]")) == NpyDatetimeUnit.NPY_FR_ns.value
+    assert py_get_unit_from_dtype(np.dtype("M8[ps]")) == NpyDatetimeUnit.NPY_FR_ps.value
+    assert py_get_unit_from_dtype(np.dtype("M8[fs]")) == NpyDatetimeUnit.NPY_FR_fs.value
+    assert py_get_unit_from_dtype(np.dtype("M8[as]")) == NpyDatetimeUnit.NPY_FR_as.value
+
+    # timedelta64
+    assert py_get_unit_from_dtype(np.dtype("m8[Y]")) == NpyDatetimeUnit.NPY_FR_Y.value
+    assert py_get_unit_from_dtype(np.dtype("m8[M]")) == NpyDatetimeUnit.NPY_FR_M.value
+    assert py_get_unit_from_dtype(np.dtype("m8[W]")) == NpyDatetimeUnit.NPY_FR_W.value
+    # B has been deprecated and removed -> no 3
+    assert py_get_unit_from_dtype(np.dtype("m8[D]")) == NpyDatetimeUnit.NPY_FR_D.value
+    assert py_get_unit_from_dtype(np.dtype("m8[h]")) == NpyDatetimeUnit.NPY_FR_h.value
+    assert py_get_unit_from_dtype(np.dtype("m8[m]")) == NpyDatetimeUnit.NPY_FR_m.value
+    assert py_get_unit_from_dtype(np.dtype("m8[s]")) == NpyDatetimeUnit.NPY_FR_s.value
+    assert py_get_unit_from_dtype(np.dtype("m8[ms]")) == NpyDatetimeUnit.NPY_FR_ms.value
+    assert py_get_unit_from_dtype(np.dtype("m8[us]")) == NpyDatetimeUnit.NPY_FR_us.value
+    assert py_get_unit_from_dtype(np.dtype("m8[ns]")) == NpyDatetimeUnit.NPY_FR_ns.value
+    assert py_get_unit_from_dtype(np.dtype("m8[ps]")) == NpyDatetimeUnit.NPY_FR_ps.value
+    assert py_get_unit_from_dtype(np.dtype("m8[fs]")) == NpyDatetimeUnit.NPY_FR_fs.value
+    assert py_get_unit_from_dtype(np.dtype("m8[as]")) == NpyDatetimeUnit.NPY_FR_as.value
+
+
+def test_td64_to_tdstruct():
+    val = 12454636234  # arbitrary value
+
+    res1 = py_td64_to_tdstruct(val, NpyDatetimeUnit.NPY_FR_ns.value)
+    exp1 = {
+        "days": 0,
+        "hrs": 0,
+        "min": 0,
+        "sec": 12,
+        "ms": 454,
+        "us": 636,
+        "ns": 234,
+        "seconds": 12,
+        "microseconds": 454636,
+        "nanoseconds": 234,
+    }
+    assert res1 == exp1
+
+    res2 = py_td64_to_tdstruct(val, NpyDatetimeUnit.NPY_FR_us.value)
+    exp2 = {
+        "days": 0,
+        "hrs": 3,
+        "min": 27,
+        "sec": 34,
+        "ms": 636,
+        "us": 234,
+        "ns": 0,
+        "seconds": 12454,
+        "microseconds": 636234,
+        "nanoseconds": 0,
+    }
+    assert res2 == exp2
+
+    res3 = py_td64_to_tdstruct(val, NpyDatetimeUnit.NPY_FR_ms.value)
+    exp3 = {
+        "days": 144,
+        "hrs": 3,
+        "min": 37,
+        "sec": 16,
+        "ms": 234,
+        "us": 0,
+        "ns": 0,
+        "seconds": 13036,
+        "microseconds": 234000,
+        "nanoseconds": 0,
+    }
+    assert res3 == exp3
+
+    # Note this out of bounds for nanosecond Timedelta
+    res4 = py_td64_to_tdstruct(val, NpyDatetimeUnit.NPY_FR_s.value)
+    exp4 = {
+        "days": 144150,
+        "hrs": 21,
+        "min": 10,
+        "sec": 34,
+        "ms": 0,
+        "us": 0,
+        "ns": 0,
+        "seconds": 76234,
+        "microseconds": 0,
+        "nanoseconds": 0,
+    }
+    assert res4 == exp4
+
+
+class TestAstypeOverflowSafe:
+    def test_pass_non_dt64_array(self):
+        # check that we raise, not segfault
+        arr = np.arange(5)
+        dtype = np.dtype("M8[ns]")
+
+        msg = (
+            "astype_overflowsafe values.dtype and dtype must be either "
+            "both-datetime64 or both-timedelta64"
+        )
+        with pytest.raises(TypeError, match=msg):
+            astype_overflowsafe(arr, dtype, copy=True)
+
+        with pytest.raises(TypeError, match=msg):
+            astype_overflowsafe(arr, dtype, copy=False)
+
+    def test_pass_non_dt64_dtype(self):
+        # check that we raise, not segfault
+        arr = np.arange(5, dtype="i8").view("M8[D]")
+        dtype = np.dtype("m8[ns]")
+
+        msg = (
+            "astype_overflowsafe values.dtype and dtype must be either "
+            "both-datetime64 or both-timedelta64"
+        )
+        with pytest.raises(TypeError, match=msg):
+            astype_overflowsafe(arr, dtype, copy=True)
+
+        with pytest.raises(TypeError, match=msg):
+            astype_overflowsafe(arr, dtype, copy=False)
+
+    def test_astype_overflowsafe_dt64(self):
+        dtype = np.dtype("M8[ns]")
+
+        dt = np.datetime64("2262-04-05", "D")
+        arr = dt + np.arange(10, dtype="m8[D]")
+
+        # arr.astype silently overflows, so this
+        wrong = arr.astype(dtype)
+        roundtrip = wrong.astype(arr.dtype)
+        assert not (wrong == roundtrip).all()
+
+        msg = "Out of bounds nanosecond timestamp"
+        with pytest.raises(OutOfBoundsDatetime, match=msg):
+            astype_overflowsafe(arr, dtype)
+
+        # But converting to microseconds is fine, and we match numpy's results.
+        dtype2 = np.dtype("M8[us]")
+        result = astype_overflowsafe(arr, dtype2)
+        expected = arr.astype(dtype2)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_astype_overflowsafe_td64(self):
+        dtype = np.dtype("m8[ns]")
+
+        dt = np.datetime64("2262-04-05", "D")
+        arr = dt + np.arange(10, dtype="m8[D]")
+        arr = arr.view("m8[D]")
+
+        # arr.astype silently overflows, so this
+        wrong = arr.astype(dtype)
+        roundtrip = wrong.astype(arr.dtype)
+        assert not (wrong == roundtrip).all()
+
+        msg = r"Cannot convert 106752 days to timedelta64\[ns\] without overflow"
+        with pytest.raises(OutOfBoundsTimedelta, match=msg):
+            astype_overflowsafe(arr, dtype)
+
+        # But converting to microseconds is fine, and we match numpy's results.
+        dtype2 = np.dtype("m8[us]")
+        result = astype_overflowsafe(arr, dtype2)
+        expected = arr.astype(dtype2)
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_astype_overflowsafe_disallow_rounding(self):
+        arr = np.array([-1500, 1500], dtype="M8[ns]")
+        dtype = np.dtype("M8[us]")
+
+        msg = "Cannot losslessly cast '-1500 ns' to us"
+        with pytest.raises(ValueError, match=msg):
+            astype_overflowsafe(arr, dtype, round_ok=False)
+
+        result = astype_overflowsafe(arr, dtype, round_ok=True)
+        expected = arr.astype(dtype)
+        tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/tslibs/test_npy_units.py b/pandas/tests/tslibs/test_npy_units.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d05dc79fbb2cf52688547b672365802463ce6f2
--- /dev/null
+++ b/pandas/tests/tslibs/test_npy_units.py
@@ -0,0 +1,27 @@
+import numpy as np
+
+from pandas._libs.tslibs.dtypes import abbrev_to_npy_unit
+from pandas._libs.tslibs.vectorized import is_date_array_normalized
+
+# a datetime64 ndarray which *is* normalized
+day_arr = np.arange(10, dtype="i8").view("M8[D]")
+
+
+class TestIsDateArrayNormalized:
+    def test_is_date_array_normalized_day(self):
+        arr = day_arr
+        abbrev = "D"
+        unit = abbrev_to_npy_unit(abbrev)
+        result = is_date_array_normalized(arr.view("i8"), None, unit)
+        assert result is True
+
+    def test_is_date_array_normalized_seconds(self):
+        abbrev = "s"
+        arr = day_arr.astype(f"M8[{abbrev}]")
+        unit = abbrev_to_npy_unit(abbrev)
+        result = is_date_array_normalized(arr.view("i8"), None, unit)
+        assert result is True
+
+        arr[0] += np.timedelta64(1, abbrev)
+        result2 = is_date_array_normalized(arr.view("i8"), None, unit)
+        assert result2 is False
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d4e2e8ddb2349459ffd738cb99ceb7c5cd341ad
--- /dev/null
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -0,0 +1,426 @@
+"""
+Tests for Timestamp parsing, aimed at pandas/_libs/tslibs/parsing.pyx
+"""
+
+from datetime import datetime
+import re
+
+from dateutil.parser import parse as du_parse
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import (
+    parsing,
+    strptime,
+)
+from pandas._libs.tslibs.parsing import parse_datetime_string_with_reso
+from pandas.compat import (
+    ISMUSL,
+    WASM,
+    is_platform_windows,
+)
+import pandas.util._test_decorators as td
+
+# Usually we wouldn't want this import in this test file (which is targeted at
+#  tslibs.parsing), but it is convenient to test the Timestamp constructor at
+#  the same time as the other parsing functions.
+from pandas import (
+    Timestamp,
+    option_context,
+)
+import pandas._testing as tm
+
+
+@pytest.mark.skipif(WASM, reason="tzset is not available on WASM")
+@pytest.mark.skipif(
+    is_platform_windows() or ISMUSL,
+    reason="TZ setting incorrect on Windows and MUSL Linux",
+)
+def test_parsing_tzlocal_deprecated():
+    # GH#50791
+    msg = "|".join(
+        [
+            r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
+            r"is no longer supported\. "
+            "Pass the 'tz' keyword or call tz_localize after construction instead",
+            ".*included an un-recognized timezone",
+        ]
+    )
+    dtstr = "Jan 15 2004 03:00 EST"
+
+    with tm.set_timezone("US/Eastern"):
+        with pytest.raises(ValueError, match=msg):
+            parse_datetime_string_with_reso(dtstr)
+
+        with pytest.raises(ValueError, match=msg):
+            parsing.py_parse_datetime_string(dtstr)
+
+        with pytest.raises(ValueError, match=msg):
+            Timestamp(dtstr)
+
+
+def test_parse_datetime_string_with_reso():
+    (parsed, reso) = parse_datetime_string_with_reso("4Q1984")
+    (parsed_lower, reso_lower) = parse_datetime_string_with_reso("4q1984")
+
+    assert reso == reso_lower
+    assert parsed == parsed_lower
+
+
+def test_parse_datetime_string_with_reso_nanosecond_reso():
+    # GH#46811
+    parsed, reso = parse_datetime_string_with_reso("2022-04-20 09:19:19.123456789")
+    assert reso == "nanosecond"
+
+
+def test_parse_datetime_string_with_reso_invalid_type():
+    # Raise on invalid input, don't just return it
+    msg = "Argument 'date_string' has incorrect type (expected str, got tuple)"
+    with pytest.raises(TypeError, match=re.escape(msg)):
+        parse_datetime_string_with_reso((4, 5))
+
+
+@pytest.mark.parametrize(
+    "dashed,normal", [("1988-Q2", "1988Q2"), ("2Q-1988", "2Q1988")]
+)
+def test_parse_time_quarter_with_dash(dashed, normal):
+    # see gh-9688
+    (parsed_dash, reso_dash) = parse_datetime_string_with_reso(dashed)
+    (parsed, reso) = parse_datetime_string_with_reso(normal)
+
+    assert parsed_dash == parsed
+    assert reso_dash == reso
+
+
+@pytest.mark.parametrize("dashed", ["-2Q1992", "2-Q1992", "4-4Q1992"])
+def test_parse_time_quarter_with_dash_error(dashed):
+    msg = f"Unknown datetime string format, unable to parse: {dashed}"
+
+    with pytest.raises(parsing.DateParseError, match=msg):
+        parse_datetime_string_with_reso(dashed)
+
+
+@pytest.mark.parametrize(
+    "date_string,expected",
+    [
+        ("123.1234", False),
+        ("-50000", False),
+        ("999", False),
+        ("m", False),
+        ("T", False),
+        ("Mon Sep 16, 2013", True),
+        ("2012-01-01", True),
+        ("01/01/2012", True),
+        ("01012012", True),
+        ("0101", True),
+        ("1-1", True),
+    ],
+)
+def test_does_not_convert_mixed_integer(date_string, expected):
+    assert parsing._does_string_look_like_datetime(date_string) is expected
+
+
+@pytest.mark.parametrize(
+    "date_str,kwargs,msg",
+    [
+        (
+            "2013Q5",
+            {},
+            (
+                "Incorrect quarterly string is given, "
+                "quarter must be between 1 and 4: 2013Q5"
+            ),
+        ),
+        # see gh-5418
+        (
+            "2013Q1",
+            {"freq": "INVLD-L-DEC-SAT"},
+            ("Unable to retrieve month information from given freq: INVLD-L-DEC-SAT"),
+        ),
+    ],
+)
+def test_parsers_quarterly_with_freq_error(date_str, kwargs, msg):
+    with pytest.raises(parsing.DateParseError, match=msg):
+        parsing.parse_datetime_string_with_reso(date_str, **kwargs)
+
+
+@pytest.mark.parametrize(
+    "date_str,freq,expected",
+    [
+        ("2013Q2", None, datetime(2013, 4, 1)),
+        ("2013Q2", "Y-APR", datetime(2012, 8, 1)),
+        ("2013-Q2", "Y-DEC", datetime(2013, 4, 1)),
+    ],
+)
+def test_parsers_quarterly_with_freq(date_str, freq, expected):
+    result, _ = parsing.parse_datetime_string_with_reso(date_str, freq=freq)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "date_str", ["2Q 2005", "2Q-200Y", "2Q-200", "22Q2005", "2Q200.", "6Q-20"]
+)
+def test_parsers_quarter_invalid(date_str):
+    if date_str == "6Q-20":
+        msg = (
+            "Incorrect quarterly string is given, quarter "
+            f"must be between 1 and 4: {date_str}"
+        )
+    else:
+        msg = f"Unknown datetime string format, unable to parse: {date_str}"
+
+    with pytest.raises(ValueError, match=msg):
+        parsing.parse_datetime_string_with_reso(date_str)
+
+
+@pytest.mark.parametrize(
+    "date_str,expected",
+    [("201101", datetime(2011, 1, 1, 0, 0)), ("200005", datetime(2000, 5, 1, 0, 0))],
+)
+def test_parsers_month_freq(date_str, expected):
+    result, _ = parsing.parse_datetime_string_with_reso(date_str, freq="ME")
+    assert result == expected
+
+
+@td.skip_if_not_us_locale
+@pytest.mark.parametrize(
+    "string,fmt",
+    [
+        ("20111230", "%Y%m%d"),
+        ("201112300000", "%Y%m%d%H%M"),
+        ("20111230000000", "%Y%m%d%H%M%S"),
+        ("20111230T00", "%Y%m%dT%H"),
+        ("20111230T0000", "%Y%m%dT%H%M"),
+        ("20111230T000000", "%Y%m%dT%H%M%S"),
+        ("2011-12-30", "%Y-%m-%d"),
+        ("2011", "%Y"),
+        ("2011-01", "%Y-%m"),
+        ("30-12-2011", "%d-%m-%Y"),
+        ("2011-12-30 00:00:00", "%Y-%m-%d %H:%M:%S"),
+        ("2011-12-30T00:00:00", "%Y-%m-%dT%H:%M:%S"),
+        ("2011-12-30T00:00:00UTC", "%Y-%m-%dT%H:%M:%S%Z"),
+        ("2011-12-30T00:00:00Z", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+9", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+090", None),
+        ("2011-12-30T00:00:00+0900", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00-0900", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09:00", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09:000", None),
+        ("2011-12-30T00:00:00+9:0", "%Y-%m-%dT%H:%M:%S%z"),
+        ("2011-12-30T00:00:00+09:", None),
+        ("2011-12-30T00:00:00.000000UTC", "%Y-%m-%dT%H:%M:%S.%f%Z"),
+        ("2011-12-30T00:00:00.000000Z", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+9", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+090", None),
+        ("2011-12-30T00:00:00.000000+0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000-0900", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09:00", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09:000", None),
+        ("2011-12-30T00:00:00.000000+9:0", "%Y-%m-%dT%H:%M:%S.%f%z"),
+        ("2011-12-30T00:00:00.000000+09:", None),
+        ("2011-12-30 00:00:00.000000", "%Y-%m-%d %H:%M:%S.%f"),
+        ("Tue 24 Aug 2021 01:30:48", "%a %d %b %Y %H:%M:%S"),
+        ("Tuesday 24 Aug 2021 01:30:48", "%A %d %b %Y %H:%M:%S"),
+        ("Tue 24 Aug 2021 01:30:48 AM", "%a %d %b %Y %I:%M:%S %p"),
+        ("Tuesday 24 Aug 2021 01:30:48 AM", "%A %d %b %Y %I:%M:%S %p"),
+        ("27.03.2003 14:55:00.000", "%d.%m.%Y %H:%M:%S.%f"),  # GH50317
+        ("2023-11-09T20:23:46Z", "%Y-%m-%dT%H:%M:%S%z"),  # GH57452
+    ],
+)
+def test_guess_datetime_format_with_parseable_formats(string, fmt):
+    with tm.maybe_produces_warning(
+        UserWarning, fmt is not None and re.search(r"%d.*%m", fmt)
+    ):
+        result = parsing.guess_datetime_format(string)
+    assert result == fmt
+
+
+@pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")])
+def test_guess_datetime_format_with_dayfirst(dayfirst, expected):
+    ambiguous_string = "01/01/2011"
+    result = parsing.guess_datetime_format(ambiguous_string, dayfirst=dayfirst)
+    assert result == expected
+
+
+@td.skip_if_not_us_locale
+@pytest.mark.parametrize(
+    "string,fmt",
+    [
+        ("30/Dec/2011", "%d/%b/%Y"),
+        ("30/December/2011", "%d/%B/%Y"),
+        ("30/Dec/2011 00:00:00", "%d/%b/%Y %H:%M:%S"),
+    ],
+)
+def test_guess_datetime_format_with_locale_specific_formats(string, fmt):
+    result = parsing.guess_datetime_format(string)
+    assert result == fmt
+
+
+@pytest.mark.parametrize(
+    "invalid_dt",
+    [
+        "01/2013",
+        "12:00:00",
+        "1/1/1/1",
+        "this_is_not_a_datetime",
+        "51a",
+        "13/2019",
+        "202001",  # YYYYMM isn't ISO8601
+        "2020/01",  # YYYY/MM isn't ISO8601 either
+        "87156549591102612381000001219H5",
+    ],
+)
+def test_guess_datetime_format_invalid_inputs(invalid_dt):
+    # A datetime string must include a year, month and a day for it to be
+    # guessable, in addition to being a string that looks like a datetime.
+    assert parsing.guess_datetime_format(invalid_dt) is None
+
+
+@pytest.mark.parametrize("invalid_type_dt", [9, datetime(2011, 1, 1)])
+def test_guess_datetime_format_wrong_type_inputs(invalid_type_dt):
+    # A datetime string must include a year, month and a day for it to be
+    # guessable, in addition to being a string that looks like a datetime.
+    with pytest.raises(
+        TypeError,
+        match=r"^Argument 'dt_str' has incorrect type \(expected str, got .*\)$",
+    ):
+        parsing.guess_datetime_format(invalid_type_dt)
+
+
+@pytest.mark.parametrize(
+    "string,fmt,dayfirst,warning",
+    [
+        ("2011-1-1", "%Y-%m-%d", False, None),
+        ("2011-1-1", "%Y-%d-%m", True, None),
+        ("1/1/2011", "%m/%d/%Y", False, None),
+        ("1/1/2011", "%d/%m/%Y", True, None),
+        ("30-1-2011", "%d-%m-%Y", False, UserWarning),
+        ("30-1-2011", "%d-%m-%Y", True, None),
+        ("2011-1-1 0:0:0", "%Y-%m-%d %H:%M:%S", False, None),
+        ("2011-1-1 0:0:0", "%Y-%d-%m %H:%M:%S", True, None),
+        ("2011-1-3T00:00:0", "%Y-%m-%dT%H:%M:%S", False, None),
+        ("2011-1-3T00:00:0", "%Y-%d-%mT%H:%M:%S", True, None),
+        ("2011-1-1 00:00:00", "%Y-%m-%d %H:%M:%S", False, None),
+        ("2011-1-1 00:00:00", "%Y-%d-%m %H:%M:%S", True, None),
+    ],
+)
+def test_guess_datetime_format_no_padding(string, fmt, dayfirst, warning):
+    # see gh-11142
+    msg = (
+        rf"Parsing dates in {fmt} format when dayfirst=False \(the default\) "
+        "was specified. "
+        "Pass `dayfirst=True` or specify a format to silence this warning."
+    )
+    with tm.assert_produces_warning(warning, match=msg):
+        result = parsing.guess_datetime_format(string, dayfirst=dayfirst)
+    assert result == fmt
+
+
+def test_try_parse_dates():
+    arr = np.array(["5/1/2000", "6/1/2000", "7/1/2000"], dtype=object)
+    result = parsing.try_parse_dates(arr, parser=lambda x: du_parse(x, dayfirst=True))
+
+    expected = np.array([du_parse(d, dayfirst=True) for d in arr])
+    tm.assert_numpy_array_equal(result, expected)
+
+
+def test_parse_datetime_string_with_reso_check_instance_type_raise_exception():
+    # issue 20684
+    msg = "Argument 'date_string' has incorrect type (expected str, got tuple)"
+    with pytest.raises(TypeError, match=re.escape(msg)):
+        parse_datetime_string_with_reso((1, 2, 3))
+
+    result = parse_datetime_string_with_reso("2019")
+    expected = (datetime(2019, 1, 1), "year")
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "fmt,expected",
+    [
+        ("%Y %m %d %H:%M:%S", True),
+        ("%Y/%m/%d %H:%M:%S", True),
+        (r"%Y\%m\%d %H:%M:%S", True),
+        ("%Y-%m-%d %H:%M:%S", True),
+        ("%Y.%m.%d %H:%M:%S", True),
+        ("%Y%m%d %H:%M:%S", True),
+        ("%Y-%m-%dT%H:%M:%S", True),
+        ("%Y-%m-%dT%H:%M:%S%z", True),
+        ("%Y-%m-%dT%H:%M:%S%Z", False),
+        ("%Y-%m-%dT%H:%M:%S.%f", True),
+        ("%Y-%m-%dT%H:%M:%S.%f%z", True),
+        ("%Y-%m-%dT%H:%M:%S.%f%Z", False),
+        ("%Y%m%d", True),
+        ("%Y%m", False),
+        ("%Y", True),
+        ("%Y-%m-%d", True),
+        ("%Y-%m", True),
+    ],
+)
+def test_is_iso_format(fmt, expected):
+    # see gh-41047
+    result = strptime._test_format_is_iso(fmt)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "input",
+    [
+        "2018-01-01T00:00:00.123456789",
+        "2018-01-01T00:00:00.123456",
+        "2018-01-01T00:00:00.123",
+    ],
+)
+def test_guess_datetime_format_f(input):
+    # https://github.com/pandas-dev/pandas/issues/49043
+    result = parsing.guess_datetime_format(input)
+    expected = "%Y-%m-%dT%H:%M:%S.%f"
+    assert result == expected
+
+
+def _helper_hypothesis_delimited_date(call, date_string, **kwargs):
+    msg, result = None, None
+    try:
+        result = call(date_string, **kwargs)
+    except ValueError as err:
+        msg = str(err)
+    return msg, result
+
+
+@pytest.mark.parametrize("input", ["21-01-01", "01-01-21"])
+@pytest.mark.parametrize("dayfirst", [True, False])
+def test_parse_datetime_string_with_reso_dayfirst(dayfirst, input):
+    with option_context("display.date_dayfirst", dayfirst):
+        except_out_dateutil, result = _helper_hypothesis_delimited_date(
+            parsing.parse_datetime_string_with_reso, input
+        )
+
+        except_in_dateutil, expected = _helper_hypothesis_delimited_date(
+            du_parse,
+            input,
+            default=datetime(1, 1, 1),
+            dayfirst=dayfirst,
+            yearfirst=False,
+        )
+        assert except_out_dateutil == except_in_dateutil
+        assert result[0] == expected
+
+
+@pytest.mark.parametrize("input", ["21-01-01", "01-01-21"])
+@pytest.mark.parametrize("yearfirst", [True, False])
+def test_parse_datetime_string_with_reso_yearfirst(yearfirst, input):
+    with option_context("display.date_yearfirst", yearfirst):
+        except_out_dateutil, result = _helper_hypothesis_delimited_date(
+            parsing.parse_datetime_string_with_reso, input
+        )
+        except_in_dateutil, expected = _helper_hypothesis_delimited_date(
+            du_parse,
+            input,
+            default=datetime(1, 1, 1),
+            dayfirst=False,
+            yearfirst=yearfirst,
+        )
+        assert except_out_dateutil == except_in_dateutil
+        assert result[0] == expected
diff --git a/pandas/tests/tslibs/test_period.py b/pandas/tests/tslibs/test_period.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c17caabae327adf3af24a9b13c7c5da7d576cd4
--- /dev/null
+++ b/pandas/tests/tslibs/test_period.py
@@ -0,0 +1,123 @@
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import (
+    iNaT,
+    to_offset,
+)
+from pandas._libs.tslibs.period import (
+    extract_ordinals,
+    get_period_field_arr,
+    period_asfreq,
+    period_ordinal,
+)
+
+import pandas._testing as tm
+
+
+def get_freq_code(freqstr: str) -> int:
+    off = to_offset(freqstr, is_period=True)
+    # error: "BaseOffset" has no attribute "_period_dtype_code"
+    code = off._period_dtype_code  # type: ignore[attr-defined]
+    return code
+
+
+@pytest.mark.parametrize(
+    "freq1,freq2,expected",
+    [
+        ("D", "h", 24),
+        ("D", "min", 1440),
+        ("D", "s", 86400),
+        ("D", "ms", 86400000),
+        ("D", "us", 86400000000),
+        ("D", "ns", 86400000000000),
+        ("h", "min", 60),
+        ("h", "s", 3600),
+        ("h", "ms", 3600000),
+        ("h", "us", 3600000000),
+        ("h", "ns", 3600000000000),
+        ("min", "s", 60),
+        ("min", "ms", 60000),
+        ("min", "us", 60000000),
+        ("min", "ns", 60000000000),
+        ("s", "ms", 1000),
+        ("s", "us", 1000000),
+        ("s", "ns", 1000000000),
+        ("ms", "us", 1000),
+        ("ms", "ns", 1000000),
+        ("us", "ns", 1000),
+    ],
+)
+def test_intra_day_conversion_factors(freq1, freq2, expected):
+    assert (
+        period_asfreq(1, get_freq_code(freq1), get_freq_code(freq2), False) == expected
+    )
+
+
+@pytest.mark.parametrize(
+    "freq,expected", [("Y", 0), ("M", 0), ("W", 1), ("D", 0), ("B", 0)]
+)
+def test_period_ordinal_start_values(freq, expected):
+    # information for Jan. 1, 1970.
+    assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq_code(freq)) == expected
+
+
+@pytest.mark.parametrize(
+    "dt,expected",
+    [
+        ((1970, 1, 4, 0, 0, 0, 0, 0), 1),
+        ((1970, 1, 5, 0, 0, 0, 0, 0), 2),
+        ((2013, 10, 6, 0, 0, 0, 0, 0), 2284),
+        ((2013, 10, 7, 0, 0, 0, 0, 0), 2285),
+    ],
+)
+def test_period_ordinal_week(dt, expected):
+    args = (*dt, get_freq_code("W"))
+    assert period_ordinal(*args) == expected
+
+
+@pytest.mark.parametrize(
+    "day,expected",
+    [
+        # Thursday (Oct. 3, 2013).
+        (3, 11415),
+        # Friday (Oct. 4, 2013).
+        (4, 11416),
+        # Saturday (Oct. 5, 2013).
+        (5, 11417),
+        # Sunday (Oct. 6, 2013).
+        (6, 11417),
+        # Monday (Oct. 7, 2013).
+        (7, 11417),
+        # Tuesday (Oct. 8, 2013).
+        (8, 11418),
+    ],
+)
+def test_period_ordinal_business_day(day, expected):
+    # 5000 is PeriodDtypeCode for BusinessDay
+    args = (2013, 10, day, 0, 0, 0, 0, 0, 5000)
+    assert period_ordinal(*args) == expected
+
+
+class TestExtractOrdinals:
+    def test_extract_ordinals_raises(self):
+        # with non-object, make sure we raise TypeError, not segfault
+        arr = np.arange(5)
+        freq = to_offset("D")
+        with pytest.raises(TypeError, match="values must be object-dtype"):
+            extract_ordinals(arr, freq)
+
+    def test_extract_ordinals_2d(self):
+        freq = to_offset("D")
+        arr = np.empty(10, dtype=object)
+        arr[:] = iNaT
+
+        res = extract_ordinals(arr, freq)
+        res2 = extract_ordinals(arr.reshape(5, 2), freq)
+        tm.assert_numpy_array_equal(res, res2.reshape(-1))
+
+
+def test_get_period_field_array_raises_on_out_of_range():
+    msg = "Buffer dtype mismatch, expected 'const int64_t' but got 'double'"
+    with pytest.raises(ValueError, match=msg):
+        get_period_field_arr(-1, np.empty(1), 0)
diff --git a/pandas/tests/tslibs/test_resolution.py b/pandas/tests/tslibs/test_resolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..59004d2cabdeee68b479c125a6cc050259c41b5d
--- /dev/null
+++ b/pandas/tests/tslibs/test_resolution.py
@@ -0,0 +1,56 @@
+import datetime
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs import (
+    Resolution,
+    get_resolution,
+)
+from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
+
+
+def test_get_resolution_nano():
+    # don't return the fallback RESO_DAY
+    arr = np.array([1], dtype=np.int64)
+    res = get_resolution(arr)
+    assert res == Resolution.RESO_NS
+
+
+def test_get_resolution_non_nano_data():
+    arr = np.array([1], dtype=np.int64)
+    res = get_resolution(arr, None, NpyDatetimeUnit.NPY_FR_us.value)
+    assert res == Resolution.RESO_US
+
+    res = get_resolution(arr, datetime.UTC, NpyDatetimeUnit.NPY_FR_us.value)
+    assert res == Resolution.RESO_US
+
+
+@pytest.mark.parametrize(
+    "freqstr,expected",
+    [
+        ("Y", "year"),
+        ("Q", "quarter"),
+        ("M", "month"),
+        ("D", "day"),
+        ("h", "hour"),
+        ("min", "minute"),
+        ("s", "second"),
+        ("ms", "millisecond"),
+        ("us", "microsecond"),
+        ("ns", "nanosecond"),
+    ],
+)
+def test_get_attrname_from_abbrev(freqstr, expected):
+    reso = Resolution.get_reso_from_freqstr(freqstr)
+    assert reso.attr_abbrev == freqstr
+    assert reso.attrname == expected
+
+
+@pytest.mark.parametrize("freq", ["H", "S"])
+def test_unit_H_S_raises(freq):
+    # GH#59143
+    msg = f"Invalid frequency: {freq}"
+
+    with pytest.raises(ValueError, match=msg):
+        Resolution.get_reso_from_freqstr(freq)
diff --git a/pandas/tests/tslibs/test_strptime.py b/pandas/tests/tslibs/test_strptime.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63d3dbd9f5c7410f8be929d58047e4a3c8bf9ed
--- /dev/null
+++ b/pandas/tests/tslibs/test_strptime.py
@@ -0,0 +1,111 @@
+from datetime import (
+    datetime,
+    timezone,
+)
+
+import numpy as np
+import pytest
+
+from pandas._libs.tslibs.dtypes import NpyDatetimeUnit
+from pandas._libs.tslibs.strptime import array_strptime
+
+from pandas import (
+    NaT,
+    Timestamp,
+)
+import pandas._testing as tm
+
+creso_infer = NpyDatetimeUnit.NPY_FR_GENERIC.value
+
+
+class TestArrayStrptimeResolutionInference:
+    def test_array_strptime_resolution_all_nat(self):
+        arr = np.array([NaT, np.nan], dtype=object)
+
+        fmt = "%Y-%m-%d %H:%M:%S"
+        res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
+        assert res.dtype == "M8[s]"
+
+        res, _ = array_strptime(arr, fmt=fmt, utc=True, creso=creso_infer)
+        assert res.dtype == "M8[s]"
+
+    @pytest.mark.parametrize("tz", [None, timezone.utc])
+    def test_array_strptime_resolution_inference_homogeneous_strings(self, tz):
+        dt = datetime(2016, 1, 2, 3, 4, 5, 678900, tzinfo=tz)
+        dt0 = dt.replace(microsecond=0)
+
+        fmt = "%Y-%m-%d %H:%M:%S"
+        dtstr = dt.strftime(fmt)
+        arr = np.array([dtstr] * 3, dtype=object)
+        expected = np.array([dt0.replace(tzinfo=None)] * 3, dtype="M8[us]")
+
+        res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
+        tm.assert_numpy_array_equal(res, expected)
+
+        fmt = "%Y-%m-%d %H:%M:%S.%f"
+        dtstr = dt.strftime(fmt)
+        arr = np.array([dtstr] * 3, dtype=object)
+        expected = np.array([dt.replace(tzinfo=None)] * 3, dtype="M8[us]")
+
+        res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
+        tm.assert_numpy_array_equal(res, expected)
+
+        fmt = "ISO8601"
+        res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
+        tm.assert_numpy_array_equal(res, expected)
+
+    @pytest.mark.parametrize("tz", [None, timezone.utc])
+    def test_array_strptime_resolution_mixed(self, tz):
+        dt = datetime(2016, 1, 2, 3, 4, 5, 678900, tzinfo=tz)
+
+        ts = Timestamp(dt).as_unit("ns")
+
+        arr = np.array([dt, ts], dtype=object)
+        expected = np.array(
+            [Timestamp(dt).as_unit("ns").asm8, ts.asm8],
+            dtype="M8[ns]",
+        )
+
+        fmt = "%Y-%m-%d %H:%M:%S"
+        res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
+        tm.assert_numpy_array_equal(res, expected)
+
+        fmt = "ISO8601"
+        res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer)
+        tm.assert_numpy_array_equal(res, expected)
+
+    def test_array_strptime_resolution_todaynow(self):
+        # specifically case where today/now is the *first* item
+        vals = np.array(["today", np.datetime64("2017-01-01", "us")], dtype=object)
+
+        now = Timestamp("now").asm8
+        res, _ = array_strptime(vals, fmt="%Y-%m-%d", utc=False, creso=creso_infer)
+        res2, _ = array_strptime(
+            vals[::-1], fmt="%Y-%m-%d", utc=False, creso=creso_infer
+        )
+
+        # 1s is an arbitrary cutoff for call overhead; in local testing the
+        #  actual difference is about 250us
+        tolerance = np.timedelta64(1, "s")
+
+        assert res.dtype == "M8[us]"
+        assert abs(res[0] - now) < tolerance
+        assert res[1] == vals[1]
+
+        assert res2.dtype == "M8[us]"
+        assert abs(res2[1] - now) < tolerance * 2
+        assert res2[0] == vals[1]
+
+    def test_array_strptime_str_outside_nano_range(self):
+        vals = np.array(["2401-09-15"], dtype=object)
+        expected = np.array(["2401-09-15"], dtype="M8[us]")
+        fmt = "ISO8601"
+        res, _ = array_strptime(vals, fmt=fmt, creso=creso_infer)
+        tm.assert_numpy_array_equal(res, expected)
+
+        # non-iso -> different path
+        vals2 = np.array(["Sep 15, 2401"], dtype=object)
+        expected2 = np.array(["2401-09-15"], dtype="M8[us]")
+        fmt2 = "%b %d, %Y"
+        res2, _ = array_strptime(vals2, fmt=fmt2, creso=creso_infer)
+        tm.assert_numpy_array_equal(res2, expected2)
diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py
new file mode 100644
index 0000000000000000000000000000000000000000..c48986c597356f9e8f1070771fdc671e42380c0e
--- /dev/null
+++ b/pandas/tests/tslibs/test_timezones.py
@@ -0,0 +1,193 @@
+from datetime import (
+    datetime,
+    timedelta,
+    timezone,
+)
+import subprocess
+import sys
+import textwrap
+
+import dateutil.tz
+import pytest
+
+from pandas._libs.tslibs import (
+    conversion,
+    timezones,
+)
+from pandas.compat import is_platform_windows
+
+from pandas import Timestamp
+
+
+@pytest.mark.single_cpu
+def test_no_timezone_data():
+    # https://github.com/pandas-dev/pandas/pull/63335
+    # Test error message when timezone data is not available.
+    msg = "'No time zone found with key Europe/Brussels'"
+    code = textwrap.dedent(
+        f"""\
+        import sys, zoneinfo, pandas as pd
+        sys.modules['tzdata'] = None
+        zoneinfo.reset_tzpath(['/path/to/nowhere'])
+        try:
+            pd.to_datetime('2012-01-01').tz_localize('Europe/Brussels')
+        except zoneinfo.ZoneInfoNotFoundError as err:
+            assert str(err) == "{msg}"
+        """
+    )
+    subprocess.check_call([sys.executable, "-c", code])
+
+
+def test_is_utc(utc_fixture):
+    tz = timezones.maybe_get_tz(utc_fixture)
+    assert timezones.is_utc(tz)
+
+
+def test_cache_keys_are_distinct_for_pytz_vs_dateutil():
+    pytz = pytest.importorskip("pytz")
+    for tz_name in pytz.common_timezones:
+        tz_p = timezones.maybe_get_tz(tz_name)
+        tz_d = timezones.maybe_get_tz("dateutil/" + tz_name)
+
+    if tz_d is None:
+        pytest.skip(tz_name + ": dateutil does not know about this one")
+
+    if not (tz_name == "UTC" and is_platform_windows()):
+        # they both end up as tzwin("UTC") on windows
+        assert timezones._p_tz_cache_key(tz_p) != timezones._p_tz_cache_key(tz_d)
+
+
+def test_tzlocal_repr():
+    # see gh-13583
+    ts = Timestamp("2011-01-01", tz=dateutil.tz.tzlocal())
+    assert ts.tz == dateutil.tz.tzlocal()
+    assert "tz='tzlocal()')" in repr(ts)
+
+
+def test_tzlocal_maybe_get_tz():
+    # see gh-13583
+    tz = timezones.maybe_get_tz("tzlocal()")
+    assert tz == dateutil.tz.tzlocal()
+
+
+def test_tzlocal_offset():
+    # see gh-13583
+    #
+    # Get offset using normal datetime for test.
+    ts = Timestamp("2011-01-01", tz=dateutil.tz.tzlocal()).as_unit("s")
+
+    offset = dateutil.tz.tzlocal().utcoffset(datetime(2011, 1, 1))
+    offset = offset.total_seconds()
+
+    assert ts._value + offset == Timestamp("2011-01-01").as_unit("s")._value
+
+
+def test_tzlocal_is_not_utc():
+    # even if the machine running the test is localized to UTC
+    tz = dateutil.tz.tzlocal()
+    assert not timezones.is_utc(tz)
+
+    assert not timezones.tz_compare(tz, dateutil.tz.tzutc())
+
+
+def test_tz_compare_utc(utc_fixture, utc_fixture2):
+    tz = timezones.maybe_get_tz(utc_fixture)
+    tz2 = timezones.maybe_get_tz(utc_fixture2)
+    assert timezones.tz_compare(tz, tz2)
+
+
+@pytest.fixture(
+    params=[
+        ("pytz/US/Eastern", lambda tz, x: tz.localize(x)),
+        (dateutil.tz.gettz("US/Eastern"), lambda tz, x: x.replace(tzinfo=tz)),
+    ]
+)
+def infer_setup(request):
+    eastern, localize = request.param
+    if isinstance(eastern, str) and eastern.startswith("pytz/"):
+        pytz = pytest.importorskip("pytz")
+        eastern = pytz.timezone(eastern.removeprefix("pytz/"))
+
+    start_naive = datetime(2001, 1, 1)
+    end_naive = datetime(2009, 1, 1)
+
+    start = localize(eastern, start_naive)
+    end = localize(eastern, end_naive)
+
+    return eastern, localize, start, end, start_naive, end_naive
+
+
+def test_infer_tz_compat(infer_setup):
+    eastern, _, start, end, start_naive, end_naive = infer_setup
+
+    assert (
+        timezones.infer_tzinfo(start, end)
+        is conversion.localize_pydatetime(start_naive, eastern).tzinfo
+    )
+    assert (
+        timezones.infer_tzinfo(start, None)
+        is conversion.localize_pydatetime(start_naive, eastern).tzinfo
+    )
+    assert (
+        timezones.infer_tzinfo(None, end)
+        is conversion.localize_pydatetime(end_naive, eastern).tzinfo
+    )
+
+
+def test_infer_tz_utc_localize(infer_setup):
+    _, _, start, end, start_naive, end_naive = infer_setup
+    utc = timezone.utc
+
+    start = start_naive.astimezone(utc)
+    end = end_naive.astimezone(utc)
+
+    assert timezones.infer_tzinfo(start, end) is utc
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_infer_tz_mismatch(infer_setup, ordered):
+    eastern, _, _, _, start_naive, end_naive = infer_setup
+    msg = "Inputs must both have the same timezone"
+
+    utc = timezone.utc
+    start = start_naive.astimezone(utc)
+    end = conversion.localize_pydatetime(end_naive, eastern)
+
+    args = (start, end) if ordered else (end, start)
+
+    with pytest.raises(AssertionError, match=msg):
+        timezones.infer_tzinfo(*args)
+
+
+def test_maybe_get_tz_invalid_types():
+    with pytest.raises(TypeError, match="<class 'float'>"):
+        timezones.maybe_get_tz(44.0)
+
+    with pytest.raises(TypeError, match="<class 'module'>"):
+        timezones.maybe_get_tz(pytest)
+
+    msg = "<class 'pandas.Timestamp'>"
+    with pytest.raises(TypeError, match=msg):
+        timezones.maybe_get_tz(Timestamp("2021-01-01", tz="UTC"))
+
+
+def test_maybe_get_tz_offset_only():
+    # see gh-36004
+
+    # timezone.utc
+    tz = timezones.maybe_get_tz(timezone.utc)
+    assert tz == timezone(timedelta(hours=0, minutes=0))
+
+    # without UTC+- prefix
+    tz = timezones.maybe_get_tz("+01:15")
+    assert tz == timezone(timedelta(hours=1, minutes=15))
+
+    tz = timezones.maybe_get_tz("-01:15")
+    assert tz == timezone(-timedelta(hours=1, minutes=15))
+
+    # with UTC+- prefix
+    tz = timezones.maybe_get_tz("UTC+02:45")
+    assert tz == timezone(timedelta(hours=2, minutes=45))
+
+    tz = timezones.maybe_get_tz("UTC-02:45")
+    assert tz == timezone(-timedelta(hours=2, minutes=45))
diff --git a/pandas/tests/util/__init__.py b/pandas/tests/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/util/conftest.py b/pandas/tests/util/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e931ff42fe1528945dba38ae91e6cae0853afda
--- /dev/null
+++ b/pandas/tests/util/conftest.py
@@ -0,0 +1,46 @@
+import pytest
+
+
+@pytest.fixture(params=[True, False])
+def check_dtype(request):
+    """
+    Fixture returning `True` or `False`, determining whether to check
+    if the `dtype` is identical or not, when comparing two data structures,
+    e.g. `Series`, `SparseArray` or `DataFrame`.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def check_exact(request):
+    """
+    Fixture returning `True` or `False`, determining whether to
+    compare floating point numbers exactly or not.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def check_index_type(request):
+    """
+    Fixture returning `True` or `False`, determining whether to check
+    if the `Index` types are identical or not.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[0.5e-3, 0.5e-5])
+def rtol(request):
+    """
+    Fixture returning 0.5e-3 or 0.5e-5. Those values are used as relative tolerance.
+    """
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def check_categorical(request):
+    """
+    Fixture returning `True` or `False`, determining whether to
+    compare internal `Categorical` exactly or not.
+    """
+    return request.param
diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..091670ed69f11b72700766ba20024af37dbf68f5
--- /dev/null
+++ b/pandas/tests/util/test_assert_almost_equal.py
@@ -0,0 +1,586 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    NA,
+    DataFrame,
+    Index,
+    NaT,
+    Series,
+    Timestamp,
+)
+import pandas._testing as tm
+
+
+def _assert_almost_equal_both(a, b, **kwargs):
+    """
+    Check that two objects are approximately equal.
+
+    This check is performed commutatively.
+
+    Parameters
+    ----------
+    a : object
+        The first object to compare.
+    b : object
+        The second object to compare.
+    **kwargs
+        The arguments passed to `tm.assert_almost_equal`.
+    """
+    tm.assert_almost_equal(a, b, **kwargs)
+    tm.assert_almost_equal(b, a, **kwargs)
+
+
+def _assert_not_almost_equal(a, b, **kwargs):
+    """
+    Check that two objects are not approximately equal.
+
+    Parameters
+    ----------
+    a : object
+        The first object to compare.
+    b : object
+        The second object to compare.
+    **kwargs
+        The arguments passed to `tm.assert_almost_equal`.
+    """
+    try:
+        tm.assert_almost_equal(a, b, **kwargs)
+        msg = f"{a} and {b} were approximately equal when they shouldn't have been"
+        pytest.fail(reason=msg)
+    except AssertionError:
+        pass
+
+
+def _assert_not_almost_equal_both(a, b, **kwargs):
+    """
+    Check that two objects are not approximately equal.
+
+    This check is performed commutatively.
+
+    Parameters
+    ----------
+    a : object
+        The first object to compare.
+    b : object
+        The second object to compare.
+    **kwargs
+        The arguments passed to `tm.assert_almost_equal`.
+    """
+    _assert_not_almost_equal(a, b, **kwargs)
+    _assert_not_almost_equal(b, a, **kwargs)
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (1.1, 1.1),
+        (1.1, 1.100001),
+        (np.int16(1), 1.000001),
+        (np.float64(1.1), 1.1),
+        (np.uint32(5), 5),
+    ],
+)
+def test_assert_almost_equal_numbers(a, b):
+    _assert_almost_equal_both(a, b)
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (1.1, 1),
+        (1.1, True),
+        (1, 2),
+        (1.0001, np.int16(1)),
+        # The following two examples are not "almost equal" due to tol.
+        (0.1, 0.1001),
+        (0.0011, 0.0012),
+    ],
+)
+def test_assert_not_almost_equal_numbers(a, b):
+    _assert_not_almost_equal_both(a, b)
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (1.1, 1.1),
+        (1.1, 1.100001),
+        (1.1, 1.1001),
+        (0.000001, 0.000005),
+        (1000.0, 1000.0005),
+        # Testing this example, as per #13357
+        (0.000011, 0.000012),
+    ],
+)
+def test_assert_almost_equal_numbers_atol(a, b):
+    # Equivalent to the deprecated check_less_precise=True, enforced in 2.0
+    _assert_almost_equal_both(a, b, rtol=0.5e-3, atol=0.5e-3)
+
+
+@pytest.mark.parametrize("a,b", [(1.1, 1.11), (0.1, 0.101), (0.000011, 0.001012)])
+def test_assert_not_almost_equal_numbers_atol(a, b):
+    _assert_not_almost_equal_both(a, b, atol=1e-3)
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (1.1, 1.1),
+        (1.1, 1.100001),
+        (1.1, 1.1001),
+        (1000.0, 1000.0005),
+        (1.1, 1.11),
+        (0.1, 0.101),
+    ],
+)
+def test_assert_almost_equal_numbers_rtol(a, b):
+    _assert_almost_equal_both(a, b, rtol=0.05)
+
+
+@pytest.mark.parametrize("a,b", [(0.000011, 0.000012), (0.000001, 0.000005)])
+def test_assert_not_almost_equal_numbers_rtol(a, b):
+    _assert_not_almost_equal_both(a, b, rtol=0.05)
+
+
+@pytest.mark.parametrize(
+    "a,b,rtol",
+    [
+        (1.00001, 1.00005, 0.001),
+        (-0.908356 + 0.2j, -0.908358 + 0.2j, 1e-3),
+        (0.1 + 1.009j, 0.1 + 1.006j, 0.1),
+        (0.1001 + 2.0j, 0.1 + 2.001j, 0.01),
+    ],
+)
+def test_assert_almost_equal_complex_numbers(a, b, rtol):
+    _assert_almost_equal_both(a, b, rtol=rtol)
+    _assert_almost_equal_both(np.complex64(a), np.complex64(b), rtol=rtol)
+    _assert_almost_equal_both(np.complex128(a), np.complex128(b), rtol=rtol)
+
+
+@pytest.mark.parametrize(
+    "a,b,rtol",
+    [
+        (0.58310768, 0.58330768, 1e-7),
+        (-0.908 + 0.2j, -0.978 + 0.2j, 0.001),
+        (0.1 + 1j, 0.1 + 2j, 0.01),
+        (-0.132 + 1.001j, -0.132 + 1.005j, 1e-5),
+        (0.58310768j, 0.58330768j, 1e-9),
+    ],
+)
+def test_assert_not_almost_equal_complex_numbers(a, b, rtol):
+    _assert_not_almost_equal_both(a, b, rtol=rtol)
+    _assert_not_almost_equal_both(np.complex64(a), np.complex64(b), rtol=rtol)
+    _assert_not_almost_equal_both(np.complex128(a), np.complex128(b), rtol=rtol)
+
+
+@pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.00000001, 0)])
+def test_assert_almost_equal_numbers_with_zeros(a, b):
+    _assert_almost_equal_both(a, b)
+
+
+@pytest.mark.parametrize("a,b", [(0.001, 0), (1, 0)])
+def test_assert_not_almost_equal_numbers_with_zeros(a, b):
+    _assert_not_almost_equal_both(a, b)
+
+
+@pytest.mark.parametrize("a,b", [(1, "abc"), (1, [1]), (1, object())])
+def test_assert_not_almost_equal_numbers_with_mixed(a, b):
+    _assert_not_almost_equal_both(a, b)
+
+
+@pytest.mark.parametrize(
+    "left_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]
+)
+@pytest.mark.parametrize(
+    "right_dtype", ["M8[ns]", "m8[ns]", "float64", "int64", "object"]
+)
+def test_assert_almost_equal_edge_case_ndarrays(left_dtype, right_dtype):
+    # Empty compare.
+    _assert_almost_equal_both(
+        np.array([], dtype=left_dtype),
+        np.array([], dtype=right_dtype),
+        check_dtype=False,
+    )
+
+
+def test_assert_almost_equal_sets():
+    # GH#51727
+    _assert_almost_equal_both({1, 2, 3}, {1, 2, 3})
+
+
+def test_assert_almost_not_equal_sets():
+    # GH#51727
+    msg = r"{1, 2, 3} != {1, 2, 4}"
+    with pytest.raises(AssertionError, match=msg):
+        _assert_almost_equal_both({1, 2, 3}, {1, 2, 4})
+
+
+def test_assert_almost_equal_dicts():
+    _assert_almost_equal_both({"a": 1, "b": 2}, {"a": 1, "b": 2})
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        ({"a": 1, "b": 2}, {"a": 1, "b": 3}),
+        ({"a": 1, "b": 2}, {"a": 1, "b": 2, "c": 3}),
+        ({"a": 1}, 1),
+        ({"a": 1}, "abc"),
+        ({"a": 1}, [1]),
+    ],
+)
+def test_assert_not_almost_equal_dicts(a, b):
+    _assert_not_almost_equal_both(a, b)
+
+
+@pytest.mark.parametrize("val", [1, 2])
+def test_assert_almost_equal_dict_like_object(val):
+    dict_val = 1
+    real_dict = {"a": val}
+
+    class DictLikeObj:
+        def keys(self):
+            return ("a",)
+
+        def __getitem__(self, item):
+            if item == "a":
+                return dict_val
+
+    func = (
+        _assert_almost_equal_both if val == dict_val else _assert_not_almost_equal_both
+    )
+    func(real_dict, DictLikeObj(), check_dtype=False)
+
+
+def test_assert_almost_equal_strings():
+    _assert_almost_equal_both("abc", "abc")
+
+
+@pytest.mark.parametrize("b", ["abcd", "abd", 1, [1]])
+def test_assert_not_almost_equal_strings(b):
+    _assert_not_almost_equal_both("abc", b)
+
+
+@pytest.mark.parametrize("box", [list, np.array])
+def test_assert_almost_equal_iterables(box):
+    _assert_almost_equal_both(box([1, 2, 3]), box([1, 2, 3]))
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        # Class is different.
+        (np.array([1, 2, 3]), [1, 2, 3]),
+        # Dtype is different.
+        (np.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])),
+        # Can't compare generators.
+        (iter([1, 2, 3]), [1, 2, 3]),
+        ([1, 2, 3], [1, 2, 4]),
+        ([1, 2, 3], [1, 2, 3, 4]),
+        ([1, 2, 3], 1),
+    ],
+)
+def test_assert_not_almost_equal_iterables(a, b):
+    _assert_not_almost_equal(a, b)
+
+
+def test_assert_almost_equal_null():
+    _assert_almost_equal_both(None, None)
+
+
+@pytest.mark.parametrize("a,b", [(None, np.nan), (None, 0), (np.nan, 0)])
+def test_assert_not_almost_equal_null(a, b):
+    _assert_not_almost_equal(a, b)
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (np.inf, np.inf),
+        (np.inf, float("inf")),
+        (np.array([np.inf, np.nan, -np.inf]), np.array([np.inf, np.nan, -np.inf])),
+    ],
+)
+def test_assert_almost_equal_inf(a, b):
+    _assert_almost_equal_both(a, b)
+
+
+objs = [NA, np.nan, NaT, None, np.datetime64("NaT"), np.timedelta64("NaT")]
+
+
+@pytest.mark.parametrize("left", objs)
+@pytest.mark.parametrize("right", objs)
+def test_mismatched_na_assert_almost_equal(left, right):
+    left_arr = np.array([left], dtype=object)
+    right_arr = np.array([right], dtype=object)
+
+    msg = "Mismatched null-like values"
+
+    if left is right:
+        _assert_almost_equal_both(left, right, check_dtype=False)
+        tm.assert_numpy_array_equal(left_arr, right_arr)
+        tm.assert_index_equal(
+            Index(left_arr, dtype=object), Index(right_arr, dtype=object)
+        )
+        tm.assert_series_equal(
+            Series(left_arr, dtype=object), Series(right_arr, dtype=object)
+        )
+        tm.assert_frame_equal(
+            DataFrame(left_arr, dtype=object), DataFrame(right_arr, dtype=object)
+        )
+
+    else:
+        with pytest.raises(AssertionError, match=msg):
+            _assert_almost_equal_both(left, right, check_dtype=False)
+
+        # TODO: to get the same deprecation in assert_numpy_array_equal we need
+        #  to change/deprecate the default for strict_nan to become True
+        # TODO: to get the same deprecation in assert_index_equal we need to
+        #  change/deprecate array_equivalent_object to be stricter, as
+        #  assert_index_equal uses Index.equal which uses array_equivalent.
+        with pytest.raises(AssertionError, match="Series are different"):
+            tm.assert_series_equal(
+                Series(left_arr, dtype=object), Series(right_arr, dtype=object)
+            )
+        with pytest.raises(AssertionError, match="DataFrame.iloc.* are different"):
+            tm.assert_frame_equal(
+                DataFrame(left_arr, dtype=object), DataFrame(right_arr, dtype=object)
+            )
+
+
+def test_assert_not_almost_equal_inf():
+    _assert_not_almost_equal_both(np.inf, 0)
+
+
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (Index([1.0, 1.1]), Index([1.0, 1.100001])),
+        (Series([1.0, 1.1]), Series([1.0, 1.100001])),
+        (np.array([1.1, 2.000001]), np.array([1.1, 2.0])),
+        (DataFrame({"a": [1.0, 1.1]}), DataFrame({"a": [1.0, 1.100001]})),
+    ],
+)
+def test_assert_almost_equal_pandas(a, b):
+    _assert_almost_equal_both(a, b)
+
+
+def test_assert_almost_equal_object():
+    a = [Timestamp("2011-01-01"), Timestamp("2011-01-01")]
+    b = [Timestamp("2011-01-01"), Timestamp("2011-01-01")]
+    _assert_almost_equal_both(a, b)
+
+
+def test_assert_almost_equal_value_mismatch():
+    msg = "expected 2\\.00000 but got 1\\.00000, with rtol=1e-05, atol=1e-08"
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(1, 2)
+
+
+@pytest.mark.parametrize(
+    "a,b,klass1,klass2",
+    [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")],
+)
+def test_assert_almost_equal_class_mismatch(a, b, klass1, klass2):
+    msg = f"""numpy array are different
+
+numpy array classes are different
+\\[left\\]:  {klass1}
+\\[right\\]: {klass2}"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(a, b)
+
+
+def test_assert_almost_equal_value_mismatch1():
+    msg = """numpy array are different
+
+numpy array values are different \\(66\\.66667 %\\)
+\\[left\\]:  \\[nan, 2\\.0, 3\\.0\\]
+\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3]))
+
+
+def test_assert_almost_equal_value_mismatch2():
+    msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[1, 2\\]
+\\[right\\]: \\[1, 3\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(np.array([1, 2]), np.array([1, 3]))
+
+
+def test_assert_almost_equal_value_mismatch3():
+    msg = """numpy array are different
+
+numpy array values are different \\(16\\.66667 %\\)
+\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(
+            np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]])
+        )
+
+
+def test_assert_almost_equal_value_mismatch4():
+    msg = """numpy array are different
+
+numpy array values are different \\(25\\.0 %\\)
+\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]]))
+
+
+def test_assert_almost_equal_shape_mismatch_override():
+    msg = """Index are different
+
+Index shapes are different
+\\[left\\]:  \\(2L*,\\)
+\\[right\\]: \\(3L*,\\)"""
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index")
+
+
+def test_assert_almost_equal_unicode():
+    # see gh-20503
+    msg = """numpy array are different
+
+numpy array values are different \\(33\\.33333 %\\)
+\\[left\\]:  \\[á, à, ä\\]
+\\[right\\]: \\[á, à, å\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(np.array(["á", "à", "ä"]), np.array(["á", "à", "å"]))
+
+
+def test_assert_almost_equal_timestamp():
+    a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")])
+    b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")])
+
+    msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\]
+\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal(a, b)
+
+
+def test_assert_almost_equal_iterable_length_mismatch():
+    msg = """Iterable are different
+
+Iterable length are different
+\\[left\\]:  2
+\\[right\\]: 3"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal([1, 2], [3, 4, 5])
+
+
+def test_assert_almost_equal_iterable_values_mismatch():
+    msg = """Iterable are different
+
+Iterable values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[1, 2\\]
+\\[right\\]: \\[1, 3\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_almost_equal([1, 2], [1, 3])
+
+
+subarr = np.empty(2, dtype=object)
+subarr[:] = [np.array([None, "b"], dtype=object), np.array(["c", "d"], dtype=object)]
+
+NESTED_CASES = [
+    # nested array
+    (
+        np.array([np.array([50, 70, 90]), np.array([20, 30])], dtype=object),
+        np.array([np.array([50, 70, 90]), np.array([20, 30])], dtype=object),
+    ),
+    # >1 level of nesting
+    (
+        np.array(
+            [
+                np.array([np.array([50, 70]), np.array([90])], dtype=object),
+                np.array([np.array([20, 30])], dtype=object),
+            ],
+            dtype=object,
+        ),
+        np.array(
+            [
+                np.array([np.array([50, 70]), np.array([90])], dtype=object),
+                np.array([np.array([20, 30])], dtype=object),
+            ],
+            dtype=object,
+        ),
+    ),
+    # lists
+    (
+        np.array([[50, 70, 90], [20, 30]], dtype=object),
+        np.array([[50, 70, 90], [20, 30]], dtype=object),
+    ),
+    # mixed array/list
+    (
+        np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object),
+        np.array([[1, 2, 3], [4, 5]], dtype=object),
+    ),
+    (
+        np.array([np.array([], dtype=object), None], dtype=object),
+        np.array([[], None], dtype=object),
+    ),
+    (
+        np.array(
+            [
+                np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object),
+                np.array(
+                    [np.array([6]), np.array([7, 8]), np.array([9])], dtype=object
+                ),
+            ],
+            dtype=object,
+        ),
+        np.array([[[1, 2, 3], [4, 5]], [[6], [7, 8], [9]]], dtype=object),
+    ),
+    # same-length lists
+    (
+        np.array([subarr, None], dtype=object),
+        np.array([[[None, "b"], ["c", "d"]], None], dtype=object),
+    ),
+    # dicts
+    (
+        np.array([{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object),
+        np.array([{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object),
+    ),
+    (
+        np.array([{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object),
+        np.array([{"f1": 1, "f2": ["a", "b"]}], dtype=object),
+    ),
+    # array/list of dicts
+    (
+        np.array(
+            [
+                np.array(
+                    [{"f1": 1, "f2": np.array(["a", "b"], dtype=object)}], dtype=object
+                ),
+                np.array([], dtype=object),
+            ],
+            dtype=object,
+        ),
+        np.array([[{"f1": 1, "f2": ["a", "b"]}], []], dtype=object),
+    ),
+]
+
+
+@pytest.mark.filterwarnings("ignore:elementwise comparison failed:DeprecationWarning")
+@pytest.mark.parametrize("a,b", NESTED_CASES)
+def test_assert_almost_equal_array_nested(a, b):
+    _assert_almost_equal_both(a, b)
diff --git a/pandas/tests/util/test_assert_attr_equal.py b/pandas/tests/util/test_assert_attr_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbbb0bf2172b12f93c9f0f6a97751854d1566a99
--- /dev/null
+++ b/pandas/tests/util/test_assert_attr_equal.py
@@ -0,0 +1,33 @@
+from types import SimpleNamespace
+
+import pytest
+
+from pandas.core.dtypes.common import is_float
+
+import pandas._testing as tm
+
+
+def test_assert_attr_equal(nulls_fixture):
+    obj = SimpleNamespace()
+    obj.na_value = nulls_fixture
+    tm.assert_attr_equal("na_value", obj, obj)
+
+
+def test_assert_attr_equal_different_nulls(nulls_fixture, nulls_fixture2):
+    obj = SimpleNamespace()
+    obj.na_value = nulls_fixture
+
+    obj2 = SimpleNamespace()
+    obj2.na_value = nulls_fixture2
+
+    if nulls_fixture is nulls_fixture2:
+        tm.assert_attr_equal("na_value", obj, obj2)
+    elif is_float(nulls_fixture) and is_float(nulls_fixture2):
+        # we consider float("nan") and np.float64("nan") to be equivalent
+        tm.assert_attr_equal("na_value", obj, obj2)
+    elif type(nulls_fixture) is type(nulls_fixture2):
+        # e.g. Decimal("NaN")
+        tm.assert_attr_equal("na_value", obj, obj2)
+    else:
+        with pytest.raises(AssertionError, match='"na_value" are different'):
+            tm.assert_attr_equal("na_value", obj, obj2)
diff --git a/pandas/tests/util/test_assert_categorical_equal.py b/pandas/tests/util/test_assert_categorical_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..c17156457470839906ead0f965880e9d3638ab7f
--- /dev/null
+++ b/pandas/tests/util/test_assert_categorical_equal.py
@@ -0,0 +1,88 @@
+import pytest
+
+from pandas import Categorical
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("c", [None, [1, 2, 3, 4, 5]])
+def test_categorical_equal(c):
+    c = Categorical([1, 2, 3, 4], categories=c)
+    tm.assert_categorical_equal(c, c)
+
+
+@pytest.mark.parametrize("check_category_order", [True, False])
+def test_categorical_equal_order_mismatch(check_category_order):
+    c1 = Categorical([1, 2, 3, 4], categories=[1, 2, 3, 4])
+    c2 = Categorical([1, 2, 3, 4], categories=[4, 3, 2, 1])
+    kwargs = {"check_category_order": check_category_order}
+
+    if check_category_order:
+        msg = """Categorical\\.categories are different
+
+Categorical\\.categories values are different \\(100\\.0 %\\)
+\\[left\\]:  Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Index\\(\\[4, 3, 2, 1\\], dtype='int64'\\)"""
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_categorical_equal(c1, c2, **kwargs)
+    else:
+        tm.assert_categorical_equal(c1, c2, **kwargs)
+
+
+def test_categorical_equal_categories_mismatch():
+    msg = """Categorical\\.categories are different
+
+Categorical\\.categories values are different \\(25\\.0 %\\)
+\\[left\\]:  Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Index\\(\\[1, 2, 3, 5\\], dtype='int64'\\)"""
+
+    c1 = Categorical([1, 2, 3, 4])
+    c2 = Categorical([1, 2, 3, 5])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_categorical_equal(c1, c2)
+
+
+def test_categorical_equal_codes_mismatch():
+    categories = [1, 2, 3, 4]
+    msg = """Categorical\\.codes are different
+
+Categorical\\.codes values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[0, 1, 3, 2\\]
+\\[right\\]: \\[0, 1, 2, 3\\]"""
+
+    c1 = Categorical([1, 2, 4, 3], categories=categories)
+    c2 = Categorical([1, 2, 3, 4], categories=categories)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_categorical_equal(c1, c2)
+
+
+def test_categorical_equal_ordered_mismatch():
+    data = [1, 2, 3, 4]
+    msg = """Categorical are different
+
+Attribute "ordered" are different
+\\[left\\]:  False
+\\[right\\]: True"""
+
+    c1 = Categorical(data, ordered=False)
+    c2 = Categorical(data, ordered=True)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_categorical_equal(c1, c2)
+
+
+@pytest.mark.parametrize("obj", ["index", "foo", "pandas"])
+def test_categorical_equal_object_override(obj):
+    data = [1, 2, 3, 4]
+    msg = f"""{obj} are different
+
+Attribute "ordered" are different
+\\[left\\]:  False
+\\[right\\]: True"""
+
+    c1 = Categorical(data, ordered=False)
+    c2 = Categorical(data, ordered=True)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_categorical_equal(c1, c2, obj=obj)
diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d82ae9af0e9573c916fbbb836a5dd64920794c3
--- /dev/null
+++ b/pandas/tests/util/test_assert_extension_array_equal.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    Timestamp,
+    array,
+)
+import pandas._testing as tm
+from pandas.core.arrays.sparse import SparseArray
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {},  # Default is check_exact=False
+        {"check_exact": False},
+        {"check_exact": True},
+    ],
+)
+def test_assert_extension_array_equal_not_exact(kwargs):
+    # see gh-23709
+    arr1 = SparseArray([-0.17387645482451206, 0.3414148016424936])
+    arr2 = SparseArray([-0.17387645482451206, 0.3414148016424937])
+
+    if kwargs.get("check_exact", False):
+        msg = """\
+ExtensionArray are different
+
+ExtensionArray values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[-0\\.17387645482.*, 0\\.341414801642.*\\]
+\\[right\\]: \\[-0\\.17387645482.*, 0\\.341414801642.*\\]"""
+
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_extension_array_equal(arr1, arr2, **kwargs)
+    else:
+        tm.assert_extension_array_equal(arr1, arr2, **kwargs)
+
+
+@pytest.mark.parametrize("decimals", range(10))
+def test_assert_extension_array_equal_less_precise(decimals):
+    rtol = 0.5 * 10**-decimals
+    arr1 = SparseArray([0.5, 0.123456])
+    arr2 = SparseArray([0.5, 0.123457])
+
+    if decimals >= 5:
+        msg = """\
+ExtensionArray are different
+
+ExtensionArray values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[0\\.5, 0\\.123456\\]
+\\[right\\]: \\[0\\.5, 0\\.123457\\]"""
+
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_extension_array_equal(arr1, arr2, rtol=rtol)
+    else:
+        tm.assert_extension_array_equal(arr1, arr2, rtol=rtol)
+
+
+def test_assert_extension_array_equal_dtype_mismatch(check_dtype):
+    end = 5
+    kwargs = {"check_dtype": check_dtype}
+
+    arr1 = SparseArray(np.arange(end, dtype="int64"))
+    arr2 = SparseArray(np.arange(end, dtype="int32"))
+
+    if check_dtype:
+        msg = """\
+ExtensionArray are different
+
+Attribute "dtype" are different
+\\[left\\]:  Sparse\\[int64, 0\\]
+\\[right\\]: Sparse\\[int32, 0\\]"""
+
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_extension_array_equal(arr1, arr2, **kwargs)
+    else:
+        tm.assert_extension_array_equal(arr1, arr2, **kwargs)
+
+
+def test_assert_extension_array_equal_missing_values():
+    arr1 = SparseArray([np.nan, 1, 2, np.nan])
+    arr2 = SparseArray([np.nan, 1, 2, 3])
+
+    msg = """\
+ExtensionArray NA mask are different
+
+ExtensionArray NA mask values are different \\(25\\.0 %\\)
+\\[left\\]:  \\[True, False, False, True\\]
+\\[right\\]: \\[True, False, False, False\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_extension_array_equal(arr1, arr2)
+
+
+@pytest.mark.parametrize("side", ["left", "right"])
+def test_assert_extension_array_equal_non_extension_array(side):
+    numpy_array = np.arange(5)
+    extension_array = SparseArray(numpy_array)
+
+    msg = f"{side} is not an ExtensionArray"
+    args = (
+        (numpy_array, extension_array)
+        if side == "left"
+        else (extension_array, numpy_array)
+    )
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_extension_array_equal(*args)
+
+
+def test_assert_extension_array_equal_ignore_dtype_mismatch(any_int_dtype):
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = array([1, 2, 3], dtype="Int64")
+    right = array([1, 2, 3], dtype=any_int_dtype)
+    tm.assert_extension_array_equal(left, right, check_dtype=False)
+
+
+def test_assert_extension_array_equal_time_units():
+    # https://github.com/pandas-dev/pandas/issues/55730
+    timestamp = Timestamp("2023-11-04T12")
+    naive = array([timestamp], dtype="datetime64[ns]")
+    utc = array([timestamp], dtype="datetime64[ns, UTC]")
+
+    tm.assert_extension_array_equal(naive, utc, check_dtype=False)
+    tm.assert_extension_array_equal(utc, naive, check_dtype=False)
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..19abfe727fb4b3d4046d1ec99a84c8d6e2afc6c8
--- /dev/null
+++ b/pandas/tests/util/test_assert_frame_equal.py
@@ -0,0 +1,425 @@
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+import pandas as pd
+from pandas import DataFrame
+import pandas._testing as tm
+
+
+@pytest.fixture(params=[True, False])
+def by_blocks_fixture(request):
+    return request.param
+
+
+def _assert_frame_equal_both(a, b, **kwargs):
+    """
+    Check that two DataFrame equal.
+
+    This check is performed commutatively.
+
+    Parameters
+    ----------
+    a : DataFrame
+        The first DataFrame to compare.
+    b : DataFrame
+        The second DataFrame to compare.
+    kwargs : dict
+        The arguments passed to `tm.assert_frame_equal`.
+    """
+    tm.assert_frame_equal(a, b, **kwargs)
+    tm.assert_frame_equal(b, a, **kwargs)
+
+
+@pytest.mark.parametrize("check_like", [True, False])
+def test_frame_equal_row_order_mismatch(check_like, frame_or_series):
+    df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"])
+    df2 = DataFrame({"A": [3, 2, 1], "B": [6, 5, 4]}, index=["c", "b", "a"])
+
+    if not check_like:  # Do not ignore row-column orderings.
+        msg = f"{frame_or_series.__name__}.index are different"
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_frame_equal(
+                df1, df2, check_like=check_like, obj=frame_or_series.__name__
+            )
+    else:
+        _assert_frame_equal_both(
+            df1, df2, check_like=check_like, obj=frame_or_series.__name__
+        )
+
+
+@pytest.mark.parametrize(
+    "df1,df2",
+    [
+        ({"A": [1, 2, 3]}, {"A": [1, 2, 3, 4]}),
+        ({"A": [1, 2, 3], "B": [4, 5, 6]}, {"A": [1, 2, 3]}),
+    ],
+)
+def test_frame_equal_shape_mismatch(df1, df2, frame_or_series):
+    df1 = DataFrame(df1)
+    df2 = DataFrame(df2)
+    msg = f"{frame_or_series.__name__} are different"
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2, obj=frame_or_series.__name__)
+
+
+@pytest.mark.parametrize(
+    "df1,df2,msg",
+    [
+        # Index
+        (
+            DataFrame.from_records({"a": [1, 2], "c": ["l1", "l2"]}, index=["a"]),
+            DataFrame.from_records({"a": [1.0, 2.0], "c": ["l1", "l2"]}, index=["a"]),
+            "DataFrame\\.index are different",
+        ),
+        # MultiIndex
+        (
+            DataFrame.from_records(
+                {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]
+            ),
+            DataFrame.from_records(
+                {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]
+            ),
+            "DataFrame\\.index level \\[0\\] are different",
+        ),
+    ],
+)
+def test_frame_equal_index_dtype_mismatch(df1, df2, msg, check_index_type):
+    kwargs = {"check_index_type": check_index_type}
+
+    if check_index_type:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_frame_equal(df1, df2, **kwargs)
+    else:
+        tm.assert_frame_equal(df1, df2, **kwargs)
+
+
+def test_empty_dtypes(check_dtype):
+    columns = ["col1", "col2"]
+    df1 = DataFrame(columns=columns)
+    df2 = DataFrame(columns=columns)
+
+    kwargs = {"check_dtype": check_dtype}
+    df1["col1"] = df1["col1"].astype("int64")
+
+    if check_dtype:
+        msg = r"Attributes of DataFrame\..* are different"
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_frame_equal(df1, df2, **kwargs)
+    else:
+        tm.assert_frame_equal(df1, df2, **kwargs)
+
+
+@pytest.mark.parametrize("check_like", [True, False])
+def test_frame_equal_index_mismatch(check_like, frame_or_series, using_infer_string):
+    if using_infer_string:
+        dtype = "str"
+    else:
+        dtype = "object"
+    msg = f"""{frame_or_series.__name__}\\.index are different
+
+{frame_or_series.__name__}\\.index values are different \\(33\\.33333 %\\)
+\\[left\\]:  Index\\(\\['a', 'b', 'c'\\], dtype='{dtype}'\\)
+\\[right\\]: Index\\(\\['a', 'b', 'd'\\], dtype='{dtype}'\\)
+At positional index 2, first diff: c != d"""
+
+    df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"])
+    df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "d"])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(
+            df1, df2, check_like=check_like, obj=frame_or_series.__name__
+        )
+
+
+@pytest.mark.parametrize("check_like", [True, False])
+def test_frame_equal_columns_mismatch(check_like, frame_or_series, using_infer_string):
+    if using_infer_string:
+        dtype = "str"
+    else:
+        dtype = "object"
+    msg = f"""{frame_or_series.__name__}\\.columns are different
+
+{frame_or_series.__name__}\\.columns values are different \\(50\\.0 %\\)
+\\[left\\]:  Index\\(\\['A', 'B'\\], dtype='{dtype}'\\)
+\\[right\\]: Index\\(\\['A', 'b'\\], dtype='{dtype}'\\)"""
+
+    df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=["a", "b", "c"])
+    df2 = DataFrame({"A": [1, 2, 3], "b": [4, 5, 6]}, index=["a", "b", "c"])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(
+            df1, df2, check_like=check_like, obj=frame_or_series.__name__
+        )
+
+
+def test_frame_equal_block_mismatch(by_blocks_fixture, frame_or_series):
+    obj = frame_or_series.__name__
+    msg = f"""{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) are different
+
+{obj}\\.iloc\\[:, 1\\] \\(column name="B"\\) values are different \\(33\\.33333 %\\)
+\\[index\\]: \\[0, 1, 2\\]
+\\[left\\]:  \\[4, 5, 6\\]
+\\[right\\]: \\[4, 5, 7\\]"""
+
+    df1 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+    df2 = DataFrame({"A": [1, 2, 3], "B": [4, 5, 7]})
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2, by_blocks=by_blocks_fixture, obj=obj)
+
+
+@pytest.mark.parametrize(
+    "df1,df2,msg",
+    [
+        (
+            {"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]},
+            {"A": ["á", "à", "ä"], "E": ["é", "è", "e̊"]},
+            """{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) are different
+
+{obj}\\.iloc\\[:, 1\\] \\(column name="E"\\) values are different \\(33\\.33333 %\\)
+\\[index\\]: \\[0, 1, 2\\]
+\\[left\\]:  \\[é, è, ë\\]
+\\[right\\]: \\[é, è, e̊\\]""",
+        ),
+        (
+            {"A": ["á", "à", "ä"], "E": ["é", "è", "ë"]},
+            {"A": ["a", "a", "a"], "E": ["e", "e", "e"]},
+            """{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) are different
+
+{obj}\\.iloc\\[:, 0\\] \\(column name="A"\\) values are different \\(100\\.0 %\\)
+\\[index\\]: \\[0, 1, 2\\]
+\\[left\\]:  \\[á, à, ä\\]
+\\[right\\]: \\[a, a, a\\]""",
+        ),
+    ],
+)
+def test_frame_equal_unicode(df1, df2, msg, by_blocks_fixture, frame_or_series):
+    # see gh-20503
+    #
+    # Test ensures that `tm.assert_frame_equals` raises the right exception
+    # when comparing DataFrames containing differing unicode objects.
+    df1 = DataFrame(df1)
+    df2 = DataFrame(df2)
+    msg = msg.format(obj=frame_or_series.__name__)
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(
+            df1, df2, by_blocks=by_blocks_fixture, obj=frame_or_series.__name__
+        )
+
+
+def test_assert_frame_equal_extension_dtype_mismatch():
+    # https://github.com/pandas-dev/pandas/issues/32747
+    left = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    right = left.astype(int)
+
+    msg = (
+        "Attributes of DataFrame\\.iloc\\[:, 0\\] "
+        '\\(column name="a"\\) are different\n\n'
+        'Attribute "dtype" are different\n'
+        "\\[left\\]:  Int64\n"
+        "\\[right\\]: int[32|64]"
+    )
+
+    tm.assert_frame_equal(left, right, check_dtype=False)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(left, right, check_dtype=True)
+
+
+def test_assert_frame_equal_interval_dtype_mismatch():
+    # https://github.com/pandas-dev/pandas/issues/32747
+    left = DataFrame({"a": [pd.Interval(0, 1)]}, dtype="interval")
+    right = left.astype(object)
+
+    msg = (
+        "Attributes of DataFrame\\.iloc\\[:, 0\\] "
+        '\\(column name="a"\\) are different\n\n'
+        'Attribute "dtype" are different\n'
+        "\\[left\\]:  interval\\[int64, right\\]\n"
+        "\\[right\\]: object"
+    )
+
+    tm.assert_frame_equal(left, right, check_dtype=False)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(left, right, check_dtype=True)
+
+
+def test_assert_frame_equal_ignore_extension_dtype_mismatch():
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    right = DataFrame({"a": [1, 2, 3]}, dtype="Int32")
+    tm.assert_frame_equal(left, right, check_dtype=False)
+
+
+def test_assert_frame_equal_ignore_extension_dtype_mismatch_cross_class():
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+    right = DataFrame({"a": [1, 2, 3]}, dtype="int64")
+    tm.assert_frame_equal(left, right, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "dtype", ["timedelta64[ns]", "datetime64[ns, UTC]", "Period[D]"]
+)
+def test_assert_frame_equal_datetime_like_dtype_mismatch(dtype):
+    df1 = DataFrame({"a": []}, dtype=dtype)
+    df2 = DataFrame({"a": []})
+    tm.assert_frame_equal(df1, df2, check_dtype=False)
+
+
+def test_allows_duplicate_labels():
+    left = DataFrame()
+    right = DataFrame().set_flags(allows_duplicate_labels=False)
+    tm.assert_frame_equal(left, left)
+    tm.assert_frame_equal(right, right)
+    tm.assert_frame_equal(left, right, check_flags=False)
+    tm.assert_frame_equal(right, left, check_flags=False)
+
+    with pytest.raises(AssertionError, match="<Flags"):
+        tm.assert_frame_equal(left, right)
+
+    with pytest.raises(AssertionError, match="<Flags"):
+        tm.assert_frame_equal(left, right)
+
+
+def test_assert_frame_equal_columns_mixed_dtype():
+    # GH#39168
+    df = DataFrame([[0, 1, 2]], columns=["foo", "bar", 42], index=[1, "test", 2])
+    tm.assert_frame_equal(df, df, check_like=True)
+
+
+def test_frame_equal_extension_dtype(frame_or_series, any_numeric_ea_dtype):
+    # GH#39410
+    obj = frame_or_series([1, 2], dtype=any_numeric_ea_dtype)
+    tm.assert_equal(obj, obj, check_exact=True)
+
+
+@pytest.mark.parametrize("indexer", [(0, 1), (1, 0)])
+def test_frame_equal_mixed_dtypes(frame_or_series, any_numeric_ea_dtype, indexer):
+    dtypes = (any_numeric_ea_dtype, "int64")
+    obj1 = frame_or_series([1, 2], dtype=dtypes[indexer[0]])
+    obj2 = frame_or_series([1, 2], dtype=dtypes[indexer[1]])
+    tm.assert_equal(obj1, obj2, check_exact=True, check_dtype=False)
+
+
+def test_assert_frame_equal_check_like_different_indexes():
+    # GH#39739
+    df1 = DataFrame(index=pd.Index([], dtype="object"))
+    df2 = DataFrame(index=pd.RangeIndex(start=0, stop=0, step=1))
+    with pytest.raises(AssertionError, match="DataFrame.index are different"):
+        tm.assert_frame_equal(df1, df2, check_like=True)
+
+
+def test_assert_frame_equal_checking_allow_dups_flag():
+    # GH#45554
+    left = DataFrame([[1, 2], [3, 4]])
+    left.flags.allows_duplicate_labels = False
+
+    right = DataFrame([[1, 2], [3, 4]])
+    right.flags.allows_duplicate_labels = True
+    tm.assert_frame_equal(left, right, check_flags=False)
+
+    with pytest.raises(AssertionError, match="allows_duplicate_labels"):
+        tm.assert_frame_equal(left, right, check_flags=True)
+
+
+def test_assert_frame_equal_check_like_categorical_midx():
+    # GH#48975
+    left = DataFrame(
+        [[1], [2], [3]],
+        index=pd.MultiIndex.from_arrays(
+            [
+                pd.Categorical(["a", "b", "c"]),
+                pd.Categorical(["a", "b", "c"]),
+            ]
+        ),
+    )
+    right = DataFrame(
+        [[3], [2], [1]],
+        index=pd.MultiIndex.from_arrays(
+            [
+                pd.Categorical(["c", "b", "a"]),
+                pd.Categorical(["c", "b", "a"]),
+            ]
+        ),
+    )
+    tm.assert_frame_equal(left, right, check_like=True)
+
+
+def test_assert_frame_equal_ea_column_definition_in_exception_mask():
+    # GH#50323
+    df1 = DataFrame({"a": pd.Series([pd.NA, 1], dtype="Int64")})
+    df2 = DataFrame({"a": pd.Series([1, 1], dtype="Int64")})
+
+    msg = r'DataFrame.iloc\[:, 0\] \(column name="a"\) NA mask values are different'
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2)
+
+
+def test_assert_frame_equal_ea_column_definition_in_exception():
+    # GH#50323
+    df1 = DataFrame({"a": pd.Series([pd.NA, 1], dtype="Int64")})
+    df2 = DataFrame({"a": pd.Series([pd.NA, 2], dtype="Int64")})
+
+    msg = r'DataFrame.iloc\[:, 0\] \(column name="a"\) values are different'
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2, check_exact=True)
+
+
+def test_assert_frame_equal_ts_column():
+    # GH#50323
+    df1 = DataFrame({"a": [pd.Timestamp("2019-12-31"), pd.Timestamp("2020-12-31")]})
+    df2 = DataFrame({"a": [pd.Timestamp("2020-12-31"), pd.Timestamp("2020-12-31")]})
+
+    msg = r'DataFrame.iloc\[:, 0\] \(column name="a"\) values are different'
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2)
+
+
+def test_assert_frame_equal_set():
+    # GH#51727
+    df1 = DataFrame({"set_column": [{1, 2, 3}, {4, 5, 6}]})
+    df2 = DataFrame({"set_column": [{1, 2, 3}, {4, 5, 6}]})
+    tm.assert_frame_equal(df1, df2)
+
+
+def test_assert_frame_equal_set_mismatch():
+    # GH#51727
+    df1 = DataFrame({"set_column": [{1, 2, 3}, {4, 5, 6}]})
+    df2 = DataFrame({"set_column": [{1, 2, 3}, {4, 5, 7}]})
+
+    msg = r'DataFrame.iloc\[:, 0\] \(column name="set_column"\) values are different'
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_frame_equal(df1, df2)
+
+
+def test_datetimelike_compat_deprecated():
+    # GH#55638
+    df = DataFrame({"a": [1]})
+
+    msg = "the 'check_datetimelike_compat' keyword is deprecated"
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        tm.assert_frame_equal(df, df, check_datetimelike_compat=True)
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        tm.assert_frame_equal(df, df, check_datetimelike_compat=False)
+
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        tm.assert_series_equal(df["a"], df["a"], check_datetimelike_compat=True)
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        tm.assert_series_equal(df["a"], df["a"], check_datetimelike_compat=False)
+
+
+@pytest.mark.parametrize("na_value", [pd.NA, np.nan, None])
+def test_assert_frame_equal_nested_df_na(na_value):
+    # GH#43022
+    inner = DataFrame({"a": [1, na_value]})
+    df1 = DataFrame({"df": [inner]})
+    df2 = DataFrame({"df": [inner]})
+    tm.assert_frame_equal(df1, df2)
diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..8baabe97a3219b32f2a6ad2f92cd2feaaf594793
--- /dev/null
+++ b/pandas/tests/util/test_assert_index_equal.py
@@ -0,0 +1,327 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    NA,
+    Categorical,
+    CategoricalIndex,
+    Index,
+    MultiIndex,
+    NaT,
+    RangeIndex,
+)
+import pandas._testing as tm
+
+
+def test_index_equal_levels_mismatch():
+    msg = """Index are different
+
+Index levels are different
+\\[left\\]:  1, Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: 2, MultiIndex\\(\\[\\('A', 1\\),
+            \\('A', 2\\),
+            \\('B', 3\\),
+            \\('B', 4\\)\\],
+           \\)"""
+
+    idx1 = Index([1, 2, 3])
+    idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, exact=False)
+
+
+def test_index_equal_values_mismatch(check_exact):
+    msg = """MultiIndex level \\[1\\] are different
+
+MultiIndex level \\[1\\] values are different \\(25\\.0 %\\)
+\\[left\\]:  Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+    idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)])
+    idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, check_exact=check_exact)
+
+
+def test_index_equal_length_mismatch(check_exact):
+    msg = """Index are different
+
+Index length are different
+\\[left\\]:  3, Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: 4, Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+    idx1 = Index([1, 2, 3])
+    idx2 = Index([1, 2, 3, 4])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, check_exact=check_exact)
+
+
+@pytest.mark.parametrize("exact", [False, "equiv"])
+def test_index_equal_class(exact):
+    idx1 = Index([0, 1, 2])
+    idx2 = RangeIndex(3)
+
+    tm.assert_index_equal(idx1, idx2, exact=exact)
+
+
+def test_int_float_index_equal_class_mismatch(check_exact):
+    msg = """Index are different
+
+Attribute "inferred_type" are different
+\\[left\\]:  integer
+\\[right\\]: floating"""
+
+    idx1 = Index([1, 2, 3])
+    idx2 = Index([1, 2, 3], dtype=np.float64)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, exact=True, check_exact=check_exact)
+
+
+def test_range_index_equal_class_mismatch(check_exact):
+    msg = """Index are different
+
+Index classes are different
+\\[left\\]:  Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: """
+
+    idx1 = Index([1, 2, 3])
+    idx2 = RangeIndex(range(3))
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, exact=True, check_exact=check_exact)
+
+
+def test_index_equal_values_close(check_exact):
+    idx1 = Index([1, 2, 3.0])
+    idx2 = Index([1, 2, 3.0000000001])
+
+    if check_exact:
+        msg = """Index are different
+
+Index values are different \\(33\\.33333 %\\)
+\\[left\\]:  Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\)
+\\[right\\]: Index\\(\\[1.0, 2.0, 3.0000000001\\], dtype='float64'\\)"""
+
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_index_equal(idx1, idx2, check_exact=check_exact)
+    else:
+        tm.assert_index_equal(idx1, idx2, check_exact=check_exact)
+
+
+def test_index_equal_values_less_close(check_exact, rtol):
+    idx1 = Index([1, 2, 3.0])
+    idx2 = Index([1, 2, 3.0001])
+    kwargs = {"check_exact": check_exact, "rtol": rtol}
+
+    if check_exact or rtol < 0.5e-3:
+        msg = """Index are different
+
+Index values are different \\(33\\.33333 %\\)
+\\[left\\]:  Index\\(\\[1.0, 2.0, 3.0], dtype='float64'\\)
+\\[right\\]: Index\\(\\[1.0, 2.0, 3.0001\\], dtype='float64'\\)"""
+
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_index_equal(idx1, idx2, **kwargs)
+    else:
+        tm.assert_index_equal(idx1, idx2, **kwargs)
+
+
+def test_index_equal_values_too_far(check_exact, rtol):
+    idx1 = Index([1, 2, 3])
+    idx2 = Index([1, 2, 4])
+    kwargs = {"check_exact": check_exact, "rtol": rtol}
+
+    msg = """Index are different
+
+Index values are different \\(33\\.33333 %\\)
+\\[left\\]:  Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: Index\\(\\[1, 2, 4\\], dtype='int64'\\)"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, **kwargs)
+
+
+@pytest.mark.parametrize("check_order", [True, False])
+def test_index_equal_value_order_mismatch(check_exact, rtol, check_order):
+    idx1 = Index([1, 2, 3])
+    idx2 = Index([3, 2, 1])
+
+    msg = """Index are different
+
+Index values are different \\(66\\.66667 %\\)
+\\[left\\]:  Index\\(\\[1, 2, 3\\], dtype='int64'\\)
+\\[right\\]: Index\\(\\[3, 2, 1\\], dtype='int64'\\)"""
+
+    if check_order:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_index_equal(
+                idx1, idx2, check_exact=check_exact, rtol=rtol, check_order=True
+            )
+    else:
+        tm.assert_index_equal(
+            idx1, idx2, check_exact=check_exact, rtol=rtol, check_order=False
+        )
+
+
+def test_index_equal_level_values_mismatch(check_exact, rtol):
+    idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)])
+    idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)])
+    kwargs = {"check_exact": check_exact, "rtol": rtol}
+
+    msg = """MultiIndex level \\[1\\] are different
+
+MultiIndex level \\[1\\] values are different \\(25\\.0 %\\)
+\\[left\\]:  Index\\(\\[2, 2, 3, 4\\], dtype='int64'\\)
+\\[right\\]: Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2, **kwargs)
+
+
+@pytest.mark.parametrize(
+    "name1,name2",
+    [(None, "x"), ("x", "x"), (np.nan, np.nan), (NaT, NaT), (np.nan, NaT)],
+)
+def test_index_equal_names(name1, name2):
+    idx1 = Index([1, 2, 3], name=name1)
+    idx2 = Index([1, 2, 3], name=name2)
+
+    if name1 == name2 or name1 is name2:
+        tm.assert_index_equal(idx1, idx2)
+    else:
+        name1 = "'x'" if name1 == "x" else name1
+        name2 = "'x'" if name2 == "x" else name2
+        msg = f"""Index are different
+
+Attribute "names" are different
+\\[left\\]:  \\[{name1}\\]
+\\[right\\]: \\[{name2}\\]"""
+
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_index_equal(idx1, idx2)
+
+
+def test_index_equal_category_mismatch(check_categorical, using_infer_string):
+    if using_infer_string:
+        dtype = "str"
+    else:
+        dtype = "object"
+    msg = f"""Index are different
+
+Attribute "dtype" are different
+\\[left\\]:  CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \
+categories_dtype={dtype}\\)
+\\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \
+ordered=False, categories_dtype={dtype}\\)"""
+
+    idx1 = Index(Categorical(["a", "b"]))
+    idx2 = Index(Categorical(["a", "b"], categories=["a", "b", "c"]))
+
+    if check_categorical:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical)
+    else:
+        tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical)
+
+
+@pytest.mark.parametrize("exact", [False, True])
+def test_index_equal_range_categories(check_categorical, exact):
+    # GH41263
+    msg = """\
+Index are different
+
+Index classes are different
+\\[left\\]:  RangeIndex\\(start=0, stop=10, step=1\\)
+\\[right\\]: Index\\(\\[0, 1, 2, 3, 4, 5, 6, 7, 8, 9\\], dtype='int64'\\)"""
+
+    rcat = CategoricalIndex(RangeIndex(10))
+    icat = CategoricalIndex(list(range(10)))
+
+    if check_categorical and exact:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_index_equal(rcat, icat, check_categorical=True, exact=True)
+    else:
+        tm.assert_index_equal(
+            rcat, icat, check_categorical=check_categorical, exact=exact
+        )
+
+
+def test_assert_index_equal_different_inferred_types():
+    # GH#31884
+    msg = """\
+Index are different
+
+Attribute "inferred_type" are different
+\\[left\\]:  mixed
+\\[right\\]: datetime"""
+
+    idx1 = Index([NA, np.datetime64("nat")])
+    idx2 = Index([NA, NaT])
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_index_equal(idx1, idx2)
+
+
+def test_assert_index_equal_different_names_check_order_false():
+    # GH#47328
+    idx1 = Index([1, 3], name="a")
+    idx2 = Index([3, 1], name="b")
+    with pytest.raises(AssertionError, match='"names" are different'):
+        tm.assert_index_equal(idx1, idx2, check_order=False, check_names=True)
+
+
+def test_assert_index_equal_mixed_dtype():
+    # GH#39168
+    idx = Index(["foo", "bar", 42])
+    tm.assert_index_equal(idx, idx, check_order=False)
+
+
+def test_assert_index_equal_ea_dtype_order_false(any_numeric_ea_dtype):
+    # GH#47207
+    idx1 = Index([1, 3], dtype=any_numeric_ea_dtype)
+    idx2 = Index([3, 1], dtype=any_numeric_ea_dtype)
+    tm.assert_index_equal(idx1, idx2, check_order=False)
+
+
+def test_assert_index_equal_object_ints_order_false():
+    # GH#47207
+    idx1 = Index([1, 3], dtype="object")
+    idx2 = Index([3, 1], dtype="object")
+    tm.assert_index_equal(idx1, idx2, check_order=False)
+
+
+@pytest.mark.parametrize("check_categorical", [True, False])
+@pytest.mark.parametrize("check_names", [True, False])
+def test_assert_ea_index_equal_non_matching_na(check_names, check_categorical):
+    # GH#48608
+    idx1 = Index([1, 2], dtype="Int64")
+    idx2 = Index([1, NA], dtype="Int64")
+    with pytest.raises(AssertionError, match="50.0 %"):
+        tm.assert_index_equal(
+            idx1, idx2, check_names=check_names, check_categorical=check_categorical
+        )
+
+
+@pytest.mark.parametrize("check_categorical", [True, False])
+def test_assert_multi_index_dtype_check_categorical(check_categorical):
+    # GH#52126
+    idx1 = MultiIndex.from_arrays([Categorical(np.array([1, 2], dtype=np.uint64))])
+    idx2 = MultiIndex.from_arrays([Categorical(np.array([1, 2], dtype=np.int64))])
+    if check_categorical:
+        with pytest.raises(
+            AssertionError, match=r"^MultiIndex level \[0\] are different"
+        ):
+            tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical)
+    else:
+        tm.assert_index_equal(idx1, idx2, check_categorical=check_categorical)
+
+
+def test_assert_index_equal_categorical_incomparable_categories():
+    # GH#61935
+    left = Index([1, 2, 3], name="a", dtype="category")
+    right = Index([1, 2, 6], name="a", dtype="category")
+    with pytest.raises(AssertionError, match="types are not comparable"):
+        tm.assert_index_equal(left, right, check_categorical=True, exact=False)
diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..aad27672c0f6fa51eefe49d3ca5f32097a3ee8b8
--- /dev/null
+++ b/pandas/tests/util/test_assert_interval_array_equal.py
@@ -0,0 +1,100 @@
+import pytest
+
+from pandas import (
+    Interval,
+    interval_range,
+)
+import pandas._testing as tm
+from pandas.arrays import IntervalArray
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"start": 0, "periods": 4},
+        {"start": 1, "periods": 5},
+        {"start": 5, "end": 10, "closed": "left"},
+    ],
+)
+def test_interval_array_equal(kwargs):
+    arr = interval_range(**kwargs).values
+    tm.assert_interval_array_equal(arr, arr)
+
+
+def test_interval_array_equal_closed_mismatch():
+    kwargs = {"start": 0, "periods": 5}
+    arr1 = interval_range(closed="left", **kwargs).values
+    arr2 = interval_range(closed="right", **kwargs).values
+
+    msg = """\
+IntervalArray are different
+
+Attribute "closed" are different
+\\[left\\]:  left
+\\[right\\]: right"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_periods_mismatch():
+    kwargs = {"start": 0}
+    arr1 = interval_range(periods=5, **kwargs).values
+    arr2 = interval_range(periods=6, **kwargs).values
+
+    msg = """\
+IntervalArray.left are different
+
+IntervalArray.left shapes are different
+\\[left\\]:  \\(5,\\)
+\\[right\\]: \\(6,\\)"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_end_mismatch():
+    kwargs = {"start": 0, "periods": 5}
+    arr1 = interval_range(end=10, **kwargs).values
+    arr2 = interval_range(end=20, **kwargs).values
+
+    msg = """\
+IntervalArray.left are different
+
+IntervalArray.left values are different \\(80.0 %\\)
+\\[left\\]:  \\[0, 2, 4, 6, 8\\]
+\\[right\\]: \\[0, 4, 8, 12, 16\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_start_mismatch():
+    kwargs = {"periods": 4}
+    arr1 = interval_range(start=0, **kwargs).values
+    arr2 = interval_range(start=1, **kwargs).values
+
+    msg = """\
+IntervalArray.left are different
+
+IntervalArray.left values are different \\(100.0 %\\)
+\\[left\\]:  \\[0, 1, 2, 3\\]
+\\[right\\]: \\[1, 2, 3, 4\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_interval_array_equal(arr1, arr2)
+
+
+def test_interval_array_equal_end_mismatch_only():
+    arr1 = IntervalArray([Interval(0, 1), Interval(0, 5)])
+    arr2 = IntervalArray([Interval(0, 1), Interval(0, 6)])
+
+    msg = """\
+IntervalArray.right are different
+
+IntervalArray.right values are different \\(50.0 %\\)
+\\[left\\]:  \\[1, 5\\]
+\\[right\\]: \\[1, 6\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_interval_array_equal(arr1, arr2)
diff --git a/pandas/tests/util/test_assert_numpy_array_equal.py b/pandas/tests/util/test_assert_numpy_array_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..660402ee857e351abe81e1d0e094baf4a7f451f3
--- /dev/null
+++ b/pandas/tests/util/test_assert_numpy_array_equal.py
@@ -0,0 +1,223 @@
+import copy
+
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import Timestamp
+import pandas._testing as tm
+
+
+def test_assert_numpy_array_equal_shape_mismatch():
+    msg = """numpy array are different
+
+numpy array shapes are different
+\\[left\\]:  \\(2L*,\\)
+\\[right\\]: \\(3L*,\\)"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]))
+
+
+def test_assert_numpy_array_equal_bad_type():
+    expected = "Expected type"
+
+    with pytest.raises(AssertionError, match=expected):
+        tm.assert_numpy_array_equal(1, 2)
+
+
+@pytest.mark.parametrize(
+    "a,b,klass1,klass2",
+    [(np.array([1]), 1, "ndarray", "int"), (1, np.array([1]), "int", "ndarray")],
+)
+def test_assert_numpy_array_equal_class_mismatch(a, b, klass1, klass2):
+    msg = f"""numpy array are different
+
+numpy array classes are different
+\\[left\\]:  {klass1}
+\\[right\\]: {klass2}"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(a, b)
+
+
+def test_assert_numpy_array_equal_value_mismatch1():
+    msg = """numpy array are different
+
+numpy array values are different \\(66\\.66667 %\\)
+\\[left\\]:  \\[nan, 2\\.0, 3\\.0\\]
+\\[right\\]: \\[1\\.0, nan, 3\\.0\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(np.array([np.nan, 2, 3]), np.array([1, np.nan, 3]))
+
+
+def test_assert_numpy_array_equal_value_mismatch2():
+    msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[1, 2\\]
+\\[right\\]: \\[1, 3\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(np.array([1, 2]), np.array([1, 3]))
+
+
+def test_assert_numpy_array_equal_value_mismatch3():
+    msg = """numpy array are different
+
+numpy array values are different \\(16\\.66667 %\\)
+\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(
+            np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]])
+        )
+
+
+def test_assert_numpy_array_equal_value_mismatch4():
+    msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[1\\.1, 2\\.000001\\]
+\\[right\\]: \\[1\\.1, 2.0\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(np.array([1.1, 2.000001]), np.array([1.1, 2.0]))
+
+
+def test_assert_numpy_array_equal_value_mismatch5():
+    msg = """numpy array are different
+
+numpy array values are different \\(16\\.66667 %\\)
+\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\], \\[5, 6\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\], \\[5, 6\\]\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(
+            np.array([[1, 2], [3, 4], [5, 6]]), np.array([[1, 3], [3, 4], [5, 6]])
+        )
+
+
+def test_assert_numpy_array_equal_value_mismatch6():
+    msg = """numpy array are different
+
+numpy array values are different \\(25\\.0 %\\)
+\\[left\\]:  \\[\\[1, 2\\], \\[3, 4\\]\\]
+\\[right\\]: \\[\\[1, 3\\], \\[3, 4\\]\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(
+            np.array([[1, 2], [3, 4]]), np.array([[1, 3], [3, 4]])
+        )
+
+
+def test_assert_numpy_array_equal_shape_mismatch_override():
+    msg = """Index are different
+
+Index shapes are different
+\\[left\\]:  \\(2L*,\\)
+\\[right\\]: \\(3L*,\\)"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(np.array([1, 2]), np.array([3, 4, 5]), obj="Index")
+
+
+def test_numpy_array_equal_unicode():
+    # see gh-20503
+    #
+    # Test ensures that `tm.assert_numpy_array_equals` raises the right
+    # exception when comparing np.arrays containing differing unicode objects.
+    msg = """numpy array are different
+
+numpy array values are different \\(33\\.33333 %\\)
+\\[left\\]:  \\[á, à, ä\\]
+\\[right\\]: \\[á, à, å\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(
+            np.array(["á", "à", "ä"]), np.array(["á", "à", "å"])
+        )
+
+
+def test_numpy_array_equal_object():
+    a = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-01")])
+    b = np.array([Timestamp("2011-01-01"), Timestamp("2011-01-02")])
+
+    msg = """numpy array are different
+
+numpy array values are different \\(50\\.0 %\\)
+\\[left\\]:  \\[2011-01-01 00:00:00, 2011-01-01 00:00:00\\]
+\\[right\\]: \\[2011-01-01 00:00:00, 2011-01-02 00:00:00\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(a, b)
+
+
+@pytest.mark.parametrize("other_type", ["same", "copy"])
+@pytest.mark.parametrize("check_same", ["same", "copy"])
+def test_numpy_array_equal_copy_flag(other_type, check_same):
+    a = np.array([1, 2, 3])
+    msg = None
+
+    if other_type == "same":
+        other = a.view()
+    else:
+        other = a.copy()
+
+    if check_same != other_type:
+        msg = (
+            r"array\(\[1, 2, 3\]\) is not array\(\[1, 2, 3\]\)"
+            if check_same == "same"
+            else r"array\(\[1, 2, 3\]\) is array\(\[1, 2, 3\]\)"
+        )
+
+    if msg is not None:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_numpy_array_equal(a, other, check_same=check_same)
+    else:
+        tm.assert_numpy_array_equal(a, other, check_same=check_same)
+
+
+def test_numpy_array_equal_contains_na():
+    # https://github.com/pandas-dev/pandas/issues/31881
+    a = np.array([True, False])
+    b = np.array([True, pd.NA], dtype=object)
+
+    msg = """numpy array are different
+
+numpy array values are different \\(50.0 %\\)
+\\[left\\]:  \\[True, False\\]
+\\[right\\]: \\[True, <NA>\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(a, b)
+
+
+def test_numpy_array_equal_identical_na(nulls_fixture):
+    a = np.array([nulls_fixture], dtype=object)
+
+    tm.assert_numpy_array_equal(a, a)
+
+    # matching but not the identical object
+    if hasattr(nulls_fixture, "copy"):
+        other = nulls_fixture.copy()
+    else:
+        other = copy.copy(nulls_fixture)
+    b = np.array([other], dtype=object)
+    tm.assert_numpy_array_equal(a, b)
+
+
+def test_numpy_array_equal_different_na():
+    a = np.array([np.nan], dtype=object)
+    b = np.array([pd.NA], dtype=object)
+
+    msg = """numpy array are different
+
+numpy array values are different \\(100.0 %\\)
+\\[left\\]:  \\[nan\\]
+\\[right\\]: \\[<NA>\\]"""
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_numpy_array_equal(a, b)
diff --git a/pandas/tests/util/test_assert_produces_warning.py b/pandas/tests/util/test_assert_produces_warning.py
new file mode 100644
index 0000000000000000000000000000000000000000..9316f1452477c8b665339929cc8faf9498539b8f
--- /dev/null
+++ b/pandas/tests/util/test_assert_produces_warning.py
@@ -0,0 +1,277 @@
+""" "
+Test module for testing ``pandas._testing.assert_produces_warning``.
+"""
+
+import warnings
+
+import pytest
+
+from pandas.errors import (
+    DtypeWarning,
+    PerformanceWarning,
+)
+
+import pandas._testing as tm
+
+
+@pytest.fixture(
+    params=[
+        (RuntimeWarning, UserWarning),
+        (UserWarning, FutureWarning),
+        (FutureWarning, RuntimeWarning),
+        (DeprecationWarning, PerformanceWarning),
+        (PerformanceWarning, FutureWarning),
+        (DtypeWarning, DeprecationWarning),
+        (ResourceWarning, DeprecationWarning),
+        (FutureWarning, DeprecationWarning),
+    ],
+    ids=lambda x: type(x).__name__,
+)
+def pair_different_warnings(request):
+    """
+    Return pair or different warnings.
+
+    Useful for testing how several different warnings are handled
+    in tm.assert_produces_warning.
+    """
+    return request.param
+
+
+def f():
+    warnings.warn("f1", FutureWarning)  # pdlint: ignore[warning_class]
+    warnings.warn("f2", RuntimeWarning)
+
+
+def test_assert_produces_warning_honors_filter():
+    # Raise by default.
+    msg = r"Caused unexpected warning\(s\)"
+    with pytest.raises(AssertionError, match=msg):
+        with tm.assert_produces_warning(RuntimeWarning):
+            f()
+
+    with tm.assert_produces_warning(RuntimeWarning, raise_on_extra_warnings=False):
+        f()
+
+
+@pytest.mark.parametrize(
+    "category",
+    [
+        RuntimeWarning,
+        ResourceWarning,
+        UserWarning,
+        FutureWarning,
+        DeprecationWarning,
+        PerformanceWarning,
+        DtypeWarning,
+    ],
+)
+@pytest.mark.parametrize(
+    "message, match",
+    [
+        ("", None),
+        ("", ""),
+        ("Warning message", r".*"),
+        ("Warning message", "War"),
+        ("Warning message", r"[Ww]arning"),
+        ("Warning message", "age"),
+        ("Warning message", r"age$"),
+        ("Message 12-234 with numbers", r"\d{2}-\d{3}"),
+        ("Message 12-234 with numbers", r"^Mes.*\d{2}-\d{3}"),
+        ("Message 12-234 with numbers", r"\d{2}-\d{3}\s\S+"),
+        ("Message, which we do not match", None),
+    ],
+)
+def test_catch_warning_category_and_match(category, message, match):
+    with tm.assert_produces_warning(category, match=match):
+        warnings.warn(message, category)
+
+
+def test_fail_to_match_runtime_warning():
+    category = RuntimeWarning
+    match = "Did not see this warning"
+    unmatched = (
+        r"Did not see warning 'RuntimeWarning' matching 'Did not see this warning'. "
+        r"The emitted warning messages are "
+        r"\[RuntimeWarning\('This is not a match.'\), "
+        r"RuntimeWarning\('Another unmatched warning.'\)\]"
+    )
+    with pytest.raises(AssertionError, match=unmatched):
+        with tm.assert_produces_warning(category, match=match):
+            warnings.warn("This is not a match.", category)
+            warnings.warn("Another unmatched warning.", category)
+
+
+def test_fail_to_match_future_warning():
+    category = FutureWarning
+    match = "Warning"
+    unmatched = (
+        r"Did not see warning 'FutureWarning' matching 'Warning'. "
+        r"The emitted warning messages are "
+        r"\[FutureWarning\('This is not a match.'\), "
+        r"FutureWarning\('Another unmatched warning.'\)\]"
+    )
+    with pytest.raises(AssertionError, match=unmatched):
+        with tm.assert_produces_warning(category, match=match):
+            warnings.warn("This is not a match.", category)
+            warnings.warn("Another unmatched warning.", category)
+
+
+def test_fail_to_match_resource_warning():
+    category = ResourceWarning
+    match = r"\d+"
+    unmatched = (
+        r"Did not see warning 'ResourceWarning' matching '\\d\+'. "
+        r"The emitted warning messages are "
+        r"\[ResourceWarning\('This is not a match.'\), "
+        r"ResourceWarning\('Another unmatched warning.'\)\]"
+    )
+    with pytest.raises(AssertionError, match=unmatched):
+        with tm.assert_produces_warning(category, match=match):
+            warnings.warn("This is not a match.", category)
+            warnings.warn("Another unmatched warning.", category)
+
+
+def test_fail_to_catch_actual_warning(pair_different_warnings):
+    expected_category, actual_category = pair_different_warnings
+    match = "Did not see expected warning of class"
+    with pytest.raises(AssertionError, match=match):
+        with tm.assert_produces_warning(expected_category):
+            warnings.warn("warning message", actual_category)
+
+
+def test_ignore_extra_warning(pair_different_warnings):
+    expected_category, extra_category = pair_different_warnings
+    with tm.assert_produces_warning(expected_category, raise_on_extra_warnings=False):
+        warnings.warn("Expected warning", expected_category)
+        warnings.warn("Unexpected warning OK", extra_category)
+
+
+def test_raise_on_extra_warning(pair_different_warnings):
+    expected_category, extra_category = pair_different_warnings
+    match = r"Caused unexpected warning\(s\)"
+    with pytest.raises(AssertionError, match=match):
+        with tm.assert_produces_warning(expected_category):
+            warnings.warn("Expected warning", expected_category)
+            warnings.warn("Unexpected warning NOT OK", extra_category)
+
+
+def test_same_category_different_messages_first_match():
+    category = UserWarning
+    with tm.assert_produces_warning(category, match=r"^Match this"):
+        warnings.warn("Match this", category)
+        warnings.warn("Do not match that", category)
+        warnings.warn("Do not match that either", category)
+
+
+def test_same_category_different_messages_last_match():
+    category = DeprecationWarning
+    with tm.assert_produces_warning(category, match=r"^Match this"):
+        warnings.warn("Do not match that", category)
+        warnings.warn("Do not match that either", category)
+        warnings.warn("Match this", category)
+
+
+def test_match_multiple_warnings():
+    # https://github.com/pandas-dev/pandas/issues/47829
+    category = (FutureWarning, UserWarning)
+    with tm.assert_produces_warning(category, match=r"^Match this"):
+        warnings.warn("Match this", FutureWarning)  # pdlint: ignore[warning_class]
+        warnings.warn("Match this too", UserWarning)
+
+
+def test_must_match_multiple_warnings():
+    # https://github.com/pandas-dev/pandas/issues/56555
+    category = (FutureWarning, UserWarning)
+    msg = "Did not see expected warning of class 'UserWarning'"
+    with pytest.raises(AssertionError, match=msg):
+        with tm.assert_produces_warning(category, match=r"^Match this"):
+            warnings.warn("Match this", FutureWarning)  # pdlint: ignore[warning_class]
+
+
+def test_must_match_multiple_warnings_messages():
+    # https://github.com/pandas-dev/pandas/issues/56555
+    category = (FutureWarning, UserWarning)
+    msg = r"The emitted warning messages are \[UserWarning\('Not this'\)\]"
+    with pytest.raises(AssertionError, match=msg):
+        with tm.assert_produces_warning(category, match=r"^Match this"):
+            warnings.warn("Match this", FutureWarning)  # pdlint: ignore[warning_class]
+            warnings.warn("Not this", UserWarning)
+
+
+def test_allow_partial_match_for_multiple_warnings():
+    # https://github.com/pandas-dev/pandas/issues/56555
+    category = (FutureWarning, UserWarning)
+    with tm.assert_produces_warning(
+        category, match=r"^Match this", must_find_all_warnings=False
+    ):
+        warnings.warn("Match this", FutureWarning)  # pdlint: ignore[warning_class]
+
+
+def test_allow_partial_match_for_multiple_warnings_messages():
+    # https://github.com/pandas-dev/pandas/issues/56555
+    category = (FutureWarning, UserWarning)
+    with tm.assert_produces_warning(
+        category, match=r"^Match this", must_find_all_warnings=False
+    ):
+        warnings.warn("Match this", FutureWarning)  # pdlint: ignore[warning_class]
+        warnings.warn("Not this", UserWarning)
+
+
+def test_right_category_wrong_match_raises(pair_different_warnings):
+    target_category, other_category = pair_different_warnings
+    with pytest.raises(AssertionError, match="Did not see warning.*matching"):
+        with tm.assert_produces_warning(target_category, match=r"^Match this"):
+            warnings.warn("Do not match it", target_category)
+            warnings.warn("Match this", other_category)
+
+
+@pytest.mark.parametrize("false_or_none", [False, None])
+class TestFalseOrNoneExpectedWarning:
+    def test_raise_on_warning(self, false_or_none):
+        msg = r"Caused unexpected warning\(s\)"
+        with pytest.raises(AssertionError, match=msg):
+            with tm.assert_produces_warning(false_or_none):
+                f()
+
+    def test_no_raise_without_warning(self, false_or_none):
+        with tm.assert_produces_warning(false_or_none):
+            pass
+
+    def test_no_raise_with_false_raise_on_extra(self, false_or_none):
+        with tm.assert_produces_warning(false_or_none, raise_on_extra_warnings=False):
+            f()
+
+
+def test_raises_during_exception():
+    msg = "Did not see expected warning of class 'UserWarning'"
+    with pytest.raises(AssertionError, match=msg):
+        with tm.assert_produces_warning(UserWarning):
+            raise ValueError
+
+    with pytest.raises(AssertionError, match=msg):
+        with tm.assert_produces_warning(UserWarning):
+            warnings.warn(
+                "FutureWarning", FutureWarning
+            )  # pdlint: ignore[warning_class]
+            raise IndexError
+
+    msg = "Caused unexpected warning"
+    with pytest.raises(AssertionError, match=msg):
+        with tm.assert_produces_warning(None):
+            warnings.warn(
+                "FutureWarning", FutureWarning
+            )  # pdlint: ignore[warning_class]
+            raise SystemError
+
+
+def test_passes_during_exception():
+    with pytest.raises(SyntaxError, match="Error"):
+        with tm.assert_produces_warning(None):
+            raise SyntaxError("Error")
+
+    with pytest.raises(ValueError, match="Error"):
+        with tm.assert_produces_warning(FutureWarning, match="FutureWarning"):
+            warnings.warn(
+                "FutureWarning", FutureWarning
+            )  # pdlint: ignore[warning_class]
+            raise ValueError("Error")
diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py
new file mode 100644
index 0000000000000000000000000000000000000000..683ca1d875ac5efde9d13498b5b36f17983196db
--- /dev/null
+++ b/pandas/tests/util/test_assert_series_equal.py
@@ -0,0 +1,509 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+
+
+def _assert_series_equal_both(a, b, **kwargs):
+    """
+    Check that two Series equal.
+
+    This check is performed commutatively.
+
+    Parameters
+    ----------
+    a : Series
+        The first Series to compare.
+    b : Series
+        The second Series to compare.
+    kwargs : dict
+        The arguments passed to `tm.assert_series_equal`.
+    """
+    tm.assert_series_equal(a, b, **kwargs)
+    tm.assert_series_equal(b, a, **kwargs)
+
+
+def _assert_not_series_equal(a, b, **kwargs):
+    """
+    Check that two Series are not equal.
+
+    Parameters
+    ----------
+    a : Series
+        The first Series to compare.
+    b : Series
+        The second Series to compare.
+    kwargs : dict
+        The arguments passed to `tm.assert_series_equal`.
+    """
+    try:
+        tm.assert_series_equal(a, b, **kwargs)
+        msg = "The two Series were equal when they shouldn't have been"
+
+        pytest.fail(msg=msg)
+    except AssertionError:
+        pass
+
+
+def _assert_not_series_equal_both(a, b, **kwargs):
+    """
+    Check that two Series are not equal.
+
+    This check is performed commutatively.
+
+    Parameters
+    ----------
+    a : Series
+        The first Series to compare.
+    b : Series
+        The second Series to compare.
+    kwargs : dict
+        The arguments passed to `tm.assert_series_equal`.
+    """
+    _assert_not_series_equal(a, b, **kwargs)
+    _assert_not_series_equal(b, a, **kwargs)
+
+
+@pytest.mark.parametrize("data", [range(3), list("abc"), list("áàä")])
+def test_series_equal(data):
+    _assert_series_equal_both(Series(data), Series(data))
+
+
+@pytest.mark.parametrize(
+    "data1,data2",
+    [
+        (range(3), range(1, 4)),
+        (list("abc"), list("xyz")),
+        (list("áàä"), list("éèë")),
+        (list("áàä"), list(b"aaa")),
+        (range(3), range(4)),
+    ],
+)
+def test_series_not_equal_value_mismatch(data1, data2):
+    _assert_not_series_equal_both(Series(data1), Series(data2))
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"dtype": "float64"},  # dtype mismatch
+        {"index": [1, 2, 4]},  # index mismatch
+        {"name": "foo"},  # name mismatch
+    ],
+)
+def test_series_not_equal_metadata_mismatch(kwargs):
+    data = range(3)
+    s1 = Series(data)
+
+    s2 = Series(data, **kwargs)
+    _assert_not_series_equal_both(s1, s2)
+
+
+@pytest.mark.parametrize("data1,data2", [(0.12345, 0.12346), (0.1235, 0.1236)])
+@pytest.mark.parametrize("decimals", [0, 1, 2, 3, 5, 10])
+def test_less_precise(data1, data2, any_float_dtype, decimals):
+    rtol = 10**-decimals
+    s1 = Series([data1], dtype=any_float_dtype)
+    s2 = Series([data2], dtype=any_float_dtype)
+
+    if decimals in (5, 10) or (decimals >= 3 and abs(data1 - data2) >= 0.0005):
+        msg = "Series values are different"
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_series_equal(s1, s2, rtol=rtol)
+    else:
+        _assert_series_equal_both(s1, s2, rtol=rtol)
+
+
+@pytest.mark.parametrize(
+    "s1,s2,msg",
+    [
+        # Index
+        (
+            Series(["l1", "l2"], index=[1, 2]),
+            Series(["l1", "l2"], index=[1.0, 2.0]),
+            "Series\\.index are different",
+        ),
+        # MultiIndex
+        (
+            DataFrame.from_records(
+                {"a": [1, 2], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]
+            ).c,
+            DataFrame.from_records(
+                {"a": [1.0, 2.0], "b": [2.1, 1.5], "c": ["l1", "l2"]}, index=["a", "b"]
+            ).c,
+            "Series\\.index level \\[0\\] are different",
+        ),
+    ],
+)
+def test_series_equal_index_dtype(s1, s2, msg, check_index_type):
+    kwargs = {"check_index_type": check_index_type}
+
+    if check_index_type:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_series_equal(s1, s2, **kwargs)
+    else:
+        tm.assert_series_equal(s1, s2, **kwargs)
+
+
+@pytest.mark.parametrize("check_like", [True, False])
+def test_series_equal_order_mismatch(check_like):
+    s1 = Series([1, 2, 3], index=["a", "b", "c"])
+    s2 = Series([3, 2, 1], index=["c", "b", "a"])
+
+    if not check_like:  # Do not ignore index ordering.
+        with pytest.raises(AssertionError, match="Series.index are different"):
+            tm.assert_series_equal(s1, s2, check_like=check_like)
+    else:
+        _assert_series_equal_both(s1, s2, check_like=check_like)
+
+
+@pytest.mark.parametrize("check_index", [True, False])
+def test_series_equal_index_mismatch(check_index):
+    s1 = Series([1, 2, 3], index=["a", "b", "c"])
+    s2 = Series([1, 2, 3], index=["c", "b", "a"])
+
+    if check_index:  # Do not ignore index.
+        with pytest.raises(AssertionError, match="Series.index are different"):
+            tm.assert_series_equal(s1, s2, check_index=check_index)
+    else:
+        _assert_series_equal_both(s1, s2, check_index=check_index)
+
+
+def test_series_invalid_param_combination():
+    left = Series(dtype=object)
+    right = Series(dtype=object)
+    with pytest.raises(
+        ValueError, match="check_like must be False if check_index is False"
+    ):
+        tm.assert_series_equal(left, right, check_index=False, check_like=True)
+
+
+def test_series_equal_length_mismatch(rtol):
+    msg = """Series are different
+
+Series length are different
+\\[left\\]:  3, RangeIndex\\(start=0, stop=3, step=1\\)
+\\[right\\]: 4, RangeIndex\\(start=0, stop=4, step=1\\)"""
+
+    s1 = Series([1, 2, 3])
+    s2 = Series([1, 2, 3, 4])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(s1, s2, rtol=rtol)
+
+
+def test_series_equal_numeric_values_mismatch(rtol):
+    msg = """Series are different
+
+Series values are different \\(33\\.33333 %\\)
+\\[index\\]: \\[0, 1, 2\\]
+\\[left\\]:  \\[1, 2, 3\\]
+\\[right\\]: \\[1, 2, 4\\]"""
+
+    s1 = Series([1, 2, 3])
+    s2 = Series([1, 2, 4])
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(s1, s2, rtol=rtol)
+
+
+def test_series_equal_categorical_values_mismatch(rtol, using_infer_string):
+    dtype = "str" if using_infer_string else "object"
+    msg = f"""Series are different
+
+Series values are different \\(66\\.66667 %\\)
+\\[index\\]: \\[0, 1, 2\\]
+\\[left\\]:  \\['a', 'b', 'c'\\]
+Categories \\(3, {dtype}\\): \\['a', 'b', 'c'\\]
+\\[right\\]: \\['a', 'c', 'b'\\]
+Categories \\(3, {dtype}\\): \\['a', 'b', 'c'\\]"""
+
+    s1 = Series(Categorical(["a", "b", "c"]))
+    s2 = Series(Categorical(["a", "c", "b"]))
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(s1, s2, rtol=rtol)
+
+
+def test_series_equal_datetime_values_mismatch(rtol):
+    msg = """Series are different
+
+Series values are different \\(100.0 %\\)
+\\[index\\]: \\[0, 1, 2\\]
+\\[left\\]:  \\[1514764800000000000, 1514851200000000000, 1514937600000000000\\]
+\\[right\\]: \\[1549065600000000000, 1549152000000000000, 1549238400000000000\\]"""
+
+    s1 = Series(pd.date_range("2018-01-01", periods=3, freq="D", unit="ns"))
+    s2 = Series(pd.date_range("2019-02-02", periods=3, freq="D", unit="ns"))
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(s1, s2, rtol=rtol)
+
+
+def test_series_equal_categorical_mismatch(check_categorical, using_infer_string):
+    if using_infer_string:
+        dtype = "str"
+    else:
+        dtype = "object"
+    msg = f"""Attributes of Series are different
+
+Attribute "dtype" are different
+\\[left\\]:  CategoricalDtype\\(categories=\\['a', 'b'\\], ordered=False, \
+categories_dtype={dtype}\\)
+\\[right\\]: CategoricalDtype\\(categories=\\['a', 'b', 'c'\\], \
+ordered=False, categories_dtype={dtype}\\)"""
+
+    s1 = Series(Categorical(["a", "b"]))
+    s2 = Series(Categorical(["a", "b"], categories=list("abc")))
+
+    if check_categorical:
+        with pytest.raises(AssertionError, match=msg):
+            tm.assert_series_equal(s1, s2, check_categorical=check_categorical)
+    else:
+        _assert_series_equal_both(s1, s2, check_categorical=check_categorical)
+
+
+def test_assert_series_equal_extension_dtype_mismatch():
+    # https://github.com/pandas-dev/pandas/issues/32747
+    left = Series(pd.array([1, 2, 3], dtype="Int64"))
+    right = left.astype(int)
+
+    msg = """Attributes of Series are different
+
+Attribute "dtype" are different
+\\[left\\]:  Int64
+\\[right\\]: int[32|64]"""
+
+    tm.assert_series_equal(left, right, check_dtype=False)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(left, right, check_dtype=True)
+
+
+def test_assert_series_equal_interval_dtype_mismatch():
+    # https://github.com/pandas-dev/pandas/issues/32747
+    left = Series([pd.Interval(0, 1)], dtype="interval")
+    right = left.astype(object)
+
+    msg = """Attributes of Series are different
+
+Attribute "dtype" are different
+\\[left\\]:  interval\\[int64, right\\]
+\\[right\\]: object"""
+
+    tm.assert_series_equal(left, right, check_dtype=False)
+
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(left, right, check_dtype=True)
+
+
+def test_series_equal_series_type():
+    class MySeries(Series):
+        pass
+
+    s1 = Series([1, 2])
+    s2 = Series([1, 2])
+    s3 = MySeries([1, 2])
+
+    tm.assert_series_equal(s1, s2, check_series_type=False)
+    tm.assert_series_equal(s1, s2, check_series_type=True)
+
+    tm.assert_series_equal(s1, s3, check_series_type=False)
+    tm.assert_series_equal(s3, s1, check_series_type=False)
+
+    with pytest.raises(AssertionError, match="Series classes are different"):
+        tm.assert_series_equal(s1, s3, check_series_type=True)
+
+    with pytest.raises(AssertionError, match="Series classes are different"):
+        tm.assert_series_equal(s3, s1, check_series_type=True)
+
+
+def test_series_equal_exact_for_nonnumeric():
+    # https://github.com/pandas-dev/pandas/issues/35446
+    s1 = Series(["a", "b"])
+    s2 = Series(["a", "b"])
+    s3 = Series(["b", "a"])
+
+    tm.assert_series_equal(s1, s2, check_exact=True)
+    tm.assert_series_equal(s2, s1, check_exact=True)
+
+    msg = """Series are different
+
+Series values are different \\(100\\.0 %\\)
+\\[index\\]: \\[0, 1\\]
+\\[left\\]:  \\[a, b\\]
+\\[right\\]: \\[b, a\\]"""
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(s1, s3, check_exact=True)
+
+    msg = """Series are different
+
+Series values are different \\(100\\.0 %\\)
+\\[index\\]: \\[0, 1\\]
+\\[left\\]:  \\[b, a\\]
+\\[right\\]: \\[a, b\\]"""
+    with pytest.raises(AssertionError, match=msg):
+        tm.assert_series_equal(s3, s1, check_exact=True)
+
+
+def test_assert_series_equal_ignore_extension_dtype_mismatch():
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = Series([1, 2, 3], dtype="Int64")
+    right = Series([1, 2, 3], dtype="Int32")
+    tm.assert_series_equal(left, right, check_dtype=False)
+
+
+def test_assert_series_equal_ignore_extension_dtype_mismatch_cross_class():
+    # https://github.com/pandas-dev/pandas/issues/35715
+    left = Series([1, 2, 3], dtype="Int64")
+    right = Series([1, 2, 3], dtype="int64")
+    tm.assert_series_equal(left, right, check_dtype=False)
+
+
+def test_allows_duplicate_labels():
+    left = Series([1])
+    right = Series([1]).set_flags(allows_duplicate_labels=False)
+    tm.assert_series_equal(left, left)
+    tm.assert_series_equal(right, right)
+    tm.assert_series_equal(left, right, check_flags=False)
+    tm.assert_series_equal(right, left, check_flags=False)
+
+    with pytest.raises(AssertionError, match="<Flags"):
+        tm.assert_series_equal(left, right)
+
+    with pytest.raises(AssertionError, match="<Flags"):
+        tm.assert_series_equal(left, right)
+
+
+def test_assert_series_equal_identical_na(nulls_fixture):
+    ser = Series([nulls_fixture])
+
+    tm.assert_series_equal(ser, ser.copy())
+
+    # while we're here do Index too
+    idx = pd.Index(ser)
+    tm.assert_index_equal(idx, idx.copy(deep=True))
+
+
+def test_identical_nested_series_is_equal():
+    # GH#22400
+    x = Series(
+        [
+            0,
+            0.0131142231938,
+            1.77774652865e-05,
+            np.array([0.4722720840328748, 0.4216929783681722]),
+        ]
+    )
+    y = Series(
+        [
+            0,
+            0.0131142231938,
+            1.77774652865e-05,
+            np.array([0.4722720840328748, 0.4216929783681722]),
+        ]
+    )
+    # These two arrays should be equal, nesting could cause issue
+
+    tm.assert_series_equal(x, x)
+    tm.assert_series_equal(x, x, check_exact=True)
+    tm.assert_series_equal(x, y)
+    tm.assert_series_equal(x, y, check_exact=True)
+
+
+@pytest.mark.parametrize("dtype", ["datetime64", "timedelta64"])
+def test_check_dtype_false_different_reso(dtype):
+    # GH 52449
+    ser_s = Series([1000213, 2131232, 21312331]).astype(f"{dtype}[s]")
+    ser_ms = ser_s.astype(f"{dtype}[ms]")
+    with pytest.raises(AssertionError, match="Attributes of Series are different"):
+        tm.assert_series_equal(ser_s, ser_ms)
+    tm.assert_series_equal(ser_ms, ser_s, check_dtype=False)
+
+    ser_ms -= Series([1, 1, 1]).astype(f"{dtype}[ms]")
+
+    with pytest.raises(AssertionError, match="Series are different"):
+        tm.assert_series_equal(ser_s, ser_ms)
+
+    with pytest.raises(AssertionError, match="Series are different"):
+        tm.assert_series_equal(ser_s, ser_ms, check_dtype=False)
+
+
+@pytest.mark.parametrize("dtype", ["Int64", "int64"])
+def test_large_unequal_ints(dtype):
+    # https://github.com/pandas-dev/pandas/issues/55882
+    left = Series([1577840521123000], dtype=dtype)
+    right = Series([1577840521123543], dtype=dtype)
+    with pytest.raises(AssertionError, match="Series are different"):
+        tm.assert_series_equal(left, right)
+
+
+@pytest.mark.parametrize("dtype", [None, object])
+@pytest.mark.parametrize("check_exact", [True, False])
+@pytest.mark.parametrize("val", [3, 3.5])
+def test_ea_and_numpy_no_dtype_check(val, check_exact, dtype):
+    # GH#56651
+    left = Series([1, 2, val], dtype=dtype)
+    right = Series(pd.array([1, 2, val]))
+    tm.assert_series_equal(left, right, check_dtype=False, check_exact=check_exact)
+
+
+def test_assert_series_equal_int_tol():
+    # GH#56646
+    left = Series([81, 18, 121, 38, 74, 72, 81, 81, 146, 81, 81, 170, 74, 74])
+    right = Series([72, 9, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72, 72])
+    tm.assert_series_equal(left, right, rtol=1.5)
+
+    tm.assert_frame_equal(left.to_frame(), right.to_frame(), rtol=1.5)
+    tm.assert_extension_array_equal(
+        left.astype("Int64").values, right.astype("Int64").values, rtol=1.5
+    )
+
+
+@pytest.mark.parametrize(
+    "left_idx, right_idx",
+    [
+        (
+            pd.Index([0, 0.2, 0.4, 0.6, 0.8, 1]),
+            pd.Index(np.linspace(0, 1, 6)),
+        ),
+        (
+            pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], [0, 0.2, 0.4, 0.6, 0.8, 1]]),
+            pd.MultiIndex.from_arrays([[0, 0, 0, 0, 1, 1], np.linspace(0, 1, 6)]),
+        ),
+        (
+            pd.MultiIndex.from_arrays(
+                [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000001]]
+            ),
+            pd.MultiIndex.from_arrays(
+                [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 4, 5, 10000000000002]]
+            ),
+        ),
+        pytest.param(
+            pd.Index([1, 2, 3, 4, 5, 10000000000001]),
+            pd.Index([1, 2, 3, 4, 5, 10000000000002]),
+            marks=pytest.mark.xfail(reason="check_exact_index defaults to True"),
+        ),
+        pytest.param(
+            pd.MultiIndex.from_arrays(
+                [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000001]]
+            ),
+            pd.MultiIndex.from_arrays(
+                [[0, 0, 0, 0, 1, 1], [1, 2, 3, 4, 5, 10000000000002]]
+            ),
+            marks=pytest.mark.xfail(reason="check_exact_index defaults to True"),
+        ),
+    ],
+)
+def test_assert_series_equal_check_exact_index_default(left_idx, right_idx):
+    # GH#57067
+    ser1 = Series(np.zeros(6, dtype=int), left_idx)
+    ser2 = Series(np.zeros(6, dtype=int), right_idx)
+    tm.assert_series_equal(ser1, ser2)
+    tm.assert_frame_equal(ser1.to_frame(), ser2.to_frame())
diff --git a/pandas/tests/util/test_deprecate.py b/pandas/tests/util/test_deprecate.py
new file mode 100644
index 0000000000000000000000000000000000000000..94c8fe7fd45d12b78e7830d0b4cfda69d081642f
--- /dev/null
+++ b/pandas/tests/util/test_deprecate.py
@@ -0,0 +1,73 @@
+from textwrap import dedent
+
+import pytest
+
+from pandas.util._decorators import deprecate
+
+import pandas._testing as tm
+
+
+def new_func():
+    """
+    This is the summary. The deprecate directive goes next.
+
+    This is the extended summary. The deprecate directive goes before this.
+    """
+    return "new_func called"
+
+
+def new_func_no_docstring():
+    return "new_func_no_docstring called"
+
+
+def new_func_wrong_docstring():
+    """Summary should be in the next line."""
+    return "new_func_wrong_docstring called"
+
+
+def new_func_with_deprecation():
+    """
+    This is the summary. The deprecate directive goes next.
+
+    .. deprecated:: 1.0
+        Use new_func instead.
+
+    This is the extended summary. The deprecate directive goes before this.
+    """
+
+
+def test_deprecate_ok():
+    depr_func = deprecate(
+        FutureWarning, "depr_func", new_func, "1.0", msg="Use new_func instead."
+    )
+
+    with tm.assert_produces_warning(FutureWarning):
+        result = depr_func()
+
+    assert result == "new_func called"
+    assert depr_func.__doc__ == dedent(new_func_with_deprecation.__doc__)
+
+
+def test_deprecate_no_docstring():
+    depr_func = deprecate(
+        FutureWarning,
+        "depr_func",
+        new_func_no_docstring,
+        "1.0",
+        msg="Use new_func instead.",
+    )
+    with tm.assert_produces_warning(FutureWarning):
+        result = depr_func()
+    assert result == "new_func_no_docstring called"
+
+
+def test_deprecate_wrong_docstring():
+    msg = "deprecate needs a correctly formatted docstring"
+    with pytest.raises(AssertionError, match=msg):
+        deprecate(
+            FutureWarning,
+            "depr_func",
+            new_func_wrong_docstring,
+            "1.0",
+            msg="Use new_func instead.",
+        )
diff --git a/pandas/tests/util/test_deprecate_kwarg.py b/pandas/tests/util/test_deprecate_kwarg.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae3519638ca03655d1025ddfa3ed74d395bb0fc9
--- /dev/null
+++ b/pandas/tests/util/test_deprecate_kwarg.py
@@ -0,0 +1,90 @@
+import pytest
+
+from pandas.util._decorators import deprecate_kwarg
+
+import pandas._testing as tm
+
+
+@deprecate_kwarg(FutureWarning, "old", new_arg_name="new")
+def _f1(new=False):
+    return new
+
+
+_f2_mappings = {"yes": True, "no": False}
+
+
+@deprecate_kwarg(FutureWarning, "old", new_arg_name="new", mapping=_f2_mappings)
+def _f2(new=False):
+    return new
+
+
+def _f3_mapping(x):
+    return x + 1
+
+
+@deprecate_kwarg(FutureWarning, "old", new_arg_name="new", mapping=_f3_mapping)
+def _f3(new=0):
+    return new
+
+
+@pytest.mark.parametrize("key,klass", [("old", FutureWarning), ("new", None)])
+def test_deprecate_kwarg(key, klass):
+    x = 78
+
+    with tm.assert_produces_warning(klass):
+        assert _f1(**{key: x}) == x
+
+
+@pytest.mark.parametrize("key", list(_f2_mappings.keys()))
+def test_dict_deprecate_kwarg(key):
+    with tm.assert_produces_warning(FutureWarning):
+        assert _f2(old=key) == _f2_mappings[key]
+
+
+@pytest.mark.parametrize("key", ["bogus", 12345, -1.23])
+def test_missing_deprecate_kwarg(key):
+    with tm.assert_produces_warning(FutureWarning):
+        assert _f2(old=key) == key
+
+
+@pytest.mark.parametrize("x", [1, -1.4, 0])
+def test_callable_deprecate_kwarg(x):
+    with tm.assert_produces_warning(FutureWarning):
+        assert _f3(old=x) == _f3_mapping(x)
+
+
+def test_callable_deprecate_kwarg_fail():
+    msg = "((can only|cannot) concatenate)|(must be str)|(Can't convert)"
+
+    with pytest.raises(TypeError, match=msg):
+        _f3(old="hello")
+
+
+def test_bad_deprecate_kwarg():
+    msg = "mapping from old to new argument values must be dict or callable!"
+
+    with pytest.raises(TypeError, match=msg):
+
+        @deprecate_kwarg(FutureWarning, "old", "new", 0)
+        def f4(new=None):
+            return new
+
+
+@deprecate_kwarg(FutureWarning, "old", new_arg_name=None)
+def _f4(old=True, unchanged=True):
+    return old, unchanged
+
+
+@pytest.mark.parametrize("key", ["old", "unchanged"])
+def test_deprecate_keyword(key):
+    x = 9
+
+    if key == "old":
+        klass = FutureWarning
+        expected = (x, True)
+    else:
+        klass = None
+        expected = (True, x)
+
+    with tm.assert_produces_warning(klass):
+        assert _f4(**{key: x}) == expected
diff --git a/pandas/tests/util/test_deprecate_nonkeyword_arguments.py b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9300adffc0d6079fe0d38d7b05407a54072c8f9
--- /dev/null
+++ b/pandas/tests/util/test_deprecate_nonkeyword_arguments.py
@@ -0,0 +1,151 @@
+"""
+Tests for the `deprecate_nonkeyword_arguments` decorator
+"""
+
+import inspect
+
+from pandas.errors import _CurrentDeprecationWarning
+from pandas.util._decorators import deprecate_nonkeyword_arguments
+
+import pandas._testing as tm
+
+WARNING_CATEGORY = _CurrentDeprecationWarning
+
+
+@deprecate_nonkeyword_arguments(
+    WARNING_CATEGORY, allowed_args=["a", "b"], name="f_add_inputs"
+)
+def f(a, b=0, c=0, d=0):
+    return a + b + c + d
+
+
+def test_f_signature():
+    assert str(inspect.signature(f)) == "(a, b=0, *, c=0, d=0)"
+
+
+def test_one_argument():
+    with tm.assert_produces_warning(None):
+        assert f(19) == 19
+
+
+def test_one_and_one_arguments():
+    with tm.assert_produces_warning(None):
+        assert f(19, d=6) == 25
+
+
+def test_two_arguments():
+    with tm.assert_produces_warning(None):
+        assert f(1, 5) == 6
+
+
+def test_two_and_two_arguments():
+    with tm.assert_produces_warning(None):
+        assert f(1, 3, c=3, d=5) == 12
+
+
+def test_three_arguments():
+    with tm.assert_produces_warning(WARNING_CATEGORY):
+        assert f(6, 3, 3) == 12
+
+
+def test_four_arguments():
+    with tm.assert_produces_warning(WARNING_CATEGORY):
+        assert f(1, 2, 3, 4) == 10
+
+
+def test_three_arguments_with_name_in_warning():
+    msg = (
+        f"Starting with pandas version {WARNING_CATEGORY.version()} all arguments of "
+        "f_add_inputs except for the arguments 'a' and 'b' will be keyword-only."
+    )
+    with tm.assert_produces_warning(WARNING_CATEGORY, match=msg):
+        assert f(6, 3, 3) == 12
+
+
+@deprecate_nonkeyword_arguments(WARNING_CATEGORY)
+def g(a, b=0, c=0, d=0):
+    with tm.assert_produces_warning(None):
+        return a + b + c + d
+
+
+def test_g_signature():
+    assert str(inspect.signature(g)) == "(a, *, b=0, c=0, d=0)"
+
+
+def test_one_and_three_arguments_default_allowed_args():
+    with tm.assert_produces_warning(None):
+        assert g(1, b=3, c=3, d=5) == 12
+
+
+def test_three_arguments_default_allowed_args():
+    with tm.assert_produces_warning(WARNING_CATEGORY):
+        assert g(6, 3, 3) == 12
+
+
+def test_three_positional_argument_with_warning_message_analysis():
+    msg = (
+        f"Starting with pandas version {WARNING_CATEGORY.version()} all arguments of g "
+        "except for the argument 'a' will be keyword-only."
+    )
+    with tm.assert_produces_warning(WARNING_CATEGORY, match=msg):
+        assert g(6, 3, 3) == 12
+
+
+@deprecate_nonkeyword_arguments(WARNING_CATEGORY)
+def h(a=0, b=0, c=0, d=0):
+    return a + b + c + d
+
+
+def test_h_signature():
+    assert str(inspect.signature(h)) == "(*, a=0, b=0, c=0, d=0)"
+
+
+def test_all_keyword_arguments():
+    with tm.assert_produces_warning(None):
+        assert h(a=1, b=2) == 3
+
+
+def test_one_positional_argument():
+    with tm.assert_produces_warning(WARNING_CATEGORY):
+        assert h(23) == 23
+
+
+def test_one_positional_argument_with_warning_message_analysis():
+    msg = (
+        f"Starting with pandas version {WARNING_CATEGORY.version()} all arguments "
+        "of h will be keyword-only."
+    )
+    with tm.assert_produces_warning(WARNING_CATEGORY, match=msg):
+        assert h(19) == 19
+
+
+@deprecate_nonkeyword_arguments(WARNING_CATEGORY)
+def i(a=0, /, b=0, *, c=0, d=0):
+    return a + b + c + d
+
+
+def test_i_signature():
+    assert str(inspect.signature(i)) == "(*, a=0, b=0, c=0, d=0)"
+
+
+def test_i_warns_klass():
+    with tm.assert_produces_warning(WARNING_CATEGORY):
+        assert i(1, 2) == 3
+
+
+class Foo:
+    @deprecate_nonkeyword_arguments(WARNING_CATEGORY, allowed_args=["self", "bar"])
+    def baz(self, bar=None, foobar=None): ...
+
+
+def test_foo_signature():
+    assert str(inspect.signature(Foo.baz)) == "(self, bar=None, *, foobar=None)"
+
+
+def test_class():
+    msg = (
+        rf"Starting with pandas version {WARNING_CATEGORY.version()} all arguments "
+        r"of Foo\.baz except for the argument \'bar\' will be keyword-only"
+    )
+    with tm.assert_produces_warning(WARNING_CATEGORY, match=msg):
+        Foo().baz("qux", "quox")
diff --git a/pandas/tests/util/test_doc.py b/pandas/tests/util/test_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..50859564e654fd0cffc0abdc7d37fc51d550223f
--- /dev/null
+++ b/pandas/tests/util/test_doc.py
@@ -0,0 +1,90 @@
+from textwrap import dedent
+
+from pandas.util._decorators import doc
+
+
+@doc(method="cumsum", operation="sum")
+def cumsum(whatever):
+    """
+    This is the {method} method.
+
+    It computes the cumulative {operation}.
+    """
+
+
+@doc(
+    cumsum,
+    dedent(
+        """
+        Examples
+        --------
+
+        >>> cumavg([1, 2, 3])
+        2
+        """
+    ),
+    method="cumavg",
+    operation="average",
+)
+def cumavg(whatever):
+    pass
+
+
+@doc(cumsum, method="cummax", operation="maximum")
+def cummax(whatever):
+    pass
+
+
+@doc(cummax, method="cummin", operation="minimum")
+def cummin(whatever):
+    pass
+
+
+def test_docstring_formatting():
+    docstr = dedent(
+        """
+        This is the cumsum method.
+
+        It computes the cumulative sum.
+        """
+    )
+    assert cumsum.__doc__ == docstr
+
+
+def test_docstring_appending():
+    docstr = dedent(
+        """
+        This is the cumavg method.
+
+        It computes the cumulative average.
+
+        Examples
+        --------
+
+        >>> cumavg([1, 2, 3])
+        2
+        """
+    )
+    assert cumavg.__doc__ == docstr
+
+
+def test_doc_template_from_func():
+    docstr = dedent(
+        """
+        This is the cummax method.
+
+        It computes the cumulative maximum.
+        """
+    )
+    assert cummax.__doc__ == docstr
+
+
+def test_inherit_doc_template():
+    docstr = dedent(
+        """
+        This is the cummin method.
+
+        It computes the cumulative minimum.
+        """
+    )
+    assert cummin.__doc__ == docstr
diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6bc7017c2483c7ee0996a81b56f9e8e67cc894d
--- /dev/null
+++ b/pandas/tests/util/test_hashing.py
@@ -0,0 +1,418 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    period_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+from pandas.core.util.hashing import hash_tuples
+from pandas.util import (
+    hash_array,
+    hash_pandas_object,
+)
+
+
+@pytest.fixture(
+    params=[
+        Series([1, 2, 3] * 3, dtype="int32"),
+        Series([None, 2.5, 3.5] * 3, dtype="float32"),
+        Series(["a", "b", "c"] * 3, dtype="category"),
+        Series(["d", "e", "f"] * 3),
+        Series([True, False, True] * 3),
+        Series(pd.date_range("20130101", periods=9)),
+        Series(pd.date_range("20130101", periods=9, tz="US/Eastern")),
+        Series(timedelta_range("2000", periods=9)),
+    ]
+)
+def series(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def index(request):
+    return request.param
+
+
+def test_consistency():
+    # Check that our hash doesn't change because of a mistake
+    # in the actual code; this is the ground truth.
+    result = hash_pandas_object(Index(["foo", "bar", "baz"]))
+    expected = Series(
+        np.array(
+            [3600424527151052760, 1374399572096150070, 477881037637427054],
+            dtype="uint64",
+        ),
+        index=["foo", "bar", "baz"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_hash_array(series):
+    arr = series.values
+    tm.assert_numpy_array_equal(hash_array(arr), hash_array(arr))
+
+
+@pytest.mark.parametrize("dtype", ["U", object])
+def test_hash_array_mixed(dtype):
+    result1 = hash_array(np.array(["3", "4", "All"]))
+    result2 = hash_array(np.array([3, 4, "All"], dtype=dtype))
+
+    tm.assert_numpy_array_equal(result1, result2)
+
+
+@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
+def test_hash_array_errors(val):
+    msg = "must pass an ndarray-like"
+    with pytest.raises(TypeError, match=msg):
+        hash_array(val)
+
+
+def test_hash_array_index_exception():
+    # GH42003 TypeError instead of AttributeError
+    obj = pd.DatetimeIndex(["2018-10-28 01:20:00"], tz="Europe/Berlin")
+
+    msg = "Use hash_pandas_object instead"
+    with pytest.raises(TypeError, match=msg):
+        hash_array(obj)
+
+
+def test_hash_tuples():
+    tuples = [(1, "one"), (1, "two"), (2, "one")]
+    result = hash_tuples(tuples)
+
+    expected = hash_pandas_object(MultiIndex.from_tuples(tuples)).values
+    tm.assert_numpy_array_equal(result, expected)
+
+    # We only need to support MultiIndex and list-of-tuples
+    msg = "|".join(["object is not iterable", "zip argument #1 must support iteration"])
+    with pytest.raises(TypeError, match=msg):
+        hash_tuples(tuples[0])
+
+
+@pytest.mark.parametrize("val", [5, "foo", pd.Timestamp("20130101")])
+def test_hash_tuples_err(val):
+    msg = "must be convertible to a list-of-tuples"
+    with pytest.raises(TypeError, match=msg):
+        hash_tuples(val)
+
+
+def test_multiindex_unique():
+    mi = MultiIndex.from_tuples([(118, 472), (236, 118), (51, 204), (102, 51)])
+    assert mi.is_unique is True
+
+    result = hash_pandas_object(mi)
+    assert result.is_unique is True
+
+
+def test_multiindex_objects():
+    mi = MultiIndex(
+        levels=[["b", "d", "a"], [1, 2, 3]],
+        codes=[[0, 1, 0, 2], [2, 0, 0, 1]],
+        names=["col1", "col2"],
+    )
+    recons = mi._sort_levels_monotonic()
+
+    # These are equal.
+    assert mi.equals(recons)
+    assert Index(mi.values).equals(Index(recons.values))
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        Series([1, 2, 3]),
+        Series([1.0, 1.5, 3.2]),
+        Series([1.0, 1.5, np.nan]),
+        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
+        Series(["a", "b", "c"]),
+        Series(["a", np.nan, "c"]),
+        Series(["a", None, "c"]),
+        Series([True, False, True]),
+        Series(dtype=object),
+        DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
+        DataFrame(),
+        DataFrame(np.full((10, 4), np.nan)),
+        DataFrame(
+            {
+                "A": [0.0, 1.0, 2.0, 3.0, 4.0],
+                "B": [0.0, 1.0, 0.0, 1.0, 0.0],
+                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+                "D": pd.date_range("20130101", periods=5),
+            }
+        ),
+        DataFrame(range(5), index=pd.date_range("2020-01-01", periods=5)),
+        Series(range(5), index=pd.date_range("2020-01-01", periods=5)),
+        Series(period_range("2020-01-01", periods=10, freq="D")),
+        Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
+    ],
+)
+def test_hash_pandas_object(obj, index):
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        Series([1, 2, 3]),
+        Series([1.0, 1.5, 3.2]),
+        Series([1.0, 1.5, np.nan]),
+        Series([1.0, 1.5, 3.2], index=[1.5, 1.1, 3.3]),
+        Series(["a", "b", "c"]),
+        Series(["a", np.nan, "c"]),
+        Series(["a", None, "c"]),
+        Series([True, False, True]),
+        DataFrame({"x": ["a", "b", "c"], "y": [1, 2, 3]}),
+        DataFrame(np.full((10, 4), np.nan)),
+        DataFrame(
+            {
+                "A": [0.0, 1.0, 2.0, 3.0, 4.0],
+                "B": [0.0, 1.0, 0.0, 1.0, 0.0],
+                "C": Index(["foo1", "foo2", "foo3", "foo4", "foo5"], dtype=object),
+                "D": pd.date_range("20130101", periods=5),
+            }
+        ),
+        DataFrame(range(5), index=pd.date_range("2020-01-01", periods=5)),
+        Series(range(5), index=pd.date_range("2020-01-01", periods=5)),
+        Series(period_range("2020-01-01", periods=10, freq="D")),
+        Series(pd.date_range("20130101", periods=3, tz="US/Eastern")),
+    ],
+)
+def test_hash_pandas_object_diff_index_non_empty(obj):
+    a = hash_pandas_object(obj, index=True)
+    b = hash_pandas_object(obj, index=False)
+    assert not (a == b).all()
+
+
+@pytest.mark.parametrize(
+    "obj",
+    [
+        Index([1, 2, 3]),
+        Index([True, False, True]),
+        timedelta_range("1 day", periods=2),
+        period_range("2020-01-01", freq="D", periods=2),
+        MultiIndex.from_product(
+            [range(5), ["foo", "bar", "baz"], pd.date_range("20130101", periods=2)]
+        ),
+        MultiIndex.from_product([pd.CategoricalIndex(list("aabc")), range(3)]),
+    ],
+)
+def test_hash_pandas_index(obj, index):
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)
+
+
+def test_hash_pandas_series(series, index):
+    a = hash_pandas_object(series, index=index)
+    b = hash_pandas_object(series, index=index)
+    tm.assert_series_equal(a, b)
+
+
+def test_hash_pandas_series_diff_index(series):
+    a = hash_pandas_object(series, index=True)
+    b = hash_pandas_object(series, index=False)
+    assert not (a == b).all()
+
+
+@pytest.mark.parametrize("klass", [Index, Series])
+@pytest.mark.parametrize("dtype", ["float64", "object"])
+def test_hash_pandas_empty_object(klass, dtype, index):
+    # These are by-definition the same with
+    # or without the index as the data is empty.
+    obj = klass([], dtype=dtype)
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)
+
+
+@pytest.mark.parametrize(
+    "s1",
+    [
+        ["a", "b", "c", "d"],
+        [1000, 2000, 3000, 4000],
+        pd.date_range(0, periods=4),
+    ],
+)
+@pytest.mark.parametrize("categorize", [True, False])
+def test_categorical_consistency(s1, categorize):
+    # see gh-15143
+    #
+    # Check that categoricals hash consistent with their values,
+    # not codes. This should work for categoricals of any dtype.
+    s1 = Series(s1)
+    s2 = s1.astype("category").cat.set_categories(s1)
+    s3 = s2.cat.set_categories(list(reversed(s1)))
+
+    # These should all hash identically.
+    h1 = hash_pandas_object(s1, categorize=categorize)
+    h2 = hash_pandas_object(s2, categorize=categorize)
+    h3 = hash_pandas_object(s3, categorize=categorize)
+
+    tm.assert_series_equal(h1, h2)
+    tm.assert_series_equal(h1, h3)
+
+
+def test_categorical_with_nan_consistency(unit):
+    dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit)
+    cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti)
+    expected = hash_array(cat, categorize=False)
+
+    ts = pd.Timestamp("2012-01-01").as_unit(unit)
+    cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts])
+    result = hash_array(cat2, categorize=False)
+
+    assert result[0] in expected
+    assert result[1] in expected
+
+
+def test_pandas_errors():
+    msg = "Unexpected type for hashing"
+    with pytest.raises(TypeError, match=msg):
+        hash_pandas_object(pd.Timestamp("20130101"))
+
+
+def test_hash_keys():
+    # Using different hash keys, should have
+    # different hashes for the same data.
+    #
+    # This only matters for object dtypes.
+    obj = Series(list("abc"))
+
+    a = hash_pandas_object(obj, hash_key="9876543210123456")
+    b = hash_pandas_object(obj, hash_key="9876543210123465")
+
+    assert (a != b).all()
+
+
+def test_df_hash_keys():
+    # DataFrame version of the test_hash_keys.
+    # https://github.com/pandas-dev/pandas/issues/41404
+    obj = DataFrame({"x": np.arange(3), "y": list("abc")})
+
+    a = hash_pandas_object(obj, hash_key="9876543210123456")
+    b = hash_pandas_object(obj, hash_key="9876543210123465")
+
+    assert (a != b).all()
+
+
+def test_df_encoding():
+    # Check that DataFrame recognizes optional encoding.
+    # https://github.com/pandas-dev/pandas/issues/41404
+    # https://github.com/pandas-dev/pandas/pull/42049
+    obj = DataFrame({"x": np.arange(3), "y": list("a+c")})
+
+    a = hash_pandas_object(obj, encoding="utf8")
+    b = hash_pandas_object(obj, encoding="utf7")
+
+    # Note that the "+" is encoded as "+-" in utf-7.
+    assert a[0] == b[0]
+    assert a[1] != b[1]
+    assert a[2] == b[2]
+
+
+def test_invalid_key():
+    # This only matters for object dtypes.
+    msg = "key should be a 16-byte string encoded"
+
+    with pytest.raises(ValueError, match=msg):
+        hash_pandas_object(Series(list("abc")), hash_key="foo")
+
+
+def test_already_encoded(index):
+    # If already encoded, then ok.
+    obj = Series(list("abc")).str.encode("utf8")
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)
+
+
+def test_alternate_encoding(index):
+    obj = Series(list("abc"))
+    a = hash_pandas_object(obj, index=index)
+    b = hash_pandas_object(obj, index=index)
+    tm.assert_series_equal(a, b)
+
+
+@pytest.mark.parametrize("l_exp", range(8))
+@pytest.mark.parametrize("l_add", [0, 1])
+def test_same_len_hash_collisions(l_exp, l_add):
+    length = 2 ** (l_exp + 8) + l_add
+    idx = np.array([str(i) for i in range(length)], dtype=object)
+
+    result = hash_array(idx, "utf8")
+    assert not result[0] == result[1]
+
+
+def test_hash_collisions():
+    # Hash collisions are bad.
+    #
+    # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
+    hashes = [
+        "Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9",
+        "Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe",
+    ]
+
+    # These should be different.
+    result1 = hash_array(np.asarray(hashes[0:1], dtype=object), "utf8")
+    expected1 = np.array([14963968704024874985], dtype=np.uint64)
+    tm.assert_numpy_array_equal(result1, expected1)
+
+    result2 = hash_array(np.asarray(hashes[1:2], dtype=object), "utf8")
+    expected2 = np.array([16428432627716348016], dtype=np.uint64)
+    tm.assert_numpy_array_equal(result2, expected2)
+
+    result = hash_array(np.asarray(hashes, dtype=object), "utf8")
+    tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))
+
+
+@pytest.mark.parametrize(
+    "data, result_data",
+    [
+        [[tuple("1"), tuple("2")], [10345501319357378243, 8331063931016360761]],
+        [[(1,), (2,)], [9408946347443669104, 3278256261030523334]],
+    ],
+)
+def test_hash_with_tuple(data, result_data):
+    # GH#28969 array containing a tuple raises on call to arr.astype(str)
+    #  apparently a numpy bug github.com/numpy/numpy/issues/9441
+
+    df = DataFrame({"data": data})
+    result = hash_pandas_object(df)
+    expected = Series(result_data, dtype=np.uint64)
+    tm.assert_series_equal(result, expected)
+
+
+def test_hashable_tuple_args():
+    # require that the elements of such tuples are themselves hashable
+
+    df3 = DataFrame(
+        {
+            "data": [
+                (
+                    1,
+                    [],
+                ),
+                (
+                    2,
+                    {},
+                ),
+            ]
+        }
+    )
+    with pytest.raises(TypeError, match="unhashable type: 'list'"):
+        hash_pandas_object(df3)
+
+
+def test_hash_object_none_key():
+    # https://github.com/pandas-dev/pandas/issues/30887
+    result = pd.util.hash_pandas_object(Series(["a", "b"]), hash_key=None)
+    expected = Series([4578374827886788867, 17338122309987883691], dtype="uint64")
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/util/test_numba.py b/pandas/tests/util/test_numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc4fa96f15ae5e279082747eca22b11c8bd4cc0
--- /dev/null
+++ b/pandas/tests/util/test_numba.py
@@ -0,0 +1,12 @@
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import option_context
+
+
+@td.skip_if_installed("numba")
+def test_numba_not_installed_option_context():
+    with pytest.raises(ImportError, match="`Import numba` failed"):
+        with option_context("compute.use_numba", True):
+            pass
diff --git a/pandas/tests/util/test_rewrite_warning.py b/pandas/tests/util/test_rewrite_warning.py
new file mode 100644
index 0000000000000000000000000000000000000000..3db5e44d4fceaaa07f000abf9abc79764c6effbc
--- /dev/null
+++ b/pandas/tests/util/test_rewrite_warning.py
@@ -0,0 +1,42 @@
+import warnings
+
+import pytest
+
+from pandas.util._exceptions import rewrite_warning
+
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+    "target_category, target_message, hit",
+    [
+        (FutureWarning, "Target message", True),
+        (FutureWarning, "Target", True),
+        (FutureWarning, "get mess", True),
+        (FutureWarning, "Missed message", False),
+        (DeprecationWarning, "Target message", False),
+    ],
+)
+@pytest.mark.parametrize(
+    "new_category",
+    [
+        None,
+        DeprecationWarning,
+    ],
+)
+def test_rewrite_warning(target_category, target_message, hit, new_category):
+    new_message = "Rewritten message"
+    if hit:
+        expected_category = new_category if new_category else target_category
+        expected_message = new_message
+    else:
+        expected_category = FutureWarning
+        expected_message = "Target message"
+    with tm.assert_produces_warning(expected_category, match=expected_message):
+        with rewrite_warning(
+            target_message, target_category, new_message, new_category
+        ):
+            warnings.warn(
+                message="Target message",
+                category=FutureWarning,  # pdlint: ignore[warning_class]
+            )
diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py
new file mode 100644
index 0000000000000000000000000000000000000000..94bc51dca3f60c5c3765a0c30e5738c52629dfa7
--- /dev/null
+++ b/pandas/tests/util/test_shares_memory.py
@@ -0,0 +1,46 @@
+import numpy as np
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+
+
+def test_shares_memory_interval():
+    obj = pd.interval_range(1, 5)
+
+    assert tm.shares_memory(obj, obj)
+    assert tm.shares_memory(obj, obj._data)
+    assert tm.shares_memory(obj, obj[::-1])
+    assert tm.shares_memory(obj, obj[:2])
+
+    assert not tm.shares_memory(obj, obj._data.copy())
+
+
+@td.skip_if_no("pyarrow")
+def test_shares_memory_string():
+    # GH#55823
+    import pyarrow as pa
+
+    obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA))
+    assert tm.shares_memory(obj, obj)
+
+    obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan))
+    assert tm.shares_memory(obj, obj)
+
+    obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string()))
+    assert tm.shares_memory(obj, obj)
+
+
+def test_shares_memory_numpy():
+    arr = np.arange(10)
+    view = arr[:5]
+    assert tm.shares_memory(arr, view)
+    arr2 = np.arange(10)
+    assert not tm.shares_memory(arr, arr2)
+
+
+def test_shares_memory_rangeindex():
+    idx = pd.RangeIndex(10)
+    arr = np.arange(10)
+    assert not tm.shares_memory(idx, arr)
diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py
new file mode 100644
index 0000000000000000000000000000000000000000..72c9db23b210880793f37227c99e99e804800f08
--- /dev/null
+++ b/pandas/tests/util/test_show_versions.py
@@ -0,0 +1,81 @@
+import json
+import os
+import re
+
+from pandas.util._print_versions import (
+    _get_dependency_info,
+    _get_sys_info,
+)
+
+import pandas as pd
+
+
+def test_show_versions(tmpdir):
+    # GH39701
+    as_json = os.path.join(tmpdir, "test_output.json")
+
+    pd.show_versions(as_json=as_json)
+
+    with open(as_json, encoding="utf-8") as fd:
+        # check if file output is valid JSON, will raise an exception if not
+        result = json.load(fd)
+
+    # Basic check that each version element is found in output
+    expected = {
+        "system": _get_sys_info(),
+        "dependencies": _get_dependency_info(),
+    }
+
+    assert result == expected
+
+
+def test_show_versions_console_json(capsys):
+    # GH39701
+    pd.show_versions(as_json=True)
+    stdout = capsys.readouterr().out
+
+    # check valid json is printed to the console if as_json is True
+    result = json.loads(stdout)
+
+    # Basic check that each version element is found in output
+    expected = {
+        "system": _get_sys_info(),
+        "dependencies": _get_dependency_info(),
+    }
+
+    assert result == expected
+
+
+def test_show_versions_console(capsys):
+    # gh-32041
+    # gh-32041
+    pd.show_versions(as_json=False)
+    result = capsys.readouterr().out
+
+    # check header
+    assert "INSTALLED VERSIONS" in result
+
+    # check full commit hash
+    assert re.search(r"commit\s*:\s[0-9a-f]{40}\n", result)
+
+    # check required dependency
+    # 2020-12-09 npdev has "dirty" in the tag
+    # 2022-05-25 npdev released with RC wo/ "dirty".
+    # Just ensure we match [0-9]+\..* since npdev version is variable
+    assert re.search(r"numpy\s*:\s[0-9]+\..*\n", result)
+
+    # check optional dependency
+    assert re.search(r"pyarrow\s*:\s([0-9]+.*|None)\n", result)
+
+
+def test_json_output_match(capsys, tmpdir):
+    # GH39701
+    pd.show_versions(as_json=True)
+    result_console = capsys.readouterr().out
+
+    out_path = os.path.join(tmpdir, "test_json.json")
+    pd.show_versions(as_json=out_path)
+    with open(out_path, encoding="utf-8") as out_fd:
+        result_file = out_fd.read()
+
+    assert result_console == result_file
diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfb8587d3924e1441ac9da0aeeaa5585c6b4fe6c
--- /dev/null
+++ b/pandas/tests/util/test_util.py
@@ -0,0 +1,58 @@
+import os
+
+import pytest
+
+from pandas import (
+    array,
+    compat,
+)
+import pandas._testing as tm
+
+
+def test_numpy_err_state_is_default():
+    expected = {"over": "warn", "divide": "warn", "invalid": "warn", "under": "ignore"}
+    import numpy as np
+
+    # The error state should be unchanged after that import.
+    assert np.geterr() == expected
+
+
+def test_convert_rows_list_to_csv_str():
+    rows_list = ["aaa", "bbb", "ccc"]
+    ret = tm.convert_rows_list_to_csv_str(rows_list)
+
+    if compat.is_platform_windows():
+        expected = "aaa\r\nbbb\r\nccc\r\n"
+    else:
+        expected = "aaa\nbbb\nccc\n"
+
+    assert ret == expected
+
+
+@pytest.mark.parametrize("strict_data_files", [True, False])
+def test_datapath_missing(datapath):
+    with pytest.raises(ValueError, match="Could not find file"):
+        datapath("not_a_file")
+
+
+def test_datapath(datapath):
+    args = ("io", "data", "csv", "iris.csv")
+
+    result = datapath(*args)
+    expected = os.path.join(os.path.dirname(os.path.dirname(__file__)), *args)
+
+    assert result == expected
+
+
+def test_external_error_raised():
+    with tm.external_error_raised(TypeError):
+        raise TypeError("Should not check this error message, so it will pass")
+
+
+def test_is_sorted():
+    arr = array([1, 2, 3], dtype="Int64")
+    tm.assert_is_sorted(arr)
+
+    arr = array([4, 2, 3], dtype="Int64")
+    with pytest.raises(AssertionError, match="ExtensionArray are different"):
+        tm.assert_is_sorted(arr)
diff --git a/pandas/tests/util/test_validate_args.py b/pandas/tests/util/test_validate_args.py
new file mode 100644
index 0000000000000000000000000000000000000000..eef0931ec28efd02e3db7a85b0b3260742c1ff2d
--- /dev/null
+++ b/pandas/tests/util/test_validate_args.py
@@ -0,0 +1,70 @@
+import pytest
+
+from pandas.util._validators import validate_args
+
+
+@pytest.fixture
+def _fname():
+    return "func"
+
+
+def test_bad_min_fname_arg_count(_fname):
+    msg = "'max_fname_arg_count' must be non-negative"
+
+    with pytest.raises(ValueError, match=msg):
+        validate_args(_fname, (None,), -1, "foo")
+
+
+def test_bad_arg_length_max_value_single(_fname):
+    args = (None, None)
+    compat_args = ("foo",)
+
+    min_fname_arg_count = 0
+    max_length = len(compat_args) + min_fname_arg_count
+    actual_length = len(args) + min_fname_arg_count
+    msg = (
+        rf"{_fname}\(\) takes at most {max_length} "
+        rf"argument \({actual_length} given\)"
+    )
+
+    with pytest.raises(TypeError, match=msg):
+        validate_args(_fname, args, min_fname_arg_count, compat_args)
+
+
+def test_bad_arg_length_max_value_multiple(_fname):
+    args = (None, None)
+    compat_args = {"foo": None}
+
+    min_fname_arg_count = 2
+    max_length = len(compat_args) + min_fname_arg_count
+    actual_length = len(args) + min_fname_arg_count
+    msg = (
+        rf"{_fname}\(\) takes at most {max_length} "
+        rf"arguments \({actual_length} given\)"
+    )
+
+    with pytest.raises(TypeError, match=msg):
+        validate_args(_fname, args, min_fname_arg_count, compat_args)
+
+
+@pytest.mark.parametrize("i", range(1, 3))
+def test_not_all_defaults(i, _fname):
+    bad_arg = "foo"
+    msg = (
+        f"the '{bad_arg}' parameter is not supported "
+        rf"in the pandas implementation of {_fname}\(\)"
+    )
+
+    compat_args = {"foo": 2, "bar": -1, "baz": 3}
+    arg_vals = (1, -1, 3)
+
+    with pytest.raises(ValueError, match=msg):
+        validate_args(_fname, arg_vals[:i], 2, compat_args)
+
+
+def test_validation(_fname):
+    # No exceptions should be raised.
+    validate_args(_fname, (None,), 2, {"out": None})
+
+    compat_args = {"axis": 1, "out": None}
+    validate_args(_fname, (1, None), 2, compat_args)
diff --git a/pandas/tests/util/test_validate_args_and_kwargs.py b/pandas/tests/util/test_validate_args_and_kwargs.py
new file mode 100644
index 0000000000000000000000000000000000000000..215026d648471c04cb8751506c03626fda73fc68
--- /dev/null
+++ b/pandas/tests/util/test_validate_args_and_kwargs.py
@@ -0,0 +1,84 @@
+import pytest
+
+from pandas.util._validators import validate_args_and_kwargs
+
+
+@pytest.fixture
+def _fname():
+    return "func"
+
+
+def test_invalid_total_length_max_length_one(_fname):
+    compat_args = ("foo",)
+    kwargs = {"foo": "FOO"}
+    args = ("FoO", "BaZ")
+
+    min_fname_arg_count = 0
+    max_length = len(compat_args) + min_fname_arg_count
+    actual_length = len(kwargs) + len(args) + min_fname_arg_count
+
+    msg = (
+        rf"{_fname}\(\) takes at most {max_length} "
+        rf"argument \({actual_length} given\)"
+    )
+
+    with pytest.raises(TypeError, match=msg):
+        validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args)
+
+
+def test_invalid_total_length_max_length_multiple(_fname):
+    compat_args = ("foo", "bar", "baz")
+    kwargs = {"foo": "FOO", "bar": "BAR"}
+    args = ("FoO", "BaZ")
+
+    min_fname_arg_count = 2
+    max_length = len(compat_args) + min_fname_arg_count
+    actual_length = len(kwargs) + len(args) + min_fname_arg_count
+
+    msg = (
+        rf"{_fname}\(\) takes at most {max_length} "
+        rf"arguments \({actual_length} given\)"
+    )
+
+    with pytest.raises(TypeError, match=msg):
+        validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args)
+
+
+@pytest.mark.parametrize("args,kwargs", [((), {"foo": -5, "bar": 2}), ((-5, 2), {})])
+def test_missing_args_or_kwargs(args, kwargs, _fname):
+    bad_arg = "bar"
+    min_fname_arg_count = 2
+
+    compat_args = {"foo": -5, bad_arg: 1}
+
+    msg = (
+        rf"the '{bad_arg}' parameter is not supported "
+        rf"in the pandas implementation of {_fname}\(\)"
+    )
+
+    with pytest.raises(ValueError, match=msg):
+        validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args)
+
+
+def test_duplicate_argument(_fname):
+    min_fname_arg_count = 2
+
+    compat_args = {"foo": None, "bar": None, "baz": None}
+    kwargs = {"foo": None, "bar": None}
+    args = (None,)  # duplicate value for "foo"
+
+    msg = rf"{_fname}\(\) got multiple values for keyword argument 'foo'"
+
+    with pytest.raises(TypeError, match=msg):
+        validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args)
+
+
+def test_validation(_fname):
+    # No exceptions should be raised.
+    compat_args = {"foo": 1, "bar": None, "baz": -2}
+    kwargs = {"baz": -2}
+
+    args = (1, None)
+    min_fname_arg_count = 2
+
+    validate_args_and_kwargs(_fname, args, kwargs, min_fname_arg_count, compat_args)
diff --git a/pandas/tests/util/test_validate_inclusive.py b/pandas/tests/util/test_validate_inclusive.py
new file mode 100644
index 0000000000000000000000000000000000000000..c1254c614ab305c447090b148ea6a036569f76e6
--- /dev/null
+++ b/pandas/tests/util/test_validate_inclusive.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pytest
+
+from pandas.util._validators import validate_inclusive
+
+import pandas as pd
+
+
+@pytest.mark.parametrize(
+    "invalid_inclusive",
+    (
+        "ccc",
+        2,
+        object(),
+        None,
+        np.nan,
+        pd.NA,
+        pd.DataFrame(),
+    ),
+)
+def test_invalid_inclusive(invalid_inclusive):
+    with pytest.raises(
+        ValueError,
+        match="Inclusive has to be either 'both', 'neither', 'left' or 'right'",
+    ):
+        validate_inclusive(invalid_inclusive)
+
+
+@pytest.mark.parametrize(
+    "valid_inclusive, expected_tuple",
+    (
+        ("left", (True, False)),
+        ("right", (False, True)),
+        ("both", (True, True)),
+        ("neither", (False, False)),
+    ),
+)
+def test_valid_inclusive(valid_inclusive, expected_tuple):
+    resultant_tuple = validate_inclusive(valid_inclusive)
+    assert expected_tuple == resultant_tuple
diff --git a/pandas/tests/util/test_validate_kwargs.py b/pandas/tests/util/test_validate_kwargs.py
new file mode 100644
index 0000000000000000000000000000000000000000..85d93638f788fa16a32ac3d83392a71c17f3cd7c
--- /dev/null
+++ b/pandas/tests/util/test_validate_kwargs.py
@@ -0,0 +1,69 @@
+import pytest
+
+from pandas.util._validators import (
+    validate_bool_kwarg,
+    validate_kwargs,
+)
+
+
+@pytest.fixture
+def _fname():
+    return "func"
+
+
+def test_bad_kwarg(_fname):
+    good_arg = "f"
+    bad_arg = good_arg + "o"
+
+    compat_args = {good_arg: "foo", bad_arg + "o": "bar"}
+    kwargs = {good_arg: "foo", bad_arg: "bar"}
+
+    msg = rf"{_fname}\(\) got an unexpected keyword argument '{bad_arg}'"
+
+    with pytest.raises(TypeError, match=msg):
+        validate_kwargs(_fname, kwargs, compat_args)
+
+
+@pytest.mark.parametrize("i", range(1, 3))
+def test_not_all_none(i, _fname):
+    bad_arg = "foo"
+    msg = (
+        rf"the '{bad_arg}' parameter is not supported "
+        rf"in the pandas implementation of {_fname}\(\)"
+    )
+
+    compat_args = {"foo": 1, "bar": "s", "baz": None}
+
+    kwarg_keys = ("foo", "bar", "baz")
+    kwarg_vals = (2, "s", None)
+
+    kwargs = dict(zip(kwarg_keys[:i], kwarg_vals[:i], strict=True))
+
+    with pytest.raises(ValueError, match=msg):
+        validate_kwargs(_fname, kwargs, compat_args)
+
+
+def test_validation(_fname):
+    # No exceptions should be raised.
+    compat_args = {"f": None, "b": 1, "ba": "s"}
+
+    kwargs = {"f": None, "b": 1}
+    validate_kwargs(_fname, kwargs, compat_args)
+
+
+@pytest.mark.parametrize("name", ["inplace", "copy"])
+@pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
+def test_validate_bool_kwarg_fail(name, value):
+    msg = (
+        f'For argument "{name}" expected type bool, '
+        f"received type {type(value).__name__}"
+    )
+
+    with pytest.raises(ValueError, match=msg):
+        validate_bool_kwarg(value, name)
+
+
+@pytest.mark.parametrize("name", ["inplace", "copy"])
+@pytest.mark.parametrize("value", [True, False, None])
+def test_validate_bool_kwarg(name, value):
+    assert validate_bool_kwarg(value, name) == value
diff --git a/pandas/tests/window/__init__.py b/pandas/tests/window/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe873b3b74254c5aeb6fbec48db19cd27e37dc1b
--- /dev/null
+++ b/pandas/tests/window/conftest.py
@@ -0,0 +1,124 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Series,
+    bdate_range,
+)
+
+
+@pytest.fixture(params=[True, False])
+def raw(request):
+    """raw keyword argument for rolling.apply"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        "sum",
+        "mean",
+        "median",
+        "max",
+        "min",
+        "var",
+        "std",
+        "kurt",
+        "skew",
+        "count",
+        "sem",
+    ]
+)
+def arithmetic_win_operators(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def center(request):
+    return request.param
+
+
+@pytest.fixture(params=[None, 1])
+def min_periods(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def adjust(request):
+    """adjust keyword argument for ewm"""
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def ignore_na(request):
+    """ignore_na keyword argument for ewm"""
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def numeric_only(request):
+    """numeric_only keyword argument"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        pytest.param("numba", marks=[td.skip_if_no("numba"), pytest.mark.single_cpu]),
+        "cython",
+    ]
+)
+def engine(request):
+    """engine keyword argument for rolling.apply"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        pytest.param(
+            ("numba", True), marks=[td.skip_if_no("numba"), pytest.mark.single_cpu]
+        ),
+        ("cython", True),
+        ("cython", False),
+    ]
+)
+def engine_and_raw(request):
+    """engine and raw keyword arguments for rolling.apply"""
+    return request.param
+
+
+@pytest.fixture(params=["1 day", timedelta(days=1), np.timedelta64(1, "D")])
+def halflife_with_times(request):
+    """Halflife argument for EWM when times is specified."""
+    return request.param
+
+
+@pytest.fixture
+def series():
+    """Make mocked series as fixture."""
+    arr = np.random.default_rng(2).standard_normal(100)
+    locs = np.arange(20, 40)
+    arr[locs] = np.nan
+    series = Series(arr, index=bdate_range(datetime(2009, 1, 1), periods=100))
+    return series
+
+
+@pytest.fixture
+def frame():
+    """Make mocked frame as fixture."""
+    return DataFrame(
+        np.random.default_rng(2).standard_normal((100, 10)),
+        index=bdate_range(datetime(2009, 1, 1), periods=100),
+    )
+
+
+@pytest.fixture(params=[None, 1, 2, 5, 10])
+def step(request):
+    """step keyword argument for rolling window operations."""
+    return request.param
diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..877b50e37670c0f2a5b9d9f816083a35e265ccbe
--- /dev/null
+++ b/pandas/tests/window/test_api.py
@@ -0,0 +1,385 @@
+import numpy as np
+import pytest
+
+from pandas.errors import (
+    DataError,
+    SpecificationError,
+)
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Period,
+    Series,
+    Timestamp,
+    concat,
+    date_range,
+    timedelta_range,
+)
+import pandas._testing as tm
+
+
+def test_getitem(step):
+    frame = DataFrame(np.random.default_rng(2).standard_normal((5, 5)))
+    r = frame.rolling(window=5, step=step)
+    tm.assert_index_equal(r._selected_obj.columns, frame[::step].columns)
+
+    r = frame.rolling(window=5, step=step)[1]
+    assert r._selected_obj.name == frame[::step].columns[1]
+
+    # technically this is allowed
+    r = frame.rolling(window=5, step=step)[1, 3]
+    tm.assert_index_equal(r._selected_obj.columns, frame[::step].columns[[1, 3]])
+
+    r = frame.rolling(window=5, step=step)[[1, 3]]
+    tm.assert_index_equal(r._selected_obj.columns, frame[::step].columns[[1, 3]])
+
+
+def test_select_bad_cols():
+    df = DataFrame([[1, 2]], columns=["A", "B"])
+    g = df.rolling(window=5)
+    with pytest.raises(KeyError, match="Columns not found: 'C'"):
+        g[["C"]]
+    with pytest.raises(KeyError, match="^[^A]+$"):
+        # A should not be referenced as a bad column...
+        # will have to rethink regex if you change message!
+        g[["A", "C"]]
+
+
+def test_attribute_access():
+    df = DataFrame([[1, 2]], columns=["A", "B"])
+    r = df.rolling(window=5)
+    tm.assert_series_equal(r.A.sum(), r["A"].sum())
+    msg = "'Rolling' object has no attribute 'F'"
+    with pytest.raises(AttributeError, match=msg):
+        r.F
+
+
+def tests_skip_nuisance(step):
+    df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"})
+    r = df.rolling(window=3, step=step)
+    result = r[["A", "B"]].sum()
+    expected = DataFrame(
+        {"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]},
+        columns=list("AB"),
+    )[::step]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_sum_object_str_raises(step):
+    df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"})
+    r = df.rolling(window=3, step=step)
+    with pytest.raises(
+        DataError, match="Cannot aggregate non-numeric type: object|str"
+    ):
+        # GH#42738, enforced in 2.0
+        r.sum()
+
+
+def test_agg(step):
+    df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
+
+    r = df.rolling(window=3, step=step)
+    a_mean = r["A"].mean()
+    a_std = r["A"].std()
+    a_sum = r["A"].sum()
+    b_mean = r["B"].mean()
+    b_std = r["B"].std()
+
+    result = r.aggregate([np.mean, lambda x: np.std(x, ddof=1)])
+    expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
+    expected.columns = MultiIndex.from_product([["A", "B"], ["mean", "<lambda>"]])
+    tm.assert_frame_equal(result, expected)
+
+    result = r.aggregate({"A": np.mean, "B": lambda x: np.std(x, ddof=1)})
+
+    expected = concat([a_mean, b_std], axis=1)
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+    result = r.aggregate({"A": ["mean", "std"]})
+    expected = concat([a_mean, a_std], axis=1)
+    expected.columns = MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
+    tm.assert_frame_equal(result, expected)
+
+    result = r["A"].aggregate(["mean", "sum"])
+    expected = concat([a_mean, a_sum], axis=1)
+    expected.columns = ["mean", "sum"]
+    tm.assert_frame_equal(result, expected)
+
+    msg = "nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        # using a dict with renaming
+        r.aggregate({"A": {"mean": "mean", "sum": "sum"}})
+
+    with pytest.raises(SpecificationError, match=msg):
+        r.aggregate(
+            {"A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}}
+        )
+
+    result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
+    expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
+
+    exp_cols = [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
+    expected.columns = MultiIndex.from_tuples(exp_cols)
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def test_agg_apply(raw):
+    # passed lambda
+    df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
+
+    r = df.rolling(window=3)
+    a_sum = r["A"].sum()
+
+    result = r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
+    rcustom = r["B"].apply(lambda x: np.std(x, ddof=1), raw=raw)
+    expected = concat([a_sum, rcustom], axis=1)
+    tm.assert_frame_equal(result, expected, check_like=True)
+
+
+def test_agg_consistency(step):
+    df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
+    r = df.rolling(window=3, step=step)
+
+    result = r.agg([np.sum, np.mean]).columns
+    expected = MultiIndex.from_product([list("AB"), ["sum", "mean"]])
+    tm.assert_index_equal(result, expected)
+
+    result = r["A"].agg([np.sum, np.mean]).columns
+    expected = Index(["sum", "mean"])
+    tm.assert_index_equal(result, expected)
+
+    result = r.agg({"A": [np.sum, np.mean]}).columns
+    expected = MultiIndex.from_tuples([("A", "sum"), ("A", "mean")])
+    tm.assert_index_equal(result, expected)
+
+
+def test_agg_nested_dicts():
+    # API change for disallowing these types of nested dicts
+    df = DataFrame({"A": range(5), "B": range(0, 10, 2)})
+    r = df.rolling(window=3)
+
+    msg = "nested renamer is not supported"
+    with pytest.raises(SpecificationError, match=msg):
+        r.aggregate({"r1": {"A": ["mean", "sum"]}, "r2": {"B": ["mean", "sum"]}})
+
+    expected = concat(
+        [r["A"].mean(), r["A"].std(), r["B"].mean(), r["B"].std()], axis=1
+    )
+    expected.columns = MultiIndex.from_tuples(
+        [("ra", "mean"), ("ra", "std"), ("rb", "mean"), ("rb", "std")]
+    )
+    with pytest.raises(SpecificationError, match=msg):
+        r[["A", "B"]].agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}})
+
+    with pytest.raises(SpecificationError, match=msg):
+        r.agg({"A": {"ra": ["mean", "std"]}, "B": {"rb": ["mean", "std"]}})
+
+
+@pytest.mark.parametrize(
+    "func,window_size",
+    [
+        (
+            "rolling",
+            2,
+        ),
+        (
+            "expanding",
+            None,
+        ),
+    ],
+)
+def test_pipe(func, window_size):
+    # Issue #57076
+    df = DataFrame(
+        {
+            "B": np.random.default_rng(2).standard_normal(10),
+            "C": np.random.default_rng(2).standard_normal(10),
+        }
+    )
+    r = getattr(df, func)(window_size)
+
+    expected = r.max() - r.mean()
+    result = r.pipe(lambda x: x.max() - x.mean())
+    tm.assert_frame_equal(result, expected)
+
+    expected = r.max() - 2 * r.min()
+    result = r.pipe(lambda x, k: x.max() - k * x.min(), k=2)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_count_nonnumeric_types(step):
+    # GH12541
+    cols = [
+        "int",
+        "float",
+        "string",
+        "datetime",
+        "timedelta",
+        "periods",
+        "fl_inf",
+        "fl_nan",
+        "str_nan",
+        "dt_nat",
+        "periods_nat",
+    ]
+    dt_nat_col = [Timestamp("20170101"), Timestamp("20170203"), Timestamp(None)]
+
+    df = DataFrame(
+        {
+            "int": [1, 2, 3],
+            "float": [4.0, 5.0, 6.0],
+            "string": list("abc"),
+            "datetime": date_range("20170101", periods=3),
+            "timedelta": timedelta_range("1 s", periods=3, freq="s"),
+            "periods": [
+                Period("2012-01"),
+                Period("2012-02"),
+                Period("2012-03"),
+            ],
+            "fl_inf": [1.0, 2.0, np.inf],
+            "fl_nan": [1.0, 2.0, np.nan],
+            "str_nan": ["aa", "bb", np.nan],
+            "dt_nat": dt_nat_col,
+            "periods_nat": [
+                Period("2012-01"),
+                Period("2012-02"),
+                Period(None),
+            ],
+        },
+        columns=cols,
+    )
+
+    expected = DataFrame(
+        {
+            "int": [1.0, 2.0, 2.0],
+            "float": [1.0, 2.0, 2.0],
+            "string": [1.0, 2.0, 2.0],
+            "datetime": [1.0, 2.0, 2.0],
+            "timedelta": [1.0, 2.0, 2.0],
+            "periods": [1.0, 2.0, 2.0],
+            "fl_inf": [1.0, 2.0, 2.0],
+            "fl_nan": [1.0, 2.0, 1.0],
+            "str_nan": [1.0, 2.0, 1.0],
+            "dt_nat": [1.0, 2.0, 1.0],
+            "periods_nat": [1.0, 2.0, 1.0],
+        },
+        columns=cols,
+    )[::step]
+
+    result = df.rolling(window=2, min_periods=0, step=step).count()
+    tm.assert_frame_equal(result, expected)
+
+    result = df.rolling(1, min_periods=0, step=step).count()
+    expected = df.notna().astype(float)[::step]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_preserve_metadata():
+    # GH 10565
+    s = Series(np.arange(100), name="foo")
+
+    s2 = s.rolling(30).sum()
+    s3 = s.rolling(20).sum()
+    assert s2.name == "foo"
+    assert s3.name == "foo"
+
+
+@pytest.mark.parametrize(
+    "func,window_size,expected_vals",
+    [
+        (
+            "rolling",
+            2,
+            [
+                [np.nan, np.nan, np.nan, np.nan],
+                [15.0, 20.0, 25.0, 20.0],
+                [25.0, 30.0, 35.0, 30.0],
+                [np.nan, np.nan, np.nan, np.nan],
+                [20.0, 30.0, 35.0, 30.0],
+                [35.0, 40.0, 60.0, 40.0],
+                [60.0, 80.0, 85.0, 80],
+            ],
+        ),
+        (
+            "expanding",
+            None,
+            [
+                [10.0, 10.0, 20.0, 20.0],
+                [15.0, 20.0, 25.0, 20.0],
+                [20.0, 30.0, 30.0, 20.0],
+                [10.0, 10.0, 30.0, 30.0],
+                [20.0, 30.0, 35.0, 30.0],
+                [26.666667, 40.0, 50.0, 30.0],
+                [40.0, 80.0, 60.0, 30.0],
+            ],
+        ),
+    ],
+)
+def test_multiple_agg_funcs(func, window_size, expected_vals):
+    # GH 15072
+    df = DataFrame(
+        [
+            ["A", 10, 20],
+            ["A", 20, 30],
+            ["A", 30, 40],
+            ["B", 10, 30],
+            ["B", 30, 40],
+            ["B", 40, 80],
+            ["B", 80, 90],
+        ],
+        columns=["stock", "low", "high"],
+    )
+
+    f = getattr(df.groupby("stock"), func)
+    if window_size:
+        window = f(window_size)
+    else:
+        window = f()
+
+    index = MultiIndex.from_tuples(
+        [("A", 0), ("A", 1), ("A", 2), ("B", 3), ("B", 4), ("B", 5), ("B", 6)],
+        names=["stock", None],
+    )
+    columns = MultiIndex.from_tuples(
+        [("low", "mean"), ("low", "max"), ("high", "mean"), ("high", "min")]
+    )
+    expected = DataFrame(expected_vals, index=index, columns=columns)
+
+    result = window.agg({"low": ["mean", "max"], "high": ["mean", "min"]})
+
+    tm.assert_frame_equal(result, expected)
+
+
+def test_dont_modify_attributes_after_methods(
+    arithmetic_win_operators, closed, center, min_periods, step
+):
+    # GH 39554
+    roll_obj = Series(range(1)).rolling(
+        1, center=center, closed=closed, min_periods=min_periods, step=step
+    )
+    expected = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes}
+    getattr(roll_obj, arithmetic_win_operators)()
+    result = {attr: getattr(roll_obj, attr) for attr in roll_obj._attributes}
+    assert result == expected
+
+
+def test_rolling_min_min_periods(step):
+    a = Series([1, 2, 3, 4, 5])
+    result = a.rolling(window=100, min_periods=1, step=step).min()
+    expected = Series(np.ones(len(a)))[::step]
+    tm.assert_series_equal(result, expected)
+    msg = "min_periods 5 must be <= window 3"
+    with pytest.raises(ValueError, match=msg):
+        Series([1, 2, 3]).rolling(window=3, min_periods=5, step=step).min()
+
+
+def test_rolling_max_min_periods(step):
+    a = Series([1, 2, 3, 4, 5], dtype=np.float64)
+    result = a.rolling(window=100, min_periods=1, step=step).max()
+    expected = a[::step]
+    tm.assert_almost_equal(result, expected)
+    msg = "min_periods 5 must be <= window 3"
+    with pytest.raises(ValueError, match=msg):
+        Series([1, 2, 3]).rolling(window=3, min_periods=5, step=step).max()
diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py
new file mode 100644
index 0000000000000000000000000000000000000000..2398713585cfbe673c511ee41cffc8172a3595b8
--- /dev/null
+++ b/pandas/tests/window/test_apply.py
@@ -0,0 +1,318 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    Timestamp,
+    concat,
+    date_range,
+    isna,
+    notna,
+)
+import pandas._testing as tm
+
+from pandas.tseries import offsets
+
+# suppress warnings about empty slices, as we are deliberately testing
+# with a 0-length Series
+pytestmark = pytest.mark.filterwarnings(
+    "ignore:.*(empty slice|0 for slice).*:RuntimeWarning"
+)
+
+
+def f(x):
+    return x[np.isfinite(x)].mean()
+
+
+@pytest.mark.parametrize("bad_raw", [None, 1, 0])
+def test_rolling_apply_invalid_raw(bad_raw):
+    with pytest.raises(ValueError, match="raw parameter must be `True` or `False`"):
+        Series(range(3)).rolling(1).apply(len, raw=bad_raw)
+
+
+def test_rolling_apply_out_of_bounds(engine_and_raw):
+    # gh-1850
+    engine, raw = engine_and_raw
+
+    vals = Series([1, 2, 3, 4])
+
+    result = vals.rolling(10).apply(np.sum, engine=engine, raw=raw)
+    assert result.isna().all()
+
+    result = vals.rolling(10, min_periods=1).apply(np.sum, engine=engine, raw=raw)
+    expected = Series([1, 3, 6, 10], dtype=float)
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [2, "2s"])
+def test_rolling_apply_with_pandas_objects(window):
+    # 5071
+    df = DataFrame(
+        {
+            "A": np.random.default_rng(2).standard_normal(5),
+            "B": np.random.default_rng(2).integers(0, 10, size=5),
+        },
+        index=date_range("20130101", periods=5, freq="s"),
+    )
+
+    # we have an equal spaced timeseries index
+    # so simulate removing the first period
+    def f(x):
+        if x.index[0] == df.index[0]:
+            return np.nan
+        return x.iloc[-1]
+
+    result = df.rolling(window).apply(f, raw=False)
+    expected = df.iloc[2:].reindex_like(df)
+    tm.assert_frame_equal(result, expected)
+
+    with tm.external_error_raised(AttributeError):
+        df.rolling(window).apply(f, raw=True)
+
+
+def test_rolling_apply(engine_and_raw, step):
+    engine, raw = engine_and_raw
+
+    expected = Series([], dtype="float64")
+    result = expected.rolling(10, step=step).apply(
+        lambda x: x.mean(), engine=engine, raw=raw
+    )
+    tm.assert_series_equal(result, expected)
+
+    # gh-8080
+    s = Series([None, None, None])
+    result = s.rolling(2, min_periods=0, step=step).apply(
+        lambda x: len(x), engine=engine, raw=raw
+    )
+    expected = Series([1.0, 2.0, 2.0])[::step]
+    tm.assert_series_equal(result, expected)
+
+    result = s.rolling(2, min_periods=0, step=step).apply(len, engine=engine, raw=raw)
+    tm.assert_series_equal(result, expected)
+
+
+def test_all_apply(engine_and_raw):
+    engine, raw = engine_and_raw
+
+    df = (
+        DataFrame(
+            {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)}
+        ).set_index("A")
+        * 2
+    )
+    er = df.rolling(window=1)
+    r = df.rolling(window="1s")
+
+    result = r.apply(lambda x: 1, engine=engine, raw=raw)
+    expected = er.apply(lambda x: 1, engine=engine, raw=raw)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_ragged_apply(engine_and_raw):
+    engine, raw = engine_and_raw
+
+    df = DataFrame({"B": range(5)})
+    df.index = [
+        Timestamp("20130101 09:00:00"),
+        Timestamp("20130101 09:00:02"),
+        Timestamp("20130101 09:00:03"),
+        Timestamp("20130101 09:00:05"),
+        Timestamp("20130101 09:00:06"),
+    ]
+
+    f = lambda x: 1
+    result = df.rolling(window="1s", min_periods=1).apply(f, engine=engine, raw=raw)
+    expected = df.copy()
+    expected["B"] = 1.0
+    tm.assert_frame_equal(result, expected)
+
+    result = df.rolling(window="2s", min_periods=1).apply(f, engine=engine, raw=raw)
+    expected = df.copy()
+    expected["B"] = 1.0
+    tm.assert_frame_equal(result, expected)
+
+    result = df.rolling(window="5s", min_periods=1).apply(f, engine=engine, raw=raw)
+    expected = df.copy()
+    expected["B"] = 1.0
+    tm.assert_frame_equal(result, expected)
+
+
+def test_invalid_engine():
+    with pytest.raises(ValueError, match="engine must be either 'numba' or 'cython'"):
+        Series(range(1)).rolling(1).apply(lambda x: x, engine="foo")
+
+
+def test_invalid_engine_kwargs_cython():
+    with pytest.raises(ValueError, match="cython engine does not accept engine_kwargs"):
+        Series(range(1)).rolling(1).apply(
+            lambda x: x, engine="cython", engine_kwargs={"nopython": False}
+        )
+
+
+def test_invalid_raw_numba():
+    with pytest.raises(
+        ValueError, match="raw must be `True` when using the numba engine"
+    ):
+        Series(range(1)).rolling(1).apply(lambda x: x, raw=False, engine="numba")
+
+
+@pytest.mark.parametrize("args_kwargs", [[None, {"par": 10}], [(10,), None]])
+def test_rolling_apply_args_kwargs(args_kwargs):
+    # GH 33433
+    def numpysum(x, par):
+        return np.sum(x + par)
+
+    df = DataFrame({"gr": [1, 1], "a": [1, 2]})
+
+    idx = Index(["gr", "a"])
+    expected = DataFrame([[11.0, 11.0], [11.0, 12.0]], columns=idx)
+
+    result = df.rolling(1).apply(numpysum, args=args_kwargs[0], kwargs=args_kwargs[1])
+    tm.assert_frame_equal(result, expected)
+
+    midx = MultiIndex.from_tuples([(1, 0), (1, 1)], names=["gr", None])
+    expected = Series([11.0, 12.0], index=midx, name="a")
+
+    gb_rolling = df.groupby("gr")["a"].rolling(1)
+
+    result = gb_rolling.apply(numpysum, args=args_kwargs[0], kwargs=args_kwargs[1])
+    tm.assert_series_equal(result, expected)
+
+
+def test_nans(raw):
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = obj.rolling(50, min_periods=30).apply(f, raw=raw)
+    tm.assert_almost_equal(result.iloc[-1], np.mean(obj[10:-10]))
+
+    # min_periods is working correctly
+    result = obj.rolling(20, min_periods=15).apply(f, raw=raw)
+    assert isna(result.iloc[23])
+    assert not isna(result.iloc[24])
+
+    assert not isna(result.iloc[-6])
+    assert isna(result.iloc[-5])
+
+    obj2 = Series(np.random.default_rng(2).standard_normal(20))
+    result = obj2.rolling(10, min_periods=5).apply(f, raw=raw)
+    assert isna(result.iloc[3])
+    assert notna(result.iloc[4])
+
+    result0 = obj.rolling(20, min_periods=0).apply(f, raw=raw)
+    result1 = obj.rolling(20, min_periods=1).apply(f, raw=raw)
+    tm.assert_almost_equal(result0, result1)
+
+
+def test_center(raw):
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = obj.rolling(20, min_periods=15, center=True).apply(f, raw=raw)
+    expected = (
+        concat([obj, Series([np.nan] * 9)])
+        .rolling(20, min_periods=15)
+        .apply(f, raw=raw)
+        .iloc[9:]
+        .reset_index(drop=True)
+    )
+    tm.assert_series_equal(result, expected)
+
+
+def test_series(raw, series):
+    result = series.rolling(50).apply(f, raw=raw)
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result.iloc[-1], np.mean(series[-50:]))
+
+
+def test_frame(raw, frame):
+    result = frame.rolling(50).apply(f, raw=raw)
+    assert isinstance(result, DataFrame)
+    tm.assert_series_equal(
+        result.iloc[-1, :],
+        frame.iloc[-50:, :].apply(np.mean, axis=0, raw=raw),
+        check_names=False,
+    )
+
+
+def test_time_rule_series(raw, series):
+    win = 25
+    minp = 10
+    ser = series[::2].resample("B").mean()
+    series_result = ser.rolling(window=win, min_periods=minp).apply(f, raw=raw)
+    last_date = series_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_series = series[::2].truncate(prev_date, last_date)
+    tm.assert_almost_equal(series_result.iloc[-1], np.mean(trunc_series))
+
+
+def test_time_rule_frame(raw, frame):
+    win = 25
+    minp = 10
+    frm = frame[::2].resample("B").mean()
+    frame_result = frm.rolling(window=win, min_periods=minp).apply(f, raw=raw)
+    last_date = frame_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_frame = frame[::2].truncate(prev_date, last_date)
+    tm.assert_series_equal(
+        frame_result.xs(last_date),
+        trunc_frame.apply(np.mean, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize("minp", [0, 99, 100])
+def test_min_periods(raw, series, minp, step):
+    result = series.rolling(len(series) + 1, min_periods=minp, step=step).apply(
+        f, raw=raw
+    )
+    expected = series.rolling(len(series), min_periods=minp, step=step).apply(
+        f, raw=raw
+    )
+    nan_mask = isna(result)
+    tm.assert_series_equal(nan_mask, isna(expected))
+
+    nan_mask = ~nan_mask
+    tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
+
+
+def test_center_reindex_series(raw, series):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+    minp = 10
+
+    series_xp = (
+        series.reindex(list(series.index) + s)
+        .rolling(window=25, min_periods=minp)
+        .apply(f, raw=raw)
+        .shift(-12)
+        .reindex(series.index)
+    )
+    series_rs = series.rolling(window=25, min_periods=minp, center=True).apply(
+        f, raw=raw
+    )
+    tm.assert_series_equal(series_xp, series_rs)
+
+
+def test_center_reindex_frame(raw):
+    # shifter index
+    frame = DataFrame(range(100), index=date_range("2020-01-01", freq="D", periods=100))
+    s = [f"x{x:d}" for x in range(12)]
+    minp = 10
+
+    frame_xp = (
+        frame.reindex(list(frame.index) + s)
+        .rolling(window=25, min_periods=minp)
+        .apply(f, raw=raw)
+        .shift(-12)
+        .reindex(frame.index)
+    )
+    frame_rs = frame.rolling(window=25, min_periods=minp, center=True).apply(f, raw=raw)
+    tm.assert_frame_equal(frame_xp, frame_rs)
diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c62ecc836c5043468acab308a5c26727d7652a5
--- /dev/null
+++ b/pandas/tests/window/test_base_indexer.py
@@ -0,0 +1,519 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    MultiIndex,
+    Series,
+    concat,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.api.indexers import (
+    BaseIndexer,
+    FixedForwardWindowIndexer,
+)
+from pandas.core.indexers.objects import (
+    ExpandingIndexer,
+    FixedWindowIndexer,
+    VariableOffsetWindowIndexer,
+)
+
+from pandas.tseries.offsets import BusinessDay
+
+
+def test_bad_get_window_bounds_signature():
+    class BadIndexer(BaseIndexer):
+        def get_window_bounds(self):
+            return None
+
+    indexer = BadIndexer()
+    with pytest.raises(ValueError, match="BadIndexer does not implement"):
+        Series(range(5)).rolling(indexer)
+
+
+def test_expanding_indexer():
+    s = Series(range(10))
+    indexer = ExpandingIndexer()
+    result = s.rolling(indexer).mean()
+    expected = s.expanding().mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_indexer_constructor_arg():
+    # Example found in computation.rst
+    use_expanding = [True, False, True, False, True]
+    df = DataFrame({"values": range(5)})
+
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+
+    indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+    result = df.rolling(indexer).sum()
+    expected = DataFrame({"values": [0.0, 1.0, 3.0, 3.0, 10.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_indexer_accepts_rolling_args():
+    df = DataFrame({"values": range(5)})
+
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if (
+                    center
+                    and min_periods == 1
+                    and closed == "both"
+                    and step == 1
+                    and i == 2
+                ):
+                    start[i] = 0
+                    end[i] = num_values
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+
+    indexer = CustomIndexer(window_size=1)
+    result = df.rolling(
+        indexer, center=True, min_periods=1, closed="both", step=1
+    ).sum()
+    expected = DataFrame({"values": [0.0, 1.0, 10.0, 3.0, 4.0]})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func,np_func,expected,np_kwargs",
+    [
+        ("count", len, [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, np.nan], {}),
+        ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {}),
+        (
+            "max",
+            np.max,
+            [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan],
+            {},
+        ),
+        (
+            "std",
+            np.std,
+            [
+                1.0,
+                1.0,
+                1.0,
+                55.71654452,
+                54.85739087,
+                53.9845657,
+                1.0,
+                1.0,
+                0.70710678,
+                np.nan,
+            ],
+            {"ddof": 1},
+        ),
+        (
+            "var",
+            np.var,
+            [
+                1.0,
+                1.0,
+                1.0,
+                3104.333333,
+                3009.333333,
+                2914.333333,
+                1.0,
+                1.0,
+                0.500000,
+                np.nan,
+            ],
+            {"ddof": 1},
+        ),
+        (
+            "median",
+            np.median,
+            [1.0, 2.0, 3.0, 4.0, 6.0, 7.0, 7.0, 8.0, 8.5, np.nan],
+            {},
+        ),
+    ],
+)
+def test_rolling_forward_window(
+    frame_or_series, func, np_func, expected, np_kwargs, step
+):
+    # GH 32865
+    values = np.arange(10.0)
+    values[5] = 100.0
+
+    indexer = FixedForwardWindowIndexer(window_size=3)
+
+    match = "Forward-looking windows can't have center=True"
+    rolling = frame_or_series(values).rolling(window=indexer, center=True)
+    with pytest.raises(ValueError, match=match):
+        getattr(rolling, func)()
+
+    match = "Forward-looking windows don't support setting the closed argument"
+    rolling = frame_or_series(values).rolling(window=indexer, closed="right")
+    with pytest.raises(ValueError, match=match):
+        getattr(rolling, func)()
+
+    rolling = frame_or_series(values).rolling(window=indexer, min_periods=2, step=step)
+    result = getattr(rolling, func)()
+
+    # Check that the function output matches the explicitly provided array
+    expected = frame_or_series(expected)[::step]
+    tm.assert_equal(result, expected)
+
+    # Check that the rolling function output matches applying an alternative
+    # function to the rolling window object
+    expected2 = frame_or_series(rolling.apply(lambda x: np_func(x, **np_kwargs)))
+    tm.assert_equal(result, expected2)
+
+    # Check that the function output matches applying an alternative function
+    # if min_periods isn't specified
+    # GH 39604: After count-min_periods deprecation, apply(lambda x: len(x))
+    # is equivalent to count after setting min_periods=0
+    min_periods = 0 if func == "count" else None
+    rolling3 = frame_or_series(values).rolling(window=indexer, min_periods=min_periods)
+    result3 = getattr(rolling3, func)()
+    expected3 = frame_or_series(rolling3.apply(lambda x: np_func(x, **np_kwargs)))
+    tm.assert_equal(result3, expected3)
+
+
+def test_rolling_forward_skewness(frame_or_series, step):
+    values = np.arange(10.0)
+    values[5] = 100.0
+
+    indexer = FixedForwardWindowIndexer(window_size=5)
+    rolling = frame_or_series(values).rolling(window=indexer, min_periods=3, step=step)
+    result = rolling.skew()
+
+    expected = frame_or_series(
+        [
+            0.0,
+            2.232396,
+            2.229508,
+            2.228340,
+            2.229091,
+            2.231989,
+            0.0,
+            0.0,
+            np.nan,
+            np.nan,
+        ]
+    )[::step]
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func,expected",
+    [
+        ("cov", [2.0, 2.0, 2.0, 97.0, 2.0, -93.0, 2.0, 2.0, np.nan, np.nan]),
+        (
+            "corr",
+            [
+                1.0,
+                1.0,
+                1.0,
+                0.8704775290207161,
+                0.018229084250926637,
+                -0.861357304646493,
+                1.0,
+                1.0,
+                np.nan,
+                np.nan,
+            ],
+        ),
+    ],
+)
+def test_rolling_forward_cov_corr(func, expected):
+    values1 = np.arange(10).reshape(-1, 1)
+    values2 = values1 * 2
+    values1[5, 0] = 100
+    values = np.concatenate([values1, values2], axis=1)
+
+    indexer = FixedForwardWindowIndexer(window_size=3)
+    rolling = DataFrame(values).rolling(window=indexer, min_periods=3)
+    # We are interested in checking only pairwise covariance / correlation
+    result = getattr(rolling, func)().loc[(slice(None), 1), 0]
+    result = result.reset_index(drop=True)
+    expected = Series(expected).reset_index(drop=True)
+    expected.name = result.name
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "closed,expected_data",
+    [
+        ["right", [0.0, 1.0, 2.0, 3.0, 7.0, 12.0, 6.0, 7.0, 8.0, 9.0]],
+        ["left", [0.0, 0.0, 1.0, 2.0, 5.0, 9.0, 5.0, 6.0, 7.0, 8.0]],
+    ],
+)
+def test_non_fixed_variable_window_indexer(closed, expected_data):
+    index = date_range("2020", periods=10)
+    df = DataFrame(range(10), index=index)
+    offset = BusinessDay(1)
+    indexer = VariableOffsetWindowIndexer(index=index, offset=offset)
+    result = df.rolling(indexer, closed=closed).sum()
+    expected = DataFrame(expected_data, index=index)
+    tm.assert_frame_equal(result, expected)
+
+
+def test_variableoffsetwindowindexer_not_dti():
+    # GH 54379
+    with pytest.raises(ValueError, match="index must be a DatetimeIndex."):
+        VariableOffsetWindowIndexer(index="foo", offset=BusinessDay(1))
+
+
+def test_variableoffsetwindowindexer_not_offset():
+    # GH 54379
+    idx = date_range("2020", periods=10)
+    with pytest.raises(ValueError, match="offset must be a DateOffset-like object."):
+        VariableOffsetWindowIndexer(index=idx, offset="foo")
+
+
+def test_fixed_forward_indexer_count(step):
+    # GH: 35579
+    df = DataFrame({"b": [None, None, None, 7]})
+    indexer = FixedForwardWindowIndexer(window_size=2)
+    result = df.rolling(window=indexer, min_periods=0, step=step).count()
+    expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]})[::step]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("end_value", "values"), [(1, [0.0, 1, 1, 3, 2]), (-1, [0.0, 1, 0, 3, 1])]
+)
+@pytest.mark.parametrize(("func", "args"), [("median", []), ("quantile", [0.5])])
+def test_indexer_quantile_sum(end_value, values, func, args):
+    # GH 37153
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = max(i + end_value, 1)
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+
+    use_expanding = [True, False, True, False, True]
+    df = DataFrame({"values": range(5)})
+
+    indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
+    result = getattr(df.rolling(indexer), func)(*args)
+    expected = DataFrame({"values": values})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "indexer_class", [FixedWindowIndexer, FixedForwardWindowIndexer, ExpandingIndexer]
+)
+@pytest.mark.parametrize("window_size", [1, 2, 12])
+@pytest.mark.parametrize(
+    "df_data",
+    [
+        {"a": [1, 1], "b": [0, 1]},
+        {"a": [1, 2], "b": [0, 1]},
+        {"a": [1] * 16, "b": [np.nan, 1, 2, np.nan, *list(range(4, 16))]},
+    ],
+)
+def test_indexers_are_reusable_after_groupby_rolling(
+    indexer_class, window_size, df_data
+):
+    # GH 43267
+    df = DataFrame(df_data)
+    num_trials = 3
+    indexer = indexer_class(window_size=window_size)
+    original_window_size = indexer.window_size
+    for i in range(num_trials):
+        df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
+        assert indexer.window_size == original_window_size
+
+
+@pytest.mark.parametrize(
+    "window_size, num_values, expected_start, expected_end",
+    [
+        (1, 1, [0], [1]),
+        (1, 2, [0, 1], [1, 2]),
+        (2, 1, [0], [1]),
+        (2, 2, [0, 1], [2, 2]),
+        (5, 12, range(12), list(range(5, 12)) + [12] * 5),
+        (12, 5, range(5), [5] * 5),
+        (0, 0, np.array([]), np.array([])),
+        (1, 0, np.array([]), np.array([])),
+        (0, 1, [0], [0]),
+    ],
+)
+def test_fixed_forward_indexer_bounds(
+    window_size, num_values, expected_start, expected_end, step
+):
+    # GH 43267
+    indexer = FixedForwardWindowIndexer(window_size=window_size)
+    start, end = indexer.get_window_bounds(num_values=num_values, step=step)
+
+    tm.assert_numpy_array_equal(
+        start, np.array(expected_start[::step]), check_dtype=False
+    )
+    tm.assert_numpy_array_equal(end, np.array(expected_end[::step]), check_dtype=False)
+    assert len(start) == len(end)
+
+
+@pytest.mark.parametrize(
+    "df, window_size, expected",
+    [
+        (
+            DataFrame({"b": [0, 1, 2], "a": [1, 2, 2]}),
+            2,
+            Series(
+                [0, 1.5, 2.0],
+                index=MultiIndex.from_arrays([[1, 2, 2], range(3)], names=["a", None]),
+                name="b",
+                dtype=np.float64,
+            ),
+        ),
+        (
+            DataFrame(
+                {
+                    "b": [np.nan, 1, 2, np.nan, *list(range(4, 18))],
+                    "a": [1] * 7 + [2] * 11,
+                    "c": range(18),
+                }
+            ),
+            12,
+            Series(
+                [
+                    3.6,
+                    3.6,
+                    4.25,
+                    5.0,
+                    5.0,
+                    5.5,
+                    6.0,
+                    12.0,
+                    12.5,
+                    13.0,
+                    13.5,
+                    14.0,
+                    14.5,
+                    15.0,
+                    15.5,
+                    16.0,
+                    16.5,
+                    17.0,
+                ],
+                index=MultiIndex.from_arrays(
+                    [[1] * 7 + [2] * 11, range(18)], names=["a", None]
+                ),
+                name="b",
+                dtype=np.float64,
+            ),
+        ),
+    ],
+)
+def test_rolling_groupby_with_fixed_forward_specific(df, window_size, expected):
+    # GH 43267
+    indexer = FixedForwardWindowIndexer(window_size=window_size)
+    result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "group_keys",
+    [
+        (1,),
+        (1, 2),
+        (2, 1),
+        (1, 1, 2),
+        (1, 2, 1),
+        (1, 1, 2, 2),
+        (1, 2, 3, 2, 3),
+        (1, 1, 2) * 4,
+        (1, 2, 3) * 5,
+    ],
+)
+@pytest.mark.parametrize("window_size", [1, 2, 3, 4, 5, 8, 20])
+def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size):
+    # GH 43267
+    df = DataFrame(
+        {
+            "a": np.array(list(group_keys)),
+            "b": np.arange(len(group_keys), dtype=np.float64) + 17,
+            "c": np.arange(len(group_keys), dtype=np.int64),
+        }
+    )
+
+    indexer = FixedForwardWindowIndexer(window_size=window_size)
+    result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).sum()
+    result.index.names = ["a", "c"]
+
+    groups = df.groupby("a")[["a", "b", "c"]]
+    manual = concat(
+        [
+            g.assign(
+                b=[
+                    g["b"].iloc[i : i + window_size].sum(min_count=1)
+                    for i in range(len(g))
+                ]
+            )
+            for _, g in groups
+        ]
+    )
+    manual = manual.set_index(["a", "c"])["b"]
+
+    tm.assert_series_equal(result, manual)
+
+
+def test_unequal_start_end_bounds():
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            return np.array([1]), np.array([1, 2])
+
+    indexer = CustomIndexer()
+    roll = Series(1).rolling(indexer)
+    match = "start"
+    with pytest.raises(ValueError, match=match):
+        roll.mean()
+
+    with pytest.raises(ValueError, match=match):
+        next(iter(roll))
+
+    with pytest.raises(ValueError, match=match):
+        roll.corr(pairwise=True)
+
+    with pytest.raises(ValueError, match=match):
+        roll.cov(pairwise=True)
+
+
+def test_unequal_bounds_to_object():
+    # GH 44470
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            return np.array([1]), np.array([2])
+
+    indexer = CustomIndexer()
+    roll = Series([1, 1]).rolling(indexer)
+    match = "start and end"
+    with pytest.raises(ValueError, match=match):
+        roll.mean()
+
+    with pytest.raises(ValueError, match=match):
+        next(iter(roll))
+
+    with pytest.raises(ValueError, match=match):
+        roll.corr(pairwise=True)
+
+    with pytest.raises(ValueError, match=match):
+        roll.cov(pairwise=True)
diff --git a/pandas/tests/window/test_cython_aggregations.py b/pandas/tests/window/test_cython_aggregations.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e23618a3a201ce35a4217153510389eef791590
--- /dev/null
+++ b/pandas/tests/window/test_cython_aggregations.py
@@ -0,0 +1,114 @@
+from functools import partial
+import sys
+
+import numpy as np
+import pytest
+
+import pandas._libs.window.aggregations as window_aggregations
+
+from pandas import Series
+import pandas._testing as tm
+
+
+def _get_rolling_aggregations():
+    # list pairs of name and function
+    # each function has this signature:
+    # (const float64_t[:] values, ndarray[int64_t] start,
+    #  ndarray[int64_t] end, int64_t minp) -> np.ndarray
+    named_roll_aggs = (
+        [
+            ("roll_sum", window_aggregations.roll_sum),
+            ("roll_mean", window_aggregations.roll_mean),
+        ]
+        + [
+            (f"roll_var({ddof})", partial(window_aggregations.roll_var, ddof=ddof))
+            for ddof in [0, 1]
+        ]
+        + [
+            ("roll_skew", window_aggregations.roll_skew),
+            ("roll_kurt", window_aggregations.roll_kurt),
+            ("roll_median_c", window_aggregations.roll_median_c),
+            ("roll_max", window_aggregations.roll_max),
+            ("roll_min", window_aggregations.roll_min),
+            ("roll_first", window_aggregations.roll_first),
+            ("roll_last", window_aggregations.roll_last),
+            ("roll_nunique", window_aggregations.roll_nunique),
+        ]
+        + [
+            (
+                f"roll_quantile({quantile},{interpolation})",
+                partial(
+                    window_aggregations.roll_quantile,
+                    quantile=quantile,
+                    interpolation=interpolation,
+                ),
+            )
+            for quantile in [0.0001, 0.5, 0.9999]
+            for interpolation in window_aggregations.interpolation_types
+        ]
+        + [
+            (
+                f"roll_rank({percentile},{method},{ascending})",
+                partial(
+                    window_aggregations.roll_rank,
+                    percentile=percentile,
+                    method=method,
+                    ascending=ascending,
+                ),
+            )
+            for percentile in [True, False]
+            for method in window_aggregations.rolling_rank_tiebreakers.keys()
+            for ascending in [True, False]
+        ]
+    )
+    # unzip to a list of 2 tuples, names and functions
+    unzipped = list(zip(*named_roll_aggs, strict=True))
+    return {"ids": unzipped[0], "params": unzipped[1]}
+
+
+_rolling_aggregations = _get_rolling_aggregations()
+
+
+@pytest.fixture(
+    params=_rolling_aggregations["params"], ids=_rolling_aggregations["ids"]
+)
+def rolling_aggregation(request):
+    """Make a rolling aggregation function as fixture."""
+    return request.param
+
+
+def test_rolling_aggregation_boundary_consistency(rolling_aggregation):
+    # GH-45647
+    minp, step, width, size, selection = 0, 1, 3, 11, [2, 7]
+    values = np.arange(1, 1 + size, dtype=np.float64)
+    end = np.arange(width, size, step, dtype=np.int64)
+    start = end - width
+    selarr = np.array(selection, dtype=np.int32)
+    result = Series(rolling_aggregation(values, start[selarr], end[selarr], minp))
+    expected = Series(rolling_aggregation(values, start, end, minp)[selarr])
+    tm.assert_equal(expected, result)
+
+
+def test_rolling_aggregation_with_unused_elements(rolling_aggregation):
+    # GH-45647
+    minp, width = 0, 5  # width at least 4 for kurt
+    size = 2 * width + 5
+    values = np.arange(1, size + 1, dtype=np.float64)
+    values[width : width + 2] = sys.float_info.min
+    values[width + 2] = np.nan
+    values[width + 3 : width + 5] = sys.float_info.max
+    start = np.array([0, size - width], dtype=np.int64)
+    end = np.array([width, size], dtype=np.int64)
+    loc = np.array(
+        [j for i in range(len(start)) for j in range(start[i], end[i])],
+        dtype=np.int32,
+    )
+    result = Series(rolling_aggregation(values, start, end, minp))
+    compact_values = np.array(values[loc], dtype=np.float64)
+    compact_start = np.arange(0, len(start) * width, width, dtype=np.int64)
+    compact_end = compact_start + width
+    expected = Series(
+        rolling_aggregation(compact_values, compact_start, compact_end, minp)
+    )
+    assert np.isfinite(expected.values).all(), "Not all expected values are finite"
+    tm.assert_equal(expected, result)
diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..4007320b5de332ee4aef40b1ad1be9092eeb3347
--- /dev/null
+++ b/pandas/tests/window/test_dtypes.py
@@ -0,0 +1,173 @@
+import numpy as np
+import pytest
+
+from pandas.errors import DataError
+
+from pandas.core.dtypes.common import pandas_dtype
+
+from pandas import (
+    NA,
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+
+# gh-12373 : rolling functions error on float32 data
+# make sure rolling functions works for different dtypes
+#
+# further note that we are only checking rolling for fully dtype
+# compliance (though both expanding and ewm inherit)
+
+
+def get_dtype(dtype, coerce_int=None):
+    if coerce_int is False and "int" in dtype:
+        return None
+    return pandas_dtype(dtype)
+
+
+@pytest.fixture(
+    params=[
+        "object",
+        "category",
+        "int8",
+        "int16",
+        "int32",
+        "int64",
+        "uint8",
+        "uint16",
+        "uint32",
+        "uint64",
+        "float16",
+        "float32",
+        "float64",
+        "m8[ns]",
+        "M8[ns]",
+        "datetime64[ns, UTC]",
+    ]
+)
+def dtypes(request):
+    """Dtypes for window tests"""
+    return request.param
+
+
+@pytest.mark.parametrize(
+    "method, data, expected_data, coerce_int, min_periods",
+    [
+        ("count", np.arange(5), [1, 2, 2, 2, 2], True, 0),
+        ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True, 0),
+        ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False, 0),
+        ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True, None),
+        ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True, None),
+        ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False, None),
+        ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True, None),
+        ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True, None),
+        ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False, None),
+        ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True, None),
+        ("sum", np.arange(10, 0, -2), [np.nan, 18, 14, 10, 6], True, None),
+        ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False, None),
+        ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True, None),
+        ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True, None),
+        ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False, None),
+        ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True, None),
+        ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True, None),
+        (
+            "std",
+            [0, 1, 2, np.nan, 4],
+            [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2,
+            False,
+            None,
+        ),
+        ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True, None),
+        ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True, None),
+        ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False, None),
+        ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True, None),
+        ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True, None),
+        (
+            "median",
+            [0, 1, 2, np.nan, 4],
+            [np.nan, 0.5, 1.5, np.nan, np.nan],
+            False,
+            None,
+        ),
+    ],
+)
+def test_series_dtypes(
+    method, data, expected_data, coerce_int, dtypes, min_periods, step
+):
+    ser = Series(data, dtype=get_dtype(dtypes, coerce_int=coerce_int))
+    rolled = ser.rolling(2, min_periods=min_periods, step=step)
+
+    if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count":
+        msg = "No numeric types to aggregate"
+        with pytest.raises(DataError, match=msg):
+            getattr(rolled, method)()
+    else:
+        result = getattr(rolled, method)()
+        expected = Series(expected_data, dtype="float64")[::step]
+        tm.assert_almost_equal(result, expected)
+
+
+def test_series_nullable_int(any_signed_int_ea_dtype, step):
+    # GH 43016
+    ser = Series([0, 1, NA], dtype=any_signed_int_ea_dtype)
+    result = ser.rolling(2, step=step).mean()
+    expected = Series([np.nan, 0.5, np.nan])[::step]
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, expected_data, min_periods",
+    [
+        ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, 0),
+        (
+            "max",
+            {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])},
+            None,
+        ),
+        (
+            "min",
+            {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])},
+            None,
+        ),
+        (
+            "sum",
+            {0: Series([np.nan, 2, 6, 10, 14]), 1: Series([np.nan, 4, 8, 12, 16])},
+            None,
+        ),
+        (
+            "mean",
+            {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])},
+            None,
+        ),
+        (
+            "std",
+            {
+                0: Series([np.nan] + [np.sqrt(2)] * 4),
+                1: Series([np.nan] + [np.sqrt(2)] * 4),
+            },
+            None,
+        ),
+        (
+            "var",
+            {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])},
+            None,
+        ),
+        (
+            "median",
+            {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])},
+            None,
+        ),
+    ],
+)
+def test_dataframe_dtypes(method, expected_data, dtypes, min_periods, step):
+    df = DataFrame(np.arange(10).reshape((5, 2)), dtype=get_dtype(dtypes))
+    rolled = df.rolling(2, min_periods=min_periods, step=step)
+
+    if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count":
+        msg = "Cannot aggregate non-numeric type"
+        with pytest.raises(DataError, match=msg):
+            getattr(rolled, method)()
+    else:
+        result = getattr(rolled, method)()
+        expected = DataFrame(expected_data, dtype="float64")[::step]
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ea6c805a2ee4936501fbe9c1973572167efc914
--- /dev/null
+++ b/pandas/tests/window/test_ewm.py
@@ -0,0 +1,737 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+
+
+def test_doc_string():
+    df = DataFrame({"B": [0, 1, 2, np.nan, 4]})
+    df
+    df.ewm(com=0.5).mean()
+
+
+def test_constructor(frame_or_series):
+    c = frame_or_series(range(5)).ewm
+
+    # valid
+    c(com=0.5)
+    c(span=1.5)
+    c(alpha=0.5)
+    c(halflife=0.75)
+    c(com=0.5, span=None)
+    c(alpha=0.5, com=None)
+    c(halflife=0.75, alpha=None)
+
+    # not valid: mutually exclusive
+    msg = "comass, span, halflife, and alpha are mutually exclusive"
+    with pytest.raises(ValueError, match=msg):
+        c(com=0.5, alpha=0.5)
+    with pytest.raises(ValueError, match=msg):
+        c(span=1.5, halflife=0.75)
+    with pytest.raises(ValueError, match=msg):
+        c(alpha=0.5, span=1.5)
+
+    # not valid: com < 0
+    msg = "comass must satisfy: comass >= 0"
+    with pytest.raises(ValueError, match=msg):
+        c(com=-0.5)
+
+    # not valid: span < 1
+    msg = "span must satisfy: span >= 1"
+    with pytest.raises(ValueError, match=msg):
+        c(span=0.5)
+
+    # not valid: halflife <= 0
+    msg = "halflife must satisfy: halflife > 0"
+    with pytest.raises(ValueError, match=msg):
+        c(halflife=0)
+
+    # not valid: alpha <= 0 or alpha > 1
+    msg = "alpha must satisfy: 0 < alpha <= 1"
+    for alpha in (-0.5, 1.5):
+        with pytest.raises(ValueError, match=msg):
+            c(alpha=alpha)
+
+
+def test_ewma_times_not_datetime_type():
+    msg = r"times must be datetime64 dtype."
+    with pytest.raises(ValueError, match=msg):
+        Series(range(5)).ewm(times=np.arange(5))
+
+
+def test_ewma_times_not_same_length():
+    msg = "times must be the same length as the object."
+    with pytest.raises(ValueError, match=msg):
+        Series(range(5)).ewm(times=np.arange(4).astype("datetime64[ns]"))
+
+
+def test_ewma_halflife_not_correct_type():
+    msg = "halflife must be a timedelta convertible object"
+    with pytest.raises(ValueError, match=msg):
+        Series(range(5)).ewm(halflife=1, times=np.arange(5).astype("datetime64[ns]"))
+
+
+def test_ewma_halflife_without_times(halflife_with_times):
+    msg = "halflife can only be a timedelta convertible argument if times is not None."
+    with pytest.raises(ValueError, match=msg):
+        Series(range(5)).ewm(halflife=halflife_with_times)
+
+
+@pytest.mark.parametrize(
+    "times",
+    [
+        np.arange(10).astype("datetime64[D]").astype("datetime64[ns]"),
+        date_range("2000", freq="D", periods=10),
+        date_range("2000", freq="D", periods=10).tz_localize("UTC"),
+    ],
+)
+@pytest.mark.parametrize("min_periods", [0, 2])
+def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods):
+    halflife = halflife_with_times
+    data = np.arange(10.0)
+    data[::2] = np.nan
+    df = DataFrame({"A": data})
+    result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean()
+    expected = df.ewm(halflife=1.0, min_periods=min_periods).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_ewma_with_times_variable_spacing(tz_aware_fixture, unit, adjust):
+    # GH 54328
+    tz = tz_aware_fixture
+    halflife = "23 days"
+    times = (
+        DatetimeIndex(["2020-01-01", "2020-01-10T00:04:05", "2020-02-23T05:00:23"])
+        .tz_localize(tz)
+        .as_unit(unit)
+    )
+    data = np.arange(3)
+    df = DataFrame(data)
+    result = df.ewm(halflife=halflife, times=times, adjust=adjust).mean()
+    if adjust:
+        expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459])
+    else:
+        expected = DataFrame([0.0, 0.23762518642226227, 1.534926369128742])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_ewm_with_nat_raises(halflife_with_times):
+    # GH#38535
+    ser = Series(range(1))
+    times = DatetimeIndex(["NaT"])
+    with pytest.raises(ValueError, match="Cannot convert NaT values to integer"):
+        ser.ewm(com=0.1, halflife=halflife_with_times, times=times)
+
+
+def test_ewm_with_times_getitem(halflife_with_times):
+    # GH 40164
+    halflife = halflife_with_times
+    data = np.arange(10.0)
+    data[::2] = np.nan
+    times = date_range("2000", freq="D", periods=10)
+    df = DataFrame({"A": data, "B": data})
+    result = df.ewm(halflife=halflife, times=times)["A"].mean()
+    expected = df.ewm(halflife=1.0)["A"].mean()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("arg", ["com", "halflife", "span", "alpha"])
+def test_ewm_getitem_attributes_retained(arg, adjust, ignore_na):
+    # GH 40164
+    kwargs = {arg: 1, "adjust": adjust, "ignore_na": ignore_na}
+    ewm = DataFrame({"A": range(1), "B": range(1)}).ewm(**kwargs)
+    expected = {attr: getattr(ewm, attr) for attr in ewm._attributes}
+    ewm_slice = ewm["A"]
+    result = {attr: getattr(ewm, attr) for attr in ewm_slice._attributes}
+    assert result == expected
+
+
+def test_ewma_times_adjust_false_with_disallowed_com():
+    # GH 54328
+    with pytest.raises(
+        NotImplementedError,
+        match=(
+            "None of com, span, or alpha can be specified "
+            "if times is provided and adjust=False"
+        ),
+    ):
+        Series(range(1)).ewm(
+            0.1,
+            adjust=False,
+            times=date_range("2000", freq="D", periods=1),
+            halflife="1D",
+        )
+
+
+def test_ewma_times_adjust_false_with_disallowed_alpha():
+    # GH 54328
+    with pytest.raises(
+        NotImplementedError,
+        match=(
+            "None of com, span, or alpha can be specified "
+            "if times is provided and adjust=False"
+        ),
+    ):
+        Series(range(1)).ewm(
+            0.1,
+            adjust=False,
+            times=date_range("2000", freq="D", periods=1),
+            alpha=0.5,
+            halflife="1D",
+        )
+
+
+def test_ewma_times_adjust_false_with_disallowed_span():
+    # GH 54328
+    with pytest.raises(
+        NotImplementedError,
+        match=(
+            "None of com, span, or alpha can be specified "
+            "if times is provided and adjust=False"
+        ),
+    ):
+        Series(range(1)).ewm(
+            0.1,
+            adjust=False,
+            times=date_range("2000", freq="D", periods=1),
+            span=10,
+            halflife="1D",
+        )
+
+
+def test_times_string_col_raises():
+    # GH 43265
+    df = DataFrame(
+        {"A": np.arange(10.0), "time_col": date_range("2000", freq="D", periods=10)}
+    )
+    with pytest.raises(ValueError, match="times must be datetime64"):
+        df.ewm(halflife="1 day", min_periods=0, times="time_col")
+
+
+def test_ewm_sum_adjust_false_notimplemented():
+    data = Series(range(1)).ewm(com=1, adjust=False)
+    with pytest.raises(NotImplementedError, match="sum is not"):
+        data.sum()
+
+
+@pytest.mark.parametrize("method", ["sum", "std", "var", "cov", "corr"])
+def test_times_only_mean_implemented(frame_or_series, method):
+    # GH 51695
+    halflife = "1 day"
+    times = date_range("2000", freq="D", periods=10)
+    ewm = frame_or_series(range(10)).ewm(halflife=halflife, times=times)
+    with pytest.raises(
+        NotImplementedError, match=f"{method} is not implemented with times"
+    ):
+        getattr(ewm, method)()
+
+
+@pytest.mark.parametrize(
+    "expected_data, ignore",
+    [[[10.0, 5.0, 2.5, 11.25], False], [[10.0, 5.0, 5.0, 12.5], True]],
+)
+def test_ewm_sum(expected_data, ignore):
+    # xref from Numbagg tests
+    # https://github.com/numbagg/numbagg/blob/v0.2.1/numbagg/test/test_moving.py#L50
+    data = Series([10, 0, np.nan, 10])
+    result = data.ewm(alpha=0.5, ignore_na=ignore).sum()
+    expected = Series(expected_data)
+    tm.assert_series_equal(result, expected)
+
+
+def test_ewma_adjust():
+    vals = Series(np.zeros(1000))
+    vals[5] = 1
+    result = vals.ewm(span=100, adjust=False).mean().sum()
+    assert np.abs(result - 1) < 1e-2
+
+
+def test_ewma_cases(adjust, ignore_na):
+    # try adjust/ignore_na args matrix
+
+    s = Series([1.0, 2.0, 4.0, 8.0])
+
+    if adjust:
+        expected = Series([1.0, 1.6, 2.736842, 4.923077])
+    else:
+        expected = Series([1.0, 1.333333, 2.222222, 4.148148])
+
+    result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean()
+    tm.assert_series_equal(result, expected)
+
+
+def test_ewma_nan_handling():
+    s = Series([1.0] + [np.nan] * 5 + [1.0])
+    result = s.ewm(com=5).mean()
+    tm.assert_series_equal(result, Series([1.0] * len(s)))
+
+    s = Series([np.nan] * 2 + [1.0] + [np.nan] * 2 + [1.0])
+    result = s.ewm(com=5).mean()
+    tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4))
+
+
+@pytest.mark.parametrize(
+    "s, adjust, ignore_na, w",
+    [
+        (
+            [np.nan, 1.0, 101.0],
+            True,
+            False,
+            [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), 1.0],
+        ),
+        (
+            [np.nan, 1.0, 101.0],
+            True,
+            True,
+            [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), 1.0],
+        ),
+        (
+            [np.nan, 1.0, 101.0],
+            False,
+            False,
+            [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), (1.0 / (1.0 + 2.0))],
+        ),
+        (
+            [np.nan, 1.0, 101.0],
+            False,
+            True,
+            [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), (1.0 / (1.0 + 2.0))],
+        ),
+        (
+            [1.0, np.nan, 101.0],
+            True,
+            False,
+            [(1.0 - (1.0 / (1.0 + 2.0))) ** 2, np.nan, 1.0],
+        ),
+        (
+            [1.0, np.nan, 101.0],
+            True,
+            True,
+            [(1.0 - (1.0 / (1.0 + 2.0))), np.nan, 1.0],
+        ),
+        (
+            [1.0, np.nan, 101.0],
+            False,
+            False,
+            [(1.0 - (1.0 / (1.0 + 2.0))) ** 2, np.nan, (1.0 / (1.0 + 2.0))],
+        ),
+        (
+            [1.0, np.nan, 101.0],
+            False,
+            True,
+            [(1.0 - (1.0 / (1.0 + 2.0))), np.nan, (1.0 / (1.0 + 2.0))],
+        ),
+        (
+            [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan],
+            True,
+            False,
+            [np.nan, (1.0 - (1.0 / (1.0 + 2.0))) ** 3, np.nan, np.nan, 1.0, np.nan],
+        ),
+        (
+            [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan],
+            True,
+            True,
+            [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), np.nan, np.nan, 1.0, np.nan],
+        ),
+        (
+            [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan],
+            False,
+            False,
+            [
+                np.nan,
+                (1.0 - (1.0 / (1.0 + 2.0))) ** 3,
+                np.nan,
+                np.nan,
+                (1.0 / (1.0 + 2.0)),
+                np.nan,
+            ],
+        ),
+        (
+            [np.nan, 1.0, np.nan, np.nan, 101.0, np.nan],
+            False,
+            True,
+            [
+                np.nan,
+                (1.0 - (1.0 / (1.0 + 2.0))),
+                np.nan,
+                np.nan,
+                (1.0 / (1.0 + 2.0)),
+                np.nan,
+            ],
+        ),
+        (
+            [1.0, np.nan, 101.0, 50.0],
+            True,
+            False,
+            [
+                (1.0 - (1.0 / (1.0 + 2.0))) ** 3,
+                np.nan,
+                (1.0 - (1.0 / (1.0 + 2.0))),
+                1.0,
+            ],
+        ),
+        (
+            [1.0, np.nan, 101.0, 50.0],
+            True,
+            True,
+            [
+                (1.0 - (1.0 / (1.0 + 2.0))) ** 2,
+                np.nan,
+                (1.0 - (1.0 / (1.0 + 2.0))),
+                1.0,
+            ],
+        ),
+        (
+            [1.0, np.nan, 101.0, 50.0],
+            False,
+            False,
+            [
+                (1.0 - (1.0 / (1.0 + 2.0))) ** 3,
+                np.nan,
+                (1.0 - (1.0 / (1.0 + 2.0))) * (1.0 / (1.0 + 2.0)),
+                (1.0 / (1.0 + 2.0))
+                * ((1.0 - (1.0 / (1.0 + 2.0))) ** 2 + (1.0 / (1.0 + 2.0))),
+            ],
+        ),
+        (
+            [1.0, np.nan, 101.0, 50.0],
+            False,
+            True,
+            [
+                (1.0 - (1.0 / (1.0 + 2.0))) ** 2,
+                np.nan,
+                (1.0 - (1.0 / (1.0 + 2.0))) * (1.0 / (1.0 + 2.0)),
+                (1.0 / (1.0 + 2.0)),
+            ],
+        ),
+    ],
+)
+def test_ewma_nan_handling_cases(s, adjust, ignore_na, w):
+    # GH 7603
+    s = Series(s)
+    expected = (s.multiply(w).cumsum() / Series(w).cumsum()).ffill()
+    result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean()
+
+    tm.assert_series_equal(result, expected)
+    if ignore_na is False:
+        # check that ignore_na defaults to False
+        result = s.ewm(com=2.0, adjust=adjust).mean()
+        tm.assert_series_equal(result, expected)
+
+
+def test_ewm_alpha():
+    # GH 10789
+    arr = np.random.default_rng(2).standard_normal(100)
+    locs = np.arange(20, 40)
+    arr[locs] = np.nan
+
+    s = Series(arr)
+    a = s.ewm(alpha=0.61722699889169674).mean()
+    b = s.ewm(com=0.62014947789973052).mean()
+    c = s.ewm(span=2.240298955799461).mean()
+    d = s.ewm(halflife=0.721792864318).mean()
+    tm.assert_series_equal(a, b)
+    tm.assert_series_equal(a, c)
+    tm.assert_series_equal(a, d)
+
+
+def test_ewm_domain_checks():
+    # GH 12492
+    arr = np.random.default_rng(2).standard_normal(100)
+    locs = np.arange(20, 40)
+    arr[locs] = np.nan
+
+    s = Series(arr)
+    msg = "comass must satisfy: comass >= 0"
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(com=-0.1)
+    s.ewm(com=0.0)
+    s.ewm(com=0.1)
+
+    msg = "span must satisfy: span >= 1"
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(span=-0.1)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(span=0.0)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(span=0.9)
+    s.ewm(span=1.0)
+    s.ewm(span=1.1)
+
+    msg = "halflife must satisfy: halflife > 0"
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(halflife=-0.1)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(halflife=0.0)
+    s.ewm(halflife=0.1)
+
+    msg = "alpha must satisfy: 0 < alpha <= 1"
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(alpha=-0.1)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(alpha=0.0)
+    s.ewm(alpha=0.1)
+    s.ewm(alpha=1.0)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(alpha=1.1)
+
+
+@pytest.mark.parametrize("method", ["mean", "std", "var"])
+def test_ew_empty_series(method):
+    vals = Series([], dtype=np.float64)
+
+    ewm = vals.ewm(3)
+    result = getattr(ewm, method)()
+    tm.assert_almost_equal(result, vals)
+
+
+@pytest.mark.parametrize("min_periods", [0, 1])
+@pytest.mark.parametrize("name", ["mean", "var", "std"])
+def test_ew_min_periods(min_periods, name):
+    # excluding NaNs correctly
+    arr = np.random.default_rng(2).standard_normal(50)
+    arr[:10] = np.nan
+    arr[-10:] = np.nan
+    s = Series(arr)
+
+    # check min_periods
+    # GH 7898
+    result = getattr(s.ewm(com=50, min_periods=2), name)()
+    assert result[:11].isna().all()
+    assert not result[11:].isna().any()
+
+    result = getattr(s.ewm(com=50, min_periods=min_periods), name)()
+    if name == "mean":
+        assert result[:10].isna().all()
+        assert not result[10:].isna().any()
+    else:
+        # ewm.std, ewm.var (with bias=False) require at least
+        # two values
+        assert result[:11].isna().all()
+        assert not result[11:].isna().any()
+
+    # check series of length 0
+    result = getattr(Series(dtype=object).ewm(com=50, min_periods=min_periods), name)()
+    tm.assert_series_equal(result, Series(dtype="float64"))
+
+    # check series of length 1
+    result = getattr(Series([1.0]).ewm(50, min_periods=min_periods), name)()
+    if name == "mean":
+        tm.assert_series_equal(result, Series([1.0]))
+    else:
+        # ewm.std, ewm.var with bias=False require at least
+        # two values
+        tm.assert_series_equal(result, Series([np.nan]))
+
+    # pass in ints
+    result2 = getattr(Series(np.arange(50)).ewm(span=10), name)()
+    assert result2.dtype == np.float64
+
+
+@pytest.mark.parametrize("name", ["cov", "corr"])
+def test_ewm_corr_cov(name):
+    A = Series(np.random.default_rng(2).standard_normal(50), index=range(50))
+    B = A[2:] + np.random.default_rng(2).standard_normal(48)
+
+    A[:10] = np.nan
+    B.iloc[-10:] = np.nan
+
+    result = getattr(A.ewm(com=20, min_periods=5), name)(B)
+    assert np.isnan(result.values[:14]).all()
+    assert not np.isnan(result.values[14:]).any()
+
+
+@pytest.mark.parametrize("min_periods", [0, 1, 2])
+@pytest.mark.parametrize("name", ["cov", "corr"])
+def test_ewm_corr_cov_min_periods(name, min_periods):
+    # GH 7898
+    A = Series(np.random.default_rng(2).standard_normal(50), index=range(50))
+    B = A[2:] + np.random.default_rng(2).standard_normal(48)
+
+    A[:10] = np.nan
+    B.iloc[-10:] = np.nan
+
+    result = getattr(A.ewm(com=20, min_periods=min_periods), name)(B)
+    # binary functions (ewmcov, ewmcorr) with bias=False require at
+    # least two values
+    assert np.isnan(result.values[:11]).all()
+    assert not np.isnan(result.values[11:]).any()
+
+    # check series of length 0
+    empty = Series([], dtype=np.float64)
+    result = getattr(empty.ewm(com=50, min_periods=min_periods), name)(empty)
+    tm.assert_series_equal(result, empty)
+
+    # check series of length 1
+    result = getattr(Series([1.0]).ewm(com=50, min_periods=min_periods), name)(
+        Series([1.0])
+    )
+    tm.assert_series_equal(result, Series([np.nan]))
+
+
+@pytest.mark.parametrize("name", ["cov", "corr"])
+def test_different_input_array_raise_exception(name):
+    A = Series(np.random.default_rng(2).standard_normal(50), index=range(50))
+    A[:10] = np.nan
+
+    msg = "other must be a DataFrame or Series"
+    # exception raised is Exception
+    with pytest.raises(ValueError, match=msg):
+        getattr(A.ewm(com=20, min_periods=5), name)(
+            np.random.default_rng(2).standard_normal(50)
+        )
+
+
+@pytest.mark.parametrize("name", ["var", "std", "mean"])
+def test_ewma_series(series, name):
+    series_result = getattr(series.ewm(com=10), name)()
+    assert isinstance(series_result, Series)
+
+
+@pytest.mark.parametrize("name", ["var", "std", "mean"])
+def test_ewma_frame(frame, name):
+    frame_result = getattr(frame.ewm(com=10), name)()
+    assert isinstance(frame_result, DataFrame)
+
+
+def test_ewma_span_com_args(series):
+    A = series.ewm(com=9.5).mean()
+    B = series.ewm(span=20).mean()
+    tm.assert_almost_equal(A, B)
+    msg = "comass, span, halflife, and alpha are mutually exclusive"
+    with pytest.raises(ValueError, match=msg):
+        series.ewm(com=9.5, span=20)
+
+    msg = "Must pass one of comass, span, halflife, or alpha"
+    with pytest.raises(ValueError, match=msg):
+        series.ewm().mean()
+
+
+def test_ewma_halflife_arg(series):
+    A = series.ewm(com=13.932726172912965).mean()
+    B = series.ewm(halflife=10.0).mean()
+    tm.assert_almost_equal(A, B)
+    msg = "comass, span, halflife, and alpha are mutually exclusive"
+    with pytest.raises(ValueError, match=msg):
+        series.ewm(span=20, halflife=50)
+    with pytest.raises(ValueError, match=msg):
+        series.ewm(com=9.5, halflife=50)
+    with pytest.raises(ValueError, match=msg):
+        series.ewm(com=9.5, span=20, halflife=50)
+    msg = "Must pass one of comass, span, halflife, or alpha"
+    with pytest.raises(ValueError, match=msg):
+        series.ewm()
+
+
+def test_ewm_alpha_arg(series):
+    # GH 10789
+    s = series
+    msg = "Must pass one of comass, span, halflife, or alpha"
+    with pytest.raises(ValueError, match=msg):
+        s.ewm()
+
+    msg = "comass, span, halflife, and alpha are mutually exclusive"
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(com=10.0, alpha=0.5)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(span=10.0, alpha=0.5)
+    with pytest.raises(ValueError, match=msg):
+        s.ewm(halflife=10.0, alpha=0.5)
+
+
+@pytest.mark.parametrize("func", ["cov", "corr"])
+def test_ewm_pairwise_cov_corr(func, frame):
+    result = getattr(frame.ewm(span=10, min_periods=5), func)()
+    result = result.loc[(slice(None), 1), 5]
+    result.index = result.index.droplevel(1)
+    expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5])
+    tm.assert_series_equal(result, expected, check_names=False)
+
+
+def test_numeric_only_frame(arithmetic_win_operators, numeric_only):
+    # GH#46560
+    kernel = arithmetic_win_operators
+    df = DataFrame({"a": [1], "b": 2, "c": 3})
+    df["c"] = df["c"].astype(object)
+    ewm = df.ewm(span=2, min_periods=1)
+    op = getattr(ewm, kernel, None)
+    if op is not None:
+        result = op(numeric_only=numeric_only)
+
+        columns = ["a", "b"] if numeric_only else ["a", "b", "c"]
+        expected = df[columns].agg([kernel]).reset_index(drop=True).astype(float)
+        assert list(expected.columns) == columns
+
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("kernel", ["corr", "cov"])
+@pytest.mark.parametrize("use_arg", [True, False])
+def test_numeric_only_corr_cov_frame(kernel, numeric_only, use_arg):
+    # GH#46560
+    df = DataFrame({"a": [1, 2, 3], "b": 2, "c": 3})
+    df["c"] = df["c"].astype(object)
+    arg = (df,) if use_arg else ()
+    ewm = df.ewm(span=2, min_periods=1)
+    op = getattr(ewm, kernel)
+    result = op(*arg, numeric_only=numeric_only)
+
+    # Compare result to op using float dtypes, dropping c when numeric_only is True
+    columns = ["a", "b"] if numeric_only else ["a", "b", "c"]
+    df2 = df[columns].astype(float)
+    arg2 = (df2,) if use_arg else ()
+    ewm2 = df2.ewm(span=2, min_periods=1)
+    op2 = getattr(ewm2, kernel)
+    expected = op2(*arg2, numeric_only=numeric_only)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [int, object])
+def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype):
+    # GH#46560
+    kernel = arithmetic_win_operators
+    ser = Series([1], dtype=dtype)
+    ewm = ser.ewm(span=2, min_periods=1)
+    op = getattr(ewm, kernel, None)
+    if op is None:
+        # Nothing to test
+        pytest.skip("No op to test")
+    if numeric_only and dtype is object:
+        msg = f"ExponentialMovingWindow.{kernel} does not implement numeric_only"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(numeric_only=numeric_only)
+    else:
+        result = op(numeric_only=numeric_only)
+        expected = ser.agg([kernel]).reset_index(drop=True).astype(float)
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("kernel", ["corr", "cov"])
+@pytest.mark.parametrize("use_arg", [True, False])
+@pytest.mark.parametrize("dtype", [int, object])
+def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype):
+    # GH#46560
+    ser = Series([1, 2, 3], dtype=dtype)
+    arg = (ser,) if use_arg else ()
+    ewm = ser.ewm(span=2, min_periods=1)
+    op = getattr(ewm, kernel)
+    if numeric_only and dtype is object:
+        msg = f"ExponentialMovingWindow.{kernel} does not implement numeric_only"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(*arg, numeric_only=numeric_only)
+    else:
+        result = op(*arg, numeric_only=numeric_only)
+
+        ser2 = ser.astype(float)
+        arg2 = (ser2,) if use_arg else ()
+        ewm2 = ser2.ewm(span=2, min_periods=1)
+        op2 = getattr(ewm2, kernel)
+        expected = op2(*arg2, numeric_only=numeric_only)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0bd68214bcba7f8365bfbfd9d77d9a2e6235400
--- /dev/null
+++ b/pandas/tests/window/test_expanding.py
@@ -0,0 +1,830 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    Series,
+    isna,
+    notna,
+)
+import pandas._testing as tm
+
+
+def test_doc_string():
+    df = DataFrame({"B": [0, 1, 2, np.nan, 4]})
+    df
+    df.expanding(2).sum()
+
+
+def test_constructor(frame_or_series):
+    # GH 12669
+
+    c = frame_or_series(range(5)).expanding
+
+    # valid
+    c(min_periods=1)
+
+
+@pytest.mark.parametrize("w", [2.0, "foo", np.array([2])])
+def test_constructor_invalid(frame_or_series, w):
+    # not valid
+
+    c = frame_or_series(range(5)).expanding
+    msg = "min_periods must be an integer"
+    with pytest.raises(ValueError, match=msg):
+        c(min_periods=w)
+
+
+@pytest.mark.parametrize(
+    "expander",
+    [
+        1,
+        pytest.param(
+            "ls",
+            marks=pytest.mark.xfail(
+                reason="GH#16425 expanding with offset not supported"
+            ),
+        ),
+    ],
+)
+def test_empty_df_expanding(expander):
+    # GH 15819 Verifies that datetime and integer expanding windows can be
+    # applied to empty DataFrames
+
+    expected = DataFrame()
+    result = DataFrame().expanding(expander).sum()
+    tm.assert_frame_equal(result, expected)
+
+    # Verifies that datetime and integer expanding windows can be applied
+    # to empty DataFrames with datetime index
+    expected = DataFrame(index=DatetimeIndex([]))
+    result = DataFrame(index=DatetimeIndex([])).expanding(expander).sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_missing_minp_zero():
+    # https://github.com/pandas-dev/pandas/pull/18921
+    # minp=0
+    x = Series([np.nan])
+    result = x.expanding(min_periods=0).sum()
+    expected = Series([0.0])
+    tm.assert_series_equal(result, expected)
+
+    # minp=1
+    result = x.expanding(min_periods=1).sum()
+    expected = Series([np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+def test_expanding():
+    # see gh-23372.
+    df = DataFrame(np.ones((10, 20)))
+
+    expected = DataFrame(
+        {i: [np.nan] * 2 + [float(j) for j in range(3, 11)] for i in range(20)}
+    )
+    result = df.expanding(3).sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_expanding_count_with_min_periods(frame_or_series):
+    # GH 26996
+    result = frame_or_series(range(5)).expanding(min_periods=3).count()
+    expected = frame_or_series([np.nan, np.nan, 3.0, 4.0, 5.0])
+    tm.assert_equal(result, expected)
+
+
+def test_expanding_count_default_min_periods_with_null_values(frame_or_series):
+    # GH 26996
+    values = [1, 2, 3, np.nan, 4, 5, 6]
+    expected_counts = [1.0, 2.0, 3.0, 3.0, 4.0, 5.0, 6.0]
+
+    result = frame_or_series(values).expanding().count()
+    expected = frame_or_series(expected_counts)
+    tm.assert_equal(result, expected)
+
+
+def test_expanding_count_with_min_periods_exceeding_series_length(frame_or_series):
+    # GH 25857
+    result = frame_or_series(range(5)).expanding(min_periods=6).count()
+    expected = frame_or_series([np.nan, np.nan, np.nan, np.nan, np.nan])
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "df,expected,min_periods",
+    [
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
+            ],
+            3,
+        ),
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
+            ],
+            2,
+        ),
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
+            ],
+            1,
+        ),
+        ({"A": [1], "B": [4]}, [], 2),
+        (None, [({}, [])], 1),
+        (
+            {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]},
+            [
+                ({"A": [1.0], "B": [np.nan]}, [0]),
+                ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]),
+                ({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2]),
+            ],
+            3,
+        ),
+        (
+            {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]},
+            [
+                ({"A": [1.0], "B": [np.nan]}, [0]),
+                ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]),
+                ({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2]),
+            ],
+            2,
+        ),
+        (
+            {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]},
+            [
+                ({"A": [1.0], "B": [np.nan]}, [0]),
+                ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]),
+                ({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2]),
+            ],
+            1,
+        ),
+    ],
+)
+def test_iter_expanding_dataframe(df, expected, min_periods):
+    # GH 11704
+    df = DataFrame(df)
+    expecteds = [DataFrame(values, index=index) for (values, index) in expected]
+
+    for expected, actual in zip(expecteds, df.expanding(min_periods), strict=False):
+        tm.assert_frame_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "ser,expected,min_periods",
+    [
+        (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])], 3),
+        (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])], 2),
+        (Series([1, 2, 3]), [([1], [0]), ([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])], 1),
+        (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2),
+        (Series([np.nan, 2]), [([np.nan], [0]), ([np.nan, 2], [0, 1])], 2),
+        (Series([], dtype="int64"), [], 2),
+    ],
+)
+def test_iter_expanding_series(ser, expected, min_periods):
+    # GH 11704
+    expecteds = [Series(values, index=index) for (values, index) in expected]
+
+    for expected, actual in zip(expecteds, ser.expanding(min_periods), strict=True):
+        tm.assert_series_equal(actual, expected)
+
+
+def test_center_invalid():
+    # GH 20647
+    df = DataFrame()
+    with pytest.raises(TypeError, match=".* got an unexpected keyword"):
+        df.expanding(center=True)
+
+
+def test_expanding_sem(frame_or_series):
+    # GH: 26476
+    obj = frame_or_series([0, 1, 2])
+    result = obj.expanding().sem()
+    if isinstance(result, DataFrame):
+        result = Series(result[0].values)
+    expected = Series([np.nan, 0.5, (1 / 3) ** 0.5])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["skew", "kurt"])
+def test_expanding_skew_kurt_numerical_stability(method):
+    # GH: 6929
+    s = Series(np.random.default_rng(2).random(10))
+    expected = getattr(s.expanding(3), method)()
+    s = s + 5000
+    result = getattr(s.expanding(3), method)()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
+@pytest.mark.parametrize("method", ["min", "max", "average"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
+def test_rank(window, method, pct, ascending, test_data):
+    length = 20
+    if test_data == "default":
+        ser = Series(data=np.random.default_rng(2).random(length))
+    elif test_data == "duplicates":
+        ser = Series(data=np.random.default_rng(2).choice(3, length))
+    elif test_data == "nans":
+        ser = Series(
+            data=np.random.default_rng(2).choice(
+                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
+            )
+        )
+
+    expected = ser.expanding(window).apply(
+        lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]
+    )
+    result = ser.expanding(window).rank(method=method, pct=pct, ascending=ascending)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
+def test_nunique(window, test_data):
+    length = 20
+    if test_data == "default":
+        ser = Series(data=np.random.default_rng(2).random(length))
+    elif test_data == "duplicates":
+        ser = Series(data=np.random.default_rng(2).choice(3, length))
+    elif test_data == "nans":
+        ser = Series(
+            data=np.random.default_rng(2).choice(
+                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
+            )
+        )
+    elif test_data == "precision":
+        ser = Series(
+            data=[
+                0.3,
+                0.1 * 3,  # Not necessarily exactly 0.3
+                0.6,
+                0.2 * 3,  # Not necessarily exactly 0.6
+                0.9,
+                0.3 * 3,  # Not necessarily exactly 0.9
+                0.5,
+                0.1 * 5,  # Not necessarily exactly 0.5
+                0.8,
+                0.2 * 4,  # Not necessarily exactly 0.8
+            ],
+            dtype=np.float64,
+        )
+
+    expected = ser.expanding(window).apply(lambda x: x.nunique())
+    result = ser.expanding(window).nunique()
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_expanding_corr(series):
+    A = series.dropna()
+    B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5]
+
+    result = A.expanding().corr(B)
+
+    rolling_result = A.rolling(window=len(A), min_periods=1).corr(B)
+
+    tm.assert_almost_equal(rolling_result, result)
+
+
+def test_expanding_count(series):
+    result = series.expanding(min_periods=0).count()
+    tm.assert_almost_equal(
+        result, series.rolling(window=len(series), min_periods=0).count()
+    )
+
+
+def test_expanding_quantile(series):
+    result = series.expanding().quantile(0.5)
+
+    rolling_result = series.rolling(window=len(series), min_periods=1).quantile(0.5)
+
+    tm.assert_almost_equal(result, rolling_result)
+
+
+def test_expanding_cov(series):
+    A = series
+    B = (A + np.random.default_rng(2).standard_normal(len(A)))[:-5]
+
+    result = A.expanding().cov(B)
+
+    rolling_result = A.rolling(window=len(A), min_periods=1).cov(B)
+
+    tm.assert_almost_equal(rolling_result, result)
+
+
+def test_expanding_cov_pairwise(frame):
+    result = frame.expanding().cov()
+
+    rolling_result = frame.rolling(window=len(frame), min_periods=1).cov()
+
+    tm.assert_frame_equal(result, rolling_result)
+
+
+def test_expanding_corr_pairwise(frame):
+    result = frame.expanding().corr()
+
+    rolling_result = frame.rolling(window=len(frame), min_periods=1).corr()
+    tm.assert_frame_equal(result, rolling_result)
+
+
+@pytest.mark.parametrize(
+    "func,static_comp",
+    [
+        ("sum", lambda x: np.sum(x, axis=0)),
+        ("mean", lambda x: np.mean(x, axis=0)),
+        ("max", lambda x: np.max(x, axis=0)),
+        ("min", lambda x: np.min(x, axis=0)),
+    ],
+    ids=["sum", "mean", "max", "min"],
+)
+def test_expanding_func(func, static_comp, frame_or_series):
+    data = frame_or_series(np.array(list(range(10)) + [np.nan] * 10))
+
+    obj = data.expanding(min_periods=1)
+    result = getattr(obj, func)()
+    assert isinstance(result, frame_or_series)
+
+    expected = static_comp(data[:11])
+    if frame_or_series is Series:
+        tm.assert_almost_equal(result[10], expected)
+    else:
+        tm.assert_series_equal(result.iloc[10], expected, check_names=False)
+
+
+@pytest.mark.parametrize(
+    "func,static_comp",
+    [("sum", np.sum), ("mean", np.mean), ("max", np.max), ("min", np.min)],
+    ids=["sum", "mean", "max", "min"],
+)
+def test_expanding_min_periods(func, static_comp):
+    ser = Series(np.random.default_rng(2).standard_normal(50))
+
+    result = getattr(ser.expanding(min_periods=30), func)()
+    assert result[:29].isna().all()
+    tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50]))
+
+    # min_periods is working correctly
+    result = getattr(ser.expanding(min_periods=15), func)()
+    assert isna(result.iloc[13])
+    assert notna(result.iloc[14])
+
+    ser2 = Series(np.random.default_rng(2).standard_normal(20))
+    result = getattr(ser2.expanding(min_periods=5), func)()
+    assert isna(result[3])
+    assert notna(result[4])
+
+    # min_periods=0
+    result0 = getattr(ser.expanding(min_periods=0), func)()
+    result1 = getattr(ser.expanding(min_periods=1), func)()
+    tm.assert_almost_equal(result0, result1)
+
+    result = getattr(ser.expanding(min_periods=1), func)()
+    tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50]))
+
+
+def test_expanding_apply(engine_and_raw, frame_or_series):
+    engine, raw = engine_and_raw
+    data = frame_or_series(np.array(list(range(10)) + [np.nan] * 10))
+    result = data.expanding(min_periods=1).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    assert isinstance(result, frame_or_series)
+
+    if frame_or_series is Series:
+        tm.assert_almost_equal(result[9], np.mean(data[:11], axis=0))
+    else:
+        tm.assert_series_equal(
+            result.iloc[9], np.mean(data[:11], axis=0), check_names=False
+        )
+
+
+def test_expanding_min_periods_apply(engine_and_raw):
+    engine, raw = engine_and_raw
+    ser = Series(np.random.default_rng(2).standard_normal(50))
+
+    result = ser.expanding(min_periods=30).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    assert result[:29].isna().all()
+    tm.assert_almost_equal(result.iloc[-1], np.mean(ser[:50]))
+
+    # min_periods is working correctly
+    result = ser.expanding(min_periods=15).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    assert isna(result.iloc[13])
+    assert notna(result.iloc[14])
+
+    ser2 = Series(np.random.default_rng(2).standard_normal(20))
+    result = ser2.expanding(min_periods=5).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    assert isna(result[3])
+    assert notna(result[4])
+
+    # min_periods=0
+    result0 = ser.expanding(min_periods=0).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    result1 = ser.expanding(min_periods=1).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    tm.assert_almost_equal(result0, result1)
+
+    result = ser.expanding(min_periods=1).apply(
+        lambda x: x.mean(), raw=raw, engine=engine
+    )
+    tm.assert_almost_equal(result.iloc[-1], np.mean(ser[:50]))
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        lambda x: (x.expanding(min_periods=5).cov(x, pairwise=True)),
+        lambda x: (x.expanding(min_periods=5).corr(x, pairwise=True)),
+    ],
+)
+def test_moment_functions_zero_length_pairwise(f):
+    df1 = DataFrame()
+    df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar"))
+    df2["a"] = df2["a"].astype("float64")
+
+    df1_expected = DataFrame(index=MultiIndex.from_product([df1.index, df1.columns]))
+    df2_expected = DataFrame(
+        index=MultiIndex.from_product([df2.index, df2.columns], names=["bar", "foo"]),
+        columns=Index(["a"], name="foo"),
+        dtype="float64",
+    )
+
+    df1_result = f(df1)
+    tm.assert_frame_equal(df1_result, df1_expected)
+
+    df2_result = f(df2)
+    tm.assert_frame_equal(df2_result, df2_expected)
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        lambda x: x.expanding().count(),
+        lambda x: x.expanding(min_periods=5).cov(x, pairwise=False),
+        lambda x: x.expanding(min_periods=5).corr(x, pairwise=False),
+        lambda x: x.expanding(min_periods=5).max(),
+        lambda x: x.expanding(min_periods=5).min(),
+        lambda x: x.expanding(min_periods=5).first(),
+        lambda x: x.expanding(min_periods=5).last(),
+        lambda x: x.expanding(min_periods=5).sum(),
+        lambda x: x.expanding(min_periods=5).mean(),
+        lambda x: x.expanding(min_periods=5).std(),
+        lambda x: x.expanding(min_periods=5).var(),
+        lambda x: x.expanding(min_periods=5).skew(),
+        lambda x: x.expanding(min_periods=5).kurt(),
+        lambda x: x.expanding(min_periods=5).quantile(0.5),
+        lambda x: x.expanding(min_periods=5).median(),
+        lambda x: x.expanding(min_periods=5).apply(sum, raw=False),
+        lambda x: x.expanding(min_periods=5).apply(sum, raw=True),
+    ],
+)
+def test_moment_functions_zero_length(f):
+    # GH 8056
+    s = Series(dtype=np.float64)
+    s_expected = s
+    df1 = DataFrame()
+    df1_expected = df1
+    df2 = DataFrame(columns=["a"])
+    df2["a"] = df2["a"].astype("float64")
+    df2_expected = df2
+
+    s_result = f(s)
+    tm.assert_series_equal(s_result, s_expected)
+
+    df1_result = f(df1)
+    tm.assert_frame_equal(df1_result, df1_expected)
+
+    df2_result = f(df2)
+    tm.assert_frame_equal(df2_result, df2_expected)
+
+
+def test_expanding_apply_empty_series(engine_and_raw):
+    engine, raw = engine_and_raw
+    ser = Series([], dtype=np.float64)
+    tm.assert_series_equal(
+        ser, ser.expanding().apply(lambda x: x.mean(), raw=raw, engine=engine)
+    )
+
+
+def test_expanding_apply_min_periods_0(engine_and_raw):
+    # GH 8080
+    engine, raw = engine_and_raw
+    s = Series([None, None, None])
+    result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw, engine=engine)
+    expected = Series([1.0, 2.0, 3.0])
+    tm.assert_series_equal(result, expected)
+
+
+def test_expanding_cov_diff_index():
+    # GH 7512
+    s1 = Series([1, 2, 3], index=range(3))
+    s2 = Series([1, 3], index=range(0, 4, 2))
+    result = s1.expanding().cov(s2)
+    expected = Series([None, None, 2.0])
+    tm.assert_series_equal(result, expected)
+
+    s2a = Series([1, None, 3], index=[0, 1, 2])
+    result = s1.expanding().cov(s2a)
+    tm.assert_series_equal(result, expected)
+
+    s1 = Series([7, 8, 10], index=[0, 1, 3])
+    s2 = Series([7, 9, 10], index=[0, 2, 3])
+    result = s1.expanding().cov(s2)
+    expected = Series([None, None, None, 4.5], index=list(range(4)))
+    tm.assert_series_equal(result, expected)
+
+
+def test_expanding_corr_diff_index():
+    # GH 7512
+    s1 = Series([1, 2, 3], index=range(3))
+    s2 = Series([1, 3], index=range(0, 4, 2))
+    result = s1.expanding().corr(s2)
+    expected = Series([None, None, 1.0])
+    tm.assert_series_equal(result, expected)
+
+    s2a = Series([1, None, 3], index=[0, 1, 2])
+    result = s1.expanding().corr(s2a)
+    tm.assert_series_equal(result, expected)
+
+    s1 = Series([7, 8, 10], index=[0, 1, 3])
+    s2 = Series([7, 9, 10], index=[0, 2, 3])
+    result = s1.expanding().corr(s2)
+    expected = Series([None, None, None, 1.0], index=list(range(4)))
+    tm.assert_series_equal(result, expected)
+
+
+def test_expanding_cov_pairwise_diff_length():
+    # GH 7512
+    df1 = DataFrame([[1, 5], [3, 2], [3, 9]], columns=Index(["A", "B"], name="foo"))
+    df1a = DataFrame(
+        [[1, 5], [3, 9]], index=[0, 2], columns=Index(["A", "B"], name="foo")
+    )
+    df2 = DataFrame(
+        [[5, 6], [None, None], [2, 1]], columns=Index(["X", "Y"], name="foo")
+    )
+    df2a = DataFrame(
+        [[5, 6], [2, 1]], index=[0, 2], columns=Index(["X", "Y"], name="foo")
+    )
+    # xref gh-15826
+    # .loc is not preserving the names
+    result1 = df1.expanding().cov(df2, pairwise=True).loc[2]
+    result2 = df1.expanding().cov(df2a, pairwise=True).loc[2]
+    result3 = df1a.expanding().cov(df2, pairwise=True).loc[2]
+    result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2]
+    expected = DataFrame(
+        [[-3.0, -6.0], [-5.0, -10.0]],
+        columns=Index(["A", "B"], name="foo"),
+        index=Index(["X", "Y"], name="foo"),
+    )
+    tm.assert_frame_equal(result1, expected)
+    tm.assert_frame_equal(result2, expected)
+    tm.assert_frame_equal(result3, expected)
+    tm.assert_frame_equal(result4, expected)
+
+
+def test_expanding_corr_pairwise_diff_length():
+    # GH 7512
+    df1 = DataFrame(
+        [[1, 2], [3, 2], [3, 4]], columns=["A", "B"], index=Index(range(3), name="bar")
+    )
+    df1a = DataFrame(
+        [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"]
+    )
+    df2 = DataFrame(
+        [[5, 6], [None, None], [2, 1]],
+        columns=["X", "Y"],
+        index=Index(range(3), name="bar"),
+    )
+    df2a = DataFrame(
+        [[5, 6], [2, 1]], index=Index([0, 2], name="bar"), columns=["X", "Y"]
+    )
+    result1 = df1.expanding().corr(df2, pairwise=True).loc[2]
+    result2 = df1.expanding().corr(df2a, pairwise=True).loc[2]
+    result3 = df1a.expanding().corr(df2, pairwise=True).loc[2]
+    result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2]
+    expected = DataFrame(
+        [[-1.0, -1.0], [-1.0, -1.0]], columns=["A", "B"], index=Index(["X", "Y"])
+    )
+    tm.assert_frame_equal(result1, expected)
+    tm.assert_frame_equal(result2, expected)
+    tm.assert_frame_equal(result3, expected)
+    tm.assert_frame_equal(result4, expected)
+
+
+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [float("nan"), float("nan"), 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+                1.0,
+            ],
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                float("nan"),
+                5.0,
+                5.0,
+                7.0,
+                7.0,
+                9.0,
+                9.0,
+            ],
+        ),
+    ],
+)
+def test_expanding_first_last(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.expanding(3), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.expanding(3), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [1.0] * 10,
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [1.0] * 10,
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0],
+        ),
+    ],
+)
+def test_expanding_first_last_no_minp(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.expanding(min_periods=0), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.expanding(min_periods=0), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
+def test_expanding_apply_args_kwargs(engine_and_raw):
+    def mean_w_arg(x, const):
+        return np.mean(x) + const
+
+    engine, raw = engine_and_raw
+
+    df = DataFrame(np.random.default_rng(2).random((20, 3)))
+
+    expected = df.expanding().apply(np.mean, engine=engine, raw=raw) + 20.0
+
+    result = df.expanding().apply(mean_w_arg, engine=engine, raw=raw, args=(20,))
+    tm.assert_frame_equal(result, expected)
+
+    result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_numeric_only_frame(arithmetic_win_operators, numeric_only):
+    # GH#46560
+    kernel = arithmetic_win_operators
+    df = DataFrame({"a": [1], "b": 2, "c": 3})
+    df["c"] = df["c"].astype(object)
+    expanding = df.expanding()
+    op = getattr(expanding, kernel, None)
+    if op is not None:
+        result = op(numeric_only=numeric_only)
+
+        columns = ["a", "b"] if numeric_only else ["a", "b", "c"]
+        expected = df[columns].agg([kernel]).reset_index(drop=True).astype(float)
+        assert list(expected.columns) == columns
+
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("kernel", ["corr", "cov"])
+@pytest.mark.parametrize("use_arg", [True, False])
+def test_numeric_only_corr_cov_frame(kernel, numeric_only, use_arg):
+    # GH#46560
+    df = DataFrame({"a": [1, 2, 3], "b": 2, "c": 3})
+    df["c"] = df["c"].astype(object)
+    arg = (df,) if use_arg else ()
+    expanding = df.expanding()
+    op = getattr(expanding, kernel)
+    result = op(*arg, numeric_only=numeric_only)
+
+    # Compare result to op using float dtypes, dropping c when numeric_only is True
+    columns = ["a", "b"] if numeric_only else ["a", "b", "c"]
+    df2 = df[columns].astype(float)
+    arg2 = (df2,) if use_arg else ()
+    expanding2 = df2.expanding()
+    op2 = getattr(expanding2, kernel)
+    expected = op2(*arg2, numeric_only=numeric_only)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [int, object])
+def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype):
+    # GH#46560
+    kernel = arithmetic_win_operators
+    ser = Series([1], dtype=dtype)
+    expanding = ser.expanding()
+    op = getattr(expanding, kernel)
+    if numeric_only and dtype is object:
+        msg = f"Expanding.{kernel} does not implement numeric_only"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(numeric_only=numeric_only)
+    else:
+        result = op(numeric_only=numeric_only)
+        expected = ser.agg([kernel]).reset_index(drop=True).astype(float)
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("kernel", ["corr", "cov"])
+@pytest.mark.parametrize("use_arg", [True, False])
+@pytest.mark.parametrize("dtype", [int, object])
+def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype):
+    # GH#46560
+    ser = Series([1, 2, 3], dtype=dtype)
+    arg = (ser,) if use_arg else ()
+    expanding = ser.expanding()
+    op = getattr(expanding, kernel)
+    if numeric_only and dtype is object:
+        msg = f"Expanding.{kernel} does not implement numeric_only"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(*arg, numeric_only=numeric_only)
+    else:
+        result = op(*arg, numeric_only=numeric_only)
+
+        ser2 = ser.astype(float)
+        arg2 = (ser2,) if use_arg else ()
+        expanding2 = ser2.expanding()
+        op2 = getattr(expanding2, kernel)
+        expected = op2(*arg2, numeric_only=numeric_only)
+        tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py
new file mode 100644
index 0000000000000000000000000000000000000000..543ae095b1cb432fff6e85d422f984a741ba06d2
--- /dev/null
+++ b/pandas/tests/window/test_groupby.py
@@ -0,0 +1,1389 @@
+import numpy as np
+import pytest
+
+from pandas.errors import Pandas4Warning
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    NamedAgg,
+    Series,
+    Timestamp,
+    date_range,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.api.indexers import BaseIndexer
+from pandas.core.groupby.groupby import get_groupby
+
+
+@pytest.fixture
+def times_frame():
+    """Frame for testing times argument in EWM groupby."""
+    return DataFrame(
+        {
+            "A": ["a", "b", "c", "a", "b", "c", "a", "b", "c", "a"],
+            "B": [0, 0, 0, 1, 1, 1, 2, 2, 2, 3],
+            "C": to_datetime(
+                [
+                    "2020-01-01",
+                    "2020-01-01",
+                    "2020-01-01",
+                    "2020-01-02",
+                    "2020-01-10",
+                    "2020-01-22",
+                    "2020-01-03",
+                    "2020-01-23",
+                    "2020-01-23",
+                    "2020-01-04",
+                ]
+            ),
+        }
+    )
+
+
+@pytest.fixture
+def roll_frame():
+    return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
+
+
+class TestRolling:
+    def test_groupby_unsupported_argument(self, roll_frame):
+        msg = r"groupby\(\) got an unexpected keyword argument 'foo'"
+        with pytest.raises(TypeError, match=msg):
+            roll_frame.groupby("A", foo=1)
+
+    def test_getitem(self, roll_frame):
+        g = roll_frame.groupby("A")
+        g_mutated = get_groupby(roll_frame, by="A")
+
+        expected = g_mutated.B.apply(lambda x: x.rolling(2).mean())
+
+        result = g.rolling(2).mean().B
+        tm.assert_series_equal(result, expected)
+
+        result = g.rolling(2).B.mean()
+        tm.assert_series_equal(result, expected)
+
+        result = g.B.rolling(2).mean()
+        tm.assert_series_equal(result, expected)
+
+        result = roll_frame.B.groupby(roll_frame.A).rolling(2).mean()
+        tm.assert_series_equal(result, expected)
+
+    def test_getitem_multiple(self, roll_frame):
+        # GH 13174
+        g = roll_frame.groupby("A")
+        r = g.rolling(2, min_periods=0)
+        g_mutated = get_groupby(roll_frame, by="A")
+        expected = g_mutated.B.apply(lambda x: x.rolling(2, min_periods=0).count())
+
+        result = r.B.count()
+        tm.assert_series_equal(result, expected)
+
+        result = r.B.count()
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            "sum",
+            "mean",
+            "min",
+            "max",
+            "first",
+            "last",
+            "count",
+            "kurt",
+            "skew",
+            "nunique",
+        ],
+    )
+    def test_rolling(self, f, roll_frame):
+        g = roll_frame.groupby("A", group_keys=False)
+        r = g.rolling(window=4)
+
+        result = getattr(r, f)()
+        expected = g.apply(lambda x: getattr(x.rolling(4), f)())
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("f", ["std", "var"])
+    def test_rolling_ddof(self, f, roll_frame):
+        g = roll_frame.groupby("A", group_keys=False)
+        r = g.rolling(window=4)
+
+        result = getattr(r, f)(ddof=1)
+        expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1))
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"]
+    )
+    def test_rolling_quantile(self, interpolation, roll_frame):
+        g = roll_frame.groupby("A", group_keys=False)
+        r = g.rolling(window=4)
+
+        result = r.quantile(0.4, interpolation=interpolation)
+        expected = g.apply(
+            lambda x: x.rolling(4).quantile(0.4, interpolation=interpolation)
+        )
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("f, expected_val", [["corr", 1], ["cov", 0.5]])
+    def test_rolling_corr_cov_other_same_size_as_groups(self, f, expected_val):
+        # GH 42915
+        df = DataFrame(
+            {"value": range(10), "idx1": [1] * 5 + [2] * 5, "idx2": [1, 2, 3, 4, 5] * 2}
+        ).set_index(["idx1", "idx2"])
+        other = DataFrame({"value": range(5), "idx2": [1, 2, 3, 4, 5]}).set_index(
+            "idx2"
+        )
+        result = getattr(df.groupby(level=0).rolling(2), f)(other)
+        expected_data = ([np.nan] + [expected_val] * 4) * 2
+        expected = DataFrame(
+            expected_data,
+            columns=["value"],
+            index=MultiIndex.from_arrays(
+                [
+                    [1] * 5 + [2] * 5,
+                    [1] * 5 + [2] * 5,
+                    list(range(1, 6)) * 2,
+                ],
+                names=["idx1", "idx1", "idx2"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("f", ["corr", "cov"])
+    def test_rolling_corr_cov_other_diff_size_as_groups(self, f, roll_frame):
+        g = roll_frame.groupby("A")
+        r = g.rolling(window=4)
+
+        result = getattr(r, f)(roll_frame)
+
+        def func(x):
+            return getattr(x.rolling(4), f)(roll_frame)
+
+        expected = g.apply(func)
+        # GH 39591: The grouped column should be all np.nan
+        # (groupby.apply inserts 0s for cov)
+        expected["A"] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("f", ["corr", "cov"])
+    def test_rolling_corr_cov_pairwise(self, f, roll_frame):
+        g = roll_frame.groupby("A")
+        r = g.rolling(window=4)
+
+        result = getattr(r.B, f)(pairwise=True)
+
+        def func(x):
+            return getattr(x.B.rolling(4), f)(pairwise=True)
+
+        expected = g.apply(func)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "func, expected_values",
+        [("cov", [[1.0, 1.0], [1.0, 4.0]]), ("corr", [[1.0, 0.5], [0.5, 1.0]])],
+    )
+    def test_rolling_corr_cov_unordered(self, func, expected_values):
+        # GH 43386
+        df = DataFrame(
+            {
+                "a": ["g1", "g2", "g1", "g1"],
+                "b": [0, 0, 1, 2],
+                "c": [2, 0, 6, 4],
+            }
+        )
+        rol = df.groupby("a").rolling(3)
+        result = getattr(rol, func)()
+        expected = DataFrame(
+            {
+                "b": 4 * [np.nan] + expected_values[0] + 2 * [np.nan],
+                "c": 4 * [np.nan] + expected_values[1] + 2 * [np.nan],
+            },
+            index=MultiIndex.from_tuples(
+                [
+                    ("g1", 0, "b"),
+                    ("g1", 0, "c"),
+                    ("g1", 2, "b"),
+                    ("g1", 2, "c"),
+                    ("g1", 3, "b"),
+                    ("g1", 3, "c"),
+                    ("g2", 1, "b"),
+                    ("g2", 1, "c"),
+                ],
+                names=["a", None, None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_rolling_apply(self, raw, roll_frame):
+        g = roll_frame.groupby("A", group_keys=False)
+        r = g.rolling(window=4)
+
+        # reduction
+        result = r.apply(lambda x: x.sum(), raw=raw)
+        expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([roll_frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    def test_rolling_apply_mutability(self):
+        # GH 14013
+        df = DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6})
+        g = df.groupby("A")
+
+        mi = MultiIndex.from_tuples(
+            [("bar", 3), ("bar", 4), ("bar", 5), ("foo", 0), ("foo", 1), ("foo", 2)]
+        )
+
+        mi.names = ["A", None]
+        # Grouped column should not be a part of the output
+        expected = DataFrame([np.nan, 2.0, 2.0] * 2, columns=["B"], index=mi)
+
+        result = g.rolling(window=2).sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Call an arbitrary function on the groupby
+        g.sum()
+
+        # Make sure nothing has been mutated
+        result = g.rolling(window=2).sum()
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("expected_value,raw_value", [[1.0, True], [0.0, False]])
+    def test_groupby_rolling(self, expected_value, raw_value):
+        # GH 31754
+
+        def isnumpyarray(x):
+            return int(isinstance(x, np.ndarray))
+
+        df = DataFrame({"id": [1, 1, 1], "value": [1, 2, 3]})
+        result = df.groupby("id").value.rolling(1).apply(isnumpyarray, raw=raw_value)
+        expected = Series(
+            [expected_value] * 3,
+            index=MultiIndex.from_tuples(((1, 0), (1, 1), (1, 2)), names=["id", None]),
+            name="value",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_center_center(self):
+        # GH 35552
+        series = Series(range(1, 6))
+        result = series.groupby(series).rolling(center=True, window=3).mean()
+        expected = Series(
+            [np.nan] * 5,
+            index=MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))),
+        )
+        tm.assert_series_equal(result, expected)
+
+        series = Series(range(1, 5))
+        result = series.groupby(series).rolling(center=True, window=3).mean()
+        expected = Series(
+            [np.nan] * 4,
+            index=MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))),
+        )
+        tm.assert_series_equal(result, expected)
+
+        df = DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)})
+        result = df.groupby("a").rolling(center=True, window=3).mean()
+        expected = DataFrame(
+            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan],
+            index=MultiIndex.from_tuples(
+                (
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                    ("a", 4),
+                    ("b", 5),
+                    ("b", 6),
+                    ("b", 7),
+                    ("b", 8),
+                    ("b", 9),
+                    ("b", 10),
+                ),
+                names=["a", None],
+            ),
+            columns=["b"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+        df = DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)})
+        result = df.groupby("a").rolling(center=True, window=3).mean()
+        expected = DataFrame(
+            [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan],
+            index=MultiIndex.from_tuples(
+                (
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                    ("a", 4),
+                    ("b", 5),
+                    ("b", 6),
+                    ("b", 7),
+                    ("b", 8),
+                    ("b", 9),
+                ),
+                names=["a", None],
+            ),
+            columns=["b"],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_center_on(self):
+        # GH 37141
+        df = DataFrame(
+            data={
+                "Date": date_range("2020-01-01", "2020-01-10"),
+                "gb": ["group_1"] * 6 + ["group_2"] * 4,
+                "value": range(10),
+            }
+        )
+        result = (
+            df.groupby("gb")
+            .rolling(6, on="Date", center=True, min_periods=1)
+            .value.mean()
+        )
+        mi = MultiIndex.from_arrays([df["gb"], df["Date"]], names=["gb", "Date"])
+        expected = Series(
+            [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 7.0, 7.5, 7.5, 7.5],
+            name="value",
+            index=mi,
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("min_periods", [5, 4, 3])
+    def test_groupby_rolling_center_min_periods(self, min_periods):
+        # GH 36040
+        df = DataFrame({"group": ["A"] * 10 + ["B"] * 10, "data": range(20)})
+
+        window_size = 5
+        result = (
+            df.groupby("group")
+            .rolling(window_size, center=True, min_periods=min_periods)
+            .mean()
+        )
+        result = result.reset_index()[["group", "data"]]
+
+        grp_A_mean = [1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 7.5, 8.0]
+        grp_B_mean = [x + 10.0 for x in grp_A_mean]
+
+        num_nans = max(0, min_periods - 3)  # For window_size of 5
+        nans = [np.nan] * num_nans
+        grp_A_expected = nans + grp_A_mean[num_nans : 10 - num_nans] + nans
+        grp_B_expected = nans + grp_B_mean[num_nans : 10 - num_nans] + nans
+
+        expected = DataFrame(
+            {"group": ["A"] * 10 + ["B"] * 10, "data": grp_A_expected + grp_B_expected}
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_subselect_rolling(self):
+        # GH 35486
+        df = DataFrame(
+            {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [10, 20, 30, 20]}
+        )
+        result = df.groupby("a")[["b"]].rolling(2).max()
+        expected = DataFrame(
+            [np.nan, np.nan, 2.0, np.nan],
+            columns=["b"],
+            index=MultiIndex.from_tuples(
+                ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None]
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = df.groupby("a")["b"].rolling(2).max()
+        expected = Series(
+            [np.nan, np.nan, 2.0, np.nan],
+            index=MultiIndex.from_tuples(
+                ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None]
+            ),
+            name="b",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_custom_indexer(self):
+        # GH 35557
+        class SimpleIndexer(BaseIndexer):
+            def get_window_bounds(
+                self,
+                num_values=0,
+                min_periods=None,
+                center=None,
+                closed=None,
+                step=None,
+            ):
+                min_periods = self.window_size if min_periods is None else 0
+                end = np.arange(num_values, dtype=np.int64) + 1
+                start = end - self.window_size
+                start[start < 0] = min_periods
+                return start, end
+
+        df = DataFrame(
+            {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5
+        )
+        result = (
+            df.groupby(df.index)
+            .rolling(SimpleIndexer(window_size=3), min_periods=1)
+            .sum()
+        )
+        expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_subset_with_closed(self):
+        # GH 35549
+        df = DataFrame(
+            {
+                "column1": range(8),
+                "column2": range(8),
+                "group": ["A"] * 4 + ["B"] * 4,
+                "date": [
+                    Timestamp(date)
+                    for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
+                ]
+                * 2,
+            }
+        )
+        result = (
+            df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
+        )
+        expected = Series(
+            [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
+            index=MultiIndex.from_frame(
+                df[["group", "date"]],
+                names=["group", "date"],
+            ),
+            name="column1",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_agg_namedagg(self):
+        # GH#28333
+        df = DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0],
+                "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0],
+            }
+        )
+        result = (
+            df.groupby("kind")
+            .rolling(2)
+            .agg(
+                total_weight=NamedAgg(column="weight", aggfunc=sum),
+                min_height=NamedAgg(column="height", aggfunc=min),
+            )
+        )
+        expected = DataFrame(
+            {
+                "total_weight": [np.nan, 17.8, 19.9, np.nan, 205.5, 240.0],
+                "min_height": [np.nan, 9.1, 9.5, np.nan, 6.0, 8.0],
+            },
+            index=MultiIndex(
+                [["cat", "dog"], [0, 1, 2, 3, 4, 5]],
+                [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]],
+                names=["kind", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_subset_rolling_subset_with_closed(self):
+        # GH 35549
+        df = DataFrame(
+            {
+                "column1": range(8),
+                "column2": range(8),
+                "group": ["A"] * 4 + ["B"] * 4,
+                "date": [
+                    Timestamp(date)
+                    for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
+                ]
+                * 2,
+            }
+        )
+
+        result = (
+            df.groupby("group")[["column1", "date"]]
+            .rolling("1D", on="date", closed="left")["column1"]
+            .sum()
+        )
+        expected = Series(
+            [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
+            index=MultiIndex.from_frame(
+                df[["group", "date"]],
+                names=["group", "date"],
+            ),
+            name="column1",
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize("func", ["max", "min"])
+    def test_groupby_rolling_index_changed(self, func):
+        # GH: #36018 nlevels of MultiIndex changed
+        ds = Series(
+            [1, 2, 2],
+            index=MultiIndex.from_tuples(
+                [("a", "x"), ("a", "y"), ("c", "z")], names=["1", "2"]
+            ),
+            name="a",
+        )
+
+        result = getattr(ds.groupby(ds).rolling(2), func)()
+        expected = Series(
+            [np.nan, np.nan, 2.0],
+            index=MultiIndex.from_tuples(
+                [(1, "a", "x"), (2, "a", "y"), (2, "c", "z")], names=["a", "1", "2"]
+            ),
+            name="a",
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_empty_frame(self):
+        # GH 36197
+        expected = DataFrame({"s1": []})
+        result = expected.groupby("s1").rolling(window=1).sum()
+        # GH 32262
+        expected = expected.drop(columns="s1")
+        # GH-38057 from_tuples gives empty object dtype, we now get float/int levels
+        # expected.index = MultiIndex.from_tuples([], names=["s1", None])
+        expected.index = MultiIndex.from_product(
+            [Index([], dtype="float64"), Index([], dtype="int64")], names=["s1", None]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        expected = DataFrame({"s1": [], "s2": []})
+        result = expected.groupby(["s1", "s2"]).rolling(window=1).sum()
+        # GH 32262
+        expected = expected.drop(columns=["s1", "s2"])
+        expected.index = MultiIndex.from_product(
+            [
+                Index([], dtype="float64"),
+                Index([], dtype="float64"),
+                Index([], dtype="int64"),
+            ],
+            names=["s1", "s2", None],
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_string_index(self):
+        # GH: 36727
+        df = DataFrame(
+            [
+                ["A", "group_1", Timestamp(2019, 1, 1, 9)],
+                ["B", "group_1", Timestamp(2019, 1, 2, 9)],
+                ["Z", "group_2", Timestamp(2019, 1, 3, 9)],
+                ["H", "group_1", Timestamp(2019, 1, 6, 9)],
+                ["E", "group_2", Timestamp(2019, 1, 20, 9)],
+            ],
+            columns=["index", "group", "eventTime"],
+        ).set_index("index")
+
+        groups = df.groupby("group")
+        df["count_to_date"] = groups.cumcount()
+        rolling_groups = groups.rolling("10D", on="eventTime")
+        result = rolling_groups.apply(lambda df: df.shape[0])
+        expected = DataFrame(
+            [
+                ["A", "group_1", Timestamp(2019, 1, 1, 9), 1.0],
+                ["B", "group_1", Timestamp(2019, 1, 2, 9), 2.0],
+                ["H", "group_1", Timestamp(2019, 1, 6, 9), 3.0],
+                ["Z", "group_2", Timestamp(2019, 1, 3, 9), 1.0],
+                ["E", "group_2", Timestamp(2019, 1, 20, 9), 1.0],
+            ],
+            columns=["index", "group", "eventTime", "count_to_date"],
+        ).set_index(["group", "index"])
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_no_sort(self):
+        # GH 36889
+        result = (
+            DataFrame({"foo": [2, 1], "bar": [2, 1]})
+            .groupby("foo", sort=False)
+            .rolling(1)
+            .min()
+        )
+        expected = DataFrame(
+            np.array([[2.0, 2.0], [1.0, 1.0]]),
+            columns=["foo", "bar"],
+            index=MultiIndex.from_tuples([(2, 0), (1, 1)], names=["foo", None]),
+        )
+        # GH 32262
+        expected = expected.drop(columns="foo")
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_count_closed_on(self, unit):
+        # GH 35869
+        df = DataFrame(
+            {
+                "column1": range(6),
+                "column2": range(6),
+                "group": 3 * ["A", "B"],
+                "date": date_range(end="20190101", periods=6, unit=unit),
+            }
+        )
+        msg = "'d' is deprecated and will be removed in a future version."
+
+        with tm.assert_produces_warning(Pandas4Warning, match=msg):
+            result = (
+                df.groupby("group")
+                .rolling("3d", on="date", closed="left")["column1"]
+                .count()
+            )
+        dti = DatetimeIndex(
+            [
+                "2018-12-27",
+                "2018-12-29",
+                "2018-12-31",
+                "2018-12-28",
+                "2018-12-30",
+                "2019-01-01",
+            ],
+            dtype=f"M8[{unit}]",
+        )
+        mi = MultiIndex.from_arrays(
+            [
+                ["A", "A", "A", "B", "B", "B"],
+                dti,
+            ],
+            names=["group", "date"],
+        )
+        expected = Series(
+            [np.nan, 1.0, 1.0, np.nan, 1.0, 1.0],
+            name="column1",
+            index=mi,
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        ("func", "kwargs", "expected_values"),
+        [
+            (
+                "rolling",
+                {"window": 2, "min_periods": 1},
+                [np.nan, 0.5, np.nan, 0.5, 0.5],
+            ),
+            ("expanding", {}, [np.nan, 0.5, np.nan, 0.5, (1 / 3) ** 0.5]),
+        ],
+    )
+    def test_groupby_rolling_sem(self, func, kwargs, expected_values):
+        # GH: 26476
+        df = DataFrame(
+            [["a", 1], ["a", 2], ["b", 1], ["b", 2], ["b", 3]], columns=["a", "b"]
+        )
+        result = getattr(df.groupby("a"), func)(**kwargs).sem()
+        expected = DataFrame(
+            {"a": [np.nan] * 5, "b": expected_values},
+            index=MultiIndex.from_tuples(
+                [("a", 0), ("a", 1), ("b", 2), ("b", 3), ("b", 4)], names=["a", None]
+            ),
+        )
+        # GH 32262
+        expected = expected.drop(columns="a")
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        ("rollings", "key"), [({"on": "a"}, "a"), ({"on": None}, "index")]
+    )
+    def test_groupby_rolling_nans_in_index(self, rollings, key):
+        # GH: 34617
+        df = DataFrame(
+            {
+                "a": to_datetime(["2020-06-01 12:00", "2020-06-01 14:00", np.nan]),
+                "b": [1, 2, 3],
+                "c": [1, 1, 1],
+            }
+        )
+        if key == "index":
+            df = df.set_index("a")
+        with pytest.raises(ValueError, match=f"{key} values must not have NaT"):
+            df.groupby("c").rolling("60min", **rollings)
+
+    @pytest.mark.parametrize("group_keys", [True, False])
+    def test_groupby_rolling_group_keys(self, group_keys):
+        # GH 37641
+        # GH 38523: GH 37641 actually was not a bug.
+        # group_keys only applies to groupby.apply directly
+        arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
+        index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))
+
+        s = Series([1, 2, 3], index=index)
+        result = s.groupby(["idx1", "idx2"], group_keys=group_keys).rolling(1).mean()
+        expected = Series(
+            [1.0, 2.0, 3.0],
+            index=MultiIndex.from_tuples(
+                [
+                    ("val1", "val1", "val1", "val1"),
+                    ("val1", "val1", "val1", "val1"),
+                    ("val2", "val2", "val2", "val2"),
+                ],
+                names=["idx1", "idx2", "idx1", "idx2"],
+            ),
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_index_level_and_column_label(self):
+        # The groupby keys should not appear as a resulting column
+        arrays = [["val1", "val1", "val2"], ["val1", "val1", "val2"]]
+        index = MultiIndex.from_arrays(arrays, names=("idx1", "idx2"))
+
+        df = DataFrame({"A": [1, 1, 2], "B": range(3)}, index=index)
+        result = df.groupby(["idx1", "A"]).rolling(1).mean()
+        expected = DataFrame(
+            {"B": [0.0, 1.0, 2.0]},
+            index=MultiIndex.from_tuples(
+                [
+                    ("val1", 1, "val1", "val1"),
+                    ("val1", 1, "val1", "val1"),
+                    ("val2", 2, "val2", "val2"),
+                ],
+                names=["idx1", "A", "idx1", "idx2"],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_rolling_resulting_multiindex(self):
+        # a few different cases checking the created MultiIndex of the result
+        # https://github.com/pandas-dev/pandas/pull/38057
+
+        # grouping by 1 columns -> 2-level MI as result
+        df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4})
+        result = df.groupby("b").rolling(3).mean()
+        expected_index = MultiIndex.from_tuples(
+            [(1, 0), (1, 2), (1, 4), (1, 6), (2, 1), (2, 3), (2, 5), (2, 7)],
+            names=["b", None],
+        )
+        tm.assert_index_equal(result.index, expected_index)
+
+    def test_groupby_rolling_resulting_multiindex2(self):
+        # grouping by 2 columns -> 3-level MI as result
+        df = DataFrame({"a": np.arange(12.0), "b": [1, 2] * 6, "c": [1, 2, 3, 4] * 3})
+        result = df.groupby(["b", "c"]).rolling(2).sum()
+        expected_index = MultiIndex.from_tuples(
+            [
+                (1, 1, 0),
+                (1, 1, 4),
+                (1, 1, 8),
+                (1, 3, 2),
+                (1, 3, 6),
+                (1, 3, 10),
+                (2, 2, 1),
+                (2, 2, 5),
+                (2, 2, 9),
+                (2, 4, 3),
+                (2, 4, 7),
+                (2, 4, 11),
+            ],
+            names=["b", "c", None],
+        )
+        tm.assert_index_equal(result.index, expected_index)
+
+    def test_groupby_rolling_resulting_multiindex3(self):
+        # grouping with 1 level on dataframe with 2-level MI -> 3-level MI as result
+        df = DataFrame({"a": np.arange(8.0), "b": [1, 2] * 4, "c": [1, 2, 3, 4] * 2})
+        df = df.set_index("c", append=True)
+        result = df.groupby("b").rolling(3).mean()
+        expected_index = MultiIndex.from_tuples(
+            [
+                (1, 0, 1),
+                (1, 2, 3),
+                (1, 4, 1),
+                (1, 6, 3),
+                (2, 1, 2),
+                (2, 3, 4),
+                (2, 5, 2),
+                (2, 7, 4),
+            ],
+            names=["b", None, "c"],
+        )
+        tm.assert_index_equal(result.index, expected_index, exact="equiv")
+
+    def test_groupby_rolling_object_doesnt_affect_groupby_apply(self, roll_frame):
+        # GH 39732
+        g = roll_frame.groupby("A", group_keys=False)
+        expected = g.apply(lambda x: x.rolling(4).sum()).index
+        _ = g.rolling(window=4)
+        result = g.apply(lambda x: x.rolling(4).sum()).index
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        ("window", "min_periods", "closed", "expected"),
+        [
+            (2, 0, "left", [None, 0.0, 1.0, 1.0, None, 0.0, 1.0, 1.0]),
+            (2, 2, "left", [None, None, 1.0, 1.0, None, None, 1.0, 1.0]),
+            (4, 4, "left", [None, None, None, None, None, None, None, None]),
+            (4, 4, "right", [None, None, None, 5.0, None, None, None, 5.0]),
+        ],
+    )
+    def test_groupby_rolling_var(self, window, min_periods, closed, expected):
+        df = DataFrame([1, 2, 3, 4, 5, 6, 7, 8])
+        result = (
+            df.groupby([1, 2, 1, 2, 1, 2, 1, 2])
+            .rolling(window=window, min_periods=min_periods, closed=closed)
+            .var(0)
+        )
+        expected_result = DataFrame(
+            np.array(expected, dtype="float64"),
+            index=MultiIndex(
+                levels=[np.array([1, 2]), [0, 1, 2, 3, 4, 5, 6, 7]],
+                codes=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 2, 4, 6, 1, 3, 5, 7]],
+            ),
+        )
+        tm.assert_frame_equal(result, expected_result)
+
+    @pytest.mark.parametrize(
+        "columns", [MultiIndex.from_tuples([("A", ""), ("B", "C")]), ["A", "B"]]
+    )
+    def test_by_column_not_in_values(self, columns):
+        # GH 32262
+        df = DataFrame([[1, 0]] * 20 + [[2, 0]] * 12 + [[3, 0]] * 8, columns=columns)
+        g = df.groupby("A")
+        original_obj = g.obj.copy(deep=True)
+        r = g.rolling(4)
+        result = r.sum()
+        assert "A" not in result.columns
+        tm.assert_frame_equal(g.obj, original_obj)
+
+    def test_groupby_level(self):
+        # GH 38523, 38787
+        arrays = [
+            ["Falcon", "Falcon", "Parrot", "Parrot"],
+            ["Captive", "Wild", "Captive", "Wild"],
+        ]
+        index = MultiIndex.from_arrays(arrays, names=("Animal", "Type"))
+        df = DataFrame({"Max Speed": [390.0, 350.0, 30.0, 20.0]}, index=index)
+        result = df.groupby(level=0)["Max Speed"].rolling(2).sum()
+        expected = Series(
+            [np.nan, 740.0, np.nan, 50.0],
+            index=MultiIndex.from_tuples(
+                [
+                    ("Falcon", "Falcon", "Captive"),
+                    ("Falcon", "Falcon", "Wild"),
+                    ("Parrot", "Parrot", "Captive"),
+                    ("Parrot", "Parrot", "Wild"),
+                ],
+                names=["Animal", "Animal", "Type"],
+            ),
+            name="Max Speed",
+        )
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "by, expected_data",
+        [
+            [["id"], {"num": [100.0, 150.0, 150.0, 200.0]}],
+            [
+                ["id", "index"],
+                {
+                    "date": [
+                        Timestamp("2018-01-01"),
+                        Timestamp("2018-01-02"),
+                        Timestamp("2018-01-01"),
+                        Timestamp("2018-01-02"),
+                    ],
+                    "num": [100.0, 200.0, 150.0, 250.0],
+                },
+            ],
+        ],
+    )
+    def test_as_index_false(self, by, expected_data, unit):
+        # GH 39433
+        data = [
+            ["A", "2018-01-01", 100.0],
+            ["A", "2018-01-02", 200.0],
+            ["B", "2018-01-01", 150.0],
+            ["B", "2018-01-02", 250.0],
+        ]
+        df = DataFrame(data, columns=["id", "date", "num"])
+        df["date"] = df["date"].astype(f"M8[{unit}]")
+        df = df.set_index(["date"])
+
+        gp_by = [getattr(df, attr) for attr in by]
+        result = (
+            df.groupby(gp_by, as_index=False).rolling(window=2, min_periods=1).mean()
+        )
+
+        expected = {"id": ["A", "A", "B", "B"]}
+        expected.update(expected_data)
+        expected = DataFrame(
+            expected,
+            index=df.index,
+        )
+        if "date" in expected_data:
+            expected["date"] = expected["date"].astype(f"M8[{unit}]")
+        tm.assert_frame_equal(result, expected)
+
+    def test_nan_and_zero_endpoints(self, any_int_numpy_dtype):
+        # https://github.com/twosigma/pandas/issues/53
+        typ = np.dtype(any_int_numpy_dtype).type
+        size = 1000
+        idx = np.repeat(typ(0), size)
+        idx[-1] = 1
+
+        val = 5e25
+        arr = np.repeat(val, size)
+        arr[0] = np.nan
+        arr[-1] = 0
+
+        df = DataFrame(
+            {
+                "index": idx,
+                "adl2": arr,
+            }
+        ).set_index("index")
+        result = df.groupby("index")["adl2"].rolling(window=10, min_periods=1).mean()
+        expected = Series(
+            arr,
+            name="adl2",
+            index=MultiIndex.from_arrays(
+                [
+                    Index([0] * 999 + [1], dtype=typ, name="index"),
+                    Index([0] * 999 + [1], dtype=typ, name="index"),
+                ],
+            ),
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_groupby_rolling_non_monotonic(self):
+        # GH 43909
+
+        shuffled = [3, 0, 1, 2]
+        sec = 1_000
+        df = DataFrame(
+            [{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
+        )
+        with pytest.raises(ValueError, match=r".* must be monotonic"):
+            df.groupby("c").rolling(on="t", window="3s")
+
+    def test_groupby_monotonic(self):
+        # GH 15130
+        # we don't need to validate monotonicity when grouping
+
+        # GH 43909 we should raise an error here to match
+        # behaviour of non-groupby rolling.
+
+        data = [
+            ["David", "1/1/2015", 100],
+            ["David", "1/5/2015", 500],
+            ["David", "5/30/2015", 50],
+            ["David", "7/25/2015", 50],
+            ["Ryan", "1/4/2014", 100],
+            ["Ryan", "1/19/2015", 500],
+            ["Ryan", "3/31/2016", 50],
+            ["Joe", "7/1/2015", 100],
+            ["Joe", "9/9/2015", 500],
+            ["Joe", "10/15/2015", 50],
+        ]
+
+        df = DataFrame(data=data, columns=["name", "date", "amount"])
+        df["date"] = to_datetime(df["date"])
+        df = df.sort_values("date")
+
+        expected = (
+            df.set_index("date")
+            .groupby("name")
+            .apply(lambda x: x.rolling("180D")["amount"].sum())
+        )
+        result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
+        tm.assert_series_equal(result, expected)
+
+    def test_datelike_on_monotonic_within_each_group(self):
+        # GH 13966 (similar to #15130, closed by #15175)
+
+        # superseded by 43909
+        # GH 46061: OK if the on is monotonic relative to each each group
+
+        dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
+        df = DataFrame(
+            {
+                "A": [1] * 20 + [2] * 12 + [3] * 8,
+                "B": np.concatenate((dates, dates)),
+                "C": np.arange(40),
+            }
+        )
+
+        expected = (
+            df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
+        )
+        result = df.groupby("A").rolling("4s", on="B").C.mean()
+        tm.assert_series_equal(result, expected)
+
+    def test_datelike_on_not_monotonic_within_each_group(self):
+        # GH 46061
+        df = DataFrame(
+            {
+                "A": [1] * 3 + [2] * 3,
+                "B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
+                "C": range(6),
+            }
+        )
+        with pytest.raises(ValueError, match="Each group within B must be monotonic."):
+            df.groupby("A").rolling("365D", on="B")
+
+
+class TestExpanding:
+    @pytest.fixture
+    def frame(self):
+        return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            "sum",
+            "mean",
+            "min",
+            "max",
+            "first",
+            "last",
+            "count",
+            "kurt",
+            "skew",
+            "nunique",
+        ],
+    )
+    def test_expanding(self, f, frame):
+        g = frame.groupby("A", group_keys=False)
+        r = g.expanding()
+
+        result = getattr(r, f)()
+        expected = g.apply(lambda x: getattr(x.expanding(), f)())
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("f", ["std", "var"])
+    def test_expanding_ddof(self, f, frame):
+        g = frame.groupby("A", group_keys=False)
+        r = g.expanding()
+
+        result = getattr(r, f)(ddof=0)
+        expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"]
+    )
+    def test_expanding_quantile(self, interpolation, frame):
+        g = frame.groupby("A", group_keys=False)
+        r = g.expanding()
+
+        result = r.quantile(0.4, interpolation=interpolation)
+        expected = g.apply(
+            lambda x: x.expanding().quantile(0.4, interpolation=interpolation)
+        )
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("f", ["corr", "cov"])
+    def test_expanding_corr_cov(self, f, frame):
+        g = frame.groupby("A")
+        r = g.expanding()
+
+        result = getattr(r, f)(frame)
+
+        def func_0(x):
+            return getattr(x.expanding(), f)(frame)
+
+        expected = g.apply(func_0)
+        # GH 39591: groupby.apply returns 1 instead of nan for windows
+        # with all nan values
+        null_idx = list(range(20, 61)) + list(range(72, 113))
+        expected.iloc[null_idx, 1] = np.nan
+        # GH 39591: The grouped column should be all np.nan
+        # (groupby.apply inserts 0s for cov)
+        expected["A"] = np.nan
+        tm.assert_frame_equal(result, expected)
+
+        result = getattr(r.B, f)(pairwise=True)
+
+        def func_1(x):
+            return getattr(x.B.expanding(), f)(pairwise=True)
+
+        expected = g.apply(func_1)
+        tm.assert_series_equal(result, expected)
+
+    def test_expanding_apply(self, raw, frame):
+        g = frame.groupby("A", group_keys=False)
+        r = g.expanding()
+
+        # reduction
+        result = r.apply(lambda x: x.sum(), raw=raw)
+        expected = g.apply(lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw))
+        # GH 39732
+        expected_index = MultiIndex.from_arrays([frame["A"], range(40)])
+        expected.index = expected_index
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_expanding_agg_namedagg(self):
+        # GH#28333
+        df = DataFrame(
+            {
+                "kind": ["cat", "dog", "cat", "dog", "cat", "dog"],
+                "height": [9.1, 6.0, 9.5, 34.0, 12.0, 8.0],
+                "weight": [7.9, 7.5, 9.9, 198.0, 10.0, 42.0],
+            }
+        )
+        result = (
+            df.groupby("kind")
+            .expanding(1)
+            .agg(
+                total_weight=NamedAgg(column="weight", aggfunc=sum),
+                min_height=NamedAgg(column="height", aggfunc=min),
+            )
+        )
+        expected = DataFrame(
+            {
+                "total_weight": [7.9, 17.8, 27.8, 7.5, 205.5, 247.5],
+                "min_height": [9.1, 9.1, 9.1, 6.0, 6.0, 6.0],
+            },
+            index=MultiIndex(
+                [["cat", "dog"], [0, 1, 2, 3, 4, 5]],
+                [[0, 0, 0, 1, 1, 1], [0, 2, 4, 1, 3, 5]],
+                names=["kind", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+class TestEWM:
+    @pytest.mark.parametrize(
+        "method, expected_data",
+        [
+            ["mean", [0.0, 0.6666666666666666, 1.4285714285714286, 2.2666666666666666]],
+            ["std", [np.nan, 0.707107, 0.963624, 1.177164]],
+            ["var", [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857]],
+        ],
+    )
+    def test_methods(self, method, expected_data):
+        # GH 16037
+        df = DataFrame({"A": ["a"] * 4, "B": range(4)})
+        result = getattr(df.groupby("A").ewm(com=1.0), method)()
+        expected = DataFrame(
+            {"B": expected_data},
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                ],
+                names=["A", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_groupby_ewm_agg_namedagg(self):
+        # GH#28333
+        df = DataFrame({"A": ["a"] * 4, "B": range(4)})
+        result = (
+            df.groupby("A")
+            .ewm(com=1.0)
+            .agg(
+                B_mean=NamedAgg(column="B", aggfunc="mean"),
+                B_std=NamedAgg(column="B", aggfunc="std"),
+                B_var=NamedAgg(column="B", aggfunc="var"),
+            )
+        )
+        expected = DataFrame(
+            {
+                "B_mean": [
+                    0.0,
+                    0.6666666666666666,
+                    1.4285714285714286,
+                    2.2666666666666666,
+                ],
+                "B_std": [np.nan, 0.707107, 0.963624, 1.177164],
+                "B_var": [np.nan, 0.5, 0.9285714285714286, 1.3857142857142857],
+            },
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", 0),
+                    ("a", 1),
+                    ("a", 2),
+                    ("a", 3),
+                ],
+                names=["A", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "method, expected_data",
+        [["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]],
+    )
+    def test_pairwise_methods(self, method, expected_data):
+        # GH 16037
+        df = DataFrame({"A": ["a"] * 4, "B": range(4)})
+        result = getattr(df.groupby("A").ewm(com=1.0), method)()
+        expected = DataFrame(
+            {"B": expected_data},
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", 0, "B"),
+                    ("a", 1, "B"),
+                    ("a", 2, "B"),
+                    ("a", 3, "B"),
+                ],
+                names=["A", None, None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.groupby("A")[["B"]].apply(
+            lambda x: getattr(x.ewm(com=1.0), method)()
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_times(self, times_frame):
+        # GH 40951
+        halflife = "23 days"
+        # GH#42738
+        times = times_frame.pop("C")
+        result = times_frame.groupby("A").ewm(halflife=halflife, times=times).mean()
+        expected = DataFrame(
+            {
+                "B": [
+                    0.0,
+                    0.507534,
+                    1.020088,
+                    1.537661,
+                    0.0,
+                    0.567395,
+                    1.221209,
+                    0.0,
+                    0.653141,
+                    1.195003,
+                ]
+            },
+            index=MultiIndex.from_tuples(
+                [
+                    ("a", 0),
+                    ("a", 3),
+                    ("a", 6),
+                    ("a", 9),
+                    ("b", 1),
+                    ("b", 4),
+                    ("b", 7),
+                    ("c", 2),
+                    ("c", 5),
+                    ("c", 8),
+                ],
+                names=["A", None],
+            ),
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_times_array(self, times_frame):
+        # GH 40951
+        halflife = "23 days"
+        times = times_frame.pop("C")
+        gb = times_frame.groupby("A")
+        result = gb.ewm(halflife=halflife, times=times).mean()
+        expected = gb.ewm(halflife=halflife, times=times.values).mean()
+        tm.assert_frame_equal(result, expected)
+
+    def test_dont_mutate_obj_after_slicing(self):
+        # GH 43355
+        df = DataFrame(
+            {
+                "id": ["a", "a", "b", "b", "b"],
+                "timestamp": date_range("2021-9-1", periods=5, freq="h"),
+                "y": range(5),
+            }
+        )
+        grp = df.groupby("id").rolling("1h", on="timestamp")
+        result = grp.count()
+        expected_df = DataFrame(
+            {
+                "timestamp": date_range("2021-9-1", periods=5, freq="h"),
+                "y": [1.0] * 5,
+            },
+            index=MultiIndex.from_arrays(
+                [["a", "a", "b", "b", "b"], list(range(5))], names=["id", None]
+            ),
+        )
+        tm.assert_frame_equal(result, expected_df)
+
+        result = grp["y"].count()
+        expected_series = Series(
+            [1.0] * 5,
+            index=MultiIndex.from_arrays(
+                [
+                    ["a", "a", "b", "b", "b"],
+                    date_range("2021-9-1", periods=5, freq="h"),
+                ],
+                names=["id", "timestamp"],
+            ),
+            name="y",
+        )
+        tm.assert_series_equal(result, expected_series)
+        # This is the key test
+        result = grp.count()
+        tm.assert_frame_equal(result, expected_df)
+
+
+def test_rolling_corr_with_single_integer_in_index():
+    # GH 44078
+    df = DataFrame({"a": [(1,), (1,), (1,)], "b": [4, 5, 6]})
+    gb = df.groupby(["a"])
+    result = gb.rolling(2).corr(other=df)
+    index = MultiIndex.from_tuples([((1,), 0), ((1,), 1), ((1,), 2)], names=["a", None])
+    expected = DataFrame(
+        {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_corr_with_tuples_in_index():
+    # GH 44078
+    df = DataFrame(
+        {
+            "a": [
+                (
+                    1,
+                    2,
+                ),
+                (
+                    1,
+                    2,
+                ),
+                (
+                    1,
+                    2,
+                ),
+            ],
+            "b": [4, 5, 6],
+        }
+    )
+    gb = df.groupby(["a"])
+    result = gb.rolling(2).corr(other=df)
+    index = MultiIndex.from_tuples(
+        [((1, 2), 0), ((1, 2), 1), ((1, 2), 2)], names=["a", None]
+    )
+    expected = DataFrame(
+        {"a": [np.nan, np.nan, np.nan], "b": [np.nan, 1.0, 1.0]}, index=index
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff6a616bc526427d0c2f3abfec9b66273c65fa00
--- /dev/null
+++ b/pandas/tests/window/test_numba.py
@@ -0,0 +1,648 @@
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_arm
+from pandas.errors import NumbaUtilError
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    Series,
+    option_context,
+    to_datetime,
+)
+import pandas._testing as tm
+from pandas.api.indexers import BaseIndexer
+from pandas.util.version import Version
+
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
+
+
+@pytest.fixture(params=["single", "table"])
+def method(request):
+    """method keyword in rolling/expanding/ewm constructor"""
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        ["sum", {}],
+        ["mean", {}],
+        ["median", {}],
+        ["max", {}],
+        ["min", {}],
+        ["var", {}],
+        ["var", {"ddof": 0}],
+        ["std", {}],
+        ["std", {"ddof": 0}],
+    ]
+)
+def arithmetic_numba_supported_operators(request):
+    return request.param
+
+
+@pytest.fixture
+def roll_frame():
+    return DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)})
+
+
+@td.skip_if_no("numba")
+@pytest.mark.filterwarnings("ignore")
+# Filter warnings when parallel=True and the function can't be parallelized by Numba
+class TestEngine:
+    @pytest.mark.parametrize("jit", [True, False])
+    def test_numba_vs_cython_apply(self, jit, nogil, parallel, nopython, center, step):
+        def f(x, *args):
+            arg_sum = 0
+            for arg in args:
+                arg_sum += arg
+            return np.mean(x) + arg_sum
+
+        if jit:
+            import numba
+
+            f = numba.jit(f)
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        args = (2,)
+
+        s = Series(range(10))
+        result = s.rolling(2, center=center, step=step).apply(
+            f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True
+        )
+        expected = s.rolling(2, center=center, step=step).apply(
+            f, engine="cython", args=args, raw=True
+        )
+        tm.assert_series_equal(result, expected)
+
+    def test_apply_numba_with_kwargs(self, roll_frame):
+        # GH 58995
+        # rolling apply
+        def func(sr, a=0):
+            return sr.sum() + a
+
+        data = DataFrame(range(10))
+
+        result = data.rolling(5).apply(func, engine="numba", raw=True, kwargs={"a": 1})
+        expected = data.rolling(5).sum() + 1
+        tm.assert_frame_equal(result, expected)
+
+        result = data.rolling(5).apply(func, engine="numba", raw=True, args=(1,))
+        tm.assert_frame_equal(result, expected)
+
+        # expanding apply
+
+        result = data.expanding().apply(func, engine="numba", raw=True, kwargs={"a": 1})
+        expected = data.expanding().sum() + 1
+        tm.assert_frame_equal(result, expected)
+
+        result = data.expanding().apply(func, engine="numba", raw=True, args=(1,))
+        tm.assert_frame_equal(result, expected)
+
+        # groupby rolling
+        result = (
+            roll_frame.groupby("A")
+            .rolling(5)
+            .apply(func, engine="numba", raw=True, kwargs={"a": 1})
+        )
+        expected = roll_frame.groupby("A").rolling(5).sum() + 1
+        tm.assert_frame_equal(result, expected)
+
+        result = (
+            roll_frame.groupby("A")
+            .rolling(5)
+            .apply(func, engine="numba", raw=True, args=(1,))
+        )
+        tm.assert_frame_equal(result, expected)
+        # groupby expanding
+
+        result = (
+            roll_frame.groupby("A")
+            .expanding()
+            .apply(func, engine="numba", raw=True, kwargs={"a": 1})
+        )
+        expected = roll_frame.groupby("A").expanding().sum() + 1
+        tm.assert_frame_equal(result, expected)
+
+        result = (
+            roll_frame.groupby("A")
+            .expanding()
+            .apply(func, engine="numba", raw=True, args=(1,))
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_numba_min_periods(self):
+        # GH 58868
+        def last_row(x):
+            assert len(x) == 3
+            return x[-1]
+
+        df = DataFrame([[1, 2], [3, 4], [5, 6], [7, 8]])
+
+        result = df.rolling(3, method="table", min_periods=3).apply(
+            last_row, raw=True, engine="numba"
+        )
+
+        expected = DataFrame([[np.nan, np.nan], [np.nan, np.nan], [5, 6], [7, 8]])
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data",
+        [
+            DataFrame(np.eye(5)),
+            DataFrame(
+                [
+                    [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
+                    [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
+                    [np.nan, np.nan, 5, 6, 7, 5, 5, 5, 5, 5],
+                ]
+            ).T,
+            Series(range(5), name="foo"),
+            Series([20, 10, 10, np.inf, 1, 1, 2, 3]),
+            Series([20, 10, 10, np.nan, 10, 1, 2, 3]),
+        ],
+    )
+    def test_numba_vs_cython_rolling_methods(
+        self,
+        data,
+        nogil,
+        parallel,
+        nopython,
+        arithmetic_numba_supported_operators,
+        step,
+    ):
+        method, kwargs = arithmetic_numba_supported_operators
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        roll = data.rolling(3, step=step)
+        result = getattr(roll, method)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
+        )
+        expected = getattr(roll, method)(engine="cython", **kwargs)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "data", [DataFrame(np.eye(5)), Series(range(5), name="foo")]
+    )
+    def test_numba_vs_cython_expanding_methods(
+        self, data, nogil, parallel, nopython, arithmetic_numba_supported_operators
+    ):
+        method, kwargs = arithmetic_numba_supported_operators
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        data = DataFrame(np.eye(5))
+        expand = data.expanding()
+        result = getattr(expand, method)(
+            engine="numba", engine_kwargs=engine_kwargs, **kwargs
+        )
+        expected = getattr(expand, method)(engine="cython", **kwargs)
+        tm.assert_equal(result, expected)
+
+    @pytest.mark.parametrize("jit", [True, False])
+    def test_cache_apply(self, jit, nogil, parallel, nopython, step):
+        # Test that the functions are cached correctly if we switch functions
+        def func_1(x):
+            return np.mean(x) + 4
+
+        def func_2(x):
+            return np.std(x) * 5
+
+        if jit:
+            import numba
+
+            func_1 = numba.jit(func_1)
+            func_2 = numba.jit(func_2)
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        roll = Series(range(10)).rolling(2, step=step)
+        result = roll.apply(
+            func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True
+        )
+        expected = roll.apply(func_1, engine="cython", raw=True)
+        tm.assert_series_equal(result, expected)
+
+        result = roll.apply(
+            func_2, engine="numba", engine_kwargs=engine_kwargs, raw=True
+        )
+        expected = roll.apply(func_2, engine="cython", raw=True)
+        tm.assert_series_equal(result, expected)
+        # This run should use the cached func_1
+        result = roll.apply(
+            func_1, engine="numba", engine_kwargs=engine_kwargs, raw=True
+        )
+        expected = roll.apply(func_1, engine="cython", raw=True)
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "window,window_kwargs",
+        [
+            ["rolling", {"window": 3, "min_periods": 0}],
+            ["expanding", {}],
+        ],
+    )
+    def test_dont_cache_args(
+        self, window, window_kwargs, nogil, parallel, nopython, method
+    ):
+        # GH 42287
+
+        def add(values, x):
+            return np.sum(values) + x
+
+        engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+        df = DataFrame({"value": [0, 0, 0]})
+        result = getattr(df, window)(method=method, **window_kwargs).apply(
+            add, raw=True, engine="numba", engine_kwargs=engine_kwargs, args=(1,)
+        )
+        expected = DataFrame({"value": [1.0, 1.0, 1.0]})
+        tm.assert_frame_equal(result, expected)
+
+        result = getattr(df, window)(method=method, **window_kwargs).apply(
+            add, raw=True, engine="numba", engine_kwargs=engine_kwargs, args=(2,)
+        )
+        expected = DataFrame({"value": [2.0, 2.0, 2.0]})
+        tm.assert_frame_equal(result, expected)
+
+    def test_dont_cache_engine_kwargs(self):
+        # If the user passes a different set of engine_kwargs don't return the same
+        # jitted function
+        nogil = False
+        parallel = True
+        nopython = True
+
+        def func(x):
+            return nogil + parallel + nopython
+
+        engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+        df = DataFrame({"value": [0, 0, 0]})
+        result = df.rolling(1).apply(
+            func, raw=True, engine="numba", engine_kwargs=engine_kwargs
+        )
+        expected = DataFrame({"value": [2.0, 2.0, 2.0]})
+        tm.assert_frame_equal(result, expected)
+
+        parallel = False
+        engine_kwargs = {"nopython": nopython, "nogil": nogil, "parallel": parallel}
+        result = df.rolling(1).apply(
+            func, raw=True, engine="numba", engine_kwargs=engine_kwargs
+        )
+        expected = DataFrame({"value": [1.0, 1.0, 1.0]})
+        tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("numba")
+class TestEWM:
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_invalid_engine(self, grouper, method):
+        df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
+        with pytest.raises(ValueError, match="engine must be either"):
+            getattr(grouper(df).ewm(com=1.0), method)(engine="foo")
+
+    @pytest.mark.parametrize(
+        "grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"]
+    )
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_invalid_engine_kwargs(self, grouper, method):
+        df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
+        with pytest.raises(ValueError, match="cython engine does not"):
+            getattr(grouper(df).ewm(com=1.0), method)(
+                engine="cython", engine_kwargs={"nopython": True}
+            )
+
+    @pytest.mark.parametrize("grouper", ["None", "groupby"])
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_cython_vs_numba(
+        self, grouper, method, nogil, parallel, nopython, ignore_na, adjust
+    ):
+        df = DataFrame({"B": range(4)})
+        if grouper == "None":
+            grouper = lambda x: x
+        else:
+            df["A"] = ["a", "b", "a", "b"]
+            grouper = lambda x: x.groupby("A")
+        if method == "sum":
+            adjust = True
+        ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs)
+        expected = getattr(ewm, method)(engine="cython")
+
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("grouper", ["None", "groupby"])
+    def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na):
+        # GH 40951
+
+        df = DataFrame({"B": [0, 0, 1, 1, 2, 2]})
+        if grouper == "None":
+            grouper = lambda x: x
+        else:
+            grouper = lambda x: x.groupby("A")
+            df["A"] = ["a", "b", "a", "b", "b", "a"]
+
+        halflife = "23 days"
+        times = to_datetime(
+            [
+                "2020-01-01",
+                "2020-01-01",
+                "2020-01-02",
+                "2020-01-10",
+                "2020-02-23",
+                "2020-01-03",
+            ]
+        )
+        ewm = grouper(df).ewm(
+            halflife=halflife, adjust=True, ignore_na=ignore_na, times=times
+        )
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
+        expected = ewm.mean(engine="cython")
+
+        tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("numba")
+def test_use_global_config():
+    def f(x):
+        return np.mean(x) + 2
+
+    s = Series(range(10))
+    with option_context("compute.use_numba", True):
+        result = s.rolling(2).apply(f, engine=None, raw=True)
+    expected = s.rolling(2).apply(f, engine="numba", raw=True)
+    tm.assert_series_equal(expected, result)
+
+
+@td.skip_if_no("numba")
+def test_invalid_kwargs_nopython():
+    with pytest.raises(TypeError, match="got an unexpected keyword argument 'a'"):
+        Series(range(1)).rolling(1).apply(
+            lambda x: x, kwargs={"a": 1}, engine="numba", raw=True
+        )
+    with pytest.raises(
+        NumbaUtilError, match="numba does not support keyword-only arguments"
+    ):
+        Series(range(1)).rolling(1).apply(
+            lambda x, *, a: x, kwargs={"a": 1}, engine="numba", raw=True
+        )
+
+    tm.assert_series_equal(
+        Series(range(1), dtype=float) + 1,
+        Series(range(1))
+        .rolling(1)
+        .apply(lambda x, a: (x + a).sum(), kwargs={"a": 1}, engine="numba", raw=True),
+    )
+
+
+@td.skip_if_no("numba")
+@pytest.mark.slow
+@pytest.mark.filterwarnings("ignore")
+# Filter warnings when parallel=True and the function can't be parallelized by Numba
+class TestTableMethod:
+    def test_table_series_valueerror(self):
+        def f(x):
+            return np.sum(x, axis=0) + 1
+
+        with pytest.raises(
+            ValueError, match="method='table' not applicable for Series objects."
+        ):
+            Series(range(1)).rolling(1, method="table").apply(
+                f, engine="numba", raw=True
+            )
+
+    def test_table_method_rolling_methods(
+        self,
+        nogil,
+        parallel,
+        nopython,
+        arithmetic_numba_supported_operators,
+        step,
+    ):
+        method, kwargs = arithmetic_numba_supported_operators
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        df = DataFrame(np.eye(3))
+        roll_table = df.rolling(2, method="table", min_periods=0, step=step)
+        if method in ("var", "std"):
+            with pytest.raises(NotImplementedError, match=f"{method} not supported"):
+                getattr(roll_table, method)(
+                    engine_kwargs=engine_kwargs, engine="numba", **kwargs
+                )
+        else:
+            roll_single = df.rolling(2, method="single", min_periods=0, step=step)
+            result = getattr(roll_table, method)(
+                engine_kwargs=engine_kwargs, engine="numba", **kwargs
+            )
+            expected = getattr(roll_single, method)(
+                engine_kwargs=engine_kwargs, engine="numba", **kwargs
+            )
+            tm.assert_frame_equal(result, expected)
+
+    def test_table_method_rolling_apply(self, nogil, parallel, nopython, step):
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        def f(x):
+            return np.sum(x, axis=0) + 1
+
+        df = DataFrame(np.eye(3))
+        result = df.rolling(2, method="table", min_periods=0, step=step).apply(
+            f, raw=True, engine_kwargs=engine_kwargs, engine="numba"
+        )
+        expected = df.rolling(2, method="single", min_periods=0, step=step).apply(
+            f, raw=True, engine_kwargs=engine_kwargs, engine="numba"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_table_method_rolling_apply_col_order(self):
+        # GH#59666
+        def f(x):
+            return np.nanmean(x[:, 0] - x[:, 1])
+
+        df = DataFrame(
+            {
+                "a": [1, 2, 3, 4, 5, 6],
+                "b": [6, 7, 8, 5, 6, 7],
+            }
+        )
+        result = df.rolling(3, method="table", min_periods=0)[["a", "b"]].apply(
+            f, raw=True, engine="numba"
+        )
+        expected = DataFrame(
+            {
+                "a": [-5, -5, -5, -3.66667, -2.33333, -1],
+                "b": [-5, -5, -5, -3.66667, -2.33333, -1],
+            }
+        )
+        tm.assert_almost_equal(result, expected)
+        result = df.rolling(3, method="table", min_periods=0)[["b", "a"]].apply(
+            f, raw=True, engine="numba"
+        )
+        expected = DataFrame(
+            {
+                "b": [5, 5, 5, 3.66667, 2.33333, 1],
+                "a": [5, 5, 5, 3.66667, 2.33333, 1],
+            }
+        )
+        tm.assert_almost_equal(result, expected)
+
+    def test_table_method_rolling_weighted_mean(self, step):
+        def weighted_mean(x):
+            arr = np.ones((1, x.shape[1]))
+            arr[:, :2] = (x[:, :2] * x[:, 2]).sum(axis=0) / x[:, 2].sum()
+            return arr
+
+        df = DataFrame([[1, 2, 0.6], [2, 3, 0.4], [3, 4, 0.2], [4, 5, 0.7]])
+        result = df.rolling(2, method="table", min_periods=0, step=step).apply(
+            weighted_mean, raw=True, engine="numba"
+        )
+        expected = DataFrame(
+            [
+                [1.0, 2.0, 1.0],
+                [1.8, 2.0, 1.0],
+                [3.333333, 2.333333, 1.0],
+                [1.555556, 7, 1.0],
+            ]
+        )[::step]
+        tm.assert_frame_equal(result, expected)
+
+    def test_table_method_expanding_apply(self, nogil, parallel, nopython):
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        def f(x):
+            return np.sum(x, axis=0) + 1
+
+        df = DataFrame(np.eye(3))
+        result = df.expanding(method="table").apply(
+            f, raw=True, engine_kwargs=engine_kwargs, engine="numba"
+        )
+        expected = df.expanding(method="single").apply(
+            f, raw=True, engine_kwargs=engine_kwargs, engine="numba"
+        )
+        tm.assert_frame_equal(result, expected)
+
+    def test_table_method_expanding_methods(
+        self, nogil, parallel, nopython, arithmetic_numba_supported_operators
+    ):
+        method, kwargs = arithmetic_numba_supported_operators
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        df = DataFrame(np.eye(3))
+        expand_table = df.expanding(method="table")
+        if method in ("var", "std"):
+            with pytest.raises(NotImplementedError, match=f"{method} not supported"):
+                getattr(expand_table, method)(
+                    engine_kwargs=engine_kwargs, engine="numba", **kwargs
+                )
+        else:
+            expand_single = df.expanding(method="single")
+            result = getattr(expand_table, method)(
+                engine_kwargs=engine_kwargs, engine="numba", **kwargs
+            )
+            expected = getattr(expand_single, method)(
+                engine_kwargs=engine_kwargs, engine="numba", **kwargs
+            )
+            tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize("data", [np.eye(3), np.ones((2, 3)), np.ones((3, 2))])
+    @pytest.mark.parametrize("method", ["mean", "sum"])
+    def test_table_method_ewm(self, data, method, nogil, parallel, nopython):
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        df = DataFrame(data)
+
+        result = getattr(df.ewm(com=1, method="table"), method)(
+            engine_kwargs=engine_kwargs, engine="numba"
+        )
+        expected = getattr(df.ewm(com=1, method="single"), method)(
+            engine_kwargs=engine_kwargs, engine="numba"
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("numba")
+def test_npfunc_no_warnings():
+    df = DataFrame({"col1": [1, 2, 3, 4, 5]})
+    with tm.assert_produces_warning(False):
+        df.col1.rolling(2).apply(np.prod, raw=True, engine="numba")
+
+
+class PrescribedWindowIndexer(BaseIndexer):
+    def __init__(self, start, end):
+        self._start = start
+        self._end = end
+        super().__init__()
+
+    def get_window_bounds(
+        self, num_values=None, min_periods=None, center=None, closed=None, step=None
+    ):
+        if num_values is None:
+            num_values = len(self._start)
+        start = np.clip(self._start, 0, num_values)
+        end = np.clip(self._end, 0, num_values)
+        return start, end
+
+
+@td.skip_if_no("numba")
+class TestMinMaxNumba:
+    @pytest.mark.parametrize(
+        "is_max, has_nan, exp_list",
+        [
+            (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
+            (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]),
+            (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]),
+            (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]),
+        ],
+    )
+    def test_minmax(self, is_max, has_nan, exp_list):
+        nan_idx = [0, 5, 8]
+        df = DataFrame(
+            {
+                "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0],
+                "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3],
+                "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10],
+            }
+        )
+        if has_nan:
+            df.loc[nan_idx, "data"] = np.nan
+        expected = Series(exp_list, name="data")
+        r = df.data.rolling(
+            PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy())
+        )
+        if is_max:
+            result = r.max(engine="numba")
+        else:
+            result = r.min(engine="numba")
+
+        tm.assert_series_equal(result, expected)
+
+    def test_wrong_order(self):
+        start = np.array(range(5), dtype=np.int64)
+        end = start + 1
+        end[3] = end[2]
+        start[3] = start[2] - 1
+
+        df = DataFrame({"data": start * 1.0, "start": start, "end": end})
+
+        r = df.data.rolling(PrescribedWindowIndexer(start, end))
+        with pytest.raises(
+            ValueError, match="Start/End ordering requirement is violated at index 3"
+        ):
+            r.max(engine="numba")
diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py
new file mode 100644
index 0000000000000000000000000000000000000000..43d55a7992b3ce52255a6813e8b9e93b82a45324
--- /dev/null
+++ b/pandas/tests/window/test_online.py
@@ -0,0 +1,112 @@
+import numpy as np
+import pytest
+
+from pandas.compat import is_platform_arm
+
+from pandas import (
+    DataFrame,
+    Series,
+)
+import pandas._testing as tm
+from pandas.util.version import Version
+
+pytestmark = [pytest.mark.single_cpu]
+
+numba = pytest.importorskip("numba")
+pytestmark.append(
+    pytest.mark.skipif(
+        Version(numba.__version__) == Version("0.61") and is_platform_arm(),
+        reason=f"Segfaults on ARM platforms with numba {numba.__version__}",
+    )
+)
+
+
+@pytest.mark.filterwarnings("ignore")
+# Filter warnings when parallel=True and the function can't be parallelized by Numba
+class TestEWM:
+    def test_invalid_update(self):
+        df = DataFrame({"a": range(5), "b": range(5)})
+        online_ewm = df.head(2).ewm(0.5).online()
+        with pytest.raises(
+            ValueError,
+            match="Must call mean with update=None first before passing update",
+        ):
+            online_ewm.mean(update=df.head(1))
+
+    @pytest.mark.slow
+    @pytest.mark.parametrize(
+        "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")]
+    )
+    def test_online_vs_non_online_mean(
+        self, obj, nogil, parallel, nopython, adjust, ignore_na
+    ):
+        expected = obj.ewm(0.5, adjust=adjust, ignore_na=ignore_na).mean()
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+
+        online_ewm = (
+            obj.head(2)
+            .ewm(0.5, adjust=adjust, ignore_na=ignore_na)
+            .online(engine_kwargs=engine_kwargs)
+        )
+        # Test resetting once
+        for _ in range(2):
+            result = online_ewm.mean()
+            tm.assert_equal(result, expected.head(2))
+
+            result = online_ewm.mean(update=obj.tail(3))
+            tm.assert_equal(result, expected.tail(3))
+
+            online_ewm.reset()
+
+    @pytest.mark.xfail(raises=NotImplementedError)
+    @pytest.mark.parametrize(
+        "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")]
+    )
+    def test_update_times_mean(
+        self, obj, nogil, parallel, nopython, adjust, ignore_na, halflife_with_times
+    ):
+        times = Series(
+            np.array(
+                ["2020-01-01", "2020-01-05", "2020-01-07", "2020-01-17", "2020-01-21"],
+                dtype="datetime64[ns]",
+            )
+        )
+        expected = obj.ewm(
+            0.5,
+            adjust=adjust,
+            ignore_na=ignore_na,
+            times=times,
+            halflife=halflife_with_times,
+        ).mean()
+
+        engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
+        online_ewm = (
+            obj.head(2)
+            .ewm(
+                0.5,
+                adjust=adjust,
+                ignore_na=ignore_na,
+                times=times.head(2),
+                halflife=halflife_with_times,
+            )
+            .online(engine_kwargs=engine_kwargs)
+        )
+        # Test resetting once
+        for _ in range(2):
+            result = online_ewm.mean()
+            tm.assert_equal(result, expected.head(2))
+
+            result = online_ewm.mean(update=obj.tail(3), update_times=times.tail(3))
+            tm.assert_equal(result, expected.tail(3))
+
+            online_ewm.reset()
+
+    @pytest.mark.parametrize("method", ["aggregate", "std", "corr", "cov", "var"])
+    def test_ewm_notimplementederror_raises(self, method):
+        ser = Series(range(10))
+        kwargs = {}
+        if method == "aggregate":
+            kwargs["func"] = lambda x: x
+
+        with pytest.raises(NotImplementedError, match=".* is not implemented."):
+            getattr(ser.ewm(1).online(), method)(**kwargs)
diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb22502fd648b3b90e3aec80b66d1f19bcd5c0f8
--- /dev/null
+++ b/pandas/tests/window/test_pairwise.py
@@ -0,0 +1,457 @@
+import numpy as np
+import pytest
+
+from pandas.compat import IS64
+
+from pandas import (
+    DataFrame,
+    Index,
+    MultiIndex,
+    Series,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.core.algorithms import safe_sort
+
+
+@pytest.fixture(
+    params=[
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]),
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]),
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]),
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]),
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]),
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]),
+        DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]),
+        DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]),
+        DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]),
+    ]
+)
+def pairwise_frames(request):
+    """Pairwise frames test_pairwise"""
+    return request.param
+
+
+@pytest.fixture
+def pairwise_target_frame():
+    """Pairwise target frame for test_pairwise"""
+    return DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1])
+
+
+@pytest.fixture
+def pairwise_other_frame():
+    """Pairwise other frame for test_pairwise"""
+    return DataFrame(
+        [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]],
+        columns=["Y", "Z", "X"],
+    )
+
+
+def test_rolling_cov(series):
+    A = series
+    B = A + np.random.default_rng(2).standard_normal(len(A))
+
+    result = A.rolling(window=50, min_periods=25).cov(B)
+    tm.assert_almost_equal(result.iloc[-1], np.cov(A[-50:], B[-50:])[0, 1])
+
+
+def test_rolling_corr(series):
+    A = series
+    B = A + np.random.default_rng(2).standard_normal(len(A))
+
+    result = A.rolling(window=50, min_periods=25).corr(B)
+    tm.assert_almost_equal(result.iloc[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])
+
+
+def test_rolling_corr_bias_correction():
+    # test for correct bias correction
+    a = Series(
+        np.arange(20, dtype=np.float64), index=date_range("2020-01-01", periods=20)
+    )
+    b = a.copy()
+    a[:5] = np.nan
+    b[:10] = np.nan
+
+    result = a.rolling(window=len(a), min_periods=1).corr(b)
+    tm.assert_almost_equal(result.iloc[-1], a.corr(b))
+
+
+@pytest.mark.parametrize("func", ["cov", "corr"])
+def test_rolling_pairwise_cov_corr(func, frame):
+    result = getattr(frame.rolling(window=10, min_periods=5), func)()
+    result = result.loc[(slice(None), 1), 5]
+    result.index = result.index.droplevel(1)
+    expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5])
+    tm.assert_series_equal(result, expected, check_names=False)
+
+
+@pytest.mark.parametrize("method", ["corr", "cov"])
+def test_flex_binary_frame(method, frame):
+    series = frame[1]
+
+    res = getattr(series.rolling(window=10), method)(frame)
+    res2 = getattr(frame.rolling(window=10), method)(series)
+    exp = frame.apply(lambda x: getattr(series.rolling(window=10), method)(x))
+
+    tm.assert_frame_equal(res, exp)
+    tm.assert_frame_equal(res2, exp)
+
+    frame2 = DataFrame(
+        np.random.default_rng(2).standard_normal(frame.shape),
+        index=frame.index,
+        columns=frame.columns,
+    )
+
+    res3 = getattr(frame.rolling(window=10), method)(frame2)
+    res3.columns = Index(list(res3.columns))
+    exp = DataFrame(
+        {k: getattr(frame[k].rolling(window=10), method)(frame2[k]) for k in frame}
+    )
+    tm.assert_frame_equal(res3, exp)
+
+
+@pytest.mark.parametrize("window", range(7))
+def test_rolling_corr_with_zero_variance(window):
+    # GH 18430
+    s = Series(np.zeros(20))
+    other = Series(np.arange(20))
+
+    assert s.rolling(window=window).corr(other=other).isna().all()
+
+
+def test_corr_sanity():
+    # GH 3155
+    df = DataFrame(
+        np.array(
+            [
+                [0.87024726, 0.18505595],
+                [0.64355431, 0.3091617],
+                [0.92372966, 0.50552513],
+                [0.00203756, 0.04520709],
+                [0.84780328, 0.33394331],
+                [0.78369152, 0.63919667],
+            ]
+        )
+    )
+
+    res = df[0].rolling(5, center=True).corr(df[1])
+    assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
+
+    df = DataFrame(np.random.default_rng(2).random((30, 2)))
+    res = df[0].rolling(5, center=True).corr(df[1])
+    assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
+
+
+def test_rolling_cov_diff_length():
+    # GH 7512
+    s1 = Series([1, 2, 3], index=range(3))
+    s2 = Series([1, 3], index=range(0, 4, 2))
+    result = s1.rolling(window=3, min_periods=2).cov(s2)
+    expected = Series([None, None, 2.0])
+    tm.assert_series_equal(result, expected)
+
+    s2a = Series([1, None, 3], index=range(3))
+    result = s1.rolling(window=3, min_periods=2).cov(s2a)
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_corr_diff_length():
+    # GH 7512
+    s1 = Series([1, 2, 3], index=range(3))
+    s2 = Series([1, 3], index=range(0, 4, 2))
+    result = s1.rolling(window=3, min_periods=2).corr(s2)
+    expected = Series([None, None, 1.0])
+    tm.assert_series_equal(result, expected)
+
+    s2a = Series([1, None, 3], index=range(3))
+    result = s1.rolling(window=3, min_periods=2).corr(s2a)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["cov", "corr"])
+def test_time_based_rolling_other_longer_raises(func):
+    # GH#62937
+    idx_short = date_range("2020-01-01", periods=3, freq="D")
+    idx_long = date_range("2020-01-01", periods=5, freq="D")
+    s = Series([1, 2, 3], index=idx_short)
+    other = Series([1, 2, 3, 4, 5], index=idx_long)
+    msg = "Variable rolling window requires .* Got 3 < 5"
+    with pytest.raises(ValueError, match=msg):
+        getattr(s.rolling("2D"), func)(other)
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)),
+        lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)),
+    ],
+)
+def test_rolling_functions_window_non_shrinkage_binary(f):
+    # corr/cov return a MI DataFrame
+    df = DataFrame(
+        [[1, 5], [3, 2], [3, 9], [-1, 0]],
+        columns=Index(["A", "B"], name="foo"),
+        index=Index(range(4), name="bar"),
+    )
+    df_expected = DataFrame(
+        columns=Index(["A", "B"], name="foo"),
+        index=MultiIndex.from_product([df.index, df.columns], names=["bar", "foo"]),
+        dtype="float64",
+    )
+    df_result = f(df)
+    tm.assert_frame_equal(df_result, df_expected)
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        lambda x: (x.rolling(window=10, min_periods=5).cov(x, pairwise=True)),
+        lambda x: (x.rolling(window=10, min_periods=5).corr(x, pairwise=True)),
+    ],
+)
+def test_moment_functions_zero_length_pairwise(f):
+    df1 = DataFrame()
+    df2 = DataFrame(columns=Index(["a"], name="foo"), index=Index([], name="bar"))
+    df2["a"] = df2["a"].astype("float64")
+
+    df1_expected = DataFrame(index=MultiIndex.from_product([df1.index, df1.columns]))
+    df2_expected = DataFrame(
+        index=MultiIndex.from_product([df2.index, df2.columns], names=["bar", "foo"]),
+        columns=Index(["a"], name="foo"),
+        dtype="float64",
+    )
+
+    df1_result = f(df1)
+    tm.assert_frame_equal(df1_result, df1_expected)
+
+    df2_result = f(df2)
+    tm.assert_frame_equal(df2_result, df2_expected)
+
+
+class TestPairwise:
+    # GH 7738
+    @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()])
+    def test_no_flex(self, pairwise_frames, pairwise_target_frame, f):
+        # DataFrame methods (which do not call flex_binary_moment())
+
+        result = f(pairwise_frames)
+        tm.assert_index_equal(result.index, pairwise_frames.columns)
+        tm.assert_index_equal(result.columns, pairwise_frames.columns)
+        expected = f(pairwise_target_frame)
+        # since we have sorted the results
+        # we can only compare non-nans
+        result = result.dropna().values
+        expected = expected.dropna().values
+
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            lambda x: x.expanding().cov(pairwise=True),
+            lambda x: x.expanding().corr(pairwise=True),
+            lambda x: x.rolling(window=3).cov(pairwise=True),
+            lambda x: x.rolling(window=3).corr(pairwise=True),
+            lambda x: x.ewm(com=3).cov(pairwise=True),
+            lambda x: x.ewm(com=3).corr(pairwise=True),
+        ],
+    )
+    def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f):
+        # DataFrame with itself, pairwise=True
+        # note that we may construct the 1st level of the MI
+        # in a non-monotonic way, so compare accordingly
+        result = f(pairwise_frames)
+        tm.assert_index_equal(
+            result.index.levels[0], pairwise_frames.index, check_names=False
+        )
+        tm.assert_index_equal(
+            safe_sort(result.index.levels[1]),
+            safe_sort(pairwise_frames.columns.unique()),
+        )
+        tm.assert_index_equal(result.columns, pairwise_frames.columns)
+        expected = f(pairwise_target_frame)
+        # since we have sorted the results
+        # we can only compare non-nans
+        result = result.dropna().values
+        expected = expected.dropna().values
+
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            lambda x: x.expanding().cov(pairwise=False),
+            lambda x: x.expanding().corr(pairwise=False),
+            lambda x: x.rolling(window=3).cov(pairwise=False),
+            lambda x: x.rolling(window=3).corr(pairwise=False),
+            lambda x: x.ewm(com=3).cov(pairwise=False),
+            lambda x: x.ewm(com=3).corr(pairwise=False),
+        ],
+    )
+    def test_no_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f):
+        # DataFrame with itself, pairwise=False
+        result = f(pairwise_frames)
+        tm.assert_index_equal(result.index, pairwise_frames.index)
+        tm.assert_index_equal(result.columns, pairwise_frames.columns)
+        expected = f(pairwise_target_frame)
+        # since we have sorted the results
+        # we can only compare non-nans
+        result = result.dropna().values
+        expected = expected.dropna().values
+
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            lambda x, y: x.expanding().cov(y, pairwise=True),
+            lambda x, y: x.expanding().corr(y, pairwise=True),
+            lambda x, y: x.rolling(window=3).cov(y, pairwise=True),
+            # TODO: We're missing a flag somewhere in meson
+            pytest.param(
+                lambda x, y: x.rolling(window=3).corr(y, pairwise=True),
+                marks=pytest.mark.xfail(
+                    not IS64, reason="Precision issues on 32 bit", strict=False
+                ),
+            ),
+            lambda x, y: x.ewm(com=3).cov(y, pairwise=True),
+            lambda x, y: x.ewm(com=3).corr(y, pairwise=True),
+        ],
+    )
+    def test_pairwise_with_other(
+        self, pairwise_frames, pairwise_target_frame, pairwise_other_frame, f
+    ):
+        # DataFrame with another DataFrame, pairwise=True
+        result = f(pairwise_frames, pairwise_other_frame)
+        tm.assert_index_equal(
+            result.index.levels[0], pairwise_frames.index, check_names=False
+        )
+        tm.assert_index_equal(
+            safe_sort(result.index.levels[1]),
+            safe_sort(pairwise_other_frame.columns.unique()),
+        )
+        expected = f(pairwise_target_frame, pairwise_other_frame)
+        # since we have sorted the results
+        # we can only compare non-nans
+        result = result.dropna().values
+        expected = expected.dropna().values
+
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+    @pytest.mark.filterwarnings("ignore:RuntimeWarning")
+    @pytest.mark.parametrize(
+        "f",
+        [
+            lambda x, y: x.expanding().cov(y, pairwise=False),
+            lambda x, y: x.expanding().corr(y, pairwise=False),
+            lambda x, y: x.rolling(window=3).cov(y, pairwise=False),
+            lambda x, y: x.rolling(window=3).corr(y, pairwise=False),
+            lambda x, y: x.ewm(com=3).cov(y, pairwise=False),
+            lambda x, y: x.ewm(com=3).corr(y, pairwise=False),
+        ],
+    )
+    def test_no_pairwise_with_other(self, pairwise_frames, pairwise_other_frame, f):
+        # DataFrame with another DataFrame, pairwise=False
+        result = (
+            f(pairwise_frames, pairwise_other_frame)
+            if pairwise_frames.columns.is_unique
+            else None
+        )
+        if result is not None:
+            # we can have int and str columns
+            expected_index = pairwise_frames.index.union(pairwise_other_frame.index)
+            expected_columns = pairwise_frames.columns.union(
+                pairwise_other_frame.columns
+            )
+            tm.assert_index_equal(result.index, expected_index)
+            tm.assert_index_equal(result.columns, expected_columns)
+        else:
+            with pytest.raises(ValueError, match="'arg1' columns are not unique"):
+                f(pairwise_frames, pairwise_other_frame)
+            with pytest.raises(ValueError, match="'arg2' columns are not unique"):
+                f(pairwise_other_frame, pairwise_frames)
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            lambda x, y: x.expanding().cov(y),
+            lambda x, y: x.expanding().corr(y),
+            lambda x, y: x.rolling(window=3).cov(y),
+            lambda x, y: x.rolling(window=3).corr(y),
+            lambda x, y: x.ewm(com=3).cov(y),
+            lambda x, y: x.ewm(com=3).corr(y),
+        ],
+    )
+    def test_pairwise_with_series(self, pairwise_frames, pairwise_target_frame, f):
+        # DataFrame with a Series
+        result = f(pairwise_frames, Series([1, 1, 3, 8]))
+        tm.assert_index_equal(result.index, pairwise_frames.index)
+        tm.assert_index_equal(result.columns, pairwise_frames.columns)
+        expected = f(pairwise_target_frame, Series([1, 1, 3, 8]))
+        # since we have sorted the results
+        # we can only compare non-nans
+        result = result.dropna().values
+        expected = expected.dropna().values
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+        result = f(Series([1, 1, 3, 8]), pairwise_frames)
+        tm.assert_index_equal(result.index, pairwise_frames.index)
+        tm.assert_index_equal(result.columns, pairwise_frames.columns)
+        expected = f(Series([1, 1, 3, 8]), pairwise_target_frame)
+        # since we have sorted the results
+        # we can only compare non-nans
+        result = result.dropna().values
+        expected = expected.dropna().values
+        tm.assert_numpy_array_equal(result, expected, check_dtype=False)
+
+    def test_corr_freq_memory_error(self):
+        # GH 31789
+        s = Series(range(5), index=date_range("2020", periods=5))
+        result = s.rolling("12h").corr(s)
+        expected = Series([np.nan] * 5, index=date_range("2020", periods=5))
+        tm.assert_series_equal(result, expected)
+
+    def test_cov_mulittindex(self):
+        # GH 34440
+
+        columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")])
+        index = range(3)
+        df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns)
+
+        result = df.ewm(alpha=0.1).cov()
+
+        index = MultiIndex.from_product([range(3), list("ab"), list("xy"), list("AB")])
+        columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")])
+        expected = DataFrame(
+            np.vstack(
+                (
+                    np.full((8, 8), np.nan),
+                    np.full((8, 8), 32.000000),
+                    np.full((8, 8), 63.881919),
+                )
+            ),
+            index=index,
+            columns=columns,
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_multindex_columns_pairwise_func(self):
+        # GH 21157
+        columns = MultiIndex.from_arrays([["M", "N"], ["P", "Q"]], names=["a", "b"])
+        df = DataFrame(np.ones((5, 2)), columns=columns)
+        result = df.rolling(3).corr()
+        expected = DataFrame(
+            np.nan,
+            index=MultiIndex.from_arrays(
+                [
+                    np.repeat(np.arange(5, dtype=np.int64), 2),
+                    ["M", "N"] * 5,
+                    ["P", "Q"] * 5,
+                ],
+                names=[None, "a", "b"],
+            ),
+            columns=columns,
+        )
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a232751a82de73aeb892148e03f2617df13f528
--- /dev/null
+++ b/pandas/tests/window/test_rolling.py
@@ -0,0 +1,2115 @@
+from datetime import (
+    datetime,
+    timedelta,
+)
+
+import numpy as np
+import pytest
+
+from pandas.compat import (
+    IS64,
+)
+from pandas.errors import Pandas4Warning
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    MultiIndex,
+    Series,
+    Timedelta,
+    Timestamp,
+    date_range,
+    period_range,
+)
+import pandas._testing as tm
+from pandas.api.indexers import BaseIndexer
+from pandas.core.indexers.objects import VariableOffsetWindowIndexer
+
+from pandas.tseries.offsets import BusinessDay
+
+
+def test_doc_string():
+    df = DataFrame({"B": [0, 1, 2, np.nan, 4]})
+    df
+    df.rolling(2).sum()
+    df.rolling(2, min_periods=1).sum()
+
+
+def test_constructor(frame_or_series):
+    # GH 12669
+
+    c = frame_or_series(range(5)).rolling
+
+    # valid
+    c(0)
+    c(window=2)
+    c(window=2, min_periods=1)
+    c(window=2, min_periods=1, center=True)
+    c(window=2, min_periods=1, center=False)
+
+    # GH 13383
+
+    msg = "window must be an integer 0 or greater"
+
+    with pytest.raises(ValueError, match=msg):
+        c(-1)
+
+
+@pytest.mark.parametrize("w", [2.0, "foo", np.array([2])])
+def test_invalid_constructor(frame_or_series, w):
+    # not valid
+
+    c = frame_or_series(range(5)).rolling
+
+    msg = "|".join(
+        [
+            "window must be an integer",
+            "passed window foo is not compatible with a datetimelike index",
+        ]
+    )
+    with pytest.raises(ValueError, match=msg):
+        c(window=w)
+
+    msg = "min_periods must be an integer"
+    with pytest.raises(ValueError, match=msg):
+        c(window=2, min_periods=w)
+
+    msg = "center must be a boolean"
+    with pytest.raises(ValueError, match=msg):
+        c(window=2, min_periods=1, center=w)
+
+
+@pytest.mark.parametrize(
+    "window",
+    [
+        timedelta(days=3),
+        Timedelta(days=3),
+        "3D",
+        VariableOffsetWindowIndexer(
+            index=date_range("2015-12-25", periods=5), offset=BusinessDay(1)
+        ),
+    ],
+)
+def test_freq_window_not_implemented(window):
+    # GH 15354
+    df = DataFrame(
+        np.arange(10),
+        index=date_range("2015-12-24", periods=10, freq="D"),
+    )
+    with pytest.raises(
+        NotImplementedError, match="^step (not implemented|is not supported)"
+    ):
+        df.rolling(window, step=3).sum()
+
+
+@pytest.mark.parametrize("agg", ["cov", "corr"])
+def test_step_not_implemented_for_cov_corr(agg):
+    # GH 15354
+    roll = DataFrame(range(2)).rolling(1, step=2)
+    with pytest.raises(NotImplementedError, match="step not implemented"):
+        getattr(roll, agg)()
+
+
+@pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3)])
+def test_constructor_with_timedelta_window(window):
+    # GH 15440
+    n = 10
+    df = DataFrame(
+        {"value": np.arange(n)},
+        index=date_range("2015-12-24", periods=n, freq="D"),
+    )
+    expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3))
+
+    result = df.rolling(window=window).sum()
+    expected = DataFrame(
+        {"value": expected_data},
+        index=date_range("2015-12-24", periods=n, freq="D"),
+    )
+    tm.assert_frame_equal(result, expected)
+    expected = df.rolling("3D").sum()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [timedelta(days=3), Timedelta(days=3), "3D"])
+def test_constructor_timedelta_window_and_minperiods(window, raw):
+    # GH 15305
+    n = 10
+    df = DataFrame(
+        {"value": np.arange(n)},
+        index=date_range("2017-08-08", periods=n, freq="D"),
+    )
+    expected = DataFrame(
+        {"value": np.append([np.nan, 1.0], np.arange(3.0, 27.0, 3))},
+        index=date_range("2017-08-08", periods=n, freq="D"),
+    )
+    result_roll_sum = df.rolling(window=window, min_periods=2).sum()
+    result_roll_generic = df.rolling(window=window, min_periods=2).apply(sum, raw=raw)
+    tm.assert_frame_equal(result_roll_sum, expected)
+    tm.assert_frame_equal(result_roll_generic, expected)
+
+
+def test_closed_fixed(closed, arithmetic_win_operators):
+    # GH 34315
+    func_name = arithmetic_win_operators
+    df_fixed = DataFrame({"A": [0, 1, 2, 3, 4]})
+    df_time = DataFrame({"A": [0, 1, 2, 3, 4]}, index=date_range("2020", periods=5))
+
+    result = getattr(
+        df_fixed.rolling(2, closed=closed, min_periods=1),
+        func_name,
+    )()
+    expected = getattr(
+        df_time.rolling("2D", closed=closed, min_periods=1),
+        func_name,
+    )().reset_index(drop=True)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "closed, window_selections",
+    [
+        (
+            "both",
+            [
+                [True, True, False, False, False],
+                [True, True, True, False, False],
+                [False, True, True, True, False],
+                [False, False, True, True, True],
+                [False, False, False, True, True],
+            ],
+        ),
+        (
+            "left",
+            [
+                [True, False, False, False, False],
+                [True, True, False, False, False],
+                [False, True, True, False, False],
+                [False, False, True, True, False],
+                [False, False, False, True, True],
+            ],
+        ),
+        (
+            "right",
+            [
+                [True, True, False, False, False],
+                [False, True, True, False, False],
+                [False, False, True, True, False],
+                [False, False, False, True, True],
+                [False, False, False, False, True],
+            ],
+        ),
+        (
+            "neither",
+            [
+                [True, False, False, False, False],
+                [False, True, False, False, False],
+                [False, False, True, False, False],
+                [False, False, False, True, False],
+                [False, False, False, False, True],
+            ],
+        ),
+    ],
+)
+def test_datetimelike_centered_selections(
+    closed, window_selections, arithmetic_win_operators
+):
+    # GH 34315
+    func_name = arithmetic_win_operators
+    df_time = DataFrame(
+        {"A": [0.0, 1.0, 2.0, 3.0, 4.0]}, index=date_range("2020", periods=5)
+    )
+
+    expected = DataFrame(
+        {"A": [getattr(df_time["A"].iloc[s], func_name)() for s in window_selections]},
+        index=date_range("2020", periods=5),
+    )
+
+    result = getattr(
+        df_time.rolling("2D", closed=closed, min_periods=1, center=True),
+        func_name,
+    )()
+
+    tm.assert_frame_equal(result, expected, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    "window,closed,expected",
+    [
+        ("3s", "right", [3.0, 3.0, 3.0]),
+        ("3s", "both", [3.0, 3.0, 3.0]),
+        ("3s", "left", [3.0, 3.0, 3.0]),
+        ("3s", "neither", [3.0, 3.0, 3.0]),
+        ("2s", "right", [3.0, 2.0, 2.0]),
+        ("2s", "both", [3.0, 3.0, 3.0]),
+        ("2s", "left", [1.0, 3.0, 3.0]),
+        ("2s", "neither", [1.0, 2.0, 2.0]),
+    ],
+)
+def test_datetimelike_centered_offset_covers_all(
+    window, closed, expected, frame_or_series
+):
+    # GH 42753
+
+    index = [
+        Timestamp("20130101 09:00:01"),
+        Timestamp("20130101 09:00:02"),
+        Timestamp("20130101 09:00:02"),
+    ]
+    df = frame_or_series([1, 1, 1], index=index)
+
+    result = df.rolling(window, closed=closed, center=True).sum()
+    expected = frame_or_series(expected, index=index)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "window,closed,expected",
+    [
+        ("2D", "right", [4, 4, 4, 4, 4, 4, 2, 2]),
+        ("2D", "left", [2, 2, 4, 4, 4, 4, 4, 4]),
+        ("2D", "both", [4, 4, 6, 6, 6, 6, 4, 4]),
+        ("2D", "neither", [2, 2, 2, 2, 2, 2, 2, 2]),
+    ],
+)
+def test_datetimelike_nonunique_index_centering(
+    window, closed, expected, frame_or_series
+):
+    index = DatetimeIndex(
+        [
+            "2020-01-01",
+            "2020-01-01",
+            "2020-01-02",
+            "2020-01-02",
+            "2020-01-03",
+            "2020-01-03",
+            "2020-01-04",
+            "2020-01-04",
+        ]
+    )
+
+    df = frame_or_series([1] * 8, index=index, dtype=float)
+    expected = frame_or_series(expected, index=index, dtype=float)
+
+    result = df.rolling(window, center=True, closed=closed).sum()
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "closed,expected",
+    [
+        ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]),
+        ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]),
+        ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]),
+        ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]),
+    ],
+)
+def test_variable_window_nonunique(closed, expected, frame_or_series):
+    # GH 20712
+    index = DatetimeIndex(
+        [
+            "2011-01-01",
+            "2011-01-01",
+            "2011-01-02",
+            "2011-01-02",
+            "2011-01-02",
+            "2011-01-03",
+            "2011-01-04",
+            "2011-01-04",
+            "2011-01-05",
+            "2011-01-06",
+        ]
+    )
+
+    df = frame_or_series(range(10), index=index, dtype=float)
+    expected = frame_or_series(expected, index=index, dtype=float)
+
+    result = df.rolling("2D", closed=closed).sum()
+
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "closed,expected",
+    [
+        ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]),
+        ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]),
+        ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]),
+        ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]),
+    ],
+)
+def test_variable_offset_window_nonunique(closed, expected, frame_or_series):
+    # GH 20712
+    index = DatetimeIndex(
+        [
+            "2011-01-01",
+            "2011-01-01",
+            "2011-01-02",
+            "2011-01-02",
+            "2011-01-02",
+            "2011-01-03",
+            "2011-01-04",
+            "2011-01-04",
+            "2011-01-05",
+            "2011-01-06",
+        ]
+    )
+
+    df = frame_or_series(range(10), index=index, dtype=float)
+    expected = frame_or_series(expected, index=index, dtype=float)
+
+    offset = BusinessDay(2)
+    indexer = VariableOffsetWindowIndexer(index=index, offset=offset)
+    result = df.rolling(indexer, closed=closed, min_periods=1).sum()
+
+    tm.assert_equal(result, expected)
+
+
+def test_even_number_window_alignment():
+    # see discussion in GH 38780
+    s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))
+
+    # behavior of index- and datetime-based windows differs here!
+    # s.rolling(window=2, min_periods=1, center=True).mean()
+
+    result = s.rolling(window="2D", min_periods=1, center=True).mean()
+
+    expected = Series([0.5, 1.5, 2], index=s.index)
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_closed_fixed_binary_col(center, step):
+    # GH 34315
+    data = [0, 1, 1, 0, 0, 1, 0, 1]
+    df = DataFrame(
+        {"binary_col": data},
+        index=date_range(start="2020-01-01", freq="min", periods=len(data)),
+    )
+
+    if center:
+        expected_data = [2 / 3, 0.5, 0.4, 0.5, 0.428571, 0.5, 0.571429, 0.5]
+    else:
+        expected_data = [np.nan, 0, 0.5, 2 / 3, 0.5, 0.4, 0.5, 0.428571]
+
+    expected = DataFrame(
+        expected_data,
+        columns=["binary_col"],
+        index=date_range(start="2020-01-01", freq="min", periods=len(expected_data)),
+    )[::step]
+
+    rolling = df.rolling(
+        window=len(df), closed="left", min_periods=1, center=center, step=step
+    )
+    result = rolling.mean()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("closed", ["neither", "left"])
+def test_closed_empty(closed, arithmetic_win_operators):
+    # GH 26005
+    func_name = arithmetic_win_operators
+    ser = Series(data=np.arange(5), index=date_range("2000", periods=5, freq="2D"))
+    roll = ser.rolling("1D", closed=closed)
+
+    result = getattr(roll, func_name)()
+    expected = Series([np.nan] * 5, index=ser.index)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_closed_one_entry(func):
+    # GH24718
+    ser = Series(data=[2], index=date_range("2000", periods=1))
+    result = getattr(ser.rolling("10D", closed="left"), func)()
+    tm.assert_series_equal(result, Series([np.nan], index=ser.index))
+
+
+@pytest.mark.parametrize("func", ["min", "max"])
+def test_closed_one_entry_groupby(func):
+    # GH24718
+    ser = DataFrame(
+        data={"A": [1, 1, 2], "B": [3, 2, 1]},
+        index=date_range("2000", periods=3),
+    )
+    result = getattr(
+        ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func
+    )()
+    exp_idx = MultiIndex.from_arrays(arrays=[[1, 1, 2], ser.index], names=("A", None))
+    expected = Series(data=[np.nan, 3, np.nan], index=exp_idx, name="B")
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("input_dtype", ["int", "float"])
+@pytest.mark.parametrize(
+    "func,closed,expected",
+    [
+        ("min", "right", [0.0, 0, 0, 1, 2, 3, 4, 5, 6, 7]),
+        ("min", "both", [0.0, 0, 0, 0, 1, 2, 3, 4, 5, 6]),
+        ("min", "neither", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6, 7]),
+        ("min", "left", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, 6]),
+        ("max", "right", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+        ("max", "both", [0.0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
+        ("max", "neither", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]),
+        ("max", "left", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8]),
+    ],
+)
+def test_closed_min_max_datetime(input_dtype, func, closed, expected):
+    # see gh-21704
+    ser = Series(
+        data=np.arange(10).astype(input_dtype),
+        index=date_range("2000", periods=10),
+    )
+
+    result = getattr(ser.rolling("3D", closed=closed), func)()
+    expected = Series(expected, index=ser.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_closed_uneven():
+    # see gh-21704
+    ser = Series(data=np.arange(10), index=date_range("2000", periods=10))
+
+    # uneven
+    ser = ser.drop(index=ser.index[[1, 5]])
+    result = ser.rolling("3D", closed="left").min()
+    expected = Series([np.nan, 0, 0, 2, 3, 4, 6, 6], index=ser.index)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "func,closed,expected",
+    [
+        ("min", "right", [np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan, np.nan]),
+        ("min", "both", [np.nan, 0, 0, 0, 1, 2, 3, 4, 5, np.nan]),
+        ("min", "neither", [np.nan, np.nan, 0, 1, 2, 3, 4, 5, np.nan, np.nan]),
+        ("min", "left", [np.nan, np.nan, 0, 0, 1, 2, 3, 4, 5, np.nan]),
+        ("max", "right", [np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan, np.nan]),
+        ("max", "both", [np.nan, 1, 2, 3, 4, 5, 6, 6, 6, np.nan]),
+        ("max", "neither", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, np.nan, np.nan]),
+        ("max", "left", [np.nan, np.nan, 1, 2, 3, 4, 5, 6, 6, np.nan]),
+    ],
+)
+def test_closed_min_max_minp(func, closed, expected):
+    # see gh-21704
+    ser = Series(data=np.arange(10), index=date_range("2000", periods=10))
+    # Explicit cast to float to avoid implicit cast when setting nan
+    ser = ser.astype("float")
+    ser[ser.index[-3:]] = np.nan
+    result = getattr(ser.rolling("3D", min_periods=2, closed=closed), func)()
+    expected = Series(expected, index=ser.index)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "closed,expected",
+    [
+        ("right", [0, 0.5, 1, 2, 3, 4, 5, 6, 7, 8]),
+        ("both", [0, 0.5, 1, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]),
+        ("neither", [np.nan, 0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]),
+        ("left", [np.nan, 0, 0.5, 1, 2, 3, 4, 5, 6, 7]),
+    ],
+)
+def test_closed_median_quantile(closed, expected):
+    # GH 26005
+    ser = Series(data=np.arange(10), index=date_range("2000", periods=10))
+    roll = ser.rolling("3D", closed=closed)
+    expected = Series(expected, index=ser.index)
+
+    result = roll.median()
+    tm.assert_series_equal(result, expected)
+
+    result = roll.quantile(0.5)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("roller", ["1s", 1])
+def tests_empty_df_rolling(roller):
+    # GH 15819 Verifies that datetime and integer rolling windows can be
+    # applied to empty DataFrames
+    expected = DataFrame()
+    result = DataFrame().rolling(roller).sum()
+    tm.assert_frame_equal(result, expected)
+
+    # Verifies that datetime and integer rolling windows can be applied to
+    # empty DataFrames with datetime index
+    expected = DataFrame(index=DatetimeIndex([]))
+    result = DataFrame(index=DatetimeIndex([])).rolling(roller).sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_empty_window_median_quantile():
+    # GH 26005
+    expected = Series([np.nan, np.nan, np.nan])
+    roll = Series(np.arange(3)).rolling(0)
+
+    result = roll.median()
+    tm.assert_series_equal(result, expected)
+
+    result = roll.quantile(0.1)
+    tm.assert_series_equal(result, expected)
+
+
+def test_missing_minp_zero():
+    # https://github.com/pandas-dev/pandas/pull/18921
+    # minp=0
+    x = Series([np.nan])
+    result = x.rolling(1, min_periods=0).sum()
+    expected = Series([0.0])
+    tm.assert_series_equal(result, expected)
+
+    # minp=1
+    result = x.rolling(1, min_periods=1).sum()
+    expected = Series([np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+def test_missing_minp_zero_variable():
+    # https://github.com/pandas-dev/pandas/pull/18921
+    x = Series(
+        [np.nan] * 4,
+        index=DatetimeIndex(["2017-01-01", "2017-01-04", "2017-01-06", "2017-01-07"]),
+    )
+    result = x.rolling(Timedelta("2D"), min_periods=0).sum()
+    expected = Series(0.0, index=x.index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_multi_index_names():
+    # GH 16789, 16825
+    cols = MultiIndex.from_product([["A", "B"], ["C", "D", "E"]], names=["1", "2"])
+    df = DataFrame(np.ones((10, 6)), columns=cols)
+    result = df.rolling(3).cov()
+
+    tm.assert_index_equal(result.columns, df.columns)
+    assert result.index.names == [None, "1", "2"]
+
+
+def test_rolling_axis_sum():
+    # see gh-23372.
+    df = DataFrame(np.ones((10, 20)))
+    expected = DataFrame({i: [np.nan] * 2 + [3.0] * 8 for i in range(20)})
+    result = df.rolling(3).sum()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_axis_count():
+    # see gh-26055
+    df = DataFrame({"x": range(3), "y": range(3)})
+
+    expected = DataFrame({"x": [1.0, 2.0, 2.0], "y": [1.0, 2.0, 2.0]})
+    result = df.rolling(2, min_periods=0).count()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_readonly_array():
+    # GH-27766
+    arr = np.array([1, 3, np.nan, 3, 5])
+    arr.setflags(write=False)
+    result = Series(arr).rolling(2).mean()
+    expected = Series([np.nan, 2, np.nan, np.nan, 4])
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_datetime(tz_naive_fixture):
+    # GH-28192
+    tz = tz_naive_fixture
+    df = DataFrame(
+        {i: [1] * 2 for i in date_range("2019-8-01", "2019-08-03", freq="D", tz=tz)}
+    )
+
+    result = df.T.rolling("2D").sum().T
+    expected = DataFrame(
+        {
+            **{
+                i: [1.0] * 2
+                for i in date_range("2019-8-01", periods=1, freq="D", tz=tz)
+            },
+            **{
+                i: [2.0] * 2
+                for i in date_range("2019-8-02", "2019-8-03", freq="D", tz=tz)
+            },
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_window_as_string(center):
+    # see gh-22590
+    date_today = datetime.now()
+    days = date_range(date_today, date_today + timedelta(365), freq="D")
+
+    data = np.ones(len(days))
+    df = DataFrame({"DateCol": days, "metric": data})
+
+    df.set_index("DateCol", inplace=True)
+    result = df.rolling(window="21D", min_periods=2, closed="left", center=center)[
+        "metric"
+    ].agg("max")
+
+    index = days.rename("DateCol")
+    index = index._with_freq(None)
+    expected_data = np.ones(len(days), dtype=np.float64)
+    if not center:
+        expected_data[:2] = np.nan
+    expected = Series(expected_data, index=index, name="metric")
+    tm.assert_series_equal(result, expected)
+
+
+def test_min_periods1():
+    # GH#6795
+    df = DataFrame([0, 1, 2, 1, 0], columns=["a"])
+    result = df["a"].rolling(3, center=True, min_periods=1).max()
+    expected = Series([1.0, 2.0, 2.0, 2.0, 1.0], name="a")
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_count_with_min_periods(frame_or_series):
+    # GH 26996
+    result = frame_or_series(range(5)).rolling(3, min_periods=3).count()
+    expected = frame_or_series([np.nan, np.nan, 3.0, 3.0, 3.0])
+    tm.assert_equal(result, expected)
+
+
+def test_rolling_count_default_min_periods_with_null_values(frame_or_series):
+    # GH 26996
+    values = [1, 2, 3, np.nan, 4, 5, 6]
+    expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0]
+
+    # GH 31302
+    result = frame_or_series(values).rolling(3, min_periods=0).count()
+    expected = frame_or_series(expected_counts)
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "df,expected,window,min_periods",
+    [
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
+            ],
+            3,
+            None,
+        ),
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [2, 3], "B": [5, 6]}, [1, 2]),
+            ],
+            2,
+            1,
+        ),
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [2, 3], "B": [5, 6]}, [1, 2]),
+            ],
+            2,
+            2,
+        ),
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [2], "B": [5]}, [1]),
+                ({"A": [3], "B": [6]}, [2]),
+            ],
+            1,
+            1,
+        ),
+        (
+            {"A": [1, 2, 3], "B": [4, 5, 6]},
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [2], "B": [5]}, [1]),
+                ({"A": [3], "B": [6]}, [2]),
+            ],
+            1,
+            0,
+        ),
+        ({"A": [1], "B": [4]}, [], 2, None),
+        ({"A": [1], "B": [4]}, [], 2, 1),
+        (None, [({}, [])], 2, None),
+        (
+            {"A": [1, np.nan, 3], "B": [np.nan, 5, 6]},
+            [
+                ({"A": [1.0], "B": [np.nan]}, [0]),
+                ({"A": [1, np.nan], "B": [np.nan, 5]}, [0, 1]),
+                ({"A": [1, np.nan, 3], "B": [np.nan, 5, 6]}, [0, 1, 2]),
+            ],
+            3,
+            2,
+        ),
+    ],
+)
+def test_iter_rolling_dataframe(df, expected, window, min_periods):
+    # GH 11704
+    df = DataFrame(df)
+    expecteds = [DataFrame(values, index=index) for (values, index) in expected]
+
+    for expected, actual in zip(
+        expecteds, df.rolling(window, min_periods=min_periods), strict=False
+    ):
+        tm.assert_frame_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "expected,window",
+    [
+        (
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [2, 3], "B": [5, 6]}, [1, 2]),
+            ],
+            "2D",
+        ),
+        (
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [1, 2], "B": [4, 5]}, [0, 1]),
+                ({"A": [1, 2, 3], "B": [4, 5, 6]}, [0, 1, 2]),
+            ],
+            "3D",
+        ),
+        (
+            [
+                ({"A": [1], "B": [4]}, [0]),
+                ({"A": [2], "B": [5]}, [1]),
+                ({"A": [3], "B": [6]}, [2]),
+            ],
+            "1D",
+        ),
+    ],
+)
+def test_iter_rolling_on_dataframe(expected, window):
+    # GH 11704, 40373
+    df = DataFrame(
+        {
+            "A": [1, 2, 3, 4, 5],
+            "B": [4, 5, 6, 7, 8],
+            "C": date_range(start="2016-01-01", periods=5, freq="D"),
+        }
+    )
+
+    expecteds = [
+        DataFrame(values, index=df.loc[index, "C"]) for (values, index) in expected
+    ]
+    for expected, actual in zip(expecteds, df.rolling(window, on="C"), strict=False):
+        tm.assert_frame_equal(actual, expected)
+
+
+def test_iter_rolling_on_dataframe_unordered():
+    # GH 43386
+    df = DataFrame({"a": ["x", "y", "x"], "b": [0, 1, 2]})
+    results = list(df.groupby("a").rolling(2))
+    expecteds = [df.iloc[idx, [1]] for idx in [[0], [0, 2], [1]]]
+    for result, expected in zip(results, expecteds, strict=True):
+        tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "ser,expected,window, min_periods",
+    [
+        (
+            Series([1, 2, 3]),
+            [([1], [0]), ([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])],
+            3,
+            None,
+        ),
+        (
+            Series([1, 2, 3]),
+            [([1], [0]), ([1, 2], [0, 1]), ([1, 2, 3], [0, 1, 2])],
+            3,
+            1,
+        ),
+        (
+            Series([1, 2, 3]),
+            [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])],
+            2,
+            1,
+        ),
+        (
+            Series([1, 2, 3]),
+            [([1], [0]), ([1, 2], [0, 1]), ([2, 3], [1, 2])],
+            2,
+            2,
+        ),
+        (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 0),
+        (Series([1, 2, 3]), [([1], [0]), ([2], [1]), ([3], [2])], 1, 1),
+        (Series([1, 2]), [([1], [0]), ([1, 2], [0, 1])], 2, 0),
+        (Series([], dtype="int64"), [], 2, 1),
+    ],
+)
+def test_iter_rolling_series(ser, expected, window, min_periods):
+    # GH 11704
+    expecteds = [Series(values, index=index) for (values, index) in expected]
+
+    for expected, actual in zip(
+        expecteds, ser.rolling(window, min_periods=min_periods), strict=True
+    ):
+        tm.assert_series_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "expected,expected_index,window",
+    [
+        (
+            [[0], [1], [2], [3], [4]],
+            [
+                date_range("2020-01-01", periods=1, freq="D"),
+                date_range("2020-01-02", periods=1, freq="D"),
+                date_range("2020-01-03", periods=1, freq="D"),
+                date_range("2020-01-04", periods=1, freq="D"),
+                date_range("2020-01-05", periods=1, freq="D"),
+            ],
+            "1D",
+        ),
+        (
+            [[0], [0, 1], [1, 2], [2, 3], [3, 4]],
+            [
+                date_range("2020-01-01", periods=1, freq="D"),
+                date_range("2020-01-01", periods=2, freq="D"),
+                date_range("2020-01-02", periods=2, freq="D"),
+                date_range("2020-01-03", periods=2, freq="D"),
+                date_range("2020-01-04", periods=2, freq="D"),
+            ],
+            "2D",
+        ),
+        (
+            [[0], [0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4]],
+            [
+                date_range("2020-01-01", periods=1, freq="D"),
+                date_range("2020-01-01", periods=2, freq="D"),
+                date_range("2020-01-01", periods=3, freq="D"),
+                date_range("2020-01-02", periods=3, freq="D"),
+                date_range("2020-01-03", periods=3, freq="D"),
+            ],
+            "3D",
+        ),
+    ],
+)
+def test_iter_rolling_datetime(expected, expected_index, window):
+    # GH 11704
+    ser = Series(range(5), index=date_range(start="2020-01-01", periods=5, freq="D"))
+
+    expecteds = [
+        Series(values, index=idx)
+        for (values, idx) in zip(expected, expected_index, strict=True)
+    ]
+
+    for expected, actual in zip(expecteds, ser.rolling(window), strict=True):
+        tm.assert_series_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "grouping,_index",
+    [
+        (
+            {"level": 0},
+            MultiIndex.from_tuples(
+                [(0, 0), (0, 0), (1, 1), (1, 1), (1, 1)], names=[None, None]
+            ),
+        ),
+        (
+            {"by": "X"},
+            MultiIndex.from_tuples(
+                [(0, 0), (1, 0), (2, 1), (3, 1), (4, 1)], names=["X", None]
+            ),
+        ),
+    ],
+)
+def test_rolling_positional_argument(grouping, _index, raw):
+    # GH 34605
+
+    def scaled_sum(*args):
+        if len(args) < 2:
+            raise ValueError("The function needs two arguments")
+        array, scale = args
+        return array.sum() / scale
+
+    df = DataFrame(data={"X": range(5)}, index=[0, 0, 1, 1, 1])
+
+    expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index)
+    # GH 40341
+    if "by" in grouping:
+        expected = expected.drop(columns="X", errors="ignore")
+    result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,))
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("add", [0.0, 2.0])
+def test_rolling_numerical_accuracy_kahan_mean(add, unit):
+    # GH: 36031 implementing kahan summation
+    dti = DatetimeIndex(
+        [
+            Timestamp("19700101 09:00:00"),
+            Timestamp("19700101 09:00:03"),
+            Timestamp("19700101 09:00:06"),
+        ]
+    ).as_unit(unit)
+    df = DataFrame(
+        {"A": [3002399751580331.0 + add, -0.0, -0.0]},
+        index=dti,
+    )
+    result = (
+        df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean()
+    )
+    dates = date_range("19700101 09:00:00", periods=7, freq="s", unit=unit)
+    expected = DataFrame(
+        {
+            "A": [
+                np.nan,
+                np.nan,
+                np.nan,
+                3002399751580330.5,
+                2001599834386887.25,
+                1000799917193443.625,
+                0.0,
+            ]
+        },
+        index=dates,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_numerical_accuracy_kahan_sum():
+    # GH: 13254
+    df = DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"])
+    result = df["x"].rolling(3).sum()
+    expected = Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x")
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_numerical_accuracy_jump():
+    # GH: 32761
+    index = date_range(start="2020-01-01", end="2020-01-02", freq="60s").append(
+        DatetimeIndex(["2020-01-03"])
+    )
+    data = np.random.default_rng(2).random(len(index))
+
+    df = DataFrame({"data": data}, index=index)
+    result = df.rolling("60s").mean()
+    tm.assert_frame_equal(result, df[["data"]])
+
+
+def test_rolling_numerical_accuracy_small_values():
+    # GH: 10319
+    s = Series(
+        data=[0.00012456, 0.0003, -0.0, -0.0],
+        index=date_range("1999-02-03", "1999-02-06"),
+    )
+    result = s.rolling(1).mean()
+    tm.assert_series_equal(result, s)
+
+
+def test_rolling_numerical_too_large_numbers():
+    # GH: 11645
+    dates = date_range("2015-01-01", periods=10, freq="D")
+    ds = Series(data=range(10), index=dates, dtype=np.float64)
+    ds.iloc[2] = -9e33
+    result = ds.rolling(5).mean()
+    expected = Series(
+        [
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            -1.8e33,
+            -1.8e33,
+            -1.8e33,
+            5.0,
+            6.0,
+            7.0,
+        ],
+        index=dates,
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("index", "window"),
+    [
+        (
+            period_range(start="2020-01-01 08:00", end="2020-01-01 08:08", freq="min"),
+            "2min",
+        ),
+        (
+            period_range(
+                start="2020-01-01 08:00", end="2020-01-01 12:00", freq="30min"
+            ),
+            "1h",
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    ("func", "values"),
+    [
+        ("min", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6]),
+        ("max", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7]),
+        ("sum", [np.nan, 0, 1, 3, 5, 7, 9, 11, 13]),
+    ],
+)
+def test_rolling_period_index(index, window, func, values):
+    # GH: 34225
+    ds = Series([0, 1, 2, 3, 4, 5, 6, 7, 8], index=index)
+    result = getattr(ds.rolling(window, closed="left"), func)()
+    expected = Series(values, index=index)
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_sem(frame_or_series):
+    # GH: 26476
+    obj = frame_or_series([0, 1, 2])
+    result = obj.rolling(2, min_periods=1).sem()
+    if isinstance(result, DataFrame):
+        result = Series(result[0].values)
+    expected = Series([np.nan] + [0.5] * 2)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("func", "values", "window", "ddof", "expected_values"),
+    [
+        ("var", [99999999999999999, 1, 1, 2, 3, 1, 1], 2, 1, [5e33, 0, 0.5, 0.5, 2, 0]),
+        (
+            "std",
+            [99999999999999999, 1, 1, 2, 3, 1, 1],
+            2,
+            1,
+            [7.071068e16, 0, 0.7071068, 0.7071068, 1.414214, 0],
+        ),
+        ("var", [99999999999999999, 1, 2, 2, 3, 1, 1], 2, 1, [5e33, 0.5, 0, 0.5, 2, 0]),
+        (
+            "std",
+            [99999999999999999, 1, 2, 2, 3, 1, 1],
+            2,
+            1,
+            [7.071068e16, 0.7071068, 0, 0.7071068, 1.414214, 0],
+        ),
+        (
+            "std",
+            [1.2e03, 1.3e17, 1.5e17, 1.995e03, 1.990e03],
+            2,
+            1,
+            [9.192388e16, 1.414214e16, 1.060660e17, 3.535534e00],
+        ),
+        (
+            "var",
+            [
+                0.00000000e00,
+                0.00000000e00,
+                3.16188252e-18,
+                2.95781651e-16,
+                2.23153542e-51,
+                0.00000000e00,
+                0.00000000e00,
+                5.39943432e-48,
+                1.38206260e-73,
+                0.00000000e00,
+            ],
+            3,
+            1,
+            [
+                3.33250036e-036,
+                2.88538519e-032,
+                2.88538519e-032,
+                2.91622617e-032,
+                1.65991678e-102,
+                9.71796366e-096,
+                9.71796366e-096,
+                9.71796366e-096,
+            ],
+        ),
+        (
+            "std",
+            [1, -1, 0, 1, 3, 2, -2, 10000000000, 1, 2, 0, -2, 1, 3, 0, 1],
+            6,
+            1,
+            [
+                1.41421356e00,
+                1.87082869e00,
+                4.08248290e09,
+                4.08248290e09,
+                4.08248290e09,
+                4.08248290e09,
+                4.08248290e09,
+                4.08248290e09,
+                1.72240142e00,
+                1.75119007e00,
+                1.64316767e00,
+            ],
+        ),
+    ],
+)
+def test_rolling_var_correctness(func, values, window, ddof, expected_values):
+    # GH: 37051, 42064, 54518, 52407, 47721
+    ts = Series(values)
+    result = getattr(ts.rolling(window=window), func)(ddof=ddof)
+    if result.last_valid_index():
+        result = result[
+            result.first_valid_index() : result.last_valid_index() + 1
+        ].reset_index(drop=True)
+    expected = Series(expected_values)
+    tm.assert_series_equal(result, expected, atol=1e-55)
+    # GH 42064
+    tm.assert_series_equal(result == 0, expected == 0)
+
+
+def test_timeoffset_as_window_parameter_for_corr(unit):
+    # GH: 28266
+    dti = DatetimeIndex(
+        [
+            Timestamp("20130101 09:00:00"),
+            Timestamp("20130102 09:00:02"),
+            Timestamp("20130103 09:00:03"),
+            Timestamp("20130105 09:00:05"),
+            Timestamp("20130106 09:00:06"),
+        ]
+    ).as_unit(unit)
+    mi = MultiIndex.from_product([dti, ["B", "A"]])
+
+    exp = DataFrame(
+        {
+            "B": [
+                np.nan,
+                np.nan,
+                0.9999999999999998,
+                -1.0,
+                1.0,
+                -0.3273268353539892,
+                0.9999999999999998,
+                1.0,
+                0.9999999999999998,
+                1.0,
+            ],
+            "A": [
+                np.nan,
+                np.nan,
+                -1.0,
+                1.0000000000000002,
+                -0.3273268353539892,
+                0.9999999999999966,
+                1.0,
+                1.0000000000000002,
+                1.0,
+                1.0000000000000002,
+            ],
+        },
+        index=mi,
+    )
+
+    df = DataFrame(
+        {"B": [0, 1, 2, 4, 3], "A": [7, 4, 6, 9, 3]},
+        index=dti,
+    )
+
+    res = df.rolling(window="3D").corr()
+
+    tm.assert_frame_equal(exp, res)
+
+
+@pytest.mark.parametrize("method", ["var", "sum", "mean", "skew", "kurt", "min", "max"])
+def test_rolling_decreasing_indices(method):
+    """
+    Make sure that decreasing indices give the same results as increasing indices.
+
+    GH 36933
+    """
+    df = DataFrame({"values": np.arange(-15, 10) ** 2})
+    df_reverse = DataFrame({"values": df["values"][::-1]}, index=df.index[::-1])
+
+    increasing = getattr(df.rolling(window=5), method)()
+    decreasing = getattr(df_reverse.rolling(window=5), method)()
+
+    tm.assert_almost_equal(
+        decreasing.values[::-1][:-4], increasing.values[4:], atol=1e-12
+    )
+
+
+@pytest.mark.parametrize(
+    "window,closed,expected",
+    [
+        ("2s", "right", [1.0, 3.0, 5.0, 3.0]),
+        ("2s", "left", [0.0, 1.0, 3.0, 5.0]),
+        ("2s", "both", [1.0, 3.0, 6.0, 5.0]),
+        ("2s", "neither", [0.0, 1.0, 2.0, 3.0]),
+        ("3s", "right", [1.0, 3.0, 6.0, 5.0]),
+        ("3s", "left", [1.0, 3.0, 6.0, 5.0]),
+        ("3s", "both", [1.0, 3.0, 6.0, 5.0]),
+        ("3s", "neither", [1.0, 3.0, 6.0, 5.0]),
+    ],
+)
+def test_rolling_decreasing_indices_centered(window, closed, expected, frame_or_series):
+    """
+    Ensure that a symmetrical inverted index return same result as non-inverted.
+    """
+    #  GH 43927
+
+    index = date_range("2020", periods=4, freq="1s")
+    df_inc = frame_or_series(range(4), index=index)
+    df_dec = frame_or_series(range(4), index=index[::-1])
+
+    expected_inc = frame_or_series(expected, index=index)
+    expected_dec = frame_or_series(expected, index=index[::-1])
+
+    result_inc = df_inc.rolling(window, closed=closed, center=True).sum()
+    result_dec = df_dec.rolling(window, closed=closed, center=True).sum()
+
+    tm.assert_equal(result_inc, expected_inc)
+    tm.assert_equal(result_dec, expected_dec)
+
+
+@pytest.mark.parametrize(
+    "window,expected",
+    [
+        ("1ns", [1.0, 1.0, 1.0, 1.0]),
+        ("3ns", [2.0, 3.0, 3.0, 2.0]),
+    ],
+)
+def test_rolling_center_nanosecond_resolution(
+    window, closed, expected, frame_or_series
+):
+    index = date_range("2020", periods=4, freq="1ns")
+    df = frame_or_series([1, 1, 1, 1], index=index, dtype=float)
+    expected = frame_or_series(expected, index=index, dtype=float)
+    result = df.rolling(window, closed=closed, center=True).sum()
+    tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method,expected",
+    [
+        (
+            "var",
+            [
+                float("nan"),
+                43.0,
+                float("nan"),
+                136.333333,
+                43.5,
+                94.966667,
+                182.0,
+                318.0,
+            ],
+        ),
+        (
+            "mean",
+            [float("nan"), 7.5, float("nan"), 21.5, 6.0, 9.166667, 13.0, 17.5],
+        ),
+        (
+            "sum",
+            [float("nan"), 30.0, float("nan"), 86.0, 30.0, 55.0, 91.0, 140.0],
+        ),
+        (
+            "skew",
+            [
+                float("nan"),
+                0.709296,
+                float("nan"),
+                0.407073,
+                0.984656,
+                0.919184,
+                0.874674,
+                0.842418,
+            ],
+        ),
+        (
+            "kurt",
+            [
+                float("nan"),
+                -0.5916711736073559,
+                float("nan"),
+                -1.0028993131317954,
+                -0.06103844629409494,
+                -0.254143227116194,
+                -0.37362637362637585,
+                -0.45439658241367054,
+            ],
+        ),
+    ],
+)
+def test_rolling_non_monotonic(method, expected):
+    """
+    Make sure the (rare) branch of non-monotonic indices is covered by a test.
+
+    output from 1.1.3 is assumed to be the expected output. Output of sum/mean has
+    manually been verified.
+
+    GH 36933.
+    """
+    # Based on an example found in computation.rst
+    use_expanding = [True, False, True, False, True, True, True, True]
+    df = DataFrame({"values": np.arange(len(use_expanding)) ** 2})
+
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            start = np.empty(num_values, dtype=np.int64)
+            end = np.empty(num_values, dtype=np.int64)
+            for i in range(num_values):
+                if self.use_expanding[i]:
+                    start[i] = 0
+                    end[i] = i + 1
+                else:
+                    start[i] = i
+                    end[i] = i + self.window_size
+            return start, end
+
+    indexer = CustomIndexer(window_size=4, use_expanding=use_expanding)
+
+    result = getattr(df.rolling(indexer), method)()
+    expected = DataFrame({"values": expected})
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("index", "window"),
+    [
+        ([0, 1, 2, 3, 4], 2),
+        (date_range("2001-01-01", freq="D", periods=5), "2D"),
+    ],
+)
+def test_rolling_corr_timedelta_index(index, window):
+    # GH: 31286
+    x = Series([1, 2, 3, 4, 5], index=index)
+    y = x.copy()
+    x.iloc[0:2] = 0.0
+    result = x.rolling(window).corr(y)
+    expected = Series([np.nan, np.nan, 1, 1, 1], index=index)
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [float("nan"), float("nan"), 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [float("nan")] * 10,
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [float("nan"), float("nan"), 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [float("nan")] * 10,
+        ),
+    ],
+)
+def test_rolling_first_last(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.rolling(3), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.rolling(3), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values,method,expected",
+    [
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "first",
+            [1.0, 1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "first",
+            [1.0, 1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0],
+        ),
+        (
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+            "last",
+            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        ),
+        (
+            [1.0, np.nan, 3.0, np.nan, 5.0, np.nan, 7.0, np.nan, 9.0, np.nan],
+            "last",
+            [1.0, 1.0, 3.0, 3.0, 5.0, 5.0, 7.0, 7.0, 9.0, 9.0],
+        ),
+    ],
+)
+def test_rolling_first_last_no_minp(values, method, expected):
+    # GH#33155
+    x = Series(values)
+    result = getattr(x.rolling(3, min_periods=0), method)()
+    expected = Series(expected)
+    tm.assert_almost_equal(result, expected)
+
+    x = DataFrame({"A": values})
+    result = getattr(x.rolling(3, min_periods=0), method)()
+    expected = DataFrame({"A": expected})
+    tm.assert_almost_equal(result, expected)
+
+
+def test_groupby_rolling_nan_included():
+    # GH 35542
+    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
+    df = DataFrame(data)
+    result = df.groupby("group", dropna=False).rolling(1, min_periods=1).mean()
+    expected = DataFrame(
+        {"B": [0.0, 2.0, 3.0, 1.0, 4.0]},
+        # GH-38057 from_tuples puts the NaNs in the codes, result expects them
+        # to be in the levels, at the moment
+        # index=MultiIndex.from_tuples(
+        #     [("g1", 0), ("g1", 2), ("g2", 3), (np.nan, 1), (np.nan, 4)],
+        #     names=["group", None],
+        # ),
+        index=MultiIndex(
+            [["g1", "g2", np.nan], [0, 1, 2, 3, 4]],
+            [[0, 0, 1, 2, 2], [0, 2, 3, 1, 4]],
+            names=["group", None],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["skew", "kurt"])
+def test_rolling_skew_kurt_numerical_stability(method):
+    # GH#6929
+    ser = Series(np.random.default_rng(2).random(10))
+    ser_copy = ser.copy()
+    expected = getattr(ser.rolling(3), method)()
+    tm.assert_series_equal(ser, ser_copy)
+    ser = ser + 50000
+    result = getattr(ser.rolling(3), method)()
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    ("method", "data", "values"),
+    [
+        (
+            "skew",
+            [3000000, 1, 1, 2, 3, 4, 999],
+            [np.nan] * 3 + [2.0, 0.854563, 0.0, 1.999984],
+        ),
+        (
+            "skew",
+            [1e6, -1e6, 1, 2, 3, 4, 5, 6],
+            [np.nan] * 3 + [-5.51135192e-06, -2.0, 0.0, 0.0, 0.0],
+        ),
+        (
+            "kurt",
+            [3000000, 1, 1, 2, 3, 4, 999],
+            [np.nan] * 3 + [4.0, -1.289256, -1.2, 3.999946],
+        ),
+        (
+            "kurt",
+            [1e6, -1e6, 1, 2, 3, 4, 5, 6],
+            [np.nan] * 3 + [1.5, 4.0, -1.2, -1.2, -1.2],
+        ),
+    ],
+)
+def test_rolling_skew_kurt_large_value_range(method, data, values):
+    # GH: 37557, 47461, 61416
+    s = Series(data)
+    result = getattr(s.rolling(4), method)()
+    expected = Series(values)
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["skew", "kurt"])
+def test_same_result_with_different_lengths(method):
+    # GH-54380
+    len_smaller = 10
+    len_bigger = 12
+    window_size = 8
+
+    rng = np.random.default_rng(2)
+    data = rng.normal(loc=0.0, scale=1e3, size=len_bigger)
+    window_smaller = Series(data[:len_smaller]).rolling(window_size)
+    window_bigger = Series(data).rolling(window_size)
+
+    result_smaller = getattr(window_smaller, method)()
+    result_bigger = getattr(window_bigger, method)()
+
+    result_bigger_trimmed = result_bigger[:len_smaller]
+
+    tm.assert_series_equal(result_smaller, result_bigger_trimmed, check_exact=True)
+
+
+def test_invalid_method():
+    with pytest.raises(ValueError, match="method must be 'table' or 'single"):
+        Series(range(1)).rolling(1, method="foo")
+
+
+def test_rolling_descending_date_order_with_offset(frame_or_series):
+    # GH#40002
+    msg = "'d' is deprecated and will be removed in a future version."
+
+    with tm.assert_produces_warning(Pandas4Warning, match=msg):
+        idx = date_range(start="2020-01-01", end="2020-01-03", freq="1d")
+        obj = frame_or_series(range(1, 4), index=idx)
+        result = obj.rolling("1d", closed="left").sum()
+
+    expected = frame_or_series([np.nan, 1, 2], index=idx)
+    tm.assert_equal(result, expected)
+
+    result = obj.iloc[::-1].rolling("1D", closed="left").sum()
+    idx = date_range(start="2020-01-03", end="2020-01-01", freq="-1D")
+    expected = frame_or_series([np.nan, 3, 2], index=idx)
+    tm.assert_equal(result, expected)
+
+
+def test_rolling_var_floating_artifact_precision():
+    # GH 37051
+    s = Series([7, 5, 5, 5])
+    result = s.rolling(3).var()
+    expected = Series([np.nan, np.nan, 4 / 3, 0])
+    tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)
+    # GH 42064
+    # new `roll_var` will output 0.0 correctly
+    tm.assert_series_equal(result == 0, expected == 0)
+
+
+def test_rolling_std_small_values():
+    # GH 37051
+    s = Series(
+        [
+            0.00000054,
+            0.00000053,
+            0.00000054,
+        ]
+    )
+    result = s.rolling(2).std()
+    expected = Series([np.nan, 7.071068e-9, 7.071068e-9])
+    tm.assert_series_equal(result, expected, atol=1.0e-15, rtol=1.0e-15)
+
+
+@pytest.mark.parametrize(
+    "start, exp_values",
+    [
+        (1, [0.03, 0.0155, 0.0155, 0.011, 0.01025]),
+        (2, [0.001, 0.001, 0.0015, 0.00366666]),
+    ],
+)
+def test_rolling_mean_all_nan_window_floating_artifacts(start, exp_values):
+    # GH#41053
+    df = DataFrame(
+        [
+            0.03,
+            0.03,
+            0.001,
+            np.nan,
+            0.002,
+            0.008,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            np.nan,
+            0.005,
+            0.2,
+        ]
+    )
+
+    values = [
+        *exp_values,
+        0.00366666,
+        0.005,
+        0.005,
+        0.008,
+        np.nan,
+        np.nan,
+        0.005,
+        0.102500,
+    ]
+    expected = DataFrame(
+        values,
+        index=list(range(start, len(values) + start)),
+    )
+    result = df.iloc[start:].rolling(5, min_periods=0).mean()
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_sum_all_nan_window_floating_artifacts():
+    # GH#41053
+    df = DataFrame([0.002, 0.008, 0.005, np.nan, np.nan, np.nan])
+    result = df.rolling(3, min_periods=0).sum()
+    expected = DataFrame([0.002, 0.010, 0.015, 0.013, 0.005, 0.0])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_rolling_zero_window():
+    # GH 22719
+    s = Series(range(1))
+    result = s.rolling(0).min()
+    expected = Series([np.nan])
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
+@pytest.mark.parametrize("method", ["min", "max", "average"])
+@pytest.mark.parametrize("pct", [True, False])
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans"])
+def test_rank(window, method, pct, ascending, test_data):
+    length = 20
+    if test_data == "default":
+        ser = Series(data=np.random.default_rng(2).random(length))
+    elif test_data == "duplicates":
+        ser = Series(data=np.random.default_rng(2).choice(3, length))
+    elif test_data == "nans":
+        ser = Series(
+            data=np.random.default_rng(2).choice(
+                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
+            )
+        )
+
+    expected = ser.rolling(window).apply(
+        lambda x: x.rank(method=method, pct=pct, ascending=ascending).iloc[-1]
+    )
+    result = ser.rolling(window).rank(method=method, pct=pct, ascending=ascending)
+
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("window", [1, 3, 10, 20])
+@pytest.mark.parametrize("test_data", ["default", "duplicates", "nans", "precision"])
+def test_nunique(window, test_data):
+    length = 20
+    if test_data == "default":
+        ser = Series(data=np.random.default_rng(2).random(length))
+    elif test_data == "duplicates":
+        ser = Series(data=np.random.default_rng(2).choice(3, length))
+    elif test_data == "nans":
+        ser = Series(
+            data=np.random.default_rng(2).choice(
+                [1.0, 0.25, 0.75, np.nan, np.inf, -np.inf], length
+            )
+        )
+    elif test_data == "precision":
+        ser = Series(
+            data=[
+                0.3,
+                0.1 * 3,  # Not necessarily exactly 0.3
+                0.6,
+                0.2 * 3,  # Not necessarily exactly 0.6
+                0.9,
+                0.3 * 3,  # Not necessarily exactly 0.9
+                0.5,
+                0.1 * 5,  # Not necessarily exactly 0.5
+                0.8,
+                0.2 * 4,  # Not necessarily exactly 0.8
+            ],
+            dtype=np.float64,
+        )
+
+    expected = ser.rolling(window).apply(lambda x: x.nunique())
+    result = ser.rolling(window).nunique()
+
+    tm.assert_series_equal(result, expected)
+
+
+def test_rolling_quantile_np_percentile():
+    # #9413: Tests that rolling window's quantile default behavior
+    # is analogous to Numpy's percentile
+    row = 10
+    col = 5
+    idx = date_range("20100101", periods=row, freq="B")
+    df = DataFrame(
+        np.random.default_rng(2).random(row * col).reshape((row, -1)), index=idx
+    )
+
+    df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0)
+    np_percentile = np.percentile(df, [25, 50, 75], axis=0)
+
+    tm.assert_almost_equal(df_quantile.values, np.array(np_percentile))
+
+
+@pytest.mark.parametrize("quantile", [0.0, 0.1, 0.45, 0.5, 1])
+@pytest.mark.parametrize(
+    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
+)
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
+        [8.0, 1.0, 3.0, 4.0, 5.0, 2.0, 6.0, 7.0],
+        [0.0, np.nan, 0.2, np.nan, 0.4],
+        [np.nan, np.nan, np.nan, np.nan],
+        [np.nan, 0.1, np.nan, 0.3, 0.4, 0.5],
+        [0.5],
+        [np.nan, 0.7, 0.6],
+    ],
+)
+def test_rolling_quantile_interpolation_options(quantile, interpolation, data):
+    # Tests that rolling window's quantile behavior is analogous to
+    # Series' quantile for each interpolation option
+    s = Series(data)
+
+    q1 = s.quantile(quantile, interpolation)
+    q2 = s.expanding(min_periods=1).quantile(quantile, interpolation).iloc[-1]
+
+    if np.isnan(q1):
+        assert np.isnan(q2)
+    elif not IS64:
+        # Less precision on 32-bit
+        assert np.allclose([q1], [q2], rtol=1e-07, atol=0)
+    else:
+        assert q1 == q2
+
+
+def test_invalid_quantile_value():
+    data = np.arange(5)
+    s = Series(data)
+
+    msg = "Interpolation 'invalid' is not supported"
+    with pytest.raises(ValueError, match=msg):
+        s.rolling(len(data), min_periods=1).quantile(0.5, interpolation="invalid")
+
+
+def test_rolling_quantile_param():
+    ser = Series([0.0, 0.1, 0.5, 0.9, 1.0])
+    msg = "quantile value -0.1 not in \\[0, 1\\]"
+    with pytest.raises(ValueError, match=msg):
+        ser.rolling(3).quantile(-0.1)
+
+    msg = "quantile value 10.0 not in \\[0, 1\\]"
+    with pytest.raises(ValueError, match=msg):
+        ser.rolling(3).quantile(10.0)
+
+    msg = "must be real number, not str"
+    with pytest.raises(TypeError, match=msg):
+        ser.rolling(3).quantile("foo")
+
+
+def test_rolling_std_1obs():
+    vals = Series([1.0, 2.0, 3.0, 4.0, 5.0])
+
+    result = vals.rolling(1, min_periods=1).std()
+    expected = Series([np.nan] * 5)
+    tm.assert_series_equal(result, expected)
+
+    result = vals.rolling(1, min_periods=1).std(ddof=0)
+    expected = Series([0.0] * 5)
+    tm.assert_series_equal(result, expected)
+
+    result = Series([np.nan, np.nan, 3, 4, 5]).rolling(3, min_periods=2).std()
+    assert np.isnan(result[2])
+
+
+def test_rolling_std_neg_sqrt():
+    # unit test from Bottleneck
+
+    # Test move_nanstd for neg sqrt.
+
+    a = Series(
+        [
+            0.0011448196318903589,
+            0.00028718669878572767,
+            0.00028718669878572767,
+            0.00028718669878572767,
+            0.00028718669878572767,
+        ]
+    )
+    b = a.rolling(window=3).std()
+    assert np.isfinite(b[2:]).all()
+
+    b = a.ewm(span=3).std()
+    assert np.isfinite(b[2:]).all()
+
+
+def test_step_not_integer_raises():
+    with pytest.raises(ValueError, match="step must be an integer"):
+        DataFrame(range(2)).rolling(1, step="foo")
+
+
+def test_step_not_positive_raises():
+    with pytest.raises(ValueError, match="step must be >= 0"):
+        DataFrame(range(2)).rolling(1, step=-1)
+
+
+@pytest.mark.parametrize(
+    ["values", "window", "min_periods", "expected"],
+    [
+        [
+            [20, 10, 10, np.inf, 1, 1, 2, 3],
+            3,
+            1,
+            [np.nan, 50, 100 / 3, 0, 40.5, 0, 1 / 3, 1],
+        ],
+        [
+            [20, 10, 10, np.nan, 10, 1, 2, 3],
+            3,
+            1,
+            [np.nan, 50, 100 / 3, 0, 0, 40.5, 73 / 3, 1],
+        ],
+        [
+            [np.nan, 5, 6, 7, 5, 5, 5],
+            3,
+            3,
+            [np.nan] * 3 + [1, 1, 4 / 3, 0],
+        ],
+        [
+            [5, 7, 7, 7, np.nan, np.inf, 4, 3, 3, 3],
+            3,
+            3,
+            [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [1 / 3, 0],
+        ],
+        [
+            [5, 7, 7, 7, np.nan, np.inf, 7, 3, 3, 3],
+            3,
+            3,
+            [np.nan] * 2 + [4 / 3, 0] + [np.nan] * 4 + [16 / 3, 0],
+        ],
+        [
+            [5, 7] * 4,
+            3,
+            3,
+            [np.nan] * 2 + [4 / 3] * 6,
+        ],
+        [
+            [5, 7, 5, np.nan, 7, 5, 7],
+            3,
+            2,
+            [np.nan, 2, 4 / 3] + [2] * 3 + [4 / 3],
+        ],
+    ],
+)
+def test_rolling_var_same_value_count_logic(values, window, min_periods, expected):
+    # GH 42064.
+
+    expected = Series(expected)
+    sr = Series(values)
+
+    # With new algo implemented, result will be set to .0 in rolling var
+    # if sufficient amount of consecutively same values are found.
+    result_var = sr.rolling(window, min_periods=min_periods).var()
+
+    # use `assert_series_equal` twice to check for equality,
+    # because `check_exact=True` will fail in 32-bit tests due to
+    # precision loss.
+
+    # 1. result should be close to correct value
+    # non-zero values can still differ slightly from "truth"
+    # as the result of online algorithm
+    tm.assert_series_equal(result_var, expected)
+    # 2. zeros should be exactly the same since the new algo takes effect here
+    tm.assert_series_equal(expected == 0, result_var == 0)
+
+    # std should also pass as it's just a sqrt of var
+    result_std = sr.rolling(window, min_periods=min_periods).std()
+    tm.assert_series_equal(result_std, np.sqrt(expected))
+    tm.assert_series_equal(expected == 0, result_std == 0)
+
+
+def test_rolling_mean_sum_floating_artifacts():
+    # GH 42064.
+
+    sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
+    r = sr.rolling(3)
+    result = r.mean()
+    assert (result[-3:] == 0).all()
+    result = r.sum()
+    assert (result[-3:] == 0).all()
+
+
+def test_rolling_skew_kurt_floating_artifacts():
+    # GH 42064 46431
+
+    sr = Series([1 / 3, 4, 0, 0, 0, 0, 0])
+    r = sr.rolling(4)
+    result = r.skew()
+    expected = Series([np.nan, np.nan, np.nan, 1.9619045191072484, 2.0, 0.0, 0.0])
+    tm.assert_series_equal(result, expected)
+    result = r.kurt()
+    expected = Series([np.nan, np.nan, np.nan, 3.8636048803878786, 4.0, -3.0, -3.0])
+    tm.assert_series_equal(result, expected)
+
+
+def test_numeric_only_frame(arithmetic_win_operators, numeric_only):
+    # GH#46560
+    kernel = arithmetic_win_operators
+    df = DataFrame({"a": [1], "b": 2, "c": 3})
+    df["c"] = df["c"].astype(object)
+    rolling = df.rolling(2, min_periods=1)
+    op = getattr(rolling, kernel)
+    result = op(numeric_only=numeric_only)
+
+    columns = ["a", "b"] if numeric_only else ["a", "b", "c"]
+    expected = df[columns].agg([kernel]).reset_index(drop=True).astype(float)
+    assert list(expected.columns) == columns
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("kernel", ["corr", "cov"])
+@pytest.mark.parametrize("use_arg", [True, False])
+def test_numeric_only_corr_cov_frame(kernel, numeric_only, use_arg):
+    # GH#46560
+    df = DataFrame({"a": [1, 2, 3], "b": 2, "c": 3})
+    df["c"] = df["c"].astype(object)
+    arg = (df,) if use_arg else ()
+    rolling = df.rolling(2, min_periods=1)
+    op = getattr(rolling, kernel)
+    result = op(*arg, numeric_only=numeric_only)
+
+    # Compare result to op using float dtypes, dropping c when numeric_only is True
+    columns = ["a", "b"] if numeric_only else ["a", "b", "c"]
+    df2 = df[columns].astype(float)
+    arg2 = (df2,) if use_arg else ()
+    rolling2 = df2.rolling(2, min_periods=1)
+    op2 = getattr(rolling2, kernel)
+    expected = op2(*arg2, numeric_only=numeric_only)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dtype", [int, object])
+def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype):
+    # GH#46560
+    kernel = arithmetic_win_operators
+    ser = Series([1], dtype=dtype)
+    rolling = ser.rolling(2, min_periods=1)
+    op = getattr(rolling, kernel)
+    if numeric_only and dtype is object:
+        msg = f"Rolling.{kernel} does not implement numeric_only"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(numeric_only=numeric_only)
+    else:
+        result = op(numeric_only=numeric_only)
+        expected = ser.agg([kernel]).reset_index(drop=True).astype(float)
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("kernel", ["corr", "cov"])
+@pytest.mark.parametrize("use_arg", [True, False])
+@pytest.mark.parametrize("dtype", [int, object])
+def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype):
+    # GH#46560
+    ser = Series([1, 2, 3], dtype=dtype)
+    arg = (ser,) if use_arg else ()
+    rolling = ser.rolling(2, min_periods=1)
+    op = getattr(rolling, kernel)
+    if numeric_only and dtype is object:
+        msg = f"Rolling.{kernel} does not implement numeric_only"
+        with pytest.raises(NotImplementedError, match=msg):
+            op(*arg, numeric_only=numeric_only)
+    else:
+        result = op(*arg, numeric_only=numeric_only)
+
+        ser2 = ser.astype(float)
+        arg2 = (ser2,) if use_arg else ()
+        rolling2 = ser2.rolling(2, min_periods=1)
+        op2 = getattr(rolling2, kernel)
+        expected = op2(*arg2, numeric_only=numeric_only)
+        tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("tz", [None, "UTC", "Europe/Prague"])
+def test_rolling_timedelta_window_non_nanoseconds(unit, tz):
+    # Test Sum, GH#55106
+    df_time = DataFrame(
+        {"A": range(5)},
+        index=date_range("2013-01-01", freq="1s", periods=5, tz=tz, unit="ns"),
+    )
+    sum_in_nanosecs = df_time.rolling("1s").sum()
+    # microseconds / milliseconds should not break the correct rolling
+    df_time.index = df_time.index.as_unit(unit)
+    sum_in_microsecs = df_time.rolling("1s").sum()
+    sum_in_microsecs.index = sum_in_microsecs.index.as_unit("ns")
+    tm.assert_frame_equal(sum_in_nanosecs, sum_in_microsecs)
+
+    # Test max, GH#55026
+    ref_dates = date_range("2023-01-01", "2023-01-10", unit="ns", tz=tz)
+    ref_series = Series(0, index=ref_dates)
+    ref_series.iloc[0] = 1
+    ref_max_series = ref_series.rolling(Timedelta(days=4)).max()
+
+    dates = date_range("2023-01-01", "2023-01-10", unit=unit, tz=tz)
+    series = Series(0, index=dates)
+    series.iloc[0] = 1
+    max_series = series.rolling(Timedelta(days=4)).max()
+
+    ref_df = DataFrame(ref_max_series)
+    df = DataFrame(max_series)
+    df.index = df.index.as_unit("ns")
+
+    tm.assert_frame_equal(ref_df, df)
+
+
+class PrescribedWindowIndexer(BaseIndexer):
+    def __init__(self, start, end):
+        self._start = start
+        self._end = end
+        super().__init__()
+
+    def get_window_bounds(
+        self, num_values=None, min_periods=None, center=None, closed=None, step=None
+    ):
+        if num_values is None:
+            num_values = len(self._start)
+        start = np.clip(self._start, 0, num_values)
+        end = np.clip(self._end, 0, num_values)
+        return start, end
+
+
+class TestMinMax:
+    @pytest.mark.parametrize(
+        "is_max, has_nan, exp_list",
+        [
+            (True, False, [3.0, 5.0, 2.0, 5.0, 1.0, 5.0, 6.0, 7.0, 8.0, 9.0]),
+            (True, True, [3.0, 4.0, 2.0, 4.0, 1.0, 4.0, 6.0, 7.0, 7.0, 9.0]),
+            (False, False, [3.0, 2.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 7.0, 0.0]),
+            (False, True, [3.0, 2.0, 2.0, 1.0, 1.0, 1.0, 6.0, 6.0, 7.0, 1.0]),
+        ],
+    )
+    def test_minmax(self, is_max, has_nan, exp_list):
+        nan_idx = [0, 5, 8]
+        df = DataFrame(
+            {
+                "data": [5.0, 4.0, 3.0, 2.0, 1.0, 0.0, 6.0, 7.0, 8.0, 9.0],
+                "start": [2, 0, 3, 0, 4, 0, 5, 5, 7, 3],
+                "end": [3, 4, 4, 5, 5, 6, 7, 8, 9, 10],
+            }
+        )
+        if has_nan:
+            df.loc[nan_idx, "data"] = np.nan
+        expected = Series(exp_list, name="data")
+        r = df.data.rolling(
+            PrescribedWindowIndexer(df.start.to_numpy(), df.end.to_numpy())
+        )
+        if is_max:
+            result = r.max()
+        else:
+            result = r.min()
+
+        tm.assert_series_equal(result, expected)
+
+    def test_wrong_order(self):
+        start = np.array(range(5), dtype=np.int64)
+        end = start + 1
+        end[3] = end[2]
+        start[3] = start[2] - 1
+
+        df = DataFrame({"data": start * 1.0, "start": start, "end": end})
+
+        r = df.data.rolling(PrescribedWindowIndexer(start, end))
+        with pytest.raises(
+            ValueError, match="Start/End ordering requirement is violated at index 3"
+        ):
+            r.max()
diff --git a/pandas/tests/window/test_rolling_functions.py b/pandas/tests/window/test_rolling_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..36ae7f3b7dfe47038901a9d154495f157b0ea23a
--- /dev/null
+++ b/pandas/tests/window/test_rolling_functions.py
@@ -0,0 +1,535 @@
+from datetime import datetime
+
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Series,
+    concat,
+    isna,
+    notna,
+)
+import pandas._testing as tm
+
+from pandas.tseries import offsets
+
+
+@pytest.mark.parametrize(
+    "compare_func, roll_func, kwargs",
+    [
+        [np.mean, "mean", {}],
+        [np.nansum, "sum", {}],
+        [
+            lambda x: np.isfinite(x).astype(float).sum(),
+            "count",
+            {},
+        ],
+        [np.median, "median", {}],
+        [np.min, "min", {}],
+        [np.max, "max", {}],
+        [lambda x: np.std(x, ddof=1), "std", {}],
+        [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}],
+        [lambda x: np.var(x, ddof=1), "var", {}],
+        [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}],
+    ],
+)
+def test_series(series, compare_func, roll_func, kwargs, step):
+    result = getattr(series.rolling(50, step=step), roll_func)(**kwargs)
+    assert isinstance(result, Series)
+    end = range(0, len(series), step or 1)[-1] + 1
+    tm.assert_almost_equal(result.iloc[-1], compare_func(series[end - 50 : end]))
+
+
+@pytest.mark.parametrize(
+    "compare_func, roll_func, kwargs",
+    [
+        [np.mean, "mean", {}],
+        [np.nansum, "sum", {}],
+        [
+            lambda x: np.isfinite(x).astype(float).sum(),
+            "count",
+            {},
+        ],
+        [np.median, "median", {}],
+        [np.min, "min", {}],
+        [np.max, "max", {}],
+        [lambda x: np.std(x, ddof=1), "std", {}],
+        [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}],
+        [lambda x: np.var(x, ddof=1), "var", {}],
+        [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}],
+    ],
+)
+def test_frame(raw, frame, compare_func, roll_func, kwargs, step):
+    result = getattr(frame.rolling(50, step=step), roll_func)(**kwargs)
+    assert isinstance(result, DataFrame)
+    end = range(0, len(frame), step or 1)[-1] + 1
+    tm.assert_series_equal(
+        result.iloc[-1, :],
+        frame.iloc[end - 50 : end, :].apply(compare_func, axis=0, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize(
+    "compare_func, roll_func, kwargs, minp",
+    [
+        [np.mean, "mean", {}, 10],
+        [np.nansum, "sum", {}, 10],
+        [lambda x: np.isfinite(x).astype(float).sum(), "count", {}, 0],
+        [np.median, "median", {}, 10],
+        [np.min, "min", {}, 10],
+        [np.max, "max", {}, 10],
+        [lambda x: np.std(x, ddof=1), "std", {}, 10],
+        [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}, 10],
+        [lambda x: np.var(x, ddof=1), "var", {}, 10],
+        [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}, 10],
+    ],
+)
+def test_time_rule_series(series, compare_func, roll_func, kwargs, minp):
+    win = 25
+    ser = series[::2].resample("B").mean()
+    series_result = getattr(ser.rolling(window=win, min_periods=minp), roll_func)(
+        **kwargs
+    )
+    last_date = series_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_series = series[::2].truncate(prev_date, last_date)
+    tm.assert_almost_equal(series_result.iloc[-1], compare_func(trunc_series))
+
+
+@pytest.mark.parametrize(
+    "compare_func, roll_func, kwargs, minp",
+    [
+        [np.mean, "mean", {}, 10],
+        [np.nansum, "sum", {}, 10],
+        [lambda x: np.isfinite(x).astype(float).sum(), "count", {}, 0],
+        [np.median, "median", {}, 10],
+        [np.min, "min", {}, 10],
+        [np.max, "max", {}, 10],
+        [lambda x: np.std(x, ddof=1), "std", {}, 10],
+        [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}, 10],
+        [lambda x: np.var(x, ddof=1), "var", {}, 10],
+        [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}, 10],
+    ],
+)
+def test_time_rule_frame(raw, frame, compare_func, roll_func, kwargs, minp):
+    win = 25
+    frm = frame[::2].resample("B").mean()
+    frame_result = getattr(frm.rolling(window=win, min_periods=minp), roll_func)(
+        **kwargs
+    )
+    last_date = frame_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_frame = frame[::2].truncate(prev_date, last_date)
+    tm.assert_series_equal(
+        frame_result.xs(last_date),
+        trunc_frame.apply(compare_func, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize(
+    "compare_func, roll_func, kwargs",
+    [
+        [np.mean, "mean", {}],
+        [np.nansum, "sum", {}],
+        [np.median, "median", {}],
+        [np.min, "min", {}],
+        [np.max, "max", {}],
+        [lambda x: np.std(x, ddof=1), "std", {}],
+        [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}],
+        [lambda x: np.var(x, ddof=1), "var", {}],
+        [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}],
+    ],
+)
+def test_nans(compare_func, roll_func, kwargs):
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = getattr(obj.rolling(50, min_periods=30), roll_func)(**kwargs)
+    tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10]))
+
+    # min_periods is working correctly
+    result = getattr(obj.rolling(20, min_periods=15), roll_func)(**kwargs)
+    assert isna(result.iloc[23])
+    assert not isna(result.iloc[24])
+
+    assert not isna(result.iloc[-6])
+    assert isna(result.iloc[-5])
+
+    obj2 = Series(np.random.default_rng(2).standard_normal(20))
+    result = getattr(obj2.rolling(10, min_periods=5), roll_func)(**kwargs)
+    assert isna(result.iloc[3])
+    assert notna(result.iloc[4])
+
+    if roll_func != "sum":
+        result0 = getattr(obj.rolling(20, min_periods=0), roll_func)(**kwargs)
+        result1 = getattr(obj.rolling(20, min_periods=1), roll_func)(**kwargs)
+        tm.assert_almost_equal(result0, result1)
+
+
+def test_nans_count():
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+    result = obj.rolling(50, min_periods=30).count()
+    tm.assert_almost_equal(
+        result.iloc[-1], np.isfinite(obj[10:-10]).astype(float).sum()
+    )
+
+
+@pytest.mark.parametrize(
+    "roll_func, kwargs",
+    [
+        ["mean", {}],
+        ["sum", {}],
+        ["median", {}],
+        ["min", {}],
+        ["max", {}],
+        ["std", {}],
+        ["std", {"ddof": 0}],
+        ["var", {}],
+        ["var", {"ddof": 0}],
+    ],
+)
+@pytest.mark.parametrize("minp", [0, 99, 100])
+def test_min_periods(series, minp, roll_func, kwargs, step):
+    result = getattr(
+        series.rolling(len(series) + 1, min_periods=minp, step=step), roll_func
+    )(**kwargs)
+    expected = getattr(
+        series.rolling(len(series), min_periods=minp, step=step), roll_func
+    )(**kwargs)
+    nan_mask = isna(result)
+    tm.assert_series_equal(nan_mask, isna(expected))
+
+    nan_mask = ~nan_mask
+    tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
+
+
+def test_min_periods_count(series, step):
+    result = series.rolling(len(series) + 1, min_periods=0, step=step).count()
+    expected = series.rolling(len(series), min_periods=0, step=step).count()
+    nan_mask = isna(result)
+    tm.assert_series_equal(nan_mask, isna(expected))
+
+    nan_mask = ~nan_mask
+    tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
+
+
+@pytest.mark.parametrize(
+    "roll_func, kwargs, minp",
+    [
+        ["mean", {}, 15],
+        ["sum", {}, 15],
+        ["count", {}, 0],
+        ["median", {}, 15],
+        ["min", {}, 15],
+        ["max", {}, 15],
+        ["std", {}, 15],
+        ["std", {"ddof": 0}, 15],
+        ["var", {}, 15],
+        ["var", {"ddof": 0}, 15],
+    ],
+)
+def test_center(roll_func, kwargs, minp):
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = getattr(obj.rolling(20, min_periods=minp, center=True), roll_func)(
+        **kwargs
+    )
+    expected = (
+        getattr(
+            concat([obj, Series([np.nan] * 9)]).rolling(20, min_periods=minp), roll_func
+        )(**kwargs)
+        .iloc[9:]
+        .reset_index(drop=True)
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "roll_func, kwargs, minp, fill_value",
+    [
+        ["mean", {}, 10, None],
+        ["sum", {}, 10, None],
+        ["count", {}, 0, 0],
+        ["median", {}, 10, None],
+        ["min", {}, 10, None],
+        ["max", {}, 10, None],
+        ["std", {}, 10, None],
+        ["std", {"ddof": 0}, 10, None],
+        ["var", {}, 10, None],
+        ["var", {"ddof": 0}, 10, None],
+    ],
+)
+def test_center_reindex_series(series, roll_func, kwargs, minp, fill_value):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+
+    series_xp = (
+        getattr(
+            series.reindex(list(series.index) + s).rolling(window=25, min_periods=minp),
+            roll_func,
+        )(**kwargs)
+        .shift(-12)
+        .reindex(series.index)
+    )
+    series_rs = getattr(
+        series.rolling(window=25, min_periods=minp, center=True), roll_func
+    )(**kwargs)
+    if fill_value is not None:
+        series_xp = series_xp.fillna(fill_value)
+    tm.assert_series_equal(series_xp, series_rs)
+
+
+@pytest.mark.parametrize(
+    "roll_func, kwargs, minp, fill_value",
+    [
+        ["mean", {}, 10, None],
+        ["sum", {}, 10, None],
+        ["count", {}, 0, 0],
+        ["median", {}, 10, None],
+        ["min", {}, 10, None],
+        ["max", {}, 10, None],
+        ["std", {}, 10, None],
+        ["std", {"ddof": 0}, 10, None],
+        ["var", {}, 10, None],
+        ["var", {"ddof": 0}, 10, None],
+    ],
+)
+def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+
+    frame_xp = (
+        getattr(
+            frame.reindex(list(frame.index) + s).rolling(window=25, min_periods=minp),
+            roll_func,
+        )(**kwargs)
+        .shift(-12)
+        .reindex(frame.index)
+    )
+    frame_rs = getattr(
+        frame.rolling(window=25, min_periods=minp, center=True), roll_func
+    )(**kwargs)
+    if fill_value is not None:
+        frame_xp = frame_xp.fillna(fill_value)
+    tm.assert_frame_equal(frame_xp, frame_rs)
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False),
+        lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False),
+        lambda x: x.rolling(window=10, min_periods=5).max(),
+        lambda x: x.rolling(window=10, min_periods=5).min(),
+        lambda x: x.rolling(window=10, min_periods=5).sum(),
+        lambda x: x.rolling(window=10, min_periods=5).mean(),
+        lambda x: x.rolling(window=10, min_periods=5).std(),
+        lambda x: x.rolling(window=10, min_periods=5).var(),
+        lambda x: x.rolling(window=10, min_periods=5).skew(),
+        lambda x: x.rolling(window=10, min_periods=5).kurt(),
+        lambda x: x.rolling(window=10, min_periods=5).first(),
+        lambda x: x.rolling(window=10, min_periods=5).last(),
+        lambda x: x.rolling(window=10, min_periods=5).quantile(q=0.5),
+        lambda x: x.rolling(window=10, min_periods=5).median(),
+        lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False),
+        lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True),
+        pytest.param(
+            lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(),
+            marks=td.skip_if_no("scipy"),
+        ),
+    ],
+)
+def test_rolling_functions_window_non_shrinkage(f):
+    # GH 7764
+    s = Series(range(4))
+    s_expected = Series(np.nan, index=s.index)
+    df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=["A", "B"])
+    df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
+
+    s_result = f(s)
+    tm.assert_series_equal(s_result, s_expected)
+
+    df_result = f(df)
+    tm.assert_frame_equal(df_result, df_expected)
+
+
+def test_rolling_max_gh6297(step):
+    """Replicate result expected in GH #6297"""
+    indices = [datetime(1975, 1, i) for i in range(1, 6)]
+    # So that we can have 2 datapoints on one of the days
+    indices.append(datetime(1975, 1, 3, 6, 0))
+    series = Series(range(1, 7), index=indices)
+    # Use floats instead of ints as values
+    series = series.map(lambda x: float(x))
+    # Sort chronologically
+    series = series.sort_index()
+
+    expected = Series(
+        [1.0, 2.0, 6.0, 4.0, 5.0],
+        index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
+    )[::step]
+    x = series.resample("D").max().rolling(window=1, step=step).max()
+    tm.assert_series_equal(expected, x)
+
+
+def test_rolling_max_resample(step):
+    indices = [datetime(1975, 1, i) for i in range(1, 6)]
+    # So that we can have 3 datapoints on last day (4, 10, and 20)
+    indices.append(datetime(1975, 1, 5, 1))
+    indices.append(datetime(1975, 1, 5, 2))
+    series = Series([*list(range(5)), 10, 20], index=indices)
+    # Use floats instead of ints as values
+    series = series.map(lambda x: float(x))
+    # Sort chronologically
+    series = series.sort_index()
+
+    # Default how should be max
+    expected = Series(
+        [0.0, 1.0, 2.0, 3.0, 20.0],
+        index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
+    )[::step]
+    x = series.resample("D").max().rolling(window=1, step=step).max()
+    tm.assert_series_equal(expected, x)
+
+    # Now specify median (10.0)
+    expected = Series(
+        [0.0, 1.0, 2.0, 3.0, 10.0],
+        index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
+    )[::step]
+    x = series.resample("D").median().rolling(window=1, step=step).max()
+    tm.assert_series_equal(expected, x)
+
+    # Now specify mean (4+10+20)/3
+    v = (4.0 + 10.0 + 20.0) / 3.0
+    expected = Series(
+        [0.0, 1.0, 2.0, 3.0, v],
+        index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
+    )[::step]
+    x = series.resample("D").mean().rolling(window=1, step=step).max()
+    tm.assert_series_equal(expected, x)
+
+
+def test_rolling_min_resample(step):
+    indices = [datetime(1975, 1, i) for i in range(1, 6)]
+    # So that we can have 3 datapoints on last day (4, 10, and 20)
+    indices.append(datetime(1975, 1, 5, 1))
+    indices.append(datetime(1975, 1, 5, 2))
+    series = Series([*list(range(5)), 10, 20], index=indices)
+    # Use floats instead of ints as values
+    series = series.map(lambda x: float(x))
+    # Sort chronologically
+    series = series.sort_index()
+
+    # Default how should be min
+    expected = Series(
+        [0.0, 1.0, 2.0, 3.0, 4.0],
+        index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
+    )[::step]
+    r = series.resample("D").min().rolling(window=1, step=step)
+    tm.assert_series_equal(expected, r.min())
+
+
+def test_rolling_median_resample():
+    indices = [datetime(1975, 1, i) for i in range(1, 6)]
+    # So that we can have 3 datapoints on last day (4, 10, and 20)
+    indices.append(datetime(1975, 1, 5, 1))
+    indices.append(datetime(1975, 1, 5, 2))
+    series = Series([*list(range(5)), 10, 20], index=indices)
+    # Use floats instead of ints as values
+    series = series.map(lambda x: float(x))
+    # Sort chronologically
+    series = series.sort_index()
+
+    # Default how should be median
+    expected = Series(
+        [0.0, 1.0, 2.0, 3.0, 10],
+        index=DatetimeIndex([datetime(1975, 1, i, 0) for i in range(1, 6)], freq="D"),
+    )
+    x = series.resample("D").median().rolling(window=1).median()
+    tm.assert_series_equal(expected, x)
+
+
+def test_rolling_median_memory_error():
+    # GH11722
+    n = 20000
+    Series(np.random.default_rng(2).standard_normal(n)).rolling(
+        window=2, center=False
+    ).median()
+    Series(np.random.default_rng(2).standard_normal(n)).rolling(
+        window=2, center=False
+    ).median()
+
+
+def test_rolling_min_max_numeric_types(any_real_numpy_dtype):
+    # GH12373
+
+    # Just testing that these don't throw exceptions and that
+    # the return type is float64. Other tests will cover quantitative
+    # correctness
+    result = (
+        DataFrame(np.arange(20, dtype=any_real_numpy_dtype)).rolling(window=5).max()
+    )
+    assert result.dtypes[0] == np.dtype("f8")
+    result = (
+        DataFrame(np.arange(20, dtype=any_real_numpy_dtype)).rolling(window=5).min()
+    )
+    assert result.dtypes[0] == np.dtype("f8")
+
+
+@pytest.mark.parametrize(
+    "f",
+    [
+        lambda x: x.rolling(window=10, min_periods=0).count(),
+        lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False),
+        lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False),
+        lambda x: x.rolling(window=10, min_periods=5).max(),
+        lambda x: x.rolling(window=10, min_periods=5).min(),
+        lambda x: x.rolling(window=10, min_periods=5).sum(),
+        lambda x: x.rolling(window=10, min_periods=5).mean(),
+        lambda x: x.rolling(window=10, min_periods=5).std(),
+        lambda x: x.rolling(window=10, min_periods=5).var(),
+        lambda x: x.rolling(window=10, min_periods=5).skew(),
+        lambda x: x.rolling(window=10, min_periods=5).kurt(),
+        lambda x: x.rolling(window=10, min_periods=5).first(),
+        lambda x: x.rolling(window=10, min_periods=5).last(),
+        lambda x: x.rolling(window=10, min_periods=5).quantile(0.5),
+        lambda x: x.rolling(window=10, min_periods=5).median(),
+        lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=False),
+        lambda x: x.rolling(window=10, min_periods=5).apply(sum, raw=True),
+        pytest.param(
+            lambda x: x.rolling(win_type="boxcar", window=10, min_periods=5).mean(),
+            marks=td.skip_if_no("scipy"),
+        ),
+    ],
+)
+def test_moment_functions_zero_length(f):
+    # GH 8056
+    s = Series(dtype=np.float64)
+    s_expected = s
+    df1 = DataFrame()
+    df1_expected = df1
+    df2 = DataFrame(columns=["a"])
+    df2["a"] = df2["a"].astype("float64")
+    df2_expected = df2
+
+    s_result = f(s)
+    tm.assert_series_equal(s_result, s_expected)
+
+    df1_result = f(df1)
+    tm.assert_frame_equal(df1_result, df1_expected)
+
+    df2_result = f(df2)
+    tm.assert_frame_equal(df2_result, df2_expected)
diff --git a/pandas/tests/window/test_rolling_quantile.py b/pandas/tests/window/test_rolling_quantile.py
new file mode 100644
index 0000000000000000000000000000000000000000..66713f1cfaa8dbd8dfbb10158da9a56b42287c6c
--- /dev/null
+++ b/pandas/tests/window/test_rolling_quantile.py
@@ -0,0 +1,175 @@
+from functools import partial
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+    concat,
+    isna,
+    notna,
+)
+import pandas._testing as tm
+
+from pandas.tseries import offsets
+
+
+def scoreatpercentile(a, per):
+    values = np.sort(a, axis=0)
+
+    idx = int(per / 1.0 * (values.shape[0] - 1))
+
+    if idx == values.shape[0] - 1:
+        retval = values[-1]
+
+    else:
+        qlow = idx / (values.shape[0] - 1)
+        qhig = (idx + 1) / (values.shape[0] - 1)
+        vlow = values[idx]
+        vhig = values[idx + 1]
+        retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow)
+
+    return retval
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_series(series, q, step):
+    compare_func = partial(scoreatpercentile, per=q)
+    result = series.rolling(50, step=step).quantile(q)
+    assert isinstance(result, Series)
+    end = range(0, len(series), step or 1)[-1] + 1
+    tm.assert_almost_equal(result.iloc[-1], compare_func(series[end - 50 : end]))
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_frame(raw, frame, q, step):
+    compare_func = partial(scoreatpercentile, per=q)
+    result = frame.rolling(50, step=step).quantile(q)
+    assert isinstance(result, DataFrame)
+    end = range(0, len(frame), step or 1)[-1] + 1
+    tm.assert_series_equal(
+        result.iloc[-1, :],
+        frame.iloc[end - 50 : end, :].apply(compare_func, axis=0, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_time_rule_series(series, q):
+    compare_func = partial(scoreatpercentile, per=q)
+    win = 25
+    ser = series[::2].resample("B").mean()
+    series_result = ser.rolling(window=win, min_periods=10).quantile(q)
+    last_date = series_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_series = series[::2].truncate(prev_date, last_date)
+    tm.assert_almost_equal(series_result.iloc[-1], compare_func(trunc_series))
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_time_rule_frame(raw, frame, q):
+    compare_func = partial(scoreatpercentile, per=q)
+    win = 25
+    frm = frame[::2].resample("B").mean()
+    frame_result = frm.rolling(window=win, min_periods=10).quantile(q)
+    last_date = frame_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_frame = frame[::2].truncate(prev_date, last_date)
+    tm.assert_series_equal(
+        frame_result.xs(last_date),
+        trunc_frame.apply(compare_func, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_nans(q):
+    compare_func = partial(scoreatpercentile, per=q)
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = obj.rolling(50, min_periods=30).quantile(q)
+    tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10]))
+
+    # min_periods is working correctly
+    result = obj.rolling(20, min_periods=15).quantile(q)
+    assert isna(result.iloc[23])
+    assert not isna(result.iloc[24])
+
+    assert not isna(result.iloc[-6])
+    assert isna(result.iloc[-5])
+
+    obj2 = Series(np.random.default_rng(2).standard_normal(20))
+    result = obj2.rolling(10, min_periods=5).quantile(q)
+    assert isna(result.iloc[3])
+    assert notna(result.iloc[4])
+
+    result0 = obj.rolling(20, min_periods=0).quantile(q)
+    result1 = obj.rolling(20, min_periods=1).quantile(q)
+    tm.assert_almost_equal(result0, result1)
+
+
+@pytest.mark.parametrize("minp", [0, 99, 100])
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_min_periods(series, minp, q, step):
+    result = series.rolling(len(series) + 1, min_periods=minp, step=step).quantile(q)
+    expected = series.rolling(len(series), min_periods=minp, step=step).quantile(q)
+    nan_mask = isna(result)
+    tm.assert_series_equal(nan_mask, isna(expected))
+
+    nan_mask = ~nan_mask
+    tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_center(q):
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = obj.rolling(20, center=True).quantile(q)
+    expected = (
+        concat([obj, Series([np.nan] * 9)])
+        .rolling(20)
+        .quantile(q)
+        .iloc[9:]
+        .reset_index(drop=True)
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_center_reindex_series(series, q):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+
+    series_xp = (
+        series.reindex(list(series.index) + s)
+        .rolling(window=25)
+        .quantile(q)
+        .shift(-12)
+        .reindex(series.index)
+    )
+
+    series_rs = series.rolling(window=25, center=True).quantile(q)
+    tm.assert_series_equal(series_xp, series_rs)
+
+
+@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0])
+def test_center_reindex_frame(frame, q):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+
+    frame_xp = (
+        frame.reindex(list(frame.index) + s)
+        .rolling(window=25)
+        .quantile(q)
+        .shift(-12)
+        .reindex(frame.index)
+    )
+    frame_rs = frame.rolling(window=25, center=True).quantile(q)
+    tm.assert_frame_equal(frame_xp, frame_rs)
diff --git a/pandas/tests/window/test_rolling_skew_kurt.py b/pandas/tests/window/test_rolling_skew_kurt.py
new file mode 100644
index 0000000000000000000000000000000000000000..79c14f243e7cc93b395ea84e05ec6bc79942b79b
--- /dev/null
+++ b/pandas/tests/window/test_rolling_skew_kurt.py
@@ -0,0 +1,227 @@
+from functools import partial
+
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+    concat,
+    isna,
+    notna,
+)
+import pandas._testing as tm
+
+from pandas.tseries import offsets
+
+
+@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]])
+def test_series(series, sp_func, roll_func):
+    sp_stats = pytest.importorskip("scipy.stats")
+
+    compare_func = partial(getattr(sp_stats, sp_func), bias=False)
+    result = getattr(series.rolling(50), roll_func)()
+    assert isinstance(result, Series)
+    tm.assert_almost_equal(result.iloc[-1], compare_func(series[-50:]))
+
+
+@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]])
+def test_frame(raw, frame, sp_func, roll_func):
+    sp_stats = pytest.importorskip("scipy.stats")
+
+    compare_func = partial(getattr(sp_stats, sp_func), bias=False)
+    result = getattr(frame.rolling(50), roll_func)()
+    assert isinstance(result, DataFrame)
+    tm.assert_series_equal(
+        result.iloc[-1, :],
+        frame.iloc[-50:, :].apply(compare_func, axis=0, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]])
+def test_time_rule_series(series, sp_func, roll_func):
+    sp_stats = pytest.importorskip("scipy.stats")
+
+    compare_func = partial(getattr(sp_stats, sp_func), bias=False)
+    win = 25
+    ser = series[::2].resample("B").mean()
+    series_result = getattr(ser.rolling(window=win, min_periods=10), roll_func)()
+    last_date = series_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_series = series[::2].truncate(prev_date, last_date)
+    tm.assert_almost_equal(series_result.iloc[-1], compare_func(trunc_series))
+
+
+@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]])
+def test_time_rule_frame(raw, frame, sp_func, roll_func):
+    sp_stats = pytest.importorskip("scipy.stats")
+
+    compare_func = partial(getattr(sp_stats, sp_func), bias=False)
+    win = 25
+    frm = frame[::2].resample("B").mean()
+    frame_result = getattr(frm.rolling(window=win, min_periods=10), roll_func)()
+    last_date = frame_result.index[-1]
+    prev_date = last_date - 24 * offsets.BDay()
+
+    trunc_frame = frame[::2].truncate(prev_date, last_date)
+    tm.assert_series_equal(
+        frame_result.xs(last_date),
+        trunc_frame.apply(compare_func, raw=raw),
+        check_names=False,
+    )
+
+
+@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]])
+def test_nans(sp_func, roll_func):
+    sp_stats = pytest.importorskip("scipy.stats")
+
+    compare_func = partial(getattr(sp_stats, sp_func), bias=False)
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = getattr(obj.rolling(50, min_periods=30), roll_func)()
+    tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10]))
+
+    # min_periods is working correctly
+    result = getattr(obj.rolling(20, min_periods=15), roll_func)()
+    assert isna(result.iloc[23])
+    assert not isna(result.iloc[24])
+
+    assert not isna(result.iloc[-6])
+    assert isna(result.iloc[-5])
+
+    obj2 = Series(np.random.default_rng(2).standard_normal(20))
+    result = getattr(obj2.rolling(10, min_periods=5), roll_func)()
+    assert isna(result.iloc[3])
+    assert notna(result.iloc[4])
+
+    result0 = getattr(obj.rolling(20, min_periods=0), roll_func)()
+    result1 = getattr(obj.rolling(20, min_periods=1), roll_func)()
+    tm.assert_almost_equal(result0, result1)
+
+
+@pytest.mark.parametrize("minp", [0, 99, 100])
+@pytest.mark.parametrize("roll_func", ["kurt", "skew"])
+def test_min_periods(series, minp, roll_func, step):
+    result = getattr(
+        series.rolling(len(series) + 1, min_periods=minp, step=step), roll_func
+    )()
+    expected = getattr(
+        series.rolling(len(series), min_periods=minp, step=step), roll_func
+    )()
+    nan_mask = isna(result)
+    tm.assert_series_equal(nan_mask, isna(expected))
+
+    nan_mask = ~nan_mask
+    tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
+
+
+@pytest.mark.parametrize("roll_func", ["kurt", "skew"])
+def test_center(roll_func):
+    obj = Series(np.random.default_rng(2).standard_normal(50))
+    obj[:10] = np.nan
+    obj[-10:] = np.nan
+
+    result = getattr(obj.rolling(20, center=True), roll_func)()
+    expected = (
+        getattr(concat([obj, Series([np.nan] * 9)]).rolling(20), roll_func)()
+        .iloc[9:]
+        .reset_index(drop=True)
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("roll_func", ["kurt", "skew"])
+def test_center_reindex_series(series, roll_func):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+
+    series_xp = (
+        getattr(
+            series.reindex(list(series.index) + s).rolling(window=25),
+            roll_func,
+        )()
+        .shift(-12)
+        .reindex(series.index)
+    )
+    series_rs = getattr(series.rolling(window=25, center=True), roll_func)()
+    tm.assert_series_equal(series_xp, series_rs)
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize("roll_func", ["kurt", "skew"])
+def test_center_reindex_frame(frame, roll_func):
+    # shifter index
+    s = [f"x{x:d}" for x in range(12)]
+
+    frame_xp = (
+        getattr(
+            frame.reindex(list(frame.index) + s).rolling(window=25),
+            roll_func,
+        )()
+        .shift(-12)
+        .reindex(frame.index)
+    )
+    frame_rs = getattr(frame.rolling(window=25, center=True), roll_func)()
+    tm.assert_frame_equal(frame_xp, frame_rs)
+
+
+def test_rolling_skew_edge_cases(step):
+    expected = Series([np.nan] * 4 + [0.0])[::step]
+    # yields all NaN (0 variance)
+    d = Series([1] * 5)
+    x = d.rolling(window=5, step=step).skew()
+    # index 4 should be 0 as it contains 5 same obs
+    tm.assert_series_equal(expected, x)
+
+    expected = Series([np.nan] * 5)[::step]
+    # yields all NaN (window too small)
+    d = Series(np.random.default_rng(2).standard_normal(5))
+    x = d.rolling(window=2, step=step).skew()
+    tm.assert_series_equal(expected, x)
+
+    # yields [NaN, NaN, NaN, 0.177994, 1.548824]
+    d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
+    expected = Series([np.nan, np.nan, np.nan, 0.177994, 1.548824])[::step]
+    x = d.rolling(window=4, step=step).skew()
+    tm.assert_series_equal(expected, x)
+
+
+def test_rolling_kurt_edge_cases(step):
+    expected = Series([np.nan] * 4 + [-3.0])[::step]
+
+    # yields all NaN (0 variance)
+    d = Series([1] * 5)
+    x = d.rolling(window=5, step=step).kurt()
+    tm.assert_series_equal(expected, x)
+
+    # yields all NaN (window too small)
+    expected = Series([np.nan] * 5)[::step]
+    d = Series(np.random.default_rng(2).standard_normal(5))
+    x = d.rolling(window=3, step=step).kurt()
+    tm.assert_series_equal(expected, x)
+
+    # yields [NaN, NaN, NaN, 1.224307, 2.671499]
+    d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401])
+    expected = Series([np.nan, np.nan, np.nan, 1.224307, 2.671499])[::step]
+    x = d.rolling(window=4, step=step).kurt()
+    tm.assert_series_equal(expected, x)
+
+
+def test_rolling_skew_eq_value_fperr(step):
+    # #18804 all rolling skew for all equal values should return Nan
+    # #46717 update: all equal values should return 0 instead of NaN
+    a = Series([1.1] * 15).rolling(window=10, step=step).skew()
+    assert (a[a.index >= 9] == 0).all()
+    assert a[a.index < 9].isna().all()
+
+
+def test_rolling_kurt_eq_value_fperr(step):
+    # #18804 all rolling kurt for all equal values should return Nan
+    # #46717 update: all equal values should return -3 instead of NaN
+    a = Series([1.1] * 15).rolling(window=10, step=step).kurt()
+    assert (a[a.index >= 9] == -3).all()
+    assert a[a.index < 9].isna().all()
diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py
new file mode 100644
index 0000000000000000000000000000000000000000..043f369566a5df4b87b6d486fa3b9f23b8d208cf
--- /dev/null
+++ b/pandas/tests/window/test_timeseries_window.py
@@ -0,0 +1,747 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+from pandas import (
+    DataFrame,
+    DatetimeIndex,
+    Index,
+    MultiIndex,
+    NaT,
+    Series,
+    Timestamp,
+    date_range,
+)
+import pandas._testing as tm
+
+from pandas.tseries import offsets
+
+
+@pytest.fixture
+def regular():
+    return DataFrame(
+        {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)}
+    ).set_index("A")
+
+
+@pytest.fixture
+def ragged():
+    df = DataFrame({"B": range(5)})
+    df.index = [
+        Timestamp("20130101 09:00:00"),
+        Timestamp("20130101 09:00:02"),
+        Timestamp("20130101 09:00:03"),
+        Timestamp("20130101 09:00:05"),
+        Timestamp("20130101 09:00:06"),
+    ]
+    return df
+
+
+class TestRollingTS:
+    # rolling time-series friendly
+    # xref GH13327
+
+    def test_doc_string(self):
+        df = DataFrame(
+            {"B": [0, 1, 2, np.nan, 4]},
+            index=[
+                Timestamp("20130101 09:00:00"),
+                Timestamp("20130101 09:00:02"),
+                Timestamp("20130101 09:00:03"),
+                Timestamp("20130101 09:00:05"),
+                Timestamp("20130101 09:00:06"),
+            ],
+        )
+        df
+        df.rolling("2s").sum()
+
+    def test_invalid_window_non_int(self, regular):
+        # not a valid freq
+        msg = "passed window foobar is not compatible with a datetimelike index"
+        with pytest.raises(ValueError, match=msg):
+            regular.rolling(window="foobar")
+        # not a datetimelike index
+        msg = "window must be an integer"
+        with pytest.raises(ValueError, match=msg):
+            regular.reset_index().rolling(window="foobar")
+
+    @pytest.mark.parametrize("freq", ["2MS", offsets.MonthBegin(2)])
+    def test_invalid_window_nonfixed(self, freq, regular):
+        # non-fixed freqs
+        msg = "\\<2 \\* MonthBegins\\> is a non-fixed frequency"
+        with pytest.raises(ValueError, match=msg):
+            regular.rolling(window=freq)
+
+    @pytest.mark.parametrize("freq", ["1D", offsets.Day(2), "2ms"])
+    def test_valid_window(self, freq, regular):
+        regular.rolling(window=freq)
+
+    @pytest.mark.parametrize("minp", [1.0, "foo", np.array([1, 2, 3])])
+    def test_invalid_minp(self, minp, regular):
+        # non-integer min_periods
+        msg = (
+            r"local variable 'minp' referenced before assignment|"
+            "min_periods must be an integer"
+        )
+        with pytest.raises(ValueError, match=msg):
+            regular.rolling(window="1D", min_periods=minp)
+
+    def test_on(self, regular):
+        df = regular
+
+        # not a valid column
+        msg = (
+            r"invalid on specified as foobar, must be a column "
+            "\\(of DataFrame\\), an Index or None"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.rolling(window="2s", on="foobar")
+
+        # column is valid
+        df = df.copy()
+        df["C"] = date_range("20130101", periods=len(df))
+        df.rolling(window="2D", on="C").sum()
+
+        # invalid columns
+        msg = "window must be an integer"
+        with pytest.raises(ValueError, match=msg):
+            df.rolling(window="2d", on="B")
+
+        # ok even though on non-selected
+        df.rolling(window="2D", on="C").B.sum()
+
+    def test_monotonic_on(self):
+        # on/index must be monotonic
+        df = DataFrame(
+            {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)}
+        )
+
+        assert df.A.is_monotonic_increasing
+        df.rolling("2s", on="A").sum()
+
+        df = df.set_index("A")
+        assert df.index.is_monotonic_increasing
+        df.rolling("2s").sum()
+
+    def test_non_monotonic_on(self):
+        # GH 19248
+        df = DataFrame(
+            {"A": date_range("20130101", periods=5, freq="s"), "B": range(5)}
+        )
+        df = df.set_index("A")
+        non_monotonic_index = df.index.to_list()
+        non_monotonic_index[0] = non_monotonic_index[3]
+        df.index = non_monotonic_index
+
+        assert not df.index.is_monotonic_increasing
+
+        msg = "index values must be monotonic"
+        with pytest.raises(ValueError, match=msg):
+            df.rolling("2s").sum()
+
+        df = df.reset_index()
+
+        msg = (
+            r"invalid on specified as A, must be a column "
+            "\\(of DataFrame\\), an Index or None"
+        )
+        with pytest.raises(ValueError, match=msg):
+            df.rolling("2s", on="A").sum()
+
+    def test_frame_on(self):
+        df = DataFrame(
+            {"B": range(5), "C": date_range("20130101 09:00:00", periods=5, freq="3s")}
+        )
+
+        df["A"] = [
+            Timestamp("20130101 09:00:00"),
+            Timestamp("20130101 09:00:02"),
+            Timestamp("20130101 09:00:03"),
+            Timestamp("20130101 09:00:05"),
+            Timestamp("20130101 09:00:06"),
+        ]
+
+        # we are doing simulating using 'on'
+        expected = df.set_index("A").rolling("2s").B.sum().reset_index(drop=True)
+
+        result = df.rolling("2s", on="A").B.sum()
+        tm.assert_series_equal(result, expected)
+
+        # test as a frame
+        # we should be ignoring the 'on' as an aggregation column
+        # note that the expected is setting, computing, and resetting
+        # so the columns need to be switched compared
+        # to the actual result where they are ordered as in the
+        # original
+        expected = (
+            df.set_index("A").rolling("2s")[["B"]].sum().reset_index()[["B", "A"]]
+        )
+
+        result = df.rolling("2s", on="A")[["B"]].sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_frame_on2(self, unit):
+        # using multiple aggregation columns
+        dti = DatetimeIndex(
+            [
+                Timestamp("20130101 09:00:00"),
+                Timestamp("20130101 09:00:02"),
+                Timestamp("20130101 09:00:03"),
+                Timestamp("20130101 09:00:05"),
+                Timestamp("20130101 09:00:06"),
+            ]
+        ).as_unit(unit)
+        df = DataFrame(
+            {
+                "A": [0, 1, 2, 3, 4],
+                "B": [0, 1, 2, np.nan, 4],
+                "C": dti,
+            },
+            columns=["A", "C", "B"],
+        )
+
+        expected1 = DataFrame(
+            {"A": [0.0, 1, 3, 3, 7], "B": [0, 1, 3, np.nan, 4], "C": df["C"]},
+            columns=["A", "C", "B"],
+        )
+
+        result = df.rolling("2s", on="C").sum()
+        expected = expected1
+        tm.assert_frame_equal(result, expected)
+
+        expected = Series([0, 1, 3, np.nan, 4], name="B")
+        result = df.rolling("2s", on="C").B.sum()
+        tm.assert_series_equal(result, expected)
+
+        expected = expected1[["A", "B", "C"]]
+        result = df.rolling("2s", on="C")[["A", "B", "C"]].sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_basic_regular(self, regular):
+        df = regular.copy()
+
+        df.index = date_range("20130101", periods=5, freq="D")
+        expected = df.rolling(window=1, min_periods=1).sum()
+        result = df.rolling(window="1D").sum()
+        tm.assert_frame_equal(result, expected)
+
+        df.index = date_range("20130101", periods=5, freq="2D")
+        expected = df.rolling(window=1, min_periods=1).sum()
+        result = df.rolling(window="2D", min_periods=1).sum()
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.rolling(window=1, min_periods=1).sum()
+        result = df.rolling(window="2D", min_periods=1).sum()
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.rolling(window=1).sum()
+        result = df.rolling(window="2D").sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_min_periods(self, regular):
+        # compare for min_periods
+        df = regular
+
+        # these slightly different
+        expected = df.rolling(2, min_periods=1).sum()
+        result = df.rolling("2s").sum()
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.rolling(2, min_periods=1).sum()
+        result = df.rolling("2s", min_periods=1).sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_closed(self, regular, unit):
+        # xref GH13965
+
+        dti = DatetimeIndex(
+            [
+                Timestamp("20130101 09:00:01"),
+                Timestamp("20130101 09:00:02"),
+                Timestamp("20130101 09:00:03"),
+                Timestamp("20130101 09:00:04"),
+                Timestamp("20130101 09:00:06"),
+            ]
+        ).as_unit(unit)
+
+        df = DataFrame(
+            {"A": [1] * 5},
+            index=dti,
+        )
+
+        # closed must be 'right', 'left', 'both', 'neither'
+        msg = "closed must be 'right', 'left', 'both' or 'neither'"
+        with pytest.raises(ValueError, match=msg):
+            regular.rolling(window="2s", closed="blabla")
+
+        expected = df.copy()
+        expected["A"] = [1.0, 2, 2, 2, 1]
+        result = df.rolling("2s", closed="right").sum()
+        tm.assert_frame_equal(result, expected)
+
+        # default should be 'right'
+        result = df.rolling("2s").sum()
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.copy()
+        expected["A"] = [1.0, 2, 3, 3, 2]
+        result = df.rolling("2s", closed="both").sum()
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.copy()
+        expected["A"] = [np.nan, 1.0, 2, 2, 1]
+        result = df.rolling("2s", closed="left").sum()
+        tm.assert_frame_equal(result, expected)
+
+        expected = df.copy()
+        expected["A"] = [np.nan, 1.0, 1, 1, np.nan]
+        result = df.rolling("2s", closed="neither").sum()
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_sum(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).sum()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).sum()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 3, 3, 7]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=2).sum()
+        expected = df.copy()
+        expected["B"] = [np.nan, np.nan, 3, np.nan, 7]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="3s", min_periods=1).sum()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 3, 5, 7]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="3s").sum()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 3, 5, 7]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="4s", min_periods=1).sum()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 3, 6, 9]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="4s", min_periods=3).sum()
+        expected = df.copy()
+        expected["B"] = [np.nan, np.nan, 3, 6, 9]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).sum()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 3, 6, 10]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_mean(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).mean()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).mean()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_median(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).median()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).median()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_quantile(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).quantile(0.5)
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).quantile(0.5)
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 1.5, 3.0, 3.5]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_std(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).std(ddof=0)
+        expected = df.copy()
+        expected["B"] = [0.0] * 5
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="1s", min_periods=1).std(ddof=1)
+        expected = df.copy()
+        expected["B"] = [np.nan] * 5
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="3s", min_periods=1).std(ddof=0)
+        expected = df.copy()
+        expected["B"] = [0.0] + [0.5] * 4
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).std(ddof=1)
+        expected = df.copy()
+        expected["B"] = [np.nan, 0.707107, 1.0, 1.0, 1.290994]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_var(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).var(ddof=0)
+        expected = df.copy()
+        expected["B"] = [0.0] * 5
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="1s", min_periods=1).var(ddof=1)
+        expected = df.copy()
+        expected["B"] = [np.nan] * 5
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="3s", min_periods=1).var(ddof=0)
+        expected = df.copy()
+        expected["B"] = [0.0] + [0.25] * 4
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).var(ddof=1)
+        expected = df.copy()
+        expected["B"] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.0]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_skew(self, ragged):
+        df = ragged
+        result = df.rolling(window="3s", min_periods=1).skew()
+        expected = df.copy()
+        expected["B"] = [np.nan] * 5
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).skew()
+        expected = df.copy()
+        expected["B"] = [np.nan] * 2 + [0.0, 0.0, 0.0]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_kurt(self, ragged):
+        df = ragged
+        result = df.rolling(window="3s", min_periods=1).kurt()
+        expected = df.copy()
+        expected["B"] = [np.nan] * 5
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).kurt()
+        expected = df.copy()
+        expected["B"] = [np.nan] * 4 + [-1.2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_count(self, ragged):
+        df = ragged
+        result = df.rolling(window="1s", min_periods=1).count()
+        expected = df.copy()
+        expected["B"] = [1.0, 1, 1, 1, 1]
+        tm.assert_frame_equal(result, expected)
+
+        df = ragged
+        result = df.rolling(window="1s").count()
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).count()
+        expected = df.copy()
+        expected["B"] = [1.0, 1, 2, 1, 2]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=2).count()
+        expected = df.copy()
+        expected["B"] = [np.nan, np.nan, 2, np.nan, 2]
+        tm.assert_frame_equal(result, expected)
+
+    def test_regular_min(self):
+        df = DataFrame(
+            {"A": date_range("20130101", periods=5, freq="s"), "B": [0.0, 1, 2, 3, 4]}
+        ).set_index("A")
+        result = df.rolling("1s").min()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        df = DataFrame(
+            {"A": date_range("20130101", periods=5, freq="s"), "B": [5, 4, 3, 4, 5]}
+        ).set_index("A")
+
+        tm.assert_frame_equal(result, expected)
+        result = df.rolling("2s").min()
+        expected = df.copy()
+        expected["B"] = [5.0, 4, 3, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling("5s").min()
+        expected = df.copy()
+        expected["B"] = [5.0, 4, 3, 3, 3]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_min(self, ragged):
+        df = ragged
+
+        result = df.rolling(window="1s", min_periods=1).min()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).min()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 1, 3, 3]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).min()
+        expected = df.copy()
+        expected["B"] = [0.0, 0, 0, 1, 1]
+        tm.assert_frame_equal(result, expected)
+
+    def test_perf_min(self):
+        N = 10000
+
+        dfp = DataFrame(
+            {"B": np.random.default_rng(2).standard_normal(N)},
+            index=date_range("20130101", periods=N, freq="s"),
+        )
+        expected = dfp.rolling(2, min_periods=1).min()
+        result = dfp.rolling("2s").min()
+        assert ((result - expected) < 0.01).all().all()
+
+        expected = dfp.rolling(200, min_periods=1).min()
+        result = dfp.rolling("200s").min()
+        assert ((result - expected) < 0.01).all().all()
+
+    def test_ragged_max(self, ragged):
+        df = ragged
+
+        result = df.rolling(window="1s", min_periods=1).max()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).max()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).max()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_first(self, ragged):
+        df = ragged
+
+        result = df.rolling(window="1s", min_periods=1).first()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).first()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 1, 3, 3]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).first()
+        expected = df.copy()
+        expected["B"] = [0.0, 0, 0, 1, 1]
+        tm.assert_frame_equal(result, expected)
+
+    def test_ragged_last(self, ragged):
+        df = ragged
+
+        result = df.rolling(window="1s", min_periods=1).last()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="2s", min_periods=1).last()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+        result = df.rolling(window="5s", min_periods=1).last()
+        expected = df.copy()
+        expected["B"] = [0.0, 1, 2, 3, 4]
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "freq, op, result_data",
+        [
+            ("ms", "min", [0.0] * 10),
+            ("ms", "mean", [0.0] * 9 + [2.0 / 9]),
+            ("ms", "max", [0.0] * 9 + [2.0]),
+            ("s", "min", [0.0] * 10),
+            ("s", "mean", [0.0] * 9 + [2.0 / 9]),
+            ("s", "max", [0.0] * 9 + [2.0]),
+            ("min", "min", [0.0] * 10),
+            ("min", "mean", [0.0] * 9 + [2.0 / 9]),
+            ("min", "max", [0.0] * 9 + [2.0]),
+            ("h", "min", [0.0] * 10),
+            ("h", "mean", [0.0] * 9 + [2.0 / 9]),
+            ("h", "max", [0.0] * 9 + [2.0]),
+            ("D", "min", [0.0] * 10),
+            ("D", "mean", [0.0] * 9 + [2.0 / 9]),
+            ("D", "max", [0.0] * 9 + [2.0]),
+        ],
+    )
+    def test_freqs_ops(self, freq, op, result_data):
+        # GH 21096
+        index = date_range(start="2018-1-1 01:00:00", freq=f"1{freq}", periods=10)
+        # Explicit cast to float to avoid implicit cast when setting nan
+        s = Series(data=0, index=index, dtype="float")
+        s.iloc[1] = np.nan
+        s.iloc[-1] = 2
+        result = getattr(s.rolling(window=f"10{freq}"), op)()
+        expected = Series(data=result_data, index=index)
+
+        tm.assert_series_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "f",
+        [
+            "sum",
+            "mean",
+            "count",
+            "median",
+            "std",
+            "var",
+            "kurt",
+            "skew",
+            "min",
+            "max",
+            "first",
+            "last",
+        ],
+    )
+    def test_all(self, f, regular):
+        # simple comparison of integer vs time-based windowing
+        df = regular * 2
+        er = df.rolling(window=1)
+        r = df.rolling(window="1s")
+
+        result = getattr(r, f)()
+        expected = getattr(er, f)()
+        tm.assert_frame_equal(result, expected)
+
+        result = r.quantile(0.5)
+        expected = er.quantile(0.5)
+        tm.assert_frame_equal(result, expected)
+
+    def test_all2(self, arithmetic_win_operators):
+        f = arithmetic_win_operators
+        # more sophisticated comparison of integer vs.
+        # time-based windowing
+        df = DataFrame(
+            {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="h")
+        )
+        # in-range data
+        dft = df.between_time("09:00", "16:00")
+
+        r = dft.rolling(window="5h")
+
+        result = getattr(r, f)()
+
+        # we need to roll the days separately
+        # to compare with a time-based roll
+        # finally groupby-apply will return a multi-index
+        # so we need to drop the day
+        def agg_by_day(x):
+            x = x.between_time("09:00", "16:00")
+            return getattr(x.rolling(5, min_periods=1), f)()
+
+        expected = (
+            df.groupby(df.index.day).apply(agg_by_day).reset_index(level=0, drop=True)
+        )
+
+        tm.assert_frame_equal(result, expected)
+
+    def test_rolling_cov_offset(self):
+        # GH16058
+
+        idx = date_range("2017-01-01", periods=24, freq="1h")
+        ss = Series(np.arange(len(idx)), index=idx)
+
+        result = ss.rolling("2h").cov()
+        expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx)
+        tm.assert_series_equal(result, expected)
+
+        expected2 = ss.rolling(2, min_periods=1).cov()
+        tm.assert_series_equal(result, expected2)
+
+        result = ss.rolling("3h").cov()
+        expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx)
+        tm.assert_series_equal(result, expected)
+
+        expected2 = ss.rolling(3, min_periods=1).cov()
+        tm.assert_series_equal(result, expected2)
+
+    def test_rolling_on_decreasing_index(self, unit):
+        # GH-19248, GH-32385
+        index = DatetimeIndex(
+            [
+                Timestamp("20190101 09:00:30"),
+                Timestamp("20190101 09:00:27"),
+                Timestamp("20190101 09:00:20"),
+                Timestamp("20190101 09:00:18"),
+                Timestamp("20190101 09:00:10"),
+            ]
+        ).as_unit(unit)
+
+        df = DataFrame({"column": [3, 4, 4, 5, 6]}, index=index)
+        result = df.rolling("5s").min()
+        expected = DataFrame({"column": [3.0, 3.0, 4.0, 4.0, 6.0]}, index=index)
+        tm.assert_frame_equal(result, expected)
+
+    def test_rolling_on_empty(self):
+        # GH-32385
+        df = DataFrame({"column": []}, index=[])
+        result = df.rolling("5s").min()
+        expected = DataFrame({"column": []}, index=[])
+        tm.assert_frame_equal(result, expected)
+
+    def test_rolling_on_multi_index_level(self):
+        # GH-15584
+        df = DataFrame(
+            {"column": range(6)},
+            index=MultiIndex.from_product(
+                [date_range("20190101", periods=3), range(2)], names=["date", "seq"]
+            ),
+        )
+        result = df.rolling("10D", on=df.index.get_level_values("date")).sum()
+        expected = DataFrame(
+            {"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+def test_nat_axis_error():
+    idx = [Timestamp("2020"), NaT]
+    df = DataFrame(np.eye(2), index=idx)
+    with pytest.raises(ValueError, match="index values must not have NaT"):
+        df.rolling("D").mean()
+
+
+@td.skip_if_no("pyarrow")
+def test_arrow_datetime_axis():
+    # GH 55849
+    expected = Series(
+        np.arange(5, dtype=np.float64),
+        index=Index(
+            date_range("2020-01-01", periods=5), dtype="timestamp[ns][pyarrow]"
+        ),
+    )
+    result = expected.rolling("1D").sum()
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/window/test_win_type.py b/pandas/tests/window/test_win_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..574dfc34b6d267169bd66b73b42a73829383f78d
--- /dev/null
+++ b/pandas/tests/window/test_win_type.py
@@ -0,0 +1,670 @@
+import numpy as np
+import pytest
+
+from pandas import (
+    DataFrame,
+    Series,
+    Timedelta,
+    concat,
+    date_range,
+)
+import pandas._testing as tm
+from pandas.api.indexers import BaseIndexer
+
+
+@pytest.fixture(
+    params=[
+        "triang",
+        "blackman",
+        "hamming",
+        "bartlett",
+        "bohman",
+        "blackmanharris",
+        "nuttall",
+        "barthann",
+    ]
+)
+def win_types(request):
+    return request.param
+
+
+@pytest.fixture(params=["kaiser", "gaussian", "general_gaussian", "exponential"])
+def win_types_special(request):
+    return request.param
+
+
+def test_constructor(frame_or_series):
+    # GH 12669
+    pytest.importorskip("scipy")
+    c = frame_or_series(range(5)).rolling
+
+    # valid
+    c(win_type="boxcar", window=2, min_periods=1)
+    c(win_type="boxcar", window=2, min_periods=1, center=True)
+    c(win_type="boxcar", window=2, min_periods=1, center=False)
+
+
+@pytest.mark.parametrize("w", [2.0, "foo", np.array([2])])
+def test_invalid_constructor(frame_or_series, w):
+    # not valid
+    pytest.importorskip("scipy")
+    c = frame_or_series(range(5)).rolling
+    with pytest.raises(ValueError, match="min_periods must be an integer"):
+        c(win_type="boxcar", window=2, min_periods=w)
+    with pytest.raises(ValueError, match="center must be a boolean"):
+        c(win_type="boxcar", window=2, min_periods=1, center=w)
+
+
+@pytest.mark.parametrize("wt", ["foobar", 1])
+def test_invalid_constructor_wintype(frame_or_series, wt):
+    pytest.importorskip("scipy")
+    c = frame_or_series(range(5)).rolling
+    with pytest.raises(ValueError, match="Invalid win_type"):
+        c(win_type=wt, window=2)
+
+
+def test_constructor_with_win_type(frame_or_series, win_types):
+    # GH 12669
+    pytest.importorskip("scipy")
+    c = frame_or_series(range(5)).rolling
+    c(win_type=win_types, window=2)
+
+
+@pytest.mark.parametrize("arg", ["median", "kurt", "skew"])
+def test_agg_function_support(arg):
+    pytest.importorskip("scipy")
+    df = DataFrame({"A": np.arange(5)})
+    roll = df.rolling(2, win_type="triang")
+
+    msg = f"'{arg}' is not a valid function for 'Window' object"
+    with pytest.raises(AttributeError, match=msg):
+        roll.agg(arg)
+
+    with pytest.raises(AttributeError, match=msg):
+        roll.agg([arg])
+
+    with pytest.raises(AttributeError, match=msg):
+        roll.agg({"A": arg})
+
+
+def test_invalid_scipy_arg():
+    # This error is raised by scipy
+    pytest.importorskip("scipy")
+    msg = r"boxcar\(\) got an unexpected"
+    with pytest.raises(TypeError, match=msg):
+        Series(range(3)).rolling(1, win_type="boxcar").mean(foo="bar")
+
+
+def test_constructor_with_win_type_invalid(frame_or_series):
+    # GH 13383
+    pytest.importorskip("scipy")
+    c = frame_or_series(range(5)).rolling
+
+    msg = "window must be an integer 0 or greater"
+
+    with pytest.raises(ValueError, match=msg):
+        c(-1, win_type="boxcar")
+
+
+def test_window_with_args(step):
+    # make sure that we are aggregating window functions correctly with arg
+    pytest.importorskip("scipy")
+    r = Series(np.random.default_rng(2).standard_normal(100)).rolling(
+        window=10, min_periods=1, win_type="gaussian", step=step
+    )
+    expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1)
+    expected.columns = ["<lambda>", "<lambda>"]
+    result = r.aggregate([lambda x: x.mean(std=10), lambda x: x.mean(std=0.01)])
+    tm.assert_frame_equal(result, expected)
+
+    def a(x):
+        return x.mean(std=10)
+
+    def b(x):
+        return x.mean(std=0.01)
+
+    expected = concat([r.mean(std=10), r.mean(std=0.01)], axis=1)
+    expected.columns = ["a", "b"]
+    result = r.aggregate([a, b])
+    tm.assert_frame_equal(result, expected)
+
+
+def test_win_type_with_method_invalid():
+    pytest.importorskip("scipy")
+    with pytest.raises(
+        NotImplementedError, match="'single' is the only supported method type."
+    ):
+        Series(range(1)).rolling(1, win_type="triang", method="table")
+
+
+@pytest.mark.parametrize("arg", [2000000000, "2s", Timedelta("2s")])
+def test_consistent_win_type_freq(arg):
+    # GH 15969
+    pytest.importorskip("scipy")
+    s = Series(range(1))
+    with pytest.raises(ValueError, match="Invalid win_type freq"):
+        s.rolling(arg, win_type="freq")
+
+
+def test_win_type_freq_return_none():
+    # GH 48838
+    freq_roll = Series(range(2), index=date_range("2020", periods=2)).rolling("2s")
+    assert freq_roll.win_type is None
+
+
+def test_win_type_not_implemented():
+    pytest.importorskip("scipy")
+
+    class CustomIndexer(BaseIndexer):
+        def get_window_bounds(self, num_values, min_periods, center, closed, step):
+            return np.array([0, 1]), np.array([1, 2])
+
+    df = DataFrame({"values": range(2)})
+    indexer = CustomIndexer()
+    with pytest.raises(NotImplementedError, match="BaseIndexer subclasses not"):
+        df.rolling(indexer, win_type="boxcar")
+
+
+def test_cmov_mean(step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48])
+    result = Series(vals).rolling(5, center=True, step=step).mean()
+    expected_values = [
+        np.nan,
+        np.nan,
+        9.962,
+        11.27,
+        11.564,
+        12.516,
+        12.818,
+        12.952,
+        np.nan,
+        np.nan,
+    ]
+    expected = Series(expected_values)[::step]
+    tm.assert_series_equal(expected, result)
+
+
+def test_cmov_window(step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48])
+    result = Series(vals).rolling(5, win_type="boxcar", center=True, step=step).mean()
+    expected_values = [
+        np.nan,
+        np.nan,
+        9.962,
+        11.27,
+        11.564,
+        12.516,
+        12.818,
+        12.952,
+        np.nan,
+        np.nan,
+    ]
+    expected = Series(expected_values)[::step]
+    tm.assert_series_equal(expected, result)
+
+
+def test_cmov_window_corner(step):
+    # GH 8238
+    # all nan
+    pytest.importorskip("scipy")
+    vals = Series([np.nan] * 10)
+    result = vals.rolling(5, center=True, win_type="boxcar", step=step).mean()
+    assert np.isnan(result).all()
+
+    # empty
+    vals = Series([], dtype=object)
+    result = vals.rolling(5, center=True, win_type="boxcar", step=step).mean()
+    assert len(result) == 0
+
+    # shorter than window
+    vals = Series(np.random.default_rng(2).standard_normal(5))
+    result = vals.rolling(10, win_type="boxcar", step=step).mean()
+    assert np.isnan(result).all()
+    assert len(result) == len(range(0, 5, step or 1))
+
+
+@pytest.mark.parametrize(
+    "f,xp",
+    [
+        (
+            "mean",
+            [
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+                [9.252, 9.392],
+                [8.644, 9.906],
+                [8.87, 10.208],
+                [6.81, 8.588],
+                [7.792, 8.644],
+                [9.05, 7.824],
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+            ],
+        ),
+        (
+            "std",
+            [
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+                [3.789706, 4.068313],
+                [3.429232, 3.237411],
+                [3.589269, 3.220810],
+                [3.405195, 2.380655],
+                [3.281839, 2.369869],
+                [3.676846, 1.801799],
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+            ],
+        ),
+        (
+            "var",
+            [
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+                [14.36187, 16.55117],
+                [11.75963, 10.48083],
+                [12.88285, 10.37362],
+                [11.59535, 5.66752],
+                [10.77047, 5.61628],
+                [13.51920, 3.24648],
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+            ],
+        ),
+        (
+            "sum",
+            [
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+                [46.26, 46.96],
+                [43.22, 49.53],
+                [44.35, 51.04],
+                [34.05, 42.94],
+                [38.96, 43.22],
+                [45.25, 39.12],
+                [np.nan, np.nan],
+                [np.nan, np.nan],
+            ],
+        ),
+    ],
+)
+def test_cmov_window_frame(f, xp, step):
+    # Gh 8238
+    pytest.importorskip("scipy")
+    df = DataFrame(
+        np.array(
+            [
+                [12.18, 3.64],
+                [10.18, 9.16],
+                [13.24, 14.61],
+                [4.51, 8.11],
+                [6.15, 11.44],
+                [9.14, 6.21],
+                [11.31, 10.67],
+                [2.94, 6.51],
+                [9.42, 8.39],
+                [12.44, 7.34],
+            ]
+        )
+    )
+    xp = DataFrame(np.array(xp))[::step]
+
+    roll = df.rolling(5, win_type="boxcar", center=True, step=step)
+    rs = getattr(roll, f)()
+
+    tm.assert_frame_equal(xp, rs)
+
+
+@pytest.mark.parametrize("min_periods", [0, 1, 2, 3, 4, 5])
+def test_cmov_window_na_min_periods(step, min_periods):
+    pytest.importorskip("scipy")
+    vals = Series(np.random.default_rng(2).standard_normal(10))
+    vals[4] = np.nan
+    vals[8] = np.nan
+
+    xp = vals.rolling(5, min_periods=min_periods, center=True, step=step).mean()
+    rs = vals.rolling(
+        5, win_type="boxcar", min_periods=min_periods, center=True, step=step
+    ).mean()
+    tm.assert_series_equal(xp, rs)
+
+
+def test_cmov_window_regular(win_types, step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48])
+    xps = {
+        "hamming": [
+            np.nan,
+            np.nan,
+            8.71384,
+            9.56348,
+            12.38009,
+            14.03687,
+            13.8567,
+            11.81473,
+            np.nan,
+            np.nan,
+        ],
+        "triang": [
+            np.nan,
+            np.nan,
+            9.28667,
+            10.34667,
+            12.00556,
+            13.33889,
+            13.38,
+            12.33667,
+            np.nan,
+            np.nan,
+        ],
+        "barthann": [
+            np.nan,
+            np.nan,
+            8.4425,
+            9.1925,
+            12.5575,
+            14.3675,
+            14.0825,
+            11.5675,
+            np.nan,
+            np.nan,
+        ],
+        "bohman": [
+            np.nan,
+            np.nan,
+            7.61599,
+            9.1764,
+            12.83559,
+            14.17267,
+            14.65923,
+            11.10401,
+            np.nan,
+            np.nan,
+        ],
+        "blackmanharris": [
+            np.nan,
+            np.nan,
+            6.97691,
+            9.16438,
+            13.05052,
+            14.02156,
+            15.10512,
+            10.74574,
+            np.nan,
+            np.nan,
+        ],
+        "nuttall": [
+            np.nan,
+            np.nan,
+            7.04618,
+            9.16786,
+            13.02671,
+            14.03559,
+            15.05657,
+            10.78514,
+            np.nan,
+            np.nan,
+        ],
+        "blackman": [
+            np.nan,
+            np.nan,
+            7.73345,
+            9.17869,
+            12.79607,
+            14.20036,
+            14.57726,
+            11.16988,
+            np.nan,
+            np.nan,
+        ],
+        "bartlett": [
+            np.nan,
+            np.nan,
+            8.4425,
+            9.1925,
+            12.5575,
+            14.3675,
+            14.0825,
+            11.5675,
+            np.nan,
+            np.nan,
+        ],
+    }
+
+    xp = Series(xps[win_types])[::step]
+    rs = Series(vals).rolling(5, win_type=win_types, center=True, step=step).mean()
+    tm.assert_series_equal(xp, rs)
+
+
+def test_cmov_window_regular_linear_range(win_types, step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    vals = np.array(range(10), dtype=float)
+    rs = Series(vals).rolling(5, win_type=win_types, center=True, step=step).mean()
+    xp = vals
+    xp[:2] = np.nan
+    xp[-2:] = np.nan
+    xp = Series(xp)[::step]
+
+    tm.assert_series_equal(xp, rs)
+
+
+def test_cmov_window_regular_missing_data(win_types, step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    vals = np.array(
+        [6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan, 10.63, 14.48]
+    )
+    xps = {
+        "bartlett": [
+            np.nan,
+            np.nan,
+            9.70333,
+            10.5225,
+            8.4425,
+            9.1925,
+            12.5575,
+            14.3675,
+            15.61667,
+            13.655,
+        ],
+        "blackman": [
+            np.nan,
+            np.nan,
+            9.04582,
+            11.41536,
+            7.73345,
+            9.17869,
+            12.79607,
+            14.20036,
+            15.8706,
+            13.655,
+        ],
+        "barthann": [
+            np.nan,
+            np.nan,
+            9.70333,
+            10.5225,
+            8.4425,
+            9.1925,
+            12.5575,
+            14.3675,
+            15.61667,
+            13.655,
+        ],
+        "bohman": [
+            np.nan,
+            np.nan,
+            8.9444,
+            11.56327,
+            7.61599,
+            9.1764,
+            12.83559,
+            14.17267,
+            15.90976,
+            13.655,
+        ],
+        "hamming": [
+            np.nan,
+            np.nan,
+            9.59321,
+            10.29694,
+            8.71384,
+            9.56348,
+            12.38009,
+            14.20565,
+            15.24694,
+            13.69758,
+        ],
+        "nuttall": [
+            np.nan,
+            np.nan,
+            8.47693,
+            12.2821,
+            7.04618,
+            9.16786,
+            13.02671,
+            14.03673,
+            16.08759,
+            13.65553,
+        ],
+        "triang": [
+            np.nan,
+            np.nan,
+            9.33167,
+            9.76125,
+            9.28667,
+            10.34667,
+            12.00556,
+            13.82125,
+            14.49429,
+            13.765,
+        ],
+        "blackmanharris": [
+            np.nan,
+            np.nan,
+            8.42526,
+            12.36824,
+            6.97691,
+            9.16438,
+            13.05052,
+            14.02175,
+            16.1098,
+            13.65509,
+        ],
+    }
+
+    xp = Series(xps[win_types])[::step]
+    rs = Series(vals).rolling(5, win_type=win_types, min_periods=3, step=step).mean()
+    tm.assert_series_equal(xp, rs)
+
+
+def test_cmov_window_special(win_types_special, step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    kwds = {
+        "kaiser": {"beta": 1.0},
+        "gaussian": {"std": 1.0},
+        "general_gaussian": {"p": 2.0, "sig": 2.0},
+        "exponential": {"tau": 10},
+    }
+
+    vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, 10.63, 14.48])
+
+    xps = {
+        "gaussian": [
+            np.nan,
+            np.nan,
+            8.97297,
+            9.76077,
+            12.24763,
+            13.89053,
+            13.65671,
+            12.01002,
+            np.nan,
+            np.nan,
+        ],
+        "general_gaussian": [
+            np.nan,
+            np.nan,
+            9.85011,
+            10.71589,
+            11.73161,
+            13.08516,
+            12.95111,
+            12.74577,
+            np.nan,
+            np.nan,
+        ],
+        "kaiser": [
+            np.nan,
+            np.nan,
+            9.86851,
+            11.02969,
+            11.65161,
+            12.75129,
+            12.90702,
+            12.83757,
+            np.nan,
+            np.nan,
+        ],
+        "exponential": [
+            np.nan,
+            np.nan,
+            9.83364,
+            11.10472,
+            11.64551,
+            12.66138,
+            12.92379,
+            12.83770,
+            np.nan,
+            np.nan,
+        ],
+    }
+
+    xp = Series(xps[win_types_special])[::step]
+    rs = (
+        Series(vals)
+        .rolling(5, win_type=win_types_special, center=True, step=step)
+        .mean(**kwds[win_types_special])
+    )
+    tm.assert_series_equal(xp, rs)
+
+
+def test_cmov_window_special_linear_range(win_types_special, step):
+    # GH 8238
+    pytest.importorskip("scipy")
+    kwds = {
+        "kaiser": {"beta": 1.0},
+        "gaussian": {"std": 1.0},
+        "general_gaussian": {"p": 2.0, "sig": 2.0},
+        "slepian": {"width": 0.5},
+        "exponential": {"tau": 10},
+    }
+
+    vals = np.array(range(10), dtype=float)
+    rs = (
+        Series(vals)
+        .rolling(5, win_type=win_types_special, center=True, step=step)
+        .mean(**kwds[win_types_special])
+    )
+    xp = vals
+    xp[:2] = np.nan
+    xp[-2:] = np.nan
+    xp = Series(xp)[::step]
+    tm.assert_series_equal(xp, rs)
+
+
+def test_weighted_var_big_window_no_segfault(win_types, center):
+    # GitHub Issue #46772
+    pytest.importorskip("scipy")
+    x = Series(0)
+    result = x.rolling(window=16, center=center, win_type=win_types).var()
+    expected = Series(np.nan)
+
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c00843ecac418a41b01470db93388f6c5568ea6b
--- /dev/null
+++ b/pandas/tseries/__init__.py
@@ -0,0 +1,12 @@
+# ruff: noqa: TC004
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    # import modules that have public classes/functions:
+    from pandas.tseries import (
+        frequencies,
+        offsets,
+    )
+
+    # and mark only those modules as public
+    __all__ = ["frequencies", "offsets"]
diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ea899f1610a7ef223f1c069552bedf4c502a8b4
--- /dev/null
+++ b/pandas/tseries/api.py
@@ -0,0 +1,10 @@
+"""
+Timeseries API
+"""
+
+from pandas._libs.tslibs.parsing import guess_datetime_format
+
+from pandas.tseries import offsets
+from pandas.tseries.frequencies import infer_freq
+
+__all__ = ["guess_datetime_format", "infer_freq", "offsets"]
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
new file mode 100644
index 0000000000000000000000000000000000000000..196b3aadccaefe4bbc4cb862d36b035feeb92e93
--- /dev/null
+++ b/pandas/tseries/frequencies.py
@@ -0,0 +1,623 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.algos import unique_deltas
+from pandas._libs.tslibs import (
+    Timestamp,
+    get_unit_from_dtype,
+    periods_per_day,
+    tz_convert_from_utc,
+)
+from pandas._libs.tslibs.ccalendar import (
+    DAYS,
+    MONTH_ALIASES,
+    MONTH_NUMBERS,
+    MONTHS,
+    int_to_weekday,
+)
+from pandas._libs.tslibs.dtypes import OFFSET_TO_PERIOD_FREQSTR
+from pandas._libs.tslibs.fields import (
+    build_field_sarray,
+    month_position_check,
+)
+from pandas._libs.tslibs.offsets import (
+    DateOffset,
+    Day,
+    to_offset,
+)
+from pandas._libs.tslibs.parsing import get_rule_month
+from pandas.util._decorators import (
+    cache_readonly,
+    set_module,
+)
+
+from pandas.core.dtypes.common import is_numeric_dtype
+from pandas.core.dtypes.dtypes import (
+    ArrowDtype,
+    DatetimeTZDtype,
+    PeriodDtype,
+)
+from pandas.core.dtypes.generic import (
+    ABCIndex,
+    ABCSeries,
+)
+
+from pandas.core.algorithms import unique
+
+if TYPE_CHECKING:
+    from pandas._typing import npt
+
+    from pandas import (
+        DatetimeIndex,
+        Series,
+        TimedeltaIndex,
+    )
+    from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
+# --------------------------------------------------------------------
+# Offset related functions
+
+_need_suffix = ["QS", "BQE", "BQS", "YS", "BYE", "BYS"]
+
+for _prefix in _need_suffix:
+    for _m in MONTHS:
+        key = f"{_prefix}-{_m}"
+        OFFSET_TO_PERIOD_FREQSTR[key] = OFFSET_TO_PERIOD_FREQSTR[_prefix]
+
+for _prefix in ["Y", "Q"]:
+    for _m in MONTHS:
+        _alias = f"{_prefix}-{_m}"
+        OFFSET_TO_PERIOD_FREQSTR[_alias] = _alias
+
+for _d in DAYS:
+    OFFSET_TO_PERIOD_FREQSTR[f"W-{_d}"] = f"W-{_d}"
+
+
+def get_period_alias(offset_str: str) -> str | None:
+    """
+    Alias to closest period strings BQ->Q etc.
+    """
+    return OFFSET_TO_PERIOD_FREQSTR.get(offset_str, None)
+
+
+# ---------------------------------------------------------------------
+# Period codes
+
+
+@set_module("pandas")
+def infer_freq(
+    index: DatetimeIndex | TimedeltaIndex | Series | DatetimeLikeArrayMixin,
+) -> str | None:
+    """
+    Infer the most likely frequency given the input index.
+
+    This method attempts to deduce the most probable frequency (e.g., 'D' for daily,
+    'H' for hourly) from a sequence of datetime-like objects. It is particularly useful
+    when the frequency of a time series is not explicitly set or known but can be
+    inferred from its values.
+
+    Parameters
+    ----------
+    index : DatetimeIndex, TimedeltaIndex, Series or array-like
+      If passed a Series will use the values of the series (NOT THE INDEX).
+
+    Returns
+    -------
+    str or None
+        None if no discernible frequency.
+
+    Raises
+    ------
+    TypeError
+        If the index is not datetime-like.
+    ValueError
+        If there are fewer than three values.
+
+    See Also
+    --------
+    date_range : Return a fixed frequency DatetimeIndex.
+    timedelta_range : Return a fixed frequency TimedeltaIndex with day as the default.
+    period_range : Return a fixed frequency PeriodIndex.
+    DatetimeIndex.freq : Return the frequency object if it is set, otherwise None.
+
+    Examples
+    --------
+    >>> idx = pd.date_range(start="2020/12/01", end="2020/12/30", periods=30)
+    >>> pd.infer_freq(idx)
+    'D'
+    """
+    from pandas.core.api import DatetimeIndex
+
+    if isinstance(index, ABCSeries):
+        values = index._values
+
+        if isinstance(index.dtype, ArrowDtype):
+            import pyarrow as pa
+
+            if pa.types.is_timestamp(values.dtype.pyarrow_dtype):
+                # GH#58403
+                values = values._to_datetimearray()
+
+        if not (
+            lib.is_np_dtype(values.dtype, "mM")
+            or isinstance(values.dtype, DatetimeTZDtype)
+            or values.dtype == object
+        ):
+            raise TypeError(
+                "cannot infer freq from a non-convertible dtype "
+                f"on a Series of {index.dtype}"
+            )
+        index = values
+
+    inferer: _FrequencyInferer
+
+    if not hasattr(index, "dtype"):
+        pass
+    elif isinstance(index.dtype, PeriodDtype):
+        raise TypeError(
+            "PeriodIndex given. Check the `freq` attribute instead of using infer_freq."
+        )
+    elif lib.is_np_dtype(index.dtype, "m"):
+        # Allow TimedeltaIndex and TimedeltaArray
+        inferer = _TimedeltaFrequencyInferer(index)
+        return inferer.get_freq()
+
+    elif is_numeric_dtype(index.dtype):
+        raise TypeError(
+            f"cannot infer freq from a non-convertible index of dtype {index.dtype}"
+        )
+
+    if not isinstance(index, DatetimeIndex):
+        index = DatetimeIndex(index, copy=False)
+
+    inferer = _FrequencyInferer(index)
+    return inferer.get_freq()
+
+
+class _FrequencyInferer:
+    """
+    Not sure if I can avoid the state machine here
+    """
+
+    def __init__(self, index) -> None:
+        self.index = index
+        self.i8values = index.asi8
+
+        # For get_unit_from_dtype we need the dtype to the underlying ndarray,
+        #  which for tz-aware is not the same as index.dtype
+        if isinstance(index, ABCIndex):
+            # error: Item "ndarray[Any, Any]" of "Union[ExtensionArray,
+            # ndarray[Any, Any]]" has no attribute "_ndarray"
+            self._creso = get_unit_from_dtype(
+                index._data._ndarray.dtype  # type: ignore[union-attr]
+            )
+        else:
+            # otherwise we have DTA/TDA
+            self._creso = get_unit_from_dtype(index._ndarray.dtype)
+
+        # This moves the values, which are implicitly in UTC, to the
+        # the timezone so they are in local time
+        if hasattr(index, "tz"):
+            if index.tz is not None:
+                self.i8values = tz_convert_from_utc(
+                    self.i8values, index.tz, reso=self._creso
+                )
+
+        if len(index) < 3:
+            raise ValueError("Need at least 3 dates to infer frequency")
+
+        self.is_monotonic = (
+            self.index._is_monotonic_increasing or self.index._is_monotonic_decreasing
+        )
+
+    @cache_readonly
+    def deltas(self) -> npt.NDArray[np.int64]:
+        return unique_deltas(self.i8values)
+
+    @cache_readonly
+    def deltas_asi8(self) -> npt.NDArray[np.int64]:
+        # NB: we cannot use self.i8values here because we may have converted
+        #  the tz in __init__
+        return unique_deltas(self.index.asi8)
+
+    @cache_readonly
+    def is_unique(self) -> bool:
+        return len(self.deltas) == 1
+
+    @cache_readonly
+    def is_unique_asi8(self) -> bool:
+        return len(self.deltas_asi8) == 1
+
+    def get_freq(self) -> str | None:
+        """
+        Find the appropriate frequency string to describe the inferred
+        frequency of self.i8values
+
+        Returns
+        -------
+        str or None
+        """
+        if not self.is_monotonic or not self.index._is_unique:
+            return None
+
+        delta = self.deltas[0]
+        ppd = periods_per_day(self._creso)
+        if delta and _is_multiple(delta, ppd):
+            return self._infer_daily_rule()
+
+        # Business hourly, maybe. 17: one day / 65: one weekend
+        if self.hour_deltas in ([1, 17], [1, 65], [1, 17, 65]):
+            return "bh"
+
+        # Possibly intraday frequency.  Here we use the
+        # original .asi8 values as the modified values
+        # will not work around DST transitions.  See #8772
+        if not self.is_unique_asi8:
+            return None
+
+        delta = self.deltas_asi8[0]
+        pph = ppd // 24
+        ppm = pph // 60
+        pps = ppm // 60
+        if _is_multiple(delta, pph):
+            # Hours
+            return _maybe_add_count("h", delta / pph)
+        elif _is_multiple(delta, ppm):
+            # Minutes
+            return _maybe_add_count("min", delta / ppm)
+        elif _is_multiple(delta, pps):
+            # Seconds
+            return _maybe_add_count("s", delta / pps)
+        elif _is_multiple(delta, (pps // 1000)):
+            # Milliseconds
+            return _maybe_add_count("ms", delta / (pps // 1000))
+        elif _is_multiple(delta, (pps // 1_000_000)):
+            # Microseconds
+            return _maybe_add_count("us", delta / (pps // 1_000_000))
+        else:
+            # Nanoseconds
+            return _maybe_add_count("ns", delta)
+
+    @cache_readonly
+    def day_deltas(self) -> list[int]:
+        ppd = periods_per_day(self._creso)
+        return [x / ppd for x in self.deltas]
+
+    @cache_readonly
+    def hour_deltas(self) -> list[int]:
+        pph = periods_per_day(self._creso) // 24
+        return [x / pph for x in self.deltas]
+
+    @cache_readonly
+    def fields(self) -> np.ndarray:  # structured array of fields
+        return build_field_sarray(self.i8values, reso=self._creso)
+
+    @cache_readonly
+    def rep_stamp(self) -> Timestamp:
+        return Timestamp(self.i8values[0], unit=self.index.unit)
+
+    def month_position_check(self) -> str | None:
+        return month_position_check(self.fields, self.index.dayofweek)
+
+    @cache_readonly
+    def mdiffs(self) -> npt.NDArray[np.int64]:
+        nmonths = self.fields["Y"] * 12 + self.fields["M"]
+        return unique_deltas(nmonths.astype("i8"))
+
+    @cache_readonly
+    def ydiffs(self) -> npt.NDArray[np.int64]:
+        return unique_deltas(self.fields["Y"].astype("i8"))
+
+    def _infer_daily_rule(self) -> str | None:
+        annual_rule = self._get_annual_rule()
+        if annual_rule:
+            nyears = self.ydiffs[0]
+            month = MONTH_ALIASES[self.rep_stamp.month]
+            alias = f"{annual_rule}-{month}"
+            return _maybe_add_count(alias, nyears)
+
+        quarterly_rule = self._get_quarterly_rule()
+        if quarterly_rule:
+            nquarters = self.mdiffs[0] / 3
+            mod_dict = {0: 12, 2: 11, 1: 10}
+            month = MONTH_ALIASES[mod_dict[self.rep_stamp.month % 3]]
+            alias = f"{quarterly_rule}-{month}"
+            return _maybe_add_count(alias, nquarters)
+
+        monthly_rule = self._get_monthly_rule()
+        if monthly_rule:
+            return _maybe_add_count(monthly_rule, self.mdiffs[0])
+
+        if self.is_unique:
+            return self._get_daily_rule()
+
+        if self._is_business_daily():
+            return "B"
+
+        wom_rule = self._get_wom_rule()
+        if wom_rule:
+            return wom_rule
+
+        return None
+
+    def _get_daily_rule(self) -> str | None:
+        ppd = periods_per_day(self._creso)
+        days = self.deltas[0] / ppd
+        if days % 7 == 0:
+            # Weekly
+            wd = int_to_weekday[self.rep_stamp.weekday()]
+            alias = f"W-{wd}"
+            return _maybe_add_count(alias, days / 7)
+        else:
+            return _maybe_add_count("D", days)
+
+    def _get_annual_rule(self) -> str | None:
+        if len(self.ydiffs) > 1:
+            return None
+
+        if len(unique(self.fields["M"])) > 1:
+            return None
+
+        pos_check = self.month_position_check()
+
+        if pos_check is None:
+            return None
+        else:
+            return {"cs": "YS", "bs": "BYS", "ce": "YE", "be": "BYE"}.get(pos_check)
+
+    def _get_quarterly_rule(self) -> str | None:
+        if len(self.mdiffs) > 1:
+            return None
+
+        if not self.mdiffs[0] % 3 == 0:
+            return None
+
+        pos_check = self.month_position_check()
+
+        if pos_check is None:
+            return None
+        else:
+            return {"cs": "QS", "bs": "BQS", "ce": "QE", "be": "BQE"}.get(pos_check)
+
+    def _get_monthly_rule(self) -> str | None:
+        if len(self.mdiffs) > 1:
+            return None
+        pos_check = self.month_position_check()
+
+        if pos_check is None:
+            return None
+        else:
+            return {"cs": "MS", "bs": "BMS", "ce": "ME", "be": "BME"}.get(pos_check)
+
+    def _is_business_daily(self) -> bool:
+        # quick check: cannot be business daily
+        if self.day_deltas != [1, 3]:
+            return False
+
+        # probably business daily, but need to confirm
+        first_weekday = self.index[0].weekday()
+        shifts = np.diff(self.i8values)
+        ppd = periods_per_day(self._creso)
+        shifts = np.floor_divide(shifts, ppd)
+        weekdays = np.mod(first_weekday + np.cumsum(shifts), 7)
+
+        return bool(
+            np.all(
+                ((weekdays == 0) & (shifts == 3))
+                | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))
+            )
+        )
+
+    def _get_wom_rule(self) -> str | None:
+        weekdays = unique(self.index.weekday)
+        if len(weekdays) > 1:
+            return None
+
+        week_of_months = unique((self.index.day - 1) // 7)
+        # Only attempt to infer up to WOM-4. See #9425
+        week_of_months = week_of_months[week_of_months < 4]
+        if len(week_of_months) == 0 or len(week_of_months) > 1:
+            return None
+
+        # get which week
+        week = week_of_months[0] + 1
+        wd = int_to_weekday[weekdays[0]]
+
+        return f"WOM-{week}{wd}"
+
+
+class _TimedeltaFrequencyInferer(_FrequencyInferer):
+    def _infer_daily_rule(self):
+        if self.is_unique:
+            return self._get_daily_rule()
+
+
+def _is_multiple(us, mult: int) -> bool:
+    return us % mult == 0
+
+
+def _maybe_add_count(base: str, count: float) -> str:
+    if count != 1:
+        assert count == int(count)
+        count = int(count)
+        return f"{count}{base}"
+    else:
+        return base
+
+
+# ----------------------------------------------------------------------
+# Frequency comparison
+
+
+def is_subperiod(source, target) -> bool:
+    """
+    Returns True if downsampling is possible between source and target
+    frequencies
+
+    Parameters
+    ----------
+    source : str or DateOffset
+        Frequency converting from
+    target : str or DateOffset
+        Frequency converting to
+
+    Returns
+    -------
+    bool
+    """
+    if target is None or source is None:
+        return False
+    source = _maybe_coerce_freq(source)
+    target = _maybe_coerce_freq(target)
+
+    if _is_annual(target):
+        if _is_quarterly(source):
+            return _quarter_months_conform(
+                get_rule_month(source), get_rule_month(target)
+            )
+        return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
+    elif _is_quarterly(target):
+        return source in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
+    elif _is_monthly(target):
+        return source in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif _is_weekly(target):
+        return source in {target, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif target == "B":
+        return source in {"B", "h", "min", "s", "ms", "us", "ns"}
+    elif target == "C":
+        return source in {"C", "h", "min", "s", "ms", "us", "ns"}
+    elif target == "D":
+        return source in {"D", "h", "min", "s", "ms", "us", "ns"}
+    elif target == "h":
+        return source in {"h", "min", "s", "ms", "us", "ns"}
+    elif target == "min":
+        return source in {"min", "s", "ms", "us", "ns"}
+    elif target == "s":
+        return source in {"s", "ms", "us", "ns"}
+    elif target == "ms":
+        return source in {"ms", "us", "ns"}
+    elif target == "us":
+        return source in {"us", "ns"}
+    elif target == "ns":
+        return source in {"ns"}
+    else:
+        return False
+
+
+def is_superperiod(source, target) -> bool:
+    """
+    Returns True if upsampling is possible between source and target
+    frequencies
+
+    Parameters
+    ----------
+    source : str or DateOffset
+        Frequency converting from
+    target : str or DateOffset
+        Frequency converting to
+
+    Returns
+    -------
+    bool
+    """
+    if target is None or source is None:
+        return False
+    source = _maybe_coerce_freq(source)
+    target = _maybe_coerce_freq(target)
+
+    if _is_annual(source):
+        if _is_annual(target):
+            return get_rule_month(source) == get_rule_month(target)
+
+        if _is_quarterly(target):
+            smonth = get_rule_month(source)
+            tmonth = get_rule_month(target)
+            return _quarter_months_conform(smonth, tmonth)
+        return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
+    elif _is_quarterly(source):
+        return target in {"D", "C", "B", "M", "h", "min", "s", "ms", "us", "ns"}
+    elif _is_monthly(source):
+        return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif _is_weekly(source):
+        return target in {source, "D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif source == "B":
+        return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif source == "C":
+        return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif source == "D":
+        return target in {"D", "C", "B", "h", "min", "s", "ms", "us", "ns"}
+    elif source == "h":
+        return target in {"h", "min", "s", "ms", "us", "ns"}
+    elif source == "min":
+        return target in {"min", "s", "ms", "us", "ns"}
+    elif source == "s":
+        return target in {"s", "ms", "us", "ns"}
+    elif source == "ms":
+        return target in {"ms", "us", "ns"}
+    elif source == "us":
+        return target in {"us", "ns"}
+    elif source == "ns":
+        return target in {"ns"}
+    else:
+        return False
+
+
+def _maybe_coerce_freq(code) -> str:
+    """we might need to coerce a code to a rule_code
+    and uppercase it
+
+    Parameters
+    ----------
+    source : str or DateOffset
+        Frequency converting from
+
+    Returns
+    -------
+    str
+    """
+    assert code is not None
+    if isinstance(code, DateOffset):
+        code = PeriodDtype(to_offset(code.name))._freqstr
+    if code in {"h", "min", "s", "ms", "us", "ns"}:
+        return code
+    else:
+        return code.upper()
+
+
+def _quarter_months_conform(source: str, target: str) -> bool:
+    snum = MONTH_NUMBERS[source]
+    tnum = MONTH_NUMBERS[target]
+    return snum % 3 == tnum % 3
+
+
+def _is_annual(rule: str) -> bool:
+    rule = rule.upper()
+    return rule == "Y" or rule.startswith("Y-")
+
+
+def _is_quarterly(rule: str) -> bool:
+    rule = rule.upper()
+    return rule == "Q" or rule.startswith(("Q-", "BQ"))
+
+
+def _is_monthly(rule: str) -> bool:
+    rule = rule.upper()
+    return rule in ("M", "BM")
+
+
+def _is_weekly(rule: str) -> bool:
+    rule = rule.upper()
+    return rule == "W" or rule.startswith("W-")
+
+
+__all__ = [
+    "Day",
+    "get_period_alias",
+    "infer_freq",
+    "is_subperiod",
+    "is_superperiod",
+    "to_offset",
+]
diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5ab8cb2eb8bea8e239baca2ff0794931e264b7d
--- /dev/null
+++ b/pandas/tseries/holiday.py
@@ -0,0 +1,682 @@
+from __future__ import annotations
+
+from datetime import (
+    datetime,
+    timedelta,
+)
+from typing import (
+    TYPE_CHECKING,
+    Literal,
+    overload,
+)
+import warnings
+
+from dateutil.relativedelta import (
+    FR,
+    MO,
+    SA,
+    SU,
+    TH,
+    TU,
+    WE,
+)
+import numpy as np
+
+from pandas._libs.tslibs.offsets import BaseOffset
+from pandas.errors import PerformanceWarning
+
+from pandas import (
+    DateOffset,
+    DatetimeIndex,
+    Series,
+    Timestamp,
+    concat,
+    date_range,
+)
+
+from pandas.tseries.offsets import (
+    Day,
+    Easter,
+)
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+def next_monday(dt: datetime) -> datetime:
+    """
+    If holiday falls on Saturday, use following Monday instead;
+    if holiday falls on Sunday, use Monday instead
+    """
+    if dt.weekday() == 5:
+        return dt + timedelta(2)
+    elif dt.weekday() == 6:
+        return dt + timedelta(1)
+    return dt
+
+
+def next_monday_or_tuesday(dt: datetime) -> datetime:
+    """
+    For second holiday of two adjacent ones!
+    If holiday falls on Saturday, use following Monday instead;
+    if holiday falls on Sunday or Monday, use following Tuesday instead
+    (because Monday is already taken by adjacent holiday on the day before)
+    """
+    dow = dt.weekday()
+    if dow in (5, 6):
+        return dt + timedelta(2)
+    if dow == 0:
+        return dt + timedelta(1)
+    return dt
+
+
+def previous_friday(dt: datetime) -> datetime:
+    """
+    If holiday falls on Saturday or Sunday, use previous Friday instead.
+    """
+    if dt.weekday() == 5:
+        return dt - timedelta(1)
+    elif dt.weekday() == 6:
+        return dt - timedelta(2)
+    return dt
+
+
+def sunday_to_monday(dt: datetime) -> datetime:
+    """
+    If holiday falls on Sunday, use day thereafter (Monday) instead.
+    """
+    if dt.weekday() == 6:
+        return dt + timedelta(1)
+    return dt
+
+
+def weekend_to_monday(dt: datetime) -> datetime:
+    """
+    If holiday falls on Sunday or Saturday,
+    use day thereafter (Monday) instead.
+    Needed for holidays such as Christmas observation in Europe
+    """
+    if dt.weekday() == 6:
+        return dt + timedelta(1)
+    elif dt.weekday() == 5:
+        return dt + timedelta(2)
+    return dt
+
+
+def nearest_workday(dt: datetime) -> datetime:
+    """
+    If holiday falls on Saturday, use day before (Friday) instead;
+    if holiday falls on Sunday, use day thereafter (Monday) instead.
+    """
+    if dt.weekday() == 5:
+        return dt - timedelta(1)
+    elif dt.weekday() == 6:
+        return dt + timedelta(1)
+    return dt
+
+
+def next_workday(dt: datetime) -> datetime:
+    """
+    returns next workday used for observances
+    """
+    dt += timedelta(days=1)
+    while dt.weekday() > 4:
+        # Mon-Fri are 0-4
+        dt += timedelta(days=1)
+    return dt
+
+
+def previous_workday(dt: datetime) -> datetime:
+    """
+    returns previous workday used for observances
+    """
+    dt -= timedelta(days=1)
+    while dt.weekday() > 4:
+        # Mon-Fri are 0-4
+        dt -= timedelta(days=1)
+    return dt
+
+
+def before_nearest_workday(dt: datetime) -> datetime:
+    """
+    returns previous workday before nearest workday
+    """
+    return previous_workday(nearest_workday(dt))
+
+
+def after_nearest_workday(dt: datetime) -> datetime:
+    """
+    returns next workday after nearest workday
+    needed for Boxing day or multiple holidays in a series
+    """
+    return next_workday(nearest_workday(dt))
+
+
+class Holiday:
+    """
+    Class that defines a holiday with start/end dates and rules
+    for observance.
+    """
+
+    start_date: Timestamp | None
+    end_date: Timestamp | None
+    days_of_week: tuple[int, ...] | None
+
+    def __init__(
+        self,
+        name: str,
+        year=None,
+        month=None,
+        day=None,
+        offset: BaseOffset | list[BaseOffset] | None = None,
+        observance: Callable | None = None,
+        start_date=None,
+        end_date=None,
+        days_of_week: tuple | None = None,
+        exclude_dates: DatetimeIndex | None = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        name : str
+            Name of the holiday , defaults to class name
+        year : int, default None
+            Year of the holiday
+        month : int, default None
+            Month of the holiday
+        day : int, default None
+            Day of the holiday
+        offset : list of pandas.tseries.offsets or
+                class from pandas.tseries.offsets, default None
+            Computes offset from date
+        observance : function, default None
+            Computes when holiday is given a pandas Timestamp
+        start_date : datetime-like, default None
+            First date the holiday is observed
+        end_date : datetime-like, default None
+            Last date the holiday is observed
+        days_of_week : tuple of int or dateutil.relativedelta weekday strs, default None
+            Provide a tuple of days e.g  (0,1,2,3,) for Monday through Thursday
+            Monday=0,..,Sunday=6
+            Only instances of the holiday included in days_of_week will be computed
+        exclude_dates : DatetimeIndex or default None
+            Specific dates to exclude e.g. skipping a specific year's holiday
+
+        Examples
+        --------
+        >>> from dateutil.relativedelta import MO
+
+        >>> USMemorialDay = pd.tseries.holiday.Holiday(
+        ...     "Memorial Day", month=5, day=31, offset=pd.DateOffset(weekday=MO(-1))
+        ... )
+        >>> USMemorialDay
+        Holiday: Memorial Day (month=5, day=31, offset=<DateOffset: weekday=MO(-1)>)
+
+        >>> USLaborDay = pd.tseries.holiday.Holiday(
+        ...     "Labor Day", month=9, day=1, offset=pd.DateOffset(weekday=MO(1))
+        ... )
+        >>> USLaborDay
+        Holiday: Labor Day (month=9, day=1, offset=<DateOffset: weekday=MO(+1)>)
+
+        >>> July3rd = pd.tseries.holiday.Holiday("July 3rd", month=7, day=3)
+        >>> July3rd
+        Holiday: July 3rd (month=7, day=3, )
+
+        >>> NewYears = pd.tseries.holiday.Holiday(
+        ...     "New Years Day",
+        ...     month=1,
+        ...     day=1,
+        ...     observance=pd.tseries.holiday.nearest_workday,
+        ... )
+        >>> NewYears  # doctest: +SKIP
+        Holiday: New Years Day (
+            month=1, day=1, observance=<function nearest_workday at 0x66545e9bc440>
+        )
+
+        >>> July3rd = pd.tseries.holiday.Holiday(
+        ...     "July 3rd", month=7, day=3, days_of_week=(0, 1, 2, 3)
+        ... )
+        >>> July3rd
+        Holiday: July 3rd (month=7, day=3, )
+        """
+        if offset is not None:
+            if observance is not None:
+                raise NotImplementedError("Cannot use both offset and observance.")
+            if not (
+                isinstance(offset, BaseOffset)
+                or (
+                    isinstance(offset, list)
+                    and all(isinstance(off, BaseOffset) for off in offset)
+                )
+            ):
+                raise ValueError(
+                    "Only BaseOffsets and flat lists of them are supported for offset."
+                )
+
+        self.name = name
+        self.year = year
+        self.month = month
+        self.day = day
+        self.offset = offset
+        self.start_date = (
+            Timestamp(start_date) if start_date is not None else start_date
+        )
+        self.end_date = Timestamp(end_date) if end_date is not None else end_date
+        self.observance = observance
+        if not (days_of_week is None or isinstance(days_of_week, tuple)):
+            raise ValueError("days_of_week must be None or tuple.")
+        self.days_of_week = days_of_week
+        if not (exclude_dates is None or isinstance(exclude_dates, DatetimeIndex)):
+            raise ValueError("exclude_dates must be None or of type DatetimeIndex.")
+        self.exclude_dates = exclude_dates
+
+    def __repr__(self) -> str:
+        info = ""
+        if self.year is not None:
+            info += f"year={self.year}, "
+        info += f"month={self.month}, day={self.day}, "
+
+        if self.offset is not None:
+            info += f"offset={self.offset}"
+
+        if self.observance is not None:
+            info += f"observance={self.observance}"
+
+        repr = f"Holiday: {self.name} ({info})"
+        return repr
+
+    @overload
+    def dates(self, start_date, end_date, return_name: Literal[True]) -> Series: ...
+
+    @overload
+    def dates(
+        self, start_date, end_date, return_name: Literal[False]
+    ) -> DatetimeIndex: ...
+
+    @overload
+    def dates(self, start_date, end_date) -> DatetimeIndex: ...
+
+    def dates(
+        self, start_date, end_date, return_name: bool = False
+    ) -> Series | DatetimeIndex:
+        """
+        Calculate holidays observed between start date and end date
+
+        Parameters
+        ----------
+        start_date : starting date, datetime-like, optional
+        end_date : ending date, datetime-like, optional
+        return_name : bool, optional, default=False
+            If True, return a series that has dates and holiday names.
+            False will only return dates.
+
+        Returns
+        -------
+        Series or DatetimeIndex
+            Series if return_name is True
+        """
+        start_date = Timestamp(start_date)
+        end_date = Timestamp(end_date)
+
+        filter_start_date = start_date
+        filter_end_date = end_date
+
+        if self.year is not None:
+            dt = Timestamp(datetime(self.year, self.month, self.day))
+            dti = DatetimeIndex([dt])
+            if return_name:
+                return Series(self.name, index=dti)
+            else:
+                return dti
+
+        dates = self._reference_dates(start_date, end_date)
+        holiday_dates = self._apply_rule(dates)
+        if self.days_of_week is not None:
+            holiday_dates = holiday_dates[
+                np.isin(
+                    # error: "DatetimeIndex" has no attribute "dayofweek"
+                    holiday_dates.dayofweek,  # type: ignore[attr-defined]
+                    self.days_of_week,
+                ).ravel()
+            ]
+
+        if self.start_date is not None:
+            filter_start_date = max(
+                self.start_date.tz_localize(filter_start_date.tz), filter_start_date
+            )
+        if self.end_date is not None:
+            filter_end_date = min(
+                self.end_date.tz_localize(filter_end_date.tz), filter_end_date
+            )
+        holiday_dates = holiday_dates[
+            (holiday_dates >= filter_start_date) & (holiday_dates <= filter_end_date)
+        ]
+
+        if self.exclude_dates is not None:
+            holiday_dates = holiday_dates.difference(self.exclude_dates)
+        if return_name:
+            return Series(self.name, index=holiday_dates)
+        return holiday_dates
+
+    def _reference_dates(
+        self, start_date: Timestamp, end_date: Timestamp
+    ) -> DatetimeIndex:
+        """
+        Get reference dates for the holiday.
+
+        Return reference dates for the holiday also returning the year
+        prior to the start_date and year following the end_date.  This ensures
+        that any offsets to be applied will yield the holidays within
+        the passed in dates.
+        """
+        if self.start_date is not None:
+            start_date = self.start_date.tz_localize(start_date.tz)
+
+        if self.end_date is not None:
+            end_date = self.end_date.tz_localize(start_date.tz)
+
+        year_offset = DateOffset(years=1)
+        reference_start_date = Timestamp(
+            datetime(start_date.year - 1, self.month, self.day)
+        )
+
+        reference_end_date = Timestamp(
+            datetime(end_date.year + 1, self.month, self.day)
+        )
+        # Don't process unnecessary holidays
+        dates = date_range(
+            start=reference_start_date,
+            end=reference_end_date,
+            freq=year_offset,
+            tz=start_date.tz,
+        )
+
+        return dates
+
+    def _apply_rule(self, dates: DatetimeIndex) -> DatetimeIndex:
+        """
+        Apply the given offset/observance to a DatetimeIndex of dates.
+
+        Parameters
+        ----------
+        dates : DatetimeIndex
+            Dates to apply the given offset/observance rule
+
+        Returns
+        -------
+        Dates with rules applied
+        """
+        if dates.empty:
+            return dates.copy()
+
+        if self.observance is not None:
+            return dates.map(lambda d: self.observance(d))
+
+        if self.offset is not None:
+            if not isinstance(self.offset, list):
+                offsets = [self.offset]
+            else:
+                offsets = self.offset
+            for offset in offsets:
+                # if we are adding a non-vectorized value
+                # ignore the PerformanceWarnings:
+                with warnings.catch_warnings():
+                    warnings.simplefilter("ignore", PerformanceWarning)
+                    dates += offset
+        return dates
+
+
+holiday_calendars: dict[str, type[AbstractHolidayCalendar]] = {}
+
+
+def register(cls) -> None:
+    try:
+        name = cls.name
+    except AttributeError:
+        name = cls.__name__
+    holiday_calendars[name] = cls
+
+
+def get_calendar(name: str) -> AbstractHolidayCalendar:
+    """
+    Return an instance of a calendar based on its name.
+
+    Parameters
+    ----------
+    name : str
+        Calendar name to return an instance of
+    """
+    return holiday_calendars[name]()
+
+
+class HolidayCalendarMetaClass(type):
+    def __new__(cls, clsname: str, bases, attrs):
+        calendar_class = super().__new__(cls, clsname, bases, attrs)
+        register(calendar_class)
+        return calendar_class
+
+
+class AbstractHolidayCalendar(metaclass=HolidayCalendarMetaClass):
+    """
+    Abstract interface to create holidays following certain rules.
+    """
+
+    rules: list[Holiday] = []
+    start_date = Timestamp(datetime(1970, 1, 1))
+    end_date = Timestamp(datetime(2200, 12, 31))
+    _cache: tuple[Timestamp, Timestamp, Series] | None = None
+
+    def __init__(self, name: str = "", rules=None) -> None:
+        """
+        Initializes holiday object with a given set a rules.  Normally
+        classes just have the rules defined within them.
+
+        Parameters
+        ----------
+        name : str
+            Name of the holiday calendar, defaults to class name
+        rules : array of Holiday objects
+            A set of rules used to create the holidays.
+        """
+        super().__init__()
+        if not name:
+            name = type(self).__name__
+        self.name = name
+
+        if rules is not None:
+            self.rules = rules
+
+    def rule_from_name(self, name: str) -> Holiday | None:
+        for rule in self.rules:
+            if rule.name == name:
+                return rule
+
+        return None
+
+    def holidays(
+        self, start=None, end=None, return_name: bool = False
+    ) -> DatetimeIndex | Series:
+        """
+        Returns a curve with holidays between start_date and end_date
+
+        Parameters
+        ----------
+        start : starting date, datetime-like, optional
+        end : ending date, datetime-like, optional
+        return_name : bool, optional
+            If True, return a series that has dates and holiday names.
+            False will only return a DatetimeIndex of dates.
+
+        Returns
+        -------
+            DatetimeIndex of holidays
+        """
+        if self.rules is None:
+            raise Exception(
+                f"Holiday Calendar {self.name} does not have any rules specified"
+            )
+
+        if start is None:
+            start = AbstractHolidayCalendar.start_date
+
+        if end is None:
+            end = AbstractHolidayCalendar.end_date
+
+        start = Timestamp(start)
+        end = Timestamp(end)
+
+        # If we don't have a cache or the dates are outside the prior cache, we
+        # get them again
+        if self._cache is None or start < self._cache[0] or end > self._cache[1]:
+            pre_holidays = [
+                rule.dates(start, end, return_name=True) for rule in self.rules
+            ]
+            if pre_holidays:
+                holidays = concat(pre_holidays)
+            else:
+                holidays = Series(index=DatetimeIndex([]), dtype=object)
+
+            self._cache = (start, end, holidays.sort_index())
+
+        holidays = self._cache[2]
+        holidays = holidays[start:end]
+
+        if return_name:
+            return holidays
+        else:
+            return holidays.index
+
+    @staticmethod
+    def merge_class(base, other):
+        """
+        Merge holiday calendars together. The base calendar
+        will take precedence to other. The merge will be done
+        based on each holiday's name.
+
+        Parameters
+        ----------
+        base : AbstractHolidayCalendar
+          instance/subclass or array of Holiday objects
+        other : AbstractHolidayCalendar
+          instance/subclass or array of Holiday objects
+        """
+        try:
+            other = other.rules
+        except AttributeError:
+            pass
+
+        if not isinstance(other, list):
+            other = [other]
+        other_holidays = {holiday.name: holiday for holiday in other}
+
+        try:
+            base = base.rules
+        except AttributeError:
+            pass
+
+        if not isinstance(base, list):
+            base = [base]
+        base_holidays = {holiday.name: holiday for holiday in base}
+
+        other_holidays.update(base_holidays)
+        return list(other_holidays.values())
+
+    def merge(self, other, inplace: bool = False):
+        """
+        Merge holiday calendars together.  The caller's class
+        rules take precedence.  The merge will be done
+        based on each holiday's name.
+
+        Parameters
+        ----------
+        other : holiday calendar
+        inplace : bool (default=False)
+            If True set rule_table to holidays, else return array of Holidays
+        """
+        holidays = self.merge_class(self, other)
+        if inplace:
+            self.rules = holidays
+        else:
+            return holidays
+
+
+USMemorialDay = Holiday(
+    "Memorial Day", month=5, day=31, offset=DateOffset(weekday=MO(-1))
+)
+USLaborDay = Holiday("Labor Day", month=9, day=1, offset=DateOffset(weekday=MO(1)))
+USColumbusDay = Holiday(
+    "Columbus Day", month=10, day=1, offset=DateOffset(weekday=MO(2))
+)
+USThanksgivingDay = Holiday(
+    "Thanksgiving Day", month=11, day=1, offset=DateOffset(weekday=TH(4))
+)
+USMartinLutherKingJr = Holiday(
+    "Birthday of Martin Luther King, Jr.",
+    start_date=datetime(1986, 1, 1),
+    month=1,
+    day=1,
+    offset=DateOffset(weekday=MO(3)),
+)
+USPresidentsDay = Holiday(
+    "Washington's Birthday", month=2, day=1, offset=DateOffset(weekday=MO(3))
+)
+GoodFriday = Holiday("Good Friday", month=1, day=1, offset=[Easter(), Day(-2)])
+
+EasterMonday = Holiday("Easter Monday", month=1, day=1, offset=[Easter(), Day(1)])
+
+
+class USFederalHolidayCalendar(AbstractHolidayCalendar):
+    """
+    US Federal Government Holiday Calendar based on rules specified by:
+    https://www.opm.gov/policy-data-oversight/pay-leave/federal-holidays/
+    """
+
+    rules = [
+        Holiday("New Year's Day", month=1, day=1, observance=nearest_workday),
+        USMartinLutherKingJr,
+        USPresidentsDay,
+        USMemorialDay,
+        Holiday(
+            "Juneteenth National Independence Day",
+            month=6,
+            day=19,
+            start_date="2021-06-18",
+            observance=nearest_workday,
+        ),
+        Holiday("Independence Day", month=7, day=4, observance=nearest_workday),
+        USLaborDay,
+        USColumbusDay,
+        Holiday("Veterans Day", month=11, day=11, observance=nearest_workday),
+        USThanksgivingDay,
+        Holiday("Christmas Day", month=12, day=25, observance=nearest_workday),
+    ]
+
+
+def HolidayCalendarFactory(name: str, base, other, base_class=AbstractHolidayCalendar):
+    rules = AbstractHolidayCalendar.merge_class(base, other)
+    calendar_class = type(name, (base_class,), {"rules": rules, "name": name})
+    return calendar_class
+
+
+__all__ = [
+    "FR",
+    "MO",
+    "SA",
+    "SU",
+    "TH",
+    "TU",
+    "WE",
+    "HolidayCalendarFactory",
+    "after_nearest_workday",
+    "before_nearest_workday",
+    "get_calendar",
+    "nearest_workday",
+    "next_monday",
+    "next_monday_or_tuesday",
+    "next_workday",
+    "previous_friday",
+    "previous_workday",
+    "register",
+    "sunday_to_monday",
+    "weekend_to_monday",
+]
diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f0c4281ffc773ae49f7d9190609cbf2e57f8564
--- /dev/null
+++ b/pandas/tseries/offsets.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+from pandas._libs.tslibs.offsets import (
+    FY5253,
+    BaseOffset,
+    BDay,
+    BHalfYearBegin,
+    BHalfYearEnd,
+    BMonthBegin,
+    BMonthEnd,
+    BQuarterBegin,
+    BQuarterEnd,
+    BusinessDay,
+    BusinessHour,
+    BusinessMonthBegin,
+    BusinessMonthEnd,
+    BYearBegin,
+    BYearEnd,
+    CBMonthBegin,
+    CBMonthEnd,
+    CDay,
+    CustomBusinessDay,
+    CustomBusinessHour,
+    CustomBusinessMonthBegin,
+    CustomBusinessMonthEnd,
+    DateOffset,
+    Day,
+    Easter,
+    FY5253Quarter,
+    HalfYearBegin,
+    HalfYearEnd,
+    Hour,
+    LastWeekOfMonth,
+    Micro,
+    Milli,
+    Minute,
+    MonthBegin,
+    MonthEnd,
+    Nano,
+    QuarterBegin,
+    QuarterEnd,
+    Second,
+    SemiMonthBegin,
+    SemiMonthEnd,
+    Tick,
+    Week,
+    WeekOfMonth,
+    YearBegin,
+    YearEnd,
+)
+
+__all__ = [
+    "FY5253",
+    "BDay",
+    "BHalfYearBegin",
+    "BHalfYearEnd",
+    "BMonthBegin",
+    "BMonthEnd",
+    "BQuarterBegin",
+    "BQuarterEnd",
+    "BYearBegin",
+    "BYearEnd",
+    "BaseOffset",
+    "BusinessDay",
+    "BusinessHour",
+    "BusinessMonthBegin",
+    "BusinessMonthEnd",
+    "CBMonthBegin",
+    "CBMonthEnd",
+    "CDay",
+    "CustomBusinessDay",
+    "CustomBusinessHour",
+    "CustomBusinessMonthBegin",
+    "CustomBusinessMonthEnd",
+    "DateOffset",
+    "Day",
+    "Easter",
+    "FY5253Quarter",
+    "HalfYearBegin",
+    "HalfYearEnd",
+    "Hour",
+    "LastWeekOfMonth",
+    "Micro",
+    "Milli",
+    "Minute",
+    "MonthBegin",
+    "MonthEnd",
+    "Nano",
+    "QuarterBegin",
+    "QuarterEnd",
+    "Second",
+    "SemiMonthBegin",
+    "SemiMonthEnd",
+    "Tick",
+    "Week",
+    "WeekOfMonth",
+    "YearBegin",
+    "YearEnd",
+]
diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a462080f328f4c7111a45147beb09f866597b063
--- /dev/null
+++ b/pandas/util/__init__.py
@@ -0,0 +1,29 @@
+def __getattr__(key: str):
+    # These imports need to be lazy to avoid circular import errors
+    if key == "hash_array":
+        from pandas.core.util.hashing import hash_array
+
+        return hash_array
+    if key == "hash_pandas_object":
+        from pandas.core.util.hashing import hash_pandas_object
+
+        return hash_pandas_object
+    if key == "Appender":
+        from pandas.util._decorators import Appender
+
+        return Appender
+    if key == "Substitution":
+        from pandas.util._decorators import Substitution
+
+        return Substitution
+
+    if key == "cache_readonly":
+        from pandas.util._decorators import cache_readonly
+
+        return cache_readonly
+
+    raise AttributeError(f"module 'pandas.util' has no attribute '{key}'")
+
+
+def __dir__() -> list[str]:
+    return [*list(globals().keys()), "hash_array", "hash_pandas_object"]
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd2ed6c00e48c8531838b9987c39c1131374961a
--- /dev/null
+++ b/pandas/util/_decorators.py
@@ -0,0 +1,532 @@
+from __future__ import annotations
+
+from functools import wraps
+import inspect
+from textwrap import dedent
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    cast,
+)
+import warnings
+
+from pandas._libs.properties import cache_readonly
+from pandas._typing import (
+    F,
+    T,
+)
+from pandas.util._exceptions import find_stack_level
+
+if TYPE_CHECKING:
+    from collections.abc import (
+        Callable,
+        Mapping,
+    )
+
+    from pandas.errors import PandasChangeWarning
+
+
+def deprecate(
+    klass: type[Warning],
+    name: str,
+    alternative: Callable[..., Any],
+    version: str,
+    alt_name: str | None = None,
+    stacklevel: int = 2,
+    msg: str | None = None,
+) -> Callable[[F], F]:
+    """
+    Return a new function that emits a deprecation warning on use.
+
+    To use this method for a deprecated function, another function
+    `alternative` with the same signature must exist. The deprecated
+    function will emit a deprecation warning, and in the docstring
+    it will contain the deprecation directive with the provided version
+    so it can be detected for future removal.
+
+    Parameters
+    ----------
+    klass : Warning
+        The warning class to use.
+    name : str
+        Name of function to deprecate.
+    alternative : func
+        Function to use instead.
+    version : str
+        Version of pandas in which the method has been deprecated.
+    alt_name : str, optional
+        Name to use in preference of alternative.__name__.
+    stacklevel : int, default 2
+    msg : str
+        The message to display in the warning.
+        Default is '{name} is deprecated. Use {alt_name} instead.'
+    """
+    alt_name = alt_name or alternative.__name__
+    warning_msg = msg or f"{name} is deprecated, use {alt_name} instead."
+
+    @wraps(alternative)
+    def wrapper(*args, **kwargs) -> Callable[..., Any]:
+        warnings.warn(warning_msg, klass, stacklevel=stacklevel)
+        return alternative(*args, **kwargs)
+
+    # adding deprecated directive to the docstring
+    msg = msg or f"Use `{alt_name}` instead."
+    doc_error_msg = (
+        "deprecate needs a correctly formatted docstring in "
+        "the target function (should have a one liner short "
+        "summary, and opening quotes should be in their own "
+        f"line). Found:\n{alternative.__doc__}"
+    )
+
+    # when python is running in optimized mode (i.e. `-OO`), docstrings are
+    # removed, so we check that a docstring with correct formatting is used
+    # but we allow empty docstrings
+    if alternative.__doc__:
+        if alternative.__doc__.count("\n") < 3:
+            raise AssertionError(doc_error_msg)
+        empty1, summary, empty2, doc_string = alternative.__doc__.split("\n", 3)
+        if empty1 or (empty2 and not summary):
+            raise AssertionError(doc_error_msg)
+        wrapper.__doc__ = dedent(
+            f"""
+        {summary.strip()}
+
+        .. deprecated:: {version}
+            {msg}
+
+        {dedent(doc_string)}"""
+        )
+    # error: Incompatible return value type (got "Callable[[VarArg(Any), KwArg(Any)],
+    # Callable[...,Any]]", expected "Callable[[F], F]")
+    return wrapper  # type: ignore[return-value]
+
+
+def deprecate_kwarg(
+    klass: type[Warning],
+    old_arg_name: str,
+    new_arg_name: str | None,
+    mapping: Mapping[Any, Any] | Callable[[Any], Any] | None = None,
+    stacklevel: int = 2,
+) -> Callable[[F], F]:
+    """
+    Decorator to deprecate a keyword argument of a function.
+
+    Parameters
+    ----------
+    klass : Warning
+        The warning class to use.
+    old_arg_name : str
+        Name of argument in function to deprecate.
+    new_arg_name : str or None
+        Name of preferred argument in function. Use None to raise warning that
+        ``old_arg_name`` keyword is deprecated.
+    mapping : dict or callable
+        If mapping is present, use it to translate old arguments to
+        new arguments. A callable must do its own value checking;
+        values not found in a dict will be forwarded unchanged.
+    stacklevel : int, default 2
+
+    Examples
+    --------
+    The following deprecates 'cols', using 'columns' instead
+
+    >>> @deprecate_kwarg(FutureWarning, old_arg_name="cols", new_arg_name="columns")
+    ... def f(columns=""):
+    ...     print(columns)
+    >>> f(columns="should work ok")
+    should work ok
+
+    >>> f(cols="should raise warning")  # doctest: +SKIP
+    FutureWarning: cols is deprecated, use columns instead
+      warnings.warn(msg, FutureWarning)
+    should raise warning
+
+    >>> f(cols="should error", columns="can't pass do both")  # doctest: +SKIP
+    TypeError: Can only specify 'cols' or 'columns', not both
+
+    >>> @deprecate_kwarg(FutureWarning, "old", "new", {"yes": True, "no": False})
+    ... def f(new=False):
+    ...     print("yes!" if new else "no!")
+    >>> f(old="yes")  # doctest: +SKIP
+    FutureWarning: old='yes' is deprecated, use new=True instead
+      warnings.warn(msg, FutureWarning)
+    yes!
+
+    To raise a warning that a keyword will be removed entirely in the future
+
+    >>> @deprecate_kwarg(FutureWarning, old_arg_name="cols", new_arg_name=None)
+    ... def f(cols="", another_param=""):
+    ...     print(cols)
+    >>> f(cols="should raise warning")  # doctest: +SKIP
+    FutureWarning: the 'cols' keyword is deprecated and will be removed in a
+    future version. Please take steps to stop the use of 'cols'
+    should raise warning
+    >>> f(another_param="should not raise warning")  # doctest: +SKIP
+    should not raise warning
+
+    >>> f(cols="should raise warning", another_param="")  # doctest: +SKIP
+    FutureWarning: the 'cols' keyword is deprecated and will be removed in a
+    future version. Please take steps to stop the use of 'cols'
+    should raise warning
+    """
+    if mapping is not None and not hasattr(mapping, "get") and not callable(mapping):
+        raise TypeError(
+            "mapping from old to new argument values must be dict or callable!"
+        )
+
+    def _deprecate_kwarg(func: F) -> F:
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> Callable[..., Any]:
+            __tracebackhide__ = True
+
+            old_arg_value = kwargs.pop(old_arg_name, None)
+
+            if old_arg_value is not None:
+                if new_arg_name is None:
+                    msg = (
+                        f"the {old_arg_name!r} keyword is deprecated and "
+                        "will be removed in a future version. Please take "
+                        f"steps to stop the use of {old_arg_name!r}"
+                    )
+                    warnings.warn(msg, klass, stacklevel=stacklevel)
+                    kwargs[old_arg_name] = old_arg_value
+                    return func(*args, **kwargs)
+
+                elif mapping is not None:
+                    if callable(mapping):
+                        new_arg_value = mapping(old_arg_value)
+                    else:
+                        new_arg_value = mapping.get(old_arg_value, old_arg_value)
+                    msg = (
+                        f"the {old_arg_name}={old_arg_value!r} keyword is "
+                        "deprecated, use "
+                        f"{new_arg_name}={new_arg_value!r} instead."
+                    )
+                else:
+                    new_arg_value = old_arg_value
+                    msg = (
+                        f"the {old_arg_name!r} keyword is deprecated, "
+                        f"use {new_arg_name!r} instead."
+                    )
+
+                warnings.warn(msg, klass, stacklevel=stacklevel)
+                if kwargs.get(new_arg_name) is not None:
+                    msg = (
+                        f"Can only specify {old_arg_name!r} "
+                        f"or {new_arg_name!r}, not both."
+                    )
+                    raise TypeError(msg)
+                kwargs[new_arg_name] = new_arg_value
+            return func(*args, **kwargs)
+
+        return cast(F, wrapper)
+
+    return _deprecate_kwarg
+
+
+def _format_argument_list(allow_args: list[str]) -> str:
+    """
+    Convert the allow_args argument (either string or integer) of
+    `deprecate_nonkeyword_arguments` function to a string describing
+    it to be inserted into warning message.
+
+    Parameters
+    ----------
+    allowed_args : list, tuple or int
+        The `allowed_args` argument for `deprecate_nonkeyword_arguments`,
+        but None value is not allowed.
+
+    Returns
+    -------
+    str
+        The substring describing the argument list in best way to be
+        inserted to the warning message.
+
+    Examples
+    --------
+    `format_argument_list([])` -> ''
+    `format_argument_list(['a'])` -> "except for the arguments 'a'"
+    `format_argument_list(['a', 'b'])` -> "except for the arguments 'a' and 'b'"
+    `format_argument_list(['a', 'b', 'c'])` ->
+        "except for the arguments 'a', 'b' and 'c'"
+    """
+    if "self" in allow_args:
+        allow_args.remove("self")
+    if not allow_args:
+        return ""
+    elif len(allow_args) == 1:
+        return f" except for the argument '{allow_args[0]}'"
+    else:
+        last = allow_args[-1]
+        args = ", ".join(["'" + x + "'" for x in allow_args[:-1]])
+        return f" except for the arguments {args} and '{last}'"
+
+
+def future_version_msg(version: str | None) -> str:
+    """Specify which version of pandas the deprecation will take place in."""
+    if version is None:
+        return "In a future version of pandas"
+    else:
+        return f"Starting with pandas version {version}"
+
+
+def deprecate_nonkeyword_arguments(
+    klass: type[PandasChangeWarning],
+    allowed_args: list[str] | None = None,
+    name: str | None = None,
+) -> Callable[[F], F]:
+    """
+    Decorator to deprecate a use of non-keyword arguments of a function.
+
+    Parameters
+    ----------
+    klass : Warning
+        The warning class to use.
+    allowed_args : list, optional
+        In case of list, it must be the list of names of some
+        first arguments of the decorated functions that are
+        OK to be given as positional arguments. In case of None value,
+        defaults to list of all arguments not having the
+        default value.
+    name : str, optional
+        The specific name of the function to show in the warning
+        message. If None, then the Qualified name of the function
+        is used.
+    """
+
+    def decorate(func):
+        old_sig = inspect.signature(func)
+
+        if allowed_args is not None:
+            allow_args = allowed_args
+        else:
+            allow_args = [
+                p.name
+                for p in old_sig.parameters.values()
+                if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
+                and p.default is p.empty
+            ]
+
+        new_params = [
+            p.replace(kind=p.KEYWORD_ONLY)
+            if (
+                p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD)
+                and p.name not in allow_args
+            )
+            else p
+            for p in old_sig.parameters.values()
+        ]
+        new_params.sort(key=lambda p: p.kind)
+        new_sig = old_sig.replace(parameters=new_params)
+
+        num_allow_args = len(allow_args)
+        msg = (
+            f"{future_version_msg(klass.version())} all arguments of "
+            f"{name or func.__qualname__}{{arguments}} will be keyword-only."
+        )
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            if len(args) > num_allow_args:
+                warnings.warn(
+                    msg.format(arguments=_format_argument_list(allow_args)),
+                    klass,
+                    stacklevel=find_stack_level(),
+                )
+            return func(*args, **kwargs)
+
+        # error: "Callable[[VarArg(Any), KwArg(Any)], Any]" has no
+        # attribute "__signature__"
+        wrapper.__signature__ = new_sig  # type: ignore[attr-defined]
+        return wrapper
+
+    return decorate
+
+
+def doc(*docstrings: None | str | Callable, **params: object) -> Callable[[F], F]:
+    """
+    A decorator to take docstring templates, concatenate them and perform string
+    substitution on them.
+
+    This decorator will add a variable "_docstring_components" to the wrapped
+    callable to keep track the original docstring template for potential usage.
+    If it should be consider as a template, it will be saved as a string.
+    Otherwise, it will be saved as callable, and later user __doc__ and dedent
+    to get docstring.
+
+    Parameters
+    ----------
+    *docstrings : None, str, or callable
+        The string / docstring / docstring template to be appended in order
+        after default docstring under callable.
+    **params
+        The string which would be used to format docstring template.
+    """
+
+    def decorator(decorated: F) -> F:
+        # collecting docstring and docstring templates
+        docstring_components: list[str | Callable] = []
+        if decorated.__doc__:
+            docstring_components.append(dedent(decorated.__doc__))
+
+        for docstring in docstrings:
+            if docstring is None:
+                continue
+            if hasattr(docstring, "_docstring_components"):
+                docstring_components.extend(
+                    docstring._docstring_components  # pyright: ignore[reportAttributeAccessIssue]
+                )
+            elif isinstance(docstring, str) or docstring.__doc__:
+                docstring_components.append(docstring)
+
+        params_applied = [
+            component.format(**params)
+            if isinstance(component, str) and len(params) > 0
+            else component
+            for component in docstring_components
+        ]
+
+        decorated.__doc__ = "".join(
+            [
+                component
+                if isinstance(component, str)
+                else dedent(component.__doc__ or "")
+                for component in params_applied
+            ]
+        )
+
+        # error: "F" has no attribute "_docstring_components"
+        decorated._docstring_components = (  # type: ignore[attr-defined]
+            docstring_components
+        )
+        return decorated
+
+    return decorator
+
+
+# Substitution and Appender are derived from matplotlib.docstring (1.1.0)
+# module https://matplotlib.org/users/license.html
+
+
+class Substitution:
+    """
+    A decorator to take a function's docstring and perform string
+    substitution on it.
+
+    This decorator should be robust even if func.__doc__ is None
+    (for example, if -OO was passed to the interpreter)
+
+    Usage: construct a docstring.Substitution with a sequence or
+    dictionary suitable for performing substitution; then
+    decorate a suitable function with the constructed object. e.g.
+
+    sub_author_name = Substitution(author='Jason')
+
+    @sub_author_name
+    def some_function(x):
+        "%(author)s wrote this function"
+
+    # note that some_function.__doc__ is now "Jason wrote this function"
+
+    One can also use positional arguments.
+
+    sub_first_last_names = Substitution('Edgar Allen', 'Poe')
+
+    @sub_first_last_names
+    def some_function(x):
+        "%s %s wrote the Raven"
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        if args and kwargs:
+            raise AssertionError("Only positional or keyword args are allowed")
+
+        self.params = args or kwargs
+
+    def __call__(self, func: F) -> F:
+        func.__doc__ = func.__doc__ and func.__doc__ % self.params
+        return func
+
+    def update(self, *args, **kwargs) -> None:
+        """
+        Update self.params with supplied args.
+        """
+        if isinstance(self.params, dict):
+            self.params.update(*args, **kwargs)
+
+
+class Appender:
+    """
+    A function decorator that will append an addendum to the docstring
+    of the target function.
+
+    This decorator should be robust even if func.__doc__ is None
+    (for example, if -OO was passed to the interpreter).
+
+    Usage: construct a docstring.Appender with a string to be joined to
+    the original docstring. An optional 'join' parameter may be supplied
+    which will be used to join the docstring and addendum. e.g.
+
+    add_copyright = Appender("Copyright (c) 2009", join='\n')
+
+    @add_copyright
+    def my_dog(has='fleas'):
+        "This docstring will have a copyright below"
+        pass
+    """
+
+    addendum: str | None
+
+    def __init__(self, addendum: str | None, join: str = "", indents: int = 0) -> None:
+        if indents > 0:
+            self.addendum = indent(addendum, indents=indents)
+        else:
+            self.addendum = addendum
+        self.join = join
+
+    def __call__(self, func: T) -> T:
+        func.__doc__ = func.__doc__ if func.__doc__ else ""
+        self.addendum = self.addendum if self.addendum else ""
+        docitems = [func.__doc__, self.addendum]
+        func.__doc__ = dedent(self.join.join(docitems))
+        return func
+
+
+def indent(text: str | None, indents: int = 1) -> str:
+    if not text or not isinstance(text, str):
+        return ""
+    jointext = "".join(["\n"] + ["    "] * indents)
+    return jointext.join(text.split("\n"))
+
+
+__all__ = [
+    "Appender",
+    "Substitution",
+    "cache_readonly",
+    "deprecate",
+    "deprecate_kwarg",
+    "deprecate_nonkeyword_arguments",
+    "doc",
+    "future_version_msg",
+]
+
+
+def set_module(module) -> Callable[[F], F]:
+    """Private decorator for overriding __module__ on a function or class.
+
+    Example usage::
+
+        @set_module("pandas")
+        def example():
+            pass
+
+
+        assert example.__module__ == "pandas"
+    """
+
+    def decorator(func: F) -> F:
+        if module is not None:
+            func.__module__ = module
+        return func
+
+    return decorator
diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py
new file mode 100644
index 0000000000000000000000000000000000000000..61bb456aec59fa7005c4e3ccadad3b9d9f0b2e37
--- /dev/null
+++ b/pandas/util/_doctools.py
@@ -0,0 +1,206 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+
+import pandas as pd
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from matplotlib.figure import Figure
+
+
+class TablePlotter:
+    """
+    Layout some DataFrames in vertical/horizontal layout for explanation.
+    Used in merging.rst
+    """
+
+    def __init__(
+        self,
+        cell_width: float = 0.37,
+        cell_height: float = 0.25,
+        font_size: float = 7.5,
+    ) -> None:
+        self.cell_width = cell_width
+        self.cell_height = cell_height
+        self.font_size = font_size
+
+    def _shape(self, df: pd.DataFrame) -> tuple[int, int]:
+        """
+        Calculate table shape considering index levels.
+        """
+        row, col = df.shape
+        return row + df.columns.nlevels, col + df.index.nlevels
+
+    def _get_cells(self, left, right, vertical) -> tuple[int, int]:
+        """
+        Calculate appropriate figure size based on left and right data.
+        """
+        if vertical:
+            # calculate required number of cells
+            vcells = max(sum(self._shape(df)[0] for df in left), self._shape(right)[0])
+            hcells = max(self._shape(df)[1] for df in left) + self._shape(right)[1]
+        else:
+            vcells = max([self._shape(df)[0] for df in left] + [self._shape(right)[0]])
+            hcells = sum([self._shape(df)[1] for df in left] + [self._shape(right)[1]])
+        return hcells, vcells
+
+    def plot(
+        self, left, right, labels: Iterable[str] = (), vertical: bool = True
+    ) -> Figure:
+        """
+        Plot left / right DataFrames in specified layout.
+
+        Parameters
+        ----------
+        left : list of DataFrames before operation is applied
+        right : DataFrame of operation result
+        labels : list of str to be drawn as titles of left DataFrames
+        vertical : bool, default True
+            If True, use vertical layout. If False, use horizontal layout.
+        """
+        from matplotlib import gridspec
+        import matplotlib.pyplot as plt
+
+        if not isinstance(left, list):
+            left = [left]
+        left = [self._conv(df) for df in left]
+        right = self._conv(right)
+
+        hcells, vcells = self._get_cells(left, right, vertical)
+
+        if vertical:
+            figsize = self.cell_width * hcells, self.cell_height * vcells
+        else:
+            # include margin for titles
+            figsize = self.cell_width * hcells, self.cell_height * vcells
+        fig = plt.figure(figsize=figsize)
+
+        if vertical:
+            gs = gridspec.GridSpec(len(left), hcells)
+            # left
+            max_left_cols = max(self._shape(df)[1] for df in left)
+            max_left_rows = max(self._shape(df)[0] for df in left)
+            for i, (_left, _label) in enumerate(zip(left, labels, strict=True)):
+                ax = fig.add_subplot(gs[i, 0:max_left_cols])
+                self._make_table(ax, _left, title=_label, height=1.0 / max_left_rows)
+            # right
+            ax = plt.subplot(gs[:, max_left_cols:])
+            self._make_table(ax, right, title="Result", height=1.05 / vcells)
+            fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95)
+        else:
+            max_rows = max(self._shape(df)[0] for df in [*left, right])
+            height = 1.0 / np.max(max_rows)
+            gs = gridspec.GridSpec(1, hcells)
+            # left
+            i = 0
+            for df, _label in zip(left, labels, strict=True):
+                sp = self._shape(df)
+                ax = fig.add_subplot(gs[0, i : i + sp[1]])
+                self._make_table(ax, df, title=_label, height=height)
+                i += sp[1]
+            # right
+            ax = plt.subplot(gs[0, i:])
+            self._make_table(ax, right, title="Result", height=height)
+            fig.subplots_adjust(top=0.85, bottom=0.05, left=0.05, right=0.95)
+
+        return fig
+
+    def _conv(self, data):
+        """
+        Convert each input to appropriate for table outplot.
+        """
+        if isinstance(data, pd.Series):
+            if data.name is None:
+                data = data.to_frame(name="")
+            else:
+                data = data.to_frame()
+        data = data.fillna("NaN")
+        return data
+
+    def _insert_index(self, data):
+        # insert is destructive
+        data = data.copy()
+        idx_nlevels = data.index.nlevels
+        if idx_nlevels == 1:
+            data.insert(0, "Index", data.index)
+        else:
+            for i in range(idx_nlevels):
+                data.insert(i, f"Index{i}", data.index._get_level_values(i))
+
+        col_nlevels = data.columns.nlevels
+        if col_nlevels > 1:
+            col = data.columns._get_level_values(0)
+            values = [
+                data.columns._get_level_values(i)._values for i in range(1, col_nlevels)
+            ]
+            col_df = pd.DataFrame(values)
+            data.columns = col_df.columns
+            data = pd.concat([col_df, data])
+            data.columns = col
+        return data
+
+    def _make_table(self, ax, df, title: str, height: float | None = None) -> None:
+        if df is None:
+            ax.set_visible(False)
+            return
+
+        from pandas import plotting
+
+        idx_nlevels = df.index.nlevels
+        col_nlevels = df.columns.nlevels
+        # must be convert here to get index levels for colorization
+        df = self._insert_index(df)
+        tb = plotting.table(ax, df, loc=9)
+        tb.set_fontsize(self.font_size)
+
+        if height is None:
+            height = 1.0 / (len(df) + 1)
+
+        props = tb.properties()
+        for (r, c), cell in props["celld"].items():
+            if c == -1:
+                cell.set_visible(False)
+            elif r < col_nlevels and c < idx_nlevels:
+                cell.set_visible(False)
+            elif r < col_nlevels or c < idx_nlevels:
+                cell.set_facecolor("#AAAAAA")
+            cell.set_height(height)
+
+        ax.set_title(title, size=self.font_size)
+        ax.axis("off")
+
+
+def main() -> None:
+    import matplotlib.pyplot as plt
+
+    p = TablePlotter()
+
+    df1 = pd.DataFrame({"A": [10, 11, 12], "B": [20, 21, 22], "C": [30, 31, 32]})
+    df2 = pd.DataFrame({"A": [10, 12], "C": [30, 32]})
+
+    p.plot([df1, df2], pd.concat([df1, df2]), labels=["df1", "df2"], vertical=True)
+    plt.show()
+
+    df3 = pd.DataFrame({"X": [10, 12], "Z": [30, 32]})
+
+    p.plot(
+        [df1, df3], pd.concat([df1, df3], axis=1), labels=["df1", "df2"], vertical=False
+    )
+    plt.show()
+
+    idx = pd.MultiIndex.from_tuples(
+        [(1, "A"), (1, "B"), (1, "C"), (2, "A"), (2, "B"), (2, "C")]
+    )
+    column = pd.MultiIndex.from_tuples([(1, "A"), (1, "B")])
+    df3 = pd.DataFrame({"v1": [1, 2, 3, 4, 5, 6], "v2": [5, 6, 7, 8, 9, 10]}, index=idx)
+    df3.columns = column
+    p.plot(df3, df3, labels=["df3"])
+    plt.show()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pandas/util/_exceptions.py b/pandas/util/_exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3c8e54d3ca7f778f9ba27c6f3c7ebb59b8a980a
--- /dev/null
+++ b/pandas/util/_exceptions.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+import contextlib
+import inspect
+import os
+import re
+from typing import (
+    TYPE_CHECKING,
+    Any,
+)
+import warnings
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+    from types import FrameType
+
+
+@contextlib.contextmanager
+def rewrite_exception(old_name: str, new_name: str) -> Generator[None]:
+    """
+    Rewrite the message of an exception.
+    """
+    try:
+        yield
+    except Exception as err:
+        if not err.args:
+            raise
+        msg = str(err.args[0])
+        msg = msg.replace(old_name, new_name)
+        args: tuple[Any, ...] = (msg,)
+        if len(err.args) > 1:
+            args = args + err.args[1:]
+        err.args = args
+        raise
+
+
+def find_stack_level() -> int:
+    """
+    Find the first place in the stack that is not inside pandas
+    (tests notwithstanding).
+    """
+
+    import pandas as pd
+
+    pkg_dir = os.path.dirname(pd.__file__)
+    test_dir = os.path.join(pkg_dir, "tests")
+
+    # https://stackoverflow.com/questions/17407119/python-inspect-stack-is-slow
+    frame: FrameType | None = inspect.currentframe()
+    try:
+        n = 0
+        while frame:
+            filename = inspect.getfile(frame)
+            if filename.startswith(pkg_dir) and not filename.startswith(test_dir):
+                frame = frame.f_back
+                n += 1
+            else:
+                break
+    finally:
+        # See note in
+        # https://docs.python.org/3/library/inspect.html#inspect.Traceback
+        del frame
+    return n
+
+
+@contextlib.contextmanager
+def rewrite_warning(
+    target_message: str,
+    target_category: type[Warning],
+    new_message: str,
+    new_category: type[Warning] | None = None,
+) -> Generator[None]:
+    """
+    Rewrite the message of a warning.
+
+    Parameters
+    ----------
+    target_message : str
+        Warning message to match.
+    target_category : Warning
+        Warning type to match.
+    new_message : str
+        New warning message to emit.
+    new_category : Warning or None, default None
+        New warning type to emit. When None, will be the same as target_category.
+    """
+    if new_category is None:
+        new_category = target_category
+    with warnings.catch_warnings(record=True) as record:
+        yield
+    if len(record) > 0:
+        match = re.compile(target_message)
+        for warning in record:
+            if warning.category is target_category and re.search(
+                match, str(warning.message)
+            ):
+                category = new_category
+                message: Warning | str = new_message
+            else:
+                category, message = warning.category, warning.message
+            warnings.warn_explicit(
+                message=message,
+                category=category,
+                filename=warning.filename,
+                lineno=warning.lineno,
+            )
diff --git a/pandas/util/_print_versions.py b/pandas/util/_print_versions.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a97c700802289af49c3f9301fe12f21fc63ec4c
--- /dev/null
+++ b/pandas/util/_print_versions.py
@@ -0,0 +1,164 @@
+from __future__ import annotations
+
+import json
+import locale
+import os
+import platform
+import struct
+import sys
+from typing import TYPE_CHECKING
+
+from pandas.util._decorators import set_module
+
+if TYPE_CHECKING:
+    from pandas._typing import JSONSerializable
+
+from pandas.compat._optional import (
+    VERSIONS,
+    get_version,
+    import_optional_dependency,
+)
+
+
+def _get_commit_hash() -> str | None:
+    """
+    Use vendored versioneer code to get git hash, which handles
+    git worktree correctly.
+    """
+    try:
+        from pandas._version_meson import (  # pyright: ignore [reportMissingImports]
+            __git_version__,
+        )
+
+        return __git_version__
+    except ImportError:
+        from pandas._version import get_versions
+
+        versions = get_versions()
+        return versions["full-revisionid"]
+
+
+def _get_sys_info() -> dict[str, JSONSerializable]:
+    """
+    Returns system information as a JSON serializable dictionary.
+    """
+    uname_result = platform.uname()
+    language_code, encoding = locale.getlocale()
+    return {
+        "commit": _get_commit_hash(),
+        "python": platform.python_version(),
+        "python-bits": struct.calcsize("P") * 8,
+        "OS": uname_result.system,
+        "OS-release": uname_result.release,
+        "Version": uname_result.version,
+        "machine": uname_result.machine,
+        "processor": uname_result.processor,
+        "byteorder": sys.byteorder,
+        "LC_ALL": os.environ.get("LC_ALL"),
+        "LANG": os.environ.get("LANG"),
+        "LOCALE": {"language-code": language_code, "encoding": encoding},
+    }
+
+
+def _get_dependency_info() -> dict[str, JSONSerializable]:
+    """
+    Returns dependency information as a JSON serializable dictionary.
+    """
+    deps = [
+        "pandas",
+        # required
+        "numpy",
+        "dateutil",
+        # install / build,
+        "pip",
+        "Cython",
+        # docs
+        "sphinx",
+        # Other, not imported.
+        "IPython",
+    ]
+    # Optional dependencies
+    deps.extend(list(VERSIONS))
+
+    result: dict[str, JSONSerializable] = {}
+    for modname in deps:
+        try:
+            mod = import_optional_dependency(modname, errors="ignore")
+        except Exception:
+            # Dependency conflicts may cause a non ImportError
+            result[modname] = "N/A"
+        else:
+            result[modname] = get_version(mod) if mod else None
+    return result
+
+
+@set_module("pandas")
+def show_versions(as_json: str | bool = False) -> None:
+    """
+    Provide useful information, important for bug reports.
+
+    It comprises info about hosting operation system, pandas version,
+    and versions of other installed relative packages.
+
+    Parameters
+    ----------
+    as_json : str or bool, default False
+        * If False, outputs info in a human readable form to the console.
+        * If str, it will be considered as a path to a file.
+          Info will be written to that file in JSON format.
+        * If True, outputs info in JSON format to the console.
+
+    See Also
+    --------
+    get_option : Retrieve the value of the specified option.
+    set_option : Set the value of the specified option or options.
+
+    Examples
+    --------
+    >>> pd.show_versions()  # doctest: +SKIP
+    Your output may look something like this:
+    INSTALLED VERSIONS
+    ------------------
+    commit           : 37ea63d540fd27274cad6585082c91b1283f963d
+    python           : 3.10.6.final.0
+    python-bits      : 64
+    OS               : Linux
+    OS-release       : 5.10.102.1-microsoft-standard-WSL2
+    Version          : #1 SMP Wed Mar 2 00:30:59 UTC 2022
+    machine          : x86_64
+    processor        : x86_64
+    byteorder        : little
+    LC_ALL           : None
+    LANG             : en_GB.UTF-8
+    LOCALE           : en_GB.UTF-8
+    pandas           : 2.0.1
+    numpy            : 1.24.3
+    ...
+    """
+    sys_info = _get_sys_info()
+    deps = _get_dependency_info()
+
+    if as_json:
+        j = {"system": sys_info, "dependencies": deps}
+
+        if as_json is True:
+            sys.stdout.writelines(json.dumps(j, indent=2))
+        else:
+            assert isinstance(as_json, str)  # needed for mypy
+            with open(as_json, "w", encoding="utf-8") as f:
+                json.dump(j, f, indent=2)
+
+    else:
+        assert isinstance(sys_info["LOCALE"], dict)  # needed for mypy
+        language_code = sys_info["LOCALE"]["language-code"]
+        encoding = sys_info["LOCALE"]["encoding"]
+        sys_info["LOCALE"] = f"{language_code}.{encoding}"
+
+        maxlen = max(len(x) for x in deps)
+        print("\nINSTALLED VERSIONS")
+        print("------------------")
+        for k, v in sys_info.items():
+            print(f"{k:<{maxlen}}: {v}")
+        print("")
+        for k, v in deps.items():
+            print(f"{k:<{maxlen}}: {v}")
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c5c349a5a93df4175973c0f224d274ebf15f99d
--- /dev/null
+++ b/pandas/util/_test_decorators.py
@@ -0,0 +1,152 @@
+"""
+This module provides decorator functions which can be applied to test objects
+in order to skip those objects when certain conditions occur. A sample use case
+is to detect if the platform is missing ``matplotlib``. If so, any test objects
+which require ``matplotlib`` and decorated with ``@td.skip_if_no("matplotlib")``
+will be skipped by ``pytest`` during the execution of the test suite.
+
+To illustrate, after importing this module:
+
+import pandas.util._test_decorators as td
+
+The decorators can be applied to classes:
+
+@td.skip_if_no("package")
+class Foo:
+    ...
+
+Or individual functions:
+
+@td.skip_if_no("package")
+def test_foo():
+    ...
+
+For more information, refer to the ``pytest`` documentation on ``skipif``.
+"""
+
+from __future__ import annotations
+
+import locale
+import sys
+from typing import TYPE_CHECKING
+
+import pytest
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from pandas._typing import F
+
+from pandas.compat import (
+    IS64,
+    WASM,
+    is_platform_windows,
+)
+from pandas.compat._optional import import_optional_dependency
+
+
+def skip_if_installed(package: str) -> pytest.MarkDecorator:
+    """
+    Skip a test if a package is installed.
+
+    Parameters
+    ----------
+    package : str
+        The name of the package.
+
+    Returns
+    -------
+    pytest.MarkDecorator
+        a pytest.mark.skipif to use as either a test decorator or a
+        parametrization mark.
+    """
+    return pytest.mark.skipif(
+        bool(import_optional_dependency(package, errors="ignore")),
+        reason=f"Skipping because {package} is installed.",
+    )
+
+
+def skip_if_no(package: str, min_version: str | None = None) -> pytest.MarkDecorator:
+    """
+    Generic function to help skip tests when required packages are not
+    present on the testing system.
+
+    This function returns a pytest mark with a skip condition that will be
+    evaluated during test collection. An attempt will be made to import the
+    specified ``package`` and optionally ensure it meets the ``min_version``
+
+    The mark can be used as either a decorator for a test class or to be
+    applied to parameters in pytest.mark.parametrize calls or parametrized
+    fixtures. Use pytest.importorskip if an imported moduled is later needed
+    or for test functions.
+
+    If the import and version check are unsuccessful, then the test function
+    (or test case when used in conjunction with parametrization) will be
+    skipped.
+
+    Parameters
+    ----------
+    package: str
+        The name of the required package.
+    min_version: str or None, default None
+        Optional minimum version of the package.
+
+    Returns
+    -------
+    pytest.MarkDecorator
+        a pytest.mark.skipif to use as either a test decorator or a
+        parametrization mark.
+    """
+    msg = f"Could not import '{package}'"
+    if min_version:
+        msg += f" satisfying a min_version of {min_version}"
+    return pytest.mark.skipif(
+        not bool(
+            import_optional_dependency(
+                package, errors="ignore", min_version=min_version
+            )
+        ),
+        reason=msg,
+    )
+
+
+skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit")
+skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows")
+skip_if_not_us_locale = pytest.mark.skipif(
+    locale.getlocale()[0] != "en_US",
+    reason=f"Set local {locale.getlocale()[0]} is not en_US",
+)
+skip_if_wasm = pytest.mark.skipif(
+    WASM,
+    reason="does not support wasm",
+)
+skip_if_thread_unsafe_warnings = pytest.mark.skipif(
+    not getattr(sys.flags, "context_aware_warnings", 0),
+    reason="Python warnings must be thread-safe for consistent results",
+)
+
+
+def parametrize_fixture_doc(*args) -> Callable[[F], F]:
+    """
+    Intended for use as a decorator for parametrized fixture,
+    this function will wrap the decorated function with a pytest
+    ``parametrize_fixture_doc`` mark. That mark will format
+    initial fixture docstring by replacing placeholders {0}, {1} etc
+    with parameters passed as arguments.
+
+    Parameters
+    ----------
+    args: iterable
+        Positional arguments for docstring.
+
+    Returns
+    -------
+    function
+        The decorated function wrapped within a pytest
+        ``parametrize_fixture_doc`` mark
+    """
+
+    def documented_fixture(fixture):
+        fixture.__doc__ = fixture.__doc__.format(*args)
+        return fixture
+
+    return documented_fixture
diff --git a/pandas/util/_tester.py b/pandas/util/_tester.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69ec8e123b0857af4d81118c57d2e337d7e7e50
--- /dev/null
+++ b/pandas/util/_tester.py
@@ -0,0 +1,60 @@
+"""
+Entrypoint for testing from the top-level namespace.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+
+from pandas.compat._optional import import_optional_dependency
+from pandas.util._decorators import set_module
+
+PKG = os.path.dirname(os.path.dirname(__file__))
+
+
+@set_module("pandas")
+def test(extra_args: list[str] | None = None, run_doctests: bool = False) -> None:  # noqa: PT028
+    """
+    Run the pandas test suite using pytest.
+
+    By default, runs with the marks -m "not slow and not network and not db"
+
+    Parameters
+    ----------
+    extra_args : list[str], default None
+        Extra marks to run the tests.
+    run_doctests : bool, default False
+        Whether to only run the Python and Cython doctests. If you would like to run
+        both doctests/regular tests, just append "--doctest-modules"/"--doctest-cython"
+        to extra_args.
+
+    See Also
+    --------
+    pytest.main : The main entry point for pytest testing framework.
+
+    Examples
+    --------
+    >>> pd.test()  # doctest: +SKIP
+    running: pytest...
+    """
+    pytest = import_optional_dependency("pytest")
+    import_optional_dependency("hypothesis")
+    cmd = ["-m not slow and not network and not db"]
+    if extra_args:
+        if not isinstance(extra_args, list):
+            extra_args = [extra_args]
+        cmd = extra_args
+    if run_doctests:
+        cmd = [
+            "--doctest-modules",
+            "--doctest-cython",
+            f"--ignore={os.path.join(PKG, 'tests')}",
+        ]
+    cmd += [PKG]
+    joined = " ".join(cmd)
+    print(f"running: pytest {joined}")
+    sys.exit(pytest.main(cmd))
+
+
+__all__ = ["test"]
diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py
new file mode 100644
index 0000000000000000000000000000000000000000..9097875782d227db0a7b342fb0fd096800c5accb
--- /dev/null
+++ b/pandas/util/_validators.py
@@ -0,0 +1,482 @@
+"""
+Module that contains many useful utilities
+for validating data or function arguments
+"""
+
+from __future__ import annotations
+
+from collections.abc import (
+    Iterable,
+    Sequence,
+)
+from typing import (
+    TypeVar,
+    overload,
+)
+
+import numpy as np
+
+from pandas._libs import lib
+from pandas._libs.missing import NA
+
+from pandas.core.dtypes.common import (
+    is_bool,
+    is_integer,
+)
+
+BoolishT = TypeVar("BoolishT", bool, int)
+BoolishNoneT = TypeVar("BoolishNoneT", bool, int, None)
+
+
+def _check_arg_length(fname, args, max_fname_arg_count, compat_args) -> None:
+    """
+    Checks whether 'args' has length of at most 'compat_args'. Raises
+    a TypeError if that is not the case, similar to in Python when a
+    function is called with too many arguments.
+    """
+    if max_fname_arg_count < 0:
+        raise ValueError("'max_fname_arg_count' must be non-negative")
+
+    if len(args) > len(compat_args):
+        max_arg_count = len(compat_args) + max_fname_arg_count
+        actual_arg_count = len(args) + max_fname_arg_count
+        argument = "argument" if max_arg_count == 1 else "arguments"
+
+        raise TypeError(
+            f"{fname}() takes at most {max_arg_count} {argument} "
+            f"({actual_arg_count} given)"
+        )
+
+
+def _check_for_default_values(fname, arg_val_dict, compat_args) -> None:
+    """
+    Check that the keys in `arg_val_dict` are mapped to their
+    default values as specified in `compat_args`.
+
+    Note that this function is to be called only when it has been
+    checked that arg_val_dict.keys() is a subset of compat_args
+    """
+    for key in arg_val_dict:
+        # try checking equality directly with '=' operator,
+        # as comparison may have been overridden for the left
+        # hand object
+        try:
+            v1 = arg_val_dict[key]
+            v2 = compat_args[key]
+
+            # check for None-ness otherwise we could end up
+            # comparing a numpy array vs None
+            if (v1 is not None and v2 is None) or (v1 is None and v2 is not None):
+                match = False
+            else:
+                match = v1 == v2
+
+            if not is_bool(match):
+                raise ValueError("'match' is not a boolean")
+
+        # could not compare them directly, so try comparison
+        # using the 'is' operator
+        except ValueError:
+            match = arg_val_dict[key] is compat_args[key]
+
+        if not match:
+            raise ValueError(
+                f"the '{key}' parameter is not supported in "
+                f"the pandas implementation of {fname}()"
+            )
+
+
+def validate_args(fname, args, max_fname_arg_count, compat_args) -> None:
+    """
+    Checks whether the length of the `*args` argument passed into a function
+    has at most `len(compat_args)` arguments and whether or not all of these
+    elements in `args` are set to their default values.
+
+    Parameters
+    ----------
+    fname : str
+        The name of the function being passed the `*args` parameter
+    args : tuple
+        The `*args` parameter passed into a function
+    max_fname_arg_count : int
+        The maximum number of arguments that the function `fname`
+        can accept, excluding those in `args`. Used for displaying
+        appropriate error messages. Must be non-negative.
+    compat_args : dict
+        A dictionary of keys and their associated default values.
+        In order to accommodate buggy behaviour in some versions of `numpy`,
+        where a signature displayed keyword arguments but then passed those
+        arguments **positionally** internally when calling downstream
+        implementations, a dict ensures that the original
+        order of the keyword arguments is enforced.
+
+    Raises
+    ------
+    TypeError
+        If `args` contains more values than there are `compat_args`
+    ValueError
+        If `args` contains values that do not correspond to those
+        of the default values specified in `compat_args`
+    """
+    _check_arg_length(fname, args, max_fname_arg_count, compat_args)
+
+    # We do this so that we can provide a more informative
+    # error message about the parameters that we are not
+    # supporting in the pandas implementation of 'fname'
+    kwargs = dict(zip(compat_args, args, strict=False))
+    _check_for_default_values(fname, kwargs, compat_args)
+
+
+def _check_for_invalid_keys(fname, kwargs, compat_args) -> None:
+    """
+    Checks whether 'kwargs' contains any keys that are not
+    in 'compat_args' and raises a TypeError if there is one.
+    """
+    # set(dict) --> set of the dictionary's keys
+    diff = set(kwargs) - set(compat_args)
+
+    if diff:
+        bad_arg = next(iter(diff))
+        raise TypeError(f"{fname}() got an unexpected keyword argument '{bad_arg}'")
+
+
+def validate_kwargs(fname, kwargs, compat_args) -> None:
+    """
+    Checks whether parameters passed to the **kwargs argument in a
+    function `fname` are valid parameters as specified in `*compat_args`
+    and whether or not they are set to their default values.
+
+    Parameters
+    ----------
+    fname : str
+        The name of the function being passed the `**kwargs` parameter
+    kwargs : dict
+        The `**kwargs` parameter passed into `fname`
+    compat_args: dict
+        A dictionary of keys that `kwargs` is allowed to have and their
+        associated default values
+
+    Raises
+    ------
+    TypeError if `kwargs` contains keys not in `compat_args`
+    ValueError if `kwargs` contains keys in `compat_args` that do not
+    map to the default values specified in `compat_args`
+    """
+    kwds = kwargs.copy()
+    _check_for_invalid_keys(fname, kwargs, compat_args)
+    _check_for_default_values(fname, kwds, compat_args)
+
+
+def validate_args_and_kwargs(
+    fname, args, kwargs, max_fname_arg_count, compat_args
+) -> None:
+    """
+    Checks whether parameters passed to the *args and **kwargs argument in a
+    function `fname` are valid parameters as specified in `*compat_args`
+    and whether or not they are set to their default values.
+
+    Parameters
+    ----------
+    fname: str
+        The name of the function being passed the `**kwargs` parameter
+    args: tuple
+        The `*args` parameter passed into a function
+    kwargs: dict
+        The `**kwargs` parameter passed into `fname`
+    max_fname_arg_count: int
+        The minimum number of arguments that the function `fname`
+        requires, excluding those in `args`. Used for displaying
+        appropriate error messages. Must be non-negative.
+    compat_args: dict
+        A dictionary of keys that `kwargs` is allowed to
+        have and their associated default values.
+
+    Raises
+    ------
+    TypeError if `args` contains more values than there are
+    `compat_args` OR `kwargs` contains keys not in `compat_args`
+    ValueError if `args` contains values not at the default value (`None`)
+    `kwargs` contains keys in `compat_args` that do not map to the default
+    value as specified in `compat_args`
+
+    See Also
+    --------
+    validate_args : Purely args validation.
+    validate_kwargs : Purely kwargs validation.
+
+    """
+    # Check that the total number of arguments passed in (i.e.
+    # args and kwargs) does not exceed the length of compat_args
+    _check_arg_length(
+        fname, args + tuple(kwargs.values()), max_fname_arg_count, compat_args
+    )
+
+    # Check there is no overlap with the positional and keyword
+    # arguments, similar to what is done in actual Python functions
+    args_dict = dict(zip(compat_args, args, strict=False))
+
+    for key in args_dict:
+        if key in kwargs:
+            raise TypeError(
+                f"{fname}() got multiple values for keyword argument '{key}'"
+            )
+
+    kwargs.update(args_dict)
+    validate_kwargs(fname, kwargs, compat_args)
+
+
+def validate_bool_kwarg(
+    value: BoolishNoneT,
+    arg_name: str,
+    none_allowed: bool = True,
+    int_allowed: bool = False,
+) -> BoolishNoneT:
+    """
+    Ensure that argument passed in arg_name can be interpreted as boolean.
+
+    Parameters
+    ----------
+    value : bool
+        Value to be validated.
+    arg_name : str
+        Name of the argument. To be reflected in the error message.
+    none_allowed : bool, default True
+        Whether to consider None to be a valid boolean.
+    int_allowed : bool, default False
+        Whether to consider integer value to be a valid boolean.
+
+    Returns
+    -------
+    value
+        The same value as input.
+
+    Raises
+    ------
+    ValueError
+        If the value is not a valid boolean.
+    """
+    good_value = is_bool(value)
+    if none_allowed:
+        good_value = good_value or (value is None)
+
+    if int_allowed:
+        good_value = good_value or isinstance(value, int)
+
+    if not good_value:
+        raise ValueError(
+            f'For argument "{arg_name}" expected type bool, received '
+            f"type {type(value).__name__}."
+        )
+    return value
+
+
+def validate_na_arg(value, name: str):
+    """
+    Validate na arguments.
+
+    Parameters
+    ----------
+    value : object
+        Value to validate.
+    name : str
+        Name of the argument, used to raise an informative error message.
+
+    Raises
+    ______
+    ValueError
+        When ``value`` is determined to be invalid.
+    """
+    if (
+        value is lib.no_default
+        or isinstance(value, bool)
+        or value is None
+        or value is NA
+        or (lib.is_float(value) and np.isnan(value))
+    ):
+        return
+    raise ValueError(f"{name} must be None, pd.NA, np.nan, True, or False; got {value}")
+
+
+def validate_fillna_kwargs(value, method, validate_scalar_dict_value: bool = True):
+    """
+    Validate the keyword arguments to 'fillna'.
+
+    This checks that exactly one of 'value' and 'method' is specified.
+    If 'method' is specified, this validates that it's a valid method.
+
+    Parameters
+    ----------
+    value, method : object
+        The 'value' and 'method' keyword arguments for 'fillna'.
+    validate_scalar_dict_value : bool, default True
+        Whether to validate that 'value' is a scalar or dict. Specifically,
+        validate that it is not a list or tuple.
+
+    Returns
+    -------
+    value, method : object
+    """
+    from pandas.core.missing import clean_fill_method
+
+    if value is None and method is None:
+        raise ValueError("Must specify a fill 'value' or 'method'.")
+    if value is None and method is not None:
+        method = clean_fill_method(method)
+
+    elif value is not None and method is None:
+        if validate_scalar_dict_value and isinstance(value, (list, tuple)):
+            raise TypeError(
+                '"value" parameter must be a scalar or dict, but '
+                f'you passed a "{type(value).__name__}"'
+            )
+
+    elif value is not None and method is not None:
+        raise ValueError("Cannot specify both 'value' and 'method'.")
+
+    return value, method
+
+
+def validate_percentile(q: float | Iterable[float]) -> np.ndarray:
+    """
+    Validate percentiles (used by describe and quantile).
+
+    This function checks if the given float or iterable of floats is a valid percentile
+    otherwise raises a ValueError.
+
+    Parameters
+    ----------
+    q: float or iterable of floats
+        A single percentile or an iterable of percentiles.
+
+    Returns
+    -------
+    ndarray
+        An ndarray of the percentiles if valid.
+
+    Raises
+    ------
+    ValueError if percentiles are not in given interval([0, 1]).
+    """
+    q_arr = np.asarray(q)
+    # Don't change this to an f-string. The string formatting
+    # is too expensive for cases where we don't need it.
+    msg = "percentiles should all be in the interval [0, 1]"
+    if q_arr.ndim == 0:
+        if not 0 <= q_arr <= 1:
+            raise ValueError(msg)
+    elif not all(0 <= qs <= 1 for qs in q_arr):
+        raise ValueError(msg)
+    return q_arr
+
+
+@overload
+def validate_ascending(ascending: BoolishT) -> BoolishT: ...
+
+
+@overload
+def validate_ascending(ascending: Sequence[BoolishT]) -> list[BoolishT]: ...
+
+
+def validate_ascending(
+    ascending: bool | int | Sequence[BoolishT],
+) -> bool | int | list[BoolishT]:
+    """Validate ``ascending`` kwargs for ``sort_index`` method."""
+    kwargs = {"none_allowed": False, "int_allowed": True}
+    if not isinstance(ascending, Sequence):
+        return validate_bool_kwarg(ascending, "ascending", **kwargs)
+
+    return [validate_bool_kwarg(item, "ascending", **kwargs) for item in ascending]
+
+
+def validate_endpoints(closed: str | None) -> tuple[bool, bool]:
+    """
+    Check that the `closed` argument is among [None, "left", "right"]
+
+    Parameters
+    ----------
+    closed : {None, "left", "right"}
+
+    Returns
+    -------
+    left_closed : bool
+    right_closed : bool
+
+    Raises
+    ------
+    ValueError : if argument is not among valid values
+    """
+    left_closed = False
+    right_closed = False
+
+    if closed is None:
+        left_closed = True
+        right_closed = True
+    elif closed == "left":
+        left_closed = True
+    elif closed == "right":
+        right_closed = True
+    else:
+        raise ValueError("Closed has to be either 'left', 'right' or None")
+
+    return left_closed, right_closed
+
+
+def validate_inclusive(inclusive: str | None) -> tuple[bool, bool]:
+    """
+    Check that the `inclusive` argument is among {"both", "neither", "left", "right"}.
+
+    Parameters
+    ----------
+    inclusive : {"both", "neither", "left", "right"}
+
+    Returns
+    -------
+    left_right_inclusive : tuple[bool, bool]
+
+    Raises
+    ------
+    ValueError : if argument is not among valid values
+    """
+    left_right_inclusive: tuple[bool, bool] | None = None
+
+    if isinstance(inclusive, str):
+        left_right_inclusive = {
+            "both": (True, True),
+            "left": (True, False),
+            "right": (False, True),
+            "neither": (False, False),
+        }.get(inclusive)
+
+    if left_right_inclusive is None:
+        raise ValueError(
+            "Inclusive has to be either 'both', 'neither', 'left' or 'right'"
+        )
+
+    return left_right_inclusive
+
+
+def validate_insert_loc(loc: int, length: int) -> int:
+    """
+    Check that we have an integer between -length and length, inclusive.
+
+    Standardize negative loc to within [0, length].
+
+    The exceptions we raise on failure match np.insert.
+    """
+    if not is_integer(loc):
+        raise TypeError(f"loc must be an integer between -{length} and {length}")
+
+    if loc < 0:
+        loc += length
+    if not 0 <= loc <= length:
+        raise IndexError(f"loc must be an integer between -{length} and {length}")
+    return loc  # pyright: ignore[reportReturnType]
+
+
+def check_dtype_backend(dtype_backend) -> None:
+    if dtype_backend is not lib.no_default:
+        if dtype_backend not in ["numpy_nullable", "pyarrow"]:
+            raise ValueError(
+                f"dtype_backend {dtype_backend} is invalid, only 'numpy_nullable' and "
+                f"'pyarrow' are allowed.",
+            )
diff --git a/pyarrow/include/arrow/acero/accumulation_queue.h b/pyarrow/include/arrow/acero/accumulation_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0e0b85a4f3d0504ad0e09237e498c001c55f96a
--- /dev/null
+++ b/pyarrow/include/arrow/acero/accumulation_queue.h
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <optional>
+#include <vector>
+
+#include "arrow/acero/visibility.h"
+#include "arrow/compute/exec.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace acero {
+namespace util {
+
+using arrow::compute::ExecBatch;
+
+/// \brief A container that accumulates batches until they are ready to
+///        be processed.
+class ARROW_ACERO_EXPORT AccumulationQueue {
+ public:
+  AccumulationQueue() : row_count_(0) {}
+  ~AccumulationQueue() = default;
+
+  // We should never be copying ExecBatch around
+  AccumulationQueue(const AccumulationQueue&) = delete;
+  AccumulationQueue& operator=(const AccumulationQueue&) = delete;
+
+  AccumulationQueue(AccumulationQueue&& that);
+  AccumulationQueue& operator=(AccumulationQueue&& that);
+
+  void Concatenate(AccumulationQueue&& that);
+  void InsertBatch(ExecBatch batch);
+  int64_t row_count() { return row_count_; }
+  size_t batch_count() { return batches_.size(); }
+  bool empty() const { return batches_.empty(); }
+  void Clear();
+  ExecBatch& operator[](size_t i);
+
+ private:
+  int64_t row_count_;
+  std::vector<ExecBatch> batches_;
+};
+
+/// A queue that sequences incoming batches
+///
+/// This can be used when a node needs to do some kind of ordered processing on
+/// the stream.
+///
+/// Batches can be inserted in any order.  The process_callback will be called on
+/// the batches, in order, without reentrant calls. For this reason the callback
+/// should be quick.
+///
+/// For example, in a top-n node, the process callback should determine how many
+/// rows need to be delivered for the given batch, and then return a task to actually
+/// deliver those rows.
+class ARROW_ACERO_EXPORT SequencingQueue {
+ public:
+  using Task = std::function<Status()>;
+
+  /// Strategy that describes how to handle items
+  class Processor {
+   public:
+    /// Process the batch, potentially generating a task
+    ///
+    /// This method will be called on each batch in order.  Calls to this method
+    /// will be serialized and it will not be called reentrantly.  This makes it
+    /// safe to do things that rely on order but minimal time should be spent here
+    /// to avoid becoming a bottleneck.
+    ///
+    /// \return a follow-up task that will be scheduled.  The follow-up task(s) are
+    ///         is not guaranteed to run in any particular order.  If nullopt is
+    ///         returned then nothing will be scheduled.
+    virtual Result<std::optional<Task>> Process(ExecBatch batch) = 0;
+    /// Schedule a task
+    virtual void Schedule(Task task) = 0;
+  };
+
+  virtual ~SequencingQueue() = default;
+
+  /// Insert a batch into the queue
+  ///
+  /// This will insert the batch into the queue.  If this batch was the next batch
+  /// to deliver then this will trigger 1+ calls to the process callback to generate
+  /// 1+ tasks.
+  ///
+  /// The task generated by this call will be executed immediately.  The remaining
+  /// tasks will be scheduled using the schedule callback.
+  ///
+  /// From a data pipeline perspective the sequencing queue is a "sometimes" breaker.  If
+  /// a task arrives in order then this call will usually execute the downstream pipeline.
+  /// If this task arrives early then this call will only queue the data.
+  virtual Status InsertBatch(ExecBatch batch) = 0;
+
+  /// Create a queue
+  /// \param processor describes how to process the batches, must outlive the queue
+  static std::unique_ptr<SequencingQueue> Make(Processor* processor);
+};
+
+/// A queue that sequences incoming batches
+///
+/// Unlike SequencingQueue the Process method is not expected to schedule new tasks.
+///
+/// If a batch arrives and another thread is currently processing then the batch
+/// will be queued and control will return.  In other words, delivery of batches will
+/// not block on the Process method.
+///
+/// It can be helpful to think of this as if a dedicated thread is running Process as
+/// batches arrive
+class ARROW_ACERO_EXPORT SerialSequencingQueue {
+ public:
+  /// Strategy that describes how to handle items
+  class Processor {
+   public:
+    virtual ~Processor() = default;
+    /// Process the batch
+    ///
+    /// This method will be called on each batch in order.  Calls to this method
+    /// will be serialized and it will not be called reentrantly.  This makes it
+    /// safe to do things that rely on order.
+    ///
+    /// If this falls behind then data may accumulate
+    ///
+    /// TODO: Could add backpressure if needed but right now all uses of this should
+    ///       be pretty fast and so are unlikely to block.
+    virtual Status Process(ExecBatch batch) = 0;
+  };
+
+  virtual ~SerialSequencingQueue() = default;
+
+  /// Insert a batch into the queue
+  ///
+  /// This will insert the batch into the queue.  If this batch was the next batch
+  /// to deliver then this may trigger calls to the processor which will be run
+  /// as part of this call.
+  virtual Status InsertBatch(ExecBatch batch) = 0;
+
+  /// Create a queue
+  /// \param processor describes how to process the batches, must outlive the queue
+  static std::unique_ptr<SerialSequencingQueue> Make(Processor* processor);
+};
+
+}  // namespace util
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/aggregate_node.h b/pyarrow/include/arrow/acero/aggregate_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c6fea16a8acc75046309708221189d368f605c0
--- /dev/null
+++ b/pyarrow/include/arrow/acero/aggregate_node.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/acero/visibility.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/test_util_internal.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace acero {
+namespace aggregate {
+
+using compute::Aggregate;
+using compute::default_exec_context;
+using compute::ExecContext;
+
+/// \brief Make the output schema of an aggregate node
+///
+/// The output schema is determined by the aggregation kernels, which may depend on the
+/// ExecContext argument. To guarantee correct results, the same ExecContext argument
+/// should be used in execution.
+///
+/// \param[in] input_schema the schema of the input to the node
+/// \param[in] keys the grouping keys for the aggregation
+/// \param[in] segment_keys the segmenting keys for the aggregation
+/// \param[in] aggregates the aggregates for the aggregation
+/// \param[in] exec_ctx the execution context for the aggregation
+ARROW_ACERO_EXPORT Result<std::shared_ptr<Schema>> MakeOutputSchema(
+    const std::shared_ptr<Schema>& input_schema, const std::vector<FieldRef>& keys,
+    const std::vector<FieldRef>& segment_keys, const std::vector<Aggregate>& aggregates,
+    ExecContext* exec_ctx = default_exec_context());
+
+}  // namespace aggregate
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/api.h b/pyarrow/include/arrow/acero/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9724fd512d0b56dfa3a24647b3885677c92b534
--- /dev/null
+++ b/pyarrow/include/arrow/acero/api.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+/// \defgroup acero-api Utilities for creating and executing execution plans
+/// @{
+/// @}
+
+/// \defgroup acero-nodes Options classes for the various exec nodes
+/// @{
+/// @}
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/acero/options.h"
diff --git a/pyarrow/include/arrow/acero/asof_join_node.h b/pyarrow/include/arrow/acero/asof_join_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a0ce8fd386b01ac868bac3d4d026a309e351cb3
--- /dev/null
+++ b/pyarrow/include/arrow/acero/asof_join_node.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <vector>
+
+#include "arrow/acero/options.h"
+#include "arrow/acero/visibility.h"
+#include "arrow/compute/exec.h"
+#include "arrow/type.h"
+
+namespace arrow {
+namespace acero {
+namespace asofjoin {
+
+using AsofJoinKeys = AsofJoinNodeOptions::Keys;
+
+/// \brief Make the output schema of an as-of-join node
+///
+/// \param[in] input_schema the schema of each input to the node
+/// \param[in] input_keys the key of each input to the node
+ARROW_ACERO_EXPORT Result<std::shared_ptr<Schema>> MakeOutputSchema(
+    const std::vector<std::shared_ptr<Schema>>& input_schema,
+    const std::vector<AsofJoinKeys>& input_keys);
+
+}  // namespace asofjoin
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/backpressure_handler.h b/pyarrow/include/arrow/acero/backpressure_handler.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6a47e60197a51f85c2279f00ff8851c78a264f5
--- /dev/null
+++ b/pyarrow/include/arrow/acero/backpressure_handler.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include "arrow/acero/exec_plan.h"
+#include "arrow/acero/options.h"
+
+#include <memory>
+
+namespace arrow::acero {
+
+class BackpressureHandler {
+ private:
+  BackpressureHandler(size_t low_threshold, size_t high_threshold,
+                      std::unique_ptr<BackpressureControl> backpressure_control)
+      : low_threshold_(low_threshold),
+        high_threshold_(high_threshold),
+        backpressure_control_(std::move(backpressure_control)) {}
+
+ public:
+  static Result<BackpressureHandler> Make(
+      size_t low_threshold, size_t high_threshold,
+      std::unique_ptr<BackpressureControl> backpressure_control) {
+    if (low_threshold >= high_threshold) {
+      return Status::Invalid("low threshold (", low_threshold,
+                             ") must be less than high threshold (", high_threshold, ")");
+    }
+    if (backpressure_control == NULLPTR) {
+      return Status::Invalid("null backpressure control parameter");
+    }
+    BackpressureHandler backpressure_handler(low_threshold, high_threshold,
+                                             std::move(backpressure_control));
+    return backpressure_handler;
+  }
+
+  void Handle(size_t start_level, size_t end_level) {
+    if (start_level < high_threshold_ && end_level >= high_threshold_) {
+      backpressure_control_->Pause();
+    } else if (start_level > low_threshold_ && end_level <= low_threshold_) {
+      backpressure_control_->Resume();
+    }
+  }
+
+ private:
+  size_t low_threshold_;
+  size_t high_threshold_;
+  std::unique_ptr<BackpressureControl> backpressure_control_;
+};
+
+}  // namespace arrow::acero
diff --git a/pyarrow/include/arrow/acero/benchmark_util.h b/pyarrow/include/arrow/acero/benchmark_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ba8553887c03f876b6e08f031f5641170c2e09f
--- /dev/null
+++ b/pyarrow/include/arrow/acero/benchmark_util.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/acero/test_util_internal.h"
+#include "arrow/compute/exec.h"
+
+namespace arrow {
+
+namespace acero {
+
+Status BenchmarkNodeOverhead(benchmark::State& state, int32_t num_batches,
+                             int32_t batch_size, arrow::acero::BatchesWithSchema data,
+                             std::vector<arrow::acero::Declaration>& node_declarations,
+                             arrow::MemoryPool* pool = default_memory_pool());
+
+Status BenchmarkIsolatedNodeOverhead(benchmark::State& state,
+                                     arrow::compute::Expression expr, int32_t num_batches,
+                                     int32_t batch_size,
+                                     arrow::acero::BatchesWithSchema data,
+                                     std::string factory_name,
+                                     arrow::acero::ExecNodeOptions& options,
+                                     arrow::MemoryPool* pool = default_memory_pool());
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/bloom_filter.h b/pyarrow/include/arrow/acero/bloom_filter.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f9fe171baeb39f5347d112921666ba057cb56b6
--- /dev/null
+++ b/pyarrow/include/arrow/acero/bloom_filter.h
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <memory>
+
+#include "arrow/acero/partition_util.h"
+#include "arrow/acero/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/simd.h"
+
+namespace arrow {
+namespace acero {
+
+// A set of pre-generated bit masks from a 64-bit word.
+//
+// It is used to map selected bits of hash to a bit mask that will be used in
+// a Bloom filter.
+//
+// These bit masks need to look random and need to have a similar fractions of
+// bits set in order for a Bloom filter to have a low false positives rate.
+//
+struct ARROW_ACERO_EXPORT BloomFilterMasks {
+  // Generate all masks as a single bit vector. Each bit offset in this bit
+  // vector corresponds to a single mask.
+  // In each consecutive kBitsPerMask bits, there must be between
+  // kMinBitsSet and kMaxBitsSet bits set.
+  //
+  BloomFilterMasks();
+
+  inline uint64_t mask(int bit_offset) {
+#if ARROW_LITTLE_ENDIAN
+    return (arrow::util::SafeLoadAs<uint64_t>(masks_ + bit_offset / 8) >>
+            (bit_offset % 8)) &
+           kFullMask;
+#else
+    return (BYTESWAP(arrow::util::SafeLoadAs<uint64_t>(masks_ + bit_offset / 8)) >>
+            (bit_offset % 8)) &
+           kFullMask;
+#endif
+  }
+
+  // Masks are 57 bits long because then they can be accessed at an
+  // arbitrary bit offset using a single unaligned 64-bit load instruction.
+  //
+  static constexpr int kBitsPerMask = 57;
+  static constexpr uint64_t kFullMask = (1ULL << kBitsPerMask) - 1;
+
+  // Minimum and maximum number of bits set in each mask.
+  // This constraint is enforced when generating the bit masks.
+  // Values should be close to each other and chosen as to minimize a Bloom
+  // filter false positives rate.
+  //
+  static constexpr int kMinBitsSet = 4;
+  static constexpr int kMaxBitsSet = 5;
+
+  // Number of generated masks.
+  // Having more masks to choose will improve false positives rate of Bloom
+  // filter but will also use more memory, which may lead to more CPU cache
+  // misses.
+  // The chosen value results in using only a few cache-lines for mask lookups,
+  // while providing a good variety of available bit masks.
+  //
+  static constexpr int kLogNumMasks = 10;
+  static constexpr int kNumMasks = 1 << kLogNumMasks;
+
+  // Data of masks. Masks are stored in a single bit vector. Nth mask is
+  // kBitsPerMask bits starting at bit offset N.
+  //
+  static constexpr int kTotalBytes = (kNumMasks + 64) / 8;
+  uint8_t masks_[kTotalBytes];
+};
+
+// A variant of a blocked Bloom filter implementation.
+// A Bloom filter is a data structure that provides approximate membership test
+// functionality based only on the hash of the key. Membership test may return
+// false positives but not false negatives. Approximation of the result allows
+// in general case (for arbitrary data types of keys) to save on both memory and
+// lookup cost compared to the accurate membership test.
+// The accurate test may sometimes still be cheaper for a specific data types
+// and inputs, e.g. integers from a small range.
+//
+// This blocked Bloom filter is optimized for use in hash joins, to achieve a
+// good balance between the size of the filter, the cost of its building and
+// querying and the rate of false positives.
+//
+class ARROW_ACERO_EXPORT BlockedBloomFilter {
+  friend class BloomFilterBuilder_SingleThreaded;
+  friend class BloomFilterBuilder_Parallel;
+
+ public:
+  BlockedBloomFilter() : log_num_blocks_(0), num_blocks_(0), blocks_(NULLPTR) {}
+
+  inline bool Find(uint64_t hash) const {
+    uint64_t m = mask(hash);
+    uint64_t b = blocks_[block_id(hash)];
+    return (b & m) == m;
+  }
+
+  // Uses SIMD if available for smaller Bloom filters.
+  // Uses memory prefetching for larger Bloom filters.
+  //
+  void Find(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes,
+            uint8_t* result_bit_vector, bool enable_prefetch = true) const;
+  void Find(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes,
+            uint8_t* result_bit_vector, bool enable_prefetch = true) const;
+
+  int log_num_blocks() const { return log_num_blocks_; }
+
+  int NumHashBitsUsed() const;
+
+  bool IsSameAs(const BlockedBloomFilter* other) const;
+
+  int64_t NumBitsSet() const;
+
+  // Folding of a block Bloom filter after the initial version
+  // has been built.
+  //
+  // One of the parameters for creation of Bloom filter is the number
+  // of bits allocated for it. The more bits allocated, the lower the
+  // probability of false positives. A good heuristic is to aim for
+  // half of the bits set in the constructed Bloom filter. This should
+  // result in a good trade off between size (and following cost of
+  // memory accesses) and false positives rate.
+  //
+  // There might have been many duplicate keys in the input provided
+  // to Bloom filter builder. In that case the resulting bit vector
+  // would be more sparse then originally intended. It is possible to
+  // easily correct that and cut in half the size of Bloom filter
+  // after it has already been constructed. The process to do that is
+  // approximately equal to OR-ing bits from upper and lower half (the
+  // way we address these bits when inserting or querying a hash makes
+  // such folding in half possible).
+  //
+  // We will keep folding as long as the fraction of bits set is less
+  // than 1/4. The resulting bit vector density should be in the [1/4,
+  // 1/2) range.
+  //
+  void Fold();
+
+ private:
+  Status CreateEmpty(int64_t num_rows_to_insert, MemoryPool* pool);
+
+  inline void Insert(uint64_t hash) {
+    uint64_t m = mask(hash);
+    uint64_t& b = blocks_[block_id(hash)];
+    b |= m;
+  }
+
+  void Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes);
+  void Insert(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes);
+
+  inline uint64_t mask(uint64_t hash) const {
+    // The lowest bits of hash are used to pick mask index.
+    //
+    int mask_id = static_cast<int>(hash & (BloomFilterMasks::kNumMasks - 1));
+    uint64_t result = masks_.mask(mask_id);
+
+    // The next set of hash bits is used to pick the amount of bit
+    // rotation of the mask.
+    //
+    int rotation = (hash >> BloomFilterMasks::kLogNumMasks) & 63;
+    result = ROTL64(result, rotation);
+
+    return result;
+  }
+
+  inline int64_t block_id(uint64_t hash) const {
+    // The next set of hash bits following the bits used to select a
+    // mask is used to pick block id (index of 64-bit word in a bit
+    // vector).
+    //
+    return (hash >> (BloomFilterMasks::kLogNumMasks + 6)) & (num_blocks_ - 1);
+  }
+
+  template <typename T>
+  inline void InsertImp(int64_t num_rows, const T* hashes);
+
+  template <typename T>
+  inline void FindImp(int64_t num_rows, const T* hashes, uint8_t* result_bit_vector,
+                      bool enable_prefetch) const;
+
+  void SingleFold(int num_folds);
+
+#if defined(ARROW_HAVE_RUNTIME_AVX2)
+  inline __m256i mask_avx2(__m256i hash) const;
+  inline __m256i block_id_avx2(__m256i hash) const;
+  int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes);
+  int64_t Insert_avx2(int64_t num_rows, const uint64_t* hashes);
+  template <typename T>
+  int64_t InsertImp_avx2(int64_t num_rows, const T* hashes);
+  int64_t Find_avx2(int64_t num_rows, const uint32_t* hashes,
+                    uint8_t* result_bit_vector) const;
+  int64_t Find_avx2(int64_t num_rows, const uint64_t* hashes,
+                    uint8_t* result_bit_vector) const;
+  template <typename T>
+  int64_t FindImp_avx2(int64_t num_rows, const T* hashes,
+                       uint8_t* result_bit_vector) const;
+#endif
+
+  bool UsePrefetch() const {
+    return num_blocks_ * sizeof(uint64_t) > kPrefetchLimitBytes;
+  }
+
+  static constexpr int64_t kPrefetchLimitBytes = 256 * 1024;
+
+  static BloomFilterMasks masks_;
+
+  // Total number of bits used by block Bloom filter must be a power
+  // of 2.
+  //
+  int log_num_blocks_;
+  int64_t num_blocks_;
+
+  // Buffer allocated to store an array of power of 2 64-bit blocks.
+  //
+  std::shared_ptr<Buffer> buf_;
+  // Pointer to mutable data owned by Buffer
+  //
+  uint64_t* blocks_;
+};
+
+// We have two separate implementations of building a Bloom filter, multi-threaded and
+// single-threaded.
+//
+// Single threaded version is useful in two ways:
+// a) It allows to verify parallel implementation in tests (the single threaded one is
+// simpler and can be used as the source of truth).
+// b) It is preferred for small and medium size Bloom filters, because it skips extra
+// synchronization related steps from parallel variant (partitioning and taking locks).
+//
+enum class BloomFilterBuildStrategy {
+  SINGLE_THREADED = 0,
+  PARALLEL = 1,
+};
+
+class ARROW_ACERO_EXPORT BloomFilterBuilder {
+ public:
+  virtual ~BloomFilterBuilder() = default;
+  virtual Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool,
+                       int64_t num_rows, int64_t num_batches,
+                       BlockedBloomFilter* build_target) = 0;
+  virtual int64_t num_tasks() const { return 0; }
+  virtual Status PushNextBatch(size_t thread_index, int64_t num_rows,
+                               const uint32_t* hashes) = 0;
+  virtual Status PushNextBatch(size_t thread_index, int64_t num_rows,
+                               const uint64_t* hashes) = 0;
+  virtual void CleanUp() {}
+  static std::unique_ptr<BloomFilterBuilder> Make(BloomFilterBuildStrategy strategy);
+};
+
+class ARROW_ACERO_EXPORT BloomFilterBuilder_SingleThreaded : public BloomFilterBuilder {
+ public:
+  Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool,
+               int64_t num_rows, int64_t num_batches,
+               BlockedBloomFilter* build_target) override;
+
+  Status PushNextBatch(size_t /*thread_index*/, int64_t num_rows,
+                       const uint32_t* hashes) override;
+
+  Status PushNextBatch(size_t /*thread_index*/, int64_t num_rows,
+                       const uint64_t* hashes) override;
+
+ private:
+  template <typename T>
+  void PushNextBatchImp(int64_t num_rows, const T* hashes);
+
+  int64_t hardware_flags_;
+  BlockedBloomFilter* build_target_;
+};
+
+class ARROW_ACERO_EXPORT BloomFilterBuilder_Parallel : public BloomFilterBuilder {
+ public:
+  Status Begin(size_t num_threads, int64_t hardware_flags, MemoryPool* pool,
+               int64_t num_rows, int64_t num_batches,
+               BlockedBloomFilter* build_target) override;
+
+  Status PushNextBatch(size_t thread_id, int64_t num_rows,
+                       const uint32_t* hashes) override;
+
+  Status PushNextBatch(size_t thread_id, int64_t num_rows,
+                       const uint64_t* hashes) override;
+
+  void CleanUp() override;
+
+ private:
+  template <typename T>
+  void PushNextBatchImp(size_t thread_id, int64_t num_rows, const T* hashes);
+
+  int64_t hardware_flags_;
+  BlockedBloomFilter* build_target_;
+  int log_num_prtns_;
+  struct ThreadLocalState {
+    std::vector<uint32_t> partitioned_hashes_32;
+    std::vector<uint64_t> partitioned_hashes_64;
+    std::vector<uint16_t> partition_ranges;
+    std::vector<int> unprocessed_partition_ids;
+  };
+  std::vector<ThreadLocalState> thread_local_states_;
+  PartitionLocks prtn_locks_;
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/exec_plan.h b/pyarrow/include/arrow/acero/exec_plan.h
new file mode 100644
index 0000000000000000000000000000000000000000..dba6c64ddc8379f7a8e6aa666f55555ced6c78aa
--- /dev/null
+++ b/pyarrow/include/arrow/acero/exec_plan.h
@@ -0,0 +1,819 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/acero/type_fwd.h"
+#include "arrow/acero/visibility.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/ordering.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/future.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/tracing.h"
+#include "arrow/util/type_fwd.h"
+
+namespace arrow {
+
+using compute::ExecBatch;
+using compute::ExecContext;
+using compute::FunctionRegistry;
+using compute::GetFunctionRegistry;
+using compute::Ordering;
+using compute::threaded_exec_context;
+
+namespace acero {
+
+/// \addtogroup acero-internals
+/// @{
+
+class ARROW_ACERO_EXPORT ExecPlan : public std::enable_shared_from_this<ExecPlan> {
+ public:
+  // This allows operators to rely on signed 16-bit indices
+  static const uint32_t kMaxBatchSize = 1 << 15;
+  using NodeVector = std::vector<ExecNode*>;
+
+  virtual ~ExecPlan() = default;
+
+  QueryContext* query_context();
+
+  /// \brief retrieve the nodes in the plan
+  const NodeVector& nodes() const;
+
+  /// Make an empty exec plan
+  static Result<std::shared_ptr<ExecPlan>> Make(
+      QueryOptions options, ExecContext exec_context = *threaded_exec_context(),
+      std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+  static Result<std::shared_ptr<ExecPlan>> Make(
+      ExecContext exec_context = *threaded_exec_context(),
+      std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+  static Result<std::shared_ptr<ExecPlan>> Make(
+      QueryOptions options, ExecContext* exec_context,
+      std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+  static Result<std::shared_ptr<ExecPlan>> Make(
+      ExecContext* exec_context,
+      std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+  ExecNode* AddNode(std::unique_ptr<ExecNode> node);
+
+  template <typename Node, typename... Args>
+  Node* EmplaceNode(Args&&... args) {
+    std::unique_ptr<Node> node{new Node{std::forward<Args>(args)...}};
+    auto out = node.get();
+    AddNode(std::move(node));
+    return out;
+  }
+
+  Status Validate();
+
+  /// \brief Start producing on all nodes
+  ///
+  /// Nodes are started in reverse topological order, such that any node
+  /// is started before all of its inputs.
+  void StartProducing();
+
+  /// \brief Stop producing on all nodes
+  ///
+  /// Triggers all sources to stop producing new data.  In order to cleanly stop the plan
+  /// will continue to run any tasks that are already in progress.  The caller should
+  /// still wait for `finished` to complete before destroying the plan.
+  void StopProducing();
+
+  /// \brief A future which will be marked finished when all tasks have finished.
+  Future<> finished();
+
+  /// \brief Return whether the plan has non-empty metadata
+  bool HasMetadata() const;
+
+  /// \brief Return the plan's attached metadata
+  std::shared_ptr<const KeyValueMetadata> metadata() const;
+
+  std::string ToString() const;
+};
+
+// Acero can be extended by providing custom implementations of ExecNode.  The methods
+// below are documented in detail and provide careful instruction on how to fulfill the
+// ExecNode contract.  It's suggested you familiarize yourself with the Acero
+// documentation in the C++ user guide.
+class ARROW_ACERO_EXPORT ExecNode {
+ public:
+  using NodeVector = std::vector<ExecNode*>;
+
+  virtual ~ExecNode() = default;
+
+  virtual const char* kind_name() const = 0;
+
+  // The number of inputs expected by this node
+  int num_inputs() const { return static_cast<int>(inputs_.size()); }
+
+  /// This node's predecessors in the exec plan
+  const NodeVector& inputs() const { return inputs_; }
+
+  /// True if the plan has no output schema (is a sink)
+  bool is_sink() const { return !output_schema_; }
+
+  /// \brief Labels identifying the function of each input.
+  const std::vector<std::string>& input_labels() const { return input_labels_; }
+
+  /// This node's successor in the exec plan
+  const ExecNode* output() const { return output_; }
+
+  /// The datatypes for batches produced by this node
+  const std::shared_ptr<Schema>& output_schema() const { return output_schema_; }
+
+  /// This node's exec plan
+  ExecPlan* plan() { return plan_; }
+
+  /// \brief An optional label, for display and debugging
+  ///
+  /// There is no guarantee that this value is non-empty or unique.
+  const std::string& label() const { return label_; }
+  void SetLabel(std::string label) { label_ = std::move(label); }
+
+  virtual Status Validate() const;
+
+  /// \brief the ordering of the output batches
+  ///
+  /// This does not guarantee the batches will be emitted by this node
+  /// in order.  Instead it guarantees that the batches will have their
+  /// ExecBatch::index property set in a way that respects this ordering.
+  ///
+  /// In other words, given the ordering {{"x", SortOrder::Ascending}} we
+  /// know that all values of x in a batch with index N will be less than
+  /// or equal to all values of x in a batch with index N+k (assuming k > 0).
+  /// Furthermore, we also know that values will be sorted within a batch.
+  /// Any row N will have a value of x that is less than the value for
+  /// any row N+k.
+  ///
+  /// Note that an ordering can be both Ordering::Unordered and Ordering::Implicit.
+  /// A node's output should be marked Ordering::Unordered if the order is
+  /// non-deterministic.  For example, a hash-join has no predictable output order.
+  ///
+  /// If the ordering is Ordering::Implicit then there is a meaningful order but that
+  /// ordering is not represented by any column in the data.  The most common case for
+  /// this is when reading data from an in-memory table.  The data has an implicit "row
+  /// order" which is not necessarily represented in the data set.
+  ///
+  /// A filter or project node will not modify the ordering.  Nothing needs to be done
+  /// other than ensure the index assigned to output batches is the same as the
+  /// input batch that was mapped.
+  ///
+  /// Other nodes may introduce order.  For example, an order-by node will emit
+  /// a brand new ordering independent of the input ordering.
+  ///
+  /// Finally, as described above, such as a hash-join or aggregation may may
+  /// destroy ordering (although these nodes could also choose to establish a
+  /// new ordering based on the hash keys).
+  ///
+  /// Some nodes will require an ordering.  For example, a fetch node or an
+  /// asof join node will only function if the input data is ordered (for fetch
+  /// it is enough to be implicitly ordered.  For an asof join the ordering must
+  /// be explicit and compatible with the on key.)
+  ///
+  /// Nodes that maintain ordering should be careful to avoid introducing gaps
+  /// in the batch index.  This may require emitting empty batches in order to
+  /// maintain continuity.
+  virtual const Ordering& ordering() const;
+
+  /// Upstream API:
+  /// These functions are called by input nodes that want to inform this node
+  /// about an updated condition (a new input batch or an impending
+  /// end of stream).
+  ///
+  /// Implementation rules:
+  /// - these may be called anytime after StartProducing() has succeeded
+  ///   (and even during or after StopProducing())
+  /// - these may be called concurrently
+  /// - these are allowed to call back into PauseProducing(), ResumeProducing()
+  ///   and StopProducing()
+
+  /// Transfer input batch to ExecNode
+  ///
+  /// A node will typically perform some kind of operation on the batch
+  /// and then call InputReceived on its outputs with the result.
+  ///
+  /// Other nodes may need to accumulate some number of inputs before any
+  /// output can be produced.  These nodes will add the batch to some kind
+  /// of in-memory accumulation queue and return.
+  virtual Status InputReceived(ExecNode* input, ExecBatch batch) = 0;
+
+  /// Mark the inputs finished after the given number of batches.
+  ///
+  /// This may be called before all inputs are received.  This simply fixes
+  /// the total number of incoming batches for an input, so that the ExecNode
+  /// knows when it has received all input, regardless of order.
+  virtual Status InputFinished(ExecNode* input, int total_batches) = 0;
+
+  /// \brief Perform any needed initialization
+  ///
+  /// This hook performs any actions in between creation of ExecPlan and the call to
+  /// StartProducing. An example could be Bloom filter pushdown. The order of ExecNodes
+  /// that executes this method is undefined, but the calls are made synchronously.
+  ///
+  /// At this point a node can rely on all inputs & outputs (and the input schemas)
+  /// being well defined.
+  virtual Status Init();
+
+  /// Lifecycle API:
+  /// - start / stop to initiate and terminate production
+  /// - pause / resume to apply backpressure
+  ///
+  /// Implementation rules:
+  /// - StartProducing() should not recurse into the inputs, as it is
+  ///   handled by ExecPlan::StartProducing()
+  /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+  ///   concurrently, potentially even before the call to StartProducing
+  ///   has finished.
+  /// - PauseProducing(), ResumeProducing(), StopProducing() may be called
+  ///   by the downstream nodes' InputReceived(), InputFinished() methods
+  ///
+  /// StopProducing may be called due to an error, by the user (e.g. cancel), or
+  /// because a node has all the data it needs (e.g. limit, top-k on sorted data).
+  /// This means the method may be called multiple times and we have the following
+  /// additional rules
+  /// - StopProducing() must be idempotent
+  /// - StopProducing() must be forwarded to inputs (this is needed for the limit/top-k
+  ///     case because we may not be stopping the entire plan)
+
+  // Right now, since synchronous calls happen in both directions (input to
+  // output and then output to input), a node must be careful to be reentrant
+  // against synchronous calls from its output, *and* also concurrent calls from
+  // other threads.  The most reliable solution is to update the internal state
+  // first, and notify outputs only at the end.
+  //
+  // Concurrent calls to PauseProducing and ResumeProducing can be hard to sequence
+  // as they may travel at different speeds through the plan.
+  //
+  // For example, consider a resume that comes quickly after a pause.  If the source
+  // receives the resume before the pause the source may think the destination is full
+  // and halt production which would lead to deadlock.
+  //
+  // To resolve this a counter is sent for all calls to pause/resume.  Only the call with
+  // the highest counter value is valid.  So if a call to PauseProducing(5) comes after
+  // a call to ResumeProducing(6) then the source should continue producing.
+
+  /// \brief Start producing
+  ///
+  /// This must only be called once.
+  ///
+  /// This is typically called automatically by ExecPlan::StartProducing().
+  virtual Status StartProducing() = 0;
+
+  /// \brief Pause producing temporarily
+  ///
+  /// \param output Pointer to the output that is full
+  /// \param counter Counter used to sequence calls to pause/resume
+  ///
+  /// This call is a hint that an output node is currently not willing
+  /// to receive data.
+  ///
+  /// This may be called any number of times.
+  /// However, the node is still free to produce data (which may be difficult
+  /// to prevent anyway if data is produced using multiple threads).
+  virtual void PauseProducing(ExecNode* output, int32_t counter) = 0;
+
+  /// \brief Resume producing after a temporary pause
+  ///
+  /// \param output Pointer to the output that is now free
+  /// \param counter Counter used to sequence calls to pause/resume
+  ///
+  /// This call is a hint that an output node is willing to receive data again.
+  ///
+  /// This may be called any number of times.
+  virtual void ResumeProducing(ExecNode* output, int32_t counter) = 0;
+
+  /// \brief Stop producing new data
+  ///
+  /// If this node is a source then the source should stop generating data
+  /// as quickly as possible.  If this node is not a source then there is typically
+  /// nothing that needs to be done although a node may choose to start ignoring incoming
+  /// data.
+  ///
+  /// This method will be called when an error occurs in the plan
+  /// This method may also be called by the user if they wish to end a plan early
+  /// Finally, this method may be called if a node determines it no longer needs any more
+  /// input (for example, a limit node).
+  ///
+  /// This method may be called multiple times.
+  ///
+  /// This is not a pause.  There will be no way to start the source again after this has
+  /// been called.
+  virtual Status StopProducing();
+
+  std::string ToString(int indent = 0) const;
+
+ protected:
+  ExecNode(ExecPlan* plan, NodeVector inputs, std::vector<std::string> input_labels,
+           std::shared_ptr<Schema> output_schema);
+
+  virtual Status StopProducingImpl() = 0;
+
+  /// Provide extra info to include in the string representation.
+  virtual std::string ToStringExtra(int indent = 0) const;
+
+  std::atomic<bool> stopped_;
+  ExecPlan* plan_;
+  std::string label_;
+
+  NodeVector inputs_;
+  std::vector<std::string> input_labels_;
+
+  std::shared_ptr<Schema> output_schema_;
+  ExecNode* output_ = NULLPTR;
+};
+
+/// \brief An extensible registry for factories of ExecNodes
+class ARROW_ACERO_EXPORT ExecFactoryRegistry {
+ public:
+  using Factory = std::function<Result<ExecNode*>(ExecPlan*, std::vector<ExecNode*>,
+                                                  const ExecNodeOptions&)>;
+
+  virtual ~ExecFactoryRegistry() = default;
+
+  /// \brief Get the named factory from this registry
+  ///
+  /// will raise if factory_name is not found
+  virtual Result<Factory> GetFactory(const std::string& factory_name) = 0;
+
+  /// \brief Add a factory to this registry with the provided name
+  ///
+  /// will raise if factory_name is already in the registry
+  virtual Status AddFactory(std::string factory_name, Factory factory) = 0;
+};
+
+/// The default registry, which includes built-in factories.
+ARROW_ACERO_EXPORT
+ExecFactoryRegistry* default_exec_factory_registry();
+
+/// \brief Construct an ExecNode using the named factory
+inline Result<ExecNode*> MakeExecNode(
+    const std::string& factory_name, ExecPlan* plan, std::vector<ExecNode*> inputs,
+    const ExecNodeOptions& options,
+    ExecFactoryRegistry* registry = default_exec_factory_registry()) {
+  ARROW_ASSIGN_OR_RAISE(auto factory, registry->GetFactory(factory_name));
+  return factory(plan, std::move(inputs), options);
+}
+
+/// @}
+
+/// \addtogroup acero-api
+/// @{
+
+/// \brief Helper class for declaring execution nodes
+///
+/// A Declaration represents an unconstructed ExecNode (and potentially an entire graph
+/// since its inputs may also be Declarations)
+///
+/// A Declaration can be converted to a plan and executed using one of the
+/// DeclarationToXyz methods.
+///
+/// For more direct control, a Declaration can be added to an existing execution
+/// plan with Declaration::AddToPlan, which will recursively construct any inputs as
+/// necessary.
+struct ARROW_ACERO_EXPORT Declaration {
+  using Input = std::variant<ExecNode*, Declaration>;
+
+  Declaration() {}
+
+  /// \brief construct a declaration
+  /// \param factory_name the name of the exec node to construct.  The node must have
+  ///                     been added to the exec node registry with this name.
+  /// \param inputs the inputs to the node, these should be other declarations
+  /// \param options options that control the behavior of the node.  You must use
+  ///                the appropriate subclass.  For example, if `factory_name` is
+  ///                "project" then `options` should be ProjectNodeOptions.
+  /// \param label a label to give the node.  Can be used to distinguish it from other
+  ///              nodes of the same type in the plan.
+  Declaration(std::string factory_name, std::vector<Input> inputs,
+              std::shared_ptr<ExecNodeOptions> options, std::string label)
+      : factory_name{std::move(factory_name)},
+        inputs{std::move(inputs)},
+        options{std::move(options)},
+        label{std::move(label)} {}
+
+  template <typename Options>
+  Declaration(std::string factory_name, std::vector<Input> inputs, Options options,
+              std::string label)
+      : Declaration{std::move(factory_name), std::move(inputs),
+                    std::shared_ptr<ExecNodeOptions>(
+                        std::make_shared<Options>(std::move(options))),
+                    std::move(label)} {}
+
+  template <typename Options>
+  Declaration(std::string factory_name, std::vector<Input> inputs, Options options)
+      : Declaration{std::move(factory_name), std::move(inputs), std::move(options),
+                    /*label=*/""} {}
+
+  template <typename Options>
+  Declaration(std::string factory_name, Options options)
+      : Declaration{std::move(factory_name), {}, std::move(options), /*label=*/""} {}
+
+  template <typename Options>
+  Declaration(std::string factory_name, Options options, std::string label)
+      : Declaration{std::move(factory_name), {}, std::move(options), std::move(label)} {}
+
+  /// \brief Convenience factory for the common case of a simple sequence of nodes.
+  ///
+  /// Each of decls will be appended to the inputs of the subsequent declaration,
+  /// and the final modified declaration will be returned.
+  ///
+  /// Without this convenience factory, constructing a sequence would require explicit,
+  /// difficult-to-read nesting:
+  ///
+  ///     Declaration{"n3",
+  ///                   {
+  ///                       Declaration{"n2",
+  ///                                   {
+  ///                                       Declaration{"n1",
+  ///                                                   {
+  ///                                                       Declaration{"n0", N0Opts{}},
+  ///                                                   },
+  ///                                                   N1Opts{}},
+  ///                                   },
+  ///                                   N2Opts{}},
+  ///                   },
+  ///                   N3Opts{}};
+  ///
+  /// An equivalent Declaration can be constructed more tersely using Sequence:
+  ///
+  ///     Declaration::Sequence({
+  ///         {"n0", N0Opts{}},
+  ///         {"n1", N1Opts{}},
+  ///         {"n2", N2Opts{}},
+  ///         {"n3", N3Opts{}},
+  ///     });
+  static Declaration Sequence(std::vector<Declaration> decls);
+
+  /// \brief add the declaration to an already created execution plan
+  /// \param plan the plan to add the node to
+  /// \param registry the registry to use to lookup the node factory
+  ///
+  /// This method will recursively call AddToPlan on all of the declaration's inputs.
+  /// This method is only for advanced use when the DeclarationToXyz methods are not
+  /// sufficient.
+  ///
+  /// \return the instantiated execution node
+  Result<ExecNode*> AddToPlan(ExecPlan* plan, ExecFactoryRegistry* registry =
+                                                  default_exec_factory_registry()) const;
+
+  // Validate a declaration
+  bool IsValid(ExecFactoryRegistry* registry = default_exec_factory_registry()) const;
+
+  /// \brief the name of the factory to use when creating a node
+  std::string factory_name;
+  /// \brief the declarations's inputs
+  std::vector<Input> inputs;
+  /// \brief options to control the behavior of the node
+  std::shared_ptr<ExecNodeOptions> options;
+  /// \brief a label to give the node in the plan
+  std::string label;
+};
+
+/// \brief How to handle unaligned buffers
+enum class UnalignedBufferHandling { kWarn, kIgnore, kReallocate, kError };
+
+/// \brief get the default behavior of unaligned buffer handling
+///
+/// This is configurable via the ACERO_ALIGNMENT_HANDLING environment variable which
+/// can be set to "warn", "ignore", "reallocate", or "error".  If the environment
+/// variable is not set, or is set to an invalid value, this will return kWarn
+UnalignedBufferHandling GetDefaultUnalignedBufferHandling();
+
+/// \brief plan-wide options that can be specified when executing an execution plan
+struct ARROW_ACERO_EXPORT QueryOptions {
+  /// \brief Should the plan use a legacy batching strategy
+  ///
+  /// This is currently in place only to support the Scanner::ToTable
+  /// method.  This method relies on batch indices from the scanner
+  /// remaining consistent.  This is impractical in the ExecPlan which
+  /// might slice batches as needed (e.g. for a join)
+  ///
+  /// However, it still works for simple plans and this is the only way
+  /// we have at the moment for maintaining implicit order.
+  bool use_legacy_batching = false;
+
+  /// If the output has a meaningful order then sequence the output of the plan
+  ///
+  /// The default behavior (std::nullopt) will sequence output batches if there
+  /// is a meaningful ordering in the final node and will emit batches immediately
+  /// otherwise.
+  ///
+  /// If explicitly set to true then plan execution will fail if there is no
+  /// meaningful ordering.  This can be useful to validate a query that should
+  /// be emitting ordered results.
+  ///
+  /// If explicitly set to false then batches will be emit immediately even if there
+  /// is a meaningful ordering.  This could cause batches to be emit out of order but
+  /// may offer a small decrease to latency.
+  std::optional<bool> sequence_output = std::nullopt;
+
+  /// \brief should the plan use multiple background threads for CPU-intensive work
+  ///
+  /// If this is false then all CPU work will be done on the calling thread.  I/O tasks
+  /// will still happen on the I/O executor and may be multi-threaded (but should not use
+  /// significant CPU resources).
+  ///
+  /// Will be ignored if custom_cpu_executor is set
+  bool use_threads = true;
+
+  /// \brief custom executor to use for CPU-intensive work
+  ///
+  /// Must be null or remain valid for the duration of the plan.  If this is null then
+  /// a default thread pool will be chosen whose behavior will be controlled by
+  /// the `use_threads` option.
+  ::arrow::internal::Executor* custom_cpu_executor = NULLPTR;
+
+  /// \brief custom executor to use for IO work
+  ///
+  /// Must be null or remain valid for the duration of the plan.  If this is null then
+  /// the global io thread pool will be chosen whose behavior will be controlled by
+  /// the "ARROW_IO_THREADS" environment.
+  ::arrow::internal::Executor* custom_io_executor = NULLPTR;
+
+  /// \brief a memory pool to use for allocations
+  ///
+  /// Must remain valid for the duration of the plan.
+  MemoryPool* memory_pool = default_memory_pool();
+
+  /// \brief a function registry to use for the plan
+  ///
+  /// Must remain valid for the duration of the plan.
+  FunctionRegistry* function_registry = GetFunctionRegistry();
+  /// \brief the names of the output columns
+  ///
+  /// If this is empty then names will be generated based on the input columns
+  ///
+  /// If set then the number of names must equal the number of output columns
+  std::vector<std::string> field_names;
+
+  /// \brief Policy for unaligned buffers in source data
+  ///
+  /// Various compute functions and acero internals will type pun array
+  /// buffers from uint8_t* to some kind of value type (e.g. we might
+  /// cast to int32_t* to add two int32 arrays)
+  ///
+  /// If the buffer is poorly aligned (e.g. an int32 array is not aligned
+  /// on a 4-byte boundary) then this is technically undefined behavior in C++.
+  /// However, most modern compilers and CPUs are fairly tolerant of this
+  /// behavior and nothing bad (beyond a small hit to performance) is likely
+  /// to happen.
+  ///
+  /// Note that this only applies to source buffers.  All buffers allocated internally
+  /// by Acero will be suitably aligned.
+  ///
+  /// If this field is set to kWarn then Acero will check if any buffers are unaligned
+  /// and, if they are, will emit a warning.
+  ///
+  /// If this field is set to kReallocate then Acero will allocate a new, suitably aligned
+  /// buffer and copy the contents from the old buffer into this new buffer.
+  ///
+  /// If this field is set to kError then Acero will gracefully abort the plan instead.
+  ///
+  /// If this field is set to kIgnore then Acero will not even check if the buffers are
+  /// unaligned.
+  ///
+  /// If this field is not set then it will be treated as kWarn unless overridden
+  /// by the ACERO_ALIGNMENT_HANDLING environment variable
+  std::optional<UnalignedBufferHandling> unaligned_buffer_handling;
+};
+
+/// \brief Calculate the output schema of a declaration
+///
+/// This does not actually execute the plan.  This operation may fail if the
+/// declaration represents an invalid plan (e.g. a project node with multiple inputs)
+///
+/// \param declaration A declaration describing an execution plan
+/// \param function_registry The function registry to use for function execution.  If null
+///                          then the default function registry will be used.
+///
+/// \return the schema that batches would have after going through the execution plan
+ARROW_ACERO_EXPORT Result<std::shared_ptr<Schema>> DeclarationToSchema(
+    const Declaration& declaration, FunctionRegistry* function_registry = NULLPTR);
+
+/// \brief Create a string representation of a plan
+///
+/// This representation is for debug purposes only.
+///
+/// Conversion to a string may fail if the declaration represents an
+/// invalid plan.
+///
+/// Use Substrait for complete serialization of plans
+///
+/// \param declaration A declaration describing an execution plan
+/// \param function_registry The function registry to use for function execution.  If null
+///                          then the default function registry will be used.
+///
+/// \return a string representation of the plan suitable for debugging output
+ARROW_ACERO_EXPORT Result<std::string> DeclarationToString(
+    const Declaration& declaration, FunctionRegistry* function_registry = NULLPTR);
+
+/// \brief Utility method to run a declaration and collect the results into a table
+///
+/// \param declaration A declaration describing the plan to run
+/// \param use_threads If `use_threads` is false then all CPU work will be done on the
+///                    calling thread.  I/O tasks will still happen on the I/O executor
+///                    and may be multi-threaded (but should not use significant CPU
+///                    resources).
+/// \param memory_pool The memory pool to use for allocations made while running the plan.
+/// \param function_registry The function registry to use for function execution.  If null
+///                          then the default function registry will be used.
+///
+/// This method will add a sink node to the declaration to collect results into a
+/// table.  It will then create an ExecPlan from the declaration, start the exec plan,
+/// block until the plan has finished, and return the created table.
+ARROW_ACERO_EXPORT Result<std::shared_ptr<Table>> DeclarationToTable(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+ARROW_ACERO_EXPORT Result<std::shared_ptr<Table>> DeclarationToTable(
+    Declaration declaration, QueryOptions query_options);
+
+/// \brief Asynchronous version of \see DeclarationToTable
+///
+/// \param declaration A declaration describing the plan to run
+/// \param use_threads The behavior of use_threads is slightly different than the
+///                    synchronous version since we cannot run synchronously on the
+///                    calling thread. Instead, if use_threads=false then a new thread
+///                    pool will be created with a single thread and this will be used for
+///                    all compute work.
+/// \param memory_pool The memory pool to use for allocations made while running the plan.
+/// \param function_registry The function registry to use for function execution. If null
+///                          then the default function registry will be used.
+ARROW_ACERO_EXPORT Future<std::shared_ptr<Table>> DeclarationToTableAsync(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+/// \brief Overload of \see DeclarationToTableAsync accepting a custom exec context
+///
+/// The executor must be specified (cannot be null) and must be kept alive until the
+/// returned future finishes.
+ARROW_ACERO_EXPORT Future<std::shared_ptr<Table>> DeclarationToTableAsync(
+    Declaration declaration, ExecContext custom_exec_context);
+
+/// \brief a collection of exec batches with a common schema
+struct BatchesWithCommonSchema {
+  std::vector<ExecBatch> batches;
+  std::shared_ptr<Schema> schema;
+};
+
+/// \brief Utility method to run a declaration and collect the results into ExecBatch
+/// vector
+///
+/// \see DeclarationToTable for details on threading & execution
+ARROW_ACERO_EXPORT Result<BatchesWithCommonSchema> DeclarationToExecBatches(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+ARROW_ACERO_EXPORT Result<BatchesWithCommonSchema> DeclarationToExecBatches(
+    Declaration declaration, QueryOptions query_options);
+
+/// \brief Asynchronous version of \see DeclarationToExecBatches
+///
+/// \see DeclarationToTableAsync for details on threading & execution
+ARROW_ACERO_EXPORT Future<BatchesWithCommonSchema> DeclarationToExecBatchesAsync(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+/// \brief Overload of \see DeclarationToExecBatchesAsync accepting a custom exec context
+///
+/// \see DeclarationToTableAsync for details on threading & execution
+ARROW_ACERO_EXPORT Future<BatchesWithCommonSchema> DeclarationToExecBatchesAsync(
+    Declaration declaration, ExecContext custom_exec_context);
+
+/// \brief Utility method to run a declaration and collect the results into a vector
+///
+/// \see DeclarationToTable for details on threading & execution
+ARROW_ACERO_EXPORT Result<std::vector<std::shared_ptr<RecordBatch>>> DeclarationToBatches(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+ARROW_ACERO_EXPORT Result<std::vector<std::shared_ptr<RecordBatch>>> DeclarationToBatches(
+    Declaration declaration, QueryOptions query_options);
+
+/// \brief Asynchronous version of \see DeclarationToBatches
+///
+/// \see DeclarationToTableAsync for details on threading & execution
+ARROW_ACERO_EXPORT Future<std::vector<std::shared_ptr<RecordBatch>>>
+DeclarationToBatchesAsync(Declaration declaration, bool use_threads = true,
+                          MemoryPool* memory_pool = default_memory_pool(),
+                          FunctionRegistry* function_registry = NULLPTR);
+
+/// \brief Overload of \see DeclarationToBatchesAsync accepting a custom exec context
+///
+/// \see DeclarationToTableAsync for details on threading & execution
+ARROW_ACERO_EXPORT Future<std::vector<std::shared_ptr<RecordBatch>>>
+DeclarationToBatchesAsync(Declaration declaration, ExecContext exec_context);
+
+/// \brief Utility method to run a declaration and return results as a RecordBatchReader
+///
+/// If an exec context is not provided then a default exec context will be used based
+/// on the value of `use_threads`.  If `use_threads` is false then the CPU executor will
+/// be a serial executor and all CPU work will be done on the calling thread.  I/O tasks
+/// will still happen on the I/O executor and may be multi-threaded.
+///
+/// If `use_threads` is false then all CPU work will happen during the calls to
+/// RecordBatchReader::Next and no CPU work will happen in the background.  If
+/// `use_threads` is true then CPU work will happen on the CPU thread pool and tasks may
+/// run in between calls to RecordBatchReader::Next.  If the returned reader is not
+/// consumed quickly enough then the plan will eventually pause as the backpressure queue
+/// fills up.
+///
+/// If a custom exec context is provided then the value of `use_threads` will be ignored.
+///
+/// The returned RecordBatchReader can be closed early to cancel the computation of record
+/// batches. In this case, only errors encountered by the computation may be reported. In
+/// particular, no cancellation error may be reported.
+ARROW_ACERO_EXPORT Result<std::unique_ptr<RecordBatchReader>> DeclarationToReader(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+ARROW_ACERO_EXPORT Result<std::unique_ptr<RecordBatchReader>> DeclarationToReader(
+    Declaration declaration, QueryOptions query_options);
+
+/// \brief Utility method to run a declaration and ignore results
+///
+/// This can be useful when the data are consumed as part of the plan itself, for
+/// example, when the plan ends with a write node.
+///
+/// \see DeclarationToTable for details on threading & execution
+ARROW_ACERO_EXPORT Status
+DeclarationToStatus(Declaration declaration, bool use_threads = true,
+                    MemoryPool* memory_pool = default_memory_pool(),
+                    FunctionRegistry* function_registry = NULLPTR);
+
+ARROW_ACERO_EXPORT Status DeclarationToStatus(Declaration declaration,
+                                              QueryOptions query_options);
+
+/// \brief Asynchronous version of \see DeclarationToStatus
+///
+/// This can be useful when the data are consumed as part of the plan itself, for
+/// example, when the plan ends with a write node.
+///
+/// \see DeclarationToTableAsync for details on threading & execution
+ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync(
+    Declaration declaration, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool(),
+    FunctionRegistry* function_registry = NULLPTR);
+
+/// \brief Overload of \see DeclarationToStatusAsync accepting a custom exec context
+///
+/// \see DeclarationToTableAsync for details on threading & execution
+ARROW_ACERO_EXPORT Future<> DeclarationToStatusAsync(Declaration declaration,
+                                                     ExecContext exec_context);
+
+/// @}
+
+/// \brief Wrap an ExecBatch generator in a RecordBatchReader.
+///
+/// The RecordBatchReader does not impose any ordering on emitted batches.
+ARROW_ACERO_EXPORT
+std::shared_ptr<RecordBatchReader> MakeGeneratorReader(
+    std::shared_ptr<Schema>, std::function<Future<std::optional<ExecBatch>>()>,
+    MemoryPool*);
+
+constexpr int kDefaultBackgroundMaxQ = 32;
+constexpr int kDefaultBackgroundQRestart = 16;
+
+/// \brief Make a generator of RecordBatchReaders
+///
+/// Useful as a source node for an Exec plan
+ARROW_ACERO_EXPORT
+Result<std::function<Future<std::optional<ExecBatch>>()>> MakeReaderGenerator(
+    std::shared_ptr<RecordBatchReader> reader, arrow::internal::Executor* io_executor,
+    int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart);
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/hash_join.h b/pyarrow/include/arrow/acero/hash_join.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0faacf04baf02e865a61a0301a0cfa92b3fab1b
--- /dev/null
+++ b/pyarrow/include/arrow/acero/hash_join.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/acero/accumulation_queue.h"
+#include "arrow/acero/bloom_filter.h"
+#include "arrow/acero/options.h"
+#include "arrow/acero/query_context.h"
+#include "arrow/acero/schema_util.h"
+#include "arrow/acero/task_util.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/tracing.h"
+
+namespace arrow {
+namespace acero {
+
+using util::AccumulationQueue;
+
+class ARROW_ACERO_EXPORT HashJoinImpl {
+ public:
+  using OutputBatchCallback = std::function<Status(int64_t, ExecBatch)>;
+  using BuildFinishedCallback = std::function<Status(size_t)>;
+  using FinishedCallback = std::function<Status(int64_t)>;
+  using RegisterTaskGroupCallback = std::function<int(
+      std::function<Status(size_t, int64_t)>, std::function<Status(size_t)>)>;
+  using StartTaskGroupCallback = std::function<Status(int, int64_t)>;
+  using AbortContinuationImpl = std::function<void()>;
+
+  virtual ~HashJoinImpl() = default;
+  virtual Status Init(QueryContext* ctx, JoinType join_type, size_t num_threads,
+                      const HashJoinProjectionMaps* proj_map_left,
+                      const HashJoinProjectionMaps* proj_map_right,
+                      std::vector<JoinKeyCmp> key_cmp, Expression filter,
+                      RegisterTaskGroupCallback register_task_group_callback,
+                      StartTaskGroupCallback start_task_group_callback,
+                      OutputBatchCallback output_batch_callback,
+                      FinishedCallback finished_callback) = 0;
+
+  virtual Status BuildHashTable(size_t thread_index, AccumulationQueue batches,
+                                BuildFinishedCallback on_finished) = 0;
+  virtual Status ProbeSingleBatch(size_t thread_index, ExecBatch batch) = 0;
+  virtual Status ProbingFinished(size_t thread_index) = 0;
+  virtual void Abort(TaskScheduler::AbortContinuationImpl pos_abort_callback) = 0;
+  virtual std::string ToString() const = 0;
+
+  static Result<std::unique_ptr<HashJoinImpl>> MakeBasic();
+  static Result<std::unique_ptr<HashJoinImpl>> MakeSwiss();
+
+ protected:
+  arrow::util::tracing::Span span_;
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/hash_join_dict.h b/pyarrow/include/arrow/acero/hash_join_dict.h
new file mode 100644
index 0000000000000000000000000000000000000000..02454a7146278176e27379e6033f79547574a367
--- /dev/null
+++ b/pyarrow/include/arrow/acero/hash_join_dict.h
@@ -0,0 +1,318 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#include "arrow/acero/schema_util.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/row/row_encoder_internal.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+
+// This file contains hash join logic related to handling of dictionary encoded key
+// columns.
+//
+// A key column from probe side of the join can be matched against a key column from build
+// side of the join, as long as the underlying value types are equal. That means that:
+// - both scalars and arrays can be used and even mixed in the same column
+// - dictionary column can be matched against non-dictionary column if underlying value
+// types are equal
+// - dictionary column can be matched against dictionary column with a different index
+// type, and potentially using a different dictionary, if underlying value types are equal
+//
+// We currently require in hash join that for all dictionary encoded columns, the same
+// dictionary is used in all input exec batches.
+//
+// In order to allow matching columns with different dictionaries, different dictionary
+// index types, and dictionary key against non-dictionary key, internally comparisons will
+// be evaluated after remapping values on both sides of the join to a common
+// representation (which will be called "unified representation"). This common
+// representation is a column of int32() type (not a dictionary column). It represents an
+// index in the unified dictionary computed for the (only) dictionary present on build
+// side (an empty dictionary is still created for an empty build side). Null value is
+// always represented in this common representation as null int32 value, unified
+// dictionary will never contain a null value (so there is no ambiguity of representing
+// nulls as either index to a null entry in the dictionary or null index).
+//
+// Unified dictionary represents values present on build side. There may be values on
+// probe side that are not present in it. All such values, that are not null, are mapped
+// in the common representation to a special constant kMissingValueId.
+//
+
+namespace arrow {
+
+using compute::ExecBatch;
+using compute::ExecContext;
+using compute::internal::RowEncoder;
+
+namespace acero {
+
+/// Helper class with operations that are stateless and common to processing of dictionary
+/// keys on both build and probe side.
+class HashJoinDictUtil {
+ public:
+  // Null values in unified representation are always represented as null that has
+  // corresponding integer set to this constant
+  static constexpr int32_t kNullId = 0;
+  // Constant representing a value, that is not null, missing on the build side, in
+  // unified representation.
+  static constexpr int32_t kMissingValueId = -1;
+
+  // Check if data types of corresponding pair of key column on build and probe side are
+  // compatible
+  static bool KeyDataTypesValid(const std::shared_ptr<DataType>& probe_data_type,
+                                const std::shared_ptr<DataType>& build_data_type);
+
+  // Input must be dictionary array or dictionary scalar.
+  // A precomputed and provided here lookup table in the form of int32() array will be
+  // used to remap input indices to unified representation.
+  //
+  static Result<std::shared_ptr<ArrayData>> IndexRemapUsingLUT(
+      ExecContext* ctx, const Datum& indices, int64_t batch_length,
+      const std::shared_ptr<ArrayData>& map_array,
+      const std::shared_ptr<DataType>& data_type);
+
+  // Return int32() array that contains indices of input dictionary array or scalar after
+  // type casting.
+  static Result<std::shared_ptr<ArrayData>> ConvertToInt32(
+      const std::shared_ptr<DataType>& from_type, const Datum& input,
+      int64_t batch_length, ExecContext* ctx);
+
+  // Return an array that contains elements of input int32() array after casting to a
+  // given integer type. This is used for mapping unified representation stored in the
+  // hash table on build side back to original input data type of hash join, when
+  // outputting hash join results to parent exec node.
+  //
+  static Result<std::shared_ptr<ArrayData>> ConvertFromInt32(
+      const std::shared_ptr<DataType>& to_type, const Datum& input, int64_t batch_length,
+      ExecContext* ctx);
+
+  // Return dictionary referenced in either dictionary array or dictionary scalar
+  static std::shared_ptr<Array> ExtractDictionary(const Datum& data);
+};
+
+/// Implements processing of dictionary arrays/scalars in key columns on the build side of
+/// a hash join.
+/// Each instance of this class corresponds to a single column and stores and
+/// processes only the information related to that column.
+/// Const methods are thread-safe, non-const methods are not (the caller must make sure
+/// that only one thread at any time will access them).
+///
+class HashJoinDictBuild {
+ public:
+  // Returns true if the key column (described in input by its data type) requires any
+  // pre- or post-processing related to handling dictionaries.
+  //
+  static bool KeyNeedsProcessing(const std::shared_ptr<DataType>& build_data_type) {
+    return (build_data_type->id() == Type::DICTIONARY);
+  }
+
+  // Data type of unified representation
+  static std::shared_ptr<DataType> DataTypeAfterRemapping() { return int32(); }
+
+  // Should be called only once in hash join, before processing any build or probe
+  // batches.
+  //
+  // Takes a pointer to the dictionary for a corresponding key column on the build side as
+  // an input. If the build side is empty, it still needs to be called, but with
+  // dictionary pointer set to null.
+  //
+  // Currently it is required that all input batches on build side share the same
+  // dictionary. For each input batch during its pre-processing, dictionary will be
+  // checked and error will be returned if it is different then the one provided in the
+  // call to this method.
+  //
+  // Unifies the dictionary. The order of the values is still preserved.
+  // Null and duplicate entries are removed. If the dictionary is already unified, its
+  // copy will be produced and stored within this class.
+  //
+  // Prepares the mapping from ids within original dictionary to the ids in the resulting
+  // dictionary. This is used later on to pre-process (map to unified representation) key
+  // column on build side.
+  //
+  // Prepares the reverse mapping (in the form of hash table) from values to the ids in
+  // the resulting dictionary. This will be used later on to pre-process (map to unified
+  // representation) key column on probe side. Values on probe side that are not present
+  // in the original dictionary will be mapped to a special constant kMissingValueId. The
+  // exception is made for nulls, which get always mapped to nulls (both when null is
+  // represented as a dictionary id pointing to a null and a null dictionary id).
+  //
+  Status Init(ExecContext* ctx, std::shared_ptr<Array> dictionary,
+              std::shared_ptr<DataType> index_type, std::shared_ptr<DataType> value_type);
+
+  // Remap array or scalar values into unified representation (array of int32()).
+  // Outputs kMissingValueId if input value is not found in the unified dictionary.
+  // Outputs null for null input value (with corresponding data set to kNullId).
+  //
+  Result<std::shared_ptr<ArrayData>> RemapInputValues(ExecContext* ctx,
+                                                      const Datum& values,
+                                                      int64_t batch_length) const;
+
+  // Remap dictionary array or dictionary scalar on build side to unified representation.
+  // Dictionary referenced in the input must match the dictionary that was
+  // given during initialization.
+  // The output is a dictionary array that references unified dictionary.
+  //
+  Result<std::shared_ptr<ArrayData>> RemapInput(
+      ExecContext* ctx, const Datum& indices, int64_t batch_length,
+      const std::shared_ptr<DataType>& data_type) const;
+
+  // Outputs dictionary array referencing unified dictionary, given an array with 32-bit
+  // ids.
+  // Used to post-process values looked up in a hash table on build side of the hash join
+  // before outputting to the parent exec node.
+  //
+  Result<std::shared_ptr<ArrayData>> RemapOutput(const ArrayData& indices32Bit,
+                                                 ExecContext* ctx) const;
+
+  // Release shared pointers and memory
+  void CleanUp();
+
+ private:
+  // Data type of dictionary ids for the input dictionary on build side
+  std::shared_ptr<DataType> index_type_;
+  // Data type of values for the input dictionary on build side
+  std::shared_ptr<DataType> value_type_;
+  // Mapping from (encoded as string) values to the ids in unified dictionary
+  std::unordered_map<std::string, int32_t> hash_table_;
+  // Mapping from input dictionary ids to unified dictionary ids
+  std::shared_ptr<ArrayData> remapped_ids_;
+  // Input dictionary
+  std::shared_ptr<Array> dictionary_;
+  // Unified dictionary
+  std::shared_ptr<ArrayData> unified_dictionary_;
+};
+
+/// Implements processing of dictionary arrays/scalars in key columns on the probe side of
+/// a hash join.
+/// Each instance of this class corresponds to a single column and stores and
+/// processes only the information related to that column.
+/// It is not thread-safe - every participating thread should use its own instance of
+/// this class.
+///
+class HashJoinDictProbe {
+ public:
+  static bool KeyNeedsProcessing(const std::shared_ptr<DataType>& probe_data_type,
+                                 const std::shared_ptr<DataType>& build_data_type);
+
+  // Data type of the result of remapping input key column.
+  //
+  // The result of remapping is what is used in hash join for matching keys on build and
+  // probe side. The exact data types may be different, as described below, and therefore
+  // a common representation is needed for simplifying comparisons of pairs of keys on
+  // both sides.
+  //
+  // We support matching key that is of non-dictionary type with key that is of dictionary
+  // type, as long as the underlying value types are equal. We support matching when both
+  // keys are of dictionary type, regardless whether underlying dictionary index types are
+  // the same or not.
+  //
+  static std::shared_ptr<DataType> DataTypeAfterRemapping(
+      const std::shared_ptr<DataType>& build_data_type);
+
+  // Should only be called if KeyNeedsProcessing method returns true for a pair of
+  // corresponding key columns from build and probe side.
+  // Converts values in order to match the common representation for
+  // both build and probe side used in hash table comparison.
+  // Supports arrays and scalars as input.
+  // Argument opt_build_side should be null if dictionary key on probe side is matched
+  // with non-dictionary key on build side.
+  //
+  Result<std::shared_ptr<ArrayData>> RemapInput(
+      const HashJoinDictBuild* opt_build_side, const Datum& data, int64_t batch_length,
+      const std::shared_ptr<DataType>& probe_data_type,
+      const std::shared_ptr<DataType>& build_data_type, ExecContext* ctx);
+
+  void CleanUp();
+
+ private:
+  // May be null if probe side key is non-dictionary. Otherwise it is used to verify that
+  // only a single dictionary is referenced in exec batch on probe side of hash join.
+  std::shared_ptr<Array> dictionary_;
+  // Mapping from dictionary on probe side of hash join (if it is used) to unified
+  // representation.
+  std::shared_ptr<ArrayData> remapped_ids_;
+  // Encoder of key columns that uses unified representation instead of original data type
+  // for key columns that need to use it (have dictionaries on either side of the join).
+  RowEncoder encoder_;
+};
+
+// Encapsulates dictionary handling logic for build side of hash join.
+//
+class HashJoinDictBuildMulti {
+ public:
+  Status Init(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+              const ExecBatch* opt_non_empty_batch, ExecContext* ctx);
+  static void InitEncoder(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+                          RowEncoder* encoder, ExecContext* ctx);
+  Status EncodeBatch(size_t thread_index,
+                     const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+                     const ExecBatch& batch, RowEncoder* encoder, ExecContext* ctx) const;
+  Status PostDecode(const SchemaProjectionMaps<HashJoinProjection>& proj_map,
+                    ExecBatch* decoded_key_batch, ExecContext* ctx);
+  const HashJoinDictBuild& get_dict_build(int icol) const { return remap_imp_[icol]; }
+
+ private:
+  std::vector<bool> needs_remap_;
+  std::vector<HashJoinDictBuild> remap_imp_;
+};
+
+// Encapsulates dictionary handling logic for probe side of hash join
+//
+class HashJoinDictProbeMulti {
+ public:
+  void Init(size_t num_threads);
+  bool BatchRemapNeeded(size_t thread_index,
+                        const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+                        const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+                        ExecContext* ctx);
+  Status EncodeBatch(size_t thread_index,
+                     const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+                     const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+                     const HashJoinDictBuildMulti& dict_build, const ExecBatch& batch,
+                     RowEncoder** out_encoder, ExecBatch* opt_out_key_batch,
+                     ExecContext* ctx);
+
+ private:
+  void InitLocalStateIfNeeded(
+      size_t thread_index, const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+      const SchemaProjectionMaps<HashJoinProjection>& proj_map_build, ExecContext* ctx);
+  static void InitEncoder(const SchemaProjectionMaps<HashJoinProjection>& proj_map_probe,
+                          const SchemaProjectionMaps<HashJoinProjection>& proj_map_build,
+                          RowEncoder* encoder, ExecContext* ctx);
+  struct ThreadLocalState {
+    bool is_initialized;
+    // Whether any key column needs remapping (because of dictionaries used) before doing
+    // join hash table lookups
+    bool any_needs_remap;
+    // Whether each key column needs remapping before doing join hash table lookups
+    std::vector<bool> needs_remap;
+    std::vector<HashJoinDictProbe> remap_imp;
+    // Encoder of key columns that uses unified representation instead of original data
+    // type for key columns that need to use it (have dictionaries on either side of the
+    // join).
+    RowEncoder post_remap_encoder;
+  };
+  std::vector<ThreadLocalState> local_states_;
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/hash_join_node.h b/pyarrow/include/arrow/acero/hash_join_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..19745b8675cf0c63ed92c6e5448c9e6a68467f59
--- /dev/null
+++ b/pyarrow/include/arrow/acero/hash_join_node.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <vector>
+
+#include "arrow/acero/options.h"
+#include "arrow/acero/schema_util.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+
+using compute::ExecContext;
+
+namespace acero {
+
+class ARROW_ACERO_EXPORT HashJoinSchema {
+ public:
+  Status Init(JoinType join_type, const Schema& left_schema,
+              const std::vector<FieldRef>& left_keys, const Schema& right_schema,
+              const std::vector<FieldRef>& right_keys, const Expression& filter,
+              const std::string& left_field_name_prefix,
+              const std::string& right_field_name_prefix);
+
+  Status Init(JoinType join_type, const Schema& left_schema,
+              const std::vector<FieldRef>& left_keys,
+              const std::vector<FieldRef>& left_output, const Schema& right_schema,
+              const std::vector<FieldRef>& right_keys,
+              const std::vector<FieldRef>& right_output, const Expression& filter,
+              const std::string& left_field_name_prefix,
+              const std::string& right_field_name_prefix);
+
+  static Status ValidateSchemas(JoinType join_type, const Schema& left_schema,
+                                const std::vector<FieldRef>& left_keys,
+                                const std::vector<FieldRef>& left_output,
+                                const Schema& right_schema,
+                                const std::vector<FieldRef>& right_keys,
+                                const std::vector<FieldRef>& right_output,
+                                const std::string& left_field_name_prefix,
+                                const std::string& right_field_name_prefix);
+
+  bool HasDictionaries() const;
+
+  bool HasLargeBinary() const;
+
+  Result<Expression> BindFilter(Expression filter, const Schema& left_schema,
+                                const Schema& right_schema, ExecContext* exec_context);
+  std::shared_ptr<Schema> MakeOutputSchema(const std::string& left_field_name_suffix,
+                                           const std::string& right_field_name_suffix);
+
+  bool LeftPayloadIsEmpty() const { return PayloadIsEmpty(0); }
+
+  bool RightPayloadIsEmpty() const { return PayloadIsEmpty(1); }
+
+  static int kMissingField() {
+    return SchemaProjectionMaps<HashJoinProjection>::kMissingField;
+  }
+
+  SchemaProjectionMaps<HashJoinProjection> proj_maps[2];
+
+ private:
+  static bool IsTypeSupported(const DataType& type);
+
+  Status CollectFilterColumns(std::vector<FieldRef>& left_filter,
+                              std::vector<FieldRef>& right_filter,
+                              const Expression& filter, const Schema& left_schema,
+                              const Schema& right_schema);
+
+  Expression RewriteFilterToUseFilterSchema(int right_filter_offset,
+                                            const SchemaProjectionMap& left_to_filter,
+                                            const SchemaProjectionMap& right_to_filter,
+                                            const Expression& filter);
+
+  bool PayloadIsEmpty(int side) const {
+    assert(side == 0 || side == 1);
+    return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0;
+  }
+
+  static Result<std::vector<FieldRef>> ComputePayload(const Schema& schema,
+                                                      const std::vector<FieldRef>& output,
+                                                      const std::vector<FieldRef>& filter,
+                                                      const std::vector<FieldRef>& key);
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/map_node.h b/pyarrow/include/arrow/acero/map_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bdd0ab2ca3854c6561aa3735ae143e7c58b4f77
--- /dev/null
+++ b/pyarrow/include/arrow/acero/map_node.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/acero/util.h"
+#include "arrow/acero/visibility.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/type_fwd.h"
+
+namespace arrow {
+namespace acero {
+
+/// A utility base class for simple exec nodes with one input
+///
+/// Pause/Resume Producing are forwarded appropriately
+/// There is nothing to do in StopProducingImpl
+///
+/// An AtomicCounter is used to keep track of when all data has arrived.  When it
+/// has the Finish() method will be invoked
+class ARROW_ACERO_EXPORT MapNode : public ExecNode, public TracedNode {
+ public:
+  MapNode(ExecPlan* plan, std::vector<ExecNode*> inputs,
+          std::shared_ptr<Schema> output_schema);
+
+  Status InputFinished(ExecNode* input, int total_batches) override;
+
+  Status StartProducing() override;
+
+  void PauseProducing(ExecNode* output, int32_t counter) override;
+
+  void ResumeProducing(ExecNode* output, int32_t counter) override;
+
+  Status InputReceived(ExecNode* input, ExecBatch batch) override;
+
+  const Ordering& ordering() const override;
+
+ protected:
+  Status StopProducingImpl() override;
+
+  /// Transform a batch
+  ///
+  /// The output batch will have the same guarantee as the input batch
+  /// If this was the last batch this call may trigger Finish()
+  virtual Result<ExecBatch> ProcessBatch(ExecBatch batch) = 0;
+
+  /// Function called after all data has been received
+  ///
+  /// By default this does nothing.  Override this to provide a custom implementation.
+  virtual void Finish();
+
+ protected:
+  // Counter for the number of batches received
+  AtomicCounter input_counter_;
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/options.h b/pyarrow/include/arrow/acero/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..827e9ea775d7b8e892d05f9b81a79ec25991cc3c
--- /dev/null
+++ b/pyarrow/include/arrow/acero/options.h
@@ -0,0 +1,874 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "arrow/acero/type_fwd.h"
+#include "arrow/acero/visibility.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/expression.h"
+#include "arrow/result.h"
+#include "arrow/util/future.h"
+
+namespace arrow {
+
+using compute::Aggregate;
+using compute::ExecBatch;
+using compute::Expression;
+using compute::literal;
+using compute::Ordering;
+using compute::SelectKOptions;
+using compute::SortOptions;
+
+namespace internal {
+
+class Executor;
+
+}  // namespace internal
+
+namespace acero {
+
+/// \brief This must not be used in release-mode
+struct DebugOptions;
+
+using AsyncExecBatchGenerator = std::function<Future<std::optional<ExecBatch>>()>;
+
+/// \addtogroup acero-nodes
+/// @{
+
+/// \brief A base class for all options objects
+///
+/// The only time this is used directly is when a node has no configuration
+class ARROW_ACERO_EXPORT ExecNodeOptions {
+ public:
+  virtual ~ExecNodeOptions() = default;
+
+  /// \brief This must not be used in release-mode
+  std::shared_ptr<DebugOptions> debug_opts;
+};
+
+/// \brief A node representing a generic source of data for Acero
+///
+/// The source node will start calling `generator` during StartProducing.  An initial
+/// task will be created that will call `generator`.  It will not call `generator`
+/// reentrantly.  If the source can be read in parallel then those details should be
+/// encapsulated within `generator`.
+///
+/// For each batch received a new task will be created to push that batch downstream.
+/// This task will slice smaller units of size `ExecPlan::kMaxBatchSize` from the
+/// parent batch and call InputReceived.  Thus, if the `generator` yields a large
+/// batch it may result in several calls to InputReceived.
+///
+/// The SourceNode will, by default, assign an implicit ordering to outgoing batches.
+/// This is valid as long as the generator generates batches in a deterministic fashion.
+/// Currently, the only way to override this is to subclass the SourceNode.
+///
+/// This node is not generally used directly but can serve as the basis for various
+/// specialized nodes.
+class ARROW_ACERO_EXPORT SourceNodeOptions : public ExecNodeOptions {
+ public:
+  /// Create an instance from values
+  SourceNodeOptions(std::shared_ptr<Schema> output_schema,
+                    std::function<Future<std::optional<ExecBatch>>()> generator,
+                    Ordering ordering = Ordering::Unordered())
+      : output_schema(std::move(output_schema)),
+        generator(std::move(generator)),
+        ordering(std::move(ordering)) {}
+
+  /// \brief the schema for batches that will be generated by this source
+  std::shared_ptr<Schema> output_schema;
+  /// \brief an asynchronous stream of batches ending with std::nullopt
+  std::function<Future<std::optional<ExecBatch>>()> generator;
+  /// \brief the order of the data, defaults to Ordering::Unordered
+  Ordering ordering;
+};
+
+/// \brief a node that generates data from a table already loaded in memory
+///
+/// The table source node will slice off chunks, defined by `max_batch_size`
+/// for parallel processing.  The table source node extends source node and so these
+/// chunks will be iteratively processed in small batches.  \see SourceNodeOptions
+/// for details.
+class ARROW_ACERO_EXPORT TableSourceNodeOptions : public ExecNodeOptions {
+ public:
+  static constexpr int64_t kDefaultMaxBatchSize = 1 << 20;
+
+  /// Create an instance from values
+  TableSourceNodeOptions(std::shared_ptr<Table> table,
+                         int64_t max_batch_size = kDefaultMaxBatchSize)
+      : table(std::move(table)), max_batch_size(max_batch_size) {}
+
+  /// \brief a table which acts as the data source
+  std::shared_ptr<Table> table;
+  /// \brief size of batches to emit from this node
+  /// If the table is larger the node will emit multiple batches from the
+  /// the table to be processed in parallel.
+  int64_t max_batch_size;
+};
+
+/// \brief define a lazily resolved Arrow table.
+///
+/// The table uniquely identified by the names can typically be resolved at the time when
+/// the plan is to be consumed.
+///
+/// This node is for serialization purposes only and can never be executed.
+class ARROW_ACERO_EXPORT NamedTableNodeOptions : public ExecNodeOptions {
+ public:
+  /// Create an instance from values
+  NamedTableNodeOptions(std::vector<std::string> names, std::shared_ptr<Schema> schema)
+      : names(std::move(names)), schema(std::move(schema)) {}
+
+  /// \brief the names to put in the serialized plan
+  std::vector<std::string> names;
+  /// \brief the output schema of the table
+  std::shared_ptr<Schema> schema;
+};
+
+/// \brief a source node which feeds data from a synchronous iterator of batches
+///
+/// ItMaker is a maker of an iterator of tabular data.
+///
+/// The node can be configured to use an I/O executor.  If set then each time the
+/// iterator is polled a new I/O thread task will be created to do the polling.  This
+/// allows a blocking iterator to stay off the CPU thread pool.
+template <typename ItMaker>
+class ARROW_ACERO_EXPORT SchemaSourceNodeOptions : public ExecNodeOptions {
+ public:
+  /// Create an instance that will create a new task on io_executor for each iteration
+  SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker,
+                          arrow::internal::Executor* io_executor)
+      : schema(std::move(schema)),
+        it_maker(std::move(it_maker)),
+        io_executor(io_executor),
+        requires_io(true) {}
+
+  /// Create an instance that will either iterate synchronously or use the default I/O
+  /// executor
+  SchemaSourceNodeOptions(std::shared_ptr<Schema> schema, ItMaker it_maker,
+                          bool requires_io = false)
+      : schema(std::move(schema)),
+        it_maker(std::move(it_maker)),
+        io_executor(NULLPTR),
+        requires_io(requires_io) {}
+
+  /// \brief The schema of the record batches from the iterator
+  std::shared_ptr<Schema> schema;
+
+  /// \brief A maker of an iterator which acts as the data source
+  ItMaker it_maker;
+
+  /// \brief The executor to use for scanning the iterator
+  ///
+  /// Defaults to the default I/O executor.  Only used if requires_io is true.
+  /// If requires_io is false then this MUST be nullptr.
+  arrow::internal::Executor* io_executor;
+
+  /// \brief If true then items will be fetched from the iterator on a dedicated I/O
+  ///        thread to keep I/O off the CPU thread
+  bool requires_io;
+};
+
+/// a source node that reads from a RecordBatchReader
+///
+/// Each iteration of the RecordBatchReader will be run on a new thread task created
+/// on the I/O thread pool.
+class ARROW_ACERO_EXPORT RecordBatchReaderSourceNodeOptions : public ExecNodeOptions {
+ public:
+  /// Create an instance from values
+  RecordBatchReaderSourceNodeOptions(std::shared_ptr<RecordBatchReader> reader,
+                                     arrow::internal::Executor* io_executor = NULLPTR)
+      : reader(std::move(reader)), io_executor(io_executor) {}
+
+  /// \brief The RecordBatchReader which acts as the data source
+  std::shared_ptr<RecordBatchReader> reader;
+
+  /// \brief The executor to use for the reader
+  ///
+  /// Defaults to the default I/O executor.
+  arrow::internal::Executor* io_executor;
+};
+
+/// a source node that reads from an iterator of array vectors
+using ArrayVectorIteratorMaker = std::function<Iterator<std::shared_ptr<ArrayVector>>()>;
+/// \brief An extended Source node which accepts a schema and array-vectors
+class ARROW_ACERO_EXPORT ArrayVectorSourceNodeOptions
+    : public SchemaSourceNodeOptions<ArrayVectorIteratorMaker> {
+  using SchemaSourceNodeOptions::SchemaSourceNodeOptions;
+};
+
+/// a source node that reads from an iterator of ExecBatch
+using ExecBatchIteratorMaker = std::function<Iterator<std::shared_ptr<ExecBatch>>()>;
+/// \brief An extended Source node which accepts a schema and exec-batches
+class ARROW_ACERO_EXPORT ExecBatchSourceNodeOptions
+    : public SchemaSourceNodeOptions<ExecBatchIteratorMaker> {
+ public:
+  using SchemaSourceNodeOptions::SchemaSourceNodeOptions;
+  ExecBatchSourceNodeOptions(std::shared_ptr<Schema> schema,
+                             std::vector<ExecBatch> batches,
+                             ::arrow::internal::Executor* io_executor);
+  ExecBatchSourceNodeOptions(std::shared_ptr<Schema> schema,
+                             std::vector<ExecBatch> batches, bool requires_io = false);
+};
+
+using RecordBatchIteratorMaker = std::function<Iterator<std::shared_ptr<RecordBatch>>()>;
+/// a source node that reads from an iterator of RecordBatch
+class ARROW_ACERO_EXPORT RecordBatchSourceNodeOptions
+    : public SchemaSourceNodeOptions<RecordBatchIteratorMaker> {
+  using SchemaSourceNodeOptions::SchemaSourceNodeOptions;
+};
+
+/// \brief a node which excludes some rows from batches passed through it
+///
+/// filter_expression will be evaluated against each batch which is pushed to
+/// this node. Any rows for which filter_expression does not evaluate to `true` will be
+/// excluded in the batch emitted by this node.
+///
+/// This node will emit empty batches if all rows are excluded.  This is done
+/// to avoid gaps in the ordering.
+class ARROW_ACERO_EXPORT FilterNodeOptions : public ExecNodeOptions {
+ public:
+  /// \brief create an instance from values
+  explicit FilterNodeOptions(Expression filter_expression)
+      : filter_expression(std::move(filter_expression)) {}
+
+  /// \brief the expression to filter batches
+  ///
+  /// The return type of this expression must be boolean
+  Expression filter_expression;
+};
+
+/// \brief a node which selects a specified subset from the input
+class ARROW_ACERO_EXPORT FetchNodeOptions : public ExecNodeOptions {
+ public:
+  static constexpr std::string_view kName = "fetch";
+  /// \brief create an instance from values
+  FetchNodeOptions(int64_t offset, int64_t count) : offset(offset), count(count) {}
+  /// \brief the number of rows to skip
+  int64_t offset;
+  /// \brief the number of rows to keep (not counting skipped rows)
+  int64_t count;
+};
+
+/// \brief a node which executes expressions on input batches, producing batches
+/// of the same length with new columns.
+///
+/// Each expression will be evaluated against each batch which is pushed to
+/// this node to produce a corresponding output column.
+///
+/// If names are not provided, the string representations of exprs will be used.
+class ARROW_ACERO_EXPORT ProjectNodeOptions : public ExecNodeOptions {
+ public:
+  /// \brief create an instance from values
+  explicit ProjectNodeOptions(std::vector<Expression> expressions,
+                              std::vector<std::string> names = {})
+      : expressions(std::move(expressions)), names(std::move(names)) {}
+
+  /// \brief the expressions to run on the batches
+  ///
+  /// The output will have one column for each expression.  If you wish to keep any of
+  /// the columns from the input then you should create a simple field_ref expression
+  /// for that column.
+  std::vector<Expression> expressions;
+  /// \brief the names of the output columns
+  ///
+  /// If this is not specified then the result of calling ToString on the expression will
+  /// be used instead
+  ///
+  /// This list should either be empty or have the same length as `expressions`
+  std::vector<std::string> names;
+};
+
+/// \brief a node which aggregates input batches and calculates summary statistics
+///
+/// The node can summarize the entire input or it can group the input with grouping keys
+/// and segment keys.
+///
+/// By default, the aggregate node is a pipeline breaker.  It must accumulate all input
+/// before any output is produced.  Segment keys are a performance optimization.  If
+/// you know your input is already partitioned by one or more columns then you can
+/// specify these as segment keys.  At each change in the segment keys the node will
+/// emit values for all data seen so far.
+///
+/// Segment keys are currently limited to single-threaded mode.
+///
+/// Both keys and segment-keys determine the group.  However segment-keys are also used
+/// for determining grouping segments, which should be large, and allow streaming a
+/// partial aggregation result after processing each segment.  One common use-case for
+/// segment-keys is ordered aggregation, in which the segment-key attribute specifies a
+/// column with non-decreasing values or a lexicographically-ordered set of such columns.
+///
+/// If the keys attribute is a non-empty vector, then each aggregate in `aggregates` is
+/// expected to be a HashAggregate function. If the keys attribute is an empty vector,
+/// then each aggregate is assumed to be a ScalarAggregate function.
+///
+/// If the segment_keys attribute is a non-empty vector, then segmented aggregation, as
+/// described above, applies.
+///
+/// The keys and segment_keys vectors must be disjoint.
+///
+/// If no measures are provided then you will simply get the list of unique keys.
+///
+/// This node outputs segment keys first, followed by regular keys, followed by one
+/// column for each aggregate.
+class ARROW_ACERO_EXPORT AggregateNodeOptions : public ExecNodeOptions {
+ public:
+  /// \brief create an instance from values
+  explicit AggregateNodeOptions(std::vector<Aggregate> aggregates,
+                                std::vector<FieldRef> keys = {},
+                                std::vector<FieldRef> segment_keys = {})
+      : aggregates(std::move(aggregates)),
+        keys(std::move(keys)),
+        segment_keys(std::move(segment_keys)) {}
+
+  // aggregations which will be applied to the targeted fields
+  std::vector<Aggregate> aggregates;
+  // keys by which aggregations will be grouped (optional)
+  std::vector<FieldRef> keys;
+  // keys by which aggregations will be segmented (optional)
+  std::vector<FieldRef> segment_keys;
+};
+
+/// \brief a default value at which backpressure will be applied
+constexpr int32_t kDefaultBackpressureHighBytes = 1 << 30;  // 1GiB
+/// \brief a default value at which backpressure will be removed
+constexpr int32_t kDefaultBackpressureLowBytes = 1 << 28;  // 256MiB
+
+/// \brief an interface that can be queried for backpressure statistics
+class ARROW_ACERO_EXPORT BackpressureMonitor {
+ public:
+  virtual ~BackpressureMonitor() = default;
+  /// \brief fetches the number of bytes currently queued up
+  virtual uint64_t bytes_in_use() = 0;
+  /// \brief checks to see if backpressure is currently applied
+  virtual bool is_paused() = 0;
+};
+
+/// \brief Options to control backpressure behavior
+struct ARROW_ACERO_EXPORT BackpressureOptions {
+  /// \brief Create default options that perform no backpressure
+  BackpressureOptions() : resume_if_below(0), pause_if_above(0) {}
+  /// \brief Create options that will perform backpressure
+  ///
+  /// \param resume_if_below The producer should resume producing if the backpressure
+  ///                        queue has fewer than resume_if_below items.
+  /// \param pause_if_above The producer should pause producing if the backpressure
+  ///                       queue has more than pause_if_above items
+  BackpressureOptions(uint64_t resume_if_below, uint64_t pause_if_above)
+      : resume_if_below(resume_if_below), pause_if_above(pause_if_above) {}
+
+  /// \brief create an instance using default values for backpressure limits
+  static BackpressureOptions DefaultBackpressure() {
+    return BackpressureOptions(kDefaultBackpressureLowBytes,
+                               kDefaultBackpressureHighBytes);
+  }
+
+  /// \brief helper method to determine if backpressure is disabled
+  /// \return true if pause_if_above is greater than zero, false otherwise
+  bool should_apply_backpressure() const { return pause_if_above > 0; }
+
+  /// \brief the number of bytes at which the producer should resume producing
+  uint64_t resume_if_below;
+  /// \brief the number of bytes at which the producer should pause producing
+  ///
+  /// If this is <= 0 then backpressure will be disabled
+  uint64_t pause_if_above;
+};
+
+/// \brief a sink node which collects results in a queue
+///
+/// Emitted batches will only be ordered if there is a meaningful ordering
+/// and sequence_output is not set to false.
+class ARROW_ACERO_EXPORT SinkNodeOptions : public ExecNodeOptions {
+ public:
+  explicit SinkNodeOptions(std::function<Future<std::optional<ExecBatch>>()>* generator,
+                           std::shared_ptr<Schema>* schema,
+                           BackpressureOptions backpressure = {},
+                           BackpressureMonitor** backpressure_monitor = NULLPTR,
+                           std::optional<bool> sequence_output = std::nullopt)
+      : generator(generator),
+        schema(schema),
+        backpressure(backpressure),
+        backpressure_monitor(backpressure_monitor),
+        sequence_output(sequence_output) {}
+
+  explicit SinkNodeOptions(std::function<Future<std::optional<ExecBatch>>()>* generator,
+                           BackpressureOptions backpressure = {},
+                           BackpressureMonitor** backpressure_monitor = NULLPTR,
+                           std::optional<bool> sequence_output = std::nullopt)
+      : generator(generator),
+        schema(NULLPTR),
+        backpressure(std::move(backpressure)),
+        backpressure_monitor(backpressure_monitor),
+        sequence_output(sequence_output) {}
+
+  /// \brief A pointer to a generator of batches.
+  ///
+  /// This will be set when the node is added to the plan and should be used to consume
+  /// data from the plan.  If this function is not called frequently enough then the sink
+  /// node will start to accumulate data and may apply backpressure.
+  std::function<Future<std::optional<ExecBatch>>()>* generator;
+  /// \brief A pointer which will be set to the schema of the generated batches
+  ///
+  /// This is optional, if nullptr is passed in then it will be ignored.
+  /// This will be set when the node is added to the plan, before StartProducing is called
+  std::shared_ptr<Schema>* schema;
+  /// \brief Options to control when to apply backpressure
+  ///
+  /// This is optional, the default is to never apply backpressure.  If the plan is not
+  /// consumed quickly enough the system may eventually run out of memory.
+  BackpressureOptions backpressure;
+  /// \brief A pointer to a backpressure monitor
+  ///
+  /// This will be set when the node is added to the plan.  This can be used to inspect
+  /// the amount of data currently queued in the sink node.  This is an optional utility
+  /// and backpressure can be applied even if this is not used.
+  BackpressureMonitor** backpressure_monitor;
+  /// \brief Controls whether batches should be emitted immediately or sequenced in order
+  ///
+  /// \see QueryOptions for more details
+  std::optional<bool> sequence_output;
+};
+
+/// \brief Control used by a SinkNodeConsumer to pause & resume
+///
+/// Callers should ensure that they do not call Pause and Resume simultaneously and they
+/// should sequence things so that a call to Pause() is always followed by an eventual
+/// call to Resume()
+class ARROW_ACERO_EXPORT BackpressureControl {
+ public:
+  virtual ~BackpressureControl() = default;
+  /// \brief Ask the input to pause
+  ///
+  /// This is best effort, batches may continue to arrive
+  /// Must eventually be followed by a call to Resume() or deadlock will occur
+  virtual void Pause() = 0;
+  /// \brief Ask the input to resume
+  virtual void Resume() = 0;
+};
+
+/// \brief a sink node that consumes the data as part of the plan using callbacks
+class ARROW_ACERO_EXPORT SinkNodeConsumer {
+ public:
+  virtual ~SinkNodeConsumer() = default;
+  /// \brief Prepare any consumer state
+  ///
+  /// This will be run once the schema is finalized as the plan is starting and
+  /// before any calls to Consume.  A common use is to save off the schema so that
+  /// batches can be interpreted.
+  virtual Status Init(const std::shared_ptr<Schema>& schema,
+                      BackpressureControl* backpressure_control, ExecPlan* plan) = 0;
+  /// \brief Consume a batch of data
+  virtual Status Consume(ExecBatch batch) = 0;
+  /// \brief Signal to the consumer that the last batch has been delivered
+  ///
+  /// The returned future should only finish when all outstanding tasks have completed
+  ///
+  /// If the plan is ended early or aborts due to an error then this will not be
+  /// called.
+  virtual Future<> Finish() = 0;
+};
+
+/// \brief Add a sink node which consumes data within the exec plan run
+class ARROW_ACERO_EXPORT ConsumingSinkNodeOptions : public ExecNodeOptions {
+ public:
+  explicit ConsumingSinkNodeOptions(std::shared_ptr<SinkNodeConsumer> consumer,
+                                    std::vector<std::string> names = {},
+                                    std::optional<bool> sequence_output = std::nullopt)
+      : consumer(std::move(consumer)),
+        names(std::move(names)),
+        sequence_output(sequence_output) {}
+
+  std::shared_ptr<SinkNodeConsumer> consumer;
+  /// \brief Names to rename the sink's schema fields to
+  ///
+  /// If specified then names must be provided for all fields. Currently, only a flat
+  /// schema is supported (see GH-31875).
+  ///
+  /// If not specified then names will be generated based on the source data.
+  std::vector<std::string> names;
+  /// \brief Controls whether batches should be emitted immediately or sequenced in order
+  ///
+  /// \see QueryOptions for more details
+  std::optional<bool> sequence_output;
+};
+
+/// \brief Make a node which sorts rows passed through it
+///
+/// All batches pushed to this node will be accumulated, then sorted, by the given
+/// fields. Then sorted batches will be forwarded to the generator in sorted order.
+class ARROW_ACERO_EXPORT OrderBySinkNodeOptions : public SinkNodeOptions {
+ public:
+  /// \brief create an instance from values
+  explicit OrderBySinkNodeOptions(
+      SortOptions sort_options,
+      std::function<Future<std::optional<ExecBatch>>()>* generator)
+      : SinkNodeOptions(generator), sort_options(std::move(sort_options)) {}
+
+  /// \brief options describing which columns and direction to sort
+  SortOptions sort_options;
+};
+
+/// \brief Apply a new ordering to data
+///
+/// Currently this node works by accumulating all data, sorting, and then emitting
+/// the new data with an updated batch index.
+///
+/// Larger-than-memory sort is not currently supported.
+class ARROW_ACERO_EXPORT OrderByNodeOptions : public ExecNodeOptions {
+ public:
+  static constexpr std::string_view kName = "order_by";
+  explicit OrderByNodeOptions(Ordering ordering) : ordering(std::move(ordering)) {}
+
+  /// \brief The new ordering to apply to outgoing data
+  Ordering ordering;
+};
+
+enum class JoinType {
+  LEFT_SEMI,
+  RIGHT_SEMI,
+  LEFT_ANTI,
+  RIGHT_ANTI,
+  INNER,
+  LEFT_OUTER,
+  RIGHT_OUTER,
+  FULL_OUTER
+};
+
+std::string ToString(JoinType t);
+
+enum class JoinKeyCmp { EQ, IS };
+
+/// \brief a node which implements a join operation using a hash table
+class ARROW_ACERO_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
+ public:
+  static constexpr const char* default_output_suffix_for_left = "";
+  static constexpr const char* default_output_suffix_for_right = "";
+  /// \brief create an instance from values that outputs all columns
+  HashJoinNodeOptions(
+      JoinType in_join_type, std::vector<FieldRef> in_left_keys,
+      std::vector<FieldRef> in_right_keys, Expression filter = literal(true),
+      std::string output_suffix_for_left = default_output_suffix_for_left,
+      std::string output_suffix_for_right = default_output_suffix_for_right,
+      bool disable_bloom_filter = false)
+      : join_type(in_join_type),
+        left_keys(std::move(in_left_keys)),
+        right_keys(std::move(in_right_keys)),
+        output_all(true),
+        output_suffix_for_left(std::move(output_suffix_for_left)),
+        output_suffix_for_right(std::move(output_suffix_for_right)),
+        filter(std::move(filter)),
+        disable_bloom_filter(disable_bloom_filter) {
+    this->key_cmp.resize(this->left_keys.size());
+    for (size_t i = 0; i < this->left_keys.size(); ++i) {
+      this->key_cmp[i] = JoinKeyCmp::EQ;
+    }
+  }
+  /// \brief create an instance from keys
+  ///
+  /// This will create an inner join that outputs all columns and has no post join filter
+  ///
+  /// `in_left_keys` should have the same length and types as `in_right_keys`
+  /// @param in_left_keys the keys in the left input
+  /// @param in_right_keys the keys in the right input
+  HashJoinNodeOptions(std::vector<FieldRef> in_left_keys,
+                      std::vector<FieldRef> in_right_keys)
+      : left_keys(std::move(in_left_keys)), right_keys(std::move(in_right_keys)) {
+    this->join_type = JoinType::INNER;
+    this->output_all = true;
+    this->output_suffix_for_left = default_output_suffix_for_left;
+    this->output_suffix_for_right = default_output_suffix_for_right;
+    this->key_cmp.resize(this->left_keys.size());
+    for (size_t i = 0; i < this->left_keys.size(); ++i) {
+      this->key_cmp[i] = JoinKeyCmp::EQ;
+    }
+    this->filter = literal(true);
+  }
+  /// \brief create an instance from values using JoinKeyCmp::EQ for all comparisons
+  HashJoinNodeOptions(
+      JoinType join_type, std::vector<FieldRef> left_keys,
+      std::vector<FieldRef> right_keys, std::vector<FieldRef> left_output,
+      std::vector<FieldRef> right_output, Expression filter = literal(true),
+      std::string output_suffix_for_left = default_output_suffix_for_left,
+      std::string output_suffix_for_right = default_output_suffix_for_right,
+      bool disable_bloom_filter = false)
+      : join_type(join_type),
+        left_keys(std::move(left_keys)),
+        right_keys(std::move(right_keys)),
+        output_all(false),
+        left_output(std::move(left_output)),
+        right_output(std::move(right_output)),
+        output_suffix_for_left(std::move(output_suffix_for_left)),
+        output_suffix_for_right(std::move(output_suffix_for_right)),
+        filter(std::move(filter)),
+        disable_bloom_filter(disable_bloom_filter) {
+    this->key_cmp.resize(this->left_keys.size());
+    for (size_t i = 0; i < this->left_keys.size(); ++i) {
+      this->key_cmp[i] = JoinKeyCmp::EQ;
+    }
+  }
+  /// \brief create an instance from values
+  HashJoinNodeOptions(
+      JoinType join_type, std::vector<FieldRef> left_keys,
+      std::vector<FieldRef> right_keys, std::vector<FieldRef> left_output,
+      std::vector<FieldRef> right_output, std::vector<JoinKeyCmp> key_cmp,
+      Expression filter = literal(true),
+      std::string output_suffix_for_left = default_output_suffix_for_left,
+      std::string output_suffix_for_right = default_output_suffix_for_right,
+      bool disable_bloom_filter = false)
+      : join_type(join_type),
+        left_keys(std::move(left_keys)),
+        right_keys(std::move(right_keys)),
+        output_all(false),
+        left_output(std::move(left_output)),
+        right_output(std::move(right_output)),
+        key_cmp(std::move(key_cmp)),
+        output_suffix_for_left(std::move(output_suffix_for_left)),
+        output_suffix_for_right(std::move(output_suffix_for_right)),
+        filter(std::move(filter)),
+        disable_bloom_filter(disable_bloom_filter) {}
+
+  HashJoinNodeOptions() = default;
+
+  // type of join (inner, left, semi...)
+  JoinType join_type = JoinType::INNER;
+  // key fields from left input
+  std::vector<FieldRef> left_keys;
+  // key fields from right input
+  std::vector<FieldRef> right_keys;
+  // if set all valid fields from both left and right input will be output
+  // (and field ref vectors for output fields will be ignored)
+  bool output_all = false;
+  // output fields passed from left input
+  std::vector<FieldRef> left_output;
+  // output fields passed from right input
+  std::vector<FieldRef> right_output;
+  // key comparison function (determines whether a null key is equal another null
+  // key or not)
+  std::vector<JoinKeyCmp> key_cmp;
+  // suffix added to names of output fields coming from left input (used to distinguish,
+  // if necessary, between fields of the same name in left and right input and can be left
+  // empty if there are no name collisions)
+  std::string output_suffix_for_left;
+  // suffix added to names of output fields coming from right input
+  std::string output_suffix_for_right;
+  // residual filter which is applied to matching rows.  Rows that do not match
+  // the filter are not included.  The filter is applied against the
+  // concatenated input schema (left fields then right fields) and can reference
+  // fields that are not included in the output.
+  Expression filter = literal(true);
+  // whether or not to disable Bloom filters in this join
+  bool disable_bloom_filter = false;
+};
+
+/// \brief a node which implements the asof join operation
+///
+/// Note, this API is experimental and will change in the future
+///
+/// This node takes one left table and any number of right tables, and asof joins them
+/// together. Batches produced by each input must be ordered by the "on" key.
+/// This node will output one row for each row in the left table.
+class ARROW_ACERO_EXPORT AsofJoinNodeOptions : public ExecNodeOptions {
+ public:
+  /// \brief Keys for one input table of the AsofJoin operation
+  ///
+  /// The keys must be consistent across the input tables:
+  /// Each "on" key must refer to a field of the same type and units across the tables.
+  /// Each "by" key must refer to a list of fields of the same types across the tables.
+  struct Keys {
+    /// \brief "on" key for the join.
+    ///
+    /// The input table must be sorted by the "on" key. Must be a single field of a common
+    /// type. An inexact match is used on the "on" key, i.e. a row is considered a
+    /// match if and only if `right.on - left.on` is in the range
+    /// `[min(0, tolerance), max(0, tolerance)]`.
+    /// Currently, the "on" key must be of an integer, date, or timestamp type.
+    FieldRef on_key;
+    /// \brief "by" key for the join.
+    ///
+    /// Each input table must have each field of the "by" key.  Exact equality is used for
+    /// each field of the "by" key.
+    /// Currently, each field of the "by" key must be of an integer, date, timestamp, or
+    /// base-binary type.
+    std::vector<FieldRef> by_key;
+  };
+
+  AsofJoinNodeOptions(std::vector<Keys> input_keys, int64_t tolerance)
+      : input_keys(std::move(input_keys)), tolerance(tolerance) {}
+
+  /// \brief AsofJoin keys per input table. At least two keys must be given. The first key
+  /// corresponds to a left table and all other keys correspond to right tables for the
+  /// as-of-join.
+  ///
+  /// \see `Keys` for details.
+  std::vector<Keys> input_keys;
+  /// \brief Tolerance for inexact "on" key matching. A right row is considered a match
+  /// with a left row if `right.on - left.on` is in the range
+  /// `[min(0, tolerance), max(0, tolerance)]`. `tolerance` may be:
+  /// - negative, in which case a past-as-of-join occurs (match iff
+  ///   `tolerance <= right.on - left.on <= 0`);
+  /// - or positive, in which case a future-as-of-join occurs (match iff
+  ///   `0 <= right.on - left.on <= tolerance`);
+  /// - or zero, in which case an exact-as-of-join occurs (match iff
+  ///   `right.on == left.on`).
+  ///
+  /// The tolerance is interpreted in the same units as the "on" key.
+  int64_t tolerance;
+};
+
+/// \brief a node which select top_k/bottom_k rows passed through it
+///
+/// All batches pushed to this node will be accumulated, then selected, by the given
+/// fields. Then sorted batches will be forwarded to the generator in sorted order.
+class ARROW_ACERO_EXPORT SelectKSinkNodeOptions : public SinkNodeOptions {
+ public:
+  explicit SelectKSinkNodeOptions(
+      SelectKOptions select_k_options,
+      std::function<Future<std::optional<ExecBatch>>()>* generator)
+      : SinkNodeOptions(generator), select_k_options(std::move(select_k_options)) {}
+
+  /// SelectK options
+  SelectKOptions select_k_options;
+};
+
+/// \brief a sink node which accumulates all output into a table
+class ARROW_ACERO_EXPORT TableSinkNodeOptions : public ExecNodeOptions {
+ public:
+  /// \brief create an instance from values
+  explicit TableSinkNodeOptions(std::shared_ptr<Table>* output_table,
+                                std::optional<bool> sequence_output = std::nullopt)
+      : output_table(output_table), sequence_output(sequence_output) {}
+
+  /// \brief an "out parameter" specifying the table that will be created
+  ///
+  /// Must not be null and remain valid for the entirety of the plan execution.  After the
+  /// plan has completed this will be set to point to the result table
+  std::shared_ptr<Table>* output_table;
+  /// \brief Controls whether batches should be emitted immediately or sequenced in order
+  ///
+  /// \see QueryOptions for more details
+  std::optional<bool> sequence_output;
+  /// \brief Custom names to use for the columns.
+  ///
+  /// If specified then names must be provided for all fields. Currently, only a flat
+  /// schema is supported (see GH-31875).
+  ///
+  /// If not specified then names will be generated based on the source data.
+  std::vector<std::string> names;
+};
+
+/// \brief a row template that describes one row that will be generated for each input row
+struct ARROW_ACERO_EXPORT PivotLongerRowTemplate {
+  PivotLongerRowTemplate(std::vector<std::string> feature_values,
+                         std::vector<std::optional<FieldRef>> measurement_values)
+      : feature_values(std::move(feature_values)),
+        measurement_values(std::move(measurement_values)) {}
+  /// A (typically unique) set of feature values for the template, usually derived from a
+  /// column name
+  ///
+  /// These will be used to populate the feature columns
+  std::vector<std::string> feature_values;
+  /// The fields containing the measurements to use for this row
+  ///
+  /// These will be used to populate the measurement columns.  If nullopt then nulls
+  /// will be inserted for the given value.
+  std::vector<std::optional<FieldRef>> measurement_values;
+};
+
+/// \brief Reshape a table by turning some columns into additional rows
+///
+/// This operation is sometimes also referred to as UNPIVOT
+///
+/// This is typically done when there are multiple observations in each row in order to
+/// transform to a table containing a single observation per row.
+///
+/// For example:
+///
+/// | time | left_temp | right_temp |
+/// | ---- | --------- | ---------- |
+/// | 1    | 10        | 20         |
+/// | 2    | 15        | 18         |
+///
+/// The above table contains two observations per row.  There is an implicit feature
+/// "location" (left vs right) and a measurement "temp".  What we really want is:
+///
+/// | time | location | temp |
+/// | ---  | ---      | ---  |
+/// | 1    | left     | 10   |
+/// | 1    | right    | 20   |
+/// | 2    | left     | 15   |
+/// | 2    | right    | 18   |
+///
+/// For a more complex example consider:
+///
+/// | time | ax1 | ay1 | bx1 | ay2 |
+/// | ---- | --- | --- | --- | --- |
+/// | 0    | 1   | 2   | 3   | 4   |
+///
+/// We can pretend a vs b and x vs y are features while 1 and 2 are two different
+/// kinds of measurements.  We thus want to pivot to
+///
+/// | time | a/b | x/y |  f1  |  f2  |
+/// | ---- | --- | --- | ---- | ---- |
+/// | 0    | a   | x   | 1    | null |
+/// | 0    | a   | y   | 2    | 4    |
+/// | 0    | b   | x   | 3    | null |
+///
+/// To do this we create a row template for each combination of features.  One should
+/// be able to do this purely by looking at the column names.  For example, given the
+/// above columns "ax1", "ay1", "bx1", and "ay2" we know we have three feature
+/// combinations (a, x), (a, y), and (b, x).  Similarly, we know we have two possible
+/// measurements, "1" and "2".
+///
+/// For each combination of features we create a row template.  In each row template we
+/// describe the combination and then list which columns to use for the measurements.
+/// If a measurement doesn't exist for a given combination then we use nullopt.
+///
+/// So, for our above example, we have:
+///
+/// (a, x): names={"a", "x"}, values={"ax1", nullopt}
+/// (a, y): names={"a", "y"}, values={"ay1", "ay2"}
+/// (b, x): names={"b", "x"}, values={"bx1", nullopt}
+///
+/// Finishing it off we name our new columns:
+/// feature_field_names={"a/b","x/y"}
+/// measurement_field_names={"f1", "f2"}
+class ARROW_ACERO_EXPORT PivotLongerNodeOptions : public ExecNodeOptions {
+ public:
+  static constexpr std::string_view kName = "pivot_longer";
+  /// One or more row templates to create new output rows
+  ///
+  /// Normally there are at least two row templates.  The output # of rows
+  /// will be the input # of rows * the number of row templates
+  std::vector<PivotLongerRowTemplate> row_templates;
+  /// The names of the columns which describe the new features
+  std::vector<std::string> feature_field_names;
+  /// The names of the columns which represent the measurements
+  std::vector<std::string> measurement_field_names;
+};
+
+/// @}
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/order_by_impl.h b/pyarrow/include/arrow/acero/order_by_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b5a0f69a69ffc8f23fb5416e82777d2d06f0a00
--- /dev/null
+++ b/pyarrow/include/arrow/acero/order_by_impl.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/acero/options.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+using compute::ExecContext;
+
+namespace acero {
+
+class OrderByImpl {
+ public:
+  virtual ~OrderByImpl() = default;
+
+  virtual void InputReceived(const std::shared_ptr<RecordBatch>& batch) = 0;
+
+  virtual Result<Datum> DoFinish() = 0;
+
+  virtual std::string ToString() const = 0;
+
+  static Result<std::unique_ptr<OrderByImpl>> MakeSort(
+      ExecContext* ctx, const std::shared_ptr<Schema>& output_schema,
+      const SortOptions& options);
+
+  static Result<std::unique_ptr<OrderByImpl>> MakeSelectK(
+      ExecContext* ctx, const std::shared_ptr<Schema>& output_schema,
+      const SelectKOptions& options);
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/partition_util.h b/pyarrow/include/arrow/acero/partition_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..52cc47bb8a99f5fcc32defa09698c715025f322b
--- /dev/null
+++ b/pyarrow/include/arrow/acero/partition_util.h
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <random>
+
+#include "arrow/acero/util.h"
+#include "arrow/buffer.h"
+#include "arrow/util/pcg_random.h"
+
+namespace arrow {
+namespace acero {
+
+class PartitionSort {
+ public:
+  /// \brief Bucket sort rows on partition ids in O(num_rows) time.
+  ///
+  /// Include in the output exclusive cumulative sum of bucket sizes.
+  /// This corresponds to ranges in the sorted array containing all row ids for
+  /// each of the partitions.
+  ///
+  /// prtn_ranges must be initialized and have at least num_prtns + 1 elements
+  /// when this method returns prtn_ranges[i] will contains the total number of
+  /// elements in partitions 0 through i.  prtn_ranges[0] will be 0.
+  ///
+  /// prtn_id_impl must be a function that takes in a row id (int) and returns
+  /// a partition id (int).  The returned partition id must be between 0 and
+  /// num_prtns (exclusive).
+  ///
+  /// output_pos_impl is a function that takes in a row id (int) and a position (int)
+  /// in the bucket sorted output.  The function should insert the row in the
+  /// output.
+  ///
+  /// For example:
+  ///
+  /// in_arr: [5, 7, 2, 3, 5, 4]
+  /// num_prtns: 3
+  /// prtn_id_impl: [&in_arr] (int row_id) { return in_arr[row_id] / 3; }
+  /// output_pos_impl: [&sorted_row_ids] (int row_id, int pos) { sorted_row_ids[pos] =
+  /// row_id; }
+  ///
+  /// After Execution
+  /// sorted_row_ids: [2, 0, 3, 4, 5, 1]
+  /// prtn_ranges: [0, 1, 5, 6]
+  template <class INPUT_PRTN_ID_FN, class OUTPUT_POS_FN>
+  static void Eval(int64_t num_rows, int num_prtns, uint16_t* prtn_ranges,
+                   INPUT_PRTN_ID_FN prtn_id_impl, OUTPUT_POS_FN output_pos_impl) {
+    ARROW_DCHECK(num_rows > 0 && num_rows <= (1 << 15));
+    ARROW_DCHECK(num_prtns >= 1 && num_prtns <= (1 << 15));
+
+    memset(prtn_ranges, 0, (num_prtns + 1) * sizeof(uint16_t));
+
+    for (int64_t i = 0; i < num_rows; ++i) {
+      int prtn_id = static_cast<int>(prtn_id_impl(i));
+      ++prtn_ranges[prtn_id + 1];
+    }
+
+    uint16_t sum = 0;
+    for (int i = 0; i < num_prtns; ++i) {
+      uint16_t sum_next = sum + prtn_ranges[i + 1];
+      prtn_ranges[i + 1] = sum;
+      sum = sum_next;
+    }
+
+    for (int64_t i = 0; i < num_rows; ++i) {
+      int prtn_id = static_cast<int>(prtn_id_impl(i));
+      int pos = prtn_ranges[prtn_id + 1]++;
+      output_pos_impl(i, pos);
+    }
+  }
+};
+
+/// \brief A control for synchronizing threads on a partitionable workload
+class PartitionLocks {
+ public:
+  PartitionLocks();
+  ~PartitionLocks();
+  /// \brief Initializes the control, must be called before use
+  ///
+  /// \param num_threads Maximum number of threads that will access the partitions
+  /// \param num_prtns Number of partitions to synchronize
+  void Init(size_t num_threads, int num_prtns);
+  /// \brief Cleans up the control, it should not be used after this call
+  void CleanUp();
+  /// \brief Acquire a partition to work on one
+  ///
+  /// \param thread_id The index of the thread trying to acquire the partition lock
+  /// \param num_prtns Length of prtns_to_try, must be <= num_prtns used in Init
+  /// \param prtns_to_try An array of partitions that still have remaining work
+  /// \param limit_retries If false, this method will spinwait forever until success
+  /// \param max_retries Max times to attempt checking out work before returning false
+  /// \param[out] locked_prtn_id The id of the partition locked
+  /// \param[out] locked_prtn_id_pos The index of the partition locked in prtns_to_try
+  /// \return True if a partition was locked, false if max_retries was attempted
+  ///         without successfully acquiring a lock
+  ///
+  /// This method is thread safe
+  bool AcquirePartitionLock(size_t thread_id, int num_prtns, const int* prtns_to_try,
+                            bool limit_retries, int max_retries, int* locked_prtn_id,
+                            int* locked_prtn_id_pos);
+  /// \brief Release a partition so that other threads can work on it
+  void ReleasePartitionLock(int prtn_id);
+
+  // Executes (synchronously and using current thread) the same operation on a set of
+  // multiple partitions. Tries to minimize partition locking overhead by randomizing and
+  // adjusting order in which partitions are processed.
+  //
+  // PROCESS_PRTN_FN is a callback which will be executed for each partition after
+  // acquiring the lock for that partition. It gets partition id as an argument.
+  // IS_PRTN_EMPTY_FN is a callback which filters out (when returning true) partitions
+  // with specific ids from processing.
+  //
+  template <typename IS_PRTN_EMPTY_FN, typename PROCESS_PRTN_FN>
+  Status ForEachPartition(size_t thread_id,
+                          /*scratch space buffer with space for one element per partition;
+                             dirty in and dirty out*/
+                          int* temp_unprocessed_prtns, IS_PRTN_EMPTY_FN is_prtn_empty_fn,
+                          PROCESS_PRTN_FN process_prtn_fn) {
+    int num_unprocessed_partitions = 0;
+    for (int i = 0; i < num_prtns_; ++i) {
+      bool is_prtn_empty = is_prtn_empty_fn(i);
+      if (!is_prtn_empty) {
+        temp_unprocessed_prtns[num_unprocessed_partitions++] = i;
+      }
+    }
+    while (num_unprocessed_partitions > 0) {
+      int locked_prtn_id;
+      int locked_prtn_id_pos;
+      AcquirePartitionLock(thread_id, num_unprocessed_partitions, temp_unprocessed_prtns,
+                           /*limit_retries=*/false, /*max_retries=*/-1, &locked_prtn_id,
+                           &locked_prtn_id_pos);
+      {
+        class AutoReleaseLock {
+         public:
+          AutoReleaseLock(PartitionLocks* locks, int prtn_id)
+              : locks(locks), prtn_id(prtn_id) {}
+          ~AutoReleaseLock() { locks->ReleasePartitionLock(prtn_id); }
+          PartitionLocks* locks;
+          int prtn_id;
+        } auto_release_lock(this, locked_prtn_id);
+        ARROW_RETURN_NOT_OK(process_prtn_fn(locked_prtn_id));
+      }
+      if (locked_prtn_id_pos < num_unprocessed_partitions - 1) {
+        temp_unprocessed_prtns[locked_prtn_id_pos] =
+            temp_unprocessed_prtns[num_unprocessed_partitions - 1];
+      }
+      --num_unprocessed_partitions;
+    }
+    return Status::OK();
+  }
+
+ private:
+  std::atomic<bool>* lock_ptr(int prtn_id);
+  int random_int(size_t thread_id, int num_values);
+
+  struct PartitionLock {
+    static constexpr int kCacheLineBytes = 64;
+    std::atomic<bool> lock;
+    uint8_t padding[kCacheLineBytes];
+  };
+  int num_prtns_;
+  std::unique_ptr<PartitionLock[]> locks_;
+  std::unique_ptr<arrow::random::pcg32_fast[]> rngs_;
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/query_context.h b/pyarrow/include/arrow/acero/query_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..3eff299439828e602558e5ebc278660bb7ce37eb
--- /dev/null
+++ b/pyarrow/include/arrow/acero/query_context.h
@@ -0,0 +1,151 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <string_view>
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/acero/task_util.h"
+#include "arrow/acero/util.h"
+#include "arrow/compute/exec.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/async_util.h"
+#include "arrow/util/type_fwd.h"
+
+namespace arrow {
+
+using compute::default_exec_context;
+using io::IOContext;
+
+namespace acero {
+
+class ARROW_ACERO_EXPORT QueryContext {
+ public:
+  QueryContext(QueryOptions opts = {},
+               ExecContext exec_context = *default_exec_context());
+
+  Status Init(arrow::util::AsyncTaskScheduler* scheduler);
+
+  const ::arrow::internal::CpuInfo* cpu_info() const;
+  int64_t hardware_flags() const;
+  const QueryOptions& options() const { return options_; }
+  MemoryPool* memory_pool() const { return exec_context_.memory_pool(); }
+  ::arrow::internal::Executor* executor() const { return exec_context_.executor(); }
+  ExecContext* exec_context() { return &exec_context_; }
+  IOContext* io_context() { return &io_context_; }
+  TaskScheduler* scheduler() { return task_scheduler_.get(); }
+  arrow::util::AsyncTaskScheduler* async_scheduler() { return async_scheduler_; }
+
+  size_t GetThreadIndex();
+  size_t max_concurrency() const;
+
+  /// \brief Start an external task
+  ///
+  /// This should be avoided if possible.  It is kept in for now for legacy
+  /// purposes.  This should be called before the external task is started.  If
+  /// a valid future is returned then it should be marked complete when the
+  /// external task has finished.
+  ///
+  /// \param name A name to give the task for traceability and debugging
+  ///
+  /// \return an invalid future if the plan has already ended, otherwise this
+  ///         returns a future that must be completed when the external task
+  ///         finishes.
+  Result<Future<>> BeginExternalTask(std::string_view name);
+
+  /// \brief Add a single function as a task to the query's task group
+  ///        on the compute threadpool.
+  ///
+  /// \param fn The task to run. Takes no arguments and returns a Status.
+  /// \param name A name to give the task for traceability and debugging
+  void ScheduleTask(std::function<Status()> fn, std::string_view name);
+  /// \brief Add a single function as a task to the query's task group
+  ///        on the compute threadpool.
+  ///
+  /// \param fn The task to run. Takes the thread index and returns a Status.
+  /// \param name A name to give the task for traceability and debugging
+  void ScheduleTask(std::function<Status(size_t)> fn, std::string_view name);
+  /// \brief Add a single function as a task to the query's task group on
+  ///        the IO thread pool
+  ///
+  /// \param fn The task to run. Returns a status.
+  /// \param name A name to give the task for traceability and debugging
+  void ScheduleIOTask(std::function<Status()> fn, std::string_view name);
+
+  // Register/Start TaskGroup is a way of performing a "Parallel For" pattern:
+  // - The task function takes the thread index and the index of the task
+  // - The on_finished function takes the thread index
+  // Returns an integer ID that will be used to reference the task group in
+  // StartTaskGroup. At runtime, call StartTaskGroup with the ID and the number of times
+  // you'd like the task to be executed. The need to register a task group before use will
+  // be removed after we rewrite the scheduler.
+  /// \brief Register a "parallel for" task group with the scheduler
+  ///
+  /// \param task The function implementing the task. Takes the thread_index and
+  ///             the task index.
+  /// \param on_finished The function that gets run once all tasks have been completed.
+  /// Takes the thread_index.
+  ///
+  /// Must be called inside of ExecNode::Init.
+  int RegisterTaskGroup(std::function<Status(size_t, int64_t)> task,
+                        std::function<Status(size_t)> on_finished);
+
+  /// \brief Start the task group with the specified ID. This can only
+  ///        be called once per task_group_id.
+  ///
+  /// \param task_group_id The ID  of the task group to run
+  /// \param num_tasks The number of times to run the task
+  Status StartTaskGroup(int task_group_id, int64_t num_tasks);
+
+  // This is an RAII class for keeping track of in-flight file IO. Useful for getting
+  // an estimate of memory use, and how much memory we expect to be freed soon.
+  // Returned by ReportTempFileIO.
+  struct [[nodiscard]] TempFileIOMark {
+    QueryContext* ctx_;
+    size_t bytes_;
+
+    TempFileIOMark(QueryContext* ctx, size_t bytes) : ctx_(ctx), bytes_(bytes) {
+      ctx_->in_flight_bytes_to_disk_.fetch_add(bytes_, std::memory_order_acquire);
+    }
+
+    ARROW_DISALLOW_COPY_AND_ASSIGN(TempFileIOMark);
+
+    ~TempFileIOMark() {
+      ctx_->in_flight_bytes_to_disk_.fetch_sub(bytes_, std::memory_order_release);
+    }
+  };
+
+  TempFileIOMark ReportTempFileIO(size_t bytes) { return {this, bytes}; }
+
+  size_t GetCurrentTempFileIO() { return in_flight_bytes_to_disk_.load(); }
+
+ private:
+  QueryOptions options_;
+  // To be replaced with Acero-specific context once scheduler is done and
+  // we don't need ExecContext for kernels
+  ExecContext exec_context_;
+  IOContext io_context_;
+
+  arrow::util::AsyncTaskScheduler* async_scheduler_ = NULLPTR;
+  std::unique_ptr<TaskScheduler> task_scheduler_ = TaskScheduler::Make();
+
+  ThreadIndexer thread_indexer_;
+
+  std::atomic<size_t> in_flight_bytes_to_disk_{0};
+};
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/schema_util.h b/pyarrow/include/arrow/acero/schema_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..db3076a58841a6cb85fcc3d5033ef3b74ed18898
--- /dev/null
+++ b/pyarrow/include/arrow/acero/schema_util.h
@@ -0,0 +1,226 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/type.h"  // for DataType, FieldRef, Field and Schema
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace acero {
+
+// Identifiers for all different row schemas that are used in a join
+//
+enum class HashJoinProjection : int {
+  INPUT = 0,
+  KEY = 1,
+  PAYLOAD = 2,
+  FILTER = 3,
+  OUTPUT = 4
+};
+
+struct SchemaProjectionMap {
+  static constexpr int kMissingField = -1;
+  int num_cols;
+  const int* source_to_base;
+  const int* base_to_target;
+  inline int get(int i) const {
+    assert(i >= 0 && i < num_cols);
+    assert(source_to_base[i] != kMissingField);
+    return base_to_target[source_to_base[i]];
+  }
+};
+
+/// Helper class for managing different projections of the same row schema.
+/// Used to efficiently map any field in one projection to a corresponding field in
+/// another projection.
+/// Materialized mappings are generated lazily at the time of the first access.
+/// Thread-safe apart from initialization.
+template <typename ProjectionIdEnum>
+class SchemaProjectionMaps {
+ public:
+  static constexpr int kMissingField = -1;
+
+  Status Init(ProjectionIdEnum full_schema_handle, const Schema& schema,
+              const std::vector<ProjectionIdEnum>& projection_handles,
+              const std::vector<const std::vector<FieldRef>*>& projections) {
+    assert(projection_handles.size() == projections.size());
+    ARROW_RETURN_NOT_OK(RegisterSchema(full_schema_handle, schema));
+    for (size_t i = 0; i < projections.size(); ++i) {
+      ARROW_RETURN_NOT_OK(
+          RegisterProjectedSchema(projection_handles[i], *(projections[i]), schema));
+    }
+    RegisterEnd();
+    return Status::OK();
+  }
+
+  int num_cols(ProjectionIdEnum schema_handle) const {
+    int id = schema_id(schema_handle);
+    return static_cast<int>(schemas_[id].second.data_types.size());
+  }
+
+  bool is_empty(ProjectionIdEnum schema_handle) const {
+    return num_cols(schema_handle) == 0;
+  }
+
+  const std::string& field_name(ProjectionIdEnum schema_handle, int field_id) const {
+    int id = schema_id(schema_handle);
+    return schemas_[id].second.field_names[field_id];
+  }
+
+  const std::shared_ptr<DataType>& data_type(ProjectionIdEnum schema_handle,
+                                             int field_id) const {
+    int id = schema_id(schema_handle);
+    return schemas_[id].second.data_types[field_id];
+  }
+
+  const std::vector<std::shared_ptr<DataType>>& data_types(
+      ProjectionIdEnum schema_handle) const {
+    int id = schema_id(schema_handle);
+    return schemas_[id].second.data_types;
+  }
+
+  SchemaProjectionMap map(ProjectionIdEnum from, ProjectionIdEnum to) const {
+    int id_from = schema_id(from);
+    int id_to = schema_id(to);
+    SchemaProjectionMap result;
+    result.num_cols = num_cols(from);
+    result.source_to_base = mappings_[id_from].data();
+    result.base_to_target = inverse_mappings_[id_to].data();
+    return result;
+  }
+
+ protected:
+  struct FieldInfos {
+    std::vector<int> field_paths;
+    std::vector<std::string> field_names;
+    std::vector<std::shared_ptr<DataType>> data_types;
+  };
+
+  Status RegisterSchema(ProjectionIdEnum handle, const Schema& schema) {
+    FieldInfos out_fields;
+    const FieldVector& in_fields = schema.fields();
+    out_fields.field_paths.resize(in_fields.size());
+    out_fields.field_names.resize(in_fields.size());
+    out_fields.data_types.resize(in_fields.size());
+    for (size_t i = 0; i < in_fields.size(); ++i) {
+      const std::string& name = in_fields[i]->name();
+      const std::shared_ptr<DataType>& type = in_fields[i]->type();
+      out_fields.field_paths[i] = static_cast<int>(i);
+      out_fields.field_names[i] = name;
+      out_fields.data_types[i] = type;
+    }
+    schemas_.push_back(std::make_pair(handle, out_fields));
+    return Status::OK();
+  }
+
+  Status RegisterProjectedSchema(ProjectionIdEnum handle,
+                                 const std::vector<FieldRef>& selected_fields,
+                                 const Schema& full_schema) {
+    FieldInfos out_fields;
+    const FieldVector& in_fields = full_schema.fields();
+    out_fields.field_paths.resize(selected_fields.size());
+    out_fields.field_names.resize(selected_fields.size());
+    out_fields.data_types.resize(selected_fields.size());
+    for (size_t i = 0; i < selected_fields.size(); ++i) {
+      // All fields must be found in schema without ambiguity
+      ARROW_ASSIGN_OR_RAISE(auto match, selected_fields[i].FindOne(full_schema));
+      const std::string& name = in_fields[match[0]]->name();
+      const std::shared_ptr<DataType>& type = in_fields[match[0]]->type();
+      out_fields.field_paths[i] = match[0];
+      out_fields.field_names[i] = name;
+      out_fields.data_types[i] = type;
+    }
+    schemas_.push_back(std::make_pair(handle, out_fields));
+    return Status::OK();
+  }
+
+  void RegisterEnd() {
+    size_t size = schemas_.size();
+    mappings_.resize(size);
+    inverse_mappings_.resize(size);
+    int id_base = 0;
+    for (size_t i = 0; i < size; ++i) {
+      GenerateMapForProjection(static_cast<int>(i), id_base);
+    }
+  }
+
+  int schema_id(ProjectionIdEnum schema_handle) const {
+    for (size_t i = 0; i < schemas_.size(); ++i) {
+      if (schemas_[i].first == schema_handle) {
+        return static_cast<int>(i);
+      }
+    }
+    // We should never get here
+    assert(false);
+    return -1;
+  }
+
+  void GenerateMapForProjection(int id_proj, int id_base) {
+    int num_cols_proj = static_cast<int>(schemas_[id_proj].second.data_types.size());
+    int num_cols_base = static_cast<int>(schemas_[id_base].second.data_types.size());
+
+    std::vector<int>& mapping = mappings_[id_proj];
+    std::vector<int>& inverse_mapping = inverse_mappings_[id_proj];
+    mapping.resize(num_cols_proj);
+    inverse_mapping.resize(num_cols_base);
+
+    if (id_proj == id_base) {
+      for (int i = 0; i < num_cols_base; ++i) {
+        mapping[i] = inverse_mapping[i] = i;
+      }
+    } else {
+      const FieldInfos& fields_proj = schemas_[id_proj].second;
+      const FieldInfos& fields_base = schemas_[id_base].second;
+      for (int i = 0; i < num_cols_base; ++i) {
+        inverse_mapping[i] = SchemaProjectionMap::kMissingField;
+      }
+      for (int i = 0; i < num_cols_proj; ++i) {
+        int field_id = SchemaProjectionMap::kMissingField;
+        for (int j = 0; j < num_cols_base; ++j) {
+          if (fields_proj.field_paths[i] == fields_base.field_paths[j]) {
+            field_id = j;
+            // If there are multiple matches for the same input field,
+            // it will be mapped to the first match.
+            break;
+          }
+        }
+        assert(field_id != SchemaProjectionMap::kMissingField);
+        mapping[i] = field_id;
+        inverse_mapping[field_id] = i;
+      }
+    }
+  }
+
+  // vector used as a mapping from ProjectionIdEnum to fields
+  std::vector<std::pair<ProjectionIdEnum, FieldInfos>> schemas_;
+  std::vector<std::vector<int>> mappings_;
+  std::vector<std::vector<int>> inverse_mappings_;
+};
+
+using HashJoinProjectionMaps = SchemaProjectionMaps<HashJoinProjection>;
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/task_util.h b/pyarrow/include/arrow/acero/task_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbd4af699d12795bd92bd385f23a036d63adde38
--- /dev/null
+++ b/pyarrow/include/arrow/acero/task_util.h
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "arrow/acero/visibility.h"
+#include "arrow/status.h"
+#include "arrow/util/config.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace acero {
+
+// Atomic value surrounded by padding bytes to avoid cache line invalidation
+// whenever it is modified by a concurrent thread on a different CPU core.
+//
+template <typename T>
+class AtomicWithPadding {
+ private:
+  static constexpr int kCacheLineSize = 64;
+  uint8_t padding_before[kCacheLineSize];
+
+ public:
+  std::atomic<T> value;
+
+ private:
+  uint8_t padding_after[kCacheLineSize];
+};
+
+// Used for asynchronous execution of operations that can be broken into
+// a fixed number of symmetric tasks that can be executed concurrently.
+//
+// Implements priorities between multiple such operations, called task groups.
+//
+// Allows to specify the maximum number of in-flight tasks at any moment.
+//
+// Also allows for executing next pending tasks immediately using a caller thread.
+//
+class ARROW_ACERO_EXPORT TaskScheduler {
+ public:
+  using TaskImpl = std::function<Status(size_t, int64_t)>;
+  using TaskGroupContinuationImpl = std::function<Status(size_t)>;
+  using ScheduleImpl = std::function<Status(TaskGroupContinuationImpl)>;
+  using AbortContinuationImpl = std::function<void()>;
+
+  virtual ~TaskScheduler() = default;
+
+  // Order in which task groups are registered represents priorities of their tasks
+  // (the first group has the highest priority).
+  //
+  // Returns task group identifier that is used to request operations on the task group.
+  virtual int RegisterTaskGroup(TaskImpl task_impl,
+                                TaskGroupContinuationImpl cont_impl) = 0;
+
+  virtual void RegisterEnd() = 0;
+
+  // total_num_tasks may be zero, in which case task group continuation will be executed
+  // immediately
+  virtual Status StartTaskGroup(size_t thread_id, int group_id,
+                                int64_t total_num_tasks) = 0;
+
+  // Execute given number of tasks immediately using caller thread
+  virtual Status ExecuteMore(size_t thread_id, int num_tasks_to_execute,
+                             bool execute_all) = 0;
+
+  // Begin scheduling tasks using provided callback and
+  // the limit on the number of in-flight tasks at any moment.
+  //
+  // Scheduling will continue as long as there are waiting tasks.
+  //
+  // It will automatically resume whenever new task group gets started.
+  virtual Status StartScheduling(size_t thread_id, ScheduleImpl schedule_impl,
+                                 int num_concurrent_tasks, bool use_sync_execution) = 0;
+
+  // Abort scheduling and execution.
+  // Used in case of being notified about unrecoverable error for the entire query.
+  virtual void Abort(AbortContinuationImpl impl) = 0;
+
+  static std::unique_ptr<TaskScheduler> Make();
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/test_nodes.h b/pyarrow/include/arrow/acero/test_nodes.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e31aa31b34d7b423ab85ff2e77c1cec0087fa5b
--- /dev/null
+++ b/pyarrow/include/arrow/acero/test_nodes.h
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/acero/options.h"
+#include "arrow/acero/test_util_internal.h"
+#include "arrow/testing/random.h"
+
+namespace arrow {
+namespace acero {
+
+// \brief Make a delaying source that is optionally noisy (prints when it emits)
+AsyncGenerator<std::optional<ExecBatch>> MakeDelayedGen(
+    Iterator<std::optional<ExecBatch>> src, std::string label, double delay_sec,
+    bool noisy = false);
+
+// \brief Make a delaying source that is optionally noisy (prints when it emits)
+AsyncGenerator<std::optional<ExecBatch>> MakeDelayedGen(
+    AsyncGenerator<std::optional<ExecBatch>> src, std::string label, double delay_sec,
+    bool noisy = false);
+
+// \brief Make a delaying source that is optionally noisy (prints when it emits)
+AsyncGenerator<std::optional<ExecBatch>> MakeDelayedGen(BatchesWithSchema src,
+                                                        std::string label,
+                                                        double delay_sec,
+                                                        bool noisy = false);
+
+/// A node that slightly resequences the input at random
+struct JitterNodeOptions : public ExecNodeOptions {
+  random::SeedType seed;
+  /// The max amount to add to a node's "cost".
+  int max_jitter_modifier;
+
+  explicit JitterNodeOptions(random::SeedType seed, int max_jitter_modifier = 5)
+      : seed(seed), max_jitter_modifier(max_jitter_modifier) {}
+  static constexpr std::string_view kName = "jitter";
+};
+
+class GateImpl;
+
+class Gate {
+ public:
+  static std::shared_ptr<Gate> Make();
+
+  Gate();
+  virtual ~Gate();
+
+  void ReleaseAllBatches();
+  void ReleaseOneBatch();
+  Future<> WaitForNextReleasedBatch();
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Gate);
+
+  GateImpl* impl_;
+};
+
+// A node that holds all input batches until a given gate is released
+struct GatedNodeOptions : public ExecNodeOptions {
+  explicit GatedNodeOptions(Gate* gate) : gate(gate) {}
+  Gate* gate;
+
+  static constexpr std::string_view kName = "gated";
+};
+
+void RegisterTestNodes();
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/time_series_util.h b/pyarrow/include/arrow/acero/time_series_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..97707f43bf20b95387f463a9c07e37f54c33998c
--- /dev/null
+++ b/pyarrow/include/arrow/acero/time_series_util.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/record_batch.h"
+#include "arrow/type_traits.h"
+
+namespace arrow::acero {
+
+// normalize the value to unsigned 64-bits while preserving ordering of values
+template <typename T, enable_if_t<std::is_integral<T>::value, bool> = true>
+uint64_t NormalizeTime(T t);
+
+uint64_t GetTime(const RecordBatch* batch, Type::type time_type, int col, uint64_t row);
+
+}  // namespace arrow::acero
diff --git a/pyarrow/include/arrow/acero/tpch_node.h b/pyarrow/include/arrow/acero/tpch_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6476b57ad6b4108af56777c029d932f4af94726
--- /dev/null
+++ b/pyarrow/include/arrow/acero/tpch_node.h
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "arrow/acero/type_fwd.h"
+#include "arrow/acero/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace acero {
+namespace internal {
+
+class ARROW_ACERO_EXPORT TpchGen {
+ public:
+  virtual ~TpchGen() = default;
+
+  /*
+   * \brief Create a factory for nodes that generate TPC-H data
+   *
+   * Note: Individual tables will reference each other.  It is important that you only
+   * create a single TpchGen instance for each plan and then you can create nodes for each
+   * table from that single TpchGen instance. Note: Every batch will be scheduled as a new
+   * task using the ExecPlan's scheduler.
+   */
+  static Result<std::unique_ptr<TpchGen>> Make(
+      ExecPlan* plan, double scale_factor = 1.0, int64_t batch_size = 4096,
+      std::optional<int64_t> seed = std::nullopt);
+
+  // The below methods will create and add an ExecNode to the plan that generates
+  // data for the desired table. If columns is empty, all columns will be generated.
+  // The methods return the added ExecNode, which should be used for inputs.
+  virtual Result<ExecNode*> Supplier(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Part(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> PartSupp(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Customer(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Orders(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Lineitem(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Nation(std::vector<std::string> columns = {}) = 0;
+  virtual Result<ExecNode*> Region(std::vector<std::string> columns = {}) = 0;
+};
+
+}  // namespace internal
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/type_fwd.h b/pyarrow/include/arrow/acero/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0410de9f7830a7d0e55a04eb514ae9d82e6958c
--- /dev/null
+++ b/pyarrow/include/arrow/acero/type_fwd.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/compute/type_fwd.h"
+
+namespace arrow {
+
+namespace acero {
+
+class ExecNode;
+class ExecPlan;
+class ExecNodeOptions;
+class ExecFactoryRegistry;
+class QueryContext;
+struct QueryOptions;
+struct Declaration;
+class SinkNodeConsumer;
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/util.h b/pyarrow/include/arrow/acero/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee46e8527422abae4f97804058639593dd6b159c
--- /dev/null
+++ b/pyarrow/include/arrow/acero/util.h
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <optional>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/acero/options.h"
+#include "arrow/acero/type_fwd.h"
+#include "arrow/buffer.h"
+#include "arrow/compute/expression.h"
+#include "arrow/compute/util.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/type_fwd.h"
+
+namespace arrow {
+
+namespace acero {
+
+ARROW_ACERO_EXPORT
+Status ValidateExecNodeInputs(ExecPlan* plan, const std::vector<ExecNode*>& inputs,
+                              int expected_num_inputs, const char* kind_name);
+
+ARROW_ACERO_EXPORT
+Result<std::shared_ptr<Table>> TableFromExecBatches(
+    const std::shared_ptr<Schema>& schema, const std::vector<ExecBatch>& exec_batches);
+
+class ARROW_ACERO_EXPORT AtomicCounter {
+ public:
+  AtomicCounter() = default;
+
+  int count() const { return count_.load(); }
+
+  std::optional<int> total() const {
+    int total = total_.load();
+    if (total == -1) return {};
+    return total;
+  }
+
+  // return true if the counter is complete
+  bool Increment() {
+    ARROW_DCHECK_NE(count_.load(), total_.load());
+    int count = count_.fetch_add(1) + 1;
+    if (count != total_.load()) return false;
+    return DoneOnce();
+  }
+
+  // return true if the counter is complete
+  bool SetTotal(int total) {
+    total_.store(total);
+    if (count_.load() != total) return false;
+    return DoneOnce();
+  }
+
+  // return true if the counter has not already been completed
+  bool Cancel() { return DoneOnce(); }
+
+  // return true if the counter has finished or been cancelled
+  bool Completed() { return complete_.load(); }
+
+ private:
+  // ensure there is only one true return from Increment(), SetTotal(), or Cancel()
+  bool DoneOnce() {
+    bool expected = false;
+    return complete_.compare_exchange_strong(expected, true);
+  }
+
+  std::atomic<int> count_{0}, total_{-1};
+  std::atomic<bool> complete_{false};
+};
+
+class ARROW_ACERO_EXPORT ThreadIndexer {
+ public:
+  size_t operator()();
+
+  static size_t Capacity();
+
+ private:
+  static size_t Check(size_t thread_index);
+
+  arrow::util::Mutex mutex_;
+  std::unordered_map<std::thread::id, size_t> id_to_index_;
+};
+
+/// \brief A consumer that collects results into an in-memory table
+struct ARROW_ACERO_EXPORT TableSinkNodeConsumer : public SinkNodeConsumer {
+ public:
+  TableSinkNodeConsumer(std::shared_ptr<Table>* out, MemoryPool* pool)
+      : out_(out), pool_(pool) {}
+  Status Init(const std::shared_ptr<Schema>& schema,
+              BackpressureControl* backpressure_control, ExecPlan* plan) override;
+  Status Consume(ExecBatch batch) override;
+  Future<> Finish() override;
+
+ private:
+  std::shared_ptr<Table>* out_;
+  MemoryPool* pool_;
+  std::shared_ptr<Schema> schema_;
+  std::vector<std::shared_ptr<RecordBatch>> batches_;
+  arrow::util::Mutex consume_mutex_;
+};
+
+class ARROW_ACERO_EXPORT NullSinkNodeConsumer : public SinkNodeConsumer {
+ public:
+  Status Init(const std::shared_ptr<Schema>&, BackpressureControl*,
+              ExecPlan* plan) override {
+    return Status::OK();
+  }
+  Status Consume(ExecBatch exec_batch) override { return Status::OK(); }
+  Future<> Finish() override { return Status::OK(); }
+
+ public:
+  static std::shared_ptr<NullSinkNodeConsumer> Make() {
+    return std::make_shared<NullSinkNodeConsumer>();
+  }
+};
+
+/// CRTP helper for tracing helper functions
+
+class ARROW_ACERO_EXPORT TracedNode {
+ public:
+  // All nodes should call TraceStartProducing or NoteStartProducing exactly once
+  // Most nodes will be fine with a call to NoteStartProducing since the StartProducing
+  // call is usually fairly cheap and simply schedules tasks to fetch the actual data.
+
+  explicit TracedNode(ExecNode* node) : node_(node) {}
+
+  // Create a span to record the StartProducing work
+  [[nodiscard]] ::arrow::internal::tracing::Scope TraceStartProducing(
+      std::string extra_details) const;
+
+  // Record a call to StartProducing without creating with a span
+  void NoteStartProducing(std::string extra_details) const;
+
+  // All nodes should call TraceInputReceived for each batch they receive.  This call
+  // should track the time spent processing the batch.  NoteInputReceived is available
+  // but usually won't be used unless a node is simply adding batches to a trivial queue.
+
+  // Create a span to record the InputReceived work
+  [[nodiscard]] ::arrow::internal::tracing::Scope TraceInputReceived(
+      const ExecBatch& batch) const;
+
+  // Record a call to InputReceived without creating with a span
+  void NoteInputReceived(const ExecBatch& batch) const;
+
+  // Create a span to record any "finish" work.  This should NOT be called as part of
+  // InputFinished and many nodes may not need to call this at all.  This should be used
+  // when a node has some extra work that has to be done once it has received all of its
+  // data.  For example, an aggregation node calculating aggregations.  This will
+  // typically be called as a result of InputFinished OR InputReceived.
+  [[nodiscard]] ::arrow::internal::tracing::Scope TraceFinish() const;
+
+ private:
+  ExecNode* node_;
+};
+
+}  // namespace acero
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/acero/visibility.h b/pyarrow/include/arrow/acero/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..21a697a56eca962602b34b2766d74442d185c3d7
--- /dev/null
+++ b/pyarrow/include/arrow/acero/visibility.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_ACERO_STATIC
+#    define ARROW_ACERO_EXPORT
+#  elif defined(ARROW_ACERO_EXPORTING)
+#    define ARROW_ACERO_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_ACERO_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_ACERO_NO_EXPORT
+#else  // Not Windows
+#  ifndef ARROW_ACERO_EXPORT
+#    define ARROW_ACERO_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_ACERO_NO_EXPORT
+#    define ARROW_ACERO_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif  // Not-Windows
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/arrow/adapters/orc/adapter.h b/pyarrow/include/arrow/adapters/orc/adapter.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ffff81f355f1ddcdc19516746c61b8021477de4
--- /dev/null
+++ b/pyarrow/include/arrow/adapters/orc/adapter.h
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/adapters/orc/options.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/memory_pool.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace adapters {
+namespace orc {
+
+/// \brief Information about an ORC stripe
+struct StripeInformation {
+  /// \brief Offset of the stripe from the start of the file, in bytes
+  int64_t offset;
+  /// \brief Length of the stripe, in bytes
+  int64_t length;
+  /// \brief Number of rows in the stripe
+  int64_t num_rows;
+  /// \brief Index of the first row of the stripe
+  int64_t first_row_id;
+};
+
+/// \class ORCFileReader
+/// \brief Read an Arrow Table or RecordBatch from an ORC file.
+class ARROW_EXPORT ORCFileReader {
+ public:
+  ~ORCFileReader();
+
+  /// \brief Creates a new ORC reader
+  ///
+  /// \param[in] file the data source
+  /// \param[in] pool a MemoryPool to use for buffer allocations
+  /// \return the returned reader object
+  static Result<std::unique_ptr<ORCFileReader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& file, MemoryPool* pool);
+
+  /// \brief Return the schema read from the ORC file
+  ///
+  /// \return the returned Schema object
+  Result<std::shared_ptr<Schema>> ReadSchema();
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read();
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::vector<int>& include_indices);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] include_names the selected field names to read
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::vector<std::string>& include_names);
+
+  /// \brief Read the file as a Table
+  ///
+  /// The table will be composed of one record batch per stripe.
+  ///
+  /// \param[in] schema the Table schema
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned Table
+  Result<std::shared_ptr<Table>> Read(const std::shared_ptr<Schema>& schema,
+                                      const std::vector<int>& include_indices);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \return the returned RecordBatch
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(int64_t stripe);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the returned RecordBatch
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(
+      int64_t stripe, const std::vector<int>& include_indices);
+
+  /// \brief Read a single stripe as a RecordBatch
+  ///
+  /// \param[in] stripe the stripe index
+  /// \param[in] include_names the selected field names to read
+  /// \return the returned RecordBatch
+  Result<std::shared_ptr<RecordBatch>> ReadStripe(
+      int64_t stripe, const std::vector<std::string>& include_names);
+
+  /// \brief Seek to designated row. Invoke NextStripeReader() after seek
+  ///        will return stripe reader starting from designated row.
+  ///
+  /// \param[in] row_number the rows number to seek
+  Status Seek(int64_t row_number);
+
+  /// \brief Get a stripe level record batch iterator.
+  ///
+  /// Each record batch will have up to `batch_size` rows.
+  /// NextStripeReader serves as a fine-grained alternative to ReadStripe
+  /// which may cause OOM issues by loading the whole stripe into memory.
+  ///
+  /// Note this will only read rows for the current stripe, not the entire
+  /// file.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each record batch
+  /// \return the returned stripe reader
+  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(int64_t batch_size);
+
+  /// \brief Get a stripe level record batch iterator.
+  ///
+  /// Each record batch will have up to `batch_size` rows.
+  /// NextStripeReader serves as a fine-grained alternative to ReadStripe
+  /// which may cause OOM issues by loading the whole stripe into memory.
+  ///
+  /// Note this will only read rows for the current stripe, not the entire
+  /// file.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each record batch
+  /// \param[in] include_indices the selected field indices to read
+  /// \return the stripe reader
+  Result<std::shared_ptr<RecordBatchReader>> NextStripeReader(
+      int64_t batch_size, const std::vector<int>& include_indices);
+
+  /// \brief Get a record batch iterator for the entire file.
+  ///
+  /// Each record batch will have up to `batch_size` rows.
+  ///
+  /// \param[in] batch_size the maximum number of rows in each record batch
+  /// \param[in] include_names the selected field names to read, if not empty
+  /// (otherwise all fields are read)
+  /// \return the record batch iterator
+  Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader(
+      int64_t batch_size, const std::vector<std::string>& include_names);
+
+  /// \brief The number of stripes in the file
+  int64_t NumberOfStripes();
+
+  /// \brief The number of rows in the file
+  int64_t NumberOfRows();
+
+  /// \brief StripeInformation for each stripe.
+  StripeInformation GetStripeInformation(int64_t stripe);
+
+  /// \brief Get the format version of the file.
+  ///         Currently known values are 0.11 and 0.12.
+  ///
+  /// \return The FileVersion of the ORC file.
+  FileVersion GetFileVersion();
+
+  /// \brief Get the software instance and version that wrote this file.
+  ///
+  /// \return a user-facing string that specifies the software version
+  std::string GetSoftwareVersion();
+
+  /// \brief Get the compression kind of the file.
+  ///
+  /// \return The kind of compression in the ORC file.
+  Result<Compression::type> GetCompression();
+
+  /// \brief Get the buffer size for the compression.
+  ///
+  /// \return Number of bytes to buffer for the compression codec.
+  int64_t GetCompressionSize();
+
+  /// \brief Get the number of rows per an entry in the row index.
+  /// \return the number of rows per an entry in the row index or 0 if there
+  ///          is no row index.
+  int64_t GetRowIndexStride();
+
+  /// \brief Get ID of writer that generated the file.
+  ///
+  /// \return UNKNOWN_WRITER if the writer ID is undefined
+  WriterId GetWriterId();
+
+  /// \brief Get the writer id value when getWriterId() returns an unknown writer.
+  ///
+  /// \return the integer value of the writer ID.
+  int32_t GetWriterIdValue();
+
+  /// \brief Get the version of the writer.
+  ///
+  /// \return the version of the writer.
+
+  WriterVersion GetWriterVersion();
+
+  /// \brief Get the number of stripe statistics in the file.
+  ///
+  /// \return the number of stripe statistics
+  int64_t GetNumberOfStripeStatistics();
+
+  /// \brief Get the length of the data stripes in the file.
+  ///
+  /// \return return the number of bytes in stripes
+  int64_t GetContentLength();
+
+  /// \brief Get the length of the file stripe statistics.
+  ///
+  /// \return the number of compressed bytes in the file stripe statistics
+  int64_t GetStripeStatisticsLength();
+
+  /// \brief Get the length of the file footer.
+  ///
+  /// \return the number of compressed bytes in the file footer
+  int64_t GetFileFooterLength();
+
+  /// \brief Get the length of the file postscript.
+  ///
+  /// \return the number of bytes in the file postscript
+  int64_t GetFilePostscriptLength();
+
+  /// \brief Get the total length of the file.
+  ///
+  /// \return the number of bytes in the file
+  int64_t GetFileLength();
+
+  /// \brief Get the serialized file tail.
+  ///         Useful if another reader of the same file wants to avoid re-reading
+  ///         the file tail. See ReadOptions.SetSerializedFileTail().
+  ///
+  /// \return a string of bytes with the file tail
+  std::string GetSerializedFileTail();
+
+  /// \brief Return the metadata read from the ORC file
+  ///
+  /// \return A KeyValueMetadata object containing the ORC metadata
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+  ORCFileReader();
+};
+
+/// \class ORCFileWriter
+/// \brief Write an Arrow Table or RecordBatch to an ORC file.
+class ARROW_EXPORT ORCFileWriter {
+ public:
+  ~ORCFileWriter();
+  /// \brief Creates a new ORC writer.
+  ///
+  /// \param[in] output_stream a pointer to the io::OutputStream to write into
+  /// \param[in] write_options the ORC writer options for Arrow
+  /// \return the returned writer object
+  static Result<std::unique_ptr<ORCFileWriter>> Open(
+      io::OutputStream* output_stream,
+      const WriteOptions& write_options = WriteOptions());
+
+  /// \brief Write a table. This can be called multiple times.
+  ///
+  /// Tables passed in subsequent calls must match the schema of the table that was
+  /// written first.
+  ///
+  /// \param[in] table the Arrow table from which data is extracted.
+  /// \return Status
+  Status Write(const Table& table);
+
+  /// \brief Write a RecordBatch. This can be called multiple times.
+  ///
+  /// RecordBatches passed in subsequent calls must match the schema of the
+  /// RecordBatch that was written first.
+  ///
+  /// \param[in] record_batch the Arrow RecordBatch from which data is extracted.
+  /// \return Status
+  Status Write(const RecordBatch& record_batch);
+
+  /// \brief Close an ORC writer (orc::Writer)
+  ///
+  /// \return Status
+  Status Close();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+
+ private:
+  ORCFileWriter();
+};
+
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/adapters/orc/options.h b/pyarrow/include/arrow/adapters/orc/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a300da678db98c24949203be7ab471a57502640
--- /dev/null
+++ b/pyarrow/include/arrow/adapters/orc/options.h
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace adapters {
+
+namespace orc {
+
+enum class WriterId : int32_t {
+  kOrcJava = 0,
+  kOrcCpp = 1,
+  kPresto = 2,
+  kScritchleyGo = 3,
+  kTrino = 4,
+  kUnknown = INT32_MAX
+};
+
+enum class WriterVersion : int32_t {
+  kOriginal = 0,
+  kHive8732 = 1,
+  kHive4243 = 2,
+  kHive12055 = 3,
+  kHive13083 = 4,
+  kOrc101 = 5,
+  kOrc135 = 6,
+  kOrc517 = 7,
+  kOrc203 = 8,
+  kOrc14 = 9,
+  kMax = INT32_MAX
+};
+
+enum class CompressionStrategy : int32_t { kSpeed = 0, kCompression };
+
+class ARROW_EXPORT FileVersion {
+ private:
+  int32_t major_version_;
+  int32_t minor_version_;
+
+ public:
+  static const FileVersion& v_0_11();
+  static const FileVersion& v_0_12();
+
+  FileVersion(int32_t major, int32_t minor)
+      : major_version_(major), minor_version_(minor) {}
+
+  /**
+   * Get major version
+   */
+  int32_t major_version() const { return this->major_version_; }
+
+  /**
+   * Get minor version
+   */
+  int32_t minor_version() const { return this->minor_version_; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->major_version() == right.major_version() &&
+           this->minor_version() == right.minor_version();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  std::string ToString() const;
+};
+
+/// Options for the ORC Writer
+struct ARROW_EXPORT WriteOptions {
+  /// Number of rows the ORC writer writes at a time, default 1024
+  int64_t batch_size = 1024;
+  /// Which ORC file version to use, default FileVersion(0, 12)
+  FileVersion file_version = FileVersion(0, 12);
+  /// Size of each ORC stripe in bytes, default 64 MiB
+  int64_t stripe_size = 64 * 1024 * 1024;
+  /// The compression codec of the ORC file, there is no compression by default
+  Compression::type compression = Compression::UNCOMPRESSED;
+  /// The size of each compression block in bytes, default 64 KiB
+  int64_t compression_block_size = 64 * 1024;
+  /// The compression strategy i.e. speed vs size reduction, default
+  /// CompressionStrategy::kSpeed
+  CompressionStrategy compression_strategy = CompressionStrategy::kSpeed;
+  /// The number of rows per an entry in the row index, default 10000
+  int64_t row_index_stride = 10000;
+  /// The padding tolerance, default 0.0
+  double padding_tolerance = 0.0;
+  /// The dictionary key size threshold. 0 to disable dictionary encoding.
+  /// 1 to always enable dictionary encoding, default 0.0
+  double dictionary_key_size_threshold = 0.0;
+  /// The array of columns that use the bloom filter, default empty
+  std::vector<int64_t> bloom_filter_columns;
+  /// The upper limit of the false-positive rate of the bloom filter, default 0.05
+  double bloom_filter_fpp = 0.05;
+};
+
+}  // namespace orc
+}  // namespace adapters
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/adapters/tensorflow/convert.h b/pyarrow/include/arrow/adapters/tensorflow/convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d093eddf6b598150ddb55da0e84699a5b7ef4b8
--- /dev/null
+++ b/pyarrow/include/arrow/adapters/tensorflow/convert.h
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "tensorflow/core/framework/op.h"
+
+#include "arrow/type.h"
+
+// These utilities are supposed to be included in TensorFlow operators
+// that need to be compiled separately from Arrow because of ABI issues.
+// They therefore need to be header-only.
+
+namespace arrow {
+
+namespace adapters {
+
+namespace tensorflow {
+
+Status GetArrowType(::tensorflow::DataType dtype, std::shared_ptr<DataType>* out) {
+  switch (dtype) {
+    case ::tensorflow::DT_BOOL:
+      *out = arrow::boolean();
+      break;
+    case ::tensorflow::DT_FLOAT:
+      *out = arrow::float32();
+      break;
+    case ::tensorflow::DT_DOUBLE:
+      *out = arrow::float64();
+      break;
+    case ::tensorflow::DT_HALF:
+      *out = arrow::float16();
+      break;
+    case ::tensorflow::DT_INT8:
+      *out = arrow::int8();
+      break;
+    case ::tensorflow::DT_INT16:
+      *out = arrow::int16();
+      break;
+    case ::tensorflow::DT_INT32:
+      *out = arrow::int32();
+      break;
+    case ::tensorflow::DT_INT64:
+      *out = arrow::int64();
+      break;
+    case ::tensorflow::DT_UINT8:
+      *out = arrow::uint8();
+      break;
+    case ::tensorflow::DT_UINT16:
+      *out = arrow::uint16();
+      break;
+    case ::tensorflow::DT_UINT32:
+      *out = arrow::uint32();
+      break;
+    case ::tensorflow::DT_UINT64:
+      *out = arrow::uint64();
+      break;
+    default:
+      return Status::TypeError("TensorFlow data type is not supported");
+  }
+  return Status::OK();
+}
+
+Status GetTensorFlowType(std::shared_ptr<DataType> dtype, ::tensorflow::DataType* out) {
+  switch (dtype->id()) {
+    case Type::BOOL:
+      *out = ::tensorflow::DT_BOOL;
+      break;
+    case Type::UINT8:
+      *out = ::tensorflow::DT_UINT8;
+      break;
+    case Type::INT8:
+      *out = ::tensorflow::DT_INT8;
+      break;
+    case Type::UINT16:
+      *out = ::tensorflow::DT_UINT16;
+      break;
+    case Type::INT16:
+      *out = ::tensorflow::DT_INT16;
+      break;
+    case Type::UINT32:
+      *out = ::tensorflow::DT_UINT32;
+      break;
+    case Type::INT32:
+      *out = ::tensorflow::DT_INT32;
+      break;
+    case Type::UINT64:
+      *out = ::tensorflow::DT_UINT64;
+      break;
+    case Type::INT64:
+      *out = ::tensorflow::DT_INT64;
+      break;
+    case Type::HALF_FLOAT:
+      *out = ::tensorflow::DT_HALF;
+      break;
+    case Type::FLOAT:
+      *out = ::tensorflow::DT_FLOAT;
+      break;
+    case Type::DOUBLE:
+      *out = ::tensorflow::DT_DOUBLE;
+      break;
+    default:
+      return Status::TypeError("Arrow data type is not supported");
+  }
+  return arrow::Status::OK();
+}
+
+}  // namespace tensorflow
+
+}  // namespace adapters
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/api.h b/pyarrow/include/arrow/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac568a00eedc32984758f4675b58ac626c9c947a
--- /dev/null
+++ b/pyarrow/include/arrow/api.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Coarse public API while the library is in development
+
+#pragma once
+
+#include "arrow/array.h"                    // IWYU pragma: export
+#include "arrow/array/array_run_end.h"      // IWYU pragma: export
+#include "arrow/array/concatenate.h"        // IWYU pragma: export
+#include "arrow/buffer.h"                   // IWYU pragma: export
+#include "arrow/builder.h"                  // IWYU pragma: export
+#include "arrow/chunked_array.h"            // IWYU pragma: export
+#include "arrow/compare.h"                  // IWYU pragma: export
+#include "arrow/config.h"                   // IWYU pragma: export
+#include "arrow/datum.h"                    // IWYU pragma: export
+#include "arrow/extension_type.h"           // IWYU pragma: export
+#include "arrow/memory_pool.h"              // IWYU pragma: export
+#include "arrow/pretty_print.h"             // IWYU pragma: export
+#include "arrow/record_batch.h"             // IWYU pragma: export
+#include "arrow/result.h"                   // IWYU pragma: export
+#include "arrow/status.h"                   // IWYU pragma: export
+#include "arrow/table.h"                    // IWYU pragma: export
+#include "arrow/table_builder.h"            // IWYU pragma: export
+#include "arrow/tensor.h"                   // IWYU pragma: export
+#include "arrow/type.h"                     // IWYU pragma: export
+#include "arrow/util/key_value_metadata.h"  // IWYU pragma: export
+#include "arrow/visit_array_inline.h"       // IWYU pragma: export
+#include "arrow/visit_scalar_inline.h"      // IWYU pragma: export
+#include "arrow/visitor.h"                  // IWYU pragma: export
+
+/// \brief Top-level namespace for Apache Arrow C++ API
+namespace arrow {}
diff --git a/pyarrow/include/arrow/array.h b/pyarrow/include/arrow/array.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d72ea9506a414fd6e50d5c7d0af437084045e05
--- /dev/null
+++ b/pyarrow/include/arrow/array.h
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Kitchen-sink public API for arrow::Array data structures. C++ library code
+// (especially header files) in Apache Arrow should use more specific headers
+// unless it's a file that uses most or all Array types in which case using
+// arrow/array.h is fine.
+
+#pragma once
+
+/// \defgroup numeric-arrays Concrete classes for numeric arrays
+/// @{
+/// @}
+
+/// \defgroup binary-arrays Concrete classes for binary/string arrays
+/// @{
+/// @}
+
+/// \defgroup nested-arrays Concrete classes for nested arrays
+/// @{
+/// @}
+
+/// \defgroup run-end-encoded-arrays Concrete classes for run-end encoded arrays
+/// @{
+/// @}
+
+#include "arrow/array/array_base.h"       // IWYU pragma: keep
+#include "arrow/array/array_binary.h"     // IWYU pragma: keep
+#include "arrow/array/array_decimal.h"    // IWYU pragma: keep
+#include "arrow/array/array_dict.h"       // IWYU pragma: keep
+#include "arrow/array/array_nested.h"     // IWYU pragma: keep
+#include "arrow/array/array_primitive.h"  // IWYU pragma: keep
+#include "arrow/array/array_run_end.h"    // IWYU pragma: keep
+#include "arrow/array/data.h"             // IWYU pragma: keep
+#include "arrow/array/util.h"             // IWYU pragma: keep
diff --git a/pyarrow/include/arrow/array/array_base.h b/pyarrow/include/arrow/array/array_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..60df45357e5d2fd8bc31cfa714aa4d9d89288508
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_base.h
@@ -0,0 +1,323 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+#include "arrow/visitor.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// User array accessor types
+
+/// \brief Array base type
+/// Immutable data array with some logical type and some length.
+///
+/// Any memory is owned by the respective Buffer instance (or its parents).
+///
+/// The base class is only required to have a null bitmap buffer if the null
+/// count is greater than 0
+///
+/// If known, the null count can be provided in the base Array constructor. If
+/// the null count is not known, pass -1 to indicate that the null count is to
+/// be computed on the first call to null_count()
+class ARROW_EXPORT Array {
+ public:
+  virtual ~Array() = default;
+
+  /// \brief Return true if value at index is null. Does not boundscheck
+  bool IsNull(int64_t i) const { return !IsValid(i); }
+
+  /// \brief Return true if value at index is valid (not null). Does not
+  /// boundscheck
+  bool IsValid(int64_t i) const {
+    if (null_bitmap_data_ != NULLPTR) {
+      return bit_util::GetBit(null_bitmap_data_, i + data_->offset);
+    }
+    // Dispatching with a few conditionals like this makes IsNull more
+    // efficient for how it is used in practice. Making IsNull virtual
+    // would add a vtable lookup to every call and prevent inlining +
+    // a potential inner-branch removal.
+    if (type_id() == Type::SPARSE_UNION) {
+      return !internal::IsNullSparseUnion(*data_, i);
+    }
+    if (type_id() == Type::DENSE_UNION) {
+      return !internal::IsNullDenseUnion(*data_, i);
+    }
+    if (type_id() == Type::RUN_END_ENCODED) {
+      return !internal::IsNullRunEndEncoded(*data_, i);
+    }
+    return data_->null_count != data_->length;
+  }
+
+  /// \brief Return a Scalar containing the value of this array at i
+  Result<std::shared_ptr<Scalar>> GetScalar(int64_t i) const;
+
+  /// Size in the number of elements this array contains.
+  int64_t length() const { return data_->length; }
+
+  /// A relative position into another array's data, to enable zero-copy
+  /// slicing. This value defaults to zero
+  int64_t offset() const { return data_->offset; }
+
+  /// The number of null entries in the array. If the null count was not known
+  /// at time of construction (and set to a negative value), then the null
+  /// count will be computed and cached on the first invocation of this
+  /// function
+  int64_t null_count() const;
+
+  /// \brief Computes the logical null count for arrays of all types including
+  /// those that do not have a validity bitmap like union and run-end encoded
+  /// arrays
+  ///
+  /// If the array has a validity bitmap, this function behaves the same as
+  /// null_count(). For types that have no validity bitmap, this function will
+  /// recompute the null count every time it is called.
+  ///
+  /// \see GetNullCount
+  int64_t ComputeLogicalNullCount() const;
+
+  const std::shared_ptr<DataType>& type() const { return data_->type; }
+  Type::type type_id() const { return data_->type->id(); }
+
+  /// Buffer for the validity (null) bitmap, if any. Note that Union types
+  /// never have a null bitmap.
+  ///
+  /// Note that for `null_count == 0` or for null type, this will be null.
+  /// This buffer does not account for any slice offset
+  const std::shared_ptr<Buffer>& null_bitmap() const { return data_->buffers[0]; }
+
+  /// Raw pointer to the null bitmap.
+  ///
+  /// Note that for `null_count == 0` or for null type, this will be null.
+  /// This buffer does not account for any slice offset
+  const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
+
+  /// Equality comparison with another array
+  ///
+  /// Note that arrow::ArrayStatistics is not included in the comparison.
+  bool Equals(const Array& arr, const EqualOptions& = EqualOptions::Defaults()) const;
+  bool Equals(const std::shared_ptr<Array>& arr,
+              const EqualOptions& = EqualOptions::Defaults()) const;
+
+  /// \brief Return the formatted unified diff of arrow::Diff between this
+  /// Array and another Array
+  std::string Diff(const Array& other) const;
+
+  /// Approximate equality comparison with another array
+  ///
+  /// epsilon is only used if this is FloatArray or DoubleArray
+  ///
+  /// Note that arrow::ArrayStatistics is not included in the comparison.
+  bool ApproxEquals(const std::shared_ptr<Array>& arr,
+                    const EqualOptions& = EqualOptions::Defaults()) const;
+  bool ApproxEquals(const Array& arr,
+                    const EqualOptions& = EqualOptions::Defaults()) const;
+
+  /// Compare if the range of slots specified are equal for the given array and
+  /// this array.  end_idx exclusive.  This methods does not bounds check.
+  ///
+  /// Note that arrow::ArrayStatistics is not included in the comparison.
+  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
+                   const Array& other,
+                   const EqualOptions& = EqualOptions::Defaults()) const;
+  bool RangeEquals(int64_t start_idx, int64_t end_idx, int64_t other_start_idx,
+                   const std::shared_ptr<Array>& other,
+                   const EqualOptions& = EqualOptions::Defaults()) const;
+  bool RangeEquals(const Array& other, int64_t start_idx, int64_t end_idx,
+                   int64_t other_start_idx,
+                   const EqualOptions& = EqualOptions::Defaults()) const;
+  bool RangeEquals(const std::shared_ptr<Array>& other, int64_t start_idx,
+                   int64_t end_idx, int64_t other_start_idx,
+                   const EqualOptions& = EqualOptions::Defaults()) const;
+
+  /// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
+  Status Accept(ArrayVisitor* visitor) const;
+
+  /// Construct a zero-copy view of this array with the given type.
+  ///
+  /// This method checks if the types are layout-compatible.
+  /// Nested types are traversed in depth-first order. Data buffers must have
+  /// the same item sizes, even though the logical types may be different.
+  /// An error is returned if the types are not layout-compatible.
+  Result<std::shared_ptr<Array>> View(const std::shared_ptr<DataType>& type) const;
+
+  /// \brief Construct a copy of the array with all buffers on destination
+  /// Memory Manager
+  ///
+  /// This method recursively copies the array's buffers and those of its children
+  /// onto the destination MemoryManager device and returns the new Array.
+  Result<std::shared_ptr<Array>> CopyTo(const std::shared_ptr<MemoryManager>& to) const;
+
+  /// \brief Construct a new array attempting to zero-copy view if possible.
+  ///
+  /// Like CopyTo this method recursively goes through all of the array's buffers
+  /// and those of it's children and first attempts to create zero-copy
+  /// views on the destination MemoryManager device. If it can't, it falls back
+  /// to performing a copy. See Buffer::ViewOrCopy.
+  Result<std::shared_ptr<Array>> ViewOrCopyTo(
+      const std::shared_ptr<MemoryManager>& to) const;
+
+  /// Construct a zero-copy slice of the array with the indicated offset and
+  /// length
+  ///
+  /// \param[in] offset the position of the first element in the constructed
+  /// slice
+  /// \param[in] length the length of the slice. If there are not enough
+  /// elements in the array, the length will be adjusted accordingly
+  ///
+  /// \return a new object wrapped in std::shared_ptr<Array>
+  std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const;
+
+  /// Slice from offset until end of the array
+  std::shared_ptr<Array> Slice(int64_t offset) const;
+
+  /// Input-checking variant of Array::Slice
+  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset, int64_t length) const;
+  /// Input-checking variant of Array::Slice
+  Result<std::shared_ptr<Array>> SliceSafe(int64_t offset) const;
+
+  const std::shared_ptr<ArrayData>& data() const { return data_; }
+
+  int num_fields() const { return static_cast<int>(data_->child_data.size()); }
+
+  /// \return PrettyPrint representation of array suitable for debugging
+  std::string ToString() const;
+
+  /// \brief Perform cheap validation checks to determine obvious inconsistencies
+  /// within the array's internal data.
+  ///
+  /// This is O(k) where k is the number of descendents.
+  ///
+  /// \return Status
+  Status Validate() const;
+
+  /// \brief Perform extensive validation checks to determine inconsistencies
+  /// within the array's internal data.
+  ///
+  /// This is potentially O(k*n) where k is the number of descendents and n
+  /// is the array length.
+  ///
+  /// \return Status
+  Status ValidateFull() const;
+
+  /// \brief Return the device_type that this array's data is allocated on
+  ///
+  /// This just delegates to calling device_type on the underlying ArrayData
+  /// object which backs this Array.
+  ///
+  /// \return DeviceAllocationType
+  DeviceAllocationType device_type() const { return data_->device_type(); }
+
+  /// \brief Return the statistics of this Array
+  ///
+  /// This just delegates to calling statistics on the underlying ArrayData
+  /// object which backs this Array.
+  ///
+  /// \return const std::shared_ptr<ArrayStatistics>&
+  const std::shared_ptr<ArrayStatistics>& statistics() const { return data_->statistics; }
+
+ protected:
+  Array() = default;
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
+
+  std::shared_ptr<ArrayData> data_;
+  const uint8_t* null_bitmap_data_ = NULLPTR;
+
+  /// Protected method for constructors
+  void SetData(const std::shared_ptr<ArrayData>& data) {
+    if (data->buffers.size() > 0) {
+      null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
+    } else {
+      null_bitmap_data_ = NULLPTR;
+    }
+    data_ = data;
+  }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
+};
+
+ARROW_EXPORT void PrintTo(const Array& x, std::ostream* os);
+
+static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
+  os << x.ToString();
+  return os;
+}
+
+/// Base class for non-nested arrays
+class ARROW_EXPORT FlatArray : public Array {
+ protected:
+  using Array::Array;
+};
+
+/// Base class for arrays of fixed-size logical types
+class ARROW_EXPORT PrimitiveArray : public FlatArray {
+ public:
+  /// Does not account for any slice offset
+  const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }
+
+ protected:
+  PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
+                 const std::shared_ptr<Buffer>& data,
+                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  PrimitiveArray() : raw_values_(NULLPTR) {}
+
+  void SetData(const std::shared_ptr<ArrayData>& data) {
+    this->Array::SetData(data);
+    raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
+  }
+
+  explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+  const uint8_t* raw_values_;
+};
+
+/// Degenerate null type Array
+class ARROW_EXPORT NullArray : public FlatArray {
+ public:
+  using TypeClass = NullType;
+
+  explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+  explicit NullArray(int64_t length);
+
+ private:
+  void SetData(const std::shared_ptr<ArrayData>& data) {
+    null_bitmap_data_ = NULLPTR;
+    data->null_count = data->length;
+    data_ = data;
+  }
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/array_binary.h b/pyarrow/include/arrow/array/array_binary.h
new file mode 100644
index 0000000000000000000000000000000000000000..63903eac46d413c24ccaeb048273e8f5e6c8d3c6
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_binary.h
@@ -0,0 +1,321 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes for Binary, LargeBinary, String, LargeString,
+// FixedSizeBinary
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/stl_iterator.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup binary-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// Binary and String
+
+/// Base class for variable-sized binary arrays, regardless of offset size
+/// and logical interpretation.
+template <typename TYPE>
+class BaseBinaryArray : public FlatArray {
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename TypeClass::offset_type;
+  using IteratorType = stl::ArrayIterator<BaseBinaryArray<TYPE>>;
+
+  /// Return the pointer to the given elements bytes
+  // XXX should GetValue(int64_t i) return a string_view?
+  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
+    const offset_type pos = raw_value_offsets_[i];
+    *out_length = raw_value_offsets_[i + 1] - pos;
+    return raw_data_ + pos;
+  }
+
+  /// \brief Get binary value as a string_view
+  ///
+  /// \param i the value index
+  /// \return the view over the selected value
+  std::string_view GetView(int64_t i) const {
+    const offset_type pos = raw_value_offsets_[i];
+    return std::string_view(reinterpret_cast<const char*>(raw_data_ + pos),
+                            raw_value_offsets_[i + 1] - pos);
+  }
+
+  std::optional<std::string_view> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  /// \brief Get binary value as a string_view
+  /// Provided for consistency with other arrays.
+  ///
+  /// \param i the value index
+  /// \return the view over the selected value
+  std::string_view Value(int64_t i) const { return GetView(i); }
+
+  /// \brief Get binary value as a std::string
+  ///
+  /// \param i the value index
+  /// \return the value copied into a std::string
+  std::string GetString(int64_t i) const { return std::string(GetView(i)); }
+
+  /// Note that this buffer does not account for any slice offset
+  std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
+
+  /// Note that this buffer does not account for any slice offset
+  std::shared_ptr<Buffer> value_data() const { return data_->buffers[2]; }
+
+  const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
+
+  const uint8_t* raw_data() const { return raw_data_; }
+
+  /// \brief Return the data buffer absolute offset of the data for the value
+  /// at the passed index.
+  ///
+  /// Does not perform boundschecking
+  offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
+
+  /// \brief Return the length of the data for the value at the passed index.
+  ///
+  /// Does not perform boundschecking
+  offset_type value_length(int64_t i) const {
+    return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
+  }
+
+  /// \brief Return the total length of the memory in the data buffer
+  /// referenced by this array. If the array has been sliced then this may be
+  /// less than the size of the data buffer (data_->buffers[2]).
+  offset_type total_values_length() const {
+    if (data_->length > 0) {
+      return raw_value_offsets_[data_->length] - raw_value_offsets_[0];
+    } else {
+      return 0;
+    }
+  }
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  // For subclasses
+  BaseBinaryArray() = default;
+
+  // Protected method for constructors
+  void SetData(const std::shared_ptr<ArrayData>& data) {
+    this->Array::SetData(data);
+    raw_value_offsets_ = data->GetValuesSafe<offset_type>(1);
+    raw_data_ = data->GetValuesSafe<uint8_t>(2, /*offset=*/0);
+  }
+
+  const offset_type* raw_value_offsets_ = NULLPTR;
+  const uint8_t* raw_data_ = NULLPTR;
+};
+
+/// Concrete Array class for variable-size binary data
+class ARROW_EXPORT BinaryArray : public BaseBinaryArray<BinaryType> {
+ public:
+  explicit BinaryArray(const std::shared_ptr<ArrayData>& data);
+
+  BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+              const std::shared_ptr<Buffer>& data,
+              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+              int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ protected:
+  // For subclasses such as StringArray
+  BinaryArray() : BaseBinaryArray() {}
+};
+
+/// Concrete Array class for variable-size string (utf-8) data
+class ARROW_EXPORT StringArray : public BinaryArray {
+ public:
+  using TypeClass = StringType;
+
+  explicit StringArray(const std::shared_ptr<ArrayData>& data);
+
+  StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+              const std::shared_ptr<Buffer>& data,
+              const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+              int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Validate that this array contains only valid UTF8 entries
+  ///
+  /// This check is also implied by ValidateFull()
+  Status ValidateUTF8() const;
+};
+
+/// Concrete Array class for large variable-size binary data
+class ARROW_EXPORT LargeBinaryArray : public BaseBinaryArray<LargeBinaryType> {
+ public:
+  explicit LargeBinaryArray(const std::shared_ptr<ArrayData>& data);
+
+  LargeBinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+                   const std::shared_ptr<Buffer>& data,
+                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ protected:
+  // For subclasses such as LargeStringArray
+  LargeBinaryArray() : BaseBinaryArray() {}
+};
+
+/// Concrete Array class for large variable-size string (utf-8) data
+class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
+ public:
+  using TypeClass = LargeStringType;
+
+  explicit LargeStringArray(const std::shared_ptr<ArrayData>& data);
+
+  LargeStringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
+                   const std::shared_ptr<Buffer>& data,
+                   const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                   int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Validate that this array contains only valid UTF8 entries
+  ///
+  /// This check is also implied by ValidateFull()
+  Status ValidateUTF8() const;
+};
+
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// BinaryViewType::c_type struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public FlatArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+  using c_type = BinaryViewType::c_type;
+
+  explicit BinaryViewArray(std::shared_ptr<ArrayData> data);
+
+  BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
+                  std::shared_ptr<Buffer> views, BufferVector data_buffers,
+                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  // For API compatibility with BinaryArray etc.
+  std::string_view GetView(int64_t i) const;
+  std::string GetString(int64_t i) const { return std::string{GetView(i)}; }
+
+  const auto& values() const { return data_->buffers[1]; }
+  const c_type* raw_values() const { return raw_values_; }
+
+  std::optional<std::string_view> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  IteratorType begin() const { return IteratorType(*this); }
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  using FlatArray::FlatArray;
+
+  void SetData(std::shared_ptr<ArrayData> data) {
+    FlatArray::SetData(std::move(data));
+    raw_values_ = data_->GetValuesSafe<c_type>(1);
+  }
+
+  const c_type* raw_values_;
+};
+
+/// Concrete Array class for variable-size string view (utf-8) data using
+/// BinaryViewType::c_type to reference in-line or out-of-line string values
+class ARROW_EXPORT StringViewArray : public BinaryViewArray {
+ public:
+  using TypeClass = StringViewType;
+
+  explicit StringViewArray(std::shared_ptr<ArrayData> data);
+
+  using BinaryViewArray::BinaryViewArray;
+
+  /// \brief Validate that this array contains only valid UTF8 entries
+  ///
+  /// This check is also implied by ValidateFull()
+  Status ValidateUTF8() const;
+};
+
+// ----------------------------------------------------------------------
+// Fixed width binary
+
+/// Concrete Array class for fixed-size binary data
+class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
+ public:
+  using TypeClass = FixedSizeBinaryType;
+  using IteratorType = stl::ArrayIterator<FixedSizeBinaryArray>;
+
+  explicit FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data);
+
+  FixedSizeBinaryArray(const std::shared_ptr<DataType>& type, int64_t length,
+                       const std::shared_ptr<Buffer>& data,
+                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  const uint8_t* GetValue(int64_t i) const { return values_ + i * byte_width_; }
+  const uint8_t* Value(int64_t i) const { return GetValue(i); }
+
+  std::string_view GetView(int64_t i) const {
+    return std::string_view(reinterpret_cast<const char*>(GetValue(i)), byte_width_);
+  }
+
+  std::optional<std::string_view> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  std::string GetString(int64_t i) const { return std::string(GetView(i)); }
+
+  int32_t byte_width() const { return byte_width_; }
+
+  const uint8_t* raw_values() const { return values_; }
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  void SetData(const std::shared_ptr<ArrayData>& data) {
+    this->PrimitiveArray::SetData(data);
+    byte_width_ =
+        internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
+    values_ = raw_values_ + data_->offset * byte_width_;
+  }
+
+  const uint8_t* values_;
+  int32_t byte_width_;
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/array_decimal.h b/pyarrow/include/arrow/array/array_decimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f10bb842999640a8cada703ff12ea29c0e5f718
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_decimal.h
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup numeric-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// Decimal32Array
+
+/// Concrete Array class for 32-bit decimal data
+class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray {
+ public:
+  using TypeClass = Decimal32Type;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+  /// \brief Construct Decimal32Array from ArrayData instance
+  explicit Decimal32Array(const std::shared_ptr<ArrayData>& data);
+
+  std::string FormatValue(int64_t i) const;
+};
+
+// ----------------------------------------------------------------------
+// Decimal64Array
+
+/// Concrete Array class for 64-bit decimal data
+class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray {
+ public:
+  using TypeClass = Decimal64Type;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+  /// \brief Construct Decimal64Array from ArrayData instance
+  explicit Decimal64Array(const std::shared_ptr<ArrayData>& data);
+
+  std::string FormatValue(int64_t i) const;
+};
+
+// ----------------------------------------------------------------------
+// Decimal128Array
+
+/// Concrete Array class for 128-bit decimal data
+class ARROW_EXPORT Decimal128Array : public FixedSizeBinaryArray {
+ public:
+  using TypeClass = Decimal128Type;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+  /// \brief Construct Decimal128Array from ArrayData instance
+  explicit Decimal128Array(const std::shared_ptr<ArrayData>& data);
+
+  std::string FormatValue(int64_t i) const;
+};
+
+// Backward compatibility
+using DecimalArray = Decimal128Array;
+
+// ----------------------------------------------------------------------
+// Decimal256Array
+
+/// Concrete Array class for 256-bit decimal data
+class ARROW_EXPORT Decimal256Array : public FixedSizeBinaryArray {
+ public:
+  using TypeClass = Decimal256Type;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+  /// \brief Construct Decimal256Array from ArrayData instance
+  explicit Decimal256Array(const std::shared_ptr<ArrayData>& data);
+
+  std::string FormatValue(int64_t i) const;
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/array_dict.h b/pyarrow/include/arrow/array/array_dict.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf376b51f8c9470d2b4e4c7ed950c9a513fddc9b
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_dict.h
@@ -0,0 +1,182 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// DictionaryArray
+
+/// \brief Array type for dictionary-encoded data with a
+/// data-dependent dictionary
+///
+/// A dictionary array contains an array of non-negative integers (the
+/// "dictionary indices") along with a data type containing a "dictionary"
+/// corresponding to the distinct values represented in the data.
+///
+/// For example, the array
+///
+///   ["foo", "bar", "foo", "bar", "foo", "bar"]
+///
+/// with dictionary ["bar", "foo"], would have dictionary array representation
+///
+///   indices: [1, 0, 1, 0, 1, 0]
+///   dictionary: ["bar", "foo"]
+///
+/// The indices in principle may be any integer type.
+class ARROW_EXPORT DictionaryArray : public Array {
+ public:
+  using TypeClass = DictionaryType;
+
+  explicit DictionaryArray(const std::shared_ptr<ArrayData>& data);
+
+  DictionaryArray(const std::shared_ptr<DataType>& type,
+                  const std::shared_ptr<Array>& indices,
+                  const std::shared_ptr<Array>& dictionary);
+
+  /// \brief Construct DictionaryArray from dictionary and indices
+  /// array and validate
+  ///
+  /// This function does the validation of the indices and input type. It checks if
+  /// all indices are non-negative and smaller than the size of the dictionary.
+  ///
+  /// \param[in] type a dictionary type
+  /// \param[in] dictionary the dictionary with same value type as the
+  /// type object
+  /// \param[in] indices an array of non-negative integers smaller than the
+  /// size of the dictionary
+  static Result<std::shared_ptr<Array>> FromArrays(
+      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices,
+      const std::shared_ptr<Array>& dictionary);
+
+  static Result<std::shared_ptr<Array>> FromArrays(
+      const std::shared_ptr<Array>& indices, const std::shared_ptr<Array>& dictionary) {
+    return FromArrays(::arrow::dictionary(indices->type(), dictionary->type()), indices,
+                      dictionary);
+  }
+
+  /// \brief Transpose this DictionaryArray
+  ///
+  /// This method constructs a new dictionary array with the given dictionary
+  /// type, transposing indices using the transpose map.  The type and the
+  /// transpose map are typically computed using DictionaryUnifier.
+  ///
+  /// \param[in] type the new type object
+  /// \param[in] dictionary the new dictionary
+  /// \param[in] transpose_map transposition array of this array's indices
+  ///   into the target array's indices
+  /// \param[in] pool a pool to allocate the array data from
+  Result<std::shared_ptr<Array>> Transpose(
+      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+      const int32_t* transpose_map, MemoryPool* pool = default_memory_pool()) const;
+
+  Result<std::shared_ptr<Array>> Compact(MemoryPool* pool = default_memory_pool()) const;
+
+  /// \brief Determine whether dictionary arrays may be compared without unification
+  bool CanCompareIndices(const DictionaryArray& other) const;
+
+  /// \brief Return the dictionary for this array, which is stored as
+  /// a member of the ArrayData internal structure
+  const std::shared_ptr<Array>& dictionary() const;
+  const std::shared_ptr<Array>& indices() const;
+
+  /// \brief Return the ith value of indices, cast to int64_t. Not recommended
+  /// for use in performance-sensitive code. Does not validate whether the
+  /// value is null or out-of-bounds.
+  int64_t GetValueIndex(int64_t i) const;
+
+  const DictionaryType* dict_type() const { return dict_type_; }
+
+ private:
+  void SetData(const std::shared_ptr<ArrayData>& data);
+  const DictionaryType* dict_type_;
+  std::shared_ptr<Array> indices_;
+
+  // Lazily initialized when invoking dictionary()
+  mutable std::shared_ptr<Array> dictionary_;
+};
+
+/// \brief Helper class for incremental dictionary unification
+class ARROW_EXPORT DictionaryUnifier {
+ public:
+  virtual ~DictionaryUnifier() = default;
+
+  /// \brief Construct a DictionaryUnifier
+  /// \param[in] value_type the data type of the dictionaries
+  /// \param[in] pool MemoryPool to use for memory allocations
+  static Result<std::unique_ptr<DictionaryUnifier>> Make(
+      std::shared_ptr<DataType> value_type, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Unify dictionaries across array chunks
+  ///
+  /// The dictionaries in the array chunks will be unified, their indices
+  /// accordingly transposed.
+  ///
+  /// Only dictionaries with a primitive value type are currently supported.
+  /// However, dictionaries nested inside a more complex type are correctly unified.
+  static Result<std::shared_ptr<ChunkedArray>> UnifyChunkedArray(
+      const std::shared_ptr<ChunkedArray>& array,
+      MemoryPool* pool = default_memory_pool());
+
+  /// \brief Unify dictionaries across the chunks of each table column
+  ///
+  /// The dictionaries in each table column will be unified, their indices
+  /// accordingly transposed.
+  ///
+  /// Only dictionaries with a primitive value type are currently supported.
+  /// However, dictionaries nested inside a more complex type are correctly unified.
+  static Result<std::shared_ptr<Table>> UnifyTable(
+      const Table& table, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Append dictionary to the internal memo
+  virtual Status Unify(const Array& dictionary) = 0;
+
+  /// \brief Append dictionary and compute transpose indices
+  /// \param[in] dictionary the dictionary values to unify
+  /// \param[out] out_transpose a Buffer containing computed transpose indices
+  /// as int32_t values equal in length to the passed dictionary. The value in
+  /// each slot corresponds to the new index value for each original index
+  /// for a DictionaryArray with the old dictionary
+  virtual Status Unify(const Array& dictionary,
+                       std::shared_ptr<Buffer>* out_transpose) = 0;
+
+  /// \brief Return a result DictionaryType with the smallest possible index
+  /// type to accommodate the unified dictionary. The unifier cannot be used
+  /// after this is called
+  virtual Status GetResult(std::shared_ptr<DataType>* out_type,
+                           std::shared_ptr<Array>* out_dict) = 0;
+
+  /// \brief Return a unified dictionary with the given index type.  If
+  /// the index type is not large enough then an invalid status will be returned.
+  /// The unifier cannot be used after this is called
+  virtual Status GetResultWithIndexType(const std::shared_ptr<DataType>& index_type,
+                                        std::shared_ptr<Array>* out_dict) = 0;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/array_nested.h b/pyarrow/include/arrow/array/array_nested.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf84f802b1ab502fc50794997645b52756bb6df2
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_nested.h
@@ -0,0 +1,899 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes for List, LargeList, ListView, LargeListView, FixedSizeList,
+// Map, Struct, and Union
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup nested-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// VarLengthListLikeArray
+
+template <typename TYPE>
+class VarLengthListLikeArray;
+
+namespace internal {
+
+// Private helper for [Large]List[View]Array::SetData.
+// Unfortunately, trying to define VarLengthListLikeArray::SetData outside of this header
+// doesn't play well with MSVC.
+template <typename TYPE>
+void SetListData(VarLengthListLikeArray<TYPE>* self,
+                 const std::shared_ptr<ArrayData>& data,
+                 Type::type expected_type_id = TYPE::type_id);
+
+/// \brief A version of Flatten that keeps recursively flattening until an array of
+/// non-list values is reached.
+///
+/// Array types considered to be lists by this function:
+///  - list
+///  - large_list
+///  - list_view
+///  - large_list_view
+///  - fixed_size_list
+///
+/// \see ListArray::Flatten
+ARROW_EXPORT Result<std::shared_ptr<Array>> FlattenLogicalListRecursively(
+    const Array& in_array, MemoryPool* memory_pool);
+
+}  // namespace internal
+
+/// Base class for variable-sized list and list-view arrays, regardless of offset size.
+template <typename TYPE>
+class VarLengthListLikeArray : public Array {
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename TypeClass::offset_type;
+
+  const TypeClass* var_length_list_like_type() const { return this->list_type_; }
+
+  /// \brief Return array object containing the list's values
+  ///
+  /// Note that this buffer does not account for any slice offset or length.
+  const std::shared_ptr<Array>& values() const { return values_; }
+
+  /// Note that this buffer does not account for any slice offset or length.
+  const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[1]; }
+
+  const std::shared_ptr<DataType>& value_type() const { return list_type_->value_type(); }
+
+  /// Return pointer to raw value offsets accounting for any slice offset
+  const offset_type* raw_value_offsets() const { return raw_value_offsets_; }
+
+  // The following functions will not perform boundschecking
+
+  offset_type value_offset(int64_t i) const { return raw_value_offsets_[i]; }
+
+  /// \brief Return the size of the value at a particular index
+  ///
+  /// Since non-empty null lists and list-views are possible, avoid calling this
+  /// function when the list at slot i is null.
+  ///
+  /// \pre IsValid(i)
+  virtual offset_type value_length(int64_t i) const = 0;
+
+  /// \pre IsValid(i)
+  std::shared_ptr<Array> value_slice(int64_t i) const {
+    return values_->Slice(value_offset(i), value_length(i));
+  }
+
+  /// \brief Flatten all level recursively until reach a non-list type, and return
+  /// a non-list type Array.
+  ///
+  /// \see internal::FlattenLogicalListRecursively
+  Result<std::shared_ptr<Array>> FlattenRecursively(
+      MemoryPool* memory_pool = default_memory_pool()) const {
+    return internal::FlattenLogicalListRecursively(*this, memory_pool);
+  }
+
+ protected:
+  friend void internal::SetListData<TYPE>(VarLengthListLikeArray<TYPE>* self,
+                                          const std::shared_ptr<ArrayData>& data,
+                                          Type::type expected_type_id);
+
+  const TypeClass* list_type_ = NULLPTR;
+  std::shared_ptr<Array> values_;
+  const offset_type* raw_value_offsets_ = NULLPTR;
+};
+
+// ----------------------------------------------------------------------
+// ListArray / LargeListArray
+
+template <typename TYPE>
+class BaseListArray : public VarLengthListLikeArray<TYPE> {
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename TYPE::offset_type;
+
+  const TypeClass* list_type() const { return this->var_length_list_like_type(); }
+
+  /// \brief Return the size of the value at a particular index
+  ///
+  /// Since non-empty null lists are possible, avoid calling this
+  /// function when the list at slot i is null.
+  ///
+  /// \pre IsValid(i)
+  offset_type value_length(int64_t i) const final {
+    return this->raw_value_offsets_[i + 1] - this->raw_value_offsets_[i];
+  }
+};
+
+/// Concrete Array class for list data
+class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
+ public:
+  explicit ListArray(std::shared_ptr<ArrayData> data);
+
+  ListArray(std::shared_ptr<DataType> type, int64_t length,
+            std::shared_ptr<Buffer> value_offsets, std::shared_ptr<Array> values,
+            std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+            int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Construct ListArray from array of offsets and child value array
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types, and will allocate a new offsets array if necessary (i.e. if
+  /// the offsets contain any nulls). If the offsets do not have nulls, they
+  /// are assumed to be well-formed.
+  ///
+  /// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
+  /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
+  ///
+  /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
+  /// array with offset() > 0).
+  ///
+  /// \param[in] offsets Array containing n + 1 offsets encoding length and
+  /// size. Must be of int32 type
+  /// \param[in] values Array containing list values
+  /// \param[in] pool MemoryPool in case new offsets array needs to be
+  /// allocated because of null values
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// \param[in] null_count Optional null count in null_bitmap
+  static Result<std::shared_ptr<ListArray>> FromArrays(
+      const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  static Result<std::shared_ptr<ListArray>> FromArrays(
+      std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
+      MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  /// \brief Build a ListArray from a ListViewArray
+  static Result<std::shared_ptr<ListArray>> FromListView(const ListViewArray& source,
+                                                         MemoryPool* pool);
+
+  /// \brief Return an Array that is a concatenation of the lists in this array.
+  ///
+  /// Note that it's different from `values()` in that it takes into
+  /// consideration of this array's offsets as well as null elements backed
+  /// by non-empty lists (they are skipped, thus copying may be needed).
+  Result<std::shared_ptr<Array>> Flatten(
+      MemoryPool* memory_pool = default_memory_pool()) const;
+
+  /// \brief Return list offsets as an Int32Array
+  ///
+  /// The returned array will not have a validity bitmap, so you cannot expect
+  /// to pass it to ListArray::FromArrays() and get back the same list array
+  /// if the original one has nulls.
+  std::shared_ptr<Array> offsets() const;
+
+ protected:
+  // This constructor defers SetData to a derived array class
+  ListArray() = default;
+
+  void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+/// Concrete Array class for large list data (with 64-bit offsets)
+class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
+ public:
+  explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
+
+  LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+                 const std::shared_ptr<Buffer>& value_offsets,
+                 const std::shared_ptr<Array>& values,
+                 const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                 int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Construct LargeListArray from array of offsets and child value array
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types, and will allocate a new offsets array if necessary (i.e. if
+  /// the offsets contain any nulls). If the offsets do not have nulls, they
+  /// are assumed to be well-formed.
+  ///
+  /// If a null_bitmap is not provided, the nulls will be inferred from the offsets'
+  /// null bitmap. But if a null_bitmap is provided, the offsets array can't have nulls.
+  ///
+  /// And when a null_bitmap is provided, the offsets array cannot be a slice (i.e. an
+  /// array with offset() > 0).
+  ///
+  /// \param[in] offsets Array containing n + 1 offsets encoding length and
+  /// size. Must be of int64 type
+  /// \param[in] values Array containing list values
+  /// \param[in] pool MemoryPool in case new offsets array needs to be
+  /// allocated because of null values
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// \param[in] null_count Optional null count in null_bitmap
+  static Result<std::shared_ptr<LargeListArray>> FromArrays(
+      const Array& offsets, const Array& values, MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  static Result<std::shared_ptr<LargeListArray>> FromArrays(
+      std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
+      MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  /// \brief Build a LargeListArray from a LargeListViewArray
+  static Result<std::shared_ptr<LargeListArray>> FromListView(
+      const LargeListViewArray& source, MemoryPool* pool);
+
+  /// \brief Return an Array that is a concatenation of the lists in this array.
+  ///
+  /// Note that it's different from `values()` in that it takes into
+  /// consideration of this array's offsets as well as null elements backed
+  /// by non-empty lists (they are skipped, thus copying may be needed).
+  Result<std::shared_ptr<Array>> Flatten(
+      MemoryPool* memory_pool = default_memory_pool()) const;
+
+  /// \brief Return list offsets as an Int64Array
+  std::shared_ptr<Array> offsets() const;
+
+ protected:
+  void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+// ----------------------------------------------------------------------
+// ListViewArray / LargeListViewArray
+
+template <typename TYPE>
+class BaseListViewArray : public VarLengthListLikeArray<TYPE> {
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename TYPE::offset_type;
+
+  const TypeClass* list_view_type() const { return this->var_length_list_like_type(); }
+
+  /// \brief Note that this buffer does not account for any slice offset or length.
+  const std::shared_ptr<Buffer>& value_sizes() const { return this->data_->buffers[2]; }
+
+  /// \brief Return pointer to raw value offsets accounting for any slice offset
+  const offset_type* raw_value_sizes() const { return raw_value_sizes_; }
+
+  /// \brief Return the size of the value at a particular index
+  ///
+  /// This should not be called if the list-view at slot i is null.
+  /// The returned size in those cases could be any value from 0 to the
+  /// length of the child values array.
+  ///
+  /// \pre IsValid(i)
+  offset_type value_length(int64_t i) const final { return this->raw_value_sizes_[i]; }
+
+ protected:
+  const offset_type* raw_value_sizes_ = NULLPTR;
+};
+
+/// \brief Concrete Array class for list-view data
+class ARROW_EXPORT ListViewArray : public BaseListViewArray<ListViewType> {
+ public:
+  explicit ListViewArray(std::shared_ptr<ArrayData> data);
+
+  ListViewArray(std::shared_ptr<DataType> type, int64_t length,
+                std::shared_ptr<Buffer> value_offsets,
+                std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
+                std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+                int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Construct ListViewArray from array of offsets, sizes, and child
+  /// value array
+  ///
+  /// Construct a ListViewArray using buffers from offsets and sizes arrays
+  /// that project views into the child values array.
+  ///
+  /// This function does the bare minimum of validation of the offsets/sizes and
+  /// input types. The offset and length of the offsets and sizes arrays must
+  /// match and that will be checked, but their contents will be assumed to be
+  /// well-formed.
+  ///
+  /// If a null_bitmap is not provided, the nulls will be inferred from the
+  /// offsets's null bitmap. But if a null_bitmap is provided, the offsets array
+  /// can't have nulls.
+  ///
+  /// And when a null_bitmap is provided, neither the offsets or sizes array can be a
+  /// slice (i.e. an array with offset() > 0).
+  ///
+  /// \param[in] offsets An array of int32 offsets into the values array. NULL values are
+  /// supported if the corresponding values in sizes is NULL or 0.
+  /// \param[in] sizes An array containing the int32 sizes of every view. NULL values are
+  /// taken to represent a NULL list-view in the array being created.
+  /// \param[in] values Array containing list values
+  /// \param[in] pool MemoryPool
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// \param[in] null_count Optional null count in null_bitmap
+  static Result<std::shared_ptr<ListViewArray>> FromArrays(
+      const Array& offsets, const Array& sizes, const Array& values,
+      MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  static Result<std::shared_ptr<ListViewArray>> FromArrays(
+      std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
+      const Array& values, MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  /// \brief Build a ListViewArray from a ListArray
+  static Result<std::shared_ptr<ListViewArray>> FromList(const ListArray& list_array,
+                                                         MemoryPool* pool);
+
+  /// \brief Return an Array that is a concatenation of the list-views in this array.
+  ///
+  /// Note that it's different from `values()` in that it takes into
+  /// consideration this array's offsets (which can be in any order)
+  /// and sizes. Nulls are skipped.
+  ///
+  /// This function invokes Concatenate() if list-views are non-contiguous. It
+  /// will try to minimize the number of array slices passed to Concatenate() by
+  /// maximizing the size of each slice (containing as many contiguous
+  /// list-views as possible).
+  Result<std::shared_ptr<Array>> Flatten(
+      MemoryPool* memory_pool = default_memory_pool()) const;
+
+  /// \brief Return list-view offsets as an Int32Array
+  ///
+  /// The returned array will not have a validity bitmap, so you cannot expect
+  /// to pass it to ListArray::FromArrays() and get back the same list array
+  /// if the original one has nulls.
+  std::shared_ptr<Array> offsets() const;
+
+  /// \brief Return list-view sizes as an Int32Array
+  ///
+  /// The returned array will not have a validity bitmap, so you cannot expect
+  /// to pass it to ListViewArray::FromArrays() and get back the same list
+  /// array if the original one has nulls.
+  std::shared_ptr<Array> sizes() const;
+
+ protected:
+  // This constructor defers SetData to a derived array class
+  ListViewArray() = default;
+
+  void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+/// \brief Concrete Array class for large list-view data (with 64-bit offsets
+/// and sizes)
+class ARROW_EXPORT LargeListViewArray : public BaseListViewArray<LargeListViewType> {
+ public:
+  explicit LargeListViewArray(std::shared_ptr<ArrayData> data);
+
+  LargeListViewArray(std::shared_ptr<DataType> type, int64_t length,
+                     std::shared_ptr<Buffer> value_offsets,
+                     std::shared_ptr<Buffer> value_sizes, std::shared_ptr<Array> values,
+                     std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Construct LargeListViewArray from array of offsets, sizes, and child
+  /// value array
+  ///
+  /// Construct an LargeListViewArray using buffers from offsets and sizes arrays
+  /// that project views into the values array.
+  ///
+  /// This function does the bare minimum of validation of the offsets/sizes and
+  /// input types. The offset and length of the offsets and sizes arrays must
+  /// match and that will be checked, but their contents will be assumed to be
+  /// well-formed.
+  ///
+  /// If a null_bitmap is not provided, the nulls will be inferred from the offsets' or
+  /// sizes' null bitmap. Only one of these two is allowed to have a null bitmap. But if a
+  /// null_bitmap is provided, the offsets array and the sizes array can't have nulls.
+  ///
+  /// And when a null_bitmap is provided, neither the offsets or sizes array can be a
+  /// slice (i.e. an array with offset() > 0).
+  ///
+  /// \param[in] offsets An array of int64 offsets into the values array. NULL values are
+  /// supported if the corresponding values in sizes is NULL or 0.
+  /// \param[in] sizes An array containing the int64 sizes of every view. NULL values are
+  /// taken to represent a NULL list-view in the array being created.
+  /// \param[in] values Array containing list values
+  /// \param[in] pool MemoryPool
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// \param[in] null_count Optional null count in null_bitmap
+  static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
+      const Array& offsets, const Array& sizes, const Array& values,
+      MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  static Result<std::shared_ptr<LargeListViewArray>> FromArrays(
+      std::shared_ptr<DataType> type, const Array& offsets, const Array& sizes,
+      const Array& values, MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  /// \brief Build a LargeListViewArray from a LargeListArray
+  static Result<std::shared_ptr<LargeListViewArray>> FromList(
+      const LargeListArray& list_array, MemoryPool* pool);
+
+  /// \brief Return an Array that is a concatenation of the large list-views in this
+  /// array.
+  ///
+  /// Note that it's different from `values()` in that it takes into
+  /// consideration this array's offsets (which can be in any order)
+  /// and sizes. Nulls are skipped.
+  Result<std::shared_ptr<Array>> Flatten(
+      MemoryPool* memory_pool = default_memory_pool()) const;
+
+  /// \brief Return list-view offsets as an Int64Array
+  ///
+  /// The returned array will not have a validity bitmap, so you cannot expect
+  /// to pass it to LargeListArray::FromArrays() and get back the same list array
+  /// if the original one has nulls.
+  std::shared_ptr<Array> offsets() const;
+
+  /// \brief Return list-view sizes as an Int64Array
+  ///
+  /// The returned array will not have a validity bitmap, so you cannot expect
+  /// to pass it to LargeListViewArray::FromArrays() and get back the same list
+  /// array if the original one has nulls.
+  std::shared_ptr<Array> sizes() const;
+
+ protected:
+  // This constructor defers SetData to a derived array class
+  LargeListViewArray() = default;
+
+  void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+// ----------------------------------------------------------------------
+// MapArray
+
+/// Concrete Array class for map data
+///
+/// NB: "value" in this context refers to a pair of a key and the corresponding item
+class ARROW_EXPORT MapArray : public ListArray {
+ public:
+  using TypeClass = MapType;
+
+  explicit MapArray(const std::shared_ptr<ArrayData>& data);
+
+  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
+           const std::shared_ptr<Buffer>& value_offsets,
+           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+           int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  MapArray(const std::shared_ptr<DataType>& type, int64_t length, BufferVector buffers,
+           const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+           int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  MapArray(const std::shared_ptr<DataType>& type, int64_t length,
+           const std::shared_ptr<Buffer>& value_offsets,
+           const std::shared_ptr<Array>& values,
+           const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+           int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Construct MapArray from array of offsets and child key, item arrays
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types, and will allocate a new offsets array if necessary (i.e. if
+  /// the offsets contain any nulls). If the offsets do not have nulls, they
+  /// are assumed to be well-formed
+  ///
+  /// \param[in] offsets Array containing n + 1 offsets encoding length and
+  /// size. Must be of int32 type
+  /// \param[in] keys Array containing key values
+  /// \param[in] items Array containing item values
+  /// \param[in] pool MemoryPool in case new offsets array needs to be
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// allocated because of null values
+  static Result<std::shared_ptr<Array>> FromArrays(
+      const std::shared_ptr<Array>& offsets, const std::shared_ptr<Array>& keys,
+      const std::shared_ptr<Array>& items, MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR);
+
+  static Result<std::shared_ptr<Array>> FromArrays(
+      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
+      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+      MemoryPool* pool = default_memory_pool(),
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR);
+
+  const MapType* map_type() const { return map_type_; }
+
+  /// \brief Return array object containing all map keys
+  const std::shared_ptr<Array>& keys() const { return keys_; }
+
+  /// \brief Return array object containing all mapped items
+  const std::shared_ptr<Array>& items() const { return items_; }
+
+  /// Validate child data before constructing the actual MapArray.
+  static Status ValidateChildData(
+      const std::vector<std::shared_ptr<ArrayData>>& child_data);
+
+ protected:
+  void SetData(const std::shared_ptr<ArrayData>& data);
+
+  static Result<std::shared_ptr<Array>> FromArraysInternal(
+      std::shared_ptr<DataType> type, const std::shared_ptr<Array>& offsets,
+      const std::shared_ptr<Array>& keys, const std::shared_ptr<Array>& items,
+      MemoryPool* pool, std::shared_ptr<Buffer> null_bitmap = NULLPTR);
+
+ private:
+  const MapType* map_type_;
+  std::shared_ptr<Array> keys_, items_;
+};
+
+// ----------------------------------------------------------------------
+// FixedSizeListArray
+
+/// Concrete Array class for fixed size list data
+class ARROW_EXPORT FixedSizeListArray : public Array {
+ public:
+  using TypeClass = FixedSizeListType;
+  using offset_type = TypeClass::offset_type;
+
+  explicit FixedSizeListArray(const std::shared_ptr<ArrayData>& data);
+
+  FixedSizeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+                     const std::shared_ptr<Array>& values,
+                     const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                     int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  const FixedSizeListType* list_type() const;
+
+  /// \brief Return array object containing the list's values
+  const std::shared_ptr<Array>& values() const;
+
+  const std::shared_ptr<DataType>& value_type() const;
+
+  // The following functions will not perform boundschecking
+  int64_t value_offset(int64_t i) const {
+    i += data_->offset;
+    return list_size_ * i;
+  }
+  /// \brief Return the fixed-size of the values
+  ///
+  /// No matter the value of the index parameter, the result is the same.
+  /// So even when the value at slot i is null, this function will return a
+  /// non-zero size.
+  ///
+  /// \pre IsValid(i)
+  int32_t value_length(int64_t i = 0) const {
+    ARROW_UNUSED(i);
+    return list_size_;
+  }
+  /// \pre IsValid(i)
+  std::shared_ptr<Array> value_slice(int64_t i) const {
+    return values_->Slice(value_offset(i), value_length(i));
+  }
+
+  /// \brief Return an Array that is a concatenation of the lists in this array.
+  ///
+  /// Note that it's different from `values()` in that it takes into
+  /// consideration null elements (they are skipped, thus copying may be needed).
+  Result<std::shared_ptr<Array>> Flatten(
+      MemoryPool* memory_pool = default_memory_pool()) const;
+
+  /// \brief Flatten all level recursively until reach a non-list type, and return
+  /// a non-list type Array.
+  ///
+  /// \see internal::FlattenLogicalListRecursively
+  Result<std::shared_ptr<Array>> FlattenRecursively(
+      MemoryPool* memory_pool = default_memory_pool()) const {
+    return internal::FlattenLogicalListRecursively(*this, memory_pool);
+  }
+
+  /// \brief Construct FixedSizeListArray from child value array and value_length
+  ///
+  /// \param[in] values Array containing list values
+  /// \param[in] list_size The fixed length of each list
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// \param[in] null_count Optional null count in null_bitmap
+  /// \return Will have length equal to values.length() / list_size
+  static Result<std::shared_ptr<Array>> FromArrays(
+      const std::shared_ptr<Array>& values, int32_t list_size,
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+  /// \brief Construct FixedSizeListArray from child value array and type
+  ///
+  /// \param[in] values Array containing list values
+  /// \param[in] type The fixed sized list type
+  /// \param[in] null_bitmap Optional validity bitmap
+  /// \param[in] null_count Optional null count in null_bitmap
+  /// \return Will have length equal to values.length() / type.list_size()
+  static Result<std::shared_ptr<Array>> FromArrays(
+      const std::shared_ptr<Array>& values, std::shared_ptr<DataType> type,
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount);
+
+ protected:
+  void SetData(const std::shared_ptr<ArrayData>& data);
+  int32_t list_size_;
+
+ private:
+  std::shared_ptr<Array> values_;
+};
+
+// ----------------------------------------------------------------------
+// Struct
+
+/// Concrete Array class for struct data
+class ARROW_EXPORT StructArray : public Array {
+ public:
+  using TypeClass = StructType;
+
+  ~StructArray() override;
+
+  explicit StructArray(const std::shared_ptr<ArrayData>& data);
+
+  StructArray(const std::shared_ptr<DataType>& type, int64_t length,
+              const std::vector<std::shared_ptr<Array>>& children,
+              std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+              int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Return a StructArray from child arrays and field names.
+  ///
+  /// The length and data type are automatically inferred from the arguments.
+  /// There should be at least one child array.
+  static Result<std::shared_ptr<StructArray>> Make(
+      const ArrayVector& children, const std::vector<std::string>& field_names,
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  /// \brief Return a StructArray from child arrays and fields.
+  ///
+  /// The length is automatically inferred from the arguments.
+  /// There should be at least one child array.  This method does not
+  /// check that field types and child array types are consistent.
+  static Result<std::shared_ptr<StructArray>> Make(
+      const ArrayVector& children, const FieldVector& fields,
+      std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+      int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  const StructType* struct_type() const;
+
+  // Return a shared pointer in case the requestor desires to share ownership
+  // with this array.  The returned array has its offset, length and null
+  // count adjusted.
+  std::shared_ptr<Array> field(int pos) const;
+
+  const ArrayVector& fields() const;
+
+  /// Returns null if name not found
+  std::shared_ptr<Array> GetFieldByName(const std::string& name) const;
+
+  /// Indicate if field named `name` can be found unambiguously in the struct.
+  Status CanReferenceFieldByName(const std::string& name) const;
+
+  /// Indicate if fields named `names` can be found unambiguously in the struct.
+  Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;
+
+  /// \brief Flatten this array as a vector of arrays, one for each field
+  ///
+  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
+  Result<ArrayVector> Flatten(MemoryPool* pool = default_memory_pool()) const;
+
+  /// \brief Get one of the child arrays, combining its null bitmap
+  /// with the parent struct array's bitmap.
+  ///
+  /// \param[in] index Which child array to get
+  /// \param[in] pool The pool to allocate null bitmaps from, if necessary
+  Result<std::shared_ptr<Array>> GetFlattenedField(
+      int index, MemoryPool* pool = default_memory_pool()) const;
+
+ private:
+  // For caching boxed child data
+  struct ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+// ----------------------------------------------------------------------
+// Union
+
+/// Base class for SparseUnionArray and DenseUnionArray
+class ARROW_EXPORT UnionArray : public Array {
+ public:
+  using type_code_t = int8_t;
+
+  ~UnionArray() override;
+
+  /// Note that this buffer does not account for any slice offset
+  const std::shared_ptr<Buffer>& type_codes() const { return data_->buffers[1]; }
+
+  const type_code_t* raw_type_codes() const { return raw_type_codes_; }
+
+  /// The logical type code of the value at index.
+  type_code_t type_code(int64_t i) const { return raw_type_codes_[i]; }
+
+  /// The physical child id containing value at index.
+  int child_id(int64_t i) const { return union_type_->child_ids()[raw_type_codes_[i]]; }
+
+  const UnionType* union_type() const { return union_type_; }
+
+  UnionMode::type mode() const { return union_type_->mode(); }
+
+  /// \brief Return the given field as an individual array.
+  ///
+  /// For sparse unions, the returned array has its offset, length and null
+  /// count adjusted.
+  std::shared_ptr<Array> field(int pos) const;
+
+ protected:
+  UnionArray();
+
+  void SetData(std::shared_ptr<ArrayData> data);
+
+  const type_code_t* raw_type_codes_;
+  const UnionType* union_type_;
+
+ private:
+  // For caching boxed child data
+  struct ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+/// Concrete Array class for sparse union data
+class ARROW_EXPORT SparseUnionArray : public UnionArray {
+ public:
+  using TypeClass = SparseUnionType;
+
+  ~SparseUnionArray() override;
+
+  explicit SparseUnionArray(std::shared_ptr<ArrayData> data);
+
+  SparseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
+                   std::shared_ptr<Buffer> type_ids, int64_t offset = 0);
+
+  /// \brief Construct SparseUnionArray from type_ids and children
+  ///
+  /// This function does the bare minimum of validation of the input types.
+  ///
+  /// \param[in] type_ids An array of logical type ids for the union type
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[in] type_codes Vector of type codes.
+  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
+                                             std::vector<type_code_t> type_codes) {
+    return Make(std::move(type_ids), std::move(children), std::vector<std::string>{},
+                std::move(type_codes));
+  }
+
+  /// \brief Construct SparseUnionArray with custom field names from type_ids and children
+  ///
+  /// This function does the bare minimum of validation of the input types.
+  ///
+  /// \param[in] type_ids An array of logical type ids for the union type
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[in] field_names Vector of strings containing the name of each field.
+  /// \param[in] type_codes Vector of type codes.
+  static Result<std::shared_ptr<Array>> Make(const Array& type_ids, ArrayVector children,
+                                             std::vector<std::string> field_names = {},
+                                             std::vector<type_code_t> type_codes = {});
+
+  const SparseUnionType* union_type() const {
+    return internal::checked_cast<const SparseUnionType*>(union_type_);
+  }
+
+  /// \brief Get one of the child arrays, adjusting its null bitmap
+  /// where the union array type code does not match.
+  ///
+  /// \param[in] index Which child array to get (i.e. the physical index, not the type
+  /// code) \param[in] pool The pool to allocate null bitmaps from, if necessary
+  Result<std::shared_ptr<Array>> GetFlattenedField(
+      int index, MemoryPool* pool = default_memory_pool()) const;
+
+ protected:
+  void SetData(std::shared_ptr<ArrayData> data);
+};
+
+/// \brief Concrete Array class for dense union data
+///
+/// Note that union types do not have a validity bitmap
+class ARROW_EXPORT DenseUnionArray : public UnionArray {
+ public:
+  using TypeClass = DenseUnionType;
+
+  ~DenseUnionArray() override;
+
+  explicit DenseUnionArray(const std::shared_ptr<ArrayData>& data);
+
+  DenseUnionArray(std::shared_ptr<DataType> type, int64_t length, ArrayVector children,
+                  std::shared_ptr<Buffer> type_ids,
+                  std::shared_ptr<Buffer> value_offsets = NULLPTR, int64_t offset = 0);
+
+  /// \brief Construct DenseUnionArray from type_ids, value_offsets, and children
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types.
+  ///
+  /// \param[in] type_ids An array of logical type ids for the union type
+  /// \param[in] value_offsets An array of signed int32 values indicating the
+  /// relative offset into the respective child array for the type in a given slot.
+  /// The respective offsets for each child value array must be in order / increasing.
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[in] type_codes Vector of type codes.
+  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
+                                             const Array& value_offsets,
+                                             ArrayVector children,
+                                             std::vector<type_code_t> type_codes) {
+    return Make(type_ids, value_offsets, std::move(children), std::vector<std::string>{},
+                std::move(type_codes));
+  }
+
+  /// \brief Construct DenseUnionArray with custom field names from type_ids,
+  /// value_offsets, and children
+  ///
+  /// This function does the bare minimum of validation of the offsets and
+  /// input types.
+  ///
+  /// \param[in] type_ids An array of logical type ids for the union type
+  /// \param[in] value_offsets An array of signed int32 values indicating the
+  /// relative offset into the respective child array for the type in a given slot.
+  /// The respective offsets for each child value array must be in order / increasing.
+  /// \param[in] children Vector of children Arrays containing the data for each type.
+  /// \param[in] field_names Vector of strings containing the name of each field.
+  /// \param[in] type_codes Vector of type codes.
+  static Result<std::shared_ptr<Array>> Make(const Array& type_ids,
+                                             const Array& value_offsets,
+                                             ArrayVector children,
+                                             std::vector<std::string> field_names = {},
+                                             std::vector<type_code_t> type_codes = {});
+
+  const DenseUnionType* union_type() const {
+    return internal::checked_cast<const DenseUnionType*>(union_type_);
+  }
+
+  /// Note that this buffer does not account for any slice offset
+  const std::shared_ptr<Buffer>& value_offsets() const { return data_->buffers[2]; }
+
+  int32_t value_offset(int64_t i) const { return raw_value_offsets_[i]; }
+
+  const int32_t* raw_value_offsets() const { return raw_value_offsets_; }
+
+ protected:
+  const int32_t* raw_value_offsets_;
+
+  void SetData(const std::shared_ptr<ArrayData>& data);
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/array_primitive.h b/pyarrow/include/arrow/array/array_primitive.h
new file mode 100644
index 0000000000000000000000000000000000000000..cebf47ad93d8aa719328007f3c4fa6d960855027
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_primitive.h
@@ -0,0 +1,220 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor types for primitive/C-type-based arrays, such as numbers,
+// boolean, and temporal types.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/stl_iterator.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"  // IWYU pragma: export
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// Concrete Array class for boolean data
+class ARROW_EXPORT BooleanArray : public PrimitiveArray {
+ public:
+  using TypeClass = BooleanType;
+  using IteratorType = stl::ArrayIterator<BooleanArray>;
+
+  explicit BooleanArray(const std::shared_ptr<ArrayData>& data);
+
+  BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
+               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+               int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  bool Value(int64_t i) const {
+    return bit_util::GetBit(reinterpret_cast<const uint8_t*>(raw_values_),
+                            i + data_->offset);
+  }
+
+  bool GetView(int64_t i) const { return Value(i); }
+
+  std::optional<bool> operator[](int64_t i) const { return *IteratorType(*this, i); }
+
+  /// \brief Return the number of false (0) values among the valid
+  /// values. Result is not cached.
+  int64_t false_count() const;
+
+  /// \brief Return the number of true (1) values among the valid
+  /// values. Result is not cached.
+  int64_t true_count() const;
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  using PrimitiveArray::PrimitiveArray;
+};
+
+/// \addtogroup numeric-arrays
+///
+/// @{
+
+/// \brief Concrete Array class for numeric data with a corresponding C type
+///
+/// This class is templated on the corresponding DataType subclass for the
+/// given data, for example NumericArray<Int8Type> or NumericArray<Date32Type>.
+///
+/// Note that convenience aliases are available for all accepted types
+/// (for example Int8Array for NumericArray<Int8Type>).
+template <typename TYPE>
+class NumericArray : public PrimitiveArray {
+ public:
+  using TypeClass = TYPE;
+  using value_type = typename TypeClass::c_type;
+  using IteratorType = stl::ArrayIterator<NumericArray<TYPE>>;
+
+  explicit NumericArray(const std::shared_ptr<ArrayData>& data) {
+    NumericArray::SetData(data);
+  }
+
+  // Only enable this constructor without a type argument for types without additional
+  // metadata
+  template <typename T1 = TYPE>
+  NumericArray(enable_if_parameter_free<T1, int64_t> length,
+               const std::shared_ptr<Buffer>& data,
+               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+               int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
+    NumericArray::SetData(ArrayData::Make(TypeTraits<T1>::type_singleton(), length,
+                                          {null_bitmap, data}, null_count, offset));
+  }
+
+  NumericArray(std::shared_ptr<DataType> type, int64_t length,
+               const std::shared_ptr<Buffer>& data,
+               const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+               int64_t null_count = kUnknownNullCount, int64_t offset = 0) {
+    NumericArray::SetData(ArrayData::Make(std::move(type), length, {null_bitmap, data},
+                                          null_count, offset));
+  }
+
+  const value_type* raw_values() const { return values_; }
+
+  value_type Value(int64_t i) const { return values_[i]; }
+
+  // For API compatibility with BinaryArray etc.
+  value_type GetView(int64_t i) const { return values_[i]; }
+
+  std::optional<value_type> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+ protected:
+  NumericArray() : values_(NULLPTR) {}
+
+  void SetData(const std::shared_ptr<ArrayData>& data) {
+    this->PrimitiveArray::SetData(data);
+    values_ = raw_values_
+                  ? (reinterpret_cast<const value_type*>(raw_values_) + data_->offset)
+                  : NULLPTR;
+  }
+
+  const value_type* values_;
+};
+
+/// DayTimeArray
+/// ---------------------
+/// \brief Array of Day and Millisecond values.
+class ARROW_EXPORT DayTimeIntervalArray : public PrimitiveArray {
+ public:
+  using TypeClass = DayTimeIntervalType;
+  using IteratorType = stl::ArrayIterator<DayTimeIntervalArray>;
+
+  explicit DayTimeIntervalArray(const std::shared_ptr<ArrayData>& data);
+
+  DayTimeIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
+                       const std::shared_ptr<Buffer>& data,
+                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  DayTimeIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
+                       const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                       int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  TypeClass::DayMilliseconds GetValue(int64_t i) const;
+  TypeClass::DayMilliseconds Value(int64_t i) const { return GetValue(i); }
+
+  // For compatibility with Take kernel.
+  TypeClass::DayMilliseconds GetView(int64_t i) const { return GetValue(i); }
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+  std::optional<TypeClass::DayMilliseconds> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  int32_t byte_width() const { return sizeof(TypeClass::DayMilliseconds); }
+
+  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
+};
+
+/// \brief Array of Month, Day and nanosecond values.
+class ARROW_EXPORT MonthDayNanoIntervalArray : public PrimitiveArray {
+ public:
+  using TypeClass = MonthDayNanoIntervalType;
+  using IteratorType = stl::ArrayIterator<MonthDayNanoIntervalArray>;
+
+  explicit MonthDayNanoIntervalArray(const std::shared_ptr<ArrayData>& data);
+
+  MonthDayNanoIntervalArray(const std::shared_ptr<DataType>& type, int64_t length,
+                            const std::shared_ptr<Buffer>& data,
+                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  MonthDayNanoIntervalArray(int64_t length, const std::shared_ptr<Buffer>& data,
+                            const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+                            int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  TypeClass::MonthDayNanos GetValue(int64_t i) const;
+  TypeClass::MonthDayNanos Value(int64_t i) const { return GetValue(i); }
+
+  // For compatibility with Take kernel.
+  TypeClass::MonthDayNanos GetView(int64_t i) const { return GetValue(i); }
+
+  IteratorType begin() const { return IteratorType(*this); }
+
+  IteratorType end() const { return IteratorType(*this, length()); }
+
+  std::optional<TypeClass::MonthDayNanos> operator[](int64_t i) const {
+    return *IteratorType(*this, i);
+  }
+
+  int32_t byte_width() const { return sizeof(TypeClass::MonthDayNanos); }
+
+  const uint8_t* raw_values() const { return raw_values_ + data_->offset * byte_width(); }
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/array_run_end.h b/pyarrow/include/arrow/array/array_run_end.h
new file mode 100644
index 0000000000000000000000000000000000000000..b46b0855ab36776eec4e22cef1a35112e2d18fa8
--- /dev/null
+++ b/pyarrow/include/arrow/array/array_run_end.h
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Array accessor classes run-end encoded arrays
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup run-end-encoded-arrays
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// RunEndEncoded
+
+/// \brief Array type for run-end encoded data
+class ARROW_EXPORT RunEndEncodedArray : public Array {
+ private:
+  std::shared_ptr<Array> run_ends_array_;
+  std::shared_ptr<Array> values_array_;
+
+ public:
+  using TypeClass = RunEndEncodedType;
+
+  explicit RunEndEncodedArray(const std::shared_ptr<ArrayData>& data);
+
+  /// \brief Construct a RunEndEncodedArray from all parameters
+  ///
+  /// The length and offset parameters refer to the dimensions of the logical
+  /// array which is the array we would get after expanding all the runs into
+  /// repeated values. As such, length can be much greater than the length of
+  /// the child run_ends and values arrays.
+  RunEndEncodedArray(const std::shared_ptr<DataType>& type, int64_t length,
+                     const std::shared_ptr<Array>& run_ends,
+                     const std::shared_ptr<Array>& values, int64_t offset = 0);
+
+  /// \brief Construct a RunEndEncodedArray from all parameters
+  ///
+  /// The length and offset parameters refer to the dimensions of the logical
+  /// array which is the array we would get after expanding all the runs into
+  /// repeated values. As such, length can be much greater than the length of
+  /// the child run_ends and values arrays.
+  static Result<std::shared_ptr<RunEndEncodedArray>> Make(
+      const std::shared_ptr<DataType>& type, int64_t logical_length,
+      const std::shared_ptr<Array>& run_ends, const std::shared_ptr<Array>& values,
+      int64_t logical_offset = 0);
+
+  /// \brief Construct a RunEndEncodedArray from values and run ends arrays
+  ///
+  /// The data type is automatically inferred from the arguments.
+  /// The run_ends and values arrays must have the same length.
+  static Result<std::shared_ptr<RunEndEncodedArray>> Make(
+      int64_t logical_length, const std::shared_ptr<Array>& run_ends,
+      const std::shared_ptr<Array>& values, int64_t logical_offset = 0);
+
+ protected:
+  void SetData(const std::shared_ptr<ArrayData>& data);
+
+ public:
+  /// \brief Returns an array holding the logical indexes of each run-end
+  ///
+  /// The physical offset to the array is applied.
+  const std::shared_ptr<Array>& run_ends() const { return run_ends_array_; }
+
+  /// \brief Returns an array holding the values of each run
+  ///
+  /// The physical offset to the array is applied.
+  const std::shared_ptr<Array>& values() const { return values_array_; }
+
+  /// \brief Returns an array holding the logical indexes of each run end
+  ///
+  /// If a non-zero logical offset is set, this function allocates a new
+  /// array and rewrites all the run end values to be relative to the logical
+  /// offset and cuts the end of the array to the logical length.
+  Result<std::shared_ptr<Array>> LogicalRunEnds(MemoryPool* pool) const;
+
+  /// \brief Returns an array holding the values of each run
+  ///
+  /// If a non-zero logical offset is set, this function allocates a new
+  /// array containing only the values within the logical range.
+  std::shared_ptr<Array> LogicalValues() const;
+
+  /// \brief Find the physical offset of this REE array
+  ///
+  /// This function uses binary-search, so it has a O(log N) cost.
+  int64_t FindPhysicalOffset() const;
+
+  /// \brief Find the physical length of this REE array
+  ///
+  /// The physical length of an REE is the number of physical values (and
+  /// run-ends) necessary to represent the logical range of values from offset
+  /// to length.
+  ///
+  /// Avoid calling this function if the physical length can be established in
+  /// some other way (e.g. when iterating over the runs sequentially until the
+  /// end). This function uses binary-search, so it has a O(log N) cost.
+  int64_t FindPhysicalLength() const;
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_adaptive.h b/pyarrow/include/arrow/array/builder_adaptive.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cea571be3e3244741f3df15f87c8958eedddf76
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_adaptive.h
@@ -0,0 +1,215 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <type_traits>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup numeric-builders
+///
+/// @{
+
+namespace internal {
+
+class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
+ public:
+  AdaptiveIntBuilderBase(uint8_t start_int_size, MemoryPool* pool,
+                         int64_t alignment = kDefaultBufferAlignment);
+
+  explicit AdaptiveIntBuilderBase(MemoryPool* pool,
+                                  int64_t alignment = kDefaultBufferAlignment)
+      : AdaptiveIntBuilderBase(sizeof(uint8_t), pool, alignment) {}
+
+  /// \brief Append multiple nulls
+  /// \param[in] length the number of nulls to append
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(CommitPendingData());
+    if (ARROW_PREDICT_TRUE(length > 0)) {
+      ARROW_RETURN_NOT_OK(Reserve(length));
+      memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
+      UnsafeSetNull(length);
+    }
+    return Status::OK();
+  }
+
+  Status AppendNull() final {
+    pending_data_[pending_pos_] = 0;
+    pending_valid_[pending_pos_] = 0;
+    pending_has_nulls_ = true;
+    ++pending_pos_;
+    ++length_;
+    ++null_count_;
+
+    if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+      return CommitPendingData();
+    }
+    return Status::OK();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(CommitPendingData());
+    if (ARROW_PREDICT_TRUE(length > 0)) {
+      ARROW_RETURN_NOT_OK(Reserve(length));
+      memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
+      UnsafeSetNotNull(length);
+    }
+    return Status::OK();
+  }
+
+  Status AppendEmptyValue() final {
+    pending_data_[pending_pos_] = 0;
+    pending_valid_[pending_pos_] = 1;
+    ++pending_pos_;
+    ++length_;
+
+    if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+      return CommitPendingData();
+    }
+    return Status::OK();
+  }
+
+  void Reset() override;
+  Status Resize(int64_t capacity) override;
+
+ protected:
+  Status AppendInternal(const uint64_t val) {
+    pending_data_[pending_pos_] = val;
+    pending_valid_[pending_pos_] = 1;
+    ++pending_pos_;
+    ++length_;
+
+    if (ARROW_PREDICT_FALSE(pending_pos_ >= pending_size_)) {
+      return CommitPendingData();
+    }
+    return Status::OK();
+  }
+
+  virtual Status CommitPendingData() = 0;
+
+  template <typename new_type, typename old_type>
+  typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
+  ExpandIntSizeInternal();
+  template <typename new_type, typename old_type>
+  typename std::enable_if<(sizeof(old_type) < sizeof(new_type)), Status>::type
+  ExpandIntSizeInternal();
+
+  std::shared_ptr<ResizableBuffer> data_;
+  uint8_t* raw_data_ = NULLPTR;
+
+  const uint8_t start_int_size_;
+  uint8_t int_size_;
+
+  static constexpr int32_t pending_size_ = 1024;
+  uint8_t pending_valid_[pending_size_];
+  uint64_t pending_data_[pending_size_];
+  int32_t pending_pos_ = 0;
+  bool pending_has_nulls_ = false;
+};
+
+}  // namespace internal
+
+class ARROW_EXPORT AdaptiveUIntBuilder : public internal::AdaptiveIntBuilderBase {
+ public:
+  explicit AdaptiveUIntBuilder(uint8_t start_int_size,
+                               MemoryPool* pool = default_memory_pool());
+
+  explicit AdaptiveUIntBuilder(MemoryPool* pool = default_memory_pool())
+      : AdaptiveUIntBuilder(sizeof(uint8_t), pool) {}
+
+  using internal::AdaptiveIntBuilderBase::Reset;
+
+  /// Scalar append
+  Status Append(const uint64_t val) { return AppendInternal(val); }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const uint64_t* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  std::shared_ptr<DataType> type() const override;
+
+ protected:
+  Status CommitPendingData() override;
+  Status ExpandIntSize(uint8_t new_int_size);
+
+  Status AppendValuesInternal(const uint64_t* values, int64_t length,
+                              const uint8_t* valid_bytes);
+
+  template <typename new_type>
+  Status ExpandIntSizeN();
+};
+
+class ARROW_EXPORT AdaptiveIntBuilder : public internal::AdaptiveIntBuilderBase {
+ public:
+  explicit AdaptiveIntBuilder(uint8_t start_int_size,
+                              MemoryPool* pool = default_memory_pool(),
+                              int64_t alignment = kDefaultBufferAlignment);
+
+  explicit AdaptiveIntBuilder(MemoryPool* pool = default_memory_pool(),
+                              int64_t alignment = kDefaultBufferAlignment)
+      : AdaptiveIntBuilder(sizeof(uint8_t), pool, alignment) {}
+
+  using internal::AdaptiveIntBuilderBase::Reset;
+
+  /// Scalar append
+  Status Append(const int64_t val) { return AppendInternal(static_cast<uint64_t>(val)); }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const int64_t* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  std::shared_ptr<DataType> type() const override;
+
+ protected:
+  Status CommitPendingData() override;
+  Status ExpandIntSize(uint8_t new_int_size);
+
+  Status AppendValuesInternal(const int64_t* values, int64_t length,
+                              const uint8_t* valid_bytes);
+
+  template <typename new_type>
+  Status ExpandIntSizeN();
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_base.h b/pyarrow/include/arrow/array/builder_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecd2136f5d20ba126bd359977ea17f76c4fe23ed
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_base.h
@@ -0,0 +1,371 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>  // IWYU pragma: keep
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_primitive.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace internal {
+
+template <class Builder, class V>
+class ArrayBuilderExtraOps {
+ public:
+  /// \brief Append a value from an optional or null if it has no value.
+  Status AppendOrNull(const std::optional<V>& value) {
+    auto* self = static_cast<Builder*>(this);
+    return value.has_value() ? self->Append(*value) : self->AppendNull();
+  }
+
+  /// \brief Append a value from an optional or null if it has no value.
+  ///
+  /// Unsafe methods don't check existing size.
+  void UnsafeAppendOrNull(const std::optional<V>& value) {
+    auto* self = static_cast<Builder*>(this);
+    return value.has_value() ? self->UnsafeAppend(*value) : self->UnsafeAppendNull();
+  }
+};
+
+}  // namespace internal
+
+/// \defgroup numeric-builders Concrete builder subclasses for numeric types
+/// @{
+/// @}
+
+/// \defgroup temporal-builders Concrete builder subclasses for temporal types
+/// @{
+/// @}
+
+/// \defgroup binary-builders Concrete builder subclasses for binary types
+/// @{
+/// @}
+
+/// \defgroup nested-builders Concrete builder subclasses for nested types
+/// @{
+/// @}
+
+/// \defgroup dictionary-builders Concrete builder subclasses for dictionary types
+/// @{
+/// @}
+
+/// \defgroup run-end-encoded-builders Concrete builder subclasses for run-end encoded
+/// arrays
+/// @{
+/// @}
+
+constexpr int64_t kMinBuilderCapacity = 1 << 5;
+constexpr int64_t kListMaximumElements = std::numeric_limits<int32_t>::max() - 1;
+
+/// Base class for all data array builders.
+///
+/// This class provides a facilities for incrementally building the null bitmap
+/// (see Append methods) and as a side effect the current number of slots and
+/// the null count.
+///
+/// \note Users are expected to use builders as one of the concrete types below.
+/// For example, ArrayBuilder* pointing to BinaryBuilder should be downcast before use.
+class ARROW_EXPORT ArrayBuilder {
+ public:
+  explicit ArrayBuilder(MemoryPool* pool, int64_t alignment = kDefaultBufferAlignment)
+      : pool_(pool), alignment_(alignment), null_bitmap_builder_(pool, alignment) {}
+
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(ArrayBuilder);
+
+  virtual ~ArrayBuilder() = default;
+
+  /// For nested types. Since the objects are owned by this class instance, we
+  /// skip shared pointers and just return a raw pointer
+  ArrayBuilder* child(int i) { return children_[i].get(); }
+
+  const std::shared_ptr<ArrayBuilder>& child_builder(int i) const { return children_[i]; }
+
+  int num_children() const { return static_cast<int>(children_.size()); }
+
+  virtual int64_t length() const { return length_; }
+  int64_t null_count() const { return null_count_; }
+  int64_t capacity() const { return capacity_; }
+
+  /// \brief Ensure that enough memory has been allocated to fit the indicated
+  /// number of total elements in the builder, including any that have already
+  /// been appended. Does not account for reallocations that may be due to
+  /// variable size data, like binary values. To make space for incremental
+  /// appends, use Reserve instead.
+  ///
+  /// \param[in] capacity the minimum number of total array values to
+  ///            accommodate. Must be greater than the current capacity.
+  /// \return Status
+  virtual Status Resize(int64_t capacity);
+
+  /// \brief Ensure that there is enough space allocated to append the indicated
+  /// number of elements without any further reallocation. Overallocation is
+  /// used in order to minimize the impact of incremental Reserve() calls.
+  /// Note that additional_capacity is relative to the current number of elements
+  /// rather than to the current capacity, so calls to Reserve() which are not
+  /// interspersed with addition of new elements may not increase the capacity.
+  ///
+  /// \param[in] additional_capacity the number of additional array values
+  /// \return Status
+  Status Reserve(int64_t additional_capacity) {
+    auto current_capacity = capacity();
+    auto min_capacity = length() + additional_capacity;
+    if (min_capacity <= current_capacity) return Status::OK();
+
+    // leave growth factor up to BufferBuilder
+    auto new_capacity = BufferBuilder::GrowByFactor(current_capacity, min_capacity);
+    return Resize(new_capacity);
+  }
+
+  /// Reset the builder.
+  virtual void Reset();
+
+  /// \brief Append a null value to builder
+  virtual Status AppendNull() = 0;
+  /// \brief Append a number of null values to builder
+  virtual Status AppendNulls(int64_t length) = 0;
+
+  /// \brief Append a non-null value to builder
+  ///
+  /// The appended value is an implementation detail, but the corresponding
+  /// memory slot is guaranteed to be initialized.
+  /// This method is useful when appending a null value to a parent nested type.
+  virtual Status AppendEmptyValue() = 0;
+
+  /// \brief Append a number of non-null values to builder
+  ///
+  /// The appended values are an implementation detail, but the corresponding
+  /// memory slot is guaranteed to be initialized.
+  /// This method is useful when appending null values to a parent nested type.
+  virtual Status AppendEmptyValues(int64_t length) = 0;
+
+  /// \brief Append a value from a scalar
+  Status AppendScalar(const Scalar& scalar) { return AppendScalar(scalar, 1); }
+  virtual Status AppendScalar(const Scalar& scalar, int64_t n_repeats);
+  virtual Status AppendScalars(const ScalarVector& scalars);
+
+  /// \brief Append a range of values from an array.
+  ///
+  /// The given array must be the same type as the builder.
+  virtual Status AppendArraySlice(const ArraySpan& ARROW_ARG_UNUSED(array),
+                                  int64_t ARROW_ARG_UNUSED(offset),
+                                  int64_t ARROW_ARG_UNUSED(length)) {
+    return Status::NotImplemented("AppendArraySlice for builder for ", *type());
+  }
+
+  /// \brief Return result of builder as an internal generic ArrayData
+  /// object. Resets builder except for dictionary builder
+  ///
+  /// \param[out] out the finalized ArrayData object
+  /// \return Status
+  virtual Status FinishInternal(std::shared_ptr<ArrayData>* out) = 0;
+
+  /// \brief Return result of builder as an Array object.
+  ///
+  /// The builder is reset except for DictionaryBuilder.
+  ///
+  /// \param[out] out the finalized Array object
+  /// \return Status
+  Status Finish(std::shared_ptr<Array>* out);
+
+  /// \brief Return result of builder as an Array object.
+  ///
+  /// The builder is reset except for DictionaryBuilder.
+  ///
+  /// \return The finalized Array object
+  Result<std::shared_ptr<Array>> Finish();
+
+  /// \brief Return the type of the built Array
+  virtual std::shared_ptr<DataType> type() const = 0;
+
+ protected:
+  /// Append to null bitmap
+  Status AppendToBitmap(bool is_valid);
+
+  /// Vector append. Treat each zero byte as a null.   If valid_bytes is null
+  /// assume all of length bits are valid.
+  Status AppendToBitmap(const uint8_t* valid_bytes, int64_t length);
+
+  /// Uniform append.  Append N times the same validity bit.
+  Status AppendToBitmap(int64_t num_bits, bool value);
+
+  /// Set the next length bits to not null (i.e. valid).
+  Status SetNotNull(int64_t length);
+
+  // Unsafe operations (don't check capacity/don't resize)
+
+  void UnsafeAppendNull() { UnsafeAppendToBitmap(false); }
+
+  // Append to null bitmap, update the length
+  void UnsafeAppendToBitmap(bool is_valid) {
+    null_bitmap_builder_.UnsafeAppend(is_valid);
+    ++length_;
+    if (!is_valid) ++null_count_;
+  }
+
+  // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null
+  // assume all of length bits are valid.
+  void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) {
+    if (valid_bytes == NULLPTR) {
+      return UnsafeSetNotNull(length);
+    }
+    null_bitmap_builder_.UnsafeAppend(valid_bytes, length);
+    length_ += length;
+    null_count_ = null_bitmap_builder_.false_count();
+  }
+
+  // Vector append. Copy from a given bitmap. If bitmap is null assume
+  // all of length bits are valid.
+  void UnsafeAppendToBitmap(const uint8_t* bitmap, int64_t offset, int64_t length) {
+    if (bitmap == NULLPTR) {
+      return UnsafeSetNotNull(length);
+    }
+    null_bitmap_builder_.UnsafeAppend(bitmap, offset, length);
+    length_ += length;
+    null_count_ = null_bitmap_builder_.false_count();
+  }
+
+  // Append the same validity value a given number of times.
+  void UnsafeAppendToBitmap(const int64_t num_bits, bool value) {
+    if (value) {
+      UnsafeSetNotNull(num_bits);
+    } else {
+      UnsafeSetNull(num_bits);
+    }
+  }
+
+  void UnsafeAppendToBitmap(const std::vector<bool>& is_valid);
+
+  // Set the next validity bits to not null (i.e. valid).
+  void UnsafeSetNotNull(int64_t length);
+
+  // Set the next validity bits to null (i.e. invalid).
+  void UnsafeSetNull(int64_t length);
+
+  static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
+
+  /// \brief Finish to an array of the specified ArrayType
+  template <typename ArrayType>
+  Status FinishTyped(std::shared_ptr<ArrayType>* out) {
+    std::shared_ptr<Array> out_untyped;
+    ARROW_RETURN_NOT_OK(Finish(&out_untyped));
+    *out = std::static_pointer_cast<ArrayType>(std::move(out_untyped));
+    return Status::OK();
+  }
+
+  // Check the requested capacity for validity
+  Status CheckCapacity(int64_t new_capacity) {
+    if (ARROW_PREDICT_FALSE(new_capacity < 0)) {
+      return Status::Invalid(
+          "Resize capacity must be positive (requested: ", new_capacity, ")");
+    }
+
+    if (ARROW_PREDICT_FALSE(new_capacity < length_)) {
+      return Status::Invalid("Resize cannot downsize (requested: ", new_capacity,
+                             ", current length: ", length_, ")");
+    }
+
+    return Status::OK();
+  }
+
+  // Check for array type
+  Status CheckArrayType(const std::shared_ptr<DataType>& expected_type,
+                        const Array& array, const char* message);
+  Status CheckArrayType(Type::type expected_type, const Array& array,
+                        const char* message);
+
+  MemoryPool* pool_;
+  int64_t alignment_;
+
+  TypedBufferBuilder<bool> null_bitmap_builder_;
+  int64_t null_count_ = 0;
+
+  // Array length, so far. Also, the index of the next element to be added
+  int64_t length_ = 0;
+  int64_t capacity_ = 0;
+
+  // Child value array builders. These are owned by this class
+  std::vector<std::shared_ptr<ArrayBuilder>> children_;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
+};
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the data type to create the builder for
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                   std::unique_ptr<ArrayBuilder>* out);
+
+inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
+  std::unique_ptr<ArrayBuilder> out;
+  ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
+  return out;
+}
+
+/// \brief Construct an empty ArrayBuilder corresponding to the data
+/// type, where any top-level or nested dictionary builders return the
+/// exact index type specified by the type.
+ARROW_EXPORT
+Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                             std::unique_ptr<ArrayBuilder>* out);
+
+inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
+  std::unique_ptr<ArrayBuilder> out;
+  ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
+  return out;
+}
+
+/// \brief Construct an empty DictionaryBuilder initialized optionally
+/// with a preexisting dictionary
+/// \param[in] pool the MemoryPool to use for allocations
+/// \param[in] type the dictionary type to create the builder for
+/// \param[in] dictionary the initial dictionary, if any. May be nullptr
+/// \param[out] out the created ArrayBuilder
+ARROW_EXPORT
+Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                             const std::shared_ptr<Array>& dictionary,
+                             std::unique_ptr<ArrayBuilder>* out);
+
+inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
+    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
+    MemoryPool* pool = default_memory_pool()) {
+  std::unique_ptr<ArrayBuilder> out;
+  ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
+  return out;
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_binary.h b/pyarrow/include/arrow/array/builder_binary.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0e761ae9684132240f21ee335a996bdda081a63
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_binary.h
@@ -0,0 +1,993 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_binary.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/binary_view_util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup binary-builders
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// Binary and String
+
+template <typename TYPE>
+class BaseBinaryBuilder
+    : public ArrayBuilder,
+      public internal::ArrayBuilderExtraOps<BaseBinaryBuilder<TYPE>, std::string_view> {
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename TypeClass::offset_type;
+
+  explicit BaseBinaryBuilder(MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        offsets_builder_(pool, alignment),
+        value_data_builder_(pool, alignment) {}
+
+  BaseBinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : BaseBinaryBuilder(pool) {}
+
+  Status Append(const uint8_t* value, offset_type length) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendNextOffset();
+    // Safety check for UBSAN.
+    if (ARROW_PREDICT_TRUE(length > 0)) {
+      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
+    }
+
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  Status Append(const char* value, offset_type length) {
+    return Append(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  Status Append(std::string_view value) {
+    return Append(value.data(), static_cast<offset_type>(value.size()));
+  }
+
+  /// Extend the last appended value by appending more data at the end
+  ///
+  /// Unlike Append, this does not create a new offset.
+  Status ExtendCurrent(const uint8_t* value, offset_type length) {
+    // Safety check for UBSAN.
+    if (ARROW_PREDICT_TRUE(length > 0)) {
+      ARROW_RETURN_NOT_OK(ValidateOverflow(length));
+      ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
+    }
+    return Status::OK();
+  }
+
+  Status ExtendCurrent(std::string_view value) {
+    return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+                         static_cast<offset_type>(value.size()));
+  }
+
+  Status AppendNulls(int64_t length) final {
+    const int64_t num_bytes = value_data_builder_.length();
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    for (int64_t i = 0; i < length; ++i) {
+      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+    }
+    UnsafeAppendToBitmap(length, false);
+    return Status::OK();
+  }
+
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendNextOffset();
+    UnsafeAppendToBitmap(false);
+    return Status::OK();
+  }
+
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendNextOffset();
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    const int64_t num_bytes = value_data_builder_.length();
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    for (int64_t i = 0; i < length; ++i) {
+      offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+    }
+    UnsafeAppendToBitmap(length, true);
+    return Status::OK();
+  }
+
+  /// \brief Append without checking capacity
+  ///
+  /// Offsets and data should have been presized using Reserve() and
+  /// ReserveData(), respectively.
+  void UnsafeAppend(const uint8_t* value, offset_type length) {
+    UnsafeAppendNextOffset();
+    value_data_builder_.UnsafeAppend(value, length);
+    UnsafeAppendToBitmap(true);
+  }
+
+  void UnsafeAppend(const char* value, offset_type length) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  void UnsafeAppend(const std::string& value) {
+    UnsafeAppend(value.c_str(), static_cast<offset_type>(value.size()));
+  }
+
+  void UnsafeAppend(std::string_view value) {
+    UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
+  }
+
+  /// Like ExtendCurrent, but do not check capacity
+  void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
+    value_data_builder_.UnsafeAppend(value, length);
+  }
+
+  void UnsafeExtendCurrent(std::string_view value) {
+    UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
+                        static_cast<offset_type>(value.size()));
+  }
+
+  void UnsafeAppendNull() {
+    const int64_t num_bytes = value_data_builder_.length();
+    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+    UnsafeAppendToBitmap(false);
+  }
+
+  void UnsafeAppendEmptyValue() {
+    const int64_t num_bytes = value_data_builder_.length();
+    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+    UnsafeAppendToBitmap(true);
+  }
+
+  /// \brief Append a sequence of strings in one shot.
+  ///
+  /// \param[in] values a vector of strings
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const std::vector<std::string>& values,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    std::size_t total_length = std::accumulate(
+        values.begin(), values.end(), 0ULL,
+        [](uint64_t sum, const std::string& str) { return sum + str.size(); });
+    ARROW_RETURN_NOT_OK(Reserve(values.size()));
+    ARROW_RETURN_NOT_OK(ReserveData(total_length));
+
+    if (valid_bytes != NULLPTR) {
+      for (std::size_t i = 0; i < values.size(); ++i) {
+        UnsafeAppendNextOffset();
+        if (valid_bytes[i]) {
+          value_data_builder_.UnsafeAppend(
+              reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size());
+        }
+      }
+    } else {
+      for (const auto& value : values) {
+        UnsafeAppendNextOffset();
+        value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()),
+                                         value.size());
+      }
+    }
+
+    UnsafeAppendToBitmap(valid_bytes, values.size());
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of nul-terminated strings in one shot.
+  ///        If one of the values is NULL, it is processed as a null
+  ///        value even if the corresponding valid_bytes entry is 1.
+  ///
+  /// \param[in] values a contiguous C array of nul-terminated char *
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const char** values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    std::size_t total_length = 0;
+    std::vector<std::size_t> value_lengths(length);
+    bool have_null_value = false;
+    for (int64_t i = 0; i < length; ++i) {
+      if (values[i] != NULLPTR) {
+        auto value_length = strlen(values[i]);
+        value_lengths[i] = value_length;
+        total_length += value_length;
+      } else {
+        have_null_value = true;
+      }
+    }
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    ARROW_RETURN_NOT_OK(ReserveData(total_length));
+
+    if (valid_bytes) {
+      int64_t valid_bytes_offset = 0;
+      for (int64_t i = 0; i < length; ++i) {
+        UnsafeAppendNextOffset();
+        if (valid_bytes[i]) {
+          if (values[i]) {
+            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+                                             value_lengths[i]);
+          } else {
+            UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset,
+                                 i - valid_bytes_offset);
+            UnsafeAppendToBitmap(false);
+            valid_bytes_offset = i + 1;
+          }
+        }
+      }
+      UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
+    } else {
+      if (have_null_value) {
+        std::vector<uint8_t> valid_vector(length, 0);
+        for (int64_t i = 0; i < length; ++i) {
+          UnsafeAppendNextOffset();
+          if (values[i]) {
+            value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+                                             value_lengths[i]);
+            valid_vector[i] = 1;
+          }
+        }
+        UnsafeAppendToBitmap(valid_vector.data(), length);
+      } else {
+        for (int64_t i = 0; i < length; ++i) {
+          UnsafeAppendNextOffset();
+          value_data_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values[i]),
+                                           value_lengths[i]);
+        }
+        UnsafeAppendToBitmap(NULLPTR, length);
+      }
+    }
+    return Status::OK();
+  }
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    auto bitmap = array.GetValues<uint8_t>(0, 0);
+    auto offsets = array.GetValues<offset_type>(1);
+    auto data = array.GetValues<uint8_t>(2, 0);
+    auto total_length = offsets[offset + length] - offsets[offset];
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    ARROW_RETURN_NOT_OK(ReserveData(total_length));
+    for (int64_t i = 0; i < length; i++) {
+      if (!bitmap || bit_util::GetBit(bitmap, array.offset + offset + i)) {
+        const offset_type start = offsets[offset + i];
+        const offset_type end = offsets[offset + i + 1];
+        UnsafeAppend(data + start, end - start);
+      } else {
+        UnsafeAppendNull();
+      }
+    }
+    return Status::OK();
+  }
+
+  void Reset() override {
+    ArrayBuilder::Reset();
+    offsets_builder_.Reset();
+    value_data_builder_.Reset();
+  }
+
+  Status ValidateOverflow(int64_t new_bytes) {
+    auto new_size = value_data_builder_.length() + new_bytes;
+    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
+      return Status::CapacityError("array cannot contain more than ", memory_limit(),
+                                   " bytes, have ", new_size);
+    } else {
+      return Status::OK();
+    }
+  }
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    // One more than requested for offsets
+    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
+    return ArrayBuilder::Resize(capacity);
+  }
+
+  /// \brief Ensures there is enough allocated capacity to append the indicated
+  /// number of bytes to the value data buffer without additional allocations
+  Status ReserveData(int64_t elements) {
+    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
+    return value_data_builder_.Reserve(elements);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    // Write final offset (values length)
+    ARROW_RETURN_NOT_OK(AppendNextOffset());
+
+    // These buffers' padding zeroed by BufferBuilder
+    std::shared_ptr<Buffer> offsets, value_data, null_bitmap;
+    ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+    ARROW_RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
+    ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+
+    *out = ArrayData::Make(type(), length_, {null_bitmap, offsets, value_data},
+                           null_count_, 0);
+    Reset();
+    return Status::OK();
+  }
+
+  /// \return data pointer of the value date builder
+  const uint8_t* value_data() const { return value_data_builder_.data(); }
+  /// \return size of values buffer so far
+  int64_t value_data_length() const { return value_data_builder_.length(); }
+  /// \return capacity of values buffer
+  int64_t value_data_capacity() const { return value_data_builder_.capacity(); }
+
+  /// \return data pointer of the value date builder
+  const offset_type* offsets_data() const { return offsets_builder_.data(); }
+
+  /// Temporary access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  const uint8_t* GetValue(int64_t i, offset_type* out_length) const {
+    const offset_type* offsets = offsets_builder_.data();
+    const auto offset = offsets[i];
+    if (i == (length_ - 1)) {
+      *out_length = static_cast<offset_type>(value_data_builder_.length()) - offset;
+    } else {
+      *out_length = offsets[i + 1] - offset;
+    }
+    return value_data_builder_.data() + offset;
+  }
+
+  offset_type offset(int64_t i) const { return offsets_data()[i]; }
+
+  /// Temporary access to a value.
+  ///
+  /// This view becomes invalid on the next modifying operation.
+  std::string_view GetView(int64_t i) const {
+    offset_type value_length;
+    const uint8_t* value_data = GetValue(i, &value_length);
+    return std::string_view(reinterpret_cast<const char*>(value_data), value_length);
+  }
+
+  // Cannot make this a static attribute because of linking issues
+  static constexpr int64_t memory_limit() {
+    return std::numeric_limits<offset_type>::max() - 1;
+  }
+
+ protected:
+  TypedBufferBuilder<offset_type> offsets_builder_;
+  TypedBufferBuilder<uint8_t> value_data_builder_;
+
+  Status AppendNextOffset() {
+    const int64_t num_bytes = value_data_builder_.length();
+    return offsets_builder_.Append(static_cast<offset_type>(num_bytes));
+  }
+
+  void UnsafeAppendNextOffset() {
+    const int64_t num_bytes = value_data_builder_.length();
+    offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
+  }
+};
+
+/// \class BinaryBuilder
+/// \brief Builder class for variable-length binary data
+class ARROW_EXPORT BinaryBuilder : public BaseBinaryBuilder<BinaryType> {
+ public:
+  using BaseBinaryBuilder::BaseBinaryBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<BinaryArray>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return binary(); }
+};
+
+/// \class StringBuilder
+/// \brief Builder class for UTF8 strings
+class ARROW_EXPORT StringBuilder : public BinaryBuilder {
+ public:
+  using BinaryBuilder::BinaryBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<StringArray>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return utf8(); }
+};
+
+/// \class LargeBinaryBuilder
+/// \brief Builder class for large variable-length binary data
+class ARROW_EXPORT LargeBinaryBuilder : public BaseBinaryBuilder<LargeBinaryType> {
+ public:
+  using BaseBinaryBuilder::BaseBinaryBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<LargeBinaryArray>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return large_binary(); }
+};
+
+/// \class LargeStringBuilder
+/// \brief Builder class for large UTF8 strings
+class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder {
+ public:
+  using LargeBinaryBuilder::LargeBinaryBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<LargeStringArray>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return large_utf8(); }
+};
+
+// ----------------------------------------------------------------------
+// BinaryViewBuilder, StringViewBuilder
+//
+// These builders do not support building raw pointer view arrays.
+
+namespace internal {
+
+// We allocate medium-sized memory chunks and accumulate data in those, which
+// may result in some waste if there are many large-ish strings. If a string
+// comes along that does not fit into a block, we allocate a new block and
+// write into that.
+//
+// Later we can implement optimizations to continuing filling underfull blocks
+// after encountering a large string that required allocating a new block.
+class ARROW_EXPORT StringHeapBuilder {
+ public:
+  static constexpr int64_t kDefaultBlocksize = 32 << 10;  // 32KB
+
+  StringHeapBuilder(MemoryPool* pool, int64_t alignment)
+      : pool_(pool), alignment_(alignment) {}
+
+  void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; }
+
+  using c_type = BinaryViewType::c_type;
+
+  template <bool Safe>
+  std::conditional_t<Safe, Result<c_type>, c_type> Append(const uint8_t* value,
+                                                          int64_t length) {
+    if (length <= BinaryViewType::kInlineSize) {
+      return util::ToInlineBinaryView(value, static_cast<int32_t>(length));
+    }
+
+    if constexpr (Safe) {
+      ARROW_RETURN_NOT_OK(Reserve(length));
+    }
+
+    auto v = util::ToNonInlineBinaryView(value, static_cast<int32_t>(length),
+                                         static_cast<int32_t>(blocks_.size() - 1),
+                                         current_offset_);
+
+    memcpy(current_out_buffer_, value, static_cast<size_t>(length));
+    current_out_buffer_ += length;
+    current_remaining_bytes_ -= length;
+    current_offset_ += static_cast<int32_t>(length);
+    return v;
+  }
+
+  static constexpr int64_t ValueSizeLimit() {
+    return std::numeric_limits<int32_t>::max();
+  }
+
+  /// \brief Ensure that the indicated number of bytes can be appended via
+  /// UnsafeAppend operations without the need to allocate more memory
+  Status Reserve(int64_t num_bytes) {
+    if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) {
+      return Status::CapacityError(
+          "BinaryView or StringView elements cannot reference "
+          "strings larger than 2GB");
+    }
+    if (num_bytes > current_remaining_bytes_) {
+      ARROW_RETURN_NOT_OK(FinishLastBlock());
+      current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_;
+      ARROW_ASSIGN_OR_RAISE(
+          std::shared_ptr<ResizableBuffer> new_block,
+          AllocateResizableBuffer(current_remaining_bytes_, alignment_, pool_));
+      current_offset_ = 0;
+      current_out_buffer_ = new_block->mutable_data();
+      blocks_.emplace_back(std::move(new_block));
+    }
+    return Status::OK();
+  }
+
+  void Reset() {
+    current_offset_ = 0;
+    current_out_buffer_ = NULLPTR;
+    current_remaining_bytes_ = 0;
+    blocks_.clear();
+  }
+
+  int64_t current_remaining_bytes() const { return current_remaining_bytes_; }
+
+  Result<std::vector<std::shared_ptr<ResizableBuffer>>> Finish() {
+    if (!blocks_.empty()) {
+      ARROW_RETURN_NOT_OK(FinishLastBlock());
+    }
+    current_offset_ = 0;
+    current_out_buffer_ = NULLPTR;
+    current_remaining_bytes_ = 0;
+    return std::move(blocks_);
+  }
+
+ private:
+  Status FinishLastBlock() {
+    if (current_remaining_bytes_ > 0) {
+      // Avoid leaking uninitialized bytes from the allocator
+      ARROW_RETURN_NOT_OK(
+          blocks_.back()->Resize(blocks_.back()->size() - current_remaining_bytes_,
+                                 /*shrink_to_fit=*/true));
+      blocks_.back()->ZeroPadding();
+    }
+    return Status::OK();
+  }
+
+  MemoryPool* pool_;
+  int64_t alignment_;
+  int64_t blocksize_ = kDefaultBlocksize;
+  std::vector<std::shared_ptr<ResizableBuffer>> blocks_;
+
+  int32_t current_offset_ = 0;
+  uint8_t* current_out_buffer_ = NULLPTR;
+  int64_t current_remaining_bytes_ = 0;
+};
+
+}  // namespace internal
+
+class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = BinaryViewType;
+
+  // this constructor provided for MakeBuilder compatibility
+  BinaryViewBuilder(const std::shared_ptr<DataType>&, MemoryPool* pool);
+
+  explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        data_builder_(pool, alignment),
+        data_heap_builder_(pool, alignment) {}
+
+  /// Set the size for future preallocated data buffers.
+  ///
+  /// The default size is 32KB, so after each 32KB of string data appended to the builder
+  /// a new data buffer will be allocated. Adjust this to a larger value to decrease the
+  /// frequency of allocation, or to a smaller value to lower the overhead of each
+  /// allocation.
+  void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); }
+
+  /// The number of bytes which can be appended to this builder without allocating another
+  /// data buffer.
+  int64_t current_block_bytes_remaining() const {
+    return data_heap_builder_.current_remaining_bytes();
+  }
+
+  Status Append(const uint8_t* value, int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(true);
+    ARROW_ASSIGN_OR_RAISE(auto v,
+                          data_heap_builder_.Append</*Safe=*/true>(value, length));
+    data_builder_.UnsafeAppend(v);
+    return Status::OK();
+  }
+
+  Status Append(const char* value, int64_t length) {
+    return Append(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  Status Append(std::string_view value) {
+    return Append(value.data(), static_cast<int64_t>(value.size()));
+  }
+
+  /// \brief Append without checking capacity
+  ///
+  /// Builder should have been presized using Reserve() and ReserveData(),
+  /// respectively, and the value must not be larger than 2GB
+  void UnsafeAppend(const uint8_t* value, int64_t length) {
+    UnsafeAppendToBitmap(true);
+    auto v = data_heap_builder_.Append</*Safe=*/false>(value, length);
+    data_builder_.UnsafeAppend(v);
+  }
+
+  void UnsafeAppend(const char* value, int64_t length) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  void UnsafeAppend(const std::string& value) {
+    UnsafeAppend(value.c_str(), static_cast<int64_t>(value.size()));
+  }
+
+  void UnsafeAppend(std::string_view value) {
+    UnsafeAppend(value.data(), static_cast<int64_t>(value.size()));
+  }
+
+  /// \brief Ensures there is enough allocated available capacity in the
+  /// out-of-line data heap to append the indicated number of bytes without
+  /// additional allocations
+  Status ReserveData(int64_t length);
+
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
+    UnsafeSetNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a single null element
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
+    UnsafeAppendToBitmap(false);
+    return Status::OK();
+  }
+
+  /// \brief Append a empty element (length-0 inline string)
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  /// \brief Append several empty elements
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, BinaryViewType::c_type{});
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  void UnsafeAppendNull() {
+    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
+    UnsafeAppendToBitmap(false);
+  }
+
+  void UnsafeAppendEmptyValue() {
+    data_builder_.UnsafeAppend(BinaryViewType::c_type{});
+    UnsafeAppendToBitmap(true);
+  }
+
+  /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies
+  /// the underlying out-of-line string memory to avoid memory lifetime issues
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override;
+
+  void Reset() override;
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  std::shared_ptr<DataType> type() const override { return binary_view(); }
+
+ protected:
+  TypedBufferBuilder<BinaryViewType::c_type> data_builder_;
+
+  // Accumulates out-of-line data in fixed-size chunks which are then attached
+  // to the resulting ArrayData
+  internal::StringHeapBuilder data_heap_builder_;
+};
+
+class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder {
+ public:
+  using BinaryViewBuilder::BinaryViewBuilder;
+  std::shared_ptr<DataType> type() const override { return utf8_view(); }
+};
+
+// ----------------------------------------------------------------------
+// FixedSizeBinaryBuilder
+
+class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = FixedSizeBinaryType;
+
+  explicit FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
+                                  MemoryPool* pool = default_memory_pool(),
+                                  int64_t alignment = kDefaultBufferAlignment);
+
+  Status Append(const uint8_t* value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(value);
+    return Status::OK();
+  }
+
+  Status Append(const char* value) {
+    return Append(reinterpret_cast<const uint8_t*>(value));
+  }
+
+  Status Append(std::string_view view) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(view);
+    return Status::OK();
+  }
+
+  Status Append(const std::string& s) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(s);
+    return Status::OK();
+  }
+
+  Status Append(const Buffer& s) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(s);
+    return Status::OK();
+  }
+
+  Status Append(const std::shared_ptr<Buffer>& s) { return Append(*s); }
+
+  template <size_t NBYTES>
+  Status Append(const std::array<uint8_t, NBYTES>& value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(
+        std::string_view(reinterpret_cast<const char*>(value.data()), value.size()));
+    return Status::OK();
+  }
+
+  Status AppendValues(const uint8_t* data, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* validity,
+                      int64_t bitmap_offset);
+
+  Status AppendNull() final;
+  Status AppendNulls(int64_t length) final;
+
+  Status AppendEmptyValue() final;
+  Status AppendEmptyValues(int64_t length) final;
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    return AppendValues(
+        array.GetValues<uint8_t>(1, 0) + ((array.offset + offset) * byte_width_), length,
+        array.GetValues<uint8_t>(0, 0), array.offset + offset);
+  }
+
+  void UnsafeAppend(const uint8_t* value) {
+    UnsafeAppendToBitmap(true);
+    if (ARROW_PREDICT_TRUE(byte_width_ > 0)) {
+      byte_builder_.UnsafeAppend(value, byte_width_);
+    }
+  }
+
+  void UnsafeAppend(const char* value) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value));
+  }
+
+  void UnsafeAppend(std::string_view value) {
+#ifndef NDEBUG
+    CheckValueSize(static_cast<size_t>(value.size()));
+#endif
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value.data()));
+  }
+
+  void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); }
+
+  void UnsafeAppend(const std::shared_ptr<Buffer>& s) { UnsafeAppend(*s); }
+
+  void UnsafeAppendNull() {
+    UnsafeAppendToBitmap(false);
+    byte_builder_.UnsafeAppend(/*num_copies=*/byte_width_, 0);
+  }
+
+  Status ValidateOverflow(int64_t new_bytes) const {
+    auto new_size = byte_builder_.length() + new_bytes;
+    if (ARROW_PREDICT_FALSE(new_size > memory_limit())) {
+      return Status::CapacityError("array cannot contain more than ", memory_limit(),
+                                   " bytes, have ", new_size);
+    } else {
+      return Status::OK();
+    }
+  }
+
+  /// \brief Ensures there is enough allocated capacity to append the indicated
+  /// number of bytes to the value data buffer without additional allocations
+  Status ReserveData(int64_t elements) {
+    ARROW_RETURN_NOT_OK(ValidateOverflow(elements));
+    return byte_builder_.Reserve(elements);
+  }
+
+  void Reset() override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<FixedSizeBinaryArray>* out) { return FinishTyped(out); }
+
+  /// \return size of values buffer so far
+  int64_t value_data_length() const { return byte_builder_.length(); }
+
+  int32_t byte_width() const { return byte_width_; }
+
+  /// Temporary access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  const uint8_t* GetValue(int64_t i) const;
+
+  /// Temporary mutable access to a value.
+  ///
+  /// This pointer becomes invalid on the next modifying operation.
+  uint8_t* GetMutableValue(int64_t i) {
+    uint8_t* data_ptr = byte_builder_.mutable_data();
+    return data_ptr + i * byte_width_;
+  }
+
+  /// Temporary mutable access to a value.
+  ///
+  /// This view becomes invalid on the next modifying operation.
+  std::string_view GetView(int64_t i) const;
+
+  /// Advance builder without allocating nor writing any values
+  ///
+  /// The internal pointer is advanced by `length` values and the same number
+  /// of non-null entries are appended to the validity bitmap.
+  /// This method assumes that the `length` values were populated directly,
+  /// for example using `GetMutableValue`.
+  void UnsafeAdvance(int64_t length) {
+    byte_builder_.UnsafeAdvance(length * byte_width_);
+    UnsafeAppendToBitmap(length, true);
+  }
+
+  /// Advance builder without allocating nor writing any values
+  ///
+  /// The internal pointer is advanced by `length` values and the same number
+  /// of validity bits are appended to the validity bitmap.
+  /// This method assumes that the `length` values were populated directly,
+  /// for example using `GetMutableValue`.
+  void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
+    byte_builder_.UnsafeAdvance(length * byte_width_);
+    UnsafeAppendToBitmap(validity, valid_bits_offset, length);
+  }
+
+  static constexpr int64_t memory_limit() {
+    return std::numeric_limits<int64_t>::max() - 1;
+  }
+
+  std::shared_ptr<DataType> type() const override {
+    return fixed_size_binary(byte_width_);
+  }
+
+ protected:
+  int32_t byte_width_;
+  BufferBuilder byte_builder_;
+
+  void CheckValueSize(int64_t size);
+};
+
+/// @}
+
+// ----------------------------------------------------------------------
+// Chunked builders: build a sequence of BinaryArray or StringArray that are
+// limited to a particular size (to the upper limit of 2GB)
+
+namespace internal {
+
+class ARROW_EXPORT ChunkedBinaryBuilder {
+ public:
+  explicit ChunkedBinaryBuilder(int32_t max_chunk_value_length,
+                                MemoryPool* pool = default_memory_pool());
+
+  ChunkedBinaryBuilder(int32_t max_chunk_value_length, int32_t max_chunk_length,
+                       MemoryPool* pool = default_memory_pool());
+
+  virtual ~ChunkedBinaryBuilder() = default;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    if (ARROW_PREDICT_FALSE(length + builder_->value_data_length() >
+                            max_chunk_value_length_)) {
+      if (builder_->value_data_length() == 0) {
+        // The current item is larger than max_chunk_size_;
+        // this chunk will be oversize and hold *only* this item
+        ARROW_RETURN_NOT_OK(builder_->Append(value, length));
+        return NextChunk();
+      }
+      // The current item would cause builder_->value_data_length() to exceed
+      // max_chunk_size_, so finish this chunk and append the current item to the next
+      // chunk
+      ARROW_RETURN_NOT_OK(NextChunk());
+      return Append(value, length);
+    }
+
+    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
+      // The current item would cause builder_->length() to exceed max_chunk_length_, so
+      // finish this chunk and append the current item to the next chunk
+      ARROW_RETURN_NOT_OK(NextChunk());
+    }
+
+    return builder_->Append(value, length);
+  }
+
+  Status Append(std::string_view value) {
+    return Append(reinterpret_cast<const uint8_t*>(value.data()),
+                  static_cast<int32_t>(value.size()));
+  }
+
+  Status AppendNull() {
+    if (ARROW_PREDICT_FALSE(builder_->length() == max_chunk_length_)) {
+      ARROW_RETURN_NOT_OK(NextChunk());
+    }
+    return builder_->AppendNull();
+  }
+
+  Status Reserve(int64_t values);
+
+  virtual Status Finish(ArrayVector* out);
+
+ protected:
+  Status NextChunk();
+
+  // maximum total character data size per chunk
+  int64_t max_chunk_value_length_;
+
+  // maximum elements allowed per chunk
+  int64_t max_chunk_length_ = kListMaximumElements;
+
+  // when Reserve() would cause builder_ to exceed its max_chunk_length_,
+  // add to extra_capacity_ instead and wait to reserve until the next chunk
+  int64_t extra_capacity_ = 0;
+
+  std::unique_ptr<BinaryBuilder> builder_;
+  std::vector<std::shared_ptr<Array>> chunks_;
+};
+
+class ARROW_EXPORT ChunkedStringBuilder : public ChunkedBinaryBuilder {
+ public:
+  using ChunkedBinaryBuilder::ChunkedBinaryBuilder;
+
+  Status Finish(ArrayVector* out) override;
+};
+
+}  // namespace internal
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_decimal.h b/pyarrow/include/arrow/array/builder_decimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0bf0a04220842cceada0d0754ad6be4e41a3093
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_decimal.h
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/array_decimal.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/data.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup numeric-builders
+///
+/// @{
+
+class ARROW_EXPORT Decimal32Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal32Type;
+  using ValueType = Decimal32;
+
+  explicit Decimal32Builder(const std::shared_ptr<DataType>& type,
+                            MemoryPool* pool = default_memory_pool(),
+                            int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(Decimal32 val);
+  void UnsafeAppend(Decimal32 val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal32Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal32Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal64Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal64Type;
+  using ValueType = Decimal64;
+
+  explicit Decimal64Builder(const std::shared_ptr<DataType>& type,
+                            MemoryPool* pool = default_memory_pool(),
+                            int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(Decimal64 val);
+  void UnsafeAppend(Decimal64 val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal64Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal64Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal128Type;
+  using ValueType = Decimal128;
+
+  explicit Decimal128Builder(const std::shared_ptr<DataType>& type,
+                             MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(Decimal128 val);
+  void UnsafeAppend(Decimal128 val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal128Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal128Type> decimal_type_;
+};
+
+class ARROW_EXPORT Decimal256Builder : public FixedSizeBinaryBuilder {
+ public:
+  using TypeClass = Decimal256Type;
+  using ValueType = Decimal256;
+
+  explicit Decimal256Builder(const std::shared_ptr<DataType>& type,
+                             MemoryPool* pool = default_memory_pool(),
+                             int64_t alignment = kDefaultBufferAlignment);
+
+  using FixedSizeBinaryBuilder::Append;
+  using FixedSizeBinaryBuilder::AppendValues;
+  using FixedSizeBinaryBuilder::Reset;
+
+  Status Append(const Decimal256& val);
+  void UnsafeAppend(const Decimal256& val);
+  void UnsafeAppend(std::string_view val);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<Decimal256Array>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override { return decimal_type_; }
+
+ protected:
+  std::shared_ptr<Decimal256Type> decimal_type_;
+};
+
+using DecimalBuilder = Decimal128Builder;
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_dict.h b/pyarrow/include/arrow/array/builder_dict.h
new file mode 100644
index 0000000000000000000000000000000000000000..116c82049eea9ea49a716452090297f57be4eb6b
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_dict.h
@@ -0,0 +1,728 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_binary.h"
+#include "arrow/array/builder_adaptive.h"   // IWYU pragma: export
+#include "arrow/array/builder_base.h"       // IWYU pragma: export
+#include "arrow/array/builder_primitive.h"  // IWYU pragma: export
+#include "arrow/array/data.h"
+#include "arrow/array/util.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Dictionary builder
+
+namespace internal {
+
+template <typename T, typename Enable = void>
+struct DictionaryValue {
+  using type = typename T::c_type;
+  using PhysicalType = T;
+};
+
+template <typename T>
+struct DictionaryValue<T, enable_if_base_binary<T>> {
+  using type = std::string_view;
+  using PhysicalType =
+      typename std::conditional<std::is_same<typename T::offset_type, int32_t>::value,
+                                BinaryType, LargeBinaryType>::type;
+};
+
+template <typename T>
+struct DictionaryValue<T, enable_if_binary_view_like<T>> {
+  using type = std::string_view;
+  using PhysicalType = BinaryViewType;
+};
+
+template <typename T>
+struct DictionaryValue<T, enable_if_fixed_size_binary<T>> {
+  using type = std::string_view;
+  using PhysicalType = BinaryType;
+};
+
+class ARROW_EXPORT DictionaryMemoTable {
+ public:
+  DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<DataType>& type);
+  DictionaryMemoTable(MemoryPool* pool, const std::shared_ptr<Array>& dictionary);
+  ~DictionaryMemoTable();
+
+  Status GetArrayData(int64_t start_offset, std::shared_ptr<ArrayData>* out);
+
+  /// \brief Insert new memo values
+  Status InsertValues(const Array& values);
+
+  int32_t size() const;
+
+  template <typename T>
+  Status GetOrInsert(typename DictionaryValue<T>::type value, int32_t* out) {
+    // We want to keep the DictionaryMemoTable implementation private, also we can't
+    // use extern template classes because of compiler issues (MinGW?).  Instead,
+    // we expose explicit function overrides for each supported physical type.
+    const typename DictionaryValue<T>::PhysicalType* physical_type = NULLPTR;
+    return GetOrInsert(physical_type, value, out);
+  }
+
+ private:
+  Status GetOrInsert(const BooleanType*, bool value, int32_t* out);
+  Status GetOrInsert(const Int8Type*, int8_t value, int32_t* out);
+  Status GetOrInsert(const Int16Type*, int16_t value, int32_t* out);
+  Status GetOrInsert(const Int32Type*, int32_t value, int32_t* out);
+  Status GetOrInsert(const Int64Type*, int64_t value, int32_t* out);
+  Status GetOrInsert(const UInt8Type*, uint8_t value, int32_t* out);
+  Status GetOrInsert(const UInt16Type*, uint16_t value, int32_t* out);
+  Status GetOrInsert(const UInt32Type*, uint32_t value, int32_t* out);
+  Status GetOrInsert(const UInt64Type*, uint64_t value, int32_t* out);
+  Status GetOrInsert(const DurationType*, int64_t value, int32_t* out);
+  Status GetOrInsert(const TimestampType*, int64_t value, int32_t* out);
+  Status GetOrInsert(const Date32Type*, int32_t value, int32_t* out);
+  Status GetOrInsert(const Date64Type*, int64_t value, int32_t* out);
+  Status GetOrInsert(const Time32Type*, int32_t value, int32_t* out);
+  Status GetOrInsert(const Time64Type*, int64_t value, int32_t* out);
+  Status GetOrInsert(const MonthDayNanoIntervalType*,
+                     MonthDayNanoIntervalType::MonthDayNanos value, int32_t* out);
+  Status GetOrInsert(const DayTimeIntervalType*,
+                     DayTimeIntervalType::DayMilliseconds value, int32_t* out);
+  Status GetOrInsert(const MonthIntervalType*, int32_t value, int32_t* out);
+  Status GetOrInsert(const FloatType*, float value, int32_t* out);
+  Status GetOrInsert(const DoubleType*, double value, int32_t* out);
+
+  Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out);
+  Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out);
+  Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out);
+
+  class DictionaryMemoTableImpl;
+  std::unique_ptr<DictionaryMemoTableImpl> impl_;
+};
+
+}  // namespace internal
+
+/// \addtogroup dictionary-builders
+///
+/// @{
+
+namespace internal {
+
+/// \brief Array builder for created encoded DictionaryArray from
+/// dense array
+///
+/// Unlike other builders, dictionary builder does not completely
+/// reset the state on Finish calls.
+template <typename BuilderType, typename T>
+class DictionaryBuilderBase : public ArrayBuilder {
+ public:
+  using TypeClass = DictionaryType;
+  using Value = typename DictionaryValue<T>::type;
+
+  // WARNING: the type given below is the value type, not the DictionaryType.
+  // The DictionaryType is instantiated on the Finish() call.
+  template <typename B = BuilderType, typename T1 = T>
+  DictionaryBuilderBase(uint8_t start_int_size,
+                        enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
+                                        !is_fixed_size_binary_type<T1>::value,
+                                    const std::shared_ptr<DataType>&>
+                            value_type,
+                        MemoryPool* pool = default_memory_pool(),
+                        int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+        delta_offset_(0),
+        byte_width_(-1),
+        indices_builder_(start_int_size, pool, alignment),
+        value_type_(value_type) {}
+
+  template <typename T1 = T>
+  explicit DictionaryBuilderBase(
+      enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
+          value_type,
+      MemoryPool* pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+        delta_offset_(0),
+        byte_width_(-1),
+        indices_builder_(pool, alignment),
+        value_type_(value_type) {}
+
+  template <typename T1 = T>
+  explicit DictionaryBuilderBase(
+      const std::shared_ptr<DataType>& index_type,
+      enable_if_t<!is_fixed_size_binary_type<T1>::value, const std::shared_ptr<DataType>&>
+          value_type,
+      MemoryPool* pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+        delta_offset_(0),
+        byte_width_(-1),
+        indices_builder_(index_type, pool, alignment),
+        value_type_(value_type) {}
+
+  template <typename B = BuilderType, typename T1 = T>
+  DictionaryBuilderBase(uint8_t start_int_size,
+                        enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value &&
+                                        is_fixed_size_binary_type<T1>::value,
+                                    const std::shared_ptr<DataType>&>
+                            value_type,
+                        MemoryPool* pool = default_memory_pool(),
+                        int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+        delta_offset_(0),
+        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
+        indices_builder_(start_int_size, pool, alignment),
+        value_type_(value_type) {}
+
+  template <typename T1 = T>
+  explicit DictionaryBuilderBase(
+      enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
+      MemoryPool* pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+        delta_offset_(0),
+        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
+        indices_builder_(pool, alignment),
+        value_type_(value_type) {}
+
+  template <typename T1 = T>
+  explicit DictionaryBuilderBase(
+      const std::shared_ptr<DataType>& index_type,
+      enable_if_fixed_size_binary<T1, const std::shared_ptr<DataType>&> value_type,
+      MemoryPool* pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, value_type)),
+        delta_offset_(0),
+        byte_width_(static_cast<const T1&>(*value_type).byte_width()),
+        indices_builder_(index_type, pool, alignment),
+        value_type_(value_type) {}
+
+  template <typename T1 = T>
+  explicit DictionaryBuilderBase(
+      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool())
+      : DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
+
+  // This constructor doesn't check for errors. Use InsertMemoValues instead.
+  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+                                 MemoryPool* pool = default_memory_pool(),
+                                 int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        memo_table_(new internal::DictionaryMemoTable(pool, dictionary)),
+        delta_offset_(0),
+        byte_width_(-1),
+        indices_builder_(pool, alignment),
+        value_type_(dictionary->type()) {}
+
+  ~DictionaryBuilderBase() override = default;
+
+  /// \brief The current number of entries in the dictionary
+  int64_t dictionary_length() const { return memo_table_->size(); }
+
+  /// \brief The value byte width (for FixedSizeBinaryType)
+  template <typename T1 = T>
+  enable_if_fixed_size_binary<T1, int32_t> byte_width() const {
+    return byte_width_;
+  }
+
+  /// \brief Append a scalar value
+  Status Append(Value value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+
+    int32_t memo_index;
+    ARROW_RETURN_NOT_OK(memo_table_->GetOrInsert<T>(value, &memo_index));
+    ARROW_RETURN_NOT_OK(indices_builder_.Append(memo_index));
+    length_ += 1;
+
+    return Status::OK();
+  }
+
+  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
+  template <typename T1 = T>
+  enable_if_fixed_size_binary<T1, Status> Append(const uint8_t* value) {
+    return Append(std::string_view(reinterpret_cast<const char*>(value), byte_width_));
+  }
+
+  /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
+  template <typename T1 = T>
+  enable_if_fixed_size_binary<T1, Status> Append(const char* value) {
+    return Append(std::string_view(value, byte_width_));
+  }
+
+  /// \brief Append a string (only for binary types)
+  template <typename T1 = T>
+  enable_if_binary_like<T1, Status> Append(const uint8_t* value, int32_t length) {
+    return Append(reinterpret_cast<const char*>(value), length);
+  }
+
+  /// \brief Append a string (only for binary types)
+  template <typename T1 = T>
+  enable_if_binary_like<T1, Status> Append(const char* value, int32_t length) {
+    return Append(std::string_view(value, length));
+  }
+
+  /// \brief Append a string (only for string types)
+  template <typename T1 = T>
+  enable_if_string_like<T1, Status> Append(const char* value, int32_t length) {
+    return Append(std::string_view(value, length));
+  }
+
+  /// \brief Append a decimal (only for Decimal32/64/128/256 Type)
+  template <typename T1 = T, typename CType = typename TypeTraits<T1>::CType>
+  enable_if_decimal<T1, Status> Append(const CType& value) {
+    auto bytes = value.ToBytes();
+    return Append(bytes.data(), static_cast<int32_t>(bytes.size()));
+  }
+
+  /// \brief Append a scalar null value
+  Status AppendNull() final {
+    length_ += 1;
+    null_count_ += 1;
+
+    return indices_builder_.AppendNull();
+  }
+
+  Status AppendNulls(int64_t length) final {
+    length_ += length;
+    null_count_ += length;
+
+    return indices_builder_.AppendNulls(length);
+  }
+
+  Status AppendEmptyValue() final {
+    length_ += 1;
+
+    return indices_builder_.AppendEmptyValue();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    length_ += length;
+
+    return indices_builder_.AppendEmptyValues(length);
+  }
+
+  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override {
+    if (!scalar.is_valid) return AppendNulls(n_repeats);
+
+    const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*scalar.type);
+    const DictionaryScalar& dict_scalar =
+        internal::checked_cast<const DictionaryScalar&>(scalar);
+    const auto& dict = internal::checked_cast<const typename TypeTraits<T>::ArrayType&>(
+        *dict_scalar.value.dictionary);
+    ARROW_RETURN_NOT_OK(Reserve(n_repeats));
+    switch (dict_ty.index_type()->id()) {
+      case Type::UINT8:
+        return AppendScalarImpl<UInt8Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::INT8:
+        return AppendScalarImpl<Int8Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::UINT16:
+        return AppendScalarImpl<UInt16Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::INT16:
+        return AppendScalarImpl<Int16Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::UINT32:
+        return AppendScalarImpl<UInt32Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::INT32:
+        return AppendScalarImpl<Int32Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::UINT64:
+        return AppendScalarImpl<UInt64Type>(dict, *dict_scalar.value.index, n_repeats);
+      case Type::INT64:
+        return AppendScalarImpl<Int64Type>(dict, *dict_scalar.value.index, n_repeats);
+      default:
+        return Status::TypeError("Invalid index type: ", dict_ty);
+    }
+    return Status::OK();
+  }
+
+  Status AppendScalars(const ScalarVector& scalars) override {
+    for (const auto& scalar : scalars) {
+      ARROW_RETURN_NOT_OK(AppendScalar(*scalar, /*n_repeats=*/1));
+    }
+    return Status::OK();
+  }
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
+    // Visit the indices and insert the unpacked values.
+    const auto& dict_ty = internal::checked_cast<const DictionaryType&>(*array.type);
+    // See if possible to avoid using ToArrayData here
+    const typename TypeTraits<T>::ArrayType dict(array.dictionary().ToArrayData());
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    switch (dict_ty.index_type()->id()) {
+      case Type::UINT8:
+        return AppendArraySliceImpl<uint8_t>(dict, array, offset, length);
+      case Type::INT8:
+        return AppendArraySliceImpl<int8_t>(dict, array, offset, length);
+      case Type::UINT16:
+        return AppendArraySliceImpl<uint16_t>(dict, array, offset, length);
+      case Type::INT16:
+        return AppendArraySliceImpl<int16_t>(dict, array, offset, length);
+      case Type::UINT32:
+        return AppendArraySliceImpl<uint32_t>(dict, array, offset, length);
+      case Type::INT32:
+        return AppendArraySliceImpl<int32_t>(dict, array, offset, length);
+      case Type::UINT64:
+        return AppendArraySliceImpl<uint64_t>(dict, array, offset, length);
+      case Type::INT64:
+        return AppendArraySliceImpl<int64_t>(dict, array, offset, length);
+      default:
+        return Status::TypeError("Invalid index type: ", dict_ty);
+    }
+    return Status::OK();
+  }
+
+  /// \brief Insert values into the dictionary's memo, but do not append any
+  /// indices. Can be used to initialize a new builder with known dictionary
+  /// values
+  /// \param[in] values dictionary values to add to memo. Type must match
+  /// builder type
+  Status InsertMemoValues(const Array& values) {
+    return memo_table_->InsertValues(values);
+  }
+
+  /// \brief Append a whole dense array to the builder
+  template <typename T1 = T>
+  enable_if_t<!is_fixed_size_binary_type<T1>::value, Status> AppendArray(
+      const Array& array) {
+    using ArrayType = typename TypeTraits<T>::ArrayType;
+
+#ifndef NDEBUG
+    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
+        value_type_, array, "Wrong value type of array to be appended"));
+#endif
+
+    const auto& concrete_array = static_cast<const ArrayType&>(array);
+    for (int64_t i = 0; i < array.length(); i++) {
+      if (array.IsNull(i)) {
+        ARROW_RETURN_NOT_OK(AppendNull());
+      } else {
+        ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename T1 = T>
+  enable_if_fixed_size_binary<T1, Status> AppendArray(const Array& array) {
+#ifndef NDEBUG
+    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
+        value_type_, array, "Wrong value type of array to be appended"));
+#endif
+
+    const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
+    for (int64_t i = 0; i < array.length(); i++) {
+      if (array.IsNull(i)) {
+        ARROW_RETURN_NOT_OK(AppendNull());
+      } else {
+        ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
+      }
+    }
+    return Status::OK();
+  }
+
+  void Reset() override {
+    // Perform a partial reset. Call ResetFull to also reset the accumulated
+    // dictionary values
+    ArrayBuilder::Reset();
+    indices_builder_.Reset();
+  }
+
+  /// \brief Reset and also clear accumulated dictionary values in memo table
+  void ResetFull() {
+    Reset();
+    memo_table_.reset(new internal::DictionaryMemoTable(pool_, value_type_));
+  }
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
+    capacity_ = indices_builder_.capacity();
+    return Status::OK();
+  }
+
+  /// \brief Return dictionary indices and a delta dictionary since the last
+  /// time that Finish or FinishDelta were called, and reset state of builder
+  /// (except the memo table)
+  Status FinishDelta(std::shared_ptr<Array>* out_indices,
+                     std::shared_ptr<Array>* out_delta) {
+    std::shared_ptr<ArrayData> indices_data;
+    std::shared_ptr<ArrayData> delta_data;
+    ARROW_RETURN_NOT_OK(FinishWithDictOffset(delta_offset_, &indices_data, &delta_data));
+    *out_indices = MakeArray(indices_data);
+    *out_delta = MakeArray(delta_data);
+    return Status::OK();
+  }
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override {
+    return ::arrow::dictionary(indices_builder_.type(), value_type_);
+  }
+
+ protected:
+  template <typename c_type>
+  Status AppendArraySliceImpl(const typename TypeTraits<T>::ArrayType& dict,
+                              const ArraySpan& array, int64_t offset, int64_t length) {
+    const c_type* values = array.GetValues<c_type>(1) + offset;
+    return VisitBitBlocks(
+        array.buffers[0].data, array.offset + offset, length,
+        [&](const int64_t position) {
+          const int64_t index = static_cast<int64_t>(values[position]);
+          if (dict.IsValid(index)) {
+            return Append(dict.GetView(index));
+          }
+          return AppendNull();
+        },
+        [&]() { return AppendNull(); });
+  }
+
+  template <typename IndexType>
+  Status AppendScalarImpl(const typename TypeTraits<T>::ArrayType& dict,
+                          const Scalar& index_scalar, int64_t n_repeats) {
+    using ScalarType = typename TypeTraits<IndexType>::ScalarType;
+    const auto index = internal::checked_cast<const ScalarType&>(index_scalar).value;
+    if (index_scalar.is_valid && dict.IsValid(index)) {
+      const auto& value = dict.GetView(index);
+      for (int64_t i = 0; i < n_repeats; i++) {
+        ARROW_RETURN_NOT_OK(Append(value));
+      }
+      return Status::OK();
+    }
+    return AppendNulls(n_repeats);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    std::shared_ptr<ArrayData> dictionary;
+    ARROW_RETURN_NOT_OK(FinishWithDictOffset(/*offset=*/0, out, &dictionary));
+
+    // Set type of array data to the right dictionary type
+    (*out)->type = type();
+    (*out)->dictionary = dictionary;
+    return Status::OK();
+  }
+
+  Status FinishWithDictOffset(int64_t dict_offset,
+                              std::shared_ptr<ArrayData>* out_indices,
+                              std::shared_ptr<ArrayData>* out_dictionary) {
+    // Finalize indices array
+    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out_indices));
+
+    // Generate dictionary array from hash table contents
+    ARROW_RETURN_NOT_OK(memo_table_->GetArrayData(dict_offset, out_dictionary));
+    delta_offset_ = memo_table_->size();
+
+    // Update internals for further uses of this DictionaryBuilder
+    ArrayBuilder::Reset();
+    return Status::OK();
+  }
+
+  std::unique_ptr<DictionaryMemoTable> memo_table_;
+
+  // The size of the dictionary memo at last invocation of Finish, to use in
+  // FinishDelta for computing dictionary deltas
+  int32_t delta_offset_;
+
+  // Only used for FixedSizeBinaryType
+  int32_t byte_width_;
+
+  BuilderType indices_builder_;
+  std::shared_ptr<DataType> value_type_;
+};
+
+template <typename BuilderType>
+class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
+ public:
+  template <typename B = BuilderType>
+  DictionaryBuilderBase(
+      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
+          start_int_size,
+      const std::shared_ptr<DataType>& value_type,
+      MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+
+  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& value_type,
+                                 MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), indices_builder_(pool) {}
+
+  explicit DictionaryBuilderBase(const std::shared_ptr<DataType>& index_type,
+                                 const std::shared_ptr<DataType>& value_type,
+                                 MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), indices_builder_(index_type, pool) {}
+
+  template <typename B = BuilderType>
+  explicit DictionaryBuilderBase(
+      enable_if_t<std::is_base_of<AdaptiveIntBuilderBase, B>::value, uint8_t>
+          start_int_size,
+      MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), indices_builder_(start_int_size, pool) {}
+
+  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), indices_builder_(pool) {}
+
+  explicit DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+                                 MemoryPool* pool = default_memory_pool())
+      : ArrayBuilder(pool), indices_builder_(pool) {}
+
+  /// \brief Append a scalar null value
+  Status AppendNull() final {
+    length_ += 1;
+    null_count_ += 1;
+
+    return indices_builder_.AppendNull();
+  }
+
+  Status AppendNulls(int64_t length) final {
+    length_ += length;
+    null_count_ += length;
+
+    return indices_builder_.AppendNulls(length);
+  }
+
+  Status AppendEmptyValue() final {
+    length_ += 1;
+
+    return indices_builder_.AppendEmptyValue();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    length_ += length;
+
+    return indices_builder_.AppendEmptyValues(length);
+  }
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array) {
+#ifndef NDEBUG
+    ARROW_RETURN_NOT_OK(ArrayBuilder::CheckArrayType(
+        Type::NA, array, "Wrong value type of array to be appended"));
+#endif
+    for (int64_t i = 0; i < array.length(); i++) {
+      ARROW_RETURN_NOT_OK(AppendNull());
+    }
+    return Status::OK();
+  }
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+
+    ARROW_RETURN_NOT_OK(indices_builder_.Resize(capacity));
+    capacity_ = indices_builder_.capacity();
+    return Status::OK();
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    ARROW_RETURN_NOT_OK(indices_builder_.FinishInternal(out));
+    (*out)->type = dictionary((*out)->type, null());
+    (*out)->dictionary = NullArray(0).data();
+    return Status::OK();
+  }
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
+
+  std::shared_ptr<DataType> type() const override {
+    return ::arrow::dictionary(indices_builder_.type(), null());
+  }
+
+ protected:
+  BuilderType indices_builder_;
+};
+
+}  // namespace internal
+
+/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
+/// smallest index size that can accommodate the dictionary indices
+template <typename T>
+class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
+ public:
+  using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
+  using BASE::BASE;
+
+  /// \brief Append dictionary indices directly without modifying memo
+  ///
+  /// NOTE: Experimental API
+  Status AppendIndices(const int64_t* values, int64_t length,
+                       const uint8_t* valid_bytes = NULLPTR) {
+    int64_t null_count_before = this->indices_builder_.null_count();
+    ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
+    this->capacity_ = this->indices_builder_.capacity();
+    this->length_ += length;
+    this->null_count_ += this->indices_builder_.null_count() - null_count_before;
+    return Status::OK();
+  }
+};
+
+/// \brief A DictionaryArray builder that always returns int32 dictionary
+/// indices so that data cast to dictionary form will have a consistent index
+/// type, e.g. for creating a ChunkedArray
+template <typename T>
+class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
+ public:
+  using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
+  using BASE::BASE;
+
+  /// \brief Append dictionary indices directly without modifying memo
+  ///
+  /// NOTE: Experimental API
+  Status AppendIndices(const int32_t* values, int64_t length,
+                       const uint8_t* valid_bytes = NULLPTR) {
+    int64_t null_count_before = this->indices_builder_.null_count();
+    ARROW_RETURN_NOT_OK(this->indices_builder_.AppendValues(values, length, valid_bytes));
+    this->capacity_ = this->indices_builder_.capacity();
+    this->length_ += length;
+    this->null_count_ += this->indices_builder_.null_count() - null_count_before;
+    return Status::OK();
+  }
+};
+
+// ----------------------------------------------------------------------
+// Binary / Unicode builders
+// (compatibility aliases; those used to be derived classes with additional
+//  Append() overloads, but they have been folded into DictionaryBuilderBase)
+
+using BinaryDictionaryBuilder = DictionaryBuilder<BinaryType>;
+using StringDictionaryBuilder = DictionaryBuilder<StringType>;
+using BinaryDictionary32Builder = Dictionary32Builder<BinaryType>;
+using StringDictionary32Builder = Dictionary32Builder<StringType>;
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_nested.h b/pyarrow/include/arrow/array/builder_nested.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdbeb0cd7d17b40b929d2ba73dba6f425d01c968
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_nested.h
@@ -0,0 +1,836 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup nested-builders
+///
+/// @{
+
+// ----------------------------------------------------------------------
+// VarLengthListLikeBuilder
+
+template <typename TYPE>
+class VarLengthListLikeBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename TypeClass::offset_type;
+
+  /// Use this constructor to incrementally build the value array along with offsets and
+  /// null bitmap.
+  VarLengthListLikeBuilder(MemoryPool* pool,
+                           const std::shared_ptr<ArrayBuilder>& value_builder,
+                           const std::shared_ptr<DataType>& type,
+                           int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        offsets_builder_(pool, alignment),
+        value_builder_(value_builder),
+        value_field_(type->field(0)->WithType(NULLPTR)) {}
+
+  VarLengthListLikeBuilder(MemoryPool* pool,
+                           const std::shared_ptr<ArrayBuilder>& value_builder,
+                           int64_t alignment = kDefaultBufferAlignment)
+      : VarLengthListLikeBuilder(pool, value_builder,
+                                 std::make_shared<TYPE>(value_builder->type()),
+                                 alignment) {}
+
+  ~VarLengthListLikeBuilder() override = default;
+
+  Status Resize(int64_t capacity) override {
+    if (ARROW_PREDICT_FALSE(capacity > maximum_elements())) {
+      return Status::CapacityError(type_name(),
+                                   " array cannot reserve space for more than ",
+                                   maximum_elements(), " got ", capacity);
+    }
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+
+    // One more than requested for list offsets
+    const int64_t offsets_capacity =
+        is_list_view(TYPE::type_id) ? capacity : capacity + 1;
+    ARROW_RETURN_NOT_OK(offsets_builder_.Resize(offsets_capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+
+  void Reset() override {
+    ArrayBuilder::Reset();
+    offsets_builder_.Reset();
+    value_builder_->Reset();
+  }
+
+  /// \brief Start a new variable-length list slot
+  ///
+  /// This function should be called before appending elements to the
+  /// value builder. Elements appended to the value builder before this function
+  /// is called for the first time, will not be members of any list value.
+  ///
+  /// After this function is called, list_length elements SHOULD be appended to
+  /// the values builder. If this contract is violated, the behavior is defined by
+  /// the concrete builder implementation and SHOULD NOT be relied upon unless
+  /// the caller is specifically building a [Large]List or [Large]ListView array.
+  ///
+  /// For [Large]List arrays, the list slot length will be the number of elements
+  /// appended to the values builder before the next call to Append* or Finish. For
+  /// [Large]ListView arrays, the list slot length will be exactly list_length, but if
+  /// Append* is called before at least list_length elements are appended to the values
+  /// builder, the current list slot will share elements with the next list
+  /// slots or an invalid [Large]ListView array will be generated because there
+  /// aren't enough elements in the values builder to fill the list slots.
+  ///
+  /// If you're building a [Large]List and don't need to be compatible
+  /// with [Large]ListView, then `BaseListBuilder::Append(bool is_valid)`
+  /// is a simpler API.
+  ///
+  /// \pre if is_valid is false, list_length MUST be 0
+  /// \param is_valid Whether the new list slot is valid
+  /// \param list_length The number of elements in the list
+  Status Append(bool is_valid, int64_t list_length) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    assert(is_valid || list_length == 0);
+    UnsafeAppendToBitmap(is_valid);
+    UnsafeAppendDimensions(/*offset=*/value_builder_->length(), /*size=*/list_length);
+    return Status::OK();
+  }
+
+  Status AppendNull() final {
+    // Append() a null list slot with list_length=0.
+    //
+    // When building [Large]List arrays, elements being appended to the values builder
+    // before the next call to Append* or Finish will extend the list slot length, but
+    // that is totally fine because list arrays admit non-empty null list slots.
+    //
+    // In the case of [Large]ListViews that's not a problem either because the
+    // list slot length remains zero.
+    return Append(false, 0);
+  }
+
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(length, false);
+    UnsafeAppendEmptyDimensions(/*num_values=*/length);
+    return Status::OK();
+  }
+
+  /// \brief Append an empty list slot
+  ///
+  /// \post Another call to Append* or Finish should be made before appending to
+  /// the values builder to ensure list slot remains empty
+  Status AppendEmptyValue() final { return Append(true, 0); }
+
+  /// \brief Append an empty list slot
+  ///
+  /// \post Another call to Append* or Finish should be made before appending to
+  /// the values builder to ensure the last list slot remains empty
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(length, true);
+    UnsafeAppendEmptyDimensions(/*num_values=*/length);
+    return Status::OK();
+  }
+
+  /// \brief Vector append
+  ///
+  /// For list-array builders, the sizes are inferred from the offsets.
+  /// BaseListBuilder<T> provides an implementation that doesn't take sizes, but
+  /// this virtual function allows dispatching calls to both list-array and
+  /// list-view-array builders (which need the sizes)
+  ///
+  /// \param offsets The offsets of the variable-length lists
+  /// \param sizes The sizes of the variable-length lists
+  /// \param length The number of offsets, sizes, and validity bits to append
+  /// \param valid_bytes If passed, valid_bytes is of equal length to values,
+  /// and any zero byte will be considered as a null for that slot
+  virtual Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                              int64_t length, const uint8_t* valid_bytes) = 0;
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    const offset_type* offsets = array.GetValues<offset_type>(1);
+    [[maybe_unused]] const offset_type* sizes = NULLPTR;
+    if constexpr (is_list_view(TYPE::type_id)) {
+      sizes = array.GetValues<offset_type>(2);
+    }
+    static_assert(internal::may_have_validity_bitmap(TYPE::type_id));
+    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    for (int64_t row = offset; row < offset + length; row++) {
+      const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
+      int64_t size = 0;
+      if (is_valid) {
+        if constexpr (is_list_view(TYPE::type_id)) {
+          size = sizes[row];
+        } else {
+          size = offsets[row + 1] - offsets[row];
+        }
+      }
+      UnsafeAppendToBitmap(is_valid);
+      UnsafeAppendDimensions(/*offset=*/value_builder_->length(), size);
+      if (is_valid) {
+        ARROW_RETURN_NOT_OK(
+            value_builder_->AppendArraySlice(array.child_data[0], offsets[row], size));
+      }
+    }
+    return Status::OK();
+  }
+
+  Status ValidateOverflow(int64_t new_elements) const {
+    auto new_length = value_builder_->length() + new_elements;
+    if (ARROW_PREDICT_FALSE(new_length > maximum_elements())) {
+      return Status::CapacityError(type_name(), " array cannot contain more than ",
+                                   maximum_elements(), " elements, have ", new_elements);
+    } else {
+      return Status::OK();
+    }
+  }
+
+  ArrayBuilder* value_builder() const { return value_builder_.get(); }
+
+  // Cannot make this a static attribute because of linking issues
+  static constexpr int64_t maximum_elements() {
+    return std::numeric_limits<offset_type>::max() - 1;
+  }
+
+  std::shared_ptr<DataType> type() const override {
+    return std::make_shared<TYPE>(value_field_->WithType(value_builder_->type()));
+  }
+
+ private:
+  static constexpr const char* type_name() {
+    if constexpr (is_list_view(TYPE::type_id)) {
+      return "ListView";
+    } else {
+      return "List";
+    }
+  }
+
+ protected:
+  /// \brief Append dimensions for num_values empty list slots.
+  ///
+  /// ListViewBuilder overrides this to also append the sizes.
+  virtual void UnsafeAppendEmptyDimensions(int64_t num_values) {
+    const int64_t offset = value_builder_->length();
+    for (int64_t i = 0; i < num_values; ++i) {
+      offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+    }
+  }
+
+  /// \brief Append dimensions for a single list slot.
+  ///
+  /// ListViewBuilder overrides this to also append the size.
+  virtual void UnsafeAppendDimensions(int64_t offset, int64_t ARROW_ARG_UNUSED(size)) {
+    offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+  }
+
+  TypedBufferBuilder<offset_type> offsets_builder_;
+  std::shared_ptr<ArrayBuilder> value_builder_;
+  std::shared_ptr<Field> value_field_;
+};
+
+// ----------------------------------------------------------------------
+// ListBuilder / LargeListBuilder
+
+template <typename TYPE>
+class BaseListBuilder : public VarLengthListLikeBuilder<TYPE> {
+ private:
+  using BASE = VarLengthListLikeBuilder<TYPE>;
+
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename BASE::offset_type;
+
+  using BASE::BASE;
+
+  using BASE::Append;
+
+  ~BaseListBuilder() override = default;
+
+  /// \brief Start a new variable-length list slot
+  ///
+  /// This function should be called before beginning to append elements to the
+  /// value builder
+  Status Append(bool is_valid = true) {
+    // The value_length parameter to BASE::Append(bool, int64_t) is ignored when
+    // building a list array, so we can pass 0 here.
+    return BASE::Append(is_valid, 0);
+  }
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const offset_type* offsets, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    ARROW_RETURN_NOT_OK(this->Reserve(length));
+    this->UnsafeAppendToBitmap(valid_bytes, length);
+    this->offsets_builder_.UnsafeAppend(offsets, length);
+    return Status::OK();
+  }
+
+  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                      int64_t length, const uint8_t* valid_bytes) final {
+    // Offsets are assumed to be valid, but the first length-1 sizes have to be
+    // consistent with the offsets to partially rule out the possibility that the
+    // caller is passing sizes that could work if building a list-view, but don't
+    // work on building a list that requires offsets to be non-decreasing.
+    //
+    // CAUTION: the last size element (`sizes[length - 1]`) is not
+    // validated and could be inconsistent with the offsets given in a
+    // subsequent call to AppendValues.
+#ifndef NDEBUG
+    if (sizes) {
+      for (int64_t i = 0; i < length - 1; ++i) {
+        if (ARROW_PREDICT_FALSE(offsets[i] != offsets[i + 1] - sizes[i])) {
+          if (!valid_bytes || valid_bytes[i]) {
+            return Status::Invalid(
+                "BaseListBuilder: sizes are inconsistent with offsets provided");
+          }
+        }
+      }
+    }
+#endif
+    return AppendValues(offsets, length, valid_bytes);
+  }
+
+  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                      int64_t length) {
+    return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
+  }
+
+  Status AppendNextOffset() {
+    ARROW_RETURN_NOT_OK(this->ValidateOverflow(0));
+    const int64_t num_values = this->value_builder_->length();
+    return this->offsets_builder_.Append(static_cast<offset_type>(num_values));
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    ARROW_RETURN_NOT_OK(AppendNextOffset());
+
+    // Offset padding zeroed by BufferBuilder
+    std::shared_ptr<Buffer> offsets;
+    std::shared_ptr<Buffer> null_bitmap;
+    ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets));
+    ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap));
+
+    if (this->value_builder_->length() == 0) {
+      // Try to make sure we get a non-null values buffer (ARROW-2744)
+      ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
+    }
+
+    std::shared_ptr<ArrayData> items;
+    ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items));
+
+    *out = ArrayData::Make(this->type(), this->length_,
+                           {std::move(null_bitmap), std::move(offsets)},
+                           {std::move(items)}, this->null_count_);
+    this->Reset();
+    return Status::OK();
+  }
+};
+
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
+///
+/// To use this class, you must append values to the child array builder and use
+/// the Append function to delimit each distinct list value (once the values
+/// have been appended to the child array) or use the bulk API to append
+/// a sequence of offsets and null values.
+///
+/// A note on types.  Per arrow/type.h all types in the c++ implementation are
+/// logical so even though this class always builds list array, this can
+/// represent multiple different logical types.  If no logical type is provided
+/// at construction time, the class defaults to List<T> where t is taken from the
+/// value_builder/values that the object is constructed with.
+class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
+ public:
+  using BaseListBuilder::BaseListBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
+};
+
+/// \class LargeListBuilder
+/// \brief Builder class for large variable-length list array value types
+///
+/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
+class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
+ public:
+  using BaseListBuilder::BaseListBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
+};
+
+// ----------------------------------------------------------------------
+// ListViewBuilder / LargeListViewBuilder
+
+template <typename TYPE>
+class BaseListViewBuilder : public VarLengthListLikeBuilder<TYPE> {
+ private:
+  using BASE = VarLengthListLikeBuilder<TYPE>;
+
+ public:
+  using TypeClass = TYPE;
+  using offset_type = typename BASE::offset_type;
+
+  using BASE::BASE;
+
+  ~BaseListViewBuilder() override = default;
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(BASE::Resize(capacity));
+    return sizes_builder_.Resize(capacity);
+  }
+
+  void Reset() override {
+    BASE::Reset();
+    sizes_builder_.Reset();
+  }
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                      int64_t length, const uint8_t* valid_bytes) final {
+    ARROW_RETURN_NOT_OK(this->Reserve(length));
+    this->UnsafeAppendToBitmap(valid_bytes, length);
+    this->offsets_builder_.UnsafeAppend(offsets, length);
+    this->sizes_builder_.UnsafeAppend(sizes, length);
+    return Status::OK();
+  }
+
+  Status AppendValues(const offset_type* offsets, const offset_type* sizes,
+                      int64_t length) {
+    return AppendValues(offsets, sizes, length, /*valid_bytes=*/NULLPTR);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    // Offset and sizes padding zeroed by BufferBuilder
+    std::shared_ptr<Buffer> null_bitmap;
+    std::shared_ptr<Buffer> offsets;
+    std::shared_ptr<Buffer> sizes;
+    ARROW_RETURN_NOT_OK(this->null_bitmap_builder_.Finish(&null_bitmap));
+    ARROW_RETURN_NOT_OK(this->offsets_builder_.Finish(&offsets));
+    ARROW_RETURN_NOT_OK(this->sizes_builder_.Finish(&sizes));
+
+    if (this->value_builder_->length() == 0) {
+      // Try to make sure we get a non-null values buffer (ARROW-2744)
+      ARROW_RETURN_NOT_OK(this->value_builder_->Resize(0));
+    }
+
+    std::shared_ptr<ArrayData> items;
+    ARROW_RETURN_NOT_OK(this->value_builder_->FinishInternal(&items));
+
+    *out = ArrayData::Make(this->type(), this->length_,
+                           {std::move(null_bitmap), std::move(offsets), std::move(sizes)},
+                           {std::move(items)}, this->null_count_);
+    this->Reset();
+    return Status::OK();
+  }
+
+ protected:
+  void UnsafeAppendEmptyDimensions(int64_t num_values) override {
+    for (int64_t i = 0; i < num_values; ++i) {
+      this->offsets_builder_.UnsafeAppend(0);
+    }
+    for (int64_t i = 0; i < num_values; ++i) {
+      this->sizes_builder_.UnsafeAppend(0);
+    }
+  }
+
+  void UnsafeAppendDimensions(int64_t offset, int64_t size) override {
+    this->offsets_builder_.UnsafeAppend(static_cast<offset_type>(offset));
+    this->sizes_builder_.UnsafeAppend(static_cast<offset_type>(size));
+  }
+
+ private:
+  TypedBufferBuilder<offset_type> sizes_builder_;
+};
+
+class ARROW_EXPORT ListViewBuilder final : public BaseListViewBuilder<ListViewType> {
+ public:
+  using BaseListViewBuilder::BaseListViewBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<ListViewArray>* out) { return FinishTyped(out); }
+};
+
+class ARROW_EXPORT LargeListViewBuilder final
+    : public BaseListViewBuilder<LargeListViewType> {
+ public:
+  using BaseListViewBuilder::BaseListViewBuilder;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<LargeListViewArray>* out) { return FinishTyped(out); }
+};
+
+// ----------------------------------------------------------------------
+// Map builder
+
+/// \class MapBuilder
+/// \brief Builder class for arrays of variable-size maps
+///
+/// To use this class, you must use the Append function to delimit each distinct
+/// map before appending values to the key and item array builders, or use the
+/// bulk API to append a sequence of offsets and null maps.
+///
+/// Key uniqueness and ordering are not validated.
+class ARROW_EXPORT MapBuilder : public ArrayBuilder {
+ public:
+  /// Use this constructor to define the built array's type explicitly. If key_builder
+  /// or item_builder has indeterminate type, this builder will also.
+  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
+             const std::shared_ptr<ArrayBuilder>& item_builder,
+             const std::shared_ptr<DataType>& type);
+
+  /// Use this constructor to infer the built array's type. If key_builder or
+  /// item_builder has indeterminate type, this builder will also.
+  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
+             const std::shared_ptr<ArrayBuilder>& item_builder, bool keys_sorted = false);
+
+  MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& item_builder,
+             const std::shared_ptr<DataType>& type);
+
+  Status Resize(int64_t capacity) override;
+  void Reset() override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<MapArray>* out) { return FinishTyped(out); }
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes is of equal length to values, and any zero byte
+  /// will be considered as a null for that slot
+  Status AppendValues(const int32_t* offsets, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Start a new variable-length map slot
+  ///
+  /// This function should be called before beginning to append elements to the
+  /// key and item builders
+  Status Append();
+
+  Status AppendNull() final;
+
+  Status AppendNulls(int64_t length) final;
+
+  Status AppendEmptyValue() final;
+
+  Status AppendEmptyValues(int64_t length) final;
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    const auto* offsets = array.GetValues<int32_t>(1);
+    static_assert(internal::may_have_validity_bitmap(MapType::type_id));
+    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+    for (int64_t row = offset; row < offset + length; row++) {
+      const bool is_valid = !validity || bit_util::GetBit(validity, array.offset + row);
+      if (is_valid) {
+        ARROW_RETURN_NOT_OK(Append());
+        const int64_t slot_length = offsets[row + 1] - offsets[row];
+        // Add together the inner StructArray offset to the Map/List offset
+        int64_t key_value_offset = array.child_data[0].offset + offsets[row];
+        ARROW_RETURN_NOT_OK(key_builder_->AppendArraySlice(
+            array.child_data[0].child_data[0], key_value_offset, slot_length));
+        ARROW_RETURN_NOT_OK(item_builder_->AppendArraySlice(
+            array.child_data[0].child_data[1], key_value_offset, slot_length));
+      } else {
+        ARROW_RETURN_NOT_OK(AppendNull());
+      }
+    }
+    return Status::OK();
+  }
+
+  /// \brief Get builder to append keys.
+  ///
+  /// Append a key with this builder should be followed by appending
+  /// an item or null value with item_builder().
+  ArrayBuilder* key_builder() const { return key_builder_.get(); }
+
+  /// \brief Get builder to append items
+  ///
+  /// Appending an item with this builder should have been preceded
+  /// by appending a key with key_builder().
+  ArrayBuilder* item_builder() const { return item_builder_.get(); }
+
+  /// \brief Get builder to add Map entries as struct values.
+  ///
+  /// This is used instead of key_builder()/item_builder() and allows
+  /// the Map to be built as a list of struct values.
+  ArrayBuilder* value_builder() const { return list_builder_->value_builder(); }
+
+  std::shared_ptr<DataType> type() const override {
+    // Key and Item builder may update types, but they don't contain the field names,
+    // so we need to reconstruct the type. (See ARROW-13735.)
+    return std::make_shared<MapType>(
+        field(entries_name_,
+              struct_({field(key_name_, key_builder_->type(), false),
+                       field(item_name_, item_builder_->type(), item_nullable_)}),
+              false),
+        keys_sorted_);
+  }
+
+  Status ValidateOverflow(int64_t new_elements) {
+    return list_builder_->ValidateOverflow(new_elements);
+  }
+
+ protected:
+  inline Status AdjustStructBuilderLength();
+
+ protected:
+  bool keys_sorted_ = false;
+  bool item_nullable_ = false;
+  std::string entries_name_;
+  std::string key_name_;
+  std::string item_name_;
+  std::shared_ptr<ListBuilder> list_builder_;
+  std::shared_ptr<ArrayBuilder> key_builder_;
+  std::shared_ptr<ArrayBuilder> item_builder_;
+};
+
+// ----------------------------------------------------------------------
+// FixedSizeList builder
+
+/// \class FixedSizeListBuilder
+/// \brief Builder class for fixed-length list array value types
+class ARROW_EXPORT FixedSizeListBuilder : public ArrayBuilder {
+ public:
+  using TypeClass = FixedSizeListType;
+
+  /// Use this constructor to define the built array's type explicitly. If value_builder
+  /// has indeterminate type, this builder will also.
+  FixedSizeListBuilder(MemoryPool* pool,
+                       const std::shared_ptr<ArrayBuilder>& value_builder,
+                       int32_t list_size);
+
+  /// Use this constructor to infer the built array's type. If value_builder has
+  /// indeterminate type, this builder will also.
+  FixedSizeListBuilder(MemoryPool* pool,
+                       const std::shared_ptr<ArrayBuilder>& value_builder,
+                       const std::shared_ptr<DataType>& type);
+
+  Status Resize(int64_t capacity) override;
+  void Reset() override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<FixedSizeListArray>* out) { return FinishTyped(out); }
+
+  /// \brief Append a valid fixed length list.
+  ///
+  /// This function affects only the validity bitmap; the child values must be appended
+  /// using the child array builder.
+  Status Append();
+
+  /// \brief Vector append
+  ///
+  /// If passed, valid_bytes will be read and any zero byte
+  /// will cause the corresponding slot to be null
+  ///
+  /// This function affects only the validity bitmap; the child values must be appended
+  /// using the child array builder. This includes appending nulls for null lists.
+  /// XXX this restriction is confusing, should this method be omitted?
+  Status AppendValues(int64_t length, const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a null fixed length list.
+  ///
+  /// The child array builder will have the appropriate number of nulls appended
+  /// automatically.
+  Status AppendNull() final;
+
+  /// \brief Append length null fixed length lists.
+  ///
+  /// The child array builder will have the appropriate number of nulls appended
+  /// automatically.
+  Status AppendNulls(int64_t length) final;
+
+  Status ValidateOverflow(int64_t new_elements);
+
+  Status AppendEmptyValue() final;
+
+  Status AppendEmptyValues(int64_t length) final;
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length) final {
+    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+    for (int64_t row = offset; row < offset + length; row++) {
+      if (!validity || bit_util::GetBit(validity, array.offset + row)) {
+        ARROW_RETURN_NOT_OK(value_builder_->AppendArraySlice(
+            array.child_data[0], list_size_ * (array.offset + row), list_size_));
+        ARROW_RETURN_NOT_OK(Append());
+      } else {
+        ARROW_RETURN_NOT_OK(AppendNull());
+      }
+    }
+    return Status::OK();
+  }
+
+  ArrayBuilder* value_builder() const { return value_builder_.get(); }
+
+  std::shared_ptr<DataType> type() const override {
+    return fixed_size_list(value_field_->WithType(value_builder_->type()), list_size_);
+  }
+
+  // Cannot make this a static attribute because of linking issues
+  static constexpr int64_t maximum_elements() {
+    return std::numeric_limits<FixedSizeListType::offset_type>::max() - 1;
+  }
+
+ protected:
+  std::shared_ptr<Field> value_field_;
+  const int32_t list_size_;
+  std::shared_ptr<ArrayBuilder> value_builder_;
+};
+
+// ----------------------------------------------------------------------
+// Struct
+
+// ---------------------------------------------------------------------------------
+// StructArray builder
+/// Append, Resize and Reserve methods are acting on StructBuilder.
+/// Please make sure all these methods of all child-builders' are consistently
+/// called to maintain data-structure consistency.
+class ARROW_EXPORT StructBuilder : public ArrayBuilder {
+ public:
+  /// If any of field_builders has indeterminate type, this builder will also
+  StructBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+                std::vector<std::shared_ptr<ArrayBuilder>> field_builders);
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<StructArray>* out) { return FinishTyped(out); }
+
+  /// Null bitmap is of equal length to every child field, and any zero byte
+  /// will be considered as a null for that field, but users must using app-
+  /// end methods or advance methods of the child builders' independently to
+  /// insert data.
+  Status AppendValues(int64_t length, const uint8_t* valid_bytes) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
+  }
+
+  /// Append an element to the Struct. All child-builders' Append method must
+  /// be called independently to maintain data-structure consistency.
+  Status Append(bool is_valid = true) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendToBitmap(is_valid);
+    return Status::OK();
+  }
+
+  /// \brief Append a null value. Automatically appends an empty value to each child
+  /// builder.
+  Status AppendNull() final {
+    for (const auto& field : children_) {
+      ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+    }
+    return Append(false);
+  }
+
+  /// \brief Append multiple null values. Automatically appends empty values to each
+  /// child builder.
+  Status AppendNulls(int64_t length) final {
+    for (const auto& field : children_) {
+      ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+    }
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(length, false);
+    return Status::OK();
+  }
+
+  Status AppendEmptyValue() final {
+    for (const auto& field : children_) {
+      ARROW_RETURN_NOT_OK(field->AppendEmptyValue());
+    }
+    return Append(true);
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    for (const auto& field : children_) {
+      ARROW_RETURN_NOT_OK(field->AppendEmptyValues(length));
+    }
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(length, true);
+    return Status::OK();
+  }
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    for (int i = 0; static_cast<size_t>(i) < children_.size(); i++) {
+      ARROW_RETURN_NOT_OK(children_[i]->AppendArraySlice(array.child_data[i],
+                                                         array.offset + offset, length));
+    }
+    const uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    UnsafeAppendToBitmap(validity, array.offset + offset, length);
+    return Status::OK();
+  }
+
+  void Reset() override;
+
+  ArrayBuilder* field_builder(int i) const { return children_[i].get(); }
+
+  int num_fields() const { return static_cast<int>(children_.size()); }
+
+  std::shared_ptr<DataType> type() const override;
+
+ private:
+  std::shared_ptr<DataType> type_;
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_primitive.h b/pyarrow/include/arrow/array/builder_primitive.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d79d6e9649994e99b85b233cc81ba8c1a8a1ba1
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_primitive.h
@@ -0,0 +1,689 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/float16.h"
+
+namespace arrow {
+
+class ARROW_EXPORT NullBuilder : public ArrayBuilder {
+ public:
+  explicit NullBuilder(MemoryPool* pool = default_memory_pool(),
+                       int64_t ARROW_ARG_UNUSED(alignment) = kDefaultBufferAlignment)
+      : ArrayBuilder(pool) {}
+
+  explicit NullBuilder(const std::shared_ptr<DataType>& ARROW_ARG_UNUSED(type),
+                       MemoryPool* pool = default_memory_pool(),
+                       int64_t alignment = kDefaultBufferAlignment)
+      : NullBuilder(pool, alignment) {}
+
+  /// \brief Append the specified number of null elements
+  Status AppendNulls(int64_t length) final {
+    if (length < 0) return Status::Invalid("length must be positive");
+    null_count_ += length;
+    length_ += length;
+    return Status::OK();
+  }
+
+  /// \brief Append a single null element
+  Status AppendNull() final { return AppendNulls(1); }
+
+  Status AppendEmptyValues(int64_t length) final { return AppendNulls(length); }
+
+  Status AppendEmptyValue() final { return AppendEmptyValues(1); }
+
+  Status Append(std::nullptr_t) { return AppendNull(); }
+
+  Status AppendArraySlice(const ArraySpan&, int64_t, int64_t length) override {
+    return AppendNulls(length);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  std::shared_ptr<DataType> type() const override { return null(); }
+
+  Status Finish(std::shared_ptr<NullArray>* out) { return FinishTyped(out); }
+};
+
+/// \addtogroup numeric-builders
+///
+/// @{
+
+/// Base class for all Builders that emit an Array of a scalar numerical type.
+template <typename T>
+class NumericBuilder
+    : public ArrayBuilder,
+      public internal::ArrayBuilderExtraOps<NumericBuilder<T>, typename T::c_type> {
+ public:
+  using TypeClass = T;
+  using value_type = typename T::c_type;
+  using ArrayType = typename TypeTraits<T>::ArrayType;
+
+  template <typename T1 = T>
+  explicit NumericBuilder(
+      enable_if_parameter_free<T1, MemoryPool*> pool = default_memory_pool(),
+      int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment),
+        type_(TypeTraits<T>::type_singleton()),
+        data_builder_(pool, alignment) {}
+
+  NumericBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool,
+                 int64_t alignment = kDefaultBufferAlignment)
+      : ArrayBuilder(pool, alignment), type_(type), data_builder_(pool, alignment) {}
+
+  /// Append a single scalar and increase the size if necessary.
+  Status Append(const value_type val) {
+    ARROW_RETURN_NOT_OK(ArrayBuilder::Reserve(1));
+    UnsafeAppend(val);
+    return Status::OK();
+  }
+
+  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+  /// The memory at the corresponding data slot is set to 0 to prevent
+  /// uninitialized memory access
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, value_type{});  // zero
+    UnsafeSetNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a single null element
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(value_type{});  // zero
+    UnsafeAppendToBitmap(false);
+    return Status::OK();
+  }
+
+  /// \brief Append a empty element
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(value_type{});  // zero
+    UnsafeAppendToBitmap(true);
+    return Status::OK();
+  }
+
+  /// \brief Append several empty elements
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, value_type{});  // zero
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  value_type GetValue(int64_t index) const { return data_builder_.data()[index]; }
+
+  value_type* GetMutableValue(int64_t index) {
+    return &data_builder_.mutable_data()[index];
+  }
+
+  void Reset() override {
+    data_builder_.Reset();
+    ArrayBuilder::Reset();
+  }
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+    ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+
+  value_type operator[](int64_t index) const { return GetValue(index); }
+
+  value_type& operator[](int64_t index) {
+    return reinterpret_cast<value_type*>(data_builder_.mutable_data())[index];
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const value_type* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(values, length);
+    // length_ is update by these
+    ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] bitmap a validity bitmap to copy (may be null)
+  /// \param[in] bitmap_offset an offset into the validity bitmap
+  /// \return Status
+  Status AppendValues(const value_type* values, int64_t length, const uint8_t* bitmap,
+                      int64_t bitmap_offset) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(values, length);
+    // length_ is update by these
+    ArrayBuilder::UnsafeAppendToBitmap(bitmap, bitmap_offset, length);
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const value_type* values, int64_t length,
+                      const std::vector<bool>& is_valid) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(values, length);
+    // length_ is update by these
+    ArrayBuilder::UnsafeAppendToBitmap(is_valid);
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector of values
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const std::vector<value_type>& values,
+                      const std::vector<bool>& is_valid) {
+    if (values.empty()) {
+      return Status::OK();
+    }
+    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector of values
+  /// \return Status
+  Status AppendValues(const std::vector<value_type>& values) {
+    if (values.empty()) {
+      return Status::OK();
+    }
+    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    ARROW_ASSIGN_OR_RAISE(auto null_bitmap,
+                          null_bitmap_builder_.FinishWithLength(length_));
+    ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_));
+    *out = ArrayData::Make(type(), length_, {null_bitmap, data}, null_count_);
+    capacity_ = length_ = null_count_ = 0;
+    return Status::OK();
+  }
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<ArrayType>* out) { return FinishTyped(out); }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \return Status
+  template <typename ValuesIter>
+  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(values_begin, values_end);
+    // this updates the length_
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of elements in one shot, with a specified nullmap
+  /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \param[in] valid_begin InputIterator with elements indication valid(1)
+  ///  or null(0) values.
+  /// \return Status
+  template <typename ValuesIter, typename ValidIter>
+  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
+      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+    static_assert(!internal::is_null_pointer<ValidIter>::value,
+                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
+                  "version instead");
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(values_begin, values_end);
+    null_bitmap_builder_.UnsafeAppend<true>(
+        length, [&valid_begin]() -> bool { return *valid_begin++; });
+    length_ = null_bitmap_builder_.length();
+    null_count_ = null_bitmap_builder_.false_count();
+    return Status::OK();
+  }
+
+  // Same as above, with a pointer type ValidIter
+  template <typename ValuesIter, typename ValidIter>
+  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
+      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(values_begin, values_end);
+    // this updates the length_
+    if (valid_begin == NULLPTR) {
+      UnsafeSetNotNull(length);
+    } else {
+      null_bitmap_builder_.UnsafeAppend<true>(
+          length, [&valid_begin]() -> bool { return *valid_begin++; });
+      length_ = null_bitmap_builder_.length();
+      null_count_ = null_bitmap_builder_.false_count();
+    }
+
+    return Status::OK();
+  }
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    return AppendValues(array.GetValues<value_type>(1) + offset, length,
+                        array.GetValues<uint8_t>(0, 0), array.offset + offset);
+  }
+
+  /// Append a single scalar under the assumption that the underlying Buffer is
+  /// large enough.
+  ///
+  /// This method does not capacity-check; make sure to call Reserve
+  /// beforehand.
+  void UnsafeAppend(const value_type val) {
+    ArrayBuilder::UnsafeAppendToBitmap(true);
+    data_builder_.UnsafeAppend(val);
+  }
+
+  void UnsafeAppendNull() {
+    ArrayBuilder::UnsafeAppendToBitmap(false);
+    data_builder_.UnsafeAppend(value_type{});  // zero
+  }
+
+  /// Advance builder without allocating nor writing any values
+  ///
+  /// The internal pointer is advanced by `length` values and the same number
+  /// of non-null entries are appended to the validity bitmap.
+  /// This method assumes that the `length` values were populated directly,
+  /// for example using `GetMutableValue`.
+  void UnsafeAdvance(int64_t length) {
+    data_builder_.UnsafeAdvance(length);
+    UnsafeAppendToBitmap(length, true);
+  }
+
+  /// Advance builder without allocating nor writing any values
+  ///
+  /// The internal pointer is advanced by `length` values and the same number
+  /// of validity bits are appended to the validity bitmap.
+  /// This method assumes that the `length` values were populated directly,
+  /// for example using `GetMutableValue`.
+  void UnsafeAdvance(int64_t length, const uint8_t* validity, int64_t valid_bits_offset) {
+    data_builder_.UnsafeAdvance(length);
+    UnsafeAppendToBitmap(validity, valid_bits_offset, length);
+  }
+
+  std::shared_ptr<DataType> type() const override { return type_; }
+
+ protected:
+  std::shared_ptr<DataType> type_;
+  TypedBufferBuilder<value_type> data_builder_;
+};
+
+// Builders
+
+using UInt8Builder = NumericBuilder<UInt8Type>;
+using UInt16Builder = NumericBuilder<UInt16Type>;
+using UInt32Builder = NumericBuilder<UInt32Type>;
+using UInt64Builder = NumericBuilder<UInt64Type>;
+
+using Int8Builder = NumericBuilder<Int8Type>;
+using Int16Builder = NumericBuilder<Int16Type>;
+using Int32Builder = NumericBuilder<Int32Type>;
+using Int64Builder = NumericBuilder<Int64Type>;
+
+using FloatBuilder = NumericBuilder<FloatType>;
+using DoubleBuilder = NumericBuilder<DoubleType>;
+
+/// @}
+
+/// \addtogroup temporal-builders
+///
+/// @{
+
+using Date32Builder = NumericBuilder<Date32Type>;
+using Date64Builder = NumericBuilder<Date64Type>;
+using Time32Builder = NumericBuilder<Time32Type>;
+using Time64Builder = NumericBuilder<Time64Type>;
+using TimestampBuilder = NumericBuilder<TimestampType>;
+using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
+using DurationBuilder = NumericBuilder<DurationType>;
+
+/// @}
+
+/// \addtogroup numeric-builders
+///
+/// @{
+
+class ARROW_EXPORT HalfFloatBuilder : public NumericBuilder<HalfFloatType> {
+ public:
+  using BaseClass = NumericBuilder<HalfFloatType>;
+  using Float16 = arrow::util::Float16;
+
+  using BaseClass::Append;
+  using BaseClass::AppendValues;
+  using BaseClass::BaseClass;
+  using BaseClass::GetValue;
+  using BaseClass::UnsafeAppend;
+
+  /// Scalar append a arrow::util::Float16
+  Status Append(const Float16 val) { return Append(val.bits()); }
+
+  /// Scalar append a arrow::util::Float16, without checking for capacity
+  void UnsafeAppend(const Float16 val) { UnsafeAppend(val.bits()); }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous array of arrow::util::Float16
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const Float16* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR) {
+    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
+                                   valid_bytes);
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous array of arrow::util::Float16
+  /// \param[in] length the number of values to append
+  /// \param[in] bitmap a validity bitmap to copy (may be null)
+  /// \param[in] bitmap_offset an offset into the validity bitmap
+  /// \return Status
+  Status AppendValues(const Float16* values, int64_t length, const uint8_t* bitmap,
+                      int64_t bitmap_offset) {
+    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
+                                   bitmap, bitmap_offset);
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous array of arrow::util::Float16
+  /// \param[in] length the number of values to append
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const Float16* values, int64_t length,
+                      const std::vector<bool>& is_valid) {
+    return BaseClass::AppendValues(reinterpret_cast<const uint16_t*>(values), length,
+                                   is_valid);
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector<arrow::util::Float16>
+  /// \param[in] is_valid a std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const std::vector<Float16>& values,
+                      const std::vector<bool>& is_valid) {
+    return AppendValues(values.data(), static_cast<int64_t>(values.size()), is_valid);
+  }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector<arrow::util::Float16>
+  /// \return Status
+  Status AppendValues(const std::vector<Float16>& values) {
+    return AppendValues(values.data(), static_cast<int64_t>(values.size()));
+  }
+
+  /// \brief Append one value many times in one shot
+  /// \param[in] length the number of values to append
+  /// \param[in] value a arrow::util::Float16
+  Status AppendValues(int64_t length, Float16 value) {
+    RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, value.bits());
+    ArrayBuilder::UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Get the value at a certain index
+  /// \param[in] index the zero-based index
+  /// @tparam T arrow::util::Float16 or value_type (uint16_t)
+  template <typename T = BaseClass::value_type>
+  T GetValue(int64_t index) const {
+    static_assert(std::is_same_v<T, BaseClass::value_type> ||
+                  std::is_same_v<T, arrow::util::Float16>);
+    if constexpr (std::is_same_v<T, BaseClass::value_type>) {
+      return BaseClass::GetValue(index);
+    } else {
+      return Float16::FromBits(BaseClass::GetValue(index));
+    }
+  }
+};
+
+/// @}
+
+class ARROW_EXPORT BooleanBuilder
+    : public ArrayBuilder,
+      public internal::ArrayBuilderExtraOps<BooleanBuilder, bool> {
+ public:
+  using TypeClass = BooleanType;
+  using value_type = bool;
+
+  explicit BooleanBuilder(MemoryPool* pool = default_memory_pool(),
+                          int64_t alignment = kDefaultBufferAlignment);
+
+  BooleanBuilder(const std::shared_ptr<DataType>& type,
+                 MemoryPool* pool = default_memory_pool(),
+                 int64_t alignment = kDefaultBufferAlignment);
+
+  /// Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+  Status AppendNulls(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, false);
+    UnsafeSetNull(length);
+    return Status::OK();
+  }
+
+  Status AppendNull() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppendNull();
+    return Status::OK();
+  }
+
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    data_builder_.UnsafeAppend(false);
+    UnsafeSetNotNull(1);
+    return Status::OK();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend(length, false);
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  /// Scalar append
+  Status Append(const bool val) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(val);
+    return Status::OK();
+  }
+
+  Status Append(const uint8_t val) { return Append(val != 0); }
+
+  /// Scalar append, without checking for capacity
+  void UnsafeAppend(const bool val) {
+    data_builder_.UnsafeAppend(val);
+    UnsafeAppendToBitmap(true);
+  }
+
+  void UnsafeAppendNull() {
+    data_builder_.UnsafeAppend(false);
+    UnsafeAppendToBitmap(false);
+  }
+
+  void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous array of bytes (non-zero is 1)
+  /// \param[in] length the number of values to append
+  /// \param[in] valid_bytes an optional sequence of bytes where non-zero
+  /// indicates a valid (non-null) value
+  /// \return Status
+  Status AppendValues(const uint8_t* values, int64_t length,
+                      const uint8_t* valid_bytes = NULLPTR);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a bitmap of values
+  /// \param[in] length the number of values to append
+  /// \param[in] validity a validity bitmap to copy (may be null)
+  /// \param[in] offset an offset into the values and validity bitmaps
+  /// \return Status
+  Status AppendValues(const uint8_t* values, int64_t length, const uint8_t* validity,
+                      int64_t offset);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a contiguous C array of values
+  /// \param[in] length the number of values to append
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const uint8_t* values, int64_t length,
+                      const std::vector<bool>& is_valid);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector of bytes
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const std::vector<uint8_t>& values,
+                      const std::vector<bool>& is_valid);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values a std::vector of bytes
+  /// \return Status
+  Status AppendValues(const std::vector<uint8_t>& values);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values an std::vector<bool> indicating true (1) or false
+  /// \param[in] is_valid an std::vector<bool> indicating valid (1) or null
+  /// (0). Equal in length to values
+  /// \return Status
+  Status AppendValues(const std::vector<bool>& values, const std::vector<bool>& is_valid);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values an std::vector<bool> indicating true (1) or false
+  /// \return Status
+  Status AppendValues(const std::vector<bool>& values);
+
+  /// \brief Append a sequence of elements in one shot
+  /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+  ///  or null(0) values
+  /// \return Status
+  template <typename ValuesIter>
+  Status AppendValues(ValuesIter values_begin, ValuesIter values_end) {
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend<false>(
+        length, [&values_begin]() -> bool { return *values_begin++; });
+    // this updates length_
+    UnsafeSetNotNull(length);
+    return Status::OK();
+  }
+
+  /// \brief Append a sequence of elements in one shot, with a specified nullmap
+  /// \param[in] values_begin InputIterator to the beginning of the values
+  /// \param[in] values_end InputIterator pointing to the end of the values
+  /// \param[in] valid_begin InputIterator with elements indication valid(1)
+  ///  or null(0) values
+  /// \return Status
+  template <typename ValuesIter, typename ValidIter>
+  enable_if_t<!std::is_pointer<ValidIter>::value, Status> AppendValues(
+      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+    static_assert(!internal::is_null_pointer<ValidIter>::value,
+                  "Don't pass a NULLPTR directly as valid_begin, use the 2-argument "
+                  "version instead");
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+
+    data_builder_.UnsafeAppend<false>(
+        length, [&values_begin]() -> bool { return *values_begin++; });
+    null_bitmap_builder_.UnsafeAppend<true>(
+        length, [&valid_begin]() -> bool { return *valid_begin++; });
+    length_ = null_bitmap_builder_.length();
+    null_count_ = null_bitmap_builder_.false_count();
+    return Status::OK();
+  }
+
+  // Same as above, for a pointer type ValidIter
+  template <typename ValuesIter, typename ValidIter>
+  enable_if_t<std::is_pointer<ValidIter>::value, Status> AppendValues(
+      ValuesIter values_begin, ValuesIter values_end, ValidIter valid_begin) {
+    int64_t length = static_cast<int64_t>(std::distance(values_begin, values_end));
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    data_builder_.UnsafeAppend<false>(
+        length, [&values_begin]() -> bool { return *values_begin++; });
+
+    if (valid_begin == NULLPTR) {
+      UnsafeSetNotNull(length);
+    } else {
+      null_bitmap_builder_.UnsafeAppend<true>(
+          length, [&valid_begin]() -> bool { return *valid_begin++; });
+    }
+    length_ = null_bitmap_builder_.length();
+    null_count_ = null_bitmap_builder_.false_count();
+    return Status::OK();
+  }
+
+  Status AppendValues(int64_t length, bool value);
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override {
+    return AppendValues(array.GetValues<uint8_t>(1, 0), length,
+                        array.GetValues<uint8_t>(0, 0), array.offset + offset);
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<BooleanArray>* out) { return FinishTyped(out); }
+
+  void Reset() override;
+  Status Resize(int64_t capacity) override;
+
+  std::shared_ptr<DataType> type() const override { return boolean(); }
+
+ protected:
+  TypedBufferBuilder<bool> data_builder_;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_run_end.h b/pyarrow/include/arrow/array/builder_run_end.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac92efbd0dbe6b470b8275219e75b41aa3f7ab3a
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_run_end.h
@@ -0,0 +1,303 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+
+namespace arrow {
+
+/// \addtogroup run-end-encoded-builders
+///
+/// @{
+
+namespace internal {
+
+/// \brief An ArrayBuilder that deduplicates repeated values as they are
+/// appended to the inner-ArrayBuilder and reports the length of the current run
+/// of identical values.
+///
+/// The following sequence of calls
+///
+///     Append(2)
+///     Append(2)
+///     Append(2)
+///     Append(7)
+///     Append(7)
+///     Append(2)
+///     FinishInternal()
+///
+/// will cause the inner-builder to receive only 3 Append calls
+///
+///     Append(2)
+///     Append(7)
+///     Append(2)
+///     FinishInternal()
+///
+/// Note that values returned by length(), null_count() and capacity() are
+/// related to the compressed array built by the inner-ArrayBuilder.
+class RunCompressorBuilder : public ArrayBuilder {
+ public:
+  RunCompressorBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> inner_builder,
+                       std::shared_ptr<DataType> type);
+
+  ~RunCompressorBuilder() override;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(RunCompressorBuilder);
+
+  /// \brief Called right before a run is being closed
+  ///
+  /// Subclasses can override this function to perform an additional action when
+  /// a run is closed (i.e. run-length is known and value is appended to the
+  /// inner builder).
+  ///
+  /// \param value can be NULLPTR if closing a run of NULLs
+  /// \param length the greater than 0 length of the value run being closed
+  virtual Status WillCloseRun(const std::shared_ptr<const Scalar>& value,
+                              int64_t length) {
+    return Status::OK();
+  }
+
+  /// \brief Called right before a run of empty values is being closed
+  ///
+  /// Subclasses can override this function to perform an additional action when
+  /// a run of empty values is appended (i.e. run-length is known and a single
+  /// empty value is appended to the inner builder).
+  ///
+  /// \param length the greater than 0 length of the value run being closed
+  virtual Status WillCloseRunOfEmptyValues(int64_t length) { return Status::OK(); }
+
+  /// \brief Allocate enough memory for a given number of array elements.
+  ///
+  /// NOTE: Conservatively resizing a run-length compressed array for a given
+  /// number of logical elements is not possible, since the physical length will
+  /// vary depending on the values to be appended in the future. But we can
+  /// pessimistically assume that each run will contain a single value and
+  /// allocate that number of runs.
+  Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
+
+  /// \brief Allocate enough memory for a given number of runs.
+  ///
+  /// Like Resize on non-encoded builders, it does not account for variable size
+  /// data.
+  Status ResizePhysical(int64_t capacity);
+
+  Status ReservePhysical(int64_t additional_capacity) {
+    return Reserve(additional_capacity);
+  }
+
+  void Reset() override;
+
+  Status AppendNull() final { return AppendNulls(1); }
+  Status AppendNulls(int64_t length) override;
+
+  Status AppendEmptyValue() final { return AppendEmptyValues(1); }
+  Status AppendEmptyValues(int64_t length) override;
+
+  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
+  Status AppendScalars(const ScalarVector& scalars) override;
+
+  // AppendArraySlice() is not implemented.
+
+  /// \brief Append a slice of an array containing values from already
+  /// compressed runs.
+  ///
+  /// NOTE: WillCloseRun() is not called as the length of each run cannot be
+  /// determined at this point. Caller should ensure that !has_open_run() by
+  /// calling FinishCurrentRun() before calling this.
+  ///
+  /// Pre-condition: !has_open_run()
+  Status AppendRunCompressedArraySlice(const ArraySpan& array, int64_t offset,
+                                       int64_t length);
+
+  /// \brief Forces the closing of the current run if one is currently open.
+  ///
+  /// This can be called when one wants to ensure the current run will not be
+  /// extended. This may cause identical values to appear close to each other in
+  /// the underlying array (i.e. two runs that could be a single run) if more
+  /// values are appended after this is called.
+  ///
+  /// Finish() and FinishInternal() call this automatically.
+  virtual Status FinishCurrentRun();
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  ArrayBuilder& inner_builder() const { return *inner_builder_; }
+
+  std::shared_ptr<DataType> type() const override { return inner_builder_->type(); }
+
+  bool has_open_run() const { return current_run_length_ > 0; }
+  int64_t open_run_length() const { return current_run_length_; }
+
+ private:
+  inline void UpdateDimensions() {
+    capacity_ = inner_builder_->capacity();
+    length_ = inner_builder_->length();
+    null_count_ = inner_builder_->null_count();
+  }
+
+ private:
+  std::shared_ptr<ArrayBuilder> inner_builder_;
+  std::shared_ptr<const Scalar> current_value_ = NULLPTR;
+  int64_t current_run_length_ = 0;
+};
+
+}  // namespace internal
+
+// ----------------------------------------------------------------------
+// RunEndEncoded builder
+
+/// \brief Run-end encoded array builder.
+///
+/// NOTE: the value returned by and capacity() is related to the
+/// compressed array (physical) and not the decoded array (logical) that is
+/// run-end encoded. null_count() always returns 0. length(), on the other hand,
+/// returns the logical length of the run-end encoded array.
+class ARROW_EXPORT RunEndEncodedBuilder : public ArrayBuilder {
+ private:
+  // An internal::RunCompressorBuilder that produces a run-end in the
+  // RunEndEncodedBuilder every time a value-run is closed.
+  class ValueRunBuilder : public internal::RunCompressorBuilder {
+   public:
+    ValueRunBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
+                    const std::shared_ptr<DataType>& value_type,
+                    RunEndEncodedBuilder& ree_builder);
+
+    ~ValueRunBuilder() override = default;
+
+    Status WillCloseRun(const std::shared_ptr<const Scalar>&, int64_t length) override {
+      return ree_builder_.CloseRun(length);
+    }
+
+    Status WillCloseRunOfEmptyValues(int64_t length) override {
+      return ree_builder_.CloseRun(length);
+    }
+
+   private:
+    RunEndEncodedBuilder& ree_builder_;
+  };
+
+ public:
+  RunEndEncodedBuilder(MemoryPool* pool,
+                       const std::shared_ptr<ArrayBuilder>& run_end_builder,
+                       const std::shared_ptr<ArrayBuilder>& value_builder,
+                       std::shared_ptr<DataType> type);
+
+  /// \brief Allocate enough memory for a given number of array elements.
+  ///
+  /// NOTE: Conservatively resizing an REE for a given number of logical
+  /// elements is not possible, since the physical length will vary depending on
+  /// the values to be appended in the future. But we can pessimistically assume
+  /// that each run will contain a single value and allocate that number of
+  /// runs.
+  Status Resize(int64_t capacity) override { return ResizePhysical(capacity); }
+
+  /// \brief Allocate enough memory for a given number of runs.
+  Status ResizePhysical(int64_t capacity);
+
+  /// \brief Ensure that there is enough space allocated to append the indicated
+  /// number of run without any further reallocation. Overallocation is
+  /// used in order to minimize the impact of incremental ReservePhysical() calls.
+  /// Note that additional_capacity is relative to the current number of elements
+  /// rather than to the current capacity, so calls to Reserve() which are not
+  /// interspersed with addition of new elements may not increase the capacity.
+  ///
+  /// \param[in] additional_capacity the number of additional runs
+  /// \return Status
+  Status ReservePhysical(int64_t additional_capacity) {
+    return Reserve(additional_capacity);
+  }
+
+  void Reset() override;
+
+  Status AppendNull() final { return AppendNulls(1); }
+  Status AppendNulls(int64_t length) override;
+
+  Status AppendEmptyValue() final { return AppendEmptyValues(1); }
+  Status AppendEmptyValues(int64_t length) override;
+  Status AppendScalar(const Scalar& scalar, int64_t n_repeats) override;
+  Status AppendScalars(const ScalarVector& scalars) override;
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<RunEndEncodedArray>* out) { return FinishTyped(out); }
+
+  /// \brief Forces the closing of the current run if one is currently open.
+  ///
+  /// This can be called when one wants to ensure the current run will not be
+  /// extended. This may cause identical values to appear close to each other in
+  /// the values array (i.e. two runs that could be a single run) if more
+  /// values are appended after this is called.
+  Status FinishCurrentRun();
+
+  std::shared_ptr<DataType> type() const override;
+
+ private:
+  /// \brief Update physical capacity and logical length
+  ///
+  /// \param committed_logical_length number of logical values that have been
+  ///                                 committed to the values array
+  /// \param open_run_length number of logical values in the currently open run if any
+  inline void UpdateDimensions(int64_t committed_logical_length,
+                               int64_t open_run_length) {
+    capacity_ = run_end_builder().capacity();
+    length_ = committed_logical_length + open_run_length;
+    committed_logical_length_ = committed_logical_length;
+  }
+
+  // Pre-condition: !value_run_builder_.has_open_run()
+  template <typename RunEndCType>
+  Status DoAppendArraySlice(const ArraySpan& array, int64_t offset, int64_t length);
+
+  template <typename RunEndCType>
+  Status DoAppendRunEnd(int64_t run_end);
+
+  /// \brief Cast run_end to the appropriate type and appends it to the run_ends
+  /// array.
+  Status AppendRunEnd(int64_t run_end);
+
+  /// \brief Close a run by appending a value to the run_ends array and updating
+  /// length_ to reflect the new run.
+  ///
+  /// Pre-condition: run_length > 0.
+  [[nodiscard]] Status CloseRun(int64_t run_length);
+
+  ArrayBuilder& run_end_builder();
+  ArrayBuilder& value_builder();
+
+ private:
+  std::shared_ptr<RunEndEncodedType> type_;
+  ValueRunBuilder* value_run_builder_;
+  // The length not counting the current open run in the value_run_builder_
+  int64_t committed_logical_length_ = 0;
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_time.h b/pyarrow/include/arrow/array/builder_time.h
new file mode 100644
index 0000000000000000000000000000000000000000..b471e9621cd4b125fd44e8f2f4239c7f720ac95d
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_time.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Contains declarations of time related Arrow builder types.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_primitive.h"
+
+namespace arrow {
+
+/// \addtogroup temporal-builders
+///
+/// @{
+
+class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder<DayTimeIntervalType> {
+ public:
+  using DayMilliseconds = DayTimeIntervalType::DayMilliseconds;
+
+  explicit DayTimeIntervalBuilder(MemoryPool* pool = default_memory_pool(),
+                                  int64_t alignment = kDefaultBufferAlignment)
+      : DayTimeIntervalBuilder(day_time_interval(), pool, alignment) {}
+
+  explicit DayTimeIntervalBuilder(std::shared_ptr<DataType> type,
+                                  MemoryPool* pool = default_memory_pool(),
+                                  int64_t alignment = kDefaultBufferAlignment)
+      : NumericBuilder<DayTimeIntervalType>(type, pool, alignment) {}
+};
+
+class ARROW_EXPORT MonthDayNanoIntervalBuilder
+    : public NumericBuilder<MonthDayNanoIntervalType> {
+ public:
+  using MonthDayNanos = MonthDayNanoIntervalType::MonthDayNanos;
+
+  explicit MonthDayNanoIntervalBuilder(MemoryPool* pool = default_memory_pool(),
+                                       int64_t alignment = kDefaultBufferAlignment)
+      : MonthDayNanoIntervalBuilder(month_day_nano_interval(), pool, alignment) {}
+
+  explicit MonthDayNanoIntervalBuilder(std::shared_ptr<DataType> type,
+                                       MemoryPool* pool = default_memory_pool(),
+                                       int64_t alignment = kDefaultBufferAlignment)
+      : NumericBuilder<MonthDayNanoIntervalType>(type, pool, alignment) {}
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/builder_union.h b/pyarrow/include/arrow/array/builder_union.h
new file mode 100644
index 0000000000000000000000000000000000000000..718ef4c32cebef1d30e4f7c036a7ab8f4b333e4a
--- /dev/null
+++ b/pyarrow/include/arrow/array/builder_union.h
@@ -0,0 +1,254 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/data.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \addtogroup nested-builders
+///
+/// @{
+
+/// \brief Base class for union array builds.
+///
+/// Note that while we subclass ArrayBuilder, as union types do not have a
+/// validity bitmap, the bitmap builder member of ArrayBuilder is not used.
+class ARROW_EXPORT BasicUnionBuilder : public ArrayBuilder {
+ public:
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+  /// \cond FALSE
+  using ArrayBuilder::Finish;
+  /// \endcond
+
+  Status Finish(std::shared_ptr<UnionArray>* out) { return FinishTyped(out); }
+
+  /// \brief Make a new child builder available to the UnionArray
+  ///
+  /// \param[in] new_child the child builder
+  /// \param[in] field_name the name of the field in the union array type
+  /// if type inference is used
+  /// \return child index, which is the "type" argument that needs
+  /// to be passed to the "Append" method to add a new element to
+  /// the union array.
+  int8_t AppendChild(const std::shared_ptr<ArrayBuilder>& new_child,
+                     const std::string& field_name = "");
+
+  std::shared_ptr<DataType> type() const override;
+
+  int64_t length() const override { return types_builder_.length(); }
+
+ protected:
+  BasicUnionBuilder(MemoryPool* pool, int64_t alignment,
+                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+                    const std::shared_ptr<DataType>& type);
+
+  int8_t NextTypeId();
+
+  std::vector<std::shared_ptr<Field>> child_fields_;
+  std::vector<int8_t> type_codes_;
+  UnionMode::type mode_;
+
+  std::vector<ArrayBuilder*> type_id_to_children_;
+  std::vector<int> type_id_to_child_id_;
+  // for all type_id < dense_type_id_, type_id_to_children_[type_id] != nullptr
+  int8_t dense_type_id_ = 0;
+  TypedBufferBuilder<int8_t> types_builder_;
+};
+
+/// \class DenseUnionBuilder
+///
+/// This API is EXPERIMENTAL.
+class ARROW_EXPORT DenseUnionBuilder : public BasicUnionBuilder {
+ public:
+  /// Use this constructor to initialize the UnionBuilder with no child builders,
+  /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// children builders you want to use.
+  explicit DenseUnionBuilder(MemoryPool* pool,
+                             int64_t alignment = kDefaultBufferAlignment)
+      : BasicUnionBuilder(pool, alignment, {}, dense_union(FieldVector{})),
+        offsets_builder_(pool, alignment) {}
+
+  /// Use this constructor to specify the type explicitly.
+  /// You can still add child builders to the union after using this constructor
+  DenseUnionBuilder(MemoryPool* pool,
+                    const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+                    const std::shared_ptr<DataType>& type,
+                    int64_t alignment = kDefaultBufferAlignment)
+      : BasicUnionBuilder(pool, alignment, children, type),
+        offsets_builder_(pool, alignment) {}
+
+  Status AppendNull() final {
+    const int8_t first_child_code = type_codes_[0];
+    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+    ARROW_RETURN_NOT_OK(
+        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
+    // Append a null arbitrarily to the first child
+    return child_builder->AppendNull();
+  }
+
+  Status AppendNulls(int64_t length) final {
+    const int8_t first_child_code = type_codes_[0];
+    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+    ARROW_RETURN_NOT_OK(
+        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
+    // Append just a single null to the first child
+    return child_builder->AppendNull();
+  }
+
+  Status AppendEmptyValue() final {
+    const int8_t first_child_code = type_codes_[0];
+    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+    ARROW_RETURN_NOT_OK(
+        offsets_builder_.Append(static_cast<int32_t>(child_builder->length())));
+    // Append an empty value arbitrarily to the first child
+    return child_builder->AppendEmptyValue();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    const int8_t first_child_code = type_codes_[0];
+    ArrayBuilder* child_builder = type_id_to_children_[first_child_code];
+    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+    ARROW_RETURN_NOT_OK(
+        offsets_builder_.Append(length, static_cast<int32_t>(child_builder->length())));
+    // Append just a single empty value to the first child
+    return child_builder->AppendEmptyValue();
+  }
+
+  /// \brief Append an element to the UnionArray. This must be followed
+  ///        by an append to the appropriate child builder.
+  ///
+  /// \param[in] next_type type_id of the child to which the next value will be appended.
+  ///
+  /// The corresponding child builder must be appended to independently after this method
+  /// is called.
+  Status Append(int8_t next_type) {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(next_type));
+    if (type_id_to_children_[next_type]->length() == kListMaximumElements) {
+      return Status::CapacityError(
+          "a dense UnionArray cannot contain more than 2^31 - 1 elements from a single "
+          "child");
+    }
+    auto offset = static_cast<int32_t>(type_id_to_children_[next_type]->length());
+    return offsets_builder_.Append(offset);
+  }
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override;
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ private:
+  TypedBufferBuilder<int32_t> offsets_builder_;
+};
+
+/// \class SparseUnionBuilder
+///
+/// This API is EXPERIMENTAL.
+class ARROW_EXPORT SparseUnionBuilder : public BasicUnionBuilder {
+ public:
+  /// Use this constructor to initialize the UnionBuilder with no child builders,
+  /// allowing type to be inferred. You will need to call AppendChild for each of the
+  /// children builders you want to use.
+  explicit SparseUnionBuilder(MemoryPool* pool,
+                              int64_t alignment = kDefaultBufferAlignment)
+      : BasicUnionBuilder(pool, alignment, {}, sparse_union(FieldVector{})) {}
+
+  /// Use this constructor to specify the type explicitly.
+  /// You can still add child builders to the union after using this constructor
+  SparseUnionBuilder(MemoryPool* pool,
+                     const std::vector<std::shared_ptr<ArrayBuilder>>& children,
+                     const std::shared_ptr<DataType>& type,
+                     int64_t alignment = kDefaultBufferAlignment)
+      : BasicUnionBuilder(pool, alignment, children, type) {}
+
+  /// \brief Append a null value.
+  ///
+  /// A null is appended to the first child, empty values to the other children.
+  Status AppendNull() final {
+    const auto first_child_code = type_codes_[0];
+    ARROW_RETURN_NOT_OK(types_builder_.Append(first_child_code));
+    ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNull());
+    for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+      ARROW_RETURN_NOT_OK(type_id_to_children_[type_codes_[i]]->AppendEmptyValue());
+    }
+    return Status::OK();
+  }
+
+  /// \brief Append multiple null values.
+  ///
+  /// Nulls are appended to the first child, empty values to the other children.
+  Status AppendNulls(int64_t length) final {
+    const auto first_child_code = type_codes_[0];
+    ARROW_RETURN_NOT_OK(types_builder_.Append(length, first_child_code));
+    ARROW_RETURN_NOT_OK(type_id_to_children_[first_child_code]->AppendNulls(length));
+    for (int i = 1; i < static_cast<int>(type_codes_.size()); ++i) {
+      ARROW_RETURN_NOT_OK(
+          type_id_to_children_[type_codes_[i]]->AppendEmptyValues(length));
+    }
+    return Status::OK();
+  }
+
+  Status AppendEmptyValue() final {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(type_codes_[0]));
+    for (int8_t code : type_codes_) {
+      ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValue());
+    }
+    return Status::OK();
+  }
+
+  Status AppendEmptyValues(int64_t length) final {
+    ARROW_RETURN_NOT_OK(types_builder_.Append(length, type_codes_[0]));
+    for (int8_t code : type_codes_) {
+      ARROW_RETURN_NOT_OK(type_id_to_children_[code]->AppendEmptyValues(length));
+    }
+    return Status::OK();
+  }
+
+  /// \brief Append an element to the UnionArray. This must be followed
+  ///        by an append to the appropriate child builder.
+  ///
+  /// \param[in] next_type type_id of the child to which the next value will be appended.
+  ///
+  /// The corresponding child builder must be appended to independently after this method
+  /// is called, and all other child builders must have null or empty value appended.
+  Status Append(int8_t next_type) { return types_builder_.Append(next_type); }
+
+  Status AppendArraySlice(const ArraySpan& array, int64_t offset,
+                          int64_t length) override;
+};
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/concatenate.h b/pyarrow/include/arrow/array/concatenate.h
new file mode 100644
index 0000000000000000000000000000000000000000..aada5624d63a3052edddf0182799c474bee0c528
--- /dev/null
+++ b/pyarrow/include/arrow/array/concatenate.h
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief Concatenate arrays
+///
+/// \param[in] arrays a vector of arrays to be concatenated
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set
+///   out_suggested_cast to a cast suggestion that would allow concatenating the arrays
+///   without overflow of offsets (e.g. string to large_string)
+///
+/// \return the concatenated array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, MemoryPool* pool,
+                                           std::shared_ptr<DataType>* out_suggested_cast);
+
+}  // namespace internal
+
+/// \brief Concatenate arrays
+///
+/// \param[in] arrays a vector of arrays to be concatenated
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return the concatenated array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
+                                           MemoryPool* pool = default_memory_pool());
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/data.h b/pyarrow/include/arrow/array/data.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6636df9bb3025de78b00b1f5b4265783c05e148
--- /dev/null
+++ b/pyarrow/include/arrow/array/data.h
@@ -0,0 +1,750 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>  // IWYU pragma: export
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/statistics.h"
+#include "arrow/buffer.h"
+#include "arrow/result.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/span.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace internal {
+// ----------------------------------------------------------------------
+// Null handling for types without a validity bitmap and the dictionary type
+
+ARROW_EXPORT bool IsNullSparseUnion(const ArrayData& data, int64_t i);
+ARROW_EXPORT bool IsNullDenseUnion(const ArrayData& data, int64_t i);
+ARROW_EXPORT bool IsNullRunEndEncoded(const ArrayData& data, int64_t i);
+
+ARROW_EXPORT bool UnionMayHaveLogicalNulls(const ArrayData& data);
+ARROW_EXPORT bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data);
+ARROW_EXPORT bool DictionaryMayHaveLogicalNulls(const ArrayData& data);
+
+}  // namespace internal
+
+// When slicing, we do not know the null count of the sliced range without
+// doing some computation. To avoid doing this eagerly, we set the null count
+// to -1 (any negative number will do). When Array::null_count is called the
+// first time, the null count will be computed. See ARROW-33
+constexpr int64_t kUnknownNullCount = -1;
+
+// ----------------------------------------------------------------------
+// Generic array data container
+
+/// \class ArrayData
+/// \brief Mutable container for generic Arrow array data
+///
+/// This data structure is a self-contained representation of the memory and
+/// metadata inside an Arrow array data structure (called vectors in Java). The
+/// Array class and its concrete subclasses provide strongly-typed accessors
+/// with support for the visitor pattern and other affordances.
+///
+/// This class is designed for easy internal data manipulation, analytical data
+/// processing, and data transport to and from IPC messages.
+///
+/// This class is also useful in an analytics setting where memory may be
+/// efficiently reused. For example, computing the Abs of a numeric array
+/// should return null iff the input is null: therefore, an Abs function can
+/// reuse the validity bitmap (a Buffer) of its input as the validity bitmap
+/// of its output.
+///
+/// This class is meant mostly for immutable data access. Any mutable access
+/// (either to ArrayData members or to the contents of its Buffers) should take
+/// into account the fact that ArrayData instances are typically wrapped in a
+/// shared_ptr and can therefore have multiple owners at any given time.
+/// Therefore, mutable access is discouraged except when initially populating
+/// the ArrayData.
+struct ARROW_EXPORT ArrayData {
+  ArrayData() = default;
+
+  ArrayData(std::shared_ptr<DataType> type, int64_t length,
+            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : type(std::move(type)), length(length), null_count(null_count), offset(offset) {}
+
+  ArrayData(std::shared_ptr<DataType> type, int64_t length,
+            std::vector<std::shared_ptr<Buffer>> buffers,
+            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : ArrayData(std::move(type), length, null_count, offset) {
+    this->buffers = std::move(buffers);
+#ifndef NDEBUG
+    // in debug mode, call the `device_type` function to trigger
+    // the DCHECKs that validate all the buffers are on the same device
+    ARROW_UNUSED(this->device_type());
+#endif
+  }
+
+  ArrayData(std::shared_ptr<DataType> type, int64_t length,
+            std::vector<std::shared_ptr<Buffer>> buffers,
+            std::vector<std::shared_ptr<ArrayData>> child_data,
+            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : ArrayData(std::move(type), length, null_count, offset) {
+    this->buffers = std::move(buffers);
+    this->child_data = std::move(child_data);
+#ifndef NDEBUG
+    // in debug mode, call the `device_type` function to trigger
+    // the DCHECKs that validate all the buffers (including children)
+    // are on the same device
+    ARROW_UNUSED(this->device_type());
+#endif
+  }
+
+  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+                                         std::vector<std::shared_ptr<Buffer>> buffers,
+                                         int64_t null_count = kUnknownNullCount,
+                                         int64_t offset = 0);
+
+  static std::shared_ptr<ArrayData> Make(
+      std::shared_ptr<DataType> type, int64_t length,
+      std::vector<std::shared_ptr<Buffer>> buffers,
+      std::vector<std::shared_ptr<ArrayData>> child_data,
+      int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+  static std::shared_ptr<ArrayData> Make(
+      std::shared_ptr<DataType> type, int64_t length,
+      std::vector<std::shared_ptr<Buffer>> buffers,
+      std::vector<std::shared_ptr<ArrayData>> child_data,
+      std::shared_ptr<ArrayData> dictionary, int64_t null_count = kUnknownNullCount,
+      int64_t offset = 0);
+
+  static std::shared_ptr<ArrayData> Make(std::shared_ptr<DataType> type, int64_t length,
+                                         int64_t null_count = kUnknownNullCount,
+                                         int64_t offset = 0);
+
+  // Move constructor
+  ArrayData(ArrayData&& other) noexcept
+      : type(std::move(other.type)),
+        length(other.length),
+        null_count(other.null_count.load()),
+        offset(other.offset),
+        buffers(std::move(other.buffers)),
+        child_data(std::move(other.child_data)),
+        dictionary(std::move(other.dictionary)),
+        statistics(std::move(other.statistics)) {}
+
+  // Copy constructor
+  ArrayData(const ArrayData& other) noexcept
+      : type(other.type),
+        length(other.length),
+        null_count(other.null_count.load()),
+        offset(other.offset),
+        buffers(other.buffers),
+        child_data(other.child_data),
+        dictionary(other.dictionary),
+        statistics(other.statistics) {}
+
+  // Move assignment
+  ArrayData& operator=(ArrayData&& other) {
+    type = std::move(other.type);
+    length = other.length;
+    SetNullCount(other.null_count);
+    offset = other.offset;
+    buffers = std::move(other.buffers);
+    child_data = std::move(other.child_data);
+    dictionary = std::move(other.dictionary);
+    statistics = std::move(other.statistics);
+    return *this;
+  }
+
+  // Copy assignment
+  ArrayData& operator=(const ArrayData& other) {
+    type = other.type;
+    length = other.length;
+    SetNullCount(other.null_count);
+    offset = other.offset;
+    buffers = other.buffers;
+    child_data = other.child_data;
+    dictionary = other.dictionary;
+    statistics = other.statistics;
+    return *this;
+  }
+
+  /// \brief Return a shallow copy of this ArrayData
+  std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
+
+  /// \brief Deep copy this ArrayData to destination memory manager
+  ///
+  /// Returns a new ArrayData object with buffers and all child buffers
+  /// copied to the destination memory manager. This includes dictionaries
+  /// if applicable.
+  Result<std::shared_ptr<ArrayData>> CopyTo(
+      const std::shared_ptr<MemoryManager>& to) const;
+
+  /// \brief View or copy this ArrayData to destination memory manager
+  ///
+  /// Tries to view the buffer contents on the given memory manager's device
+  /// if possible (to avoid a copy) but falls back to copying if a no-copy view
+  /// isn't supported.
+  Result<std::shared_ptr<ArrayData>> ViewOrCopyTo(
+      const std::shared_ptr<MemoryManager>& to) const;
+
+  /// \brief Return the null-ness of a given array element
+  ///
+  /// Calling `IsNull(i)` is the same as `!IsValid(i)`.
+  bool IsNull(int64_t i) const { return !IsValid(i); }
+
+  /// \brief Return the validity of a given array element
+  ///
+  /// For most data types, this will simply query the validity bitmap.
+  /// For union and run-end-encoded arrays, the underlying child data is
+  /// queried instead.
+  /// For dictionary arrays, this reflects the validity of the dictionary
+  /// index, but the corresponding dictionary value might still be null.
+  /// For null arrays, this always returns false.
+  bool IsValid(int64_t i) const {
+    if (buffers[0] != NULLPTR) {
+      return bit_util::GetBit(buffers[0]->data(), i + offset);
+    }
+    const auto type = this->type->id();
+    if (type == Type::SPARSE_UNION) {
+      return !internal::IsNullSparseUnion(*this, i);
+    }
+    if (type == Type::DENSE_UNION) {
+      return !internal::IsNullDenseUnion(*this, i);
+    }
+    if (type == Type::RUN_END_ENCODED) {
+      return !internal::IsNullRunEndEncoded(*this, i);
+    }
+    return null_count.load() != length;
+  }
+
+  /// \brief Access a buffer's data as a typed C pointer
+  ///
+  /// \param i the buffer index
+  /// \param absolute_offset the offset into the buffer
+  ///
+  /// If `absolute_offset` is non-zero, the type `T` must match the
+  /// layout of buffer number `i` for the array's data type; otherwise
+  /// offset computation would be incorrect.
+  ///
+  /// If the given buffer is bit-packed (such as a validity bitmap, or
+  /// the data buffer of a boolean array), then `absolute_offset` must be
+  /// zero for correct results, and any bit offset must be applied manually
+  /// by the caller.
+  template <typename T>
+  inline const T* GetValues(int i, int64_t absolute_offset) const {
+    if (buffers[i]) {
+      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  /// \brief Access a buffer's data as a typed C pointer
+  ///
+  /// \param i the buffer index
+  ///
+  /// This method uses the array's offset to index into buffer number `i`.
+  ///
+  /// Calling this method on a bit-packed buffer (such as a validity bitmap, or
+  /// the data buffer of a boolean array) will lead to incorrect results.
+  /// You should instead call `GetValues(i, 0)` and apply the bit offset manually.
+  template <typename T>
+  inline const T* GetValues(int i) const {
+    return GetValues<T>(i, offset);
+  }
+
+  /// \brief Access a buffer's data as a typed C pointer
+  ///
+  /// \param i the buffer index
+  /// \param absolute_offset the offset into the buffer
+  ///
+  /// Like `GetValues(i, absolute_offset)`, but returns nullptr if the given buffer
+  /// is not a CPU buffer.
+  template <typename T>
+  inline const T* GetValuesSafe(int i, int64_t absolute_offset) const {
+    if (buffers[i] && buffers[i]->is_cpu()) {
+      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  /// \brief Access a buffer's data as a typed C pointer
+  ///
+  /// \param i the buffer index
+  ///
+  /// Like `GetValues(i)`, but returns nullptr if the given buffer is not a CPU buffer.
+  template <typename T>
+  inline const T* GetValuesSafe(int i) const {
+    return GetValuesSafe<T>(i, offset);
+  }
+
+  /// \brief Access a buffer's data as a mutable typed C pointer
+  ///
+  /// \param i the buffer index
+  /// \param absolute_offset the offset into the buffer
+  ///
+  /// Like `GetValues(i, absolute_offset)`, but allows mutating buffer contents.
+  /// This should only be used when initially populating the ArrayData, before
+  /// it is attached to a Array instance.
+  template <typename T>
+  inline T* GetMutableValues(int i, int64_t absolute_offset) {
+    if (buffers[i]) {
+      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  /// \brief Access a buffer's data as a mutable typed C pointer
+  ///
+  /// \param i the buffer index
+  ///
+  /// Like `GetValues(i)`, but allows mutating buffer contents.
+  /// This should only be used when initially populating the ArrayData, before
+  /// it is attached to a Array instance.
+  template <typename T>
+  inline T* GetMutableValues(int i) {
+    return GetMutableValues<T>(i, offset);
+  }
+
+  /// \brief Construct a zero-copy slice of the data with the given offset and length
+  ///
+  /// This method applies the given slice to this ArrayData, taking into account
+  /// its existing offset and length.
+  /// If the given `length` is too large, the slice length is clamped so as not
+  /// to go past the offset end.
+  /// If the given `often` is too large, or if either `offset` or `length` is negative,
+  /// behavior is undefined.
+  ///
+  /// The associated ArrayStatistics is always discarded in a sliced
+  /// ArrayData, even if the slice is trivially equal to the original ArrayData.
+  /// If you want to reuse the statistics from the original ArrayData, you must
+  /// explicitly reattach them.
+  std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;
+
+  /// \brief Construct a zero-copy slice of the data with the given offset and length
+  ///
+  /// Like `Slice(offset, length)`, but returns an error if the requested slice
+  /// falls out of bounds.
+  /// Unlike Slice, `length` isn't clamped to the available buffer size.
+  Result<std::shared_ptr<ArrayData>> SliceSafe(int64_t offset, int64_t length) const;
+
+  /// \brief Set the cached physical null count
+  ///
+  /// \param v the number of nulls in the ArrayData
+  ///
+  /// This should only be used when initially populating the ArrayData, if
+  /// it possible to compute the null count without visiting the entire validity
+  /// bitmap. In most cases, relying on `GetNullCount` is sufficient.
+  void SetNullCount(int64_t v) { null_count.store(v); }
+
+  /// \brief Return the physical null count
+  ///
+  /// This method returns the number of array elements for which `IsValid` would
+  /// return false.
+  ///
+  /// A cached value is returned if already available, otherwise it is first
+  /// computed and stored.
+  /// How it is is computed depends on the data type, see `IsValid` for details.
+  ///
+  /// Note that this method is typically much faster than calling `IsValid`
+  /// for all elements. Therefore, it helps avoid per-element validity bitmap
+  /// lookups in the common cases where the array contains zero or only nulls.
+  int64_t GetNullCount() const;
+
+  /// \brief Return true if the array may have nulls in its validity bitmap
+  ///
+  /// This method returns true if the data has a validity bitmap, and the physical
+  /// null count is either known to be non-zero or not yet known.
+  ///
+  /// Unlike `MayHaveLogicalNulls`, this does not check for the presence of nulls
+  /// in child data for data types such as unions and run-end encoded types.
+  ///
+  /// \see HasValidityBitmap
+  /// \see MayHaveLogicalNulls
+  bool MayHaveNulls() const {
+    // If an ArrayData is slightly malformed it may have kUnknownNullCount set
+    // but no buffer
+    return null_count.load() != 0 && buffers[0] != NULLPTR;
+  }
+
+  /// \brief Return true if the array has a validity bitmap
+  bool HasValidityBitmap() const { return buffers[0] != NULLPTR; }
+
+  /// \brief Return true if the array may have logical nulls
+  ///
+  /// Unlike `MayHaveNulls`, this method checks for null child values
+  /// for types without a validity bitmap, such as unions and run-end encoded
+  /// types, and for null dictionary values for dictionary types.
+  ///
+  /// This implies that `MayHaveLogicalNulls` may return true for arrays that
+  /// don't have a top-level validity bitmap. It is therefore necessary
+  /// to call `HasValidityBitmap` before accessing a top-level validity bitmap.
+  ///
+  /// Code that previously used MayHaveNulls and then dealt with the validity
+  /// bitmap directly can be fixed to handle all types correctly without
+  /// performance degradation when handling most types by adopting
+  /// HasValidityBitmap and MayHaveLogicalNulls.
+  ///
+  /// Before:
+  ///
+  ///     uint8_t* validity = array.MayHaveNulls() ? array.buffers[0].data : NULLPTR;
+  ///     for (int64_t i = 0; i < array.length; ++i) {
+  ///       if (validity && !bit_util::GetBit(validity, i)) {
+  ///         continue;  // skip a NULL
+  ///       }
+  ///       ...
+  ///     }
+  ///
+  /// After:
+  ///
+  ///     bool all_valid = !array.MayHaveLogicalNulls();
+  ///     uint8_t* validity = array.HasValidityBitmap() ? array.buffers[0].data : NULLPTR;
+  ///     for (int64_t i = 0; i < array.length; ++i) {
+  ///       bool is_valid = all_valid ||
+  ///                       (validity && bit_util::GetBit(validity, i)) ||
+  ///                       array.IsValid(i);
+  ///       if (!is_valid) {
+  ///         continue;  // skip a NULL
+  ///       }
+  ///       ...
+  ///     }
+  bool MayHaveLogicalNulls() const {
+    if (buffers[0] != NULLPTR) {
+      return null_count.load() != 0;
+    }
+    const auto t = type->id();
+    if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
+      return internal::UnionMayHaveLogicalNulls(*this);
+    }
+    if (t == Type::RUN_END_ENCODED) {
+      return internal::RunEndEncodedMayHaveLogicalNulls(*this);
+    }
+    if (t == Type::DICTIONARY) {
+      return internal::DictionaryMayHaveLogicalNulls(*this);
+    }
+    return null_count.load() != 0;
+  }
+
+  /// \brief Compute the logical null count for arrays of all types
+  ///
+  /// If the array has a validity bitmap, this function behaves the same as
+  /// GetNullCount. For arrays that have no validity bitmap but whose values
+  /// may be logically null (such as union arrays and run-end encoded arrays),
+  /// this function recomputes the null count every time it is called.
+  ///
+  /// \see GetNullCount
+  int64_t ComputeLogicalNullCount() const;
+
+  /// \brief Return the device_type of the underlying buffers and children
+  ///
+  /// If there are no buffers in this ArrayData object, it just returns
+  /// DeviceAllocationType::kCPU as a default. We also assume that all buffers
+  /// should be allocated on the same device type and perform DCHECKs to confirm
+  /// this in debug mode.
+  ///
+  /// \return DeviceAllocationType
+  DeviceAllocationType device_type() const;
+
+  std::shared_ptr<DataType> type;
+  int64_t length = 0;
+  mutable std::atomic<int64_t> null_count{0};
+  // The logical start point into the physical buffers (in values, not bytes).
+  // Note that, for child data, this must be *added* to the child data's own offset.
+  int64_t offset = 0;
+  std::vector<std::shared_ptr<Buffer>> buffers;
+  std::vector<std::shared_ptr<ArrayData>> child_data;
+
+  // The dictionary for this Array, if any. Only used for dictionary type
+  std::shared_ptr<ArrayData> dictionary;
+
+  // The statistics for this Array.
+  std::shared_ptr<ArrayStatistics> statistics;
+};
+
+/// \brief A non-owning Buffer reference
+struct ARROW_EXPORT BufferSpan {
+  // It is the user of this class's responsibility to ensure that
+  // buffers that were const originally are not written to
+  // accidentally.
+  uint8_t* data = NULLPTR;
+  int64_t size = 0;
+  // Pointer back to buffer that owns this memory
+  const std::shared_ptr<Buffer>* owner = NULLPTR;
+
+  template <typename T>
+  const T* data_as() const {
+    return reinterpret_cast<const T*>(data);
+  }
+  template <typename T>
+  T* mutable_data_as() {
+    return reinterpret_cast<T*>(data);
+  }
+};
+
+/// \brief EXPERIMENTAL: A non-owning array data container
+///
+/// Unlike ArrayData, this class doesn't own its referenced data type nor data buffers.
+/// It is cheaply copyable and can therefore be suitable for use cases where
+/// shared_ptr overhead is not acceptable. However, care should be taken to
+/// keep alive the referenced objects and memory while the ArraySpan object is in use.
+/// For this reason, this should not be exposed in most public APIs (apart from
+/// compute kernel interfaces).
+struct ARROW_EXPORT ArraySpan {
+  const DataType* type = NULLPTR;
+  int64_t length = 0;
+  mutable int64_t null_count = kUnknownNullCount;
+  int64_t offset = 0;
+  BufferSpan buffers[3];
+
+  ArraySpan() = default;
+
+  explicit ArraySpan(const DataType* type, int64_t length) : type(type), length(length) {}
+
+  ArraySpan(const ArrayData& data) {  // NOLINT implicit conversion
+    SetMembers(data);
+  }
+  explicit ArraySpan(const Scalar& data) { FillFromScalar(data); }
+
+  /// If dictionary-encoded, put dictionary in the first entry
+  std::vector<ArraySpan> child_data;
+
+  /// \brief Populate ArraySpan to look like an array of length 1 pointing at
+  /// the data members of a Scalar value
+  void FillFromScalar(const Scalar& value);
+
+  void SetMembers(const ArrayData& data);
+
+  void SetBuffer(int index, const std::shared_ptr<Buffer>& buffer) {
+    this->buffers[index].data = const_cast<uint8_t*>(buffer->data());
+    this->buffers[index].size = buffer->size();
+    this->buffers[index].owner = &buffer;
+  }
+
+  const ArraySpan& dictionary() const { return child_data[0]; }
+
+  /// \brief Return the number of buffers (out of 3) that are used to
+  /// constitute this array
+  int num_buffers() const;
+
+  // Access a buffer's data as a typed C pointer
+  template <typename T>
+  inline T* GetValues(int i, int64_t absolute_offset) {
+    return reinterpret_cast<T*>(buffers[i].data) + absolute_offset;
+  }
+
+  template <typename T>
+  inline T* GetValues(int i) {
+    return GetValues<T>(i, this->offset);
+  }
+
+  // Access a buffer's data as a typed C pointer
+  template <typename T>
+  inline const T* GetValues(int i, int64_t absolute_offset) const {
+    return reinterpret_cast<const T*>(buffers[i].data) + absolute_offset;
+  }
+
+  template <typename T>
+  inline const T* GetValues(int i) const {
+    return GetValues<T>(i, this->offset);
+  }
+
+  /// \brief Access a buffer's data as a span
+  ///
+  /// \param i The buffer index
+  /// \param length The required length (in number of typed values) of the requested span
+  /// \pre i > 0
+  /// \pre length <= the length of the buffer (in number of values) that's expected for
+  /// this array type
+  /// \return A span<const T> of the requested length
+  template <typename T>
+  util::span<const T> GetSpan(int i, int64_t length) const {
+    const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
+    assert(i > 0 && length + offset <= buffer_length);
+    ARROW_UNUSED(buffer_length);
+    return util::span<const T>(buffers[i].data_as<T>() + this->offset, length);
+  }
+
+  /// \brief Access a buffer's data as a span
+  ///
+  /// \param i The buffer index
+  /// \param length The required length (in number of typed values) of the requested span
+  /// \pre i > 0
+  /// \pre length <= the length of the buffer (in number of values) that's expected for
+  /// this array type
+  /// \return A span<T> of the requested length
+  template <typename T>
+  util::span<T> GetSpan(int i, int64_t length) {
+    const int64_t buffer_length = buffers[i].size / static_cast<int64_t>(sizeof(T));
+    assert(i > 0 && length + offset <= buffer_length);
+    ARROW_UNUSED(buffer_length);
+    return util::span<T>(buffers[i].mutable_data_as<T>() + this->offset, length);
+  }
+
+  inline bool IsNull(int64_t i) const { return !IsValid(i); }
+
+  inline bool IsValid(int64_t i) const {
+    if (this->buffers[0].data != NULLPTR) {
+      return bit_util::GetBit(this->buffers[0].data, i + this->offset);
+    } else {
+      const auto type = this->type->id();
+      if (type == Type::SPARSE_UNION) {
+        return !IsNullSparseUnion(i);
+      }
+      if (type == Type::DENSE_UNION) {
+        return !IsNullDenseUnion(i);
+      }
+      if (type == Type::RUN_END_ENCODED) {
+        return !IsNullRunEndEncoded(i);
+      }
+      return this->null_count != this->length;
+    }
+  }
+
+  std::shared_ptr<ArrayData> ToArrayData() const;
+
+  std::shared_ptr<Array> ToArray() const;
+
+  std::shared_ptr<Buffer> GetBuffer(int index) const {
+    const BufferSpan& buf = this->buffers[index];
+    if (buf.owner) {
+      return *buf.owner;
+    } else if (buf.data != NULLPTR) {
+      // Buffer points to some memory without an owning buffer
+      return std::make_shared<Buffer>(buf.data, buf.size);
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  void SetSlice(int64_t offset, int64_t length) {
+    this->offset = offset;
+    this->length = length;
+    if (this->type->id() == Type::NA) {
+      this->null_count = this->length;
+    } else if (buffers[0].data != NULLPTR) {
+      this->null_count = kUnknownNullCount;
+    } else {
+      this->null_count = 0;
+    }
+  }
+
+  /// \brief Return physical null count, or compute and set it if it's not known
+  int64_t GetNullCount() const;
+
+  /// \brief Return true if the array has a validity bitmap and the physical null
+  /// count is known to be non-zero or not yet known
+  ///
+  /// Note that this is not the same as MayHaveLogicalNulls, which also checks
+  /// for the presence of nulls in child data for types like unions and run-end
+  /// encoded types.
+  ///
+  /// \see HasValidityBitmap
+  /// \see MayHaveLogicalNulls
+  bool MayHaveNulls() const {
+    // If an ArrayData is slightly malformed it may have kUnknownNullCount set
+    // but no buffer
+    return null_count != 0 && buffers[0].data != NULLPTR;
+  }
+
+  /// \brief Return true if the array has a validity bitmap
+  bool HasValidityBitmap() const { return buffers[0].data != NULLPTR; }
+
+  /// \brief Return true if the validity bitmap may have 0's in it, or if the
+  /// child arrays (in the case of types without a validity bitmap) may have
+  /// nulls, or if the dictionary of dictionay array may have nulls.
+  ///
+  /// \see ArrayData::MayHaveLogicalNulls
+  bool MayHaveLogicalNulls() const {
+    if (buffers[0].data != NULLPTR) {
+      return null_count != 0;
+    }
+    const auto t = type->id();
+    if (t == Type::SPARSE_UNION || t == Type::DENSE_UNION) {
+      return UnionMayHaveLogicalNulls();
+    }
+    if (t == Type::RUN_END_ENCODED) {
+      return RunEndEncodedMayHaveLogicalNulls();
+    }
+    if (t == Type::DICTIONARY) {
+      return DictionaryMayHaveLogicalNulls();
+    }
+    return null_count != 0;
+  }
+
+  /// \brief Compute the logical null count for arrays of all types including
+  /// those that do not have a validity bitmap like union and run-end encoded
+  /// arrays
+  ///
+  /// If the array has a validity bitmap, this function behaves the same as
+  /// GetNullCount. For types that have no validity bitmap, this function will
+  /// recompute the logical null count every time it is called.
+  ///
+  /// \see GetNullCount
+  int64_t ComputeLogicalNullCount() const;
+
+  /// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic
+  /// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into
+  /// buffers[2]; IE buffers[2].data points to the first shared_ptr<Buffer> of the
+  /// variadic set and buffers[2].size is the number of variadic buffers times
+  /// sizeof(shared_ptr<Buffer>).
+  ///
+  /// \see HasVariadicBuffers
+  util::span<const std::shared_ptr<Buffer>> GetVariadicBuffers() const;
+  bool HasVariadicBuffers() const;
+
+ private:
+  ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& data,
+                                                                int64_t i);
+
+  bool IsNullSparseUnion(int64_t i) const;
+  bool IsNullDenseUnion(int64_t i) const;
+
+  /// \brief Return true if the value at logical index i is null
+  ///
+  /// This function uses binary-search, so it has a O(log N) cost.
+  /// Iterating over the whole array and calling IsNull is O(N log N), so
+  /// for better performance it is recommended to use a
+  /// ree_util::RunEndEncodedArraySpan to iterate run by run instead.
+  bool IsNullRunEndEncoded(int64_t i) const;
+
+  bool UnionMayHaveLogicalNulls() const;
+  bool RunEndEncodedMayHaveLogicalNulls() const;
+  bool DictionaryMayHaveLogicalNulls() const;
+};
+
+namespace internal {
+
+void FillZeroLengthArray(const DataType* type, ArraySpan* span);
+
+/// Construct a zero-copy view of this ArrayData with the given type.
+///
+/// This method checks if the types are layout-compatible.
+/// Nested types are traversed in depth-first order. Data buffers must have
+/// the same item sizes, even though the logical types may be different.
+/// An error is returned if the types are not layout-compatible.
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> GetArrayView(const std::shared_ptr<ArrayData>& data,
+                                                const std::shared_ptr<DataType>& type);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/diff.h b/pyarrow/include/arrow/array/diff.h
new file mode 100644
index 0000000000000000000000000000000000000000..a405164b333f3b21a17e8414ef59a8a628c28579
--- /dev/null
+++ b/pyarrow/include/arrow/array/diff.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <iosfwd>
+#include <memory>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/array_nested.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Compare two arrays, returning an edit script which expresses the difference
+/// between them
+///
+/// An edit script is an array of struct(insert: bool, run_length: int64_t).
+/// Each element of "insert" determines whether an element was inserted into (true)
+/// or deleted from (false) base. Each insertion or deletion is followed by a run of
+/// elements which are unchanged from base to target; the length of this run is stored
+/// in "run_length". (Note that the edit script begins and ends with a run of shared
+/// elements but both fields of the struct must have the same length. To accommodate this
+/// the first element of "insert" should be ignored.)
+///
+/// For example for base "hlloo" and target "hello", the edit script would be
+/// [
+///   {"insert": false, "run_length": 1}, // leading run of length 1 ("h")
+///   {"insert": true, "run_length": 3}, // insert("e") then a run of length 3 ("llo")
+///   {"insert": false, "run_length": 0} // delete("o") then an empty run
+/// ]
+///
+/// Diffing arrays containing nulls is not currently supported.
+///
+/// \param[in] base baseline for comparison
+/// \param[in] target an array of identical type to base whose elements differ from base's
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return an edit script array which can be applied to base to produce target
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> Diff(const Array& base, const Array& target,
+                                          MemoryPool* pool = default_memory_pool());
+
+/// \brief visitor interface for easy traversal of an edit script
+///
+/// visitor will be called for each hunk of insertions and deletions.
+ARROW_EXPORT Status VisitEditScript(
+    const Array& edits,
+    const std::function<Status(int64_t delete_begin, int64_t delete_end,
+                               int64_t insert_begin, int64_t insert_end)>& visitor);
+
+/// \brief return a function which will format an edit script in unified
+/// diff format to os, given base and target arrays of type
+ARROW_EXPORT Result<
+    std::function<Status(const Array& edits, const Array& base, const Array& target)>>
+MakeUnifiedDiffFormatter(const DataType& type, std::ostream* os);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/statistics.h b/pyarrow/include/arrow/array/statistics.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae78dca0b0c6b19b4ad4bfc8deb9962260afb466
--- /dev/null
+++ b/pyarrow/include/arrow/array/statistics.h
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <variant>
+
+#include "arrow/compare.h"
+#include "arrow/type.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \class ArrayStatistics
+/// \brief Statistics for an Array
+///
+/// Apache Arrow format doesn't have statistics but data source such
+/// as Apache Parquet may have statistics. Statistics associated with
+/// data source can be read unified API via this class.
+struct ARROW_EXPORT ArrayStatistics {
+  /// \brief The type for maximum and minimum values. If the target
+  /// value exists, one of them is used. `std::nullopt` is used
+  /// otherwise.
+  using ValueType = std::variant<bool, int64_t, uint64_t, double, std::string>;
+  using NumericType = std::variant<int64_t, double>;
+  using CountType = NumericType;
+  using SizeType = NumericType;
+
+  static const std::shared_ptr<DataType>& ValueToArrowType(
+      const std::optional<ValueType>& value,
+      const std::shared_ptr<DataType>& array_type) {
+    if (!value.has_value()) {
+      return null();
+    }
+
+    struct Visitor {
+      const std::shared_ptr<DataType>& array_type;
+
+      const std::shared_ptr<DataType>& operator()(const bool&) { return boolean(); }
+      const std::shared_ptr<DataType>& operator()(const int64_t&) { return int64(); }
+      const std::shared_ptr<DataType>& operator()(const uint64_t&) { return uint64(); }
+      const std::shared_ptr<DataType>& operator()(const double&) { return float64(); }
+      const std::shared_ptr<DataType>& operator()(const std::string&) {
+        switch (array_type->id()) {
+          case Type::STRING:
+          case Type::BINARY:
+          case Type::FIXED_SIZE_BINARY:
+          case Type::LARGE_STRING:
+          case Type::LARGE_BINARY:
+          case Type::BINARY_VIEW:
+          case Type::STRING_VIEW:
+            return array_type;
+          default:
+            return utf8();
+        }
+      }
+    } visitor{array_type};
+    return std::visit(visitor, value.value());
+  }
+
+  /// \brief The number of rows, may not be set
+  /// Note: when set to `int64_t`, it represents `exact_row_count`,
+  /// and when set to `double`, it represents `approximate_row_count`.
+  /// Note: this value is not used by \ref arrow::RecordBatch::MakeStatisticsArray.
+  std::optional<CountType> row_count = std::nullopt;
+
+  /// \brief The number of null values, may not be set
+  /// Note: when set to `int64_t`, it represents `exact_null_count`,
+  /// and when set to `double`, it represents `approximate_null_count`.
+  std::optional<CountType> null_count = std::nullopt;
+
+  /// \brief The number of distinct values, may not be set
+  /// Note: when set to `int64_t`, it represents `exact_distinct_count`,
+  /// and when set to `double`, it represents `approximate_distinct_count`.
+  std::optional<CountType> distinct_count = std::nullopt;
+
+  /// \brief The maximum length in bytes of the rows in an array; may not be set
+  /// Note: when the type is `int64_t`, it represents `max_byte_width_exact`,
+  /// and when the type is `double`, it represents `max_byte_width_approximate`.
+  std::optional<SizeType> max_byte_width = std::nullopt;
+
+  /// \brief The average size in bytes of a row in an array, may not be set.
+  std::optional<double> average_byte_width = std::nullopt;
+
+  /// \brief Whether the average size in bytes is exact or not.
+  bool is_average_byte_width_exact = false;
+
+  /// \brief The minimum value, may not be set
+  std::optional<ValueType> min = std::nullopt;
+
+  /// \brief Compute Arrow type of the minimum value.
+  ///
+  /// If \ref ValueType is `std::string`, `array_type` may be
+  /// used. If `array_type` is a binary-like type such as \ref
+  /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+  /// returned. \ref arrow::utf8 is returned otherwise.
+  ///
+  /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+  ///
+  /// \param array_type The Arrow type of the associated array.
+  ///
+  /// \return \ref arrow::null if the minimum value is `std::nullopt`,
+  ///         Arrow type based on \ref ValueType of the \ref min
+  ///         otherwise.
+  const std::shared_ptr<DataType>& MinArrowType(
+      const std::shared_ptr<DataType>& array_type) {
+    return ValueToArrowType(min, array_type);
+  }
+
+  /// \brief Whether the minimum value is exact or not
+  bool is_min_exact = false;
+
+  /// \brief The maximum value, may not be set
+  std::optional<ValueType> max = std::nullopt;
+
+  /// \brief Compute Arrow type of the maximum value.
+  ///
+  /// If \ref ValueType is `std::string`, `array_type` may be
+  /// used. If `array_type` is a binary-like type such as \ref
+  /// arrow::binary and \ref arrow::large_utf8, `array_type` is
+  /// returned. \ref arrow::utf8 is returned otherwise.
+  ///
+  /// If \ref ValueType isn't `std::string`, `array_type` isn't used.
+  ///
+  /// \param array_type The Arrow type of the associated array.
+  ///
+  /// \return \ref arrow::null if the maximum value is `std::nullopt`,
+  ///         Arrow type based on \ref ValueType of the \ref max
+  ///         otherwise.
+  const std::shared_ptr<DataType>& MaxArrowType(
+      const std::shared_ptr<DataType>& array_type) {
+    return ValueToArrowType(max, array_type);
+  }
+
+  /// \brief Whether the maximum value is exact or not
+  bool is_max_exact = false;
+
+  /// \brief Check two \ref arrow::ArrayStatistics for equality
+  ///
+  /// \param other The \ref arrow::ArrayStatistics instance to compare against.
+  ///
+  /// \param equal_options Options used to compare double values for equality.
+  ///
+  /// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise,
+  /// false.
+  bool Equals(const ArrayStatistics& other,
+              const EqualOptions& equal_options = EqualOptions::Defaults()) const {
+    return ArrayStatisticsEquals(*this, other, equal_options);
+  }
+
+  /// \brief Check two statistics for equality
+  bool operator==(const ArrayStatistics& other) const { return Equals(other); }
+
+  /// \brief Check two statistics for not equality
+  bool operator!=(const ArrayStatistics& other) const { return !Equals(other); }
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/util.h b/pyarrow/include/arrow/array/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd8e75ddb86405c523a8083f559dab0e72364e24
--- /dev/null
+++ b/pyarrow/include/arrow/array/util.h
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \defgroup array-factories Array factory functions
+///
+/// @{
+
+/// \brief Create a strongly-typed Array instance from generic ArrayData
+/// \param[in] data the array contents
+/// \return the resulting Array instance
+ARROW_EXPORT
+std::shared_ptr<Array> MakeArray(const std::shared_ptr<ArrayData>& data);
+
+/// \brief Create a strongly-typed Array instance with all elements null
+/// \param[in] type the array type
+/// \param[in] length the array length
+/// \param[in] pool the memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeArrayOfNull(const std::shared_ptr<DataType>& type,
+                                               int64_t length,
+                                               MemoryPool* pool = default_memory_pool());
+
+/// \brief Create an Array instance whose slots are the given scalar
+/// \param[in] scalar the value with which to fill the array
+/// \param[in] length the array length
+/// \param[in] pool the memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeArrayFromScalar(
+    const Scalar& scalar, int64_t length, MemoryPool* pool = default_memory_pool());
+
+/// \brief Create an empty Array of a given type
+///
+/// The output Array will be of the given type.
+///
+/// \param[in] type the data type of the empty Array
+/// \param[in] pool the memory pool to allocate memory from
+/// \return the resulting Array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> MakeEmptyArray(std::shared_ptr<DataType> type,
+                                              MemoryPool* pool = default_memory_pool());
+
+/// @}
+
+namespace internal {
+
+/// \brief Swap endian of each element in a generic ArrayData
+///
+/// As dictionaries are often shared between different arrays, dictionaries
+/// are not swapped by this function and should be handled separately.
+///
+/// \param[in] data the array contents
+/// \param[in] pool the memory pool to allocate memory from
+/// \return the resulting ArrayData whose elements were swapped
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> SwapEndianArrayData(
+    const std::shared_ptr<ArrayData>& data, MemoryPool* pool = default_memory_pool());
+
+/// Given a number of ArrayVectors, treat each ArrayVector as the
+/// chunks of a chunked array.  Then rechunk each ArrayVector such that
+/// all ArrayVectors are chunked identically.  It is mandatory that
+/// all ArrayVectors contain the same total number of elements.
+ARROW_EXPORT
+std::vector<ArrayVector> RechunkArraysConsistently(const std::vector<ArrayVector>&);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/array/validate.h b/pyarrow/include/arrow/array/validate.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ebfa0a51edce21ca585862b1dbb074b6cf8d9c8
--- /dev/null
+++ b/pyarrow/include/arrow/array/validate.h
@@ -0,0 +1,56 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// Internal functions implementing Array::Validate() and friends.
+
+// O(1) array metadata validation
+
+ARROW_EXPORT
+Status ValidateArray(const Array& array);
+
+ARROW_EXPORT
+Status ValidateArray(const ArrayData& data);
+
+// O(N) array data validation.
+// Note that, starting from 7.0.0, "full" routines also validate metadata.
+// Before, ValidateArray() needed to be called before ValidateArrayFull()
+// to ensure metadata correctness, otherwise invalid memory accesses
+// may occur.
+
+ARROW_EXPORT
+Status ValidateArrayFull(const Array& array);
+
+ARROW_EXPORT
+Status ValidateArrayFull(const ArrayData& data);
+
+ARROW_EXPORT
+Status ValidateUTF8(const Array& array);
+
+ARROW_EXPORT
+Status ValidateUTF8(const ArrayData& data);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/buffer.h b/pyarrow/include/arrow/buffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce909a3ea182f4d1d8fb294512ccc74e55bf0030
--- /dev/null
+++ b/pyarrow/include/arrow/buffer.h
@@ -0,0 +1,587 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "arrow/device.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/span.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Buffer classes
+
+/// \class Buffer
+/// \brief Object containing a pointer to a piece of contiguous memory with a
+/// particular size.
+///
+/// Buffers have two related notions of length: size and capacity. Size is
+/// the number of bytes that might have valid data. Capacity is the number
+/// of bytes that were allocated for the buffer in total.
+///
+/// The Buffer base class does not own its memory, but subclasses often do.
+///
+/// The following invariant is always true: Size <= Capacity
+class ARROW_EXPORT Buffer {
+ public:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Buffer);
+
+  /// \brief Construct from buffer and size without copying memory
+  ///
+  /// \param[in] data a memory buffer
+  /// \param[in] size buffer size
+  ///
+  /// \note The passed memory must be kept alive through some other means
+  Buffer(const uint8_t* data, int64_t size)
+      : is_mutable_(false),
+        is_cpu_(true),
+        data_(data),
+        size_(size),
+        capacity_(size),
+        device_type_(DeviceAllocationType::kCPU) {
+    SetMemoryManager(default_cpu_memory_manager());
+  }
+
+  Buffer(const uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm,
+         std::shared_ptr<Buffer> parent = NULLPTR,
+         std::optional<DeviceAllocationType> device_type_override = std::nullopt)
+      : is_mutable_(false),
+        data_(data),
+        size_(size),
+        capacity_(size),
+        parent_(std::move(parent)) {
+    // SetMemoryManager will also set device_type_
+    SetMemoryManager(std::move(mm));
+    // If a device type is specified, use that instead. Example of when this can be
+    // useful: the CudaMemoryManager can set device_type_ to kCUDA, but you can specify
+    // device_type_override=kCUDA_HOST as the device type to override it.
+    if (device_type_override != std::nullopt) {
+      device_type_ = *device_type_override;
+    }
+  }
+
+  Buffer(uintptr_t address, int64_t size, std::shared_ptr<MemoryManager> mm,
+         std::shared_ptr<Buffer> parent = NULLPTR)
+      : Buffer(reinterpret_cast<const uint8_t*>(address), size, std::move(mm),
+               std::move(parent)) {}
+
+  /// \brief Construct from string_view without copying memory
+  ///
+  /// \param[in] data a string_view object
+  ///
+  /// \note The memory viewed by data must not be deallocated in the lifetime of the
+  /// Buffer; temporary rvalue strings must be stored in an lvalue somewhere
+  explicit Buffer(std::string_view data)
+      : Buffer(reinterpret_cast<const uint8_t*>(data.data()),
+               static_cast<int64_t>(data.size())) {}
+
+  virtual ~Buffer() = default;
+
+  /// An offset into data that is owned by another buffer, but we want to be
+  /// able to retain a valid pointer to it even after other shared_ptr's to the
+  /// parent buffer have been destroyed
+  ///
+  /// This method makes no assertions about alignment or padding of the buffer but
+  /// in general we expected buffers to be aligned and padded to 64 bytes.  In the future
+  /// we might add utility methods to help determine if a buffer satisfies this contract.
+  Buffer(std::shared_ptr<Buffer> parent, const int64_t offset, const int64_t size)
+      : Buffer(parent->data_ + offset, size) {
+    parent_ = std::move(parent);
+    SetMemoryManager(parent_->memory_manager_);
+  }
+
+  uint8_t operator[](std::size_t i) const { return data_[i]; }
+
+  /// \brief Construct a new std::string with a hexadecimal representation of the buffer.
+  /// \return std::string
+  std::string ToHexString();
+
+  /// Return true if both buffers are the same size and contain the same bytes
+  /// up to the number of compared bytes
+  bool Equals(const Buffer& other, int64_t nbytes) const;
+
+  /// Return true if both buffers are the same size and contain the same bytes
+  bool Equals(const Buffer& other) const;
+
+  /// Copy a section of the buffer into a new Buffer.
+  Result<std::shared_ptr<Buffer>> CopySlice(
+      const int64_t start, const int64_t nbytes,
+      MemoryPool* pool = default_memory_pool()) const;
+
+  /// Zero bytes in padding, i.e. bytes between size_ and capacity_.
+  void ZeroPadding() {
+#ifndef NDEBUG
+    CheckMutable();
+#endif
+    // A zero-capacity buffer can have a null data pointer
+    if (capacity_ != 0) {
+      memset(mutable_data() + size_, 0, static_cast<size_t>(capacity_ - size_));
+    }
+  }
+
+  /// \brief Construct an immutable buffer that takes ownership of the contents
+  /// of an std::string (without copying it).
+  ///
+  /// \param[in] data a string to own
+  /// \return a new Buffer instance
+  static std::shared_ptr<Buffer> FromString(std::string data);
+
+  /// \brief Construct an immutable buffer that takes ownership of the contents
+  /// of an std::vector (without copying it). Only vectors of TrivialType objects
+  /// (integers, floating point numbers, ...) can be wrapped by this function.
+  ///
+  /// \param[in] vec a vector to own
+  /// \return a new Buffer instance
+  template <typename T>
+  static std::shared_ptr<Buffer> FromVector(std::vector<T> vec) {
+    static_assert(std::is_trivial_v<T>,
+                  "Buffer::FromVector can only wrap vectors of trivial objects");
+
+    if (vec.empty()) {
+      return std::shared_ptr<Buffer>{new Buffer()};
+    }
+
+    auto* data = reinterpret_cast<uint8_t*>(vec.data());
+    auto size_in_bytes = static_cast<int64_t>(vec.size() * sizeof(T));
+    return std::shared_ptr<Buffer>{
+        new Buffer{data, size_in_bytes},
+        // Keep the vector's buffer alive inside the shared_ptr's destructor until after
+        // we have deleted the Buffer. Note we can't use this trick in FromString since
+        // std::string's data is inline for short strings so moving invalidates pointers
+        // into the string's buffer.
+        [vec = std::move(vec)](Buffer* buffer) { delete buffer; }};
+  }
+
+  /// \brief Create buffer referencing typed memory with some length without
+  /// copying
+  /// \param[in] data the typed memory as C array
+  /// \param[in] length the number of values in the array
+  /// \return a new shared_ptr<Buffer>
+  template <typename T, typename SizeType = int64_t>
+  static std::shared_ptr<Buffer> Wrap(const T* data, SizeType length) {
+    return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data),
+                                    static_cast<int64_t>(sizeof(T) * length));
+  }
+
+  /// \brief Create buffer referencing std::vector with some length without
+  /// copying
+  /// \param[in] data the vector to be referenced. If this vector is changed,
+  /// the buffer may become invalid
+  /// \return a new shared_ptr<Buffer>
+  template <typename T>
+  static std::shared_ptr<Buffer> Wrap(const std::vector<T>& data) {
+    return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(data.data()),
+                                    static_cast<int64_t>(sizeof(T) * data.size()));
+  }
+
+  /// \brief Copy buffer contents into a new std::string
+  /// \return std::string
+  /// \note Can throw std::bad_alloc if buffer is large
+  std::string ToString() const;
+
+  /// \brief View buffer contents as a std::string_view
+  /// \return std::string_view
+  explicit operator std::string_view() const {
+    return {reinterpret_cast<const char*>(data_), static_cast<size_t>(size_)};
+  }
+
+  /// \brief Return a pointer to the buffer's data
+  ///
+  /// The buffer has to be a CPU buffer (`is_cpu()` is true).
+  /// Otherwise, an assertion may be thrown or a null pointer may be returned.
+  ///
+  /// To get the buffer's data address regardless of its device, call `address()`.
+  const uint8_t* data() const {
+#ifndef NDEBUG
+    CheckCPU();
+#endif
+    return ARROW_PREDICT_TRUE(is_cpu_) ? data_ : NULLPTR;
+  }
+
+  /// \brief Return a pointer to the buffer's data cast to a specific type
+  ///
+  /// The buffer has to be a CPU buffer (`is_cpu()` is true).
+  /// Otherwise, an assertion may be thrown or a null pointer may be returned.
+  template <typename T>
+  const T* data_as() const {
+    return reinterpret_cast<const T*>(data());
+  }
+
+  /// \brief Return the buffer's data as a span
+  template <typename T>
+  util::span<const T> span_as() const {
+    return util::span(data_as<T>(), static_cast<size_t>(size() / sizeof(T)));
+  }
+
+  /// \brief Return a writable pointer to the buffer's data
+  ///
+  /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
+  /// are true).  Otherwise, an assertion may be thrown or a null pointer may
+  /// be returned.
+  ///
+  /// To get the buffer's mutable data address regardless of its device, call
+  /// `mutable_address()`.
+  uint8_t* mutable_data() {
+#ifndef NDEBUG
+    CheckCPU();
+    CheckMutable();
+#endif
+    return ARROW_PREDICT_TRUE(is_cpu_ && is_mutable_) ? const_cast<uint8_t*>(data_)
+                                                      : NULLPTR;
+  }
+
+  /// \brief Return a writable pointer to the buffer's data cast to a specific type
+  ///
+  /// The buffer has to be a mutable CPU buffer (`is_cpu()` and `is_mutable()`
+  /// are true).  Otherwise, an assertion may be thrown or a null pointer may
+  /// be returned.
+  template <typename T>
+  T* mutable_data_as() {
+    return reinterpret_cast<T*>(mutable_data());
+  }
+
+  /// \brief Return the buffer's mutable data as a span
+  template <typename T>
+  util::span<T> mutable_span_as() {
+    return util::span(mutable_data_as<T>(), static_cast<size_t>(size() / sizeof(T)));
+  }
+
+  /// \brief Return the device address of the buffer's data
+  uintptr_t address() const { return reinterpret_cast<uintptr_t>(data_); }
+
+  /// \brief Return a writable device address to the buffer's data
+  ///
+  /// The buffer has to be a mutable buffer (`is_mutable()` is true).
+  /// Otherwise, an assertion may be thrown or 0 may be returned.
+  uintptr_t mutable_address() const {
+#ifndef NDEBUG
+    CheckMutable();
+#endif
+    return ARROW_PREDICT_TRUE(is_mutable_) ? reinterpret_cast<uintptr_t>(data_) : 0;
+  }
+
+  /// \brief Return the buffer's size in bytes
+  int64_t size() const { return size_; }
+
+  /// \brief Return the buffer's capacity (number of allocated bytes)
+  int64_t capacity() const { return capacity_; }
+
+  /// \brief Whether the buffer is directly CPU-accessible
+  ///
+  /// If this function returns true, you can read directly from the buffer's
+  /// `data()` pointer.  Otherwise, you'll have to `View()` or `Copy()` it.
+  bool is_cpu() const { return is_cpu_; }
+
+  /// \brief Whether the buffer is mutable
+  ///
+  /// If this function returns true, you are allowed to modify buffer contents
+  /// using the pointer returned by `mutable_data()` or `mutable_address()`.
+  bool is_mutable() const { return is_mutable_; }
+
+  const std::shared_ptr<Device>& device() const { return memory_manager_->device(); }
+
+  const std::shared_ptr<MemoryManager>& memory_manager() const { return memory_manager_; }
+
+  DeviceAllocationType device_type() const { return device_type_; }
+
+  std::shared_ptr<Buffer> parent() const { return parent_; }
+
+  /// \brief Get a RandomAccessFile for reading a buffer
+  ///
+  /// The returned file object reads from this buffer's underlying memory.
+  static Result<std::shared_ptr<io::RandomAccessFile>> GetReader(std::shared_ptr<Buffer>);
+
+  /// \brief Get a OutputStream for writing to a buffer
+  ///
+  /// The buffer must be mutable.  The returned stream object writes into the buffer's
+  /// underlying memory (but it won't resize it).
+  static Result<std::shared_ptr<io::OutputStream>> GetWriter(std::shared_ptr<Buffer>);
+
+  /// \brief Copy buffer
+  ///
+  /// The buffer contents will be copied into a new buffer allocated by the
+  /// given MemoryManager.  This function supports cross-device copies.
+  static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
+                                              const std::shared_ptr<MemoryManager>& to);
+
+  /// \brief Copy a non-owned buffer
+  ///
+  /// This is useful for cases where the source memory area is externally managed
+  /// (its lifetime not tied to the source Buffer), otherwise please use Copy().
+  static Result<std::unique_ptr<Buffer>> CopyNonOwned(
+      const Buffer& source, const std::shared_ptr<MemoryManager>& to);
+
+  /// \brief View buffer
+  ///
+  /// Return a Buffer that reflects this buffer, seen potentially from another
+  /// device, without making an explicit copy of the contents.  The underlying
+  /// mechanism is typically implemented by the kernel or device driver, and may
+  /// involve lazy caching of parts of the buffer contents on the destination
+  /// device's memory.
+  ///
+  /// If a non-copy view is unsupported for the buffer on the given device,
+  /// nullptr is returned.  An error can be returned if some low-level
+  /// operation fails (such as an out-of-memory condition).
+  static Result<std::shared_ptr<Buffer>> View(std::shared_ptr<Buffer> source,
+                                              const std::shared_ptr<MemoryManager>& to);
+
+  /// \brief View or copy buffer
+  ///
+  /// Try to view buffer contents on the given MemoryManager's device, but
+  /// fall back to copying if a no-copy view isn't supported.
+  static Result<std::shared_ptr<Buffer>> ViewOrCopy(
+      std::shared_ptr<Buffer> source, const std::shared_ptr<MemoryManager>& to);
+
+  virtual std::shared_ptr<Device::SyncEvent> device_sync_event() const { return NULLPTR; }
+
+ protected:
+  bool is_mutable_;
+  bool is_cpu_;
+  const uint8_t* data_;
+  int64_t size_;
+  int64_t capacity_;
+  DeviceAllocationType device_type_;
+
+  // null by default, but may be set
+  std::shared_ptr<Buffer> parent_;
+
+ private:
+  // private so that subclasses are forced to call SetMemoryManager()
+  std::shared_ptr<MemoryManager> memory_manager_;
+
+ protected:
+  Buffer();
+
+  void CheckMutable() const;
+  void CheckCPU() const;
+
+  void SetMemoryManager(std::shared_ptr<MemoryManager> mm) {
+    memory_manager_ = std::move(mm);
+    is_cpu_ = memory_manager_->is_cpu();
+    device_type_ = memory_manager_->device()->device_type();
+  }
+};
+
+/// \defgroup buffer-slicing-functions Functions for slicing buffers
+///
+/// @{
+
+/// \brief Construct a view on a buffer at the given offset and length.
+///
+/// This function cannot fail and does not check for errors (except in debug builds)
+static inline std::shared_ptr<Buffer> SliceBuffer(std::shared_ptr<Buffer> buffer,
+                                                  const int64_t offset,
+                                                  const int64_t length) {
+  return std::make_shared<Buffer>(std::move(buffer), offset, length);
+}
+
+/// \brief Construct a view on a buffer at the given offset, up to the buffer's end.
+///
+/// This function cannot fail and does not check for errors (except in debug builds)
+static inline std::shared_ptr<Buffer> SliceBuffer(std::shared_ptr<Buffer> buffer,
+                                                  const int64_t offset) {
+  int64_t length = buffer->size() - offset;
+  return SliceBuffer(std::move(buffer), offset, length);
+}
+
+/// \brief Input-checking version of SliceBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceBufferSafe(std::shared_ptr<Buffer> buffer,
+                                                int64_t offset);
+/// \brief Input-checking version of SliceBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceBufferSafe(std::shared_ptr<Buffer> buffer,
+                                                int64_t offset, int64_t length);
+
+/// \brief Like SliceBuffer, but construct a mutable buffer slice.
+///
+/// If the parent buffer is not mutable, behavior is undefined (it may abort
+/// in debug builds).
+ARROW_EXPORT
+std::shared_ptr<Buffer> SliceMutableBuffer(std::shared_ptr<Buffer> buffer,
+                                           const int64_t offset, const int64_t length);
+
+/// \brief Like SliceBuffer, but construct a mutable buffer slice.
+///
+/// If the parent buffer is not mutable, behavior is undefined (it may abort
+/// in debug builds).
+static inline std::shared_ptr<Buffer> SliceMutableBuffer(std::shared_ptr<Buffer> buffer,
+                                                         const int64_t offset) {
+  int64_t length = buffer->size() - offset;
+  return SliceMutableBuffer(std::move(buffer), offset, length);
+}
+
+/// \brief Input-checking version of SliceMutableBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(std::shared_ptr<Buffer> buffer,
+                                                       int64_t offset);
+/// \brief Input-checking version of SliceMutableBuffer
+///
+/// An Invalid Status is returned if the requested slice falls out of bounds.
+/// Note that unlike SliceBuffer, `length` isn't clamped to the available buffer size.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SliceMutableBufferSafe(std::shared_ptr<Buffer> buffer,
+                                                       int64_t offset, int64_t length);
+
+/// @}
+
+/// \class MutableBuffer
+/// \brief A Buffer whose contents can be mutated. May or may not own its data.
+class ARROW_EXPORT MutableBuffer : public Buffer {
+ public:
+  MutableBuffer(uint8_t* data, const int64_t size) : Buffer(data, size) {
+    is_mutable_ = true;
+  }
+
+  MutableBuffer(uint8_t* data, const int64_t size, std::shared_ptr<MemoryManager> mm)
+      : Buffer(data, size, std::move(mm)) {
+    is_mutable_ = true;
+  }
+
+  MutableBuffer(const std::shared_ptr<Buffer>& parent, const int64_t offset,
+                const int64_t size);
+
+  /// \brief Create buffer referencing typed memory with some length
+  /// \param[in] data the typed memory as C array
+  /// \param[in] length the number of values in the array
+  /// \return a new shared_ptr<Buffer>
+  template <typename T, typename SizeType = int64_t>
+  static std::shared_ptr<Buffer> Wrap(T* data, SizeType length) {
+    return std::make_shared<MutableBuffer>(reinterpret_cast<uint8_t*>(data),
+                                           static_cast<int64_t>(sizeof(T) * length));
+  }
+
+ protected:
+  MutableBuffer() : Buffer(NULLPTR, 0) {}
+};
+
+/// \class ResizableBuffer
+/// \brief A mutable buffer that can be resized
+class ARROW_EXPORT ResizableBuffer : public MutableBuffer {
+ public:
+  /// Change buffer reported size to indicated size, allocating memory if
+  /// necessary.  This will ensure that the capacity of the buffer is a multiple
+  /// of 64 bytes as defined in Layout.md.
+  /// Consider using ZeroPadding afterwards, to conform to the Arrow layout
+  /// specification.
+  ///
+  /// @param new_size The new size for the buffer.
+  /// @param shrink_to_fit Whether to shrink the capacity if new size < current size
+  virtual Status Resize(const int64_t new_size, bool shrink_to_fit) = 0;
+  Status Resize(const int64_t new_size) {
+    return Resize(new_size, /*shrink_to_fit=*/true);
+  }
+
+  /// Ensure that buffer has enough memory allocated to fit the indicated
+  /// capacity (and meets the 64 byte padding requirement in Layout.md).
+  /// It does not change buffer's reported size and doesn't zero the padding.
+  virtual Status Reserve(const int64_t new_capacity) = 0;
+
+  template <class T>
+  Status TypedResize(const int64_t new_nb_elements, bool shrink_to_fit = true) {
+    return Resize(sizeof(T) * new_nb_elements, shrink_to_fit);
+  }
+
+  template <class T>
+  Status TypedReserve(const int64_t new_nb_elements) {
+    return Reserve(sizeof(T) * new_nb_elements);
+  }
+
+ protected:
+  ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
+  ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr<MemoryManager> mm)
+      : MutableBuffer(data, size, std::move(mm)) {}
+};
+
+/// \defgroup buffer-allocation-functions Functions for allocating buffers
+///
+/// @{
+
+/// \brief Allocate a fixed size mutable buffer from a memory pool, zero its padding.
+///
+/// \param[in] size size of buffer to allocate
+/// \param[in] pool a memory pool
+ARROW_EXPORT
+Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size,
+                                               MemoryPool* pool = NULLPTR);
+ARROW_EXPORT
+Result<std::unique_ptr<Buffer>> AllocateBuffer(const int64_t size, int64_t alignment,
+                                               MemoryPool* pool = NULLPTR);
+
+/// \brief Allocate a resizeable buffer from a memory pool, zero its padding.
+///
+/// \param[in] size size of buffer to allocate
+/// \param[in] pool a memory pool
+ARROW_EXPORT
+Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
+    const int64_t size, MemoryPool* pool = NULLPTR);
+ARROW_EXPORT
+Result<std::unique_ptr<ResizableBuffer>> AllocateResizableBuffer(
+    const int64_t size, const int64_t alignment, MemoryPool* pool = NULLPTR);
+
+/// \brief Allocate a bitmap buffer from a memory pool
+/// no guarantee on values is provided.
+///
+/// \param[in] length size in bits of bitmap to allocate
+/// \param[in] pool memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> AllocateBitmap(int64_t length,
+                                               MemoryPool* pool = NULLPTR);
+
+/// \brief Allocate a zero-initialized bitmap buffer from a memory pool
+///
+/// \param[in] length size in bits of bitmap to allocate
+/// \param[in] pool memory pool to allocate memory from
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length,
+                                                    MemoryPool* pool = NULLPTR);
+
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> AllocateEmptyBitmap(int64_t length, int64_t alignment,
+                                                    MemoryPool* pool = NULLPTR);
+
+/// \brief Concatenate multiple buffers into a single buffer
+///
+/// \param[in] buffers to be concatenated
+/// \param[in] pool memory pool to allocate the new buffer from
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> ConcatenateBuffers(const BufferVector& buffers,
+                                                   MemoryPool* pool = NULLPTR);
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/buffer_builder.h b/pyarrow/include/arrow/buffer_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9177c656c021939405328fedd7a1e2704212650
--- /dev/null
+++ b/pyarrow/include/arrow/buffer_builder.h
@@ -0,0 +1,488 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Buffer builder classes
+
+/// \class BufferBuilder
+/// \brief A class for incrementally building a contiguous chunk of in-memory
+/// data
+class ARROW_EXPORT BufferBuilder {
+ public:
+  explicit BufferBuilder(MemoryPool* pool = default_memory_pool(),
+                         int64_t alignment = kDefaultBufferAlignment)
+      : pool_(pool),
+        data_(/*ensure never null to make ubsan happy and avoid check penalties below*/
+              util::MakeNonNull<uint8_t>()),
+        capacity_(0),
+        size_(0),
+        alignment_(alignment) {}
+
+  /// \brief Constructs new Builder that will start using
+  /// the provided buffer until Finish/Reset are called.
+  /// The buffer is not resized.
+  explicit BufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
+                         MemoryPool* pool = default_memory_pool(),
+                         int64_t alignment = kDefaultBufferAlignment)
+      : buffer_(std::move(buffer)),
+        pool_(pool),
+        data_(buffer_->mutable_data()),
+        capacity_(buffer_->capacity()),
+        size_(buffer_->size()),
+        alignment_(alignment) {}
+
+  /// \brief Resize the buffer to the nearest multiple of 64 bytes
+  ///
+  /// \param new_capacity the new capacity of the builder. Will be
+  /// rounded up to a multiple of 64 bytes for padding
+  /// \param shrink_to_fit if new capacity is smaller than the existing,
+  /// reallocate internal buffer. Set to false to avoid reallocations when
+  /// shrinking the builder.
+  /// \return Status
+  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+    if (buffer_ == NULLPTR) {
+      ARROW_ASSIGN_OR_RAISE(buffer_,
+                            AllocateResizableBuffer(new_capacity, alignment_, pool_));
+    } else {
+      ARROW_RETURN_NOT_OK(buffer_->Resize(new_capacity, shrink_to_fit));
+    }
+    capacity_ = buffer_->capacity();
+    data_ = buffer_->mutable_data();
+    return Status::OK();
+  }
+
+  /// \brief Ensure that builder can accommodate the additional number of bytes
+  /// without the need to perform allocations
+  ///
+  /// \param[in] additional_bytes number of additional bytes to make space for
+  /// \return Status
+  Status Reserve(const int64_t additional_bytes) {
+    auto min_capacity = size_ + additional_bytes;
+    if (min_capacity <= capacity_) {
+      return Status::OK();
+    }
+    return Resize(GrowByFactor(capacity_, min_capacity), false);
+  }
+
+  /// \brief Return a capacity expanded by the desired growth factor
+  static int64_t GrowByFactor(int64_t current_capacity, int64_t new_capacity) {
+    // Doubling capacity except for large Reserve requests. 2x growth strategy
+    // (versus 1.5x) seems to have slightly better performance when using
+    // jemalloc, but significantly better performance when using the system
+    // allocator. See ARROW-6450 for further discussion
+    return std::max(new_capacity, current_capacity * 2);
+  }
+
+  /// \brief Append the given data to the buffer
+  ///
+  /// The buffer is automatically expanded if necessary.
+  Status Append(const void* data, const int64_t length) {
+    if (ARROW_PREDICT_FALSE(size_ + length > capacity_)) {
+      ARROW_RETURN_NOT_OK(Resize(GrowByFactor(capacity_, size_ + length), false));
+    }
+    UnsafeAppend(data, length);
+    return Status::OK();
+  }
+
+  /// \brief Append the given data to the buffer
+  ///
+  /// The buffer is automatically expanded if necessary.
+  Status Append(std::string_view v) { return Append(v.data(), v.size()); }
+
+  /// \brief Append copies of a value to the buffer
+  ///
+  /// The buffer is automatically expanded if necessary.
+  Status Append(const int64_t num_copies, uint8_t value) {
+    ARROW_RETURN_NOT_OK(Reserve(num_copies));
+    UnsafeAppend(num_copies, value);
+    return Status::OK();
+  }
+
+  // Advance pointer and zero out memory
+  Status Advance(const int64_t length) { return Append(length, 0); }
+
+  // Advance pointer, but don't allocate or zero memory
+  void UnsafeAdvance(const int64_t length) { size_ += length; }
+
+  // Unsafe methods don't check existing size
+  void UnsafeAppend(const void* data, const int64_t length) {
+    memcpy(data_ + size_, data, static_cast<size_t>(length));
+    size_ += length;
+  }
+
+  void UnsafeAppend(std::string_view v) {
+    UnsafeAppend(v.data(), static_cast<int64_t>(v.size()));
+  }
+
+  void UnsafeAppend(const int64_t num_copies, uint8_t value) {
+    memset(data_ + size_, value, static_cast<size_t>(num_copies));
+    size_ += num_copies;
+  }
+
+  /// \brief Return result of builder as a Buffer object.
+  ///
+  /// The builder is reset and can be reused afterwards.
+  ///
+  /// \param[out] out the finalized Buffer object
+  /// \param shrink_to_fit if the buffer size is smaller than its capacity,
+  /// reallocate to fit more tightly in memory. Set to false to avoid
+  /// a reallocation, at the expense of potentially more memory consumption.
+  /// \return Status
+  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    ARROW_RETURN_NOT_OK(Resize(size_, shrink_to_fit));
+    if (size_ != 0) buffer_->ZeroPadding();
+    *out = buffer_;
+    if (*out == NULLPTR) {
+      ARROW_ASSIGN_OR_RAISE(*out, AllocateBuffer(0, alignment_, pool_));
+    }
+    Reset();
+    return Status::OK();
+  }
+
+  Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+    std::shared_ptr<Buffer> out;
+    ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+    return out;
+  }
+
+  /// \brief Like Finish, but override the final buffer size
+  ///
+  /// This is useful after writing data directly into the builder memory
+  /// without calling the Append methods (basically, when using BufferBuilder
+  /// mostly for memory allocation).
+  Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+                                                   bool shrink_to_fit = true) {
+    size_ = final_length;
+    return Finish(shrink_to_fit);
+  }
+
+  void Reset() {
+    buffer_ = NULLPTR;
+    capacity_ = size_ = 0;
+  }
+
+  /// \brief Set size to a smaller value without modifying builder
+  /// contents. For reusable BufferBuilder classes
+  /// \param[in] position must be non-negative and less than or equal
+  /// to the current length()
+  void Rewind(int64_t position) { size_ = position; }
+
+  int64_t capacity() const { return capacity_; }
+  int64_t length() const { return size_; }
+  const uint8_t* data() const { return data_; }
+  uint8_t* mutable_data() { return data_; }
+  template <typename T>
+  const T* data_as() const {
+    return reinterpret_cast<const T*>(data_);
+  }
+  template <typename T>
+  T* mutable_data_as() {
+    return reinterpret_cast<T*>(data_);
+  }
+
+ private:
+  std::shared_ptr<ResizableBuffer> buffer_;
+  MemoryPool* pool_;
+  uint8_t* data_;
+  int64_t capacity_;
+  int64_t size_;
+  int64_t alignment_;
+};
+
+template <typename T, typename Enable = void>
+class TypedBufferBuilder;
+
+/// \brief A BufferBuilder for building a buffer of arithmetic elements
+template <typename T>
+class TypedBufferBuilder<
+    T, typename std::enable_if<std::is_arithmetic<T>::value ||
+                               std::is_standard_layout<T>::value>::type> {
+ public:
+  explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(),
+                              int64_t alignment = kDefaultBufferAlignment)
+      : bytes_builder_(pool, alignment) {}
+
+  explicit TypedBufferBuilder(std::shared_ptr<ResizableBuffer> buffer,
+                              MemoryPool* pool = default_memory_pool())
+      : bytes_builder_(std::move(buffer), pool) {}
+
+  explicit TypedBufferBuilder(BufferBuilder builder)
+      : bytes_builder_(std::move(builder)) {}
+
+  BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
+  Status Append(T value) {
+    return bytes_builder_.Append(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+  }
+
+  Status Append(const T* values, int64_t num_elements) {
+    return bytes_builder_.Append(reinterpret_cast<const uint8_t*>(values),
+                                 num_elements * sizeof(T));
+  }
+
+  Status Append(const int64_t num_copies, T value) {
+    ARROW_RETURN_NOT_OK(Reserve(num_copies + length()));
+    UnsafeAppend(num_copies, value);
+    return Status::OK();
+  }
+
+  void UnsafeAppend(T value) {
+    bytes_builder_.UnsafeAppend(reinterpret_cast<uint8_t*>(&value), sizeof(T));
+  }
+
+  void UnsafeAppend(const T* values, int64_t num_elements) {
+    bytes_builder_.UnsafeAppend(reinterpret_cast<const uint8_t*>(values),
+                                num_elements * sizeof(T));
+  }
+
+  template <typename Iter>
+  void UnsafeAppend(Iter values_begin, Iter values_end) {
+    auto num_elements = static_cast<int64_t>(std::distance(values_begin, values_end));
+    auto data = mutable_data() + length();
+    bytes_builder_.UnsafeAdvance(num_elements * sizeof(T));
+    std::copy(values_begin, values_end, data);
+  }
+
+  void UnsafeAppend(const int64_t num_copies, T value) {
+    auto data = mutable_data() + length();
+    bytes_builder_.UnsafeAdvance(num_copies * sizeof(T));
+    std::fill(data, data + num_copies, value);
+  }
+
+  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+    return bytes_builder_.Resize(new_capacity * sizeof(T), shrink_to_fit);
+  }
+
+  Status Reserve(const int64_t additional_elements) {
+    return bytes_builder_.Reserve(additional_elements * sizeof(T));
+  }
+
+  Status Advance(const int64_t length) {
+    return bytes_builder_.Advance(length * sizeof(T));
+  }
+
+  void UnsafeAdvance(const int64_t length) {
+    bytes_builder_.UnsafeAdvance(length * sizeof(T));
+  }
+
+  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    return bytes_builder_.Finish(out, shrink_to_fit);
+  }
+
+  Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+    std::shared_ptr<Buffer> out;
+    ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+    return out;
+  }
+
+  /// \brief Like Finish, but override the final buffer size
+  ///
+  /// This is useful after writing data directly into the builder memory
+  /// without calling the Append methods (basically, when using TypedBufferBuilder
+  /// only for memory allocation).
+  Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+                                                   bool shrink_to_fit = true) {
+    return bytes_builder_.FinishWithLength(final_length * sizeof(T), shrink_to_fit);
+  }
+
+  void Reset() { bytes_builder_.Reset(); }
+
+  int64_t length() const { return bytes_builder_.length() / sizeof(T); }
+  int64_t capacity() const { return bytes_builder_.capacity() / sizeof(T); }
+  const T* data() const { return reinterpret_cast<const T*>(bytes_builder_.data()); }
+  T* mutable_data() { return reinterpret_cast<T*>(bytes_builder_.mutable_data()); }
+
+ private:
+  BufferBuilder bytes_builder_;
+};
+
+/// \brief A BufferBuilder for building a buffer containing a bitmap
+template <>
+class TypedBufferBuilder<bool> {
+ public:
+  explicit TypedBufferBuilder(MemoryPool* pool = default_memory_pool(),
+                              int64_t alignment = kDefaultBufferAlignment)
+      : bytes_builder_(pool, alignment) {}
+
+  explicit TypedBufferBuilder(BufferBuilder builder)
+      : bytes_builder_(std::move(builder)) {}
+
+  BufferBuilder* bytes_builder() { return &bytes_builder_; }
+
+  Status Append(bool value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(value);
+    return Status::OK();
+  }
+
+  Status Append(const uint8_t* valid_bytes, int64_t num_elements) {
+    ARROW_RETURN_NOT_OK(Reserve(num_elements));
+    UnsafeAppend(valid_bytes, num_elements);
+    return Status::OK();
+  }
+
+  Status Append(const int64_t num_copies, bool value) {
+    ARROW_RETURN_NOT_OK(Reserve(num_copies));
+    UnsafeAppend(num_copies, value);
+    return Status::OK();
+  }
+
+  void UnsafeAppend(bool value) {
+    bit_util::SetBitTo(mutable_data(), bit_length_, value);
+    if (!value) {
+      ++false_count_;
+    }
+    ++bit_length_;
+  }
+
+  /// \brief Append bits from an array of bytes (one value per byte)
+  void UnsafeAppend(const uint8_t* bytes, int64_t num_elements) {
+    if (num_elements == 0) return;
+    int64_t i = 0;
+    internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
+      bool value = bytes[i++];
+      false_count_ += !value;
+      return value;
+    });
+    bit_length_ += num_elements;
+  }
+
+  /// \brief Append bits from a packed bitmap
+  void UnsafeAppend(const uint8_t* bitmap, int64_t offset, int64_t num_elements) {
+    if (num_elements == 0) return;
+    internal::CopyBitmap(bitmap, offset, num_elements, mutable_data(), bit_length_);
+    false_count_ += num_elements - internal::CountSetBits(bitmap, offset, num_elements);
+    bit_length_ += num_elements;
+  }
+
+  void UnsafeAppend(const int64_t num_copies, bool value) {
+    bit_util::SetBitsTo(mutable_data(), bit_length_, num_copies, value);
+    false_count_ += num_copies * !value;
+    bit_length_ += num_copies;
+  }
+
+  template <bool count_falses, typename Generator>
+  void UnsafeAppend(const int64_t num_elements, Generator&& gen) {
+    if (num_elements == 0) return;
+
+    if (count_falses) {
+      internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements, [&] {
+        bool value = gen();
+        false_count_ += !value;
+        return value;
+      });
+    } else {
+      internal::GenerateBitsUnrolled(mutable_data(), bit_length_, num_elements,
+                                     std::forward<Generator>(gen));
+    }
+    bit_length_ += num_elements;
+  }
+
+  Status Resize(const int64_t new_capacity, bool shrink_to_fit = true) {
+    const int64_t old_byte_capacity = bytes_builder_.capacity();
+    ARROW_RETURN_NOT_OK(
+        bytes_builder_.Resize(bit_util::BytesForBits(new_capacity), shrink_to_fit));
+    // Resize() may have chosen a larger capacity (e.g. for padding),
+    // so ask it again before calling memset().
+    const int64_t new_byte_capacity = bytes_builder_.capacity();
+    if (new_byte_capacity > old_byte_capacity) {
+      // The additional buffer space is 0-initialized for convenience,
+      // so that other methods can simply bump the length.
+      memset(mutable_data() + old_byte_capacity, 0,
+             static_cast<size_t>(new_byte_capacity - old_byte_capacity));
+    }
+    return Status::OK();
+  }
+
+  Status Reserve(const int64_t additional_elements) {
+    return Resize(
+        BufferBuilder::GrowByFactor(bit_length_, bit_length_ + additional_elements),
+        false);
+  }
+
+  Status Advance(const int64_t length) {
+    ARROW_RETURN_NOT_OK(Reserve(length));
+    bit_length_ += length;
+    false_count_ += length;
+    return Status::OK();
+  }
+
+  Status Finish(std::shared_ptr<Buffer>* out, bool shrink_to_fit = true) {
+    // set bytes_builder_.size_ == byte size of data
+    bytes_builder_.UnsafeAdvance(bit_util::BytesForBits(bit_length_) -
+                                 bytes_builder_.length());
+    bit_length_ = false_count_ = 0;
+    return bytes_builder_.Finish(out, shrink_to_fit);
+  }
+
+  Result<std::shared_ptr<Buffer>> Finish(bool shrink_to_fit = true) {
+    std::shared_ptr<Buffer> out;
+    ARROW_RETURN_NOT_OK(Finish(&out, shrink_to_fit));
+    return out;
+  }
+
+  /// \brief Like Finish, but override the final buffer size
+  ///
+  /// This is useful after writing data directly into the builder memory
+  /// without calling the Append methods (basically, when using TypedBufferBuilder
+  /// only for memory allocation).
+  Result<std::shared_ptr<Buffer>> FinishWithLength(int64_t final_length,
+                                                   bool shrink_to_fit = true) {
+    const auto final_byte_length = bit_util::BytesForBits(final_length);
+    bytes_builder_.UnsafeAdvance(final_byte_length - bytes_builder_.length());
+    bit_length_ = false_count_ = 0;
+    return bytes_builder_.FinishWithLength(final_byte_length, shrink_to_fit);
+  }
+
+  void Reset() {
+    bytes_builder_.Reset();
+    bit_length_ = false_count_ = 0;
+  }
+
+  int64_t length() const { return bit_length_; }
+  int64_t capacity() const { return bytes_builder_.capacity() * 8; }
+  const uint8_t* data() const { return bytes_builder_.data(); }
+  uint8_t* mutable_data() { return bytes_builder_.mutable_data(); }
+  int64_t false_count() const { return false_count_; }
+
+ private:
+  BufferBuilder bytes_builder_;
+  int64_t bit_length_ = 0;
+  int64_t false_count_ = 0;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/builder.h b/pyarrow/include/arrow/builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0aa14c1e0612d1872a5959998651a12668f449f
--- /dev/null
+++ b/pyarrow/include/arrow/builder.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/array/builder_adaptive.h"   // IWYU pragma: keep
+#include "arrow/array/builder_base.h"       // IWYU pragma: keep
+#include "arrow/array/builder_binary.h"     // IWYU pragma: keep
+#include "arrow/array/builder_decimal.h"    // IWYU pragma: keep
+#include "arrow/array/builder_dict.h"       // IWYU pragma: keep
+#include "arrow/array/builder_nested.h"     // IWYU pragma: keep
+#include "arrow/array/builder_primitive.h"  // IWYU pragma: keep
+#include "arrow/array/builder_run_end.h"    // IWYU pragma: keep
+#include "arrow/array/builder_time.h"       // IWYU pragma: keep
+#include "arrow/array/builder_union.h"      // IWYU pragma: keep
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
diff --git a/pyarrow/include/arrow/c/abi.h b/pyarrow/include/arrow/c/abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae632f2dbd2601135cb02bc203dd085afd0acaf7
--- /dev/null
+++ b/pyarrow/include/arrow/c/abi.h
@@ -0,0 +1,460 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// \file abi.h Arrow C Data Interface
+///
+/// The Arrow C Data interface defines a very small, stable set
+/// of C definitions which can be easily copied into any project's
+/// source code and vendored to be used for columnar data interchange
+/// in the Arrow format. For non-C/C++ languages and runtimes,
+/// it should be almost as easy to translate the C definitions into
+/// the corresponding C FFI declarations.
+///
+/// Applications and libraries can therefore work with Arrow memory
+/// without necessarily using the Arrow libraries or reinventing
+/// the wheel. Developers can choose between tight integration
+/// with the Arrow software project or minimal integration with
+/// the Arrow format only.
+
+#pragma once
+
+#include <stdint.h>
+
+// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef ARROW_C_DATA_INTERFACE
+#  define ARROW_C_DATA_INTERFACE
+
+#  define ARROW_FLAG_DICTIONARY_ORDERED 1
+#  define ARROW_FLAG_NULLABLE 2
+#  define ARROW_FLAG_MAP_KEYS_SORTED 4
+
+struct ArrowSchema {
+  // Array type description
+  const char* format;
+  const char* name;
+  const char* metadata;
+  int64_t flags;
+  int64_t n_children;
+  struct ArrowSchema** children;
+  struct ArrowSchema* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowSchema*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+struct ArrowArray {
+  // Array data description
+  int64_t length;
+  int64_t null_count;
+  int64_t offset;
+  int64_t n_buffers;
+  int64_t n_children;
+  const void** buffers;
+  struct ArrowArray** children;
+  struct ArrowArray* dictionary;
+
+  // Release callback
+  void (*release)(struct ArrowArray*);
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact"
+#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
+    "ARROW:average_byte_width:approximate"
+#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact"
+#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
+    "ARROW:distinct_count:approximate"
+#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact"
+#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
+    "ARROW:max_byte_width:approximate"
+#  define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
+#  define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate"
+#  define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
+#  define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate"
+#  define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
+#  define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate"
+#  define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
+#  define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate"
+
+#endif  // ARROW_C_DATA_INTERFACE
+
+#ifndef ARROW_C_DEVICE_DATA_INTERFACE
+#  define ARROW_C_DEVICE_DATA_INTERFACE
+
+// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html
+
+// DeviceType for the allocated memory
+typedef int32_t ArrowDeviceType;
+
+// CPU device, same as using ArrowArray directly
+#  define ARROW_DEVICE_CPU 1
+// CUDA GPU Device
+#  define ARROW_DEVICE_CUDA 2
+// Pinned CUDA CPU memory by cudaMallocHost
+#  define ARROW_DEVICE_CUDA_HOST 3
+// OpenCL Device
+#  define ARROW_DEVICE_OPENCL 4
+// Vulkan buffer for next-gen graphics
+#  define ARROW_DEVICE_VULKAN 7
+// Metal for Apple GPU
+#  define ARROW_DEVICE_METAL 8
+// Verilog simulator buffer
+#  define ARROW_DEVICE_VPI 9
+// ROCm GPUs for AMD GPUs
+#  define ARROW_DEVICE_ROCM 10
+// Pinned ROCm CPU memory allocated by hipMallocHost
+#  define ARROW_DEVICE_ROCM_HOST 11
+// Reserved for extension
+#  define ARROW_DEVICE_EXT_DEV 12
+// CUDA managed/unified memory allocated by cudaMallocManaged
+#  define ARROW_DEVICE_CUDA_MANAGED 13
+// unified shared memory allocated on a oneAPI non-partitioned device.
+#  define ARROW_DEVICE_ONEAPI 14
+// GPU support for next-gen WebGPU standard
+#  define ARROW_DEVICE_WEBGPU 15
+// Qualcomm Hexagon DSP
+#  define ARROW_DEVICE_HEXAGON 16
+
+struct ArrowDeviceArray {
+  // the Allocated Array
+  //
+  // the buffers in the array (along with the buffers of any
+  // children) are what is allocated on the device.
+  struct ArrowArray array;
+  // The device id to identify a specific device
+  int64_t device_id;
+  // The type of device which can access this memory.
+  ArrowDeviceType device_type;
+  // An event-like object to synchronize on if needed.
+  void* sync_event;
+  // Reserved bytes for future expansion.
+  int64_t reserved[3];
+};
+
+#endif  // ARROW_C_DEVICE_DATA_INTERFACE
+
+#ifndef ARROW_C_STREAM_INTERFACE
+#  define ARROW_C_STREAM_INTERFACE
+
+struct ArrowArrayStream {
+  // Callback to get the stream type
+  // (will be the same for all arrays in the stream).
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowSchema must be released independently from the stream.
+  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);
+
+  // Callback to get the next array
+  // (if no error and the array is released, the stream has ended)
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowArray must be released independently from the stream.
+  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);
+
+  // Callback to get optional detailed error information.
+  // This must only be called if the last stream operation failed
+  // with a non-0 return code.
+  //
+  // Return value: pointer to a null-terminated character array describing
+  // the last error, or NULL if no description is available.
+  //
+  // The returned pointer is only valid until the next operation on this stream
+  // (including release).
+  const char* (*get_last_error)(struct ArrowArrayStream*);
+
+  // Release callback: release the stream's own resources.
+  // Note that arrays returned by `get_next` must be individually released.
+  void (*release)(struct ArrowArrayStream*);
+
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_STREAM_INTERFACE
+
+#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
+#  define ARROW_C_DEVICE_STREAM_INTERFACE
+
+// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
+//
+// This stream is intended to provide a stream of data on a single
+// device, if a producer wants data to be produced on multiple devices
+// then multiple streams should be provided. One per device.
+struct ArrowDeviceArrayStream {
+  // The device that this stream produces data on.
+  ArrowDeviceType device_type;
+
+  // Callback to get the stream schema
+  // (will be the same for all arrays in the stream).
+  //
+  // Return value 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowSchema must be released independently from the stream.
+  // The schema should be accessible via CPU memory.
+  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);
+
+  // Callback to get the next array
+  // (if no error and the array is released, the stream has ended)
+  //
+  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
+  //
+  // If successful, the ArrowDeviceArray must be released independently from the stream.
+  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);
+
+  // Callback to get optional detailed error information.
+  // This must only be called if the last stream operation failed
+  // with a non-0 return code.
+  //
+  // Return value: pointer to a null-terminated character array describing
+  // the last error, or NULL if no description is available.
+  //
+  // The returned pointer is only valid until the next operation on this stream
+  // (including release).
+  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);
+
+  // Release callback: release the stream's own resources.
+  // Note that arrays returned by `get_next` must be individually released.
+  void (*release)(struct ArrowDeviceArrayStream* self);
+
+  // Opaque producer-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_DEVICE_STREAM_INTERFACE
+
+#ifndef ARROW_C_ASYNC_STREAM_INTERFACE
+#  define ARROW_C_ASYNC_STREAM_INTERFACE
+
+// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was passed
+// to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler.
+//
+// The reason for this Task approach instead of the Async interface returning
+// the Array directly is to allow for more complex thread handling and reducing
+// context switching and data transfers between CPU cores (e.g. from one L1/L2
+// cache to another) if desired.
+//
+// For example, the `on_next_task` callback can be called when data is ready, while
+// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This
+// allows for the producer to manage the I/O on one thread which calls `on_next_task`
+// and the consumer can determine when the decoding (producer logic in the `extract_data`
+// callback of the task) occurs and on which thread, to avoid a CPU core transfer
+// (data staying in the L2 cache).
+struct ArrowAsyncTask {
+  // This callback should populate the ArrowDeviceArray associated with this task.
+  // The order of ArrowAsyncTasks provided by the producer enables a consumer to
+  // ensure the order of data to process.
+  //
+  // This function is expected to be synchronous, but should not perform any blocking
+  // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer
+  // thread unnecessarily.
+  //
+  // Returns: 0 if successful, errno-compatible error otherwise.
+  //
+  // If a non-0 value is returned then it should be followed by a call to `on_error`
+  // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly
+  // likely that whatever is calling this function may be entirely disconnected from
+  // the current control flow. Indicating an error here with a non-zero return allows
+  // the current flow to be aware of the error occurring, while still allowing any
+  // logging or error handling to still be centralized in the `on_error` callback of
+  // the original Async handler.
+  //
+  // Rather than a release callback, any required cleanup should be performed as part
+  // of the invocation of `extract_data`. Ownership of the Array is passed to the consumer
+  // calling this, and so it must be released separately.
+  //
+  // It is only valid to call this method exactly once.
+  int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out);
+
+  // opaque task-specific data
+  void* private_data;
+};
+
+// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async
+// producer and consumer. This object allows the consumer to perform backpressure and flow
+// control on the asynchronous stream processing. This object must be owned by the
+// producer who creates it, and thus is responsible for cleaning it up.
+struct ArrowAsyncProducer {
+  // The device type that this stream produces data on.
+  ArrowDeviceType device_type;
+
+  // A consumer must call this function to start receiving on_next_task calls.
+  //
+  // It *must* be valid to call this synchronously from within `on_next_task` or
+  // `on_schema`, but this function *must not* immediately call `on_next_task` so as
+  // to avoid recursion and reentrant callbacks.
+  //
+  // After cancel has been called, additional calls to this function must be NOPs,
+  // but allowed. While not cancelled, calling this function must register the
+  // given number of additional arrays/batches to be produced with the producer.
+  // The producer should only call `on_next_task` at most the registered number
+  // of arrays before propagating backpressure.
+  //
+  // Any error encountered by calling request must be propagated by calling the `on_error`
+  // callback of the ArrowAsyncDeviceStreamHandler.
+  //
+  // While not cancelled, any subsequent calls to `on_next_task`, `on_error` or
+  // `release` should be scheduled by the producer to be called later.
+  //
+  // It is invalid for a consumer to call this with a value of n <= 0, producers should
+  // error if given such a value.
+  void (*request)(struct ArrowAsyncProducer* self, int64_t n);
+
+  // This cancel callback signals a producer that it must eventually stop making calls
+  // to on_next_task. It must be idempotent and thread-safe. After calling cancel once,
+  // subsequent calls must be NOPs. This must not call any consumer-side handlers other
+  // than `on_error`.
+  //
+  // It is not required that calling cancel affect the producer immediately, only that it
+  // must eventually stop calling on_next_task and subsequently call release on the
+  // async handler. As such, a consumer must be prepared to receive one or more calls to
+  // `on_next_task` even after calling cancel if there are still requested arrays pending.
+  //
+  // Successful cancellation should *not* result in the producer calling `on_error`, it
+  // should finish out any remaining tasks and eventually call `release`.
+  //
+  // Any error encountered during handling a call to cancel must be reported via the
+  // on_error callback on the async stream handler.
+  void (*cancel)(struct ArrowAsyncProducer* self);
+
+  // Any additional metadata tied to a specific stream of data. This must either be NULL
+  // or a valid pointer to metadata which is encoded in the same way schema metadata
+  // would be. Non-null metadata must be valid for the lifetime of this object. As an
+  // example a producer could use this to provide the total number of rows and/or batches
+  // in the stream if known.
+  const char* additional_metadata;
+
+  // producer-specific opaque data.
+  void* private_data;
+};
+
+// EXPERIMENTAL: Similar to ArrowDeviceArrayStream, except designed for an asynchronous
+// style of interaction. While ArrowDeviceArrayStream provides producer
+// defined callbacks, this is intended to be created by the consumer instead.
+// The consumer passes this handler to the producer, which in turn uses the
+// callbacks to inform the consumer of events in the stream.
+struct ArrowAsyncDeviceStreamHandler {
+  // Handler for receiving a schema. The passed in stream_schema must be
+  // released or moved by the handler (producer is giving ownership of the schema to
+  // the handler, but not ownership of the top level object itself).
+  //
+  // With the exception of an error occurring (on_error), this must be the first
+  // callback function which is called by a producer and must only be called exactly
+  // once. As such, the producer should provide a valid ArrowAsyncProducer instance
+  // so the consumer can control the flow. See the documentation on ArrowAsyncProducer
+  // for how it works. The ArrowAsyncProducer is owned by the producer who calls this
+  // function and thus the producer is responsible for cleaning it up when calling
+  // the release callback of this handler.
+  //
+  // If there is any additional metadata tied to this stream, it will be provided as
+  // a non-null value for the `additional_metadata` field of the ArrowAsyncProducer
+  // which will be valid at least until the release callback is called.
+  //
+  // Return value: 0 if successful, `errno`-compatible error otherwise
+  //
+  // A producer that receives a non-zero return here should stop producing and eventually
+  // call release instead.
+  int (*on_schema)(struct ArrowAsyncDeviceStreamHandler* self,
+                   struct ArrowSchema* stream_schema);
+
+  // Handler for receiving data. This is called when data is available providing an
+  // ArrowAsyncTask struct to signify it. The producer indicates the end of the stream
+  // by passing NULL as the value for the task rather than a valid pointer to a task.
+  // The task object is only valid for the lifetime of this function call, if a consumer
+  // wants to utilize it after this function returns, it must copy or move the contents
+  // of it to a new ArrowAsyncTask object.
+  //
+  // The `request` callback of a provided ArrowAsyncProducer must be called in order
+  // to start receiving calls to this handler.
+  //
+  // The metadata argument can be null or can be used by a producer
+  // to pass arbitrary extra information to the consumer (such as total number
+  // of rows, context info, or otherwise). The data should be passed using the same
+  // encoding as the metadata within the ArrowSchema struct itself (defined in
+  // the spec at
+  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata)
+  //
+  // If metadata is non-null then it only needs to exist for the lifetime of this call,
+  // a consumer who wants it to live after that must copy it to ensure lifetime.
+  //
+  // A producer *must not* call this concurrently from multiple different threads.
+  //
+  // A consumer must be prepared to receive one or more calls to this callback even
+  // after calling cancel on the corresponding ArrowAsyncProducer, as cancel does not
+  // guarantee it happens immediately.
+  //
+  // Return value: 0 if successful, `errno`-compatible error otherwise.
+  //
+  // If the consumer returns a non-zero return from this method, that indicates to the
+  // producer that it should stop propagating data as an error occurred. After receiving
+  // such a return, the only interaction with this object is for the producer to call
+  // the `release` callback.
+  int (*on_next_task)(struct ArrowAsyncDeviceStreamHandler* self,
+                      struct ArrowAsyncTask* task, const char* metadata);
+
+  // Handler for encountering an error. The producer should call release after
+  // this returns to clean up any resources. The `code` passed in can be any error
+  // code that a producer wants, but should be errno-compatible for consistency.
+  //
+  // If the message or metadata are non-null, they will only last as long as this
+  // function call. The consumer would need to perform a copy of the data if it is
+  // necessary for them to live past the lifetime of this call.
+  //
+  // Error metadata should be encoded as with metadata in ArrowSchema, defined in
+  // the spec at
+  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
+  //
+  // It is valid for this to be called by a producer with or without a preceding call
+  // to ArrowAsyncProducer.request.
+  //
+  // This callback must not call any methods of an ArrowAsyncProducer object.
+  void (*on_error)(struct ArrowAsyncDeviceStreamHandler* self, int code,
+                   const char* message, const char* metadata);
+
+  // Release callback to release any resources for the handler. Should always be
+  // called by a producer when it is done utilizing a handler. No callbacks should
+  // be called after this is called.
+  //
+  // It is valid for the release callback to be called by a producer with or without
+  // a preceding call to ArrowAsyncProducer.request.
+  //
+  // The release callback must not call any methods of an ArrowAsyncProducer object.
+  void (*release)(struct ArrowAsyncDeviceStreamHandler* self);
+
+  // MUST be populated by the producer BEFORE calling any callbacks other than release.
+  // This provides the connection between a handler and its producer, and must exist until
+  // the release callback is called.
+  struct ArrowAsyncProducer* producer;
+
+  // Opaque handler-specific data
+  void* private_data;
+};
+
+#endif  // ARROW_C_ASYNC_STREAM_INTERFACE
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/pyarrow/include/arrow/c/bridge.h b/pyarrow/include/arrow/c/bridge.h
new file mode 100644
index 0000000000000000000000000000000000000000..78860e0650e741a95e7f8bc0c5ab35bc1c01cf79
--- /dev/null
+++ b/pyarrow/include/arrow/c/bridge.h
@@ -0,0 +1,489 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "arrow/c/abi.h"
+#include "arrow/device.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \defgroup c-data-interface Functions for working with the C data interface.
+///
+/// @{
+
+/// \brief Export C++ DataType using the C data interface format.
+///
+/// The root type is considered to have empty name and metadata.
+/// If you want the root type to have a name and/or metadata, pass
+/// a Field instead.
+///
+/// \param[in] type DataType object to export
+/// \param[out] out C struct where to export the datatype
+ARROW_EXPORT
+Status ExportType(const DataType& type, struct ArrowSchema* out);
+
+/// \brief Export C++ Field using the C data interface format.
+///
+/// \param[in] field Field object to export
+/// \param[out] out C struct where to export the field
+ARROW_EXPORT
+Status ExportField(const Field& field, struct ArrowSchema* out);
+
+/// \brief Export C++ Schema using the C data interface format.
+///
+/// \param[in] schema Schema object to export
+/// \param[out] out C struct where to export the field
+ARROW_EXPORT
+Status ExportSchema(const Schema& schema, struct ArrowSchema* out);
+
+/// \brief Export C++ Array using the C data interface format.
+///
+/// The resulting ArrowArray struct keeps the array data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] array Array object to export
+/// \param[out] out C struct where to export the array
+/// \param[out] out_schema optional C struct where to export the array type
+ARROW_EXPORT
+Status ExportArray(const Array& array, struct ArrowArray* out,
+                   struct ArrowSchema* out_schema = NULLPTR);
+
+/// \brief Export C++ RecordBatch using the C data interface format.
+///
+/// The record batch is exported as if it were a struct array.
+/// The resulting ArrowArray struct keeps the record batch data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] batch Record batch to export
+/// \param[out] out C struct where to export the record batch
+/// \param[out] out_schema optional C struct where to export the record batch schema
+ARROW_EXPORT
+Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out,
+                         struct ArrowSchema* out_schema = NULLPTR);
+
+/// \brief Import C++ DataType from the C data interface.
+///
+/// The given ArrowSchema struct is released (as per the C data interface
+/// specification), even if this function fails.
+///
+/// \param[in,out] schema C data interface struct representing the data type
+/// \return Imported type object
+ARROW_EXPORT
+Result<std::shared_ptr<DataType>> ImportType(struct ArrowSchema* schema);
+
+/// \brief Import C++ Field from the C data interface.
+///
+/// The given ArrowSchema struct is released (as per the C data interface
+/// specification), even if this function fails.
+///
+/// \param[in,out] schema C data interface struct representing the field
+/// \return Imported field object
+ARROW_EXPORT
+Result<std::shared_ptr<Field>> ImportField(struct ArrowSchema* schema);
+
+/// \brief Import C++ Schema from the C data interface.
+///
+/// The given ArrowSchema struct is released (as per the C data interface
+/// specification), even if this function fails.
+///
+/// \param[in,out] schema C data interface struct representing the field
+/// \return Imported field object
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> ImportSchema(struct ArrowSchema* schema);
+
+/// \brief Import C++ array from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in] type type of the imported array
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
+                                           std::shared_ptr<DataType> type);
+
+/// \brief Import C++ array and its type from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array.
+/// The ArrowSchema struct is released, even if this function fails.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in,out] type C data interface struct holding the array type
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportArray(struct ArrowArray* array,
+                                           struct ArrowSchema* type);
+
+/// \brief Import C++ record batch from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in] schema schema of the imported record batch
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
+                                                       std::shared_ptr<Schema> schema);
+
+/// \brief Import C++ record batch and its schema from the C data interface.
+///
+/// The type represented by the ArrowSchema struct must be a struct type array.
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+/// The ArrowSchema struct is released, even if this function fails.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in,out] schema C data interface struct holding the record batch schema
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportRecordBatch(struct ArrowArray* array,
+                                                       struct ArrowSchema* schema);
+
+/// @}
+
+/// \defgroup c-data-device-interface Functions for working with the C data device
+/// interface.
+///
+/// @{
+
+/// \brief EXPERIMENTAL: Export C++ Array as an ArrowDeviceArray.
+///
+/// The resulting ArrowDeviceArray struct keeps the array data and buffers alive
+/// until its release callback is called by the consumer. All buffers in
+/// the provided array MUST have the same device_type, otherwise an error
+/// will be returned.
+///
+/// If sync is non-null, get_event will be called on it in order to
+/// potentially provide an event for consumers to synchronize on.
+///
+/// \param[in] array Array object to export
+/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null
+/// \param[out] out C struct to export the array to
+/// \param[out] out_schema optional C struct to export the array type to
+ARROW_EXPORT
+Status ExportDeviceArray(const Array& array, std::shared_ptr<Device::SyncEvent> sync,
+                         struct ArrowDeviceArray* out,
+                         struct ArrowSchema* out_schema = NULLPTR);
+
+/// \brief EXPERIMENTAL: Export C++ RecordBatch as an ArrowDeviceArray.
+///
+/// The record batch is exported as if it were a struct array.
+/// The resulting ArrowDeviceArray struct keeps the record batch data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// All buffers of all columns in the record batch must have the same device_type
+/// otherwise an error will be returned. If columns are on different devices,
+/// they should be exported using different ArrowDeviceArray instances.
+///
+/// If sync is non-null, get_event will be called on it in order to
+/// potentially provide an event for consumers to synchronize on.
+///
+/// \param[in] batch Record batch to export
+/// \param[in] sync shared_ptr to object derived from Device::SyncEvent or null
+/// \param[out] out C struct where to export the record batch
+/// \param[out] out_schema optional C struct where to export the record batch schema
+ARROW_EXPORT
+Status ExportDeviceRecordBatch(const RecordBatch& batch,
+                               std::shared_ptr<Device::SyncEvent> sync,
+                               struct ArrowDeviceArray* out,
+                               struct ArrowSchema* out_schema = NULLPTR);
+
+using DeviceMemoryMapper =
+    std::function<Result<std::shared_ptr<MemoryManager>>(ArrowDeviceType, int64_t)>;
+
+ARROW_EXPORT
+Result<std::shared_ptr<MemoryManager>> DefaultDeviceMemoryMapper(
+    ArrowDeviceType device_type, int64_t device_id);
+
+/// \brief EXPERIMENTAL: Import C++ device array from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array. The
+/// buffers of the Array are located on the device indicated by the device_type.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in] type type of the imported array
+/// \param[in] mapper A function to map device + id to memory manager. If not
+/// specified, defaults to map "cpu" to the built-in default memory manager.
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportDeviceArray(
+    struct ArrowDeviceArray* array, std::shared_ptr<DataType> type,
+    const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
+
+/// \brief EXPERIMENTAL: Import C++ device array and its type from the C data interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting array.
+/// The ArrowSchema struct is released, even if this function fails. The
+/// buffers of the Array are located on the device indicated by the device_type.
+///
+/// \param[in,out] array C data interface struct holding the array data
+/// \param[in,out] type C data interface struct holding the array type
+/// \param[in] mapper A function to map device + id to memory manager. If not
+/// specified, defaults to map "cpu" to the built-in default memory manager.
+/// \return Imported array object
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ImportDeviceArray(
+    struct ArrowDeviceArray* array, struct ArrowSchema* type,
+    const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
+
+/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device from the C data
+/// interface.
+///
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+/// The buffers of all columns of the record batch are located on the device
+/// indicated by the device type.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in] schema schema of the imported record batch
+/// \param[in] mapper A function to map device + id to memory manager. If not
+/// specified, defaults to map "cpu" to the built-in default memory manager.
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
+    struct ArrowDeviceArray* array, std::shared_ptr<Schema> schema,
+    const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
+
+/// \brief EXPERIMENTAL: Import C++ record batch with buffers on a device and its schema
+/// from the C data interface.
+///
+/// The type represented by the ArrowSchema struct must be a struct type array.
+/// The ArrowArray struct has its contents moved (as per the C data interface
+/// specification) to a private object held alive by the resulting record batch.
+/// The ArrowSchema struct is released, even if this function fails. The buffers
+/// of all columns of the record batch are located on the device indicated by the
+/// device type.
+///
+/// \param[in,out] array C data interface struct holding the record batch data
+/// \param[in,out] schema C data interface struct holding the record batch schema
+/// \param[in] mapper A function to map device + id to memory manager. If not
+/// specified, defaults to map "cpu" to the built-in default memory manager.
+/// \return Imported record batch object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ImportDeviceRecordBatch(
+    struct ArrowDeviceArray* array, struct ArrowSchema* schema,
+    const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
+
+/// @}
+
+/// \defgroup c-stream-interface Functions for working with the C data interface.
+///
+/// @{
+
+/// \brief Export C++ RecordBatchReader using the C stream interface.
+///
+/// The resulting ArrowArrayStream struct keeps the record batch reader alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] reader RecordBatchReader object to export
+/// \param[out] out C struct where to export the stream
+ARROW_EXPORT
+Status ExportRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
+                               struct ArrowArrayStream* out);
+
+/// \brief Export C++ ChunkedArray using the C data interface format.
+///
+/// The resulting ArrowArrayStream struct keeps the chunked array data and buffers alive
+/// until its release callback is called by the consumer.
+///
+/// \param[in] chunked_array ChunkedArray object to export
+/// \param[out] out C struct where to export the stream
+ARROW_EXPORT
+Status ExportChunkedArray(std::shared_ptr<ChunkedArray> chunked_array,
+                          struct ArrowArrayStream* out);
+
+/// \brief Export C++ RecordBatchReader using the C device stream interface
+///
+/// The resulting ArrowDeviceArrayStream struct keeps the record batch reader
+/// alive until its release callback is called by the consumer. The device
+/// type is determined by calling device_type() on the RecordBatchReader.
+///
+/// \param[in] reader RecordBatchReader object to export
+/// \param[out] out C struct to export the stream to
+ARROW_EXPORT
+Status ExportDeviceRecordBatchReader(std::shared_ptr<RecordBatchReader> reader,
+                                     struct ArrowDeviceArrayStream* out);
+
+/// \brief Export C++ ChunkedArray using the C device data interface format.
+///
+/// The resulting ArrowDeviceArrayStream keeps the chunked array data and buffers
+/// alive until its release callback is called by the consumer.
+///
+/// \param[in] chunked_array ChunkedArray object to export
+/// \param[in] device_type the device type the data is located on
+/// \param[out] out C struct to export the stream to
+ARROW_EXPORT
+Status ExportDeviceChunkedArray(std::shared_ptr<ChunkedArray> chunked_array,
+                                DeviceAllocationType device_type,
+                                struct ArrowDeviceArrayStream* out);
+
+/// \brief Import C++ RecordBatchReader from the C stream interface.
+///
+/// The ArrowArrayStream struct has its contents moved to a private object
+/// held alive by the resulting record batch reader.
+///
+/// \param[in,out] stream C stream interface struct
+/// \return Imported RecordBatchReader object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchReader>> ImportRecordBatchReader(
+    struct ArrowArrayStream* stream);
+
+/// \brief Import C++ ChunkedArray from the C stream interface
+///
+/// The ArrowArrayStream struct has its contents moved to a private object,
+/// is consumed in its entirity, and released before returning all chunks
+/// as a ChunkedArray.
+///
+/// \param[in,out] stream C stream interface struct
+/// \return Imported ChunkedArray object
+ARROW_EXPORT
+Result<std::shared_ptr<ChunkedArray>> ImportChunkedArray(struct ArrowArrayStream* stream);
+
+/// \brief Import C++ RecordBatchReader from the C device stream interface
+///
+/// The ArrowDeviceArrayStream struct has its contents moved to a private object
+/// held alive by the resulting record batch reader.
+///
+/// \note If there was a required sync event, sync events are accessible by individual
+/// buffers of columns. We are not yet bubbling the sync events from the buffers up to
+/// the `GetSyncEvent` method of an imported RecordBatch. This will be added in a future
+/// update.
+///
+/// \param[in,out] stream C device stream interface struct
+/// \param[in] mapper mapping from device type and ID to memory manager
+/// \return Imported RecordBatchReader object
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchReader>> ImportDeviceRecordBatchReader(
+    struct ArrowDeviceArrayStream* stream,
+    const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
+
+/// \brief Import C++ ChunkedArray from the C device stream interface
+///
+/// The ArrowDeviceArrayStream struct has its contents moved to a private object,
+/// is consumed in its entirety, and released before returning all chunks as a
+/// ChunkedArray.
+///
+/// \note Any chunks that require synchronization for their device memory will have
+/// the SyncEvent objects available by checking the individual buffers of each chunk.
+/// These SyncEvents should be checked before accessing the data in those buffers.
+///
+/// \param[in,out] stream C device stream interface struct
+/// \param[in] mapper mapping from device type and ID to memory manager
+/// \return Imported ChunkedArray object
+ARROW_EXPORT
+Result<std::shared_ptr<ChunkedArray>> ImportDeviceChunkedArray(
+    struct ArrowDeviceArrayStream* stream,
+    const DeviceMemoryMapper& mapper = DefaultDeviceMemoryMapper);
+
+/// @}
+
+/// \defgroup c-async-stream-interface Functions for working with the async C data
+/// interface.
+///
+/// @{
+
+/// \brief EXPERIMENTAL: AsyncErrorDetail is a StatusDetail that contains an error code
+/// and message from an asynchronous operation.
+class AsyncErrorDetail : public StatusDetail {
+ public:
+  AsyncErrorDetail(int code, std::string message, std::string metadata)
+      : code_(code), message_(std::move(message)), metadata_(std::move(metadata)) {}
+  const char* type_id() const override { return "AsyncErrorDetail"; }
+  // ToString just returns the error message that was returned with the error
+  std::string ToString() const override { return message_; }
+  // code is an errno-compatible error code
+  int code() const { return code_; }
+  // returns any metadata that was returned with the error, likely in a
+  // key-value format similar to ArrowSchema metadata
+  const std::string& ErrorMetadataString() const { return metadata_; }
+  std::shared_ptr<KeyValueMetadata> ErrorMetadata() const;
+
+ private:
+  int code_{0};
+  std::string message_;
+  std::string metadata_;
+};
+
+struct AsyncRecordBatchGenerator {
+  std::shared_ptr<Schema> schema;
+  DeviceAllocationType device_type;
+  AsyncGenerator<RecordBatchWithMetadata> generator;
+};
+
+namespace internal {
+class Executor;
+}
+
+/// \brief EXPERIMENTAL: Create an AsyncRecordBatchReader and populate a corresponding
+/// handler to pass to a producer
+///
+/// The ArrowAsyncDeviceStreamHandler struct is intended to have its callbacks populated
+/// and then be passed to a producer to call the appropriate callbacks when data is ready.
+/// This inverts the traditional flow of control, and so we construct a corresponding
+/// AsyncRecordBatchGenerator to provide an interface for the consumer to retrieve data as
+/// it is pushed to the handler.
+///
+/// \param[in,out] handler C struct to be populated
+/// \param[in] executor the executor to use for waiting and populating record batches
+/// \param[in] queue_size initial number of record batches to request for queueing
+/// \param[in] mapper mapping from device type and ID to memory manager
+/// \return Future that resolves to either an error or AsyncRecordBatchGenerator once a
+/// schema is available or an error is received.
+ARROW_EXPORT
+Future<AsyncRecordBatchGenerator> CreateAsyncDeviceStreamHandler(
+    struct ArrowAsyncDeviceStreamHandler* handler, internal::Executor* executor,
+    uint64_t queue_size = 5, DeviceMemoryMapper mapper = DefaultDeviceMemoryMapper);
+
+/// \brief EXPERIMENTAL: Export an AsyncGenerator of record batches using a provided
+/// handler
+///
+/// This function calls the callbacks on the consumer-provided async handler as record
+/// batches become available from the AsyncGenerator which is provided. It will first call
+/// on_schema using the provided schema, and then serially visit each record batch from
+/// the generator, calling the on_next_task callback. If an error occurs, on_error will be
+/// called appropriately.
+///
+/// \param[in] schema the schema of the stream being exported
+/// \param[in] generator a generator that asynchronously produces record batches
+/// \param[in] device_type the device type that the record batches will be located on
+/// \param[in] handler the handler whose callbacks to utilize as data is available
+/// \return Future that will resolve once the generator is exhausted or an error occurs
+ARROW_EXPORT
+Future<> ExportAsyncRecordBatchReader(
+    std::shared_ptr<Schema> schema,
+    AsyncGenerator<std::shared_ptr<RecordBatch>> generator,
+    DeviceAllocationType device_type, struct ArrowAsyncDeviceStreamHandler* handler);
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/c/dlpack.h b/pyarrow/include/arrow/c/dlpack.h
new file mode 100644
index 0000000000000000000000000000000000000000..65da38423c2ad62fce26fc115024ef843fb802b5
--- /dev/null
+++ b/pyarrow/include/arrow/c/dlpack.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array/array_base.h"
+#include "arrow/c/dlpack_abi.h"
+
+namespace arrow::dlpack {
+
+/// \brief Export Arrow array as DLPack tensor.
+///
+/// DLMangedTensor is produced as defined by the DLPack protocol,
+/// see https://dmlc.github.io/dlpack/latest/.
+///
+/// Data types for which the protocol is supported are
+/// integer and floating-point data types.
+///
+/// DLPack protocol only supports arrays with one contiguous
+/// memory region which means Arrow Arrays with validity buffers
+/// are not supported.
+///
+/// \param[in] arr Arrow array
+/// \return DLManagedTensor struct
+ARROW_EXPORT
+Result<DLManagedTensor*> ExportArray(const std::shared_ptr<Array>& arr);
+
+ARROW_EXPORT
+Result<DLManagedTensor*> ExportTensor(const std::shared_ptr<Tensor>& t);
+
+/// \brief Get DLDevice with enumerator specifying the
+/// type of the device data is stored on and index of the
+/// device which is 0 by default for CPU.
+///
+/// \param[in] arr Arrow array
+/// \return DLDevice struct
+ARROW_EXPORT
+Result<DLDevice> ExportDevice(const std::shared_ptr<Array>& arr);
+
+ARROW_EXPORT
+Result<DLDevice> ExportDevice(const std::shared_ptr<Tensor>& t);
+
+}  // namespace arrow::dlpack
diff --git a/pyarrow/include/arrow/c/dlpack_abi.h b/pyarrow/include/arrow/c/dlpack_abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbe2a56a344b373f3d3e950e434ba5392036a080
--- /dev/null
+++ b/pyarrow/include/arrow/c/dlpack_abi.h
@@ -0,0 +1,321 @@
+// Taken from:
+// https://github.com/dmlc/dlpack/blob/ca4d00ad3e2e0f410eeab3264d21b8a39397f362/include/dlpack/dlpack.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#  define DLPACK_EXTERN_C extern "C"
+#else
+#  define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 0
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#  ifdef DLPACK_EXPORTS
+#    define DLPACK_DLL __declspec(dllexport)
+#  else
+#    define DLPACK_DLL __declspec(dllimport)
+#  endif
+#else
+#  define DLPACK_DLL
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ */
+typedef struct {
+  /*! \brief DLPack major version. */
+  uint32_t major;
+  /*! \brief DLPack minor version. */
+  uint32_t minor;
+} DLPackVersion;
+
+/*!
+ * \brief The device type in DLDevice.
+ */
+#ifdef __cplusplus
+typedef enum : int32_t {
+#else
+typedef enum {
+#endif
+  /*! \brief CPU device */
+  kDLCPU = 1,
+  /*! \brief CUDA GPU device */
+  kDLCUDA = 2,
+  /*!
+   * \brief Pinned CUDA CPU memory by cudaMallocHost
+   */
+  kDLCUDAHost = 3,
+  /*! \brief OpenCL devices. */
+  kDLOpenCL = 4,
+  /*! \brief Vulkan buffer for next generation graphics. */
+  kDLVulkan = 7,
+  /*! \brief Metal for Apple GPU. */
+  kDLMetal = 8,
+  /*! \brief Verilog simulator buffer */
+  kDLVPI = 9,
+  /*! \brief ROCm GPUs for AMD GPUs */
+  kDLROCM = 10,
+  /*!
+   * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+   */
+  kDLROCMHost = 11,
+  /*!
+   * \brief Reserved extension device type,
+   * used for quickly test extension device
+   * The semantics can differ depending on the implementation.
+   */
+  kDLExtDev = 12,
+  /*!
+   * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+   */
+  kDLCUDAManaged = 13,
+  /*!
+   * \brief Unified shared memory allocated on a oneAPI non-partititioned
+   * device. Call to oneAPI runtime is required to determine the device
+   * type, the USM allocation type and the sycl context it is bound to.
+   *
+   */
+  kDLOneAPI = 14,
+  /*! \brief GPU support for next generation WebGPU standard. */
+  kDLWebGPU = 15,
+  /*! \brief Qualcomm Hexagon DSP */
+  kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A Device for Tensor and operator.
+ */
+typedef struct {
+  /*! \brief The device type used in the device. */
+  DLDeviceType device_type;
+  /*!
+   * \brief The device index.
+   * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+   */
+  int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options DLDataType.
+ */
+typedef enum {
+  /*! \brief signed integer */
+  kDLInt = 0U,
+  /*! \brief unsigned integer */
+  kDLUInt = 1U,
+  /*! \brief IEEE floating point */
+  kDLFloat = 2U,
+  /*!
+   * \brief Opaque handle type, reserved for testing purposes.
+   * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+   */
+  kDLOpaqueHandle = 3U,
+  /*! \brief bfloat16 */
+  kDLBfloat = 4U,
+  /*!
+   * \brief complex number
+   * (C/C++/Python layout: compact struct per complex number)
+   */
+  kDLComplex = 5U,
+  /*! \brief boolean */
+  kDLBool = 6U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endian-ness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes = 1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *   - int8: type_code = 0, bits = 8, lanes = 1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention,
+ * the underlying storage size of bool is 8 bits)
+ */
+typedef struct {
+  /*!
+   * \brief Type code of base types.
+   * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+   * footprint, but the value should be one of DLDataTypeCode enum values.
+   * */
+  uint8_t code;
+  /*!
+   * \brief Number of bits, common choices are 8, 16, 32.
+   */
+  uint8_t bits;
+  /*! \brief Number of lanes in the type, used for vector types. */
+  uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+  /*!
+   * \brief The data pointer points to the allocated data. This will be CUDA
+   * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+   * types. This pointer is always aligned to 256 bytes as in CUDA. The
+   * `byte_offset` field should be used to point to the beginning of the data.
+   *
+   * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow,
+   * TVM, perhaps others) do not adhere to this 256 byte aligment requirement
+   * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+   * (after which this note will be updated); at the moment it is recommended
+   * to not rely on the data pointer being correctly aligned.
+   *
+   * For given DLTensor, the size of memory required to store the contents of
+   * data is calculated as follows:
+   *
+   * \code{.c}
+   * static inline size_t GetDataSize(const DLTensor* t) {
+   *   size_t size = 1;
+   *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+   *     size *= t->shape[i];
+   *   }
+   *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+   *   return size;
+   * }
+   * \endcode
+   */
+  void* data;
+  /*! \brief The device of the tensor */
+  DLDevice device;
+  /*! \brief Number of dimensions */
+  int32_t ndim;
+  /*! \brief The data type of the pointer*/
+  DLDataType dtype;
+  /*! \brief The shape of the tensor */
+  int64_t* shape;
+  /*!
+   * \brief strides of the tensor (in number of elements, not bytes)
+   *  can be NULL, indicating tensor is compact and row-majored.
+   */
+  int64_t* strides;
+  /*! \brief The offset in bytes to the beginning pointer to data */
+  uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ *       in DLPack exchange and is deprecated after DLPack v0.8
+ *       Use DLManagedTensorVersioned instead.
+ *       This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
+ */
+typedef struct DLManagedTensor {
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+  /*! \brief the context of the original host framework of DLManagedTensor in
+   *   which DLManagedTensor is used in the framework. It can also be NULL.
+   */
+  void* manager_ctx;
+  /*!
+   * \brief Destructor - this should be called
+   * to destruct the manager_ctx  which backs the DLManagedTensor. It can be
+   * NULL if there is no way for the caller to provide a reasonable destructor.
+   * The destructors deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensor* self);
+} DLManagedTensor;
+
+// bit masks used in in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+  /*!
+   * \brief The API and ABI version of the current managed Tensor
+   */
+  DLPackVersion version;
+  /*!
+   * \brief the context of the original host framework.
+   *
+   * Stores DLManagedTensorVersioned is used in the
+   * framework. It can also be NULL.
+   */
+  void* manager_ctx;
+  /*!
+   * \brief Destructor.
+   *
+   * This should be called to destruct manager_ctx which holds the
+   * DLManagedTensorVersioned. It can be NULL if there is no way for the caller to provide
+   * a reasonable destructor. The destructors deletes the argument self as well.
+   */
+  void (*deleter)(struct DLManagedTensorVersioned* self);
+  /*!
+   * \brief Additional bitmask flags information about the tensor.
+   *
+   * By default the flags should be set to 0.
+   *
+   * \note Future ABI changes should keep everything until this field
+   *       stable, to ensure that deleter can be correctly called.
+   *
+   * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+   */
+  uint64_t flags;
+  /*! \brief DLTensor which is being memory managed */
+  DLTensor dl_tensor;
+};
+
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
diff --git a/pyarrow/include/arrow/c/helpers.h b/pyarrow/include/arrow/c/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e4df17f43ebfe238484056fedbd4e6d575460f0
--- /dev/null
+++ b/pyarrow/include/arrow/c/helpers.h
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "arrow/c/abi.h"
+
+#define ARROW_C_ASSERT(condition, msg)                          \
+  do {                                                          \
+    if (!(condition)) {                                         \
+      fprintf(stderr, "%s:%d:: %s", __FILE__, __LINE__, (msg)); \
+      abort();                                                  \
+    }                                                           \
+  } while (0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/// Query whether the C schema is released
+inline int ArrowSchemaIsReleased(const struct ArrowSchema* schema) {
+  return schema->release == NULL;
+}
+
+/// Mark the C schema released (for use in release callbacks)
+inline void ArrowSchemaMarkReleased(struct ArrowSchema* schema) {
+  schema->release = NULL;
+}
+
+/// Move the C schema from `src` to `dest`
+///
+/// Note `dest` must *not* point to a valid schema already, otherwise there
+/// will be a memory leak.
+inline void ArrowSchemaMove(struct ArrowSchema* src, struct ArrowSchema* dest) {
+  assert(dest != src);
+  assert(!ArrowSchemaIsReleased(src));
+  memcpy(dest, src, sizeof(struct ArrowSchema));
+  ArrowSchemaMarkReleased(src);
+}
+
+/// Release the C schema, if necessary, by calling its release callback
+inline void ArrowSchemaRelease(struct ArrowSchema* schema) {
+  if (!ArrowSchemaIsReleased(schema)) {
+    schema->release(schema);
+    ARROW_C_ASSERT(ArrowSchemaIsReleased(schema),
+                   "ArrowSchemaRelease did not cleanup release callback");
+  }
+}
+
+/// Query whether the C array is released
+inline int ArrowArrayIsReleased(const struct ArrowArray* array) {
+  return array->release == NULL;
+}
+
+inline int ArrowDeviceArrayIsReleased(const struct ArrowDeviceArray* array) {
+  return ArrowArrayIsReleased(&array->array);
+}
+
+/// Mark the C array released (for use in release callbacks)
+inline void ArrowArrayMarkReleased(struct ArrowArray* array) { array->release = NULL; }
+
+inline void ArrowDeviceArrayMarkReleased(struct ArrowDeviceArray* array) {
+  ArrowArrayMarkReleased(&array->array);
+}
+
+/// Move the C array from `src` to `dest`
+///
+/// Note `dest` must *not* point to a valid array already, otherwise there
+/// will be a memory leak.
+inline void ArrowArrayMove(struct ArrowArray* src, struct ArrowArray* dest) {
+  assert(dest != src);
+  assert(!ArrowArrayIsReleased(src));
+  memcpy(dest, src, sizeof(struct ArrowArray));
+  ArrowArrayMarkReleased(src);
+}
+
+inline void ArrowDeviceArrayMove(struct ArrowDeviceArray* src,
+                                 struct ArrowDeviceArray* dest) {
+  assert(dest != src);
+  assert(!ArrowDeviceArrayIsReleased(src));
+  memcpy(dest, src, sizeof(struct ArrowDeviceArray));
+  ArrowDeviceArrayMarkReleased(src);
+}
+
+/// Release the C array, if necessary, by calling its release callback
+inline void ArrowArrayRelease(struct ArrowArray* array) {
+  if (!ArrowArrayIsReleased(array)) {
+    array->release(array);
+    ARROW_C_ASSERT(ArrowArrayIsReleased(array),
+                   "ArrowArrayRelease did not cleanup release callback");
+  }
+}
+
+inline void ArrowDeviceArrayRelease(struct ArrowDeviceArray* array) {
+  if (!ArrowDeviceArrayIsReleased(array)) {
+    array->array.release(&array->array);
+    ARROW_C_ASSERT(ArrowDeviceArrayIsReleased(array),
+                   "ArrowDeviceArrayRelease did not cleanup release callback");
+  }
+}
+
+/// Query whether the C array stream is released
+inline int ArrowArrayStreamIsReleased(const struct ArrowArrayStream* stream) {
+  return stream->release == NULL;
+}
+
+inline int ArrowDeviceArrayStreamIsReleased(const struct ArrowDeviceArrayStream* stream) {
+  return stream->release == NULL;
+}
+
+/// Mark the C array stream released (for use in release callbacks)
+inline void ArrowArrayStreamMarkReleased(struct ArrowArrayStream* stream) {
+  stream->release = NULL;
+}
+
+inline void ArrowDeviceArrayStreamMarkReleased(struct ArrowDeviceArrayStream* stream) {
+  stream->release = NULL;
+}
+
+/// Move the C array stream from `src` to `dest`
+///
+/// Note `dest` must *not* point to a valid stream already, otherwise there
+/// will be a memory leak.
+inline void ArrowArrayStreamMove(struct ArrowArrayStream* src,
+                                 struct ArrowArrayStream* dest) {
+  assert(dest != src);
+  assert(!ArrowArrayStreamIsReleased(src));
+  memcpy(dest, src, sizeof(struct ArrowArrayStream));
+  ArrowArrayStreamMarkReleased(src);
+}
+
+inline void ArrowDeviceArrayStreamMove(struct ArrowDeviceArrayStream* src,
+                                       struct ArrowDeviceArrayStream* dest) {
+  assert(dest != src);
+  assert(!ArrowDeviceArrayStreamIsReleased(src));
+  memcpy(dest, src, sizeof(struct ArrowDeviceArrayStream));
+  ArrowDeviceArrayStreamMarkReleased(src);
+}
+
+/// Release the C array stream, if necessary, by calling its release callback
+inline void ArrowArrayStreamRelease(struct ArrowArrayStream* stream) {
+  if (!ArrowArrayStreamIsReleased(stream)) {
+    stream->release(stream);
+    ARROW_C_ASSERT(ArrowArrayStreamIsReleased(stream),
+                   "ArrowArrayStreamRelease did not cleanup release callback");
+  }
+}
+
+inline void ArrowDeviceArrayStreamRelease(struct ArrowDeviceArrayStream* stream) {
+  if (!ArrowDeviceArrayStreamIsReleased(stream)) {
+    stream->release(stream);
+    ARROW_C_ASSERT(ArrowDeviceArrayStreamIsReleased(stream),
+                   "ArrowDeviceArrayStreamRelease did not cleanup release callback");
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/pyarrow/include/arrow/chunk_resolver.h b/pyarrow/include/arrow/chunk_resolver.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d6458167fac979c2d6c6c112fa00194b9818092
--- /dev/null
+++ b/pyarrow/include/arrow/chunk_resolver.h
@@ -0,0 +1,294 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/span.h"
+
+namespace arrow {
+
+class ChunkResolver;
+
+template <typename IndexType>
+struct ARROW_EXPORT TypedChunkLocation {
+  /// \brief Index of the chunk in the array of chunks
+  ///
+  /// The value is always in the range `[0, chunks.size()]`. `chunks.size()` is used
+  /// to represent out-of-bounds locations.
+  IndexType chunk_index = 0;
+
+  /// \brief Index of the value in the chunk
+  ///
+  /// The value is UNDEFINED if `chunk_index >= chunks.size()`
+  IndexType index_in_chunk = 0;
+
+  TypedChunkLocation() = default;
+
+  TypedChunkLocation(IndexType chunk_index, IndexType index_in_chunk)
+      : chunk_index(chunk_index), index_in_chunk(index_in_chunk) {
+    static_assert(sizeof(TypedChunkLocation<IndexType>) == 2 * sizeof(IndexType));
+    static_assert(alignof(TypedChunkLocation<IndexType>) == alignof(IndexType));
+  }
+
+  bool operator==(TypedChunkLocation other) const {
+    return chunk_index == other.chunk_index && index_in_chunk == other.index_in_chunk;
+  }
+};
+
+using ChunkLocation = TypedChunkLocation<int64_t>;
+
+/// \brief An utility that incrementally resolves logical indices into
+/// physical indices in a chunked array.
+class ARROW_EXPORT ChunkResolver {
+ private:
+  /// \brief Array containing `chunks.size() + 1` offsets.
+  ///
+  /// `offsets_[i]` is the starting logical index of chunk `i`. `offsets_[0]` is always 0
+  /// and `offsets_[chunks.size()]` is the logical length of the chunked array.
+  std::vector<int64_t> offsets_;
+
+  /// \brief Cache of the index of the last resolved chunk.
+  ///
+  /// \invariant `cached_chunk_ in [0, chunks.size()]`
+  mutable std::atomic<int32_t> cached_chunk_;
+
+ public:
+  explicit ChunkResolver(const ArrayVector& chunks) noexcept;
+  explicit ChunkResolver(util::span<const Array* const> chunks) noexcept;
+  explicit ChunkResolver(const RecordBatchVector& batches) noexcept;
+
+  /// \brief Construct a ChunkResolver from a vector of chunks.size() + 1 offsets.
+  ///
+  /// The first offset must be 0 and the last offset must be the logical length of the
+  /// chunked array. Each offset before the last represents the starting logical index of
+  /// the corresponding chunk.
+  explicit ChunkResolver(std::vector<int64_t> offsets) noexcept
+      : offsets_(std::move(offsets)), cached_chunk_(0) {
+#ifndef NDEBUG
+    assert(offsets_.size() >= 1);
+    assert(offsets_[0] == 0);
+    for (size_t i = 1; i < offsets_.size(); i++) {
+      assert(offsets_[i] >= offsets_[i - 1]);
+    }
+    assert(offsets_.size() - 1 <=
+           static_cast<size_t>(std::numeric_limits<int32_t>::max()));
+#endif
+  }
+
+  ChunkResolver(ChunkResolver&& other) noexcept;
+  ChunkResolver& operator=(ChunkResolver&& other) noexcept;
+
+  ChunkResolver(const ChunkResolver& other) noexcept;
+  ChunkResolver& operator=(const ChunkResolver& other) noexcept;
+
+  int64_t logical_array_length() const { return offsets_.back(); }
+  int32_t num_chunks() const { return static_cast<int32_t>(offsets_.size() - 1); }
+
+  int64_t chunk_length(int64_t chunk_index) const {
+    return offsets_[chunk_index + 1] - offsets_[chunk_index];
+  }
+
+  /// \brief Resolve a logical index to a ChunkLocation.
+  ///
+  /// The returned ChunkLocation contains the chunk index and the within-chunk index
+  /// equivalent to the logical index.
+  ///
+  /// \pre `index >= 0`
+  /// \post `location.chunk_index` in `[0, chunks.size()]`
+  /// \param index The logical index to resolve
+  /// \return ChunkLocation with a valid chunk_index if index is within
+  ///         bounds, or with `chunk_index == chunks.size()` if logical index is
+  ///         `>= chunked_array.length()`.
+  inline ChunkLocation Resolve(int64_t index) const {
+    const auto cached_chunk = cached_chunk_.load(std::memory_order_relaxed);
+    const auto chunk_index =
+        ResolveChunkIndex</*StoreCachedChunk=*/true>(index, cached_chunk);
+    return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve a logical index to a ChunkLocation.
+  ///
+  /// The returned ChunkLocation contains the chunk index and the within-chunk index
+  /// equivalent to the logical index.
+  ///
+  /// \pre `index >= 0`
+  /// \post `location.chunk_index` in `[0, chunks.size()]`
+  /// \param index The logical index to resolve
+  /// \param hint ChunkLocation{} or the last ChunkLocation returned by
+  ///             this ChunkResolver.
+  /// \return ChunkLocation with a valid chunk_index if index is within
+  ///         bounds, or with `chunk_index == chunks.size()` if logical index is
+  ///         `>= chunked_array.length()`.
+  inline ChunkLocation ResolveWithHint(int64_t index, ChunkLocation hint) const {
+    assert(hint.chunk_index < static_cast<uint32_t>(offsets_.size()));
+    const auto chunk_index = ResolveChunkIndex</*StoreCachedChunk=*/false>(
+        index, static_cast<int32_t>(hint.chunk_index));
+    return ChunkLocation{chunk_index, index - offsets_[chunk_index]};
+  }
+
+  /// \brief Resolve `n_indices` logical indices to chunk indices.
+  ///
+  /// \pre 0 <= logical_index_vec[i] < logical_array_length()
+  ///      (for well-defined and valid chunk index results)
+  /// \pre out_chunk_location_vec has space for `n_indices` locations
+  /// \pre chunk_hint in [0, chunks.size()]
+  /// \post out_chunk_location_vec[i].chunk_index in [0, chunks.size()] for i in [0, n)
+  /// \post if logical_index_vec[i] >= chunked_array.length(), then
+  ///       out_chunk_location_vec[i].chunk_index == chunks.size()
+  ///       and out_chunk_location_vec[i].index_in_chunk is UNDEFINED (can be
+  ///       out-of-bounds)
+  /// \post if logical_index_vec[i] < 0, then both values in out_chunk_index_vec[i]
+  ///       are UNDEFINED
+  ///
+  /// \param n_indices The number of logical indices to resolve
+  /// \param logical_index_vec The logical indices to resolve
+  /// \param out_chunk_location_vec The output array where the locations will be written
+  /// \param chunk_hint 0 or the last chunk_index produced by ResolveMany
+  /// \return false iff chunks.size() > std::numeric_limits<IndexType>::max()
+  template <typename IndexType>
+  [[nodiscard]] bool ResolveMany(int64_t n_indices, const IndexType* logical_index_vec,
+                                 TypedChunkLocation<IndexType>* out_chunk_location_vec,
+                                 IndexType chunk_hint = 0) const {
+    if constexpr (sizeof(IndexType) < sizeof(uint32_t)) {
+      // The max value returned by Bisect is `offsets.size() - 1` (= chunks.size()).
+      constexpr int64_t kMaxIndexTypeValue = std::numeric_limits<IndexType>::max();
+      // A ChunkedArray with enough empty chunks can make the index of a chunk
+      // exceed the logical index and thus the maximum value of IndexType.
+      const bool chunk_index_fits_on_type = num_chunks() <= kMaxIndexTypeValue;
+      if (ARROW_PREDICT_FALSE(!chunk_index_fits_on_type)) {
+        return false;
+      }
+      // Since an index-in-chunk cannot possibly exceed the logical index being
+      // queried, we don't have to worry about these values not fitting on IndexType.
+    }
+    if constexpr (std::is_signed_v<IndexType>) {
+      // We interpret signed integers as unsigned and avoid having to generate double
+      // the amount of binary code to handle each integer width.
+      //
+      // Negative logical indices can become large values when cast to unsigned, and
+      // they are gracefully handled by ResolveManyImpl, but both the chunk index
+      // and the index in chunk values will be undefined in these cases. This
+      // happend because int8_t(-1) == uint8_t(255) and 255 could be a valid
+      // logical index in the chunked array.
+      using U = std::make_unsigned_t<IndexType>;
+      ResolveManyImpl(n_indices, reinterpret_cast<const U*>(logical_index_vec),
+                      reinterpret_cast<TypedChunkLocation<U>*>(out_chunk_location_vec),
+                      static_cast<int32_t>(chunk_hint));
+    } else {
+      static_assert(std::is_unsigned_v<IndexType>);
+      ResolveManyImpl(n_indices, logical_index_vec, out_chunk_location_vec,
+                      static_cast<int32_t>(chunk_hint));
+    }
+    return true;
+  }
+
+ private:
+  template <bool StoreCachedChunk>
+  inline int64_t ResolveChunkIndex(int64_t index, int32_t cached_chunk) const {
+    // It is common for algorithms sequentially processing arrays to make consecutive
+    // accesses at a relatively small distance from each other, hence often falling in the
+    // same chunk.
+    //
+    // This is guaranteed when merging (assuming each side of the merge uses its
+    // own resolver), and is the most common case in recursive invocations of
+    // partitioning.
+    const auto num_offsets = static_cast<uint32_t>(offsets_.size());
+    const int64_t* offsets = offsets_.data();
+    if (ARROW_PREDICT_TRUE(index >= offsets[cached_chunk]) &&
+        (static_cast<uint32_t>(cached_chunk + 1) == num_offsets ||
+         index < offsets[cached_chunk + 1])) {
+      return cached_chunk;
+    }
+    // lo < hi is guaranteed by `num_offsets = chunks.size() + 1`
+    const auto chunk_index = Bisect(index, offsets, /*lo=*/0, /*hi=*/num_offsets);
+    if constexpr (StoreCachedChunk) {
+      assert(static_cast<uint32_t>(chunk_index) < static_cast<uint32_t>(offsets_.size()));
+      cached_chunk_.store(chunk_index, std::memory_order_relaxed);
+    }
+    return chunk_index;
+  }
+
+  /// \pre all the pre-conditions of ChunkResolver::ResolveMany()
+  /// \pre num_offsets - 1 <= std::numeric_limits<IndexType>::max()
+  void ResolveManyImpl(int64_t, const uint8_t*, TypedChunkLocation<uint8_t>*,
+                       int32_t) const;
+  void ResolveManyImpl(int64_t, const uint16_t*, TypedChunkLocation<uint16_t>*,
+                       int32_t) const;
+  void ResolveManyImpl(int64_t, const uint32_t*, TypedChunkLocation<uint32_t>*,
+                       int32_t) const;
+  void ResolveManyImpl(int64_t, const uint64_t*, TypedChunkLocation<uint64_t>*,
+                       int32_t) const;
+
+ public:
+  /// \brief Find the index of the chunk that contains the logical index.
+  ///
+  /// Any non-negative index is accepted. When `hi=num_offsets`, the largest
+  /// possible return value is `num_offsets-1` which is equal to
+  /// `chunks.size()`. Which is returned when the logical index is greater or
+  /// equal the logical length of the chunked array.
+  ///
+  /// \pre index >= 0 (otherwise, when index is negative, hi-1 is returned)
+  /// \pre lo < hi
+  /// \pre lo >= 0 && hi <= offsets_.size()
+  static inline int32_t Bisect(int64_t index, const int64_t* offsets, int32_t lo,
+                               int32_t hi) {
+    return Bisect(static_cast<uint64_t>(index),
+                  reinterpret_cast<const uint64_t*>(offsets), static_cast<uint32_t>(lo),
+                  static_cast<uint32_t>(hi));
+  }
+
+  static inline int32_t Bisect(uint64_t index, const uint64_t* offsets, uint32_t lo,
+                               uint32_t hi) {
+    // Similar to std::upper_bound(), but slightly different as our offsets
+    // array always starts with 0.
+    auto n = hi - lo;
+    // First iteration does not need to check for n > 1
+    // (lo < hi is guaranteed by the precondition).
+    assert(n > 1 && "lo < hi is a precondition of Bisect");
+    do {
+      const uint32_t m = n >> 1;
+      const uint32_t mid = lo + m;
+      if (index >= offsets[mid]) {
+        lo = mid;
+        n -= m;
+      } else {
+        n = m;
+      }
+    } while (n > 1);
+    return lo;
+  }
+};
+
+// Explicitly instantiate template base struct, for DLL linking on Windows
+template struct TypedChunkLocation<int32_t>;
+template struct TypedChunkLocation<int16_t>;
+template struct TypedChunkLocation<int8_t>;
+template struct TypedChunkLocation<uint8_t>;
+template struct TypedChunkLocation<uint16_t>;
+template struct TypedChunkLocation<uint32_t>;
+template struct TypedChunkLocation<int64_t>;
+template struct TypedChunkLocation<uint64_t>;
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/chunked_array.h b/pyarrow/include/arrow/chunked_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..02bcd0f9026bc7ba8ac9ef2daf2a3bd7ab31d56f
--- /dev/null
+++ b/pyarrow/include/arrow/chunked_array.h
@@ -0,0 +1,283 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/chunk_resolver.h"
+#include "arrow/compare.h"
+#include "arrow/device_allocation_type_set.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+class MemoryPool;
+namespace stl {
+template <typename T, typename V>
+class ChunkedArrayIterator;
+}  // namespace stl
+
+/// \class ChunkedArray
+/// \brief A data structure managing a list of primitive Arrow arrays logically
+/// as one large array
+///
+/// Data chunking is treated throughout this project largely as an
+/// implementation detail for performance and memory use optimization.
+/// ChunkedArray allows Array objects to be collected and interpreted
+/// as a single logical array without requiring an expensive concatenation
+/// step.
+///
+/// In some cases, data produced by a function may exceed the capacity of an
+/// Array (like BinaryArray or StringArray) and so returning multiple Arrays is
+/// the only possibility. In these cases, we recommend returning a ChunkedArray
+/// instead of vector of Arrays or some alternative.
+///
+/// When data is processed in parallel, it may not be practical or possible to
+/// create large contiguous memory allocations and write output into them. With
+/// some data types, like binary and string types, it is not possible at all to
+/// produce non-chunked array outputs without requiring a concatenation step at
+/// the end of processing.
+///
+/// Application developers may tune chunk sizes based on analysis of
+/// performance profiles but many developer-users will not need to be
+/// especially concerned with the chunking details.
+///
+/// Preserving the chunk layout/sizes in processing steps is generally not
+/// considered to be a contract in APIs. A function may decide to alter the
+/// chunking of its result. Similarly, APIs accepting multiple ChunkedArray
+/// inputs should not expect the chunk layout to be the same in each input.
+class ARROW_EXPORT ChunkedArray {
+ public:
+  ChunkedArray(ChunkedArray&&) = default;
+  ChunkedArray& operator=(ChunkedArray&&) = default;
+
+  /// \brief Construct a chunked array from a single Array
+  explicit ChunkedArray(std::shared_ptr<Array> chunk)
+      : ChunkedArray(ArrayVector{std::move(chunk)}) {}
+
+  /// \brief Construct a chunked array from a vector of arrays and an optional data type
+  ///
+  /// The vector elements must have the same data type.
+  /// If the data type is passed explicitly, the vector may be empty.
+  /// If the data type is omitted, the vector must be non-empty.
+  explicit ChunkedArray(ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
+
+  // \brief Constructor with basic input validation.
+  static Result<std::shared_ptr<ChunkedArray>> Make(
+      ArrayVector chunks, std::shared_ptr<DataType> type = NULLPTR);
+
+  /// \brief Create an empty ChunkedArray of a given type
+  ///
+  /// The output ChunkedArray will have one chunk with an empty
+  /// array of the given type.
+  ///
+  /// \param[in] type the data type of the empty ChunkedArray
+  /// \param[in] pool the memory pool to allocate memory from
+  /// \return the resulting ChunkedArray
+  static Result<std::shared_ptr<ChunkedArray>> MakeEmpty(
+      std::shared_ptr<DataType> type, MemoryPool* pool = default_memory_pool());
+
+  /// \return the total length of the chunked array; computed on construction
+  int64_t length() const { return length_; }
+
+  /// \return the total number of nulls among all chunks
+  int64_t null_count() const { return null_count_; }
+
+  /// \return the total number of chunks in the chunked array
+  int num_chunks() const { return static_cast<int>(chunks_.size()); }
+
+  /// \return chunk a particular chunk from the chunked array
+  const std::shared_ptr<Array>& chunk(int i) const { return chunks_[i]; }
+
+  /// \return an ArrayVector of chunks
+  const ArrayVector& chunks() const { return chunks_; }
+
+  /// \return The set of device allocation types used by the chunks in this
+  /// chunked array.
+  DeviceAllocationTypeSet device_types() const;
+
+  /// \return true if all chunks are allocated on CPU-accessible memory.
+  bool is_cpu() const { return device_types().is_cpu_only(); }
+
+  /// \brief Construct a zero-copy slice of the chunked array with the
+  /// indicated offset and length
+  ///
+  /// \param[in] offset the position of the first element in the constructed
+  /// slice
+  /// \param[in] length the length of the slice. If there are not enough
+  /// elements in the chunked array, the length will be adjusted accordingly
+  ///
+  /// \return a new object wrapped in std::shared_ptr<ChunkedArray>
+  std::shared_ptr<ChunkedArray> Slice(int64_t offset, int64_t length) const;
+
+  /// \brief Slice from offset until end of the chunked array
+  std::shared_ptr<ChunkedArray> Slice(int64_t offset) const;
+
+  /// \brief Flatten this chunked array as a vector of chunked arrays, one
+  /// for each struct field
+  ///
+  /// \param[in] pool The pool for buffer allocations, if any
+  Result<std::vector<std::shared_ptr<ChunkedArray>>> Flatten(
+      MemoryPool* pool = default_memory_pool()) const;
+
+  /// Construct a zero-copy view of this chunked array with the given
+  /// type. Calls Array::View on each constituent chunk. Always succeeds if
+  /// there are zero chunks
+  Result<std::shared_ptr<ChunkedArray>> View(const std::shared_ptr<DataType>& type) const;
+
+  /// \brief Return the type of the chunked array
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  /// \brief Return a Scalar containing the value of this array at index
+  Result<std::shared_ptr<Scalar>> GetScalar(int64_t index) const;
+
+  /// \brief Determine if two chunked arrays are equal.
+  ///
+  /// Two chunked arrays can be equal only if they have equal datatypes.
+  /// However, they may be equal even if they have different chunkings.
+  bool Equals(const ChunkedArray& other,
+              const EqualOptions& opts = EqualOptions::Defaults()) const;
+  /// \brief Determine if two chunked arrays are equal.
+  bool Equals(const std::shared_ptr<ChunkedArray>& other,
+              const EqualOptions& opts = EqualOptions::Defaults()) const;
+  /// \brief Determine if two chunked arrays approximately equal
+  bool ApproxEquals(const ChunkedArray& other,
+                    const EqualOptions& = EqualOptions::Defaults()) const;
+
+  /// \return PrettyPrint representation suitable for debugging
+  std::string ToString() const;
+
+  /// \brief Perform cheap validation checks to determine obvious inconsistencies
+  /// within the chunk array's internal data.
+  ///
+  /// This is O(k*m) where k is the number of array descendents,
+  /// and m is the number of chunks.
+  ///
+  /// \return Status
+  Status Validate() const;
+
+  /// \brief Perform extensive validation checks to determine inconsistencies
+  /// within the chunk array's internal data.
+  ///
+  /// This is O(k*n) where k is the number of array descendents,
+  /// and n is the length in elements.
+  ///
+  /// \return Status
+  Status ValidateFull() const;
+
+ protected:
+  ArrayVector chunks_;
+  std::shared_ptr<DataType> type_;
+  int64_t length_;
+  int64_t null_count_;
+
+ private:
+  template <typename T, typename V>
+  friend class ::arrow::stl::ChunkedArrayIterator;
+  ChunkResolver chunk_resolver_;
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
+};
+
+namespace internal {
+
+/// \brief EXPERIMENTAL: Utility for incremental iteration over contiguous
+/// pieces of potentially differently-chunked ChunkedArray objects
+class ARROW_EXPORT MultipleChunkIterator {
+ public:
+  MultipleChunkIterator(const ChunkedArray& left, const ChunkedArray& right)
+      : left_(left),
+        right_(right),
+        pos_(0),
+        length_(left.length()),
+        chunk_idx_left_(0),
+        chunk_idx_right_(0),
+        chunk_pos_left_(0),
+        chunk_pos_right_(0) {}
+
+  bool Next(std::shared_ptr<Array>* next_left, std::shared_ptr<Array>* next_right);
+
+  int64_t position() const { return pos_; }
+
+ private:
+  const ChunkedArray& left_;
+  const ChunkedArray& right_;
+
+  // The amount of the entire ChunkedArray consumed
+  int64_t pos_;
+
+  // Length of the chunked array(s)
+  int64_t length_;
+
+  // Current left chunk
+  int chunk_idx_left_;
+
+  // Current right chunk
+  int chunk_idx_right_;
+
+  // Offset into the current left chunk
+  int64_t chunk_pos_left_;
+
+  // Offset into the current right chunk
+  int64_t chunk_pos_right_;
+};
+
+/// \brief Evaluate binary function on two ChunkedArray objects having possibly
+/// different chunk layouts. The passed binary function / functor should have
+/// the following signature.
+///
+///    Status(const Array&, const Array&, int64_t)
+///
+/// The third argument is the absolute position relative to the start of each
+/// ChunkedArray. The function is executed against each contiguous pair of
+/// array segments, slicing if necessary.
+///
+/// For example, if two arrays have chunk sizes
+///
+///   left: [10, 10, 20]
+///   right: [15, 10, 15]
+///
+/// Then the following invocations take place (pseudocode)
+///
+///   func(left.chunk[0][0:10], right.chunk[0][0:10], 0)
+///   func(left.chunk[1][0:5], right.chunk[0][10:15], 10)
+///   func(left.chunk[1][5:10], right.chunk[1][0:5], 15)
+///   func(left.chunk[2][0:5], right.chunk[1][5:10], 20)
+///   func(left.chunk[2][5:20], right.chunk[2][:], 25)
+template <typename Action>
+Status ApplyBinaryChunked(const ChunkedArray& left, const ChunkedArray& right,
+                          Action&& action) {
+  MultipleChunkIterator iterator(left, right);
+  std::shared_ptr<Array> left_piece, right_piece;
+  while (iterator.Next(&left_piece, &right_piece)) {
+    ARROW_RETURN_NOT_OK(action(*left_piece, *right_piece, iterator.position()));
+  }
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compare.h b/pyarrow/include/arrow/compare.h
new file mode 100644
index 0000000000000000000000000000000000000000..2198495d7d20371d86aef50b8beb00541d323e74
--- /dev/null
+++ b/pyarrow/include/arrow/compare.h
@@ -0,0 +1,213 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for comparing Arrow data structures
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+struct ArrayStatistics;
+class Array;
+class DataType;
+class Tensor;
+class SparseTensor;
+struct Scalar;
+
+static constexpr double kDefaultAbsoluteTolerance = 1E-5;
+
+/// A container of options for equality comparisons
+class EqualOptions {
+ public:
+  /// Whether or not NaNs are considered equal.
+  bool nans_equal() const { return nans_equal_; }
+
+  /// Return a new EqualOptions object with the "nans_equal" property changed.
+  EqualOptions nans_equal(bool v) const {
+    auto res = EqualOptions(*this);
+    res.nans_equal_ = v;
+    return res;
+  }
+
+  /// Whether or not zeros with differing signs are considered equal.
+  bool signed_zeros_equal() const { return signed_zeros_equal_; }
+
+  /// Return a new EqualOptions object with the "signed_zeros_equal" property changed.
+  EqualOptions signed_zeros_equal(bool v) const {
+    auto res = EqualOptions(*this);
+    res.signed_zeros_equal_ = v;
+    return res;
+  }
+
+  /// Whether the "atol" property is used in the comparison.
+  ///
+  /// This option only affects the Equals methods
+  /// and has no effect on ApproxEquals methods.
+  bool use_atol() const { return use_atol_; }
+
+  /// Return a new EqualOptions object with the "use_atol" property changed.
+  EqualOptions use_atol(bool v) const {
+    auto res = EqualOptions(*this);
+    res.use_atol_ = v;
+    return res;
+  }
+
+  /// The absolute tolerance for approximate comparisons of floating-point values.
+  /// Note that this option is ignored if "use_atol" is set to false.
+  double atol() const { return atol_; }
+
+  /// Return a new EqualOptions object with the "atol" property changed.
+  EqualOptions atol(double v) const {
+    auto res = EqualOptions(*this);
+    res.atol_ = v;
+    return res;
+  }
+
+  /// Whether the \ref arrow::Schema property is used in the comparison.
+  ///
+  /// This option only affects the Equals methods
+  /// and has no effect on ApproxEquals methods.
+  bool use_schema() const { return use_schema_; }
+
+  /// Return a new EqualOptions object with the "use_schema_" property changed.
+  ///
+  /// Setting this option is false making the value of \ref EqualOptions::use_metadata
+  /// is ignored.
+  EqualOptions use_schema(bool v) const {
+    auto res = EqualOptions(*this);
+    res.use_schema_ = v;
+    return res;
+  }
+
+  /// Whether the "metadata" in \ref arrow::Schema is used in the comparison.
+  ///
+  /// This option only affects the Equals methods
+  /// and has no effect on the ApproxEquals methods.
+  ///
+  /// Note: This option is only considered when \ref arrow::EqualOptions::use_schema is
+  /// set to true.
+  bool use_metadata() const { return use_metadata_; }
+
+  /// Return a new EqualOptions object with the "use_metadata" property changed.
+  EqualOptions use_metadata(bool v) const {
+    auto res = EqualOptions(*this);
+    res.use_metadata_ = v;
+    return res;
+  }
+
+  /// The ostream to which a diff will be formatted if arrays disagree.
+  /// If this is null (the default) no diff will be formatted.
+  std::ostream* diff_sink() const { return diff_sink_; }
+
+  /// Return a new EqualOptions object with the "diff_sink" property changed.
+  /// This option will be ignored if diff formatting of the types of compared arrays is
+  /// not supported.
+  EqualOptions diff_sink(std::ostream* diff_sink) const {
+    auto res = EqualOptions(*this);
+    res.diff_sink_ = diff_sink;
+    return res;
+  }
+
+  static EqualOptions Defaults() { return {}; }
+
+ protected:
+  double atol_ = kDefaultAbsoluteTolerance;
+  bool nans_equal_ = false;
+  bool signed_zeros_equal_ = true;
+  bool use_atol_ = false;
+  bool use_schema_ = true;
+  bool use_metadata_ = false;
+
+  std::ostream* diff_sink_ = NULLPTR;
+};
+
+/// Returns true if the arrays are exactly equal
+///
+/// Note that arrow::ArrayStatistics is not included in the comparison.
+ARROW_EXPORT bool ArrayEquals(const Array& left, const Array& right,
+                              const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the arrays are approximately equal. For non-floating point
+/// types, this is equivalent to ArrayEquals(left, right)
+///
+/// Note that arrow::ArrayStatistics is not included in the comparison.
+ARROW_EXPORT bool ArrayApproxEquals(const Array& left, const Array& right,
+                                    const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if indicated equal-length segment of arrays are exactly equal
+///
+/// Note that arrow::ArrayStatistics is not included in the comparison.
+ARROW_EXPORT bool ArrayRangeEquals(const Array& left, const Array& right,
+                                   int64_t start_idx, int64_t end_idx,
+                                   int64_t other_start_idx,
+                                   const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if indicated equal-length segment of arrays are approximately equal
+///
+/// Note that arrow::ArrayStatistics is not included in the comparison.
+ARROW_EXPORT bool ArrayRangeApproxEquals(const Array& left, const Array& right,
+                                         int64_t start_idx, int64_t end_idx,
+                                         int64_t other_start_idx,
+                                         const EqualOptions& = EqualOptions::Defaults());
+
+ARROW_EXPORT bool TensorEquals(const Tensor& left, const Tensor& right,
+                               const EqualOptions& = EqualOptions::Defaults());
+
+/// EXPERIMENTAL: Returns true if the given sparse tensors are exactly equal
+ARROW_EXPORT bool SparseTensorEquals(const SparseTensor& left, const SparseTensor& right,
+                                     const EqualOptions& = EqualOptions::Defaults());
+
+/// Returns true if the type metadata are exactly equal
+/// \param[in] left a DataType
+/// \param[in] right a DataType
+/// \param[in] check_metadata whether to compare KeyValueMetadata for child
+/// fields
+ARROW_EXPORT bool TypeEquals(const DataType& left, const DataType& right,
+                             bool check_metadata = true);
+
+/// \brief Check two \ref arrow::ArrayStatistics for equality
+/// \param[in] left an \ref arrow::ArrayStatistics
+/// \param[in] right an \ref arrow::ArrayStatistics
+/// \param[in] options Options used to compare double values for equality.
+/// \return True if the two \ref arrow::ArrayStatistics instances are equal; otherwise,
+/// false.
+ARROW_EXPORT bool ArrayStatisticsEquals(
+    const ArrayStatistics& left, const ArrayStatistics& right,
+    const EqualOptions& options = EqualOptions::Defaults());
+
+/// Returns true if scalars are equal
+/// \param[in] left a Scalar
+/// \param[in] right a Scalar
+/// \param[in] options comparison options
+ARROW_EXPORT bool ScalarEquals(const Scalar& left, const Scalar& right,
+                               const EqualOptions& options = EqualOptions::Defaults());
+
+/// Returns true if scalars are approximately equal
+/// \param[in] left a Scalar
+/// \param[in] right a Scalar
+/// \param[in] options comparison options
+ARROW_EXPORT bool ScalarApproxEquals(
+    const Scalar& left, const Scalar& right,
+    const EqualOptions& options = EqualOptions::Defaults());
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/api.h b/pyarrow/include/arrow/compute/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..343e30643cfd31916caafc4a84a3fd393c9a84ef
--- /dev/null
+++ b/pyarrow/include/arrow/compute/api.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+/// \defgroup compute-functions Abstract compute function API
+/// @{
+/// @}
+
+/// \defgroup compute-concrete-options Concrete option classes for compute functions
+/// @{
+/// @}
+
+#include "arrow/compute/api_aggregate.h"     // IWYU pragma: export
+#include "arrow/compute/api_scalar.h"        // IWYU pragma: export
+#include "arrow/compute/api_vector.h"        // IWYU pragma: export
+#include "arrow/compute/cast.h"              // IWYU pragma: export
+#include "arrow/compute/function.h"          // IWYU pragma: export
+#include "arrow/compute/function_options.h"  // IWYU pragma: export
+#include "arrow/compute/initialize.h"        // IWYU pragma: export
+#include "arrow/compute/kernel.h"            // IWYU pragma: export
+#include "arrow/compute/registry.h"          // IWYU pragma: export
+#include "arrow/datum.h"                     // IWYU pragma: export
+
+#include "arrow/compute/expression.h"  // IWYU pragma: export
+
+/// \defgroup execnode-row Utilities for working with data in a row-major format
+/// @{
+/// @}
+
+#include "arrow/compute/row/grouper.h"  // IWYU pragma: export
+
+/// \defgroup acero-internals Acero internals, useful for those extending Acero
+/// @{
+/// @}
+
+#include "arrow/compute/exec.h"  // IWYU pragma: export
diff --git a/pyarrow/include/arrow/compute/api_aggregate.h b/pyarrow/include/arrow/compute/api_aggregate.h
new file mode 100644
index 0000000000000000000000000000000000000000..d31e0a73156dc8007e0fefaeabc5e9b3e60618fa
--- /dev/null
+++ b/pyarrow/include/arrow/compute/api_aggregate.h
@@ -0,0 +1,596 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Eager evaluation convenience APIs for invoking common functions, including
+// necessary memory allocations
+
+#pragma once
+
+#include <vector>
+
+#include "arrow/compute/function_options.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+
+namespace compute {
+
+class ExecContext;
+
+// ----------------------------------------------------------------------
+// Aggregate functions
+
+/// \addtogroup compute-concrete-options
+/// @{
+
+/// \brief Control general scalar aggregate kernel behavior
+///
+/// By default, null values are ignored (skip_nulls = true).
+class ARROW_EXPORT ScalarAggregateOptions : public FunctionOptions {
+ public:
+  explicit ScalarAggregateOptions(bool skip_nulls = true, uint32_t min_count = 1);
+  static constexpr const char kTypeName[] = "ScalarAggregateOptions";
+  static ScalarAggregateOptions Defaults() { return ScalarAggregateOptions{}; }
+
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control count aggregate kernel behavior.
+///
+/// By default, only non-null values are counted.
+class ARROW_EXPORT CountOptions : public FunctionOptions {
+ public:
+  enum CountMode {
+    /// Count only non-null values.
+    ONLY_VALID = 0,
+    /// Count only null values.
+    ONLY_NULL,
+    /// Count both non-null and null values.
+    ALL,
+  };
+  explicit CountOptions(CountMode mode = CountMode::ONLY_VALID);
+  static constexpr const char kTypeName[] = "CountOptions";
+  static CountOptions Defaults() { return CountOptions{}; }
+
+  CountMode mode;
+};
+
+/// \brief Control Mode kernel behavior
+///
+/// Returns top-n common values and counts.
+/// By default, returns the most common value and count.
+class ARROW_EXPORT ModeOptions : public FunctionOptions {
+ public:
+  explicit ModeOptions(int64_t n = 1, bool skip_nulls = true, uint32_t min_count = 0);
+  static constexpr const char kTypeName[] = "ModeOptions";
+  static ModeOptions Defaults() { return ModeOptions{}; }
+
+  int64_t n = 1;
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control Delta Degrees of Freedom (ddof) of Variance and Stddev kernel
+///
+/// The divisor used in calculations is N - ddof, where N is the number of elements.
+/// By default, ddof is zero, and population variance or stddev is returned.
+class ARROW_EXPORT VarianceOptions : public FunctionOptions {
+ public:
+  explicit VarianceOptions(int ddof = 0, bool skip_nulls = true, uint32_t min_count = 0);
+  static constexpr const char kTypeName[] = "VarianceOptions";
+  static VarianceOptions Defaults() { return VarianceOptions{}; }
+
+  int ddof = 0;
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control Skew and Kurtosis kernel behavior
+class ARROW_EXPORT SkewOptions : public FunctionOptions {
+ public:
+  explicit SkewOptions(bool skip_nulls = true, bool biased = true,
+                       uint32_t min_count = 0);
+  static constexpr const char kTypeName[] = "SkewOptions";
+  static SkewOptions Defaults() { return SkewOptions{}; }
+
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If true (the default), the calculated value is biased. If false, the calculated
+  /// value includes a correction factor to reduce bias, making it more accurate for
+  /// small sample sizes.
+  bool biased;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control Quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT QuantileOptions : public FunctionOptions {
+ public:
+  /// Interpolation method to use when quantile lies between two data points
+  enum Interpolation {
+    LINEAR = 0,
+    LOWER,
+    HIGHER,
+    NEAREST,
+    MIDPOINT,
+  };
+
+  explicit QuantileOptions(double q = 0.5, enum Interpolation interpolation = LINEAR,
+                           bool skip_nulls = true, uint32_t min_count = 0);
+
+  explicit QuantileOptions(std::vector<double> q,
+                           enum Interpolation interpolation = LINEAR,
+                           bool skip_nulls = true, uint32_t min_count = 0);
+
+  static constexpr const char kTypeName[] = "QuantileOptions";
+  static QuantileOptions Defaults() { return QuantileOptions{}; }
+
+  /// probability level of quantile must be between 0 and 1 inclusive
+  std::vector<double> q;
+  enum Interpolation interpolation;
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control TDigest approximate quantile kernel behavior
+///
+/// By default, returns the median value.
+class ARROW_EXPORT TDigestOptions : public FunctionOptions {
+ public:
+  explicit TDigestOptions(double q = 0.5, uint32_t delta = 100,
+                          uint32_t buffer_size = 500, bool skip_nulls = true,
+                          uint32_t min_count = 0);
+  explicit TDigestOptions(std::vector<double> q, uint32_t delta = 100,
+                          uint32_t buffer_size = 500, bool skip_nulls = true,
+                          uint32_t min_count = 0);
+  static constexpr const char kTypeName[] = "TDigestOptions";
+  static TDigestOptions Defaults() { return TDigestOptions{}; }
+
+  /// probability level of quantile must be between 0 and 1 inclusive
+  std::vector<double> q;
+  /// compression parameter, default 100
+  uint32_t delta;
+  /// input buffer size, default 500
+  uint32_t buffer_size;
+  /// If true (the default), null values are ignored. Otherwise, if any value is null,
+  /// emit null.
+  bool skip_nulls;
+  /// If less than this many non-null values are observed, emit null.
+  uint32_t min_count;
+};
+
+/// \brief Control Pivot kernel behavior
+///
+/// These options apply to the "pivot_wider" and "hash_pivot_wider" functions.
+///
+/// Constraints:
+/// - The corresponding `Aggregate::target` must have two FieldRef elements;
+///   the first one points to the pivot key column, the second points to the
+///   pivoted data column.
+/// - The pivot key column can be string, binary or integer; its values will be
+///   matched against `key_names` in order to dispatch the pivoted data into
+///   the output. If the pivot key column is not string-like, the `key_names`
+///   will be cast to the pivot key type.
+///
+/// "pivot_wider" example
+/// ---------------------
+///
+/// Assuming the following two input columns with types utf8 and int16 (respectively):
+/// ```
+/// width   |  11
+/// height  |  13
+/// ```
+/// and the options `PivotWiderOptions(.key_names = {"height", "width"})`
+///
+/// then the output will be a scalar with the type
+/// `struct{"height": int16, "width": int16}`
+/// and the value `{"height": 13, "width": 11}`.
+///
+/// "hash_pivot_wider" example
+/// --------------------------
+///
+/// Assuming the following input with schema
+/// `{"group": int32, "key": utf8, "value": int16}`:
+/// ```
+///  group |  key     |  value
+/// -----------------------------
+///   1    |  height  |    11
+///   1    |  width   |    12
+///   2    |  width   |    13
+///   3    |  height  |    14
+///   3    |  depth   |    15
+/// ```
+/// and the following settings:
+/// - a hash grouping key "group"
+/// - Aggregate(
+///     .function = "hash_pivot_wider",
+///     .options = PivotWiderOptions(.key_names = {"height", "width"}),
+///     .target = {"key", "value"},
+///     .name = {"properties"})
+///
+/// then the output will have the schema
+/// `{"group": int32, "properties": struct{"height": int16, "width": int16}}`
+/// and the following value:
+/// ```
+///  group |     properties
+///        |  height  |   width
+/// -----------------------------
+///   1    |   11     |    12
+///   2    |   null   |    13
+///   3    |   14     |    null
+/// ```
+class ARROW_EXPORT PivotWiderOptions : public FunctionOptions {
+ public:
+  /// Configure the behavior of pivot keys not in `key_names`
+  enum UnexpectedKeyBehavior {
+    /// Unexpected pivot keys are ignored silently
+    kIgnore,
+    /// Unexpected pivot keys return a KeyError
+    kRaise
+  };
+
+  explicit PivotWiderOptions(std::vector<std::string> key_names,
+                             UnexpectedKeyBehavior unexpected_key_behavior = kIgnore);
+  // Default constructor for serialization
+  PivotWiderOptions();
+  static constexpr const char kTypeName[] = "PivotWiderOptions";
+  static PivotWiderOptions Defaults() { return PivotWiderOptions{}; }
+
+  /// The values expected in the pivot key column
+  std::vector<std::string> key_names;
+  /// The behavior when pivot keys not in `key_names` are encountered
+  UnexpectedKeyBehavior unexpected_key_behavior = kIgnore;
+};
+
+/// \brief Control Index kernel behavior
+class ARROW_EXPORT IndexOptions : public FunctionOptions {
+ public:
+  explicit IndexOptions(std::shared_ptr<Scalar> value);
+  // Default constructor for serialization
+  IndexOptions();
+  static constexpr const char kTypeName[] = "IndexOptions";
+
+  std::shared_ptr<Scalar> value;
+};
+
+/// \brief Configure a grouped aggregation
+struct ARROW_EXPORT Aggregate {
+  Aggregate() = default;
+
+  Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
+            std::vector<FieldRef> target, std::string name = "")
+      : function(std::move(function)),
+        options(std::move(options)),
+        target(std::move(target)),
+        name(std::move(name)) {}
+
+  Aggregate(std::string function, std::shared_ptr<FunctionOptions> options,
+            FieldRef target, std::string name = "")
+      : Aggregate(std::move(function), std::move(options),
+                  std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
+
+  Aggregate(std::string function, FieldRef target, std::string name)
+      : Aggregate(std::move(function), /*options=*/NULLPTR,
+                  std::vector<FieldRef>{std::move(target)}, std::move(name)) {}
+
+  Aggregate(std::string function, std::string name)
+      : Aggregate(std::move(function), /*options=*/NULLPTR,
+                  /*target=*/std::vector<FieldRef>{}, std::move(name)) {}
+
+  /// the name of the aggregation function
+  std::string function;
+
+  /// options for the aggregation function
+  std::shared_ptr<FunctionOptions> options;
+
+  /// zero or more fields to which aggregations will be applied
+  std::vector<FieldRef> target;
+
+  /// optional output field name for aggregations
+  std::string name;
+};
+
+/// @}
+
+/// \brief Count values in an array.
+///
+/// \param[in] options counting options, see CountOptions for more information
+/// \param[in] datum to count
+/// \param[in] ctx the function execution context, optional
+/// \return out resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Count(const Datum& datum,
+                    const CountOptions& options = CountOptions::Defaults(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the mean of a numeric array.
+///
+/// \param[in] value datum to compute the mean, expecting Array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed mean as a DoubleScalar
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Mean(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the product of values of a numeric array.
+///
+/// \param[in] value datum to compute product of, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed sum as a Scalar
+///
+/// \since 6.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Product(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Sum values of a numeric array.
+///
+/// \param[in] value datum to sum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed sum as a Scalar
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Sum(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the first value of an array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed first as Scalar
+///
+/// \since 13.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> First(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the last value of an array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed last as a Scalar
+///
+/// \since 13.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Last(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the min / max of a numeric array
+///
+/// This function returns both the min and max as a struct scalar, with type
+/// struct<min: T, max: T>, where T is the input type
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a struct<min: T, max: T> scalar
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> MinMax(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether any element in a boolean array evaluates to true.
+///
+/// This function returns true if any of the elements in the array evaluates
+/// to true and false otherwise. Null values are ignored by default.
+/// If null values are taken into account by setting ScalarAggregateOptions
+/// parameter skip_nulls = false then Kleene logic is used.
+/// See KleeneOr for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Any(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Test whether all elements in a boolean array evaluate to true.
+///
+/// This function returns true if all of the elements in the array evaluate
+/// to true and false otherwise. Null values are ignored by default.
+/// If null values are taken into account by setting ScalarAggregateOptions
+/// parameter skip_nulls = false then Kleene logic is used.
+/// See KleeneAnd for more details on Kleene logic.
+///
+/// \param[in] value input datum, expecting a boolean array
+/// \param[in] options see ScalarAggregateOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as a BooleanScalar
+
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> All(
+    const Datum& value,
+    const ScalarAggregateOptions& options = ScalarAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the modal (most common) value of a numeric array
+///
+/// This function returns top-n most common values and number of times they occur as
+/// an array of `struct<mode: T, count: int64>`, where T is the input type.
+/// Values with larger counts are returned before smaller ones.
+/// If there are more than one values with same count, smaller value is returned first.
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see ModeOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array of struct<mode: T, count: int64>
+///
+/// \since 2.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Mode(const Datum& value,
+                   const ModeOptions& options = ModeOptions::Defaults(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the standard deviation of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see VarianceOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed standard deviation as a DoubleScalar
+///
+/// \since 2.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Stddev(const Datum& value,
+                     const VarianceOptions& options = VarianceOptions::Defaults(),
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the variance of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see VarianceOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed variance as a DoubleScalar
+///
+/// \since 2.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Variance(const Datum& value,
+                       const VarianceOptions& options = VarianceOptions::Defaults(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the skewness of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see SkewOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed skewness as a DoubleScalar
+///
+/// \since 20.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Skew(const Datum& value,
+                   const SkewOptions& options = SkewOptions::Defaults(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the kurtosis of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see SkewOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return datum of the computed kurtosis as a DoubleScalar
+///
+/// \since 20.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Kurtosis(const Datum& value,
+                       const SkewOptions& options = SkewOptions::Defaults(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the quantiles of a numeric array
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see QuantileOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Quantile(const Datum& value,
+                       const QuantileOptions& options = QuantileOptions::Defaults(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Calculate the approximate quantiles of a numeric array with T-Digest algorithm
+///
+/// \param[in] value input datum, expecting Array or ChunkedArray
+/// \param[in] options see TDigestOptions for more information
+/// \param[in] ctx the function execution context, optional
+/// \return resulting datum as an array
+///
+/// \since 4.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> TDigest(const Datum& value,
+                      const TDigestOptions& options = TDigestOptions::Defaults(),
+                      ExecContext* ctx = NULLPTR);
+
+/// \brief Find the first index of a value in an array.
+///
+/// \param[in] value The array to search.
+/// \param[in] options The array to search for. See IndexOptions.
+/// \param[in] ctx the function execution context, optional
+/// \return out a Scalar containing the index (or -1 if not found).
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Index(const Datum& value, const IndexOptions& options,
+                    ExecContext* ctx = NULLPTR);
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/api_scalar.h b/pyarrow/include/arrow/compute/api_scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b341e865a1665ee18229d0b78ad1aaf2d778325
--- /dev/null
+++ b/pyarrow/include/arrow/compute/api_scalar.h
@@ -0,0 +1,1802 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Eager evaluation convenience APIs for invoking common functions, including
+// necessary memory allocations
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "arrow/compute/function_options.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+/// \addtogroup compute-concrete-options
+///
+/// @{
+
+class ARROW_EXPORT ArithmeticOptions : public FunctionOptions {
+ public:
+  explicit ArithmeticOptions(bool check_overflow = false);
+  static constexpr const char kTypeName[] = "ArithmeticOptions";
+  bool check_overflow;
+};
+
+class ARROW_EXPORT ElementWiseAggregateOptions : public FunctionOptions {
+ public:
+  explicit ElementWiseAggregateOptions(bool skip_nulls = true);
+  static constexpr const char kTypeName[] = "ElementWiseAggregateOptions";
+  static ElementWiseAggregateOptions Defaults() { return ElementWiseAggregateOptions{}; }
+  bool skip_nulls;
+};
+
+/// Rounding and tie-breaking modes for round compute functions.
+/// Additional details and examples are provided in compute.rst.
+enum class RoundMode : int8_t {
+  /// Round to nearest integer less than or equal in magnitude (aka "floor")
+  DOWN,
+  /// Round to nearest integer greater than or equal in magnitude (aka "ceil")
+  UP,
+  /// Get the integral part without fractional digits (aka "trunc")
+  TOWARDS_ZERO,
+  /// Round negative values with DOWN rule
+  /// and positive values with UP rule (aka "away from zero")
+  TOWARDS_INFINITY,
+  /// Round ties with DOWN rule (also called "round half towards negative infinity")
+  HALF_DOWN,
+  /// Round ties with UP rule (also called "round half towards positive infinity")
+  HALF_UP,
+  /// Round ties with TOWARDS_ZERO rule (also called "round half away from infinity")
+  HALF_TOWARDS_ZERO,
+  /// Round ties with TOWARDS_INFINITY rule (also called "round half away from zero")
+  HALF_TOWARDS_INFINITY,
+  /// Round ties to nearest even integer
+  HALF_TO_EVEN,
+  /// Round ties to nearest odd integer
+  HALF_TO_ODD,
+};
+
+class ARROW_EXPORT RoundOptions : public FunctionOptions {
+ public:
+  explicit RoundOptions(int64_t ndigits = 0,
+                        RoundMode round_mode = RoundMode::HALF_TO_EVEN);
+  static constexpr const char kTypeName[] = "RoundOptions";
+  static RoundOptions Defaults() { return RoundOptions(); }
+  /// Rounding precision (number of digits to round to)
+  int64_t ndigits;
+  /// Rounding and tie-breaking mode
+  RoundMode round_mode;
+};
+
+class ARROW_EXPORT RoundBinaryOptions : public FunctionOptions {
+ public:
+  explicit RoundBinaryOptions(RoundMode round_mode = RoundMode::HALF_TO_EVEN);
+  static constexpr const char kTypeName[] = "RoundBinaryOptions";
+  static RoundBinaryOptions Defaults() { return RoundBinaryOptions(); }
+  /// Rounding and tie-breaking mode
+  RoundMode round_mode;
+};
+
+enum class CalendarUnit : int8_t {
+  NANOSECOND,
+  MICROSECOND,
+  MILLISECOND,
+  SECOND,
+  MINUTE,
+  HOUR,
+  DAY,
+  WEEK,
+  MONTH,
+  QUARTER,
+  YEAR
+};
+
+class ARROW_EXPORT RoundTemporalOptions : public FunctionOptions {
+ public:
+  explicit RoundTemporalOptions(int multiple = 1, CalendarUnit unit = CalendarUnit::DAY,
+                                bool week_starts_monday = true,
+                                bool ceil_is_strictly_greater = false,
+                                bool calendar_based_origin = false);
+  static constexpr const char kTypeName[] = "RoundTemporalOptions";
+  static RoundTemporalOptions Defaults() { return RoundTemporalOptions(); }
+
+  /// Number of units to round to
+  int multiple;
+  /// The unit used for rounding of time
+  CalendarUnit unit;
+  /// What day does the week start with (Monday=true, Sunday=false)
+  bool week_starts_monday;
+  /// Enable this flag to return a rounded value that is strictly greater than the input.
+  /// For example: ceiling 1970-01-01T00:00:00 to 3 hours would yield 1970-01-01T03:00:00
+  /// if set to true and 1970-01-01T00:00:00 if set to false.
+  /// This applies for ceiling only.
+  bool ceil_is_strictly_greater;
+  /// By default time is rounded to a multiple of units since 1970-01-01T00:00:00.
+  /// By setting calendar_based_origin to true, time will be rounded to a number
+  /// of units since the last greater calendar unit.
+  /// For example: rounding to a multiple of days since the beginning of the month or
+  /// to hours since the beginning of the day.
+  /// Exceptions: week and quarter are not used as greater units, therefore days will
+  /// will be rounded to the beginning of the month not week. Greater unit of week
+  /// is year.
+  /// Note that ceiling and rounding might change sorting order of an array near greater
+  /// unit change. For example rounding YYYY-mm-dd 23:00:00 to 5 hours will ceil and
+  /// round to YYYY-mm-dd+1 01:00:00 and floor to YYYY-mm-dd 20:00:00. On the other hand
+  /// YYYY-mm-dd+1 00:00:00 will ceil, round and floor to YYYY-mm-dd+1 00:00:00. This
+  /// can break the order of an already ordered array.
+  bool calendar_based_origin;
+};
+
+class ARROW_EXPORT RoundToMultipleOptions : public FunctionOptions {
+ public:
+  explicit RoundToMultipleOptions(double multiple = 1.0,
+                                  RoundMode round_mode = RoundMode::HALF_TO_EVEN);
+  explicit RoundToMultipleOptions(std::shared_ptr<Scalar> multiple,
+                                  RoundMode round_mode = RoundMode::HALF_TO_EVEN);
+  static constexpr const char kTypeName[] = "RoundToMultipleOptions";
+  static RoundToMultipleOptions Defaults() { return RoundToMultipleOptions(); }
+  /// Rounding scale (multiple to round to).
+  ///
+  /// Should be a positive numeric scalar of a type compatible with the
+  /// argument to be rounded. The cast kernel is used to convert the rounding
+  /// multiple to match the result type.
+  std::shared_ptr<Scalar> multiple;
+  /// Rounding and tie-breaking mode
+  RoundMode round_mode;
+};
+
+/// Options for var_args_join.
+class ARROW_EXPORT JoinOptions : public FunctionOptions {
+ public:
+  /// How to handle null values. (A null separator always results in a null output.)
+  enum NullHandlingBehavior {
+    /// A null in any input results in a null in the output.
+    EMIT_NULL,
+    /// Nulls in inputs are skipped.
+    SKIP,
+    /// Nulls in inputs are replaced with the replacement string.
+    REPLACE,
+  };
+  explicit JoinOptions(NullHandlingBehavior null_handling = EMIT_NULL,
+                       std::string null_replacement = "");
+  static constexpr const char kTypeName[] = "JoinOptions";
+  static JoinOptions Defaults() { return JoinOptions(); }
+  NullHandlingBehavior null_handling;
+  std::string null_replacement;
+};
+
+class ARROW_EXPORT MatchSubstringOptions : public FunctionOptions {
+ public:
+  explicit MatchSubstringOptions(std::string pattern, bool ignore_case = false);
+  MatchSubstringOptions();
+  static constexpr const char kTypeName[] = "MatchSubstringOptions";
+
+  /// The exact substring (or regex, depending on kernel) to look for inside input values.
+  std::string pattern;
+  /// Whether to perform a case-insensitive match.
+  bool ignore_case;
+};
+
+class ARROW_EXPORT SplitOptions : public FunctionOptions {
+ public:
+  explicit SplitOptions(int64_t max_splits = -1, bool reverse = false);
+  static constexpr const char kTypeName[] = "SplitOptions";
+
+  /// Maximum number of splits allowed, or unlimited when -1
+  int64_t max_splits;
+  /// Start splitting from the end of the string (only relevant when max_splits != -1)
+  bool reverse;
+};
+
+class ARROW_EXPORT SplitPatternOptions : public FunctionOptions {
+ public:
+  explicit SplitPatternOptions(std::string pattern, int64_t max_splits = -1,
+                               bool reverse = false);
+  SplitPatternOptions();
+  static constexpr const char kTypeName[] = "SplitPatternOptions";
+
+  /// The exact substring to split on.
+  std::string pattern;
+  /// Maximum number of splits allowed, or unlimited when -1
+  int64_t max_splits;
+  /// Start splitting from the end of the string (only relevant when max_splits != -1)
+  bool reverse;
+};
+
+class ARROW_EXPORT ReplaceSliceOptions : public FunctionOptions {
+ public:
+  explicit ReplaceSliceOptions(int64_t start, int64_t stop, std::string replacement);
+  ReplaceSliceOptions();
+  static constexpr const char kTypeName[] = "ReplaceSliceOptions";
+
+  /// Index to start slicing at
+  int64_t start;
+  /// Index to stop slicing at
+  int64_t stop;
+  /// String to replace the slice with
+  std::string replacement;
+};
+
+class ARROW_EXPORT ReplaceSubstringOptions : public FunctionOptions {
+ public:
+  explicit ReplaceSubstringOptions(std::string pattern, std::string replacement,
+                                   int64_t max_replacements = -1);
+  ReplaceSubstringOptions();
+  static constexpr const char kTypeName[] = "ReplaceSubstringOptions";
+
+  /// Pattern to match, literal, or regular expression depending on which kernel is used
+  std::string pattern;
+  /// String to replace the pattern with
+  std::string replacement;
+  /// Max number of substrings to replace (-1 means unbounded)
+  int64_t max_replacements;
+};
+
+class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
+ public:
+  explicit ExtractRegexOptions(std::string pattern);
+  ExtractRegexOptions();
+  static constexpr const char kTypeName[] = "ExtractRegexOptions";
+
+  /// Regular expression with named capture fields
+  std::string pattern;
+};
+
+class ARROW_EXPORT ExtractRegexSpanOptions : public FunctionOptions {
+ public:
+  explicit ExtractRegexSpanOptions(std::string pattern);
+  ExtractRegexSpanOptions();
+  static constexpr const char kTypeName[] = "ExtractRegexSpanOptions";
+
+  /// Regular expression with named capture fields
+  std::string pattern;
+};
+
+/// Options for IsIn and IndexIn functions
+class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
+ public:
+  /// How to handle null values.
+  enum NullMatchingBehavior {
+    /// MATCH, any null in `value_set` is successfully matched in
+    /// the input.
+    MATCH,
+    /// SKIP, any null in `value_set` is ignored and nulls in the input
+    /// produce null (IndexIn) or false (IsIn) values in the output.
+    SKIP,
+    /// EMIT_NULL, any null in `value_set` is ignored and nulls in the
+    /// input produce null (IndexIn and IsIn) values in the output.
+    EMIT_NULL,
+    /// INCONCLUSIVE, null values are regarded as unknown values, which is
+    /// sql-compatible. nulls in the input produce null (IndexIn and IsIn)
+    /// values in the output. Besides, if `value_set` contains a null,
+    /// non-null unmatched values in the input also produce null values
+    /// (IndexIn and IsIn) in the output.
+    INCONCLUSIVE
+  };
+
+  explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH);
+  SetLookupOptions();
+
+  // DEPRECATED(will be removed after removing of skip_nulls)
+  explicit SetLookupOptions(Datum value_set, bool skip_nulls);
+
+  static constexpr const char kTypeName[] = "SetLookupOptions";
+
+  /// The set of values to look up input values into.
+  Datum value_set;
+
+  NullMatchingBehavior null_matching_behavior;
+
+  // DEPRECATED(will be removed after removing of skip_nulls)
+  NullMatchingBehavior GetNullMatchingBehavior() const;
+
+  // DEPRECATED(use null_matching_behavior instead)
+  /// Whether nulls in `value_set` count for lookup.
+  ///
+  /// If true, any null in `value_set` is ignored and nulls in the input
+  /// produce null (IndexIn) or false (IsIn) values in the output.
+  /// If false, any null in `value_set` is successfully matched in
+  /// the input.
+  std::optional<bool> skip_nulls;
+};
+
+/// Options for struct_field function
+class ARROW_EXPORT StructFieldOptions : public FunctionOptions {
+ public:
+  explicit StructFieldOptions(std::vector<int> indices);
+  explicit StructFieldOptions(std::initializer_list<int>);
+  explicit StructFieldOptions(FieldRef field_ref);
+  StructFieldOptions();
+  static constexpr const char kTypeName[] = "StructFieldOptions";
+
+  /// The FieldRef specifying what to extract from struct or union.
+  FieldRef field_ref;
+};
+
+class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
+ public:
+  explicit StrptimeOptions(std::string format, TimeUnit::type unit,
+                           bool error_is_null = false);
+  StrptimeOptions();
+  static constexpr const char kTypeName[] = "StrptimeOptions";
+
+  /// The desired format string.
+  std::string format;
+  /// The desired time resolution
+  TimeUnit::type unit;
+  /// Return null on parsing errors if true or raise if false
+  bool error_is_null;
+};
+
+class ARROW_EXPORT StrftimeOptions : public FunctionOptions {
+ public:
+  explicit StrftimeOptions(std::string format, std::string locale = "C");
+  StrftimeOptions();
+
+  static constexpr const char kTypeName[] = "StrftimeOptions";
+
+  static constexpr const char* kDefaultFormat = "%Y-%m-%dT%H:%M:%S";
+
+  /// The desired format string.
+  std::string format;
+  /// The desired output locale string.
+  std::string locale;
+};
+
+class ARROW_EXPORT PadOptions : public FunctionOptions {
+ public:
+  explicit PadOptions(int64_t width, std::string padding = " ",
+                      bool lean_left_on_odd_padding = true);
+  PadOptions();
+  static constexpr const char kTypeName[] = "PadOptions";
+
+  /// The desired string length.
+  int64_t width;
+  /// What to pad the string with. Should be one codepoint (Unicode)/byte (ASCII).
+  std::string padding;
+  /// What to do if there is an odd number of padding characters (in case of centered
+  /// padding). Defaults to aligning on the left (i.e. adding the extra padding character
+  /// on the right)
+  bool lean_left_on_odd_padding = true;
+};
+
+class ARROW_EXPORT ZeroFillOptions : public FunctionOptions {
+ public:
+  explicit ZeroFillOptions(int64_t width, std::string padding = "0");
+  ZeroFillOptions();
+  static constexpr const char kTypeName[] = "ZeroFillOptions";
+
+  /// The desired string length.
+  int64_t width;
+  /// What to pad the string with. Should be one codepoint (Unicode).
+  std::string padding;
+};
+
+class ARROW_EXPORT TrimOptions : public FunctionOptions {
+ public:
+  explicit TrimOptions(std::string characters);
+  TrimOptions();
+  static constexpr const char kTypeName[] = "TrimOptions";
+
+  /// The individual characters to be trimmed from the string.
+  std::string characters;
+};
+
+class ARROW_EXPORT SliceOptions : public FunctionOptions {
+ public:
+  explicit SliceOptions(int64_t start, int64_t stop = std::numeric_limits<int64_t>::max(),
+                        int64_t step = 1);
+  SliceOptions();
+  static constexpr const char kTypeName[] = "SliceOptions";
+  int64_t start, stop, step;
+};
+
+class ARROW_EXPORT ListSliceOptions : public FunctionOptions {
+ public:
+  explicit ListSliceOptions(int64_t start, std::optional<int64_t> stop = std::nullopt,
+                            int64_t step = 1,
+                            std::optional<bool> return_fixed_size_list = std::nullopt);
+  ListSliceOptions();
+  static constexpr const char kTypeName[] = "ListSliceOptions";
+  /// The start of list slicing.
+  int64_t start;
+  /// Optional stop of list slicing. If not set, then slice to end. (NotImplemented)
+  std::optional<int64_t> stop;
+  /// Slicing step
+  int64_t step;
+  // Whether to return a FixedSizeListArray. If true _and_ stop is after
+  // a list element's length, nulls will be appended to create the requested slice size.
+  // Default of `nullopt` will return whatever type it got in.
+  std::optional<bool> return_fixed_size_list;
+};
+
+class ARROW_EXPORT NullOptions : public FunctionOptions {
+ public:
+  explicit NullOptions(bool nan_is_null = false);
+  static constexpr const char kTypeName[] = "NullOptions";
+  static NullOptions Defaults() { return NullOptions{}; }
+
+  bool nan_is_null;
+};
+
+enum CompareOperator : int8_t {
+  EQUAL,
+  NOT_EQUAL,
+  GREATER,
+  GREATER_EQUAL,
+  LESS,
+  LESS_EQUAL,
+};
+
+struct ARROW_EXPORT CompareOptions {
+  explicit CompareOptions(CompareOperator op) : op(op) {}
+  CompareOptions() : CompareOptions(CompareOperator::EQUAL) {}
+  enum CompareOperator op;
+};
+
+class ARROW_EXPORT MakeStructOptions : public FunctionOptions {
+ public:
+  MakeStructOptions(std::vector<std::string> n, std::vector<bool> r,
+                    std::vector<std::shared_ptr<const KeyValueMetadata>> m);
+  explicit MakeStructOptions(std::vector<std::string> n);
+  MakeStructOptions();
+  static constexpr const char kTypeName[] = "MakeStructOptions";
+
+  /// Names for wrapped columns
+  std::vector<std::string> field_names;
+
+  /// Nullability bits for wrapped columns
+  std::vector<bool> field_nullability;
+
+  /// Metadata attached to wrapped columns
+  std::vector<std::shared_ptr<const KeyValueMetadata>> field_metadata;
+};
+
+struct ARROW_EXPORT DayOfWeekOptions : public FunctionOptions {
+ public:
+  explicit DayOfWeekOptions(bool count_from_zero = true, uint32_t week_start = 1);
+  static constexpr const char kTypeName[] = "DayOfWeekOptions";
+  static DayOfWeekOptions Defaults() { return DayOfWeekOptions(); }
+
+  /// Number days from 0 if true and from 1 if false
+  bool count_from_zero;
+  /// What day does the week start with (Monday=1, Sunday=7).
+  /// The numbering is unaffected by the count_from_zero parameter.
+  uint32_t week_start;
+};
+
+/// Used to control timestamp timezone conversion and handling ambiguous/nonexistent
+/// times.
+struct ARROW_EXPORT AssumeTimezoneOptions : public FunctionOptions {
+ public:
+  /// \brief How to interpret ambiguous local times that can be interpreted as
+  /// multiple instants (normally two) due to DST shifts.
+  ///
+  /// AMBIGUOUS_EARLIEST emits the earliest instant amongst possible interpretations.
+  /// AMBIGUOUS_LATEST emits the latest instant amongst possible interpretations.
+  enum Ambiguous { AMBIGUOUS_RAISE, AMBIGUOUS_EARLIEST, AMBIGUOUS_LATEST };
+
+  /// \brief How to handle local times that do not exist due to DST shifts.
+  ///
+  /// NONEXISTENT_EARLIEST emits the instant "just before" the DST shift instant
+  /// in the given timestamp precision (for example, for a nanoseconds precision
+  /// timestamp, this is one nanosecond before the DST shift instant).
+  /// NONEXISTENT_LATEST emits the DST shift instant.
+  enum Nonexistent { NONEXISTENT_RAISE, NONEXISTENT_EARLIEST, NONEXISTENT_LATEST };
+
+  explicit AssumeTimezoneOptions(std::string timezone,
+                                 Ambiguous ambiguous = AMBIGUOUS_RAISE,
+                                 Nonexistent nonexistent = NONEXISTENT_RAISE);
+  AssumeTimezoneOptions();
+  static constexpr const char kTypeName[] = "AssumeTimezoneOptions";
+
+  /// Timezone to convert timestamps from
+  std::string timezone;
+
+  /// How to interpret ambiguous local times (due to DST shifts)
+  Ambiguous ambiguous;
+  /// How to interpret nonexistent local times (due to DST shifts)
+  Nonexistent nonexistent;
+};
+
+struct ARROW_EXPORT WeekOptions : public FunctionOptions {
+ public:
+  explicit WeekOptions(bool week_starts_monday = true, bool count_from_zero = false,
+                       bool first_week_is_fully_in_year = false);
+  static constexpr const char kTypeName[] = "WeekOptions";
+  static WeekOptions Defaults() { return WeekOptions{}; }
+  static WeekOptions ISODefaults() {
+    return WeekOptions{/*week_starts_monday*/ true,
+                       /*count_from_zero=*/false,
+                       /*first_week_is_fully_in_year=*/false};
+  }
+  static WeekOptions USDefaults() {
+    return WeekOptions{/*week_starts_monday*/ false,
+                       /*count_from_zero=*/false,
+                       /*first_week_is_fully_in_year=*/false};
+  }
+
+  /// What day does the week start with (Monday=true, Sunday=false)
+  bool week_starts_monday;
+  /// Dates from current year that fall into last ISO week of the previous year return
+  /// 0 if true and 52 or 53 if false.
+  bool count_from_zero;
+  /// Must the first week be fully in January (true), or is a week that begins on
+  /// December 29, 30, or 31 considered to be the first week of the new year (false)?
+  bool first_week_is_fully_in_year;
+};
+
+struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions {
+ public:
+  enum Form { NFC, NFKC, NFD, NFKD };
+
+  explicit Utf8NormalizeOptions(Form form = NFC);
+  static Utf8NormalizeOptions Defaults() { return Utf8NormalizeOptions(); }
+  static constexpr const char kTypeName[] = "Utf8NormalizeOptions";
+
+  /// The Unicode normalization form to apply
+  Form form;
+};
+
+class ARROW_EXPORT RandomOptions : public FunctionOptions {
+ public:
+  enum Initializer { SystemRandom, Seed };
+
+  static RandomOptions FromSystemRandom() { return RandomOptions{SystemRandom, 0}; }
+  static RandomOptions FromSeed(uint64_t seed) { return RandomOptions{Seed, seed}; }
+
+  RandomOptions(Initializer initializer, uint64_t seed);
+  RandomOptions();
+  static constexpr const char kTypeName[] = "RandomOptions";
+  static RandomOptions Defaults() { return RandomOptions(); }
+
+  /// The type of initialization for random number generation - system or provided seed.
+  Initializer initializer;
+  /// The seed value used to initialize the random number generation.
+  uint64_t seed;
+};
+
+/// Options for map_lookup function
+class ARROW_EXPORT MapLookupOptions : public FunctionOptions {
+ public:
+  enum Occurrence {
+    /// Return the first matching value
+    FIRST,
+    /// Return the last matching value
+    LAST,
+    /// Return all matching values
+    ALL
+  };
+
+  explicit MapLookupOptions(std::shared_ptr<Scalar> query_key, Occurrence occurrence);
+  MapLookupOptions();
+
+  constexpr static const char kTypeName[] = "MapLookupOptions";
+
+  /// The key to lookup in the map
+  std::shared_ptr<Scalar> query_key;
+
+  /// Whether to return the first, last, or all matching values
+  Occurrence occurrence;
+};
+
+/// @}
+
+/// \brief Get the absolute value of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value transformed
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise absolute value
+ARROW_EXPORT
+Result<Datum> AbsoluteValue(const Datum& arg,
+                            ArithmeticOptions options = ArithmeticOptions(),
+                            ExecContext* ctx = NULLPTR);
+
+/// \brief Add two values together. Array values must be the same length. If
+/// either addend is null the result will be null.
+///
+/// \param[in] left the first addend
+/// \param[in] right the second addend
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sum
+ARROW_EXPORT
+Result<Datum> Add(const Datum& left, const Datum& right,
+                  ArithmeticOptions options = ArithmeticOptions(),
+                  ExecContext* ctx = NULLPTR);
+
+/// \brief Subtract two values. Array values must be the same length. If the
+/// minuend or subtrahend is null the result will be null.
+///
+/// \param[in] left the value subtracted from (minuend)
+/// \param[in] right the value by which the minuend is reduced (subtrahend)
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise difference
+ARROW_EXPORT
+Result<Datum> Subtract(const Datum& left, const Datum& right,
+                       ArithmeticOptions options = ArithmeticOptions(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Multiply two values. Array values must be the same length. If either
+/// factor is null the result will be null.
+///
+/// \param[in] left the first factor
+/// \param[in] right the second factor
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise product
+ARROW_EXPORT
+Result<Datum> Multiply(const Datum& left, const Datum& right,
+                       ArithmeticOptions options = ArithmeticOptions(),
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Divide two values. Array values must be the same length. If either
+/// argument is null the result will be null. For integer types, if there is
+/// a zero divisor, an error will be raised.
+///
+/// \param[in] left the dividend
+/// \param[in] right the divisor
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise quotient
+ARROW_EXPORT
+Result<Datum> Divide(const Datum& left, const Datum& right,
+                     ArithmeticOptions options = ArithmeticOptions(),
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief Negate values.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value negated
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise negation
+ARROW_EXPORT
+Result<Datum> Negate(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief Raise the values of base array to the power of the exponent array values.
+/// Array values must be the same length. If either base or exponent is null the result
+/// will be null.
+///
+/// \param[in] left the base
+/// \param[in] right the exponent
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise base value raised to the power of exponent
+ARROW_EXPORT
+Result<Datum> Power(const Datum& left, const Datum& right,
+                    ArithmeticOptions options = ArithmeticOptions(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Raise Euler's number to the power of specified exponent, element-wise.
+/// If the exponent value is null the result will be null.
+///
+/// \param[in] arg the exponent
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise Euler's number raised to the power of exponent
+ARROW_EXPORT
+Result<Datum> Exp(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief More accurately calculate `exp(arg) - 1` for values close to zero.
+/// If the exponent value is null the result will be null.
+///
+/// This function is more accurate than calculating `exp(value) - 1` directly for values
+/// close to zero.
+///
+/// \param[in] arg the exponent
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise Euler's number raised to the power of exponent minus 1
+ARROW_EXPORT
+Result<Datum> Expm1(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Left shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted left by the right value
+ARROW_EXPORT
+Result<Datum> ShiftLeft(const Datum& left, const Datum& right,
+                        ArithmeticOptions options = ArithmeticOptions(),
+                        ExecContext* ctx = NULLPTR);
+
+/// \brief Right shift the left array by the right array. Array values must be the
+/// same length. If either operand is null, the result will be null. Performs a
+/// logical shift for unsigned values, and an arithmetic shift for signed values.
+///
+/// \param[in] left the value to shift
+/// \param[in] right the value to shift by
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise left value shifted right by the right value
+ARROW_EXPORT
+Result<Datum> ShiftRight(const Datum& left, const Datum& right,
+                         ArithmeticOptions options = ArithmeticOptions(),
+                         ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the sine of the array values.
+/// \param[in] arg The values to compute the sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise sine of the values
+ARROW_EXPORT
+Result<Datum> Sin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                  ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cosine of the array values.
+/// \param[in] arg The values to compute the cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise cosine of the values
+ARROW_EXPORT
+Result<Datum> Cos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                  ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse sine (arcsine) of the array values.
+/// \param[in] arg The values to compute the inverse sine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse sine of the values
+ARROW_EXPORT
+Result<Datum> Asin(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse cosine (arccosine) of the array values.
+/// \param[in] arg The values to compute the inverse cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse cosine of the values
+ARROW_EXPORT
+Result<Datum> Acos(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the tangent of the array values.
+/// \param[in] arg The values to compute the tangent for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise tangent of the values
+ARROW_EXPORT
+Result<Datum> Tan(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                  ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of the array values.
+/// \param[in] arg The values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse tangent (arctangent) of y/x, using the
+/// argument signs to determine the correct quadrant.
+/// \param[in] y The y-values to compute the inverse tangent for.
+/// \param[in] x The x-values to compute the inverse tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse tangent of the values
+ARROW_EXPORT
+Result<Datum> Atan2(const Datum& y, const Datum& x, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the hyperbolic sine of the array values.
+/// \param[in] arg The values to compute the hyperbolic sine for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise hyperbolic sine of the values
+ARROW_EXPORT
+Result<Datum> Sinh(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the hyperbolic cosine of the array values.
+/// \param[in] arg The values to compute the hyperbolic cosine for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise hyperbolic cosine of the values
+ARROW_EXPORT
+Result<Datum> Cosh(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the hyperbolic tangent of the array values.
+/// \param[in] arg The values to compute the hyperbolic tangent for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise hyperbolic tangent of the values
+ARROW_EXPORT
+Result<Datum> Tanh(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse hyperbolic sine of the array values.
+/// \param[in] arg The values to compute the inverse hyperbolic sine for.
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse hyperbolic sine of the values
+ARROW_EXPORT
+Result<Datum> Asinh(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse hyperbolic cosine of the array values.
+/// \param[in] arg The values to compute the inverse hyperbolic cosine for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse hyperbolic cosine of the values
+ARROW_EXPORT
+Result<Datum> Acosh(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the inverse hyperbolic tangent of the array values.
+/// \param[in] arg The values to compute the inverse hyperbolic tangent for.
+/// \param[in] options arithmetic options (enable/disable overflow checking), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise inverse hyperbolic tangent of the values
+ARROW_EXPORT
+Result<Datum> Atanh(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Ln(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                 ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 10 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 10
+ARROW_EXPORT
+Result<Datum> Log10(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log base 2 of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log base 2
+ARROW_EXPORT
+Result<Datum> Log2(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Get the natural log of (1 + value).
+///
+/// If argument is null the result will be null.
+/// This function may be more accurate than Log(1 + value) for values close to zero.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise natural log
+ARROW_EXPORT
+Result<Datum> Log1p(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Get the log of a value to the given base.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the logarithm for.
+/// \param[in] base The given base.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise log to the given base
+ARROW_EXPORT
+Result<Datum> Logb(const Datum& arg, const Datum& base,
+                   ArithmeticOptions options = ArithmeticOptions(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Get the square-root of a value.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg The values to compute the square-root for.
+/// \param[in] options arithmetic options (overflow handling), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the elementwise square-root
+ARROW_EXPORT
+Result<Datum> Sqrt(const Datum& arg, ArithmeticOptions options = ArithmeticOptions(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Round to the nearest integer less than or equal in magnitude to the
+/// argument.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Floor(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Round to the nearest integer greater than or equal in magnitude to the
+/// argument.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] ctx the function execution context, optional
+/// \return the rounded value
+ARROW_EXPORT
+Result<Datum> Ceil(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Get the integral part without fractional digits.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value to truncate
+/// \param[in] ctx the function execution context, optional
+/// \return the truncated value
+ARROW_EXPORT
+Result<Datum> Trunc(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Find the element-wise maximum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise maximum
+ARROW_EXPORT
+Result<Datum> MaxElementWise(
+    const std::vector<Datum>& args,
+    ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Find the element-wise minimum of any number of arrays or scalars.
+/// Array values must be the same length.
+///
+/// \param[in] args arrays or scalars to operate on.
+/// \param[in] options options for handling nulls, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise minimum
+ARROW_EXPORT
+Result<Datum> MinElementWise(
+    const std::vector<Datum>& args,
+    ElementWiseAggregateOptions options = ElementWiseAggregateOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Get the sign of a value. Array values can be of arbitrary length. If argument
+/// is null the result will be null.
+///
+/// \param[in] arg the value to extract sign from
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise sign function
+ARROW_EXPORT
+Result<Datum> Sign(const Datum& arg, ExecContext* ctx = NULLPTR);
+
+/// \brief Round a value to a given precision.
+///
+/// If arg is null the result will be null.
+///
+/// \param[in] arg the value to be rounded
+/// \param[in] options rounding options (rounding mode and number of digits), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise rounded value
+ARROW_EXPORT
+Result<Datum> Round(const Datum& arg, RoundOptions options = RoundOptions::Defaults(),
+                    ExecContext* ctx = NULLPTR);
+
+/// \brief Round a value to a given precision.
+///
+/// If arg1 is null the result will be null.
+/// If arg2 is null then the result will be null. If arg2 is negative, then the rounding
+/// place will be shifted to the left (thus -1 would correspond to rounding to the nearest
+/// ten).  If positive, the rounding place will shift to the right (and +1 would
+/// correspond to rounding to the nearest tenth).
+///
+/// \param[in] arg1 the value to be rounded
+/// \param[in] arg2 the number of significant digits to round to
+/// \param[in] options rounding options, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise rounded value
+ARROW_EXPORT
+Result<Datum> RoundBinary(const Datum& arg1, const Datum& arg2,
+                          RoundBinaryOptions options = RoundBinaryOptions::Defaults(),
+                          ExecContext* ctx = NULLPTR);
+
+/// \brief Round a value to a given multiple.
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the value to round
+/// \param[in] options rounding options (rounding mode and multiple), optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise rounded value
+ARROW_EXPORT
+Result<Datum> RoundToMultiple(
+    const Datum& arg, RoundToMultipleOptions options = RoundToMultipleOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Ceil a temporal value to a given frequency
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the temporal value to ceil
+/// \param[in] options temporal rounding options, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise rounded value
+///
+/// \since 7.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> CeilTemporal(
+    const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Floor a temporal value to a given frequency
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the temporal value to floor
+/// \param[in] options temporal rounding options, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise rounded value
+///
+/// \since 7.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> FloorTemporal(
+    const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Round a temporal value to a given frequency
+///
+/// If argument is null the result will be null.
+///
+/// \param[in] arg the temporal value to round
+/// \param[in] options temporal rounding options, optional
+/// \param[in] ctx the function execution context, optional
+/// \return the element-wise rounded value
+///
+/// \since 7.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> RoundTemporal(
+    const Datum& arg, RoundTemporalOptions options = RoundTemporalOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Invert the values of a boolean datum
+/// \param[in] value datum to invert
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Invert(const Datum& value, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND of two boolean datums which always propagates nulls
+/// (null and false is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> And(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND of two boolean datums with a Kleene truth table
+/// (null and false is false).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneAnd(const Datum& left, const Datum& right,
+                        ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise OR of two boolean datums which always propagates nulls
+/// (null and true is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Or(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise OR of two boolean datums with a Kleene truth table
+/// (null or true is true).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneOr(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise XOR of two boolean datums
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Xor(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND NOT of two boolean datums which always propagates nulls
+/// (null and not true is null).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> AndNot(const Datum& left, const Datum& right, ExecContext* ctx = NULLPTR);
+
+/// \brief Element-wise AND NOT of two boolean datums with a Kleene truth table
+/// (false and not null is false, null and not true is false).
+///
+/// \param[in] left left operand
+/// \param[in] right right operand
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> KleeneAndNot(const Datum& left, const Datum& right,
+                           ExecContext* ctx = NULLPTR);
+
+/// \brief IsIn returns true for each element of `values` that is contained in
+/// `value_set`
+///
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+///
+/// \param[in] values array-like input to look up in value_set
+/// \param[in] options SetLookupOptions
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsIn(const Datum& values, const SetLookupOptions& options,
+                   ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
+Result<Datum> IsIn(const Datum& values, const Datum& value_set,
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief IndexIn examines each slot in the values against a value_set array.
+/// If the value is not found in value_set, null will be output.
+/// If found, the index of occurrence within value_set (ignoring duplicates)
+/// will be output.
+///
+/// For example given values = [99, 42, 3, null] and
+/// value_set = [3, 3, 99], the output will be = [2, null, 0, null]
+///
+/// Behaviour of nulls is governed by SetLookupOptions::skip_nulls.
+///
+/// \param[in] values array-like input
+/// \param[in] options SetLookupOptions
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IndexIn(const Datum& values, const SetLookupOptions& options,
+                      ExecContext* ctx = NULLPTR);
+ARROW_EXPORT
+Result<Datum> IndexIn(const Datum& values, const Datum& value_set,
+                      ExecContext* ctx = NULLPTR);
+
+/// \brief IsValid returns true for each element of `values` that is not null,
+/// false otherwise
+///
+/// \param[in] values input to examine for validity
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsValid(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief IsNull returns true for each element of `values` that is null,
+/// false otherwise
+///
+/// \param[in] values input to examine for nullity
+/// \param[in] options NullOptions
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsNull(const Datum& values, NullOptions options = NullOptions::Defaults(),
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief IsNan returns true for each element of `values` that is NaN,
+/// false otherwise
+///
+/// \param[in] values input to look for NaN
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 3.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsNan(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief IfElse returns elements chosen from `left` or `right`
+/// depending on `cond`. `null` values in `cond` will be promoted to the result
+///
+/// \param[in] cond `Boolean` condition Scalar/ Array
+/// \param[in] left Scalar/ Array
+/// \param[in] right Scalar/ Array
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IfElse(const Datum& cond, const Datum& left, const Datum& right,
+                     ExecContext* ctx = NULLPTR);
+
+/// \brief CaseWhen behaves like a switch/case or if-else if-else statement: for
+/// each row, select the first value for which the corresponding condition is
+/// true, or (if given) select the 'else' value, else emit null. Note that a
+/// null condition is the same as false.
+///
+/// \param[in] cond Conditions (Boolean)
+/// \param[in] cases Values (any type), along with an optional 'else' value.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> CaseWhen(const Datum& cond, const std::vector<Datum>& cases,
+                       ExecContext* ctx = NULLPTR);
+
+/// \brief Year returns year for each element of `values`
+///
+/// \param[in] values input to extract year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Year(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief IsLeapYear returns if a year is a leap year for each element of `values`
+///
+/// \param[in] values input to extract leap year indicator from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> IsLeapYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Month returns month for each element of `values`.
+/// Month is encoded as January=1, December=12
+///
+/// \param[in] values input to extract month from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Month(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Day returns day number for each element of `values`
+///
+/// \param[in] values input to extract day from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Day(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief YearMonthDay returns a struct containing the Year, Month and Day value for
+/// each element of `values`.
+///
+/// \param[in] values input to extract (year, month, day) struct from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 7.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> YearMonthDay(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfWeek returns number of the day of the week value for each element of
+/// `values`.
+///
+/// By default week starts on Monday denoted by 0 and ends on Sunday denoted
+/// by 6. Start day of the week (Monday=1, Sunday=7) and numbering base (0 or 1) can be
+/// set using DayOfWeekOptions
+///
+/// \param[in] values input to extract number of the day of the week from
+/// \param[in] options for setting start of the week and day numbering
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfWeek(const Datum& values,
+                                     DayOfWeekOptions options = DayOfWeekOptions(),
+                                     ExecContext* ctx = NULLPTR);
+
+/// \brief DayOfYear returns number of day of the year for each element of `values`.
+/// January 1st maps to day number 1, February 1st to 32, etc.
+///
+/// \param[in] values input to extract number of day of the year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayOfYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOYear returns ISO year number for each element of `values`.
+/// First week of an ISO year has the majority (4 or more) of its days in January.
+///
+/// \param[in] values input to extract ISO year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ISOYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief USYear returns US epidemiological year number for each element of `values`.
+/// First week of US epidemiological year has the majority (4 or more) of it's
+/// days in January. Last week of US epidemiological year has the year's last
+/// Wednesday in it. US epidemiological week starts on Sunday.
+///
+/// \param[in] values input to extract US epidemiological year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> USYear(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief ISOWeek returns ISO week of year number for each element of `values`.
+/// First ISO week has the majority (4 or more) of its days in January.
+/// ISO week starts on Monday. Year can have 52 or 53 weeks.
+/// Week numbering can start with 1.
+///
+/// \param[in] values input to extract ISO week of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOWeek(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief USWeek returns US week of year number for each element of `values`.
+/// First US week has the majority (4 or more) of its days in January.
+/// US week starts on Sunday. Year can have 52 or 53 weeks.
+/// Week numbering starts with 1.
+///
+/// \param[in] values input to extract US week of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 6.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> USWeek(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Week returns week of year number for each element of `values`.
+/// First ISO week has the majority (4 or more) of its days in January.
+/// Year can have 52 or 53 weeks. Week numbering can start with 0 or 1
+/// depending on DayOfWeekOptions.count_from_zero.
+///
+/// \param[in] values input to extract week of year from
+/// \param[in] options for setting numbering start
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 6.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Week(const Datum& values, WeekOptions options = WeekOptions(),
+                                ExecContext* ctx = NULLPTR);
+
+/// \brief ISOCalendar returns a (ISO year, ISO week, ISO day of week) struct for
+/// each element of `values`.
+/// ISO week starts on Monday denoted by 1 and ends on Sunday denoted by 7.
+///
+/// \param[in] values input to ISO calendar struct from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> ISOCalendar(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Quarter returns the quarter of year number for each element of `values`
+/// First quarter maps to 1 and fourth quarter maps to 4.
+///
+/// \param[in] values input to extract quarter of year from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Quarter(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Hour returns hour value for each element of `values`
+///
+/// \param[in] values input to extract hour from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Hour(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Minute returns minutes value for each element of `values`
+///
+/// \param[in] values input to extract minutes from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Minute(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Second returns seconds value for each element of `values`
+///
+/// \param[in] values input to extract seconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Second(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Millisecond returns number of milliseconds since the last full second
+/// for each element of `values`
+///
+/// \param[in] values input to extract milliseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Millisecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Microsecond returns number of microseconds since the last full millisecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract microseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Microsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Nanosecond returns number of nanoseconds since the last full millisecond
+/// for each element of `values`
+///
+/// \param[in] values input to extract nanoseconds from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Nanosecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Subsecond returns the fraction of second elapsed since last full second
+/// as a float for each element of `values`
+///
+/// \param[in] values input to extract subsecond from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Format timestamps according to a format string
+///
+/// Return formatted time strings according to the format string
+/// `StrftimeOptions::format` and to the locale specifier `Strftime::locale`.
+///
+/// \param[in] values input timestamps
+/// \param[in] options for setting format string and locale
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 6.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Strftime(const Datum& values, StrftimeOptions options,
+                                    ExecContext* ctx = NULLPTR);
+
+/// \brief Parse timestamps according to a format string
+///
+/// Return parsed timestamps according to the format string
+/// `StrptimeOptions::format` at time resolution `Strftime::unit`. Parse errors are
+/// raised depending on the `Strftime::error_is_null` setting.
+///
+/// \param[in] values input strings
+/// \param[in] options for setting format string, unit and error_is_null
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> Strptime(const Datum& values, StrptimeOptions options,
+                                    ExecContext* ctx = NULLPTR);
+
+/// \brief Converts timestamps from local timestamp without a timezone to a timestamp with
+/// timezone, interpreting the local timestamp as being in the specified timezone for each
+/// element of `values`
+///
+/// \param[in] values input to convert
+/// \param[in] options for setting source timezone, exception and ambiguous timestamp
+/// handling.
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 6.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> AssumeTimezone(const Datum& values,
+                                          AssumeTimezoneOptions options,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief IsDaylightSavings extracts if currently observing daylight savings for each
+/// element of `values`
+///
+/// \param[in] values input to extract daylight savings indicator from
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> IsDaylightSavings(const Datum& values,
+                                             ExecContext* ctx = NULLPTR);
+
+/// \brief LocalTimestamp converts timestamp to timezone naive local timestamp
+///
+/// \param[in] values input to convert to local time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 12.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> LocalTimestamp(const Datum& values,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Years Between finds the number of years between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> YearsBetween(const Datum& left, const Datum& right,
+                                        ExecContext* ctx = NULLPTR);
+
+/// \brief Quarters Between finds the number of quarters between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> QuartersBetween(const Datum& left, const Datum& right,
+                                           ExecContext* ctx = NULLPTR);
+
+/// \brief Months Between finds the number of month between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MonthsBetween(const Datum& left, const Datum& right,
+                                         ExecContext* ctx = NULLPTR);
+
+/// \brief Weeks Between finds the number of weeks between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> WeeksBetween(const Datum& left, const Datum& right,
+                                        ExecContext* ctx = NULLPTR);
+
+/// \brief Month Day Nano Between finds the number of months, days, and nanoseconds
+/// between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MonthDayNanoBetween(const Datum& left, const Datum& right,
+                                               ExecContext* ctx = NULLPTR);
+
+/// \brief DayTime Between finds the number of days and milliseconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DayTimeBetween(const Datum& left, const Datum& right,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Days Between finds the number of days between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> DaysBetween(const Datum& left, const Datum& right,
+                                       ExecContext* ctx = NULLPTR);
+
+/// \brief Hours Between finds the number of hours between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> HoursBetween(const Datum& left, const Datum& right,
+                                        ExecContext* ctx = NULLPTR);
+
+/// \brief Minutes Between finds the number of minutes between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MinutesBetween(const Datum& left, const Datum& right,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Seconds Between finds the number of hours between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> SecondsBetween(const Datum& left, const Datum& right,
+                                          ExecContext* ctx = NULLPTR);
+
+/// \brief Milliseconds Between finds the number of milliseconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MillisecondsBetween(const Datum& left, const Datum& right,
+                                               ExecContext* ctx = NULLPTR);
+
+/// \brief Microseconds Between finds the number of microseconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MicrosecondsBetween(const Datum& left, const Datum& right,
+                                               ExecContext* ctx = NULLPTR);
+
+/// \brief Nanoseconds Between finds the number of nanoseconds between two values
+///
+/// \param[in] left input treated as the start time
+/// \param[in] right input treated as the end time
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> NanosecondsBetween(const Datum& left, const Datum& right,
+                                              ExecContext* ctx = NULLPTR);
+
+/// \brief Finds either the FIRST, LAST, or ALL items with a key that matches the given
+/// query key in a map.
+///
+/// Returns an array of items for FIRST and LAST, and an array of list of items for ALL.
+///
+/// \param[in] map to look in
+/// \param[in] options to pass a query key and choose which matching keys to return
+/// (FIRST, LAST or ALL)
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 8.0.0
+/// \note API not yet finalized
+ARROW_EXPORT Result<Datum> MapLookup(const Datum& map, MapLookupOptions options,
+                                     ExecContext* ctx = NULLPTR);
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/api_vector.h b/pyarrow/include/arrow/compute/api_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..159a787641ee5216ac2f19ec304d3d8e25303e39
--- /dev/null
+++ b/pyarrow/include/arrow/compute/api_vector.h
@@ -0,0 +1,835 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "arrow/compute/function_options.h"
+#include "arrow/compute/ordering.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace compute {
+
+class ExecContext;
+
+/// \addtogroup compute-concrete-options
+/// @{
+
+class ARROW_EXPORT FilterOptions : public FunctionOptions {
+ public:
+  /// Configure the action taken when a slot of the selection mask is null
+  enum NullSelectionBehavior {
+    /// The corresponding filtered value will be removed in the output.
+    DROP,
+    /// The corresponding filtered value will be null in the output.
+    EMIT_NULL,
+  };
+
+  explicit FilterOptions(NullSelectionBehavior null_selection = DROP);
+  static constexpr const char kTypeName[] = "FilterOptions";
+  static FilterOptions Defaults() { return FilterOptions(); }
+
+  NullSelectionBehavior null_selection_behavior = DROP;
+};
+
+class ARROW_EXPORT TakeOptions : public FunctionOptions {
+ public:
+  explicit TakeOptions(bool boundscheck = true);
+  static constexpr const char kTypeName[] = "TakeOptions";
+  static TakeOptions BoundsCheck() { return TakeOptions(true); }
+  static TakeOptions NoBoundsCheck() { return TakeOptions(false); }
+  static TakeOptions Defaults() { return BoundsCheck(); }
+
+  bool boundscheck = true;
+};
+
+/// \brief Options for the dictionary encode function
+class ARROW_EXPORT DictionaryEncodeOptions : public FunctionOptions {
+ public:
+  /// Configure how null values will be encoded
+  enum NullEncodingBehavior {
+    /// The null value will be added to the dictionary with a proper index.
+    ENCODE,
+    /// The null value will be masked in the indices array.
+    MASK
+  };
+
+  explicit DictionaryEncodeOptions(NullEncodingBehavior null_encoding = MASK);
+  static constexpr const char kTypeName[] = "DictionaryEncodeOptions";
+  static DictionaryEncodeOptions Defaults() { return DictionaryEncodeOptions(); }
+
+  NullEncodingBehavior null_encoding_behavior = MASK;
+};
+
+/// \brief Options for the run-end encode function
+class ARROW_EXPORT RunEndEncodeOptions : public FunctionOptions {
+ public:
+  explicit RunEndEncodeOptions(std::shared_ptr<DataType> run_end_type = int32());
+  static constexpr const char kTypeName[] = "RunEndEncodeOptions";
+  static RunEndEncodeOptions Defaults() { return RunEndEncodeOptions(); }
+
+  std::shared_ptr<DataType> run_end_type;
+};
+
+class ARROW_EXPORT ArraySortOptions : public FunctionOptions {
+ public:
+  explicit ArraySortOptions(SortOrder order = SortOrder::Ascending,
+                            NullPlacement null_placement = NullPlacement::AtEnd);
+  static constexpr const char kTypeName[] = "ArraySortOptions";
+  static ArraySortOptions Defaults() { return ArraySortOptions(); }
+
+  /// Sorting order
+  SortOrder order;
+  /// Whether nulls and NaNs are placed at the start or at the end
+  NullPlacement null_placement;
+};
+
+class ARROW_EXPORT SortOptions : public FunctionOptions {
+ public:
+  explicit SortOptions(std::vector<SortKey> sort_keys = {},
+                       NullPlacement null_placement = NullPlacement::AtEnd);
+  explicit SortOptions(const Ordering& ordering);
+  static constexpr const char kTypeName[] = "SortOptions";
+  static SortOptions Defaults() { return SortOptions(); }
+  /// Convenience constructor to create an ordering from SortOptions
+  ///
+  /// Note: Both classes contain the exact same information.  However,
+  /// sort_options should only be used in a "function options" context while Ordering
+  /// is used more generally.
+  Ordering AsOrdering() && { return Ordering(std::move(sort_keys), null_placement); }
+  Ordering AsOrdering() const& { return Ordering(sort_keys, null_placement); }
+
+  /// Column key(s) to order by and how to order by these sort keys.
+  std::vector<SortKey> sort_keys;
+  /// Whether nulls and NaNs are placed at the start or at the end
+  NullPlacement null_placement;
+};
+
+/// \brief SelectK options
+class ARROW_EXPORT SelectKOptions : public FunctionOptions {
+ public:
+  explicit SelectKOptions(int64_t k = -1, std::vector<SortKey> sort_keys = {});
+  static constexpr const char kTypeName[] = "SelectKOptions";
+  static SelectKOptions Defaults() { return SelectKOptions(); }
+
+  static SelectKOptions TopKDefault(int64_t k, std::vector<std::string> key_names = {}) {
+    std::vector<SortKey> keys;
+    for (const auto& name : key_names) {
+      keys.emplace_back(SortKey(name, SortOrder::Descending));
+    }
+    if (key_names.empty()) {
+      keys.emplace_back(SortKey("not-used", SortOrder::Descending));
+    }
+    return SelectKOptions{k, keys};
+  }
+  static SelectKOptions BottomKDefault(int64_t k,
+                                       std::vector<std::string> key_names = {}) {
+    std::vector<SortKey> keys;
+    for (const auto& name : key_names) {
+      keys.emplace_back(SortKey(name, SortOrder::Ascending));
+    }
+    if (key_names.empty()) {
+      keys.emplace_back(SortKey("not-used", SortOrder::Ascending));
+    }
+    return SelectKOptions{k, keys};
+  }
+
+  /// The number of `k` elements to keep.
+  int64_t k;
+  /// Column key(s) to order by and how to order by these sort keys.
+  std::vector<SortKey> sort_keys;
+};
+
+/// \brief Rank options
+class ARROW_EXPORT RankOptions : public FunctionOptions {
+ public:
+  /// Configure how ties between equal values are handled
+  enum Tiebreaker {
+    /// Ties get the smallest possible rank in sorted order.
+    Min,
+    /// Ties get the largest possible rank in sorted order.
+    Max,
+    /// Ranks are assigned in order of when ties appear in the input.
+    /// This ensures the ranks are a stable permutation of the input.
+    First,
+    /// The ranks span a dense [1, M] interval where M is the number
+    /// of distinct values in the input.
+    Dense
+  };
+
+  explicit RankOptions(std::vector<SortKey> sort_keys = {},
+                       NullPlacement null_placement = NullPlacement::AtEnd,
+                       Tiebreaker tiebreaker = RankOptions::First);
+  /// Convenience constructor for array inputs
+  explicit RankOptions(SortOrder order,
+                       NullPlacement null_placement = NullPlacement::AtEnd,
+                       Tiebreaker tiebreaker = RankOptions::First)
+      : RankOptions({SortKey("", order)}, null_placement, tiebreaker) {}
+
+  static constexpr const char kTypeName[] = "RankOptions";
+  static RankOptions Defaults() { return RankOptions(); }
+
+  /// Column key(s) to order by and how to order by these sort keys.
+  std::vector<SortKey> sort_keys;
+  /// Whether nulls and NaNs are placed at the start or at the end
+  NullPlacement null_placement;
+  /// Tiebreaker for dealing with equal values in ranks
+  Tiebreaker tiebreaker;
+};
+
+/// \brief Quantile rank options
+class ARROW_EXPORT RankQuantileOptions : public FunctionOptions {
+ public:
+  explicit RankQuantileOptions(std::vector<SortKey> sort_keys = {},
+                               NullPlacement null_placement = NullPlacement::AtEnd);
+  /// Convenience constructor for array inputs
+  explicit RankQuantileOptions(SortOrder order,
+                               NullPlacement null_placement = NullPlacement::AtEnd)
+      : RankQuantileOptions({SortKey("", order)}, null_placement) {}
+
+  static constexpr const char kTypeName[] = "RankQuantileOptions";
+  static RankQuantileOptions Defaults() { return RankQuantileOptions(); }
+
+  /// Column key(s) to order by and how to order by these sort keys.
+  std::vector<SortKey> sort_keys;
+  /// Whether nulls and NaNs are placed at the start or at the end
+  NullPlacement null_placement;
+};
+
+/// \brief Partitioning options for NthToIndices
+class ARROW_EXPORT PartitionNthOptions : public FunctionOptions {
+ public:
+  explicit PartitionNthOptions(int64_t pivot,
+                               NullPlacement null_placement = NullPlacement::AtEnd);
+  PartitionNthOptions() : PartitionNthOptions(0) {}
+  static constexpr const char kTypeName[] = "PartitionNthOptions";
+
+  /// The index into the equivalent sorted array of the partition pivot element.
+  int64_t pivot;
+  /// Whether nulls and NaNs are partitioned at the start or at the end
+  NullPlacement null_placement;
+};
+
+class ARROW_EXPORT WinsorizeOptions : public FunctionOptions {
+ public:
+  WinsorizeOptions(double lower_limit, double upper_limit);
+  WinsorizeOptions() : WinsorizeOptions(0, 1) {}
+  static constexpr const char kTypeName[] = "WinsorizeOptions";
+
+  /// The quantile below which all values are replaced with the quantile's value.
+  ///
+  /// For example, if lower_limit = 0.05, then all values in the lower 5% percentile
+  /// will be replaced with the 5% percentile value.
+  double lower_limit;
+
+  /// The quantile above which all values are replaced with the quantile's value.
+  ///
+  /// For example, if upper_limit = 0.95, then all values in the upper 95% percentile
+  /// will be replaced with the 95% percentile value.
+  double upper_limit;
+};
+
+/// \brief Options for cumulative functions
+/// \note Also aliased as CumulativeSumOptions for backward compatibility
+class ARROW_EXPORT CumulativeOptions : public FunctionOptions {
+ public:
+  explicit CumulativeOptions(bool skip_nulls = false);
+  explicit CumulativeOptions(double start, bool skip_nulls = false);
+  explicit CumulativeOptions(std::shared_ptr<Scalar> start, bool skip_nulls = false);
+  static constexpr const char kTypeName[] = "CumulativeOptions";
+  static CumulativeOptions Defaults() { return CumulativeOptions(); }
+
+  /// Optional starting value for cumulative operation computation, default depends on the
+  /// operation and input type.
+  /// - sum: 0
+  /// - prod: 1
+  /// - min: maximum of the input type
+  /// - max: minimum of the input type
+  /// - mean: start is ignored because it has no meaning for mean
+  std::optional<std::shared_ptr<Scalar>> start;
+
+  /// If true, nulls in the input are ignored and produce a corresponding null output.
+  /// When false, the first null encountered is propagated through the remaining output.
+  bool skip_nulls = false;
+};
+using CumulativeSumOptions = CumulativeOptions;  // For backward compatibility
+
+/// \brief Options for pairwise functions
+class ARROW_EXPORT PairwiseOptions : public FunctionOptions {
+ public:
+  explicit PairwiseOptions(int64_t periods = 1);
+  static constexpr const char kTypeName[] = "PairwiseOptions";
+  static PairwiseOptions Defaults() { return PairwiseOptions(); }
+
+  /// Periods to shift for applying the binary operation, accepts negative values.
+  int64_t periods = 1;
+};
+
+/// \brief Options for list_flatten function
+class ARROW_EXPORT ListFlattenOptions : public FunctionOptions {
+ public:
+  explicit ListFlattenOptions(bool recursive = false);
+  static constexpr const char kTypeName[] = "ListFlattenOptions";
+  static ListFlattenOptions Defaults() { return ListFlattenOptions(); }
+
+  /// \brief If true, the list is flattened recursively until a non-list
+  /// array is formed.
+  bool recursive = false;
+};
+
+/// \brief Options for inverse_permutation function
+class ARROW_EXPORT InversePermutationOptions : public FunctionOptions {
+ public:
+  explicit InversePermutationOptions(
+      int64_t max_index = -1,
+      std::optional<std::shared_ptr<DataType>> output_type = std::nullopt);
+  static constexpr const char kTypeName[] = "InversePermutationOptions";
+  static InversePermutationOptions Defaults() { return InversePermutationOptions(); }
+
+  /// \brief The max value in the input indices to allow. The length of the function's
+  /// output will be this value plus 1. If negative, this value will be set to the length
+  /// of the input indices minus 1 and the length of the function's output will be the
+  /// length of the input indices.
+  int64_t max_index = -1;
+  /// \brief The data type for the output array of inverse permutation. Defaults to the
+  /// type of the input indices when `nullopt`. Must be a signed integer type. An
+  /// invalid error will be reported if this type is not able to store the length of the
+  /// input indices.
+  std::optional<std::shared_ptr<DataType>> output_type;
+};
+
+/// \brief Options for scatter function
+class ARROW_EXPORT ScatterOptions : public FunctionOptions {
+ public:
+  explicit ScatterOptions(int64_t max_index = -1);
+  static constexpr const char kTypeName[] = "ScatterOptions";
+  static ScatterOptions Defaults() { return ScatterOptions(); }
+
+  /// \brief The max value in the input indices to allow. The length of the function's
+  /// output will be this value plus 1. If negative, this value will be set to the length
+  /// of the input indices minus 1 and the length of the function's output will be the
+  /// length of the input indices.
+  int64_t max_index = -1;
+};
+
+/// @}
+
+/// \brief Filter with a boolean selection filter
+///
+/// The output will be populated with values from the input at positions
+/// where the selection filter is not 0. Nulls in the filter will be handled
+/// based on options.null_selection_behavior.
+///
+/// For example given values = ["a", "b", "c", null, "e", "f"] and
+/// filter = [0, 1, 1, 0, null, 1], the output will be
+/// (null_selection_behavior == DROP)      = ["b", "c", "f"]
+/// (null_selection_behavior == EMIT_NULL) = ["b", "c", null, "f"]
+///
+/// \param[in] values array to filter
+/// \param[in] filter indicates which values should be filtered out
+/// \param[in] options configures null_selection_behavior
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> Filter(const Datum& values, const Datum& filter,
+                     const FilterOptions& options = FilterOptions::Defaults(),
+                     ExecContext* ctx = NULLPTR);
+
+namespace internal {
+
+// These internal functions are implemented in kernels/vector_selection.cc
+
+/// \brief Return the number of selected indices in the boolean filter
+///
+/// \param filter a plain or run-end encoded boolean array with or without nulls
+/// \param null_selection how to handle nulls in the filter
+ARROW_EXPORT
+int64_t GetFilterOutputSize(const ArraySpan& filter,
+                            FilterOptions::NullSelectionBehavior null_selection);
+
+/// \brief Compute uint64 selection indices for use with Take given a boolean
+/// filter
+///
+/// \param filter a plain or run-end encoded boolean array with or without nulls
+/// \param null_selection how to handle nulls in the filter
+ARROW_EXPORT
+Result<std::shared_ptr<ArrayData>> GetTakeIndices(
+    const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection,
+    MemoryPool* memory_pool = default_memory_pool());
+
+}  // namespace internal
+
+/// \brief ReplaceWithMask replaces each value in the array corresponding
+/// to a true value in the mask with the next element from `replacements`.
+///
+/// \param[in] values Array input to replace
+/// \param[in] mask Array or Scalar of Boolean mask values
+/// \param[in] replacements The replacement values to draw from. There must
+/// be as many replacement values as true values in the mask.
+/// \param[in] ctx the function execution context, optional
+///
+/// \return the resulting datum
+///
+/// \since 5.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> ReplaceWithMask(const Datum& values, const Datum& mask,
+                              const Datum& replacements, ExecContext* ctx = NULLPTR);
+
+/// \brief FillNullForward fill null values in forward direction
+///
+/// The output array will be of the same type as the input values
+/// array, with replaced null values in forward direction.
+///
+/// For example given values = ["a", "b", "c", null, null, "f"],
+/// the output will be = ["a", "b", "c", "c", "c", "f"]
+///
+/// \param[in] values datum from which to take
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> FillNullForward(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief FillNullBackward fill null values in backward direction
+///
+/// The output array will be of the same type as the input values
+/// array, with replaced null values in backward direction.
+///
+/// For example given values = ["a", "b", "c", null, null, "f"],
+/// the output will be = ["a", "b", "c", "f", "f", "f"]
+///
+/// \param[in] values datum from which to take
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> FillNullBackward(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Take from an array of values at indices in another array
+///
+/// The output array will be of the same type as the input values
+/// array, with elements taken from the values array at the given
+/// indices. If an index is null then the taken element will be null.
+///
+/// For example given values = ["a", "b", "c", null, "e", "f"] and
+/// indices = [2, 1, null, 3], the output will be
+/// = [values[2], values[1], null, values[3]]
+/// = ["c", "b", null, null]
+///
+/// \param[in] values datum from which to take
+/// \param[in] indices which values to take
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> Take(const Datum& values, const Datum& indices,
+                   const TakeOptions& options = TakeOptions::Defaults(),
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Take with Array inputs and output
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Take(const Array& values, const Array& indices,
+                                    const TakeOptions& options = TakeOptions::Defaults(),
+                                    ExecContext* ctx = NULLPTR);
+
+/// \brief Drop Null from an array of values
+///
+/// The output array will be of the same type as the input values
+/// array, with elements taken from the values array without nulls.
+///
+/// For example given values = ["a", "b", "c", null, "e", "f"],
+/// the output will be = ["a", "b", "c", "e", "f"]
+///
+/// \param[in] values datum from which to take
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+ARROW_EXPORT
+Result<Datum> DropNull(const Datum& values, ExecContext* ctx = NULLPTR);
+
+/// \brief DropNull with Array inputs and output
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> DropNull(const Array& values, ExecContext* ctx = NULLPTR);
+
+/// \brief Return indices that partition an array around n-th sorted element.
+///
+/// Find index of n-th(0 based) smallest value and perform indirect
+/// partition of an array around that element. Output indices[0 ~ n-1]
+/// holds values no greater than n-th element, and indices[n+1 ~ end]
+/// holds values no less than n-th element. Elements in each partition
+/// is not sorted. Nulls will be partitioned to the end of the output.
+/// Output is not guaranteed to be stable.
+///
+/// \param[in] values array to be partitioned
+/// \param[in] n pivot array around sorted n-th element
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would partition an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> NthToIndices(const Array& values, int64_t n,
+                                            ExecContext* ctx = NULLPTR);
+
+/// \brief Return indices that partition an array around n-th sorted element.
+///
+/// This overload takes a PartitionNthOptions specifying the pivot index
+/// and the null handling.
+///
+/// \param[in] values array to be partitioned
+/// \param[in] options options including pivot index and null handling
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would partition an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> NthToIndices(const Array& values,
+                                            const PartitionNthOptions& options,
+                                            ExecContext* ctx = NULLPTR);
+
+/// \brief Return indices that would select the first `k` elements.
+///
+/// Perform an indirect sort of the datum, keeping only the first `k` elements. The output
+/// array will contain indices such that the item indicated by the k-th index will be in
+/// the position it would be if the datum were sorted by `options.sort_keys`. However,
+/// indices of null values will not be part of the output. The sort is not guaranteed to
+/// be stable.
+///
+/// \param[in] datum datum to be partitioned
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return a datum with the same schema as the input
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SelectKUnstable(const Datum& datum,
+                                               const SelectKOptions& options,
+                                               ExecContext* ctx = NULLPTR);
+
+/// \brief Return the indices that would sort an array.
+///
+/// Perform an indirect sort of array. The output array will contain
+/// indices that would sort an array, which would be the same length
+/// as input. Nulls will be stably partitioned to the end of the output
+/// regardless of order.
+///
+/// For example given array = [null, 1, 3.3, null, 2, 5.3] and order
+/// = SortOrder::DESCENDING, the output will be [5, 2, 4, 1, 0,
+/// 3].
+///
+/// \param[in] array array to sort
+/// \param[in] order ascending or descending
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Array& array,
+                                           SortOrder order = SortOrder::Ascending,
+                                           ExecContext* ctx = NULLPTR);
+
+/// \brief Return the indices that would sort an array.
+///
+/// This overload takes a ArraySortOptions specifying the sort order
+/// and the null handling.
+///
+/// \param[in] array array to sort
+/// \param[in] options options including sort order and null handling
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Array& array,
+                                           const ArraySortOptions& options,
+                                           ExecContext* ctx = NULLPTR);
+
+/// \brief Return the indices that would sort a chunked array.
+///
+/// Perform an indirect sort of chunked array. The output array will
+/// contain indices that would sort a chunked array, which would be
+/// the same length as input. Nulls will be stably partitioned to the
+/// end of the output regardless of order.
+///
+/// For example given chunked_array = [[null, 1], [3.3], [null, 2,
+/// 5.3]] and order = SortOrder::DESCENDING, the output will be [5, 2,
+/// 4, 1, 0, 3].
+///
+/// \param[in] chunked_array chunked array to sort
+/// \param[in] order ascending or descending
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+                                           SortOrder order = SortOrder::Ascending,
+                                           ExecContext* ctx = NULLPTR);
+
+/// \brief Return the indices that would sort a chunked array.
+///
+/// This overload takes a ArraySortOptions specifying the sort order
+/// and the null handling.
+///
+/// \param[in] chunked_array chunked array to sort
+/// \param[in] options options including sort order and null handling
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort an array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const ChunkedArray& chunked_array,
+                                           const ArraySortOptions& options,
+                                           ExecContext* ctx = NULLPTR);
+
+/// \brief Return the indices that would sort an input in the
+/// specified order. Input is one of array, chunked array record batch
+/// or table.
+///
+/// Perform an indirect sort of input. The output array will contain
+/// indices that would sort an input, which would be the same length
+/// as input. Nulls will be stably partitioned to the start or to the end
+/// of the output depending on SortOrder::null_placement.
+///
+/// For example given input (table) = {
+/// "column1": [[null,   1], [   3, null, 2, 1]],
+/// "column2": [[   5], [3,   null, null, 5, 5]],
+/// } and options = {
+/// {"column1", SortOrder::Ascending},
+/// {"column2", SortOrder::Descending},
+/// }, the output will be [5, 1, 4, 2, 0, 3].
+///
+/// \param[in] datum array, chunked array, record batch or table to sort
+/// \param[in] options options
+/// \param[in] ctx the function execution context, optional
+/// \return offsets indices that would sort a table
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> SortIndices(const Datum& datum, const SortOptions& options,
+                                           ExecContext* ctx = NULLPTR);
+
+/// \brief Compute unique elements from an array-like object
+///
+/// Note if a null occurs in the input it will NOT be included in the output.
+///
+/// \param[in] datum array-like input
+/// \param[in] ctx the function execution context, optional
+/// \return result as Array
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Unique(const Datum& datum, ExecContext* ctx = NULLPTR);
+
+// Constants for accessing the output of ValueCounts
+ARROW_EXPORT extern const char kValuesFieldName[];
+ARROW_EXPORT extern const char kCountsFieldName[];
+ARROW_EXPORT extern const int32_t kValuesFieldIndex;
+ARROW_EXPORT extern const int32_t kCountsFieldIndex;
+
+/// \brief Return counts of unique elements from an array-like object.
+///
+/// Note that the counts do not include counts for nulls in the array.  These can be
+/// obtained separately from metadata.
+///
+/// For floating point arrays there is no attempt to normalize -0.0, 0.0 and NaN values
+/// which can lead to unexpected results if the input Array has these values.
+///
+/// \param[in] value array-like input
+/// \param[in] ctx the function execution context, optional
+/// \return counts An array of  <input type "Values", int64_t "Counts"> structs.
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::shared_ptr<StructArray>> ValueCounts(const Datum& value,
+                                                 ExecContext* ctx = NULLPTR);
+
+/// \brief Dictionary-encode values in an array-like object
+///
+/// Any nulls encountered in the dictionary will be handled according to the
+/// specified null encoding behavior.
+///
+/// For example, given values ["a", "b", null, "a", null] the output will be
+/// (null_encoding == ENCODE) Indices: [0, 1, 2, 0, 2] / Dict: ["a", "b", null]
+/// (null_encoding == MASK)   Indices: [0, 1, null, 0, null] / Dict: ["a", "b"]
+///
+/// If the input is already dictionary encoded this function is a no-op unless
+/// it needs to modify the null_encoding (TODO)
+///
+/// \param[in] data array-like input
+/// \param[in] ctx the function execution context, optional
+/// \param[in] options configures null encoding behavior
+/// \return result with same shape and type as input
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> DictionaryEncode(
+    const Datum& data,
+    const DictionaryEncodeOptions& options = DictionaryEncodeOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Run-end-encode values in an array-like object
+///
+/// The returned run-end encoded type uses the same value type of the input and
+/// run-end type defined in the options.
+///
+/// \param[in] value array-like input
+/// \param[in] options configures encoding behavior
+/// \param[in] ctx the function execution context, optional
+/// \return result with same shape but run-end encoded
+///
+/// \since 12.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> RunEndEncode(
+    const Datum& value,
+    const RunEndEncodeOptions& options = RunEndEncodeOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Decode a Run-End Encoded array to a plain array
+///
+/// The output data type is the same as the values array type of run-end encoded
+/// input.
+///
+/// \param[in] value run-end-encoded input
+/// \param[in] ctx the function execution context, optional
+/// \return plain array resulting from decoding the run-end encoded input
+///
+/// \since 12.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cumulative sum of an array-like object
+///
+/// \param[in] values array-like input
+/// \param[in] options configures cumulative sum behavior
+/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
+/// status on overflow, otherwise wrap around on overflow
+/// \param[in] ctx the function execution context, optional
+ARROW_EXPORT
+Result<Datum> CumulativeSum(
+    const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
+    bool check_overflow = false, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cumulative product of an array-like object
+///
+/// \param[in] values array-like input
+/// \param[in] options configures cumulative prod behavior
+/// \param[in] check_overflow whether to check for overflow, if true, return Invalid
+/// status on overflow, otherwise wrap around on overflow
+/// \param[in] ctx the function execution context, optional
+ARROW_EXPORT
+Result<Datum> CumulativeProd(
+    const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
+    bool check_overflow = false, ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cumulative max of an array-like object
+///
+/// \param[in] values array-like input
+/// \param[in] options configures cumulative max behavior
+/// \param[in] ctx the function execution context, optional
+ARROW_EXPORT
+Result<Datum> CumulativeMax(
+    const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cumulative min of an array-like object
+///
+/// \param[in] values array-like input
+/// \param[in] options configures cumulative min behavior
+/// \param[in] ctx the function execution context, optional
+ARROW_EXPORT
+Result<Datum> CumulativeMin(
+    const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Compute the cumulative mean of an array-like object
+///
+/// \param[in] values array-like input
+/// \param[in] options configures cumulative mean behavior, `start` is ignored
+/// \param[in] ctx the function execution context, optional
+ARROW_EXPORT
+Result<Datum> CumulativeMean(
+    const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Return the first order difference of an array.
+///
+/// Computes the first order difference of an array, i.e.
+///   output[i] = input[i] - input[i - p]  if i >= p
+///   output[i] = null                     otherwise
+/// where p is the period. For example, with p = 1,
+///   Diff([1, 4, 9, 10, 15]) = [null, 3, 5, 1, 5].
+/// With p = 2,
+///   Diff([1, 4, 9, 10, 15]) = [null, null, 8, 6, 6]
+/// p can also be negative, in which case the diff is computed in
+/// the opposite direction.
+/// \param[in] array array input
+/// \param[in] options options, specifying overflow behavior and period
+/// \param[in] check_overflow whether to return error on overflow
+/// \param[in] ctx the function execution context, optional
+/// \return result as array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> PairwiseDiff(const Array& array,
+                                            const PairwiseOptions& options,
+                                            bool check_overflow = false,
+                                            ExecContext* ctx = NULLPTR);
+
+/// \brief Return the inverse permutation of the given indices.
+///
+/// For indices[i] = x, inverse_permutation[x] = i. And inverse_permutation[x] = null if x
+/// does not appear in the input indices. Indices must be in the range of [0, max_index],
+/// or null, which will be ignored. If multiple indices point to the same value, the last
+/// one is used.
+///
+/// For example, with
+///   indices = [null, 0, null, 2, 4, 1, 1]
+/// the inverse permutation is
+///   [1, 6, 3, null, 4, null, null]
+/// if max_index = 6.
+///
+/// \param[in] indices array-like indices
+/// \param[in] options configures the max index and the output type
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting inverse permutation
+///
+/// \since 20.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> InversePermutation(
+    const Datum& indices,
+    const InversePermutationOptions& options = InversePermutationOptions::Defaults(),
+    ExecContext* ctx = NULLPTR);
+
+/// \brief Scatter the values into specified positions according to the indices.
+///
+/// For indices[i] = x, output[x] = values[i]. And output[x] = null if x does not appear
+/// in the input indices. Indices must be in the range of [0, max_index], or null, in
+/// which case the corresponding value will be ignored. If multiple indices point to the
+/// same value, the last one is used.
+///
+/// For example, with
+///   values = [a, b, c, d, e, f, g]
+///   indices = [null, 0, null, 2, 4, 1, 1]
+/// the output is
+///   [b, g, d, null, e, null, null]
+/// if max_index = 6.
+///
+/// \param[in] values datum to scatter
+/// \param[in] indices array-like indices
+/// \param[in] options configures the max index of to scatter
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 20.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Scatter(const Datum& values, const Datum& indices,
+                      const ScatterOptions& options = ScatterOptions::Defaults(),
+                      ExecContext* ctx = NULLPTR);
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/cast.h b/pyarrow/include/arrow/compute/cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec5818239acb1ab52b06945a5fb4e60f84b58b61
--- /dev/null
+++ b/pyarrow/include/arrow/compute/cast.h
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compute/function.h"
+#include "arrow/compute/function_options.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+
+namespace compute {
+
+class ExecContext;
+
+/// \addtogroup compute-concrete-options
+/// @{
+
+class ARROW_EXPORT CastOptions : public FunctionOptions {
+ public:
+  explicit CastOptions(bool safe = true);
+
+  static constexpr const char kTypeName[] = "CastOptions";
+  static CastOptions Safe(TypeHolder to_type = {}) {
+    CastOptions safe(true);
+    safe.to_type = std::move(to_type);
+    return safe;
+  }
+
+  static CastOptions Unsafe(TypeHolder to_type = {}) {
+    CastOptions unsafe(false);
+    unsafe.to_type = std::move(to_type);
+    return unsafe;
+  }
+
+  // Type being casted to. May be passed separate to eager function
+  // compute::Cast
+  TypeHolder to_type;
+
+  bool allow_int_overflow;
+  bool allow_time_truncate;
+  bool allow_time_overflow;
+  bool allow_decimal_truncate;
+  bool allow_float_truncate;
+  // Indicate if conversions from Binary/FixedSizeBinary to string must
+  // validate the utf8 payload.
+  bool allow_invalid_utf8;
+
+  /// true if the safety options all match CastOptions::Safe
+  ///
+  /// Note, if this returns false it does not mean is_unsafe will return true
+  bool is_safe() const;
+  /// true if the safety options all match CastOptions::Unsafe
+  ///
+  /// Note, if this returns false it does not mean is_safe will return true
+  bool is_unsafe() const;
+};
+
+/// @}
+
+/// \brief Return true if a cast function is defined
+ARROW_EXPORT
+bool CanCast(const DataType& from_type, const DataType& to_type);
+
+// ----------------------------------------------------------------------
+// Convenience invocation APIs for a number of kernels
+
+/// \brief Cast from one array type to another
+/// \param[in] value array to cast
+/// \param[in] to_type type to cast to
+/// \param[in] options casting options
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting array
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Cast(const Array& value, const TypeHolder& to_type,
+                                    const CastOptions& options = CastOptions::Safe(),
+                                    ExecContext* ctx = NULLPTR);
+
+/// \brief Cast from one array type to another
+/// \param[in] value array to cast
+/// \param[in] options casting options. The "to_type" field must be populated
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting array
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Cast(const Datum& value, const CastOptions& options,
+                   ExecContext* ctx = NULLPTR);
+
+/// \brief Cast from one value to another
+/// \param[in] value datum to cast
+/// \param[in] to_type type to cast to
+/// \param[in] options casting options
+/// \param[in] ctx the function execution context, optional
+/// \return the resulting datum
+///
+/// \since 1.0.0
+/// \note API not yet finalized
+ARROW_EXPORT
+Result<Datum> Cast(const Datum& value, const TypeHolder& to_type,
+                   const CastOptions& options = CastOptions::Safe(),
+                   ExecContext* ctx = NULLPTR);
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/exec.h b/pyarrow/include/arrow/compute/exec.h
new file mode 100644
index 0000000000000000000000000000000000000000..dae7e1ea686829fcf9b11bf07489d2cca8610f2b
--- /dev/null
+++ b/pyarrow/include/arrow/compute/exec.h
@@ -0,0 +1,489 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/compute/expression.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+// It seems like 64K might be a good default chunksize to use for execution
+// based on the experience of other query processing systems. The current
+// default is not to chunk contiguous arrays, though, but this may change in
+// the future once parallel execution is implemented
+static constexpr int64_t kDefaultExecChunksize = UINT16_MAX;
+
+/// \brief Context for expression-global variables and options used by
+/// function evaluation
+class ARROW_EXPORT ExecContext {
+ public:
+  // If no function registry passed, the default is used.
+  explicit ExecContext(MemoryPool* pool = default_memory_pool(),
+                       ::arrow::internal::Executor* executor = NULLPTR,
+                       FunctionRegistry* func_registry = NULLPTR);
+
+  /// \brief The MemoryPool used for allocations, default is
+  /// default_memory_pool().
+  MemoryPool* memory_pool() const { return pool_; }
+
+  const ::arrow::internal::CpuInfo* cpu_info() const;
+
+  /// \brief An Executor which may be used to parallelize execution.
+  ::arrow::internal::Executor* executor() const { return executor_; }
+
+  /// \brief The FunctionRegistry for looking up functions by name and
+  /// selecting kernels for execution. Defaults to the library-global function
+  /// registry provided by GetFunctionRegistry.
+  FunctionRegistry* func_registry() const { return func_registry_; }
+
+  // \brief Set maximum length unit of work for kernel execution. Larger
+  // contiguous array inputs will be split into smaller chunks, and, if
+  // possible and enabled, processed in parallel. The default chunksize is
+  // INT64_MAX, so contiguous arrays are not split.
+  void set_exec_chunksize(int64_t chunksize) { exec_chunksize_ = chunksize; }
+
+  // \brief Maximum length for ExecBatch data chunks processed by
+  // kernels. Contiguous array inputs with longer length will be split into
+  // smaller chunks.
+  int64_t exec_chunksize() const { return exec_chunksize_; }
+
+  /// \brief Set whether to use multiple threads for function execution. This
+  /// is not yet used.
+  void set_use_threads(bool use_threads = true) { use_threads_ = use_threads; }
+
+  /// \brief If true, then utilize multiple threads where relevant for function
+  /// execution. This is not yet used.
+  bool use_threads() const { return use_threads_; }
+
+  // Set the preallocation strategy for kernel execution as it relates to
+  // chunked execution. For chunked execution, whether via ChunkedArray inputs
+  // or splitting larger Array arguments into smaller pieces, contiguous
+  // allocation (if permitted by the kernel) will allocate one large array to
+  // write output into yielding it to the caller at the end. If this option is
+  // set to off, then preallocations will be performed independently for each
+  // chunk of execution
+  //
+  // TODO: At some point we might want the limit the size of contiguous
+  // preallocations. For example, even if the exec_chunksize is 64K or less, we
+  // might limit contiguous allocations to 1M records, say.
+  void set_preallocate_contiguous(bool preallocate) {
+    preallocate_contiguous_ = preallocate;
+  }
+
+  /// \brief If contiguous preallocations should be used when doing chunked
+  /// execution as specified by exec_chunksize(). See
+  /// set_preallocate_contiguous() for more information.
+  bool preallocate_contiguous() const { return preallocate_contiguous_; }
+
+ private:
+  MemoryPool* pool_;
+  ::arrow::internal::Executor* executor_;
+  FunctionRegistry* func_registry_;
+  int64_t exec_chunksize_ = std::numeric_limits<int64_t>::max();
+  bool preallocate_contiguous_ = true;
+  bool use_threads_ = true;
+};
+
+// TODO: Consider standardizing on uint16 selection vectors and only use them
+// when we can ensure that each value is 64K length or smaller
+
+/// \brief Container for an array of value selection indices that were
+/// materialized from a filter.
+///
+/// Columnar query engines (see e.g. [1]) have found that rather than
+/// materializing filtered data, the filter can instead be converted to an
+/// array of the "on" indices and then "fusing" these indices in operator
+/// implementations. This is especially relevant for aggregations but also
+/// applies to scalar operations.
+///
+/// We are not yet using this so this is mostly a placeholder for now.
+///
+/// [1]: http://cidrdb.org/cidr2005/papers/P19.pdf
+class ARROW_EXPORT SelectionVector {
+ public:
+  explicit SelectionVector(std::shared_ptr<ArrayData> data);
+
+  explicit SelectionVector(const Array& arr);
+
+  /// \brief Create SelectionVector from boolean mask
+  static Result<std::shared_ptr<SelectionVector>> FromMask(const BooleanArray& arr);
+
+  const int32_t* indices() const { return indices_; }
+  int32_t length() const;
+
+ private:
+  std::shared_ptr<ArrayData> data_;
+  const int32_t* indices_;
+};
+
+/// An index to represent that a batch does not belong to an ordered stream
+constexpr int64_t kUnsequencedIndex = -1;
+
+/// \brief A unit of work for kernel execution. It contains a collection of
+/// Array and Scalar values and an optional SelectionVector indicating that
+/// there is an unmaterialized filter that either must be materialized, or (if
+/// the kernel supports it) pushed down into the kernel implementation.
+///
+/// ExecBatch is semantically similar to RecordBatch in that in a SQL context
+/// it represents a collection of records, but constant "columns" are
+/// represented by Scalar values rather than having to be converted into arrays
+/// with repeated values.
+///
+/// TODO: Datum uses arrow/util/variant.h which may be a bit heavier-weight
+/// than is desirable for this class. Microbenchmarks would help determine for
+/// sure. See ARROW-8928.
+
+/// \addtogroup acero-internals
+/// @{
+
+struct ARROW_EXPORT ExecBatch {
+  ExecBatch() = default;
+  ExecBatch(std::vector<Datum> values, int64_t length)
+      : values(std::move(values)), length(length) {}
+
+  explicit ExecBatch(const RecordBatch& batch);
+
+  /// \brief Infer the ExecBatch length from values.
+  static Result<int64_t> InferLength(const std::vector<Datum>& values);
+
+  /// Creates an ExecBatch with length-validation.
+  ///
+  /// If any value is given, then all values must have a common length. If the given
+  /// length is negative, then the length of the ExecBatch is set to this common length,
+  /// or to 1 if no values are given. Otherwise, the given length must equal the common
+  /// length, if any value is given.
+  static Result<ExecBatch> Make(std::vector<Datum> values, int64_t length = -1);
+
+  Result<std::shared_ptr<RecordBatch>> ToRecordBatch(
+      std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool()) const;
+
+  /// The values representing positional arguments to be passed to a kernel's
+  /// exec function for processing.
+  std::vector<Datum> values;
+
+  /// A deferred filter represented as an array of indices into the values.
+  ///
+  /// For example, the filter [true, true, false, true] would be represented as
+  /// the selection vector [0, 1, 3]. When the selection vector is set,
+  /// ExecBatch::length is equal to the length of this array.
+  std::shared_ptr<SelectionVector> selection_vector;
+
+  /// A predicate Expression guaranteed to evaluate to true for all rows in this batch.
+  Expression guarantee = literal(true);
+
+  /// The semantic length of the ExecBatch. When the values are all scalars,
+  /// the length should be set to 1 for non-aggregate kernels, otherwise the
+  /// length is taken from the array values, except when there is a selection
+  /// vector. When there is a selection vector set, the length of the batch is
+  /// the length of the selection. Aggregate kernels can have an ExecBatch
+  /// formed by projecting just the partition columns from a batch in which
+  /// case, it would have scalar rows with length greater than 1.
+  ///
+  /// If the array values are of length 0 then the length is 0 regardless of
+  /// whether any values are Scalar.
+  int64_t length = 0;
+
+  /// \brief index of this batch in a sorted stream of batches
+  ///
+  /// This index must be strictly monotonic starting at 0 without gaps or
+  /// it can be set to kUnsequencedIndex if there is no meaningful order
+  int64_t index = kUnsequencedIndex;
+
+  /// \brief The sum of bytes in each buffer referenced by the batch
+  ///
+  /// Note: Scalars are not counted
+  /// Note: Some values may referenced only part of a buffer, for
+  ///       example, an array with an offset.  The actual data
+  ///       visible to this batch will be smaller than the total
+  ///       buffer size in this case.
+  int64_t TotalBufferSize() const;
+
+  /// \brief Return the value at the i-th index
+  template <typename index_type>
+  inline const Datum& operator[](index_type i) const {
+    return values[i];
+  }
+
+  bool Equals(const ExecBatch& other) const;
+
+  /// \brief A convenience for the number of values / arguments.
+  int num_values() const { return static_cast<int>(values.size()); }
+
+  ExecBatch Slice(int64_t offset, int64_t length) const;
+
+  Result<ExecBatch> SelectValues(const std::vector<int>& ids) const;
+
+  /// \brief A convenience for returning the types from the batch.
+  std::vector<TypeHolder> GetTypes() const {
+    std::vector<TypeHolder> result;
+    for (const auto& value : this->values) {
+      result.emplace_back(value.type());
+    }
+    return result;
+  }
+
+  std::string ToString() const;
+};
+
+inline bool operator==(const ExecBatch& l, const ExecBatch& r) { return l.Equals(r); }
+inline bool operator!=(const ExecBatch& l, const ExecBatch& r) { return !l.Equals(r); }
+
+ARROW_EXPORT void PrintTo(const ExecBatch&, std::ostream*);
+
+/// @}
+
+/// \defgroup compute-internals Utilities for calling functions, useful for those
+/// extending the function registry
+///
+/// @{
+
+struct ExecValue {
+  ArraySpan array = {};
+  const Scalar* scalar = NULLPTR;
+
+  ExecValue(const Scalar* scalar)  // NOLINT implicit conversion
+      : scalar(scalar) {}
+
+  ExecValue(ArraySpan array)  // NOLINT implicit conversion
+      : array(std::move(array)) {}
+
+  ExecValue(const ArrayData& array) {  // NOLINT implicit conversion
+    this->array.SetMembers(array);
+  }
+
+  ExecValue() = default;
+  ExecValue(const ExecValue& other) = default;
+  ExecValue& operator=(const ExecValue& other) = default;
+  ExecValue(ExecValue&& other) = default;
+  ExecValue& operator=(ExecValue&& other) = default;
+
+  int64_t length() const { return this->is_array() ? this->array.length : 1; }
+
+  bool is_array() const { return this->scalar == NULLPTR; }
+  bool is_scalar() const { return !this->is_array(); }
+
+  void SetArray(const ArrayData& array) {
+    this->array.SetMembers(array);
+    this->scalar = NULLPTR;
+  }
+
+  void SetScalar(const Scalar* scalar) { this->scalar = scalar; }
+
+  template <typename ExactType>
+  const ExactType& scalar_as() const {
+    return ::arrow::internal::checked_cast<const ExactType&>(*this->scalar);
+  }
+
+  /// XXX: here temporarily for compatibility with datum, see
+  /// e.g. MakeStructExec in scalar_nested.cc
+  int64_t null_count() const {
+    if (this->is_array()) {
+      return this->array.GetNullCount();
+    } else {
+      return this->scalar->is_valid ? 0 : 1;
+    }
+  }
+
+  const DataType* type() const {
+    if (this->is_array()) {
+      return array.type;
+    } else {
+      return scalar->type.get();
+    }
+  }
+};
+
+struct ARROW_EXPORT ExecResult {
+  // The default value of the variant is ArraySpan
+  std::variant<ArraySpan, std::shared_ptr<ArrayData>> value;
+
+  int64_t length() const {
+    if (this->is_array_span()) {
+      return this->array_span()->length;
+    } else {
+      return this->array_data()->length;
+    }
+  }
+
+  const DataType* type() const {
+    if (this->is_array_span()) {
+      return this->array_span()->type;
+    } else {
+      return this->array_data()->type.get();
+    }
+  }
+
+  const ArraySpan* array_span() const { return &std::get<ArraySpan>(this->value); }
+  ArraySpan* array_span_mutable() { return &std::get<ArraySpan>(this->value); }
+
+  bool is_array_span() const { return this->value.index() == 0; }
+
+  const std::shared_ptr<ArrayData>& array_data() const {
+    return std::get<std::shared_ptr<ArrayData>>(this->value);
+  }
+  ArrayData* array_data_mutable() {
+    return std::get<std::shared_ptr<ArrayData>>(this->value).get();
+  }
+
+  bool is_array_data() const { return this->value.index() == 1; }
+};
+
+/// \brief A "lightweight" column batch object which contains no
+/// std::shared_ptr objects and does not have any memory ownership
+/// semantics. Can represent a view onto an "owning" ExecBatch.
+struct ARROW_EXPORT ExecSpan {
+  ExecSpan() = default;
+  ExecSpan(const ExecSpan& other) = default;
+  ExecSpan& operator=(const ExecSpan& other) = default;
+  ExecSpan(ExecSpan&& other) = default;
+  ExecSpan& operator=(ExecSpan&& other) = default;
+
+  explicit ExecSpan(std::vector<ExecValue> values, int64_t length)
+      : length(length), values(std::move(values)) {}
+
+  explicit ExecSpan(const ExecBatch& batch) {
+    this->length = batch.length;
+    this->values.resize(batch.values.size());
+    for (size_t i = 0; i < batch.values.size(); ++i) {
+      const Datum& in_value = batch[i];
+      ExecValue* out_value = &this->values[i];
+      if (in_value.is_array()) {
+        out_value->SetArray(*in_value.array());
+      } else {
+        out_value->SetScalar(in_value.scalar().get());
+      }
+    }
+  }
+
+  /// \brief Return the value at the i-th index
+  template <typename index_type>
+  inline const ExecValue& operator[](index_type i) const {
+    return values[i];
+  }
+
+  /// \brief A convenience for the number of values / arguments.
+  int num_values() const { return static_cast<int>(values.size()); }
+
+  std::vector<TypeHolder> GetTypes() const {
+    std::vector<TypeHolder> result;
+    for (const auto& value : this->values) {
+      result.emplace_back(value.type());
+    }
+    return result;
+  }
+
+  ExecBatch ToExecBatch() const {
+    ExecBatch result;
+    result.length = this->length;
+    for (const ExecValue& value : this->values) {
+      if (value.is_array()) {
+        result.values.push_back(value.array.ToArrayData());
+      } else {
+        result.values.push_back(value.scalar->GetSharedPtr());
+      }
+    }
+    return result;
+  }
+
+  int64_t length = 0;
+  std::vector<ExecValue> values;
+};
+
+/// \defgroup compute-call-function One-shot calls to compute functions
+///
+/// @{
+
+/// \brief One-shot invoker for all types of functions.
+///
+/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
+/// and wrapping of outputs.
+ARROW_EXPORT
+Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
+                           const FunctionOptions* options, ExecContext* ctx = NULLPTR);
+
+/// \brief Variant of CallFunction which uses a function's default options.
+///
+/// NB: Some functions require FunctionOptions be provided.
+ARROW_EXPORT
+Result<Datum> CallFunction(const std::string& func_name, const std::vector<Datum>& args,
+                           ExecContext* ctx = NULLPTR);
+
+/// \brief One-shot invoker for all types of functions.
+///
+/// Does kernel dispatch, argument checking, iteration of ChunkedArray inputs,
+/// and wrapping of outputs.
+ARROW_EXPORT
+Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
+                           const FunctionOptions* options, ExecContext* ctx = NULLPTR);
+
+/// \brief Variant of CallFunction which uses a function's default options.
+///
+/// NB: Some functions require FunctionOptions be provided.
+ARROW_EXPORT
+Result<Datum> CallFunction(const std::string& func_name, const ExecBatch& batch,
+                           ExecContext* ctx = NULLPTR);
+
+/// @}
+
+/// \defgroup compute-function-executor One-shot calls to obtain function executors
+///
+/// @{
+
+/// \brief One-shot executor provider for all types of functions.
+///
+/// This function creates and initializes a `FunctionExecutor` appropriate
+/// for the given function name, input types and function options.
+ARROW_EXPORT
+Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
+    const std::string& func_name, std::vector<TypeHolder> in_types,
+    const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
+
+/// \brief One-shot executor provider for all types of functions.
+///
+/// This function creates and initializes a `FunctionExecutor` appropriate
+/// for the given function name, input types (taken from the Datum arguments)
+/// and function options.
+ARROW_EXPORT
+Result<std::shared_ptr<FunctionExecutor>> GetFunctionExecutor(
+    const std::string& func_name, const std::vector<Datum>& args,
+    const FunctionOptions* options = NULLPTR, FunctionRegistry* func_registry = NULLPTR);
+
+/// @}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/expression.h b/pyarrow/include/arrow/compute/expression.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8ce50675c8c9bb0a3a7081a23c6bd3c2002f2d1
--- /dev/null
+++ b/pyarrow/include/arrow/compute/expression.h
@@ -0,0 +1,295 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/small_vector.h"
+
+namespace arrow {
+namespace compute {
+
+/// \defgroup expression-core Expressions to describe data transformations
+///
+/// @{
+
+/// An unbound expression which maps a single Datum to another Datum.
+/// An expression is one of
+/// - A literal Datum.
+/// - A reference to a single (potentially nested) field of the input Datum.
+/// - A call to a compute function, with arguments specified by other Expressions.
+class ARROW_EXPORT Expression {
+ public:
+  struct Call {
+    std::string function_name;
+    std::vector<Expression> arguments;
+    std::shared_ptr<FunctionOptions> options;
+    // Cached hash value
+    size_t hash;
+
+    // post-Bind properties:
+    std::shared_ptr<Function> function;
+    const Kernel* kernel = NULLPTR;
+    std::shared_ptr<KernelState> kernel_state;
+    TypeHolder type;
+
+    void ComputeHash();
+  };
+
+  std::string ToString() const;
+  bool Equals(const Expression& other) const;
+  size_t hash() const;
+  struct Hash {
+    size_t operator()(const Expression& expr) const { return expr.hash(); }
+  };
+
+  /// Bind this expression to the given input type, looking up Kernels and field types.
+  /// Some expression simplification may be performed and implicit casts will be inserted.
+  /// Any state necessary for execution will be initialized and returned.
+  Result<Expression> Bind(const TypeHolder& in, ExecContext* = NULLPTR) const;
+  Result<Expression> Bind(const Schema& in_schema, ExecContext* = NULLPTR) const;
+
+  // XXX someday
+  // Clone all KernelState in this bound expression. If any function referenced by this
+  // expression has mutable KernelState, it is not safe to execute or apply simplification
+  // passes to it (or copies of it!) from multiple threads. Cloning state produces new
+  // KernelStates where necessary to ensure that Expressions may be manipulated safely
+  // on multiple threads.
+  // Result<ExpressionState> CloneState() const;
+  // Status SetState(ExpressionState);
+
+  /// Return true if all an expression's field references have explicit types
+  /// and all of its functions' kernels are looked up.
+  bool IsBound() const;
+
+  /// Return true if this expression is composed only of Scalar literals, field
+  /// references, and calls to ScalarFunctions.
+  bool IsScalarExpression() const;
+
+  /// Return true if this expression is literal and entirely null.
+  bool IsNullLiteral() const;
+
+  /// Return true if this expression could evaluate to true. Will return true for any
+  /// unbound or non-boolean Expressions. IsSatisfiable does not (currently) do any
+  /// canonicalization or simplification of the expression, so even Expressions
+  /// which are unsatisfiable may spuriously return `true` here. This function is
+  /// intended for use in predicate pushdown where a filter expression is simplified
+  /// by a guarantee, so it assumes that trying to simplify again would be redundant.
+  bool IsSatisfiable() const;
+
+  // XXX someday
+  // Result<PipelineGraph> GetPipelines();
+
+  bool is_valid() const { return impl_ != NULLPTR; }
+
+  /// Access a Call or return nullptr if this expression is not a call
+  const Call* call() const;
+  /// Access a Datum or return nullptr if this expression is not a literal
+  const Datum* literal() const;
+  /// Access a FieldRef or return nullptr if this expression is not a field_ref
+  const FieldRef* field_ref() const;
+
+  /// The type to which this expression will evaluate
+  const DataType* type() const;
+  // XXX someday
+  // NullGeneralization::type nullable() const;
+
+  struct Parameter {
+    FieldRef ref;
+
+    // post-bind properties
+    TypeHolder type;
+    ::arrow::internal::SmallVector<int, 2> indices;
+  };
+  const Parameter* parameter() const;
+
+  Expression() = default;
+  explicit Expression(Call call);
+  explicit Expression(Datum literal);
+  explicit Expression(Parameter parameter);
+
+  static bool Identical(const Expression& l, const Expression& r);
+
+ private:
+  using Impl = std::variant<Datum, Parameter, Call>;
+  std::shared_ptr<Impl> impl_;
+};
+
+inline bool operator==(const Expression& l, const Expression& r) { return l.Equals(r); }
+inline bool operator!=(const Expression& l, const Expression& r) { return !l.Equals(r); }
+
+ARROW_EXPORT void PrintTo(const Expression&, std::ostream*);
+
+// Factories
+
+ARROW_EXPORT
+Expression literal(Datum lit);
+
+template <typename Arg>
+Expression literal(Arg&& arg) {
+  return literal(Datum(std::forward<Arg>(arg)));
+}
+
+ARROW_EXPORT
+Expression field_ref(FieldRef ref);
+
+ARROW_EXPORT
+Expression call(std::string function, std::vector<Expression> arguments,
+                std::shared_ptr<FunctionOptions> options = NULLPTR);
+
+template <typename Options, typename = typename std::enable_if<
+                                std::is_base_of<FunctionOptions, Options>::value>::type>
+Expression call(std::string function, std::vector<Expression> arguments,
+                Options options) {
+  return call(std::move(function), std::move(arguments),
+              std::make_shared<Options>(std::move(options)));
+}
+
+/// Assemble a list of all fields referenced by an Expression at any depth.
+ARROW_EXPORT
+std::vector<FieldRef> FieldsInExpression(const Expression&);
+
+/// Check if the expression references any fields.
+ARROW_EXPORT
+bool ExpressionHasFieldRefs(const Expression&);
+
+struct ARROW_EXPORT KnownFieldValues;
+
+/// Assemble a mapping from field references to known values. This derives known values
+/// from "equal" and "is_null" Expressions referencing a field and a literal.
+ARROW_EXPORT
+Result<KnownFieldValues> ExtractKnownFieldValues(
+    const Expression& guaranteed_true_predicate);
+
+/// @}
+
+/// \defgroup expression-passes Functions for modification of Expressions
+///
+/// @{
+///
+/// These transform bound expressions. Some transforms utilize a guarantee, which is
+/// provided as an Expression which is guaranteed to evaluate to true. The
+/// guaranteed_true_predicate need not be bound, but canonicalization is currently
+/// deferred to producers of guarantees. For example in order to be recognized as a
+/// guarantee on a field value, an Expression must be a call to "equal" with field_ref LHS
+/// and literal RHS. Flipping the arguments, "is_in" with a one-long value_set, ... or
+/// other semantically identical Expressions will not be recognized.
+
+/// Weak canonicalization which establishes guarantees for subsequent passes. Even
+/// equivalent Expressions may result in different canonicalized expressions.
+/// TODO this could be a strong canonicalization
+ARROW_EXPORT
+Result<Expression> Canonicalize(Expression, ExecContext* = NULLPTR);
+
+/// Simplify Expressions based on literal arguments (for example, add(null, x) will always
+/// be null so replace the call with a null literal). Includes early evaluation of all
+/// calls whose arguments are entirely literal.
+ARROW_EXPORT
+Result<Expression> FoldConstants(Expression);
+
+/// Simplify Expressions by replacing with known values of the fields which it references.
+ARROW_EXPORT
+Result<Expression> ReplaceFieldsWithKnownValues(const KnownFieldValues& known_values,
+                                                Expression);
+
+/// Simplify an expression by replacing subexpressions based on a guarantee:
+/// a boolean expression which is guaranteed to evaluate to `true`. For example, this is
+/// used to remove redundant function calls from a filter expression or to replace a
+/// reference to a constant-value field with a literal.
+ARROW_EXPORT
+Result<Expression> SimplifyWithGuarantee(Expression,
+                                         const Expression& guaranteed_true_predicate);
+
+/// Replace all named field refs (e.g. "x" or "x.y") with field paths (e.g. [0] or [1,3])
+///
+/// This isn't usually needed and does not offer any simplification by itself.  However,
+/// it can be useful to normalize an expression to paths to make it simpler to work with.
+ARROW_EXPORT Result<Expression> RemoveNamedRefs(Expression expression);
+
+/// @}
+
+// Execution
+
+/// Create an ExecBatch suitable for passing to ExecuteScalarExpression() from a
+/// RecordBatch which may have missing or incorrectly ordered columns.
+/// Missing fields will be replaced with null scalars.
+ARROW_EXPORT Result<ExecBatch> MakeExecBatch(const Schema& full_schema,
+                                             const Datum& partial,
+                                             Expression guarantee = literal(true));
+
+/// Execute a scalar expression against the provided state and input ExecBatch. This
+/// expression must be bound.
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const ExecBatch& input,
+                                      ExecContext* = NULLPTR);
+
+/// Convenience function for invoking against a RecordBatch
+ARROW_EXPORT
+Result<Datum> ExecuteScalarExpression(const Expression&, const Schema& full_schema,
+                                      const Datum& partial_input, ExecContext* = NULLPTR);
+
+// Serialization
+
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> Serialize(const Expression&);
+
+ARROW_EXPORT
+Result<Expression> Deserialize(std::shared_ptr<Buffer>);
+
+/// \defgroup expression-convenience Helpers for convenient expression creation
+///
+/// @{
+
+ARROW_EXPORT Expression project(std::vector<Expression> values,
+                                std::vector<std::string> names);
+
+ARROW_EXPORT Expression equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression not_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression less_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression greater_equal(Expression lhs, Expression rhs);
+
+ARROW_EXPORT Expression is_null(Expression lhs, bool nan_is_null = false);
+
+ARROW_EXPORT Expression is_valid(Expression lhs);
+
+ARROW_EXPORT Expression and_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression and_(const std::vector<Expression>&);
+ARROW_EXPORT Expression or_(Expression lhs, Expression rhs);
+ARROW_EXPORT Expression or_(const std::vector<Expression>&);
+ARROW_EXPORT Expression not_(Expression operand);
+
+/// @}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/function.h b/pyarrow/include/arrow/compute/function.h
new file mode 100644
index 0000000000000000000000000000000000000000..399081e2a7371f7e39c7cc5da73af8f524ee9b99
--- /dev/null
+++ b/pyarrow/include/arrow/compute/function.h
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle.
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+/// \addtogroup compute-functions
+/// @{
+
+/// \brief Contains the number of required arguments for the function.
+///
+/// Naming conventions taken from https://en.wikipedia.org/wiki/Arity.
+struct ARROW_EXPORT Arity {
+  /// \brief A function taking no arguments
+  static Arity Nullary() { return Arity(0, false); }
+
+  /// \brief A function taking 1 argument
+  static Arity Unary() { return Arity(1, false); }
+
+  /// \brief A function taking 2 arguments
+  static Arity Binary() { return Arity(2, false); }
+
+  /// \brief A function taking 3 arguments
+  static Arity Ternary() { return Arity(3, false); }
+
+  /// \brief A function taking a variable number of arguments
+  ///
+  /// \param[in] min_args the minimum number of arguments required when
+  /// invoking the function
+  static Arity VarArgs(int min_args = 0) { return Arity(min_args, true); }
+
+  // NOTE: the 0-argument form (default constructor) is required for Cython
+  explicit Arity(int num_args = 0, bool is_varargs = false)
+      : num_args(num_args), is_varargs(is_varargs) {}
+
+  /// The number of required arguments (or the minimum number for varargs
+  /// functions).
+  int num_args;
+
+  /// If true, then the num_args is the minimum number of required arguments.
+  bool is_varargs = false;
+};
+
+struct ARROW_EXPORT FunctionDoc {
+  /// \brief A one-line summary of the function, using a verb.
+  ///
+  /// For example, "Add two numeric arrays or scalars".
+  std::string summary;
+
+  /// \brief A detailed description of the function, meant to follow the summary.
+  std::string description;
+
+  /// \brief Symbolic names (identifiers) for the function arguments.
+  ///
+  /// Some bindings may use this to generate nicer function signatures.
+  std::vector<std::string> arg_names;
+
+  // TODO add argument descriptions?
+
+  /// \brief Name of the options class, if any.
+  std::string options_class;
+
+  /// \brief Whether options are required for function execution
+  ///
+  /// If false, then either the function does not have an options class
+  /// or there is a usable default options value.
+  bool options_required;
+
+  FunctionDoc() = default;
+
+  FunctionDoc(std::string summary, std::string description,
+              std::vector<std::string> arg_names, std::string options_class = "",
+              bool options_required = false)
+      : summary(std::move(summary)),
+        description(std::move(description)),
+        arg_names(std::move(arg_names)),
+        options_class(std::move(options_class)),
+        options_required(options_required) {}
+
+  static const FunctionDoc& Empty();
+};
+
+/// \brief An executor of a function with a preconfigured kernel
+class ARROW_EXPORT FunctionExecutor {
+ public:
+  virtual ~FunctionExecutor() = default;
+  /// \brief Initialize or re-initialize the preconfigured kernel
+  ///
+  /// This method may be called zero or more times. Depending on how
+  /// the FunctionExecutor was obtained, it may already have been initialized.
+  virtual Status Init(const FunctionOptions* options = NULLPTR,
+                      ExecContext* exec_ctx = NULLPTR) = 0;
+  /// \brief Execute the preconfigured kernel with arguments that must fit it
+  ///
+  /// The method requires the arguments be castable to the preconfigured types.
+  ///
+  /// \param[in] args Arguments to execute the function on
+  /// \param[in] length Length of arguments batch or -1 to default it. If the
+  /// function has no parameters, this determines the batch length, defaulting
+  /// to 0. Otherwise, if the function is scalar, this must equal the argument
+  /// batch's inferred length or be -1 to default to it. This is ignored for
+  /// vector functions.
+  virtual Result<Datum> Execute(const std::vector<Datum>& args, int64_t length = -1) = 0;
+};
+
+/// \brief Base class for compute functions. Function implementations contain a
+/// collection of "kernels" which are implementations of the function for
+/// specific argument types. Selecting a viable kernel for executing a function
+/// is referred to as "dispatching".
+class ARROW_EXPORT Function {
+ public:
+  /// \brief The kind of function, which indicates in what contexts it is
+  /// valid for use.
+  enum Kind {
+    /// A function that performs scalar data operations on whole arrays of
+    /// data. Can generally process Array or Scalar values. The size of the
+    /// output will be the same as the size (or broadcasted size, in the case
+    /// of mixing Array and Scalar inputs) of the input.
+    SCALAR,
+
+    /// A function with array input and output whose behavior depends on the
+    /// values of the entire arrays passed, rather than the value of each scalar
+    /// value.
+    VECTOR,
+
+    /// A function that computes scalar summary statistics from array input.
+    SCALAR_AGGREGATE,
+
+    /// A function that computes grouped summary statistics from array input
+    /// and an array of group identifiers.
+    HASH_AGGREGATE,
+
+    /// A function that dispatches to other functions and does not contain its
+    /// own kernels.
+    META
+  };
+
+  virtual ~Function() = default;
+
+  /// \brief The name of the kernel. The registry enforces uniqueness of names.
+  const std::string& name() const { return name_; }
+
+  /// \brief The kind of kernel, which indicates in what contexts it is valid
+  /// for use.
+  Function::Kind kind() const { return kind_; }
+
+  /// \brief Contains the number of arguments the function requires, or if the
+  /// function accepts variable numbers of arguments.
+  const Arity& arity() const { return arity_; }
+
+  /// \brief Return the function documentation
+  const FunctionDoc& doc() const { return doc_; }
+
+  /// \brief Returns the number of registered kernels for this function.
+  virtual int num_kernels() const = 0;
+
+  /// \brief Return a kernel that can execute the function given the exact
+  /// argument types (without implicit type casts).
+  ///
+  /// NB: This function is overridden in CastFunction.
+  virtual Result<const Kernel*> DispatchExact(const std::vector<TypeHolder>& types) const;
+
+  /// \brief Return a best-match kernel that can execute the function given the argument
+  /// types, after implicit casts are applied.
+  ///
+  /// \param[in,out] values Argument types. An element may be modified to
+  /// indicate that the returned kernel only approximately matches the input
+  /// value descriptors; callers are responsible for casting inputs to the type
+  /// required by the kernel.
+  virtual Result<const Kernel*> DispatchBest(std::vector<TypeHolder>* values) const;
+
+  /// \brief Get a function executor with a best-matching kernel
+  ///
+  /// The returned executor will by default work with the default FunctionOptions
+  /// and KernelContext. If you want to change that, call `FunctionExecutor::Init`.
+  virtual Result<std::shared_ptr<FunctionExecutor>> GetBestExecutor(
+      std::vector<TypeHolder> inputs) const;
+
+  /// \brief Execute the function eagerly with the passed input arguments with
+  /// kernel dispatch, batch iteration, and memory allocation details taken
+  /// care of.
+  ///
+  /// If the `options` pointer is null, then `default_options()` will be used.
+  ///
+  /// This function can be overridden in subclasses.
+  virtual Result<Datum> Execute(const std::vector<Datum>& args,
+                                const FunctionOptions* options, ExecContext* ctx) const;
+
+  virtual Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
+                                ExecContext* ctx) const;
+
+  /// \brief Returns the default options for this function.
+  ///
+  /// Whatever option semantics a Function has, implementations must guarantee
+  /// that default_options() is valid to pass to Execute as options.
+  const FunctionOptions* default_options() const { return default_options_; }
+
+  virtual Status Validate() const;
+
+  /// \brief Returns the pure property for this function.
+  ///
+  /// Impure functions are those that may return different results for the same
+  /// input arguments. For example, a function that returns a random number is
+  /// not pure. An expression containing only pure functions can be simplified by
+  /// pre-evaluating any sub-expressions that have constant arguments.
+  virtual bool is_pure() const { return true; }
+
+ protected:
+  Function(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
+           const FunctionOptions* default_options)
+      : name_(std::move(name)),
+        kind_(kind),
+        arity_(arity),
+        doc_(std::move(doc)),
+        default_options_(default_options) {}
+
+  Status CheckArity(size_t num_args) const;
+
+  std::string name_;
+  Function::Kind kind_;
+  Arity arity_;
+  const FunctionDoc doc_;
+  const FunctionOptions* default_options_ = NULLPTR;
+};
+
+namespace detail {
+
+template <typename KernelType>
+class FunctionImpl : public Function {
+ public:
+  /// \brief Return pointers to current-available kernels for inspection
+  std::vector<const KernelType*> kernels() const {
+    std::vector<const KernelType*> result;
+    for (const auto& kernel : kernels_) {
+      result.push_back(&kernel);
+    }
+    return result;
+  }
+
+  int num_kernels() const override { return static_cast<int>(kernels_.size()); }
+
+ protected:
+  FunctionImpl(std::string name, Function::Kind kind, const Arity& arity, FunctionDoc doc,
+               const FunctionOptions* default_options)
+      : Function(std::move(name), kind, arity, std::move(doc), default_options) {}
+
+  std::vector<KernelType> kernels_;
+};
+
+/// \brief Look up a kernel in a function. If no Kernel is found, nullptr is returned.
+ARROW_EXPORT
+const Kernel* DispatchExactImpl(const Function* func, const std::vector<TypeHolder>&);
+
+/// \brief Return an error message if no Kernel is found.
+ARROW_EXPORT
+Status NoMatchingKernel(const Function* func, const std::vector<TypeHolder>&);
+
+}  // namespace detail
+
+/// \brief A function that executes elementwise operations on arrays or
+/// scalars, and therefore whose results generally do not depend on the order
+/// of the values in the arguments. Accepts and returns arrays that are all of
+/// the same size. These functions roughly correspond to the functions used in
+/// SQL expressions.
+class ARROW_EXPORT ScalarFunction : public detail::FunctionImpl<ScalarKernel> {
+ public:
+  using KernelType = ScalarKernel;
+
+  ScalarFunction(std::string name, const Arity& arity, FunctionDoc doc,
+                 const FunctionOptions* default_options = NULLPTR, bool is_pure = true)
+      : detail::FunctionImpl<ScalarKernel>(std::move(name), Function::SCALAR, arity,
+                                           std::move(doc), default_options),
+        is_pure_(is_pure) {}
+
+  /// \brief Add a kernel with given input/output types, no required state
+  /// initialization, preallocation for fixed-width types, and default null
+  /// handling (intersect validity bitmaps of inputs).
+  Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
+                   ArrayKernelExec exec, KernelInit init = NULLPTR,
+                   std::shared_ptr<MatchConstraint> constraint = NULLPTR);
+
+  /// \brief Add a kernel (function implementation). Returns error if the
+  /// kernel's signature does not match the function's arity.
+  Status AddKernel(ScalarKernel kernel);
+
+  /// \brief Returns the pure property for this function.
+  bool is_pure() const override { return is_pure_; }
+
+ private:
+  const bool is_pure_;
+};
+
+/// \brief A function that executes general array operations that may yield
+/// outputs of different sizes or have results that depend on the whole array
+/// contents. These functions roughly correspond to the functions found in
+/// non-SQL array languages like APL and its derivatives.
+class ARROW_EXPORT VectorFunction : public detail::FunctionImpl<VectorKernel> {
+ public:
+  using KernelType = VectorKernel;
+
+  VectorFunction(std::string name, const Arity& arity, FunctionDoc doc,
+                 const FunctionOptions* default_options = NULLPTR)
+      : detail::FunctionImpl<VectorKernel>(std::move(name), Function::VECTOR, arity,
+                                           std::move(doc), default_options) {}
+
+  /// \brief Add a simple kernel with given input/output types, no required
+  /// state initialization, no data preallocation, and no preallocation of the
+  /// validity bitmap.
+  Status AddKernel(std::vector<InputType> in_types, OutputType out_type,
+                   ArrayKernelExec exec, KernelInit init = NULLPTR);
+
+  /// \brief Add a kernel (function implementation). Returns error if the
+  /// kernel's signature does not match the function's arity.
+  Status AddKernel(VectorKernel kernel);
+};
+
+class ARROW_EXPORT ScalarAggregateFunction
+    : public detail::FunctionImpl<ScalarAggregateKernel> {
+ public:
+  using KernelType = ScalarAggregateKernel;
+
+  ScalarAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
+                          const FunctionOptions* default_options = NULLPTR)
+      : detail::FunctionImpl<ScalarAggregateKernel>(std::move(name),
+                                                    Function::SCALAR_AGGREGATE, arity,
+                                                    std::move(doc), default_options) {}
+
+  /// \brief Add a kernel (function implementation). Returns error if the
+  /// kernel's signature does not match the function's arity.
+  Status AddKernel(ScalarAggregateKernel kernel);
+};
+
+class ARROW_EXPORT HashAggregateFunction
+    : public detail::FunctionImpl<HashAggregateKernel> {
+ public:
+  using KernelType = HashAggregateKernel;
+
+  HashAggregateFunction(std::string name, const Arity& arity, FunctionDoc doc,
+                        const FunctionOptions* default_options = NULLPTR)
+      : detail::FunctionImpl<HashAggregateKernel>(std::move(name),
+                                                  Function::HASH_AGGREGATE, arity,
+                                                  std::move(doc), default_options) {}
+
+  /// \brief Add a kernel (function implementation). Returns error if the
+  /// kernel's signature does not match the function's arity.
+  Status AddKernel(HashAggregateKernel kernel);
+};
+
+/// \brief A function that dispatches to other functions. Must implement
+/// MetaFunction::ExecuteImpl.
+///
+/// For Array, ChunkedArray, and Scalar Datum kinds, may rely on the execution
+/// of concrete Function types, but must handle other Datum kinds on its own.
+class ARROW_EXPORT MetaFunction : public Function {
+ public:
+  int num_kernels() const override { return 0; }
+
+  Result<Datum> Execute(const std::vector<Datum>& args, const FunctionOptions* options,
+                        ExecContext* ctx) const override;
+
+  Result<Datum> Execute(const ExecBatch& batch, const FunctionOptions* options,
+                        ExecContext* ctx) const override;
+
+ protected:
+  virtual Result<Datum> ExecuteImpl(const std::vector<Datum>& args,
+                                    const FunctionOptions* options,
+                                    ExecContext* ctx) const = 0;
+
+  MetaFunction(std::string name, const Arity& arity, FunctionDoc doc,
+               const FunctionOptions* default_options = NULLPTR)
+      : Function(std::move(name), Function::META, arity, std::move(doc),
+                 default_options) {}
+};
+
+/// @}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/function_options.h b/pyarrow/include/arrow/compute/function_options.h
new file mode 100644
index 0000000000000000000000000000000000000000..88ec2fd2d0679b5c849549179aa652bec9b37b56
--- /dev/null
+++ b/pyarrow/include/arrow/compute/function_options.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle.
+
+#pragma once
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+/// \addtogroup compute-functions
+/// @{
+
+/// \brief Extension point for defining options outside libarrow (but
+/// still within this project).
+class ARROW_EXPORT FunctionOptionsType {
+ public:
+  virtual ~FunctionOptionsType() = default;
+
+  virtual const char* type_name() const = 0;
+  virtual std::string Stringify(const FunctionOptions&) const = 0;
+  virtual bool Compare(const FunctionOptions&, const FunctionOptions&) const = 0;
+  virtual Result<std::shared_ptr<Buffer>> Serialize(const FunctionOptions&) const;
+  virtual Result<std::unique_ptr<FunctionOptions>> Deserialize(
+      const Buffer& buffer) const;
+  virtual std::unique_ptr<FunctionOptions> Copy(const FunctionOptions&) const = 0;
+};
+
+/// \brief Base class for specifying options configuring a function's behavior,
+/// such as error handling.
+class ARROW_EXPORT FunctionOptions : public util::EqualityComparable<FunctionOptions> {
+ public:
+  virtual ~FunctionOptions() = default;
+
+  const FunctionOptionsType* options_type() const { return options_type_; }
+  const char* type_name() const { return options_type()->type_name(); }
+
+  bool Equals(const FunctionOptions& other) const;
+  std::string ToString() const;
+  std::unique_ptr<FunctionOptions> Copy() const;
+  /// \brief Serialize an options struct to a buffer.
+  Result<std::shared_ptr<Buffer>> Serialize() const;
+  /// \brief Deserialize an options struct from a buffer.
+  /// Note: this will only look for `type_name` in the default FunctionRegistry;
+  /// to use a custom FunctionRegistry, look up the FunctionOptionsType, then
+  /// call FunctionOptionsType::Deserialize().
+  static Result<std::unique_ptr<FunctionOptions>> Deserialize(
+      const std::string& type_name, const Buffer& buffer);
+
+ protected:
+  explicit FunctionOptions(const FunctionOptionsType* type) : options_type_(type) {}
+  const FunctionOptionsType* options_type_;
+};
+
+ARROW_EXPORT void PrintTo(const FunctionOptions&, std::ostream*);
+
+/// @}
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/initialize.h b/pyarrow/include/arrow/compute/initialize.h
new file mode 100644
index 0000000000000000000000000000000000000000..db5e231325bab4c944e086078780ac7302008c77
--- /dev/null
+++ b/pyarrow/include/arrow/compute/initialize.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/compute/visibility.h"
+#include "arrow/status.h"
+
+namespace arrow::compute {
+
+/// \brief Initialize the compute module.
+///
+/// Register the compute kernel functions to be available on the
+/// global FunctionRegistry.
+/// This function will only be available if ARROW_COMPUTE is enabled.
+ARROW_COMPUTE_EXPORT Status Initialize();
+
+}  // namespace arrow::compute
diff --git a/pyarrow/include/arrow/compute/kernel.h b/pyarrow/include/arrow/compute/kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d4f9d6ff436de470801f061b4e66a5e58876286
--- /dev/null
+++ b/pyarrow/include/arrow/compute/kernel.h
@@ -0,0 +1,772 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compute/exec.h"
+#include "arrow/datum.h"
+#include "arrow/device_allocation_type_set.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+// macOS defines PREALLOCATE as a preprocessor macro in the header sys/vnode.h.
+// No other BSD seems to do so. The name is used as an identifier in MemAllocation enum.
+#if defined(__APPLE__) && defined(PREALLOCATE)
+#  undef PREALLOCATE
+#endif
+
+namespace arrow {
+namespace compute {
+
+class FunctionOptions;
+
+/// \brief Base class for opaque kernel-specific state. For example, if there
+/// is some kind of initialization required.
+struct ARROW_EXPORT KernelState {
+  virtual ~KernelState() = default;
+};
+
+/// \brief Context/state for the execution of a particular kernel.
+class ARROW_EXPORT KernelContext {
+ public:
+  // Can pass optional backreference; not used consistently for the
+  // moment but will be made so in the future
+  explicit KernelContext(ExecContext* exec_ctx, const Kernel* kernel = NULLPTR)
+      : exec_ctx_(exec_ctx), kernel_(kernel) {}
+
+  /// \brief Allocate buffer from the context's memory pool. The contents are
+  /// not initialized.
+  Result<std::shared_ptr<ResizableBuffer>> Allocate(int64_t nbytes);
+
+  /// \brief Allocate buffer for bitmap from the context's memory pool. Like
+  /// Allocate, the contents of the buffer are not initialized but the last
+  /// byte is preemptively zeroed to help avoid ASAN or valgrind issues.
+  Result<std::shared_ptr<ResizableBuffer>> AllocateBitmap(int64_t num_bits);
+
+  /// \brief Assign the active KernelState to be utilized for each stage of
+  /// kernel execution. Ownership and memory lifetime of the KernelState must
+  /// be minded separately.
+  void SetState(KernelState* state) { state_ = state; }
+
+  // Set kernel that is being invoked since some kernel
+  // implementations will examine the kernel state.
+  void SetKernel(const Kernel* kernel) { kernel_ = kernel; }
+
+  KernelState* state() { return state_; }
+
+  /// \brief Configuration related to function execution that is to be shared
+  /// across multiple kernels.
+  ExecContext* exec_context() { return exec_ctx_; }
+
+  /// \brief The memory pool to use for allocations. For now, it uses the
+  /// MemoryPool contained in the ExecContext used to create the KernelContext.
+  MemoryPool* memory_pool() { return exec_ctx_->memory_pool(); }
+
+  const Kernel* kernel() const { return kernel_; }
+
+ private:
+  ExecContext* exec_ctx_;
+  KernelState* state_ = NULLPTR;
+  const Kernel* kernel_ = NULLPTR;
+};
+
+/// \brief An type-checking interface to permit customizable validation rules
+/// for use with InputType and KernelSignature. This is for scenarios where the
+/// acceptance is not an exact type instance, such as a TIMESTAMP type for a
+/// specific TimeUnit, but permitting any time zone.
+struct ARROW_EXPORT TypeMatcher {
+  virtual ~TypeMatcher() = default;
+
+  /// \brief Return true if this matcher accepts the data type.
+  virtual bool Matches(const DataType& type) const = 0;
+
+  /// \brief A human-interpretable string representation of what the type
+  /// matcher checks for, usable when printing KernelSignature or formatting
+  /// error messages.
+  virtual std::string ToString() const = 0;
+
+  /// \brief Return true if this TypeMatcher contains the same matching rule as
+  /// the other. Currently depends on RTTI.
+  virtual bool Equals(const TypeMatcher& other) const = 0;
+};
+
+namespace match {
+
+/// \brief Match any DataType instance having the same DataType::id.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> SameTypeId(Type::type type_id);
+
+/// \brief Match any TimestampType instance having the same unit, but the time
+/// zones can be different.
+ARROW_EXPORT std::shared_ptr<TypeMatcher> TimestampTypeUnit(TimeUnit::type unit);
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Time32TypeUnit(TimeUnit::type unit);
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Time64TypeUnit(TimeUnit::type unit);
+ARROW_EXPORT std::shared_ptr<TypeMatcher> DurationTypeUnit(TimeUnit::type unit);
+
+// \brief Match any integer type
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Integer();
+
+// Match types using 32-bit varbinary representation
+ARROW_EXPORT std::shared_ptr<TypeMatcher> BinaryLike();
+
+// Match types using 64-bit varbinary representation
+ARROW_EXPORT std::shared_ptr<TypeMatcher> LargeBinaryLike();
+
+// Match any fixed binary type
+ARROW_EXPORT std::shared_ptr<TypeMatcher> FixedSizeBinaryLike();
+
+// \brief Match any primitive type (boolean or any type representable as a C
+// Type)
+ARROW_EXPORT std::shared_ptr<TypeMatcher> Primitive();
+
+// \brief Match any integer type that can be used as run-end in run-end encoded
+// arrays
+ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
+
+/// \brief Match run-end encoded types that use any valid run-end type and
+/// encode specific value types
+///
+/// @param[in] value_type_matcher a matcher that is applied to the values field
+ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
+    std::shared_ptr<TypeMatcher> value_type_matcher);
+
+/// \brief Match run-end encoded types that use any valid run-end type and
+/// encode specific value types
+///
+/// @param[in] value_type_id a type id that the type of the values field should match
+ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);
+
+/// \brief Match run-end encoded types that encode specific run-end and value types
+///
+/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
+/// @param[in] value_type_matcher a matcher that is applied to the values field
+ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
+    std::shared_ptr<TypeMatcher> run_end_type_matcher,
+    std::shared_ptr<TypeMatcher> value_type_matcher);
+
+}  // namespace match
+
+/// \brief An object used for type-checking arguments to be passed to a kernel
+/// and stored in a KernelSignature. The type-checking rule can be supplied
+/// either with an exact DataType instance or a custom TypeMatcher.
+class ARROW_EXPORT InputType {
+ public:
+  /// \brief The kind of type-checking rule that the InputType contains.
+  enum Kind {
+    /// \brief Accept any value type.
+    ANY_TYPE,
+
+    /// \brief A fixed arrow::DataType and will only exact match having this
+    /// exact type (e.g. same TimestampType unit, same decimal scale and
+    /// precision, or same nested child types).
+    EXACT_TYPE,
+
+    /// \brief Uses a TypeMatcher implementation to check the type.
+    USE_TYPE_MATCHER
+  };
+
+  /// \brief Accept any value type
+  InputType() : kind_(ANY_TYPE) {}
+
+  /// \brief Accept an exact value type.
+  InputType(std::shared_ptr<DataType> type)  // NOLINT implicit construction
+      : kind_(EXACT_TYPE), type_(std::move(type)) {}
+
+  /// \brief Use the passed TypeMatcher to type check.
+  InputType(std::shared_ptr<TypeMatcher> type_matcher)  // NOLINT implicit construction
+      : kind_(USE_TYPE_MATCHER), type_matcher_(std::move(type_matcher)) {}
+
+  /// \brief Match any type with the given Type::type. Uses a TypeMatcher for
+  /// its implementation.
+  InputType(Type::type type_id)  // NOLINT implicit construction
+      : InputType(match::SameTypeId(type_id)) {}
+
+  InputType(const InputType& other) { CopyInto(other); }
+
+  void operator=(const InputType& other) { CopyInto(other); }
+
+  InputType(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
+
+  void operator=(InputType&& other) { MoveInto(std::forward<InputType>(other)); }
+
+  // \brief Match any input (array, scalar of any type)
+  static InputType Any() { return InputType(); }
+
+  /// \brief Return true if this input type matches the same type cases as the
+  /// other.
+  bool Equals(const InputType& other) const;
+
+  bool operator==(const InputType& other) const { return this->Equals(other); }
+
+  bool operator!=(const InputType& other) const { return !(*this == other); }
+
+  /// \brief Return hash code.
+  size_t Hash() const;
+
+  /// \brief Render a human-readable string representation.
+  std::string ToString() const;
+
+  /// \brief Return true if the Datum matches this argument kind in
+  /// type (and only allows scalar or array-like Datums).
+  bool Matches(const Datum& value) const;
+
+  /// \brief Return true if the type matches this InputType
+  bool Matches(const DataType& type) const;
+
+  /// \brief The type matching rule that this InputType uses.
+  Kind kind() const { return kind_; }
+
+  /// \brief For InputType::EXACT_TYPE kind, the exact type that this InputType
+  /// must match. Otherwise this function should not be used and will assert in
+  /// debug builds.
+  const std::shared_ptr<DataType>& type() const;
+
+  /// \brief For InputType::USE_TYPE_MATCHER, the TypeMatcher to be used for
+  /// checking the type of a value. Otherwise this function should not be used
+  /// and will assert in debug builds.
+  const TypeMatcher& type_matcher() const;
+
+ private:
+  void CopyInto(const InputType& other) {
+    this->kind_ = other.kind_;
+    this->type_ = other.type_;
+    this->type_matcher_ = other.type_matcher_;
+  }
+
+  void MoveInto(InputType&& other) {
+    this->kind_ = other.kind_;
+    this->type_ = std::move(other.type_);
+    this->type_matcher_ = std::move(other.type_matcher_);
+  }
+
+  Kind kind_;
+
+  // For EXACT_TYPE Kind
+  std::shared_ptr<DataType> type_;
+
+  // For USE_TYPE_MATCHER Kind
+  std::shared_ptr<TypeMatcher> type_matcher_;
+};
+
+/// \brief Container to capture both exact and input-dependent output types.
+class ARROW_EXPORT OutputType {
+ public:
+  /// \brief An enum indicating whether the value type is an invariant fixed
+  /// value or one that's computed by a kernel-defined resolver function.
+  enum ResolveKind { FIXED, COMPUTED };
+
+  /// Type resolution function. Given input types, return output type.  This
+  /// function MAY may use the kernel state to decide the output type based on
+  /// the FunctionOptions.
+  ///
+  /// This function SHOULD _not_ be used to check for arity, that is to be
+  /// performed one or more layers above.
+  using Resolver =
+      std::function<Result<TypeHolder>(KernelContext*, const std::vector<TypeHolder>&)>;
+
+  /// \brief Output an exact type
+  OutputType(std::shared_ptr<DataType> type)  // NOLINT implicit construction
+      : kind_(FIXED), type_(std::move(type)) {}
+
+  /// \brief Output a computed type depending on actual input types
+  template <typename Fn>
+  OutputType(Fn resolver)  // NOLINT implicit construction
+      : kind_(COMPUTED), resolver_(std::move(resolver)) {}
+
+  OutputType(const OutputType& other) {
+    this->kind_ = other.kind_;
+    this->type_ = other.type_;
+    this->resolver_ = other.resolver_;
+  }
+
+  OutputType(OutputType&& other) {
+    this->kind_ = other.kind_;
+    this->type_ = std::move(other.type_);
+    this->resolver_ = other.resolver_;
+  }
+
+  OutputType& operator=(const OutputType&) = default;
+  OutputType& operator=(OutputType&&) = default;
+
+  /// \brief Return the type of the expected output value of the kernel given
+  /// the input argument types. The resolver may make use of state information
+  /// kept in the KernelContext.
+  Result<TypeHolder> Resolve(KernelContext* ctx,
+                             const std::vector<TypeHolder>& args) const;
+
+  /// \brief The exact output value type for the FIXED kind.
+  const std::shared_ptr<DataType>& type() const;
+
+  /// \brief For use with COMPUTED resolution strategy. It may be more
+  /// convenient to invoke this with OutputType::Resolve returned from this
+  /// method.
+  const Resolver& resolver() const;
+
+  /// \brief Render a human-readable string representation.
+  std::string ToString() const;
+
+  /// \brief Return the kind of type resolution of this output type, whether
+  /// fixed/invariant or computed by a resolver.
+  ResolveKind kind() const { return kind_; }
+
+ private:
+  ResolveKind kind_;
+
+  // For FIXED resolution
+  std::shared_ptr<DataType> type_;
+
+  // For COMPUTED resolution
+  Resolver resolver_ = NULLPTR;
+};
+
+/// \brief Additional constraints to apply to the input types of a kernel when matching a
+/// specific kernel signature.
+class ARROW_EXPORT MatchConstraint {
+ public:
+  virtual ~MatchConstraint() = default;
+
+  /// \brief Return true if the input types satisfy the constraint.
+  virtual bool Matches(const std::vector<TypeHolder>& types) const = 0;
+
+  /// \brief Convenience function to create a MatchConstraint from a match function.
+  static std::shared_ptr<MatchConstraint> Make(
+      std::function<bool(const std::vector<TypeHolder>&)> matches);
+};
+
+/// \brief Constraint that all input types are decimal types and have the same scale.
+ARROW_EXPORT std::shared_ptr<MatchConstraint> DecimalsHaveSameScale();
+
+/// \brief Holds the input types, optional match constraint and output type of the kernel.
+///
+/// VarArgs functions with minimum N arguments should pass up to N input types to be
+/// used to validate the input types of a function invocation. The first N-1 types
+/// will be matched against the first N-1 arguments, and the last type will be
+/// matched against the remaining arguments.
+class ARROW_EXPORT KernelSignature {
+ public:
+  KernelSignature(std::vector<InputType> in_types, OutputType out_type,
+                  bool is_varargs = false,
+                  std::shared_ptr<MatchConstraint> constraint = NULLPTR);
+
+  /// \brief Convenience ctor since make_shared can be awkward
+  static std::shared_ptr<KernelSignature> Make(
+      std::vector<InputType> in_types, OutputType out_type, bool is_varargs = false,
+      std::shared_ptr<MatchConstraint> constraint = NULLPTR);
+
+  /// \brief Return true if the signature is compatible with the list of input
+  /// value descriptors and satisfies the match constraint, if any.
+  bool MatchesInputs(const std::vector<TypeHolder>& types) const;
+
+  /// \brief Returns true if the input types of each signature are
+  /// equal. Well-formed functions should have a deterministic output type
+  /// given input types, but currently it is the responsibility of the
+  /// developer to ensure this.
+  bool Equals(const KernelSignature& other) const;
+
+  bool operator==(const KernelSignature& other) const { return this->Equals(other); }
+
+  bool operator!=(const KernelSignature& other) const { return !(*this == other); }
+
+  /// \brief Compute a hash code for the signature
+  size_t Hash() const;
+
+  /// \brief The input types for the kernel. For VarArgs functions, this should
+  /// generally contain a single validator to use for validating all of the
+  /// function arguments.
+  const std::vector<InputType>& in_types() const { return in_types_; }
+
+  /// \brief The output type for the kernel. Use Resolve to return the
+  /// exact output given input argument types, since many kernels'
+  /// output types depend on their input types (or their type
+  /// metadata).
+  const OutputType& out_type() const { return out_type_; }
+
+  /// \brief Render a human-readable string representation
+  std::string ToString() const;
+
+  bool is_varargs() const { return is_varargs_; }
+
+ private:
+  std::vector<InputType> in_types_;
+  OutputType out_type_;
+  bool is_varargs_;
+  std::shared_ptr<MatchConstraint> constraint_;
+
+  // For caching the hash code after it's computed the first time
+  mutable uint64_t hash_code_;
+};
+
+/// \brief A function may contain multiple variants of a kernel for a given
+/// type combination for different SIMD levels. Based on the active system's
+/// CPU info or the user's preferences, we can elect to use one over the other.
+struct SimdLevel {
+  enum type { NONE = 0, SSE4_2, AVX, AVX2, AVX512, NEON, MAX };
+};
+
+/// \brief The strategy to use for propagating or otherwise populating the
+/// validity bitmap of a kernel output.
+struct NullHandling {
+  enum type {
+    /// Compute the output validity bitmap by intersecting the validity bitmaps
+    /// of the arguments using bitwise-and operations. This means that values
+    /// in the output are valid/non-null only if the corresponding values in
+    /// all input arguments were valid/non-null. Kernel generally need not
+    /// touch the bitmap thereafter, but a kernel's exec function is permitted
+    /// to alter the bitmap after the null intersection is computed if it needs
+    /// to.
+    INTERSECTION,
+
+    /// Kernel expects a pre-allocated buffer to write the result bitmap
+    /// into. The preallocated memory is not zeroed (except for the last byte),
+    /// so the kernel should ensure to completely populate the bitmap.
+    COMPUTED_PREALLOCATE,
+
+    /// Kernel allocates and sets the validity bitmap of the output.
+    COMPUTED_NO_PREALLOCATE,
+
+    /// Kernel output is never null and a validity bitmap does not need to be
+    /// allocated.
+    OUTPUT_NOT_NULL
+  };
+};
+
+/// \brief The preference for memory preallocation of fixed-width type outputs
+/// in kernel execution.
+struct MemAllocation {
+  enum type {
+    // For data types that support pre-allocation (i.e. fixed-width), the
+    // kernel expects to be provided a pre-allocated data buffer to write
+    // into. Non-fixed-width types must always allocate their own data
+    // buffers. The allocation made for the same length as the execution batch,
+    // so vector kernels yielding differently sized output should not use this.
+    //
+    // It is valid for the data to not be preallocated but the validity bitmap
+    // is (or is computed using the intersection/bitwise-and method).
+    //
+    // For variable-size output types like BinaryType or StringType, or for
+    // nested types, this option has no effect.
+    PREALLOCATE,
+
+    // The kernel is responsible for allocating its own data buffer for
+    // fixed-width type outputs.
+    NO_PREALLOCATE
+  };
+};
+
+struct Kernel;
+
+/// \brief Arguments to pass to an KernelInit function. A struct is used to help
+/// avoid API breakage should the arguments passed need to be expanded.
+struct KernelInitArgs {
+  /// \brief A pointer to the kernel being initialized. The init function may
+  /// depend on the kernel's KernelSignature or other data contained there.
+  const Kernel* kernel;
+
+  /// \brief The types of the input arguments that the kernel is
+  /// about to be executed against.
+  const std::vector<TypeHolder>& inputs;
+
+  /// \brief Opaque options specific to this kernel. May be nullptr for functions
+  /// that do not require options.
+  const FunctionOptions* options;
+};
+
+/// \brief Common initializer function for all kernel types.
+using KernelInit = std::function<Result<std::unique_ptr<KernelState>>(
+    KernelContext*, const KernelInitArgs&)>;
+
+/// \brief Base type for kernels. Contains the function signature and
+/// optionally the state initialization function, along with some common
+/// attributes
+struct ARROW_EXPORT Kernel {
+  Kernel() = default;
+
+  Kernel(std::shared_ptr<KernelSignature> sig, KernelInit init)
+      : signature(std::move(sig)), init(std::move(init)) {}
+
+  Kernel(std::vector<InputType> in_types, OutputType out_type, KernelInit init)
+      : Kernel(KernelSignature::Make(std::move(in_types), std::move(out_type)),
+               std::move(init)) {}
+
+  /// \brief The "signature" of the kernel containing the InputType input
+  /// argument validators and OutputType output type resolver.
+  std::shared_ptr<KernelSignature> signature;
+
+  /// \brief Create a new KernelState for invocations of this kernel, e.g. to
+  /// set up any options or state relevant for execution.
+  KernelInit init;
+
+  /// \brief Create a vector of new KernelState for invocations of this kernel.
+  static Status InitAll(KernelContext*, const KernelInitArgs&,
+                        std::vector<std::unique_ptr<KernelState>>*);
+
+  /// \brief Indicates whether execution can benefit from parallelization
+  /// (splitting large chunks into smaller chunks and using multiple
+  /// threads). Some kernels may not support parallel execution at
+  /// all. Synchronization and concurrency-related issues are currently the
+  /// responsibility of the Kernel's implementation.
+  bool parallelizable = true;
+
+  /// \brief Indicates the level of SIMD instruction support in the host CPU is
+  /// required to use the function. The intention is for functions to be able to
+  /// contain multiple kernels with the same signature but different levels of SIMD,
+  /// so that the most optimized kernel supported on a host's processor can be chosen.
+  SimdLevel::type simd_level = SimdLevel::NONE;
+
+  // Additional kernel-specific data
+  std::shared_ptr<KernelState> data;
+};
+
+/// \brief The scalar kernel execution API that must be implemented for SCALAR
+/// kernel types. This includes both stateless and stateful kernels. Kernels
+/// depending on some execution state access that state via subclasses of
+/// KernelState set on the KernelContext object. Implementations should
+/// endeavor to write into pre-allocated memory if they are able, though for
+/// some kernels (e.g. in cases when a builder like StringBuilder) must be
+/// employed this may not be possible.
+using ArrayKernelExec = Status (*)(KernelContext*, const ExecSpan&, ExecResult*);
+
+/// \brief Kernel data structure for implementations of ScalarFunction. In
+/// addition to the members found in Kernel, contains the null handling
+/// and memory pre-allocation preferences.
+struct ARROW_EXPORT ScalarKernel : public Kernel {
+  ScalarKernel() = default;
+
+  ScalarKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
+               KernelInit init = NULLPTR)
+      : Kernel(std::move(sig), init), exec(exec) {}
+
+  ScalarKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
+               KernelInit init = NULLPTR)
+      : Kernel(std::move(in_types), std::move(out_type), std::move(init)), exec(exec) {}
+
+  /// \brief Perform a single invocation of this kernel. Depending on the
+  /// implementation, it may only write into preallocated memory, while in some
+  /// cases it will allocate its own memory. Any required state is managed
+  /// through the KernelContext.
+  ArrayKernelExec exec;
+
+  /// \brief Writing execution results into larger contiguous allocations
+  /// requires that the kernel be able to write into sliced output ArrayData*,
+  /// including sliced output validity bitmaps. Some kernel implementations may
+  /// not be able to do this, so setting this to false disables this
+  /// functionality.
+  bool can_write_into_slices = true;
+
+  // For scalar functions preallocated data and intersecting arg validity
+  // bitmaps is a reasonable default
+  NullHandling::type null_handling = NullHandling::INTERSECTION;
+  MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE;
+};
+
+// ----------------------------------------------------------------------
+// VectorKernel (for VectorFunction)
+
+/// \brief Kernel data structure for implementations of VectorFunction. In
+/// contains an optional finalizer function, the null handling and memory
+/// pre-allocation preferences (which have different defaults from
+/// ScalarKernel), and some other execution-related options.
+struct ARROW_EXPORT VectorKernel : public Kernel {
+  /// \brief See VectorKernel::finalize member for usage
+  using FinalizeFunc = std::function<Status(KernelContext*, std::vector<Datum>*)>;
+
+  /// \brief Function for executing a stateful VectorKernel against a
+  /// ChunkedArray input. Does not need to be defined for all VectorKernels
+  using ChunkedExec = Status (*)(KernelContext*, const ExecBatch&, Datum* out);
+
+  VectorKernel() = default;
+
+  VectorKernel(std::vector<InputType> in_types, OutputType out_type, ArrayKernelExec exec,
+               KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
+      : Kernel(std::move(in_types), std::move(out_type), std::move(init)),
+        exec(exec),
+        finalize(std::move(finalize)) {}
+
+  VectorKernel(std::shared_ptr<KernelSignature> sig, ArrayKernelExec exec,
+               KernelInit init = NULLPTR, FinalizeFunc finalize = NULLPTR)
+      : Kernel(std::move(sig), std::move(init)),
+        exec(exec),
+        finalize(std::move(finalize)) {}
+
+  /// \brief Perform a single invocation of this kernel. Any required state is
+  /// managed through the KernelContext.
+  ArrayKernelExec exec;
+
+  /// \brief Execute the kernel on a ChunkedArray. Does not need to be defined
+  ChunkedExec exec_chunked = NULLPTR;
+
+  /// \brief For VectorKernel, convert intermediate results into finalized
+  /// results. Mutates input argument. Some kernels may accumulate state
+  /// (example: hashing-related functions) through processing chunked inputs, and
+  /// then need to attach some accumulated state to each of the outputs of
+  /// processing each chunk of data.
+  FinalizeFunc finalize;
+
+  /// Since vector kernels generally are implemented rather differently from
+  /// scalar/elementwise kernels (and they may not even yield arrays of the same
+  /// size), so we make the developer opt-in to any memory preallocation rather
+  /// than having to turn it off.
+  NullHandling::type null_handling = NullHandling::COMPUTED_NO_PREALLOCATE;
+  MemAllocation::type mem_allocation = MemAllocation::NO_PREALLOCATE;
+
+  /// \brief Writing execution results into larger contiguous allocations
+  /// requires that the kernel be able to write into sliced output ArrayData*,
+  /// including sliced output validity bitmaps. Some kernel implementations may
+  /// not be able to do this, so setting this to false disables this
+  /// functionality.
+  bool can_write_into_slices = true;
+
+  /// Some vector kernels can do chunkwise execution using ExecSpanIterator,
+  /// in some cases accumulating some state. Other kernels (like Take) need to
+  /// be passed whole arrays and don't work on ChunkedArray inputs
+  bool can_execute_chunkwise = true;
+
+  /// Some kernels (like unique and value_counts) yield non-chunked output from
+  /// chunked-array inputs. This option controls how the results are boxed when
+  /// returned from ExecVectorFunction
+  ///
+  /// true -> ChunkedArray
+  /// false -> Array
+  bool output_chunked = true;
+};
+
+// ----------------------------------------------------------------------
+// ScalarAggregateKernel (for ScalarAggregateFunction)
+
+using ScalarAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
+using ScalarAggregateMerge = Status (*)(KernelContext*, KernelState&&, KernelState*);
+// Finalize returns Datum to permit multiple return values
+using ScalarAggregateFinalize = Status (*)(KernelContext*, Datum*);
+
+/// \brief Kernel data structure for implementations of
+/// ScalarAggregateFunction. The four necessary components of an aggregation
+/// kernel are the init, consume, merge, and finalize functions.
+///
+/// * init: creates a new KernelState for a kernel.
+/// * consume: processes an ExecSpan and updates the KernelState found in the
+///   KernelContext.
+/// * merge: combines one KernelState with another.
+/// * finalize: produces the end result of the aggregation using the
+///   KernelState in the KernelContext.
+struct ARROW_EXPORT ScalarAggregateKernel : public Kernel {
+  ScalarAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+                        ScalarAggregateConsume consume, ScalarAggregateMerge merge,
+                        ScalarAggregateFinalize finalize, const bool ordered)
+      : Kernel(std::move(sig), std::move(init)),
+        consume(consume),
+        merge(merge),
+        finalize(finalize),
+        ordered(ordered) {}
+
+  ScalarAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
+                        KernelInit init, ScalarAggregateConsume consume,
+                        ScalarAggregateMerge merge, ScalarAggregateFinalize finalize,
+                        const bool ordered)
+      : ScalarAggregateKernel(
+            KernelSignature::Make(std::move(in_types), std::move(out_type)),
+            std::move(init), consume, merge, finalize, ordered) {}
+
+  /// \brief Merge a vector of KernelStates into a single KernelState.
+  /// The merged state will be returned and will be set on the KernelContext.
+  static Result<std::unique_ptr<KernelState>> MergeAll(
+      const ScalarAggregateKernel* kernel, KernelContext* ctx,
+      std::vector<std::unique_ptr<KernelState>> states);
+
+  ScalarAggregateConsume consume;
+  ScalarAggregateMerge merge;
+  ScalarAggregateFinalize finalize;
+  /// \brief Whether this kernel requires ordering
+  /// Some aggregations, such as, "first", requires some kind of input order. The
+  /// order can be implicit, e.g., the order of the input data, or explicit, e.g.
+  /// the ordering specified with a window aggregation.
+  /// The caller of the aggregate kernel is responsible for passing data in some
+  /// defined order to the kernel. The flag here is a way for the kernel to tell
+  /// the caller that data passed to the kernel must be defined in some order.
+  bool ordered = false;
+};
+
+// ----------------------------------------------------------------------
+// HashAggregateKernel (for HashAggregateFunction)
+
+using HashAggregateResize = Status (*)(KernelContext*, int64_t);
+using HashAggregateConsume = Status (*)(KernelContext*, const ExecSpan&);
+using HashAggregateMerge = Status (*)(KernelContext*, KernelState&&, const ArrayData&);
+
+// Finalize returns Datum to permit multiple return values
+using HashAggregateFinalize = Status (*)(KernelContext*, Datum*);
+
+/// \brief Kernel data structure for implementations of
+/// HashAggregateFunction. The four necessary components of an aggregation
+/// kernel are the init, consume, merge, and finalize functions.
+///
+/// * init: creates a new KernelState for a kernel.
+/// * resize: ensure that the KernelState can accommodate the specified number of groups.
+/// * consume: processes an ExecSpan (which includes the argument as well
+///   as an array of group identifiers) and updates the KernelState found in the
+///   KernelContext.
+/// * merge: combines one KernelState with another.
+/// * finalize: produces the end result of the aggregation using the
+///   KernelState in the KernelContext.
+struct ARROW_EXPORT HashAggregateKernel : public Kernel {
+  HashAggregateKernel() = default;
+
+  HashAggregateKernel(std::shared_ptr<KernelSignature> sig, KernelInit init,
+                      HashAggregateResize resize, HashAggregateConsume consume,
+                      HashAggregateMerge merge, HashAggregateFinalize finalize,
+                      const bool ordered)
+      : Kernel(std::move(sig), std::move(init)),
+        resize(resize),
+        consume(consume),
+        merge(merge),
+        finalize(finalize),
+        ordered(ordered) {}
+
+  HashAggregateKernel(std::vector<InputType> in_types, OutputType out_type,
+                      KernelInit init, HashAggregateConsume consume,
+                      HashAggregateResize resize, HashAggregateMerge merge,
+                      HashAggregateFinalize finalize, const bool ordered)
+      : HashAggregateKernel(
+            KernelSignature::Make(std::move(in_types), std::move(out_type)),
+            std::move(init), resize, consume, merge, finalize, ordered) {}
+
+  HashAggregateResize resize;
+  HashAggregateConsume consume;
+  HashAggregateMerge merge;
+  HashAggregateFinalize finalize;
+  /// @brief whether the summarizer requires ordering
+  /// This is similar to ScalarAggregateKernel. See ScalarAggregateKernel
+  /// for detailed doc of this variable.
+  bool ordered = false;
+};
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/ordering.h b/pyarrow/include/arrow/compute/ordering.h
new file mode 100644
index 0000000000000000000000000000000000000000..61caa2b570dd31dc988d34406f9b05c3573333e2
--- /dev/null
+++ b/pyarrow/include/arrow/compute/ordering.h
@@ -0,0 +1,120 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "arrow/type.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+enum class SortOrder {
+  /// Arrange values in increasing order
+  Ascending,
+  /// Arrange values in decreasing order
+  Descending,
+};
+
+enum class NullPlacement {
+  /// Place nulls and NaNs before any non-null values.
+  /// NaNs will come after nulls.
+  AtStart,
+  /// Place nulls and NaNs after any non-null values.
+  /// NaNs will come before nulls.
+  AtEnd,
+};
+
+/// \brief One sort key for PartitionNthIndices (TODO) and SortIndices
+class ARROW_EXPORT SortKey : public util::EqualityComparable<SortKey> {
+ public:
+  explicit SortKey(FieldRef target, SortOrder order = SortOrder::Ascending)
+      : target(std::move(target)), order(order) {}
+
+  bool Equals(const SortKey& other) const;
+  std::string ToString() const;
+
+  /// A FieldRef targeting the sort column.
+  FieldRef target;
+  /// How to order by this sort key.
+  SortOrder order;
+};
+
+class ARROW_EXPORT Ordering : public util::EqualityComparable<Ordering> {
+ public:
+  Ordering(std::vector<SortKey> sort_keys,
+           NullPlacement null_placement = NullPlacement::AtStart)
+      : sort_keys_(std::move(sort_keys)), null_placement_(null_placement) {}
+  /// true if data ordered by other is also ordered by this
+  ///
+  /// For example, if data is ordered by [a, b, c] then it is also ordered
+  /// by [a, b] but not by [b, c] or [a, b, c, d].
+  ///
+  /// [a, b].IsSuborderOf([a, b, c]) - true
+  /// [a, b, c].IsSuborderOf([a, b, c]) - true
+  /// [b, c].IsSuborderOf([a, b, c]) - false
+  /// [a, b, c, d].IsSuborderOf([a, b, c]) - false
+  ///
+  /// The implicit ordering is not a suborder of any other ordering and
+  /// no other ordering is a suborder of it.  The implicit ordering is not a
+  /// suborder of itself.
+  ///
+  /// The unordered ordering is a suborder of all other orderings but no
+  /// other ordering is a suborder of it.  The unordered ordering is a suborder
+  /// of itself.
+  ///
+  /// The unordered ordering is a suborder of the implicit ordering.
+  bool IsSuborderOf(const Ordering& other) const;
+
+  bool Equals(const Ordering& other) const;
+  std::string ToString() const;
+
+  bool is_implicit() const { return is_implicit_; }
+  bool is_unordered() const { return !is_implicit_ && sort_keys_.empty(); }
+
+  const std::vector<SortKey>& sort_keys() const { return sort_keys_; }
+  NullPlacement null_placement() const { return null_placement_; }
+
+  static const Ordering& Implicit() {
+    static const Ordering kImplicit(true);
+    return kImplicit;
+  }
+
+  static const Ordering& Unordered() {
+    static const Ordering kUnordered(false);
+    // It is also possible to get an unordered ordering by passing in an empty vector
+    // using the normal constructor.  This is ok and useful when ordering comes from user
+    // input.
+    return kUnordered;
+  }
+
+ private:
+  explicit Ordering(bool is_implicit)
+      : null_placement_(NullPlacement::AtStart), is_implicit_(is_implicit) {}
+  /// Column key(s) to order by and how to order by these sort keys.
+  std::vector<SortKey> sort_keys_;
+  /// Whether nulls and NaNs are placed at the start or at the end
+  NullPlacement null_placement_;
+  bool is_implicit_ = false;
+};
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/registry.h b/pyarrow/include/arrow/compute/registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..f31c4c1ba5920626578a4e4170e3cd2d28288545
--- /dev/null
+++ b/pyarrow/include/arrow/compute/registry.h
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE: API is EXPERIMENTAL and will change without going through a
+// deprecation cycle
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+class Function;
+class FunctionOptionsType;
+
+/// \brief A mutable central function registry for built-in functions as well
+/// as user-defined functions. Functions are implementations of
+/// arrow::compute::Function.
+///
+/// Generally, each function contains kernels which are implementations of a
+/// function for a specific argument signature. After looking up a function in
+/// the registry, one can either execute it eagerly with Function::Execute or
+/// use one of the function's dispatch methods to pick a suitable kernel for
+/// lower-level function execution.
+class ARROW_EXPORT FunctionRegistry {
+ public:
+  ~FunctionRegistry();
+
+  /// \brief Construct a new registry.
+  ///
+  /// Most users only need to use the global registry.
+  static std::unique_ptr<FunctionRegistry> Make();
+
+  /// \brief Construct a new nested registry with the given parent.
+  ///
+  /// Most users only need to use the global registry. The returned registry never changes
+  /// its parent, even when an operation allows overwriting.
+  static std::unique_ptr<FunctionRegistry> Make(FunctionRegistry* parent);
+
+  /// \brief Check whether a new function can be added to the registry.
+  ///
+  /// \returns Status::KeyError if a function with the same name is already registered.
+  Status CanAddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
+
+  /// \brief Add a new function to the registry.
+  ///
+  /// \returns Status::KeyError if a function with the same name is already registered.
+  Status AddFunction(std::shared_ptr<Function> function, bool allow_overwrite = false);
+
+  /// \brief Check whether an alias can be added for the given function name.
+  ///
+  /// \returns Status::KeyError if the function with the given name is not registered.
+  Status CanAddAlias(const std::string& target_name, const std::string& source_name);
+
+  /// \brief Add alias for the given function name.
+  ///
+  /// \returns Status::KeyError if the function with the given name is not registered.
+  Status AddAlias(const std::string& target_name, const std::string& source_name);
+
+  /// \brief Check whether a new function options type can be added to the registry.
+  ///
+  /// \return Status::KeyError if a function options type with the same name is already
+  /// registered.
+  Status CanAddFunctionOptionsType(const FunctionOptionsType* options_type,
+                                   bool allow_overwrite = false);
+
+  /// \brief Add a new function options type to the registry.
+  ///
+  /// \returns Status::KeyError if a function options type with the same name is already
+  /// registered.
+  Status AddFunctionOptionsType(const FunctionOptionsType* options_type,
+                                bool allow_overwrite = false);
+
+  /// \brief Retrieve a function by name from the registry.
+  Result<std::shared_ptr<Function>> GetFunction(const std::string& name) const;
+
+  /// \brief Return vector of all entry names in the registry.
+  ///
+  /// Helpful for displaying a manifest of available functions.
+  std::vector<std::string> GetFunctionNames() const;
+
+  /// \brief Retrieve a function options type by name from the registry.
+  Result<const FunctionOptionsType*> GetFunctionOptionsType(
+      const std::string& name) const;
+
+  /// \brief The number of currently registered functions.
+  int num_functions() const;
+
+  /// \brief The cast function object registered in AddFunction.
+  ///
+  /// Helpful for get cast function as needed.
+  const Function* cast_function() const;
+
+ private:
+  FunctionRegistry();
+
+  // Use PIMPL pattern to not have std::unordered_map here
+  class FunctionRegistryImpl;
+  std::unique_ptr<FunctionRegistryImpl> impl_;
+
+  explicit FunctionRegistry(FunctionRegistryImpl* impl);
+};
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/row/grouper.h b/pyarrow/include/arrow/compute/row/grouper.h
new file mode 100644
index 0000000000000000000000000000000000000000..9424559385b7391d4dc7d46ddbbb542803c9001e
--- /dev/null
+++ b/pyarrow/include/arrow/compute/row/grouper.h
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/visibility.h"
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace compute {
+
+/// \brief A segment
+/// A segment group is a chunk of continuous rows that have the same segment key. (For
+/// example, in ordered time series processing, segment key can be "date", and a segment
+/// group can be all the rows that belong to the same date.) A segment group can span
+/// across multiple exec batches. A segment is a chunk of continuous rows that has the
+/// same segment key within a given batch. When a segment group span cross batches, it
+/// will have multiple segments. A segment never spans cross batches. The segment data
+/// structure only makes sense when used along with a exec batch.
+struct ARROW_COMPUTE_EXPORT Segment {
+  /// \brief the offset into the batch where the segment starts
+  int64_t offset;
+  /// \brief the length of the segment
+  int64_t length;
+  /// \brief whether the segment may be extended by a next one
+  bool is_open;
+  /// \brief whether the segment extends a preceeding one
+  bool extends;
+};
+
+inline bool operator==(const Segment& segment1, const Segment& segment2) {
+  return segment1.offset == segment2.offset && segment1.length == segment2.length &&
+         segment1.is_open == segment2.is_open && segment1.extends == segment2.extends;
+}
+inline bool operator!=(const Segment& segment1, const Segment& segment2) {
+  return !(segment1 == segment2);
+}
+
+/// \brief a helper class to divide a batch into segments of equal values
+///
+/// For example, given a batch with two columns specifed as segment keys:
+///
+/// A A [other columns]...
+/// A A ...
+/// A B ...
+/// A B ...
+/// A A ...
+///
+/// Then the batch could be divided into 3 segments.  The first would be rows 0 & 1,
+/// the second would be rows 2 & 3, and the third would be row 4.
+///
+/// Further, a segmenter keeps track of the last value seen.  This allows it to calculate
+/// segments which span batches.  In our above example the last batch we emit would set
+/// the "open" flag, which indicates whether the segment may extend into the next batch.
+///
+/// If the next call to the segmenter starts with `A A` then that segment would set the
+/// "extends" flag, which indicates whether the segment continues the last open batch.
+class ARROW_COMPUTE_EXPORT RowSegmenter {
+ public:
+  virtual ~RowSegmenter() = default;
+
+  /// \brief Construct a Segmenter which segments on the specified key types
+  ///
+  /// \param[in] key_types the specified key types
+  /// \param[in] nullable_keys whether values of the specified keys may be null
+  /// \param[in] ctx the execution context to use
+  static Result<std::unique_ptr<RowSegmenter>> Make(
+      const std::vector<TypeHolder>& key_types, bool nullable_keys, ExecContext* ctx);
+
+  /// \brief Return the key types of this segmenter
+  virtual const std::vector<TypeHolder>& key_types() const = 0;
+
+  /// \brief Reset this segmenter
+  ///
+  /// A segmenter normally extends (see `Segment`) a segment from one batch to the next.
+  /// If segment-extension is undesirable, for example when each batch is processed
+  /// independently, then `Reset` should be invoked before processing the next batch.
+  virtual Status Reset() = 0;
+
+  /// \brief Get all segments for the given batch
+  virtual Result<std::vector<Segment>> GetSegments(const ExecSpan& batch) = 0;
+};
+
+/// Consumes batches of keys and yields batches of the group ids.
+class ARROW_COMPUTE_EXPORT Grouper {
+ public:
+  virtual ~Grouper() = default;
+
+  /// Construct a Grouper which receives the specified key types
+  static Result<std::unique_ptr<Grouper>> Make(const std::vector<TypeHolder>& key_types,
+                                               ExecContext* ctx = default_exec_context());
+
+  /// Reset all intermediate state, make the grouper logically as just `Make`ed.
+  /// The underlying buffers, if any, may or may not be released though.
+  virtual Status Reset() = 0;
+
+  /// Consume a batch of keys, producing the corresponding group ids as an integer array,
+  /// over a slice defined by an offset and length, which defaults to the batch length.
+  /// Currently only uint32 indices will be produced, eventually the bit width will only
+  /// be as wide as necessary.
+  virtual Result<Datum> Consume(const ExecSpan& batch, int64_t offset = 0,
+                                int64_t length = -1) = 0;
+
+  /// Like Consume, but groups not already encountered emit null instead of
+  /// generating a new group id.
+  virtual Result<Datum> Lookup(const ExecSpan& batch, int64_t offset = 0,
+                               int64_t length = -1) = 0;
+
+  /// Like Consume, but only populates the Grouper without returning the group ids.
+  virtual Status Populate(const ExecSpan& batch, int64_t offset = 0,
+                          int64_t length = -1) = 0;
+
+  /// Get current unique keys. May be called multiple times.
+  virtual Result<ExecBatch> GetUniques() = 0;
+
+  /// Get the current number of groups.
+  virtual uint32_t num_groups() const = 0;
+
+  /// \brief Assemble lists of indices of identical elements.
+  ///
+  /// \param[in] ids An unsigned, all-valid integral array which will be
+  ///                used as grouping criteria.
+  /// \param[in] num_groups An upper bound for the elements of ids
+  /// \param[in] ctx Execution context to use during the operation
+  /// \return A num_groups-long ListArray where the slot at i contains a
+  ///         list of indices where i appears in ids.
+  ///
+  ///   MakeGroupings([
+  ///       2,
+  ///       2,
+  ///       5,
+  ///       5,
+  ///       2,
+  ///       3
+  ///   ], 8) == [
+  ///       [],
+  ///       [],
+  ///       [0, 1, 4],
+  ///       [5],
+  ///       [],
+  ///       [2, 3],
+  ///       [],
+  ///       []
+  ///   ]
+  static Result<std::shared_ptr<ListArray>> MakeGroupings(
+      const UInt32Array& ids, uint32_t num_groups,
+      ExecContext* ctx = default_exec_context());
+
+  /// \brief Produce a ListArray whose slots are selections of `array` which correspond to
+  /// the provided groupings.
+  ///
+  /// For example,
+  ///   ApplyGroupings([
+  ///       [],
+  ///       [],
+  ///       [0, 1, 4],
+  ///       [5],
+  ///       [],
+  ///       [2, 3],
+  ///       [],
+  ///       []
+  ///   ], [2, 2, 5, 5, 2, 3]) == [
+  ///       [],
+  ///       [],
+  ///       [2, 2, 2],
+  ///       [3],
+  ///       [],
+  ///       [5, 5],
+  ///       [],
+  ///       []
+  ///   ]
+  static Result<std::shared_ptr<ListArray>> ApplyGroupings(
+      const ListArray& groupings, const Array& array,
+      ExecContext* ctx = default_exec_context());
+};
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/type_fwd.h b/pyarrow/include/arrow/compute/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..016d97a0dbc2b3b77be0b07e7effca3669439eb8
--- /dev/null
+++ b/pyarrow/include/arrow/compute/type_fwd.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+struct Datum;
+struct TypeHolder;
+
+namespace compute {
+
+class Function;
+class ScalarAggregateFunction;
+class FunctionExecutor;
+class FunctionOptions;
+class FunctionRegistry;
+
+/// \brief Return the process-global function registry.
+// Defined in registry.cc
+ARROW_EXPORT FunctionRegistry* GetFunctionRegistry();
+
+class CastOptions;
+
+struct ExecBatch;
+class ExecContext;
+struct ExecValue;
+class KernelContext;
+
+struct Kernel;
+struct ScalarKernel;
+struct ScalarAggregateKernel;
+struct VectorKernel;
+
+struct KernelState;
+
+class Expression;
+
+ARROW_EXPORT ExecContext* default_exec_context();
+ARROW_EXPORT ExecContext* threaded_exec_context();
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/util.h b/pyarrow/include/arrow/compute/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..51a24b50fe60d4e22a9de02111568efd7f9cf334
--- /dev/null
+++ b/pyarrow/include/arrow/compute/util.h
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <optional>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/compute/expression.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/compute/visibility.h"
+#include "arrow/result.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/simd.h"
+
+#if defined(__clang__) || defined(__GNUC__)
+#  define BYTESWAP(x) __builtin_bswap64(x)
+#  define ROTL(x, n) (((x) << (n)) | ((x) >> ((-n) & 31)))
+#  define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-n) & 63)))
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#  define BYTESWAP(x) _byteswap_uint64(x)
+#  define ROTL(x, n) _rotl((x), (n))
+#  define ROTL64(x, n) _rotl64((x), (n))
+#endif
+
+namespace arrow {
+namespace util {
+
+// Some platforms typedef int64_t as long int instead of long long int,
+// which breaks the _mm256_i64gather_epi64 and _mm256_i32gather_epi64 intrinsics
+// which need long long.
+// We use the cast to the type below in these intrinsics to make the code
+// compile in all cases.
+//
+using int64_for_gather_t = const long long int;  // NOLINT runtime-int
+
+// All MiniBatch... classes use TempVectorStack for vector allocations and can
+// only work with vectors up to 1024 elements.
+//
+// They should only be allocated on the stack to guarantee the right sequence
+// of allocation and deallocation of vectors from TempVectorStack.
+//
+class MiniBatch {
+ public:
+  static constexpr int kLogMiniBatchLength = 10;
+  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
+};
+
+namespace bit_util {
+
+ARROW_COMPUTE_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
+                                          const int num_bits, const uint8_t* bits,
+                                          int* num_indexes, uint16_t* indexes,
+                                          int bit_offset = 0);
+
+ARROW_COMPUTE_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
+                                              const int num_bits, const uint8_t* bits,
+                                              const uint16_t* input_indexes,
+                                              int* num_indexes, uint16_t* indexes,
+                                              int bit_offset = 0);
+
+// Input and output indexes may be pointing to the same data (in-place filtering).
+ARROW_COMPUTE_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
+                                             const uint8_t* bits, int* num_indexes_bit0,
+                                             uint16_t* indexes_bit0,
+                                             uint16_t* indexes_bit1, int bit_offset = 0);
+
+// Bit 1 is replaced with byte 0xFF.
+ARROW_COMPUTE_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
+                                        const uint8_t* bits, uint8_t* bytes,
+                                        int bit_offset = 0);
+
+// Return highest bit of each byte.
+ARROW_COMPUTE_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
+                                        const uint8_t* bytes, uint8_t* bits,
+                                        int bit_offset = 0);
+
+ARROW_COMPUTE_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
+                                             uint32_t num_bytes);
+
+#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)
+// The functions below use BMI2 instructions, be careful before calling!
+
+namespace avx2 {
+ARROW_COMPUTE_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
+                                                   const uint8_t* bits,
+                                                   const uint16_t* input_indexes,
+                                                   int* num_indexes, uint16_t* indexes);
+ARROW_COMPUTE_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
+                                               const uint8_t* bits, int* num_indexes,
+                                               uint16_t* indexes,
+                                               uint16_t base_index = 0);
+ARROW_COMPUTE_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
+                                             uint8_t* bytes);
+ARROW_COMPUTE_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
+                                             uint8_t* bits);
+ARROW_COMPUTE_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes,
+                                                  uint32_t num_bytes);
+}  // namespace avx2
+
+#endif
+
+}  // namespace bit_util
+}  // namespace util
+
+namespace compute {
+
+/// Modify an Expression with pre-order and post-order visitation.
+/// `pre` will be invoked on each Expression. `pre` will visit Calls before their
+/// arguments, `post_call` will visit Calls (and no other Expressions) after their
+/// arguments. Visitors should return the Identical expression to indicate no change; this
+/// will prevent unnecessary construction in the common case where a modification is not
+/// possible/necessary/...
+///
+/// If an argument was modified, `post_call` visits a reconstructed Call with the modified
+/// arguments but also receives a pointer to the unmodified Expression as a second
+/// argument. If no arguments were modified the unmodified Expression* will be nullptr.
+template <typename PreVisit, typename PostVisitCall>
+Result<Expression> ModifyExpression(Expression expr, const PreVisit& pre,
+                                    const PostVisitCall& post_call) {
+  ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr))));
+
+  auto call = expr.call();
+  if (!call) return expr;
+
+  bool at_least_one_modified = false;
+  std::vector<Expression> modified_arguments;
+
+  for (size_t i = 0; i < call->arguments.size(); ++i) {
+    ARROW_ASSIGN_OR_RAISE(auto modified_argument,
+                          ModifyExpression(call->arguments[i], pre, post_call));
+
+    if (Expression::Identical(modified_argument, call->arguments[i])) {
+      continue;
+    }
+
+    if (!at_least_one_modified) {
+      modified_arguments = call->arguments;
+      at_least_one_modified = true;
+    }
+
+    modified_arguments[i] = std::move(modified_argument);
+  }
+
+  if (at_least_one_modified) {
+    // reconstruct the call expression with the modified arguments
+    auto modified_call = *call;
+    modified_call.arguments = std::move(modified_arguments);
+    return post_call(Expression(std::move(modified_call)), &expr);
+  }
+
+  return post_call(std::move(expr), NULLPTR);
+}
+
+// Helper class to calculate the modified number of rows to process using SIMD.
+//
+// Some array elements at the end will be skipped in order to avoid buffer
+// overrun, when doing memory loads and stores using larger word size than a
+// single array element.
+//
+class TailSkipForSIMD {
+ public:
+  static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows,
+                              int bit_offset) {
+    int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset);
+    int64_t num_bytes_safe =
+        std::max(static_cast<int64_t>(0LL), num_bytes - num_bytes_accessed_together + 1);
+    int64_t num_rows_safe =
+        std::max(static_cast<int64_t>(0LL), 8 * num_bytes_safe - bit_offset);
+    return std::min(num_rows_safe, num_rows);
+  }
+  static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
+                                 int64_t length) {
+    int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together);
+    int64_t num_rows_safe =
+        std::max(static_cast<int64_t>(0LL), num_rows - num_rows_to_skip);
+    return num_rows_safe;
+  }
+  static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows,
+                                    const uint32_t* offsets) {
+    // Do not process rows that could read past the end of the buffer using N
+    // byte loads/stores.
+    //
+    int64_t num_rows_safe = num_rows;
+    while (num_rows_safe > 0 &&
+           offsets[num_rows_safe] + num_bytes_accessed_together > offsets[num_rows]) {
+      --num_rows_safe;
+    }
+    return num_rows_safe;
+  }
+  static int FixSelection(int64_t num_rows_safe, int num_selected,
+                          const uint16_t* selection) {
+    int num_selected_safe = num_selected;
+    while (num_selected_safe > 0 && selection[num_selected_safe - 1] >= num_rows_safe) {
+      --num_selected_safe;
+    }
+    return num_selected_safe;
+  }
+};
+
+}  // namespace compute
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/compute/visibility.h b/pyarrow/include/arrow/compute/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae994bd233329ff9ab0456191d3f8adf85ee3068
--- /dev/null
+++ b/pyarrow/include/arrow/compute/visibility.h
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_COMPUTE_STATIC
+#    define ARROW_COMPUTE_EXPORT
+#  elif defined(ARROW_COMPUTE_EXPORTING)
+#    define ARROW_COMPUTE_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_COMPUTE_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_COMPUTE_NO_EXPORT
+
+#  if defined(_MSC_VER)
+#    pragma warning(pop)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_COMPUTE_EXPORT
+#    define ARROW_COMPUTE_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_COMPUTE_NO_EXPORT
+#    define ARROW_COMPUTE_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif
diff --git a/pyarrow/include/arrow/config.h b/pyarrow/include/arrow/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..617d6c268b55ea344a3fe7f96141ff0f7e4d3f88
--- /dev/null
+++ b/pyarrow/include/arrow/config.h
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <optional>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/config.h"  // IWYU pragma: export
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+struct BuildInfo {
+  /// The packed version number, e.g. 1002003 (decimal) for Arrow 1.2.3
+  int version;
+  /// The "major" version number, e.g. 1 for Arrow 1.2.3
+  int version_major;
+  /// The "minor" version number, e.g. 2 for Arrow 1.2.3
+  int version_minor;
+  /// The "patch" version number, e.g. 3 for Arrow 1.2.3
+  int version_patch;
+  /// The version string, e.g. "1.2.3"
+  std::string version_string;
+  std::string so_version;
+  std::string full_so_version;
+
+  /// The CMake compiler identifier, e.g. "GNU"
+  std::string compiler_id;
+  std::string compiler_version;
+  std::string compiler_flags;
+
+  /// The git changeset id, if available
+  std::string git_id;
+  /// The git changeset description, if available
+  std::string git_description;
+  std::string package_kind;
+
+  /// The uppercase build type, e.g. "DEBUG" or "RELEASE"
+  std::string build_type;
+};
+
+struct RuntimeInfo {
+  /// The enabled SIMD level
+  ///
+  /// This can be less than `detected_simd_level` if the ARROW_USER_SIMD_LEVEL
+  /// environment variable is set to another value.
+  std::string simd_level;
+
+  /// The SIMD level available on the OS and CPU
+  std::string detected_simd_level;
+
+  /// Whether using the OS-based timezone database
+  /// This is set at compile-time.
+  bool using_os_timezone_db;
+
+  /// The path to the timezone database; by default None.
+  std::optional<std::string> timezone_db_path;
+};
+
+/// \brief Get runtime build info.
+///
+/// The returned values correspond to exact loaded version of the Arrow library,
+/// rather than the values frozen at application compile-time through the `ARROW_*`
+/// preprocessor definitions.
+ARROW_EXPORT
+const BuildInfo& GetBuildInfo();
+
+/// \brief Get runtime info.
+///
+ARROW_EXPORT
+RuntimeInfo GetRuntimeInfo();
+
+struct GlobalOptions {
+  /// Path to text timezone database. This is only configurable on Windows,
+  /// which does not have a compatible OS timezone database.
+  std::optional<std::string> timezone_db_path;
+};
+
+ARROW_EXPORT
+Status Initialize(const GlobalOptions& options) noexcept;
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/api.h b/pyarrow/include/arrow/csv/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..4af1835cd709d43e0abe3b39b46531cae9a047fc
--- /dev/null
+++ b/pyarrow/include/arrow/csv/api.h
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/csv/options.h"
+#include "arrow/csv/reader.h"
+#include "arrow/csv/writer.h"
diff --git a/pyarrow/include/arrow/csv/chunker.h b/pyarrow/include/arrow/csv/chunker.h
new file mode 100644
index 0000000000000000000000000000000000000000..662b16ec40a9485547ce01b32ea0325a23122711
--- /dev/null
+++ b/pyarrow/include/arrow/csv/chunker.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/status.h"
+#include "arrow/util/delimiting.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+ARROW_EXPORT
+std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options);
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/column_builder.h b/pyarrow/include/arrow/csv/column_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..9fc4643d9d4c3d4cccff7f9d179e62b5f720ac6f
--- /dev/null
+++ b/pyarrow/include/arrow/csv/column_builder.h
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+struct ConvertOptions;
+
+class ARROW_EXPORT ColumnBuilder {
+ public:
+  virtual ~ColumnBuilder() = default;
+
+  /// Spawn a task that will try to convert and append the given CSV block.
+  /// All calls to Append() should happen on the same thread, otherwise
+  /// call Insert() instead.
+  virtual void Append(const std::shared_ptr<BlockParser>& parser) = 0;
+
+  /// Spawn a task that will try to convert and insert the given CSV block
+  virtual void Insert(int64_t block_index,
+                      const std::shared_ptr<BlockParser>& parser) = 0;
+
+  /// Return the final chunked array.  The TaskGroup _must_ have finished!
+  virtual Result<std::shared_ptr<ChunkedArray>> Finish() = 0;
+
+  std::shared_ptr<arrow::internal::TaskGroup> task_group() { return task_group_; }
+
+  /// Construct a strictly-typed ColumnBuilder.
+  static Result<std::shared_ptr<ColumnBuilder>> Make(
+      MemoryPool* pool, const std::shared_ptr<DataType>& type, int32_t col_index,
+      std::shared_ptr<ConvertOptions> options,
+      std::shared_ptr<arrow::internal::TaskGroup> task_group);
+
+  /// Construct a type-inferring ColumnBuilder.
+  static Result<std::shared_ptr<ColumnBuilder>> Make(
+      MemoryPool* pool, int32_t col_index, std::shared_ptr<ConvertOptions> options,
+      std::shared_ptr<arrow::internal::TaskGroup> task_group);
+
+  /// Construct a ColumnBuilder for a column of nulls
+  /// (i.e. not present in the CSV file).
+  static Result<std::shared_ptr<ColumnBuilder>> MakeNull(
+      MemoryPool* pool, const std::shared_ptr<DataType>& type,
+      std::shared_ptr<arrow::internal::TaskGroup> task_group);
+
+ protected:
+  explicit ColumnBuilder(std::shared_ptr<arrow::internal::TaskGroup> task_group)
+      : task_group_(std::move(task_group)) {}
+
+  std::shared_ptr<arrow::internal::TaskGroup> task_group_;
+};
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/column_decoder.h b/pyarrow/include/arrow/csv/column_decoder.h
new file mode 100644
index 0000000000000000000000000000000000000000..5fbbd5df58b1c588b88e16b68da50b9399211abc
--- /dev/null
+++ b/pyarrow/include/arrow/csv/column_decoder.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+struct ConvertOptions;
+
+class ARROW_EXPORT ColumnDecoder {
+ public:
+  virtual ~ColumnDecoder() = default;
+
+  /// Spawn a task that will try to convert and insert the given CSV block
+  virtual Future<std::shared_ptr<Array>> Decode(
+      const std::shared_ptr<BlockParser>& parser) = 0;
+
+  /// Construct a strictly-typed ColumnDecoder.
+  static Result<std::shared_ptr<ColumnDecoder>> Make(MemoryPool* pool,
+                                                     std::shared_ptr<DataType> type,
+                                                     int32_t col_index,
+                                                     const ConvertOptions& options);
+
+  /// Construct a type-inferring ColumnDecoder.
+  /// Inference will run only on the first block, the type will be frozen afterwards.
+  static Result<std::shared_ptr<ColumnDecoder>> Make(MemoryPool* pool, int32_t col_index,
+                                                     const ConvertOptions& options);
+
+  /// Construct a ColumnDecoder for a column of nulls
+  /// (i.e. not present in the CSV file).
+  static Result<std::shared_ptr<ColumnDecoder>> MakeNull(MemoryPool* pool,
+                                                         std::shared_ptr<DataType> type);
+
+ protected:
+  ColumnDecoder() = default;
+};
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/converter.h b/pyarrow/include/arrow/csv/converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..639f692f26a1ba3a134caac68a432ac22f068917
--- /dev/null
+++ b/pyarrow/include/arrow/csv/converter.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+class BlockParser;
+
+class ARROW_EXPORT Converter {
+ public:
+  Converter(const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+            MemoryPool* pool);
+  virtual ~Converter() = default;
+
+  virtual Result<std::shared_ptr<Array>> Convert(const BlockParser& parser,
+                                                 int32_t col_index) = 0;
+
+  std::shared_ptr<DataType> type() const { return type_; }
+
+  // Create a Converter for the given data type
+  static Result<std::shared_ptr<Converter>> Make(
+      const std::shared_ptr<DataType>& type, const ConvertOptions& options,
+      MemoryPool* pool = default_memory_pool());
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
+
+  virtual Status Initialize() = 0;
+
+  // CAUTION: ConvertOptions can grow large (if it customizes hundreds or
+  // thousands of columns), so avoid copying it in each Converter.
+  const ConvertOptions& options_;
+  MemoryPool* pool_;
+  std::shared_ptr<DataType> type_;
+};
+
+class ARROW_EXPORT DictionaryConverter : public Converter {
+ public:
+  DictionaryConverter(const std::shared_ptr<DataType>& value_type,
+                      const ConvertOptions& options, MemoryPool* pool);
+
+  // If the dictionary length goes above this value, conversion will fail
+  // with Status::IndexError.
+  virtual void SetMaxCardinality(int32_t max_length) = 0;
+
+  // Create a Converter for the given dictionary value type.
+  // The dictionary index type will always be Int32.
+  static Result<std::shared_ptr<DictionaryConverter>> Make(
+      const std::shared_ptr<DataType>& value_type, const ConvertOptions& options,
+      MemoryPool* pool = default_memory_pool());
+
+ protected:
+  std::shared_ptr<DataType> value_type_;
+};
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/invalid_row.h b/pyarrow/include/arrow/csv/invalid_row.h
new file mode 100644
index 0000000000000000000000000000000000000000..4360ceaaea6ac07dd218c93ce13c3ab14c16fc63
--- /dev/null
+++ b/pyarrow/include/arrow/csv/invalid_row.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <string_view>
+
+namespace arrow {
+namespace csv {
+
+/// \brief Description of an invalid row
+struct InvalidRow {
+  /// \brief Number of columns expected in the row
+  int32_t expected_columns;
+  /// \brief Actual number of columns found in the row
+  int32_t actual_columns;
+  /// \brief The physical row number if known or -1
+  ///
+  /// This number is one-based and also accounts for non-data rows (such as
+  /// CSV header rows).
+  int64_t number;
+  /// \brief View of the entire row. Memory will be freed after callback returns
+  const std::string_view text;
+};
+
+/// \brief Result returned by an InvalidRowHandler
+enum class InvalidRowResult {
+  // Generate an error describing this row
+  Error,
+  // Skip over this row
+  Skip
+};
+
+/// \brief callback for handling a row with an invalid number of columns while parsing
+/// \return result indicating if an error should be returned from the parser or the row is
+/// skipped
+using InvalidRowHandler = std::function<InvalidRowResult(const InvalidRow&)>;
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/options.h b/pyarrow/include/arrow/csv/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..10e55bf838c33f00ab520bce7f4e145a7db8819a
--- /dev/null
+++ b/pyarrow/include/arrow/csv/options.h
@@ -0,0 +1,226 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/csv/invalid_row.h"
+#include "arrow/csv/type_fwd.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/status.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class DataType;
+class TimestampParser;
+
+namespace csv {
+
+// Silly workaround for https://github.com/michaeljones/breathe/issues/453
+constexpr char kDefaultEscapeChar = '\\';
+
+struct ARROW_EXPORT ParseOptions {
+  // Parsing options
+
+  /// Field delimiter
+  char delimiter = ',';
+  /// Whether quoting is used
+  bool quoting = true;
+  /// Quoting character (if `quoting` is true)
+  char quote_char = '"';
+  /// Whether a quote inside a value is double-quoted
+  bool double_quote = true;
+  /// Whether escaping is used
+  bool escaping = false;
+  /// Escaping character (if `escaping` is true)
+  char escape_char = kDefaultEscapeChar;
+  /// Whether values are allowed to contain CR (0x0d) and LF (0x0a) characters
+  bool newlines_in_values = false;
+  /// Whether empty lines are ignored.  If false, an empty line represents
+  /// a single empty value (assuming a one-column CSV file).
+  bool ignore_empty_lines = true;
+  /// A handler function for rows which do not have the correct number of columns
+  InvalidRowHandler invalid_row_handler;
+
+  /// Create parsing options with default values
+  static ParseOptions Defaults();
+
+  /// \brief Test that all set options are valid
+  Status Validate() const;
+};
+
+struct ARROW_EXPORT ConvertOptions {
+  // Conversion options
+
+  /// Whether to check UTF8 validity of string columns
+  bool check_utf8 = true;
+  /// Optional per-column types (disabling type inference on those columns)
+  std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
+  /// Recognized spellings for null values
+  std::vector<std::string> null_values;
+  /// Recognized spellings for boolean true values
+  std::vector<std::string> true_values;
+  /// Recognized spellings for boolean false values
+  std::vector<std::string> false_values;
+
+  /// Whether string / binary columns can have null values.
+  ///
+  /// If true, then strings in "null_values" are considered null for string columns.
+  /// If false, then all strings are valid string values.
+  bool strings_can_be_null = false;
+
+  /// Whether quoted values can be null.
+  ///
+  /// If true, then strings in "null_values" are also considered null when they
+  /// appear quoted in the CSV file. Otherwise, quoted values are never considered null.
+  bool quoted_strings_can_be_null = true;
+
+  /// Whether to try to automatically dict-encode string / binary data.
+  /// If true, then when type inference detects a string or binary column,
+  /// it is dict-encoded up to `auto_dict_max_cardinality` distinct values
+  /// (per chunk), after which it switches to regular encoding.
+  ///
+  /// This setting is ignored for non-inferred columns (those in `column_types`).
+  bool auto_dict_encode = false;
+  int32_t auto_dict_max_cardinality = 50;
+
+  /// Decimal point character for floating-point and decimal data
+  char decimal_point = '.';
+
+  // XXX Should we have a separate FilterOptions?
+
+  /// If non-empty, indicates the names of columns from the CSV file that should
+  /// be actually read and converted (in the vector's order).
+  /// Columns not in this vector will be ignored.
+  std::vector<std::string> include_columns;
+  /// If false, columns in `include_columns` but not in the CSV file will error out.
+  /// If true, columns in `include_columns` but not in the CSV file will produce
+  /// a column of nulls (whose type is selected using `column_types`,
+  /// or null by default)
+  /// This option is ignored if `include_columns` is empty.
+  bool include_missing_columns = false;
+
+  /// User-defined timestamp parsers, using the virtual parser interface in
+  /// arrow/util/value_parsing.h. More than one parser can be specified, and
+  /// the CSV conversion logic will try parsing values starting from the
+  /// beginning of this vector. If no parsers are specified, we use the default
+  /// built-in ISO-8601 parser.
+  std::vector<std::shared_ptr<TimestampParser>> timestamp_parsers;
+
+  /// Create conversion options with default values, including conventional
+  /// values for `null_values`, `true_values` and `false_values`
+  static ConvertOptions Defaults();
+
+  /// \brief Test that all set options are valid
+  Status Validate() const;
+};
+
+struct ARROW_EXPORT ReadOptions {
+  // Reader options
+
+  /// Whether to use the global CPU thread pool
+  bool use_threads = true;
+
+  /// \brief Block size we request from the IO layer.
+  ///
+  /// This will determine multi-threading granularity as well as
+  /// the size of individual record batches.
+  /// Minimum valid value for block size is 1
+  int32_t block_size = 1 << 20;  // 1 MB
+
+  /// Number of header rows to skip (not including the row of column names, if any)
+  int32_t skip_rows = 0;
+
+  /// Number of rows to skip after the column names are read, if any
+  int32_t skip_rows_after_names = 0;
+
+  /// Column names for the target table.
+  /// If empty, fall back on autogenerate_column_names.
+  std::vector<std::string> column_names;
+
+  /// Whether to autogenerate column names if `column_names` is empty.
+  /// If true, column names will be of the form "f0", "f1"...
+  /// If false, column names will be read from the first CSV row after `skip_rows`.
+  bool autogenerate_column_names = false;
+
+  /// Create read options with default values
+  static ReadOptions Defaults();
+
+  /// \brief Test that all set options are valid
+  Status Validate() const;
+};
+
+/// \brief Quoting style for CSV writing
+enum class ARROW_EXPORT QuotingStyle {
+  /// Only enclose values in quotes which need them, because their CSV rendering can
+  /// contain quotes itself (e.g. strings or binary values)
+  Needed,
+  /// Enclose all valid values in quotes. Nulls are not quoted. May cause readers to
+  /// interpret all values as strings if schema is inferred.
+  AllValid,
+  /// Do not enclose any values in quotes. Prevents values from containing quotes ("),
+  /// cell delimiters (,) or line endings (\\r, \\n), (following RFC4180). If values
+  /// contain these characters, an error is caused when attempting to write.
+  None
+};
+
+struct ARROW_EXPORT WriteOptions {
+  /// Whether to write an initial header line with column names
+  bool include_header = true;
+
+  /// \brief Maximum number of rows processed at a time
+  ///
+  /// The CSV writer converts and writes data in batches of N rows.
+  /// This number can impact performance.
+  int32_t batch_size = 1024;
+
+  /// Field delimiter
+  char delimiter = ',';
+
+  /// \brief The string to write for null values. Quotes are not allowed in this string.
+  std::string null_string;
+
+  /// \brief IO context for writing.
+  io::IOContext io_context;
+
+  /// \brief The end of line character to use for ending rows
+  std::string eol = "\n";
+
+  /// \brief Quoting style
+  QuotingStyle quoting_style = QuotingStyle::Needed;
+
+  /// \brief Quoting style of header
+  ///
+  /// Note that `QuotingStyle::Needed` and `QuotingStyle::AllValid` have the same
+  /// effect of quoting all column names.
+  QuotingStyle quoting_header = QuotingStyle::Needed;
+
+  /// Create write options with default values
+  static WriteOptions Defaults();
+
+  /// \brief Test that all set options are valid
+  Status Validate() const;
+};
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/parser.h b/pyarrow/include/arrow/csv/parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..c73e52ce831ed95b4abe83084b483c15660bae7e
--- /dev/null
+++ b/pyarrow/include/arrow/csv/parser.h
@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/csv/options.h"
+#include "arrow/csv/type_fwd.h"
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace csv {
+
+/// Skip at most num_rows from the given input.  The input pointer is updated
+/// and the number of actually skipped rows is returns (may be less than
+/// requested if the input is too short).
+ARROW_EXPORT
+int32_t SkipRows(const uint8_t* data, uint32_t size, int32_t num_rows,
+                 const uint8_t** out_data);
+
+class BlockParserImpl;
+
+namespace detail {
+
+struct ParsedValueDesc {
+  uint32_t offset : 31;
+  bool quoted : 1;
+};
+
+class ARROW_EXPORT DataBatch {
+ public:
+  explicit DataBatch(int32_t num_cols) : num_cols_(num_cols) {}
+
+  /// \brief Return the number of parsed rows (not skipped)
+  int32_t num_rows() const { return num_rows_; }
+  /// \brief Return the number of parsed columns
+  int32_t num_cols() const { return num_cols_; }
+  /// \brief Return the total size in bytes of parsed data
+  uint32_t num_bytes() const { return parsed_size_; }
+  /// \brief Return the number of skipped rows
+  int32_t num_skipped_rows() const { return static_cast<int32_t>(skipped_rows_.size()); }
+
+  template <typename Visitor>
+  Status VisitColumn(int32_t col_index, int64_t first_row, Visitor&& visit) const {
+    using detail::ParsedValueDesc;
+
+    int32_t batch_row = 0;
+    for (size_t buf_index = 0; buf_index < values_buffers_.size(); ++buf_index) {
+      const auto& values_buffer = values_buffers_[buf_index];
+      const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data());
+      const auto max_pos =
+          static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) - 1;
+      for (int32_t pos = col_index; pos < max_pos; pos += num_cols_, ++batch_row) {
+        auto start = values[pos].offset;
+        auto stop = values[pos + 1].offset;
+        auto quoted = values[pos + 1].quoted;
+        Status status = visit(parsed_ + start, stop - start, quoted);
+        if (ARROW_PREDICT_FALSE(!status.ok())) {
+          return DecorateWithRowNumber(std::move(status), first_row, batch_row);
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename Visitor>
+  Status VisitLastRow(Visitor&& visit) const {
+    using detail::ParsedValueDesc;
+
+    const auto& values_buffer = values_buffers_.back();
+    const auto values = reinterpret_cast<const ParsedValueDesc*>(values_buffer->data());
+    const auto start_pos =
+        static_cast<int32_t>(values_buffer->size() / sizeof(ParsedValueDesc)) -
+        num_cols_ - 1;
+    for (int32_t col_index = 0; col_index < num_cols_; ++col_index) {
+      auto start = values[start_pos + col_index].offset;
+      auto stop = values[start_pos + col_index + 1].offset;
+      auto quoted = values[start_pos + col_index + 1].quoted;
+      ARROW_RETURN_NOT_OK(visit(parsed_ + start, stop - start, quoted));
+    }
+    return Status::OK();
+  }
+
+ protected:
+  Status DecorateWithRowNumber(Status&& status, int64_t first_row,
+                               int32_t batch_row) const {
+    if (first_row >= 0) {
+      // `skipped_rows_` is in ascending order by construction, so use bisection
+      // to find out how many rows were skipped before `batch_row`.
+      const auto skips_before =
+          std::upper_bound(skipped_rows_.begin(), skipped_rows_.end(), batch_row) -
+          skipped_rows_.begin();
+      status = status.WithMessage("Row #", batch_row + skips_before + first_row, ": ",
+                                  status.message());
+    }
+    // Use return_if so that when extra context is enabled it will be added
+    ARROW_RETURN_IF_(true, std::move(status), ARROW_STRINGIFY(status));
+    return std::move(status);
+  }
+
+  // The number of rows in this batch (not including any skipped ones)
+  int32_t num_rows_ = 0;
+  // The number of columns
+  int32_t num_cols_ = 0;
+
+  // XXX should we ensure the parsed buffer is padded with 8 or 16 excess zero bytes?
+  // It may help with null parsing...
+  std::vector<std::shared_ptr<Buffer>> values_buffers_;
+  std::shared_ptr<Buffer> parsed_buffer_;
+  const uint8_t* parsed_ = NULLPTR;
+  int32_t parsed_size_ = 0;
+
+  // Record the current num_rows_ each time a row is skipped
+  std::vector<int32_t> skipped_rows_;
+
+  friend class ::arrow::csv::BlockParserImpl;
+};
+
+}  // namespace detail
+
+constexpr int32_t kMaxParserNumRows = 100000;
+
+/// \class BlockParser
+/// \brief A reusable block-based parser for CSV data
+///
+/// The parser takes a block of CSV data and delimits rows and fields,
+/// unquoting and unescaping them on the fly.  Parsed data is own by the
+/// parser, so the original buffer can be discarded after Parse() returns.
+///
+/// If the block is truncated (i.e. not all data can be parsed), it is up
+/// to the caller to arrange the next block to start with the trailing data.
+/// Also, if the previous block ends with CR (0x0d) and a new block starts
+/// with LF (0x0a), the parser will consider the leading newline as an empty
+/// line; the caller should therefore strip it.
+class ARROW_EXPORT BlockParser {
+ public:
+  explicit BlockParser(ParseOptions options, int32_t num_cols = -1,
+                       int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
+  explicit BlockParser(MemoryPool* pool, ParseOptions options, int32_t num_cols = -1,
+                       int64_t first_row = -1, int32_t max_num_rows = kMaxParserNumRows);
+  ~BlockParser();
+
+  /// \brief Parse a block of data
+  ///
+  /// Parse a block of CSV data, ingesting up to max_num_rows rows.
+  /// The number of bytes actually parsed is returned in out_size.
+  Status Parse(std::string_view data, uint32_t* out_size);
+
+  /// \brief Parse sequential blocks of data
+  ///
+  /// Only the last block is allowed to be truncated.
+  Status Parse(const std::vector<std::string_view>& data, uint32_t* out_size);
+
+  /// \brief Parse the final block of data
+  ///
+  /// Like Parse(), but called with the final block in a file.
+  /// The last row may lack a trailing line separator.
+  Status ParseFinal(std::string_view data, uint32_t* out_size);
+
+  /// \brief Parse the final sequential blocks of data
+  ///
+  /// Only the last block is allowed to be truncated.
+  Status ParseFinal(const std::vector<std::string_view>& data, uint32_t* out_size);
+
+  /// \brief Return the number of parsed rows
+  int32_t num_rows() const { return parsed_batch().num_rows(); }
+  /// \brief Return the number of parsed columns
+  int32_t num_cols() const { return parsed_batch().num_cols(); }
+  /// \brief Return the total size in bytes of parsed data
+  uint32_t num_bytes() const { return parsed_batch().num_bytes(); }
+
+  /// \brief Return the total number of rows including rows which were skipped
+  int32_t total_num_rows() const {
+    return parsed_batch().num_rows() + parsed_batch().num_skipped_rows();
+  }
+
+  /// \brief Return the row number of the first row in the block or -1 if unsupported
+  int64_t first_row_num() const;
+
+  /// \brief Visit parsed values in a column
+  ///
+  /// The signature of the visitor is
+  /// Status(const uint8_t* data, uint32_t size, bool quoted)
+  template <typename Visitor>
+  Status VisitColumn(int32_t col_index, Visitor&& visit) const {
+    return parsed_batch().VisitColumn(col_index, first_row_num(),
+                                      std::forward<Visitor>(visit));
+  }
+
+  template <typename Visitor>
+  Status VisitLastRow(Visitor&& visit) const {
+    return parsed_batch().VisitLastRow(std::forward<Visitor>(visit));
+  }
+
+ protected:
+  std::unique_ptr<BlockParserImpl> impl_;
+
+  const detail::DataBatch& parsed_batch() const;
+};
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/reader.h b/pyarrow/include/arrow/csv/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..bae301dc14815a6fdf9388a08c4f9068155f20a6
--- /dev/null
+++ b/pyarrow/include/arrow/csv/reader.h
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/options.h"  // IWYU pragma: keep
+#include "arrow/io/interfaces.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/future.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+class InputStream;
+}  // namespace io
+
+namespace csv {
+
+/// A class that reads an entire CSV file into a Arrow Table
+class ARROW_EXPORT TableReader {
+ public:
+  virtual ~TableReader() = default;
+
+  /// Read the entire CSV file and convert it to a Arrow Table
+  virtual Result<std::shared_ptr<Table>> Read() = 0;
+  /// Read the entire CSV file and convert it to a Arrow Table
+  virtual Future<std::shared_ptr<Table>> ReadAsync() = 0;
+
+  /// Create a TableReader instance
+  static Result<std::shared_ptr<TableReader>> Make(io::IOContext io_context,
+                                                   std::shared_ptr<io::InputStream> input,
+                                                   const ReadOptions&,
+                                                   const ParseOptions&,
+                                                   const ConvertOptions&);
+};
+
+/// \brief A class that reads a CSV file incrementally
+///
+/// Caveats:
+/// - For now, this is always single-threaded (regardless of `ReadOptions::use_threads`.
+/// - Type inference is done on the first block and types are frozen afterwards;
+///   to make sure the right data types are inferred, either set
+///   `ReadOptions::block_size` to a large enough value, or use
+///   `ConvertOptions::column_types` to set the desired data types explicitly.
+class ARROW_EXPORT StreamingReader : public RecordBatchReader {
+ public:
+  virtual ~StreamingReader() = default;
+
+  virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
+
+  /// \brief Return the number of bytes which have been read and processed
+  ///
+  /// The returned number includes CSV bytes which the StreamingReader has
+  /// finished processing, but not bytes for which some processing (e.g.
+  /// CSV parsing or conversion to Arrow layout) is still ongoing.
+  ///
+  /// Furthermore, the following rules apply:
+  /// - bytes skipped by `ReadOptions.skip_rows` are counted as being read before
+  /// any records are returned.
+  /// - bytes read while parsing the header are counted as being read before any
+  /// records are returned.
+  /// - bytes skipped by `ReadOptions.skip_rows_after_names` are counted after the
+  /// first batch is returned.
+  virtual int64_t bytes_read() const = 0;
+
+  /// Create a StreamingReader instance
+  ///
+  /// This involves some I/O as the first batch must be loaded during the creation process
+  /// so it is returned as a future
+  ///
+  /// Currently, the StreamingReader is not async-reentrant and does not do any fan-out
+  /// parsing (see ARROW-11889)
+  static Future<std::shared_ptr<StreamingReader>> MakeAsync(
+      io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+      arrow::internal::Executor* cpu_executor, const ReadOptions&, const ParseOptions&,
+      const ConvertOptions&);
+
+  static Result<std::shared_ptr<StreamingReader>> Make(
+      io::IOContext io_context, std::shared_ptr<io::InputStream> input,
+      const ReadOptions&, const ParseOptions&, const ConvertOptions&);
+};
+
+/// \brief Count the logical rows of data in a CSV file (i.e. the
+/// number of rows you would get if you read the file into a table).
+ARROW_EXPORT
+Future<int64_t> CountRowsAsync(io::IOContext io_context,
+                               std::shared_ptr<io::InputStream> input,
+                               arrow::internal::Executor* cpu_executor,
+                               const ReadOptions&, const ParseOptions&);
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/test_common.h b/pyarrow/include/arrow/csv/test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..07a41604478e81ac760e8d0b3501ef24996b0a4e
--- /dev/null
+++ b/pyarrow/include/arrow/csv/test_common.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/csv/parser.h"
+#include "arrow/testing/visibility.h"
+
+namespace arrow {
+namespace csv {
+
+ARROW_TESTING_EXPORT
+std::string MakeCSVData(std::vector<std::string> lines);
+
+// Make a BlockParser from a vector of lines representing a CSV file
+ARROW_TESTING_EXPORT
+void MakeCSVParser(std::vector<std::string> lines, ParseOptions options, int32_t num_cols,
+                   MemoryPool* pool, std::shared_ptr<BlockParser>* out);
+
+ARROW_TESTING_EXPORT
+void MakeCSVParser(std::vector<std::string> lines, ParseOptions options,
+                   std::shared_ptr<BlockParser>* out);
+
+ARROW_TESTING_EXPORT
+void MakeCSVParser(std::vector<std::string> lines, std::shared_ptr<BlockParser>* out);
+
+// Make a BlockParser from a vector of strings representing a single CSV column
+ARROW_TESTING_EXPORT
+void MakeColumnParser(std::vector<std::string> items, std::shared_ptr<BlockParser>* out);
+
+ARROW_TESTING_EXPORT
+Result<std::shared_ptr<Buffer>> MakeSampleCsvBuffer(
+    size_t num_rows, std::function<bool(size_t row_num)> is_valid = {});
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/type_fwd.h b/pyarrow/include/arrow/csv/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0a53847a90ddb82067e0c9ac955cf4222c61742
--- /dev/null
+++ b/pyarrow/include/arrow/csv/type_fwd.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+namespace arrow {
+namespace csv {
+
+class TableReader;
+struct ConvertOptions;
+struct ReadOptions;
+struct ParseOptions;
+struct WriteOptions;
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/csv/writer.h b/pyarrow/include/arrow/csv/writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9d79e16608671859357e3adab88416fb0a9d04f
--- /dev/null
+++ b/pyarrow/include/arrow/csv/writer.h
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/csv/options.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/record_batch.h"
+#include "arrow/table.h"
+
+namespace arrow {
+namespace csv {
+
+// Functionality for converting Arrow data to Comma separated value text.
+// This library supports all primitive types that can be cast to a StringArray or
+// a LargeStringArray.
+// It applies to following formatting rules:
+//  - For non-binary types no quotes surround values.  Nulls are represented as the empty
+//  string.
+//  - For binary types all non-null data is quoted (and quotes within data are escaped
+//  with an additional quote).
+//    Null values are empty and unquoted.
+
+/// \defgroup csv-write-functions High-level functions for writing CSV files
+/// @{
+
+/// \brief Convert table to CSV and write the result to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const Table& table, const WriteOptions& options,
+                             arrow::io::OutputStream* output);
+/// \brief Convert batch to CSV and write the result to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const RecordBatch& batch, const WriteOptions& options,
+                             arrow::io::OutputStream* output);
+/// \brief Convert batches read through a RecordBatchReader
+/// to CSV and write the results to output.
+/// Experimental
+ARROW_EXPORT Status WriteCSV(const std::shared_ptr<RecordBatchReader>& reader,
+                             const WriteOptions& options,
+                             arrow::io::OutputStream* output);
+
+/// @}
+
+/// \defgroup csv-writer-factories Functions for creating an incremental CSV writer
+/// @{
+
+/// \brief Create a new CSV writer. User is responsible for closing the
+/// actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
+    std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+    const WriteOptions& options = WriteOptions::Defaults());
+
+/// \brief Create a new CSV writer.
+///
+/// \param[in] sink output stream to write to (does not take ownership)
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<ipc::RecordBatchWriter>> MakeCSVWriter(
+    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+    const WriteOptions& options = WriteOptions::Defaults());
+
+/// @}
+
+}  // namespace csv
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/api.h b/pyarrow/include/arrow/dataset/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..38caa1cff19def66d09d0d6ed25c67ce52259f9a
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/api.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include "arrow/compute/expression.h"
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/discovery.h"
+#include "arrow/dataset/file_base.h"
+#ifdef ARROW_CSV
+#  include "arrow/dataset/file_csv.h"
+#endif
+#ifdef ARROW_JSON
+#  include "arrow/dataset/file_json.h"
+#endif
+#include "arrow/dataset/file_ipc.h"
+#ifdef ARROW_ORC
+#  include "arrow/dataset/file_orc.h"
+#endif
+#ifdef ARROW_PARQUET
+#  include "arrow/dataset/file_parquet.h"
+#endif
+#include "arrow/dataset/scanner.h"
diff --git a/pyarrow/include/arrow/dataset/dataset.h b/pyarrow/include/arrow/dataset/dataset.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c788ef5581c62fe1fc145b289ca74929bd54606
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/dataset.h
@@ -0,0 +1,491 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/expression.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/util/async_generator_fwd.h"
+#include "arrow/util/future.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/mutex.h"
+
+namespace arrow {
+
+namespace internal {
+class Executor;
+}  // namespace internal
+
+namespace dataset {
+
+using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
+
+/// \brief Description of a column to scan
+struct ARROW_DS_EXPORT FragmentSelectionColumn {
+  /// \brief The path to the column to load
+  FieldPath path;
+  /// \brief The type of the column in the dataset schema
+  ///
+  /// A format may choose to ignore this field completely.  For example, when
+  /// reading from IPC the reader can just return the column in the data type
+  /// that is stored on disk.  There is no point in doing anything special.
+  ///
+  /// However, some formats may be capable of casting on the fly.  For example,
+  /// when reading from CSV, if we know the target type of the column, we can
+  /// convert from string to the target type as we read.
+  DataType* requested_type;
+};
+
+/// \brief A list of columns that should be loaded from a fragment
+///
+/// The paths in this selection should be referring to the fragment schema.  This class
+/// contains a virtual destructor as it is expected evolution strategies will need to
+/// extend this to add any information needed to later evolve the batches.
+///
+/// For example, in the basic evolution strategy, we keep track of which columns
+/// were missing from the file so that we can fill those in with null when evolving.
+class ARROW_DS_EXPORT FragmentSelection {
+ public:
+  explicit FragmentSelection(std::vector<FragmentSelectionColumn> columns)
+      : columns_(std::move(columns)) {}
+  virtual ~FragmentSelection() = default;
+  /// The columns that should be loaded from the fragment
+  const std::vector<FragmentSelectionColumn>& columns() const { return columns_; }
+
+ private:
+  std::vector<FragmentSelectionColumn> columns_;
+};
+
+/// \brief Instructions for scanning a particular fragment
+///
+/// The fragment scan request is derived from ScanV2Options.  The main
+/// difference is that the scan options are based on the dataset schema
+/// while the fragment request is based on the fragment schema.
+struct ARROW_DS_EXPORT FragmentScanRequest {
+  /// \brief A row filter
+  ///
+  /// The filter expression should be written against the fragment schema.
+  ///
+  /// \see ScanV2Options for details on how this filter should be applied
+  compute::Expression filter = compute::literal(true);
+
+  /// \brief The columns to scan
+  ///
+  /// These indices refer to the fragment schema
+  ///
+  /// Note: This is NOT a simple list of top-level column indices.
+  /// For more details \see ScanV2Options
+  ///
+  /// If possible a fragment should only read from disk the data needed
+  /// to satisfy these columns.  If a format cannot partially read a nested
+  /// column (e.g. JSON) then it must apply the column selection (in memory)
+  /// before returning the scanned batch.
+  std::shared_ptr<FragmentSelection> fragment_selection;
+  /// \brief Options specific to the format being scanned
+  const FragmentScanOptions* format_scan_options;
+};
+
+/// \brief An iterator-like object that can yield batches created from a fragment
+class ARROW_DS_EXPORT FragmentScanner {
+ public:
+  /// This instance will only be destroyed after all ongoing scan futures
+  /// have been completed.
+  ///
+  /// This means any callbacks created as part of the scan can safely
+  /// capture `this`
+  virtual ~FragmentScanner() = default;
+  /// \brief Scan a batch of data from the file
+  /// \param batch_number The index of the batch to read
+  virtual Future<std::shared_ptr<RecordBatch>> ScanBatch(int batch_number) = 0;
+  /// \brief Calculate an estimate of how many data bytes the given batch will represent
+  ///
+  /// "Data bytes" should be the total size of all the buffers once the data has been
+  /// decoded into the Arrow format.
+  virtual int64_t EstimatedDataBytes(int batch_number) = 0;
+  /// \brief The number of batches in the fragment to scan
+  virtual int NumBatches() = 0;
+};
+
+/// \brief Information learned about a fragment through inspection
+///
+/// This information can be used to figure out which fields need
+/// to be read from a file and how the data read in should be evolved
+/// to match the dataset schema.
+///
+/// For example, from a CSV file we can inspect and learn the column
+/// names and use those column names to determine which columns to load
+/// from the CSV file.
+struct ARROW_DS_EXPORT InspectedFragment {
+  explicit InspectedFragment(std::vector<std::string> column_names)
+      : column_names(std::move(column_names)) {}
+  std::vector<std::string> column_names;
+};
+
+/// \brief A granular piece of a Dataset, such as an individual file.
+///
+/// A Fragment can be read/scanned separately from other fragments. It yields a
+/// collection of RecordBatches when scanned
+///
+/// Note that Fragments have well defined physical schemas which are reconciled by
+/// the Datasets which contain them; these physical schemas may differ from a parent
+/// Dataset's schema and the physical schemas of sibling Fragments.
+class ARROW_DS_EXPORT Fragment : public std::enable_shared_from_this<Fragment> {
+ public:
+  /// \brief An expression that represents no known partition information
+  static const compute::Expression kNoPartitionInformation;
+
+  /// \brief Return the physical schema of the Fragment.
+  ///
+  /// The physical schema is also called the writer schema.
+  /// This method is blocking and may suffer from high latency filesystem.
+  /// The schema is cached after being read once, or may be specified at construction.
+  Result<std::shared_ptr<Schema>> ReadPhysicalSchema();
+
+  /// An asynchronous version of Scan
+  virtual Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options) = 0;
+
+  /// \brief Inspect a fragment to learn basic information
+  ///
+  /// This will be called before a scan and a fragment should attach whatever
+  /// information will be needed to figure out an evolution strategy.  This information
+  /// will then be passed to the call to BeginScan
+  virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+      const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
+
+  /// \brief Start a scan operation
+  virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
+      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
+      const FragmentScanOptions* format_options, compute::ExecContext* exec_context);
+
+  /// \brief Count the number of rows in this fragment matching the filter using metadata
+  /// only. That is, this method may perform I/O, but will not load data.
+  ///
+  /// If this is not possible, resolve with an empty optional. The fragment can perform
+  /// I/O (e.g. to read metadata) before it deciding whether it can satisfy the request.
+  virtual Future<std::optional<int64_t>> CountRows(
+      compute::Expression predicate, const std::shared_ptr<ScanOptions>& options);
+
+  /// \brief Clear any metadata that may have been cached by this object.
+  ///
+  /// A fragment may typically cache metadata to speed up repeated accesses.
+  /// In use cases when memory use is more critical than CPU time, calling
+  /// this function can help reclaim memory.
+  virtual Status ClearCachedMetadata();
+
+  virtual std::string type_name() const = 0;
+  virtual std::string ToString() const { return type_name(); }
+
+  /// \brief An expression which evaluates to true for all data viewed by this
+  /// Fragment.
+  const compute::Expression& partition_expression() const {
+    return partition_expression_;
+  }
+
+  virtual ~Fragment() = default;
+
+ protected:
+  Fragment() = default;
+  explicit Fragment(compute::Expression partition_expression,
+                    std::shared_ptr<Schema> physical_schema);
+
+  virtual Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() = 0;
+
+  util::Mutex physical_schema_mutex_;
+  compute::Expression partition_expression_ = compute::literal(true);
+  // The physical schema that is inferred from the Fragment
+  std::shared_ptr<Schema> physical_schema_;
+  // The physical schema that was passed to the Fragment constructor
+  std::shared_ptr<Schema> given_physical_schema_;
+};
+
+/// \brief Per-scan options for fragment(s) in a dataset.
+///
+/// These options are not intrinsic to the format or fragment itself, but do affect
+/// the results of a scan. These are options which make sense to change between
+/// repeated reads of the same dataset, such as format-specific conversion options
+/// (that do not affect the schema).
+///
+/// \ingroup dataset-scanning
+class ARROW_DS_EXPORT FragmentScanOptions {
+ public:
+  virtual std::string type_name() const = 0;
+  virtual std::string ToString() const { return type_name(); }
+  virtual ~FragmentScanOptions() = default;
+};
+
+/// \defgroup dataset-implementations Concrete implementations
+///
+/// @{
+
+/// \brief A trivial Fragment that yields ScanTask out of a fixed set of
+/// RecordBatch.
+class ARROW_DS_EXPORT InMemoryFragment : public Fragment {
+ public:
+  class Scanner;
+  InMemoryFragment(std::shared_ptr<Schema> schema, RecordBatchVector record_batches,
+                   compute::Expression = compute::literal(true));
+  explicit InMemoryFragment(RecordBatchVector record_batches,
+                            compute::Expression = compute::literal(true));
+
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options) override;
+  Future<std::optional<int64_t>> CountRows(
+      compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options) override;
+
+  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) override;
+  Future<std::shared_ptr<FragmentScanner>> BeginScan(
+      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) override;
+
+  std::string type_name() const override { return "in-memory"; }
+
+ protected:
+  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
+
+  RecordBatchVector record_batches_;
+};
+
+/// @}
+
+using FragmentGenerator = AsyncGenerator<std::shared_ptr<Fragment>>;
+
+/// \brief Rules for converting the dataset schema to and from fragment schemas
+class ARROW_DS_EXPORT FragmentEvolutionStrategy {
+ public:
+  /// This instance will only be destroyed when all scan operations for the
+  /// fragment have completed.
+  virtual ~FragmentEvolutionStrategy() = default;
+  /// \brief A guarantee that applies to all batches of this fragment
+  ///
+  /// For example, if a fragment is missing one of the fields in the dataset
+  /// schema then a typical evolution strategy is to set that field to null.
+  ///
+  /// So if the column at index 3 is missing then the guarantee is
+  /// FieldRef(3) == null
+  ///
+  /// Individual field guarantees should be AND'd together and returned
+  /// as a single expression.
+  virtual Result<compute::Expression> GetGuarantee(
+      const std::vector<FieldPath>& dataset_schema_selection) const = 0;
+
+  /// \brief Return a fragment schema selection given a dataset schema selection
+  ///
+  /// For example, if the user wants fields 2 & 4 of the dataset schema and
+  /// in this fragment the field 2 is missing and the field 4 is at index 1 then
+  /// this should return {1}
+  virtual Result<std::unique_ptr<FragmentSelection>> DevolveSelection(
+      const std::vector<FieldPath>& dataset_schema_selection) const = 0;
+
+  /// \brief Return a filter expression bound to the fragment schema given
+  ///        a filter expression bound to the dataset schema
+  ///
+  /// The dataset scan filter will first be simplified by the guarantee returned
+  /// by GetGuarantee.  This means an evolution that only handles dropping or casting
+  /// fields doesn't need to do anything here except return the given filter.
+  ///
+  /// On the other hand, an evolution that is doing some kind of aliasing will likely
+  /// need to convert field references in the filter to the aliased field references
+  /// where appropriate.
+  virtual Result<compute::Expression> DevolveFilter(
+      const compute::Expression& filter) const = 0;
+
+  /// \brief Convert a batch from the fragment schema to the dataset schema
+  ///
+  /// Typically this involves casting columns from the data type stored on disk
+  /// to the data type of the dataset schema.  For example, this fragment might
+  /// have columns stored as int32 and the dataset schema might have int64 for
+  /// the column.  In this case we should cast the column from int32 to int64.
+  ///
+  /// Note: A fragment may perform this cast as the data is read from disk.  In
+  /// that case a cast might not be needed.
+  virtual Result<compute::ExecBatch> EvolveBatch(
+      const std::shared_ptr<RecordBatch>& batch,
+      const std::vector<FieldPath>& dataset_selection,
+      const FragmentSelection& selection) const = 0;
+
+  /// \brief Return a string description of this strategy
+  virtual std::string ToString() const = 0;
+};
+
+/// \brief Lookup to create a FragmentEvolutionStrategy for a given fragment
+class ARROW_DS_EXPORT DatasetEvolutionStrategy {
+ public:
+  virtual ~DatasetEvolutionStrategy() = default;
+  /// \brief Create a strategy for evolving from the given fragment
+  ///        to the schema of the given dataset
+  virtual std::unique_ptr<FragmentEvolutionStrategy> GetStrategy(
+      const Dataset& dataset, const Fragment& fragment,
+      const InspectedFragment& inspected_fragment) = 0;
+
+  /// \brief Return a string description of this strategy
+  virtual std::string ToString() const = 0;
+};
+
+ARROW_DS_EXPORT std::unique_ptr<DatasetEvolutionStrategy>
+MakeBasicDatasetEvolutionStrategy();
+
+/// \brief A container of zero or more Fragments.
+///
+/// A Dataset acts as a union of Fragments, e.g. files deeply nested in a
+/// directory. A Dataset has a schema to which Fragments must align during a
+/// scan operation. This is analogous to Avro's reader and writer schema.
+class ARROW_DS_EXPORT Dataset : public std::enable_shared_from_this<Dataset> {
+ public:
+  /// \brief Begin to build a new Scan operation against this Dataset
+  Result<std::shared_ptr<ScannerBuilder>> NewScan();
+
+  /// \brief GetFragments returns an iterator of Fragments given a predicate.
+  Result<FragmentIterator> GetFragments(compute::Expression predicate);
+  Result<FragmentIterator> GetFragments();
+
+  /// \brief Async versions of `GetFragments`.
+  Result<FragmentGenerator> GetFragmentsAsync(compute::Expression predicate);
+  Result<FragmentGenerator> GetFragmentsAsync();
+
+  const std::shared_ptr<Schema>& schema() const { return schema_; }
+
+  /// \brief An expression which evaluates to true for all data viewed by this Dataset.
+  /// May be null, which indicates no information is available.
+  const compute::Expression& partition_expression() const {
+    return partition_expression_;
+  }
+
+  /// \brief The name identifying the kind of Dataset
+  virtual std::string type_name() const = 0;
+
+  /// \brief Return a copy of this Dataset with a different schema.
+  ///
+  /// The copy will view the same Fragments. If the new schema is not compatible with the
+  /// original dataset's schema then an error will be raised.
+  virtual Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const = 0;
+
+  /// \brief Rules used by this dataset to handle schema evolution
+  DatasetEvolutionStrategy* evolution_strategy() { return evolution_strategy_.get(); }
+
+  virtual ~Dataset() = default;
+
+ protected:
+  explicit Dataset(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
+
+  Dataset(std::shared_ptr<Schema> schema, compute::Expression partition_expression);
+
+  virtual Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) = 0;
+  /// \brief Default non-virtual implementation method for the base
+  /// `GetFragmentsAsyncImpl` method, which creates a fragment generator for
+  /// the dataset, possibly filtering results with a predicate (forwarding to
+  /// the synchronous `GetFragmentsImpl` method and moving the computations
+  /// to the background, using the IO thread pool).
+  ///
+  /// Currently, `executor` is always the same as `internal::GetCPUThreadPool()`,
+  /// which means the results from the underlying fragment generator will be
+  /// transferred to the default CPU thread pool. The generator itself is
+  /// offloaded to run on the default IO thread pool.
+  virtual Result<FragmentGenerator> GetFragmentsAsyncImpl(
+      compute::Expression predicate, arrow::internal::Executor* executor);
+
+  std::shared_ptr<Schema> schema_;
+  compute::Expression partition_expression_ = compute::literal(true);
+  std::unique_ptr<DatasetEvolutionStrategy> evolution_strategy_ =
+      MakeBasicDatasetEvolutionStrategy();
+};
+
+/// \addtogroup dataset-implementations
+///
+/// @{
+
+/// \brief A Source which yields fragments wrapping a stream of record batches.
+///
+/// The record batches must match the schema provided to the source at construction.
+class ARROW_DS_EXPORT InMemoryDataset : public Dataset {
+ public:
+  class RecordBatchGenerator {
+   public:
+    virtual ~RecordBatchGenerator() = default;
+    virtual RecordBatchIterator Get() const = 0;
+  };
+
+  /// Construct a dataset from a schema and a factory of record batch iterators.
+  InMemoryDataset(std::shared_ptr<Schema> schema,
+                  std::shared_ptr<RecordBatchGenerator> get_batches)
+      : Dataset(std::move(schema)), get_batches_(std::move(get_batches)) {}
+
+  /// Convenience constructor taking a fixed list of batches
+  InMemoryDataset(std::shared_ptr<Schema> schema, RecordBatchVector batches);
+
+  /// Convenience constructor taking a Table
+  explicit InMemoryDataset(std::shared_ptr<Table> table);
+
+  std::string type_name() const override { return "in-memory"; }
+
+  Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const override;
+
+ protected:
+  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
+
+  std::shared_ptr<RecordBatchGenerator> get_batches_;
+};
+
+/// \brief A Dataset wrapping child Datasets.
+class ARROW_DS_EXPORT UnionDataset : public Dataset {
+ public:
+  /// \brief Construct a UnionDataset wrapping child Datasets.
+  ///
+  /// \param[in] schema the schema of the resulting dataset.
+  /// \param[in] children one or more child Datasets. Their schemas must be identical to
+  /// schema.
+  static Result<std::shared_ptr<UnionDataset>> Make(std::shared_ptr<Schema> schema,
+                                                    DatasetVector children);
+
+  const DatasetVector& children() const { return children_; }
+
+  std::string type_name() const override { return "union"; }
+
+  Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const override;
+
+ protected:
+  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
+
+  explicit UnionDataset(std::shared_ptr<Schema> schema, DatasetVector children)
+      : Dataset(std::move(schema)), children_(std::move(children)) {}
+
+  DatasetVector children_;
+
+  friend class UnionDatasetFactory;
+};
+
+/// @}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/dataset_writer.h b/pyarrow/include/arrow/dataset/dataset_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..edb1649b5f196aa3c6cd923c9e6540c4173fc102
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/dataset_writer.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/dataset/file_base.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/util/async_util.h"
+#include "arrow/util/future.h"
+
+namespace arrow {
+namespace dataset {
+namespace internal {
+
+// This lines up with our other defaults in the scanner and execution plan
+constexpr uint64_t kDefaultDatasetWriterMaxRowsQueued = 8 * 1024 * 1024;
+
+/// \brief Utility class that manages a set of writers to different paths
+///
+/// Writers may be closed and reopened (and a new file created) based on the dataset
+/// write options (for example, max_rows_per_file or max_open_files)
+///
+/// The dataset writer enforces its own back pressure based on the # of rows (as opposed
+/// to # of batches which is how it is typically enforced elsewhere) and # of files.
+class ARROW_DS_EXPORT DatasetWriter {
+ public:
+  /// \brief Create a dataset writer
+  ///
+  /// Will fail if basename_template is invalid or if there is existing data and
+  /// existing_data_behavior is kError
+  ///
+  /// \param write_options options to control how the data should be written
+  /// \param max_rows_queued max # of rows allowed to be queued before the dataset_writer
+  ///                        will ask for backpressure
+  static Result<std::unique_ptr<DatasetWriter>> Make(
+      FileSystemDatasetWriteOptions write_options, util::AsyncTaskScheduler* scheduler,
+      std::function<void()> pause_callback, std::function<void()> resume_callback,
+      std::function<void()> finish_callback,
+      uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
+
+  ~DatasetWriter();
+
+  /// \brief Write a batch to the dataset
+  /// \param[in] batch The batch to write
+  /// \param[in] directory The directory to write to
+  ///
+  /// Note: The written filename will be {directory}/{filename_factory(i)} where i is a
+  /// counter controlled by `max_open_files` and `max_rows_per_file`
+  ///
+  /// If multiple WriteRecordBatch calls arrive with the same `directory` then the batches
+  /// may be written to the same file.
+  ///
+  /// The returned future will be marked finished when the record batch has been queued
+  /// to be written.  If the returned future is unfinished then this indicates the dataset
+  /// writer's queue is full and the data provider should pause.
+  ///
+  /// This method is NOT async reentrant.  The returned future will only be unfinished
+  /// if back pressure needs to be applied.  Async reentrancy is not necessary for
+  /// concurrent writes to happen.  Calling this method again before the previous future
+  /// completes will not just violate max_rows_queued but likely lead to race conditions.
+  ///
+  /// One thing to note is that the ordering of your data can affect your maximum
+  /// potential parallelism.  If this seems odd then consider a dataset where the first
+  /// 1000 batches go to the same directory and then the 1001st batch goes to a different
+  /// directory.  The only way to get two parallel writes immediately would be to queue
+  /// all 1000 pending writes to the first directory.
+  void WriteRecordBatch(std::shared_ptr<RecordBatch> batch, const std::string& directory,
+                        const std::string& prefix = "");
+
+  /// Finish all pending writes and close any open files
+  void Finish();
+
+ protected:
+  DatasetWriter(FileSystemDatasetWriteOptions write_options,
+                util::AsyncTaskScheduler* scheduler, std::function<void()> pause_callback,
+                std::function<void()> resume_callback,
+                std::function<void()> finish_callback,
+                uint64_t max_rows_queued = kDefaultDatasetWriterMaxRowsQueued);
+
+  class DatasetWriterImpl;
+  std::unique_ptr<DatasetWriterImpl> impl_;
+};
+
+}  // namespace internal
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/discovery.h b/pyarrow/include/arrow/dataset/discovery.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d76dcef727e7643ba559d8802665755a4f8a870
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/discovery.h
@@ -0,0 +1,275 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Logic for automatically determining the structure of multi-file
+/// dataset with possible partitioning according to available
+/// partitioning
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <variant>
+#include <vector>
+
+#include "arrow/dataset/partition.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/filesystem/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace dataset {
+
+/// \defgroup dataset-discovery Discovery API
+///
+/// @{
+
+struct InspectOptions {
+  /// See `fragments` property.
+  static constexpr int kInspectAllFragments = -1;
+
+  /// Indicate how many fragments should be inspected to infer the unified dataset
+  /// schema. Limiting the number of fragments accessed improves the latency of
+  /// the discovery process when dealing with a high number of fragments and/or
+  /// high latency file systems.
+  ///
+  /// The default value of `1` inspects the schema of the first (in no particular
+  /// order) fragment only. If the dataset has a uniform schema for all fragments,
+  /// this default is the optimal value. In order to inspect all fragments and
+  /// robustly unify their potentially varying schemas, set this option to
+  /// `kInspectAllFragments`. A value of `0` disables inspection of fragments
+  /// altogether so only the partitioning schema will be inspected.
+  int fragments = 1;
+
+  /// Control how to unify types. By default, types are merged strictly (the
+  /// type must match exactly, except nulls can be merged with other types).
+  Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
+};
+
+struct FinishOptions {
+  /// Finalize the dataset with this given schema. If the schema is not
+  /// provided, infer the schema via the Inspect, see the `inspect_options`
+  /// property.
+  std::shared_ptr<Schema> schema = NULLPTR;
+
+  /// If the schema is not provided, it will be discovered by passing the
+  /// following options to `DatasetDiscovery::Inspect`.
+  InspectOptions inspect_options{};
+
+  /// Indicate if the given Schema (when specified), should be validated against
+  /// the fragments' schemas. `inspect_options` will control how many fragments
+  /// are checked.
+  bool validate_fragments = false;
+};
+
+/// \brief DatasetFactory provides a way to inspect/discover a Dataset's expected
+/// schema before materializing said Dataset.
+class ARROW_DS_EXPORT DatasetFactory {
+ public:
+  /// \brief Get the schemas of the Fragments and Partitioning.
+  virtual Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
+      InspectOptions options) = 0;
+
+  /// \brief Get unified schema for the resulting Dataset.
+  Result<std::shared_ptr<Schema>> Inspect(InspectOptions options = {});
+
+  /// \brief Create a Dataset
+  Result<std::shared_ptr<Dataset>> Finish();
+  /// \brief Create a Dataset with the given schema (see \a InspectOptions::schema)
+  Result<std::shared_ptr<Dataset>> Finish(std::shared_ptr<Schema> schema);
+  /// \brief Create a Dataset with the given options
+  virtual Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) = 0;
+
+  /// \brief Optional root partition for the resulting Dataset.
+  const compute::Expression& root_partition() const { return root_partition_; }
+  /// \brief Set the root partition for the resulting Dataset.
+  Status SetRootPartition(compute::Expression partition) {
+    root_partition_ = std::move(partition);
+    return Status::OK();
+  }
+
+  virtual ~DatasetFactory() = default;
+
+ protected:
+  DatasetFactory();
+
+  compute::Expression root_partition_;
+};
+
+/// @}
+
+/// \brief DatasetFactory provides a way to inspect/discover a Dataset's
+/// expected schema before materialization.
+/// \ingroup dataset-implementations
+class ARROW_DS_EXPORT UnionDatasetFactory : public DatasetFactory {
+ public:
+  static Result<std::shared_ptr<DatasetFactory>> Make(
+      std::vector<std::shared_ptr<DatasetFactory>> factories);
+
+  /// \brief Return the list of child DatasetFactory
+  const std::vector<std::shared_ptr<DatasetFactory>>& factories() const {
+    return factories_;
+  }
+
+  /// \brief Get the schemas of the Datasets.
+  ///
+  /// Instead of applying options globally, it applies at each child factory.
+  /// This will not respect `options.fragments` exactly, but will respect the
+  /// spirit of peeking the first fragments or all of them.
+  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
+      InspectOptions options) override;
+
+  /// \brief Create a Dataset.
+  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
+
+ protected:
+  explicit UnionDatasetFactory(std::vector<std::shared_ptr<DatasetFactory>> factories);
+
+  std::vector<std::shared_ptr<DatasetFactory>> factories_;
+};
+
+/// \ingroup dataset-filesystem
+struct FileSystemFactoryOptions {
+  /// Either an explicit Partitioning or a PartitioningFactory to discover one.
+  ///
+  /// If a factory is provided, it will be used to infer a schema for partition fields
+  /// based on file and directory paths then construct a Partitioning. The default
+  /// is a Partitioning which will yield no partition information.
+  ///
+  /// The (explicit or discovered) partitioning will be applied to discovered files
+  /// and the resulting partition information embedded in the Dataset.
+  PartitioningOrFactory partitioning{Partitioning::Default()};
+
+  /// For the purposes of applying the partitioning, paths will be stripped
+  /// of the partition_base_dir. Files not matching the partition_base_dir
+  /// prefix will be skipped for partition discovery. The ignored files will still
+  /// be part of the Dataset, but will not have partition information.
+  ///
+  /// Example:
+  /// partition_base_dir = "/dataset";
+  ///
+  /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
+  ///
+  /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
+  ///
+  /// This is useful for partitioning which parses directory when ordering
+  /// is important, e.g. DirectoryPartitioning.
+  std::string partition_base_dir;
+
+  /// Invalid files (via selector or explicitly) will be excluded by checking
+  /// with the FileFormat::IsSupported method.  This will incur IO for each files
+  /// in a serial and single threaded fashion. Disabling this feature will skip the
+  /// IO, but unsupported files may be present in the Dataset
+  /// (resulting in an error at scan time).
+  bool exclude_invalid_files = false;
+
+  /// When discovering from a Selector (and not from an explicit file list), ignore
+  /// files and directories matching any of these prefixes.
+  ///
+  /// Example (with selector = "/dataset/**"):
+  /// selector_ignore_prefixes = {"_", ".DS_STORE" };
+  ///
+  /// - "/dataset/data.csv" -> not ignored
+  /// - "/dataset/_metadata" -> ignored
+  /// - "/dataset/.DS_STORE" -> ignored
+  /// - "/dataset/_hidden/dat" -> ignored
+  /// - "/dataset/nested/.DS_STORE" -> ignored
+  std::vector<std::string> selector_ignore_prefixes = {
+      ".",
+      "_",
+  };
+};
+
+/// \brief FileSystemDatasetFactory creates a Dataset from a vector of
+/// fs::FileInfo or a fs::FileSelector.
+/// \ingroup dataset-filesystem
+class ARROW_DS_EXPORT FileSystemDatasetFactory : public DatasetFactory {
+ public:
+  /// \brief Build a FileSystemDatasetFactory from an explicit list of
+  /// paths.
+  ///
+  /// \param[in] filesystem passed to FileSystemDataset
+  /// \param[in] paths passed to FileSystemDataset
+  /// \param[in] format passed to FileSystemDataset
+  /// \param[in] options see FileSystemFactoryOptions for more information.
+  static Result<std::shared_ptr<DatasetFactory>> Make(
+      std::shared_ptr<fs::FileSystem> filesystem, const std::vector<std::string>& paths,
+      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
+
+  /// \brief Build a FileSystemDatasetFactory from a fs::FileSelector.
+  ///
+  /// The selector will expand to a vector of FileInfo. The expansion/crawling
+  /// is performed in this function call. Thus, the finalized Dataset is
+  /// working with a snapshot of the filesystem.
+  //
+  /// If options.partition_base_dir is not provided, it will be overwritten
+  /// with selector.base_dir.
+  ///
+  /// \param[in] filesystem passed to FileSystemDataset
+  /// \param[in] selector used to crawl and search files
+  /// \param[in] format passed to FileSystemDataset
+  /// \param[in] options see FileSystemFactoryOptions for more information.
+  static Result<std::shared_ptr<DatasetFactory>> Make(
+      std::shared_ptr<fs::FileSystem> filesystem, fs::FileSelector selector,
+      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
+
+  /// \brief Build a FileSystemDatasetFactory from an uri including filesystem
+  /// information.
+  ///
+  /// \param[in] uri passed to FileSystemDataset
+  /// \param[in] format passed to FileSystemDataset
+  /// \param[in] options see FileSystemFactoryOptions for more information.
+  static Result<std::shared_ptr<DatasetFactory>> Make(std::string uri,
+                                                      std::shared_ptr<FileFormat> format,
+                                                      FileSystemFactoryOptions options);
+
+  /// \brief Build a FileSystemDatasetFactory from an explicit list of
+  /// file information.
+  ///
+  /// \param[in] filesystem passed to FileSystemDataset
+  /// \param[in] files passed to FileSystemDataset
+  /// \param[in] format passed to FileSystemDataset
+  /// \param[in] options see FileSystemFactoryOptions for more information.
+  static Result<std::shared_ptr<DatasetFactory>> Make(
+      std::shared_ptr<fs::FileSystem> filesystem, const std::vector<fs::FileInfo>& files,
+      std::shared_ptr<FileFormat> format, FileSystemFactoryOptions options);
+
+  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
+      InspectOptions options) override;
+
+  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
+
+ protected:
+  FileSystemDatasetFactory(std::vector<fs::FileInfo> files,
+                           std::shared_ptr<fs::FileSystem> filesystem,
+                           std::shared_ptr<FileFormat> format,
+                           FileSystemFactoryOptions options);
+
+  Result<std::shared_ptr<Schema>> PartitionSchema();
+
+  std::vector<fs::FileInfo> files_;
+  std::shared_ptr<fs::FileSystem> fs_;
+  std::shared_ptr<FileFormat> format_;
+  FileSystemFactoryOptions options_;
+};
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/file_base.h b/pyarrow/include/arrow/dataset/file_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..e13c1312a479f57047b54cd38de680fd3c5a2d0f
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/file_base.h
@@ -0,0 +1,499 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/partition.h"
+#include "arrow/dataset/scanner.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/io/file.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compression.h"
+
+namespace arrow {
+
+namespace dataset {
+
+/// \defgroup dataset-file-formats File formats for reading and writing datasets
+/// \defgroup dataset-filesystem File system datasets
+///
+/// @{
+
+/// \brief The path and filesystem where an actual file is located or a buffer which can
+/// be read like a file
+class ARROW_DS_EXPORT FileSource : public util::EqualityComparable<FileSource> {
+ public:
+  FileSource(std::string path, std::shared_ptr<fs::FileSystem> filesystem,
+             Compression::type compression = Compression::UNCOMPRESSED)
+      : file_info_(std::move(path)),
+        filesystem_(std::move(filesystem)),
+        compression_(compression) {}
+
+  FileSource(fs::FileInfo info, std::shared_ptr<fs::FileSystem> filesystem,
+             Compression::type compression = Compression::UNCOMPRESSED)
+      : file_info_(std::move(info)),
+        filesystem_(std::move(filesystem)),
+        compression_(compression) {}
+
+  explicit FileSource(std::shared_ptr<Buffer> buffer,
+                      Compression::type compression = Compression::UNCOMPRESSED)
+      : buffer_(std::move(buffer)), compression_(compression) {}
+
+  using CustomOpen = std::function<Result<std::shared_ptr<io::RandomAccessFile>>()>;
+  FileSource(CustomOpen open, int64_t size)
+      : custom_open_(std::move(open)), custom_size_(size) {}
+
+  using CustomOpenWithCompression =
+      std::function<Result<std::shared_ptr<io::RandomAccessFile>>(Compression::type)>;
+  FileSource(CustomOpenWithCompression open_with_compression, int64_t size,
+             Compression::type compression = Compression::UNCOMPRESSED)
+      : custom_open_(std::bind(std::move(open_with_compression), compression)),
+        custom_size_(size),
+        compression_(compression) {}
+
+  FileSource(std::shared_ptr<io::RandomAccessFile> file, int64_t size,
+             Compression::type compression = Compression::UNCOMPRESSED)
+      : custom_open_([=] { return ToResult(file); }),
+        custom_size_(size),
+        compression_(compression) {}
+
+  explicit FileSource(std::shared_ptr<io::RandomAccessFile> file,
+                      Compression::type compression = Compression::UNCOMPRESSED);
+
+  FileSource() : custom_open_(CustomOpen{&InvalidOpen}) {}
+
+  static std::vector<FileSource> FromPaths(const std::shared_ptr<fs::FileSystem>& fs,
+                                           std::vector<std::string> paths) {
+    std::vector<FileSource> sources;
+    for (auto&& path : paths) {
+      sources.emplace_back(std::move(path), fs);
+    }
+    return sources;
+  }
+
+  /// \brief Return the type of raw compression on the file, if any.
+  Compression::type compression() const { return compression_; }
+
+  /// \brief Return the file path, if any. Only valid when file source wraps a path.
+  const std::string& path() const {
+    static std::string buffer_path = "<Buffer>";
+    static std::string custom_open_path = "<Buffer>";
+    return filesystem_ ? file_info_.path() : buffer_ ? buffer_path : custom_open_path;
+  }
+
+  /// \brief Return the filesystem, if any. Otherwise returns nullptr
+  const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
+
+  /// \brief Return the buffer containing the file, if any. Otherwise returns nullptr
+  const std::shared_ptr<Buffer>& buffer() const { return buffer_; }
+
+  /// \brief Get a RandomAccessFile which views this file source
+  Result<std::shared_ptr<io::RandomAccessFile>> Open() const;
+  Future<std::shared_ptr<io::RandomAccessFile>> OpenAsync() const;
+
+  /// \brief Get the size (in bytes) of the file or buffer
+  /// If the file is compressed this should be the compressed (on-disk) size.
+  int64_t Size() const;
+
+  /// \brief Get an InputStream which views this file source (and decompresses if needed)
+  /// \param[in] compression If nullopt, guess the compression scheme from the
+  ///     filename, else decompress with the given codec
+  Result<std::shared_ptr<io::InputStream>> OpenCompressed(
+      std::optional<Compression::type> compression = std::nullopt) const;
+
+  /// \brief equality comparison with another FileSource
+  bool Equals(const FileSource& other) const;
+
+ private:
+  static Result<std::shared_ptr<io::RandomAccessFile>> InvalidOpen() {
+    return Status::Invalid("Called Open() on an uninitialized FileSource");
+  }
+
+  fs::FileInfo file_info_;
+  std::shared_ptr<fs::FileSystem> filesystem_;
+  std::shared_ptr<Buffer> buffer_;
+  CustomOpen custom_open_;
+  int64_t custom_size_ = 0;
+  Compression::type compression_ = Compression::UNCOMPRESSED;
+};
+
+/// \brief Base class for file format implementation
+class ARROW_DS_EXPORT FileFormat : public std::enable_shared_from_this<FileFormat> {
+ public:
+  /// Options affecting how this format is scanned.
+  ///
+  /// The options here can be overridden at scan time.
+  std::shared_ptr<FragmentScanOptions> default_fragment_scan_options;
+
+  virtual ~FileFormat() = default;
+
+  /// \brief The name identifying the kind of file format
+  virtual std::string type_name() const = 0;
+
+  virtual bool Equals(const FileFormat& other) const = 0;
+
+  /// \brief Indicate if the FileSource is supported/readable by this format.
+  virtual Result<bool> IsSupported(const FileSource& source) const = 0;
+
+  /// \brief Return the schema of the file if possible.
+  virtual Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const = 0;
+
+  /// \brief Learn what we need about the file before we start scanning it
+  virtual Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+      const FileSource& source, const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) const;
+
+  virtual Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<FileFragment>& file) const = 0;
+
+  virtual Future<std::optional<int64_t>> CountRows(
+      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options);
+
+  virtual Future<std::shared_ptr<FragmentScanner>> BeginScan(
+      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) const;
+
+  /// \brief Open a fragment
+  virtual Result<std::shared_ptr<FileFragment>> MakeFragment(
+      FileSource source, compute::Expression partition_expression,
+      std::shared_ptr<Schema> physical_schema);
+
+  /// \brief Create a FileFragment for a FileSource.
+  Result<std::shared_ptr<FileFragment>> MakeFragment(
+      FileSource source, compute::Expression partition_expression);
+
+  /// \brief Create a FileFragment for a FileSource.
+  Result<std::shared_ptr<FileFragment>> MakeFragment(
+      FileSource source, std::shared_ptr<Schema> physical_schema = NULLPTR);
+
+  /// \brief Create a writer for this format.
+  virtual Result<std::shared_ptr<FileWriter>> MakeWriter(
+      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
+      std::shared_ptr<FileWriteOptions> options,
+      fs::FileLocator destination_locator) const = 0;
+
+  /// \brief Get default write options for this format.
+  ///
+  /// May return null shared_ptr if this file format does not yet support
+  /// writing datasets.
+  virtual std::shared_ptr<FileWriteOptions> DefaultWriteOptions() = 0;
+
+ protected:
+  explicit FileFormat(std::shared_ptr<FragmentScanOptions> default_fragment_scan_options)
+      : default_fragment_scan_options(std::move(default_fragment_scan_options)) {}
+};
+
+/// \brief A Fragment that is stored in a file with a known format
+class ARROW_DS_EXPORT FileFragment : public Fragment,
+                                     public util::EqualityComparable<FileFragment> {
+ public:
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options) override;
+  Future<std::optional<int64_t>> CountRows(
+      compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options) override;
+  Future<std::shared_ptr<FragmentScanner>> BeginScan(
+      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) override;
+  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) override;
+
+  std::string type_name() const override { return format_->type_name(); }
+  std::string ToString() const override { return source_.path(); };
+
+  const FileSource& source() const { return source_; }
+  const std::shared_ptr<FileFormat>& format() const { return format_; }
+
+  bool Equals(const FileFragment& other) const;
+
+ protected:
+  FileFragment(FileSource source, std::shared_ptr<FileFormat> format,
+               compute::Expression partition_expression,
+               std::shared_ptr<Schema> physical_schema)
+      : Fragment(std::move(partition_expression), std::move(physical_schema)),
+        source_(std::move(source)),
+        format_(std::move(format)) {}
+
+  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override;
+
+  FileSource source_;
+  std::shared_ptr<FileFormat> format_;
+
+  friend class FileFormat;
+};
+
+/// \brief A Dataset of FileFragments.
+///
+/// A FileSystemDataset is composed of one or more FileFragment. The fragments
+/// are independent and don't need to share the same format and/or filesystem.
+class ARROW_DS_EXPORT FileSystemDataset : public Dataset {
+ public:
+  /// \brief Create a FileSystemDataset.
+  ///
+  /// \param[in] schema the schema of the dataset
+  /// \param[in] root_partition the partition expression of the dataset
+  /// \param[in] format the format of each FileFragment.
+  /// \param[in] filesystem the filesystem of each FileFragment, or nullptr if the
+  ///            fragments wrap buffers.
+  /// \param[in] fragments list of fragments to create the dataset from.
+  /// \param[in] partitioning the Partitioning object in case the dataset is created
+  ///            with a known partitioning (e.g. from a discovered partitioning
+  ///            through a DatasetFactory), or nullptr if not known.
+  ///
+  /// Note that fragments wrapping files resident in differing filesystems are not
+  /// permitted; to work with multiple filesystems use a UnionDataset.
+  ///
+  /// \return A constructed dataset.
+  static Result<std::shared_ptr<FileSystemDataset>> Make(
+      std::shared_ptr<Schema> schema, compute::Expression root_partition,
+      std::shared_ptr<FileFormat> format, std::shared_ptr<fs::FileSystem> filesystem,
+      std::vector<std::shared_ptr<FileFragment>> fragments,
+      std::shared_ptr<Partitioning> partitioning = NULLPTR);
+
+  /// \brief Write a dataset.
+  static Status Write(const FileSystemDatasetWriteOptions& write_options,
+                      std::shared_ptr<Scanner> scanner);
+
+  /// \brief Return the type name of the dataset.
+  std::string type_name() const override { return "filesystem"; }
+
+  /// \brief Replace the schema of the dataset.
+  Result<std::shared_ptr<Dataset>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const override;
+
+  /// \brief Return the path of files.
+  std::vector<std::string> files() const;
+
+  /// \brief Return the format.
+  const std::shared_ptr<FileFormat>& format() const { return format_; }
+
+  /// \brief Return the filesystem. May be nullptr if the fragments wrap buffers.
+  const std::shared_ptr<fs::FileSystem>& filesystem() const { return filesystem_; }
+
+  /// \brief Return the partitioning. May be nullptr if the dataset was not constructed
+  /// with a partitioning.
+  const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
+
+  std::string ToString() const;
+
+ protected:
+  struct FragmentSubtrees;
+
+  explicit FileSystemDataset(std::shared_ptr<Schema> schema)
+      : Dataset(std::move(schema)) {}
+
+  FileSystemDataset(std::shared_ptr<Schema> schema,
+                    compute::Expression partition_expression)
+      : Dataset(std::move(schema), partition_expression) {}
+
+  Result<FragmentIterator> GetFragmentsImpl(compute::Expression predicate) override;
+
+  void SetupSubtreePruning();
+
+  std::shared_ptr<FileFormat> format_;
+  std::shared_ptr<fs::FileSystem> filesystem_;
+  std::vector<std::shared_ptr<FileFragment>> fragments_;
+  std::shared_ptr<Partitioning> partitioning_;
+
+  std::shared_ptr<FragmentSubtrees> subtrees_;
+};
+
+/// \brief Options for writing a file of this format.
+class ARROW_DS_EXPORT FileWriteOptions {
+ public:
+  virtual ~FileWriteOptions() = default;
+
+  const std::shared_ptr<FileFormat>& format() const { return format_; }
+
+  std::string type_name() const { return format_->type_name(); }
+
+ protected:
+  explicit FileWriteOptions(std::shared_ptr<FileFormat> format)
+      : format_(std::move(format)) {}
+
+  std::shared_ptr<FileFormat> format_;
+};
+
+/// \brief A writer for this format.
+class ARROW_DS_EXPORT FileWriter {
+ public:
+  virtual ~FileWriter() = default;
+
+  /// \brief Write the given batch.
+  virtual Status Write(const std::shared_ptr<RecordBatch>& batch) = 0;
+
+  /// \brief Write all batches from the reader.
+  Status Write(RecordBatchReader* batches);
+
+  /// \brief Indicate that writing is done.
+  virtual Future<> Finish();
+
+  const std::shared_ptr<FileFormat>& format() const { return options_->format(); }
+  const std::shared_ptr<Schema>& schema() const { return schema_; }
+  const std::shared_ptr<FileWriteOptions>& options() const { return options_; }
+  const fs::FileLocator& destination() const { return destination_locator_; }
+
+  /// \brief After Finish() is called, provides number of bytes written to file.
+  Result<int64_t> GetBytesWritten() const;
+
+ protected:
+  FileWriter(std::shared_ptr<Schema> schema, std::shared_ptr<FileWriteOptions> options,
+             std::shared_ptr<io::OutputStream> destination,
+             fs::FileLocator destination_locator)
+      : schema_(std::move(schema)),
+        options_(std::move(options)),
+        destination_(std::move(destination)),
+        destination_locator_(std::move(destination_locator)) {}
+
+  virtual Future<> FinishInternal() = 0;
+
+  std::shared_ptr<Schema> schema_;
+  std::shared_ptr<FileWriteOptions> options_;
+  std::shared_ptr<io::OutputStream> destination_;
+  fs::FileLocator destination_locator_;
+  std::optional<int64_t> bytes_written_;
+};
+
+/// \brief Options for writing a dataset.
+struct ARROW_DS_EXPORT FileSystemDatasetWriteOptions {
+  /// Options for individual fragment writing.
+  std::shared_ptr<FileWriteOptions> file_write_options;
+
+  /// FileSystem into which a dataset will be written.
+  std::shared_ptr<fs::FileSystem> filesystem;
+
+  /// Root directory into which the dataset will be written.
+  std::string base_dir;
+
+  /// Partitioning used to generate fragment paths.
+  std::shared_ptr<Partitioning> partitioning;
+
+  /// If true the order of rows in the dataset is preserved when writing with
+  /// multiple threads. This may cause notable performance degradation.
+  bool preserve_order = false;
+
+  /// Maximum number of partitions any batch may be written into, default is 1K.
+  int max_partitions = 1024;
+
+  /// Template string used to generate fragment basenames.
+  /// {i} will be replaced by an auto incremented integer.
+  std::string basename_template;
+
+  /// A functor which will be applied on an incremented counter.  The result will be
+  /// inserted into the basename_template in place of {i}.
+  ///
+  /// This can be used, for example, to left-pad the file counter.
+  std::function<std::string(int)> basename_template_functor;
+
+  /// If greater than 0 then this will limit the maximum number of files that can be left
+  /// open. If an attempt is made to open too many files then the least recently used file
+  /// will be closed.  If this setting is set too low you may end up fragmenting your data
+  /// into many small files.
+  ///
+  /// The default is 900 which also allows some # of files to be open by the scanner
+  /// before hitting the default Linux limit of 1024
+  uint32_t max_open_files = 900;
+
+  /// If greater than 0 then this will limit how many rows are placed in any single file.
+  /// Otherwise there will be no limit and one file will be created in each output
+  /// directory unless files need to be closed to respect max_open_files
+  uint64_t max_rows_per_file = 0;
+
+  /// If greater than 0 then this will cause the dataset writer to batch incoming data
+  /// and only write the row groups to the disk when sufficient rows have accumulated.
+  /// The final row group size may be less than this value and other options such as
+  /// `max_open_files` or `max_rows_per_file` lead to smaller row group sizes.
+  uint64_t min_rows_per_group = 0;
+
+  /// If greater than 0 then the dataset writer may split up large incoming batches into
+  /// multiple row groups.  If this value is set then min_rows_per_group should also be
+  /// set or else you may end up with very small row groups (e.g. if the incoming row
+  /// group size is just barely larger than this value).
+  uint64_t max_rows_per_group = 1 << 20;
+
+  /// Controls what happens if an output directory already exists.
+  ExistingDataBehavior existing_data_behavior = ExistingDataBehavior::kError;
+
+  /// \brief If false the dataset writer will not create directories
+  /// This is mainly intended for filesystems that do not require directories such as S3.
+  bool create_dir = true;
+
+  /// Callback to be invoked against all FileWriters before
+  /// they are finalized with FileWriter::Finish().
+  std::function<Status(FileWriter*)> writer_pre_finish = [](FileWriter*) {
+    return Status::OK();
+  };
+
+  /// Callback to be invoked against all FileWriters after they have
+  /// called FileWriter::Finish().
+  std::function<Status(FileWriter*)> writer_post_finish = [](FileWriter*) {
+    return Status::OK();
+  };
+
+  const std::shared_ptr<FileFormat>& format() const {
+    return file_write_options->format();
+  }
+};
+
+/// \brief Wraps FileSystemDatasetWriteOptions for consumption as compute::ExecNodeOptions
+class ARROW_DS_EXPORT WriteNodeOptions : public acero::ExecNodeOptions {
+ public:
+  explicit WriteNodeOptions(
+      FileSystemDatasetWriteOptions options,
+      std::shared_ptr<const KeyValueMetadata> custom_metadata = NULLPTR)
+      : write_options(std::move(options)), custom_metadata(std::move(custom_metadata)) {}
+
+  /// \brief Options to control how to write the dataset
+  FileSystemDatasetWriteOptions write_options;
+  /// \brief Optional schema to attach to all written batches
+  ///
+  /// By default, we will use the output schema of the input.
+  ///
+  /// This can be used to alter schema metadata, field nullability, or field metadata.
+  /// However, this cannot be used to change the type of data.  If the custom schema does
+  /// not have the same number of fields and the same data types as the input then the
+  /// plan will fail.
+  std::shared_ptr<Schema> custom_schema;
+  /// \brief Optional metadata to attach to written batches
+  std::shared_ptr<const KeyValueMetadata> custom_metadata;
+};
+
+/// @}
+
+namespace internal {
+ARROW_DS_EXPORT void InitializeDatasetWriter(arrow::acero::ExecFactoryRegistry* registry);
+}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/file_csv.h b/pyarrow/include/arrow/dataset/file_csv.h
new file mode 100644
index 0000000000000000000000000000000000000000..42e3fd7246988e625e0d2e69a29bd40c553e3219
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/file_csv.h
@@ -0,0 +1,144 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/csv/options.h"
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/status.h"
+#include "arrow/util/compression.h"
+
+namespace arrow {
+namespace dataset {
+
+constexpr char kCsvTypeName[] = "csv";
+
+/// \addtogroup dataset-file-formats
+///
+/// @{
+
+/// \brief A FileFormat implementation that reads from and writes to Csv files
+class ARROW_DS_EXPORT CsvFileFormat : public FileFormat {
+ public:
+  // TODO(ARROW-18328) Remove this, moved to CsvFragmentScanOptions
+  /// Options affecting the parsing of CSV files
+  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
+
+  CsvFileFormat();
+
+  std::string type_name() const override { return kCsvTypeName; }
+
+  bool Equals(const FileFormat& other) const override;
+
+  Result<bool> IsSupported(const FileSource& source) const override;
+
+  /// \brief Return the schema of the file if possible.
+  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
+
+  Future<std::shared_ptr<FragmentScanner>> BeginScan(
+      const FragmentScanRequest& request, const InspectedFragment& inspected_fragment,
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) const override;
+
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& scan_options,
+      const std::shared_ptr<FileFragment>& file) const override;
+
+  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+      const FileSource& source, const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) const override;
+
+  Future<std::optional<int64_t>> CountRows(
+      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options) override;
+
+  Result<std::shared_ptr<FileWriter>> MakeWriter(
+      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
+      std::shared_ptr<FileWriteOptions> options,
+      fs::FileLocator destination_locator) const override;
+
+  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
+};
+
+/// \brief Per-scan options for CSV fragments
+struct ARROW_DS_EXPORT CsvFragmentScanOptions : public FragmentScanOptions {
+  std::string type_name() const override { return kCsvTypeName; }
+
+  using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
+      std::shared_ptr<io::InputStream>)>;
+
+  /// CSV conversion options
+  csv::ConvertOptions convert_options = csv::ConvertOptions::Defaults();
+
+  /// CSV reading options
+  ///
+  /// Note that use_threads is always ignored.
+  csv::ReadOptions read_options = csv::ReadOptions::Defaults();
+
+  /// CSV parse options
+  csv::ParseOptions parse_options = csv::ParseOptions::Defaults();
+
+  /// Optional stream wrapping function
+  ///
+  /// If defined, all open dataset file fragments will be passed
+  /// through this function.  One possible use case is to transparently
+  /// transcode all input files from a given character set to utf8.
+  StreamWrapFunc stream_transform_func{};
+};
+
+class ARROW_DS_EXPORT CsvFileWriteOptions : public FileWriteOptions {
+ public:
+  /// Options passed to csv::MakeCSVWriter.
+  std::shared_ptr<csv::WriteOptions> write_options;
+
+ protected:
+  explicit CsvFileWriteOptions(std::shared_ptr<FileFormat> format)
+      : FileWriteOptions(std::move(format)) {}
+
+  friend class CsvFileFormat;
+};
+
+class ARROW_DS_EXPORT CsvFileWriter : public FileWriter {
+ public:
+  Status Write(const std::shared_ptr<RecordBatch>& batch) override;
+
+ private:
+  CsvFileWriter(std::shared_ptr<io::OutputStream> destination,
+                std::shared_ptr<ipc::RecordBatchWriter> writer,
+                std::shared_ptr<Schema> schema,
+                std::shared_ptr<CsvFileWriteOptions> options,
+                fs::FileLocator destination_locator);
+
+  Future<> FinishInternal() override;
+
+  std::shared_ptr<io::OutputStream> destination_;
+  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
+
+  friend class CsvFileFormat;
+};
+
+/// @}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/file_ipc.h b/pyarrow/include/arrow/dataset/file_ipc.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f7da82a0af5b1e58b724646853e8f482781778b
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/file_ipc.h
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/io/type_fwd.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace dataset {
+
+/// \addtogroup dataset-file-formats
+///
+/// @{
+
+constexpr char kIpcTypeName[] = "ipc";
+
+/// \brief A FileFormat implementation that reads from and writes to Ipc files
+class ARROW_DS_EXPORT IpcFileFormat : public FileFormat {
+ public:
+  std::string type_name() const override { return kIpcTypeName; }
+
+  IpcFileFormat();
+
+  bool Equals(const FileFormat& other) const override {
+    return type_name() == other.type_name();
+  }
+
+  Result<bool> IsSupported(const FileSource& source) const override;
+
+  /// \brief Return the schema of the file if possible.
+  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
+
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<FileFragment>& file) const override;
+
+  Future<std::optional<int64_t>> CountRows(
+      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options) override;
+
+  Result<std::shared_ptr<FileWriter>> MakeWriter(
+      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
+      std::shared_ptr<FileWriteOptions> options,
+      fs::FileLocator destination_locator) const override;
+
+  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
+};
+
+/// \brief Per-scan options for IPC fragments
+class ARROW_DS_EXPORT IpcFragmentScanOptions : public FragmentScanOptions {
+ public:
+  std::string type_name() const override { return kIpcTypeName; }
+
+  /// Options passed to the IPC file reader.
+  /// included_fields, memory_pool, and use_threads are ignored.
+  std::shared_ptr<ipc::IpcReadOptions> options;
+  /// If present, the async scanner will enable I/O coalescing.
+  /// This is ignored by the sync scanner.
+  std::shared_ptr<io::CacheOptions> cache_options;
+};
+
+class ARROW_DS_EXPORT IpcFileWriteOptions : public FileWriteOptions {
+ public:
+  /// Options passed to ipc::MakeFileWriter. use_threads is ignored
+  std::shared_ptr<ipc::IpcWriteOptions> options;
+
+  /// custom_metadata written to the file's footer
+  std::shared_ptr<const KeyValueMetadata> metadata;
+
+ protected:
+  explicit IpcFileWriteOptions(std::shared_ptr<FileFormat> format)
+      : FileWriteOptions(std::move(format)) {}
+
+  friend class IpcFileFormat;
+};
+
+class ARROW_DS_EXPORT IpcFileWriter : public FileWriter {
+ public:
+  Status Write(const std::shared_ptr<RecordBatch>& batch) override;
+
+ private:
+  IpcFileWriter(std::shared_ptr<io::OutputStream> destination,
+                std::shared_ptr<ipc::RecordBatchWriter> writer,
+                std::shared_ptr<Schema> schema,
+                std::shared_ptr<IpcFileWriteOptions> options,
+                fs::FileLocator destination_locator);
+
+  Future<> FinishInternal() override;
+
+  std::shared_ptr<io::OutputStream> destination_;
+  std::shared_ptr<ipc::RecordBatchWriter> batch_writer_;
+
+  friend class IpcFileFormat;
+};
+
+/// @}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/file_json.h b/pyarrow/include/arrow/dataset/file_json.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b8112d87095ccc9d02b0c52b4df2b1e674b8cc5
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/file_json.h
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/json/options.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/future.h"
+#include "arrow/util/macros.h"
+
+namespace arrow::dataset {
+
+/// \addtogroup dataset-file-formats
+///
+/// @{
+
+constexpr char kJsonTypeName[] = "json";
+
+/// \brief A FileFormat implementation that reads from JSON files
+class ARROW_DS_EXPORT JsonFileFormat : public FileFormat {
+ public:
+  JsonFileFormat();
+
+  std::string type_name() const override { return kJsonTypeName; }
+
+  bool Equals(const FileFormat& other) const override;
+
+  Result<bool> IsSupported(const FileSource& source) const override;
+
+  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
+
+  Future<std::shared_ptr<InspectedFragment>> InspectFragment(
+      const FileSource& source, const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) const override;
+
+  Future<std::shared_ptr<FragmentScanner>> BeginScan(
+      const FragmentScanRequest& scan_request, const InspectedFragment& inspected,
+      const FragmentScanOptions* format_options,
+      compute::ExecContext* exec_context) const override;
+
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& scan_options,
+      const std::shared_ptr<FileFragment>& file) const override;
+
+  Future<std::optional<int64_t>> CountRows(
+      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& scan_options) override;
+
+  Result<std::shared_ptr<FileWriter>> MakeWriter(
+      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
+      std::shared_ptr<FileWriteOptions> options,
+      fs::FileLocator destination_locator) const override {
+    return Status::NotImplemented("Writing JSON files is not currently supported");
+  }
+
+  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override { return NULLPTR; }
+};
+
+/// \brief Per-scan options for JSON fragments
+struct ARROW_DS_EXPORT JsonFragmentScanOptions : public FragmentScanOptions {
+  std::string type_name() const override { return kJsonTypeName; }
+
+  /// @brief Options that affect JSON parsing
+  ///
+  /// Note: `explicit_schema` and `unexpected_field_behavior` are ignored.
+  json::ParseOptions parse_options = json::ParseOptions::Defaults();
+
+  /// @brief Options that affect JSON reading
+  json::ReadOptions read_options = json::ReadOptions::Defaults();
+};
+
+/// @}
+
+}  // namespace arrow::dataset
diff --git a/pyarrow/include/arrow/dataset/file_orc.h b/pyarrow/include/arrow/dataset/file_orc.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bfefd1e02b5cccf74cf8ade579a937341aef013
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/file_orc.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/io/type_fwd.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace dataset {
+
+/// \addtogroup dataset-file-formats
+///
+/// @{
+
+constexpr char kOrcTypeName[] = "orc";
+
+/// \brief A FileFormat implementation that reads from and writes to ORC files
+class ARROW_DS_EXPORT OrcFileFormat : public FileFormat {
+ public:
+  OrcFileFormat();
+
+  std::string type_name() const override { return kOrcTypeName; }
+
+  bool Equals(const FileFormat& other) const override {
+    return type_name() == other.type_name();
+  }
+
+  Result<bool> IsSupported(const FileSource& source) const override;
+
+  /// \brief Return the schema of the file if possible.
+  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
+
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<FileFragment>& file) const override;
+
+  Future<std::optional<int64_t>> CountRows(
+      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options) override;
+
+  Result<std::shared_ptr<FileWriter>> MakeWriter(
+      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
+      std::shared_ptr<FileWriteOptions> options,
+      fs::FileLocator destination_locator) const override;
+
+  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
+};
+
+/// @}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/file_parquet.h b/pyarrow/include/arrow/dataset/file_parquet.h
new file mode 100644
index 0000000000000000000000000000000000000000..1811a96bf986f69f8c6e6ad040fe653a519ba95e
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/file_parquet.h
@@ -0,0 +1,410 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "arrow/dataset/discovery.h"
+#include "arrow/dataset/file_base.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/io/caching.h"
+
+namespace parquet {
+class ParquetFileReader;
+class Statistics;
+class ColumnChunkMetaData;
+class RowGroupMetaData;
+class FileMetaData;
+class FileDecryptionProperties;
+class FileEncryptionProperties;
+
+class ReaderProperties;
+class ArrowReaderProperties;
+
+class WriterProperties;
+class ArrowWriterProperties;
+
+namespace arrow {
+class FileReader;
+class FileWriter;
+struct SchemaManifest;
+}  // namespace arrow
+}  // namespace parquet
+
+namespace arrow {
+namespace dataset {
+
+struct ParquetDecryptionConfig;
+struct ParquetEncryptionConfig;
+
+/// \addtogroup dataset-file-formats
+///
+/// @{
+
+constexpr char kParquetTypeName[] = "parquet";
+
+/// \brief A FileFormat implementation that reads from Parquet files
+class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
+ public:
+  ParquetFileFormat();
+
+  /// Convenience constructor which copies properties from a parquet::ReaderProperties.
+  /// memory_pool will be ignored.
+  explicit ParquetFileFormat(const parquet::ReaderProperties& reader_properties);
+
+  std::string type_name() const override { return kParquetTypeName; }
+
+  bool Equals(const FileFormat& other) const override;
+
+  struct ReaderOptions {
+    /// \defgroup parquet-file-format-arrow-reader-properties properties which correspond
+    /// to members of parquet::ArrowReaderProperties.
+    ///
+    /// We don't embed parquet::ReaderProperties directly because column names (rather
+    /// than indices) are used to indicate dictionary columns, and other options are
+    /// deferred to scan time.
+    ///
+    /// @{
+    std::unordered_set<std::string> dict_columns;
+    arrow::TimeUnit::type coerce_int96_timestamp_unit = arrow::TimeUnit::NANO;
+    Type::type binary_type = Type::BINARY;
+    Type::type list_type = Type::LIST;
+    /// @}
+  } reader_options;
+
+  Result<bool> IsSupported(const FileSource& source) const override;
+
+  /// \brief Return the schema of the file if possible.
+  Result<std::shared_ptr<Schema>> Inspect(const FileSource& source) const override;
+
+  Result<RecordBatchGenerator> ScanBatchesAsync(
+      const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<FileFragment>& file) const override;
+
+  Future<std::optional<int64_t>> CountRows(
+      const std::shared_ptr<FileFragment>& file, compute::Expression predicate,
+      const std::shared_ptr<ScanOptions>& options) override;
+
+  using FileFormat::MakeFragment;
+
+  /// \brief Create a Fragment targeting all RowGroups.
+  Result<std::shared_ptr<FileFragment>> MakeFragment(
+      FileSource source, compute::Expression partition_expression,
+      std::shared_ptr<Schema> physical_schema) override;
+
+  /// \brief Create a Fragment, restricted to the specified row groups.
+  Result<std::shared_ptr<ParquetFileFragment>> MakeFragment(
+      FileSource source, compute::Expression partition_expression,
+      std::shared_ptr<Schema> physical_schema, std::vector<int> row_groups);
+
+  /// \brief Return a FileReader on the given source.
+  Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
+      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
+
+  Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
+      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<parquet::FileMetaData>& metadata) const;
+
+  Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
+      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
+
+  Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
+      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<parquet::FileMetaData>& metadata) const;
+
+  Result<std::shared_ptr<FileWriter>> MakeWriter(
+      std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
+      std::shared_ptr<FileWriteOptions> options,
+      fs::FileLocator destination_locator) const override;
+
+  std::shared_ptr<FileWriteOptions> DefaultWriteOptions() override;
+};
+
+/// \brief A FileFragment with parquet logic.
+///
+/// ParquetFileFragment provides a lazy (with respect to IO) interface to
+/// scan parquet files. Any heavy IO calls are deferred to the Scan() method.
+///
+/// The caller can provide an optional list of selected RowGroups to limit the
+/// number of scanned RowGroups, or to partition the scans across multiple
+/// threads.
+///
+/// Metadata can be explicitly provided, enabling pushdown predicate benefits without
+/// the potentially heavy IO of loading Metadata from the file system. This can induce
+/// significant performance boost when scanning high latency file systems.
+class ARROW_DS_EXPORT ParquetFileFragment : public FileFragment {
+ public:
+  Result<FragmentVector> SplitByRowGroup(compute::Expression predicate);
+
+  /// \brief Return the RowGroups selected by this fragment.
+  const std::vector<int>& row_groups() const {
+    if (row_groups_) return *row_groups_;
+    static std::vector<int> empty;
+    return empty;
+  }
+
+  /// \brief Return the FileMetaData associated with this fragment.
+  ///
+  /// This may return nullptr if the fragment wasn't scanned yet, or if
+  /// `ScanOptions::cache_metadata` was disabled.
+  std::shared_ptr<parquet::FileMetaData> metadata();
+
+  /// \brief Ensure this fragment's FileMetaData is in memory.
+  Status EnsureCompleteMetadata(parquet::arrow::FileReader* reader = NULLPTR);
+
+  Status ClearCachedMetadata() override;
+
+  /// \brief Return fragment which selects a filtered subset of this fragment's RowGroups.
+  Result<std::shared_ptr<Fragment>> Subset(compute::Expression predicate);
+  Result<std::shared_ptr<Fragment>> Subset(std::vector<int> row_group_ids);
+
+  static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
+      const Field& field, const parquet::Statistics& statistics);
+
+  static std::optional<compute::Expression> EvaluateStatisticsAsExpression(
+      const Field& field, const FieldRef& field_ref,
+      const parquet::Statistics& statistics);
+
+ private:
+  ParquetFileFragment(FileSource source, std::shared_ptr<FileFormat> format,
+                      compute::Expression partition_expression,
+                      std::shared_ptr<Schema> physical_schema,
+                      std::optional<std::vector<int>> row_groups);
+
+  Status SetMetadata(std::shared_ptr<parquet::FileMetaData> metadata,
+                     std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
+                     std::shared_ptr<parquet::FileMetaData> original_metadata = {});
+
+  // Overridden to opportunistically set metadata since a reader must be opened anyway.
+  Result<std::shared_ptr<Schema>> ReadPhysicalSchemaImpl() override {
+    ARROW_RETURN_NOT_OK(EnsureCompleteMetadata());
+    return physical_schema_;
+  }
+
+  /// Return a filtered subset of row group indices.
+  Result<std::vector<int>> FilterRowGroups(compute::Expression predicate);
+  /// Simplify the predicate against the statistics of each row group.
+  Result<std::vector<compute::Expression>> TestRowGroups(compute::Expression predicate);
+  /// Try to count rows matching the predicate using metadata. Expects
+  /// metadata to be present, and expects the predicate to have been
+  /// simplified against the partition expression already.
+  Result<std::optional<int64_t>> TryCountRows(compute::Expression predicate);
+
+  ParquetFileFormat& parquet_format_;
+
+  /// Indices of row groups selected by this fragment,
+  /// or std::nullopt if all row groups are selected.
+  std::optional<std::vector<int>> row_groups_;
+
+  // the expressions (combined for all columns for which statistics have been
+  // processed) are stored per column group
+  std::vector<compute::Expression> statistics_expressions_;
+  // statistics status are kept track of by Parquet Schema column indices
+  // (i.e. not Arrow schema field index)
+  std::vector<bool> statistics_expressions_complete_;
+  std::shared_ptr<parquet::FileMetaData> metadata_;
+  std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
+  // The FileMetaData that owns the SchemaDescriptor pointed by SchemaManifest.
+  std::shared_ptr<parquet::FileMetaData> original_metadata_;
+
+  friend class ParquetFileFormat;
+  friend class ParquetDatasetFactory;
+};
+
+/// \brief Per-scan options for Parquet fragments
+class ARROW_DS_EXPORT ParquetFragmentScanOptions : public FragmentScanOptions {
+ public:
+  ParquetFragmentScanOptions();
+  std::string type_name() const override { return kParquetTypeName; }
+
+  /// Reader properties. Not all properties are respected: memory_pool comes from
+  /// ScanOptions.
+  std::shared_ptr<parquet::ReaderProperties> reader_properties;
+  /// Arrow reader properties. Not all properties are respected: batch_size comes from
+  /// ScanOptions. Additionally, other options come from ParquetFileFormat::ReaderOptions.
+  std::shared_ptr<parquet::ArrowReaderProperties> arrow_reader_properties;
+  /// A configuration structure that provides decryption properties for a dataset
+  std::shared_ptr<ParquetDecryptionConfig> parquet_decryption_config = NULLPTR;
+};
+
+class ARROW_DS_EXPORT ParquetFileWriteOptions : public FileWriteOptions {
+ public:
+  /// \brief Parquet writer properties.
+  std::shared_ptr<parquet::WriterProperties> writer_properties;
+
+  /// \brief Parquet Arrow writer properties.
+  std::shared_ptr<parquet::ArrowWriterProperties> arrow_writer_properties;
+
+  // A configuration structure that provides encryption properties for a dataset
+  std::shared_ptr<ParquetEncryptionConfig> parquet_encryption_config = NULLPTR;
+
+ protected:
+  explicit ParquetFileWriteOptions(std::shared_ptr<FileFormat> format)
+      : FileWriteOptions(std::move(format)) {}
+
+  friend class ParquetFileFormat;
+};
+
+class ARROW_DS_EXPORT ParquetFileWriter : public FileWriter {
+ public:
+  const std::shared_ptr<parquet::arrow::FileWriter>& parquet_writer() const {
+    return parquet_writer_;
+  }
+
+  Status Write(const std::shared_ptr<RecordBatch>& batch) override;
+
+ private:
+  ParquetFileWriter(std::shared_ptr<io::OutputStream> destination,
+                    std::shared_ptr<parquet::arrow::FileWriter> writer,
+                    std::shared_ptr<ParquetFileWriteOptions> options,
+                    fs::FileLocator destination_locator);
+
+  Future<> FinishInternal() override;
+
+  std::shared_ptr<parquet::arrow::FileWriter> parquet_writer_;
+
+  friend class ParquetFileFormat;
+};
+
+/// \brief Options for making a FileSystemDataset from a Parquet _metadata file.
+struct ParquetFactoryOptions {
+  /// Either an explicit Partitioning or a PartitioningFactory to discover one.
+  ///
+  /// If a factory is provided, it will be used to infer a schema for partition fields
+  /// based on file and directory paths then construct a Partitioning. The default
+  /// is a Partitioning which will yield no partition information.
+  ///
+  /// The (explicit or discovered) partitioning will be applied to discovered files
+  /// and the resulting partition information embedded in the Dataset.
+  PartitioningOrFactory partitioning{Partitioning::Default()};
+
+  /// For the purposes of applying the partitioning, paths will be stripped
+  /// of the partition_base_dir. Files not matching the partition_base_dir
+  /// prefix will be skipped for partition discovery. The ignored files will still
+  /// be part of the Dataset, but will not have partition information.
+  ///
+  /// Example:
+  /// partition_base_dir = "/dataset";
+  ///
+  /// - "/dataset/US/sales.csv" -> "US/sales.csv" will be given to the partitioning
+  ///
+  /// - "/home/john/late_sales.csv" -> Will be ignored for partition discovery.
+  ///
+  /// This is useful for partitioning which parses directory when ordering
+  /// is important, e.g. DirectoryPartitioning.
+  std::string partition_base_dir;
+
+  /// Assert that all ColumnChunk paths are consistent. The parquet spec allows for
+  /// ColumnChunk data to be stored in multiple files, but ParquetDatasetFactory
+  /// supports only a single file with all ColumnChunk data. If this flag is set
+  /// construction of a ParquetDatasetFactory will raise an error if ColumnChunk
+  /// data is not resident in a single file.
+  bool validate_column_chunk_paths = false;
+};
+
+/// \brief Create FileSystemDataset from custom `_metadata` cache file.
+///
+/// Dask and other systems will generate a cache metadata file by concatenating
+/// the RowGroupMetaData of multiple parquet files into a single parquet file
+/// that only contains metadata and no ColumnChunk data.
+///
+/// ParquetDatasetFactory creates a FileSystemDataset composed of
+/// ParquetFileFragment where each fragment is pre-populated with the exact
+/// number of row groups and statistics for each columns.
+class ARROW_DS_EXPORT ParquetDatasetFactory : public DatasetFactory {
+ public:
+  /// \brief Create a ParquetDatasetFactory from a metadata path.
+  ///
+  /// The `metadata_path` will be read from `filesystem`. Each RowGroup
+  /// contained in the metadata file will be relative to `dirname(metadata_path)`.
+  ///
+  /// \param[in] metadata_path path of the metadata parquet file
+  /// \param[in] filesystem from which to open/read the path
+  /// \param[in] format to read the file with.
+  /// \param[in] options see ParquetFactoryOptions
+  static Result<std::shared_ptr<DatasetFactory>> Make(
+      const std::string& metadata_path, std::shared_ptr<fs::FileSystem> filesystem,
+      std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
+
+  /// \brief Create a ParquetDatasetFactory from a metadata source.
+  ///
+  /// Similar to the previous Make definition, but the metadata can be a Buffer
+  /// and the base_path is explicit instead of inferred from the metadata
+  /// path.
+  ///
+  /// \param[in] metadata source to open the metadata parquet file from
+  /// \param[in] base_path used as the prefix of every parquet files referenced
+  /// \param[in] filesystem from which to read the files referenced.
+  /// \param[in] format to read the file with.
+  /// \param[in] options see ParquetFactoryOptions
+  static Result<std::shared_ptr<DatasetFactory>> Make(
+      const FileSource& metadata, const std::string& base_path,
+      std::shared_ptr<fs::FileSystem> filesystem,
+      std::shared_ptr<ParquetFileFormat> format, ParquetFactoryOptions options);
+
+  Result<std::vector<std::shared_ptr<Schema>>> InspectSchemas(
+      InspectOptions options) override;
+
+  Result<std::shared_ptr<Dataset>> Finish(FinishOptions options) override;
+
+ protected:
+  ParquetDatasetFactory(
+      std::shared_ptr<fs::FileSystem> filesystem,
+      std::shared_ptr<ParquetFileFormat> format,
+      std::shared_ptr<parquet::FileMetaData> metadata,
+      std::shared_ptr<parquet::arrow::SchemaManifest> manifest,
+      std::shared_ptr<Schema> physical_schema, std::string base_path,
+      ParquetFactoryOptions options,
+      std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids)
+      : filesystem_(std::move(filesystem)),
+        format_(std::move(format)),
+        metadata_(std::move(metadata)),
+        manifest_(std::move(manifest)),
+        physical_schema_(std::move(physical_schema)),
+        base_path_(std::move(base_path)),
+        options_(std::move(options)),
+        paths_with_row_group_ids_(std::move(paths_with_row_group_ids)) {}
+
+  std::shared_ptr<fs::FileSystem> filesystem_;
+  std::shared_ptr<ParquetFileFormat> format_;
+  std::shared_ptr<parquet::FileMetaData> metadata_;
+  std::shared_ptr<parquet::arrow::SchemaManifest> manifest_;
+  std::shared_ptr<Schema> physical_schema_;
+  std::string base_path_;
+  ParquetFactoryOptions options_;
+  std::vector<std::pair<std::string, std::vector<int>>> paths_with_row_group_ids_;
+
+ private:
+  Result<std::vector<std::shared_ptr<FileFragment>>> CollectParquetFragments(
+      const Partitioning& partitioning);
+
+  Result<std::shared_ptr<Schema>> PartitionSchema();
+};
+
+/// @}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/parquet_encryption_config.h b/pyarrow/include/arrow/dataset/parquet_encryption_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..96200b8a3118b82c92977d222ba8775f61a02b0b
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/parquet_encryption_config.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/dataset/type_fwd.h"
+
+namespace parquet::encryption {
+class CryptoFactory;
+struct KmsConnectionConfig;
+struct EncryptionConfiguration;
+struct DecryptionConfiguration;
+}  // namespace parquet::encryption
+
+namespace arrow {
+namespace dataset {
+
+/// \brief Core configuration class encapsulating parameters for high-level encryption
+/// within Parquet framework.
+///
+/// ParquetEncryptionConfig serves as a bridge, passing encryption-related
+/// parameters to appropriate components within the Parquet library. It holds references
+/// to objects defining encryption strategy, Key Management Service (KMS) configuration,
+/// and specific encryption configurations for Parquet data.
+struct ARROW_DS_EXPORT ParquetEncryptionConfig {
+  ///  Shared pointer to CryptoFactory object, responsible for creating cryptographic
+  ///  components like encryptors and decryptors.
+  std::shared_ptr<parquet::encryption::CryptoFactory> crypto_factory;
+
+  ///  Shared pointer to KmsConnectionConfig object, holding configuration parameters for
+  ///  connecting to a Key Management Service (KMS).
+  std::shared_ptr<parquet::encryption::KmsConnectionConfig> kms_connection_config;
+
+  ///  Shared pointer to EncryptionConfiguration object, defining specific encryption
+  ///  settings for Parquet data, like keys for different columns.
+  std::shared_ptr<parquet::encryption::EncryptionConfiguration> encryption_config;
+};
+
+/// \brief Core configuration class encapsulating parameters for high-level decryption
+/// within Parquet framework.
+///
+/// ParquetDecryptionConfig is designed to pass decryption-related parameters to
+/// appropriate decryption components within Parquet library. It holds references to
+/// objects defining decryption strategy, Key Management Service (KMS) configuration,
+/// and specific decryption configurations for reading encrypted Parquet data.
+struct ARROW_DS_EXPORT ParquetDecryptionConfig {
+  ///  Shared pointer to CryptoFactory object, pivotal in creating cryptographic
+  ///  components for decryption process.
+  std::shared_ptr<parquet::encryption::CryptoFactory> crypto_factory;
+
+  ///  Shared pointer to KmsConnectionConfig object, containing parameters for connecting
+  ///  to a Key Management Service (KMS) during decryption.
+  std::shared_ptr<parquet::encryption::KmsConnectionConfig> kms_connection_config;
+
+  ///  Shared pointer to DecryptionConfiguration object, specifying decryption settings
+  ///  for reading encrypted Parquet data.
+  std::shared_ptr<parquet::encryption::DecryptionConfiguration> decryption_config;
+};
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/partition.h b/pyarrow/include/arrow/dataset/partition.h
new file mode 100644
index 0000000000000000000000000000000000000000..315a3d384d28c1b313bf1483fb38ad99c6713663
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/partition.h
@@ -0,0 +1,432 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <functional>
+#include <iosfwd>
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/expression.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/util/compare.h"
+
+namespace arrow {
+
+namespace dataset {
+
+constexpr char kFilenamePartitionSep = '_';
+
+struct ARROW_DS_EXPORT PartitionPathFormat {
+  std::string directory, filename;
+};
+
+// ----------------------------------------------------------------------
+// Partitioning
+
+/// \defgroup dataset-partitioning Partitioning API
+///
+/// @{
+
+/// \brief Interface for parsing partition expressions from string partition
+/// identifiers.
+///
+/// For example, the identifier "foo=5" might be parsed to an equality expression
+/// between the "foo" field and the value 5.
+///
+/// Some partitionings may store the field names in a metadata
+/// store instead of in file paths, for example
+/// dataset_root/2009/11/... could be used when the partition fields
+/// are "year" and "month"
+///
+/// Paths are consumed from left to right. Paths must be relative to
+/// the root of a partition; path prefixes must be removed before passing
+/// the path to a partitioning for parsing.
+class ARROW_DS_EXPORT Partitioning : public util::EqualityComparable<Partitioning> {
+ public:
+  virtual ~Partitioning() = default;
+
+  /// \brief The name identifying the kind of partitioning
+  virtual std::string type_name() const = 0;
+
+  //// \brief Return whether the partitionings are equal
+  virtual bool Equals(const Partitioning& other) const {
+    return schema_->Equals(other.schema_, /*check_metadata=*/false);
+  }
+
+  /// \brief If the input batch shares any fields with this partitioning,
+  /// produce sub-batches which satisfy mutually exclusive Expressions.
+  struct PartitionedBatches {
+    RecordBatchVector batches;
+    std::vector<compute::Expression> expressions;
+  };
+  virtual Result<PartitionedBatches> Partition(
+      const std::shared_ptr<RecordBatch>& batch) const = 0;
+
+  /// \brief Parse a path into a partition expression
+  virtual Result<compute::Expression> Parse(const std::string& path) const = 0;
+
+  virtual Result<PartitionPathFormat> Format(const compute::Expression& expr) const = 0;
+
+  /// \brief A default Partitioning which is a DirectoryPartitioning
+  /// with an empty schema.
+  static std::shared_ptr<Partitioning> Default();
+
+  /// \brief The partition schema.
+  const std::shared_ptr<Schema>& schema() const { return schema_; }
+
+ protected:
+  explicit Partitioning(std::shared_ptr<Schema> schema) : schema_(std::move(schema)) {}
+
+  std::shared_ptr<Schema> schema_;
+};
+
+/// \brief The encoding of partition segments.
+enum class SegmentEncoding : int8_t {
+  /// No encoding.
+  None = 0,
+  /// Segment values are URL-encoded.
+  Uri = 1,
+};
+
+ARROW_DS_EXPORT
+std::ostream& operator<<(std::ostream& os, SegmentEncoding segment_encoding);
+
+/// \brief Options for key-value based partitioning (hive/directory).
+struct ARROW_DS_EXPORT KeyValuePartitioningOptions {
+  /// After splitting a path into components, decode the path components
+  /// before parsing according to this scheme.
+  SegmentEncoding segment_encoding = SegmentEncoding::Uri;
+};
+
+/// \brief Options for inferring a partitioning.
+struct ARROW_DS_EXPORT PartitioningFactoryOptions {
+  /// When inferring a schema for partition fields, yield dictionary encoded types
+  /// instead of plain. This can be more efficient when materializing virtual
+  /// columns, and Expressions parsed by the finished Partitioning will include
+  /// dictionaries of all unique inspected values for each field.
+  bool infer_dictionary = false;
+  /// Optionally, an expected schema can be provided, in which case inference
+  /// will only check discovered fields against the schema and update internal
+  /// state (such as dictionaries).
+  std::shared_ptr<Schema> schema;
+  /// After splitting a path into components, decode the path components
+  /// before parsing according to this scheme.
+  SegmentEncoding segment_encoding = SegmentEncoding::Uri;
+
+  KeyValuePartitioningOptions AsPartitioningOptions() const;
+};
+
+/// \brief Options for inferring a hive-style partitioning.
+struct ARROW_DS_EXPORT HivePartitioningFactoryOptions : PartitioningFactoryOptions {
+  /// The hive partitioning scheme maps null to a hard coded fallback string.
+  std::string null_fallback;
+
+  HivePartitioningOptions AsHivePartitioningOptions() const;
+};
+
+/// \brief PartitioningFactory provides creation of a partitioning  when the
+/// specific schema must be inferred from available paths (no explicit schema is known).
+class ARROW_DS_EXPORT PartitioningFactory {
+ public:
+  virtual ~PartitioningFactory() = default;
+
+  /// \brief The name identifying the kind of partitioning
+  virtual std::string type_name() const = 0;
+
+  /// Get the schema for the resulting Partitioning.
+  /// This may reset internal state, for example dictionaries of unique representations.
+  virtual Result<std::shared_ptr<Schema>> Inspect(
+      const std::vector<std::string>& paths) = 0;
+
+  /// Create a partitioning using the provided schema
+  /// (fields may be dropped).
+  virtual Result<std::shared_ptr<Partitioning>> Finish(
+      const std::shared_ptr<Schema>& schema) const = 0;
+};
+
+/// \brief Subclass for the common case of a partitioning which yields an equality
+/// expression for each segment
+class ARROW_DS_EXPORT KeyValuePartitioning : public Partitioning {
+ public:
+  /// An unconverted equality expression consisting of a field name and the representation
+  /// of a scalar value
+  struct Key {
+    std::string name;
+    std::optional<std::string> value;
+  };
+
+  Result<PartitionedBatches> Partition(
+      const std::shared_ptr<RecordBatch>& batch) const override;
+
+  Result<compute::Expression> Parse(const std::string& path) const override;
+
+  Result<PartitionPathFormat> Format(const compute::Expression& expr) const override;
+
+  const ArrayVector& dictionaries() const { return dictionaries_; }
+
+  SegmentEncoding segment_encoding() const { return options_.segment_encoding; }
+
+  bool Equals(const Partitioning& other) const override;
+
+ protected:
+  KeyValuePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
+                       KeyValuePartitioningOptions options)
+      : Partitioning(std::move(schema)),
+        dictionaries_(std::move(dictionaries)),
+        options_(options) {
+    if (dictionaries_.empty()) {
+      dictionaries_.resize(schema_->num_fields());
+    }
+  }
+
+  virtual Result<std::vector<Key>> ParseKeys(const std::string& path) const = 0;
+
+  virtual Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const = 0;
+
+  /// Convert a Key to a full expression.
+  Result<compute::Expression> ConvertKey(const Key& key) const;
+
+  Result<std::vector<std::string>> FormatPartitionSegments(
+      const ScalarVector& values) const;
+  Result<std::vector<Key>> ParsePartitionSegments(
+      const std::vector<std::string>& segments) const;
+
+  ArrayVector dictionaries_;
+  KeyValuePartitioningOptions options_;
+};
+
+/// \brief DirectoryPartitioning parses one segment of a path for each field in its
+/// schema. All fields are required, so paths passed to DirectoryPartitioning::Parse
+/// must contain segments for each field.
+///
+/// For example given schema<year:int16, month:int8> the path "/2009/11" would be
+/// parsed to ("year"_ == 2009 and "month"_ == 11)
+class ARROW_DS_EXPORT DirectoryPartitioning : public KeyValuePartitioning {
+ public:
+  /// If a field in schema is of dictionary type, the corresponding element of
+  /// dictionaries must be contain the dictionary of values for that field.
+  explicit DirectoryPartitioning(std::shared_ptr<Schema> schema,
+                                 ArrayVector dictionaries = {},
+                                 KeyValuePartitioningOptions options = {});
+
+  std::string type_name() const override { return "directory"; }
+
+  bool Equals(const Partitioning& other) const override;
+
+  /// \brief Create a factory for a directory partitioning.
+  ///
+  /// \param[in] field_names The names for the partition fields. Types will be
+  ///     inferred.
+  static std::shared_ptr<PartitioningFactory> MakeFactory(
+      std::vector<std::string> field_names, PartitioningFactoryOptions = {});
+
+ private:
+  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+
+  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
+};
+
+/// \brief The default fallback used for null values in a Hive-style partitioning.
+static constexpr char kDefaultHiveNullFallback[] = "__HIVE_DEFAULT_PARTITION__";
+
+struct ARROW_DS_EXPORT HivePartitioningOptions : public KeyValuePartitioningOptions {
+  std::string null_fallback = kDefaultHiveNullFallback;
+
+  static HivePartitioningOptions DefaultsWithNullFallback(std::string fallback) {
+    HivePartitioningOptions options;
+    options.null_fallback = std::move(fallback);
+    return options;
+  }
+};
+
+/// \brief Multi-level, directory based partitioning
+/// originating from Apache Hive with all data files stored in the
+/// leaf directories. Data is partitioned by static values of a
+/// particular column in the schema. Partition keys are represented in
+/// the form $key=$value in directory names.
+/// Field order is ignored, as are missing or unrecognized field names.
+///
+/// For example given schema<year:int16, month:int8, day:int8> the path
+/// "/day=321/ignored=3.4/year=2009" parses to ("year"_ == 2009 and "day"_ == 321)
+class ARROW_DS_EXPORT HivePartitioning : public KeyValuePartitioning {
+ public:
+  /// If a field in schema is of dictionary type, the corresponding element of
+  /// dictionaries must be contain the dictionary of values for that field.
+  explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries = {},
+                            std::string null_fallback = kDefaultHiveNullFallback)
+      : KeyValuePartitioning(std::move(schema), std::move(dictionaries),
+                             KeyValuePartitioningOptions()),
+        hive_options_(
+            HivePartitioningOptions::DefaultsWithNullFallback(std::move(null_fallback))) {
+  }
+
+  explicit HivePartitioning(std::shared_ptr<Schema> schema, ArrayVector dictionaries,
+                            HivePartitioningOptions options)
+      : KeyValuePartitioning(std::move(schema), std::move(dictionaries), options),
+        hive_options_(options) {}
+
+  std::string type_name() const override { return "hive"; }
+  std::string null_fallback() const { return hive_options_.null_fallback; }
+  const HivePartitioningOptions& options() const { return hive_options_; }
+
+  static Result<std::optional<Key>> ParseKey(const std::string& segment,
+                                             const HivePartitioningOptions& options);
+
+  bool Equals(const Partitioning& other) const override;
+
+  /// \brief Create a factory for a hive partitioning.
+  static std::shared_ptr<PartitioningFactory> MakeFactory(
+      HivePartitioningFactoryOptions = {});
+
+ private:
+  const HivePartitioningOptions hive_options_;
+  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+
+  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
+};
+
+/// \brief Implementation provided by lambda or other callable
+class ARROW_DS_EXPORT FunctionPartitioning : public Partitioning {
+ public:
+  using ParseImpl = std::function<Result<compute::Expression>(const std::string&)>;
+
+  using FormatImpl =
+      std::function<Result<PartitionPathFormat>(const compute::Expression&)>;
+
+  FunctionPartitioning(std::shared_ptr<Schema> schema, ParseImpl parse_impl,
+                       FormatImpl format_impl = NULLPTR, std::string name = "function")
+      : Partitioning(std::move(schema)),
+        parse_impl_(std::move(parse_impl)),
+        format_impl_(std::move(format_impl)),
+        name_(std::move(name)) {}
+
+  std::string type_name() const override { return name_; }
+
+  bool Equals(const Partitioning& other) const override { return false; }
+
+  Result<compute::Expression> Parse(const std::string& path) const override {
+    return parse_impl_(path);
+  }
+
+  Result<PartitionPathFormat> Format(const compute::Expression& expr) const override {
+    if (format_impl_) {
+      return format_impl_(expr);
+    }
+    return Status::NotImplemented("formatting paths from ", type_name(), " Partitioning");
+  }
+
+  Result<PartitionedBatches> Partition(
+      const std::shared_ptr<RecordBatch>& batch) const override {
+    return Status::NotImplemented("partitioning batches from ", type_name(),
+                                  " Partitioning");
+  }
+
+ private:
+  ParseImpl parse_impl_;
+  FormatImpl format_impl_;
+  std::string name_;
+};
+
+class ARROW_DS_EXPORT FilenamePartitioning : public KeyValuePartitioning {
+ public:
+  /// \brief Construct a FilenamePartitioning from its components.
+  ///
+  /// If a field in schema is of dictionary type, the corresponding element of
+  /// dictionaries must be contain the dictionary of values for that field.
+  explicit FilenamePartitioning(std::shared_ptr<Schema> schema,
+                                ArrayVector dictionaries = {},
+                                KeyValuePartitioningOptions options = {});
+
+  std::string type_name() const override { return "filename"; }
+
+  /// \brief Create a factory for a filename partitioning.
+  ///
+  /// \param[in] field_names The names for the partition fields. Types will be
+  ///     inferred.
+  static std::shared_ptr<PartitioningFactory> MakeFactory(
+      std::vector<std::string> field_names, PartitioningFactoryOptions = {});
+
+  bool Equals(const Partitioning& other) const override;
+
+ private:
+  Result<std::vector<Key>> ParseKeys(const std::string& path) const override;
+
+  Result<PartitionPathFormat> FormatValues(const ScalarVector& values) const override;
+};
+
+ARROW_DS_EXPORT std::string StripPrefix(const std::string& path,
+                                        const std::string& prefix);
+
+/// \brief Extracts the directory and filename and removes the prefix of a path
+///
+/// e.g., `StripPrefixAndFilename("/data/year=2019/c.txt", "/data") ->
+/// {"year=2019","c.txt"}`
+ARROW_DS_EXPORT std::string StripPrefixAndFilename(const std::string& path,
+                                                   const std::string& prefix);
+
+/// \brief Vector version of StripPrefixAndFilename.
+ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
+    const std::vector<std::string>& paths, const std::string& prefix);
+
+/// \brief Vector version of StripPrefixAndFilename.
+ARROW_DS_EXPORT std::vector<std::string> StripPrefixAndFilename(
+    const std::vector<fs::FileInfo>& files, const std::string& prefix);
+
+/// \brief Either a Partitioning or a PartitioningFactory
+class ARROW_DS_EXPORT PartitioningOrFactory {
+ public:
+  explicit PartitioningOrFactory(std::shared_ptr<Partitioning> partitioning)
+      : partitioning_(std::move(partitioning)) {}
+
+  explicit PartitioningOrFactory(std::shared_ptr<PartitioningFactory> factory)
+      : factory_(std::move(factory)) {}
+
+  PartitioningOrFactory& operator=(std::shared_ptr<Partitioning> partitioning) {
+    return *this = PartitioningOrFactory(std::move(partitioning));
+  }
+
+  PartitioningOrFactory& operator=(std::shared_ptr<PartitioningFactory> factory) {
+    return *this = PartitioningOrFactory(std::move(factory));
+  }
+
+  /// \brief The partitioning (if given).
+  const std::shared_ptr<Partitioning>& partitioning() const { return partitioning_; }
+
+  /// \brief The partition factory (if given).
+  const std::shared_ptr<PartitioningFactory>& factory() const { return factory_; }
+
+  /// \brief Get the partition schema, inferring it with the given factory if needed.
+  Result<std::shared_ptr<Schema>> GetOrInferSchema(const std::vector<std::string>& paths);
+
+ private:
+  std::shared_ptr<PartitioningFactory> factory_;
+  std::shared_ptr<Partitioning> partitioning_;
+};
+
+/// @}
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/plan.h b/pyarrow/include/arrow/dataset/plan.h
new file mode 100644
index 0000000000000000000000000000000000000000..10260ccec81d159ffd40d86144e39c4d91739db1
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/plan.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#include "arrow/dataset/visibility.h"
+
+namespace arrow {
+namespace dataset {
+namespace internal {
+
+/// Register dataset-based exec nodes with the exec node registry
+///
+/// This function must be called before using dataset ExecNode factories
+ARROW_DS_EXPORT void Initialize();
+
+}  // namespace internal
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/projector.h b/pyarrow/include/arrow/dataset/projector.h
new file mode 100644
index 0000000000000000000000000000000000000000..86d38f0af23522a08dcebc1c290fe6bc25ae014e
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/projector.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include "arrow/dataset/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace dataset {
+
+// FIXME this is superceded by compute::Expression::Bind
+ARROW_DS_EXPORT Status CheckProjectable(const Schema& from, const Schema& to);
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/scanner.h b/pyarrow/include/arrow/dataset/scanner.h
new file mode 100644
index 0000000000000000000000000000000000000000..7885b132cc9b529a0fbce41c807529ecd1e34da4
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/scanner.h
@@ -0,0 +1,623 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/acero/options.h"
+#include "arrow/compute/expression.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/dataset/dataset.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/dataset/visibility.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator_fwd.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/type_fwd.h"
+
+namespace arrow {
+
+using RecordBatchGenerator = std::function<Future<std::shared_ptr<RecordBatch>>()>;
+
+namespace dataset {
+
+/// \defgroup dataset-scanning Scanning API
+///
+/// @{
+
+constexpr int64_t kDefaultBatchSize = 1 << 17;  // 128Ki rows
+// This will yield 64 batches ~ 8Mi rows
+constexpr int32_t kDefaultBatchReadahead = 16;
+constexpr int32_t kDefaultFragmentReadahead = 4;
+constexpr int32_t kDefaultBytesReadahead = 1 << 25;  // 32MiB
+
+/// Scan-specific options, which can be changed between scans of the same dataset.
+struct ARROW_DS_EXPORT ScanOptions {
+  /// A row filter (which will be pushed down to partitioning/reading if supported).
+  compute::Expression filter = compute::literal(true);
+  /// A projection expression (which can add/remove/rename columns).
+  compute::Expression projection;
+
+  /// Schema with which batches will be read from fragments. This is also known as the
+  /// "reader schema" it will be used (for example) in constructing CSV file readers to
+  /// identify column types for parsing. Usually only a subset of its fields (see
+  /// MaterializedFields) will be materialized during a scan.
+  std::shared_ptr<Schema> dataset_schema;
+
+  /// Schema of projected record batches. This is independent of dataset_schema as its
+  /// fields are derived from the projection. For example, let
+  ///
+  ///   dataset_schema = {"a": int32, "b": int32, "id": utf8}
+  ///   projection = project({equal(field_ref("a"), field_ref("b"))}, {"a_plus_b"})
+  ///
+  /// (no filter specified). In this case, the projected_schema would be
+  ///
+  ///   {"a_plus_b": int32}
+  std::shared_ptr<Schema> projected_schema;
+
+  /// Maximum row count for scanned batches.
+  int64_t batch_size = kDefaultBatchSize;
+
+  /// How many batches to read ahead within a fragment.
+  ///
+  /// Set to 0 to disable batch readahead
+  ///
+  /// Note: May not be supported by all formats
+  /// Note: Will be ignored if use_threads is set to false
+  int32_t batch_readahead = kDefaultBatchReadahead;
+
+  /// How many files to read ahead
+  ///
+  /// Set to 0 to disable fragment readahead
+  ///
+  /// Note: May not be enforced by all scanners
+  /// Note: Will be ignored if use_threads is set to false
+  int32_t fragment_readahead = kDefaultFragmentReadahead;
+
+  /// A pool from which materialized and scanned arrays will be allocated.
+  MemoryPool* pool = arrow::default_memory_pool();
+
+  /// IOContext for any IO tasks
+  ///
+  /// Note: The IOContext executor will be ignored if use_threads is set to false
+  io::IOContext io_context;
+
+  /// Executor for any CPU tasks
+  ///
+  /// If null, the global CPU executor will be used
+  ///
+  /// Note: The Executor will be ignored if use_threads is set to false
+  arrow::internal::Executor* cpu_executor = NULLPTR;
+
+  /// If true the scanner will scan in parallel
+  ///
+  /// Note: If true, this will use threads from both the cpu_executor and the
+  /// io_context.executor
+  /// Note: This  must be true in order for any readahead to happen
+  bool use_threads = false;
+
+  /// If true the scanner will add augmented fields to the output schema.
+  bool add_augmented_fields = true;
+
+  /// Whether to cache metadata when scanning.
+  ///
+  /// Fragments may typically cache metadata to speed up repeated accesses.
+  /// However, in use cases where a single scan is done, or if memory use
+  /// is more critical than CPU time, setting this option to false can
+  /// lessen memory use.
+  bool cache_metadata = true;
+
+  /// Fragment-specific scan options.
+  std::shared_ptr<FragmentScanOptions> fragment_scan_options;
+
+  /// Return a vector of FieldRefs that require materialization.
+  ///
+  /// This is usually the union of the fields referenced in the projection and the
+  /// filter expression. Examples:
+  ///
+  /// - `SELECT a, b WHERE a < 2 && c > 1` => ["a", "b", "a", "c"]
+  /// - `SELECT a + b < 3 WHERE a > 1` => ["a", "b", "a"]
+  ///
+  /// This is needed for expression where a field may not be directly
+  /// used in the final projection but is still required to evaluate the
+  /// expression.
+  ///
+  /// This is used by Fragment implementations to apply the column
+  /// sub-selection optimization.
+  std::vector<FieldRef> MaterializedFields() const;
+
+  /// Parameters which control when the plan should pause for a slow consumer
+  acero::BackpressureOptions backpressure =
+      acero::BackpressureOptions::DefaultBackpressure();
+};
+
+/// Scan-specific options, which can be changed between scans of the same dataset.
+///
+/// A dataset consists of one or more individual fragments.  A fragment is anything
+/// that is independently scannable, often a file.
+///
+/// Batches from all fragments will be converted to a single schema. This unified
+/// schema is referred to as the "dataset schema" and is the output schema for
+/// this node.
+///
+/// Individual fragments may have schemas that are different from the dataset
+/// schema.  This is sometimes referred to as the physical or fragment schema.
+/// Conversion from the fragment schema to the dataset schema is a process
+/// known as evolution.
+struct ARROW_DS_EXPORT ScanV2Options : public acero::ExecNodeOptions {
+  explicit ScanV2Options(std::shared_ptr<Dataset> dataset)
+      : dataset(std::move(dataset)) {}
+
+  /// \brief The dataset to scan
+  std::shared_ptr<Dataset> dataset;
+  /// \brief A row filter
+  ///
+  /// The filter expression should be written against the dataset schema.
+  /// The filter must be unbound.
+  ///
+  /// This is an opportunistic pushdown filter.  Filtering capabilities will
+  /// vary between formats.  If a format is not capable of applying the filter
+  /// then it will ignore it.
+  ///
+  /// Each fragment will do its best to filter the data based on the information
+  /// (partitioning guarantees, statistics) available to it.  If it is able to
+  /// apply some filtering then it will indicate what filtering it was able to
+  /// apply by attaching a guarantee to the batch.
+  ///
+  /// For example, if a filter is x < 50 && y > 40 then a batch may be able to
+  /// apply a guarantee x < 50.  Post-scan filtering would then only need to
+  /// consider y > 40 (for this specific batch).  The next batch may not be able
+  /// to attach any guarantee and both clauses would need to be applied to that batch.
+  ///
+  /// A single guarantee-aware filtering operation should generally be applied to all
+  /// resulting batches.  The scan node is not responsible for this.
+  ///
+  /// Fields that are referenced by the filter should be included in the `columns` vector.
+  /// The scan node will not automatically fetch fields referenced by the filter
+  /// expression. \see AddFieldsNeededForFilter
+  ///
+  /// If the filter references fields that are not included in `columns` this may or may
+  /// not be an error, depending on the format.
+  compute::Expression filter = compute::literal(true);
+
+  /// \brief The columns to scan
+  ///
+  /// This is not a simple list of top-level column indices but instead a set of paths
+  /// allowing for partial selection of columns
+  ///
+  /// These paths refer to the dataset schema
+  ///
+  /// For example, consider the following dataset schema:
+  ///   schema({
+  ///     field("score", int32()),
+  ///           "marker", struct_({
+  ///              field("color", utf8()),
+  ///              field("location", struct_({
+  ///                  field("x", float64()),
+  ///                  field("y", float64())
+  ///              })
+  ///          })
+  ///   })
+  ///
+  /// If `columns` is {{0}, {1,1,0}} then the output schema is:
+  ///   schema({field("score", int32()), field("x", float64())})
+  ///
+  /// If `columns` is {{1,1,1}, {1,1}} then the output schema is:
+  ///   schema({
+  ///       field("y", float64()),
+  ///       field("location", struct_({
+  ///           field("x", float64()),
+  ///           field("y", float64())
+  ///       })
+  ///   })
+  std::vector<FieldPath> columns;
+
+  /// \brief Target number of bytes to read ahead in a fragment
+  ///
+  /// This limit involves some amount of estimation.  Formats typically only know
+  /// batch boundaries in terms of rows (not decoded bytes) and so an estimation
+  /// must be done to guess the average row size.  Other formats like CSV and JSON
+  /// must make even more generalized guesses.
+  ///
+  /// This is a best-effort guide.  Some formats may need to read ahead further,
+  /// for example, if scanning a parquet file that has batches with 100MiB of data
+  /// then the actual readahead will be at least 100MiB
+  ///
+  /// Set to 0 to disable readahead.  When disabled, the scanner will read the
+  /// dataset one batch at a time
+  ///
+  /// This limit applies across all fragments.  If the limit is 32MiB and the
+  /// fragment readahead allows for 20 fragments to be read at once then the
+  /// total readahead will still be 32MiB and NOT 20 * 32MiB.
+  int32_t target_bytes_readahead = kDefaultBytesReadahead;
+
+  /// \brief Number of fragments to read ahead
+  ///
+  /// Higher readahead will potentially lead to more efficient I/O but will lead
+  /// to the scan operation using more RAM.  The default is fairly conservative
+  /// and designed for fast local disks (or slow local spinning disks which cannot
+  /// handle much parallelism anyways).  When using a highly parallel remote filesystem
+  /// you will likely want to increase these values.
+  ///
+  /// Set to 0 to disable fragment readahead.  When disabled the dataset will be scanned
+  /// one fragment at a time.
+  int32_t fragment_readahead = kDefaultFragmentReadahead;
+  /// \brief Options specific to the file format
+  const FragmentScanOptions* format_options = NULLPTR;
+
+  /// \brief Utility method to get a selection representing all columns in a dataset
+  static std::vector<FieldPath> AllColumns(const Schema& dataset_schema);
+
+  /// \brief Utility method to add fields needed for the current filter
+  ///
+  /// This method adds any fields that are needed by `filter` which are not already
+  /// included in the list of columns.  Any new fields added will be added to the end
+  /// in no particular order.
+  static Status AddFieldsNeededForFilter(ScanV2Options* options);
+};
+
+/// \brief Describes a projection
+struct ARROW_DS_EXPORT ProjectionDescr {
+  /// \brief The projection expression itself
+  /// This expression must be a call to make_struct
+  compute::Expression expression;
+  /// \brief The output schema of the projection.
+
+  /// This can be calculated from the input schema and the expression but it
+  /// is cached here for convenience.
+  std::shared_ptr<Schema> schema;
+
+  /// \brief Create a ProjectionDescr by binding an expression to the dataset schema
+  ///
+  /// expression must return a struct type
+  static Result<ProjectionDescr> FromStructExpression(
+      const compute::Expression& expression, const Schema& dataset_schema);
+
+  /// \brief Create a ProjectionDescr from expressions/names for each field
+  static Result<ProjectionDescr> FromExpressions(std::vector<compute::Expression> exprs,
+                                                 std::vector<std::string> names,
+                                                 const Schema& dataset_schema);
+
+  /// \brief Create a default projection referencing fields in the dataset schema
+  static Result<ProjectionDescr> FromNames(std::vector<std::string> names,
+                                           const Schema& dataset_schema,
+                                           bool add_augmented_fields = true);
+
+  /// \brief Make a projection that projects every field in the dataset schema
+  static Result<ProjectionDescr> Default(const Schema& dataset_schema,
+                                         bool add_augmented_fields = true);
+};
+
+/// \brief Utility method to set the projection expression and schema
+ARROW_DS_EXPORT void SetProjection(ScanOptions* options, ProjectionDescr projection);
+
+/// \brief Combines a record batch with the fragment that the record batch originated
+/// from
+///
+/// Knowing the source fragment can be useful for debugging & understanding loaded
+/// data
+struct TaggedRecordBatch {
+  std::shared_ptr<RecordBatch> record_batch;
+  std::shared_ptr<Fragment> fragment;
+
+  friend inline bool operator==(const TaggedRecordBatch& left,
+                                const TaggedRecordBatch& right) {
+    return left.record_batch == right.record_batch && left.fragment == right.fragment;
+  }
+};
+
+using TaggedRecordBatchGenerator = std::function<Future<TaggedRecordBatch>()>;
+using TaggedRecordBatchIterator = Iterator<TaggedRecordBatch>;
+
+/// \brief Combines a tagged batch with positional information
+///
+/// This is returned when scanning batches in an unordered fashion.  This information is
+/// needed if you ever want to reassemble the batches in order
+struct EnumeratedRecordBatch {
+  Enumerated<std::shared_ptr<RecordBatch>> record_batch;
+  Enumerated<std::shared_ptr<Fragment>> fragment;
+
+  friend inline bool operator==(const EnumeratedRecordBatch& left,
+                                const EnumeratedRecordBatch& right) {
+    return left.record_batch == right.record_batch && left.fragment == right.fragment;
+  }
+};
+
+using EnumeratedRecordBatchGenerator = std::function<Future<EnumeratedRecordBatch>()>;
+using EnumeratedRecordBatchIterator = Iterator<EnumeratedRecordBatch>;
+
+/// @}
+
+}  // namespace dataset
+
+template <>
+struct IterationTraits<dataset::TaggedRecordBatch> {
+  static dataset::TaggedRecordBatch End() {
+    return dataset::TaggedRecordBatch{NULLPTR, NULLPTR};
+  }
+  static bool IsEnd(const dataset::TaggedRecordBatch& val) {
+    return val.record_batch == NULLPTR;
+  }
+};
+
+template <>
+struct IterationTraits<dataset::EnumeratedRecordBatch> {
+  static dataset::EnumeratedRecordBatch End() {
+    return dataset::EnumeratedRecordBatch{
+        IterationEnd<Enumerated<std::shared_ptr<RecordBatch>>>(),
+        IterationEnd<Enumerated<std::shared_ptr<dataset::Fragment>>>()};
+  }
+  static bool IsEnd(const dataset::EnumeratedRecordBatch& val) {
+    return IsIterationEnd(val.fragment);
+  }
+};
+
+namespace dataset {
+
+/// \defgroup dataset-scanning Scanning API
+///
+/// @{
+
+/// \brief A scanner glues together several dataset classes to load in data.
+/// The dataset contains a collection of fragments and partitioning rules.
+///
+/// The fragments identify independently loadable units of data (i.e. each fragment has
+/// a potentially unique schema and possibly even format.  It should be possible to read
+/// fragments in parallel if desired).
+///
+/// The fragment's format contains the logic necessary to actually create a task to load
+/// the fragment into memory.  That task may or may not support parallel execution of
+/// its own.
+///
+/// The scanner is then responsible for creating scan tasks from every fragment in the
+/// dataset and (potentially) sequencing the loaded record batches together.
+///
+/// The scanner should not buffer the entire dataset in memory (unless asked) instead
+/// yielding record batches as soon as they are ready to scan.  Various readahead
+/// properties control how much data is allowed to be scanned before pausing to let a
+/// slow consumer catchup.
+///
+/// Today the scanner also handles projection & filtering although that may change in
+/// the future.
+class ARROW_DS_EXPORT Scanner {
+ public:
+  virtual ~Scanner() = default;
+
+  /// \brief Apply a visitor to each RecordBatch as it is scanned. If multiple threads
+  /// are used (via use_threads), the visitor will be invoked from those threads and is
+  /// responsible for any synchronization.
+  virtual Status Scan(std::function<Status(TaggedRecordBatch)> visitor) = 0;
+  /// \brief Convert a Scanner into a Table.
+  ///
+  /// Use this convenience utility with care. This will serially materialize the
+  /// Scan result in memory before creating the Table.
+  virtual Result<std::shared_ptr<Table>> ToTable() = 0;
+  /// \brief Scan the dataset into a stream of record batches.  Each batch is tagged
+  /// with the fragment it originated from.  The batches will arrive in order.  The
+  /// order of fragments is determined by the dataset.
+  ///
+  /// Note: The scanner will perform some readahead but will avoid materializing too
+  /// much in memory (this is goverended by the readahead options and use_threads option).
+  /// If the readahead queue fills up then I/O will pause until the calling thread catches
+  /// up.
+  virtual Result<TaggedRecordBatchIterator> ScanBatches() = 0;
+  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync() = 0;
+  virtual Result<TaggedRecordBatchGenerator> ScanBatchesAsync(
+      ::arrow::internal::Executor* cpu_thread_pool) = 0;
+  /// \brief Scan the dataset into a stream of record batches.  Unlike ScanBatches this
+  /// method may allow record batches to be returned out of order.  This allows for more
+  /// efficient scanning: some fragments may be accessed more quickly than others (e.g.
+  /// may be cached in RAM or just happen to get scheduled earlier by the I/O)
+  ///
+  /// To make up for the out-of-order iteration each batch is further tagged with
+  /// positional information.
+  virtual Result<EnumeratedRecordBatchIterator> ScanBatchesUnordered() = 0;
+  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync() = 0;
+  virtual Result<EnumeratedRecordBatchGenerator> ScanBatchesUnorderedAsync(
+      ::arrow::internal::Executor* cpu_thread_pool) = 0;
+  /// \brief A convenience to synchronously load the given rows by index.
+  ///
+  /// Will only consume as many batches as needed from ScanBatches().
+  virtual Result<std::shared_ptr<Table>> TakeRows(const Array& indices) = 0;
+  /// \brief Get the first N rows.
+  virtual Result<std::shared_ptr<Table>> Head(int64_t num_rows) = 0;
+  /// \brief Count rows matching a predicate.
+  ///
+  /// This method will push down the predicate and compute the result based on fragment
+  /// metadata if possible.
+  virtual Result<int64_t> CountRows() = 0;
+  virtual Future<int64_t> CountRowsAsync() = 0;
+  /// \brief Convert the Scanner to a RecordBatchReader so it can be
+  /// easily used with APIs that expect a reader.
+  virtual Result<std::shared_ptr<RecordBatchReader>> ToRecordBatchReader() = 0;
+
+  /// \brief Get the options for this scan.
+  const std::shared_ptr<ScanOptions>& options() const { return scan_options_; }
+  /// \brief Get the dataset that this scanner will scan
+  virtual const std::shared_ptr<Dataset>& dataset() const = 0;
+
+ protected:
+  explicit Scanner(std::shared_ptr<ScanOptions> scan_options)
+      : scan_options_(std::move(scan_options)) {}
+
+  Result<EnumeratedRecordBatchIterator> AddPositioningToInOrderScan(
+      TaggedRecordBatchIterator scan);
+
+  const std::shared_ptr<ScanOptions> scan_options_;
+};
+
+/// \brief ScannerBuilder is a factory class to construct a Scanner. It is used
+/// to pass information, notably a potential filter expression and a subset of
+/// columns to materialize.
+class ARROW_DS_EXPORT ScannerBuilder {
+ public:
+  explicit ScannerBuilder(std::shared_ptr<Dataset> dataset);
+
+  ScannerBuilder(std::shared_ptr<Dataset> dataset,
+                 std::shared_ptr<ScanOptions> scan_options);
+
+  ScannerBuilder(std::shared_ptr<Schema> schema, std::shared_ptr<Fragment> fragment,
+                 std::shared_ptr<ScanOptions> scan_options);
+
+  /// \brief Make a scanner from a record batch reader.
+  ///
+  /// The resulting scanner can be scanned only once. This is intended
+  /// to support writing data from streaming sources or other sources
+  /// that can be iterated only once.
+  static std::shared_ptr<ScannerBuilder> FromRecordBatchReader(
+      std::shared_ptr<RecordBatchReader> reader);
+
+  /// \brief Set the subset of columns to materialize.
+  ///
+  /// Columns which are not referenced may not be read from fragments.
+  ///
+  /// \param[in] columns list of columns to project. Order and duplicates will
+  ///            be preserved.
+  ///
+  /// \return Failure if any column name does not exists in the dataset's
+  ///         Schema.
+  Status Project(std::vector<std::string> columns);
+
+  /// \brief Set expressions which will be evaluated to produce the materialized
+  /// columns.
+  ///
+  /// Columns which are not referenced may not be read from fragments.
+  ///
+  /// \param[in] exprs expressions to evaluate to produce columns.
+  /// \param[in] names list of names for the resulting columns.
+  ///
+  /// \return Failure if any referenced column does not exists in the dataset's
+  ///         Schema.
+  Status Project(std::vector<compute::Expression> exprs, std::vector<std::string> names);
+
+  /// \brief Set the filter expression to return only rows matching the filter.
+  ///
+  /// The predicate will be passed down to Sources and corresponding
+  /// Fragments to exploit predicate pushdown if possible using
+  /// partition information or Fragment internal metadata, e.g. Parquet statistics.
+  /// Columns which are not referenced may not be read from fragments.
+  ///
+  /// \param[in] filter expression to filter rows with.
+  ///
+  /// \return Failure if any referenced columns does not exist in the dataset's
+  ///         Schema.
+  Status Filter(const compute::Expression& filter);
+
+  /// \brief Indicate if the Scanner should make use of the available
+  ///        ThreadPool found in ScanOptions;
+  Status UseThreads(bool use_threads = true);
+
+  /// \brief Indicate if metadata should be cached when scanning
+  ///
+  /// Fragments may typically cache metadata to speed up repeated accesses.
+  /// However, in use cases where a single scan is done, or if memory use
+  /// is more critical than CPU time, setting this option to false can
+  /// lessen memory use.
+  Status CacheMetadata(bool cache_metadata = true);
+
+  /// \brief Set the maximum number of rows per RecordBatch.
+  ///
+  /// \param[in] batch_size the maximum number of rows.
+  /// \returns An error if the number for batch is not greater than 0.
+  ///
+  /// This option provides a control limiting the memory owned by any RecordBatch.
+  Status BatchSize(int64_t batch_size);
+
+  /// \brief Set the number of batches to read ahead within a fragment.
+  ///
+  /// \param[in] batch_readahead How many batches to read ahead within a fragment
+  /// \returns an error if this number is less than 0.
+  ///
+  /// This option provides a control on the RAM vs I/O tradeoff.
+  /// It might not be supported by all file formats, in which case it will
+  /// simply be ignored.
+  Status BatchReadahead(int32_t batch_readahead);
+
+  /// \brief Set the number of fragments to read ahead
+  ///
+  /// \param[in] fragment_readahead How many fragments to read ahead
+  /// \returns an error if this number is less than 0.
+  ///
+  /// This option provides a control on the RAM vs I/O tradeoff.
+  Status FragmentReadahead(int32_t fragment_readahead);
+
+  /// \brief Set the pool from which materialized and scanned arrays will be allocated.
+  Status Pool(MemoryPool* pool);
+
+  /// \brief Set fragment-specific scan options.
+  Status FragmentScanOptions(std::shared_ptr<FragmentScanOptions> fragment_scan_options);
+
+  /// \brief Override default backpressure configuration
+  Status Backpressure(acero::BackpressureOptions backpressure);
+
+  /// \brief Return the current scan options for the builder.
+  Result<std::shared_ptr<ScanOptions>> GetScanOptions();
+
+  /// \brief Return the constructed now-immutable Scanner object
+  Result<std::shared_ptr<Scanner>> Finish();
+
+  const std::shared_ptr<Schema>& schema() const;
+  const std::shared_ptr<Schema>& projected_schema() const;
+
+ private:
+  std::shared_ptr<Dataset> dataset_;
+  std::shared_ptr<ScanOptions> scan_options_ = std::make_shared<ScanOptions>();
+};
+
+/// \brief Construct a source ExecNode which yields batches from a dataset scan.
+///
+/// Does not construct associated filter or project nodes.
+///
+/// Batches are yielded sequentially, like single-threaded,
+/// when require_sequenced_output=true.
+///
+/// Yielded batches will be augmented with fragment/batch indices when
+/// implicit_ordering=true to enable stable ordering for simple ExecPlans.
+class ARROW_DS_EXPORT ScanNodeOptions : public acero::ExecNodeOptions {
+ public:
+  explicit ScanNodeOptions(std::shared_ptr<Dataset> dataset,
+                           std::shared_ptr<ScanOptions> scan_options,
+                           bool require_sequenced_output = false,
+                           bool implicit_ordering = false)
+      : dataset(std::move(dataset)),
+        scan_options(std::move(scan_options)),
+        require_sequenced_output(require_sequenced_output),
+        implicit_ordering(implicit_ordering) {}
+
+  std::shared_ptr<Dataset> dataset;
+  std::shared_ptr<ScanOptions> scan_options;
+  bool require_sequenced_output;
+  bool implicit_ordering;
+};
+
+/// @}
+
+namespace internal {
+ARROW_DS_EXPORT void InitializeScanner(arrow::acero::ExecFactoryRegistry* registry);
+ARROW_DS_EXPORT void InitializeScannerV2(arrow::acero::ExecFactoryRegistry* registry);
+}  // namespace internal
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/type_fwd.h b/pyarrow/include/arrow/dataset/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..d58781e038de9ffc2686ebfda9f640eeacdd6668
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/type_fwd.h
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"  // IWYU pragma: export
+#include "arrow/dataset/visibility.h"
+#include "arrow/filesystem/type_fwd.h"  // IWYU pragma: export
+#include "arrow/type_fwd.h"             // IWYU pragma: export
+
+namespace arrow {
+namespace dataset {
+
+class Dataset;
+class DatasetFactory;
+using DatasetVector = std::vector<std::shared_ptr<Dataset>>;
+
+class UnionDataset;
+class UnionDatasetFactory;
+
+class Fragment;
+using FragmentIterator = Iterator<std::shared_ptr<Fragment>>;
+using FragmentVector = std::vector<std::shared_ptr<Fragment>>;
+
+class FragmentScanOptions;
+
+class FileSource;
+class FileFormat;
+class FileFragment;
+class FileWriter;
+class FileWriteOptions;
+class FileSystemDataset;
+class FileSystemDatasetFactory;
+struct FileSystemDatasetWriteOptions;
+class WriteNodeOptions;
+
+/// \brief Controls what happens if files exist in an output directory during a dataset
+/// write
+enum class ExistingDataBehavior : int8_t {
+  /// Deletes all files in a directory the first time that directory is encountered
+  kDeleteMatchingPartitions,
+  /// Ignores existing files, overwriting any that happen to have the same name as an
+  /// output file
+  kOverwriteOrIgnore,
+  /// Returns an error if there are any files or subdirectories in the output directory
+  kError,
+};
+
+class InMemoryDataset;
+
+class CsvFileFormat;
+class CsvFileWriter;
+class CsvFileWriteOptions;
+struct CsvFragmentScanOptions;
+
+class JsonFileFormat;
+class JsonFileWriter;
+class JsonFileWriteOptions;
+struct JsonFragmentScanOptions;
+
+class IpcFileFormat;
+class IpcFileWriter;
+class IpcFileWriteOptions;
+class IpcFragmentScanOptions;
+
+class ParquetFileFormat;
+class ParquetFileFragment;
+class ParquetFragmentScanOptions;
+class ParquetFileWriter;
+class ParquetFileWriteOptions;
+
+class Partitioning;
+class PartitioningFactory;
+class PartitioningOrFactory;
+struct KeyValuePartitioningOptions;
+class DirectoryPartitioning;
+class HivePartitioning;
+struct HivePartitioningOptions;
+class FilenamePartitioning;
+struct FilenamePartitioningOptions;
+
+class ScanNodeOptions;
+struct ScanOptions;
+
+class Scanner;
+
+class ScannerBuilder;
+
+class ScanTask;
+using ScanTaskVector = std::vector<std::shared_ptr<ScanTask>>;
+using ScanTaskIterator = Iterator<std::shared_ptr<ScanTask>>;
+
+}  // namespace dataset
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/dataset/visibility.h b/pyarrow/include/arrow/dataset/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..752907238ca071238e21a303a947afbc1f11217f
--- /dev/null
+++ b/pyarrow/include/arrow/dataset/visibility.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_DS_STATIC
+#    define ARROW_DS_EXPORT
+#  elif defined(ARROW_DS_EXPORTING)
+#    define ARROW_DS_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_DS_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_DS_NO_EXPORT
+#else  // Not Windows
+#  ifndef ARROW_DS_EXPORT
+#    define ARROW_DS_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_DS_NO_EXPORT
+#    define ARROW_DS_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif  // Non-Windows
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/arrow/datum.h b/pyarrow/include/arrow/datum.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a88e7a81125cbed89d78d0e67288075ed9295f8
--- /dev/null
+++ b/pyarrow/include/arrow/datum.h
@@ -0,0 +1,314 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "arrow/array/data.h"
+#include "arrow/device_allocation_type_set.h"
+#include "arrow/scalar.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class RecordBatch;
+class Table;
+
+/// \class Datum
+/// \brief Variant type for various Arrow C++ data structures
+struct ARROW_EXPORT Datum {
+  /// \brief The kind of datum stored
+  enum Kind { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE };
+
+  /// \brief A placeholder type to represent empty datum
+  struct Empty {};
+
+  /// \brief Datums variants may have a length. This special value indicate that the
+  /// current variant does not have a length.
+  static constexpr int64_t kUnknownLength = -1;
+
+  /// \brief Storage of the actual datum.
+  ///
+  /// Note: For arrays, ArrayData is stored instead of Array for easier processing
+  std::variant<Empty, std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
+               std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
+               std::shared_ptr<Table>>
+      value;
+
+  /// \brief Empty datum, to be populated elsewhere
+  Datum() = default;
+
+  Datum(const Datum& other) = default;
+  Datum& operator=(const Datum& other) = default;
+  Datum(Datum&& other) = default;
+  Datum& operator=(Datum&& other) = default;
+
+  /// \brief Construct from a Scalar
+  Datum(std::shared_ptr<Scalar> value)  // NOLINT implicit conversion
+      : value(std::move(value)) {}
+
+  /// \brief Construct from an ArrayData
+  Datum(std::shared_ptr<ArrayData> value)  // NOLINT implicit conversion
+      : value(std::move(value)) {}
+
+  /// \brief Construct from an ArrayData
+  Datum(ArrayData arg)  // NOLINT implicit conversion
+      : value(std::make_shared<ArrayData>(std::move(arg))) {}
+
+  /// \brief Construct from an Array
+  Datum(const Array& value);  // NOLINT implicit conversion
+
+  /// \brief Construct from an Array
+  Datum(const std::shared_ptr<Array>& value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a ChunkedArray
+  Datum(std::shared_ptr<ChunkedArray> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a RecordBatch
+  Datum(std::shared_ptr<RecordBatch> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a Table
+  Datum(std::shared_ptr<Table> value);  // NOLINT implicit conversion
+
+  /// \brief Construct from a ChunkedArray.
+  ///
+  /// This can be expensive, prefer the shared_ptr<ChunkedArray> constructor
+  explicit Datum(const ChunkedArray& value);
+
+  /// \brief Construct from a RecordBatch.
+  ///
+  /// This can be expensive, prefer the shared_ptr<RecordBatch> constructor
+  explicit Datum(const RecordBatch& value);
+
+  /// \brief Construct from a Table.
+  ///
+  /// This can be expensive, prefer the shared_ptr<Table> constructor
+  explicit Datum(const Table& value);
+
+  /// \brief Cast from concrete subtypes of Array or Scalar to Datum
+  template <typename T, bool IsArray = std::is_base_of_v<Array, T>,
+            bool IsScalar = std::is_base_of_v<Scalar, T>,
+            typename = enable_if_t<IsArray || IsScalar>>
+  Datum(std::shared_ptr<T> value)  // NOLINT implicit conversion
+      : Datum(std::shared_ptr<typename std::conditional<IsArray, Array, Scalar>::type>(
+            std::move(value))) {}
+
+  /// \brief Cast from concrete subtypes of Array or Scalar to Datum
+  template <typename T, typename TV = typename std::remove_reference_t<T>,
+            bool IsArray = std::is_base_of_v<Array, T>,
+            bool IsScalar = std::is_base_of_v<Scalar, T>,
+            typename = enable_if_t<IsArray || IsScalar>>
+  Datum(T&& value)  // NOLINT implicit conversion
+      : Datum(std::make_shared<TV>(std::forward<T>(value))) {}
+
+  /// \brief Copy from concrete subtypes of Scalar.
+  ///
+  /// The concrete scalar type must be copyable (not all of them are).
+  template <typename T, typename = enable_if_t<std::is_base_of_v<Scalar, T>>>
+  Datum(const T& value)  // NOLINT implicit conversion
+      : Datum(std::make_shared<T>(value)) {}
+
+  // Convenience constructors
+  /// \brief Convenience constructor storing a bool scalar.
+  explicit Datum(bool value);
+  /// \brief Convenience constructor storing an int8 scalar.
+  explicit Datum(int8_t value);
+  /// \brief Convenience constructor storing a uint8 scalar.
+  explicit Datum(uint8_t value);
+  /// \brief Convenience constructor storing an int16 scalar.
+  explicit Datum(int16_t value);
+  /// \brief Convenience constructor storing a uint16 scalar.
+  explicit Datum(uint16_t value);
+  /// \brief Convenience constructor storing an int32 scalar.
+  explicit Datum(int32_t value);
+  /// \brief Convenience constructor storing a uint32 scalar.
+  explicit Datum(uint32_t value);
+  /// \brief Convenience constructor storing an int64 scalar.
+  explicit Datum(int64_t value);
+  /// \brief Convenience constructor storing a uint64 scalar.
+  explicit Datum(uint64_t value);
+  /// \brief Convenience constructor storing a float scalar.
+  explicit Datum(float value);
+  /// \brief Convenience constructor storing a double scalar.
+  explicit Datum(double value);
+  /// \brief Convenience constructor storing a string scalar.
+  explicit Datum(std::string value);
+  /// \brief Convenience constructor storing a string scalar.
+  explicit Datum(const char* value);
+
+  /// \brief Convenience constructor for a DurationScalar from std::chrono::duration
+  template <template <typename, typename> class StdDuration, typename Rep,
+            typename Period,
+            typename = decltype(DurationScalar{StdDuration<Rep, Period>{}})>
+  explicit Datum(StdDuration<Rep, Period> d) : Datum{DurationScalar(d)} {}
+
+  /// \brief The kind of data stored in Datum
+  Datum::Kind kind() const {
+    switch (this->value.index()) {
+      case 0:
+        return Datum::NONE;
+      case 1:
+        return Datum::SCALAR;
+      case 2:
+        return Datum::ARRAY;
+      case 3:
+        return Datum::CHUNKED_ARRAY;
+      case 4:
+        return Datum::RECORD_BATCH;
+      case 5:
+        return Datum::TABLE;
+      default:
+        return Datum::NONE;
+    }
+  }
+
+  /// \brief Retrieve the stored array as ArrayData
+  ///
+  /// Use make_array() if an Array is desired (which is more expensive).
+  /// \throws std::bad_variant_access if the datum is not an array
+  const std::shared_ptr<ArrayData>& array() const {
+    return std::get<std::shared_ptr<ArrayData>>(this->value);
+  }
+
+  /// \brief The sum of bytes in each buffer referenced by the datum
+  /// Note: Scalars report a size of 0
+  /// \see arrow::util::TotalBufferSize for caveats
+  int64_t TotalBufferSize() const;
+
+  /// \brief Get the stored ArrayData in mutable form
+  ///
+  /// For internal use primarily. Keep in mind a shared_ptr<Datum> may have multiple
+  /// owners.
+  ArrayData* mutable_array() const { return this->array().get(); }
+
+  /// \brief Retrieve the stored array as Array
+  /// \throws std::bad_variant_access if the datum is not an array
+  std::shared_ptr<Array> make_array() const;
+
+  /// \brief Retrieve the chunked array stored
+  /// \throws std::bad_variant_access if the datum is not a chunked array
+  const std::shared_ptr<ChunkedArray>& chunked_array() const {
+    return std::get<std::shared_ptr<ChunkedArray>>(this->value);
+  }
+
+  /// \brief Retrieve the record batch stored
+  /// \throws std::bad_variant_access if the datum is not a record batch
+  const std::shared_ptr<RecordBatch>& record_batch() const {
+    return std::get<std::shared_ptr<RecordBatch>>(this->value);
+  }
+
+  /// \brief Retrieve the table stored
+  /// \throws std::bad_variant_access if the datum is not a table
+  const std::shared_ptr<Table>& table() const {
+    return std::get<std::shared_ptr<Table>>(this->value);
+  }
+
+  /// \brief Retrieve the scalar stored
+  /// \throws std::bad_variant_access if the datum is not a scalar
+  const std::shared_ptr<Scalar>& scalar() const {
+    return std::get<std::shared_ptr<Scalar>>(this->value);
+  }
+
+  /// \brief Retrieve the datum as its concrete array type
+  /// \throws std::bad_variant_access if the datum is not an array
+  /// \tparam ExactType the expected array type, may cause undefined behavior if it is not
+  /// the type of the stored array
+  template <typename ExactType>
+  std::shared_ptr<ExactType> array_as() const {
+    return internal::checked_pointer_cast<ExactType>(this->make_array());
+  }
+
+  /// \brief Retrieve the datum as its concrete scalar type
+  /// \throws std::bad_variant_access if the datum is not a scalar
+  /// \tparam ExactType the expected scalar type, may cause undefined behavior if it is
+  /// not the type of the stored scalar
+  template <typename ExactType>
+  const ExactType& scalar_as() const {
+    return internal::checked_cast<const ExactType&>(*this->scalar());
+  }
+
+  /// \brief True if Datum contains an array
+  bool is_array() const { return this->kind() == Datum::ARRAY; }
+
+  /// \brief True if Datum contains a chunked array
+  bool is_chunked_array() const { return this->kind() == Datum::CHUNKED_ARRAY; }
+
+  /// \brief True if Datum contains an array or a chunked array
+  bool is_arraylike() const {
+    return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY;
+  }
+
+  /// \brief True if Datum contains a scalar
+  bool is_scalar() const { return this->kind() == Datum::SCALAR; }
+
+  /// \brief True if Datum contains a scalar or array-like data
+  bool is_value() const { return this->is_arraylike() || this->is_scalar(); }
+
+  /// \brief Return the null count.
+  ///
+  /// Only valid for scalar and array-like data.
+  int64_t null_count() const;
+
+  /// \brief The value type of the variant, if any
+  ///
+  /// \return nullptr if no type
+  const std::shared_ptr<DataType>& type() const;
+
+  /// \brief The schema of the variant, if any
+  ///
+  /// \return nullptr if no schema
+  const std::shared_ptr<Schema>& schema() const;
+
+  /// \brief The value length of the variant, if any
+  ///
+  /// \return kUnknownLength if no type
+  int64_t length() const;
+
+  /// \brief The array chunks of the variant, if any
+  ///
+  /// \return empty if not arraylike
+  ArrayVector chunks() const;
+
+  DeviceAllocationTypeSet device_types() const;
+
+  /// \brief True if the two data are equal
+  bool Equals(const Datum& other) const;
+
+  bool operator==(const Datum& other) const { return Equals(other); }
+  bool operator!=(const Datum& other) const { return !Equals(other); }
+
+  std::string ToString() const;
+};
+
+ARROW_EXPORT void PrintTo(const Datum&, std::ostream*);
+
+ARROW_EXPORT std::string ToString(Datum::Kind kind);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/device.h b/pyarrow/include/arrow/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..1dbe5b4b13e898bb6402f833b982b33e134f7d7b
--- /dev/null
+++ b/pyarrow/include/arrow/device.h
@@ -0,0 +1,382 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryManager;
+
+/// \brief EXPERIMENTAL: Abstract interface for hardware devices
+///
+/// This object represents a device with access to some memory spaces.
+/// When handling a Buffer or raw memory address, it allows deciding in which
+/// context the raw memory address should be interpreted
+/// (e.g. CPU-accessible memory, or embedded memory on some particular GPU).
+class ARROW_EXPORT Device : public std::enable_shared_from_this<Device>,
+                            public util::EqualityComparable<Device> {
+ public:
+  virtual ~Device();
+
+  /// \brief A shorthand for this device's type.
+  ///
+  /// The returned value is different for each device class, but is the
+  /// same for all instances of a given class.  It can be used as a replacement
+  /// for RTTI.
+  virtual const char* type_name() const = 0;
+
+  /// \brief A human-readable description of the device.
+  ///
+  /// The returned value should be detailed enough to distinguish between
+  /// different instances, where necessary.
+  virtual std::string ToString() const = 0;
+
+  /// \brief Whether this instance points to the same device as another one.
+  virtual bool Equals(const Device&) const = 0;
+
+  /// \brief A device ID to identify this device if there are multiple of this type.
+  ///
+  /// If there is no "device_id" equivalent (such as for the main CPU device on
+  /// non-numa systems) returns -1.
+  virtual int64_t device_id() const { return -1; }
+
+  /// \brief Whether this device is the main CPU device.
+  ///
+  /// This shorthand method is very useful when deciding whether a memory address
+  /// is CPU-accessible.
+  bool is_cpu() const { return is_cpu_; }
+
+  /// \brief Return a MemoryManager instance tied to this device
+  ///
+  /// The returned instance uses default parameters for this device type's
+  /// MemoryManager implementation.  Some devices also allow constructing
+  /// MemoryManager instances with non-default parameters.
+  virtual std::shared_ptr<MemoryManager> default_memory_manager() = 0;
+
+  /// \brief Return the DeviceAllocationType of this device
+  virtual DeviceAllocationType device_type() const = 0;
+
+  class SyncEvent;
+
+  /// \brief EXPERIMENTAL: An opaque wrapper for Device-specific streams
+  ///
+  /// In essence this is just a wrapper around a void* to represent the
+  /// standard concept of a stream/queue on a device. Derived classes
+  /// should be trivially constructible from it's device-specific counterparts.
+  class ARROW_EXPORT Stream {
+   public:
+    using release_fn_t = std::function<void(void*)>;
+
+    virtual ~Stream() = default;
+
+    virtual const void* get_raw() const { return stream_.get(); }
+
+    /// \brief Make the stream wait on the provided event.
+    ///
+    /// Tells the stream that it should wait until the synchronization
+    /// event is completed without blocking the CPU.
+    virtual Status WaitEvent(const SyncEvent&) = 0;
+
+    /// \brief Blocks the current thread until a stream's remaining tasks are completed
+    virtual Status Synchronize() const = 0;
+
+   protected:
+    explicit Stream(void* stream, release_fn_t release_stream)
+        : stream_{stream, release_stream} {}
+
+    std::unique_ptr<void, release_fn_t> stream_;
+  };
+
+  virtual Result<std::shared_ptr<Stream>> MakeStream() { return NULLPTR; }
+
+  /// \brief Create a new device stream
+  ///
+  /// This should create the appropriate stream type for the device,
+  /// derived from Device::Stream to allow for stream ordered events
+  /// and memory allocations.
+  virtual Result<std::shared_ptr<Stream>> MakeStream(
+      unsigned int ARROW_ARG_UNUSED(flags)) {
+    return NULLPTR;
+  }
+
+  /// @brief Wrap an existing device stream alongside a release function
+  ///
+  /// @param device_stream a pointer to the stream to wrap
+  /// @param release_fn a function to call during destruction, `nullptr` or
+  ///        a no-op function can be passed to indicate ownership is maintained
+  ///        externally
+  virtual Result<std::shared_ptr<Stream>> WrapStream(
+      void* ARROW_ARG_UNUSED(device_stream),
+      Stream::release_fn_t ARROW_ARG_UNUSED(release_fn)) {
+    return NULLPTR;
+  }
+
+  /// \brief EXPERIMENTAL: An object that provides event/stream sync primitives
+  class ARROW_EXPORT SyncEvent {
+   public:
+    using release_fn_t = std::function<void(void*)>;
+
+    virtual ~SyncEvent() = default;
+
+    void* get_raw() { return sync_event_.get(); }
+
+    /// @brief Block until sync event is completed.
+    virtual Status Wait() = 0;
+
+    /// @brief Record the wrapped event on the stream so it triggers
+    /// the event when the stream gets to that point in its queue.
+    virtual Status Record(const Stream&) = 0;
+
+   protected:
+    /// If creating this with a passed in event, the caller must ensure
+    /// that the event lives until clear_event is called on this as it
+    /// won't own it.
+    explicit SyncEvent(void* sync_event, release_fn_t release_sync_event)
+        : sync_event_{sync_event, release_sync_event} {}
+
+    std::unique_ptr<void, release_fn_t> sync_event_;
+  };
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Device);
+  explicit Device(bool is_cpu = false) : is_cpu_(is_cpu) {}
+
+  bool is_cpu_;
+};
+
+/// \brief EXPERIMENTAL: An object that provides memory management primitives
+///
+/// A MemoryManager is always tied to a particular Device instance.
+/// It can also have additional parameters (such as a MemoryPool to
+/// allocate CPU memory).
+class ARROW_EXPORT MemoryManager : public std::enable_shared_from_this<MemoryManager> {
+ public:
+  virtual ~MemoryManager();
+
+  /// \brief The device this MemoryManager is tied to
+  const std::shared_ptr<Device>& device() const { return device_; }
+
+  /// \brief Whether this MemoryManager is tied to the main CPU device.
+  ///
+  /// This shorthand method is very useful when deciding whether a memory address
+  /// is CPU-accessible.
+  bool is_cpu() const { return device_->is_cpu(); }
+
+  /// \brief Create a RandomAccessFile to read a particular buffer.
+  ///
+  /// The given buffer must be tied to this MemoryManager.
+  ///
+  /// See also the Buffer::GetReader shorthand.
+  virtual Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
+      std::shared_ptr<Buffer> buf) = 0;
+
+  /// \brief Create a OutputStream to write to a particular buffer.
+  ///
+  /// The given buffer must be mutable and tied to this MemoryManager.
+  /// The returned stream object writes into the buffer's underlying memory
+  /// (but it won't resize it).
+  ///
+  /// See also the Buffer::GetWriter shorthand.
+  virtual Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
+      std::shared_ptr<Buffer> buf) = 0;
+
+  /// \brief Allocate a (mutable) Buffer
+  ///
+  /// The buffer will be allocated in the device's memory.
+  virtual Result<std::unique_ptr<Buffer>> AllocateBuffer(int64_t size) = 0;
+
+  /// \brief Copy a Buffer to a destination MemoryManager
+  ///
+  /// See also the Buffer::Copy shorthand.
+  static Result<std::shared_ptr<Buffer>> CopyBuffer(
+      const std::shared_ptr<Buffer>& source, const std::shared_ptr<MemoryManager>& to);
+
+  /// \brief Copy a non-owned Buffer to a destination MemoryManager
+  ///
+  /// This is useful for cases where the source memory area is externally managed
+  /// (its lifetime not tied to the source Buffer), otherwise please use CopyBuffer().
+  static Result<std::unique_ptr<Buffer>> CopyNonOwned(
+      const Buffer& source, const std::shared_ptr<MemoryManager>& to);
+
+  /// \brief Make a no-copy Buffer view in a destination MemoryManager
+  ///
+  /// See also the Buffer::View shorthand.
+  static Result<std::shared_ptr<Buffer>> ViewBuffer(
+      const std::shared_ptr<Buffer>& source, const std::shared_ptr<MemoryManager>& to);
+
+  /// \brief Copy a slice of a buffer into a CPU pointer
+  static Status CopyBufferSliceToCPU(const std::shared_ptr<Buffer>& buf, int64_t offset,
+                                     int64_t length, uint8_t* out_data);
+
+  /// \brief Create a new SyncEvent.
+  ///
+  /// This version should construct the appropriate event for the device and
+  /// provide the unique_ptr with the correct deleter for the event type.
+  /// If the device does not require or work with any synchronization, it is
+  /// allowed for it to return a nullptr.
+  virtual Result<std::shared_ptr<Device::SyncEvent>> MakeDeviceSyncEvent();
+
+  /// \brief Wrap an event into a SyncEvent.
+  ///
+  /// @param sync_event passed in sync_event (should be a pointer to the appropriate type)
+  /// @param release_sync_event destructor to free sync_event. `nullptr` may be
+  ///        passed to indicate that no destruction/freeing is necessary
+  virtual Result<std::shared_ptr<Device::SyncEvent>> WrapDeviceSyncEvent(
+      void* sync_event, Device::SyncEvent::release_fn_t release_sync_event);
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(MemoryManager);
+
+  explicit MemoryManager(const std::shared_ptr<Device>& device) : device_(device) {}
+
+  // Default implementations always return nullptr, should be overridden
+  // by subclasses that support data transfer.
+  // (returning nullptr means unsupported copy / view)
+  // In CopyBufferFrom and ViewBufferFrom, the `from` parameter is guaranteed to
+  // be equal to `buf->memory_manager()`.
+  virtual Result<std::shared_ptr<Buffer>> CopyBufferFrom(
+      const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from);
+  virtual Result<std::shared_ptr<Buffer>> CopyBufferTo(
+      const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to);
+  virtual Result<std::unique_ptr<Buffer>> CopyNonOwnedFrom(
+      const Buffer& buf, const std::shared_ptr<MemoryManager>& from);
+  virtual Result<std::unique_ptr<Buffer>> CopyNonOwnedTo(
+      const Buffer& buf, const std::shared_ptr<MemoryManager>& to);
+  virtual Result<std::shared_ptr<Buffer>> ViewBufferFrom(
+      const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& from);
+  virtual Result<std::shared_ptr<Buffer>> ViewBufferTo(
+      const std::shared_ptr<Buffer>& buf, const std::shared_ptr<MemoryManager>& to);
+
+  std::shared_ptr<Device> device_;
+};
+
+// ----------------------------------------------------------------------
+// CPU backend implementation
+
+class ARROW_EXPORT CPUDevice : public Device {
+ public:
+  const char* type_name() const override;
+  std::string ToString() const override;
+  bool Equals(const Device&) const override;
+  DeviceAllocationType device_type() const override { return DeviceAllocationType::kCPU; }
+
+  std::shared_ptr<MemoryManager> default_memory_manager() override;
+
+  /// \brief Return the global CPUDevice instance
+  static std::shared_ptr<Device> Instance();
+
+  /// \brief Create a MemoryManager
+  ///
+  /// The returned MemoryManager will use the given MemoryPool for allocations.
+  static std::shared_ptr<MemoryManager> memory_manager(MemoryPool* pool);
+
+ protected:
+  CPUDevice() : Device(true) {}
+};
+
+class ARROW_EXPORT CPUMemoryManager : public MemoryManager {
+ public:
+  Result<std::shared_ptr<io::RandomAccessFile>> GetBufferReader(
+      std::shared_ptr<Buffer> buf) override;
+  Result<std::shared_ptr<io::OutputStream>> GetBufferWriter(
+      std::shared_ptr<Buffer> buf) override;
+
+  Result<std::unique_ptr<Buffer>> AllocateBuffer(int64_t size) override;
+
+  /// \brief Return the MemoryPool associated with this MemoryManager.
+  MemoryPool* pool() const { return pool_; }
+
+ protected:
+  CPUMemoryManager(const std::shared_ptr<Device>& device, MemoryPool* pool)
+      : MemoryManager(device), pool_(pool) {}
+
+  static std::shared_ptr<MemoryManager> Make(const std::shared_ptr<Device>& device,
+                                             MemoryPool* pool = default_memory_pool());
+
+  Result<std::shared_ptr<Buffer>> CopyBufferFrom(
+      const std::shared_ptr<Buffer>& buf,
+      const std::shared_ptr<MemoryManager>& from) override;
+  Result<std::shared_ptr<Buffer>> CopyBufferTo(
+      const std::shared_ptr<Buffer>& buf,
+      const std::shared_ptr<MemoryManager>& to) override;
+  Result<std::unique_ptr<Buffer>> CopyNonOwnedFrom(
+      const Buffer& buf, const std::shared_ptr<MemoryManager>& from) override;
+  Result<std::unique_ptr<Buffer>> CopyNonOwnedTo(
+      const Buffer& buf, const std::shared_ptr<MemoryManager>& to) override;
+  Result<std::shared_ptr<Buffer>> ViewBufferFrom(
+      const std::shared_ptr<Buffer>& buf,
+      const std::shared_ptr<MemoryManager>& from) override;
+  Result<std::shared_ptr<Buffer>> ViewBufferTo(
+      const std::shared_ptr<Buffer>& buf,
+      const std::shared_ptr<MemoryManager>& to) override;
+
+  MemoryPool* pool_;
+
+  friend std::shared_ptr<MemoryManager> CPUDevice::memory_manager(MemoryPool* pool);
+  ARROW_FRIEND_EXPORT friend std::shared_ptr<MemoryManager> default_cpu_memory_manager();
+};
+
+/// \brief Return the default CPU MemoryManager instance
+///
+/// The returned singleton instance uses the default MemoryPool.
+/// This function is a faster spelling of
+/// `CPUDevice::Instance()->default_memory_manager()`.
+ARROW_EXPORT
+std::shared_ptr<MemoryManager> default_cpu_memory_manager();
+
+using DeviceMapper =
+    std::function<Result<std::shared_ptr<MemoryManager>>(int64_t device_id)>;
+
+/// \brief Register a function to retrieve a MemoryManager for a Device type
+///
+/// This registers the device type globally. A specific device type can only
+/// be registered once. This method is thread-safe.
+///
+/// Currently, this registry is only used for importing data through the C Device
+/// Data Interface (for the default Device to MemoryManager mapper in
+/// arrow::ImportDeviceArray/ImportDeviceRecordBatch).
+///
+/// \param[in] device_type the device type for which to register a MemoryManager
+/// \param[in] mapper function that takes a device id and returns the appropriate
+/// MemoryManager for the registered device type and given device id
+/// \return Status
+ARROW_EXPORT
+Status RegisterDeviceMapper(DeviceAllocationType device_type, DeviceMapper mapper);
+
+/// \brief Get the registered function to retrieve a MemoryManager for the
+/// given Device type
+///
+/// \param[in] device_type the device type
+/// \return function that takes a device id and returns the appropriate
+/// MemoryManager for the registered device type and given device id
+ARROW_EXPORT
+Result<DeviceMapper> GetDeviceMapper(DeviceAllocationType device_type);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/device_allocation_type_set.h b/pyarrow/include/arrow/device_allocation_type_set.h
new file mode 100644
index 0000000000000000000000000000000000000000..974367307e6d49695fec17a661f00ede9f620637
--- /dev/null
+++ b/pyarrow/include/arrow/device_allocation_type_set.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <bitset>
+#include <string>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+ARROW_EXPORT
+const char* DeviceAllocationTypeToCStr(DeviceAllocationType type);
+
+class ARROW_EXPORT DeviceAllocationTypeSet {
+ private:
+  std::bitset<kDeviceAllocationTypeMax + 1> device_type_bitset_;
+
+ public:
+  /// \brief Construct an empty set of device types.
+  DeviceAllocationTypeSet() = default;
+
+  /// \brief Construct a set of device types with a single device type.
+  DeviceAllocationTypeSet(  // NOLINT implicit construction
+      DeviceAllocationType accepted_device_type) {
+    add(accepted_device_type);
+  }
+
+  /// \brief Construct a set of device types containing only "kCPU".
+  static DeviceAllocationTypeSet CpuOnly() {
+    return DeviceAllocationTypeSet{DeviceAllocationType::kCPU};
+  }
+
+  /// \brief Construct a set of device types containing all device types.
+  static DeviceAllocationTypeSet All() {
+    DeviceAllocationTypeSet all;
+    all.device_type_bitset_.set();
+    // Don't set the invalid enum values.
+    all.device_type_bitset_.reset(0);
+    all.device_type_bitset_.reset(5);
+    all.device_type_bitset_.reset(6);
+    return all;
+  }
+
+  /// \brief Add a device type to the set of device types.
+  void add(DeviceAllocationType device_type) {
+    device_type_bitset_.set(static_cast<int>(device_type));
+  }
+
+  /// \brief Remove a device type from the set of device types.
+  void remove(DeviceAllocationType device_type) {
+    device_type_bitset_.reset(static_cast<int>(device_type));
+  }
+
+  /// \brief Return true iff the set only contains the CPU device type.
+  bool is_cpu_only() const {
+    return device_type_bitset_ == CpuOnly().device_type_bitset_;
+  }
+
+  /// \brief Return true if the set of accepted device types includes the
+  /// device type.
+  bool contains(DeviceAllocationType device_type) const {
+    return device_type_bitset_.test(static_cast<int>(device_type));
+  }
+
+  /// \brief Add all device types from another set to this set.
+  void Add(DeviceAllocationTypeSet other) {
+    device_type_bitset_ |= other.device_type_bitset_;
+  }
+
+  /// \brief Return true if the set of accepted device types includes all the
+  /// device types in the other set.
+  bool Contains(DeviceAllocationTypeSet other) const {
+    // other \subseteq this <==> (other \intersect this == other)
+    return (other.device_type_bitset_ & device_type_bitset_) == other.device_type_bitset_;
+  }
+
+  std::string ToString() const;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/api.h b/pyarrow/include/arrow/engine/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c94e13032307a7a954ce800fca99ca5a53fd15f
--- /dev/null
+++ b/pyarrow/include/arrow/engine/api.h
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include "arrow/engine/substrait/api.h"
diff --git a/pyarrow/include/arrow/engine/substrait/api.h b/pyarrow/include/arrow/engine/substrait/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..8161f21712974ad6bb6a58ed451807e5a2e8e829
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/api.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include "arrow/engine/substrait/extension_set.h"
+#include "arrow/engine/substrait/extension_types.h"
+#include "arrow/engine/substrait/options.h"
+#include "arrow/engine/substrait/relation.h"
+#include "arrow/engine/substrait/serde.h"
diff --git a/pyarrow/include/arrow/engine/substrait/extension_set.h b/pyarrow/include/arrow/engine/substrait/extension_set.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f631e0f193d9440349f08b5edf7fad6fca49b34
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/extension_set.h
@@ -0,0 +1,481 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/expression.h"
+#include "arrow/engine/substrait/type_fwd.h"
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace engine {
+
+constexpr const char* kSubstraitArithmeticFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_arithmetic.yaml";
+constexpr const char* kSubstraitBooleanFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_boolean.yaml";
+constexpr const char* kSubstraitComparisonFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_comparison.yaml";
+constexpr const char* kSubstraitDatetimeFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_datetime.yaml";
+constexpr const char* kSubstraitLogarithmicFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_logarithmic.yaml";
+constexpr const char* kSubstraitRoundingFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_rounding.yaml";
+constexpr const char* kSubstraitStringFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_string.yaml";
+constexpr const char* kSubstraitAggregateGenericFunctionsUri =
+    "https://github.com/substrait-io/substrait/blob/main/extensions/"
+    "functions_aggregate_generic.yaml";
+
+/// If a function call contains this URI then the function is looked up
+/// in the registry directly, all arguments are mapped as value arguments,
+/// and any options are ignored.
+constexpr const char* kArrowSimpleExtensionFunctionsUri =
+    "urn:arrow:substrait_simple_extension_function";
+
+struct ARROW_ENGINE_EXPORT Id {
+  std::string_view uri, name;
+  bool empty() const { return uri.empty() && name.empty(); }
+  std::string ToString() const;
+};
+struct ARROW_ENGINE_EXPORT IdHashEq {
+  size_t operator()(Id id) const;
+  bool operator()(Id l, Id r) const;
+};
+
+/// \brief Owning storage for ids
+///
+/// Substrait plans may reuse URIs and names in many places.  For convenience
+/// and performance Substrait ids are typically passed around as views.  As we
+/// convert a plan from Substrait to Arrow we need to copy these strings out of
+/// the Substrait buffer and into owned storage.  This class serves as that owned
+/// storage.
+class ARROW_ENGINE_EXPORT IdStorage {
+ public:
+  virtual ~IdStorage() = default;
+  /// \brief Get an equivalent id pointing into this storage
+  ///
+  /// This operation will copy the ids into storage if they do not already exist
+  virtual Id Emplace(Id id) = 0;
+  /// \brief Get an equivalent view pointing into this storage for a URI
+  ///
+  /// If no URI is found then the uri will be copied into storage
+  virtual std::string_view EmplaceUri(std::string_view uri) = 0;
+  /// \brief Get an equivalent id pointing into this storage
+  ///
+  /// If no id is found then nullopt will be returned
+  virtual std::optional<Id> Find(Id id) const = 0;
+  /// \brief Get an equivalent view pointing into this storage for a URI
+  ///
+  /// If no URI is found then nullopt will be returned
+  virtual std::optional<std::string_view> FindUri(std::string_view uri) const = 0;
+
+  static std::unique_ptr<IdStorage> Make();
+};
+
+/// \brief Describes a Substrait call
+///
+/// Substrait call expressions contain a list of arguments which can either
+/// be enum arguments (which are serialized as strings), value arguments (which)
+/// are Arrow expressions, or type arguments (not yet implemented)
+class ARROW_ENGINE_EXPORT SubstraitCall {
+ public:
+  SubstraitCall(Id id, std::shared_ptr<DataType> output_type, bool output_nullable,
+                bool is_hash = false)
+      : id_(id),
+        output_type_(std::move(output_type)),
+        output_nullable_(output_nullable),
+        is_hash_(is_hash) {}
+
+  const Id& id() const { return id_; }
+  const std::shared_ptr<DataType>& output_type() const { return output_type_; }
+  bool output_nullable() const { return output_nullable_; }
+  bool is_hash() const { return is_hash_; }
+  const std::unordered_map<std::string, std::vector<std::string>>& options() const {
+    return options_;
+  }
+
+  bool HasEnumArg(int index) const;
+  Result<std::string_view> GetEnumArg(int index) const;
+  void SetEnumArg(int index, std::string enum_arg);
+  Result<compute::Expression> GetValueArg(int index) const;
+  bool HasValueArg(int index) const;
+  void SetValueArg(int index, compute::Expression value_arg);
+  std::optional<const std::vector<std::string>*> GetOption(
+      std::string_view option_name) const;
+  void SetOption(std::string_view option_name,
+                 const std::vector<std::string_view>& option_preferences);
+  bool HasOptions() const;
+  int size() const { return size_; }
+
+ private:
+  Id id_;
+  std::shared_ptr<DataType> output_type_;
+  bool output_nullable_;
+  // Only needed when converting from Substrait -> Arrow aggregates.  The
+  // Arrow function name depends on whether or not there are any groups
+  bool is_hash_;
+  std::unordered_map<int, std::string> enum_args_;
+  std::unordered_map<int, compute::Expression> value_args_;
+  std::unordered_map<std::string, std::vector<std::string>> options_;
+  int size_ = 0;
+};
+
+/// Substrait identifies functions and custom data types using a (uri, name) pair.
+///
+/// This registry is a bidirectional mapping between Substrait IDs and their
+/// corresponding Arrow counterparts (arrow::DataType and function names in a function
+/// registry)
+///
+/// Substrait extension types and variations must be registered with their
+/// corresponding arrow::DataType before they can be used!
+///
+/// Conceptually this can be thought of as two pairs of `unordered_map`s.  One pair to
+/// go back and forth between Substrait ID and arrow::DataType and another pair to go
+/// back and forth between Substrait ID and Arrow function names.
+///
+/// Unlike an ExtensionSet this registry is not created automatically when consuming
+/// Substrait plans and must be configured ahead of time (although there is a default
+/// instance).
+class ARROW_ENGINE_EXPORT ExtensionIdRegistry {
+ public:
+  using ArrowToSubstraitCall =
+      std::function<Result<SubstraitCall>(const arrow::compute::Expression::Call&)>;
+  using SubstraitCallToArrow =
+      std::function<Result<arrow::compute::Expression>(const SubstraitCall&)>;
+  using ArrowToSubstraitAggregate =
+      std::function<Result<SubstraitCall>(const arrow::compute::Aggregate&)>;
+  using SubstraitAggregateToArrow =
+      std::function<Result<arrow::compute::Aggregate>(const SubstraitCall&)>;
+
+  /// \brief A mapping between a Substrait ID and an arrow::DataType
+  struct TypeRecord {
+    Id id;
+    const std::shared_ptr<DataType>& type;
+  };
+
+  /// \brief Return a uri view owned by this registry
+  ///
+  /// If the URI has never been emplaced it will return nullopt
+  virtual std::optional<std::string_view> FindUri(std::string_view uri) const = 0;
+  /// \brief Return a id view owned by this registry
+  ///
+  /// If the id has never been emplaced it will return nullopt
+  virtual std::optional<Id> FindId(Id id) const = 0;
+  virtual std::optional<TypeRecord> GetType(const DataType&) const = 0;
+  virtual std::optional<TypeRecord> GetType(Id) const = 0;
+  virtual Status CanRegisterType(Id, const std::shared_ptr<DataType>& type) const = 0;
+  virtual Status RegisterType(Id, std::shared_ptr<DataType>) = 0;
+  /// \brief Register a converter that converts an Arrow call to a Substrait call
+  ///
+  /// Note that there may not be 1:1 parity between ArrowToSubstraitCall and
+  /// SubstraitCallToArrow because some standard functions (e.g. add) may map to
+  /// multiple Arrow functions (e.g. add, add_checked)
+  virtual Status AddArrowToSubstraitCall(std::string arrow_function_name,
+                                         ArrowToSubstraitCall conversion_func) = 0;
+  /// \brief Check to see if a converter can be registered
+  ///
+  /// \return Status::OK if there are no conflicts, otherwise an error is returned
+  virtual Status CanAddArrowToSubstraitCall(
+      const std::string& arrow_function_name) const = 0;
+
+  /// \brief Register a converter that converts an Arrow aggregate to a Substrait
+  ///        aggregate
+  virtual Status AddArrowToSubstraitAggregate(
+      std::string arrow_function_name, ArrowToSubstraitAggregate conversion_func) = 0;
+  /// \brief Check to see if a converter can be registered
+  ///
+  /// \return Status::OK if there are no conflicts, otherwise an error is returned
+  virtual Status CanAddArrowToSubstraitAggregate(
+      const std::string& arrow_function_name) const = 0;
+
+  /// \brief Register a converter that converts a Substrait call to an Arrow call
+  virtual Status AddSubstraitCallToArrow(Id substrait_function_id,
+                                         SubstraitCallToArrow conversion_func) = 0;
+  /// \brief Check to see if a converter can be registered
+  ///
+  /// \return Status::OK if there are no conflicts, otherwise an error is returned
+  virtual Status CanAddSubstraitCallToArrow(Id substrait_function_id) const = 0;
+  /// \brief Register a simple mapping function
+  ///
+  /// All calls to the function must pass only value arguments.  The arguments
+  /// will be converted to expressions and passed to the Arrow function
+  virtual Status AddSubstraitCallToArrow(Id substrait_function_id,
+                                         std::string arrow_function_name) = 0;
+
+  /// \brief Register a converter that converts a Substrait aggregate to an Arrow
+  ///        aggregate
+  virtual Status AddSubstraitAggregateToArrow(
+      Id substrait_function_id, SubstraitAggregateToArrow conversion_func) = 0;
+  /// \brief Check to see if a converter can be registered
+  ///
+  /// \return Status::OK if there are no conflicts, otherwise an error is returned
+  virtual Status CanAddSubstraitAggregateToArrow(Id substrait_function_id) const = 0;
+
+  /// \brief Return a list of Substrait functions that have a converter
+  ///
+  /// The function ids are encoded as strings using the pattern {uri}#{name}
+  virtual std::vector<std::string> GetSupportedSubstraitFunctions() const = 0;
+
+  /// \brief Find a converter to map Arrow calls to Substrait calls
+  /// \return A converter function or an invalid status if no converter is registered
+  virtual Result<ArrowToSubstraitCall> GetArrowToSubstraitCall(
+      const std::string& arrow_function_name) const = 0;
+
+  /// \brief Find a converter to map Arrow aggregates to Substrait aggregates
+  /// \return A converter function or an invalid status if no converter is registered
+  virtual Result<ArrowToSubstraitAggregate> GetArrowToSubstraitAggregate(
+      const std::string& arrow_function_name) const = 0;
+
+  /// \brief Find a converter to map a Substrait aggregate to an Arrow aggregate
+  /// \return A converter function or an invalid status if no converter is registered
+  virtual Result<SubstraitAggregateToArrow> GetSubstraitAggregateToArrow(
+      Id substrait_function_id) const = 0;
+
+  /// \brief Find a converter to map a Substrait call to an Arrow call
+  /// \return A converter function or an invalid status if no converter is registered
+  virtual Result<SubstraitCallToArrow> GetSubstraitCallToArrow(
+      Id substrait_function_id) const = 0;
+
+  /// \brief Similar to \see GetSubstraitCallToArrow but only uses the name
+  ///
+  /// There may be multiple functions with the same name and this will return
+  /// the first.  This is slower than GetSubstraitCallToArrow and should only
+  /// be used when the plan does not include a URI (or the URI is "/")
+  virtual Result<SubstraitCallToArrow> GetSubstraitCallToArrowFallback(
+      std::string_view function_name) const = 0;
+
+  /// \brief Similar to \see GetSubstraitAggregateToArrow but only uses the name
+  ///
+  /// \see GetSubstraitCallToArrowFallback for details on the fallback behavior
+  virtual Result<SubstraitAggregateToArrow> GetSubstraitAggregateToArrowFallback(
+      std::string_view function_name) const = 0;
+};
+
+constexpr std::string_view kArrowExtTypesUri =
+    "https://github.com/apache/arrow/blob/main/format/substrait/"
+    "extension_types.yaml";
+// Extension types that don't match 1:1 with a data type (or the data type is
+// parameterized)
+constexpr std::string_view kTimeNanosTypeName = "time_nanos";
+constexpr Id kTimeNanosId = {kArrowExtTypesUri, kTimeNanosTypeName};
+
+/// A default registry with all supported functions and data types registered
+///
+/// Note: Function support is currently very minimal, see ARROW-15538
+ARROW_ENGINE_EXPORT ExtensionIdRegistry* default_extension_id_registry();
+
+/// \brief Make a nested registry with a given parent.
+///
+/// A nested registry supports registering types and functions other and on top of those
+/// already registered in its parent registry. No conflicts in IDs and names used for
+/// lookup are allowed. Normally, the given parent is the default registry.
+///
+/// One use case for a nested registry is for dynamic registration of functions defined
+/// within a Substrait plan while keeping these registrations specific to the plan. When
+/// the Substrait plan is disposed of, normally after its execution, the nested registry
+/// can be disposed of as well.
+ARROW_ENGINE_EXPORT std::shared_ptr<ExtensionIdRegistry> nested_extension_id_registry(
+    const ExtensionIdRegistry* parent);
+
+/// \brief A set of extensions used within a plan
+///
+/// Each time an extension is used within a Substrait plan the extension
+/// must be included in an extension set that is defined at the root of the
+/// plan.
+///
+/// The plan refers to a specific extension using an "anchor" which is an
+/// arbitrary integer invented by the producer that has no meaning beyond a
+/// plan but which should be consistent within a plan.
+///
+/// To support serialization and deserialization this type serves as a
+/// bidirectional map between Substrait ID and "anchor"s.
+///
+/// When deserializing a Substrait plan the extension set should be extracted
+/// after the plan has been converted from Protobuf and before the plan
+/// is converted to an execution plan.
+///
+/// The extension set can be kept and reused during serialization if a perfect
+/// round trip is required.  If serialization is not needed or round tripping
+/// is not required then the extension set can be safely discarded after the
+/// plan has been converted into an execution plan.
+///
+/// When converting an execution plan into a Substrait plan an extension set
+/// can be automatically generated or a previously generated extension set can
+/// be used.
+///
+/// ExtensionSet does not own strings; it only refers to strings in an
+/// ExtensionIdRegistry.
+class ARROW_ENGINE_EXPORT ExtensionSet {
+ public:
+  struct FunctionRecord {
+    Id id;
+    std::string_view name;
+  };
+
+  struct TypeRecord {
+    Id id;
+    std::shared_ptr<DataType> type;
+  };
+
+  /// Construct an empty ExtensionSet to be populated during serialization.
+  explicit ExtensionSet(const ExtensionIdRegistry* = default_extension_id_registry());
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionSet);
+
+  /// Construct an ExtensionSet with explicit extension ids for efficient referencing
+  /// during deserialization. Note that input vectors need not be densely packed; an empty
+  /// (default constructed) Id may be used as a placeholder to indicate an unused
+  /// _anchor/_reference. This factory will be used to wrap the extensions declared in a
+  /// substrait::Plan before deserializing the plan's relations.
+  ///
+  /// Views will be replaced with equivalent views pointing to memory owned by the
+  /// registry.
+  ///
+  /// Note: This is an advanced operation.  The order of the ids, types, and functions
+  /// must match the anchor numbers chosen for a plan.
+  ///
+  /// An extension set should instead be created using
+  /// arrow::engine::GetExtensionSetFromPlan
+  static Result<ExtensionSet> Make(
+      std::unordered_map<uint32_t, std::string_view> uris,
+      std::unordered_map<uint32_t, Id> type_ids,
+      std::unordered_map<uint32_t, Id> function_ids,
+      const ConversionOptions& conversion_options,
+      const ExtensionIdRegistry* = default_extension_id_registry());
+
+  const std::unordered_map<uint32_t, std::string_view>& uris() const { return uris_; }
+
+  /// \brief Returns a data type given an anchor
+  ///
+  /// This is used when converting a Substrait plan to an Arrow execution plan.
+  ///
+  /// If the anchor does not exist in this extension set an error will be returned.
+  Result<TypeRecord> DecodeType(uint32_t anchor) const;
+
+  /// \brief Returns the number of custom type records in this extension set
+  ///
+  /// Note: the types are currently stored as a sparse vector, so this may return a value
+  /// larger than the actual number of types. This behavior may change in the future; see
+  /// ARROW-15583.
+  std::size_t num_types() const { return types_.size(); }
+
+  /// \brief Lookup the anchor for a given type
+  ///
+  /// This operation is used when converting an Arrow execution plan to a Substrait plan.
+  /// If the type has been previously encoded then the same anchor value will returned.
+  ///
+  /// If the type has not been previously encoded then a new anchor value will be created.
+  ///
+  /// If the type does not exist in the extension id registry then an error will be
+  /// returned.
+  ///
+  /// \return An anchor that can be used to refer to the type within a plan
+  Result<uint32_t> EncodeType(const DataType& type);
+
+  /// \brief Return a function id given an anchor
+  ///
+  /// This is used when converting a Substrait plan to an Arrow execution plan.
+  ///
+  /// If the anchor does not exist in this extension set an error will be returned.
+  Result<Id> DecodeFunction(uint32_t anchor) const;
+
+  /// \brief Lookup the anchor for a given function
+  ///
+  /// This operation is used when converting an Arrow execution plan to a Substrait  plan.
+  /// If the function has been previously encoded then the same anchor value will be
+  /// returned.
+  ///
+  /// If the function has not been previously encoded then a new anchor value will be
+  /// created.
+  ///
+  /// If the function name is not in the extension id registry then an error will be
+  /// returned.
+  ///
+  /// \return An anchor that can be used to refer to the function within a plan
+  Result<uint32_t> EncodeFunction(Id function_id);
+
+  /// \brief Stores a plan-specific id that is not known to the registry
+  ///
+  /// This is used when converting an Arrow execution plan to a Substrait plan.
+  ///
+  /// If the function is a UDF, something that wasn't known to the registry,
+  /// then we need long term storage of the function name (the ids are just
+  /// views)
+  Id RegisterPlanSpecificId(Id id);
+
+  /// \brief Return the number of custom functions in this extension set
+  std::size_t num_functions() const { return functions_.size(); }
+
+  const ExtensionIdRegistry* registry() const { return registry_; }
+
+ private:
+  const ExtensionIdRegistry* registry_;
+  // If the registry is not aware of an id then we probably can't do anything
+  // with it.  However, in some cases, these may represent extensions or features
+  // that we can safely ignore.  For example, we can usually safely ignore
+  // extension type variations if we assume the plan is valid.  These ignorable
+  // ids are stored here.
+  std::unique_ptr<IdStorage> plan_specific_ids_ = IdStorage::Make();
+
+  // Map from anchor values to URI values referenced by this extension set
+  std::unordered_map<uint32_t, std::string_view> uris_;
+  // Map from anchor values to type definitions, used during Substrait->Arrow
+  // and populated from the Substrait extension set
+  std::unordered_map<uint32_t, TypeRecord> types_;
+  // Map from anchor values to function ids, used during Substrait->Arrow
+  // and populated from the Substrait extension set
+  std::unordered_map<uint32_t, Id> functions_;
+  // Map from type names to anchor values.  Used during Arrow->Substrait
+  // and built as the plan is created.
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> types_map_;
+  // Map from function names to anchor values.  Used during Arrow->Substrait
+  // and built as the plan is created.
+  std::unordered_map<Id, uint32_t, IdHashEq, IdHashEq> functions_map_;
+
+  Status CheckHasUri(std::string_view uri);
+  void AddUri(std::pair<uint32_t, std::string_view> uri);
+  Status AddUri(Id id);
+};
+
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/extension_types.h b/pyarrow/include/arrow/engine/substrait/extension_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae71ad83f7e5425adeae28d88d031667fe2ce9ce
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/extension_types.h
@@ -0,0 +1,90 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace engine {
+
+// arrow::ExtensionTypes are provided to wrap uuid, fixed_char, varchar, interval_year,
+// and interval_day which are first-class types in substrait but do not appear in
+// the arrow type system.
+//
+// Note that these are not automatically registered with arrow::RegisterExtensionType(),
+// which means among other things that serialization of these types to IPC would fail.
+
+/// fixed_size_binary(16) for storing Universally Unique IDentifiers
+ARROW_ENGINE_EXPORT
+std::shared_ptr<DataType> uuid();
+
+/// fixed_size_binary(length) constrained to contain only valid UTF-8
+ARROW_ENGINE_EXPORT
+std::shared_ptr<DataType> fixed_char(int32_t length);
+
+/// utf8() constrained to be shorter than `length`
+ARROW_ENGINE_EXPORT
+std::shared_ptr<DataType> varchar(int32_t length);
+
+/// fixed_size_list(int32(), 2) storing a number of [years, months]
+ARROW_ENGINE_EXPORT
+std::shared_ptr<DataType> interval_year();
+
+/// fixed_size_list(int32(), 2) storing a number of [days, seconds]
+ARROW_ENGINE_EXPORT
+std::shared_ptr<DataType> interval_day();
+
+/// constructs the appropriate timestamp type given the precision
+/// no time zone
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<DataType>> precision_timestamp(int precision);
+
+/// constructs the appropriate timestamp type given the precision
+/// and the UTC time zone
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<DataType>> precision_timestamp_tz(int precision);
+
+/// Return true if t is Uuid, otherwise false
+ARROW_ENGINE_EXPORT
+bool UnwrapUuid(const DataType&);
+
+/// Return FixedChar length if t is FixedChar, otherwise nullopt
+ARROW_ENGINE_EXPORT
+std::optional<int32_t> UnwrapFixedChar(const DataType&);
+
+/// Return Varchar (max) length if t is VarChar, otherwise nullopt
+ARROW_ENGINE_EXPORT
+std::optional<int32_t> UnwrapVarChar(const DataType& t);
+
+/// Return true if t is IntervalYear, otherwise false
+ARROW_ENGINE_EXPORT
+bool UnwrapIntervalYear(const DataType&);
+
+/// Return true if t is IntervalDay, otherwise false
+ARROW_ENGINE_EXPORT
+bool UnwrapIntervalDay(const DataType&);
+
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/options.h b/pyarrow/include/arrow/engine/substrait/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e6f6efb2c751a97e3f0cd9de3eb55c0bb87772c
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/options.h
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/acero/options.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/engine/substrait/type_fwd.h"
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace engine {
+
+/// How strictly to adhere to the input structure when converting between Substrait and
+/// Acero representations of a plan. This allows the user to trade conversion accuracy
+/// for performance and lenience.
+enum class ARROW_ENGINE_EXPORT ConversionStrictness {
+  /// When a primitive is used at the input that doesn't have an exact match at the
+  /// output, reject the conversion. This effectively asserts that there is no (known)
+  /// information loss in the conversion, and that plans should either round-trip back and
+  /// forth exactly or not at all. This option is primarily intended for testing and
+  /// debugging.
+  EXACT_ROUNDTRIP,
+
+  /// When a primitive is used at the input that doesn't have an exact match at the
+  /// output, attempt to model it with some collection of primitives at the output. This
+  /// means that even if the incoming plan is completely optimal by some metric, the
+  /// returned plan is fairly likely to not be optimal anymore, and round-trips back and
+  /// forth may make the plan increasingly suboptimal. However, every primitive at the
+  /// output can be (manually) traced back to exactly one primitive at the input, which
+  /// may be useful when debugging.
+  PRESERVE_STRUCTURE,
+
+  /// Behaves like PRESERVE_STRUCTURE, but prefers performance over structural accuracy.
+  /// Basic optimizations *may* be applied, in order to attempt to not regress in terms of
+  /// plan performance: if the incoming plan was already aggressively optimized, the goal
+  /// is for the output plan to not be less performant. In practical use cases, this is
+  /// probably the option you want.
+  ///
+  /// Note that no guarantees are made on top of PRESERVE_STRUCTURE. Past and future
+  /// versions of Arrow may even ignore this option entirely and treat it exactly like
+  /// PRESERVE_STRUCTURE.
+  BEST_EFFORT,
+};
+
+using NamedTableProvider = std::function<Result<acero::Declaration>(
+    const std::vector<std::string>&, const Schema&)>;
+static NamedTableProvider kDefaultNamedTableProvider;
+
+using NamedTapProvider = std::function<Result<acero::Declaration>(
+    const std::string&, std::vector<acero::Declaration::Input>, const std::string&,
+    std::shared_ptr<Schema>)>;
+
+class ARROW_ENGINE_EXPORT ExtensionDetails {
+ public:
+  virtual ~ExtensionDetails() = default;
+};
+
+class ARROW_ENGINE_EXPORT ExtensionProvider {
+ public:
+  virtual ~ExtensionProvider() = default;
+  virtual Result<DeclarationInfo> MakeRel(const ConversionOptions& conv_opts,
+                                          const std::vector<DeclarationInfo>& inputs,
+                                          const ExtensionDetails& ext_details,
+                                          const ExtensionSet& ext_set) = 0;
+};
+
+/// \brief Get the default extension provider
+ARROW_ENGINE_EXPORT std::shared_ptr<ExtensionProvider> default_extension_provider();
+/// \brief Set the default extension provider
+///
+/// \param[in] provider the new provider to be set as default
+ARROW_ENGINE_EXPORT void set_default_extension_provider(
+    const std::shared_ptr<ExtensionProvider>& provider);
+
+ARROW_ENGINE_EXPORT NamedTapProvider default_named_tap_provider();
+
+ARROW_ENGINE_EXPORT void set_default_named_tap_provider(NamedTapProvider provider);
+
+/// Options that control the conversion between Substrait and Acero representations of a
+/// plan.
+struct ARROW_ENGINE_EXPORT ConversionOptions {
+  ConversionOptions()
+      : strictness(ConversionStrictness::BEST_EFFORT),
+        named_table_provider(kDefaultNamedTableProvider),
+        named_tap_provider(default_named_tap_provider()),
+        extension_provider(default_extension_provider()),
+        allow_arrow_extensions(false) {}
+
+  /// \brief How strictly the converter should adhere to the structure of the input.
+  ConversionStrictness strictness;
+  /// \brief A custom strategy to be used for providing named tables
+  ///
+  /// The default behavior will return an invalid status if the plan has any
+  /// named table relations.
+  NamedTableProvider named_table_provider;
+  /// \brief A custom strategy to be used for obtaining a tap declaration
+  ///
+  /// The default provider returns an error
+  NamedTapProvider named_tap_provider;
+  /// \brief A custom strategy to be used for providing relation infos.
+  ///
+  /// The default behavior will provide for relations known to Arrow.
+  std::shared_ptr<ExtensionProvider> extension_provider;
+  /// \brief If true then Arrow-specific types and functions will be allowed
+  ///
+  /// Set to false to create plans that are more likely to be compatible with non-Arrow
+  /// engines
+  bool allow_arrow_extensions;
+};
+
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/relation.h b/pyarrow/include/arrow/engine/substrait/relation.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0913b9ae029bf790fe1d348eb82911f8a912079
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/relation.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace engine {
+
+/// Execution information resulting from converting a Substrait relation.
+struct ARROW_ENGINE_EXPORT DeclarationInfo {
+  /// The compute declaration produced thus far.
+  acero::Declaration declaration;
+
+  std::shared_ptr<Schema> output_schema;
+};
+
+/// Information resulting from converting a Substrait plan
+struct ARROW_ENGINE_EXPORT PlanInfo {
+  /// The root declaration.
+  ///
+  /// Only plans containing a single top-level relation are supported and so this will
+  /// represent that relation.
+  ///
+  /// This should technically be a RelRoot but some producers use a simple Rel here and so
+  /// Acero currently supports that case.
+  DeclarationInfo root;
+  /// The names of the output fields
+  ///
+  /// If `root` was created from a simple Rel then this will be empty
+  std::vector<std::string> names;
+};
+
+/// An expression whose output has a name
+struct ARROW_ENGINE_EXPORT NamedExpression {
+  /// An expression
+  compute::Expression expression;
+  // An optional name to assign to the output, may be the empty string
+  std::string name;
+};
+
+/// A collection of expressions bound to a common schema
+struct ARROW_ENGINE_EXPORT BoundExpressions {
+  /// The expressions
+  std::vector<NamedExpression> named_expressions;
+  /// The schema that all the expressions are bound to
+  std::shared_ptr<Schema> schema;
+};
+
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/serde.h b/pyarrow/include/arrow/engine/substrait/serde.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab749f4a64b0513a1838c8e049c2abcd24181016
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/serde.h
@@ -0,0 +1,331 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/dataset/type_fwd.h"
+#include "arrow/engine/substrait/options.h"
+#include "arrow/engine/substrait/relation.h"
+#include "arrow/engine/substrait/type_fwd.h"
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace engine {
+
+/// \brief Serialize an Acero Plan to a binary protobuf Substrait message
+///
+/// \param[in] declaration the Acero declaration to serialize.
+/// This declaration is the sink relation of the Acero plan.
+/// \param[in,out] ext_set the extension mapping to use; may be updated to add
+/// \param[in] conversion_options options to control how the conversion is done
+///
+/// \return a buffer containing the protobuf serialization of the Acero relation
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Buffer>> SerializePlan(
+    const acero::Declaration& declaration, ExtensionSet* ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Serialize expressions to a Substrait message
+///
+/// \param[in] bound_expressions the expressions to serialize.
+/// \param[in] conversion_options options to control how the conversion is done
+/// \param[in,out] ext_set the extension mapping to use, optional, only needed
+///                        if you want to control the value of function anchors
+///                        to mirror a previous serialization / deserialization.
+///                        Will be updated if new functions are encountered
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeExpressions(
+    const BoundExpressions& bound_expressions,
+    const ConversionOptions& conversion_options = {}, ExtensionSet* ext_set = NULLPTR);
+
+/// Factory function type for generating the node that consumes the batches produced by
+/// each toplevel Substrait relation when deserializing a Substrait Plan.
+using ConsumerFactory = std::function<std::shared_ptr<acero::SinkNodeConsumer>()>;
+
+/// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations
+///
+/// The output of each top-level Substrait relation will be sent to a caller supplied
+/// consumer function provided by consumer_factory
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
+/// message
+/// \param[in] consumer_factory factory function for generating the node that consumes
+/// the batches produced by each toplevel Substrait relation
+/// \param[in] registry an extension-id-registry to use, or null for the default one.
+/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
+/// Plan is returned here.
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return a vector of ExecNode declarations, one for each toplevel relation in the
+/// Substrait Plan
+ARROW_ENGINE_EXPORT Result<std::vector<acero::Declaration>> DeserializePlans(
+    const Buffer& buf, const ConsumerFactory& consumer_factory,
+    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserializes a single-relation Substrait Plan message to an execution plan
+///
+/// The output of each top-level Substrait relation will be sent to a caller supplied
+/// consumer function provided by consumer_factory
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
+/// message
+/// \param[in] consumer node that consumes the batches produced by each toplevel Substrait
+/// relation
+/// \param[in] registry an extension-id-registry to use, or null for the default one.
+/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// Plan is returned here.
+/// \return an ExecPlan for the Substrait Plan
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<acero::ExecPlan>> DeserializePlan(
+    const Buffer& buf, const std::shared_ptr<acero::SinkNodeConsumer>& consumer,
+    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
+    const ConversionOptions& conversion_options = {});
+
+/// Factory function type for generating the write options of a node consuming the batches
+/// produced by each toplevel Substrait relation when deserializing a Substrait Plan.
+using WriteOptionsFactory = std::function<std::shared_ptr<dataset::WriteNodeOptions>()>;
+
+/// \brief Deserializes a Substrait Plan message to a list of ExecNode declarations
+///
+/// The output of each top-level Substrait relation will be written to a filesystem.
+/// `write_options_factory` can be used to control write behavior.
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
+/// message
+/// \param[in] write_options_factory factory function for generating the write options of
+/// a node consuming the batches produced by each toplevel Substrait relation
+/// \param[in] registry an extension-id-registry to use, or null for the default one.
+/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
+/// Plan is returned here.
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return a vector of ExecNode declarations, one for each toplevel relation in the
+/// Substrait Plan
+ARROW_ENGINE_EXPORT Result<std::vector<acero::Declaration>> DeserializePlans(
+    const Buffer& buf, const WriteOptionsFactory& write_options_factory,
+    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserializes a single-relation Substrait Plan message to an execution plan
+///
+/// The output of the single Substrait relation will be written to a filesystem.
+/// `write_options_factory` can be used to control write behavior.
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
+/// message
+/// \param[in] write_options write options of a node consuming the batches produced by
+/// each toplevel Substrait relation
+/// \param[in] registry an extension-id-registry to use, or null for the default one.
+/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
+/// Plan is returned here.
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return an ExecPlan for the Substrait Plan
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<acero::ExecPlan>> DeserializePlan(
+    const Buffer& buf, const std::shared_ptr<dataset::WriteNodeOptions>& write_options,
+    const ExtensionIdRegistry* registry = NULLPTR, ExtensionSet* ext_set_out = NULLPTR,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserializes a Substrait Plan message to a Declaration
+///
+/// The plan will not contain any sink nodes and will be suitable for use in any
+/// of the arrow::compute::DeclarationToXyz methods.
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Plan
+/// message
+/// \param[in] registry an extension-id-registry to use, or null for the default one.
+/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
+/// Plan is returned here.
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return A declaration representing the Substrait plan
+ARROW_ENGINE_EXPORT Result<PlanInfo> DeserializePlan(
+    const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR,
+    ExtensionSet* ext_set_out = NULLPTR,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserialize a Substrait ExtendedExpression message to the corresponding Arrow
+/// type
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a collection of bound
+/// expressions
+/// \param[in] registry an extension-id-registry to use, or null for the default one
+/// \param[in] conversion_options options to control how the conversion is done
+/// \param[out] ext_set_out if non-null, the extension mapping used by the Substrait
+/// message is returned here.
+/// \return A collection of expressions and a common input schema they are bound to
+ARROW_ENGINE_EXPORT Result<BoundExpressions> DeserializeExpressions(
+    const Buffer& buf, const ExtensionIdRegistry* registry = NULLPTR,
+    const ConversionOptions& conversion_options = {},
+    ExtensionSet* ext_set_out = NULLPTR);
+
+/// \brief Deserializes a Substrait Type message to the corresponding Arrow type
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait Type
+/// message
+/// \param[in] ext_set the extension mapping to use, normally provided by the
+/// surrounding Plan message
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return the corresponding Arrow data type
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<DataType>> DeserializeType(
+    const Buffer& buf, const ExtensionSet& ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Serializes an Arrow type to a Substrait Type message
+///
+/// \param[in] type the Arrow data type to serialize
+/// \param[in,out] ext_set the extension mapping to use; may be updated to add a
+/// mapping for the given type
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return a buffer containing the protobuf serialization of the corresponding Substrait
+/// Type message
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeType(
+    const DataType& type, ExtensionSet* ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserializes a Substrait NamedStruct message to an Arrow schema
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
+/// NamedStruct message
+/// \param[in] ext_set the extension mapping to use, normally provided by the
+/// surrounding Plan message
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return the corresponding Arrow schema
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Schema>> DeserializeSchema(
+    const Buffer& buf, const ExtensionSet& ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Serializes an Arrow schema to a Substrait NamedStruct message
+///
+/// \param[in] schema the Arrow schema to serialize
+/// \param[in,out] ext_set the extension mapping to use; may be updated to add
+/// mappings for the types used in the schema
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return a buffer containing the protobuf serialization of the corresponding Substrait
+/// NamedStruct message
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeSchema(
+    const Schema& schema, ExtensionSet* ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserializes a Substrait Expression message to a compute expression
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
+/// Expression message
+/// \param[in] ext_set the extension mapping to use, normally provided by the
+/// surrounding Plan message
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return the corresponding Arrow compute expression
+ARROW_ENGINE_EXPORT
+Result<compute::Expression> DeserializeExpression(
+    const Buffer& buf, const ExtensionSet& ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Serializes an Arrow compute expression to a Substrait Expression message
+///
+/// \param[in] expr the Arrow compute expression to serialize
+/// \param[in,out] ext_set the extension mapping to use; may be updated to add
+/// mappings for the types used in the expression
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return a buffer containing the protobuf serialization of the corresponding Substrait
+/// Expression message
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeExpression(
+    const compute::Expression& expr, ExtensionSet* ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Serialize an Acero Declaration to a binary protobuf Substrait message
+///
+/// \param[in] declaration the Acero declaration to serialize
+/// \param[in,out] ext_set the extension mapping to use; may be updated to add
+/// \param[in] conversion_options options to control how the conversion is done
+///
+/// \return a buffer containing the protobuf serialization of the Acero relation
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> SerializeRelation(
+    const acero::Declaration& declaration, ExtensionSet* ext_set,
+    const ConversionOptions& conversion_options = {});
+
+/// \brief Deserializes a Substrait Rel (relation) message to an ExecNode declaration
+///
+/// \param[in] buf a buffer containing the protobuf serialization of a Substrait
+/// Rel message
+/// \param[in] ext_set the extension mapping to use, normally provided by the
+/// surrounding Plan message
+/// \param[in] conversion_options options to control how the conversion is to be done.
+/// \return the corresponding ExecNode declaration
+ARROW_ENGINE_EXPORT Result<acero::Declaration> DeserializeRelation(
+    const Buffer& buf, const ExtensionSet& ext_set,
+    const ConversionOptions& conversion_options = {});
+
+namespace internal {
+
+/// \brief Checks whether two protobuf serializations of a particular Substrait message
+/// type are equivalent
+///
+/// Note that a binary comparison of the two buffers is insufficient. One reason for this
+/// is that the fields of a message can be specified in any order in the serialization.
+///
+/// \param[in] message_name the name of the Substrait message type to check
+/// \param[in] l_buf buffer containing the first protobuf serialization to compare
+/// \param[in] r_buf buffer containing the second protobuf serialization to compare
+/// \return success if equivalent, failure if not
+ARROW_ENGINE_EXPORT
+Status CheckMessagesEquivalent(std::string_view message_name, const Buffer& l_buf,
+                               const Buffer& r_buf);
+
+/// \brief Utility function to convert a JSON serialization of a Substrait message to
+/// its binary serialization
+///
+/// \param[in] type_name the name of the Substrait message type to convert
+/// \param[in] json the JSON string to convert
+/// \param[in] ignore_unknown_fields if true then unknown fields will be ignored and
+///            will not cause an error
+///
+///            This should generally be true to allow consumption of plans from newer
+///            producers but setting to false can be useful if you are testing
+///            conformance to a specific Substrait version
+/// \return a buffer filled with the binary protobuf serialization of message
+ARROW_ENGINE_EXPORT
+Result<std::shared_ptr<Buffer>> SubstraitFromJSON(std::string_view type_name,
+                                                  std::string_view json,
+                                                  bool ignore_unknown_fields = true);
+
+/// \brief Utility function to convert a binary protobuf serialization of a Substrait
+/// message to JSON
+///
+/// \param[in] type_name the name of the Substrait message type to convert
+/// \param[in] buf the buffer containing the binary protobuf serialization of the message
+/// \return a JSON string representing the message
+ARROW_ENGINE_EXPORT
+Result<std::string> SubstraitToJSON(std::string_view type_name, const Buffer& buf);
+
+}  // namespace internal
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/test_plan_builder.h b/pyarrow/include/arrow/engine/substrait/test_plan_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..94c03daaa7a6957a2f8d5db77b7def1f8394d301
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/test_plan_builder.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// These utilities are for internal / unit test use only.
+// They allow for the construction of simple Substrait plans
+// programmatically without first requiring the construction
+// of an ExecPlan
+
+// These utilities have to be here, and not in a test_util.cc
+// file (or in a unit test) because only one .so is allowed
+// to include each .pb.h file or else protobuf will encounter
+// global namespace conflicts.
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace engine {
+
+struct Id;
+
+namespace internal {
+
+/// \brief Create a scan->project->sink plan for tests
+///
+/// The plan will project one additional column using the function
+/// defined by `function_id`, `arguments`, and data_types.  `arguments`
+/// and `data_types` should have the same length but only one of each
+/// should be defined at each index.
+///
+/// If `data_types` is defined at an index then the plan will create a
+/// direct reference (starting at index 0 and increasing by 1 for each
+/// argument of this type).
+///
+/// If `arguments` is defined at an index then the plan will create an
+/// enum argument with that value.
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> CreateScanProjectSubstrait(
+    Id function_id, const std::shared_ptr<Table>& input_table,
+    const std::vector<std::string>& arguments,
+    const std::unordered_map<std::string, std::vector<std::string>>& options,
+    const std::vector<std::shared_ptr<DataType>>& data_types,
+    const DataType& output_type);
+
+/// \brief Create a scan->aggregate->sink plan for tests
+///
+/// The plan will create an aggregate with one grouping set (defined by
+/// key_idxs) and one measure.  The measure will be a function
+/// defined by `function_id` and direct references to `arg_idxs`.
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> CreateScanAggSubstrait(
+    Id function_id, const std::shared_ptr<Table>& input_table,
+    const std::vector<int>& key_idxs, const std::vector<int>& arg_idxs,
+    const DataType& output_type);
+
+}  // namespace internal
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/test_util.h b/pyarrow/include/arrow/engine/substrait/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1db4b255ed8ee6a0ae7bb4a7a57f5a1aadb27cf
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/test_util.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/vector.h"
+
+#include <functional>
+#include <random>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/acero/exec_plan.h"
+#include "arrow/compute/exec.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/pcg_random.h"
+
+namespace arrow {
+namespace engine {
+
+Result<std::shared_ptr<Table>> SortTableOnAllFields(const std::shared_ptr<Table>& tab);
+
+void AssertTablesEqualIgnoringOrder(const std::shared_ptr<Table>& exp,
+                                    const std::shared_ptr<Table>& act);
+
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/type_fwd.h b/pyarrow/include/arrow/engine/substrait/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..6089d3f747a82cdc68b738b9ce6abbbb60e6811c
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/type_fwd.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+namespace arrow {
+namespace engine {
+
+class ExtensionIdRegistry;
+class ExtensionSet;
+
+struct ConversionOptions;
+struct DeclarationInfo;
+
+}  // namespace engine
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/util.h b/pyarrow/include/arrow/engine/substrait/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..bef2a6c7e1823e5a661a36c96a94eac81b5462f4
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/util.h
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compute/type_fwd.h"
+#include "arrow/engine/substrait/options.h"
+#include "arrow/engine/substrait/type_fwd.h"
+#include "arrow/engine/substrait/visibility.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+namespace engine {
+
+using PythonTableProvider =
+    std::function<Result<std::shared_ptr<Table>>(const std::vector<std::string>&)>;
+
+/// \brief Utility method to run a Substrait plan
+/// \param substrait_buffer The plan to run, must be in binary protobuf format
+/// \param registry A registry of extension functions to make available to the plan
+///                 If null then the default registry will be used.
+/// \param memory_pool The memory pool the plan should use to make allocations.
+/// \param func_registry A registry of functions used for execution expressions.
+///                      `registry` maps from Substrait function IDs to "names". These
+///                      names will be provided to `func_registry` to get the actual
+///                      kernel.
+/// \param conversion_options Options to control plan deserialization
+/// \param use_threads If True then the CPU thread pool will be used for CPU work.  If
+///                    False then all work will be done on the calling thread.
+/// \return A record batch reader that will read out the results
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<RecordBatchReader>> ExecuteSerializedPlan(
+    const Buffer& substrait_buffer, const ExtensionIdRegistry* registry = NULLPTR,
+    compute::FunctionRegistry* func_registry = NULLPTR,
+    const ConversionOptions& conversion_options = {}, bool use_threads = true,
+    MemoryPool* memory_pool = default_memory_pool());
+
+/// \brief Get a Serialized Plan from a Substrait JSON plan.
+/// This is a helper method for Python tests.
+ARROW_ENGINE_EXPORT Result<std::shared_ptr<Buffer>> SerializeJsonPlan(
+    const std::string& substrait_json);
+
+/// \brief Make a nested registry with the default registry as parent.
+/// See arrow::engine::nested_extension_id_registry for details.
+ARROW_ENGINE_EXPORT std::shared_ptr<ExtensionIdRegistry> MakeExtensionIdRegistry();
+
+ARROW_ENGINE_EXPORT const std::string& default_extension_types_uri();
+
+// TODO(ARROW-18145) Populate these from cmake files
+constexpr uint32_t kSubstraitMajorVersion = 0;
+constexpr uint32_t kSubstraitMinorVersion = 44;
+constexpr uint32_t kSubstraitPatchVersion = 0;
+
+constexpr uint32_t kSubstraitMinimumMajorVersion = 0;
+constexpr uint32_t kSubstraitMinimumMinorVersion = 20;
+
+Status CheckVersion(uint32_t major_version, uint32_t minor_version);
+
+}  // namespace engine
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/engine/substrait/visibility.h b/pyarrow/include/arrow/engine/substrait/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ed1c67352d6022b6d1770d9b01472b0557666fc
--- /dev/null
+++ b/pyarrow/include/arrow/engine/substrait/visibility.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// TODO(westonpace): Once we have a proper engine module this file
+// should be renamed arrow/engine/visibility.h
+// This API is EXPERIMENTAL.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_ENGINE_STATIC
+#    define ARROW_ENGINE_EXPORT
+#  elif defined(ARROW_ENGINE_EXPORTING)
+#    define ARROW_ENGINE_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_ENGINE_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_ENGINE_NO_EXPORT
+#else  // Not Windows
+#  ifndef ARROW_ENGINE_EXPORT
+#    define ARROW_ENGINE_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_ENGINE_NO_EXPORT
+#    define ARROW_ENGINE_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif  // Non-Windows
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/arrow/extension/bool8.h b/pyarrow/include/arrow/extension/bool8.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbb507639e272daaf37c20accf7f0728c1822281
--- /dev/null
+++ b/pyarrow/include/arrow/extension/bool8.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow::extension {
+
+/// \brief Bool8 is an alternate representation for boolean
+/// arrays using 8 bits instead of 1 bit per value. The underlying
+/// storage type is int8.
+class ARROW_EXPORT Bool8Array : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief Bool8 is an alternate representation for boolean
+/// arrays using 8 bits instead of 1 bit per value. The underlying
+/// storage type is int8.
+class ARROW_EXPORT Bool8Type : public ExtensionType {
+ public:
+  /// \brief Construct a Bool8Type.
+  Bool8Type() : ExtensionType(int8()) {}
+
+  std::string extension_name() const override { return "arrow.bool8"; }
+  std::string ToString(bool show_metadata = false) const override;
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::string Serialize() const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  /// Create a Bool8Array from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  static Result<std::shared_ptr<DataType>> Make();
+};
+
+/// \brief Return a Bool8Type instance.
+ARROW_EXPORT std::shared_ptr<DataType> bool8();
+
+}  // namespace arrow::extension
diff --git a/pyarrow/include/arrow/extension/fixed_shape_tensor.h b/pyarrow/include/arrow/extension/fixed_shape_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..80a602021c60b8ddf8e8627282b976d463d1c21f
--- /dev/null
+++ b/pyarrow/include/arrow/extension/fixed_shape_tensor.h
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow {
+namespace extension {
+
+class ARROW_EXPORT FixedShapeTensorArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+
+  /// \brief Create a FixedShapeTensorArray from a Tensor
+  ///
+  /// This method will create a FixedShapeTensorArray from a Tensor, taking its first
+  /// dimension as the number of elements in the resulting array and the remaining
+  /// dimensions as the shape of the individual tensors. If Tensor provides strides,
+  /// they will be used to determine dimension permutation. Otherwise, row-major layout
+  /// (i.e. no permutation) will be assumed.
+  ///
+  /// \param[in] tensor The Tensor to convert to a FixedShapeTensorArray
+  static Result<std::shared_ptr<FixedShapeTensorArray>> FromTensor(
+      const std::shared_ptr<Tensor>& tensor);
+
+  /// \brief Create a Tensor from FixedShapeTensorArray
+  ///
+  /// This method will create a Tensor from a FixedShapeTensorArray, setting its first
+  /// dimension as length equal to the FixedShapeTensorArray's length and the remaining
+  /// dimensions as the FixedShapeTensorType's shape. Shape and dim_names will be
+  /// permuted according to permutation stored in the FixedShapeTensorType metadata.
+  const Result<std::shared_ptr<Tensor>> ToTensor() const;
+};
+
+/// \brief Concrete type class for constant-size Tensor data.
+/// This is a canonical arrow extension type.
+/// See: https://arrow.apache.org/docs/format/CanonicalExtensions.html
+class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
+ public:
+  FixedShapeTensorType(const std::shared_ptr<DataType>& value_type, const int32_t& size,
+                       const std::vector<int64_t>& shape,
+                       const std::vector<int64_t>& permutation = {},
+                       const std::vector<std::string>& dim_names = {})
+      : ExtensionType(fixed_size_list(value_type, size)),
+        value_type_(value_type),
+        shape_(shape),
+        permutation_(permutation),
+        dim_names_(dim_names) {}
+
+  std::string extension_name() const override { return "arrow.fixed_shape_tensor"; }
+  std::string ToString(bool show_metadata = false) const override;
+
+  /// Number of dimensions of tensor elements
+  size_t ndim() const { return shape_.size(); }
+
+  /// Shape of tensor elements
+  const std::vector<int64_t>& shape() const { return shape_; }
+
+  /// Value type of tensor elements
+  const std::shared_ptr<DataType>& value_type() const { return value_type_; }
+
+  /// Strides of tensor elements. Strides state offset in bytes between adjacent
+  /// elements along each dimension. In case permutation is non-empty strides are
+  /// computed from permuted tensor element's shape.
+  const std::vector<int64_t>& strides();
+
+  /// Permutation mapping from logical to physical memory layout of tensor elements
+  const std::vector<int64_t>& permutation() const { return permutation_; }
+
+  /// Dimension names of tensor elements. Dimensions are ordered physically.
+  const std::vector<std::string>& dim_names() const { return dim_names_; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::string Serialize() const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  /// Create a FixedShapeTensorArray from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  /// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray
+  ///
+  /// This method will return a Tensor from ExtensionScalar with strides
+  /// derived from shape and permutation of FixedShapeTensorType. Shape and
+  /// dim_names will be permuted according to permutation stored in the
+  /// FixedShapeTensorType metadata.
+  static Result<std::shared_ptr<Tensor>> MakeTensor(
+      const std::shared_ptr<ExtensionScalar>& scalar);
+
+  /// \brief Create a FixedShapeTensorType instance
+  static Result<std::shared_ptr<DataType>> Make(
+      const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape,
+      const std::vector<int64_t>& permutation = {},
+      const std::vector<std::string>& dim_names = {});
+
+ private:
+  std::shared_ptr<DataType> storage_type_;
+  std::shared_ptr<DataType> value_type_;
+  std::vector<int64_t> shape_;
+  std::vector<int64_t> strides_;
+  std::vector<int64_t> permutation_;
+  std::vector<std::string> dim_names_;
+};
+
+/// \brief Return a FixedShapeTensorType instance.
+ARROW_EXPORT std::shared_ptr<DataType> fixed_shape_tensor(
+    const std::shared_ptr<DataType>& storage_type, const std::vector<int64_t>& shape,
+    const std::vector<int64_t>& permutation = {},
+    const std::vector<std::string>& dim_names = {});
+
+}  // namespace extension
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/extension/json.h b/pyarrow/include/arrow/extension/json.h
new file mode 100644
index 0000000000000000000000000000000000000000..589b341fa6f7a1ab5abfbfb95d02b24f24e618b0
--- /dev/null
+++ b/pyarrow/include/arrow/extension/json.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::extension {
+
+/// \brief Concrete type class for variable-size JSON data, utf8-encoded.
+class ARROW_EXPORT JsonExtensionType : public ExtensionType {
+ public:
+  explicit JsonExtensionType(const std::shared_ptr<DataType>& storage_type)
+      : ExtensionType(storage_type) {}
+
+  std::string extension_name() const override { return "arrow.json"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+
+  std::string Serialize() const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<DataType> storage_type);
+
+  static bool IsSupportedStorageType(Type::type type_id);
+};
+
+/// \brief Return a JsonExtensionType instance.
+ARROW_EXPORT std::shared_ptr<DataType> json(
+    std::shared_ptr<DataType> storage_type = utf8());
+
+}  // namespace arrow::extension
diff --git a/pyarrow/include/arrow/extension/opaque.h b/pyarrow/include/arrow/extension/opaque.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d3411798f88d187c55930f13d5566a5ff27ca8c
--- /dev/null
+++ b/pyarrow/include/arrow/extension/opaque.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+#include "arrow/type.h"
+
+namespace arrow::extension {
+
+/// \brief Opaque is a placeholder for a type from an external (usually
+///   non-Arrow) system that could not be interpreted.
+class ARROW_EXPORT OpaqueType : public ExtensionType {
+ public:
+  /// \brief Construct an OpaqueType.
+  ///
+  /// \param[in] storage_type The underlying storage type.  Should be
+  ///   arrow::null if there is no data.
+  /// \param[in] type_name The name of the type in the external system.
+  /// \param[in] vendor_name The name of the external system.
+  explicit OpaqueType(std::shared_ptr<DataType> storage_type, std::string type_name,
+                      std::string vendor_name)
+      : ExtensionType(std::move(storage_type)),
+        type_name_(std::move(type_name)),
+        vendor_name_(std::move(vendor_name)) {}
+
+  std::string extension_name() const override { return "arrow.opaque"; }
+  std::string ToString(bool show_metadata) const override;
+  bool ExtensionEquals(const ExtensionType& other) const override;
+  std::string Serialize() const override;
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override;
+  /// Create an OpaqueArray from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  std::string_view type_name() const { return type_name_; }
+  std::string_view vendor_name() const { return vendor_name_; }
+
+ private:
+  std::string type_name_;
+  std::string vendor_name_;
+};
+
+/// \brief Opaque is a wrapper for (usually binary) data from an external
+///   (often non-Arrow) system that could not be interpreted.
+class ARROW_EXPORT OpaqueArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief Return an OpaqueType instance.
+ARROW_EXPORT std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
+                                              std::string type_name,
+                                              std::string vendor_name);
+
+}  // namespace arrow::extension
diff --git a/pyarrow/include/arrow/extension/uuid.h b/pyarrow/include/arrow/extension/uuid.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c9660c463b08f799f71a3a710f3d3f2eadadd96
--- /dev/null
+++ b/pyarrow/include/arrow/extension/uuid.h
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+
+namespace arrow::extension {
+
+/// \brief UuidArray stores array of UUIDs. Underlying storage type is
+/// FixedSizeBinary(16).
+class ARROW_EXPORT UuidArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+/// \brief UuidType is a canonical arrow extension type for UUIDs.
+/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this
+/// does not interpret the bytes in any way. Specific UUID version is not
+/// required or guaranteed.
+class ARROW_EXPORT UuidType : public ExtensionType {
+ public:
+  /// \brief Construct a UuidType.
+  UuidType() : ExtensionType(fixed_size_binary(16)) {}
+
+  std::string extension_name() const override { return "arrow.uuid"; }
+  std::string ToString(bool show_metadata = false) const override;
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  /// Create a UuidArray from ArrayData
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return ""; }
+
+  /// \brief Create a UuidType instance
+  static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); }
+
+  static bool IsSupportedStorageType(const std::shared_ptr<DataType>& storage_type);
+};
+
+/// \brief Return a UuidType instance.
+ARROW_EXPORT std::shared_ptr<DataType> uuid();
+
+}  // namespace arrow::extension
diff --git a/pyarrow/include/arrow/extension_type.h b/pyarrow/include/arrow/extension_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..38200f42c62e8fe42fb8270172896d6b252d28bb
--- /dev/null
+++ b/pyarrow/include/arrow/extension_type.h
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// User-defined extension types.
+/// \since 0.13.0
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief The base class for custom / user-defined types.
+class ARROW_EXPORT ExtensionType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::EXTENSION;
+
+  static constexpr const char* type_name() { return "extension"; }
+
+  /// \brief The type of array used to represent this extension type's data
+  const std::shared_ptr<DataType>& storage_type() const { return storage_type_; }
+
+  /// \brief Return the type category of the storage type
+  Type::type storage_id() const override { return storage_type_->id(); }
+
+  DataTypeLayout layout() const override;
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "extension"; }
+
+  int32_t byte_width() const override { return storage_type_->byte_width(); }
+  int bit_width() const override { return storage_type_->bit_width(); }
+
+  /// \brief Unique name of extension type used to identify type for
+  /// serialization
+  /// \return the string name of the extension
+  virtual std::string extension_name() const = 0;
+
+  /// \brief Determine if two instances of the same extension types are
+  /// equal. Invoked from ExtensionType::Equals
+  /// \param[in] other the type to compare this type with
+  /// \return bool true if type instances are equal
+  virtual bool ExtensionEquals(const ExtensionType& other) const = 0;
+
+  /// \brief Wrap built-in Array type in a user-defined ExtensionArray instance
+  /// \param[in] data the physical storage for the extension type
+  virtual std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const = 0;
+
+  /// \brief Create an instance of the ExtensionType given the actual storage
+  /// type and the serialized representation
+  /// \param[in] storage_type the physical storage type of the extension
+  /// \param[in] serialized_data the serialized representation produced by
+  /// Serialize
+  virtual Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const = 0;
+
+  /// \brief Create a serialized representation of the extension type's
+  /// metadata. The storage type will be handled automatically in IPC code
+  /// paths
+  /// \return the serialized representation
+  virtual std::string Serialize() const = 0;
+
+  /// \brief Wrap the given storage array as an extension array
+  static std::shared_ptr<Array> WrapArray(const std::shared_ptr<DataType>& ext_type,
+                                          const std::shared_ptr<Array>& storage);
+
+  /// \brief Wrap the given chunked storage array as a chunked extension array
+  static std::shared_ptr<ChunkedArray> WrapArray(
+      const std::shared_ptr<DataType>& ext_type,
+      const std::shared_ptr<ChunkedArray>& storage);
+
+ protected:
+  explicit ExtensionType(std::shared_ptr<DataType> storage_type)
+      : DataType(Type::EXTENSION), storage_type_(std::move(storage_type)) {}
+
+  std::shared_ptr<DataType> storage_type_;
+};
+
+/// \brief Base array class for user-defined extension types
+class ARROW_EXPORT ExtensionArray : public Array {
+ public:
+  using TypeClass = ExtensionType;
+  /// \brief Construct an ExtensionArray from an ArrayData.
+  ///
+  /// The ArrayData must have the right ExtensionType.
+  explicit ExtensionArray(const std::shared_ptr<ArrayData>& data);
+
+  /// \brief Construct an ExtensionArray from a type and the underlying storage.
+  ExtensionArray(const std::shared_ptr<DataType>& type,
+                 const std::shared_ptr<Array>& storage);
+
+  const ExtensionType* extension_type() const {
+    return internal::checked_cast<const ExtensionType*>(data_->type.get());
+  }
+
+  /// \brief The physical storage for the extension array
+  const std::shared_ptr<Array>& storage() const { return storage_; }
+
+ protected:
+  void SetData(const std::shared_ptr<ArrayData>& data);
+  std::shared_ptr<Array> storage_;
+};
+
+class ARROW_EXPORT ExtensionTypeRegistry {
+ public:
+  /// \brief Provide access to the global registry to allow code to control for
+  /// race conditions in registry teardown when some types need to be
+  /// unregistered and destroyed first
+  static std::shared_ptr<ExtensionTypeRegistry> GetGlobalRegistry();
+
+  virtual ~ExtensionTypeRegistry() = default;
+
+  virtual Status RegisterType(std::shared_ptr<ExtensionType> type) = 0;
+  virtual Status UnregisterType(const std::string& type_name) = 0;
+  virtual std::shared_ptr<ExtensionType> GetType(const std::string& type_name) = 0;
+};
+
+/// \brief Register an extension type globally. The name returned by the type's
+/// extension_name() method should be unique. This method is thread-safe
+/// \param[in] type an instance of the extension type
+/// \return Status
+ARROW_EXPORT
+Status RegisterExtensionType(std::shared_ptr<ExtensionType> type);
+
+/// \brief Delete an extension type from the global registry. This method is
+/// thread-safe
+/// \param[in] type_name the unique name of a registered extension type
+/// \return Status error if the type name is unknown
+ARROW_EXPORT
+Status UnregisterExtensionType(const std::string& type_name);
+
+/// \brief Retrieve an extension type from the global registry. Returns nullptr
+/// if not found. This method is thread-safe
+/// \return the globally-registered extension type
+ARROW_EXPORT
+std::shared_ptr<ExtensionType> GetExtensionType(const std::string& type_name);
+
+ARROW_EXPORT extern const char kExtensionTypeKeyName[];
+ARROW_EXPORT extern const char kExtensionMetadataKeyName[];
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/api.h b/pyarrow/include/arrow/filesystem/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..7211ad5c2ccdbd20cad3599652766f7562cf5158
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/api.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/config.h"  // IWYU pragma: export
+
+#include "arrow/filesystem/filesystem.h"  // IWYU pragma: export
+#ifdef ARROW_AZURE
+#  include "arrow/filesystem/azurefs.h"  // IWYU pragma: export
+#endif
+#ifdef ARROW_GCS
+#  include "arrow/filesystem/gcsfs.h"  // IWYU pragma: export
+#endif
+#include "arrow/filesystem/hdfs.h"     // IWYU pragma: export
+#include "arrow/filesystem/localfs.h"  // IWYU pragma: export
+#include "arrow/filesystem/mockfs.h"   // IWYU pragma: export
+#ifdef ARROW_S3
+#  include "arrow/filesystem/s3fs.h"  // IWYU pragma: export
+#endif
diff --git a/pyarrow/include/arrow/filesystem/azurefs.h b/pyarrow/include/arrow/filesystem/azurefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee0956afdd7a982769fdb5035db02e17fac3f2cb
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/azurefs.h
@@ -0,0 +1,373 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/uri.h"
+
+namespace Azure::Core::Credentials {
+class TokenCredential;
+}
+
+namespace Azure::Storage {
+class StorageSharedKeyCredential;
+}
+
+namespace Azure::Storage::Blobs {
+class BlobServiceClient;
+}
+
+namespace Azure::Storage::Files::DataLake {
+class DataLakeFileSystemClient;
+class DataLakeServiceClient;
+}  // namespace Azure::Storage::Files::DataLake
+
+namespace arrow::fs {
+
+class TestAzureFileSystem;
+class TestAzureOptions;
+
+/// Options for the AzureFileSystem implementation.
+///
+/// By default, authentication is handled by the Azure SDK's credential chain
+/// which may read from multiple environment variables, such as:
+/// - `AZURE_TENANT_ID`
+/// - `AZURE_CLIENT_ID`
+/// - `AZURE_CLIENT_SECRET`
+/// - `AZURE_AUTHORITY_HOST`
+/// - `AZURE_CLIENT_CERTIFICATE_PATH`
+/// - `AZURE_FEDERATED_TOKEN_FILE`
+///
+/// Functions are provided for explicit configuration of credentials if that is preferred.
+struct ARROW_EXPORT AzureOptions {
+  friend class TestAzureOptions;
+
+  /// \brief The name of the Azure Storage Account being accessed.
+  ///
+  /// All service URLs will be constructed using this storage account name.
+  /// `ConfigureAccountKeyCredential` assumes the user wants to authenticate
+  /// this account.
+  std::string account_name;
+
+  /// \brief hostname[:port] of the Azure Blob Storage Service.
+  ///
+  /// If the hostname is a relative domain name (one that starts with a '.'), then storage
+  /// account URLs will be constructed by prepending the account name to the hostname.
+  /// If the hostname is a fully qualified domain name, then the hostname will be used
+  /// as-is and the account name will follow the hostname in the URL path.
+  ///
+  /// Default: ".blob.core.windows.net"
+  std::string blob_storage_authority = ".blob.core.windows.net";
+
+  /// \brief hostname[:port] of the Azure Data Lake Storage Gen 2 Service.
+  ///
+  /// If the hostname is a relative domain name (one that starts with a '.'), then storage
+  /// account URLs will be constructed by prepending the account name to the hostname.
+  /// If the hostname is a fully qualified domain name, then the hostname will be used
+  /// as-is and the account name will follow the hostname in the URL path.
+  ///
+  /// Default: ".dfs.core.windows.net"
+  std::string dfs_storage_authority = ".dfs.core.windows.net";
+
+  /// \brief Azure Blob Storage connection transport.
+  ///
+  /// Default: "https"
+  std::string blob_storage_scheme = "https";
+
+  /// \brief Azure Data Lake Storage Gen 2 connection transport.
+  ///
+  /// Default: "https"
+  std::string dfs_storage_scheme = "https";
+
+  // TODO(GH-38598): Add support for more auth methods.
+  // std::string connection_string;
+  // std::string sas_token;
+
+  /// \brief Default metadata for OpenOutputStream.
+  ///
+  /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
+  std::shared_ptr<const KeyValueMetadata> default_metadata;
+
+  /// Whether OutputStream writes will be issued in the background, without blocking.
+  bool background_writes = true;
+
+ private:
+  enum class CredentialKind {
+    kDefault,
+    kAnonymous,
+    kStorageSharedKey,
+    kSASToken,
+    kClientSecret,
+    kManagedIdentity,
+    kCLI,
+    kWorkloadIdentity,
+    kEnvironment,
+  } credential_kind_ = CredentialKind::kDefault;
+
+  std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
+      storage_shared_key_credential_;
+  std::string sas_token_;
+  mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_;
+
+ public:
+  AzureOptions();
+  ~AzureOptions();
+
+ private:
+  void ExtractFromUriSchemeAndHierPart(const Uri& uri, std::string* out_path);
+  Status ExtractFromUriQuery(const Uri& uri);
+
+ public:
+  /// \brief Construct a new AzureOptions from an URI.
+  ///
+  /// Supported formats:
+  ///
+  /// 1. abfs[s]://\<account\>.blob.core.windows.net[/\<container\>[/\<path\>]]
+  /// 2. abfs[s]://\<container\>\@\<account\>.dfs.core.windows.net[/path]
+  /// 3. abfs[s]://[\<account@]\<host[.domain]\>[\<:port\>][/\<container\>[/path]]
+  /// 4. abfs[s]://[\<account@]\<container\>[/path]
+  ///
+  /// (1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs
+  /// [1], (3) is for Azure Blob Storage compatible service including Azurite,
+  /// and (4) is a shorter version of (1) and (2).
+  ///
+  /// Note that there is no difference between abfs and abfss. HTTPS is
+  /// used with abfs by default. You can force to use HTTP by specifying
+  /// "enable_tls=false" query.
+  ///
+  /// Supported query parameters:
+  ///
+  /// * blob_storage_authority: Set AzureOptions::blob_storage_authority
+  /// * dfs_storage_authority: Set AzureOptions::dfs_storage_authority
+  /// * enable_tls: If it's "false" or "0", HTTP not HTTPS is used.
+  /// * credential_kind: One of "default", "anonymous", "workload_identity",
+  ///   "environment" or "cli". If "default" is specified, it's
+  ///   just ignored.  If "anonymous" is specified,
+  ///   AzureOptions::ConfigureAnonymousCredential() is called. If
+  ///   "workload_identity" is specified,
+  ///   AzureOptions::ConfigureWorkloadIdentityCredential() is called. If
+  ///   "environment" is specified,
+  ///   AzureOptions::ConfigureEnvironmentCredential() is called. If "cli" is
+  ///   specified, AzureOptions::ConfigureCLICredential() is called.
+  /// * tenant_id: You must specify "client_id" and "client_secret"
+  ///   too. AzureOptions::ConfigureClientSecretCredential() is called.
+  /// * client_id: If you don't specify "tenant_id" and
+  ///   "client_secret",
+  ///   AzureOptions::ConfigureManagedIdentityCredential() is
+  ///   called. If you specify "tenant_id" and "client_secret" too,
+  ///   AzureOptions::ConfigureClientSecretCredential() is called.
+  /// * client_secret: You must specify "tenant_id" and "client_id"
+  ///   too. AzureOptions::ConfigureClientSecretCredential() is called.
+  /// * A SAS token is made up of several query parameters. Appending a SAS
+  ///   token to the URI configures SAS token auth by calling
+  ///   AzureOptions::ConfigureSASCredential().
+  ///
+  /// [1]:
+  /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
+  static Result<AzureOptions> FromUri(const Uri& uri, std::string* out_path);
+  static Result<AzureOptions> FromUri(const std::string& uri, std::string* out_path);
+
+  Status ConfigureDefaultCredential();
+  Status ConfigureAnonymousCredential();
+  Status ConfigureAccountKeyCredential(const std::string& account_key);
+  Status ConfigureSASCredential(const std::string& sas_token);
+  Status ConfigureClientSecretCredential(const std::string& tenant_id,
+                                         const std::string& client_id,
+                                         const std::string& client_secret);
+  Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string());
+  Status ConfigureCLICredential();
+  Status ConfigureWorkloadIdentityCredential();
+  Status ConfigureEnvironmentCredential();
+
+  bool Equals(const AzureOptions& other) const;
+
+  std::string AccountBlobUrl(const std::string& account_name) const;
+  std::string AccountDfsUrl(const std::string& account_name) const;
+
+  Result<std::unique_ptr<Azure::Storage::Blobs::BlobServiceClient>>
+  MakeBlobServiceClient() const;
+
+  Result<std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>>
+  MakeDataLakeServiceClient() const;
+};
+
+/// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and
+/// Azure Data Lake Storage Gen2 (ADLS Gen2) [2].
+///
+/// ADLS Gen2 isn't a dedicated service or account type. It's a set of capabilities that
+/// support high throughput analytic workloads, built on Azure Blob Storage. All the data
+/// ingested via the ADLS Gen2 APIs is persisted as blobs in the storage account.
+/// ADLS Gen2 provides filesystem semantics, file-level security, and Hadoop
+/// compatibility. ADLS Gen1 exists as a separate object that will retired on 2024-02-29
+/// and new ADLS accounts use Gen2 instead.
+///
+/// ADLS Gen2 and Blob APIs can operate on the same data, but there are
+/// some limitations [3]. The ones that are relevant to this
+/// implementation are listed here:
+///
+/// - You can't use Blob APIs, and ADLS APIs to write to the same instance of a file. If
+///   you write to a file by using ADLS APIs then that file's blocks won't be visible
+///   to calls to the GetBlockList Blob API. The only exception is when you're
+///   overwriting.
+/// - When you use the ListBlobs operation without specifying a delimiter, the results
+///   include both directories and blobs. If you choose to use a delimiter, use only a
+///   forward slash (/) \--- the only supported delimiter.
+/// - If you use the DeleteBlob API to delete a directory, that directory is deleted only
+///   if it's empty. This means that you can't use the Blob API delete directories
+///   recursively.
+///
+/// [1]: https://azure.microsoft.com/en-us/products/storage/blobs
+/// [2]: https://azure.microsoft.com/en-us/products/storage/data-lake-storage
+/// [3]:
+/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-known-issues
+class ARROW_EXPORT AzureFileSystem : public FileSystem {
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+
+  explicit AzureFileSystem(std::unique_ptr<Impl>&& impl);
+
+  friend class TestAzureFileSystem;
+  void ForceCachedHierarchicalNamespaceSupport(int hns_support);
+
+ public:
+  ~AzureFileSystem() override = default;
+
+  static Result<std::shared_ptr<AzureFileSystem>> Make(
+      const AzureOptions& options, const io::IOContext& = io::default_io_context());
+
+  std::string type_name() const override { return "abfs"; }
+
+  /// Return the original Azure options when constructing the filesystem
+  const AzureOptions& options() const;
+
+  bool Equals(const FileSystem& other) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  /// \brief Delete a directory and its contents recursively.
+  ///
+  /// Atomicity is guaranteed only on Hierarchical Namespace Storage accounts.
+  Status DeleteDir(const std::string& path) override;
+
+  /// \brief Non-atomically deletes the contents of a directory.
+  ///
+  /// This function can return a bad Status after only partially deleting the
+  /// contents of the directory.
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+
+  /// \brief Deletion of all the containers in the storage account (not
+  /// implemented for safety reasons).
+  ///
+  /// \return Status::NotImplemented
+  Status DeleteRootDirContents() override;
+
+  /// \brief Deletes a file.
+  ///
+  /// Supported on both flat namespace and Hierarchical Namespace storage
+  /// accounts. A check is made to guarantee the parent directory doesn't
+  /// disappear after the blob is deleted and while this operation is running,
+  /// no other client can delete the parent directory due to the use of leases.
+  ///
+  /// This means applications can safely retry this operation without coordination to
+  /// guarantee only one client/process is trying to delete the same file.
+  Status DeleteFile(const std::string& path) override;
+
+  /// \brief Move/rename a file or directory.
+  ///
+  /// There are no files immediately at the root directory, so paths like
+  /// "/segment" always refer to a container of the storage account and are
+  /// treated as directories.
+  ///
+  /// If `dest` exists but the operation fails for some reason, `Move`
+  /// guarantees `dest` is not lost.
+  ///
+  /// Conditions for a successful move:
+  ///
+  /// 1. `src` must exist.
+  /// 2. `dest` can't contain a strict path prefix of `src`. More generally,
+  ///    a directory can't be made a subdirectory of itself.
+  /// 3. If `dest` already exists and it's a file, `src` must also be a file.
+  ///    `dest` is then replaced by `src`.
+  /// 4. All components of `dest` must exist, except for the last.
+  /// 5. If `dest` already exists and it's a directory, `src` must also be a
+  ///    directory and `dest` must be empty. `dest` is then replaced by `src`
+  ///    and its contents.
+  ///
+  /// Leases are used to guarantee the pre-condition checks and the rename
+  /// operation are atomic: other clients can't invalidate the pre-condition in
+  /// the time between the checks and the actual rename operation.
+  ///
+  /// This is possible because Move() is only support on storage accounts with
+  /// Hierarchical Namespace Support enabled.
+  ///
+  /// ## Limitations
+  ///
+  /// - Moves are not supported on storage accounts without
+  ///   Hierarchical Namespace support enabled
+  /// - Moves across different containers are not supported
+  /// - Moving a path of the form `/container` is not supported as it would
+  ///   require moving all the files in a container to another container.
+  ///   The only exception is a `Move("/container_a", "/container_b")` where
+  ///   both containers are empty or `container_b` doesn't even exist.
+  ///   The atomicity of the emptiness checks followed by the renaming operation
+  ///   is guaranteed by the use of leases.
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+};
+
+}  // namespace arrow::fs
diff --git a/pyarrow/include/arrow/filesystem/filesystem.h b/pyarrow/include/arrow/filesystem/filesystem.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a47eb62f524550eccfe750fb542d0cccecc8244
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/filesystem.h
@@ -0,0 +1,723 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <functional>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/filesystem/type_fwd.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+#include "arrow/util/windows_fixup.h"
+
+namespace arrow {
+namespace fs {
+
+using arrow::util::Uri;
+
+// A system clock time point expressed as a 64-bit (or more) number of
+// nanoseconds since the epoch.
+using TimePoint =
+    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
+
+ARROW_EXPORT std::string ToString(FileType);
+
+ARROW_EXPORT std::ostream& operator<<(std::ostream& os, FileType);
+
+static const int64_t kNoSize = -1;
+static const TimePoint kNoTime = TimePoint(TimePoint::duration(-1));
+
+/// \brief FileSystem entry info
+struct ARROW_EXPORT FileInfo : public util::EqualityComparable<FileInfo> {
+  FileInfo() = default;
+  FileInfo(FileInfo&&) = default;
+  FileInfo& operator=(FileInfo&&) = default;
+  FileInfo(const FileInfo&) = default;
+  FileInfo& operator=(const FileInfo&) = default;
+
+  explicit FileInfo(std::string path, FileType type = FileType::Unknown)
+      : path_(std::move(path)), type_(type) {}
+
+  /// The file type
+  FileType type() const { return type_; }
+  void set_type(FileType type) { type_ = type; }
+
+  /// The full file path in the filesystem
+  const std::string& path() const { return path_; }
+  void set_path(std::string path) { path_ = std::move(path); }
+
+  /// The file base name (component after the last directory separator)
+  std::string base_name() const;
+
+  // The directory base name (component before the file base name).
+  std::string dir_name() const;
+
+  /// The size in bytes, if available
+  ///
+  /// Only regular files are guaranteed to have a size.
+  int64_t size() const { return size_; }
+  void set_size(int64_t size) { size_ = size; }
+
+  /// The file extension (excluding the dot)
+  std::string extension() const;
+
+  /// The time of last modification, if available
+  TimePoint mtime() const { return mtime_; }
+  void set_mtime(TimePoint mtime) { mtime_ = mtime; }
+
+  bool IsFile() const { return type_ == FileType::File; }
+  bool IsDirectory() const { return type_ == FileType::Directory; }
+
+  bool Equals(const FileInfo& other) const {
+    return type() == other.type() && path() == other.path() && size() == other.size() &&
+           mtime() == other.mtime();
+  }
+
+  std::string ToString() const;
+
+  /// Function object implementing less-than comparison and hashing by
+  /// path, to support sorting infos, using them as keys, and other
+  /// interactions with the STL.
+  struct ByPath {
+    bool operator()(const FileInfo& l, const FileInfo& r) const {
+      return l.path() < r.path();
+    }
+
+    size_t operator()(const FileInfo& i) const {
+      return std::hash<std::string>{}(i.path());
+    }
+  };
+
+ protected:
+  std::string path_;
+  FileType type_ = FileType::Unknown;
+  int64_t size_ = kNoSize;
+  TimePoint mtime_ = kNoTime;
+};
+
+ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const FileInfo&);
+
+/// \brief File selector for filesystem APIs
+struct ARROW_EXPORT FileSelector {
+  /// The directory in which to select files.
+  /// If the path exists but doesn't point to a directory, this should be an error.
+  std::string base_dir;
+  /// The behavior if `base_dir` isn't found in the filesystem.  If false,
+  /// an error is returned.  If true, an empty selection is returned.
+  bool allow_not_found;
+  /// Whether to recurse into subdirectories.
+  bool recursive;
+  /// The maximum number of subdirectories to recurse into.
+  int32_t max_recursion;
+
+  FileSelector() : allow_not_found(false), recursive(false), max_recursion(INT32_MAX) {}
+};
+
+/// \brief FileSystem, path pair
+struct ARROW_EXPORT FileLocator {
+  std::shared_ptr<FileSystem> filesystem;
+  std::string path;
+};
+
+using FileInfoVector = std::vector<FileInfo>;
+using FileInfoGenerator = std::function<Future<FileInfoVector>()>;
+
+}  // namespace fs
+
+template <>
+struct IterationTraits<fs::FileInfoVector> {
+  static fs::FileInfoVector End() { return {}; }
+  static bool IsEnd(const fs::FileInfoVector& val) { return val.empty(); }
+};
+
+namespace fs {
+
+/// \brief Abstract file system API
+class ARROW_EXPORT FileSystem
+    /// \cond false
+    : public std::enable_shared_from_this<FileSystem>
+/// \endcond
+{  // NOLINT
+ public:
+  virtual ~FileSystem();
+
+  virtual std::string type_name() const = 0;
+
+  /// EXPERIMENTAL: The IOContext associated with this filesystem.
+  const io::IOContext& io_context() const { return io_context_; }
+
+  /// Normalize path for the given filesystem
+  ///
+  /// The default implementation of this method is a no-op, but subclasses
+  /// may allow normalizing irregular path forms (such as Windows local paths).
+  virtual Result<std::string> NormalizePath(std::string path);
+
+  /// \brief Ensure a URI (or path) is compatible with the given filesystem and return the
+  ///        path
+  ///
+  /// \param uri_string A URI representing a resource in the given filesystem.
+  ///
+  /// This method will check to ensure the given filesystem is compatible with the
+  /// URI. This can be useful when the user provides both a URI and a filesystem or
+  /// when a user provides multiple URIs that should be compatible with the same
+  /// filesystem.
+  ///
+  /// uri_string can be an absolute path instead of a URI.  In that case it will ensure
+  /// the filesystem (if supplied) is the local filesystem (or some custom filesystem that
+  /// is capable of reading local paths) and will normalize the path's file separators.
+  ///
+  /// Note, this method only checks to ensure the URI scheme is valid.  It will not detect
+  /// inconsistencies like a mismatching region or endpoint override.
+  ///
+  /// \return The path inside the filesystem that is indicated by the URI.
+  virtual Result<std::string> PathFromUri(const std::string& uri_string) const;
+
+  /// \brief Make a URI from which FileSystemFromUri produces an equivalent filesystem
+  /// \param path The path component to use in the resulting URI. Must be absolute.
+  /// \return A URI string, or an error if an equivalent URI cannot be produced
+  virtual Result<std::string> MakeUri(std::string path) const;
+
+  virtual bool Equals(const FileSystem& other) const = 0;
+
+  virtual bool Equals(const std::shared_ptr<FileSystem>& other) const {
+    return Equals(*other);
+  }
+
+  /// Get info for the given target.
+  ///
+  /// Any symlink is automatically dereferenced, recursively.
+  /// A nonexistent or unreachable file returns an Ok status and
+  /// has a FileType of value NotFound.  An error status indicates
+  /// a truly exceptional condition (low-level I/O error, etc.).
+  virtual Result<FileInfo> GetFileInfo(const std::string& path) = 0;
+  /// Same, for many targets at once.
+  virtual Result<FileInfoVector> GetFileInfo(const std::vector<std::string>& paths);
+  /// Same, according to a selector.
+  ///
+  /// The selector's base directory will not be part of the results, even if
+  /// it exists.
+  /// If it doesn't exist, see `FileSelector::allow_not_found`.
+  virtual Result<FileInfoVector> GetFileInfo(const FileSelector& select) = 0;
+
+  /// Async version of GetFileInfo
+  virtual Future<FileInfoVector> GetFileInfoAsync(const std::vector<std::string>& paths);
+
+  /// Streaming async version of GetFileInfo
+  ///
+  /// The returned generator is not async-reentrant, i.e. you need to wait for
+  /// the returned future to complete before calling the generator again.
+  virtual FileInfoGenerator GetFileInfoGenerator(const FileSelector& select);
+
+  /// Create a directory and subdirectories.
+  ///
+  /// This function succeeds if the directory already exists.
+  virtual Status CreateDir(const std::string& path, bool recursive) = 0;
+  Status CreateDir(const std::string& path) { return CreateDir(path, true); }
+
+  /// Delete a directory and its contents, recursively.
+  virtual Status DeleteDir(const std::string& path) = 0;
+
+  /// Delete a directory's contents, recursively.
+  ///
+  /// Like DeleteDir, but doesn't delete the directory itself.
+  /// Passing an empty path ("" or "/") is disallowed, see DeleteRootDirContents.
+  virtual Status DeleteDirContents(const std::string& path, bool missing_dir_ok) = 0;
+  Status DeleteDirContents(const std::string& path) {
+    return DeleteDirContents(path, false);
+  }
+
+  /// Async version of DeleteDirContents.
+  virtual Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok);
+
+  /// Async version of DeleteDirContents.
+  ///
+  /// This overload allows missing directories.
+  Future<> DeleteDirContentsAsync(const std::string& path);
+
+  /// EXPERIMENTAL: Delete the root directory's contents, recursively.
+  ///
+  /// Implementations may decide to raise an error if this operation is
+  /// too dangerous.
+  // NOTE: may decide to remove this if it's deemed not useful
+  virtual Status DeleteRootDirContents() = 0;
+
+  /// Delete a file.
+  virtual Status DeleteFile(const std::string& path) = 0;
+  /// Delete many files.
+  ///
+  /// The default implementation issues individual delete operations in sequence.
+  virtual Status DeleteFiles(const std::vector<std::string>& paths);
+
+  /// Move / rename a file or directory.
+  ///
+  /// If the destination exists:
+  /// - if it is a non-empty directory, an error is returned
+  /// - otherwise, if it has the same type as the source, it is replaced
+  /// - otherwise, behavior is unspecified (implementation-dependent).
+  virtual Status Move(const std::string& src, const std::string& dest) = 0;
+
+  /// Copy a file.
+  ///
+  /// If the destination exists and is a directory, an error is returned.
+  /// Otherwise, it is replaced.
+  virtual Status CopyFile(const std::string& src, const std::string& dest) = 0;
+
+  /// Open an input stream for sequential reading.
+  virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) = 0;
+
+  /// Open an input stream for sequential reading.
+  ///
+  /// This override assumes the given FileInfo validly represents the file's
+  /// characteristics, and may optimize access depending on them (for example
+  /// avoid querying the file size or its existence).
+  virtual Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info);
+
+  /// Open an input file for random access reading.
+  virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) = 0;
+
+  /// Open an input file for random access reading.
+  ///
+  /// This override assumes the given FileInfo validly represents the file's
+  /// characteristics, and may optimize access depending on them (for example
+  /// avoid querying the file size or its existence).
+  virtual Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info);
+
+  /// Async version of OpenInputStream
+  virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
+      const std::string& path);
+
+  /// Async version of OpenInputStream
+  virtual Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
+      const FileInfo& info);
+
+  /// Async version of OpenInputFile
+  virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
+      const std::string& path);
+
+  /// Async version of OpenInputFile
+  virtual Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
+      const FileInfo& info);
+
+  /// Open an output stream for sequential writing.
+  ///
+  /// If the target already exists, existing data is truncated.
+  virtual Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(const std::string& path);
+
+  /// Open an output stream for appending.
+  ///
+  /// If the target doesn't exist, a new empty file is created.
+  ///
+  /// Note: some filesystem implementations do not support efficient appending
+  /// to an existing file, in which case this method will return NotImplemented.
+  /// Consider writing to multiple files (using e.g. the dataset layer) instead.
+  virtual Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) = 0;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(const std::string& path);
+
+ protected:
+  explicit FileSystem(io::IOContext io_context = io::default_io_context())
+      : io_context_(std::move(io_context)) {}
+
+  io::IOContext io_context_;
+  // Whether metadata operations (such as GetFileInfo or OpenInputStream)
+  // are cheap enough that the default async variants don't bother with
+  // a thread pool.
+  bool default_async_is_sync_ = true;
+};
+
+struct FileSystemFactory {
+  std::function<Result<std::shared_ptr<FileSystem>>(
+      const Uri& uri, const io::IOContext& io_context, std::string* out_path)>
+      function;
+  std::string_view file;
+  int line;
+
+  bool operator==(const FileSystemFactory& other) const {
+    // In the case where libarrow is linked statically both to the executable and to a
+    // dynamically loaded filesystem implementation library, the library contains a
+    // duplicate definition of the registry and duplicate definitions of any
+    // FileSystemRegistrars which are statically linked to libarrow. When retrieving
+    // factories from the filesystem implementation library, we use the file and line
+    // of the registrar's definition to determine equivalence of the duplicate factories.
+    return file == other.file && line == other.line;
+  }
+};
+
+/// \brief A FileSystem implementation that delegates to another
+/// implementation after prepending a fixed base path.
+///
+/// This is useful to expose a logical view of a subtree of a filesystem,
+/// for example a directory in a LocalFileSystem.
+/// This works on abstract paths, i.e. paths using forward slashes and
+/// and a single root "/".  Windows paths are not guaranteed to work.
+/// This makes no security guarantee.  For example, symlinks may allow to
+/// "escape" the subtree and access other parts of the underlying filesystem.
+class ARROW_EXPORT SubTreeFileSystem : public FileSystem {
+ public:
+  // This constructor may abort if base_path is invalid.
+  explicit SubTreeFileSystem(const std::string& base_path,
+                             std::shared_ptr<FileSystem> base_fs);
+  ~SubTreeFileSystem() override;
+
+  std::string type_name() const override { return "subtree"; }
+  std::string base_path() const { return base_path_; }
+  std::shared_ptr<FileSystem> base_fs() const { return base_fs_; }
+
+  Result<std::string> NormalizePath(std::string path) override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+
+  bool Equals(const FileSystem& other) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
+
+  FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info) override;
+
+  Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
+      const std::string& path) override;
+  Future<std::shared_ptr<io::InputStream>> OpenInputStreamAsync(
+      const FileInfo& info) override;
+  Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
+      const std::string& path) override;
+  Future<std::shared_ptr<io::RandomAccessFile>> OpenInputFileAsync(
+      const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+ protected:
+  SubTreeFileSystem() = default;
+
+  const std::string base_path_;
+  std::shared_ptr<FileSystem> base_fs_;
+
+  Result<std::string> PrependBase(const std::string& s) const;
+  Result<std::string> PrependBaseNonEmpty(const std::string& s) const;
+  Result<std::string> StripBase(const std::string& s) const;
+  Status FixInfo(FileInfo* info) const;
+
+  static Result<std::string> NormalizeBasePath(
+      std::string base_path, const std::shared_ptr<FileSystem>& base_fs);
+};
+
+/// \brief A FileSystem implementation that delegates to another
+/// implementation but inserts latencies at various points.
+class ARROW_EXPORT SlowFileSystem : public FileSystem {
+ public:
+  SlowFileSystem(std::shared_ptr<FileSystem> base_fs,
+                 std::shared_ptr<io::LatencyGenerator> latencies);
+  SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency);
+  SlowFileSystem(std::shared_ptr<FileSystem> base_fs, double average_latency,
+                 int32_t seed);
+
+  std::string type_name() const override { return "slow"; }
+  bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+ protected:
+  std::shared_ptr<FileSystem> base_fs_;
+  std::shared_ptr<io::LatencyGenerator> latencies_;
+};
+
+/// \brief Ensure all registered filesystem implementations are finalized.
+///
+/// Individual finalizers may wait for concurrent calls to finish so as to avoid
+/// race conditions. After this function has been called, all filesystem APIs
+/// will fail with an error.
+///
+/// The user is responsible for synchronization of calls to this function.
+void EnsureFinalized();
+
+/// \defgroup filesystem-factories Functions for creating FileSystem instances
+///
+/// @{
+
+/// \brief Create a new FileSystem by URI
+///
+/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3",
+/// "gs" and "gcs".
+///
+/// Support for other schemes can be added using RegisterFileSystemFactory.
+///
+/// \param[in] uri a URI-based path, ex: file:///some/local/path
+/// \param[out] out_path (optional) Path inside the filesystem.
+/// \return out_fs FileSystem instance.
+ARROW_EXPORT
+Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
+                                                      std::string* out_path = NULLPTR);
+
+/// \brief Create a new FileSystem by URI with a custom IO context
+///
+/// Recognized schemes are "file", "mock", "hdfs", "viewfs", "s3",
+/// "gs" and "gcs".
+///
+/// Support for other schemes can be added using RegisterFileSystemFactory.
+///
+/// \param[in] uri a URI-based path, ex: file:///some/local/path
+/// \param[in] io_context an IOContext which will be associated with the filesystem
+/// \param[out] out_path (optional) Path inside the filesystem.
+/// \return out_fs FileSystem instance.
+ARROW_EXPORT
+Result<std::shared_ptr<FileSystem>> FileSystemFromUri(const std::string& uri,
+                                                      const io::IOContext& io_context,
+                                                      std::string* out_path = NULLPTR);
+
+/// \brief Create a new FileSystem by URI
+///
+/// Support for other schemes can be added using RegisterFileSystemFactory.
+///
+/// Same as FileSystemFromUri, but in addition also recognize non-URIs
+/// and treat them as local filesystem paths.  Only absolute local filesystem
+/// paths are allowed.
+ARROW_EXPORT
+Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
+    const std::string& uri, std::string* out_path = NULLPTR);
+
+/// \brief Create a new FileSystem by URI with a custom IO context
+///
+/// Support for other schemes can be added using RegisterFileSystemFactory.
+///
+/// Same as FileSystemFromUri, but in addition also recognize non-URIs
+/// and treat them as local filesystem paths.  Only absolute local filesystem
+/// paths are allowed.
+ARROW_EXPORT
+Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(
+    const std::string& uri, const io::IOContext& io_context,
+    std::string* out_path = NULLPTR);
+
+/// @}
+
+/// \defgroup filesystem-factory-registration Helpers for FileSystem registration
+///
+/// @{
+
+/// \brief Register a FileSystem factory
+///
+/// Support for custom URI schemes can be added by registering a factory
+/// for the corresponding FileSystem.
+///
+/// \param[in] scheme a Uri scheme which the factory will handle.
+///            If a factory has already been registered for a scheme,
+///            the new factory will be ignored.
+/// \param[in] factory a function which can produce a FileSystem for Uris which match
+///            scheme.
+/// \param[in] finalizer a function which must be called to finalize the factory before
+///            the process exits, or nullptr if no finalization is necessary.
+/// \return raises KeyError if a name collision occurs.
+ARROW_EXPORT Status RegisterFileSystemFactory(std::string scheme,
+                                              FileSystemFactory factory,
+                                              std::function<void()> finalizer = {});
+
+/// \brief Register FileSystem factories from a shared library
+///
+/// FileSystem implementations may be housed in separate shared libraries and only
+/// registered when the shared library is explicitly loaded. FileSystemRegistrar is
+/// provided to simplify definition of such libraries: each instance at namespace scope
+/// in the library will register a factory for a scheme. Any library which uses
+/// FileSystemRegistrars and which must be dynamically loaded should be loaded using
+/// LoadFileSystemFactories(), which will additionally merge registries are if necessary
+/// (static linkage to arrow can produce isolated registries).
+ARROW_EXPORT Status LoadFileSystemFactories(const char* libpath);
+
+struct ARROW_EXPORT FileSystemRegistrar {
+  /// \brief Register a FileSystem factory at load time
+  ///
+  /// Support for custom URI schemes can be added by registering a factory for the
+  /// corresponding FileSystem. An instance of this helper can be defined at namespace
+  /// scope to cause the factory to be registered at load time.
+  ///
+  /// Global constructors will finish execution before main() starts if the registrar is
+  /// linked into the same binary as main(), or before dlopen()/LoadLibrary() returns if
+  /// the library in which the registrar is defined is dynamically loaded.
+  ///
+  /// \code
+  ///     FileSystemRegistrar kSlowFileSystemModule{
+  ///       "slowfile",
+  ///       [](const Uri& uri, const io::IOContext& io_context, std::string* out_path)
+  ///           ->Result<std::shared_ptr<FileSystem>> {
+  ///         auto local_uri = "file" + uri.ToString().substr(uri.scheme().size());
+  ///         ARROW_ASSIGN_OR_RAISE(auto base_fs,
+  ///             FileSystemFromUri(local_uri, io_context, out_path));
+  ///         double average_latency = 1;
+  ///         int32_t seed = 0xDEADBEEF;
+  ///         ARROW_ASSIGN_OR_RAISE(auto params, uri.query_item());
+  ///         for (const auto& [key, value] : params) {
+  ///           if (key == "average_latency") {
+  ///             average_latency = std::stod(value);
+  ///           }
+  ///           if (key == "seed") {
+  ///             seed = std::stoi(value, nullptr, /*base=*/16);
+  ///           }
+  ///         }
+  ///         return std::make_shared<SlowFileSystem>(base_fs, average_latency, seed);
+  ///     }));
+  /// \endcode
+  ///
+  /// \param[in] scheme a Uri scheme which the factory will handle.
+  ///            If a factory has already been registered for a scheme, the
+  ///            new factory will be ignored.
+  /// \param[in] factory a function which can produce a FileSystem for Uris which match
+  ///            scheme.
+  /// \param[in] finalizer a function which must be called to finalize the factory before
+  ///            the process exits, or nullptr if no finalization is necessary.
+  FileSystemRegistrar(std::string scheme, FileSystemFactory factory,
+                      std::function<void()> finalizer = {});
+};
+
+#define ARROW_REGISTER_FILESYSTEM(scheme, factory_function, finalizer)            \
+  ::arrow::fs::FileSystemRegistrar {                                              \
+    scheme, ::arrow::fs::FileSystemFactory{factory_function, __FILE__, __LINE__}, \
+        finalizer                                                                 \
+  }
+
+/// @}
+
+namespace internal {
+ARROW_EXPORT void* GetFileSystemRegistry();
+}  // namespace internal
+
+/// \brief Copy files, including from one FileSystem to another
+///
+/// If a source and destination are resident in the same FileSystem FileSystem::CopyFile
+/// will be used, otherwise the file will be opened as a stream in both FileSystems and
+/// chunks copied from the source to the destination. No directories will be created.
+ARROW_EXPORT
+Status CopyFiles(const std::vector<FileLocator>& sources,
+                 const std::vector<FileLocator>& destinations,
+                 const io::IOContext& io_context = io::default_io_context(),
+                 int64_t chunk_size = 1024 * 1024, bool use_threads = true);
+
+/// \brief Copy selected files, including from one FileSystem to another
+///
+/// Directories will be created under the destination base directory as needed.
+ARROW_EXPORT
+Status CopyFiles(const std::shared_ptr<FileSystem>& source_fs,
+                 const FileSelector& source_sel,
+                 const std::shared_ptr<FileSystem>& destination_fs,
+                 const std::string& destination_base_dir,
+                 const io::IOContext& io_context = io::default_io_context(),
+                 int64_t chunk_size = 1024 * 1024, bool use_threads = true);
+
+struct FileSystemGlobalOptions {
+  /// Path to a single PEM file holding all TLS CA certificates
+  ///
+  /// If empty, the underlying TLS library's defaults will be used.
+  std::string tls_ca_file_path;
+
+  /// Path to a directory holding TLS CA certificates in individual PEM files
+  /// named along the OpenSSL "hashed" format.
+  ///
+  /// If empty, the underlying TLS library's defaults will be used.
+  std::string tls_ca_dir_path;
+};
+
+/// EXPERIMENTAL: optional global initialization routine
+///
+/// This is for environments (such as manylinux) where the path
+/// to TLS CA certificates needs to be configured at runtime.
+ARROW_EXPORT
+Status Initialize(const FileSystemGlobalOptions& options);
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/filesystem_library.h b/pyarrow/include/arrow/filesystem/filesystem_library.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d65690130817b17eff819e3a231b79b6e07dc8a
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/filesystem_library.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/filesystem/filesystem.h"
+
+namespace arrow::fs {
+extern "C" {
+
+// ARROW_FORCE_EXPORT ensures this function's visibility is
+// _declspec(dllexport)/[[gnu::visibility("default")]] even when
+// this header is #included by a non-arrow source, as in a third
+// party filesystem implementation.
+ARROW_FORCE_EXPORT void* arrow_filesystem_get_registry();
+
+void* arrow_filesystem_get_registry() {
+  // In the case where libarrow is linked statically both to the executable and to a
+  // dynamically loaded filesystem implementation library, the library contains a
+  // duplicate definition of the registry into which the library's instances of
+  // FileSystemRegistrar insert their factories. This function is made accessible to
+  // dlsym/GetProcAddress to enable detection of such duplicate registries and merging
+  // into the registry accessible to the executable.
+  return internal::GetFileSystemRegistry();
+}
+}
+}  // namespace arrow::fs
diff --git a/pyarrow/include/arrow/filesystem/gcsfs.h b/pyarrow/include/arrow/filesystem/gcsfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fe6f478b48e9ae041a0b0240bbf4d0a01135bdc
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/gcsfs.h
@@ -0,0 +1,242 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/uri.h"
+
+namespace arrow {
+namespace fs {
+namespace internal {
+
+// Opaque wrapper for GCS's library credentials to avoid exposing in Arrow headers.
+struct GcsCredentialsHolder;
+
+}  // namespace internal
+
+class GcsFileSystem;
+
+/// \brief Container for GCS Credentials and information necessary to recreate them.
+class ARROW_EXPORT GcsCredentials {
+ public:
+  bool Equals(const GcsCredentials& other) const;
+  bool anonymous() const { return anonymous_; }
+  const std::string& access_token() const { return access_token_; }
+  TimePoint expiration() const { return expiration_; }
+  const std::string& target_service_account() const { return target_service_account_; }
+  const std::string& json_credentials() const { return json_credentials_; }
+  const std::shared_ptr<internal::GcsCredentialsHolder>& holder() const {
+    return holder_;
+  }
+
+ private:
+  GcsCredentials() = default;
+  bool anonymous_ = false;
+  std::string access_token_;
+  TimePoint expiration_;
+  std::string target_service_account_;
+  std::string json_credentials_;
+  std::shared_ptr<internal::GcsCredentialsHolder> holder_;
+  friend class GcsFileSystem;
+  friend struct GcsOptions;
+};
+
+/// Options for the GcsFileSystem implementation.
+struct ARROW_EXPORT GcsOptions {
+  /// \brief Equivalent to GcsOptions::Defaults().
+  GcsOptions();
+  GcsCredentials credentials;
+
+  std::string endpoint_override;
+  std::string scheme;
+  /// \brief Location to use for creating buckets.
+  std::string default_bucket_location;
+
+  /// \brief If set used to control total time allowed for retrying underlying
+  /// errors.
+  ///
+  /// The default policy is to retry for up to 15 minutes.
+  std::optional<double> retry_limit_seconds;
+
+  /// \brief Default metadata for OpenOutputStream.
+  ///
+  /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
+  std::shared_ptr<const KeyValueMetadata> default_metadata;
+
+  /// \brief The project to use for creating buckets.
+  ///
+  /// If not set, the library uses the GOOGLE_CLOUD_PROJECT environment
+  /// variable. Most I/O operations do not need a project id, only applications
+  /// that create new buckets need a project id.
+  std::optional<std::string> project_id;
+
+  bool Equals(const GcsOptions& other) const;
+
+  /// \brief Initialize with Google Default Credentials
+  ///
+  /// Create options configured to use [Application Default Credentials][aip/4110]. The
+  /// details of this mechanism are too involved to describe here, but suffice is to say
+  /// that applications can override any defaults using an environment variable
+  /// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google
+  /// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that have
+  /// the same behavior as the `gcloud` CLI tool on your workstation.
+  ///
+  /// \see https://cloud.google.com/docs/authentication
+  ///
+  /// [aip/4110]: https://google.aip.dev/auth/4110
+  static GcsOptions Defaults();
+
+  /// \brief Initialize with anonymous credentials
+  static GcsOptions Anonymous();
+
+  /// \brief Initialize with access token
+  ///
+  /// These credentials are useful when using an out-of-band mechanism to fetch access
+  /// tokens. Note that access tokens are time limited, you will need to manually refresh
+  /// the tokens created by the out-of-band mechanism.
+  static GcsOptions FromAccessToken(const std::string& access_token,
+                                    TimePoint expiration);
+
+  /// \brief Initialize with service account impersonation
+  ///
+  /// Service account impersonation allows one principal (a user or service account) to
+  /// impersonate a service account. It requires that the calling principal has the
+  /// necessary permissions *on* the service account.
+  static GcsOptions FromImpersonatedServiceAccount(
+      const GcsCredentials& base_credentials, const std::string& target_service_account);
+
+  /// Creates service account credentials from a JSON object in string form.
+  ///
+  /// The @p json_object  is expected to be in the format described by [aip/4112]. Such an
+  /// object contains the identity of a service account, as well as a private key that can
+  /// be used to sign tokens, showing the caller was holding the private key.
+  ///
+  /// In GCP one can create several "keys" for each service account, and these keys are
+  /// downloaded as a JSON "key file". The contents of such a file are in the format
+  /// required by this function. Remember that key files and their contents should be
+  /// treated as any other secret with security implications, think of them as passwords
+  /// (because they are!), don't store them or output them where unauthorized persons may
+  /// read them.
+  ///
+  /// Most applications should probably use default credentials, maybe pointing them to a
+  /// file with these contents. Using this function may be useful when the json object is
+  /// obtained from a Cloud Secret Manager or a similar service.
+  ///
+  /// [aip/4112]: https://google.aip.dev/auth/4112
+  static GcsOptions FromServiceAccountCredentials(const std::string& json_object);
+
+  /// Initialize from URIs such as "gs://bucket/object".
+  static Result<GcsOptions> FromUri(const arrow::util::Uri& uri, std::string* out_path);
+  static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path);
+};
+
+/// \brief GCS-backed FileSystem implementation.
+///
+/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object
+/// storage system for any amount of data. The main abstractions in GCS are buckets and
+/// objects. A bucket is a namespace for objects, buckets can store any number of objects,
+/// tens of millions and even billions is not uncommon.  Each object contains a single
+/// blob of data, up to 5TiB in size.  Buckets are typically configured to keep a single
+/// version of each object, but versioning can be enabled. Versioning is important because
+/// objects are immutable, once created one cannot append data to the object or modify the
+/// object data in any way.
+///
+/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket
+/// named `foo` no other customer can create a bucket with the same name. Note that a
+/// principal (a user or service account) may only list the buckets they are entitled to,
+/// and then only within a project. It is not possible to list "all" the buckets.
+///
+/// Within each bucket objects are in flat namespace. GCS does not have folders or
+/// directories. However, following some conventions it is possible to emulate
+/// directories. To this end, this class:
+///
+/// - All buckets are treated as directories at the "root"
+/// - Creating a root directory results in a new bucket being created, this may be slower
+///   than most GCS operations.
+/// - The class creates marker objects for a directory, using a metadata attribute to
+///   annotate the file.
+/// - GCS can list all the objects with a given prefix, this is used to emulate listing
+///   of directories.
+/// - In object lists GCS can summarize all the objects with a common prefix as a single
+///   entry, this is used to emulate non-recursive lists. Note that GCS list time is
+///   proportional to the number of objects in the prefix. Listing recursively takes
+///   almost the same time as non-recursive lists.
+///
+class ARROW_EXPORT GcsFileSystem : public FileSystem {
+ public:
+  ~GcsFileSystem() override = default;
+
+  std::string type_name() const override;
+  const GcsOptions& options() const;
+
+  bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<FileInfoVector> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override;
+
+  /// This is not implemented in GcsFileSystem, as it would be too dangerous.
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  /// Create a GcsFileSystem instance from the given options.
+  static Result<std::shared_ptr<GcsFileSystem>> Make(
+      const GcsOptions& options, const io::IOContext& = io::default_io_context());
+
+ private:
+  explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context);
+
+  class Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/hdfs.h b/pyarrow/include/arrow/filesystem/hdfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..25604a39e3aceb26b2e7da5dc72e97a0cbd635d5
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/hdfs.h
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/io/hdfs.h"
+#include "arrow/util/uri.h"
+
+namespace arrow::fs {
+
+/// Options for the HDFS implementation.
+struct ARROW_EXPORT HdfsOptions {
+  HdfsOptions() = default;
+  ~HdfsOptions() = default;
+
+  /// Hdfs configuration options, contains host, port, driver
+  io::HdfsConnectionConfig connection_config;
+
+  /// Used by Hdfs OpenWritable Interface.
+  int32_t buffer_size = 0;
+  int16_t replication = 3;
+  int64_t default_block_size = 0;
+
+  void ConfigureEndPoint(std::string host, int port);
+  void ConfigureReplication(int16_t replication);
+  void ConfigureUser(std::string user_name);
+  void ConfigureBufferSize(int32_t buffer_size);
+  void ConfigureBlockSize(int64_t default_block_size);
+  void ConfigureKerberosTicketCachePath(std::string path);
+  void ConfigureExtraConf(std::string key, std::string val);
+
+  bool Equals(const HdfsOptions& other) const;
+
+  static Result<HdfsOptions> FromUri(const ::arrow::util::Uri& uri);
+  static Result<HdfsOptions> FromUri(const std::string& uri);
+};
+
+/// HDFS-backed FileSystem implementation.
+///
+/// implementation notes:
+/// - This is a wrapper of arrow/io/hdfs, so we can use FileSystem API to handle hdfs.
+class ARROW_EXPORT HadoopFileSystem : public FileSystem {
+ public:
+  ~HadoopFileSystem() override;
+
+  std::string type_name() const override { return "hdfs"; }
+  HdfsOptions options() const;
+  bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  /// Create a HdfsFileSystem instance from the given options.
+  static Result<std::shared_ptr<HadoopFileSystem>> Make(
+      const HdfsOptions& options, const io::IOContext& = io::default_io_context());
+
+ protected:
+  HadoopFileSystem(const HdfsOptions& options, const io::IOContext&);
+
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace arrow::fs
diff --git a/pyarrow/include/arrow/filesystem/localfs.h b/pyarrow/include/arrow/filesystem/localfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..d72e8f7d74d51659b67355c2bdf6b7a107102b75
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/localfs.h
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+
+namespace arrow {
+namespace internal {
+
+class Uri;
+
+}
+
+namespace fs {
+
+/// Options for the LocalFileSystem implementation.
+struct ARROW_EXPORT LocalFileSystemOptions {
+  static constexpr int32_t kDefaultDirectoryReadahead = 16;
+  static constexpr int32_t kDefaultFileInfoBatchSize = 1000;
+
+  /// Whether OpenInputStream and OpenInputFile return a mmap'ed file,
+  /// or a regular one.
+  bool use_mmap = false;
+
+  /// Options related to `GetFileInfoGenerator` interface.
+
+  /// EXPERIMENTAL: The maximum number of directories processed in parallel
+  /// by `GetFileInfoGenerator`.
+  int32_t directory_readahead = kDefaultDirectoryReadahead;
+
+  /// EXPERIMENTAL: The maximum number of entries aggregated into each
+  /// FileInfoVector chunk by `GetFileInfoGenerator`.
+  ///
+  /// Since each FileInfo entry needs a separate `stat` system call, a
+  /// directory with a very large number of files may take a lot of time to
+  /// process entirely. By generating a FileInfoVector after this chunk
+  /// size is reached, we ensure FileInfo entries can start being consumed
+  /// from the FileInfoGenerator with less initial latency.
+  int32_t file_info_batch_size = kDefaultFileInfoBatchSize;
+
+  /// \brief Initialize with defaults
+  static LocalFileSystemOptions Defaults();
+
+  bool Equals(const LocalFileSystemOptions& other) const;
+
+  static Result<LocalFileSystemOptions> FromUri(const ::arrow::util::Uri& uri,
+                                                std::string* out_path);
+};
+
+/// \brief A FileSystem implementation accessing files on the local machine.
+///
+/// This class handles only `/`-separated paths.  If desired, conversion
+/// from Windows backslash-separated paths should be done by the caller.
+/// Details such as symlinks are abstracted away (symlinks are always
+/// followed, except when deleting an entry).
+class ARROW_EXPORT LocalFileSystem : public FileSystem {
+ public:
+  explicit LocalFileSystem(const io::IOContext& = io::default_io_context());
+  explicit LocalFileSystem(const LocalFileSystemOptions&,
+                           const io::IOContext& = io::default_io_context());
+  ~LocalFileSystem() override;
+
+  std::string type_name() const override { return "local"; }
+
+  Result<std::string> NormalizePath(std::string path) override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+  Result<std::string> MakeUri(std::string path) const override;
+
+  bool Equals(const FileSystem& other) const override;
+
+  LocalFileSystemOptions options() const { return options_; }
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
+  FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+ protected:
+  LocalFileSystemOptions options_;
+};
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/mockfs.h b/pyarrow/include/arrow/filesystem/mockfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..5626560e08363f20c5479a1b5f540d6aed1a2d04
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/mockfs.h
@@ -0,0 +1,134 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/windows_fixup.h"
+
+namespace arrow::fs::internal {
+
+struct MockDirInfo {
+  std::string full_path;
+  TimePoint mtime;
+
+  bool operator==(const MockDirInfo& other) const {
+    return mtime == other.mtime && full_path == other.full_path;
+  }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream&, const MockDirInfo&);
+};
+
+struct MockFileInfo {
+  std::string full_path;
+  TimePoint mtime;
+  std::string_view data;
+
+  bool operator==(const MockFileInfo& other) const {
+    return mtime == other.mtime && full_path == other.full_path && data == other.data;
+  }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream&, const MockFileInfo&);
+};
+
+/// A mock FileSystem implementation that holds its contents in memory.
+///
+/// Useful for validating the FileSystem API, writing conformance suite,
+/// and bootstrapping FileSystem-based APIs.
+class ARROW_EXPORT MockFileSystem : public FileSystem {
+ public:
+  explicit MockFileSystem(TimePoint current_time,
+                          const io::IOContext& = io::default_io_context());
+  ~MockFileSystem() override;
+
+  std::string type_name() const override { return "mock"; }
+
+  bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  // Contents-dumping helpers to ease testing.
+  // Output is lexicographically-ordered by full path.
+  std::vector<MockDirInfo> AllDirs();
+  std::vector<MockFileInfo> AllFiles();
+
+  // Create a File with a content from a string.
+  Status CreateFile(const std::string& path, std::string_view content,
+                    bool recursive = true);
+
+  // Create a MockFileSystem out of (empty) FileInfo. The content of every
+  // file is empty and of size 0. All directories will be created recursively.
+  static Result<std::shared_ptr<FileSystem>> Make(TimePoint current_time,
+                                                  const std::vector<FileInfo>& infos);
+
+  class Impl;
+
+ protected:
+  std::unique_ptr<Impl> impl_;
+};
+
+class ARROW_EXPORT MockAsyncFileSystem : public MockFileSystem {
+ public:
+  explicit MockAsyncFileSystem(TimePoint current_time,
+                               const io::IOContext& io_context = io::default_io_context())
+      : MockFileSystem(current_time, io_context) {
+    default_async_is_sync_ = false;
+  }
+
+  FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
+};
+
+}  // namespace arrow::fs::internal
diff --git a/pyarrow/include/arrow/filesystem/path_util.h b/pyarrow/include/arrow/filesystem/path_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d49d9d2efa7f6aa92e568f8305c15dc06c86c806
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/path_util.h
@@ -0,0 +1,178 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace fs {
+namespace internal {
+
+constexpr char kSep = '/';
+
+// Computations on abstract paths (not local paths with system-dependent behaviour).
+// Abstract paths are typically used in URIs.
+
+// Split an abstract path into its individual components.
+ARROW_EXPORT
+std::vector<std::string> SplitAbstractPath(const std::string& path, char sep = kSep);
+
+// Slice the individual components of an abstract path and combine them
+//
+// If offset or length are negative then an empty string is returned
+// If offset is >= the number of components then an empty string is returned
+// If offset + length is >= the number of components then length is truncated
+ARROW_EXPORT
+std::string SliceAbstractPath(const std::string& path, int offset, int length,
+                              char sep = kSep);
+
+// Return the extension of the file
+ARROW_EXPORT std::string GetAbstractPathExtension(const std::string& s);
+
+// Return the depth (number of components) of an abstract path
+//
+// Trailing slashes do not count towards depth
+// Leading slashes do not count towards depth
+//
+// The root path ("/") has depth 0
+ARROW_EXPORT int GetAbstractPathDepth(std::string_view path);
+
+// Return the parent directory and basename of an abstract path.  Both values may be
+// empty.
+ARROW_EXPORT
+std::pair<std::string, std::string> GetAbstractPathParent(const std::string& s);
+
+// Validate an abstract path.
+ARROW_EXPORT
+Status ValidateAbstractPath(std::string_view path);
+
+// Validate the components of an abstract path.
+ARROW_EXPORT
+Status ValidateAbstractPathParts(const std::vector<std::string>& parts);
+
+// Append a non-empty stem to an abstract path.
+ARROW_EXPORT
+std::string ConcatAbstractPath(std::string_view base, std::string_view stem);
+
+// Make path relative to base, if it starts with base.  Otherwise error out.
+ARROW_EXPORT
+Result<std::string> MakeAbstractPathRelative(const std::string& base,
+                                             const std::string& path);
+
+ARROW_EXPORT
+std::string EnsureLeadingSlash(std::string_view s);
+
+ARROW_EXPORT
+std::string_view RemoveLeadingSlash(std::string_view s);
+
+ARROW_EXPORT
+std::string EnsureTrailingSlash(std::string_view s);
+
+/// \brief remove the forward slash (if any) from the given path
+/// \param s the input path
+/// \param preserve_root if true, allow a path of just "/" to remain unchanged
+ARROW_EXPORT
+std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = false);
+
+ARROW_EXPORT
+Status AssertNoTrailingSlash(std::string_view s);
+
+inline bool HasTrailingSlash(std::string_view s) {
+  return !s.empty() && s.back() == kSep;
+}
+
+inline bool HasLeadingSlash(std::string_view s) {
+  return !s.empty() && s.front() == kSep;
+}
+
+ARROW_EXPORT
+bool IsAncestorOf(std::string_view ancestor, std::string_view descendant);
+
+ARROW_EXPORT
+std::optional<std::string_view> RemoveAncestor(std::string_view ancestor,
+                                               std::string_view descendant);
+
+/// Return a vector of ancestors between a base path and a descendant.
+/// For example,
+///
+/// AncestorsFromBasePath("a/b", "a/b/c/d/e") -> ["a/b/c", "a/b/c/d"]
+ARROW_EXPORT
+std::vector<std::string> AncestorsFromBasePath(std::string_view base_path,
+                                               std::string_view descendant);
+
+/// Given a vector of paths of directories which must be created, produce a the minimal
+/// subset for passing to CreateDir(recursive=true) by removing redundant parent
+/// directories
+ARROW_EXPORT
+std::vector<std::string> MinimalCreateDirSet(std::vector<std::string> dirs);
+
+// Join the components of an abstract path.
+template <class StringIt>
+std::string JoinAbstractPath(StringIt it, StringIt end, char sep = kSep) {
+  std::string path;
+  for (; it != end; ++it) {
+    if (it->empty()) continue;
+
+    if (!path.empty()) {
+      path += sep;
+    }
+    path += *it;
+  }
+  return path;
+}
+
+template <class StringRange>
+std::string JoinAbstractPath(const StringRange& range, char sep = kSep) {
+  return JoinAbstractPath(range.begin(), range.end(), sep);
+}
+
+/// Convert slashes to backslashes, on all platforms.  Mostly useful for testing.
+ARROW_EXPORT
+std::string ToBackslashes(std::string_view s);
+
+/// Ensure a local path is abstract, by converting backslashes to regular slashes
+/// on Windows.  Return the path unchanged on other systems.
+ARROW_EXPORT
+std::string ToSlashes(std::string_view s);
+
+ARROW_EXPORT
+bool IsEmptyPath(std::string_view s);
+
+ARROW_EXPORT
+bool IsLikelyUri(std::string_view s);
+
+class ARROW_EXPORT Globber {
+ public:
+  ~Globber();
+  explicit Globber(std::string pattern);
+  bool Matches(const std::string& path);
+
+ protected:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/s3_test_util.h b/pyarrow/include/arrow/filesystem/s3_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a89a7a9d5a15a5562c5871b123eef3da847ec29
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/s3_test_util.h
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include <gtest/gtest.h>
+
+#include "arrow/filesystem/s3fs.h"
+#include "arrow/status.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace fs {
+
+// A minio test server, managed as a child process
+
+class MinioTestServer {
+ public:
+  MinioTestServer();
+  ~MinioTestServer();
+
+  Status Start(bool enable_tls = false);
+
+  Status Stop();
+
+  std::string connect_string() const;
+
+  std::string access_key() const;
+
+  std::string secret_key() const;
+
+  std::string ca_dir_path() const;
+
+  std::string ca_file_path() const;
+
+  std::string scheme() const;
+
+ private:
+  Status GenerateCertificateFile();
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+// A Minio "environment" that spawns Minio processes in advances, such as
+// to hide process launch latencies during testing.
+
+class MinioTestEnvironment : public ::testing::Environment {
+ public:
+  explicit MinioTestEnvironment(bool enable_tls = false);
+  ~MinioTestEnvironment();
+
+  void SetUp() override;
+
+  Result<std::shared_ptr<MinioTestServer>> GetOneServer();
+
+ protected:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+// A global test "environment", to ensure that the S3 API is initialized before
+// running unit tests.
+
+class S3Environment : public ::testing::Environment {
+ public:
+  // We set this environment variable to speed up tests by ensuring
+  // DefaultAWSCredentialsProviderChain does not query (inaccessible)
+  // EC2 metadata endpoint.
+  // This must be done before spawning any Minio child process to avoid any race
+  // condition accessing environment variables.
+  S3Environment() : ec2_metadata_disabled_guard_("AWS_EC2_METADATA_DISABLED", "true") {}
+
+  void SetUp() override {
+    // Change this to increase logging during tests
+    S3GlobalOptions options;
+    options.log_level = S3LogLevel::Fatal;
+    ASSERT_OK(InitializeS3(options));
+  }
+
+  void TearDown() override { ASSERT_OK(FinalizeS3()); }
+
+ private:
+  EnvVarGuard ec2_metadata_disabled_guard_;
+};
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/s3fs.h b/pyarrow/include/arrow/filesystem/s3fs.h
new file mode 100644
index 0000000000000000000000000000000000000000..158d70a93fce92f27ab8fffb286e7b1e78b73b14
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/s3fs.h
@@ -0,0 +1,467 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/uri.h"
+
+namespace Aws::Auth {
+class AWSCredentialsProvider;
+class STSAssumeRoleCredentialsProvider;
+}  // namespace Aws::Auth
+
+namespace Aws::STS {
+class STSClient;
+}  // namespace Aws::STS
+
+namespace arrow::fs {
+
+/// Options for using a proxy for S3
+struct ARROW_EXPORT S3ProxyOptions {
+  std::string scheme;
+  std::string host;
+  int port = -1;
+  std::string username;
+  std::string password;
+
+  /// Initialize from URI such as http://username:password@host:port
+  /// or http://host:port
+  static Result<S3ProxyOptions> FromUri(const std::string& uri);
+  static Result<S3ProxyOptions> FromUri(const ::arrow::util::Uri& uri);
+
+  bool Equals(const S3ProxyOptions& other) const;
+};
+
+enum class S3CredentialsKind : int8_t {
+  /// Anonymous access (no credentials used)
+  Anonymous,
+  /// Use default AWS credentials, configured through environment variables
+  Default,
+  /// Use explicitly-provided access key pair
+  Explicit,
+  /// Assume role through a role ARN
+  Role,
+  /// Use web identity token to assume role, configured through environment variables
+  WebIdentity
+};
+
+/// Pure virtual class for describing custom S3 retry strategies
+class ARROW_EXPORT S3RetryStrategy {
+ public:
+  virtual ~S3RetryStrategy() = default;
+
+  /// Simple struct where each field corresponds to a field in Aws::Client::AWSError
+  struct AWSErrorDetail {
+    /// Corresponds to AWSError::GetErrorType()
+    int error_type;
+    /// Corresponds to AWSError::GetMessage()
+    std::string message;
+    /// Corresponds to AWSError::GetExceptionName()
+    std::string exception_name;
+    /// Corresponds to AWSError::ShouldRetry()
+    bool should_retry;
+  };
+  /// Returns true if the S3 request resulting in the provided error should be retried.
+  virtual bool ShouldRetry(const AWSErrorDetail& error, int64_t attempted_retries) = 0;
+  /// Returns the time in milliseconds the S3 client should sleep for until retrying.
+  virtual int64_t CalculateDelayBeforeNextRetry(const AWSErrorDetail& error,
+                                                int64_t attempted_retries) = 0;
+  /// Returns a stock AWS Default retry strategy.
+  static std::shared_ptr<S3RetryStrategy> GetAwsDefaultRetryStrategy(
+      int64_t max_attempts);
+  /// Returns a stock AWS Standard retry strategy.
+  static std::shared_ptr<S3RetryStrategy> GetAwsStandardRetryStrategy(
+      int64_t max_attempts);
+};
+
+/// Options for the S3FileSystem implementation.
+struct ARROW_EXPORT S3Options {
+  /// \brief Smart defaults for option values
+  ///
+  /// The possible values for this setting are explained in the AWS docs:
+  /// https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html
+  std::string smart_defaults = "standard";
+
+  /// \brief AWS region to connect to.
+  ///
+  /// If unset, the AWS SDK will choose a default value.  The exact algorithm
+  /// depends on the SDK version.  Before 1.8, the default is hardcoded
+  /// to "us-east-1".  Since 1.8, several heuristics are used to determine
+  /// the region (environment variables, configuration profile, EC2 metadata
+  /// server).
+  std::string region;
+
+  /// \brief Socket connection timeout, in seconds
+  ///
+  /// If negative, the AWS SDK default value is used (typically 1 second).
+  double connect_timeout = -1;
+
+  /// \brief Socket read timeout on Windows and macOS, in seconds
+  ///
+  /// If negative, the AWS SDK default value is used (typically 3 seconds).
+  /// This option is ignored on non-Windows, non-macOS systems.
+  double request_timeout = -1;
+
+  /// If non-empty, override region with a connect string such as "localhost:9000"
+  // XXX perhaps instead take a URL like "http://localhost:9000"?
+  std::string endpoint_override;
+  /// S3 connection transport, default "https"
+  std::string scheme = "https";
+
+  /// ARN of role to assume
+  std::string role_arn;
+  /// Optional identifier for an assumed role session.
+  std::string session_name;
+  /// Optional external identifier to pass to STS when assuming a role
+  std::string external_id;
+  /// Frequency (in seconds) to refresh temporary credentials from assumed role
+  int load_frequency = 900;
+
+  /// If connection is through a proxy, set options here
+  S3ProxyOptions proxy_options;
+
+  /// AWS credentials provider
+  std::shared_ptr<Aws::Auth::AWSCredentialsProvider> credentials_provider;
+
+  /// Type of credentials being used. Set along with credentials_provider.
+  S3CredentialsKind credentials_kind = S3CredentialsKind::Default;
+
+  /// Whether to use virtual addressing of buckets
+  ///
+  /// If true, then virtual addressing is always enabled.
+  /// If false, then virtual addressing is only enabled if `endpoint_override` is empty.
+  ///
+  /// This can be used for non-AWS backends that only support virtual hosted-style access.
+  bool force_virtual_addressing = false;
+
+  /// Whether OutputStream writes will be issued in the background, without blocking.
+  bool background_writes = true;
+
+  /// Whether to allow creation of buckets
+  ///
+  /// When S3FileSystem creates new buckets, it does not pass any non-default settings.
+  /// In AWS S3, the bucket and all objects will be not publicly visible, and there
+  /// will be no bucket policies and no resource tags. To have more control over how
+  /// buckets are created, use a different API to create them.
+  bool allow_bucket_creation = false;
+
+  /// Whether to allow deletion of buckets
+  bool allow_bucket_deletion = false;
+
+  /// Whether to allow pessimistic directory creation in CreateDir function
+  ///
+  /// By default, CreateDir function will try to create the directory without checking its
+  /// existence. It's an optimization to try directory creation and catch the error,
+  /// rather than issue two dependent I/O calls.
+  /// Though for key/value storage like Google Cloud Storage, too many creation calls will
+  /// breach the rate limit for object mutation operations and cause serious consequences.
+  /// It's also possible you don't have creation access for the parent directory. Set it
+  /// to be true to address these scenarios.
+  bool check_directory_existence_before_creation = false;
+
+  /// Whether to allow file-open methods to return before the actual open.
+  ///
+  /// Enabling this may reduce the latency of `OpenInputStream`, `OpenOutputStream`,
+  /// and similar methods, by reducing the number of roundtrips necessary. It may also
+  /// allow usage of more efficient S3 APIs for small files.
+  /// The downside is that failure conditions such as attempting to open a file in a
+  /// non-existing bucket will only be reported when actual I/O is done (at worse,
+  /// when attempting to close the file).
+  bool allow_delayed_open = false;
+
+  /// \brief Default metadata for OpenOutputStream.
+  ///
+  /// This will be ignored if non-empty metadata is passed to OpenOutputStream.
+  std::shared_ptr<const KeyValueMetadata> default_metadata;
+
+  /// Optional retry strategy to determine which error types should be retried, and the
+  /// delay between retries.
+  std::shared_ptr<S3RetryStrategy> retry_strategy;
+
+  /// Optional customer-provided key for server-side encryption (SSE-C).
+  ///
+  /// This should be the 32-byte AES-256 key, unencoded.
+  std::string sse_customer_key;
+
+  /// Optional path to a single PEM file holding all TLS CA certificates
+  ///
+  /// If empty, global filesystem options will be used (see FileSystemGlobalOptions);
+  /// if the corresponding global filesystem option is also empty, the underlying
+  /// TLS library's defaults will be used.
+  ///
+  /// Note this option may be ignored on some systems (Windows, macOS).
+  std::string tls_ca_file_path;
+
+  /// Optional path to a directory holding TLS CA
+  ///
+  /// The given directory should contain CA certificates as individual PEM files
+  /// named along the OpenSSL "hashed" format.
+  ///
+  /// If empty, global filesystem options will be used (see FileSystemGlobalOptions);
+  /// if the corresponding global filesystem option is also empty, the underlying
+  /// TLS library's defaults will be used.
+  ///
+  /// Note this option may be ignored on some systems (Windows, macOS).
+  std::string tls_ca_dir_path;
+
+  /// Whether to verify the S3 endpoint's TLS certificate
+  ///
+  /// This option applies if the scheme is "https".
+  bool tls_verify_certificates = true;
+
+  S3Options();
+
+  /// Configure with the default AWS credentials provider chain.
+  void ConfigureDefaultCredentials();
+
+  /// Configure with anonymous credentials.  This will only let you access public buckets.
+  void ConfigureAnonymousCredentials();
+
+  /// Configure with explicit access and secret key.
+  void ConfigureAccessKey(const std::string& access_key, const std::string& secret_key,
+                          const std::string& session_token = "");
+
+  /// Configure with credentials from an assumed role.
+  void ConfigureAssumeRoleCredentials(
+      const std::string& role_arn, const std::string& session_name = "",
+      const std::string& external_id = "", int load_frequency = 900,
+      const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
+
+  /// Configure with credentials from role assumed using a web identity token
+  void ConfigureAssumeRoleWithWebIdentityCredentials();
+
+  std::string GetAccessKey() const;
+  std::string GetSecretKey() const;
+  std::string GetSessionToken() const;
+
+  bool Equals(const S3Options& other) const;
+
+  /// \brief Initialize with default credentials provider chain
+  ///
+  /// This is recommended if you use the standard AWS environment variables
+  /// and/or configuration file.
+  static S3Options Defaults();
+
+  /// \brief Initialize with anonymous credentials.
+  ///
+  /// This will only let you access public buckets.
+  static S3Options Anonymous();
+
+  /// \brief Initialize with explicit access and secret key.
+  ///
+  /// Optionally, a session token may also be provided for temporary credentials
+  /// (from STS).
+  static S3Options FromAccessKey(const std::string& access_key,
+                                 const std::string& secret_key,
+                                 const std::string& session_token = "");
+
+  /// \brief Initialize from an assumed role.
+  static S3Options FromAssumeRole(
+      const std::string& role_arn, const std::string& session_name = "",
+      const std::string& external_id = "", int load_frequency = 900,
+      const std::shared_ptr<Aws::STS::STSClient>& stsClient = NULLPTR);
+
+  /// \brief Initialize from an assumed role with web-identity.
+  /// Uses the AWS SDK which uses environment variables to
+  /// generate temporary credentials.
+  static S3Options FromAssumeRoleWithWebIdentity();
+
+  static Result<S3Options> FromUri(const ::arrow::util::Uri& uri,
+                                   std::string* out_path = NULLPTR);
+  static Result<S3Options> FromUri(const std::string& uri,
+                                   std::string* out_path = NULLPTR);
+};
+
+/// S3-backed FileSystem implementation.
+///
+/// Some implementation notes:
+/// - buckets are special and the operations available on them may be limited
+///   or more expensive than desired.
+class ARROW_EXPORT S3FileSystem : public FileSystem {
+ public:
+  ~S3FileSystem() override;
+
+  std::string type_name() const override { return "s3"; }
+
+  /// Return the original S3 options when constructing the filesystem
+  S3Options options() const;
+  /// Return the actual region this filesystem connects to
+  std::string region() const;
+
+  bool Equals(const FileSystem& other) const override;
+  Result<std::string> PathFromUri(const std::string& uri_string) const override;
+  Result<std::string> MakeUri(std::string path) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::DeleteDirContentsAsync;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<FileInfo>> GetFileInfo(const FileSelector& select) override;
+
+  FileInfoGenerator GetFileInfoGenerator(const FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Future<> DeleteDirContentsAsync(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  /// Create a sequential input stream for reading from a S3 object.
+  ///
+  /// NOTE: Reads from the stream will be synchronous and unbuffered.
+  /// You way want to wrap the stream in a BufferedInputStream or use
+  /// a custom readahead strategy to avoid idle waits.
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  /// Create a sequential input stream for reading from a S3 object.
+  ///
+  /// This override avoids a HEAD request by assuming the FileInfo
+  /// contains correct information.
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override;
+
+  /// Create a random access file for reading from a S3 object.
+  ///
+  /// See OpenInputStream for performance notes.
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  /// Create a random access file for reading from a S3 object.
+  ///
+  /// This override avoids a HEAD request by assuming the FileInfo
+  /// contains correct information.
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const FileInfo& info) override;
+
+  /// Create a sequential output stream for writing to a S3 object.
+  ///
+  /// NOTE: Writes to the stream will be buffered.  Depending on
+  /// S3Options.background_writes, they can be synchronous or not.
+  /// It is recommended to enable background_writes unless you prefer
+  /// implementing your own background execution strategy.
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  /// Create a S3FileSystem instance from the given options.
+  static Result<std::shared_ptr<S3FileSystem>> Make(
+      const S3Options& options, const io::IOContext& = io::default_io_context());
+
+ protected:
+  explicit S3FileSystem(const S3Options& options, const io::IOContext&);
+
+  class Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+enum class S3LogLevel : int8_t { Off, Fatal, Error, Warn, Info, Debug, Trace };
+
+struct ARROW_EXPORT S3GlobalOptions {
+  /// The log level for S3-originating messages.
+  S3LogLevel log_level;
+
+  /// The number of threads to configure when creating AWS' I/O event loop
+  ///
+  /// Defaults to 1 as recommended by AWS' doc when the # of connections is
+  /// expected to be, at most, in the hundreds
+  ///
+  /// For more details see Aws::Crt::Io::EventLoopGroup
+  int num_event_loop_threads = 1;
+
+  /// Whether to install a process-wide SIGPIPE handler
+  ///
+  /// The AWS SDK may sometimes emit SIGPIPE signals for certain errors;
+  /// by default, they would abort the current process.
+  /// This option, if enabled, will install a process-wide signal handler
+  /// that logs and otherwise ignore incoming SIGPIPE signals.
+  ///
+  /// This option has no effect on Windows.
+  bool install_sigpipe_handler = false;
+
+  /// \brief Initialize with default options
+  ///
+  /// For log_level, this method first tries to extract a suitable value from the
+  /// environment variable ARROW_S3_LOG_LEVEL.
+  static S3GlobalOptions Defaults();
+};
+
+/// \brief Initialize the S3 APIs with the specified set of options.
+///
+/// It is required to call this function at least once before using S3FileSystem.
+///
+/// Once this function is called you MUST call FinalizeS3 before the end of the
+/// application in order to avoid a segmentation fault at shutdown.
+ARROW_EXPORT
+Status InitializeS3(const S3GlobalOptions& options);
+
+/// \brief Ensure the S3 APIs are initialized, but only if not already done.
+///
+/// If necessary, this will call InitializeS3() with some default options.
+ARROW_EXPORT
+Status EnsureS3Initialized();
+
+/// Whether S3 was initialized, and not finalized.
+ARROW_EXPORT
+bool IsS3Initialized();
+
+/// Whether S3 was finalized.
+ARROW_EXPORT
+bool IsS3Finalized();
+
+/// \brief Shutdown the S3 APIs.
+///
+/// This can wait for some S3 concurrent calls to finish so as to avoid
+/// race conditions.
+/// After this function has been called, all S3 calls will fail with an error.
+///
+/// Calls to InitializeS3() and FinalizeS3() should be serialized by the
+/// application (this also applies to EnsureS3Initialized() and
+/// EnsureS3Finalized()).
+ARROW_EXPORT
+Status FinalizeS3();
+
+/// \brief Ensure the S3 APIs are shutdown, but only if not already done.
+///
+/// If necessary, this will call FinalizeS3().
+ARROW_EXPORT
+Status EnsureS3Finalized();
+
+ARROW_EXPORT
+Result<std::string> ResolveS3BucketRegion(const std::string& bucket);
+
+}  // namespace arrow::fs
diff --git a/pyarrow/include/arrow/filesystem/test_util.h b/pyarrow/include/arrow/filesystem/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..3217cc8ca36a53c02242e85558d358a69137bcfe
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/test_util.h
@@ -0,0 +1,264 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/filesystem/mockfs.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/util/counting_semaphore_internal.h"
+
+namespace arrow {
+namespace fs {
+
+static constexpr double kTimeSlack = 2.0;  // In seconds
+
+static inline FileInfo File(std::string path) {
+  return FileInfo(std::move(path), FileType::File);
+}
+
+static inline FileInfo Dir(std::string path) {
+  return FileInfo(std::move(path), FileType::Directory);
+}
+
+// A subclass of MockFileSystem that blocks operations until an unlock method is
+// called.
+//
+// This is intended for testing fine-grained ordering of filesystem operations.
+//
+// N.B. Only OpenOutputStream supports gating at the moment but this is simply because
+//      it is all that has been needed so far.  Feel free to add support for more methods
+//      as required.
+class ARROW_TESTING_EXPORT GatedMockFilesystem : public internal::MockFileSystem {
+ public:
+  GatedMockFilesystem(TimePoint current_time,
+                      const io::IOContext& = io::default_io_context());
+  ~GatedMockFilesystem() override;
+
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata = {}) override;
+
+  // Wait until at least num_waiters are waiting on OpenOutputStream
+  Status WaitForOpenOutputStream(uint32_t num_waiters);
+  // Unlock `num_waiters` individual calls to OpenOutputStream
+  Status UnlockOpenOutputStream(uint32_t num_waiters);
+
+ private:
+  util::CountingSemaphore open_output_sem_;
+};
+
+ARROW_TESTING_EXPORT
+void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);
+
+// Sort a vector of FileInfo by lexicographic path order
+ARROW_TESTING_EXPORT
+void SortInfos(FileInfoVector* infos);
+
+// Create a copy of a FileInfo vector sorted by lexicographic path order
+ARROW_TESTING_EXPORT
+FileInfoVector SortedInfos(const FileInfoVector& infos);
+
+ARROW_TESTING_EXPORT
+void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
+                    TimePoint mtime);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
+                    TimePoint mtime, int64_t size);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(const FileInfo& info, const std::string& path, FileType type,
+                    int64_t size);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
+                    TimePoint mtime);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type,
+                    TimePoint mtime, int64_t size);
+
+ARROW_TESTING_EXPORT
+void AssertFileInfo(FileSystem* fs, const std::string& path, FileType type, int64_t size);
+
+ARROW_TESTING_EXPORT
+void AssertFileContents(FileSystem* fs, const std::string& path,
+                        const std::string& expected_data);
+
+template <typename Duration>
+void AssertDurationBetween(Duration d, double min_secs, double max_secs) {
+  auto seconds = std::chrono::duration_cast<std::chrono::duration<double>>(d);
+  ASSERT_GE(seconds.count(), min_secs);
+  ASSERT_LE(seconds.count(), max_secs);
+}
+
+// Generic tests for FileSystem implementations.
+// To use this class, subclass both from it and ::testing::Test,
+// implement GetEmptyFileSystem(), and use GENERIC_FS_TEST_FUNCTIONS()
+// to define the various tests.
+class ARROW_TESTING_EXPORT GenericFileSystemTest {
+ public:
+  virtual ~GenericFileSystemTest();
+
+  void TestEmpty();
+  void TestNormalizePath();
+  void TestCreateDir();
+  void TestDeleteDir();
+  void TestDeleteDirContents();
+  void TestDeleteRootDirContents();
+  void TestDeleteFile();
+  void TestDeleteFiles();
+  void TestMoveFile();
+  void TestMoveDir();
+  void TestCopyFile();
+  void TestCopyFiles();
+  void TestGetFileInfo();
+  void TestGetFileInfoVector();
+  void TestGetFileInfoSelector();
+  void TestGetFileInfoSelectorWithRecursion();
+  void TestGetFileInfoAsync();
+  void TestGetFileInfoGenerator();
+  void TestOpenOutputStream();
+  void TestOpenAppendStream();
+  void TestOpenInputStream();
+  void TestOpenInputStreamWithFileInfo();
+  void TestOpenInputStreamAsync();
+  void TestOpenInputFile();
+  void TestOpenInputFileWithFileInfo();
+  void TestOpenInputFileAsync();
+  void TestSpecialChars();
+
+ protected:
+  // This function should return the filesystem under test.
+  virtual std::shared_ptr<FileSystem> GetEmptyFileSystem() = 0;
+
+  // Override the following functions to specify deviations from expected
+  // filesystem semantics.
+  // - Whether the filesystem may "implicitly" create intermediate directories
+  virtual bool have_implicit_directories() const { return false; }
+  // - Whether the filesystem may allow writing a file "over" a directory
+  virtual bool allow_write_file_over_dir() const { return false; }
+  // - Whether the filesystem may allow writing a directory "over" a file,
+  //   for example copying file "A" to "B/C" while "B" exists and is a file.
+  virtual bool allow_write_implicit_dir_over_file() const { return false; }
+  // - Whether the filesystem allows reading a directory
+  virtual bool allow_read_dir_as_file() const { return false; }
+  // - Whether the filesystem allows moving a file
+  virtual bool allow_move_file() const { return true; }
+  // - Whether the filesystem allows moving a directory
+  virtual bool allow_move_dir() const { return true; }
+  // - Whether the filesystem allows moving a directory "over" a non-empty destination
+  virtual bool allow_move_dir_over_non_empty_dir() const { return false; }
+  // - Whether the filesystem allows appending to a file
+  virtual bool allow_append_to_file() const { return true; }
+  // - Whether the filesystem allows appending to a nonexistent file
+  virtual bool allow_append_to_new_file() const { return true; }
+  // - Whether the filesystem supports directory modification times
+  virtual bool have_directory_mtimes() const { return true; }
+  // - Whether some directory tree deletion tests may fail randomly
+  virtual bool have_flaky_directory_tree_deletion() const { return false; }
+  // - Whether the filesystem stores some metadata alongside files
+  virtual bool have_file_metadata() const { return false; }
+  // - Whether the filesystem has a false positive memory leak with generator
+  virtual bool have_false_positive_memory_leak_with_generator() const { return false; }
+  // - Whether the filesystem has a false positive memory leak in async close
+  virtual bool have_false_positive_memory_leak_with_async_close() const { return false; }
+
+  void TestEmpty(FileSystem* fs);
+  void TestNormalizePath(FileSystem* fs);
+  void TestCreateDir(FileSystem* fs);
+  void TestDeleteDir(FileSystem* fs);
+  void TestDeleteDirContents(FileSystem* fs);
+  void TestDeleteRootDirContents(FileSystem* fs);
+  void TestDeleteFile(FileSystem* fs);
+  void TestDeleteFiles(FileSystem* fs);
+  void TestMoveFile(FileSystem* fs);
+  void TestMoveDir(FileSystem* fs);
+  void TestCopyFile(FileSystem* fs);
+  void TestCopyFiles(FileSystem* fs);
+  void TestGetFileInfo(FileSystem* fs);
+  void TestGetFileInfoVector(FileSystem* fs);
+  void TestGetFileInfoSelector(FileSystem* fs);
+  void TestGetFileInfoSelectorWithRecursion(FileSystem* fs);
+  void TestGetFileInfoAsync(FileSystem* fs);
+  void TestGetFileInfoGenerator(FileSystem* fs);
+  void TestOpenOutputStream(FileSystem* fs);
+  void TestOpenAppendStream(FileSystem* fs);
+  void TestOpenInputStream(FileSystem* fs);
+  void TestOpenInputStreamWithFileInfo(FileSystem* fs);
+  void TestOpenInputStreamAsync(FileSystem* fs);
+  void TestOpenInputFile(FileSystem* fs);
+  void TestOpenInputFileWithFileInfo(FileSystem* fs);
+  void TestOpenInputFileAsync(FileSystem* fs);
+  void TestSpecialChars(FileSystem* fs);
+};
+
+#define GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NAME) \
+  TEST_MACRO(TEST_CLASS, NAME) { this->Test##NAME(); }
+
+#define GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_MACRO, TEST_CLASS)                     \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, Empty)                            \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, NormalizePath)                    \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CreateDir)                        \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDir)                        \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteDirContents)                \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteRootDirContents)            \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFile)                       \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, DeleteFiles)                      \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveFile)                         \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, MoveDir)                          \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFile)                         \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, CopyFiles)                        \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfo)                      \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoVector)                \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelector)              \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoSelectorWithRecursion) \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoAsync)                 \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, GetFileInfoGenerator)             \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenOutputStream)                 \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenAppendStream)                 \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStream)                  \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamWithFileInfo)      \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputStreamAsync)             \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFile)                    \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileWithFileInfo)        \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, OpenInputFileAsync)               \
+  GENERIC_FS_TEST_FUNCTION(TEST_MACRO, TEST_CLASS, SpecialChars)
+
+#define GENERIC_FS_TEST_FUNCTIONS(TEST_CLASS) \
+  GENERIC_FS_TEST_FUNCTIONS_MACROS(TEST_F, TEST_CLASS)
+
+#define GENERIC_FS_TYPED_TEST_FUNCTIONS(TEST_CLASS) \
+  GENERIC_FS_TEST_FUNCTIONS_MACROS(TYPED_TEST, TEST_CLASS)
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/filesystem/type_fwd.h b/pyarrow/include/arrow/filesystem/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..92c70799be16c73804353a1f3bcae8b5a3674057
--- /dev/null
+++ b/pyarrow/include/arrow/filesystem/type_fwd.h
@@ -0,0 +1,53 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+namespace arrow {
+namespace fs {
+
+/// \brief FileSystem entry type
+enum class FileType : int8_t {
+  /// Entry is not found
+  NotFound,
+  /// Entry exists but its type is unknown
+  ///
+  /// This can designate a special file such as a Unix socket or character
+  /// device, or Windows NUL / CON / ...
+  Unknown,
+  /// Entry is a regular file
+  File,
+  /// Entry is a directory
+  Directory
+};
+
+struct FileInfo;
+
+struct FileSelector;
+
+class FileSystem;
+class AzureFileSystem;
+class GcsFileSystem;
+class LocalFileSystem;
+class S3FileSystem;
+class SlowFileSystem;
+class SubTreeFileSystem;
+
+}  // namespace fs
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/api.h b/pyarrow/include/arrow/flight/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed31b5c8fa41f39d915d8ecbeb40b37b51ac26d3
--- /dev/null
+++ b/pyarrow/include/arrow/flight/api.h
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/flight/client.h"
+#include "arrow/flight/client_auth.h"
+#include "arrow/flight/client_middleware.h"
+#include "arrow/flight/client_tracing_middleware.h"
+#include "arrow/flight/middleware.h"
+#include "arrow/flight/server.h"
+#include "arrow/flight/server_auth.h"
+#include "arrow/flight/server_middleware.h"
+#include "arrow/flight/server_tracing_middleware.h"
+#include "arrow/flight/types.h"
+#include "arrow/flight/types_async.h"
diff --git a/pyarrow/include/arrow/flight/client.h b/pyarrow/include/arrow/flight/client.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ad9f26275b0bb407e88e7dad6970db201a05579
--- /dev/null
+++ b/pyarrow/include/arrow/flight/client.h
@@ -0,0 +1,434 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// \brief Implementation of Flight RPC client.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "arrow/ipc/options.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cancel.h"
+
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/types.h"  // IWYU pragma: keep
+#include "arrow/flight/visibility.h"
+
+namespace arrow {
+
+class RecordBatch;
+class Schema;
+
+namespace flight {
+
+/// \brief A duration type for Flight call timeouts.
+typedef std::chrono::duration<double, std::chrono::seconds::period> TimeoutDuration;
+
+/// \brief Hints to the underlying RPC layer for Arrow Flight calls.
+class ARROW_FLIGHT_EXPORT FlightCallOptions {
+ public:
+  /// Create a default set of call options.
+  FlightCallOptions();
+
+  /// \brief An optional timeout for this call. Negative durations
+  /// mean an implementation-defined default behavior will be used
+  /// instead. This is the default value.
+  TimeoutDuration timeout;
+
+  /// \brief IPC reader options, if applicable for the call.
+  ipc::IpcReadOptions read_options;
+
+  /// \brief IPC writer options, if applicable for the call.
+  ipc::IpcWriteOptions write_options;
+
+  /// \brief Headers for client to add to context.
+  std::vector<std::pair<std::string, std::string>> headers;
+
+  /// \brief A token to enable interactive user cancellation of long-running requests.
+  StopToken stop_token;
+
+  /// \brief An optional memory manager to control where to allocate incoming data.
+  std::shared_ptr<MemoryManager> memory_manager;
+};
+
+/// \brief Indicate that the client attempted to write a message
+///     larger than the soft limit set via write_size_limit_bytes.
+class ARROW_FLIGHT_EXPORT FlightWriteSizeStatusDetail : public arrow::StatusDetail {
+ public:
+  explicit FlightWriteSizeStatusDetail(int64_t limit, int64_t actual)
+      : limit_(limit), actual_(actual) {}
+  const char* type_id() const override;
+  std::string ToString() const override;
+  int64_t limit() const { return limit_; }
+  int64_t actual() const { return actual_; }
+
+  /// \brief Extract this status detail from a status, or return
+  ///     nullptr if the status doesn't contain this status detail.
+  static std::shared_ptr<FlightWriteSizeStatusDetail> UnwrapStatus(
+      const arrow::Status& status);
+
+ private:
+  int64_t limit_;
+  int64_t actual_;
+};
+
+struct ARROW_FLIGHT_EXPORT FlightClientOptions {
+  /// \brief Root certificates to use for validating server
+  /// certificates.
+  std::string tls_root_certs;
+  /// \brief Override the hostname checked by TLS. Use with caution.
+  std::string override_hostname;
+  /// \brief The client certificate to use if using Mutual TLS
+  std::string cert_chain;
+  /// \brief The private key associated with the client certificate for Mutual TLS
+  std::string private_key;
+  /// \brief A list of client middleware to apply.
+  std::vector<std::shared_ptr<ClientMiddlewareFactory>> middleware;
+  /// \brief A soft limit on the number of bytes to write in a single
+  ///     batch when sending Arrow data to a server.
+  ///
+  /// Used to help limit server memory consumption. Only enabled if
+  /// positive. When enabled, FlightStreamWriter.Write* may yield a
+  /// IOError with error detail FlightWriteSizeStatusDetail.
+  int64_t write_size_limit_bytes = 0;
+
+  /// \brief Generic connection options, passed to the underlying
+  ///     transport; interpretation is implementation-dependent.
+  std::vector<std::pair<std::string, std::variant<int, std::string>>> generic_options;
+
+  /// \brief Use TLS without validating the server certificate. Use with caution.
+  bool disable_server_verification = false;
+
+  /// \brief Get default options.
+  static FlightClientOptions Defaults();
+};
+
+/// \brief A RecordBatchReader exposing Flight metadata and cancel
+/// operations.
+class ARROW_FLIGHT_EXPORT FlightStreamReader : public MetadataRecordBatchReader {
+ public:
+  /// \brief Try to cancel the call.
+  virtual void Cancel() = 0;
+
+  using MetadataRecordBatchReader::ToRecordBatches;
+  /// \brief Consume entire stream as a vector of record batches
+  virtual arrow::Result<std::vector<std::shared_ptr<RecordBatch>>> ToRecordBatches(
+      const StopToken& stop_token) = 0;
+
+  using MetadataRecordBatchReader::ToTable;
+  /// \brief Consume entire stream as a Table
+  arrow::Result<std::shared_ptr<Table>> ToTable(const StopToken& stop_token);
+
+  using MetadataRecordBatchReader::stats;
+  /// \brief Return current read statistics
+  virtual arrow::ipc::ReadStats stats() const = 0;
+};
+
+// Silence warning
+// "non dll-interface class RecordBatchReader used as base for dll-interface class"
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable : 4275)
+#endif
+
+/// \brief A RecordBatchWriter that also allows sending
+/// application-defined metadata via the Flight protocol.
+class ARROW_FLIGHT_EXPORT FlightStreamWriter : public MetadataRecordBatchWriter {
+ public:
+  /// \brief Indicate that the application is done writing to this stream.
+  ///
+  /// The application may not write to this stream after calling
+  /// this. This differs from closing the stream because this writer
+  /// may represent only one half of a readable and writable stream.
+  virtual Status DoneWriting() = 0;
+};
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+/// \brief A reader for application-specific metadata sent back to the
+/// client during an upload.
+class ARROW_FLIGHT_EXPORT FlightMetadataReader {
+ public:
+  virtual ~FlightMetadataReader();
+  /// \brief Read a message from the server.
+  virtual Status ReadMetadata(std::shared_ptr<Buffer>* out) = 0;
+};
+
+/// \brief Client class for Arrow Flight RPC services.
+class ARROW_FLIGHT_EXPORT FlightClient {
+ public:
+  ~FlightClient();
+
+  /// \brief Connect to an unauthenticated flight service
+  /// \param[in] location the URI
+  /// \return Arrow result with the created FlightClient, OK status may not indicate that
+  /// the connection was successful
+  static arrow::Result<std::unique_ptr<FlightClient>> Connect(const Location& location);
+
+  /// \brief Connect to an unauthenticated flight service
+  /// \param[in] location the URI
+  /// \param[in] options Other options for setting up the client
+  /// \return Arrow result with the created FlightClient, OK status may not indicate that
+  /// the connection was successful
+  static arrow::Result<std::unique_ptr<FlightClient>> Connect(
+      const Location& location, const FlightClientOptions& options);
+
+  /// \brief Authenticate to the server using the given handler.
+  /// \param[in] options Per-RPC options
+  /// \param[in] auth_handler The authentication mechanism to use
+  /// \return Status OK if the client authenticated successfully
+  Status Authenticate(const FlightCallOptions& options,
+                      std::unique_ptr<ClientAuthHandler> auth_handler);
+
+  /// \brief Authenticate to the server using basic HTTP style authentication.
+  /// \param[in] options Per-RPC options
+  /// \param[in] username Username to use
+  /// \param[in] password Password to use
+  /// \return Arrow result with bearer token and status OK if client authenticated
+  /// successfully
+  arrow::Result<std::pair<std::string, std::string>> AuthenticateBasicToken(
+      const FlightCallOptions& options, const std::string& username,
+      const std::string& password);
+
+  /// \brief Perform the indicated action, returning an iterator to the stream
+  /// of results, if any
+  /// \param[in] options Per-RPC options
+  /// \param[in] action the action to be performed
+  /// \return Arrow result with an iterator object for reading the returned results
+  arrow::Result<std::unique_ptr<ResultStream>> DoAction(const FlightCallOptions& options,
+                                                        const Action& action);
+  arrow::Result<std::unique_ptr<ResultStream>> DoAction(const Action& action) {
+    return DoAction({}, action);
+  }
+
+  /// \brief Perform the CancelFlightInfo action, returning a
+  /// CancelFlightInfoResult
+  ///
+  /// \param[in] options Per-RPC options
+  /// \param[in] request The CancelFlightInfoRequest
+  /// \return Arrow result with a CancelFlightInfoResult
+  arrow::Result<CancelFlightInfoResult> CancelFlightInfo(
+      const FlightCallOptions& options, const CancelFlightInfoRequest& request);
+  arrow::Result<CancelFlightInfoResult> CancelFlightInfo(
+      const CancelFlightInfoRequest& request) {
+    return CancelFlightInfo({}, request);
+  }
+
+  /// \brief Perform the RenewFlightEndpoint action, returning a renewed
+  /// FlightEndpoint
+  ///
+  /// \param[in] options Per-RPC options
+  /// \param[in] request The RenewFlightEndpointRequest
+  /// \return Arrow result with a renewed FlightEndpoint
+  arrow::Result<FlightEndpoint> RenewFlightEndpoint(
+      const FlightCallOptions& options, const RenewFlightEndpointRequest& request);
+  arrow::Result<FlightEndpoint> RenewFlightEndpoint(
+      const RenewFlightEndpointRequest& request) {
+    return RenewFlightEndpoint({}, request);
+  }
+
+  /// \brief Retrieve a list of available Action types
+  /// \param[in] options Per-RPC options
+  /// \return Arrow result with the available actions
+  arrow::Result<std::vector<ActionType>> ListActions(const FlightCallOptions& options);
+  arrow::Result<std::vector<ActionType>> ListActions() {
+    return ListActions(FlightCallOptions());
+  }
+
+  /// \brief Request access plan for a single flight, which may be an existing
+  /// dataset or a command to be executed
+  /// \param[in] options Per-RPC options
+  /// \param[in] descriptor the dataset request, whether a named dataset or
+  /// command
+  /// \return Arrow result with the FlightInfo describing where to access the dataset
+  arrow::Result<std::unique_ptr<FlightInfo>> GetFlightInfo(
+      const FlightCallOptions& options, const FlightDescriptor& descriptor);
+  arrow::Result<std::unique_ptr<FlightInfo>> GetFlightInfo(
+      const FlightDescriptor& descriptor) {
+    return GetFlightInfo({}, descriptor);
+  }
+
+  /// \brief Asynchronous GetFlightInfo.
+  /// \param[in] options Per-RPC options
+  /// \param[in] descriptor the dataset request
+  /// \param[in] listener Callbacks for response and RPC completion
+  void GetFlightInfoAsync(const FlightCallOptions& options,
+                          const FlightDescriptor& descriptor,
+                          std::shared_ptr<AsyncListener<FlightInfo>> listener);
+  void GetFlightInfoAsync(const FlightDescriptor& descriptor,
+                          std::shared_ptr<AsyncListener<FlightInfo>> listener) {
+    return GetFlightInfoAsync({}, descriptor, std::move(listener));
+  }
+
+  /// \brief Asynchronous GetFlightInfo returning a Future.
+  /// \param[in] options Per-RPC options
+  /// \param[in] descriptor the dataset request
+  arrow::Future<FlightInfo> GetFlightInfoAsync(const FlightCallOptions& options,
+                                               const FlightDescriptor& descriptor);
+  arrow::Future<FlightInfo> GetFlightInfoAsync(const FlightDescriptor& descriptor) {
+    return GetFlightInfoAsync({}, descriptor);
+  }
+
+  /// \brief Request and poll a long running query
+  /// \param[in] options Per-RPC options
+  /// \param[in] descriptor the dataset request or a descriptor returned by a
+  /// prior PollFlightInfo call
+  /// \return Arrow result with the PollInfo describing the status of
+  /// the requested query
+  arrow::Result<std::unique_ptr<PollInfo>> PollFlightInfo(
+      const FlightCallOptions& options, const FlightDescriptor& descriptor);
+  arrow::Result<std::unique_ptr<PollInfo>> PollFlightInfo(
+      const FlightDescriptor& descriptor) {
+    return PollFlightInfo({}, descriptor);
+  }
+
+  /// \brief Request schema for a single flight, which may be an existing
+  /// dataset or a command to be executed
+  /// \param[in] options Per-RPC options
+  /// \param[in] descriptor the dataset request, whether a named dataset or
+  /// command
+  /// \return Arrow result with the SchemaResult describing the dataset schema
+  arrow::Result<std::unique_ptr<SchemaResult>> GetSchema(
+      const FlightCallOptions& options, const FlightDescriptor& descriptor);
+
+  arrow::Result<std::unique_ptr<SchemaResult>> GetSchema(
+      const FlightDescriptor& descriptor) {
+    return GetSchema({}, descriptor);
+  }
+
+  /// \brief List all available flights known to the server
+  /// \return Arrow result with an iterator that returns a FlightInfo for each flight
+  arrow::Result<std::unique_ptr<FlightListing>> ListFlights();
+
+  /// \brief List available flights given indicated filter criteria
+  /// \param[in] options Per-RPC options
+  /// \param[in] criteria the filter criteria (opaque)
+  /// \return Arrow result with an iterator that returns a FlightInfo for each flight
+  arrow::Result<std::unique_ptr<FlightListing>> ListFlights(
+      const FlightCallOptions& options, const Criteria& criteria);
+
+  /// \brief Given a flight ticket and schema, request to be sent the
+  /// stream. Returns record batch stream reader
+  /// \param[in] options Per-RPC options
+  /// \param[in] ticket The flight ticket to use
+  /// \return Arrow result with the returned RecordBatchReader
+  arrow::Result<std::unique_ptr<FlightStreamReader>> DoGet(
+      const FlightCallOptions& options, const Ticket& ticket);
+  arrow::Result<std::unique_ptr<FlightStreamReader>> DoGet(const Ticket& ticket) {
+    return DoGet({}, ticket);
+  }
+
+  /// \brief DoPut return value
+  struct DoPutResult {
+    /// \brief a writer to write record batches to
+    std::unique_ptr<FlightStreamWriter> writer;
+    /// \brief a reader for application metadata from the server
+    std::unique_ptr<FlightMetadataReader> reader;
+  };
+  /// \brief Upload data to a Flight described by the given
+  /// descriptor. The caller must call Close() on the returned stream
+  /// once they are done writing.
+  ///
+  /// The reader and writer are linked; closing the writer will also
+  /// close the reader. Use \a DoneWriting to only close the write
+  /// side of the channel.
+  ///
+  /// \param[in] options Per-RPC options
+  /// \param[in] descriptor the descriptor of the stream
+  /// \param[in] schema the schema for the data to upload
+  /// \return Arrow result with a DoPutResult struct holding a reader and a writer
+  arrow::Result<DoPutResult> DoPut(const FlightCallOptions& options,
+                                   const FlightDescriptor& descriptor,
+                                   const std::shared_ptr<Schema>& schema);
+
+  arrow::Result<DoPutResult> DoPut(const FlightDescriptor& descriptor,
+                                   const std::shared_ptr<Schema>& schema) {
+    return DoPut({}, descriptor, schema);
+  }
+
+  struct DoExchangeResult {
+    std::unique_ptr<FlightStreamWriter> writer;
+    std::unique_ptr<FlightStreamReader> reader;
+  };
+  arrow::Result<DoExchangeResult> DoExchange(const FlightCallOptions& options,
+                                             const FlightDescriptor& descriptor);
+  arrow::Result<DoExchangeResult> DoExchange(const FlightDescriptor& descriptor) {
+    return DoExchange({}, descriptor);
+  }
+
+  /// \brief Set server session option(s) by name/value. Sessions are generally
+  /// persisted via HTTP cookies.
+  /// \param[in] options Per-RPC options
+  /// \param[in] request The server session options to set
+  ::arrow::Result<SetSessionOptionsResult> SetSessionOptions(
+      const FlightCallOptions& options, const SetSessionOptionsRequest& request);
+
+  /// \brief Get the current server session options. The session is generally
+  /// accessed via an HTTP cookie.
+  /// \param[in] options Per-RPC options
+  /// \param[in] request The (empty) GetSessionOptions request object.
+  ::arrow::Result<GetSessionOptionsResult> GetSessionOptions(
+      const FlightCallOptions& options, const GetSessionOptionsRequest& request);
+
+  /// \brief Close/invalidate the current server session. The session is generally
+  /// accessed via an HTTP cookie.
+  /// \param[in] options Per-RPC options
+  /// \param[in] request The (empty) CloseSession request object.
+  ::arrow::Result<CloseSessionResult> CloseSession(const FlightCallOptions& options,
+                                                   const CloseSessionRequest& request);
+
+  /// \brief Explicitly shut down and clean up the client.
+  ///
+  /// For backwards compatibility, this will be implicitly called by
+  /// the destructor if not already called, but this gives the
+  /// application no chance to handle errors, so it is recommended to
+  /// explicitly close the client.
+  ///
+  /// \since 8.0.0
+  Status Close();
+
+  /// \brief Whether this client supports asynchronous methods.
+  bool supports_async() const;
+
+  /// \brief Check whether this client supports asynchronous methods.
+  ///
+  /// This is like supports_async(), except that a detailed error message
+  /// is returned if async support is not available.  If async support is
+  /// available, this function returns successfully.
+  Status CheckAsyncSupport() const;
+
+ private:
+  FlightClient();
+  Status CheckOpen() const;
+  std::unique_ptr<internal::ClientTransport> transport_;
+  bool closed_;
+  int64_t write_size_limit_bytes_;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/client_auth.h b/pyarrow/include/arrow/flight/client_auth.h
new file mode 100644
index 0000000000000000000000000000000000000000..9dad36aa0948906ebb2447c0030cf117c8549c2c
--- /dev/null
+++ b/pyarrow/include/arrow/flight/client_auth.h
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/flight/visibility.h"
+#include "arrow/status.h"
+
+namespace arrow {
+
+namespace flight {
+
+/// \brief A reader for messages from the server during an
+/// authentication handshake.
+class ARROW_FLIGHT_EXPORT ClientAuthReader {
+ public:
+  virtual ~ClientAuthReader() = default;
+  virtual Status Read(std::string* response) = 0;
+};
+
+/// \brief A writer for messages to the server during an
+/// authentication handshake.
+class ARROW_FLIGHT_EXPORT ClientAuthSender {
+ public:
+  virtual ~ClientAuthSender() = default;
+  virtual Status Write(const std::string& token) = 0;
+};
+
+/// \brief An authentication implementation for a Flight service.
+/// Authentication includes both an initial negotiation and a per-call
+/// token validation. Implementations may choose to use either or both
+/// mechanisms.
+class ARROW_FLIGHT_EXPORT ClientAuthHandler {
+ public:
+  virtual ~ClientAuthHandler() = default;
+  /// \brief Authenticate the client on initial connection. The client
+  /// can send messages to/read responses from the server at any time.
+  /// \return Status OK if authenticated successfully
+  virtual Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) = 0;
+  /// \brief Get a per-call token.
+  /// \param[out] token The token to send to the server.
+  virtual Status GetToken(std::string* token) = 0;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/client_cookie_middleware.h b/pyarrow/include/arrow/flight/client_cookie_middleware.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a56a632dfbd220ee1aaf749f1c7fb2b9ab0852e
--- /dev/null
+++ b/pyarrow/include/arrow/flight/client_cookie_middleware.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Middleware implementation for sending and receiving HTTP cookies.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/flight/client_middleware.h"
+
+namespace arrow {
+namespace flight {
+
+/// \brief Returns a ClientMiddlewareFactory that handles sending and receiving cookies.
+ARROW_FLIGHT_EXPORT std::shared_ptr<ClientMiddlewareFactory> GetCookieFactory();
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/client_middleware.h b/pyarrow/include/arrow/flight/client_middleware.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e3126553a953b9d8f2fcdb94b72f9214b690de1
--- /dev/null
+++ b/pyarrow/include/arrow/flight/client_middleware.h
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Interfaces for defining middleware for Flight clients. Currently
+// experimental.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/flight/middleware.h"
+#include "arrow/flight/visibility.h"  // IWYU pragma: keep
+#include "arrow/status.h"
+
+namespace arrow {
+namespace flight {
+
+/// \brief Client-side middleware for a call, instantiated per RPC.
+///
+/// Middleware should be fast and must be infallible: there is no way
+/// to reject the call or report errors from the middleware instance.
+class ARROW_FLIGHT_EXPORT ClientMiddleware {
+ public:
+  virtual ~ClientMiddleware() = default;
+
+  /// \brief A callback before headers are sent. Extra headers can be
+  /// added, but existing ones cannot be read.
+  virtual void SendingHeaders(AddCallHeaders* outgoing_headers) = 0;
+
+  /// \brief A callback when headers are received from the server.
+  ///
+  /// This may be called more than once, since servers send both
+  /// headers and trailers.  Some implementations (e.g. gRPC-Java, and
+  /// hence Arrow Flight in Java) may consolidate headers into
+  /// trailers if the RPC errored.
+  virtual void ReceivedHeaders(const CallHeaders& incoming_headers) = 0;
+
+  /// \brief A callback after the call has completed.
+  virtual void CallCompleted(const Status& status) = 0;
+};
+
+/// \brief A factory for new middleware instances.
+///
+/// If added to a client, this will be called for each RPC (including
+/// Handshake) to give the opportunity to intercept the call.
+///
+/// It is guaranteed that all client middleware methods are called
+/// from the same thread that calls the RPC method implementation.
+class ARROW_FLIGHT_EXPORT ClientMiddlewareFactory {
+ public:
+  virtual ~ClientMiddlewareFactory() = default;
+
+  /// \brief A callback for the start of a new call.
+  ///
+  /// \param info Information about the call.
+  /// \param[out] middleware The middleware instance for this call. If
+  ///     unset, will not add middleware to this call instance from
+  ///     this factory.
+  virtual void StartCall(const CallInfo& info,
+                         std::unique_ptr<ClientMiddleware>* middleware) = 0;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/client_tracing_middleware.h b/pyarrow/include/arrow/flight/client_tracing_middleware.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a8b665ed6c0f0021abedea1917a4b4501157179
--- /dev/null
+++ b/pyarrow/include/arrow/flight/client_tracing_middleware.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Middleware implementation for propagating OpenTelemetry spans.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/flight/client_middleware.h"
+
+namespace arrow {
+namespace flight {
+
+/// \brief Returns a ClientMiddlewareFactory that handles sending OpenTelemetry spans.
+ARROW_FLIGHT_EXPORT std::shared_ptr<ClientMiddlewareFactory>
+MakeTracingClientMiddlewareFactory();
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/middleware.h b/pyarrow/include/arrow/flight/middleware.h
new file mode 100644
index 0000000000000000000000000000000000000000..d717e396a8b68c749e53eeb241599ae28986d6da
--- /dev/null
+++ b/pyarrow/include/arrow/flight/middleware.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Interfaces for defining middleware for Flight clients and
+// servers.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "arrow/flight/types.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace flight {
+
+/// \brief A write-only wrapper around headers for an RPC call.
+class ARROW_FLIGHT_EXPORT AddCallHeaders {
+ public:
+  virtual ~AddCallHeaders() = default;
+
+  /// \brief Add a header to be sent to the client.
+  ///
+  /// \param[in] key The header name. Must be lowercase ASCII; some
+  ///   transports may reject invalid header names.
+  /// \param[in] value The header value. Some transports may only
+  ///   accept binary header values if the header name ends in "-bin".
+  virtual void AddHeader(const std::string& key, const std::string& value) = 0;
+};
+
+/// \brief An enumeration of the RPC methods Flight implements.
+enum class FlightMethod : char {
+  Invalid = 0,
+  Handshake = 1,
+  ListFlights = 2,
+  GetFlightInfo = 3,
+  GetSchema = 4,
+  DoGet = 5,
+  DoPut = 6,
+  DoAction = 7,
+  ListActions = 8,
+  DoExchange = 9,
+  PollFlightInfo = 10,
+};
+
+/// \brief Get a human-readable name for a Flight method.
+ARROW_FLIGHT_EXPORT
+std::string ToString(FlightMethod method);
+
+/// \brief Information about an instance of a Flight RPC.
+struct ARROW_FLIGHT_EXPORT CallInfo {
+ public:
+  /// \brief The RPC method of this call.
+  FlightMethod method;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/otel_logging.h b/pyarrow/include/arrow/flight/otel_logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1e8cbb6fcc645e8d4df58be9b6e9415c67a4f59
--- /dev/null
+++ b/pyarrow/include/arrow/flight/otel_logging.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/config.h"
+
+#ifdef ARROW_WITH_OPENTELEMETRY
+#  include "arrow/status.h"
+#  include "arrow/telemetry/logging.h"
+#  include "arrow/util/macros.h"
+
+namespace arrow::flight {
+
+ARROW_EXPORT Status
+RegisterFlightOtelLoggers(const telemetry::OtelLoggingOptions& options);
+
+}  // namespace arrow::flight
+#endif
diff --git a/pyarrow/include/arrow/flight/platform.h b/pyarrow/include/arrow/flight/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..498c87c5b7dc9ae94cb1fc1fa59e79338350493e
--- /dev/null
+++ b/pyarrow/include/arrow/flight/platform.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal header. Platform-specific definitions for Flight.
+
+#pragma once
+
+#ifdef _MSC_VER
+
+// The protobuf documentation says that C4251 warnings when using the
+// library are spurious and suppressed when the build the library and
+// compiler, but must be also suppressed in downstream projects
+#  pragma warning(disable : 4251)
+
+#endif  // _MSC_VER
+
+#include "arrow/util/config.h"  // IWYU pragma: keep
diff --git a/pyarrow/include/arrow/flight/server.h b/pyarrow/include/arrow/flight/server.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d73353ab16c10dcc6742632f082a2b4aca907b8
--- /dev/null
+++ b/pyarrow/include/arrow/flight/server.h
@@ -0,0 +1,327 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Interfaces to use for defining Flight RPC servers.
+
+#pragma once
+
+#include <chrono>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/flight/server_auth.h"
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/types.h"       // IWYU pragma: keep
+#include "arrow/flight/visibility.h"  // IWYU pragma: keep
+#include "arrow/ipc/dictionary.h"
+#include "arrow/ipc/options.h"
+#include "arrow/record_batch.h"
+
+namespace arrow {
+
+class Schema;
+class Status;
+
+namespace flight {
+
+/// \brief Interface that produces a sequence of IPC payloads to be sent in
+/// FlightData protobuf messages
+class ARROW_FLIGHT_EXPORT FlightDataStream {
+ public:
+  virtual ~FlightDataStream();
+
+  virtual std::shared_ptr<Schema> schema() = 0;
+
+  /// \brief Compute FlightPayload containing serialized RecordBatch schema
+  virtual arrow::Result<FlightPayload> GetSchemaPayload() = 0;
+
+  // When the stream is completed, the last payload written will have null
+  // metadata
+  virtual arrow::Result<FlightPayload> Next() = 0;
+
+  virtual Status Close();
+};
+
+/// \brief A basic implementation of FlightDataStream that will provide
+/// a sequence of FlightData messages to be written to a stream
+class ARROW_FLIGHT_EXPORT RecordBatchStream : public FlightDataStream {
+ public:
+  /// \param[in] reader produces a sequence of record batches
+  /// \param[in] options IPC options for writing
+  explicit RecordBatchStream(
+      const std::shared_ptr<RecordBatchReader>& reader,
+      const ipc::IpcWriteOptions& options = ipc::IpcWriteOptions::Defaults());
+  ~RecordBatchStream() override;
+
+  // inherit deprecated API
+  using FlightDataStream::GetSchemaPayload;
+  using FlightDataStream::Next;
+
+  std::shared_ptr<Schema> schema() override;
+  arrow::Result<FlightPayload> GetSchemaPayload() override;
+
+  arrow::Result<FlightPayload> Next() override;
+  Status Close() override;
+
+ private:
+  class RecordBatchStreamImpl;
+  std::unique_ptr<RecordBatchStreamImpl> impl_;
+};
+
+/// \brief A reader for IPC payloads uploaded by a client. Also allows
+/// reading application-defined metadata via the Flight protocol.
+class ARROW_FLIGHT_EXPORT FlightMessageReader : public MetadataRecordBatchReader {
+ public:
+  /// \brief Get the descriptor for this upload.
+  virtual const FlightDescriptor& descriptor() const = 0;
+};
+
+/// \brief A writer for application-specific metadata sent back to the
+/// client during an upload.
+class ARROW_FLIGHT_EXPORT FlightMetadataWriter {
+ public:
+  virtual ~FlightMetadataWriter();
+  /// \brief Send a message to the client.
+  virtual Status WriteMetadata(const Buffer& app_metadata) = 0;
+};
+
+/// \brief A writer for IPC payloads to a client. Also allows sending
+/// application-defined metadata via the Flight protocol.
+///
+/// This class offers more control compared to FlightDataStream,
+/// including the option to write metadata without data and the
+/// ability to interleave reading and writing.
+class ARROW_FLIGHT_EXPORT FlightMessageWriter : public MetadataRecordBatchWriter {
+ public:
+  virtual ~FlightMessageWriter() = default;
+};
+
+/// \brief Call state/contextual data.
+class ARROW_FLIGHT_EXPORT ServerCallContext {
+ public:
+  virtual ~ServerCallContext() = default;
+  /// \brief The name of the authenticated peer (may be the empty string)
+  virtual const std::string& peer_identity() const = 0;
+  /// \brief The peer address (not validated)
+  virtual const std::string& peer() const = 0;
+  /// \brief Add a response header.  This is only valid before the server
+  /// starts sending the response; generally this isn't an issue unless you
+  /// are implementing FlightDataStream, ResultStream, or similar interfaces
+  /// yourself, or during a DoExchange or DoPut.
+  virtual void AddHeader(const std::string& key, const std::string& value) const = 0;
+  /// \brief Add a response trailer.  This is only valid before the server
+  /// sends the final status; generally this isn't an issue unless your RPC
+  /// handler launches a thread or similar.
+  virtual void AddTrailer(const std::string& key, const std::string& value) const = 0;
+  /// \brief Look up a middleware by key. Do not maintain a reference
+  /// to the object beyond the request body.
+  /// \return The middleware, or nullptr if not found.
+  virtual ServerMiddleware* GetMiddleware(const std::string& key) const = 0;
+  /// \brief Check if the current RPC has been cancelled (by the client, by
+  /// a network error, etc.).
+  virtual bool is_cancelled() const = 0;
+  /// \brief The headers sent by the client for this call.
+  virtual const CallHeaders& incoming_headers() const = 0;
+};
+
+class ARROW_FLIGHT_EXPORT FlightServerOptions {
+ public:
+  explicit FlightServerOptions(const Location& location_);
+
+  ~FlightServerOptions();
+
+  /// \brief The host & port (or domain socket path) to listen on.
+  /// Use port 0 to bind to an available port.
+  Location location;
+  /// \brief The authentication handler to use.
+  std::shared_ptr<ServerAuthHandler> auth_handler;
+  /// \brief A list of TLS certificate+key pairs to use.
+  std::vector<CertKeyPair> tls_certificates;
+  /// \brief Enable mTLS and require that the client present a certificate.
+  bool verify_client;
+  /// \brief If using mTLS, the PEM-encoded root certificate to use.
+  std::string root_certificates;
+  /// \brief A list of server middleware to apply, along with a key to
+  /// identify them by.
+  ///
+  /// Middleware are always applied in the order provided. Duplicate
+  /// keys are an error.
+  std::vector<std::pair<std::string, std::shared_ptr<ServerMiddlewareFactory>>>
+      middleware;
+
+  /// \brief An optional memory manager to control where to allocate incoming data.
+  std::shared_ptr<MemoryManager> memory_manager;
+
+  /// \brief A Flight implementation-specific callback to customize
+  /// transport-specific options.
+  ///
+  /// Not guaranteed to be called. The type of the parameter is
+  /// specific to the Flight implementation. Users should take care to
+  /// link to the same transport implementation as Flight to avoid
+  /// runtime problems. See "Using Arrow C++ in your own project" in
+  /// the documentation for more details.
+  std::function<void(void*)> builder_hook;
+};
+
+/// \brief Skeleton RPC server implementation which can be used to create
+/// custom servers by implementing its abstract methods
+class ARROW_FLIGHT_EXPORT FlightServerBase {
+ public:
+  FlightServerBase();
+  virtual ~FlightServerBase();
+
+  // Lifecycle methods.
+
+  /// \brief Initialize a Flight server listening at the given location.
+  /// This method must be called before any other method.
+  /// \param[in] options The configuration for this server.
+  Status Init(const FlightServerOptions& options);
+
+  /// \brief Get the port that the Flight server is listening on.
+  /// This method must only be called after Init().  Will return a
+  /// non-positive value if no port exists (e.g. when listening on a
+  /// domain socket).
+  int port() const;
+
+  /// \brief Get the address that the Flight server is listening on.
+  /// This method must only be called after Init().
+  Location location() const;
+
+  /// \brief Set the server to stop when receiving any of the given signal
+  /// numbers.
+  /// This method must be called before Serve().
+  Status SetShutdownOnSignals(const std::vector<int> sigs);
+
+  /// \brief Start serving.
+  /// This method blocks until the server shuts down.
+  ///
+  /// The server will start to shut down when either Shutdown() is called
+  /// or one of the signals registered in SetShutdownOnSignals() is received.
+  Status Serve();
+
+  /// \brief Query whether Serve() was interrupted by a signal.
+  /// This method must be called after Serve() has returned.
+  ///
+  /// \return int the signal number that interrupted Serve(), if any, otherwise 0
+  int GotSignal() const;
+
+  /// \brief Shut down the server, blocking until current requests finish.
+  ///
+  /// Can be called from a signal handler or another thread while Serve()
+  /// blocks. Optionally a deadline can be set. Once the deadline expires
+  /// server will wait until remaining running calls complete.
+  ///
+  /// Should only be called once.
+  Status Shutdown(const std::chrono::system_clock::time_point* deadline = NULLPTR);
+
+  /// \brief Block until server shuts down with Shutdown.
+  ///
+  /// Does not respond to signals like Serve().
+  Status Wait();
+
+  // Implement these methods to create your own server. The default
+  // implementations will return a not-implemented result to the client
+
+  /// \brief Retrieve a list of available fields given an optional opaque
+  /// criteria
+  /// \param[in] context The call context.
+  /// \param[in] criteria may be null
+  /// \param[out] listings the returned listings iterator
+  /// \return Status
+  virtual Status ListFlights(const ServerCallContext& context, const Criteria* criteria,
+                             std::unique_ptr<FlightListing>* listings);
+
+  /// \brief Retrieve the schema and an access plan for the indicated
+  /// descriptor
+  /// \param[in] context The call context.
+  /// \param[in] request the dataset request, whether a named dataset or command
+  /// \param[out] info the returned flight info provider
+  /// \return Status
+  virtual Status GetFlightInfo(const ServerCallContext& context,
+                               const FlightDescriptor& request,
+                               std::unique_ptr<FlightInfo>* info);
+
+  /// \brief Retrieve the current status of the target query
+  /// \param[in] context The call context.
+  /// \param[in] request the dataset request or a descriptor returned by a
+  /// prior PollFlightInfo call
+  /// \param[out] info the returned retry info provider
+  /// \return Status
+  virtual Status PollFlightInfo(const ServerCallContext& context,
+                                const FlightDescriptor& request,
+                                std::unique_ptr<PollInfo>* info);
+
+  /// \brief Retrieve the schema for the indicated descriptor
+  /// \param[in] context The call context.
+  /// \param[in] request the dataset request, whether a named dataset or command
+  /// \param[out] schema the returned flight schema provider
+  /// \return Status
+  virtual Status GetSchema(const ServerCallContext& context,
+                           const FlightDescriptor& request,
+                           std::unique_ptr<SchemaResult>* schema);
+
+  /// \brief Get a stream of IPC payloads to put on the wire
+  /// \param[in] context The call context.
+  /// \param[in] request an opaque ticket
+  /// \param[out] stream the returned stream provider
+  /// \return Status
+  virtual Status DoGet(const ServerCallContext& context, const Ticket& request,
+                       std::unique_ptr<FlightDataStream>* stream);
+
+  /// \brief Process a stream of IPC payloads sent from a client
+  /// \param[in] context The call context.
+  /// \param[in] reader a sequence of uploaded record batches
+  /// \param[in] writer send metadata back to the client
+  /// \return Status
+  virtual Status DoPut(const ServerCallContext& context,
+                       std::unique_ptr<FlightMessageReader> reader,
+                       std::unique_ptr<FlightMetadataWriter> writer);
+
+  /// \brief Process a bidirectional stream of IPC payloads
+  /// \param[in] context The call context.
+  /// \param[in] reader a sequence of uploaded record batches
+  /// \param[in] writer send data back to the client
+  /// \return Status
+  virtual Status DoExchange(const ServerCallContext& context,
+                            std::unique_ptr<FlightMessageReader> reader,
+                            std::unique_ptr<FlightMessageWriter> writer);
+
+  /// \brief Execute an action, return stream of zero or more results
+  /// \param[in] context The call context.
+  /// \param[in] action the action to execute, with type and body
+  /// \param[out] result the result iterator
+  /// \return Status
+  virtual Status DoAction(const ServerCallContext& context, const Action& action,
+                          std::unique_ptr<ResultStream>* result);
+
+  /// \brief Retrieve the list of available actions
+  /// \param[in] context The call context.
+  /// \param[out] actions a vector of available action types
+  /// \return Status
+  virtual Status ListActions(const ServerCallContext& context,
+                             std::vector<ActionType>* actions);
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/server_auth.h b/pyarrow/include/arrow/flight/server_auth.h
new file mode 100644
index 0000000000000000000000000000000000000000..147bef68d08a0a25a7b6852d16921b357668ea57
--- /dev/null
+++ b/pyarrow/include/arrow/flight/server_auth.h
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// \brief Server-side APIs to implement authentication for Flight.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/status.h"
+
+namespace arrow {
+
+namespace flight {
+
+/// \brief A reader for messages from the client during an
+/// authentication handshake.
+class ARROW_FLIGHT_EXPORT ServerAuthReader {
+ public:
+  virtual ~ServerAuthReader() = default;
+  virtual Status Read(std::string* token) = 0;
+};
+
+/// \brief A writer for messages to the client during an
+/// authentication handshake.
+class ARROW_FLIGHT_EXPORT ServerAuthSender {
+ public:
+  virtual ~ServerAuthSender() = default;
+  virtual Status Write(const std::string& message) = 0;
+};
+
+/// \brief An authentication implementation for a Flight service.
+/// Authentication includes both an initial negotiation and a per-call
+/// token validation. Implementations may choose to use either or both
+/// mechanisms.
+/// An implementation may need to track some state, e.g. a mapping of
+/// client tokens to authenticated identities.
+class ARROW_FLIGHT_EXPORT ServerAuthHandler {
+ public:
+  virtual ~ServerAuthHandler();
+  /// \brief Authenticate the client on initial connection. The server
+  /// can send and read responses from the client at any time.
+  /// \param[in] context The call context.
+  /// \param[in] outgoing The writer for messages to the client.
+  /// \param[in] incoming The reader for messages from the client.
+  /// \return Status OK if this authentication is succeeded.
+  virtual Status Authenticate(const ServerCallContext& context,
+                              ServerAuthSender* outgoing, ServerAuthReader* incoming) = 0;
+  /// \brief Validate a per-call client token.
+  /// \param[in] context The call context.
+  /// \param[in] token The client token. May be the empty string if
+  /// the client does not provide a token.
+  /// \param[out] peer_identity The identity of the peer, if this
+  /// authentication method supports it.
+  /// \return Status OK if the token is valid, any other status if
+  /// validation failed
+  virtual Status IsValid(const ServerCallContext& context, const std::string& token,
+                         std::string* peer_identity) {
+    // TODO: We can make this pure virtual function when we remove
+    // the deprecated version.
+    ARROW_SUPPRESS_DEPRECATION_WARNING
+    return IsValid(token, peer_identity);
+    ARROW_UNSUPPRESS_DEPRECATION_WARNING
+  }
+  /// \brief Validate a per-call client token.
+  /// \param[in] token The client token. May be the empty string if
+  /// the client does not provide a token.
+  /// \param[out] peer_identity The identity of the peer, if this
+  /// authentication method supports it.
+  /// \return Status OK if the token is valid, any other status if
+  /// validation failed
+  /// \deprecated Deprecated in 13.0.0. Implement the IsValid()
+  /// with ServerCallContext version instead.
+  ARROW_DEPRECATED("Deprecated in 13.0.0. Use ServerCallContext overload instead.")
+  virtual Status IsValid(const std::string& token, std::string* peer_identity) {
+    return Status::NotImplemented(typeid(this).name(), "::IsValid() isn't implemented");
+  }
+};
+
+/// \brief An authentication mechanism that does nothing.
+class ARROW_FLIGHT_EXPORT NoOpAuthHandler : public ServerAuthHandler {
+ public:
+  ~NoOpAuthHandler() override;
+  Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing,
+                      ServerAuthReader* incoming) override;
+  Status IsValid(const ServerCallContext& context, const std::string& token,
+                 std::string* peer_identity) override;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/server_middleware.h b/pyarrow/include/arrow/flight/server_middleware.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a0f3c4750612eda91434bb87479f51cf85be7e6
--- /dev/null
+++ b/pyarrow/include/arrow/flight/server_middleware.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Interfaces for defining middleware for Flight servers.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/flight/middleware.h"
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/visibility.h"  // IWYU pragma: keep
+#include "arrow/status.h"
+
+namespace arrow {
+namespace flight {
+
+/// \brief Server-side middleware for a call, instantiated per RPC.
+///
+/// Middleware should be fast and must be infallible: there is no way
+/// to reject the call or report errors from the middleware instance.
+class ARROW_FLIGHT_EXPORT ServerMiddleware {
+ public:
+  virtual ~ServerMiddleware() = default;
+
+  /// \brief Unique name of middleware, used as alternative to RTTI
+  /// \return the string name of the middleware
+  virtual std::string name() const = 0;
+
+  /// \brief A callback before headers are sent. Extra headers can be
+  /// added, but existing ones cannot be read.
+  virtual void SendingHeaders(AddCallHeaders* outgoing_headers) = 0;
+
+  /// \brief A callback after the call has completed.
+  virtual void CallCompleted(const Status& status) = 0;
+};
+
+/// \brief A factory for new middleware instances.
+///
+/// If added to a server, this will be called for each RPC (including
+/// Handshake) to give the opportunity to intercept the call.
+///
+/// It is guaranteed that all server middleware methods are called
+/// from the same thread that calls the RPC method implementation.
+class ARROW_FLIGHT_EXPORT ServerMiddlewareFactory {
+ public:
+  virtual ~ServerMiddlewareFactory() = default;
+
+  /// \brief A callback for the start of a new call.
+  ///
+  /// Return a non-OK status to reject the call with the given status.
+  ///
+  /// \param[in] info Information about the call.
+  /// \param[in] context The call context.
+  /// \param[out] middleware The middleware instance for this call. If
+  ///     null, no middleware will be added to this call instance from
+  ///     this factory.
+  /// \return Status A non-OK status will reject the call with the
+  ///     given status. Middleware previously in the chain will have
+  ///     their CallCompleted callback called. Other middleware
+  ///     factories will not be called.
+  virtual Status StartCall(const CallInfo& info, const ServerCallContext& context,
+                           std::shared_ptr<ServerMiddleware>* middleware) = 0;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/server_tracing_middleware.h b/pyarrow/include/arrow/flight/server_tracing_middleware.h
new file mode 100644
index 0000000000000000000000000000000000000000..50c8294a63b0622a91a514eb59b3f6d47c5aa076
--- /dev/null
+++ b/pyarrow/include/arrow/flight/server_tracing_middleware.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Middleware implementation for propagating OpenTelemetry spans.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/flight/server_middleware.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace flight {
+
+/// \brief Returns a ServerMiddlewareFactory that handles receiving OpenTelemetry spans.
+ARROW_FLIGHT_EXPORT std::shared_ptr<ServerMiddlewareFactory>
+MakeTracingServerMiddlewareFactory();
+
+/// \brief A server middleware that provides access to the
+///   OpenTelemetry context, if present.
+///
+/// Used to make the OpenTelemetry span available in Python.
+class ARROW_FLIGHT_EXPORT TracingServerMiddleware : public ServerMiddleware {
+ public:
+  ~TracingServerMiddleware();
+
+  static constexpr const char kMiddlewareName[] =
+      "arrow::flight::TracingServerMiddleware";
+
+  std::string name() const override { return kMiddlewareName; }
+  void SendingHeaders(AddCallHeaders*) override;
+  void CallCompleted(const Status&) override;
+
+  struct TraceKey {
+    std::string key;
+    std::string value;
+  };
+  /// \brief Get the trace context.
+  std::vector<TraceKey> GetTraceContext() const;
+
+ private:
+  class Impl;
+  friend class TracingServerMiddlewareFactory;
+
+  explicit TracingServerMiddleware(std::unique_ptr<Impl> impl);
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/test_auth_handlers.h b/pyarrow/include/arrow/flight/test_auth_handlers.h
new file mode 100644
index 0000000000000000000000000000000000000000..74f48798f3b025ad05617a377522040c152d2a6f
--- /dev/null
+++ b/pyarrow/include/arrow/flight/test_auth_handlers.h
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/flight/client_auth.h"
+#include "arrow/flight/server.h"
+#include "arrow/flight/server_auth.h"
+#include "arrow/flight/types.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/status.h"
+
+// A pair of authentication handlers that check for a predefined password
+// and set the peer identity to a predefined username.
+
+namespace arrow::flight {
+
+class ARROW_FLIGHT_EXPORT TestServerAuthHandler : public ServerAuthHandler {
+ public:
+  explicit TestServerAuthHandler(const std::string& username,
+                                 const std::string& password);
+  ~TestServerAuthHandler() override;
+  Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing,
+                      ServerAuthReader* incoming) override;
+  Status IsValid(const ServerCallContext& context, const std::string& token,
+                 std::string* peer_identity) override;
+
+ private:
+  std::string username_;
+  std::string password_;
+};
+
+class ARROW_FLIGHT_EXPORT TestServerBasicAuthHandler : public ServerAuthHandler {
+ public:
+  explicit TestServerBasicAuthHandler(const std::string& username,
+                                      const std::string& password);
+  ~TestServerBasicAuthHandler() override;
+  Status Authenticate(const ServerCallContext& context, ServerAuthSender* outgoing,
+                      ServerAuthReader* incoming) override;
+  Status IsValid(const ServerCallContext& context, const std::string& token,
+                 std::string* peer_identity) override;
+
+ private:
+  BasicAuth basic_auth_;
+};
+
+class ARROW_FLIGHT_EXPORT TestClientAuthHandler : public ClientAuthHandler {
+ public:
+  explicit TestClientAuthHandler(const std::string& username,
+                                 const std::string& password);
+  ~TestClientAuthHandler() override;
+  Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override;
+  Status GetToken(std::string* token) override;
+
+ private:
+  std::string username_;
+  std::string password_;
+};
+
+class ARROW_FLIGHT_EXPORT TestClientBasicAuthHandler : public ClientAuthHandler {
+ public:
+  explicit TestClientBasicAuthHandler(const std::string& username,
+                                      const std::string& password);
+  ~TestClientBasicAuthHandler() override;
+  Status Authenticate(ClientAuthSender* outgoing, ClientAuthReader* incoming) override;
+  Status GetToken(std::string* token) override;
+
+ private:
+  BasicAuth basic_auth_;
+  std::string token_;
+};
+
+}  // namespace arrow::flight
diff --git a/pyarrow/include/arrow/flight/test_definitions.h b/pyarrow/include/arrow/flight/test_definitions.h
new file mode 100644
index 0000000000000000000000000000000000000000..1391ffc40bd8fa82904eefea9bad17689d6a41f5
--- /dev/null
+++ b/pyarrow/include/arrow/flight/test_definitions.h
@@ -0,0 +1,318 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Common test definitions for Flight. Individual transport
+// implementations can instantiate these tests.
+//
+// While Googletest's value-parameterized tests would be a more
+// natural way to do this, they cause runtime issues on MinGW/MSVC
+// (Googletest thinks the test suite has been defined twice).
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/flight/server.h"
+#include "arrow/flight/types.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace flight {
+
+class ARROW_FLIGHT_EXPORT FlightTest {
+ protected:
+  virtual std::string transport() const = 0;
+  virtual bool supports_async() const { return false; }
+  virtual void SetUpTest() {}
+  virtual void TearDownTest() {}
+};
+
+/// Common tests of startup/shutdown
+class ARROW_FLIGHT_EXPORT ConnectivityTest : public FlightTest {
+ public:
+  // Test methods
+  void TestGetPort();
+  void TestBuilderHook();
+  void TestShutdown();
+  void TestShutdownWithDeadline();
+  void TestBrokenConnection();
+};
+
+#define ARROW_FLIGHT_TEST_CONNECTIVITY(FIXTURE)                                  \
+  static_assert(std::is_base_of<ConnectivityTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from ConnectivityTest"); \
+  TEST_F(FIXTURE, GetPort) { TestGetPort(); }                                    \
+  TEST_F(FIXTURE, BuilderHook) { TestBuilderHook(); }                            \
+  TEST_F(FIXTURE, Shutdown) { TestShutdown(); }                                  \
+  TEST_F(FIXTURE, ShutdownWithDeadline) { TestShutdownWithDeadline(); }          \
+  TEST_F(FIXTURE, BrokenConnection) { TestBrokenConnection(); }
+
+/// Common tests of data plane methods
+class ARROW_FLIGHT_EXPORT DataTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+  Status ConnectClient();
+
+  // Test methods
+  void TestDoGetInts();
+  void TestDoGetFloats();
+  void TestDoGetDicts();
+  void TestDoGetLargeBatch();
+  void TestFlightDataStreamError();
+  void TestOverflowServerBatch();
+  void TestOverflowClientBatch();
+  void TestDoExchange();
+  void TestDoExchangeNoData();
+  void TestDoExchangeWriteOnlySchema();
+  void TestDoExchangeGet();
+  void TestDoExchangePut();
+  void TestDoExchangeEcho();
+  void TestDoExchangeTotal();
+  void TestDoExchangeError();
+  void TestDoExchangeConcurrency();
+  void TestDoExchangeUndrained();
+  void TestIssue5095();
+
+ private:
+  void CheckDoGet(
+      const FlightDescriptor& descr, const RecordBatchVector& expected_batches,
+      std::function<void(const std::vector<FlightEndpoint>&)> check_endpoints);
+  void CheckDoGet(const Ticket& ticket, const RecordBatchVector& expected_batches);
+
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+};
+
+#define ARROW_FLIGHT_TEST_DATA(FIXTURE)                                               \
+  static_assert(std::is_base_of<DataTest, FIXTURE>::value,                            \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from DataTest");              \
+  TEST_F(FIXTURE, TestDoGetInts) { TestDoGetInts(); }                                 \
+  TEST_F(FIXTURE, TestDoGetFloats) { TestDoGetFloats(); }                             \
+  TEST_F(FIXTURE, TestDoGetDicts) { TestDoGetDicts(); }                               \
+  TEST_F(FIXTURE, TestDoGetLargeBatch) { TestDoGetLargeBatch(); }                     \
+  TEST_F(FIXTURE, TestFlightDataStreamError) { TestFlightDataStreamError(); }         \
+  TEST_F(FIXTURE, TestOverflowServerBatch) { TestOverflowServerBatch(); }             \
+  TEST_F(FIXTURE, TestOverflowClientBatch) { TestOverflowClientBatch(); }             \
+  TEST_F(FIXTURE, TestDoExchange) { TestDoExchange(); }                               \
+  TEST_F(FIXTURE, TestDoExchangeNoData) { TestDoExchangeNoData(); }                   \
+  TEST_F(FIXTURE, TestDoExchangeWriteOnlySchema) { TestDoExchangeWriteOnlySchema(); } \
+  TEST_F(FIXTURE, TestDoExchangeGet) { TestDoExchangeGet(); }                         \
+  TEST_F(FIXTURE, TestDoExchangePut) { TestDoExchangePut(); }                         \
+  TEST_F(FIXTURE, TestDoExchangeEcho) { TestDoExchangeEcho(); }                       \
+  TEST_F(FIXTURE, TestDoExchangeTotal) { TestDoExchangeTotal(); }                     \
+  TEST_F(FIXTURE, TestDoExchangeError) { TestDoExchangeError(); }                     \
+  TEST_F(FIXTURE, TestDoExchangeConcurrency) { TestDoExchangeConcurrency(); }         \
+  TEST_F(FIXTURE, TestDoExchangeUndrained) { TestDoExchangeUndrained(); }             \
+  TEST_F(FIXTURE, TestIssue5095) { TestIssue5095(); }
+
+/// \brief Specific tests of DoPut.
+class ARROW_FLIGHT_EXPORT DoPutTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+  void CheckBatches(const FlightDescriptor& expected_descriptor,
+                    const RecordBatchVector& expected_batches);
+  void CheckDoPut(const FlightDescriptor& descr, const std::shared_ptr<Schema>& schema,
+                  const RecordBatchVector& batches);
+
+  // Test methods
+  void TestInts();
+  void TestFloats();
+  void TestEmptyBatch();
+  void TestDicts();
+  void TestLargeBatch();
+  void TestSizeLimit();
+  void TestUndrained();
+
+ private:
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+};
+
+#define ARROW_FLIGHT_TEST_DO_PUT(FIXTURE)                                 \
+  static_assert(std::is_base_of<DoPutTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from DoPutTest"); \
+  TEST_F(FIXTURE, TestInts) { TestInts(); }                               \
+  TEST_F(FIXTURE, TestFloats) { TestFloats(); }                           \
+  TEST_F(FIXTURE, TestEmptyBatch) { TestEmptyBatch(); }                   \
+  TEST_F(FIXTURE, TestDicts) { TestDicts(); }                             \
+  TEST_F(FIXTURE, TestLargeBatch) { TestLargeBatch(); }                   \
+  TEST_F(FIXTURE, TestSizeLimit) { TestSizeLimit(); }                     \
+  TEST_F(FIXTURE, TestUndrained) { TestUndrained(); }
+
+class ARROW_FLIGHT_EXPORT AppMetadataTestServer : public FlightServerBase {
+ public:
+  virtual ~AppMetadataTestServer() = default;
+
+  Status DoGet(const ServerCallContext& context, const Ticket& request,
+               std::unique_ptr<FlightDataStream>* data_stream) override;
+
+  Status DoPut(const ServerCallContext& context,
+               std::unique_ptr<FlightMessageReader> reader,
+               std::unique_ptr<FlightMetadataWriter> writer) override;
+};
+
+/// \brief Tests of app_metadata in data plane methods.
+class ARROW_FLIGHT_EXPORT AppMetadataTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+
+  // Test methods
+  void TestDoGet();
+  void TestDoGetDictionaries();
+  void TestDoPut();
+  void TestDoPutDictionaries();
+  void TestDoPutReadMetadata();
+
+ private:
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+};
+
+#define ARROW_FLIGHT_TEST_APP_METADATA(FIXTURE)                                 \
+  static_assert(std::is_base_of<AppMetadataTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from AppMetadataTest"); \
+  TEST_F(FIXTURE, TestDoGet) { TestDoGet(); }                                   \
+  TEST_F(FIXTURE, TestDoGetDictionaries) { TestDoGetDictionaries(); }           \
+  TEST_F(FIXTURE, TestDoPut) { TestDoPut(); }                                   \
+  TEST_F(FIXTURE, TestDoPutDictionaries) { TestDoPutDictionaries(); }           \
+  TEST_F(FIXTURE, TestDoPutReadMetadata) { TestDoPutReadMetadata(); }
+
+/// \brief Tests of IPC options in data plane methods.
+class ARROW_FLIGHT_EXPORT IpcOptionsTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+
+  // Test methods
+  void TestDoGetReadOptions();
+  void TestDoPutWriteOptions();
+  void TestDoExchangeClientWriteOptions();
+  void TestDoExchangeClientWriteOptionsBegin();
+  void TestDoExchangeServerWriteOptions();
+
+ private:
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+};
+
+#define ARROW_FLIGHT_TEST_IPC_OPTIONS(FIXTURE)                                 \
+  static_assert(std::is_base_of<IpcOptionsTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from IpcOptionsTest"); \
+  TEST_F(FIXTURE, TestDoGetReadOptions) { TestDoGetReadOptions(); }            \
+  TEST_F(FIXTURE, TestDoPutWriteOptions) { TestDoPutWriteOptions(); }          \
+  TEST_F(FIXTURE, TestDoExchangeClientWriteOptions) {                          \
+    TestDoExchangeClientWriteOptions();                                        \
+  }                                                                            \
+  TEST_F(FIXTURE, TestDoExchangeClientWriteOptionsBegin) {                     \
+    TestDoExchangeClientWriteOptionsBegin();                                   \
+  }                                                                            \
+  TEST_F(FIXTURE, TestDoExchangeServerWriteOptions) {                          \
+    TestDoExchangeServerWriteOptions();                                        \
+  }
+
+/// \brief Tests of data plane methods with CUDA memory.
+///
+/// If not built with ARROW_CUDA, tests are no-ops.
+class ARROW_FLIGHT_EXPORT CudaDataTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+
+  // Test methods
+  void TestDoGet();
+  void TestDoPut();
+  void TestDoExchange();
+
+ private:
+  class Impl;
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+  std::shared_ptr<Impl> impl_;
+};
+
+#define ARROW_FLIGHT_TEST_CUDA_DATA(FIXTURE)                                 \
+  static_assert(std::is_base_of<CudaDataTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from CudaDataTest"); \
+  TEST_F(FIXTURE, TestDoGet) { TestDoGet(); }                                \
+  TEST_F(FIXTURE, TestDoPut) { TestDoPut(); }                                \
+  TEST_F(FIXTURE, TestDoExchange) { TestDoExchange(); }
+
+/// \brief Tests of error handling.
+class ARROW_FLIGHT_EXPORT ErrorHandlingTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+
+  // Test methods
+  void TestGetFlightInfo();
+  void TestGetFlightInfoMetadata();
+  void TestAsyncGetFlightInfo();
+  void TestDoPut();
+  void TestDoExchange();
+
+ protected:
+  struct Impl;
+
+  std::vector<std::pair<std::string, std::string>> GetHeaders();
+
+  std::shared_ptr<Impl> impl_;
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+};
+
+#define ARROW_FLIGHT_TEST_ERROR_HANDLING(FIXTURE)                                 \
+  static_assert(std::is_base_of<ErrorHandlingTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from ErrorHandlingTest"); \
+  TEST_F(FIXTURE, TestAsyncGetFlightInfo) { TestAsyncGetFlightInfo(); }           \
+  TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); }                     \
+  TEST_F(FIXTURE, TestGetFlightInfoMetadata) { TestGetFlightInfoMetadata(); }     \
+  TEST_F(FIXTURE, TestDoPut) { TestDoPut(); }                                     \
+  TEST_F(FIXTURE, TestDoExchange) { TestDoExchange(); }
+
+/// \brief Tests of the async client.
+class ARROW_FLIGHT_EXPORT AsyncClientTest : public FlightTest {
+ public:
+  void SetUpTest() override;
+  void TearDownTest() override;
+
+  // Test methods
+  void TestGetFlightInfo();
+  void TestGetFlightInfoFuture();
+  void TestListenerLifetime();
+
+ private:
+  std::unique_ptr<FlightClient> client_;
+  std::unique_ptr<FlightServerBase> server_;
+};
+
+// DISABLED TestListenerLifetime: https://github.com/apache/arrow/issues/45120
+#define ARROW_FLIGHT_TEST_ASYNC_CLIENT(FIXTURE)                                 \
+  static_assert(std::is_base_of<AsyncClientTest, FIXTURE>::value,               \
+                ARROW_STRINGIFY(FIXTURE) " must inherit from AsyncClientTest"); \
+  TEST_F(FIXTURE, TestGetFlightInfo) { TestGetFlightInfo(); }                   \
+  TEST_F(FIXTURE, TestGetFlightInfoFuture) { TestGetFlightInfoFuture(); }       \
+  TEST_F(FIXTURE, DISABLED_TestListenerLifetime) { TestListenerLifetime(); }
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/test_flight_server.h b/pyarrow/include/arrow/flight/test_flight_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..794dd834c014b12ba23acfbc85d15034babbba69
--- /dev/null
+++ b/pyarrow/include/arrow/flight/test_flight_server.h
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/flight/server.h"
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/status.h"
+
+namespace arrow::flight {
+
+class ARROW_FLIGHT_EXPORT TestFlightServer : public FlightServerBase {
+ public:
+  static std::unique_ptr<FlightServerBase> Make();
+
+  Status ListFlights(const ServerCallContext& context, const Criteria* criteria,
+                     std::unique_ptr<FlightListing>* listings) override;
+
+  Status GetFlightInfo(const ServerCallContext& context, const FlightDescriptor& request,
+                       std::unique_ptr<FlightInfo>* out) override;
+
+  Status DoGet(const ServerCallContext& context, const Ticket& request,
+               std::unique_ptr<FlightDataStream>* data_stream) override;
+
+  Status DoPut(const ServerCallContext&, std::unique_ptr<FlightMessageReader> reader,
+               std::unique_ptr<FlightMetadataWriter> writer) override;
+
+  Status DoExchange(const ServerCallContext& context,
+                    std::unique_ptr<FlightMessageReader> reader,
+                    std::unique_ptr<FlightMessageWriter> writer) override;
+
+  // A simple example - act like DoGet.
+  Status RunExchangeGet(std::unique_ptr<FlightMessageReader> reader,
+                        std::unique_ptr<FlightMessageWriter> writer);
+
+  // A simple example - act like DoPut
+  Status RunExchangePut(std::unique_ptr<FlightMessageReader> reader,
+                        std::unique_ptr<FlightMessageWriter> writer);
+
+  // Read some number of record batches from the client, send a
+  // metadata message back with the count, then echo the batches back.
+  Status RunExchangeCounter(std::unique_ptr<FlightMessageReader> reader,
+                            std::unique_ptr<FlightMessageWriter> writer);
+
+  // Read int64 batches from the client, each time sending back a
+  // batch with a running sum of columns.
+  Status RunExchangeTotal(std::unique_ptr<FlightMessageReader> reader,
+                          std::unique_ptr<FlightMessageWriter> writer);
+
+  // Echo the client's messages back.
+  Status RunExchangeEcho(std::unique_ptr<FlightMessageReader> reader,
+                         std::unique_ptr<FlightMessageWriter> writer);
+
+  // Regression test for ARROW-13253
+  Status RunExchangeLargeBatch(std::unique_ptr<FlightMessageReader>,
+                               std::unique_ptr<FlightMessageWriter> writer);
+
+  Status RunAction1(const Action& action, std::unique_ptr<ResultStream>* out);
+
+  Status RunAction2(std::unique_ptr<ResultStream>* out);
+
+  Status ListIncomingHeaders(const ServerCallContext& context, const Action& action,
+                             std::unique_ptr<ResultStream>* out);
+
+  Status DoAction(const ServerCallContext& context, const Action& action,
+                  std::unique_ptr<ResultStream>* out) override;
+
+  Status ListActions(const ServerCallContext& context,
+                     std::vector<ActionType>* out) override;
+
+  Status GetSchema(const ServerCallContext& context, const FlightDescriptor& request,
+                   std::unique_ptr<SchemaResult>* schema) override;
+};
+
+}  // namespace arrow::flight
diff --git a/pyarrow/include/arrow/flight/test_util.h b/pyarrow/include/arrow/flight/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..59eb9c0c1a050f9075e1c4d0ab55a4984f67331a
--- /dev/null
+++ b/pyarrow/include/arrow/flight/test_util.h
@@ -0,0 +1,188 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/process.h"
+#include "arrow/testing/util.h"
+
+#include "arrow/flight/client.h"
+#include "arrow/flight/server.h"
+#include "arrow/flight/types.h"
+#include "arrow/flight/visibility.h"
+
+namespace arrow {
+namespace flight {
+
+// ----------------------------------------------------------------------
+// Helpers to compare values for equality
+ARROW_FLIGHT_EXPORT
+void AssertEqual(const FlightInfo& expected, const FlightInfo& actual);
+
+// ----------------------------------------------------------------------
+// Fixture to use for running test servers
+
+class ARROW_FLIGHT_EXPORT TestServer {
+ public:
+  explicit TestServer(const std::string& executable_name)
+      : executable_name_(executable_name), port_(::arrow::GetListenPort()) {}
+  TestServer(const std::string& executable_name, int port)
+      : executable_name_(executable_name), port_(port) {}
+  TestServer(const std::string& executable_name, const std::string& unix_sock)
+      : executable_name_(executable_name), unix_sock_(unix_sock) {}
+
+  Status Start(const std::vector<std::string>& extra_args);
+  Status Start() { return Start({}); }
+
+  void Stop();
+
+  bool IsRunning();
+
+  int port() const;
+  const std::string& unix_sock() const;
+
+ private:
+  std::string executable_name_;
+  int port_;
+  std::string unix_sock_;
+  std::unique_ptr<util::Process> server_process_;
+};
+
+// Helper to initialize a server and matching client with callbacks to
+// populate options.
+template <typename T, typename... Args>
+Status MakeServer(const Location& location, std::unique_ptr<FlightServerBase>* server,
+                  std::unique_ptr<FlightClient>* client,
+                  std::function<Status(FlightServerOptions*)> make_server_options,
+                  std::function<Status(FlightClientOptions*)> make_client_options,
+                  Args&&... server_args) {
+  *server = std::make_unique<T>(std::forward<Args>(server_args)...);
+  FlightServerOptions server_options(location);
+  RETURN_NOT_OK(make_server_options(&server_options));
+  RETURN_NOT_OK((*server)->Init(server_options));
+  std::string uri =
+      location.scheme() + "://127.0.0.1:" + std::to_string((*server)->port());
+  ARROW_ASSIGN_OR_RAISE(auto real_location, Location::Parse(uri));
+  FlightClientOptions client_options = FlightClientOptions::Defaults();
+  RETURN_NOT_OK(make_client_options(&client_options));
+  return FlightClient::Connect(real_location, client_options).Value(client);
+}
+
+// Helper to initialize a server and matching client with callbacks to
+// populate options.
+template <typename T, typename... Args>
+Status MakeServer(std::unique_ptr<FlightServerBase>* server,
+                  std::unique_ptr<FlightClient>* client,
+                  std::function<Status(FlightServerOptions*)> make_server_options,
+                  std::function<Status(FlightClientOptions*)> make_client_options,
+                  Args&&... server_args) {
+  ARROW_ASSIGN_OR_RAISE(auto location, Location::ForGrpcTcp("localhost", 0));
+  return MakeServer<T>(location, server, client, std::move(make_server_options),
+                       std::move(make_client_options),
+                       std::forward<Args>(server_args)...);
+}
+
+// ----------------------------------------------------------------------
+// A FlightDataStream that numbers the record batches
+/// \brief A basic implementation of FlightDataStream that will provide
+/// a sequence of FlightData messages to be written to a stream
+class ARROW_FLIGHT_EXPORT NumberingStream : public FlightDataStream {
+ public:
+  explicit NumberingStream(std::unique_ptr<FlightDataStream> stream);
+
+  std::shared_ptr<Schema> schema() override;
+  arrow::Result<FlightPayload> GetSchemaPayload() override;
+  arrow::Result<FlightPayload> Next() override;
+
+ private:
+  int counter_;
+  std::shared_ptr<FlightDataStream> stream_;
+};
+
+// ----------------------------------------------------------------------
+// Example data for test-server and unit tests
+
+ARROW_FLIGHT_EXPORT
+std::shared_ptr<Schema> ExampleIntSchema();
+
+ARROW_FLIGHT_EXPORT
+std::shared_ptr<Schema> ExampleFloatSchema();
+
+ARROW_FLIGHT_EXPORT
+std::shared_ptr<Schema> ExampleStringSchema();
+
+ARROW_FLIGHT_EXPORT
+std::shared_ptr<Schema> ExampleDictSchema();
+
+ARROW_FLIGHT_EXPORT
+std::shared_ptr<Schema> ExampleLargeSchema();
+
+ARROW_FLIGHT_EXPORT
+Status ExampleIntBatches(RecordBatchVector* out);
+
+ARROW_FLIGHT_EXPORT
+Status ExampleFloatBatches(RecordBatchVector* out);
+
+ARROW_FLIGHT_EXPORT
+Status ExampleDictBatches(RecordBatchVector* out);
+
+ARROW_FLIGHT_EXPORT
+Status ExampleNestedBatches(RecordBatchVector* out);
+
+ARROW_FLIGHT_EXPORT
+Status ExampleLargeBatches(RecordBatchVector* out);
+
+ARROW_FLIGHT_EXPORT
+arrow::Result<std::shared_ptr<RecordBatch>> VeryLargeBatch();
+
+ARROW_FLIGHT_EXPORT
+std::vector<FlightInfo> ExampleFlightInfo();
+
+ARROW_FLIGHT_EXPORT
+std::vector<ActionType> ExampleActionTypes();
+
+ARROW_FLIGHT_EXPORT
+FlightInfo MakeFlightInfo(const Schema& schema, const FlightDescriptor& descriptor,
+                          const std::vector<FlightEndpoint>& endpoints,
+                          int64_t total_records, int64_t total_bytes, bool ordered,
+                          std::string app_metadata);
+
+ARROW_FLIGHT_EXPORT
+FlightInfo MakeFlightInfo(const FlightDescriptor& descriptor,
+                          const std::vector<FlightEndpoint>& endpoints,
+                          int64_t total_records, int64_t total_bytes, bool ordered,
+                          std::string app_metadata);
+
+ARROW_FLIGHT_EXPORT
+Status ExampleTlsCertificates(std::vector<CertKeyPair>* out);
+
+ARROW_FLIGHT_EXPORT
+Status ExampleTlsCertificateRoot(CertKeyPair* out);
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/transport.h b/pyarrow/include/arrow/flight/transport.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ce50534023fc81518245d96e4ee39870cc96cee
--- /dev/null
+++ b/pyarrow/include/arrow/flight/transport.h
@@ -0,0 +1,298 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// \file
+/// Internal (but not private) interface for implementing
+/// alternate network transports in Flight.
+///
+/// To implement a transport, implement ServerTransport and
+/// ClientTransport, and register the desired URI schemes with
+/// TransportRegistry. Flight takes care of most of the per-RPC
+/// details; transports only handle connections and providing a I/O
+/// stream implementation (TransportDataStream).
+///
+/// On the server side:
+///
+/// 1. Applications subclass FlightServerBase and override RPC handlers.
+/// 2. FlightServerBase::Init will look up and create a ServerTransport
+///    based on the scheme of the Location given to it.
+/// 3. The ServerTransport will start the actual server. (For instance,
+///    for gRPC, it creates a gRPC server and registers a gRPC service.)
+///    That server will handle connections.
+/// 4. The transport should forward incoming calls to the server to the RPC
+///    handlers defined on ServerTransport, which implements the actual
+///    RPC handler using the interfaces here. Any I/O the RPC handler needs
+///    to do is managed by transport-specific implementations of
+///    TransportDataStream.
+/// 5. ServerTransport calls FlightServerBase for the actual application
+///    logic.
+///
+/// On the client side:
+///
+/// 1. Applications create a FlightClient with a Location.
+/// 2. FlightClient will look up and create a ClientTransport based on
+///    the scheme of the Location given to it.
+/// 3. When calling a method on FlightClient, FlightClient will delegate to
+///    the ClientTransport. There is some indirection, e.g. for DoGet,
+///    FlightClient only requests that the ClientTransport start the
+///    call and provide it with an I/O stream. The "Flight implementation"
+///    itself still lives in FlightClient.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/types.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/ipc/options.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace ipc {
+class Message;
+}
+namespace flight {
+class FlightStatusDetail;
+namespace internal {
+
+/// Internal, not user-visible type used for memory-efficient reads
+struct FlightData {
+  /// Used only for puts, may be null
+  std::unique_ptr<FlightDescriptor> descriptor;
+
+  /// Non-length-prefixed Message header as described in format/Message.fbs
+  std::shared_ptr<Buffer> metadata;
+
+  /// Application-defined metadata
+  std::shared_ptr<Buffer> app_metadata;
+
+  /// Message body
+  std::shared_ptr<Buffer> body;
+
+  /// Open IPC message from the metadata and body
+  ::arrow::Result<std::unique_ptr<ipc::Message>> OpenMessage();
+};
+
+/// \brief A transport-specific interface for reading/writing Arrow data.
+///
+/// New transports will implement this to read/write IPC payloads to
+/// the underlying stream.
+class ARROW_FLIGHT_EXPORT TransportDataStream {
+ public:
+  virtual ~TransportDataStream() = default;
+  /// \brief Attempt to read the next FlightData message.
+  ///
+  /// \return success true if data was populated, false if there was
+  ///   an error. For clients, the error can be retrieved from
+  ///   Finish(Status).
+  virtual bool ReadData(FlightData* data);
+  /// \brief Attempt to write a FlightPayload.
+  ///
+  /// \param[in] payload The data to write.
+  /// \return true if the message was accepted by the transport, false
+  ///   if not (e.g. due to client/server disconnect), Status if there
+  ///   was an error (e.g. with the payload itself).
+  virtual arrow::Result<bool> WriteData(const FlightPayload& payload);
+  /// \brief Indicate that there are no more writes on this stream.
+  ///
+  /// This is only a hint for the underlying transport and may not
+  /// actually do anything.
+  virtual Status WritesDone();
+};
+
+/// \brief A transport-specific interface for reading/writing Arrow
+///   data for a client.
+class ARROW_FLIGHT_EXPORT ClientDataStream : public TransportDataStream {
+ public:
+  /// \brief Attempt to read a non-data message.
+  ///
+  /// Only implemented for DoPut; mutually exclusive with
+  /// ReadData(FlightData*).
+  virtual bool ReadPutMetadata(std::shared_ptr<Buffer>* out);
+  /// \brief Attempt to cancel the call.
+  ///
+  /// This is only a hint and may not take effect immediately. The
+  /// client should still finish the call with Finish(Status) as usual.
+  virtual void TryCancel() {}
+  /// \brief Finish the call, reporting the server-sent status and/or
+  ///   any client-side errors as appropriate.
+  ///
+  /// Implies WritesDone() and DoFinish().
+  ///
+  /// \param[in] st A client-side status to combine with the
+  ///   server-side error. That is, if an error occurs on the
+  ///   client-side, call Finish(Status) to finish the server-side
+  ///   call, get the server-side status, and merge the statuses
+  ///   together so context is not lost.
+  Status Finish(Status st);
+
+ protected:
+  /// \brief End the call, returning the final server status.
+  ///
+  /// For implementors: should imply WritesDone() (even if it does not
+  /// directly call it).
+  ///
+  /// Implies WritesDone().
+  virtual Status DoFinish() = 0;
+};
+
+/// An implementation of a Flight client for a particular transport.
+///
+/// Transports should override the methods they are capable of
+/// supporting. The default method implementations return an error.
+class ARROW_FLIGHT_EXPORT ClientTransport {
+ public:
+  virtual ~ClientTransport() = default;
+
+  /// Initialize the client.
+  virtual Status Init(const FlightClientOptions& options, const Location& location,
+                      const arrow::util::Uri& uri) = 0;
+  /// Close the client. Once this returns, the client is no longer usable.
+  virtual Status Close() = 0;
+
+  virtual Status Authenticate(const FlightCallOptions& options,
+                              std::unique_ptr<ClientAuthHandler> auth_handler);
+  virtual arrow::Result<std::pair<std::string, std::string>> AuthenticateBasicToken(
+      const FlightCallOptions& options, const std::string& username,
+      const std::string& password);
+  virtual Status DoAction(const FlightCallOptions& options, const Action& action,
+                          std::unique_ptr<ResultStream>* results);
+  virtual Status ListActions(const FlightCallOptions& options,
+                             std::vector<ActionType>* actions);
+  virtual Status GetFlightInfo(const FlightCallOptions& options,
+                               const FlightDescriptor& descriptor,
+                               std::unique_ptr<FlightInfo>* info);
+  virtual void GetFlightInfoAsync(const FlightCallOptions& options,
+                                  const FlightDescriptor& descriptor,
+                                  std::shared_ptr<AsyncListener<FlightInfo>> listener);
+  virtual Status PollFlightInfo(const FlightCallOptions& options,
+                                const FlightDescriptor& descriptor,
+                                std::unique_ptr<PollInfo>* info);
+  virtual arrow::Result<std::unique_ptr<SchemaResult>> GetSchema(
+      const FlightCallOptions& options, const FlightDescriptor& descriptor);
+  virtual Status ListFlights(const FlightCallOptions& options, const Criteria& criteria,
+                             std::unique_ptr<FlightListing>* listing);
+  virtual Status DoGet(const FlightCallOptions& options, const Ticket& ticket,
+                       std::unique_ptr<ClientDataStream>* stream);
+  virtual Status DoPut(const FlightCallOptions& options,
+                       std::unique_ptr<ClientDataStream>* stream);
+  virtual Status DoExchange(const FlightCallOptions& options,
+                            std::unique_ptr<ClientDataStream>* stream);
+
+  bool supports_async() const { return CheckAsyncSupport().ok(); }
+  virtual Status CheckAsyncSupport() const {
+    return Status::NotImplemented(
+        "this Flight transport does not support async operations");
+  }
+
+  static void SetAsyncRpc(AsyncListenerBase* listener, std::unique_ptr<AsyncRpc>&& rpc);
+  static AsyncRpc* GetAsyncRpc(AsyncListenerBase* listener);
+  static std::unique_ptr<AsyncRpc> ReleaseAsyncRpc(AsyncListenerBase* listener);
+};
+
+/// A registry of transport implementations.
+class ARROW_FLIGHT_EXPORT TransportRegistry {
+ public:
+  using ClientFactory = std::function<arrow::Result<std::unique_ptr<ClientTransport>>()>;
+  using ServerFactory = std::function<arrow::Result<std::unique_ptr<ServerTransport>>(
+      FlightServerBase*, std::shared_ptr<MemoryManager> memory_manager)>;
+
+  TransportRegistry();
+  ~TransportRegistry();
+
+  arrow::Result<std::unique_ptr<ClientTransport>> MakeClient(
+      const std::string& scheme) const;
+  arrow::Result<std::unique_ptr<ServerTransport>> MakeServer(
+      const std::string& scheme, FlightServerBase* base,
+      std::shared_ptr<MemoryManager> memory_manager) const;
+
+  Status RegisterClient(const std::string& scheme, ClientFactory factory);
+  Status RegisterServer(const std::string& scheme, ServerFactory factory);
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+/// \brief Get the registry of transport implementations.
+ARROW_FLIGHT_EXPORT
+TransportRegistry* GetDefaultTransportRegistry();
+
+//------------------------------------------------------------
+// Async APIs
+
+/// \brief Transport-specific state for an async RPC.
+///
+/// Transport implementations may subclass this to store their own
+/// state, and stash an instance in a user-supplied AsyncListener via
+/// ClientTransport::GetAsyncRpc and ClientTransport::SetAsyncRpc.
+class ARROW_FLIGHT_EXPORT AsyncRpc {
+ public:
+  virtual ~AsyncRpc() = default;
+  /// \brief Request cancellation of the RPC.
+  virtual void TryCancel() {}
+
+  /// Only needed for DoPut/DoExchange
+  virtual void Begin(const FlightDescriptor& descriptor, std::shared_ptr<Schema> schema) {
+  }
+  /// Only needed for DoPut/DoExchange
+  virtual void Write(arrow::flight::FlightStreamChunk chunk) {}
+  /// Only needed for DoPut/DoExchange
+  virtual void DoneWriting() {}
+};
+
+//------------------------------------------------------------
+// Error propagation helpers
+
+/// \brief Abstract error status.
+///
+/// Transport implementations may use side channels (e.g. HTTP
+/// trailers) to convey additional information to reconstruct the
+/// original C++ status for implementations that can use it.
+struct ARROW_FLIGHT_EXPORT TransportStatus {
+  TransportStatusCode code;
+  std::string message;
+
+  /// \brief Convert a C++ status to an abstract transport status.
+  static TransportStatus FromStatus(const Status& arrow_status);
+
+  /// \brief Reconstruct a string-encoded TransportStatus.
+  static TransportStatus FromCodeStringAndMessage(const std::string& code_str,
+                                                  std::string message);
+
+  /// \brief Convert an abstract transport status to a C++ status.
+  Status ToStatus() const;
+};
+
+/// \brief Convert the string representation of an Arrow status code
+///   back to an Arrow status.
+ARROW_FLIGHT_EXPORT
+Status ReconstructStatus(const std::string& code_str, const Status& current_status,
+                         std::optional<std::string> message,
+                         std::optional<std::string> detail_message,
+                         std::optional<std::string> detail_bin,
+                         std::shared_ptr<FlightStatusDetail> detail);
+
+}  // namespace internal
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/transport_server.h b/pyarrow/include/arrow/flight/transport_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..8e5fe3e710c139d53dee896e42dd9475ee4f52c1
--- /dev/null
+++ b/pyarrow/include/arrow/flight/transport_server.h
@@ -0,0 +1,133 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <memory>
+
+#include "arrow/flight/transport.h"
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace ipc {
+class Message;
+}
+namespace flight {
+namespace internal {
+
+/// \brief A transport-specific interface for reading/writing Arrow
+///   data for a server.
+class ARROW_FLIGHT_EXPORT ServerDataStream : public TransportDataStream {
+ public:
+  /// \brief Attempt to write a non-data message.
+  ///
+  /// Only implemented for DoPut; mutually exclusive with
+  /// WriteData(const FlightPayload&).
+  virtual Status WritePutMetadata(const Buffer& payload);
+};
+
+/// \brief An implementation of a Flight server for a particular
+/// transport.
+///
+/// This class (the transport implementation) implements the underlying
+/// server and handles connections/incoming RPC calls. It should forward RPC
+/// calls to the RPC handlers defined on this class, which work in terms of
+/// the generic interfaces above. The RPC handlers here then forward calls
+/// to the underlying FlightServerBase instance that contains the actual
+/// application RPC method handlers.
+///
+/// Used by FlightServerBase to manage the server lifecycle.
+class ARROW_FLIGHT_EXPORT ServerTransport {
+ public:
+  ServerTransport(FlightServerBase* base, std::shared_ptr<MemoryManager> memory_manager)
+      : base_(base), memory_manager_(std::move(memory_manager)) {}
+  virtual ~ServerTransport() = default;
+
+  /// \name Server Lifecycle Methods
+  /// Transports implement these methods to start/shutdown the underlying
+  /// server.
+  /// @{
+  /// \brief Initialize the server.
+  ///
+  /// This method should launch the server in a background thread, i.e. it
+  /// should not block. Once this returns, the server should be active.
+  virtual Status Init(const FlightServerOptions& options,
+                      const arrow::util::Uri& uri) = 0;
+  /// \brief Shutdown the server.
+  ///
+  /// This should wait for active RPCs to finish. Once this returns, the
+  /// server is no longer listening.
+  virtual Status Shutdown() = 0;
+  /// \brief Shutdown the server with a deadline.
+  ///
+  /// This should wait for active RPCs to finish, or for the deadline to
+  /// expire. Once this returns, the server is no longer listening.
+  virtual Status Shutdown(const std::chrono::system_clock::time_point& deadline) = 0;
+  /// \brief Wait for the server to shutdown (but do not shut down the server).
+  ///
+  /// Once this returns, the server is no longer listening.
+  virtual Status Wait() = 0;
+  /// \brief Get the address the server is listening on, else an empty Location.
+  virtual Location location() const = 0;
+  ///@}
+
+  /// \name RPC Handlers
+  /// Implementations of RPC handlers for Flight methods using the common
+  /// interfaces here. Transports should call these methods from their
+  /// server implementation to handle the actual RPC calls.
+  ///@{
+  /// \brief Get the FlightServerBase.
+  ///
+  /// Intended as an escape hatch for now since not all methods have been
+  /// factored into a transport-agnostic interface.
+  FlightServerBase* base() const { return base_; }
+  /// \brief Implement DoGet in terms of a transport-level stream.
+  ///
+  /// \param[in] context The server context.
+  /// \param[in] request The request payload.
+  /// \param[in] stream The transport-specific data stream
+  ///   implementation. Must implement WriteData(const
+  ///   FlightPayload&).
+  Status DoGet(const ServerCallContext& context, const Ticket& request,
+               ServerDataStream* stream);
+  /// \brief Implement DoPut in terms of a transport-level stream.
+  ///
+  /// \param[in] context The server context.
+  /// \param[in] stream The transport-specific data stream
+  ///   implementation. Must implement ReadData(FlightData*)
+  ///   and WritePutMetadata(const Buffer&).
+  Status DoPut(const ServerCallContext& context, ServerDataStream* stream);
+  /// \brief Implement DoExchange in terms of a transport-level stream.
+  ///
+  /// \param[in] context The server context.
+  /// \param[in] stream The transport-specific data stream
+  ///   implementation. Must implement ReadData(FlightData*)
+  ///   and WriteData(const FlightPayload&).
+  Status DoExchange(const ServerCallContext& context, ServerDataStream* stream);
+  ///@}
+
+ protected:
+  FlightServerBase* base_;
+  std::shared_ptr<MemoryManager> memory_manager_;
+};
+
+}  // namespace internal
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/type_fwd.h b/pyarrow/include/arrow/flight/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f22bbea36dbbf1e3da7ce10975a9584accb989e
--- /dev/null
+++ b/pyarrow/include/arrow/flight/type_fwd.h
@@ -0,0 +1,65 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace internal {
+class Uri;
+}
+namespace flight {
+struct Action;
+struct ActionType;
+template <typename T>
+class AsyncListener;
+class AsyncListenerBase;
+class AsyncRpc;
+struct BasicAuth;
+class ClientAuthHandler;
+class ClientMiddleware;
+class ClientMiddlewareFactory;
+struct Criteria;
+class FlightCallOptions;
+struct FlightClientOptions;
+struct FlightDescriptor;
+struct FlightEndpoint;
+class FlightInfo;
+class PollInfo;
+class FlightListing;
+class FlightMetadataReader;
+class FlightMetadataWriter;
+struct FlightPayload;
+class FlightServerBase;
+class FlightServerOptions;
+class FlightStreamReader;
+class FlightStreamWriter;
+struct Location;
+struct Result;
+class ResultStream;
+struct SchemaResult;
+class ServerCallContext;
+class ServerMiddleware;
+class ServerMiddlewareFactory;
+struct Ticket;
+namespace internal {
+class AsyncRpc;
+class ClientTransport;
+struct FlightData;
+class ServerTransport;
+}  // namespace internal
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/types.h b/pyarrow/include/arrow/flight/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..d498ac67f7a7da8affacd14aecef9870aaf09e8e
--- /dev/null
+++ b/pyarrow/include/arrow/flight/types.h
@@ -0,0 +1,1311 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Data structure for Flight RPC.
+
+#pragma once
+
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/visibility.h"
+#include "arrow/ipc/options.h"
+#include "arrow/ipc/writer.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+
+namespace arrow {
+
+class RecordBatch;
+class Schema;
+class Table;
+
+namespace ipc {
+class DictionaryMemo;
+struct ReadStats;
+}  // namespace ipc
+
+namespace util {
+class Uri;
+}  // namespace util
+
+namespace flight {
+
+ARROW_FLIGHT_EXPORT
+extern const char* kSchemeGrpc;
+ARROW_FLIGHT_EXPORT
+extern const char* kSchemeGrpcTcp;
+ARROW_FLIGHT_EXPORT
+extern const char* kSchemeGrpcUnix;
+ARROW_FLIGHT_EXPORT
+extern const char* kSchemeGrpcTls;
+
+class FlightClient;
+class FlightServerBase;
+
+/// \brief A timestamp compatible with Protocol Buffer's
+/// google.protobuf.Timestamp:
+///
+/// https://protobuf.dev/reference/protobuf/google.protobuf/#timestamp
+///
+/// > A Timestamp represents a point in time independent of any time
+/// > zone or calendar, represented as seconds and fractions of
+/// > seconds at nanosecond resolution in UTC Epoch time. It is
+/// > encoded using the Proleptic Gregorian Calendar which extends the
+/// > Gregorian calendar backwards to year one. It is encoded assuming
+/// > all minutes are 60 seconds long, i.e. leap seconds are "smeared"
+/// > so that no leap second table is needed for interpretation. Range
+/// > is from 0001-01-01T00:00:00Z to 9999-12-31T23:59:59.999999999Z.
+using Timestamp =
+    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
+
+/// \brief A Flight-specific status code.  Used to encode some
+///   additional status codes into an Arrow Status.
+enum class FlightStatusCode : int8_t {
+  /// An implementation error has occurred.
+  Internal,
+  /// A request timed out.
+  TimedOut,
+  /// A request was cancelled.
+  Cancelled,
+  /// We are not authenticated to the remote service.
+  Unauthenticated,
+  /// We do not have permission to make this request.
+  Unauthorized,
+  /// The remote service cannot handle this request at the moment.
+  Unavailable,
+  /// A request failed for some other reason
+  Failed
+};
+
+// Silence warning
+// "non dll-interface class RecordBatchReader used as base for dll-interface class"
+#ifdef _MSC_VER
+#  pragma warning(push)
+#  pragma warning(disable : 4275)
+#endif
+
+/// \brief Flight-specific error information in a Status.
+class ARROW_FLIGHT_EXPORT FlightStatusDetail : public arrow::StatusDetail {
+ public:
+  explicit FlightStatusDetail(FlightStatusCode code) : code_{code} {}
+  explicit FlightStatusDetail(FlightStatusCode code, std::string extra_info)
+      : code_{code}, extra_info_(std::move(extra_info)) {}
+  const char* type_id() const override;
+  std::string ToString() const override;
+
+  /// \brief Get the Flight status code.
+  FlightStatusCode code() const;
+  /// \brief Get the extra error info
+  std::string extra_info() const;
+  /// \brief Get the human-readable name of the status code.
+  std::string CodeAsString() const;
+  /// \brief Set the extra error info
+  void set_extra_info(std::string extra_info);
+
+  /// \brief Try to extract a \a FlightStatusDetail from any Arrow
+  /// status.
+  ///
+  /// \return a \a FlightStatusDetail if it could be unwrapped, \a
+  /// nullptr otherwise
+  static std::shared_ptr<FlightStatusDetail> UnwrapStatus(const arrow::Status& status);
+
+ private:
+  FlightStatusCode code_;
+  std::string extra_info_;
+};
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+/// \brief Make an appropriate Arrow status for the given
+/// Flight-specific status.
+///
+/// \param code The Flight status code.
+/// \param message The message for the error.
+/// \param extra_info Optional extra binary info for the error (eg protobuf)
+ARROW_FLIGHT_EXPORT
+Status MakeFlightError(FlightStatusCode code, std::string message,
+                       std::string extra_info = {});
+
+/// \brief Headers sent from the client or server.
+///
+/// Header values are ordered.
+using CallHeaders = std::multimap<std::string_view, std::string_view>;
+
+/// \brief A TLS certificate plus key.
+struct ARROW_FLIGHT_EXPORT CertKeyPair {
+  /// \brief The certificate in PEM format.
+  std::string pem_cert;
+
+  /// \brief The key in PEM format.
+  std::string pem_key;
+};
+
+namespace internal {
+
+template <typename T>
+struct remove_unique_ptr {
+  using type = T;
+};
+
+template <typename T>
+struct remove_unique_ptr<std::unique_ptr<T>> {
+  using type = T;
+};
+
+// Base CRTP type
+template <class T>
+struct BaseType {
+ protected:
+  using SuperT = BaseType<T>;
+  using SelfT = typename remove_unique_ptr<T>::type;
+
+  const SelfT& self() const { return static_cast<const SelfT&>(*this); }
+  SelfT& self() { return static_cast<SelfT&>(*this); }
+
+ public:
+  BaseType() = default;
+
+  friend bool operator==(const SelfT& left, const SelfT& right) {
+    return left.Equals(right);
+  }
+  friend bool operator!=(const SelfT& left, const SelfT& right) {
+    return !left.Equals(right);
+  }
+
+  /// \brief Serialize this message to its wire-format representation.
+  inline arrow::Result<std::string> SerializeToString() const {
+    std::string out;
+    ARROW_RETURN_NOT_OK(self().SelfT::SerializeToString(&out));
+    return out;
+  }
+
+  inline static arrow::Result<T> Deserialize(std::string_view serialized) {
+    T out;
+    ARROW_RETURN_NOT_OK(SelfT::Deserialize(serialized, &out));
+    return out;
+  }
+
+  inline arrow::Result<std::shared_ptr<Buffer>> SerializeToBuffer() const {
+    std::string out;
+    ARROW_RETURN_NOT_OK(self().SelfT::SerializeToString(&out));
+    return Buffer::FromString(std::move(out));
+  }
+};
+
+}  // namespace internal
+
+//------------------------------------------------------------
+// Wrapper types for Flight RPC protobuf messages
+
+// A wrapper around arrow.flight.protocol.HandshakeRequest is not defined
+// A wrapper around arrow.flight.protocol.HandshakeResponse is not defined
+
+/// \brief message for simple auth
+struct ARROW_FLIGHT_EXPORT BasicAuth : public internal::BaseType<BasicAuth> {
+  std::string username;
+  std::string password;
+
+  BasicAuth() = default;
+  BasicAuth(std::string username, std::string password)
+      : username(std::move(username)), password(std::move(password)) {}
+
+  std::string ToString() const;
+  bool Equals(const BasicAuth& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, BasicAuth* out);
+};
+
+// A wrapper around arrow.flight.protocol.Empty is not defined
+
+/// \brief A type of action that can be performed with the DoAction RPC.
+struct ARROW_FLIGHT_EXPORT ActionType : public internal::BaseType<ActionType> {
+  /// \brief The name of the action.
+  std::string type;
+
+  /// \brief A human-readable description of the action.
+  std::string description;
+
+  ActionType() = default;
+
+  ActionType(std::string type, std::string description)
+      : type(std::move(type)), description(std::move(description)) {}
+
+  std::string ToString() const;
+  bool Equals(const ActionType& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, ActionType* out);
+
+  static const ActionType kCancelFlightInfo;
+  static const ActionType kRenewFlightEndpoint;
+  static const ActionType kSetSessionOptions;
+  static const ActionType kGetSessionOptions;
+  static const ActionType kCloseSession;
+};
+
+/// \brief Opaque selection criteria for ListFlights RPC
+struct ARROW_FLIGHT_EXPORT Criteria : public internal::BaseType<Criteria> {
+  /// Opaque criteria expression, dependent on server implementation
+  std::string expression;
+
+  Criteria() = default;
+  Criteria(std::string expression)  // NOLINT runtime/explicit
+      : expression(std::move(expression)) {}
+
+  std::string ToString() const;
+  bool Equals(const Criteria& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, Criteria* out);
+};
+
+/// \brief An action to perform with the DoAction RPC
+struct ARROW_FLIGHT_EXPORT Action : public internal::BaseType<Action> {
+  /// The action type
+  std::string type;
+
+  /// The action content as a Buffer
+  std::shared_ptr<Buffer> body;
+
+  Action() = default;
+  Action(std::string type, std::shared_ptr<Buffer> body)
+      : type(std::move(type)), body(std::move(body)) {}
+
+  std::string ToString() const;
+  bool Equals(const Action& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, Action* out);
+};
+
+/// \brief Opaque result returned after executing an action
+struct ARROW_FLIGHT_EXPORT Result : public internal::BaseType<Result> {
+  std::shared_ptr<Buffer> body;
+
+  Result() = default;
+  Result(std::shared_ptr<Buffer> body)  // NOLINT runtime/explicit
+      : body(std::move(body)) {}
+
+  std::string ToString() const;
+  bool Equals(const Result& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, Result* out);
+};
+
+/// \brief Schema result returned after a schema request RPC
+struct ARROW_FLIGHT_EXPORT SchemaResult : public internal::BaseType<SchemaResult> {
+ public:
+  SchemaResult() = default;
+  explicit SchemaResult(std::string schema) : raw_schema_(std::move(schema)) {}
+
+  /// \brief Factory method to construct a SchemaResult.
+  static arrow::Result<std::unique_ptr<SchemaResult>> Make(const Schema& schema);
+
+  /// \brief return schema
+  /// \param[in,out] dictionary_memo for dictionary bookkeeping, will
+  /// be modified
+  /// \return Arrow result with the reconstructed Schema
+  arrow::Result<std::shared_ptr<Schema>> GetSchema(
+      ipc::DictionaryMemo* dictionary_memo) const;
+
+  const std::string& serialized_schema() const { return raw_schema_; }
+
+  std::string ToString() const;
+  bool Equals(const SchemaResult& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, SchemaResult* out);
+
+ private:
+  std::string raw_schema_;
+};
+
+/// \brief A request to retrieve or generate a dataset
+struct ARROW_FLIGHT_EXPORT FlightDescriptor
+    : public internal::BaseType<FlightDescriptor> {
+  enum DescriptorType {
+    UNKNOWN = 0,  /// Unused
+    PATH = 1,     /// Named path identifying a dataset
+    CMD = 2       /// Opaque command to generate a dataset
+  };
+
+  /// The descriptor type
+  DescriptorType type = UNKNOWN;
+
+  /// Opaque value used to express a command. Should only be defined when type
+  /// is CMD
+  std::string cmd;
+
+  /// List of strings identifying a particular dataset. Should only be defined
+  /// when type is PATH
+  std::vector<std::string> path;
+
+  FlightDescriptor();
+  FlightDescriptor(DescriptorType type, std::string cmd,
+                   std::vector<std::string> path) noexcept;
+  ~FlightDescriptor();
+
+  /// \brief Get a human-readable form of this descriptor.
+  std::string ToString() const;
+  bool Equals(const FlightDescriptor& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Get the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Parse the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, FlightDescriptor* out);
+
+  // Convenience factory functions
+
+  static FlightDescriptor Command(std::string cmd) {
+    return FlightDescriptor{CMD, std::move(cmd), {}};
+  }
+
+  static FlightDescriptor Path(std::vector<std::string> path) {
+    return FlightDescriptor{PATH, "", std::move(path)};
+  }
+};
+
+/// \brief Data structure providing an opaque identifier or credential to use
+/// when requesting a data stream with the DoGet RPC
+struct ARROW_FLIGHT_EXPORT Ticket : public internal::BaseType<Ticket> {
+  std::string ticket;
+
+  Ticket() = default;
+  Ticket(std::string ticket)  // NOLINT runtime/explicit
+      : ticket(std::move(ticket)) {}
+
+  std::string ToString() const;
+  bool Equals(const Ticket& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Get the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Parse the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, Ticket* out);
+};
+
+/// \brief A host location (a URI)
+struct ARROW_FLIGHT_EXPORT Location : public internal::BaseType<Location> {
+ public:
+  /// \brief Initialize a blank location.
+  Location();
+
+  ~Location();
+
+  /// \brief Initialize a location by parsing a URI string
+  static arrow::Result<Location> Parse(const std::string& uri_string);
+
+  /// \brief Get the fallback URI.
+  ///
+  /// arrow-flight-reuse-connection://? means that a client may attempt to
+  /// reuse an existing connection to a Flight service to fetch data instead
+  /// of creating a new connection to one of the other locations listed in a
+  /// FlightEndpoint response.
+  static const Location& ReuseConnection();
+
+  /// \brief Initialize a location for a non-TLS, gRPC-based Flight
+  /// service from a host and port
+  /// \param[in] host The hostname to connect to
+  /// \param[in] port The port
+  /// \return Arrow result with the resulting location
+  static arrow::Result<Location> ForGrpcTcp(const std::string& host, const int port);
+
+  /// \brief Initialize a location for a TLS-enabled, gRPC-based Flight
+  /// service from a host and port
+  /// \param[in] host The hostname to connect to
+  /// \param[in] port The port
+  /// \return Arrow result with the resulting location
+  static arrow::Result<Location> ForGrpcTls(const std::string& host, const int port);
+
+  /// \brief Initialize a location for a domain socket-based Flight
+  /// service
+  /// \param[in] path The path to the domain socket
+  /// \return Arrow result with the resulting location
+  static arrow::Result<Location> ForGrpcUnix(const std::string& path);
+
+  /// \brief Initialize a location based on a URI scheme
+  static arrow::Result<Location> ForScheme(const std::string& scheme,
+                                           const std::string& host, const int port);
+
+  /// \brief Get the scheme of this URI.
+  std::string scheme() const;
+
+  /// \brief Get a representation of this URI as a string.
+  std::string ToString() const;
+  bool Equals(const Location& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, Location* out);
+
+ private:
+  friend class FlightClient;
+  friend class FlightServerBase;
+  std::shared_ptr<arrow::util::Uri> uri_;
+};
+
+/// \brief A flight ticket and list of locations where the ticket can be
+/// redeemed
+struct ARROW_FLIGHT_EXPORT FlightEndpoint : public internal::BaseType<FlightEndpoint> {
+  /// Opaque ticket identify; use with DoGet RPC
+  Ticket ticket;
+
+  /// List of locations where ticket can be redeemed. If the list is empty, the
+  /// ticket can only be redeemed on the current service where the ticket was
+  /// generated
+  std::vector<Location> locations;
+
+  /// Expiration time of this stream. If present, clients may assume
+  /// they can retry DoGet requests. Otherwise, clients should avoid
+  /// retrying DoGet requests.
+  std::optional<Timestamp> expiration_time;
+
+  /// Opaque Application-defined metadata
+  std::string app_metadata;
+
+  FlightEndpoint() = default;
+  FlightEndpoint(Ticket ticket, std::vector<Location> locations,
+                 std::optional<Timestamp> expiration_time, std::string app_metadata)
+      : ticket(std::move(ticket)),
+        locations(std::move(locations)),
+        expiration_time(expiration_time),
+        app_metadata(std::move(app_metadata)) {}
+
+  std::string ToString() const;
+  bool Equals(const FlightEndpoint& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, FlightEndpoint* out);
+};
+
+/// \brief The access coordinates for retrieval of a dataset, returned by
+/// GetFlightInfo
+class ARROW_FLIGHT_EXPORT FlightInfo
+    : public internal::BaseType<std::unique_ptr<FlightInfo>> {
+ public:
+  struct Data {
+    std::string schema;
+    FlightDescriptor descriptor;
+    std::vector<FlightEndpoint> endpoints;
+    int64_t total_records = -1;
+    int64_t total_bytes = -1;
+    bool ordered = false;
+    std::string app_metadata;
+  };
+
+  explicit FlightInfo(Data data) : data_(std::move(data)), reconstructed_schema_(false) {}
+
+  /// \brief Factory method to construct a FlightInfo.
+  static arrow::Result<FlightInfo> Make(const Schema& schema,
+                                        const FlightDescriptor& descriptor,
+                                        const std::vector<FlightEndpoint>& endpoints,
+                                        int64_t total_records, int64_t total_bytes,
+                                        bool ordered = false,
+                                        std::string app_metadata = "");
+
+  /// \brief Factory method to construct a FlightInfo.
+  static arrow::Result<FlightInfo> Make(const std::shared_ptr<Schema>& schema,
+                                        const FlightDescriptor& descriptor,
+                                        const std::vector<FlightEndpoint>& endpoints,
+                                        int64_t total_records, int64_t total_bytes,
+                                        bool ordered = false,
+                                        std::string app_metadata = "");
+
+  /// \brief Deserialize the Arrow schema of the dataset. Populate any
+  ///   dictionary encoded fields into a DictionaryMemo for
+  ///   bookkeeping
+  /// \param[in,out] dictionary_memo for dictionary bookkeeping, will
+  /// be modified
+  /// \return Arrow result with the reconstructed Schema. Note that the schema
+  ///   may be nullptr, as the schema is optional.
+  arrow::Result<std::shared_ptr<Schema>> GetSchema(
+      ipc::DictionaryMemo* dictionary_memo) const;
+
+  const std::string& serialized_schema() const { return data_.schema; }
+
+  /// The descriptor associated with this flight, may not be set
+  const FlightDescriptor& descriptor() const { return data_.descriptor; }
+
+  /// A list of endpoints associated with the flight (dataset). To consume the
+  /// whole flight, all endpoints must be consumed
+  const std::vector<FlightEndpoint>& endpoints() const { return data_.endpoints; }
+
+  /// The total number of records (rows) in the dataset. If unknown, set to -1
+  int64_t total_records() const { return data_.total_records; }
+
+  /// The total number of bytes in the dataset. If unknown, set to -1
+  int64_t total_bytes() const { return data_.total_bytes; }
+
+  /// Whether endpoints are in the same order as the data.
+  bool ordered() const { return data_.ordered; }
+
+  /// Application-defined opaque metadata
+  const std::string& app_metadata() const { return data_.app_metadata; }
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Get the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Parse the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   std::unique_ptr<FlightInfo>* out);
+
+  std::string ToString() const;
+
+  /// Compare two FlightInfo for equality. This will compare the
+  /// serialized schema representations, NOT the logical equality of
+  /// the schemas.
+  bool Equals(const FlightInfo& other) const;
+
+ private:
+  Data data_;
+  mutable std::shared_ptr<Schema> schema_;
+  mutable bool reconstructed_schema_;
+};
+
+/// \brief The information to process a long-running query.
+class ARROW_FLIGHT_EXPORT PollInfo
+    : public internal::BaseType<std::unique_ptr<PollInfo>> {
+ public:
+  /// The currently available results so far.
+  std::unique_ptr<FlightInfo> info = NULLPTR;
+  /// The descriptor the client should use on the next try. If unset,
+  /// the query is complete.
+  std::optional<FlightDescriptor> descriptor = std::nullopt;
+  /// Query progress. Must be in [0.0, 1.0] but need not be
+  /// monotonic or nondecreasing. If unknown, do not set.
+  std::optional<double> progress = std::nullopt;
+  /// Expiration time for this request. After this passes, the server
+  /// might not accept the poll descriptor anymore (and the query may
+  /// be cancelled). This may be updated on a call to PollFlightInfo.
+  std::optional<Timestamp> expiration_time = std::nullopt;
+
+  PollInfo()
+      : info(NULLPTR),
+        descriptor(std::nullopt),
+        progress(std::nullopt),
+        expiration_time(std::nullopt) {}
+
+  PollInfo(std::unique_ptr<FlightInfo> info, std::optional<FlightDescriptor> descriptor,
+           std::optional<double> progress, std::optional<Timestamp> expiration_time)
+      : info(std::move(info)),
+        descriptor(std::move(descriptor)),
+        progress(progress),
+        expiration_time(expiration_time) {}
+
+  PollInfo(const PollInfo& other)
+      : info(other.info ? std::make_unique<FlightInfo>(*other.info) : NULLPTR),
+        descriptor(other.descriptor),
+        progress(other.progress),
+        expiration_time(other.expiration_time) {}
+  PollInfo(PollInfo&& other) noexcept = default;
+  ~PollInfo() = default;
+  PollInfo& operator=(const PollInfo& other) {
+    info = other.info ? std::make_unique<FlightInfo>(*other.info) : NULLPTR;
+    descriptor = other.descriptor;
+    progress = other.progress;
+    expiration_time = other.expiration_time;
+    return *this;
+  }
+  PollInfo& operator=(PollInfo&& other) = default;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Get the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Parse the wire-format representation of this type.
+  ///
+  /// Useful when interoperating with non-Flight systems (e.g. REST
+  /// services) that may want to return Flight types.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   std::unique_ptr<PollInfo>* out);
+
+  std::string ToString() const;
+
+  /// Compare two PollInfo for equality. This will compare the
+  /// serialized schema representations, NOT the logical equality of
+  /// the schemas.
+  bool Equals(const PollInfo& other) const;
+};
+
+/// \brief The request of the CancelFlightInfoRequest action.
+struct ARROW_FLIGHT_EXPORT CancelFlightInfoRequest
+    : public internal::BaseType<CancelFlightInfoRequest> {
+  std::unique_ptr<FlightInfo> info;
+
+  CancelFlightInfoRequest() = default;
+  CancelFlightInfoRequest(std::unique_ptr<FlightInfo> info)  // NOLINT runtime/explicit
+      : info(std::move(info)) {}
+
+  std::string ToString() const;
+  bool Equals(const CancelFlightInfoRequest& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   CancelFlightInfoRequest* out);
+};
+
+enum class CancelStatus {
+  /// The cancellation status is unknown. Servers should avoid using
+  /// this value (send a kNotCancellable if the requested FlightInfo
+  /// is not known). Clients can retry the request.
+  kUnspecified = 0,
+  /// The cancellation request is complete. Subsequent requests with
+  /// the same payload may return kCancelled or a kNotCancellable error.
+  kCancelled = 1,
+  /// The cancellation request is in progress. The client may retry
+  /// the cancellation request.
+  kCancelling = 2,
+  // The FlightInfo is not cancellable. The client should not retry the
+  // cancellation request.
+  kNotCancellable = 3,
+};
+
+/// \brief The result of the CancelFlightInfo action.
+struct ARROW_FLIGHT_EXPORT CancelFlightInfoResult
+    : public internal::BaseType<CancelFlightInfoResult> {
+  CancelStatus status = CancelStatus::kUnspecified;
+
+  CancelFlightInfoResult() = default;
+  CancelFlightInfoResult(CancelStatus status)  // NOLINT runtime/explicit
+      : status(status) {}
+
+  std::string ToString() const;
+  bool Equals(const CancelFlightInfoResult& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   CancelFlightInfoResult* out);
+};
+
+ARROW_FLIGHT_EXPORT
+std::ostream& operator<<(std::ostream& os, CancelStatus status);
+
+/// \brief The request of the RenewFlightEndpoint action.
+struct ARROW_FLIGHT_EXPORT RenewFlightEndpointRequest
+    : public internal::BaseType<RenewFlightEndpointRequest> {
+  FlightEndpoint endpoint;
+
+  RenewFlightEndpointRequest() = default;
+  explicit RenewFlightEndpointRequest(FlightEndpoint endpoint)
+      : endpoint(std::move(endpoint)) {}
+
+  std::string ToString() const;
+  bool Equals(const RenewFlightEndpointRequest& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   RenewFlightEndpointRequest* out);
+};
+
+// FlightData in Flight.proto maps to FlightPayload here.
+
+/// \brief Staging data structure for messages about to be put on the wire
+///
+/// This structure corresponds to FlightData in the protocol.
+struct ARROW_FLIGHT_EXPORT FlightPayload {
+  std::shared_ptr<Buffer> descriptor;
+  std::shared_ptr<Buffer> app_metadata;
+  ipc::IpcPayload ipc_message;
+
+  FlightPayload() = default;
+  FlightPayload(std::shared_ptr<Buffer> descriptor, std::shared_ptr<Buffer> app_metadata,
+                ipc::IpcPayload ipc_message)
+      : descriptor(std::move(descriptor)),
+        app_metadata(std::move(app_metadata)),
+        ipc_message(std::move(ipc_message)) {}
+
+  /// \brief Check that the payload can be written to the wire.
+  Status Validate() const;
+};
+
+// A wrapper around arrow.flight.protocol.PutResult is not defined
+
+// Session management messages
+
+/// \brief Variant supporting all possible value types for {Set,Get}SessionOptions
+///
+/// By convention, an attempt to set a valueless (std::monostate) SessionOptionValue
+/// should attempt to unset or clear the named option value on the server.
+using SessionOptionValue = std::variant<std::monostate, std::string, bool, int64_t,
+                                        double, std::vector<std::string>>;
+std::ostream& operator<<(std::ostream& os, const SessionOptionValue& v);
+
+/// \brief A request to set a set of session options by name/value.
+struct ARROW_FLIGHT_EXPORT SetSessionOptionsRequest
+    : public internal::BaseType<SetSessionOptionsRequest> {
+  std::map<std::string, SessionOptionValue> session_options;
+
+  SetSessionOptionsRequest() = default;
+  explicit SetSessionOptionsRequest(
+      std::map<std::string, SessionOptionValue> session_options)
+      : session_options(std::move(session_options)) {}
+
+  std::string ToString() const;
+  bool Equals(const SetSessionOptionsRequest& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   SetSessionOptionsRequest* out);
+};
+
+/// \brief The result of setting a session option.
+enum class SetSessionOptionErrorValue : int8_t {
+  /// \brief The status of setting the option is unknown.
+  ///
+  /// Servers should avoid using this value (send a NOT_FOUND error if the requested
+  /// session is not known). Clients can retry the request.
+  kUnspecified,
+  /// \brief The given session option name is invalid.
+  kInvalidName,
+  /// \brief The session option value or type is invalid.
+  kInvalidValue,
+  /// \brief The session option cannot be set.
+  kError
+};
+std::string ToString(const SetSessionOptionErrorValue& error_value);
+std::ostream& operator<<(std::ostream& os, const SetSessionOptionErrorValue& error_value);
+
+/// \brief The result(s) of setting session option(s).
+struct ARROW_FLIGHT_EXPORT SetSessionOptionsResult
+    : public internal::BaseType<SetSessionOptionsResult> {
+  struct Error {
+    SetSessionOptionErrorValue value;
+
+    bool Equals(const Error& other) const { return value == other.value; }
+    friend bool operator==(const Error& left, const Error& right) {
+      return left.Equals(right);
+    }
+    friend bool operator!=(const Error& left, const Error& right) {
+      return !(left == right);
+    }
+  };
+
+  std::map<std::string, Error> errors;
+
+  SetSessionOptionsResult() = default;
+  SetSessionOptionsResult(std::map<std::string, Error> errors)  // NOLINT runtime/explicit
+      : errors(std::move(errors)) {}
+
+  std::string ToString() const;
+  bool Equals(const SetSessionOptionsResult& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   SetSessionOptionsResult* out);
+};
+
+/// \brief A request to get current session options.
+struct ARROW_FLIGHT_EXPORT GetSessionOptionsRequest
+    : public internal::BaseType<GetSessionOptionsRequest> {
+  GetSessionOptionsRequest() = default;
+
+  std::string ToString() const;
+  bool Equals(const GetSessionOptionsRequest& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   GetSessionOptionsRequest* out);
+};
+
+/// \brief The current session options.
+struct ARROW_FLIGHT_EXPORT GetSessionOptionsResult
+    : public internal::BaseType<GetSessionOptionsResult> {
+  std::map<std::string, SessionOptionValue> session_options;
+
+  GetSessionOptionsResult() = default;
+  GetSessionOptionsResult(  // NOLINT runtime/explicit
+      std::map<std::string, SessionOptionValue> session_options)
+      : session_options(std::move(session_options)) {}
+
+  std::string ToString() const;
+  bool Equals(const GetSessionOptionsResult& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized,
+                                   GetSessionOptionsResult* out);
+};
+
+/// \brief A request to close the open client session.
+struct ARROW_FLIGHT_EXPORT CloseSessionRequest
+    : public internal::BaseType<CloseSessionRequest> {
+  CloseSessionRequest() = default;
+
+  std::string ToString() const;
+  bool Equals(const CloseSessionRequest& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, CloseSessionRequest* out);
+};
+
+/// \brief The result of closing a session.
+enum class CloseSessionStatus : int8_t {
+  // \brief The session close status is unknown.
+  //
+  // Servers should avoid using this value (send a NOT_FOUND error if the requested
+  // session is not known). Clients can retry the request.
+  kUnspecified,
+  // \brief The session close request is complete.
+  //
+  // Subsequent requests with the same session produce a NOT_FOUND error.
+  kClosed,
+  // \brief The session close request is in progress.
+  //
+  // The client may retry the request.
+  kClosing,
+  // \brief The session is not closeable.
+  //
+  // The client should not retry the request.
+  kNotClosable
+};
+std::string ToString(const CloseSessionStatus& status);
+std::ostream& operator<<(std::ostream& os, const CloseSessionStatus& status);
+
+/// \brief The result of attempting to close the client session.
+struct ARROW_FLIGHT_EXPORT CloseSessionResult
+    : public internal::BaseType<CloseSessionResult> {
+  CloseSessionStatus status;
+
+  CloseSessionResult() = default;
+  CloseSessionResult(CloseSessionStatus status)  // NOLINT runtime/explicit
+      : status(status) {}
+
+  std::string ToString() const;
+  bool Equals(const CloseSessionResult& other) const;
+
+  using SuperT::Deserialize;
+  using SuperT::SerializeToString;
+
+  /// \brief Serialize this message to its wire-format representation.
+  ///
+  /// Use `SerializeToString()` if you want a Result-returning version.
+  arrow::Status SerializeToString(std::string* out) const;
+
+  /// \brief Deserialize this message from its wire-format representation.
+  ///
+  /// Use `Deserialize(serialized)` if you want a Result-returning version.
+  static arrow::Status Deserialize(std::string_view serialized, CloseSessionResult* out);
+};
+
+//------------------------------------------------------------
+
+/// \brief An iterator to FlightInfo instances returned by ListFlights.
+class ARROW_FLIGHT_EXPORT FlightListing {
+ public:
+  virtual ~FlightListing() = default;
+
+  /// \brief Retrieve the next FlightInfo from the iterator.
+  /// \return Arrow result with a single FlightInfo. Set to \a nullptr if there
+  /// are none left.
+  virtual arrow::Result<std::unique_ptr<FlightInfo>> Next() = 0;
+};
+
+/// \brief An iterator to Result instances returned by DoAction.
+class ARROW_FLIGHT_EXPORT ResultStream {
+ public:
+  virtual ~ResultStream() = default;
+
+  /// \brief Retrieve the next Result from the iterator.
+  /// \return Arrow result with a single Result. Set to \a nullptr if there are none left.
+  virtual arrow::Result<std::unique_ptr<Result>> Next() = 0;
+
+  /// \brief Read and drop the remaining messages to get the error (if any) from a server.
+  /// \return Status OK if this is no error from a server, any other status if a
+  /// server returns an error.
+  Status Drain();
+};
+
+/// \brief A holder for a RecordBatch with associated Flight metadata.
+struct ARROW_FLIGHT_EXPORT FlightStreamChunk {
+ public:
+  FlightStreamChunk() noexcept;
+  ~FlightStreamChunk();
+
+  std::shared_ptr<RecordBatch> data;
+  std::shared_ptr<Buffer> app_metadata;
+};
+
+/// \brief An interface to read Flight data with metadata.
+class ARROW_FLIGHT_EXPORT MetadataRecordBatchReader {
+ public:
+  virtual ~MetadataRecordBatchReader() = default;
+
+  /// \brief Get the schema for this stream.
+  virtual arrow::Result<std::shared_ptr<Schema>> GetSchema() = 0;
+
+  /// \brief Get the next message from Flight. If the stream is
+  /// finished, then the members of \a FlightStreamChunk will be
+  /// nullptr.
+  virtual arrow::Result<FlightStreamChunk> Next() = 0;
+
+  /// \brief Consume entire stream as a vector of record batches
+  virtual arrow::Result<std::vector<std::shared_ptr<RecordBatch>>> ToRecordBatches();
+
+  /// \brief Consume entire stream as a Table
+  virtual arrow::Result<std::shared_ptr<Table>> ToTable();
+
+  /// \brief Return current read statistics
+  virtual arrow::ipc::ReadStats stats() const = 0;
+};
+
+/// \brief Convert a MetadataRecordBatchReader to a regular RecordBatchReader.
+ARROW_FLIGHT_EXPORT
+arrow::Result<std::shared_ptr<RecordBatchReader>> MakeRecordBatchReader(
+    std::shared_ptr<MetadataRecordBatchReader> reader);
+
+/// \brief An interface to write IPC payloads with metadata.
+class ARROW_FLIGHT_EXPORT MetadataRecordBatchWriter : public ipc::RecordBatchWriter {
+ public:
+  virtual ~MetadataRecordBatchWriter() = default;
+  /// \brief Begin writing data with the given schema. Only used with \a DoExchange.
+  virtual Status Begin(const std::shared_ptr<Schema>& schema,
+                       const ipc::IpcWriteOptions& options) = 0;
+  virtual Status Begin(const std::shared_ptr<Schema>& schema);
+  virtual Status WriteMetadata(std::shared_ptr<Buffer> app_metadata) = 0;
+  virtual Status WriteWithMetadata(const RecordBatch& batch,
+                                   std::shared_ptr<Buffer> app_metadata) = 0;
+};
+
+/// \brief A FlightListing implementation based on a vector of
+/// FlightInfo objects.
+///
+/// This can be iterated once, then it is consumed.
+class ARROW_FLIGHT_EXPORT SimpleFlightListing : public FlightListing {
+ public:
+  explicit SimpleFlightListing(const std::vector<FlightInfo>& flights);
+  explicit SimpleFlightListing(std::vector<FlightInfo>&& flights);
+
+  arrow::Result<std::unique_ptr<FlightInfo>> Next() override;
+
+ private:
+  int position_;
+  std::vector<FlightInfo> flights_;
+};
+
+/// \brief A ResultStream implementation based on a vector of
+/// Result objects.
+///
+/// This can be iterated once, then it is consumed.
+class ARROW_FLIGHT_EXPORT SimpleResultStream : public ResultStream {
+ public:
+  explicit SimpleResultStream(std::vector<Result>&& results);
+  arrow::Result<std::unique_ptr<Result>> Next() override;
+
+ private:
+  std::vector<Result> results_;
+  size_t position_;
+};
+
+/// \defgroup flight-error Error Handling
+/// Types for handling errors from RPCs.  Flight uses a set of status
+/// codes standardized across Flight implementations, so these types
+/// let applications work directly with those codes instead of having
+/// to translate to and from Arrow Status.
+/// @{
+
+/// \brief Abstract status code for an RPC as per the Flight
+///   specification.
+enum class TransportStatusCode {
+  /// \brief No error.
+  kOk = 0,
+  /// \brief An unknown error occurred.
+  kUnknown = 1,
+  /// \brief An error occurred in the transport implementation, or an
+  ///   error internal to the service implementation occurred.
+  kInternal = 2,
+  /// \brief An argument is invalid.
+  kInvalidArgument = 3,
+  /// \brief The request timed out.
+  kTimedOut = 4,
+  /// \brief An argument is not necessarily invalid, but references
+  ///   some resource that does not exist.  Prefer over
+  ///   kInvalidArgument where applicable.
+  kNotFound = 5,
+  /// \brief The request attempted to create some resource that does
+  ///   not exist.
+  kAlreadyExists = 6,
+  /// \brief The request was explicitly cancelled.
+  kCancelled = 7,
+  /// \brief The client is not authenticated.
+  kUnauthenticated = 8,
+  /// \brief The client is not authorized to perform this request.
+  kUnauthorized = 9,
+  /// \brief The request is not implemented
+  kUnimplemented = 10,
+  /// \brief There is a network connectivity error, or some resource
+  ///   is otherwise unavailable.  Most likely a temporary condition.
+  kUnavailable = 11,
+};
+
+/// \brief Convert a code to a string.
+std::string ToString(TransportStatusCode code);
+
+/// \brief An error from an RPC call, using Flight error codes directly
+///   instead of trying to translate to Arrow Status.
+///
+/// Currently, only attached to the Status passed to AsyncListener::OnFinish.
+class ARROW_FLIGHT_EXPORT TransportStatusDetail : public StatusDetail {
+ public:
+  constexpr static const char* kTypeId = "flight::TransportStatusDetail";
+  explicit TransportStatusDetail(TransportStatusCode code, std::string message,
+                                 std::vector<std::pair<std::string, std::string>> details)
+      : code_(code), message_(std::move(message)), details_(std::move(details)) {}
+  const char* type_id() const override { return kTypeId; }
+  std::string ToString() const override;
+
+  static std::optional<std::reference_wrapper<const TransportStatusDetail>> Unwrap(
+      const Status& status);
+
+  TransportStatusCode code() const { return code_; }
+  std::string_view message() const { return message_; }
+  const std::vector<std::pair<std::string, std::string>>& details() const {
+    return details_;
+  }
+
+ private:
+  TransportStatusCode code_;
+  std::string message_;
+  std::vector<std::pair<std::string, std::string>> details_;
+};
+
+/// @}
+
+}  // namespace flight
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/flight/types_async.h b/pyarrow/include/arrow/flight/types_async.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5ed48d8a6438b5199fe7cf602ee2c9380326f67
--- /dev/null
+++ b/pyarrow/include/arrow/flight/types_async.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/flight/type_fwd.h"
+#include "arrow/flight/types.h"
+#include "arrow/ipc/options.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow::flight {
+
+/// \defgroup flight-async Async Flight Types
+/// Common types used for asynchronous Flight APIs.
+/// @{
+
+/// \brief Non-templated state for an async RPC.
+class ARROW_FLIGHT_EXPORT AsyncListenerBase {
+ public:
+  AsyncListenerBase();
+  virtual ~AsyncListenerBase();
+
+  /// \brief Request cancellation of the RPC.
+  ///
+  /// The RPC is not cancelled until AsyncListener::OnFinish is called.
+  void TryCancel();
+
+ private:
+  friend class arrow::flight::internal::ClientTransport;
+
+  /// Transport-specific state for this RPC.  Transport
+  /// implementations may store and retrieve state here via
+  /// ClientTransport::SetAsyncRpc and ClientTransport::GetAsyncRpc.
+  std::unique_ptr<internal::AsyncRpc> rpc_state_;
+};
+
+/// \brief Callbacks for results from async RPCs.
+///
+/// A single listener may not be used for multiple concurrent RPC
+/// calls.  The application MUST hold the listener alive until
+/// OnFinish() is called and has finished.
+template <typename T>
+class ARROW_FLIGHT_EXPORT AsyncListener : public AsyncListenerBase {
+ public:
+  /// \brief Get the next server result.
+  ///
+  /// This will never be called concurrently with itself or OnFinish.
+  virtual void OnNext(T message) = 0;
+  /// \brief Get the final status.
+  ///
+  /// This will never be called concurrently with itself or OnNext.  If the
+  /// error comes from the remote server, then a TransportStatusDetail will be
+  /// attached.  Otherwise, the error is generated by the client-side
+  /// transport and will not have a TransportStatusDetail.
+  virtual void OnFinish(Status status) = 0;
+};
+
+/// @}
+
+}  // namespace arrow::flight
diff --git a/pyarrow/include/arrow/flight/visibility.h b/pyarrow/include/arrow/flight/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..06f864ba8cffc16520e7768d51f43cfb25a72dd0
--- /dev/null
+++ b/pyarrow/include/arrow/flight/visibility.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_FLIGHT_STATIC
+#    define ARROW_FLIGHT_EXPORT
+#  elif defined(ARROW_FLIGHT_EXPORTING)
+#    define ARROW_FLIGHT_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_FLIGHT_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_FLIGHT_NO_EXPORT
+#else  // Not Windows
+#  ifndef ARROW_FLIGHT_EXPORT
+#    define ARROW_FLIGHT_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_FLIGHT_NO_EXPORT
+#    define ARROW_FLIGHT_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif  // Non-Windows
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/arrow/io/api.h b/pyarrow/include/arrow/io/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..d55b2c2d55a8afc1a84fb204b2356e93503def42
--- /dev/null
+++ b/pyarrow/include/arrow/io/api.h
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/buffered.h"
+#include "arrow/io/compressed.h"
+#include "arrow/io/file.h"
+#include "arrow/io/hdfs.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/io/memory.h"
diff --git a/pyarrow/include/arrow/io/buffered.h b/pyarrow/include/arrow/io/buffered.h
new file mode 100644
index 0000000000000000000000000000000000000000..22ea7520a5050e53a5acf83c13943923595daf5b
--- /dev/null
+++ b/pyarrow/include/arrow/io/buffered.h
@@ -0,0 +1,168 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Buffered stream implementations
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+namespace io {
+
+class ARROW_EXPORT BufferedOutputStream : public OutputStream {
+ public:
+  ~BufferedOutputStream() override;
+
+  /// \brief Create a buffered output stream wrapping the given output stream.
+  /// \param[in] buffer_size the size of the temporary write buffer
+  /// \param[in] pool a MemoryPool to use for allocations
+  /// \param[in] raw another OutputStream
+  /// \return the created BufferedOutputStream
+  static Result<std::shared_ptr<BufferedOutputStream>> Create(
+      int64_t buffer_size, MemoryPool* pool, std::shared_ptr<OutputStream> raw);
+
+  /// \brief Resize internal buffer
+  /// \param[in] new_buffer_size the new buffer size
+  /// \return Status
+  Status SetBufferSize(int64_t new_buffer_size);
+
+  /// \brief Return the current size of the internal buffer
+  int64_t buffer_size() const;
+
+  /// \brief Return the number of remaining bytes that have not been flushed to
+  /// the raw OutputStream
+  int64_t bytes_buffered() const;
+
+  /// \brief Flush any buffered writes and release the raw
+  /// OutputStream. Further operations on this object are invalid
+  /// \return the underlying OutputStream
+  Result<std::shared_ptr<OutputStream>> Detach();
+
+  // OutputStream interface
+
+  /// \brief Close the buffered output stream.  This implicitly closes the
+  /// underlying raw output stream.
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Tell() const override;
+  // Write bytes to the stream. Thread-safe
+  Status Write(const void* data, int64_t nbytes) override;
+  Status Write(const std::shared_ptr<Buffer>& data) override;
+
+  Status Flush() override;
+
+  /// \brief Return the underlying raw output stream.
+  std::shared_ptr<OutputStream> raw() const;
+
+ private:
+  explicit BufferedOutputStream(std::shared_ptr<OutputStream> raw, MemoryPool* pool);
+
+  class ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+/// \class BufferedInputStream
+/// \brief An InputStream that performs buffered reads from an unbuffered
+/// InputStream, which can mitigate the overhead of many small reads in some
+/// cases
+class ARROW_EXPORT BufferedInputStream
+    : public internal::InputStreamConcurrencyWrapper<BufferedInputStream> {
+ public:
+  ~BufferedInputStream() override;
+
+  /// \brief Create a BufferedInputStream from a raw InputStream
+  /// \param[in] buffer_size the size of the temporary read buffer
+  /// \param[in] pool a MemoryPool to use for allocations
+  /// \param[in] raw a raw InputStream
+  /// \param[in] raw_read_bound a bound on the maximum number of bytes
+  /// to read from the raw input stream. The default -1 indicates that
+  /// it is unbounded
+  /// \return the created BufferedInputStream
+  static Result<std::shared_ptr<BufferedInputStream>> Create(
+      int64_t buffer_size, MemoryPool* pool, std::shared_ptr<InputStream> raw,
+      int64_t raw_read_bound = -1);
+
+  /// \brief Resize internal read buffer; calls to Read(...) will read at least
+  ///        this many bytes from the raw InputStream if possible.
+  /// \param[in] new_buffer_size the new read buffer size
+  /// \return Status
+  Status SetBufferSize(int64_t new_buffer_size);
+
+  /// \brief Return the number of remaining bytes in the read buffer
+  int64_t bytes_buffered() const;
+
+  /// \brief Return the current size of the internal buffer
+  int64_t buffer_size() const;
+
+  /// \brief Release the raw InputStream. Any data buffered will be
+  /// discarded. Further operations on this object are invalid
+  /// \return raw the underlying InputStream
+  std::shared_ptr<InputStream> Detach();
+
+  /// \brief Return the unbuffered InputStream
+  std::shared_ptr<InputStream> raw() const;
+
+  // InputStream APIs
+
+  bool closed() const override;
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+  Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+      const IOContext& io_context) override;
+
+ private:
+  friend InputStreamConcurrencyWrapper<BufferedInputStream>;
+
+  explicit BufferedInputStream(std::shared_ptr<InputStream> raw, MemoryPool* pool,
+                               int64_t raw_total_bytes_bound);
+
+  Status DoClose();
+  Status DoAbort() override;
+
+  /// \brief Returns the position of the buffered stream, though the position
+  /// of the unbuffered stream may be further advanced.
+  Result<int64_t> DoTell() const;
+
+  Result<int64_t> DoRead(int64_t nbytes, void* out);
+
+  /// \brief Read into buffer.
+  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+  /// \brief Return a zero-copy string view referencing buffered data,
+  /// but do not advance the position of the stream. Buffers data and
+  /// expands the buffer size if necessary
+  Result<std::string_view> DoPeek(int64_t nbytes) override;
+
+  class ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/caching.h b/pyarrow/include/arrow/io/caching.h
new file mode 100644
index 0000000000000000000000000000000000000000..e2b911fafdbbc2ec95d0de4233b6bbb663ffa44e
--- /dev/null
+++ b/pyarrow/include/arrow/io/caching.h
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+struct ARROW_EXPORT CacheOptions {
+  static constexpr double kDefaultIdealBandwidthUtilizationFrac = 0.9;
+  static constexpr int64_t kDefaultMaxIdealRequestSizeMib = 64;
+
+  /// \brief The maximum distance in bytes between two consecutive
+  ///   ranges; beyond this value, ranges are not combined
+  int64_t hole_size_limit;
+  /// \brief The maximum size in bytes of a combined range; if
+  ///   combining two consecutive ranges would produce a range of a
+  ///   size greater than this, they are not combined
+  int64_t range_size_limit;
+  /// \brief A lazy cache does not perform any I/O until requested.
+  ///   lazy = false: request all byte ranges when PreBuffer or WillNeed is called.
+  ///   lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader
+  ///   needs them.
+  ///   lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the
+  ///   range that is currently being read.
+  bool lazy;
+  /// \brief The maximum number of ranges to be prefetched. This is only used
+  ///   for lazy cache to asynchronously read some ranges after reading the target range.
+  int64_t prefetch_limit = 0;
+
+  bool operator==(const CacheOptions& other) const {
+    return hole_size_limit == other.hole_size_limit &&
+           range_size_limit == other.range_size_limit && lazy == other.lazy &&
+           prefetch_limit == other.prefetch_limit;
+  }
+
+  /// \brief Construct CacheOptions from network storage metrics (e.g. S3).
+  ///
+  /// \param[in] time_to_first_byte_millis Seek-time or Time-To-First-Byte (TTFB) in
+  ///   milliseconds, also called call setup latency of a new read request.
+  ///   The value is a positive integer.
+  /// \param[in] transfer_bandwidth_mib_per_sec Data transfer Bandwidth (BW) in MiB/sec
+  ///   (per connection).
+  ///   The value is a positive integer.
+  /// \param[in] ideal_bandwidth_utilization_frac Transfer bandwidth utilization fraction
+  ///   (per connection) to maximize the net data load.
+  ///   The value is a positive double precision number less than 1.
+  /// \param[in] max_ideal_request_size_mib The maximum single data request size (in MiB)
+  ///   to maximize the net data load.
+  ///   The value is a positive integer.
+  /// \return A new instance of CacheOptions.
+  static CacheOptions MakeFromNetworkMetrics(
+      int64_t time_to_first_byte_millis, int64_t transfer_bandwidth_mib_per_sec,
+      double ideal_bandwidth_utilization_frac = kDefaultIdealBandwidthUtilizationFrac,
+      int64_t max_ideal_request_size_mib = kDefaultMaxIdealRequestSizeMib);
+
+  static CacheOptions Defaults();
+  static CacheOptions LazyDefaults();
+};
+
+namespace internal {
+
+/// \brief A read cache designed to hide IO latencies when reading.
+///
+/// This class takes multiple byte ranges that an application expects to read, and
+/// coalesces them into fewer, larger read requests, which benefits performance on some
+/// filesystems, particularly remote ones like Amazon S3. By default, it also issues
+/// these read requests in parallel up front.
+///
+/// To use:
+/// 1. Cache() the ranges you expect to read in the future. Ideally, these ranges have
+///    the exact offset and length that will later be read. The cache will combine those
+///    ranges according to parameters (see constructor).
+///
+///    By default, the cache will also start fetching the combined ranges in parallel in
+///    the background, unless CacheOptions.lazy is set.
+///
+/// 2. Call WaitFor() to be notified when the given ranges have been read. If
+///    CacheOptions.lazy is set, I/O will be triggered in the background here instead.
+///    This can be done in parallel (e.g. if parsing a file, call WaitFor() for each
+///    chunk of the file that can be parsed in parallel).
+///
+/// 3. Call Read() to retrieve the actual data for the given ranges.
+///    A synchronous application may skip WaitFor() and just call Read() - it will still
+///    benefit from coalescing and parallel fetching.
+class ARROW_EXPORT ReadRangeCache {
+ public:
+  static constexpr int64_t kDefaultHoleSizeLimit = 8192;
+  static constexpr int64_t kDefaultRangeSizeLimit = 32 * 1024 * 1024;
+
+  /// Construct a read cache with default
+  explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx)
+      : ReadRangeCache(file, file.get(), std::move(ctx), CacheOptions::Defaults()) {}
+
+  /// Construct a read cache with given options
+  explicit ReadRangeCache(std::shared_ptr<RandomAccessFile> file, IOContext ctx,
+                          CacheOptions options)
+      : ReadRangeCache(file, file.get(), std::move(ctx), options) {}
+
+  /// Construct a read cache with an unowned file
+  ReadRangeCache(RandomAccessFile* file, IOContext ctx, CacheOptions options)
+      : ReadRangeCache(NULLPTR, file, std::move(ctx), options) {}
+
+  ~ReadRangeCache();
+
+  /// \brief Cache the given ranges in the background.
+  ///
+  /// The caller must ensure that the ranges do not overlap with each other,
+  /// nor with previously cached ranges.  Otherwise, behaviour will be undefined.
+  Status Cache(std::vector<ReadRange> ranges);
+
+  /// \brief Read a range previously given to Cache().
+  Result<std::shared_ptr<Buffer>> Read(ReadRange range);
+
+  /// \brief Wait until all ranges added so far have been cached.
+  Future<> Wait();
+
+  /// \brief Wait until all given ranges have been cached.
+  Future<> WaitFor(std::vector<ReadRange> ranges);
+
+ protected:
+  struct Impl;
+  struct LazyImpl;
+
+  ReadRangeCache(std::shared_ptr<RandomAccessFile> owned_file, RandomAccessFile* file,
+                 IOContext ctx, CacheOptions options);
+
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/compressed.h b/pyarrow/include/arrow/io/compressed.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b4e7ab4d7248829e26bc4bbef9cb3e628f5f906
--- /dev/null
+++ b/pyarrow/include/arrow/io/compressed.h
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Compressed stream implementations
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+class Status;
+
+namespace util {
+
+class Codec;
+
+}  // namespace util
+
+namespace io {
+
+class ARROW_EXPORT CompressedOutputStream : public OutputStream {
+ public:
+  ~CompressedOutputStream() override;
+
+  /// \brief Create a compressed output stream wrapping the given output stream.
+  ///
+  /// The codec must be capable of streaming compression. Some codecs,
+  /// like Snappy, are not able to do so.
+  static Result<std::shared_ptr<CompressedOutputStream>> Make(
+      util::Codec* codec, const std::shared_ptr<OutputStream>& raw,
+      MemoryPool* pool = default_memory_pool());
+
+  // OutputStream interface
+
+  /// \brief Close the compressed output stream.  This implicitly closes the
+  /// underlying raw output stream.
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Tell() const override;
+
+  Status Write(const void* data, int64_t nbytes) override;
+  /// \cond FALSE
+  using Writable::Write;
+  /// \endcond
+  Status Flush() override;
+
+  /// \brief Return the underlying raw output stream.
+  std::shared_ptr<OutputStream> raw() const;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedOutputStream);
+
+  CompressedOutputStream() = default;
+
+  class ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+class ARROW_EXPORT CompressedInputStream
+    : public internal::InputStreamConcurrencyWrapper<CompressedInputStream> {
+ public:
+  ~CompressedInputStream() override;
+
+  /// \brief Create a compressed input stream wrapping the given input stream.
+  ///
+  /// The codec must be capable of streaming decompression. Some codecs,
+  /// like Snappy, are not able to do so.
+  static Result<std::shared_ptr<CompressedInputStream>> Make(
+      util::Codec* codec, const std::shared_ptr<InputStream>& raw,
+      MemoryPool* pool = default_memory_pool());
+
+  // InputStream interface
+
+  bool closed() const override;
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+  Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+      const IOContext& io_context) override;
+
+  /// \brief Return the underlying raw input stream.
+  std::shared_ptr<InputStream> raw() const;
+
+ private:
+  friend InputStreamConcurrencyWrapper<CompressedInputStream>;
+  ARROW_DISALLOW_COPY_AND_ASSIGN(CompressedInputStream);
+
+  CompressedInputStream() = default;
+
+  /// \brief Close the compressed input stream.  This implicitly closes the
+  /// underlying raw input stream.
+  Status DoClose();
+  Status DoAbort() override;
+  Result<int64_t> DoTell() const;
+  Result<int64_t> DoRead(int64_t nbytes, void* out);
+  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+  class ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/concurrency.h b/pyarrow/include/arrow/io/concurrency.h
new file mode 100644
index 0000000000000000000000000000000000000000..35c2aac6a7e155dd6ef8be35bfbbc7cc8edd4f2f
--- /dev/null
+++ b/pyarrow/include/arrow/io/concurrency.h
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+namespace internal {
+
+template <class LockType>
+class SharedLockGuard {
+ public:
+  explicit SharedLockGuard(LockType* lock) : lock_(lock) { lock_->LockShared(); }
+
+  ~SharedLockGuard() { lock_->UnlockShared(); }
+
+ protected:
+  LockType* lock_;
+};
+
+template <class LockType>
+class ExclusiveLockGuard {
+ public:
+  explicit ExclusiveLockGuard(LockType* lock) : lock_(lock) { lock_->LockExclusive(); }
+
+  ~ExclusiveLockGuard() { lock_->UnlockExclusive(); }
+
+ protected:
+  LockType* lock_;
+};
+
+// Debug concurrency checker that marks "shared" and "exclusive" code sections,
+// aborting if the concurrency rules get violated.  Does nothing in release mode.
+// Note that we intentionally use the same class declaration in debug and
+// release builds in order to avoid runtime failures when e.g. loading a
+// release-built DLL with a debug-built application, or the reverse.
+
+class ARROW_EXPORT SharedExclusiveChecker {
+ public:
+  SharedExclusiveChecker();
+  void LockShared();
+  void UnlockShared();
+  void LockExclusive();
+  void UnlockExclusive();
+
+  SharedLockGuard<SharedExclusiveChecker> shared_guard() {
+    return SharedLockGuard<SharedExclusiveChecker>(this);
+  }
+
+  ExclusiveLockGuard<SharedExclusiveChecker> exclusive_guard() {
+    return ExclusiveLockGuard<SharedExclusiveChecker>(this);
+  }
+
+ protected:
+  struct Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+// Concurrency wrappers for IO classes that check the correctness of
+// concurrent calls to various methods.  It is not necessary to wrap all
+// IO classes with these, only a few core classes that get used in tests.
+//
+// We're not using virtual inheritance here as virtual bases have poorly
+// understood semantic overhead which we'd be passing on to implementers
+// and users of these interfaces.  Instead, we just duplicate the method
+// wrappers between those two classes.
+
+template <class Derived>
+class InputStreamConcurrencyWrapper : public InputStream {
+ public:
+  Status Close() final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoClose();
+  }
+
+  Status Abort() final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoAbort();
+  }
+
+  Result<int64_t> Tell() const final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoTell();
+  }
+
+  Result<int64_t> Read(int64_t nbytes, void* out) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoRead(nbytes, out);
+  }
+
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoRead(nbytes);
+  }
+
+  Result<std::string_view> Peek(int64_t nbytes) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoPeek(nbytes);
+  }
+
+  /*
+  Methods to implement in derived class:
+
+  Status DoClose();
+  Result<int64_t> DoTell() const;
+  Result<int64_t> DoRead(int64_t nbytes, void* out);
+  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+  And optionally:
+
+  Status DoAbort() override;
+  Result<std::string_view> DoPeek(int64_t nbytes) override;
+
+  These methods should be protected in the derived class and
+  InputStreamConcurrencyWrapper declared as a friend with
+
+  friend InputStreamConcurrencyWrapper<derived>;
+  */
+
+ protected:
+  // Default implementations.  They are virtual because the derived class may
+  // have derived classes itself.
+  virtual Status DoAbort() { return derived()->DoClose(); }
+
+  virtual Result<std::string_view> DoPeek(int64_t ARROW_ARG_UNUSED(nbytes)) {
+    return Status::NotImplemented("Peek not implemented");
+  }
+
+  Derived* derived() { return ::arrow::internal::checked_cast<Derived*>(this); }
+
+  const Derived* derived() const {
+    return ::arrow::internal::checked_cast<const Derived*>(this);
+  }
+
+  mutable SharedExclusiveChecker lock_;
+};
+
+template <class Derived>
+class RandomAccessFileConcurrencyWrapper : public RandomAccessFile {
+ public:
+  Status Close() final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoClose();
+  }
+
+  Status Abort() final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoAbort();
+  }
+
+  Result<int64_t> Tell() const final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoTell();
+  }
+
+  Result<int64_t> Read(int64_t nbytes, void* out) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoRead(nbytes, out);
+  }
+
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoRead(nbytes);
+  }
+
+  Result<std::string_view> Peek(int64_t nbytes) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoPeek(nbytes);
+  }
+
+  Status Seek(int64_t position) final {
+    auto guard = lock_.exclusive_guard();
+    return derived()->DoSeek(position);
+  }
+
+  Result<int64_t> GetSize() final {
+    auto guard = lock_.shared_guard();
+    return derived()->DoGetSize();
+  }
+
+  // NOTE: ReadAt doesn't use stream pointer, but it is allowed to update it
+  // (it's the case on Windows when using ReadFileEx).
+  // So any method that relies on the current position (even if it doesn't
+  // update it, such as Peek) cannot run in parallel with ReadAt and has
+  // to use the exclusive_guard.
+
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) final {
+    auto guard = lock_.shared_guard();
+    return derived()->DoReadAt(position, nbytes, out);
+  }
+
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) final {
+    auto guard = lock_.shared_guard();
+    return derived()->DoReadAt(position, nbytes);
+  }
+
+  /*
+  Methods to implement in derived class:
+
+  Status DoClose();
+  Result<int64_t> DoTell() const;
+  Result<int64_t> DoRead(int64_t nbytes, void* out);
+  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+  Status DoSeek(int64_t position);
+  Result<int64_t> DoGetSize()
+  Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
+  Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
+
+  And optionally:
+
+  Status DoAbort() override;
+  Result<std::string_view> DoPeek(int64_t nbytes) override;
+
+  These methods should be protected in the derived class and
+  RandomAccessFileConcurrencyWrapper declared as a friend with
+
+  friend RandomAccessFileConcurrencyWrapper<derived>;
+  */
+
+ protected:
+  // Default implementations.  They are virtual because the derived class may
+  // have derived classes itself.
+  virtual Status DoAbort() { return derived()->DoClose(); }
+
+  virtual Result<std::string_view> DoPeek(int64_t ARROW_ARG_UNUSED(nbytes)) {
+    return Status::NotImplemented("Peek not implemented");
+  }
+
+  Derived* derived() { return ::arrow::internal::checked_cast<Derived*>(this); }
+
+  const Derived* derived() const {
+    return ::arrow::internal::checked_cast<const Derived*>(this);
+  }
+
+  mutable SharedExclusiveChecker lock_;
+};
+
+}  // namespace internal
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/file.h b/pyarrow/include/arrow/io/file.h
new file mode 100644
index 0000000000000000000000000000000000000000..50d4f2c4dfc90f8ffb8061f68125b24ae82bb7ed
--- /dev/null
+++ b/pyarrow/include/arrow/io/file.h
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// IO interface implementations for OS files
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+namespace io {
+
+/// \brief An operating system file open in write-only mode.
+class ARROW_EXPORT FileOutputStream : public OutputStream {
+ public:
+  ~FileOutputStream() override;
+
+  /// \brief Open a local file for writing, truncating any existing file
+  /// \param[in] path with UTF8 encoding
+  /// \param[in] append append to existing file, otherwise truncate to 0 bytes
+  /// \return an open FileOutputStream
+  ///
+  /// When opening a new file, any existing file with the indicated path is
+  /// truncated to 0 bytes, deleting any existing data
+  static Result<std::shared_ptr<FileOutputStream>> Open(const std::string& path,
+                                                        bool append = false);
+
+  /// \brief Open a file descriptor for writing.  The underlying file isn't
+  /// truncated.
+  /// \param[in] fd file descriptor
+  /// \return an open FileOutputStream
+  ///
+  /// The file descriptor becomes owned by the OutputStream, and will be closed
+  /// on Close() or destruction.
+  static Result<std::shared_ptr<FileOutputStream>> Open(int fd);
+
+  // OutputStream interface
+  Status Close() override;
+  bool closed() const override;
+  Result<int64_t> Tell() const override;
+
+  // Write bytes to the stream. Thread-safe
+  Status Write(const void* data, int64_t nbytes) override;
+  /// \cond FALSE
+  using Writable::Write;
+  /// \endcond
+
+  int file_descriptor() const;
+
+ private:
+  FileOutputStream();
+
+  class ARROW_NO_EXPORT FileOutputStreamImpl;
+  std::unique_ptr<FileOutputStreamImpl> impl_;
+};
+
+/// \brief An operating system file open in read-only mode.
+///
+/// Reads through this implementation are unbuffered.  If many small reads
+/// need to be issued, it is recommended to use a buffering layer for good
+/// performance.
+class ARROW_EXPORT ReadableFile
+    : public internal::RandomAccessFileConcurrencyWrapper<ReadableFile> {
+ public:
+  ~ReadableFile() override;
+
+  /// \brief Open a local file for reading
+  /// \param[in] path with UTF8 encoding
+  /// \param[in] pool a MemoryPool for memory allocations
+  /// \return ReadableFile instance
+  static Result<std::shared_ptr<ReadableFile>> Open(
+      const std::string& path, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Open a local file for reading
+  /// \param[in] fd file descriptor
+  /// \param[in] pool a MemoryPool for memory allocations
+  /// \return ReadableFile instance
+  ///
+  /// The file descriptor becomes owned by the ReadableFile, and will be closed
+  /// on Close() or destruction.
+  static Result<std::shared_ptr<ReadableFile>> Open(
+      int fd, MemoryPool* pool = default_memory_pool());
+
+  bool closed() const override;
+
+  int file_descriptor() const;
+
+  Status WillNeed(const std::vector<ReadRange>& ranges) override;
+
+ private:
+  friend RandomAccessFileConcurrencyWrapper<ReadableFile>;
+
+  explicit ReadableFile(MemoryPool* pool);
+
+  Status DoClose();
+  Result<int64_t> DoTell() const;
+  Result<int64_t> DoRead(int64_t nbytes, void* buffer);
+  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+
+  /// \brief Thread-safe implementation of ReadAt
+  Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
+
+  /// \brief Thread-safe implementation of ReadAt
+  Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
+
+  Result<int64_t> DoGetSize();
+  Status DoSeek(int64_t position);
+
+  class ARROW_NO_EXPORT ReadableFileImpl;
+  std::unique_ptr<ReadableFileImpl> impl_;
+};
+
+/// \brief A file interface that uses memory-mapped files for memory interactions
+///
+/// This implementation supports zero-copy reads. The same class is used
+/// for both reading and writing.
+///
+/// If opening a file in a writable mode, it is not truncated first as with
+/// FileOutputStream.
+class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface {
+ public:
+  ~MemoryMappedFile() override;
+
+  /// Create new file with indicated size, return in read/write mode
+  static Result<std::shared_ptr<MemoryMappedFile>> Create(const std::string& path,
+                                                          int64_t size);
+
+  // mmap() with whole file
+  static Result<std::shared_ptr<MemoryMappedFile>> Open(const std::string& path,
+                                                        FileMode::type mode);
+
+  // mmap() with a region of file, the offset must be a multiple of the page size
+  static Result<std::shared_ptr<MemoryMappedFile>> Open(const std::string& path,
+                                                        FileMode::type mode,
+                                                        const int64_t offset,
+                                                        const int64_t length);
+
+  Status Close() override;
+
+  bool closed() const override;
+
+  Result<int64_t> Tell() const override;
+
+  Status Seek(int64_t position) override;
+
+  // Required by RandomAccessFile, copies memory into out. Not thread-safe
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+
+  // Zero copy read, moves position pointer. Not thread-safe
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+  // Zero-copy read, leaves position unchanged. Acquires a reader lock
+  // for the duration of slice creation (typically very short). Is thread-safe.
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+
+  // Raw copy of the memory at specified position. Thread-safe, but
+  // locks out other readers for the duration of memcpy. Prefer the
+  // zero copy method
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+
+  // Synchronous ReadAsync override
+  Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+                                            int64_t nbytes) override;
+
+  Status WillNeed(const std::vector<ReadRange>& ranges) override;
+
+  bool supports_zero_copy() const override;
+
+  /// Write data at the current position in the file. Thread-safe
+  Status Write(const void* data, int64_t nbytes) override;
+  /// \cond FALSE
+  using Writable::Write;
+  /// \endcond
+
+  /// Set the size of the map to new_size.
+  Status Resize(int64_t new_size);
+
+  /// Write data at a particular position in the file. Thread-safe
+  Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;
+
+  Result<int64_t> GetSize() override;
+
+  int file_descriptor() const;
+
+ private:
+  MemoryMappedFile();
+
+  Status WriteInternal(const void* data, int64_t nbytes);
+
+  class ARROW_NO_EXPORT MemoryMap;
+  std::shared_ptr<MemoryMap> memory_map_;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/hdfs.h b/pyarrow/include/arrow/io/hdfs.h
new file mode 100644
index 0000000000000000000000000000000000000000..46038070ae4edae9dc59760004079b596adfec51
--- /dev/null
+++ b/pyarrow/include/arrow/io/hdfs.h
@@ -0,0 +1,284 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+namespace io {
+
+class HdfsReadableFile;
+class HdfsOutputStream;
+
+/// DEPRECATED.  Use the FileSystem API in arrow::fs instead.
+struct ObjectType {
+  enum type { FILE, DIRECTORY };
+};
+
+/// DEPRECATED.  Use the FileSystem API in arrow::fs instead.
+struct ARROW_EXPORT FileStatistics {
+  /// Size of file, -1 if finding length is unsupported
+  int64_t size;
+  ObjectType::type kind;
+};
+
+class ARROW_EXPORT FileSystem {
+ public:
+  virtual ~FileSystem() = default;
+
+  virtual Status MakeDirectory(const std::string& path) = 0;
+
+  virtual Status DeleteDirectory(const std::string& path) = 0;
+
+  virtual Status GetChildren(const std::string& path,
+                             std::vector<std::string>* listing) = 0;
+
+  virtual Status Rename(const std::string& src, const std::string& dst) = 0;
+
+  virtual Status Stat(const std::string& path, FileStatistics* stat) = 0;
+};
+
+struct HdfsPathInfo {
+  ObjectType::type kind;
+
+  std::string name;
+  std::string owner;
+  std::string group;
+
+  // Access times in UNIX timestamps (seconds)
+  int64_t size;
+  int64_t block_size;
+
+  int32_t last_modified_time;
+  int32_t last_access_time;
+
+  int16_t replication;
+  int16_t permissions;
+};
+
+struct HdfsConnectionConfig {
+  std::string host;
+  int port;
+  std::string user;
+  std::string kerb_ticket;
+  std::unordered_map<std::string, std::string> extra_conf;
+};
+
+class ARROW_EXPORT HadoopFileSystem : public FileSystem {
+ public:
+  ~HadoopFileSystem() override;
+
+  // Connect to an HDFS cluster given a configuration
+  //
+  // @param config (in): configuration for connecting
+  // @param fs (out): the created client
+  // @returns Status
+  static Status Connect(const HdfsConnectionConfig* config,
+                        std::shared_ptr<HadoopFileSystem>* fs);
+
+  // Create directory and all parents
+  //
+  // @param path (in): absolute HDFS path
+  // @returns Status
+  Status MakeDirectory(const std::string& path) override;
+
+  // Delete file or directory
+  // @param path absolute path to data
+  // @param recursive if path is a directory, delete contents as well
+  // @returns error status on failure
+  Status Delete(const std::string& path, bool recursive = false);
+
+  Status DeleteDirectory(const std::string& path) override;
+
+  // Disconnect from cluster
+  //
+  // @returns Status
+  Status Disconnect();
+
+  // @param path (in): absolute HDFS path
+  // @returns bool, true if the path exists, false if not (or on error)
+  bool Exists(const std::string& path);
+
+  // @param path (in): absolute HDFS path
+  // @param info (out)
+  // @returns Status
+  Status GetPathInfo(const std::string& path, HdfsPathInfo* info);
+
+  // @param nbytes (out): total capacity of the filesystem
+  // @returns Status
+  Status GetCapacity(int64_t* nbytes);
+
+  // @param nbytes (out): total bytes used of the filesystem
+  // @returns Status
+  Status GetUsed(int64_t* nbytes);
+
+  Status GetChildren(const std::string& path, std::vector<std::string>* listing) override;
+
+  /// List directory contents
+  ///
+  /// If path is a relative path, returned values will be absolute paths or URIs
+  /// starting from the current working directory.
+  Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing);
+
+  /// Return the filesystem's current working directory.
+  ///
+  /// The working directory is the base path for all relative paths given to
+  /// other APIs.
+  /// NOTE: this actually returns a URI.
+  Status GetWorkingDirectory(std::string* out);
+
+  /// Change
+  ///
+  /// @param path file path to change
+  /// @param owner pass null for no change
+  /// @param group pass null for no change
+  Status Chown(const std::string& path, const char* owner, const char* group);
+
+  /// Change path permissions
+  ///
+  /// \param path Absolute path in file system
+  /// \param mode Mode bitset
+  /// \return Status
+  Status Chmod(const std::string& path, int mode);
+
+  // Move file or directory from source path to destination path within the
+  // current filesystem
+  Status Rename(const std::string& src, const std::string& dst) override;
+
+  Status Copy(const std::string& src, const std::string& dst);
+
+  Status Move(const std::string& src, const std::string& dst);
+
+  Status Stat(const std::string& path, FileStatistics* stat) override;
+
+  // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory
+
+  // Open an HDFS file in READ mode. Returns error
+  // status if the file is not found.
+  //
+  // @param path complete file path
+  Status OpenReadable(const std::string& path, int32_t buffer_size,
+                      std::shared_ptr<HdfsReadableFile>* file);
+
+  Status OpenReadable(const std::string& path, int32_t buffer_size,
+                      const io::IOContext& io_context,
+                      std::shared_ptr<HdfsReadableFile>* file);
+
+  Status OpenReadable(const std::string& path, std::shared_ptr<HdfsReadableFile>* file);
+
+  Status OpenReadable(const std::string& path, const io::IOContext& io_context,
+                      std::shared_ptr<HdfsReadableFile>* file);
+
+  // FileMode::WRITE options
+  // @param path complete file path
+  // @param buffer_size 0 by default
+  // @param replication 0 by default
+  // @param default_block_size 0 by default
+  Status OpenWritable(const std::string& path, bool append, int32_t buffer_size,
+                      int16_t replication, int64_t default_block_size,
+                      std::shared_ptr<HdfsOutputStream>* file);
+
+  Status OpenWritable(const std::string& path, bool append,
+                      std::shared_ptr<HdfsOutputStream>* file);
+
+ private:
+  friend class HdfsReadableFile;
+  friend class HdfsOutputStream;
+
+  class ARROW_NO_EXPORT HadoopFileSystemImpl;
+  std::unique_ptr<HadoopFileSystemImpl> impl_;
+
+  HadoopFileSystem();
+  ARROW_DISALLOW_COPY_AND_ASSIGN(HadoopFileSystem);
+};
+
+class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile {
+ public:
+  ~HdfsReadableFile() override;
+
+  Status Close() override;
+
+  bool closed() const override;
+
+  // NOTE: If you wish to read a particular range of a file in a multithreaded
+  // context, you may prefer to use ReadAt to avoid locking issues
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+
+  Status Seek(int64_t position) override;
+  Result<int64_t> Tell() const override;
+  Result<int64_t> GetSize() override;
+
+ private:
+  explicit HdfsReadableFile(const io::IOContext&);
+
+  class ARROW_NO_EXPORT HdfsReadableFileImpl;
+  std::unique_ptr<HdfsReadableFileImpl> impl_;
+
+  friend class HadoopFileSystem::HadoopFileSystemImpl;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile);
+};
+
+// Naming this file OutputStream because it does not support seeking (like the
+// WritableFile interface)
+class ARROW_EXPORT HdfsOutputStream : public OutputStream {
+ public:
+  ~HdfsOutputStream() override;
+
+  Status Close() override;
+
+  bool closed() const override;
+
+  using OutputStream::Write;
+  Status Write(const void* buffer, int64_t nbytes) override;
+
+  Status Flush() override;
+
+  Result<int64_t> Tell() const override;
+
+ private:
+  class ARROW_NO_EXPORT HdfsOutputStreamImpl;
+  std::unique_ptr<HdfsOutputStreamImpl> impl_;
+
+  friend class HadoopFileSystem::HadoopFileSystemImpl;
+
+  HdfsOutputStream();
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream);
+};
+
+ARROW_EXPORT Status HaveLibHdfs();
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/interfaces.h b/pyarrow/include/arrow/io/interfaces.h
new file mode 100644
index 0000000000000000000000000000000000000000..b36c38c6d48688a793c2588477f97648a8b550c6
--- /dev/null
+++ b/pyarrow/include/arrow/io/interfaces.h
@@ -0,0 +1,362 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+struct ReadRange {
+  int64_t offset;
+  int64_t length;
+
+  friend bool operator==(const ReadRange& left, const ReadRange& right) {
+    return (left.offset == right.offset && left.length == right.length);
+  }
+  friend bool operator!=(const ReadRange& left, const ReadRange& right) {
+    return !(left == right);
+  }
+
+  bool Contains(const ReadRange& other) const {
+    return (offset <= other.offset && offset + length >= other.offset + other.length);
+  }
+};
+
+/// EXPERIMENTAL: options provider for IO tasks
+///
+/// Includes an Executor (which will be used to execute asynchronous reads),
+/// a MemoryPool (which will be used to allocate buffers when zero copy reads
+/// are not possible), and an external id (in case the executor receives tasks from
+/// multiple sources and must distinguish tasks associated with this IOContext).
+struct ARROW_EXPORT IOContext {
+  // No specified executor: will use a global IO thread pool
+  IOContext() : IOContext(default_memory_pool(), StopToken::Unstoppable()) {}
+
+  explicit IOContext(StopToken stop_token)
+      : IOContext(default_memory_pool(), std::move(stop_token)) {}
+
+  explicit IOContext(MemoryPool* pool, StopToken stop_token = StopToken::Unstoppable());
+
+  explicit IOContext(MemoryPool* pool, ::arrow::internal::Executor* executor,
+                     StopToken stop_token = StopToken::Unstoppable(),
+                     int64_t external_id = -1)
+      : pool_(pool),
+        executor_(executor),
+        external_id_(external_id),
+        stop_token_(std::move(stop_token)) {}
+
+  explicit IOContext(::arrow::internal::Executor* executor,
+                     StopToken stop_token = StopToken::Unstoppable(),
+                     int64_t external_id = -1)
+      : pool_(default_memory_pool()),
+        executor_(executor),
+        external_id_(external_id),
+        stop_token_(std::move(stop_token)) {}
+
+  MemoryPool* pool() const { return pool_; }
+
+  ::arrow::internal::Executor* executor() const { return executor_; }
+
+  // An application-specific ID, forwarded to executor task submissions
+  int64_t external_id() const { return external_id_; }
+
+  StopToken stop_token() const { return stop_token_; }
+
+ private:
+  MemoryPool* pool_;
+  ::arrow::internal::Executor* executor_;
+  int64_t external_id_;
+  StopToken stop_token_;
+};
+
+class ARROW_EXPORT FileInterface : public std::enable_shared_from_this<FileInterface> {
+ public:
+  virtual ~FileInterface() = 0;
+
+  /// \brief Close the stream cleanly
+  ///
+  /// For writable streams, this will attempt to flush any pending data
+  /// before releasing the underlying resource.
+  ///
+  /// After Close() is called, closed() returns true and the stream is not
+  /// available for further operations.
+  virtual Status Close() = 0;
+
+  /// \brief Close the stream asynchronously
+  ///
+  /// By default, this will just submit the synchronous Close() to the
+  /// default I/O thread pool. Subclasses may implement this in a more
+  /// efficient manner.
+  virtual Future<> CloseAsync();
+
+  /// \brief Close the stream abruptly
+  ///
+  /// This method does not guarantee that any pending data is flushed.
+  /// It merely releases any underlying resource used by the stream for
+  /// its operation.
+  ///
+  /// After Abort() is called, closed() returns true and the stream is not
+  /// available for further operations.
+  virtual Status Abort();
+
+  /// \brief Return the position in this stream
+  virtual Result<int64_t> Tell() const = 0;
+
+  /// \brief Return whether the stream is closed
+  virtual bool closed() const = 0;
+
+  FileMode::type mode() const { return mode_; }
+
+ protected:
+  FileInterface() : mode_(FileMode::READ) {}
+  FileMode::type mode_;
+  void set_mode(FileMode::type mode) { mode_ = mode; }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(FileInterface);
+};
+
+class ARROW_EXPORT Seekable {
+ public:
+  virtual ~Seekable() = default;
+  virtual Status Seek(int64_t position) = 0;
+};
+
+class ARROW_EXPORT Writable {
+ public:
+  virtual ~Writable() = default;
+
+  /// \brief Write the given data to the stream
+  ///
+  /// This method always processes the bytes in full.  Depending on the
+  /// semantics of the stream, the data may be written out immediately,
+  /// held in a buffer, or written asynchronously.  In the case where
+  /// the stream buffers the data, it will be copied.  To avoid potentially
+  /// large copies, use the Write variant that takes an owned Buffer.
+  virtual Status Write(const void* data, int64_t nbytes) = 0;
+
+  /// \brief Write the given data to the stream
+  ///
+  /// Since the Buffer owns its memory, this method can avoid a copy if
+  /// buffering is required.  See Write(const void*, int64_t) for details.
+  virtual Status Write(const std::shared_ptr<Buffer>& data);
+
+  /// \brief Flush buffered bytes, if any
+  virtual Status Flush();
+
+  Status Write(std::string_view data);
+};
+
+class ARROW_EXPORT Readable {
+ public:
+  virtual ~Readable() = default;
+
+  /// \brief Read data from current file position.
+  ///
+  /// Read at most `nbytes` from the current file position into `out`.
+  /// The number of bytes read is returned.
+  virtual Result<int64_t> Read(int64_t nbytes, void* out) = 0;
+
+  /// \brief Read data from current file position.
+  ///
+  /// Read at most `nbytes` from the current file position. Less bytes may
+  /// be read if EOF is reached. This method updates the current file position.
+  ///
+  /// In some cases (e.g. a memory-mapped file), this method may avoid a
+  /// memory copy.
+  virtual Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) = 0;
+
+  /// EXPERIMENTAL: The IOContext associated with this file.
+  ///
+  /// By default, this is the same as default_io_context(), but it may be
+  /// overridden by subclasses.
+  virtual const IOContext& io_context() const;
+};
+
+class ARROW_EXPORT OutputStream : virtual public FileInterface, public Writable {
+ protected:
+  OutputStream() = default;
+};
+
+class ARROW_EXPORT InputStream : virtual public FileInterface, virtual public Readable {
+ public:
+  /// \brief Advance or skip stream indicated number of bytes
+  /// \param[in] nbytes the number to move forward
+  /// \return Status
+  Status Advance(int64_t nbytes);
+
+  /// \brief Return zero-copy string_view to upcoming bytes.
+  ///
+  /// Do not modify the stream position.  The view becomes invalid after
+  /// any operation on the stream.  May trigger buffering if the requested
+  /// size is larger than the number of buffered bytes.
+  ///
+  /// May return NotImplemented on streams that don't support it.
+  ///
+  /// \param[in] nbytes the maximum number of bytes to see
+  virtual Result<std::string_view> Peek(int64_t nbytes);
+
+  /// \brief Return true if InputStream is capable of zero copy Buffer reads
+  ///
+  /// Zero copy reads imply the use of Buffer-returning Read() overloads.
+  virtual bool supports_zero_copy() const;
+
+  /// \brief Read and return stream metadata
+  ///
+  /// If the stream implementation doesn't support metadata, empty metadata
+  /// is returned.  Note that it is allowed to return a null pointer rather
+  /// than an allocated empty metadata.
+  virtual Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata();
+
+  /// \brief Read stream metadata asynchronously
+  virtual Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+      const IOContext& io_context);
+  Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync();
+
+ protected:
+  InputStream() = default;
+};
+
+class ARROW_EXPORT RandomAccessFile : public InputStream, public Seekable {
+ public:
+  /// Necessary because we hold a std::unique_ptr
+  ~RandomAccessFile() override;
+
+  /// \brief Create an isolated InputStream that reads a segment of a
+  /// RandomAccessFile. Multiple such stream can be created and used
+  /// independently without interference
+  /// \param[in] file a file instance
+  /// \param[in] file_offset the starting position in the file
+  /// \param[in] nbytes the extent of bytes to read. The file should have
+  /// sufficient bytes available
+  static Result<std::shared_ptr<InputStream>> GetStream(
+      std::shared_ptr<RandomAccessFile> file, int64_t file_offset, int64_t nbytes);
+
+  /// \brief Return the total file size in bytes.
+  ///
+  /// This method does not read or move the current file position, so is safe
+  /// to call concurrently with e.g. ReadAt().
+  virtual Result<int64_t> GetSize() = 0;
+
+  /// \brief Read data from given file position.
+  ///
+  /// At most `nbytes` bytes are read.  The number of bytes read is returned
+  /// (it can be less than `nbytes` if EOF is reached).
+  ///
+  /// This method can be safely called from multiple threads concurrently.
+  /// It is unspecified whether this method updates the file position or not.
+  ///
+  /// The default RandomAccessFile-provided implementation uses Seek() and Read(),
+  /// but subclasses may override it with a more efficient implementation
+  /// that doesn't depend on implicit file positioning.
+  ///
+  /// \param[in] position Where to read bytes from
+  /// \param[in] nbytes The number of bytes to read
+  /// \param[out] out The buffer to read bytes into
+  /// \return The number of bytes read, or an error
+  virtual Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out);
+
+  /// \brief Read data from given file position.
+  ///
+  /// At most `nbytes` bytes are read, but it can be less if EOF is reached.
+  ///
+  /// \param[in] position Where to read bytes from
+  /// \param[in] nbytes The number of bytes to read
+  /// \return A buffer containing the bytes read, or an error
+  virtual Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes);
+
+  /// EXPERIMENTAL: Read data asynchronously.
+  virtual Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+                                                    int64_t nbytes);
+
+  /// EXPERIMENTAL: Read data asynchronously, using the file's IOContext.
+  Future<std::shared_ptr<Buffer>> ReadAsync(int64_t position, int64_t nbytes);
+
+  /// EXPERIMENTAL: Explicit multi-read.
+  /// \brief Request multiple reads at once
+  ///
+  /// The underlying filesystem may optimize these reads by coalescing small reads into
+  /// large reads or by breaking up large reads into multiple parallel smaller reads.  The
+  /// reads should be issued in parallel if it makes sense for the filesystem.
+  ///
+  /// One future will be returned for each input read range.  Multiple returned futures
+  /// may correspond to a single read.  Or, a single returned future may be a combined
+  /// result of several individual reads.
+  ///
+  /// \param[in] ranges The ranges to read
+  /// \return A future that will complete with the data from the requested range is
+  /// available
+  virtual std::vector<Future<std::shared_ptr<Buffer>>> ReadManyAsync(
+      const IOContext&, const std::vector<ReadRange>& ranges);
+
+  /// EXPERIMENTAL: Explicit multi-read, using the file's IOContext.
+  std::vector<Future<std::shared_ptr<Buffer>>> ReadManyAsync(
+      const std::vector<ReadRange>& ranges);
+
+  /// EXPERIMENTAL: Inform that the given ranges may be read soon.
+  ///
+  /// Some implementations might arrange to prefetch some of the data.
+  /// However, no guarantee is made and the default implementation does nothing.
+  /// For robust prefetching, use ReadAt() or ReadAsync().
+  virtual Status WillNeed(const std::vector<ReadRange>& ranges);
+
+ protected:
+  RandomAccessFile();
+
+ private:
+  struct ARROW_NO_EXPORT Impl;
+  std::unique_ptr<Impl> interface_impl_;
+};
+
+class ARROW_EXPORT WritableFile : public OutputStream, public Seekable {
+ public:
+  virtual Status WriteAt(int64_t position, const void* data, int64_t nbytes) = 0;
+
+ protected:
+  WritableFile() = default;
+};
+
+class ARROW_EXPORT ReadWriteFileInterface : public RandomAccessFile, public WritableFile {
+ protected:
+  ReadWriteFileInterface() { RandomAccessFile::set_mode(FileMode::READWRITE); }
+};
+
+/// \brief Return an iterator on an input stream
+///
+/// The iterator yields a fixed-size block on each Next() call, except the
+/// last block in the stream which may be smaller.
+/// Once the end of stream is reached, Next() returns nullptr
+/// (unlike InputStream::Read() which returns an empty buffer).
+ARROW_EXPORT
+Result<Iterator<std::shared_ptr<Buffer>>> MakeInputStreamIterator(
+    std::shared_ptr<InputStream> stream, int64_t block_size);
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/memory.h b/pyarrow/include/arrow/io/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ce0204654d0e33cdce28cb4a3e39148fdee8f8f
--- /dev/null
+++ b/pyarrow/include/arrow/io/memory.h
@@ -0,0 +1,197 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Public API for different memory sharing / IO mechanisms
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "arrow/io/concurrency.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+namespace io {
+
+/// \brief An output stream that writes to a resizable buffer
+class ARROW_EXPORT BufferOutputStream : public OutputStream {
+ public:
+  explicit BufferOutputStream(const std::shared_ptr<ResizableBuffer>& buffer);
+
+  /// \brief Create in-memory output stream with indicated capacity using a
+  /// memory pool
+  /// \param[in] initial_capacity the initial allocated internal capacity of
+  /// the OutputStream
+  /// \param[in,out] pool a MemoryPool to use for allocations
+  /// \return the created stream
+  static Result<std::shared_ptr<BufferOutputStream>> Create(
+      int64_t initial_capacity = 4096, MemoryPool* pool = default_memory_pool());
+
+  ~BufferOutputStream() override;
+
+  // Implement the OutputStream interface
+
+  /// Close the stream, preserving the buffer (retrieve it with Finish()).
+  Status Close() override;
+  bool closed() const override;
+  Result<int64_t> Tell() const override;
+  Status Write(const void* data, int64_t nbytes) override;
+
+  /// \cond FALSE
+  using OutputStream::Write;
+  /// \endcond
+
+  /// Close the stream and return the buffer
+  Result<std::shared_ptr<Buffer>> Finish();
+
+  /// \brief Initialize state of OutputStream with newly allocated memory and
+  /// set position to 0
+  /// \param[in] initial_capacity the starting allocated capacity
+  /// \param[in,out] pool the memory pool to use for allocations
+  /// \return Status
+  Status Reset(int64_t initial_capacity = 1024, MemoryPool* pool = default_memory_pool());
+
+  int64_t capacity() const { return capacity_; }
+
+ private:
+  BufferOutputStream();
+
+  // Ensures there is sufficient space available to write nbytes
+  Status Reserve(int64_t nbytes);
+
+  std::shared_ptr<ResizableBuffer> buffer_;
+  bool is_open_;
+  int64_t capacity_;
+  int64_t position_;
+  uint8_t* mutable_data_;
+};
+
+/// \brief A helper class to track the size of allocations
+///
+/// Writes to this stream do not copy or retain any data, they just bump
+/// a size counter that can be later used to know exactly which data size
+/// needs to be allocated for actual writing.
+class ARROW_EXPORT MockOutputStream : public OutputStream {
+ public:
+  MockOutputStream() : extent_bytes_written_(0), is_open_(true) {}
+
+  // Implement the OutputStream interface
+  Status Close() override;
+  bool closed() const override;
+  Result<int64_t> Tell() const override;
+  Status Write(const void* data, int64_t nbytes) override;
+  /// \cond FALSE
+  using Writable::Write;
+  /// \endcond
+
+  int64_t GetExtentBytesWritten() const { return extent_bytes_written_; }
+
+ private:
+  int64_t extent_bytes_written_;
+  bool is_open_;
+};
+
+/// \brief An output stream that writes into a fixed-size mutable buffer
+class ARROW_EXPORT FixedSizeBufferWriter : public WritableFile {
+ public:
+  /// Input buffer must be mutable, will abort if not
+  explicit FixedSizeBufferWriter(const std::shared_ptr<Buffer>& buffer);
+  ~FixedSizeBufferWriter() override;
+
+  Status Close() override;
+  bool closed() const override;
+  Status Seek(int64_t position) override;
+  Result<int64_t> Tell() const override;
+  Status Write(const void* data, int64_t nbytes) override;
+  /// \cond FALSE
+  using Writable::Write;
+  /// \endcond
+
+  Status WriteAt(int64_t position, const void* data, int64_t nbytes) override;
+
+  void set_memcopy_threads(int num_threads);
+  void set_memcopy_blocksize(int64_t blocksize);
+  void set_memcopy_threshold(int64_t threshold);
+
+ protected:
+  class FixedSizeBufferWriterImpl;
+  std::unique_ptr<FixedSizeBufferWriterImpl> impl_;
+};
+
+/// \class BufferReader
+/// \brief Random access zero-copy reads on an arrow::Buffer
+class ARROW_EXPORT BufferReader
+    : public internal::RandomAccessFileConcurrencyWrapper<BufferReader> {
+ public:
+  /// \brief Instantiate from std::shared_ptr<Buffer>.
+  ///
+  /// This is a zero-copy constructor.
+  explicit BufferReader(std::shared_ptr<Buffer> buffer);
+
+  /// \brief Instantiate from std::string. Owns data.
+  static std::unique_ptr<BufferReader> FromString(std::string data);
+
+  bool closed() const override;
+
+  bool supports_zero_copy() const override;
+
+  std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
+  // Synchronous ReadAsync override
+  Future<std::shared_ptr<Buffer>> ReadAsync(const IOContext&, int64_t position,
+                                            int64_t nbytes) override;
+  Status WillNeed(const std::vector<ReadRange>& ranges) override;
+
+ protected:
+  friend RandomAccessFileConcurrencyWrapper<BufferReader>;
+
+  Status DoClose();
+
+  Result<int64_t> DoRead(int64_t nbytes, void* buffer);
+  Result<std::shared_ptr<Buffer>> DoRead(int64_t nbytes);
+  Result<int64_t> DoReadAt(int64_t position, int64_t nbytes, void* out);
+  Result<std::shared_ptr<Buffer>> DoReadAt(int64_t position, int64_t nbytes);
+  Result<std::string_view> DoPeek(int64_t nbytes) override;
+
+  Result<int64_t> DoTell() const;
+  Status DoSeek(int64_t position);
+  Result<int64_t> DoGetSize();
+
+  Status CheckClosed() const {
+    if (!is_open_) {
+      return Status::Invalid("Operation forbidden on closed BufferReader");
+    }
+    return Status::OK();
+  }
+
+  std::shared_ptr<Buffer> buffer_;
+  const uint8_t* data_;
+  int64_t size_;
+  int64_t position_;
+  bool is_open_;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/mman.h b/pyarrow/include/arrow/io/mman.h
new file mode 100644
index 0000000000000000000000000000000000000000..04d450cbff5130a2a09fb8a792338b482c1ed2be
--- /dev/null
+++ b/pyarrow/include/arrow/io/mman.h
@@ -0,0 +1,169 @@
+// Copyright https://code.google.com/p/mman-win32/
+//
+// Licensed under the MIT License;
+// You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/MIT
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+
+#include <errno.h>
+#include <io.h>
+#include <sys/types.h>
+
+#include <cstdint>
+
+#define PROT_NONE 0
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define PROT_EXEC 4
+
+#define MAP_FILE 0
+#define MAP_SHARED 1
+#define MAP_PRIVATE 2
+#define MAP_TYPE 0xf
+#define MAP_FIXED 0x10
+#define MAP_ANONYMOUS 0x20
+#define MAP_ANON MAP_ANONYMOUS
+
+#define MAP_FAILED ((void*)-1)
+
+/* Flags for msync. */
+#define MS_ASYNC 1
+#define MS_SYNC 2
+#define MS_INVALIDATE 4
+
+#ifndef FILE_MAP_EXECUTE
+#  define FILE_MAP_EXECUTE 0x0020
+#endif
+
+static inline int __map_mman_error(const DWORD err, const int deferr) {
+  if (err == 0) return 0;
+  // TODO: implement
+  return err;
+}
+
+static inline DWORD __map_mmap_prot_page(const int prot) {
+  DWORD protect = 0;
+
+  if (prot == PROT_NONE) return protect;
+
+  if ((prot & PROT_EXEC) != 0) {
+    protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;
+  } else {
+    protect = ((prot & PROT_WRITE) != 0) ? PAGE_READWRITE : PAGE_READONLY;
+  }
+
+  return protect;
+}
+
+static inline DWORD __map_mmap_prot_file(const int prot) {
+  DWORD desiredAccess = 0;
+
+  if (prot == PROT_NONE) return desiredAccess;
+
+  if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ;
+  if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE;
+  if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE;
+
+  return desiredAccess;
+}
+
+static inline void* mmap(void* addr, size_t len, int prot, int flags, int fildes,
+                         off_t off) {
+  HANDLE fm, h;
+
+  void* map = MAP_FAILED;
+  const uint64_t off64 = static_cast<uint64_t>(off);
+  const uint64_t maxSize = off64 + len;
+
+  const DWORD dwFileOffsetLow = static_cast<DWORD>(off64 & 0xFFFFFFFFUL);
+  const DWORD dwFileOffsetHigh = static_cast<DWORD>((off64 >> 32) & 0xFFFFFFFFUL);
+  const DWORD dwMaxSizeLow = static_cast<DWORD>(maxSize & 0xFFFFFFFFUL);
+  const DWORD dwMaxSizeHigh = static_cast<DWORD>((maxSize >> 32) & 0xFFFFFFFFUL);
+
+  const DWORD protect = __map_mmap_prot_page(prot);
+  const DWORD desiredAccess = __map_mmap_prot_file(prot);
+
+  errno = 0;
+
+  if (len == 0
+      /* Unsupported flag combinations */
+      || (flags & MAP_FIXED) != 0
+      /* Unsupported protection combinations */
+      || prot == PROT_EXEC) {
+    errno = EINVAL;
+    return MAP_FAILED;
+  }
+
+  h = ((flags & MAP_ANONYMOUS) == 0) ? (HANDLE)_get_osfhandle(fildes)
+                                     : INVALID_HANDLE_VALUE;
+
+  if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) {
+    errno = EBADF;
+    return MAP_FAILED;
+  }
+
+  fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);
+
+  if (fm == NULL) {
+    errno = __map_mman_error(GetLastError(), EPERM);
+    return MAP_FAILED;
+  }
+
+  map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);
+
+  CloseHandle(fm);
+
+  if (map == NULL) {
+    errno = __map_mman_error(GetLastError(), EPERM);
+    return MAP_FAILED;
+  }
+
+  return map;
+}
+
+static inline int munmap(void* addr, size_t len) {
+  if (UnmapViewOfFile(addr)) return 0;
+
+  errno = __map_mman_error(GetLastError(), EPERM);
+
+  return -1;
+}
+
+static inline int mprotect(void* addr, size_t len, int prot) {
+  DWORD newProtect = __map_mmap_prot_page(prot);
+  DWORD oldProtect = 0;
+
+  if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0;
+
+  errno = __map_mman_error(GetLastError(), EPERM);
+
+  return -1;
+}
+
+static inline int msync(void* addr, size_t len, int flags) {
+  if (FlushViewOfFile(addr, len)) return 0;
+
+  errno = __map_mman_error(GetLastError(), EPERM);
+
+  return -1;
+}
+
+static inline int mlock(const void* addr, size_t len) {
+  if (VirtualLock((LPVOID)addr, len)) return 0;
+
+  errno = __map_mman_error(GetLastError(), EPERM);
+
+  return -1;
+}
+
+static inline int munlock(const void* addr, size_t len) {
+  if (VirtualUnlock((LPVOID)addr, len)) return 0;
+
+  errno = __map_mman_error(GetLastError(), EPERM);
+
+  return -1;
+}
diff --git a/pyarrow/include/arrow/io/slow.h b/pyarrow/include/arrow/io/slow.h
new file mode 100644
index 0000000000000000000000000000000000000000..fdcc56dfa6af622fcfd9fd10984c1d0a87414149
--- /dev/null
+++ b/pyarrow/include/arrow/io/slow.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Slow stream implementations, mainly for testing and benchmarking
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class Status;
+
+namespace io {
+
+class ARROW_EXPORT LatencyGenerator {
+ public:
+  virtual ~LatencyGenerator();
+
+  void Sleep();
+
+  virtual double NextLatency() = 0;
+
+  static std::shared_ptr<LatencyGenerator> Make(double average_latency);
+  static std::shared_ptr<LatencyGenerator> Make(double average_latency, int32_t seed);
+};
+
+// XXX use ConcurrencyWrapper?  It could increase chances of finding a race.
+
+template <class StreamType>
+class SlowInputStreamBase : public StreamType {
+ public:
+  SlowInputStreamBase(std::shared_ptr<StreamType> stream,
+                      std::shared_ptr<LatencyGenerator> latencies)
+      : stream_(std::move(stream)), latencies_(std::move(latencies)) {}
+
+  SlowInputStreamBase(std::shared_ptr<StreamType> stream, double average_latency)
+      : stream_(std::move(stream)), latencies_(LatencyGenerator::Make(average_latency)) {}
+
+  SlowInputStreamBase(std::shared_ptr<StreamType> stream, double average_latency,
+                      int32_t seed)
+      : stream_(std::move(stream)),
+        latencies_(LatencyGenerator::Make(average_latency, seed)) {}
+
+ protected:
+  std::shared_ptr<StreamType> stream_;
+  std::shared_ptr<LatencyGenerator> latencies_;
+};
+
+/// \brief An InputStream wrapper that makes reads slower.
+///
+/// Read() calls are made slower by an average latency (in seconds).
+/// Actual latencies form a normal distribution closely centered
+/// on the average latency.
+/// Other calls are forwarded directly.
+class ARROW_EXPORT SlowInputStream : public SlowInputStreamBase<InputStream> {
+ public:
+  ~SlowInputStream() override;
+
+  using SlowInputStreamBase<InputStream>::SlowInputStreamBase;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+  Result<std::string_view> Peek(int64_t nbytes) override;
+
+  Result<int64_t> Tell() const override;
+};
+
+/// \brief A RandomAccessFile wrapper that makes reads slower.
+///
+/// Similar to SlowInputStream, but allows random access and seeking.
+class ARROW_EXPORT SlowRandomAccessFile : public SlowInputStreamBase<RandomAccessFile> {
+ public:
+  ~SlowRandomAccessFile() override;
+
+  using SlowInputStreamBase<RandomAccessFile>::SlowInputStreamBase;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+  Result<std::string_view> Peek(int64_t nbytes) override;
+
+  Result<int64_t> GetSize() override;
+  Status Seek(int64_t position) override;
+  Result<int64_t> Tell() const override;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/stdio.h b/pyarrow/include/arrow/io/stdio.h
new file mode 100644
index 0000000000000000000000000000000000000000..9484ac7712427733862ecbc7d9ee932c5dfc0907
--- /dev/null
+++ b/pyarrow/include/arrow/io/stdio.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+// Output stream that just writes to stdout.
+class ARROW_EXPORT StdoutStream : public OutputStream {
+ public:
+  StdoutStream();
+  ~StdoutStream() override {}
+
+  Status Close() override;
+  bool closed() const override;
+
+  Result<int64_t> Tell() const override;
+
+  Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+  int64_t pos_;
+};
+
+// Output stream that just writes to stderr.
+class ARROW_EXPORT StderrStream : public OutputStream {
+ public:
+  StderrStream();
+  ~StderrStream() override {}
+
+  Status Close() override;
+  bool closed() const override;
+
+  Result<int64_t> Tell() const override;
+
+  Status Write(const void* data, int64_t nbytes) override;
+
+ private:
+  int64_t pos_;
+};
+
+// Input stream that just reads from stdin.
+class ARROW_EXPORT StdinStream : public InputStream {
+ public:
+  StdinStream();
+  ~StdinStream() override {}
+
+  Status Close() override;
+  bool closed() const override;
+
+  Result<int64_t> Tell() const override;
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+ private:
+  int64_t pos_;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/test_common.h b/pyarrow/include/arrow/io/test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ee72a05cf825b45ecddbb8163abf6c2bdfc76d1
--- /dev/null
+++ b/pyarrow/include/arrow/io/test_common.h
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+namespace io {
+
+class MemoryMappedFile;
+
+ARROW_TESTING_EXPORT
+void AssertFileContents(const std::string& path, const std::string& contents);
+
+ARROW_TESTING_EXPORT bool FileExists(const std::string& path);
+
+ARROW_TESTING_EXPORT Status PurgeLocalFileFromOsCache(const std::string& path);
+
+ARROW_TESTING_EXPORT
+Status ZeroMemoryMap(MemoryMappedFile* file);
+
+class ARROW_TESTING_EXPORT MemoryMapFixture {
+ public:
+  void TearDown();
+
+  void CreateFile(const std::string& path, int64_t size);
+
+  Result<std::shared_ptr<MemoryMappedFile>> InitMemoryMap(int64_t size,
+                                                          const std::string& path);
+
+  void AppendFile(const std::string& path);
+
+ private:
+  std::vector<std::string> tmp_files_;
+};
+
+class ARROW_TESTING_EXPORT TrackedRandomAccessFile : public io::RandomAccessFile {
+ public:
+  virtual int64_t num_reads() const = 0;
+  virtual int64_t bytes_read() const = 0;
+  virtual const std::vector<io::ReadRange>& get_read_ranges() const = 0;
+  virtual void ResetStats() = 0;
+
+  static std::unique_ptr<TrackedRandomAccessFile> Make(io::RandomAccessFile* target);
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/transform.h b/pyarrow/include/arrow/io/transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..7afe29b10194efa39fec8e3b2008e16e5a3ee8e8
--- /dev/null
+++ b/pyarrow/include/arrow/io/transform.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Transform stream implementations
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+class ARROW_EXPORT TransformInputStream : public InputStream {
+ public:
+  using TransformFunc =
+      std::function<Result<std::shared_ptr<Buffer>>(const std::shared_ptr<Buffer>&)>;
+
+  TransformInputStream(std::shared_ptr<InputStream> wrapped, TransformFunc transform);
+  ~TransformInputStream() override;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+  Result<std::shared_ptr<const KeyValueMetadata>> ReadMetadata() override;
+  Future<std::shared_ptr<const KeyValueMetadata>> ReadMetadataAsync(
+      const IOContext& io_context) override;
+
+  Result<int64_t> Tell() const override;
+
+ protected:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/io/type_fwd.h b/pyarrow/include/arrow/io/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1b9e626bba289a030d87d0a14bfa2f1fb2dc29d
--- /dev/null
+++ b/pyarrow/include/arrow/io/type_fwd.h
@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace io {
+
+struct FileMode {
+  enum type { READ, WRITE, READWRITE };
+};
+
+struct IOContext;
+struct CacheOptions;
+
+/// EXPERIMENTAL: convenience global singleton for default IOContext settings
+ARROW_EXPORT
+const IOContext& default_io_context();
+
+/// \brief Get the capacity of the global I/O thread pool
+///
+/// Return the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks.  This is an ideal number,
+/// not necessarily the exact number of threads at a given point in time.
+///
+/// You can change this number using SetIOThreadPoolCapacity().
+ARROW_EXPORT int GetIOThreadPoolCapacity();
+
+/// \brief Set the capacity of the global I/O thread pool
+///
+/// Set the number of worker threads in the thread pool to which
+/// Arrow dispatches various I/O-bound tasks.
+///
+/// The current number is returned by GetIOThreadPoolCapacity().
+ARROW_EXPORT Status SetIOThreadPoolCapacity(int threads);
+
+class FileInterface;
+class Seekable;
+class Writable;
+class Readable;
+class OutputStream;
+class FileOutputStream;
+class InputStream;
+class ReadableFile;
+class RandomAccessFile;
+class MemoryMappedFile;
+class WritableFile;
+class ReadWriteFileInterface;
+
+class LatencyGenerator;
+
+class BufferOutputStream;
+class BufferReader;
+class CompressedInputStream;
+class CompressedOutputStream;
+class BufferedInputStream;
+class BufferedOutputStream;
+
+}  // namespace io
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/api.h b/pyarrow/include/arrow/ipc/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..3047180fb1a20d11279654ebc6b905dadcf968a0
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/api.h
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/ipc/dictionary.h"
+#include "arrow/ipc/feather.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/reader.h"
+#include "arrow/ipc/writer.h"
diff --git a/pyarrow/include/arrow/ipc/dictionary.h b/pyarrow/include/arrow/ipc/dictionary.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4287cb19747fa60f5d728b6afb2bcab30443bfd
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/dictionary.h
@@ -0,0 +1,177 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Tools for dictionaries in IPC context
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace ipc {
+
+namespace internal {
+
+class FieldPosition {
+ public:
+  FieldPosition() : parent_(NULLPTR), index_(-1), depth_(0) {}
+
+  FieldPosition child(int index) const { return {this, index}; }
+
+  std::vector<int> path() const {
+    std::vector<int> path(depth_);
+    const FieldPosition* cur = this;
+    for (int i = depth_ - 1; i >= 0; --i) {
+      path[i] = cur->index_;
+      cur = cur->parent_;
+    }
+    return path;
+  }
+
+ protected:
+  FieldPosition(const FieldPosition* parent, int index)
+      : parent_(parent), index_(index), depth_(parent->depth_ + 1) {}
+
+  const FieldPosition* parent_;
+  int index_;
+  int depth_;
+};
+
+}  // namespace internal
+
+/// \brief Map fields in a schema to dictionary ids
+///
+/// The mapping is structural, i.e. the field path (as a vector of indices)
+/// is associated to the dictionary id.  A dictionary id may be associated
+/// to multiple fields.
+class ARROW_EXPORT DictionaryFieldMapper {
+ public:
+  DictionaryFieldMapper();
+  explicit DictionaryFieldMapper(const Schema& schema);
+  ~DictionaryFieldMapper();
+
+  Status AddSchemaFields(const Schema& schema);
+  Status AddField(int64_t id, std::vector<int> field_path);
+
+  Result<int64_t> GetFieldId(std::vector<int> field_path) const;
+
+  int num_fields() const;
+
+  /// \brief Returns number of unique dictionaries, taking into
+  /// account that different fields can share the same dictionary.
+  int num_dicts() const;
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+using DictionaryVector = std::vector<std::pair<int64_t, std::shared_ptr<Array>>>;
+
+/// \brief Memoization data structure for reading dictionaries from IPC streams
+///
+/// This structure tracks the following associations:
+/// - field position (structural) -> dictionary id
+/// - dictionary id -> value type
+/// - dictionary id -> dictionary (value) data
+///
+/// Together, they allow resolving dictionary data when reading an IPC stream,
+/// using metadata recorded in the schema message and data recorded in the
+/// dictionary batch messages (see ResolveDictionaries).
+///
+/// This structure isn't useful for writing an IPC stream, where only
+/// DictionaryFieldMapper is necessary.
+class ARROW_EXPORT DictionaryMemo {
+ public:
+  DictionaryMemo();
+  ~DictionaryMemo();
+
+  DictionaryFieldMapper& fields();
+  const DictionaryFieldMapper& fields() const;
+
+  /// \brief Return current dictionary corresponding to a particular
+  /// id. Returns KeyError if id not found
+  Result<std::shared_ptr<ArrayData>> GetDictionary(int64_t id, MemoryPool* pool) const;
+
+  /// \brief Return dictionary value type corresponding to a
+  /// particular dictionary id.
+  Result<std::shared_ptr<DataType>> GetDictionaryType(int64_t id) const;
+
+  /// \brief Return true if we have a dictionary for the input id
+  bool HasDictionary(int64_t id) const;
+
+  /// \brief Add a dictionary value type to the memo with a particular id.
+  /// Returns KeyError if a different type is already registered with the same id.
+  Status AddDictionaryType(int64_t id, const std::shared_ptr<DataType>& type);
+
+  /// \brief Add a dictionary to the memo with a particular id. Returns
+  /// KeyError if that dictionary already exists
+  Status AddDictionary(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
+
+  /// \brief Append a dictionary delta to the memo with a particular id. Returns
+  /// KeyError if that dictionary does not exists
+  Status AddDictionaryDelta(int64_t id, const std::shared_ptr<ArrayData>& dictionary);
+
+  /// \brief Add a dictionary to the memo if it does not have one with the id,
+  /// otherwise, replace the dictionary with the new one.
+  ///
+  /// Return true if the dictionary was added, false if replaced.
+  Result<bool> AddOrReplaceDictionary(int64_t id,
+                                      const std::shared_ptr<ArrayData>& dictionary);
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+// For writing: collect dictionary entries to write to the IPC stream, in order
+// (i.e. inner dictionaries before dependent outer dictionaries).
+ARROW_EXPORT
+Result<DictionaryVector> CollectDictionaries(const RecordBatch& batch,
+                                             const DictionaryFieldMapper& mapper);
+
+// For reading: resolve all dictionaries in columns, according to the field
+// mapping and dictionary arrays stored in memo.
+// Columns may be sparse, i.e. some entries may be left null
+// (e.g. if an inclusion mask was used).
+ARROW_EXPORT
+Status ResolveDictionaries(const ArrayDataVector& columns, const DictionaryMemo& memo,
+                           MemoryPool* pool);
+
+namespace internal {
+
+// Like CollectDictionaries above, but uses the memo's DictionaryFieldMapper
+// and all collected dictionaries are added to the memo using AddDictionary.
+//
+// This is used as a shortcut in some roundtripping tests (to avoid emitting
+// any actual dictionary batches).
+ARROW_EXPORT
+Status CollectDictionaries(const RecordBatch& batch, DictionaryMemo* memo);
+
+}  // namespace internal
+
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/feather.h b/pyarrow/include/arrow/ipc/feather.h
new file mode 100644
index 0000000000000000000000000000000000000000..da88ee22f8291f81da3046e3c6e5844a5021be4d
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/feather.h
@@ -0,0 +1,150 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Public API for the "Feather" file format, originally created at
+// http://github.com/wesm/feather
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/ipc/options.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Schema;
+class Status;
+class Table;
+
+namespace io {
+
+class OutputStream;
+class RandomAccessFile;
+
+}  // namespace io
+
+namespace ipc {
+namespace feather {
+
+static constexpr const int kFeatherV1Version = 2;
+static constexpr const int kFeatherV2Version = 3;
+
+// ----------------------------------------------------------------------
+// Metadata accessor classes
+
+/// \class Reader
+/// \brief An interface for reading columns from Feather files
+class ARROW_EXPORT Reader {
+ public:
+  virtual ~Reader() = default;
+
+  /// \brief Open a Feather file from a RandomAccessFile interface
+  ///
+  /// \param[in] source a RandomAccessFile instance
+  /// \return the table reader
+  static Result<std::shared_ptr<Reader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& source);
+
+  /// \brief Open a Feather file from a RandomAccessFile interface
+  /// with IPC Read options
+  ///
+  /// \param[in] source a RandomAccessFile instance
+  /// \param[in] options IPC Read options
+  /// \return the table reader
+  static Result<std::shared_ptr<Reader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& source, const IpcReadOptions& options);
+
+  /// \brief Return the version number of the Feather file
+  virtual int version() const = 0;
+
+  virtual std::shared_ptr<Schema> schema() const = 0;
+
+  /// \brief Read all columns from the file as an arrow::Table.
+  ///
+  /// \param[out] out the returned table
+  /// \return Status
+  ///
+  /// This function is zero-copy if the file source supports zero-copy reads
+  virtual Status Read(std::shared_ptr<Table>* out) = 0;
+
+  /// \brief Read only the specified columns from the file as an arrow::Table.
+  ///
+  /// \param[in] indices the column indices to read
+  /// \param[out] out the returned table
+  /// \return Status
+  ///
+  /// This function is zero-copy if the file source supports zero-copy reads
+  virtual Status Read(const std::vector<int>& indices, std::shared_ptr<Table>* out) = 0;
+
+  /// \brief Read only the specified columns from the file as an arrow::Table.
+  ///
+  /// \param[in] names the column names to read
+  /// \param[out] out the returned table
+  /// \return Status
+  ///
+  /// This function is zero-copy if the file source supports zero-copy reads
+  virtual Status Read(const std::vector<std::string>& names,
+                      std::shared_ptr<Table>* out) = 0;
+};
+
+struct ARROW_EXPORT WriteProperties {
+  static WriteProperties Defaults();
+
+  static WriteProperties DefaultsV1() {
+    WriteProperties props = Defaults();
+    props.version = kFeatherV1Version;
+    return props;
+  }
+
+  /// Feather file version number
+  ///
+  /// version 2: "Feather V1" Apache Arrow <= 0.16.0
+  /// version 3: "Feather V2" Apache Arrow > 0.16.0
+  int version = kFeatherV2Version;
+
+  // Parameters for Feather V2 only
+
+  /// Number of rows per intra-file chunk. Use smaller chunksize when you need
+  /// faster random row access
+  int64_t chunksize = 1LL << 16;
+
+  /// Compression type to use. Only UNCOMPRESSED, LZ4_FRAME, and ZSTD are
+  /// supported. The default compression returned by Defaults() is LZ4 if the
+  /// project is built with support for it, otherwise
+  /// UNCOMPRESSED. UNCOMPRESSED is set as the object default here so that if
+  /// WriteProperties::Defaults() is not used, the default constructor for
+  /// WriteProperties will work regardless of the options used to build the C++
+  /// project.
+  Compression::type compression = Compression::UNCOMPRESSED;
+
+  /// Compressor-specific compression level
+  int compression_level = ::arrow::util::kUseDefaultCompressionLevel;
+};
+
+ARROW_EXPORT
+Status WriteTable(const Table& table, io::OutputStream* dst,
+                  const WriteProperties& properties = WriteProperties::Defaults());
+
+}  // namespace feather
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/message.h b/pyarrow/include/arrow/ipc/message.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cd72ce993ed28ddfd1f894af35eeefbbdce6050
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/message.h
@@ -0,0 +1,565 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// C++ object model and user API for interprocess schema messaging
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace ipc {
+
+struct IpcWriteOptions;
+
+// Read interface classes. We do not fully deserialize the flatbuffers so that
+// individual fields metadata can be retrieved from very large schema without
+//
+
+/// \class Message
+/// \brief An IPC message including metadata and body
+class ARROW_EXPORT Message {
+ public:
+  /// \brief Construct message, but do not validate
+  ///
+  /// Use at your own risk; Message::Open has more metadata validation
+  Message(std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body);
+
+  ~Message();
+
+  /// \brief Create and validate a Message instance from two buffers
+  ///
+  /// \param[in] metadata a buffer containing the Flatbuffer metadata
+  /// \param[in] body a buffer containing the message body, which may be null
+  /// \return the created message
+  static Result<std::unique_ptr<Message>> Open(std::shared_ptr<Buffer> metadata,
+                                               std::shared_ptr<Buffer> body);
+
+  /// \brief Read message body and create Message given Flatbuffer metadata
+  /// \param[in] metadata containing a serialized Message flatbuffer
+  /// \param[in] stream an InputStream
+  /// \return the created Message
+  ///
+  /// \note If stream supports zero-copy, this is zero-copy
+  static Result<std::unique_ptr<Message>> ReadFrom(std::shared_ptr<Buffer> metadata,
+                                                   io::InputStream* stream);
+
+  /// \brief Read message body from position in file, and create Message given
+  /// the Flatbuffer metadata
+  /// \param[in] offset the position in the file where the message body starts.
+  /// \param[in] metadata containing a serialized Message flatbuffer
+  /// \param[in] file the seekable file interface to read from
+  /// \return the created Message
+  ///
+  /// \note If file supports zero-copy, this is zero-copy
+  static Result<std::unique_ptr<Message>> ReadFrom(const int64_t offset,
+                                                   std::shared_ptr<Buffer> metadata,
+                                                   io::RandomAccessFile* file);
+
+  /// \brief Return true if message type and contents are equal
+  ///
+  /// \param other another message
+  /// \return true if contents equal
+  bool Equals(const Message& other) const;
+
+  /// \brief the Message metadata
+  ///
+  /// \return buffer
+  std::shared_ptr<Buffer> metadata() const;
+
+  /// \brief Custom metadata serialized in metadata Flatbuffer. Returns nullptr
+  /// when none set
+  const std::shared_ptr<const KeyValueMetadata>& custom_metadata() const;
+
+  /// \brief the Message body, if any
+  ///
+  /// \return buffer is null if no body
+  std::shared_ptr<Buffer> body() const;
+
+  /// \brief The expected body length according to the metadata, for
+  /// verification purposes
+  int64_t body_length() const;
+
+  /// \brief The Message type
+  MessageType type() const;
+
+  /// \brief The Message metadata version
+  MetadataVersion metadata_version() const;
+
+  const void* header() const;
+
+  /// \brief Write length-prefixed metadata and body to output stream
+  ///
+  /// \param[in] file output stream to write to
+  /// \param[in] options IPC writing options including alignment
+  /// \param[out] output_length the number of bytes written
+  /// \return Status
+  Status SerializeTo(io::OutputStream* file, const IpcWriteOptions& options,
+                     int64_t* output_length) const;
+
+  /// \brief Return true if the Message metadata passes Flatbuffer validation
+  bool Verify() const;
+
+  /// \brief Whether a given message type needs a body.
+  static bool HasBody(MessageType type) {
+    return type != MessageType::NONE && type != MessageType::SCHEMA;
+  }
+
+ private:
+  // Hide serialization details from user API
+  class MessageImpl;
+  std::unique_ptr<MessageImpl> impl_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Message);
+};
+
+ARROW_EXPORT std::string FormatMessageType(MessageType type);
+
+/// \class MessageDecoderListener
+/// \brief An abstract class to listen events from MessageDecoder.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT MessageDecoderListener {
+ public:
+  virtual ~MessageDecoderListener() = default;
+
+  /// \brief Called when a message is decoded.
+  ///
+  /// MessageDecoder calls this method when it decodes a message. This
+  /// method is called multiple times when the target stream has
+  /// multiple messages.
+  ///
+  /// \param[in] message a decoded message
+  /// \return Status
+  virtual Status OnMessageDecoded(std::unique_ptr<Message> message) = 0;
+
+  /// \brief Called when the decoder state is changed to
+  /// MessageDecoder::State::INITIAL.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \return Status
+  virtual Status OnInitial();
+
+  /// \brief Called when the decoder state is changed to
+  /// MessageDecoder::State::METADATA_LENGTH.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \return Status
+  virtual Status OnMetadataLength();
+
+  /// \brief Called when the decoder state is changed to
+  /// MessageDecoder::State::METADATA.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \return Status
+  virtual Status OnMetadata();
+
+  /// \brief Called when the decoder state is changed to
+  /// MessageDecoder::State::BODY.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \return Status
+  virtual Status OnBody();
+
+  /// \brief Called when the decoder state is changed to
+  /// MessageDecoder::State::EOS.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \return Status
+  virtual Status OnEOS();
+};
+
+/// \class AssignMessageDecoderListener
+/// \brief Assign a message decoded by MessageDecoder.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT AssignMessageDecoderListener : public MessageDecoderListener {
+ public:
+  /// \brief Construct a listener that assigns a decoded message to the
+  /// specified location.
+  ///
+  /// \param[in] message a location to store the received message
+  explicit AssignMessageDecoderListener(std::unique_ptr<Message>* message)
+      : message_(message) {}
+
+  virtual ~AssignMessageDecoderListener() = default;
+
+  Status OnMessageDecoded(std::unique_ptr<Message> message) override {
+    *message_ = std::move(message);
+    return Status::OK();
+  }
+
+ private:
+  std::unique_ptr<Message>* message_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(AssignMessageDecoderListener);
+};
+
+/// \class MessageDecoder
+/// \brief Push style message decoder that receives data from user.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT MessageDecoder {
+ public:
+  /// \brief State for reading a message
+  enum State {
+    /// The initial state. It requires one of the followings as the next data:
+    ///
+    ///   * int32_t continuation token
+    ///   * int32_t end-of-stream mark (== 0)
+    ///   * int32_t metadata length (backward compatibility for
+    ///     reading old IPC messages produced prior to version 0.15.0
+    INITIAL,
+
+    /// It requires int32_t metadata length.
+    METADATA_LENGTH,
+
+    /// It requires metadata.
+    METADATA,
+
+    /// It requires message body.
+    BODY,
+
+    /// The end-of-stream state. No more data is processed.
+    EOS,
+  };
+
+  /// \brief Construct a message decoder.
+  ///
+  /// \param[in] listener a MessageDecoderListener that responds events from
+  /// the decoder
+  /// \param[in] pool an optional MemoryPool to copy metadata on the
+  /// \param[in] skip_body if true the body will be skipped even if the message has a body
+  /// CPU, if required
+  explicit MessageDecoder(std::shared_ptr<MessageDecoderListener> listener,
+                          MemoryPool* pool = default_memory_pool(),
+                          bool skip_body = false);
+
+  /// \brief Construct a message decoder with the specified state.
+  ///
+  /// This is a construct for advanced users that know how to decode
+  /// Message.
+  ///
+  /// \param[in] listener a MessageDecoderListener that responds events from
+  /// the decoder
+  /// \param[in] initial_state an initial state of the decode
+  /// \param[in] initial_next_required_size the number of bytes needed
+  /// to run the next action
+  /// \param[in] pool an optional MemoryPool to copy metadata on the
+  /// CPU, if required
+  /// \param[in] skip_body if true the body will be skipped even if the message has a body
+  MessageDecoder(std::shared_ptr<MessageDecoderListener> listener, State initial_state,
+                 int64_t initial_next_required_size,
+                 MemoryPool* pool = default_memory_pool(), bool skip_body = false);
+
+  virtual ~MessageDecoder();
+
+  /// \brief Feed data to the decoder as a raw data.
+  ///
+  /// If the decoder can decode one or more messages by the data, the
+  /// decoder calls listener->OnMessageDecoded() with a decoded
+  /// message multiple times.
+  ///
+  /// If the state of the decoder is changed, corresponding callbacks
+  /// on listener is called:
+  ///
+  /// * MessageDecoder::State::INITIAL: listener->OnInitial()
+  /// * MessageDecoder::State::METADATA_LENGTH: listener->OnMetadataLength()
+  /// * MessageDecoder::State::METADATA: listener->OnMetadata()
+  /// * MessageDecoder::State::BODY: listener->OnBody()
+  /// * MessageDecoder::State::EOS: listener->OnEOS()
+  ///
+  /// \param[in] data a raw data to be processed. This data isn't
+  /// copied. The passed memory must be kept alive through message
+  /// processing.
+  /// \param[in] size raw data size.
+  /// \return Status
+  Status Consume(const uint8_t* data, int64_t size);
+
+  /// \brief Feed data to the decoder as a Buffer.
+  ///
+  /// If the decoder can decode one or more messages by the Buffer,
+  /// the decoder calls listener->OnMessageDecoded() with a decoded
+  /// message multiple times.
+  ///
+  /// \param[in] buffer a Buffer to be processed.
+  /// \return Status
+  Status Consume(std::shared_ptr<Buffer> buffer);
+
+  /// \brief Return the number of bytes needed to advance the state of
+  /// the decoder.
+  ///
+  /// This method is provided for users who want to optimize performance.
+  /// Normal users don't need to use this method.
+  ///
+  /// Here is an example usage for normal users:
+  ///
+  /// ~~~{.cpp}
+  /// decoder.Consume(buffer1);
+  /// decoder.Consume(buffer2);
+  /// decoder.Consume(buffer3);
+  /// ~~~
+  ///
+  /// Decoder has internal buffer. If consumed data isn't enough to
+  /// advance the state of the decoder, consumed data is buffered to
+  /// the internal buffer. It causes performance overhead.
+  ///
+  /// If you pass next_required_size() size data to each Consume()
+  /// call, the decoder doesn't use its internal buffer. It improves
+  /// performance.
+  ///
+  /// Here is an example usage to avoid using internal buffer:
+  ///
+  /// ~~~{.cpp}
+  /// buffer1 = get_data(decoder.next_required_size());
+  /// decoder.Consume(buffer1);
+  /// buffer2 = get_data(decoder.next_required_size());
+  /// decoder.Consume(buffer2);
+  /// ~~~
+  ///
+  /// Users can use this method to avoid creating small
+  /// chunks. Message body must be contiguous data. If users pass
+  /// small chunks to the decoder, the decoder needs concatenate small
+  /// chunks internally. It causes performance overhead.
+  ///
+  /// Here is an example usage to reduce small chunks:
+  ///
+  /// ~~~{.cpp}
+  /// buffer = AllocateResizableBuffer();
+  /// while ((small_chunk = get_data(&small_chunk_size))) {
+  ///   auto current_buffer_size = buffer->size();
+  ///   buffer->Resize(current_buffer_size + small_chunk_size);
+  ///   memcpy(buffer->mutable_data() + current_buffer_size,
+  ///          small_chunk,
+  ///          small_chunk_size);
+  ///   if (buffer->size() < decoder.next_required_size()) {
+  ///     continue;
+  ///   }
+  ///   std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+  ///   decoder.Consume(chunk);
+  ///   buffer = AllocateResizableBuffer();
+  /// }
+  /// if (buffer->size() > 0) {
+  ///   std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+  ///   decoder.Consume(chunk);
+  /// }
+  /// ~~~
+  ///
+  /// \return the number of bytes needed to advance the state of the
+  /// decoder
+  int64_t next_required_size() const;
+
+  /// \brief Return the current state of the decoder.
+  ///
+  /// This method is provided for users who want to optimize performance.
+  /// Normal users don't need to use this method.
+  ///
+  /// Decoder doesn't need Buffer to process data on the
+  /// MessageDecoder::State::INITIAL state and the
+  /// MessageDecoder::State::METADATA_LENGTH. Creating Buffer has
+  /// performance overhead. Advanced users can avoid creating Buffer
+  /// by checking the current state of the decoder:
+  ///
+  /// ~~~{.cpp}
+  /// switch (decoder.state()) {
+  ///   MessageDecoder::State::INITIAL:
+  ///   MessageDecoder::State::METADATA_LENGTH:
+  ///     {
+  ///       uint8_t data[sizeof(int32_t)];
+  ///       auto data_size = input->Read(decoder.next_required_size(), data);
+  ///       decoder.Consume(data, data_size);
+  ///     }
+  ///     break;
+  ///   default:
+  ///     {
+  ///       auto buffer = input->Read(decoder.next_required_size());
+  ///       decoder.Consume(buffer);
+  ///     }
+  ///     break;
+  /// }
+  /// ~~~
+  ///
+  /// \return the current state
+  State state() const;
+
+ private:
+  class MessageDecoderImpl;
+  std::unique_ptr<MessageDecoderImpl> impl_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(MessageDecoder);
+};
+
+/// \brief Abstract interface for a sequence of messages
+/// \since 0.5.0
+class ARROW_EXPORT MessageReader {
+ public:
+  virtual ~MessageReader() = default;
+
+  /// \brief Create MessageReader that reads from InputStream
+  static std::unique_ptr<MessageReader> Open(io::InputStream* stream);
+
+  /// \brief Create MessageReader that reads from owned InputStream
+  static std::unique_ptr<MessageReader> Open(
+      const std::shared_ptr<io::InputStream>& owned_stream);
+
+  /// \brief Read next Message from the interface
+  ///
+  /// \return an arrow::ipc::Message instance
+  virtual Result<std::unique_ptr<Message>> ReadNextMessage() = 0;
+};
+
+// the first parameter of the function should be a pointer to metadata (aka.
+// org::apache::arrow::flatbuf::RecordBatch*)
+using FieldsLoaderFunction = std::function<Status(const void*, io::RandomAccessFile*)>;
+
+/// \brief Read encapsulated RPC message from position in file
+///
+/// Read a length-prefixed message flatbuffer starting at the indicated file
+/// offset. If the message has a body with non-zero length, it will also be
+/// read
+///
+/// The metadata_length includes at least the length prefix and the flatbuffer
+///
+/// \param[in] offset the position in the file where the message starts. The
+/// first 4 bytes after the offset are the message length
+/// \param[in] metadata_length the total number of bytes to read from file
+/// \param[in] file the seekable file interface to read from
+/// \param[in] fields_loader the function for loading subset of fields from the given file
+/// \return the message read
+
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> ReadMessage(
+    const int64_t offset, const int32_t metadata_length, io::RandomAccessFile* file,
+    const FieldsLoaderFunction& fields_loader = {});
+
+/// \brief Read encapsulated RPC message from cached buffers
+///
+/// The buffers should contain an entire message.  Partial reads are not handled.
+///
+/// This method can be used to read just the metadata by passing in a nullptr for the
+/// body.  The body will then be skipped and the body size will not be validated.
+///
+/// If the body buffer is provided then it must be the complete body buffer
+///
+/// This is similar to Message::Open but performs slightly more validation (e.g. checks
+/// to see that the metadata length is correct and that the body is the size the metadata
+/// expected)
+///
+/// \param metadata The bytes for the metadata
+/// \param body The bytes for the body
+/// \return The message represented by the buffers
+ARROW_EXPORT Result<std::unique_ptr<Message>> ReadMessage(
+    std::shared_ptr<Buffer> metadata, std::shared_ptr<Buffer> body);
+
+ARROW_EXPORT
+Future<std::shared_ptr<Message>> ReadMessageAsync(
+    const int64_t offset, const int32_t metadata_length, const int64_t body_length,
+    io::RandomAccessFile* file, const io::IOContext& context = io::default_io_context());
+
+/// \brief Advance stream to an 8-byte offset if its position is not a multiple
+/// of 8 already
+/// \param[in] stream an input stream
+/// \param[in] alignment the byte multiple for the metadata prefix, usually 8
+/// or 64, to ensure the body starts on a multiple of that alignment
+/// \return Status
+ARROW_EXPORT
+Status AlignStream(io::InputStream* stream, int32_t alignment = 8);
+
+/// \brief Advance stream to an 8-byte offset if its position is not a multiple
+/// of 8 already
+/// \param[in] stream an output stream
+/// \param[in] alignment the byte multiple for the metadata prefix, usually 8
+/// or 64, to ensure the body starts on a multiple of that alignment
+/// \return Status
+ARROW_EXPORT
+Status AlignStream(io::OutputStream* stream, int32_t alignment = 8);
+
+/// \brief Return error Status if file position is not a multiple of the
+/// indicated alignment
+ARROW_EXPORT
+Status CheckAligned(io::FileInterface* stream, int32_t alignment = 8);
+
+/// \brief Read encapsulated IPC message (metadata and body) from InputStream
+///
+/// Returns null if there are not enough bytes available or the
+/// message length is 0 (e.g. EOS in a stream)
+///
+/// \param[in] stream an input stream
+/// \param[in] pool an optional MemoryPool to copy metadata on the CPU, if required
+/// \return Message
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> ReadMessage(io::InputStream* stream,
+                                             MemoryPool* pool = default_memory_pool());
+
+/// \brief Feed data from InputStream to MessageDecoder to decode an
+/// encapsulated IPC message (metadata and body)
+///
+/// This API is EXPERIMENTAL.
+///
+/// \param[in] decoder a decoder
+/// \param[in] stream an input stream
+/// \return Status
+///
+/// \since 0.17.0
+ARROW_EXPORT
+Status DecodeMessage(MessageDecoder* decoder, io::InputStream* stream);
+
+/// Write encapsulated IPC message Does not make assumptions about
+/// whether the stream is aligned already. Can write legacy (pre
+/// version 0.15.0) IPC message if option set
+///
+/// continuation: 0xFFFFFFFF
+/// message_size: int32
+/// message: const void*
+/// padding
+///
+///
+/// \param[in] message a buffer containing the metadata to write
+/// \param[in] options IPC writing options, including alignment and
+/// legacy message support
+/// \param[in,out] file the OutputStream to write to
+/// \param[out] message_length the total size of the payload written including
+/// padding
+/// \return Status
+Status WriteMessage(const Buffer& message, const IpcWriteOptions& options,
+                    io::OutputStream* file, int32_t* message_length);
+
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/options.h b/pyarrow/include/arrow/ipc/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec0e2a5b6f900c69c09a12f9a47a39f987c33aee
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/options.h
@@ -0,0 +1,201 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/ipc/type_fwd.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/align_util.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace ipc {
+
+// ARROW-109: We set this number arbitrarily to help catch user mistakes. For
+// deeply nested schemas, it is expected the user will indicate explicitly the
+// maximum allowed recursion depth
+constexpr int kMaxNestingDepth = 64;
+
+/// \brief Options for writing Arrow IPC messages
+struct ARROW_EXPORT IpcWriteOptions {
+  /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
+  ///
+  /// Some implementations may not be able to parse streams created with this option.
+  bool allow_64bit = false;
+
+  /// \brief The maximum permitted schema nesting depth.
+  int max_recursion_depth = kMaxNestingDepth;
+
+  /// \brief Write padding after memory buffers up to this multiple of bytes.
+  int32_t alignment = 8;
+
+  /// \brief Write the pre-0.15.0 IPC message format
+  ///
+  /// This legacy format consists of a 4-byte prefix instead of 8-byte.
+  bool write_legacy_ipc_format = false;
+
+  /// \brief The memory pool to use for allocations made during IPC writing
+  ///
+  /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+  /// memory in some cases (for example if compression is enabled).
+  MemoryPool* memory_pool = default_memory_pool();
+
+  /// \brief Compression codec to use for record batch body buffers
+  ///
+  /// May only be UNCOMPRESSED, LZ4_FRAME and ZSTD.
+  std::shared_ptr<util::Codec> codec;
+
+  /// \brief Minimum space savings percentage required for compression to be applied
+  ///
+  /// Space savings is calculated as (1.0 - compressed_size / uncompressed_size).
+  ///
+  /// For example, if min_space_savings = 0.1, a 100-byte body buffer won't undergo
+  /// compression if its expected compressed size exceeds 90 bytes. If this option is
+  /// unset, compression will be used indiscriminately. If no codec was supplied, this
+  /// option is ignored.
+  ///
+  /// Values outside of the range [0,1] are handled as errors.
+  ///
+  /// Note that enabling this option may result in unreadable data for Arrow C++ versions
+  /// prior to 12.0.0.
+  std::optional<double> min_space_savings;
+
+  /// \brief Use global CPU thread pool to parallelize any computational tasks
+  /// like compression
+  bool use_threads = true;
+
+  /// \brief Whether to emit dictionary deltas
+  ///
+  /// If false, a changed dictionary for a given field will emit a full
+  /// dictionary replacement.
+  /// If true, a changed dictionary will be compared against the previous
+  /// version. If possible, a dictionary delta will be emitted, otherwise
+  /// a full dictionary replacement.
+  ///
+  /// Default is false to maximize stream compatibility.
+  ///
+  /// Also, note that if a changed dictionary is a nested dictionary,
+  /// then a delta is never emitted, for compatibility with the read path.
+  bool emit_dictionary_deltas = false;
+
+  /// \brief Whether to unify dictionaries for the IPC file format
+  ///
+  /// The IPC file format doesn't support dictionary replacements.
+  /// Therefore, chunks of a column with a dictionary type must have the same
+  /// dictionary in each record batch (or an extended dictionary + delta).
+  ///
+  /// If this option is true, RecordBatchWriter::WriteTable will attempt
+  /// to unify dictionaries across each table column.  If this option is
+  /// false, incompatible dictionaries across a table column will simply
+  /// raise an error.
+  ///
+  /// Note that enabling this option has a runtime cost. Also, not all types
+  /// currently support dictionary unification.
+  ///
+  /// This option is ignored for IPC streams, which support dictionary replacement
+  /// and deltas.
+  bool unify_dictionaries = false;
+
+  /// \brief Format version to use for IPC messages and their metadata.
+  ///
+  /// Presently using V5 version (readable by 1.0.0 and later).
+  /// V4 is also available (readable by 0.8.0 and later).
+  MetadataVersion metadata_version = MetadataVersion::V5;
+
+  static IpcWriteOptions Defaults();
+};
+
+/// \brief Alignment of data in memory
+/// Alignment values larger than 0 are taken directly as byte alignment value
+/// See util::EnsureAlignment(..., int64_t alignment, ...)
+enum class Alignment : int64_t {
+  /// \brief data is aligned depending on the actual data type
+  kDataTypeSpecificAlignment = util::kValueAlignment,
+  /// \brief no particular alignment enforced
+  kAnyAlignment = 0,
+  /// \brief data is aligned to 64-byte boundary
+  k64ByteAlignment = 64
+};
+
+/// \brief Options for reading Arrow IPC messages
+struct ARROW_EXPORT IpcReadOptions {
+  /// \brief The maximum permitted schema nesting depth.
+  int max_recursion_depth = kMaxNestingDepth;
+
+  /// \brief The memory pool to use for allocations made during IPC reading
+  ///
+  /// While Arrow IPC is predominantly zero-copy, it may have to allocate
+  /// memory in some cases (for example if compression is enabled).
+  MemoryPool* memory_pool = default_memory_pool();
+
+  /// \brief Top-level schema fields to include when deserializing RecordBatch.
+  ///
+  /// If empty (the default), return all deserialized fields.
+  /// If non-empty, the values are the indices of fields in the top-level schema.
+  std::vector<int> included_fields;
+
+  /// \brief Use global CPU thread pool to parallelize any computational tasks
+  /// like decompression
+  bool use_threads = true;
+
+  /// \brief Whether to convert incoming data to platform-native endianness
+  ///
+  /// If the endianness of the received schema is not equal to platform-native
+  /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
+  /// This includes the value buffers of numeric types, temporal types, decimal
+  /// types, as well as the offset buffers of variable-sized binary and list-like
+  /// types.
+  ///
+  /// Endianness conversion is achieved by the RecordBatchFileReader,
+  /// RecordBatchStreamReader and StreamDecoder classes.
+  bool ensure_native_endian = true;
+
+  /// \brief How to align data if mis-aligned
+  ///
+  /// Data is copied to aligned memory locations allocated via the
+  /// MemoryPool configured as \ref arrow::ipc::IpcReadOptions::memory_pool.
+  /// Some use cases might require data to have a specific alignment, for example,
+  /// for the data buffer of an Int32 array to be aligned on a 4-byte boundary.
+  ///
+  /// Default (kAnyAlignment) keeps the alignment as is, so no copy of data occurs.
+  Alignment ensure_alignment = Alignment::kAnyAlignment;
+
+  /// \brief Options to control caching behavior when pre-buffering is requested
+  ///
+  /// The lazy property will always be reset to true to deliver the expected behavior
+  io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults();
+
+  static IpcReadOptions Defaults();
+};
+
+namespace internal {
+
+Status CheckCompressionSupported(Compression::type codec);
+
+}  // namespace internal
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/reader.h b/pyarrow/include/arrow/ipc/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..888f59a627771b4591d2eb030483b70a49630999
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/reader.h
@@ -0,0 +1,638 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Read Arrow files and streams
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/io/type_fwd.h"
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/options.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace ipc {
+
+class DictionaryMemo;
+struct IpcPayload;
+
+using RecordBatchReader = ::arrow::RecordBatchReader;
+
+struct ReadStats {
+  /// Number of IPC messages read.
+  int64_t num_messages = 0;
+  /// Number of record batches read.
+  int64_t num_record_batches = 0;
+  /// Number of dictionary batches read.
+  ///
+  /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
+  int64_t num_dictionary_batches = 0;
+
+  /// Number of dictionary deltas read.
+  int64_t num_dictionary_deltas = 0;
+  /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
+  /// an existing dictionary with an unrelated new dictionary).
+  int64_t num_replaced_dictionaries = 0;
+};
+
+/// \brief Synchronous batch stream reader that reads from io::InputStream
+///
+/// This class reads the schema (plus any dictionaries) as the first messages
+/// in the stream, followed by record batches. For more granular zero-copy
+/// reads see the ReadRecordBatch functions
+class ARROW_EXPORT RecordBatchStreamReader : public RecordBatchReader {
+ public:
+  /// Create batch reader from generic MessageReader.
+  /// This will take ownership of the given MessageReader.
+  ///
+  /// \param[in] message_reader a MessageReader implementation
+  /// \param[in] options any IPC reading options (optional)
+  /// \return the created batch reader
+  static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
+      std::unique_ptr<MessageReader> message_reader,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Record batch stream reader from InputStream
+  ///
+  /// \param[in] stream an input stream instance. Must stay alive throughout
+  /// lifetime of stream reader
+  /// \param[in] options any IPC reading options (optional)
+  /// \return the created batch reader
+  static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
+      io::InputStream* stream,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open stream and retain ownership of stream object
+  /// \param[in] stream the input stream
+  /// \param[in] options any IPC reading options (optional)
+  /// \return the created batch reader
+  static Result<std::shared_ptr<RecordBatchStreamReader>> Open(
+      const std::shared_ptr<io::InputStream>& stream,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Return current read statistics
+  virtual ReadStats stats() const = 0;
+};
+
+/// \brief Reads the record batch file format
+class ARROW_EXPORT RecordBatchFileReader
+    : public std::enable_shared_from_this<RecordBatchFileReader> {
+ public:
+  virtual ~RecordBatchFileReader() = default;
+
+  /// \brief Open a RecordBatchFileReader
+  ///
+  /// Open a file-like object that is assumed to be self-contained; i.e., the
+  /// end of the file interface is the end of the Arrow file. Note that there
+  /// can be any amount of data preceding the Arrow-formatted data, because we
+  /// need only locate the end of the Arrow file stream to discover the metadata
+  /// and then proceed to read the data into memory.
+  static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+      io::RandomAccessFile* file,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open a RecordBatchFileReader
+  /// If the file is embedded within some larger file or memory region, you can
+  /// pass the absolute memory offset to the end of the file (which contains the
+  /// metadata footer). The metadata must have been written with memory offsets
+  /// relative to the start of the containing file
+  ///
+  /// \param[in] file the data source
+  /// \param[in] footer_offset the position of the end of the Arrow file
+  /// \param[in] options options for IPC reading
+  /// \return the returned reader
+  static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+      io::RandomAccessFile* file, int64_t footer_offset,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Version of Open that retains ownership of file
+  ///
+  /// \param[in] file the data source
+  /// \param[in] options options for IPC reading
+  /// \return the returned reader
+  static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& file,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Version of Open that retains ownership of file
+  ///
+  /// \param[in] file the data source
+  /// \param[in] footer_offset the position of the end of the Arrow file
+  /// \param[in] options options for IPC reading
+  /// \return the returned reader
+  static Result<std::shared_ptr<RecordBatchFileReader>> Open(
+      const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open a file asynchronously (owns the file).
+  static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+      const std::shared_ptr<io::RandomAccessFile>& file,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open a file asynchronously (borrows the file).
+  static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+      io::RandomAccessFile* file,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open a file asynchronously (owns the file).
+  static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+      const std::shared_ptr<io::RandomAccessFile>& file, int64_t footer_offset,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief Open a file asynchronously (borrows the file).
+  static Future<std::shared_ptr<RecordBatchFileReader>> OpenAsync(
+      io::RandomAccessFile* file, int64_t footer_offset,
+      const IpcReadOptions& options = IpcReadOptions::Defaults());
+
+  /// \brief The schema read from the file
+  virtual std::shared_ptr<Schema> schema() const = 0;
+
+  /// \brief Returns the number of record batches in the file
+  virtual int num_record_batches() const = 0;
+
+  /// \brief Return the metadata version from the file metadata
+  virtual MetadataVersion version() const = 0;
+
+  /// \brief Return the contents of the custom_metadata field from the file's
+  /// Footer
+  virtual std::shared_ptr<const KeyValueMetadata> metadata() const = 0;
+
+  /// \brief Read a particular record batch from the file. Does not copy memory
+  /// if the input source supports zero-copy.
+  ///
+  /// \param[in] i the index of the record batch to return
+  /// \return the read batch
+  virtual Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(int i) = 0;
+
+  /// \brief Read a particular record batch along with its custom metadata from the file.
+  /// Does not copy memory if the input source supports zero-copy.
+  ///
+  /// \param[in] i the index of the record batch to return
+  /// \return a struct containing the read batch and its custom metadata
+  virtual Result<RecordBatchWithMetadata> ReadRecordBatchWithCustomMetadata(int i) = 0;
+
+  /// \brief Return current read statistics
+  virtual ReadStats stats() const = 0;
+
+  /// \brief Computes the total number of rows in the file.
+  virtual Result<int64_t> CountRows() = 0;
+
+  /// \brief Begin loading metadata for the desired batches into memory.
+  ///
+  /// This method will also begin loading all dictionaries messages into memory.
+  ///
+  /// For a regular file this will immediately begin disk I/O in the background on a
+  /// thread on the IOContext's thread pool.  If the file is memory mapped this will
+  /// ensure the memory needed for the metadata is paged from disk into memory
+  ///
+  /// \param indices Indices of the batches to prefetch
+  ///                If empty then all batches will be prefetched.
+  virtual Status PreBufferMetadata(const std::vector<int>& indices) = 0;
+
+  /// \brief Get a reentrant generator of record batches.
+  ///
+  /// \param[in] coalesce If true, enable I/O coalescing.
+  /// \param[in] io_context The IOContext to use (controls which thread pool
+  ///     is used for I/O).
+  /// \param[in] cache_options Options for coalescing (if enabled).
+  /// \param[in] executor Optionally, an executor to use for decoding record
+  ///     batches. This is generally only a benefit for very wide and/or
+  ///     compressed batches.
+  virtual Result<AsyncGenerator<std::shared_ptr<RecordBatch>>> GetRecordBatchGenerator(
+      const bool coalesce = false,
+      const io::IOContext& io_context = io::default_io_context(),
+      const io::CacheOptions cache_options = io::CacheOptions::LazyDefaults(),
+      arrow::internal::Executor* executor = NULLPTR) = 0;
+
+  /// \brief Collect all batches as a vector of record batches
+  Result<RecordBatchVector> ToRecordBatches();
+
+  /// \brief Collect all batches and concatenate as arrow::Table
+  Result<std::shared_ptr<Table>> ToTable();
+};
+
+/// \brief A general listener class to receive events.
+///
+/// You must implement callback methods for interested events.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT Listener {
+ public:
+  virtual ~Listener() = default;
+
+  /// \brief Called when end-of-stream is received.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \return Status
+  ///
+  /// \see StreamDecoder
+  virtual Status OnEOS();
+
+  /// \brief Called when a record batch is decoded and
+  /// OnRecordBatchWithMetadataDecoded() isn't overridden.
+  ///
+  /// The default implementation just returns
+  /// arrow::Status::NotImplemented().
+  ///
+  /// \param[in] record_batch a record batch decoded
+  /// \return Status
+  ///
+  /// \see StreamDecoder
+  virtual Status OnRecordBatchDecoded(std::shared_ptr<RecordBatch> record_batch);
+
+  /// \brief Called when a record batch with custom metadata is decoded.
+  ///
+  /// The default implementation just calls OnRecordBatchDecoded()
+  /// without custom metadata.
+  ///
+  /// \param[in] record_batch_with_metadata a record batch with custom
+  /// metadata decoded
+  /// \return Status
+  ///
+  /// \see StreamDecoder
+  ///
+  /// \since 13.0.0
+  virtual Status OnRecordBatchWithMetadataDecoded(
+      RecordBatchWithMetadata record_batch_with_metadata);
+
+  /// \brief Called when a schema is decoded.
+  ///
+  /// The default implementation just returns arrow::Status::OK().
+  ///
+  /// \param[in] schema a schema decoded
+  /// \return Status
+  ///
+  /// \see StreamDecoder
+  virtual Status OnSchemaDecoded(std::shared_ptr<Schema> schema);
+
+  /// \brief Called when a schema is decoded.
+  ///
+  /// The default implementation just calls OnSchemaDecoded(schema)
+  /// (without filtered_schema) to keep backward compatibility.
+  ///
+  /// \param[in] schema a schema decoded
+  /// \param[in] filtered_schema a filtered schema that only has read fields
+  /// \return Status
+  ///
+  /// \see StreamDecoder
+  ///
+  /// \since 13.0.0
+  virtual Status OnSchemaDecoded(std::shared_ptr<Schema> schema,
+                                 std::shared_ptr<Schema> filtered_schema);
+};
+
+/// \brief Collect schema and record batches decoded by StreamDecoder.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \since 0.17.0
+class ARROW_EXPORT CollectListener : public Listener {
+ public:
+  CollectListener() : schema_(), filtered_schema_(), record_batches_(), metadatas_() {}
+  virtual ~CollectListener() = default;
+
+  Status OnSchemaDecoded(std::shared_ptr<Schema> schema,
+                         std::shared_ptr<Schema> filtered_schema) override {
+    schema_ = std::move(schema);
+    filtered_schema_ = std::move(filtered_schema);
+    return Status::OK();
+  }
+
+  Status OnRecordBatchWithMetadataDecoded(
+      RecordBatchWithMetadata record_batch_with_metadata) override {
+    record_batches_.push_back(std::move(record_batch_with_metadata.batch));
+    metadatas_.push_back(std::move(record_batch_with_metadata.custom_metadata));
+    return Status::OK();
+  }
+
+  /// \return the decoded schema
+  std::shared_ptr<Schema> schema() const { return schema_; }
+
+  /// \return the filtered schema
+  std::shared_ptr<Schema> filtered_schema() const { return filtered_schema_; }
+
+  /// \return the all decoded record batches
+  const std::vector<std::shared_ptr<RecordBatch>>& record_batches() const {
+    return record_batches_;
+  }
+
+  /// \return the all decoded metadatas
+  const std::vector<std::shared_ptr<KeyValueMetadata>>& metadatas() const {
+    return metadatas_;
+  }
+
+  /// \return the number of collected record batches
+  int64_t num_record_batches() const { return record_batches_.size(); }
+
+  /// \return the last decoded record batch and remove it from
+  /// record_batches
+  std::shared_ptr<RecordBatch> PopRecordBatch() {
+    auto record_batch_with_metadata = PopRecordBatchWithMetadata();
+    return std::move(record_batch_with_metadata.batch);
+  }
+
+  /// \return the last decoded record batch with custom metadata and
+  /// remove it from record_batches
+  RecordBatchWithMetadata PopRecordBatchWithMetadata() {
+    RecordBatchWithMetadata record_batch_with_metadata;
+    if (record_batches_.empty()) {
+      return record_batch_with_metadata;
+    }
+    record_batch_with_metadata.batch = std::move(record_batches_.back());
+    record_batch_with_metadata.custom_metadata = std::move(metadatas_.back());
+    record_batches_.pop_back();
+    metadatas_.pop_back();
+    return record_batch_with_metadata;
+  }
+
+ private:
+  std::shared_ptr<Schema> schema_;
+  std::shared_ptr<Schema> filtered_schema_;
+  std::vector<std::shared_ptr<RecordBatch>> record_batches_;
+  std::vector<std::shared_ptr<KeyValueMetadata>> metadatas_;
+};
+
+/// \brief Push style stream decoder that receives data from user.
+///
+/// This class decodes the Apache Arrow IPC streaming format data.
+///
+/// This API is EXPERIMENTAL.
+///
+/// \see https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format
+///
+/// \since 0.17.0
+class ARROW_EXPORT StreamDecoder {
+ public:
+  /// \brief Construct a stream decoder.
+  ///
+  /// \param[in] listener a Listener that must implement
+  /// Listener::OnRecordBatchDecoded() to receive decoded record batches
+  /// \param[in] options any IPC reading options (optional)
+  StreamDecoder(std::shared_ptr<Listener> listener,
+                IpcReadOptions options = IpcReadOptions::Defaults());
+
+  virtual ~StreamDecoder();
+
+  /// \brief Feed data to the decoder as a raw data.
+  ///
+  /// If the decoder can read one or more record batches by the data,
+  /// the decoder calls listener->OnRecordBatchDecoded() with a
+  /// decoded record batch multiple times.
+  ///
+  /// \param[in] data a raw data to be processed. This data isn't
+  /// copied. The passed memory must be kept alive through record
+  /// batch processing.
+  /// \param[in] size raw data size.
+  /// \return Status
+  Status Consume(const uint8_t* data, int64_t size);
+
+  /// \brief Feed data to the decoder as a Buffer.
+  ///
+  /// If the decoder can read one or more record batches by the
+  /// Buffer, the decoder calls listener->RecordBatchReceived() with a
+  /// decoded record batch multiple times.
+  ///
+  /// \param[in] buffer a Buffer to be processed.
+  /// \return Status
+  Status Consume(std::shared_ptr<Buffer> buffer);
+
+  /// \brief Reset the internal status.
+  ///
+  /// You can reuse this decoder for new stream after calling
+  /// this.
+  ///
+  /// \return Status
+  Status Reset();
+
+  /// \return the shared schema of the record batches in the stream
+  std::shared_ptr<Schema> schema() const;
+
+  /// \brief Return the number of bytes needed to advance the state of
+  /// the decoder.
+  ///
+  /// This method is provided for users who want to optimize performance.
+  /// Normal users don't need to use this method.
+  ///
+  /// Here is an example usage for normal users:
+  ///
+  /// ~~~{.cpp}
+  /// decoder.Consume(buffer1);
+  /// decoder.Consume(buffer2);
+  /// decoder.Consume(buffer3);
+  /// ~~~
+  ///
+  /// Decoder has internal buffer. If consumed data isn't enough to
+  /// advance the state of the decoder, consumed data is buffered to
+  /// the internal buffer. It causes performance overhead.
+  ///
+  /// If you pass next_required_size() size data to each Consume()
+  /// call, the decoder doesn't use its internal buffer. It improves
+  /// performance.
+  ///
+  /// Here is an example usage to avoid using internal buffer:
+  ///
+  /// ~~~{.cpp}
+  /// buffer1 = get_data(decoder.next_required_size());
+  /// decoder.Consume(buffer1);
+  /// buffer2 = get_data(decoder.next_required_size());
+  /// decoder.Consume(buffer2);
+  /// ~~~
+  ///
+  /// Users can use this method to avoid creating small chunks. Record
+  /// batch data must be contiguous data. If users pass small chunks
+  /// to the decoder, the decoder needs concatenate small chunks
+  /// internally. It causes performance overhead.
+  ///
+  /// Here is an example usage to reduce small chunks:
+  ///
+  /// ~~~{.cpp}
+  /// buffer = AllocateResizableBuffer();
+  /// while ((small_chunk = get_data(&small_chunk_size))) {
+  ///   auto current_buffer_size = buffer->size();
+  ///   buffer->Resize(current_buffer_size + small_chunk_size);
+  ///   memcpy(buffer->mutable_data() + current_buffer_size,
+  ///          small_chunk,
+  ///          small_chunk_size);
+  ///   if (buffer->size() < decoder.next_required_size()) {
+  ///     continue;
+  ///   }
+  ///   std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+  ///   decoder.Consume(chunk);
+  ///   buffer = AllocateResizableBuffer();
+  /// }
+  /// if (buffer->size() > 0) {
+  ///   std::shared_ptr<arrow::Buffer> chunk(buffer.release());
+  ///   decoder.Consume(chunk);
+  /// }
+  /// ~~~
+  ///
+  /// \return the number of bytes needed to advance the state of the
+  /// decoder
+  int64_t next_required_size() const;
+
+  /// \brief Return current read statistics
+  ReadStats stats() const;
+
+ private:
+  class StreamDecoderImpl;
+  std::unique_ptr<StreamDecoderImpl> impl_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(StreamDecoder);
+};
+
+// Generic read functions; does not copy data if the input supports zero copy reads
+
+/// \brief Read Schema from stream serialized as a single IPC message
+/// and populate any dictionary-encoded fields into a DictionaryMemo
+///
+/// \param[in] stream an InputStream
+/// \param[in] dictionary_memo for recording dictionary-encoded fields
+/// \return the output Schema
+///
+/// If record batches follow the schema, it is better to use
+/// RecordBatchStreamReader
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> ReadSchema(io::InputStream* stream,
+                                           DictionaryMemo* dictionary_memo);
+
+/// \brief Read Schema from encapsulated Message
+///
+/// \param[in] message the message containing the Schema IPC metadata
+/// \param[in] dictionary_memo DictionaryMemo for recording dictionary-encoded
+/// fields. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \return the resulting Schema
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> ReadSchema(const Message& message,
+                                           DictionaryMemo* dictionary_memo);
+
+/// Read record batch as encapsulated IPC message with metadata size prefix and
+/// header
+///
+/// \param[in] schema the record batch schema
+/// \param[in] dictionary_memo DictionaryMemo which has any
+/// dictionaries. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \param[in] options IPC options for reading
+/// \param[in] stream the file where the batch is located
+/// \return the read record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+    const std::shared_ptr<Schema>& schema, const DictionaryMemo* dictionary_memo,
+    const IpcReadOptions& options, io::InputStream* stream);
+
+/// \brief Read record batch from message
+///
+/// \param[in] message a Message containing the record batch metadata
+/// \param[in] schema the record batch schema
+/// \param[in] dictionary_memo DictionaryMemo which has any
+/// dictionaries. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \param[in] options IPC options for reading
+/// \return the read record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+    const Message& message, const std::shared_ptr<Schema>& schema,
+    const DictionaryMemo* dictionary_memo, const IpcReadOptions& options);
+
+/// Read record batch from file given metadata and schema
+///
+/// \param[in] metadata a Message containing the record batch metadata
+/// \param[in] schema the record batch schema
+/// \param[in] dictionary_memo DictionaryMemo which has any
+/// dictionaries. Can be nullptr if you are sure there are no
+/// dictionary-encoded fields
+/// \param[in] file a random access file
+/// \param[in] options options for deserialization
+/// \return the read record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ReadRecordBatch(
+    const Buffer& metadata, const std::shared_ptr<Schema>& schema,
+    const DictionaryMemo* dictionary_memo, const IpcReadOptions& options,
+    io::RandomAccessFile* file);
+
+/// \brief Read arrow::Tensor as encapsulated IPC message in file
+///
+/// \param[in] file an InputStream pointed at the start of the message
+/// \return the read tensor
+ARROW_EXPORT
+Result<std::shared_ptr<Tensor>> ReadTensor(io::InputStream* file);
+
+/// \brief EXPERIMENTAL: Read arrow::Tensor from IPC message
+///
+/// \param[in] message a Message containing the tensor metadata and body
+/// \return the read tensor
+ARROW_EXPORT
+Result<std::shared_ptr<Tensor>> ReadTensor(const Message& message);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor as encapsulated IPC message in file
+///
+/// \param[in] file an InputStream pointed at the start of the message
+/// \return the read sparse tensor
+ARROW_EXPORT
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(io::InputStream* file);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor from IPC message
+///
+/// \param[in] message a Message containing the tensor metadata and body
+/// \return the read sparse tensor
+ARROW_EXPORT
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensor(const Message& message);
+
+namespace internal {
+
+// These internal APIs may change without warning or deprecation
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensorFormat::type from a metadata
+/// \param[in] metadata a Buffer containing the sparse tensor metadata
+/// \return the count of the body buffers
+ARROW_EXPORT
+Result<size_t> ReadSparseTensorBodyBufferCount(const Buffer& metadata);
+
+/// \brief EXPERIMENTAL: Read arrow::SparseTensor from an IpcPayload
+/// \param[in] payload a IpcPayload contains a serialized SparseTensor
+/// \return the read sparse tensor
+ARROW_EXPORT
+Result<std::shared_ptr<SparseTensor>> ReadSparseTensorPayload(const IpcPayload& payload);
+
+// For fuzzing targets
+ARROW_EXPORT
+Status FuzzIpcStream(const uint8_t* data, int64_t size);
+ARROW_EXPORT
+Status FuzzIpcTensorStream(const uint8_t* data, int64_t size);
+ARROW_EXPORT
+Status FuzzIpcFile(const uint8_t* data, int64_t size);
+
+}  // namespace internal
+
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/test_common.h b/pyarrow/include/arrow/ipc/test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..6044ef207bc7a9f1524bb21e54380e3b68224691
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/test_common.h
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/type.h"
+
+namespace arrow {
+namespace ipc {
+namespace test {
+
+// A typedef used for test parameterization
+typedef Status MakeRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+void CompareArraysDetailed(int index, const Array& result, const Array& expected);
+
+ARROW_TESTING_EXPORT
+void CompareBatchColumnsDetailed(const RecordBatch& result, const RecordBatch& expected);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool,
+                            std::shared_ptr<Array>* out, uint32_t seed = 0,
+                            int32_t min = 0, int32_t max = 1000);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomInt64Array(int64_t length, bool include_nulls, MemoryPool* pool,
+                            std::shared_ptr<Array>* out, uint32_t seed = 0);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
+                           bool include_nulls, MemoryPool* pool,
+                           std::shared_ptr<Array>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomLargeListArray(const std::shared_ptr<Array>& child_array, int num_lists,
+                                bool include_nulls, MemoryPool* pool,
+                                std::shared_ptr<Array>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomBooleanArray(const int length, bool include_nulls,
+                              std::shared_ptr<Array>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeBooleanBatchSized(const int length, std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeBooleanBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeIntBatchSized(int length, std::shared_ptr<RecordBatch>* out,
+                         uint32_t seed = 0);
+
+ARROW_TESTING_EXPORT
+Status MakeIntRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeFloat3264BatchSized(int length, std::shared_ptr<RecordBatch>* out,
+                               uint32_t seed = 0);
+
+ARROW_TESTING_EXPORT
+Status MakeFloat3264Batch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeFloatBatchSized(int length, std::shared_ptr<RecordBatch>* out,
+                           uint32_t seed = 0);
+
+ARROW_TESTING_EXPORT
+Status MakeFloatBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* pool,
+                             std::shared_ptr<Array>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeStringTypesRecordBatch(std::shared_ptr<RecordBatch>* out,
+                                  bool with_nulls = true, bool with_view_types = true);
+
+ARROW_TESTING_EXPORT
+Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeNullRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeListRecordBatchSized(int length, std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeListViewRecordBatchSized(int length, std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeListViewRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeFixedSizeListRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeZeroLengthRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeNonNullRecordBatch(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDeeplyNestedList(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDeeplyNestedListView(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeStruct(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeRunEndEncoded(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeUnion(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDictionary(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDictionaryFlat(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeNestedDictionary(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeMap(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeMapOfDictionary(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDates(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeTimestamps(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeIntervals(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeTimes(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeFWBinary(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDecimal(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeNull(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeUuid(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeComplex128(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeDictExtension(std::shared_ptr<RecordBatch>* out);
+
+ARROW_TESTING_EXPORT
+Status MakeRandomTensor(const std::shared_ptr<DataType>& type,
+                        const std::vector<int64_t>& shape, bool row_major_p,
+                        std::shared_ptr<Tensor>* out, uint32_t seed = 0);
+
+ARROW_TESTING_EXPORT Status RoundtripBatch(const std::shared_ptr<RecordBatch>& batch,
+                                           std::shared_ptr<RecordBatch>* out);
+
+}  // namespace test
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/type_fwd.h b/pyarrow/include/arrow/ipc/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0d3afa922f789f4f9a8a0b2b435b3ebe0456d42
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/type_fwd.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace ipc {
+
+enum class MetadataVersion : char {
+  /// 0.1.0
+  V1,
+
+  /// 0.2.0
+  V2,
+
+  /// 0.3.0 to 0.7.1
+  V3,
+
+  /// 0.8.0 to 0.17.0
+  V4,
+
+  /// >= 1.0.0
+  V5
+};
+
+class Message;
+enum class MessageType {
+  NONE,
+  SCHEMA,
+  DICTIONARY_BATCH,
+  RECORD_BATCH,
+  TENSOR,
+  SPARSE_TENSOR
+};
+
+struct IpcReadOptions;
+struct IpcWriteOptions;
+
+class MessageReader;
+
+class RecordBatchStreamReader;
+class RecordBatchFileReader;
+class RecordBatchWriter;
+
+class DictionaryFieldMapper;
+class DictionaryMemo;
+
+namespace feather {
+
+class Reader;
+
+}  // namespace feather
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/util.h b/pyarrow/include/arrow/ipc/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..709fedbf31b0b31585c81b36d5a81db0e5c92754
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/util.h
@@ -0,0 +1,41 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+namespace arrow {
+namespace ipc {
+
+// Buffers are padded to 64-byte boundaries (for SIMD)
+static constexpr int32_t kArrowAlignment = 64;
+
+// Tensors are padded to 64-byte boundaries
+static constexpr int32_t kTensorAlignment = 64;
+
+// Align on 8-byte boundaries in IPC
+static constexpr int32_t kArrowIpcAlignment = 8;
+
+static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0};
+
+static inline int64_t PaddedLength(int64_t nbytes, int32_t alignment = kArrowAlignment) {
+  return ((nbytes + alignment - 1) / alignment) * alignment;
+}
+
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/ipc/writer.h b/pyarrow/include/arrow/ipc/writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..aefb59f3136e4c98419799eb31faf9700fc6efd2
--- /dev/null
+++ b/pyarrow/include/arrow/ipc/writer.h
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement Arrow streaming binary format
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/ipc/dictionary.h"  // IWYU pragma: export
+#include "arrow/ipc/message.h"
+#include "arrow/ipc/options.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class Buffer;
+class MemoryManager;
+class MemoryPool;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+class Tensor;
+class SparseTensor;
+
+namespace io {
+
+class OutputStream;
+
+}  // namespace io
+
+namespace ipc {
+
+/// \brief Intermediate data structure with metadata header, and zero
+/// or more buffers for the message body.
+struct IpcPayload {
+  MessageType type = MessageType::NONE;
+  std::shared_ptr<Buffer> metadata;
+  std::vector<std::shared_ptr<Buffer>> body_buffers;
+  std::vector<int64_t> variadic_buffer_counts;
+  int64_t body_length = 0;      // serialized body length (padded, maybe compressed)
+  int64_t raw_body_length = 0;  // initial uncompressed body length
+};
+
+struct WriteStats {
+  /// Number of IPC messages written.
+  int64_t num_messages = 0;
+  /// Number of record batches written.
+  int64_t num_record_batches = 0;
+  /// Number of dictionary batches written.
+  ///
+  /// Note: num_dictionary_batches >= num_dictionary_deltas + num_replaced_dictionaries
+  int64_t num_dictionary_batches = 0;
+
+  /// Number of dictionary deltas written.
+  int64_t num_dictionary_deltas = 0;
+  /// Number of replaced dictionaries (i.e. where a dictionary batch replaces
+  /// an existing dictionary with an unrelated new dictionary).
+  int64_t num_replaced_dictionaries = 0;
+
+  /// Total size in bytes of record batches emitted.
+  /// The "raw" size counts the original buffer sizes, while the "serialized" size
+  /// includes padding and (optionally) compression.
+  int64_t total_raw_body_size = 0;
+  int64_t total_serialized_body_size = 0;
+};
+
+/// \class RecordBatchWriter
+/// \brief Abstract interface for writing a stream of record batches
+class ARROW_EXPORT RecordBatchWriter {
+ public:
+  virtual ~RecordBatchWriter();
+
+  /// \brief Write a record batch to the stream
+  ///
+  /// \param[in] batch the record batch to write to the stream
+  /// \return Status
+  virtual Status WriteRecordBatch(const RecordBatch& batch) = 0;
+
+  /// \brief Write a record batch with custom metadata to the stream
+  ///
+  /// \param[in] batch the record batch to write to the stream
+  /// \param[in] custom_metadata the record batch's custom metadata to write to the stream
+  /// \return Status
+  virtual Status WriteRecordBatch(
+      const RecordBatch& batch,
+      const std::shared_ptr<const KeyValueMetadata>& custom_metadata);
+
+  /// \brief Write possibly-chunked table by creating sequence of record batches
+  /// \param[in] table table to write
+  /// \return Status
+  Status WriteTable(const Table& table);
+
+  /// \brief Write Table with a particular chunksize
+  /// \param[in] table table to write
+  /// \param[in] max_chunksize maximum number of rows for table chunks. To
+  /// indicate that no maximum should be enforced, pass -1.
+  /// \return Status
+  virtual Status WriteTable(const Table& table, int64_t max_chunksize);
+
+  /// \brief Perform any logic necessary to finish the stream
+  ///
+  /// \return Status
+  virtual Status Close() = 0;
+
+  /// \brief Return current write statistics
+  virtual WriteStats stats() const = 0;
+};
+
+/// \defgroup record-batch-writer-factories Functions for creating RecordBatchWriter
+/// instances
+///
+/// @{
+
+/// Create a new IPC stream writer from stream sink and schema. User is
+/// responsible for closing the actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
+    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+    const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// Create a new IPC stream writer from stream sink and schema. User is
+/// responsible for closing the actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeStreamWriter(
+    std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+    const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// Create a new IPC file writer from stream sink and schema
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization, optional
+/// \param[in] metadata custom metadata for File Footer, optional
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
+    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// Create a new IPC file writer from stream sink and schema
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization, optional
+/// \param[in] metadata custom metadata for File Footer, optional
+/// \return Result<std::shared_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatchWriter>> MakeFileWriter(
+    std::shared_ptr<io::OutputStream> sink, const std::shared_ptr<Schema>& schema,
+    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// @}
+
+/// \brief Low-level API for writing a record batch (without schema)
+/// to an OutputStream as encapsulated IPC message. See Arrow format
+/// documentation for more detail.
+///
+/// \param[in] batch the record batch to write
+/// \param[in] buffer_start_offset the start offset to use in the buffer metadata,
+/// generally should be 0
+/// \param[in] dst an OutputStream
+/// \param[out] metadata_length the size of the length-prefixed flatbuffer
+/// including padding to a 64-byte boundary
+/// \param[out] body_length the size of the contiguous buffer block plus
+/// \param[in] options options for serialization
+/// \return Status
+ARROW_EXPORT
+Status WriteRecordBatch(const RecordBatch& batch, int64_t buffer_start_offset,
+                        io::OutputStream* dst, int32_t* metadata_length,
+                        int64_t* body_length, const IpcWriteOptions& options);
+
+/// \brief Serialize record batch as encapsulated IPC message in a new buffer
+///
+/// \param[in] batch the record batch
+/// \param[in] options the IpcWriteOptions to use for serialization
+/// \return the serialized message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
+                                                     const IpcWriteOptions& options);
+
+/// \brief Serialize record batch as encapsulated IPC message in a new buffer
+///
+/// \param[in] batch the record batch
+/// \param[in] mm a MemoryManager to allocate memory from
+/// \return the serialized message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeRecordBatch(const RecordBatch& batch,
+                                                     std::shared_ptr<MemoryManager> mm);
+
+/// \brief Write record batch to OutputStream
+///
+/// \param[in] batch the record batch to write
+/// \param[in] options the IpcWriteOptions to use for serialization
+/// \param[in] out the OutputStream to write the output to
+/// \return Status
+///
+/// If writing to pre-allocated memory, you can use
+/// arrow::ipc::GetRecordBatchSize to compute how much space is required
+ARROW_EXPORT
+Status SerializeRecordBatch(const RecordBatch& batch, const IpcWriteOptions& options,
+                            io::OutputStream* out);
+
+/// \brief Serialize schema as encapsulated IPC message
+///
+/// \param[in] schema the schema to write
+/// \param[in] pool a MemoryPool to allocate memory from
+/// \return the serialized schema
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> SerializeSchema(const Schema& schema,
+                                                MemoryPool* pool = default_memory_pool());
+
+/// \brief Write multiple record batches to OutputStream, including schema
+/// \param[in] batches a vector of batches. Must all have same schema
+/// \param[in] options options for serialization
+/// \param[out] dst an OutputStream
+/// \return Status
+ARROW_EXPORT
+Status WriteRecordBatchStream(const std::vector<std::shared_ptr<RecordBatch>>& batches,
+                              const IpcWriteOptions& options, io::OutputStream* dst);
+
+/// \brief Compute the number of bytes needed to write an IPC payload
+///     including metadata
+///
+/// \param[in] payload the IPC payload to write
+/// \param[in] options write options
+/// \return the size of the complete encapsulated message
+ARROW_EXPORT
+int64_t GetPayloadSize(const IpcPayload& payload,
+                       const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// \brief Compute the number of bytes needed to write a record batch including metadata
+///
+/// \param[in] batch the record batch to write
+/// \param[out] size the size of the complete encapsulated message
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchSize(const RecordBatch& batch, int64_t* size);
+
+/// \brief Compute the number of bytes needed to write a record batch including metadata
+///
+/// \param[in] batch the record batch to write
+/// \param[in] options options for serialization
+/// \param[out] size the size of the complete encapsulated message
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchSize(const RecordBatch& batch, const IpcWriteOptions& options,
+                          int64_t* size);
+
+/// \brief Compute the number of bytes needed to write a tensor including metadata
+///
+/// \param[in] tensor the tensor to write
+/// \param[out] size the size of the complete encapsulated message
+/// \return Status
+ARROW_EXPORT
+Status GetTensorSize(const Tensor& tensor, int64_t* size);
+
+/// \brief EXPERIMENTAL: Convert arrow::Tensor to a Message with minimal memory
+/// allocation
+///
+/// \param[in] tensor the Tensor to write
+/// \param[in] pool MemoryPool to allocate space for metadata
+/// \return the resulting Message
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> GetTensorMessage(const Tensor& tensor, MemoryPool* pool);
+
+/// \brief Write arrow::Tensor as a contiguous message.
+///
+/// The metadata and body are written assuming 64-byte alignment. It is the
+/// user's responsibility to ensure that the OutputStream has been aligned
+/// to a 64-byte multiple before writing the message.
+///
+/// The message is written out as followed:
+/// \code
+/// <metadata size> <metadata> <tensor data>
+/// \endcode
+///
+/// \param[in] tensor the Tensor to write
+/// \param[in] dst the OutputStream to write to
+/// \param[out] metadata_length the actual metadata length, including padding
+/// \param[out] body_length the actual message body length
+/// \return Status
+ARROW_EXPORT
+Status WriteTensor(const Tensor& tensor, io::OutputStream* dst, int32_t* metadata_length,
+                   int64_t* body_length);
+
+/// \brief EXPERIMENTAL: Convert arrow::SparseTensor to a Message with minimal memory
+/// allocation
+///
+/// The message is written out as followed:
+/// \code
+/// <metadata size> <metadata> <sparse index> <sparse tensor body>
+/// \endcode
+///
+/// \param[in] sparse_tensor the SparseTensor to write
+/// \param[in] pool MemoryPool to allocate space for metadata
+/// \return the resulting Message
+ARROW_EXPORT
+Result<std::unique_ptr<Message>> GetSparseTensorMessage(const SparseTensor& sparse_tensor,
+                                                        MemoryPool* pool);
+
+/// \brief EXPERIMENTAL: Write arrow::SparseTensor as a contiguous message. The metadata,
+/// sparse index, and body are written assuming 64-byte alignment. It is the
+/// user's responsibility to ensure that the OutputStream has been aligned
+/// to a 64-byte multiple before writing the message.
+///
+/// \param[in] sparse_tensor the SparseTensor to write
+/// \param[in] dst the OutputStream to write to
+/// \param[out] metadata_length the actual metadata length, including padding
+/// \param[out] body_length the actual message body length
+/// \return Status
+ARROW_EXPORT
+Status WriteSparseTensor(const SparseTensor& sparse_tensor, io::OutputStream* dst,
+                         int32_t* metadata_length, int64_t* body_length);
+
+/// \brief Compute IpcPayload for the given schema
+/// \param[in] schema the Schema that is being serialized
+/// \param[in] options options for serialization
+/// \param[in] mapper object mapping dictionary fields to dictionary ids
+/// \param[out] out the returned vector of IpcPayloads
+/// \return Status
+ARROW_EXPORT
+Status GetSchemaPayload(const Schema& schema, const IpcWriteOptions& options,
+                        const DictionaryFieldMapper& mapper, IpcPayload* out);
+
+/// \brief Compute IpcPayload for a dictionary
+/// \param[in] id the dictionary id
+/// \param[in] dictionary the dictionary values
+/// \param[in] options options for serialization
+/// \param[out] payload the output IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetDictionaryPayload(int64_t id, const std::shared_ptr<Array>& dictionary,
+                            const IpcWriteOptions& options, IpcPayload* payload);
+
+/// \brief Compute IpcPayload for a dictionary
+/// \param[in] id the dictionary id
+/// \param[in] is_delta whether the dictionary is a delta dictionary
+/// \param[in] dictionary the dictionary values
+/// \param[in] options options for serialization
+/// \param[out] payload the output IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetDictionaryPayload(int64_t id, bool is_delta,
+                            const std::shared_ptr<Array>& dictionary,
+                            const IpcWriteOptions& options, IpcPayload* payload);
+
+/// \brief Compute IpcPayload for the given record batch
+/// \param[in] batch the RecordBatch that is being serialized
+/// \param[in] options options for serialization
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchPayload(const RecordBatch& batch, const IpcWriteOptions& options,
+                             IpcPayload* out);
+
+/// \brief Compute IpcPayload for the given record batch and custom metadata
+/// \param[in] batch the RecordBatch that is being serialized
+/// \param[in] custom_metadata the custom metadata to be serialized with the record batch
+/// \param[in] options options for serialization
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetRecordBatchPayload(
+    const RecordBatch& batch,
+    const std::shared_ptr<const KeyValueMetadata>& custom_metadata,
+    const IpcWriteOptions& options, IpcPayload* out);
+
+/// \brief Write an IPC payload to the given stream.
+/// \param[in] payload the payload to write
+/// \param[in] options options for serialization
+/// \param[in] dst The stream to write the payload to.
+/// \param[out] metadata_length the length of the serialized metadata
+/// \return Status
+ARROW_EXPORT
+Status WriteIpcPayload(const IpcPayload& payload, const IpcWriteOptions& options,
+                       io::OutputStream* dst, int32_t* metadata_length);
+
+/// \brief Compute IpcPayload for the given sparse tensor
+/// \param[in] sparse_tensor the SparseTensor that is being serialized
+/// \param[in,out] pool for any required temporary memory allocations
+/// \param[out] out the returned IpcPayload
+/// \return Status
+ARROW_EXPORT
+Status GetSparseTensorPayload(const SparseTensor& sparse_tensor, MemoryPool* pool,
+                              IpcPayload* out);
+
+namespace internal {
+
+// These internal APIs may change without warning or deprecation
+
+class ARROW_EXPORT IpcPayloadWriter {
+ public:
+  virtual ~IpcPayloadWriter();
+
+  // Default implementation is a no-op
+  virtual Status Start();
+
+  virtual Status WritePayload(const IpcPayload& payload) = 0;
+
+  virtual Status Close() = 0;
+};
+
+/// Create a new IPC payload stream writer from stream sink. User is
+/// responsible for closing the actual OutputStream.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] options options for serialization
+/// \return Result<std::shared_ptr<IpcPayloadWriter>>
+ARROW_EXPORT
+Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadStreamWriter(
+    io::OutputStream* sink, const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+/// Create a new IPC payload file writer from stream sink.
+///
+/// \param[in] sink output stream to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization, optional
+/// \param[in] metadata custom metadata for File Footer, optional
+/// \return Status
+ARROW_EXPORT
+Result<std::unique_ptr<IpcPayloadWriter>> MakePayloadFileWriter(
+    io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+    const IpcWriteOptions& options = IpcWriteOptions::Defaults(),
+    const std::shared_ptr<const KeyValueMetadata>& metadata = NULLPTR);
+
+/// Create a new RecordBatchWriter from IpcPayloadWriter and schema.
+///
+/// The format is implicitly the IPC stream format (allowing dictionary
+/// replacement and deltas).
+///
+/// \param[in] sink the IpcPayloadWriter to write to
+/// \param[in] schema the schema of the record batches to be written
+/// \param[in] options options for serialization
+/// \return Result<std::unique_ptr<RecordBatchWriter>>
+ARROW_EXPORT
+Result<std::unique_ptr<RecordBatchWriter>> OpenRecordBatchWriter(
+    std::unique_ptr<IpcPayloadWriter> sink, const std::shared_ptr<Schema>& schema,
+    const IpcWriteOptions& options = IpcWriteOptions::Defaults());
+
+}  // namespace internal
+}  // namespace ipc
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/api.h b/pyarrow/include/arrow/json/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..47b56684b5af7f383e6e2acee014dde6ba40d11d
--- /dev/null
+++ b/pyarrow/include/arrow/json/api.h
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/json/options.h"
+#include "arrow/json/reader.h"
diff --git a/pyarrow/include/arrow/json/chunked_builder.h b/pyarrow/include/arrow/json/chunked_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..93b327bf3ae2b63bc4439d77440b54d10e45810a
--- /dev/null
+++ b/pyarrow/include/arrow/json/chunked_builder.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+
+class PromotionGraph;
+
+class ARROW_EXPORT ChunkedArrayBuilder {
+ public:
+  virtual ~ChunkedArrayBuilder() = default;
+
+  /// Spawn a task that will try to convert and insert the given JSON block
+  virtual void Insert(int64_t block_index,
+                      const std::shared_ptr<Field>& unconverted_field,
+                      const std::shared_ptr<Array>& unconverted) = 0;
+
+  /// Return the final chunked array.
+  /// Every chunk must be inserted before this is called!
+  virtual Status Finish(std::shared_ptr<ChunkedArray>* out) = 0;
+
+  /// Finish current task group and substitute a new one
+  virtual Status ReplaceTaskGroup(
+      const std::shared_ptr<arrow::internal::TaskGroup>& task_group) = 0;
+
+ protected:
+  explicit ChunkedArrayBuilder(
+      const std::shared_ptr<arrow::internal::TaskGroup>& task_group)
+      : task_group_(task_group) {}
+
+  std::shared_ptr<arrow::internal::TaskGroup> task_group_;
+};
+
+/// create a chunked builder
+///
+/// if unexpected fields and promotion need to be handled, promotion_graph must be
+/// non-null
+ARROW_EXPORT Status MakeChunkedArrayBuilder(
+    const std::shared_ptr<arrow::internal::TaskGroup>& task_group, MemoryPool* pool,
+    const PromotionGraph* promotion_graph, const std::shared_ptr<DataType>& type,
+    std::shared_ptr<ChunkedArrayBuilder>* out);
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/chunker.h b/pyarrow/include/arrow/json/chunker.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ed85126da1412774bc216737b7f4abc3795815c
--- /dev/null
+++ b/pyarrow/include/arrow/json/chunker.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/delimiting.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+
+struct ParseOptions;
+
+ARROW_EXPORT
+std::unique_ptr<Chunker> MakeChunker(const ParseOptions& options);
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/converter.h b/pyarrow/include/arrow/json/converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a812dd3c3afaec0ccc36f3bb72fa2d1a459f4e7
--- /dev/null
+++ b/pyarrow/include/arrow/json/converter.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+class Field;
+class MemoryPool;
+
+namespace json {
+
+/// \brief interface for conversion of Arrays
+///
+/// Converters are not required to be correct for arbitrary input- only
+/// for unconverted arrays emitted by a corresponding parser.
+class ARROW_EXPORT Converter {
+ public:
+  virtual ~Converter() = default;
+
+  /// convert an array
+  /// on failure, this converter may be promoted to another converter which
+  /// *can* convert the given input.
+  virtual Status Convert(const std::shared_ptr<Array>& in,
+                         std::shared_ptr<Array>* out) = 0;
+
+  std::shared_ptr<DataType> out_type() const { return out_type_; }
+
+  MemoryPool* pool() { return pool_; }
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Converter);
+
+  Converter(MemoryPool* pool, const std::shared_ptr<DataType>& out_type)
+      : pool_(pool), out_type_(out_type) {}
+
+  MemoryPool* pool_;
+  std::shared_ptr<DataType> out_type_;
+};
+
+/// \brief produce a single converter to the specified out_type
+ARROW_EXPORT Status MakeConverter(const std::shared_ptr<DataType>& out_type,
+                                  MemoryPool* pool, std::shared_ptr<Converter>* out);
+
+class ARROW_EXPORT PromotionGraph {
+ public:
+  virtual ~PromotionGraph() = default;
+
+  /// \brief produce a valid field which will be inferred as null
+  virtual std::shared_ptr<Field> Null(const std::string& name) const = 0;
+
+  /// \brief given an unexpected field encountered during parsing, return a type to which
+  /// it may be convertible (may return null if none is available)
+  virtual std::shared_ptr<DataType> Infer(
+      const std::shared_ptr<Field>& unexpected_field) const = 0;
+
+  /// \brief given a type to which conversion failed, return a promoted type to which
+  /// conversion may succeed (may return null if none is available)
+  virtual std::shared_ptr<DataType> Promote(
+      const std::shared_ptr<DataType>& failed,
+      const std::shared_ptr<Field>& unexpected_field) const = 0;
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(PromotionGraph);
+  PromotionGraph() = default;
+};
+
+ARROW_EXPORT const PromotionGraph* GetPromotionGraph();
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/from_string.h b/pyarrow/include/arrow/json/from_string.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd5ed3d46a3b99e593eee7d38ded5183b694d3c3
--- /dev/null
+++ b/pyarrow/include/arrow/json/from_string.h
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Implement a simple JSON representation format for arrays
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class DataType;
+
+namespace json {
+
+/// \defgroup array-from-json-string FromJSONString Helpers
+///
+/// These helpers are intended to be used in examples, tests, or for quick
+/// prototyping and are not intended to be used where performance matters.
+///
+/// See the <a href="../arrays.html#fromjsonstring-helpers">User Guide</a> for
+/// more information.
+///
+/// @{
+
+/// \brief Create an Array from a JSON string
+///
+/// \code {.cpp}
+/// Result<std::shared_ptr<Array>> maybe_array =
+///     ArrayFromJSONString(int64(), "[2, 3, null, 7, 11]");
+/// \endcode
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ArrayFromJSONString(const std::shared_ptr<DataType>&,
+                                                   const std::string& json);
+
+/// \copydoc ArrayFromJSONString(const std::shared_ptr<DataType>&, const std::string&)
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ArrayFromJSONString(const std::shared_ptr<DataType>&,
+                                                   std::string_view json);
+
+/// \copydoc ArrayFromJSONString(const std::shared_ptr<DataType>&, const std::string&)
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> ArrayFromJSONString(const std::shared_ptr<DataType>&,
+                                                   const char* json);
+
+/// \brief Create a ChunkedArray from a JSON string
+///
+/// \code {.cpp}
+/// Result<std::shared_ptr<ChunkedArray>> maybe_chunked_array =
+///     ChunkedArrayFromJSONString(int64(), {R"([5, 10])", R"([null])", R"([16])"});
+/// \endcode
+ARROW_EXPORT
+Result<std::shared_ptr<ChunkedArray>> ChunkedArrayFromJSONString(
+    const std::shared_ptr<DataType>& type, const std::vector<std::string>& json_strings);
+
+/// \brief Create a DictionaryArray from a JSON string
+///
+/// \code {.cpp}
+/// Result<std::shared_ptr<Array>> maybe_dict_array =
+///     DictArrayFromJSONString(dictionary(int32(), utf8()), "[0, 1, 0, 2, 0, 3]",
+///     R"(["k1", "k2", "k3", "k4"])");
+/// \endcode
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> DictArrayFromJSONString(const std::shared_ptr<DataType>&,
+                                                       std::string_view indices_json,
+                                                       std::string_view dictionary_json);
+
+/// \brief Create a Scalar from a JSON string
+/// \code {.cpp}
+/// Result<std::shared_ptr<Scalar>> maybe_scalar =
+///     ScalarFromJSONString(float64(), "42", &scalar);
+/// \endcode
+ARROW_EXPORT
+Result<std::shared_ptr<Scalar>> ScalarFromJSONString(const std::shared_ptr<DataType>&,
+                                                     std::string_view json);
+
+/// \brief Create a DictionaryScalar from a JSON string
+/// \code {.cpp}
+/// Result<std::shared_ptr<Scalar>> maybe_dict_scalar =
+///     DictScalarFromJSONString(dictionary(int32(), utf8()), "3", R"(["k1", "k2", "k3",
+///     "k4"])", &scalar);
+/// \endcode
+ARROW_EXPORT
+Result<std::shared_ptr<Scalar>> DictScalarFromJSONString(
+    const std::shared_ptr<DataType>&, std::string_view index_json,
+    std::string_view dictionary_json);
+
+/// @}
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/object_parser.h b/pyarrow/include/arrow/json/object_parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..8035695e537cb9a022cd694993185f687ccdab04
--- /dev/null
+++ b/pyarrow/include/arrow/json/object_parser.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string_view>
+#include <unordered_map>
+
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+namespace internal {
+
+/// This class is a helper to parse a json object from a string.
+/// It uses rapidjson::Document in implementation.
+class ARROW_EXPORT ObjectParser {
+ public:
+  ObjectParser();
+  ~ObjectParser();
+
+  Status Parse(std::string_view json);
+
+  Result<std::string> GetString(const char* key) const;
+
+  Result<bool> GetBool(const char* key) const;
+
+  // Get all members of the object as a map from string keys to string values
+  Result<std::unordered_map<std::string, std::string>> GetStringMap() const;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/object_writer.h b/pyarrow/include/arrow/json/object_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf1ce62194fb89b60a37c9481716f57df545dcbe
--- /dev/null
+++ b/pyarrow/include/arrow/json/object_writer.h
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+namespace internal {
+
+/// This class is a helper to serialize a json object to a string.
+/// It uses rapidjson in implementation.
+class ARROW_EXPORT ObjectWriter {
+ public:
+  ObjectWriter();
+  ~ObjectWriter();
+
+  void SetString(std::string_view key, std::string_view value);
+  void SetBool(std::string_view key, bool value);
+
+  std::string Serialize();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/options.h b/pyarrow/include/arrow/json/options.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7edab9ceddb4d4e2d5c79b8652d7d47d0557b55
--- /dev/null
+++ b/pyarrow/include/arrow/json/options.h
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/json/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class DataType;
+class Schema;
+
+namespace json {
+
+enum class UnexpectedFieldBehavior : char {
+  /// Unexpected JSON fields are ignored
+  Ignore,
+  /// Unexpected JSON fields error out
+  Error,
+  /// Unexpected JSON fields are type-inferred and included in the output
+  InferType
+};
+
+struct ARROW_EXPORT ParseOptions {
+  // Parsing options
+
+  /// Optional explicit schema (disables type inference on those fields)
+  std::shared_ptr<Schema> explicit_schema;
+
+  /// Whether objects may be printed across multiple lines (for example pretty-printed)
+  ///
+  /// If true, parsing may be slower.
+  bool newlines_in_values = false;
+
+  /// How JSON fields outside of explicit_schema (if given) are treated
+  UnexpectedFieldBehavior unexpected_field_behavior = UnexpectedFieldBehavior::InferType;
+
+  /// Create parsing options with default values
+  static ParseOptions Defaults();
+};
+
+struct ARROW_EXPORT ReadOptions {
+  // Reader options
+
+  /// Whether to use the global CPU thread pool
+  bool use_threads = true;
+  /// Block size we request from the IO layer; also determines the size of
+  /// chunks when use_threads is true
+  int32_t block_size = 1 << 20;  // 1 MB
+
+  /// Create read options with default values
+  static ReadOptions Defaults();
+};
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/parser.h b/pyarrow/include/arrow/json/parser.h
new file mode 100644
index 0000000000000000000000000000000000000000..aca416dbb7b5b4915cb8d1f74d932989cde286dd
--- /dev/null
+++ b/pyarrow/include/arrow/json/parser.h
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/json/options.h"
+#include "arrow/status.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class Buffer;
+class MemoryPool;
+class KeyValueMetadata;
+class ResizableBuffer;
+
+namespace json {
+
+struct Kind {
+  enum type : uint8_t {
+    kNull,
+    kBoolean,
+    kNumber,
+    kString,
+    kArray,
+    kObject,
+    kNumberOrString
+  };
+
+  static const std::string& Name(Kind::type);
+
+  static const std::shared_ptr<const KeyValueMetadata>& Tag(Kind::type);
+
+  static Kind::type FromTag(const std::shared_ptr<const KeyValueMetadata>& tag);
+
+  static Status ForType(const DataType& type, Kind::type* kind);
+};
+
+/// \class BlockParser
+/// \brief A reusable block-based parser for JSON data
+///
+/// The parser takes a block of newline delimited JSON data and extracts Arrays
+/// of unconverted strings which can be fed to a Converter to obtain a usable Array.
+///
+/// Note that in addition to parse errors (such as malformed JSON) some conversion
+/// errors are caught at parse time:
+/// - A null value in non-nullable column
+/// - Change in the JSON kind of a column. For example, if an explicit schema is provided
+///   which stipulates that field "a" is integral, a row of {"a": "not a number"} will
+///   result in an error. This also applies to fields outside an explicit schema.
+class ARROW_EXPORT BlockParser {
+ public:
+  virtual ~BlockParser() = default;
+
+  /// \brief Reserve storage for scalars parsed from a block of json
+  virtual Status ReserveScalarStorage(int64_t nbytes) = 0;
+
+  /// \brief Parse a block of data
+  virtual Status Parse(const std::shared_ptr<Buffer>& json) = 0;
+
+  /// \brief Extract parsed data
+  virtual Status Finish(std::shared_ptr<Array>* parsed) = 0;
+
+  /// \brief Return the number of parsed rows
+  int32_t num_rows() const { return num_rows_; }
+
+  /// \brief Construct a BlockParser
+  ///
+  /// \param[in] pool MemoryPool to use when constructing parsed array
+  /// \param[in] options ParseOptions to use when parsing JSON
+  /// \param[out] out constructed BlockParser
+  static Status Make(MemoryPool* pool, const ParseOptions& options,
+                     std::unique_ptr<BlockParser>* out);
+
+  static Status Make(const ParseOptions& options, std::unique_ptr<BlockParser>* out);
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(BlockParser);
+
+  explicit BlockParser(MemoryPool* pool) : pool_(pool) {}
+
+  MemoryPool* pool_;
+  int32_t num_rows_ = 0;
+};
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/rapidjson_defs.h b/pyarrow/include/arrow/json/rapidjson_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..2354c6157263a46edf87bc048b713a6a0d9387c7
--- /dev/null
+++ b/pyarrow/include/arrow/json/rapidjson_defs.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Include this file before including any RapidJSON headers.
+
+#pragma once
+
+#define RAPIDJSON_HAS_STDSTRING 1
+#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1
+#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1
+
+// rapidjson will be defined in namespace arrow::rapidjson
+#define RAPIDJSON_NAMESPACE arrow::rapidjson
+#define RAPIDJSON_NAMESPACE_BEGIN \
+  namespace arrow {               \
+  namespace rapidjson {
+#define RAPIDJSON_NAMESPACE_END \
+  }                             \
+  }
+
+// enable SIMD whitespace skipping, if available
+#if defined(ARROW_HAVE_SSE4_2)
+#  define RAPIDJSON_SSE2 1
+#  define RAPIDJSON_SSE42 1
+#endif
+
+#if defined(ARROW_HAVE_NEON)
+#  define RAPIDJSON_NEON 1
+#endif
diff --git a/pyarrow/include/arrow/json/reader.h b/pyarrow/include/arrow/json/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7849a83ba1f88e54961df5a1e9739afe24ba026
--- /dev/null
+++ b/pyarrow/include/arrow/json/reader.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/io/type_fwd.h"
+#include "arrow/json/options.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace json {
+
+/// A class that reads an entire JSON file into a Arrow Table
+///
+/// The file is expected to consist of individual line-separated JSON objects
+class ARROW_EXPORT TableReader {
+ public:
+  virtual ~TableReader() = default;
+
+  /// Read the entire JSON file and convert it to a Arrow Table
+  virtual Result<std::shared_ptr<Table>> Read() = 0;
+
+  /// Create a TableReader instance
+  static Result<std::shared_ptr<TableReader>> Make(MemoryPool* pool,
+                                                   std::shared_ptr<io::InputStream> input,
+                                                   const ReadOptions&,
+                                                   const ParseOptions&);
+};
+
+ARROW_EXPORT Result<std::shared_ptr<RecordBatch>> ParseOne(ParseOptions options,
+                                                           std::shared_ptr<Buffer> json);
+
+/// \brief A class that reads a JSON file incrementally
+///
+/// JSON data is read from a stream in fixed-size blocks (configurable with
+/// `ReadOptions::block_size`). Each block is converted to a `RecordBatch`. Yielded
+/// batches have a consistent schema but may differ in row count.
+///
+/// The supplied `ParseOptions` are used to determine a schema, based either on a
+/// provided explicit schema or inferred from the first non-empty block.
+/// Afterwards, the target schema is frozen. If `UnexpectedFieldBehavior::InferType` is
+/// specified, unexpected fields will only be inferred for the first block. Afterwards
+/// they'll be treated as errors.
+///
+/// If `ReadOptions::use_threads` is `true`, each block's parsing/decoding task will be
+/// parallelized on the given `cpu_executor` (with readahead corresponding to the
+/// executor's capacity). If an executor isn't provided, the global thread pool will be
+/// used.
+///
+/// If `ReadOptions::use_threads` is `false`, computations will be run on the calling
+/// thread and `cpu_executor` will be ignored.
+class ARROW_EXPORT StreamingReader : public RecordBatchReader {
+ public:
+  virtual ~StreamingReader() = default;
+
+  /// \brief Read the next `RecordBatch` asynchronously
+  /// This function is async-reentrant (but not synchronously reentrant). However, if
+  /// threading is disabled, this will block until completion.
+  virtual Future<std::shared_ptr<RecordBatch>> ReadNextAsync() = 0;
+
+  /// Get the number of bytes which have been successfully converted to record batches
+  /// and consumed
+  [[nodiscard]] virtual int64_t bytes_processed() const = 0;
+
+  /// \brief Create a `StreamingReader` from an `InputStream`
+  /// Blocks until the initial batch is loaded
+  ///
+  /// \param[in] stream JSON source stream
+  /// \param[in] read_options Options for reading
+  /// \param[in] parse_options Options for chunking, parsing, and conversion
+  /// \param[in] io_context Context for IO operations (optional)
+  /// \param[in] cpu_executor Executor for computation tasks (optional)
+  /// \return The initialized reader
+  static Result<std::shared_ptr<StreamingReader>> Make(
+      std::shared_ptr<io::InputStream> stream, const ReadOptions& read_options,
+      const ParseOptions& parse_options,
+      const io::IOContext& io_context = io::default_io_context(),
+      ::arrow::internal::Executor* cpu_executor = NULLPTR);
+
+  /// \brief Create a `StreamingReader` from an `InputStream` asynchronously
+  /// Returned future completes after loading the first batch
+  ///
+  /// \param[in] stream JSON source stream
+  /// \param[in] read_options Options for reading
+  /// \param[in] parse_options Options for chunking, parsing, and conversion
+  /// \param[in] io_context Context for IO operations (optional)
+  /// \param[in] cpu_executor Executor for computation tasks (optional)
+  /// \return Future for the initialized reader
+  static Future<std::shared_ptr<StreamingReader>> MakeAsync(
+      std::shared_ptr<io::InputStream> stream, const ReadOptions& read_options,
+      const ParseOptions& parse_options,
+      const io::IOContext& io_context = io::default_io_context(),
+      ::arrow::internal::Executor* cpu_executor = NULLPTR);
+};
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/test_common.h b/pyarrow/include/arrow/json/test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f819779bdb5940b081a2a41756d3a6510260476
--- /dev/null
+++ b/pyarrow/include/arrow/json/test_common.h
@@ -0,0 +1,330 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <random>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/io/memory.h"
+#include "arrow/json/converter.h"
+#include "arrow/json/options.h"
+#include "arrow/json/parser.h"
+#include "arrow/json/rapidjson_defs.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/visit_type_inline.h"
+
+#include "rapidjson/document.h"
+#include "rapidjson/prettywriter.h"
+#include "rapidjson/reader.h"
+#include "rapidjson/writer.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace json {
+
+namespace rj = arrow::rapidjson;
+
+using rj::StringBuffer;
+using std::string_view;
+using Writer = rj::Writer<StringBuffer>;
+
+struct GenerateOptions {
+  // Probability of a field being written
+  double field_probability = 1.0;
+  // Probability of a value being null
+  double null_probability = 0.2;
+  // Whether to randomize the order of written fields
+  bool randomize_field_order = false;
+
+  static constexpr GenerateOptions Defaults() { return GenerateOptions{}; }
+};
+
+inline static Status OK(bool ok) { return ok ? Status::OK() : Status::Invalid(""); }
+
+template <typename Engine>
+inline static Status Generate(
+    const std::shared_ptr<DataType>& type, Engine& e, Writer* writer,
+    const GenerateOptions& options = GenerateOptions::Defaults());
+
+template <typename Engine>
+inline static Status Generate(
+    const std::vector<std::shared_ptr<Field>>& fields, Engine& e, Writer* writer,
+    const GenerateOptions& options = GenerateOptions::Defaults());
+
+template <typename Engine>
+inline static Status Generate(
+    const std::shared_ptr<Schema>& schm, Engine& e, Writer* writer,
+    const GenerateOptions& options = GenerateOptions::Defaults()) {
+  return Generate(schm->fields(), e, writer, options);
+}
+
+template <typename Engine>
+struct GenerateImpl {
+  Status Visit(const NullType&) { return OK(writer.Null()); }
+
+  Status Visit(const BooleanType&) {
+    return OK(writer.Bool(std::uniform_int_distribution<uint16_t>{}(e)&1));
+  }
+
+  template <typename T>
+  enable_if_physical_unsigned_integer<T, Status> Visit(const T&) {
+    auto val = std::uniform_int_distribution<>{}(e);
+    return OK(writer.Uint64(static_cast<typename T::c_type>(val)));
+  }
+
+  template <typename T>
+  enable_if_physical_signed_integer<T, Status> Visit(const T&) {
+    auto val = std::uniform_int_distribution<>{}(e);
+    return OK(writer.Int64(static_cast<typename T::c_type>(val)));
+  }
+
+  template <typename T>
+  enable_if_physical_floating_point<T, Status> Visit(const T&) {
+    auto val = std::normal_distribution<typename T::c_type>{0, 1 << 10}(e);
+    return OK(writer.Double(val));
+  }
+
+  Status GenerateAscii(const DataType&) {
+    auto size = std::poisson_distribution<>{4}(e);
+    std::uniform_int_distribution<uint16_t> gen_char(32, 126);  // FIXME generate UTF8
+    std::string s(size, '\0');
+    for (char& ch : s) ch = static_cast<char>(gen_char(e));
+    return OK(writer.String(s.c_str()));
+  }
+
+  template <typename T>
+  enable_if_base_binary<T, Status> Visit(const T& t) {
+    return GenerateAscii(t);
+  }
+
+  Status Visit(const BinaryViewType& t) { return GenerateAscii(t); }
+
+  template <typename T>
+  enable_if_list_like<T, Status> Visit(const T& t) {
+    auto size = std::poisson_distribution<>{4}(e);
+    writer.StartArray();
+    for (int i = 0; i < size; ++i) {
+      RETURN_NOT_OK(Generate(t.value_type(), e, &writer, options));
+    }
+    return OK(writer.EndArray(size));
+  }
+
+  Status Visit(const ListViewType& t) { return NotImplemented(t); }
+
+  Status Visit(const LargeListViewType& t) { return NotImplemented(t); }
+
+  Status Visit(const StructType& t) { return Generate(t.fields(), e, &writer, options); }
+
+  Status Visit(const DayTimeIntervalType& t) { return NotImplemented(t); }
+
+  Status Visit(const MonthDayNanoIntervalType& t) { return NotImplemented(t); }
+
+  Status Visit(const DictionaryType& t) { return NotImplemented(t); }
+
+  Status Visit(const ExtensionType& t) { return NotImplemented(t); }
+
+  Status Visit(const Decimal128Type& t) { return NotImplemented(t); }
+
+  Status Visit(const FixedSizeBinaryType& t) { return NotImplemented(t); }
+
+  Status Visit(const UnionType& t) { return NotImplemented(t); }
+
+  Status Visit(const RunEndEncodedType& t) { return NotImplemented(t); }
+
+  Status NotImplemented(const DataType& t) {
+    return Status::NotImplemented("random generation of arrays of type ", t);
+  }
+
+  Engine& e;
+  rj::Writer<rj::StringBuffer>& writer;
+  const GenerateOptions& options;
+};
+
+template <typename Engine>
+inline static Status Generate(const std::shared_ptr<DataType>& type, Engine& e,
+                              Writer* writer, const GenerateOptions& options) {
+  if (std::bernoulli_distribution(options.null_probability)(e)) {
+    writer->Null();
+    return Status::OK();
+  }
+  GenerateImpl<Engine> visitor = {e, *writer, options};
+  return VisitTypeInline(*type, &visitor);
+}
+
+template <typename Engine>
+inline static Status Generate(const std::vector<std::shared_ptr<Field>>& fields,
+                              Engine& e, Writer* writer, const GenerateOptions& options) {
+  RETURN_NOT_OK(OK(writer->StartObject()));
+
+  int num_fields = 0;
+  auto write_field = [&](const Field& f) {
+    ++num_fields;
+    writer->Key(f.name().c_str());
+    return Generate(f.type(), e, writer, options);
+  };
+
+  std::bernoulli_distribution bool_dist(options.field_probability);
+  if (options.randomize_field_order) {
+    std::vector<size_t> indices;
+    indices.reserve(static_cast<size_t>(fields.size() * options.field_probability));
+    for (size_t i = 0; i < fields.size(); ++i) {
+      if (bool_dist(e)) {
+        indices.push_back(i);
+      }
+    }
+    std::shuffle(indices.begin(), indices.end(), e);
+    for (auto i : indices) {
+      RETURN_NOT_OK(write_field(*fields[i]));
+    }
+  } else {
+    for (const auto& f : fields) {
+      if (bool_dist(e)) {
+        RETURN_NOT_OK(write_field(*f));
+      }
+    }
+  }
+
+  return OK(writer->EndObject(num_fields));
+}
+
+inline static Status MakeStream(string_view src_str,
+                                std::shared_ptr<io::InputStream>* out) {
+  auto src = std::make_shared<Buffer>(src_str);
+  *out = std::make_shared<io::BufferReader>(src);
+  return Status::OK();
+}
+
+// scalar values (numbers and strings) are parsed into a
+// dictionary<index:int32, value:string>. This can be decoded for ease of comparison
+inline static Status DecodeStringDictionary(const DictionaryArray& dict_array,
+                                            std::shared_ptr<Array>* decoded) {
+  const StringArray& dict = checked_cast<const StringArray&>(*dict_array.dictionary());
+  const Int32Array& indices = checked_cast<const Int32Array&>(*dict_array.indices());
+  StringBuilder builder;
+  RETURN_NOT_OK(builder.Resize(indices.length()));
+  for (int64_t i = 0; i < indices.length(); ++i) {
+    if (indices.IsNull(i)) {
+      builder.UnsafeAppendNull();
+      continue;
+    }
+    auto value = dict.GetView(indices.GetView(i));
+    RETURN_NOT_OK(builder.ReserveData(value.size()));
+    builder.UnsafeAppend(value);
+  }
+  return builder.Finish(decoded);
+}
+
+inline static Status ParseFromString(ParseOptions options, string_view src_str,
+                                     std::shared_ptr<Array>* parsed) {
+  auto src = std::make_shared<Buffer>(src_str);
+  std::unique_ptr<BlockParser> parser;
+  RETURN_NOT_OK(BlockParser::Make(options, &parser));
+  RETURN_NOT_OK(parser->Parse(src));
+  return parser->Finish(parsed);
+}
+
+inline static Status ParseFromString(ParseOptions options, string_view src_str,
+                                     std::shared_ptr<StructArray>* parsed) {
+  std::shared_ptr<Array> parsed_non_struct;
+  RETURN_NOT_OK(ParseFromString(options, src_str, &parsed_non_struct));
+  *parsed = internal::checked_pointer_cast<StructArray>(parsed_non_struct);
+  return Status::OK();
+}
+
+static inline std::string PrettyPrint(string_view one_line) {
+  rj::Document document;
+
+  // Must pass size to avoid ASAN issues.
+  document.Parse(one_line.data(), one_line.size());
+  rj::StringBuffer sb;
+  rj::PrettyWriter<rj::StringBuffer> writer(sb);
+  document.Accept(writer);
+  return sb.GetString();
+}
+
+template <typename T>
+std::string RowsOfOneColumn(std::string_view name, std::initializer_list<T> values,
+                            decltype(std::to_string(*values.begin()))* = nullptr) {
+  std::stringstream ss;
+  for (auto value : values) {
+    ss << R"({")" << name << R"(":)" << std::to_string(value) << "}\n";
+  }
+  return ss.str();
+}
+
+inline std::string RowsOfOneColumn(std::string_view name,
+                                   std::initializer_list<std::string> values) {
+  std::stringstream ss;
+  for (auto value : values) {
+    ss << R"({")" << name << R"(":)" << value << "}\n";
+  }
+  return ss.str();
+}
+
+inline static std::string scalars_only_src() {
+  return R"(
+    { "hello": 3.5, "world": false, "yo": "thing" }
+    { "hello": 3.25, "world": null }
+    { "hello": 3.125, "world": null, "yo": "\u5fcd" }
+    { "hello": 0.0, "world": true, "yo": null }
+  )";
+}
+
+inline static std::string nested_src() {
+  return R"(
+    { "hello": 3.5, "world": false, "yo": "thing", "arr": [1, 2, 3], "nuf": {} }
+    { "hello": 3.25, "world": null, "arr": [2], "nuf": null }
+    { "hello": 3.125, "world": null, "yo": "\u5fcd", "arr": [], "nuf": { "ps": 78 } }
+    { "hello": 0.0, "world": true, "yo": null, "arr": null, "nuf": { "ps": 90 } }
+  )";
+}
+
+inline static std::string null_src() {
+  return R"(
+    { "plain": null, "list1": [], "list2": [], "struct": { "plain": null } }
+    { "plain": null, "list1": [], "list2": [null], "struct": {} }
+  )";
+}
+
+inline static std::string unquoted_decimal_src() {
+  return R"(
+    { "price": 30.04, "cost":30.001 }
+    { "price": 1.23, "cost":1.229 }
+  )";
+}
+
+inline static std::string mixed_decimal_src() {
+  return R"(
+    { "price": 30.04, "cost": 30.001 }
+    { "price": "1.23", "cost": "1.229" }
+  )";
+}
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/json/type_fwd.h b/pyarrow/include/arrow/json/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..67e2e1bb4065d0bc238d04073f673a699c5da4ea
--- /dev/null
+++ b/pyarrow/include/arrow/json/type_fwd.h
@@ -0,0 +1,26 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+namespace arrow {
+namespace json {
+
+class TableReader;
+struct ReadOptions;
+struct ParseOptions;
+
+}  // namespace json
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/memory_pool.h b/pyarrow/include/arrow/memory_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..07d17e530b10d85ef9267ada300af6bd469e6a93
--- /dev/null
+++ b/pyarrow/include/arrow/memory_pool.h
@@ -0,0 +1,348 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <string>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace internal {
+
+///////////////////////////////////////////////////////////////////////
+// Helper tracking memory statistics
+
+/// \brief Memory pool statistics
+///
+/// 64-byte aligned so that all atomic values are on the same cache line.
+class alignas(64) MemoryPoolStats {
+ private:
+  // All atomics are updated according to Acquire-Release ordering.
+  // https://en.cppreference.com/w/cpp/atomic/memory_order#Release-Acquire_ordering
+  //
+  // max_memory_, total_allocated_bytes_, and num_allocs_ only go up (they are
+  // monotonically increasing) which can allow some optimizations.
+  std::atomic<int64_t> max_memory_{0};
+  std::atomic<int64_t> bytes_allocated_{0};
+  std::atomic<int64_t> total_allocated_bytes_{0};
+  std::atomic<int64_t> num_allocs_{0};
+
+ public:
+  int64_t max_memory() const { return max_memory_.load(std::memory_order_acquire); }
+
+  int64_t bytes_allocated() const {
+    return bytes_allocated_.load(std::memory_order_acquire);
+  }
+
+  int64_t total_bytes_allocated() const {
+    return total_allocated_bytes_.load(std::memory_order_acquire);
+  }
+
+  int64_t num_allocations() const { return num_allocs_.load(std::memory_order_acquire); }
+
+  inline void DidAllocateBytes(int64_t size) {
+    // Issue the load before everything else. max_memory_ is monotonically increasing,
+    // so we can use a relaxed load before the read-modify-write.
+    auto max_memory = max_memory_.load(std::memory_order_relaxed);
+    const auto old_bytes_allocated =
+        bytes_allocated_.fetch_add(size, std::memory_order_acq_rel);
+    // Issue store operations on values that we don't depend on to proceed
+    // with execution. When done, max_memory and old_bytes_allocated have
+    // a higher chance of being available on CPU registers. This also has the
+    // nice side-effect of putting 3 atomic stores close to each other in the
+    // instruction stream.
+    total_allocated_bytes_.fetch_add(size, std::memory_order_acq_rel);
+    num_allocs_.fetch_add(1, std::memory_order_acq_rel);
+
+    // If other threads are updating max_memory_ concurrently we leave the loop without
+    // updating knowing that it already reached a value even higher than ours.
+    const auto allocated = old_bytes_allocated + size;
+    while (max_memory < allocated && !max_memory_.compare_exchange_weak(
+                                         /*expected=*/max_memory, /*desired=*/allocated,
+                                         std::memory_order_acq_rel)) {
+    }
+  }
+
+  inline void DidReallocateBytes(int64_t old_size, int64_t new_size) {
+    if (new_size > old_size) {
+      DidAllocateBytes(new_size - old_size);
+    } else {
+      DidFreeBytes(old_size - new_size);
+    }
+  }
+
+  inline void DidFreeBytes(int64_t size) {
+    bytes_allocated_.fetch_sub(size, std::memory_order_acq_rel);
+  }
+};
+
+}  // namespace internal
+
+/// Base class for memory allocation on the CPU.
+///
+/// Besides tracking the number of allocated bytes, the allocator also should
+/// take care of the required 64-byte alignment.
+class ARROW_EXPORT MemoryPool {
+ public:
+  virtual ~MemoryPool() = default;
+
+  /// \brief EXPERIMENTAL. Create a new instance of the default MemoryPool
+  static std::unique_ptr<MemoryPool> CreateDefault();
+
+  /// Allocate a new memory region of at least size bytes.
+  ///
+  /// The allocated region shall be 64-byte aligned.
+  Status Allocate(int64_t size, uint8_t** out) {
+    return Allocate(size, kDefaultBufferAlignment, out);
+  }
+
+  /// Allocate a new memory region of at least size bytes aligned to alignment.
+  virtual Status Allocate(int64_t size, int64_t alignment, uint8_t** out) = 0;
+
+  /// Resize an already allocated memory section.
+  ///
+  /// As by default most default allocators on a platform don't support aligned
+  /// reallocation, this function can involve a copy of the underlying data.
+  virtual Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
+                            uint8_t** ptr) = 0;
+  Status Reallocate(int64_t old_size, int64_t new_size, uint8_t** ptr) {
+    return Reallocate(old_size, new_size, kDefaultBufferAlignment, ptr);
+  }
+
+  /// Free an allocated region.
+  ///
+  /// @param buffer Pointer to the start of the allocated memory region
+  /// @param size Allocated size located at buffer. An allocator implementation
+  ///   may use this for tracking the amount of allocated bytes as well as for
+  ///   faster deallocation if supported by its backend.
+  /// @param alignment The alignment of the allocation. Defaults to 64 bytes.
+  virtual void Free(uint8_t* buffer, int64_t size, int64_t alignment) = 0;
+  void Free(uint8_t* buffer, int64_t size) {
+    Free(buffer, size, kDefaultBufferAlignment);
+  }
+
+  /// Return unused memory to the OS
+  ///
+  /// Only applies to allocators that hold onto unused memory.  This will be
+  /// best effort, a memory pool may not implement this feature or may be
+  /// unable to fulfill the request due to fragmentation.
+  virtual void ReleaseUnused() {}
+
+  /// Print statistics
+  ///
+  /// Print allocation statistics on stderr. The output format is
+  /// implementation-specific. Not all memory pools implement this method.
+  virtual void PrintStats() {}
+
+  /// The number of bytes that were allocated and not yet free'd through
+  /// this allocator.
+  virtual int64_t bytes_allocated() const = 0;
+
+  /// Return peak memory allocation in this memory pool
+  ///
+  /// \return Maximum bytes allocated. If not known (or not implemented),
+  /// returns -1
+  virtual int64_t max_memory() const;
+
+  /// The number of bytes that were allocated.
+  virtual int64_t total_bytes_allocated() const = 0;
+
+  /// The number of allocations or reallocations that were requested.
+  virtual int64_t num_allocations() const = 0;
+
+  /// The name of the backend used by this MemoryPool (e.g. "system" or "jemalloc").
+  virtual std::string backend_name() const = 0;
+
+ protected:
+  MemoryPool() = default;
+};
+
+class ARROW_EXPORT LoggingMemoryPool : public MemoryPool {
+ public:
+  explicit LoggingMemoryPool(MemoryPool* pool);
+  ~LoggingMemoryPool() override = default;
+
+  using MemoryPool::Allocate;
+  using MemoryPool::Free;
+  using MemoryPool::Reallocate;
+
+  Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override;
+  Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
+                    uint8_t** ptr) override;
+  void Free(uint8_t* buffer, int64_t size, int64_t alignment) override;
+  void ReleaseUnused() override;
+  void PrintStats() override;
+
+  int64_t bytes_allocated() const override;
+
+  int64_t max_memory() const override;
+
+  int64_t total_bytes_allocated() const override;
+
+  int64_t num_allocations() const override;
+
+  std::string backend_name() const override;
+
+ private:
+  MemoryPool* pool_;
+};
+
+/// Derived class for memory allocation.
+///
+/// Tracks the number of bytes and maximum memory allocated through its direct
+/// calls. Actual allocation is delegated to MemoryPool class.
+class ARROW_EXPORT ProxyMemoryPool : public MemoryPool {
+ public:
+  explicit ProxyMemoryPool(MemoryPool* pool);
+  ~ProxyMemoryPool() override;
+
+  using MemoryPool::Allocate;
+  using MemoryPool::Free;
+  using MemoryPool::Reallocate;
+
+  Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override;
+  Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
+                    uint8_t** ptr) override;
+  void Free(uint8_t* buffer, int64_t size, int64_t alignment) override;
+  void ReleaseUnused() override;
+  void PrintStats() override;
+
+  int64_t bytes_allocated() const override;
+
+  int64_t max_memory() const override;
+
+  int64_t total_bytes_allocated() const override;
+
+  int64_t num_allocations() const override;
+
+  std::string backend_name() const override;
+
+ private:
+  class ProxyMemoryPoolImpl;
+  std::unique_ptr<ProxyMemoryPoolImpl> impl_;
+};
+
+/// EXPERIMENTAL MemoryPool wrapper with an upper limit
+///
+/// Checking for limits is not done in a fully thread-safe way, therefore
+/// multi-threaded allocations might be able to go successfully above the
+/// configured limit.
+class ARROW_EXPORT CappedMemoryPool : public MemoryPool {
+ public:
+  CappedMemoryPool(MemoryPool* wrapped_pool, int64_t bytes_allocated_limit)
+      : wrapped_(wrapped_pool), bytes_allocated_limit_(bytes_allocated_limit) {}
+
+  using MemoryPool::Allocate;
+  using MemoryPool::Reallocate;
+
+  Status Allocate(int64_t size, int64_t alignment, uint8_t** out) override;
+  Status Reallocate(int64_t old_size, int64_t new_size, int64_t alignment,
+                    uint8_t** ptr) override;
+  void Free(uint8_t* buffer, int64_t size, int64_t alignment) override;
+
+  void ReleaseUnused() override { wrapped_->ReleaseUnused(); }
+
+  void PrintStats() override { wrapped_->PrintStats(); }
+
+  int64_t bytes_allocated() const override { return wrapped_->bytes_allocated(); }
+
+  int64_t max_memory() const override { return wrapped_->max_memory(); }
+
+  int64_t total_bytes_allocated() const override {
+    return wrapped_->total_bytes_allocated();
+  }
+
+  int64_t num_allocations() const override { return wrapped_->num_allocations(); }
+
+  std::string backend_name() const override { return wrapped_->backend_name(); }
+
+ private:
+  Status OutOfMemory(int64_t current_allocated, int64_t requested) const;
+
+  MemoryPool* wrapped_;
+  const int64_t bytes_allocated_limit_;
+};
+
+/// \brief Return a process-wide memory pool based on the system allocator.
+ARROW_EXPORT MemoryPool* system_memory_pool();
+
+/// \brief Return a process-wide memory pool based on jemalloc.
+///
+/// May return NotImplemented if jemalloc is not available.
+ARROW_EXPORT Status jemalloc_memory_pool(MemoryPool** out);
+
+/// \brief Set jemalloc memory page purging behavior for future-created arenas
+/// to the indicated number of milliseconds. See dirty_decay_ms and
+/// muzzy_decay_ms options in jemalloc for a description of what these do. The
+/// default is configured to 1000 (1 second) which releases memory more
+/// aggressively to the operating system than the jemalloc default of 10
+/// seconds. If you set the value to 0, dirty / muzzy pages will be released
+/// immediately rather than with a time decay, but this may reduce application
+/// performance.
+ARROW_EXPORT
+Status jemalloc_set_decay_ms(int ms);
+
+/// \brief Get basic statistics from jemalloc's mallctl.
+/// See the MALLCTL NAMESPACE section in jemalloc project documentation for
+/// available stats.
+ARROW_EXPORT
+Result<int64_t> jemalloc_get_stat(const char* name);
+
+/// \brief Reset the counter for peak bytes allocated in the calling thread to zero.
+/// This affects subsequent calls to thread.peak.read, but not the values returned by
+/// thread.allocated or thread.deallocated.
+ARROW_EXPORT
+Status jemalloc_peak_reset();
+
+/// \brief Print summary statistics in human-readable form to stderr.
+/// See malloc_stats_print documentation in jemalloc project documentation for
+/// available opt flags.
+ARROW_EXPORT
+Status jemalloc_stats_print(const char* opts = "");
+
+/// \brief Print summary statistics in human-readable form using a callback
+/// See malloc_stats_print documentation in jemalloc project documentation for
+/// available opt flags.
+ARROW_EXPORT
+Status jemalloc_stats_print(std::function<void(const char*)> write_cb,
+                            const char* opts = "");
+
+/// \brief Get summary statistics in human-readable form.
+/// See malloc_stats_print documentation in jemalloc project documentation for
+/// available opt flags.
+ARROW_EXPORT
+Result<std::string> jemalloc_stats_string(const char* opts = "");
+
+/// \brief Return a process-wide memory pool based on mimalloc.
+///
+/// May return NotImplemented if mimalloc is not available.
+ARROW_EXPORT Status mimalloc_memory_pool(MemoryPool** out);
+
+/// \brief Return the names of the backends supported by this Arrow build.
+ARROW_EXPORT std::vector<std::string> SupportedMemoryBackendNames();
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/memory_pool_test.h b/pyarrow/include/arrow/memory_pool_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..32f1cc5d1d310a90e80d16210c72a8825c074767
--- /dev/null
+++ b/pyarrow/include/arrow/memory_pool_test.h
@@ -0,0 +1,111 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+
+#include <gtest/gtest.h>
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/testing/gtest_util.h"
+
+namespace arrow {
+
+class TestMemoryPoolBase : public ::testing::Test {
+ public:
+  virtual ::arrow::MemoryPool* memory_pool() = 0;
+
+  void TestMemoryTracking() {
+    auto pool = memory_pool();
+
+    uint8_t* data;
+    const auto old_bytes_allocated = pool->bytes_allocated();
+    ASSERT_OK(pool->Allocate(100, &data));
+    EXPECT_EQ(static_cast<uint64_t>(0), reinterpret_cast<uint64_t>(data) % 64);
+    ASSERT_EQ(old_bytes_allocated + 100, pool->bytes_allocated());
+
+    uint8_t* data2;
+    ASSERT_OK(pool->Allocate(27, &data2));
+    EXPECT_EQ(static_cast<uint64_t>(0), reinterpret_cast<uint64_t>(data2) % 64);
+    ASSERT_EQ(old_bytes_allocated + 127, pool->bytes_allocated());
+
+    pool->Free(data, 100);
+    ASSERT_EQ(old_bytes_allocated + 27, pool->bytes_allocated());
+    pool->Free(data2, 27);
+    ASSERT_EQ(old_bytes_allocated, pool->bytes_allocated());
+  }
+
+  void TestOOM() {
+    auto pool = memory_pool();
+
+    uint8_t* data;
+    int64_t max_alloc = std::min<uint64_t>(std::numeric_limits<int64_t>::max(),
+                                           std::numeric_limits<size_t>::max());
+    // subtract 63 to prevent overflow after the size is aligned
+    for (int64_t to_alloc : {max_alloc, max_alloc - 63, max_alloc - 127}) {
+      ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data));
+    }
+  }
+
+  void TestReallocate() {
+    auto pool = memory_pool();
+
+    uint8_t* data;
+    ASSERT_OK(pool->Allocate(10, &data));
+    ASSERT_EQ(10, pool->bytes_allocated());
+    data[0] = 35;
+    data[9] = 12;
+
+    // Expand
+    ASSERT_OK(pool->Reallocate(10, 20, &data));
+    ASSERT_EQ(data[9], 12);
+    ASSERT_EQ(20, pool->bytes_allocated());
+
+    // Shrink
+    ASSERT_OK(pool->Reallocate(20, 5, &data));
+    ASSERT_EQ(data[0], 35);
+    ASSERT_EQ(5, pool->bytes_allocated());
+
+    // Free
+    pool->Free(data, 5);
+    ASSERT_EQ(0, pool->bytes_allocated());
+  }
+
+  void TestAlignment() {
+    auto pool = memory_pool();
+    {
+      uint8_t* data64;
+      ASSERT_OK(pool->Allocate(10, &data64));
+      ASSERT_EQ(reinterpret_cast<uintptr_t>(data64) % kDefaultBufferAlignment, 0);
+      pool->Free(data64, 10);
+    }
+
+    {
+      uint8_t* data512;
+      ASSERT_OK(pool->Allocate(10, 512, &data512));
+      ASSERT_EQ(reinterpret_cast<uintptr_t>(data512) % 512, 0);
+      pool->Free(data512, 10, 512);
+    }
+  }
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/pretty_print.h b/pyarrow/include/arrow/pretty_print.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e5eca4300b0303127e303ba5a20e676b71c7a46
--- /dev/null
+++ b/pyarrow/include/arrow/pretty_print.h
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <string>
+#include <utility>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+
+/// \class PrettyPrintDelimiters
+/// \brief Options for controlling which delimiters to use when printing
+/// an Array or ChunkedArray.
+struct ARROW_EXPORT PrettyPrintDelimiters {
+  /// Delimiter to use when opening an Array or ChunkedArray (e.g. "[")
+  std::string open = "[";
+
+  /// Delimiter to use when closing an Array or ChunkedArray (e.g. "]")
+  std::string close = "]";
+
+  /// Delimiter for separating individual elements of an Array (e.g. ","),
+  /// or individual chunks of a ChunkedArray
+  std::string element = ",";
+
+  /// Create a PrettyPrintDelimiters instance with default values
+  static PrettyPrintDelimiters Defaults() { return PrettyPrintDelimiters(); }
+};
+
+/// \class PrettyPrintOptions
+/// \brief Options for controlling how various Arrow types should be printed.
+struct ARROW_EXPORT PrettyPrintOptions {
+  PrettyPrintOptions() = default;
+
+  PrettyPrintOptions(int indent,  // NOLINT runtime/explicit
+                     int window = 10, int indent_size = 2, std::string null_rep = "null",
+                     bool skip_new_lines = false, bool truncate_metadata = true,
+                     int container_window = 2, int element_size_limit = 100)
+      : indent(indent),
+        indent_size(indent_size),
+        window(window),
+        container_window(container_window),
+        null_rep(std::move(null_rep)),
+        skip_new_lines(skip_new_lines),
+        truncate_metadata(truncate_metadata),
+        element_size_limit(element_size_limit) {}
+
+  /// Create a PrettyPrintOptions instance with default values
+  static PrettyPrintOptions Defaults() { return PrettyPrintOptions(); }
+
+  /// Number of spaces to shift entire formatted object to the right
+  int indent = 0;
+
+  /// Size of internal indents
+  int indent_size = 2;
+
+  /// Maximum number of elements to show at the beginning and at the end.
+  int window = 10;
+
+  /// Maximum number of elements to show at the beginning and at the end, for elements
+  /// that are containers (that is, list in ListArray and chunks in ChunkedArray)
+  int container_window = 2;
+
+  /// String to use for representing a null value, defaults to "null"
+  std::string null_rep = "null";
+
+  /// Skip new lines between elements, defaults to false
+  bool skip_new_lines = false;
+
+  /// Limit display of each KeyValueMetadata key/value pair to a single line at
+  /// 80 character width
+  bool truncate_metadata = true;
+
+  /// If true, display field metadata when pretty-printing a Schema
+  bool show_field_metadata = true;
+
+  /// If true, display schema metadata when pretty-printing a Schema
+  bool show_schema_metadata = true;
+
+  /// Limit each element to specified number of characters, defaults to 100
+  int element_size_limit = 100;
+
+  /// Delimiters to use when printing an Array
+  PrettyPrintDelimiters array_delimiters = PrettyPrintDelimiters::Defaults();
+
+  /// Delimiters to use when printing a ChunkedArray
+  PrettyPrintDelimiters chunked_array_delimiters = PrettyPrintDelimiters::Defaults();
+};
+
+/// \brief Print human-readable representation of RecordBatch
+ARROW_EXPORT
+Status PrettyPrint(const RecordBatch& batch, int indent, std::ostream* sink);
+
+ARROW_EXPORT
+Status PrettyPrint(const RecordBatch& batch, const PrettyPrintOptions& options,
+                   std::ostream* sink);
+
+/// \brief Print human-readable representation of Table
+ARROW_EXPORT
+Status PrettyPrint(const Table& table, const PrettyPrintOptions& options,
+                   std::ostream* sink);
+
+/// \brief Print human-readable representation of Array
+ARROW_EXPORT
+Status PrettyPrint(const Array& arr, int indent, std::ostream* sink);
+
+/// \brief Print human-readable representation of Array
+ARROW_EXPORT
+Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options,
+                   std::ostream* sink);
+
+/// \brief Print human-readable representation of Array
+ARROW_EXPORT
+Status PrettyPrint(const Array& arr, const PrettyPrintOptions& options,
+                   std::string* result);
+
+/// \brief Print human-readable representation of ChunkedArray
+ARROW_EXPORT
+Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options,
+                   std::ostream* sink);
+
+/// \brief Print human-readable representation of ChunkedArray
+ARROW_EXPORT
+Status PrettyPrint(const ChunkedArray& chunked_arr, const PrettyPrintOptions& options,
+                   std::string* result);
+
+ARROW_EXPORT
+Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
+                   std::ostream* sink);
+
+ARROW_EXPORT
+Status PrettyPrint(const Schema& schema, const PrettyPrintOptions& options,
+                   std::string* result);
+
+ARROW_EXPORT
+Status DebugPrint(const Array& arr, int indent);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/api.h b/pyarrow/include/arrow/python/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..2af0963a9c0444bb858f10323f914e21747cebaf
--- /dev/null
+++ b/pyarrow/include/arrow/python/api.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/arrow_to_pandas.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/inference.h"
+#include "arrow/python/io.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_to_arrow.h"
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/util.h"
diff --git a/pyarrow/include/arrow/python/arrow_to_pandas.h b/pyarrow/include/arrow/python/arrow_to_pandas.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4e91e6cf5ab4469236c7eec3c8174d4b0d47427
--- /dev/null
+++ b/pyarrow/include/arrow/python/arrow_to_pandas.h
@@ -0,0 +1,165 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/memory_pool.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class Column;
+class DataType;
+class MemoryPool;
+class Status;
+class Table;
+
+namespace py {
+
+enum class MapConversionType {
+  DEFAULT,  // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas
+  LOSSY,    // report warnings when lossiness is encountered due to duplicate keys
+  STRICT_,  // raise a Python exception when lossiness is encountered due to duplicate
+            // keys
+};
+
+struct PandasOptions {
+  bool HasCategoricalColumns() const {
+    return categorical_columns && !categorical_columns->empty();
+  }
+
+  bool IsCategoricalColumn(const std::string& name) const {
+    return categorical_columns && categorical_columns->count(name);
+  }
+
+  bool HasExtensionColumns() const {
+    return extension_columns && !extension_columns->empty();
+  }
+
+  bool IsExtensionColumn(const std::string& name) const {
+    return extension_columns && extension_columns->count(name);
+  }
+
+  /// arrow::MemoryPool to use for memory allocations
+  MemoryPool* pool = default_memory_pool();
+
+  /// If true, we will convert all string columns to categoricals
+  bool strings_to_categorical = false;
+  bool zero_copy_only = false;
+  bool integer_object_nulls = false;
+  bool date_as_object = false;
+  bool timestamp_as_object = false;
+  bool use_threads = false;
+
+  /// Coerce all date and timestamp to datetime64[ns]
+  bool coerce_temporal_nanoseconds = false;
+
+  /// Used to maintain backwards compatibility for
+  /// timezone bugs (see ARROW-9528).  Should be removed
+  /// after Arrow 2.0 release.
+  bool ignore_timezone = false;
+
+  /// \brief If true, do not create duplicate PyObject versions of equal
+  /// objects. This only applies to immutable objects like strings or datetime
+  /// objects
+  bool deduplicate_objects = false;
+
+  /// \brief For certain data types, a cast is needed in order to store the
+  /// data in a pandas DataFrame or Series (e.g. timestamps are always stored
+  /// as nanoseconds in pandas). This option controls whether it is a safe
+  /// cast or not.
+  bool safe_cast = true;
+
+  /// \brief If true, create one block per column rather than consolidated
+  /// blocks (1 per data type). Do zero-copy wrapping when there are no
+  /// nulls. pandas currently will consolidate the blocks on its own, causing
+  /// increased memory use, so keep this in mind if you are working on a
+  /// memory-constrained situation.
+  bool split_blocks = false;
+
+  /// \brief If true, allow non-writable zero-copy views to be created for
+  /// single column blocks. This option is also used to provide zero copy for
+  /// Series data
+  bool allow_zero_copy_blocks = false;
+
+  /// \brief If true, attempt to deallocate buffers in passed Arrow object if
+  /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for
+  /// original context for this feature. Only currently implemented for Table
+  /// conversions
+  bool self_destruct = false;
+
+  /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
+  /// Python association lists (list-of-tuples) in the same order as the Arrow
+  /// Map, as in [(key1, value1), (key2, value2), ...]
+  /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
+  /// This can change the ordering of (key, value) pairs, and will deduplicate
+  /// multiple keys, resulting in a possible loss of data.
+  /// If 'lossy', this key deduplication results in a warning printed
+  /// when detected. If 'strict', this instead results in an exception
+  /// being raised when detected.
+  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
+  // Used internally for nested arrays.
+  bool decode_dictionaries = false;
+
+  // Columns that should be casted to categorical
+  //
+  // This is wrapped in a shared_ptr because this struct is copied internally for
+  // each column or nested field (see GH-47861).
+  std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;
+
+  // Columns that should be passed through to be converted to
+  // ExtensionArray/Block
+  std::shared_ptr<const std::unordered_set<std::string>> extension_columns;
+
+  // Used internally to decipher between to_numpy() and to_pandas() when
+  // the expected output differs
+  bool to_numpy = false;
+};
+
+ARROW_PYTHON_EXPORT
+Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
+                            PyObject* py_ref, PyObject** out);
+
+ARROW_PYTHON_EXPORT
+Status ConvertChunkedArrayToPandas(const PandasOptions& options,
+                                   std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
+                                   PyObject** out);
+
+// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+//
+// The returned Python object is a list of tuples consisting of the exact 2D
+// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
+//
+// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
+ARROW_PYTHON_EXPORT
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+                            PyObject** out);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/async.h b/pyarrow/include/arrow/python/async.h
new file mode 100644
index 0000000000000000000000000000000000000000..1568d21938e6e79e724d957120e68a7576ba9c2a
--- /dev/null
+++ b/pyarrow/include/arrow/python/async.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+
+#include "arrow/python/common.h"
+#include "arrow/status.h"
+#include "arrow/util/future.h"
+
+namespace arrow::py {
+
+/// \brief Bind a Python callback to an arrow::Future.
+///
+/// If the Future finishes successfully, py_wrapper is called with its
+/// result value and should return a PyObject*. If py_wrapper is successful,
+/// py_cb is called with its return value.
+///
+/// If either the Future or py_wrapper fails, py_cb is called with the
+/// associated Python exception.
+///
+/// \param future The future to bind to.
+/// \param py_cb The Python callback function. Will be passed the result of
+///   py_wrapper, or a Python exception if the future failed or one was
+///   raised by py_wrapper.
+/// \param py_wrapper A function (likely defined in Cython) to convert the C++
+///   result of the future to a Python object.
+template <typename T, typename PyWrapper = PyObject* (*)(T)>
+void BindFuture(Future<T> future, PyObject* py_cb, PyWrapper py_wrapper) {
+  Py_INCREF(py_cb);
+  OwnedRefNoGIL cb_ref(py_cb);
+
+  auto future_cb = [cb_ref = std::move(cb_ref),
+                    py_wrapper = std::move(py_wrapper)](Result<T> result) {
+    SafeCallIntoPythonVoid([&]() {
+      OwnedRef py_value_or_exc{WrapResult(std::move(result), std::move(py_wrapper))};
+      Py_XDECREF(
+          PyObject_CallFunctionObjArgs(cb_ref.obj(), py_value_or_exc.obj(), NULLPTR));
+      ARROW_WARN_NOT_OK(CheckPyError(), "Internal error in async call");
+    });
+  };
+  future.AddCallback(std::move(future_cb));
+}
+
+}  // namespace arrow::py
diff --git a/pyarrow/include/arrow/python/benchmark.h b/pyarrow/include/arrow/python/benchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..8060dd33722a08eb0935687ea5cb306dbd38a9f0
--- /dev/null
+++ b/pyarrow/include/arrow/python/benchmark.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+namespace benchmark {
+
+// Micro-benchmark routines for use from ASV
+
+// Run PandasObjectIsNull() once over every object in *list*
+ARROW_PYTHON_EXPORT
+void Benchmark_PandasObjectIsNull(PyObject* list);
+
+}  // namespace benchmark
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/common.h b/pyarrow/include/arrow/python/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..affefe2859b65f04f711de1f90d14f642641c5fb
--- /dev/null
+++ b/pyarrow/include/arrow/python/common.h
@@ -0,0 +1,457 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+class MemoryPool;
+template <class T>
+class Result;
+
+namespace py {
+
+// Convert current Python error to a Status.  The Python error state is cleared
+// and can be restored with RestorePyError().
+ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
+// Query whether the given Status is a Python error (as wrapped by ConvertPyError()).
+ARROW_PYTHON_EXPORT bool IsPyError(const Status& status);
+// Restore a Python error wrapped in a Status.
+ARROW_PYTHON_EXPORT void RestorePyError(const Status& status);
+
+// Catch a pending Python exception and return the corresponding Status.
+// If no exception is pending, Status::OK() is returned.
+inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  } else {
+    return ConvertPyError(code);
+  }
+}
+
+#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError())
+
+#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE))
+
+// For Cython, as you can't define template C++ functions in Cython, only use them.
+// This function can set a Python exception.  It assumes that T has a (cheap)
+// default constructor.
+template <class T>
+T GetResultValue(Result<T> result) {
+  if (ARROW_PREDICT_TRUE(result.ok())) {
+    return *std::move(result);
+  } else {
+    int r = internal::check_status(result.status());  // takes the GIL
+    assert(r == -1);                                  // should have errored out
+    ARROW_UNUSED(r);
+    return {};
+  }
+}
+
+/// \brief Wrap a Result and return the corresponding Python object.
+///
+/// If the Result is successful, py_wrapper is called with its result value
+/// and should return a PyObject*. If py_wrapper is successful (returns
+/// a non-NULL value), its return value is returned.
+///
+/// If either the Result or py_wrapper fails, the associated Python exception
+/// is raised and NULL is returned.
+//
+/// \param result The Result whose value to wrap in a Python object.
+/// \param py_wrapper A function (likely defined in Cython) to convert the C++
+///   value of the Result to a Python object.
+/// \return A new Python reference, or NULL if an exception occurred
+template <typename T, typename PyWrapper = PyObject* (*)(T)>
+PyObject* WrapResult(Result<T> result, PyWrapper&& py_wrapper) {
+  static_assert(std::is_same_v<PyObject*, decltype(py_wrapper(std::declval<T>()))>,
+                "PyWrapper argument to WrapResult should return a PyObject* "
+                "when called with a T*");
+  Status st = result.status();
+  if (st.ok()) {
+    PyObject* py_value = py_wrapper(result.MoveValueUnsafe());
+    st = CheckPyError();
+    if (st.ok()) {
+      return py_value;
+    }
+    Py_XDECREF(py_value);  // should be null, but who knows
+  }
+  // Status is an error, convert it to an exception.
+  return internal::convert_status(st);
+}
+
+// A RAII-style helper that ensures the GIL is acquired inside a lexical block.
+class ARROW_PYTHON_EXPORT PyAcquireGIL {
+ public:
+  PyAcquireGIL() : acquired_gil_(false) { acquire(); }
+
+  ~PyAcquireGIL() { release(); }
+
+  void acquire() {
+    if (!acquired_gil_) {
+      state_ = PyGILState_Ensure();
+      acquired_gil_ = true;
+    }
+  }
+
+  // idempotent
+  void release() {
+    if (acquired_gil_) {
+      PyGILState_Release(state_);
+      acquired_gil_ = false;
+    }
+  }
+
+ private:
+  bool acquired_gil_;
+  PyGILState_STATE state_;
+  ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL);
+};
+
+// A RAII-style helper that releases the GIL until the end of a lexical block
+class ARROW_PYTHON_EXPORT PyReleaseGIL {
+ public:
+  PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {}
+
+ private:
+  static void unique_ptr_deleter(PyThreadState* state) {
+    if (state) {
+      PyEval_RestoreThread(state);
+    }
+  }
+  std::unique_ptr<PyThreadState, decltype(&unique_ptr_deleter)> ptr_;
+};
+
+// A helper to call safely into the Python interpreter from arbitrary C++ code.
+// The GIL is acquired, and the current thread's error status is preserved.
+template <typename Function>
+auto SafeCallIntoPython(Function&& func) -> decltype(func()) {
+  PyAcquireGIL lock;
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  auto maybe_status = std::forward<Function>(func)();
+  // If the return Status is a "Python error", the current Python error status
+  // describes the error and shouldn't be clobbered.
+  if (!IsPyError(::arrow::ToStatus(maybe_status)) && exc_type != NULLPTR) {
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+  }
+  return maybe_status;
+}
+
+template <typename Function>
+auto SafeCallIntoPythonVoid(Function&& func) -> decltype(func()) {
+  PyAcquireGIL lock;
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  func();
+  if (exc_type != NULLPTR) {
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+  }
+}
+
+// A RAII primitive that DECREFs the underlying PyObject* when it
+// goes out of scope.
+class ARROW_PYTHON_EXPORT OwnedRef {
+ public:
+  OwnedRef() : obj_(NULLPTR) {}
+  OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
+  explicit OwnedRef(PyObject* obj) : obj_(obj) {}
+
+  OwnedRef& operator=(OwnedRef&& other) {
+    obj_ = other.detach();
+    return *this;
+  }
+
+  ~OwnedRef() {
+    // GH-38626: destructor may be called after the Python interpreter is finalized.
+    if (Py_IsInitialized()) {
+      reset();
+    }
+  }
+
+  void reset(PyObject* obj) {
+    Py_XDECREF(obj_);
+    obj_ = obj;
+  }
+
+  void reset() { reset(NULLPTR); }
+
+  PyObject* detach() {
+    PyObject* result = obj_;
+    obj_ = NULLPTR;
+    return result;
+  }
+
+  PyObject* obj() const { return obj_; }
+
+  PyObject** ref() { return &obj_; }
+
+  operator bool() const { return obj_ != NULLPTR; }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef);
+
+  PyObject* obj_;
+};
+
+// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
+// This is for situations where the GIL is not always known to be held
+// (e.g. if it is released in the middle of a function for performance reasons)
+class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
+ public:
+  OwnedRefNoGIL() : OwnedRef() {}
+  OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
+  explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {}
+
+  ~OwnedRefNoGIL() {
+    // GH-38626: destructor may be called after the Python interpreter is finalized.
+    if (Py_IsInitialized() && obj() != NULLPTR) {
+      PyAcquireGIL lock;
+      reset();
+    }
+  }
+};
+
+template <template <typename...> typename SmartPtr, typename... Ts>
+class SmartPtrNoGIL : public SmartPtr<Ts...> {
+  using Base = SmartPtr<Ts...>;
+
+ public:
+  template <typename... Args>
+  SmartPtrNoGIL(Args&&... args) : Base(std::forward<Args>(args)...) {}
+
+  ~SmartPtrNoGIL() { reset(); }
+
+  template <typename... Args>
+  void reset(Args&&... args) {
+    auto release_guard = optional_gil_release();
+    Base::reset(std::forward<Args>(args)...);
+  }
+
+  template <typename V>
+  SmartPtrNoGIL& operator=(V&& v) {
+    auto release_guard = optional_gil_release();
+    Base::operator=(std::forward<V>(v));
+    return *this;
+  }
+
+ private:
+  // Only release the GIL if we own an object *and* the Python runtime is
+  // valid *and* the GIL is held.
+  std::optional<PyReleaseGIL> optional_gil_release() const {
+    if (this->get() != nullptr && Py_IsInitialized() && PyGILState_Check()) {
+      return PyReleaseGIL();
+    }
+    return {};
+  }
+};
+
+/// \brief A std::shared_ptr<T, ...> subclass that releases the GIL when destroying T
+template <typename... Ts>
+using SharedPtrNoGIL = SmartPtrNoGIL<std::shared_ptr, Ts...>;
+
+/// \brief A std::unique_ptr<T, ...> subclass that releases the GIL when destroying T
+template <typename... Ts>
+using UniquePtrNoGIL = SmartPtrNoGIL<std::unique_ptr, Ts...>;
+
+template <typename Fn>
+struct BoundFunction;
+
+template <typename... Args>
+struct BoundFunction<void(PyObject*, Args...)> {
+  // We bind `cdef void fn(object, ...)` to get a `Status(...)`
+  // where the Status contains any Python error raised by `fn`
+  using Unbound = void(PyObject*, Args...);
+  using Bound = Status(Args...);
+
+  BoundFunction(Unbound* unbound, PyObject* bound_arg)
+      : unbound_(unbound), bound_arg_(bound_arg) {}
+
+  Status Invoke(Args... args) const {
+    PyAcquireGIL lock;
+    unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
+    RETURN_IF_PYERROR();
+    return Status::OK();
+  }
+
+  Unbound* unbound_;
+  OwnedRefNoGIL bound_arg_;
+};
+
+template <typename Return, typename... Args>
+struct BoundFunction<Return(PyObject*, Args...)> {
+  // We bind `cdef Return fn(object, ...)` to get a `Result<Return>(...)`
+  // where the Result contains any Python error raised by `fn` or the
+  // return value from `fn`.
+  using Unbound = Return(PyObject*, Args...);
+  using Bound = Result<Return>(Args...);
+
+  BoundFunction(Unbound* unbound, PyObject* bound_arg)
+      : unbound_(unbound), bound_arg_(bound_arg) {}
+
+  Result<Return> Invoke(Args... args) const {
+    PyAcquireGIL lock;
+    Return ret = unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
+    RETURN_IF_PYERROR();
+    return ret;
+  }
+
+  Unbound* unbound_;
+  OwnedRefNoGIL bound_arg_;
+};
+
+template <typename OutFn, typename Return, typename... Args>
+std::function<OutFn> BindFunction(Return (*unbound)(PyObject*, Args...),
+                                  PyObject* bound_arg) {
+  using Fn = BoundFunction<Return(PyObject*, Args...)>;
+
+  static_assert(std::is_same<typename Fn::Bound, OutFn>::value,
+                "requested bound function of unsupported type");
+
+  Py_XINCREF(bound_arg);
+  auto bound_fn = std::make_shared<Fn>(unbound, bound_arg);
+  return
+      [bound_fn](Args... args) { return bound_fn->Invoke(std::forward<Args>(args)...); };
+}
+
+// A temporary conversion of a Python object to a bytes area.
+struct PyBytesView {
+  const char* bytes;
+  Py_ssize_t size;
+  bool is_utf8;
+
+  static Result<PyBytesView> FromString(PyObject* obj, bool check_utf8 = false) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8));
+    return std::move(self);
+  }
+
+  static Result<PyBytesView> FromUnicode(PyObject* obj) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseUnicode(obj));
+    return std::move(self);
+  }
+
+  static Result<PyBytesView> FromBinary(PyObject* obj) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseBinary(obj));
+    return std::move(self);
+  }
+
+  // View the given Python object as string-like, i.e. str or (utf8) bytes
+  Status ParseString(PyObject* obj, bool check_utf8 = false) {
+    if (PyUnicode_Check(obj)) {
+      return ParseUnicode(obj);
+    } else {
+      ARROW_RETURN_NOT_OK(ParseBinary(obj));
+      if (check_utf8) {
+        // Check the bytes are utf8 utf-8
+        OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size));
+        if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+          is_utf8 = true;
+        } else {
+          PyErr_Clear();
+          is_utf8 = false;
+        }
+      }
+      return Status::OK();
+    }
+  }
+
+  // View the given Python object as unicode string
+  Status ParseUnicode(PyObject* obj) {
+    // The utf-8 representation is cached on the unicode object
+    bytes = PyUnicode_AsUTF8AndSize(obj, &size);
+    RETURN_IF_PYERROR();
+    is_utf8 = true;
+    return Status::OK();
+  }
+
+  // View the given Python object as binary-like, i.e. bytes
+  Status ParseBinary(PyObject* obj) {
+    if (PyBytes_Check(obj)) {
+      bytes = PyBytes_AS_STRING(obj);
+      size = PyBytes_GET_SIZE(obj);
+      is_utf8 = false;
+    } else if (PyByteArray_Check(obj)) {
+      bytes = PyByteArray_AS_STRING(obj);
+      size = PyByteArray_GET_SIZE(obj);
+      is_utf8 = false;
+    } else if (PyMemoryView_Check(obj)) {
+      PyObject* ref = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
+      RETURN_IF_PYERROR();
+      Py_buffer* buffer = PyMemoryView_GET_BUFFER(ref);
+      bytes = reinterpret_cast<const char*>(buffer->buf);
+      size = buffer->len;
+      is_utf8 = false;
+    } else {
+      return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name,
+                               "' object");
+    }
+    return Status::OK();
+  }
+
+ protected:
+  OwnedRef ref;
+};
+
+class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
+ public:
+  /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
+  /// one-dimensional byte buffers.
+  ~PyBuffer();
+
+  static Result<std::shared_ptr<Buffer>> FromPyObject(PyObject* obj);
+
+ private:
+  PyBuffer();
+  Status Init(PyObject*);
+
+  Py_buffer py_buf_;
+};
+
+// Return the common PyArrow memory pool
+ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
+ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
+
+// This is annoying: because C++11 does not allow implicit conversion of string
+// literals to non-const char*, we need to go through some gymnastics to use
+// PyObject_CallMethod without a lot of pain (its arguments are non-const
+// char*)
+template <typename... ArgTypes>
+static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* method_name,
+                                                const char* argspec, ArgTypes... args) {
+  return PyObject_CallMethod(obj, const_cast<char*>(method_name),
+                             const_cast<char*>(argspec), args...);
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/config.h b/pyarrow/include/arrow/python/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9af06f271661e53f1790b5f408bd62f3707aff78
--- /dev/null
+++ b/pyarrow/include/arrow/python/config.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+
+struct BuildInfo {
+  // The uppercase build type, e.g. "DEBUG" or "RELEASE"
+  std::string build_type;
+};
+
+/// \brief Get build info for PyArrow.
+///
+ARROW_PYTHON_EXPORT
+const BuildInfo& GetBuildInfo();
+
+}  // namespace py
+}  // namespace arrow
\ No newline at end of file
diff --git a/pyarrow/include/arrow/python/csv.h b/pyarrow/include/arrow/python/csv.h
new file mode 100644
index 0000000000000000000000000000000000000000..34302e93667394d616692a6a4603e6d0be67d211
--- /dev/null
+++ b/pyarrow/include/arrow/python/csv.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/csv/options.h"
+#include "arrow/python/common.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace py {
+namespace csv {
+
+using PyInvalidRowCallback = std::function<::arrow::csv::InvalidRowResult(
+    PyObject*, const ::arrow::csv::InvalidRow&)>;
+
+ARROW_PYTHON_EXPORT
+::arrow::csv::InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback,
+                                                      PyObject* handler);
+
+}  // namespace csv
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/datetime.h b/pyarrow/include/arrow/python/datetime.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b21eeb4342170f5462ee6a532003985fe5882f6
--- /dev/null
+++ b/pyarrow/include/arrow/python/datetime.h
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/logging.h"
+
+// By default, PyDateTimeAPI is a *static* variable.  This forces
+// PyDateTime_IMPORT to be called in every C/C++ module using the
+// C datetime API.  This is error-prone and potentially costly.
+// Instead, we redefine PyDateTimeAPI to point to a global variable,
+// which is initialized once by calling InitDatetime().
+#ifdef PYPY_VERSION
+#  include "datetime.h"
+#else
+#  define PyDateTimeAPI ::arrow::py::internal::datetime_api
+#endif
+
+namespace arrow {
+using internal::AddWithOverflow;
+using internal::MultiplyWithOverflow;
+namespace py {
+namespace internal {
+
+#ifndef PYPY_VERSION
+extern PyDateTime_CAPI* datetime_api;
+
+ARROW_PYTHON_EXPORT
+void InitDatetime();
+#endif
+
+// Returns the MonthDayNano namedtuple type (increments the reference count).
+ARROW_PYTHON_EXPORT
+PyObject* NewMonthDayNanoTupleType();
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_us(PyObject* pytime) {
+  return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL +
+          PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL +
+          PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL +
+          PyDateTime_TIME_GET_MICROSECOND(pytime));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; }
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_ms(PyObject* pytime) { return PyTime_to_us(pytime) / 1000; }
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_ns(PyObject* pytime) { return PyTime_to_us(pytime) * 1000; }
+
+ARROW_PYTHON_EXPORT
+Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
+
+ARROW_PYTHON_EXPORT
+Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out);
+
+// WARNING: This function returns a naive datetime.
+ARROW_PYTHON_EXPORT
+Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
+
+// This declaration must be the same as in filesystem/filesystem.h
+using TimePoint =
+    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
+
+ARROW_PYTHON_EXPORT
+int64_t PyDate_to_days(PyDateTime_Date* pydate);
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+  return PyDate_to_days(pydate) * 86400LL;
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
+  return PyDate_to_days(pydate) * 86400000LL;
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {
+  return (PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime)) +
+          PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL +
+          PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL +
+          PyDateTime_DATE_GET_SECOND(pydatetime));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
+  return (PyDateTime_to_s(pydatetime) * 1000LL +
+          PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000);
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
+  return (PyDateTime_to_s(pydatetime) * 1000000LL +
+          PyDateTime_DATE_GET_MICROSECOND(pydatetime));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
+  return PyDateTime_to_us(pydatetime) * 1000LL;
+}
+
+ARROW_PYTHON_EXPORT
+inline TimePoint PyDateTime_to_TimePoint(PyDateTime_DateTime* pydatetime) {
+  return TimePoint(TimePoint::duration(PyDateTime_to_ns(pydatetime)));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t TimePoint_to_ns(TimePoint val) { return val.time_since_epoch().count(); }
+
+ARROW_PYTHON_EXPORT
+inline TimePoint TimePoint_from_s(double val) {
+  return TimePoint(TimePoint::duration(static_cast<int64_t>(1e9 * val)));
+}
+
+ARROW_PYTHON_EXPORT
+inline TimePoint TimePoint_from_ns(int64_t val) {
+  return TimePoint(TimePoint::duration(val));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {
+  return (PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL +
+          PyDateTime_DELTA_GET_SECONDS(pytimedelta));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
+  return (PyDelta_to_s(pytimedelta) * 1000LL +
+          PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000);
+}
+
+ARROW_PYTHON_EXPORT
+inline Result<int64_t> PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
+  int64_t result = PyDelta_to_s(pytimedelta);
+  if (MultiplyWithOverflow(result, 1000000LL, &result)) {
+    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
+  }
+  if (AddWithOverflow(result, PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta), &result)) {
+    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
+  }
+  return result;
+}
+
+ARROW_PYTHON_EXPORT
+inline Result<int64_t> PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
+  ARROW_ASSIGN_OR_RAISE(int64_t result, PyDelta_to_us(pytimedelta));
+  if (MultiplyWithOverflow(result, 1000LL, &result)) {
+    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
+  }
+  return result;
+}
+
+ARROW_PYTHON_EXPORT
+Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);
+
+/// \brief Convert a time zone name into a time zone object.
+///
+/// Supported input strings are:
+/// * As used in the Olson time zone database (the "tz database" or
+///   "tzdata"), such as "America/New_York"
+/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+/// GIL must be held when calling this method.
+ARROW_PYTHON_EXPORT
+Result<PyObject*> StringToTzinfo(const std::string& tz);
+
+/// \brief Convert a time zone object to a string representation.
+///
+/// The output strings are:
+/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+///   if the input object is either an instance of pytz._FixedOffset or
+///   datetime.timedelta
+/// * The timezone's name if the input object's tzname() method returns with a
+///   non-empty timezone name such as "UTC" or "America/New_York"
+///
+/// GIL must be held when calling this method.
+ARROW_PYTHON_EXPORT
+Result<std::string> TzinfoToString(PyObject* pytzinfo);
+
+/// \brief Convert MonthDayNano to a python namedtuple.
+///
+/// Return a named tuple (pyarrow.MonthDayNano) containing attributes
+/// "months", "days", "nanoseconds" in the given order
+/// with values extracted from the fields on interval.
+///
+/// GIL must be held when calling this method.
+ARROW_PYTHON_EXPORT
+PyObject* MonthDayNanoIntervalToNamedTuple(
+    const MonthDayNanoIntervalType::MonthDayNanos& interval);
+
+/// \brief Convert the given Array to a PyList object containing
+/// pyarrow.MonthDayNano objects.
+ARROW_PYTHON_EXPORT
+Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
+    const MonthDayNanoIntervalArray& array);
+
+/// \brief Convert the Scalar object to a pyarrow.MonthDayNano (or None if
+/// is isn't valid).
+ARROW_PYTHON_EXPORT
+Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
+    const MonthDayNanoIntervalScalar& scalar);
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/decimal.h b/pyarrow/include/arrow/python/decimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..83ded0b82b922afe2afdd2e9b5f405ccf1dd2062
--- /dev/null
+++ b/pyarrow/include/arrow/python/decimal.h
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+class Decimal128;
+class Decimal256;
+
+namespace py {
+
+class OwnedRef;
+
+//
+// Python Decimal support
+//
+
+namespace internal {
+
+// \brief Import the Python Decimal type
+ARROW_PYTHON_EXPORT
+Status ImportDecimalType(OwnedRef* decimal_type);
+
+// \brief Convert a Python Decimal object to a C++ string
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[out] The string representation of the Python Decimal instance
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
+
+// \brief Convert a C++ std::string to a Python Decimal instance
+// \param[in] decimal_constructor The decimal type object
+// \param[in] decimal_string A decimal string
+// \return An instance of decimal.Decimal
+ARROW_PYTHON_EXPORT
+PyObject* DecimalFromString(PyObject* decimal_constructor,
+                            const std::string& decimal_string);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal32* out);
+
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal64* out);
+
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal128* out);
+
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out);
+
+// \brief Convert a Python decimal to an Arrow Decimal256 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal256
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal256* out);
+
+// \brief Convert a Python object to an Arrow Decimal256 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal256
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal256* out);
+
+// \brief Check whether obj is an instance of Decimal
+ARROW_PYTHON_EXPORT
+bool PyDecimal_Check(PyObject* obj);
+
+// \brief Check whether obj is nan. This function will abort the program if the argument
+// is not a Decimal instance
+ARROW_PYTHON_EXPORT
+bool PyDecimal_ISNAN(PyObject* obj);
+
+// \brief Helper class to track and update the precision and scale of a decimal
+class ARROW_PYTHON_EXPORT DecimalMetadata {
+ public:
+  DecimalMetadata();
+  DecimalMetadata(int32_t precision, int32_t scale);
+
+  // \brief Adjust the precision and scale of a decimal type given a new precision and a
+  // new scale \param[in] suggested_precision A candidate precision \param[in]
+  // suggested_scale A candidate scale \return The status of the operation
+  Status Update(int32_t suggested_precision, int32_t suggested_scale);
+
+  // \brief A convenient interface for updating the precision and scale based on a Python
+  // Decimal object \param object A Python Decimal object \return The status of the
+  // operation
+  Status Update(PyObject* object);
+
+  int32_t precision() const { return precision_; }
+  int32_t scale() const { return scale_; }
+
+ private:
+  int32_t precision_;
+  int32_t scale_;
+};
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/extension_type.h b/pyarrow/include/arrow/python/extension_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6523824eb9634c18b87e4e3e5c827d8be43f8a8
--- /dev/null
+++ b/pyarrow/include/arrow/python/extension_type.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace py {
+
+class ARROW_PYTHON_EXPORT PyExtensionType : public ExtensionType {
+ public:
+  // Implement extensionType API
+  std::string extension_name() const override { return extension_name_; }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override;
+
+  // For use from Cython
+  // Assumes that `typ` is borrowed
+  static Status FromClass(const std::shared_ptr<DataType> storage_type,
+                          const std::string extension_name, PyObject* typ,
+                          std::shared_ptr<ExtensionType>* out);
+
+  // Return new ref
+  PyObject* GetInstance() const;
+  Status SetInstance(PyObject*) const;
+
+ protected:
+  PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
+                  PyObject* inst = NULLPTR);
+  PyExtensionType(std::shared_ptr<DataType> storage_type, std::string extension_name,
+                  PyObject* typ, PyObject* inst = NULLPTR);
+
+  std::string extension_name_;
+
+  // These fields are mutable because of two-step initialization.
+  mutable OwnedRefNoGIL type_class_;
+  // A weakref or null.  Storing a strong reference to the Python extension type
+  // instance would create an unreclaimable reference cycle between Python and C++
+  // (the Python instance has to keep a strong reference to the C++ ExtensionType
+  //  in other direction).  Instead, we store a weakref to the instance.
+  // If the weakref is dead, we reconstruct the instance from its serialized form.
+  mutable OwnedRefNoGIL type_instance_;
+  // Empty if type_instance_ is null
+  mutable std::string serialized_;
+};
+
+ARROW_PYTHON_EXPORT std::string PyExtensionName();
+
+ARROW_PYTHON_EXPORT Status RegisterPyExtensionType(const std::shared_ptr<DataType>&);
+
+ARROW_PYTHON_EXPORT Status UnregisterPyExtensionType(const std::string& type_name);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/filesystem.h b/pyarrow/include/arrow/python/filesystem.h
new file mode 100644
index 0000000000000000000000000000000000000000..194b226ac5c35d4b3518c2e9fa9443c2ba1007ae
--- /dev/null
+++ b/pyarrow/include/arrow/python/filesystem.h
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/util/macros.h"
+
+namespace arrow::py::fs {
+
+class ARROW_PYTHON_EXPORT PyFileSystemVtable {
+ public:
+  std::function<void(PyObject*, std::string* out)> get_type_name;
+  std::function<bool(PyObject*, const arrow::fs::FileSystem& other)> equals;
+
+  std::function<void(PyObject*, const std::string& path, arrow::fs::FileInfo* out)>
+      get_file_info;
+  std::function<void(PyObject*, const std::vector<std::string>& paths,
+                     std::vector<arrow::fs::FileInfo>* out)>
+      get_file_info_vector;
+  std::function<void(PyObject*, const arrow::fs::FileSelector&,
+                     std::vector<arrow::fs::FileInfo>* out)>
+      get_file_info_selector;
+
+  std::function<void(PyObject*, const std::string& path, bool)> create_dir;
+  std::function<void(PyObject*, const std::string& path)> delete_dir;
+  std::function<void(PyObject*, const std::string& path, bool)> delete_dir_contents;
+  std::function<void(PyObject*)> delete_root_dir_contents;
+  std::function<void(PyObject*, const std::string& path)> delete_file;
+  std::function<void(PyObject*, const std::string& src, const std::string& dest)> move;
+  std::function<void(PyObject*, const std::string& src, const std::string& dest)>
+      copy_file;
+
+  std::function<void(PyObject*, const std::string& path,
+                     std::shared_ptr<io::InputStream>* out)>
+      open_input_stream;
+  std::function<void(PyObject*, const std::string& path,
+                     std::shared_ptr<io::RandomAccessFile>* out)>
+      open_input_file;
+  std::function<void(PyObject*, const std::string& path,
+                     const std::shared_ptr<const KeyValueMetadata>&,
+                     std::shared_ptr<io::OutputStream>* out)>
+      open_output_stream;
+  std::function<void(PyObject*, const std::string& path,
+                     const std::shared_ptr<const KeyValueMetadata>&,
+                     std::shared_ptr<io::OutputStream>* out)>
+      open_append_stream;
+
+  std::function<void(PyObject*, const std::string& path, std::string* out)>
+      normalize_path;
+};
+
+class ARROW_PYTHON_EXPORT PyFileSystem : public arrow::fs::FileSystem {
+ public:
+  PyFileSystem(PyObject* handler, PyFileSystemVtable vtable);
+  ~PyFileSystem() override;
+
+  static std::shared_ptr<PyFileSystem> Make(PyObject* handler, PyFileSystemVtable vtable);
+
+  std::string type_name() const override;
+
+  bool Equals(const FileSystem& other) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
+      const std::vector<std::string>& paths) override;
+  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
+      const arrow::fs::FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  Result<std::string> NormalizePath(std::string path) override;
+
+  PyObject* handler() const { return handler_.obj(); }
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyFileSystemVtable vtable_;
+};
+
+}  // namespace arrow::py::fs
diff --git a/pyarrow/include/arrow/python/flight.h b/pyarrow/include/arrow/python/flight.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a1f4c750aac77573fd7eb84e9d1cc24eae92700
--- /dev/null
+++ b/pyarrow/include/arrow/python/flight.h
@@ -0,0 +1,352 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/flight/api.h"
+#include "arrow/ipc/dictionary.h"
+#include "arrow/python/common.h"
+
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_PYTHON_STATIC
+#    define ARROW_PYFLIGHT_EXPORT
+#  elif defined(ARROW_PYFLIGHT_EXPORTING)
+#    define ARROW_PYFLIGHT_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_PYFLIGHT_EXPORT __declspec(dllimport)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_PYFLIGHT_EXPORT
+#    define ARROW_PYFLIGHT_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif  // Non-Windows
+
+namespace arrow {
+
+namespace py {
+
+namespace flight {
+
+ARROW_PYFLIGHT_EXPORT
+extern const char* kPyServerMiddlewareName;
+
+/// \brief A table of function pointers for calling from C++ into
+/// Python.
+class ARROW_PYFLIGHT_EXPORT PyFlightServerVtable {
+ public:
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::Criteria*,
+                       std::unique_ptr<arrow::flight::FlightListing>*)>
+      list_flights;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::FlightDescriptor&,
+                       std::unique_ptr<arrow::flight::FlightInfo>*)>
+      get_flight_info;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::FlightDescriptor&,
+                       std::unique_ptr<arrow::flight::SchemaResult>*)>
+      get_schema;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::Ticket&,
+                       std::unique_ptr<arrow::flight::FlightDataStream>*)>
+      do_get;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       std::unique_ptr<arrow::flight::FlightMessageReader>,
+                       std::unique_ptr<arrow::flight::FlightMetadataWriter>)>
+      do_put;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       std::unique_ptr<arrow::flight::FlightMessageReader>,
+                       std::unique_ptr<arrow::flight::FlightMessageWriter>)>
+      do_exchange;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::Action&,
+                       std::unique_ptr<arrow::flight::ResultStream>*)>
+      do_action;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       std::vector<arrow::flight::ActionType>*)>
+      list_actions;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyServerAuthHandlerVtable {
+ public:
+  std::function<Status(PyObject*, arrow::flight::ServerAuthSender*,
+                       arrow::flight::ServerAuthReader*)>
+      authenticate;
+  std::function<Status(PyObject*, const std::string&, std::string*)> is_valid;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyClientAuthHandlerVtable {
+ public:
+  std::function<Status(PyObject*, arrow::flight::ClientAuthSender*,
+                       arrow::flight::ClientAuthReader*)>
+      authenticate;
+  std::function<Status(PyObject*, std::string*)> get_token;
+};
+
+/// \brief A helper to implement an auth mechanism in Python.
+class ARROW_PYFLIGHT_EXPORT PyServerAuthHandler
+    : public arrow::flight::ServerAuthHandler {
+ public:
+  explicit PyServerAuthHandler(PyObject* handler,
+                               const PyServerAuthHandlerVtable& vtable);
+  Status Authenticate(const arrow::flight::ServerCallContext& context,
+                      arrow::flight::ServerAuthSender* outgoing,
+                      arrow::flight::ServerAuthReader* incoming) override;
+  Status IsValid(const std::string& token, std::string* peer_identity) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyServerAuthHandlerVtable vtable_;
+};
+
+/// \brief A helper to implement an auth mechanism in Python.
+class ARROW_PYFLIGHT_EXPORT PyClientAuthHandler
+    : public arrow::flight::ClientAuthHandler {
+ public:
+  explicit PyClientAuthHandler(PyObject* handler,
+                               const PyClientAuthHandlerVtable& vtable);
+  Status Authenticate(arrow::flight::ClientAuthSender* outgoing,
+                      arrow::flight::ClientAuthReader* incoming) override;
+  Status GetToken(std::string* token) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyClientAuthHandlerVtable vtable_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyFlightServer : public arrow::flight::FlightServerBase {
+ public:
+  explicit PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable);
+
+  // Like Serve(), but set up signals and invoke Python signal handlers
+  // if necessary.  This function may return with a Python exception set.
+  Status ServeWithSignals();
+
+  Status ListFlights(const arrow::flight::ServerCallContext& context,
+                     const arrow::flight::Criteria* criteria,
+                     std::unique_ptr<arrow::flight::FlightListing>* listings) override;
+  Status GetFlightInfo(const arrow::flight::ServerCallContext& context,
+                       const arrow::flight::FlightDescriptor& request,
+                       std::unique_ptr<arrow::flight::FlightInfo>* info) override;
+  Status GetSchema(const arrow::flight::ServerCallContext& context,
+                   const arrow::flight::FlightDescriptor& request,
+                   std::unique_ptr<arrow::flight::SchemaResult>* result) override;
+  Status DoGet(const arrow::flight::ServerCallContext& context,
+               const arrow::flight::Ticket& request,
+               std::unique_ptr<arrow::flight::FlightDataStream>* stream) override;
+  Status DoPut(const arrow::flight::ServerCallContext& context,
+               std::unique_ptr<arrow::flight::FlightMessageReader> reader,
+               std::unique_ptr<arrow::flight::FlightMetadataWriter> writer) override;
+  Status DoExchange(const arrow::flight::ServerCallContext& context,
+                    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
+                    std::unique_ptr<arrow::flight::FlightMessageWriter> writer) override;
+  Status DoAction(const arrow::flight::ServerCallContext& context,
+                  const arrow::flight::Action& action,
+                  std::unique_ptr<arrow::flight::ResultStream>* result) override;
+  Status ListActions(const arrow::flight::ServerCallContext& context,
+                     std::vector<arrow::flight::ActionType>* actions) override;
+
+ private:
+  OwnedRefNoGIL server_;
+  PyFlightServerVtable vtable_;
+};
+
+/// \brief A callback that obtains the next result from a Flight action.
+typedef std::function<Status(PyObject*, std::unique_ptr<arrow::flight::Result>*)>
+    PyFlightResultStreamCallback;
+
+/// \brief A ResultStream built around a Python callback.
+class ARROW_PYFLIGHT_EXPORT PyFlightResultStream : public arrow::flight::ResultStream {
+ public:
+  /// \brief Construct a FlightResultStream from a Python object and callback.
+  /// Must only be called while holding the GIL.
+  explicit PyFlightResultStream(PyObject* generator,
+                                PyFlightResultStreamCallback callback);
+  arrow::Result<std::unique_ptr<arrow::flight::Result>> Next() override;
+
+ private:
+  OwnedRefNoGIL generator_;
+  PyFlightResultStreamCallback callback_;
+};
+
+/// \brief A wrapper around a FlightDataStream that keeps alive a
+/// Python object backing it.
+class ARROW_PYFLIGHT_EXPORT PyFlightDataStream : public arrow::flight::FlightDataStream {
+ public:
+  /// \brief Construct a FlightDataStream from a Python object and underlying stream.
+  /// Must only be called while holding the GIL.
+  explicit PyFlightDataStream(PyObject* data_source,
+                              std::unique_ptr<arrow::flight::FlightDataStream> stream);
+
+  std::shared_ptr<Schema> schema() override;
+  arrow::Result<arrow::flight::FlightPayload> GetSchemaPayload() override;
+  arrow::Result<arrow::flight::FlightPayload> Next() override;
+
+ private:
+  OwnedRefNoGIL data_source_;
+  std::unique_ptr<arrow::flight::FlightDataStream> stream_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyServerMiddlewareFactory
+    : public arrow::flight::ServerMiddlewareFactory {
+ public:
+  /// \brief A callback to create the middleware instance in Python
+  typedef std::function<Status(
+      PyObject*, const arrow::flight::CallInfo& info,
+      const arrow::flight::CallHeaders& incoming_headers,
+      std::shared_ptr<arrow::flight::ServerMiddleware>* middleware)>
+      StartCallCallback;
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyServerMiddlewareFactory(PyObject* factory, StartCallCallback start_call);
+
+  Status StartCall(const arrow::flight::CallInfo& info,
+                   const arrow::flight::ServerCallContext& context,
+                   std::shared_ptr<arrow::flight::ServerMiddleware>* middleware) override;
+
+ private:
+  OwnedRefNoGIL factory_;
+  StartCallCallback start_call_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyServerMiddleware : public arrow::flight::ServerMiddleware {
+ public:
+  typedef std::function<Status(PyObject*,
+                               arrow::flight::AddCallHeaders* outgoing_headers)>
+      SendingHeadersCallback;
+  typedef std::function<Status(PyObject*, const Status& status)> CallCompletedCallback;
+
+  struct Vtable {
+    SendingHeadersCallback sending_headers;
+    CallCompletedCallback call_completed;
+  };
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyServerMiddleware(PyObject* middleware, Vtable vtable);
+
+  void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override;
+  void CallCompleted(const Status& status) override;
+  std::string name() const override;
+  /// \brief Get the underlying Python object.
+  PyObject* py_object() const;
+
+ private:
+  OwnedRefNoGIL middleware_;
+  Vtable vtable_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyClientMiddlewareFactory
+    : public arrow::flight::ClientMiddlewareFactory {
+ public:
+  /// \brief A callback to create the middleware instance in Python
+  typedef std::function<Status(
+      PyObject*, const arrow::flight::CallInfo& info,
+      std::unique_ptr<arrow::flight::ClientMiddleware>* middleware)>
+      StartCallCallback;
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyClientMiddlewareFactory(PyObject* factory, StartCallCallback start_call);
+
+  void StartCall(const arrow::flight::CallInfo& info,
+                 std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) override;
+
+ private:
+  OwnedRefNoGIL factory_;
+  StartCallCallback start_call_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyClientMiddleware : public arrow::flight::ClientMiddleware {
+ public:
+  typedef std::function<Status(PyObject*,
+                               arrow::flight::AddCallHeaders* outgoing_headers)>
+      SendingHeadersCallback;
+  typedef std::function<Status(PyObject*,
+                               const arrow::flight::CallHeaders& incoming_headers)>
+      ReceivedHeadersCallback;
+  typedef std::function<Status(PyObject*, const Status& status)> CallCompletedCallback;
+
+  struct Vtable {
+    SendingHeadersCallback sending_headers;
+    ReceivedHeadersCallback received_headers;
+    CallCompletedCallback call_completed;
+  };
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyClientMiddleware(PyObject* factory, Vtable vtable);
+
+  void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override;
+  void ReceivedHeaders(const arrow::flight::CallHeaders& incoming_headers) override;
+  void CallCompleted(const Status& status) override;
+
+ private:
+  OwnedRefNoGIL middleware_;
+  Vtable vtable_;
+};
+
+/// \brief A callback that obtains the next payload from a Flight result stream.
+typedef std::function<Status(PyObject*, arrow::flight::FlightPayload*)>
+    PyGeneratorFlightDataStreamCallback;
+
+/// \brief A FlightDataStream built around a Python callback.
+class ARROW_PYFLIGHT_EXPORT PyGeneratorFlightDataStream
+    : public arrow::flight::FlightDataStream {
+ public:
+  /// \brief Construct a FlightDataStream from a Python object and underlying stream.
+  /// Must only be called while holding the GIL.
+  explicit PyGeneratorFlightDataStream(PyObject* generator,
+                                       std::shared_ptr<arrow::Schema> schema,
+                                       PyGeneratorFlightDataStreamCallback callback,
+                                       const ipc::IpcWriteOptions& options);
+  std::shared_ptr<Schema> schema() override;
+  arrow::Result<arrow::flight::FlightPayload> GetSchemaPayload() override;
+  arrow::Result<arrow::flight::FlightPayload> Next() override;
+
+ private:
+  OwnedRefNoGIL generator_;
+  std::shared_ptr<arrow::Schema> schema_;
+  ipc::DictionaryFieldMapper mapper_;
+  ipc::IpcWriteOptions options_;
+  PyGeneratorFlightDataStreamCallback callback_;
+};
+
+ARROW_PYFLIGHT_EXPORT
+Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
+                        const arrow::flight::FlightDescriptor& descriptor,
+                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
+                        int64_t total_records, int64_t total_bytes, bool ordered,
+                        const std::string& app_metadata,
+                        std::unique_ptr<arrow::flight::FlightInfo>* out);
+
+/// \brief Create a SchemaResult from schema.
+ARROW_PYFLIGHT_EXPORT
+Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
+                          std::unique_ptr<arrow::flight::SchemaResult>* out);
+
+}  // namespace flight
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/gdb.h b/pyarrow/include/arrow/python/gdb.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ddcbb51f6e0b70c1b16dc9a9ce6caf79fb2369e
--- /dev/null
+++ b/pyarrow/include/arrow/python/gdb.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace gdb {
+
+ARROW_PYTHON_EXPORT
+void TestSession();
+
+}  // namespace gdb
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/helpers.h b/pyarrow/include/arrow/python/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0cf1010289ead191c735ad48c999f1a850953b4
--- /dev/null
+++ b/pyarrow/include/arrow/python/helpers.h
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+namespace py {
+
+class OwnedRef;
+
+// \brief Get an arrow DataType instance from Arrow's Type::type enum
+// \param[in] type One of the values of Arrow's Type::type enum
+// \return A shared pointer to DataType
+ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
+
+// \brief Construct a Python float object from a half-float uint16_t value.
+ARROW_PYTHON_EXPORT PyObject* PyFloat_FromHalf(uint16_t value);
+
+// \brief Convert a Python object to a half-float uint16_t value.
+ARROW_PYTHON_EXPORT Result<uint16_t> PyFloat_AsHalf(PyObject* obj);
+
+namespace internal {
+
+// \brief Check that a Python module has been already imported
+// \param[in] module_name The name of the module
+Result<bool> IsModuleImported(const std::string& module_name);
+
+// \brief Import a Python module
+// \param[in] module_name The name of the module
+// \param[out] ref The OwnedRef containing the module PyObject*
+ARROW_PYTHON_EXPORT
+Status ImportModule(const std::string& module_name, OwnedRef* ref);
+
+// \brief Import an object from a Python module
+// \param[in] module A Python module
+// \param[in] name The name of the object to import
+// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
+// module
+ARROW_PYTHON_EXPORT
+Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref);
+
+// \brief Check whether obj is an integer, independent of Python versions.
+inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }
+
+// \brief Import symbols from pandas that we need for various type-checking,
+// like pandas.NaT or pandas.NA
+void InitPandasStaticData();
+
+// \brief Use pandas missing value semantics to check if a value is null
+ARROW_PYTHON_EXPORT
+bool PandasObjectIsNull(PyObject* obj);
+
+// \brief Check that obj is a pandas.Timedelta instance
+ARROW_PYTHON_EXPORT
+bool IsPandasTimedelta(PyObject* obj);
+
+// \brief Check that obj is a pandas.Timestamp instance
+bool IsPandasTimestamp(PyObject* obj);
+
+// \brief Returned a borrowed reference to the pandas.tseries.offsets.DateOffset
+PyObject* BorrowPandasDataOffsetType();
+
+// \brief Check whether obj is a floating-point NaN
+ARROW_PYTHON_EXPORT
+bool PyFloat_IsNaN(PyObject* obj);
+
+inline bool IsPyBinary(PyObject* obj) {
+  return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
+}
+
+// \brief Convert a Python integer into a C integer
+// \param[in] obj A Python integer
+// \param[out] out A pointer to a C integer to hold the result of the conversion
+// \return The status of the operation
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
+
+// \brief Convert a Python unicode string to a std::string
+ARROW_PYTHON_EXPORT
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
+
+// \brief Convert a Python bytes object to a std::string
+ARROW_PYTHON_EXPORT
+std::string PyBytes_AsStdString(PyObject* obj);
+
+// \brief Call str() on the given object and return the result as a std::string
+ARROW_PYTHON_EXPORT
+Status PyObject_StdStringStr(PyObject* obj, std::string* out);
+
+// \brief Return the repr() of the given object (always succeeds)
+ARROW_PYTHON_EXPORT
+std::string PyObject_StdStringRepr(PyObject* obj);
+
+// \brief Cast the given size to int32_t, with error checking
+inline Status CastSize(Py_ssize_t size, int32_t* out,
+                       const char* error_msg = "Maximum size exceeded (2GB)") {
+  // size is assumed to be positive
+  if (size > std::numeric_limits<int32_t>::max()) {
+    return Status::Invalid(error_msg);
+  }
+  *out = static_cast<int32_t>(size);
+  return Status::OK();
+}
+
+inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
+  // size is assumed to be positive
+  *out = static_cast<int64_t>(size);
+  return Status::OK();
+}
+
+// \brief Print the Python object's __str__ form along with the passed error
+// message
+ARROW_PYTHON_EXPORT
+Status InvalidValue(PyObject* obj, const std::string& why);
+
+ARROW_PYTHON_EXPORT
+Status InvalidType(PyObject* obj, const std::string& why);
+
+ARROW_PYTHON_EXPORT
+Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
+ARROW_PYTHON_EXPORT
+Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
+
+// \brief Print Python object __repr__
+void DebugPrint(PyObject* obj);
+
+ARROW_PYTHON_EXPORT
+bool IsThreadingEnabled();
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/inference.h b/pyarrow/include/arrow/python/inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..983384db118a16141e49a679388b83c75d1d77d6
--- /dev/null
+++ b/pyarrow/include/arrow/python/inference.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+
+#include "common.h"
+
+namespace arrow {
+
+class Array;
+class Status;
+
+namespace py {
+
+// These functions take a sequence input, not arbitrary iterables
+
+/// \brief Infer Arrow type from a Python sequence
+/// \param[in] obj the sequence of values
+/// \param[in] mask an optional mask where True values are null. May
+/// be nullptr
+/// \param[in] pandas_null_sentinels use pandas's null value markers
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<arrow::DataType>> InferArrowType(PyObject* obj, PyObject* mask,
+                                                        bool pandas_null_sentinels);
+
+/// Checks whether the passed Python object is a boolean scalar
+ARROW_PYTHON_EXPORT
+bool IsPyBool(PyObject* obj);
+
+/// Checks whether the passed Python object is an integer scalar
+ARROW_PYTHON_EXPORT
+bool IsPyInt(PyObject* obj);
+
+/// Checks whether the passed Python object is a float scalar
+ARROW_PYTHON_EXPORT
+bool IsPyFloat(PyObject* obj);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/io.h b/pyarrow/include/arrow/python/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..10489c1351b5a73d7fd19cf17b187c716ea31dbd
--- /dev/null
+++ b/pyarrow/include/arrow/python/io.h
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/io/transform.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+
+class ARROW_NO_EXPORT PythonFile;
+
+class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {
+ public:
+  explicit PyReadableFile(PyObject* file);
+  ~PyReadableFile() override;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+  // Thread-safe version
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+
+  // Thread-safe version
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+
+  Result<int64_t> GetSize() override;
+
+  Status Seek(int64_t position) override;
+
+  Result<int64_t> Tell() const override;
+
+ private:
+  std::unique_ptr<PythonFile> file_;
+};
+
+class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {
+ public:
+  explicit PyOutputStream(PyObject* file);
+  ~PyOutputStream() override;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+  Result<int64_t> Tell() const override;
+  Status Write(const void* data, int64_t nbytes) override;
+  Status Write(const std::shared_ptr<Buffer>& buffer) override;
+
+ private:
+  std::unique_ptr<PythonFile> file_;
+  int64_t position_;
+};
+
+// TODO(wesm): seekable output files
+
+// A Buffer subclass that keeps a PyObject reference throughout its
+// lifetime, such that the Python object is kept alive as long as the
+// C++ buffer is still needed.
+// Keeping the reference in a Python wrapper would be incorrect as
+// the Python wrapper can get destroyed even though the wrapped C++
+// buffer is still alive (ARROW-2270).
+class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
+ public:
+  static Status Make(const uint8_t* data, int64_t size, PyObject* base,
+                     std::shared_ptr<Buffer>* out);
+
+ private:
+  PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base)
+      : Buffer(data, size) {
+    Py_INCREF(base);
+    base_.reset(base);
+  }
+
+  OwnedRefNoGIL base_;
+};
+
+// All this rigamarole because Cython is really poor with std::function<>
+
+using TransformCallback = std::function<void(
+    PyObject*, const std::shared_ptr<Buffer>& src, std::shared_ptr<Buffer>* out)>;
+
+struct TransformInputStreamVTable {
+  TransformCallback transform;
+};
+
+ARROW_PYTHON_EXPORT
+std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
+    std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
+    PyObject* arg);
+
+using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
+    std::shared_ptr<io::InputStream>)>;
+ARROW_PYTHON_EXPORT
+std::shared_ptr<StreamWrapFunc> MakeStreamTransformFunc(TransformInputStreamVTable vtable,
+                                                        PyObject* handler);
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/ipc.h b/pyarrow/include/arrow/python/ipc.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c16d8c967ff0bffc52e7803d4d894adb72b1215
--- /dev/null
+++ b/pyarrow/include/arrow/python/ipc.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace py {
+
+class ARROW_PYTHON_EXPORT PyRecordBatchReader : public RecordBatchReader {
+ public:
+  std::shared_ptr<Schema> schema() const override;
+
+  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;
+
+  // For use from Cython
+  // Assumes that `iterable` is borrowed
+  static Result<std::shared_ptr<RecordBatchReader>> Make(std::shared_ptr<Schema>,
+                                                         PyObject* iterable);
+
+ protected:
+  PyRecordBatchReader();
+
+  Status Init(std::shared_ptr<Schema>, PyObject* iterable);
+
+  std::shared_ptr<Schema> schema_;
+  OwnedRefNoGIL iterator_;
+};
+
+class ARROW_PYTHON_EXPORT CastingRecordBatchReader : public RecordBatchReader {
+ public:
+  std::shared_ptr<Schema> schema() const override;
+
+  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;
+
+  static Result<std::shared_ptr<RecordBatchReader>> Make(
+      std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema);
+
+  Status Close() override;
+
+ protected:
+  CastingRecordBatchReader();
+
+  Status Init(std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema);
+
+  std::shared_ptr<RecordBatchReader> parent_;
+  std::shared_ptr<Schema> schema_;
+};
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/iterators.h b/pyarrow/include/arrow/python/iterators.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd467f6ac407705e62787dd22c9413616647de17
--- /dev/null
+++ b/pyarrow/include/arrow/python/iterators.h
@@ -0,0 +1,200 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+
+#include "arrow/array/array_primitive.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/numpy_init.h"
+#include "arrow/python/numpy_internal.h"
+
+namespace arrow {
+namespace py {
+namespace internal {
+
+using arrow::internal::checked_cast;
+
+// Visit the Python sequence, calling the given callable on each element.  If
+// the callable returns a non-OK status, iteration stops and the status is
+// returned.
+//
+// The call signature for Visitor must be
+//
+// Visit(PyObject* obj, int64_t index, bool* keep_going)
+//
+// If keep_going is set to false, the iteration terminates
+template <class VisitorFunc>
+inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& func) {
+  // VisitorFunc may set to false to terminate iteration
+  bool keep_going = true;
+
+  if (has_numpy() && PyArray_Check(obj)) {
+    PyArrayObject* arr_obj = reinterpret_cast<PyArrayObject*>(obj);
+    if (PyArray_NDIM(arr_obj) != 1) {
+      return Status::Invalid("Only 1D arrays accepted");
+    }
+
+    if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) {
+      // It's an array object, we can fetch object pointers directly
+      const Ndarray1DIndexer<PyObject*> objects(arr_obj);
+      for (int64_t i = offset; keep_going && i < objects.size(); ++i) {
+        RETURN_NOT_OK(func(objects[i], i, &keep_going));
+      }
+      return Status::OK();
+    }
+    // It's a non-object array, fall back on regular sequence access.
+    // (note PyArray_GETITEM() is slightly different: it returns standard
+    //  Python types, not Numpy scalar types)
+    // This code path is inefficient: callers should implement dedicated
+    // logic for non-object arrays.
+  }
+
+  if (PySequence_Check(obj)) {
+#ifdef Py_GIL_DISABLED
+    if (PyTuple_Check(obj)) {
+#else
+    if (PyList_Check(obj) || PyTuple_Check(obj)) {
+#endif
+      // Use fast item access
+      const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
+      for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
+        PyObject* value = PySequence_Fast_GET_ITEM(obj, i);
+        RETURN_NOT_OK(func(value, static_cast<int64_t>(i), &keep_going));
+      }
+    } else {
+      // Regular sequence: avoid making a potentially large copy
+      const Py_ssize_t size = PySequence_Size(obj);
+      RETURN_IF_PYERROR();
+      for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
+        OwnedRef value_ref(PySequence_ITEM(obj, i));
+        RETURN_IF_PYERROR();
+        RETURN_NOT_OK(func(value_ref.obj(), static_cast<int64_t>(i), &keep_going));
+      }
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence");
+  }
+  return Status::OK();
+}
+
+// Visit sequence with no null mask
+template <class VisitorFunc>
+inline Status VisitSequence(PyObject* obj, int64_t offset, VisitorFunc&& func) {
+  return VisitSequenceGeneric(
+      obj, offset, [&func](PyObject* value, int64_t i /* unused */, bool* keep_going) {
+        return func(value, keep_going);
+      });
+}
+
+/// Visit sequence with null mask
+template <class VisitorFunc>
+inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
+                                  VisitorFunc&& func) {
+  if (has_numpy() && PyArray_Check(mo)) {
+    PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+    if (PyArray_NDIM(mask) != 1) {
+      return Status::Invalid("Mask must be 1D array");
+    }
+    if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
+      return Status::Invalid("Mask was a different length from sequence being converted");
+    }
+
+    const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+    if (dtype == NPY_BOOL) {
+      Ndarray1DIndexer<uint8_t> mask_values(mask);
+
+      return VisitSequenceGeneric(
+          obj, offset,
+          [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+            return func(value, mask_values[i], keep_going);
+          });
+    } else {
+      return Status::TypeError("Mask must be boolean dtype");
+    }
+  } else if (py::is_array(mo)) {
+    auto unwrap_mask_result = unwrap_array(mo);
+    ARROW_RETURN_NOT_OK(unwrap_mask_result);
+    std::shared_ptr<Array> mask_ = unwrap_mask_result.ValueOrDie();
+    if (mask_->type_id() != Type::type::BOOL) {
+      return Status::TypeError("Mask must be an array of booleans");
+    }
+
+    if (mask_->length() != PySequence_Size(obj)) {
+      return Status::Invalid("Mask was a different length from sequence being converted");
+    }
+
+    if (mask_->null_count() != 0) {
+      return Status::TypeError("Mask must be an array of booleans");
+    }
+
+    BooleanArray* boolmask = checked_cast<BooleanArray*>(mask_.get());
+    return VisitSequenceGeneric(
+        obj, offset, [&func, &boolmask](PyObject* value, int64_t i, bool* keep_going) {
+          return func(value, boolmask->Value(i), keep_going);
+        });
+  } else if (PySequence_Check(mo)) {
+    if (PySequence_Size(mo) != PySequence_Size(obj)) {
+      return Status::Invalid("Mask was a different length from sequence being converted");
+    }
+    RETURN_IF_PYERROR();
+
+    return VisitSequenceGeneric(
+        obj, offset, [&func, &mo](PyObject* value, int64_t i, bool* keep_going) {
+          OwnedRef value_ref(PySequence_ITEM(mo, i));
+          if (!PyBool_Check(value_ref.obj()))
+            return Status::TypeError("Mask must be a sequence of booleans");
+          return func(value, value_ref.obj() == Py_True, keep_going);
+        });
+  } else {
+    return Status::Invalid("Null mask must be a NumPy array, Arrow array or a Sequence");
+  }
+
+  return Status::OK();
+}
+
+// Like IterateSequence, but accepts any generic iterable (including
+// non-restartable iterators, e.g. generators).
+//
+// The call signature for VisitorFunc must be Visit(PyObject*, bool*
+// keep_going). If keep_going is set to false, the iteration terminates
+template <class VisitorFunc>
+inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) {
+  if (PySequence_Check(obj)) {
+    // Numpy arrays fall here as well
+    return VisitSequence(obj, /*offset=*/0, std::forward<VisitorFunc>(func));
+  }
+  // Fall back on the iterator protocol
+  OwnedRef iter_ref(PyObject_GetIter(obj));
+  PyObject* iter = iter_ref.obj();
+  RETURN_IF_PYERROR();
+  PyObject* value;
+
+  bool keep_going = true;
+  while (keep_going && (value = PyIter_Next(iter))) {
+    OwnedRef value_ref(value);
+    RETURN_NOT_OK(func(value_ref.obj(), &keep_going));
+  }
+  RETURN_IF_PYERROR();  // __next__() might have raised
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/lib.h b/pyarrow/include/arrow/python/lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..5be7e85c997412117304362770b85fffed80e055
--- /dev/null
+++ b/pyarrow/include/arrow/python/lib.h
@@ -0,0 +1,79 @@
+/* Generated by Cython 3.2.4 */
+
+#ifndef __PYX_HAVE__pyarrow__lib
+#define __PYX_HAVE__pyarrow__lib
+
+#include "Python.h"
+
+#ifndef __PYX_HAVE_API__pyarrow__lib
+
+#ifdef CYTHON_EXTERN_C
+    #undef __PYX_EXTERN_C
+    #define __PYX_EXTERN_C CYTHON_EXTERN_C
+#elif defined(__PYX_EXTERN_C)
+    #ifdef _MSC_VER
+    #pragma message ("Please do not define the '__PYX_EXTERN_C' macro externally. Use 'CYTHON_EXTERN_C' instead.")
+    #else
+    #warning Please do not define the '__PYX_EXTERN_C' macro externally. Use 'CYTHON_EXTERN_C' instead.
+    #endif
+#else
+    #define __PYX_EXTERN_C extern "C++"
+#endif
+
+#ifndef DL_IMPORT
+  #define DL_IMPORT(_T) _T
+#endif
+
+__PYX_EXTERN_C PyObject *pyarrow_wrap_buffer(std::shared_ptr< arrow::Buffer>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_resizable_buffer(std::shared_ptr< arrow::ResizableBuffer>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_data_type(std::shared_ptr< arrow::DataType>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_field(std::shared_ptr< arrow::Field>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_schema(std::shared_ptr< arrow::Schema>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_scalar(std::shared_ptr< arrow::Scalar>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_array(std::shared_ptr< arrow::Array>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_chunked_array(std::shared_ptr< arrow::ChunkedArray>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_sparse_coo_tensor(std::shared_ptr< arrow::SparseCOOTensor>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_sparse_csc_matrix(std::shared_ptr< arrow::SparseCSCMatrix>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_sparse_csf_tensor(std::shared_ptr< arrow::SparseCSFTensor>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_sparse_csr_matrix(std::shared_ptr< arrow::SparseCSRMatrix>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_tensor(std::shared_ptr< arrow::Tensor>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_batch(std::shared_ptr< arrow::RecordBatch>  const &);
+__PYX_EXTERN_C PyObject *pyarrow_wrap_table(std::shared_ptr< arrow::Table>  const &);
+__PYX_EXTERN_C std::shared_ptr< arrow::Buffer>  pyarrow_unwrap_buffer(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::DataType>  pyarrow_unwrap_data_type(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Field>  pyarrow_unwrap_field(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Schema>  pyarrow_unwrap_schema(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Scalar>  pyarrow_unwrap_scalar(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Array>  pyarrow_unwrap_array(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::ChunkedArray>  pyarrow_unwrap_chunked_array(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCOOTensor>  pyarrow_unwrap_sparse_coo_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSCMatrix>  pyarrow_unwrap_sparse_csc_matrix(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSFTensor>  pyarrow_unwrap_sparse_csf_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::SparseCSRMatrix>  pyarrow_unwrap_sparse_csr_matrix(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Tensor>  pyarrow_unwrap_tensor(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::RecordBatch>  pyarrow_unwrap_batch(PyObject *);
+__PYX_EXTERN_C std::shared_ptr< arrow::Table>  pyarrow_unwrap_table(PyObject *);
+
+#endif /* !__PYX_HAVE_API__pyarrow__lib */
+
+/* WARNING: the interface of the module init function changed in CPython 3.5. */
+/* It now returns a PyModuleDef instance instead of a PyModule instance. */
+
+/* WARNING: Use PyImport_AppendInittab("lib", PyInit_lib) instead of calling PyInit_lib directly from Python 3.5 */
+PyMODINIT_FUNC PyInit_lib(void);
+
+#if PY_VERSION_HEX >= 0x03050000 && (defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) || (defined(__cplusplus) && __cplusplus >= 201402L))
+#if defined(__cplusplus) && __cplusplus >= 201402L
+[[deprecated("Use PyImport_AppendInittab(\"lib\", PyInit_lib) instead of calling PyInit_lib directly.")]] inline
+#elif defined(__GNUC__) || defined(__clang__)
+__attribute__ ((__deprecated__("Use PyImport_AppendInittab(\"lib\", PyInit_lib) instead of calling PyInit_lib directly."), __unused__)) __inline__
+#elif defined(_MSC_VER)
+__declspec(deprecated("Use PyImport_AppendInittab(\"lib\", PyInit_lib) instead of calling PyInit_lib directly.")) __inline
+#endif
+static PyObject* __PYX_WARN_IF_PyInit_lib_INIT_CALLED(PyObject* res) {
+  return res;
+}
+#define PyInit_lib() __PYX_WARN_IF_PyInit_lib_INIT_CALLED(PyInit_lib())
+#endif
+
+#endif /* !__PYX_HAVE__pyarrow__lib */
diff --git a/pyarrow/include/arrow/python/lib_api.h b/pyarrow/include/arrow/python/lib_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..acb2ab728417de007ab170d039192e88f2754cf4
--- /dev/null
+++ b/pyarrow/include/arrow/python/lib_api.h
@@ -0,0 +1,220 @@
+/* Generated by Cython 3.2.4 */
+
+#ifndef __PYX_HAVE_API__pyarrow__lib
+#define __PYX_HAVE_API__pyarrow__lib
+#ifdef __MINGW64__
+#define MS_WIN64
+#endif
+#include "Python.h"
+#include "lib.h"
+
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_box_memory_pool)( arrow::MemoryPool *) = 0;
+#define box_memory_pool __pyx_api_f_7pyarrow_3lib_box_memory_pool
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer)(std::shared_ptr< arrow::Buffer>  const &) = 0;
+#define pyarrow_wrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer)(std::shared_ptr< arrow::ResizableBuffer>  const &) = 0;
+#define pyarrow_wrap_resizable_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type)(std::shared_ptr< arrow::DataType>  const &) = 0;
+#define pyarrow_wrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field)(std::shared_ptr< arrow::Field>  const &) = 0;
+#define pyarrow_wrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema)(std::shared_ptr< arrow::Schema>  const &) = 0;
+#define pyarrow_wrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar)(std::shared_ptr< arrow::Scalar>  const &) = 0;
+#define pyarrow_wrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array)(std::shared_ptr< arrow::Array>  const &) = 0;
+#define pyarrow_wrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array)(std::shared_ptr< arrow::ChunkedArray>  const &) = 0;
+#define pyarrow_wrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor)(std::shared_ptr< arrow::SparseCOOTensor>  const &) = 0;
+#define pyarrow_wrap_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix)(std::shared_ptr< arrow::SparseCSCMatrix>  const &) = 0;
+#define pyarrow_wrap_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor)(std::shared_ptr< arrow::SparseCSFTensor>  const &) = 0;
+#define pyarrow_wrap_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix)(std::shared_ptr< arrow::SparseCSRMatrix>  const &) = 0;
+#define pyarrow_wrap_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor)(std::shared_ptr< arrow::Tensor>  const &) = 0;
+#define pyarrow_wrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch)(std::shared_ptr< arrow::RecordBatch>  const &) = 0;
+#define pyarrow_wrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table)(std::shared_ptr< arrow::Table>  const &) = 0;
+#define pyarrow_wrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table
+static std::shared_ptr< arrow::Buffer>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer)(PyObject *) = 0;
+#define pyarrow_unwrap_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer
+static std::shared_ptr< arrow::DataType>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type)(PyObject *) = 0;
+#define pyarrow_unwrap_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type
+static std::shared_ptr< arrow::Field>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field)(PyObject *) = 0;
+#define pyarrow_unwrap_field __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field
+static std::shared_ptr< arrow::Schema>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema)(PyObject *) = 0;
+#define pyarrow_unwrap_schema __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema
+static std::shared_ptr< arrow::Scalar>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar)(PyObject *) = 0;
+#define pyarrow_unwrap_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar
+static std::shared_ptr< arrow::Array>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array)(PyObject *) = 0;
+#define pyarrow_unwrap_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array
+static std::shared_ptr< arrow::ChunkedArray>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array)(PyObject *) = 0;
+#define pyarrow_unwrap_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array
+static std::shared_ptr< arrow::SparseCOOTensor>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor
+static std::shared_ptr< arrow::SparseCSCMatrix>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix
+static std::shared_ptr< arrow::SparseCSFTensor>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor
+static std::shared_ptr< arrow::SparseCSRMatrix>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix)(PyObject *) = 0;
+#define pyarrow_unwrap_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix
+static std::shared_ptr< arrow::Tensor>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor)(PyObject *) = 0;
+#define pyarrow_unwrap_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor
+static std::shared_ptr< arrow::RecordBatch>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch)(PyObject *) = 0;
+#define pyarrow_unwrap_batch __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch
+static std::shared_ptr< arrow::Table>  (*__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table)(PyObject *) = 0;
+#define pyarrow_unwrap_table __pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status)(arrow::Status const &) = 0;
+#define pyarrow_internal_check_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status
+static PyObject *(*__pyx_api_f_7pyarrow_3lib_pyarrow_internal_convert_status)(arrow::Status const &) = 0;
+#define pyarrow_internal_convert_status __pyx_api_f_7pyarrow_3lib_pyarrow_internal_convert_status
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer)(PyObject *) = 0;
+#define pyarrow_is_buffer __pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type)(PyObject *) = 0;
+#define pyarrow_is_data_type __pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata)(PyObject *) = 0;
+#define pyarrow_is_metadata __pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_field)(PyObject *) = 0;
+#define pyarrow_is_field __pyx_api_f_7pyarrow_3lib_pyarrow_is_field
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema)(PyObject *) = 0;
+#define pyarrow_is_schema __pyx_api_f_7pyarrow_3lib_pyarrow_is_schema
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_array)(PyObject *) = 0;
+#define pyarrow_is_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_array
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array)(PyObject *) = 0;
+#define pyarrow_is_chunked_array __pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar)(PyObject *) = 0;
+#define pyarrow_is_scalar __pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor)(PyObject *) = 0;
+#define pyarrow_is_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor)(PyObject *) = 0;
+#define pyarrow_is_sparse_coo_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix)(PyObject *) = 0;
+#define pyarrow_is_sparse_csr_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix)(PyObject *) = 0;
+#define pyarrow_is_sparse_csc_matrix __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor)(PyObject *) = 0;
+#define pyarrow_is_sparse_csf_tensor __pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_table)(PyObject *) = 0;
+#define pyarrow_is_table __pyx_api_f_7pyarrow_3lib_pyarrow_is_table
+static int (*__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch)(PyObject *) = 0;
+#define pyarrow_is_batch __pyx_api_f_7pyarrow_3lib_pyarrow_is_batch
+static int __Pyx_ImportFunction_3_2_4(PyObject *module, const char *funcname, void (**f)(void), const char *sig);
+
+#ifndef __PYX_HAVE_RT_ImportFromPxd_3_2_4
+#define __PYX_HAVE_RT_ImportFromPxd_3_2_4
+static int __Pyx_ImportFromPxd_3_2_4(PyObject *module, const char *name, void **p, const char *sig, const char *what) {
+    PyObject *d = 0;
+    PyObject *cobj = 0;
+    d = PyObject_GetAttrString(module, "__pyx_capi__");
+    if (!d)
+        goto bad;
+#if (defined(Py_LIMITED_API) && Py_LIMITED_API >= 0x030d0000) || (!defined(Py_LIMITED_API) && PY_VERSION_HEX >= 0x030d0000)
+    PyDict_GetItemStringRef(d, name, &cobj);
+#else
+    cobj = PyDict_GetItemString(d, name);
+    Py_XINCREF(cobj);
+#endif
+    if (!cobj) {
+        PyErr_Format(PyExc_ImportError,
+            "%.200s does not export expected C %.8s %.200s",
+                PyModule_GetName(module), what, name);
+        goto bad;
+    }
+    if (!PyCapsule_IsValid(cobj, sig)) {
+        PyErr_Format(PyExc_TypeError,
+            "C %.8s %.200s.%.200s has wrong signature (expected %.500s, got %.500s)",
+             what, PyModule_GetName(module), name, sig, PyCapsule_GetName(cobj));
+        goto bad;
+    }
+    *p = PyCapsule_GetPointer(cobj, sig);
+    if (!(*p))
+        goto bad;
+    Py_DECREF(d);
+    Py_DECREF(cobj);
+    return 0;
+bad:
+    Py_XDECREF(d);
+    Py_XDECREF(cobj);
+    return -1;
+}
+#endif
+
+#ifndef __PYX_HAVE_RT_ImportFunction_3_2_4
+#define __PYX_HAVE_RT_ImportFunction_3_2_4
+static int __Pyx_ImportFunction_3_2_4(PyObject *module, const char *funcname, void (**f)(void), const char *sig) {
+    union {
+        void (*fp)(void);
+        void *p;
+    } tmp;
+    int result = __Pyx_ImportFromPxd_3_2_4(module, funcname, &tmp.p, sig, "function");
+    if (result == 0) {
+        *f = tmp.fp;
+    }
+    return result;
+}
+#endif
+
+
+static int import_pyarrow__lib(void) {
+  PyObject *module = 0;
+  module = PyImport_ImportModule("pyarrow.lib");
+  if (!module) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "box_memory_pool", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_box_memory_pool, "PyObject *( arrow::MemoryPool *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_buffer, "PyObject *(std::shared_ptr< arrow::Buffer>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_resizable_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_resizable_buffer, "PyObject *(std::shared_ptr< arrow::ResizableBuffer>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_data_type, "PyObject *(std::shared_ptr< arrow::DataType>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_field, "PyObject *(std::shared_ptr< arrow::Field>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_schema, "PyObject *(std::shared_ptr< arrow::Schema>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_scalar, "PyObject *(std::shared_ptr< arrow::Scalar>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_array, "PyObject *(std::shared_ptr< arrow::Array>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_chunked_array, "PyObject *(std::shared_ptr< arrow::ChunkedArray>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_coo_tensor, "PyObject *(std::shared_ptr< arrow::SparseCOOTensor>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csc_matrix, "PyObject *(std::shared_ptr< arrow::SparseCSCMatrix>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csf_tensor, "PyObject *(std::shared_ptr< arrow::SparseCSFTensor>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_sparse_csr_matrix, "PyObject *(std::shared_ptr< arrow::SparseCSRMatrix>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_tensor, "PyObject *(std::shared_ptr< arrow::Tensor>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_batch, "PyObject *(std::shared_ptr< arrow::RecordBatch>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_wrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_wrap_table, "PyObject *(std::shared_ptr< arrow::Table>  const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_buffer, "std::shared_ptr< arrow::Buffer>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_data_type, "std::shared_ptr< arrow::DataType>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_field, "std::shared_ptr< arrow::Field>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_schema, "std::shared_ptr< arrow::Schema>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_scalar, "std::shared_ptr< arrow::Scalar>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_array, "std::shared_ptr< arrow::Array>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_chunked_array, "std::shared_ptr< arrow::ChunkedArray>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_coo_tensor, "std::shared_ptr< arrow::SparseCOOTensor>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csc_matrix, "std::shared_ptr< arrow::SparseCSCMatrix>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csf_tensor, "std::shared_ptr< arrow::SparseCSFTensor>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_sparse_csr_matrix, "std::shared_ptr< arrow::SparseCSRMatrix>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_tensor, "std::shared_ptr< arrow::Tensor>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_batch, "std::shared_ptr< arrow::RecordBatch>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_unwrap_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_unwrap_table, "std::shared_ptr< arrow::Table>  (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_internal_check_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_check_status, "int (arrow::Status const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_internal_convert_status", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_internal_convert_status, "PyObject *(arrow::Status const &)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_buffer", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_buffer, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_data_type", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_data_type, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_metadata", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_metadata, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_field", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_field, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_schema", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_schema, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_array, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_chunked_array", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_chunked_array, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_scalar", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_scalar, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_tensor, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_sparse_coo_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_coo_tensor, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_sparse_csr_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csr_matrix, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_sparse_csc_matrix", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csc_matrix, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_sparse_csf_tensor", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_sparse_csf_tensor, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_table", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_table, "int (PyObject *)") < 0) goto bad;
+  if (__Pyx_ImportFunction_3_2_4(module, "pyarrow_is_batch", (void (**)(void))&__pyx_api_f_7pyarrow_3lib_pyarrow_is_batch, "int (PyObject *)") < 0) goto bad;
+  Py_DECREF(module); module = 0;
+  return 0;
+  bad:
+  Py_XDECREF(module);
+  return -1;
+}
+
+#endif /* !__PYX_HAVE_API__pyarrow__lib */
diff --git a/pyarrow/include/arrow/python/numpy_convert.h b/pyarrow/include/arrow/python/numpy_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d1086e13552885f09431848fabf0829e670d681
--- /dev/null
+++ b/pyarrow/include/arrow/python/numpy_convert.h
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
+
+namespace arrow {
+
+class DataType;
+class MemoryPool;
+class Status;
+class Tensor;
+
+namespace py {
+
+class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
+ public:
+  explicit NumPyBuffer(PyObject* arr);
+  virtual ~NumPyBuffer();
+
+ private:
+  PyObject* arr_;
+};
+
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype);
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);
+
+ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           const std::vector<std::string>& dim_names,
+                                           std::shared_ptr<Tensor>* out);
+
+ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
+                                           PyObject* base, PyObject** out);
+
+ARROW_PYTHON_EXPORT Status
+SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status SparseCSRMatrixToNdarray(
+    const std::shared_ptr<SparseCSRMatrix>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status SparseCSCMatrixToNdarray(
+    const std::shared_ptr<SparseCSCMatrix>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status SparseCSFTensorToNdarray(
+    const std::shared_ptr<SparseCSFTensor>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCOOTensor(
+    MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseCOOTensor>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSRMatrix(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseCSRMatrix>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSCMatrix(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseCSCMatrix>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSFTensor(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<int64_t>& axis_order,
+    const std::vector<std::string>& dim_names, std::shared_ptr<SparseCSFTensor>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCOOTensor>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCSRMatrix>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCSCMatrix>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCSFTensor>* csparse_tensor);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/numpy_init.h b/pyarrow/include/arrow/python/numpy_init.h
new file mode 100644
index 0000000000000000000000000000000000000000..36c544c1b51fd431e1f7d3b4c4f01c0e18e527df
--- /dev/null
+++ b/pyarrow/include/arrow/python/numpy_init.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow::py {
+ARROW_PYTHON_EXPORT
+int arrow_init_numpy();
+bool has_numpy();
+}  // namespace arrow::py
diff --git a/pyarrow/include/arrow/python/numpy_interop.h b/pyarrow/include/arrow/python/numpy_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..a83ae4a62b944c71af70d58c7107befd659baa8c
--- /dev/null
+++ b/pyarrow/include/arrow/python/numpy_interop.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"  // IWYU pragma: export
+
+#include <numpy/numpyconfig.h>  // IWYU pragma: export
+
+// Don't use the deprecated Numpy functions
+#ifdef NPY_1_7_API_VERSION
+#  define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#else
+#  define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED
+#  define NPY_ARRAY_ALIGNED NPY_ALIGNED
+#  define NPY_ARRAY_WRITEABLE NPY_WRITEABLE
+#  define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY
+#endif
+
+// This is required to be able to access the NumPy C API properly in C++ files
+// other than init.cc.
+#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API
+#ifndef NUMPY_IMPORT_ARRAY
+#  define NO_IMPORT_ARRAY
+#endif
+
+#include <numpy/arrayobject.h>   // IWYU pragma: export
+#include <numpy/arrayscalars.h>  // IWYU pragma: export
+#include <numpy/ufuncobject.h>   // IWYU pragma: export
+
+// A bit subtle. Numpy has 5 canonical integer types:
+// (or, rather, type pairs: signed and unsigned)
+//   NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
+// It also has 4 fixed-width integer aliases.
+// When mapping Arrow integer types to these 4 fixed-width aliases,
+// we always miss one of the canonical types (even though it may
+// have the same width as one of the aliases).
+// Which one depends on the platform...
+// On a LP64 system, NPY_INT64 maps to NPY_LONG and
+// NPY_LONGLONG needs to be handled separately.
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
+// NPY_INT needs to be handled separately.
+
+#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
+#  define NPY_INT64_IS_LONG_LONG 1
+#else
+#  define NPY_INT64_IS_LONG_LONG 0
+#endif
+
+#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
+#  define NPY_INT32_IS_INT 1
+#else
+#  define NPY_INT32_IS_INT 0
+#endif
+
+// Backported NumPy 2 API (can be removed if numpy 2 is required)
+#if NPY_ABI_VERSION < 0x02000000
+#  define PyDataType_ELSIZE(descr) ((descr)->elsize)
+#  define PyDataType_C_METADATA(descr) ((descr)->c_metadata)
+#  define PyDataType_FIELDS(descr) ((descr)->fields)
+#endif
+
+namespace arrow {
+namespace py {
+
+inline int import_numpy() {
+#ifdef NUMPY_IMPORT_ARRAY
+  import_array1(-1);
+  import_umath1(-1);
+#endif
+
+  return 0;
+}
+
+// See above about the missing Numpy integer type numbers
+inline int fix_numpy_type_num(int type_num) {
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+  if (type_num == NPY_INT) return NPY_INT32;
+  if (type_num == NPY_UINT) return NPY_UINT32;
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
+  if (type_num == NPY_LONGLONG) return NPY_INT64;
+  if (type_num == NPY_ULONGLONG) return NPY_UINT64;
+#endif
+  return type_num;
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/numpy_to_arrow.h b/pyarrow/include/arrow/python/numpy_to_arrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6cd093e5542008cf173f43de311e40c418e7c8d
--- /dev/null
+++ b/pyarrow/include/arrow/python/numpy_to_arrow.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Converting from pandas memory representation to Arrow data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/compute/api.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class MemoryPool;
+class Status;
+
+namespace py {
+
+/// Convert NumPy arrays to Arrow. If target data type is not known, pass a
+/// type with null
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_PYTHON_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out);
+
+/// Safely convert NumPy arrays to Arrow. If target data type is not known,
+/// pass a type with null.
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_PYTHON_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/parquet_encryption.h b/pyarrow/include/arrow/python/parquet_encryption.h
new file mode 100644
index 0000000000000000000000000000000000000000..b485b8b11537009479787d5bba8c50c6e2744ec5
--- /dev/null
+++ b/pyarrow/include/arrow/python/parquet_encryption.h
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/secure_string.h"
+#include "parquet/encryption/crypto_factory.h"
+#include "parquet/encryption/file_system_key_material_store.h"
+#include "parquet/encryption/key_material.h"
+#include "parquet/encryption/kms_client.h"
+#include "parquet/encryption/kms_client_factory.h"
+
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_PYTHON_STATIC
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
+#  elif defined(ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING)
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllimport)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif  // Non-Windows
+
+namespace arrow {
+namespace py {
+namespace parquet {
+namespace encryption {
+
+/// \brief A table of function pointers for calling from C++ into
+/// Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientVtable {
+ public:
+  std::function<void(PyObject*, const ::arrow::util::SecureString& key,
+                     const std::string& master_key_identifier, std::string* out)>
+      wrap_key;
+  std::function<void(PyObject*, const std::string& wrapped_key,
+                     const std::string& master_key_identifier,
+                     ::arrow::util::SecureString* out)>
+      unwrap_key;
+};
+
+/// \brief A helper for KmsClient implementation in Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClient
+    : public ::parquet::encryption::KmsClient {
+ public:
+  PyKmsClient(PyObject* handler, PyKmsClientVtable vtable);
+  ~PyKmsClient() override;
+
+  std::string WrapKey(const ::arrow::util::SecureString& key,
+                      const std::string& master_key_identifier) override;
+
+  ::arrow::util::SecureString UnwrapKey(
+      const std::string& wrapped_key, const std::string& master_key_identifier) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyKmsClientVtable vtable_;
+};
+
+/// \brief A table of function pointers for calling from C++ into
+/// Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactoryVtable {
+ public:
+  std::function<void(
+      PyObject*, const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      std::shared_ptr<::parquet::encryption::KmsClient>* out)>
+      create_kms_client;
+};
+
+/// \brief A helper for KmsClientFactory implementation in Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactory
+    : public ::parquet::encryption::KmsClientFactory {
+ public:
+  PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable);
+  ~PyKmsClientFactory() override;
+
+  std::shared_ptr<::parquet::encryption::KmsClient> CreateKmsClient(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyKmsClientFactoryVtable vtable_;
+};
+
+/// \brief A CryptoFactory that returns Results instead of throwing exceptions.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyCryptoFactory
+    : public ::parquet::encryption::CryptoFactory {
+ public:
+  arrow::Result<std::shared_ptr<::parquet::FileEncryptionProperties>>
+  SafeGetFileEncryptionProperties(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      const ::parquet::encryption::EncryptionConfiguration& encryption_config,
+      const std::string& parquet_file_path,
+      const std::shared_ptr<::arrow::fs::FileSystem>& filesystem);
+
+  /// The returned FileDecryptionProperties object will use the cache inside this
+  /// CryptoFactory object, so please keep this
+  /// CryptoFactory object alive along with the returned
+  /// FileDecryptionProperties object.
+  arrow::Result<std::shared_ptr<::parquet::FileDecryptionProperties>>
+  SafeGetFileDecryptionProperties(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      const ::parquet::encryption::DecryptionConfiguration& decryption_config,
+      const std::string& parquet_file_path,
+      const std::shared_ptr<::arrow::fs::FileSystem>& filesystem);
+
+  arrow::Status SafeRotateMasterKeys(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      const std::string& parquet_file_path,
+      const std::shared_ptr<::arrow::fs::FileSystem>& filesystem, bool double_wrapping,
+      double cache_lifetime_seconds);
+};
+
+}  // namespace encryption
+}  // namespace parquet
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/platform.h b/pyarrow/include/arrow/python/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..04fb9fb8089944ea41b3ef5bfb3addaac710f39e
--- /dev/null
+++ b/pyarrow/include/arrow/python/platform.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#pragma once
+
+// If PY_SSIZE_T_CLEAN is defined, argument parsing functions treat #-specifier
+// to mean Py_ssize_t (defining this to suppress deprecation warning)
+#define PY_SSIZE_T_CLEAN
+
+#include <Python.h>  // IWYU pragma: export
+#include <datetime.h>
+
+// Work around C2528 error
+#ifdef _MSC_VER
+#  if _MSC_VER >= 1900
+#    undef timezone
+#  endif
+#endif
diff --git a/pyarrow/include/arrow/python/pyarrow.h b/pyarrow/include/arrow/python/pyarrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..113035500c0053dbb9dde5a99216aec1aefd1140
--- /dev/null
+++ b/pyarrow/include/arrow/python/pyarrow.h
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/python/visibility.h"
+
+#include "arrow/sparse_tensor.h"
+
+// Work around ARROW-2317 (C linkage warning from Cython)
+extern "C++" {
+
+namespace arrow {
+
+class Array;
+class Buffer;
+class DataType;
+class Field;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+class Tensor;
+
+namespace py {
+
+// Returns 0 on success, -1 on error.
+ARROW_PYTHON_EXPORT int import_pyarrow();
+
+#define DECLARE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME)                         \
+  ARROW_PYTHON_EXPORT bool is_##FUNC_SUFFIX(PyObject*);                        \
+  ARROW_PYTHON_EXPORT Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX( \
+      PyObject*);                                                              \
+  ARROW_PYTHON_EXPORT PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>&);
+
+DECLARE_WRAP_FUNCTIONS(buffer, Buffer)
+
+DECLARE_WRAP_FUNCTIONS(data_type, DataType)
+DECLARE_WRAP_FUNCTIONS(field, Field)
+DECLARE_WRAP_FUNCTIONS(schema, Schema)
+
+DECLARE_WRAP_FUNCTIONS(scalar, Scalar)
+
+DECLARE_WRAP_FUNCTIONS(array, Array)
+DECLARE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
+
+DECLARE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
+DECLARE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
+DECLARE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
+DECLARE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
+DECLARE_WRAP_FUNCTIONS(tensor, Tensor)
+
+DECLARE_WRAP_FUNCTIONS(batch, RecordBatch)
+DECLARE_WRAP_FUNCTIONS(table, Table)
+
+#undef DECLARE_WRAP_FUNCTIONS
+
+namespace internal {
+
+// If status is ok, return 0.
+// If status is not ok, set Python error indicator and return -1.
+ARROW_PYTHON_EXPORT int check_status(const Status& status);
+
+// Convert status to a Python exception object.  Status must not be ok.
+ARROW_PYTHON_EXPORT PyObject* convert_status(const Status& status);
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
+
+}  // extern "C++"
diff --git a/pyarrow/include/arrow/python/pyarrow_api.h b/pyarrow/include/arrow/python/pyarrow_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..a476e55a2a111332ed8594ace0fd29e2987046cb
--- /dev/null
+++ b/pyarrow/include/arrow/python/pyarrow_api.h
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// For backward compatibility.
+#include "arrow/python/lib_api.h"
diff --git a/pyarrow/include/arrow/python/pyarrow_lib.h b/pyarrow/include/arrow/python/pyarrow_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..e509593c254468a62216e0e4a7ea073ad9a3f1d4
--- /dev/null
+++ b/pyarrow/include/arrow/python/pyarrow_lib.h
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// For backward compatibility.
+#include "arrow/python/lib.h"
diff --git a/pyarrow/include/arrow/python/python_test.h b/pyarrow/include/arrow/python/python_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2eb62fc29accb670f5d53e326381d68a6534335
--- /dev/null
+++ b/pyarrow/include/arrow/python/python_test.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+namespace testing {
+
+struct TestCase {
+  std::string name;
+  std::function<Status()> func;
+};
+
+ARROW_PYTHON_EXPORT
+std::vector<TestCase> GetCppTestCases();
+
+}  // namespace testing
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/python_to_arrow.h b/pyarrow/include/arrow/python/python_to_arrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..d167996ba8da6796ac62da0fa0186419a3211930
--- /dev/null
+++ b/pyarrow/include/arrow/python/python_to_arrow.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+
+#include "arrow/python/common.h"
+
+namespace arrow {
+
+class Array;
+class Status;
+
+namespace py {
+
+struct PyConversionOptions {
+  PyConversionOptions() = default;
+
+  PyConversionOptions(const std::shared_ptr<DataType>& type, int64_t size,
+                      MemoryPool* pool, bool from_pandas)
+      : type(type), size(size), from_pandas(from_pandas) {}
+
+  // Set to null if to be inferred
+  std::shared_ptr<DataType> type;
+
+  // Default is -1, which indicates the size should the same as the input sequence
+  int64_t size = -1;
+
+  bool from_pandas = false;
+
+  /// Used to maintain backwards compatibility for
+  /// timezone bugs (see ARROW-9528).  Should be removed
+  /// after Arrow 2.0 release.
+  bool ignore_timezone = false;
+
+  bool strict = false;
+};
+
+/// \brief Convert sequence (list, generator, NumPy array with dtype object) of
+/// Python objects.
+/// \param[in] obj the sequence to convert
+/// \param[in] mask a NumPy array of true/false values to indicate whether
+/// values in the sequence are null (true) or not null (false). This parameter
+/// may be null
+/// \param[in] options various conversion options
+/// \param[in] pool MemoryPool to use for allocations
+/// \return Result ChunkedArray
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(
+    PyObject* obj, PyObject* mask, PyConversionOptions options,
+    MemoryPool* pool = default_memory_pool());
+
+}  // namespace py
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/type_traits.h b/pyarrow/include/arrow/python/type_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..865e1af4276711b07de28185ce22bf7663a3cdbb
--- /dev/null
+++ b/pyarrow/include/arrow/python/type_traits.h
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal header
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <limits>
+
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace py {
+
+static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
+constexpr int64_t kNanosecondsInDay = 86400000000000LL;
+
+namespace internal {
+
+//
+// Type traits for Numpy -> Arrow equivalence
+//
+template <int TYPE>
+struct npy_traits {};
+
+template <>
+struct npy_traits<NPY_BOOL> {
+  typedef uint8_t value_type;
+  using TypeClass = BooleanType;
+  using BuilderClass = BooleanBuilder;
+
+  static constexpr bool supports_nulls = false;
+  static inline bool isnull(uint8_t v) { return false; }
+};
+
+#define NPY_INT_DECL(TYPE, CapType, T)               \
+  template <>                                        \
+  struct npy_traits<NPY_##TYPE> {                    \
+    typedef T value_type;                            \
+    using TypeClass = CapType##Type;                 \
+    using BuilderClass = CapType##Builder;           \
+                                                     \
+    static constexpr bool supports_nulls = false;    \
+    static inline bool isnull(T v) { return false; } \
+  };
+
+NPY_INT_DECL(INT8, Int8, int8_t);
+NPY_INT_DECL(INT16, Int16, int16_t);
+NPY_INT_DECL(INT32, Int32, int32_t);
+NPY_INT_DECL(INT64, Int64, int64_t);
+
+NPY_INT_DECL(UINT8, UInt8, uint8_t);
+NPY_INT_DECL(UINT16, UInt16, uint16_t);
+NPY_INT_DECL(UINT32, UInt32, uint32_t);
+NPY_INT_DECL(UINT64, UInt64, uint64_t);
+
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+NPY_INT_DECL(INT, Int32, int32_t);
+NPY_INT_DECL(UINT, UInt32, uint32_t);
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
+NPY_INT_DECL(LONGLONG, Int64, int64_t);
+NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
+#endif
+
+template <>
+struct npy_traits<NPY_FLOAT16> {
+  typedef uint16_t value_type;
+  using TypeClass = HalfFloatType;
+  using BuilderClass = HalfFloatBuilder;
+
+  static constexpr uint16_t na_sentinel =
+      std::numeric_limits<arrow::util::Float16>::quiet_NaN().bits();
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(uint16_t v) {
+    return arrow::util::Float16::FromBits(v).is_nan();
+  }
+};
+
+template <>
+struct npy_traits<NPY_FLOAT32> {
+  typedef float value_type;
+  using TypeClass = FloatType;
+  using BuilderClass = FloatBuilder;
+
+  // We need to use quiet_NaN here instead of the NAN macro as on Windows
+  // the NAN macro leads to "division-by-zero" compile-time error with clang.
+  static constexpr float na_sentinel = std::numeric_limits<float>::quiet_NaN();
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(float v) { return v != v; }
+};
+
+template <>
+struct npy_traits<NPY_FLOAT64> {
+  typedef double value_type;
+  using TypeClass = DoubleType;
+  using BuilderClass = DoubleBuilder;
+
+  static constexpr double na_sentinel = std::numeric_limits<double>::quiet_NaN();
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(double v) { return v != v; }
+};
+
+template <>
+struct npy_traits<NPY_DATETIME> {
+  typedef int64_t value_type;
+  using TypeClass = TimestampType;
+  using BuilderClass = TimestampBuilder;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(int64_t v) {
+    // NaT = -2**63
+    // = -0x8000000000000000
+    // = -9223372036854775808;
+    // = std::numeric_limits<int64_t>::min()
+    return v == std::numeric_limits<int64_t>::min();
+  }
+};
+
+template <>
+struct npy_traits<NPY_TIMEDELTA> {
+  typedef int64_t value_type;
+  using TypeClass = DurationType;
+  using BuilderClass = DurationBuilder;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(int64_t v) {
+    // NaT = -2**63 = std::numeric_limits<int64_t>::min()
+    return v == std::numeric_limits<int64_t>::min();
+  }
+};
+
+template <>
+struct npy_traits<NPY_OBJECT> {
+  typedef PyObject* value_type;
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(PyObject* v) { return v == Py_None; }
+};
+
+//
+// Type traits for Arrow -> Numpy equivalence
+// Note *supports_nulls* means the equivalent Numpy type support nulls
+//
+template <int TYPE>
+struct arrow_traits {};
+
+template <>
+struct arrow_traits<Type::BOOL> {
+  static constexpr int npy_type = NPY_BOOL;
+  static constexpr bool supports_nulls = false;
+  typedef typename npy_traits<NPY_BOOL>::value_type T;
+};
+
+#define INT_DECL(TYPE)                                                           \
+  template <>                                                                    \
+  struct arrow_traits<Type::TYPE> {                                              \
+    static constexpr int npy_type = NPY_##TYPE;                                  \
+    static constexpr bool supports_nulls = false;                                \
+    static constexpr double na_value = std::numeric_limits<double>::quiet_NaN(); \
+    typedef typename npy_traits<NPY_##TYPE>::value_type T;                       \
+  };
+
+INT_DECL(INT8);
+INT_DECL(INT16);
+INT_DECL(INT32);
+INT_DECL(INT64);
+INT_DECL(UINT8);
+INT_DECL(UINT16);
+INT_DECL(UINT32);
+INT_DECL(UINT64);
+
+template <>
+struct arrow_traits<Type::HALF_FLOAT> {
+  static constexpr int npy_type = NPY_FLOAT16;
+  static constexpr bool supports_nulls = true;
+  static constexpr uint16_t na_value =
+      std::numeric_limits<arrow::util::Float16>::quiet_NaN().bits();
+  typedef typename npy_traits<NPY_FLOAT16>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::FLOAT> {
+  static constexpr int npy_type = NPY_FLOAT32;
+  static constexpr bool supports_nulls = true;
+  static constexpr float na_value = std::numeric_limits<float>::quiet_NaN();
+  typedef typename npy_traits<NPY_FLOAT32>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::DOUBLE> {
+  static constexpr int npy_type = NPY_FLOAT64;
+  static constexpr bool supports_nulls = true;
+  static constexpr double na_value = std::numeric_limits<double>::quiet_NaN();
+  typedef typename npy_traits<NPY_FLOAT64>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::TIMESTAMP> {
+  static constexpr int npy_type = NPY_DATETIME;
+  static constexpr int64_t npy_shift = 1;
+
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::DURATION> {
+  static constexpr int npy_type = NPY_TIMEDELTA;
+  static constexpr int64_t npy_shift = 1;
+
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  typedef typename npy_traits<NPY_TIMEDELTA>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::DATE32> {
+  // Data stores as FR_D day unit
+  static constexpr int npy_type = NPY_DATETIME;
+  static constexpr int64_t npy_shift = 1;
+
+  static constexpr bool supports_nulls = true;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+};
+
+template <>
+struct arrow_traits<Type::DATE64> {
+  // Data stores as FR_D day unit
+  static constexpr int npy_type = NPY_DATETIME;
+
+  // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
+  static constexpr int64_t npy_shift = 86400000;
+
+  static constexpr bool supports_nulls = true;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+};
+
+template <>
+struct arrow_traits<Type::TIME32> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::TIME64> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::STRING> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+};
+
+template <>
+struct arrow_traits<Type::BINARY> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+};
+
+static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) {
+  switch (unit) {
+    case TimestampType::Unit::SECOND:
+      return NPY_FR_s;
+    case TimestampType::Unit::MILLI:
+      return NPY_FR_ms;
+      break;
+    case TimestampType::Unit::MICRO:
+      return NPY_FR_us;
+    default:
+      // NANO
+      return NPY_FR_ns;
+  }
+}
+
+static inline int NumPyTypeSize(int npy_type) {
+  npy_type = fix_numpy_type_num(npy_type);
+
+  switch (npy_type) {
+    case NPY_BOOL:
+    case NPY_INT8:
+    case NPY_UINT8:
+      return 1;
+    case NPY_INT16:
+    case NPY_UINT16:
+      return 2;
+    case NPY_INT32:
+    case NPY_UINT32:
+      return 4;
+    case NPY_INT64:
+    case NPY_UINT64:
+      return 8;
+    case NPY_FLOAT16:
+      return 2;
+    case NPY_FLOAT32:
+      return 4;
+    case NPY_FLOAT64:
+      return 8;
+    case NPY_DATETIME:
+      return 8;
+    case NPY_OBJECT:
+      return sizeof(void*);
+    default:
+      ARROW_CHECK(false) << "unhandled numpy type";
+      break;
+  }
+  return -1;
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/udf.h b/pyarrow/include/arrow/python/udf.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8c4e430e53d49a8fe7d237ffe7ba8feae5e452f
--- /dev/null
+++ b/pyarrow/include/arrow/python/udf.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+#include "arrow/python/platform.h"
+#include "arrow/record_batch.h"
+#include "arrow/util/iterator.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+namespace py {
+
+// TODO: TODO(ARROW-16041): UDF Options are not exposed to the Python
+// users. This feature will be included when extending to provide advanced
+// options for the users.
+struct ARROW_PYTHON_EXPORT UdfOptions {
+  std::string func_name;
+  compute::Arity arity;
+  compute::FunctionDoc func_doc;
+  std::vector<std::shared_ptr<DataType>> input_types;
+  std::shared_ptr<DataType> output_type;
+};
+
+/// \brief A context passed as the first argument of UDF functions.
+struct ARROW_PYTHON_EXPORT UdfContext {
+  MemoryPool* pool;
+  int64_t batch_length;
+};
+
+using UdfWrapperCallback = std::function<PyObject*(
+    PyObject* user_function, const UdfContext& context, PyObject* inputs)>;
+
+/// \brief register a Scalar user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterScalarFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+/// \brief register a Table user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterTabularFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+/// \brief register a Aggregate user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterAggregateFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+/// \brief register a Vector user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterVectorFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+Result<std::shared_ptr<RecordBatchReader>> ARROW_PYTHON_EXPORT
+CallTabularFunction(const std::string& func_name, const std::vector<Datum>& args,
+                    compute::FunctionRegistry* registry = NULLPTR);
+
+}  // namespace py
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/python/util.h b/pyarrow/include/arrow/python/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff2ffcaea9cfd7835733bce04b72447cca7ee372
--- /dev/null
+++ b/pyarrow/include/arrow/python/util.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow::py {
+
+/// \brief Create an array of evenly spaced values within a given interval.
+/// This function is similar to Python's `range` function.
+/// The resulting array will contain values starting from `start` up to but not
+/// including `stop`, with a step size of `step`. If `step` is zero, the function
+/// will return an error.
+/// The resulting array will have a data type of `int64`.
+/// \param[in] start initial value of the sequence.
+/// \param[in] stop final value of the sequence (exclusive).
+/// \param[in] step step size between consecutive values.
+/// \param[in] pool Memory pool for any memory allocations.
+/// \return Result Array
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t step,
+                                      MemoryPool* pool);
+
+}  // namespace arrow::py
diff --git a/pyarrow/include/arrow/python/vendored/pythoncapi_compat.h b/pyarrow/include/arrow/python/vendored/pythoncapi_compat.h
new file mode 100644
index 0000000000000000000000000000000000000000..4baa7b34a93500e0d0d120a60332fba1ed5091fe
--- /dev/null
+++ b/pyarrow/include/arrow/python/vendored/pythoncapi_compat.h
@@ -0,0 +1,1519 @@
+// Header file providing new C API functions to old Python versions.
+//
+// File distributed under the Zero Clause BSD (0BSD) license.
+// Copyright Contributors to the pythoncapi_compat project.
+//
+// Homepage:
+// https://github.com/python/pythoncapi_compat
+//
+// Latest version:
+// https://raw.githubusercontent.com/python/pythoncapi_compat/master/pythoncapi_compat.h
+//
+// Vendored from git revision:
+// 39e2663e6acc0b68d5dd75bdaad0af33152552ae
+// https://raw.githubusercontent.com/python/pythoncapi-compat/39e2663e6acc0b68d5dd75bdaad0af33152552ae/pythoncapi_compat.h
+//
+// SPDX-License-Identifier: 0BSD
+
+/* clang-format off */
+
+#ifndef PYTHONCAPI_COMPAT
+#define PYTHONCAPI_COMPAT
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <Python.h>
+
+// Python 3.11.0b4 added PyFrame_Back() to Python.h
+#if PY_VERSION_HEX < 0x030b00B4 && !defined(PYPY_VERSION)
+#  include "frameobject.h"        // PyFrameObject, PyFrame_GetBack()
+#endif
+
+
+#ifndef _Py_CAST
+#  define _Py_CAST(type, expr) ((type)(expr))
+#endif
+
+// Static inline functions should use _Py_NULL rather than using directly NULL
+// to prevent C++ compiler warnings. On C23 and newer and on C++11 and newer,
+// _Py_NULL is defined as nullptr.
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ > 201710L) \
+        || (defined(__cplusplus) && __cplusplus >= 201103)
+#  define _Py_NULL nullptr
+#else
+#  define _Py_NULL NULL
+#endif
+
+// Cast argument to PyObject* type.
+#ifndef _PyObject_CAST
+#  define _PyObject_CAST(op) _Py_CAST(PyObject*, op)
+#endif
+
+
+// bpo-42262 added Py_NewRef() to Python 3.10.0a3
+#if PY_VERSION_HEX < 0x030A00A3 && !defined(Py_NewRef)
+static inline PyObject* _Py_NewRef(PyObject *obj)
+{
+    Py_INCREF(obj);
+    return obj;
+}
+#define Py_NewRef(obj) _Py_NewRef(_PyObject_CAST(obj))
+#endif
+
+
+// bpo-42262 added Py_XNewRef() to Python 3.10.0a3
+#if PY_VERSION_HEX < 0x030A00A3 && !defined(Py_XNewRef)
+static inline PyObject* _Py_XNewRef(PyObject *obj)
+{
+    Py_XINCREF(obj);
+    return obj;
+}
+#define Py_XNewRef(obj) _Py_XNewRef(_PyObject_CAST(obj))
+#endif
+
+
+// bpo-39573 added Py_SET_REFCNT() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_REFCNT)
+static inline void _Py_SET_REFCNT(PyObject *ob, Py_ssize_t refcnt)
+{
+    ob->ob_refcnt = refcnt;
+}
+#define Py_SET_REFCNT(ob, refcnt) _Py_SET_REFCNT(_PyObject_CAST(ob), refcnt)
+#endif
+
+
+// Py_SETREF() and Py_XSETREF() were added to Python 3.5.2.
+// It is excluded from the limited C API.
+#if (PY_VERSION_HEX < 0x03050200 && !defined(Py_SETREF)) && !defined(Py_LIMITED_API)
+#define Py_SETREF(dst, src)                                     \
+    do {                                                        \
+        PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \
+        PyObject *_tmp_dst = (*_tmp_dst_ptr);                   \
+        *_tmp_dst_ptr = _PyObject_CAST(src);                    \
+        Py_DECREF(_tmp_dst);                                    \
+    } while (0)
+
+#define Py_XSETREF(dst, src)                                    \
+    do {                                                        \
+        PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \
+        PyObject *_tmp_dst = (*_tmp_dst_ptr);                   \
+        *_tmp_dst_ptr = _PyObject_CAST(src);                    \
+        Py_XDECREF(_tmp_dst);                                   \
+    } while (0)
+#endif
+
+
+// bpo-43753 added Py_Is(), Py_IsNone(), Py_IsTrue() and Py_IsFalse()
+// to Python 3.10.0b1.
+#if PY_VERSION_HEX < 0x030A00B1 && !defined(Py_Is)
+#  define Py_Is(x, y) ((x) == (y))
+#endif
+#if PY_VERSION_HEX < 0x030A00B1 && !defined(Py_IsNone)
+#  define Py_IsNone(x) Py_Is(x, Py_None)
+#endif
+#if (PY_VERSION_HEX < 0x030A00B1 || defined(PYPY_VERSION)) && !defined(Py_IsTrue)
+#  define Py_IsTrue(x) Py_Is(x, Py_True)
+#endif
+#if (PY_VERSION_HEX < 0x030A00B1 || defined(PYPY_VERSION)) && !defined(Py_IsFalse)
+#  define Py_IsFalse(x) Py_Is(x, Py_False)
+#endif
+
+
+// bpo-39573 added Py_SET_TYPE() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_TYPE)
+static inline void _Py_SET_TYPE(PyObject *ob, PyTypeObject *type)
+{
+    ob->ob_type = type;
+}
+#define Py_SET_TYPE(ob, type) _Py_SET_TYPE(_PyObject_CAST(ob), type)
+#endif
+
+
+// bpo-39573 added Py_SET_SIZE() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_SIZE)
+static inline void _Py_SET_SIZE(PyVarObject *ob, Py_ssize_t size)
+{
+    ob->ob_size = size;
+}
+#define Py_SET_SIZE(ob, size) _Py_SET_SIZE((PyVarObject*)(ob), size)
+#endif
+
+
+// bpo-40421 added PyFrame_GetCode() to Python 3.9.0b1
+#if PY_VERSION_HEX < 0x030900B1 || defined(PYPY_VERSION)
+static inline PyCodeObject* PyFrame_GetCode(PyFrameObject *frame)
+{
+    assert(frame != _Py_NULL);
+    assert(frame->f_code != _Py_NULL);
+    return _Py_CAST(PyCodeObject*, Py_NewRef(frame->f_code));
+}
+#endif
+
+static inline PyCodeObject* _PyFrame_GetCodeBorrow(PyFrameObject *frame)
+{
+    PyCodeObject *code = PyFrame_GetCode(frame);
+    Py_DECREF(code);
+    return code;
+}
+
+
+// bpo-40421 added PyFrame_GetBack() to Python 3.9.0b1
+#if PY_VERSION_HEX < 0x030900B1 && !defined(PYPY_VERSION)
+static inline PyFrameObject* PyFrame_GetBack(PyFrameObject *frame)
+{
+    assert(frame != _Py_NULL);
+    return _Py_CAST(PyFrameObject*, Py_XNewRef(frame->f_back));
+}
+#endif
+
+#if !defined(PYPY_VERSION)
+static inline PyFrameObject* _PyFrame_GetBackBorrow(PyFrameObject *frame)
+{
+    PyFrameObject *back = PyFrame_GetBack(frame);
+    Py_XDECREF(back);
+    return back;
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetLocals() to Python 3.11.0a7
+#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetLocals(PyFrameObject *frame)
+{
+#if PY_VERSION_HEX >= 0x030400B1
+    if (PyFrame_FastToLocalsWithError(frame) < 0) {
+        return NULL;
+    }
+#else
+    PyFrame_FastToLocals(frame);
+#endif
+    return Py_NewRef(frame->f_locals);
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetGlobals() to Python 3.11.0a7
+#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetGlobals(PyFrameObject *frame)
+{
+    return Py_NewRef(frame->f_globals);
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetBuiltins() to Python 3.11.0a7
+#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetBuiltins(PyFrameObject *frame)
+{
+    return Py_NewRef(frame->f_builtins);
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetLasti() to Python 3.11.0b1
+#if PY_VERSION_HEX < 0x030B00B1 && !defined(PYPY_VERSION)
+static inline int PyFrame_GetLasti(PyFrameObject *frame)
+{
+#if PY_VERSION_HEX >= 0x030A00A7
+    // bpo-27129: Since Python 3.10.0a7, f_lasti is an instruction offset,
+    // not a bytes offset anymore. Python uses 16-bit "wordcode" (2 bytes)
+    // instructions.
+    if (frame->f_lasti < 0) {
+        return -1;
+    }
+    return frame->f_lasti * 2;
+#else
+    return frame->f_lasti;
+#endif
+}
+#endif
+
+
+// gh-91248 added PyFrame_GetVar() to Python 3.12.0a2
+#if PY_VERSION_HEX < 0x030C00A2 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetVar(PyFrameObject *frame, PyObject *name)
+{
+    PyObject *locals, *value;
+
+    locals = PyFrame_GetLocals(frame);
+    if (locals == NULL) {
+        return NULL;
+    }
+#if PY_VERSION_HEX >= 0x03000000
+    value = PyDict_GetItemWithError(locals, name);
+#else
+    value = _PyDict_GetItemWithError(locals, name);
+#endif
+    Py_DECREF(locals);
+
+    if (value == NULL) {
+        if (PyErr_Occurred()) {
+            return NULL;
+        }
+#if PY_VERSION_HEX >= 0x03000000
+        PyErr_Format(PyExc_NameError, "variable %R does not exist", name);
+#else
+        PyErr_SetString(PyExc_NameError, "variable does not exist");
+#endif
+        return NULL;
+    }
+    return Py_NewRef(value);
+}
+#endif
+
+
+// gh-91248 added PyFrame_GetVarString() to Python 3.12.0a2
+#if PY_VERSION_HEX < 0x030C00A2 && !defined(PYPY_VERSION)
+static inline PyObject*
+PyFrame_GetVarString(PyFrameObject *frame, const char *name)
+{
+    PyObject *name_obj, *value;
+#if PY_VERSION_HEX >= 0x03000000
+    name_obj = PyUnicode_FromString(name);
+#else
+    name_obj = PyString_FromString(name);
+#endif
+    if (name_obj == NULL) {
+        return NULL;
+    }
+    value = PyFrame_GetVar(frame, name_obj);
+    Py_DECREF(name_obj);
+    return value;
+}
+#endif
+
+
+// bpo-39947 added PyThreadState_GetInterpreter() to Python 3.9.0a5
+#if PY_VERSION_HEX < 0x030900A5 || defined(PYPY_VERSION)
+static inline PyInterpreterState *
+PyThreadState_GetInterpreter(PyThreadState *tstate)
+{
+    assert(tstate != _Py_NULL);
+    return tstate->interp;
+}
+#endif
+
+
+// bpo-40429 added PyThreadState_GetFrame() to Python 3.9.0b1
+#if PY_VERSION_HEX < 0x030900B1 && !defined(PYPY_VERSION)
+static inline PyFrameObject* PyThreadState_GetFrame(PyThreadState *tstate)
+{
+    assert(tstate != _Py_NULL);
+    return _Py_CAST(PyFrameObject *, Py_XNewRef(tstate->frame));
+}
+#endif
+
+#if !defined(PYPY_VERSION)
+static inline PyFrameObject*
+_PyThreadState_GetFrameBorrow(PyThreadState *tstate)
+{
+    PyFrameObject *frame = PyThreadState_GetFrame(tstate);
+    Py_XDECREF(frame);
+    return frame;
+}
+#endif
+
+
+// bpo-39947 added PyInterpreterState_Get() to Python 3.9.0a5
+#if PY_VERSION_HEX < 0x030900A5 || defined(PYPY_VERSION)
+static inline PyInterpreterState* PyInterpreterState_Get(void)
+{
+    PyThreadState *tstate;
+    PyInterpreterState *interp;
+
+    tstate = PyThreadState_GET();
+    if (tstate == _Py_NULL) {
+        Py_FatalError("GIL released (tstate is NULL)");
+    }
+    interp = tstate->interp;
+    if (interp == _Py_NULL) {
+        Py_FatalError("no current interpreter");
+    }
+    return interp;
+}
+#endif
+
+
+// bpo-39947 added PyInterpreterState_Get() to Python 3.9.0a6
+#if 0x030700A1 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x030900A6 && !defined(PYPY_VERSION)
+static inline uint64_t PyThreadState_GetID(PyThreadState *tstate)
+{
+    assert(tstate != _Py_NULL);
+    return tstate->id;
+}
+#endif
+
+// bpo-43760 added PyThreadState_EnterTracing() to Python 3.11.0a2
+#if PY_VERSION_HEX < 0x030B00A2 && !defined(PYPY_VERSION)
+static inline void PyThreadState_EnterTracing(PyThreadState *tstate)
+{
+    tstate->tracing++;
+#if PY_VERSION_HEX >= 0x030A00A1
+    tstate->cframe->use_tracing = 0;
+#else
+    tstate->use_tracing = 0;
+#endif
+}
+#endif
+
+// bpo-43760 added PyThreadState_LeaveTracing() to Python 3.11.0a2
+#if PY_VERSION_HEX < 0x030B00A2 && !defined(PYPY_VERSION)
+static inline void PyThreadState_LeaveTracing(PyThreadState *tstate)
+{
+    int use_tracing = (tstate->c_tracefunc != _Py_NULL
+                       || tstate->c_profilefunc != _Py_NULL);
+    tstate->tracing--;
+#if PY_VERSION_HEX >= 0x030A00A1
+    tstate->cframe->use_tracing = use_tracing;
+#else
+    tstate->use_tracing = use_tracing;
+#endif
+}
+#endif
+
+
+// bpo-37194 added PyObject_CallNoArgs() to Python 3.9.0a1
+// PyObject_CallNoArgs() added to PyPy 3.9.16-v7.3.11
+#if !defined(PyObject_CallNoArgs) && PY_VERSION_HEX < 0x030900A1
+static inline PyObject* PyObject_CallNoArgs(PyObject *func)
+{
+    return PyObject_CallFunctionObjArgs(func, NULL);
+}
+#endif
+
+
+// bpo-39245 made PyObject_CallOneArg() public (previously called
+// _PyObject_CallOneArg) in Python 3.9.0a4
+// PyObject_CallOneArg() added to PyPy 3.9.16-v7.3.11
+#if !defined(PyObject_CallOneArg) && PY_VERSION_HEX < 0x030900A4
+static inline PyObject* PyObject_CallOneArg(PyObject *func, PyObject *arg)
+{
+    return PyObject_CallFunctionObjArgs(func, arg, NULL);
+}
+#endif
+
+
+// bpo-1635741 added PyModule_AddObjectRef() to Python 3.10.0a3
+#if PY_VERSION_HEX < 0x030A00A3
+static inline int
+PyModule_AddObjectRef(PyObject *module, const char *name, PyObject *value)
+{
+    int res;
+
+    if (!value && !PyErr_Occurred()) {
+        // PyModule_AddObject() raises TypeError in this case
+        PyErr_SetString(PyExc_SystemError,
+                        "PyModule_AddObjectRef() must be called "
+                        "with an exception raised if value is NULL");
+        return -1;
+    }
+
+    Py_XINCREF(value);
+    res = PyModule_AddObject(module, name, value);
+    if (res < 0) {
+        Py_XDECREF(value);
+    }
+    return res;
+}
+#endif
+
+
+// bpo-40024 added PyModule_AddType() to Python 3.9.0a5
+#if PY_VERSION_HEX < 0x030900A5
+static inline int PyModule_AddType(PyObject *module, PyTypeObject *type)
+{
+    const char *name, *dot;
+
+    if (PyType_Ready(type) < 0) {
+        return -1;
+    }
+
+    // inline _PyType_Name()
+    name = type->tp_name;
+    assert(name != _Py_NULL);
+    dot = strrchr(name, '.');
+    if (dot != _Py_NULL) {
+        name = dot + 1;
+    }
+
+    return PyModule_AddObjectRef(module, name, _PyObject_CAST(type));
+}
+#endif
+
+
+// bpo-40241 added PyObject_GC_IsTracked() to Python 3.9.0a6.
+// bpo-4688 added _PyObject_GC_IS_TRACKED() to Python 2.7.0a2.
+#if PY_VERSION_HEX < 0x030900A6 && !defined(PYPY_VERSION)
+static inline int PyObject_GC_IsTracked(PyObject* obj)
+{
+    return (PyObject_IS_GC(obj) && _PyObject_GC_IS_TRACKED(obj));
+}
+#endif
+
+// bpo-40241 added PyObject_GC_IsFinalized() to Python 3.9.0a6.
+// bpo-18112 added _PyGCHead_FINALIZED() to Python 3.4.0 final.
+#if PY_VERSION_HEX < 0x030900A6 && PY_VERSION_HEX >= 0x030400F0 && !defined(PYPY_VERSION)
+static inline int PyObject_GC_IsFinalized(PyObject *obj)
+{
+    PyGC_Head *gc = _Py_CAST(PyGC_Head*, obj) - 1;
+    return (PyObject_IS_GC(obj) && _PyGCHead_FINALIZED(gc));
+}
+#endif
+
+
+// bpo-39573 added Py_IS_TYPE() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_IS_TYPE)
+static inline int _Py_IS_TYPE(PyObject *ob, PyTypeObject *type) {
+    return Py_TYPE(ob) == type;
+}
+#define Py_IS_TYPE(ob, type) _Py_IS_TYPE(_PyObject_CAST(ob), type)
+#endif
+
+
+// bpo-46906 added PyFloat_Pack2() and PyFloat_Unpack2() to Python 3.11a7.
+// bpo-11734 added _PyFloat_Pack2() and _PyFloat_Unpack2() to Python 3.6.0b1.
+// Python 3.11a2 moved _PyFloat_Pack2() and _PyFloat_Unpack2() to the internal
+// C API: Python 3.11a2-3.11a6 versions are not supported.
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+static inline int PyFloat_Pack2(double x, char *p, int le)
+{ return _PyFloat_Pack2(x, (unsigned char*)p, le); }
+
+static inline double PyFloat_Unpack2(const char *p, int le)
+{ return _PyFloat_Unpack2((const unsigned char *)p, le); }
+#endif
+
+
+// bpo-46906 added PyFloat_Pack4(), PyFloat_Pack8(), PyFloat_Unpack4() and
+// PyFloat_Unpack8() to Python 3.11a7.
+// Python 3.11a2 moved _PyFloat_Pack4(), _PyFloat_Pack8(), _PyFloat_Unpack4()
+// and _PyFloat_Unpack8() to the internal C API: Python 3.11a2-3.11a6 versions
+// are not supported.
+#if PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+static inline int PyFloat_Pack4(double x, char *p, int le)
+{ return _PyFloat_Pack4(x, (unsigned char*)p, le); }
+
+static inline int PyFloat_Pack8(double x, char *p, int le)
+{ return _PyFloat_Pack8(x, (unsigned char*)p, le); }
+
+static inline double PyFloat_Unpack4(const char *p, int le)
+{ return _PyFloat_Unpack4((const unsigned char *)p, le); }
+
+static inline double PyFloat_Unpack8(const char *p, int le)
+{ return _PyFloat_Unpack8((const unsigned char *)p, le); }
+#endif
+
+
+// gh-92154 added PyCode_GetCode() to Python 3.11.0b1
+#if PY_VERSION_HEX < 0x030B00B1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetCode(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_code);
+}
+#endif
+
+
+// gh-95008 added PyCode_GetVarnames() to Python 3.11.0rc1
+#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetVarnames(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_varnames);
+}
+#endif
+
+// gh-95008 added PyCode_GetFreevars() to Python 3.11.0rc1
+#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetFreevars(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_freevars);
+}
+#endif
+
+// gh-95008 added PyCode_GetCellvars() to Python 3.11.0rc1
+#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetCellvars(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_cellvars);
+}
+#endif
+
+
+// Py_UNUSED() was added to Python 3.4.0b2.
+#if PY_VERSION_HEX < 0x030400B2 && !defined(Py_UNUSED)
+#  if defined(__GNUC__) || defined(__clang__)
+#    define Py_UNUSED(name) _unused_ ## name __attribute__((unused))
+#  else
+#    define Py_UNUSED(name) _unused_ ## name
+#  endif
+#endif
+
+
+// gh-105922 added PyImport_AddModuleRef() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A0
+static inline PyObject* PyImport_AddModuleRef(const char *name)
+{
+    return Py_XNewRef(PyImport_AddModule(name));
+}
+#endif
+
+
+// gh-105927 added PyWeakref_GetRef() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D0000
+static inline int PyWeakref_GetRef(PyObject *ref, PyObject **pobj)
+{
+    PyObject *obj;
+    if (ref != NULL && !PyWeakref_Check(ref)) {
+        *pobj = NULL;
+        PyErr_SetString(PyExc_TypeError, "expected a weakref");
+        return -1;
+    }
+    obj = PyWeakref_GetObject(ref);
+    if (obj == NULL) {
+        // SystemError if ref is NULL
+        *pobj = NULL;
+        return -1;
+    }
+    if (obj == Py_None) {
+        *pobj = NULL;
+        return 0;
+    }
+    *pobj = Py_NewRef(obj);
+    return (*pobj != NULL);
+}
+#endif
+
+
+// bpo-36974 added PY_VECTORCALL_ARGUMENTS_OFFSET to Python 3.8b1
+#ifndef PY_VECTORCALL_ARGUMENTS_OFFSET
+#  define PY_VECTORCALL_ARGUMENTS_OFFSET (_Py_CAST(size_t, 1) << (8 * sizeof(size_t) - 1))
+#endif
+
+// bpo-36974 added PyVectorcall_NARGS() to Python 3.8b1
+#if PY_VERSION_HEX < 0x030800B1
+static inline Py_ssize_t PyVectorcall_NARGS(size_t n)
+{
+    return n & ~PY_VECTORCALL_ARGUMENTS_OFFSET;
+}
+#endif
+
+
+// gh-105922 added PyObject_Vectorcall() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4
+static inline PyObject*
+PyObject_Vectorcall(PyObject *callable, PyObject *const *args,
+                     size_t nargsf, PyObject *kwnames)
+{
+#if PY_VERSION_HEX >= 0x030800B1 && !defined(PYPY_VERSION)
+    // bpo-36974 added _PyObject_Vectorcall() to Python 3.8.0b1
+    return _PyObject_Vectorcall(callable, args, nargsf, kwnames);
+#else
+    PyObject *posargs = NULL, *kwargs = NULL;
+    PyObject *res;
+    Py_ssize_t nposargs, nkwargs, i;
+
+    if (nargsf != 0 && args == NULL) {
+        PyErr_BadInternalCall();
+        goto error;
+    }
+    if (kwnames != NULL && !PyTuple_Check(kwnames)) {
+        PyErr_BadInternalCall();
+        goto error;
+    }
+
+    nposargs = (Py_ssize_t)PyVectorcall_NARGS(nargsf);
+    if (kwnames) {
+        nkwargs = PyTuple_GET_SIZE(kwnames);
+    }
+    else {
+        nkwargs = 0;
+    }
+
+    posargs = PyTuple_New(nposargs);
+    if (posargs == NULL) {
+        goto error;
+    }
+    if (nposargs) {
+        for (i=0; i < nposargs; i++) {
+            PyTuple_SET_ITEM(posargs, i, Py_NewRef(*args));
+            args++;
+        }
+    }
+
+    if (nkwargs) {
+        kwargs = PyDict_New();
+        if (kwargs == NULL) {
+            goto error;
+        }
+
+        for (i = 0; i < nkwargs; i++) {
+            PyObject *key = PyTuple_GET_ITEM(kwnames, i);
+            PyObject *value = *args;
+            args++;
+            if (PyDict_SetItem(kwargs, key, value) < 0) {
+                goto error;
+            }
+        }
+    }
+    else {
+        kwargs = NULL;
+    }
+
+    res = PyObject_Call(callable, posargs, kwargs);
+    Py_DECREF(posargs);
+    Py_XDECREF(kwargs);
+    return res;
+
+error:
+    Py_DECREF(posargs);
+    Py_XDECREF(kwargs);
+    return NULL;
+#endif
+}
+#endif
+
+
+// gh-106521 added PyObject_GetOptionalAttr() and
+// PyObject_GetOptionalAttrString() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyObject_GetOptionalAttr(PyObject *obj, PyObject *attr_name, PyObject **result)
+{
+    // bpo-32571 added _PyObject_LookupAttr() to Python 3.7.0b1
+#if PY_VERSION_HEX >= 0x030700B1 && !defined(PYPY_VERSION)
+    return _PyObject_LookupAttr(obj, attr_name, result);
+#else
+    *result = PyObject_GetAttr(obj, attr_name);
+    if (*result != NULL) {
+        return 1;
+    }
+    if (!PyErr_Occurred()) {
+        return 0;
+    }
+    if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        PyErr_Clear();
+        return 0;
+    }
+    return -1;
+#endif
+}
+
+static inline int
+PyObject_GetOptionalAttrString(PyObject *obj, const char *attr_name, PyObject **result)
+{
+    PyObject *name_obj;
+    int rc;
+#if PY_VERSION_HEX >= 0x03000000
+    name_obj = PyUnicode_FromString(attr_name);
+#else
+    name_obj = PyString_FromString(attr_name);
+#endif
+    if (name_obj == NULL) {
+        *result = NULL;
+        return -1;
+    }
+    rc = PyObject_GetOptionalAttr(obj, name_obj, result);
+    Py_DECREF(name_obj);
+    return rc;
+}
+#endif
+
+
+// gh-106307 added PyObject_GetOptionalAttr() and
+// PyMapping_GetOptionalItemString() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyMapping_GetOptionalItem(PyObject *obj, PyObject *key, PyObject **result)
+{
+    *result = PyObject_GetItem(obj, key);
+    if (*result) {
+        return 1;
+    }
+    if (!PyErr_ExceptionMatches(PyExc_KeyError)) {
+        return -1;
+    }
+    PyErr_Clear();
+    return 0;
+}
+
+static inline int
+PyMapping_GetOptionalItemString(PyObject *obj, const char *key, PyObject **result)
+{
+    PyObject *key_obj;
+    int rc;
+#if PY_VERSION_HEX >= 0x03000000
+    key_obj = PyUnicode_FromString(key);
+#else
+    key_obj = PyString_FromString(key);
+#endif
+    if (key_obj == NULL) {
+        *result = NULL;
+        return -1;
+    }
+    rc = PyMapping_GetOptionalItem(obj, key_obj, result);
+    Py_DECREF(key_obj);
+    return rc;
+}
+#endif
+
+// gh-108511 added PyMapping_HasKeyWithError() and
+// PyMapping_HasKeyStringWithError() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyMapping_HasKeyWithError(PyObject *obj, PyObject *key)
+{
+    PyObject *res;
+    int rc = PyMapping_GetOptionalItem(obj, key, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+
+static inline int
+PyMapping_HasKeyStringWithError(PyObject *obj, const char *key)
+{
+    PyObject *res;
+    int rc = PyMapping_GetOptionalItemString(obj, key, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+#endif
+
+
+// gh-108511 added PyObject_HasAttrWithError() and
+// PyObject_HasAttrStringWithError() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyObject_HasAttrWithError(PyObject *obj, PyObject *attr)
+{
+    PyObject *res;
+    int rc = PyObject_GetOptionalAttr(obj, attr, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+
+static inline int
+PyObject_HasAttrStringWithError(PyObject *obj, const char *attr)
+{
+    PyObject *res;
+    int rc = PyObject_GetOptionalAttrString(obj, attr, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+#endif
+
+
+// gh-106004 added PyDict_GetItemRef() and PyDict_GetItemStringRef()
+// to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyDict_GetItemRef(PyObject *mp, PyObject *key, PyObject **result)
+{
+#if PY_VERSION_HEX >= 0x03000000
+    PyObject *item = PyDict_GetItemWithError(mp, key);
+#else
+    PyObject *item = _PyDict_GetItemWithError(mp, key);
+#endif
+    if (item != NULL) {
+        *result = Py_NewRef(item);
+        return 1;  // found
+    }
+    if (!PyErr_Occurred()) {
+        *result = NULL;
+        return 0;  // not found
+    }
+    *result = NULL;
+    return -1;
+}
+
+static inline int
+PyDict_GetItemStringRef(PyObject *mp, const char *key, PyObject **result)
+{
+    int res;
+#if PY_VERSION_HEX >= 0x03000000
+    PyObject *key_obj = PyUnicode_FromString(key);
+#else
+    PyObject *key_obj = PyString_FromString(key);
+#endif
+    if (key_obj == NULL) {
+        *result = NULL;
+        return -1;
+    }
+    res = PyDict_GetItemRef(mp, key_obj, result);
+    Py_DECREF(key_obj);
+    return res;
+}
+#endif
+
+
+// gh-106307 added PyModule_Add() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyModule_Add(PyObject *mod, const char *name, PyObject *value)
+{
+    int res = PyModule_AddObjectRef(mod, name, value);
+    Py_XDECREF(value);
+    return res;
+}
+#endif
+
+
+// gh-108014 added Py_IsFinalizing() to Python 3.13.0a1
+// bpo-1856 added _Py_Finalizing to Python 3.2.1b1.
+// _Py_IsFinalizing() was added to PyPy 7.3.0.
+#if (0x030201B1 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x030D00A1) \
+        && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x7030000)
+static inline int Py_IsFinalizing(void)
+{
+#if PY_VERSION_HEX >= 0x030700A1
+    // _Py_IsFinalizing() was added to Python 3.7.0a1.
+    return _Py_IsFinalizing();
+#else
+    return (_Py_Finalizing != NULL);
+#endif
+}
+#endif
+
+
+// gh-108323 added PyDict_ContainsString() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int PyDict_ContainsString(PyObject *op, const char *key)
+{
+    PyObject *key_obj = PyUnicode_FromString(key);
+    if (key_obj == NULL) {
+        return -1;
+    }
+    int res = PyDict_Contains(op, key_obj);
+    Py_DECREF(key_obj);
+    return res;
+}
+#endif
+
+
+// gh-108445 added PyLong_AsInt() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int PyLong_AsInt(PyObject *obj)
+{
+#ifdef PYPY_VERSION
+    long value = PyLong_AsLong(obj);
+    if (value == -1 && PyErr_Occurred()) {
+        return -1;
+    }
+    if (value < (long)INT_MIN || (long)INT_MAX < value) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "Python int too large to convert to C int");
+        return -1;
+    }
+    return (int)value;
+#else
+    return _PyLong_AsInt(obj);
+#endif
+}
+#endif
+
+
+// gh-107073 added PyObject_VisitManagedDict() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyObject_VisitManagedDict(PyObject *obj, visitproc visit, void *arg)
+{
+    PyObject **dict = _PyObject_GetDictPtr(obj);
+    if (*dict == NULL) {
+        return -1;
+    }
+    Py_VISIT(*dict);
+    return 0;
+}
+
+static inline void
+PyObject_ClearManagedDict(PyObject *obj)
+{
+    PyObject **dict = _PyObject_GetDictPtr(obj);
+    if (*dict == NULL) {
+        return;
+    }
+    Py_CLEAR(*dict);
+}
+#endif
+
+// gh-108867 added PyThreadState_GetUnchecked() to Python 3.13.0a1
+// Python 3.5.2 added _PyThreadState_UncheckedGet().
+#if PY_VERSION_HEX >= 0x03050200 && PY_VERSION_HEX < 0x030D00A1
+static inline PyThreadState*
+PyThreadState_GetUnchecked(void)
+{
+    return _PyThreadState_UncheckedGet();
+}
+#endif
+
+// gh-110289 added PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize()
+// to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t str_len)
+{
+    Py_ssize_t len;
+    const void *utf8;
+    PyObject *exc_type, *exc_value, *exc_tb;
+    int res;
+
+    // API cannot report errors so save/restore the exception
+    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+
+    // Python 3.3.0a1 added PyUnicode_AsUTF8AndSize()
+#if PY_VERSION_HEX >= 0x030300A1
+    if (PyUnicode_IS_ASCII(unicode)) {
+        utf8 = PyUnicode_DATA(unicode);
+        len = PyUnicode_GET_LENGTH(unicode);
+    }
+    else {
+        utf8 = PyUnicode_AsUTF8AndSize(unicode, &len);
+        if (utf8 == NULL) {
+            // Memory allocation failure. The API cannot report error,
+            // so ignore the exception and return 0.
+            res = 0;
+            goto done;
+        }
+    }
+
+    if (len != str_len) {
+        res = 0;
+        goto done;
+    }
+    res = (memcmp(utf8, str, (size_t)len) == 0);
+#else
+    PyObject *bytes = PyUnicode_AsUTF8String(unicode);
+    if (bytes == NULL) {
+        // Memory allocation failure. The API cannot report error,
+        // so ignore the exception and return 0.
+        res = 0;
+        goto done;
+    }
+
+#if PY_VERSION_HEX >= 0x03000000
+    len = PyBytes_GET_SIZE(bytes);
+    utf8 = PyBytes_AS_STRING(bytes);
+#else
+    len = PyString_GET_SIZE(bytes);
+    utf8 = PyString_AS_STRING(bytes);
+#endif
+    if (len != str_len) {
+        Py_DECREF(bytes);
+        res = 0;
+        goto done;
+    }
+
+    res = (memcmp(utf8, str, (size_t)len) == 0);
+    Py_DECREF(bytes);
+#endif
+
+done:
+    PyErr_Restore(exc_type, exc_value, exc_tb);
+    return res;
+}
+
+static inline int
+PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
+{
+    return PyUnicode_EqualToUTF8AndSize(unicode, str, (Py_ssize_t)strlen(str));
+}
+#endif
+
+
+// gh-111138 added PyList_Extend() and PyList_Clear() to Python 3.13.0a2
+#if PY_VERSION_HEX < 0x030D00A2
+static inline int
+PyList_Extend(PyObject *list, PyObject *iterable)
+{
+    return PyList_SetSlice(list, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, iterable);
+}
+
+static inline int
+PyList_Clear(PyObject *list)
+{
+    return PyList_SetSlice(list, 0, PY_SSIZE_T_MAX, NULL);
+}
+#endif
+
+// gh-111262 added PyDict_Pop() and PyDict_PopString() to Python 3.13.0a2
+#if PY_VERSION_HEX < 0x030D00A2
+static inline int
+PyDict_Pop(PyObject *dict, PyObject *key, PyObject **result)
+{
+    PyObject *value;
+
+    if (!PyDict_Check(dict)) {
+        PyErr_BadInternalCall();
+        if (result) {
+            *result = NULL;
+        }
+        return -1;
+    }
+
+    // bpo-16991 added _PyDict_Pop() to Python 3.5.0b2.
+    // Python 3.6.0b3 changed _PyDict_Pop() first argument type to PyObject*.
+    // Python 3.13.0a1 removed _PyDict_Pop().
+#if defined(PYPY_VERSION) || PY_VERSION_HEX < 0x030500b2 || PY_VERSION_HEX >= 0x030D0000
+    value = PyObject_CallMethod(dict, "pop", "O", key);
+#elif PY_VERSION_HEX < 0x030600b3
+    value = _PyDict_Pop(_Py_CAST(PyDictObject*, dict), key, NULL);
+#else
+    value = _PyDict_Pop(dict, key, NULL);
+#endif
+    if (value == NULL) {
+        if (result) {
+            *result = NULL;
+        }
+        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_KeyError)) {
+            return -1;
+        }
+        PyErr_Clear();
+        return 0;
+    }
+    if (result) {
+        *result = value;
+    }
+    else {
+        Py_DECREF(value);
+    }
+    return 1;
+}
+
+static inline int
+PyDict_PopString(PyObject *dict, const char *key, PyObject **result)
+{
+    PyObject *key_obj = PyUnicode_FromString(key);
+    if (key_obj == NULL) {
+        if (result != NULL) {
+            *result = NULL;
+        }
+        return -1;
+    }
+
+    int res = PyDict_Pop(dict, key_obj, result);
+    Py_DECREF(key_obj);
+    return res;
+}
+#endif
+
+
+#if PY_VERSION_HEX < 0x030200A4
+// Python 3.2.0a4 added Py_hash_t type
+typedef Py_ssize_t Py_hash_t;
+#endif
+
+
+// gh-111545 added Py_HashPointer() to Python 3.13.0a3
+#if PY_VERSION_HEX < 0x030D00A3
+static inline Py_hash_t Py_HashPointer(const void *ptr)
+{
+#if PY_VERSION_HEX >= 0x030900A4 && !defined(PYPY_VERSION)
+    return _Py_HashPointer(ptr);
+#else
+    return _Py_HashPointer(_Py_CAST(void*, ptr));
+#endif
+}
+#endif
+
+
+// Python 3.13a4 added a PyTime API.
+// Use the private API added to Python 3.5.
+#if PY_VERSION_HEX < 0x030D00A4 && PY_VERSION_HEX  >= 0x03050000
+typedef _PyTime_t PyTime_t;
+#define PyTime_MIN _PyTime_MIN
+#define PyTime_MAX _PyTime_MAX
+
+static inline double PyTime_AsSecondsDouble(PyTime_t t)
+{ return _PyTime_AsSecondsDouble(t); }
+
+static inline int PyTime_Monotonic(PyTime_t *result)
+{ return _PyTime_GetMonotonicClockWithInfo(result, NULL); }
+
+static inline int PyTime_Time(PyTime_t *result)
+{ return _PyTime_GetSystemClockWithInfo(result, NULL); }
+
+static inline int PyTime_PerfCounter(PyTime_t *result)
+{
+#if PY_VERSION_HEX >= 0x03070000 && !defined(PYPY_VERSION)
+    return _PyTime_GetPerfCounterWithInfo(result, NULL);
+#elif PY_VERSION_HEX >= 0x03070000
+    // Call time.perf_counter_ns() and convert Python int object to PyTime_t.
+    // Cache time.perf_counter_ns() function for best performance.
+    static PyObject *func = NULL;
+    if (func == NULL) {
+        PyObject *mod = PyImport_ImportModule("time");
+        if (mod == NULL) {
+            return -1;
+        }
+
+        func = PyObject_GetAttrString(mod, "perf_counter_ns");
+        Py_DECREF(mod);
+        if (func == NULL) {
+            return -1;
+        }
+    }
+
+    PyObject *res = PyObject_CallNoArgs(func);
+    if (res == NULL) {
+        return -1;
+    }
+    long long value = PyLong_AsLongLong(res);
+    Py_DECREF(res);
+
+    if (value == -1 && PyErr_Occurred()) {
+        return -1;
+    }
+
+    Py_BUILD_ASSERT(sizeof(value) >= sizeof(PyTime_t));
+    *result = (PyTime_t)value;
+    return 0;
+#else
+    // Call time.perf_counter() and convert C double to PyTime_t.
+    // Cache time.perf_counter() function for best performance.
+    static PyObject *func = NULL;
+    if (func == NULL) {
+        PyObject *mod = PyImport_ImportModule("time");
+        if (mod == NULL) {
+            return -1;
+        }
+
+        func = PyObject_GetAttrString(mod, "perf_counter");
+        Py_DECREF(mod);
+        if (func == NULL) {
+            return -1;
+        }
+    }
+
+    PyObject *res = PyObject_CallNoArgs(func);
+    if (res == NULL) {
+        return -1;
+    }
+    double d = PyFloat_AsDouble(res);
+    Py_DECREF(res);
+
+    if (d == -1.0 && PyErr_Occurred()) {
+        return -1;
+    }
+
+    // Avoid floor() to avoid having to link to libm
+    *result = (PyTime_t)(d * 1e9);
+    return 0;
+#endif
+}
+
+#endif
+
+// gh-111389 added hash constants to Python 3.13.0a5. These constants were
+// added first as private macros to Python 3.4.0b1 and PyPy 7.3.9.
+#if (!defined(PyHASH_BITS) \
+     && ((!defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x030400B1) \
+         || (defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x03070000 \
+             && PYPY_VERSION_NUM >= 0x07090000)))
+#  define PyHASH_BITS _PyHASH_BITS
+#  define PyHASH_MODULUS _PyHASH_MODULUS
+#  define PyHASH_INF _PyHASH_INF
+#  define PyHASH_IMAG _PyHASH_IMAG
+#endif
+
+
+// gh-111545 added Py_GetConstant() and Py_GetConstantBorrowed()
+// to Python 3.13.0a6
+#if PY_VERSION_HEX < 0x030D00A6 && !defined(Py_CONSTANT_NONE)
+
+#define Py_CONSTANT_NONE 0
+#define Py_CONSTANT_FALSE 1
+#define Py_CONSTANT_TRUE 2
+#define Py_CONSTANT_ELLIPSIS 3
+#define Py_CONSTANT_NOT_IMPLEMENTED 4
+#define Py_CONSTANT_ZERO 5
+#define Py_CONSTANT_ONE 6
+#define Py_CONSTANT_EMPTY_STR 7
+#define Py_CONSTANT_EMPTY_BYTES 8
+#define Py_CONSTANT_EMPTY_TUPLE 9
+
+static inline PyObject* Py_GetConstant(unsigned int constant_id)
+{
+    static PyObject* constants[Py_CONSTANT_EMPTY_TUPLE + 1] = {NULL};
+
+    if (constants[Py_CONSTANT_NONE] == NULL) {
+        constants[Py_CONSTANT_NONE] = Py_None;
+        constants[Py_CONSTANT_FALSE] = Py_False;
+        constants[Py_CONSTANT_TRUE] = Py_True;
+        constants[Py_CONSTANT_ELLIPSIS] = Py_Ellipsis;
+        constants[Py_CONSTANT_NOT_IMPLEMENTED] = Py_NotImplemented;
+
+        constants[Py_CONSTANT_ZERO] = PyLong_FromLong(0);
+        if (constants[Py_CONSTANT_ZERO] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_ONE] = PyLong_FromLong(1);
+        if (constants[Py_CONSTANT_ONE] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_EMPTY_STR] = PyUnicode_FromStringAndSize("", 0);
+        if (constants[Py_CONSTANT_EMPTY_STR] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_EMPTY_BYTES] = PyBytes_FromStringAndSize("", 0);
+        if (constants[Py_CONSTANT_EMPTY_BYTES] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_EMPTY_TUPLE] = PyTuple_New(0);
+        if (constants[Py_CONSTANT_EMPTY_TUPLE] == NULL) {
+            goto fatal_error;
+        }
+        // goto dance to avoid compiler warnings about Py_FatalError()
+        goto init_done;
+
+fatal_error:
+        // This case should never happen
+        Py_FatalError("Py_GetConstant() failed to get constants");
+    }
+
+init_done:
+    if (constant_id <= Py_CONSTANT_EMPTY_TUPLE) {
+        return Py_NewRef(constants[constant_id]);
+    }
+    else {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+}
+
+static inline PyObject* Py_GetConstantBorrowed(unsigned int constant_id)
+{
+    PyObject *obj = Py_GetConstant(constant_id);
+    Py_XDECREF(obj);
+    return obj;
+}
+#endif
+
+
+// gh-114329 added PyList_GetItemRef() to Python 3.13.0a4
+#if PY_VERSION_HEX < 0x030D00A4
+static inline PyObject *
+PyList_GetItemRef(PyObject *op, Py_ssize_t index)
+{
+    PyObject *item = PyList_GetItem(op, index);
+    Py_XINCREF(item);
+    return item;
+}
+#endif
+
+
+// gh-114329 added PyList_GetItemRef() to Python 3.13.0a4
+#if PY_VERSION_HEX < 0x030D00A4
+static inline int
+PyDict_SetDefaultRef(PyObject *d, PyObject *key, PyObject *default_value,
+                     PyObject **result)
+{
+    PyObject *value;
+    if (PyDict_GetItemRef(d, key, &value) < 0) {
+        // get error
+        if (result) {
+            *result = NULL;
+        }
+        return -1;
+    }
+    if (value != NULL) {
+        // present
+        if (result) {
+            *result = value;
+        }
+        else {
+            Py_DECREF(value);
+        }
+        return 1;
+    }
+
+    // missing: set the item
+    if (PyDict_SetItem(d, key, default_value) < 0) {
+        // set error
+        if (result) {
+            *result = NULL;
+        }
+        return -1;
+    }
+    if (result) {
+        *result = Py_NewRef(default_value);
+    }
+    return 0;
+}
+#endif
+
+#if PY_VERSION_HEX < 0x030E0000 && PY_VERSION_HEX >= 0x03060000 && !defined(PYPY_VERSION)
+typedef struct PyUnicodeWriter PyUnicodeWriter;
+
+static inline void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
+{
+    _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
+    PyMem_Free(writer);
+}
+
+static inline PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length)
+{
+    if (length < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "length must be positive");
+        return NULL;
+    }
+
+    const size_t size = sizeof(_PyUnicodeWriter);
+    PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
+    if (pub_writer == _Py_NULL) {
+        PyErr_NoMemory();
+        return _Py_NULL;
+    }
+    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
+
+    _PyUnicodeWriter_Init(writer);
+    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
+        PyUnicodeWriter_Discard(pub_writer);
+        return NULL;
+    }
+    writer->overallocate = 1;
+    return pub_writer;
+}
+
+static inline PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
+{
+    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
+    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
+    PyMem_Free(writer);
+    return str;
+}
+
+static inline int
+PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
+{
+    if (ch > 0x10ffff) {
+        PyErr_SetString(PyExc_ValueError,
+                        "character must be in range(0x110000)");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
+}
+
+static inline int
+PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
+{
+    PyObject *str = PyObject_Str(obj);
+    if (str == NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
+    Py_DECREF(str);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
+{
+    PyObject *str = PyObject_Repr(obj);
+    if (str == NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
+    Py_DECREF(str);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
+                          const char *str, Py_ssize_t size)
+{
+    if (size < 0) {
+        size = (Py_ssize_t)strlen(str);
+    }
+
+    PyObject *str_obj = PyUnicode_FromStringAndSize(str, size);
+    if (str_obj == _Py_NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj);
+    Py_DECREF(str_obj);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
+                              const wchar_t *str, Py_ssize_t size)
+{
+    if (size < 0) {
+        size = (Py_ssize_t)wcslen(str);
+    }
+
+    PyObject *str_obj = PyUnicode_FromWideChar(str, size);
+    if (str_obj == _Py_NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj);
+    Py_DECREF(str_obj);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
+                               Py_ssize_t start, Py_ssize_t end)
+{
+    if (!PyUnicode_Check(str)) {
+        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
+        return -1;
+    }
+    if (start < 0 || start > end) {
+        PyErr_Format(PyExc_ValueError, "invalid start argument");
+        return -1;
+    }
+    if (end > PyUnicode_GET_LENGTH(str)) {
+        PyErr_Format(PyExc_ValueError, "invalid end argument");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
+                                           start, end);
+}
+
+static inline int
+PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
+{
+    va_list vargs;
+    va_start(vargs, format);
+    PyObject *str = PyUnicode_FromFormatV(format, vargs);
+    va_end(vargs);
+    if (str == _Py_NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
+    Py_DECREF(str);
+    return res;
+}
+#endif  // PY_VERSION_HEX < 0x030E0000
+
+// gh-116560 added PyLong_GetSign() to Python 3.14.0a0
+#if PY_VERSION_HEX < 0x030E00A0
+static inline int PyLong_GetSign(PyObject *obj, int *sign)
+{
+    if (!PyLong_Check(obj)) {
+        PyErr_Format(PyExc_TypeError, "expect int, got %s", Py_TYPE(obj)->tp_name);
+        return -1;
+    }
+
+    *sign = _PyLong_Sign(obj);
+    return 0;
+}
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // PYTHONCAPI_COMPAT
diff --git a/pyarrow/include/arrow/python/visibility.h b/pyarrow/include/arrow/python/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9680a06bf016478232f3914d3728bfd9ebffd
--- /dev/null
+++ b/pyarrow/include/arrow/python/visibility.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_PYTHON_STATIC
+#    define ARROW_PYTHON_EXPORT
+#  elif defined(ARROW_PYTHON_EXPORTING)
+#    define ARROW_PYTHON_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_PYTHON_EXPORT __declspec(dllimport)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_PYTHON_EXPORT
+#    define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif  // Non-Windows
diff --git a/pyarrow/include/arrow/record_batch.h b/pyarrow/include/arrow/record_batch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0d1d2d4ac359c6d96f91ba5bdf3b5b123597c9f7
--- /dev/null
+++ b/pyarrow/include/arrow/record_batch.h
@@ -0,0 +1,481 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/compare.h"
+#include "arrow/device.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \class RecordBatch
+/// \brief Collection of equal-length arrays matching a particular Schema
+///
+/// A record batch is table-like data structure that is semantically a sequence
+/// of fields, each a contiguous Arrow array
+class ARROW_EXPORT RecordBatch {
+ public:
+  virtual ~RecordBatch() = default;
+
+  /// \param[in] schema The record batch schema
+  /// \param[in] num_rows length of fields in the record batch. Each array
+  /// should have the same length as num_rows
+  /// \param[in] columns the record batch fields as vector of arrays
+  /// \param[in] sync_event optional synchronization event for non-CPU device
+  /// memory used by buffers
+  static std::shared_ptr<RecordBatch> Make(
+      std::shared_ptr<Schema> schema, int64_t num_rows,
+      std::vector<std::shared_ptr<Array>> columns,
+      std::shared_ptr<Device::SyncEvent> sync_event = NULLPTR);
+
+  /// \brief Construct record batch from vector of internal data structures
+  /// \since 0.5.0
+  ///
+  /// This class is intended for internal use, or advanced users.
+  ///
+  /// \param schema the record batch schema
+  /// \param num_rows the number of semantic rows in the record batch. This
+  /// should be equal to the length of each field
+  /// \param columns the data for the batch's columns
+  /// \param device_type the type of the device that the Arrow columns are
+  /// allocated on
+  /// \param sync_event optional synchronization event for non-CPU device
+  /// memory used by buffers
+  static std::shared_ptr<RecordBatch> Make(
+      std::shared_ptr<Schema> schema, int64_t num_rows,
+      std::vector<std::shared_ptr<ArrayData>> columns,
+      DeviceAllocationType device_type = DeviceAllocationType::kCPU,
+      std::shared_ptr<Device::SyncEvent> sync_event = NULLPTR);
+
+  /// \brief Create an empty RecordBatch of a given schema
+  ///
+  /// The output RecordBatch will be created with DataTypes from
+  /// the given schema.
+  ///
+  /// \param[in] schema the schema of the empty RecordBatch
+  /// \param[in] pool the memory pool to allocate memory from
+  /// \return the resulting RecordBatch
+  static Result<std::shared_ptr<RecordBatch>> MakeEmpty(
+      std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Convert record batch to struct array
+  ///
+  /// Create a struct array whose child arrays are the record batch's columns.
+  /// Note that the record batch's top-level field metadata cannot be reflected
+  /// in the resulting struct array.
+  Result<std::shared_ptr<StructArray>> ToStructArray() const;
+
+  /// \brief Convert record batch with one data type to Tensor
+  ///
+  /// Create a Tensor object with shape (number of rows, number of columns) and
+  /// strides (type size in bytes, type size in bytes * number of rows).
+  /// Generated Tensor will have column-major layout.
+  ///
+  /// \param[in] null_to_nan if true, convert nulls to NaN
+  /// \param[in] row_major if true, create row-major Tensor else column-major Tensor
+  /// \param[in] pool the memory pool to allocate the tensor buffer
+  /// \return the resulting Tensor
+  Result<std::shared_ptr<Tensor>> ToTensor(
+      bool null_to_nan = false, bool row_major = true,
+      MemoryPool* pool = default_memory_pool()) const;
+
+  /// \brief Construct record batch from struct array
+  ///
+  /// This constructs a record batch using the child arrays of the given
+  /// array, which must be a struct array.
+  ///
+  /// \param[in] array the source array, must be a StructArray
+  /// \param[in] pool the memory pool to allocate new validity bitmaps
+  ///
+  /// This operation will usually be zero-copy.  However, if the struct array has an
+  /// offset or a validity bitmap then these will need to be pushed into the child arrays.
+  /// Pushing the offset is zero-copy but pushing the validity bitmap is not.
+  static Result<std::shared_ptr<RecordBatch>> FromStructArray(
+      const std::shared_ptr<Array>& array, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Determine if two record batches are equal
+  ///
+  /// \param[in] other the RecordBatch to compare with
+  /// \param[in] check_metadata if true, the schema metadata will be compared,
+  ///            regardless of the value set in \ref EqualOptions::use_metadata
+  /// \param[in] opts the options for equality comparisons
+  /// \return true if batches are equal
+  bool Equals(const RecordBatch& other, bool check_metadata = false,
+              const EqualOptions& opts = EqualOptions::Defaults()) const;
+
+  /// \brief Determine if two record batches are equal
+  ///
+  /// \param[in] other the RecordBatch to compare with
+  /// \param[in] opts the options for equality comparisons
+  /// \return true if batches are equal
+  bool Equals(const RecordBatch& other, const EqualOptions& opts) const;
+
+  /// \brief Determine if two record batches are approximately equal
+  ///
+  /// \param[in] other the RecordBatch to compare with
+  /// \param[in] opts the options for equality comparisons
+  /// \return true if batches are approximately equal
+  bool ApproxEquals(const RecordBatch& other,
+                    const EqualOptions& opts = EqualOptions::Defaults()) const {
+    return Equals(other, opts.use_schema(false).use_atol(true));
+  }
+
+  /// \return the record batch's schema
+  const std::shared_ptr<Schema>& schema() const { return schema_; }
+
+  /// \brief Replace the schema with another schema with the same types, but potentially
+  /// different field names and/or metadata.
+  Result<std::shared_ptr<RecordBatch>> ReplaceSchema(
+      std::shared_ptr<Schema> schema) const;
+
+  /// \brief Retrieve all columns at once
+  virtual const std::vector<std::shared_ptr<Array>>& columns() const = 0;
+
+  /// \brief Retrieve an array from the record batch
+  /// \param[in] i field index, does not boundscheck
+  /// \return an Array object
+  virtual std::shared_ptr<Array> column(int i) const = 0;
+
+  /// \brief Retrieve an array from the record batch
+  /// \param[in] name field name
+  /// \return an Array or null if no field was found
+  std::shared_ptr<Array> GetColumnByName(const std::string& name) const;
+
+  /// \brief Retrieve an array's internal data from the record batch
+  /// \param[in] i field index, does not boundscheck
+  /// \return an internal ArrayData object
+  virtual std::shared_ptr<ArrayData> column_data(int i) const = 0;
+
+  /// \brief Retrieve all arrays' internal data from the record batch.
+  virtual const ArrayDataVector& column_data() const = 0;
+
+  /// \brief Add column to the record batch, producing a new RecordBatch
+  ///
+  /// \param[in] i field index, which will be boundschecked
+  /// \param[in] field field to be added
+  /// \param[in] column column to be added
+  virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
+      int i, const std::shared_ptr<Field>& field,
+      const std::shared_ptr<Array>& column) const = 0;
+
+  /// \brief Add new nullable column to the record batch, producing a new
+  /// RecordBatch.
+  ///
+  /// For non-nullable columns, use the Field-based version of this method.
+  ///
+  /// \param[in] i field index, which will be boundschecked
+  /// \param[in] field_name name of field to be added
+  /// \param[in] column column to be added
+  virtual Result<std::shared_ptr<RecordBatch>> AddColumn(
+      int i, std::string field_name, const std::shared_ptr<Array>& column) const;
+
+  /// \brief Replace a column in the record batch, producing a new RecordBatch
+  ///
+  /// \param[in] i field index, does boundscheck
+  /// \param[in] field field to be replaced
+  /// \param[in] column column to be replaced
+  virtual Result<std::shared_ptr<RecordBatch>> SetColumn(
+      int i, const std::shared_ptr<Field>& field,
+      const std::shared_ptr<Array>& column) const = 0;
+
+  /// \brief Remove column from the record batch, producing a new RecordBatch
+  ///
+  /// \param[in] i field index, does boundscheck
+  virtual Result<std::shared_ptr<RecordBatch>> RemoveColumn(int i) const = 0;
+
+  virtual std::shared_ptr<RecordBatch> ReplaceSchemaMetadata(
+      const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
+
+  /// \brief Name in i-th column
+  const std::string& column_name(int i) const;
+
+  /// \return the number of columns in the table
+  int num_columns() const;
+
+  /// \return the number of rows (the corresponding length of each column)
+  int64_t num_rows() const { return num_rows_; }
+
+  /// \brief Copy the entire RecordBatch to destination MemoryManager
+  ///
+  /// This uses Array::CopyTo on each column of the record batch to create
+  /// a new record batch where all underlying buffers for the columns have
+  /// been copied to the destination MemoryManager. This uses
+  /// MemoryManager::CopyBuffer under the hood.
+  Result<std::shared_ptr<RecordBatch>> CopyTo(
+      const std::shared_ptr<MemoryManager>& to) const;
+
+  /// \brief View or Copy the entire RecordBatch to destination MemoryManager
+  ///
+  /// This uses Array::ViewOrCopyTo on each column of the record batch to create
+  /// a new record batch where all underlying buffers for the columns have
+  /// been zero-copy viewed on the destination MemoryManager, falling back
+  /// to performing a copy if it can't be viewed as a zero-copy buffer. This uses
+  /// Buffer::ViewOrCopy under the hood.
+  Result<std::shared_ptr<RecordBatch>> ViewOrCopyTo(
+      const std::shared_ptr<MemoryManager>& to) const;
+
+  /// \brief Slice each of the arrays in the record batch
+  /// \param[in] offset the starting offset to slice, through end of batch
+  /// \return new record batch
+  virtual std::shared_ptr<RecordBatch> Slice(int64_t offset) const;
+
+  /// \brief Slice each of the arrays in the record batch
+  /// \param[in] offset the starting offset to slice
+  /// \param[in] length the number of elements to slice from offset
+  /// \return new record batch
+  virtual std::shared_ptr<RecordBatch> Slice(int64_t offset, int64_t length) const = 0;
+
+  /// \return PrettyPrint representation suitable for debugging
+  std::string ToString() const;
+
+  /// \brief Return names of all columns
+  std::vector<std::string> ColumnNames() const;
+
+  /// \brief Rename columns with provided names
+  Result<std::shared_ptr<RecordBatch>> RenameColumns(
+      const std::vector<std::string>& names) const;
+
+  /// \brief Return new record batch with specified columns
+  Result<std::shared_ptr<RecordBatch>> SelectColumns(
+      const std::vector<int>& indices) const;
+
+  /// \brief Perform cheap validation checks to determine obvious inconsistencies
+  /// within the record batch's schema and internal data.
+  ///
+  /// This is O(k) where k is the total number of fields and array descendents.
+  ///
+  /// \return Status
+  virtual Status Validate() const;
+
+  /// \brief Perform extensive validation checks to determine inconsistencies
+  /// within the record batch's schema and internal data.
+  ///
+  /// This is potentially O(k*n) where n is the number of rows.
+  ///
+  /// \return Status
+  virtual Status ValidateFull() const;
+
+  /// \brief EXPERIMENTAL: Return a top-level sync event object for this record batch
+  ///
+  /// If all of the data for this record batch is in CPU memory, then this
+  /// will return null. If the data for this batch is
+  /// on a device, then if synchronization is needed before accessing the
+  /// data the returned sync event will allow for it.
+  ///
+  /// \return null or a Device::SyncEvent
+  virtual const std::shared_ptr<Device::SyncEvent>& GetSyncEvent() const = 0;
+
+  virtual DeviceAllocationType device_type() const = 0;
+
+  /// \brief Create a statistics array of this record batch
+  ///
+  /// The created array follows the C data interface statistics
+  /// specification. See
+  /// https://arrow.apache.org/docs/format/StatisticsSchema.html
+  /// for details.
+  ///
+  /// \param[in] pool the memory pool to allocate memory from
+  /// \return the statistics array of this record batch
+  Result<std::shared_ptr<Array>> MakeStatisticsArray(
+      MemoryPool* pool = default_memory_pool()) const;
+
+ protected:
+  RecordBatch(std::shared_ptr<Schema> schema, int64_t num_rows);
+
+  std::shared_ptr<Schema> schema_;
+  int64_t num_rows_;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatch);
+};
+
+struct ARROW_EXPORT RecordBatchWithMetadata {
+  std::shared_ptr<RecordBatch> batch;
+  std::shared_ptr<KeyValueMetadata> custom_metadata;
+};
+
+template <>
+struct IterationTraits<RecordBatchWithMetadata> {
+  static RecordBatchWithMetadata End() { return {NULLPTR, NULLPTR}; }
+  static bool IsEnd(const RecordBatchWithMetadata& val) { return val.batch == NULLPTR; }
+};
+
+/// \brief Abstract interface for reading stream of record batches
+class ARROW_EXPORT RecordBatchReader {
+ public:
+  using ValueType = std::shared_ptr<RecordBatch>;
+
+  virtual ~RecordBatchReader();
+
+  /// \return the shared schema of the record batches in the stream
+  virtual std::shared_ptr<Schema> schema() const = 0;
+
+  /// \brief Read the next record batch in the stream. Return null for batch
+  /// when reaching end of stream
+  ///
+  /// Example:
+  ///
+  /// ```
+  /// while (true) {
+  ///   std::shared_ptr<RecordBatch> batch;
+  ///   ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));
+  ///   if (!batch) {
+  ///     break;
+  ///   }
+  ///   // handling the `batch`, the `batch->num_rows()`
+  ///   // might be 0.
+  /// }
+  /// ```
+  ///
+  /// \param[out] batch the next loaded batch, null at end of stream. Returning
+  /// an empty batch doesn't mean the end of stream because it is valid data.
+  /// \return Status
+  virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0;
+
+  virtual Result<RecordBatchWithMetadata> ReadNext() {
+    return Status::NotImplemented("ReadNext with custom metadata");
+  }
+
+  /// \brief Iterator interface
+  Result<std::shared_ptr<RecordBatch>> Next() {
+    std::shared_ptr<RecordBatch> batch;
+    ARROW_RETURN_NOT_OK(ReadNext(&batch));
+    return batch;
+  }
+
+  /// \brief finalize reader
+  virtual Status Close() { return Status::OK(); }
+
+  /// \brief EXPERIMENTAL: Get the device type for record batches this reader produces
+  ///
+  /// default implementation is to return DeviceAllocationType::kCPU
+  virtual DeviceAllocationType device_type() const { return DeviceAllocationType::kCPU; }
+
+  class RecordBatchReaderIterator {
+   public:
+    using iterator_category = std::input_iterator_tag;
+    using difference_type = std::ptrdiff_t;
+    using value_type = std::shared_ptr<RecordBatch>;
+    using pointer = const value_type*;
+    using reference = const value_type&;
+
+    RecordBatchReaderIterator() : batch_(RecordBatchEnd()), reader_(NULLPTR) {}
+
+    explicit RecordBatchReaderIterator(RecordBatchReader* reader)
+        : batch_(RecordBatchEnd()), reader_(reader) {
+      Next();
+    }
+
+    bool operator==(const RecordBatchReaderIterator& other) const {
+      return batch_ == other.batch_;
+    }
+
+    bool operator!=(const RecordBatchReaderIterator& other) const {
+      return !(*this == other);
+    }
+
+    Result<std::shared_ptr<RecordBatch>> operator*() {
+      ARROW_RETURN_NOT_OK(batch_);
+
+      return batch_;
+    }
+
+    RecordBatchReaderIterator& operator++() {
+      Next();
+      return *this;
+    }
+
+    RecordBatchReaderIterator operator++(int) {
+      RecordBatchReaderIterator tmp(*this);
+      Next();
+      return tmp;
+    }
+
+   private:
+    std::shared_ptr<RecordBatch> RecordBatchEnd() {
+      return std::shared_ptr<RecordBatch>(NULLPTR);
+    }
+
+    void Next() {
+      if (reader_ == NULLPTR) {
+        batch_ = RecordBatchEnd();
+        return;
+      }
+      batch_ = reader_->Next();
+    }
+
+    Result<std::shared_ptr<RecordBatch>> batch_;
+    RecordBatchReader* reader_;
+  };
+  /// \brief Return an iterator to the first record batch in the stream
+  RecordBatchReaderIterator begin() { return RecordBatchReaderIterator(this); }
+
+  /// \brief Return an iterator to the end of the stream
+  RecordBatchReaderIterator end() { return RecordBatchReaderIterator(); }
+
+  /// \brief Consume entire stream as a vector of record batches
+  Result<RecordBatchVector> ToRecordBatches();
+
+  /// \brief Read all batches and concatenate as arrow::Table
+  Result<std::shared_ptr<Table>> ToTable();
+
+  /// \brief Create a RecordBatchReader from a vector of RecordBatch.
+  ///
+  /// \param[in] batches the vector of RecordBatch to read from
+  /// \param[in] schema schema to conform to. Will be inferred from the first
+  ///            element if not provided.
+  /// \param[in] device_type the type of device that the batches are allocated on
+  static Result<std::shared_ptr<RecordBatchReader>> Make(
+      RecordBatchVector batches, std::shared_ptr<Schema> schema = NULLPTR,
+      DeviceAllocationType device_type = DeviceAllocationType::kCPU);
+
+  /// \brief Create a RecordBatchReader from an Iterator of RecordBatch.
+  ///
+  /// \param[in] batches an iterator of RecordBatch to read from.
+  /// \param[in] schema schema that each record batch in iterator will conform to.
+  /// \param[in] device_type the type of device that the batches are allocated on
+  static Result<std::shared_ptr<RecordBatchReader>> MakeFromIterator(
+      Iterator<std::shared_ptr<RecordBatch>> batches, std::shared_ptr<Schema> schema,
+      DeviceAllocationType device_type = DeviceAllocationType::kCPU);
+};
+
+/// \brief Concatenate record batches
+///
+/// The columns of the new batch are formed by concatenate the same columns of each input
+/// batch. Concatenate multiple batches into a new batch requires that the schema must be
+/// consistent. It supports merging batches without columns (only length, scenarios such
+/// as count(*)).
+///
+/// \param[in] batches a vector of record batches to be concatenated
+/// \param[in] pool memory to store the result will be allocated from this memory pool
+/// \return the concatenated record batch
+ARROW_EXPORT
+Result<std::shared_ptr<RecordBatch>> ConcatenateRecordBatches(
+    const RecordBatchVector& batches, MemoryPool* pool = default_memory_pool());
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/result.h b/pyarrow/include/arrow/result.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b25de69486417e2be040017edaf646a3f25b98c
--- /dev/null
+++ b/pyarrow/include/arrow/result.h
@@ -0,0 +1,521 @@
+//
+// Copyright 2017 Asylo authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Adapted from Asylo
+
+#pragma once
+
+#include <cstddef>
+#include <new>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/util/aligned_storage.h"
+#include "arrow/util/compare.h"
+
+namespace arrow {
+
+template <typename>
+struct EnsureResult;
+
+namespace internal {
+
+ARROW_EXPORT void DieWithMessage(const std::string& msg);
+
+ARROW_EXPORT void InvalidValueOrDie(const Status& st);
+
+ARROW_EXPORT Status UninitializedResult();
+
+}  // namespace internal
+
+/// A class for representing either a usable value, or an error.
+///
+/// A Result object either contains a value of type `T` or a Status object
+/// explaining why such a value is not present. The type `T` must be
+/// copy-constructible and/or move-constructible.
+///
+/// The state of a Result object may be determined by calling ok() or
+/// status(). The ok() method returns true if the object contains a valid value.
+/// The status() method returns the internal Status object. A Result object
+/// that contains a valid value will return an OK Status for a call to status().
+///
+/// A value of type `T` may be extracted from a Result object through a call
+/// to ValueOrDie(). This function should only be called if a call to ok()
+/// returns true. Sample usage:
+///
+/// ```
+///   arrow::Result<Foo> result = CalculateFoo();
+///   if (result.ok()) {
+///     Foo foo = result.ValueOrDie();
+///     foo.DoSomethingCool();
+///   } else {
+///     ARROW_LOG(ERROR) << result.status();
+///  }
+/// ```
+///
+/// If `T` is a move-only type, like `std::unique_ptr<>`, then the value should
+/// only be extracted after invoking `std::move()` on the Result object.
+/// Sample usage:
+///
+/// ```
+///   arrow::Result<std::unique_ptr<Foo>> result = CalculateFoo();
+///   if (result.ok()) {
+///     std::unique_ptr<Foo> foo = std::move(result).ValueOrDie();
+///     foo->DoSomethingCool();
+///   } else {
+///     ARROW_LOG(ERROR) << result.status();
+///   }
+/// ```
+///
+/// Result is provided for the convenience of implementing functions that
+/// return some value but may fail during execution. For instance, consider a
+/// function with the following signature:
+///
+/// ```
+///   arrow::Status CalculateFoo(int *output);
+/// ```
+///
+/// This function may instead be written as:
+///
+/// ```
+///   arrow::Result<int> CalculateFoo();
+/// ```
+template <class T>
+class [[nodiscard]] Result : public util::EqualityComparable<Result<T>> {
+  template <typename U>
+  friend class Result;
+
+  static_assert(!std::is_same<T, Status>::value,
+                "this assert indicates you have probably made a metaprogramming error");
+
+ public:
+  using ValueType = T;
+
+  /// Constructs a Result object that contains a non-OK status.
+  ///
+  /// This constructor is marked `explicit` to prevent attempts to `return {}`
+  /// from a function with a return type of, for example,
+  /// `Result<std::vector<int>>`. While `return {}` seems like it would return
+  /// an empty vector, it will actually invoke the default constructor of
+  /// Result.
+  explicit Result() noexcept  // NOLINT(runtime/explicit)
+      : status_(internal::UninitializedResult()) {}
+
+  ~Result() noexcept { Destroy(); }
+
+  /// Constructs a Result object with the given non-OK Status object. All
+  /// calls to ValueOrDie() on this object will abort. The given `status` must
+  /// not be an OK status, otherwise this constructor will abort.
+  ///
+  /// This constructor is not declared explicit so that a function with a return
+  /// type of `Result<T>` can return a Status object, and the status will be
+  /// implicitly converted to the appropriate return type as a matter of
+  /// convenience.
+  ///
+  /// \param status The non-OK Status object to initialize to.
+  Result(const Status& status) noexcept  // NOLINT(runtime/explicit)
+      : status_(status) {
+    if (ARROW_PREDICT_FALSE(status.ok())) {
+      internal::DieWithMessage(std::string("Constructed with a non-error status: ") +
+                               status.ToString());
+    }
+  }
+
+  /// Constructs a Result object that contains `value`. The resulting object
+  /// is considered to have an OK status. The wrapped element can be accessed
+  /// with ValueOrDie().
+  ///
+  /// This constructor is made implicit so that a function with a return type of
+  /// `Result<T>` can return an object of type `U &&`, implicitly converting
+  /// it to a `Result<T>` object.
+  ///
+  /// Note that `T` must be implicitly constructible from `U`, and `U` must not
+  /// be a (cv-qualified) Status or Status-reference type. Due to C++
+  /// reference-collapsing rules and perfect-forwarding semantics, this
+  /// constructor matches invocations that pass `value` either as a const
+  /// reference or as an rvalue reference. Since Result needs to work for both
+  /// reference and rvalue-reference types, the constructor uses perfect
+  /// forwarding to avoid invalidating arguments that were passed by reference.
+  /// See http://thbecker.net/articles/rvalue_references/section_08.html for
+  /// additional details.
+  ///
+  /// \param value The value to initialize to.
+  template <typename U,
+            typename E = typename std::enable_if<
+                std::is_constructible<T, U>::value && std::is_convertible<U, T>::value &&
+                !std::is_same<typename std::remove_reference<
+                                  typename std::remove_cv<U>::type>::type,
+                              Status>::value>::type>
+  Result(U&& value) noexcept {  // NOLINT(runtime/explicit)
+    ConstructValue(std::forward<U>(value));
+  }
+
+  /// Constructs a Result object that contains `value`. The resulting object
+  /// is considered to have an OK status. The wrapped element can be accessed
+  /// with ValueOrDie().
+  ///
+  /// This constructor is made implicit so that a function with a return type of
+  /// `Result<T>` can return an object of type `T`, implicitly converting
+  /// it to a `Result<T>` object.
+  ///
+  /// \param value The value to initialize to.
+  // NOTE `Result(U&& value)` above should be sufficient, but some compilers
+  // fail matching it.
+  Result(T&& value) noexcept {  // NOLINT(runtime/explicit)
+    ConstructValue(std::move(value));
+  }
+
+  /// Copy constructor.
+  ///
+  /// This constructor needs to be explicitly defined because the presence of
+  /// the move-assignment operator deletes the default copy constructor. In such
+  /// a scenario, since the deleted copy constructor has stricter binding rules
+  /// than the templated copy constructor, the templated constructor cannot act
+  /// as a copy constructor, and any attempt to copy-construct a `Result`
+  /// object results in a compilation error.
+  ///
+  /// \param other The value to copy from.
+  Result(const Result& other) noexcept : status_(other.status_) {
+    if (ARROW_PREDICT_TRUE(status_.ok())) {
+      ConstructValue(other.ValueUnsafe());
+    }
+  }
+
+  /// Templatized constructor that constructs a `Result<T>` from a const
+  /// reference to a `Result<U>`.
+  ///
+  /// `T` must be implicitly constructible from `const U &`.
+  ///
+  /// \param other The value to copy from.
+  template <typename U, typename E = typename std::enable_if<
+                            std::is_constructible<T, const U&>::value &&
+                            std::is_convertible<U, T>::value>::type>
+  Result(const Result<U>& other) noexcept : status_(other.status_) {
+    if (ARROW_PREDICT_TRUE(status_.ok())) {
+      ConstructValue(other.ValueUnsafe());
+    }
+  }
+
+  /// Copy-assignment operator.
+  ///
+  /// \param other The Result object to copy.
+  Result& operator=(const Result& other) noexcept {
+    // Check for self-assignment.
+    if (ARROW_PREDICT_FALSE(this == &other)) {
+      return *this;
+    }
+    Destroy();
+    status_ = other.status_;
+    if (ARROW_PREDICT_TRUE(status_.ok())) {
+      ConstructValue(other.ValueUnsafe());
+    }
+    return *this;
+  }
+
+  /// Templatized constructor which constructs a `Result<T>` by moving the
+  /// contents of a `Result<U>`. `T` must be implicitly constructible from `U
+  /// &&`.
+  ///
+  /// Sets `other` to contain a non-OK status with a `StatusError::Invalid`
+  /// error code.
+  ///
+  /// \param other The Result object to move from and set to a non-OK status.
+  template <typename U,
+            typename E = typename std::enable_if<std::is_constructible<T, U&&>::value &&
+                                                 std::is_convertible<U, T>::value>::type>
+  Result(Result<U>&& other) noexcept {
+    if (ARROW_PREDICT_TRUE(other.status_.ok())) {
+      status_ = std::move(other.status_);
+      ConstructValue(other.MoveValueUnsafe());
+    } else {
+      // If we moved the status, the other status may become ok but the other
+      // value hasn't been constructed => crash on other destructor.
+      status_ = other.status_;
+    }
+  }
+
+  /// Move-assignment operator.
+  ///
+  /// Sets `other` to an invalid state..
+  ///
+  /// \param other The Result object to assign from and set to a non-OK
+  /// status.
+  Result& operator=(Result&& other) noexcept {
+    // Check for self-assignment.
+    if (ARROW_PREDICT_FALSE(this == &other)) {
+      return *this;
+    }
+    Destroy();
+    if (ARROW_PREDICT_TRUE(other.status_.ok())) {
+      status_ = std::move(other.status_);
+      ConstructValue(other.MoveValueUnsafe());
+    } else {
+      // If we moved the status, the other status may become ok but the other
+      // value hasn't been constructed => crash on other destructor.
+      status_ = other.status_;
+    }
+    return *this;
+  }
+
+  /// Compare to another Result.
+  bool Equals(const Result& other) const {
+    if (ARROW_PREDICT_TRUE(status_.ok())) {
+      return other.status_.ok() && ValueUnsafe() == other.ValueUnsafe();
+    }
+    return status_ == other.status_;
+  }
+
+  /// Indicates whether the object contains a `T` value.  Generally instead
+  /// of accessing this directly you will want to use ASSIGN_OR_RAISE defined
+  /// below.
+  ///
+  /// \return True if this Result object's status is OK (i.e. a call to ok()
+  /// returns true). If this function returns true, then it is safe to access
+  /// the wrapped element through a call to ValueOrDie().
+  constexpr bool ok() const { return status_.ok(); }
+
+  /// \brief Equivalent to ok().
+  // operator bool() const { return ok(); }
+
+  /// Gets the stored status object, or an OK status if a `T` value is stored.
+  ///
+  /// \return The stored non-OK status object, or an OK status if this object
+  ///         has a value.
+  constexpr const Status& status() const& { return status_; }
+
+  /// Gets the stored status object, or an OK status if a `T` value is stored.
+  ///
+  /// \return The stored non-OK status object, or an OK status if this object
+  ///         has a value.
+  Status status() && {
+    if (ARROW_PREDICT_TRUE(ok())) return Status::OK();
+    auto tmp = internal::UninitializedResult();
+    std::swap(status_, tmp);
+    return tmp;
+  }
+
+  /// Gets the stored `T` value.
+  ///
+  /// This method should only be called if this Result object's status is OK
+  /// (i.e. a call to ok() returns true), otherwise this call will abort.
+  ///
+  /// \return The stored `T` value.
+  const T& ValueOrDie() const& {
+    if (ARROW_PREDICT_FALSE(!ok())) {
+      internal::InvalidValueOrDie(status_);
+    }
+    return ValueUnsafe();
+  }
+  const T& operator*() const& { return ValueOrDie(); }
+  const T* operator->() const { return &ValueOrDie(); }
+
+  /// Gets a mutable reference to the stored `T` value.
+  ///
+  /// This method should only be called if this Result object's status is OK
+  /// (i.e. a call to ok() returns true), otherwise this call will abort.
+  ///
+  /// \return The stored `T` value.
+  T& ValueOrDie() & {
+    if (ARROW_PREDICT_FALSE(!ok())) {
+      internal::InvalidValueOrDie(status_);
+    }
+    return ValueUnsafe();
+  }
+  T& operator*() & { return ValueOrDie(); }
+  T* operator->() { return &ValueOrDie(); }
+
+  /// Moves and returns the internally-stored `T` value.
+  ///
+  /// This method should only be called if this Result object's status is OK
+  /// (i.e. a call to ok() returns true), otherwise this call will abort. The
+  /// Result object is invalidated after this call and will be updated to
+  /// contain a non-OK status.
+  ///
+  /// \return The stored `T` value.
+  T ValueOrDie() && {
+    if (ARROW_PREDICT_FALSE(!ok())) {
+      internal::InvalidValueOrDie(status_);
+    }
+    return MoveValueUnsafe();
+  }
+  T operator*() && { return std::move(*this).ValueOrDie(); }
+
+  /// Helper method for implementing Status returning functions in terms of semantically
+  /// equivalent Result returning functions. For example:
+  ///
+  /// Status GetInt(int *out) { return GetInt().Value(out); }
+  template <typename U, typename E = typename std::enable_if<
+                            std::is_constructible<U, T>::value>::type>
+  Status Value(U* out) && {
+    if (!ok()) {
+      return std::move(*this).status();
+    }
+    *out = U(MoveValueUnsafe());
+    return Status::OK();
+  }
+
+  /// Move and return the internally stored value or alternative if an error is stored.
+  T ValueOr(T alternative) && {
+    if (!ok()) {
+      return alternative;
+    }
+    return MoveValueUnsafe();
+  }
+
+  /// Return a copy of the internally stored value or alternative if an error is stored.
+  T ValueOr(T alternative) const& {
+    if (!ok()) {
+      return alternative;
+    }
+    return ValueUnsafe();
+  }
+
+  /// Retrieve the value if ok(), falling back to an alternative generated by the provided
+  /// factory
+  template <typename G>
+  T ValueOrElse(G&& generate_alternative) && {
+    if (ok()) {
+      return MoveValueUnsafe();
+    }
+    return std::forward<G>(generate_alternative)();
+  }
+
+  /// Apply a function to the internally stored value to produce a new result or propagate
+  /// the stored error.
+  template <typename M>
+  typename EnsureResult<decltype(std::declval<M&&>()(std::declval<T&&>()))>::type Map(
+      M&& m) && {
+    if (!ok()) {
+      return std::move(*this).status();
+    }
+    return std::forward<M>(m)(MoveValueUnsafe());
+  }
+
+  /// Apply a function to the internally stored value to produce a new result or propagate
+  /// the stored error.
+  template <typename M>
+  typename EnsureResult<decltype(std::declval<M&&>()(std::declval<const T&>()))>::type
+  Map(M&& m) const& {
+    if (!ok()) {
+      return status();
+    }
+    return std::forward<M>(m)(ValueUnsafe());
+  }
+
+  /// Cast the internally stored value to produce a new result or propagate the stored
+  /// error.
+  template <typename U, typename E = typename std::enable_if<
+                            std::is_constructible<U, T>::value>::type>
+  Result<U> As() && {
+    if (!ok()) {
+      return std::move(*this).status();
+    }
+    return U(MoveValueUnsafe());
+  }
+
+  /// Cast the internally stored value to produce a new result or propagate the stored
+  /// error.
+  template <typename U, typename E = typename std::enable_if<
+                            std::is_constructible<U, const T&>::value>::type>
+  Result<U> As() const& {
+    if (!ok()) {
+      return status();
+    }
+    return U(ValueUnsafe());
+  }
+
+  constexpr const T& ValueUnsafe() const& { return *storage_.get(); }
+
+  constexpr T& ValueUnsafe() & { return *storage_.get(); }
+
+  T ValueUnsafe() && { return MoveValueUnsafe(); }
+
+  T MoveValueUnsafe() { return std::move(*storage_.get()); }
+
+ private:
+  Status status_;  // pointer-sized
+  internal::AlignedStorage<T> storage_;
+
+  template <typename U>
+  void ConstructValue(U&& u) noexcept {
+    storage_.construct(std::forward<U>(u));
+  }
+
+  void Destroy() noexcept {
+    if (ARROW_PREDICT_TRUE(status_.ok())) {
+      static_assert(offsetof(Result<T>, status_) == 0,
+                    "Status is guaranteed to be at the start of Result<>");
+      storage_.destroy();
+    }
+  }
+};
+
+#define ARROW_ASSIGN_OR_RAISE_IMPL(result_name, lhs, rexpr)                              \
+  auto&& result_name = (rexpr);                                                          \
+  ARROW_RETURN_IF_(!(result_name).ok(), (result_name).status(), ARROW_STRINGIFY(rexpr)); \
+  lhs = std::move(result_name).ValueUnsafe();
+
+#define ARROW_ASSIGN_OR_RAISE_NAME(x, y) ARROW_CONCAT(x, y)
+
+/// \brief Execute an expression that returns a Result, extracting its value
+/// into the variable defined by `lhs` (or returning a Status on error).
+///
+/// Example: Assigning to a new value:
+///   ARROW_ASSIGN_OR_RAISE(auto value, MaybeGetValue(arg));
+///
+/// Example: Assigning to an existing value:
+///   ValueType value;
+///   ARROW_ASSIGN_OR_RAISE(value, MaybeGetValue(arg));
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE expands into multiple statements;
+/// it cannot be used in a single statement (e.g. as the body of an if
+/// statement without {})!
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE `std::move`s its right operand. If you have
+/// an lvalue Result which you *don't* want to move out of cast appropriately.
+///
+/// WARNING: ARROW_ASSIGN_OR_RAISE is not a single expression; it will not
+/// maintain lifetimes of all temporaries in `rexpr` (e.g.
+/// `ARROW_ASSIGN_OR_RAISE(auto x, MakeTemp().GetResultRef());`
+/// will most likely segfault)!
+#define ARROW_ASSIGN_OR_RAISE(lhs, rexpr)                                              \
+  ARROW_ASSIGN_OR_RAISE_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+                             lhs, rexpr);
+
+template <typename T>
+struct IntoStatus<Result<T>> {
+  static constexpr const Status& ToStatus(const Result<T>& res) { return res.status(); }
+  static inline Status ToStatus(Result<T>&& res) { return std::move(res).status(); }
+};
+
+template <typename T, typename R = typename EnsureResult<T>::type>
+R ToResult(T t) {
+  return R(std::move(t));
+}
+
+template <typename T>
+struct EnsureResult {
+  using type = Result<T>;
+};
+
+template <typename T>
+struct EnsureResult<Result<T>> {
+  using type = Result<T>;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/scalar.h b/pyarrow/include/arrow/scalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..b96d930a444cda27026994084ae8e8fd587f7972
--- /dev/null
+++ b/pyarrow/include/arrow/scalar.h
@@ -0,0 +1,1026 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Object model for scalar (non-Array) values. Not intended for use with large
+// amounts of data
+
+#pragma once
+
+#include <iosfwd>
+#include <memory>
+#include <ratio>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "arrow/compare.h"
+#include "arrow/extension_type.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/visibility.h"
+#include "arrow/visit_type_inline.h"
+
+namespace arrow {
+
+class Array;
+
+/// \brief Base class for scalar values
+///
+/// A Scalar represents a single value with a specific DataType.
+/// Scalars are useful for passing single value inputs to compute functions,
+/// or for representing individual array elements (with a non-trivial
+/// wrapping cost, though).
+struct ARROW_EXPORT Scalar : public std::enable_shared_from_this<Scalar>,
+                             public util::EqualityComparable<Scalar> {
+  virtual ~Scalar() = default;
+
+  /// \brief The type of the scalar value
+  std::shared_ptr<DataType> type;
+
+  /// \brief Whether the value is valid (not null) or not
+  bool is_valid = false;
+
+  bool Equals(const Scalar& other,
+              const EqualOptions& options = EqualOptions::Defaults()) const;
+
+  bool ApproxEquals(const Scalar& other,
+                    const EqualOptions& options = EqualOptions::Defaults()) const;
+
+  struct ARROW_EXPORT Hash {
+    size_t operator()(const Scalar& scalar) const { return scalar.hash(); }
+
+    size_t operator()(const std::shared_ptr<Scalar>& scalar) const {
+      return scalar->hash();
+    }
+  };
+
+  size_t hash() const;
+
+  std::string ToString() const;
+
+  /// \brief Perform cheap validation checks
+  ///
+  /// This is O(k) where k is the number of descendents.
+  ///
+  /// \return Status
+  Status Validate() const;
+
+  /// \brief Perform extensive data validation checks
+  ///
+  /// This is potentially O(k*n) where k is the number of descendents and n
+  /// is the length of descendents (if list scalars are involved).
+  ///
+  /// \return Status
+  Status ValidateFull() const;
+
+  static Result<std::shared_ptr<Scalar>> Parse(const std::shared_ptr<DataType>& type,
+                                               std::string_view repr);
+
+  // TODO(bkietz) add compute::CastOptions
+  Result<std::shared_ptr<Scalar>> CastTo(std::shared_ptr<DataType> to) const;
+
+  /// \brief Apply the ScalarVisitor::Visit() method specialized to the scalar type
+  Status Accept(ScalarVisitor* visitor) const;
+
+  /// \brief EXPERIMENTAL Enable obtaining shared_ptr<Scalar> from a const
+  /// Scalar& context.
+  std::shared_ptr<Scalar> GetSharedPtr() const {
+    return const_cast<Scalar*>(this)->shared_from_this();
+  }
+
+ protected:
+  Scalar(std::shared_ptr<DataType> type, bool is_valid)
+      : type(std::move(type)), is_valid(is_valid) {}
+};
+
+ARROW_EXPORT void PrintTo(const Scalar& scalar, std::ostream* os);
+
+/// \defgroup concrete-scalar-classes Concrete Scalar subclasses
+///
+/// @{
+
+/// \brief A scalar value for NullType. Never valid
+struct ARROW_EXPORT NullScalar : public Scalar {
+ public:
+  using TypeClass = NullType;
+
+  NullScalar() : Scalar{null(), false} {}
+};
+
+/// @}
+
+namespace internal {
+
+constexpr auto kScalarScratchSpaceSize = sizeof(int64_t) * 2;
+
+template <typename Impl>
+struct ArraySpanFillFromScalarScratchSpace {
+  //  16 bytes of scratch space to enable ArraySpan to be a view onto any
+  //  Scalar- including binary scalars where we need to create a buffer
+  //  that looks like two 32-bit or 64-bit offsets.
+  alignas(int64_t) mutable uint8_t scratch_space_[kScalarScratchSpaceSize];
+
+ private:
+  template <typename... Args>
+  explicit ArraySpanFillFromScalarScratchSpace(Args&&... args) {
+    Impl::FillScratchSpace(scratch_space_, std::forward<Args>(args)...);
+  }
+
+  ArraySpanFillFromScalarScratchSpace() = delete;
+
+  friend Impl;
+};
+
+struct ARROW_EXPORT PrimitiveScalarBase : public Scalar {
+  explicit PrimitiveScalarBase(std::shared_ptr<DataType> type)
+      : Scalar(std::move(type), false) {}
+
+  using Scalar::Scalar;
+  /// \brief Get a const pointer to the value of this scalar. May be null.
+  virtual const void* data() const = 0;
+  /// \brief Get an immutable view of the value of this scalar as bytes.
+  virtual std::string_view view() const = 0;
+};
+
+template <typename T, typename CType = typename T::c_type>
+struct PrimitiveScalar : public PrimitiveScalarBase {
+  using PrimitiveScalarBase::PrimitiveScalarBase;
+  using TypeClass = T;
+  using ValueType = CType;
+
+  // Non-null constructor.
+  PrimitiveScalar(ValueType value, std::shared_ptr<DataType> type)
+      : PrimitiveScalarBase(std::move(type), true), value(value) {}
+
+  explicit PrimitiveScalar(std::shared_ptr<DataType> type)
+      : PrimitiveScalarBase(std::move(type), false) {}
+
+  ValueType value{};
+
+  const void* data() const override { return &value; }
+  std::string_view view() const override {
+    return std::string_view(reinterpret_cast<const char*>(&value), sizeof(ValueType));
+  };
+};
+
+}  // namespace internal
+
+/// \addtogroup concrete-scalar-classes Concrete Scalar subclasses
+///
+/// @{
+
+struct ARROW_EXPORT BooleanScalar : public internal::PrimitiveScalar<BooleanType, bool> {
+  using Base = internal::PrimitiveScalar<BooleanType, bool>;
+  using Base::Base;
+
+  explicit BooleanScalar(bool value) : Base(value, boolean()) {}
+
+  BooleanScalar() : Base(boolean()) {}
+};
+
+template <typename T>
+struct NumericScalar : public internal::PrimitiveScalar<T> {
+  using Base = typename internal::PrimitiveScalar<T>;
+  using Base::Base;
+  using TypeClass = typename Base::TypeClass;
+  using ValueType = typename Base::ValueType;
+
+  explicit NumericScalar(ValueType value)
+      : Base(value, TypeTraits<T>::type_singleton()) {}
+
+  NumericScalar() : Base(TypeTraits<T>::type_singleton()) {}
+};
+
+struct ARROW_EXPORT Int8Scalar : public NumericScalar<Int8Type> {
+  using NumericScalar<Int8Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT Int16Scalar : public NumericScalar<Int16Type> {
+  using NumericScalar<Int16Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT Int32Scalar : public NumericScalar<Int32Type> {
+  using NumericScalar<Int32Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT Int64Scalar : public NumericScalar<Int64Type> {
+  using NumericScalar<Int64Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt8Scalar : public NumericScalar<UInt8Type> {
+  using NumericScalar<UInt8Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt16Scalar : public NumericScalar<UInt16Type> {
+  using NumericScalar<UInt16Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt32Scalar : public NumericScalar<UInt32Type> {
+  using NumericScalar<UInt32Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT UInt64Scalar : public NumericScalar<UInt64Type> {
+  using NumericScalar<UInt64Type>::NumericScalar;
+};
+
+struct ARROW_EXPORT HalfFloatScalar : public NumericScalar<HalfFloatType> {
+  using NumericScalar<HalfFloatType>::NumericScalar;
+
+  explicit HalfFloatScalar(util::Float16 value)
+      : NumericScalar(value.bits(), float16()) {}
+
+  HalfFloatScalar(util::Float16 value, std::shared_ptr<DataType> type)
+      : NumericScalar(value.bits(), std::move(type)) {}
+};
+
+struct ARROW_EXPORT FloatScalar : public NumericScalar<FloatType> {
+  using NumericScalar<FloatType>::NumericScalar;
+};
+
+struct ARROW_EXPORT DoubleScalar : public NumericScalar<DoubleType> {
+  using NumericScalar<DoubleType>::NumericScalar;
+};
+
+struct ARROW_EXPORT BaseBinaryScalar : public internal::PrimitiveScalarBase {
+  using ValueType = std::shared_ptr<Buffer>;
+
+  // The value is not supposed to be modified after construction, because subclasses have
+  // a scratch space whose content need to be kept consistent with the value. It is also
+  // the user of this class's responsibility to ensure that the buffer is not written to
+  // accidentally.
+  const std::shared_ptr<Buffer> value = NULLPTR;
+
+  const void* data() const override {
+    return value ? reinterpret_cast<const void*>(value->data()) : NULLPTR;
+  }
+  std::string_view view() const override {
+    return value ? std::string_view(*value) : std::string_view();
+  }
+
+  explicit BaseBinaryScalar(std::shared_ptr<DataType> type)
+      : internal::PrimitiveScalarBase(std::move(type)) {}
+
+  BaseBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+      : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {}
+
+  BaseBinaryScalar(std::string s, std::shared_ptr<DataType> type);
+};
+
+struct ARROW_EXPORT BinaryScalar
+    : public BaseBinaryScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<BinaryScalar> {
+  using TypeClass = BinaryType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<BinaryScalar>;
+
+  explicit BinaryScalar(std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  BinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(value), std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  BinaryScalar(std::string s, std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(s), std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit BinaryScalar(std::shared_ptr<Buffer> value)
+      : BinaryScalar(std::move(value), binary()) {}
+
+  explicit BinaryScalar(std::string s) : BinaryScalar(std::move(s), binary()) {}
+
+  BinaryScalar() : BinaryScalar(binary()) {}
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Buffer>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT StringScalar : public BinaryScalar {
+  using BinaryScalar::BinaryScalar;
+  using TypeClass = StringType;
+
+  explicit StringScalar(std::shared_ptr<Buffer> value)
+      : StringScalar(std::move(value), utf8()) {}
+
+  explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {}
+
+  StringScalar() : StringScalar(utf8()) {}
+};
+
+struct ARROW_EXPORT BinaryViewScalar
+    : public BaseBinaryScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<BinaryViewScalar> {
+  using TypeClass = BinaryViewType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<BinaryViewScalar>;
+
+  explicit BinaryViewScalar(std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  BinaryViewScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(value), std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  BinaryViewScalar(std::string s, std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(s), std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit BinaryViewScalar(std::shared_ptr<Buffer> value)
+      : BinaryViewScalar(std::move(value), binary_view()) {}
+
+  explicit BinaryViewScalar(std::string s)
+      : BinaryViewScalar(std::move(s), binary_view()) {}
+
+  BinaryViewScalar() : BinaryViewScalar(binary_view()) {}
+
+  std::string_view view() const override { return std::string_view(*this->value); }
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Buffer>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar {
+  using BinaryViewScalar::BinaryViewScalar;
+  using TypeClass = StringViewType;
+
+  explicit StringViewScalar(std::shared_ptr<Buffer> value)
+      : StringViewScalar(std::move(value), utf8_view()) {}
+
+  explicit StringViewScalar(std::string s)
+      : BinaryViewScalar(std::move(s), utf8_view()) {}
+
+  StringViewScalar() : StringViewScalar(utf8_view()) {}
+};
+
+struct ARROW_EXPORT LargeBinaryScalar
+    : public BaseBinaryScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<LargeBinaryScalar> {
+  using TypeClass = LargeBinaryType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<LargeBinaryScalar>;
+
+  explicit LargeBinaryScalar(std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  LargeBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(value), std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  LargeBinaryScalar(std::string s, std::shared_ptr<DataType> type)
+      : BaseBinaryScalar(std::move(s), std::move(type)),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit LargeBinaryScalar(std::shared_ptr<Buffer> value)
+      : LargeBinaryScalar(std::move(value), large_binary()) {}
+
+  explicit LargeBinaryScalar(std::string s)
+      : LargeBinaryScalar(std::move(s), large_binary()) {}
+
+  LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {}
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Buffer>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar {
+  using LargeBinaryScalar::LargeBinaryScalar;
+  using TypeClass = LargeStringType;
+
+  explicit LargeStringScalar(std::shared_ptr<Buffer> value)
+      : LargeStringScalar(std::move(value), large_utf8()) {}
+
+  explicit LargeStringScalar(std::string s)
+      : LargeBinaryScalar(std::move(s), large_utf8()) {}
+
+  LargeStringScalar() : LargeStringScalar(large_utf8()) {}
+};
+
+struct ARROW_EXPORT FixedSizeBinaryScalar : public BinaryScalar {
+  using TypeClass = FixedSizeBinaryType;
+
+  FixedSizeBinaryScalar(std::shared_ptr<Buffer> value, std::shared_ptr<DataType> type,
+                        bool is_valid = true);
+
+  explicit FixedSizeBinaryScalar(const std::shared_ptr<Buffer>& value,
+                                 bool is_valid = true);
+
+  explicit FixedSizeBinaryScalar(std::string s, bool is_valid = true);
+};
+
+template <typename T>
+struct TemporalScalar : internal::PrimitiveScalar<T> {
+  using internal::PrimitiveScalar<T>::PrimitiveScalar;
+  using ValueType = typename internal::PrimitiveScalar<T>::ValueType;
+
+  TemporalScalar(ValueType value, std::shared_ptr<DataType> type)
+      : internal::PrimitiveScalar<T>(std::move(value), type) {}
+};
+
+template <typename T>
+struct DateScalar : public TemporalScalar<T> {
+  using TemporalScalar<T>::TemporalScalar;
+  using ValueType = typename TemporalScalar<T>::ValueType;
+
+  explicit DateScalar(ValueType value)
+      : TemporalScalar<T>(std::move(value), TypeTraits<T>::type_singleton()) {}
+  DateScalar() : TemporalScalar<T>(TypeTraits<T>::type_singleton()) {}
+};
+
+struct ARROW_EXPORT Date32Scalar : public DateScalar<Date32Type> {
+  using DateScalar<Date32Type>::DateScalar;
+};
+
+struct ARROW_EXPORT Date64Scalar : public DateScalar<Date64Type> {
+  using DateScalar<Date64Type>::DateScalar;
+};
+
+template <typename T>
+struct TimeScalar : public TemporalScalar<T> {
+  using TemporalScalar<T>::TemporalScalar;
+
+  TimeScalar(typename TemporalScalar<T>::ValueType value, TimeUnit::type unit)
+      : TimeScalar(std::move(value), std::make_shared<T>(unit)) {}
+};
+
+struct ARROW_EXPORT Time32Scalar : public TimeScalar<Time32Type> {
+  using TimeScalar<Time32Type>::TimeScalar;
+};
+
+struct ARROW_EXPORT Time64Scalar : public TimeScalar<Time64Type> {
+  using TimeScalar<Time64Type>::TimeScalar;
+};
+
+struct ARROW_EXPORT TimestampScalar : public TemporalScalar<TimestampType> {
+  using TemporalScalar<TimestampType>::TemporalScalar;
+
+  TimestampScalar(typename TemporalScalar<TimestampType>::ValueType value,
+                  TimeUnit::type unit, std::string tz = "")
+      : TimestampScalar(std::move(value), timestamp(unit, std::move(tz))) {}
+
+  static Result<TimestampScalar> FromISO8601(std::string_view iso8601,
+                                             TimeUnit::type unit);
+};
+
+template <typename T>
+struct IntervalScalar : public TemporalScalar<T> {
+  using TemporalScalar<T>::TemporalScalar;
+  using ValueType = typename TemporalScalar<T>::ValueType;
+
+  explicit IntervalScalar(ValueType value)
+      : TemporalScalar<T>(value, TypeTraits<T>::type_singleton()) {}
+  IntervalScalar() : TemporalScalar<T>(TypeTraits<T>::type_singleton()) {}
+};
+
+struct ARROW_EXPORT MonthIntervalScalar : public IntervalScalar<MonthIntervalType> {
+  using IntervalScalar<MonthIntervalType>::IntervalScalar;
+};
+
+struct ARROW_EXPORT DayTimeIntervalScalar : public IntervalScalar<DayTimeIntervalType> {
+  using IntervalScalar<DayTimeIntervalType>::IntervalScalar;
+};
+
+struct ARROW_EXPORT MonthDayNanoIntervalScalar
+    : public IntervalScalar<MonthDayNanoIntervalType> {
+  using IntervalScalar<MonthDayNanoIntervalType>::IntervalScalar;
+};
+
+struct ARROW_EXPORT DurationScalar : public TemporalScalar<DurationType> {
+  using TemporalScalar<DurationType>::TemporalScalar;
+
+  DurationScalar(typename TemporalScalar<DurationType>::ValueType value,
+                 TimeUnit::type unit)
+      : DurationScalar(std::move(value), duration(unit)) {}
+
+  // Convenience constructors for a DurationScalar from std::chrono::nanoseconds
+  template <template <typename, typename> class StdDuration, typename Rep>
+  explicit DurationScalar(StdDuration<Rep, std::nano> d)
+      : DurationScalar{DurationScalar(d.count(), duration(TimeUnit::NANO))} {}
+
+  // Convenience constructors for a DurationScalar from std::chrono::microseconds
+  template <template <typename, typename> class StdDuration, typename Rep>
+  explicit DurationScalar(StdDuration<Rep, std::micro> d)
+      : DurationScalar{DurationScalar(d.count(), duration(TimeUnit::MICRO))} {}
+
+  // Convenience constructors for a DurationScalar from std::chrono::milliseconds
+  template <template <typename, typename> class StdDuration, typename Rep>
+  explicit DurationScalar(StdDuration<Rep, std::milli> d)
+      : DurationScalar{DurationScalar(d.count(), duration(TimeUnit::MILLI))} {}
+
+  // Convenience constructors for a DurationScalar from std::chrono::seconds
+  // or from units which are whole numbers of seconds
+  template <template <typename, typename> class StdDuration, typename Rep, intmax_t Num>
+  explicit DurationScalar(StdDuration<Rep, std::ratio<Num, 1>> d)
+      : DurationScalar{DurationScalar(d.count() * Num, duration(TimeUnit::SECOND))} {}
+};
+
+template <typename TYPE_CLASS, typename VALUE_TYPE>
+struct DecimalScalar : public internal::PrimitiveScalarBase {
+  using internal::PrimitiveScalarBase::PrimitiveScalarBase;
+  using TypeClass = TYPE_CLASS;
+  using ValueType = VALUE_TYPE;
+
+  DecimalScalar(ValueType value, std::shared_ptr<DataType> type)
+      : internal::PrimitiveScalarBase(std::move(type), true), value(value) {}
+
+  const void* data() const override {
+    return reinterpret_cast<const void*>(value.native_endian_bytes());
+  }
+
+  std::string_view view() const override {
+    return std::string_view(reinterpret_cast<const char*>(value.native_endian_bytes()),
+                            ValueType::kByteWidth);
+  }
+
+  ValueType value;
+};
+
+struct ARROW_EXPORT Decimal32Scalar : public DecimalScalar<Decimal32Type, Decimal32> {
+  using DecimalScalar::DecimalScalar;
+};
+
+struct ARROW_EXPORT Decimal64Scalar : public DecimalScalar<Decimal64Type, Decimal64> {
+  using DecimalScalar::DecimalScalar;
+};
+
+struct ARROW_EXPORT Decimal128Scalar : public DecimalScalar<Decimal128Type, Decimal128> {
+  using DecimalScalar::DecimalScalar;
+};
+
+struct ARROW_EXPORT Decimal256Scalar : public DecimalScalar<Decimal256Type, Decimal256> {
+  using DecimalScalar::DecimalScalar;
+};
+
+struct ARROW_EXPORT BaseListScalar : public Scalar {
+  using ValueType = std::shared_ptr<Array>;
+
+  BaseListScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+                 bool is_valid = true);
+
+  // The value is not supposed to be modified after construction, because subclasses have
+  // a scratch space whose content need to be kept consistent with the value. It is also
+  // the user of this class's responsibility to ensure that the array is not modified
+  // accidentally.
+  const std::shared_ptr<Array> value;
+};
+
+struct ARROW_EXPORT ListScalar
+    : public BaseListScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<ListScalar> {
+  using TypeClass = ListType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<ListScalar>;
+
+  ListScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+             bool is_valid = true)
+      : BaseListScalar(std::move(value), std::move(type), is_valid),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit ListScalar(std::shared_ptr<Array> value, bool is_valid = true);
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Array>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT LargeListScalar
+    : public BaseListScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<LargeListScalar> {
+  using TypeClass = LargeListType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<LargeListScalar>;
+
+  LargeListScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+                  bool is_valid = true)
+      : BaseListScalar(std::move(value), std::move(type), is_valid),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit LargeListScalar(std::shared_ptr<Array> value, bool is_valid = true);
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Array>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT ListViewScalar
+    : public BaseListScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<ListViewScalar> {
+  using TypeClass = ListViewType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<ListViewScalar>;
+
+  ListViewScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+                 bool is_valid = true)
+      : BaseListScalar(std::move(value), std::move(type), is_valid),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit ListViewScalar(std::shared_ptr<Array> value, bool is_valid = true);
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Array>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT LargeListViewScalar
+    : public BaseListScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<LargeListViewScalar> {
+  using TypeClass = LargeListViewType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<LargeListViewScalar>;
+
+  LargeListViewScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+                      bool is_valid = true)
+      : BaseListScalar(std::move(value), std::move(type), is_valid),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit LargeListViewScalar(std::shared_ptr<Array> value, bool is_valid = true);
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Array>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT MapScalar
+    : public BaseListScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<MapScalar> {
+  using TypeClass = MapType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<MapScalar>;
+
+  MapScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+            bool is_valid = true)
+      : BaseListScalar(std::move(value), std::move(type), is_valid),
+        ArraySpanFillFromScalarScratchSpace(this->value) {}
+
+  explicit MapScalar(std::shared_ptr<Array> value, bool is_valid = true);
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space,
+                               const std::shared_ptr<Array>& value);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT FixedSizeListScalar : public BaseListScalar {
+  using TypeClass = FixedSizeListType;
+
+  FixedSizeListScalar(std::shared_ptr<Array> value, std::shared_ptr<DataType> type,
+                      bool is_valid = true);
+
+  explicit FixedSizeListScalar(std::shared_ptr<Array> value, bool is_valid = true);
+};
+
+struct ARROW_EXPORT StructScalar : public Scalar {
+  using TypeClass = StructType;
+  using ValueType = std::vector<std::shared_ptr<Scalar>>;
+
+  ScalarVector value;
+
+  Result<std::shared_ptr<Scalar>> field(FieldRef ref) const;
+
+  StructScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true)
+      : Scalar(std::move(type), is_valid), value(std::move(value)) {}
+
+  static Result<std::shared_ptr<StructScalar>> Make(ValueType value,
+                                                    std::vector<std::string> field_names);
+};
+
+struct ARROW_EXPORT UnionScalar : public Scalar {
+  // The type code is not supposed to be modified after construction, because the scratch
+  // space's content need to be kept consistent with it.
+  const int8_t type_code;
+
+  virtual const std::shared_ptr<Scalar>& child_value() const = 0;
+
+ protected:
+  UnionScalar(std::shared_ptr<DataType> type, int8_t type_code, bool is_valid)
+      : Scalar(std::move(type), is_valid), type_code(type_code) {}
+
+  struct UnionScratchSpace {
+    alignas(int64_t) int8_t type_code;
+    alignas(int64_t) uint8_t offsets[sizeof(int32_t) * 2];
+  };
+  static_assert(sizeof(UnionScratchSpace) <= internal::kScalarScratchSpaceSize);
+
+  friend ArraySpan;
+};
+
+struct ARROW_EXPORT SparseUnionScalar
+    : public UnionScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<SparseUnionScalar> {
+  using TypeClass = SparseUnionType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<SparseUnionScalar>;
+
+  // Even though only one of the union values is relevant for this scalar, we
+  // nonetheless construct a vector of scalars, one per union value, to have
+  // enough data to reconstruct a valid ArraySpan of length 1 from this scalar
+  using ValueType = std::vector<std::shared_ptr<Scalar>>;
+  // The value is not supposed to be modified after construction, because the scratch
+  // space's content need to be kept consistent with the value. It is also the user of
+  // this class's responsibility to ensure that the scalars of the vector is not modified
+  // to accidentally.
+  const ValueType value;
+
+  // The value index corresponding to the active type code
+  int child_id;
+
+  SparseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr<DataType> type);
+
+  const std::shared_ptr<Scalar>& child_value() const override {
+    return this->value[this->child_id];
+  }
+
+  /// \brief Construct a SparseUnionScalar from a single value, versus having
+  /// to construct a vector of scalars
+  static std::shared_ptr<Scalar> FromValue(std::shared_ptr<Scalar> value, int field_index,
+                                           std::shared_ptr<DataType> type);
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT DenseUnionScalar
+    : public UnionScalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<DenseUnionScalar> {
+  using TypeClass = DenseUnionType;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<DenseUnionScalar>;
+
+  // For DenseUnionScalar, we can make a valid ArraySpan of length 1 from this
+  // scalar
+  using ValueType = std::shared_ptr<Scalar>;
+  // The value is not supposed to be modified after construction, because the scratch
+  // space's content need to be kept consistent with the value. It is also the user of
+  // this class's responsibility to ensure that the elements of the vector is not modified
+  // accidentally.
+  const ValueType value;
+
+  const std::shared_ptr<Scalar>& child_value() const override { return this->value; }
+
+  DenseUnionScalar(ValueType value, int8_t type_code, std::shared_ptr<DataType> type)
+      : UnionScalar(std::move(type), type_code, value->is_valid),
+        ArraySpanFillFromScalarScratchSpace(type_code),
+        value(std::move(value)) {}
+
+ private:
+  static void FillScratchSpace(uint8_t* scratch_space, int8_t type_code);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+struct ARROW_EXPORT RunEndEncodedScalar
+    : public Scalar,
+      private internal::ArraySpanFillFromScalarScratchSpace<RunEndEncodedScalar> {
+  using TypeClass = RunEndEncodedType;
+  using ValueType = std::shared_ptr<Scalar>;
+  using ArraySpanFillFromScalarScratchSpace =
+      internal::ArraySpanFillFromScalarScratchSpace<RunEndEncodedScalar>;
+
+  // The value is not supposed to be modified after construction, because the scratch
+  // space's content need to be kept consistent with the value. It is also the user of
+  // this class's responsibility to ensure that the wrapped scalar is not modified
+  // accidentally.
+  const ValueType value;
+
+  RunEndEncodedScalar(std::shared_ptr<Scalar> value, std::shared_ptr<DataType> type);
+
+  /// \brief Constructs a NULL RunEndEncodedScalar
+  explicit RunEndEncodedScalar(const std::shared_ptr<DataType>& type);
+
+  ~RunEndEncodedScalar() override;
+
+  const std::shared_ptr<DataType>& run_end_type() const {
+    return ree_type().run_end_type();
+  }
+
+  const std::shared_ptr<DataType>& value_type() const { return ree_type().value_type(); }
+
+ private:
+  const TypeClass& ree_type() const { return internal::checked_cast<TypeClass&>(*type); }
+
+  static void FillScratchSpace(uint8_t* scratch_space, const DataType& type);
+
+  friend ArraySpan;
+  friend ArraySpanFillFromScalarScratchSpace;
+};
+
+/// \brief A Scalar value for DictionaryType
+///
+/// `is_valid` denotes the validity of the `index`, regardless of
+/// the corresponding value in the `dictionary`.
+struct ARROW_EXPORT DictionaryScalar : public internal::PrimitiveScalarBase {
+  using TypeClass = DictionaryType;
+  struct ValueType {
+    std::shared_ptr<Scalar> index;
+    std::shared_ptr<Array> dictionary;
+  } value;
+
+  explicit DictionaryScalar(std::shared_ptr<DataType> type);
+
+  DictionaryScalar(ValueType value, std::shared_ptr<DataType> type, bool is_valid = true)
+      : internal::PrimitiveScalarBase(std::move(type), is_valid),
+        value(std::move(value)) {}
+
+  static std::shared_ptr<DictionaryScalar> Make(std::shared_ptr<Scalar> index,
+                                                std::shared_ptr<Array> dict);
+
+  Result<std::shared_ptr<Scalar>> GetEncodedValue() const;
+
+  const void* data() const override {
+    return internal::checked_cast<internal::PrimitiveScalarBase&>(*value.index).data();
+  }
+  std::string_view view() const override {
+    return internal::checked_cast<const internal::PrimitiveScalarBase&>(*value.index)
+        .view();
+  }
+};
+
+/// \brief A Scalar value for ExtensionType
+///
+/// The value is the underlying storage scalar.
+/// `is_valid` must only be true if `value` is non-null and `value->is_valid` is true
+struct ARROW_EXPORT ExtensionScalar : public Scalar {
+  using TypeClass = ExtensionType;
+  using ValueType = std::shared_ptr<Scalar>;
+
+  ExtensionScalar(std::shared_ptr<Scalar> storage, std::shared_ptr<DataType> type,
+                  bool is_valid = true)
+      : Scalar(std::move(type), is_valid), value(std::move(storage)) {}
+
+  template <typename Storage,
+            typename = enable_if_t<std::is_base_of<Scalar, Storage>::value>>
+  ExtensionScalar(Storage&& storage, std::shared_ptr<DataType> type, bool is_valid = true)
+      : ExtensionScalar(std::make_shared<Storage>(std::move(storage)), std::move(type),
+                        is_valid) {}
+
+  std::shared_ptr<Scalar> value;
+};
+
+/// @}
+
+namespace internal {
+
+inline Status CheckBufferLength(...) { return Status::OK(); }
+
+ARROW_EXPORT Status CheckBufferLength(const FixedSizeBinaryType* t,
+                                      const std::shared_ptr<Buffer>* b);
+
+}  // namespace internal
+
+template <typename ValueRef>
+struct MakeScalarImpl;
+
+/// \defgroup scalar-factories Scalar factory functions
+///
+/// @{
+
+/// \brief Scalar factory for null scalars
+ARROW_EXPORT
+std::shared_ptr<Scalar> MakeNullScalar(std::shared_ptr<DataType> type);
+
+/// \brief Scalar factory for non-null scalars
+template <typename Value>
+Result<std::shared_ptr<Scalar>> MakeScalar(std::shared_ptr<DataType> type,
+                                           Value&& value) {
+  return MakeScalarImpl<Value&&>{type, std::forward<Value>(value), NULLPTR}.Finish();
+}
+
+/// \brief Type-inferring scalar factory for non-null scalars
+///
+/// Construct a Scalar instance with a DataType determined by the input C++ type.
+/// (for example Int8Scalar for a int8_t input).
+/// Only non-parametric primitive types and String are supported.
+template <typename Value, typename Traits = CTypeTraits<typename std::decay<Value>::type>,
+          typename ScalarType = typename Traits::ScalarType,
+          typename Enable = decltype(ScalarType(std::declval<Value>(),
+                                                Traits::type_singleton()))>
+std::shared_ptr<Scalar> MakeScalar(Value value) {
+  return std::make_shared<ScalarType>(std::move(value), Traits::type_singleton());
+}
+
+inline std::shared_ptr<Scalar> MakeScalar(std::string value) {
+  return std::make_shared<StringScalar>(std::move(value));
+}
+
+inline std::shared_ptr<Scalar> MakeScalar(const std::shared_ptr<Scalar>& scalar) {
+  return scalar;
+}
+/// @}
+
+template <typename ValueRef>
+struct MakeScalarImpl {
+  template <typename T, typename ScalarType = typename TypeTraits<T>::ScalarType,
+            typename ValueType = typename ScalarType::ValueType,
+            typename Enable = typename std::enable_if<
+                std::is_constructible<ScalarType, ValueType,
+                                      std::shared_ptr<DataType>>::value &&
+                std::is_convertible<ValueRef, ValueType>::value>::type>
+  Status Visit(const T& t) {
+    ARROW_RETURN_NOT_OK(internal::CheckBufferLength(&t, &value_));
+    // `static_cast<ValueRef>` makes a rvalue if ValueRef is `ValueType&&`
+    out_ = std::make_shared<ScalarType>(
+        static_cast<ValueType>(static_cast<ValueRef>(value_)), std::move(type_));
+    return Status::OK();
+  }
+
+  // This isn't captured by the generic case above because `util::Float16` isn't implicity
+  // convertible to `uint16_t` (HalfFloat's ValueType)
+  template <typename T>
+  std::enable_if_t<std::is_same_v<std::decay_t<ValueRef>, util::Float16> &&
+                       is_half_float_type<T>::value,
+                   Status>
+  Visit(const T& t) {
+    out_ = std::make_shared<HalfFloatScalar>(static_cast<ValueRef>(value_),
+                                             std::move(type_));
+    return Status::OK();
+  }
+
+  Status Visit(const ExtensionType& t) {
+    ARROW_ASSIGN_OR_RAISE(auto storage,
+                          MakeScalar(t.storage_type(), static_cast<ValueRef>(value_)));
+    out_ = std::make_shared<ExtensionScalar>(std::move(storage), type_);
+    return Status::OK();
+  }
+
+  // Enable constructing string/binary scalars (but not decimal, etc) from std::string
+  template <typename T>
+  enable_if_t<
+      std::is_same<typename std::remove_reference<ValueRef>::type, std::string>::value &&
+          (is_base_binary_type<T>::value || std::is_same<T, FixedSizeBinaryType>::value),
+      Status>
+  Visit(const T& t) {
+    using ScalarType = typename TypeTraits<T>::ScalarType;
+    out_ = std::make_shared<ScalarType>(Buffer::FromString(std::move(value_)),
+                                        std::move(type_));
+    return Status::OK();
+  }
+
+  Status Visit(const DataType& t) {
+    return Status::NotImplemented("constructing scalars of type ", t,
+                                  " from unboxed values");
+  }
+
+  Result<std::shared_ptr<Scalar>> Finish() && {
+    ARROW_RETURN_NOT_OK(VisitTypeInline(*type_, this));
+    return std::move(out_);
+  }
+
+  std::shared_ptr<DataType> type_;
+  ValueRef value_;
+  std::shared_ptr<Scalar> out_;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/sparse_tensor.h b/pyarrow/include/arrow/sparse_tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..5faae16bb25cc76b247bd9dea25cd36c33becc3d
--- /dev/null
+++ b/pyarrow/include/arrow/sparse_tensor.h
@@ -0,0 +1,617 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/tensor.h"  // IWYU pragma: export
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+namespace internal {
+
+ARROW_EXPORT
+Status CheckSparseIndexMaximumValue(const std::shared_ptr<DataType>& index_value_type,
+                                    const std::vector<int64_t>& shape);
+
+}  // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseIndex class
+
+struct SparseTensorFormat {
+  /// EXPERIMENTAL: The index format type of SparseTensor
+  enum type {
+    /// Coordinate list (COO) format.
+    COO,
+    /// Compressed sparse row (CSR) format.
+    CSR,
+    /// Compressed sparse column (CSC) format.
+    CSC,
+    /// Compressed sparse fiber (CSF) format.
+    CSF
+  };
+};
+
+/// \brief EXPERIMENTAL: The base class for the index of a sparse tensor
+///
+/// SparseIndex describes where the non-zero elements are within a SparseTensor.
+///
+/// There are several ways to represent this.  The format_id is used to
+/// distinguish what kind of representation is used.  Each possible value of
+/// format_id must have only one corresponding concrete subclass of SparseIndex.
+class ARROW_EXPORT SparseIndex {
+ public:
+  explicit SparseIndex(SparseTensorFormat::type format_id) : format_id_(format_id) {}
+
+  virtual ~SparseIndex() = default;
+
+  /// \brief Return the identifier of the format type
+  SparseTensorFormat::type format_id() const { return format_id_; }
+
+  /// \brief Return the number of non zero values in the sparse tensor related
+  /// to this sparse index
+  virtual int64_t non_zero_length() const = 0;
+
+  /// \brief Return the string representation of the sparse index
+  virtual std::string ToString() const = 0;
+
+  virtual Status ValidateShape(const std::vector<int64_t>& shape) const;
+
+ protected:
+  const SparseTensorFormat::type format_id_;
+};
+
+namespace internal {
+template <typename SparseIndexType>
+class SparseIndexBase : public SparseIndex {
+ public:
+  SparseIndexBase() : SparseIndex(SparseIndexType::format_id) {}
+};
+}  // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCOOIndex class
+
+/// \brief EXPERIMENTAL: The index data for a COO sparse tensor
+///
+/// A COO sparse index manages the location of its non-zero values by their
+/// coordinates.
+class ARROW_EXPORT SparseCOOIndex : public internal::SparseIndexBase<SparseCOOIndex> {
+ public:
+  static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::COO;
+
+  /// \brief Make SparseCOOIndex from a coords tensor and canonicality
+  static Result<std::shared_ptr<SparseCOOIndex>> Make(
+      const std::shared_ptr<Tensor>& coords, bool is_canonical);
+
+  /// \brief Make SparseCOOIndex from a coords tensor with canonicality auto-detection
+  static Result<std::shared_ptr<SparseCOOIndex>> Make(
+      const std::shared_ptr<Tensor>& coords);
+
+  /// \brief Make SparseCOOIndex from raw properties with canonicality auto-detection
+  static Result<std::shared_ptr<SparseCOOIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shape,
+      const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data);
+
+  /// \brief Make SparseCOOIndex from raw properties
+  static Result<std::shared_ptr<SparseCOOIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shape,
+      const std::vector<int64_t>& indices_strides, std::shared_ptr<Buffer> indices_data,
+      bool is_canonical);
+
+  /// \brief Make SparseCOOIndex from sparse tensor's shape properties and data
+  /// with canonicality auto-detection
+  ///
+  /// The indices_data should be in row-major (C-like) order.  If not,
+  /// use the raw properties constructor.
+  static Result<std::shared_ptr<SparseCOOIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+      int64_t non_zero_length, std::shared_ptr<Buffer> indices_data);
+
+  /// \brief Make SparseCOOIndex from sparse tensor's shape properties and data
+  ///
+  /// The indices_data should be in row-major (C-like) order.  If not,
+  /// use the raw properties constructor.
+  static Result<std::shared_ptr<SparseCOOIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+      int64_t non_zero_length, std::shared_ptr<Buffer> indices_data, bool is_canonical);
+
+  /// \brief Construct SparseCOOIndex from column-major NumericTensor
+  explicit SparseCOOIndex(const std::shared_ptr<Tensor>& coords, bool is_canonical);
+
+  /// \brief Return a tensor that has the coordinates of the non-zero values
+  ///
+  /// The returned tensor is a N x D tensor where N is the number of non-zero
+  /// values and D is the number of dimensions in the logical data.
+  /// The column at index `i` is a D-tuple of coordinates indicating that the
+  /// logical value at those coordinates should be found at physical index `i`.
+  const std::shared_ptr<Tensor>& indices() const { return coords_; }
+
+  /// \brief Return the number of non zero values in the sparse tensor related
+  /// to this sparse index
+  int64_t non_zero_length() const override { return coords_->shape()[0]; }
+
+  /// \brief Return whether a sparse tensor index is canonical, or not.
+  /// If a sparse tensor index is canonical, it is sorted in the lexicographical order,
+  /// and the corresponding sparse tensor doesn't have duplicated entries.
+  bool is_canonical() const { return is_canonical_; }
+
+  /// \brief Return a string representation of the sparse index
+  std::string ToString() const override;
+
+  /// \brief Return whether the COO indices are equal
+  bool Equals(const SparseCOOIndex& other) const {
+    return indices()->Equals(*other.indices());
+  }
+
+  inline Status ValidateShape(const std::vector<int64_t>& shape) const override {
+    ARROW_RETURN_NOT_OK(SparseIndex::ValidateShape(shape));
+
+    if (static_cast<size_t>(coords_->shape()[1]) == shape.size()) {
+      return Status::OK();
+    }
+
+    return Status::Invalid(
+        "shape length is inconsistent with the coords matrix in COO index");
+  }
+
+ protected:
+  std::shared_ptr<Tensor> coords_;
+  bool is_canonical_;
+};
+
+namespace internal {
+
+/// EXPERIMENTAL: The axis to be compressed
+enum class SparseMatrixCompressedAxis : char {
+  /// The value for CSR matrix
+  ROW,
+  /// The value for CSC matrix
+  COLUMN
+};
+
+ARROW_EXPORT
+Status ValidateSparseCSXIndex(const std::shared_ptr<DataType>& indptr_type,
+                              const std::shared_ptr<DataType>& indices_type,
+                              const std::vector<int64_t>& indptr_shape,
+                              const std::vector<int64_t>& indices_shape,
+                              const char* type_name);
+
+ARROW_EXPORT
+void CheckSparseCSXIndexValidity(const std::shared_ptr<DataType>& indptr_type,
+                                 const std::shared_ptr<DataType>& indices_type,
+                                 const std::vector<int64_t>& indptr_shape,
+                                 const std::vector<int64_t>& indices_shape,
+                                 const char* type_name);
+
+template <typename SparseIndexType, SparseMatrixCompressedAxis COMPRESSED_AXIS>
+class SparseCSXIndex : public SparseIndexBase<SparseIndexType> {
+ public:
+  static constexpr SparseMatrixCompressedAxis kCompressedAxis = COMPRESSED_AXIS;
+
+  /// \brief Make a subclass of SparseCSXIndex from raw properties
+  static Result<std::shared_ptr<SparseIndexType>> Make(
+      const std::shared_ptr<DataType>& indptr_type,
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indptr_shape, const std::vector<int64_t>& indices_shape,
+      std::shared_ptr<Buffer> indptr_data, std::shared_ptr<Buffer> indices_data) {
+    ARROW_RETURN_NOT_OK(ValidateSparseCSXIndex(indptr_type, indices_type, indptr_shape,
+                                               indices_shape,
+                                               SparseIndexType::kTypeName));
+    return std::make_shared<SparseIndexType>(
+        std::make_shared<Tensor>(indptr_type, indptr_data, indptr_shape),
+        std::make_shared<Tensor>(indices_type, indices_data, indices_shape));
+  }
+
+  /// \brief Make a subclass of SparseCSXIndex from raw properties
+  static Result<std::shared_ptr<SparseIndexType>> Make(
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indptr_shape, const std::vector<int64_t>& indices_shape,
+      std::shared_ptr<Buffer> indptr_data, std::shared_ptr<Buffer> indices_data) {
+    return Make(indices_type, indices_type, indptr_shape, indices_shape, indptr_data,
+                indices_data);
+  }
+
+  /// \brief Make a subclass of SparseCSXIndex from sparse tensor's shape properties and
+  /// data
+  static Result<std::shared_ptr<SparseIndexType>> Make(
+      const std::shared_ptr<DataType>& indptr_type,
+      const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+      int64_t non_zero_length, std::shared_ptr<Buffer> indptr_data,
+      std::shared_ptr<Buffer> indices_data) {
+    std::vector<int64_t> indptr_shape({shape[0] + 1});
+    std::vector<int64_t> indices_shape({non_zero_length});
+    return Make(indptr_type, indices_type, indptr_shape, indices_shape, indptr_data,
+                indices_data);
+  }
+
+  /// \brief Make a subclass of SparseCSXIndex from sparse tensor's shape properties and
+  /// data
+  static Result<std::shared_ptr<SparseIndexType>> Make(
+      const std::shared_ptr<DataType>& indices_type, const std::vector<int64_t>& shape,
+      int64_t non_zero_length, std::shared_ptr<Buffer> indptr_data,
+      std::shared_ptr<Buffer> indices_data) {
+    return Make(indices_type, indices_type, shape, non_zero_length, indptr_data,
+                indices_data);
+  }
+
+  /// \brief Construct SparseCSXIndex from two index vectors
+  explicit SparseCSXIndex(const std::shared_ptr<Tensor>& indptr,
+                          const std::shared_ptr<Tensor>& indices)
+      : SparseIndexBase<SparseIndexType>(), indptr_(indptr), indices_(indices) {
+    CheckSparseCSXIndexValidity(indptr_->type(), indices_->type(), indptr_->shape(),
+                                indices_->shape(), SparseIndexType::kTypeName);
+  }
+
+  /// \brief Return a 1D tensor of indptr vector
+  const std::shared_ptr<Tensor>& indptr() const { return indptr_; }
+
+  /// \brief Return a 1D tensor of indices vector
+  const std::shared_ptr<Tensor>& indices() const { return indices_; }
+
+  /// \brief Return the number of non zero values in the sparse tensor related
+  /// to this sparse index
+  int64_t non_zero_length() const override { return indices_->shape()[0]; }
+
+  /// \brief Return a string representation of the sparse index
+  std::string ToString() const override {
+    return std::string(SparseIndexType::kTypeName);
+  }
+
+  /// \brief Return whether the CSR indices are equal
+  bool Equals(const SparseIndexType& other) const {
+    return indptr()->Equals(*other.indptr()) && indices()->Equals(*other.indices());
+  }
+
+  inline Status ValidateShape(const std::vector<int64_t>& shape) const override {
+    ARROW_RETURN_NOT_OK(SparseIndex::ValidateShape(shape));
+
+    if (shape.size() < 2) {
+      return Status::Invalid("shape length is too short");
+    }
+
+    if (shape.size() > 2) {
+      return Status::Invalid("shape length is too long");
+    }
+
+    if (indptr_->shape()[0] == shape[static_cast<int64_t>(kCompressedAxis)] + 1) {
+      return Status::OK();
+    }
+
+    return Status::Invalid("shape length is inconsistent with the ", ToString());
+  }
+
+ protected:
+  std::shared_ptr<Tensor> indptr_;
+  std::shared_ptr<Tensor> indices_;
+};
+
+}  // namespace internal
+
+// ----------------------------------------------------------------------
+// SparseCSRIndex class
+
+/// \brief EXPERIMENTAL: The index data for a CSR sparse matrix
+///
+/// A CSR sparse index manages the location of its non-zero values by two
+/// vectors.
+///
+/// The first vector, called indptr, represents the range of the rows; the i-th
+/// row spans from indptr[i] to indptr[i+1] in the corresponding value vector.
+/// So the length of an indptr vector is the number of rows + 1.
+///
+/// The other vector, called indices, represents the column indices of the
+/// corresponding non-zero values.  So the length of an indices vector is same
+/// as the number of non-zero-values.
+class ARROW_EXPORT SparseCSRIndex
+    : public internal::SparseCSXIndex<SparseCSRIndex,
+                                      internal::SparseMatrixCompressedAxis::ROW> {
+ public:
+  using BaseClass =
+      internal::SparseCSXIndex<SparseCSRIndex, internal::SparseMatrixCompressedAxis::ROW>;
+
+  static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSR;
+  static constexpr const char* kTypeName = "SparseCSRIndex";
+
+  using SparseCSXIndex::kCompressedAxis;
+  using SparseCSXIndex::Make;
+  using SparseCSXIndex::SparseCSXIndex;
+};
+
+// ----------------------------------------------------------------------
+// SparseCSCIndex class
+
+/// \brief EXPERIMENTAL: The index data for a CSC sparse matrix
+///
+/// A CSC sparse index manages the location of its non-zero values by two
+/// vectors.
+///
+/// The first vector, called indptr, represents the range of the column; the i-th
+/// column spans from indptr[i] to indptr[i+1] in the corresponding value vector.
+/// So the length of an indptr vector is the number of columns + 1.
+///
+/// The other vector, called indices, represents the row indices of the
+/// corresponding non-zero values.  So the length of an indices vector is same
+/// as the number of non-zero-values.
+class ARROW_EXPORT SparseCSCIndex
+    : public internal::SparseCSXIndex<SparseCSCIndex,
+                                      internal::SparseMatrixCompressedAxis::COLUMN> {
+ public:
+  using BaseClass =
+      internal::SparseCSXIndex<SparseCSCIndex,
+                               internal::SparseMatrixCompressedAxis::COLUMN>;
+
+  static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSC;
+  static constexpr const char* kTypeName = "SparseCSCIndex";
+
+  using SparseCSXIndex::kCompressedAxis;
+  using SparseCSXIndex::Make;
+  using SparseCSXIndex::SparseCSXIndex;
+};
+
+// ----------------------------------------------------------------------
+// SparseCSFIndex class
+
+/// \brief EXPERIMENTAL: The index data for a CSF sparse tensor
+///
+/// A CSF sparse index manages the location of its non-zero values by set of
+/// prefix trees. Each path from a root to leaf forms one tensor non-zero index.
+/// CSF is implemented with three vectors.
+///
+/// Vectors inptr and indices contain N-1 and N buffers respectively, where N is the
+/// number of dimensions. Axis_order is a vector of integers of length N. Indptr and
+/// indices describe the set of prefix trees. Trees traverse dimensions in order given by
+/// axis_order.
+class ARROW_EXPORT SparseCSFIndex : public internal::SparseIndexBase<SparseCSFIndex> {
+ public:
+  static constexpr SparseTensorFormat::type format_id = SparseTensorFormat::CSF;
+  static constexpr const char* kTypeName = "SparseCSFIndex";
+
+  /// \brief Make SparseCSFIndex from raw properties
+  static Result<std::shared_ptr<SparseCSFIndex>> Make(
+      const std::shared_ptr<DataType>& indptr_type,
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+      const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+      const std::vector<std::shared_ptr<Buffer>>& indices_data);
+
+  /// \brief Make SparseCSFIndex from raw properties
+  static Result<std::shared_ptr<SparseCSFIndex>> Make(
+      const std::shared_ptr<DataType>& indices_type,
+      const std::vector<int64_t>& indices_shapes, const std::vector<int64_t>& axis_order,
+      const std::vector<std::shared_ptr<Buffer>>& indptr_data,
+      const std::vector<std::shared_ptr<Buffer>>& indices_data) {
+    return Make(indices_type, indices_type, indices_shapes, axis_order, indptr_data,
+                indices_data);
+  }
+
+  /// \brief Construct SparseCSFIndex from two index vectors
+  explicit SparseCSFIndex(const std::vector<std::shared_ptr<Tensor>>& indptr,
+                          const std::vector<std::shared_ptr<Tensor>>& indices,
+                          const std::vector<int64_t>& axis_order);
+
+  /// \brief Return a 1D vector of indptr tensors
+  const std::vector<std::shared_ptr<Tensor>>& indptr() const { return indptr_; }
+
+  /// \brief Return a 1D vector of indices tensors
+  const std::vector<std::shared_ptr<Tensor>>& indices() const { return indices_; }
+
+  /// \brief Return a 1D vector specifying the order of axes
+  const std::vector<int64_t>& axis_order() const { return axis_order_; }
+
+  /// \brief Return the number of non zero values in the sparse tensor related
+  /// to this sparse index
+  int64_t non_zero_length() const override { return indices_.back()->shape()[0]; }
+
+  /// \brief Return a string representation of the sparse index
+  std::string ToString() const override;
+
+  /// \brief Return whether the CSF indices are equal
+  bool Equals(const SparseCSFIndex& other) const;
+
+ protected:
+  std::vector<std::shared_ptr<Tensor>> indptr_;
+  std::vector<std::shared_ptr<Tensor>> indices_;
+  std::vector<int64_t> axis_order_;
+};
+
+// ----------------------------------------------------------------------
+// SparseTensor class
+
+/// \brief EXPERIMENTAL: The base class of sparse tensor container
+class ARROW_EXPORT SparseTensor {
+ public:
+  virtual ~SparseTensor() = default;
+
+  SparseTensorFormat::type format_id() const { return sparse_index_->format_id(); }
+
+  /// \brief Return a value type of the sparse tensor
+  std::shared_ptr<DataType> type() const { return type_; }
+
+  /// \brief Return a buffer that contains the value vector of the sparse tensor
+  std::shared_ptr<Buffer> data() const { return data_; }
+
+  /// \brief Return an immutable raw data pointer
+  const uint8_t* raw_data() const { return data_->data(); }
+
+  /// \brief Return a mutable raw data pointer
+  uint8_t* raw_mutable_data() const { return data_->mutable_data(); }
+
+  /// \brief Return a shape vector of the sparse tensor
+  const std::vector<int64_t>& shape() const { return shape_; }
+
+  /// \brief Return a sparse index of the sparse tensor
+  const std::shared_ptr<SparseIndex>& sparse_index() const { return sparse_index_; }
+
+  /// \brief Return a number of dimensions of the sparse tensor
+  int ndim() const { return static_cast<int>(shape_.size()); }
+
+  /// \brief Return a vector of dimension names
+  const std::vector<std::string>& dim_names() const { return dim_names_; }
+
+  /// \brief Return the name of the i-th dimension
+  const std::string& dim_name(int i) const;
+
+  /// \brief Total number of value cells in the sparse tensor
+  int64_t size() const;
+
+  /// \brief Return true if the underlying data buffer is mutable
+  bool is_mutable() const { return data_->is_mutable(); }
+
+  /// \brief Total number of non-zero cells in the sparse tensor
+  int64_t non_zero_length() const {
+    return sparse_index_ ? sparse_index_->non_zero_length() : 0;
+  }
+
+  /// \brief Return whether sparse tensors are equal
+  bool Equals(const SparseTensor& other,
+              const EqualOptions& = EqualOptions::Defaults()) const;
+
+  /// \brief Return dense representation of sparse tensor as tensor
+  ///
+  /// The returned Tensor has row-major order (C-like).
+  Result<std::shared_ptr<Tensor>> ToTensor(MemoryPool* pool) const;
+  Result<std::shared_ptr<Tensor>> ToTensor() const {
+    return ToTensor(default_memory_pool());
+  }
+
+ protected:
+  // Constructor with all attributes
+  SparseTensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+               const std::vector<int64_t>& shape,
+               const std::shared_ptr<SparseIndex>& sparse_index,
+               const std::vector<std::string>& dim_names);
+
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<Buffer> data_;
+  std::vector<int64_t> shape_;
+  std::shared_ptr<SparseIndex> sparse_index_;
+
+  // These names are optional
+  std::vector<std::string> dim_names_;
+};
+
+// ----------------------------------------------------------------------
+// SparseTensorImpl class
+
+namespace internal {
+
+ARROW_EXPORT
+Status MakeSparseTensorFromTensor(const Tensor& tensor,
+                                  SparseTensorFormat::type sparse_format_id,
+                                  const std::shared_ptr<DataType>& index_value_type,
+                                  MemoryPool* pool,
+                                  std::shared_ptr<SparseIndex>* out_sparse_index,
+                                  std::shared_ptr<Buffer>* out_data);
+
+}  // namespace internal
+
+/// \brief EXPERIMENTAL: Concrete sparse tensor implementation classes with sparse index
+/// type
+template <typename SparseIndexType>
+class SparseTensorImpl : public SparseTensor {
+ public:
+  virtual ~SparseTensorImpl() = default;
+
+  /// \brief Construct a sparse tensor from physical data buffer and logical index
+  SparseTensorImpl(const std::shared_ptr<SparseIndexType>& sparse_index,
+                   const std::shared_ptr<DataType>& type,
+                   const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+                   const std::vector<std::string>& dim_names)
+      : SparseTensor(type, data, shape, sparse_index, dim_names) {}
+
+  /// \brief Construct an empty sparse tensor
+  SparseTensorImpl(const std::shared_ptr<DataType>& type,
+                   const std::vector<int64_t>& shape,
+                   const std::vector<std::string>& dim_names = {})
+      : SparseTensorImpl(NULLPTR, type, NULLPTR, shape, dim_names) {}
+
+  /// \brief Create a SparseTensor with full parameters
+  static inline Result<std::shared_ptr<SparseTensorImpl<SparseIndexType>>> Make(
+      const std::shared_ptr<SparseIndexType>& sparse_index,
+      const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+      const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names) {
+    if (!is_tensor_supported(type->id())) {
+      return Status::Invalid(type->ToString(),
+                             " is not valid data type for a sparse tensor");
+    }
+    ARROW_RETURN_NOT_OK(sparse_index->ValidateShape(shape));
+    if (dim_names.size() > 0 && dim_names.size() != shape.size()) {
+      return Status::Invalid("dim_names length is inconsistent with shape");
+    }
+    return std::make_shared<SparseTensorImpl<SparseIndexType>>(sparse_index, type, data,
+                                                               shape, dim_names);
+  }
+
+  /// \brief Create a sparse tensor from a dense tensor
+  ///
+  /// The dense tensor is re-encoded as a sparse index and a physical
+  /// data buffer for the non-zero value.
+  static inline Result<std::shared_ptr<SparseTensorImpl<SparseIndexType>>> Make(
+      const Tensor& tensor, const std::shared_ptr<DataType>& index_value_type,
+      MemoryPool* pool = default_memory_pool()) {
+    std::shared_ptr<SparseIndex> sparse_index;
+    std::shared_ptr<Buffer> data;
+    ARROW_RETURN_NOT_OK(internal::MakeSparseTensorFromTensor(
+        tensor, SparseIndexType::format_id, index_value_type, pool, &sparse_index,
+        &data));
+    return std::make_shared<SparseTensorImpl<SparseIndexType>>(
+        internal::checked_pointer_cast<SparseIndexType>(sparse_index), tensor.type(),
+        data, tensor.shape(), tensor.dim_names_);
+  }
+
+  static inline Result<std::shared_ptr<SparseTensorImpl<SparseIndexType>>> Make(
+      const Tensor& tensor, MemoryPool* pool = default_memory_pool()) {
+    return Make(tensor, int64(), pool);
+  }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(SparseTensorImpl);
+};
+
+/// \brief EXPERIMENTAL: Type alias for COO sparse tensor
+using SparseCOOTensor = SparseTensorImpl<SparseCOOIndex>;
+
+/// \brief EXPERIMENTAL: Type alias for CSR sparse matrix
+using SparseCSRMatrix = SparseTensorImpl<SparseCSRIndex>;
+
+/// \brief EXPERIMENTAL: Type alias for CSC sparse matrix
+using SparseCSCMatrix = SparseTensorImpl<SparseCSCIndex>;
+
+/// \brief EXPERIMENTAL: Type alias for CSF sparse matrix
+using SparseCSFTensor = SparseTensorImpl<SparseCSFIndex>;
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/status.h b/pyarrow/include/arrow/status.h
new file mode 100644
index 0000000000000000000000000000000000000000..8907d32ff7d8354f7393b953270a7f8c4f5b00d6
--- /dev/null
+++ b/pyarrow/include/arrow/status.h
@@ -0,0 +1,509 @@
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+//
+// A Status encapsulates the result of an operation.  It may indicate success,
+// or it may indicate an error with an associated error message.
+//
+// Multiple threads can invoke const methods on a Status without
+// external synchronization, but if any of the threads may call a
+// non-const method, all threads accessing the same Status must use
+// external synchronization.
+
+// Adapted from Apache Kudu, TensorFlow
+
+#pragma once
+
+#include <cstring>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/compare.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_util.h"
+#include "arrow/util/visibility.h"
+
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+
+/// \brief Return with given status if condition is met.
+#  define ARROW_RETURN_IF_(condition, status, expr)   \
+    do {                                              \
+      if (ARROW_PREDICT_FALSE(condition)) {           \
+        ::arrow::Status _st = (status);               \
+        _st.AddContextLine(__FILE__, __LINE__, expr); \
+        return _st;                                   \
+      }                                               \
+    } while (0)
+
+#else
+
+#  define ARROW_RETURN_IF_(condition, status, _) \
+    do {                                         \
+      if (ARROW_PREDICT_FALSE(condition)) {      \
+        return (status);                         \
+      }                                          \
+    } while (0)
+
+#endif  // ARROW_EXTRA_ERROR_CONTEXT
+
+#define ARROW_RETURN_IF(condition, status) \
+  ARROW_RETURN_IF_(condition, status, ARROW_STRINGIFY(status))
+
+/// \brief Propagate any non-successful Status to the caller
+#define ARROW_RETURN_NOT_OK(status)                            \
+  do {                                                         \
+    ::arrow::Status __s = ::arrow::ToStatus(status);           \
+    ARROW_RETURN_IF_(!__s.ok(), __s, ARROW_STRINGIFY(status)); \
+  } while (false)
+
+/// \brief Given `expr` and `warn_msg`; log `warn_msg` if `expr` is a non-ok status
+#define ARROW_WARN_NOT_OK(expr, warn_msg) \
+  do {                                    \
+    ::arrow::Status _s = (expr);          \
+    if (ARROW_PREDICT_FALSE(!_s.ok())) {  \
+      _s.Warn(warn_msg);                  \
+    }                                     \
+  } while (false)
+
+// This is an internal-use macro and should not be used in public headers.
+#ifndef RETURN_NOT_OK
+#  define RETURN_NOT_OK(s) ARROW_RETURN_NOT_OK(s)
+#endif
+
+namespace arrow {
+namespace internal {
+
+class StatusConstant;
+
+}  // namespace internal
+
+enum class StatusCode : char {
+  OK = 0,
+  OutOfMemory = 1,
+  KeyError = 2,
+  TypeError = 3,
+  Invalid = 4,
+  IOError = 5,
+  CapacityError = 6,
+  IndexError = 7,
+  Cancelled = 8,
+  UnknownError = 9,
+  NotImplemented = 10,
+  SerializationError = 11,
+  RError = 13,
+  // Gandiva range of errors
+  CodeGenError = 40,
+  ExpressionValidationError = 41,
+  ExecutionError = 42,
+  // Continue generic codes.
+  AlreadyExists = 45
+};
+
+/// \brief An opaque class that allows subsystems to retain
+/// additional information inside the Status.
+class ARROW_EXPORT StatusDetail {
+ public:
+  virtual ~StatusDetail() = default;
+  /// \brief Return a unique id for the type of the StatusDetail
+  /// (effectively a poor man's substitute for RTTI).
+  virtual const char* type_id() const = 0;
+  /// \brief Produce a human-readable description of this status.
+  virtual std::string ToString() const = 0;
+
+  bool operator==(const StatusDetail& other) const noexcept {
+    return std::string(type_id()) == other.type_id() && ToString() == other.ToString();
+  }
+};
+
+/// \brief A type trait to declare a given type as Status-compatible.
+///
+/// This trait structure can be implemented if a type (such as Result<T>) embeds
+/// error information that can be converted to the Status class.
+/// It will make the given type usable directly in functions such as
+/// Status::OrElse and error-checking macros such as ARROW_RETURN_NOT_OK.
+template <typename T>
+struct IntoStatus;
+
+/// \brief Convert a Status-compatible object to Status
+///
+/// This generic function delegates to the IntoStatus type trait.
+template <typename T>
+constexpr decltype(auto) ToStatus(T&& t) {
+  return IntoStatus<std::decay_t<T>>::ToStatus(std::forward<T>(t));
+}
+
+/// \brief Status outcome object (success or error)
+///
+/// The Status object is an object holding the outcome of an operation.
+/// The outcome is represented as a StatusCode, either success
+/// (StatusCode::OK) or an error (any other of the StatusCode enumeration values).
+///
+/// Additionally, if an error occurred, a specific error message is generally
+/// attached.
+class ARROW_EXPORT [[nodiscard]] Status : public util::EqualityComparable<Status>,
+                                          public util::ToStringOstreamable<Status> {
+ public:
+  // Create a success status.
+  constexpr Status() noexcept : state_(NULLPTR) {}
+  ~Status() noexcept {
+    if (ARROW_PREDICT_FALSE(state_ != NULL)) {
+      if (!state_->is_constant) {
+        DeleteState();
+      }
+    }
+  }
+
+  Status(StatusCode code, const std::string& msg);
+  /// \brief Pluggable constructor for use by sub-systems.  detail cannot be null.
+  Status(StatusCode code, std::string msg, std::shared_ptr<StatusDetail> detail);
+
+  // Copy the specified status.
+  inline Status(const Status& s);
+  inline Status& operator=(const Status& s);
+
+  // Move the specified status.
+  inline Status(Status&& s) noexcept;
+  inline Status& operator=(Status&& s) noexcept;
+
+  inline bool Equals(const Status& s) const;
+
+  // AND the statuses.
+  inline Status operator&(const Status& s) const noexcept;
+  inline Status operator&(Status&& s) const noexcept;
+  inline Status& operator&=(const Status& s) noexcept;
+  inline Status& operator&=(Status&& s) noexcept;
+
+  /// Return a success status
+  static Status OK() { return Status(); }
+
+  template <typename... Args>
+  static Status FromArgs(StatusCode code, Args&&... args) {
+    return Status(code, internal::JoinToString(std::forward<Args>(args)...));
+  }
+
+  template <typename... Args>
+  static Status FromDetailAndArgs(StatusCode code, std::shared_ptr<StatusDetail> detail,
+                                  Args&&... args) {
+    return Status(code, internal::JoinToString(std::forward<Args>(args)...),
+                  std::move(detail));
+  }
+
+  /// Return an error status for out-of-memory conditions
+  template <typename... Args>
+  static Status OutOfMemory(Args&&... args) {
+    return Status::FromArgs(StatusCode::OutOfMemory, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status for failed key lookups (e.g. column name in a table)
+  template <typename... Args>
+  static Status KeyError(Args&&... args) {
+    return Status::FromArgs(StatusCode::KeyError, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status for type errors (such as mismatching data types)
+  template <typename... Args>
+  static Status TypeError(Args&&... args) {
+    return Status::FromArgs(StatusCode::TypeError, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status for unknown errors
+  template <typename... Args>
+  static Status UnknownError(Args&&... args) {
+    return Status::FromArgs(StatusCode::UnknownError, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status when an operation or a combination of operation and
+  /// data types is unimplemented
+  template <typename... Args>
+  static Status NotImplemented(Args&&... args) {
+    return Status::FromArgs(StatusCode::NotImplemented, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status for invalid data (for example a string that fails parsing)
+  template <typename... Args>
+  static Status Invalid(Args&&... args) {
+    return Status::FromArgs(StatusCode::Invalid, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status for cancelled operation
+  template <typename... Args>
+  static Status Cancelled(Args&&... args) {
+    return Status::FromArgs(StatusCode::Cancelled, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status when an index is out of bounds
+  template <typename... Args>
+  static Status IndexError(Args&&... args) {
+    return Status::FromArgs(StatusCode::IndexError, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status when a container's capacity would exceed its limits
+  template <typename... Args>
+  static Status CapacityError(Args&&... args) {
+    return Status::FromArgs(StatusCode::CapacityError, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status when some IO-related operation failed
+  template <typename... Args>
+  static Status IOError(Args&&... args) {
+    return Status::FromArgs(StatusCode::IOError, std::forward<Args>(args)...);
+  }
+
+  /// Return an error status when some (de)serialization operation failed
+  template <typename... Args>
+  static Status SerializationError(Args&&... args) {
+    return Status::FromArgs(StatusCode::SerializationError, std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  static Status RError(Args&&... args) {
+    return Status::FromArgs(StatusCode::RError, std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  static Status CodeGenError(Args&&... args) {
+    return Status::FromArgs(StatusCode::CodeGenError, std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  static Status ExpressionValidationError(Args&&... args) {
+    return Status::FromArgs(StatusCode::ExpressionValidationError,
+                            std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  static Status ExecutionError(Args&&... args) {
+    return Status::FromArgs(StatusCode::ExecutionError, std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  static Status AlreadyExists(Args&&... args) {
+    return Status::FromArgs(StatusCode::AlreadyExists, std::forward<Args>(args)...);
+  }
+
+  /// Return true iff the status indicates success.
+  constexpr bool ok() const { return (state_ == NULLPTR); }
+
+  /// Return true iff the status indicates an out-of-memory error.
+  constexpr bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; }
+  /// Return true iff the status indicates a key lookup error.
+  constexpr bool IsKeyError() const { return code() == StatusCode::KeyError; }
+  /// Return true iff the status indicates invalid data.
+  constexpr bool IsInvalid() const { return code() == StatusCode::Invalid; }
+  /// Return true iff the status indicates a cancelled operation.
+  constexpr bool IsCancelled() const { return code() == StatusCode::Cancelled; }
+  /// Return true iff the status indicates an IO-related failure.
+  constexpr bool IsIOError() const { return code() == StatusCode::IOError; }
+  /// Return true iff the status indicates a container reaching capacity limits.
+  constexpr bool IsCapacityError() const { return code() == StatusCode::CapacityError; }
+  /// Return true iff the status indicates an out of bounds index.
+  constexpr bool IsIndexError() const { return code() == StatusCode::IndexError; }
+  /// Return true iff the status indicates a type error.
+  constexpr bool IsTypeError() const { return code() == StatusCode::TypeError; }
+  /// Return true iff the status indicates an unknown error.
+  constexpr bool IsUnknownError() const { return code() == StatusCode::UnknownError; }
+  /// Return true iff the status indicates an unimplemented operation.
+  constexpr bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; }
+  /// Return true iff the status indicates a (de)serialization failure
+  constexpr bool IsSerializationError() const {
+    return code() == StatusCode::SerializationError;
+  }
+  /// Return true iff the status indicates a R-originated error.
+  constexpr bool IsRError() const { return code() == StatusCode::RError; }
+
+  constexpr bool IsCodeGenError() const { return code() == StatusCode::CodeGenError; }
+
+  constexpr bool IsExpressionValidationError() const {
+    return code() == StatusCode::ExpressionValidationError;
+  }
+
+  constexpr bool IsExecutionError() const { return code() == StatusCode::ExecutionError; }
+  constexpr bool IsAlreadyExists() const { return code() == StatusCode::AlreadyExists; }
+
+  /// \brief Return a string representation of this status suitable for printing.
+  ///
+  /// The string "OK" is returned for success.
+  std::string ToString() const;
+
+  /// \brief Return a string representation of this status without
+  /// context lines suitable for printing.
+  ///
+  /// The string "OK" is returned for success.
+  std::string ToStringWithoutContextLines() const;
+
+  /// \brief Return a string representation of the status code, without the message
+  /// text or POSIX code information.
+  std::string CodeAsString() const;
+  static std::string CodeAsString(StatusCode);
+
+  /// \brief Return the StatusCode value attached to this status.
+  constexpr StatusCode code() const { return ok() ? StatusCode::OK : state_->code; }
+
+  /// \brief Return the specific error message attached to this status.
+  const std::string& message() const;
+
+  /// \brief Return the status detail attached to this message.
+  const std::shared_ptr<StatusDetail>& detail() const;
+
+  /// \brief Return a new Status copying the existing status, but
+  /// updating with the existing detail.
+  Status WithDetail(std::shared_ptr<StatusDetail> new_detail) const {
+    return Status(code(), message(), std::move(new_detail));
+  }
+
+  /// \brief Return a new Status with changed message, copying the
+  /// existing status code and detail.
+  template <typename... Args>
+  Status WithMessage(Args&&... args) const {
+    return FromArgs(code(), std::forward<Args>(args)...).WithDetail(detail());
+  }
+
+  /// \brief Apply a functor if the status indicates an error
+  ///
+  /// This can be used to execute fallback or cleanup actions.
+  ///
+  /// If the status indicates a success, it is returned as-is.
+  ///
+  /// If the status indicates an error, the given functor is called with the status
+  /// as argument.
+  /// If the functor returns a new Status, it is returned.
+  /// If the functor returns a Status-compatible object such as Result<T>, it is
+  /// converted to Status and returned.
+  /// If the functor returns void, the original Status is returned.
+  template <typename OnError>
+  Status OrElse(OnError&& on_error) {
+    using RT = decltype(on_error(Status()));
+    if (ARROW_PREDICT_TRUE(ok())) {
+      return *this;
+    }
+    if constexpr (std::is_void_v<RT>) {
+      on_error(*this);
+      return *this;
+    } else {
+      return ToStatus(on_error(*this));
+    }
+  }
+
+  void Warn() const;
+  void Warn(const std::string& message) const;
+
+  [[noreturn]] void Abort() const;
+  [[noreturn]] void Abort(const std::string& message) const;
+
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+  void AddContextLine(const char* filename, int line, const char* expr);
+#endif
+
+ private:
+  struct State {
+    StatusCode code;
+    bool is_constant;
+    std::string msg;
+    std::shared_ptr<StatusDetail> detail;
+  };
+  // OK status has a `NULL` state_.  Otherwise, `state_` points to
+  // a `State` structure containing the error code and message(s)
+  State* state_;
+
+  void DeleteState() noexcept {
+    // ARROW-2400: On certain compilers, splitting off the slow path improves
+    // performance significantly.
+    delete state_;
+  }
+  void CopyFrom(const Status& s);
+  inline void MoveFrom(Status& s);
+
+  friend class internal::StatusConstant;
+};
+
+void Status::MoveFrom(Status& s) {
+  if (ARROW_PREDICT_FALSE(state_ != NULL)) {
+    if (!state_->is_constant) {
+      DeleteState();
+    }
+  }
+  state_ = s.state_;
+  s.state_ = NULLPTR;
+}
+
+Status::Status(const Status& s) : state_{NULLPTR} { CopyFrom(s); }
+
+Status& Status::operator=(const Status& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  if (state_ != s.state_) {
+    CopyFrom(s);
+  }
+  return *this;
+}
+
+Status::Status(Status&& s) noexcept : state_(s.state_) { s.state_ = NULLPTR; }
+
+Status& Status::operator=(Status&& s) noexcept {
+  MoveFrom(s);
+  return *this;
+}
+
+bool Status::Equals(const Status& s) const {
+  if (state_ == s.state_) {
+    return true;
+  }
+
+  if (ok() || s.ok()) {
+    return false;
+  }
+
+  if (detail() != s.detail()) {
+    if ((detail() && !s.detail()) || (!detail() && s.detail())) {
+      return false;
+    }
+    return *detail() == *s.detail();
+  }
+
+  return code() == s.code() && message() == s.message();
+}
+
+/// \cond FALSE
+// (note: emits warnings on Doxygen < 1.8.15,
+//  see https://github.com/doxygen/doxygen/issues/6295)
+Status Status::operator&(const Status& s) const noexcept {
+  if (ok()) {
+    return s;
+  } else {
+    return *this;
+  }
+}
+
+Status Status::operator&(Status&& s) const noexcept {
+  if (ok()) {
+    return std::move(s);
+  } else {
+    return *this;
+  }
+}
+
+Status& Status::operator&=(const Status& s) noexcept {
+  if (ok() && !s.ok()) {
+    CopyFrom(s);
+  }
+  return *this;
+}
+
+Status& Status::operator&=(Status&& s) noexcept {
+  if (ok() && !s.ok()) {
+    MoveFrom(s);
+  }
+  return *this;
+}
+/// \endcond
+
+template <>
+struct IntoStatus<Status> {
+  static constexpr const Status& ToStatus(const Status& st) { return st; }
+  static constexpr Status&& ToStatus(Status&& st) { return std::move(st); }
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/stl.h b/pyarrow/include/arrow/stl.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a9e4bdf77eb765a48f6d6e7c4160a77bdf6b7bb
--- /dev/null
+++ b/pyarrow/include/arrow/stl.h
@@ -0,0 +1,491 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/chunked_array.h"
+#include "arrow/compute/api.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+class Schema;
+
+namespace stl {
+
+namespace internal {
+
+template <typename T, typename = void>
+struct is_optional_like : public std::false_type {};
+
+template <typename T, typename = void>
+struct is_dereferencable : public std::false_type {};
+
+template <typename T>
+struct is_dereferencable<T, arrow::internal::void_t<decltype(*std::declval<T>())>>
+    : public std::true_type {};
+
+template <typename T>
+struct is_optional_like<
+    T, typename std::enable_if<
+           std::is_constructible<bool, T>::value && is_dereferencable<T>::value &&
+           !std::is_array<typename std::remove_reference<T>::type>::value>::type>
+    : public std::true_type {};
+
+template <size_t N, typename Tuple>
+using BareTupleElement =
+    typename std::decay<typename std::tuple_element<N, Tuple>::type>::type;
+
+}  // namespace internal
+
+template <typename T, typename R = void>
+using enable_if_optional_like =
+    typename std::enable_if<internal::is_optional_like<T>::value, R>::type;
+
+/// Traits meta class to map standard C/C++ types to equivalent Arrow types.
+template <typename T, typename Enable = void>
+struct ConversionTraits {};
+
+/// Returns builder type for given standard C/C++ type.
+template <typename CType>
+using CBuilderType =
+    typename TypeTraits<typename ConversionTraits<CType>::ArrowType>::BuilderType;
+
+/// Default implementation of AppendListValues.
+///
+/// This function can be specialized by user to take advantage of appending
+/// contiguous ranges while appending. This default implementation will call
+/// ConversionTraits<ValueCType>::AppendRow() for each value in the range.
+template <typename ValueCType, typename Range>
+inline Status AppendListValues(CBuilderType<ValueCType>& value_builder,
+                               Range&& cell_range) {
+  for (const auto& value : cell_range) {
+    ARROW_RETURN_NOT_OK(ConversionTraits<ValueCType>::AppendRow(value_builder, value));
+  }
+  return Status::OK();
+}
+
+#define ARROW_STL_CONVERSION(CType_, ArrowType_)                                    \
+  template <>                                                                       \
+  struct ConversionTraits<CType_> : public CTypeTraits<CType_> {                    \
+    static Status AppendRow(typename TypeTraits<ArrowType_>::BuilderType& builder,  \
+                            CType_ cell) {                                          \
+      return builder.Append(cell);                                                  \
+    }                                                                               \
+    static CType_ GetEntry(const typename TypeTraits<ArrowType_>::ArrayType& array, \
+                           size_t j) {                                              \
+      return array.Value(j);                                                        \
+    }                                                                               \
+  };                                                                                \
+                                                                                    \
+  template <>                                                                       \
+  inline Status AppendListValues<CType_, const std::vector<CType_>&>(               \
+      typename TypeTraits<ArrowType_>::BuilderType & value_builder,                 \
+      const std::vector<CType_>& cell_range) {                                      \
+    return value_builder.AppendValues(cell_range);                                  \
+  }
+
+ARROW_STL_CONVERSION(bool, BooleanType)
+ARROW_STL_CONVERSION(int8_t, Int8Type)
+ARROW_STL_CONVERSION(int16_t, Int16Type)
+ARROW_STL_CONVERSION(int32_t, Int32Type)
+ARROW_STL_CONVERSION(int64_t, Int64Type)
+ARROW_STL_CONVERSION(uint8_t, UInt8Type)
+ARROW_STL_CONVERSION(uint16_t, UInt16Type)
+ARROW_STL_CONVERSION(uint32_t, UInt32Type)
+ARROW_STL_CONVERSION(uint64_t, UInt64Type)
+ARROW_STL_CONVERSION(float, FloatType)
+ARROW_STL_CONVERSION(double, DoubleType)
+
+template <>
+struct ConversionTraits<std::string> : public CTypeTraits<std::string> {
+  static Status AppendRow(StringBuilder& builder, const std::string& cell) {
+    return builder.Append(cell);
+  }
+  static std::string GetEntry(const StringArray& array, size_t j) {
+    return array.GetString(j);
+  }
+};
+
+/// Append cell range elements as a single value to the list builder.
+///
+/// Cell range will be added to child builder using AppendListValues<ValueCType>()
+/// if provided. AppendListValues<ValueCType>() has a default implementation, but
+/// it can be specialized by users.
+template <typename ValueCType, typename ListBuilderType, typename Range>
+Status AppendCellRange(ListBuilderType& builder, Range&& cell_range) {
+  constexpr bool is_list_builder = std::is_same<ListBuilderType, ListBuilder>::value;
+  constexpr bool is_large_list_builder =
+      std::is_same<ListBuilderType, LargeListBuilder>::value;
+  static_assert(
+      is_list_builder || is_large_list_builder,
+      "Builder type must be either ListBuilder or LargeListBuilder for appending "
+      "multiple rows.");
+
+  using ChildBuilderType = CBuilderType<ValueCType>;
+  ARROW_RETURN_NOT_OK(builder.Append());
+  auto& value_builder =
+      ::arrow::internal::checked_cast<ChildBuilderType&>(*builder.value_builder());
+
+  // XXX: Remove appended value before returning if status isn't OK?
+  return AppendListValues<ValueCType>(value_builder, std::forward<Range>(cell_range));
+}
+
+template <typename ValueCType>
+struct ConversionTraits<std::vector<ValueCType>>
+    : public CTypeTraits<std::vector<ValueCType>> {
+  static Status AppendRow(ListBuilder& builder, const std::vector<ValueCType>& cell) {
+    return AppendCellRange<ValueCType>(builder, cell);
+  }
+
+  static std::vector<ValueCType> GetEntry(const ListArray& array, size_t j) {
+    using ElementArrayType =
+        typename TypeTraits<typename ConversionTraits<ValueCType>::ArrowType>::ArrayType;
+
+    const ElementArrayType& value_array =
+        ::arrow::internal::checked_cast<const ElementArrayType&>(*array.values());
+
+    std::vector<ValueCType> vec(array.value_length(j));
+    for (int64_t i = 0; i < array.value_length(j); i++) {
+      vec[i] =
+          ConversionTraits<ValueCType>::GetEntry(value_array, array.value_offset(j) + i);
+    }
+    return vec;
+  }
+};
+
+template <class ValueCType, std::size_t N>
+struct ConversionTraits<std::array<ValueCType, N>>
+    : public CTypeTraits<std::array<ValueCType, N>> {
+  static arrow::Status AppendRow(FixedSizeListBuilder& builder,
+                                 const std::array<ValueCType, N>& values) {
+    auto vb =
+        ::arrow::internal::checked_cast<typename CTypeTraits<ValueCType>::BuilderType*>(
+            builder.value_builder());
+    ARROW_RETURN_NOT_OK(builder.Append());
+    return vb->AppendValues(values.data(), N);
+  }
+
+  static std::array<ValueCType, N> GetEntry(const ::arrow::FixedSizeListArray& array,
+                                            size_t j) {
+    using ElementArrayType = typename TypeTraits<
+        typename stl::ConversionTraits<ValueCType>::ArrowType>::ArrayType;
+
+    const ElementArrayType& value_array =
+        ::arrow::internal::checked_cast<const ElementArrayType&>(*array.values());
+
+    std::array<ValueCType, N> arr;
+    for (size_t i = 0; i < N; i++) {
+      arr[i] = stl::ConversionTraits<ValueCType>::GetEntry(value_array,
+                                                           array.value_offset(j) + i);
+    }
+    return arr;
+  }
+};
+
+template <typename Optional>
+struct ConversionTraits<Optional, enable_if_optional_like<Optional>>
+    : public CTypeTraits<typename std::decay<decltype(*std::declval<Optional>())>::type> {
+  using OptionalInnerType =
+      typename std::decay<decltype(*std::declval<Optional>())>::type;
+  using typename CTypeTraits<OptionalInnerType>::ArrowType;
+  using CTypeTraits<OptionalInnerType>::type_singleton;
+
+  static Status AppendRow(typename TypeTraits<ArrowType>::BuilderType& builder,
+                          const Optional& cell) {
+    if (cell) {
+      return ConversionTraits<OptionalInnerType>::AppendRow(builder, *cell);
+    } else {
+      return builder.AppendNull();
+    }
+  }
+};
+
+/// Build an arrow::Schema based upon the types defined in a std::tuple-like structure.
+///
+/// While the type information is available at compile-time, we still need to add the
+/// column names at runtime, thus these methods are not constexpr.
+template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
+struct SchemaFromTuple {
+  using Element = internal::BareTupleElement<N - 1, Tuple>;
+
+  // Implementations that take a vector-like object for the column names.
+
+  /// Recursively build a vector of arrow::Field from the defined types.
+  ///
+  /// In most cases MakeSchema is the better entrypoint for the Schema creation.
+  static std::vector<std::shared_ptr<Field>> MakeSchemaRecursion(
+      const std::vector<std::string>& names) {
+    std::vector<std::shared_ptr<Field>> ret =
+        SchemaFromTuple<Tuple, N - 1>::MakeSchemaRecursion(names);
+    auto type = ConversionTraits<Element>::type_singleton();
+    ret.push_back(field(names[N - 1], type, internal::is_optional_like<Element>::value));
+    return ret;
+  }
+
+  /// Build a Schema from the types of the tuple-like structure passed in as template
+  /// parameter assign the column names at runtime.
+  ///
+  /// An example usage of this API can look like the following:
+  ///
+  /// \code{.cpp}
+  /// using TupleType = std::tuple<int, std::vector<std::string>>;
+  /// std::shared_ptr<Schema> schema =
+  ///   SchemaFromTuple<TupleType>::MakeSchema({"int_column", "list_of_strings_column"});
+  /// \endcode
+  static std::shared_ptr<Schema> MakeSchema(const std::vector<std::string>& names) {
+    return std::make_shared<Schema>(MakeSchemaRecursion(names));
+  }
+
+  // Implementations that take a tuple-like object for the column names.
+
+  /// Recursively build a vector of arrow::Field from the defined types.
+  ///
+  /// In most cases MakeSchema is the better entrypoint for the Schema creation.
+  template <typename NamesTuple>
+  static std::vector<std::shared_ptr<Field>> MakeSchemaRecursionT(
+      const NamesTuple& names) {
+    using std::get;
+
+    std::vector<std::shared_ptr<Field>> ret =
+        SchemaFromTuple<Tuple, N - 1>::MakeSchemaRecursionT(names);
+    std::shared_ptr<DataType> type = ConversionTraits<Element>::type_singleton();
+    ret.push_back(
+        field(get<N - 1>(names), type, internal::is_optional_like<Element>::value));
+    return ret;
+  }
+
+  /// Build a Schema from the types of the tuple-like structure passed in as template
+  /// parameter assign the column names at runtime.
+  ///
+  /// An example usage of this API can look like the following:
+  ///
+  /// \code{.cpp}
+  /// using TupleType = std::tuple<int, std::vector<std::string>>;
+  /// std::shared_ptr<Schema> schema =
+  ///   SchemaFromTuple<TupleType>::MakeSchema({"int_column", "list_of_strings_column"});
+  /// \endcode
+  template <typename NamesTuple>
+  static std::shared_ptr<Schema> MakeSchema(const NamesTuple& names) {
+    return std::make_shared<Schema>(MakeSchemaRecursionT<NamesTuple>(names));
+  }
+};
+
+template <typename Tuple>
+struct SchemaFromTuple<Tuple, 0> {
+  static std::vector<std::shared_ptr<Field>> MakeSchemaRecursion(
+      const std::vector<std::string>& names) {
+    std::vector<std::shared_ptr<Field>> ret;
+    ret.reserve(names.size());
+    return ret;
+  }
+
+  template <typename NamesTuple>
+  static std::vector<std::shared_ptr<Field>> MakeSchemaRecursionT(
+      const NamesTuple& names) {
+    std::vector<std::shared_ptr<Field>> ret;
+    ret.reserve(std::tuple_size<NamesTuple>::value);
+    return ret;
+  }
+};
+
+namespace internal {
+
+template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
+struct CreateBuildersRecursive {
+  static Status Make(MemoryPool* pool,
+                     std::vector<std::unique_ptr<ArrayBuilder>>* builders) {
+    using Element = BareTupleElement<N - 1, Tuple>;
+    std::shared_ptr<DataType> type = ConversionTraits<Element>::type_singleton();
+    ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &builders->at(N - 1)));
+
+    return CreateBuildersRecursive<Tuple, N - 1>::Make(pool, builders);
+  }
+};
+
+template <typename Tuple>
+struct CreateBuildersRecursive<Tuple, 0> {
+  static Status Make(MemoryPool*, std::vector<std::unique_ptr<ArrayBuilder>>*) {
+    return Status::OK();
+  }
+};
+
+template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
+struct RowIterator {
+  static Status Append(const std::vector<std::unique_ptr<ArrayBuilder>>& builders,
+                       const Tuple& row) {
+    using std::get;
+    using Element = BareTupleElement<N - 1, Tuple>;
+    using BuilderType =
+        typename TypeTraits<typename ConversionTraits<Element>::ArrowType>::BuilderType;
+
+    BuilderType& builder =
+        ::arrow::internal::checked_cast<BuilderType&>(*builders[N - 1]);
+    ARROW_RETURN_NOT_OK(ConversionTraits<Element>::AppendRow(builder, get<N - 1>(row)));
+
+    return RowIterator<Tuple, N - 1>::Append(builders, row);
+  }
+};
+
+template <typename Tuple>
+struct RowIterator<Tuple, 0> {
+  static Status Append(const std::vector<std::unique_ptr<ArrayBuilder>>& builders,
+                       const Tuple& row) {
+    return Status::OK();
+  }
+};
+
+template <typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
+struct EnsureColumnTypes {
+  static Status Cast(const Table& table, std::shared_ptr<Table>* table_owner,
+                     const compute::CastOptions& cast_options, compute::ExecContext* ctx,
+                     std::reference_wrapper<const ::arrow::Table>* result) {
+    using Element = BareTupleElement<N - 1, Tuple>;
+    std::shared_ptr<DataType> expected_type = ConversionTraits<Element>::type_singleton();
+
+    if (!table.schema()->field(N - 1)->type()->Equals(*expected_type)) {
+      ARROW_ASSIGN_OR_RAISE(
+          Datum casted,
+          compute::Cast(table.column(N - 1), expected_type, cast_options, ctx));
+      auto new_field = table.schema()->field(N - 1)->WithType(expected_type);
+      ARROW_ASSIGN_OR_RAISE(*table_owner,
+                            table.SetColumn(N - 1, new_field, casted.chunked_array()));
+      *result = **table_owner;
+    }
+
+    return EnsureColumnTypes<Tuple, N - 1>::Cast(result->get(), table_owner, cast_options,
+                                                 ctx, result);
+  }
+};
+
+template <typename Tuple>
+struct EnsureColumnTypes<Tuple, 0> {
+  static Status Cast(const Table& table, std::shared_ptr<Table>* table_owner,
+                     const compute::CastOptions& cast_options, compute::ExecContext* ctx,
+                     std::reference_wrapper<const ::arrow::Table>* result) {
+    return Status::OK();
+  }
+};
+
+template <typename Range, typename Tuple, std::size_t N = std::tuple_size<Tuple>::value>
+struct TupleSetter {
+  static void Fill(const Table& table, Range* rows) {
+    using std::get;
+    using Element = typename std::tuple_element<N - 1, Tuple>::type;
+    using ArrayType =
+        typename TypeTraits<typename ConversionTraits<Element>::ArrowType>::ArrayType;
+
+    auto iter = rows->begin();
+    const ChunkedArray& chunked_array = *table.column(N - 1);
+    for (int i = 0; i < chunked_array.num_chunks(); i++) {
+      const ArrayType& array =
+          ::arrow::internal::checked_cast<const ArrayType&>(*chunked_array.chunk(i));
+      for (int64_t j = 0; j < array.length(); j++) {
+        get<N - 1>(*iter++) = ConversionTraits<Element>::GetEntry(array, j);
+      }
+    }
+
+    return TupleSetter<Range, Tuple, N - 1>::Fill(table, rows);
+  }
+};
+
+template <typename Range, typename Tuple>
+struct TupleSetter<Range, Tuple, 0> {
+  static void Fill(const Table& table, Range* rows) {}
+};
+
+}  // namespace internal
+
+template <typename Range>
+Status TableFromTupleRange(MemoryPool* pool, Range&& rows,
+                           const std::vector<std::string>& names,
+                           std::shared_ptr<Table>* table) {
+  using row_type = typename std::iterator_traits<decltype(std::begin(rows))>::value_type;
+  constexpr std::size_t n_columns = std::tuple_size<row_type>::value;
+
+  std::shared_ptr<Schema> schema = SchemaFromTuple<row_type>::MakeSchema(names);
+
+  std::vector<std::unique_ptr<ArrayBuilder>> builders(n_columns);
+  ARROW_RETURN_NOT_OK(internal::CreateBuildersRecursive<row_type>::Make(pool, &builders));
+
+  for (const auto& row : rows) {
+    ARROW_RETURN_NOT_OK(internal::RowIterator<row_type>::Append(builders, row));
+  }
+
+  std::vector<std::shared_ptr<Array>> arrays;
+  for (const auto& builder : builders) {
+    std::shared_ptr<Array> array;
+    ARROW_RETURN_NOT_OK(builder->Finish(&array));
+    arrays.emplace_back(array);
+  }
+
+  *table = Table::Make(std::move(schema), std::move(arrays));
+
+  return Status::OK();
+}
+
+template <typename Range>
+Status TupleRangeFromTable(const Table& table, const compute::CastOptions& cast_options,
+                           compute::ExecContext* ctx, Range* rows) {
+  using row_type = typename std::decay<decltype(*std::begin(*rows))>::type;
+  constexpr std::size_t n_columns = std::tuple_size<row_type>::value;
+
+  if (table.schema()->num_fields() != n_columns) {
+    return Status::Invalid(
+        "Number of columns in the table does not match the width of the target: ",
+        table.schema()->num_fields(), " != ", n_columns);
+  }
+
+  if (std::size(*rows) != static_cast<size_t>(table.num_rows())) {
+    return Status::Invalid(
+        "Number of rows in the table does not match the size of the target: ",
+        table.num_rows(), " != ", std::size(*rows));
+  }
+
+  // Check that all columns have the correct type, otherwise cast them.
+  std::shared_ptr<Table> table_owner;
+  std::reference_wrapper<const ::arrow::Table> current_table(table);
+
+  ARROW_RETURN_NOT_OK(internal::EnsureColumnTypes<row_type>::Cast(
+      table, &table_owner, cast_options, ctx, &current_table));
+
+  internal::TupleSetter<Range, row_type>::Fill(current_table.get(), rows);
+
+  return Status::OK();
+}
+
+}  // namespace stl
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/stl_allocator.h b/pyarrow/include/arrow/stl_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..82e6aaa8772b9f6f65151fcea7defdeb2a86b4d3
--- /dev/null
+++ b/pyarrow/include/arrow/stl_allocator.h
@@ -0,0 +1,164 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/memory_pool.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace stl {
+
+/// \brief A STL allocator delegating allocations to a Arrow MemoryPool
+template <class T>
+class allocator {
+ public:
+  using value_type = T;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using reference = T&;
+  using const_reference = const T&;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+
+  template <class U>
+  struct rebind {
+    using other = allocator<U>;
+  };
+
+  /// \brief Construct an allocator from the default MemoryPool
+  allocator() noexcept : pool_(default_memory_pool()) {}
+  /// \brief Construct an allocator from the given MemoryPool
+  explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {}
+
+  template <class U>
+  allocator(const allocator<U>& rhs) noexcept : pool_(rhs.pool()) {}
+
+  ~allocator() { pool_ = NULLPTR; }
+
+  pointer address(reference r) const noexcept { return std::addressof(r); }
+
+  const_pointer address(const_reference r) const noexcept { return std::addressof(r); }
+
+  pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) {
+    uint8_t* data;
+    Status s = pool_->Allocate(n * sizeof(T), &data);
+    if (!s.ok()) throw std::bad_alloc();
+    return reinterpret_cast<pointer>(data);
+  }
+
+  void deallocate(pointer p, size_type n) {
+    pool_->Free(reinterpret_cast<uint8_t*>(p), n * sizeof(T));
+  }
+
+  size_type size_max() const noexcept { return size_type(-1) / sizeof(T); }
+
+  template <class U, class... Args>
+  void construct(U* p, Args&&... args) {
+    new (reinterpret_cast<void*>(p)) U(std::forward<Args>(args)...);
+  }
+
+  template <class U>
+  void destroy(U* p) {
+    p->~U();
+  }
+
+  MemoryPool* pool() const noexcept { return pool_; }
+
+ private:
+  MemoryPool* pool_;
+};
+
+/// \brief A MemoryPool implementation delegating allocations to a STL allocator
+///
+/// Note that STL allocators don't provide a resizing operation, and therefore
+/// any buffer resizes will do a full reallocation and copy.
+template <typename Allocator = std::allocator<uint8_t>>
+class STLMemoryPool : public MemoryPool {
+ public:
+  /// \brief Construct a memory pool from the given allocator
+  explicit STLMemoryPool(const Allocator& alloc) : alloc_(alloc) {}
+
+  using MemoryPool::Allocate;
+  using MemoryPool::Free;
+  using MemoryPool::Reallocate;
+
+  Status Allocate(int64_t size, int64_t /*alignment*/, uint8_t** out) override {
+    try {
+      *out = alloc_.allocate(size);
+    } catch (std::bad_alloc& e) {
+      return Status::OutOfMemory(e.what());
+    }
+    stats_.DidAllocateBytes(size);
+    return Status::OK();
+  }
+
+  Status Reallocate(int64_t old_size, int64_t new_size, int64_t /*alignment*/,
+                    uint8_t** ptr) override {
+    uint8_t* old_ptr = *ptr;
+    try {
+      *ptr = alloc_.allocate(new_size);
+    } catch (std::bad_alloc& e) {
+      return Status::OutOfMemory(e.what());
+    }
+    memcpy(*ptr, old_ptr, std::min(old_size, new_size));
+    alloc_.deallocate(old_ptr, old_size);
+    stats_.DidReallocateBytes(old_size, new_size);
+    return Status::OK();
+  }
+
+  void Free(uint8_t* buffer, int64_t size, int64_t /*alignment*/) override {
+    alloc_.deallocate(buffer, size);
+    stats_.DidFreeBytes(size);
+  }
+
+  int64_t bytes_allocated() const override { return stats_.bytes_allocated(); }
+
+  int64_t max_memory() const override { return stats_.max_memory(); }
+
+  int64_t total_bytes_allocated() const override {
+    return stats_.total_bytes_allocated();
+  }
+
+  int64_t num_allocations() const override { return stats_.num_allocations(); }
+
+  std::string backend_name() const override { return "stl"; }
+
+ private:
+  Allocator alloc_;
+  arrow::internal::MemoryPoolStats stats_;
+};
+
+template <class T1, class T2>
+bool operator==(const allocator<T1>& lhs, const allocator<T2>& rhs) noexcept {
+  return lhs.pool() == rhs.pool();
+}
+
+template <class T1, class T2>
+bool operator!=(const allocator<T1>& lhs, const allocator<T2>& rhs) noexcept {
+  return !(lhs == rhs);
+}
+
+}  // namespace stl
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/stl_iterator.h b/pyarrow/include/arrow/stl_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..577066cba0fcd5b044dab9ffa6ff41d4906bda81
--- /dev/null
+++ b/pyarrow/include/arrow/stl_iterator.h
@@ -0,0 +1,304 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <optional>
+#include <utility>
+
+#include "arrow/chunked_array.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace stl {
+
+namespace detail {
+
+template <typename ArrayType>
+struct DefaultValueAccessor {
+  using ValueType = decltype(std::declval<ArrayType>().GetView(0));
+
+  ValueType operator()(const ArrayType& array, int64_t index) {
+    return array.GetView(index);
+  }
+};
+
+}  // namespace detail
+
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ArrayIterator {
+ public:
+  using value_type = std::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ArrayIterator() : array_(NULLPTR), index_(0) {}
+
+  explicit ArrayIterator(const ArrayType& array, int64_t index = 0)
+      : array_(&array), index_(index) {}
+
+  // Value access
+  value_type operator*() const {
+    assert(array_);
+    return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+  }
+
+  value_type operator[](difference_type n) const {
+    assert(array_);
+    return array_->IsNull(index_ + n) ? value_type{} : array_->GetView(index_ + n);
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ArrayIterator& operator++() {
+    ++index_;
+    return *this;
+  }
+  ArrayIterator& operator--() {
+    --index_;
+    return *this;
+  }
+  ArrayIterator operator++(int) {
+    ArrayIterator tmp(*this);
+    ++index_;
+    return tmp;
+  }
+  ArrayIterator operator--(int) {
+    ArrayIterator tmp(*this);
+    --index_;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ArrayIterator operator+(difference_type n) const {
+    return ArrayIterator(*array_, index_ + n);
+  }
+  ArrayIterator operator-(difference_type n) const {
+    return ArrayIterator(*array_, index_ - n);
+  }
+  friend inline ArrayIterator operator+(difference_type diff,
+                                        const ArrayIterator& other) {
+    return ArrayIterator(*other.array_, diff + other.index_);
+  }
+  friend inline ArrayIterator operator-(difference_type diff,
+                                        const ArrayIterator& other) {
+    return ArrayIterator(*other.array_, diff - other.index_);
+  }
+  ArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    return *this;
+  }
+  ArrayIterator& operator-=(difference_type n) {
+    index_ -= n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ArrayIterator& other) const { return index_ == other.index_; }
+  bool operator!=(const ArrayIterator& other) const { return index_ != other.index_; }
+  bool operator<(const ArrayIterator& other) const { return index_ < other.index_; }
+  bool operator>(const ArrayIterator& other) const { return index_ > other.index_; }
+  bool operator<=(const ArrayIterator& other) const { return index_ <= other.index_; }
+  bool operator>=(const ArrayIterator& other) const { return index_ >= other.index_; }
+
+ private:
+  const ArrayType* array_;
+  int64_t index_;
+};
+
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = std::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() noexcept : chunked_array_(NULLPTR), index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array,
+                                int64_t index = 0) noexcept
+      : chunked_array_(&chunked_array), index_(index) {}
+
+  // Value access
+  value_type operator*() const {
+    auto chunk_location = GetChunkLocation(index_);
+    ArrayIterator<ArrayType> target_iterator{
+        arrow::internal::checked_cast<const ArrayType&>(
+            *chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+    return target_iterator[chunk_location.index_in_chunk];
+  }
+
+  value_type operator[](difference_type n) const { return *(*this + n); }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    assert(chunked_array_);
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    assert(chunked_array_);
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    assert(other.chunked_array_);
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& other) {
+    assert(other.chunked_array_);
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    return *this;
+  }
+  ChunkedArrayIterator& operator-=(difference_type n) {
+    (*this) += -n;
+    return *this;
+  }
+
+  // Comparisons
+  bool operator==(const ChunkedArrayIterator& other) const {
+    return index_ == other.index_;
+  }
+  bool operator!=(const ChunkedArrayIterator& other) const {
+    return index_ != other.index_;
+  }
+  bool operator<(const ChunkedArrayIterator& other) const {
+    return index_ < other.index_;
+  }
+  bool operator>(const ChunkedArrayIterator& other) const {
+    return index_ > other.index_;
+  }
+  bool operator<=(const ChunkedArrayIterator& other) const {
+    return index_ <= other.index_;
+  }
+  bool operator>=(const ChunkedArrayIterator& other) const {
+    return index_ >= other.index_;
+  }
+
+ private:
+  arrow::ChunkLocation GetChunkLocation(int64_t index) const {
+    assert(chunked_array_);
+    return chunked_array_->chunk_resolver_.Resolve(index);
+  }
+
+  const ChunkedArray* chunked_array_;
+  int64_t index_;
+};
+
+/// Return an iterator to the beginning of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> Begin(const ChunkedArray& chunked_array) {
+  return ChunkedArrayIterator<ArrayType>(chunked_array);
+}
+
+/// Return an iterator to the end of the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayIterator<ArrayType> End(const ChunkedArray& chunked_array) {
+  return ChunkedArrayIterator<ArrayType>(chunked_array, chunked_array.length());
+}
+
+template <typename ArrayType>
+struct ChunkedArrayRange {
+  const ChunkedArray* chunked_array;
+
+  ChunkedArrayIterator<ArrayType> begin() {
+    return stl::ChunkedArrayIterator<ArrayType>(*chunked_array);
+  }
+  ChunkedArrayIterator<ArrayType> end() {
+    return stl::ChunkedArrayIterator<ArrayType>(*chunked_array, chunked_array->length());
+  }
+};
+
+/// Return an iterable range over the chunked array
+template <typename Type, typename ArrayType = typename TypeTraits<Type>::ArrayType>
+ChunkedArrayRange<ArrayType> Iterate(const ChunkedArray& chunked_array) {
+  return stl::ChunkedArrayRange<ArrayType>{&chunked_array};
+}
+
+}  // namespace stl
+}  // namespace arrow
+
+namespace std {
+
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ArrayIterator<ArrayType>> {
+  using IteratorType = ::arrow::stl::ArrayIterator<ArrayType>;
+  using difference_type = typename IteratorType::difference_type;
+  using value_type = typename IteratorType::value_type;
+  using pointer = typename IteratorType::pointer;
+  using reference = typename IteratorType::reference;
+  using iterator_category = typename IteratorType::iterator_category;
+};
+
+template <typename ArrayType>
+struct iterator_traits<::arrow::stl::ChunkedArrayIterator<ArrayType>> {
+  using IteratorType = ::arrow::stl::ChunkedArrayIterator<ArrayType>;
+  using difference_type = typename IteratorType::difference_type;
+  using value_type = typename IteratorType::value_type;
+  using pointer = typename IteratorType::pointer;
+  using reference = typename IteratorType::reference;
+  using iterator_category = typename IteratorType::iterator_category;
+};
+
+}  // namespace std
diff --git a/pyarrow/include/arrow/table.h b/pyarrow/include/arrow/table.h
new file mode 100644
index 0000000000000000000000000000000000000000..dee6f6fdd3cbd64f653737f8aac4c1ca94eeb6dd
--- /dev/null
+++ b/pyarrow/include/arrow/table.h
@@ -0,0 +1,380 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/chunked_array.h"  // IWYU pragma: keep
+#include "arrow/compare.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class KeyValueMetadata;
+class MemoryPool;
+
+/// \class Table
+/// \brief Logical table as sequence of chunked arrays
+class ARROW_EXPORT Table {
+ public:
+  virtual ~Table() = default;
+
+  /// \brief Construct a Table from schema and columns
+  ///
+  /// If columns is zero-length, the table's number of rows is zero
+  ///
+  /// \param[in] schema The table schema (column types)
+  /// \param[in] columns The table's columns as chunked arrays
+  /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
+  static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
+                                     std::vector<std::shared_ptr<ChunkedArray>> columns,
+                                     int64_t num_rows = -1);
+
+  /// \brief Construct a Table from schema and arrays
+  ///
+  /// \param[in] schema The table schema (column types)
+  /// \param[in] arrays The table's columns as arrays
+  /// \param[in] num_rows number of rows in table, -1 (default) to infer from columns
+  static std::shared_ptr<Table> Make(std::shared_ptr<Schema> schema,
+                                     const std::vector<std::shared_ptr<Array>>& arrays,
+                                     int64_t num_rows = -1);
+
+  /// \brief Create an empty Table of a given schema
+  ///
+  /// The output Table will be created with a single empty chunk per column.
+  ///
+  /// \param[in] schema the schema of the empty Table
+  /// \param[in] pool the memory pool to allocate memory from
+  /// \return the resulting Table
+  static Result<std::shared_ptr<Table>> MakeEmpty(
+      std::shared_ptr<Schema> schema, MemoryPool* pool = default_memory_pool());
+
+  /// \brief Construct a Table from a RecordBatchReader.
+  ///
+  /// \param[in] reader the arrow::RecordBatchReader that produces batches
+  static Result<std::shared_ptr<Table>> FromRecordBatchReader(RecordBatchReader* reader);
+
+  /// \brief Construct a Table from RecordBatches, using schema supplied by the first
+  /// RecordBatch.
+  ///
+  /// \param[in] batches a std::vector of record batches
+  static Result<std::shared_ptr<Table>> FromRecordBatches(
+      const std::vector<std::shared_ptr<RecordBatch>>& batches);
+
+  /// \brief Construct a Table from RecordBatches, using supplied schema. There may be
+  /// zero record batches
+  ///
+  /// \param[in] schema the arrow::Schema for each batch
+  /// \param[in] batches a std::vector of record batches
+  static Result<std::shared_ptr<Table>> FromRecordBatches(
+      std::shared_ptr<Schema> schema,
+      const std::vector<std::shared_ptr<RecordBatch>>& batches);
+
+  /// \brief Construct a Table from a chunked StructArray. One column will be produced
+  /// for each field of the StructArray.
+  ///
+  /// \param[in] array a chunked StructArray
+  static Result<std::shared_ptr<Table>> FromChunkedStructArray(
+      const std::shared_ptr<ChunkedArray>& array);
+
+  /// \brief Return the table schema
+  const std::shared_ptr<Schema>& schema() const { return schema_; }
+
+  /// \brief Return a column by index
+  virtual std::shared_ptr<ChunkedArray> column(int i) const = 0;
+
+  /// \brief Return vector of all columns for table
+  virtual const std::vector<std::shared_ptr<ChunkedArray>>& columns() const = 0;
+
+  /// Return a column's field by index
+  std::shared_ptr<Field> field(int i) const { return schema_->field(i); }
+
+  /// \brief Return vector of all fields for table
+  std::vector<std::shared_ptr<Field>> fields() const;
+
+  /// \brief Construct a zero-copy slice of the table with the
+  /// indicated offset and length
+  ///
+  /// \param[in] offset the index of the first row in the constructed
+  /// slice
+  /// \param[in] length the number of rows of the slice. If there are not enough
+  /// rows in the table, the length will be adjusted accordingly
+  ///
+  /// \return a new object wrapped in std::shared_ptr<Table>
+  virtual std::shared_ptr<Table> Slice(int64_t offset, int64_t length) const = 0;
+
+  /// \brief Slice from first row at offset until end of the table
+  std::shared_ptr<Table> Slice(int64_t offset) const { return Slice(offset, num_rows_); }
+
+  /// \brief Return a column by name
+  /// \param[in] name field name
+  /// \return an Array or null if no field was found
+  std::shared_ptr<ChunkedArray> GetColumnByName(const std::string& name) const {
+    auto i = schema_->GetFieldIndex(name);
+    return i == -1 ? NULLPTR : column(i);
+  }
+
+  /// \brief Remove column from the table, producing a new Table
+  virtual Result<std::shared_ptr<Table>> RemoveColumn(int i) const = 0;
+
+  /// \brief Add column to the table, producing a new Table
+  virtual Result<std::shared_ptr<Table>> AddColumn(
+      int i, std::shared_ptr<Field> field_arg,
+      std::shared_ptr<ChunkedArray> column) const = 0;
+
+  /// \brief Replace a column in the table, producing a new Table
+  virtual Result<std::shared_ptr<Table>> SetColumn(
+      int i, std::shared_ptr<Field> field_arg,
+      std::shared_ptr<ChunkedArray> column) const = 0;
+
+  /// \brief Return names of all columns
+  std::vector<std::string> ColumnNames() const;
+
+  /// \brief Rename columns with provided names
+  Result<std::shared_ptr<Table>> RenameColumns(
+      const std::vector<std::string>& names) const;
+
+  /// \brief Return new table with specified columns
+  Result<std::shared_ptr<Table>> SelectColumns(const std::vector<int>& indices) const;
+
+  /// \brief Replace schema key-value metadata with new metadata
+  /// \since 0.5.0
+  ///
+  /// \param[in] metadata new KeyValueMetadata
+  /// \return new Table
+  virtual std::shared_ptr<Table> ReplaceSchemaMetadata(
+      const std::shared_ptr<const KeyValueMetadata>& metadata) const = 0;
+
+  /// \brief Flatten the table, producing a new Table.  Any column with a
+  /// struct type will be flattened into multiple columns
+  ///
+  /// \param[in] pool The pool for buffer allocations, if any
+  virtual Result<std::shared_ptr<Table>> Flatten(
+      MemoryPool* pool = default_memory_pool()) const = 0;
+
+  /// \return PrettyPrint representation suitable for debugging
+  std::string ToString() const;
+
+  /// \brief Perform cheap validation checks to determine obvious inconsistencies
+  /// within the table's schema and internal data.
+  ///
+  /// This is O(k*m) where k is the total number of field descendents,
+  /// and m is the number of chunks.
+  ///
+  /// \return Status
+  virtual Status Validate() const = 0;
+
+  /// \brief Perform extensive validation checks to determine inconsistencies
+  /// within the table's schema and internal data.
+  ///
+  /// This is O(k*n) where k is the total number of field descendents,
+  /// and n is the number of rows.
+  ///
+  /// \return Status
+  virtual Status ValidateFull() const = 0;
+
+  /// \brief Return the number of columns in the table
+  int num_columns() const { return schema_->num_fields(); }
+
+  /// \brief Return the number of rows (equal to each column's logical length)
+  int64_t num_rows() const { return num_rows_; }
+
+  /// \brief Determine if two tables are equal
+  ///
+  /// \param[in] other the table to compare with
+  /// \param[in] opts the options for equality comparisons
+  /// \return true if two tables are equal
+  bool Equals(const Table& other, const EqualOptions& opts) const;
+
+  /// \brief Determine if two tables are equal
+  ///
+  /// \param[in] other the table to compare with
+  /// \param[in] check_metadata if true, the schema metadata will be compared,
+  ///            regardless of the value set in \ref EqualOptions::use_metadata
+  /// \param[in] opts the options for equality comparisons
+  /// \return true if two tables are equal
+  bool Equals(const Table& other, bool check_metadata = false,
+              const EqualOptions& opts = EqualOptions::Defaults()) const {
+    return Equals(other, opts.use_metadata(check_metadata));
+  }
+
+  /// \brief Make a new table by combining the chunks this table has.
+  ///
+  /// All the underlying chunks in the ChunkedArray of each column are
+  /// concatenated into zero or one chunk.
+  ///
+  /// To avoid buffer overflow, binary columns may be combined into
+  /// multiple chunks. Chunks will have the maximum possible length.
+  ///
+  /// \param[in] pool The pool for buffer allocations
+  Result<std::shared_ptr<Table>> CombineChunks(
+      MemoryPool* pool = default_memory_pool()) const;
+
+  /// \brief Make a new record batch by combining the chunks this table has.
+  ///
+  /// All the underlying chunks in the ChunkedArray of each column are
+  /// concatenated into a single chunk.
+  ///
+  /// \param[in] pool The pool for buffer allocations
+  Result<std::shared_ptr<RecordBatch>> CombineChunksToBatch(
+      MemoryPool* pool = default_memory_pool()) const;
+
+ protected:
+  Table();
+
+  std::shared_ptr<Schema> schema_;
+  int64_t num_rows_;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Table);
+};
+
+/// \brief Compute a stream of record batches from a (possibly chunked) Table
+///
+/// The conversion is zero-copy: each record batch is a view over a slice
+/// of the table's columns.
+///
+/// The table is expected to be valid prior to using it with the batch reader.
+class ARROW_EXPORT TableBatchReader : public RecordBatchReader {
+ public:
+  /// \brief Construct a TableBatchReader for the given table
+  explicit TableBatchReader(const Table& table);
+  explicit TableBatchReader(std::shared_ptr<Table> table);
+
+  std::shared_ptr<Schema> schema() const override;
+
+  Status ReadNext(std::shared_ptr<RecordBatch>* out) override;
+
+  /// \brief Set the desired maximum number of rows for record batches
+  ///
+  /// The actual number of rows in each record batch may be smaller, depending
+  /// on actual chunking characteristics of each table column.
+  void set_chunksize(int64_t chunksize);
+
+ private:
+  std::shared_ptr<Table> owned_table_;
+  const Table& table_;
+  std::vector<ChunkedArray*> column_data_;
+  std::vector<int> chunk_numbers_;
+  std::vector<int64_t> chunk_offsets_;
+  int64_t absolute_row_position_;
+  int64_t max_chunksize_;
+};
+
+/// \brief Controls the behavior of ConcatenateTables().
+struct ARROW_EXPORT ConcatenateTablesOptions {
+  /// If true, the schemas of the tables will be first unified with fields of
+  /// the same name being merged, according to `field_merge_options`, then each
+  /// table will be promoted to the unified schema before being concatenated.
+  /// Otherwise, all tables should have the same schema. Each column in the output table
+  /// is the result of concatenating the corresponding columns in all input tables.
+  bool unify_schemas = false;
+
+  /// options to control how fields are merged when unifying schemas
+  ///
+  /// This field will be ignored if unify_schemas is false
+  Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults();
+
+  static ConcatenateTablesOptions Defaults() { return {}; }
+};
+
+/// \brief Construct a new table from multiple input tables.
+///
+/// The new table is assembled from existing column chunks without copying,
+/// if schemas are identical. If schemas do not match exactly and
+/// unify_schemas is enabled in options (off by default), an attempt is
+/// made to unify them, and then column chunks are converted to their
+/// respective unified datatype, which will probably incur a copy.
+/// :func:`arrow::PromoteTableToSchema` is used to unify schemas.
+///
+/// Tables are concatenated in order they are provided in and the order of
+/// rows within tables will be preserved.
+///
+/// \param[in] tables a std::vector of Tables to be concatenated
+/// \param[in] options specify how to unify schema of input tables
+/// \param[in] memory_pool MemoryPool to be used if null-filled arrays need to
+/// be created or if existing column chunks need to endure type conversion
+/// \return new Table
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> ConcatenateTables(
+    const std::vector<std::shared_ptr<Table>>& tables,
+    ConcatenateTablesOptions options = ConcatenateTablesOptions::Defaults(),
+    MemoryPool* memory_pool = default_memory_pool());
+
+namespace compute {
+class CastOptions;
+}
+
+/// \brief Promotes a table to conform to the given schema.
+///
+/// If a field in the schema does not have a corresponding column in
+/// the table, a column of nulls will be added to the resulting table.
+/// If the corresponding column is of type Null, it will be promoted
+/// to the type specified by schema, with null values filled. The
+/// column will be casted to the type specified by the schema.
+///
+/// Returns an error:
+/// - if the corresponding column's type is not compatible with the
+///   schema.
+/// - if there is a column in the table that does not exist in the schema.
+/// - if the cast fails or casting would be required but is not available.
+///
+/// \param[in] table the input Table
+/// \param[in] schema the target schema to promote to
+/// \param[in] pool The memory pool to be used if null-filled arrays need to
+/// be created.
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> PromoteTableToSchema(
+    const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
+    MemoryPool* pool = default_memory_pool());
+
+/// \brief Promotes a table to conform to the given schema.
+///
+/// If a field in the schema does not have a corresponding column in
+/// the table, a column of nulls will be added to the resulting table.
+/// If the corresponding column is of type Null, it will be promoted
+/// to the type specified by schema, with null values filled. The column
+/// will be casted to the type specified by the schema.
+///
+/// Returns an error:
+/// - if the corresponding column's type is not compatible with the
+///   schema.
+/// - if there is a column in the table that does not exist in the schema.
+/// - if the cast fails or casting would be required but is not available.
+///
+/// \param[in] table the input Table
+/// \param[in] schema the target schema to promote to
+/// \param[in] options The cast options to allow promotion of types
+/// \param[in] pool The memory pool to be used if null-filled arrays need to
+/// be created.
+ARROW_EXPORT
+Result<std::shared_ptr<Table>> PromoteTableToSchema(
+    const std::shared_ptr<Table>& table, const std::shared_ptr<Schema>& schema,
+    const compute::CastOptions& options, MemoryPool* pool = default_memory_pool());
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/table_builder.h b/pyarrow/include/arrow/table_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..671cc4ab97996e0eabe5b86af62af4340f1223f6
--- /dev/null
+++ b/pyarrow/include/arrow/table_builder.h
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/array/builder_base.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class MemoryPool;
+class RecordBatch;
+
+/// \class RecordBatchBuilder
+/// \brief Helper class for creating record batches iteratively given a known
+/// schema
+class ARROW_EXPORT RecordBatchBuilder {
+ public:
+  /// \brief Create and initialize a RecordBatchBuilder
+  /// \param[in] schema The schema for the record batch
+  /// \param[in] pool A MemoryPool to use for allocations
+  /// \return the created builder instance
+  static Result<std::unique_ptr<RecordBatchBuilder>> Make(
+      const std::shared_ptr<Schema>& schema, MemoryPool* pool);
+
+  /// \brief Create and initialize a RecordBatchBuilder
+  /// \param[in] schema The schema for the record batch
+  /// \param[in] pool A MemoryPool to use for allocations
+  /// \param[in] initial_capacity The initial capacity for the builders
+  /// \return the created builder instance
+  static Result<std::unique_ptr<RecordBatchBuilder>> Make(
+      const std::shared_ptr<Schema>& schema, MemoryPool* pool, int64_t initial_capacity);
+
+  /// \brief Get base pointer to field builder
+  /// \param i the field index
+  /// \return pointer to ArrayBuilder
+  ArrayBuilder* GetField(int i) { return raw_field_builders_[i]; }
+
+  /// \brief Return field builder casted to indicated specific builder type
+  /// \param i the field index
+  /// \return pointer to template type
+  template <typename T>
+  T* GetFieldAs(int i) {
+    return internal::checked_cast<T*>(raw_field_builders_[i]);
+  }
+
+  /// \brief Finish current batch and optionally reset
+  /// \param[in] reset_builders the resulting RecordBatch
+  /// \return the resulting RecordBatch
+  Result<std::shared_ptr<RecordBatch>> Flush(bool reset_builders);
+
+  /// \brief Finish current batch and reset
+  /// \return the resulting RecordBatch
+  Result<std::shared_ptr<RecordBatch>> Flush();
+
+  /// \brief Set the initial capacity for new builders
+  void SetInitialCapacity(int64_t capacity);
+
+  /// \brief The initial capacity for builders
+  int64_t initial_capacity() const { return initial_capacity_; }
+
+  /// \brief The number of fields in the schema
+  int num_fields() const { return schema_->num_fields(); }
+
+  /// \brief The number of fields in the schema
+  std::shared_ptr<Schema> schema() const { return schema_; }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(RecordBatchBuilder);
+
+  RecordBatchBuilder(const std::shared_ptr<Schema>& schema, MemoryPool* pool,
+                     int64_t initial_capacity);
+
+  Status CreateBuilders();
+  Status InitBuilders();
+
+  std::shared_ptr<Schema> schema_;
+  int64_t initial_capacity_;
+  MemoryPool* pool_;
+
+  std::vector<std::unique_ptr<ArrayBuilder>> field_builders_;
+  std::vector<ArrayBuilder*> raw_field_builders_;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/tensor.h b/pyarrow/include/arrow/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..beb62a11bdce9f6a31cb3cbb296169fc3bf8ee92
--- /dev/null
+++ b/pyarrow/include/arrow/tensor.h
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+constexpr bool is_tensor_supported(Type::type type_id) {
+  switch (type_id) {
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::HALF_FLOAT:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+namespace internal {
+
+ARROW_EXPORT
+Status ComputeRowMajorStrides(const FixedWidthType& type,
+                              const std::vector<int64_t>& shape,
+                              std::vector<int64_t>* strides);
+
+ARROW_EXPORT
+Status ComputeColumnMajorStrides(const FixedWidthType& type,
+                                 const std::vector<int64_t>& shape,
+                                 std::vector<int64_t>* strides);
+
+ARROW_EXPORT
+bool IsTensorStridesContiguous(const std::shared_ptr<DataType>& type,
+                               const std::vector<int64_t>& shape,
+                               const std::vector<int64_t>& strides);
+
+ARROW_EXPORT
+Status ValidateTensorParameters(const std::shared_ptr<DataType>& type,
+                                const std::shared_ptr<Buffer>& data,
+                                const std::vector<int64_t>& shape,
+                                const std::vector<int64_t>& strides,
+                                const std::vector<std::string>& dim_names);
+
+ARROW_EXPORT
+Status RecordBatchToTensor(const RecordBatch& batch, bool null_to_nan, bool row_major,
+                           MemoryPool* pool, std::shared_ptr<Tensor>* tensor);
+
+}  // namespace internal
+
+class ARROW_EXPORT Tensor {
+ public:
+  /// \brief Create a Tensor with full parameters
+  ///
+  /// This factory function will return Status::Invalid when the parameters are
+  /// inconsistent
+  ///
+  /// \param[in] type The data type of the tensor values
+  /// \param[in] data The buffer of the tensor content
+  /// \param[in] shape The shape of the tensor
+  /// \param[in] strides The strides of the tensor
+  ///            (if this is empty, the data assumed to be row-major)
+  /// \param[in] dim_names The names of the tensor dimensions
+  static inline Result<std::shared_ptr<Tensor>> Make(
+      const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+      const std::vector<int64_t>& shape, const std::vector<int64_t>& strides = {},
+      const std::vector<std::string>& dim_names = {}) {
+    ARROW_RETURN_NOT_OK(
+        internal::ValidateTensorParameters(type, data, shape, strides, dim_names));
+    return std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  }
+
+  virtual ~Tensor() = default;
+
+  /// Constructor with no dimension names or strides, data assumed to be row-major
+  Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+         const std::vector<int64_t>& shape);
+
+  /// Constructor with non-negative strides
+  Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+         const std::vector<int64_t>& shape, const std::vector<int64_t>& strides);
+
+  /// Constructor with non-negative strides and dimension names
+  Tensor(const std::shared_ptr<DataType>& type, const std::shared_ptr<Buffer>& data,
+         const std::vector<int64_t>& shape, const std::vector<int64_t>& strides,
+         const std::vector<std::string>& dim_names);
+
+  std::shared_ptr<DataType> type() const { return type_; }
+  std::shared_ptr<Buffer> data() const { return data_; }
+
+  const uint8_t* raw_data() const { return data_->data(); }
+  uint8_t* raw_mutable_data() { return data_->mutable_data(); }
+
+  const std::vector<int64_t>& shape() const { return shape_; }
+  const std::vector<int64_t>& strides() const { return strides_; }
+
+  int ndim() const { return static_cast<int>(shape_.size()); }
+
+  const std::vector<std::string>& dim_names() const { return dim_names_; }
+  const std::string& dim_name(int i) const;
+
+  /// Total number of value cells in the tensor
+  int64_t size() const;
+
+  /// Return true if the underlying data buffer is mutable
+  bool is_mutable() const { return data_->is_mutable(); }
+
+  /// Either row major or column major
+  bool is_contiguous() const;
+
+  /// AKA "C order"
+  bool is_row_major() const;
+
+  /// AKA "Fortran order"
+  bool is_column_major() const;
+
+  Type::type type_id() const;
+
+  bool Equals(const Tensor& other, const EqualOptions& = EqualOptions::Defaults()) const;
+
+  /// Compute the number of non-zero values in the tensor
+  Result<int64_t> CountNonZero() const;
+
+  /// Return the offset of the given index on the given strides
+  static int64_t CalculateValueOffset(const std::vector<int64_t>& strides,
+                                      const std::vector<int64_t>& index) {
+    const int64_t n = static_cast<int64_t>(index.size());
+    int64_t offset = 0;
+    for (int64_t i = 0; i < n; ++i) {
+      offset += index[i] * strides[i];
+    }
+    return offset;
+  }
+
+  int64_t CalculateValueOffset(const std::vector<int64_t>& index) const {
+    return Tensor::CalculateValueOffset(strides_, index);
+  }
+
+  /// Returns the value at the given index without data-type and bounds checks
+  template <typename ValueType>
+  const typename ValueType::c_type& Value(const std::vector<int64_t>& index) const {
+    using c_type = typename ValueType::c_type;
+    const int64_t offset = CalculateValueOffset(index);
+    const c_type* ptr = reinterpret_cast<const c_type*>(raw_data() + offset);
+    return *ptr;
+  }
+
+  Status Validate() const {
+    return internal::ValidateTensorParameters(type_, data_, shape_, strides_, dim_names_);
+  }
+
+ protected:
+  Tensor() {}
+
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<Buffer> data_;
+  std::vector<int64_t> shape_;
+  std::vector<int64_t> strides_;
+
+  /// These names are optional
+  std::vector<std::string> dim_names_;
+
+  template <typename SparseIndexType>
+  friend class SparseTensorImpl;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Tensor);
+};
+
+template <typename TYPE>
+class NumericTensor : public Tensor {
+ public:
+  using TypeClass = TYPE;
+  using value_type = typename TypeClass::c_type;
+
+  /// \brief Create a NumericTensor with full parameters
+  ///
+  /// This factory function will return Status::Invalid when the parameters are
+  /// inconsistent
+  ///
+  /// \param[in] data The buffer of the tensor content
+  /// \param[in] shape The shape of the tensor
+  /// \param[in] strides The strides of the tensor
+  ///            (if this is empty, the data assumed to be row-major)
+  /// \param[in] dim_names The names of the tensor dimensions
+  static Result<std::shared_ptr<NumericTensor<TYPE>>> Make(
+      const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+      const std::vector<int64_t>& strides = {},
+      const std::vector<std::string>& dim_names = {}) {
+    ARROW_RETURN_NOT_OK(internal::ValidateTensorParameters(
+        TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names));
+    return std::make_shared<NumericTensor<TYPE>>(data, shape, strides, dim_names);
+  }
+
+  /// Constructor with non-negative strides and dimension names
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+                const std::vector<int64_t>& strides,
+                const std::vector<std::string>& dim_names)
+      : Tensor(TypeTraits<TYPE>::type_singleton(), data, shape, strides, dim_names) {}
+
+  /// Constructor with no dimension names or strides, data assumed to be row-major
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape)
+      : NumericTensor(data, shape, {}, {}) {}
+
+  /// Constructor with non-negative strides
+  NumericTensor(const std::shared_ptr<Buffer>& data, const std::vector<int64_t>& shape,
+                const std::vector<int64_t>& strides)
+      : NumericTensor(data, shape, strides, {}) {}
+
+  const value_type& Value(const std::vector<int64_t>& index) const {
+    return Tensor::Value<TypeClass>(index);
+  }
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/tensor/converter.h b/pyarrow/include/arrow/tensor/converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..408ab22305fff1665956ee8bb831fbc062b9994c
--- /dev/null
+++ b/pyarrow/include/arrow/tensor/converter.h
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/sparse_tensor.h"  // IWYU pragma: export
+
+#include <memory>
+
+namespace arrow {
+namespace internal {
+
+struct SparseTensorConverterMixin {
+  static bool IsNonZero(const uint8_t val) { return val != 0; }
+
+  static void AssignIndex(uint8_t* indices, int64_t val, const int elsize);
+
+  static int64_t GetIndexValue(const uint8_t* value_ptr, const int elsize);
+};
+
+Status MakeSparseCOOTensorFromTensor(const Tensor& tensor,
+                                     const std::shared_ptr<DataType>& index_value_type,
+                                     MemoryPool* pool,
+                                     std::shared_ptr<SparseIndex>* out_sparse_index,
+                                     std::shared_ptr<Buffer>* out_data);
+
+Status MakeSparseCSXMatrixFromTensor(SparseMatrixCompressedAxis axis,
+                                     const Tensor& tensor,
+                                     const std::shared_ptr<DataType>& index_value_type,
+                                     MemoryPool* pool,
+                                     std::shared_ptr<SparseIndex>* out_sparse_index,
+                                     std::shared_ptr<Buffer>* out_data);
+
+Status MakeSparseCSFTensorFromTensor(const Tensor& tensor,
+                                     const std::shared_ptr<DataType>& index_value_type,
+                                     MemoryPool* pool,
+                                     std::shared_ptr<SparseIndex>* out_sparse_index,
+                                     std::shared_ptr<Buffer>* out_data);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCOOTensor(
+    MemoryPool* pool, const SparseCOOTensor* sparse_tensor);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSRMatrix(
+    MemoryPool* pool, const SparseCSRMatrix* sparse_tensor);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSCMatrix(
+    MemoryPool* pool, const SparseCSCMatrix* sparse_tensor);
+
+Result<std::shared_ptr<Tensor>> MakeTensorFromSparseCSFTensor(
+    MemoryPool* pool, const SparseCSFTensor* sparse_tensor);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/async_test_util.h b/pyarrow/include/arrow/testing/async_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..7066bbb63d2a5775454d5cffc82df7faf0056db8
--- /dev/null
+++ b/pyarrow/include/arrow/testing/async_test_util.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/async_generator.h"
+#include "arrow/util/future.h"
+
+namespace arrow {
+namespace util {
+
+template <typename T>
+AsyncGenerator<T> AsyncVectorIt(std::vector<T> v) {
+  return MakeVectorGenerator(std::move(v));
+}
+
+template <typename T>
+AsyncGenerator<T> FailAt(AsyncGenerator<T> src, int failing_index) {
+  auto index = std::make_shared<std::atomic<int>>(0);
+  return [src, index, failing_index]() {
+    auto idx = index->fetch_add(1);
+    if (idx >= failing_index) {
+      return Future<T>::MakeFinished(Status::Invalid("XYZ"));
+    }
+    return src();
+  };
+}
+
+template <typename T>
+AsyncGenerator<T> SlowdownABit(AsyncGenerator<T> source) {
+  return MakeMappedGenerator(std::move(source), [](const T& res) {
+    return SleepABitAsync().Then([res]() { return res; });
+  });
+}
+
+template <typename T>
+class TrackingGenerator {
+ public:
+  explicit TrackingGenerator(AsyncGenerator<T> source)
+      : state_(std::make_shared<State>(std::move(source))) {}
+
+  Future<T> operator()() {
+    state_->num_read++;
+    return state_->source();
+  }
+
+  int num_read() { return state_->num_read.load(); }
+
+ private:
+  struct State {
+    explicit State(AsyncGenerator<T> source) : source(std::move(source)), num_read(0) {}
+
+    AsyncGenerator<T> source;
+    std::atomic<int> num_read;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/builder.h b/pyarrow/include/arrow/testing/builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..6beb7760e3bbf622557aa576590cc1bb3665ea5c
--- /dev/null
+++ b/pyarrow/include/arrow/testing/builder.h
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_time.h"
+#include "arrow/buffer.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/visit_type_inline.h"
+
+namespace arrow {
+
+// ArrayFromVector: construct an Array from vectors of C values
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ArrayFromVector(const std::shared_ptr<DataType>& type,
+                     const std::vector<bool>& is_valid, const std::vector<C_TYPE>& values,
+                     std::shared_ptr<Array>* out) {
+  auto type_id = TYPE::type_id;
+  ASSERT_EQ(type_id, type->id())
+      << "template parameter and concrete DataType instance don't agree";
+
+  std::unique_ptr<ArrayBuilder> builder_ptr;
+  ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr));
+  // Get the concrete builder class to access its Append() specializations
+  auto& builder = dynamic_cast<typename TypeTraits<TYPE>::BuilderType&>(*builder_ptr);
+
+  for (size_t i = 0; i < values.size(); ++i) {
+    if (is_valid[i]) {
+      ASSERT_OK(builder.Append(values[i]));
+    } else {
+      ASSERT_OK(builder.AppendNull());
+    }
+  }
+  ASSERT_OK(builder.Finish(out));
+}
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ArrayFromVector(const std::shared_ptr<DataType>& type,
+                     const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) {
+  auto type_id = TYPE::type_id;
+  ASSERT_EQ(type_id, type->id())
+      << "template parameter and concrete DataType instance don't agree";
+
+  std::unique_ptr<ArrayBuilder> builder_ptr;
+  ASSERT_OK(MakeBuilder(default_memory_pool(), type, &builder_ptr));
+  // Get the concrete builder class to access its Append() specializations
+  auto& builder = dynamic_cast<typename TypeTraits<TYPE>::BuilderType&>(*builder_ptr);
+
+  for (size_t i = 0; i < values.size(); ++i) {
+    ASSERT_OK(builder.Append(values[i]));
+  }
+  ASSERT_OK(builder.Finish(out));
+}
+
+// Overloads without a DataType argument, for parameterless types
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ArrayFromVector(const std::vector<bool>& is_valid, const std::vector<C_TYPE>& values,
+                     std::shared_ptr<Array>* out) {
+  auto type = TypeTraits<TYPE>::type_singleton();
+  ArrayFromVector<TYPE, C_TYPE>(type, is_valid, values, out);
+}
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ArrayFromVector(const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) {
+  auto type = TypeTraits<TYPE>::type_singleton();
+  ArrayFromVector<TYPE, C_TYPE>(type, values, out);
+}
+
+// ChunkedArrayFromVector: construct a ChunkedArray from vectors of C values
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ChunkedArrayFromVector(const std::shared_ptr<DataType>& type,
+                            const std::vector<std::vector<bool>>& is_valid,
+                            const std::vector<std::vector<C_TYPE>>& values,
+                            std::shared_ptr<ChunkedArray>* out) {
+  ArrayVector chunks;
+  ASSERT_EQ(is_valid.size(), values.size());
+  for (size_t i = 0; i < values.size(); ++i) {
+    std::shared_ptr<Array> array;
+    ArrayFromVector<TYPE, C_TYPE>(type, is_valid[i], values[i], &array);
+    chunks.push_back(array);
+  }
+  *out = std::make_shared<ChunkedArray>(chunks);
+}
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ChunkedArrayFromVector(const std::shared_ptr<DataType>& type,
+                            const std::vector<std::vector<C_TYPE>>& values,
+                            std::shared_ptr<ChunkedArray>* out) {
+  ArrayVector chunks;
+  for (size_t i = 0; i < values.size(); ++i) {
+    std::shared_ptr<Array> array;
+    ArrayFromVector<TYPE, C_TYPE>(type, values[i], &array);
+    chunks.push_back(array);
+  }
+  *out = std::make_shared<ChunkedArray>(chunks);
+}
+
+// Overloads without a DataType argument, for parameterless types
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ChunkedArrayFromVector(const std::vector<std::vector<bool>>& is_valid,
+                            const std::vector<std::vector<C_TYPE>>& values,
+                            std::shared_ptr<ChunkedArray>* out) {
+  auto type = TypeTraits<TYPE>::type_singleton();
+  ChunkedArrayFromVector<TYPE, C_TYPE>(type, is_valid, values, out);
+}
+
+template <typename TYPE, typename C_TYPE = typename TYPE::c_type>
+void ChunkedArrayFromVector(const std::vector<std::vector<C_TYPE>>& values,
+                            std::shared_ptr<ChunkedArray>* out) {
+  auto type = TypeTraits<TYPE>::type_singleton();
+  ChunkedArrayFromVector<TYPE, C_TYPE>(type, values, out);
+}
+
+template <typename BuilderType>
+void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr<Array>* out) {
+  ASSERT_OK_AND_ASSIGN(*out, builder->Finish());
+  AssertZeroPadded(**out);
+  TestInitialized(**out);
+}
+
+template <class T, class Builder>
+Status MakeArray(const std::vector<uint8_t>& valid_bytes, const std::vector<T>& values,
+                 int64_t size, Builder* builder, std::shared_ptr<Array>* out) {
+  // Append the first 1000
+  for (int64_t i = 0; i < size; ++i) {
+    if (valid_bytes[i] > 0) {
+      RETURN_NOT_OK(builder->Append(values[i]));
+    } else {
+      RETURN_NOT_OK(builder->AppendNull());
+    }
+  }
+  return builder->Finish(out);
+}
+
+template <typename Fn>
+struct VisitBuilder {
+  template <typename T, typename BuilderType = typename TypeTraits<T>::BuilderType,
+            // need to let SFINAE drop this Visit when it would result in
+            // [](NullBuilder*){}(double_builder)
+            typename = decltype(std::declval<Fn>()(std::declval<BuilderType*>()))>
+  Status Visit(const T&, ArrayBuilder* builder, Fn&& fn) {
+    fn(internal::checked_cast<BuilderType*>(builder));
+    return Status::OK();
+  }
+
+  Status Visit(const DataType& t, ArrayBuilder* builder, Fn&& fn) {
+    return Status::NotImplemented("visiting builders of type ", t);
+  }
+};
+
+template <typename Fn>
+Result<std::shared_ptr<Array>> ArrayFromBuilderVisitor(
+    const std::shared_ptr<DataType>& type, int64_t initial_capacity,
+    int64_t visitor_repetitions, Fn&& fn) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &builder));
+
+  if (initial_capacity != 0) {
+    RETURN_NOT_OK(builder->Resize(initial_capacity));
+  }
+
+  VisitBuilder<Fn> visitor;
+  for (int64_t i = 0; i < visitor_repetitions; ++i) {
+    RETURN_NOT_OK(
+        VisitTypeInline(*builder->type(), &visitor, builder.get(), std::forward<Fn>(fn)));
+  }
+
+  std::shared_ptr<Array> out;
+  RETURN_NOT_OK(builder->Finish(&out));
+  return out;
+}
+
+template <typename Fn>
+Result<std::shared_ptr<Array>> ArrayFromBuilderVisitor(
+    const std::shared_ptr<DataType>& type, int64_t length, Fn&& fn) {
+  return ArrayFromBuilderVisitor(type, length, length, std::forward<Fn>(fn));
+}
+
+template <typename T>
+static inline Status GetBitmapFromVector(const std::vector<T>& is_valid,
+                                         std::shared_ptr<Buffer>* result) {
+  size_t length = is_valid.size();
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateEmptyBitmap(length));
+
+  uint8_t* bitmap = buffer->mutable_data();
+  for (size_t i = 0; i < static_cast<size_t>(length); ++i) {
+    if (is_valid[i]) {
+      bit_util::SetBit(bitmap, i);
+    }
+  }
+
+  *result = buffer;
+  return Status::OK();
+}
+
+template <typename T>
+inline void BitmapFromVector(const std::vector<T>& is_valid,
+                             std::shared_ptr<Buffer>* out) {
+  ASSERT_OK(GetBitmapFromVector(is_valid, out));
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/executor_util.h b/pyarrow/include/arrow/testing/executor_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..e34fc858d07f60ac31b73d1e84b5dc1cf4189b3f
--- /dev/null
+++ b/pyarrow/include/arrow/testing/executor_util.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+/// An executor which synchronously runs the task as part of the SpawnReal call.
+class MockExecutor : public internal::Executor {
+ public:
+  int GetCapacity() override { return 0; }
+
+  Status SpawnReal(internal::TaskHints hints, internal::FnOnce<void()> task, StopToken,
+                   StopCallback&&) override {
+    spawn_count++;
+    std::move(task)();
+    return Status::OK();
+  }
+
+  int spawn_count = 0;
+};
+
+/// An executor which does not actually run the task.  Can be used to simulate situations
+/// where the executor schedules a task in a long queue and doesn't get around to running
+/// it for a while
+class DelayedExecutor : public internal::Executor {
+ public:
+  int GetCapacity() override { return 0; }
+
+  Status SpawnReal(internal::TaskHints hints, internal::FnOnce<void()> task, StopToken,
+                   StopCallback&&) override {
+    captured_tasks.push_back(std::move(task));
+    return Status::OK();
+  }
+
+  std::vector<internal::FnOnce<void()>> captured_tasks;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/extension_type.h b/pyarrow/include/arrow/testing/extension_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b4492a543a30063a9b0900f5c50e0d76a5f996b
--- /dev/null
+++ b/pyarrow/include/arrow/testing/extension_type.h
@@ -0,0 +1,250 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/extension_type.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType {
+ public:
+  ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {}
+
+  std::string extension_name() const override { return "uuid"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "uuid-serialized"; }
+};
+
+class ARROW_TESTING_EXPORT SmallintArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+class ARROW_TESTING_EXPORT TinyintArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+class ARROW_TESTING_EXPORT ListExtensionArray : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+class ARROW_TESTING_EXPORT SmallintType : public ExtensionType {
+ public:
+  SmallintType() : ExtensionType(int16()) {}
+
+  std::string extension_name() const override { return "smallint"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "smallint"; }
+};
+
+class ARROW_TESTING_EXPORT TinyintType : public ExtensionType {
+ public:
+  TinyintType() : ExtensionType(int8()) {}
+
+  std::string extension_name() const override { return "tinyint"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "tinyint"; }
+};
+
+class ARROW_TESTING_EXPORT ListExtensionType : public ExtensionType {
+ public:
+  ListExtensionType() : ExtensionType(list(int32())) {}
+
+  std::string extension_name() const override { return "list-ext"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "list-ext"; }
+};
+
+class ARROW_TESTING_EXPORT DictExtensionType : public ExtensionType {
+ public:
+  DictExtensionType() : ExtensionType(dictionary(int8(), utf8())) {}
+
+  std::string extension_name() const override { return "dict-extension"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "dict-extension-serialized"; }
+};
+
+class ARROW_TESTING_EXPORT BinaryViewExtensionType : public ExtensionType {
+ public:
+  BinaryViewExtensionType() : ExtensionType(binary_view()) {}
+
+  std::string extension_name() const override { return "binary_view"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "binary_view_serialized"; }
+};
+
+// A minimal extension type that does not error when passed blank extension information
+class ARROW_TESTING_EXPORT MetadataOptionalExtensionType : public ExtensionType {
+ public:
+  MetadataOptionalExtensionType() : ExtensionType(null()) {}
+  std::string extension_name() const override { return "metadata.optional"; }
+  std::string Serialize() const override { return ""; }
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override {
+    return nullptr;
+  }
+  bool ExtensionEquals(const ExtensionType& other) const override {
+    return other.extension_name() == extension_name();
+  }
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized_data) const override {
+    return std::make_shared<MetadataOptionalExtensionType>();
+  }
+};
+
+class ARROW_TESTING_EXPORT Complex128Array : public ExtensionArray {
+ public:
+  using ExtensionArray::ExtensionArray;
+};
+
+class ARROW_TESTING_EXPORT Complex128Type : public ExtensionType {
+ public:
+  Complex128Type()
+      : ExtensionType(struct_({::arrow::field("real", float64(), /*nullable=*/false),
+                               ::arrow::field("imag", float64(), /*nullable=*/false)})) {}
+
+  std::string extension_name() const override { return "complex128"; }
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override { return "complex128-serialized"; }
+};
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> uuid();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> smallint();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> tinyint();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> list_extension_type();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> dict_extension_type();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> binary_view_extension_type();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<DataType> complex128();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleUuid();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleSmallint();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleTinyint();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleDictExtension();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ExampleComplex128();
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> MakeComplex128(const std::shared_ptr<Array>& real,
+                                      const std::shared_ptr<Array>& imag);
+
+// A RAII class that registers an extension type on construction
+// and unregisters it on destruction.
+class ARROW_TESTING_EXPORT ExtensionTypeGuard {
+ public:
+  explicit ExtensionTypeGuard(const std::shared_ptr<DataType>& type);
+  explicit ExtensionTypeGuard(const DataTypeVector& types);
+  ~ExtensionTypeGuard();
+  ARROW_DEFAULT_MOVE_AND_ASSIGN(ExtensionTypeGuard);
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ExtensionTypeGuard);
+
+  std::vector<std::string> extension_names_;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/fixed_width_test_util.h b/pyarrow/include/arrow/testing/fixed_width_test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e5e6fa68509e8cc650986ffb67f82cd7b1da207
--- /dev/null
+++ b/pyarrow/include/arrow/testing/fixed_width_test_util.h
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "arrow/testing/visibility.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow::util::internal {
+
+class ARROW_TESTING_EXPORT NestedListGenerator {
+ public:
+  /// \brief Create a nested FixedSizeListType.
+  ///
+  /// \return `fixed_size_list(fixed_size_list(..., sizes[1]), sizes[0])`
+  static std::shared_ptr<DataType> NestedFSLType(
+      const std::shared_ptr<DataType>& inner_type, const std::vector<int>& sizes);
+
+  /// \brief Create a nested FixedListType.
+  ///
+  /// \return `list(list(...))`
+  static std::shared_ptr<DataType> NestedListType(
+      const std::shared_ptr<DataType>& inner_type, size_t depth);
+
+  static Result<std::shared_ptr<Array>> NestedFSLArray(
+      const std::shared_ptr<DataType>& inner_type, const std::vector<int>& list_sizes,
+      int64_t length);
+
+  static Result<std::shared_ptr<Array>> NestedListArray(
+      const std::shared_ptr<DataType>& inner_type, const std::vector<int>& list_sizes,
+      int64_t length);
+
+  /// \brief Generate all possible nested list configurations of depth 1 to max_depth.
+  ///
+  /// Each configuration consists of a single inner value type and a list of sizes.
+  /// Both can be used with NestedFSLArray and NestedListArray to generate test data.
+  ///
+  /// The product of the list sizes and the size of the inner value type is always a power
+  /// of 2 no greater than max_power_of_2_size. For max_depth=3 and
+  /// max_power_of_2_size=32, this generates 108 configurations.
+  static void VisitAllNestedListConfigurations(
+      const std::vector<std::shared_ptr<DataType>>& inner_value_types,
+      const std::function<void(const std::shared_ptr<DataType>&,
+                               const std::vector<int>&)>& visit,
+      int max_depth = 3, int max_power_of_2_size = 32);
+
+ private:
+  // Append([...[[*next_inner_value++, *next_inner_value++, ...]]...])
+  static Status AppendNestedList(ArrayBuilder* nested_builder, const int* list_sizes,
+                                 int64_t* next_inner_value);
+
+  static Result<std::shared_ptr<Array>> NestedListArray(
+      ArrayBuilder* nested_builder, const std::vector<int>& list_sizes, int64_t length);
+};
+
+}  // namespace arrow::util::internal
diff --git a/pyarrow/include/arrow/testing/future_util.h b/pyarrow/include/arrow/testing/future_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ca70d05402f92c71d8f86441eeccec1ebc6d156
--- /dev/null
+++ b/pyarrow/include/arrow/testing/future_util.h
@@ -0,0 +1,142 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/future.h"
+
+// This macro should be called by futures that are expected to
+// complete pretty quickly.  arrow::kDefaultAssertFinishesWaitSeconds is the
+// default max wait here.  Anything longer than that and it's a questionable unit test
+// anyways.
+#define ASSERT_FINISHES_IMPL(fut)                                      \
+  do {                                                                 \
+    ASSERT_TRUE(fut.Wait(::arrow::kDefaultAssertFinishesWaitSeconds)); \
+    if (!fut.is_finished()) {                                          \
+      FAIL() << "Future did not finish in a timely fashion";           \
+    }                                                                  \
+  } while (false)
+
+#define ASSERT_FINISHES_OK(expr)                                              \
+  do {                                                                        \
+    auto&& _fut = (expr);                                                     \
+    ASSERT_TRUE(_fut.Wait(::arrow::kDefaultAssertFinishesWaitSeconds));       \
+    if (!_fut.is_finished()) {                                                \
+      FAIL() << "Future did not finish in a timely fashion";                  \
+    }                                                                         \
+    auto& _st = _fut.status();                                                \
+    if (!_st.ok()) {                                                          \
+      FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString(); \
+    }                                                                         \
+  } while (false)
+
+#define ASSERT_FINISHES_AND_RAISES(ENUM, expr) \
+  do {                                         \
+    auto&& _fut = (expr);                      \
+    ASSERT_FINISHES_IMPL(_fut);                \
+    ASSERT_RAISES(ENUM, _fut.status());        \
+  } while (false)
+
+#define EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(ENUM, matcher, expr) \
+  do {                                                                    \
+    auto&& fut = (expr);                                                  \
+    ASSERT_FINISHES_IMPL(fut);                                            \
+    EXPECT_RAISES_WITH_MESSAGE_THAT(ENUM, matcher, fut.status());         \
+  } while (false)
+
+#define ASSERT_FINISHES_OK_AND_ASSIGN_IMPL(lhs, rexpr, _future_name) \
+  auto _future_name = (rexpr);                                       \
+  ASSERT_FINISHES_IMPL(_future_name);                                \
+  ASSERT_OK_AND_ASSIGN(lhs, _future_name.result());
+
+#define ASSERT_FINISHES_OK_AND_ASSIGN(lhs, rexpr) \
+  ASSERT_FINISHES_OK_AND_ASSIGN_IMPL(lhs, rexpr,  \
+                                     ARROW_ASSIGN_OR_RAISE_NAME(_fut, __COUNTER__))
+
+#define ASSERT_FINISHES_OK_AND_EQ(expected, expr)        \
+  do {                                                   \
+    ASSERT_FINISHES_OK_AND_ASSIGN(auto _actual, (expr)); \
+    ASSERT_EQ(expected, _actual);                        \
+  } while (0)
+
+#define EXPECT_FINISHES_IMPL(fut)                                      \
+  do {                                                                 \
+    EXPECT_TRUE(fut.Wait(::arrow::kDefaultAssertFinishesWaitSeconds)); \
+    if (!fut.is_finished()) {                                          \
+      ADD_FAILURE() << "Future did not finish in a timely fashion";    \
+    }                                                                  \
+  } while (false)
+
+#define ON_FINISH_ASSIGN_OR_HANDLE_ERROR_IMPL(handle_error, future_name, lhs, rexpr) \
+  auto future_name = (rexpr);                                                        \
+  EXPECT_FINISHES_IMPL(future_name);                                                 \
+  handle_error(future_name.status());                                                \
+  EXPECT_OK_AND_ASSIGN(lhs, future_name.result());
+
+#define EXPECT_FINISHES(expr)   \
+  do {                          \
+    EXPECT_FINISHES_IMPL(expr); \
+  } while (0)
+
+#define EXPECT_FINISHES_OK_AND_ASSIGN(lhs, rexpr) \
+  ON_FINISH_ASSIGN_OR_HANDLE_ERROR_IMPL(          \
+      ARROW_EXPECT_OK, ARROW_ASSIGN_OR_RAISE_NAME(_fut, __COUNTER__), lhs, rexpr);
+
+#define EXPECT_FINISHES_OK_AND_EQ(expected, expr)        \
+  do {                                                   \
+    EXPECT_FINISHES_OK_AND_ASSIGN(auto _actual, (expr)); \
+    EXPECT_EQ(expected, _actual);                        \
+  } while (0)
+
+namespace arrow {
+
+constexpr double kDefaultAssertFinishesWaitSeconds = 64;
+
+template <typename T>
+void AssertNotFinished(const Future<T>& fut) {
+  ASSERT_FALSE(IsFutureFinished(fut.state()));
+}
+
+template <typename T>
+void AssertFinished(const Future<T>& fut) {
+  ASSERT_TRUE(IsFutureFinished(fut.state()));
+}
+
+// Assert the future is successful *now*
+template <typename T>
+void AssertSuccessful(const Future<T>& fut) {
+  if (IsFutureFinished(fut.state())) {
+    ASSERT_EQ(fut.state(), FutureState::SUCCESS);
+    ASSERT_OK(fut.status());
+  } else {
+    FAIL() << "Expected future to be completed successfully but it was still pending";
+  }
+}
+
+// Assert the future is failed *now*
+template <typename T>
+void AssertFailed(const Future<T>& fut) {
+  if (IsFutureFinished(fut.state())) {
+    ASSERT_EQ(fut.state(), FutureState::FAILURE);
+    ASSERT_FALSE(fut.status().ok());
+  } else {
+    FAIL() << "Expected future to have failed but it was still pending";
+  }
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/generator.h b/pyarrow/include/arrow/testing/generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..05cb8621ab9cbc79d0fd97448911b21167539359
--- /dev/null
+++ b/pyarrow/include/arrow/testing/generator.h
@@ -0,0 +1,369 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/array/array_base.h"
+#include "arrow/array/util.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/type_fwd.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+
+class ARROW_TESTING_EXPORT ConstantArrayGenerator {
+ public:
+  /// \brief Generates a constant BooleanArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Boolean(int64_t size, bool value = false);
+
+  /// \brief Generates a constant UInt8Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> UInt8(int64_t size, uint8_t value = 0);
+
+  /// \brief Generates a constant Int8Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Int8(int64_t size, int8_t value = 0);
+
+  /// \brief Generates a constant UInt16Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> UInt16(int64_t size, uint16_t value = 0);
+
+  /// \brief Generates a constant UInt16Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Int16(int64_t size, int16_t value = 0);
+
+  /// \brief Generates a constant UInt32Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> UInt32(int64_t size, uint32_t value = 0);
+
+  /// \brief Generates a constant UInt32Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Int32(int64_t size, int32_t value = 0);
+
+  /// \brief Generates a constant UInt64Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> UInt64(int64_t size, uint64_t value = 0);
+
+  /// \brief Generates a constant UInt64Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Int64(int64_t size, int64_t value = 0);
+
+  /// \brief Generates a constant Float16Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Float16(int64_t size, uint16_t value = 0);
+
+  /// \brief Generates a constant Float32Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Float32(int64_t size, float value = 0);
+
+  /// \brief Generates a constant Float64Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Float64(int64_t size, double value = 0);
+
+  /// \brief Generates a constant StringArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] value to repeat
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> String(int64_t size, std::string value = "");
+
+  template <typename ArrowType, typename CType = typename ArrowType::c_type>
+  static std::shared_ptr<Array> Numeric(int64_t size, CType value = 0) {
+    switch (ArrowType::type_id) {
+      case Type::BOOL:
+        return Boolean(size, static_cast<bool>(value));
+      case Type::UINT8:
+        return UInt8(size, static_cast<uint8_t>(value));
+      case Type::INT8:
+        return Int8(size, static_cast<int8_t>(value));
+      case Type::UINT16:
+        return UInt16(size, static_cast<uint16_t>(value));
+      case Type::INT16:
+        return Int16(size, static_cast<int16_t>(value));
+      case Type::UINT32:
+        return UInt32(size, static_cast<uint32_t>(value));
+      case Type::INT32:
+        return Int32(size, static_cast<int32_t>(value));
+      case Type::UINT64:
+        return UInt64(size, static_cast<uint64_t>(value));
+      case Type::INT64:
+        return Int64(size, static_cast<int64_t>(value));
+      case Type::HALF_FLOAT:
+        return Float16(size, static_cast<uint16_t>(value));
+      case Type::FLOAT:
+        return Float32(size, static_cast<float>(value));
+      case Type::DOUBLE:
+        return Float64(size, static_cast<double>(value));
+      case Type::INTERVAL_DAY_TIME:
+      case Type::DATE32: {
+        EXPECT_OK_AND_ASSIGN(auto viewed,
+                             Int32(size, static_cast<uint32_t>(value))->View(date32()));
+        return viewed;
+      }
+      case Type::INTERVAL_MONTHS: {
+        EXPECT_OK_AND_ASSIGN(auto viewed,
+                             Int32(size, static_cast<uint32_t>(value))
+                                 ->View(std::make_shared<MonthIntervalType>()));
+        return viewed;
+      }
+      case Type::TIME32: {
+        EXPECT_OK_AND_ASSIGN(auto viewed,
+                             Int32(size, static_cast<uint32_t>(value))
+                                 ->View(std::make_shared<Time32Type>(TimeUnit::SECOND)));
+        return viewed;
+      }
+      case Type::TIME64: {
+        EXPECT_OK_AND_ASSIGN(auto viewed, Int64(size, static_cast<uint64_t>(value))
+                                              ->View(std::make_shared<Time64Type>()));
+        return viewed;
+      }
+      case Type::DATE64: {
+        EXPECT_OK_AND_ASSIGN(auto viewed,
+                             Int64(size, static_cast<uint64_t>(value))->View(date64()));
+        return viewed;
+      }
+      case Type::TIMESTAMP: {
+        EXPECT_OK_AND_ASSIGN(
+            auto viewed, Int64(size, static_cast<int64_t>(value))
+                             ->View(std::make_shared<TimestampType>(TimeUnit::SECOND)));
+        return viewed;
+      }
+      default:
+        return nullptr;
+    }
+  }
+
+  /// \brief Generates a constant Array of zeroes
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] type the type of the Array
+  ///
+  /// \return a generated Array
+  static std::shared_ptr<Array> Zeroes(int64_t size,
+                                       const std::shared_ptr<DataType>& type);
+
+  /// \brief Generates a RecordBatch of zeroes
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] schema to conform to
+  ///
+  /// This function is handy to return of RecordBatch of a desired shape.
+  ///
+  /// \return a generated RecordBatch
+  static std::shared_ptr<RecordBatch> Zeroes(int64_t size,
+                                             const std::shared_ptr<Schema>& schema);
+
+  /// \brief Generates a RecordBatchReader by repeating a RecordBatch
+  ///
+  /// \param[in] n_batch the number of times it repeats batch
+  /// \param[in] batch the RecordBatch to repeat
+  ///
+  /// \return a generated RecordBatchReader
+  static std::shared_ptr<RecordBatchReader> Repeat(
+      int64_t n_batch, const std::shared_ptr<RecordBatch> batch);
+
+  /// \brief Generates a RecordBatchReader of zeroes batches
+  ///
+  /// \param[in] n_batch the number of RecordBatch
+  /// \param[in] batch_size the size of each RecordBatch
+  /// \param[in] schema to conform to
+  ///
+  /// \return a generated RecordBatchReader
+  static std::shared_ptr<RecordBatchReader> Zeroes(int64_t n_batch, int64_t batch_size,
+                                                   const std::shared_ptr<Schema>& schema);
+};
+
+ARROW_TESTING_EXPORT
+Result<std::shared_ptr<Array>> ScalarVectorToArray(const ScalarVector& scalars);
+
+namespace gen {
+
+class ARROW_TESTING_EXPORT ArrayGenerator {
+ public:
+  virtual ~ArrayGenerator() = default;
+  virtual Result<std::shared_ptr<Array>> Generate(int64_t num_rows) = 0;
+  virtual std::shared_ptr<DataType> type() const = 0;
+};
+
+// Same as DataGenerator below but instead of returning Result an ok status is EXPECT'd
+class ARROW_TESTING_EXPORT GTestDataGenerator {
+ public:
+  virtual ~GTestDataGenerator() = default;
+  virtual std::shared_ptr<::arrow::RecordBatch> RecordBatch(int64_t num_rows) = 0;
+  virtual std::vector<std::shared_ptr<::arrow::RecordBatch>> RecordBatches(
+      int64_t rows_per_batch, int num_batches) = 0;
+
+  virtual ::arrow::compute::ExecBatch ExecBatch(int64_t num_rows) = 0;
+  virtual std::vector<::arrow::compute::ExecBatch> ExecBatches(int64_t rows_per_batch,
+                                                               int num_batches) = 0;
+
+  virtual std::shared_ptr<::arrow::Table> Table(int64_t rows_per_chunk,
+                                                int num_chunks = 1) = 0;
+  virtual std::shared_ptr<::arrow::Schema> Schema() = 0;
+};
+
+class ARROW_TESTING_EXPORT DataGenerator {
+ public:
+  virtual ~DataGenerator() = default;
+  virtual Result<std::shared_ptr<::arrow::RecordBatch>> RecordBatch(int64_t num_rows) = 0;
+  virtual Result<std::vector<std::shared_ptr<::arrow::RecordBatch>>> RecordBatches(
+      int64_t rows_per_batch, int num_batches) = 0;
+
+  virtual Result<::arrow::compute::ExecBatch> ExecBatch(int64_t num_rows) = 0;
+  virtual Result<std::vector<::arrow::compute::ExecBatch>> ExecBatches(
+      int64_t rows_per_batch, int num_batches) = 0;
+
+  virtual Result<std::shared_ptr<::arrow::Table>> Table(int64_t rows_per_chunk,
+                                                        int num_chunks = 1) = 0;
+  virtual std::shared_ptr<::arrow::Schema> Schema() = 0;
+  /// @brief Converts this generator to a variant that fails (in a googletest sense)
+  ///        if any error is encountered.
+  virtual std::unique_ptr<GTestDataGenerator> FailOnError() = 0;
+};
+
+/// @brief A potentially named field
+///
+/// If name is not specified then a name will be generated automatically (e.g. f0, f1)
+struct ARROW_TESTING_EXPORT GeneratorField {
+ public:
+  GeneratorField(std::shared_ptr<ArrayGenerator> gen)  // NOLINT implicit conversion
+      : name(), gen(std::move(gen)) {}
+  GeneratorField(std::string name, std::shared_ptr<ArrayGenerator> gen)
+      : name(std::move(name)), gen(std::move(gen)) {}
+
+  std::optional<std::string> name;
+  std::shared_ptr<ArrayGenerator> gen;
+};
+
+/// Create a table generator with the given fields
+ARROW_TESTING_EXPORT std::shared_ptr<DataGenerator> Gen(
+    std::vector<GeneratorField> column_gens);
+
+/// make a generator that returns a constant value
+ARROW_TESTING_EXPORT std::shared_ptr<ArrayGenerator> Constant(
+    std::shared_ptr<Scalar> value);
+
+/// make a generator that returns an incrementing value
+///
+/// Note: overflow is not prevented standard unsigned integer overflow applies
+template <typename T = uint32_t>
+std::shared_ptr<ArrayGenerator> Step(T start = 0, T step = 1) {
+  class StepGenerator : public ArrayGenerator {
+   public:
+    // Use [[maybe_unused]] to avoid a compiler warning in Clang versions before 15 that
+    // incorrectly reports 'unused type alias'.
+    using ArrowType [[maybe_unused]] = typename CTypeTraits<T>::ArrowType;
+    static_assert(is_number_type<ArrowType>::value,
+                  "Step generator only supports numeric types");
+
+    StepGenerator(T start, T step) : start_(start), step_(step) {}
+
+    Result<std::shared_ptr<Array>> Generate(int64_t num_rows) override {
+      TypedBufferBuilder<T> builder;
+      ARROW_RETURN_NOT_OK(builder.Reserve(num_rows));
+      T val = start_;
+      for (int64_t i = 0; i < num_rows; i++) {
+        builder.UnsafeAppend(val);
+        val += step_;
+      }
+      start_ = val;
+      ARROW_ASSIGN_OR_RAISE(auto buf, builder.Finish());
+      return MakeArray(ArrayData::Make(TypeTraits<ArrowType>::type_singleton(), num_rows,
+                                       {NULLPTR, std::move(buf)}, /*null_count=*/0));
+    }
+
+    std::shared_ptr<DataType> type() const override {
+      return TypeTraits<ArrowType>::type_singleton();
+    }
+
+   private:
+    T start_;
+    T step_;
+  };
+
+  return std::make_shared<StepGenerator>(start, step);
+}
+
+/// make a generator that returns a random value
+ARROW_TESTING_EXPORT std::shared_ptr<ArrayGenerator> Random(
+    std::shared_ptr<DataType> type);
+/// TODO(if-needed) could add a repeat-scalars generator, e.g. Repeat({1, 2, 3}) for
+/// 1,2,3,1,2,3,1
+///
+/// TODO(if-needed) could add a repeat-from-json generator e.g. Repeat(int32(), "[1, 2,
+/// 3]")), same behavior as repeat-scalars
+
+}  // namespace gen
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/gtest_compat.h b/pyarrow/include/arrow/testing/gtest_compat.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fd0bfd32c5bc72bfdfca4108b40dbedace62d1e
--- /dev/null
+++ b/pyarrow/include/arrow/testing/gtest_compat.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+// GTest < 1.11
+#ifndef GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST
+#  define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(A)
+#endif
+// GTest < 1.10
+#ifndef TYPED_TEST_SUITE
+#  define TYPED_TEST_SUITE TYPED_TEST_CASE
+#  define TYPED_TEST_SUITE_P TYPED_TEST_CASE_P
+#  define INSTANTIATE_TEST_SUITE_P INSTANTIATE_TEST_CASE_P
+#  define REGISTER_TYPED_TEST_SUITE_P REGISTER_TYPED_TEST_CASE_P
+#  define INSTANTIATE_TYPED_TEST_SUITE_P INSTANTIATE_TYPED_TEST_CASE_P
+#endif
diff --git a/pyarrow/include/arrow/testing/gtest_util.h b/pyarrow/include/arrow/testing/gtest_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..62bf907a2d899d4b27d7ba33d79dd938498e2561
--- /dev/null
+++ b/pyarrow/include/arrow/testing/gtest_util.h
@@ -0,0 +1,575 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/compare.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/testing/gtest_compat.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_util.h"
+#include "arrow/util/type_fwd.h"
+
+// NOTE: failing must be inline in the macros below, to get correct file / line number
+// reporting on test failures.
+
+// NOTE: using a for loop for this macro allows extra failure messages to be
+// appended with operator<<
+#define ASSERT_RAISES(ENUM, expr)                                                 \
+  for (::arrow::Status _st = ::arrow::ToStatus((expr)); !_st.Is##ENUM();)         \
+  FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \
+                ENUM) ", but got "                                                \
+         << _st.ToString()
+
+#define ASSERT_RAISES_WITH_MESSAGE(ENUM, message, expr)                               \
+  do {                                                                                \
+    auto _res = (expr);                                                               \
+    ::arrow::Status _st = ::arrow::ToStatus(_res);                                    \
+    if (!_st.Is##ENUM()) {                                                            \
+      FAIL() << "Expected '" ARROW_STRINGIFY(expr) "' to fail with " ARROW_STRINGIFY( \
+                    ENUM) ", but got "                                                \
+             << _st.ToString();                                                       \
+    }                                                                                 \
+    ASSERT_EQ((message), _st.ToStringWithoutContextLines());                          \
+  } while (false)
+
+#define EXPECT_RAISES_WITH_MESSAGE_THAT(ENUM, matcher, expr)                             \
+  do {                                                                                   \
+    auto _res = (expr);                                                                  \
+    ::arrow::Status _st = ::arrow::ToStatus(_res);                                       \
+    EXPECT_TRUE(_st.Is##ENUM()) << "Expected '" ARROW_STRINGIFY(expr) "' to fail with "  \
+                                << ARROW_STRINGIFY(ENUM) ", but got " << _st.ToString(); \
+    EXPECT_THAT(_st.ToStringWithoutContextLines(), (matcher));                           \
+  } while (false)
+
+#define EXPECT_RAISES_WITH_CODE_AND_MESSAGE_THAT(code, matcher, expr) \
+  do {                                                                \
+    auto _res = (expr);                                               \
+    ::arrow::Status _st = ::arrow::ToStatus(_res);                    \
+    EXPECT_EQ(_st.CodeAsString(), Status::CodeAsString(code));        \
+    EXPECT_THAT(_st.ToStringWithoutContextLines(), (matcher));        \
+  } while (false)
+
+#define ASSERT_OK(expr)                                             \
+  for (::arrow::Status _st = ::arrow::ToStatus((expr)); !_st.ok();) \
+  FAIL() << "'" ARROW_STRINGIFY(expr) "' failed with " << _st.ToString()
+
+#define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr))
+
+#define ARROW_EXPECT_OK(expr)                                           \
+  do {                                                                  \
+    auto _res = (expr);                                                 \
+    ::arrow::Status _st = ::arrow::ToStatus(_res);                      \
+    EXPECT_TRUE(_st.ok()) << "'" ARROW_STRINGIFY(expr) "' failed with " \
+                          << _st.ToString();                            \
+  } while (false)
+
+#define EXPECT_OK ARROW_EXPECT_OK
+
+#define EXPECT_OK_NO_THROW(expr) EXPECT_NO_THROW(EXPECT_OK(expr))
+
+#define ASSERT_NOT_OK(expr)                                        \
+  for (::arrow::Status _st = ::arrow::ToStatus((expr)); _st.ok();) \
+  FAIL() << "'" ARROW_STRINGIFY(expr) "' did not failed" << _st.ToString()
+
+#define ABORT_NOT_OK(expr)                         \
+  do {                                             \
+    auto _res = (expr);                            \
+    ::arrow::Status _st = ::arrow::ToStatus(_res); \
+    if (ARROW_PREDICT_FALSE(!_st.ok())) {          \
+      _st.Abort();                                 \
+    }                                              \
+  } while (false);
+
+#define ASSIGN_OR_HANDLE_ERROR_IMPL(handle_error, status_name, lhs, rexpr) \
+  auto&& status_name = (rexpr);                                            \
+  handle_error(status_name.status());                                      \
+  lhs = std::move(status_name).ValueOrDie();
+
+#define ASSERT_OK_AND_ASSIGN(lhs, rexpr) \
+  ASSIGN_OR_HANDLE_ERROR_IMPL(           \
+      ASSERT_OK, ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), lhs, rexpr);
+
+#define ASSIGN_OR_ABORT(lhs, rexpr)                                                     \
+  ASSIGN_OR_HANDLE_ERROR_IMPL(ABORT_NOT_OK,                                             \
+                              ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+                              lhs, rexpr);
+
+#define EXPECT_OK_AND_ASSIGN(lhs, rexpr)                                                \
+  ASSIGN_OR_HANDLE_ERROR_IMPL(ARROW_EXPECT_OK,                                          \
+                              ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+                              lhs, rexpr);
+
+#define ASSERT_OK_AND_EQ(expected, expr)        \
+  do {                                          \
+    ASSERT_OK_AND_ASSIGN(auto _actual, (expr)); \
+    ASSERT_EQ(expected, _actual);               \
+  } while (0)
+
+// A generalized version of GTest's SCOPED_TRACE that takes arbitrary arguments.
+//   ARROW_SCOPED_TRACE("some variable = ", some_variable, ...)
+
+#define ARROW_SCOPED_TRACE(...) SCOPED_TRACE(::arrow::internal::JoinToString(__VA_ARGS__))
+
+namespace arrow {
+
+// ----------------------------------------------------------------------
+// Useful testing::Types declarations
+
+inline void PrintTo(StatusCode code, std::ostream* os) {
+  *os << Status::CodeAsString(code);
+}
+
+using NumericArrowTypes =
+    ::testing::Types<UInt8Type, UInt16Type, UInt32Type, UInt64Type, Int8Type, Int16Type,
+                     Int32Type, Int64Type, FloatType, DoubleType>;
+
+using RealArrowTypes = ::testing::Types<FloatType, DoubleType>;
+
+using IntegralArrowTypes = ::testing::Types<UInt8Type, UInt16Type, UInt32Type, UInt64Type,
+                                            Int8Type, Int16Type, Int32Type, Int64Type>;
+
+using PhysicalIntegralArrowTypes =
+    ::testing::Types<UInt8Type, UInt16Type, UInt32Type, UInt64Type, Int8Type, Int16Type,
+                     Int32Type, Int64Type, Date32Type, Date64Type, Time32Type, Time64Type,
+                     TimestampType, MonthIntervalType>;
+
+using PrimitiveArrowTypes =
+    ::testing::Types<BooleanType, Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
+                     UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>;
+
+using TemporalArrowTypes =
+    ::testing::Types<Date32Type, Date64Type, TimestampType, Time32Type, Time64Type>;
+
+// we can uncomment Decimal32Type and Decimal64Type once the cast
+// functions are implemented for those types
+using DecimalArrowTypes =
+    ::testing::Types</*Decimal32Type, Decimal64Type,*/ Decimal128Type, Decimal256Type>;
+
+using BaseBinaryArrowTypes =
+    ::testing::Types<BinaryType, LargeBinaryType, StringType, LargeStringType>;
+
+using BaseBinaryOrBinaryViewLikeArrowTypes =
+    ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, StringType,
+                     LargeStringType, StringViewType>;
+using AllBinaryOrBinrayViewLikeArrowTypes =
+    ::testing::Types<BinaryType, LargeBinaryType, BinaryViewType, FixedSizeBinaryType,
+                     StringType, LargeStringType, StringViewType>;
+
+using BinaryArrowTypes = ::testing::Types<BinaryType, LargeBinaryType>;
+
+using StringArrowTypes = ::testing::Types<StringType, LargeStringType>;
+
+using StringOrStringViewArrowTypes =
+    ::testing::Types<StringType, LargeStringType, StringViewType>;
+
+using ListArrowTypes = ::testing::Types<ListType, LargeListType>;
+
+using UnionArrowTypes = ::testing::Types<SparseUnionType, DenseUnionType>;
+
+class Array;
+class ChunkedArray;
+class RecordBatch;
+class Table;
+struct Datum;
+
+#define ASSERT_ARRAYS_EQUAL(lhs, rhs) AssertArraysEqual((lhs), (rhs))
+#define ASSERT_BATCHES_EQUAL(lhs, rhs) AssertBatchesEqual((lhs), (rhs))
+#define ASSERT_BATCHES_APPROX_EQUAL(lhs, rhs) AssertBatchesApproxEqual((lhs), (rhs))
+#define ASSERT_TABLES_EQUAL(lhs, rhs) AssertTablesEqual((lhs), (rhs))
+
+// Default EqualOptions for testing
+static inline EqualOptions TestingEqualOptions() {
+  return EqualOptions{}.nans_equal(true).signed_zeros_equal(false);
+}
+
+// If verbose is true, then the arrays will be pretty printed
+ARROW_TESTING_EXPORT void AssertArraysEqual(
+    const Array& expected, const Array& actual, bool verbose = false,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertArraysApproxEqual(
+    const Array& expected, const Array& actual, bool verbose = false,
+    const EqualOptions& options = TestingEqualOptions());
+// Returns true when values are both null
+ARROW_TESTING_EXPORT void AssertScalarsEqual(
+    const Scalar& expected, const Scalar& actual, bool verbose = false,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertScalarsApproxEqual(
+    const Scalar& expected, const Scalar& actual, bool verbose = false,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertBatchesEqual(
+    const RecordBatch& expected, const RecordBatch& actual, bool check_metadata = false,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertBatchesApproxEqual(
+    const RecordBatch& expected, const RecordBatch& actual,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertChunkedEqual(
+    const ChunkedArray& expected, const ChunkedArray& actual,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertChunkedEqual(
+    const ChunkedArray& actual, const ArrayVector& expected,
+    const EqualOptions& options = TestingEqualOptions());
+// Like ChunkedEqual, but permits different chunk layout
+ARROW_TESTING_EXPORT void AssertChunkedEquivalent(
+    const ChunkedArray& expected, const ChunkedArray& actual,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertChunkedApproxEquivalent(
+    const ChunkedArray& expected, const ChunkedArray& actual,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertBufferEqual(const Buffer& buffer,
+                                            const std::vector<uint8_t>& expected);
+ARROW_TESTING_EXPORT void AssertBufferEqual(const Buffer& buffer,
+                                            std::string_view expected);
+ARROW_TESTING_EXPORT void AssertBufferEqual(const Buffer& buffer, const Buffer& expected);
+
+ARROW_TESTING_EXPORT void AssertTypeEqual(const DataType& lhs, const DataType& rhs,
+                                          bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertTypeEqual(const std::shared_ptr<DataType>& lhs,
+                                          const std::shared_ptr<DataType>& rhs,
+                                          bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertFieldEqual(const Field& lhs, const Field& rhs,
+                                           bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertFieldEqual(const std::shared_ptr<Field>& lhs,
+                                           const std::shared_ptr<Field>& rhs,
+                                           bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertSchemaEqual(const Schema& lhs, const Schema& rhs,
+                                            bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertSchemaEqual(const std::shared_ptr<Schema>& lhs,
+                                            const std::shared_ptr<Schema>& rhs,
+                                            bool check_metadata = false);
+
+ARROW_TESTING_EXPORT void AssertTypeNotEqual(const DataType& lhs, const DataType& rhs,
+                                             bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertTypeNotEqual(const std::shared_ptr<DataType>& lhs,
+                                             const std::shared_ptr<DataType>& rhs,
+                                             bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertFieldNotEqual(const Field& lhs, const Field& rhs,
+                                              bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertFieldNotEqual(const std::shared_ptr<Field>& lhs,
+                                              const std::shared_ptr<Field>& rhs,
+                                              bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertSchemaNotEqual(const Schema& lhs, const Schema& rhs,
+                                               bool check_metadata = false);
+ARROW_TESTING_EXPORT void AssertSchemaNotEqual(const std::shared_ptr<Schema>& lhs,
+                                               const std::shared_ptr<Schema>& rhs,
+                                               bool check_metadata = false);
+
+ARROW_TESTING_EXPORT Result<std::optional<std::string>> PrintArrayDiff(
+    const ChunkedArray& expected, const ChunkedArray& actual);
+
+ARROW_TESTING_EXPORT void AssertTablesEqual(
+    const Table& expected, const Table& actual, bool same_chunk_layout = true,
+    bool flatten = false, const EqualOptions& options = TestingEqualOptions());
+
+ARROW_TESTING_EXPORT void AssertDatumsEqual(
+    const Datum& expected, const Datum& actual, bool verbose = false,
+    const EqualOptions& options = TestingEqualOptions());
+ARROW_TESTING_EXPORT void AssertDatumsApproxEqual(
+    const Datum& expected, const Datum& actual, bool verbose = false,
+    const EqualOptions& options = TestingEqualOptions());
+
+template <typename C_TYPE>
+void AssertNumericDataEqual(const C_TYPE* raw_data,
+                            const std::vector<C_TYPE>& expected_values) {
+  for (auto expected : expected_values) {
+    ASSERT_EQ(expected, *raw_data);
+    ++raw_data;
+  }
+}
+
+ARROW_TESTING_EXPORT void CompareBatch(
+    const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true,
+    const EqualOptions& options = TestingEqualOptions());
+
+ARROW_TESTING_EXPORT void ApproxCompareBatch(
+    const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true,
+    const EqualOptions& options = TestingEqualOptions());
+
+// Check if the padding of the buffers of the array is zero.
+// Also cause valgrind warnings if the padding bytes are uninitialized.
+ARROW_TESTING_EXPORT void AssertZeroPadded(const Array& array);
+
+// Check if the valid buffer bytes are initialized
+// and cause valgrind warnings otherwise.
+ARROW_TESTING_EXPORT void TestInitialized(const ArrayData& array);
+ARROW_TESTING_EXPORT void TestInitialized(const Array& array);
+
+#define DECL_T() typedef typename TestFixture::T T;
+
+#define DECL_TYPE() typedef typename TestFixture::Type Type;
+
+// ArrayFromJSON: construct an Array from a simple JSON representation
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> ArrayFromJSON(const std::shared_ptr<DataType>&,
+                                     std::string_view json);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> DictArrayFromJSON(const std::shared_ptr<DataType>& type,
+                                         std::string_view indices_json,
+                                         std::string_view dictionary_json);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<RecordBatch> RecordBatchFromJSON(const std::shared_ptr<Schema>&,
+                                                 std::string_view);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<ChunkedArray> ChunkedArrayFromJSON(const std::shared_ptr<DataType>&,
+                                                   const std::vector<std::string>& json);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Scalar> ScalarFromJSON(const std::shared_ptr<DataType>&,
+                                       std::string_view json);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Scalar> DictScalarFromJSON(const std::shared_ptr<DataType>&,
+                                           std::string_view index_json,
+                                           std::string_view dictionary_json);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Table> TableFromJSON(const std::shared_ptr<Schema>&,
+                                     const std::vector<std::string>& json);
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
+                                       std::string_view data, std::string_view shape,
+                                       std::string_view strides = "[]",
+                                       std::string_view dim_names = "[]");
+
+ARROW_TESTING_EXPORT
+std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
+                                       std::string_view data,
+                                       const std::vector<int64_t>& shape,
+                                       const std::vector<int64_t>& strides = {},
+                                       const std::vector<std::string>& dim_names = {});
+
+// Given an array, return a new identical array except for one validity bit
+// set to a new value.
+// This is useful to force the underlying "value" of null entries to otherwise
+// invalid data and check that errors don't get reported.
+ARROW_TESTING_EXPORT
+std::shared_ptr<Array> TweakValidityBit(const std::shared_ptr<Array>& array,
+                                        int64_t index, bool validity);
+
+ARROW_TESTING_EXPORT
+void SleepFor(double seconds);
+
+// Sleeps for a very small amount of time.  The thread will be yielded
+// at least once ensuring that context switches could happen.  It is intended
+// to be used for stress testing parallel code and shouldn't be assumed to do any
+// reliable timing.
+ARROW_TESTING_EXPORT
+void SleepABit();
+
+// Wait until predicate is true or timeout in seconds expires.
+ARROW_TESTING_EXPORT
+void BusyWait(double seconds, std::function<bool()> predicate);
+
+// \see SleepABit
+ARROW_TESTING_EXPORT
+Future<> SleepABitAsync();
+
+ARROW_TESTING_EXPORT bool FileIsClosed(int fd);
+
+template <typename T>
+std::vector<T> IteratorToVector(Iterator<T> iterator) {
+  EXPECT_OK_AND_ASSIGN(auto out, iterator.ToVector());
+  return out;
+}
+
+ARROW_TESTING_EXPORT
+bool LocaleExists(const char* locale);
+
+#ifndef _WIN32
+ARROW_TESTING_EXPORT
+void AssertChildExit(int child_pid, int expected_exit_status = 0);
+#endif
+
+// A RAII-style object that switches to a new locale, and switches back
+// to the old locale when going out of scope.  Doesn't do anything if the
+// new locale doesn't exist on the local machine.
+// ATTENTION: may crash with an assertion failure on Windows debug builds.
+// See ARROW-6108, also https://gerrit.libreoffice.org/#/c/54110/
+class ARROW_TESTING_EXPORT LocaleGuard {
+ public:
+  explicit LocaleGuard(const char* new_locale);
+  ~LocaleGuard();
+
+ protected:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+class ARROW_TESTING_EXPORT EnvVarGuard {
+ public:
+  EnvVarGuard(const std::string& name, const std::string& value);
+  ~EnvVarGuard();
+
+ protected:
+  const std::string name_;
+  std::string old_value_;
+  bool was_set_;
+};
+
+namespace internal {
+class SignalHandler;
+}
+
+class ARROW_TESTING_EXPORT SignalHandlerGuard {
+ public:
+  typedef void (*Callback)(int);
+
+  SignalHandlerGuard(int signum, Callback cb);
+  SignalHandlerGuard(int signum, const internal::SignalHandler& handler);
+  ~SignalHandlerGuard();
+
+ protected:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+#ifndef ARROW_LARGE_MEMORY_TESTS
+#  define LARGE_MEMORY_TEST(name) DISABLED_##name
+#else
+#  define LARGE_MEMORY_TEST(name) name
+#endif
+
+inline void PrintTo(const Status& st, std::ostream* os) { *os << st.ToString(); }
+
+template <typename T>
+void PrintTo(const Result<T>& result, std::ostream* os) {
+  if (result.ok()) {
+    ::testing::internal::UniversalPrint(result.ValueOrDie(), os);
+  } else {
+    *os << result.status();
+  }
+}
+
+// A data type with only move constructors (no copy, no default).
+struct MoveOnlyDataType {
+  explicit MoveOnlyDataType(int x) : data(new int(x)) {}
+
+  MoveOnlyDataType(const MoveOnlyDataType& other) = delete;
+  MoveOnlyDataType& operator=(const MoveOnlyDataType& other) = delete;
+
+  MoveOnlyDataType(MoveOnlyDataType&& other) { MoveFrom(&other); }
+  MoveOnlyDataType& operator=(MoveOnlyDataType&& other) {
+    MoveFrom(&other);
+    return *this;
+  }
+
+  MoveOnlyDataType& operator=(int x) {
+    if (data != nullptr) {
+      delete data;
+    }
+    data = new int(x);
+    return *this;
+  }
+
+  ~MoveOnlyDataType() { Destroy(); }
+
+  void Destroy() {
+    if (data != nullptr) {
+      delete data;
+      data = nullptr;
+      moves = -1;
+    }
+  }
+
+  void MoveFrom(MoveOnlyDataType* other) {
+    Destroy();
+    data = other->data;
+    other->data = nullptr;
+    moves = other->moves + 1;
+  }
+
+  int ToInt() const { return data == nullptr ? -42 : *data; }
+
+  bool operator==(const MoveOnlyDataType& other) const {
+    return data != nullptr && other.data != nullptr && *data == *other.data;
+  }
+  bool operator<(const MoveOnlyDataType& other) const {
+    return data == nullptr || (other.data != nullptr && *data < *other.data);
+  }
+
+  bool operator==(int other) const { return data != nullptr && *data == other; }
+  friend bool operator==(int left, const MoveOnlyDataType& right) {
+    return right == left;
+  }
+
+  int* data = nullptr;
+  int moves = 0;
+};
+
+// A task that blocks until unlocked.  Useful for timing tests.
+class ARROW_TESTING_EXPORT GatingTask {
+ public:
+  explicit GatingTask(double timeout_seconds = 10);
+  /// \brief During destruction we wait for all pending tasks to finish
+  ~GatingTask();
+
+  /// \brief Creates a new waiting task (presumably to spawn on a thread).  It will return
+  /// invalid if the timeout arrived before the unlock.  The task will not complete until
+  /// unlocked or timed out
+  ///
+  /// Note: The GatingTask must outlive any Task instances
+  std::function<void()> Task();
+  /// \brief Creates a new waiting task as a future.  The future will not complete
+  /// until unlocked.
+  Future<> AsyncTask();
+  /// \brief Waits until at least count tasks are running.
+  Status WaitForRunning(int count);
+  /// \brief Unlocks all waiting tasks.  Returns an invalid status if any waiting task has
+  /// timed out
+  Status Unlock();
+
+  static std::shared_ptr<GatingTask> Make(double timeout_seconds = 10);
+
+ private:
+  class Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+/// \brief create an exact copy of the data where each buffer has a max alignment of 1
+///
+/// This method does not recurse into the dictionary or children
+ARROW_TESTING_EXPORT std::shared_ptr<ArrayData> UnalignBuffers(const ArrayData& array);
+/// \brief create an exact copy of the array where each buffer has a max alignment of 1
+///
+/// This method does not recurse into the dictionary or children
+ARROW_TESTING_EXPORT std::shared_ptr<Array> UnalignBuffers(const Array& array);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/matchers.h b/pyarrow/include/arrow/testing/matchers.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e1bae47381a763ca53fc21aa66bffa91ff68152
--- /dev/null
+++ b/pyarrow/include/arrow/testing/matchers.h
@@ -0,0 +1,467 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+
+#include <gmock/gmock-matchers.h>
+
+#include "arrow/datum.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/stl_iterator.h"
+#include "arrow/testing/future_util.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/future.h"
+#include "arrow/util/unreachable.h"
+
+namespace arrow {
+
+class PointeesEqualMatcher {
+ public:
+  template <typename PtrPair>
+  operator testing::Matcher<PtrPair>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const PtrPair&> {
+      void DescribeTo(::std::ostream* os) const override { *os << "pointees are equal"; }
+
+      void DescribeNegationTo(::std::ostream* os) const override {
+        *os << "pointees are not equal";
+      }
+
+      bool MatchAndExplain(const PtrPair& pair,
+                           testing::MatchResultListener* listener) const override {
+        const auto& first = *std::get<0>(pair);
+        const auto& second = *std::get<1>(pair);
+        const bool match = first.Equals(second);
+        *listener << "whose pointees " << testing::PrintToString(first) << " and "
+                  << testing::PrintToString(second)
+                  << (match ? " are equal" : " are not equal");
+        return match;
+      }
+    };
+
+    return testing::Matcher<PtrPair>(new Impl());
+  }
+};
+
+// A matcher that checks that the values pointed to are Equals().
+// Useful in conjunction with other googletest matchers.
+inline PointeesEqualMatcher PointeesEqual() { return {}; }
+
+class AnyOfJSONMatcher {
+ public:
+  AnyOfJSONMatcher(std::shared_ptr<DataType> type, std::string array_json)
+      : type_(std::move(type)), array_json_(std::move(array_json)) {}
+
+  template <typename arg_type>
+  operator testing::Matcher<arg_type>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const arg_type&> {
+      static_assert(std::is_same<arg_type, std::shared_ptr<Scalar>>(),
+                    "AnyOfJSON only supported for std::shared_ptr<Scalar>");
+      Impl(std::shared_ptr<DataType> type, std::string array_json)
+          : type_(std::move(type)), array_json_(std::move(array_json)) {
+        array = arrow::ArrayFromJSON(type_, array_json_);
+      }
+      void DescribeTo(std::ostream* os) const override {
+        *os << "matches at least one scalar from ";
+        *os << array->ToString();
+      }
+      void DescribeNegationTo(::std::ostream* os) const override {
+        *os << "matches no scalar from ";
+        *os << array->ToString();
+      }
+      bool MatchAndExplain(
+          const arg_type& arg,
+          ::testing::MatchResultListener* result_listener) const override {
+        for (int64_t i = 0; i < array->length(); ++i) {
+          std::shared_ptr<Scalar> scalar;
+          auto maybe_scalar = array->GetScalar(i);
+          if (maybe_scalar.ok()) {
+            scalar = maybe_scalar.ValueOrDie();
+          } else {
+            *result_listener << "GetScalar() had status "
+                             << maybe_scalar.status().ToString() << "at index " << i
+                             << " in the input JSON Array";
+            return false;
+          }
+
+          if (scalar->Equals(*arg)) return true;
+        }
+        *result_listener << "Argument scalar: '" << arg->ToString()
+                         << "' matches no scalar from " << array->ToString();
+        return false;
+      }
+      const std::shared_ptr<DataType> type_;
+      const std::string array_json_;
+      std::shared_ptr<Array> array;
+    };
+
+    return testing::Matcher<arg_type>(new Impl(type_, array_json_));
+  }
+
+ private:
+  const std::shared_ptr<DataType> type_;
+  const std::string array_json_;
+};
+
+inline AnyOfJSONMatcher AnyOfJSON(std::shared_ptr<DataType> type,
+                                  std::string array_json) {
+  return {std::move(type), std::move(array_json)};
+}
+
+template <typename ResultMatcher>
+class FutureMatcher {
+ public:
+  explicit FutureMatcher(ResultMatcher result_matcher, double wait_seconds)
+      : result_matcher_(std::move(result_matcher)), wait_seconds_(wait_seconds) {}
+
+  template <typename Fut,
+            typename ValueType = typename std::decay<Fut>::type::ValueType>
+  operator testing::Matcher<Fut>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const Fut&> {
+      explicit Impl(const ResultMatcher& result_matcher, double wait_seconds)
+          : result_matcher_(testing::MatcherCast<Result<ValueType>>(result_matcher)),
+            wait_seconds_(wait_seconds) {}
+
+      void DescribeTo(::std::ostream* os) const override {
+        *os << "value ";
+        result_matcher_.DescribeTo(os);
+      }
+
+      void DescribeNegationTo(::std::ostream* os) const override {
+        *os << "value ";
+        result_matcher_.DescribeNegationTo(os);
+      }
+
+      bool MatchAndExplain(const Fut& fut,
+                           testing::MatchResultListener* listener) const override {
+        if (!fut.Wait(wait_seconds_)) {
+          *listener << "which didn't finish within " << wait_seconds_ << " seconds";
+          return false;
+        }
+        return result_matcher_.MatchAndExplain(fut.result(), listener);
+      }
+
+      const testing::Matcher<Result<ValueType>> result_matcher_;
+      const double wait_seconds_;
+    };
+
+    return testing::Matcher<Fut>(new Impl(result_matcher_, wait_seconds_));
+  }
+
+ private:
+  const ResultMatcher result_matcher_;
+  const double wait_seconds_;
+};
+
+template <typename ValueMatcher>
+class ResultMatcher {
+ public:
+  explicit ResultMatcher(ValueMatcher value_matcher)
+      : value_matcher_(std::move(value_matcher)) {}
+
+  template <typename Res,
+            typename ValueType = typename std::decay<Res>::type::ValueType>
+  operator testing::Matcher<Res>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const Res&> {
+      explicit Impl(const ValueMatcher& value_matcher)
+          : value_matcher_(testing::MatcherCast<ValueType>(value_matcher)) {}
+
+      void DescribeTo(::std::ostream* os) const override {
+        *os << "value ";
+        value_matcher_.DescribeTo(os);
+      }
+
+      void DescribeNegationTo(::std::ostream* os) const override {
+        *os << "value ";
+        value_matcher_.DescribeNegationTo(os);
+      }
+
+      bool MatchAndExplain(const Res& maybe_value,
+                           testing::MatchResultListener* listener) const override {
+        if (!maybe_value.status().ok()) {
+          *listener << "whose error "
+                    << testing::PrintToString(maybe_value.status().ToString())
+                    << " doesn't match";
+          return false;
+        }
+        const ValueType& value = maybe_value.ValueOrDie();
+        testing::StringMatchResultListener value_listener;
+        const bool match = value_matcher_.MatchAndExplain(value, &value_listener);
+        *listener << "whose value " << testing::PrintToString(value)
+                  << (match ? " matches" : " doesn't match");
+        testing::internal::PrintIfNotEmpty(value_listener.str(), listener->stream());
+        return match;
+      }
+
+      const testing::Matcher<ValueType> value_matcher_;
+    };
+
+    return testing::Matcher<Res>(new Impl(value_matcher_));
+  }
+
+ private:
+  const ValueMatcher value_matcher_;
+};
+
+class ErrorMatcher {
+ public:
+  explicit ErrorMatcher(StatusCode code,
+                        std::optional<testing::Matcher<std::string>> message_matcher)
+      : code_(code), message_matcher_(std::move(message_matcher)) {}
+
+  template <typename Res>
+  operator testing::Matcher<Res>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const Res&> {
+      explicit Impl(StatusCode code,
+                    std::optional<testing::Matcher<std::string>> message_matcher)
+          : code_(code), message_matcher_(std::move(message_matcher)) {}
+
+      void DescribeTo(::std::ostream* os) const override {
+        *os << "raises StatusCode::" << Status::CodeAsString(code_);
+        if (message_matcher_) {
+          *os << " and message ";
+          message_matcher_->DescribeTo(os);
+        }
+      }
+
+      void DescribeNegationTo(::std::ostream* os) const override {
+        *os << "does not raise StatusCode::" << Status::CodeAsString(code_);
+        if (message_matcher_) {
+          *os << " or message ";
+          message_matcher_->DescribeNegationTo(os);
+        }
+      }
+
+      bool MatchAndExplain(const Res& maybe_value,
+                           testing::MatchResultListener* listener) const override {
+        const Status& status = ToStatus(maybe_value);
+        testing::StringMatchResultListener value_listener;
+
+        bool match = status.code() == code_;
+        if (message_matcher_) {
+          match = match &&
+                  message_matcher_->MatchAndExplain(status.message(), &value_listener);
+        }
+
+        if (match) {
+          *listener << "whose error matches";
+        } else if (status.ok()) {
+          *listener << "whose non-error doesn't match";
+        } else {
+          *listener << "whose error doesn't match";
+        }
+
+        testing::internal::PrintIfNotEmpty(value_listener.str(), listener->stream());
+        return match;
+      }
+
+      const StatusCode code_;
+      const std::optional<testing::Matcher<std::string>> message_matcher_;
+    };
+
+    return testing::Matcher<Res>(new Impl(code_, message_matcher_));
+  }
+
+ private:
+  const StatusCode code_;
+  const std::optional<testing::Matcher<std::string>> message_matcher_;
+};
+
+class OkMatcher {
+ public:
+  template <typename Res>
+  operator testing::Matcher<Res>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const Res&> {
+      void DescribeTo(::std::ostream* os) const override { *os << "is ok"; }
+
+      void DescribeNegationTo(::std::ostream* os) const override { *os << "is not ok"; }
+
+      bool MatchAndExplain(const Res& maybe_value,
+                           testing::MatchResultListener* listener) const override {
+        const Status& status = ToStatus(maybe_value);
+
+        const bool match = status.ok();
+        *listener << "whose " << (match ? "non-error matches" : "error doesn't match");
+        return match;
+      }
+    };
+
+    return testing::Matcher<Res>(new Impl());
+  }
+};
+
+// Returns a matcher that waits on a Future (by default for 16 seconds)
+// then applies a matcher to the result.
+template <typename ResultMatcher>
+FutureMatcher<ResultMatcher> Finishes(
+    const ResultMatcher& result_matcher,
+    double wait_seconds = kDefaultAssertFinishesWaitSeconds) {
+  return FutureMatcher<ResultMatcher>(result_matcher, wait_seconds);
+}
+
+// Returns a matcher that matches the value of a successful Result<T>.
+template <typename ValueMatcher>
+ResultMatcher<ValueMatcher> ResultWith(const ValueMatcher& value_matcher) {
+  return ResultMatcher<ValueMatcher>(value_matcher);
+}
+
+// Returns a matcher that matches an ok Status or Result<T>.
+inline OkMatcher Ok() { return {}; }
+
+// Returns a matcher that matches the StatusCode of a Status or Result<T>.
+// Do not use Raises(StatusCode::OK) to match a non error code.
+inline ErrorMatcher Raises(StatusCode code) { return ErrorMatcher(code, std::nullopt); }
+
+// Returns a matcher that matches the StatusCode and message of a Status or Result<T>.
+template <typename MessageMatcher>
+ErrorMatcher Raises(StatusCode code, const MessageMatcher& message_matcher) {
+  return ErrorMatcher(code, testing::MatcherCast<std::string>(message_matcher));
+}
+
+class DataEqMatcher {
+ public:
+  // TODO(bkietz) support EqualOptions, ApproxEquals, etc
+  // Probably it's better to use something like config-through-key_value_metadata
+  // as with the random generators to decouple this from EqualOptions etc.
+  explicit DataEqMatcher(Datum expected) : expected_(std::move(expected)) {}
+
+  template <typename Data>
+  operator testing::Matcher<Data>() const {  // NOLINT runtime/explicit
+    struct Impl : testing::MatcherInterface<const Data&> {
+      explicit Impl(Datum expected) : expected_(std::move(expected)) {}
+
+      void DescribeTo(::std::ostream* os) const override {
+        *os << "has data ";
+        PrintTo(expected_, os);
+      }
+
+      void DescribeNegationTo(::std::ostream* os) const override {
+        *os << "doesn't have data ";
+        PrintTo(expected_, os);
+      }
+
+      bool MatchAndExplain(const Data& data,
+                           testing::MatchResultListener* listener) const override {
+        Datum boxed(data);
+
+        if (boxed.kind() != expected_.kind()) {
+          *listener << "whose Datum::kind " << boxed.ToString() << " doesn't match "
+                    << expected_.ToString();
+          return false;
+        }
+
+        if (const auto& boxed_type = boxed.type()) {
+          if (*boxed_type != *expected_.type()) {
+            *listener << "whose DataType " << boxed_type->ToString() << " doesn't match "
+                      << expected_.type()->ToString();
+            return false;
+          }
+        } else if (const auto& boxed_schema = boxed.schema()) {
+          if (*boxed_schema != *expected_.schema()) {
+            *listener << "whose Schema " << boxed_schema->ToString() << " doesn't match "
+                      << expected_.schema()->ToString();
+            return false;
+          }
+        } else {
+          Unreachable();
+        }
+
+        if (boxed == expected_) {
+          *listener << "whose value matches";
+          return true;
+        }
+
+        if (listener->IsInterested() && boxed.kind() == Datum::ARRAY) {
+          *listener << "whose value differs from the expected value by "
+                    << boxed.make_array()->Diff(*expected_.make_array());
+        } else {
+          *listener << "whose value doesn't match";
+        }
+        return false;
+      }
+
+      Datum expected_;
+    };
+
+    return testing::Matcher<Data>(new Impl(expected_));
+  }
+
+ private:
+  Datum expected_;
+};
+
+/// Constructs a datum against which arguments are matched
+template <typename Data>
+DataEqMatcher DataEq(Data&& dat) {
+  return DataEqMatcher(Datum(std::forward<Data>(dat)));
+}
+
+/// Constructs an array with ArrayFromJSON against which arguments are matched
+inline DataEqMatcher DataEqArray(const std::shared_ptr<DataType>& type,
+                                 std::string_view json) {
+  return DataEq(arrow::ArrayFromJSON(type, json));
+}
+
+/// Constructs an array from a vector of optionals against which arguments are matched
+template <typename T, typename ArrayType = typename TypeTraits<T>::ArrayType,
+          typename BuilderType = typename TypeTraits<T>::BuilderType,
+          typename ValueType =
+              typename ::arrow::stl::detail::DefaultValueAccessor<ArrayType>::ValueType>
+DataEqMatcher DataEqArray(T type, const std::vector<std::optional<ValueType>>& values) {
+  // FIXME(bkietz) broken until DataType is move constructible
+  BuilderType builder(std::make_shared<T>(std::move(type)), default_memory_pool());
+  DCHECK_OK(builder.Reserve(static_cast<int64_t>(values.size())));
+
+  // pseudo constexpr:
+  static const bool need_safe_append = !is_fixed_width(T::type_id);
+
+  for (auto value : values) {
+    if (need_safe_append) {
+      DCHECK_OK(builder.AppendOrNull(value));
+    } else {
+      builder.UnsafeAppendOrNull(value);
+    }
+  }
+
+  return DataEq(builder.Finish().ValueOrDie());
+}
+
+/// Constructs a scalar with ScalarFromJSON against which arguments are matched
+inline DataEqMatcher DataEqScalar(const std::shared_ptr<DataType>& type,
+                                  std::string_view json) {
+  return DataEq(ScalarFromJSON(type, json));
+}
+
+/// Constructs a scalar against which arguments are matched
+template <typename T, typename ScalarType = typename TypeTraits<T>::ScalarType,
+          typename ValueType = typename ScalarType::ValueType>
+DataEqMatcher DataEqScalar(T type, std::optional<ValueType> value) {
+  ScalarType expected(std::make_shared<T>(std::move(type)));
+
+  if (value) {
+    expected.is_valid = true;
+    expected.value = std::move(*value);
+  }
+
+  return DataEq(std::move(expected));
+}
+
+// HasType, HasSchema matchers
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/math.h b/pyarrow/include/arrow/testing/math.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e829e0d616ba85f8738ea01bfe3732c895c15f9
--- /dev/null
+++ b/pyarrow/include/arrow/testing/math.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/testing/visibility.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+
+ARROW_TESTING_EXPORT
+bool WithinUlp(util::Float16 left, util::Float16 right, int n_ulps);
+ARROW_TESTING_EXPORT
+bool WithinUlp(float left, float right, int n_ulps);
+ARROW_TESTING_EXPORT
+bool WithinUlp(double left, double right, int n_ulps);
+
+ARROW_TESTING_EXPORT
+void AssertWithinUlp(util::Float16 left, util::Float16 right, int n_ulps);
+ARROW_TESTING_EXPORT
+void AssertWithinUlp(float left, float right, int n_ulps);
+ARROW_TESTING_EXPORT
+void AssertWithinUlp(double left, double right, int n_ulps);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/process.h b/pyarrow/include/arrow/testing/process.h
new file mode 100644
index 0000000000000000000000000000000000000000..d4d2ae124f427b8143e7fb9d69c6b97e914befd5
--- /dev/null
+++ b/pyarrow/include/arrow/testing/process.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/testing/visibility.h"
+
+namespace arrow::util {
+
+class ARROW_TESTING_EXPORT Process {
+ public:
+  Process();
+  ~Process();
+
+  Status SetExecutable(const std::string& path);
+  void SetArgs(const std::vector<std::string>& args);
+  void SetEnv(const std::string& name, const std::string& value);
+  void IgnoreStderr();
+  Status Execute();
+  bool IsRunning();
+  uint64_t pid();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+}  // namespace arrow::util
diff --git a/pyarrow/include/arrow/testing/random.h b/pyarrow/include/arrow/testing/random.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9122915a092cdad5873465ef8778d9f7619fa9f
--- /dev/null
+++ b/pyarrow/include/arrow/testing/random.h
@@ -0,0 +1,758 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include "arrow/testing/uniform_real.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/float16.h"
+
+namespace arrow {
+
+class Array;
+
+namespace random {
+
+using SeedType = int32_t;
+constexpr SeedType kSeedMax = std::numeric_limits<SeedType>::max();
+
+class ARROW_TESTING_EXPORT RandomArrayGenerator {
+ public:
+  explicit RandomArrayGenerator(SeedType seed)
+      : seed_distribution_(static_cast<SeedType>(1), kSeedMax), seed_rng_(seed) {}
+
+  /// \brief Generate a null bitmap
+  ///
+  /// \param[in] size the size of the bitmap to generate
+  /// \param[in] null_probability the probability of a bit being zero
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Buffer
+  std::shared_ptr<Buffer> NullBitmap(int64_t size, double null_probability = 0,
+                                     int64_t alignment = kDefaultBufferAlignment,
+                                     MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random BooleanArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] true_probability the probability of a value being 1 / bit-set
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Boolean(int64_t size, double true_probability,
+                                 double null_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+  /// \brief Generate a random UInt8Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> UInt8(int64_t size, uint8_t min, uint8_t max,
+                               double null_probability = 0,
+                               int64_t alignment = kDefaultBufferAlignment,
+                               MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Int8Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Int8(int64_t size, int8_t min, int8_t max,
+                              double null_probability = 0,
+                              int64_t alignment = kDefaultBufferAlignment,
+                              MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random UInt16Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> UInt16(int64_t size, uint16_t min, uint16_t max,
+                                double null_probability = 0,
+                                int64_t alignment = kDefaultBufferAlignment,
+                                MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Int16Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Int16(int64_t size, int16_t min, int16_t max,
+                               double null_probability = 0,
+                               int64_t alignment = kDefaultBufferAlignment,
+                               MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random UInt32Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> UInt32(int64_t size, uint32_t min, uint32_t max,
+                                double null_probability = 0,
+                                int64_t alignment = kDefaultBufferAlignment,
+                                MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Int32Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Int32(int64_t size, int32_t min, int32_t max,
+                               double null_probability = 0,
+                               int64_t alignment = kDefaultBufferAlignment,
+                               MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random UInt64Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> UInt64(int64_t size, uint64_t min, uint64_t max,
+                                double null_probability = 0,
+                                int64_t alignment = kDefaultBufferAlignment,
+                                MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Int64Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Int64(int64_t size, int64_t min, int64_t max,
+                               double null_probability = 0,
+                               int64_t alignment = kDefaultBufferAlignment,
+                               MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random HalfFloatArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the distribution
+  /// \param[in] max the upper bound of the distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  ///
+  /// \deprecated Deprecated in 22.0.0. Use the other Float16() method that accepts
+  /// nan_probability as a parameter
+  ARROW_DEPRECATED(
+      "Deprecated in 22.0.0. Use the other Float16() method that accepts nan_probability "
+      "as a parameter")
+  std::shared_ptr<Array> Float16(int64_t size, uint16_t min, uint16_t max,
+                                 double null_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random HalfFloatArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] nan_probability the probability of a value being NaN
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Float16(int64_t size, util::Float16 min, util::Float16 max,
+                                 double null_probability = 0, double nan_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random FloatArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] nan_probability the probability of a value being NaN
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Float32(int64_t size, float min, float max,
+                                 double null_probability = 0, double nan_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random DoubleArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] nan_probability the probability of a value being NaN
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Float64(int64_t size, double min, double max,
+                                 double null_probability = 0, double nan_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Date64Array
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min the lower bound of the uniform distribution
+  /// \param[in] max the upper bound of the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Date64(int64_t size, int64_t min, int64_t max,
+                                double null_probability = 0,
+                                int64_t alignment = kDefaultBufferAlignment,
+                                MemoryPool* memory_pool = default_memory_pool());
+
+  template <typename ArrowType, typename CType = typename ArrowType::c_type>
+  std::shared_ptr<Array> Numeric(int64_t size, CType min, CType max,
+                                 double null_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool()) {
+    switch (ArrowType::type_id) {
+      case Type::UINT8:
+        return UInt8(size, static_cast<uint8_t>(min), static_cast<uint8_t>(max),
+                     null_probability, alignment, memory_pool);
+      case Type::INT8:
+        return Int8(size, static_cast<int8_t>(min), static_cast<int8_t>(max),
+                    null_probability, alignment, memory_pool);
+      case Type::UINT16:
+        return UInt16(size, static_cast<uint16_t>(min), static_cast<uint16_t>(max),
+                      null_probability, alignment, memory_pool);
+      case Type::INT16:
+        return Int16(size, static_cast<int16_t>(min), static_cast<int16_t>(max),
+                     null_probability, alignment, memory_pool);
+      case Type::UINT32:
+        return UInt32(size, static_cast<uint32_t>(min), static_cast<uint32_t>(max),
+                      null_probability, alignment, memory_pool);
+      case Type::INT32:
+        return Int32(size, static_cast<int32_t>(min), static_cast<int32_t>(max),
+                     null_probability, alignment, memory_pool);
+      case Type::UINT64:
+        return UInt64(size, static_cast<uint64_t>(min), static_cast<uint64_t>(max),
+                      null_probability, alignment, memory_pool);
+      case Type::INT64:
+        return Int64(size, static_cast<int64_t>(min), static_cast<int64_t>(max),
+                     null_probability, alignment, memory_pool);
+      case Type::HALF_FLOAT:
+        return Float16(size, util::Float16::FromBits(static_cast<uint16_t>(min)),
+                       util::Float16::FromBits(static_cast<uint16_t>(max)),
+                       null_probability, /*nan_probability=*/0, alignment, memory_pool);
+      case Type::FLOAT:
+        return Float32(size, static_cast<float>(min), static_cast<float>(max),
+                       null_probability, /*nan_probability=*/0, alignment, memory_pool);
+      case Type::DOUBLE:
+        return Float64(size, static_cast<double>(min), static_cast<double>(max),
+                       null_probability, /*nan_probability=*/0, alignment, memory_pool);
+      case Type::DATE64:
+        return Date64(size, static_cast<int64_t>(min), static_cast<int64_t>(max),
+                      null_probability, alignment, memory_pool);
+      default:
+        return nullptr;
+    }
+  }
+
+  /// \brief Generate a random Decimal32Array
+  ///
+  /// \param[in] type the type of the array to generate
+  ///            (must be an instance of Decimal32Type)
+  /// \param[in] size the size of the array to generate
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Decimal32(std::shared_ptr<DataType> type, int64_t size,
+                                   double null_probability = 0,
+                                   int64_t alignment = kDefaultBufferAlignment,
+                                   MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Decimal64Array
+  ///
+  /// \param[in] type the type of the array to generate
+  ///            (must be an instance of Decimal64Type)
+  /// \param[in] size the size of the array to generate
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Decimal64(std::shared_ptr<DataType> type, int64_t size,
+                                   double null_probability = 0,
+                                   int64_t alignment = kDefaultBufferAlignment,
+                                   MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Decimal128Array
+  ///
+  /// \param[in] type the type of the array to generate
+  ///            (must be an instance of Decimal128Type)
+  /// \param[in] size the size of the array to generate
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Decimal128(std::shared_ptr<DataType> type, int64_t size,
+                                    double null_probability = 0,
+                                    int64_t alignment = kDefaultBufferAlignment,
+                                    MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Decimal256Array
+  ///
+  /// \param[in] type the type of the array to generate
+  ///            (must be an instance of Decimal256Type)
+  /// \param[in] size the size of the array to generate
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Decimal256(std::shared_ptr<DataType> type, int64_t size,
+                                    double null_probability = 0,
+                                    int64_t alignment = kDefaultBufferAlignment,
+                                    MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate an array of offsets (for use in e.g. ListArray::FromArrays)
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] first_offset the first offset value (usually 0)
+  /// \param[in] last_offset the last offset value (usually the size of the child array)
+  /// \param[in] null_probability the probability of an offset being null
+  /// \param[in] force_empty_nulls if true, null offsets must have 0 "length"
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Offsets(int64_t size, int32_t first_offset, int32_t last_offset,
+                                 double null_probability = 0,
+                                 bool force_empty_nulls = false,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  std::shared_ptr<Array> LargeOffsets(int64_t size, int64_t first_offset,
+                                      int64_t last_offset, double null_probability = 0,
+                                      bool force_empty_nulls = false,
+                                      int64_t alignment = kDefaultBufferAlignment,
+                                      MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random StringArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> String(int64_t size, int32_t min_length, int32_t max_length,
+                                double null_probability = 0,
+                                int64_t alignment = kDefaultBufferAlignment,
+                                MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random StringViewArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] max_data_buffer_length the data buffer size at which
+  ///            a new chunk will be generated
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> StringView(int64_t size, int32_t min_length, int32_t max_length,
+                                    double null_probability = 0,
+                                    std::optional<int64_t> max_data_buffer_length = {},
+                                    int64_t alignment = kDefaultBufferAlignment,
+                                    MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random LargeStringArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> LargeString(int64_t size, int32_t min_length, int32_t max_length,
+                                     double null_probability = 0,
+                                     int64_t alignment = kDefaultBufferAlignment,
+                                     MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random StringArray with repeated values
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] unique the number of unique string values used
+  ///            to populate the array
+  /// \param[in] min_length the lower bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] max_length the upper bound of the string length
+  ///            determined by the uniform distribution
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> StringWithRepeats(
+      int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
+      double null_probability = 0, int64_t alignment = kDefaultBufferAlignment,
+      MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Like StringWithRepeats but return BinaryArray
+  std::shared_ptr<Array> BinaryWithRepeats(
+      int64_t size, int64_t unique, int32_t min_length, int32_t max_length,
+      double null_probability = 0, int64_t alignment = kDefaultBufferAlignment,
+      MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random FixedSizeBinaryArray
+  ///
+  /// \param[in] size the size of the array to generate
+  /// \param[in] byte_width the byte width of fixed-size binary items
+  /// \param[in] null_probability the probability of a value being null
+  /// \param[in] min_byte the lower bound of each byte in the binary determined by the
+  ///            uniform distribution
+  /// \param[in] max_byte the upper bound of each byte in the binary determined by the
+  ///            uniform distribution
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> FixedSizeBinary(int64_t size, int32_t byte_width,
+                                         double null_probability = 0,
+                                         uint8_t min_byte = static_cast<uint8_t>('A'),
+                                         uint8_t max_byte = static_cast<uint8_t>('z'),
+                                         int64_t alignment = kDefaultBufferAlignment,
+                                         MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random ListArray
+  ///
+  /// \param[in] values The underlying values array
+  /// \param[in] size The size of the generated list array
+  /// \param[in] null_probability the probability of a list value being null
+  /// \param[in] force_empty_nulls if true, null list entries must have 0 length
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> List(const Array& values, int64_t size,
+                              double null_probability = 0, bool force_empty_nulls = false,
+                              int64_t alignment = kDefaultBufferAlignment,
+                              MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random ListViewArray
+  ///
+  /// \param[in] values The underlying values array
+  /// \param[in] size The size of the generated list array
+  /// \param[in] null_probability the probability of a list value being null
+  /// \param[in] force_empty_nulls if true, null list entries must have 0 length
+  /// must be set to 0
+  /// \param[in] coverage proportion of the values array covered by list-views
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> ListView(const Array& values, int64_t size,
+                                  double null_probability = 0,
+                                  bool force_empty_nulls = false, double coverage = 1.0,
+                                  int64_t alignment = kDefaultBufferAlignment,
+                                  MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random LargeListViewArray
+  ///
+  /// \param[in] values The underlying values array
+  /// \param[in] size The size of the generated list array
+  /// \param[in] null_probability the probability of a list value being null
+  /// \param[in] force_empty_nulls if true, null list entries must have 0 length
+  /// must be set to 0
+  /// \param[in] coverage proportion of the values array covered by list-views
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> LargeListView(const Array& values, int64_t size,
+                                       double null_probability = 0,
+                                       bool force_empty_nulls = false,
+                                       double coverage = 1.0,
+                                       int64_t alignment = kDefaultBufferAlignment,
+                                       MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random MapArray
+  ///
+  /// \param[in] keys The underlying keys array
+  /// \param[in] items The underlying items array
+  /// \param[in] size The size of the generated map array
+  /// \param[in] null_probability the probability of a map value being null
+  /// \param[in] force_empty_nulls if true, null map entries must have 0 length
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> Map(const std::shared_ptr<Array>& keys,
+                             const std::shared_ptr<Array>& items, int64_t size,
+                             double null_probability = 0, bool force_empty_nulls = false,
+                             int64_t alignment = kDefaultBufferAlignment,
+                             MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random RunEndEncodedArray
+  ///
+  /// \param[in] value_type The DataType of the encoded values
+  /// \param[in] logical_size The logical length of the generated array
+  /// \param[in] null_probability the probability of a value being null
+  ///
+  /// \return a generated Array
+  std::shared_ptr<Array> RunEndEncoded(std::shared_ptr<DataType> value_type,
+                                       int64_t logical_size,
+                                       double null_probability = 0.0);
+
+  /// \brief Generate a random SparseUnionArray
+  ///
+  /// The type ids are chosen randomly, according to a uniform distribution,
+  /// amongst the given child fields.
+  ///
+  /// \param[in] fields Vector of Arrays containing the data for each union field
+  /// \param[in] size The size of the generated sparse union array
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  std::shared_ptr<Array> SparseUnion(const ArrayVector& fields, int64_t size,
+                                     int64_t alignment = kDefaultBufferAlignment,
+                                     MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random DenseUnionArray
+  ///
+  /// The type ids are chosen randomly, according to a uniform distribution,
+  /// amongst the given child fields.  The offsets are incremented along
+  /// each child field.
+  ///
+  /// \param[in] fields Vector of Arrays containing the data for each union field
+  /// \param[in] size The size of the generated sparse union array
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  std::shared_ptr<Array> DenseUnion(const ArrayVector& fields, int64_t size,
+                                    int64_t alignment = kDefaultBufferAlignment,
+                                    MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a random Array of the specified type, size, and null_probability.
+  ///
+  /// Generation parameters other than size and null_probability are determined based on
+  /// the type of Array to be generated.
+  /// If boolean the probabilities of true,false values are 0.25,0.75 respectively.
+  /// If numeric min,max will be the least and greatest representable values.
+  /// If string min_length,max_length will be 0,sqrt(size) respectively.
+  ///
+  /// \param[in] type the type of Array to generate
+  /// \param[in] size the size of the Array to generate
+  /// \param[in] null_probability the probability of a slot being null
+  /// \param[in] alignment alignment for memory allocations (in bytes)
+  /// \param[in] memory_pool memory pool to allocate memory from
+  /// \return a generated Array
+  std::shared_ptr<Array> ArrayOf(std::shared_ptr<DataType> type, int64_t size,
+                                 double null_probability = 0,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate an array with random data based on the given field. See BatchOf
+  /// for usage info.
+  std::shared_ptr<Array> ArrayOf(const Field& field, int64_t size,
+                                 int64_t alignment = kDefaultBufferAlignment,
+                                 MemoryPool* memory_pool = default_memory_pool());
+
+  /// \brief Generate a record batch with random data of the specified length.
+  ///
+  /// Generation options are read from key-value metadata for each field, and may be
+  /// specified at any nesting level. For example, generation options for the child
+  /// values of a list array can be specified by constructing the list type with
+  /// list(field("item", int8(), options_metadata))
+  ///
+  /// The following options are supported:
+  ///
+  /// For all types except NullType:
+  /// - null_probability (double): range [0.0, 1.0] the probability of a null value.
+  /// Default/value is 0.0 if the field is marked non-nullable, else it is 0.01
+  ///
+  /// For all numeric types T:
+  /// - min (T::c_type): the minimum value to generate (inclusive), default
+  ///   std::numeric_limits<T::c_type>::min()
+  /// - max (T::c_type): the maximum value to generate (inclusive), default
+  ///   std::numeric_limits<T::c_type>::max()
+  /// Note this means that, for example, min/max are int16_t values for HalfFloatType.
+  ///
+  /// For floating point types T for which is_physical_floating_type<T>:
+  /// - nan_probability (double): range [0.0, 1.0] the probability of a NaN value.
+  ///
+  /// For BooleanType:
+  /// - true_probability (double): range [0.0, 1.0] the probability of a true.
+  ///
+  /// For DictionaryType:
+  /// - values (int32_t): the size of the dictionary.
+  /// Other properties are passed to the generator for the dictionary indices. However,
+  /// min and max cannot be specified. Note it is not possible to otherwise customize
+  /// the generation of dictionary values.
+  ///
+  /// For list, string, and binary types T, including their large variants:
+  /// - min_length (T::offset_type): the minimum length of the child to generate,
+  ///   default 0
+  /// - max_length (T::offset_type): the minimum length of the child to generate,
+  ///   default 1024
+  ///
+  /// For string and binary types T (not including their large or view variants):
+  /// - unique (int32_t): if positive, this many distinct values will be generated
+  ///   and all array values will be one of these values, default -1
+  ///
+  /// For string and binary view types T:
+  /// - max_data_buffer_length (int64_t): the data buffer size at which a new chunk
+  ///   will be generated, default 32KB
+  ///
+  /// For MapType:
+  /// - values (int32_t): the number of key-value pairs to generate, which will be
+  ///   partitioned among the array values.
+  ///
+  /// For extension types:
+  /// - extension_allow_random_storage (bool): in general an extension array may have
+  ///   invariants on its storage beyond those already imposed by the arrow format,
+  ///   which may result in an invalid array if we just wrap randomly generated
+  ///   storage. Set this flag to explicitly allow wrapping of randomly generated
+  ///   storage.
+  std::shared_ptr<arrow::RecordBatch> BatchOf(
+      const FieldVector& fields, int64_t size,
+      int64_t alignment = kDefaultBufferAlignment,
+      MemoryPool* memory_pool = default_memory_pool());
+
+  SeedType seed() { return seed_distribution_(seed_rng_); }
+
+ private:
+  std::uniform_int_distribution<SeedType> seed_distribution_;
+  std::default_random_engine seed_rng_;
+};
+
+/// Generate a batch with random data. See RandomArrayGenerator::BatchOf.
+ARROW_TESTING_EXPORT
+std::shared_ptr<arrow::RecordBatch> GenerateBatch(
+    const FieldVector& fields, int64_t size, SeedType seed,
+    int64_t alignment = kDefaultBufferAlignment,
+    MemoryPool* memory_pool = default_memory_pool());
+
+/// Generate an array with random data. See RandomArrayGenerator::BatchOf.
+ARROW_TESTING_EXPORT
+std::shared_ptr<arrow::Array> GenerateArray(
+    const Field& field, int64_t size, SeedType seed,
+    int64_t alignment = kDefaultBufferAlignment,
+    MemoryPool* memory_pool = default_memory_pool());
+
+}  // namespace random
+
+//
+// Assorted functions
+//
+
+ARROW_TESTING_EXPORT
+void rand_day_millis(int64_t N, std::vector<DayTimeIntervalType::DayMilliseconds>* out);
+ARROW_TESTING_EXPORT
+void rand_month_day_nanos(int64_t N,
+                          std::vector<MonthDayNanoIntervalType::MonthDayNanos>* out);
+
+template <typename T, typename U>
+void randint(int64_t N, T lower, T upper, std::vector<U>* out) {
+  const int random_seed = 0;
+  std::default_random_engine gen(random_seed);
+  std::uniform_int_distribution<T> d(lower, upper);
+  out->resize(N, static_cast<T>(0));
+  std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast<U>(d(gen)); });
+}
+
+template <typename T, typename U>
+void random_real(int64_t n, uint32_t seed, T min_value, T max_value,
+                 std::vector<U>* out) {
+  std::default_random_engine gen(seed);
+  ::arrow::random::uniform_real_distribution<T> d(min_value, max_value);
+  out->resize(n, static_cast<T>(0));
+  std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast<U>(d(gen)); });
+}
+
+template <typename T, typename U>
+void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) {
+  assert(out || (n == 0));
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<T> d(min_value, max_value);
+  std::generate(out, out + n, [&d, &gen] { return static_cast<U>(d(gen)); });
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/uniform_real.h b/pyarrow/include/arrow/testing/uniform_real.h
new file mode 100644
index 0000000000000000000000000000000000000000..8aa04a83288d9f8ce39a2d7c92b528ac9742bf98
--- /dev/null
+++ b/pyarrow/include/arrow/testing/uniform_real.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Random real generation is very slow on Arm if built with clang + libstdc++
+// due to software emulated long double arithmetic.
+// This file ports some random real libs from llvm libc++ library, which are
+// free from long double calculation.
+// It improves performance significantly on both Arm (~100x) and x86 (~8x) in
+// generating random reals when built with clang + gnu libstdc++.
+// Based on: https://github.com/llvm/llvm-project/tree/main/libcxx
+
+#pragma once
+
+#include <limits>
+
+#include <arrow/util/bit_util.h>
+
+namespace arrow {
+namespace random {
+
+namespace detail {
+
+// std::generate_canonical, simplified
+// https://en.cppreference.com/w/cpp/numeric/random/generate_canonical
+template <typename RealType, typename Rng>
+RealType generate_canonical(Rng& rng) {
+  const size_t b = std::numeric_limits<RealType>::digits;
+  const size_t log2R = 63 - ::arrow::bit_util::CountLeadingZeros(
+                                static_cast<uint64_t>(Rng::max() - Rng::min()) + 1);
+  const size_t k = b / log2R + (b % log2R != 0) + (b == 0);
+  const RealType r = static_cast<RealType>(Rng::max() - Rng::min()) + 1;
+  RealType base = r;
+  RealType sp = static_cast<RealType>(rng() - Rng::min());
+  for (size_t i = 1; i < k; ++i, base *= r) {
+    sp += (rng() - Rng::min()) * base;
+  }
+  return sp / base;
+}
+
+}  // namespace detail
+
+// std::uniform_real_distribution, simplified
+// https://en.cppreference.com/w/cpp/numeric/random/uniform_real_distribution
+template <typename RealType = double>
+struct uniform_real_distribution {
+  const RealType a, b;
+
+  explicit uniform_real_distribution(RealType a = 0, RealType b = 1) : a(a), b(b) {}
+
+  template <typename Rng>
+  RealType operator()(Rng& rng) {
+    return (b - a) * detail::generate_canonical<RealType>(rng) + a;
+  }
+};
+
+// std::bernoulli_distribution, simplified
+// https://en.cppreference.com/w/cpp/numeric/random/bernoulli_distribution
+struct bernoulli_distribution {
+  const double p;
+
+  explicit bernoulli_distribution(double p = 0.5) : p(p) {}
+
+  template <class Rng>
+  bool operator()(Rng& rng) {
+    return detail::generate_canonical<double>(rng) < p;
+  }
+};
+
+}  // namespace random
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/util.h b/pyarrow/include/arrow/testing/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2d6ca4d1561f0054f3d7ad4ae6b0549789d7491
--- /dev/null
+++ b/pyarrow/include/arrow/testing/util.h
@@ -0,0 +1,145 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/record_batch.h"
+#include "arrow/status.h"
+#include "arrow/testing/visibility.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+template <typename T>
+Status CopyBufferFromVector(const std::vector<T>& values, MemoryPool* pool,
+                            std::shared_ptr<Buffer>* result) {
+  int64_t nbytes = static_cast<int>(values.size()) * sizeof(T);
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool));
+  auto immutable_data = reinterpret_cast<const uint8_t*>(values.data());
+  std::copy(immutable_data, immutable_data + nbytes, buffer->mutable_data());
+  memset(buffer->mutable_data() + nbytes, 0,
+         static_cast<size_t>(buffer->capacity() - nbytes));
+
+  *result = std::move(buffer);
+  return Status::OK();
+}
+
+// Sets approximately pct_null of the first n bytes in null_bytes to zero
+// and the rest to non-zero (true) values.
+ARROW_TESTING_EXPORT void random_null_bytes(int64_t n, double pct_null,
+                                            uint8_t* null_bytes);
+ARROW_TESTING_EXPORT void random_is_valid(int64_t n, double pct_null,
+                                          std::vector<bool>* is_valid,
+                                          int random_seed = 0);
+ARROW_TESTING_EXPORT void random_bytes(int64_t n, uint32_t seed, uint8_t* out);
+ARROW_TESTING_EXPORT std::string random_string(int64_t n, uint32_t seed);
+ARROW_TESTING_EXPORT int32_t DecimalSize(int32_t precision);
+ARROW_TESTING_EXPORT void random_ascii(int64_t n, uint32_t seed, uint8_t* out);
+ARROW_TESTING_EXPORT void random_alnum(int64_t n, uint32_t seed, uint8_t* out);
+ARROW_TESTING_EXPORT int64_t CountNulls(const std::vector<uint8_t>& valid_bytes);
+
+ARROW_TESTING_EXPORT Status MakeRandomByteBuffer(int64_t length, MemoryPool* pool,
+                                                 std::shared_ptr<ResizableBuffer>* out,
+                                                 uint32_t seed = 0);
+
+ARROW_TESTING_EXPORT uint64_t random_seed();
+
+#define DECL_T() typedef typename TestFixture::T T;
+
+#define DECL_TYPE() typedef typename TestFixture::Type Type;
+
+// ----------------------------------------------------------------------
+// A RecordBatchReader for serving a sequence of in-memory record batches
+
+class BatchIterator : public RecordBatchReader {
+ public:
+  BatchIterator(const std::shared_ptr<Schema>& schema,
+                const std::vector<std::shared_ptr<RecordBatch>>& batches)
+      : schema_(schema), batches_(batches), position_(0) {}
+
+  std::shared_ptr<Schema> schema() const override { return schema_; }
+
+  Status ReadNext(std::shared_ptr<RecordBatch>* out) override {
+    if (position_ >= batches_.size()) {
+      *out = nullptr;
+    } else {
+      *out = batches_[position_++];
+    }
+    return Status::OK();
+  }
+
+ private:
+  std::shared_ptr<Schema> schema_;
+  std::vector<std::shared_ptr<RecordBatch>> batches_;
+  size_t position_;
+};
+
+static inline std::vector<std::shared_ptr<DataType> (*)(FieldVector, std::vector<int8_t>)>
+UnionTypeFactories() {
+  return {sparse_union, dense_union};
+}
+
+// Return the value of the ARROW_TEST_DATA environment variable or return error
+// Status
+ARROW_TESTING_EXPORT Status GetTestResourceRoot(std::string*);
+
+// Return the value of the ARROW_TIMEZONE_DATABASE environment variable
+ARROW_TESTING_EXPORT std::optional<std::string> GetTestTimezoneDatabaseRoot();
+
+// Set the Timezone database based on the ARROW_TIMEZONE_DATABASE env variable
+// This is only relevant on Windows, since other OSs have compatible databases built-in
+ARROW_TESTING_EXPORT Status InitTestTimezoneDatabase();
+
+// Get a TCP port number to listen on.  This is a different number every time,
+// as reusing the same port across tests can produce spurious bind errors on
+// Windows.
+ARROW_TESTING_EXPORT int GetListenPort();
+
+// Get a IPv4 "address:port" to listen on.  The address will be a loopback
+// address.  Compared to GetListenPort(), this will minimize the risk of
+// port conflicts.
+ARROW_TESTING_EXPORT std::string GetListenAddress();
+
+// Get a "host:port" to listen on. Compared to GetListenAddress(), this function would use
+// the host passed in.
+ARROW_TESTING_EXPORT std::string GetListenAddress(const std::string& host);
+
+ARROW_TESTING_EXPORT
+const std::vector<std::shared_ptr<DataType>>& all_dictionary_index_types();
+
+// Get a list of supported hardware flags from the given candidates.
+// The result will always contain 0, meaning no optional CPU feature enabled at all.
+ARROW_TESTING_EXPORT
+std::vector<int64_t> GetSupportedHardwareFlags(
+    const std::vector<int64_t>& candidate_flags);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/testing/visibility.h b/pyarrow/include/arrow/testing/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..b7fbcd42757bd674ea09821d6a6f9e626ffe26c4
--- /dev/null
+++ b/pyarrow/include/arrow/testing/visibility.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#  if defined(_MSC_VER)
+#    pragma warning(push)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_TESTING_STATIC
+#    define ARROW_TESTING_EXPORT
+#  elif defined(ARROW_TESTING_EXPORTING)
+#    define ARROW_TESTING_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_TESTING_EXPORT __declspec(dllimport)
+#  endif
+
+#  define ARROW_TESTING_NO_EXPORT
+#else  // Not Windows
+#  ifndef ARROW_TESTING_EXPORT
+#    define ARROW_TESTING_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef ARROW_TESTING_NO_EXPORT
+#    define ARROW_TESTING_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif  // Non-Windows
+
+#if defined(_MSC_VER)
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/arrow/type.h b/pyarrow/include/arrow/type.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3582056ead0160e2537e83e6a9c014f9b170092
--- /dev/null
+++ b/pyarrow/include/arrow/type.h
@@ -0,0 +1,2648 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <climits>
+#include <cstdint>
+#include <iosfwd>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"  // IWYU pragma: export
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+#include "arrow/visitor.h"  // IWYU pragma: keep
+
+namespace arrow {
+namespace detail {
+
+/// \defgroup numeric-datatypes Datatypes for numeric data
+/// @{
+/// @}
+
+/// \defgroup binary-datatypes Datatypes for binary/string data
+/// @{
+/// @}
+
+/// \defgroup temporal-datatypes Datatypes for temporal data
+/// @{
+/// @}
+
+/// \defgroup nested-datatypes Datatypes for nested data
+/// @{
+/// @}
+
+class ARROW_EXPORT Fingerprintable {
+ public:
+  virtual ~Fingerprintable();
+
+  const std::string& fingerprint() const {
+    auto p = fingerprint_.load();
+    if (ARROW_PREDICT_TRUE(p != NULLPTR)) {
+      return *p;
+    }
+    return LoadFingerprintSlow();
+  }
+
+  const std::string& metadata_fingerprint() const {
+    auto p = metadata_fingerprint_.load();
+    if (ARROW_PREDICT_TRUE(p != NULLPTR)) {
+      return *p;
+    }
+    return LoadMetadataFingerprintSlow();
+  }
+
+ protected:
+  const std::string& LoadFingerprintSlow() const;
+  const std::string& LoadMetadataFingerprintSlow() const;
+
+  virtual std::string ComputeFingerprint() const = 0;
+  virtual std::string ComputeMetadataFingerprint() const = 0;
+
+  mutable std::atomic<std::string*> fingerprint_{NULLPTR};
+  mutable std::atomic<std::string*> metadata_fingerprint_{NULLPTR};
+};
+
+}  // namespace detail
+
+/// EXPERIMENTAL: Layout specification for a data type
+struct ARROW_EXPORT DataTypeLayout {
+  enum BufferKind { FIXED_WIDTH, VARIABLE_WIDTH, BITMAP, ALWAYS_NULL };
+
+  /// Layout specification for a single data type buffer
+  struct BufferSpec {
+    BufferKind kind;
+    int64_t byte_width;  // For FIXED_WIDTH
+
+    bool operator==(const BufferSpec& other) const {
+      return kind == other.kind &&
+             (kind != FIXED_WIDTH || byte_width == other.byte_width);
+    }
+    bool operator!=(const BufferSpec& other) const { return !(*this == other); }
+  };
+
+  static BufferSpec FixedWidth(int64_t w) { return BufferSpec{FIXED_WIDTH, w}; }
+  static BufferSpec VariableWidth() { return BufferSpec{VARIABLE_WIDTH, -1}; }
+  static BufferSpec Bitmap() { return BufferSpec{BITMAP, -1}; }
+  static BufferSpec AlwaysNull() { return BufferSpec{ALWAYS_NULL, -1}; }
+
+  /// A vector of buffer layout specifications, one for each expected buffer
+  std::vector<BufferSpec> buffers;
+  /// Whether this type expects an associated dictionary array.
+  bool has_dictionary = false;
+  /// If this is provided, the number of buffers expected is only lower-bounded by
+  /// buffers.size(). Buffers beyond this lower bound are expected to conform to
+  /// variadic_spec.
+  std::optional<BufferSpec> variadic_spec;
+
+  explicit DataTypeLayout(std::vector<BufferSpec> buffers,
+                          std::optional<BufferSpec> variadic_spec = {})
+      : buffers(std::move(buffers)), variadic_spec(variadic_spec) {}
+};
+
+/// \brief Base class for all data types
+///
+/// Data types in this library are all *logical*. They can be expressed as
+/// either a primitive physical type (bytes or bits of some fixed size), a
+/// nested type consisting of other data types, or another data type (e.g. a
+/// timestamp encoded as an int64).
+///
+/// Simple datatypes may be entirely described by their Type::type id, but
+/// complex datatypes are usually parametric.
+class ARROW_EXPORT DataType : public std::enable_shared_from_this<DataType>,
+                              public detail::Fingerprintable,
+                              public util::EqualityComparable<DataType> {
+ public:
+  explicit DataType(Type::type id) : detail::Fingerprintable(), id_(id) {}
+  ~DataType() override;
+
+  /// \brief Return whether the types are equal
+  ///
+  /// Types that are logically convertible from one to another (e.g. List<UInt8>
+  /// and Binary) are NOT equal.
+  bool Equals(const DataType& other, bool check_metadata = false) const;
+
+  /// \brief Return whether the types are equal
+  bool Equals(const std::shared_ptr<DataType>& other, bool check_metadata = false) const;
+
+  /// \brief Return the child field at index i.
+  const std::shared_ptr<Field>& field(int i) const { return children_[i]; }
+
+  /// \brief Return the children fields associated with this type.
+  const FieldVector& fields() const { return children_; }
+
+  /// \brief Return the number of children fields associated with this type.
+  int num_fields() const { return static_cast<int>(children_.size()); }
+
+  /// \brief Apply the TypeVisitor::Visit() method specialized to the data type
+  Status Accept(TypeVisitor* visitor) const;
+
+  /// \brief A string representation of the type, including any children
+  virtual std::string ToString(bool show_metadata = false) const = 0;
+
+  /// \brief Return hash value (excluding metadata in child fields)
+  size_t Hash() const;
+
+  /// \brief A string name of the type, omitting any child fields
+  ///
+  /// \since 0.7.0
+  virtual std::string name() const = 0;
+
+  /// \brief Return the data type layout.  Children are not included.
+  ///
+  /// \note Experimental API
+  virtual DataTypeLayout layout() const = 0;
+
+  /// \brief Return the type category
+  constexpr Type::type id() const { return id_; }
+
+  /// \brief Return the type category of the storage type
+  virtual Type::type storage_id() const { return id_; }
+
+  /// \brief Returns the type's fixed byte width, if any. Returns -1
+  /// for non-fixed-width types, and should only be used for
+  /// subclasses of FixedWidthType
+  virtual int32_t byte_width() const {
+    int32_t num_bits = this->bit_width();
+    return num_bits > 0 ? num_bits / 8 : -1;
+  }
+
+  /// \brief Returns the type's fixed bit width, if any. Returns -1
+  /// for non-fixed-width types, and should only be used for
+  /// subclasses of FixedWidthType
+  virtual int bit_width() const { return -1; }
+
+  // \brief EXPERIMENTAL: Enable retrieving shared_ptr<DataType> from a const
+  // context.
+  std::shared_ptr<DataType> GetSharedPtr() const {
+    return const_cast<DataType*>(this)->shared_from_this();
+  }
+
+ protected:
+  // Dummy version that returns a null string (indicating not implemented).
+  // Subclasses should override for fast equality checks.
+  std::string ComputeFingerprint() const override;
+
+  // Generic versions that works for all regular types, nested or not.
+  std::string ComputeMetadataFingerprint() const override;
+
+  Type::type id_;
+  FieldVector children_;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(DataType);
+};
+
+/// \brief EXPERIMENTAL: Container for a type pointer which can hold a
+/// dynamically created shared_ptr<DataType> if it needs to.
+struct ARROW_EXPORT TypeHolder {
+  const DataType* type = NULLPTR;
+  std::shared_ptr<DataType> owned_type;
+
+  TypeHolder() = default;
+  TypeHolder(const TypeHolder& other) = default;
+  TypeHolder& operator=(const TypeHolder& other) = default;
+  TypeHolder(TypeHolder&& other) = default;
+  TypeHolder& operator=(TypeHolder&& other) = default;
+
+  TypeHolder(std::shared_ptr<DataType> owned_type)  // NOLINT implicit construction
+      : type(owned_type.get()), owned_type(std::move(owned_type)) {}
+
+  TypeHolder(const DataType* type)  // NOLINT implicit construction
+      : type(type) {}
+
+  Type::type id() const { return this->type->id(); }
+
+  std::shared_ptr<DataType> GetSharedPtr() const {
+    return this->type != NULLPTR ? this->type->GetSharedPtr() : NULLPTR;
+  }
+
+  const DataType& operator*() const { return *this->type; }
+
+  operator bool() const { return this->type != NULLPTR; }
+
+  bool operator==(const TypeHolder& other) const {
+    if (type == other.type) return true;
+    if (type == NULLPTR || other.type == NULLPTR) return false;
+    return type->Equals(*other.type);
+  }
+
+  bool operator==(decltype(NULLPTR)) const { return this->type == NULLPTR; }
+
+  bool operator==(const DataType& other) const {
+    if (this->type == NULLPTR) return false;
+    return other.Equals(*this->type);
+  }
+
+  bool operator!=(const DataType& other) const { return !(*this == other); }
+
+  bool operator==(const std::shared_ptr<DataType>& other) const {
+    return *this == *other;
+  }
+
+  bool operator!=(const TypeHolder& other) const { return !(*this == other); }
+
+  std::string ToString(bool show_metadata = false) const {
+    return this->type ? this->type->ToString(show_metadata) : "<NULLPTR>";
+  }
+
+  static std::string ToString(const std::vector<TypeHolder>&, bool show_metadata = false);
+
+  static std::vector<TypeHolder> FromTypes(
+      const std::vector<std::shared_ptr<DataType>>& types);
+};
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, const DataType& type);
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, const TypeHolder& type);
+
+/// \brief Return the compatible physical data type
+///
+/// Some types may have distinct logical meanings but the exact same physical
+/// representation.  For example, TimestampType has Int64Type as a physical
+/// type (defined as TimestampType::PhysicalType).
+///
+/// The return value is as follows:
+/// - if a `PhysicalType` alias exists in the concrete type class, return
+///   an instance of `PhysicalType`.
+/// - otherwise, return the input type itself.
+ARROW_EXPORT
+std::shared_ptr<DataType> GetPhysicalType(const std::shared_ptr<DataType>& type);
+
+/// \brief Base class for all fixed-width data types
+class ARROW_EXPORT FixedWidthType : public DataType {
+ public:
+  using DataType::DataType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~FixedWidthType() override;
+};
+
+/// \brief Base class for all data types representing primitive values
+class ARROW_EXPORT PrimitiveCType : public FixedWidthType {
+ public:
+  using FixedWidthType::FixedWidthType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~PrimitiveCType() override;
+};
+
+/// \brief Base class for all numeric data types
+class ARROW_EXPORT NumberType : public PrimitiveCType {
+ public:
+  using PrimitiveCType::PrimitiveCType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~NumberType() override;
+};
+
+/// \brief Base class for all integral data types
+class ARROW_EXPORT IntegerType : public NumberType {
+ public:
+  using NumberType::NumberType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~IntegerType() override;
+  virtual bool is_signed() const = 0;
+};
+
+/// \brief Base class for all floating-point data types
+class ARROW_EXPORT FloatingPointType : public NumberType {
+ public:
+  using NumberType::NumberType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~FloatingPointType() override;
+  enum Precision { HALF, SINGLE, DOUBLE };
+  virtual Precision precision() const = 0;
+};
+
+/// \brief Base class for all parametric data types
+class ParametricType {};
+
+class ARROW_EXPORT NestedType : public DataType, public ParametricType {
+ public:
+  using DataType::DataType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~NestedType() override;
+};
+
+/// \brief The combination of a field name and data type, with optional metadata
+///
+/// Fields are used to describe the individual constituents of a
+/// nested DataType or a Schema.
+///
+/// A field's metadata is represented by a KeyValueMetadata instance,
+/// which holds arbitrary key-value pairs.
+class ARROW_EXPORT Field : public detail::Fingerprintable,
+                           public util::EqualityComparable<Field> {
+ public:
+  Field(std::string name, std::shared_ptr<DataType> type, bool nullable = true,
+        std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR)
+      : detail::Fingerprintable(),
+        name_(std::move(name)),
+        type_(std::move(type)),
+        nullable_(nullable),
+        metadata_(std::move(metadata)) {}
+
+  ~Field() override;
+
+  /// \brief Return the field's attached metadata
+  std::shared_ptr<const KeyValueMetadata> metadata() const { return metadata_; }
+
+  /// \brief Return whether the field has non-empty metadata
+  bool HasMetadata() const;
+
+  /// \brief Return a copy of this field with the given metadata attached to it
+  std::shared_ptr<Field> WithMetadata(
+      const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+
+  /// \brief EXPERIMENTAL: Return a copy of this field with the given metadata
+  /// merged with existing metadata (any colliding keys will be overridden by
+  /// the passed metadata)
+  std::shared_ptr<Field> WithMergedMetadata(
+      const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+
+  /// \brief Return a copy of this field without any metadata attached to it
+  std::shared_ptr<Field> RemoveMetadata() const;
+
+  /// \brief Return a copy of this field with the replaced type.
+  std::shared_ptr<Field> WithType(const std::shared_ptr<DataType>& type) const;
+
+  /// \brief Return a copy of this field with the replaced name.
+  std::shared_ptr<Field> WithName(const std::string& name) const;
+
+  /// \brief Return a copy of this field with the replaced nullability.
+  std::shared_ptr<Field> WithNullable(bool nullable) const;
+
+  /// \brief Options that control the behavior of `MergeWith`.
+  /// Options are to be added to allow type conversions, including integer
+  /// widening, promotion from integer to float, or conversion to or from boolean.
+  struct ARROW_EXPORT MergeOptions : public util::ToStringOstreamable<MergeOptions> {
+    /// If true, a Field of NullType can be unified with a Field of another type.
+    /// The unified field will be of the other type and become nullable.
+    /// Nullability will be promoted to the looser option (nullable if one is not
+    /// nullable).
+    bool promote_nullability = true;
+
+    /// Allow a decimal to be unified with another decimal of the same
+    /// width, adjusting scale and precision as appropriate. May fail
+    /// if the adjustment is not possible.
+    bool promote_decimal = false;
+
+    /// Allow a decimal to be promoted to a float. The float type will
+    /// not itself be promoted (e.g. Decimal128 + Float32 = Float32).
+    bool promote_decimal_to_float = false;
+
+    /// Allow an integer to be promoted to a decimal.
+    ///
+    /// May fail if the decimal has insufficient precision to
+    /// accommodate the integer (see promote_numeric_width).
+    bool promote_integer_to_decimal = false;
+
+    /// Allow an integer of a given bit width to be promoted to a
+    /// float; the result will be a float of an equal or greater bit
+    /// width to both of the inputs. Examples:
+    ///  - int8 + float32 = float32
+    ///  - int32 + float32 = float64
+    ///  - int32 + float64 = float64
+    /// Because an int32 cannot always be represented exactly in the
+    /// 24 bits of a float32 mantissa.
+    bool promote_integer_to_float = false;
+
+    /// Allow an unsigned integer of a given bit width to be promoted
+    /// to a signed integer that fits into the signed type:
+    /// uint + int16 = int16
+    /// When widening is needed, set promote_numeric_width to true:
+    /// uint16 + int16 = int32
+    bool promote_integer_sign = false;
+
+    /// Allow an integer, float, or decimal of a given bit width to be
+    /// promoted to an equivalent type of a greater bit width.
+    bool promote_numeric_width = false;
+
+    /// Allow strings to be promoted to binary types. Promotion of fixed size
+    /// binary types to variable sized formats, and binary to large binary,
+    /// and string to large string.
+    bool promote_binary = false;
+
+    /// Second to millisecond, Time32 to Time64, Time32(SECOND) to Time32(MILLI), etc
+    bool promote_temporal_unit = false;
+
+    /// Allow promotion from a list to a large-list and from a fixed-size list to a
+    /// variable sized list
+    bool promote_list = false;
+
+    /// Unify dictionary index types and dictionary value types.
+    bool promote_dictionary = false;
+
+    /// Allow merging ordered and non-ordered dictionaries.
+    /// The result will be ordered if and only if both inputs
+    /// are ordered.
+    bool promote_dictionary_ordered = false;
+
+    /// Get default options. Only NullType will be merged with other types.
+    static MergeOptions Defaults() { return MergeOptions(); }
+    /// Get permissive options. All options are enabled, except
+    /// promote_dictionary_ordered.
+    static MergeOptions Permissive();
+    /// Get a human-readable representation of the options.
+    std::string ToString() const;
+  };
+
+  /// \brief Merge the current field with a field of the same name.
+  ///
+  /// The two fields must be compatible, i.e:
+  ///   - have the same name
+  ///   - have the same type, or of compatible types according to `options`.
+  ///
+  /// The metadata of the current field is preserved; the metadata of the other
+  /// field is discarded.
+  Result<std::shared_ptr<Field>> MergeWith(
+      const Field& other, MergeOptions options = MergeOptions::Defaults()) const;
+  Result<std::shared_ptr<Field>> MergeWith(
+      const std::shared_ptr<Field>& other,
+      MergeOptions options = MergeOptions::Defaults()) const;
+
+  FieldVector Flatten() const;
+
+  /// \brief Indicate if fields are equals.
+  ///
+  /// \param[in] other field to check equality with.
+  /// \param[in] check_metadata controls if it should check for metadata
+  ///            equality.
+  ///
+  /// \return true if fields are equal, false otherwise.
+  bool Equals(const Field& other, bool check_metadata = false) const;
+  bool Equals(const std::shared_ptr<Field>& other, bool check_metadata = false) const;
+
+  /// \brief Indicate if fields are compatibles.
+  ///
+  /// See the criteria of MergeWith.
+  ///
+  /// \return true if fields are compatible, false otherwise.
+  bool IsCompatibleWith(const Field& other) const;
+  bool IsCompatibleWith(const std::shared_ptr<Field>& other) const;
+
+  /// \brief Return a string representation ot the field
+  /// \param[in] show_metadata when true, if KeyValueMetadata is non-empty,
+  /// print keys and values in the output
+  std::string ToString(bool show_metadata = false) const;
+
+  /// \brief Return the field name
+  const std::string& name() const { return name_; }
+  /// \brief Return the field data type
+  const std::shared_ptr<DataType>& type() const { return type_; }
+  /// \brief Return whether the field is nullable
+  bool nullable() const { return nullable_; }
+
+  std::shared_ptr<Field> Copy() const;
+
+ private:
+  std::string ComputeFingerprint() const override;
+  std::string ComputeMetadataFingerprint() const override;
+
+  // Field name
+  std::string name_;
+
+  // The field's data type
+  std::shared_ptr<DataType> type_;
+
+  // Fields can be nullable
+  bool nullable_;
+
+  // The field's metadata, if any
+  std::shared_ptr<const KeyValueMetadata> metadata_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Field);
+};
+
+ARROW_EXPORT void PrintTo(const Field& field, std::ostream* os);
+
+namespace detail {
+
+template <typename DERIVED, typename BASE, Type::type TYPE_ID, typename C_TYPE>
+class CTypeImpl : public BASE {
+ public:
+  static constexpr Type::type type_id = TYPE_ID;
+  using c_type = C_TYPE;
+  using PhysicalType = DERIVED;
+
+  CTypeImpl() : BASE(TYPE_ID) {}
+
+  int bit_width() const override { return static_cast<int>(sizeof(C_TYPE) * CHAR_BIT); }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout(
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(C_TYPE))});
+  }
+
+  std::string name() const override { return DERIVED::type_name(); }
+
+  std::string ToString(bool show_metadata = false) const override { return this->name(); }
+};
+
+template <typename DERIVED, typename BASE, Type::type TYPE_ID, typename C_TYPE>
+constexpr Type::type CTypeImpl<DERIVED, BASE, TYPE_ID, C_TYPE>::type_id;
+
+template <typename DERIVED, Type::type TYPE_ID, typename C_TYPE>
+class IntegerTypeImpl : public detail::CTypeImpl<DERIVED, IntegerType, TYPE_ID, C_TYPE> {
+  bool is_signed() const override { return std::is_signed<C_TYPE>::value; }
+};
+
+}  // namespace detail
+
+/// Concrete type class for always-null data
+class ARROW_EXPORT NullType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::NA;
+
+  static constexpr const char* type_name() { return "null"; }
+
+  NullType() : DataType(Type::NA) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::AlwaysNull()});
+  }
+
+  std::string name() const override { return "null"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for boolean data
+class ARROW_EXPORT BooleanType
+    : public detail::CTypeImpl<BooleanType, PrimitiveCType, Type::BOOL, bool> {
+ public:
+  static constexpr const char* type_name() { return "bool"; }
+
+  // BooleanType within arrow use a single bit instead of the C 8-bits layout.
+  int bit_width() const final { return 1; }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::Bitmap()});
+  }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \addtogroup numeric-datatypes
+///
+/// @{
+
+/// Concrete type class for unsigned 8-bit integer data
+class ARROW_EXPORT UInt8Type
+    : public detail::IntegerTypeImpl<UInt8Type, Type::UINT8, uint8_t> {
+ public:
+  static constexpr const char* type_name() { return "uint8"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 8-bit integer data
+class ARROW_EXPORT Int8Type
+    : public detail::IntegerTypeImpl<Int8Type, Type::INT8, int8_t> {
+ public:
+  static constexpr const char* type_name() { return "int8"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 16-bit integer data
+class ARROW_EXPORT UInt16Type
+    : public detail::IntegerTypeImpl<UInt16Type, Type::UINT16, uint16_t> {
+ public:
+  static constexpr const char* type_name() { return "uint16"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 16-bit integer data
+class ARROW_EXPORT Int16Type
+    : public detail::IntegerTypeImpl<Int16Type, Type::INT16, int16_t> {
+ public:
+  static constexpr const char* type_name() { return "int16"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 32-bit integer data
+class ARROW_EXPORT UInt32Type
+    : public detail::IntegerTypeImpl<UInt32Type, Type::UINT32, uint32_t> {
+ public:
+  static constexpr const char* type_name() { return "uint32"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 32-bit integer data
+class ARROW_EXPORT Int32Type
+    : public detail::IntegerTypeImpl<Int32Type, Type::INT32, int32_t> {
+ public:
+  static constexpr const char* type_name() { return "int32"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for unsigned 64-bit integer data
+class ARROW_EXPORT UInt64Type
+    : public detail::IntegerTypeImpl<UInt64Type, Type::UINT64, uint64_t> {
+ public:
+  static constexpr const char* type_name() { return "uint64"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for signed 64-bit integer data
+class ARROW_EXPORT Int64Type
+    : public detail::IntegerTypeImpl<Int64Type, Type::INT64, int64_t> {
+ public:
+  static constexpr const char* type_name() { return "int64"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 16-bit floating-point data
+class ARROW_EXPORT HalfFloatType
+    : public detail::CTypeImpl<HalfFloatType, FloatingPointType, Type::HALF_FLOAT,
+                               uint16_t> {
+ public:
+  Precision precision() const override;
+  static constexpr const char* type_name() { return "halffloat"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 32-bit floating-point data (C "float")
+class ARROW_EXPORT FloatType
+    : public detail::CTypeImpl<FloatType, FloatingPointType, Type::FLOAT, float> {
+ public:
+  Precision precision() const override;
+  static constexpr const char* type_name() { return "float"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 64-bit floating-point data (C "double")
+class ARROW_EXPORT DoubleType
+    : public detail::CTypeImpl<DoubleType, FloatingPointType, Type::DOUBLE, double> {
+ public:
+  Precision precision() const override;
+  static constexpr const char* type_name() { return "double"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// @}
+
+/// \brief Base class for all variable-size binary data types
+class ARROW_EXPORT BaseBinaryType : public DataType {
+ public:
+  using DataType::DataType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~BaseBinaryType() override;
+};
+
+constexpr int64_t kBinaryMemoryLimit = std::numeric_limits<int32_t>::max() - 1;
+
+/// \addtogroup binary-datatypes
+///
+/// @{
+
+/// \brief Concrete type class for variable-size binary data
+class ARROW_EXPORT BinaryType : public BaseBinaryType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY;
+  static constexpr bool is_utf8 = false;
+  using offset_type = int32_t;
+  using PhysicalType = BinaryType;
+
+  static constexpr const char* type_name() { return "binary"; }
+
+  BinaryType() : BinaryType(Type::BINARY) {}
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap(),
+                           DataTypeLayout::FixedWidth(sizeof(offset_type)),
+                           DataTypeLayout::VariableWidth()});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "binary"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  // Allow subclasses like StringType to change the logical type.
+  explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
+};
+
+/// \brief Concrete type class for variable-size binary view data
+class ARROW_EXPORT BinaryViewType : public DataType {
+ public:
+  static constexpr Type::type type_id = Type::BINARY_VIEW;
+  static constexpr bool is_utf8 = false;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr int kSize = 16;
+  static constexpr int kInlineSize = 12;
+  static constexpr int kPrefixSize = 4;
+
+  /// Variable length string or binary with inline optimization for small values (12 bytes
+  /// or fewer). This is similar to std::string_view except limited in size to INT32_MAX
+  /// and at least the first four bytes of the string are copied inline (accessible
+  /// without pointer dereference). This inline prefix allows failing comparisons early.
+  /// Furthermore when dealing with short strings the CPU cache working set is reduced
+  /// since many can be inline.
+  ///
+  /// This union supports two states:
+  ///
+  /// - Entirely inlined string data
+  /// \code{.unparsed}
+  ///                |----|--------------|
+  ///                 ^    ^
+  ///                 |    |
+  ///              size    in-line string data, zero padded
+  /// \endcode
+  ///
+  /// - Reference into a buffer
+  /// \code{.unparsed}
+  ///                |----|----|----|----|
+  ///                 ^    ^    ^    ^
+  ///                 |    |    |    |
+  ///              size    |    |    `------.
+  ///                  prefix   |           |
+  ///                        buffer index   |
+  ///                                  offset in buffer
+  /// \endcode
+  ///
+  /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+  ///
+  /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+  ///
+  /// Alignment to 64 bits enables an aligned load of the size and prefix into
+  /// a single 64 bit integer, which is useful to the comparison fast path.
+  union alignas(int64_t) c_type {
+    struct {
+      int32_t size;
+      std::array<uint8_t, kInlineSize> data;
+    } inlined;
+
+    struct {
+      int32_t size;
+      std::array<uint8_t, kPrefixSize> prefix;
+      int32_t buffer_index;
+      int32_t offset;
+    } ref;
+
+    /// The number of bytes viewed.
+    int32_t size() const {
+      // Size is in the common initial subsequence of each member of the union,
+      // so accessing `inlined.size` is legal even if another member is active.
+      return inlined.size;
+    }
+
+    /// True if the view's data is entirely stored inline.
+    bool is_inline() const { return size() <= kInlineSize; }
+
+    /// Return a pointer to the inline data of a view.
+    ///
+    /// For inline views, this points to the entire data of the view.
+    /// For other views, this points to the 4 byte prefix.
+    const uint8_t* inline_data() const& {
+      // Since `ref.prefix` has the same address as `inlined.data`,
+      // the branch will be trivially optimized out.
+      return is_inline() ? inlined.data.data() : ref.prefix.data();
+    }
+    const uint8_t* inline_data() && = delete;
+  };
+  static_assert(sizeof(c_type) == kSize);
+  static_assert(std::is_trivial_v<c_type>);
+
+  static constexpr const char* type_name() { return "binary_view"; }
+
+  BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {}
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(kSize)},
+                          DataTypeLayout::VariableWidth());
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "binary_view"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  // Allow subclasses like StringType to change the logical type.
+  explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {}
+};
+
+/// \brief Concrete type class for large variable-size binary data
+class ARROW_EXPORT LargeBinaryType : public BaseBinaryType {
+ public:
+  static constexpr Type::type type_id = Type::LARGE_BINARY;
+  static constexpr bool is_utf8 = false;
+  using offset_type = int64_t;
+  using PhysicalType = LargeBinaryType;
+
+  static constexpr const char* type_name() { return "large_binary"; }
+
+  LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {}
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap(),
+                           DataTypeLayout::FixedWidth(sizeof(offset_type)),
+                           DataTypeLayout::VariableWidth()});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "large_binary"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  // Allow subclasses like LargeStringType to change the logical type.
+  explicit LargeBinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {}
+};
+
+/// \brief Concrete type class for variable-size string data, utf8-encoded
+class ARROW_EXPORT StringType : public BinaryType {
+ public:
+  static constexpr Type::type type_id = Type::STRING;
+  static constexpr bool is_utf8 = true;
+  using PhysicalType = BinaryType;
+
+  static constexpr const char* type_name() { return "utf8"; }
+
+  StringType() : BinaryType(Type::STRING) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "utf8"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for variable-size string data, utf8-encoded
+class ARROW_EXPORT StringViewType : public BinaryViewType {
+ public:
+  static constexpr Type::type type_id = Type::STRING_VIEW;
+  static constexpr bool is_utf8 = true;
+  using PhysicalType = BinaryViewType;
+
+  static constexpr const char* type_name() { return "utf8_view"; }
+
+  StringViewType() : BinaryViewType(Type::STRING_VIEW) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "utf8_view"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for large variable-size string data, utf8-encoded
+class ARROW_EXPORT LargeStringType : public LargeBinaryType {
+ public:
+  static constexpr Type::type type_id = Type::LARGE_STRING;
+  static constexpr bool is_utf8 = true;
+  using PhysicalType = LargeBinaryType;
+
+  static constexpr const char* type_name() { return "large_utf8"; }
+
+  LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "large_utf8"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for fixed-size binary data
+class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public ParametricType {
+ public:
+  static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY;
+  static constexpr bool is_utf8 = false;
+
+  static constexpr const char* type_name() { return "fixed_size_binary"; }
+
+  explicit FixedSizeBinaryType(int32_t byte_width)
+      : FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {}
+  explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id)
+      : FixedWidthType(override_type_id), byte_width_(byte_width) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "fixed_size_binary"; }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout(
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(byte_width())});
+  }
+
+  int byte_width() const override { return byte_width_; }
+
+  int bit_width() const override;
+
+  // Validating constructor
+  static Result<std::shared_ptr<DataType>> Make(int32_t byte_width);
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  int32_t byte_width_;
+};
+
+/// @}
+
+/// \addtogroup numeric-datatypes
+///
+/// @{
+
+/// \brief Base type class for (fixed-size) decimal data
+class ARROW_EXPORT DecimalType : public FixedSizeBinaryType {
+ public:
+  explicit DecimalType(Type::type type_id, int32_t byte_width, int32_t precision,
+                       int32_t scale)
+      : FixedSizeBinaryType(byte_width, type_id), precision_(precision), scale_(scale) {}
+
+  /// Constructs concrete decimal types
+  static Result<std::shared_ptr<DataType>> Make(Type::type type_id, int32_t precision,
+                                                int32_t scale);
+
+  int32_t precision() const { return precision_; }
+  int32_t scale() const { return scale_; }
+
+  /// \brief Returns the number of bytes needed for precision.
+  ///
+  /// precision must be >= 1
+  static int32_t DecimalSize(int32_t precision);
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  int32_t precision_;
+  int32_t scale_;
+};
+
+/// \brief Concrete type class for 32-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer.  The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// As an example, `Decimal32Type(7, 3)` can exactly represent the numbers
+/// 1234.567 and -1234.567 (encoded internally as the 32-bit integers
+/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
+///
+/// Decimal32Type has a maximum precision of 9 significant digits
+/// (also available as Decimal32Type::kMaxPrecision).
+/// If higher precision is needed, consider using Decimal64Type,
+/// Decimal128Type or Decimal256Type.
+class ARROW_EXPORT Decimal32Type : public DecimalType {
+ public:
+  static constexpr Type::type type_id = Type::DECIMAL32;
+
+  static constexpr const char* type_name() { return "decimal32"; }
+
+  /// Decimal32Type constructor that aborts on invalid input.
+  explicit Decimal32Type(int32_t precision, int32_t scale);
+
+  /// Decimal32Type constructor that returns an error on invalid input
+  static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "decimal32"; }
+
+  static constexpr int32_t kMinPrecision = 1;
+  static constexpr int32_t kMaxPrecision = 9;
+  static constexpr int32_t kByteWidth = 4;
+};
+
+/// \brief Concrete type class for 64-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer.  The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// As an example, `Decimal64Type(7, 3)` can exactly represent the numbers
+/// 1234.567 and -1234.567 (encoded internally as the 64-bit integers
+/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
+///
+/// Decimal64Type has a maximum precision of 18 significant digits
+/// (also available as Decimal64Type::kMaxPrecision).
+/// If higher precision is needed, consider using Decimal128Type or
+/// Decimal256Type.
+class ARROW_EXPORT Decimal64Type : public DecimalType {
+ public:
+  static constexpr Type::type type_id = Type::DECIMAL64;
+
+  static constexpr const char* type_name() { return "decimal64"; }
+
+  /// Decimal32Type constructor that aborts on invalid input.
+  explicit Decimal64Type(int32_t precision, int32_t scale);
+
+  /// Decimal32Type constructor that returns an error on invalid input
+  static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "decimal64"; }
+
+  static constexpr int32_t kMinPrecision = 1;
+  static constexpr int32_t kMaxPrecision = 18;
+  static constexpr int32_t kByteWidth = 8;
+};
+
+/// \brief Concrete type class for 128-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer.  The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// As an example, `Decimal128Type(7, 3)` can exactly represent the numbers
+/// 1234.567 and -1234.567 (encoded internally as the 128-bit integers
+/// 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567.
+///
+/// Decimal128Type has a maximum precision of 38 significant digits
+/// (also available as Decimal128Type::kMaxPrecision).
+/// If higher precision is needed, consider using Decimal256Type.
+class ARROW_EXPORT Decimal128Type : public DecimalType {
+ public:
+  static constexpr Type::type type_id = Type::DECIMAL128;
+
+  static constexpr const char* type_name() { return "decimal128"; }
+
+  /// Decimal128Type constructor that aborts on invalid input.
+  explicit Decimal128Type(int32_t precision, int32_t scale);
+
+  /// Decimal128Type constructor that returns an error on invalid input.
+  static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "decimal128"; }
+
+  static constexpr int32_t kMinPrecision = 1;
+  static constexpr int32_t kMaxPrecision = 38;
+  static constexpr int32_t kByteWidth = 16;
+};
+
+/// \brief Concrete type class for 256-bit decimal data
+///
+/// Arrow decimals are fixed-point decimal numbers encoded as a scaled
+/// integer.  The precision is the number of significant digits that the
+/// decimal type can represent; the scale is the number of digits after
+/// the decimal point (note the scale can be negative).
+///
+/// Decimal256Type has a maximum precision of 76 significant digits.
+/// (also available as Decimal256Type::kMaxPrecision).
+///
+/// For most use cases, the maximum precision offered by Decimal128Type
+/// is sufficient, and it will result in a more compact and more efficient
+/// encoding.
+class ARROW_EXPORT Decimal256Type : public DecimalType {
+ public:
+  static constexpr Type::type type_id = Type::DECIMAL256;
+
+  static constexpr const char* type_name() { return "decimal256"; }
+
+  /// Decimal256Type constructor that aborts on invalid input.
+  explicit Decimal256Type(int32_t precision, int32_t scale);
+
+  /// Decimal256Type constructor that returns an error on invalid input.
+  static Result<std::shared_ptr<DataType>> Make(int32_t precision, int32_t scale);
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "decimal256"; }
+
+  static constexpr int32_t kMinPrecision = 1;
+  static constexpr int32_t kMaxPrecision = 76;
+  static constexpr int32_t kByteWidth = 32;
+};
+
+/// @}
+
+/// \addtogroup nested-datatypes
+///
+/// @{
+
+/// \brief Base class for all variable-size list data types
+class ARROW_EXPORT BaseListType : public NestedType {
+ public:
+  using NestedType::NestedType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~BaseListType() override;
+  const std::shared_ptr<Field>& value_field() const { return children_[0]; }
+
+  const std::shared_ptr<DataType>& value_type() const { return children_[0]->type(); }
+};
+
+/// \brief Concrete type class for list data
+///
+/// List data is nested data where each value is a variable number of
+/// child items.  Lists can be recursively nested, for example
+/// list(list(int32)).
+class ARROW_EXPORT ListType : public BaseListType {
+ public:
+  static constexpr Type::type type_id = Type::LIST;
+  using offset_type = int32_t;
+
+  static constexpr const char* type_name() { return "list"; }
+
+  // List can contain any other logical value type
+  explicit ListType(std::shared_ptr<DataType> value_type)
+      : ListType(std::make_shared<Field>("item", std::move(value_type))) {}
+
+  explicit ListType(std::shared_ptr<Field> value_field) : BaseListType(type_id) {
+    children_ = {std::move(value_field)};
+  }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout(
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(offset_type))});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "list"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for large list data
+///
+/// LargeListType is like ListType but with 64-bit rather than 32-bit offsets.
+class ARROW_EXPORT LargeListType : public BaseListType {
+ public:
+  static constexpr Type::type type_id = Type::LARGE_LIST;
+  using offset_type = int64_t;
+
+  static constexpr const char* type_name() { return "large_list"; }
+
+  // List can contain any other logical value type
+  explicit LargeListType(std::shared_ptr<DataType> value_type)
+      : LargeListType(std::make_shared<Field>("item", std::move(value_type))) {}
+
+  explicit LargeListType(std::shared_ptr<Field> value_field) : BaseListType(type_id) {
+    children_ = {std::move(value_field)};
+  }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout(
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(sizeof(offset_type))});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "large_list"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Type class for array of list views
+class ARROW_EXPORT ListViewType : public BaseListType {
+ public:
+  static constexpr Type::type type_id = Type::LIST_VIEW;
+  using offset_type = int32_t;
+
+  static constexpr const char* type_name() { return "list_view"; }
+
+  // ListView can contain any other logical value type
+  explicit ListViewType(const std::shared_ptr<DataType>& value_type)
+      : ListViewType(std::make_shared<Field>("item", value_type)) {}
+
+  explicit ListViewType(const std::shared_ptr<Field>& value_field)
+      : BaseListType(type_id) {
+    children_ = {value_field};
+  }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap(),
+                           DataTypeLayout::FixedWidth(sizeof(offset_type)),
+                           DataTypeLayout::FixedWidth(sizeof(offset_type))});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "list_view"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for large list-view data
+///
+/// LargeListViewType is like ListViewType but with 64-bit rather than 32-bit offsets and
+/// sizes.
+class ARROW_EXPORT LargeListViewType : public BaseListType {
+ public:
+  static constexpr Type::type type_id = Type::LARGE_LIST_VIEW;
+  using offset_type = int64_t;
+
+  static constexpr const char* type_name() { return "large_list_view"; }
+
+  // LargeListView can contain any other logical value type
+  explicit LargeListViewType(const std::shared_ptr<DataType>& value_type)
+      : LargeListViewType(std::make_shared<Field>("item", value_type)) {}
+
+  explicit LargeListViewType(const std::shared_ptr<Field>& value_field)
+      : BaseListType(type_id) {
+    children_ = {value_field};
+  }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap(),
+                           DataTypeLayout::FixedWidth(sizeof(offset_type)),
+                           DataTypeLayout::FixedWidth(sizeof(offset_type))});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "large_list_view"; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Concrete type class for map data
+///
+/// Map data is nested data where each value is a variable number of
+/// key-item pairs.  Its physical representation is the same as
+/// a list of `{key, item}` structs.
+///
+/// Maps can be recursively nested, for example map(utf8, map(utf8, int32)).
+class ARROW_EXPORT MapType : public ListType {
+ public:
+  static constexpr Type::type type_id = Type::MAP;
+
+  static constexpr const char* type_name() { return "map"; }
+
+  MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<DataType> item_type,
+          bool keys_sorted = false);
+
+  MapType(std::shared_ptr<DataType> key_type, std::shared_ptr<Field> item_field,
+          bool keys_sorted = false);
+
+  MapType(std::shared_ptr<Field> key_field, std::shared_ptr<Field> item_field,
+          bool keys_sorted = false);
+
+  explicit MapType(std::shared_ptr<Field> value_field, bool keys_sorted = false);
+
+  // Validating constructor
+  static Result<std::shared_ptr<DataType>> Make(std::shared_ptr<Field> value_field,
+                                                bool keys_sorted = false);
+
+  std::shared_ptr<Field> key_field() const { return value_type()->field(0); }
+  std::shared_ptr<DataType> key_type() const { return key_field()->type(); }
+
+  std::shared_ptr<Field> item_field() const { return value_type()->field(1); }
+  std::shared_ptr<DataType> item_type() const { return item_field()->type(); }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "map"; }
+
+  bool keys_sorted() const { return keys_sorted_; }
+
+ private:
+  std::string ComputeFingerprint() const override;
+
+  bool keys_sorted_;
+};
+
+/// \brief Concrete type class for fixed size list data
+class ARROW_EXPORT FixedSizeListType : public BaseListType {
+ public:
+  static constexpr Type::type type_id = Type::FIXED_SIZE_LIST;
+  // While the individual item size is 32-bit, the overall data size
+  // (item size * list length) may not fit in a 32-bit int.
+  using offset_type = int64_t;
+
+  static constexpr const char* type_name() { return "fixed_size_list"; }
+
+  // List can contain any other logical value type
+  FixedSizeListType(std::shared_ptr<DataType> value_type, int32_t list_size)
+      : FixedSizeListType(std::make_shared<Field>("item", std::move(value_type)),
+                          list_size) {}
+
+  FixedSizeListType(std::shared_ptr<Field> value_field, int32_t list_size)
+      : BaseListType(type_id), list_size_(list_size) {
+    children_ = {std::move(value_field)};
+  }
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap()});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "fixed_size_list"; }
+
+  int32_t list_size() const { return list_size_; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+  int32_t list_size_;
+};
+
+/// \brief Concrete type class for struct data
+class ARROW_EXPORT StructType : public NestedType {
+ public:
+  static constexpr Type::type type_id = Type::STRUCT;
+
+  static constexpr const char* type_name() { return "struct"; }
+
+  explicit StructType(const FieldVector& fields);
+
+  ~StructType() override;
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout({DataTypeLayout::Bitmap()});
+  }
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "struct"; }
+
+  /// Returns null if name not found
+  std::shared_ptr<Field> GetFieldByName(const std::string& name) const;
+
+  /// Return all fields having this name
+  FieldVector GetAllFieldsByName(const std::string& name) const;
+
+  /// Returns -1 if name not found or if there are multiple fields having the
+  /// same name
+  int GetFieldIndex(const std::string& name) const;
+
+  /// \brief Return the indices of all fields having this name in sorted order
+  std::vector<int> GetAllFieldIndices(const std::string& name) const;
+
+  /// \brief Create a new StructType with field added at given index
+  Result<std::shared_ptr<StructType>> AddField(int i,
+                                               const std::shared_ptr<Field>& field) const;
+  /// \brief Create a new StructType by removing the field at given index
+  Result<std::shared_ptr<StructType>> RemoveField(int i) const;
+  /// \brief Create a new StructType by changing the field at given index
+  Result<std::shared_ptr<StructType>> SetField(int i,
+                                               const std::shared_ptr<Field>& field) const;
+
+ private:
+  std::string ComputeFingerprint() const override;
+
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+/// \brief Base type class for union data
+class ARROW_EXPORT UnionType : public NestedType {
+ public:
+  static constexpr int8_t kMaxTypeCode = 127;
+  static constexpr int kInvalidChildId = -1;
+
+  static Result<std::shared_ptr<DataType>> Make(
+      const FieldVector& fields, const std::vector<int8_t>& type_codes,
+      UnionMode::type mode = UnionMode::SPARSE) {
+    if (mode == UnionMode::SPARSE) {
+      return sparse_union(fields, type_codes);
+    } else {
+      return dense_union(fields, type_codes);
+    }
+  }
+
+  DataTypeLayout layout() const override;
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  /// The array of logical type ids.
+  ///
+  /// For example, the first type in the union might be denoted by the id 5
+  /// (instead of 0).
+  const std::vector<int8_t>& type_codes() const { return type_codes_; }
+
+  /// An array mapping logical type ids to physical child ids.
+  const std::vector<int>& child_ids() const { return child_ids_; }
+
+  uint8_t max_type_code() const;
+
+  UnionMode::type mode() const;
+
+ protected:
+  UnionType(FieldVector fields, std::vector<int8_t> type_codes, Type::type id);
+
+  static Status ValidateParameters(const FieldVector& fields,
+                                   const std::vector<int8_t>& type_codes,
+                                   UnionMode::type mode);
+
+ private:
+  std::string ComputeFingerprint() const override;
+
+  std::vector<int8_t> type_codes_;
+  std::vector<int> child_ids_;
+};
+
+/// \brief Concrete type class for sparse union data
+///
+/// A sparse union is a nested type where each logical value is taken from
+/// a single child.  A buffer of 8-bit type ids indicates which child
+/// a given logical value is to be taken from.
+///
+/// In a sparse union, each child array should have the same length as the
+/// union array, regardless of the actual number of union values that
+/// refer to it.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+class ARROW_EXPORT SparseUnionType : public UnionType {
+ public:
+  static constexpr Type::type type_id = Type::SPARSE_UNION;
+
+  static constexpr const char* type_name() { return "sparse_union"; }
+
+  SparseUnionType(FieldVector fields, std::vector<int8_t> type_codes);
+
+  // A constructor variant that validates input parameters
+  static Result<std::shared_ptr<DataType>> Make(FieldVector fields,
+                                                std::vector<int8_t> type_codes);
+
+  std::string name() const override { return "sparse_union"; }
+};
+
+/// \brief Concrete type class for dense union data
+///
+/// A dense union is a nested type where each logical value is taken from
+/// a single child, at a specific offset.  A buffer of 8-bit type ids
+/// indicates which child a given logical value is to be taken from,
+/// and a buffer of 32-bit offsets indicates at which physical position
+/// in the given child array the logical value is to be taken from.
+///
+/// Unlike a sparse union, a dense union allows encoding only the child array
+/// values which are actually referred to by the union array.  This is
+/// counterbalanced by the additional footprint of the offsets buffer, and
+/// the additional indirection cost when looking up values.
+///
+/// Note that, unlike most other types, unions don't have a top-level validity bitmap.
+class ARROW_EXPORT DenseUnionType : public UnionType {
+ public:
+  static constexpr Type::type type_id = Type::DENSE_UNION;
+
+  static constexpr const char* type_name() { return "dense_union"; }
+
+  DenseUnionType(FieldVector fields, std::vector<int8_t> type_codes);
+
+  // A constructor variant that validates input parameters
+  static Result<std::shared_ptr<DataType>> Make(FieldVector fields,
+                                                std::vector<int8_t> type_codes);
+
+  std::string name() const override { return "dense_union"; }
+};
+
+/// \brief Type class for run-end encoded data
+class ARROW_EXPORT RunEndEncodedType : public NestedType {
+ public:
+  static constexpr Type::type type_id = Type::RUN_END_ENCODED;
+
+  static constexpr const char* type_name() { return "run_end_encoded"; }
+
+  explicit RunEndEncodedType(std::shared_ptr<DataType> run_end_type,
+                             std::shared_ptr<DataType> value_type);
+  ~RunEndEncodedType() override;
+
+  DataTypeLayout layout() const override {
+    // A lot of existing code expects at least one buffer
+    return DataTypeLayout({DataTypeLayout::AlwaysNull()});
+  }
+
+  const std::shared_ptr<DataType>& run_end_type() const { return fields()[0]->type(); }
+  const std::shared_ptr<DataType>& value_type() const { return fields()[1]->type(); }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "run_end_encoded"; }
+
+  static bool RunEndTypeValid(const DataType& run_end_type);
+
+ private:
+  std::string ComputeFingerprint() const override;
+};
+
+/// @}
+
+// ----------------------------------------------------------------------
+// Date and time types
+
+/// \addtogroup temporal-datatypes
+///
+/// @{
+
+/// \brief Base type for all date and time types
+class ARROW_EXPORT TemporalType : public FixedWidthType {
+ public:
+  using FixedWidthType::FixedWidthType;
+  // This is only for preventing defining this class in each
+  // translation unit to avoid one-definition-rule violation.
+  ~TemporalType() override;
+
+  DataTypeLayout layout() const override {
+    return DataTypeLayout(
+        {DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(bit_width() / 8)});
+  }
+};
+
+/// \brief Base type class for date data
+class ARROW_EXPORT DateType : public TemporalType {
+ public:
+  virtual DateUnit unit() const = 0;
+
+ protected:
+  explicit DateType(Type::type type_id);
+};
+
+/// Concrete type class for 32-bit date data (as number of days since UNIX epoch)
+class ARROW_EXPORT Date32Type : public DateType {
+ public:
+  static constexpr Type::type type_id = Type::DATE32;
+  static constexpr DateUnit UNIT = DateUnit::DAY;
+  using c_type = int32_t;
+  using PhysicalType = Int32Type;
+
+  static constexpr const char* type_name() { return "date32"; }
+
+  Date32Type();
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "date32"; }
+  DateUnit unit() const override { return UNIT; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+/// Concrete type class for 64-bit date data (as number of milliseconds since UNIX epoch)
+class ARROW_EXPORT Date64Type : public DateType {
+ public:
+  static constexpr Type::type type_id = Type::DATE64;
+  static constexpr DateUnit UNIT = DateUnit::MILLI;
+  using c_type = int64_t;
+  using PhysicalType = Int64Type;
+
+  static constexpr const char* type_name() { return "date64"; }
+
+  Date64Type();
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "date64"; }
+  DateUnit unit() const override { return UNIT; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+};
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, TimeUnit::type unit);
+
+/// Base type class for time data
+class ARROW_EXPORT TimeType : public TemporalType, public ParametricType {
+ public:
+  TimeUnit::type unit() const { return unit_; }
+
+ protected:
+  TimeType(Type::type type_id, TimeUnit::type unit);
+  std::string ComputeFingerprint() const override;
+
+  TimeUnit::type unit_;
+};
+
+/// Concrete type class for 32-bit time data (as number of seconds or milliseconds
+/// since midnight)
+class ARROW_EXPORT Time32Type : public TimeType {
+ public:
+  static constexpr Type::type type_id = Type::TIME32;
+  using c_type = int32_t;
+  using PhysicalType = Int32Type;
+
+  static constexpr const char* type_name() { return "time32"; }
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI);
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "time32"; }
+};
+
+/// Concrete type class for 64-bit time data (as number of microseconds or nanoseconds
+/// since midnight)
+class ARROW_EXPORT Time64Type : public TimeType {
+ public:
+  static constexpr Type::type type_id = Type::TIME64;
+  using c_type = int64_t;
+  using PhysicalType = Int64Type;
+
+  static constexpr const char* type_name() { return "time64"; }
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  explicit Time64Type(TimeUnit::type unit = TimeUnit::NANO);
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  std::string name() const override { return "time64"; }
+};
+
+/// \brief Concrete type class for datetime data (as number of seconds, milliseconds,
+/// microseconds or nanoseconds since UNIX epoch)
+///
+/// If supplied, the timezone string should take either the form (i) "Area/Location",
+/// with values drawn from the names in the IANA Time Zone Database (such as
+/// "Europe/Zurich"); or (ii) "(+|-)HH:MM" indicating an absolute offset from GMT
+/// (such as "-08:00").  To indicate a native UTC timestamp, one of the strings "UTC",
+/// "Etc/UTC" or "+00:00" should be used.
+///
+/// If any non-empty string is supplied as the timezone for a TimestampType, then the
+/// Arrow field containing that timestamp type (and by extension the column associated
+/// with such a field) is considered "timezone-aware".  The integer arrays that comprise
+/// a timezone-aware column must contain UTC normalized datetime values, regardless of
+/// the contents of their timezone string.  More precisely, (i) the producer of a
+/// timezone-aware column must populate its constituent arrays with valid UTC values
+/// (performing offset conversions from non-UTC values if necessary); and (ii) the
+/// consumer of a timezone-aware column may assume that the column's values are directly
+/// comparable (that is, with no offset adjustment required) to the values of any other
+/// timezone-aware column or to any other valid UTC datetime value (provided all values
+/// are expressed in the same units).
+///
+/// If a TimestampType is constructed without a timezone (or, equivalently, if the
+/// timezone supplied is an empty string) then the resulting Arrow field (column) is
+/// considered "timezone-naive".  The producer of a timezone-naive column may populate
+/// its constituent integer arrays with datetime values from any timezone; the consumer
+/// of a timezone-naive column should make no assumptions about the interoperability or
+/// comparability of the values of such a column with those of any other timestamp
+/// column or datetime value.
+///
+/// If a timezone-aware field contains a recognized timezone, its values may be
+/// localized to that locale upon display; the values of timezone-naive fields must
+/// always be displayed "as is", with no localization performed on them.
+class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType {
+ public:
+  using Unit = TimeUnit;
+
+  static constexpr Type::type type_id = Type::TIMESTAMP;
+  using c_type = int64_t;
+  using PhysicalType = Int64Type;
+
+  static constexpr const char* type_name() { return "timestamp"; }
+
+  int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); }
+
+  explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI)
+      : TemporalType(Type::TIMESTAMP), unit_(unit) {}
+
+  explicit TimestampType(TimeUnit::type unit, const std::string& timezone)
+      : TemporalType(Type::TIMESTAMP), unit_(unit), timezone_(timezone) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "timestamp"; }
+
+  TimeUnit::type unit() const { return unit_; }
+  const std::string& timezone() const { return timezone_; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+ private:
+  TimeUnit::type unit_;
+  std::string timezone_;
+};
+
+// Base class for the different kinds of calendar intervals.
+class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType {
+ public:
+  enum type { MONTHS, DAY_TIME, MONTH_DAY_NANO };
+
+  virtual type interval_type() const = 0;
+
+ protected:
+  explicit IntervalType(Type::type subtype) : TemporalType(subtype) {}
+  std::string ComputeFingerprint() const override;
+};
+
+/// \brief Represents a number of months.
+///
+/// Type representing a number of months.  Corresponds to YearMonth type
+/// in Schema.fbs (years are defined as 12 months).
+class ARROW_EXPORT MonthIntervalType : public IntervalType {
+ public:
+  static constexpr Type::type type_id = Type::INTERVAL_MONTHS;
+  using c_type = int32_t;
+  using PhysicalType = Int32Type;
+
+  static constexpr const char* type_name() { return "month_interval"; }
+
+  IntervalType::type interval_type() const override { return IntervalType::MONTHS; }
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  MonthIntervalType() : IntervalType(type_id) {}
+
+  std::string ToString(bool ARROW_ARG_UNUSED(show_metadata) = false) const override {
+    return name();
+  }
+  std::string name() const override { return "month_interval"; }
+};
+
+/// \brief Represents a number of days and milliseconds (fraction of day).
+class ARROW_EXPORT DayTimeIntervalType : public IntervalType {
+ public:
+  struct DayMilliseconds {
+    int32_t days = 0;
+    int32_t milliseconds = 0;
+    constexpr DayMilliseconds() = default;
+    constexpr DayMilliseconds(int32_t days, int32_t milliseconds)
+        : days(days), milliseconds(milliseconds) {}
+    bool operator==(DayMilliseconds other) const {
+      return this->days == other.days && this->milliseconds == other.milliseconds;
+    }
+    bool operator!=(DayMilliseconds other) const { return !(*this == other); }
+    bool operator<(DayMilliseconds other) const {
+      return this->days < other.days || this->milliseconds < other.milliseconds;
+    }
+  };
+  using c_type = DayMilliseconds;
+  using PhysicalType = DayTimeIntervalType;
+
+  static_assert(sizeof(DayMilliseconds) == 8,
+                "DayMilliseconds struct assumed to be of size 8 bytes");
+  static constexpr Type::type type_id = Type::INTERVAL_DAY_TIME;
+
+  static constexpr const char* type_name() { return "day_time_interval"; }
+
+  IntervalType::type interval_type() const override { return IntervalType::DAY_TIME; }
+
+  DayTimeIntervalType() : IntervalType(type_id) {}
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  std::string ToString(bool ARROW_ARG_UNUSED(show_metadata) = false) const override {
+    return name();
+  }
+  std::string name() const override { return "day_time_interval"; }
+};
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, DayTimeIntervalType::DayMilliseconds interval);
+
+/// \brief Represents a number of months, days and nanoseconds between
+/// two dates.
+///
+/// All fields are independent from one another.
+class ARROW_EXPORT MonthDayNanoIntervalType : public IntervalType {
+ public:
+  struct MonthDayNanos {
+    int32_t months;
+    int32_t days;
+    int64_t nanoseconds;
+    bool operator==(MonthDayNanos other) const {
+      return this->months == other.months && this->days == other.days &&
+             this->nanoseconds == other.nanoseconds;
+    }
+    bool operator!=(MonthDayNanos other) const { return !(*this == other); }
+  };
+  using c_type = MonthDayNanos;
+  using PhysicalType = MonthDayNanoIntervalType;
+
+  static_assert(sizeof(MonthDayNanos) == 16,
+                "MonthDayNanos struct assumed to be of size 16 bytes");
+  static constexpr Type::type type_id = Type::INTERVAL_MONTH_DAY_NANO;
+
+  static constexpr const char* type_name() { return "month_day_nano_interval"; }
+
+  IntervalType::type interval_type() const override {
+    return IntervalType::MONTH_DAY_NANO;
+  }
+
+  MonthDayNanoIntervalType() : IntervalType(type_id) {}
+
+  int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
+
+  std::string ToString(bool ARROW_ARG_UNUSED(show_metadata) = false) const override {
+    return name();
+  }
+  std::string name() const override { return "month_day_nano_interval"; }
+};
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os,
+                         MonthDayNanoIntervalType::MonthDayNanos interval);
+
+/// \brief Represents an elapsed time without any relation to a calendar artifact.
+class ARROW_EXPORT DurationType : public TemporalType, public ParametricType {
+ public:
+  using Unit = TimeUnit;
+
+  static constexpr Type::type type_id = Type::DURATION;
+  using c_type = int64_t;
+  using PhysicalType = Int64Type;
+
+  static constexpr const char* type_name() { return "duration"; }
+
+  int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); }
+
+  explicit DurationType(TimeUnit::type unit = TimeUnit::MILLI)
+      : TemporalType(Type::DURATION), unit_(unit) {}
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "duration"; }
+
+  TimeUnit::type unit() const { return unit_; }
+
+ protected:
+  std::string ComputeFingerprint() const override;
+
+ private:
+  TimeUnit::type unit_;
+};
+
+/// @}
+
+// ----------------------------------------------------------------------
+// Dictionary type (for representing categorical or dictionary-encoded
+// in memory)
+
+/// \brief Dictionary-encoded value type with data-dependent
+/// dictionary. Indices are represented by any integer types.
+class ARROW_EXPORT DictionaryType : public FixedWidthType {
+ public:
+  static constexpr Type::type type_id = Type::DICTIONARY;
+
+  static constexpr const char* type_name() { return "dictionary"; }
+
+  DictionaryType(const std::shared_ptr<DataType>& index_type,
+                 const std::shared_ptr<DataType>& value_type, bool ordered = false);
+
+  // A constructor variant that validates its input parameters
+  static Result<std::shared_ptr<DataType>> Make(
+      const std::shared_ptr<DataType>& index_type,
+      const std::shared_ptr<DataType>& value_type, bool ordered = false);
+
+  std::string ToString(bool show_metadata = false) const override;
+  std::string name() const override { return "dictionary"; }
+
+  int bit_width() const override;
+
+  DataTypeLayout layout() const override;
+
+  const std::shared_ptr<DataType>& index_type() const { return index_type_; }
+  const std::shared_ptr<DataType>& value_type() const { return value_type_; }
+
+  bool ordered() const { return ordered_; }
+
+ protected:
+  static Status ValidateParameters(const DataType& index_type,
+                                   const DataType& value_type);
+
+  std::string ComputeFingerprint() const override;
+
+  // Must be an integer type (not currently checked)
+  std::shared_ptr<DataType> index_type_;
+  std::shared_ptr<DataType> value_type_;
+  bool ordered_;
+};
+
+// ----------------------------------------------------------------------
+// FieldRef
+
+/// \class FieldPath
+///
+/// Represents a path to a nested field using indices of child fields.
+/// For example, given indices {5, 9, 3} the field would be retrieved with
+/// schema->field(5)->type()->field(9)->type()->field(3)
+///
+/// Attempting to retrieve a child field using a FieldPath which is not valid for
+/// a given schema will raise an error. Invalid FieldPaths include:
+/// - an index is out of range
+/// - the path is empty (note: a default constructed FieldPath will be empty)
+///
+/// FieldPaths provide a number of accessors for drilling down to potentially nested
+/// children. They are overloaded for convenience to support Schema (returns a field),
+/// DataType (returns a child field), Field (returns a child field of this field's type)
+/// Array (returns a child array), RecordBatch (returns a column).
+class ARROW_EXPORT FieldPath {
+ public:
+  FieldPath() = default;
+
+  FieldPath(std::vector<int> indices)  // NOLINT runtime/explicit
+      : indices_(std::move(indices)) {}
+
+  FieldPath(std::initializer_list<int> indices)  // NOLINT runtime/explicit
+      : indices_(std::move(indices)) {}
+
+  std::string ToString() const;
+
+  size_t hash() const;
+  struct Hash {
+    size_t operator()(const FieldPath& path) const { return path.hash(); }
+  };
+
+  bool empty() const { return indices_.empty(); }
+  bool operator==(const FieldPath& other) const { return indices() == other.indices(); }
+  bool operator!=(const FieldPath& other) const { return indices() != other.indices(); }
+
+  const std::vector<int>& indices() const { return indices_; }
+  int operator[](size_t i) const { return indices_[i]; }
+  std::vector<int>::const_iterator begin() const { return indices_.begin(); }
+  std::vector<int>::const_iterator end() const { return indices_.end(); }
+
+  /// \brief Retrieve the referenced child Field from a Schema, Field, or DataType
+  Result<std::shared_ptr<Field>> Get(const Schema& schema) const;
+  Result<std::shared_ptr<Field>> Get(const Field& field) const;
+  Result<std::shared_ptr<Field>> Get(const DataType& type) const;
+  Result<std::shared_ptr<Field>> Get(const FieldVector& fields) const;
+
+  static Result<std::shared_ptr<Schema>> GetAll(const Schema& schema,
+                                                const std::vector<FieldPath>& paths);
+
+  /// \brief Retrieve the referenced column from a RecordBatch or Table
+  Result<std::shared_ptr<Array>> Get(const RecordBatch& batch) const;
+  Result<std::shared_ptr<ChunkedArray>> Get(const Table& table) const;
+
+  /// \brief Retrieve the referenced child from an Array or ArrayData
+  Result<std::shared_ptr<Array>> Get(const Array& array) const;
+  Result<std::shared_ptr<ArrayData>> Get(const ArrayData& data) const;
+
+  /// \brief Retrieve the referenced child from a ChunkedArray
+  Result<std::shared_ptr<ChunkedArray>> Get(const ChunkedArray& chunked_array) const;
+
+  /// \brief Retrieve the referenced child/column from an Array, ArrayData, ChunkedArray,
+  /// RecordBatch, or Table
+  ///
+  /// Unlike `FieldPath::Get`, these variants are not zero-copy and the retrieved child's
+  /// null bitmap is ANDed with its ancestors'
+  Result<std::shared_ptr<Array>> GetFlattened(const Array& array,
+                                              MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<ArrayData>> GetFlattened(const ArrayData& data,
+                                                  MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<ChunkedArray>> GetFlattened(const ChunkedArray& chunked_array,
+                                                     MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<Array>> GetFlattened(const RecordBatch& batch,
+                                              MemoryPool* pool = NULLPTR) const;
+  Result<std::shared_ptr<ChunkedArray>> GetFlattened(const Table& table,
+                                                     MemoryPool* pool = NULLPTR) const;
+
+ private:
+  std::vector<int> indices_;
+};
+
+/// \class FieldRef
+/// \brief Descriptor of a (potentially nested) field within a schema.
+///
+/// Unlike FieldPath (which exclusively uses indices of child fields), FieldRef may
+/// reference a field by name. It is intended to replace parameters like `int field_index`
+/// and `const std::string& field_name`; it can be implicitly constructed from either a
+/// field index or a name.
+///
+/// Nested fields can be referenced as well. Given
+///     schema({field("a", struct_({field("n", null())})), field("b", int32())})
+///
+/// the following all indicate the nested field named "n":
+///     FieldRef ref1(0, 0);
+///     FieldRef ref2("a", 0);
+///     FieldRef ref3("a", "n");
+///     FieldRef ref4(0, "n");
+///     ARROW_ASSIGN_OR_RAISE(FieldRef ref5,
+///                           FieldRef::FromDotPath(".a[0]"));
+///
+/// FieldPaths matching a FieldRef are retrieved using the member function FindAll.
+/// Multiple matches are possible because field names may be duplicated within a schema.
+/// For example:
+///     Schema a_is_ambiguous({field("a", int32()), field("a", float32())});
+///     auto matches = FieldRef("a").FindAll(a_is_ambiguous);
+///     assert(matches.size() == 2);
+///     assert(matches[0].Get(a_is_ambiguous)->Equals(a_is_ambiguous.field(0)));
+///     assert(matches[1].Get(a_is_ambiguous)->Equals(a_is_ambiguous.field(1)));
+///
+/// Convenience accessors are available which raise a helpful error if the field is not
+/// found or ambiguous, and for immediately calling FieldPath::Get to retrieve any
+/// matching children:
+///     auto maybe_match = FieldRef("struct", "field_i32").FindOneOrNone(schema);
+///     auto maybe_column = FieldRef("struct", "field_i32").GetOne(some_table);
+class ARROW_EXPORT FieldRef : public util::EqualityComparable<FieldRef> {
+ public:
+  FieldRef() = default;
+
+  /// Construct a FieldRef using a string of indices. The reference will be retrieved as:
+  /// schema.fields[self.indices[0]].type.fields[self.indices[1]] ...
+  ///
+  /// Empty indices are not valid.
+  FieldRef(FieldPath indices);  // NOLINT runtime/explicit
+
+  /// Construct a by-name FieldRef. Multiple fields may match a by-name FieldRef:
+  /// [f for f in schema.fields where f.name == self.name]
+  FieldRef(std::string name) : impl_(std::move(name)) {}    // NOLINT runtime/explicit
+  FieldRef(const char* name) : impl_(std::string(name)) {}  // NOLINT runtime/explicit
+
+  /// Equivalent to a single index string of indices.
+  FieldRef(int index) : impl_(FieldPath({index})) {}  // NOLINT runtime/explicit
+
+  /// Construct a nested FieldRef.
+  explicit FieldRef(std::vector<FieldRef> refs) { Flatten(std::move(refs)); }
+
+  /// Convenience constructor for nested FieldRefs: each argument will be used to
+  /// construct a FieldRef
+  template <typename A0, typename A1, typename... A>
+  FieldRef(A0&& a0, A1&& a1, A&&... a) {
+    Flatten({// cpplint thinks the following are constructor decls
+             FieldRef(std::forward<A0>(a0)),     // NOLINT runtime/explicit
+             FieldRef(std::forward<A1>(a1)),     // NOLINT runtime/explicit
+             FieldRef(std::forward<A>(a))...});  // NOLINT runtime/explicit
+  }
+
+  /// Parse a dot path into a FieldRef.
+  ///
+  /// dot_path = '.' name
+  ///          | '[' digit+ ']'
+  ///          | dot_path+
+  ///
+  /// Examples:
+  ///   ".alpha" => FieldRef("alpha")
+  ///   "[2]" => FieldRef(2)
+  ///   ".beta[3]" => FieldRef("beta", 3)
+  ///   "[5].gamma.delta[7]" => FieldRef(5, "gamma", "delta", 7)
+  ///   ".hello world" => FieldRef("hello world")
+  ///   R"(.\[y\]\\tho\.\)" => FieldRef(R"([y]\tho.\)")
+  ///
+  /// Note: When parsing a name, a '\' preceding any other character will be dropped from
+  /// the resulting name. Therefore if a name must contain the characters '.', '\', or '['
+  /// those must be escaped with a preceding '\'.
+  static Result<FieldRef> FromDotPath(const std::string& dot_path);
+  std::string ToDotPath() const;
+
+  bool Equals(const FieldRef& other) const { return impl_ == other.impl_; }
+
+  std::string ToString() const;
+
+  size_t hash() const;
+  struct Hash {
+    size_t operator()(const FieldRef& ref) const { return ref.hash(); }
+  };
+
+  explicit operator bool() const { return Equals(FieldPath{}); }
+  bool operator!() const { return !Equals(FieldPath{}); }
+
+  bool IsFieldPath() const { return std::holds_alternative<FieldPath>(impl_); }
+  bool IsName() const { return std::holds_alternative<std::string>(impl_); }
+  bool IsNested() const {
+    if (IsName()) return false;
+    if (IsFieldPath()) return std::get<FieldPath>(impl_).indices().size() > 1;
+    return true;
+  }
+
+  /// \brief Return true if this ref is a name or a nested sequence of only names
+  ///
+  /// Useful for determining if iteration is possible without recursion or inner loops
+  bool IsNameSequence() const {
+    if (IsName()) return true;
+    if (const auto* nested = nested_refs()) {
+      for (const auto& ref : *nested) {
+        if (!ref.IsName()) return false;
+      }
+      return !nested->empty();
+    }
+    return false;
+  }
+
+  const FieldPath* field_path() const {
+    return IsFieldPath() ? &std::get<FieldPath>(impl_) : NULLPTR;
+  }
+  const std::string* name() const {
+    return IsName() ? &std::get<std::string>(impl_) : NULLPTR;
+  }
+  const std::vector<FieldRef>* nested_refs() const {
+    return std::holds_alternative<std::vector<FieldRef>>(impl_)
+               ? &std::get<std::vector<FieldRef>>(impl_)
+               : NULLPTR;
+  }
+
+  /// \brief Retrieve FieldPath of every child field which matches this FieldRef.
+  std::vector<FieldPath> FindAll(const Schema& schema) const;
+  std::vector<FieldPath> FindAll(const Field& field) const;
+  std::vector<FieldPath> FindAll(const DataType& type) const;
+  std::vector<FieldPath> FindAll(const FieldVector& fields) const;
+
+  /// \brief Convenience function which applies FindAll to arg's type or schema.
+  std::vector<FieldPath> FindAll(const ArrayData& array) const;
+  std::vector<FieldPath> FindAll(const Array& array) const;
+  std::vector<FieldPath> FindAll(const ChunkedArray& chunked_array) const;
+  std::vector<FieldPath> FindAll(const RecordBatch& batch) const;
+  std::vector<FieldPath> FindAll(const Table& table) const;
+
+  /// \brief Convenience function: raise an error if matches is empty.
+  template <typename T>
+  Status CheckNonEmpty(const std::vector<FieldPath>& matches, const T& root) const {
+    if (matches.empty()) {
+      return Status::Invalid("No match for ", ToString(), " in ", root.ToString());
+    }
+    return Status::OK();
+  }
+
+  /// \brief Convenience function: raise an error if matches contains multiple FieldPaths.
+  template <typename T>
+  Status CheckNonMultiple(const std::vector<FieldPath>& matches, const T& root) const {
+    if (matches.size() > 1) {
+      return Status::Invalid("Multiple matches for ", ToString(), " in ",
+                             root.ToString());
+    }
+    return Status::OK();
+  }
+
+  /// \brief Retrieve FieldPath of a single child field which matches this
+  /// FieldRef. Emit an error if none or multiple match.
+  template <typename T>
+  Result<FieldPath> FindOne(const T& root) const {
+    auto matches = FindAll(root);
+    ARROW_RETURN_NOT_OK(CheckNonEmpty(matches, root));
+    ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root));
+    return std::move(matches[0]);
+  }
+
+  /// \brief Retrieve FieldPath of a single child field which matches this
+  /// FieldRef. Emit an error if multiple match. An empty (invalid) FieldPath
+  /// will be returned if none match.
+  template <typename T>
+  Result<FieldPath> FindOneOrNone(const T& root) const {
+    auto matches = FindAll(root);
+    ARROW_RETURN_NOT_OK(CheckNonMultiple(matches, root));
+    if (matches.empty()) {
+      return FieldPath();
+    }
+    return std::move(matches[0]);
+  }
+
+  template <typename T>
+  using GetType = decltype(std::declval<FieldPath>().Get(std::declval<T>()).ValueOrDie());
+
+  /// \brief Get all children matching this FieldRef.
+  template <typename T>
+  std::vector<GetType<T>> GetAll(const T& root) const {
+    std::vector<GetType<T>> out;
+    for (const auto& match : FindAll(root)) {
+      out.push_back(match.Get(root).ValueOrDie());
+    }
+    return out;
+  }
+  /// \brief Get all children matching this FieldRef.
+  ///
+  /// Unlike `FieldRef::GetAll`, this variant is not zero-copy and the retrieved
+  /// children's null bitmaps are ANDed with their ancestors'
+  template <typename T>
+  Result<std::vector<GetType<T>>> GetAllFlattened(const T& root,
+                                                  MemoryPool* pool = NULLPTR) const {
+    std::vector<GetType<T>> out;
+    for (const auto& match : FindAll(root)) {
+      ARROW_ASSIGN_OR_RAISE(auto child, match.GetFlattened(root, pool));
+      out.push_back(std::move(child));
+    }
+    return out;
+  }
+
+  /// \brief Get the single child matching this FieldRef.
+  /// Emit an error if none or multiple match.
+  template <typename T>
+  Result<GetType<T>> GetOne(const T& root) const {
+    ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root));
+    return match.Get(root).ValueOrDie();
+  }
+  /// \brief Get the single child matching this FieldRef.
+  ///
+  /// Unlike `FieldRef::GetOne`, this variant is not zero-copy and the retrieved
+  /// child's null bitmap is ANDed with its ancestors'
+  template <typename T>
+  Result<GetType<T>> GetOneFlattened(const T& root, MemoryPool* pool = NULLPTR) const {
+    ARROW_ASSIGN_OR_RAISE(auto match, FindOne(root));
+    return match.GetFlattened(root, pool);
+  }
+
+  /// \brief Get the single child matching this FieldRef.
+  /// Return nullptr if none match, emit an error if multiple match.
+  template <typename T>
+  Result<GetType<T>> GetOneOrNone(const T& root) const {
+    ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root));
+    if (match.empty()) {
+      return static_cast<GetType<T>>(NULLPTR);
+    }
+    return match.Get(root).ValueOrDie();
+  }
+  /// \brief Get the single child matching this FieldRef.
+  ///
+  /// Return nullptr if none match, emit an error if multiple match.
+  /// Unlike `FieldRef::GetOneOrNone`, this variant is not zero-copy and the
+  /// retrieved child's null bitmap is ANDed with its ancestors'
+  template <typename T>
+  Result<GetType<T>> GetOneOrNoneFlattened(const T& root,
+                                           MemoryPool* pool = NULLPTR) const {
+    ARROW_ASSIGN_OR_RAISE(auto match, FindOneOrNone(root));
+    if (match.empty()) {
+      return static_cast<GetType<T>>(NULLPTR);
+    }
+    return match.GetFlattened(root, pool);
+  }
+
+ private:
+  void Flatten(std::vector<FieldRef> children);
+
+  std::variant<FieldPath, std::string, std::vector<FieldRef>> impl_;
+};
+
+ARROW_EXPORT void PrintTo(const FieldRef& ref, std::ostream* os);
+
+ARROW_EXPORT
+std::ostream& operator<<(std::ostream& os, const FieldRef&);
+
+// ----------------------------------------------------------------------
+// Schema
+
+enum class Endianness {
+  Little = 0,
+  Big = 1,
+#if ARROW_LITTLE_ENDIAN
+  Native = Little
+#else
+  Native = Big
+#endif
+};
+
+/// \class Schema
+/// \brief Sequence of arrow::Field objects describing the columns of a record
+/// batch or table data structure
+class ARROW_EXPORT Schema : public detail::Fingerprintable,
+                            public util::EqualityComparable<Schema>,
+                            public util::ToStringOstreamable<Schema> {
+ public:
+  explicit Schema(FieldVector fields, Endianness endianness,
+                  std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+  explicit Schema(FieldVector fields,
+                  std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+  Schema(const Schema&);
+
+  ~Schema() override;
+
+  /// Returns true if all of the schema fields are equal
+  bool Equals(const Schema& other, bool check_metadata = false) const;
+  bool Equals(const std::shared_ptr<Schema>& other, bool check_metadata = false) const;
+
+  /// \brief Set endianness in the schema
+  ///
+  /// \return new Schema
+  std::shared_ptr<Schema> WithEndianness(Endianness endianness) const;
+
+  /// \brief Return endianness in the schema
+  Endianness endianness() const;
+
+  /// \brief Indicate if endianness is equal to platform-native endianness
+  bool is_native_endian() const;
+
+  /// \brief Return the number of fields (columns) in the schema
+  int num_fields() const;
+
+  /// Return the ith schema element. Does not boundscheck
+  const std::shared_ptr<Field>& field(int i) const;
+
+  const FieldVector& fields() const;
+
+  std::vector<std::string> field_names() const;
+
+  /// Returns null if name not found
+  std::shared_ptr<Field> GetFieldByName(std::string_view name) const;
+
+  /// \brief Return the indices of all fields having this name in sorted order
+  FieldVector GetAllFieldsByName(std::string_view name) const;
+
+  /// Returns -1 if name not found
+  int GetFieldIndex(std::string_view name) const;
+
+  /// Return the indices of all fields having this name
+  std::vector<int> GetAllFieldIndices(std::string_view name) const;
+
+  /// Indicate if field named `name` can be found unambiguously in the schema.
+  Status CanReferenceFieldByName(std::string_view name) const;
+
+  /// Indicate if fields named `names` can be found unambiguously in the schema.
+  Status CanReferenceFieldsByNames(const std::vector<std::string>& names) const;
+
+  /// \brief The custom key-value metadata, if any
+  ///
+  /// \return metadata may be null
+  const std::shared_ptr<const KeyValueMetadata>& metadata() const;
+
+  /// \brief Render a string representation of the schema suitable for debugging
+  /// \param[in] show_metadata when true, if KeyValueMetadata is non-empty,
+  /// print keys and values in the output
+  std::string ToString(bool show_metadata = false) const;
+
+  Result<std::shared_ptr<Schema>> AddField(int i,
+                                           const std::shared_ptr<Field>& field) const;
+  Result<std::shared_ptr<Schema>> RemoveField(int i) const;
+  Result<std::shared_ptr<Schema>> SetField(int i,
+                                           const std::shared_ptr<Field>& field) const;
+
+  /// \brief Replace field names with new names
+  ///
+  /// \param[in] names new names
+  /// \return new Schema
+  Result<std::shared_ptr<Schema>> WithNames(const std::vector<std::string>& names) const;
+
+  /// \brief Replace key-value metadata with new metadata
+  ///
+  /// \param[in] metadata new KeyValueMetadata
+  /// \return new Schema
+  std::shared_ptr<Schema> WithMetadata(
+      const std::shared_ptr<const KeyValueMetadata>& metadata) const;
+
+  /// \brief Return copy of Schema without the KeyValueMetadata
+  std::shared_ptr<Schema> RemoveMetadata() const;
+
+  /// \brief Indicate that the Schema has non-empty KevValueMetadata
+  bool HasMetadata() const;
+
+  /// \brief Indicate that the Schema has distinct field names.
+  bool HasDistinctFieldNames() const;
+
+ protected:
+  std::string ComputeFingerprint() const override;
+  std::string ComputeMetadataFingerprint() const override;
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+ARROW_EXPORT void PrintTo(const Schema& s, std::ostream* os);
+
+ARROW_EXPORT
+std::string EndiannessToString(Endianness endianness);
+
+// ----------------------------------------------------------------------
+
+/// \brief Convenience class to incrementally construct/merge schemas.
+///
+/// This class amortizes the cost of validating field name conflicts by
+/// maintaining the mapping. The caller also controls the conflict resolution
+/// scheme.
+class ARROW_EXPORT SchemaBuilder {
+ public:
+  // Indicate how field conflict(s) should be resolved when building a schema. A
+  // conflict arise when a field is added to the builder and one or more field(s)
+  // with the same name already exists.
+  enum ConflictPolicy {
+    // Ignore the conflict and append the field. This is the default behavior of the
+    // Schema constructor and the `arrow::schema` factory function.
+    CONFLICT_APPEND = 0,
+    // Keep the existing field and ignore the newer one.
+    CONFLICT_IGNORE,
+    // Replace the existing field with the newer one.
+    CONFLICT_REPLACE,
+    // Merge the fields. The merging behavior can be controlled by `Field::MergeOptions`
+    // specified at construction time. Also see documentation of `Field::MergeWith`.
+    CONFLICT_MERGE,
+    // Refuse the new field and error out.
+    CONFLICT_ERROR
+  };
+
+  /// \brief Construct an empty SchemaBuilder
+  /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+  SchemaBuilder(
+      ConflictPolicy conflict_policy = CONFLICT_APPEND,
+      Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+  /// \brief Construct a SchemaBuilder from a list of fields
+  /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+  SchemaBuilder(
+      FieldVector fields, ConflictPolicy conflict_policy = CONFLICT_APPEND,
+      Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+  /// \brief Construct a SchemaBuilder from a schema, preserving the metadata
+  /// `field_merge_options` is only effective when `conflict_policy` == `CONFLICT_MERGE`.
+  SchemaBuilder(
+      const std::shared_ptr<Schema>& schema,
+      ConflictPolicy conflict_policy = CONFLICT_APPEND,
+      Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+
+  /// \brief Return the conflict resolution method.
+  ConflictPolicy policy() const;
+
+  /// \brief Set the conflict resolution method.
+  void SetPolicy(ConflictPolicy resolution);
+
+  /// \brief Add a field to the constructed schema.
+  ///
+  /// \param[in] field to add to the constructed Schema.
+  /// \return A failure if encountered.
+  Status AddField(const std::shared_ptr<Field>& field);
+
+  /// \brief Add multiple fields to the constructed schema.
+  ///
+  /// \param[in] fields to add to the constructed Schema.
+  /// \return The first failure encountered, if any.
+  Status AddFields(const FieldVector& fields);
+
+  /// \brief Add fields of a Schema to the constructed Schema.
+  ///
+  /// \param[in] schema to take fields to add to the constructed Schema.
+  /// \return The first failure encountered, if any.
+  Status AddSchema(const std::shared_ptr<Schema>& schema);
+
+  /// \brief Add fields of multiple Schemas to the constructed Schema.
+  ///
+  /// \param[in] schemas to take fields to add to the constructed Schema.
+  /// \return The first failure encountered, if any.
+  Status AddSchemas(const std::vector<std::shared_ptr<Schema>>& schemas);
+
+  Status AddMetadata(const KeyValueMetadata& metadata);
+
+  /// \brief Return the constructed Schema.
+  ///
+  /// The builder internal state is not affected by invoking this method, i.e.
+  /// a single builder can yield multiple incrementally constructed schemas.
+  ///
+  /// \return the constructed schema.
+  Result<std::shared_ptr<Schema>> Finish() const;
+
+  /// \brief Merge schemas in a unified schema according to policy.
+  static Result<std::shared_ptr<Schema>> Merge(
+      const std::vector<std::shared_ptr<Schema>>& schemas,
+      ConflictPolicy policy = CONFLICT_MERGE);
+
+  /// \brief Indicate if schemas are compatible to merge according to policy.
+  static Status AreCompatible(const std::vector<std::shared_ptr<Schema>>& schemas,
+                              ConflictPolicy policy = CONFLICT_MERGE);
+
+  /// \brief Reset internal state with an empty schema (and metadata).
+  void Reset();
+
+  ~SchemaBuilder();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+
+  Status AppendField(const std::shared_ptr<Field>& field);
+};
+
+/// \brief Unifies schemas by merging fields by name.
+///
+/// The behavior of field merging can be controlled via `Field::MergeOptions`.
+///
+/// The resulting schema will contain the union of fields from all schemas.
+/// Fields with the same name will be merged. See `Field::MergeOptions`.
+/// - They are expected to be mergeable under provided `field_merge_options`.
+/// - The unified field will inherit the metadata from the schema where
+///   that field is first defined.
+/// - The first N fields in the schema will be ordered the same as the
+///   N fields in the first schema.
+/// The resulting schema will inherit its metadata from the first input schema.
+/// Returns an error if:
+/// - Any input schema contains fields with duplicate names.
+/// - Fields of the same name are not mergeable.
+ARROW_EXPORT
+Result<std::shared_ptr<Schema>> UnifySchemas(
+    const std::vector<std::shared_ptr<Schema>>& schemas,
+    Field::MergeOptions field_merge_options = Field::MergeOptions::Defaults());
+
+namespace internal {
+
+constexpr bool may_have_validity_bitmap(Type::type id) {
+  switch (id) {
+    case Type::NA:
+    case Type::DENSE_UNION:
+    case Type::SPARSE_UNION:
+    case Type::RUN_END_ENCODED:
+      return false;
+    default:
+      return true;
+  }
+}
+
+constexpr bool has_variadic_buffers(Type::type id) {
+  switch (id) {
+    case Type::BINARY_VIEW:
+    case Type::STRING_VIEW:
+      return true;
+    default:
+      return false;
+  }
+}
+
+ARROW_DEPRECATED("Deprecated in 17.0.0. Use may_have_validity_bitmap() instead.")
+constexpr bool HasValidityBitmap(Type::type id) { return may_have_validity_bitmap(id); }
+
+ARROW_EXPORT
+std::string ToString(Type::type id);
+
+ARROW_EXPORT
+std::string ToTypeName(Type::type id);
+
+ARROW_EXPORT
+std::string ToString(TimeUnit::type unit);
+
+}  // namespace internal
+
+// Helpers to get instances of data types based on general categories
+
+/// \brief Signed integer types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& SignedIntTypes();
+/// \brief Unsigned integer types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& UnsignedIntTypes();
+/// \brief Signed and unsigned integer types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& IntTypes();
+/// \brief Floating point types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& FloatingPointTypes();
+/// \brief Number types without boolean - integer and floating point types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& NumericTypes();
+/// \brief Binary and string-like types (except fixed-size binary)
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& BaseBinaryTypes();
+/// \brief Binary and large-binary types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& BinaryTypes();
+/// \brief String and large-string types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& StringTypes();
+/// \brief String-view and Binary-view
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& BinaryViewTypes();
+/// \brief Temporal types including date, time and timestamps for each unit
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& TemporalTypes();
+/// \brief Interval types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& IntervalTypes();
+/// \brief Duration types for each unit
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& DurationTypes();
+/// \brief Numeric, base binary, date, boolean and null types
+ARROW_EXPORT
+const std::vector<std::shared_ptr<DataType>>& PrimitiveTypes();
+
+/// \brief Decimal type ids
+ARROW_EXPORT
+const std::vector<Type::type>& DecimalTypeIds();
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/type_fwd.h b/pyarrow/include/arrow/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..be26c40dc1f435aa0f05b01782d6d7f169b0be02
--- /dev/null
+++ b/pyarrow/include/arrow/type_fwd.h
@@ -0,0 +1,792 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+template <typename T>
+class Iterator;
+template <typename T>
+struct IterationTraits;
+
+template <typename T>
+class Result;
+
+class Status;
+
+namespace internal {
+struct Empty;
+}  // namespace internal
+template <typename T = internal::Empty>
+class Future;
+
+namespace util {
+class Codec;
+class CodecOptions;
+class Float16;
+}  // namespace util
+
+class Buffer;
+class Device;
+class MemoryManager;
+class MemoryPool;
+class MutableBuffer;
+class ResizableBuffer;
+
+using BufferVector = std::vector<std::shared_ptr<Buffer>>;
+
+class DataType;
+class Field;
+class FieldRef;
+class KeyValueMetadata;
+enum class Endianness;
+class Schema;
+
+using DataTypeVector = std::vector<std::shared_ptr<DataType>>;
+using FieldVector = std::vector<std::shared_ptr<Field>>;
+
+class Array;
+struct ArrayData;
+struct ArraySpan;
+class ArrayBuilder;
+struct Scalar;
+
+using ArrayDataVector = std::vector<std::shared_ptr<ArrayData>>;
+using ArrayVector = std::vector<std::shared_ptr<Array>>;
+using ScalarVector = std::vector<std::shared_ptr<Scalar>>;
+
+class ChunkedArray;
+class RecordBatch;
+struct RecordBatchWithMetadata;
+class RecordBatchReader;
+class AsyncRecordBatchReader;
+class Table;
+
+struct Datum;
+struct TypeHolder;
+
+using ChunkedArrayVector = std::vector<std::shared_ptr<ChunkedArray>>;
+using RecordBatchVector = std::vector<std::shared_ptr<RecordBatch>>;
+using RecordBatchIterator = Iterator<std::shared_ptr<RecordBatch>>;
+
+class DictionaryType;
+class DictionaryArray;
+struct DictionaryScalar;
+
+class NullType;
+class NullArray;
+class NullBuilder;
+struct NullScalar;
+
+class FixedWidthType;
+
+class BooleanType;
+class BooleanArray;
+class BooleanBuilder;
+struct BooleanScalar;
+
+class BinaryType;
+class BinaryArray;
+class BinaryBuilder;
+struct BinaryScalar;
+
+class BinaryViewType;
+class BinaryViewArray;
+class BinaryViewBuilder;
+struct BinaryViewScalar;
+
+class LargeBinaryType;
+class LargeBinaryArray;
+class LargeBinaryBuilder;
+struct LargeBinaryScalar;
+
+class FixedSizeBinaryType;
+class FixedSizeBinaryArray;
+class FixedSizeBinaryBuilder;
+struct FixedSizeBinaryScalar;
+
+class StringType;
+class StringArray;
+class StringBuilder;
+struct StringScalar;
+
+class StringViewType;
+class StringViewArray;
+class StringViewBuilder;
+struct StringViewScalar;
+
+class LargeStringType;
+class LargeStringArray;
+class LargeStringBuilder;
+struct LargeStringScalar;
+
+class ListType;
+class ListArray;
+class ListBuilder;
+struct ListScalar;
+
+class LargeListType;
+class LargeListArray;
+class LargeListBuilder;
+struct LargeListScalar;
+
+class ListViewType;
+class ListViewArray;
+class ListViewBuilder;
+struct ListViewScalar;
+
+class LargeListViewType;
+class LargeListViewArray;
+class LargeListViewBuilder;
+struct LargeListViewScalar;
+
+class MapType;
+class MapArray;
+class MapBuilder;
+struct MapScalar;
+
+class FixedSizeListType;
+class FixedSizeListArray;
+class FixedSizeListBuilder;
+struct FixedSizeListScalar;
+
+class StructType;
+class StructArray;
+class StructBuilder;
+struct StructScalar;
+
+class Decimal32;
+class Decimal64;
+class Decimal128;
+class Decimal256;
+class DecimalType;
+class Decimal32Type;
+class Decimal64Type;
+class Decimal128Type;
+class Decimal256Type;
+class Decimal32Array;
+class Decimal64Array;
+class Decimal128Array;
+class Decimal256Array;
+class Decimal32Builder;
+class Decimal64Builder;
+class Decimal128Builder;
+class Decimal256Builder;
+struct Decimal32Scalar;
+struct Decimal64Scalar;
+struct Decimal128Scalar;
+struct Decimal256Scalar;
+
+struct UnionMode {
+  enum type { SPARSE, DENSE };
+};
+
+class SparseUnionType;
+class SparseUnionArray;
+class SparseUnionBuilder;
+struct SparseUnionScalar;
+
+class DenseUnionType;
+class DenseUnionArray;
+class DenseUnionBuilder;
+struct DenseUnionScalar;
+
+class RunEndEncodedType;
+class RunEndEncodedArray;
+class RunEndEncodedBuilder;
+struct RunEndEncodedScalar;
+
+template <typename TypeClass>
+class NumericArray;
+
+template <typename TypeClass>
+class NumericBuilder;
+
+template <typename TypeClass>
+class NumericTensor;
+
+#define _NUMERIC_TYPE_DECL(KLASS)                     \
+  class KLASS##Type;                                  \
+  using KLASS##Array = NumericArray<KLASS##Type>;     \
+  using KLASS##Builder = NumericBuilder<KLASS##Type>; \
+  struct KLASS##Scalar;                               \
+  using KLASS##Tensor = NumericTensor<KLASS##Type>;
+
+_NUMERIC_TYPE_DECL(Int8)
+_NUMERIC_TYPE_DECL(Int16)
+_NUMERIC_TYPE_DECL(Int32)
+_NUMERIC_TYPE_DECL(Int64)
+_NUMERIC_TYPE_DECL(UInt8)
+_NUMERIC_TYPE_DECL(UInt16)
+_NUMERIC_TYPE_DECL(UInt32)
+_NUMERIC_TYPE_DECL(UInt64)
+_NUMERIC_TYPE_DECL(Float)
+_NUMERIC_TYPE_DECL(Double)
+
+#undef _NUMERIC_TYPE_DECL
+
+class HalfFloatType;
+using HalfFloatArray = NumericArray<HalfFloatType>;
+class HalfFloatBuilder;
+struct HalfFloatScalar;
+using HalfFloatTensor = NumericTensor<HalfFloatType>;
+
+enum class DateUnit : char { DAY = 0, MILLI = 1 };
+
+class DateType;
+class Date32Type;
+using Date32Array = NumericArray<Date32Type>;
+using Date32Builder = NumericBuilder<Date32Type>;
+struct Date32Scalar;
+
+class Date64Type;
+using Date64Array = NumericArray<Date64Type>;
+using Date64Builder = NumericBuilder<Date64Type>;
+struct Date64Scalar;
+
+struct ARROW_EXPORT TimeUnit {
+  /// The unit for a time or timestamp DataType
+  enum type { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
+
+  /// Iterate over all valid time units
+  static const std::vector<TimeUnit::type>& values();
+};
+
+class TimeType;
+class Time32Type;
+using Time32Array = NumericArray<Time32Type>;
+using Time32Builder = NumericBuilder<Time32Type>;
+struct Time32Scalar;
+
+class Time64Type;
+using Time64Array = NumericArray<Time64Type>;
+using Time64Builder = NumericBuilder<Time64Type>;
+struct Time64Scalar;
+
+class TimestampType;
+using TimestampArray = NumericArray<TimestampType>;
+using TimestampBuilder = NumericBuilder<TimestampType>;
+struct TimestampScalar;
+
+class MonthIntervalType;
+using MonthIntervalArray = NumericArray<MonthIntervalType>;
+using MonthIntervalBuilder = NumericBuilder<MonthIntervalType>;
+struct MonthIntervalScalar;
+
+class DayTimeIntervalType;
+class DayTimeIntervalArray;
+class DayTimeIntervalBuilder;
+struct DayTimeIntervalScalar;
+
+class MonthDayNanoIntervalType;
+class MonthDayNanoIntervalArray;
+class MonthDayNanoIntervalBuilder;
+struct MonthDayNanoIntervalScalar;
+
+class DurationType;
+using DurationArray = NumericArray<DurationType>;
+using DurationBuilder = NumericBuilder<DurationType>;
+struct DurationScalar;
+
+class ExtensionType;
+class ExtensionArray;
+struct ExtensionScalar;
+
+class Tensor;
+class SparseTensor;
+
+// ----------------------------------------------------------------------
+
+struct Type {
+  /// \brief Main data type enumeration
+  ///
+  /// This enumeration provides a quick way to interrogate the category
+  /// of a DataType instance.
+  enum type {
+    /// A NULL type having no physical storage
+    NA = 0,
+
+    /// Boolean as 1 bit, LSB bit-packed ordering
+    BOOL = 1,
+
+    /// Unsigned 8-bit little-endian integer
+    UINT8 = 2,
+
+    /// Signed 8-bit little-endian integer
+    INT8 = 3,
+
+    /// Unsigned 16-bit little-endian integer
+    UINT16 = 4,
+
+    /// Signed 16-bit little-endian integer
+    INT16 = 5,
+
+    /// Unsigned 32-bit little-endian integer
+    UINT32 = 6,
+
+    /// Signed 32-bit little-endian integer
+    INT32 = 7,
+
+    /// Unsigned 64-bit little-endian integer
+    UINT64 = 8,
+
+    /// Signed 64-bit little-endian integer
+    INT64 = 9,
+
+    /// 2-byte floating point value
+    HALF_FLOAT = 10,
+
+    /// 4-byte floating point value
+    FLOAT = 11,
+
+    /// 8-byte floating point value
+    DOUBLE = 12,
+
+    /// UTF8 variable-length string as List<Char>
+    STRING = 13,
+
+    /// Variable-length bytes (no guarantee of UTF8-ness)
+    BINARY = 14,
+
+    /// Fixed-size binary. Each value occupies the same number of bytes
+    FIXED_SIZE_BINARY = 15,
+
+    /// int32_t days since the UNIX epoch
+    DATE32 = 16,
+
+    /// int64_t milliseconds since the UNIX epoch
+    DATE64 = 17,
+
+    /// Exact timestamp encoded with int64 since UNIX epoch
+    /// Default unit millisecond
+    TIMESTAMP = 18,
+
+    /// Time as signed 32-bit integer, representing either seconds or
+    /// milliseconds since midnight
+    TIME32 = 19,
+
+    /// Time as signed 64-bit integer, representing either microseconds or
+    /// nanoseconds since midnight
+    TIME64 = 20,
+
+    /// YEAR_MONTH interval in SQL style
+    INTERVAL_MONTHS = 21,
+
+    /// DAY_TIME interval in SQL style
+    INTERVAL_DAY_TIME = 22,
+
+    /// Precision- and scale-based decimal type with 128 bits.
+    DECIMAL128 = 23,
+
+    /// Defined for backward-compatibility.
+    DECIMAL = DECIMAL128,
+
+    /// Precision- and scale-based decimal type with 256 bits.
+    DECIMAL256 = 24,
+
+    /// A list of some logical data type
+    LIST = 25,
+
+    /// Struct of logical types
+    STRUCT = 26,
+
+    /// Sparse unions of logical types
+    SPARSE_UNION = 27,
+
+    /// Dense unions of logical types
+    DENSE_UNION = 28,
+
+    /// Dictionary-encoded type, also called "categorical" or "factor"
+    /// in other programming languages. Holds the dictionary value
+    /// type but not the dictionary itself, which is part of the
+    /// ArrayData struct
+    DICTIONARY = 29,
+
+    /// Map, a repeated struct logical type
+    MAP = 30,
+
+    /// Custom data type, implemented by user
+    EXTENSION = 31,
+
+    /// Fixed size list of some logical type
+    FIXED_SIZE_LIST = 32,
+
+    /// Measure of elapsed time in either seconds, milliseconds, microseconds
+    /// or nanoseconds.
+    DURATION = 33,
+
+    /// Like STRING, but with 64-bit offsets
+    LARGE_STRING = 34,
+
+    /// Like BINARY, but with 64-bit offsets
+    LARGE_BINARY = 35,
+
+    /// Like LIST, but with 64-bit offsets
+    LARGE_LIST = 36,
+
+    /// Calendar interval type with three fields.
+    INTERVAL_MONTH_DAY_NANO = 37,
+
+    /// Run-end encoded data.
+    RUN_END_ENCODED = 38,
+
+    /// String (UTF8) view type with 4-byte prefix and inline small string
+    /// optimization
+    STRING_VIEW = 39,
+
+    /// Bytes view type with 4-byte prefix and inline small string optimization
+    BINARY_VIEW = 40,
+
+    /// A list of some logical data type represented by offset and size.
+    LIST_VIEW = 41,
+
+    /// Like LIST_VIEW, but with 64-bit offsets and sizes
+    LARGE_LIST_VIEW = 42,
+
+    /// Precision- and scale-based decimal type with 32 bits.
+    DECIMAL32 = 43,
+
+    /// Precision- and scale-based decimal type with 64 bits.
+    DECIMAL64 = 44,
+
+    // Leave this at the end
+    MAX_ID
+  };
+};
+
+/// \brief Get a vector of all type ids
+ARROW_EXPORT std::vector<Type::type> AllTypeIds();
+
+/// \defgroup type-factories Factory functions for creating data types
+///
+/// Factory functions for creating data types
+/// @{
+
+/// \brief Return a NullType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& null();
+/// \brief Return a BooleanType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& boolean();
+/// \brief Return a Int8Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& int8();
+/// \brief Return a Int16Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& int16();
+/// \brief Return a Int32Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& int32();
+/// \brief Return a Int64Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& int64();
+/// \brief Return a UInt8Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& uint8();
+/// \brief Return a UInt16Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& uint16();
+/// \brief Return a UInt32Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& uint32();
+/// \brief Return a UInt64Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& uint64();
+/// \brief Return a HalfFloatType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& float16();
+/// \brief Return a FloatType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& float32();
+/// \brief Return a DoubleType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& float64();
+/// \brief Return a StringType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& utf8();
+/// \brief Return a StringViewType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& utf8_view();
+/// \brief Return a LargeStringType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& large_utf8();
+/// \brief Return a BinaryType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& binary();
+/// \brief Return a BinaryViewType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& binary_view();
+/// \brief Return a LargeBinaryType instance
+ARROW_EXPORT const std::shared_ptr<DataType>& large_binary();
+/// \brief Return a Date32Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& date32();
+/// \brief Return a Date64Type instance
+ARROW_EXPORT const std::shared_ptr<DataType>& date64();
+
+/// \brief Create a FixedSizeBinaryType instance.
+ARROW_EXPORT
+std::shared_ptr<DataType> fixed_size_binary(int32_t byte_width);
+
+/// \brief Create a DecimalType instance depending on the precision
+///
+/// If the precision is greater than 38, a Decimal256Type is returned,
+/// otherwise a Decimal128Type.
+///
+/// Deprecated: prefer `smallest_decimal` instead.
+ARROW_DEPRECATED("Deprecated in 18.0. Use `smallest_decimal` instead")
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal(int32_t precision, int32_t scale);
+
+/// \brief Create a the smallest DecimalType instance depending on precision
+///
+/// Given the requested precision and scale, the smallest DecimalType which
+/// is able to represent that precision will be returned. As different
+/// bit-widths for decimal types are added, the concrete data type returned
+/// here can potentially change accordingly.
+ARROW_EXPORT
+std::shared_ptr<DataType> smallest_decimal(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal32Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal32(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal64Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal64(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal128Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal128(int32_t precision, int32_t scale);
+
+/// \brief Create a Decimal256Type instance
+ARROW_EXPORT
+std::shared_ptr<DataType> decimal256(int32_t precision, int32_t scale);
+
+/// \brief Create a ListType instance from its child Field type
+ARROW_EXPORT
+std::shared_ptr<DataType> list(std::shared_ptr<Field> value_type);
+
+/// \brief Create a ListType instance from its child DataType
+ARROW_EXPORT
+std::shared_ptr<DataType> list(std::shared_ptr<DataType> value_type);
+
+/// \brief Create a LargeListType instance from its child Field type
+ARROW_EXPORT
+std::shared_ptr<DataType> large_list(std::shared_ptr<Field> value_type);
+
+/// \brief Create a LargeListType instance from its child DataType
+ARROW_EXPORT
+std::shared_ptr<DataType> large_list(std::shared_ptr<DataType> value_type);
+
+/// \brief Create a ListViewType instance
+ARROW_EXPORT std::shared_ptr<DataType> list_view(std::shared_ptr<DataType> value_type);
+
+/// \brief Create a ListViewType instance from its child Field type
+ARROW_EXPORT std::shared_ptr<DataType> list_view(std::shared_ptr<Field> value_type);
+
+/// \brief Create a LargetListViewType instance
+ARROW_EXPORT std::shared_ptr<DataType> large_list_view(
+    std::shared_ptr<DataType> value_type);
+
+/// \brief Create a LargetListViewType instance from its child Field type
+ARROW_EXPORT std::shared_ptr<DataType> large_list_view(std::shared_ptr<Field> value_type);
+
+/// \brief Create a MapType instance from its key and value DataTypes
+ARROW_EXPORT
+std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type,
+                              std::shared_ptr<DataType> item_type,
+                              bool keys_sorted = false);
+
+/// \brief Create a MapType instance from its key DataType and value field.
+///
+/// The field override is provided to communicate nullability of the value.
+ARROW_EXPORT
+std::shared_ptr<DataType> map(std::shared_ptr<DataType> key_type,
+                              std::shared_ptr<Field> item_field,
+                              bool keys_sorted = false);
+
+/// \brief Create a FixedSizeListType instance from its child Field type
+ARROW_EXPORT
+std::shared_ptr<DataType> fixed_size_list(std::shared_ptr<Field> value_type,
+                                          int32_t list_size);
+
+/// \brief Create a FixedSizeListType instance from its child DataType
+ARROW_EXPORT
+std::shared_ptr<DataType> fixed_size_list(std::shared_ptr<DataType> value_type,
+                                          int32_t list_size);
+/// \brief Return a Duration instance (naming use _type to avoid namespace conflict with
+/// built in time classes).
+ARROW_EXPORT std::shared_ptr<DataType> duration(TimeUnit::type unit);
+
+/// \brief Return a DayTimeIntervalType instance
+ARROW_EXPORT std::shared_ptr<DataType> day_time_interval();
+
+/// \brief Return a MonthIntervalType instance
+ARROW_EXPORT std::shared_ptr<DataType> month_interval();
+
+/// \brief Return a MonthDayNanoIntervalType instance
+ARROW_EXPORT std::shared_ptr<DataType> month_day_nano_interval();
+
+/// \brief Create a TimestampType instance from its unit
+ARROW_EXPORT
+std::shared_ptr<DataType> timestamp(TimeUnit::type unit);
+
+/// \brief Create a TimestampType instance from its unit and timezone
+ARROW_EXPORT
+std::shared_ptr<DataType> timestamp(TimeUnit::type unit, const std::string& timezone);
+
+/// \brief Create a 32-bit time type instance
+///
+/// Unit can be either SECOND or MILLI
+ARROW_EXPORT std::shared_ptr<DataType> time32(TimeUnit::type unit);
+
+/// \brief Create a 64-bit time type instance
+///
+/// Unit can be either MICRO or NANO
+ARROW_EXPORT std::shared_ptr<DataType> time64(TimeUnit::type unit);
+
+/// \brief Create a StructType instance
+ARROW_EXPORT std::shared_ptr<DataType> struct_(const FieldVector& fields);
+
+/// \brief Create a StructType instance from (name, type) pairs
+ARROW_EXPORT std::shared_ptr<DataType> struct_(
+    std::initializer_list<std::pair<std::string, std::shared_ptr<DataType>>> fields);
+
+/// \brief Create a RunEndEncodedType instance
+ARROW_EXPORT std::shared_ptr<DataType> run_end_encoded(
+    std::shared_ptr<DataType> run_end_type, std::shared_ptr<DataType> value_type);
+
+/// \brief Create a SparseUnionType instance
+ARROW_EXPORT std::shared_ptr<DataType> sparse_union(FieldVector child_fields,
+                                                    std::vector<int8_t> type_codes = {});
+/// \brief Create a SparseUnionType instance
+ARROW_EXPORT std::shared_ptr<DataType> sparse_union(
+    const ArrayVector& children, std::vector<std::string> field_names = {},
+    std::vector<int8_t> type_codes = {});
+
+/// \brief Create a DenseUnionType instance
+ARROW_EXPORT std::shared_ptr<DataType> dense_union(FieldVector child_fields,
+                                                   std::vector<int8_t> type_codes = {});
+/// \brief Create a DenseUnionType instance
+ARROW_EXPORT std::shared_ptr<DataType> dense_union(
+    const ArrayVector& children, std::vector<std::string> field_names = {},
+    std::vector<int8_t> type_codes = {});
+
+/// \brief Create a DictionaryType instance
+/// \param[in] index_type the type of the dictionary indices (must be
+/// a signed integer)
+/// \param[in] dict_type the type of the values in the variable dictionary
+/// \param[in] ordered true if the order of the dictionary values has
+/// semantic meaning and should be preserved where possible
+ARROW_EXPORT
+std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
+                                     const std::shared_ptr<DataType>& dict_type,
+                                     bool ordered = false);
+
+/// @}
+
+/// \defgroup schema-factories Factory functions for fields and schemas
+///
+/// Factory functions for fields and schemas
+/// @{
+
+/// \brief Create a Field instance
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param nullable whether the values are nullable, default true
+/// \param metadata any custom key-value metadata, default null
+ARROW_EXPORT std::shared_ptr<Field> field(
+    std::string name, std::shared_ptr<DataType> type, bool nullable = true,
+    std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// \brief Create a Field instance with metadata
+///
+/// The field will be assumed to be nullable.
+///
+/// \param name the field name
+/// \param type the field value type
+/// \param metadata any custom key-value metadata
+ARROW_EXPORT std::shared_ptr<Field> field(
+    std::string name, std::shared_ptr<DataType> type,
+    std::shared_ptr<const KeyValueMetadata> metadata);
+
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+    FieldVector fields, std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// \brief Create a Schema instance from (name, type) pairs
+///
+/// The schema's fields will all be nullable with no associated metadata.
+///
+/// \param fields (name, type) pairs of the schema's fields
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+    std::initializer_list<std::pair<std::string, std::shared_ptr<DataType>>> fields,
+    std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// \brief Create a Schema instance
+///
+/// \param fields the schema's fields
+/// \param endianness the endianness of the data
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+    FieldVector fields, Endianness endianness,
+    std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// \brief Create a Schema instance
+///
+/// The schema's fields will all be nullable with no associated metadata.
+///
+/// \param fields (name, type) pairs of the schema's fields
+/// \param endianness the endianness of the data
+/// \param metadata any custom key-value metadata, default null
+/// \return schema shared_ptr to Schema
+ARROW_EXPORT
+std::shared_ptr<Schema> schema(
+    std::initializer_list<std::pair<std::string, std::shared_ptr<DataType>>> fields,
+    Endianness endianness, std::shared_ptr<const KeyValueMetadata> metadata = NULLPTR);
+
+/// @}
+
+/// Return the process-wide default memory pool.
+ARROW_EXPORT MemoryPool* default_memory_pool();
+
+constexpr int64_t kDefaultBufferAlignment = 64;
+
+/// \brief EXPERIMENTAL: Device type enum which matches up with C Data Device types
+enum class DeviceAllocationType : char {
+  kCPU = 1,
+  kCUDA = 2,
+  kCUDA_HOST = 3,
+  kOPENCL = 4,
+  kVULKAN = 7,
+  kMETAL = 8,
+  kVPI = 9,
+  kROCM = 10,
+  kROCM_HOST = 11,
+  kEXT_DEV = 12,
+  kCUDA_MANAGED = 13,
+  kONEAPI = 14,
+  kWEBGPU = 15,
+  kHEXAGON = 16,
+};
+constexpr int kDeviceAllocationTypeMax = 16;
+
+class DeviceAllocationTypeSet;
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/type_traits.h b/pyarrow/include/arrow/type_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b7a02e1085a3be99a4ecd301fae425ea656440d
--- /dev/null
+++ b/pyarrow/include/arrow/type_traits.h
@@ -0,0 +1,1840 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "arrow/type.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+
+//
+// Per-type id type lookup
+//
+
+template <Type::type id>
+struct TypeIdTraits {};
+
+#define TYPE_ID_TRAIT(_id, _typeclass) \
+  template <>                          \
+  struct TypeIdTraits<Type::_id> {     \
+    using Type = _typeclass;           \
+  };
+
+TYPE_ID_TRAIT(NA, NullType)
+TYPE_ID_TRAIT(BOOL, BooleanType)
+TYPE_ID_TRAIT(INT8, Int8Type)
+TYPE_ID_TRAIT(INT16, Int16Type)
+TYPE_ID_TRAIT(INT32, Int32Type)
+TYPE_ID_TRAIT(INT64, Int64Type)
+TYPE_ID_TRAIT(UINT8, UInt8Type)
+TYPE_ID_TRAIT(UINT16, UInt16Type)
+TYPE_ID_TRAIT(UINT32, UInt32Type)
+TYPE_ID_TRAIT(UINT64, UInt64Type)
+TYPE_ID_TRAIT(HALF_FLOAT, HalfFloatType)
+TYPE_ID_TRAIT(FLOAT, FloatType)
+TYPE_ID_TRAIT(DOUBLE, DoubleType)
+TYPE_ID_TRAIT(STRING, StringType)
+TYPE_ID_TRAIT(BINARY, BinaryType)
+TYPE_ID_TRAIT(LARGE_STRING, LargeStringType)
+TYPE_ID_TRAIT(LARGE_BINARY, LargeBinaryType)
+TYPE_ID_TRAIT(FIXED_SIZE_BINARY, FixedSizeBinaryType)
+TYPE_ID_TRAIT(DATE32, Date32Type)
+TYPE_ID_TRAIT(DATE64, Date64Type)
+TYPE_ID_TRAIT(TIME32, Time32Type)
+TYPE_ID_TRAIT(TIME64, Time64Type)
+TYPE_ID_TRAIT(TIMESTAMP, TimestampType)
+TYPE_ID_TRAIT(INTERVAL_DAY_TIME, DayTimeIntervalType)
+TYPE_ID_TRAIT(INTERVAL_MONTH_DAY_NANO, MonthDayNanoIntervalType)
+TYPE_ID_TRAIT(INTERVAL_MONTHS, MonthIntervalType)
+TYPE_ID_TRAIT(DURATION, DurationType)
+TYPE_ID_TRAIT(DECIMAL32, Decimal32Type)
+TYPE_ID_TRAIT(DECIMAL64, Decimal64Type)
+TYPE_ID_TRAIT(DECIMAL128, Decimal128Type)
+TYPE_ID_TRAIT(DECIMAL256, Decimal256Type)
+TYPE_ID_TRAIT(STRUCT, StructType)
+TYPE_ID_TRAIT(LIST, ListType)
+TYPE_ID_TRAIT(LARGE_LIST, LargeListType)
+TYPE_ID_TRAIT(FIXED_SIZE_LIST, FixedSizeListType)
+TYPE_ID_TRAIT(MAP, MapType)
+TYPE_ID_TRAIT(DENSE_UNION, DenseUnionType)
+TYPE_ID_TRAIT(SPARSE_UNION, SparseUnionType)
+TYPE_ID_TRAIT(DICTIONARY, DictionaryType)
+TYPE_ID_TRAIT(EXTENSION, ExtensionType)
+
+#undef TYPE_ID_TRAIT
+
+//
+// Per-type type traits
+//
+
+/// \addtogroup type-traits
+/// \brief Base template for type traits of Arrow data types
+/// Type traits provide various information about a type at compile time, such
+/// as the associated ArrayType, BuilderType, and ScalarType. Not all types
+/// provide all information.
+/// \tparam T An Arrow data type
+template <typename T>
+struct TypeTraits {};
+
+/// \brief Base template for type traits of C++ types
+/// \tparam T A standard C++ type
+template <typename T>
+struct CTypeTraits {};
+
+/// \addtogroup type-traits
+/// @{
+template <>
+struct TypeTraits<NullType> {
+  using ArrayType = NullArray;
+  using BuilderType = NullBuilder;
+  using ScalarType = NullScalar;
+
+  static constexpr int64_t bytes_required(int64_t) { return 0; }
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return null(); }
+};
+
+template <>
+struct TypeTraits<BooleanType> {
+  using ArrayType = BooleanArray;
+  using BuilderType = BooleanBuilder;
+  using ScalarType = BooleanScalar;
+  using CType = bool;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return bit_util::BytesForBits(elements);
+  }
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return boolean(); }
+};
+/// @}
+
+/// \addtogroup c-type-traits
+template <>
+struct CTypeTraits<bool> : public TypeTraits<BooleanType> {
+  using ArrowType = BooleanType;
+};
+
+#define PRIMITIVE_TYPE_TRAITS_DEF_(CType_, ArrowType_, ArrowArrayType, ArrowBuilderType, \
+                                   ArrowScalarType, ArrowTensorType, SingletonFn)        \
+  template <>                                                                            \
+  struct TypeTraits<ArrowType_> {                                                        \
+    using ArrayType = ArrowArrayType;                                                    \
+    using BuilderType = ArrowBuilderType;                                                \
+    using ScalarType = ArrowScalarType;                                                  \
+    using TensorType = ArrowTensorType;                                                  \
+    using CType = ArrowType_::c_type;                                                    \
+    static constexpr int64_t bytes_required(int64_t elements) {                          \
+      return elements * static_cast<int64_t>(sizeof(CType));                             \
+    }                                                                                    \
+    constexpr static bool is_parameter_free = true;                                      \
+    static inline std::shared_ptr<DataType> type_singleton() { return SingletonFn(); }   \
+  };                                                                                     \
+                                                                                         \
+  template <>                                                                            \
+  struct CTypeTraits<CType_> : public TypeTraits<ArrowType_> {                           \
+    using ArrowType = ArrowType_;                                                        \
+  };
+
+#define PRIMITIVE_TYPE_TRAITS_DEF(CType, ArrowShort, SingletonFn)             \
+  PRIMITIVE_TYPE_TRAITS_DEF_(                                                 \
+      CType, ARROW_CONCAT(ArrowShort, Type), ARROW_CONCAT(ArrowShort, Array), \
+      ARROW_CONCAT(ArrowShort, Builder), ARROW_CONCAT(ArrowShort, Scalar),    \
+      ARROW_CONCAT(ArrowShort, Tensor), SingletonFn)
+
+PRIMITIVE_TYPE_TRAITS_DEF(uint8_t, UInt8, uint8)
+PRIMITIVE_TYPE_TRAITS_DEF(int8_t, Int8, int8)
+PRIMITIVE_TYPE_TRAITS_DEF(uint16_t, UInt16, uint16)
+PRIMITIVE_TYPE_TRAITS_DEF(int16_t, Int16, int16)
+PRIMITIVE_TYPE_TRAITS_DEF(uint32_t, UInt32, uint32)
+PRIMITIVE_TYPE_TRAITS_DEF(int32_t, Int32, int32)
+PRIMITIVE_TYPE_TRAITS_DEF(uint64_t, UInt64, uint64)
+PRIMITIVE_TYPE_TRAITS_DEF(int64_t, Int64, int64)
+PRIMITIVE_TYPE_TRAITS_DEF(float, Float, float32)
+PRIMITIVE_TYPE_TRAITS_DEF(double, Double, float64)
+
+#undef PRIMITIVE_TYPE_TRAITS_DEF
+#undef PRIMITIVE_TYPE_TRAITS_DEF_
+
+/// \addtogroup type-traits
+/// @{
+template <>
+struct TypeTraits<Date64Type> {
+  using ArrayType = Date64Array;
+  using BuilderType = Date64Builder;
+  using ScalarType = Date64Scalar;
+  using CType = Date64Type::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int64_t));
+  }
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return date64(); }
+};
+
+template <>
+struct TypeTraits<Date32Type> {
+  using ArrayType = Date32Array;
+  using BuilderType = Date32Builder;
+  using ScalarType = Date32Scalar;
+  using CType = Date32Type::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int32_t));
+  }
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return date32(); }
+};
+
+template <>
+struct TypeTraits<TimestampType> {
+  using ArrayType = TimestampArray;
+  using BuilderType = TimestampBuilder;
+  using ScalarType = TimestampScalar;
+  using CType = TimestampType::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int64_t));
+  }
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DurationType> {
+  using ArrayType = DurationArray;
+  using BuilderType = DurationBuilder;
+  using ScalarType = DurationScalar;
+  using CType = DurationType::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int64_t));
+  }
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DayTimeIntervalType> {
+  using ArrayType = DayTimeIntervalArray;
+  using BuilderType = DayTimeIntervalBuilder;
+  using ScalarType = DayTimeIntervalScalar;
+  using CType = DayTimeIntervalType::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(DayTimeIntervalType::DayMilliseconds));
+  }
+  constexpr static bool is_parameter_free = true;
+  static std::shared_ptr<DataType> type_singleton() { return day_time_interval(); }
+};
+
+template <>
+struct TypeTraits<MonthDayNanoIntervalType> {
+  using ArrayType = MonthDayNanoIntervalArray;
+  using BuilderType = MonthDayNanoIntervalBuilder;
+  using ScalarType = MonthDayNanoIntervalScalar;
+  using CType = MonthDayNanoIntervalType::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements *
+           static_cast<int64_t>(sizeof(MonthDayNanoIntervalType::MonthDayNanos));
+  }
+  constexpr static bool is_parameter_free = true;
+  static std::shared_ptr<DataType> type_singleton() { return month_day_nano_interval(); }
+};
+
+template <>
+struct TypeTraits<MonthIntervalType> {
+  using ArrayType = MonthIntervalArray;
+  using BuilderType = MonthIntervalBuilder;
+  using ScalarType = MonthIntervalScalar;
+  using CType = MonthIntervalType::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int32_t));
+  }
+  constexpr static bool is_parameter_free = true;
+  static std::shared_ptr<DataType> type_singleton() { return month_interval(); }
+};
+
+template <>
+struct TypeTraits<Time32Type> {
+  using ArrayType = Time32Array;
+  using BuilderType = Time32Builder;
+  using ScalarType = Time32Scalar;
+  using CType = Time32Type::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int32_t));
+  }
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<Time64Type> {
+  using ArrayType = Time64Array;
+  using BuilderType = Time64Builder;
+  using ScalarType = Time64Scalar;
+  using CType = Time64Type::c_type;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(int64_t));
+  }
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<HalfFloatType> {
+  using ArrayType = HalfFloatArray;
+  using BuilderType = HalfFloatBuilder;
+  using ScalarType = HalfFloatScalar;
+  using TensorType = HalfFloatTensor;
+  using CType = uint16_t;
+
+  static constexpr int64_t bytes_required(int64_t elements) {
+    return elements * static_cast<int64_t>(sizeof(uint16_t));
+  }
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return float16(); }
+};
+
+template <>
+struct CTypeTraits<util::Float16> : public TypeTraits<HalfFloatType> {
+  using ArrowType = HalfFloatType;
+};
+
+template <>
+struct TypeTraits<Decimal32Type> {
+  using ArrayType = Decimal32Array;
+  using BuilderType = Decimal32Builder;
+  using ScalarType = Decimal32Scalar;
+  using CType = Decimal32;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<Decimal64Type> {
+  using ArrayType = Decimal64Array;
+  using BuilderType = Decimal64Builder;
+  using ScalarType = Decimal64Scalar;
+  using CType = Decimal64;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<Decimal128Type> {
+  using ArrayType = Decimal128Array;
+  using BuilderType = Decimal128Builder;
+  using ScalarType = Decimal128Scalar;
+  using CType = Decimal128;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<Decimal256Type> {
+  using ArrayType = Decimal256Array;
+  using BuilderType = Decimal256Builder;
+  using ScalarType = Decimal256Scalar;
+  using CType = Decimal256;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<BinaryType> {
+  using ArrayType = BinaryArray;
+  using BuilderType = BinaryBuilder;
+  using ScalarType = BinaryScalar;
+  using OffsetType = Int32Type;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return binary(); }
+};
+
+template <>
+struct TypeTraits<BinaryViewType> {
+  using ArrayType = BinaryViewArray;
+  using BuilderType = BinaryViewBuilder;
+  using ScalarType = BinaryViewScalar;
+  using CType = BinaryViewType::c_type;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return binary_view(); }
+};
+
+template <>
+struct TypeTraits<LargeBinaryType> {
+  using ArrayType = LargeBinaryArray;
+  using BuilderType = LargeBinaryBuilder;
+  using ScalarType = LargeBinaryScalar;
+  using OffsetType = Int64Type;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return large_binary(); }
+};
+
+template <>
+struct TypeTraits<FixedSizeBinaryType> {
+  using ArrayType = FixedSizeBinaryArray;
+  using BuilderType = FixedSizeBinaryBuilder;
+  using ScalarType = FixedSizeBinaryScalar;
+  // FixedSizeBinary doesn't have offsets per se, but string length is int32 sized
+  using OffsetType = Int32Type;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<StringType> {
+  using ArrayType = StringArray;
+  using BuilderType = StringBuilder;
+  using ScalarType = StringScalar;
+  using OffsetType = Int32Type;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return utf8(); }
+};
+
+template <>
+struct TypeTraits<StringViewType> {
+  using ArrayType = StringViewArray;
+  using BuilderType = StringViewBuilder;
+  using ScalarType = StringViewScalar;
+  using CType = BinaryViewType::c_type;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return utf8_view(); }
+};
+
+template <>
+struct TypeTraits<LargeStringType> {
+  using ArrayType = LargeStringArray;
+  using BuilderType = LargeStringBuilder;
+  using ScalarType = LargeStringScalar;
+  using OffsetType = Int64Type;
+  constexpr static bool is_parameter_free = true;
+  static inline std::shared_ptr<DataType> type_singleton() { return large_utf8(); }
+};
+
+template <>
+struct TypeTraits<RunEndEncodedType> {
+  using ArrayType = RunEndEncodedArray;
+  using BuilderType = RunEndEncodedBuilder;
+  using ScalarType = RunEndEncodedScalar;
+
+  constexpr static bool is_parameter_free = false;
+};
+
+/// @}
+
+/// \addtogroup c-type-traits
+/// @{
+template <>
+struct CTypeTraits<std::string> : public TypeTraits<StringType> {
+  using ArrowType = StringType;
+};
+
+template <>
+struct CTypeTraits<BinaryViewType::c_type> : public TypeTraits<BinaryViewType> {
+  using ArrowType = BinaryViewType;
+};
+
+template <>
+struct CTypeTraits<const char*> : public CTypeTraits<std::string> {};
+
+template <size_t N>
+struct CTypeTraits<const char (&)[N]> : public CTypeTraits<std::string> {};
+
+template <>
+struct CTypeTraits<DayTimeIntervalType::DayMilliseconds>
+    : public TypeTraits<DayTimeIntervalType> {
+  using ArrowType = DayTimeIntervalType;
+};
+/// @}
+
+/// \addtogroup type-traits
+/// @{
+template <>
+struct TypeTraits<ListType> {
+  using ArrayType = ListArray;
+  using BuilderType = ListBuilder;
+  using ScalarType = ListScalar;
+  using OffsetType = Int32Type;
+  using OffsetArrayType = Int32Array;
+  using OffsetBuilderType = Int32Builder;
+  using OffsetScalarType = Int32Scalar;
+  constexpr static bool is_parameter_free = false;
+  using LargeType = LargeListType;
+};
+
+template <>
+struct TypeTraits<LargeListType> {
+  using ArrayType = LargeListArray;
+  using BuilderType = LargeListBuilder;
+  using ScalarType = LargeListScalar;
+  using OffsetType = Int64Type;
+  using OffsetArrayType = Int64Array;
+  using OffsetBuilderType = Int64Builder;
+  using OffsetScalarType = Int64Scalar;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<ListViewType> {
+  using ArrayType = ListViewArray;
+  using BuilderType = ListViewBuilder;
+  using ScalarType = ListViewScalar;
+  using OffsetType = Int32Type;
+  using OffsetArrayType = Int32Array;
+  using OffsetBuilderType = Int32Builder;
+  using OffsetScalarType = Int32Scalar;
+  constexpr static bool is_parameter_free = false;
+  using LargeType = LargeListViewType;
+};
+
+template <>
+struct TypeTraits<LargeListViewType> {
+  using ArrayType = LargeListViewArray;
+  using BuilderType = LargeListViewBuilder;
+  using ScalarType = LargeListViewScalar;
+  using OffsetType = Int64Type;
+  using OffsetArrayType = Int64Array;
+  using OffsetBuilderType = Int64Builder;
+  using OffsetScalarType = Int64Scalar;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<MapType> {
+  using ArrayType = MapArray;
+  using BuilderType = MapBuilder;
+  using ScalarType = MapScalar;
+  using OffsetType = Int32Type;
+  using OffsetArrayType = Int32Array;
+  using OffsetBuilderType = Int32Builder;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<FixedSizeListType> {
+  using ArrayType = FixedSizeListArray;
+  using BuilderType = FixedSizeListBuilder;
+  using ScalarType = FixedSizeListScalar;
+  constexpr static bool is_parameter_free = false;
+};
+/// @}
+
+/// \addtogroup c-type-traits
+template <typename CType>
+struct CTypeTraits<std::vector<CType>> : public TypeTraits<ListType> {
+  using ArrowType = ListType;
+
+  static inline std::shared_ptr<DataType> type_singleton() {
+    return list(CTypeTraits<CType>::type_singleton());
+  }
+};
+
+/// \addtogroup c-type-traits
+template <typename CType, std::size_t N>
+struct CTypeTraits<std::array<CType, N>> : public TypeTraits<FixedSizeListType> {
+  using ArrowType = FixedSizeListType;
+
+  static auto type_singleton() {
+    return fixed_size_list(CTypeTraits<CType>::type_singleton(), N);
+  }
+};
+
+/// \addtogroup type-traits
+/// @{
+template <>
+struct TypeTraits<StructType> {
+  using ArrayType = StructArray;
+  using BuilderType = StructBuilder;
+  using ScalarType = StructScalar;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<SparseUnionType> {
+  using ArrayType = SparseUnionArray;
+  using BuilderType = SparseUnionBuilder;
+  using ScalarType = SparseUnionScalar;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DenseUnionType> {
+  using ArrayType = DenseUnionArray;
+  using BuilderType = DenseUnionBuilder;
+  using ScalarType = DenseUnionScalar;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<DictionaryType> {
+  using ArrayType = DictionaryArray;
+  using ScalarType = DictionaryScalar;
+  constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<ExtensionType> {
+  using ArrayType = ExtensionArray;
+  using ScalarType = ExtensionScalar;
+  constexpr static bool is_parameter_free = false;
+};
+/// @}
+
+namespace internal {
+
+template <typename... Ts>
+struct make_void {
+  using type = void;
+};
+
+template <typename... Ts>
+using void_t = typename make_void<Ts...>::type;
+
+}  // namespace internal
+
+//
+// Useful type predicates
+//
+
+/// \addtogroup type-predicates
+/// @{
+
+// only in C++14
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+
+template <typename T>
+using is_null_type = std::is_same<NullType, T>;
+
+template <typename T, typename R = void>
+using enable_if_null = enable_if_t<is_null_type<T>::value, R>;
+
+template <typename T>
+using is_boolean_type = std::is_same<BooleanType, T>;
+
+template <typename T, typename R = void>
+using enable_if_boolean = enable_if_t<is_boolean_type<T>::value, R>;
+
+template <typename T>
+using is_number_type = std::is_base_of<NumberType, T>;
+
+template <typename T, typename R = void>
+using enable_if_number = enable_if_t<is_number_type<T>::value, R>;
+
+template <typename T>
+using is_integer_type = std::is_base_of<IntegerType, T>;
+
+template <typename T, typename R = void>
+using enable_if_integer = enable_if_t<is_integer_type<T>::value, R>;
+
+template <typename T>
+using is_signed_integer_type =
+    std::integral_constant<bool, is_integer_type<T>::value &&
+                                     std::is_signed<typename T::c_type>::value>;
+
+template <typename T, typename R = void>
+using enable_if_signed_integer = enable_if_t<is_signed_integer_type<T>::value, R>;
+
+template <typename T>
+using is_unsigned_integer_type =
+    std::integral_constant<bool, is_integer_type<T>::value &&
+                                     std::is_unsigned<typename T::c_type>::value>;
+
+template <typename T, typename R = void>
+using enable_if_unsigned_integer = enable_if_t<is_unsigned_integer_type<T>::value, R>;
+
+// Note this will also include HalfFloatType which is represented by a
+// non-floating point primitive (uint16_t).
+template <typename T>
+using is_floating_type = std::is_base_of<FloatingPointType, T>;
+
+template <typename T, typename R = void>
+using enable_if_floating_point = enable_if_t<is_floating_type<T>::value, R>;
+
+// Half floats are special in that they behave physically like an unsigned
+// integer.
+template <typename T>
+using is_half_float_type = std::is_same<HalfFloatType, T>;
+
+template <typename T, typename R = void>
+using enable_if_half_float = enable_if_t<is_half_float_type<T>::value, R>;
+
+// Binary Types
+
+// Base binary refers to Binary/LargeBinary/String/LargeString
+template <typename T>
+using is_base_binary_type = std::is_base_of<BaseBinaryType, T>;
+
+template <typename T, typename R = void>
+using enable_if_base_binary = enable_if_t<is_base_binary_type<T>::value, R>;
+
+// Any binary excludes string from Base binary
+template <typename T>
+using is_binary_type =
+    std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
+                                     std::is_same<LargeBinaryType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_binary = enable_if_t<is_binary_type<T>::value, R>;
+
+template <typename T>
+using is_string_type =
+    std::integral_constant<bool, std::is_same<StringType, T>::value ||
+                                     std::is_same<LargeStringType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_string = enable_if_t<is_string_type<T>::value, R>;
+
+template <typename T>
+using is_binary_view_like_type = std::is_base_of<BinaryViewType, T>;
+
+template <typename T>
+using is_binary_view_type = std::is_same<BinaryViewType, T>;
+
+template <typename T>
+using is_string_view_type = std::is_same<StringViewType, T>;
+
+template <typename T, typename R = void>
+using enable_if_binary_view_like = enable_if_t<is_binary_view_like_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_binary_view = enable_if_t<is_binary_view_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_string_view = enable_if_t<is_string_view_type<T>::value, R>;
+
+template <typename T>
+using is_string_like_type =
+    std::integral_constant<bool, is_base_binary_type<T>::value && T::is_utf8>;
+
+template <typename T, typename R = void>
+using enable_if_string_like = enable_if_t<is_string_like_type<T>::value, R>;
+
+template <typename T, typename U, typename R = void>
+using enable_if_same = enable_if_t<std::is_same<T, U>::value, R>;
+
+// Note that this also includes DecimalType
+template <typename T>
+using is_fixed_size_binary_type = std::is_base_of<FixedSizeBinaryType, T>;
+
+template <typename T, typename R = void>
+using enable_if_fixed_size_binary = enable_if_t<is_fixed_size_binary_type<T>::value, R>;
+
+// This includes primitive, dictionary, and fixed-size-binary types
+template <typename T>
+using is_fixed_width_type = std::is_base_of<FixedWidthType, T>;
+
+template <typename T, typename R = void>
+using enable_if_fixed_width_type = enable_if_t<is_fixed_width_type<T>::value, R>;
+
+template <typename T>
+using is_binary_like_type =
+    std::integral_constant<bool, (is_base_binary_type<T>::value &&
+                                  !is_string_like_type<T>::value) ||
+                                     is_fixed_size_binary_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_binary_like = enable_if_t<is_binary_like_type<T>::value, R>;
+
+template <typename T>
+using is_decimal_type = std::is_base_of<DecimalType, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal = enable_if_t<is_decimal_type<T>::value, R>;
+
+template <typename T>
+using is_decimal32_type = std::is_base_of<Decimal32Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal32 = enable_if_t<is_decimal32_type<T>::value, R>;
+
+template <typename T>
+using is_decimal64_type = std::is_base_of<Decimal64Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal64 = enable_if_t<is_decimal64_type<T>::value, R>;
+
+template <typename T>
+using is_decimal128_type = std::is_base_of<Decimal128Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal128 = enable_if_t<is_decimal128_type<T>::value, R>;
+
+template <typename T>
+using is_decimal256_type = std::is_base_of<Decimal256Type, T>;
+
+template <typename T, typename R = void>
+using enable_if_decimal256 = enable_if_t<is_decimal256_type<T>::value, R>;
+
+// Nested Types
+
+template <typename T>
+using is_nested_type = std::is_base_of<NestedType, T>;
+
+template <typename T, typename R = void>
+using enable_if_nested = enable_if_t<is_nested_type<T>::value, R>;
+
+template <typename T, typename R = void>
+using enable_if_not_nested = enable_if_t<!is_nested_type<T>::value, R>;
+
+template <typename T>
+using is_var_length_list_type =
+    std::integral_constant<bool, std::is_base_of<LargeListType, T>::value ||
+                                     std::is_base_of<ListType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_var_size_list = enable_if_t<is_var_length_list_type<T>::value, R>;
+
+// DEPRECATED use is_var_length_list_type.
+template <typename T>
+using is_base_list_type = is_var_length_list_type<T>;
+
+// DEPRECATED use enable_if_var_size_list
+template <typename T, typename R = void>
+using enable_if_base_list = enable_if_var_size_list<T, R>;
+
+template <typename T>
+using is_fixed_size_list_type = std::is_same<FixedSizeListType, T>;
+
+template <typename T, typename R = void>
+using enable_if_fixed_size_list = enable_if_t<is_fixed_size_list_type<T>::value, R>;
+
+template <typename T>
+using is_list_type =
+    std::integral_constant<bool, std::is_same<T, ListType>::value ||
+                                     std::is_same<T, LargeListType>::value ||
+                                     std::is_same<T, FixedSizeListType>::value>;
+
+template <typename T, typename R = void>
+using enable_if_list_type = enable_if_t<is_list_type<T>::value, R>;
+
+template <typename T>
+using is_list_view_type =
+    std::disjunction<std::is_same<T, ListViewType>, std::is_same<T, LargeListViewType>>;
+
+template <typename T, typename R = void>
+using enable_if_list_view = enable_if_t<is_list_view_type<T>::value, R>;
+
+template <typename T>
+using is_list_like_type =
+    std::integral_constant<bool, is_var_length_list_type<T>::value ||
+                                     is_fixed_size_list_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_list_like = enable_if_t<is_list_like_type<T>::value, R>;
+
+template <typename T>
+using is_var_length_list_like_type =
+    std::disjunction<is_var_length_list_type<T>, is_list_view_type<T>>;
+
+template <typename T, typename R = void>
+using enable_if_var_length_list_like =
+    enable_if_t<is_var_length_list_like_type<T>::value, R>;
+
+template <typename T>
+using is_struct_type = std::is_base_of<StructType, T>;
+
+template <typename T, typename R = void>
+using enable_if_struct = enable_if_t<is_struct_type<T>::value, R>;
+
+template <typename T>
+using is_union_type = std::is_base_of<UnionType, T>;
+
+template <typename T, typename R = void>
+using enable_if_union = enable_if_t<is_union_type<T>::value, R>;
+
+// TemporalTypes
+
+template <typename T>
+using is_temporal_type = std::is_base_of<TemporalType, T>;
+
+template <typename T, typename R = void>
+using enable_if_temporal = enable_if_t<is_temporal_type<T>::value, R>;
+
+template <typename T>
+using is_date_type = std::is_base_of<DateType, T>;
+
+template <typename T, typename R = void>
+using enable_if_date = enable_if_t<is_date_type<T>::value, R>;
+
+template <typename T>
+using is_time_type = std::is_base_of<TimeType, T>;
+
+template <typename T, typename R = void>
+using enable_if_time = enable_if_t<is_time_type<T>::value, R>;
+
+template <typename T>
+using is_timestamp_type = std::is_base_of<TimestampType, T>;
+
+template <typename T, typename R = void>
+using enable_if_timestamp = enable_if_t<is_timestamp_type<T>::value, R>;
+
+template <typename T>
+using is_duration_type = std::is_base_of<DurationType, T>;
+
+template <typename T, typename R = void>
+using enable_if_duration = enable_if_t<is_duration_type<T>::value, R>;
+
+template <typename T>
+using is_interval_type = std::is_base_of<IntervalType, T>;
+
+template <typename T, typename R = void>
+using enable_if_interval = enable_if_t<is_interval_type<T>::value, R>;
+
+template <typename T>
+using is_run_end_encoded_type = std::is_base_of<RunEndEncodedType, T>;
+
+template <typename T, typename R = void>
+using enable_if_run_end_encoded = enable_if_t<is_run_end_encoded_type<T>::value, R>;
+
+template <typename T>
+using is_dictionary_type = std::is_base_of<DictionaryType, T>;
+
+template <typename T, typename R = void>
+using enable_if_dictionary = enable_if_t<is_dictionary_type<T>::value, R>;
+
+template <typename T>
+using is_extension_type = std::is_base_of<ExtensionType, T>;
+
+template <typename T, typename R = void>
+using enable_if_extension = enable_if_t<is_extension_type<T>::value, R>;
+
+// Attribute differentiation
+
+template <typename T>
+using is_primitive_ctype = std::is_base_of<PrimitiveCType, T>;
+
+template <typename T, typename R = void>
+using enable_if_primitive_ctype = enable_if_t<is_primitive_ctype<T>::value, R>;
+
+template <typename T>
+using has_c_type = std::integral_constant<bool, is_primitive_ctype<T>::value ||
+                                                    is_temporal_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_has_c_type = enable_if_t<has_c_type<T>::value, R>;
+
+template <typename T>
+using has_string_view =
+    std::integral_constant<bool, std::is_same<BinaryType, T>::value ||
+                                     std::is_same<BinaryViewType, T>::value ||
+                                     std::is_same<LargeBinaryType, T>::value ||
+                                     std::is_same<StringType, T>::value ||
+                                     std::is_same<StringViewType, T>::value ||
+                                     std::is_same<LargeStringType, T>::value ||
+                                     std::is_same<FixedSizeBinaryType, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_has_string_view = enable_if_t<has_string_view<T>::value, R>;
+
+template <typename T>
+using is_8bit_int = std::integral_constant<bool, std::is_same<UInt8Type, T>::value ||
+                                                     std::is_same<Int8Type, T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_8bit_int = enable_if_t<is_8bit_int<T>::value, R>;
+
+template <typename T>
+using is_parameter_free_type =
+    std::integral_constant<bool, TypeTraits<T>::is_parameter_free>;
+
+template <typename T, typename R = void>
+using enable_if_parameter_free = enable_if_t<is_parameter_free_type<T>::value, R>;
+
+// Physical representation quirks
+
+template <typename T>
+using is_physical_signed_integer_type =
+    std::integral_constant<bool,
+                           is_signed_integer_type<T>::value ||
+                               (is_temporal_type<T>::value && has_c_type<T>::value &&
+                                std::is_integral<typename T::c_type>::value)>;
+
+template <typename T, typename R = void>
+using enable_if_physical_signed_integer =
+    enable_if_t<is_physical_signed_integer_type<T>::value, R>;
+
+template <typename T>
+using is_physical_unsigned_integer_type =
+    std::integral_constant<bool, is_unsigned_integer_type<T>::value ||
+                                     is_half_float_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_physical_unsigned_integer =
+    enable_if_t<is_physical_unsigned_integer_type<T>::value, R>;
+
+template <typename T>
+using is_physical_integer_type =
+    std::integral_constant<bool, is_physical_unsigned_integer_type<T>::value ||
+                                     is_physical_signed_integer_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_physical_integer = enable_if_t<is_physical_integer_type<T>::value, R>;
+
+// Like is_floating_type but excluding half-floats which don't have a
+// float-like c type.
+template <typename T>
+using is_physical_floating_type =
+    std::integral_constant<bool,
+                           is_floating_type<T>::value && !is_half_float_type<T>::value>;
+
+template <typename T, typename R = void>
+using enable_if_physical_floating_point =
+    enable_if_t<is_physical_floating_type<T>::value, R>;
+
+/// @}
+
+/// \addtogroup runtime-type-predicates
+/// @{
+
+/// \brief Check for an integer type (signed or unsigned)
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is an integer type one
+constexpr bool is_integer(Type::type type_id) {
+  switch (type_id) {
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a signed integer type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a signed integer type one
+constexpr bool is_signed_integer(Type::type type_id) {
+  switch (type_id) {
+    case Type::INT8:
+    case Type::INT16:
+    case Type::INT32:
+    case Type::INT64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for an unsigned integer type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is an unsigned integer type one
+constexpr bool is_unsigned_integer(Type::type type_id) {
+  switch (type_id) {
+    case Type::UINT8:
+    case Type::UINT16:
+    case Type::UINT32:
+    case Type::UINT64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a floating point type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a floating point type one
+constexpr bool is_floating(Type::type type_id) {
+  switch (type_id) {
+    case Type::HALF_FLOAT:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a physical floating point type
+///
+/// This predicate matches floating-point types, except half-float.
+constexpr bool is_physical_floating(Type::type type_id) {
+  return is_floating(type_id) && type_id != Type::HALF_FLOAT;
+}
+
+/// \brief Check for a numeric type
+///
+/// This predicate doesn't match decimals (see `is_decimal`).
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a numeric type one
+constexpr bool is_numeric(Type::type type_id) {
+  switch (type_id) {
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::HALF_FLOAT:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a decimal type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a decimal type one
+constexpr bool is_decimal(Type::type type_id) {
+  switch (type_id) {
+    case Type::DECIMAL32:
+    case Type::DECIMAL64:
+    case Type::DECIMAL128:
+    case Type::DECIMAL256:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a type that can be used as a run-end in Run-End Encoded
+/// arrays
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id can represent a run-end value
+constexpr bool is_run_end_type(Type::type type_id) {
+  switch (type_id) {
+    case Type::INT16:
+    case Type::INT32:
+    case Type::INT64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a primitive type
+///
+/// This predicate doesn't match null, decimals and binary-like types.
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a primitive type one
+constexpr bool is_primitive(Type::type type_id) {
+  switch (type_id) {
+    case Type::BOOL:
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::HALF_FLOAT:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+    case Type::DATE32:
+    case Type::DATE64:
+    case Type::TIME32:
+    case Type::TIME64:
+    case Type::TIMESTAMP:
+    case Type::DURATION:
+    case Type::INTERVAL_MONTHS:
+    case Type::INTERVAL_MONTH_DAY_NANO:
+    case Type::INTERVAL_DAY_TIME:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a base-binary-like type
+///
+/// This predicate doesn't match fixed-size binary types and will otherwise
+/// match all binary- and string-like types regardless of offset width.
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a base-binary-like type one
+constexpr bool is_base_binary_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::BINARY:
+    case Type::LARGE_BINARY:
+    case Type::STRING:
+    case Type::LARGE_STRING:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a binary-like type (i.e. with 32-bit offsets)
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a binary-like type one
+constexpr bool is_binary_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::BINARY:
+    case Type::STRING:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a large-binary-like type (i.e. with 64-bit offsets)
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a large-binary-like type one
+constexpr bool is_large_binary_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::LARGE_BINARY:
+    case Type::LARGE_STRING:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a binary (non-string) type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a binary type one
+constexpr bool is_binary(Type::type type_id) {
+  switch (type_id) {
+    case Type::BINARY:
+    case Type::LARGE_BINARY:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a binary or binary view (non-string) type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a binary type one
+constexpr bool is_binary_or_binary_view(Type::type type_id) {
+  switch (type_id) {
+    case Type::BINARY:
+    case Type::LARGE_BINARY:
+    case Type::BINARY_VIEW:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a string type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a string type one
+constexpr bool is_string(Type::type type_id) {
+  switch (type_id) {
+    case Type::STRING:
+    case Type::LARGE_STRING:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a string or string view type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a string type one
+constexpr bool is_string_or_string_view(Type::type type_id) {
+  switch (type_id) {
+    case Type::STRING:
+    case Type::LARGE_STRING:
+    case Type::STRING_VIEW:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a binary-view-like type (i.e. string view and binary view)
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a binary-view-like type one
+constexpr bool is_binary_view_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::STRING_VIEW:
+    case Type::BINARY_VIEW:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a temporal type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a temporal type one
+constexpr bool is_temporal(Type::type type_id) {
+  switch (type_id) {
+    case Type::DATE32:
+    case Type::DATE64:
+    case Type::TIME32:
+    case Type::TIME64:
+    case Type::TIMESTAMP:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a time type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a primitive type one
+constexpr bool is_time(Type::type type_id) {
+  switch (type_id) {
+    case Type::TIME32:
+    case Type::TIME64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a date type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a primitive type one
+constexpr bool is_date(Type::type type_id) {
+  switch (type_id) {
+    case Type::DATE32:
+    case Type::DATE64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for an interval type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is an interval type one
+constexpr bool is_interval(Type::type type_id) {
+  switch (type_id) {
+    case Type::INTERVAL_MONTHS:
+    case Type::INTERVAL_DAY_TIME:
+    case Type::INTERVAL_MONTH_DAY_NANO:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a dictionary type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a dictionary type one
+constexpr bool is_dictionary(Type::type type_id) { return type_id == Type::DICTIONARY; }
+
+/// \brief Check for a fixed-size-binary type
+///
+/// This predicate also matches decimals.
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a fixed-size-binary type one
+constexpr bool is_fixed_size_binary(Type::type type_id) {
+  switch (type_id) {
+    case Type::DECIMAL32:
+    case Type::DECIMAL64:
+    case Type::DECIMAL128:
+    case Type::DECIMAL256:
+    case Type::FIXED_SIZE_BINARY:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a fixed-width type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a fixed-width type one
+constexpr bool is_fixed_width(Type::type type_id) {
+  return is_primitive(type_id) || is_dictionary(type_id) || is_fixed_size_binary(type_id);
+}
+
+/// \brief Check for a variable-length list type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a variable-length list type one
+constexpr bool is_var_length_list(Type::type type_id) {
+  switch (type_id) {
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::MAP:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a list type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a list type one
+constexpr bool is_list(Type::type type_id) {
+  switch (type_id) {
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::FIXED_SIZE_LIST:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a list-like type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a list-like type one
+constexpr bool is_list_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::FIXED_SIZE_LIST:
+    case Type::MAP:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a var-length list or list-view like type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a var-length list or list-view like type
+constexpr bool is_var_length_list_like(Type::type type_id) {
+  switch (type_id) {
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::LIST_VIEW:
+    case Type::LARGE_LIST_VIEW:
+    case Type::MAP:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a list-view type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a list-view type one
+constexpr bool is_list_view(Type::type type_id) {
+  switch (type_id) {
+    case Type::LIST_VIEW:
+    case Type::LARGE_LIST_VIEW:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a nested type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a nested type one
+constexpr bool is_nested(Type::type type_id) {
+  switch (type_id) {
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::LIST_VIEW:
+    case Type::LARGE_LIST_VIEW:
+    case Type::FIXED_SIZE_LIST:
+    case Type::MAP:
+    case Type::STRUCT:
+    case Type::SPARSE_UNION:
+    case Type::DENSE_UNION:
+    case Type::RUN_END_ENCODED:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Check for a union type
+///
+/// \param[in] type_id the type-id to check
+/// \return whether type-id is a union type one
+constexpr bool is_union(Type::type type_id) {
+  switch (type_id) {
+    case Type::SPARSE_UNION:
+    case Type::DENSE_UNION:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+/// \brief Return the values bit width of a type
+///
+/// \param[in] type_id the type-id to check
+/// \return the values bit width, or 0 if the type does not have fixed-width values
+///
+/// For Type::FIXED_SIZE_BINARY, you will instead need to inspect the concrete
+/// DataType to get this information.
+constexpr int bit_width(Type::type type_id) {
+  switch (type_id) {
+    case Type::BOOL:
+      return 1;
+    case Type::UINT8:
+    case Type::INT8:
+      return 8;
+    case Type::UINT16:
+    case Type::INT16:
+      return 16;
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::DATE32:
+    case Type::TIME32:
+      return 32;
+    case Type::UINT64:
+    case Type::INT64:
+    case Type::DATE64:
+    case Type::TIME64:
+    case Type::TIMESTAMP:
+    case Type::DURATION:
+      return 64;
+
+    case Type::HALF_FLOAT:
+      return 16;
+    case Type::FLOAT:
+      return 32;
+    case Type::DOUBLE:
+      return 64;
+
+    case Type::INTERVAL_MONTHS:
+      return 32;
+    case Type::INTERVAL_DAY_TIME:
+      return 64;
+    case Type::INTERVAL_MONTH_DAY_NANO:
+      return 128;
+
+    case Type::DECIMAL32:
+      return 32;
+    case Type::DECIMAL64:
+      return 64;
+    case Type::DECIMAL128:
+      return 128;
+    case Type::DECIMAL256:
+      return 256;
+
+    default:
+      break;
+  }
+  return 0;
+}
+
+/// \brief Return the offsets bit width of a type
+///
+/// \param[in] type_id the type-id to check
+/// \return the offsets bit width, or 0 if the type does not have offsets
+constexpr int offset_bit_width(Type::type type_id) {
+  switch (type_id) {
+    case Type::STRING:
+    case Type::BINARY:
+    case Type::LIST:
+    case Type::LIST_VIEW:
+    case Type::MAP:
+    case Type::DENSE_UNION:
+      return 32;
+    case Type::LARGE_STRING:
+    case Type::LARGE_BINARY:
+    case Type::LARGE_LIST:
+    case Type::LARGE_LIST_VIEW:
+      return 64;
+    default:
+      break;
+  }
+  return 0;
+}
+
+/// \brief Get the alignment a buffer should have to be considered "value aligned"
+///
+/// Some buffers are frequently type-punned.  For example, in an int32 array the
+/// values buffer is frequently cast to int32_t*
+///
+/// This sort of punning is technically only valid if the pointer is aligned to a
+/// proper width (e.g. 4 bytes in the case of int32).  However, most modern compilers
+/// are quite permissive if we get this wrong.  Note that this alignment is something
+/// that is guaranteed by malloc (e.g. new int32_t[] will return a buffer that is 4
+/// byte aligned) or common libraries (e.g. numpy) but it is not currently guaranteed
+/// by flight (GH-32276).
+///
+/// We call this "value aligned" and this method will calculate that required alignment.
+///
+/// \param type_id the type of the array containing the buffer
+///                Note: this should be the indices type for a dictionary array since
+///                A dictionary array's buffers are indices.  It should be the storage
+///                type for an extension array.
+/// \param buffer_index the index of the buffer to check, for example 0 will typically
+///                     give you the alignment expected of the validity buffer
+/// \return the required value alignment in bytes (1 if no alignment required)
+int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index);
+
+/// \brief Check for an integer type (signed or unsigned)
+///
+/// \param[in] type the type to check
+/// \return whether type is an integer type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_integer(const DataType& type) { return is_integer(type.id()); }
+
+/// \brief Check for a signed integer type
+///
+/// \param[in] type the type to check
+/// \return whether type is a signed integer type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_signed_integer(const DataType& type) {
+  return is_signed_integer(type.id());
+}
+
+/// \brief Check for an unsigned integer type
+///
+/// \param[in] type the type to check
+/// \return whether type is an unsigned integer type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_unsigned_integer(const DataType& type) {
+  return is_unsigned_integer(type.id());
+}
+
+/// \brief Check for a floating point type
+///
+/// \param[in] type the type to check
+/// \return whether type is a floating point type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_floating(const DataType& type) { return is_floating(type.id()); }
+
+/// \brief Check for a numeric type (number except boolean type)
+///
+/// \param[in] type the type to check
+/// \return whether type is a numeric type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_numeric(const DataType& type) { return is_numeric(type.id()); }
+
+/// \brief Check for a decimal type
+///
+/// \param[in] type the type to check
+/// \return whether type is a decimal type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_decimal(const DataType& type) { return is_decimal(type.id()); }
+
+/// \brief Check for a primitive type
+///
+/// \param[in] type the type to check
+/// \return whether type is a primitive type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_primitive(const DataType& type) { return is_primitive(type.id()); }
+
+/// \brief Check for a binary or string-like type (except fixed-size binary)
+///
+/// \param[in] type the type to check
+/// \return whether type is a binary or string-like type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_base_binary_like(const DataType& type) {
+  return is_base_binary_like(type.id());
+}
+
+/// \brief Check for a binary-like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a binary-like type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_binary_like(const DataType& type) { return is_binary_like(type.id()); }
+
+/// \brief Check for a large-binary-like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a large-binary-like type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_large_binary_like(const DataType& type) {
+  return is_large_binary_like(type.id());
+}
+
+/// \brief Check for a binary type
+///
+/// \param[in] type the type to check
+/// \return whether type is a binary type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_binary(const DataType& type) { return is_binary(type.id()); }
+
+/// \brief Check for a string type
+///
+/// \param[in] type the type to check
+/// \return whether type is a string type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_string(const DataType& type) { return is_string(type.id()); }
+
+/// \brief Check for a binary-view-like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a binary-view-like type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_binary_view_like(const DataType& type) {
+  return is_binary_view_like(type.id());
+}
+
+/// \brief Check for a temporal type, including time and timestamps for each unit
+///
+/// \param[in] type the type to check
+/// \return whether type is a temporal type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_temporal(const DataType& type) { return is_temporal(type.id()); }
+
+/// \brief Check for an interval type
+///
+/// \param[in] type the type to check
+/// \return whether type is a interval type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_interval(const DataType& type) { return is_interval(type.id()); }
+
+/// \brief Check for a dictionary type
+///
+/// \param[in] type the type to check
+/// \return whether type is a dictionary type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_dictionary(const DataType& type) { return is_dictionary(type.id()); }
+
+/// \brief Check for a fixed-size-binary type
+///
+/// \param[in] type the type to check
+/// \return whether type is a fixed-size-binary type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_fixed_size_binary(const DataType& type) {
+  return is_fixed_size_binary(type.id());
+}
+
+/// \brief Check for a fixed-width type
+///
+/// \param[in] type the type to check
+/// \return whether type is a fixed-width type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_fixed_width(const DataType& type) { return is_fixed_width(type.id()); }
+
+/// \brief Check for a variable-length list type
+///
+/// \param[in] type the type to check
+/// \return whether type is a variable-length list type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_var_length_list(const DataType& type) {
+  return is_var_length_list(type.id());
+}
+
+/// \brief Check for a list-like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a list-like type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_list_like(const DataType& type) { return is_list_like(type.id()); }
+
+/// \brief Check for a var-length list or list-view like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a var-length list or list-view like type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_var_length_list_like(const DataType& type) {
+  return is_var_length_list_like(type.id());
+}
+
+/// \brief Check for a list-view type
+///
+/// \param[in] type the type to check
+/// \return whether type is a list-view type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_list_view(const DataType& type) { return is_list_view(type.id()); }
+
+/// \brief Check for a nested type
+///
+/// \param[in] type the type to check
+/// \return whether type is a nested type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_nested(const DataType& type) { return is_nested(type.id()); }
+
+/// \brief Check for a union type
+///
+/// \param[in] type the type to check
+/// \return whether type is a union type
+///
+/// Convenience for checking using the type's id
+constexpr bool is_union(const DataType& type) { return is_union(type.id()); }
+
+/// @}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/algorithm.h b/pyarrow/include/arrow/util/algorithm.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a0e6ba709d974daebf81cf9e6cdb7aa8b947cc8
--- /dev/null
+++ b/pyarrow/include/arrow/util/algorithm.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/result.h"
+
+namespace arrow {
+
+template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
+Status MaybeTransform(InputIterator first, InputIterator last, OutputIterator out,
+                      UnaryOperation unary_op) {
+  for (; first != last; ++first, (void)++out) {
+    ARROW_ASSIGN_OR_RAISE(*out, unary_op(*first));
+  }
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/align_util.h b/pyarrow/include/arrow/util/align_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..71920e49f4aa2b1d92312b4aabaffafe35d323c7
--- /dev/null
+++ b/pyarrow/include/arrow/util/align_util.h
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+
+#include "arrow/memory_pool.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/bit_util.h"
+
+namespace arrow {
+namespace internal {
+
+struct BitmapWordAlignParams {
+  int64_t leading_bits;
+  int64_t trailing_bits;
+  int64_t trailing_bit_offset;
+  const uint8_t* aligned_start;
+  int64_t aligned_bits;
+  int64_t aligned_words;
+};
+
+// Compute parameters for accessing a bitmap using aligned word instructions.
+// The returned parameters describe:
+// - a leading area of size `leading_bits` before the aligned words
+// - a word-aligned area of size `aligned_bits`
+// - a trailing area of size `trailing_bits` after the aligned words
+template <uint64_t ALIGN_IN_BYTES>
+inline BitmapWordAlignParams BitmapWordAlign(const uint8_t* data, int64_t bit_offset,
+                                             int64_t length) {
+  static_assert(bit_util::IsPowerOf2(ALIGN_IN_BYTES),
+                "ALIGN_IN_BYTES should be a positive power of two");
+  constexpr uint64_t ALIGN_IN_BITS = ALIGN_IN_BYTES * 8;
+
+  BitmapWordAlignParams p;
+
+  // Compute a "bit address" that we can align up to ALIGN_IN_BITS.
+  // We don't care about losing the upper bits since we are only interested in the
+  // difference between both addresses.
+  const uint64_t bit_addr =
+      reinterpret_cast<size_t>(data) * 8 + static_cast<uint64_t>(bit_offset);
+  const uint64_t aligned_bit_addr = bit_util::RoundUpToPowerOf2(bit_addr, ALIGN_IN_BITS);
+
+  p.leading_bits = std::min<int64_t>(length, aligned_bit_addr - bit_addr);
+  p.aligned_words = (length - p.leading_bits) / ALIGN_IN_BITS;
+  p.aligned_bits = p.aligned_words * ALIGN_IN_BITS;
+  p.trailing_bits = length - p.leading_bits - p.aligned_bits;
+  p.trailing_bit_offset = bit_offset + p.leading_bits + p.aligned_bits;
+
+  p.aligned_start = data + (bit_offset + p.leading_bits) / 8;
+  return p;
+}
+}  // namespace internal
+
+namespace util {
+
+// Functions to check if the provided Arrow object is aligned by the specified alignment
+
+/// \brief Special alignment value to use data type-specific alignment
+///
+/// If this is passed as the `alignment` in one of the CheckAlignment or EnsureAlignment
+/// functions, then the function will ensure each buffer is suitably aligned
+/// for the data type of the array.  For example, given an int32 buffer the values
+/// buffer's address must be a multiple of 4.  Given a large_string buffer the offsets
+/// buffer's address must be a multiple of 8.
+constexpr int64_t kValueAlignment = -3;
+
+/// \brief Calculate if the buffer's address is a multiple of `alignment`
+///
+/// If `alignment` is less than or equal to 0 then this method will always return true
+/// \param buffer the buffer to check
+/// \param alignment the alignment (in bytes) to check for
+ARROW_EXPORT bool CheckAlignment(const Buffer& buffer, int64_t alignment);
+/// \brief Calculate if all buffers in the array data are aligned
+///
+/// This will also check the buffers in the dictionary and any children
+/// \param array the array data to check
+/// \param alignment the alignment (in bytes) to check for
+ARROW_EXPORT bool CheckAlignment(const ArrayData& array, int64_t alignment);
+/// \brief Calculate if all buffers in the array are aligned
+///
+/// This will also check the buffers in the dictionary and any children
+/// \param array the array to check
+/// \param alignment the alignment (in bytes) to check for
+ARROW_EXPORT bool CheckAlignment(const Array& array, int64_t alignment);
+
+// Following functions require an additional boolean vector which stores the
+// alignment check bits of the constituent objects.
+// For example, needs_alignment vector for a ChunkedArray will contain the
+// check bits of the constituent Arrays.
+// The boolean vector check was introduced to minimize the repetitive checks
+// of the constituent objects during the EnsureAlignment function where certain
+// objects can be ignored for further checking if we already know that they are
+// completely aligned.
+
+/// \brief Calculate which (if any) chunks in a chunked array are unaligned
+/// \param array the array to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param needs_alignment an output vector that will store the results of the check
+///        it must be set to a valid vector.  Extra elements will be added to the end
+///        of the vector for each chunk that is checked.  `true` will be stored if
+///        the chunk is unaligned.
+/// \param offset the index of the chunk to start checking
+/// \return true if all chunks (starting at `offset`) are aligned, false otherwise
+ARROW_EXPORT bool CheckAlignment(const ChunkedArray& array, int64_t alignment,
+                                 std::vector<bool>* needs_alignment, int offset = 0);
+
+/// \brief calculate which (if any) columns in a record batch are unaligned
+/// \param batch the batch to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param needs_alignment an output vector that will store the results of the
+///        check.  It must be set to a valid vector.  Extra elements will be added
+///        to the end of the vector for each column that is checked.  `true` will be
+///        stored if the column is unaligned.
+ARROW_EXPORT bool CheckAlignment(const RecordBatch& batch, int64_t alignment,
+                                 std::vector<bool>* needs_alignment);
+
+/// \brief calculate which (if any) columns in a table are unaligned
+/// \param table the table to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param needs_alignment an output vector that will store the results of the
+///        check.  It must be set to a valid vector.  Extra elements will be added
+///        to the end of the vector for each column that is checked.  `true` will be
+///        stored if the column is unaligned.
+ARROW_EXPORT bool CheckAlignment(const Table& table, int64_t alignment,
+                                 std::vector<bool>* needs_alignment);
+
+/// \brief return a buffer that has the given alignment and the same data as the input
+/// buffer
+///
+/// If the input buffer is already aligned then this method will return the input buffer
+/// If the input buffer is not already aligned then this method will allocate a new
+/// buffer.  The alignment of the new buffer will have at least
+/// max(kDefaultBufferAlignment, alignment) bytes of alignment.
+///
+/// \param buffer the buffer to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param memory_pool a memory pool that will be used to allocate a new buffer if the
+///        input buffer is not sufficiently aligned
+ARROW_EXPORT Result<std::shared_ptr<Buffer>> EnsureAlignment(
+    std::shared_ptr<Buffer> buffer, int64_t alignment, MemoryPool* memory_pool);
+
+/// \brief return an array data where all buffers are aligned by the given alignment
+///
+/// If any input buffer is already aligned then this method will reuse that same input
+/// buffer.
+///
+/// \param array_data the array data to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param memory_pool a memory pool that will be used to allocate new buffers if any
+///        input buffer is not sufficiently aligned
+ARROW_EXPORT Result<std::shared_ptr<ArrayData>> EnsureAlignment(
+    std::shared_ptr<ArrayData> array_data, int64_t alignment, MemoryPool* memory_pool);
+
+/// \brief return an array where all buffers are aligned by the given alignment
+///
+/// If any input buffer is already aligned then this method will reuse that same input
+/// buffer.
+///
+/// \param array the array to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param memory_pool a memory pool that will be used to allocate new buffers if any
+///        input buffer is not sufficiently aligned
+ARROW_EXPORT Result<std::shared_ptr<Array>> EnsureAlignment(std::shared_ptr<Array> array,
+                                                            int64_t alignment,
+                                                            MemoryPool* memory_pool);
+
+/// \brief return a chunked array where all buffers are aligned by the given alignment
+///
+/// If any input buffer is already aligned then this method will reuse that same input
+/// buffer.
+///
+/// \param array the chunked array to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param memory_pool a memory pool that will be used to allocate new buffers if any
+///        input buffer is not sufficiently aligned
+ARROW_EXPORT Result<std::shared_ptr<ChunkedArray>> EnsureAlignment(
+    std::shared_ptr<ChunkedArray> array, int64_t alignment, MemoryPool* memory_pool);
+
+/// \brief return a record batch where all buffers are aligned by the given alignment
+///
+/// If any input buffer is already aligned then this method will reuse that same input
+/// buffer.
+///
+/// \param batch the batch to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param memory_pool a memory pool that will be used to allocate new buffers if any
+///        input buffer is not sufficiently aligned
+ARROW_EXPORT Result<std::shared_ptr<RecordBatch>> EnsureAlignment(
+    std::shared_ptr<RecordBatch> batch, int64_t alignment, MemoryPool* memory_pool);
+
+/// \brief return a table where all buffers are aligned by the given alignment
+///
+/// If any input buffer is already aligned then this method will reuse that same input
+/// buffer.
+///
+/// \param table the table to check
+/// \param alignment the alignment (in bytes) to check for
+/// \param memory_pool a memory pool that will be used to allocate new buffers if any
+///        input buffer is not sufficiently aligned
+ARROW_EXPORT Result<std::shared_ptr<Table>> EnsureAlignment(std::shared_ptr<Table> table,
+                                                            int64_t alignment,
+                                                            MemoryPool* memory_pool);
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/aligned_storage.h b/pyarrow/include/arrow/util/aligned_storage.h
new file mode 100644
index 0000000000000000000000000000000000000000..588806507039c90dc655a1a6df961151ace25502
--- /dev/null
+++ b/pyarrow/include/arrow/util/aligned_storage.h
@@ -0,0 +1,126 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstring>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/launder.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename T>
+class AlignedStorage {
+ public:
+  static constexpr bool can_memcpy = std::is_trivial<T>::value;
+
+  constexpr T* get() noexcept {
+    return arrow::internal::launder(reinterpret_cast<T*>(&data_));
+  }
+
+  constexpr const T* get() const noexcept {
+    // Use fully qualified name to avoid ambiguities with MSVC (ARROW-14800)
+    return arrow::internal::launder(reinterpret_cast<const T*>(&data_));
+  }
+
+  void destroy() noexcept {
+    if (!std::is_trivially_destructible<T>::value) {
+      get()->~T();
+    }
+  }
+
+  template <typename... A>
+  void construct(A&&... args) noexcept {
+    new (&data_) T(std::forward<A>(args)...);
+  }
+
+  template <typename V>
+  void assign(V&& v) noexcept {
+    *get() = std::forward<V>(v);
+  }
+
+  void move_construct(AlignedStorage* other) noexcept {
+    new (&data_) T(std::move(*other->get()));
+  }
+
+  void move_assign(AlignedStorage* other) noexcept { *get() = std::move(*other->get()); }
+
+  template <bool CanMemcpy = can_memcpy>
+  static typename std::enable_if<CanMemcpy>::type move_construct_several(
+      AlignedStorage* ARROW_RESTRICT src, AlignedStorage* ARROW_RESTRICT dest, size_t n,
+      size_t memcpy_length) noexcept {
+    memcpy(dest->get(), src->get(), memcpy_length * sizeof(T));
+  }
+
+  template <bool CanMemcpy = can_memcpy>
+  static typename std::enable_if<CanMemcpy>::type
+  move_construct_several_and_destroy_source(AlignedStorage* ARROW_RESTRICT src,
+                                            AlignedStorage* ARROW_RESTRICT dest, size_t n,
+                                            size_t memcpy_length) noexcept {
+    memcpy(dest->get(), src->get(), memcpy_length * sizeof(T));
+  }
+
+  template <bool CanMemcpy = can_memcpy>
+  static typename std::enable_if<!CanMemcpy>::type move_construct_several(
+      AlignedStorage* ARROW_RESTRICT src, AlignedStorage* ARROW_RESTRICT dest, size_t n,
+      size_t memcpy_length) noexcept {
+    for (size_t i = 0; i < n; ++i) {
+      new (dest[i].get()) T(std::move(*src[i].get()));
+    }
+  }
+
+  template <bool CanMemcpy = can_memcpy>
+  static typename std::enable_if<!CanMemcpy>::type
+  move_construct_several_and_destroy_source(AlignedStorage* ARROW_RESTRICT src,
+                                            AlignedStorage* ARROW_RESTRICT dest, size_t n,
+                                            size_t memcpy_length) noexcept {
+    for (size_t i = 0; i < n; ++i) {
+      new (dest[i].get()) T(std::move(*src[i].get()));
+      src[i].destroy();
+    }
+  }
+
+  static void move_construct_several(AlignedStorage* ARROW_RESTRICT src,
+                                     AlignedStorage* ARROW_RESTRICT dest,
+                                     size_t n) noexcept {
+    move_construct_several(src, dest, n, n);
+  }
+
+  static void move_construct_several_and_destroy_source(
+      AlignedStorage* ARROW_RESTRICT src, AlignedStorage* ARROW_RESTRICT dest,
+      size_t n) noexcept {
+    move_construct_several_and_destroy_source(src, dest, n, n);
+  }
+
+  static void destroy_several(AlignedStorage* p, size_t n) noexcept {
+    if (!std::is_trivially_destructible<T>::value) {
+      for (size_t i = 0; i < n; ++i) {
+        p[i].destroy();
+      }
+    }
+  }
+
+ private:
+  alignas(T) std::byte data_[sizeof(T)];
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/async_generator.h b/pyarrow/include/arrow/util/async_generator.h
new file mode 100644
index 0000000000000000000000000000000000000000..57c6d9b5dded5ae0704fecb24ec16aa1f5e071ff
--- /dev/null
+++ b/pyarrow/include/arrow/util/async_generator.h
@@ -0,0 +1,2071 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstring>
+#include <deque>
+#include <limits>
+#include <optional>
+#include <queue>
+
+#include "arrow/util/async_generator_fwd.h"
+#include "arrow/util/async_util.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/queue.h"
+#include "arrow/util/thread_pool.h"
+
+namespace arrow {
+
+// The methods in this file create, modify, and utilize AsyncGenerator which is an
+// iterator of futures.  This allows an asynchronous source (like file input) to be run
+// through a pipeline in the same way that iterators can be used to create pipelined
+// workflows.
+//
+// In order to support pipeline parallelism we introduce the concept of asynchronous
+// reentrancy. This is different than synchronous reentrancy.  With synchronous code a
+// function is reentrant if the function can be called again while a previous call to that
+// function is still running.  Unless otherwise specified none of these generators are
+// synchronously reentrant.  Care should be taken to avoid calling them in such a way (and
+// the utilities Visit/Collect/Await take care to do this).
+//
+// Asynchronous reentrancy on the other hand means the function is called again before the
+// future returned by the function is marked finished (but after the call to get the
+// future returns).  Some of these generators are async-reentrant while others (e.g.
+// those that depend on ordered processing like decompression) are not.  Read the MakeXYZ
+// function comments to determine which generators support async reentrancy.
+//
+// Note: Generators that are not asynchronously reentrant can still support readahead
+// (\see MakeSerialReadaheadGenerator).
+//
+// Readahead operators, and some other operators, may introduce queueing.  Any operators
+// that introduce buffering should detail the amount of buffering they introduce in their
+// MakeXYZ function comments.
+//
+// A generator should always be fully consumed before it is destroyed.
+// A generator should not mark a future complete with an error status or a terminal value
+//   until all outstanding futures have completed.  Generators that spawn multiple
+//   concurrent futures may need to hold onto an error while other concurrent futures wrap
+//   up.
+template <typename T>
+struct IterationTraits<AsyncGenerator<T>> {
+  /// \brief by default when iterating through a sequence of AsyncGenerator<T>,
+  /// an empty function indicates the end of iteration.
+  static AsyncGenerator<T> End() { return AsyncGenerator<T>(); }
+
+  static bool IsEnd(const AsyncGenerator<T>& val) { return !val; }
+};
+
+template <typename T>
+Future<T> AsyncGeneratorEnd() {
+  return Future<T>::MakeFinished(IterationTraits<T>::End());
+}
+
+/// returning a future that completes when all have been visited
+template <typename T, typename Visitor>
+Future<> VisitAsyncGenerator(AsyncGenerator<T> generator, Visitor visitor) {
+  struct LoopBody {
+    struct Callback {
+      Result<ControlFlow<>> operator()(const T& next) {
+        if (IsIterationEnd(next)) {
+          return Break();
+        } else {
+          auto visited = visitor(next);
+          if (visited.ok()) {
+            return Continue();
+          } else {
+            return visited;
+          }
+        }
+      }
+
+      Visitor visitor;
+    };
+
+    Future<ControlFlow<>> operator()() {
+      Callback callback{visitor};
+      auto next = generator();
+      return next.Then(std::move(callback));
+    }
+
+    AsyncGenerator<T> generator;
+    Visitor visitor;
+  };
+
+  return Loop(LoopBody{std::move(generator), std::move(visitor)});
+}
+
+/// \brief Wait for an async generator to complete, discarding results.
+template <typename T>
+Future<> DiscardAllFromAsyncGenerator(AsyncGenerator<T> generator) {
+  std::function<Status(T)> visitor = [](const T&) { return Status::OK(); };
+  return VisitAsyncGenerator(generator, visitor);
+}
+
+/// \brief Collect the results of an async generator into a vector
+template <typename T>
+Future<std::vector<T>> CollectAsyncGenerator(AsyncGenerator<T> generator) {
+  auto vec = std::make_shared<std::vector<T>>();
+  auto loop_body = [generator = std::move(generator),
+                    vec = std::move(vec)]() -> Future<ControlFlow<std::vector<T>>> {
+    auto next = generator();
+    return next.Then([vec](const T& result) -> Result<ControlFlow<std::vector<T>>> {
+      if (IsIterationEnd(result)) {
+        return Break(*vec);
+      } else {
+        vec->push_back(result);
+        return Continue();
+      }
+    });
+  };
+  return Loop(std::move(loop_body));
+}
+
+/// \see MakeMappedGenerator
+template <typename T, typename V>
+class MappingGenerator {
+ public:
+  MappingGenerator(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+      : state_(std::make_shared<State>(std::move(source), std::move(map))) {}
+
+  Future<V> operator()() {
+    auto future = Future<V>::Make();
+    bool should_trigger;
+    {
+      auto guard = state_->mutex.Lock();
+      if (state_->finished) {
+        return AsyncGeneratorEnd<V>();
+      }
+      should_trigger = state_->waiting_jobs.empty();
+      state_->waiting_jobs.push_back(future);
+    }
+    if (should_trigger) {
+      state_->source().AddCallback(Callback{state_});
+    }
+    return future;
+  }
+
+ private:
+  struct State {
+    State(AsyncGenerator<T> source, std::function<Future<V>(const T&)> map)
+        : source(std::move(source)),
+          map(std::move(map)),
+          waiting_jobs(),
+          mutex(),
+          finished(false) {}
+
+    void Purge() {
+      // This might be called by an original callback (if the source iterator fails or
+      // ends) or by a mapped callback (if the map function fails or ends prematurely).
+      // Either way it should only be called once and after finished is set so there is no
+      // need to guard access to `waiting_jobs`.
+      while (!waiting_jobs.empty()) {
+        waiting_jobs.front().MarkFinished(IterationTraits<V>::End());
+        waiting_jobs.pop_front();
+      }
+    }
+
+    AsyncGenerator<T> source;
+    std::function<Future<V>(const T&)> map;
+    std::deque<Future<V>> waiting_jobs;
+    util::Mutex mutex;
+    bool finished;
+  };
+
+  struct Callback;
+
+  struct MappedCallback {
+    void operator()(const Result<V>& maybe_next) {
+      bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+      bool should_purge = false;
+      if (end) {
+        {
+          auto guard = state->mutex.Lock();
+          should_purge = !state->finished;
+          state->finished = true;
+        }
+      }
+      sink.MarkFinished(maybe_next);
+      if (should_purge) {
+        state->Purge();
+      }
+    }
+    std::shared_ptr<State> state;
+    Future<V> sink;
+  };
+
+  struct Callback {
+    void operator()(const Result<T>& maybe_next) {
+      Future<V> sink;
+      bool end = !maybe_next.ok() || IsIterationEnd(*maybe_next);
+      bool should_purge = false;
+      bool should_trigger;
+      {
+        auto guard = state->mutex.Lock();
+        // A MappedCallback may have purged or be purging the queue;
+        // we shouldn't do anything here.
+        if (state->finished) return;
+        if (end) {
+          should_purge = !state->finished;
+          state->finished = true;
+        }
+        sink = state->waiting_jobs.front();
+        state->waiting_jobs.pop_front();
+        should_trigger = !end && !state->waiting_jobs.empty();
+      }
+      if (should_purge) {
+        state->Purge();
+      }
+      if (should_trigger) {
+        state->source().AddCallback(Callback{state});
+      }
+      if (maybe_next.ok()) {
+        const T& val = maybe_next.ValueUnsafe();
+        if (IsIterationEnd(val)) {
+          sink.MarkFinished(IterationTraits<V>::End());
+        } else {
+          Future<V> mapped_fut = state->map(val);
+          mapped_fut.AddCallback(MappedCallback{std::move(state), std::move(sink)});
+        }
+      } else {
+        sink.MarkFinished(maybe_next.status());
+      }
+    }
+
+    std::shared_ptr<State> state;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+/// \brief Create a generator that will apply the map function to each element of
+/// source.  The map function is not called on the end token.
+///
+/// Note: This function makes a copy of `map` for each item
+/// Note: Errors returned from the `map` function will be propagated
+///
+/// If the source generator is async-reentrant then this generator will be also
+template <typename T, typename MapFn,
+          typename Mapped = detail::result_of_t<MapFn(const T&)>,
+          typename V = typename EnsureFuture<Mapped>::type::ValueType>
+AsyncGenerator<V> MakeMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
+  auto map_callback = [map = std::move(map)](const T& val) mutable -> Future<V> {
+    return ToFuture(map(val));
+  };
+  return MappingGenerator<T, V>(std::move(source_generator), std::move(map_callback));
+}
+
+/// \brief Create a generator that will apply the map function to
+/// each element of source.  The map function is not called on the end
+/// token.  The result of the map function should be another
+/// generator; all these generators will then be flattened to produce
+/// a single stream of items.
+///
+/// Note: This function makes a copy of `map` for each item
+/// Note: Errors returned from the `map` function will be propagated
+///
+/// If the source generator is async-reentrant then this generator will be also
+template <typename T, typename MapFn,
+          typename Mapped = detail::result_of_t<MapFn(const T&)>,
+          typename V = typename EnsureFuture<Mapped>::type::ValueType>
+AsyncGenerator<T> MakeFlatMappedGenerator(AsyncGenerator<T> source_generator, MapFn map) {
+  return MakeConcatenatedGenerator(
+      MakeMappedGenerator(std::move(source_generator), std::move(map)));
+}
+
+/// \see MakeSequencingGenerator
+template <typename T, typename ComesAfter, typename IsNext>
+class SequencingGenerator {
+ public:
+  SequencingGenerator(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next,
+                      T initial_value)
+      : state_(std::make_shared<State>(std::move(source), std::move(compare),
+                                       std::move(is_next), std::move(initial_value))) {}
+
+  Future<T> operator()() {
+    {
+      auto guard = state_->mutex.Lock();
+      // We can send a result immediately if the top of the queue is either an
+      // error or the next item
+      if (!state_->queue.empty() &&
+          (!state_->queue.top().ok() ||
+           state_->is_next(state_->previous_value, *state_->queue.top()))) {
+        auto result = std::move(state_->queue.top());
+        if (result.ok()) {
+          state_->previous_value = *result;
+        }
+        state_->queue.pop();
+        return Future<T>::MakeFinished(result);
+      }
+      if (state_->finished) {
+        return AsyncGeneratorEnd<T>();
+      }
+      // The next item is not in the queue so we will need to wait
+      auto new_waiting_fut = Future<T>::Make();
+      state_->waiting_future = new_waiting_fut;
+      guard.Unlock();
+      state_->source().AddCallback(Callback{state_});
+      return new_waiting_fut;
+    }
+  }
+
+ private:
+  struct WrappedComesAfter {
+    bool operator()(const Result<T>& left, const Result<T>& right) {
+      if (!left.ok() || !right.ok()) {
+        // Should never happen
+        return false;
+      }
+      return compare(*left, *right);
+    }
+    ComesAfter compare;
+  };
+
+  struct State {
+    State(AsyncGenerator<T> source, ComesAfter compare, IsNext is_next, T initial_value)
+        : source(std::move(source)),
+          is_next(std::move(is_next)),
+          previous_value(std::move(initial_value)),
+          waiting_future(),
+          queue(WrappedComesAfter{compare}),
+          finished(false),
+          mutex() {}
+
+    AsyncGenerator<T> source;
+    IsNext is_next;
+    T previous_value;
+    Future<T> waiting_future;
+    std::priority_queue<Result<T>, std::vector<Result<T>>, WrappedComesAfter> queue;
+    bool finished;
+    util::Mutex mutex;
+  };
+
+  class Callback {
+   public:
+    explicit Callback(std::shared_ptr<State> state) : state_(std::move(state)) {}
+
+    void operator()(const Result<T> result) {
+      Future<T> to_deliver;
+      bool finished;
+      {
+        auto guard = state_->mutex.Lock();
+        bool ready_to_deliver = false;
+        if (!result.ok()) {
+          // Clear any cached results
+          while (!state_->queue.empty()) {
+            state_->queue.pop();
+          }
+          ready_to_deliver = true;
+          state_->finished = true;
+        } else if (IsIterationEnd<T>(result.ValueUnsafe())) {
+          ready_to_deliver = state_->queue.empty();
+          state_->finished = true;
+        } else {
+          ready_to_deliver = state_->is_next(state_->previous_value, *result);
+        }
+
+        if (ready_to_deliver && state_->waiting_future.is_valid()) {
+          to_deliver = state_->waiting_future;
+          if (result.ok()) {
+            state_->previous_value = *result;
+          }
+        } else {
+          state_->queue.push(result);
+        }
+        // Capture state_->finished so we can access it outside the mutex
+        finished = state_->finished;
+      }
+      // Must deliver result outside of the mutex
+      if (to_deliver.is_valid()) {
+        to_deliver.MarkFinished(result);
+      } else {
+        // Otherwise, if we didn't get the next item (or a terminal item), we
+        // need to keep looking
+        if (!finished) {
+          state_->source().AddCallback(Callback{state_});
+        }
+      }
+    }
+
+   private:
+    const std::shared_ptr<State> state_;
+  };
+
+  const std::shared_ptr<State> state_;
+};
+
+/// \brief Buffer an AsyncGenerator to return values in sequence order  ComesAfter
+/// and IsNext determine the sequence order.
+///
+/// ComesAfter should be a BinaryPredicate that only returns true if a comes after b
+///
+/// IsNext should be a BinaryPredicate that returns true, given `a` and `b`, only if
+/// `b` follows immediately after `a`.  It should return true given `initial_value` and
+/// `b` if `b` is the first item in the sequence.
+///
+/// This operator will queue unboundedly while waiting for the next item.  It is intended
+/// for jittery sources that might scatter an ordered sequence.  It is NOT intended to
+/// sort.  Using it to try and sort could result in excessive RAM usage.  This generator
+/// will queue up to N blocks where N is the max "out of order"ness of the source.
+///
+/// For example, if the source is 1,6,2,5,4,3 it will queue 3 blocks because 3 is 3
+/// blocks beyond where it belongs.
+///
+/// This generator is not async-reentrant but it consists only of a simple log(n)
+/// insertion into a priority queue.
+template <typename T, typename ComesAfter, typename IsNext>
+AsyncGenerator<T> MakeSequencingGenerator(AsyncGenerator<T> source_generator,
+                                          ComesAfter compare, IsNext is_next,
+                                          T initial_value) {
+  return SequencingGenerator<T, ComesAfter, IsNext>(
+      std::move(source_generator), std::move(compare), std::move(is_next),
+      std::move(initial_value));
+}
+
+/// \see MakeTransformedGenerator
+template <typename T, typename V>
+class TransformingGenerator {
+  // The transforming generator state will be referenced as an async generator but will
+  // also be referenced via callback to various futures.  If the async generator owner
+  // moves it around we need the state to be consistent for future callbacks.
+  struct TransformingGeneratorState
+      : std::enable_shared_from_this<TransformingGeneratorState> {
+    TransformingGeneratorState(AsyncGenerator<T> generator, Transformer<T, V> transformer)
+        : generator_(std::move(generator)),
+          transformer_(std::move(transformer)),
+          last_value_(),
+          finished_() {}
+
+    Future<V> operator()() {
+      while (true) {
+        auto maybe_next_result = Pump();
+        if (!maybe_next_result.ok()) {
+          return Future<V>::MakeFinished(maybe_next_result.status());
+        }
+        auto maybe_next = std::move(maybe_next_result).ValueUnsafe();
+        if (maybe_next.has_value()) {
+          return Future<V>::MakeFinished(*std::move(maybe_next));
+        }
+
+        auto next_fut = generator_();
+        // If finished already, process results immediately inside the loop to avoid
+        // stack overflow
+        if (next_fut.is_finished()) {
+          auto next_result = next_fut.result();
+          if (next_result.ok()) {
+            last_value_ = *next_result;
+          } else {
+            return Future<V>::MakeFinished(next_result.status());
+          }
+          // Otherwise, if not finished immediately, add callback to process results
+        } else {
+          auto self = this->shared_from_this();
+          return next_fut.Then([self](const T& next_result) {
+            self->last_value_ = next_result;
+            return (*self)();
+          });
+        }
+      }
+    }
+
+    // See comment on TransformingIterator::Pump
+    Result<std::optional<V>> Pump() {
+      if (!finished_ && last_value_.has_value()) {
+        ARROW_ASSIGN_OR_RAISE(TransformFlow<V> next, transformer_(*last_value_));
+        if (next.ReadyForNext()) {
+          if (IsIterationEnd(*last_value_)) {
+            finished_ = true;
+          }
+          last_value_.reset();
+        }
+        if (next.Finished()) {
+          finished_ = true;
+        }
+        if (next.HasValue()) {
+          return next.Value();
+        }
+      }
+      if (finished_) {
+        return IterationTraits<V>::End();
+      }
+      return std::nullopt;
+    }
+
+    AsyncGenerator<T> generator_;
+    Transformer<T, V> transformer_;
+    std::optional<T> last_value_;
+    bool finished_;
+  };
+
+ public:
+  explicit TransformingGenerator(AsyncGenerator<T> generator,
+                                 Transformer<T, V> transformer)
+      : state_(std::make_shared<TransformingGeneratorState>(std::move(generator),
+                                                            std::move(transformer))) {}
+
+  Future<V> operator()() { return (*state_)(); }
+
+ protected:
+  std::shared_ptr<TransformingGeneratorState> state_;
+};
+
+/// \brief Transform an async generator using a transformer function returning a new
+/// AsyncGenerator
+///
+/// The transform function here behaves exactly the same as the transform function in
+/// MakeTransformedIterator and you can safely use the same transform function to
+/// transform both synchronous and asynchronous streams.
+///
+/// This generator is not async-reentrant
+///
+/// This generator may queue up to 1 instance of T but will not delay
+template <typename T, typename V>
+AsyncGenerator<V> MakeTransformedGenerator(AsyncGenerator<T> generator,
+                                           Transformer<T, V> transformer) {
+  return TransformingGenerator<T, V>(generator, transformer);
+}
+
+/// \see MakeSerialReadaheadGenerator
+template <typename T>
+class SerialReadaheadGenerator {
+ public:
+  SerialReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+      : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+  Future<T> operator()() {
+    if (state_->first_) {
+      // Lazy generator, need to wait for the first ask to prime the pump
+      state_->first_ = false;
+      auto next = state_->source_();
+      return next.Then(Callback{state_}, ErrCallback{state_});
+    }
+
+    // This generator is not async-reentrant.  We won't be called until the last
+    // future finished so we know there is something in the queue
+    auto finished = state_->finished_.load();
+    if (finished && state_->readahead_queue_.IsEmpty()) {
+      return AsyncGeneratorEnd<T>();
+    }
+
+    std::shared_ptr<Future<T>> next;
+    if (!state_->readahead_queue_.Read(next)) {
+      return Status::UnknownError("Could not read from readahead_queue");
+    }
+
+    auto last_available = state_->spaces_available_.fetch_add(1);
+    if (last_available == 0 && !finished) {
+      // Reader idled out, we need to restart it
+      ARROW_RETURN_NOT_OK(state_->Pump(state_));
+    }
+    return *next;
+  }
+
+ private:
+  struct State {
+    State(AsyncGenerator<T> source, int max_readahead)
+        : first_(true),
+          source_(std::move(source)),
+          finished_(false),
+          // There is one extra "space" for the in-flight request
+          spaces_available_(max_readahead + 1),
+          // The SPSC queue has size-1 "usable" slots so we need to overallocate 1
+          readahead_queue_(max_readahead + 1) {}
+
+    Status Pump(const std::shared_ptr<State>& self) {
+      // Can't do readahead_queue.write(source().Then(...)) because then the
+      // callback might run immediately and add itself to the queue before this gets added
+      // to the queue messing up the order.
+      auto next_slot = std::make_shared<Future<T>>();
+      auto written = readahead_queue_.Write(next_slot);
+      if (!written) {
+        return Status::UnknownError("Could not write to readahead_queue");
+      }
+      // If this Pump is being called from a callback it is possible for the source to
+      // poll and read from the queue between the Write and this spot where we fill the
+      // value in. However, it is not possible for the future to read this value we are
+      // writing.  That is because this callback (the callback for future X) must be
+      // finished before future X is marked complete and this source is not pulled
+      // reentrantly so it will not poll for future X+1 until this callback has completed.
+      *next_slot = source_().Then(Callback{self}, ErrCallback{self});
+      return Status::OK();
+    }
+
+    // Only accessed by the consumer end
+    bool first_;
+    // Accessed by both threads
+    AsyncGenerator<T> source_;
+    std::atomic<bool> finished_;
+    // The queue has a size but it is not atomic.  We keep track of how many spaces are
+    // left in the queue here so we know if we've just written the last value and we need
+    // to stop reading ahead or if we've just read from a full queue and we need to
+    // restart reading ahead
+    std::atomic<uint32_t> spaces_available_;
+    // Needs to be a queue of shared_ptr and not Future because we set the value of the
+    // future after we add it to the queue
+    util::SpscQueue<std::shared_ptr<Future<T>>> readahead_queue_;
+  };
+
+  struct Callback {
+    Result<T> operator()(const T& next) {
+      if (IsIterationEnd(next)) {
+        state_->finished_.store(true);
+        return next;
+      }
+      auto last_available = state_->spaces_available_.fetch_sub(1);
+      if (last_available > 1) {
+        ARROW_RETURN_NOT_OK(state_->Pump(state_));
+      }
+      return next;
+    }
+
+    std::shared_ptr<State> state_;
+  };
+
+  struct ErrCallback {
+    Result<T> operator()(const Status& st) {
+      state_->finished_.store(true);
+      return st;
+    }
+
+    std::shared_ptr<State> state_;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+/// \see MakeFromFuture
+template <typename T>
+class FutureFirstGenerator {
+ public:
+  explicit FutureFirstGenerator(Future<AsyncGenerator<T>> future)
+      : state_(std::make_shared<State>(std::move(future))) {}
+
+  Future<T> operator()() {
+    if (state_->source_) {
+      return state_->source_();
+    } else {
+      auto state = state_;
+      return state_->future_.Then([state](const AsyncGenerator<T>& source) {
+        state->source_ = source;
+        return state->source_();
+      });
+    }
+  }
+
+ private:
+  struct State {
+    explicit State(Future<AsyncGenerator<T>> future) : future_(future), source_() {}
+
+    Future<AsyncGenerator<T>> future_;
+    AsyncGenerator<T> source_;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+/// \brief Transform a Future<AsyncGenerator<T>> into an AsyncGenerator<T>
+/// that waits for the future to complete as part of the first item.
+///
+/// This generator is not async-reentrant (even if the generator yielded by future is)
+///
+/// This generator does not queue
+template <typename T>
+AsyncGenerator<T> MakeFromFuture(Future<AsyncGenerator<T>> future) {
+  return FutureFirstGenerator<T>(std::move(future));
+}
+
+/// \brief Create a generator that will pull from the source into a queue.  Unlike
+/// MakeReadaheadGenerator this will not pull reentrantly from the source.
+///
+/// The source generator does not need to be async-reentrant
+///
+/// This generator is not async-reentrant (even if the source is)
+///
+/// This generator may queue up to max_readahead additional instances of T
+template <typename T>
+AsyncGenerator<T> MakeSerialReadaheadGenerator(AsyncGenerator<T> source_generator,
+                                               int max_readahead) {
+  return SerialReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
+
+/// \brief Create a generator that immediately pulls from the source
+///
+/// Typical generators do not pull from their source until they themselves
+/// are pulled.  This generator does not follow that convention and will call
+/// generator() once before it returns.  The returned generator will otherwise
+/// mirror the source.
+///
+/// This generator forwards async-reentrant pressure to the source
+/// This generator buffers one item (the first result) until it is delivered.
+template <typename T>
+AsyncGenerator<T> MakeAutoStartingGenerator(AsyncGenerator<T> generator) {
+  struct AutostartGenerator {
+    Future<T> operator()() {
+      if (first_future->is_valid()) {
+        Future<T> result = *first_future;
+        *first_future = Future<T>();
+        return result;
+      }
+      return source();
+    }
+
+    std::shared_ptr<Future<T>> first_future;
+    AsyncGenerator<T> source;
+  };
+
+  std::shared_ptr<Future<T>> first_future = std::make_shared<Future<T>>(generator());
+  return AutostartGenerator{std::move(first_future), std::move(generator)};
+}
+
+/// \see MakeReadaheadGenerator
+template <typename T>
+class ReadaheadGenerator {
+ public:
+  ReadaheadGenerator(AsyncGenerator<T> source_generator, int max_readahead)
+      : state_(std::make_shared<State>(std::move(source_generator), max_readahead)) {}
+
+  Future<T> AddMarkFinishedContinuation(Future<T> fut) {
+    auto state = state_;
+    return fut.Then(
+        [state](const T& result) -> Future<T> {
+          bool mark_finished = false;
+          {
+            auto guard = state->mutex.Lock();
+            state->MarkFinishedIfDone(result);
+            --state->num_running;
+            if (state->finished) {
+              mark_finished = state->num_running == 0;
+            }
+          }
+          if (mark_finished) {
+            state->final_future.MarkFinished();
+          }
+          return result;
+        },
+        [state](const Status& err) -> Future<T> {
+          // If there is an error we need to make sure all running
+          // tasks finish before we return the error.
+          bool mark_finished = false;
+          {
+            auto guard = state->mutex.Lock();
+            state->finished = true;
+            --state->num_running;
+            mark_finished = state->num_running == 0;
+          }
+          if (mark_finished) {
+            state->final_future.MarkFinished();
+          }
+          return state->final_future.Then([err]() -> Result<T> { return err; });
+        });
+  }
+
+  Future<T> operator()() {
+    if (state_->readahead_queue.empty()) {
+      // This is the first request, let's pump the underlying queue
+      {
+        auto guard = state_->mutex.Lock();
+        // We're going to push to the queue below, but we need
+        // to update `num_running` while we're holding the lock.
+        state_->num_running = state_->max_readahead;
+      }
+      for (int i = 0; i < state_->max_readahead; i++) {
+        auto next = state_->source_generator();
+        auto next_after_check = AddMarkFinishedContinuation(std::move(next));
+        state_->readahead_queue.push(std::move(next_after_check));
+      }
+    }
+    // Pop one and add one
+    auto result = std::move(state_->readahead_queue.front());
+    state_->readahead_queue.pop();
+    bool is_finished = false;
+    {
+      auto guard = state_->mutex.Lock();
+      is_finished = state_->finished;
+      if (!is_finished) {
+        // We're going to push to the queue below, but we need
+        // to update `num_running` while we're holding the lock.
+        ++state_->num_running;
+      }
+    }
+    if (is_finished) {
+      state_->readahead_queue.push(AsyncGeneratorEnd<T>());
+    } else {
+      auto back_of_queue = state_->source_generator();
+      auto back_of_queue_after_check =
+          AddMarkFinishedContinuation(std::move(back_of_queue));
+      state_->readahead_queue.push(std::move(back_of_queue_after_check));
+    }
+    return result;
+  }
+
+ private:
+  struct State {
+    State(AsyncGenerator<T> source_generator, int max_readahead)
+        : source_generator(std::move(source_generator)), max_readahead(max_readahead) {}
+
+    void MarkFinishedIfDone(const T& next_result) {
+      // ASSERT_HELD(mutex)
+      if (IsIterationEnd(next_result)) {
+        finished = true;
+      }
+    }
+
+    AsyncGenerator<T> source_generator;
+    int max_readahead;
+    Future<> final_future = Future<>::Make();
+    int num_running{0};    // GUARDED_BY(mutex)
+    bool finished{false};  // GUARDED_BY(mutex)
+    arrow::util::Mutex mutex;
+    std::queue<Future<T>> readahead_queue;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+/// \brief A generator where the producer pushes items on a queue.
+///
+/// No back-pressure is applied, so this generator is mostly useful when
+/// producing the values is neither CPU- nor memory-expensive (e.g. fetching
+/// filesystem metadata).
+///
+/// This generator is not async-reentrant.
+template <typename T>
+class PushGenerator {
+  struct State {
+    State() {}
+
+    util::Mutex mutex;
+    std::deque<Result<T>> result_q;
+    std::optional<Future<T>> consumer_fut;
+    bool finished = false;
+  };
+
+ public:
+  /// Producer API for PushGenerator
+  class Producer {
+   public:
+    explicit Producer(const std::shared_ptr<State>& state) : weak_state_(state) {}
+
+    /// \brief Push a value on the queue
+    ///
+    /// True is returned if the value was pushed, false if the generator is
+    /// already closed or destroyed.  If the latter, it is recommended to stop
+    /// producing any further values.
+    bool Push(Result<T> result) {
+      auto state = weak_state_.lock();
+      if (!state) {
+        // Generator was destroyed
+        return false;
+      }
+      auto lock = state->mutex.Lock();
+      if (state->finished) {
+        // Closed early
+        return false;
+      }
+      if (state->consumer_fut.has_value()) {
+        auto fut = std::move(state->consumer_fut.value());
+        state->consumer_fut.reset();
+        lock.Unlock();  // unlock before potentially invoking a callback
+        fut.MarkFinished(std::move(result));
+      } else {
+        state->result_q.push_back(std::move(result));
+      }
+      return true;
+    }
+
+    /// \brief Tell the consumer we have finished producing
+    ///
+    /// It is allowed to call this and later call Push() again ("early close").
+    /// In this case, calls to Push() after the queue is closed are silently
+    /// ignored.  This can help implementing non-trivial cancellation cases.
+    ///
+    /// True is returned on success, false if the generator is already closed
+    /// or destroyed.
+    bool Close() {
+      auto state = weak_state_.lock();
+      if (!state) {
+        // Generator was destroyed
+        return false;
+      }
+      auto lock = state->mutex.Lock();
+      if (state->finished) {
+        // Already closed
+        return false;
+      }
+      state->finished = true;
+      if (state->consumer_fut.has_value()) {
+        auto fut = std::move(state->consumer_fut.value());
+        state->consumer_fut.reset();
+        lock.Unlock();  // unlock before potentially invoking a callback
+        fut.MarkFinished(IterationTraits<T>::End());
+      }
+      return true;
+    }
+
+    /// Return whether the generator was closed or destroyed.
+    bool is_closed() const {
+      auto state = weak_state_.lock();
+      if (!state) {
+        // Generator was destroyed
+        return true;
+      }
+      auto lock = state->mutex.Lock();
+      return state->finished;
+    }
+
+   private:
+    const std::weak_ptr<State> weak_state_;
+  };
+
+  PushGenerator() : state_(std::make_shared<State>()) {}
+
+  /// Read an item from the queue
+  Future<T> operator()() const {
+    auto lock = state_->mutex.Lock();
+    assert(!state_->consumer_fut.has_value());  // Non-reentrant
+    if (!state_->result_q.empty()) {
+      auto fut = Future<T>::MakeFinished(std::move(state_->result_q.front()));
+      state_->result_q.pop_front();
+      return fut;
+    }
+    if (state_->finished) {
+      return AsyncGeneratorEnd<T>();
+    }
+    auto fut = Future<T>::Make();
+    state_->consumer_fut = fut;
+    return fut;
+  }
+
+  /// \brief Return producer-side interface
+  ///
+  /// The returned object must be used by the producer to push values on the queue.
+  /// Only a single Producer object should be instantiated.
+  Producer producer() { return Producer{state_}; }
+
+ private:
+  const std::shared_ptr<State> state_;
+};
+
+/// \brief Create a generator that pulls reentrantly from a source
+/// This generator will pull reentrantly from a source, ensuring that max_readahead
+/// requests are active at any given time.
+///
+/// The source generator must be async-reentrant
+///
+/// This generator itself is async-reentrant.
+///
+/// This generator may queue up to max_readahead instances of T
+template <typename T>
+AsyncGenerator<T> MakeReadaheadGenerator(AsyncGenerator<T> source_generator,
+                                         int max_readahead) {
+  return ReadaheadGenerator<T>(std::move(source_generator), max_readahead);
+}
+
+/// \brief Creates a generator that will yield finished futures from a vector
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeVectorGenerator(std::vector<T> vec) {
+  struct State {
+    explicit State(std::vector<T> vec_) : vec(std::move(vec_)), vec_idx(0) {}
+
+    std::vector<T> vec;
+    std::atomic<std::size_t> vec_idx;
+  };
+
+  auto state = std::make_shared<State>(std::move(vec));
+  return [state]() {
+    auto idx = state->vec_idx.fetch_add(1);
+    if (idx >= state->vec.size()) {
+      // Eagerly return memory
+      state->vec.clear();
+      return AsyncGeneratorEnd<T>();
+    }
+    return Future<T>::MakeFinished(state->vec[idx]);
+  };
+}
+
+/// \see MakeMergedGenerator
+template <typename T>
+class MergedGenerator {
+  // Note, the implementation of this class is quite complex at the moment (PRs to
+  // simplify are always welcome)
+  //
+  // Terminology is borrowed from rxjs.  This is a pull based implementation of the
+  // mergeAll operator.  The "outer subscription" refers to the async
+  // generator that the caller provided when creating this.  The outer subscription
+  // yields generators.
+  //
+  // Each of these generators is then subscribed to (up to max_subscriptions) and these
+  // are referred to as "inner subscriptions".
+  //
+  // As soon as we start we try and establish `max_subscriptions` inner subscriptions. For
+  // each inner subscription we will cache up to 1 value.  This means we may have more
+  // values than we have been asked for.  In our example, if a caller asks for one record
+  // batch we will start scanning `max_subscriptions` different files.  For each file we
+  // will only queue up to 1 batch (so a separate readahead is needed on the file if batch
+  // readahead is desired).
+  //
+  // If the caller is slow we may accumulate ready-to-deliver items.  These are stored
+  // in `delivered_jobs`.
+  //
+  // If the caller is very quick we may accumulate requests.  These are stored in
+  // `waiting_jobs`.
+  //
+  // It may be helpful to consider an example, in the scanner the outer subscription
+  // is some kind of asynchronous directory listing.  The inner subscription is
+  // then a scan on a file yielded by the directory listing.
+  //
+  // An "outstanding" request is when we have polled either the inner or outer
+  // subscription but that future hasn't completed yet.
+  //
+  // There are three possible "events" that can happen.
+  // * A caller could request the next future
+  // * An outer callback occurs when the next subscription is ready (e.g. the directory
+  //     listing has produced a new file)
+  // * An inner callback occurs when one of the inner subscriptions emits a value (e.g.
+  //     a file scan emits a record batch)
+  //
+  // Any time an event happens the logic is broken into two phases.  First, we grab the
+  // lock and modify the shared state.  While doing this we figure out what callbacks we
+  // will need to execute.  Then, we give up the lock and execute these callbacks.  It is
+  // important to execute these callbacks without the lock to avoid deadlock.
+ public:
+  explicit MergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+                           int max_subscriptions)
+      : state_(std::make_shared<State>(std::move(source), max_subscriptions)) {}
+
+  Future<T> operator()() {
+    // A caller has requested a future
+    Future<T> waiting_future;
+    std::shared_ptr<DeliveredJob> delivered_job;
+    bool mark_generator_complete = false;
+    {
+      auto guard = state_->mutex.Lock();
+      if (!state_->delivered_jobs.empty()) {
+        // If we have a job sitting around we can deliver it
+        delivered_job = std::move(state_->delivered_jobs.front());
+        state_->delivered_jobs.pop_front();
+        if (state_->IsCompleteUnlocked(guard)) {
+          // It's possible this waiting job was the only thing left to handle and
+          // we have now completed the generator.
+          mark_generator_complete = true;
+        } else {
+          // Since we had a job sitting around we also had an inner subscription
+          // that had paused.  We are going to restart this inner subscription and
+          // so there will be a new outstanding request.
+          state_->outstanding_requests++;
+        }
+      } else if (state_->broken ||
+                 (!state_->first && state_->num_running_subscriptions == 0)) {
+        // If we are broken or exhausted then prepare a terminal item but
+        // we won't complete it until we've finished.
+        Result<T> end_res = IterationEnd<T>();
+        if (!state_->final_error.ok()) {
+          end_res = state_->final_error;
+          state_->final_error = Status::OK();
+        }
+        return state_->all_finished.Then([end_res]() -> Result<T> { return end_res; });
+      } else {
+        // Otherwise we just queue the request and it will be completed when one of the
+        // ongoing inner subscriptions delivers a result
+        waiting_future = Future<T>::Make();
+        state_->waiting_jobs.push_back(std::make_shared<Future<T>>(waiting_future));
+      }
+      if (state_->first) {
+        // On the first request we are going to try and immediately fill our queue
+        // of subscriptions.  We assume we are going to be able to start them all.
+        state_->outstanding_requests +=
+            static_cast<int>(state_->active_subscriptions.size());
+        state_->num_running_subscriptions +=
+            static_cast<int>(state_->active_subscriptions.size());
+      }
+    }
+    // If we grabbed a finished item from the delivered_jobs queue then we may need
+    // to mark the generator finished or issue a request for a new item to fill in
+    // the spot we just vacated.  Notice that we issue that request to the same
+    // subscription that delivered it (deliverer).
+    if (delivered_job) {
+      if (mark_generator_complete) {
+        state_->all_finished.MarkFinished();
+      } else {
+        delivered_job->deliverer().AddCallback(
+            InnerCallback(state_, delivered_job->index));
+      }
+      return std::move(delivered_job->value);
+    }
+    // On the first call we try and fill up our subscriptions.  It's possible the outer
+    // generator only has a few items and we can't fill up to what we were hoping.  In
+    // that case we have to bail early.
+    if (state_->first) {
+      state_->first = false;
+      mark_generator_complete = false;
+      for (int i = 0; i < static_cast<int>(state_->active_subscriptions.size()); i++) {
+        state_->PullSource().AddCallback(
+            OuterCallback{state_, static_cast<std::size_t>(i)});
+        // If we have to bail early then we need to update the shared state again so
+        // we need to reacquire the lock.
+        auto guard = state_->mutex.Lock();
+        if (state_->source_exhausted) {
+          int excess_requests =
+              static_cast<int>(state_->active_subscriptions.size()) - i - 1;
+          state_->outstanding_requests -= excess_requests;
+          state_->num_running_subscriptions -= excess_requests;
+          if (excess_requests > 0) {
+            // It's possible that we are completing the generator by reducing the number
+            // of outstanding requests (e.g. this happens when the outer subscription and
+            // all inner subscriptions are synchronous)
+            mark_generator_complete = state_->IsCompleteUnlocked(guard);
+          }
+          break;
+        }
+      }
+      if (mark_generator_complete) {
+        state_->MarkFinishedAndPurge();
+      }
+    }
+    return waiting_future;
+  }
+
+ private:
+  struct DeliveredJob {
+    explicit DeliveredJob(AsyncGenerator<T> deliverer_, Result<T> value_,
+                          std::size_t index_)
+        : deliverer(deliverer_), value(std::move(value_)), index(index_) {}
+
+    // The generator that delivered this result, we will request another item
+    // from this generator once the result is delivered
+    AsyncGenerator<T> deliverer;
+    // The result we received from the generator
+    Result<T> value;
+    // The index of the generator (in active_subscriptions) that delivered this
+    // result.  This is used if we need to replace a finished generator.
+    std::size_t index;
+  };
+
+  struct State {
+    State(AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions)
+        : source(std::move(source)),
+          active_subscriptions(max_subscriptions),
+          delivered_jobs(),
+          waiting_jobs(),
+          mutex(),
+          first(true),
+          broken(false),
+          source_exhausted(false),
+          outstanding_requests(0),
+          num_running_subscriptions(0),
+          final_error(Status::OK()) {}
+
+    Future<AsyncGenerator<T>> PullSource() {
+      // Need to guard access to source() so we don't pull sync-reentrantly which
+      // is never valid.
+      auto lock = mutex.Lock();
+      return source();
+    }
+
+    void SignalErrorUnlocked(const util::Mutex::Guard& guard) {
+      broken = true;
+      // Empty any results that have arrived but not asked for.
+      while (!delivered_jobs.empty()) {
+        delivered_jobs.pop_front();
+      }
+    }
+
+    // This function is called outside the mutex but it will only ever be
+    // called once
+    void MarkFinishedAndPurge() {
+      all_finished.MarkFinished();
+      while (!waiting_jobs.empty()) {
+        waiting_jobs.front()->MarkFinished(IterationEnd<T>());
+        waiting_jobs.pop_front();
+      }
+    }
+
+    // This is called outside the mutex but it is only ever called
+    // once and Future<>::AddCallback is thread-safe
+    void MarkFinalError(const Status& err, Future<T> maybe_sink) {
+      if (maybe_sink.is_valid()) {
+        // Someone is waiting for this error so lets mark it complete when
+        // all the work is done
+        all_finished.AddCallback([maybe_sink, err](const Status& status) mutable {
+          maybe_sink.MarkFinished(err);
+        });
+      } else {
+        // No one is waiting for this error right now so it will be delivered
+        // next.
+        final_error = err;
+      }
+    }
+
+    bool IsCompleteUnlocked(const util::Mutex::Guard& guard) {
+      return outstanding_requests == 0 &&
+             (broken || (source_exhausted && num_running_subscriptions == 0 &&
+                         delivered_jobs.empty()));
+    }
+
+    bool MarkTaskFinishedUnlocked(const util::Mutex::Guard& guard) {
+      --outstanding_requests;
+      return IsCompleteUnlocked(guard);
+    }
+
+    // The outer generator.  Each item we pull from this will be its own generator
+    // and become an inner subscription
+    AsyncGenerator<AsyncGenerator<T>> source;
+    // active_subscriptions and delivered_jobs will be bounded by max_subscriptions
+    std::vector<AsyncGenerator<T>> active_subscriptions;
+    // Results delivered by the inner subscriptions that weren't yet asked for by the
+    // caller
+    std::deque<std::shared_ptr<DeliveredJob>> delivered_jobs;
+    // waiting_jobs is unbounded, reentrant pulls (e.g. AddReadahead) will provide the
+    // backpressure
+    std::deque<std::shared_ptr<Future<T>>> waiting_jobs;
+    // A future that will be marked complete when the terminal item has arrived and all
+    // outstanding futures have completed.  It is used to hold off emission of an error
+    // until all outstanding work is done.
+    Future<> all_finished = Future<>::Make();
+    util::Mutex mutex;
+    // A flag cleared when the caller firsts asks for a future.  Used to start polling.
+    bool first;
+    // A flag set when an error arrives, prevents us from issuing new requests.
+    bool broken;
+    // A flag set when the outer subscription has been exhausted.  Prevents us from
+    // pulling it further (even though it would be generally harmless) and lets us know we
+    // are finishing up.
+    bool source_exhausted;
+    // The number of futures that we have requested from either the outer or inner
+    // subscriptions that have not yet completed.  We cannot mark all_finished until this
+    // reaches 0.  This will never be greater than max_subscriptions
+    int outstanding_requests;
+    // The number of running subscriptions.  We ramp this up to `max_subscriptions` as
+    // soon as the first item is requested and then it stays at that level (each exhausted
+    // inner subscription is replaced by a new inner subscription) until the outer
+    // subscription is exhausted at which point this descends to 0 (and source_exhausted)
+    // is then set to true.
+    int num_running_subscriptions;
+    // If an error arrives, and the caller hasn't asked for that item, we store the error
+    // here.  It is analagous to delivered_jobs but for errors instead of finished
+    // results.
+    Status final_error;
+  };
+
+  struct InnerCallback {
+    InnerCallback(std::shared_ptr<State> state, std::size_t index, bool recursive = false)
+        : state(std::move(state)), index(index), recursive(recursive) {}
+
+    void operator()(const Result<T>& maybe_next_ref) {
+      // An item has been delivered by one of the inner subscriptions
+      Future<T> next_fut;
+      const Result<T>* maybe_next = &maybe_next_ref;
+
+      // When an item is delivered (and the caller has asked for it) we grab the
+      // next item from the inner subscription.  To avoid this behavior leading to an
+      // infinite loop (this can happen if the caller's callback asks for the next item)
+      // we use a while loop.
+      while (true) {
+        Future<T> sink;
+        bool sub_finished = maybe_next->ok() && IsIterationEnd(**maybe_next);
+        bool pull_next_sub = false;
+        bool was_broken = false;
+        bool should_mark_gen_complete = false;
+        bool should_mark_final_error = false;
+        {
+          auto guard = state->mutex.Lock();
+          if (state->broken) {
+            // We've errored out previously so ignore the result.  If anyone was waiting
+            // for this they will get IterationEnd when we purge
+            was_broken = true;
+          } else {
+            if (!sub_finished) {
+              // There is a result to deliver.  Either we can deliver it now or we will
+              // queue it up
+              if (state->waiting_jobs.empty()) {
+                state->delivered_jobs.push_back(std::make_shared<DeliveredJob>(
+                    state->active_subscriptions[index], *maybe_next, index));
+              } else {
+                sink = std::move(*state->waiting_jobs.front());
+                state->waiting_jobs.pop_front();
+              }
+            }
+
+            // If this is the first error then we transition the state to a broken state
+            if (!maybe_next->ok()) {
+              should_mark_final_error = true;
+              state->SignalErrorUnlocked(guard);
+            }
+          }
+
+          // If we finished this inner subscription then we need to grab a new inner
+          // subscription to take its spot.  If we can't (because we're broken or
+          // exhausted) then we aren't going to be starting any new futures and so
+          // the number of running subscriptions drops.
+          pull_next_sub = sub_finished && !state->source_exhausted && !was_broken;
+          if (sub_finished && !pull_next_sub) {
+            state->num_running_subscriptions--;
+          }
+          // There are three situations we won't pull again.  If an error occurred or we
+          // are already finished or if no one was waiting for our result and so we queued
+          // it up.  We will decrement outstanding_requests and possibly mark the
+          // generator completed.
+          if (state->broken || (!sink.is_valid() && !sub_finished) ||
+              (sub_finished && state->source_exhausted)) {
+            if (state->MarkTaskFinishedUnlocked(guard)) {
+              should_mark_gen_complete = true;
+            }
+          }
+        }
+
+        // Now we have given up the lock and we can take all the actions we decided we
+        // need to take.
+        if (should_mark_final_error) {
+          state->MarkFinalError(maybe_next->status(), std::move(sink));
+        }
+
+        if (should_mark_gen_complete) {
+          state->MarkFinishedAndPurge();
+        }
+
+        // An error occurred elsewhere so there is no need to mark any future
+        // finished (will happen during the purge) or pull from anything
+        if (was_broken) {
+          return;
+        }
+
+        if (pull_next_sub) {
+          if (recursive) {
+            was_empty = true;
+            return;
+          }
+          // We pulled an end token so we need to start a new subscription
+          // in our spot
+          state->PullSource().AddCallback(OuterCallback{state, index});
+        } else if (sink.is_valid()) {
+          // We pulled a valid result and there was someone waiting for it
+          // so lets fetch the next result from our subscription
+          sink.MarkFinished(*maybe_next);
+          next_fut = state->active_subscriptions[index]();
+          if (next_fut.TryAddCallback([this]() { return InnerCallback(state, index); })) {
+            return;
+          }
+          // Already completed. Avoid very deep recursion by looping
+          // here instead of relying on the callback.
+          maybe_next = &next_fut.result();
+          continue;
+        }
+        // else: We pulled a valid result but no one was waiting for it so
+        // we can just stop.
+        return;
+      }
+    }
+    std::shared_ptr<State> state;
+    std::size_t index;
+    bool recursive;
+    bool was_empty = false;
+  };
+
+  struct OuterCallback {
+    void operator()(const Result<AsyncGenerator<T>>& initial_maybe_next) {
+      Result<AsyncGenerator<T>> maybe_next = initial_maybe_next;
+      while (true) {
+        // We have been given a new inner subscription
+        bool should_continue = false;
+        bool should_mark_gen_complete = false;
+        bool should_deliver_error = false;
+        bool source_exhausted = maybe_next.ok() && IsIterationEnd(*maybe_next);
+        Future<T> error_sink;
+        {
+          auto guard = state->mutex.Lock();
+          if (!maybe_next.ok() || source_exhausted || state->broken) {
+            // If here then we will not pull any more from the outer source
+            if (!state->broken && !maybe_next.ok()) {
+              state->SignalErrorUnlocked(guard);
+              // If here then we are the first error so we need to deliver it
+              should_deliver_error = true;
+              if (!state->waiting_jobs.empty()) {
+                error_sink = std::move(*state->waiting_jobs.front());
+                state->waiting_jobs.pop_front();
+              }
+            }
+            if (source_exhausted) {
+              state->source_exhausted = true;
+              state->num_running_subscriptions--;
+            }
+            if (state->MarkTaskFinishedUnlocked(guard)) {
+              should_mark_gen_complete = true;
+            }
+          } else {
+            state->active_subscriptions[index] = *maybe_next;
+            should_continue = true;
+          }
+        }
+        if (should_deliver_error) {
+          state->MarkFinalError(maybe_next.status(), std::move(error_sink));
+        }
+        if (should_mark_gen_complete) {
+          state->MarkFinishedAndPurge();
+        }
+        if (should_continue) {
+          // There is a possibility that a large sequence of immediately available inner
+          // callbacks could lead to a stack overflow.  To avoid this we need to
+          // synchronously loop through inner/outer callbacks until we either find an
+          // unfinished future or we find an actual item to deliver.
+          Future<T> next_item = (*maybe_next)();
+          if (!next_item.TryAddCallback([this] { return InnerCallback(state, index); })) {
+            // By setting recursive to true we signal to the inner callback that, if it is
+            // empty, instead of adding a new outer callback, it should just immediately
+            // return, flagging was_empty so that we know we need to check the next
+            // subscription.
+            InnerCallback immediate_inner(state, index, /*recursive=*/true);
+            immediate_inner(next_item.result());
+            if (immediate_inner.was_empty) {
+              Future<AsyncGenerator<T>> next_source = state->PullSource();
+              if (next_source.TryAddCallback([this] {
+                    return OuterCallback{state, index};
+                  })) {
+                // We hit an unfinished future so we can stop looping
+                return;
+              }
+              // The current subscription was immediately and synchronously empty
+              // and we were able to synchronously pull the next subscription so we
+              // can keep looping.
+              maybe_next = next_source.result();
+              continue;
+            }
+          }
+        }
+        return;
+      }
+    }
+    std::shared_ptr<State> state;
+    std::size_t index;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+/// \brief Create a generator that takes in a stream of generators and pulls from up to
+/// max_subscriptions at a time
+///
+/// Note: This may deliver items out of sequence. For example, items from the third
+/// AsyncGenerator generated by the source may be emitted before some items from the first
+/// AsyncGenerator generated by the source.
+///
+/// This generator will pull from source async-reentrantly unless max_subscriptions is 1
+/// This generator will not pull from the individual subscriptions reentrantly.  Add
+/// readahead to the individual subscriptions if that is desired.
+/// This generator is async-reentrant
+///
+/// This generator may queue up to max_subscriptions instances of T
+template <typename T>
+AsyncGenerator<T> MakeMergedGenerator(AsyncGenerator<AsyncGenerator<T>> source,
+                                      int max_subscriptions) {
+  return MergedGenerator<T>(std::move(source), max_subscriptions);
+}
+
+template <typename T>
+Result<AsyncGenerator<T>> MakeSequencedMergedGenerator(
+    AsyncGenerator<AsyncGenerator<T>> source, int max_subscriptions) {
+  if (max_subscriptions < 0) {
+    return Status::Invalid("max_subscriptions must be a positive integer");
+  }
+  if (max_subscriptions == 1) {
+    return Status::Invalid("Use MakeConcatenatedGenerator if max_subscriptions is 1");
+  }
+  AsyncGenerator<AsyncGenerator<T>> autostarting_source = MakeMappedGenerator(
+      std::move(source),
+      [](const AsyncGenerator<T>& sub) { return MakeAutoStartingGenerator(sub); });
+  AsyncGenerator<AsyncGenerator<T>> sub_readahead =
+      MakeSerialReadaheadGenerator(std::move(autostarting_source), max_subscriptions - 1);
+  return MakeConcatenatedGenerator(std::move(sub_readahead));
+}
+
+/// \brief Create a generator that takes in a stream of generators and pulls from each
+/// one in sequence.
+///
+/// This generator is async-reentrant but will never pull from source reentrantly and
+/// will never pull from any subscription reentrantly.
+///
+/// This generator may queue 1 instance of T
+///
+/// TODO: Could potentially make a bespoke implementation instead of MergedGenerator that
+/// forwards async-reentrant requests instead of buffering them (which is what
+/// MergedGenerator does)
+template <typename T>
+AsyncGenerator<T> MakeConcatenatedGenerator(AsyncGenerator<AsyncGenerator<T>> source) {
+  return MergedGenerator<T>(std::move(source), 1);
+}
+
+/// \see MakeEnumeratedGenerator
+template <typename T>
+class EnumeratingGenerator {
+ public:
+  EnumeratingGenerator(AsyncGenerator<T> source, T initial_value)
+      : state_(std::make_shared<State>(std::move(source), std::move(initial_value))) {}
+
+  Future<Enumerated<T>> operator()() {
+    if (state_->finished) {
+      return AsyncGeneratorEnd<Enumerated<T>>();
+    } else {
+      auto state = state_;
+      return state->source().Then([state](const T& next) {
+        auto finished = IsIterationEnd<T>(next);
+        auto prev = Enumerated<T>{state->prev_value, state->prev_index, finished};
+        state->prev_value = next;
+        state->prev_index++;
+        state->finished = finished;
+        return prev;
+      });
+    }
+  }
+
+ private:
+  struct State {
+    State(AsyncGenerator<T> source, T initial_value)
+        : source(std::move(source)), prev_value(std::move(initial_value)), prev_index(0) {
+      finished = IsIterationEnd<T>(prev_value);
+    }
+
+    AsyncGenerator<T> source;
+    T prev_value;
+    int prev_index;
+    bool finished;
+  };
+
+  std::shared_ptr<State> state_;
+};
+
+/// Wrap items from a source generator with positional information
+///
+/// When used with MakeMergedGenerator and MakeSequencingGenerator this allows items to be
+/// processed in a "first-available" fashion and later resequenced which can reduce the
+/// impact of sources with erratic performance (e.g. a filesystem where some items may
+/// take longer to read than others).
+///
+/// TODO(ARROW-12371) Would require this generator be async-reentrant
+///
+/// \see MakeSequencingGenerator for an example of putting items back in order
+///
+/// This generator is not async-reentrant
+///
+/// This generator buffers one item (so it knows which item is the last item)
+template <typename T>
+AsyncGenerator<Enumerated<T>> MakeEnumeratedGenerator(AsyncGenerator<T> source) {
+  return FutureFirstGenerator<Enumerated<T>>(
+      source().Then([source](const T& initial_value) -> AsyncGenerator<Enumerated<T>> {
+        return EnumeratingGenerator<T>(std::move(source), initial_value);
+      }));
+}
+
+/// \see MakeTransferredGenerator
+template <typename T>
+class TransferringGenerator {
+ public:
+  explicit TransferringGenerator(AsyncGenerator<T> source, internal::Executor* executor)
+      : source_(std::move(source)), executor_(executor) {}
+
+  Future<T> operator()() { return executor_->Transfer(source_()); }
+
+ private:
+  AsyncGenerator<T> source_;
+  internal::Executor* executor_;
+};
+
+/// \brief Transfer a future to an underlying executor.
+///
+/// Continuations run on the returned future will be run on the given executor
+/// if they cannot be run synchronously.
+///
+/// This is often needed to move computation off I/O threads or other external
+/// completion sources and back on to the CPU executor so the I/O thread can
+/// stay busy and focused on I/O
+///
+/// Keep in mind that continuations called on an already completed future will
+/// always be run synchronously and so no transfer will happen in that case.
+///
+/// This generator is async reentrant if the source is
+///
+/// This generator will not queue
+template <typename T>
+AsyncGenerator<T> MakeTransferredGenerator(AsyncGenerator<T> source,
+                                           internal::Executor* executor) {
+  return TransferringGenerator<T>(std::move(source), executor);
+}
+
+/// \see MakeBackgroundGenerator
+template <typename T>
+class BackgroundGenerator {
+ public:
+  explicit BackgroundGenerator(Iterator<T> it, internal::Executor* io_executor, int max_q,
+                               int q_restart)
+      : state_(std::make_shared<State>(io_executor, std::move(it), max_q, q_restart)),
+        cleanup_(std::make_shared<Cleanup>(state_.get())) {}
+
+  Future<T> operator()() {
+    auto guard = state_->mutex.Lock();
+    Future<T> waiting_future;
+    if (state_->queue.empty()) {
+      if (state_->finished) {
+        return AsyncGeneratorEnd<T>();
+      } else {
+        waiting_future = Future<T>::Make();
+        state_->waiting_future = waiting_future;
+      }
+    } else {
+      auto next = Future<T>::MakeFinished(std::move(state_->queue.front()));
+      state_->queue.pop();
+      if (state_->NeedsRestart()) {
+        return state_->RestartTask(state_, std::move(guard), std::move(next));
+      }
+      return next;
+    }
+    // This should only trigger the very first time this method is called
+    if (state_->NeedsRestart()) {
+      return state_->RestartTask(state_, std::move(guard), std::move(waiting_future));
+    }
+    return waiting_future;
+  }
+
+ protected:
+  static constexpr uint64_t kUnlikelyThreadId{std::numeric_limits<uint64_t>::max()};
+
+  struct State {
+    State(internal::Executor* io_executor, Iterator<T> it, int max_q, int q_restart)
+        : io_executor(io_executor),
+          max_q(max_q),
+          q_restart(q_restart),
+          it(std::move(it)),
+          reading(false),
+          finished(false),
+          should_shutdown(false) {}
+
+    void ClearQueue() {
+      while (!queue.empty()) {
+        queue.pop();
+      }
+    }
+
+    bool TaskIsRunning() const { return task_finished.is_valid(); }
+
+    bool NeedsRestart() const {
+      return !finished && !reading && static_cast<int>(queue.size()) <= q_restart;
+    }
+
+    void DoRestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard) {
+      // If we get here we are actually going to start a new task so let's create a
+      // task_finished future for it
+      state->task_finished = Future<>::Make();
+      state->reading = true;
+      auto spawn_status = io_executor->Spawn(
+          [state]() { BackgroundGenerator::WorkerTask(std::move(state)); });
+      if (!spawn_status.ok()) {
+        // If we can't spawn a new task then send an error to the consumer (either via a
+        // waiting future or the queue) and mark ourselves finished
+        state->finished = true;
+        state->task_finished = Future<>();
+        if (waiting_future.has_value()) {
+          auto to_deliver = std::move(waiting_future.value());
+          waiting_future.reset();
+          guard.Unlock();
+          to_deliver.MarkFinished(spawn_status);
+        } else {
+          ClearQueue();
+          queue.push(spawn_status);
+        }
+      }
+    }
+
+    Future<T> RestartTask(std::shared_ptr<State> state, util::Mutex::Guard guard,
+                          Future<T> next) {
+      if (TaskIsRunning()) {
+        // If the task is still cleaning up we need to wait for it to finish before
+        // restarting.  We also want to block the consumer until we've restarted the
+        // reader to avoid multiple restarts
+        return task_finished.Then([state, next]() {
+          // This may appear dangerous (recursive mutex) but we should be guaranteed the
+          // outer guard has been released by this point.  We know...
+          // * task_finished is not already finished (it would be invalid in that case)
+          // * task_finished will not be marked complete until we've given up the mutex
+          auto guard_ = state->mutex.Lock();
+          state->DoRestartTask(state, std::move(guard_));
+          return next;
+        });
+      }
+      // Otherwise we can restart immediately
+      DoRestartTask(std::move(state), std::move(guard));
+      return next;
+    }
+
+    internal::Executor* io_executor;
+    const int max_q;
+    const int q_restart;
+    Iterator<T> it;
+    std::atomic<uint64_t> worker_thread_id{kUnlikelyThreadId};
+
+    // If true, the task is actively pumping items from the queue and does not need a
+    // restart
+    bool reading;
+    // Set to true when a terminal item arrives
+    bool finished;
+    // Signal to the background task to end early because consumers have given up on it
+    bool should_shutdown;
+    // If the queue is empty, the consumer will create a waiting future and wait for it
+    std::queue<Result<T>> queue;
+    std::optional<Future<T>> waiting_future;
+    // Every background task is given a future to complete when it is entirely finished
+    // processing and ready for the next task to start or for State to be destroyed
+    Future<> task_finished;
+    util::Mutex mutex;
+  };
+
+  // Cleanup task that will be run when all consumer references to the generator are lost
+  struct Cleanup {
+    explicit Cleanup(State* state) : state(state) {}
+    ~Cleanup() {
+      /// TODO: Once ARROW-13109 is available then we can be force consumers to spawn and
+      /// there is no need to perform this check.
+      ///
+      /// It's a deadlock if we enter cleanup from
+      /// the worker thread but it can happen if the consumer doesn't transfer away
+      assert(state->worker_thread_id.load() != ::arrow::internal::GetThreadId());
+      Future<> finish_fut;
+      {
+        auto lock = state->mutex.Lock();
+        if (!state->TaskIsRunning()) {
+          return;
+        }
+        // Signal the current task to stop and wait for it to finish
+        state->should_shutdown = true;
+        finish_fut = state->task_finished;
+      }
+      // Using future as a condition variable here
+      Status st = finish_fut.status();
+      ARROW_UNUSED(st);
+    }
+    State* state;
+  };
+
+  static void WorkerTask(std::shared_ptr<State> state) {
+    state->worker_thread_id.store(::arrow::internal::GetThreadId());
+    // We need to capture the state to read while outside the mutex
+    bool reading = true;
+    while (reading) {
+      auto next = state->it.Next();
+      // Need to capture state->waiting_future inside the mutex to mark finished outside
+      Future<T> waiting_future;
+      {
+        auto guard = state->mutex.Lock();
+
+        if (state->should_shutdown) {
+          state->finished = true;
+          break;
+        }
+
+        if (!next.ok() || IsIterationEnd<T>(*next)) {
+          // Terminal item.  Mark finished to true, send this last item, and quit
+          state->finished = true;
+          if (!next.ok()) {
+            state->ClearQueue();
+          }
+        }
+        // At this point we are going to send an item.  Either we will add it to the
+        // queue or deliver it to a waiting future.
+        if (state->waiting_future.has_value()) {
+          waiting_future = std::move(state->waiting_future.value());
+          state->waiting_future.reset();
+        } else {
+          state->queue.push(std::move(next));
+          // We just filled up the queue so it is time to quit.  We may need to notify
+          // a cleanup task so we transition to Quitting
+          if (static_cast<int>(state->queue.size()) >= state->max_q) {
+            state->reading = false;
+          }
+        }
+        reading = state->reading && !state->finished;
+      }
+      // This should happen outside the mutex.  Presumably there is a
+      // transferring generator on the other end that will quickly transfer any
+      // callbacks off of this thread so we can continue looping.  Still, best not to
+      // rely on that
+      if (waiting_future.is_valid()) {
+        waiting_future.MarkFinished(next);
+      }
+    }
+    // Once we've sent our last item we can notify any waiters that we are done and so
+    // either state can be cleaned up or a new background task can be started
+    Future<> task_finished;
+    {
+      auto guard = state->mutex.Lock();
+      // After we give up the mutex state can be safely deleted.  We will no longer
+      // reference it.  We can safely transition to idle now.
+      task_finished = state->task_finished;
+      state->task_finished = Future<>();
+      state->worker_thread_id.store(kUnlikelyThreadId);
+    }
+    task_finished.MarkFinished();
+  }
+
+  std::shared_ptr<State> state_;
+  // state_ is held by both the generator and the background thread so it won't be cleaned
+  // up when all consumer references are relinquished.  cleanup_ is only held by the
+  // generator so it will be destructed when the last consumer reference is gone.  We use
+  // this to cleanup / stop the background generator in case the consuming end stops
+  // listening (e.g. due to a downstream error)
+  std::shared_ptr<Cleanup> cleanup_;
+};
+
+constexpr int kDefaultBackgroundMaxQ = 32;
+constexpr int kDefaultBackgroundQRestart = 16;
+
+/// \brief Create an AsyncGenerator<T> by iterating over an Iterator<T> on a background
+/// thread
+///
+/// The parameter max_q and q_restart control queue size and background thread task
+/// management. If the background task is fast you typically don't want it creating a
+/// thread task for every item.  Instead the background thread will run until it fills
+/// up a readahead queue.
+///
+/// Once the queue has filled up the background thread task will terminate (allowing other
+/// I/O tasks to use the thread).  Once the queue has been drained enough (specified by
+/// q_restart) then the background thread task will be restarted.  If q_restart is too low
+/// then you may exhaust the queue waiting for the background thread task to start running
+/// again.  If it is too high then it will be constantly stopping and restarting the
+/// background queue task
+///
+/// The "background thread" is a logical thread and will run as tasks on the io_executor.
+/// This thread may stop and start when the queue fills up but there will only be one
+/// active background thread task at any given time.  You MUST transfer away from this
+/// background generator.  Otherwise there could be a race condition if a callback on the
+/// background thread deletes the last consumer reference to the background generator. You
+/// can transfer onto the same executor as the background thread, it is only necessary to
+/// create a new thread task, not to switch executors.
+///
+/// This generator is not async-reentrant
+///
+/// This generator will queue up to max_q blocks
+template <typename T>
+static Result<AsyncGenerator<T>> MakeBackgroundGenerator(
+    Iterator<T> iterator, internal::Executor* io_executor,
+    int max_q = kDefaultBackgroundMaxQ, int q_restart = kDefaultBackgroundQRestart) {
+  if (max_q < q_restart) {
+    return Status::Invalid("max_q must be >= q_restart");
+  }
+  return BackgroundGenerator<T>(std::move(iterator), io_executor, max_q, q_restart);
+}
+
+/// \brief Create an AsyncGenerator<T> by iterating over an Iterator<T> synchronously
+///
+/// This should only be used if you know the source iterator does not involve any
+/// I/O (or other blocking calls).  Otherwise a CPU thread will be blocked and, depending
+/// on the complexity of the iterator, it may lead to deadlock.
+///
+/// If you are not certain if there will be I/O then it is better to use
+/// MakeBackgroundGenerator.  If helpful you can think of this as the AsyncGenerator
+/// equivalent of Future::MakeFinished
+///
+/// It is impossible to call this in an async-reentrant manner since the returned
+/// future will be completed by the time it is polled.
+///
+/// This generator does not queue
+template <typename T>
+static Result<AsyncGenerator<T>> MakeBlockingGenerator(
+    std::shared_ptr<Iterator<T>> iterator) {
+  return [it = std::move(iterator)]() mutable -> Future<T> {
+    return Future<T>::MakeFinished(it->Next());
+  };
+}
+
+template <typename T>
+static Result<AsyncGenerator<T>> MakeBlockingGenerator(Iterator<T> iterator) {
+  return MakeBlockingGenerator(std::make_shared<Iterator<T>>(std::move(iterator)));
+}
+
+/// \see MakeGeneratorIterator
+template <typename T>
+class GeneratorIterator {
+ public:
+  explicit GeneratorIterator(AsyncGenerator<T> source) : source_(std::move(source)) {}
+
+  Result<T> Next() { return source_().result(); }
+
+ private:
+  AsyncGenerator<T> source_;
+};
+
+/// \brief Convert an AsyncGenerator<T> to an Iterator<T> which blocks until each future
+/// is finished
+template <typename T>
+Iterator<T> MakeGeneratorIterator(AsyncGenerator<T> source) {
+  return Iterator<T>(GeneratorIterator<T>(std::move(source)));
+}
+
+/// \brief Add readahead to an iterator using a background thread.
+///
+/// Under the hood this is converting the iterator to a generator using
+/// MakeBackgroundGenerator, adding readahead to the converted generator with
+/// MakeReadaheadGenerator, and then converting back to an iterator using
+/// MakeGeneratorIterator.
+template <typename T>
+Result<Iterator<T>> MakeReadaheadIterator(Iterator<T> it, int readahead_queue_size) {
+  ARROW_ASSIGN_OR_RAISE(auto io_executor, internal::ThreadPool::Make(1));
+  auto max_q = readahead_queue_size;
+  auto q_restart = std::max(1, max_q / 2);
+  ARROW_ASSIGN_OR_RAISE(
+      auto background_generator,
+      MakeBackgroundGenerator(std::move(it), io_executor.get(), max_q, q_restart));
+  // Capture io_executor to keep it alive as long as owned_bg_generator is still
+  // referenced
+  AsyncGenerator<T> owned_bg_generator = [io_executor, background_generator]() {
+    return background_generator();
+  };
+  return MakeGeneratorIterator(std::move(owned_bg_generator));
+}
+
+/// \brief Make a generator that returns a single pre-generated future
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeSingleFutureGenerator(Future<T> future) {
+  assert(future.is_valid());
+  auto state = std::make_shared<Future<T>>(std::move(future));
+  return [state]() -> Future<T> {
+    auto fut = std::move(*state);
+    if (fut.is_valid()) {
+      return fut;
+    } else {
+      return AsyncGeneratorEnd<T>();
+    }
+  };
+}
+
+/// \brief Make a generator that immediately ends.
+///
+/// This generator is async-reentrant.
+template <typename T>
+std::function<Future<T>()> MakeEmptyGenerator() {
+  return []() -> Future<T> { return AsyncGeneratorEnd<T>(); };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This generator is async-reentrant.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(Status st) {
+  assert(!st.ok());
+  auto state = std::make_shared<Status>(std::move(st));
+  return [state]() -> Future<T> {
+    auto st = std::move(*state);
+    if (!st.ok()) {
+      return st;
+    } else {
+      return AsyncGeneratorEnd<T>();
+    }
+  };
+}
+
+/// \brief Make a generator that always fails with a given error
+///
+/// This overload allows inferring the return type from the argument.
+template <typename T>
+AsyncGenerator<T> MakeFailingGenerator(const Result<T>& result) {
+  return MakeFailingGenerator<T>(result.status());
+}
+
+/// \brief Prepend initial_values onto a generator
+///
+/// This generator is async-reentrant but will buffer requests and will not
+/// pull from following_values async-reentrantly.
+template <typename T>
+AsyncGenerator<T> MakeGeneratorStartsWith(std::vector<T> initial_values,
+                                          AsyncGenerator<T> following_values) {
+  auto initial_values_vec_gen = MakeVectorGenerator(std::move(initial_values));
+  auto gen_gen = MakeVectorGenerator<AsyncGenerator<T>>(
+      {std::move(initial_values_vec_gen), std::move(following_values)});
+  return MakeConcatenatedGenerator(std::move(gen_gen));
+}
+
+template <typename T>
+struct CancellableGenerator {
+  Future<T> operator()() {
+    if (stop_token.IsStopRequested()) {
+      return stop_token.Poll();
+    }
+    return source();
+  }
+
+  AsyncGenerator<T> source;
+  StopToken stop_token;
+};
+
+/// \brief Allow an async generator to be cancelled
+///
+/// This generator is async-reentrant
+template <typename T>
+AsyncGenerator<T> MakeCancellable(AsyncGenerator<T> source, StopToken stop_token) {
+  return CancellableGenerator<T>{std::move(source), std::move(stop_token)};
+}
+
+template <typename T>
+class DefaultIfEmptyGenerator {
+ public:
+  DefaultIfEmptyGenerator(AsyncGenerator<T> source, T or_value)
+      : state_(std::make_shared<State>(std::move(source), std::move(or_value))) {}
+
+  Future<T> operator()() {
+    if (state_->first) {
+      state_->first = false;
+      struct {
+        T or_value;
+
+        Result<T> operator()(const T& value) {
+          if (IterationTraits<T>::IsEnd(value)) {
+            return std::move(or_value);
+          }
+          return value;
+        }
+      } Continuation;
+      Continuation.or_value = std::move(state_->or_value);
+      return state_->source().Then(std::move(Continuation));
+    }
+    return state_->source();
+  }
+
+ private:
+  struct State {
+    AsyncGenerator<T> source;
+    T or_value;
+    bool first;
+    State(AsyncGenerator<T> source_, T or_value_)
+        : source(std::move(source_)), or_value(std::move(or_value_)), first(true) {}
+  };
+  std::shared_ptr<State> state_;
+};
+
+/// \brief If the generator is empty, return the given value, else
+/// forward the values from the generator.
+///
+/// This generator is async-reentrant.
+template <typename T>
+AsyncGenerator<T> MakeDefaultIfEmptyGenerator(AsyncGenerator<T> source, T or_value) {
+  return DefaultIfEmptyGenerator<T>(std::move(source), std::move(or_value));
+}
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/async_generator_fwd.h b/pyarrow/include/arrow/util/async_generator_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..59c4276f4524cf5e124fe0bd2266579b5c28d06b
--- /dev/null
+++ b/pyarrow/include/arrow/util/async_generator_fwd.h
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/type_fwd.h"
+
+namespace arrow {
+
+template <typename T>
+using AsyncGenerator = std::function<Future<T>()>;
+
+template <typename T, typename V>
+class MappingGenerator;
+
+template <typename T, typename ComesAfter, typename IsNext>
+class SequencingGenerator;
+
+template <typename T, typename V>
+class TransformingGenerator;
+
+template <typename T>
+class SerialReadaheadGenerator;
+
+template <typename T>
+class ReadaheadGenerator;
+
+template <typename T>
+class PushGenerator;
+
+template <typename T>
+class MergedGenerator;
+
+template <typename T>
+class EnumeratingGenerator;
+
+template <typename T>
+class TransferringGenerator;
+
+template <typename T>
+class BackgroundGenerator;
+
+template <typename T>
+class GeneratorIterator;
+
+template <typename T>
+struct CancellableGenerator;
+
+template <typename T>
+class DefaultIfEmptyGenerator;
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/async_util.h b/pyarrow/include/arrow/util/async_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9ed63bdbce2260e6c717769a5f91dfe1cca9f89
--- /dev/null
+++ b/pyarrow/include/arrow/util/async_util.h
@@ -0,0 +1,460 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <functional>
+#include <list>
+#include <memory>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/mutex.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/tracing.h"
+
+namespace arrow {
+
+using internal::FnOnce;
+
+namespace util {
+
+/// A utility which keeps tracks of, and schedules, asynchronous tasks
+///
+/// An asynchronous task has a synchronous component and an asynchronous component.
+/// The synchronous component typically schedules some kind of work on an external
+/// resource (e.g. the I/O thread pool or some kind of kernel-based asynchronous
+/// resource like io_uring).  The asynchronous part represents the work
+/// done on that external resource.  Executing the synchronous part will be referred
+/// to as "submitting the task" since this usually includes submitting the asynchronous
+/// portion to the external thread pool.
+///
+/// By default the scheduler will submit the task (execute the synchronous part) as
+/// soon as it is added, assuming the underlying thread pool hasn't terminated or the
+/// scheduler hasn't aborted.  In this mode, the scheduler is simply acting as
+/// a simple task group.
+///
+/// A task scheduler starts with an initial task.  That task, and all subsequent tasks
+/// are free to add subtasks.  Once all submitted tasks finish the scheduler will
+/// finish.  Note, it is not an error to add additional tasks after a scheduler has
+/// aborted. These tasks will be ignored and never submitted.  The scheduler returns a
+/// future which will complete when all submitted tasks have finished executing.  Once all
+/// tasks have been finished the scheduler is invalid and should no longer be used.
+///
+/// Task failure (either the synchronous portion or the asynchronous portion) will cause
+/// the scheduler to enter an aborted state.  The first such failure will be reported in
+/// the final task future.
+class ARROW_EXPORT AsyncTaskScheduler {
+ public:
+  /// Destructor for AsyncTaskScheduler
+  ///
+  /// The lifetime of the task scheduled is managed automatically.  The scheduler
+  /// will remain valid while any tasks are running (and can always be safely accessed)
+  /// within tasks) and will be destroyed as soon as all tasks have finished.
+  virtual ~AsyncTaskScheduler() = default;
+  /// An interface for a task
+  ///
+  /// Users may want to override this, for example, to add priority
+  /// information for use by a queue.
+  class Task {
+   public:
+    virtual ~Task() = default;
+    /// Submit the task
+    ///
+    /// This will be called by the scheduler at most once when there
+    /// is space to run the task.  This is expected to be a fairly quick
+    /// function that simply submits the actual task work to an external
+    /// resource (e.g. I/O thread pool).
+    ///
+    /// If this call fails then the scheduler will enter an aborted state.
+    virtual Result<Future<>> operator()() = 0;
+    /// The cost of the task
+    ///
+    /// A ThrottledAsyncTaskScheduler can be used to limit the number of concurrent tasks.
+    /// A custom cost may be used, for example, if you would like to limit the number of
+    /// tasks based on the total expected RAM usage of the tasks (this is done in the
+    /// scanner)
+    virtual int cost() const { return 1; }
+    /// The name of the task
+    ///
+    /// This is used for debugging and traceability.  The returned view must remain
+    /// valid for the lifetime of the task.
+    virtual std::string_view name() const = 0;
+
+    /// a span tied to the lifetime of the task, for internal use only
+    tracing::Span span;
+  };
+
+  /// Add a task to the scheduler
+  ///
+  /// If the scheduler is in an aborted state this call will return false and the task
+  /// will never be run.  This is harmless and does not need to be guarded against.
+  ///
+  /// The return value for this call can usually be ignored.  There is little harm in
+  /// attempting to add tasks to an aborted scheduler.  It is only included for callers
+  /// that want to avoid future task generation to save effort.
+  ///
+  /// \param task the task to submit
+  ///
+  /// A task's name must remain valid for the duration of the task.  It is used for
+  /// debugging (e.g. when debugging a deadlock to see which tasks still remain) and for
+  /// traceability (the name will be used for spans assigned to the task)
+  ///
+  /// \return true if the task was submitted or queued, false if the task was ignored
+  virtual bool AddTask(std::unique_ptr<Task> task) = 0;
+
+  /// Adds an async generator to the scheduler
+  ///
+  /// The async generator will be visited, one item at a time.  Submitting a task
+  /// will consist of polling the generator for the next future.  The generator's future
+  /// will then represent the task itself.
+  ///
+  /// This visits the task serially without readahead.  If readahead or parallelism
+  /// is desired then it should be added in the generator itself.
+  ///
+  /// The generator itself will be kept alive until all tasks have been completed.
+  /// However, if the scheduler is aborted, the generator will be destroyed as soon as the
+  /// next item would be requested.
+  ///
+  /// \param generator the generator to submit to the scheduler
+  /// \param visitor a function which visits each generator future as it completes
+  /// \param name a name which will be used for each submitted task
+  template <typename T>
+  bool AddAsyncGenerator(std::function<Future<T>()> generator,
+                         std::function<Status(const T&)> visitor, std::string_view name);
+
+  template <typename Callable>
+  struct SimpleTask : public Task {
+    SimpleTask(Callable callable, std::string_view name)
+        : callable(std::move(callable)), name_(name) {}
+    SimpleTask(Callable callable, std::string name)
+        : callable(std::move(callable)), owned_name_(std::move(name)) {
+      name_ = *owned_name_;
+    }
+    Result<Future<>> operator()() override { return callable(); }
+    std::string_view name() const override { return name_; }
+    Callable callable;
+    std::string_view name_;
+    std::optional<std::string> owned_name_;
+  };
+
+  /// Add a task with cost 1 to the scheduler
+  ///
+  /// \param callable a "submit" function that should return a future
+  /// \param name a name for the task
+  ///
+  /// `name` must remain valid until the task has been submitted AND the returned
+  /// future completes.  It is used for debugging and tracing.
+  ///
+  /// \see AddTask for more details
+  template <typename Callable>
+  bool AddSimpleTask(Callable callable, std::string_view name) {
+    return AddTask(std::make_unique<SimpleTask<Callable>>(std::move(callable), name));
+  }
+
+  /// Add a task with cost 1 to the scheduler
+  ///
+  /// This is an overload of \see AddSimpleTask that keeps `name` alive
+  /// in the task.
+  template <typename Callable>
+  bool AddSimpleTask(Callable callable, std::string name) {
+    return AddTask(
+        std::make_unique<SimpleTask<Callable>>(std::move(callable), std::move(name)));
+  }
+
+  /// Construct a scheduler
+  ///
+  /// \param initial_task The initial task which is responsible for adding
+  ///        the first subtasks to the scheduler.
+  /// \param abort_callback A callback that will be triggered immediately after a task
+  ///        fails while other tasks may still be running.  Nothing needs to be done here,
+  ///        when a task fails the scheduler will stop accepting new tasks and eventually
+  ///        return the error.  However, this callback can be used to more quickly end
+  ///        long running tasks that have already been submitted.  Defaults to doing
+  ///        nothing.
+  /// \param stop_token An optional stop token that will allow cancellation of the
+  ///        scheduler.  This will be checked before each task is submitted and, in the
+  ///        event of a cancellation, the scheduler will enter an aborted state. This is
+  ///        a graceful cancellation and submitted tasks will still complete.
+  /// \return A future that will be completed when the initial task and all subtasks have
+  ///         finished.
+  static Future<> Make(
+      FnOnce<Status(AsyncTaskScheduler*)> initial_task,
+      FnOnce<void(const Status&)> abort_callback = [](const Status&) {},
+      StopToken stop_token = StopToken::Unstoppable());
+
+  /// A span tracking execution of the scheduler's tasks, for internal use only
+  virtual const tracing::Span& span() const = 0;
+};
+
+class ARROW_EXPORT ThrottledAsyncTaskScheduler : public AsyncTaskScheduler {
+ public:
+  /// An interface for a task queue
+  ///
+  /// A queue's methods will not be called concurrently
+  class Queue {
+   public:
+    virtual ~Queue() = default;
+    /// Push a task to the queue
+    ///
+    /// \param task the task to enqueue
+    virtual void Push(std::unique_ptr<Task> task) = 0;
+    /// Pop the next task from the queue
+    virtual std::unique_ptr<Task> Pop() = 0;
+    /// Peek the next task in the queue
+    virtual const Task& Peek() = 0;
+    /// Check if the queue is empty
+    virtual bool Empty() = 0;
+    /// Purge the queue of all items
+    virtual void Purge() = 0;
+    virtual std::size_t Size() const = 0;
+  };
+
+  class Throttle {
+   public:
+    virtual ~Throttle() = default;
+    /// Acquire amt permits
+    ///
+    /// If nullopt is returned then the permits were immediately
+    /// acquired and the caller can proceed.  If a future is returned then the caller
+    /// should wait for the future to complete first.  When the returned future completes
+    /// the permits have NOT been acquired and the caller must call Acquire again
+    ///
+    /// \param amt the number of permits to acquire
+    virtual std::optional<Future<>> TryAcquire(int amt) = 0;
+    /// Release amt permits
+    ///
+    /// This will possibly complete waiting futures and should probably not be
+    /// called while holding locks.
+    ///
+    /// \param amt the number of permits to release
+    virtual void Release(int amt) = 0;
+
+    /// The size of the largest task that can run
+    ///
+    /// Incoming tasks will have their cost latched to this value to ensure
+    /// they can still run (although they will be the only thing allowed to
+    /// run at that time).
+    virtual int Capacity() = 0;
+
+    /// Pause the throttle
+    ///
+    /// Any tasks that have been submitted already will continue.  However, no new tasks
+    /// will be run until the throttle is resumed.
+    virtual void Pause() = 0;
+    /// Resume the throttle
+    ///
+    /// Allows task to be submitted again.  If there is a max_concurrent_cost limit then
+    /// it will still apply.
+    virtual void Resume() = 0;
+  };
+
+  /// Pause the throttle
+  ///
+  /// Any tasks that have been submitted already will continue.  However, no new tasks
+  /// will be run until the throttle is resumed.
+  virtual void Pause() = 0;
+  /// Resume the throttle
+  ///
+  /// Allows task to be submitted again.  If there is a max_concurrent_cost limit then
+  /// it will still apply.
+  virtual void Resume() = 0;
+  /// Return the number of tasks queued but not yet submitted
+  virtual std::size_t QueueSize() = 0;
+
+  /// Create a throttled view of a scheduler
+  ///
+  /// Tasks added via this view will be subjected to the throttle and, if the tasks cannot
+  /// run immediately, will be placed into a queue.
+  ///
+  /// Although a shared_ptr is returned it should generally be assumed that the caller
+  /// is being given exclusive ownership.  The shared_ptr is used to share the view with
+  /// queued and submitted tasks and the lifetime of those is unpredictable.  It is
+  /// important the caller keep the returned pointer alive for as long as they plan to add
+  /// tasks to the view.
+  ///
+  /// \param scheduler a scheduler to submit tasks to after throttling
+  ///
+  /// This can be the root scheduler, another throttled scheduler, or a task group.  These
+  /// are all composable.
+  ///
+  /// \param max_concurrent_cost the maximum amount of cost allowed to run at any one time
+  ///
+  /// If a task is added that has a cost greater than max_concurrent_cost then its cost
+  /// will be reduced to max_concurrent_cost so that it is still possible for the task to
+  /// run.
+  ///
+  /// \param queue the queue to use when tasks cannot be submitted
+  ///
+  /// By default a FIFO queue will be used.  However, a custom queue can be provided if
+  /// some tasks have higher priority than other tasks.
+  static std::shared_ptr<ThrottledAsyncTaskScheduler> Make(
+      AsyncTaskScheduler* scheduler, int max_concurrent_cost,
+      std::unique_ptr<Queue> queue = NULLPTR);
+
+  /// @brief Create a ThrottledAsyncTaskScheduler using a custom throttle
+  ///
+  /// \see Make
+  static std::shared_ptr<ThrottledAsyncTaskScheduler> MakeWithCustomThrottle(
+      AsyncTaskScheduler* scheduler, std::unique_ptr<Throttle> throttle,
+      std::unique_ptr<Queue> queue = NULLPTR);
+};
+
+/// A utility to keep track of a collection of tasks
+///
+/// Often it is useful to keep track of some state that only needs to stay alive
+/// for some small collection of tasks, or to perform some kind of final cleanup
+/// when a collection of tasks is finished.
+///
+/// For example, when scanning, we need to keep the file reader alive while all scan
+/// tasks run for a given file, and then we can gracefully close it when we finish the
+/// file.
+class ARROW_EXPORT AsyncTaskGroup : public AsyncTaskScheduler {
+ public:
+  /// Destructor for the task group
+  ///
+  /// The destructor might trigger the finish callback.  If the finish callback fails
+  /// then the error will be reported as a task on the scheduler.
+  ///
+  /// Failure to destroy the async task group will not prevent the scheduler from
+  /// finishing.  If the scheduler finishes before the async task group is done then
+  /// the finish callback will be run immediately when the async task group finishes.
+  ///
+  /// If the scheduler has aborted then the finish callback will not run.
+  ~AsyncTaskGroup() = default;
+  /// Create an async task group
+  ///
+  /// The finish callback will not run until the task group is destroyed and all
+  /// tasks are finished so you will generally want to reset / destroy the returned
+  /// unique_ptr at some point.
+  ///
+  /// \param scheduler The underlying scheduler to submit tasks to
+  /// \param finish_callback A callback that will be run only after the task group has
+  ///                        been destroyed and all tasks added by the group have
+  ///                        finished.
+  ///
+  /// Note: in error scenarios the finish callback may not run.  However, it will still,
+  /// of course, be destroyed.
+  static std::unique_ptr<AsyncTaskGroup> Make(AsyncTaskScheduler* scheduler,
+                                              FnOnce<Status()> finish_callback);
+};
+
+/// Create a task group that is also throttled
+///
+/// This is a utility factory that creates a throttled view of a scheduler and then
+/// wraps that throttled view with a task group that destroys the throttle when finished.
+///
+/// \see ThrottledAsyncTaskScheduler
+/// \see AsyncTaskGroup
+/// \param target the underlying scheduler to submit tasks to
+/// \param max_concurrent_cost the maximum amount of cost allowed to run at any one time
+/// \param queue the queue to use when tasks cannot be submitted
+/// \param finish_callback A callback that will be run only after the task group has
+///                  been destroyed and all tasks added by the group have finished
+ARROW_EXPORT std::unique_ptr<ThrottledAsyncTaskScheduler> MakeThrottledAsyncTaskGroup(
+    AsyncTaskScheduler* target, int max_concurrent_cost,
+    std::unique_ptr<ThrottledAsyncTaskScheduler::Queue> queue,
+    FnOnce<Status()> finish_callback);
+
+// Defined down here to avoid circular dependency between AsyncTaskScheduler and
+// AsyncTaskGroup
+template <typename T>
+bool AsyncTaskScheduler::AddAsyncGenerator(std::function<Future<T>()> generator,
+                                           std::function<Status(const T&)> visitor,
+                                           std::string_view name) {
+  struct State {
+    State(std::function<Future<T>()> generator, std::function<Status(const T&)> visitor,
+          std::unique_ptr<AsyncTaskGroup> task_group, std::string_view name)
+        : generator(std::move(generator)),
+          visitor(std::move(visitor)),
+          task_group(std::move(task_group)),
+          name(name) {}
+    std::function<Future<T>()> generator;
+    std::function<Status(const T&)> visitor;
+    std::unique_ptr<AsyncTaskGroup> task_group;
+    std::string_view name;
+  };
+  struct SubmitTask : public Task {
+    explicit SubmitTask(std::unique_ptr<State> state_holder)
+        : state_holder(std::move(state_holder)) {}
+
+    struct SubmitTaskCallback {
+      SubmitTaskCallback(std::unique_ptr<State> state_holder, Future<> task_completion)
+          : state_holder(std::move(state_holder)),
+            task_completion(std::move(task_completion)) {}
+      void operator()(const Result<T>& maybe_item) {
+        if (!maybe_item.ok()) {
+          task_completion.MarkFinished(maybe_item.status());
+          return;
+        }
+        const auto& item = *maybe_item;
+        if (IsIterationEnd(item)) {
+          task_completion.MarkFinished();
+          return;
+        }
+        Status visit_st = state_holder->visitor(item);
+        if (!visit_st.ok()) {
+          task_completion.MarkFinished(std::move(visit_st));
+          return;
+        }
+        state_holder->task_group->AddTask(
+            std::make_unique<SubmitTask>(std::move(state_holder)));
+        task_completion.MarkFinished();
+      }
+      std::unique_ptr<State> state_holder;
+      Future<> task_completion;
+    };
+
+    Result<Future<>> operator()() {
+      Future<> task = Future<>::Make();
+      // Consume as many items as we can (those that are already finished)
+      // synchronously to avoid recursion / stack overflow.
+      while (true) {
+        Future<T> next = state_holder->generator();
+        if (next.TryAddCallback(
+                [&] { return SubmitTaskCallback(std::move(state_holder), task); })) {
+          return task;
+        }
+        ARROW_ASSIGN_OR_RAISE(T item, next.result());
+        if (IsIterationEnd(item)) {
+          task.MarkFinished();
+          return task;
+        }
+        ARROW_RETURN_NOT_OK(state_holder->visitor(item));
+      }
+    }
+
+    std::string_view name() const { return state_holder->name; }
+
+    std::unique_ptr<State> state_holder;
+  };
+  std::unique_ptr<AsyncTaskGroup> task_group =
+      AsyncTaskGroup::Make(this, [] { return Status::OK(); });
+  AsyncTaskGroup* task_group_view = task_group.get();
+  std::unique_ptr<State> state_holder = std::make_unique<State>(
+      std::move(generator), std::move(visitor), std::move(task_group), name);
+  task_group_view->AddTask(std::make_unique<SubmitTask>(std::move(state_holder)));
+  return true;
+}
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/base64.h b/pyarrow/include/arrow/util/base64.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b80e19d896b746ccc4318bb2f8ce250c7892e66
--- /dev/null
+++ b/pyarrow/include/arrow/util/base64.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+ARROW_EXPORT
+std::string base64_encode(std::string_view s);
+
+ARROW_EXPORT
+std::string base64_decode(std::string_view s);
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/basic_decimal.h b/pyarrow/include/arrow/util/basic_decimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..638c4870f1ded4567f9b77f695ac8c8f67db85b5
--- /dev/null
+++ b/pyarrow/include/arrow/util/basic_decimal.h
@@ -0,0 +1,887 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <climits>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <string>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_traits.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+enum class DecimalStatus {
+  kSuccess,
+  kDivideByZero,
+  kOverflow,
+  kRescaleDataLoss,
+};
+
+template <typename Derived, int BIT_WIDTH, int NWORDS = BIT_WIDTH / 64>
+class GenericBasicDecimal {
+ protected:
+  struct LittleEndianArrayTag {};
+
+#if ARROW_LITTLE_ENDIAN
+  static constexpr int kHighWordIndex = NWORDS - 1;
+  static constexpr int kLowWordIndex = 0;
+#else
+  static constexpr int kHighWordIndex = 0;
+  static constexpr int kLowWordIndex = NWORDS - 1;
+#endif
+
+ public:
+  static constexpr int kBitWidth = BIT_WIDTH;
+  static constexpr int kByteWidth = kBitWidth / 8;
+  static constexpr int kNumWords = NWORDS;
+
+  // A constructor tag to introduce a little-endian encoded array
+  static constexpr LittleEndianArrayTag LittleEndianArray{};
+
+  using WordArray = std::array<uint64_t, NWORDS>;
+
+  /// \brief Empty constructor creates a decimal with a value of 0.
+  constexpr GenericBasicDecimal() noexcept : array_({0}) {}
+
+  /// \brief Create a decimal from the two's complement representation.
+  ///
+  /// Input array is assumed to be in native endianness.
+  explicit constexpr GenericBasicDecimal(const WordArray& array) noexcept
+      : array_(array) {}
+
+  /// \brief Create a decimal from the two's complement representation.
+  ///
+  /// Input array is assumed to be in little endianness, with native endian elements.
+  GenericBasicDecimal(LittleEndianArrayTag, const WordArray& array) noexcept
+      : GenericBasicDecimal(bit_util::little_endian::ToNative(array)) {}
+
+  /// \brief Create a decimal from any integer not wider than 64 bits.
+  template <typename T,
+            typename = typename std::enable_if<
+                std::is_integral<T>::value && (sizeof(T) <= sizeof(uint64_t)), T>::type>
+  constexpr GenericBasicDecimal(T value) noexcept  // NOLINT(runtime/explicit)
+      : array_(WordsFromLowBits(value)) {}
+
+  /// \brief Create a decimal from an array of bytes.
+  ///
+  /// Bytes are assumed to be in native-endian byte order.
+  explicit GenericBasicDecimal(const uint8_t* bytes) {
+    memcpy(array_.data(), bytes, sizeof(array_));
+  }
+
+  /// \brief Get the bits of the two's complement representation of the number.
+  ///
+  /// The elements are in native endian order. The bits within each uint64_t element
+  /// are in native endian order. For example, on a little endian machine,
+  /// BasicDecimal128(123).native_endian_array() = {123, 0};
+  /// but on a big endian machine,
+  /// BasicDecimal128(123).native_endian_array() = {0, 123};
+  constexpr const WordArray& native_endian_array() const { return array_; }
+
+  /// \brief Get the bits of the two's complement representation of the number.
+  ///
+  /// The elements are in little endian order. However, the bits within each
+  /// uint64_t element are in native endian order.
+  /// For example, BasicDecimal128(123).little_endian_array() = {123, 0};
+  WordArray little_endian_array() const {
+    return bit_util::little_endian::FromNative(array_);
+  }
+
+  const uint8_t* native_endian_bytes() const {
+    return reinterpret_cast<const uint8_t*>(array_.data());
+  }
+
+  uint8_t* mutable_native_endian_bytes() {
+    return reinterpret_cast<uint8_t*>(array_.data());
+  }
+
+  /// \brief Return the raw bytes of the value in native-endian byte order.
+  std::array<uint8_t, kByteWidth> ToBytes() const {
+    std::array<uint8_t, kByteWidth> out{{0}};
+    memcpy(out.data(), array_.data(), kByteWidth);
+    return out;
+  }
+
+  /// \brief Copy the raw bytes of the value in native-endian byte order.
+  void ToBytes(uint8_t* out) const { memcpy(out, array_.data(), kByteWidth); }
+
+  /// Return 1 if positive or zero, -1 if strictly negative.
+  int64_t Sign() const {
+    return 1 | (static_cast<int64_t>(array_[kHighWordIndex]) >> 63);
+  }
+
+  bool IsNegative() const { return static_cast<int64_t>(array_[kHighWordIndex]) < 0; }
+
+  explicit operator bool() const { return array_ != WordArray{}; }
+
+  friend bool operator==(const GenericBasicDecimal& left,
+                         const GenericBasicDecimal& right) {
+    return left.array_ == right.array_;
+  }
+
+  friend bool operator!=(const GenericBasicDecimal& left,
+                         const GenericBasicDecimal& right) {
+    return left.array_ != right.array_;
+  }
+
+ protected:
+  WordArray array_;
+
+  template <typename T>
+  static constexpr uint64_t SignExtend(T low_bits) noexcept {
+    return low_bits >= T{} ? uint64_t{0} : ~uint64_t{0};
+  }
+
+  template <typename T>
+  static constexpr WordArray WordsFromLowBits(T low_bits) {
+    WordArray words{};
+    if (low_bits < T{}) {
+      for (auto& word : words) {
+        word = ~uint64_t{0};
+      }
+    }
+    words[kLowWordIndex] = static_cast<uint64_t>(low_bits);
+    return words;
+  }
+};
+
+template <typename DigitType>
+class ARROW_EXPORT SmallBasicDecimal {
+ public:
+  static_assert(
+      std::is_same_v<DigitType, int32_t> || std::is_same_v<DigitType, int64_t>,
+      "for bitwidths larger than 64 bits use BasicDecimal128 and BasicDecimal256");
+
+  static constexpr int kMaxPrecision = std::numeric_limits<DigitType>::digits10;
+  static constexpr int kMaxScale = kMaxPrecision;
+  static constexpr int kBitWidth = sizeof(DigitType) * CHAR_BIT;
+  static constexpr int kByteWidth = sizeof(DigitType);
+
+  using WordArray = std::array<std::make_unsigned_t<DigitType>, 1>;
+
+  /// \brief Empty constructor creates a decimal with a value of 0.
+  constexpr SmallBasicDecimal() noexcept : value_(0) {}
+
+  /// \brief Create a decimal from any integer not wider than 64 bits.
+  template <typename T,
+            typename = typename std::enable_if<
+                std::is_integral<T>::value && (sizeof(T) <= sizeof(int64_t)), T>::type>
+  constexpr SmallBasicDecimal(T value) noexcept  // NOLINT(runtime/explicit)
+      : value_(static_cast<DigitType>(value)) {}
+
+  /// \brief Create a decimal from an array of bytes.
+  ///
+  /// Bytes are assumed to be in native-endian byte order.
+  explicit SmallBasicDecimal(const uint8_t* bytes) {
+    memcpy(&value_, bytes, sizeof(value_));
+  }
+
+  constexpr const WordArray native_endian_array() const {
+    return WordArray{static_cast<typename WordArray::value_type>(value_)};
+  }
+
+  constexpr const WordArray little_endian_array() const {
+    return bit_util::little_endian::FromNative(
+        WordArray{static_cast<typename WordArray::value_type>(value_)});
+  }
+
+  const uint8_t* native_endian_bytes() const {
+    return reinterpret_cast<const uint8_t*>(&value_);
+  }
+
+  uint8_t* mutable_native_endian_bytes() { return reinterpret_cast<uint8_t*>(&value_); }
+
+  /// \brief Return the raw bytes of the value in native-endian byte order.
+  std::array<uint8_t, kByteWidth> ToBytes() const {
+    std::array<uint8_t, kByteWidth> out{{0}};
+    memcpy(out.data(), &value_, kByteWidth);
+    return out;
+  }
+
+  /// \brief Copy the raw bytes of the value in native-endian byte order
+  void ToBytes(uint8_t* out) const { memcpy(out, &value_, kByteWidth); }
+
+  /// \brief Return 1 if positive or 0, -1 if strictly negative
+  int64_t Sign() const { return 1 | (value_ >> (kBitWidth - 1)); }
+
+  bool IsNegative() const { return value_ < 0; }
+
+  explicit operator bool() const { return value_ != 0; }
+
+  friend bool operator==(const SmallBasicDecimal& left, const SmallBasicDecimal& right) {
+    return left.value_ == right.value_;
+  }
+
+  friend bool operator!=(const SmallBasicDecimal& left, const SmallBasicDecimal& right) {
+    return left.value_ != right.value_;
+  }
+
+  DigitType value() const { return value_; }
+
+  /// \brief count the number of leading binary zeroes.
+  int32_t CountLeadingBinaryZeros() const;
+
+  constexpr uint64_t low_bits() const { return static_cast<uint64_t>(value_); }
+
+ protected:
+  DigitType value_;
+};
+
+class BasicDecimal32;
+class BasicDecimal64;
+
+ARROW_EXPORT bool operator<(const BasicDecimal32& left, const BasicDecimal32& right);
+ARROW_EXPORT bool operator<=(const BasicDecimal32& left, const BasicDecimal32& right);
+ARROW_EXPORT bool operator>(const BasicDecimal32& left, const BasicDecimal32& right);
+ARROW_EXPORT bool operator>=(const BasicDecimal32& left, const BasicDecimal32& right);
+
+ARROW_EXPORT BasicDecimal32 operator-(const BasicDecimal32& self);
+ARROW_EXPORT BasicDecimal32 operator~(const BasicDecimal32& self);
+ARROW_EXPORT BasicDecimal32 operator+(const BasicDecimal32& left,
+                                      const BasicDecimal32& right);
+ARROW_EXPORT BasicDecimal32 operator-(const BasicDecimal32& left,
+                                      const BasicDecimal32& right);
+ARROW_EXPORT BasicDecimal32 operator*(const BasicDecimal32& left,
+                                      const BasicDecimal32& right);
+ARROW_EXPORT BasicDecimal32 operator/(const BasicDecimal32& left,
+                                      const BasicDecimal32& right);
+ARROW_EXPORT BasicDecimal32 operator%(const BasicDecimal32& left,
+                                      const BasicDecimal32& right);
+
+class ARROW_EXPORT BasicDecimal32 : public SmallBasicDecimal<int32_t> {
+ public:
+  using SmallBasicDecimal<int32_t>::SmallBasicDecimal;
+  using ValueType = int32_t;
+
+  /// \brief Negate the current value (in-place)
+  BasicDecimal32& Negate();
+
+  /// \brief Absolute value (in-place)
+  BasicDecimal32& Abs() { return *this < 0 ? Negate() : *this; }
+
+  /// \brief Absolute value
+  static BasicDecimal32 Abs(const BasicDecimal32& in) {
+    BasicDecimal32 result(in);
+    return result.Abs();
+  }
+
+  /// \brief Add a number to this one. The result is truncated to 32 bits.
+  BasicDecimal32& operator+=(const BasicDecimal32& right) {
+    value_ += right.value_;
+    return *this;
+  }
+
+  /// \brief Subtract a number from this one. The result is truncated to 32 bits.
+  BasicDecimal32& operator-=(const BasicDecimal32& right) {
+    value_ -= right.value_;
+    return *this;
+  }
+
+  /// \brief Multiply this number by another. The result is truncated to 32 bits.
+  BasicDecimal32& operator*=(const BasicDecimal32& right) {
+    value_ *= static_cast<uint64_t>(right.value_);
+    return *this;
+  }
+
+  /// \brief Divide this number by the divisor and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \param[out] result the quotient
+  /// \param[out] remainder the remainder after the division
+  DecimalStatus Divide(const BasicDecimal32& divisor, BasicDecimal32* result,
+                       BasicDecimal32* remainder) const;
+
+  /// \brief In-place division
+  BasicDecimal32& operator/=(const BasicDecimal32& right) {
+    value_ /= right.value_;
+    return *this;
+  }
+
+  /// \brief Bitwise "or" between two BasicDecimal32s
+  BasicDecimal32& operator|=(const BasicDecimal32& right) {
+    value_ |= right.value_;
+    return *this;
+  }
+
+  /// \brief Bitwise "and" between two BasicDecimal32s
+  BasicDecimal32& operator&=(const BasicDecimal32& right) {
+    value_ &= right.value_;
+    return *this;
+  }
+  /// \brief Shift left by the given number of bits.
+  BasicDecimal32& operator<<=(uint32_t bits);
+
+  BasicDecimal32 operator<<(uint32_t bits) const {
+    auto res = *this;
+    res <<= bits;
+    return res;
+  }
+
+  /// \brief Shift right by the given number of bits.
+  ///
+  /// Negative values will sign-extend
+  BasicDecimal32& operator>>=(uint32_t bits);
+
+  BasicDecimal32 operator>>(uint32_t bits) const {
+    auto res = *this;
+    res >>= bits;
+    return res;
+  }
+
+  /// \brief Convert BasicDecimal32 from one scale to another
+  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+                        BasicDecimal32* out) const;
+
+  void GetWholeAndFraction(int scale, BasicDecimal32* whole,
+                           BasicDecimal32* fraction) const;
+
+  /// \brief Scale up.
+  BasicDecimal32 IncreaseScaleBy(int32_t increase_by) const;
+
+  /// \brief Scale down.
+  ///
+  /// - If 'round' is true, the right-most digits are dropped and the result value is
+  ///   rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
+  ///   (>= 10^reduce_by / 2).
+  /// - If 'round' is false, the right-most digits are simply dropped.
+  BasicDecimal32 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+  /// \brief Whether this number fits in the given precision
+  ///
+  /// Return true if the number of significant digits is less or equal to 'precision'.
+  bool FitsInPrecision(int32_t precision) const;
+
+  /// \brief Get the maximum valid unscaled decimal value.
+  static const BasicDecimal32& GetMaxValue();
+  /// \brief Get the maximum valid unscaled decimal value for the given precision.
+  static BasicDecimal32 GetMaxValue(int32_t precision);
+
+  /// \brief Get the maximum decimal value (is not a valid value).
+  static constexpr BasicDecimal32 GetMaxSentinel() {
+    return BasicDecimal32(std::numeric_limits<int32_t>::max());
+  }
+
+  /// \brief Get the minimum decimal value (is not a valid value).
+  static constexpr BasicDecimal32 GetMinSentinel() {
+    return BasicDecimal32(std::numeric_limits<int32_t>::min());
+  }
+
+  /// \brief Scale multiplier for a given scale value.
+  static const BasicDecimal32& GetScaleMultiplier(int32_t scale);
+  /// \brief Half-scale multiplier for a given scale value.
+  static const BasicDecimal32& GetHalfScaleMultiplier(int32_t scale);
+
+  explicit operator BasicDecimal64() const;
+};
+
+ARROW_EXPORT bool operator<(const BasicDecimal64& left, const BasicDecimal64& right);
+ARROW_EXPORT bool operator<=(const BasicDecimal64& left, const BasicDecimal64& right);
+ARROW_EXPORT bool operator>(const BasicDecimal64& left, const BasicDecimal64& right);
+ARROW_EXPORT bool operator>=(const BasicDecimal64& left, const BasicDecimal64& right);
+
+ARROW_EXPORT BasicDecimal64 operator-(const BasicDecimal64& self);
+ARROW_EXPORT BasicDecimal64 operator~(const BasicDecimal64& self);
+ARROW_EXPORT BasicDecimal64 operator+(const BasicDecimal64& left,
+                                      const BasicDecimal64& right);
+ARROW_EXPORT BasicDecimal64 operator-(const BasicDecimal64& left,
+                                      const BasicDecimal64& right);
+ARROW_EXPORT BasicDecimal64 operator*(const BasicDecimal64& left,
+                                      const BasicDecimal64& right);
+ARROW_EXPORT BasicDecimal64 operator/(const BasicDecimal64& left,
+                                      const BasicDecimal64& right);
+ARROW_EXPORT BasicDecimal64 operator%(const BasicDecimal64& left,
+                                      const BasicDecimal64& right);
+
+class ARROW_EXPORT BasicDecimal64 : public SmallBasicDecimal<int64_t> {
+ public:
+  using SmallBasicDecimal<int64_t>::SmallBasicDecimal;
+  using ValueType = int64_t;
+
+  /// \brief Negate the current value (in-place)
+  BasicDecimal64& Negate();
+
+  /// \brief Absolute value (in-place)
+  BasicDecimal64& Abs() { return *this < 0 ? Negate() : *this; }
+
+  /// \brief Absolute value
+  static BasicDecimal64 Abs(const BasicDecimal64& in) {
+    BasicDecimal64 result(in);
+    return result.Abs();
+  }
+
+  /// \brief Add a number to this one. The result is truncated to 32 bits.
+  BasicDecimal64& operator+=(const BasicDecimal64& right) {
+    value_ += right.value_;
+    return *this;
+  }
+
+  /// \brief Subtract a number from this one. The result is truncated to 32 bits.
+  BasicDecimal64& operator-=(const BasicDecimal64& right) {
+    value_ -= right.value_;
+    return *this;
+  }
+
+  /// \brief Multiply this number by another. The result is truncated to 32 bits.
+  BasicDecimal64& operator*=(const BasicDecimal64& right) {
+    value_ *= static_cast<uint64_t>(right.value_);
+    return *this;
+  }
+
+  /// \brief Divide this number by the divisor and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \param[out] result the quotient
+  /// \param[out] remainder the remainder after the division
+  DecimalStatus Divide(const BasicDecimal64& divisor, BasicDecimal64* result,
+                       BasicDecimal64* remainder) const;
+
+  /// \brief In-place division
+  BasicDecimal64& operator/=(const BasicDecimal64& right) {
+    value_ /= right.value_;
+    return *this;
+  }
+
+  /// \brief Bitwise "or" between two BasicDecimal64s
+  BasicDecimal64& operator|=(const BasicDecimal64& right) {
+    value_ |= right.value_;
+    return *this;
+  }
+
+  /// \brief Bitwise "and" between two BasicDecimal64s
+  BasicDecimal64& operator&=(const BasicDecimal64& right) {
+    value_ &= right.value_;
+    return *this;
+  }
+
+  /// \brief Shift left by the given number of bits.
+  BasicDecimal64& operator<<=(uint32_t bits);
+
+  BasicDecimal64 operator<<(uint32_t bits) const {
+    auto res = *this;
+    res <<= bits;
+    return res;
+  }
+
+  /// \brief Shift right by the given number of bits.
+  ///
+  /// Negative values will sign-extend
+  BasicDecimal64& operator>>=(uint32_t bits);
+
+  BasicDecimal64 operator>>(uint32_t bits) const {
+    auto res = *this;
+    res >>= bits;
+    return res;
+  }
+
+  /// \brief Convert BasicDecimal32 from one scale to another
+  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+                        BasicDecimal64* out) const;
+
+  void GetWholeAndFraction(int scale, BasicDecimal64* whole,
+                           BasicDecimal64* fraction) const;
+
+  /// \brief Scale up.
+  BasicDecimal64 IncreaseScaleBy(int32_t increase_by) const;
+
+  /// \brief Scale down.
+  ///
+  /// - If 'round' is true, the right-most digits are dropped and the result value is
+  ///   rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
+  ///   (>= 10^reduce_by / 2).
+  /// - If 'round' is false, the right-most digits are simply dropped.
+  BasicDecimal64 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+  /// \brief Whether this number fits in the given precision
+  ///
+  /// Return true if the number of significant digits is less or equal to 'precision'.
+  bool FitsInPrecision(int32_t precision) const;
+
+  /// \brief Get the maximum valid unscaled decimal value.
+  static const BasicDecimal64& GetMaxValue();
+  /// \brief Get the maximum valid unscaled decimal value for the given precision.
+  static BasicDecimal64 GetMaxValue(int32_t precision);
+
+  /// \brief Get the maximum decimal value (is not a valid value).
+  static constexpr BasicDecimal64 GetMaxSentinel() {
+    return BasicDecimal64(std::numeric_limits<int32_t>::max());
+  }
+
+  /// \brief Get the minimum decimal value (is not a valid value).
+  static constexpr BasicDecimal64 GetMinSentinel() {
+    return BasicDecimal64(std::numeric_limits<int32_t>::min());
+  }
+
+  /// \brief Scale multiplier for a given scale value.
+  static const BasicDecimal64& GetScaleMultiplier(int32_t scale);
+  /// \brief Half-scale multiplier for a given scale value.
+  static const BasicDecimal64& GetHalfScaleMultiplier(int32_t scale);
+};
+
+/// Represents a signed 128-bit integer in two's complement.
+///
+/// This class is also compiled into LLVM IR - so, it should not have cpp references like
+/// streams and boost.
+class ARROW_EXPORT BasicDecimal128 : public GenericBasicDecimal<BasicDecimal128, 128> {
+ public:
+  static constexpr int kMaxPrecision = 38;
+  static constexpr int kMaxScale = 38;
+
+  using GenericBasicDecimal::GenericBasicDecimal;
+
+  constexpr BasicDecimal128() noexcept : GenericBasicDecimal() {}
+
+  /// \brief Create a BasicDecimal128 from the two's complement representation.
+#if ARROW_LITTLE_ENDIAN
+  constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
+      : BasicDecimal128(WordArray{low, static_cast<uint64_t>(high)}) {}
+#else
+  constexpr BasicDecimal128(int64_t high, uint64_t low) noexcept
+      : BasicDecimal128(WordArray{static_cast<uint64_t>(high), low}) {}
+#endif
+
+  /// \brief Negate the current value (in-place)
+  BasicDecimal128& Negate();
+
+  /// \brief Absolute value (in-place)
+  BasicDecimal128& Abs();
+
+  /// \brief Absolute value
+  static BasicDecimal128 Abs(const BasicDecimal128& left);
+
+  /// \brief Add a number to this one. The result is truncated to 128 bits.
+  BasicDecimal128& operator+=(const BasicDecimal128& right);
+
+  /// \brief Subtract a number from this one. The result is truncated to 128 bits.
+  BasicDecimal128& operator-=(const BasicDecimal128& right);
+
+  /// \brief Multiply this number by another number. The result is truncated to 128 bits.
+  BasicDecimal128& operator*=(const BasicDecimal128& right);
+
+  /// Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \param[out] result the quotient
+  /// \param[out] remainder the remainder after the division
+  DecimalStatus Divide(const BasicDecimal128& divisor, BasicDecimal128* result,
+                       BasicDecimal128* remainder) const;
+
+  /// \brief In-place division.
+  BasicDecimal128& operator/=(const BasicDecimal128& right);
+
+  /// \brief Bitwise "or" between two BasicDecimal128.
+  BasicDecimal128& operator|=(const BasicDecimal128& right);
+
+  /// \brief Bitwise "and" between two BasicDecimal128.
+  BasicDecimal128& operator&=(const BasicDecimal128& right);
+
+  /// \brief Shift left by the given number of bits.
+  BasicDecimal128& operator<<=(uint32_t bits);
+
+  BasicDecimal128 operator<<(uint32_t bits) const {
+    auto res = *this;
+    res <<= bits;
+    return res;
+  }
+
+  /// \brief Shift right by the given number of bits.
+  ///
+  /// Negative values will sign-extend.
+  BasicDecimal128& operator>>=(uint32_t bits);
+
+  BasicDecimal128 operator>>(uint32_t bits) const {
+    auto res = *this;
+    res >>= bits;
+    return res;
+  }
+
+  /// \brief Get the high bits of the two's complement representation of the number.
+  constexpr int64_t high_bits() const {
+#if ARROW_LITTLE_ENDIAN
+    return static_cast<int64_t>(array_[1]);
+#else
+    return static_cast<int64_t>(array_[0]);
+#endif
+  }
+
+  /// \brief Get the low bits of the two's complement representation of the number.
+  constexpr uint64_t low_bits() const {
+#if ARROW_LITTLE_ENDIAN
+    return array_[0];
+#else
+    return array_[1];
+#endif
+  }
+
+  /// \brief separate the integer and fractional parts for the given scale.
+  void GetWholeAndFraction(int32_t scale, BasicDecimal128* whole,
+                           BasicDecimal128* fraction) const;
+
+  /// \brief Scale multiplier for given scale value.
+  static const BasicDecimal128& GetScaleMultiplier(int32_t scale);
+  /// \brief Half-scale multiplier for given scale value.
+  static const BasicDecimal128& GetHalfScaleMultiplier(int32_t scale);
+
+  /// \brief Convert BasicDecimal128 from one scale to another
+  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+                        BasicDecimal128* out) const;
+
+  /// \brief Scale up.
+  BasicDecimal128 IncreaseScaleBy(int32_t increase_by) const;
+
+  /// \brief Scale down.
+  /// - If 'round' is true, the right-most digits are dropped and the result value is
+  ///   rounded up (+1 for +ve, -1 for -ve) based on the value of the dropped digits
+  ///   (>= 10^reduce_by / 2).
+  /// - If 'round' is false, the right-most digits are simply dropped.
+  BasicDecimal128 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+  /// \brief Whether this number fits in the given precision
+  ///
+  /// Return true if the number of significant digits is less or equal to `precision`.
+  bool FitsInPrecision(int32_t precision) const;
+
+  /// \brief count the number of leading binary zeroes.
+  int32_t CountLeadingBinaryZeros() const;
+
+  /// \brief Get the maximum valid unscaled decimal value.
+  static const BasicDecimal128& GetMaxValue();
+
+  /// \brief Get the maximum valid unscaled decimal value for the given precision.
+  static BasicDecimal128 GetMaxValue(int32_t precision);
+
+  /// \brief Get the maximum decimal value (is not a valid value).
+  static constexpr BasicDecimal128 GetMaxSentinel() {
+    return BasicDecimal128(/*high=*/std::numeric_limits<int64_t>::max(),
+                           /*low=*/std::numeric_limits<uint64_t>::max());
+  }
+  /// \brief Get the minimum decimal value (is not a valid value).
+  static constexpr BasicDecimal128 GetMinSentinel() {
+    return BasicDecimal128(/*high=*/std::numeric_limits<int64_t>::min(),
+                           /*low=*/std::numeric_limits<uint64_t>::min());
+  }
+};
+
+ARROW_EXPORT bool operator<(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator<=(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator>(const BasicDecimal128& left, const BasicDecimal128& right);
+ARROW_EXPORT bool operator>=(const BasicDecimal128& left, const BasicDecimal128& right);
+
+ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& operand);
+ARROW_EXPORT BasicDecimal128 operator~(const BasicDecimal128& operand);
+ARROW_EXPORT BasicDecimal128 operator+(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator-(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator*(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator/(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+ARROW_EXPORT BasicDecimal128 operator%(const BasicDecimal128& left,
+                                       const BasicDecimal128& right);
+
+class ARROW_EXPORT BasicDecimal256 : public GenericBasicDecimal<BasicDecimal256, 256> {
+ public:
+  using GenericBasicDecimal::GenericBasicDecimal;
+
+  static constexpr int kMaxPrecision = 76;
+  static constexpr int kMaxScale = 76;
+
+  constexpr BasicDecimal256() noexcept : GenericBasicDecimal() {}
+
+  explicit BasicDecimal256(const BasicDecimal128& value) noexcept
+      : BasicDecimal256(bit_util::little_endian::ToNative<uint64_t, 4>(
+            {value.low_bits(), static_cast<uint64_t>(value.high_bits()),
+             SignExtend(value.high_bits()), SignExtend(value.high_bits())})) {}
+
+  explicit BasicDecimal256(const BasicDecimal64& value) noexcept
+      : BasicDecimal256(bit_util::little_endian::ToNative<uint64_t, 4>(
+            {value.low_bits(), SignExtend(value.value()), SignExtend(value.value()),
+             SignExtend(value.value())})) {}
+
+  explicit BasicDecimal256(const BasicDecimal32& value) noexcept
+      : BasicDecimal256(bit_util::little_endian::ToNative<uint64_t, 4>(
+            {value.low_bits(), SignExtend(value.value()), SignExtend(value.value()),
+             SignExtend(value.value())})) {}
+
+  /// \brief Negate the current value (in-place)
+  BasicDecimal256& Negate();
+
+  /// \brief Absolute value (in-place)
+  BasicDecimal256& Abs();
+
+  /// \brief Absolute value
+  static BasicDecimal256 Abs(const BasicDecimal256& left);
+
+  /// \brief Add a number to this one. The result is truncated to 256 bits.
+  BasicDecimal256& operator+=(const BasicDecimal256& right);
+
+  /// \brief Subtract a number from this one. The result is truncated to 256 bits.
+  BasicDecimal256& operator-=(const BasicDecimal256& right);
+
+  /// \brief Get the lowest bits of the two's complement representation of the number.
+  uint64_t low_bits() const { return bit_util::little_endian::Make(array_)[0]; }
+
+  /// \brief separate the integer and fractional parts for the given scale.
+  void GetWholeAndFraction(int32_t scale, BasicDecimal256* whole,
+                           BasicDecimal256* fraction) const;
+
+  /// \brief Scale multiplier for given scale value.
+  static const BasicDecimal256& GetScaleMultiplier(int32_t scale);
+  /// \brief Half-scale multiplier for given scale value.
+  static const BasicDecimal256& GetHalfScaleMultiplier(int32_t scale);
+
+  /// \brief Convert BasicDecimal256 from one scale to another
+  DecimalStatus Rescale(int32_t original_scale, int32_t new_scale,
+                        BasicDecimal256* out) const;
+
+  /// \brief Scale up.
+  BasicDecimal256 IncreaseScaleBy(int32_t increase_by) const;
+
+  /// \brief Scale down.
+  /// - If 'round' is true, the right-most digits are dropped and the result value is
+  ///   rounded up (+1 for positive, -1 for negative) based on the value of the
+  ///   dropped digits (>= 10^reduce_by / 2).
+  /// - If 'round' is false, the right-most digits are simply dropped.
+  BasicDecimal256 ReduceScaleBy(int32_t reduce_by, bool round = true) const;
+
+  /// \brief Whether this number fits in the given precision
+  ///
+  /// Return true if the number of significant digits is less or equal to `precision`.
+  bool FitsInPrecision(int32_t precision) const;
+
+  /// \brief Multiply this number by another number. The result is truncated to 256 bits.
+  BasicDecimal256& operator*=(const BasicDecimal256& right);
+
+  /// Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \param[out] result the quotient
+  /// \param[out] remainder the remainder after the division
+  DecimalStatus Divide(const BasicDecimal256& divisor, BasicDecimal256* result,
+                       BasicDecimal256* remainder) const;
+
+  /// \brief Shift left by the given number of bits.
+  BasicDecimal256& operator<<=(uint32_t bits);
+
+  BasicDecimal256 operator<<(uint32_t bits) const {
+    auto res = *this;
+    res <<= bits;
+    return res;
+  }
+
+  /// \brief Shift right by the given number of bits.
+  ///
+  /// Negative values will sign-extend.
+  BasicDecimal256& operator>>=(uint32_t bits);
+
+  BasicDecimal256 operator>>(uint32_t bits) const {
+    auto res = *this;
+    res >>= bits;
+    return res;
+  }
+
+  /// \brief In-place division.
+  BasicDecimal256& operator/=(const BasicDecimal256& right);
+
+  /// \brief Get the maximum valid unscaled decimal value for the given precision.
+  static BasicDecimal256 GetMaxValue(int32_t precision);
+
+  /// \brief Get the maximum decimal value (is not a valid value).
+  static constexpr BasicDecimal256 GetMaxSentinel() {
+#if ARROW_LITTLE_ENDIAN
+    return BasicDecimal256({std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            static_cast<uint64_t>(std::numeric_limits<int64_t>::max())});
+#else
+    return BasicDecimal256({static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max(),
+                            std::numeric_limits<uint64_t>::max()});
+#endif
+  }
+  /// \brief Get the minimum decimal value (is not a valid value).
+  static constexpr BasicDecimal256 GetMinSentinel() {
+#if ARROW_LITTLE_ENDIAN
+    return BasicDecimal256(
+        {0, 0, 0, static_cast<uint64_t>(std::numeric_limits<int64_t>::min())});
+#else
+    return BasicDecimal256(
+        {static_cast<uint64_t>(std::numeric_limits<int64_t>::min()), 0, 0, 0});
+#endif
+  }
+};
+
+ARROW_EXPORT bool operator<(const BasicDecimal256& left, const BasicDecimal256& right);
+
+ARROW_EXPORT inline bool operator<=(const BasicDecimal256& left,
+                                    const BasicDecimal256& right) {
+  return !operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>(const BasicDecimal256& left,
+                                   const BasicDecimal256& right) {
+  return operator<(right, left);
+}
+
+ARROW_EXPORT inline bool operator>=(const BasicDecimal256& left,
+                                    const BasicDecimal256& right) {
+  return !operator<(left, right);
+}
+
+ARROW_EXPORT BasicDecimal256 operator-(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator~(const BasicDecimal256& operand);
+ARROW_EXPORT BasicDecimal256 operator+(const BasicDecimal256& left,
+                                       const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator*(const BasicDecimal256& left,
+                                       const BasicDecimal256& right);
+ARROW_EXPORT BasicDecimal256 operator/(const BasicDecimal256& left,
+                                       const BasicDecimal256& right);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/benchmark_util.h b/pyarrow/include/arrow/util/benchmark_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..75639ac11ae41acb5e23e3eaa91901f41472fdc6
--- /dev/null
+++ b/pyarrow/include/arrow/util/benchmark_util.h
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <string>
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/memory_pool.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cpu_info.h"
+#include "arrow/util/logging.h"  // IWYU pragma: keep
+
+namespace arrow {
+
+// Benchmark changed its parameter type between releases from
+// int to int64_t. As it doesn't have version macros, we need
+// to apply C++ template magic.
+
+template <typename Func>
+struct BenchmarkArgsType;
+
+// Pattern matching that extracts the vector element type of Benchmark::Args()
+template <typename Values>
+struct BenchmarkArgsType<benchmark::internal::Benchmark* (
+    benchmark::internal::Benchmark::*)(const std::vector<Values>&)> {
+  using type = Values;
+};
+
+using ArgsType =
+    typename BenchmarkArgsType<decltype(&benchmark::internal::Benchmark::Args)>::type;
+
+using internal::CpuInfo;
+
+static const CpuInfo* cpu_info = CpuInfo::GetInstance();
+
+static const int64_t kL1Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L1);
+static const int64_t kL2Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L2);
+static const int64_t kL3Size = cpu_info->CacheSize(CpuInfo::CacheLevel::L3);
+static const int64_t kCantFitInL3Size = kL3Size * 4;
+static const std::vector<int64_t> kMemorySizes = {kL1Size, kL2Size, kL3Size,
+                                                  kCantFitInL3Size};
+// 0 is treated as "no nulls"
+static const std::vector<ArgsType> kInverseNullProportions = {10000, 100, 10, 2, 1, 0};
+
+struct GenericItemsArgs {
+  // number of items processed per iteration
+  const int64_t size;
+
+  // proportion of nulls in generated arrays
+  double null_proportion;
+
+  explicit GenericItemsArgs(benchmark::State& state)
+      : size(state.range(0)), state_(state) {
+    if (state.range(1) == 0) {
+      this->null_proportion = 0.0;
+    } else {
+      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
+    }
+  }
+
+  ~GenericItemsArgs() {
+    state_.counters["size"] = static_cast<double>(size);
+    state_.counters["null_percent"] = null_proportion * 100;
+    state_.SetItemsProcessed(state_.iterations() * size);
+  }
+
+ private:
+  benchmark::State& state_;
+};
+
+void BenchmarkSetArgsWithSizes(benchmark::internal::Benchmark* bench,
+                               const std::vector<int64_t>& sizes = kMemorySizes) {
+  bench->Unit(benchmark::kMicrosecond);
+
+  for (const auto size : sizes) {
+    for (const auto inverse_null_proportion : kInverseNullProportions) {
+      bench->Args({static_cast<ArgsType>(size), inverse_null_proportion});
+    }
+  }
+}
+
+void BenchmarkSetArgs(benchmark::internal::Benchmark* bench) {
+  BenchmarkSetArgsWithSizes(bench, kMemorySizes);
+}
+
+void RegressionSetArgs(benchmark::internal::Benchmark* bench) {
+  // Regression do not need to account for cache hierarchy, thus optimize for
+  // the best case.
+  BenchmarkSetArgsWithSizes(bench, {kL1Size});
+}
+
+// RAII struct to handle some of the boilerplate in regression benchmarks
+struct RegressionArgs {
+  // size of memory tested (per iteration) in bytes
+  int64_t size;
+
+  // proportion of nulls in generated arrays
+  double null_proportion;
+
+  // If size_is_bytes is true, then it's a number of bytes, otherwise it's the
+  // number of items processed (for reporting)
+  explicit RegressionArgs(benchmark::State& state, bool size_is_bytes = true)
+      : size(state.range(0)), state_(state), size_is_bytes_(size_is_bytes) {
+    if (state.range(1) == 0) {
+      this->null_proportion = 0.0;
+    } else {
+      this->null_proportion = std::min(1., 1. / static_cast<double>(state.range(1)));
+    }
+  }
+
+  ~RegressionArgs() {
+    state_.counters["size"] = static_cast<double>(size);
+    state_.counters["null_percent"] = null_proportion * 100;
+    if (size_is_bytes_) {
+      state_.SetBytesProcessed(state_.iterations() * size);
+    } else {
+      state_.SetItemsProcessed(state_.iterations() * size);
+    }
+  }
+
+ private:
+  benchmark::State& state_;
+  bool size_is_bytes_;
+};
+
+class MemoryPoolMemoryManager : public benchmark::MemoryManager {
+  void Start() override {
+    memory_pool = std::make_shared<ProxyMemoryPool>(default_memory_pool());
+
+    MemoryPool* default_pool = default_memory_pool();
+    global_allocations_start = default_pool->num_allocations();
+  }
+
+// BENCHMARK_DONT_OPTIMIZE is used here to detect Google Benchmark
+// 1.8.0. We can remove this Stop(Result*) when we require Google
+// Benchmark 1.8.0 or later.
+#ifndef BENCHMARK_DONT_OPTIMIZE
+  void Stop(Result* result) override { Stop(*result); }
+#endif
+
+  void Stop(benchmark::MemoryManager::Result& result) override {
+    // If num_allocations is still zero, we assume that the memory pool wasn't passed down
+    // so we should record them.
+    MemoryPool* default_pool = default_memory_pool();
+    int64_t new_default_allocations =
+        default_pool->num_allocations() - global_allocations_start;
+
+    // Only record metrics if (1) there were allocations and (2) we
+    // recorded at least one.
+    if (new_default_allocations > 0 && memory_pool->num_allocations() > 0) {
+      if (new_default_allocations > memory_pool->num_allocations()) {
+        // If we missed some, let's report that.
+        int64_t missed_allocations =
+            new_default_allocations - memory_pool->num_allocations();
+        ARROW_LOG(WARNING) << "BenchmarkMemoryTracker recorded some allocations "
+                           << "for a benchmark, but missed " << missed_allocations
+                           << " allocations.\n";
+      }
+
+      result.max_bytes_used = memory_pool->max_memory();
+      result.total_allocated_bytes = memory_pool->total_bytes_allocated();
+      result.num_allocs = memory_pool->num_allocations();
+    }
+  }
+
+ public:
+  std::shared_ptr<::arrow::ProxyMemoryPool> memory_pool;
+
+ protected:
+  int64_t global_allocations_start;
+};
+
+/// \brief Track memory pool allocations in benchmarks.
+///
+/// Instantiate as a global variable to register the hooks into Google Benchmark
+/// to collect memory metrics. Before each benchmark, a new ProxyMemoryPool is
+/// created. It can then be accessed with memory_pool(). Once the benchmark is
+/// complete, the hook will record the maximum memory used, the total bytes
+/// allocated, and the total number of allocations. If no allocations were seen,
+/// (for example, if you forgot to pass down the memory pool), then these metrics
+/// will not be saved.
+///
+/// Since this is used as one global variable, this will not work if multiple
+/// benchmarks are run concurrently or for multi-threaded benchmarks (ones
+/// that use `->ThreadRange(...)`).
+class BenchmarkMemoryTracker {
+ public:
+  BenchmarkMemoryTracker() : manager_() { ::benchmark::RegisterMemoryManager(&manager_); }
+  ::arrow::MemoryPool* memory_pool() const { return manager_.memory_pool.get(); }
+
+ protected:
+  ::arrow::MemoryPoolMemoryManager manager_;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/binary_view_util.h b/pyarrow/include/arrow/util/binary_view_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb079e2c548abee644a1d87669a2a11d52196985
--- /dev/null
+++ b/pyarrow/include/arrow/util/binary_view_util.h
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+#include <utility>
+
+#include "arrow/type.h"
+#include "arrow/util/span.h"
+
+namespace arrow::util {
+
+inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) {
+  assert(size <= BinaryViewType::kInlineSize);
+  // Small string: inlined. Bytes beyond size are zeroed
+  BinaryViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+
+inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) {
+  assert(v.size() <= BinaryViewType::kInlineSize);
+  return ToInlineBinaryView(v.data(), static_cast<int32_t>(v.size()));
+}
+
+inline BinaryViewType::c_type ToNonInlineBinaryView(const void* data, int32_t size,
+                                                    int32_t buffer_index,
+                                                    int32_t offset) {
+  // Large string: store index/offset.
+  BinaryViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+
+inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size,
+                                           int32_t buffer_index, int32_t offset) {
+  if (size <= BinaryViewType::kInlineSize) {
+    return ToInlineBinaryView(data, size);
+  }
+  return ToNonInlineBinaryView(data, size, buffer_index, offset);
+}
+
+inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t buffer_index,
+                                           int32_t offset) {
+  return ToBinaryView(v.data(), static_cast<int32_t>(v.size()), buffer_index, offset);
+}
+
+template <typename BufferPtr>
+std::string_view FromBinaryView(const BinaryViewType::c_type& v,
+                                const BufferPtr* data_buffers) {
+  auto* data = v.is_inline() ? v.inlined.data.data()
+                             : data_buffers[v.ref.buffer_index]->data() + v.ref.offset;
+  return {reinterpret_cast<const char*>(data), static_cast<size_t>(v.size())};
+}
+template <typename BufferPtr>
+std::string_view FromBinaryView(BinaryViewType::c_type&&, const BufferPtr*) = delete;
+
+template <typename BufferPtr>
+bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r,
+                     const BufferPtr* l_buffers, const BufferPtr* r_buffers) {
+  int64_t l_size_and_prefix, r_size_and_prefix;
+  memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix));
+  memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix));
+
+  if (l_size_and_prefix != r_size_and_prefix) return false;
+
+  if (l.is_inline()) {
+    // The columnar spec mandates that the inlined part be zero-padded, so we can compare
+    // a word at a time regardless of the exact size.
+    int64_t l_inlined, r_inlined;
+    memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined));
+    memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined));
+    return l_inlined == r_inlined;
+  }
+
+  // Sizes are equal and this is not inline, therefore both are out
+  // of line and have kPrefixSize first in common.
+  const uint8_t* l_data = l_buffers[l.ref.buffer_index]->data() + l.ref.offset;
+  const uint8_t* r_data = r_buffers[r.ref.buffer_index]->data() + r.ref.offset;
+  return memcmp(l_data + BinaryViewType::kPrefixSize,
+                r_data + BinaryViewType::kPrefixSize,
+                l.size() - BinaryViewType::kPrefixSize) == 0;
+}
+
+/// \brief Compute the total size of a list of binary views including null
+/// views.
+///
+/// This is useful when calculating the necessary memory to store all the string
+/// data from the views.
+inline int64_t SumOfBinaryViewSizes(const BinaryViewType::c_type* views, int64_t length) {
+  int64_t total = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    total += views[i].size();
+  }
+  return total;
+}
+
+}  // namespace arrow::util
diff --git a/pyarrow/include/arrow/util/bit_block_counter.h b/pyarrow/include/arrow/util/bit_block_counter.h
new file mode 100644
index 0000000000000000000000000000000000000000..73a1ee8600fb4e0be10f26e921083c3be5740490
--- /dev/null
+++ b/pyarrow/include/arrow/util/bit_block_counter.h
@@ -0,0 +1,570 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+namespace detail {
+
+inline uint64_t LoadWord(const uint8_t* bytes) {
+  return bit_util::ToLittleEndian(util::SafeLoadAs<uint64_t>(bytes));
+}
+
+inline uint64_t ShiftWord(uint64_t current, uint64_t next, int64_t shift) {
+  if (shift == 0) {
+    return current;
+  }
+  return (current >> shift) | (next << (64 - shift));
+}
+
+// These templates are here to help with unit tests
+
+template <typename T>
+constexpr T BitNot(T x) {
+  return ~x;
+}
+
+template <>
+constexpr bool BitNot(bool x) {
+  return !x;
+}
+
+struct BitBlockAnd {
+  template <typename T>
+  static constexpr T Call(T left, T right) {
+    return left & right;
+  }
+};
+
+struct BitBlockAndNot {
+  template <typename T>
+  static constexpr T Call(T left, T right) {
+    return left & BitNot(right);
+  }
+};
+
+struct BitBlockOr {
+  template <typename T>
+  static constexpr T Call(T left, T right) {
+    return left | right;
+  }
+};
+
+struct BitBlockOrNot {
+  template <typename T>
+  static constexpr T Call(T left, T right) {
+    return left | BitNot(right);
+  }
+};
+
+}  // namespace detail
+
+/// \brief Return value from bit block counters: the total number of bits and
+/// the number of set bits.
+struct BitBlockCount {
+  int16_t length;
+  int16_t popcount;
+
+  bool NoneSet() const { return this->popcount == 0; }
+  bool AllSet() const { return this->length == this->popcount; }
+};
+
+/// \brief A class that scans through a true/false bitmap to compute popcounts
+/// 64 or 256 bits at a time. This is used to accelerate processing of
+/// mostly-not-null array data.
+class ARROW_EXPORT BitBlockCounter {
+ public:
+  BitBlockCounter(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
+        bits_remaining_(length),
+        offset_(start_offset % 8) {}
+
+  /// \brief The bit size of each word run
+  static constexpr int64_t kWordBits = 64;
+
+  /// \brief The bit size of four words run
+  static constexpr int64_t kFourWordsBits = kWordBits * 4;
+
+  /// \brief Return the next run of available bits, usually 256. The returned
+  /// pair contains the size of run and the number of true values. The last
+  /// block will have a length less than 256 if the bitmap length is not a
+  /// multiple of 256, and will return 0-length blocks in subsequent
+  /// invocations.
+  BitBlockCount NextFourWords() {
+    using detail::LoadWord;
+    using detail::ShiftWord;
+
+    if (!bits_remaining_) {
+      return {0, 0};
+    }
+    int64_t total_popcount = 0;
+    if (offset_ == 0) {
+      if (bits_remaining_ < kFourWordsBits) {
+        return GetBlockSlow(kFourWordsBits);
+      }
+      total_popcount += bit_util::PopCount(LoadWord(bitmap_));
+      total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 8));
+      total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 16));
+      total_popcount += bit_util::PopCount(LoadWord(bitmap_ + 24));
+    } else {
+      // When the offset is > 0, we need there to be a word beyond the last
+      // aligned word in the bitmap for the bit shifting logic.
+      if (bits_remaining_ < 5 * kFourWordsBits - offset_) {
+        return GetBlockSlow(kFourWordsBits);
+      }
+      auto current = LoadWord(bitmap_);
+      auto next = LoadWord(bitmap_ + 8);
+      total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
+      current = next;
+      next = LoadWord(bitmap_ + 16);
+      total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
+      current = next;
+      next = LoadWord(bitmap_ + 24);
+      total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
+      current = next;
+      next = LoadWord(bitmap_ + 32);
+      total_popcount += bit_util::PopCount(ShiftWord(current, next, offset_));
+    }
+    bitmap_ += bit_util::BytesForBits(kFourWordsBits);
+    bits_remaining_ -= kFourWordsBits;
+    return {256, static_cast<int16_t>(total_popcount)};
+  }
+
+  /// \brief Return the next run of available bits, usually 64. The returned
+  /// pair contains the size of run and the number of true values. The last
+  /// block will have a length less than 64 if the bitmap length is not a
+  /// multiple of 64, and will return 0-length blocks in subsequent
+  /// invocations.
+  BitBlockCount NextWord() {
+    using detail::LoadWord;
+    using detail::ShiftWord;
+
+    if (!bits_remaining_) {
+      return {0, 0};
+    }
+    int64_t popcount = 0;
+    if (offset_ == 0) {
+      if (bits_remaining_ < kWordBits) {
+        return GetBlockSlow(kWordBits);
+      }
+      popcount = bit_util::PopCount(LoadWord(bitmap_));
+    } else {
+      // When the offset is > 0, we need there to be a word beyond the last
+      // aligned word in the bitmap for the bit shifting logic.
+      if (bits_remaining_ < 2 * kWordBits - offset_) {
+        return GetBlockSlow(kWordBits);
+      }
+      popcount = bit_util::PopCount(
+          ShiftWord(LoadWord(bitmap_), LoadWord(bitmap_ + 8), offset_));
+    }
+    bitmap_ += kWordBits / 8;
+    bits_remaining_ -= kWordBits;
+    return {64, static_cast<int16_t>(popcount)};
+  }
+
+ private:
+  /// \brief Return block with the requested size when doing word-wise
+  /// computation is not possible due to inadequate bits remaining.
+  BitBlockCount GetBlockSlow(int64_t block_size) noexcept;
+
+  const uint8_t* bitmap_;
+  int64_t bits_remaining_;
+  int64_t offset_;
+};
+
+/// \brief A tool to iterate through a possibly nonexistent validity bitmap,
+/// to allow us to write one code path for both the with-nulls and no-nulls
+/// cases without giving up a lot of performance.
+class ARROW_EXPORT OptionalBitBlockCounter {
+ public:
+  // validity_bitmap may be NULLPTR
+  OptionalBitBlockCounter(const uint8_t* validity_bitmap, int64_t offset, int64_t length);
+
+  // validity_bitmap may be null
+  OptionalBitBlockCounter(const std::shared_ptr<Buffer>& validity_bitmap, int64_t offset,
+                          int64_t length);
+
+  /// Return block count for next word when the bitmap is available otherwise
+  /// return a block with length up to INT16_MAX when there is no validity
+  /// bitmap (so all the referenced values are not null).
+  BitBlockCount NextBlock() {
+    static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+    if (has_bitmap_) {
+      BitBlockCount block = counter_.NextWord();
+      position_ += block.length;
+      return block;
+    } else {
+      int16_t block_size =
+          static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+      position_ += block_size;
+      // All values are non-null
+      return {block_size, block_size};
+    }
+  }
+
+  // Like NextBlock, but returns a word-sized block even when there is no
+  // validity bitmap
+  BitBlockCount NextWord() {
+    static constexpr int64_t kWordSize = 64;
+    if (has_bitmap_) {
+      BitBlockCount block = counter_.NextWord();
+      position_ += block.length;
+      return block;
+    } else {
+      int16_t block_size = static_cast<int16_t>(std::min(kWordSize, length_ - position_));
+      position_ += block_size;
+      // All values are non-null
+      return {block_size, block_size};
+    }
+  }
+
+ private:
+  const bool has_bitmap_;
+  int64_t position_;
+  int64_t length_;
+  BitBlockCounter counter_;
+};
+
+/// \brief A class that computes popcounts on the result of bitwise operations
+/// between two bitmaps, 64 bits at a time. A 64-bit word is loaded from each
+/// bitmap, then the popcount is computed on e.g. the bitwise-and of the two
+/// words.
+class ARROW_EXPORT BinaryBitBlockCounter {
+ public:
+  BinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
+                        const uint8_t* right_bitmap, int64_t right_offset, int64_t length)
+      : left_bitmap_(util::MakeNonNull(left_bitmap) + left_offset / 8),
+        left_offset_(left_offset % 8),
+        right_bitmap_(util::MakeNonNull(right_bitmap) + right_offset / 8),
+        right_offset_(right_offset % 8),
+        bits_remaining_(length) {}
+
+  /// \brief Return the popcount of the bitwise-and of the next run of
+  /// available bits, up to 64. The returned pair contains the size of run and
+  /// the number of true values. The last block will have a length less than 64
+  /// if the bitmap length is not a multiple of 64, and will return 0-length
+  /// blocks in subsequent invocations.
+  BitBlockCount NextAndWord() { return NextWord<detail::BitBlockAnd>(); }
+
+  /// \brief Computes "x & ~y" block for each available run of bits.
+  BitBlockCount NextAndNotWord() { return NextWord<detail::BitBlockAndNot>(); }
+
+  /// \brief Computes "x | y" block for each available run of bits.
+  BitBlockCount NextOrWord() { return NextWord<detail::BitBlockOr>(); }
+
+  /// \brief Computes "x | ~y" block for each available run of bits.
+  BitBlockCount NextOrNotWord() { return NextWord<detail::BitBlockOrNot>(); }
+
+ private:
+  template <class Op>
+  BitBlockCount NextWord() {
+    using detail::LoadWord;
+    using detail::ShiftWord;
+
+    if (!bits_remaining_) {
+      return {0, 0};
+    }
+    // When the offset is > 0, we need there to be a word beyond the last aligned
+    // word in the bitmap for the bit shifting logic.
+    constexpr int64_t kWordBits = BitBlockCounter::kWordBits;
+    const int64_t bits_required_to_use_words =
+        std::max(left_offset_ == 0 ? 64 : 64 + (64 - left_offset_),
+                 right_offset_ == 0 ? 64 : 64 + (64 - right_offset_));
+    if (bits_remaining_ < bits_required_to_use_words) {
+      const int16_t run_length =
+          static_cast<int16_t>(std::min(bits_remaining_, kWordBits));
+      int16_t popcount = 0;
+      for (int64_t i = 0; i < run_length; ++i) {
+        if (Op::Call(bit_util::GetBit(left_bitmap_, left_offset_ + i),
+                     bit_util::GetBit(right_bitmap_, right_offset_ + i))) {
+          ++popcount;
+        }
+      }
+      // This code path should trigger _at most_ 2 times. In the "two times"
+      // case, the first time the run length will be a multiple of 8.
+      left_bitmap_ += run_length / 8;
+      right_bitmap_ += run_length / 8;
+      bits_remaining_ -= run_length;
+      return {run_length, popcount};
+    }
+
+    int64_t popcount = 0;
+    if (left_offset_ == 0 && right_offset_ == 0) {
+      popcount =
+          bit_util::PopCount(Op::Call(LoadWord(left_bitmap_), LoadWord(right_bitmap_)));
+    } else {
+      auto left_word =
+          ShiftWord(LoadWord(left_bitmap_), LoadWord(left_bitmap_ + 8), left_offset_);
+      auto right_word =
+          ShiftWord(LoadWord(right_bitmap_), LoadWord(right_bitmap_ + 8), right_offset_);
+      popcount = bit_util::PopCount(Op::Call(left_word, right_word));
+    }
+    left_bitmap_ += kWordBits / 8;
+    right_bitmap_ += kWordBits / 8;
+    bits_remaining_ -= kWordBits;
+    return {64, static_cast<int16_t>(popcount)};
+  }
+
+  const uint8_t* left_bitmap_;
+  int64_t left_offset_;
+  const uint8_t* right_bitmap_;
+  int64_t right_offset_;
+  int64_t bits_remaining_;
+};
+
+class ARROW_EXPORT OptionalBinaryBitBlockCounter {
+ public:
+  // Any bitmap may be NULLPTR
+  OptionalBinaryBitBlockCounter(const uint8_t* left_bitmap, int64_t left_offset,
+                                const uint8_t* right_bitmap, int64_t right_offset,
+                                int64_t length);
+
+  // Any bitmap may be null
+  OptionalBinaryBitBlockCounter(const std::shared_ptr<Buffer>& left_bitmap,
+                                int64_t left_offset,
+                                const std::shared_ptr<Buffer>& right_bitmap,
+                                int64_t right_offset, int64_t length);
+
+  BitBlockCount NextAndBlock() {
+    static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+    switch (has_bitmap_) {
+      case HasBitmap::BOTH: {
+        BitBlockCount block = binary_counter_.NextAndWord();
+        position_ += block.length;
+        return block;
+      }
+      case HasBitmap::ONE: {
+        BitBlockCount block = unary_counter_.NextWord();
+        position_ += block.length;
+        return block;
+      }
+      case HasBitmap::NONE:
+      default: {
+        const int16_t block_size =
+            static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+        position_ += block_size;
+        // All values are non-null
+        return {block_size, block_size};
+      }
+    }
+  }
+
+  BitBlockCount NextOrNotBlock() {
+    static constexpr int64_t kMaxBlockSize = std::numeric_limits<int16_t>::max();
+    switch (has_bitmap_) {
+      case HasBitmap::BOTH: {
+        BitBlockCount block = binary_counter_.NextOrNotWord();
+        position_ += block.length;
+        return block;
+      }
+      case HasBitmap::ONE: {
+        BitBlockCount block = unary_counter_.NextWord();
+        position_ += block.length;
+        return block;
+      }
+      case HasBitmap::NONE:
+      default: {
+        const int16_t block_size =
+            static_cast<int16_t>(std::min(kMaxBlockSize, length_ - position_));
+        position_ += block_size;
+        // All values are non-null
+        return {block_size, block_size};
+      }
+    }
+  }
+
+ private:
+  enum class HasBitmap : int { BOTH, ONE, NONE };
+
+  const HasBitmap has_bitmap_;
+  int64_t position_;
+  int64_t length_;
+  BitBlockCounter unary_counter_;
+  BinaryBitBlockCounter binary_counter_;
+
+  static HasBitmap HasBitmapFromBitmaps(bool has_left, bool has_right) {
+    switch (static_cast<int>(has_left) + static_cast<int>(has_right)) {
+      case 0:
+        return HasBitmap::NONE;
+      case 1:
+        return HasBitmap::ONE;
+      default:  // 2
+        return HasBitmap::BOTH;
+    }
+  }
+};
+
+// Functional-style bit block visitors.
+
+template <typename VisitNotNull, typename VisitNull>
+static Status VisitBitBlocks(const uint8_t* bitmap, int64_t offset, int64_t length,
+                             VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
+  internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
+  int64_t position = 0;
+  while (position < length) {
+    internal::BitBlockCount block = bit_counter.NextBlock();
+    if (block.AllSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        ARROW_RETURN_NOT_OK(visit_not_null(position));
+      }
+    } else if (block.NoneSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        ARROW_RETURN_NOT_OK(visit_null());
+      }
+    } else {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        if (bit_util::GetBit(bitmap, offset + position)) {
+          ARROW_RETURN_NOT_OK(visit_not_null(position));
+        } else {
+          ARROW_RETURN_NOT_OK(visit_null());
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+template <typename VisitNotNull, typename VisitNull>
+static void VisitBitBlocksVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
+                               VisitNotNull&& visit_not_null, VisitNull&& visit_null) {
+  internal::OptionalBitBlockCounter bit_counter(bitmap, offset, length);
+  int64_t position = 0;
+  while (position < length) {
+    internal::BitBlockCount block = bit_counter.NextBlock();
+    if (block.AllSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        visit_not_null(position);
+      }
+    } else if (block.NoneSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        visit_null();
+      }
+    } else {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        if (bit_util::GetBit(bitmap, offset + position)) {
+          visit_not_null(position);
+        } else {
+          visit_null();
+        }
+      }
+    }
+  }
+}
+
+template <typename VisitNotNull, typename VisitNull>
+static Status VisitTwoBitBlocks(const uint8_t* left_bitmap, int64_t left_offset,
+                                const uint8_t* right_bitmap, int64_t right_offset,
+                                int64_t length, VisitNotNull&& visit_not_null,
+                                VisitNull&& visit_null) {
+  if (left_bitmap == NULLPTR || right_bitmap == NULLPTR) {
+    // At most one bitmap is present
+    if (left_bitmap == NULLPTR) {
+      return VisitBitBlocks(right_bitmap, right_offset, length,
+                            std::forward<VisitNotNull>(visit_not_null),
+                            std::forward<VisitNull>(visit_null));
+    } else {
+      return VisitBitBlocks(left_bitmap, left_offset, length,
+                            std::forward<VisitNotNull>(visit_not_null),
+                            std::forward<VisitNull>(visit_null));
+    }
+  }
+  BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
+                                    length);
+  int64_t position = 0;
+  while (position < length) {
+    BitBlockCount block = bit_counter.NextAndWord();
+    if (block.AllSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        ARROW_RETURN_NOT_OK(visit_not_null(position));
+      }
+    } else if (block.NoneSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        ARROW_RETURN_NOT_OK(visit_null());
+      }
+    } else {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        if (bit_util::GetBit(left_bitmap, left_offset + position) &&
+            bit_util::GetBit(right_bitmap, right_offset + position)) {
+          ARROW_RETURN_NOT_OK(visit_not_null(position));
+        } else {
+          ARROW_RETURN_NOT_OK(visit_null());
+        }
+      }
+    }
+  }
+  return Status::OK();
+}
+
+template <typename VisitNotNull, typename VisitNull>
+static void VisitTwoBitBlocksVoid(const uint8_t* left_bitmap, int64_t left_offset,
+                                  const uint8_t* right_bitmap, int64_t right_offset,
+                                  int64_t length, VisitNotNull&& visit_not_null,
+                                  VisitNull&& visit_null) {
+  if (left_bitmap == NULLPTR || right_bitmap == NULLPTR) {
+    // At most one bitmap is present
+    if (left_bitmap == NULLPTR) {
+      return VisitBitBlocksVoid(right_bitmap, right_offset, length,
+                                std::forward<VisitNotNull>(visit_not_null),
+                                std::forward<VisitNull>(visit_null));
+    } else {
+      return VisitBitBlocksVoid(left_bitmap, left_offset, length,
+                                std::forward<VisitNotNull>(visit_not_null),
+                                std::forward<VisitNull>(visit_null));
+    }
+  }
+  BinaryBitBlockCounter bit_counter(left_bitmap, left_offset, right_bitmap, right_offset,
+                                    length);
+  int64_t position = 0;
+  while (position < length) {
+    BitBlockCount block = bit_counter.NextAndWord();
+    if (block.AllSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        visit_not_null(position);
+      }
+    } else if (block.NoneSet()) {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        visit_null();
+      }
+    } else {
+      for (int64_t i = 0; i < block.length; ++i, ++position) {
+        if (bit_util::GetBit(left_bitmap, left_offset + position) &&
+            bit_util::GetBit(right_bitmap, right_offset + position)) {
+          visit_not_null(position);
+        } else {
+          visit_null();
+        }
+      }
+    }
+  }
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bit_run_reader.h b/pyarrow/include/arrow/util/bit_run_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bb00140279a047415cf5be0c9439f9125c3efc8
--- /dev/null
+++ b/pyarrow/include/arrow/util/bit_run_reader.h
@@ -0,0 +1,539 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+struct BitRun {
+  int64_t length;
+  // Whether bits are set at this point.
+  bool set;
+
+  std::string ToString() const {
+    return std::string("{Length: ") + std::to_string(length) +
+           ", set=" + std::to_string(set) + "}";
+  }
+};
+
+inline bool operator==(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length == rhs.length && lhs.set == rhs.set;
+}
+
+inline bool operator!=(const BitRun& lhs, const BitRun& rhs) {
+  return lhs.length != rhs.length || lhs.set != rhs.set;
+}
+
+class BitRunReaderLinear {
+ public:
+  BitRunReaderLinear() = default;
+
+  BitRunReaderLinear(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : reader_(bitmap, start_offset, length) {}
+
+  BitRun NextRun() {
+    BitRun rl = {/*length=*/0, reader_.IsSet()};
+    // Advance while the values are equal and not at the end of list.
+    while (reader_.position() < reader_.length() && reader_.IsSet() == rl.set) {
+      rl.length++;
+      reader_.Next();
+    }
+    return rl;
+  }
+
+ private:
+  BitmapReader reader_;
+};
+
+#if ARROW_LITTLE_ENDIAN
+/// A convenience class for counting the number of contiguous set/unset bits
+/// in a bitmap.
+class ARROW_EXPORT BitRunReader {
+ public:
+  BitRunReader() = default;
+
+  /// \brief Constructs new BitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  BitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length);
+
+  /// Returns a new BitRun containing the number of contiguous
+  /// bits with the same value.  length == 0 indicates the
+  /// end of the bitmap.
+  BitRun NextRun() {
+    if (ARROW_PREDICT_FALSE(position_ >= length_)) {
+      return {/*length=*/0, false};
+    }
+    // This implementation relies on a efficient implementations of
+    // CountTrailingZeros and assumes that runs are more often then
+    // not.  The logic is to incrementally find the next bit change
+    // from the current position.  This is done by zeroing all
+    // bits in word_ up to position_ and using the TrailingZeroCount
+    // to find the index of the next set bit.
+
+    // The runs alternate on each call, so flip the bit.
+    current_run_bit_set_ = !current_run_bit_set_;
+
+    int64_t start_position = position_;
+    int64_t start_bit_offset = start_position & 63;
+    // Invert the word for proper use of CountTrailingZeros and
+    // clear bits so CountTrailingZeros can do it magic.
+    word_ = ~word_ & ~bit_util::LeastSignificantBitMask<uint64_t>(start_bit_offset);
+
+    // Go  forward until the next change from unset to set.
+    int64_t new_bits = bit_util::CountTrailingZeros(word_) - start_bit_offset;
+    position_ += new_bits;
+
+    if (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) &&
+        ARROW_PREDICT_TRUE(position_ < length_)) {
+      // Continue extending position while we can advance an entire word.
+      // (updates position_ accordingly).
+      AdvanceUntilChange();
+    }
+
+    return {/*length=*/position_ - start_position, current_run_bit_set_};
+  }
+
+ private:
+  void AdvanceUntilChange() {
+    int64_t new_bits = 0;
+    do {
+      // Advance the position of the bitmap for loading.
+      bitmap_ += sizeof(uint64_t);
+      LoadNextWord();
+      new_bits = bit_util::CountTrailingZeros(word_);
+      // Continue calculating run length.
+      position_ += new_bits;
+    } while (ARROW_PREDICT_FALSE(bit_util::IsMultipleOf64(position_)) &&
+             ARROW_PREDICT_TRUE(position_ < length_) && new_bits > 0);
+  }
+
+  void LoadNextWord() { return LoadWord(length_ - position_); }
+
+  // Helper method for Loading the next word.
+  void LoadWord(int64_t bits_remaining) {
+    word_ = 0;
+    // we need at least an extra byte in this case.
+    if (ARROW_PREDICT_TRUE(bits_remaining >= 64)) {
+      std::memcpy(&word_, bitmap_, 8);
+    } else {
+      int64_t bytes_to_load = bit_util::BytesForBits(bits_remaining);
+      auto word_ptr = reinterpret_cast<uint8_t*>(&word_);
+      std::memcpy(word_ptr, bitmap_, bytes_to_load);
+      // Ensure stoppage at last bit in bitmap by reversing the next higher
+      // order bit.
+      bit_util::SetBitTo(word_ptr, bits_remaining,
+                         !bit_util::GetBit(word_ptr, bits_remaining - 1));
+    }
+
+    // Two cases:
+    //   1. For unset, CountTrailingZeros works naturally so we don't
+    //   invert the word.
+    //   2. Otherwise invert so we can use CountTrailingZeros.
+    if (current_run_bit_set_) {
+      word_ = ~word_;
+    }
+  }
+  const uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+  uint64_t word_;
+  bool current_run_bit_set_;
+};
+#else
+using BitRunReader = BitRunReaderLinear;
+#endif
+
+struct SetBitRun {
+  int64_t position;
+  int64_t length;
+
+  bool AtEnd() const { return length == 0; }
+
+  std::string ToString() const {
+    return std::string("{pos=") + std::to_string(position) +
+           ", len=" + std::to_string(length) + "}";
+  }
+
+  bool operator==(const SetBitRun& other) const {
+    return position == other.position && length == other.length;
+  }
+  bool operator!=(const SetBitRun& other) const {
+    return position != other.position || length != other.length;
+  }
+};
+
+template <bool Reverse>
+class BaseSetBitRunReader {
+ public:
+  /// \brief Constructs new SetBitRunReader.
+  ///
+  /// \param[in] bitmap source data
+  /// \param[in] start_offset bit offset into the source data
+  /// \param[in] length number of bits to copy
+  ARROW_NOINLINE
+  BaseSetBitRunReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(util::MakeNonNull(bitmap)),
+        length_(length),
+        remaining_(length_),
+        current_word_(0),
+        current_num_bits_(0) {
+    if (Reverse) {
+      bitmap_ += (start_offset + length) / 8;
+      const int8_t end_bit_offset = static_cast<int8_t>((start_offset + length) % 8);
+      if (length > 0 && end_bit_offset) {
+        // Get LSBs from last byte
+        ++bitmap_;
+        current_num_bits_ =
+            std::min(static_cast<int32_t>(length), static_cast<int32_t>(end_bit_offset));
+        current_word_ = LoadPartialWord(8 - end_bit_offset, current_num_bits_);
+      }
+    } else {
+      bitmap_ += start_offset / 8;
+      const int8_t bit_offset = static_cast<int8_t>(start_offset % 8);
+      if (length > 0 && bit_offset) {
+        // Get MSBs from first byte
+        current_num_bits_ =
+            std::min(static_cast<int32_t>(length), static_cast<int32_t>(8 - bit_offset));
+        current_word_ = LoadPartialWord(bit_offset, current_num_bits_);
+      }
+    }
+  }
+
+  ARROW_NOINLINE
+  SetBitRun NextRun() {
+    int64_t pos = 0;
+    int64_t len = 0;
+    if (current_num_bits_) {
+      const auto run = FindCurrentRun();
+      assert(remaining_ >= 0);
+      if (run.length && current_num_bits_) {
+        // The run ends in current_word_
+        return AdjustRun(run);
+      }
+      pos = run.position;
+      len = run.length;
+    }
+    if (!len) {
+      // We didn't get any ones in current_word_, so we can skip any zeros
+      // in the following words
+      SkipNextZeros();
+      if (remaining_ == 0) {
+        return {0, 0};
+      }
+      assert(current_num_bits_);
+      pos = position();
+    } else if (!current_num_bits_) {
+      if (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+        current_word_ = LoadFullWord();
+        current_num_bits_ = 64;
+      } else if (remaining_ > 0) {
+        current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+        current_num_bits_ = static_cast<int32_t>(remaining_);
+      } else {
+        // No bits remaining, perhaps we found a run?
+        return AdjustRun({pos, len});
+      }
+      // If current word starts with a zero, we got a full run
+      if (!(current_word_ & kFirstBit)) {
+        return AdjustRun({pos, len});
+      }
+    }
+    // Current word should now start with a set bit
+    len += CountNextOnes();
+    return AdjustRun({pos, len});
+  }
+
+ protected:
+  int64_t position() const {
+    if (Reverse) {
+      return remaining_;
+    } else {
+      return length_ - remaining_;
+    }
+  }
+
+  SetBitRun AdjustRun(SetBitRun run) {
+    if (Reverse) {
+      assert(run.position >= run.length);
+      run.position -= run.length;
+    }
+    return run;
+  }
+
+  uint64_t LoadFullWord() {
+    uint64_t word;
+    if (Reverse) {
+      bitmap_ -= 8;
+    }
+    memcpy(&word, bitmap_, 8);
+    if (!Reverse) {
+      bitmap_ += 8;
+    }
+    return bit_util::ToLittleEndian(word);
+  }
+
+  uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+    assert(num_bits > 0);
+    uint64_t word = 0;
+    const int64_t num_bytes = bit_util::BytesForBits(num_bits);
+    if (Reverse) {
+      // Read in the most significant bytes of the word
+      bitmap_ -= num_bytes;
+      memcpy(reinterpret_cast<char*>(&word) + 8 - num_bytes, bitmap_, num_bytes);
+      // XXX MostSignificantBitmask
+      return (bit_util::ToLittleEndian(word) << bit_offset) &
+             ~bit_util::LeastSignificantBitMask<uint64_t>(64 - num_bits);
+    } else {
+      memcpy(&word, bitmap_, num_bytes);
+      bitmap_ += num_bytes;
+      return (bit_util::ToLittleEndian(word) >> bit_offset) &
+             bit_util::LeastSignificantBitMask<uint64_t>(num_bits);
+    }
+  }
+
+  void SkipNextZeros() {
+    assert(current_num_bits_ == 0);
+    while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+      current_word_ = LoadFullWord();
+      const auto num_zeros = CountFirstZeros(current_word_);
+      if (num_zeros < 64) {
+        // Run of zeros ends here
+        current_word_ = ConsumeBits(current_word_, num_zeros);
+        current_num_bits_ = 64 - num_zeros;
+        remaining_ -= num_zeros;
+        assert(remaining_ >= 0);
+        assert(current_num_bits_ >= 0);
+        return;
+      }
+      remaining_ -= 64;
+    }
+    // Run of zeros continues in last bitmap word
+    if (remaining_ > 0) {
+      current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+      current_num_bits_ = static_cast<int32_t>(remaining_);
+      const auto num_zeros =
+          std::min<int32_t>(current_num_bits_, CountFirstZeros(current_word_));
+      current_word_ = ConsumeBits(current_word_, num_zeros);
+      current_num_bits_ -= num_zeros;
+      remaining_ -= num_zeros;
+      assert(remaining_ >= 0);
+      assert(current_num_bits_ >= 0);
+    }
+  }
+
+  int64_t CountNextOnes() {
+    assert(current_word_ & kFirstBit);
+
+    int64_t len;
+    if (~current_word_) {
+      const auto num_ones = CountFirstZeros(~current_word_);
+      assert(num_ones <= current_num_bits_);
+      assert(num_ones <= remaining_);
+      remaining_ -= num_ones;
+      current_word_ = ConsumeBits(current_word_, num_ones);
+      current_num_bits_ -= num_ones;
+      if (current_num_bits_) {
+        // Run of ones ends here
+        return num_ones;
+      }
+      len = num_ones;
+    } else {
+      // current_word_ is all ones
+      remaining_ -= 64;
+      current_num_bits_ = 0;
+      len = 64;
+    }
+
+    while (ARROW_PREDICT_TRUE(remaining_ >= 64)) {
+      current_word_ = LoadFullWord();
+      const auto num_ones = CountFirstZeros(~current_word_);
+      len += num_ones;
+      remaining_ -= num_ones;
+      if (num_ones < 64) {
+        // Run of ones ends here
+        current_word_ = ConsumeBits(current_word_, num_ones);
+        current_num_bits_ = 64 - num_ones;
+        return len;
+      }
+    }
+    // Run of ones continues in last bitmap word
+    if (remaining_ > 0) {
+      current_word_ = LoadPartialWord(/*bit_offset=*/0, remaining_);
+      current_num_bits_ = static_cast<int32_t>(remaining_);
+      const auto num_ones = CountFirstZeros(~current_word_);
+      assert(num_ones <= current_num_bits_);
+      assert(num_ones <= remaining_);
+      current_word_ = ConsumeBits(current_word_, num_ones);
+      current_num_bits_ -= num_ones;
+      remaining_ -= num_ones;
+      len += num_ones;
+    }
+    return len;
+  }
+
+  SetBitRun FindCurrentRun() {
+    // Skip any pending zeros
+    const auto num_zeros = CountFirstZeros(current_word_);
+    if (num_zeros >= current_num_bits_) {
+      remaining_ -= current_num_bits_;
+      current_word_ = 0;
+      current_num_bits_ = 0;
+      return {0, 0};
+    }
+    assert(num_zeros <= remaining_);
+    current_word_ = ConsumeBits(current_word_, num_zeros);
+    current_num_bits_ -= num_zeros;
+    remaining_ -= num_zeros;
+    const int64_t pos = position();
+    // Count any ones
+    const auto num_ones = CountFirstZeros(~current_word_);
+    assert(num_ones <= current_num_bits_);
+    assert(num_ones <= remaining_);
+    current_word_ = ConsumeBits(current_word_, num_ones);
+    current_num_bits_ -= num_ones;
+    remaining_ -= num_ones;
+    return {pos, num_ones};
+  }
+
+  inline int CountFirstZeros(uint64_t word);
+  inline uint64_t ConsumeBits(uint64_t word, int32_t num_bits);
+
+  const uint8_t* bitmap_;
+  const int64_t length_;
+  int64_t remaining_;
+  uint64_t current_word_;
+  int32_t current_num_bits_;
+
+  static constexpr uint64_t kFirstBit = Reverse ? 0x8000000000000000ULL : 1;
+};
+
+template <>
+inline int BaseSetBitRunReader<false>::CountFirstZeros(uint64_t word) {
+  return bit_util::CountTrailingZeros(word);
+}
+
+template <>
+inline int BaseSetBitRunReader<true>::CountFirstZeros(uint64_t word) {
+  return bit_util::CountLeadingZeros(word);
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<false>::ConsumeBits(uint64_t word, int32_t num_bits) {
+  return word >> num_bits;
+}
+
+template <>
+inline uint64_t BaseSetBitRunReader<true>::ConsumeBits(uint64_t word, int32_t num_bits) {
+  return word << num_bits;
+}
+
+using SetBitRunReader = BaseSetBitRunReader</*Reverse=*/false>;
+using ReverseSetBitRunReader = BaseSetBitRunReader</*Reverse=*/true>;
+
+// Functional-style bit run visitors.
+
+template <typename Visit>
+inline Status VisitBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
+                           Visit&& visit) {
+  if (bitmap == NULLPTR) {
+    // Assuming all set (as in a null bitmap)
+    return visit(static_cast<int64_t>(0), length, true);
+  }
+  BitRunReader reader(bitmap, offset, length);
+  int64_t position = 0;
+  while (true) {
+    const auto run = reader.NextRun();
+    if (run.length == 0) {
+      break;
+    }
+    ARROW_RETURN_NOT_OK(visit(position, run.length, run.set));
+    position += run.length;
+  }
+  return Status::OK();
+}
+
+// XXX: Try to make this function small so the compiler can inline and optimize
+// the `visit` function, which is normally a hot loop with vectorizable code.
+// - don't inline SetBitRunReader constructor, it doesn't hurt performance
+// - un-inline NextRun hurts 'many null' cases a bit, but improves normal cases
+template <typename Visit>
+inline Status VisitSetBitRuns(const uint8_t* bitmap, int64_t offset, int64_t length,
+                              Visit&& visit) {
+  if (bitmap == NULLPTR) {
+    // Assuming all set (as in a null bitmap)
+    return visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+  }
+  SetBitRunReader reader(bitmap, offset, length);
+  while (true) {
+    const auto run = reader.NextRun();
+    if (run.length == 0) {
+      break;
+    }
+    ARROW_RETURN_NOT_OK(visit(run.position, run.length));
+  }
+  return Status::OK();
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const uint8_t* bitmap, int64_t offset, int64_t length,
+                                Visit&& visit) {
+  if (bitmap == NULLPTR) {
+    // Assuming all set (as in a null bitmap)
+    visit(static_cast<int64_t>(0), static_cast<int64_t>(length));
+    return;
+  }
+  SetBitRunReader reader(bitmap, offset, length);
+  while (true) {
+    const auto run = reader.NextRun();
+    if (run.length == 0) {
+      break;
+    }
+    visit(run.position, run.length);
+  }
+}
+
+template <typename Visit>
+inline Status VisitSetBitRuns(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+                              int64_t length, Visit&& visit) {
+  return VisitSetBitRuns(bitmap ? bitmap->data() : NULLPTR, offset, length,
+                         std::forward<Visit>(visit));
+}
+
+template <typename Visit>
+inline void VisitSetBitRunsVoid(const std::shared_ptr<Buffer>& bitmap, int64_t offset,
+                                int64_t length, Visit&& visit) {
+  VisitSetBitRunsVoid(bitmap ? bitmap->data() : NULLPTR, offset, length,
+                      std::forward<Visit>(visit));
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bit_util.h b/pyarrow/include/arrow/util/bit_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7849db871ac4303097fe98fb125bcab9f30aa9c
--- /dev/null
+++ b/pyarrow/include/arrow/util/bit_util.h
@@ -0,0 +1,499 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_MSC_VER)
+#  if defined(_M_AMD64) || defined(_M_X64)
+#    include <intrin.h>  // IWYU pragma: keep
+#  endif
+
+#  pragma intrinsic(_BitScanReverse)
+#  pragma intrinsic(_BitScanForward)
+#  define ARROW_POPCOUNT64 __popcnt64
+#  define ARROW_POPCOUNT32 __popcnt
+#else
+#  define ARROW_POPCOUNT64 __builtin_popcountll
+#  define ARROW_POPCOUNT32 __builtin_popcount
+#endif
+
+#include <cstdint>
+#include <type_traits>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace detail {
+
+template <typename Integer>
+typename std::make_unsigned<Integer>::type as_unsigned(Integer x) {
+  return static_cast<typename std::make_unsigned<Integer>::type>(x);
+}
+
+}  // namespace detail
+
+namespace bit_util {
+
+// The number of set bits in a given unsigned byte value, pre-computed
+//
+// Generated with the following Python code
+// output = 'static constexpr uint8_t kBytePopcount[] = {{{0}}};'
+// popcounts = [str(bin(i).count('1')) for i in range(0, 256)]
+// print(output.format(', '.join(popcounts)))
+static constexpr uint8_t kBytePopcount[] = {
+    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3,
+    4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4,
+    4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4,
+    5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,
+    4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2,
+    3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5,
+    5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4,
+    5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6,
+    4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+
+static inline uint64_t PopCount(uint64_t bitmap) { return ARROW_POPCOUNT64(bitmap); }
+static inline uint32_t PopCount(uint32_t bitmap) { return ARROW_POPCOUNT32(bitmap); }
+
+//
+// Bit-related computations on integer values
+//
+
+// Returns the ceil of value/divisor
+constexpr int64_t CeilDiv(int64_t value, int64_t divisor) {
+  return (value == 0) ? 0 : 1 + (value - 1) / divisor;
+}
+
+// Return the number of bytes needed to fit the given number of bits
+constexpr int64_t BytesForBits(int64_t bits) {
+  // This formula avoids integer overflow on very large `bits`
+  return (bits >> 3) + ((bits & 7) != 0);
+}
+
+constexpr bool IsPowerOf2(int64_t value) {
+  return value > 0 && (value & (value - 1)) == 0;
+}
+
+constexpr bool IsPowerOf2(uint64_t value) {
+  return value > 0 && (value & (value - 1)) == 0;
+}
+
+// Returns the smallest power of two that contains v.  If v is already a
+// power of two, it is returned as is.
+static inline int64_t NextPower2(int64_t n) {
+  // Taken from
+  // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  n |= n >> 32;
+  n++;
+  return n;
+}
+
+constexpr bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; }
+
+constexpr bool IsMultipleOf8(int64_t n) { return (n & 7) == 0; }
+
+// Returns a mask for the bit_index lower order bits.
+// Valid in the range `[0, 8*sizof(Uint)]` if `kAllowUpperBound`
+// otherwise `[0, 8*sizof(Uint)[`
+template <typename Uint, bool kAllowUpperBound = false>
+constexpr auto LeastSignificantBitMask(Uint bit_index) {
+  if constexpr (kAllowUpperBound) {
+    if (bit_index == 8 * sizeof(Uint)) {
+      return ~Uint{0};
+    }
+  }
+  return (Uint{1} << bit_index) - Uint{1};
+}
+
+// Returns 'value' rounded up to the nearest multiple of 'factor'
+constexpr int64_t RoundUp(int64_t value, int64_t factor) {
+  return CeilDiv(value, factor) * factor;
+}
+
+// Returns 'value' rounded down to the nearest multiple of 'factor'
+constexpr int64_t RoundDown(int64_t value, int64_t factor) {
+  return (value / factor) * factor;
+}
+
+// Returns 'value' rounded up to the nearest multiple of 'factor' when factor
+// is a power of two.
+// The result is undefined on overflow, i.e. if `value > 2**64 - factor`,
+// since we cannot return the correct result which would be 2**64.
+constexpr int64_t RoundUpToPowerOf2(int64_t value, int64_t factor) {
+  // DCHECK(value >= 0);
+  // DCHECK(IsPowerOf2(factor));
+  return (value + (factor - 1)) & ~(factor - 1);
+}
+
+constexpr uint64_t RoundUpToPowerOf2(uint64_t value, uint64_t factor) {
+  // DCHECK(IsPowerOf2(factor));
+  return (value + (factor - 1)) & ~(factor - 1);
+}
+
+constexpr int64_t RoundUpToMultipleOf8(int64_t num) { return RoundUpToPowerOf2(num, 8); }
+
+constexpr int64_t RoundUpToMultipleOf64(int64_t num) {
+  return RoundUpToPowerOf2(num, 64);
+}
+
+// Returns the number of bytes covering a sliced bitmap. Find the length
+// rounded to cover full bytes on both extremities.
+//
+// The following example represents a slice (offset=10, length=9)
+//
+// 0       8       16     24
+// |-------|-------|------|
+//           [       ]          (slice)
+//         [             ]      (same slice aligned to bytes bounds, length=16)
+//
+// The covering bytes is the length (in bytes) of this new aligned slice.
+constexpr int64_t CoveringBytes(int64_t offset, int64_t length) {
+  return (bit_util::RoundUp(length + offset, 8) - bit_util::RoundDown(offset, 8)) / 8;
+}
+
+// Returns the 'num_bits' least-significant bits of 'v'.
+static inline uint64_t TrailingBits(uint64_t v, int num_bits) {
+  if (ARROW_PREDICT_FALSE(num_bits == 0)) return 0;
+  if (ARROW_PREDICT_FALSE(num_bits >= 64)) return v;
+  int n = 64 - num_bits;
+  return (v << n) >> n;
+}
+
+/// \brief Count the number of leading zeros in an unsigned integer.
+static inline int CountLeadingZeros(uint32_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 32;
+  return static_cast<int>(__builtin_clz(value));
+#elif defined(_MSC_VER)
+  unsigned long index;                                               // NOLINT
+  if (_BitScanReverse(&index, static_cast<unsigned long>(value))) {  // NOLINT
+    return 31 - static_cast<int>(index);
+  } else {
+    return 32;
+  }
+#else
+  int bitpos = 0;
+  while (value != 0) {
+    value >>= 1;
+    ++bitpos;
+  }
+  return 32 - bitpos;
+#endif
+}
+
+static inline int CountLeadingZeros(uint64_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 64;
+  return static_cast<int>(__builtin_clzll(value));
+#elif defined(_MSC_VER)
+  unsigned long index;                     // NOLINT
+  if (_BitScanReverse64(&index, value)) {  // NOLINT
+    return 63 - static_cast<int>(index);
+  } else {
+    return 64;
+  }
+#else
+  int bitpos = 0;
+  while (value != 0) {
+    value >>= 1;
+    ++bitpos;
+  }
+  return 64 - bitpos;
+#endif
+}
+
+static inline int CountTrailingZeros(uint32_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 32;
+  return static_cast<int>(__builtin_ctzl(value));
+#elif defined(_MSC_VER)
+  unsigned long index;  // NOLINT
+  if (_BitScanForward(&index, value)) {
+    return static_cast<int>(index);
+  } else {
+    return 32;
+  }
+#else
+  int bitpos = 0;
+  if (value) {
+    while (value & 1 == 0) {
+      value >>= 1;
+      ++bitpos;
+    }
+  } else {
+    bitpos = 32;
+  }
+  return bitpos;
+#endif
+}
+
+static inline int CountTrailingZeros(uint64_t value) {
+#if defined(__clang__) || defined(__GNUC__)
+  if (value == 0) return 64;
+  return static_cast<int>(__builtin_ctzll(value));
+#elif defined(_MSC_VER)
+  unsigned long index;  // NOLINT
+  if (_BitScanForward64(&index, value)) {
+    return static_cast<int>(index);
+  } else {
+    return 64;
+  }
+#else
+  int bitpos = 0;
+  if (value) {
+    while (value & 1 == 0) {
+      value >>= 1;
+      ++bitpos;
+    }
+  } else {
+    bitpos = 64;
+  }
+  return bitpos;
+#endif
+}
+
+// Returns the minimum number of bits needed to represent an unsigned value
+static inline int NumRequiredBits(uint64_t x) { return 64 - CountLeadingZeros(x); }
+
+// Returns ceil(log2(x)).
+static inline int Log2(uint64_t x) {
+  // DCHECK_GT(x, 0);
+  return NumRequiredBits(x - 1);
+}
+
+//
+// Utilities for reading and writing individual bits by their index
+// in a memory area.
+//
+
+// Bitmask selecting the k-th bit in a byte
+static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128};
+
+// the bitwise complement version of kBitmask
+static constexpr uint8_t kFlippedBitmask[] = {254, 253, 251, 247, 239, 223, 191, 127};
+
+// Bitmask selecting the (k - 1) preceding bits in a byte
+static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127};
+static constexpr uint8_t kPrecedingWrappingBitmask[] = {255, 1, 3, 7, 15, 31, 63, 127};
+
+// the bitwise complement version of kPrecedingBitmask
+static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128};
+
+static constexpr bool GetBit(const uint8_t* bits, uint64_t i) {
+  return (bits[i >> 3] >> (i & 0x07)) & 1;
+}
+
+// Gets the i-th bit from a byte. Should only be used with i <= 7.
+static constexpr bool GetBitFromByte(uint8_t byte, uint8_t i) {
+  return byte & kBitmask[i];
+}
+
+static inline void ClearBit(uint8_t* bits, int64_t i) {
+  bits[i / 8] &= kFlippedBitmask[i % 8];
+}
+
+static inline void SetBit(uint8_t* bits, int64_t i) { bits[i / 8] |= kBitmask[i % 8]; }
+
+static inline void SetBitTo(uint8_t* bits, int64_t i, bool bit_is_set) {
+  // https://graphics.stanford.edu/~seander/bithacks.html
+  // "Conditionally set or clear bits without branching"
+  // NOTE: this seems to confuse Valgrind as it reads from potentially
+  // uninitialized memory
+  bits[i / 8] ^= static_cast<uint8_t>(-static_cast<uint8_t>(bit_is_set) ^ bits[i / 8]) &
+                 kBitmask[i % 8];
+}
+
+/// \brief set or clear a range of bits quickly
+ARROW_EXPORT
+void SetBitsTo(uint8_t* bits, int64_t start_offset, int64_t length, bool bits_are_set);
+
+/// \brief Sets all bits in the bitmap to true
+ARROW_EXPORT
+void SetBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// \brief Clears all bits in the bitmap (set to false)
+ARROW_EXPORT
+void ClearBitmap(uint8_t* data, int64_t offset, int64_t length);
+
+/// Returns a mask with lower i bits set to 1. If i >= sizeof(Word)*8, all-ones will be
+/// returned
+/// ex:
+/// ref: https://stackoverflow.com/a/59523400
+template <typename Word>
+constexpr Word PrecedingWordBitmask(const unsigned int i) {
+  return static_cast<Word>(static_cast<Word>(i < sizeof(Word) * 8)
+                           << (i & (sizeof(Word) * 8 - 1))) -
+         1;
+}
+static_assert(PrecedingWordBitmask<uint8_t>(0) == 0x00, "");
+static_assert(PrecedingWordBitmask<uint8_t>(4) == 0x0f, "");
+static_assert(PrecedingWordBitmask<uint8_t>(8) == 0xff, "");
+static_assert(PrecedingWordBitmask<uint16_t>(8) == 0x00ff, "");
+
+/// \brief Create a word with low `n` bits from `low` and high `sizeof(Word)-n` bits
+/// from `high`.
+/// Word ret
+/// for (i = 0; i < sizeof(Word)*8; i++){
+///     ret[i]= i < n ? low[i]: high[i];
+/// }
+template <typename Word>
+constexpr Word SpliceWord(int n, Word low, Word high) {
+  return (high & ~PrecedingWordBitmask<Word>(n)) | (low & PrecedingWordBitmask<Word>(n));
+}
+
+/// \brief Pack integers into a bitmap in batches of 8
+template <int batch_size>
+void PackBits(const uint32_t* values, uint8_t* out) {
+  for (int i = 0; i < batch_size / 8; ++i) {
+    *out++ = static_cast<uint8_t>(values[0] | values[1] << 1 | values[2] << 2 |
+                                  values[3] << 3 | values[4] << 4 | values[5] << 5 |
+                                  values[6] << 6 | values[7] << 7);
+    values += 8;
+  }
+}
+
+constexpr int32_t MaxLEB128ByteLen(int32_t n_bits) {
+  return static_cast<int32_t>(CeilDiv(n_bits, 7));
+}
+
+template <typename Int>
+constexpr int32_t kMaxLEB128ByteLenFor = MaxLEB128ByteLen(sizeof(Int) * 8);
+
+/// Write a integer as LEB128
+///
+/// Write the input value as LEB128 into the outptut buffer and return the number of bytes
+/// written.
+/// If the output buffer size is insufficient, return 0 but the output may have been
+/// written to.
+/// The input value can be a signed integer, but must be non negative.
+///
+/// \see https://en.wikipedia.org/wiki/LEB128
+/// \see MaxLEB128ByteLenFor
+template <typename Int>
+constexpr int32_t WriteLEB128(Int value, uint8_t* out, int32_t max_out_size) {
+  constexpr Int kLow7Mask = Int(0x7F);
+  constexpr Int kHigh7Mask = ~kLow7Mask;
+  constexpr uint8_t kContinuationBit = 0x80;
+
+  // This encoding does not work for negative values
+  if constexpr (std::is_signed_v<Int>) {
+    if (ARROW_PREDICT_FALSE(value < 0)) {
+      return 0;
+    }
+  }
+
+  const auto out_first = out;
+
+  // Write as many bytes as we could be for the given input
+  while ((value & kHigh7Mask) != Int(0)) {
+    // We do not have enough room to write the LEB128
+    if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) {
+      return 0;
+    }
+
+    // Write the encoded byte with continuation bit
+    *out = static_cast<uint8_t>(value & kLow7Mask) | kContinuationBit;
+    ++out;
+    // Shift remaining data
+    value >>= 7;
+  }
+
+  // We do not have enough room to write the LEB128
+  if (ARROW_PREDICT_FALSE(out - out_first >= max_out_size)) {
+    return 0;
+  }
+
+  // Write last non-continuing byte
+  *out = static_cast<uint8_t>(value & kLow7Mask);
+  ++out;
+
+  return static_cast<int32_t>(out - out_first);
+}
+
+/// Parse a leading LEB128
+///
+/// Take as input a data pointer and the maximum number of bytes that can be read from it
+/// (typically the array size).
+/// When a valid LEB128 is found at the start of the data, the function writes it to the
+/// out pointer and return the number of bytes read.
+/// Otherwise, the out pointer is unmodified and zero is returned.
+///
+/// \see https://en.wikipedia.org/wiki/LEB128
+/// \see MaxLEB128ByteLenFor
+template <typename Int>
+constexpr int32_t ParseLeadingLEB128(const uint8_t* data, int32_t max_data_size,
+                                     Int* out) {
+  constexpr auto kMaxBytes = kMaxLEB128ByteLenFor<Int>;
+  static_assert(kMaxBytes >= 1);
+  constexpr uint8_t kLow7Mask = 0x7F;
+  constexpr uint8_t kContinuationBit = 0x80;
+  constexpr int32_t kSignBitCount = std::is_signed_v<Int> ? 1 : 0;
+  // Number of bits allowed for encoding data on the last byte to avoid overflow
+  constexpr uint8_t kHighBitCount = (8 * sizeof(Int) - kSignBitCount) % 7;
+  // kHighBitCount least significant `0` bits and the rest with `1`
+  constexpr uint8_t kHighForbiddenMask = ~((1 << kHighBitCount) - 1);
+
+  // Iteratively building the value
+  std::make_unsigned_t<Int> value = 0;
+
+  // Read as many bytes as we could be for the given output.
+  for (int32_t i = 0; i < kMaxBytes - 1; i++) {
+    // We have not finished reading a valid LEB128, yet we run out of data
+    if (ARROW_PREDICT_FALSE(i >= max_data_size)) {
+      return 0;
+    }
+
+    // Read the byte and set its 7 LSB to in the final value
+    const uint8_t byte = data[i];
+    value |= static_cast<Int>(byte & kLow7Mask) << (7 * i);
+
+    // Check for lack of continuation flag in MSB
+    if ((byte & kContinuationBit) == 0) {
+      *out = value;
+      return i + 1;
+    }
+  }
+
+  // Process the last index avoiding overflowing
+  constexpr int32_t last = kMaxBytes - 1;
+
+  // We have not finished reading a valid LEB128, yet we run out of data
+  if (ARROW_PREDICT_FALSE(last >= max_data_size)) {
+    return 0;
+  }
+
+  const uint8_t byte = data[last];
+
+  // Need to check if there are bits that would overflow the output.
+  // Also checks that there is no continuation.
+  if (ARROW_PREDICT_FALSE((byte & kHighForbiddenMask) != 0)) {
+    return 0;
+  }
+
+  // No longer need to mask since we ensured
+  value |= static_cast<Int>(byte) << (7 * last);
+  *out = value;
+  return last + 1;
+}
+}  // namespace bit_util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap.h b/pyarrow/include/arrow/util/bitmap.h
new file mode 100644
index 0000000000000000000000000000000000000000..141d558c3a8cfa85bde0d353e3b63e92a830a6c9
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap.h
@@ -0,0 +1,466 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/span.h"
+#include "arrow/util/string_util.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class BooleanArray;
+
+namespace internal {
+
+class ARROW_EXPORT Bitmap : public util::ToStringOstreamable<Bitmap>,
+                            public util::EqualityComparable<Bitmap> {
+ public:
+  Bitmap() = default;
+
+  Bitmap(const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length)
+      : data_(buffer->data()), offset_(offset), length_(length) {
+    if (buffer->is_mutable()) {
+      mutable_data_ = buffer->mutable_data();
+    }
+  }
+
+  Bitmap(const void* data, int64_t offset, int64_t length)
+      : data_(reinterpret_cast<const uint8_t*>(data)), offset_(offset), length_(length) {}
+
+  Bitmap(void* data, int64_t offset, int64_t length)
+      : data_(reinterpret_cast<const uint8_t*>(data)),
+        mutable_data_(reinterpret_cast<uint8_t*>(data)),
+        offset_(offset),
+        length_(length) {}
+
+  Bitmap Slice(int64_t offset) const {
+    if (mutable_data_ != NULLPTR) {
+      return {mutable_data_, offset_ + offset, length_ - offset};
+    } else {
+      return {data_, offset_ + offset, length_ - offset};
+    }
+  }
+
+  Bitmap Slice(int64_t offset, int64_t length) const {
+    if (mutable_data_ != NULLPTR) {
+      return {mutable_data_, offset_ + offset, length};
+    } else {
+      return {data_, offset_ + offset, length};
+    }
+  }
+
+  std::string ToString() const;
+
+  bool Equals(const Bitmap& other) const;
+
+  std::string Diff(const Bitmap& other) const;
+
+  bool GetBit(int64_t i) const { return bit_util::GetBit(data_, i + offset_); }
+
+  bool operator[](int64_t i) const { return GetBit(i); }
+
+  void SetBitTo(int64_t i, bool v) const {
+    bit_util::SetBitTo(mutable_data_, i + offset_, v);
+  }
+
+  void SetBitsTo(bool v) { bit_util::SetBitsTo(mutable_data_, offset_, length_, v); }
+
+  void CopyFrom(const Bitmap& other);
+  void CopyFromInverted(const Bitmap& other);
+
+  /// \brief Visit bits from each bitmap as bitset<N>
+  ///
+  /// All bitmaps must have identical length.
+  template <size_t N, typename Visitor>
+  static void VisitBits(const Bitmap (&bitmaps)[N], Visitor&& visitor) {
+    int64_t bit_length = BitLength(bitmaps, N);
+    std::bitset<N> bits;
+    for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+      for (size_t i = 0; i < N; ++i) {
+        bits[i] = bitmaps[i].GetBit(bit_i);
+      }
+      visitor(bits);
+    }
+  }
+
+  /// \brief Visit bits from each bitmap as bitset<N>
+  ///
+  /// All bitmaps must have identical length.
+  template <size_t N, typename Visitor>
+  static void VisitBits(const std::array<Bitmap, N>& bitmaps, Visitor&& visitor) {
+    int64_t bit_length = BitLength(bitmaps);
+    std::bitset<N> bits;
+    for (int64_t bit_i = 0; bit_i < bit_length; ++bit_i) {
+      for (size_t i = 0; i < N; ++i) {
+        bits[i] = bitmaps[i].GetBit(bit_i);
+      }
+      visitor(bits);
+    }
+  }
+
+  /// \brief Visit words of bits from each bitmap as array<Word, N>
+  ///
+  /// All bitmaps must have identical length. The first bit in a visited bitmap
+  /// may be offset within the first visited word, but words will otherwise contain
+  /// densely packed bits loaded from the bitmap. That offset within the first word is
+  /// returned.
+  ///
+  /// TODO(bkietz) allow for early termination
+  // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+  // It also has a large prolog / epilog overhead and should be used
+  // carefully in other cases.
+  // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+  // and BitmapUInt64Reader.
+  template <size_t N, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+  static int64_t VisitWords(const Bitmap (&bitmaps_arg)[N], Visitor&& visitor) {
+    constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+    // local, mutable variables which will be sliced/decremented to represent consumption:
+    Bitmap bitmaps[N];
+    int64_t offsets[N];
+    int64_t bit_length = BitLength(bitmaps_arg, N);
+    util::span<const Word> words[N];
+    for (size_t i = 0; i < N; ++i) {
+      bitmaps[i] = bitmaps_arg[i];
+      offsets[i] = bitmaps[i].template word_offset<Word>();
+      assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+      words[i] = bitmaps[i].template words<Word>();
+    }
+
+    auto consume = [&](int64_t consumed_bits) {
+      for (size_t i = 0; i < N; ++i) {
+        bitmaps[i] = bitmaps[i].Slice(consumed_bits, bit_length - consumed_bits);
+        offsets[i] = bitmaps[i].template word_offset<Word>();
+        assert(offsets[i] >= 0 && offsets[i] < kBitWidth);
+        words[i] = bitmaps[i].template words<Word>();
+      }
+      bit_length -= consumed_bits;
+    };
+
+    std::array<Word, N> visited_words;
+    visited_words.fill(0);
+
+    if (bit_length <= kBitWidth * 2) {
+      // bitmaps fit into one or two words so don't bother with optimization
+      while (bit_length > 0) {
+        auto leading_bits = std::min(bit_length, kBitWidth);
+        SafeLoadWords(bitmaps, 0, leading_bits, false, &visited_words);
+        visitor(visited_words);
+        consume(leading_bits);
+      }
+      return 0;
+    }
+
+    int64_t max_offset = *std::max_element(offsets, offsets + N);
+    int64_t min_offset = *std::min_element(offsets, offsets + N);
+    if (max_offset > 0) {
+      // consume leading bits
+      auto leading_bits = kBitWidth - min_offset;
+      SafeLoadWords(bitmaps, 0, leading_bits, true, &visited_words);
+      visitor(visited_words);
+      consume(leading_bits);
+    }
+    assert(*std::min_element(offsets, offsets + N) == 0);
+
+    int64_t whole_word_count = bit_length / kBitWidth;
+    assert(whole_word_count >= 1);
+
+    if (min_offset == max_offset) {
+      // all offsets were identical, all leading bits have been consumed
+      assert(
+          std::all_of(offsets, offsets + N, [](int64_t offset) { return offset == 0; }));
+
+      for (int64_t word_i = 0; word_i < whole_word_count; ++word_i) {
+        for (size_t i = 0; i < N; ++i) {
+          visited_words[i] = words[i][word_i];
+        }
+        visitor(visited_words);
+      }
+      consume(whole_word_count * kBitWidth);
+    } else {
+      // leading bits from potentially incomplete words have been consumed
+
+      // word_i such that words[i][word_i] and words[i][word_i + 1] are lie entirely
+      // within the bitmap for all i
+      for (int64_t word_i = 0; word_i < whole_word_count - 1; ++word_i) {
+        for (size_t i = 0; i < N; ++i) {
+          if (offsets[i] == 0) {
+            visited_words[i] = words[i][word_i];
+          } else {
+            auto words0 = bit_util::ToLittleEndian(words[i][word_i]);
+            auto words1 = bit_util::ToLittleEndian(words[i][word_i + 1]);
+            visited_words[i] = bit_util::FromLittleEndian(
+                (words0 >> offsets[i]) | (words1 << (kBitWidth - offsets[i])));
+          }
+        }
+        visitor(visited_words);
+      }
+      consume((whole_word_count - 1) * kBitWidth);
+
+      SafeLoadWords(bitmaps, 0, kBitWidth, false, &visited_words);
+
+      visitor(visited_words);
+      consume(kBitWidth);
+    }
+
+    // load remaining bits
+    if (bit_length > 0) {
+      SafeLoadWords(bitmaps, 0, bit_length, false, &visited_words);
+      visitor(visited_words);
+    }
+
+    return min_offset;
+  }
+
+  template <size_t N, size_t M, typename ReaderT, typename WriterT, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+  static void RunVisitWordsAndWriteLoop(int64_t bit_length,
+                                        std::array<ReaderT, N>& readers,
+                                        std::array<WriterT, M>& writers,
+                                        Visitor&& visitor) {
+    constexpr int64_t kBitWidth = sizeof(Word) * 8;
+
+    std::array<Word, N> visited_words;
+    std::array<Word, M> output_words;
+
+    // every reader will have same number of words, since they are same length'ed
+    // TODO($JIRA) this will be inefficient in some cases. When there are offsets beyond
+    //  Word boundary, every Word would have to be created from 2 adjoining Words
+    auto n_words = readers[0].words();
+    bit_length -= n_words * kBitWidth;
+    while (n_words--) {
+      // first collect all words to visited_words array
+      for (size_t i = 0; i < N; i++) {
+        visited_words[i] = readers[i].NextWord();
+      }
+      visitor(visited_words, &output_words);
+      for (size_t i = 0; i < M; i++) {
+        writers[i].PutNextWord(output_words[i]);
+      }
+    }
+
+    // every reader will have same number of trailing bytes, because of the above reason
+    // tailing portion could be more than one word! (ref: BitmapWordReader constructor)
+    // remaining full/ partial words to write
+
+    if (bit_length) {
+      // convert the word visitor lambda to a byte_visitor
+      auto byte_visitor = [&](const std::array<uint8_t, N>& in,
+                              std::array<uint8_t, M>* out) {
+        std::array<Word, N> in_words;
+        std::array<Word, M> out_words;
+        std::copy(in.begin(), in.end(), in_words.begin());
+        visitor(in_words, &out_words);
+        for (size_t i = 0; i < M; i++) {
+          out->at(i) = static_cast<uint8_t>(out_words[i]);
+        }
+      };
+
+      std::array<uint8_t, N> visited_bytes;
+      std::array<uint8_t, M> output_bytes;
+      int n_bytes = readers[0].trailing_bytes();
+      while (n_bytes--) {
+        visited_bytes.fill(0);
+        output_bytes.fill(0);
+        int valid_bits;
+        for (size_t i = 0; i < N; i++) {
+          visited_bytes[i] = readers[i].NextTrailingByte(valid_bits);
+        }
+        byte_visitor(visited_bytes, &output_bytes);
+        for (size_t i = 0; i < M; i++) {
+          writers[i].PutNextTrailingByte(output_bytes[i], valid_bits);
+        }
+      }
+    }
+  }
+
+  /// \brief Visit words of bits from each input bitmap as array<Word, N> and collects
+  /// outputs to an array<Word, M>, to be written into the output bitmaps accordingly.
+  ///
+  /// All bitmaps must have identical length. The first bit in a visited bitmap
+  /// may be offset within the first visited word, but words will otherwise contain
+  /// densely packed bits loaded from the bitmap. That offset within the first word is
+  /// returned.
+  /// Visitor is expected to have the following signature
+  ///     [](const std::array<Word, N>& in_words, std::array<Word, M>* out_words){...}
+  ///
+  // NOTE: this function is efficient on 3+ sufficiently large bitmaps.
+  // It also has a large prolog / epilog overhead and should be used
+  // carefully in other cases.
+  // For 2 bitmaps or less, and/or smaller bitmaps, see also VisitTwoBitBlocksVoid
+  // and BitmapUInt64Reader.
+  template <size_t N, size_t M, typename Visitor,
+            typename Word = typename std::decay<
+                internal::call_traits::argument_type<0, Visitor&&>>::type::value_type>
+  static void VisitWordsAndWrite(const std::array<Bitmap, N>& bitmaps_arg,
+                                 std::array<Bitmap, M>* out_bitmaps_arg,
+                                 Visitor&& visitor) {
+    int64_t bit_length = BitLength(bitmaps_arg);
+    assert(bit_length == BitLength(*out_bitmaps_arg));
+
+    // if both input and output bitmaps have no byte offset, then use special template
+    if (std::all_of(bitmaps_arg.begin(), bitmaps_arg.end(),
+                    [](const Bitmap& b) { return b.offset_ % 8 == 0; }) &&
+        std::all_of(out_bitmaps_arg->begin(), out_bitmaps_arg->end(),
+                    [](const Bitmap& b) { return b.offset_ % 8 == 0; })) {
+      std::array<BitmapWordReader<Word, /*may_have_byte_offset=*/false>, N> readers;
+      for (size_t i = 0; i < N; ++i) {
+        const Bitmap& in_bitmap = bitmaps_arg[i];
+        readers[i] = BitmapWordReader<Word, /*may_have_byte_offset=*/false>(
+            in_bitmap.data_, in_bitmap.offset_, in_bitmap.length_);
+      }
+
+      std::array<BitmapWordWriter<Word, /*may_have_byte_offset=*/false>, M> writers;
+      for (size_t i = 0; i < M; ++i) {
+        const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+        writers[i] = BitmapWordWriter<Word, /*may_have_byte_offset=*/false>(
+            out_bitmap.mutable_data_, out_bitmap.offset_, out_bitmap.length_);
+      }
+
+      RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+    } else {
+      std::array<BitmapWordReader<Word>, N> readers;
+      for (size_t i = 0; i < N; ++i) {
+        const Bitmap& in_bitmap = bitmaps_arg[i];
+        readers[i] =
+            BitmapWordReader<Word>(in_bitmap.data_, in_bitmap.offset_, in_bitmap.length_);
+      }
+
+      std::array<BitmapWordWriter<Word>, M> writers;
+      for (size_t i = 0; i < M; ++i) {
+        const Bitmap& out_bitmap = out_bitmaps_arg->at(i);
+        writers[i] = BitmapWordWriter<Word>(out_bitmap.mutable_data_, out_bitmap.offset_,
+                                            out_bitmap.length_);
+      }
+
+      RunVisitWordsAndWriteLoop(bit_length, readers, writers, visitor);
+    }
+  }
+
+  const uint8_t* data() const { return data_; }
+  uint8_t* mutable_data() { return mutable_data_; }
+
+  /// offset of first bit relative to buffer().data()
+  int64_t offset() const { return offset_; }
+
+  /// number of bits in this Bitmap
+  int64_t length() const { return length_; }
+
+  /// span of all bytes which contain any bit in this Bitmap
+  util::span<const uint8_t> bytes() const {
+    auto byte_offset = offset_ / 8;
+    auto byte_count = bit_util::CeilDiv(offset_ + length_, 8) - byte_offset;
+    return {data_ + byte_offset, static_cast<size_t>(byte_count)};
+  }
+
+ private:
+  /// span of all Words which contain any bit in this Bitmap
+  ///
+  /// For example, given Word=uint16_t and a bitmap spanning bits [20, 36)
+  /// words() would span bits [16, 48).
+  ///
+  /// 0       16      32     48     64
+  /// |-------|-------|------|------| (buffer)
+  ///           [       ]             (bitmap)
+  ///         |-------|------|        (returned words)
+  ///
+  /// \warning The words may contain bytes which lie outside the buffer or are
+  /// uninitialized.
+  template <typename Word>
+  util::span<const Word> words() const {
+    auto bytes_addr = reinterpret_cast<intptr_t>(bytes().data());
+    auto words_addr = bytes_addr - bytes_addr % sizeof(Word);
+    auto word_byte_count =
+        bit_util::RoundUpToPowerOf2(static_cast<int64_t>(bytes_addr + bytes().size()),
+                                    static_cast<int64_t>(sizeof(Word))) -
+        words_addr;
+    return {reinterpret_cast<const Word*>(words_addr),
+            static_cast<size_t>(word_byte_count / sizeof(Word))};
+  }
+
+  /// offset of first bit relative to words<Word>().data()
+  template <typename Word>
+  int64_t word_offset() const {
+    return offset_ + 8 * (reinterpret_cast<intptr_t>(data_) -
+                          reinterpret_cast<intptr_t>(words<Word>().data()));
+  }
+
+  /// load words from bitmaps bitwise
+  template <size_t N, typename Word>
+  static void SafeLoadWords(const Bitmap (&bitmaps)[N], int64_t offset,
+                            int64_t out_length, bool set_trailing_bits,
+                            std::array<Word, N>* out) {
+    out->fill(0);
+
+    int64_t out_offset = set_trailing_bits ? sizeof(Word) * 8 - out_length : 0;
+
+    Bitmap slices[N], out_bitmaps[N];
+    for (size_t i = 0; i < N; ++i) {
+      slices[i] = bitmaps[i].Slice(offset, out_length);
+      out_bitmaps[i] = Bitmap(&out->at(i), out_offset, out_length);
+    }
+
+    int64_t bit_i = 0;
+    Bitmap::VisitBits(slices, [&](std::bitset<N> bits) {
+      for (size_t i = 0; i < N; ++i) {
+        out_bitmaps[i].SetBitTo(bit_i, bits[i]);
+      }
+      ++bit_i;
+    });
+  }
+
+  /// assert bitmaps have identical length and return that length
+  static int64_t BitLength(const Bitmap* bitmaps, size_t N);
+
+  template <size_t N>
+  static int64_t BitLength(const std::array<Bitmap, N>& bitmaps) {
+    for (size_t i = 1; i < N; ++i) {
+      assert(bitmaps[i].length() == bitmaps[0].length());
+    }
+    return bitmaps[0].length();
+  }
+
+  const uint8_t* data_ = NULLPTR;
+  uint8_t* mutable_data_ = NULLPTR;
+  int64_t offset_ = 0, length_ = 0;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap_builders.h b/pyarrow/include/arrow/util/bitmap_builders.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bf2edfdcbd69046e060489b7940370d7778a4c6
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap_builders.h
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/span.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief Generate Bitmap with all position to `value` except for one found
+/// at `straggler_pos`.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAllButOne(MemoryPool* pool, int64_t length,
+                                                int64_t straggler_pos, bool value = true);
+
+/// \brief Convert vector of bytes to bitmap buffer
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BytesToBits(util::span<const uint8_t> bytes,
+                                            MemoryPool* pool = default_memory_pool());
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap_generate.h b/pyarrow/include/arrow/util/bitmap_generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..52a1e228e01f1d6c3c37a5e2d49d843f0a4573f9
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap_generate.h
@@ -0,0 +1,112 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/result.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+// A std::generate() like function to write sequential bits into a bitmap area.
+// Bits preceding the bitmap area are preserved, bits following the bitmap
+// area may be clobbered.
+
+template <class Generator>
+void GenerateBits(uint8_t* bitmap, int64_t start_offset, int64_t length, Generator&& g) {
+  if (length == 0) {
+    return;
+  }
+  uint8_t* cur = bitmap + start_offset / 8;
+  uint8_t bit_mask = bit_util::kBitmask[start_offset % 8];
+  uint8_t current_byte = *cur & bit_util::kPrecedingBitmask[start_offset % 8];
+
+  for (int64_t index = 0; index < length; ++index) {
+    const bool bit = g();
+    current_byte = bit ? (current_byte | bit_mask) : current_byte;
+    bit_mask = static_cast<uint8_t>(bit_mask << 1);
+    if (bit_mask == 0) {
+      bit_mask = 1;
+      *cur++ = current_byte;
+      current_byte = 0;
+    }
+  }
+  if (bit_mask != 1) {
+    *cur++ = current_byte;
+  }
+}
+
+// Like GenerateBits(), but unrolls its main loop for higher performance.
+
+template <class Generator>
+void GenerateBitsUnrolled(uint8_t* bitmap, int64_t start_offset, int64_t length,
+                          Generator&& g) {
+  static_assert(std::is_same<decltype(std::declval<Generator>()()), bool>::value,
+                "Functor passed to GenerateBitsUnrolled must return bool");
+
+  if (length == 0) {
+    return;
+  }
+  uint8_t current_byte;
+  uint8_t* cur = bitmap + start_offset / 8;
+  const uint64_t start_bit_offset = start_offset % 8;
+  uint8_t bit_mask = bit_util::kBitmask[start_bit_offset];
+  int64_t remaining = length;
+
+  if (bit_mask != 0x01) {
+    current_byte = *cur & bit_util::kPrecedingBitmask[start_bit_offset];
+    while (bit_mask != 0 && remaining > 0) {
+      current_byte |= g() * bit_mask;
+      bit_mask = static_cast<uint8_t>(bit_mask << 1);
+      --remaining;
+    }
+    *cur++ = current_byte;
+  }
+
+  int64_t remaining_bytes = remaining / 8;
+  uint8_t out_results[8];
+  while (remaining_bytes-- > 0) {
+    for (int i = 0; i < 8; ++i) {
+      out_results[i] = g();
+    }
+    *cur++ = static_cast<uint8_t>(out_results[0] | out_results[1] << 1 |
+                                  out_results[2] << 2 | out_results[3] << 3 |
+                                  out_results[4] << 4 | out_results[5] << 5 |
+                                  out_results[6] << 6 | out_results[7] << 7);
+  }
+
+  int64_t remaining_bits = remaining % 8;
+  if (remaining_bits) {
+    current_byte = 0;
+    bit_mask = 0x01;
+    while (remaining_bits-- > 0) {
+      current_byte |= g() * bit_mask;
+      bit_mask = static_cast<uint8_t>(bit_mask << 1);
+    }
+    *cur++ = current_byte;
+  }
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap_ops.h b/pyarrow/include/arrow/util/bitmap_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac05bc87b30738a9cf875a5d57af833d5b526bdf
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap_ops.h
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+
+namespace internal {
+
+// ----------------------------------------------------------------------
+// Bitmap utilities
+
+/// Copy a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] out_offset bit offset into the output buffer
+///
+/// \return Status message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> CopyBitmap(MemoryPool* pool, const uint8_t* bitmap,
+                                           int64_t offset, int64_t length,
+                                           int64_t out_offset = 0);
+
+/// Copy a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void CopyBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                int64_t dest_offset);
+
+/// Invert a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void InvertBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                  int64_t dest_offset);
+
+/// Invert a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to copy
+///
+/// \return Status message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> InvertBitmap(MemoryPool* pool, const uint8_t* bitmap,
+                                             int64_t offset, int64_t length);
+
+/// Reverse a bit range of an existing bitmap into an existing bitmap
+///
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to reverse
+/// \param[in] dest_offset bit offset into the destination
+/// \param[out] dest the destination buffer, must have at least space for
+/// (offset + length) bits
+ARROW_EXPORT
+void ReverseBitmap(const uint8_t* bitmap, int64_t offset, int64_t length, uint8_t* dest,
+                   int64_t dest_offset);
+
+/// Reverse a bit range of an existing bitmap
+///
+/// \param[in] pool memory pool to allocate memory from
+/// \param[in] bitmap source data
+/// \param[in] offset bit offset into the source data
+/// \param[in] length number of bits to reverse
+///
+/// \return Status message
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> ReverseBitmap(MemoryPool* pool, const uint8_t* bitmap,
+                                              int64_t offset, int64_t length);
+
+/// Compute the number of 1's in the given data array
+///
+/// \param[in] data a packed LSB-ordered bitmap as a byte array
+/// \param[in] bit_offset a bitwise offset into the bitmap
+/// \param[in] length the number of bits to inspect in the bitmap relative to
+/// the offset
+///
+/// \return The number of set (1) bits in the range
+ARROW_EXPORT
+int64_t CountSetBits(const uint8_t* data, int64_t bit_offset, int64_t length);
+
+/// Compute the number of 1's in the result of an "and" (&) of two bitmaps
+///
+/// \param[in] left_bitmap a packed LSB-ordered bitmap as a byte array
+/// \param[in] left_offset a bitwise offset into the left bitmap
+/// \param[in] right_bitmap a packed LSB-ordered bitmap as a byte array
+/// \param[in] right_offset a bitwise offset into the right bitmap
+/// \param[in] length the length of the bitmaps (must be the same)
+///
+/// \return The number of set (1) bits in the "and" of the two bitmaps
+ARROW_EXPORT
+int64_t CountAndSetBits(const uint8_t* left_bitmap, int64_t left_offset,
+                        const uint8_t* right_bitmap, int64_t right_offset,
+                        int64_t length);
+
+ARROW_EXPORT
+bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+                  int64_t right_offset, int64_t length);
+
+// Same as BitmapEquals, but considers a NULL bitmap pointer the same as an
+// all-ones bitmap.
+ARROW_EXPORT
+bool OptionalBitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+                          int64_t right_offset, int64_t length);
+
+ARROW_EXPORT
+bool OptionalBitmapEquals(const std::shared_ptr<Buffer>& left, int64_t left_offset,
+                          const std::shared_ptr<Buffer>& right, int64_t right_offset,
+                          int64_t length);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAnd(MemoryPool* pool, const uint8_t* left,
+                                          int64_t left_offset, const uint8_t* right,
+                                          int64_t right_offset, int64_t length,
+                                          int64_t out_offset);
+
+/// \brief Do a "bitmap and" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out_buffer
+/// starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOr(MemoryPool* pool, const uint8_t* left,
+                                         int64_t left_offset, const uint8_t* right,
+                                         int64_t right_offset, int64_t length,
+                                         int64_t out_offset);
+
+/// \brief Do a "bitmap or" for the given bit length on right and left buffers
+/// starting at their respective bit-offsets and put the results in out
+/// starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOr(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+              int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out_buffer starting at the given bit offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapXor(MemoryPool* pool, const uint8_t* left,
+                                          int64_t left_offset, const uint8_t* right,
+                                          int64_t right_offset, int64_t length,
+                                          int64_t out_offset);
+
+/// \brief Do a "bitmap xor" for the given bit-length on right and left
+/// buffers starting at their respective bit-offsets and put the results in
+/// out starting at the given bit offset.
+ARROW_EXPORT
+void BitmapXor(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+               int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapAndNot(MemoryPool* pool, const uint8_t* left,
+                                             int64_t left_offset, const uint8_t* right,
+                                             int64_t right_offset, int64_t length,
+                                             int64_t out_offset);
+
+/// \brief Do a "bitmap and not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapAndNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+                  int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out_buffer starting at the given bit-offset.
+///
+/// out_buffer will be allocated and initialized to zeros using pool before
+/// the operation.
+ARROW_EXPORT
+Result<std::shared_ptr<Buffer>> BitmapOrNot(MemoryPool* pool, const uint8_t* left,
+                                            int64_t left_offset, const uint8_t* right,
+                                            int64_t right_offset, int64_t length,
+                                            int64_t out_offset);
+
+/// \brief Do a "bitmap or not" on right and left buffers starting at
+/// their respective bit-offsets for the given bit-length and put
+/// the results in out starting at the given bit-offset.
+ARROW_EXPORT
+void BitmapOrNot(const uint8_t* left, int64_t left_offset, const uint8_t* right,
+                 int64_t right_offset, int64_t length, int64_t out_offset, uint8_t* out);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap_reader.h b/pyarrow/include/arrow/util/bitmap_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..83c142c559bb5972f32873ed4bb5b80f457b17c8
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap_reader.h
@@ -0,0 +1,275 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/buffer.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+class BitmapReader {
+ public:
+  BitmapReader() = default;
+
+  BitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    current_byte_ = 0;
+    byte_offset_ = start_offset / 8;
+    bit_offset_ = start_offset % 8;
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_];
+    }
+  }
+
+  bool IsSet() const { return (current_byte_ & (1 << bit_offset_)) != 0; }
+
+  bool IsNotSet() const { return (current_byte_ & (1 << bit_offset_)) == 0; }
+
+  void Next() {
+    ++bit_offset_;
+    ++position_;
+    if (ARROW_PREDICT_FALSE(bit_offset_ == 8)) {
+      bit_offset_ = 0;
+      ++byte_offset_;
+      if (ARROW_PREDICT_TRUE(position_ < length_)) {
+        current_byte_ = bitmap_[byte_offset_];
+      }
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+  int64_t length() const { return length_; }
+
+ private:
+  const uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  int64_t byte_offset_;
+  int64_t bit_offset_;
+};
+
+// XXX Cannot name it BitmapWordReader because the name is already used
+// in bitmap_ops.cc
+
+class BitmapUInt64Reader {
+ public:
+  BitmapUInt64Reader(const uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(util::MakeNonNull(bitmap) + start_offset / 8),
+        num_carry_bits_(8 - start_offset % 8),
+        length_(length),
+        remaining_length_(length_),
+        carry_bits_(0) {
+    if (length_ > 0) {
+      // Load carry bits from the first byte's MSBs
+      if (length_ >= num_carry_bits_) {
+        carry_bits_ =
+            LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), num_carry_bits_);
+      } else {
+        carry_bits_ = LoadPartialWord(static_cast<int8_t>(8 - num_carry_bits_), length_);
+      }
+    }
+  }
+
+  uint64_t NextWord() {
+    if (ARROW_PREDICT_TRUE(remaining_length_ >= 64 + num_carry_bits_)) {
+      // We can load a full word
+      uint64_t next_word = LoadFullWord();
+      // Carry bits come first, then the (64 - num_carry_bits_) LSBs from next_word
+      uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+      carry_bits_ = next_word >> (64 - num_carry_bits_);
+      remaining_length_ -= 64;
+      return word;
+    } else if (remaining_length_ > num_carry_bits_) {
+      // We can load a partial word
+      uint64_t next_word =
+          LoadPartialWord(/*bit_offset=*/0, remaining_length_ - num_carry_bits_);
+      uint64_t word = carry_bits_ | (next_word << num_carry_bits_);
+      carry_bits_ = next_word >> (64 - num_carry_bits_);
+      remaining_length_ = std::max<int64_t>(remaining_length_ - 64, 0);
+      return word;
+    } else {
+      remaining_length_ = 0;
+      return carry_bits_;
+    }
+  }
+
+  int64_t position() const { return length_ - remaining_length_; }
+
+  int64_t length() const { return length_; }
+
+ private:
+  uint64_t LoadFullWord() {
+    uint64_t word;
+    memcpy(&word, bitmap_, 8);
+    bitmap_ += 8;
+    return bit_util::ToLittleEndian(word);
+  }
+
+  uint64_t LoadPartialWord(int8_t bit_offset, int64_t num_bits) {
+    uint64_t word = 0;
+    const int64_t num_bytes = bit_util::BytesForBits(num_bits);
+    memcpy(&word, bitmap_, num_bytes);
+    bitmap_ += num_bytes;
+    return (bit_util::ToLittleEndian(word) >> bit_offset) &
+           bit_util::LeastSignificantBitMask<uint64_t>(num_bits);
+  }
+
+  const uint8_t* bitmap_;
+  const int64_t num_carry_bits_;  // in [1, 8]
+  const int64_t length_;
+  int64_t remaining_length_;
+  uint64_t carry_bits_;
+};
+
+// BitmapWordReader here is faster than BitmapUInt64Reader (in bitmap_reader.h)
+// on sufficiently large inputs.  However, it has a larger prolog / epilog overhead
+// and should probably not be used for small bitmaps.
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordReader {
+ public:
+  BitmapWordReader() = default;
+  BitmapWordReader(const uint8_t* bitmap, int64_t offset, int64_t length)
+      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+        bitmap_(bitmap + offset / 8),
+        bitmap_end_(bitmap_ + bit_util::BytesForBits(offset_ + length)) {
+    // decrement word count by one as we may touch two adjacent words in one iteration
+    nwords_ = length / (sizeof(Word) * 8) - 1;
+    if (nwords_ < 0) {
+      nwords_ = 0;
+    }
+    trailing_bits_ = static_cast<int>(length - nwords_ * sizeof(Word) * 8);
+    trailing_bytes_ = static_cast<int>(bit_util::BytesForBits(trailing_bits_));
+
+    if (nwords_ > 0) {
+      current_data.word_ = load<Word>(bitmap_);
+    } else if (length > 0) {
+      current_data.epi.byte_ = load<uint8_t>(bitmap_);
+    }
+  }
+
+  Word NextWord() {
+    bitmap_ += sizeof(Word);
+    const Word next_word = load<Word>(bitmap_);
+    Word word = current_data.word_;
+    if (may_have_byte_offset && offset_) {
+      // combine two adjacent words into one word
+      // |<------ next ----->|<---- current ---->|
+      // +-------------+-----+-------------+-----+
+      // |     ---     |  A  |      B      | --- |
+      // +-------------+-----+-------------+-----+
+      //                  |         |       offset
+      //                  v         v
+      //               +-----+-------------+
+      //               |  A  |      B      |
+      //               +-----+-------------+
+      //               |<------ word ----->|
+      word >>= offset_;
+      word |= next_word << (sizeof(Word) * 8 - offset_);
+    }
+    current_data.word_ = next_word;
+    return word;
+  }
+
+  uint8_t NextTrailingByte(int& valid_bits) {
+    uint8_t byte;
+    assert(trailing_bits_ > 0);
+
+    if (trailing_bits_ <= 8) {
+      // last byte
+      valid_bits = trailing_bits_;
+      trailing_bits_ = 0;
+      byte = 0;
+      internal::BitmapReader reader(bitmap_, offset_, valid_bits);
+      for (int i = 0; i < valid_bits; ++i) {
+        byte >>= 1;
+        if (reader.IsSet()) {
+          byte |= 0x80;
+        }
+        reader.Next();
+      }
+      byte >>= (8 - valid_bits);
+    } else {
+      ++bitmap_;
+      const uint8_t next_byte = load<uint8_t>(bitmap_);
+      byte = current_data.epi.byte_;
+      if (may_have_byte_offset && offset_) {
+        byte >>= offset_;
+        byte |= next_byte << (8 - offset_);
+      }
+      current_data.epi.byte_ = next_byte;
+      trailing_bits_ -= 8;
+      trailing_bytes_--;
+      valid_bits = 8;
+    }
+    return byte;
+  }
+
+  int64_t words() const { return nwords_; }
+  int trailing_bytes() const { return trailing_bytes_; }
+
+ private:
+  int64_t offset_;
+  const uint8_t* bitmap_;
+
+  const uint8_t* bitmap_end_;
+  int64_t nwords_;
+  int trailing_bits_;
+  int trailing_bytes_;
+  union {
+    Word word_;
+    struct {
+#if ARROW_LITTLE_ENDIAN == 0
+      uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+      uint8_t byte_;
+    } epi;
+  } current_data;
+
+  template <typename DType>
+  DType load(const uint8_t* bitmap) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    return bit_util::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+  }
+};
+
+/// \brief Index into a possibly nonexistent bitmap
+struct OptionalBitIndexer {
+  const uint8_t* bitmap;
+  const int64_t offset;
+
+  explicit OptionalBitIndexer(const uint8_t* buffer = NULLPTR, int64_t offset = 0)
+      : bitmap(buffer), offset(offset) {}
+
+  bool operator[](int64_t i) const {
+    return bitmap == NULLPTR || bit_util::GetBit(bitmap, offset + i);
+  }
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap_visit.h b/pyarrow/include/arrow/util/bitmap_visit.h
new file mode 100644
index 0000000000000000000000000000000000000000..c29589013e4b7863705e1de4cf8c69293451eb8b
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap_visit.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+
+namespace arrow {
+namespace internal {
+
+// A function that visits each bit in a bitmap and calls a visitor function with a
+// boolean representation of that bit. This is intended to be analogous to
+// GenerateBits.
+template <class Visitor>
+void VisitBits(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+               Visitor&& visit) {
+  BitmapReader reader(bitmap, start_offset, length);
+  for (int64_t index = 0; index < length; ++index) {
+    visit(reader.IsSet());
+    reader.Next();
+  }
+}
+
+// Like VisitBits(), but unrolls its main loop for better performance.
+template <class Visitor>
+void VisitBitsUnrolled(const uint8_t* bitmap, int64_t start_offset, int64_t length,
+                       Visitor&& visit) {
+  if (length == 0) {
+    return;
+  }
+
+  // Start by visiting any bits preceding the first full byte.
+  int64_t num_bits_before_full_bytes =
+      bit_util::RoundUpToMultipleOf8(start_offset) - start_offset;
+  // Truncate num_bits_before_full_bytes if it is greater than length.
+  if (num_bits_before_full_bytes > length) {
+    num_bits_before_full_bytes = length;
+  }
+  // Use the non loop-unrolled VisitBits since we don't want to add branches
+  VisitBits<Visitor>(bitmap, start_offset, num_bits_before_full_bytes, visit);
+
+  // Shift the start pointer to the first full byte and compute the
+  // number of full bytes to be read.
+  const uint8_t* first_full_byte = bitmap + bit_util::CeilDiv(start_offset, 8);
+  const int64_t num_full_bytes = (length - num_bits_before_full_bytes) / 8;
+
+  // Iterate over each full byte of the input bitmap and call the visitor in
+  // a loop-unrolled manner.
+  for (int64_t byte_index = 0; byte_index < num_full_bytes; ++byte_index) {
+    // Get the current bit-packed byte value from the bitmap.
+    const uint8_t byte = *(first_full_byte + byte_index);
+
+    // Execute the visitor function on each bit of the current byte.
+    visit(bit_util::GetBitFromByte(byte, 0));
+    visit(bit_util::GetBitFromByte(byte, 1));
+    visit(bit_util::GetBitFromByte(byte, 2));
+    visit(bit_util::GetBitFromByte(byte, 3));
+    visit(bit_util::GetBitFromByte(byte, 4));
+    visit(bit_util::GetBitFromByte(byte, 5));
+    visit(bit_util::GetBitFromByte(byte, 6));
+    visit(bit_util::GetBitFromByte(byte, 7));
+  }
+
+  // Write any leftover bits in the last byte.
+  const int64_t num_bits_after_full_bytes = (length - num_bits_before_full_bytes) % 8;
+  VisitBits<Visitor>(first_full_byte + num_full_bytes, 0, num_bits_after_full_bytes,
+                     visit);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/bitmap_writer.h b/pyarrow/include/arrow/util/bitmap_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..c9ce8012f3eb5a65ec91b1321b687bc0d77f7557
--- /dev/null
+++ b/pyarrow/include/arrow/util/bitmap_writer.h
@@ -0,0 +1,286 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+class BitmapWriter {
+  // A sequential bitwise writer that preserves surrounding bit values.
+
+ public:
+  BitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    byte_offset_ = start_offset / 8;
+    bit_mask_ = bit_util::kBitmask[start_offset % 8];
+    if (length > 0) {
+      current_byte_ = bitmap[byte_offset_];
+    } else {
+      current_byte_ = 0;
+    }
+  }
+
+  void Set() { current_byte_ |= bit_mask_; }
+
+  void Clear() { current_byte_ &= bit_mask_ ^ 0xFF; }
+
+  void Next() {
+    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+    ++position_;
+    if (bit_mask_ == 0) {
+      // Finished this byte, need advancing
+      bit_mask_ = 0x01;
+      bitmap_[byte_offset_++] = current_byte_;
+      if (ARROW_PREDICT_TRUE(position_ < length_)) {
+        current_byte_ = bitmap_[byte_offset_];
+      }
+    }
+  }
+
+  void Finish() {
+    // Store current byte if we didn't went past bitmap storage
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+      bitmap_[byte_offset_] = current_byte_;
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  uint8_t bit_mask_;
+  int64_t byte_offset_;
+};
+
+class FirstTimeBitmapWriter {
+  // Like BitmapWriter, but any bit values *following* the bits written
+  // might be clobbered.  It is hence faster than BitmapWriter, and can
+  // also avoid false positives with Valgrind.
+
+ public:
+  FirstTimeBitmapWriter(uint8_t* bitmap, int64_t start_offset, int64_t length)
+      : bitmap_(bitmap), position_(0), length_(length) {
+    current_byte_ = 0;
+    byte_offset_ = start_offset / 8;
+    bit_mask_ = bit_util::kBitmask[start_offset % 8];
+    if (length > 0) {
+      current_byte_ =
+          bitmap[byte_offset_] & bit_util::kPrecedingBitmask[start_offset % 8];
+    } else {
+      current_byte_ = 0;
+    }
+  }
+
+  /// Appends number_of_bits from word to valid_bits and valid_bits_offset.
+  ///
+  /// \param[in] word The LSB bitmap to append. Any bits past number_of_bits are assumed
+  ///            to be unset (i.e. 0).
+  /// \param[in] number_of_bits The number of bits to append from word.
+  void AppendWord(uint64_t word, int64_t number_of_bits) {
+    if (ARROW_PREDICT_FALSE(number_of_bits == 0)) {
+      return;
+    }
+
+    // Location that the first byte needs to be written to.
+    uint8_t* append_position = bitmap_ + byte_offset_;
+
+    // Update state variables except for current_byte_ here.
+    position_ += number_of_bits;
+    int64_t bit_offset = bit_util::CountTrailingZeros(static_cast<uint32_t>(bit_mask_));
+    bit_mask_ = bit_util::kBitmask[(bit_offset + number_of_bits) % 8];
+    byte_offset_ += (bit_offset + number_of_bits) / 8;
+
+    if (bit_offset != 0) {
+      // We are in the middle of the byte. This code updates the byte and shifts
+      // bits appropriately within word so it can be memcpy'd below.
+      int64_t bits_to_carry = 8 - bit_offset;
+      // Carry over bits from word to current_byte_. We assume any extra bits in word
+      // unset so no additional accounting is needed for when number_of_bits <
+      // bits_to_carry.
+      current_byte_ |= (word & bit_util::kPrecedingBitmask[bits_to_carry]) << bit_offset;
+      // Check if everything is transferred into current_byte_.
+      if (ARROW_PREDICT_FALSE(number_of_bits < bits_to_carry)) {
+        return;
+      }
+      *append_position = current_byte_;
+      append_position++;
+      // Move the carry bits off of word.
+      word = word >> bits_to_carry;
+      number_of_bits -= bits_to_carry;
+    }
+    word = bit_util::ToLittleEndian(word);
+    int64_t bytes_for_word = ::arrow::bit_util::BytesForBits(number_of_bits);
+    std::memcpy(append_position, &word, bytes_for_word);
+    // At this point, the previous current_byte_ has been written to bitmap_.
+    // The new current_byte_ is either the last relevant byte in 'word'
+    // or cleared if the new position is byte aligned (i.e. a fresh byte).
+    if (bit_mask_ == 0x1) {
+      current_byte_ = 0;
+    } else {
+      current_byte_ = *(append_position + bytes_for_word - 1);
+    }
+  }
+
+  void Set() { current_byte_ |= bit_mask_; }
+
+  void Clear() {}
+
+  void Next() {
+    bit_mask_ = static_cast<uint8_t>(bit_mask_ << 1);
+    ++position_;
+    if (bit_mask_ == 0) {
+      // Finished this byte, need advancing
+      bit_mask_ = 0x01;
+      bitmap_[byte_offset_++] = current_byte_;
+      current_byte_ = 0;
+    }
+  }
+
+  void Finish() {
+    // Store current byte if we didn't went go bitmap storage
+    if (length_ > 0 && (bit_mask_ != 0x01 || position_ < length_)) {
+      bitmap_[byte_offset_] = current_byte_;
+    }
+  }
+
+  int64_t position() const { return position_; }
+
+ private:
+  uint8_t* bitmap_;
+  int64_t position_;
+  int64_t length_;
+
+  uint8_t current_byte_;
+  uint8_t bit_mask_;
+  int64_t byte_offset_;
+};
+
+template <typename Word, bool may_have_byte_offset = true>
+class BitmapWordWriter {
+ public:
+  BitmapWordWriter() = default;
+  BitmapWordWriter(uint8_t* bitmap, int64_t offset, int64_t length)
+      : offset_(static_cast<int64_t>(may_have_byte_offset) * (offset % 8)),
+        bitmap_(bitmap + offset / 8),
+        bitmap_end_(bitmap_ + bit_util::BytesForBits(offset_ + length)),
+        mask_((1U << offset_) - 1) {
+    if (offset_) {
+      if (length >= static_cast<int>(sizeof(Word) * 8)) {
+        current_data.word_ = load<Word>(bitmap_);
+      } else if (length > 0) {
+        current_data.epi.byte_ = load<uint8_t>(bitmap_);
+      }
+    }
+  }
+
+  void PutNextWord(Word word) {
+    if (may_have_byte_offset && offset_) {
+      // split one word into two adjacent words, don't touch unused bits
+      //               |<------ word ----->|
+      //               +-----+-------------+
+      //               |  A  |      B      |
+      //               +-----+-------------+
+      //                  |         |
+      //                  v         v       offset
+      // +-------------+-----+-------------+-----+
+      // |     ---     |  A  |      B      | --- |
+      // +-------------+-----+-------------+-----+
+      // |<------ next ----->|<---- current ---->|
+      word = (word << offset_) | (word >> (sizeof(Word) * 8 - offset_));
+      Word next_word = load<Word>(bitmap_ + sizeof(Word));
+      current_data.word_ = (current_data.word_ & mask_) | (word & ~mask_);
+      next_word = (next_word & ~mask_) | (word & mask_);
+      store<Word>(bitmap_, current_data.word_);
+      store<Word>(bitmap_ + sizeof(Word), next_word);
+      current_data.word_ = next_word;
+    } else {
+      store<Word>(bitmap_, word);
+    }
+    bitmap_ += sizeof(Word);
+  }
+
+  void PutNextTrailingByte(uint8_t byte, int valid_bits) {
+    if (valid_bits == 8) {
+      if (may_have_byte_offset && offset_) {
+        byte = (byte << offset_) | (byte >> (8 - offset_));
+        uint8_t next_byte = load<uint8_t>(bitmap_ + 1);
+        current_data.epi.byte_ = (current_data.epi.byte_ & mask_) | (byte & ~mask_);
+        next_byte = (next_byte & ~mask_) | (byte & mask_);
+        store<uint8_t>(bitmap_, current_data.epi.byte_);
+        store<uint8_t>(bitmap_ + 1, next_byte);
+        current_data.epi.byte_ = next_byte;
+      } else {
+        store<uint8_t>(bitmap_, byte);
+      }
+      ++bitmap_;
+    } else {
+      assert(valid_bits > 0);
+      assert(valid_bits < 8);
+      assert(bitmap_ + bit_util::BytesForBits(offset_ + valid_bits) <= bitmap_end_);
+      internal::BitmapWriter writer(bitmap_, offset_, valid_bits);
+      for (int i = 0; i < valid_bits; ++i) {
+        (byte & 0x01) ? writer.Set() : writer.Clear();
+        writer.Next();
+        byte >>= 1;
+      }
+      writer.Finish();
+    }
+  }
+
+ private:
+  int64_t offset_;
+  uint8_t* bitmap_;
+
+  const uint8_t* bitmap_end_;
+  uint64_t mask_;
+  union {
+    Word word_;
+    struct {
+#if ARROW_LITTLE_ENDIAN == 0
+      uint8_t padding_bytes_[sizeof(Word) - 1];
+#endif
+      uint8_t byte_;
+    } epi;
+  } current_data;
+
+  template <typename DType>
+  DType load(const uint8_t* bitmap) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    return bit_util::ToLittleEndian(util::SafeLoadAs<DType>(bitmap));
+  }
+
+  template <typename DType>
+  void store(uint8_t* bitmap, DType data) {
+    assert(bitmap + sizeof(DType) <= bitmap_end_);
+    util::SafeStore(bitmap, bit_util::FromLittleEndian(data));
+  }
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/byte_size.h b/pyarrow/include/arrow/util/byte_size.h
new file mode 100644
index 0000000000000000000000000000000000000000..214c7551b6c76bc95a7d71eb8b8c31bd96d4b838
--- /dev/null
+++ b/pyarrow/include/arrow/util/byte_size.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+
+namespace util {
+
+/// \brief The sum of bytes in each buffer referenced by the array
+///
+/// Note: An array may only reference a portion of a buffer.
+///       This method will overestimate in this case and return the
+///       byte size of the entire buffer.
+/// Note: If a buffer is referenced multiple times then it will
+///       only be counted once.
+ARROW_EXPORT int64_t TotalBufferSize(const ArrayData& array_data);
+/// \brief The sum of bytes in each buffer referenced by the array
+/// \see TotalBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT int64_t TotalBufferSize(const Array& array);
+/// \brief The sum of bytes in each buffer referenced by the array
+/// \see TotalBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT int64_t TotalBufferSize(const ChunkedArray& chunked_array);
+/// \brief The sum of bytes in each buffer referenced by the batch
+/// \see TotalBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT int64_t TotalBufferSize(const RecordBatch& record_batch);
+/// \brief The sum of bytes in each buffer referenced by the table
+/// \see TotalBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT int64_t TotalBufferSize(const Table& table);
+
+/// \brief Calculate the buffer ranges referenced by the array
+///
+/// These ranges will take into account array offsets
+///
+/// The ranges may contain duplicates
+///
+/// Dictionary arrays will ignore the offset of their containing array
+///
+/// The return value will be a struct array corresponding to the schema:
+/// schema({field("start", uint64()), field("offset", uint64()), field("length",
+/// uint64()))
+ARROW_EXPORT Result<std::shared_ptr<Array>> ReferencedRanges(const ArrayData& array_data);
+
+/// \brief Returns the sum of bytes from all buffer ranges referenced
+///
+/// Unlike TotalBufferSize this method will account for array
+/// offsets.
+///
+/// If buffers are shared between arrays then the shared
+/// portion will be counted multiple times.
+///
+/// Dictionary arrays will always be counted in their entirety
+/// even if the array only references a portion of the dictionary.
+ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const ArrayData& array_data);
+/// \brief Returns the sum of bytes from all buffer ranges referenced
+/// \see ReferencedBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const Array& array_data);
+/// \brief Returns the sum of bytes from all buffer ranges referenced
+/// \see ReferencedBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const ChunkedArray& array_data);
+/// \brief Returns the sum of bytes from all buffer ranges referenced
+/// \see ReferencedBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const RecordBatch& array_data);
+/// \brief Returns the sum of bytes from all buffer ranges referenced
+/// \see ReferencedBufferSize(const ArrayData& array_data) for details
+ARROW_EXPORT Result<int64_t> ReferencedBufferSize(const Table& array_data);
+
+}  // namespace util
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/cancel.h b/pyarrow/include/arrow/util/cancel.h
new file mode 100644
index 0000000000000000000000000000000000000000..863b2c4e8ca902b24d1c3d6fb26cbe46ad434cfd
--- /dev/null
+++ b/pyarrow/include/arrow/util/cancel.h
@@ -0,0 +1,118 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class StopToken;
+
+struct StopSourceImpl;
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopSource {
+ public:
+  StopSource();
+  ~StopSource();
+
+  // Consumer API (the side that stops)
+  void RequestStop();
+  void RequestStop(Status error);
+  // Async-signal-safe. TODO Deprecate this?
+  void RequestStopFromSignal(int signum);
+
+  StopToken token();
+
+  // For internal use only
+  void Reset();
+
+ protected:
+  std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL
+class ARROW_EXPORT StopToken {
+ public:
+  // Public for Cython
+  StopToken() = default;
+
+  explicit StopToken(std::shared_ptr<StopSourceImpl> impl) : impl_(std::move(impl)) {}
+
+  // A trivial token that never propagates any stop request
+  static StopToken Unstoppable() { return StopToken(); }
+
+  /// \brief Check if the stop source has been cancelled.
+  ///
+  /// Producers should call this method, whenever convenient, to check and
+  /// see if they should stop producing early (i.e. have been cancelled).
+  /// Failure to call this method often enough will lead to an unresponsive
+  /// cancellation.
+  ///
+  /// This is part of the producer API (the side that gets asked to stop)
+  /// This method is thread-safe
+  ///
+  /// \return An OK status if the stop source has not been cancelled or a
+  ///         cancel error if the source has been cancelled.
+  Status Poll() const;
+  bool IsStopRequested() const;
+
+ protected:
+  std::shared_ptr<StopSourceImpl> impl_;
+};
+
+/// EXPERIMENTAL: Set a global StopSource that can receive signals
+///
+/// The only allowed order of calls is the following:
+/// - SetSignalStopSource()
+/// - any number of pairs of (RegisterCancellingSignalHandler,
+///   UnregisterCancellingSignalHandler) calls
+/// - ResetSignalStopSource()
+///
+/// Beware that these settings are process-wide.  Typically, only one
+/// thread should call these APIs, even in a multithreaded setting.
+ARROW_EXPORT
+Result<StopSource*> SetSignalStopSource();
+
+/// EXPERIMENTAL: Reset the global signal-receiving StopSource
+///
+/// This will invalidate the pointer returned by SetSignalStopSource.
+ARROW_EXPORT
+void ResetSignalStopSource();
+
+/// EXPERIMENTAL: Register signal handler triggering the signal-receiving StopSource
+///
+/// Note that those handlers are automatically un-registered in a fork()ed process,
+/// therefore the child process will need to call RegisterCancellingSignalHandler()
+/// if desired.
+ARROW_EXPORT
+Status RegisterCancellingSignalHandler(const std::vector<int>& signals);
+
+/// EXPERIMENTAL: Unregister signal handler set up by RegisterCancellingSignalHandler
+ARROW_EXPORT
+void UnregisterCancellingSignalHandler();
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/checked_cast.h b/pyarrow/include/arrow/util/checked_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..97f6b61a1f8cebd297a5f4a8fe4401b6073de45f
--- /dev/null
+++ b/pyarrow/include/arrow/util/checked_cast.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+namespace arrow {
+namespace internal {
+
+template <typename OutputType, typename InputType>
+inline OutputType checked_cast(InputType&& value) {
+  static_assert(std::is_class<typename std::remove_pointer<
+                    typename std::remove_reference<InputType>::type>::type>::value,
+                "checked_cast input type must be a class");
+  static_assert(std::is_class<typename std::remove_pointer<
+                    typename std::remove_reference<OutputType>::type>::type>::value,
+                "checked_cast output type must be a class");
+#ifdef NDEBUG
+  return static_cast<OutputType>(value);
+#else
+  return dynamic_cast<OutputType>(value);
+#endif
+}
+
+template <class T, class U>
+std::shared_ptr<T> checked_pointer_cast(std::shared_ptr<U> r) noexcept {
+#ifdef NDEBUG
+  return std::static_pointer_cast<T>(std::move(r));
+#else
+  return std::dynamic_pointer_cast<T>(std::move(r));
+#endif
+}
+
+template <class T, class U>
+std::unique_ptr<T> checked_pointer_cast(std::unique_ptr<U> r) noexcept {
+#ifdef NDEBUG
+  return std::unique_ptr<T>(static_cast<T*>(r.release()));
+#else
+  return std::unique_ptr<T>(dynamic_cast<T*>(r.release()));
+#endif
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/compare.h b/pyarrow/include/arrow/util/compare.h
new file mode 100644
index 0000000000000000000000000000000000000000..0594b6002ff573afcb420b260c921a78277c9daf
--- /dev/null
+++ b/pyarrow/include/arrow/util/compare.h
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+/// CRTP helper for declaring equality comparison. Defines operator== and operator!=
+template <typename T>
+class EqualityComparable {
+ public:
+  ~EqualityComparable() {
+    static_assert(
+        std::is_same<decltype(std::declval<const T>().Equals(std::declval<const T>())),
+                     bool>::value,
+        "EqualityComparable depends on the method T::Equals(const T&) const");
+  }
+
+  template <typename... Extra>
+  bool Equals(const std::shared_ptr<T>& other, Extra&&... extra) const {
+    if (other == NULLPTR) {
+      return false;
+    }
+    return cast().Equals(*other, std::forward<Extra>(extra)...);
+  }
+
+  struct PtrsEqual {
+    bool operator()(const std::shared_ptr<T>& l, const std::shared_ptr<T>& r) const {
+      return l->Equals(*r);
+    }
+  };
+
+  friend bool operator==(T const& a, T const& b) { return a.Equals(b); }
+  friend bool operator!=(T const& a, T const& b) { return !(a == b); }
+
+ private:
+  const T& cast() const { return static_cast<const T&>(*this); }
+};
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/compression.h b/pyarrow/include/arrow/util/compression.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7bf4d5e12d02d349c3a0e0fce43f6be5ef4d585
--- /dev/null
+++ b/pyarrow/include/arrow/util/compression.h
@@ -0,0 +1,241 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+constexpr int kUseDefaultCompressionLevel = std::numeric_limits<int>::min();
+
+/// \brief Streaming compressor interface
+///
+class ARROW_EXPORT Compressor {
+ public:
+  virtual ~Compressor() = default;
+
+  struct CompressResult {
+    int64_t bytes_read;
+    int64_t bytes_written;
+  };
+  struct FlushResult {
+    int64_t bytes_written;
+    bool should_retry;
+  };
+  struct EndResult {
+    int64_t bytes_written;
+    bool should_retry;
+  };
+
+  /// \brief Compress some input.
+  ///
+  /// If bytes_read is 0 on return, then a larger output buffer should be supplied.
+  virtual Result<CompressResult> Compress(int64_t input_len, const uint8_t* input,
+                                          int64_t output_len, uint8_t* output) = 0;
+
+  /// \brief Flush part of the compressed output.
+  ///
+  /// If should_retry is true on return, Flush() should be called again
+  /// with a larger buffer.
+  virtual Result<FlushResult> Flush(int64_t output_len, uint8_t* output) = 0;
+
+  /// \brief End compressing, doing whatever is necessary to end the stream.
+  ///
+  /// If should_retry is true on return, End() should be called again
+  /// with a larger buffer.  Otherwise, the Compressor should not be used anymore.
+  ///
+  /// End() implies Flush().
+  virtual Result<EndResult> End(int64_t output_len, uint8_t* output) = 0;
+
+  // XXX add methods for buffer size heuristics?
+};
+
+/// \brief Streaming decompressor interface
+///
+class ARROW_EXPORT Decompressor {
+ public:
+  virtual ~Decompressor() = default;
+
+  struct DecompressResult {
+    // XXX is need_more_output necessary? (Brotli?)
+    int64_t bytes_read;
+    int64_t bytes_written;
+    bool need_more_output;
+  };
+
+  /// \brief Decompress some input.
+  ///
+  /// If need_more_output is true on return, a larger output buffer needs
+  /// to be supplied.
+  virtual Result<DecompressResult> Decompress(int64_t input_len, const uint8_t* input,
+                                              int64_t output_len, uint8_t* output) = 0;
+
+  /// \brief Return whether the compressed stream is finished.
+  ///
+  /// This is a heuristic.  If true is returned, then it is guaranteed
+  /// that the stream is finished.  If false is returned, however, it may
+  /// simply be that the underlying library isn't able to provide the information.
+  virtual bool IsFinished() = 0;
+
+  /// \brief Reinitialize decompressor, making it ready for a new compressed stream.
+  virtual Status Reset() = 0;
+
+  // XXX add methods for buffer size heuristics?
+};
+
+/// \brief Compression codec options
+class ARROW_EXPORT CodecOptions {
+ public:
+  explicit CodecOptions(int compression_level = kUseDefaultCompressionLevel)
+      : compression_level(compression_level) {}
+
+  virtual ~CodecOptions() = default;
+
+  int compression_level;
+};
+
+// ----------------------------------------------------------------------
+// GZip codec options implementation
+
+enum class GZipFormat {
+  ZLIB,
+  DEFLATE,
+  GZIP,
+};
+
+class ARROW_EXPORT GZipCodecOptions : public CodecOptions {
+ public:
+  GZipFormat gzip_format = GZipFormat::GZIP;
+  std::optional<int> window_bits;
+};
+
+// ----------------------------------------------------------------------
+// brotli codec options implementation
+
+class ARROW_EXPORT BrotliCodecOptions : public CodecOptions {
+ public:
+  std::optional<int> window_bits;
+};
+
+/// \brief Compression codec
+class ARROW_EXPORT Codec {
+ public:
+  virtual ~Codec() = default;
+
+  /// \brief Return special value to indicate that a codec implementation
+  /// should use its default compression level
+  static int UseDefaultCompressionLevel();
+
+  /// \brief Return a string name for compression type
+  static const std::string& GetCodecAsString(Compression::type t);
+
+  /// \brief Return compression type for name (all lower case)
+  static Result<Compression::type> GetCompressionType(const std::string& name);
+
+  /// \brief Create a codec for the given compression algorithm with CodecOptions
+  static Result<std::unique_ptr<Codec>> Create(
+      Compression::type codec, const CodecOptions& codec_options = CodecOptions{});
+
+  /// \brief Create a codec for the given compression algorithm
+  static Result<std::unique_ptr<Codec>> Create(Compression::type codec,
+                                               int compression_level);
+
+  /// \brief Return true if support for indicated codec has been enabled
+  static bool IsAvailable(Compression::type codec);
+
+  /// \brief Return true if indicated codec supports setting a compression level
+  static bool SupportsCompressionLevel(Compression::type codec);
+
+  /// \brief Return the smallest supported compression level for the codec
+  /// Note: This function creates a temporary Codec instance
+  static Result<int> MinimumCompressionLevel(Compression::type codec);
+
+  /// \brief Return the largest supported compression level for the codec
+  /// Note: This function creates a temporary Codec instance
+  static Result<int> MaximumCompressionLevel(Compression::type codec);
+
+  /// \brief Return the default compression level
+  /// Note: This function creates a temporary Codec instance
+  static Result<int> DefaultCompressionLevel(Compression::type codec);
+
+  /// \brief Return the smallest supported compression level
+  virtual int minimum_compression_level() const = 0;
+
+  /// \brief Return the largest supported compression level
+  virtual int maximum_compression_level() const = 0;
+
+  /// \brief Return the default compression level
+  virtual int default_compression_level() const = 0;
+
+  /// \brief One-shot decompression function
+  ///
+  /// output_buffer_len must be correct and therefore be obtained in advance.
+  /// The actual decompressed length is returned.
+  ///
+  /// \note One-shot decompression is not always compatible with streaming
+  /// compression.  Depending on the codec (e.g. LZ4), different formats may
+  /// be used.
+  virtual Result<int64_t> Decompress(int64_t input_len, const uint8_t* input,
+                                     int64_t output_buffer_len,
+                                     uint8_t* output_buffer) = 0;
+
+  /// \brief One-shot compression function
+  ///
+  /// output_buffer_len must first have been computed using MaxCompressedLen().
+  /// The actual compressed length is returned.
+  ///
+  /// \note One-shot compression is not always compatible with streaming
+  /// decompression.  Depending on the codec (e.g. LZ4), different formats may
+  /// be used.
+  virtual Result<int64_t> Compress(int64_t input_len, const uint8_t* input,
+                                   int64_t output_buffer_len, uint8_t* output_buffer) = 0;
+
+  virtual int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) = 0;
+
+  /// \brief Create a streaming compressor instance
+  virtual Result<std::shared_ptr<Compressor>> MakeCompressor() = 0;
+
+  /// \brief Create a streaming compressor instance
+  virtual Result<std::shared_ptr<Decompressor>> MakeDecompressor() = 0;
+
+  /// \brief This Codec's compression type
+  virtual Compression::type compression_type() const = 0;
+
+  /// \brief The name of this Codec's compression type
+  const std::string& name() const { return GetCodecAsString(compression_type()); }
+
+  /// \brief This Codec's compression level, if applicable
+  virtual int compression_level() const { return UseDefaultCompressionLevel(); }
+
+ private:
+  /// \brief Initializes the codec's resources.
+  virtual Status Init();
+};
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/concurrent_map.h b/pyarrow/include/arrow/util/concurrent_map.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff1584552a8ffc77fa518002bd285795ec0d1408
--- /dev/null
+++ b/pyarrow/include/arrow/util/concurrent_map.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+#include <utility>
+
+#include "arrow/util/mutex.h"
+
+namespace arrow {
+namespace util {
+
+template <typename K, typename V>
+class ConcurrentMap {
+ public:
+  void Insert(const K& key, const V& value) {
+    auto lock = mutex_.Lock();
+    map_.insert({key, value});
+  }
+
+  template <typename ValueFunc>
+  V GetOrInsert(const K& key, ValueFunc&& compute_value_func) {
+    auto lock = mutex_.Lock();
+    auto it = map_.find(key);
+    if (it == map_.end()) {
+      auto pair = map_.emplace(key, compute_value_func());
+      it = pair.first;
+    }
+    return it->second;
+  }
+
+  void Erase(const K& key) {
+    auto lock = mutex_.Lock();
+    map_.erase(key);
+  }
+
+  void Clear() {
+    auto lock = mutex_.Lock();
+    map_.clear();
+  }
+
+  size_t size() const {
+    auto lock = mutex_.Lock();
+    return map_.size();
+  }
+
+ private:
+  std::unordered_map<K, V> map_;
+  mutable arrow::util::Mutex mutex_;
+};
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/config.h b/pyarrow/include/arrow/util/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..2e4a0ea7f4a08d271e127936a9f9f0ebaf1af5e2
--- /dev/null
+++ b/pyarrow/include/arrow/util/config.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define ARROW_VERSION_MAJOR 23
+#define ARROW_VERSION_MINOR 0
+#define ARROW_VERSION_PATCH 1
+#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
+
+#define ARROW_VERSION_STRING "23.0.1"
+
+#define ARROW_SO_VERSION "2300"
+#define ARROW_FULL_SO_VERSION "2300.1.0"
+
+#define ARROW_CXX_COMPILER_ID "GNU"
+#define ARROW_CXX_COMPILER_VERSION "14.2.1"
+
+#define ARROW_BUILD_TYPE "RELEASE"
+
+#define ARROW_PACKAGE_KIND "python-wheel-manylinux228"
+
+#define ARROW_COMPUTE
+#define ARROW_CSV
+/* #undef ARROW_CUDA */
+#define ARROW_DATASET
+#define ARROW_FILESYSTEM
+#define ARROW_FLIGHT
+/* #undef ARROW_FLIGHT_SQL */
+#define ARROW_IPC
+#define ARROW_JEMALLOC
+#define ARROW_JEMALLOC_VENDORED
+#define ARROW_JSON
+#define ARROW_MIMALLOC
+#define ARROW_ORC
+#define ARROW_PARQUET
+#define ARROW_SUBSTRAIT
+
+#define ARROW_AZURE
+#define ARROW_ENABLE_THREADING
+#define ARROW_GCS
+#define ARROW_HDFS
+#define ARROW_S3
+/* #undef ARROW_USE_GLOG */
+#define ARROW_USE_NATIVE_INT128
+#define ARROW_WITH_BROTLI
+#define ARROW_WITH_BZ2
+#define ARROW_WITH_LZ4
+/* #undef ARROW_WITH_MUSL */
+/* #undef ARROW_WITH_OPENTELEMETRY */
+#define ARROW_WITH_RE2
+#define ARROW_WITH_SNAPPY
+#define ARROW_WITH_UTF8PROC
+#define ARROW_WITH_ZLIB
+#define ARROW_WITH_ZSTD
+#define PARQUET_REQUIRE_ENCRYPTION
diff --git a/pyarrow/include/arrow/util/converter.h b/pyarrow/include/arrow/util/converter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c23d6ccd9886e4539d52d537abb85da1dcc93385
--- /dev/null
+++ b/pyarrow/include/arrow/util/converter.h
@@ -0,0 +1,411 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/chunked_array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/visit_type_inline.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+static Result<std::unique_ptr<BaseConverter>> MakeConverter(
+    std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options,
+    MemoryPool* pool);
+
+template <typename Input, typename Options>
+class Converter {
+ public:
+  using Self = Converter<Input, Options>;
+  using InputType = Input;
+  using OptionsType = Options;
+
+  virtual ~Converter() = default;
+
+  Status Construct(std::shared_ptr<DataType> type, OptionsType options,
+                   MemoryPool* pool) {
+    type_ = std::move(type);
+    options_ = std::move(options);
+    return Init(pool);
+  }
+
+  virtual Status Append(InputType value) { return Status::NotImplemented("Append"); }
+
+  virtual Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    return Status::NotImplemented("Extend");
+  }
+
+  virtual Status ExtendMasked(InputType values, InputType mask, int64_t size,
+                              int64_t offset = 0) {
+    return Status::NotImplemented("ExtendMasked");
+  }
+
+  const std::shared_ptr<ArrayBuilder>& builder() const { return builder_; }
+
+  const std::shared_ptr<DataType>& type() const { return type_; }
+
+  OptionsType options() const { return options_; }
+
+  bool may_overflow() const { return may_overflow_; }
+
+  bool rewind_on_overflow() const { return rewind_on_overflow_; }
+
+  virtual Status Reserve(int64_t additional_capacity) {
+    return builder_->Reserve(additional_capacity);
+  }
+
+  Status AppendNull() { return builder_->AppendNull(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray() { return builder_->Finish(); }
+
+  virtual Result<std::shared_ptr<Array>> ToArray(int64_t length) {
+    ARROW_ASSIGN_OR_RAISE(auto arr, this->ToArray());
+    return arr->Slice(0, length);
+  }
+
+  virtual Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() {
+    ARROW_ASSIGN_OR_RAISE(auto array, ToArray());
+    std::vector<std::shared_ptr<Array>> chunks = {std::move(array)};
+    return std::make_shared<ChunkedArray>(chunks);
+  }
+
+ protected:
+  virtual Status Init(MemoryPool* pool) { return Status::OK(); }
+
+  std::shared_ptr<DataType> type_;
+  std::shared_ptr<ArrayBuilder> builder_;
+  OptionsType options_;
+  bool may_overflow_ = false;
+  bool rewind_on_overflow_ = false;
+};
+
+template <typename ArrowType, typename BaseConverter>
+class PrimitiveConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+
+ protected:
+  Status Init(MemoryPool* pool) override {
+    this->builder_ = std::make_shared<BuilderType>(this->type_, pool);
+    // Narrow variable-sized binary types may overflow
+    this->may_overflow_ = is_binary_like(this->type_->id());
+    primitive_type_ = checked_cast<const ArrowType*>(this->type_.get());
+    primitive_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+  const ArrowType* primitive_type_;
+  BuilderType* primitive_builder_;
+};
+
+template <typename ArrowType, typename BaseConverter,
+          template <typename...> class ConverterTrait>
+class ListConverter : public BaseConverter {
+ public:
+  using BuilderType = typename TypeTraits<ArrowType>::BuilderType;
+  using ConverterType = typename ConverterTrait<ArrowType>::type;
+
+ protected:
+  Status Init(MemoryPool* pool) override {
+    list_type_ = checked_cast<const ArrowType*>(this->type_.get());
+    ARROW_ASSIGN_OR_RAISE(value_converter_,
+                          (MakeConverter<BaseConverter, ConverterTrait>(
+                              list_type_->value_type(), this->options_, pool)));
+    this->builder_ =
+        std::make_shared<BuilderType>(pool, value_converter_->builder(), this->type_);
+    list_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    // Narrow list types may overflow
+    this->may_overflow_ = this->rewind_on_overflow_ =
+        sizeof(typename ArrowType::offset_type) < sizeof(int64_t);
+    return Status::OK();
+  }
+
+  const ArrowType* list_type_;
+  BuilderType* list_builder_;
+  std::unique_ptr<BaseConverter> value_converter_;
+};
+
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+class StructConverter : public BaseConverter {
+ public:
+  using ConverterType = typename ConverterTrait<StructType>::type;
+
+  Status Reserve(int64_t additional_capacity) override {
+    ARROW_RETURN_NOT_OK(this->builder_->Reserve(additional_capacity));
+    for (const auto& child : children_) {
+      ARROW_RETURN_NOT_OK(child->Reserve(additional_capacity));
+    }
+    return Status::OK();
+  }
+
+ protected:
+  Status Init(MemoryPool* pool) override {
+    std::unique_ptr<BaseConverter> child_converter;
+    std::vector<std::shared_ptr<ArrayBuilder>> child_builders;
+
+    struct_type_ = checked_cast<const StructType*>(this->type_.get());
+    for (const auto& field : struct_type_->fields()) {
+      ARROW_ASSIGN_OR_RAISE(child_converter,
+                            (MakeConverter<BaseConverter, ConverterTrait>(
+                                field->type(), this->options_, pool)));
+      this->may_overflow_ |= child_converter->may_overflow();
+      this->rewind_on_overflow_ = this->may_overflow_;
+      child_builders.push_back(child_converter->builder());
+      children_.push_back(std::move(child_converter));
+    }
+
+    this->builder_ =
+        std::make_shared<StructBuilder>(this->type_, pool, std::move(child_builders));
+    struct_builder_ = checked_cast<StructBuilder*>(this->builder_.get());
+
+    return Status::OK();
+  }
+
+  const StructType* struct_type_;
+  StructBuilder* struct_builder_;
+  std::vector<std::unique_ptr<BaseConverter>> children_;
+};
+
+template <typename ValueType, typename BaseConverter>
+class DictionaryConverter : public BaseConverter {
+ public:
+  using BuilderType = DictionaryBuilder<ValueType>;
+
+ protected:
+  Status Init(MemoryPool* pool) override {
+    std::unique_ptr<ArrayBuilder> builder;
+    ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, this->type_, NULLPTR, &builder));
+    this->builder_ = std::move(builder);
+    this->may_overflow_ = false;
+    dict_type_ = checked_cast<const DictionaryType*>(this->type_.get());
+    value_type_ = checked_cast<const ValueType*>(dict_type_->value_type().get());
+    value_builder_ = checked_cast<BuilderType*>(this->builder_.get());
+    return Status::OK();
+  }
+
+  const DictionaryType* dict_type_;
+  const ValueType* value_type_;
+  BuilderType* value_builder_;
+};
+
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+struct MakeConverterImpl {
+  template <typename T, typename ConverterType = typename ConverterTrait<T>::type>
+  Status Visit(const T&) {
+    out.reset(new ConverterType());
+    return out->Construct(std::move(type), std::move(options), pool);
+  }
+
+  Status Visit(const DictionaryType& t) {
+    switch (t.value_type()->id()) {
+#define DICTIONARY_CASE(TYPE)                                                       \
+  case TYPE::type_id:                                                               \
+    out = std::make_unique<                                                         \
+        typename ConverterTrait<DictionaryType>::template dictionary_type<TYPE>>(); \
+    break;
+      DICTIONARY_CASE(BooleanType);
+      DICTIONARY_CASE(Int8Type);
+      DICTIONARY_CASE(Int16Type);
+      DICTIONARY_CASE(Int32Type);
+      DICTIONARY_CASE(Int64Type);
+      DICTIONARY_CASE(UInt8Type);
+      DICTIONARY_CASE(UInt16Type);
+      DICTIONARY_CASE(UInt32Type);
+      DICTIONARY_CASE(UInt64Type);
+      DICTIONARY_CASE(FloatType);
+      DICTIONARY_CASE(DoubleType);
+      DICTIONARY_CASE(BinaryType);
+      DICTIONARY_CASE(StringType);
+      DICTIONARY_CASE(FixedSizeBinaryType);
+#undef DICTIONARY_CASE
+      default:
+        return Status::NotImplemented("DictionaryArray converter for type ", t.ToString(),
+                                      " not implemented");
+    }
+    return out->Construct(std::move(type), std::move(options), pool);
+  }
+
+  Status Visit(const DataType& t) { return Status::NotImplemented(t.name()); }
+
+  std::shared_ptr<DataType> type;
+  typename BaseConverter::OptionsType options;
+  MemoryPool* pool;
+  std::unique_ptr<BaseConverter> out;
+};
+
+template <typename BaseConverter, template <typename...> class ConverterTrait>
+static Result<std::unique_ptr<BaseConverter>> MakeConverter(
+    std::shared_ptr<DataType> type, typename BaseConverter::OptionsType options,
+    MemoryPool* pool) {
+  MakeConverterImpl<BaseConverter, ConverterTrait> visitor{
+      std::move(type), std::move(options), pool, NULLPTR};
+  ARROW_RETURN_NOT_OK(VisitTypeInline(*visitor.type, &visitor));
+  return std::move(visitor.out);
+}
+
+template <typename Converter>
+class Chunker {
+ public:
+  using InputType = typename Converter::InputType;
+
+  explicit Chunker(std::unique_ptr<Converter> converter)
+      : converter_(std::move(converter)) {}
+
+  Status Reserve(int64_t additional_capacity) {
+    ARROW_RETURN_NOT_OK(converter_->Reserve(additional_capacity));
+    reserved_ += additional_capacity;
+    return Status::OK();
+  }
+
+  Status AppendNull() {
+    auto status = converter_->AppendNull();
+    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
+      if (converter_->builder()->length() == 0) {
+        // Builder length == 0 means the individual element is too large to append.
+        // In this case, no need to try again.
+        return status;
+      }
+      ARROW_RETURN_NOT_OK(FinishChunk());
+      return converter_->AppendNull();
+    }
+    ++length_;
+    return status;
+  }
+
+  Status Append(InputType value) {
+    auto status = converter_->Append(value);
+    if (ARROW_PREDICT_FALSE(status.IsCapacityError())) {
+      if (converter_->builder()->length() == 0) {
+        return status;
+      }
+      ARROW_RETURN_NOT_OK(FinishChunk());
+      return Append(value);
+    }
+    ++length_;
+    return status;
+  }
+
+  Status Extend(InputType values, int64_t size, int64_t offset = 0) {
+    while (offset < size) {
+      auto length_before = converter_->builder()->length();
+      auto status = converter_->Extend(values, size, offset);
+      auto length_after = converter_->builder()->length();
+      auto num_converted = length_after - length_before;
+
+      offset += num_converted;
+      length_ += num_converted;
+
+      if (status.IsCapacityError()) {
+        if (converter_->builder()->length() == 0) {
+          // Builder length == 0 means the individual element is too large to append.
+          // In this case, no need to try again.
+          return status;
+        } else if (converter_->rewind_on_overflow()) {
+          // The list-like and binary-like conversion paths may raise  a capacity error,
+          // we need to handle them differently. While the binary-like converters check
+          // the capacity before append/extend the list-like converters just check after
+          // append/extend. Thus depending on the implementation semantics we may need
+          // to rewind (slice) the output chunk by one.
+          length_ -= 1;
+          offset -= 1;
+        }
+        ARROW_RETURN_NOT_OK(FinishChunk());
+      } else if (!status.ok()) {
+        return status;
+      }
+    }
+    return Status::OK();
+  }
+
+  Status ExtendMasked(InputType values, InputType mask, int64_t size,
+                      int64_t offset = 0) {
+    while (offset < size) {
+      auto length_before = converter_->builder()->length();
+      auto status = converter_->ExtendMasked(values, mask, size, offset);
+      auto length_after = converter_->builder()->length();
+      auto num_converted = length_after - length_before;
+
+      offset += num_converted;
+      length_ += num_converted;
+
+      if (status.IsCapacityError()) {
+        if (converter_->builder()->length() == 0) {
+          // Builder length == 0 means the individual element is too large to append.
+          // In this case, no need to try again.
+          return status;
+        } else if (converter_->rewind_on_overflow()) {
+          // The list-like and binary-like conversion paths may raise  a capacity error,
+          // we need to handle them differently. While the binary-like converters check
+          // the capacity before append/extend the list-like converters just check after
+          // append/extend. Thus depending on the implementation semantics we may need
+          // to rewind (slice) the output chunk by one.
+          length_ -= 1;
+          offset -= 1;
+        }
+        ARROW_RETURN_NOT_OK(FinishChunk());
+      } else if (!status.ok()) {
+        return status;
+      }
+    }
+    return Status::OK();
+  }
+
+  Status FinishChunk() {
+    ARROW_ASSIGN_OR_RAISE(auto chunk, converter_->ToArray(length_));
+    chunks_.push_back(chunk);
+    // Reserve space for the remaining items.
+    // Besides being an optimization, it is also required if the converter's
+    // implementation relies on unsafe builder methods in converter->Append().
+    auto remaining = reserved_ - length_;
+    Reset();
+    return Reserve(remaining);
+  }
+
+  Result<std::shared_ptr<ChunkedArray>> ToChunkedArray() {
+    ARROW_RETURN_NOT_OK(FinishChunk());
+    return std::make_shared<ChunkedArray>(chunks_);
+  }
+
+ protected:
+  void Reset() {
+    converter_->builder()->Reset();
+    length_ = 0;
+    reserved_ = 0;
+  }
+
+  int64_t length_ = 0;
+  int64_t reserved_ = 0;
+  std::unique_ptr<Converter> converter_;
+  std::vector<std::shared_ptr<Array>> chunks_;
+};
+
+template <typename T>
+static Result<std::unique_ptr<Chunker<T>>> MakeChunker(std::unique_ptr<T> converter) {
+  return std::make_unique<Chunker<T>>(std::move(converter));
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/cpu_info.h b/pyarrow/include/arrow/util/cpu_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..949719b97ed84da6277139a70e22203706ed6055
--- /dev/null
+++ b/pyarrow/include/arrow/util/cpu_info.h
@@ -0,0 +1,114 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// From Apache Impala (incubating) as of 2016-01-29. Pared down to a minimal
+// set of functions needed for Apache Arrow / Apache parquet-cpp
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// CpuInfo is an interface to query for cpu information at runtime.  The caller can
+/// ask for the sizes of the caches and what hardware features are supported.
+/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and
+/// /sys/devices)
+class ARROW_EXPORT CpuInfo {
+ public:
+  ~CpuInfo();
+
+  /// x86 features
+  static constexpr int64_t SSSE3 = (1LL << 0);
+  static constexpr int64_t SSE4_1 = (1LL << 1);
+  static constexpr int64_t SSE4_2 = (1LL << 2);
+  static constexpr int64_t POPCNT = (1LL << 3);
+  static constexpr int64_t AVX = (1LL << 4);
+  static constexpr int64_t AVX2 = (1LL << 5);
+  static constexpr int64_t AVX512F = (1LL << 6);
+  static constexpr int64_t AVX512CD = (1LL << 7);
+  static constexpr int64_t AVX512VL = (1LL << 8);
+  static constexpr int64_t AVX512DQ = (1LL << 9);
+  static constexpr int64_t AVX512BW = (1LL << 10);
+  static constexpr int64_t AVX512 = AVX512F | AVX512CD | AVX512VL | AVX512DQ | AVX512BW;
+  static constexpr int64_t BMI1 = (1LL << 11);
+  static constexpr int64_t BMI2 = (1LL << 12);
+
+  /// Arm features
+  static constexpr int64_t ASIMD = (1LL << 32);
+
+  /// Cache enums for L1 (data), L2 and L3
+  enum class CacheLevel { L1 = 0, L2, L3, Last = L3 };
+
+  /// CPU vendors
+  enum class Vendor { Unknown, Intel, AMD };
+
+  static const CpuInfo* GetInstance();
+
+  /// Returns all the flags for this cpu
+  int64_t hardware_flags() const;
+
+  /// Returns the number of cores (including hyper-threaded) on this machine.
+  int num_cores() const;
+
+  /// Returns the vendor of the cpu.
+  Vendor vendor() const;
+
+  /// Returns the model name of the cpu (e.g. Intel i7-2600)
+  const std::string& model_name() const;
+
+  /// Returns the size of the cache in KB at this cache level
+  int64_t CacheSize(CacheLevel level) const;
+
+  /// \brief Returns whether or not the given feature is enabled.
+  ///
+  /// IsSupported() is true iff IsDetected() is also true and the feature
+  /// wasn't disabled by the user (for example by setting the ARROW_USER_SIMD_LEVEL
+  /// environment variable).
+  bool IsSupported(int64_t flags) const;
+
+  /// Returns whether or not the given feature is available on the CPU.
+  bool IsDetected(int64_t flags) const;
+
+  /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error
+  /// and terminate.
+  void VerifyCpuRequirements() const;
+
+  /// Toggle a hardware feature on and off.  It is not valid to turn on a feature
+  /// that the underlying hardware cannot support. This is useful for testing.
+  void EnableFeature(int64_t flag, bool enable);
+
+  bool HasEfficientBmi2() const {
+    // BMI2 (pext, pdep) is only efficient on Intel X86 processors.
+    return vendor() == Vendor::Intel && IsSupported(BMI2);
+  }
+
+ private:
+  CpuInfo();
+
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/crc32.h b/pyarrow/include/arrow/util/crc32.h
new file mode 100644
index 0000000000000000000000000000000000000000..155cf7cfae1061feda9ae436a5f966b90cbabc6a
--- /dev/null
+++ b/pyarrow/include/arrow/util/crc32.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstddef>
+#include <cstdint>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief Compute the CRC32 checksum of the given data
+///
+/// This function computes CRC32 with the polynomial 0x04C11DB7,
+/// as used in zlib and others (note this is different from CRC32C).
+/// To compute a running CRC32, pass the previous value in `prev`,
+/// otherwise `prev` should be 0.
+ARROW_EXPORT
+uint32_t crc32(uint32_t prev, const void* data, size_t length);
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/debug.h b/pyarrow/include/arrow/util/debug.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed38a4dcf7ab87aad4db906dd8b6abc058387f8e
--- /dev/null
+++ b/pyarrow/include/arrow/util/debug.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+ARROW_EXPORT
+void DebugTrap();
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/decimal.h b/pyarrow/include/arrow/util/decimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..bae0c4dd248cf14d0bc64230bf4e81cb934f73c2
--- /dev/null
+++ b/pyarrow/include/arrow/util/decimal.h
@@ -0,0 +1,523 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <limits>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/basic_decimal.h"
+
+namespace arrow {
+
+class Decimal64;
+
+namespace internal {
+
+ARROW_EXPORT
+Status ToArrowStatus(DecimalStatus);
+
+}  // namespace internal
+
+template <>
+struct IntoStatus<DecimalStatus> {
+  static inline Status ToStatus(DecimalStatus st) { return internal::ToArrowStatus(st); }
+};
+
+/// Represents a signed 32-bit decimal value in two's complement.
+/// Calulations wrap around and overflow is ignored.
+/// The max decimal precision that can be safely represented is
+/// 9 significant digits.
+///
+/// The implementation is split into two parts :
+///
+/// 1. BasicDecimal32
+///    - can be safely compiled to IR without references to libstdc++
+/// 2. Decimal32
+///    - has additional functionality on top of BasicDecimal32 to deal with
+///      strings and streams
+class ARROW_EXPORT Decimal32 : public BasicDecimal32 {
+ public:
+  /// \cond FALSE
+  // (need to avoid a duplicate definition in sphinx)
+  using BasicDecimal32::BasicDecimal32;
+  /// \endcond
+
+  /// \brief constructor creates a Decimal32 from a BasicDecimal32
+  constexpr Decimal32(const BasicDecimal32& value) noexcept  // NOLINT runtime/explicit
+      : BasicDecimal32(value) {}
+
+  /// \brief Parse the number from a base 10 string representation
+  explicit Decimal32(const std::string& value);
+
+  /// \brief Empty constructor creates a Decimal32 with a value of 0
+  /// this is required for some older compilers
+  constexpr Decimal32() noexcept : BasicDecimal32() {}
+
+  /// \brief Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \return the pair of the quotient and the remainder
+  Result<std::pair<Decimal32, Decimal32>> Divide(const Decimal32& divisor) const {
+    std::pair<Decimal32, Decimal32> result;
+    ARROW_RETURN_NOT_OK(BasicDecimal32::Divide(divisor, &result.first, &result.second));
+    return result;
+  }
+
+  /// \brief Convert the Decimal32 value to a base 10 decimal string with the given scale
+  std::string ToString(int32_t scale) const;
+
+  /// \brief Convert the value to an integer string
+  std::string ToIntegerString() const;
+
+  /// \brief Cast this value to an int64_t
+  explicit operator int64_t() const;
+
+  explicit operator Decimal64() const;
+
+  /// \brief Convert a decimal string to a Decimal value, optionally including
+  /// precision and scale if they're passed in and not null.
+  static Status FromString(std::string_view s, Decimal32* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const std::string& s, Decimal32* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const char* s, Decimal32* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Result<Decimal32> FromString(std::string_view s);
+  static Result<Decimal32> FromString(const std::string& s);
+  static Result<Decimal32> FromString(const char* s);
+
+  static Result<Decimal32> FromReal(double real, int32_t precision, int32_t scale);
+  static Result<Decimal32> FromReal(float real, int32_t precision, int32_t scale);
+
+  /// \brief Convert from a big-endian byte representation. The length must be
+  ///        between 1 and 4
+  /// \return error status if the length is an invalid value
+  static Result<Decimal32> FromBigEndian(const uint8_t* data, int32_t length);
+
+  /// \brief Convert Decimal32 from one scale to another
+  Result<Decimal32> Rescale(int32_t original_scale, int32_t new_scale) const {
+    Decimal32 out;
+    ARROW_RETURN_NOT_OK(BasicDecimal32::Rescale(original_scale, new_scale, &out));
+    return out;
+  }
+
+  /// \brief Convert to a signed integer
+  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+  Result<T> ToInteger() const {
+    return static_cast<T>(value_);
+  }
+
+  /// \brief Convert to a signed integer
+  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+  Status ToInteger(T* out) const {
+    return ToInteger<T>().Value(out);
+  }
+
+  /// \brief Convert to a floating-point number (scaled)
+  float ToFloat(int32_t scale) const;
+  /// \brief Convert to a floating-point number (scaled)
+  double ToDouble(int32_t scale) const;
+
+  /// \brief Convert to a floating-point number (scaled)
+  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+  T ToReal(int32_t scale) const {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Unexpected floating-point type");
+    if constexpr (std::is_same_v<T, float>) {
+      return ToFloat(scale);
+    } else {
+      return ToDouble(scale);
+    }
+  }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
+                                                      const Decimal32& decimal);
+};
+
+class ARROW_EXPORT Decimal64 : public BasicDecimal64 {
+ public:
+  /// \cond FALSE
+  // (need to avoid a duplicate definition in sphinx)
+  using BasicDecimal64::BasicDecimal64;
+  /// \endcond
+
+  /// \brief constructor creates a Decimal64 from a BasicDecimal64
+  constexpr Decimal64(const BasicDecimal64& value) noexcept  // NOLINT runtime/explicit
+      : BasicDecimal64(value) {}
+
+  explicit Decimal64(const BasicDecimal32& value) noexcept
+      : BasicDecimal64(static_cast<int64_t>(value.value())) {}
+
+  /// \brief Parse the number from a base 10 string representation
+  explicit Decimal64(const std::string& value);
+
+  /// \brief Empty constructor creates a Decimal64 with a value of 0
+  /// this is required for some older compilers
+  constexpr Decimal64() noexcept : BasicDecimal64() {}
+
+  /// \brief Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \return the pair of the quotient and the remainder
+  Result<std::pair<Decimal64, Decimal64>> Divide(const Decimal64& divisor) const {
+    std::pair<Decimal64, Decimal64> result;
+    ARROW_RETURN_NOT_OK(BasicDecimal64::Divide(divisor, &result.first, &result.second));
+    return result;
+  }
+
+  /// \brief Convert the Decimal64 value to a base 10 decimal string with the given scale
+  std::string ToString(int32_t scale) const;
+
+  /// \brief Convert the value to an integer string
+  std::string ToIntegerString() const;
+
+  /// \brief Cast this value to an int64_t
+  explicit operator int64_t() const;
+
+  /// \brief Convert a decimal string to a Decimal value, optionally including
+  /// precision and scale if they're passed in and not null.
+  static Status FromString(std::string_view s, Decimal64* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const std::string& s, Decimal64* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const char* s, Decimal64* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Result<Decimal64> FromString(std::string_view s);
+  static Result<Decimal64> FromString(const std::string& s);
+  static Result<Decimal64> FromString(const char* s);
+
+  static Result<Decimal64> FromReal(double real, int32_t precision, int32_t scale);
+  static Result<Decimal64> FromReal(float real, int32_t precision, int32_t scale);
+
+  /// \brief Convert from a big-endian byte representation. The length must be
+  ///        between 1 and 4
+  /// \return error status if the length is an invalid value
+  static Result<Decimal64> FromBigEndian(const uint8_t* data, int32_t length);
+
+  /// \brief Convert Decimal64 from one scale to another
+  Result<Decimal64> Rescale(int32_t original_scale, int32_t new_scale) const {
+    Decimal64 out;
+    ARROW_RETURN_NOT_OK(BasicDecimal64::Rescale(original_scale, new_scale, &out));
+    return out;
+  }
+
+  /// \brief Convert to a signed integer
+  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+  Result<T> ToInteger() const {
+    return static_cast<T>(value_);
+  }
+
+  /// \brief Convert to a signed integer
+  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+  Status ToInteger(T* out) const {
+    return ToInteger<T>().Value(out);
+  }
+
+  /// \brief Convert to a floating-point number (scaled)
+  float ToFloat(int32_t scale) const;
+  /// \brief Convert to a floating-point number (scaled)
+  double ToDouble(int32_t scale) const;
+
+  /// \brief Convert to a floating-point number (scaled)
+  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+  T ToReal(int32_t scale) const {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Unexpected floating-point type");
+    if constexpr (std::is_same_v<T, float>) {
+      return ToFloat(scale);
+    } else {
+      return ToDouble(scale);
+    }
+  }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
+                                                      const Decimal64& decimal);
+};
+
+/// Represents a signed 128-bit integer in two's complement.
+/// Calculations wrap around and overflow is ignored.
+/// The max decimal precision that can be safely represented is
+/// 38 significant digits.
+///
+/// For a discussion of the algorithms, look at Knuth's volume 2,
+/// Semi-numerical Algorithms section 4.3.1.
+///
+/// Adapted from the Apache ORC C++ implementation
+///
+/// The implementation is split into two parts :
+///
+/// 1. BasicDecimal128
+///    - can be safely compiled to IR without references to libstdc++.
+/// 2. Decimal128
+///    - has additional functionality on top of BasicDecimal128 to deal with
+///      strings and streams.
+class ARROW_EXPORT Decimal128 : public BasicDecimal128 {
+ public:
+  /// \cond FALSE
+  // (need to avoid a duplicate definition in Sphinx)
+  using BasicDecimal128::BasicDecimal128;
+  /// \endcond
+
+  /// \brief constructor creates a Decimal128 from a BasicDecimal128.
+  constexpr Decimal128(const BasicDecimal128& value) noexcept  // NOLINT runtime/explicit
+      : BasicDecimal128(value) {}
+
+  /// \brief Parse the number from a base 10 string representation.
+  explicit Decimal128(const std::string& value);
+
+  /// \brief Empty constructor creates a Decimal128 with a value of 0.
+  // This is required on some older compilers.
+  constexpr Decimal128() noexcept : BasicDecimal128() {}
+
+  /// Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \return the pair of the quotient and the remainder
+  Result<std::pair<Decimal128, Decimal128>> Divide(const Decimal128& divisor) const {
+    std::pair<Decimal128, Decimal128> result;
+    ARROW_RETURN_NOT_OK(BasicDecimal128::Divide(divisor, &result.first, &result.second));
+    return result;
+  }
+
+  /// \brief Convert the Decimal128 value to a base 10 decimal string with the given
+  /// scale.
+  std::string ToString(int32_t scale) const;
+
+  /// \brief Convert the value to an integer string
+  std::string ToIntegerString() const;
+
+  /// \brief Cast this value to an int64_t.
+  explicit operator int64_t() const;
+
+  /// \brief Convert a decimal string to a Decimal128 value, optionally including
+  /// precision and scale if they're passed in and not null.
+  static Status FromString(std::string_view s, Decimal128* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const std::string& s, Decimal128* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const char* s, Decimal128* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Result<Decimal128> FromString(std::string_view s);
+  static Result<Decimal128> FromString(const std::string& s);
+  static Result<Decimal128> FromString(const char* s);
+
+  static Result<Decimal128> FromReal(double real, int32_t precision, int32_t scale);
+  static Result<Decimal128> FromReal(float real, int32_t precision, int32_t scale);
+
+  /// \brief Convert from a big-endian byte representation. The length must be
+  ///        between 1 and 16.
+  /// \return error status if the length is an invalid value
+  static Result<Decimal128> FromBigEndian(const uint8_t* data, int32_t length);
+
+  /// \brief Convert Decimal128 from one scale to another
+  Result<Decimal128> Rescale(int32_t original_scale, int32_t new_scale) const {
+    Decimal128 out;
+    ARROW_RETURN_NOT_OK(BasicDecimal128::Rescale(original_scale, new_scale, &out));
+    return out;
+  }
+
+  /// \brief Convert to a signed integer
+  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+  Result<T> ToInteger() const {
+    constexpr auto min_value = std::numeric_limits<T>::min();
+    constexpr auto max_value = std::numeric_limits<T>::max();
+    const auto& self = *this;
+    if (self < min_value || self > max_value) {
+      return Status::Invalid("Invalid cast from Decimal128 to ", sizeof(T),
+                             " byte integer");
+    }
+    return static_cast<T>(low_bits());
+  }
+
+  /// \brief Convert to a signed integer
+  template <typename T, typename = internal::EnableIfIsOneOf<T, int32_t, int64_t>>
+  Status ToInteger(T* out) const {
+    return ToInteger<T>().Value(out);
+  }
+
+  /// \brief Convert to a floating-point number (scaled)
+  float ToFloat(int32_t scale) const;
+  /// \brief Convert to a floating-point number (scaled)
+  double ToDouble(int32_t scale) const;
+
+  /// \brief Convert to a floating-point number (scaled)
+  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+  T ToReal(int32_t scale) const {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Unexpected floating-point type");
+    if constexpr (std::is_same_v<T, float>) {
+      return ToFloat(scale);
+    } else {
+      return ToDouble(scale);
+    }
+  }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
+                                                      const Decimal128& decimal);
+};
+
+/// Represents a signed 256-bit integer in two's complement.
+/// The max decimal precision that can be safely represented is
+/// 76 significant digits.
+///
+/// The implementation is split into two parts :
+///
+/// 1. BasicDecimal256
+///    - can be safely compiled to IR without references to libstdc++.
+/// 2. Decimal256
+///    - (TODO) has additional functionality on top of BasicDecimal256 to deal with
+///      strings and streams.
+class ARROW_EXPORT Decimal256 : public BasicDecimal256 {
+ public:
+  /// \cond FALSE
+  // (need to avoid a duplicate definition in Sphinx)
+  using BasicDecimal256::BasicDecimal256;
+  /// \endcond
+
+  /// \brief constructor creates a Decimal256 from a BasicDecimal256.
+  constexpr Decimal256(const BasicDecimal256& value) noexcept  // NOLINT(runtime/explicit)
+      : BasicDecimal256(value) {}
+
+  /// \brief Parse the number from a base 10 string representation.
+  explicit Decimal256(const std::string& value);
+
+  /// \brief Empty constructor creates a Decimal256 with a value of 0.
+  // This is required on some older compilers.
+  constexpr Decimal256() noexcept : BasicDecimal256() {}
+
+  /// \brief Convert the Decimal256 value to a base 10 decimal string with the given
+  /// scale.
+  std::string ToString(int32_t scale) const;
+
+  /// \brief Convert the value to an integer string
+  std::string ToIntegerString() const;
+
+  /// \brief Convert a decimal string to a Decimal256 value, optionally including
+  /// precision and scale if they're passed in and not null.
+  static Status FromString(std::string_view s, Decimal256* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const std::string& s, Decimal256* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Status FromString(const char* s, Decimal256* out, int32_t* precision,
+                           int32_t* scale = NULLPTR);
+  static Result<Decimal256> FromString(std::string_view s);
+  static Result<Decimal256> FromString(const std::string& s);
+  static Result<Decimal256> FromString(const char* s);
+
+  /// \brief Convert Decimal256 from one scale to another
+  Result<Decimal256> Rescale(int32_t original_scale, int32_t new_scale) const {
+    Decimal256 out;
+    ARROW_RETURN_NOT_OK(BasicDecimal256::Rescale(original_scale, new_scale, &out));
+    return out;
+  }
+
+  /// Divide this number by right and return the result.
+  ///
+  /// This operation is not destructive.
+  /// The answer rounds to zero. Signs work like:
+  ///   21 /  5 ->  4,  1
+  ///  -21 /  5 -> -4, -1
+  ///   21 / -5 -> -4,  1
+  ///  -21 / -5 ->  4, -1
+  /// \param[in] divisor the number to divide by
+  /// \return the pair of the quotient and the remainder
+  Result<std::pair<Decimal256, Decimal256>> Divide(const Decimal256& divisor) const {
+    std::pair<Decimal256, Decimal256> result;
+    ARROW_RETURN_NOT_OK(BasicDecimal256::Divide(divisor, &result.first, &result.second));
+    return result;
+  }
+
+  /// \brief Convert from a big-endian byte representation. The length must be
+  ///        between 1 and 32.
+  /// \return error status if the length is an invalid value
+  static Result<Decimal256> FromBigEndian(const uint8_t* data, int32_t length);
+
+  static Result<Decimal256> FromReal(double real, int32_t precision, int32_t scale);
+  static Result<Decimal256> FromReal(float real, int32_t precision, int32_t scale);
+
+  /// \brief Convert to a floating-point number (scaled).
+  /// May return infinity in case of overflow.
+  float ToFloat(int32_t scale) const;
+  /// \brief Convert to a floating-point number (scaled)
+  double ToDouble(int32_t scale) const;
+
+  /// \brief Convert to a floating-point number (scaled)
+  template <typename T, typename = std::enable_if_t<std::is_floating_point_v<T>>>
+  T ToReal(int32_t scale) const {
+    static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
+                  "Unexpected floating-point type");
+    if constexpr (std::is_same_v<T, float>) {
+      return ToFloat(scale);
+    } else {
+      return ToDouble(scale);
+    }
+  }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os,
+                                                      const Decimal256& decimal);
+};
+
+/// For an integer type, return the max number of decimal digits
+/// (=minimal decimal precision) it can represent.
+inline Result<int32_t> MaxDecimalDigitsForInteger(Type::type type_id) {
+  switch (type_id) {
+    case Type::INT8:
+    case Type::UINT8:
+      return 3;
+    case Type::INT16:
+    case Type::UINT16:
+      return 5;
+    case Type::INT32:
+    case Type::UINT32:
+      return 10;
+    case Type::INT64:
+      return 19;
+    case Type::UINT64:
+      return 20;
+    default:
+      break;
+  }
+  return Status::Invalid("Not an integer type: ", type_id);
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/delimiting.h b/pyarrow/include/arrow/util/delimiting.h
new file mode 100644
index 0000000000000000000000000000000000000000..161ad0bfddfc5a52040256a9cb39b5af96b876db
--- /dev/null
+++ b/pyarrow/include/arrow/util/delimiting.h
@@ -0,0 +1,181 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Buffer;
+
+class ARROW_EXPORT BoundaryFinder {
+ public:
+  BoundaryFinder() = default;
+
+  virtual ~BoundaryFinder();
+
+  /// \brief Find the position of the first delimiter inside block
+  ///
+  /// `partial` is taken to be the beginning of the block, and `block`
+  /// its continuation.  Also, `partial` doesn't contain a delimiter.
+  ///
+  /// The returned `out_pos` is relative to `block`'s start and should point
+  /// to the first character after the first delimiter.
+  /// `out_pos` will be -1 if no delimiter is found.
+  virtual Status FindFirst(std::string_view partial, std::string_view block,
+                           int64_t* out_pos) = 0;
+
+  /// \brief Find the position of the last delimiter inside block
+  ///
+  /// The returned `out_pos` is relative to `block`'s start and should point
+  /// to the first character after the last delimiter.
+  /// `out_pos` will be -1 if no delimiter is found.
+  virtual Status FindLast(std::string_view block, int64_t* out_pos) = 0;
+
+  /// \brief Find the position of the Nth delimiter inside the block
+  ///
+  /// `partial` is taken to be the beginning of the block, and `block`
+  /// its continuation.  Also, `partial` doesn't contain a delimiter.
+  ///
+  /// The returned `out_pos` is relative to `block`'s start and should point
+  /// to the first character after the first delimiter.
+  /// `out_pos` will be -1 if no delimiter is found.
+  ///
+  /// The returned `num_found` is the number of delimiters actually found
+  virtual Status FindNth(std::string_view partial, std::string_view block, int64_t count,
+                         int64_t* out_pos, int64_t* num_found) = 0;
+
+  static constexpr int64_t kNoDelimiterFound = -1;
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(BoundaryFinder);
+};
+
+ARROW_EXPORT
+std::shared_ptr<BoundaryFinder> MakeNewlineBoundaryFinder();
+
+/// \brief A reusable block-based chunker for delimited data
+///
+/// The chunker takes a block of delimited data and helps carve a sub-block
+/// which begins and ends on delimiters (suitable for consumption by parsers
+/// which can only parse whole objects).
+class ARROW_EXPORT Chunker {
+ public:
+  explicit Chunker(std::shared_ptr<BoundaryFinder> delimiter);
+  ~Chunker();
+
+  /// \brief Carve up a chunk in a block of data to contain only whole objects
+  ///
+  /// Pre-conditions:
+  /// - `block` is the start of a valid block of delimited data
+  ///   (i.e. starts just after a delimiter)
+  ///
+  /// Post-conditions:
+  /// - block == whole + partial
+  /// - `whole` is a valid block of delimited data
+  ///   (i.e. starts just after a delimiter and ends with a delimiter)
+  /// - `partial` doesn't contain an entire delimited object
+  ///   (IOW: `partial` is generally small)
+  ///
+  /// This method will look for the last delimiter in `block` and may
+  /// therefore be costly.
+  ///
+  /// \param[in] block data to be chunked
+  /// \param[out] whole subrange of block containing whole delimited objects
+  /// \param[out] partial subrange of block starting with a partial delimited object
+  Status Process(std::shared_ptr<Buffer> block, std::shared_ptr<Buffer>* whole,
+                 std::shared_ptr<Buffer>* partial);
+
+  /// \brief Carve the completion of a partial object out of a block
+  ///
+  /// Pre-conditions:
+  /// - `partial` is the start of a valid block of delimited data
+  ///   (i.e. starts just after a delimiter)
+  /// - `block` follows `partial` in file order
+  ///
+  /// Post-conditions:
+  /// - block == completion + rest
+  /// - `partial + completion` is a valid block of delimited data
+  ///   (i.e. starts just after a delimiter and ends with a delimiter)
+  /// - `completion` doesn't contain an entire delimited object
+  ///   (IOW: `completion` is generally small)
+  ///
+  /// This method will look for the first delimiter in `block` and should
+  /// therefore be reasonably cheap.
+  ///
+  /// \param[in] partial incomplete delimited data
+  /// \param[in] block delimited data following partial
+  /// \param[out] completion subrange of block containing the completion of partial
+  /// \param[out] rest subrange of block containing what completion does not cover
+  Status ProcessWithPartial(std::shared_ptr<Buffer> partial,
+                            std::shared_ptr<Buffer> block,
+                            std::shared_ptr<Buffer>* completion,
+                            std::shared_ptr<Buffer>* rest);
+
+  /// \brief Like ProcessWithPartial, but for the last block of a file
+  ///
+  /// This method allows for a final delimited object without a trailing delimiter
+  /// (ProcessWithPartial would return an error in that case).
+  ///
+  /// Pre-conditions:
+  /// - `partial` is the start of a valid block of delimited data
+  /// - `block` follows `partial` in file order and is the last data block
+  ///
+  /// Post-conditions:
+  /// - block == completion + rest
+  /// - `partial + completion` is a valid block of delimited data
+  /// - `completion` doesn't contain an entire delimited object
+  ///   (IOW: `completion` is generally small)
+  ///
+  Status ProcessFinal(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
+                      std::shared_ptr<Buffer>* completion, std::shared_ptr<Buffer>* rest);
+
+  /// \brief Skip count number of rows
+  /// Pre-conditions:
+  /// - `partial` is the start of a valid block of delimited data
+  ///   (i.e. starts just after a delimiter)
+  /// - `block` follows `partial` in file order
+  ///
+  /// Post-conditions:
+  /// - `count` is updated to indicate the number of rows that still need to be skipped
+  /// - If `count` is > 0 then `rest` is an incomplete block that should be a future
+  /// `partial`
+  /// - Else `rest` could be one or more valid blocks of delimited data which need to be
+  /// parsed
+  ///
+  /// \param[in] partial incomplete delimited data
+  /// \param[in] block delimited data following partial
+  /// \param[in] final whether this is the final chunk
+  /// \param[in,out] count number of rows that need to be skipped
+  /// \param[out] rest subrange of block containing what was not skipped
+  Status ProcessSkip(std::shared_ptr<Buffer> partial, std::shared_ptr<Buffer> block,
+                     bool final, int64_t* count, std::shared_ptr<Buffer>* rest);
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Chunker);
+
+  std::shared_ptr<BoundaryFinder> boundary_finder_;
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/endian.h b/pyarrow/include/arrow/util/endian.h
new file mode 100644
index 0000000000000000000000000000000000000000..1edb828f9b5351fb21432105171ae933750cc73c
--- /dev/null
+++ b/pyarrow/include/arrow/util/endian.h
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _WIN32
+#  define ARROW_LITTLE_ENDIAN 1
+#else
+#  if defined(__APPLE__) || defined(__FreeBSD__)
+#    include <machine/endian.h>  // IWYU pragma: keep
+#  elif defined(sun) || defined(__sun)
+#    include <sys/byteorder.h>  // IWYU pragma: keep
+#  elif !defined(_AIX) && !defined(__QNXNTO__) && !defined(__QNX__)
+#    include <endian.h>  // IWYU pragma: keep
+#  endif
+#
+#  ifndef __BYTE_ORDER__
+#    error "__BYTE_ORDER__ not defined"
+#  endif
+#
+#  ifndef __ORDER_LITTLE_ENDIAN__
+#    error "__ORDER_LITTLE_ENDIAN__ not defined"
+#  endif
+#
+#  if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#    define ARROW_LITTLE_ENDIAN 1
+#  else
+#    define ARROW_LITTLE_ENDIAN 0
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>  // IWYU pragma: keep
+#  define ARROW_BYTE_SWAP64 _byteswap_uint64
+#  define ARROW_BYTE_SWAP32 _byteswap_ulong
+#else
+#  define ARROW_BYTE_SWAP64 __builtin_bswap64
+#  define ARROW_BYTE_SWAP32 __builtin_bswap32
+#endif
+
+#include <algorithm>
+#include <array>
+
+#include "arrow/util/type_traits.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace bit_util {
+
+//
+// Byte-swap 16-bit, 32-bit and 64-bit values
+//
+
+// Swap the byte order (i.e. endianness)
+static inline int64_t ByteSwap(int64_t value) { return ARROW_BYTE_SWAP64(value); }
+static inline uint64_t ByteSwap(uint64_t value) {
+  return static_cast<uint64_t>(ARROW_BYTE_SWAP64(value));
+}
+static inline int32_t ByteSwap(int32_t value) { return ARROW_BYTE_SWAP32(value); }
+static inline uint32_t ByteSwap(uint32_t value) {
+  return static_cast<uint32_t>(ARROW_BYTE_SWAP32(value));
+}
+static inline int16_t ByteSwap(int16_t value) {
+  constexpr auto m = static_cast<int16_t>(0xff);
+  return static_cast<int16_t>(((value >> 8) & m) | ((value & m) << 8));
+}
+static inline uint16_t ByteSwap(uint16_t value) {
+  return static_cast<uint16_t>(ByteSwap(static_cast<int16_t>(value)));
+}
+static inline uint8_t ByteSwap(uint8_t value) { return value; }
+static inline int8_t ByteSwap(int8_t value) { return value; }
+static inline double ByteSwap(double value) {
+  const uint64_t swapped = ARROW_BYTE_SWAP64(util::SafeCopy<uint64_t>(value));
+  return util::SafeCopy<double>(swapped);
+}
+static inline float ByteSwap(float value) {
+  const uint32_t swapped = ARROW_BYTE_SWAP32(util::SafeCopy<uint32_t>(value));
+  return util::SafeCopy<float>(swapped);
+}
+
+// Write the swapped bytes into dst. Src and dst cannot overlap.
+static inline void ByteSwap(void* dst, const void* src, int len) {
+  switch (len) {
+    case 1:
+      *reinterpret_cast<int8_t*>(dst) = *reinterpret_cast<const int8_t*>(src);
+      return;
+    case 2:
+      *reinterpret_cast<int16_t*>(dst) = ByteSwap(*reinterpret_cast<const int16_t*>(src));
+      return;
+    case 4:
+      *reinterpret_cast<int32_t*>(dst) = ByteSwap(*reinterpret_cast<const int32_t*>(src));
+      return;
+    case 8:
+      *reinterpret_cast<int64_t*>(dst) = ByteSwap(*reinterpret_cast<const int64_t*>(src));
+      return;
+    default:
+      break;
+  }
+
+  auto d = reinterpret_cast<uint8_t*>(dst);
+  auto s = reinterpret_cast<const uint8_t*>(src);
+  for (int i = 0; i < len; ++i) {
+    d[i] = s[len - i - 1];
+  }
+}
+
+// Convert to little/big endian format from the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T ToBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T ToLittleEndian(T value) {
+  return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T ToBigEndian(T value) {
+  return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T ToLittleEndian(T value) {
+  return ByteSwap(value);
+}
+#endif
+
+// Convert from big/little endian format to the machine's native endian format.
+#if ARROW_LITTLE_ENDIAN
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T FromBigEndian(T value) {
+  return ByteSwap(value);
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T FromLittleEndian(T value) {
+  return value;
+}
+#else
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T FromBigEndian(T value) {
+  return value;
+}
+
+template <typename T, typename = internal::EnableIfIsOneOf<
+                          T, int64_t, uint64_t, int32_t, uint32_t, int16_t, uint16_t,
+                          uint8_t, int8_t, float, double, bool>>
+static inline T FromLittleEndian(T value) {
+  return ByteSwap(value);
+}
+#endif
+
+// Handle endianness in *word* granularity (keep individual array element untouched)
+namespace little_endian {
+
+namespace detail {
+
+// Read a native endian array as little endian
+template <typename T, size_t N>
+struct Reader {
+  const std::array<T, N>& native_array;
+
+  explicit Reader(const std::array<T, N>& native_array) : native_array(native_array) {}
+
+  const T& operator[](size_t i) const {
+    return native_array[ARROW_LITTLE_ENDIAN ? i : N - 1 - i];
+  }
+};
+
+// Read/write a native endian array as little endian
+template <typename T, size_t N>
+struct Writer {
+  std::array<T, N>* native_array;
+
+  explicit Writer(std::array<T, N>* native_array) : native_array(native_array) {}
+
+  const T& operator[](size_t i) const {
+    return (*native_array)[ARROW_LITTLE_ENDIAN ? i : N - 1 - i];
+  }
+  T& operator[](size_t i) { return (*native_array)[ARROW_LITTLE_ENDIAN ? i : N - 1 - i]; }
+};
+
+}  // namespace detail
+
+// Construct array reader and try to deduce template augments
+template <typename T, size_t N>
+static inline detail::Reader<T, N> Make(const std::array<T, N>& native_array) {
+  return detail::Reader<T, N>(native_array);
+}
+
+// Construct array writer and try to deduce template augments
+template <typename T, size_t N>
+static inline detail::Writer<T, N> Make(std::array<T, N>* native_array) {
+  return detail::Writer<T, N>(native_array);
+}
+
+// Convert little endian array to native endian
+template <typename T, size_t N>
+static inline std::array<T, N> ToNative(std::array<T, N> array) {
+  if (!ARROW_LITTLE_ENDIAN) {
+    std::reverse(array.begin(), array.end());
+  }
+  return array;
+}
+
+// Convert native endian array to little endian
+template <typename T, size_t N>
+static inline std::array<T, N> FromNative(std::array<T, N> array) {
+  return ToNative(array);
+}
+
+}  // namespace little_endian
+
+}  // namespace bit_util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/float16.h b/pyarrow/include/arrow/util/float16.h
new file mode 100644
index 0000000000000000000000000000000000000000..b52145cdc0ca8cbdd7b0647d726798d7b77bde46
--- /dev/null
+++ b/pyarrow/include/arrow/util/float16.h
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <type_traits>
+
+#include "arrow/util/endian.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// \brief Class representing an IEEE half-precision float, encoded as a `uint16_t`
+///
+/// The exact format is as follows (from LSB to MSB):
+/// - bits 0-10:  mantissa
+/// - bits 10-15: exponent
+/// - bit 15:     sign
+///
+class ARROW_EXPORT Float16 {
+ public:
+  Float16() = default;
+  explicit Float16(float f) : Float16(FromFloat(f)) {}
+  explicit Float16(double d) : Float16(FromDouble(d)) {}
+  template <typename T,
+            typename std::enable_if_t<std::is_convertible_v<T, double>>* = NULLPTR>
+  explicit Float16(T v) : Float16(static_cast<double>(v)) {}
+
+  /// \brief Create a `Float16` from its exact binary representation
+  constexpr static Float16 FromBits(uint16_t bits) { return Float16{bits, bool{}}; }
+  /// \brief Create a `Float16` from a 32-bit float (may lose precision)
+  static Float16 FromFloat(float f);
+  /// \brief Create a `Float16` from a 64-bit float (may lose precision)
+  static Float16 FromDouble(double d);
+
+  /// \brief Read a `Float16` from memory in native-endian byte order
+  static Float16 FromBytes(const uint8_t* src) {
+    return FromBits(SafeLoadAs<uint16_t>(src));
+  }
+
+  /// \brief Read a `Float16` from memory in little-endian byte order
+  static Float16 FromLittleEndian(const uint8_t* src) {
+    return FromBits(::arrow::bit_util::FromLittleEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Read a `Float16` from memory in big-endian byte order
+  static Float16 FromBigEndian(const uint8_t* src) {
+    return FromBits(::arrow::bit_util::FromBigEndian(SafeLoadAs<uint16_t>(src)));
+  }
+
+  /// \brief Return the value's binary representation as a `uint16_t`
+  constexpr uint16_t bits() const { return bits_; }
+
+  /// \brief Return true if the value is negative (sign bit is set)
+  constexpr bool signbit() const { return (bits_ & 0x8000) != 0; }
+
+  /// \brief Return true if the value is NaN
+  constexpr bool is_nan() const { return (bits_ & 0x7fff) > 0x7c00; }
+  /// \brief Return true if the value is positive/negative infinity
+  constexpr bool is_infinity() const { return (bits_ & 0x7fff) == 0x7c00; }
+  /// \brief Return true if the value is finite and not NaN
+  constexpr bool is_finite() const { return (bits_ & 0x7c00) != 0x7c00; }
+  /// \brief Return true if the value is positive/negative zero
+  constexpr bool is_zero() const { return (bits_ & 0x7fff) == 0; }
+
+  /// \brief Convert to a 32-bit float
+  float ToFloat() const;
+  /// \brief Convert to a 64-bit float
+  double ToDouble() const;
+
+  explicit operator float() const { return ToFloat(); }
+  explicit operator double() const { return ToDouble(); }
+
+  /// \brief Copy the value's bytes in native-endian byte order
+  void ToBytes(uint8_t* dest) const { std::memcpy(dest, &bits_, sizeof(bits_)); }
+  /// \brief Return the value's bytes in native-endian byte order
+  constexpr std::array<uint8_t, 2> ToBytes() const {
+#if ARROW_LITTLE_ENDIAN
+    return ToLittleEndian();
+#else
+    return ToBigEndian();
+#endif
+  }
+
+  /// \brief Copy the value's bytes in little-endian byte order
+  void ToLittleEndian(uint8_t* dest) const {
+    const auto bytes = ToLittleEndian();
+    std::memcpy(dest, bytes.data(), bytes.size());
+  }
+  /// \brief Return the value's bytes in little-endian byte order
+  constexpr std::array<uint8_t, 2> ToLittleEndian() const {
+    return {uint8_t(bits_ & 0xff), uint8_t(bits_ >> 8)};
+  }
+
+  /// \brief Copy the value's bytes in big-endian byte order
+  void ToBigEndian(uint8_t* dest) const {
+    const auto bytes = ToBigEndian();
+    std::memcpy(dest, bytes.data(), bytes.size());
+  }
+  /// \brief Return the value's bytes in big-endian byte order
+  constexpr std::array<uint8_t, 2> ToBigEndian() const {
+    return {uint8_t(bits_ >> 8), uint8_t(bits_ & 0xff)};
+  }
+
+  constexpr Float16 operator-() const { return FromBits(bits_ ^ 0x8000); }
+  constexpr Float16 operator+() const { return FromBits(bits_); }
+
+  friend constexpr bool operator==(Float16 lhs, Float16 rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16::CompareEq(lhs, rhs);
+  }
+  friend constexpr bool operator!=(Float16 lhs, Float16 rhs) { return !(lhs == rhs); }
+
+  friend constexpr bool operator<(Float16 lhs, Float16 rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return Float16::CompareLt(lhs, rhs);
+  }
+  friend constexpr bool operator>(Float16 lhs, Float16 rhs) { return rhs < lhs; }
+
+  friend constexpr bool operator<=(Float16 lhs, Float16 rhs) {
+    if (lhs.is_nan() || rhs.is_nan()) return false;
+    return !Float16::CompareLt(rhs, lhs);
+  }
+  friend constexpr bool operator>=(Float16 lhs, Float16 rhs) { return rhs <= lhs; }
+
+  ARROW_FRIEND_EXPORT friend std::ostream& operator<<(std::ostream& os, Float16 arg);
+
+  static constexpr Float16 zero() { return FromBits(0); }
+  static constexpr Float16 one() { return FromBits(0x3c00); }
+
+ protected:
+  uint16_t bits_;
+
+ private:
+  constexpr Float16(uint16_t bits, bool) : bits_(bits) {}
+
+  // Comparison helpers that assume neither operand is NaN
+  static constexpr bool CompareEq(Float16 lhs, Float16 rhs) {
+    return (lhs.bits() == rhs.bits()) || (lhs.is_zero() && rhs.is_zero());
+  }
+  static constexpr bool CompareLt(Float16 lhs, Float16 rhs) {
+    if (lhs.signbit()) {
+      if (rhs.signbit()) {
+        // Both are negative
+        return lhs.bits() > rhs.bits();
+      } else {
+        // Handle +/-0
+        return !lhs.is_zero() || rhs.bits() != 0;
+      }
+    } else if (rhs.signbit()) {
+      return false;
+    } else {
+      // Both are positive
+      return lhs.bits() < rhs.bits();
+    }
+  }
+};
+
+static_assert(std::is_standard_layout_v<Float16>);
+static_assert(std::is_trivial_v<Float16>);
+static_assert(sizeof(Float16) == sizeof(uint16_t));
+
+}  // namespace util
+}  // namespace arrow
+
+// TODO: Not complete
+template <>
+class std::numeric_limits<arrow::util::Float16> {
+  using T = arrow::util::Float16;
+
+ public:
+  static constexpr bool is_specialized = true;
+  static constexpr bool is_signed = true;
+  static constexpr bool has_infinity = true;
+  static constexpr bool has_quiet_NaN = true;
+
+  static constexpr T min() { return T::FromBits(0b0000010000000000); }
+  static constexpr T max() { return T::FromBits(0b0111101111111111); }
+  static constexpr T lowest() { return -max(); }
+
+  static constexpr T infinity() { return T::FromBits(0b0111110000000000); }
+
+  static constexpr T quiet_NaN() { return T::FromBits(0b0111111111111111); }
+};
diff --git a/pyarrow/include/arrow/util/formatting.h b/pyarrow/include/arrow/util/formatting.h
new file mode 100644
index 0000000000000000000000000000000000000000..844b6fb91a8d3cf18747efbadaec4a21be20cacc
--- /dev/null
+++ b/pyarrow/include/arrow/util/formatting.h
@@ -0,0 +1,667 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This is a private header for number-to-string formatting utilities
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <chrono>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string.h"
+#include "arrow/util/time.h"
+#include "arrow/util/visibility.h"
+#include "arrow/vendored/datetime.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief The entry point for conversion to strings.
+template <typename ARROW_TYPE, typename Enable = void>
+class StringFormatter;
+
+template <typename T>
+struct is_formattable {
+  template <typename U, typename = typename StringFormatter<U>::value_type>
+  static std::true_type Test(U*);
+
+  template <typename U>
+  static std::false_type Test(...);
+
+  static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
+};
+
+template <typename T, typename R = void>
+using enable_if_formattable = enable_if_t<is_formattable<T>::value, R>;
+
+template <typename Appender>
+using Return = decltype(std::declval<Appender>()(std::string_view{}));
+
+/////////////////////////////////////////////////////////////////////////
+// Boolean formatting
+
+template <>
+class StringFormatter<BooleanType> {
+ public:
+  explicit StringFormatter(const DataType* = NULLPTR) {}
+
+  using value_type = bool;
+
+  template <typename Appender>
+  Return<Appender> operator()(bool value, Appender&& append) {
+    if (value) {
+      const char string[] = "true";
+      return append(std::string_view(string));
+    } else {
+      const char string[] = "false";
+      return append(std::string_view(string));
+    }
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Decimals formatting
+
+template <typename ARROW_TYPE>
+class DecimalToStringFormatterMixin {
+ public:
+  explicit DecimalToStringFormatterMixin(const DataType* type)
+      : scale_(static_cast<const ARROW_TYPE*>(type)->scale()) {}
+
+  using value_type = typename TypeTraits<ARROW_TYPE>::CType;
+
+  template <typename Appender>
+  Return<Appender> operator()(const value_type& value, Appender&& append) {
+    return append(value.ToString(scale_));
+  }
+
+ private:
+  int32_t scale_;
+};
+
+template <>
+class StringFormatter<Decimal32Type>
+    : public DecimalToStringFormatterMixin<Decimal32Type> {
+  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Decimal64Type>
+    : public DecimalToStringFormatterMixin<Decimal64Type> {
+  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Decimal128Type>
+    : public DecimalToStringFormatterMixin<Decimal128Type> {
+  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Decimal256Type>
+    : public DecimalToStringFormatterMixin<Decimal256Type> {
+  using DecimalToStringFormatterMixin::DecimalToStringFormatterMixin;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Integer formatting
+
+namespace detail {
+
+// A 2x100 direct table mapping integers in [0..99] to their decimal representations.
+ARROW_EXPORT extern const char digit_pairs[];
+
+// Based on fmtlib's format_int class:
+// Write digits from right to left into a stack allocated buffer.
+// \pre *cursor points to the byte after the one that will be written.
+// \post *cursor points to the byte that was written.
+inline void FormatOneChar(char c, char** cursor) { *(--(*cursor)) = c; }
+
+template <typename Int>
+void FormatOneDigit(Int value, char** cursor) {
+  assert(value >= 0 && value <= 9);
+  FormatOneChar(static_cast<char>('0' + value), cursor);
+}
+
+// GH-35662: I don't know why but the following combination causes SEGV:
+// * template implementation without inline
+// * MinGW
+// * Release build
+template <typename Int>
+inline void FormatTwoDigits(Int value, char** cursor) {
+  assert(value >= 0 && value <= 99);
+  auto digit_pair = &digit_pairs[value * 2];
+  FormatOneChar(digit_pair[1], cursor);
+  FormatOneChar(digit_pair[0], cursor);
+}
+
+template <typename Int>
+void FormatAllDigits(Int value, char** cursor) {
+  assert(value >= 0);
+  while (value >= 100) {
+    FormatTwoDigits(value % 100, cursor);
+    value /= 100;
+  }
+
+  if (value >= 10) {
+    FormatTwoDigits(value, cursor);
+  } else {
+    FormatOneDigit(value, cursor);
+  }
+}
+
+template <typename Int>
+void FormatAllDigitsLeftPadded(Int value, size_t pad, char pad_char, char** cursor) {
+  auto end = *cursor - pad;
+  FormatAllDigits(value, cursor);
+  while (*cursor > end) {
+    FormatOneChar(pad_char, cursor);
+  }
+}
+
+template <size_t BUFFER_SIZE>
+std::string_view ViewDigitBuffer(const std::array<char, BUFFER_SIZE>& buffer,
+                                 char* cursor) {
+  auto buffer_end = buffer.data() + BUFFER_SIZE;
+  return {cursor, static_cast<size_t>(buffer_end - cursor)};
+}
+
+template <typename Int, typename UInt = typename std::make_unsigned<Int>::type>
+constexpr UInt Abs(Int value) {
+  return value < 0 ? ~static_cast<UInt>(value) + 1 : static_cast<UInt>(value);
+}
+
+template <typename Int>
+constexpr size_t Digits10(Int value) {
+  return value <= 9 ? 1 : Digits10(value / 10) + 1;
+}
+
+}  // namespace detail
+
+template <typename ARROW_TYPE>
+class IntToStringFormatterMixin {
+ public:
+  explicit IntToStringFormatterMixin(const DataType* = NULLPTR) {}
+
+  using value_type = typename ARROW_TYPE::c_type;
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type value, Appender&& append) {
+    constexpr size_t buffer_size =
+        detail::Digits10(std::numeric_limits<value_type>::max()) + 1;
+
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+    detail::FormatAllDigits(detail::Abs(value), &cursor);
+    if (value < 0) {
+      detail::FormatOneChar('-', &cursor);
+    }
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+};
+
+template <>
+class StringFormatter<Int8Type> : public IntToStringFormatterMixin<Int8Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Int16Type> : public IntToStringFormatterMixin<Int16Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Int32Type> : public IntToStringFormatterMixin<Int32Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<Int64Type> : public IntToStringFormatterMixin<Int64Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt8Type> : public IntToStringFormatterMixin<UInt8Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt16Type> : public IntToStringFormatterMixin<UInt16Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt32Type> : public IntToStringFormatterMixin<UInt32Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<UInt64Type> : public IntToStringFormatterMixin<UInt64Type> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Floating-point formatting
+
+class ARROW_EXPORT FloatToStringFormatter {
+ public:
+  FloatToStringFormatter();
+  FloatToStringFormatter(int flags, const char* inf_symbol, const char* nan_symbol,
+                         char exp_character, int decimal_in_shortest_low,
+                         int decimal_in_shortest_high,
+                         int max_leading_padding_zeroes_in_precision_mode,
+                         int max_trailing_padding_zeroes_in_precision_mode);
+  ~FloatToStringFormatter();
+
+  // Returns the number of characters written
+  int FormatFloat(float v, char* out_buffer, int out_size);
+  int FormatFloat(double v, char* out_buffer, int out_size);
+  int FormatFloat(uint16_t v, char* out_buffer, int out_size);
+
+ protected:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+template <typename ARROW_TYPE>
+class FloatToStringFormatterMixin : public FloatToStringFormatter {
+ public:
+  using value_type = typename ARROW_TYPE::c_type;
+
+  static constexpr int buffer_size = 50;
+
+  explicit FloatToStringFormatterMixin(const DataType* = NULLPTR) {}
+
+  FloatToStringFormatterMixin(int flags, const char* inf_symbol, const char* nan_symbol,
+                              char exp_character, int decimal_in_shortest_low,
+                              int decimal_in_shortest_high,
+                              int max_leading_padding_zeroes_in_precision_mode,
+                              int max_trailing_padding_zeroes_in_precision_mode)
+      : FloatToStringFormatter(flags, inf_symbol, nan_symbol, exp_character,
+                               decimal_in_shortest_low, decimal_in_shortest_high,
+                               max_leading_padding_zeroes_in_precision_mode,
+                               max_trailing_padding_zeroes_in_precision_mode) {}
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type value, Appender&& append) {
+    char buffer[buffer_size];
+    int size = FormatFloat(value, buffer, buffer_size);
+    return append(std::string_view(buffer, size));
+  }
+};
+
+template <>
+class StringFormatter<HalfFloatType> : public FloatToStringFormatterMixin<HalfFloatType> {
+ public:
+  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<FloatType> : public FloatToStringFormatterMixin<FloatType> {
+ public:
+  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
+};
+
+template <>
+class StringFormatter<DoubleType> : public FloatToStringFormatterMixin<DoubleType> {
+ public:
+  using FloatToStringFormatterMixin::FloatToStringFormatterMixin;
+};
+
+/////////////////////////////////////////////////////////////////////////
+// Temporal formatting
+
+namespace detail {
+
+constexpr size_t BufferSizeYYYY_MM_DD() {
+  // "-"? "99999-12-31"
+  return 1 + detail::Digits10(99999) + 1 + detail::Digits10(12) + 1 +
+         detail::Digits10(31);
+}
+
+inline void FormatYYYY_MM_DD(arrow_vendored::date::year_month_day ymd, char** cursor) {
+  FormatTwoDigits(static_cast<unsigned>(ymd.day()), cursor);
+  FormatOneChar('-', cursor);
+  FormatTwoDigits(static_cast<unsigned>(ymd.month()), cursor);
+  FormatOneChar('-', cursor);
+  auto year = static_cast<int>(ymd.year());
+  const auto is_neg_year = year < 0;
+  year = std::abs(year);
+  assert(year <= 99999);
+  FormatTwoDigits(year % 100, cursor);
+  year /= 100;
+  FormatTwoDigits(year % 100, cursor);
+  if (year >= 100) {
+    FormatOneDigit(year / 100, cursor);
+  }
+  if (is_neg_year) {
+    FormatOneChar('-', cursor);
+  }
+}
+
+template <typename Duration>
+constexpr size_t BufferSizeHH_MM_SS() {
+  // "23:59:59" ("." "9"+)?
+  return detail::Digits10(23) + 1 + detail::Digits10(59) + 1 + detail::Digits10(59) + 1 +
+         detail::Digits10(Duration::period::den) - 1;
+}
+
+template <typename Duration>
+void FormatHH_MM_SS(arrow_vendored::date::hh_mm_ss<Duration> hms, char** cursor) {
+  constexpr size_t subsecond_digits = Digits10(Duration::period::den) - 1;
+  if (subsecond_digits != 0) {
+    FormatAllDigitsLeftPadded(hms.subseconds().count(), subsecond_digits, '0', cursor);
+    FormatOneChar('.', cursor);
+  }
+  FormatTwoDigits(hms.seconds().count(), cursor);
+  FormatOneChar(':', cursor);
+  FormatTwoDigits(hms.minutes().count(), cursor);
+  FormatOneChar(':', cursor);
+  FormatTwoDigits(hms.hours().count(), cursor);
+}
+
+// Some out-of-bound datetime values would result in erroneous printing
+// because of silent integer wraparound in the `arrow_vendored::date` library.
+//
+// To avoid such misprinting, we must therefore check the bounds explicitly.
+// The bounds correspond to start of year -32767 and end of year 32767,
+// respectively (-32768 is an invalid year value in `arrow_vendored::date`).
+//
+// Note these values are the same as documented for C++20:
+// https://en.cppreference.com/w/cpp/chrono/year_month_day/operator_days
+template <typename Unit>
+bool IsDateTimeInRange(Unit duration) {
+  constexpr Unit kMinIncl =
+      std::chrono::duration_cast<Unit>(arrow_vendored::date::days{-12687428});
+  constexpr Unit kMaxExcl =
+      std::chrono::duration_cast<Unit>(arrow_vendored::date::days{11248738});
+  return duration >= kMinIncl && duration < kMaxExcl;
+}
+
+// IsDateTimeInRange() specialization for nanoseconds: a 64-bit number of
+// nanoseconds cannot represent years outside of the [-32767, 32767]
+// range, and the {kMinIncl, kMaxExcl} constants above would overflow.
+constexpr bool IsDateTimeInRange(std::chrono::nanoseconds duration) { return true; }
+
+template <typename Unit>
+bool IsTimeInRange(Unit duration) {
+  constexpr Unit kMinIncl = std::chrono::duration_cast<Unit>(std::chrono::seconds{0});
+  constexpr Unit kMaxExcl = std::chrono::duration_cast<Unit>(std::chrono::seconds{86400});
+  return duration >= kMinIncl && duration < kMaxExcl;
+}
+
+template <typename RawValue, typename Appender>
+Return<Appender> FormatOutOfRange(RawValue&& raw_value, Appender&& append) {
+  // XXX locale-sensitive but good enough for now
+  std::string formatted = "<value out of range: " + ToChars(raw_value) + ">";
+  return append(std::move(formatted));
+}
+
+const auto kEpoch = arrow_vendored::date::sys_days{arrow_vendored::date::jan / 1 / 1970};
+
+}  // namespace detail
+
+template <>
+class StringFormatter<DurationType> : public IntToStringFormatterMixin<DurationType> {
+  using IntToStringFormatterMixin::IntToStringFormatterMixin;
+};
+
+class DateToStringFormatterMixin {
+ public:
+  explicit DateToStringFormatterMixin(const DataType* = NULLPTR) {}
+
+ protected:
+  template <typename Appender>
+  Return<Appender> FormatDays(arrow_vendored::date::days since_epoch, Appender&& append) {
+    arrow_vendored::date::sys_days timepoint_days{since_epoch};
+
+    constexpr size_t buffer_size = detail::BufferSizeYYYY_MM_DD();
+
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+
+    detail::FormatYYYY_MM_DD(arrow_vendored::date::year_month_day{timepoint_days},
+                             &cursor);
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+};
+
+template <>
+class StringFormatter<Date32Type> : public DateToStringFormatterMixin {
+ public:
+  using value_type = typename Date32Type::c_type;
+
+  using DateToStringFormatterMixin::DateToStringFormatterMixin;
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type value, Appender&& append) {
+    const auto since_epoch = arrow_vendored::date::days{value};
+    if (!ARROW_PREDICT_TRUE(detail::IsDateTimeInRange(since_epoch))) {
+      return detail::FormatOutOfRange(value, append);
+    }
+    return FormatDays(since_epoch, std::forward<Appender>(append));
+  }
+};
+
+template <>
+class StringFormatter<Date64Type> : public DateToStringFormatterMixin {
+ public:
+  using value_type = typename Date64Type::c_type;
+
+  using DateToStringFormatterMixin::DateToStringFormatterMixin;
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type value, Appender&& append) {
+    const auto since_epoch = std::chrono::milliseconds{value};
+    if (!ARROW_PREDICT_TRUE(detail::IsDateTimeInRange(since_epoch))) {
+      return detail::FormatOutOfRange(value, append);
+    }
+    return FormatDays(std::chrono::duration_cast<arrow_vendored::date::days>(since_epoch),
+                      std::forward<Appender>(append));
+  }
+};
+
+template <>
+class StringFormatter<TimestampType> {
+ public:
+  using value_type = int64_t;
+
+  explicit StringFormatter(const DataType* type)
+      : unit_(checked_cast<const TimestampType&>(*type).unit()),
+        timezone_(checked_cast<const TimestampType&>(*type).timezone()) {}
+
+  template <typename Duration, typename Appender>
+  Return<Appender> operator()(Duration, value_type value, Appender&& append) {
+    using arrow_vendored::date::days;
+
+    const Duration since_epoch{value};
+    if (!ARROW_PREDICT_TRUE(detail::IsDateTimeInRange(since_epoch))) {
+      return detail::FormatOutOfRange(value, append);
+    }
+
+    const auto timepoint = detail::kEpoch + since_epoch;
+    // Round days towards zero
+    // (the naive approach of using arrow_vendored::date::floor() would
+    //  result in UB for very large negative timestamps, similarly as
+    //  https://github.com/HowardHinnant/date/issues/696)
+    auto timepoint_days = std::chrono::time_point_cast<days>(timepoint);
+    Duration since_midnight;
+    if (timepoint_days <= timepoint) {
+      // Year >= 1970
+      since_midnight = timepoint - timepoint_days;
+    } else {
+      // Year < 1970
+      since_midnight = days(1) - (timepoint_days - timepoint);
+      timepoint_days -= days(1);
+    }
+
+    // YYYY_MM_DD " " HH_MM_SS "Z"?
+    constexpr size_t buffer_size =
+        detail::BufferSizeYYYY_MM_DD() + 1 + detail::BufferSizeHH_MM_SS<Duration>() + 1;
+
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+
+    if (timezone_.size() > 0) {
+      detail::FormatOneChar('Z', &cursor);
+    }
+    detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), &cursor);
+    detail::FormatOneChar(' ', &cursor);
+    detail::FormatYYYY_MM_DD(timepoint_days, &cursor);
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type value, Appender&& append) {
+    return util::VisitDuration(unit_, *this, value, std::forward<Appender>(append));
+  }
+
+ private:
+  TimeUnit::type unit_;
+  std::string timezone_;
+};
+
+template <typename T>
+class StringFormatter<T, enable_if_time<T>> {
+ public:
+  using value_type = typename T::c_type;
+
+  explicit StringFormatter(const DataType* type)
+      : unit_(checked_cast<const T&>(*type).unit()) {}
+
+  template <typename Duration, typename Appender>
+  Return<Appender> operator()(Duration, value_type count, Appender&& append) {
+    const Duration since_midnight{count};
+    if (!ARROW_PREDICT_TRUE(detail::IsTimeInRange(since_midnight))) {
+      return detail::FormatOutOfRange(count, append);
+    }
+
+    constexpr size_t buffer_size = detail::BufferSizeHH_MM_SS<Duration>();
+
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+
+    detail::FormatHH_MM_SS(arrow_vendored::date::make_time(since_midnight), &cursor);
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type value, Appender&& append) {
+    return util::VisitDuration(unit_, *this, value, std::forward<Appender>(append));
+  }
+
+ private:
+  TimeUnit::type unit_;
+};
+
+template <>
+class StringFormatter<MonthIntervalType> {
+ public:
+  using value_type = MonthIntervalType::c_type;
+
+  explicit StringFormatter(const DataType*) {}
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type interval, Appender&& append) {
+    constexpr size_t buffer_size =
+        /*'m'*/ 3 + /*negative signs*/ 1 +
+        /*months*/ detail::Digits10(std::numeric_limits<value_type>::max());
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+
+    detail::FormatOneChar('M', &cursor);
+    detail::FormatAllDigits(detail::Abs(interval), &cursor);
+    if (interval < 0) detail::FormatOneChar('-', &cursor);
+
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+};
+
+template <>
+class StringFormatter<DayTimeIntervalType> {
+ public:
+  using value_type = DayTimeIntervalType::DayMilliseconds;
+
+  explicit StringFormatter(const DataType*) {}
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type interval, Appender&& append) {
+    constexpr size_t buffer_size =
+        /*d, ms*/ 3 + /*negative signs*/ 2 +
+        /*days/milliseconds*/ 2 * detail::Digits10(std::numeric_limits<int32_t>::max());
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+
+    detail::FormatOneChar('s', &cursor);
+    detail::FormatOneChar('m', &cursor);
+    detail::FormatAllDigits(detail::Abs(interval.milliseconds), &cursor);
+    if (interval.milliseconds < 0) detail::FormatOneChar('-', &cursor);
+
+    detail::FormatOneChar('d', &cursor);
+    detail::FormatAllDigits(detail::Abs(interval.days), &cursor);
+    if (interval.days < 0) detail::FormatOneChar('-', &cursor);
+
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+};
+
+template <>
+class StringFormatter<MonthDayNanoIntervalType> {
+ public:
+  using value_type = MonthDayNanoIntervalType::MonthDayNanos;
+
+  explicit StringFormatter(const DataType*) {}
+
+  template <typename Appender>
+  Return<Appender> operator()(value_type interval, Appender&& append) {
+    constexpr size_t buffer_size =
+        /*m, d, ns*/ 4 + /*negative signs*/ 3 +
+        /*months/days*/ 2 * detail::Digits10(std::numeric_limits<int32_t>::max()) +
+        /*nanoseconds*/ detail::Digits10(std::numeric_limits<int64_t>::max());
+    std::array<char, buffer_size> buffer;
+    char* cursor = buffer.data() + buffer_size;
+
+    detail::FormatOneChar('s', &cursor);
+    detail::FormatOneChar('n', &cursor);
+    detail::FormatAllDigits(detail::Abs(interval.nanoseconds), &cursor);
+    if (interval.nanoseconds < 0) detail::FormatOneChar('-', &cursor);
+
+    detail::FormatOneChar('d', &cursor);
+    detail::FormatAllDigits(detail::Abs(interval.days), &cursor);
+    if (interval.days < 0) detail::FormatOneChar('-', &cursor);
+
+    detail::FormatOneChar('M', &cursor);
+    detail::FormatAllDigits(detail::Abs(interval.months), &cursor);
+    if (interval.months < 0) detail::FormatOneChar('-', &cursor);
+
+    return append(detail::ViewDigitBuffer(buffer, cursor));
+  }
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/functional.h b/pyarrow/include/arrow/util/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..41e268852fa6ea76ce195240498bb11277a7228c
--- /dev/null
+++ b/pyarrow/include/arrow/util/functional.h
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <tuple>
+#include <type_traits>
+
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+struct Empty {
+  static Result<Empty> ToResult(Status s) {
+    if (ARROW_PREDICT_TRUE(s.ok())) {
+      return Empty{};
+    }
+    return s;
+  }
+};
+
+/// Helper struct for examining lambdas and other callables.
+/// TODO(ARROW-12655) support function pointers
+struct call_traits {
+ public:
+  template <typename R, typename... A>
+  static std::false_type is_overloaded_impl(R(A...));
+
+  template <typename F>
+  static std::false_type is_overloaded_impl(decltype(&F::operator())*);
+
+  template <typename F>
+  static std::true_type is_overloaded_impl(...);
+
+  template <typename F, typename R, typename... A>
+  static R return_type_impl(R (F::*)(A...));
+
+  template <typename F, typename R, typename... A>
+  static R return_type_impl(R (F::*)(A...) const);
+
+  template <std::size_t I, typename F, typename R, typename... A>
+  static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+      R (F::*)(A...));
+
+  template <std::size_t I, typename F, typename R, typename... A>
+  static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+      R (F::*)(A...) const);
+
+  template <std::size_t I, typename F, typename R, typename... A>
+  static typename std::tuple_element<I, std::tuple<A...>>::type argument_type_impl(
+      R (F::*)(A...) &&);
+
+  template <typename F, typename R, typename... A>
+  static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...));
+
+  template <typename F, typename R, typename... A>
+  static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...)
+                                                                           const);
+
+  template <typename F, typename R, typename... A>
+  static std::integral_constant<int, sizeof...(A)> argument_count_impl(R (F::*)(A...) &&);
+
+  /// bool constant indicating whether F is a callable with more than one possible
+  /// signature. Will be true_type for objects which define multiple operator() or which
+  /// define a template operator()
+  template <typename F>
+  using is_overloaded =
+      decltype(is_overloaded_impl<typename std::decay<F>::type>(NULLPTR));
+
+  template <typename F, typename T = void>
+  using enable_if_overloaded = typename std::enable_if<is_overloaded<F>::value, T>::type;
+
+  template <typename F, typename T = void>
+  using disable_if_overloaded =
+      typename std::enable_if<!is_overloaded<F>::value, T>::type;
+
+  /// If F is not overloaded, the argument types of its call operator can be
+  /// extracted via call_traits::argument_type<Index, F>
+  template <std::size_t I, typename F>
+  using argument_type = decltype(argument_type_impl<I>(&std::decay<F>::type::operator()));
+
+  template <typename F>
+  using argument_count = decltype(argument_count_impl(&std::decay<F>::type::operator()));
+
+  template <typename F>
+  using return_type = decltype(return_type_impl(&std::decay<F>::type::operator()));
+
+  template <typename F, typename T, typename RT = T>
+  using enable_if_return =
+      typename std::enable_if<std::is_same<return_type<F>, T>::value, RT>;
+
+  template <typename T, typename R = void>
+  using enable_if_empty = typename std::enable_if<std::is_same<T, Empty>::value, R>::type;
+
+  template <typename T, typename R = void>
+  using enable_if_not_empty =
+      typename std::enable_if<!std::is_same<T, Empty>::value, R>::type;
+};
+
+/// A type erased callable object which may only be invoked once.
+/// It can be constructed from any lambda which matches the provided call signature.
+/// Invoking it results in destruction of the lambda, freeing any state/references
+/// immediately. Invoking a default constructed FnOnce or one which has already been
+/// invoked will segfault.
+template <typename Signature>
+class FnOnce;
+
+template <typename R, typename... A>
+class FnOnce<R(A...)> {
+ public:
+  FnOnce() = default;
+
+  template <typename Fn,
+            typename = typename std::enable_if<std::is_convertible<
+                decltype(std::declval<Fn&&>()(std::declval<A>()...)), R>::value>::type>
+  FnOnce(Fn fn) : impl_(new FnImpl<Fn>(std::move(fn))) {  // NOLINT runtime/explicit
+  }
+
+  explicit operator bool() const { return impl_ != NULLPTR; }
+
+  R operator()(A... a) && {
+    auto bye = std::move(impl_);
+    return bye->invoke(std::forward<A&&>(a)...);
+  }
+
+ private:
+  struct Impl {
+    virtual ~Impl() = default;
+    virtual R invoke(A&&... a) = 0;
+  };
+
+  template <typename Fn>
+  struct FnImpl : Impl {
+    explicit FnImpl(Fn fn) : fn_(std::move(fn)) {}
+    R invoke(A&&... a) override { return std::move(fn_)(std::forward<A&&>(a)...); }
+    Fn fn_;
+  };
+
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/future.h b/pyarrow/include/arrow/util/future.h
new file mode 100644
index 0000000000000000000000000000000000000000..0aa2842703712d0245f47c2b0e1885067a4f8f90
--- /dev/null
+++ b/pyarrow/include/arrow/util/future.h
@@ -0,0 +1,882 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <atomic>
+#include <cmath>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/config.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/tracing.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+template <typename>
+struct EnsureFuture;
+
+namespace detail {
+
+template <typename>
+struct is_future : std::false_type {};
+
+template <typename T>
+struct is_future<Future<T>> : std::true_type {};
+
+template <typename Signature, typename Enable = void>
+struct result_of;
+
+template <typename Fn, typename... A>
+struct result_of<Fn(A...),
+                 internal::void_t<decltype(std::declval<Fn>()(std::declval<A>()...))>> {
+  using type = decltype(std::declval<Fn>()(std::declval<A>()...));
+};
+
+template <typename Signature>
+using result_of_t = typename result_of<Signature>::type;
+
+// Helper to find the synchronous counterpart for a Future
+template <typename T>
+struct SyncType {
+  using type = Result<T>;
+};
+
+template <>
+struct SyncType<internal::Empty> {
+  using type = Status;
+};
+
+template <typename Fn>
+using first_arg_is_status =
+    std::is_same<typename std::decay<internal::call_traits::argument_type<0, Fn>>::type,
+                 Status>;
+
+template <typename Fn, typename Then, typename Else,
+          typename Count = internal::call_traits::argument_count<Fn>>
+using if_has_no_args = typename std::conditional<Count::value == 0, Then, Else>::type;
+
+/// Creates a callback that can be added to a future to mark a `dest` future finished
+template <typename Source, typename Dest, bool SourceEmpty = Source::is_empty,
+          bool DestEmpty = Dest::is_empty>
+struct MarkNextFinished {};
+
+/// If the source and dest are both empty we can pass on the status
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, true, true> {
+  void operator()(const Status& status) && { next.MarkFinished(status); }
+  Dest next;
+};
+
+/// If the source is not empty but the dest is then we can take the
+/// status out of the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, true> {
+  void operator()(const Result<typename Source::ValueType>& res) && {
+    next.MarkFinished(internal::Empty::ToResult(res.status()));
+  }
+  Dest next;
+};
+
+/// If neither are empty we pass on the result
+template <typename Source, typename Dest>
+struct MarkNextFinished<Source, Dest, false, false> {
+  void operator()(const Result<typename Source::ValueType>& res) && {
+    next.MarkFinished(res);
+  }
+  Dest next;
+};
+
+/// Helper that contains information about how to apply a continuation
+struct ContinueFuture {
+  template <typename Return>
+  struct ForReturnImpl;
+
+  template <typename Return>
+  using ForReturn = typename ForReturnImpl<Return>::type;
+
+  template <typename Signature>
+  using ForSignature = ForReturn<result_of_t<Signature>>;
+
+  // If the callback returns void then we return Future<> that always finishes OK.
+  template <typename ContinueFunc, typename... Args,
+            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+            typename NextFuture = ForReturn<ContinueResult>>
+  typename std::enable_if<std::is_void<ContinueResult>::value>::type operator()(
+      NextFuture next, ContinueFunc&& f, Args&&... a) const {
+    std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+    next.MarkFinished();
+  }
+
+  /// If the callback returns a non-future then we return Future<T>
+  /// and mark the future finished with the callback result.  It will get promoted
+  /// to Result<T> as part of MarkFinished if it isn't already.
+  ///
+  /// If the callback returns Status and we return Future<> then also send the callback
+  /// result as-is to the destination future.
+  template <typename ContinueFunc, typename... Args,
+            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+            typename NextFuture = ForReturn<ContinueResult>>
+  typename std::enable_if<
+      !std::is_void<ContinueResult>::value && !is_future<ContinueResult>::value &&
+      (!NextFuture::is_empty || std::is_same<ContinueResult, Status>::value)>::type
+  operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+    next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...));
+  }
+
+  /// If the callback returns a Result and the next future is Future<> then we mark
+  /// the future finished with the callback result.
+  ///
+  /// It may seem odd that the next future is Future<> when the callback returns a
+  /// result but this can occur if the OnFailure callback returns a result while the
+  /// OnSuccess callback is void/Status (e.g. you would get this calling the one-arg
+  /// version of Then with an OnSuccess callback that returns void)
+  template <typename ContinueFunc, typename... Args,
+            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+            typename NextFuture = ForReturn<ContinueResult>>
+  typename std::enable_if<!std::is_void<ContinueResult>::value &&
+                          !is_future<ContinueResult>::value && NextFuture::is_empty &&
+                          !std::is_same<ContinueResult, Status>::value>::type
+  operator()(NextFuture next, ContinueFunc&& f, Args&&... a) const {
+    next.MarkFinished(std::forward<ContinueFunc>(f)(std::forward<Args>(a)...).status());
+  }
+
+  /// If the callback returns a Future<T> then we return Future<T>.  We create a new
+  /// future and add a callback to the future given to us by the user that forwards the
+  /// result to the future we just created
+  template <typename ContinueFunc, typename... Args,
+            typename ContinueResult = result_of_t<ContinueFunc && (Args && ...)>,
+            typename NextFuture = ForReturn<ContinueResult>>
+  typename std::enable_if<is_future<ContinueResult>::value>::type operator()(
+      NextFuture next, ContinueFunc&& f, Args&&... a) const {
+    ContinueResult signal_to_complete_next =
+        std::forward<ContinueFunc>(f)(std::forward<Args>(a)...);
+    MarkNextFinished<ContinueResult, NextFuture> callback{std::move(next)};
+    signal_to_complete_next.AddCallback(std::move(callback));
+  }
+
+  /// Helpers to conditionally ignore arguments to ContinueFunc
+  template <typename ContinueFunc, typename NextFuture, typename... Args>
+  void IgnoringArgsIf(std::true_type, NextFuture&& next, ContinueFunc&& f,
+                      Args&&...) const {
+    operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f));
+  }
+  template <typename ContinueFunc, typename NextFuture, typename... Args>
+  void IgnoringArgsIf(std::false_type, NextFuture&& next, ContinueFunc&& f,
+                      Args&&... a) const {
+    operator()(std::forward<NextFuture>(next), std::forward<ContinueFunc>(f),
+               std::forward<Args>(a)...);
+  }
+};
+
+/// Helper struct which tells us what kind of Future gets returned from `Then` based on
+/// the return type of the OnSuccess callback
+template <>
+struct ContinueFuture::ForReturnImpl<void> {
+  using type = Future<>;
+};
+
+template <>
+struct ContinueFuture::ForReturnImpl<Status> {
+  using type = Future<>;
+};
+
+template <typename R>
+struct ContinueFuture::ForReturnImpl {
+  using type = Future<R>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Result<T>> {
+  using type = Future<T>;
+};
+
+template <typename T>
+struct ContinueFuture::ForReturnImpl<Future<T>> {
+  using type = Future<T>;
+};
+
+}  // namespace detail
+
+/// A Future's execution or completion status
+enum class FutureState : int8_t { PENDING, SUCCESS, FAILURE };
+
+inline bool IsFutureFinished(FutureState state) { return state != FutureState::PENDING; }
+
+/// \brief Describe whether the callback should be scheduled or run synchronously
+enum class ShouldSchedule {
+  /// Always run the callback synchronously (the default)
+  Never = 0,
+  /// Schedule a new task only if the future is not finished when the
+  /// callback is added
+  IfUnfinished = 1,
+  /// Always schedule the callback as a new task
+  Always = 2,
+  /// Schedule a new task only if it would run on an executor other than
+  /// the specified executor.
+  IfDifferentExecutor = 3,
+};
+
+/// \brief Options that control how a continuation is run
+struct CallbackOptions {
+  /// Describe whether the callback should be run synchronously or scheduled
+  ShouldSchedule should_schedule = ShouldSchedule::Never;
+  /// If the callback is scheduled then this is the executor it should be scheduled
+  /// on.  If this is NULL then should_schedule must be Never
+  internal::Executor* executor = NULLPTR;
+
+  static CallbackOptions Defaults() { return {}; }
+};
+
+// Untyped private implementation
+class ARROW_EXPORT FutureImpl : public std::enable_shared_from_this<FutureImpl> {
+ public:
+  FutureImpl();
+  virtual ~FutureImpl() = default;
+
+  FutureState state() { return state_.load(); }
+
+  static std::unique_ptr<FutureImpl> Make();
+  static std::unique_ptr<FutureImpl> MakeFinished(FutureState state);
+
+#ifdef ARROW_WITH_OPENTELEMETRY
+  void SetSpan(util::tracing::Span* span) { span_ = span; }
+#endif
+
+  // Future API
+  void MarkFinished();
+  void MarkFailed();
+  void Wait();
+  bool Wait(double seconds);
+  template <typename ValueType>
+  Result<ValueType>* CastResult() const {
+    return static_cast<Result<ValueType>*>(result_.get());
+  }
+
+  using Callback = internal::FnOnce<void(const FutureImpl& impl)>;
+  void AddCallback(Callback callback, CallbackOptions opts);
+  bool TryAddCallback(const std::function<Callback()>& callback_factory,
+                      CallbackOptions opts);
+
+  std::atomic<FutureState> state_{FutureState::PENDING};
+
+  // Type erased storage for arbitrary results
+  // XXX small objects could be stored inline instead of boxed in a pointer
+  using Storage = std::unique_ptr<void, void (*)(void*)>;
+  Storage result_{NULLPTR, NULLPTR};
+
+  struct CallbackRecord {
+    Callback callback;
+    CallbackOptions options;
+  };
+  std::vector<CallbackRecord> callbacks_;
+#ifdef ARROW_WITH_OPENTELEMETRY
+  util::tracing::Span* span_ = NULLPTR;
+#endif
+};
+
+// ---------------------------------------------------------------------
+// Public API
+
+/// \brief EXPERIMENTAL A std::future-like class with more functionality.
+///
+/// A Future represents the results of a past or future computation.
+/// The Future API has two sides: a producer side and a consumer side.
+///
+/// The producer API allows creating a Future and setting its result or
+/// status, possibly after running a computation function.
+///
+/// The consumer API allows querying a Future's current state, wait for it
+/// to complete, and composing futures with callbacks.
+template <typename T>
+class [[nodiscard]] Future {
+ public:
+  using ValueType = T;
+  using SyncType = typename detail::SyncType<T>::type;
+  static constexpr bool is_empty = std::is_same<T, internal::Empty>::value;
+  // The default constructor creates an invalid Future.  Use Future::Make()
+  // for a valid Future.  This constructor is mostly for the convenience
+  // of being able to presize a vector of Futures.
+  Future() = default;
+
+#ifdef ARROW_WITH_OPENTELEMETRY
+  void SetSpan(util::tracing::Span* span) { impl_->SetSpan(span); }
+#endif
+
+  // Consumer API
+
+  bool is_valid() const { return impl_ != NULLPTR; }
+
+  /// \brief Return the Future's current state
+  ///
+  /// A return value of PENDING is only indicative, as the Future can complete
+  /// concurrently.  A return value of FAILURE or SUCCESS is definitive, though.
+  FutureState state() const {
+    CheckValid();
+    return impl_->state();
+  }
+
+  /// \brief Whether the Future is finished
+  ///
+  /// A false return value is only indicative, as the Future can complete
+  /// concurrently.  A true return value is definitive, though.
+  bool is_finished() const {
+    CheckValid();
+    return IsFutureFinished(impl_->state());
+  }
+
+  /// \brief Wait for the Future to complete and return its Result
+  const Result<ValueType>& result() const& {
+    Wait();
+    return *GetResult();
+  }
+
+  /// \brief Returns an rvalue to the result.  This method is potentially unsafe
+  ///
+  /// The future is not the unique owner of the result, copies of a future will
+  /// also point to the same result.  You must make sure that no other copies
+  /// of the future exist.  Attempts to add callbacks after you move the result
+  /// will result in undefined behavior.
+  Result<ValueType>&& MoveResult() {
+    Wait();
+    return std::move(*GetResult());
+  }
+
+  /// \brief Wait for the Future to complete and return its Status
+  const Status& status() const { return result().status(); }
+
+  /// \brief Future<T> is convertible to Future<>, which views only the
+  /// Status of the original. Marking the returned Future Finished is not supported.
+  explicit operator Future<>() const {
+    Future<> status_future;
+    status_future.impl_ = impl_;
+    return status_future;
+  }
+
+  /// \brief Wait for the Future to complete
+  void Wait() const {
+    CheckValid();
+    impl_->Wait();
+  }
+
+  /// \brief Wait for the Future to complete, or for the timeout to expire
+  ///
+  /// `true` is returned if the Future completed, `false` if the timeout expired.
+  /// Note a `false` value is only indicative, as the Future can complete
+  /// concurrently.
+  bool Wait(double seconds) const {
+    CheckValid();
+    return impl_->Wait(seconds);
+  }
+
+  // Producer API
+
+  /// \brief Producer API: mark Future finished
+  ///
+  /// The Future's result is set to `res`.
+  void MarkFinished(Result<ValueType> res) { DoMarkFinished(std::move(res)); }
+
+  /// \brief Mark a Future<> completed with the provided Status.
+  template <typename E = ValueType, typename = typename std::enable_if<
+                                        std::is_same<E, internal::Empty>::value>::type>
+  void MarkFinished(Status s = Status::OK()) {
+    return DoMarkFinished(E::ToResult(std::move(s)));
+  }
+
+  /// \brief Producer API: instantiate a valid Future
+  ///
+  /// The Future's state is initialized with PENDING.  If you are creating a future with
+  /// this method you must ensure that future is eventually completed (with success or
+  /// failure).  Creating a future, returning it, and never completing the future can lead
+  /// to memory leaks (for example, see Loop).
+  static Future Make() {
+    Future fut;
+    fut.impl_ = FutureImpl::Make();
+    return fut;
+  }
+
+  /// \brief Producer API: instantiate a finished Future
+  static Future<ValueType> MakeFinished(Result<ValueType> res) {
+    Future<ValueType> fut;
+    fut.InitializeFromResult(std::move(res));
+    return fut;
+  }
+
+  /// \brief Make a finished Future<> with the provided Status.
+  template <typename E = ValueType, typename = typename std::enable_if<
+                                        std::is_same<E, internal::Empty>::value>::type>
+  static Future<> MakeFinished(Status s = Status::OK()) {
+    return MakeFinished(E::ToResult(std::move(s)));
+  }
+
+  struct WrapResultOnComplete {
+    template <typename OnComplete>
+    struct Callback {
+      void operator()(const FutureImpl& impl) && {
+        std::move(on_complete)(*impl.CastResult<ValueType>());
+      }
+      OnComplete on_complete;
+    };
+  };
+
+  struct WrapStatusyOnComplete {
+    template <typename OnComplete>
+    struct Callback {
+      static_assert(std::is_same<internal::Empty, ValueType>::value,
+                    "Only callbacks for Future<> should accept Status and not Result");
+
+      void operator()(const FutureImpl& impl) && {
+        std::move(on_complete)(impl.CastResult<ValueType>()->status());
+      }
+      OnComplete on_complete;
+    };
+  };
+
+  template <typename OnComplete>
+  using WrapOnComplete = typename std::conditional<
+      detail::first_arg_is_status<OnComplete>::value, WrapStatusyOnComplete,
+      WrapResultOnComplete>::type::template Callback<OnComplete>;
+
+  /// \brief Consumer API: Register a callback to run when this future completes
+  ///
+  /// The callback should receive the result of the future (const Result<T>&)
+  /// For a void or statusy future this should be (const Status&)
+  ///
+  /// There is no guarantee to the order in which callbacks will run.  In
+  /// particular, callbacks added while the future is being marked complete
+  /// may be executed immediately, ahead of, or even the same time as, other
+  /// callbacks that have been previously added.
+  ///
+  /// WARNING: callbacks may hold arbitrary references, including cyclic references.
+  /// Since callbacks will only be destroyed after they are invoked, this can lead to
+  /// memory leaks if a Future is never marked finished (abandoned):
+  ///
+  /// {
+  ///     auto fut = Future<>::Make();
+  ///     fut.AddCallback([fut]() {});
+  /// }
+  ///
+  /// In this example `fut` falls out of scope but is not destroyed because it holds a
+  /// cyclic reference to itself through the callback.
+  template <typename OnComplete, typename Callback = WrapOnComplete<OnComplete>>
+  void AddCallback(OnComplete on_complete,
+                   CallbackOptions opts = CallbackOptions::Defaults()) const {
+    // We know impl_ will not be dangling when invoking callbacks because at least one
+    // thread will be waiting for MarkFinished to return. Thus it's safe to keep a
+    // weak reference to impl_ here
+    impl_->AddCallback(Callback{std::move(on_complete)}, opts);
+  }
+
+  /// \brief Overload of AddCallback that will return false instead of running
+  /// synchronously
+  ///
+  /// This overload will guarantee the callback is never run synchronously.  If the future
+  /// is already finished then it will simply return false.  This can be useful to avoid
+  /// stack overflow in a situation where you have recursive Futures.  For an example
+  /// see the Loop function
+  ///
+  /// Takes in a callback factory function to allow moving callbacks (the factory function
+  /// will only be called if the callback can successfully be added)
+  ///
+  /// Returns true if a callback was actually added and false if the callback failed
+  /// to add because the future was marked complete.
+  template <typename CallbackFactory,
+            typename OnComplete = detail::result_of_t<CallbackFactory()>,
+            typename Callback = WrapOnComplete<OnComplete>>
+  bool TryAddCallback(CallbackFactory callback_factory,
+                      CallbackOptions opts = CallbackOptions::Defaults()) const {
+    return impl_->TryAddCallback([&]() { return Callback{callback_factory()}; }, opts);
+  }
+
+  template <typename OnSuccess, typename OnFailure>
+  struct ThenOnComplete {
+    static constexpr bool has_no_args =
+        internal::call_traits::argument_count<OnSuccess>::value == 0;
+
+    using ContinuedFuture = detail::ContinueFuture::ForSignature<
+        detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+    static_assert(
+        std::is_same<detail::ContinueFuture::ForSignature<OnFailure && (const Status&)>,
+                     ContinuedFuture>::value,
+        "OnSuccess and OnFailure must continue with the same future type");
+
+    struct DummyOnSuccess {
+      void operator()(const T&);
+    };
+    using OnSuccessArg = typename std::decay<internal::call_traits::argument_type<
+        0, detail::if_has_no_args<OnSuccess, DummyOnSuccess, OnSuccess>>>::type;
+
+    static_assert(
+        !std::is_same<OnSuccessArg, typename EnsureResult<OnSuccessArg>::type>::value,
+        "OnSuccess' argument should not be a Result");
+
+    void operator()(const Result<T>& result) && {
+      detail::ContinueFuture continue_future;
+      if (ARROW_PREDICT_TRUE(result.ok())) {
+        // move on_failure to a(n immediately destroyed) temporary to free its resources
+        ARROW_UNUSED(OnFailure(std::move(on_failure)));
+        continue_future.IgnoringArgsIf(
+            detail::if_has_no_args<OnSuccess, std::true_type, std::false_type>{},
+            std::move(next), std::move(on_success), result.ValueOrDie());
+      } else {
+        ARROW_UNUSED(OnSuccess(std::move(on_success)));
+        continue_future(std::move(next), std::move(on_failure), result.status());
+      }
+    }
+
+    OnSuccess on_success;
+    OnFailure on_failure;
+    ContinuedFuture next;
+  };
+
+  template <typename OnSuccess>
+  struct PassthruOnFailure {
+    using ContinuedFuture = detail::ContinueFuture::ForSignature<
+        detail::if_has_no_args<OnSuccess, OnSuccess && (), OnSuccess && (const T&)>>;
+
+    Result<typename ContinuedFuture::ValueType> operator()(const Status& s) { return s; }
+  };
+
+  /// \brief Consumer API: Register a continuation to run when this future completes
+  ///
+  /// The continuation will run in the same thread that called MarkFinished (whatever
+  /// callback is registered with this function will run before MarkFinished returns).
+  /// Avoid long-running callbacks in favor of submitting a task to an Executor and
+  /// returning the future.
+  ///
+  /// Two callbacks are supported:
+  /// - OnSuccess, called with the result (const ValueType&) on successful completion.
+  ///              for an empty future this will be called with nothing ()
+  /// - OnFailure, called with the error (const Status&) on failed completion.
+  ///              This callback is optional and defaults to a passthru of any errors.
+  ///
+  /// Then() returns a Future whose ValueType is derived from the return type of the
+  /// callbacks. If a callback returns:
+  /// - void, a Future<> will be returned which will completes successfully as soon
+  ///   as the callback runs.
+  /// - Status, a Future<> will be returned which will complete with the returned Status
+  ///   as soon as the callback runs.
+  /// - V or Result<V>, a Future<V> will be returned which will complete with the result
+  ///   of invoking the callback as soon as the callback runs.
+  /// - Future<V>, a Future<V> will be returned which will be marked complete when the
+  ///   future returned by the callback completes (and will complete with the same
+  ///   result).
+  ///
+  /// The continued Future type must be the same for both callbacks.
+  ///
+  /// Note that OnFailure can swallow errors, allowing continued Futures to successfully
+  /// complete even if this Future fails.
+  ///
+  /// If this future is already completed then the callback will be run immediately
+  /// and the returned future may already be marked complete.
+  ///
+  /// See AddCallback for general considerations when writing callbacks.
+  template <typename OnSuccess, typename OnFailure = PassthruOnFailure<OnSuccess>,
+            typename OnComplete = ThenOnComplete<OnSuccess, OnFailure>,
+            typename ContinuedFuture = typename OnComplete::ContinuedFuture>
+  ContinuedFuture Then(OnSuccess on_success, OnFailure on_failure = {},
+                       CallbackOptions options = CallbackOptions::Defaults()) const {
+    auto next = ContinuedFuture::Make();
+    AddCallback(OnComplete{std::forward<OnSuccess>(on_success),
+                           std::forward<OnFailure>(on_failure), next},
+                options);
+    return next;
+  }
+
+  /// \brief Implicit constructor to create a finished future from a value
+  Future(ValueType val) : Future() {  // NOLINT runtime/explicit
+    impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+    SetResult(std::move(val));
+  }
+
+  /// \brief Implicit constructor to create a future from a Result, enabling use
+  ///     of macros like ARROW_ASSIGN_OR_RAISE.
+  Future(Result<ValueType> res) : Future() {  // NOLINT runtime/explicit
+    if (ARROW_PREDICT_TRUE(res.ok())) {
+      impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+    } else {
+      impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+    }
+    SetResult(std::move(res));
+  }
+
+  /// \brief Implicit constructor to create a future from a Status, enabling use
+  ///     of macros like ARROW_RETURN_NOT_OK.
+  Future(Status s)  // NOLINT runtime/explicit
+      : Future(Result<ValueType>(std::move(s))) {}
+
+ protected:
+  void InitializeFromResult(Result<ValueType> res) {
+    if (ARROW_PREDICT_TRUE(res.ok())) {
+      impl_ = FutureImpl::MakeFinished(FutureState::SUCCESS);
+    } else {
+      impl_ = FutureImpl::MakeFinished(FutureState::FAILURE);
+    }
+    SetResult(std::move(res));
+  }
+
+  void Initialize() { impl_ = FutureImpl::Make(); }
+
+  Result<ValueType>* GetResult() const { return impl_->CastResult<ValueType>(); }
+
+  void SetResult(Result<ValueType> res) {
+    impl_->result_ = {new Result<ValueType>(std::move(res)),
+                      [](void* p) { delete static_cast<Result<ValueType>*>(p); }};
+  }
+
+  void DoMarkFinished(Result<ValueType> res) {
+    SetResult(std::move(res));
+
+    if (ARROW_PREDICT_TRUE(GetResult()->ok())) {
+      impl_->MarkFinished();
+    } else {
+      impl_->MarkFailed();
+    }
+  }
+
+  void CheckValid() const {
+#ifndef NDEBUG
+    if (!is_valid()) {
+      Status::Invalid("Invalid Future (default-initialized?)").Abort();
+    }
+#endif
+  }
+
+  explicit Future(std::shared_ptr<FutureImpl> impl) : impl_(std::move(impl)) {}
+
+  std::shared_ptr<FutureImpl> impl_;
+
+  friend struct detail::ContinueFuture;
+
+  template <typename U>
+  friend class Future;
+  friend class WeakFuture<T>;
+
+  FRIEND_TEST(FutureRefTest, ChainRemoved);
+  FRIEND_TEST(FutureRefTest, TailRemoved);
+  FRIEND_TEST(FutureRefTest, HeadRemoved);
+};
+
+template <typename T>
+typename Future<T>::SyncType FutureToSync(const Future<T>& fut) {
+  return fut.result();
+}
+
+template <>
+inline typename Future<internal::Empty>::SyncType FutureToSync<internal::Empty>(
+    const Future<internal::Empty>& fut) {
+  return fut.status();
+}
+
+template <>
+inline Future<>::Future(Status s) : Future(internal::Empty::ToResult(std::move(s))) {}
+
+template <typename T>
+class WeakFuture {
+ public:
+  explicit WeakFuture(const Future<T>& future) : impl_(future.impl_) {}
+
+  Future<T> get() { return Future<T>{impl_.lock()}; }
+
+ private:
+  std::weak_ptr<FutureImpl> impl_;
+};
+
+/// \defgroup future-utilities Functions for working with Futures
+/// @{
+
+/// If a Result<Future> holds an error instead of a Future, construct a finished Future
+/// holding that error.
+template <typename T>
+static Future<T> DeferNotOk(Result<Future<T>> maybe_future) {
+  if (ARROW_PREDICT_FALSE(!maybe_future.ok())) {
+    return Future<T>::MakeFinished(std::move(maybe_future).status());
+  }
+  return std::move(maybe_future).MoveValueUnsafe();
+}
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future's result is a vector of the results of `futures`.
+/// Note that this future will never be marked "failed"; failed results
+/// will be stored in the result vector alongside successful results.
+template <typename T>
+Future<std::vector<Result<T>>> All(std::vector<Future<T>> futures) {
+  struct State {
+    explicit State(std::vector<Future<T>> f)
+        : futures(std::move(f)), n_remaining(futures.size()) {}
+
+    std::vector<Future<T>> futures;
+    std::atomic<size_t> n_remaining;
+  };
+
+  if (futures.size() == 0) {
+    return {std::vector<Result<T>>{}};
+  }
+
+  auto state = std::make_shared<State>(std::move(futures));
+
+  auto out = Future<std::vector<Result<T>>>::Make();
+  for (const Future<T>& future : state->futures) {
+    future.AddCallback([state, out](const Result<T>&) mutable {
+      if (state->n_remaining.fetch_sub(1) != 1) return;
+
+      std::vector<Result<T>> results(state->futures.size());
+      for (size_t i = 0; i < results.size(); ++i) {
+        results[i] = state->futures[i].result();
+      }
+      out.MarkFinished(std::move(results));
+    });
+  }
+  return out;
+}
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future will be marked complete if all `futures` complete
+/// successfully. Otherwise, it will be marked failed with the status of
+/// the first failing future.
+ARROW_EXPORT
+Future<> AllComplete(const std::vector<Future<>>& futures);
+
+/// \brief Create a Future which completes when all of `futures` complete.
+///
+/// The future will finish with an ok status if all `futures` finish with
+/// an ok status. Otherwise, it will be marked failed with the status of
+/// one of the failing futures.
+///
+/// Unlike AllComplete this Future will not complete immediately when a
+/// failure occurs.  It will wait until all futures have finished.
+ARROW_EXPORT
+Future<> AllFinished(const std::vector<Future<>>& futures);
+
+/// @}
+
+struct Continue {
+  template <typename T>
+  operator std::optional<T>() && {  // NOLINT explicit
+    return {};
+  }
+};
+
+template <typename T = internal::Empty>
+std::optional<T> Break(T break_value = {}) {
+  return std::optional<T>{std::move(break_value)};
+}
+
+template <typename T = internal::Empty>
+using ControlFlow = std::optional<T>;
+
+/// \brief Loop through an asynchronous sequence
+///
+/// \param[in] iterate A generator of Future<ControlFlow<BreakValue>>. On completion
+/// of each yielded future the resulting ControlFlow will be examined. A Break will
+/// terminate the loop, while a Continue will re-invoke `iterate`.
+///
+/// \return A future which will complete when a Future returned by iterate completes with
+/// a Break
+template <typename Iterate,
+          typename Control = typename detail::result_of_t<Iterate()>::ValueType,
+          typename BreakValueType = typename Control::value_type>
+Future<BreakValueType> Loop(Iterate iterate) {
+  struct Callback {
+    bool CheckForTermination(const Result<Control>& control_res) {
+      if (!control_res.ok()) {
+        break_fut.MarkFinished(control_res.status());
+        return true;
+      }
+      if (control_res->has_value()) {
+        break_fut.MarkFinished(**control_res);
+        return true;
+      }
+      return false;
+    }
+
+    void operator()(const Result<Control>& maybe_control) && {
+      if (CheckForTermination(maybe_control)) return;
+
+      auto control_fut = iterate();
+      while (true) {
+        if (control_fut.TryAddCallback([this]() { return *this; })) {
+          // Adding a callback succeeded; control_fut was not finished
+          // and we must wait to CheckForTermination.
+          return;
+        }
+        // Adding a callback failed; control_fut was finished and we
+        // can CheckForTermination immediately. This also avoids recursion and potential
+        // stack overflow.
+        if (CheckForTermination(control_fut.result())) return;
+
+        control_fut = iterate();
+      }
+    }
+
+    Iterate iterate;
+
+    // If the future returned by control_fut is never completed then we will be hanging on
+    // to break_fut forever even if the listener has given up listening on it.  Instead we
+    // rely on the fact that a producer (the caller of Future<>::Make) is always
+    // responsible for completing the futures they create.
+    // TODO: Could avoid this kind of situation with "future abandonment" similar to mesos
+    Future<BreakValueType> break_fut;
+  };
+
+  auto break_fut = Future<BreakValueType>::Make();
+  auto control_fut = iterate();
+  control_fut.AddCallback(Callback{std::move(iterate), break_fut});
+
+  return break_fut;
+}
+
+inline Future<> ToFuture(Status status) {
+  return Future<>::MakeFinished(std::move(status));
+}
+
+template <typename T>
+Future<T> ToFuture(T value) {
+  return Future<T>::MakeFinished(std::move(value));
+}
+
+template <typename T>
+Future<T> ToFuture(Result<T> maybe_value) {
+  return Future<T>::MakeFinished(std::move(maybe_value));
+}
+
+template <typename T>
+Future<T> ToFuture(Future<T> fut) {
+  return fut;
+}
+
+template <typename T>
+struct EnsureFuture {
+  using type = decltype(ToFuture(std::declval<T>()));
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/hash_util.h b/pyarrow/include/arrow/util/hash_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b3de2208935fa8c7c8afbc83ba9982f4907491d
--- /dev/null
+++ b/pyarrow/include/arrow/util/hash_util.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+namespace internal {
+
+// ----------------------------------------------------------------------
+// BEGIN Hash utilities from Boost
+
+namespace detail {
+
+#if defined(_MSC_VER)
+#  define ARROW_HASH_ROTL32(x, r) _rotl(x, r)
+#else
+#  define ARROW_HASH_ROTL32(x, r) (x << r) | (x >> (32 - r))
+#endif
+
+template <typename SizeT>
+inline void hash_combine_impl(SizeT& seed, SizeT value) {
+  seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline void hash_combine_impl(uint32_t& h1, uint32_t k1) {
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  k1 *= c1;
+  k1 = ARROW_HASH_ROTL32(k1, 15);
+  k1 *= c2;
+
+  h1 ^= k1;
+  h1 = ARROW_HASH_ROTL32(h1, 13);
+  h1 = h1 * 5 + 0xe6546b64;
+}
+
+#undef ARROW_HASH_ROTL32
+
+}  // namespace detail
+
+template <class T>
+inline void hash_combine(std::size_t& seed, T const& v) {
+  std::hash<T> hasher;
+  return ::arrow::internal::detail::hash_combine_impl(seed, hasher(v));
+}
+
+// END Hash utilities from Boost
+// ----------------------------------------------------------------------
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/hashing.h b/pyarrow/include/arrow/util/hashing.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac3beea2660720e034d5d052de32ff2e2252337d
--- /dev/null
+++ b/pyarrow/include/arrow/util/hashing.h
@@ -0,0 +1,984 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Private header, not to be exported
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/array/builder_binary.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_builders.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/ubsan.h"
+
+#define XXH_INLINE_ALL
+
+#include "arrow/vendored/xxhash.h"  // IWYU pragma: keep
+
+namespace arrow {
+namespace internal {
+
+// XXX would it help to have a 32-bit hash value on large datasets?
+typedef uint64_t hash_t;
+
+// Notes about the choice of a hash function.
+// - XXH3 is extremely fast on most data sizes, from small to huge;
+//   faster even than HW CRC-based hashing schemes
+// - our custom hash function for tiny values (< 16 bytes) is still
+//   significantly faster (~30%), at least on this machine and compiler
+
+template <uint64_t AlgNum>
+inline hash_t ComputeStringHash(const void* data, int64_t length);
+
+/// \brief A hash function for bitmaps that can handle offsets and lengths in
+/// terms of number of bits. The hash only depends on the bits actually hashed.
+///
+/// It's the caller's responsibility to ensure that bits_offset + num_bits are
+/// readable from the bitmap.
+///
+/// \pre bits_offset >= 0
+/// \pre num_bits >= 0
+/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap
+///
+/// \param bitmap The pointer to the bitmap.
+/// \param seed The seed for the hash function (useful when chaining hash functions).
+/// \param bits_offset The offset in bits relative to the start of the bitmap.
+/// \param num_bits The number of bits after the offset to be hashed.
+ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
+                                      int64_t bits_offset, int64_t num_bits);
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelperBase {
+  static bool CompareScalars(Scalar u, Scalar v) { return u == v; }
+
+  static hash_t ComputeHash(const Scalar& value) {
+    // Generic hash computation for scalars.  Simply apply the string hash
+    // to the bit representation of the value.
+
+    // XXX in the case of FP values, we'd like equal values to have the same hash,
+    // even if they have different bit representations...
+    return ComputeStringHash<AlgNum>(&value, sizeof(value));
+  }
+};
+
+template <typename Scalar, uint64_t AlgNum = 0, typename Enable = void>
+struct ScalarHelper : public ScalarHelperBase<Scalar, AlgNum> {};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_integral<Scalar>::value>>
+    : public ScalarHelperBase<Scalar, AlgNum> {
+  // ScalarHelper specialization for integers
+
+  static hash_t ComputeHash(const Scalar& value) {
+    // Faster hash computation for integers.
+
+    // Two of xxhash's prime multipliers (which are chosen for their
+    // bit dispersion properties)
+    static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
+                                               14029467366897019727ULL};
+
+    // Multiplying by the prime number mixes the low bits into the high bits,
+    // then byte-swapping (which is a single CPU instruction) allows the
+    // combined high and low bits to participate in the initial hash table index.
+    auto h = static_cast<hash_t>(value);
+    return bit_util::ByteSwap(multipliers[AlgNum] * h);
+  }
+};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum,
+                    enable_if_t<std::is_same<std::string_view, Scalar>::value>>
+    : public ScalarHelperBase<Scalar, AlgNum> {
+  // ScalarHelper specialization for std::string_view
+
+  static hash_t ComputeHash(std::string_view value) {
+    return ComputeStringHash<AlgNum>(value.data(), static_cast<int64_t>(value.size()));
+  }
+};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum, enable_if_t<std::is_floating_point<Scalar>::value>>
+    : public ScalarHelperBase<Scalar, AlgNum> {
+  // ScalarHelper specialization for reals
+
+  static bool CompareScalars(Scalar u, Scalar v) {
+    if (std::isnan(u)) {
+      // XXX should we do a bit-precise comparison?
+      return std::isnan(v);
+    }
+    return u == v;
+  }
+};
+
+template <typename Scalar, uint64_t AlgNum>
+struct ScalarHelper<Scalar, AlgNum,
+                    enable_if_t<std::is_same_v<Scalar, ::arrow::util::Float16>>>
+    : public ScalarHelperBase<Scalar, AlgNum> {
+  // ScalarHelper specialization for Float16
+
+  static bool CompareScalars(Scalar u, Scalar v) {
+    if (u.is_nan()) {
+      // XXX should we do a bit-precise comparison?
+      return v.is_nan();
+    }
+    return u == v;
+  }
+};
+
+template <uint64_t AlgNum = 0>
+hash_t ComputeStringHash(const void* data, int64_t length) {
+  if (ARROW_PREDICT_TRUE(length <= 16)) {
+    // Specialize for small hash strings, as they are quite common as
+    // hash table keys.  Even XXH3 isn't quite as fast.
+    auto p = reinterpret_cast<const uint8_t*>(data);
+    auto n = static_cast<uint32_t>(length);
+    if (n <= 8) {
+      if (n <= 3) {
+        if (n == 0) {
+          return 1U;
+        }
+        uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
+        return ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
+      }
+      // 4 <= length <= 8
+      // We can read the string as two overlapping 32-bit ints, apply
+      // different hash functions to each of them in parallel, then XOR
+      // the results
+      uint32_t x, y;
+      hash_t hx, hy;
+      x = util::SafeLoadAs<uint32_t>(p + n - 4);
+      y = util::SafeLoadAs<uint32_t>(p);
+      hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
+      hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
+      return n ^ hx ^ hy;
+    }
+    // 8 <= length <= 16
+    // Apply the same principle as above
+    uint64_t x, y;
+    hash_t hx, hy;
+    x = util::SafeLoadAs<uint64_t>(p + n - 8);
+    y = util::SafeLoadAs<uint64_t>(p);
+    hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
+    hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
+    return n ^ hx ^ hy;
+  }
+
+#if XXH3_SECRET_SIZE_MIN != 136
+#  error XXH3_SECRET_SIZE_MIN changed, please fix kXxh3Secrets
+#endif
+
+  // XXH3_64bits_withSeed generates a secret based on the seed, which is too slow.
+  // Instead, we use hard-coded random secrets.  To maximize cache efficiency,
+  // they reuse the same memory area.
+  static constexpr unsigned char kXxh3Secrets[XXH3_SECRET_SIZE_MIN + 1] = {
+      0xe7, 0x8b, 0x13, 0xf9, 0xfc, 0xb5, 0x8e, 0xef, 0x81, 0x48, 0x2c, 0xbf, 0xf9, 0x9f,
+      0xc1, 0x1e, 0x43, 0x6d, 0xbf, 0xa6, 0x6d, 0xb5, 0x72, 0xbc, 0x97, 0xd8, 0x61, 0x24,
+      0x0f, 0x12, 0xe3, 0x05, 0x21, 0xf7, 0x5c, 0x66, 0x67, 0xa5, 0x65, 0x03, 0x96, 0x26,
+      0x69, 0xd8, 0x29, 0x20, 0xf8, 0xc7, 0xb0, 0x3d, 0xdd, 0x7d, 0x18, 0xa0, 0x60, 0x75,
+      0x92, 0xa4, 0xce, 0xba, 0xc0, 0x77, 0xf4, 0xac, 0xb7, 0x03, 0x53, 0xf0, 0x98, 0xce,
+      0xe6, 0x2b, 0x20, 0xc7, 0x82, 0x91, 0xab, 0xbf, 0x68, 0x5c, 0x62, 0x4d, 0x33, 0xa3,
+      0xe1, 0xb3, 0xff, 0x97, 0x54, 0x4c, 0x44, 0x34, 0xb5, 0xb9, 0x32, 0x4c, 0x75, 0x42,
+      0x89, 0x53, 0x94, 0xd4, 0x9f, 0x2b, 0x76, 0x4d, 0x4e, 0xe6, 0xfa, 0x15, 0x3e, 0xc1,
+      0xdb, 0x71, 0x4b, 0x2c, 0x94, 0xf5, 0xfc, 0x8c, 0x89, 0x4b, 0xfb, 0xc1, 0x82, 0xa5,
+      0x6a, 0x53, 0xf9, 0x4a, 0xba, 0xce, 0x1f, 0xc0, 0x97, 0x1a, 0x87};
+
+  static_assert(AlgNum < 2, "AlgNum too large");
+  static constexpr auto secret = kXxh3Secrets + AlgNum;
+  return XXH3_64bits_withSecret(data, static_cast<size_t>(length), secret,
+                                XXH3_SECRET_SIZE_MIN);
+}
+
+// XXX add a HashEq<ArrowType> struct with both hash and compare functions?
+
+// ----------------------------------------------------------------------
+// An open-addressing insert-only hash table (no deletes)
+
+template <typename Payload>
+class HashTable {
+ public:
+  static constexpr hash_t kSentinel = 0ULL;
+  static constexpr int64_t kLoadFactor = 2UL;
+
+  struct Entry {
+    hash_t h;
+    Payload payload;
+
+    // An entry is valid if the hash is different from the sentinel value
+    operator bool() const { return h != kSentinel; }
+  };
+
+  HashTable(MemoryPool* pool, uint64_t capacity) : entries_builder_(pool) {
+    ARROW_DCHECK_NE(pool, nullptr);
+    // Minimum of 32 elements
+    capacity = std::max<uint64_t>(capacity, 32UL);
+    capacity_ = bit_util::NextPower2(capacity);
+    capacity_mask_ = capacity_ - 1;
+    size_ = 0;
+
+    ARROW_DCHECK_OK(UpsizeBuffer(capacity_));
+  }
+
+  // Lookup with non-linear probing
+  // cmp_func should have signature bool(const Payload*).
+  // Return a (Entry*, found) pair.
+  template <typename CmpFunc>
+  std::pair<Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) {
+    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
+                                        std::forward<CmpFunc>(cmp_func));
+    return {&entries_[p.first], p.second};
+  }
+
+  template <typename CmpFunc>
+  std::pair<const Entry*, bool> Lookup(hash_t h, CmpFunc&& cmp_func) const {
+    auto p = Lookup<DoCompare, CmpFunc>(h, entries_, capacity_mask_,
+                                        std::forward<CmpFunc>(cmp_func));
+    return {&entries_[p.first], p.second};
+  }
+
+  Status Insert(Entry* entry, hash_t h, const Payload& payload) {
+    // Ensure entry is empty before inserting
+    assert(!*entry);
+    entry->h = FixHash(h);
+    entry->payload = payload;
+    ++size_;
+
+    if (ARROW_PREDICT_FALSE(NeedUpsizing())) {
+      // Resize less frequently since it is expensive
+      return Upsize(capacity_ * kLoadFactor * 2);
+    }
+    return Status::OK();
+  }
+
+  uint64_t size() const { return size_; }
+
+  // Visit all non-empty entries in the table
+  // The visit_func should have signature void(const Entry*)
+  template <typename VisitFunc>
+  void VisitEntries(VisitFunc&& visit_func) const {
+    for (uint64_t i = 0; i < capacity_; i++) {
+      const auto& entry = entries_[i];
+      if (entry) {
+        visit_func(&entry);
+      }
+    }
+  }
+
+ protected:
+  // NoCompare is for when the value is known not to exist in the table
+  enum CompareKind { DoCompare, NoCompare };
+
+  // The workhorse lookup function
+  template <CompareKind CKind, typename CmpFunc>
+  std::pair<uint64_t, bool> Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
+                                   CmpFunc&& cmp_func) const {
+    static constexpr uint8_t perturb_shift = 5;
+
+    uint64_t index, perturb;
+    const Entry* entry;
+
+    h = FixHash(h);
+    index = h & size_mask;
+    perturb = (h >> perturb_shift) + 1U;
+
+    while (true) {
+      entry = &entries[index];
+      if (CompareEntry<CKind, CmpFunc>(h, entry, std::forward<CmpFunc>(cmp_func))) {
+        // Found
+        return {index, true};
+      }
+      if (entry->h == kSentinel) {
+        // Empty slot
+        return {index, false};
+      }
+
+      // Perturbation logic inspired from CPython's set / dict object.
+      // The goal is that all 64 bits of the unmasked hash value eventually
+      // participate in the probing sequence, to minimize clustering.
+      index = (index + perturb) & size_mask;
+      perturb = (perturb >> perturb_shift) + 1U;
+    }
+  }
+
+  template <CompareKind CKind, typename CmpFunc>
+  bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
+    if (CKind == NoCompare) {
+      return false;
+    } else {
+      return entry->h == h && cmp_func(&entry->payload);
+    }
+  }
+
+  bool NeedUpsizing() const {
+    // Keep the load factor <= 1/2
+    return size_ * kLoadFactor >= capacity_;
+  }
+
+  Status UpsizeBuffer(uint64_t capacity) {
+    RETURN_NOT_OK(entries_builder_.Resize(capacity));
+    entries_ = entries_builder_.mutable_data();
+    memset(static_cast<void*>(entries_), 0, capacity * sizeof(Entry));
+
+    return Status::OK();
+  }
+
+  Status Upsize(uint64_t new_capacity) {
+    assert(new_capacity > capacity_);
+    uint64_t new_mask = new_capacity - 1;
+    assert((new_capacity & new_mask) == 0);  // it's a power of two
+
+    // Stash old entries and seal builder, effectively resetting the Buffer
+    const Entry* old_entries = entries_;
+    ARROW_ASSIGN_OR_RAISE(auto previous, entries_builder_.FinishWithLength(capacity_));
+    // Allocate new buffer
+    RETURN_NOT_OK(UpsizeBuffer(new_capacity));
+
+    for (uint64_t i = 0; i < capacity_; i++) {
+      const auto& entry = old_entries[i];
+      if (entry) {
+        // Dummy compare function will not be called
+        auto p = Lookup<NoCompare>(entry.h, entries_, new_mask,
+                                   [](const Payload*) { return false; });
+        // Lookup<NoCompare> (and CompareEntry<NoCompare>) ensure that an
+        // empty slots is always returned
+        assert(!p.second);
+        entries_[p.first] = entry;
+      }
+    }
+    capacity_ = new_capacity;
+    capacity_mask_ = new_mask;
+
+    return Status::OK();
+  }
+
+  hash_t FixHash(hash_t h) const { return (h == kSentinel) ? 42U : h; }
+
+  // The number of slots available in the hash table array.
+  uint64_t capacity_;
+  uint64_t capacity_mask_;
+  // The number of used slots in the hash table array.
+  uint64_t size_;
+
+  Entry* entries_;
+  TypedBufferBuilder<Entry> entries_builder_;
+};
+
+// XXX typedef memo_index_t int32_t ?
+
+constexpr int32_t kKeyNotFound = -1;
+
+// ----------------------------------------------------------------------
+// A base class for memoization table.
+
+class MemoTable {
+ public:
+  virtual ~MemoTable() = default;
+
+  virtual int32_t size() const = 0;
+};
+
+// ----------------------------------------------------------------------
+// A memoization table for memory-cheap scalar values.
+
+// The memoization table remembers and allows to look up the insertion
+// index for each key.
+
+template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
+class ScalarMemoTable : public MemoTable {
+ public:
+  explicit ScalarMemoTable(MemoryPool* pool, int64_t entries = 0)
+      : hash_table_(pool, static_cast<uint64_t>(entries)) {}
+
+  template <typename Value>
+  int32_t Get(Value&& v) const {
+    const Scalar value(std::forward<Value>(v));
+    auto cmp_func = [value](const Payload* payload) -> bool {
+      return ScalarHelper<Scalar, 0>::CompareScalars(payload->value, value);
+    };
+    hash_t h = ComputeHash(value);
+    auto p = hash_table_.Lookup(h, cmp_func);
+    if (p.second) {
+      return p.first->payload.memo_index;
+    } else {
+      return kKeyNotFound;
+    }
+  }
+
+  template <typename Value, typename Func1, typename Func2>
+  Status GetOrInsert(Value&& v, Func1&& on_found, Func2&& on_not_found,
+                     int32_t* out_memo_index) {
+    const Scalar value(std::forward<Value>(v));
+    auto cmp_func = [value](const Payload* payload) -> bool {
+      return ScalarHelper<Scalar, 0>::CompareScalars(value, payload->value);
+    };
+    hash_t h = ComputeHash(value);
+    auto p = hash_table_.Lookup(h, cmp_func);
+    int32_t memo_index;
+    if (p.second) {
+      memo_index = p.first->payload.memo_index;
+      on_found(memo_index);
+    } else {
+      memo_index = size();
+      RETURN_NOT_OK(hash_table_.Insert(p.first, h, {value, memo_index}));
+      on_not_found(memo_index);
+    }
+    *out_memo_index = memo_index;
+    return Status::OK();
+  }
+
+  template <typename Value>
+  Status GetOrInsert(Value&& value, int32_t* out_memo_index) {
+    return GetOrInsert(
+        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
+  }
+
+  int32_t GetNull() const { return null_index_; }
+
+  template <typename Func1, typename Func2>
+  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
+    int32_t memo_index = GetNull();
+    if (memo_index != kKeyNotFound) {
+      on_found(memo_index);
+    } else {
+      null_index_ = memo_index = size();
+      on_not_found(memo_index);
+    }
+    return memo_index;
+  }
+
+  int32_t GetOrInsertNull() {
+    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
+  }
+
+  // The number of entries in the memo table +1 if null was added.
+  // (which is also 1 + the largest memo index)
+  int32_t size() const override {
+    return static_cast<int32_t>(hash_table_.size()) + (GetNull() != kKeyNotFound);
+  }
+
+  // Copy values starting from index `start` into `out_data`
+  template <typename Value>
+  void CopyValues(int32_t start, Value* out_data) const {
+    // So that both uint16_t and Float16 are allowed
+    static_assert(sizeof(Value) == sizeof(Scalar));
+    Scalar* out = reinterpret_cast<Scalar*>(out_data);
+    hash_table_.VisitEntries([=](const HashTableEntry* entry) {
+      int32_t index = entry->payload.memo_index - start;
+      if (index >= 0) {
+        out[index] = entry->payload.value;
+      }
+    });
+    // Zero-initialize the null entry
+    if (null_index_ != kKeyNotFound) {
+      int32_t index = null_index_ - start;
+      if (index >= 0) {
+        out[index] = Scalar{};
+      }
+    }
+  }
+
+  template <typename Value>
+  void CopyValues(Value* out_data) const {
+    CopyValues(0, out_data);
+  }
+
+ protected:
+  struct Payload {
+    Scalar value;
+    int32_t memo_index;
+  };
+
+  using HashTableType = HashTableTemplateType<Payload>;
+  using HashTableEntry = typename HashTableType::Entry;
+  HashTableType hash_table_;
+  int32_t null_index_ = kKeyNotFound;
+
+  hash_t ComputeHash(const Scalar& value) const {
+    return ScalarHelper<Scalar, 0>::ComputeHash(value);
+  }
+
+ public:
+  // defined here so that `HashTableType` is visible
+  // Merge entries from `other_table` into `this->hash_table_`.
+  Status MergeTable(const ScalarMemoTable& other_table) {
+    const HashTableType& other_hashtable = other_table.hash_table_;
+
+    other_hashtable.VisitEntries([this](const HashTableEntry* other_entry) {
+      int32_t unused;
+      ARROW_DCHECK_OK(this->GetOrInsert(other_entry->payload.value, &unused));
+    });
+    // TODO: ARROW-17074 - implement proper error handling
+    return Status::OK();
+  }
+};
+
+// ----------------------------------------------------------------------
+// A memoization table for small scalar values, using direct indexing
+
+template <typename Scalar, typename Enable = void>
+struct SmallScalarTraits {};
+
+template <>
+struct SmallScalarTraits<bool> {
+  static constexpr int32_t cardinality = 2;
+
+  static uint32_t AsIndex(bool value) { return value ? 1 : 0; }
+};
+
+template <typename Scalar>
+struct SmallScalarTraits<Scalar, enable_if_t<std::is_integral<Scalar>::value>> {
+  using Unsigned = typename std::make_unsigned<Scalar>::type;
+
+  static constexpr int32_t cardinality = 1U + std::numeric_limits<Unsigned>::max();
+
+  static uint32_t AsIndex(Scalar value) { return static_cast<Unsigned>(value); }
+};
+
+template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
+class SmallScalarMemoTable : public MemoTable {
+ public:
+  explicit SmallScalarMemoTable(MemoryPool* pool, int64_t entries = 0) {
+    std::fill(value_to_index_, value_to_index_ + cardinality + 1, kKeyNotFound);
+    index_to_value_.reserve(cardinality);
+  }
+
+  int32_t Get(const Scalar value) const {
+    auto value_index = AsIndex(value);
+    return value_to_index_[value_index];
+  }
+
+  template <typename Func1, typename Func2>
+  Status GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found,
+                     int32_t* out_memo_index) {
+    auto value_index = AsIndex(value);
+    auto memo_index = value_to_index_[value_index];
+    if (memo_index == kKeyNotFound) {
+      memo_index = static_cast<int32_t>(index_to_value_.size());
+      index_to_value_.push_back(value);
+      value_to_index_[value_index] = memo_index;
+      ARROW_DCHECK_LT(memo_index, cardinality + 1);
+      on_not_found(memo_index);
+    } else {
+      on_found(memo_index);
+    }
+    *out_memo_index = memo_index;
+    return Status::OK();
+  }
+
+  Status GetOrInsert(const Scalar value, int32_t* out_memo_index) {
+    return GetOrInsert(
+        value, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
+  }
+
+  int32_t GetNull() const { return value_to_index_[cardinality]; }
+
+  template <typename Func1, typename Func2>
+  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
+    auto memo_index = GetNull();
+    if (memo_index == kKeyNotFound) {
+      memo_index = value_to_index_[cardinality] = size();
+      index_to_value_.push_back(0);
+      on_not_found(memo_index);
+    } else {
+      on_found(memo_index);
+    }
+    return memo_index;
+  }
+
+  int32_t GetOrInsertNull() {
+    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
+  }
+
+  // The number of entries in the memo table
+  // (which is also 1 + the largest memo index)
+  int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }
+
+  // Merge entries from `other_table` into `this`.
+  Status MergeTable(const SmallScalarMemoTable& other_table) {
+    for (const Scalar& other_val : other_table.index_to_value_) {
+      int32_t unused;
+      RETURN_NOT_OK(this->GetOrInsert(other_val, &unused));
+    }
+    return Status::OK();
+  }
+
+  // Copy values starting from index `start` into `out_data`
+  void CopyValues(int32_t start, Scalar* out_data) const {
+    ARROW_DCHECK_GE(start, 0);
+    ARROW_DCHECK_LE(static_cast<size_t>(start), index_to_value_.size());
+    int64_t offset = start * static_cast<int32_t>(sizeof(Scalar));
+    memcpy(out_data, index_to_value_.data() + offset, (size() - start) * sizeof(Scalar));
+  }
+
+  void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
+
+  const std::vector<Scalar>& values() const { return index_to_value_; }
+
+ protected:
+  static constexpr auto cardinality = SmallScalarTraits<Scalar>::cardinality;
+  static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");
+
+  uint32_t AsIndex(Scalar value) const {
+    return SmallScalarTraits<Scalar>::AsIndex(value);
+  }
+
+  // The last index is reserved for the null element.
+  int32_t value_to_index_[cardinality + 1];
+  std::vector<Scalar> index_to_value_;
+};
+
+// ----------------------------------------------------------------------
+// A memoization table for variable-sized binary data.
+
+template <typename BinaryBuilderT>
+class BinaryMemoTable : public MemoTable {
+ public:
+  using builder_offset_type = typename BinaryBuilderT::offset_type;
+  explicit BinaryMemoTable(MemoryPool* pool, int64_t entries = 0,
+                           int64_t values_size = -1)
+      : hash_table_(pool, static_cast<uint64_t>(entries)), binary_builder_(pool) {
+    const int64_t data_size = (values_size < 0) ? entries * 4 : values_size;
+    ARROW_DCHECK_OK(binary_builder_.Resize(entries));
+    ARROW_DCHECK_OK(binary_builder_.ReserveData(data_size));
+  }
+
+  int32_t Get(const void* data, builder_offset_type length) const {
+    hash_t h = ComputeStringHash<0>(data, length);
+    auto p = Lookup(h, data, length);
+    if (p.second) {
+      return p.first->payload.memo_index;
+    } else {
+      return kKeyNotFound;
+    }
+  }
+
+  int32_t Get(std::string_view value) const {
+    return Get(value.data(), static_cast<builder_offset_type>(value.length()));
+  }
+
+  template <typename Func1, typename Func2>
+  Status GetOrInsert(const void* data, builder_offset_type length, Func1&& on_found,
+                     Func2&& on_not_found, int32_t* out_memo_index) {
+    hash_t h = ComputeStringHash<0>(data, length);
+    auto p = Lookup(h, data, length);
+    int32_t memo_index;
+    if (p.second) {
+      memo_index = p.first->payload.memo_index;
+      on_found(memo_index);
+    } else {
+      memo_index = size();
+      // Insert string value
+      RETURN_NOT_OK(binary_builder_.Append(static_cast<const char*>(data), length));
+      // Insert hash entry
+      RETURN_NOT_OK(
+          hash_table_.Insert(const_cast<HashTableEntry*>(p.first), h, {memo_index}));
+
+      on_not_found(memo_index);
+    }
+    *out_memo_index = memo_index;
+    return Status::OK();
+  }
+
+  template <typename Func1, typename Func2>
+  Status GetOrInsert(std::string_view value, Func1&& on_found, Func2&& on_not_found,
+                     int32_t* out_memo_index) {
+    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
+                       std::forward<Func1>(on_found), std::forward<Func2>(on_not_found),
+                       out_memo_index);
+  }
+
+  Status GetOrInsert(const void* data, builder_offset_type length,
+                     int32_t* out_memo_index) {
+    return GetOrInsert(
+        data, length, [](int32_t i) {}, [](int32_t i) {}, out_memo_index);
+  }
+
+  Status GetOrInsert(std::string_view value, int32_t* out_memo_index) {
+    return GetOrInsert(value.data(), static_cast<builder_offset_type>(value.length()),
+                       out_memo_index);
+  }
+
+  int32_t GetNull() const { return null_index_; }
+
+  template <typename Func1, typename Func2>
+  int32_t GetOrInsertNull(Func1&& on_found, Func2&& on_not_found) {
+    int32_t memo_index = GetNull();
+    if (memo_index == kKeyNotFound) {
+      memo_index = null_index_ = size();
+      ARROW_DCHECK_OK(binary_builder_.AppendNull());
+      on_not_found(memo_index);
+    } else {
+      on_found(memo_index);
+    }
+    return memo_index;
+  }
+
+  int32_t GetOrInsertNull() {
+    return GetOrInsertNull([](int32_t i) {}, [](int32_t i) {});
+  }
+
+  // The number of entries in the memo table
+  // (which is also 1 + the largest memo index)
+  int32_t size() const override {
+    return static_cast<int32_t>(hash_table_.size() + (GetNull() != kKeyNotFound));
+  }
+
+  int64_t values_size() const { return binary_builder_.value_data_length(); }
+
+  // Copy (n + 1) offsets starting from index `start` into `out_data`
+  template <class Offset>
+  void CopyOffsets(int32_t start, Offset* out_data) const {
+    ARROW_DCHECK_LE(start, size());
+
+    const builder_offset_type* offsets = binary_builder_.offsets_data();
+    const builder_offset_type delta =
+        start < binary_builder_.length() ? offsets[start] : 0;
+    for (int32_t i = start; i < size(); ++i) {
+      const builder_offset_type adjusted_offset = offsets[i] - delta;
+      Offset cast_offset = static_cast<Offset>(adjusted_offset);
+      assert(static_cast<builder_offset_type>(cast_offset) ==
+             adjusted_offset);  // avoid truncation
+      *out_data++ = cast_offset;
+    }
+
+    // Copy last value since BinaryBuilder only materializes it on in Finish()
+    *out_data = static_cast<Offset>(binary_builder_.value_data_length() - delta);
+  }
+
+  template <class Offset>
+  void CopyOffsets(Offset* out_data) const {
+    CopyOffsets(0, out_data);
+  }
+
+  // Copy values starting from index `start` into `out_data`
+  void CopyValues(int32_t start, uint8_t* out_data) const {
+    CopyValues(start, -1, out_data);
+  }
+
+  // Same as above, but check output size in debug mode
+  void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const {
+    ARROW_DCHECK_LE(start, size());
+
+    // The absolute byte offset of `start` value in the binary buffer.
+    const builder_offset_type offset = binary_builder_.offset(start);
+    const auto length = binary_builder_.value_data_length() - static_cast<size_t>(offset);
+
+    if (out_size != -1) {
+      assert(static_cast<int64_t>(length) <= out_size);
+    }
+
+    auto view = binary_builder_.GetView(start);
+    memcpy(out_data, view.data(), length);
+  }
+
+  void CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); }
+
+  void CopyValues(int64_t out_size, uint8_t* out_data) const {
+    CopyValues(0, out_size, out_data);
+  }
+
+  void CopyFixedWidthValues(int32_t start, int32_t width_size, int64_t out_size,
+                            uint8_t* out_data) const {
+    // This method exists to cope with the fact that the BinaryMemoTable does
+    // not know the fixed width when inserting the null value. The data
+    // buffer hold a zero length string for the null value (if found).
+    //
+    // Thus, the method will properly inject an empty value of the proper width
+    // in the output buffer.
+    //
+    if (start >= size()) {
+      return;
+    }
+
+    int32_t null_index = GetNull();
+    if (null_index < start) {
+      // Nothing to skip, proceed as usual.
+      CopyValues(start, out_size, out_data);
+      return;
+    }
+
+    builder_offset_type left_offset = binary_builder_.offset(start);
+
+    // Ensure that the data length is exactly missing width_size bytes to fit
+    // in the expected output (n_values * width_size).
+#ifndef NDEBUG
+    int64_t data_length = values_size() - static_cast<size_t>(left_offset);
+    assert(data_length + width_size == out_size);
+    ARROW_UNUSED(data_length);
+#endif
+
+    auto in_data = binary_builder_.value_data() + left_offset;
+    // The null use 0-length in the data, slice the data in 2 and skip by
+    // width_size in out_data. [part_1][width_size][part_2]
+    auto null_data_offset = binary_builder_.offset(null_index);
+    auto left_size = null_data_offset - left_offset;
+    if (left_size > 0) {
+      memcpy(out_data, in_data + left_offset, left_size);
+    }
+    // Zero-initialize the null entry
+    memset(out_data + left_size, 0, width_size);
+
+    auto right_size = values_size() - static_cast<size_t>(null_data_offset);
+    if (right_size > 0) {
+      // skip the null fixed size value.
+      auto out_offset = left_size + width_size;
+      assert(out_data + out_offset + right_size == out_data + out_size);
+      memcpy(out_data + out_offset, in_data + null_data_offset, right_size);
+    }
+  }
+
+  // Visit the stored values in insertion order.
+  // The visitor function should have the signature `void(std::string_view)`
+  // or `void(const std::string_view&)`.
+  template <typename VisitFunc>
+  void VisitValues(int32_t start, VisitFunc&& visit) const {
+    for (int32_t i = start; i < size(); ++i) {
+      visit(binary_builder_.GetView(i));
+    }
+  }
+
+  // Visit the stored value at a specific index in insertion order.
+  // The visitor function should have the signature `void(std::string_view)`
+  // or `void(const std::string_view&)`.
+  template <typename VisitFunc>
+  void VisitValue(int32_t idx, VisitFunc&& visit) const {
+    visit(binary_builder_.GetView(idx));
+  }
+
+ protected:
+  struct Payload {
+    int32_t memo_index;
+  };
+
+  using HashTableType = HashTable<Payload>;
+  using HashTableEntry = typename HashTable<Payload>::Entry;
+  HashTableType hash_table_;
+  BinaryBuilderT binary_builder_;
+
+  int32_t null_index_ = kKeyNotFound;
+
+  std::pair<const HashTableEntry*, bool> Lookup(hash_t h, const void* data,
+                                                builder_offset_type length) const {
+    auto cmp_func = [&](const Payload* payload) {
+      std::string_view lhs = binary_builder_.GetView(payload->memo_index);
+      std::string_view rhs(static_cast<const char*>(data), length);
+      return lhs == rhs;
+    };
+    return hash_table_.Lookup(h, cmp_func);
+  }
+
+ public:
+  Status MergeTable(const BinaryMemoTable& other_table) {
+    other_table.VisitValues(0, [this](std::string_view other_value) {
+      int32_t unused;
+      ARROW_DCHECK_OK(this->GetOrInsert(other_value, &unused));
+    });
+    return Status::OK();
+  }
+};
+
+template <typename T, typename Enable = void>
+struct HashTraits {};
+
+template <>
+struct HashTraits<BooleanType> {
+  using MemoTableType = SmallScalarMemoTable<bool>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_8bit_int<T>> {
+  using c_type = typename T::c_type;
+  using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_t<has_c_type<T>::value && !is_8bit_int<T>::value>> {
+  using c_type = typename T::c_type;
+  using MemoTableType = ScalarMemoTable<c_type, HashTable>;
+};
+
+template <>
+struct HashTraits<HalfFloatType> {
+  using MemoTableType = ScalarMemoTable<::arrow::util::Float16>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_t<has_string_view<T>::value &&
+                                 !std::is_base_of<LargeBinaryType, T>::value>> {
+  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_decimal<T>> {
+  using MemoTableType = BinaryMemoTable<BinaryBuilder>;
+};
+
+template <typename T>
+struct HashTraits<T, enable_if_t<std::is_base_of<LargeBinaryType, T>::value>> {
+  using MemoTableType = BinaryMemoTable<LargeBinaryBuilder>;
+};
+
+template <typename MemoTableType>
+static inline Status ComputeNullBitmap(MemoryPool* pool, const MemoTableType& memo_table,
+                                       int64_t start_offset, int64_t* null_count,
+                                       std::shared_ptr<Buffer>* null_bitmap) {
+  int64_t dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
+  int64_t null_index = memo_table.GetNull();
+
+  *null_count = 0;
+  *null_bitmap = nullptr;
+
+  if (null_index != kKeyNotFound && null_index >= start_offset) {
+    null_index -= start_offset;
+    *null_count = 1;
+    ARROW_ASSIGN_OR_RAISE(*null_bitmap,
+                          internal::BitmapAllButOne(pool, dict_length, null_index));
+  }
+
+  return Status::OK();
+}
+
+struct StringViewHash {
+  // std::hash compatible hasher for use with std::unordered_*
+  // (the std::hash specialization provided by nonstd constructs std::string
+  // temporaries then invokes std::hash<std::string> against those)
+  hash_t operator()(std::string_view value) const {
+    return ComputeStringHash<0>(value.data(), static_cast<int64_t>(value.size()));
+  }
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/int_util.h b/pyarrow/include/arrow/util/int_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..59a2ac7109a3c08b4cd265f88b7ca0ecffe5ae9d
--- /dev/null
+++ b/pyarrow/include/arrow/util/int_util.h
@@ -0,0 +1,137 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+#include "arrow/status.h"
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class DataType;
+struct ArraySpan;
+struct Scalar;
+
+namespace internal {
+
+ARROW_EXPORT
+uint8_t DetectUIntWidth(const uint64_t* values, int64_t length, uint8_t min_width = 1);
+
+ARROW_EXPORT
+uint8_t DetectUIntWidth(const uint64_t* values, const uint8_t* valid_bytes,
+                        int64_t length, uint8_t min_width = 1);
+
+ARROW_EXPORT
+uint8_t DetectIntWidth(const int64_t* values, int64_t length, uint8_t min_width = 1);
+
+ARROW_EXPORT
+uint8_t DetectIntWidth(const int64_t* values, const uint8_t* valid_bytes, int64_t length,
+                       uint8_t min_width = 1);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int8_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int16_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int32_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastInts(const int64_t* source, int64_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint8_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint16_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);
+
+ARROW_EXPORT
+void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);
+
+ARROW_EXPORT
+void UpcastInts(const int32_t* source, int64_t* dest, int64_t length);
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) >= sizeof(OutputInt))>::type CastInts(
+    const InputInt* source, OutputInt* dest, int64_t length) {
+  DowncastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+inline typename std::enable_if<(sizeof(InputInt) < sizeof(OutputInt))>::type CastInts(
+    const InputInt* source, OutputInt* dest, int64_t length) {
+  UpcastInts(source, dest, length);
+}
+
+template <typename InputInt, typename OutputInt>
+ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
+                                const int32_t* transpose_map);
+
+ARROW_EXPORT
+Status TransposeInts(const DataType& src_type, const DataType& dest_type,
+                     const uint8_t* src, uint8_t* dest, int64_t src_offset,
+                     int64_t dest_offset, int64_t length, const int32_t* transpose_map);
+
+/// \brief Do vectorized boundschecking of integer-type array indices. The
+/// indices must be nonnegative and strictly less than the passed upper
+/// limit (which is usually the length of an array that is being indexed-into).
+ARROW_EXPORT
+Status CheckIndexBounds(const ArraySpan& values, uint64_t upper_limit);
+
+/// \brief Boundscheck integer values to determine if they are all between the
+/// passed upper and lower limits (inclusive). Upper and lower bounds must be
+/// the same type as the data and are not currently casted.
+ARROW_EXPORT
+Status CheckIntegersInRange(const ArraySpan& values, const Scalar& bound_lower,
+                            const Scalar& bound_upper);
+
+/// \brief Use CheckIntegersInRange to determine whether the passed integers
+/// can fit safely in the passed integer type. This helps quickly determine if
+/// integer narrowing (e.g. int64->int32) is safe to do.
+ARROW_EXPORT
+Status IntegersCanFit(const ArraySpan& values, const DataType& target_type);
+
+/// \brief Convenience for boundschecking a single Scalar value
+ARROW_EXPORT
+Status IntegersCanFit(const Scalar& value, const DataType& target_type);
+
+/// Upcast an integer to the largest possible width (currently 64 bits)
+
+template <typename Integer>
+typename std::enable_if<
+    std::is_integral<Integer>::value && std::is_signed<Integer>::value, int64_t>::type
+UpcastInt(Integer v) {
+  return v;
+}
+
+template <typename Integer>
+typename std::enable_if<
+    std::is_integral<Integer>::value && std::is_unsigned<Integer>::value, uint64_t>::type
+UpcastInt(Integer v) {
+  return v;
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/int_util_overflow.h b/pyarrow/include/arrow/util/int_util_overflow.h
new file mode 100644
index 0000000000000000000000000000000000000000..69714a935a489c0ef860e9a8e2b94443ee9bc9b8
--- /dev/null
+++ b/pyarrow/include/arrow/util/int_util_overflow.h
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <initializer_list>
+#include <limits>
+#include <optional>
+#include <type_traits>
+
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+#include "arrow/vendored/safeint/safe_math.h"
+
+namespace arrow {
+namespace internal {
+
+// Define functions AddWithOverflow, SubtractWithOverflow, MultiplyWithOverflow
+// with the signature `bool(T u, T v, T* out)` where T is an integer type.
+// On overflow, these functions return true.  Otherwise, false is returned
+// and `out` is updated with the result of the operation.
+
+#define SAFE_INT_OP_WITH_OVERFLOW(_func_name, _op_name, _c_type, _type)             \
+  [[nodiscard]] static inline bool _func_name(_c_type u, _c_type v, _c_type* out) { \
+    return !check_##_op_name##_##_type##_##_type(u, v, out);                        \
+  }
+
+#define SAFE_INT_OPS_WITH_OVERFLOW(_func_name, _op_name)            \
+  SAFE_INT_OP_WITH_OVERFLOW(_func_name, _op_name, int32_t, int32)   \
+  SAFE_INT_OP_WITH_OVERFLOW(_func_name, _op_name, int64_t, int64)   \
+  SAFE_INT_OP_WITH_OVERFLOW(_func_name, _op_name, uint32_t, uint32) \
+  SAFE_INT_OP_WITH_OVERFLOW(_func_name, _op_name, uint64_t, uint64)
+
+SAFE_INT_OPS_WITH_OVERFLOW(SafeIntAddWithOverflow, add)
+SAFE_INT_OPS_WITH_OVERFLOW(SafeIntSubtractWithOverflow, sub)
+SAFE_INT_OPS_WITH_OVERFLOW(SafeIntMultiplyWithOverflow, mul)
+
+#undef SAFE_INT_OP_WITH_OVERFLOW
+#undef SAFE_INT_OPS_WITH_OVERFLOW
+
+template <typename Int, typename SignedRet, typename UnsignedRet>
+using transformed_int_t =
+    std::conditional_t<std::is_signed_v<Int>, SignedRet, UnsignedRet>;
+
+template <typename Int>
+using upscaled_int32_t = transformed_int_t<Int, int32_t, uint32_t>;
+
+// Use GCC/CLang builtins for checked arithmetic, promising better performance
+// than SafeInt's hand-written implementations.
+#if defined __has_builtin
+#  if __has_builtin(__builtin_object_size)
+#    define USE_CHECKED_ARITHMETIC_BUILTINS 1
+#  else
+#    define USE_CHECKED_ARITHMETIC_BUILTINS 0
+#  endif
+#endif
+
+template <typename Int>
+[[nodiscard]] bool AddWithOverflowGeneric(Int u, Int v, Int* out) {
+#if USE_CHECKED_ARITHMETIC_BUILTINS
+  return __builtin_add_overflow(u, v, out);
+#else
+  if constexpr (sizeof(Int) < 4) {
+    using UpscaledInt = upscaled_int32_t<Int>;
+    auto r = static_cast<UpscaledInt>(u) + static_cast<UpscaledInt>(v);
+    *out = static_cast<Int>(r);
+    return r != *out;
+  } else {
+    return SafeIntAddWithOverflow(u, v, out);
+  }
+#endif
+}
+
+template <typename Int>
+[[nodiscard]] bool SubtractWithOverflowGeneric(Int u, Int v, Int* out) {
+#if USE_CHECKED_ARITHMETIC_BUILTINS
+  return __builtin_sub_overflow(u, v, out);
+#else
+  if constexpr (sizeof(Int) < 4) {
+    using UpscaledInt = upscaled_int32_t<Int>;
+    auto r = static_cast<UpscaledInt>(u) - static_cast<UpscaledInt>(v);
+    *out = static_cast<Int>(r);
+    return r != *out;
+  } else {
+    return SafeIntSubtractWithOverflow(u, v, out);
+  }
+#endif
+}
+
+template <typename Int>
+[[nodiscard]] bool MultiplyWithOverflowGeneric(Int u, Int v, Int* out) {
+#if USE_CHECKED_ARITHMETIC_BUILTINS
+  return __builtin_mul_overflow(u, v, out);
+#else
+  if constexpr (sizeof(Int) < 4) {
+    using UpscaledInt = upscaled_int32_t<Int>;
+    auto r = static_cast<UpscaledInt>(u) * static_cast<UpscaledInt>(v);
+    *out = static_cast<Int>(r);
+    return r != *out;
+  } else {
+    return SafeIntMultiplyWithOverflow(u, v, out);
+  }
+#endif
+}
+
+template <typename Int>
+[[nodiscard]] bool DivideWithOverflowGeneric(Int u, Int v, Int* out) {
+  if (v == 0) {
+    *out = Int{};
+    return true;
+  }
+  if constexpr (std::is_signed_v<Int>) {
+    constexpr auto kMin = std::numeric_limits<Int>::min();
+    if (u == kMin && v == -1) {
+      *out = kMin;
+      return true;
+    }
+  }
+  *out = u / v;
+  return false;
+}
+
+// Define non-generic versions of the above so as to benefit from automatic
+// integer conversion, to allow for mixed-type calls such as
+// AddWithOverflow(int32_t, int64_t, int64_t*).
+
+#define NON_GENERIC_OP_WITH_OVERFLOW(_func_name, _c_type)                    \
+  [[nodiscard]] inline bool _func_name(_c_type u, _c_type v, _c_type* out) { \
+    return ARROW_PREDICT_FALSE(_func_name##Generic(u, v, out));              \
+  }
+
+#define NON_GENERIC_OPS_WITH_OVERFLOW(_func_name)    \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, int8_t)   \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, uint8_t)  \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, int16_t)  \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, uint16_t) \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, int32_t)  \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, uint32_t) \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, int64_t)  \
+  NON_GENERIC_OP_WITH_OVERFLOW(_func_name, uint64_t)
+
+NON_GENERIC_OPS_WITH_OVERFLOW(AddWithOverflow)
+NON_GENERIC_OPS_WITH_OVERFLOW(SubtractWithOverflow)
+NON_GENERIC_OPS_WITH_OVERFLOW(MultiplyWithOverflow)
+NON_GENERIC_OPS_WITH_OVERFLOW(DivideWithOverflow)
+
+#undef NON_GENERIC_OPS_WITH_OVERFLOW
+#undef NON_GENERIC_OP_WITH_OVERFLOW
+
+// Convenience functions over an arbitrary number of arguments
+template <typename Int>
+std::optional<Int> AddWithOverflow(std::initializer_list<Int> vs) {
+  if (vs.size() == 0) {
+    return {};
+  }
+  auto it = vs.begin();
+  Int v = *it++;
+  while (it != vs.end()) {
+    if (ARROW_PREDICT_FALSE(AddWithOverflowGeneric(v, *it++, &v))) {
+      return {};
+    }
+  }
+  return v;
+}
+
+template <typename Int>
+std::optional<Int> MultiplyWithOverflow(std::initializer_list<Int> vs) {
+  if (vs.size() == 0) {
+    return {};
+  }
+  auto it = vs.begin();
+  Int v = *it++;
+  while (it != vs.end()) {
+    if (ARROW_PREDICT_FALSE(MultiplyWithOverflowGeneric(v, *it++, &v))) {
+      return {};
+    }
+  }
+  return v;
+}
+
+// Define function NegateWithOverflow with the signature `bool(T u, T* out)`
+// where T is a signed integer type.  On overflow, these functions return true.
+// Otherwise, false is returned and `out` is updated with the result of the
+// operation.
+template <typename Int>
+[[nodiscard]] bool NegateWithOverflow(Int v, Int* out) {
+  return SubtractWithOverflow(Int{}, v, out);
+}
+
+/// Signed addition with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedAdd(SignedInt u, SignedInt v) {
+  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+  return static_cast<SignedInt>(static_cast<UnsignedInt>(u) +
+                                static_cast<UnsignedInt>(v));
+}
+
+/// Signed subtraction with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedSubtract(SignedInt u, SignedInt v) {
+  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+  return static_cast<SignedInt>(static_cast<UnsignedInt>(u) -
+                                static_cast<UnsignedInt>(v));
+}
+
+/// Signed negation with well-defined behaviour on overflow (as unsigned)
+template <typename SignedInt>
+SignedInt SafeSignedNegate(SignedInt u) {
+  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+  return static_cast<SignedInt>(~static_cast<UnsignedInt>(u) + 1);
+}
+
+/// Signed left shift with well-defined behaviour on negative numbers or overflow
+template <typename SignedInt, typename Shift>
+SignedInt SafeLeftShift(SignedInt u, Shift shift) {
+  using UnsignedInt = typename std::make_unsigned<SignedInt>::type;
+  return static_cast<SignedInt>(static_cast<UnsignedInt>(u) << shift);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/io_util.h b/pyarrow/include/arrow/util/io_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..56bd4eff3d66e8e491d71da7b838a15980f92902
--- /dev/null
+++ b/pyarrow/include/arrow/util/io_util.h
@@ -0,0 +1,452 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifndef _WIN32
+#  define ARROW_HAVE_SIGACTION 1
+#endif
+
+#include <atomic>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#if ARROW_HAVE_SIGACTION
+#  include <csignal>  // Needed for struct sigaction
+#endif
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/windows_fixup.h"
+
+namespace arrow::internal {
+
+// NOTE: 8-bit path strings on Windows are encoded using UTF-8.
+// Using MBCS would fail encoding some paths.
+
+#if defined(_WIN32)
+using NativePathString = std::wstring;
+#else
+using NativePathString = std::string;
+#endif
+
+class ARROW_EXPORT PlatformFilename {
+ public:
+  struct Impl;
+
+  ~PlatformFilename();
+  PlatformFilename();
+  PlatformFilename(const PlatformFilename&);
+  PlatformFilename(PlatformFilename&&);
+  PlatformFilename& operator=(const PlatformFilename&);
+  PlatformFilename& operator=(PlatformFilename&&);
+  explicit PlatformFilename(NativePathString path);
+  explicit PlatformFilename(const NativePathString::value_type* path);
+
+  const NativePathString& ToNative() const;
+  std::string ToString() const;
+
+  PlatformFilename Parent() const;
+  Result<PlatformFilename> Real() const;
+
+  // These functions can fail for character encoding reasons.
+  static Result<PlatformFilename> FromString(std::string_view file_name);
+  Result<PlatformFilename> Join(std::string_view child_name) const;
+
+  PlatformFilename Join(const PlatformFilename& child_name) const;
+
+  bool operator==(const PlatformFilename& other) const;
+  bool operator!=(const PlatformFilename& other) const;
+
+  // Made public to avoid the proliferation of friend declarations.
+  const Impl* impl() const { return impl_.get(); }
+
+ private:
+  std::unique_ptr<Impl> impl_;
+
+  explicit PlatformFilename(Impl impl);
+};
+
+/// Create a directory if it doesn't exist.
+///
+/// Return whether the directory was created.
+ARROW_EXPORT
+Result<bool> CreateDir(const PlatformFilename& dir_path);
+
+/// Create a directory and its parents if it doesn't exist.
+///
+/// Return whether the directory was created.
+ARROW_EXPORT
+Result<bool> CreateDirTree(const PlatformFilename& dir_path);
+
+/// Delete a directory's contents (but not the directory itself) if it exists.
+///
+/// Return whether the directory existed.
+ARROW_EXPORT
+Result<bool> DeleteDirContents(const PlatformFilename& dir_path,
+                               bool allow_not_found = true);
+
+/// Delete a directory tree if it exists.
+///
+/// Return whether the directory existed.
+ARROW_EXPORT
+Result<bool> DeleteDirTree(const PlatformFilename& dir_path, bool allow_not_found = true);
+
+// Non-recursively list the contents of the given directory.
+// The returned names are the children's base names, not including dir_path.
+ARROW_EXPORT
+Result<std::vector<PlatformFilename>> ListDir(const PlatformFilename& dir_path);
+
+/// Delete a file if it exists.
+///
+/// Return whether the file existed.
+ARROW_EXPORT
+Result<bool> DeleteFile(const PlatformFilename& file_path, bool allow_not_found = true);
+
+/// Return whether a file exists.
+ARROW_EXPORT
+Result<bool> FileExists(const PlatformFilename& path);
+
+// TODO expose this more publicly to make it available from io/file.h?
+/// A RAII wrapper for a file descriptor.
+///
+/// The underlying file descriptor is automatically closed on destruction.
+/// Moving is supported with well-defined semantics.
+/// Furthermore, closing is idempotent.
+class ARROW_EXPORT FileDescriptor {
+ public:
+  FileDescriptor() = default;
+  explicit FileDescriptor(int fd) : fd_(fd) {}
+  FileDescriptor(FileDescriptor&&);
+  FileDescriptor& operator=(FileDescriptor&&);
+
+  ~FileDescriptor();
+
+  Status Close();
+
+  /// May return -1 if closed or default-initialized
+  int fd() const { return fd_.load(); }
+
+  /// Detach and return the underlying file descriptor
+  int Detach();
+
+  bool closed() const { return fd_.load() == -1; }
+
+ protected:
+  static void CloseFromDestructor(int fd);
+
+  std::atomic<int> fd_{-1};
+};
+
+/// Open a file for reading and return a file descriptor.
+ARROW_EXPORT
+Result<FileDescriptor> FileOpenReadable(const PlatformFilename& file_name);
+
+/// Open a file for writing and return a file descriptor.
+ARROW_EXPORT
+Result<FileDescriptor> FileOpenWritable(const PlatformFilename& file_name,
+                                        bool write_only = true, bool truncate = true,
+                                        bool append = false);
+
+/// Read from current file position.  Return number of bytes read.
+ARROW_EXPORT
+Result<int64_t> FileRead(int fd, uint8_t* buffer, int64_t nbytes);
+/// Read from given file position.  Return number of bytes read.
+ARROW_EXPORT
+Result<int64_t> FileReadAt(int fd, uint8_t* buffer, int64_t position, int64_t nbytes);
+
+ARROW_EXPORT
+Status FileWrite(int fd, const uint8_t* buffer, const int64_t nbytes);
+ARROW_EXPORT
+Status FileTruncate(int fd, const int64_t size);
+
+ARROW_EXPORT
+Status FileSeek(int fd, int64_t pos);
+ARROW_EXPORT
+Status FileSeek(int fd, int64_t pos, int whence);
+ARROW_EXPORT
+Result<int64_t> FileTell(int fd);
+ARROW_EXPORT
+Result<int64_t> FileGetSize(int fd);
+
+ARROW_EXPORT
+Status FileClose(int fd);
+
+struct Pipe {
+  FileDescriptor rfd;
+  FileDescriptor wfd;
+
+  Status Close() { return rfd.Close() & wfd.Close(); }
+};
+
+ARROW_EXPORT
+Result<Pipe> CreatePipe();
+
+ARROW_EXPORT
+Status SetPipeFileDescriptorNonBlocking(int fd);
+
+class ARROW_EXPORT SelfPipe {
+ public:
+  static Result<std::shared_ptr<SelfPipe>> Make(bool signal_safe);
+  virtual ~SelfPipe();
+
+  /// \brief Wait for a wakeup.
+  ///
+  /// Status::Invalid is returned if the pipe has been shutdown.
+  /// Otherwise the next sent payload is returned.
+  virtual Result<uint64_t> Wait() = 0;
+
+  /// \brief Wake up the pipe by sending a payload.
+  ///
+  /// This method is async-signal-safe if `signal_safe` was set to true.
+  virtual void Send(uint64_t payload) = 0;
+
+  /// \brief Wake up the pipe and shut it down.
+  virtual Status Shutdown() = 0;
+};
+
+ARROW_EXPORT
+int64_t GetPageSize();
+
+struct MemoryRegion {
+  void* addr;
+  size_t size;
+};
+
+ARROW_EXPORT
+Status MemoryMapRemap(void* addr, size_t old_size, size_t new_size, int fildes,
+                      void** new_addr);
+ARROW_EXPORT
+Status MemoryAdviseWillNeed(const std::vector<MemoryRegion>& regions);
+
+// Returns KeyError if the environment variable doesn't exist
+ARROW_EXPORT
+Result<std::string> GetEnvVar(std::string_view name);
+ARROW_EXPORT
+Result<NativePathString> GetEnvVarNative(std::string_view name);
+
+ARROW_EXPORT
+Status SetEnvVar(std::string_view name, std::string_view value);
+ARROW_EXPORT
+Status DelEnvVar(std::string_view name);
+
+ARROW_EXPORT
+std::string ErrnoMessage(int errnum);
+#if _WIN32
+ARROW_EXPORT
+std::string WinErrorMessage(int errnum);
+#endif
+
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromErrno(int errnum);
+ARROW_EXPORT
+std::optional<int> ErrnoFromStatusDetail(const StatusDetail& detail);
+#if _WIN32
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromWinError(int errnum);
+#endif
+ARROW_EXPORT
+std::shared_ptr<StatusDetail> StatusDetailFromSignal(int signum);
+
+template <typename... Args>
+Status StatusFromErrno(int errnum, StatusCode code, Args&&... args) {
+  return Status::FromDetailAndArgs(code, StatusDetailFromErrno(errnum),
+                                   std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status IOErrorFromErrno(int errnum, Args&&... args) {
+  return StatusFromErrno(errnum, StatusCode::IOError, std::forward<Args>(args)...);
+}
+
+#if _WIN32
+template <typename... Args>
+Status StatusFromWinError(int errnum, StatusCode code, Args&&... args) {
+  return Status::FromDetailAndArgs(code, StatusDetailFromWinError(errnum),
+                                   std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status IOErrorFromWinError(int errnum, Args&&... args) {
+  return StatusFromWinError(errnum, StatusCode::IOError, std::forward<Args>(args)...);
+}
+#endif
+
+template <typename... Args>
+Status StatusFromSignal(int signum, StatusCode code, Args&&... args) {
+  return Status::FromDetailAndArgs(code, StatusDetailFromSignal(signum),
+                                   std::forward<Args>(args)...);
+}
+
+template <typename... Args>
+Status CancelledFromSignal(int signum, Args&&... args) {
+  return StatusFromSignal(signum, StatusCode::Cancelled, std::forward<Args>(args)...);
+}
+
+ARROW_EXPORT
+int ErrnoFromStatus(const Status&);
+
+// Always returns 0 on non-Windows platforms (for Python).
+ARROW_EXPORT
+int WinErrorFromStatus(const Status&);
+
+ARROW_EXPORT
+int SignalFromStatus(const Status&);
+
+class ARROW_EXPORT TemporaryDir {
+ public:
+  ~TemporaryDir();
+
+  /// '/'-terminated path to the temporary dir
+  const PlatformFilename& path() { return path_; }
+
+  /// Create a temporary subdirectory in the system temporary dir,
+  /// named starting with `prefix`.
+  static Result<std::unique_ptr<TemporaryDir>> Make(const std::string& prefix);
+
+ private:
+  PlatformFilename path_;
+
+  explicit TemporaryDir(PlatformFilename&&);
+};
+
+class ARROW_EXPORT SignalHandler {
+ public:
+  using Callback = void (*)(int);
+
+  SignalHandler();
+  explicit SignalHandler(Callback cb);
+#if ARROW_HAVE_SIGACTION
+  explicit SignalHandler(const struct sigaction& sa);
+#endif
+
+  Callback callback() const;
+#if ARROW_HAVE_SIGACTION
+  const struct sigaction& action() const;
+#endif
+
+ protected:
+#if ARROW_HAVE_SIGACTION
+  // Storing the full sigaction allows to restore the entire signal handling
+  // configuration.
+  struct sigaction sa_;
+#else
+  Callback cb_;
+#endif
+};
+
+/// \brief Return the current handler for the given signal number.
+ARROW_EXPORT
+Result<SignalHandler> GetSignalHandler(int signum);
+
+/// \brief Set a new handler for the given signal number.
+///
+/// The old signal handler is returned.
+ARROW_EXPORT
+Result<SignalHandler> SetSignalHandler(int signum, const SignalHandler& handler);
+
+/// \brief Reinstate the signal handler
+///
+/// For use in signal handlers.  This is needed on platforms without sigaction()
+/// such as Windows, as the default signal handler is restored there as
+/// soon as a signal is raised.
+ARROW_EXPORT
+void ReinstateSignalHandler(int signum, SignalHandler::Callback handler);
+
+/// \brief Send a signal to the current process
+///
+/// The thread which will receive the signal is unspecified.
+ARROW_EXPORT
+Status SendSignal(int signum);
+
+/// \brief Send a signal to the given thread
+///
+/// This function isn't supported on Windows.
+ARROW_EXPORT
+Status SendSignalToThread(int signum, uint64_t thread_id);
+
+/// \brief Get an unpredictable random seed
+///
+/// This function may be slightly costly, so should only be used to initialize
+/// a PRNG, not to generate a large amount of random numbers.
+/// It is better to use this function rather than std::random_device, unless
+/// absolutely necessary (e.g. to generate a cryptographic secret).
+ARROW_EXPORT
+int64_t GetRandomSeed();
+
+/// \brief Get the current thread id
+///
+/// In addition to having the same properties as std::thread, the returned value
+/// is a regular integer value, which is more convenient than an opaque type.
+ARROW_EXPORT
+uint64_t GetThreadId();
+
+/// \brief Get the current memory used by the current process in bytes
+///
+/// This function supports Windows, Linux, and Mac and will return 0 otherwise
+ARROW_EXPORT
+int64_t GetCurrentRSS();
+
+/// \brief Get the total memory available to the system in bytes
+///
+/// This function supports Windows, Linux, and Mac and will return 0 otherwise
+ARROW_EXPORT
+int64_t GetTotalMemoryBytes();
+
+/// \brief Get the number of affinity core on the system.
+///
+/// This is only implemented on Linux.
+/// If a value is returned, it is guaranteed to be greater or equal to one.
+ARROW_EXPORT Result<int32_t> GetNumAffinityCores();
+
+/// \brief Load a dynamic library
+///
+/// This wraps dlopen() except on Windows, where LoadLibrary() is called.
+/// These two platforms handle absolute paths consistently; relative paths
+/// or the library's bare name may be handled but inconsistently.
+///
+/// \return An opaque handle for the dynamic library, which can be used for
+///         subsequent symbol lookup. Nullptr will never be returned; instead
+///         an error will be raised.
+ARROW_EXPORT Result<void*> LoadDynamicLibrary(const PlatformFilename& path);
+
+/// \brief Load a dynamic library
+///
+/// An overload taking null terminated string.
+ARROW_EXPORT Result<void*> LoadDynamicLibrary(const char* path);
+
+/// \brief Retrieve a symbol by name from a library handle.
+///
+/// This wraps dlsym() except on Windows, where GetProcAddress() is called.
+///
+/// \return The address associated with the named symbol. Nullptr will never be
+///         returned; instead an error will be raised.
+ARROW_EXPORT Result<void*> GetSymbol(void* handle, const char* name);
+
+template <typename T>
+Result<T*> GetSymbolAs(void* handle, const char* name) {
+  ARROW_ASSIGN_OR_RAISE(void* sym, GetSymbol(handle, name));
+  return reinterpret_cast<T*>(sym);
+}
+
+}  // namespace arrow::internal
diff --git a/pyarrow/include/arrow/util/iterator.h b/pyarrow/include/arrow/util/iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc7fd1d84cc24f072ead1cb777e48548ca89005d
--- /dev/null
+++ b/pyarrow/include/arrow/util/iterator.h
@@ -0,0 +1,582 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/compare.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+template <typename T>
+class Iterator;
+
+template <typename T>
+struct IterationTraits {
+  /// \brief a reserved value which indicates the end of iteration. By
+  /// default this is NULLPTR since most iterators yield pointer types.
+  /// Specialize IterationTraits if different end semantics are required.
+  ///
+  /// Note: This should not be used to determine if a given value is a
+  /// terminal value.  Use IsIterationEnd (which uses IsEnd) instead.  This
+  /// is only for returning terminal values.
+  static T End() { return T(NULLPTR); }
+
+  /// \brief Checks to see if the value is a terminal value.
+  /// A method is used here since T is not necessarily comparable in many
+  /// cases even though it has a distinct final value
+  static bool IsEnd(const T& val) { return val == End(); }
+};
+
+template <typename T>
+T IterationEnd() {
+  return IterationTraits<T>::End();
+}
+
+template <typename T>
+bool IsIterationEnd(const T& val) {
+  return IterationTraits<T>::IsEnd(val);
+}
+
+template <typename T>
+struct IterationTraits<std::optional<T>> {
+  /// \brief by default when iterating through a sequence of optional,
+  /// nullopt indicates the end of iteration.
+  /// Specialize IterationTraits if different end semantics are required.
+  static std::optional<T> End() { return std::nullopt; }
+
+  /// \brief by default when iterating through a sequence of optional,
+  /// nullopt (!has_value()) indicates the end of iteration.
+  /// Specialize IterationTraits if different end semantics are required.
+  static bool IsEnd(const std::optional<T>& val) { return !val.has_value(); }
+
+  // TODO(bkietz) The range-for loop over Iterator<optional<T>> yields
+  // Result<optional<T>> which is unnecessary (since only the unyielded end optional
+  // is nullopt. Add IterationTraits::GetRangeElement() to handle this case
+};
+
+template <typename T>
+struct IterationTraits<Enumerated<T>> {
+  static Enumerated<T> End() { return Enumerated<T>{IterationEnd<T>(), -1, false}; }
+  static bool IsEnd(const Enumerated<T>& val) { return val.index < 0; }
+};
+
+/// \brief A generic Iterator that can return errors
+template <typename T>
+class Iterator : public util::EqualityComparable<Iterator<T>> {
+ public:
+  /// \brief Iterator may be constructed from any type which has a member function
+  /// with signature Result<T> Next();
+  /// End of iterator is signalled by returning IteratorTraits<T>::End();
+  ///
+  /// The argument is moved or copied to the heap and kept in a unique_ptr<void>. Only
+  /// its destructor and its Next method (which are stored in function pointers) are
+  /// referenced after construction.
+  ///
+  /// This approach is used to dodge MSVC linkage hell (ARROW-6244, ARROW-6558) when using
+  /// an abstract template base class: instead of being inlined as usual for a template
+  /// function the base's virtual destructor will be exported, leading to multiple
+  /// definition errors when linking to any other TU where the base is instantiated.
+  template <typename Wrapped>
+  explicit Iterator(Wrapped has_next)
+      : ptr_(new Wrapped(std::move(has_next)), Delete<Wrapped>), next_(Next<Wrapped>) {}
+
+  Iterator() : ptr_(NULLPTR, [](void*) {}) {}
+
+  /// \brief Return the next element of the sequence, IterationTraits<T>::End() when the
+  /// iteration is completed.
+  Result<T> Next() {
+    if (ptr_) {
+      auto next_result = next_(ptr_.get());
+      if (next_result.ok() && IsIterationEnd(next_result.ValueUnsafe())) {
+        ptr_.reset(NULLPTR);
+      }
+      return next_result;
+    } else {
+      return IterationTraits<T>::End();
+    }
+  }
+
+  /// Pass each element of the sequence to a visitor. Will return any error status
+  /// returned by the visitor, terminating iteration.
+  template <typename Visitor>
+  Status Visit(Visitor&& visitor) {
+    for (;;) {
+      ARROW_ASSIGN_OR_RAISE(auto value, Next());
+
+      if (IsIterationEnd(value)) break;
+
+      ARROW_RETURN_NOT_OK(visitor(std::move(value)));
+    }
+
+    return Status::OK();
+  }
+
+  /// Iterators will only compare equal if they are both null.
+  /// Equality comparability is required to make an Iterator of Iterators
+  /// (to check for the end condition).
+  bool Equals(const Iterator& other) const { return ptr_ == other.ptr_; }
+
+  explicit operator bool() const { return ptr_ != NULLPTR; }
+
+  class RangeIterator {
+   public:
+    RangeIterator() : value_(IterationTraits<T>::End()) {}
+
+    explicit RangeIterator(Iterator i)
+        : value_(IterationTraits<T>::End()),
+          iterator_(std::make_shared<Iterator>(std::move(i))) {
+      Next();
+    }
+
+    bool operator!=(const RangeIterator& other) const { return value_ != other.value_; }
+
+    RangeIterator& operator++() {
+      Next();
+      return *this;
+    }
+
+    Result<T> operator*() {
+      ARROW_RETURN_NOT_OK(value_);
+
+      auto value = std::move(value_);
+      value_ = IterationTraits<T>::End();
+      return value;
+    }
+
+   private:
+    void Next() {
+      if (!value_.ok()) {
+        value_ = IterationTraits<T>::End();
+        return;
+      }
+      value_ = iterator_->Next();
+    }
+
+    Result<T> value_;
+    std::shared_ptr<Iterator> iterator_;
+  };
+
+  RangeIterator begin() { return RangeIterator(std::move(*this)); }
+
+  RangeIterator end() { return RangeIterator(); }
+
+  /// \brief Move every element of this iterator into a vector.
+  Result<std::vector<T>> ToVector() {
+    std::vector<T> out;
+    for (auto maybe_element : *this) {
+      ARROW_ASSIGN_OR_RAISE(auto element, maybe_element);
+      out.push_back(std::move(element));
+    }
+    return out;
+  }
+
+ private:
+  /// Implementation of deleter for ptr_: Casts from void* to the wrapped type and
+  /// deletes that.
+  template <typename HasNext>
+  static void Delete(void* ptr) {
+    delete static_cast<HasNext*>(ptr);
+  }
+
+  /// Implementation of Next: Casts from void* to the wrapped type and invokes that
+  /// type's Next member function.
+  template <typename HasNext>
+  static Result<T> Next(void* ptr) {
+    return static_cast<HasNext*>(ptr)->Next();
+  }
+
+  /// ptr_ is a unique_ptr to void with a custom deleter: a function pointer which first
+  /// casts from void* to a pointer to the wrapped type then deletes that.
+  std::unique_ptr<void, void (*)(void*)> ptr_;
+
+  /// next_ is a function pointer which first casts from void* to a pointer to the wrapped
+  /// type then invokes its Next member function.
+  Result<T> (*next_)(void*) = NULLPTR;
+};
+
+template <typename T>
+struct TransformFlow {
+  using YieldValueType = T;
+
+  TransformFlow(YieldValueType value, bool ready_for_next)
+      : finished_(false),
+        ready_for_next_(ready_for_next),
+        yield_value_(std::move(value)) {}
+  TransformFlow(bool finished, bool ready_for_next)
+      : finished_(finished), ready_for_next_(ready_for_next), yield_value_() {}
+
+  bool HasValue() const { return yield_value_.has_value(); }
+  bool Finished() const { return finished_; }
+  bool ReadyForNext() const { return ready_for_next_; }
+  T Value() const { return *yield_value_; }
+
+  bool finished_ = false;
+  bool ready_for_next_ = false;
+  std::optional<YieldValueType> yield_value_;
+};
+
+struct TransformFinish {
+  template <typename T>
+  operator TransformFlow<T>() && {  // NOLINT explicit
+    return TransformFlow<T>(true, true);
+  }
+};
+
+struct TransformSkip {
+  template <typename T>
+  operator TransformFlow<T>() && {  // NOLINT explicit
+    return TransformFlow<T>(false, true);
+  }
+};
+
+template <typename T>
+TransformFlow<T> TransformYield(T value = {}, bool ready_for_next = true) {
+  return TransformFlow<T>(std::move(value), ready_for_next);
+}
+
+template <typename T, typename V>
+using Transformer = std::function<Result<TransformFlow<V>>(T)>;
+
+template <typename T, typename V>
+class TransformIterator {
+ public:
+  explicit TransformIterator(Iterator<T> it, Transformer<T, V> transformer)
+      : it_(std::move(it)),
+        transformer_(std::move(transformer)),
+        last_value_(),
+        finished_() {}
+
+  Result<V> Next() {
+    while (!finished_) {
+      ARROW_ASSIGN_OR_RAISE(std::optional<V> next, Pump());
+      if (next.has_value()) {
+        return std::move(*next);
+      }
+      ARROW_ASSIGN_OR_RAISE(last_value_, it_.Next());
+    }
+    return IterationTraits<V>::End();
+  }
+
+ private:
+  // Calls the transform function on the current value.  Can return in several ways
+  // * If the next value is requested (e.g. skip) it will return an empty optional
+  // * If an invalid status is encountered that will be returned
+  // * If finished it will return IterationTraits<V>::End()
+  // * If a value is returned by the transformer that will be returned
+  Result<std::optional<V>> Pump() {
+    if (!finished_ && last_value_.has_value()) {
+      auto next_res = transformer_(*last_value_);
+      if (!next_res.ok()) {
+        finished_ = true;
+        return next_res.status();
+      }
+      auto next = std::move(*next_res);
+      if (next.ReadyForNext()) {
+        if (IsIterationEnd(*last_value_)) {
+          finished_ = true;
+        }
+        last_value_.reset();
+      }
+      if (next.Finished()) {
+        finished_ = true;
+      }
+      if (next.HasValue()) {
+        return next.Value();
+      }
+    }
+    if (finished_) {
+      return IterationTraits<V>::End();
+    }
+    return std::nullopt;
+  }
+
+  Iterator<T> it_;
+  Transformer<T, V> transformer_;
+  std::optional<T> last_value_;
+  bool finished_ = false;
+};
+
+/// \brief Transforms an iterator according to a transformer, returning a new Iterator.
+///
+/// The transformer will be called on each element of the source iterator and for each
+/// call it can yield a value, skip, or finish the iteration.  When yielding a value the
+/// transformer can choose to consume the source item (the default, ready_for_next = true)
+/// or to keep it and it will be called again on the same value.
+///
+/// This is essentially a more generic form of the map operation that can return 0, 1, or
+/// many values for each of the source items.
+///
+/// The transformer will be exposed to the end of the source sequence
+/// (IterationTraits::End) in case it needs to return some penultimate item(s).
+///
+/// Any invalid status returned by the transformer will be returned immediately.
+template <typename T, typename V>
+Iterator<V> MakeTransformedIterator(Iterator<T> it, Transformer<T, V> op) {
+  return Iterator<V>(TransformIterator<T, V>(std::move(it), std::move(op)));
+}
+
+template <typename T>
+struct IterationTraits<Iterator<T>> {
+  // The end condition for an Iterator of Iterators is a default constructed (null)
+  // Iterator.
+  static Iterator<T> End() { return Iterator<T>(); }
+  static bool IsEnd(const Iterator<T>& val) { return !val; }
+};
+
+template <typename Fn, typename T>
+class FunctionIterator {
+ public:
+  explicit FunctionIterator(Fn fn) : fn_(std::move(fn)) {}
+
+  Result<T> Next() { return fn_(); }
+
+ private:
+  Fn fn_;
+};
+
+/// \brief Construct an Iterator which invokes a callable on Next()
+template <typename Fn,
+          typename Ret = typename internal::call_traits::return_type<Fn>::ValueType>
+Iterator<Ret> MakeFunctionIterator(Fn fn) {
+  return Iterator<Ret>(FunctionIterator<Fn, Ret>(std::move(fn)));
+}
+
+template <typename T>
+Iterator<T> MakeEmptyIterator() {
+  return MakeFunctionIterator([]() -> Result<T> { return IterationTraits<T>::End(); });
+}
+
+template <typename T>
+Iterator<T> MakeErrorIterator(Status s) {
+  return MakeFunctionIterator([s]() -> Result<T> {
+    ARROW_RETURN_NOT_OK(s);
+    return IterationTraits<T>::End();
+  });
+}
+
+/// \brief Simple iterator which yields the elements of a std::vector
+template <typename T>
+class VectorIterator {
+ public:
+  explicit VectorIterator(std::vector<T> v) : elements_(std::move(v)) {}
+
+  Result<T> Next() {
+    if (i_ == elements_.size()) {
+      return IterationTraits<T>::End();
+    }
+    return std::move(elements_[i_++]);
+  }
+
+ private:
+  std::vector<T> elements_;
+  size_t i_ = 0;
+};
+
+template <typename T>
+Iterator<T> MakeVectorIterator(std::vector<T> v) {
+  return Iterator<T>(VectorIterator<T>(std::move(v)));
+}
+
+/// \brief Simple iterator which yields *pointers* to the elements of a std::vector<T>.
+/// This is provided to support T where IterationTraits<T>::End is not specialized
+template <typename T>
+class VectorPointingIterator {
+ public:
+  explicit VectorPointingIterator(std::vector<T> v) : elements_(std::move(v)) {}
+
+  Result<T*> Next() {
+    if (i_ == elements_.size()) {
+      return NULLPTR;
+    }
+    return &elements_[i_++];
+  }
+
+ private:
+  std::vector<T> elements_;
+  size_t i_ = 0;
+};
+
+template <typename T>
+Iterator<T*> MakeVectorPointingIterator(std::vector<T> v) {
+  return Iterator<T*>(VectorPointingIterator<T>(std::move(v)));
+}
+
+/// \brief MapIterator takes ownership of an iterator and a function to apply
+/// on every element. The mapped function is not allowed to fail.
+template <typename Fn, typename I, typename O>
+class MapIterator {
+ public:
+  explicit MapIterator(Fn map, Iterator<I> it)
+      : map_(std::move(map)), it_(std::move(it)) {}
+
+  Result<O> Next() {
+    ARROW_ASSIGN_OR_RAISE(I i, it_.Next());
+
+    if (IsIterationEnd(i)) {
+      return IterationTraits<O>::End();
+    }
+
+    return map_(std::move(i));
+  }
+
+ private:
+  Fn map_;
+  Iterator<I> it_;
+};
+
+/// \brief MapIterator takes ownership of an iterator and a function to apply
+/// on every element. The mapped function is not allowed to fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+          typename To = internal::call_traits::return_type<Fn>>
+Iterator<To> MakeMapIterator(Fn map, Iterator<From> it) {
+  return Iterator<To>(MapIterator<Fn, From, To>(std::move(map), std::move(it)));
+}
+
+/// \brief Like MapIterator, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+          typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Iterator<To> MakeMaybeMapIterator(Fn map, Iterator<From> it) {
+  return Iterator<To>(MapIterator<Fn, From, To>(std::move(map), std::move(it)));
+}
+
+struct FilterIterator {
+  enum Action { ACCEPT, REJECT };
+
+  template <typename To>
+  static Result<std::pair<To, Action>> Reject() {
+    return std::make_pair(IterationTraits<To>::End(), REJECT);
+  }
+
+  template <typename To>
+  static Result<std::pair<To, Action>> Accept(To out) {
+    return std::make_pair(std::move(out), ACCEPT);
+  }
+
+  template <typename To>
+  static Result<std::pair<To, Action>> MaybeAccept(Result<To> maybe_out) {
+    return std::move(maybe_out).Map(Accept<To>);
+  }
+
+  template <typename To>
+  static Result<std::pair<To, Action>> Error(Status s) {
+    return s;
+  }
+
+  template <typename Fn, typename From, typename To>
+  class Impl {
+   public:
+    explicit Impl(Fn filter, Iterator<From> it) : filter_(filter), it_(std::move(it)) {}
+
+    Result<To> Next() {
+      To out = IterationTraits<To>::End();
+      Action action;
+
+      for (;;) {
+        ARROW_ASSIGN_OR_RAISE(From i, it_.Next());
+
+        if (IsIterationEnd(i)) {
+          return IterationTraits<To>::End();
+        }
+
+        ARROW_ASSIGN_OR_RAISE(std::tie(out, action), filter_(std::move(i)));
+
+        if (action == ACCEPT) return out;
+      }
+    }
+
+   private:
+    Fn filter_;
+    Iterator<From> it_;
+  };
+};
+
+/// \brief Like MapIterator, but where the function can fail or reject elements.
+template <
+    typename Fn, typename From = typename internal::call_traits::argument_type<0, Fn>,
+    typename Ret = typename internal::call_traits::return_type<Fn>::ValueType,
+    typename To = typename std::tuple_element<0, Ret>::type,
+    typename Enable = typename std::enable_if<std::is_same<
+        typename std::tuple_element<1, Ret>::type, FilterIterator::Action>::value>::type>
+Iterator<To> MakeFilterIterator(Fn filter, Iterator<From> it) {
+  return Iterator<To>(
+      FilterIterator::Impl<Fn, From, To>(std::move(filter), std::move(it)));
+}
+
+/// \brief FlattenIterator takes an iterator generating iterators and yields a
+/// unified iterator that flattens/concatenates in a single stream.
+template <typename T>
+class FlattenIterator {
+ public:
+  explicit FlattenIterator(Iterator<Iterator<T>> it) : parent_(std::move(it)) {}
+
+  Result<T> Next() {
+    if (IsIterationEnd(child_)) {
+      // Pop from parent's iterator.
+      ARROW_ASSIGN_OR_RAISE(child_, parent_.Next());
+
+      // Check if final iteration reached.
+      if (IsIterationEnd(child_)) {
+        return IterationTraits<T>::End();
+      }
+
+      return Next();
+    }
+
+    // Pop from child_ and check for depletion.
+    ARROW_ASSIGN_OR_RAISE(T out, child_.Next());
+    if (IsIterationEnd(out)) {
+      // Reset state such that we pop from parent on the recursive call
+      child_ = IterationTraits<Iterator<T>>::End();
+
+      return Next();
+    }
+
+    return out;
+  }
+
+ private:
+  Iterator<Iterator<T>> parent_;
+  Iterator<T> child_ = IterationTraits<Iterator<T>>::End();
+};
+
+template <typename T>
+Iterator<T> MakeFlattenIterator(Iterator<Iterator<T>> it) {
+  return Iterator<T>(FlattenIterator<T>(std::move(it)));
+}
+
+template <typename Reader>
+Iterator<typename Reader::ValueType> MakeIteratorFromReader(
+    const std::shared_ptr<Reader>& reader) {
+  return MakeFunctionIterator([reader] { return reader->Next(); });
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/key_value_metadata.h b/pyarrow/include/arrow/util/key_value_metadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..57ade11e758684777fc8e2828c9c3d1b9deb0bee
--- /dev/null
+++ b/pyarrow/include/arrow/util/key_value_metadata.h
@@ -0,0 +1,99 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief A container for key-value pair type metadata. Not thread-safe
+class ARROW_EXPORT KeyValueMetadata {
+ public:
+  KeyValueMetadata();
+  KeyValueMetadata(std::vector<std::string> keys, std::vector<std::string> values);
+  explicit KeyValueMetadata(const std::unordered_map<std::string, std::string>& map);
+
+  static std::shared_ptr<KeyValueMetadata> Make(std::vector<std::string> keys,
+                                                std::vector<std::string> values);
+
+  void ToUnorderedMap(std::unordered_map<std::string, std::string>* out) const;
+  void Append(std::string key, std::string value);
+
+  Result<std::string> Get(std::string_view key) const;
+  bool Contains(std::string_view key) const;
+  // Note that deleting may invalidate known indices
+  Status Delete(std::string_view key);
+  Status Delete(int64_t index);
+  Status DeleteMany(std::vector<int64_t> indices);
+  Status Set(std::string key, std::string value);
+
+  void reserve(int64_t n);
+
+  int64_t size() const;
+  const std::string& key(int64_t i) const;
+  const std::string& value(int64_t i) const;
+  const std::vector<std::string>& keys() const { return keys_; }
+  const std::vector<std::string>& values() const { return values_; }
+
+  std::vector<std::pair<std::string, std::string>> sorted_pairs() const;
+
+  /// \brief Perform linear search for key, returning -1 if not found
+  int FindKey(std::string_view key) const;
+
+  std::shared_ptr<KeyValueMetadata> Copy() const;
+
+  /// \brief Return a new KeyValueMetadata by combining the passed metadata
+  /// with this KeyValueMetadata. Colliding keys will be overridden by the
+  /// passed metadata. Assumes keys in both containers are unique
+  std::shared_ptr<KeyValueMetadata> Merge(const KeyValueMetadata& other) const;
+
+  bool Equals(const KeyValueMetadata& other) const;
+  std::string ToString() const;
+
+ private:
+  std::vector<std::string> keys_;
+  std::vector<std::string> values_;
+
+  ARROW_DISALLOW_COPY_AND_ASSIGN(KeyValueMetadata);
+};
+
+/// \brief Create a KeyValueMetadata instance
+///
+/// \param pairs key-value mapping
+ARROW_EXPORT std::shared_ptr<KeyValueMetadata> key_value_metadata(
+    const std::unordered_map<std::string, std::string>& pairs);
+
+/// \brief Create a KeyValueMetadata instance
+///
+/// \param keys sequence of metadata keys
+/// \param values sequence of corresponding metadata values
+ARROW_EXPORT std::shared_ptr<KeyValueMetadata> key_value_metadata(
+    std::vector<std::string> keys, std::vector<std::string> values);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/launder.h b/pyarrow/include/arrow/util/launder.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e4533c4b4760a416b0aca4b91c32ffd324d7f08
--- /dev/null
+++ b/pyarrow/include/arrow/util/launder.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <new>
+
+namespace arrow {
+namespace internal {
+
+#if __cpp_lib_launder
+using std::launder;
+#else
+template <class T>
+constexpr T* launder(T* p) noexcept {
+  return p;
+}
+#endif
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/list_util.h b/pyarrow/include/arrow/util/list_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..58deb8019d94155e4488af7e3047e599abb7197b
--- /dev/null
+++ b/pyarrow/include/arrow/util/list_util.h
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <utility>
+
+#include "arrow/array/data.h"
+#include "arrow/result.h"
+
+namespace arrow {
+namespace list_util {
+namespace internal {
+
+/// \brief Calculate the smallest continuous range of values used by the
+/// var-length list-like input (list, map and list-view types).
+///
+/// \param input The input array such that is_var_length_list_like(input.type)
+/// is true
+/// \return A pair of (offset, length) describing the range
+ARROW_EXPORT Result<std::pair<int64_t, int64_t>> RangeOfValuesUsed(
+    const ArraySpan& input);
+
+/// \brief Calculate the sum of the sizes of all valid lists or list-views
+///
+/// This is usually the same as the length of the RangeOfValuesUsed() range, but
+/// it can be:
+/// - Smaller: when the child array contains many values that are not
+/// referenced by the lists or list-views in the parent array
+/// - Greater: when the list-views share child array ranges
+///
+/// \param input The input array such that is_var_length_list_like(input.type)
+/// is true
+/// \return The sum of all list or list-view sizes
+ARROW_EXPORT Result<int64_t> SumOfLogicalListSizes(const ArraySpan& input);
+
+}  // namespace internal
+
+}  // namespace list_util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/logger.h b/pyarrow/include/arrow/util/logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..7832f4a4c223270eb92a1912a22c2e1e81e90b90
--- /dev/null
+++ b/pyarrow/include/arrow/util/logger.h
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <iosfwd>
+#include <memory>
+#include <string_view>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+struct SourceLocation {
+  const char* file = "";
+  int line = 0;
+};
+
+struct LogDetails {
+  ArrowLogLevel severity = ArrowLogLevel::ARROW_INFO;
+  std::chrono::system_clock::time_point timestamp = std::chrono::system_clock::now();
+  SourceLocation source_location{};
+  std::string_view message = "";
+};
+
+/// \brief A base interface for custom loggers.
+///
+/// Loggers can be added to the LoggerRegistry for global access or directly provided to
+/// certain logging utilities.
+class Logger {
+ public:
+  virtual ~Logger() = default;
+
+  virtual void Log(const LogDetails& details) = 0;
+
+  virtual bool Flush(std::chrono::microseconds timeout) { return true; }
+  bool Flush() { return this->Flush(std::chrono::microseconds::max()); }
+
+  virtual bool is_enabled() const { return true; }
+
+  virtual ArrowLogLevel severity_threshold() const { return ArrowLogLevel::ARROW_TRACE; }
+};
+
+/// \brief Creates a simple logger that redirects output to std::cerr
+ARROW_EXPORT std::shared_ptr<Logger> MakeOStreamLogger(ArrowLogLevel severity_threshold);
+/// \brief Creates a simple logger that redirects output to the provided ostream
+ARROW_EXPORT std::shared_ptr<Logger> MakeOStreamLogger(ArrowLogLevel severity_threshold,
+                                                       std::ostream& sink);
+
+class ARROW_EXPORT LoggerRegistry {
+ public:
+  /// \brief Add a logger to the registry with the associated name
+  ///
+  /// Returns Invalid if a logger with the provided name already exists. Users should call
+  /// `UnregisterLogger` first if they wish to overwrite it.
+  static Status RegisterLogger(std::string_view name, std::shared_ptr<Logger> logger);
+
+  /// \brief Remove a logger from the registry
+  static void UnregisterLogger(std::string_view name);
+
+  /// \brief Return the logger associated with the provided name
+  ///
+  /// If `name` is empty, the default logger is returned. If `name` doesn't match any of
+  /// the registered loggers then a non-null noop logger is returned
+  static std::shared_ptr<Logger> GetLogger(std::string_view name = "");
+
+  /// \brief Return the default logger
+  static std::shared_ptr<Logger> GetDefaultLogger();
+  /// \brief Set the default logger
+  static void SetDefaultLogger(std::shared_ptr<Logger> logger);
+};
+
+/// \brief Represents a single log record to be emitted by an underlying logger
+class ARROW_EXPORT LogMessage {
+ public:
+  /// \brief Construct a LogMessage with the provided underlying logger
+  LogMessage(ArrowLogLevel severity, std::shared_ptr<Logger> logger,
+             SourceLocation source_location = {});
+  /// \brief Construct a LogMessage with the provided logger name, which will be used to
+  /// find an underlying logger in the registry
+  LogMessage(ArrowLogLevel severity, std::string_view logger_name,
+             SourceLocation source_location = {});
+
+  std::ostream& Stream();
+
+  // Convenience method - mainly for use in ARROW_LOG_* macros. This prevents unnecessary
+  // argument evaluation when log statements are stripped in certain builds
+  template <typename... Args>
+  LogMessage& Append(Args&&... args) {
+    if constexpr (sizeof...(Args) > 0) {
+      if (CheckIsEnabled()) {
+        (Stream() << ... << args);
+      }
+    }
+    return *this;
+  }
+
+ private:
+  bool CheckIsEnabled();
+
+  class Impl;
+  std::shared_ptr<Impl> impl_;
+};
+
+}  // namespace util
+}  // namespace arrow
+
+// For the following macros, log statements with a lower severity than
+// `ARROW_MINIMUM_LOG_LEVEL` will be stripped from the build
+#ifndef ARROW_MINIMUM_LOG_LEVEL
+#  define ARROW_MINIMUM_LOG_LEVEL -1000
+#endif
+
+#define ARROW_LOGGER_INTERNAL(LOGGER, LEVEL)                                      \
+  (::arrow::util::LogMessage(::arrow::util::ArrowLogLevel::ARROW_##LEVEL, LOGGER, \
+                             ::arrow::util::SourceLocation{__FILE__, __LINE__}))
+
+static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_TRACE) == -2);
+#if ARROW_MINIMUM_LOG_LEVEL <= -2
+#  define ARROW_LOGGER_TRACE(LOGGER, ...) \
+    (ARROW_LOGGER_INTERNAL(LOGGER, TRACE).Append(__VA_ARGS__))
+#else
+#  define ARROW_LOGGER_TRACE(...) ARROW_UNUSED(0)
+#endif
+
+static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_DEBUG) == -1);
+#if ARROW_MINIMUM_LOG_LEVEL <= -1
+#  define ARROW_LOGGER_DEBUG(LOGGER, ...) \
+    (ARROW_LOGGER_INTERNAL(LOGGER, DEBUG).Append(__VA_ARGS__))
+#else
+#  define ARROW_LOGGER_DEBUG(...) ARROW_UNUSED(0)
+#endif
+
+static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_INFO) == 0);
+#if ARROW_MINIMUM_LOG_LEVEL <= 0
+#  define ARROW_LOGGER_INFO(LOGGER, ...) \
+    (ARROW_LOGGER_INTERNAL(LOGGER, INFO).Append(__VA_ARGS__))
+#else
+#  define ARROW_LOGGER_INFO(...) ARROW_UNUSED(0)
+#endif
+
+static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_WARNING) == 1);
+#if ARROW_MINIMUM_LOG_LEVEL <= 1
+#  define ARROW_LOGGER_WARNING(LOGGER, ...) \
+    (ARROW_LOGGER_INTERNAL(LOGGER, WARNING).Append(__VA_ARGS__))
+#else
+#  define ARROW_LOGGER_WARNING(...) ARROW_UNUSED(0)
+#endif
+
+static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_ERROR) == 2);
+#if ARROW_MINIMUM_LOG_LEVEL <= 2
+#  define ARROW_LOGGER_ERROR(LOGGER, ...) \
+    (ARROW_LOGGER_INTERNAL(LOGGER, ERROR).Append(__VA_ARGS__))
+#else
+#  define ARROW_LOGGER_ERROR(...) ARROW_UNUSED(0)
+#endif
+
+static_assert(static_cast<int>(::arrow::util::ArrowLogLevel::ARROW_FATAL) == 3);
+#if ARROW_MINIMUM_LOG_LEVEL <= 3
+#  define ARROW_LOGGER_FATAL(LOGGER, ...) \
+    (ARROW_LOGGER_INTERNAL(LOGGER, FATAL).Append(__VA_ARGS__))
+#else
+#  define ARROW_LOGGER_FATAL(...) ARROW_UNUSED(0)
+#endif
+
+#define ARROW_LOGGER_CALL(LOGGER, LEVEL, ...) ARROW_LOGGER_##LEVEL(LOGGER, __VA_ARGS__)
diff --git a/pyarrow/include/arrow/util/logging.h b/pyarrow/include/arrow/util/logging.h
new file mode 100644
index 0000000000000000000000000000000000000000..460888f6d75ecf600203f38939f9c82fd2d5a1be
--- /dev/null
+++ b/pyarrow/include/arrow/util/logging.h
@@ -0,0 +1,251 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef GANDIVA_IR
+
+// The LLVM IR code doesn't have an NDEBUG mode. And, it shouldn't include references to
+// streams or stdc++. So, making the DCHECK calls void in that case.
+
+#  define ARROW_IGNORE_EXPR(expr) ((void)(expr))
+
+#  define ARROW_DCHECK(condition) ARROW_IGNORE_EXPR(condition)
+#  define ARROW_DCHECK_OK(status) ARROW_IGNORE_EXPR(status)
+#  define ARROW_DCHECK_EQ(val1, val2) ARROW_IGNORE_EXPR(val1)
+#  define ARROW_DCHECK_NE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#  define ARROW_DCHECK_LE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#  define ARROW_DCHECK_LT(val1, val2) ARROW_IGNORE_EXPR(val1)
+#  define ARROW_DCHECK_GE(val1, val2) ARROW_IGNORE_EXPR(val1)
+#  define ARROW_DCHECK_GT(val1, val2) ARROW_IGNORE_EXPR(val1)
+
+#else  // !GANDIVA_IR
+
+#  include <memory>
+#  include <ostream>
+#  include <string>
+
+#  include "arrow/util/macros.h"
+#  include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+enum class ArrowLogLevel : int {
+  ARROW_TRACE = -2,
+  ARROW_DEBUG = -1,
+  ARROW_INFO = 0,
+  ARROW_WARNING = 1,
+  ARROW_ERROR = 2,
+  ARROW_FATAL = 3
+};
+
+#  define ARROW_LOG_INTERNAL(level) ::arrow::util::ArrowLog(__FILE__, __LINE__, level)
+#  define ARROW_LOG(level) ARROW_LOG_INTERNAL(::arrow::util::ArrowLogLevel::ARROW_##level)
+
+#  define ARROW_IGNORE_EXPR(expr) ((void)(expr))
+
+#  define ARROW_CHECK_OR_LOG(condition, level) \
+    ARROW_PREDICT_TRUE(condition)              \
+    ? ARROW_IGNORE_EXPR(0)                     \
+    : ::arrow::util::Voidify() & ARROW_LOG(level) << " Check failed: " #condition " "
+
+#  define ARROW_CHECK(condition) ARROW_CHECK_OR_LOG(condition, FATAL)
+
+// If 'to_call' returns a bad status, CHECK immediately with a logged message
+// of 'msg' followed by the status.
+#  define ARROW_CHECK_OK_PREPEND(to_call, msg, level)                 \
+    do {                                                              \
+      ::arrow::Status _s = ::arrow::ToStatus(to_call);                \
+      ARROW_CHECK_OR_LOG(_s.ok(), level)                              \
+          << "Operation failed: " << ARROW_STRINGIFY(to_call) << "\n" \
+          << (msg) << ": " << _s.ToString();                          \
+    } while (false)
+
+// If the status is bad, CHECK immediately, appending the status to the
+// logged message.
+#  define ARROW_CHECK_OK(s) ARROW_CHECK_OK_PREPEND(s, "Bad status", FATAL)
+
+#  define ARROW_CHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2))
+#  define ARROW_CHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2))
+#  define ARROW_CHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2))
+#  define ARROW_CHECK_LT(val1, val2) ARROW_CHECK((val1) < (val2))
+#  define ARROW_CHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2))
+#  define ARROW_CHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2))
+
+#  ifdef NDEBUG
+#    define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_WARNING
+
+// CAUTION: DCHECK_OK() always evaluates its argument, but other DCHECK*() macros
+// only do so in debug mode.
+
+#    define ARROW_DCHECK(condition)               \
+      while (false) ARROW_IGNORE_EXPR(condition); \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_OK(s) \
+      ARROW_IGNORE_EXPR(s);    \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_EQ(val1, val2)      \
+      while (false) ARROW_IGNORE_EXPR(val1); \
+      while (false) ARROW_IGNORE_EXPR(val2); \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_NE(val1, val2)      \
+      while (false) ARROW_IGNORE_EXPR(val1); \
+      while (false) ARROW_IGNORE_EXPR(val2); \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_LE(val1, val2)      \
+      while (false) ARROW_IGNORE_EXPR(val1); \
+      while (false) ARROW_IGNORE_EXPR(val2); \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_LT(val1, val2)      \
+      while (false) ARROW_IGNORE_EXPR(val1); \
+      while (false) ARROW_IGNORE_EXPR(val2); \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_GE(val1, val2)      \
+      while (false) ARROW_IGNORE_EXPR(val1); \
+      while (false) ARROW_IGNORE_EXPR(val2); \
+      while (false) ::arrow::util::detail::NullLog()
+#    define ARROW_DCHECK_GT(val1, val2)      \
+      while (false) ARROW_IGNORE_EXPR(val1); \
+      while (false) ARROW_IGNORE_EXPR(val2); \
+      while (false) ::arrow::util::detail::NullLog()
+
+#  else
+#    define ARROW_DFATAL ::arrow::util::ArrowLogLevel::ARROW_FATAL
+
+#    define ARROW_DCHECK ARROW_CHECK
+#    define ARROW_DCHECK_OK ARROW_CHECK_OK
+#    define ARROW_DCHECK_EQ ARROW_CHECK_EQ
+#    define ARROW_DCHECK_NE ARROW_CHECK_NE
+#    define ARROW_DCHECK_LE ARROW_CHECK_LE
+#    define ARROW_DCHECK_LT ARROW_CHECK_LT
+#    define ARROW_DCHECK_GE ARROW_CHECK_GE
+#    define ARROW_DCHECK_GT ARROW_CHECK_GT
+
+#  endif  // NDEBUG
+
+// This code is adapted from
+// https://github.com/ray-project/ray/blob/master/src/ray/util/logging.h.
+
+// To make the logging lib pluggable with other logging libs and make
+// the implementation unawared by the user, ArrowLog is only a declaration
+// which hide the implementation into logging.cc file.
+// In logging.cc, we can choose different log libs using different macros.
+
+// This is also a null log which does not output anything.
+class ARROW_EXPORT ArrowLogBase {
+ public:
+  virtual ~ArrowLogBase() {}
+
+  virtual bool IsEnabled() const { return false; }
+
+  template <typename T>
+  ArrowLogBase& operator<<(const T& t) {
+    if (IsEnabled()) {
+      Stream() << t;
+    }
+    return *this;
+  }
+
+ protected:
+  virtual std::ostream& Stream() = 0;
+};
+
+class ARROW_EXPORT ArrowLog : public ArrowLogBase {
+ public:
+  ArrowLog(const char* file_name, int line_number, ArrowLogLevel severity);
+  ~ArrowLog() override;
+
+  /// Return whether or not current logging instance is enabled.
+  ///
+  /// \return True if logging is enabled and false otherwise.
+  bool IsEnabled() const override;
+
+  /// The init function of arrow log for a program which should be called only once.
+  ///
+  /// \param appName The app name which starts the log.
+  /// \param severity_threshold Logging threshold for the program.
+  /// \param logDir Logging output file name. If empty, the log won't output to file.
+  static void StartArrowLog(const std::string& appName,
+                            ArrowLogLevel severity_threshold = ArrowLogLevel::ARROW_INFO,
+                            const std::string& logDir = "");
+
+  /// The shutdown function of arrow log, it should be used with StartArrowLog as a pair.
+  static void ShutDownArrowLog();
+
+  /// Install the failure signal handler to output call stack when crash.
+  /// If glog is not installed, this function won't do anything.
+  static void InstallFailureSignalHandler();
+
+  /// Uninstall the signal actions installed by InstallFailureSignalHandler.
+  static void UninstallSignalAction();
+
+  /// Return whether or not the log level is enabled in current setting.
+  ///
+  /// \param log_level The input log level to test.
+  /// \return True if input log level is not lower than the threshold.
+  static bool IsLevelEnabled(ArrowLogLevel log_level);
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(ArrowLog);
+
+  // Hide the implementation of log provider by void *.
+  // Otherwise, lib user may define the same macro to use the correct header file.
+  void* logging_provider_;
+  /// True if log messages should be logged and false if they should be ignored.
+  bool is_enabled_;
+
+  static ArrowLogLevel severity_threshold_;
+
+ protected:
+  std::ostream& Stream() override;
+};
+
+// This class make ARROW_CHECK compilation pass to change the << operator to void.
+// This class is copied from glog.
+class ARROW_EXPORT Voidify {
+ public:
+  Voidify() {}
+  // This has to be an operator with a precedence lower than << but
+  // higher than ?:
+  void operator&(ArrowLogBase&) {}
+};
+
+namespace detail {
+
+/// @brief A helper for the nil log sink.
+///
+/// Using this helper is analogous to sending log messages to /dev/null:
+/// nothing gets logged.
+class NullLog {
+ public:
+  /// The no-op output operator.
+  ///
+  /// @param [in] t
+  ///   The object to send into the nil sink.
+  /// @return Reference to the updated object.
+  template <class T>
+  NullLog& operator<<(const T& t) {
+    return *this;
+  }
+};
+
+}  // namespace detail
+}  // namespace util
+}  // namespace arrow
+
+#endif  // GANDIVA_IR
diff --git a/pyarrow/include/arrow/util/macros.h b/pyarrow/include/arrow/util/macros.h
new file mode 100644
index 0000000000000000000000000000000000000000..55bc1eeb1d2d5c6f93e94fadb5bc1c03cbce34b7
--- /dev/null
+++ b/pyarrow/include/arrow/util/macros.h
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#define ARROW_EXPAND(x) x
+#define ARROW_STRINGIFY(x) #x
+#define ARROW_CONCAT(x, y) x##y
+
+// From Google gutil
+#ifndef ARROW_DISALLOW_COPY_AND_ASSIGN
+#  define ARROW_DISALLOW_COPY_AND_ASSIGN(TypeName) \
+    TypeName(const TypeName&) = delete;            \
+    void operator=(const TypeName&) = delete
+#endif
+
+#ifndef ARROW_DEFAULT_MOVE_AND_ASSIGN
+#  define ARROW_DEFAULT_MOVE_AND_ASSIGN(TypeName) \
+    TypeName(TypeName&&) = default;               \
+    TypeName& operator=(TypeName&&) = default
+#endif
+
+// With ARROW_PREDICT_FALSE, GCC and clang can be told that a certain branch is
+// not likely to be taken (for instance, a CHECK failure), and use that information in
+// static analysis. Giving the compiler this information can affect the generated code
+// layout in the absence of better information (i.e. -fprofile-arcs). [1] explains how
+// this feature can be used to improve code generation. It was written as a positive
+// comment to a negative article about the use of these annotations.
+//
+// ARROW_COMPILER_ASSUME allows the compiler to assume that a given expression is
+// true, without evaluating it, and to optimise based on this assumption [2]. If this
+// condition is violated at runtime, the behavior is undefined. This can be useful to
+// generate both faster and smaller code in compute kernels.
+//
+// IMPORTANT: Different optimisers are likely to react differently to this annotation!
+// It should be used with care when we can prove by some means that the assumption
+// is (1) guaranteed to always hold and (2) is useful for optimization [3]. If the
+// assumption is pessimistic, it might even block the compiler from decisions that
+// could lead to better code [4]. If you have a good intuition for what the compiler
+// can do with assumptions [5], you can use this macro to guide it and end up with
+// results you would only get with more complex code transformations.
+// `clang -S -emit-llvm` can be used to check how the generated code changes with
+// your specific use of this macro.
+//
+// [1] https://lobste.rs/s/uwgtkt/don_t_use_likely_unlikely_attributes#c_xi3wmc
+// [2] "Portable assumptions"
+//     https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1774r4.pdf
+// [3] "Assertions Are Pessimistic, Assumptions Are Optimistic"
+//     https://blog.regehr.org/archives/1096
+// [4] https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609
+// [5] J. Doerfert et al. 2019. "Performance Exploration Through Optimistic Static
+//     Program Annotations". https://github.com/jdoerfert/PETOSPA/blob/master/ISC19.pdf
+#define ARROW_UNUSED(x) (void)(x)
+#ifdef ARROW_WARN_DOCUMENTATION
+#  define ARROW_ARG_UNUSED(x) x
+#else
+#  define ARROW_ARG_UNUSED(x)
+#endif
+#if defined(__GNUC__)  // GCC and compatible compilers (clang, Intel ICC)
+#  define ARROW_NORETURN __attribute__((noreturn))
+#  define ARROW_NOINLINE __attribute__((noinline))
+#  define ARROW_FORCE_INLINE __attribute__((always_inline))
+#  define ARROW_PREDICT_FALSE(x) (__builtin_expect(!!(x), 0))
+#  define ARROW_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
+#  define ARROW_RESTRICT __restrict
+#  if defined(__clang__)  // clang-specific
+#    define ARROW_COMPILER_ASSUME(expr) __builtin_assume(expr)
+#  else  // GCC-specific
+#    if __GNUC__ >= 13
+#      define ARROW_COMPILER_ASSUME(expr) __attribute__((assume(expr)))
+#    else
+// GCC does not have a built-in assume intrinsic before GCC 13, so we use an
+// if statement and __builtin_unreachable() to achieve the same effect [2].
+// Unlike clang's __builtin_assume and C++23's [[assume(expr)]], using this
+// on GCC won't warn about side-effects in the expression, so make sure expr
+// is side-effect free when working with GCC versions before 13 (Jan-2024),
+// otherwise clang/MSVC builds will fail in CI.
+#      define ARROW_COMPILER_ASSUME(expr) \
+        if (expr) {                       \
+        } else {                          \
+          __builtin_unreachable();        \
+        }
+#    endif  // __GNUC__ >= 13
+#  endif
+#elif defined(_MSC_VER)  // MSVC
+#  define ARROW_NORETURN __declspec(noreturn)
+#  define ARROW_NOINLINE __declspec(noinline)
+#  define ARROW_FORCE_INLINE __forceinline
+#  define ARROW_PREDICT_FALSE(x) (x)
+#  define ARROW_PREDICT_TRUE(x) (x)
+#  define ARROW_RESTRICT __restrict
+#  define ARROW_COMPILER_ASSUME(expr) __assume(expr)
+#else
+#  define ARROW_NORETURN
+#  define ARROW_NOINLINE
+#  define ARROW_FORCE_INLINE
+#  define ARROW_PREDICT_FALSE(x) (x)
+#  define ARROW_PREDICT_TRUE(x) (x)
+#  define ARROW_RESTRICT
+#  define ARROW_COMPILER_ASSUME(expr)
+#endif
+
+// ----------------------------------------------------------------------
+// C++/CLI support macros (see ARROW-1134)
+
+#ifndef NULLPTR
+
+#  ifdef __cplusplus_cli
+#    define NULLPTR __nullptr
+#  else
+#    define NULLPTR nullptr
+#  endif
+
+#endif  // ifndef NULLPTR
+
+// ----------------------------------------------------------------------
+
+// clang-format off
+// [[deprecated]] is only available in C++14, use this for the time being
+// This macro takes an optional deprecation message
+#ifdef __COVERITY__
+#  define ARROW_DEPRECATED(...)
+#else
+#  define ARROW_DEPRECATED(...) [[deprecated(__VA_ARGS__)]]
+#endif
+
+#ifdef __COVERITY__
+#  define ARROW_DEPRECATED_ENUM_VALUE(...)
+#else
+#  define ARROW_DEPRECATED_ENUM_VALUE(...) [[deprecated(__VA_ARGS__)]]
+#endif
+
+// clang-format on
+
+// Macros to disable deprecation warnings
+
+#ifdef __clang__
+#  define ARROW_SUPPRESS_DEPRECATION_WARNING \
+    _Pragma("clang diagnostic push");        \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING _Pragma("clang diagnostic pop")
+#elif defined(__GNUC__)
+#  define ARROW_SUPPRESS_DEPRECATION_WARNING \
+    _Pragma("GCC diagnostic push");          \
+    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING _Pragma("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+#  define ARROW_SUPPRESS_DEPRECATION_WARNING \
+    __pragma(warning(push)) __pragma(warning(disable : 4996))
+#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING __pragma(warning(pop))
+#else
+#  define ARROW_SUPPRESS_DEPRECATION_WARNING
+#  define ARROW_UNSUPPRESS_DEPRECATION_WARNING
+#endif
+
+// ----------------------------------------------------------------------
+
+// Macros to disable warnings about undeclared global functions
+#if defined(__GNUC__)
+#  define ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING \
+    _Pragma("GCC diagnostic push");                   \
+    _Pragma("GCC diagnostic ignored \"-Wmissing-declarations\"")
+#  define ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING _Pragma("GCC diagnostic pop")
+#else
+#  define ARROW_SUPPRESS_MISSING_DECLARATIONS_WARNING
+#  define ARROW_UNSUPPRESS_MISSING_DECLARATIONS_WARNING
+#endif
+
+// ----------------------------------------------------------------------
+
+// macros to disable padding
+// these macros are portable across different compilers and platforms
+//[https://github.com/google/flatbuffers/blob/master/include/flatbuffers/flatbuffers.h#L1355]
+#if !defined(MANUALLY_ALIGNED_STRUCT)
+#  if defined(_MSC_VER)
+#    define MANUALLY_ALIGNED_STRUCT(alignment) \
+      __pragma(pack(1));                       \
+      struct __declspec(align(alignment))
+#    define STRUCT_END(name, size) \
+      __pragma(pack());            \
+      static_assert(sizeof(name) == size, "compiler breaks packing rules")
+#  elif defined(__GNUC__) || defined(__clang__)
+#    define MANUALLY_ALIGNED_STRUCT(alignment) \
+      _Pragma("pack(1)") struct __attribute__((aligned(alignment)))
+#    define STRUCT_END(name, size)                          \
+      _Pragma("pack()") static_assert(sizeof(name) == size, \
+                                      "compiler breaks packing rules")
+#  else
+#    error Unknown compiler, please define structure alignment macros
+#  endif
+#endif  // !defined(MANUALLY_ALIGNED_STRUCT)
+
+// ----------------------------------------------------------------------
+// Convenience macro disabling a particular UBSan check in a function
+
+#if defined(__clang__)
+#  define ARROW_DISABLE_UBSAN(feature) __attribute__((no_sanitize(feature)))
+#else
+#  define ARROW_DISABLE_UBSAN(feature)
+#endif
+
+// ----------------------------------------------------------------------
+// Machine information
+
+#if INTPTR_MAX == INT64_MAX
+#  define ARROW_BITNESS 64
+#elif INTPTR_MAX == INT32_MAX
+#  define ARROW_BITNESS 32
+#else
+#  error Unexpected INTPTR_MAX
+#endif
+
+// ----------------------------------------------------------------------
+// From googletest
+// (also in parquet-cpp)
+
+// When you need to test the private or protected members of a class,
+// use the FRIEND_TEST macro to declare your tests as friends of the
+// class.  For example:
+//
+// class MyClass {
+//  private:
+//   void MyMethod();
+//   FRIEND_TEST(MyClassTest, MyMethod);
+// };
+//
+// class MyClassTest : public testing::Test {
+//   // ...
+// };
+//
+// TEST_F(MyClassTest, MyMethod) {
+//   // Can call MyClass::MyMethod() here.
+// }
+
+#define FRIEND_TEST(test_case_name, test_name) \
+  friend class test_case_name##_##test_name##_Test
diff --git a/pyarrow/include/arrow/util/math_constants.h b/pyarrow/include/arrow/util/math_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..3524f88e0ba9a5c2f4cd49079c2f3de90e5e9aaa
--- /dev/null
+++ b/pyarrow/include/arrow/util/math_constants.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+
+// Not provided by default in MSVC,
+// and _USE_MATH_DEFINES is not reliable with unity builds
+#ifndef M_PI
+#  define M_PI 3.14159265358979323846
+#endif
+#ifndef M_PI_2
+#  define M_PI_2 1.57079632679489661923
+#endif
+#ifndef M_PI_4
+#  define M_PI_4 0.785398163397448309616
+#endif
diff --git a/pyarrow/include/arrow/util/mutex.h b/pyarrow/include/arrow/util/mutex.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac63cf70cd9ae9c05189f89e2f96c4d216d09573
--- /dev/null
+++ b/pyarrow/include/arrow/util/mutex.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+/// A wrapper around std::mutex since we can't use it directly in
+/// public headers due to C++/CLI.
+/// https://docs.microsoft.com/en-us/cpp/standard-library/mutex#remarks
+class ARROW_EXPORT Mutex {
+ public:
+  Mutex();
+  Mutex(Mutex&&) = default;
+  Mutex& operator=(Mutex&&) = default;
+
+  /// A Guard is falsy if a lock could not be acquired.
+  class ARROW_EXPORT Guard {
+   public:
+    Guard() : locked_(NULLPTR, [](Mutex* mutex) {}) {}
+    Guard(Guard&&) = default;
+    Guard& operator=(Guard&&) = default;
+
+    explicit operator bool() const { return bool(locked_); }
+
+    void Unlock() { locked_.reset(); }
+
+   private:
+    explicit Guard(Mutex* locked);
+
+    std::unique_ptr<Mutex, void (*)(Mutex*)> locked_;
+    friend Mutex;
+  };
+
+  Guard TryLock();
+  Guard Lock();
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl, void (*)(Impl*)> impl_;
+};
+
+#ifndef _WIN32
+/// Return a pointer to a process-wide, process-specific Mutex that can be used
+/// at any point in a child process.  NULL is returned when called in the parent.
+///
+/// The rule is to first check that getpid() corresponds to the parent process pid
+/// and, if not, call this function to lock any after-fork reinitialization code.
+/// Like this:
+///
+///   std::atomic<pid_t> pid{getpid()};
+///   ...
+///   if (pid.load() != getpid()) {
+///     // In child process
+///     auto lock = GlobalForkSafeMutex()->Lock();
+///     if (pid.load() != getpid()) {
+///       // Reinitialize internal structures after fork
+///       ...
+///       pid.store(getpid());
+ARROW_EXPORT
+Mutex* GlobalForkSafeMutex();
+#endif
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/parallel.h b/pyarrow/include/arrow/util/parallel.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae48a606e366f914ac92e18fad3e71e87aea2b96
--- /dev/null
+++ b/pyarrow/include/arrow/util/parallel.h
@@ -0,0 +1,104 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+#include <vector>
+
+#include "arrow/status.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/thread_pool.h"
+#include "arrow/util/vector.h"
+
+namespace arrow {
+namespace internal {
+
+// A parallelizer that takes a `Status(int)` function and calls it with
+// arguments between 0 and `num_tasks - 1`, on an arbitrary number of threads.
+
+template <class FUNCTION>
+Status ParallelFor(int num_tasks, FUNCTION&& func,
+                   Executor* executor = internal::GetCpuThreadPool()) {
+  std::vector<Future<>> futures(num_tasks);
+
+  for (int i = 0; i < num_tasks; ++i) {
+    ARROW_ASSIGN_OR_RAISE(futures[i], executor->Submit(func, i));
+  }
+  auto st = Status::OK();
+  for (auto& fut : futures) {
+    st &= fut.status();
+  }
+  return st;
+}
+
+template <class FUNCTION, typename T,
+          typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> ParallelForAsync(std::vector<T> inputs, FUNCTION&& func,
+                                        Executor* executor = internal::GetCpuThreadPool(),
+                                        TaskHints hints = TaskHints{}) {
+  std::vector<Future<R>> futures(inputs.size());
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    ARROW_ASSIGN_OR_RAISE(futures[i],
+                          executor->Submit(hints, func, i, std::move(inputs[i])));
+  }
+  return All(std::move(futures))
+      .Then([](const std::vector<Result<R>>& results) -> Result<std::vector<R>> {
+        return UnwrapOrRaise(results);
+      });
+}
+
+// A parallelizer that takes a `Status(int)` function and calls it with
+// arguments between 0 and `num_tasks - 1`, in sequence or in parallel,
+// depending on the input boolean.
+
+template <class FUNCTION>
+Status OptionalParallelFor(bool use_threads, int num_tasks, FUNCTION&& func,
+                           Executor* executor = internal::GetCpuThreadPool()) {
+  if (use_threads) {
+    return ParallelFor(num_tasks, std::forward<FUNCTION>(func), executor);
+  } else {
+    for (int i = 0; i < num_tasks; ++i) {
+      RETURN_NOT_OK(func(i));
+    }
+    return Status::OK();
+  }
+}
+
+// A parallelizer that takes a `Result<R>(int index, T item)` function and
+// calls it with each item from the input array, in sequence or in parallel,
+// depending on the input boolean.
+
+template <class FUNCTION, typename T,
+          typename R = typename internal::call_traits::return_type<FUNCTION>::ValueType>
+Future<std::vector<R>> OptionalParallelForAsync(
+    bool use_threads, std::vector<T> inputs, FUNCTION&& func,
+    Executor* executor = internal::GetCpuThreadPool(), TaskHints hints = TaskHints{}) {
+  if (use_threads) {
+    return ParallelForAsync(std::move(inputs), std::forward<FUNCTION>(func), executor,
+                            hints);
+  } else {
+    std::vector<R> result(inputs.size());
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      ARROW_ASSIGN_OR_RAISE(result[i], func(i, inputs[i]));
+    }
+    return result;
+  }
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/pcg_random.h b/pyarrow/include/arrow/util/pcg_random.h
new file mode 100644
index 0000000000000000000000000000000000000000..768f2328200fb2635213358226cfdb3f9273c808
--- /dev/null
+++ b/pyarrow/include/arrow/util/pcg_random.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/pcg/pcg_random.hpp"  // IWYU pragma: export
+
+namespace arrow {
+namespace random {
+
+using pcg32 = ::arrow_vendored::pcg32;
+using pcg64 = ::arrow_vendored::pcg64;
+using pcg32_fast = ::arrow_vendored::pcg32_fast;
+using pcg64_fast = ::arrow_vendored::pcg64_fast;
+using pcg32_oneseq = ::arrow_vendored::pcg32_oneseq;
+using pcg64_oneseq = ::arrow_vendored::pcg64_oneseq;
+
+}  // namespace random
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/prefetch.h b/pyarrow/include/arrow/util/prefetch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e9b5ae670ca173edb6448d6575fd5a946aaf4c9
--- /dev/null
+++ b/pyarrow/include/arrow/util/prefetch.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(__GNUC__)  // GCC and compatible compilers (clang, Intel ICC)
+#  define ARROW_PREFETCH(addr) __builtin_prefetch(addr)
+#elif defined(_MSC_VER)  // MSVC
+#  if defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_RUNTIME_SSE4_2)
+#    include <nmmintrin.h>
+#    define ARROW_PREFETCH(addr) _mm_prefetch((const char*)(addr), _MM_HINT_T0)
+#  else
+#    define ARROW_PREFETCH(addr)
+#  endif
+#else
+#  define ARROW_PREFETCH(addr)
+#endif
diff --git a/pyarrow/include/arrow/util/queue.h b/pyarrow/include/arrow/util/queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c71fa6e155e8818801db2ccb18127d75d6364a8
--- /dev/null
+++ b/pyarrow/include/arrow/util/queue.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/ProducerConsumerQueue.h"
+
+namespace arrow {
+namespace util {
+
+template <typename T>
+using SpscQueue = arrow_vendored::folly::ProducerConsumerQueue<T>;
+
+}
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/range.h b/pyarrow/include/arrow/util/range.h
new file mode 100644
index 0000000000000000000000000000000000000000..449a1fbd80cef3d4fc3937ea2d8479c12411bda7
--- /dev/null
+++ b/pyarrow/include/arrow/util/range.h
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <numeric>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+namespace arrow::internal {
+
+/// Create a vector containing the values from start with length elements
+template <typename T>
+std::vector<T> Iota(T start, size_t length) {
+  std::vector<T> result(length);
+  std::iota(result.begin(), result.end(), start);
+  return result;
+}
+
+/// Create a vector containing the values from start up to stop
+template <typename T>
+std::vector<T> Iota(T start, T stop) {
+  if (start > stop) {
+    return {};
+  }
+  return Iota<T>(start, static_cast<size_t>(stop - start));
+}
+
+/// Create a vector containing the values from 0 up to length
+template <typename T>
+std::vector<T> Iota(T length) {
+  return Iota(static_cast<T>(0), length);
+}
+
+/// Create a range from a callable which takes a single index parameter
+/// and returns the value of iterator on each call and a length.
+/// Only iterators obtained from the same range should be compared, the
+/// behaviour generally similar to other STL containers.
+template <typename Generator>
+class LazyRange {
+ private:
+  // callable which generates the values
+  // has to be defined at the beginning of the class for type deduction
+  const Generator gen_;
+  // the length of the range
+  int64_t length_;
+#ifdef _MSC_VER
+  // workaround to VS2010 not supporting decltype properly
+  // see https://stackoverflow.com/questions/21782846/decltype-for-class-member-function
+  static Generator gen_static_;
+#endif
+
+ public:
+#ifdef _MSC_VER
+  using return_type = decltype(gen_static_(0));
+#else
+  using return_type = decltype(gen_(0));
+#endif
+
+  /// Construct a new range from a callable and length
+  LazyRange(Generator gen, int64_t length) : gen_(gen), length_(length) {}
+
+  // Class of the dependent iterator, created implicitly by begin and end
+  class RangeIter {
+   public:
+    using difference_type = int64_t;
+    using value_type = return_type;
+    using reference = const value_type&;
+    using pointer = const value_type*;
+    using iterator_category = std::forward_iterator_tag;
+
+#ifdef _MSC_VER
+    // msvc complains about unchecked iterators,
+    // see https://stackoverflow.com/questions/21655496/error-c4996-checked-iterators
+    using _Unchecked_type = typename LazyRange<Generator>::RangeIter;
+#endif
+
+    RangeIter() = delete;
+    RangeIter(const RangeIter& other) = default;
+    RangeIter& operator=(const RangeIter& other) = default;
+
+    RangeIter(const LazyRange<Generator>& range, int64_t index)
+        : range_(&range), index_(index) {}
+
+    const return_type operator*() const { return range_->gen_(index_); }
+
+    RangeIter operator+(difference_type length) const {
+      return RangeIter(*range_, index_ + length);
+    }
+
+    // pre-increment
+    RangeIter& operator++() {
+      ++index_;
+      return *this;
+    }
+
+    // post-increment
+    RangeIter operator++(int) {
+      auto copy = RangeIter(*this);
+      ++index_;
+      return copy;
+    }
+
+    bool operator==(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ == other.index_ && this->range_ == other.range_;
+    }
+
+    bool operator!=(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ != other.index_ || this->range_ != other.range_;
+    }
+
+    int64_t operator-(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ - other.index_;
+    }
+
+    bool operator<(const typename LazyRange<Generator>::RangeIter& other) const {
+      return this->index_ < other.index_;
+    }
+
+   private:
+    // parent range reference
+    const LazyRange* range_;
+    // current index
+    int64_t index_;
+  };
+
+  friend class RangeIter;
+
+  // Create a new begin const iterator
+  RangeIter begin() { return RangeIter(*this, 0); }
+
+  // Create a new end const iterator
+  RangeIter end() { return RangeIter(*this, length_); }
+};
+
+/// Helper function to create a lazy range from a callable (e.g. lambda) and length
+template <typename Generator>
+LazyRange<Generator> MakeLazyRange(Generator&& gen, int64_t length) {
+  return LazyRange<Generator>(std::forward<Generator>(gen), length);
+}
+
+/// \brief A helper for iterating multiple ranges simultaneously, similar to C++23's
+/// zip() view adapter modelled after python's built-in zip() function.
+///
+/// \code {.cpp}
+/// const std::vector<SomeTable>& tables = ...
+/// std::function<std::vector<std::string>()> GetNames = ...
+/// for (auto [table, name] : Zip(tables, GetNames())) {
+///   static_assert(std::is_same_v<decltype(table), const SomeTable&>);
+///   static_assert(std::is_same_v<decltype(name), std::string&>);
+///   // temporaries (like this vector of strings) are kept alive for the
+///   // duration of a loop and are safely movable).
+///   RegisterTableWithName(std::move(name), &table);
+/// }
+/// \endcode
+///
+/// The zipped sequence ends as soon as any of its member ranges ends.
+///
+/// Always use `auto` for the loop's declaration; it will always be a tuple
+/// of references so for example using `const auto&` will compile but will
+/// *look* like forcing const-ness even though the members of the tuple are
+/// still mutable references.
+///
+/// NOTE: we *could* make Zip a more full fledged range and enable things like
+/// - gtest recognizing it as a container; it currently doesn't since Zip is
+///   always mutable so this breaks:
+///       EXPECT_THAT(Zip(std::vector{0}, std::vector{1}),
+///                   ElementsAre(std::tuple{0, 1}));
+/// - letting it be random access when possible so we can do things like *sort*
+///   parallel ranges
+/// - ...
+///
+/// However doing this will increase the compile time overhead of using Zip as
+/// long as we're still using headers. Therefore until we can use c++20 modules:
+/// *don't* extend Zip.
+template <typename Ranges, typename Indices>
+struct Zip;
+
+template <typename... Ranges>
+Zip(Ranges&&...) -> Zip<std::tuple<Ranges...>, std::index_sequence_for<Ranges...>>;
+
+template <typename... Ranges, size_t... I>
+struct Zip<std::tuple<Ranges...>, std::index_sequence<I...>> {
+  explicit Zip(Ranges... ranges) : ranges_(std::forward<Ranges>(ranges)...) {}
+
+  std::tuple<Ranges...> ranges_;
+
+  using sentinel = std::tuple<decltype(std::end(std::get<I>(ranges_)))...>;
+  constexpr sentinel end() { return {std::end(std::get<I>(ranges_))...}; }
+
+  struct iterator : std::tuple<decltype(std::begin(std::get<I>(ranges_)))...> {
+    using std::tuple<decltype(std::begin(std::get<I>(ranges_)))...>::tuple;
+
+    constexpr auto operator*() {
+      return std::tuple<decltype(*std::get<I>(*this))...>{*std::get<I>(*this)...};
+    }
+
+    constexpr iterator& operator++() {
+      (++std::get<I>(*this), ...);
+      return *this;
+    }
+
+    constexpr bool operator!=(const sentinel& s) const {
+      bool all_iterators_valid = (... && (std::get<I>(*this) != std::get<I>(s)));
+      return all_iterators_valid;
+    }
+  };
+  constexpr iterator begin() { return {std::begin(std::get<I>(ranges_))...}; }
+};
+
+/// \brief A lazy sequence of integers which starts from 0 and never stops.
+///
+/// This can be used in conjunction with Zip() to emulate python's built-in
+/// enumerate() function:
+///
+/// \code {.cpp}
+/// const std::vector<SomeTable>& tables = ...
+/// for (auto [i, table] : Zip(Enumerate<>, tables)) {
+///   std::cout << "#" << i << ": " << table.name() << std::endl;
+/// }
+/// \endcode
+template <typename I = size_t>
+constexpr auto Enumerate = [] {
+  using Int = I;
+  struct {
+    struct sentinel {};
+    constexpr sentinel end() const { return {}; }
+
+    struct iterator {
+      Int value{0};
+
+      constexpr Int operator*() { return value; }
+
+      constexpr iterator& operator++() {
+        ++value;
+        return *this;
+      }
+
+      constexpr std::true_type operator!=(sentinel) const { return {}; }
+    };
+    constexpr iterator begin() const { return {}; }
+  } out;
+
+  return out;
+}();
+
+}  // namespace arrow::internal
diff --git a/pyarrow/include/arrow/util/ree_util.h b/pyarrow/include/arrow/util/ree_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c759f2e80dc5e27559e06ea33341a0a765fe244
--- /dev/null
+++ b/pyarrow/include/arrow/util/ree_util.h
@@ -0,0 +1,584 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+#include "arrow/array/data.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace ree_util {
+
+/// \brief Get the child array holding the run ends from an REE array
+inline const ArraySpan& RunEndsArray(const ArraySpan& span) { return span.child_data[0]; }
+
+/// \brief Get the child array holding the data values from an REE array
+inline const ArraySpan& ValuesArray(const ArraySpan& span) { return span.child_data[1]; }
+
+/// \brief Get a pointer to run ends values of an REE array
+template <typename RunEndCType>
+const RunEndCType* RunEnds(const ArraySpan& span) {
+  assert(RunEndsArray(span).type->id() == CTypeTraits<RunEndCType>::ArrowType::type_id);
+  return RunEndsArray(span).GetValues<RunEndCType>(1);
+}
+
+/// \brief Perform basic validations on the parameters of an REE array
+/// and its two children arrays
+///
+/// All the checks complete in O(1) time. Consequently, this function:
+/// - DOES NOT check that run_ends is sorted and all-positive
+/// - DOES NOT check the actual contents of the run_ends and values arrays
+Status ValidateRunEndEncodedChildren(const RunEndEncodedType& type,
+                                     int64_t logical_length,
+                                     const std::shared_ptr<ArrayData>& run_ends_data,
+                                     const std::shared_ptr<ArrayData>& values_data,
+                                     int64_t null_count, int64_t logical_offset);
+
+/// \brief Compute the logical null count of an REE array
+int64_t LogicalNullCount(const ArraySpan& span);
+
+namespace internal {
+
+/// \brief Uses binary-search to find the physical offset given a logical offset
+/// and run-end values
+///
+/// \return the physical offset or run_ends_size if the physical offset is not
+/// found in run_ends
+template <typename RunEndCType>
+int64_t FindPhysicalIndex(const RunEndCType* run_ends, int64_t run_ends_size, int64_t i,
+                          int64_t absolute_offset) {
+  assert(absolute_offset + i >= 0);
+  auto it = std::upper_bound(run_ends, run_ends + run_ends_size, absolute_offset + i);
+  int64_t result = std::distance(run_ends, it);
+  assert(result <= run_ends_size);
+  return result;
+}
+
+/// \brief Uses binary-search to calculate the range of physical values (and
+/// run-ends) necessary to represent the logical range of values from
+/// offset to length
+///
+/// \return a pair of physical offset and physical length
+template <typename RunEndCType>
+std::pair<int64_t, int64_t> FindPhysicalRange(const RunEndCType* run_ends,
+                                              int64_t run_ends_size, int64_t length,
+                                              int64_t offset) {
+  const int64_t physical_offset =
+      FindPhysicalIndex<RunEndCType>(run_ends, run_ends_size, 0, offset);
+  // The physical length is calculated by finding the offset of the last element
+  // and adding 1 to it, so first we ensure there is at least one element.
+  if (length == 0) {
+    return {physical_offset, 0};
+  }
+  const int64_t physical_index_of_last = FindPhysicalIndex<RunEndCType>(
+      run_ends + physical_offset, run_ends_size - physical_offset, length - 1, offset);
+
+  assert(physical_index_of_last < run_ends_size - physical_offset);
+  return {physical_offset, physical_index_of_last + 1};
+}
+
+/// \brief Uses binary-search to calculate the number of physical values (and
+/// run-ends) necessary to represent the logical range of values from
+/// offset to length
+template <typename RunEndCType>
+int64_t FindPhysicalLength(const RunEndCType* run_ends, int64_t run_ends_size,
+                           int64_t length, int64_t offset) {
+  auto [_, physical_length] =
+      FindPhysicalRange<RunEndCType>(run_ends, run_ends_size, length, offset);
+  // GH-37107: This is a workaround for GCC 7. GCC 7 doesn't ignore
+  // variables in structured binding automatically from unused
+  // variables when one of these variables are used.
+  // See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81767
+  ARROW_UNUSED(_);
+  return physical_length;
+}
+
+/// \brief Find the physical index into the values array of the REE ArraySpan
+///
+/// This function uses binary-search, so it has a O(log N) cost.
+template <typename RunEndCType>
+int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) {
+  const int64_t run_ends_size = RunEndsArray(span).length;
+  return FindPhysicalIndex(RunEnds<RunEndCType>(span), run_ends_size, i, absolute_offset);
+}
+
+/// \brief Find the physical length of an REE ArraySpan
+///
+/// The physical length of an REE is the number of physical values (and
+/// run-ends) necessary to represent the logical range of values from
+/// offset to length.
+///
+/// Avoid calling this function if the physical length can be established in
+/// some other way (e.g. when iterating over the runs sequentially until the
+/// end). This function uses binary-search, so it has a O(log N) cost.
+template <typename RunEndCType>
+int64_t FindPhysicalLength(const ArraySpan& span) {
+  return FindPhysicalLength(
+      /*run_ends=*/RunEnds<RunEndCType>(span),
+      /*run_ends_size=*/RunEndsArray(span).length,
+      /*length=*/span.length,
+      /*offset=*/span.offset);
+}
+
+template <typename RunEndCType>
+struct PhysicalIndexFinder;
+
+// non-inline implementations for each run-end type
+ARROW_EXPORT int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder<int16_t>& self,
+                                             int64_t i);
+ARROW_EXPORT int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder<int32_t>& self,
+                                             int64_t i);
+ARROW_EXPORT int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder<int64_t>& self,
+                                             int64_t i);
+
+/// \brief Stateful version of FindPhysicalIndex() that caches the result of
+/// the previous search and uses it to optimize the next search.
+///
+/// When new queries for the physical index of a logical index come in,
+/// binary search is performed again but the first candidate checked is the
+/// result of the previous search (cached physical index) instead of the
+/// midpoint of the run-ends array.
+///
+/// If that test fails, internal::FindPhysicalIndex() is called with one of the
+/// partitions defined by the cached index. If the queried logical indices
+/// follow an increasing or decreasing pattern, this first test is much more
+/// effective in (1) finding the answer right away (close logical indices belong
+/// to the same runs) or (2) discarding many more candidates than probing
+/// the midpoint would.
+///
+/// The most adversarial case (i.e. alternating between 0 and length-1 queries)
+/// only adds one extra binary search probe when compared to always starting
+/// binary search from the midpoint without any of these optimizations.
+///
+/// \tparam RunEndCType The numeric type of the run-ends array.
+template <typename RunEndCType>
+struct PhysicalIndexFinder {
+  const ArraySpan array_span;
+  const RunEndCType* run_ends;
+  int64_t last_physical_index = 0;
+
+  explicit PhysicalIndexFinder(const ArrayData& data)
+      : array_span(data),
+        run_ends(RunEndsArray(array_span).template GetValues<RunEndCType>(1)) {
+    assert(CTypeTraits<RunEndCType>::ArrowType::type_id ==
+           ::arrow::internal::checked_cast<const RunEndEncodedType&>(*data.type)
+               .run_end_type()
+               ->id());
+  }
+
+  /// \brief Find the physical index into the values array of the REE array.
+  ///
+  /// \pre 0 <= i < array_span.length()
+  /// \param i the logical index into the REE array
+  /// \return the physical index into the values array
+  int64_t FindPhysicalIndex(int64_t i) {
+    if constexpr (std::is_same_v<RunEndCType, int16_t>) {
+      return FindPhysicalIndexImpl16(*this, i);
+    } else if constexpr (std::is_same_v<RunEndCType, int32_t>) {
+      return FindPhysicalIndexImpl32(*this, i);
+    } else {
+      static_assert(std::is_same_v<RunEndCType, int64_t>, "Unsupported RunEndCType.");
+      return FindPhysicalIndexImpl64(*this, i);
+    }
+  }
+};
+
+}  // namespace internal
+
+/// \brief Find the physical index into the values array of the REE ArraySpan
+///
+/// This function uses binary-search, so it has a O(log N) cost.
+ARROW_EXPORT int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i,
+                                       int64_t absolute_offset);
+
+/// \brief Find the physical length of an REE ArraySpan
+///
+/// The physical length of an REE is the number of physical values (and
+/// run-ends) necessary to represent the logical range of values from
+/// offset to length.
+///
+/// Avoid calling this function if the physical length can be established in
+/// some other way (e.g. when iterating over the runs sequentially until the
+/// end). This function uses binary-search, so it has a O(log N) cost.
+ARROW_EXPORT int64_t FindPhysicalLength(const ArraySpan& span);
+
+/// \brief Find the physical range of physical values referenced by the REE in
+/// the logical range from offset to offset + length
+///
+/// \return a pair of physical offset and physical length
+ARROW_EXPORT std::pair<int64_t, int64_t> FindPhysicalRange(const ArraySpan& span,
+                                                           int64_t offset,
+                                                           int64_t length);
+
+// Publish PhysicalIndexFinder outside of the internal namespace.
+template <typename RunEndCType>
+using PhysicalIndexFinder = internal::PhysicalIndexFinder<RunEndCType>;
+
+template <typename RunEndCType>
+class RunEndEncodedArraySpan {
+ private:
+  struct PrivateTag {};
+
+ public:
+  /// \brief Iterator representing the current run during iteration over a
+  /// run-end encoded array
+  class Iterator {
+   public:
+    Iterator(PrivateTag, const RunEndEncodedArraySpan& span, int64_t logical_pos,
+             int64_t physical_pos)
+        : span(span), logical_pos_(logical_pos), physical_pos_(physical_pos) {}
+
+    /// \brief Return the physical index of the run
+    ///
+    /// The values array can be addressed with this index to get the value
+    /// that makes up the run.
+    ///
+    /// NOTE: if this Iterator is equal to RunEndEncodedArraySpan::end(),
+    /// the value returned is undefined.
+    int64_t index_into_array() const { return physical_pos_; }
+
+    /// \brief Return the initial logical position of the run
+    ///
+    /// If this Iterator is equal to RunEndEncodedArraySpan::end(), this is
+    /// the same as RunEndEncodedArraySpan::length().
+    int64_t logical_position() const { return logical_pos_; }
+
+    /// \brief Return the logical position immediately after the run.
+    ///
+    /// Pre-condition: *this != RunEndEncodedArraySpan::end()
+    int64_t run_end() const { return span.run_end(physical_pos_); }
+
+    /// \brief Returns the logical length of the run.
+    ///
+    /// Pre-condition: *this != RunEndEncodedArraySpan::end()
+    int64_t run_length() const { return run_end() - logical_pos_; }
+
+    /// \brief Check if the iterator is at the end of the array.
+    ///
+    /// This can be used to avoid paying the cost of a call to
+    /// RunEndEncodedArraySpan::end().
+    ///
+    /// \return true if the iterator is at the end of the array
+    bool is_end(const RunEndEncodedArraySpan& span) const {
+      return logical_pos_ >= span.length();
+    }
+
+    Iterator& operator++() {
+      logical_pos_ = span.run_end(physical_pos_);
+      physical_pos_ += 1;
+      return *this;
+    }
+
+    Iterator operator++(int) {
+      const Iterator prev = *this;
+      ++(*this);
+      return prev;
+    }
+
+    Iterator& operator--() {
+      physical_pos_ -= 1;
+      logical_pos_ = (physical_pos_ > 0) ? span.run_end(physical_pos_ - 1) : 0;
+      return *this;
+    }
+
+    Iterator operator--(int) {
+      const Iterator prev = *this;
+      --(*this);
+      return prev;
+    }
+
+    bool operator==(const Iterator& other) const {
+      return logical_pos_ == other.logical_pos_;
+    }
+
+    bool operator!=(const Iterator& other) const {
+      return logical_pos_ != other.logical_pos_;
+    }
+
+   public:
+    const RunEndEncodedArraySpan& span;
+
+   private:
+    int64_t logical_pos_;
+    int64_t physical_pos_;
+  };
+
+  // Prevent implicit ArrayData -> ArraySpan conversion in
+  // RunEndEncodedArraySpan instantiation.
+  explicit RunEndEncodedArraySpan(const ArrayData& data) = delete;
+
+  /// \brief Construct a RunEndEncodedArraySpan from an ArraySpan and new
+  /// absolute offset and length.
+  ///
+  /// RunEndEncodedArraySpan{span, off, len} is equivalent to:
+  ///
+  ///   span.SetSlice(off, len);
+  ///   RunEndEncodedArraySpan{span}
+  ///
+  /// ArraySpan::SetSlice() updates the null_count to kUnknownNullCount, but
+  /// we don't need that here as REE arrays have null_count set to 0 by
+  /// convention.
+  explicit RunEndEncodedArraySpan(const ArraySpan& array_span, int64_t offset,
+                                  int64_t length)
+      : array_span_{array_span},
+        run_ends_(RunEnds<RunEndCType>(array_span_)),
+        length_(length),
+        offset_(offset) {
+    assert(array_span_.type->id() == Type::RUN_END_ENCODED);
+  }
+
+  explicit RunEndEncodedArraySpan(const ArraySpan& array_span)
+      : RunEndEncodedArraySpan(array_span, array_span.offset, array_span.length) {}
+
+  int64_t offset() const { return offset_; }
+  int64_t length() const { return length_; }
+
+  int64_t PhysicalIndex(int64_t logical_pos) const {
+    return internal::FindPhysicalIndex(run_ends_, RunEndsArray(array_span_).length,
+                                       logical_pos, offset_);
+  }
+
+  /// \brief Create an iterator from a logical position and its
+  /// pre-computed physical offset into the run ends array
+  ///
+  /// \param logical_pos is an index in the [0, length()] range
+  /// \param physical_offset the pre-calculated PhysicalIndex(logical_pos)
+  Iterator iterator(int64_t logical_pos, int64_t physical_offset) const {
+    return Iterator{PrivateTag{}, *this, logical_pos, physical_offset};
+  }
+
+  /// \brief Create an iterator from a logical position
+  ///
+  /// \param logical_pos is an index in the [0, length()] range
+  Iterator iterator(int64_t logical_pos) const {
+    if (logical_pos < length()) {
+      return iterator(logical_pos, PhysicalIndex(logical_pos));
+    }
+    // If logical_pos is above the valid range, use length() as the logical
+    // position and calculate the physical address right after the last valid
+    // physical position. Which is the physical index of the last logical
+    // position, plus 1.
+    return (length() == 0) ? iterator(0, PhysicalIndex(0))
+                           : iterator(length(), PhysicalIndex(length() - 1) + 1);
+  }
+
+  /// \brief Create an iterator representing the logical begin of the run-end
+  /// encoded array
+  Iterator begin() const { return iterator(0, PhysicalIndex(0)); }
+
+  /// \brief Create an iterator representing the first invalid logical position
+  /// of the run-end encoded array
+  ///
+  /// \warning Avoid calling end() in a loop, as it will recompute the physical
+  /// length of the array on each call (O(log N) cost per call).
+  ///
+  /// You can write your loops like this instead:
+  ///
+  /// \code
+  /// for (auto it = array.begin(), end = array.end(); it != end; ++it) {
+  ///   // ...
+  /// }
+  /// \endcode
+  ///
+  /// Or this version that does not look like idiomatic C++, but removes
+  /// the need for calling end() completely:
+  ///
+  /// \code
+  /// for (auto it = array.begin(); !it.is_end(array); ++it) {
+  ///   // ...
+  /// }
+  /// \endcode
+  Iterator end() const {
+    return iterator(length(),
+                    (length() == 0) ? PhysicalIndex(0) : PhysicalIndex(length() - 1) + 1);
+  }
+
+  // Pre-condition: physical_pos < RunEndsArray(array_span_).length);
+  inline int64_t run_end(int64_t physical_pos) const {
+    assert(physical_pos < RunEndsArray(array_span_).length);
+    // Logical index of the end of the run at physical_pos with offset applied
+    const int64_t logical_run_end =
+        std::max<int64_t>(static_cast<int64_t>(run_ends_[physical_pos]) - offset(), 0);
+    // The current run may go further than the logical length, cap it
+    return std::min(logical_run_end, length());
+  }
+
+ private:
+  const ArraySpan& array_span_;
+  const RunEndCType* run_ends_;
+  const int64_t length_;
+  const int64_t offset_;
+};
+
+/// \brief Iterate over two run-end encoded arrays in runs or sub-runs that are
+/// inside run boundaries on both inputs
+///
+/// Both RunEndEncodedArraySpan should have the same logical length. Instances
+/// of this iterator only hold references to the RunEndEncodedArraySpan inputs.
+template <typename Left, typename Right>
+class MergedRunsIterator {
+ private:
+  using LeftIterator = typename Left::Iterator;
+  using RightIterator = typename Right::Iterator;
+
+  MergedRunsIterator(LeftIterator left_it, RightIterator right_it,
+                     int64_t common_logical_length, int64_t common_logical_pos)
+      : ree_iterators_{std::move(left_it), std::move(right_it)},
+        logical_length_(common_logical_length),
+        logical_pos_(common_logical_pos) {}
+
+ public:
+  /// \brief Construct a MergedRunsIterator positioned at logical position 0.
+  ///
+  /// Pre-condition: left.length() == right.length()
+  MergedRunsIterator(const Left& left, const Right& right)
+      : MergedRunsIterator(left.begin(), right.begin(), left.length(), 0) {
+    assert(left.length() == right.length());
+  }
+
+  static Result<MergedRunsIterator> MakeBegin(const Left& left, const Right& right) {
+    if (left.length() != right.length()) {
+      return Status::Invalid(
+          "MergedRunsIterator expects RunEndEncodedArraySpans of the same length");
+    }
+    return MergedRunsIterator(left, right);
+  }
+
+  static Result<MergedRunsIterator> MakeEnd(const Left& left, const Right& right) {
+    if (left.length() != right.length()) {
+      return Status::Invalid(
+          "MergedRunsIterator expects RunEndEncodedArraySpans of the same length");
+    }
+    return MergedRunsIterator(left.end(), right.end(), left.length(), left.length());
+  }
+
+  /// \brief Return the left RunEndEncodedArraySpan child
+  const Left& left() const { return std::get<0>(ree_iterators_).span; }
+
+  /// \brief Return the right RunEndEncodedArraySpan child
+  const Right& right() const { return std::get<1>(ree_iterators_).span; }
+
+  /// \brief Return the initial logical position of the run
+  ///
+  /// If is_end(), this is the same as length().
+  int64_t logical_position() const { return logical_pos_; }
+
+  /// \brief Whether the iterator is at logical position 0.
+  bool is_begin() const { return logical_pos_ == 0; }
+
+  /// \brief Whether the iterator has reached the end of both arrays
+  bool is_end() const { return logical_pos_ == logical_length_; }
+
+  /// \brief Return the logical position immediately after the run.
+  ///
+  /// Pre-condition: !is_end()
+  int64_t run_end() const {
+    const auto& left_it = std::get<0>(ree_iterators_);
+    const auto& right_it = std::get<1>(ree_iterators_);
+    return std::min(left_it.run_end(), right_it.run_end());
+  }
+
+  /// \brief returns the logical length of the current run
+  ///
+  /// Pre-condition: !is_end()
+  int64_t run_length() const { return run_end() - logical_pos_; }
+
+  /// \brief Return a physical index into the values array of a given input,
+  /// pointing to the value of the current run
+  template <size_t input_id>
+  int64_t index_into_array() const {
+    return std::get<input_id>(ree_iterators_).index_into_array();
+  }
+
+  int64_t index_into_left_array() const { return index_into_array<0>(); }
+  int64_t index_into_right_array() const { return index_into_array<1>(); }
+
+  MergedRunsIterator& operator++() {
+    auto& left_it = std::get<0>(ree_iterators_);
+    auto& right_it = std::get<1>(ree_iterators_);
+
+    const int64_t left_run_end = left_it.run_end();
+    const int64_t right_run_end = right_it.run_end();
+
+    if (left_run_end < right_run_end) {
+      logical_pos_ = left_run_end;
+      ++left_it;
+    } else if (left_run_end > right_run_end) {
+      logical_pos_ = right_run_end;
+      ++right_it;
+    } else {
+      logical_pos_ = left_run_end;
+      ++left_it;
+      ++right_it;
+    }
+    return *this;
+  }
+
+  MergedRunsIterator operator++(int) {
+    MergedRunsIterator prev = *this;
+    ++(*this);
+    return prev;
+  }
+
+  MergedRunsIterator& operator--() {
+    auto& left_it = std::get<0>(ree_iterators_);
+    auto& right_it = std::get<1>(ree_iterators_);
+
+    // The logical position of each iterator is the run_end() of the previous run.
+    const int64_t left_logical_pos = left_it.logical_position();
+    const int64_t right_logical_pos = right_it.logical_position();
+
+    if (left_logical_pos < right_logical_pos) {
+      --right_it;
+      logical_pos_ = std::max(left_logical_pos, right_it.logical_position());
+    } else if (left_logical_pos > right_logical_pos) {
+      --left_it;
+      logical_pos_ = std::max(left_it.logical_position(), right_logical_pos);
+    } else {
+      --left_it;
+      --right_it;
+      logical_pos_ = std::max(left_it.logical_position(), right_it.logical_position());
+    }
+    return *this;
+  }
+
+  MergedRunsIterator operator--(int) {
+    MergedRunsIterator prev = *this;
+    --(*this);
+    return prev;
+  }
+
+  bool operator==(const MergedRunsIterator& other) const {
+    return logical_pos_ == other.logical_position();
+  }
+
+  bool operator!=(const MergedRunsIterator& other) const { return !(*this == other); }
+
+ private:
+  std::tuple<LeftIterator, RightIterator> ree_iterators_;
+  const int64_t logical_length_;
+  int64_t logical_pos_;
+};
+
+}  // namespace ree_util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/regex.h b/pyarrow/include/arrow/util/regex.h
new file mode 100644
index 0000000000000000000000000000000000000000..590fbac7153889129e7bca7652125980cb4457cd
--- /dev/null
+++ b/pyarrow/include/arrow/util/regex.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <initializer_list>
+#include <regex>
+#include <string_view>
+#include <type_traits>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// Match regex against target and produce string_views out of matches.
+inline bool RegexMatch(const std::regex& regex, std::string_view target,
+                       std::initializer_list<std::string_view*> out_matches) {
+  assert(regex.mark_count() == out_matches.size());
+
+  std::match_results<decltype(target.begin())> match;
+  if (!std::regex_match(target.begin(), target.end(), match, regex)) {
+    return false;
+  }
+
+  // Match #0 is the whole matched sequence
+  assert(regex.mark_count() + 1 == match.size());
+  auto out_it = out_matches.begin();
+  for (size_t i = 1; i < match.size(); ++i) {
+    **out_it++ = target.substr(match.position(i), match.length(i));
+  }
+  return true;
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/rows_to_batches.h b/pyarrow/include/arrow/util/rows_to_batches.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ad254df200efc08c5c9a4956e0e781b496b2b07
--- /dev/null
+++ b/pyarrow/include/arrow/util/rows_to_batches.h
@@ -0,0 +1,163 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/table_builder.h"
+#include "arrow/util/iterator.h"
+
+#include <type_traits>
+
+namespace arrow::util {
+
+namespace detail {
+
+// Default identity function row accessor. Used to for the common case where the value
+// of each row iterated over is it's self also directly iterable.
+[[nodiscard]] constexpr inline auto MakeDefaultRowAccessor() {
+  return [](auto& x) -> Result<decltype(std::ref(x))> { return std::ref(x); };
+}
+
+// Meta-function to check if a type `T` is a range (iterable using `std::begin()` /
+// `std::end()`). `is_range<T>::value` will be false if `T` is not a valid range.
+template <typename T, typename = void>
+struct is_range : std::false_type {};
+
+template <typename T>
+struct is_range<T, std::void_t<decltype(std::begin(std::declval<T>())),
+                               decltype(std::end(std::declval<T>()))>> : std::true_type {
+};
+
+}  // namespace detail
+
+/// Delete overload for `const Range&& rows` because the data's lifetime must exceed
+/// the lifetime of the function call. `data` will be read when client uses the
+/// `RecordBatchReader`
+template <class Range, class DataPointConvertor,
+          class RowAccessor = decltype(detail::MakeDefaultRowAccessor())>
+[[nodiscard]] typename std::enable_if_t<detail::is_range<Range>::value,
+                                        Result<std::shared_ptr<RecordBatchReader>>>
+/* Result<std::shared_ptr<RecordBatchReader>>> */ RowsToBatches(
+    const std::shared_ptr<Schema>& schema, const Range&& rows,
+    DataPointConvertor&& data_point_convertor,
+    RowAccessor&& row_accessor = detail::MakeDefaultRowAccessor(),
+    MemoryPool* pool = default_memory_pool(),
+    const std::size_t batch_size = 1024) = delete;
+
+/// \brief Utility function for converting any row-based structure into an
+/// `arrow::RecordBatchReader` (this can be easily converted to an `arrow::Table` using
+/// `arrow::RecordBatchReader::ToTable()`).
+///
+/// Examples of supported types:
+/// - `std::vector<std::vector<std::variant<int, bsl::string>>>`
+/// - `std::vector<MyRowStruct>`
+
+/// If `rows` (client’s row-based structure) is not a valid C++ range, the client will
+/// need to either make it iterable, or make an adapter/wrapper that is a valid C++
+/// range.
+
+/// The client must provide a `DataPointConvertor` callable type that will convert the
+/// structure’s data points into the corresponding arrow types.
+
+/// Complex nested rows can be supported by providing a custom `row_accessor` instead
+/// of the default.
+
+/// Example usage:
+/// \code{.cpp}
+/// auto IntConvertor = [](ArrayBuilder& array_builder, int value) {
+///  return static_cast<Int64Builder&>(array_builder).Append(value);
+/// };
+/// std::vector<std::vector<int>> data = {{1, 2, 4}, {5, 6, 7}};
+/// auto batches = RowsToBatches(kTestSchema, data, IntConvertor);
+/// \endcode
+
+/// \param[in] schema - The schema to be used in the `RecordBatchReader`
+
+/// \param[in] rows - Iterable row-based structure that will be converted to arrow
+/// batches
+
+/// \param[in] data_point_convertor - Client provided callable type that will convert
+/// the structure’s data points into the corresponding arrow types. The convertor must
+/// return an error `Status` if an error happens during conversion.
+
+/// \param[in] row_accessor - In the common case where the value of each row iterated
+/// over is it's self also directly iterable, the client can just use the default.
+/// The provided callable must take the values of the `rows` range and return a
+/// `std::reference_wrapper<Range>` to the data points in a given row. The data points
+/// must be in order of their corresponding fields in the schema.
+/// see: /ref `MakeDefaultRowAccessor`
+
+/// \param[in] pool - The MemoryPool to use for allocations.
+
+/// \param[in] batch_size - Number of rows to insert into each RecordBatch.
+
+/// \return `Result<std::shared_ptr<RecordBatchReader>>>` result will be a
+/// `std::shared_ptr<RecordBatchReader>>` if not errors occurred, else an error status.
+template <class Range, class DataPointConvertor,
+          class RowAccessor = decltype(detail::MakeDefaultRowAccessor())>
+[[nodiscard]] typename std::enable_if_t<detail::is_range<Range>::value,
+                                        Result<std::shared_ptr<RecordBatchReader>>>
+/* Result<std::shared_ptr<RecordBatchReader>>> */ RowsToBatches(
+    const std::shared_ptr<Schema>& schema, const Range& rows,
+    DataPointConvertor&& data_point_convertor,
+    RowAccessor&& row_accessor = detail::MakeDefaultRowAccessor(),
+    MemoryPool* pool = default_memory_pool(), const std::size_t batch_size = 1024) {
+  auto make_next_batch =
+      [pool = pool, batch_size = batch_size, rows_ittr = std::begin(rows),
+       rows_ittr_end = std::end(rows), schema = schema,
+       row_accessor = std::forward<RowAccessor>(row_accessor),
+       data_point_convertor = std::forward<DataPointConvertor>(
+           data_point_convertor)]() mutable -> Result<std::shared_ptr<RecordBatch>> {
+    if (rows_ittr == rows_ittr_end) return NULLPTR;
+
+    ARROW_ASSIGN_OR_RAISE(auto record_batch_builder,
+                          RecordBatchBuilder::Make(schema, pool, batch_size));
+
+    for (size_t i = 0; i < batch_size && (rows_ittr != rows_ittr_end);
+         i++, std::advance(rows_ittr, 1)) {
+      int col_index = 0;
+      ARROW_ASSIGN_OR_RAISE(const auto row, row_accessor(*rows_ittr));
+
+      // If the accessor returns a `std::reference_wrapper` unwrap if
+      const auto& row_unwrapped = [&]() {
+        if constexpr (detail::is_range<decltype(row)>::value)
+          return row;
+        else
+          return row.get();
+      }();
+
+      for (auto& data_point : row_unwrapped) {
+        ArrayBuilder* array_builder = record_batch_builder->GetField(col_index);
+        ARROW_RETURN_IF(array_builder == NULLPTR,
+                        Status::Invalid("array_builder == NULLPTR"));
+
+        ARROW_RETURN_NOT_OK(data_point_convertor(*array_builder, data_point));
+        col_index++;
+      }
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto result, record_batch_builder->Flush());
+    return result;
+  };
+  return RecordBatchReader::MakeFromIterator(MakeFunctionIterator(make_next_batch),
+                                             schema);
+}
+
+}  // namespace arrow::util
diff --git a/pyarrow/include/arrow/util/secure_string.h b/pyarrow/include/arrow/util/secure_string.h
new file mode 100644
index 0000000000000000000000000000000000000000..30088c78d4c3a7ab33b8e00a6e526777bae8d1b6
--- /dev/null
+++ b/pyarrow/include/arrow/util/secure_string.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "arrow/util/span.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::util {
+/**
+ * A secure string that ensures the wrapped string is cleared from memory on
+ * deconstruction. This class can only be created from std::string that are securely
+ * erased after creation.
+ *
+ * Note: This class does not provide a constructor / assignment operator that copies a
+ * std::string because that would allow code to create a SecureString while accidentally
+ * not noticing the need to securely erasing the argument after invoking the constructor /
+ * calling the assignment operator.
+ */
+class ARROW_EXPORT SecureString {
+ public:
+  SecureString() = default;
+  SecureString(SecureString&&) noexcept;
+  SecureString(const SecureString&) = default;
+  explicit SecureString(std::string&&) noexcept;
+  explicit SecureString(size_t, char) noexcept;
+
+  SecureString& operator=(SecureString&&) noexcept;
+  SecureString& operator=(const SecureString&);
+  SecureString& operator=(std::string&&) noexcept;
+
+  bool operator==(const SecureString&) const;
+  bool operator!=(const SecureString&) const;
+
+  ~SecureString() { Dispose(); }
+
+  [[nodiscard]] bool empty() const;
+  [[nodiscard]] std::size_t size() const;
+  [[nodiscard]] std::size_t length() const;
+  [[nodiscard]] std::size_t capacity() const;
+
+  [[nodiscard]] span<uint8_t> as_span();
+  [[nodiscard]] span<const uint8_t> as_span() const;
+  [[nodiscard]] std::string_view as_view() const;
+
+  void Dispose();
+
+  static void SecureClear(std::string*);
+  static void SecureClear(uint8_t* data, size_t size);
+
+ private:
+  std::string secret_;
+};
+
+}  // namespace arrow::util
diff --git a/pyarrow/include/arrow/util/simd.h b/pyarrow/include/arrow/util/simd.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc1a7d6cc807cc2139d3bb0ee706e51f4c2a0192
--- /dev/null
+++ b/pyarrow/include/arrow/util/simd.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifdef _MSC_VER
+// MSVC x86_64/arm64
+
+#  if defined(_M_AMD64) || defined(_M_X64)
+#    include <intrin.h>
+#  endif
+
+#else
+// gcc/clang (possibly others)
+
+#  if defined(ARROW_HAVE_BMI2) || defined(ARROW_HAVE_RUNTIME_BMI2)
+#    include <x86intrin.h>
+#  endif
+
+#  if defined(ARROW_HAVE_AVX2) || defined(ARROW_HAVE_AVX512) || \
+      defined(ARROW_HAVE_RUNTIME_AVX2) || defined(ARROW_HAVE_RUNTIME_AVX512)
+#    include <immintrin.h>
+#  elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_RUNTIME_SSE4_2)
+#    include <nmmintrin.h>
+#  endif
+
+#  ifdef ARROW_HAVE_NEON
+#    include <arm_neon.h>
+#  endif
+
+// GH-44098: Workaround for missing _mm256_set_m128i in older versions of GCC.
+#  if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 8
+#    define _mm256_set_m128i(hi, lo) \
+      _mm256_inserti128_si256(_mm256_castsi128_si256(lo), (hi), 1)
+#  endif
+
+#endif
diff --git a/pyarrow/include/arrow/util/small_vector.h b/pyarrow/include/arrow/util/small_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..f371e647152fde369147abde533fbc45b4c2f3f7
--- /dev/null
+++ b/pyarrow/include/arrow/util/small_vector.h
@@ -0,0 +1,512 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <initializer_list>
+#include <iterator>
+#include <limits>
+#include <new>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "arrow/util/aligned_storage.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename T, size_t N, bool NonTrivialDestructor>
+struct StaticVectorStorageBase {
+  using storage_type = AlignedStorage<T>;
+
+  storage_type static_data_[N];
+  size_t size_ = 0;
+
+  void destroy() noexcept {}
+};
+
+template <typename T, size_t N>
+struct StaticVectorStorageBase<T, N, true> {
+  using storage_type = AlignedStorage<T>;
+
+  storage_type static_data_[N];
+  size_t size_ = 0;
+
+  ~StaticVectorStorageBase() noexcept { destroy(); }
+
+  void destroy() noexcept { storage_type::destroy_several(static_data_, size_); }
+};
+
+template <typename T, size_t N, bool D = !std::is_trivially_destructible<T>::value>
+struct StaticVectorStorage : public StaticVectorStorageBase<T, N, D> {
+  using Base = StaticVectorStorageBase<T, N, D>;
+  using typename Base::storage_type;
+
+  using Base::size_;
+  using Base::static_data_;
+
+  StaticVectorStorage() noexcept = default;
+
+  constexpr storage_type* storage_ptr() { return static_data_; }
+
+  constexpr const storage_type* const_storage_ptr() const { return static_data_; }
+
+  // Adjust storage size, but don't initialize any objects
+  void bump_size(size_t addend) {
+    assert(size_ + addend <= N);
+    size_ += addend;
+  }
+
+  void ensure_capacity(size_t min_capacity) { assert(min_capacity <= N); }
+
+  // Adjust storage size, but don't destroy any objects
+  void reduce_size(size_t reduce_by) {
+    assert(reduce_by <= size_);
+    size_ -= reduce_by;
+  }
+
+  // Move objects from another storage, but don't destroy any objects currently
+  // stored in *this.
+  // You need to call destroy() first if necessary (e.g. in a
+  // move assignment operator).
+  void move_construct(StaticVectorStorage&& other) noexcept {
+    size_ = other.size_;
+    if (size_ != 0) {
+      // Use a compile-time memcpy size (N) for trivial types
+      storage_type::move_construct_several(other.static_data_, static_data_, size_, N);
+    }
+  }
+
+  constexpr size_t capacity() const { return N; }
+
+  constexpr size_t max_size() const { return N; }
+
+  void reserve(size_t n) {}
+
+  void clear() {
+    storage_type::destroy_several(static_data_, size_);
+    size_ = 0;
+  }
+};
+
+template <typename T, size_t N>
+struct SmallVectorStorage {
+  using storage_type = AlignedStorage<T>;
+
+  storage_type static_data_[N];
+  size_t size_ = 0;
+  storage_type* data_ = static_data_;
+  size_t dynamic_capacity_ = 0;
+
+  SmallVectorStorage() noexcept = default;
+
+  ~SmallVectorStorage() { destroy(); }
+
+  constexpr storage_type* storage_ptr() { return data_; }
+
+  constexpr const storage_type* const_storage_ptr() const { return data_; }
+
+  void bump_size(size_t addend) {
+    const size_t new_size = size_ + addend;
+    ensure_capacity(new_size);
+    size_ = new_size;
+  }
+
+  void ensure_capacity(size_t min_capacity) {
+    if (dynamic_capacity_) {
+      // Grow dynamic storage if necessary
+      if (min_capacity > dynamic_capacity_) {
+        size_t new_capacity = std::max(dynamic_capacity_ * 2, min_capacity);
+        reallocate_dynamic(new_capacity);
+      }
+    } else if (min_capacity > N) {
+      switch_to_dynamic(min_capacity);
+    }
+  }
+
+  void reduce_size(size_t reduce_by) {
+    assert(reduce_by <= size_);
+    size_ -= reduce_by;
+  }
+
+  void destroy() noexcept {
+    storage_type::destroy_several(data_, size_);
+    if (dynamic_capacity_) {
+      delete[] data_;
+    }
+  }
+
+  void move_construct(SmallVectorStorage&& other) noexcept {
+    size_ = other.size_;
+    dynamic_capacity_ = other.dynamic_capacity_;
+    if (dynamic_capacity_) {
+      data_ = other.data_;
+      other.data_ = other.static_data_;
+      other.dynamic_capacity_ = 0;
+      other.size_ = 0;
+    } else if (size_ != 0) {
+      // Use a compile-time memcpy size (N) for trivial types
+      storage_type::move_construct_several(other.static_data_, static_data_, size_, N);
+    }
+  }
+
+  constexpr size_t capacity() const { return dynamic_capacity_ ? dynamic_capacity_ : N; }
+
+  constexpr size_t max_size() const { return std::numeric_limits<size_t>::max(); }
+
+  void reserve(size_t n) {
+    if (dynamic_capacity_) {
+      if (n > dynamic_capacity_) {
+        reallocate_dynamic(n);
+      }
+    } else if (n > N) {
+      switch_to_dynamic(n);
+    }
+  }
+
+  void clear() {
+    storage_type::destroy_several(data_, size_);
+    size_ = 0;
+  }
+
+ private:
+  void switch_to_dynamic(size_t new_capacity) {
+    dynamic_capacity_ = new_capacity;
+    data_ = new storage_type[new_capacity];
+    storage_type::move_construct_several_and_destroy_source(static_data_, data_, size_);
+  }
+
+  void reallocate_dynamic(size_t new_capacity) {
+    assert(new_capacity >= size_);
+    auto new_data = new storage_type[new_capacity];
+    storage_type::move_construct_several_and_destroy_source(data_, new_data, size_);
+    delete[] data_;
+    dynamic_capacity_ = new_capacity;
+    data_ = new_data;
+  }
+};
+
+template <typename T, size_t N, typename Storage>
+class StaticVectorImpl {
+ private:
+  Storage storage_;
+
+  T* data_ptr() { return storage_.storage_ptr()->get(); }
+
+  constexpr const T* const_data_ptr() const {
+    return storage_.const_storage_ptr()->get();
+  }
+
+ public:
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using value_type = T;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using reference = T&;
+  using const_reference = const T&;
+  using iterator = T*;
+  using const_iterator = const T*;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+  constexpr StaticVectorImpl() noexcept = default;
+
+  // Move and copy constructors
+  StaticVectorImpl(StaticVectorImpl&& other) noexcept {
+    storage_.move_construct(std::move(other.storage_));
+  }
+
+  StaticVectorImpl& operator=(StaticVectorImpl&& other) noexcept {
+    if (ARROW_PREDICT_TRUE(&other != this)) {
+      // TODO move_assign?
+      storage_.destroy();
+      storage_.move_construct(std::move(other.storage_));
+    }
+    return *this;
+  }
+
+  StaticVectorImpl(const StaticVectorImpl& other) {
+    init_by_copying(other.storage_.size_, other.const_data_ptr());
+  }
+
+  StaticVectorImpl& operator=(const StaticVectorImpl& other) noexcept {
+    if (ARROW_PREDICT_TRUE(&other != this)) {
+      assign_by_copying(other.storage_.size_, other.data());
+    }
+    return *this;
+  }
+
+  // Automatic conversion from std::vector<T>, for convenience
+  StaticVectorImpl(const std::vector<T>& other) {  // NOLINT: explicit
+    init_by_copying(other.size(), other.data());
+  }
+
+  StaticVectorImpl(std::vector<T>&& other) noexcept {  // NOLINT: explicit
+    init_by_moving(other.size(), other.data());
+  }
+
+  StaticVectorImpl& operator=(const std::vector<T>& other) {
+    assign_by_copying(other.size(), other.data());
+    return *this;
+  }
+
+  StaticVectorImpl& operator=(std::vector<T>&& other) noexcept {
+    assign_by_moving(other.size(), other.data());
+    return *this;
+  }
+
+  // Constructing from count and optional initialization value
+  explicit StaticVectorImpl(size_t count) {
+    storage_.bump_size(count);
+    auto* p = storage_.storage_ptr();
+    for (size_t i = 0; i < count; ++i) {
+      p[i].construct();
+    }
+  }
+
+  StaticVectorImpl(size_t count, const T& value) {
+    storage_.bump_size(count);
+    auto* p = storage_.storage_ptr();
+    for (size_t i = 0; i < count; ++i) {
+      p[i].construct(value);
+    }
+  }
+
+  StaticVectorImpl(std::initializer_list<T> values) {
+    storage_.bump_size(values.size());
+    auto* p = storage_.storage_ptr();
+    for (auto&& v : values) {
+      // Unfortunately, cannot move initializer values
+      p++->construct(v);
+    }
+  }
+
+  // Size inspection
+
+  constexpr bool empty() const { return storage_.size_ == 0; }
+
+  constexpr size_t size() const { return storage_.size_; }
+
+  constexpr size_t capacity() const { return storage_.capacity(); }
+
+  constexpr size_t max_size() const { return storage_.max_size(); }
+
+  // Data access
+
+  T& operator[](size_t i) { return data_ptr()[i]; }
+
+  constexpr const T& operator[](size_t i) const { return const_data_ptr()[i]; }
+
+  T& front() { return data_ptr()[0]; }
+
+  constexpr const T& front() const { return const_data_ptr()[0]; }
+
+  T& back() { return data_ptr()[storage_.size_ - 1]; }
+
+  constexpr const T& back() const { return const_data_ptr()[storage_.size_ - 1]; }
+
+  T* data() { return data_ptr(); }
+
+  constexpr const T* data() const { return const_data_ptr(); }
+
+  // Iterators
+
+  iterator begin() { return iterator(data_ptr()); }
+
+  constexpr const_iterator begin() const { return const_iterator(const_data_ptr()); }
+
+  constexpr const_iterator cbegin() const { return const_iterator(const_data_ptr()); }
+
+  iterator end() { return iterator(data_ptr() + storage_.size_); }
+
+  constexpr const_iterator end() const {
+    return const_iterator(const_data_ptr() + storage_.size_);
+  }
+
+  constexpr const_iterator cend() const {
+    return const_iterator(const_data_ptr() + storage_.size_);
+  }
+
+  reverse_iterator rbegin() { return reverse_iterator(end()); }
+
+  constexpr const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  constexpr const_reverse_iterator crbegin() const {
+    return const_reverse_iterator(end());
+  }
+
+  reverse_iterator rend() { return reverse_iterator(begin()); }
+
+  constexpr const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  constexpr const_reverse_iterator crend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Mutations
+
+  void reserve(size_t n) { storage_.reserve(n); }
+
+  void clear() { storage_.clear(); }
+
+  void push_back(const T& value) {
+    storage_.bump_size(1);
+    storage_.storage_ptr()[storage_.size_ - 1].construct(value);
+  }
+
+  void push_back(T&& value) {
+    storage_.bump_size(1);
+    storage_.storage_ptr()[storage_.size_ - 1].construct(std::move(value));
+  }
+
+  template <typename... Args>
+  void emplace_back(Args&&... args) {
+    storage_.bump_size(1);
+    storage_.storage_ptr()[storage_.size_ - 1].construct(std::forward<Args>(args)...);
+  }
+
+  template <typename InputIt>
+  iterator insert(const_iterator insert_at, InputIt first, InputIt last) {
+    const size_t n = storage_.size_;
+    const size_t it_size = static_cast<size_t>(last - first);  // XXX might be O(n)?
+    const size_t pos = static_cast<size_t>(insert_at - const_data_ptr());
+    storage_.bump_size(it_size);
+    auto* p = storage_.storage_ptr();
+    if (it_size == 0) {
+      return p[pos].get();
+    }
+    const size_t end_pos = pos + it_size;
+
+    // Move [pos; n) to [end_pos; end_pos + n - pos)
+    size_t i = n;
+    size_t j = end_pos + n - pos;
+    while (j > std::max(n, end_pos)) {
+      p[--j].move_construct(&p[--i]);
+    }
+    while (j > end_pos) {
+      p[--j].move_assign(&p[--i]);
+    }
+    assert(j == end_pos);
+    // Copy [first; last) to [pos; end_pos)
+    j = pos;
+    while (j < std::min(n, end_pos)) {
+      p[j++].assign(*first++);
+    }
+    while (j < end_pos) {
+      p[j++].construct(*first++);
+    }
+    assert(first == last);
+    return p[pos].get();
+  }
+
+  void resize(size_t n) {
+    const size_t old_size = storage_.size_;
+    if (n > storage_.size_) {
+      storage_.bump_size(n - old_size);
+      auto* p = storage_.storage_ptr();
+      for (size_t i = old_size; i < n; ++i) {
+        p[i].construct(T{});
+      }
+    } else {
+      auto* p = storage_.storage_ptr();
+      for (size_t i = n; i < old_size; ++i) {
+        p[i].destroy();
+      }
+      storage_.reduce_size(old_size - n);
+    }
+  }
+
+  void resize(size_t n, const T& value) {
+    const size_t old_size = storage_.size_;
+    if (n > storage_.size_) {
+      storage_.bump_size(n - old_size);
+      auto* p = storage_.storage_ptr();
+      for (size_t i = old_size; i < n; ++i) {
+        p[i].construct(value);
+      }
+    } else {
+      auto* p = storage_.storage_ptr();
+      for (size_t i = n; i < old_size; ++i) {
+        p[i].destroy();
+      }
+      storage_.reduce_size(old_size - n);
+    }
+  }
+
+ private:
+  template <typename InputIt>
+  void init_by_copying(size_t n, InputIt src) {
+    storage_.bump_size(n);
+    auto* dest = storage_.storage_ptr();
+    for (size_t i = 0; i < n; ++i, ++src) {
+      dest[i].construct(*src);
+    }
+  }
+
+  template <typename InputIt>
+  void init_by_moving(size_t n, InputIt src) {
+    init_by_copying(n, std::make_move_iterator(src));
+  }
+
+  template <typename InputIt>
+  void assign_by_copying(size_t n, InputIt src) {
+    const size_t old_size = storage_.size_;
+    if (n > old_size) {
+      storage_.bump_size(n - old_size);
+      auto* dest = storage_.storage_ptr();
+      for (size_t i = 0; i < old_size; ++i, ++src) {
+        dest[i].assign(*src);
+      }
+      for (size_t i = old_size; i < n; ++i, ++src) {
+        dest[i].construct(*src);
+      }
+    } else {
+      auto* dest = storage_.storage_ptr();
+      for (size_t i = 0; i < n; ++i, ++src) {
+        dest[i].assign(*src);
+      }
+      for (size_t i = n; i < old_size; ++i) {
+        dest[i].destroy();
+      }
+      storage_.reduce_size(old_size - n);
+    }
+  }
+
+  template <typename InputIt>
+  void assign_by_moving(size_t n, InputIt src) {
+    assign_by_copying(n, std::make_move_iterator(src));
+  }
+};
+
+template <typename T, size_t N>
+using StaticVector = StaticVectorImpl<T, N, StaticVectorStorage<T, N>>;
+
+template <typename T, size_t N>
+using SmallVector = StaticVectorImpl<T, N, SmallVectorStorage<T, N>>;
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/span.h b/pyarrow/include/arrow/util/span.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e57ee8c8d1ccf1c418a1032a513ca79e64c1468
--- /dev/null
+++ b/pyarrow/include/arrow/util/span.h
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <type_traits>
+
+namespace arrow::util {
+
+template <class T>
+class span;
+
+/// std::span polyfill.
+///
+/// Does not support static extents.
+template <typename T>
+class span {
+  static_assert(sizeof(T),
+                R"(
+std::span allows contiguous_iterators instead of just pointers, the enforcement
+of which requires T to be a complete type. arrow::util::span does not support
+contiguous_iterators, but T is still required to be a complete type to prevent
+writing code which would break when it is replaced by std::span.)");
+
+ public:
+  using element_type = T;
+  using value_type = std::remove_cv_t<T>;
+  using iterator = T*;
+  using const_iterator = T const*;
+
+  span() = default;
+  span(const span&) = default;
+  span& operator=(const span&) = default;
+
+  template <typename M, typename = std::enable_if_t<std::is_same_v<T, M const>>>
+  // NOLINTNEXTLINE runtime/explicit
+  constexpr span(span<M> mut) : span{mut.data(), mut.size()} {}
+
+  constexpr span(T* data, size_t count) : data_{data}, size_{count} {}
+
+  constexpr span(T* begin, T* end)
+      : data_{begin}, size_{static_cast<size_t>(end - begin)} {}
+
+  template <typename R, typename RD = decltype(std::data(std::declval<R>())),
+            typename RS = decltype(std::size(std::declval<R>())),
+            typename E = std::enable_if_t<std::is_constructible_v<T*, RD> &&
+                                          std::is_constructible_v<size_t, RS>>>
+  // NOLINTNEXTLINE runtime/explicit, non-const reference
+  constexpr span(R&& range) : data_{std::data(range)}, size_{std::size(range)} {}
+
+  constexpr T* begin() const { return data_; }
+  constexpr T* end() const { return data_ + size_; }
+  constexpr T* data() const { return data_; }
+
+  constexpr size_t size() const { return size_; }
+  constexpr size_t size_bytes() const { return size_ * sizeof(T); }
+  constexpr bool empty() const { return size_ == 0; }
+
+  constexpr T& operator[](size_t i) { return data_[i]; }
+  constexpr const T& operator[](size_t i) const { return data_[i]; }
+
+  constexpr span subspan(size_t offset) const {
+    if (offset > size_) return {data_, data_};
+    return {data_ + offset, size_ - offset};
+  }
+
+  constexpr span subspan(size_t offset, size_t count) const {
+    auto out = subspan(offset);
+    if (count < out.size_) {
+      out.size_ = count;
+    }
+    return out;
+  }
+
+  constexpr bool operator==(const span& other) const {
+    if (size_ != other.size_) return false;
+
+    if constexpr (std::is_integral_v<T>) {
+      if (size_ == 0) {
+        return true;  // memcmp does not handle null pointers, even if size_ == 0
+      }
+      return std::memcmp(data_, other.data_, size_bytes()) == 0;
+    } else {
+      T* ptr = data_;
+      for (T const& e : other) {
+        if (*ptr++ != e) return false;
+      }
+      return true;
+    }
+  }
+  constexpr bool operator!=(const span& other) const { return !(*this == other); }
+
+ private:
+  T* data_{};
+  size_t size_{};
+};
+
+template <typename R>
+span(R& range) -> span<std::remove_pointer_t<decltype(std::data(range))>>;
+
+template <typename T>
+span(T*, size_t) -> span<T>;
+
+template <typename T>
+constexpr span<std::byte const> as_bytes(span<T> s) {
+  return {reinterpret_cast<const std::byte*>(s.data()), s.size_bytes()};
+}
+
+template <typename T>
+constexpr span<std::byte> as_writable_bytes(span<T> s) {
+  return {reinterpret_cast<std::byte*>(s.data()), s.size_bytes()};
+}
+
+}  // namespace arrow::util
diff --git a/pyarrow/include/arrow/util/string.h b/pyarrow/include/arrow/util/string.h
new file mode 100644
index 0000000000000000000000000000000000000000..af8c948f48aeed209cb82d056b7eef37223f9033
--- /dev/null
+++ b/pyarrow/include/arrow/util/string.h
@@ -0,0 +1,161 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#if __has_include(<charconv>)
+#  include <charconv>
+#endif
+
+#include "arrow/result.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Status;
+
+ARROW_EXPORT std::string HexEncode(const uint8_t* data, size_t length);
+
+ARROW_EXPORT std::string Escape(const char* data, size_t length);
+
+ARROW_EXPORT std::string HexEncode(const char* data, size_t length);
+
+ARROW_EXPORT std::string HexEncode(std::string_view str);
+
+ARROW_EXPORT std::string Escape(std::string_view str);
+
+ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out);
+
+ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out);
+
+namespace internal {
+
+/// \brief Split a string with a delimiter
+ARROW_EXPORT
+std::vector<std::string_view> SplitString(std::string_view v, char delim,
+                                          int64_t limit = 0);
+
+/// \brief Join strings with a delimiter
+ARROW_EXPORT
+std::string JoinStrings(const std::vector<std::string_view>& strings,
+                        std::string_view delimiter);
+
+/// \brief Join strings with a delimiter
+ARROW_EXPORT
+std::string JoinStrings(const std::vector<std::string>& strings,
+                        std::string_view delimiter);
+
+/// \brief Trim whitespace from left and right sides of string
+ARROW_EXPORT
+std::string TrimString(std::string value);
+
+ARROW_EXPORT
+bool AsciiEqualsCaseInsensitive(std::string_view left, std::string_view right);
+
+ARROW_EXPORT
+std::string AsciiToLower(std::string_view value);
+
+ARROW_EXPORT
+std::string AsciiToUpper(std::string_view value);
+
+/// \brief Search for the first instance of a token and replace it or return nullopt if
+/// the token is not found.
+ARROW_EXPORT
+std::optional<std::string> Replace(std::string_view s, std::string_view token,
+                                   std::string_view replacement);
+
+/// \brief Get boolean value from string
+///
+/// If "1", "true" (case-insensitive), returns true
+/// If "0", "false" (case-insensitive), returns false
+/// Otherwise, returns Status::Invalid
+ARROW_EXPORT
+arrow::Result<bool> ParseBoolean(std::string_view value);
+
+#if __has_include(<charconv>)
+
+namespace detail {
+template <typename T, typename = void>
+struct can_to_chars : public std::false_type {};
+
+template <typename T>
+struct can_to_chars<
+    T, std::void_t<decltype(std::to_chars(std::declval<char*>(), std::declval<char*>(),
+                                          std::declval<std::remove_reference_t<T>>()))>>
+    : public std::true_type {};
+}  // namespace detail
+
+/// \brief Whether std::to_chars exists for the current value type.
+///
+/// This is useful as some C++ libraries do not implement all specified overloads
+/// for std::to_chars.
+template <typename T>
+inline constexpr bool have_to_chars = detail::can_to_chars<T>::value;
+
+/// \brief An ergonomic wrapper around std::to_chars, returning a std::string
+///
+/// For most inputs, the std::string result will not incur any heap allocation
+/// thanks to small string optimization.
+///
+/// Compared to std::to_string, this function gives locale-agnostic results
+/// and might also be faster.
+template <typename T, typename... Args>
+std::string ToChars(T value, Args&&... args) {
+  if constexpr (!have_to_chars<T>) {
+    // Some C++ standard libraries do not yet implement std::to_chars for all types,
+    // in which case we have to fallback to std::string.
+    return std::to_string(value);
+  } else {
+    // According to various sources, the GNU libstdc++ and Microsoft's C++ STL
+    // allow up to 15 bytes of small string optimization, while clang's libc++
+    // goes up to 22 bytes. Choose the pessimistic value.
+    std::string out(15, 0);
+    auto res = std::to_chars(&out.front(), &out.back(), value, args...);
+    while (res.ec != std::errc{}) {
+      assert(res.ec == std::errc::value_too_large);
+      out.resize(out.capacity() * 2);
+      res = std::to_chars(&out.front(), &out.back(), value, args...);
+    }
+    const auto length = res.ptr - out.data();
+    assert(length <= static_cast<int64_t>(out.length()));
+    out.resize(length);
+    return out;
+  }
+}
+
+#else  // !__has_include(<charconv>)
+
+template <typename T>
+inline constexpr bool have_to_chars = false;
+
+template <typename T, typename... Args>
+std::string ToChars(T value, Args&&... args) {
+  return std::to_string(value);
+}
+
+#endif
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/string_util.h b/pyarrow/include/arrow/util/string_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f7803dc92d80fd39b329ae6a9e2a47cf6cfbf82
--- /dev/null
+++ b/pyarrow/include/arrow/util/string_util.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License. template <typename T>
+
+#pragma once
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+namespace internal {
+
+class ARROW_EXPORT StringStreamWrapper {
+ public:
+  StringStreamWrapper();
+  ~StringStreamWrapper();
+
+  std::ostream& stream() { return ostream_; }
+  std::string str();
+
+ protected:
+  std::unique_ptr<std::ostringstream> sstream_;
+  std::ostream& ostream_;
+};
+
+template <typename... Args>
+std::string JoinToString(Args&&... args) {
+  StringStreamWrapper ss;
+  (
+      [&ss](auto&& arg) {
+        // Avoid losing precision when printing floating point numbers
+        if constexpr (std::is_floating_point_v<std::decay_t<decltype(arg)>>) {
+          ss.stream() << std::to_string(arg);
+        } else {
+          ss.stream() << arg;
+        }
+      }(std::forward<Args>(args)),
+      ...);
+  return ss.str();
+}
+}  // namespace internal
+
+namespace util {
+/// CRTP helper for declaring string representation. Defines operator<<
+template <typename T>
+class ToStringOstreamable {
+ public:
+  ~ToStringOstreamable() {
+    static_assert(
+        std::is_same<decltype(std::declval<const T>().ToString()), std::string>::value,
+        "ToStringOstreamable depends on the method T::ToString() const");
+  }
+
+ private:
+  const T& cast() const { return static_cast<const T&>(*this); }
+
+  friend inline std::ostream& operator<<(std::ostream& os, const ToStringOstreamable& t) {
+    return os << t.cast().ToString();
+  }
+};
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/task_group.h b/pyarrow/include/arrow/util/task_group.h
new file mode 100644
index 0000000000000000000000000000000000000000..3bb72f0d9cb7d7bb8b9ce8f2a65cc9f954924ca3
--- /dev/null
+++ b/pyarrow/include/arrow/util/task_group.h
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <utility>
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace internal {
+
+/// \brief A group of related tasks
+///
+/// A TaskGroup executes tasks with the signature `Status()`.
+/// Execution can be serial or parallel, depending on the TaskGroup
+/// implementation.  When Finish() returns, it is guaranteed that all
+/// tasks have finished, or at least one has errored.
+///
+/// Once an error has occurred any tasks that are submitted to the task group
+/// will not run.  The call to Append will simply return without scheduling the
+/// task.
+///
+/// If the task group is parallel it is possible that multiple tasks could be
+/// running at the same time and one of those tasks fails.  This will put the
+/// task group in a failure state (so additional tasks cannot be run) however
+/// it will not interrupt running tasks.  Finish will not complete
+/// until all running tasks have finished, even if one task fails.
+///
+/// Once a task group has finished new tasks may not be added to it.  If you need to start
+/// a new batch of work then you should create a new task group.
+class ARROW_EXPORT TaskGroup : public std::enable_shared_from_this<TaskGroup> {
+ public:
+  /// Add a Status-returning function to execute.  Execution order is
+  /// undefined.  The function may be executed immediately or later.
+  template <typename Function>
+  void Append(Function&& func) {
+    return AppendReal(std::forward<Function>(func));
+  }
+
+  /// Wait for execution of all tasks (and subgroups) to be finished,
+  /// or for at least one task (or subgroup) to error out.
+  /// The returned Status propagates the error status of the first failing
+  /// task (or subgroup).
+  virtual Status Finish() = 0;
+
+  /// Returns a future that will complete the first time all tasks are finished.
+  /// This should be called only after all top level tasks
+  /// have been added to the task group.
+  ///
+  /// If you are using a TaskGroup asynchronously there are a few considerations to keep
+  /// in mind.  The tasks should not block on I/O, etc (defeats the purpose of using
+  /// futures) and should not be doing any nested locking or you run the risk of the tasks
+  /// getting stuck in the thread pool waiting for tasks which cannot get scheduled.
+  ///
+  /// Primarily this call is intended to help migrate existing work written with TaskGroup
+  /// in mind to using futures without having to do a complete conversion on the first
+  /// pass.
+  virtual Future<> FinishAsync() = 0;
+
+  /// The current aggregate error Status.  Non-blocking, useful for stopping early.
+  virtual Status current_status() = 0;
+
+  /// Whether some tasks have already failed.  Non-blocking, useful for stopping early.
+  virtual bool ok() const = 0;
+
+  /// How many tasks can typically be executed in parallel.
+  /// This is only a hint, useful for testing or debugging.
+  virtual int parallelism() = 0;
+
+  static std::shared_ptr<TaskGroup> MakeSerial(StopToken = StopToken::Unstoppable());
+  static std::shared_ptr<TaskGroup> MakeThreaded(internal::Executor*,
+                                                 StopToken = StopToken::Unstoppable());
+
+  virtual ~TaskGroup() = default;
+
+ protected:
+  TaskGroup() = default;
+  ARROW_DISALLOW_COPY_AND_ASSIGN(TaskGroup);
+
+  virtual void AppendReal(FnOnce<Status()> task) = 0;
+};
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/test_common.h b/pyarrow/include/arrow/util/test_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba28836695cf615bbe86cca99548501acb978471
--- /dev/null
+++ b/pyarrow/include/arrow/util/test_common.h
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/iterator.h"
+
+namespace arrow {
+
+struct TestInt {
+  TestInt();
+  TestInt(int i);  // NOLINT runtime/explicit
+  int value;
+
+  bool operator==(const TestInt& other) const;
+
+  friend std::ostream& operator<<(std::ostream& os, const TestInt& v);
+};
+
+template <>
+struct IterationTraits<TestInt> {
+  static TestInt End() { return TestInt(); }
+  static bool IsEnd(const TestInt& val) { return val == IterationTraits<TestInt>::End(); }
+};
+
+struct TestStr {
+  TestStr();
+  TestStr(const std::string& s);  // NOLINT runtime/explicit
+  TestStr(const char* s);         // NOLINT runtime/explicit
+  explicit TestStr(const TestInt& test_int);
+  std::string value;
+
+  bool operator==(const TestStr& other) const;
+
+  friend std::ostream& operator<<(std::ostream& os, const TestStr& v);
+};
+
+template <>
+struct IterationTraits<TestStr> {
+  static TestStr End() { return TestStr(); }
+  static bool IsEnd(const TestStr& val) { return val == IterationTraits<TestStr>::End(); }
+};
+
+std::vector<TestInt> RangeVector(unsigned int max, unsigned int step = 1);
+
+template <typename T>
+inline Iterator<T> VectorIt(std::vector<T> v) {
+  return MakeVectorIterator<T>(std::move(v));
+}
+
+template <typename T>
+inline Iterator<T> PossiblySlowVectorIt(std::vector<T> v, bool slow = false) {
+  auto iterator = MakeVectorIterator<T>(std::move(v));
+  if (slow) {
+    return MakeTransformedIterator<T, T>(std::move(iterator),
+                                         [](T item) -> Result<TransformFlow<T>> {
+                                           SleepABit();
+                                           return TransformYield(item);
+                                         });
+  } else {
+    return iterator;
+  }
+}
+
+template <typename T>
+inline void AssertIteratorExhausted(Iterator<T>& it) {
+  ASSERT_OK_AND_ASSIGN(T next, it.Next());
+  ASSERT_TRUE(IsIterationEnd(next));
+}
+
+Transformer<TestInt, TestStr> MakeFilter(std::function<bool(TestInt&)> filter);
+
+// Assert equal contents of a memory area and a vector of bytes
+void AssertBytesEqual(const uint8_t* left, const std::vector<uint8_t>& right);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/thread_pool.h b/pyarrow/include/arrow/util/thread_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..201b8cef790d0bf617b91e93b85a507676660424
--- /dev/null
+++ b/pyarrow/include/arrow/util/thread_pool.h
@@ -0,0 +1,643 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <queue>
+#include <type_traits>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/util/cancel.h"
+#include "arrow/util/config.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/future.h"
+#include "arrow/util/iterator.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+#if defined(_MSC_VER)
+// Disable harmless warning for decorated name length limit
+#  pragma warning(disable : 4503)
+#endif
+
+namespace arrow {
+
+/// \brief Get the capacity of the global thread pool
+///
+/// Return the number of worker threads in the thread pool to which
+/// Arrow dispatches various CPU-bound tasks.  This is an ideal number,
+/// not necessarily the exact number of threads at a given point in time.
+///
+/// You can change this number using SetCpuThreadPoolCapacity().
+ARROW_EXPORT int GetCpuThreadPoolCapacity();
+
+/// \brief Set the capacity of the global thread pool
+///
+/// Set the number of worker threads int the thread pool to which
+/// Arrow dispatches various CPU-bound tasks.
+///
+/// The current number is returned by GetCpuThreadPoolCapacity().
+ARROW_EXPORT Status SetCpuThreadPoolCapacity(int threads);
+
+namespace internal {
+
+// Hints about a task that may be used by an Executor.
+// They are ignored by the provided ThreadPool implementation.
+struct TaskHints {
+  // The lower, the more urgent
+  int32_t priority = 0;
+  // The IO transfer size in bytes
+  int64_t io_size = -1;
+  // The approximate CPU cost in number of instructions
+  int64_t cpu_cost = -1;
+  // An application-specific ID
+  int64_t external_id = -1;
+};
+
+class ARROW_EXPORT Executor {
+ public:
+  using StopCallback = internal::FnOnce<void(const Status&)>;
+
+  virtual ~Executor();
+
+  // Spawn a fire-and-forget task.
+  template <typename Function>
+  Status Spawn(Function&& func) {
+    return SpawnReal(TaskHints{}, std::forward<Function>(func), StopToken::Unstoppable(),
+                     StopCallback{});
+  }
+  template <typename Function>
+  Status Spawn(Function&& func, StopToken stop_token) {
+    return SpawnReal(TaskHints{}, std::forward<Function>(func), std::move(stop_token),
+                     StopCallback{});
+  }
+  template <typename Function>
+  Status Spawn(TaskHints hints, Function&& func) {
+    return SpawnReal(hints, std::forward<Function>(func), StopToken::Unstoppable(),
+                     StopCallback{});
+  }
+  template <typename Function>
+  Status Spawn(TaskHints hints, Function&& func, StopToken stop_token) {
+    return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+                     StopCallback{});
+  }
+  template <typename Function>
+  Status Spawn(TaskHints hints, Function&& func, StopToken stop_token,
+               StopCallback stop_callback) {
+    return SpawnReal(hints, std::forward<Function>(func), std::move(stop_token),
+                     std::move(stop_callback));
+  }
+
+  // Transfers a future to this executor.  Any continuations added to the
+  // returned future will run in this executor.  Otherwise they would run
+  // on the same thread that called MarkFinished.
+  //
+  // This is necessary when (for example) an I/O task is completing a future.
+  // The continuations of that future should run on the CPU thread pool keeping
+  // CPU heavy work off the I/O thread pool.  So the I/O task should transfer
+  // the future to the CPU executor before returning.
+  //
+  // By default this method will only transfer if the future is not already completed.  If
+  // the future is already completed then any callback would be run synchronously and so
+  // no transfer is typically necessary.  However, in cases where you want to force a
+  // transfer (e.g. to help the scheduler break up units of work across multiple cores)
+  // then you can override this behavior with `always_transfer`.
+  template <typename T>
+  Future<T> Transfer(Future<T> future) {
+    return DoTransfer(std::move(future), false);
+  }
+
+  // Overload of Transfer which will always schedule callbacks on new threads even if the
+  // future is finished when the callback is added.
+  //
+  // This can be useful in cases where you want to ensure parallelism
+  template <typename T>
+  Future<T> TransferAlways(Future<T> future) {
+    return DoTransfer(std::move(future), true);
+  }
+
+  // Submit a callable and arguments for execution.  Return a future that
+  // will return the callable's result value once.
+  // The callable's arguments are copied before execution.
+  template <typename Function, typename... Args,
+            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+                Function && (Args && ...)>>
+  Result<FutureType> Submit(TaskHints hints, StopToken stop_token, Function&& func,
+                            Args&&... args) {
+    using ValueType = typename FutureType::ValueType;
+
+    auto future = FutureType::Make();
+    auto task = std::bind(::arrow::detail::ContinueFuture{}, future,
+                          std::forward<Function>(func), std::forward<Args>(args)...);
+    struct {
+      WeakFuture<ValueType> weak_fut;
+
+      void operator()(const Status& st) {
+        auto fut = weak_fut.get();
+        if (fut.is_valid()) {
+          fut.MarkFinished(st);
+        }
+      }
+    } stop_callback{WeakFuture<ValueType>(future)};
+    ARROW_RETURN_NOT_OK(SpawnReal(hints, std::move(task), std::move(stop_token),
+                                  std::move(stop_callback)));
+
+    return future;
+  }
+
+  template <typename Function, typename... Args,
+            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+                Function && (Args && ...)>>
+  Result<FutureType> Submit(StopToken stop_token, Function&& func, Args&&... args) {
+    return Submit(TaskHints{}, stop_token, std::forward<Function>(func),
+                  std::forward<Args>(args)...);
+  }
+
+  template <typename Function, typename... Args,
+            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+                Function && (Args && ...)>>
+  Result<FutureType> Submit(TaskHints hints, Function&& func, Args&&... args) {
+    return Submit(std::move(hints), StopToken::Unstoppable(),
+                  std::forward<Function>(func), std::forward<Args>(args)...);
+  }
+
+  template <typename Function, typename... Args,
+            typename FutureType = typename ::arrow::detail::ContinueFuture::ForSignature<
+                Function && (Args && ...)>>
+  Result<FutureType> Submit(Function&& func, Args&&... args) {
+    return Submit(TaskHints{}, StopToken::Unstoppable(), std::forward<Function>(func),
+                  std::forward<Args>(args)...);
+  }
+
+  // Return the level of parallelism (the number of tasks that may be executed
+  // concurrently).  This may be an approximate number.
+  virtual int GetCapacity() = 0;
+
+  // Return true if the thread from which this function is called is owned by this
+  // Executor. Returns false if this Executor does not support this property.
+  virtual bool OwnsThisThread() { return false; }
+
+  // Return true if this is the current executor being called
+  // n.b. this defaults to just calling OwnsThisThread
+  // unless the threadpool is disabled
+  virtual bool IsCurrentExecutor() { return OwnsThisThread(); }
+
+  /// \brief An interface to represent something with a custom destructor
+  ///
+  /// \see KeepAlive
+  class ARROW_EXPORT Resource {
+   public:
+    virtual ~Resource() = default;
+  };
+
+  /// \brief Keep a resource alive until all executor threads have terminated
+  ///
+  /// Executors may have static storage duration.  In particular, the CPU and I/O
+  /// executors are currently implemented this way.  These threads may access other
+  /// objects with static storage duration such as the OpenTelemetry runtime context
+  /// the default memory pool, or other static executors.
+  ///
+  /// The order in which these objects are destroyed is difficult to control.  In order
+  /// to ensure those objects remain alive until all threads have finished those objects
+  /// should be wrapped in a Resource object and passed into this method.  The given
+  /// shared_ptr will be kept alive until all threads have finished their worker loops.
+  virtual void KeepAlive(std::shared_ptr<Resource> resource);
+
+ protected:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(Executor);
+
+  Executor() = default;
+
+  template <typename T, typename FT = Future<T>, typename FTSync = typename FT::SyncType>
+  Future<T> DoTransfer(Future<T> future, bool always_transfer = false) {
+    auto transferred = Future<T>::Make();
+    if (always_transfer) {
+      CallbackOptions callback_options = CallbackOptions::Defaults();
+      callback_options.should_schedule = ShouldSchedule::Always;
+      callback_options.executor = this;
+      auto sync_callback = [transferred](const FTSync& result) mutable {
+        transferred.MarkFinished(result);
+      };
+      future.AddCallback(sync_callback, callback_options);
+      return transferred;
+    }
+
+    // We could use AddCallback's ShouldSchedule::IfUnfinished but we can save a bit of
+    // work by doing the test here.
+    auto callback = [this, transferred](const FTSync& result) mutable {
+      auto spawn_status =
+          Spawn([transferred, result]() mutable { transferred.MarkFinished(result); });
+      if (!spawn_status.ok()) {
+        transferred.MarkFinished(spawn_status);
+      }
+    };
+    auto callback_factory = [&callback]() { return callback; };
+    if (future.TryAddCallback(callback_factory)) {
+      return transferred;
+    }
+    // If the future is already finished and we aren't going to force spawn a thread
+    // then we don't need to add another layer of callback and can return the original
+    // future
+    return future;
+  }
+
+  // Subclassing API
+  virtual Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+                           StopCallback&&) = 0;
+};
+
+/// \brief An executor implementation that runs all tasks on a single thread using an
+/// event loop.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor.  Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
+class ARROW_EXPORT SerialExecutor : public Executor {
+ public:
+  template <typename T = ::arrow::internal::Empty>
+  using TopLevelTask = internal::FnOnce<Future<T>(Executor*)>;
+
+  ~SerialExecutor() override;
+
+  int GetCapacity() override { return 1; };
+  bool OwnsThisThread() override;
+  Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+                   StopCallback&&) override;
+
+  // Return the number of tasks either running or in the queue.
+  int GetNumTasks();
+
+  /// \brief Runs the TopLevelTask and any scheduled tasks
+  ///
+  /// The TopLevelTask (or one of the tasks it schedules) must either return an invalid
+  /// status or call the finish signal. Failure to do this will result in a deadlock.  For
+  /// this reason it is preferable (if possible) to use the helper methods (below)
+  /// RunSynchronously/RunSerially which delegates the responsibility onto a Future
+  /// producer's existing responsibility to always mark a future finished (which can
+  /// someday be aided by ARROW-12207).
+  template <typename T = internal::Empty, typename FT = Future<T>,
+            typename FTSync = typename FT::SyncType>
+  static FTSync RunInSerialExecutor(TopLevelTask<T> initial_task) {
+    Future<T> fut = SerialExecutor().Run<T>(std::move(initial_task));
+    return FutureToSync(fut);
+  }
+
+  /// \brief Transform an AsyncGenerator into an Iterator
+  ///
+  /// An event loop will be created and each call to Next will power the event loop with
+  /// the calling thread until the next item is ready to be delivered.
+  ///
+  /// Note: The iterator's destructor will run until the given generator is fully
+  /// exhausted. If you wish to abandon iteration before completion then the correct
+  /// approach is to use a stop token to cause the generator to exhaust early.
+  template <typename T>
+  static Iterator<T> IterateGenerator(
+      internal::FnOnce<Result<std::function<Future<T>()>>(Executor*)> initial_task) {
+    auto serial_executor = std::unique_ptr<SerialExecutor>(new SerialExecutor());
+    auto maybe_generator = std::move(initial_task)(serial_executor.get());
+    if (!maybe_generator.ok()) {
+      return MakeErrorIterator<T>(maybe_generator.status());
+    }
+    auto generator = maybe_generator.MoveValueUnsafe();
+    struct SerialIterator {
+      SerialIterator(std::unique_ptr<SerialExecutor> executor,
+                     std::function<Future<T>()> generator)
+          : executor(std::move(executor)), generator(std::move(generator)) {}
+      ARROW_DISALLOW_COPY_AND_ASSIGN(SerialIterator);
+      ARROW_DEFAULT_MOVE_AND_ASSIGN(SerialIterator);
+      ~SerialIterator() {
+        // A serial iterator must be consumed before it can be destroyed.  Allowing it to
+        // do otherwise would lead to resource leakage.  There will likely be deadlocks at
+        // this spot in the future but these will be the result of other bugs and not the
+        // fact that we are forcing consumption here.
+
+        // If a streaming API needs to support early abandonment then it should be done so
+        // with a cancellation token and not simply discarding the iterator and expecting
+        // the underlying work to clean up correctly.
+        if (executor && !executor->IsFinished()) {
+          while (true) {
+            Result<T> maybe_next = Next();
+            if (!maybe_next.ok() || IsIterationEnd(*maybe_next)) {
+              break;
+            }
+          }
+        }
+      }
+
+      Result<T> Next() {
+        executor->Unpause();
+        // This call may lead to tasks being scheduled in the serial executor
+        Future<T> next_fut = generator();
+        next_fut.AddCallback([this](const Result<T>& res) {
+          // If we're done iterating we should drain the rest of the tasks in the executor
+          if (!res.ok() || IsIterationEnd(*res)) {
+            executor->Finish();
+            return;
+          }
+          // Otherwise we will break out immediately, leaving the remaining tasks for
+          // the next call.
+          executor->Pause();
+        });
+#ifdef ARROW_ENABLE_THREADING
+        // future must run on this thread
+        // Borrow this thread and run tasks until the future is finished
+        executor->RunLoop();
+#else
+        next_fut.Wait();
+#endif
+        if (!next_fut.is_finished()) {
+          // Not clear this is possible since RunLoop wouldn't generally exit
+          // unless we paused/finished which would imply next_fut has been
+          // finished.
+          return Status::Invalid(
+              "Serial executor terminated before next result computed");
+        }
+        // At this point we may still have tasks in the executor, that is ok.
+        // We will run those tasks the next time through.
+        return next_fut.result();
+      }
+
+      std::unique_ptr<SerialExecutor> executor;
+      std::function<Future<T>()> generator;
+    };
+    return Iterator<T>(SerialIterator{std::move(serial_executor), std::move(generator)});
+  }
+
+#ifndef ARROW_ENABLE_THREADING
+  // run a pending task from loop
+  // returns true if any tasks were run in the last go round the loop (i.e. if it
+  // returns false, all executors are waiting)
+  static bool RunTasksOnAllExecutors();
+  static SerialExecutor* GetCurrentExecutor();
+
+  bool IsCurrentExecutor() override;
+
+#endif
+
+ protected:
+  virtual void RunLoop();
+
+  // State uses mutex
+  struct State;
+  std::shared_ptr<State> state_;
+
+  SerialExecutor();
+
+  // We mark the serial executor "finished" when there should be
+  // no more tasks scheduled on it.  It's not strictly needed but
+  // can help catch bugs where we are trying to use the executor
+  // after we are done with it.
+  void Finish();
+  bool IsFinished();
+  // We pause the executor when we are running an async generator
+  // and we have received an item that we can deliver.
+  void Pause();
+  void Unpause();
+
+  template <typename T, typename FTSync = typename Future<T>::SyncType>
+  Future<T> Run(TopLevelTask<T> initial_task) {
+    auto final_fut = std::move(initial_task)(this);
+    final_fut.AddCallback([this](const FTSync&) { Finish(); });
+    RunLoop();
+    return final_fut;
+  }
+
+#ifndef ARROW_ENABLE_THREADING
+  // we have to run tasks from all live executors
+  // during RunLoop if we don't have threading
+  static std::unordered_set<SerialExecutor*> all_executors;
+  // a pointer to the last one called by the loop
+  // so all tasks get spawned equally
+  // on multiple calls to RunTasksOnAllExecutors
+  static SerialExecutor* last_called_executor;
+  // without threading we can't tell which executor called the
+  // current process - so we set it in spawning the task
+  static SerialExecutor* current_executor;
+#endif  // ARROW_ENABLE_THREADING
+};
+
+#ifdef ARROW_ENABLE_THREADING
+
+/// An Executor implementation spawning tasks in FIFO manner on a fixed-size
+/// pool of worker threads.
+///
+/// Note: Any sort of nested parallelism will deadlock this executor.  Blocking waits are
+/// fine but if one task needs to wait for another task it must be expressed as an
+/// asynchronous continuation.
+class ARROW_EXPORT ThreadPool : public Executor {
+ public:
+  // Construct a thread pool with the given number of worker threads
+  static Result<std::shared_ptr<ThreadPool>> Make(int threads);
+
+  // Like Make(), but takes care that the returned ThreadPool is compatible
+  // with destruction late at process exit.
+  static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);
+
+  // Destroy thread pool; the pool will first be shut down
+  ~ThreadPool() override;
+
+  // Return the desired number of worker threads.
+  // The actual number of workers may lag a bit before being adjusted to
+  // match this value.
+  int GetCapacity() override;
+
+  // Return the number of tasks either running or in the queue.
+  int GetNumTasks();
+
+  bool OwnsThisThread() override;
+  // Dynamically change the number of worker threads.
+  //
+  // This function always returns immediately.
+  // If fewer threads are running than this number, new threads are spawned
+  // on-demand when needed for task execution.
+  // If more threads are running than this number, excess threads are reaped
+  // as soon as possible.
+  Status SetCapacity(int threads);
+
+  // Heuristic for the default capacity of a thread pool for CPU-bound tasks.
+  // This is exposed as a static method to help with testing.
+  // The number returned is guaranteed to be greater or equal to one.
+  static int DefaultCapacity();
+
+  // Shutdown the pool.  Once the pool starts shutting down, new tasks
+  // cannot be submitted anymore.
+  // If "wait" is true, shutdown waits for all pending tasks to be finished.
+  // If "wait" is false, workers are stopped as soon as currently executing
+  // tasks are finished.
+  Status Shutdown(bool wait = true);
+
+  // Wait for the thread pool to become idle
+  //
+  // This is useful for sequencing tests
+  void WaitForIdle();
+
+  void KeepAlive(std::shared_ptr<Executor::Resource> resource) override;
+
+  struct State;
+
+ protected:
+  FRIEND_TEST(TestThreadPool, SetCapacity);
+  FRIEND_TEST(TestGlobalThreadPool, Capacity);
+  ARROW_FRIEND_EXPORT friend ThreadPool* GetCpuThreadPool();
+
+  ThreadPool();
+
+  Status SpawnReal(TaskHints hints, FnOnce<void()> task, StopToken,
+                   StopCallback&&) override;
+
+  // Collect finished worker threads, making sure the OS threads have exited
+  void CollectFinishedWorkersUnlocked();
+  // Launch a given number of additional workers
+  void LaunchWorkersUnlocked(int threads);
+  // Get the current actual capacity
+  int GetActualCapacity();
+
+  static std::shared_ptr<ThreadPool> MakeCpuThreadPool();
+
+  std::shared_ptr<State> sp_state_;
+  State* state_;
+  bool shutdown_on_destroy_;
+};
+#else  // ARROW_ENABLE_THREADING
+// an executor implementation which pretends to be a thread pool but runs everything
+// on the main thread using a static queue (shared between all thread pools, otherwise
+// cross-threadpool dependencies will break everything)
+class ARROW_EXPORT ThreadPool : public SerialExecutor {
+ public:
+  ARROW_FRIEND_EXPORT friend ThreadPool* GetCpuThreadPool();
+
+  static Result<std::shared_ptr<ThreadPool>> Make(int threads);
+
+  // Like Make(), but takes care that the returned ThreadPool is compatible
+  // with destruction late at process exit.
+  static Result<std::shared_ptr<ThreadPool>> MakeEternal(int threads);
+
+  // Destroy thread pool; the pool will first be shut down
+  ~ThreadPool() override;
+
+  // Return the desired number of worker threads.
+  // The actual number of workers may lag a bit before being adjusted to
+  // match this value.
+  int GetCapacity() override;
+
+  virtual int GetActualCapacity();
+
+  bool OwnsThisThread() override { return true; }
+
+  // Dynamically change the number of worker threads.
+  // without threading this is equal to the
+  // number of tasks that can be running at once
+  // (inside each other)
+  Status SetCapacity(int threads);
+
+  static int DefaultCapacity() { return 8; }
+
+  // Shutdown the pool.  Once the pool starts shutting down, new tasks
+  // cannot be submitted anymore.
+  // If "wait" is true, shutdown waits for all pending tasks to be finished.
+  // If "wait" is false, workers are stopped as soon as currently executing
+  // tasks are finished.
+  Status Shutdown(bool wait = true);
+
+  // Wait for the thread pool to become idle
+  //
+  // This is useful for sequencing tests
+  void WaitForIdle();
+
+ protected:
+  static std::shared_ptr<ThreadPool> MakeCpuThreadPool();
+  ThreadPool();
+};
+
+#endif  // ARROW_ENABLE_THREADING
+
+// Return the process-global thread pool for CPU-bound tasks.
+ARROW_EXPORT ThreadPool* GetCpuThreadPool();
+
+/// \brief Potentially run an async operation serially (if use_threads is false)
+/// \see RunSerially
+///
+/// If `use_threads` is true, the global CPU executor is used.
+/// If `use_threads` is false, a temporary SerialExecutor is used.
+/// `get_future` is called (from this thread) with the chosen executor and must
+/// return a future that will eventually finish. This function returns once the
+/// future has finished.
+template <typename Fut, typename ValueType = typename Fut::ValueType>
+typename Fut::SyncType RunSynchronously(FnOnce<Fut(Executor*)> get_future,
+                                        bool use_threads) {
+  if (use_threads) {
+    auto fut = std::move(get_future)(GetCpuThreadPool());
+    return FutureToSync(fut);
+  } else {
+    return SerialExecutor::RunInSerialExecutor<ValueType>(std::move(get_future));
+  }
+}
+
+/// \brief Potentially iterate an async generator serially (if use_threads is false)
+///   using a potentially custom Executor
+/// \see IterateGenerator
+///
+/// If `use_threads` is true, the custom executor or, if null,
+///   the global CPU executor will be used.  Each call to
+///   the iterator will simply wait until the next item is available.  Tasks may run in
+///   the background between calls.
+///
+/// If `use_threads` is false, the calling thread only will be used.  Each call to
+///   the iterator will use the calling thread to do enough work to generate one item.
+///   Tasks will be left in a queue until the next call and no work will be done between
+///   calls.
+template <typename T>
+Iterator<T> IterateSynchronously(
+    FnOnce<Result<std::function<Future<T>()>>(Executor*)> get_gen, bool use_threads,
+    Executor* executor) {
+  if (use_threads) {
+    auto used_executor = executor != NULLPTR ? executor : GetCpuThreadPool();
+    auto maybe_gen = std::move(get_gen)(used_executor);
+    if (!maybe_gen.ok()) {
+      return MakeErrorIterator<T>(maybe_gen.status());
+    }
+    return MakeGeneratorIterator(*maybe_gen);
+  } else {
+    return SerialExecutor::IterateGenerator(std::move(get_gen));
+  }
+}
+
+/// \brief Potentially iterate an async generator serially (if use_threads is false)
+///   using the default CPU thread pool
+/// \see IterateGenerator
+///
+/// If `use_threads` is true, the global CPU executor will be used.  Each call to
+///   the iterator will simply wait until the next item is available.  Tasks may run in
+///   the background between calls.
+///
+/// If `use_threads` is false, the calling thread only will be used.  Each call to
+///   the iterator will use the calling thread to do enough work to generate one item.
+///   Tasks will be left in a queue until the next call and no work will be done between
+///   calls.
+template <typename T>
+Iterator<T> IterateSynchronously(
+    FnOnce<Result<std::function<Future<T>()>>(Executor*)> get_gen, bool use_threads) {
+  return IterateSynchronously(std::move(get_gen), use_threads, NULLPTR);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/time.h b/pyarrow/include/arrow/util/time.h
new file mode 100644
index 0000000000000000000000000000000000000000..05d3b85e057ae1a8d77b75b94bd4fb89f681ad73
--- /dev/null
+++ b/pyarrow/include/arrow/util/time.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <cstdlib>
+#include <memory>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+enum DivideOrMultiply {
+  MULTIPLY,
+  DIVIDE,
+};
+
+ARROW_EXPORT
+std::pair<DivideOrMultiply, int64_t> GetTimestampConversion(TimeUnit::type in_unit,
+                                                            TimeUnit::type out_unit);
+
+// Converts a Timestamp value into another Timestamp value.
+//
+// This function takes care of properly transforming from one unit to another.
+//
+// \param[in] in the input type. Must be TimestampType.
+// \param[in] out the output type. Must be TimestampType.
+// \param[in] value the input value.
+//
+// \return The converted value, or an error.
+ARROW_EXPORT Result<int64_t> ConvertTimestampValue(const std::shared_ptr<DataType>& in,
+                                                   const std::shared_ptr<DataType>& out,
+                                                   int64_t value);
+
+template <typename Visitor, typename... Args>
+decltype(std::declval<Visitor>()(std::chrono::seconds{}, std::declval<Args&&>()...))
+VisitDuration(TimeUnit::type unit, Visitor&& visitor, Args&&... args) {
+  switch (unit) {
+    default:
+    case TimeUnit::SECOND:
+      break;
+    case TimeUnit::MILLI:
+      return visitor(std::chrono::milliseconds{}, std::forward<Args>(args)...);
+    case TimeUnit::MICRO:
+      return visitor(std::chrono::microseconds{}, std::forward<Args>(args)...);
+    case TimeUnit::NANO:
+      return visitor(std::chrono::nanoseconds{}, std::forward<Args>(args)...);
+  }
+  return visitor(std::chrono::seconds{}, std::forward<Args>(args)...);
+}
+
+inline std::optional<int64_t> CastSecondsToUnit(TimeUnit::type unit, int64_t seconds) {
+  auto cast_seconds_to_unit = [](auto duration,
+                                 int64_t seconds) -> std::optional<int64_t> {
+    constexpr auto kMultiplier = static_cast<int64_t>(decltype(duration)::period::den);
+    int64_t out;
+    if (ARROW_PREDICT_FALSE(
+            ::arrow::internal::MultiplyWithOverflow(seconds, kMultiplier, &out))) {
+      return {};
+    }
+    return out;
+  };
+  return VisitDuration(unit, cast_seconds_to_unit, seconds);
+}
+
+inline bool CastSecondsToUnit(TimeUnit::type unit, int64_t seconds, int64_t* out) {
+  auto maybe_value = CastSecondsToUnit(unit, seconds);
+  if (ARROW_PREDICT_TRUE(maybe_value.has_value())) {
+    *out = *maybe_value;
+  }
+  return maybe_value.has_value();
+}
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/tracing.h b/pyarrow/include/arrow/util/tracing.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7808256418eef0faaf54a189d11c6896583d68b
--- /dev/null
+++ b/pyarrow/include/arrow/util/tracing.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+namespace tracing {
+
+class ARROW_EXPORT SpanDetails {
+ public:
+  virtual ~SpanDetails() {}
+};
+
+class ARROW_EXPORT Span {
+ public:
+  Span() noexcept;
+  /// True if this span has been started with START_SPAN
+  bool valid() const;
+  /// End the span early
+  void reset();
+  std::unique_ptr<SpanDetails> details;
+};
+
+}  // namespace tracing
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/type_fwd.h b/pyarrow/include/arrow/util/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8934ecbd4c22d3118e6037ddef6f3ed7411ab65
--- /dev/null
+++ b/pyarrow/include/arrow/util/type_fwd.h
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+
+namespace internal {
+struct Empty;
+}  // namespace internal
+
+template <typename T = internal::Empty>
+class WeakFuture;
+class FutureWaiter;
+
+class TimestampParser;
+
+namespace internal {
+
+class Executor;
+class TaskGroup;
+class ThreadPool;
+class CpuInfo;
+
+namespace tracing {
+
+struct Scope;
+
+}  // namespace tracing
+}  // namespace internal
+
+struct Compression {
+  /// \brief Compression algorithm
+  enum type {
+    UNCOMPRESSED,
+    SNAPPY,
+    GZIP,
+    BROTLI,
+    ZSTD,
+    LZ4,
+    LZ4_FRAME,
+    LZO,
+    BZ2,
+    LZ4_HADOOP
+  };
+};
+
+namespace util {
+class AsyncTaskScheduler;
+class Compressor;
+class Decompressor;
+class Codec;
+class Uri;
+}  // namespace util
+
+template <typename T>
+struct Enumerated {
+  T value;
+  int index;
+  bool last;
+
+  friend inline bool operator==(const Enumerated<T>& left, const Enumerated<T>& right) {
+    return left.index == right.index && left.last == right.last &&
+           left.value == right.value;
+  }
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/type_traits.h b/pyarrow/include/arrow/util/type_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c3b388dab29702a2a53dc9f346b5eed6a6cef3d
--- /dev/null
+++ b/pyarrow/include/arrow/util/type_traits.h
@@ -0,0 +1,73 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <type_traits>
+
+namespace arrow {
+namespace internal {
+
+/// \brief Metafunction to allow checking if a type matches any of another set of types
+template <typename...>
+struct IsOneOf : std::false_type {};  /// Base case: nothing has matched
+
+template <typename T, typename U, typename... Args>
+struct IsOneOf<T, U, Args...> {
+  /// Recursive case: T == U or T matches any other types provided (not including U).
+  static constexpr bool value = std::is_same<T, U>::value || IsOneOf<T, Args...>::value;
+};
+
+/// \brief Shorthand for using IsOneOf + std::enable_if
+template <typename T, typename... Args>
+using EnableIfIsOneOf = typename std::enable_if<IsOneOf<T, Args...>::value, T>::type;
+
+/// \brief is_null_pointer from C++17
+template <typename T>
+struct is_null_pointer : std::is_same<std::nullptr_t, typename std::remove_cv<T>::type> {
+};
+
+template <int kNumBytes>
+struct SizedIntImpl;
+
+template <>
+struct SizedIntImpl<1> {
+  using type = int8_t;
+};
+
+template <>
+struct SizedIntImpl<2> {
+  using type = int16_t;
+};
+
+template <>
+struct SizedIntImpl<4> {
+  using type = int32_t;
+};
+
+template <>
+struct SizedIntImpl<8> {
+  using type = int64_t;
+};
+
+// Map a number of bytes to a type
+template <int kNumBytes>
+using SizedInt = typename SizedIntImpl<kNumBytes>::type;
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/ubsan.h b/pyarrow/include/arrow/util/ubsan.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c6a8f419bbc5f658aacc5866e5dffee1b46ea7a
--- /dev/null
+++ b/pyarrow/include/arrow/util/ubsan.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Contains utilities for making UBSan happy.
+
+#pragma once
+
+#include <cstring>
+#include <memory>
+#include <type_traits>
+
+#include "arrow/util/aligned_storage.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace util {
+
+namespace internal {
+
+constexpr uint8_t kNonNullFiller = 0;
+
+}  // namespace internal
+
+/// \brief Returns maybe_null if not null or a non-null pointer to an arbitrary memory
+/// that shouldn't be dereferenced.
+///
+/// Memset/Memcpy are undefined when a nullptr is passed as an argument use this utility
+/// method to wrap locations where this could happen.
+///
+/// Note: Flatbuffers has UBSan warnings if a zero length vector is passed.
+/// https://github.com/google/flatbuffers/pull/5355 is trying to resolve
+/// them.
+template <typename T>
+inline T* MakeNonNull(T* maybe_null = NULLPTR) {
+  if (ARROW_PREDICT_TRUE(maybe_null != NULLPTR)) {
+    return maybe_null;
+  }
+
+  return const_cast<T*>(reinterpret_cast<const T*>(&internal::kNonNullFiller));
+}
+
+template <typename T>
+inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoadAs(
+    const uint8_t* unaligned) {
+  using Type = std::remove_const_t<T>;
+  arrow::internal::AlignedStorage<Type> raw_data;
+  std::memcpy(raw_data.get(), unaligned, sizeof(T));
+  auto data = *raw_data.get();
+  raw_data.destroy();
+  return data;
+}
+
+template <typename T>
+inline std::enable_if_t<std::is_trivially_copyable_v<T>, T> SafeLoad(const T* unaligned) {
+  using Type = std::remove_const_t<T>;
+  arrow::internal::AlignedStorage<Type> raw_data;
+  std::memcpy(raw_data.get(), static_cast<const void*>(unaligned), sizeof(T));
+  auto data = *raw_data.get();
+  raw_data.destroy();
+  return data;
+}
+
+template <typename U, typename T>
+inline std::enable_if_t<std::is_trivially_copyable_v<T> &&
+                            std::is_trivially_copyable_v<U> && sizeof(T) == sizeof(U),
+                        U>
+SafeCopy(T value) {
+  using TypeU = std::remove_const_t<U>;
+  arrow::internal::AlignedStorage<TypeU> raw_data;
+  std::memcpy(raw_data.get(), static_cast<const void*>(&value), sizeof(T));
+  auto data = *raw_data.get();
+  raw_data.destroy();
+  return data;
+}
+
+template <typename T>
+inline std::enable_if_t<std::is_trivially_copyable_v<T>, void> SafeStore(void* unaligned,
+                                                                         T value) {
+  std::memcpy(unaligned, &value, sizeof(T));
+}
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/union_util.h b/pyarrow/include/arrow/util/union_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f30d5a32781924a3c64904a203a03d9d3d48d79
--- /dev/null
+++ b/pyarrow/include/arrow/util/union_util.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include "arrow/array/data.h"
+
+namespace arrow {
+namespace union_util {
+
+/// \brief Compute the number of of logical nulls in a sparse union array
+int64_t LogicalSparseUnionNullCount(const ArraySpan& span);
+
+/// \brief Compute the number of of logical nulls in a dense union array
+int64_t LogicalDenseUnionNullCount(const ArraySpan& span);
+
+}  // namespace union_util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/unreachable.h b/pyarrow/include/arrow/util/unreachable.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2e383e714b3eb8e0a0b6a23b1086913093a5c29
--- /dev/null
+++ b/pyarrow/include/arrow/util/unreachable.h
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/visibility.h"
+
+#include <string_view>
+
+namespace arrow {
+
+[[noreturn]] ARROW_EXPORT void Unreachable(const char* message = "Unreachable");
+
+[[noreturn]] ARROW_EXPORT void Unreachable(std::string_view message);
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/uri.h b/pyarrow/include/arrow/util/uri.h
new file mode 100644
index 0000000000000000000000000000000000000000..74dbe924ff23740fb603c558e87fc54253392030
--- /dev/null
+++ b/pyarrow/include/arrow/util/uri.h
@@ -0,0 +1,119 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow::util {
+
+/// \brief A parsed URI
+class ARROW_EXPORT Uri {
+ public:
+  Uri();
+  ~Uri();
+  Uri(Uri&&);
+  Uri& operator=(Uri&&);
+
+  // XXX Should we use std::string_view instead?  These functions are
+  // not performance-critical.
+
+  /// The URI scheme, such as "http", or the empty string if the URI has no
+  /// explicit scheme.
+  std::string scheme() const;
+
+  /// Convenience function that returns true if the scheme() is "file"
+  bool is_file_scheme() const;
+
+  /// Whether the URI has an explicit host name.  This may return true if
+  /// the URI has an empty host (e.g. "file:///tmp/foo"), while it returns
+  /// false is the URI has not host component at all (e.g. "file:/tmp/foo").
+  bool has_host() const;
+  /// The URI host name, such as "localhost", "127.0.0.1" or "::1", or the empty
+  /// string is the URI does not have a host component.
+  std::string host() const;
+
+  /// The URI port number, as a string such as "80", or the empty string is the URI
+  /// does not have a port number component.
+  std::string port_text() const;
+  /// The URI port parsed as an integer, or -1 if the URI does not have a port
+  /// number component.
+  int32_t port() const;
+
+  /// The username specified in the URI.
+  std::string username() const;
+  /// The password specified in the URI.
+  std::string password() const;
+
+  /// The URI path component.
+  std::string path() const;
+
+  /// The URI query string
+  std::string query_string() const;
+
+  /// The URI query items
+  ///
+  /// Note this API doesn't allow differentiating between an empty value
+  /// and a missing value, such in "a&b=1" vs. "a=&b=1".
+  Result<std::vector<std::pair<std::string, std::string>>> query_items() const;
+
+  /// Get the string representation of this URI.
+  const std::string& ToString() const;
+
+  /// Factory function to parse a URI from its string representation.
+  Status Parse(const std::string& uri_string);
+
+  /// Factory function to parse a URI from its string representation.
+  static Result<Uri> FromString(const std::string& uri_string);
+
+ private:
+  struct Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+/// Percent-encode the input string, for use e.g. as a URI query parameter.
+///
+/// This will escape directory separators, making this function unsuitable
+/// for encoding URI paths directly. See UriFromAbsolutePath() instead.
+ARROW_EXPORT
+std::string UriEscape(std::string_view s);
+
+ARROW_EXPORT
+std::string UriUnescape(std::string_view s);
+
+/// Encode a host for use within a URI, such as "localhost",
+/// "127.0.0.1", or "[::1]".
+ARROW_EXPORT
+std::string UriEncodeHost(std::string_view host);
+
+/// Whether the string is a syntactically valid URI scheme according to RFC 3986.
+ARROW_EXPORT
+bool IsValidUriScheme(std::string_view s);
+
+/// Create a file uri from a given absolute path
+ARROW_EXPORT
+Result<std::string> UriFromAbsolutePath(std::string_view path);
+
+}  // namespace arrow::util
diff --git a/pyarrow/include/arrow/util/utf8.h b/pyarrow/include/arrow/util/utf8.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca93fab5b9f4e1f43d451689f0e75cb5572ce983
--- /dev/null
+++ b/pyarrow/include/arrow/util/utf8.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+#include <string_view>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+namespace util {
+
+// Convert a UTF8 string to a wstring (either UTF16 or UTF32, depending
+// on the wchar_t width).
+ARROW_EXPORT Result<std::wstring> UTF8ToWideString(std::string_view source);
+
+// Similarly, convert a wstring to a UTF8 string.
+ARROW_EXPORT Result<std::string> WideStringToUTF8(const std::wstring& source);
+
+// Convert UTF8 string to a UTF16 string.
+ARROW_EXPORT Result<std::u16string> UTF8StringToUTF16(std::string_view source);
+
+// Convert UTF16 string to a UTF8 string.
+ARROW_EXPORT Result<std::string> UTF16StringToUTF8(std::u16string_view source);
+
+// This function needs to be called before doing UTF8 validation.
+ARROW_EXPORT void InitializeUTF8();
+
+ARROW_EXPORT bool ValidateUTF8(const uint8_t* data, int64_t size);
+
+ARROW_EXPORT bool ValidateUTF8(std::string_view str);
+
+// Skip UTF8 byte order mark, if any.
+ARROW_EXPORT
+Result<const uint8_t*> SkipUTF8BOM(const uint8_t* data, int64_t size);
+
+static constexpr uint32_t kMaxUnicodeCodepoint = 0x110000;
+
+}  // namespace util
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/value_parsing.h b/pyarrow/include/arrow/util/value_parsing.h
new file mode 100644
index 0000000000000000000000000000000000000000..195cdc843ac6951397eb1598b7d0b816d18bd6f4
--- /dev/null
+++ b/pyarrow/include/arrow/util/value_parsing.h
@@ -0,0 +1,965 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This is a private header for string-to-number parsing utilities
+
+#pragma once
+
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
+
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/config.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/time.h"
+#include "arrow/util/visibility.h"
+#include "arrow/vendored/datetime.h"
+#include "arrow/vendored/strptime.h"
+
+namespace arrow {
+
+/// \brief A virtual string to timestamp parser
+class ARROW_EXPORT TimestampParser {
+ public:
+  virtual ~TimestampParser() = default;
+
+  virtual bool operator()(const char* s, size_t length, TimeUnit::type out_unit,
+                          int64_t* out,
+                          bool* out_zone_offset_present = NULLPTR) const = 0;
+
+  virtual const char* kind() const = 0;
+
+  virtual const char* format() const;
+
+  /// \brief Create a TimestampParser that recognizes strptime-like format strings
+  static std::shared_ptr<TimestampParser> MakeStrptime(std::string format);
+
+  /// \brief Create a TimestampParser that recognizes (locale-agnostic) ISO8601
+  /// timestamps
+  static std::shared_ptr<TimestampParser> MakeISO8601();
+};
+
+namespace internal {
+
+/// \brief The entry point for conversion from strings.
+///
+/// Specializations of StringConverter for `ARROW_TYPE` must define:
+/// - A default constructible member type `value_type` which will be yielded on a
+///   successful parse.
+/// - The static member function `Convert`, callable with signature
+///   `(const ARROW_TYPE& t, const char* s, size_t length, value_type* out)`.
+///   `Convert` returns truthy for successful parses and assigns the parsed values to
+///   `*out`. Parameters required for parsing (for example a timestamp's TimeUnit)
+///   are acquired from the type parameter `t`.
+template <typename ARROW_TYPE, typename Enable = void>
+struct StringConverter;
+
+template <typename T>
+struct is_parseable {
+  template <typename U, typename = typename StringConverter<U>::value_type>
+  static std::true_type Test(U*);
+
+  template <typename U>
+  static std::false_type Test(...);
+
+  static constexpr bool value = decltype(Test<T>(NULLPTR))::value;
+};
+
+template <typename T, typename R = void>
+using enable_if_parseable = enable_if_t<is_parseable<T>::value, R>;
+
+template <>
+struct StringConverter<BooleanType> {
+  using value_type = bool;
+
+  bool Convert(const BooleanType&, const char* s, size_t length, value_type* out) {
+    if (length == 1) {
+      // "0" or "1"?
+      if (s[0] == '0') {
+        *out = false;
+        return true;
+      }
+      if (s[0] == '1') {
+        *out = true;
+        return true;
+      }
+      return false;
+    }
+    if (length == 4) {
+      // "true"?
+      *out = true;
+      return ((s[0] == 't' || s[0] == 'T') && (s[1] == 'r' || s[1] == 'R') &&
+              (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'));
+    }
+    if (length == 5) {
+      // "false"?
+      *out = false;
+      return ((s[0] == 'f' || s[0] == 'F') && (s[1] == 'a' || s[1] == 'A') &&
+              (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') &&
+              (s[4] == 'e' || s[4] == 'E'));
+    }
+    return false;
+  }
+};
+
+// Ideas for faster float parsing:
+// - http://rapidjson.org/md_doc_internals.html#ParsingDouble
+// - https://github.com/google/double-conversion [used here]
+// - https://github.com/achan001/dtoa-fast
+
+ARROW_EXPORT
+bool StringToFloat(const char* s, size_t length, char decimal_point, float* out);
+
+ARROW_EXPORT
+bool StringToFloat(const char* s, size_t length, char decimal_point, double* out);
+
+ARROW_EXPORT
+bool StringToFloat(const char* s, size_t length, char decimal_point,
+                   ::arrow::util::Float16* out);
+
+template <>
+struct StringConverter<FloatType> {
+  using value_type = float;
+
+  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
+
+  bool Convert(const FloatType&, const char* s, size_t length, value_type* out) {
+    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
+  }
+
+ private:
+  const char decimal_point;
+};
+
+template <>
+struct StringConverter<DoubleType> {
+  using value_type = double;
+
+  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
+
+  bool Convert(const DoubleType&, const char* s, size_t length, value_type* out) {
+    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
+  }
+
+ private:
+  const char decimal_point;
+};
+
+template <>
+struct StringConverter<HalfFloatType> {
+  using value_type = ::arrow::util::Float16;
+
+  explicit StringConverter(char decimal_point = '.') : decimal_point(decimal_point) {}
+
+  bool Convert(const HalfFloatType&, const char* s, size_t length, value_type* out) {
+    return ARROW_PREDICT_TRUE(StringToFloat(s, length, decimal_point, out));
+  }
+
+ private:
+  const char decimal_point;
+};
+
+// NOTE: HalfFloatType would require a half<->float conversion library
+
+inline uint8_t ParseDecimalDigit(char c) { return static_cast<uint8_t>(c - '0'); }
+
+#define PARSE_UNSIGNED_ITERATION(C_TYPE)          \
+  if (length > 0) {                               \
+    uint8_t digit = ParseDecimalDigit(*s++);      \
+    result = static_cast<C_TYPE>(result * 10U);   \
+    length--;                                     \
+    if (ARROW_PREDICT_FALSE(digit > 9U)) {        \
+      /* Non-digit */                             \
+      return false;                               \
+    }                                             \
+    result = static_cast<C_TYPE>(result + digit); \
+  } else {                                        \
+    break;                                        \
+  }
+
+#define PARSE_UNSIGNED_ITERATION_LAST(C_TYPE)                                     \
+  if (length > 0) {                                                               \
+    if (ARROW_PREDICT_FALSE(result > std::numeric_limits<C_TYPE>::max() / 10U)) { \
+      /* Overflow */                                                              \
+      return false;                                                               \
+    }                                                                             \
+    uint8_t digit = ParseDecimalDigit(*s++);                                      \
+    result = static_cast<C_TYPE>(result * 10U);                                   \
+    C_TYPE new_result = static_cast<C_TYPE>(result + digit);                      \
+    if (ARROW_PREDICT_FALSE(--length > 0)) {                                      \
+      /* Too many digits */                                                       \
+      return false;                                                               \
+    }                                                                             \
+    if (ARROW_PREDICT_FALSE(digit > 9U)) {                                        \
+      /* Non-digit */                                                             \
+      return false;                                                               \
+    }                                                                             \
+    if (ARROW_PREDICT_FALSE(new_result < result)) {                               \
+      /* Overflow */                                                              \
+      return false;                                                               \
+    }                                                                             \
+    result = new_result;                                                          \
+  }
+
+inline bool ParseUnsigned(const char* s, size_t length, uint8_t* out) {
+  uint8_t result = 0;
+
+  do {
+    PARSE_UNSIGNED_ITERATION(uint8_t);
+    PARSE_UNSIGNED_ITERATION(uint8_t);
+    PARSE_UNSIGNED_ITERATION_LAST(uint8_t);
+  } while (false);
+  *out = result;
+  return true;
+}
+
+inline bool ParseUnsigned(const char* s, size_t length, uint16_t* out) {
+  uint16_t result = 0;
+  do {
+    PARSE_UNSIGNED_ITERATION(uint16_t);
+    PARSE_UNSIGNED_ITERATION(uint16_t);
+    PARSE_UNSIGNED_ITERATION(uint16_t);
+    PARSE_UNSIGNED_ITERATION(uint16_t);
+    PARSE_UNSIGNED_ITERATION_LAST(uint16_t);
+  } while (false);
+  *out = result;
+  return true;
+}
+
+inline bool ParseUnsigned(const char* s, size_t length, uint32_t* out) {
+  uint32_t result = 0;
+  do {
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+    PARSE_UNSIGNED_ITERATION(uint32_t);
+
+    PARSE_UNSIGNED_ITERATION_LAST(uint32_t);
+  } while (false);
+  *out = result;
+  return true;
+}
+
+inline bool ParseUnsigned(const char* s, size_t length, uint64_t* out) {
+  uint64_t result = 0;
+  do {
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+    PARSE_UNSIGNED_ITERATION(uint64_t);
+
+    PARSE_UNSIGNED_ITERATION_LAST(uint64_t);
+  } while (false);
+  *out = result;
+  return true;
+}
+
+#undef PARSE_UNSIGNED_ITERATION
+#undef PARSE_UNSIGNED_ITERATION_LAST
+
+template <typename T>
+bool ParseHex(const char* s, size_t length, T* out) {
+  // lets make sure that the length of the string is not too big
+  if (!ARROW_PREDICT_TRUE(sizeof(T) * 2 >= length && length > 0)) {
+    return false;
+  }
+  T result = 0;
+  for (size_t i = 0; i < length; i++) {
+    result = static_cast<T>(result << 4);
+    if (s[i] >= '0' && s[i] <= '9') {
+      result = static_cast<T>(result | (s[i] - '0'));
+    } else if (s[i] >= 'A' && s[i] <= 'F') {
+      result = static_cast<T>(result | (s[i] - 'A' + 10));
+    } else if (s[i] >= 'a' && s[i] <= 'f') {
+      result = static_cast<T>(result | (s[i] - 'a' + 10));
+    } else {
+      /* Non-digit */
+      return false;
+    }
+  }
+  *out = result;
+  return true;
+}
+
+template <class ARROW_TYPE>
+struct StringToUnsignedIntConverterMixin {
+  using value_type = typename ARROW_TYPE::c_type;
+
+  bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
+    if (ARROW_PREDICT_FALSE(length == 0)) {
+      return false;
+    }
+    // If it starts with 0x then its hex
+    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
+      length -= 2;
+      s += 2;
+
+      return ARROW_PREDICT_TRUE(ParseHex(s, length, out));
+    }
+    // Skip leading zeros
+    while (length > 0 && *s == '0') {
+      length--;
+      s++;
+    }
+    return ParseUnsigned(s, length, out);
+  }
+};
+
+template <>
+struct StringConverter<UInt8Type> : public StringToUnsignedIntConverterMixin<UInt8Type> {
+  using StringToUnsignedIntConverterMixin<UInt8Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<UInt16Type>
+    : public StringToUnsignedIntConverterMixin<UInt16Type> {
+  using StringToUnsignedIntConverterMixin<UInt16Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<UInt32Type>
+    : public StringToUnsignedIntConverterMixin<UInt32Type> {
+  using StringToUnsignedIntConverterMixin<UInt32Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<UInt64Type>
+    : public StringToUnsignedIntConverterMixin<UInt64Type> {
+  using StringToUnsignedIntConverterMixin<UInt64Type>::StringToUnsignedIntConverterMixin;
+};
+
+template <class ARROW_TYPE>
+struct StringToSignedIntConverterMixin {
+  using value_type = typename ARROW_TYPE::c_type;
+  using unsigned_type = typename std::make_unsigned<value_type>::type;
+
+  bool Convert(const ARROW_TYPE&, const char* s, size_t length, value_type* out) {
+    static constexpr auto max_positive =
+        static_cast<unsigned_type>(std::numeric_limits<value_type>::max());
+    // Assuming two's complement
+    static constexpr unsigned_type max_negative = max_positive + 1;
+    bool negative = false;
+    unsigned_type unsigned_value = 0;
+
+    if (ARROW_PREDICT_FALSE(length == 0)) {
+      return false;
+    }
+    // If it starts with 0x then its hex
+    if (length > 2 && s[0] == '0' && ((s[1] == 'x') || (s[1] == 'X'))) {
+      length -= 2;
+      s += 2;
+
+      if (!ARROW_PREDICT_TRUE(ParseHex(s, length, &unsigned_value))) {
+        return false;
+      }
+      *out = static_cast<value_type>(unsigned_value);
+      return true;
+    }
+
+    if (*s == '-') {
+      negative = true;
+      s++;
+      if (--length == 0) {
+        return false;
+      }
+    }
+    // Skip leading zeros
+    while (length > 0 && *s == '0') {
+      length--;
+      s++;
+    }
+    if (!ARROW_PREDICT_TRUE(ParseUnsigned(s, length, &unsigned_value))) {
+      return false;
+    }
+    if (negative) {
+      if (ARROW_PREDICT_FALSE(unsigned_value > max_negative)) {
+        return false;
+      }
+      // To avoid both compiler warnings (with unsigned negation)
+      // and undefined behaviour (with signed negation overflow),
+      // use the expanded formula for 2's complement negation.
+      *out = static_cast<value_type>(~unsigned_value + 1);
+    } else {
+      if (ARROW_PREDICT_FALSE(unsigned_value > max_positive)) {
+        return false;
+      }
+      *out = static_cast<value_type>(unsigned_value);
+    }
+    return true;
+  }
+};
+
+template <>
+struct StringConverter<Int8Type> : public StringToSignedIntConverterMixin<Int8Type> {
+  using StringToSignedIntConverterMixin<Int8Type>::StringToSignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<Int16Type> : public StringToSignedIntConverterMixin<Int16Type> {
+  using StringToSignedIntConverterMixin<Int16Type>::StringToSignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<Int32Type> : public StringToSignedIntConverterMixin<Int32Type> {
+  using StringToSignedIntConverterMixin<Int32Type>::StringToSignedIntConverterMixin;
+};
+
+template <>
+struct StringConverter<Int64Type> : public StringToSignedIntConverterMixin<Int64Type> {
+  using StringToSignedIntConverterMixin<Int64Type>::StringToSignedIntConverterMixin;
+};
+
+namespace detail {
+
+// Inline-able ISO-8601 parser
+
+using ts_type = TimestampType::c_type;
+
+template <typename Duration>
+static inline bool ParseHH(const char* s, Duration* out) {
+  uint8_t hours = 0;
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
+  }
+  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours));
+  return true;
+}
+
+template <typename Duration>
+static inline bool ParseHH_MM(const char* s, Duration* out) {
+  uint8_t hours = 0;
+  uint8_t minutes = 0;
+  if (ARROW_PREDICT_FALSE(s[2] != ':')) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
+                                              std::chrono::minutes(minutes));
+  return true;
+}
+
+template <typename Duration>
+static inline bool ParseHHMM(const char* s, Duration* out) {
+  uint8_t hours = 0;
+  uint8_t minutes = 0;
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 2, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
+                                              std::chrono::minutes(minutes));
+  return true;
+}
+
+template <typename Duration>
+static inline bool ParseHH_MM_SS(const char* s, Duration* out) {
+  uint8_t hours = 0;
+  uint8_t minutes = 0;
+  uint8_t seconds = 0;
+  if (ARROW_PREDICT_FALSE(s[2] != ':') || ARROW_PREDICT_FALSE(s[5] != ':')) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 2, &hours))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 3, 2, &minutes))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 6, 2, &seconds))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(hours >= 24)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(minutes >= 60)) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(seconds >= 60)) {
+    return false;
+  }
+  *out = std::chrono::duration_cast<Duration>(std::chrono::hours(hours) +
+                                              std::chrono::minutes(minutes) +
+                                              std::chrono::seconds(seconds));
+  return true;
+}
+
+static inline bool ParseSubSeconds(const char* s, size_t length, TimeUnit::type unit,
+                                   uint32_t* out) {
+  // The decimal point has been peeled off at this point
+
+  // Fail if number of decimal places provided exceeds what the unit can hold.
+  // Calculate how many trailing decimal places are omitted for the unit
+  // e.g. if 4 decimal places are provided and unit is MICRO, 2 are missing
+  size_t omitted = 0;
+  switch (unit) {
+    case TimeUnit::MILLI:
+      if (ARROW_PREDICT_FALSE(length > 3)) {
+        return false;
+      }
+      if (length < 3) {
+        omitted = 3 - length;
+      }
+      break;
+    case TimeUnit::MICRO:
+      if (ARROW_PREDICT_FALSE(length > 6)) {
+        return false;
+      }
+      if (length < 6) {
+        omitted = 6 - length;
+      }
+      break;
+    case TimeUnit::NANO:
+      if (ARROW_PREDICT_FALSE(length > 9)) {
+        return false;
+      }
+      if (length < 9) {
+        omitted = 9 - length;
+      }
+      break;
+    default:
+      return false;
+  }
+
+  if (ARROW_PREDICT_TRUE(omitted == 0)) {
+    return ParseUnsigned(s, length, out);
+  } else {
+    uint32_t subseconds = 0;
+    bool success = ParseUnsigned(s, length, &subseconds);
+    if (ARROW_PREDICT_TRUE(success)) {
+      switch (omitted) {
+        case 1:
+          *out = subseconds * 10;
+          break;
+        case 2:
+          *out = subseconds * 100;
+          break;
+        case 3:
+          *out = subseconds * 1000;
+          break;
+        case 4:
+          *out = subseconds * 10000;
+          break;
+        case 5:
+          *out = subseconds * 100000;
+          break;
+        case 6:
+          *out = subseconds * 1000000;
+          break;
+        case 7:
+          *out = subseconds * 10000000;
+          break;
+        case 8:
+          *out = subseconds * 100000000;
+          break;
+        default:
+          // Impossible case
+          break;
+      }
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
+
+}  // namespace detail
+
+template <typename Duration>
+static inline bool ParseYYYY_MM_DD(const char* s, Duration* since_epoch) {
+  uint16_t year = 0;
+  uint8_t month = 0;
+  uint8_t day = 0;
+  if (ARROW_PREDICT_FALSE(s[4] != '-') || ARROW_PREDICT_FALSE(s[7] != '-')) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 0, 4, &year))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 5, 2, &month))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(!ParseUnsigned(s + 8, 2, &day))) {
+    return false;
+  }
+  arrow_vendored::date::year_month_day ymd{arrow_vendored::date::year{year},
+                                           arrow_vendored::date::month{month},
+                                           arrow_vendored::date::day{day}};
+  if (ARROW_PREDICT_FALSE(!ymd.ok())) return false;
+
+  *since_epoch = std::chrono::duration_cast<Duration>(
+      arrow_vendored::date::sys_days{ymd}.time_since_epoch());
+  return true;
+}
+
+static inline bool ParseTimestampISO8601(const char* s, size_t length,
+                                         TimeUnit::type unit, TimestampType::c_type* out,
+                                         bool* out_zone_offset_present = NULLPTR) {
+  using seconds_type = std::chrono::duration<TimestampType::c_type>;
+
+  // We allow the following zone offset formats:
+  // - (none)
+  // - Z
+  // - [+-]HH(:?MM)?
+  //
+  // We allow the following formats for all units:
+  // - "YYYY-MM-DD"
+  // - "YYYY-MM-DD[ T]hhZ?"
+  // - "YYYY-MM-DD[ T]hh:mmZ?"
+  // - "YYYY-MM-DD[ T]hh:mm:ssZ?"
+  //
+  // We allow the following formats for unit == MILLI, MICRO, or NANO:
+  // - "YYYY-MM-DD[ T]hh:mm:ss.s{1,3}Z?"
+  //
+  // We allow the following formats for unit == MICRO, or NANO:
+  // - "YYYY-MM-DD[ T]hh:mm:ss.s{4,6}Z?"
+  //
+  // We allow the following formats for unit == NANO:
+  // - "YYYY-MM-DD[ T]hh:mm:ss.s{7,9}Z?"
+  //
+  // UTC is always assumed, and the DataType's timezone is ignored.
+  //
+
+  if (ARROW_PREDICT_FALSE(length < 10)) return false;
+
+  seconds_type seconds_since_epoch;
+  if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &seconds_since_epoch))) {
+    return false;
+  }
+
+  if (length == 10) {
+    return util::CastSecondsToUnit(unit, seconds_since_epoch.count(), out);
+  }
+
+  if (ARROW_PREDICT_FALSE(s[10] != ' ') && ARROW_PREDICT_FALSE(s[10] != 'T')) {
+    return false;
+  }
+
+  if (out_zone_offset_present) {
+    *out_zone_offset_present = false;
+  }
+
+  seconds_type zone_offset(0);
+  if (s[length - 1] == 'Z') {
+    --length;
+    if (out_zone_offset_present) *out_zone_offset_present = true;
+  } else if (s[length - 3] == '+' || s[length - 3] == '-') {
+    // [+-]HH
+    length -= 3;
+    if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + length + 1, &zone_offset))) {
+      return false;
+    }
+    if (s[length] == '+') zone_offset *= -1;
+    if (out_zone_offset_present) *out_zone_offset_present = true;
+  } else if (s[length - 5] == '+' || s[length - 5] == '-') {
+    // [+-]HHMM
+    length -= 5;
+    if (ARROW_PREDICT_FALSE(!detail::ParseHHMM(s + length + 1, &zone_offset))) {
+      return false;
+    }
+    if (s[length] == '+') zone_offset *= -1;
+    if (out_zone_offset_present) *out_zone_offset_present = true;
+  } else if ((s[length - 6] == '+' || s[length - 6] == '-') && (s[length - 3] == ':')) {
+    // [+-]HH:MM
+    length -= 6;
+    if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + length + 1, &zone_offset))) {
+      return false;
+    }
+    if (s[length] == '+') zone_offset *= -1;
+    if (out_zone_offset_present) *out_zone_offset_present = true;
+  }
+
+  seconds_type seconds_since_midnight;
+  switch (length) {
+    case 13:  // YYYY-MM-DD[ T]hh
+      if (ARROW_PREDICT_FALSE(!detail::ParseHH(s + 11, &seconds_since_midnight))) {
+        return false;
+      }
+      break;
+    case 16:  // YYYY-MM-DD[ T]hh:mm
+      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s + 11, &seconds_since_midnight))) {
+        return false;
+      }
+      break;
+    case 19:  // YYYY-MM-DD[ T]hh:mm:ss
+    case 21:  // YYYY-MM-DD[ T]hh:mm:ss.s
+    case 22:  // YYYY-MM-DD[ T]hh:mm:ss.ss
+    case 23:  // YYYY-MM-DD[ T]hh:mm:ss.sss
+    case 24:  // YYYY-MM-DD[ T]hh:mm:ss.ssss
+    case 25:  // YYYY-MM-DD[ T]hh:mm:ss.sssss
+    case 26:  // YYYY-MM-DD[ T]hh:mm:ss.ssssss
+    case 27:  // YYYY-MM-DD[ T]hh:mm:ss.sssssss
+    case 28:  // YYYY-MM-DD[ T]hh:mm:ss.ssssssss
+    case 29:  // YYYY-MM-DD[ T]hh:mm:ss.sssssssss
+      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s + 11, &seconds_since_midnight))) {
+        return false;
+      }
+      break;
+    default:
+      return false;
+  }
+
+  // Switch to plain integers to take advantage of the overflow arithmetic ops
+  auto count = (seconds_since_midnight + zone_offset).count();
+
+  if (ARROW_PREDICT_FALSE(::arrow::internal::AddWithOverflow(
+          count, seconds_since_epoch.count(), &count))) {
+    return false;
+  }
+
+  if (length <= 19) {
+    return util::CastSecondsToUnit(unit, count, out);
+  }
+
+  if (ARROW_PREDICT_FALSE(s[19] != '.')) {
+    return false;
+  }
+
+  uint32_t subseconds = 0;
+  if (ARROW_PREDICT_FALSE(
+          !detail::ParseSubSeconds(s + 20, length - 20, unit, &subseconds))) {
+    return false;
+  }
+
+  if (ARROW_PREDICT_FALSE(!util::CastSecondsToUnit(unit, count, out))) {
+    return false;
+  }
+  if (ARROW_PREDICT_FALSE(::arrow::internal::AddWithOverflow(*out, subseconds, out))) {
+    return false;
+  }
+  return true;
+}
+
+#if defined(_WIN32) || defined(ARROW_WITH_MUSL)
+static constexpr bool kStrptimeSupportsZone = false;
+#else
+static constexpr bool kStrptimeSupportsZone = true;
+#endif
+
+/// \brief Returns time since the UNIX epoch in the requested unit
+static inline bool ParseTimestampStrptime(const char* buf, size_t length,
+                                          const char* format, bool ignore_time_in_day,
+                                          bool allow_trailing_chars, TimeUnit::type unit,
+                                          int64_t* out) {
+  // NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
+  // The buffer may not be nul-terminated
+  std::string clean_copy(buf, length);
+  struct tm result;
+  memset(&result, 0, sizeof(struct tm));
+#ifdef _WIN32
+  char* ret = arrow_strptime(clean_copy.c_str(), format, &result);
+#else
+  char* ret = strptime(clean_copy.c_str(), format, &result);
+#endif
+  if (ret == NULLPTR) {
+    return false;
+  }
+  if (!allow_trailing_chars && static_cast<size_t>(ret - clean_copy.c_str()) != length) {
+    return false;
+  }
+  // ignore the time part
+  arrow_vendored::date::sys_seconds secs =
+      arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
+                                     (result.tm_mon + 1) / std::max(result.tm_mday, 1));
+  if (!ignore_time_in_day) {
+    secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
+             std::chrono::seconds(result.tm_sec));
+#if !defined(_WIN32) && !defined(_AIX)
+    secs -= std::chrono::seconds(result.tm_gmtoff);
+#endif
+  }
+  return util::CastSecondsToUnit(unit, secs.time_since_epoch().count(), out);
+}
+
+template <>
+struct StringConverter<TimestampType> {
+  using value_type = int64_t;
+
+  bool Convert(const TimestampType& type, const char* s, size_t length, value_type* out) {
+    return ParseTimestampISO8601(s, length, type.unit(), out);
+  }
+};
+
+template <>
+struct StringConverter<DurationType>
+    : public StringToSignedIntConverterMixin<DurationType> {
+  using StringToSignedIntConverterMixin<DurationType>::StringToSignedIntConverterMixin;
+};
+
+template <typename DATE_TYPE>
+struct StringConverter<DATE_TYPE, enable_if_date<DATE_TYPE>> {
+  using value_type = typename DATE_TYPE::c_type;
+
+  using duration_type =
+      typename std::conditional<std::is_same<DATE_TYPE, Date32Type>::value,
+                                arrow_vendored::date::days,
+                                std::chrono::milliseconds>::type;
+
+  bool Convert(const DATE_TYPE& type, const char* s, size_t length, value_type* out) {
+    if (ARROW_PREDICT_FALSE(length != 10)) {
+      return false;
+    }
+
+    duration_type since_epoch;
+    if (ARROW_PREDICT_FALSE(!ParseYYYY_MM_DD(s, &since_epoch))) {
+      return false;
+    }
+
+    *out = static_cast<value_type>(since_epoch.count());
+    return true;
+  }
+};
+
+template <typename TIME_TYPE>
+struct StringConverter<TIME_TYPE, enable_if_time<TIME_TYPE>> {
+  using value_type = typename TIME_TYPE::c_type;
+
+  // We allow the following formats for all units:
+  // - "hh:mm"
+  // - "hh:mm:ss"
+  //
+  // We allow the following formats for unit == MILLI, MICRO, or NANO:
+  // - "hh:mm:ss.s{1,3}"
+  //
+  // We allow the following formats for unit == MICRO, or NANO:
+  // - "hh:mm:ss.s{4,6}"
+  //
+  // We allow the following formats for unit == NANO:
+  // - "hh:mm:ss.s{7,9}"
+
+  bool Convert(const TIME_TYPE& type, const char* s, size_t length, value_type* out) {
+    const auto unit = type.unit();
+    std::chrono::seconds since_midnight;
+
+    auto get_seconds_since_midnight = [&](value_type* out) -> bool {
+      int64_t long_out;
+      if (ARROW_PREDICT_FALSE(
+              !util::CastSecondsToUnit(unit, since_midnight.count(), &long_out))) {
+        return false;
+      }
+      *out = static_cast<value_type>(long_out);
+      return *out == long_out;
+    };
+
+    if (length == 5) {
+      if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM(s, &since_midnight))) {
+        return false;
+      }
+      return get_seconds_since_midnight(out);
+    }
+
+    if (ARROW_PREDICT_FALSE(length < 8)) {
+      return false;
+    }
+    if (ARROW_PREDICT_FALSE(!detail::ParseHH_MM_SS(s, &since_midnight))) {
+      return false;
+    }
+
+    if (ARROW_PREDICT_FALSE(!get_seconds_since_midnight(out))) {
+      return false;
+    }
+
+    if (length == 8) {
+      return true;
+    }
+
+    if (ARROW_PREDICT_FALSE(s[8] != '.')) {
+      return false;
+    }
+
+    uint32_t subseconds_count = 0;
+    if (ARROW_PREDICT_FALSE(
+            !detail::ParseSubSeconds(s + 9, length - 9, unit, &subseconds_count))) {
+      return false;
+    }
+
+    *out += subseconds_count;
+    return true;
+  }
+};
+
+/// \brief Convenience wrappers around internal::StringConverter.
+template <typename T>
+bool ParseValue(const T& type, const char* s, size_t length,
+                typename StringConverter<T>::value_type* out) {
+  return StringConverter<T>{}.Convert(type, s, length, out);
+}
+
+template <typename T>
+enable_if_parameter_free<T, bool> ParseValue(
+    const char* s, size_t length, typename StringConverter<T>::value_type* out) {
+  static T type;
+  return StringConverter<T>{}.Convert(type, s, length, out);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/vector.h b/pyarrow/include/arrow/util/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..809497b9aedb2fa4cc383f3f313a5c37ebdc2486
--- /dev/null
+++ b/pyarrow/include/arrow/util/vector.h
@@ -0,0 +1,171 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/util/algorithm.h"
+#include "arrow/util/functional.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename T>
+std::vector<T> DeleteVectorElement(const std::vector<T>& values, size_t index) {
+  ARROW_DCHECK(!values.empty());
+  ARROW_DCHECK_LT(index, values.size());
+  std::vector<T> out;
+  out.reserve(values.size() - 1);
+  for (size_t i = 0; i < index; ++i) {
+    out.push_back(values[i]);
+  }
+  for (size_t i = index + 1; i < values.size(); ++i) {
+    out.push_back(values[i]);
+  }
+  return out;
+}
+
+template <typename T>
+std::vector<T> AddVectorElement(const std::vector<T>& values, size_t index,
+                                T new_element) {
+  ARROW_DCHECK_LE(index, values.size());
+  std::vector<T> out;
+  out.reserve(values.size() + 1);
+  for (size_t i = 0; i < index; ++i) {
+    out.push_back(values[i]);
+  }
+  out.emplace_back(std::move(new_element));
+  for (size_t i = index; i < values.size(); ++i) {
+    out.push_back(values[i]);
+  }
+  return out;
+}
+
+template <typename T>
+std::vector<T> ReplaceVectorElement(const std::vector<T>& values, size_t index,
+                                    T new_element) {
+  ARROW_DCHECK_LE(index, values.size());
+  std::vector<T> out;
+  out.reserve(values.size());
+  for (size_t i = 0; i < index; ++i) {
+    out.push_back(values[i]);
+  }
+  out.emplace_back(std::move(new_element));
+  for (size_t i = index + 1; i < values.size(); ++i) {
+    out.push_back(values[i]);
+  }
+  return out;
+}
+
+template <typename T, typename Predicate>
+std::vector<T> FilterVector(std::vector<T> values, Predicate&& predicate) {
+  auto new_end = std::remove_if(values.begin(), values.end(),
+                                [&](const T& value) { return !predicate(value); });
+  values.erase(new_end, values.end());
+  return values;
+}
+
+template <typename Fn, typename From,
+          typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, const std::vector<From>& source) {
+  std::vector<To> out;
+  out.reserve(source.size());
+  std::transform(source.begin(), source.end(), std::back_inserter(out),
+                 std::forward<Fn>(map));
+  return out;
+}
+
+template <typename Fn, typename From,
+          typename To = decltype(std::declval<Fn>()(std::declval<From>()))>
+std::vector<To> MapVector(Fn&& map, std::vector<From>&& source) {
+  std::vector<To> out;
+  out.reserve(source.size());
+  std::transform(std::make_move_iterator(source.begin()),
+                 std::make_move_iterator(source.end()), std::back_inserter(out),
+                 std::forward<Fn>(map));
+  return out;
+}
+
+/// \brief Like MapVector, but where the function can fail.
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+          typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, const std::vector<From>& source) {
+  std::vector<To> out;
+  out.reserve(source.size());
+  ARROW_RETURN_NOT_OK(MaybeTransform(source.begin(), source.end(),
+                                     std::back_inserter(out), std::forward<Fn>(map)));
+  return out;
+}
+
+template <typename Fn, typename From = internal::call_traits::argument_type<0, Fn>,
+          typename To = typename internal::call_traits::return_type<Fn>::ValueType>
+Result<std::vector<To>> MaybeMapVector(Fn&& map, std::vector<From>&& source) {
+  std::vector<To> out;
+  out.reserve(source.size());
+  ARROW_RETURN_NOT_OK(MaybeTransform(std::make_move_iterator(source.begin()),
+                                     std::make_move_iterator(source.end()),
+                                     std::back_inserter(out), std::forward<Fn>(map)));
+  return std::move(out);
+}
+
+template <typename T>
+std::vector<T> FlattenVectors(const std::vector<std::vector<T>>& vecs) {
+  std::size_t sum = 0;
+  for (const auto& vec : vecs) {
+    sum += vec.size();
+  }
+  std::vector<T> out;
+  out.reserve(sum);
+  for (const auto& vec : vecs) {
+    out.insert(out.end(), vec.begin(), vec.end());
+  }
+  return out;
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(std::vector<Result<T>>&& results) {
+  std::vector<T> out;
+  out.reserve(results.size());
+  for (auto&& result : results) {
+    if (!result.ok()) {
+      return result.status();
+    }
+    out.push_back(result.MoveValueUnsafe());
+  }
+  return out;
+}
+
+template <typename T>
+Result<std::vector<T>> UnwrapOrRaise(const std::vector<Result<T>>& results) {
+  std::vector<T> out;
+  out.reserve(results.size());
+  for (const auto& result : results) {
+    if (!result.ok()) {
+      return result.status();
+    }
+    out.push_back(result.ValueUnsafe());
+  }
+  return out;
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/util/visibility.h b/pyarrow/include/arrow/util/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..67988071c9d99ee6e9514f818bd93d1f1ca3df50
--- /dev/null
+++ b/pyarrow/include/arrow/util/visibility.h
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+// Windows
+
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  if defined(__cplusplus) && defined(__GNUC__) && !defined(__clang__)
+// Use C++ attribute syntax where possible to avoid GCC parser bug
+// (https://stackoverflow.com/questions/57993818/gcc-how-to-combine-attribute-dllexport-and-nodiscard-in-a-struct-de)
+#    define ARROW_DLLEXPORT [[gnu::dllexport]]
+#    define ARROW_DLLIMPORT [[gnu::dllimport]]
+#  else
+#    define ARROW_DLLEXPORT __declspec(dllexport)
+#    define ARROW_DLLIMPORT __declspec(dllimport)
+#  endif
+
+// _declspec(dllexport) even when #included by a non-arrow source
+#  define ARROW_FORCE_EXPORT ARROW_DLLEXPORT
+
+#  ifdef ARROW_STATIC
+#    define ARROW_EXPORT
+#    define ARROW_FRIEND_EXPORT
+#    define ARROW_TEMPLATE_EXPORT
+#  elif defined(ARROW_EXPORTING)
+#    define ARROW_EXPORT ARROW_DLLEXPORT
+// For some reason [[gnu::dllexport]] doesn't work well with friend declarations
+#    define ARROW_FRIEND_EXPORT __declspec(dllexport)
+#    define ARROW_TEMPLATE_EXPORT ARROW_DLLEXPORT
+#  else
+#    define ARROW_EXPORT ARROW_DLLIMPORT
+#    define ARROW_FRIEND_EXPORT __declspec(dllimport)
+#    define ARROW_TEMPLATE_EXPORT ARROW_DLLIMPORT
+#  endif
+
+#  define ARROW_NO_EXPORT
+
+#else
+
+// Non-Windows
+
+#  if defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__))
+#    ifndef ARROW_EXPORT
+#      define ARROW_EXPORT [[gnu::visibility("default")]]
+#    endif
+#    ifndef ARROW_NO_EXPORT
+#      define ARROW_NO_EXPORT [[gnu::visibility("hidden")]]
+#    endif
+// The C++ language does not have clear rules for how to export explicit template
+// instantiations, and clang/gcc have differing syntax. See
+// https://github.com/llvm/llvm-project/issues/29464 and
+// https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0537r0.html
+#    if defined(__clang__)
+#      define ARROW_TEMPLATE_EXPORT
+#    else
+#      define ARROW_TEMPLATE_EXPORT ARROW_EXPORT
+#    endif
+#  else
+// Not C++, or not gcc/clang
+#    ifndef ARROW_EXPORT
+#      define ARROW_EXPORT
+#    endif
+#    ifndef ARROW_NO_EXPORT
+#      define ARROW_NO_EXPORT
+#    endif
+#    define ARROW_TEMPLATE_EXPORT
+#  endif
+
+#  define ARROW_FRIEND_EXPORT
+
+// [[gnu::visibility("default")]] even when #included by a non-arrow source
+#  define ARROW_FORCE_EXPORT [[gnu::visibility("default")]]
+
+#endif  // Non-Windows
diff --git a/pyarrow/include/arrow/util/windows_compatibility.h b/pyarrow/include/arrow/util/windows_compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..810a91201f3352d0c99a673d653ed21431c1c721
--- /dev/null
+++ b/pyarrow/include/arrow/util/windows_compatibility.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifdef _WIN32
+
+// Windows defines min and max macros that mess up std::min/max
+#  ifndef NOMINMAX
+#    define NOMINMAX
+#  endif
+
+#  define WIN32_LEAN_AND_MEAN
+
+// Set Windows 7 as a conservative minimum for Apache Arrow
+#  if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x601
+#    undef _WIN32_WINNT
+#  endif
+#  ifndef _WIN32_WINNT
+#    define _WIN32_WINNT 0x601
+#  endif
+
+#  include <winsock2.h>
+
+#  include "arrow/util/windows_fixup.h"
+
+#endif  // _WIN32
diff --git a/pyarrow/include/arrow/util/windows_fixup.h b/pyarrow/include/arrow/util/windows_fixup.h
new file mode 100644
index 0000000000000000000000000000000000000000..42e74f4a7857fc7732dff3f0021b9301d32f51ce
--- /dev/null
+++ b/pyarrow/include/arrow/util/windows_fixup.h
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This header needs to be included multiple times.
+
+#ifdef _WIN32
+
+#  ifdef max
+#    undef max
+#  endif
+#  ifdef min
+#    undef min
+#  endif
+
+// The Windows API defines macros from *File resolving to either
+// *FileA or *FileW.  Need to undo them.
+#  ifdef CopyFile
+#    undef CopyFile
+#  endif
+#  ifdef CreateFile
+#    undef CreateFile
+#  endif
+#  ifdef DeleteFile
+#    undef DeleteFile
+#  endif
+
+// Other annoying Windows macro definitions...
+#  ifdef IN
+#    undef IN
+#  endif
+#  ifdef OUT
+#    undef OUT
+#  endif
+
+// Note that we can't undefine OPTIONAL, because it can be used in other
+// Windows headers...
+
+#endif  // _WIN32
diff --git a/pyarrow/include/arrow/vendored/ProducerConsumerQueue.h b/pyarrow/include/arrow/vendored/ProducerConsumerQueue.h
new file mode 100644
index 0000000000000000000000000000000000000000..0b7cfa1cb166fd7bf06474e27ae6d80a23edb400
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/ProducerConsumerQueue.h
@@ -0,0 +1,217 @@
+// Vendored from git tag v2021.02.15.00
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// @author Bo Hu (bhu@fb.com)
+// @author Jordan DeLong (delong.j@fb.com)
+
+// This file has been modified as part of Apache Arrow to conform to
+// Apache Arrow's coding conventions
+
+#pragma once
+
+#include <atomic>
+#include <cassert>
+#include <cstdlib>
+#include <memory>
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+
+namespace arrow_vendored {
+namespace folly {
+
+// Vendored from folly/Portability.h
+namespace {
+#if defined(__arm__)
+#define FOLLY_ARM 1
+#else
+#define FOLLY_ARM 0
+#endif
+
+#if defined(__s390x__)
+#define FOLLY_S390X 1
+#else
+#define FOLLY_S390X 0
+#endif
+
+constexpr bool kIsArchArm = FOLLY_ARM == 1;
+constexpr bool kIsArchS390X = FOLLY_S390X == 1;
+}  // namespace
+
+// Vendored from folly/lang/Align.h
+namespace {
+
+constexpr std::size_t hardware_destructive_interference_size =
+    (kIsArchArm || kIsArchS390X) ? 64 : 128;
+
+}  // namespace
+
+/*
+ * ProducerConsumerQueue is a one producer and one consumer queue
+ * without locks.
+ */
+template <class T>
+struct ProducerConsumerQueue {
+  typedef T value_type;
+
+  ProducerConsumerQueue(const ProducerConsumerQueue&) = delete;
+  ProducerConsumerQueue& operator=(const ProducerConsumerQueue&) = delete;
+
+  // size must be >= 2.
+  //
+  // Also, note that the number of usable slots in the queue at any
+  // given time is actually (size-1), so if you start with an empty queue,
+  // IsFull() will return true after size-1 insertions.
+  explicit ProducerConsumerQueue(uint32_t size)
+      : size_(size),
+        records_(static_cast<T*>(std::malloc(sizeof(T) * size))),
+        readIndex_(0),
+        writeIndex_(0) {
+    assert(size >= 2);
+    if (!records_) {
+      throw std::bad_alloc();
+    }
+  }
+
+  ~ProducerConsumerQueue() {
+    // We need to destruct anything that may still exist in our queue.
+    // (No real synchronization needed at destructor time: only one
+    // thread can be doing this.)
+    if (!std::is_trivially_destructible<T>::value) {
+      size_t readIndex = readIndex_;
+      size_t endIndex = writeIndex_;
+      while (readIndex != endIndex) {
+        records_[readIndex].~T();
+        if (++readIndex == size_) {
+          readIndex = 0;
+        }
+      }
+    }
+
+    std::free(records_);
+  }
+
+  template <class... Args>
+  bool Write(Args&&... recordArgs) {
+    auto const currentWrite = writeIndex_.load(std::memory_order_relaxed);
+    auto nextRecord = currentWrite + 1;
+    if (nextRecord == size_) {
+      nextRecord = 0;
+    }
+    if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+      new (&records_[currentWrite]) T(std::forward<Args>(recordArgs)...);
+      writeIndex_.store(nextRecord, std::memory_order_release);
+      return true;
+    }
+
+    // queue is full
+    return false;
+  }
+
+  // move the value at the front of the queue to given variable
+  bool Read(T& record) {
+    auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+    if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+      // queue is empty
+      return false;
+    }
+
+    auto nextRecord = currentRead + 1;
+    if (nextRecord == size_) {
+      nextRecord = 0;
+    }
+    record = std::move(records_[currentRead]);
+    records_[currentRead].~T();
+    readIndex_.store(nextRecord, std::memory_order_release);
+    return true;
+  }
+
+  // pointer to the value at the front of the queue (for use in-place) or
+  // nullptr if empty.
+  T* FrontPtr() {
+    auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+    if (currentRead == writeIndex_.load(std::memory_order_acquire)) {
+      // queue is empty
+      return nullptr;
+    }
+    return &records_[currentRead];
+  }
+
+  // queue must not be empty
+  void PopFront() {
+    auto const currentRead = readIndex_.load(std::memory_order_relaxed);
+    assert(currentRead != writeIndex_.load(std::memory_order_acquire));
+
+    auto nextRecord = currentRead + 1;
+    if (nextRecord == size_) {
+      nextRecord = 0;
+    }
+    records_[currentRead].~T();
+    readIndex_.store(nextRecord, std::memory_order_release);
+  }
+
+  bool IsEmpty() const {
+    return readIndex_.load(std::memory_order_acquire) ==
+           writeIndex_.load(std::memory_order_acquire);
+  }
+
+  bool IsFull() const {
+    auto nextRecord = writeIndex_.load(std::memory_order_acquire) + 1;
+    if (nextRecord == size_) {
+      nextRecord = 0;
+    }
+    if (nextRecord != readIndex_.load(std::memory_order_acquire)) {
+      return false;
+    }
+    // queue is full
+    return true;
+  }
+
+  // * If called by consumer, then true size may be more (because producer may
+  //   be adding items concurrently).
+  // * If called by producer, then true size may be less (because consumer may
+  //   be removing items concurrently).
+  // * It is undefined to call this from any other thread.
+  size_t SizeGuess() const {
+    int ret = writeIndex_.load(std::memory_order_acquire) -
+              readIndex_.load(std::memory_order_acquire);
+    if (ret < 0) {
+      ret += size_;
+    }
+    return ret;
+  }
+
+  // maximum number of items in the queue.
+  size_t capacity() const { return size_ - 1; }
+
+ private:
+  using AtomicIndex = std::atomic<unsigned int>;
+
+  char pad0_[hardware_destructive_interference_size];
+  const uint32_t size_;
+  T* const records_;
+
+  AtomicIndex readIndex_;
+  char pad1_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+  AtomicIndex writeIndex_;
+
+  char pad2_[hardware_destructive_interference_size - sizeof(AtomicIndex)];
+};
+
+}  // namespace folly
+}  // namespace arrow_vendored
diff --git a/pyarrow/include/arrow/vendored/datetime.h b/pyarrow/include/arrow/vendored/datetime.h
new file mode 100644
index 0000000000000000000000000000000000000000..aea31ebe77f9e81b6c44e35bc81a735c041cac23
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/datetime.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/vendored/datetime/visibility.h"  // IWYU pragma: export
+#include "arrow/vendored/datetime/date.h"        // IWYU pragma: export
+#include "arrow/vendored/datetime/tz.h"          // IWYU pragma: export
+
+// Can be defined by date.h.
+#ifdef NOEXCEPT
+#  undef NOEXCEPT
+#endif
diff --git a/pyarrow/include/arrow/vendored/datetime/date.h b/pyarrow/include/arrow/vendored/datetime/date.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b06182a6daa3b4e9d20ade9fc9fbd5a9a50a65f
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/datetime/date.h
@@ -0,0 +1,8245 @@
+#ifndef DATE_H
+#define DATE_H
+
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016, 2017 Howard Hinnant
+// Copyright (c) 2016 Adrian Colomitchi
+// Copyright (c) 2017 Florian Dang
+// Copyright (c) 2017 Paul Thompson
+// Copyright (c) 2018, 2019 Tomasz Kamiński
+// Copyright (c) 2019 Jiangang Zhuang
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies.  When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+#ifndef HAS_STRING_VIEW
+#  if __cplusplus >= 201703 || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#    define HAS_STRING_VIEW 1
+#  else
+#    define HAS_STRING_VIEW 0
+#  endif
+#endif  // HAS_STRING_VIEW
+
+#include <cassert>
+#include <algorithm>
+#include <cctype>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <ctime>
+#include <ios>
+#include <istream>
+#include <iterator>
+#include <limits>
+#include <locale>
+#include <memory>
+#include <ostream>
+#include <ratio>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#if HAS_STRING_VIEW
+# include <string_view>
+#endif
+#include <utility>
+#include <type_traits>
+
+#ifdef __GNUC__
+# pragma GCC diagnostic push
+# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 7)
+#  pragma GCC diagnostic ignored "-Wpedantic"
+# endif
+# if __GNUC__ < 5
+   // GCC 4.9 Bug 61489 Wrong warning with -Wmissing-field-initializers
+#  pragma GCC diagnostic ignored "-Wmissing-field-initializers"
+# endif
+#endif
+
+#ifdef _MSC_VER
+#   pragma warning(push)
+// warning C4127: conditional expression is constant
+#   pragma warning(disable : 4127)
+#endif
+
+namespace arrow_vendored::date
+{
+
+//---------------+
+// Configuration |
+//---------------+
+
+#ifndef ONLY_C_LOCALE
+#  define ONLY_C_LOCALE 0
+#endif
+
+#if defined(_MSC_VER) && (!defined(__clang__) || (_MSC_VER < 1910))
+// MSVC
+#  ifndef _SILENCE_CXX17_UNCAUGHT_EXCEPTION_DEPRECATION_WARNING
+#    define _SILENCE_CXX17_UNCAUGHT_EXCEPTION_DEPRECATION_WARNING
+#  endif
+#  if _MSC_VER < 1910
+//   before VS2017
+#    define CONSTDATA const
+#    define CONSTCD11
+#    define CONSTCD14
+#    define NOEXCEPT _NOEXCEPT
+#  else
+//   VS2017 and later
+#    define CONSTDATA constexpr const
+#    define CONSTCD11 constexpr
+#    define CONSTCD14 constexpr
+#    define NOEXCEPT noexcept
+#  endif
+
+#elif defined(__SUNPRO_CC) && __SUNPRO_CC <= 0x5150
+// Oracle Developer Studio 12.6 and earlier
+#  define CONSTDATA constexpr const
+#  define CONSTCD11 constexpr
+#  define CONSTCD14
+#  define NOEXCEPT noexcept
+
+#elif __cplusplus >= 201402
+// C++14
+#  define CONSTDATA constexpr const
+#  define CONSTCD11 constexpr
+#  define CONSTCD14 constexpr
+#  define NOEXCEPT noexcept
+#else
+// C++11
+#  define CONSTDATA constexpr const
+#  define CONSTCD11 constexpr
+#  define CONSTCD14
+#  define NOEXCEPT noexcept
+#endif
+
+#ifndef HAS_UNCAUGHT_EXCEPTIONS
+#  if __cplusplus >= 201703 || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#    define HAS_UNCAUGHT_EXCEPTIONS 1
+#  else
+#    define HAS_UNCAUGHT_EXCEPTIONS 0
+#  endif
+#endif  // HAS_UNCAUGHT_EXCEPTIONS
+
+#ifndef HAS_VOID_T
+#  if __cplusplus >= 201703 || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L)
+#    define HAS_VOID_T 1
+#  else
+#    define HAS_VOID_T 0
+#  endif
+#endif  // HAS_VOID_T
+
+// Protect from Oracle sun macro
+#ifdef sun
+#  undef sun
+#endif
+
+// Work around for a NVCC compiler bug which causes it to fail
+// to compile std::ratio_{multiply,divide} when used directly
+// in the std::chrono::duration template instantiations below
+namespace detail {
+template <typename R1, typename R2>
+using ratio_multiply = decltype(std::ratio_multiply<R1, R2>{});
+
+template <typename R1, typename R2>
+using ratio_divide = decltype(std::ratio_divide<R1, R2>{});
+}  // namespace detail
+
+//-----------+
+// Interface |
+//-----------+
+
+// durations
+
+using days = std::chrono::duration
+    <int, detail::ratio_multiply<std::ratio<24>, std::chrono::hours::period>>;
+
+using weeks = std::chrono::duration
+    <int, detail::ratio_multiply<std::ratio<7>, days::period>>;
+
+using years = std::chrono::duration
+    <int, detail::ratio_multiply<std::ratio<146097, 400>, days::period>>;
+
+using months = std::chrono::duration
+    <int, detail::ratio_divide<years::period, std::ratio<12>>>;
+
+// time_point
+
+template <class Duration>
+    using sys_time = std::chrono::time_point<std::chrono::system_clock, Duration>;
+
+using sys_days    = sys_time<days>;
+using sys_seconds = sys_time<std::chrono::seconds>;
+
+struct local_t {};
+
+template <class Duration>
+    using local_time = std::chrono::time_point<local_t, Duration>;
+
+using local_seconds = local_time<std::chrono::seconds>;
+using local_days    = local_time<days>;
+
+// types
+
+struct last_spec
+{
+    explicit last_spec() = default;
+};
+
+class day;
+class month;
+class year;
+
+class weekday;
+class weekday_indexed;
+class weekday_last;
+
+class month_day;
+class month_day_last;
+class month_weekday;
+class month_weekday_last;
+
+class year_month;
+
+class year_month_day;
+class year_month_day_last;
+class year_month_weekday;
+class year_month_weekday_last;
+
+// date composition operators
+
+CONSTCD11 year_month operator/(const year& y, const month& m) NOEXCEPT;
+CONSTCD11 year_month operator/(const year& y, int          m) NOEXCEPT;
+
+CONSTCD11 month_day operator/(const day& d, const month& m) NOEXCEPT;
+CONSTCD11 month_day operator/(const day& d, int          m) NOEXCEPT;
+CONSTCD11 month_day operator/(const month& m, const day& d) NOEXCEPT;
+CONSTCD11 month_day operator/(const month& m, int        d) NOEXCEPT;
+CONSTCD11 month_day operator/(int          m, const day& d) NOEXCEPT;
+
+CONSTCD11 month_day_last operator/(const month& m, last_spec) NOEXCEPT;
+CONSTCD11 month_day_last operator/(int          m, last_spec) NOEXCEPT;
+CONSTCD11 month_day_last operator/(last_spec, const month& m) NOEXCEPT;
+CONSTCD11 month_day_last operator/(last_spec, int          m) NOEXCEPT;
+
+CONSTCD11 month_weekday operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT;
+CONSTCD11 month_weekday operator/(int          m, const weekday_indexed& wdi) NOEXCEPT;
+CONSTCD11 month_weekday operator/(const weekday_indexed& wdi, const month& m) NOEXCEPT;
+CONSTCD11 month_weekday operator/(const weekday_indexed& wdi, int          m) NOEXCEPT;
+
+CONSTCD11 month_weekday_last operator/(const month& m, const weekday_last& wdl) NOEXCEPT;
+CONSTCD11 month_weekday_last operator/(int          m, const weekday_last& wdl) NOEXCEPT;
+CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, const month& m) NOEXCEPT;
+CONSTCD11 month_weekday_last operator/(const weekday_last& wdl, int          m) NOEXCEPT;
+
+CONSTCD11 year_month_day operator/(const year_month& ym, const day& d) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const year_month& ym, int        d) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const year& y, const month_day& md) NOEXCEPT;
+CONSTCD11 year_month_day operator/(int         y, const month_day& md) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const month_day& md, const year& y) NOEXCEPT;
+CONSTCD11 year_month_day operator/(const month_day& md, int         y) NOEXCEPT;
+
+CONSTCD11
+    year_month_day_last operator/(const year_month& ym,   last_spec) NOEXCEPT;
+CONSTCD11
+    year_month_day_last operator/(const year& y, const month_day_last& mdl) NOEXCEPT;
+CONSTCD11
+    year_month_day_last operator/(int         y, const month_day_last& mdl) NOEXCEPT;
+CONSTCD11
+    year_month_day_last operator/(const month_day_last& mdl, const year& y) NOEXCEPT;
+CONSTCD11
+    year_month_day_last operator/(const month_day_last& mdl, int         y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const year&        y, const month_weekday&   mwd) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(int                y, const month_weekday&   mwd) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const month_weekday& mwd, const year&          y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator/(const month_weekday& mwd, int                  y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(int         y, const month_weekday_last& mwdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, int         y) NOEXCEPT;
+
+// Detailed interface
+
+// day
+
+class day
+{
+    unsigned char d_;
+
+public:
+    day() = default;
+    explicit CONSTCD11 day(unsigned d) NOEXCEPT;
+
+    CONSTCD14 day& operator++()    NOEXCEPT;
+    CONSTCD14 day  operator++(int) NOEXCEPT;
+    CONSTCD14 day& operator--()    NOEXCEPT;
+    CONSTCD14 day  operator--(int) NOEXCEPT;
+
+    CONSTCD14 day& operator+=(const days& d) NOEXCEPT;
+    CONSTCD14 day& operator-=(const days& d) NOEXCEPT;
+
+    CONSTCD11 explicit operator unsigned() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator< (const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator> (const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const day& x, const day& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const day& x, const day& y) NOEXCEPT;
+
+CONSTCD11 day  operator+(const day&  x, const days& y) NOEXCEPT;
+CONSTCD11 day  operator+(const days& x, const day&  y) NOEXCEPT;
+CONSTCD11 day  operator-(const day&  x, const days& y) NOEXCEPT;
+CONSTCD11 days operator-(const day&  x, const day&  y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const day& d);
+
+// month
+
+class month
+{
+    unsigned char m_;
+
+public:
+    month() = default;
+    explicit CONSTCD11 month(unsigned m) NOEXCEPT;
+
+    CONSTCD14 month& operator++()    NOEXCEPT;
+    CONSTCD14 month  operator++(int) NOEXCEPT;
+    CONSTCD14 month& operator--()    NOEXCEPT;
+    CONSTCD14 month  operator--(int) NOEXCEPT;
+
+    CONSTCD14 month& operator+=(const months& m) NOEXCEPT;
+    CONSTCD14 month& operator-=(const months& m) NOEXCEPT;
+
+    CONSTCD11 explicit operator unsigned() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator< (const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator> (const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const month& x, const month& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const month& x, const month& y) NOEXCEPT;
+
+CONSTCD14 month  operator+(const month&  x, const months& y) NOEXCEPT;
+CONSTCD14 month  operator+(const months& x,  const month& y) NOEXCEPT;
+CONSTCD14 month  operator-(const month&  x, const months& y) NOEXCEPT;
+CONSTCD14 months operator-(const month&  x,  const month& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month& m);
+
+// year
+
+class year
+{
+    short y_;
+
+public:
+    year() = default;
+    explicit CONSTCD11 year(int y) NOEXCEPT;
+
+    CONSTCD14 year& operator++()    NOEXCEPT;
+    CONSTCD14 year  operator++(int) NOEXCEPT;
+    CONSTCD14 year& operator--()    NOEXCEPT;
+    CONSTCD14 year  operator--(int) NOEXCEPT;
+
+    CONSTCD14 year& operator+=(const years& y) NOEXCEPT;
+    CONSTCD14 year& operator-=(const years& y) NOEXCEPT;
+
+    CONSTCD11 year operator-() const NOEXCEPT;
+    CONSTCD11 year operator+() const NOEXCEPT;
+
+    CONSTCD11 bool is_leap() const NOEXCEPT;
+
+    CONSTCD11 explicit operator int() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+
+    static CONSTCD11 year min() NOEXCEPT { return year{-32767}; }
+    static CONSTCD11 year max() NOEXCEPT { return year{32767}; }
+};
+
+CONSTCD11 bool operator==(const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator< (const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator> (const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const year& x, const year& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const year& x, const year& y) NOEXCEPT;
+
+CONSTCD11 year  operator+(const year&  x, const years& y) NOEXCEPT;
+CONSTCD11 year  operator+(const years& x, const year&  y) NOEXCEPT;
+CONSTCD11 year  operator-(const year&  x, const years& y) NOEXCEPT;
+CONSTCD11 years operator-(const year&  x, const year&  y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year& y);
+
+// weekday
+
+class weekday
+{
+    unsigned char wd_;
+public:
+    weekday() = default;
+    explicit CONSTCD11 weekday(unsigned wd) NOEXCEPT;
+    CONSTCD14 weekday(const sys_days& dp) NOEXCEPT;
+    CONSTCD14 explicit weekday(const local_days& dp) NOEXCEPT;
+
+    CONSTCD14 weekday& operator++()    NOEXCEPT;
+    CONSTCD14 weekday  operator++(int) NOEXCEPT;
+    CONSTCD14 weekday& operator--()    NOEXCEPT;
+    CONSTCD14 weekday  operator--(int) NOEXCEPT;
+
+    CONSTCD14 weekday& operator+=(const days& d) NOEXCEPT;
+    CONSTCD14 weekday& operator-=(const days& d) NOEXCEPT;
+
+    CONSTCD11 bool ok() const NOEXCEPT;
+
+    CONSTCD11 unsigned c_encoding() const NOEXCEPT;
+    CONSTCD11 unsigned iso_encoding() const NOEXCEPT;
+
+    CONSTCD11 weekday_indexed operator[](unsigned index) const NOEXCEPT;
+    CONSTCD11 weekday_last    operator[](last_spec)      const NOEXCEPT;
+
+private:
+    static CONSTCD14 unsigned char weekday_from_days(int z) NOEXCEPT;
+
+    friend CONSTCD11 bool operator==(const weekday& x, const weekday& y) NOEXCEPT;
+    friend CONSTCD14 days operator-(const weekday& x, const weekday& y) NOEXCEPT;
+    friend CONSTCD14 weekday operator+(const weekday& x, const days& y) NOEXCEPT;
+    template<class CharT, class Traits>
+        friend std::basic_ostream<CharT, Traits>&
+            operator<<(std::basic_ostream<CharT, Traits>& os, const weekday& wd);
+    friend class weekday_indexed;
+};
+
+CONSTCD11 bool operator==(const weekday& x, const weekday& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const weekday& x, const weekday& y) NOEXCEPT;
+
+CONSTCD14 weekday operator+(const weekday& x, const days&    y) NOEXCEPT;
+CONSTCD14 weekday operator+(const days&    x, const weekday& y) NOEXCEPT;
+CONSTCD14 weekday operator-(const weekday& x, const days&    y) NOEXCEPT;
+CONSTCD14 days    operator-(const weekday& x, const weekday& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday& wd);
+
+// weekday_indexed
+
+class weekday_indexed
+{
+    unsigned char wd_    : 4;
+    unsigned char index_ : 4;
+
+public:
+    weekday_indexed() = default;
+    CONSTCD11 weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT;
+
+    CONSTCD11 date::weekday weekday() const NOEXCEPT;
+    CONSTCD11 unsigned index() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_indexed& wdi);
+
+// weekday_last
+
+class weekday_last
+{
+    date::weekday wd_;
+
+public:
+    explicit CONSTCD11 weekday_last(const date::weekday& wd) NOEXCEPT;
+
+    CONSTCD11 date::weekday weekday() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_last& wdl);
+
+namespace detail
+{
+
+struct unspecified_month_disambiguator {};
+
+}  // namespace detail
+
+// year_month
+
+class year_month
+{
+    date::year  y_;
+    date::month m_;
+
+public:
+    year_month() = default;
+    CONSTCD11 year_month(const date::year& y, const date::month& m) NOEXCEPT;
+
+    CONSTCD11 date::year  year()  const NOEXCEPT;
+    CONSTCD11 date::month month() const NOEXCEPT;
+
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month& operator+=(const months& dm) NOEXCEPT;
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month& operator-=(const months& dm) NOEXCEPT;
+    CONSTCD14 year_month& operator+=(const years& dy) NOEXCEPT;
+    CONSTCD14 year_month& operator-=(const years& dy) NOEXCEPT;
+
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator< (const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator> (const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const year_month& x, const year_month& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month operator+(const year_month& ym, const months& dm) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month operator+(const months& dm, const year_month& ym) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month operator-(const year_month& ym, const months& dm) NOEXCEPT;
+
+CONSTCD11 months operator-(const year_month& x, const year_month& y) NOEXCEPT;
+CONSTCD11 year_month operator+(const year_month& ym, const years& dy) NOEXCEPT;
+CONSTCD11 year_month operator+(const years& dy, const year_month& ym) NOEXCEPT;
+CONSTCD11 year_month operator-(const year_month& ym, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month& ym);
+
+// month_day
+
+class month_day
+{
+    date::month m_;
+    date::day   d_;
+
+public:
+    month_day() = default;
+    CONSTCD11 month_day(const date::month& m, const date::day& d) NOEXCEPT;
+
+    CONSTCD11 date::month month() const NOEXCEPT;
+    CONSTCD11 date::day   day() const NOEXCEPT;
+
+    CONSTCD14 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator< (const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator> (const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const month_day& x, const month_day& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const month_day& x, const month_day& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day& md);
+
+// month_day_last
+
+class month_day_last
+{
+    date::month m_;
+
+public:
+    CONSTCD11 explicit month_day_last(const date::month& m) NOEXCEPT;
+
+    CONSTCD11 date::month month() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator< (const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator> (const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day_last& mdl);
+
+// month_weekday
+
+class month_weekday
+{
+    date::month           m_;
+    date::weekday_indexed wdi_;
+public:
+    CONSTCD11 month_weekday(const date::month& m,
+                            const date::weekday_indexed& wdi) NOEXCEPT;
+
+    CONSTCD11 date::month           month()           const NOEXCEPT;
+    CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT;
+
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday& mwd);
+
+// month_weekday_last
+
+class month_weekday_last
+{
+    date::month        m_;
+    date::weekday_last wdl_;
+
+public:
+    CONSTCD11 month_weekday_last(const date::month& m,
+                                 const date::weekday_last& wd) NOEXCEPT;
+
+    CONSTCD11 date::month        month()        const NOEXCEPT;
+    CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT;
+
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11
+    bool operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT;
+CONSTCD11
+    bool operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday_last& mwdl);
+
+// class year_month_day
+
+class year_month_day
+{
+    date::year  y_;
+    date::month m_;
+    date::day   d_;
+
+public:
+    year_month_day() = default;
+    CONSTCD11 year_month_day(const date::year& y, const date::month& m,
+                             const date::day& d) NOEXCEPT;
+    CONSTCD14 year_month_day(const year_month_day_last& ymdl) NOEXCEPT;
+
+    CONSTCD14 year_month_day(sys_days dp) NOEXCEPT;
+    CONSTCD14 explicit year_month_day(local_days dp) NOEXCEPT;
+
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_day& operator+=(const months& m) NOEXCEPT;
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_day& operator-=(const months& m) NOEXCEPT;
+    CONSTCD14 year_month_day& operator+=(const years& y)  NOEXCEPT;
+    CONSTCD14 year_month_day& operator-=(const years& y)  NOEXCEPT;
+
+    CONSTCD11 date::year  year()  const NOEXCEPT;
+    CONSTCD11 date::month month() const NOEXCEPT;
+    CONSTCD11 date::day   day()   const NOEXCEPT;
+
+    CONSTCD14 operator sys_days() const NOEXCEPT;
+    CONSTCD14 explicit operator local_days() const NOEXCEPT;
+    CONSTCD14 bool ok() const NOEXCEPT;
+
+private:
+    static CONSTCD14 year_month_day from_days(days dp) NOEXCEPT;
+    CONSTCD14 days to_days() const NOEXCEPT;
+};
+
+CONSTCD11 bool operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator< (const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator> (const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+CONSTCD11 bool operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month_day operator+(const year_month_day& ymd, const months& dm) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month_day operator+(const months& dm, const year_month_day& ymd) NOEXCEPT;
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14 year_month_day operator-(const year_month_day& ymd, const months& dm) NOEXCEPT;
+CONSTCD11 year_month_day operator+(const year_month_day& ymd, const years& dy)  NOEXCEPT;
+CONSTCD11 year_month_day operator+(const years& dy, const year_month_day& ymd)  NOEXCEPT;
+CONSTCD11 year_month_day operator-(const year_month_day& ymd, const years& dy)  NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day& ymd);
+
+// year_month_day_last
+
+class year_month_day_last
+{
+    date::year           y_;
+    date::month_day_last mdl_;
+
+public:
+    CONSTCD11 year_month_day_last(const date::year& y,
+                                  const date::month_day_last& mdl) NOEXCEPT;
+
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_day_last& operator+=(const months& m) NOEXCEPT;
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_day_last& operator-=(const months& m) NOEXCEPT;
+    CONSTCD14 year_month_day_last& operator+=(const years& y)  NOEXCEPT;
+    CONSTCD14 year_month_day_last& operator-=(const years& y)  NOEXCEPT;
+
+    CONSTCD11 date::year           year()           const NOEXCEPT;
+    CONSTCD11 date::month          month()          const NOEXCEPT;
+    CONSTCD11 date::month_day_last month_day_last() const NOEXCEPT;
+    CONSTCD14 date::day            day()            const NOEXCEPT;
+
+    CONSTCD14 operator sys_days() const NOEXCEPT;
+    CONSTCD14 explicit operator local_days() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+};
+
+CONSTCD11
+    bool operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+    bool operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+    bool operator< (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+    bool operator> (const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+    bool operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+CONSTCD11
+    bool operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_day_last
+operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT;
+
+CONSTCD11
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT;
+
+CONSTCD11
+year_month_day_last
+operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT;
+
+CONSTCD11
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day_last& ymdl);
+
+// year_month_weekday
+
+class year_month_weekday
+{
+    date::year            y_;
+    date::month           m_;
+    date::weekday_indexed wdi_;
+
+public:
+    year_month_weekday() = default;
+    CONSTCD11 year_month_weekday(const date::year& y, const date::month& m,
+                                   const date::weekday_indexed& wdi) NOEXCEPT;
+    CONSTCD14 year_month_weekday(const sys_days& dp) NOEXCEPT;
+    CONSTCD14 explicit year_month_weekday(const local_days& dp) NOEXCEPT;
+
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_weekday& operator+=(const months& m) NOEXCEPT;
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_weekday& operator-=(const months& m) NOEXCEPT;
+    CONSTCD14 year_month_weekday& operator+=(const years& y)  NOEXCEPT;
+    CONSTCD14 year_month_weekday& operator-=(const years& y)  NOEXCEPT;
+
+    CONSTCD11 date::year year() const NOEXCEPT;
+    CONSTCD11 date::month month() const NOEXCEPT;
+    CONSTCD11 date::weekday weekday() const NOEXCEPT;
+    CONSTCD11 unsigned index() const NOEXCEPT;
+    CONSTCD11 date::weekday_indexed weekday_indexed() const NOEXCEPT;
+
+    CONSTCD14 operator sys_days() const NOEXCEPT;
+    CONSTCD14 explicit operator local_days() const NOEXCEPT;
+    CONSTCD14 bool ok() const NOEXCEPT;
+
+private:
+    static CONSTCD14 year_month_weekday from_days(days dp) NOEXCEPT;
+    CONSTCD14 days to_days() const NOEXCEPT;
+};
+
+CONSTCD11
+    bool operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT;
+CONSTCD11
+    bool operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday
+operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator+(const years& dy, const year_month_weekday& ymwd) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday& ymwdi);
+
+// year_month_weekday_last
+
+class year_month_weekday_last
+{
+    date::year y_;
+    date::month m_;
+    date::weekday_last wdl_;
+
+public:
+    CONSTCD11 year_month_weekday_last(const date::year& y, const date::month& m,
+                                      const date::weekday_last& wdl) NOEXCEPT;
+
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_weekday_last& operator+=(const months& m) NOEXCEPT;
+    template<class = detail::unspecified_month_disambiguator>
+    CONSTCD14 year_month_weekday_last& operator-=(const months& m) NOEXCEPT;
+    CONSTCD14 year_month_weekday_last& operator+=(const years& y) NOEXCEPT;
+    CONSTCD14 year_month_weekday_last& operator-=(const years& y) NOEXCEPT;
+
+    CONSTCD11 date::year year() const NOEXCEPT;
+    CONSTCD11 date::month month() const NOEXCEPT;
+    CONSTCD11 date::weekday weekday() const NOEXCEPT;
+    CONSTCD11 date::weekday_last weekday_last() const NOEXCEPT;
+
+    CONSTCD14 operator sys_days() const NOEXCEPT;
+    CONSTCD14 explicit operator local_days() const NOEXCEPT;
+    CONSTCD11 bool ok() const NOEXCEPT;
+
+private:
+    CONSTCD14 days to_days() const NOEXCEPT;
+};
+
+CONSTCD11
+bool
+operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT;
+
+CONSTCD11
+bool
+operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday_last
+operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator+(const years& dy, const year_month_weekday_last& ymwdl) NOEXCEPT;
+
+template<class = detail::unspecified_month_disambiguator>
+CONSTCD14
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT;
+
+CONSTCD11
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT;
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday_last& ymwdl);
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+inline namespace literals
+{
+
+CONSTCD11 date::day  operator ""_d(unsigned long long d) NOEXCEPT;
+CONSTCD11 date::year operator ""_y(unsigned long long y) NOEXCEPT;
+
+}  // inline namespace literals
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+// CONSTDATA date::month January{1};
+// CONSTDATA date::month February{2};
+// CONSTDATA date::month March{3};
+// CONSTDATA date::month April{4};
+// CONSTDATA date::month May{5};
+// CONSTDATA date::month June{6};
+// CONSTDATA date::month July{7};
+// CONSTDATA date::month August{8};
+// CONSTDATA date::month September{9};
+// CONSTDATA date::month October{10};
+// CONSTDATA date::month November{11};
+// CONSTDATA date::month December{12};
+//
+// CONSTDATA date::weekday Sunday{0u};
+// CONSTDATA date::weekday Monday{1u};
+// CONSTDATA date::weekday Tuesday{2u};
+// CONSTDATA date::weekday Wednesday{3u};
+// CONSTDATA date::weekday Thursday{4u};
+// CONSTDATA date::weekday Friday{5u};
+// CONSTDATA date::weekday Saturday{6u};
+
+#if HAS_VOID_T
+
+template <class T, class = std::void_t<>>
+struct is_clock
+    : std::false_type
+{};
+
+template <class T>
+struct is_clock<T, std::void_t<decltype(T::now()), typename T::rep, typename T::period,
+                               typename T::duration, typename T::time_point,
+                               decltype(T::is_steady)>>
+    : std::true_type
+{};
+
+template<class T> inline constexpr bool is_clock_v = is_clock<T>::value;
+
+#endif  // HAS_VOID_T
+
+//----------------+
+// Implementation |
+//----------------+
+
+// utilities
+namespace detail {
+
+template<class CharT, class Traits = std::char_traits<CharT>>
+class save_istream
+{
+protected:
+    std::basic_ios<CharT, Traits>& is_;
+    CharT fill_;
+    std::ios::fmtflags flags_;
+    std::streamsize precision_;
+    std::streamsize width_;
+    std::basic_ostream<CharT, Traits>* tie_;
+    std::locale loc_;
+
+public:
+    ~save_istream()
+    {
+        is_.fill(fill_);
+        is_.flags(flags_);
+        is_.precision(precision_);
+        is_.width(width_);
+        is_.imbue(loc_);
+        is_.tie(tie_);
+    }
+
+    save_istream(const save_istream&) = delete;
+    save_istream& operator=(const save_istream&) = delete;
+
+    explicit save_istream(std::basic_ios<CharT, Traits>& is)
+        : is_(is)
+        , fill_(is.fill())
+        , flags_(is.flags())
+        , precision_(is.precision())
+        , width_(is.width(0))
+        , tie_(is.tie(nullptr))
+        , loc_(is.getloc())
+        {
+            if (tie_ != nullptr)
+                tie_->flush();
+        }
+};
+
+template<class CharT, class Traits = std::char_traits<CharT>>
+class save_ostream
+    : private save_istream<CharT, Traits>
+{
+public:
+    ~save_ostream()
+    {
+        if ((this->flags_ & std::ios::unitbuf) &&
+#if HAS_UNCAUGHT_EXCEPTIONS
+                std::uncaught_exceptions() == 0 &&
+#else
+                !std::uncaught_exception() &&
+#endif
+                this->is_.good())
+            this->is_.rdbuf()->pubsync();
+    }
+
+    save_ostream(const save_ostream&) = delete;
+    save_ostream& operator=(const save_ostream&) = delete;
+
+    explicit save_ostream(std::basic_ios<CharT, Traits>& os)
+        : save_istream<CharT, Traits>(os)
+        {
+        }
+};
+
+template <class T>
+struct choose_trunc_type
+{
+    static const int digits = std::numeric_limits<T>::digits;
+    using type = typename std::conditional
+                 <
+                     digits < 32,
+                     std::int32_t,
+                     typename std::conditional
+                     <
+                         digits < 64,
+                         std::int64_t,
+#ifdef __SIZEOF_INT128__
+                         __int128
+#else
+                         std::int64_t
+#endif
+                     >::type
+                 >::type;
+};
+
+template <class T>
+CONSTCD11
+inline
+typename std::enable_if
+<
+    !std::chrono::treat_as_floating_point<T>::value,
+    T
+>::type
+trunc(T t) NOEXCEPT
+{
+    return t;
+}
+
+template <class T>
+CONSTCD14
+inline
+typename std::enable_if
+<
+    std::chrono::treat_as_floating_point<T>::value,
+    T
+>::type
+trunc(T t) NOEXCEPT
+{
+    using std::numeric_limits;
+    using I = typename choose_trunc_type<T>::type;
+    CONSTDATA auto digits = numeric_limits<T>::digits;
+    static_assert(digits < numeric_limits<I>::digits, "");
+    CONSTDATA auto max = I{1} << (digits-1);
+    CONSTDATA auto min = -max;
+    const auto negative = t < T{0};
+    if (min <= t && t <= max && t != 0 && t == t)
+    {
+        t = static_cast<T>(static_cast<I>(t));
+        if (t == 0 && negative)
+            t = -t;
+    }
+    return t;
+}
+
+template <std::intmax_t Xp, std::intmax_t Yp>
+struct static_gcd
+{
+    static const std::intmax_t value = static_gcd<Yp, Xp % Yp>::value;
+};
+
+template <std::intmax_t Xp>
+struct static_gcd<Xp, 0>
+{
+    static const std::intmax_t value = Xp;
+};
+
+template <>
+struct static_gcd<0, 0>
+{
+    static const std::intmax_t value = 1;
+};
+
+template <class R1, class R2>
+struct no_overflow
+{
+private:
+    static const std::intmax_t gcd_n1_n2 = static_gcd<R1::num, R2::num>::value;
+    static const std::intmax_t gcd_d1_d2 = static_gcd<R1::den, R2::den>::value;
+    static const std::intmax_t n1 = R1::num / gcd_n1_n2;
+    static const std::intmax_t d1 = R1::den / gcd_d1_d2;
+    static const std::intmax_t n2 = R2::num / gcd_n1_n2;
+    static const std::intmax_t d2 = R2::den / gcd_d1_d2;
+#ifdef __cpp_constexpr
+    static const std::intmax_t max = std::numeric_limits<std::intmax_t>::max();
+#else
+    static const std::intmax_t max = LLONG_MAX;
+#endif
+
+    template <std::intmax_t Xp, std::intmax_t Yp, bool overflow>
+    struct mul    // overflow == false
+    {
+        static const std::intmax_t value = Xp * Yp;
+    };
+
+    template <std::intmax_t Xp, std::intmax_t Yp>
+    struct mul<Xp, Yp, true>
+    {
+        static const std::intmax_t value = 1;
+    };
+
+public:
+    static const bool value = (n1 <= max / d2) && (n2 <= max / d1);
+    typedef std::ratio<mul<n1, d2, !value>::value,
+                       mul<n2, d1, !value>::value> type;
+};
+
+}  // detail
+
+// trunc towards zero
+template <class To, class Rep, class Period>
+CONSTCD11
+inline
+typename std::enable_if
+<
+    detail::no_overflow<Period, typename To::period>::value,
+    To
+>::type
+trunc(const std::chrono::duration<Rep, Period>& d)
+{
+    return To{detail::trunc(std::chrono::duration_cast<To>(d).count())};
+}
+
+template <class To, class Rep, class Period>
+CONSTCD11
+inline
+typename std::enable_if
+<
+    !detail::no_overflow<Period, typename To::period>::value,
+    To
+>::type
+trunc(const std::chrono::duration<Rep, Period>& d)
+{
+    using std::chrono::duration_cast;
+    using std::chrono::duration;
+    using rep = typename std::common_type<Rep, typename To::rep>::type;
+    return To{detail::trunc(duration_cast<To>(duration_cast<duration<rep>>(d)).count())};
+}
+
+#ifndef HAS_CHRONO_ROUNDING
+#  if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190023918 || (_MSC_FULL_VER >= 190000000 && defined (__clang__)))
+#    define HAS_CHRONO_ROUNDING 1
+#  elif defined(__cpp_lib_chrono) && __cplusplus > 201402 && __cpp_lib_chrono >= 201510
+#    define HAS_CHRONO_ROUNDING 1
+#  elif defined(_LIBCPP_VERSION) && __cplusplus > 201402 && _LIBCPP_VERSION >= 3800
+#    define HAS_CHRONO_ROUNDING 1
+#  else
+#    define HAS_CHRONO_ROUNDING 0
+#  endif
+#endif  // HAS_CHRONO_ROUNDING
+
+#if HAS_CHRONO_ROUNDING == 0
+
+// round down
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+typename std::enable_if
+<
+    detail::no_overflow<Period, typename To::period>::value,
+    To
+>::type
+floor(const std::chrono::duration<Rep, Period>& d)
+{
+    auto t = trunc<To>(d);
+    if (t > d)
+        return t - To{1};
+    return t;
+}
+
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+typename std::enable_if
+<
+    !detail::no_overflow<Period, typename To::period>::value,
+    To
+>::type
+floor(const std::chrono::duration<Rep, Period>& d)
+{
+    using rep = typename std::common_type<Rep, typename To::rep>::type;
+    return floor<To>(floor<std::chrono::duration<rep>>(d));
+}
+
+// round to nearest, to even on tie
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+To
+round(const std::chrono::duration<Rep, Period>& d)
+{
+    auto t0 = floor<To>(d);
+    auto t1 = t0 + To{1};
+    if (t1 == To{0} && t0 < To{0})
+        t1 = -t1;
+    auto diff0 = d - t0;
+    auto diff1 = t1 - d;
+    if (diff0 == diff1)
+    {
+        if (t0 - trunc<To>(t0/2)*2 == To{0})
+            return t0;
+        return t1;
+    }
+    if (diff0 < diff1)
+        return t0;
+    return t1;
+}
+
+// round up
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+To
+ceil(const std::chrono::duration<Rep, Period>& d)
+{
+    auto t = trunc<To>(d);
+    if (t < d)
+        return t + To{1};
+    return t;
+}
+
+template <class Rep, class Period,
+          class = typename std::enable_if
+          <
+              std::numeric_limits<Rep>::is_signed
+          >::type>
+CONSTCD11
+std::chrono::duration<Rep, Period>
+abs(std::chrono::duration<Rep, Period> d)
+{
+    return d >= d.zero() ? d : static_cast<decltype(d)>(-d);
+}
+
+// round down
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+floor(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+    using std::chrono::time_point;
+    return time_point<Clock, To>{date::floor<To>(tp.time_since_epoch())};
+}
+
+// round to nearest, to even on tie
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+round(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+    using std::chrono::time_point;
+    return time_point<Clock, To>{round<To>(tp.time_since_epoch())};
+}
+
+// round up
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+ceil(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+    using std::chrono::time_point;
+    return time_point<Clock, To>{ceil<To>(tp.time_since_epoch())};
+}
+
+#else  // HAS_CHRONO_ROUNDING == 1
+
+using std::chrono::floor;
+using std::chrono::ceil;
+using std::chrono::round;
+using std::chrono::abs;
+
+#endif  // HAS_CHRONO_ROUNDING
+
+namespace detail
+{
+
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+typename std::enable_if
+<
+    !std::chrono::treat_as_floating_point<typename To::rep>::value,
+    To
+>::type
+round_i(const std::chrono::duration<Rep, Period>& d)
+{
+    return round<To>(d);
+}
+
+template <class To, class Rep, class Period>
+CONSTCD14
+inline
+typename std::enable_if
+<
+    std::chrono::treat_as_floating_point<typename To::rep>::value,
+    To
+>::type
+round_i(const std::chrono::duration<Rep, Period>& d)
+{
+    return d;
+}
+
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+round_i(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+    using std::chrono::time_point;
+    return time_point<Clock, To>{round_i<To>(tp.time_since_epoch())};
+}
+
+}  // detail
+
+// trunc towards zero
+template <class To, class Clock, class FromDuration>
+CONSTCD11
+inline
+std::chrono::time_point<Clock, To>
+trunc(const std::chrono::time_point<Clock, FromDuration>& tp)
+{
+    using std::chrono::time_point;
+    return time_point<Clock, To>{trunc<To>(tp.time_since_epoch())};
+}
+
+// day
+
+CONSTCD11 inline day::day(unsigned d) NOEXCEPT : d_(static_cast<decltype(d_)>(d)) {}
+CONSTCD14 inline day& day::operator++() NOEXCEPT {++d_; return *this;}
+CONSTCD14 inline day day::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline day& day::operator--() NOEXCEPT {--d_; return *this;}
+CONSTCD14 inline day day::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+CONSTCD14 inline day& day::operator+=(const days& d) NOEXCEPT {*this = *this + d; return *this;}
+CONSTCD14 inline day& day::operator-=(const days& d) NOEXCEPT {*this = *this - d; return *this;}
+CONSTCD11 inline day::operator unsigned() const NOEXCEPT {return d_;}
+CONSTCD11 inline bool day::ok() const NOEXCEPT {return 1 <= d_ && d_ <= 31;}
+
+CONSTCD11
+inline
+bool
+operator==(const day& x, const day& y) NOEXCEPT
+{
+    return static_cast<unsigned>(x) == static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const day& x, const day& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const day& x, const day& y) NOEXCEPT
+{
+    return static_cast<unsigned>(x) < static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator>(const day& x, const day& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const day& x, const day& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const day& x, const day& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+CONSTCD11
+inline
+days
+operator-(const day& x, const day& y) NOEXCEPT
+{
+    return days{static_cast<days::rep>(static_cast<unsigned>(x)
+                                     - static_cast<unsigned>(y))};
+}
+
+CONSTCD11
+inline
+day
+operator+(const day& x, const days& y) NOEXCEPT
+{
+    return day{static_cast<unsigned>(x) + static_cast<unsigned>(y.count())};
+}
+
+CONSTCD11
+inline
+day
+operator+(const days& x, const day& y) NOEXCEPT
+{
+    return y + x;
+}
+
+CONSTCD11
+inline
+day
+operator-(const day& x, const days& y) NOEXCEPT
+{
+    return x + -y;
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const day& d)
+{
+    detail::save_ostream<CharT, Traits> _(os);
+    os.fill('0');
+    os.flags(std::ios::dec | std::ios::right);
+    os.width(2);
+    os << static_cast<unsigned>(d);
+    return os;
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const day& d)
+{
+    detail::low_level_fmt(os, d);
+    if (!d.ok())
+        os << " is not a valid day";
+    return os;
+}
+
+// month
+
+CONSTCD11 inline month::month(unsigned m) NOEXCEPT : m_(static_cast<decltype(m_)>(m)) {}
+CONSTCD14 inline month& month::operator++() NOEXCEPT {*this += months{1}; return *this;}
+CONSTCD14 inline month month::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline month& month::operator--() NOEXCEPT {*this -= months{1}; return *this;}
+CONSTCD14 inline month month::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+
+CONSTCD14
+inline
+month&
+month::operator+=(const months& m) NOEXCEPT
+{
+    *this = *this + m;
+    return *this;
+}
+
+CONSTCD14
+inline
+month&
+month::operator-=(const months& m) NOEXCEPT
+{
+    *this = *this - m;
+    return *this;
+}
+
+CONSTCD11 inline month::operator unsigned() const NOEXCEPT {return m_;}
+CONSTCD11 inline bool month::ok() const NOEXCEPT {return 1 <= m_ && m_ <= 12;}
+
+CONSTCD11
+inline
+bool
+operator==(const month& x, const month& y) NOEXCEPT
+{
+    return static_cast<unsigned>(x) == static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month& x, const month& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const month& x, const month& y) NOEXCEPT
+{
+    return static_cast<unsigned>(x) < static_cast<unsigned>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator>(const month& x, const month& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const month& x, const month& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const month& x, const month& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+CONSTCD14
+inline
+months
+operator-(const month& x, const month& y) NOEXCEPT
+{
+    auto const d = static_cast<unsigned>(x) - static_cast<unsigned>(y);
+    return months(d <= 11 ? d : d + 12);
+}
+
+CONSTCD14
+inline
+month
+operator+(const month& x, const months& y) NOEXCEPT
+{
+    auto const mu = static_cast<long long>(static_cast<unsigned>(x)) + y.count() - 1;
+    auto const yr = (mu >= 0 ? mu : mu-11) / 12;
+    return month{static_cast<unsigned>(mu - yr * 12 + 1)};
+}
+
+CONSTCD14
+inline
+month
+operator+(const months& x, const month& y) NOEXCEPT
+{
+    return y + x;
+}
+
+CONSTCD14
+inline
+month
+operator-(const month& x, const months& y) NOEXCEPT
+{
+    return x + -y;
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const month& m)
+{
+    if (m.ok())
+    {
+        CharT fmt[] = {'%', 'b', 0};
+        os << format(os.getloc(), fmt, m);
+    }
+    else
+        os << static_cast<unsigned>(m);
+    return os;
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month& m)
+{
+    detail::low_level_fmt(os, m);
+    if (!m.ok())
+        os << " is not a valid month";
+    return os;
+}
+
+// year
+
+CONSTCD11 inline year::year(int y) NOEXCEPT : y_(static_cast<decltype(y_)>(y)) {}
+CONSTCD14 inline year& year::operator++() NOEXCEPT {++y_; return *this;}
+CONSTCD14 inline year year::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline year& year::operator--() NOEXCEPT {--y_; return *this;}
+CONSTCD14 inline year year::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+CONSTCD14 inline year& year::operator+=(const years& y) NOEXCEPT {*this = *this + y; return *this;}
+CONSTCD14 inline year& year::operator-=(const years& y) NOEXCEPT {*this = *this - y; return *this;}
+CONSTCD11 inline year year::operator-() const NOEXCEPT {return year{-y_};}
+CONSTCD11 inline year year::operator+() const NOEXCEPT {return *this;}
+
+CONSTCD11
+inline
+bool
+year::is_leap() const NOEXCEPT
+{
+    return y_ % 4 == 0 && (y_ % 100 != 0 || y_ % 400 == 0);
+}
+
+CONSTCD11 inline year::operator int() const NOEXCEPT {return y_;}
+
+CONSTCD11
+inline
+bool
+year::ok() const NOEXCEPT
+{
+    return y_ != std::numeric_limits<short>::min();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year& x, const year& y) NOEXCEPT
+{
+    return static_cast<int>(x) == static_cast<int>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year& x, const year& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year& x, const year& y) NOEXCEPT
+{
+    return static_cast<int>(x) < static_cast<int>(y);
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year& x, const year& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year& x, const year& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year& x, const year& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+CONSTCD11
+inline
+years
+operator-(const year& x, const year& y) NOEXCEPT
+{
+    return years{static_cast<int>(x) - static_cast<int>(y)};
+}
+
+CONSTCD11
+inline
+year
+operator+(const year& x, const years& y) NOEXCEPT
+{
+    return year{static_cast<int>(x) + y.count()};
+}
+
+CONSTCD11
+inline
+year
+operator+(const years& x, const year& y) NOEXCEPT
+{
+    return y + x;
+}
+
+CONSTCD11
+inline
+year
+operator-(const year& x, const years& y) NOEXCEPT
+{
+    return year{static_cast<int>(x) - y.count()};
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const year& y)
+{
+    detail::save_ostream<CharT, Traits> _(os);
+    os.fill('0');
+    os.flags(std::ios::dec | std::ios::internal);
+    os.width(4 + (y < year{0}));
+    os.imbue(std::locale::classic());
+    os << static_cast<int>(y);
+    return os;
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year& y)
+{
+    detail::low_level_fmt(os, y);
+    if (!y.ok())
+        os << " is not a valid year";
+    return os;
+}
+
+// weekday
+
+CONSTCD14
+inline
+unsigned char
+weekday::weekday_from_days(int z) NOEXCEPT
+{
+    auto u = static_cast<unsigned>(z);
+    return static_cast<unsigned char>(z >= -4 ? (u+4) % 7 : u % 7);
+}
+
+CONSTCD11
+inline
+weekday::weekday(unsigned wd) NOEXCEPT
+    : wd_(static_cast<decltype(wd_)>(wd != 7 ? wd : 0))
+    {}
+
+CONSTCD14
+inline
+weekday::weekday(const sys_days& dp) NOEXCEPT
+    : wd_(weekday_from_days(dp.time_since_epoch().count()))
+    {}
+
+CONSTCD14
+inline
+weekday::weekday(const local_days& dp) NOEXCEPT
+    : wd_(weekday_from_days(dp.time_since_epoch().count()))
+    {}
+
+CONSTCD14 inline weekday& weekday::operator++() NOEXCEPT {*this += days{1}; return *this;}
+CONSTCD14 inline weekday weekday::operator++(int) NOEXCEPT {auto tmp(*this); ++(*this); return tmp;}
+CONSTCD14 inline weekday& weekday::operator--() NOEXCEPT {*this -= days{1}; return *this;}
+CONSTCD14 inline weekday weekday::operator--(int) NOEXCEPT {auto tmp(*this); --(*this); return tmp;}
+
+CONSTCD14
+inline
+weekday&
+weekday::operator+=(const days& d) NOEXCEPT
+{
+    *this = *this + d;
+    return *this;
+}
+
+CONSTCD14
+inline
+weekday&
+weekday::operator-=(const days& d) NOEXCEPT
+{
+    *this = *this - d;
+    return *this;
+}
+
+CONSTCD11 inline bool weekday::ok() const NOEXCEPT {return wd_ <= 6;}
+
+CONSTCD11
+inline
+unsigned weekday::c_encoding() const NOEXCEPT
+{
+    return unsigned{wd_};
+}
+
+CONSTCD11
+inline
+unsigned weekday::iso_encoding() const NOEXCEPT
+{
+    return unsigned{((wd_ == 0u) ? 7u : wd_)};
+}
+
+CONSTCD11
+inline
+bool
+operator==(const weekday& x, const weekday& y) NOEXCEPT
+{
+    return x.wd_ == y.wd_;
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const weekday& x, const weekday& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD14
+inline
+days
+operator-(const weekday& x, const weekday& y) NOEXCEPT
+{
+    auto const wdu = x.wd_ - y.wd_;
+    auto const wk = (wdu >= 0 ? wdu : wdu-6) / 7;
+    return days{wdu - wk * 7};
+}
+
+CONSTCD14
+inline
+weekday
+operator+(const weekday& x, const days& y) NOEXCEPT
+{
+    auto const wdu = static_cast<long long>(static_cast<unsigned>(x.wd_)) + y.count();
+    auto const wk = (wdu >= 0 ? wdu : wdu-6) / 7;
+    return weekday{static_cast<unsigned>(wdu - wk * 7)};
+}
+
+CONSTCD14
+inline
+weekday
+operator+(const days& x, const weekday& y) NOEXCEPT
+{
+    return y + x;
+}
+
+CONSTCD14
+inline
+weekday
+operator-(const weekday& x, const days& y) NOEXCEPT
+{
+    return x + -y;
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const weekday& wd)
+{
+    if (wd.ok())
+    {
+        CharT fmt[] = {'%', 'a', 0};
+        os << format(fmt, wd);
+    }
+    else
+        os << wd.c_encoding();
+    return os;
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday& wd)
+{
+    detail::low_level_fmt(os, wd);
+    if (!wd.ok())
+        os << " is not a valid weekday";
+    return os;
+}
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+inline namespace literals
+{
+
+CONSTCD11
+inline
+date::day
+operator ""_d(unsigned long long d) NOEXCEPT
+{
+    return date::day{static_cast<unsigned>(d)};
+}
+
+CONSTCD11
+inline
+date::year
+operator ""_y(unsigned long long y) NOEXCEPT
+{
+    return date::year(static_cast<int>(y));
+}
+#endif  // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+CONSTDATA date::last_spec last{};
+
+CONSTDATA date::month jan{1};
+CONSTDATA date::month feb{2};
+CONSTDATA date::month mar{3};
+CONSTDATA date::month apr{4};
+CONSTDATA date::month may{5};
+CONSTDATA date::month jun{6};
+CONSTDATA date::month jul{7};
+CONSTDATA date::month aug{8};
+CONSTDATA date::month sep{9};
+CONSTDATA date::month oct{10};
+CONSTDATA date::month nov{11};
+CONSTDATA date::month dec{12};
+
+CONSTDATA date::weekday sun{0u};
+CONSTDATA date::weekday mon{1u};
+CONSTDATA date::weekday tue{2u};
+CONSTDATA date::weekday wed{3u};
+CONSTDATA date::weekday thu{4u};
+CONSTDATA date::weekday fri{5u};
+CONSTDATA date::weekday sat{6u};
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+}  // inline namespace literals
+#endif
+
+CONSTDATA date::month January{1};
+CONSTDATA date::month February{2};
+CONSTDATA date::month March{3};
+CONSTDATA date::month April{4};
+CONSTDATA date::month May{5};
+CONSTDATA date::month June{6};
+CONSTDATA date::month July{7};
+CONSTDATA date::month August{8};
+CONSTDATA date::month September{9};
+CONSTDATA date::month October{10};
+CONSTDATA date::month November{11};
+CONSTDATA date::month December{12};
+
+CONSTDATA date::weekday Monday{1};
+CONSTDATA date::weekday Tuesday{2};
+CONSTDATA date::weekday Wednesday{3};
+CONSTDATA date::weekday Thursday{4};
+CONSTDATA date::weekday Friday{5};
+CONSTDATA date::weekday Saturday{6};
+CONSTDATA date::weekday Sunday{7};
+
+// weekday_indexed
+
+CONSTCD11
+inline
+weekday
+weekday_indexed::weekday() const NOEXCEPT
+{
+    return date::weekday{static_cast<unsigned>(wd_)};
+}
+
+CONSTCD11 inline unsigned weekday_indexed::index() const NOEXCEPT {return index_;}
+
+CONSTCD11
+inline
+bool
+weekday_indexed::ok() const NOEXCEPT
+{
+    return weekday().ok() && 1 <= index_ && index_ <= 5;
+}
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic push
+#  pragma GCC diagnostic ignored "-Wconversion"
+#endif  // __GNUC__
+
+CONSTCD11
+inline
+weekday_indexed::weekday_indexed(const date::weekday& wd, unsigned index) NOEXCEPT
+    : wd_(static_cast<decltype(wd_)>(static_cast<unsigned>(wd.wd_)))
+    , index_(static_cast<decltype(index_)>(index))
+    {}
+
+#ifdef __GNUC__
+#  pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const weekday_indexed& wdi)
+{
+    return low_level_fmt(os, wdi.weekday()) << '[' << wdi.index() << ']';
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_indexed& wdi)
+{
+    detail::low_level_fmt(os, wdi);
+    if (!wdi.ok())
+        os << " is not a valid weekday_indexed";
+    return os;
+}
+
+CONSTCD11
+inline
+weekday_indexed
+weekday::operator[](unsigned index) const NOEXCEPT
+{
+    return {*this, index};
+}
+
+CONSTCD11
+inline
+bool
+operator==(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT
+{
+    return x.weekday() == y.weekday() && x.index() == y.index();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const weekday_indexed& x, const weekday_indexed& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+// weekday_last
+
+CONSTCD11 inline date::weekday weekday_last::weekday() const NOEXCEPT {return wd_;}
+CONSTCD11 inline bool weekday_last::ok() const NOEXCEPT {return wd_.ok();}
+CONSTCD11 inline weekday_last::weekday_last(const date::weekday& wd) NOEXCEPT : wd_(wd) {}
+
+CONSTCD11
+inline
+bool
+operator==(const weekday_last& x, const weekday_last& y) NOEXCEPT
+{
+    return x.weekday() == y.weekday();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const weekday_last& x, const weekday_last& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const weekday_last& wdl)
+{
+    return low_level_fmt(os, wdl.weekday()) << "[last]";
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const weekday_last& wdl)
+{
+    detail::low_level_fmt(os, wdl);
+    if (!wdl.ok())
+        os << " is not a valid weekday_last";
+    return os;
+}
+
+CONSTCD11
+inline
+weekday_last
+weekday::operator[](last_spec) const NOEXCEPT
+{
+    return weekday_last{*this};
+}
+
+// year_month
+
+CONSTCD11
+inline
+year_month::year_month(const date::year& y, const date::month& m) NOEXCEPT
+    : y_(y)
+    , m_(m)
+    {}
+
+CONSTCD11 inline year year_month::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline bool year_month::ok() const NOEXCEPT {return y_.ok() && m_.ok();}
+
+template<class>
+CONSTCD14
+inline
+year_month&
+year_month::operator+=(const months& dm) NOEXCEPT
+{
+    *this = *this + dm;
+    return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month&
+year_month::operator-=(const months& dm) NOEXCEPT
+{
+    *this = *this - dm;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month&
+year_month::operator+=(const years& dy) NOEXCEPT
+{
+    *this = *this + dy;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month&
+year_month::operator-=(const years& dy) NOEXCEPT
+{
+    *this = *this - dy;
+    return *this;
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return x.year() == y.year() && x.month() == y.month();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return x.year() < y.year() ? true
+        : (x.year() > y.year() ? false
+        : (x.month() < y.month()));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+template<class>
+CONSTCD14
+inline
+year_month
+operator+(const year_month& ym, const months& dm) NOEXCEPT
+{
+    auto dmi = static_cast<int>(static_cast<unsigned>(ym.month())) - 1 + dm.count();
+    auto dy = (dmi >= 0 ? dmi : dmi-11) / 12;
+    dmi = dmi - dy * 12 + 1;
+    return (ym.year() + years(dy)) / month(static_cast<unsigned>(dmi));
+}
+
+template<class>
+CONSTCD14
+inline
+year_month
+operator+(const months& dm, const year_month& ym) NOEXCEPT
+{
+    return ym + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month
+operator-(const year_month& ym, const months& dm) NOEXCEPT
+{
+    return ym + -dm;
+}
+
+CONSTCD11
+inline
+months
+operator-(const year_month& x, const year_month& y) NOEXCEPT
+{
+    return (x.year() - y.year()) +
+            months(static_cast<unsigned>(x.month()) - static_cast<unsigned>(y.month()));
+}
+
+CONSTCD11
+inline
+year_month
+operator+(const year_month& ym, const years& dy) NOEXCEPT
+{
+    return (ym.year() + dy) / ym.month();
+}
+
+CONSTCD11
+inline
+year_month
+operator+(const years& dy, const year_month& ym) NOEXCEPT
+{
+    return ym + dy;
+}
+
+CONSTCD11
+inline
+year_month
+operator-(const year_month& ym, const years& dy) NOEXCEPT
+{
+    return ym + -dy;
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const year_month& ym)
+{
+    low_level_fmt(os, ym.year()) << '/';
+    return low_level_fmt(os, ym.month());
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month& ym)
+{
+    detail::low_level_fmt(os, ym);
+    if (!ym.ok())
+        os << " is not a valid year_month";
+    return os;
+}
+
+// month_day
+
+CONSTCD11
+inline
+month_day::month_day(const date::month& m, const date::day& d) NOEXCEPT
+    : m_(m)
+    , d_(d)
+    {}
+
+CONSTCD11 inline date::month month_day::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline date::day month_day::day() const NOEXCEPT {return d_;}
+
+CONSTCD14
+inline
+bool
+month_day::ok() const NOEXCEPT
+{
+    CONSTDATA date::day d[] =
+    {
+        date::day(31), date::day(29), date::day(31),
+        date::day(30), date::day(31), date::day(30),
+        date::day(31), date::day(31), date::day(30),
+        date::day(31), date::day(30), date::day(31)
+    };
+    return m_.ok() && date::day{1} <= d_ && d_ <= d[static_cast<unsigned>(m_)-1];
+}
+
+CONSTCD11
+inline
+bool
+operator==(const month_day& x, const month_day& y) NOEXCEPT
+{
+    return x.month() == y.month() && x.day() == y.day();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_day& x, const month_day& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const month_day& x, const month_day& y) NOEXCEPT
+{
+    return x.month() < y.month() ? true
+        : (x.month() > y.month() ? false
+        : (x.day() < y.day()));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const month_day& x, const month_day& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const month_day& x, const month_day& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const month_day& x, const month_day& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const month_day& md)
+{
+    low_level_fmt(os, md.month()) << '/';
+    return low_level_fmt(os, md.day());
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day& md)
+{
+    detail::low_level_fmt(os, md);
+    if (!md.ok())
+        os << " is not a valid month_day";
+    return os;
+}
+
+// month_day_last
+
+CONSTCD11 inline month month_day_last::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline bool month_day_last::ok() const NOEXCEPT {return m_.ok();}
+CONSTCD11 inline month_day_last::month_day_last(const date::month& m) NOEXCEPT : m_(m) {}
+
+CONSTCD11
+inline
+bool
+operator==(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+    return x.month() == y.month();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+    return x.month() < y.month();
+}
+
+CONSTCD11
+inline
+bool
+operator>(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const month_day_last& x, const month_day_last& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const month_day_last& mdl)
+{
+    return low_level_fmt(os, mdl.month()) << "/last";
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_day_last& mdl)
+{
+    detail::low_level_fmt(os, mdl);
+    if (!mdl.ok())
+        os << " is not a valid month_day_last";
+    return os;
+}
+
+// month_weekday
+
+CONSTCD11
+inline
+month_weekday::month_weekday(const date::month& m,
+                             const date::weekday_indexed& wdi) NOEXCEPT
+    : m_(m)
+    , wdi_(wdi)
+    {}
+
+CONSTCD11 inline month month_weekday::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday_indexed
+month_weekday::weekday_indexed() const NOEXCEPT
+{
+    return wdi_;
+}
+
+CONSTCD11
+inline
+bool
+month_weekday::ok() const NOEXCEPT
+{
+    return m_.ok() && wdi_.ok();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const month_weekday& x, const month_weekday& y) NOEXCEPT
+{
+    return x.month() == y.month() && x.weekday_indexed() == y.weekday_indexed();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_weekday& x, const month_weekday& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const month_weekday& mwd)
+{
+    low_level_fmt(os, mwd.month()) << '/';
+    return low_level_fmt(os, mwd.weekday_indexed());
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday& mwd)
+{
+    detail::low_level_fmt(os, mwd);
+    if (!mwd.ok())
+        os << " is not a valid month_weekday";
+    return os;
+}
+
+// month_weekday_last
+
+CONSTCD11
+inline
+month_weekday_last::month_weekday_last(const date::month& m,
+                                       const date::weekday_last& wdl) NOEXCEPT
+    : m_(m)
+    , wdl_(wdl)
+    {}
+
+CONSTCD11 inline month month_weekday_last::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday_last
+month_weekday_last::weekday_last() const NOEXCEPT
+{
+    return wdl_;
+}
+
+CONSTCD11
+inline
+bool
+month_weekday_last::ok() const NOEXCEPT
+{
+    return m_.ok() && wdl_.ok();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT
+{
+    return x.month() == y.month() && x.weekday_last() == y.weekday_last();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const month_weekday_last& x, const month_weekday_last& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const month_weekday_last& mwdl)
+{
+    low_level_fmt(os, mwdl.month()) << '/';
+    return low_level_fmt(os, mwdl.weekday_last());
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const month_weekday_last& mwdl)
+{
+    detail::low_level_fmt(os, mwdl);
+    if (!mwdl.ok())
+        os << " is not a valid month_weekday_last";
+    return os;
+}
+
+// year_month_day_last
+
+CONSTCD11
+inline
+year_month_day_last::year_month_day_last(const date::year& y,
+                                         const date::month_day_last& mdl) NOEXCEPT
+    : y_(y)
+    , mdl_(mdl)
+    {}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator+=(const months& m) NOEXCEPT
+{
+    *this = *this + m;
+    return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator-=(const months& m) NOEXCEPT
+{
+    *this = *this - m;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator+=(const years& y) NOEXCEPT
+{
+    *this = *this + y;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_day_last&
+year_month_day_last::operator-=(const years& y) NOEXCEPT
+{
+    *this = *this - y;
+    return *this;
+}
+
+CONSTCD11 inline year year_month_day_last::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_day_last::month() const NOEXCEPT {return mdl_.month();}
+
+CONSTCD11
+inline
+month_day_last
+year_month_day_last::month_day_last() const NOEXCEPT
+{
+    return mdl_;
+}
+
+CONSTCD14
+inline
+day
+year_month_day_last::day() const NOEXCEPT
+{
+    CONSTDATA date::day d[] =
+    {
+        date::day(31), date::day(28), date::day(31),
+        date::day(30), date::day(31), date::day(30),
+        date::day(31), date::day(31), date::day(30),
+        date::day(31), date::day(30), date::day(31)
+    };
+    return (month() != February || !y_.is_leap()) && mdl_.ok() ?
+        d[static_cast<unsigned>(month()) - 1] : date::day{29};
+}
+
+CONSTCD14
+inline
+year_month_day_last::operator sys_days() const NOEXCEPT
+{
+    return sys_days(year()/month()/day());
+}
+
+CONSTCD14
+inline
+year_month_day_last::operator local_days() const NOEXCEPT
+{
+    return local_days(year()/month()/day());
+}
+
+CONSTCD11
+inline
+bool
+year_month_day_last::ok() const NOEXCEPT
+{
+    return y_.ok() && mdl_.ok();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+    return x.year() == y.year() && x.month_day_last() == y.month_day_last();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+    return x.year() < y.year() ? true
+        : (x.year() > y.year() ? false
+        : (x.month_day_last() < y.month_day_last()));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year_month_day_last& x, const year_month_day_last& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+namespace detail
+{
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+low_level_fmt(std::basic_ostream<CharT, Traits>& os, const year_month_day_last& ymdl)
+{
+    low_level_fmt(os, ymdl.year()) << '/';
+    return low_level_fmt(os, ymdl.month_day_last());
+}
+
+}  // namespace detail
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day_last& ymdl)
+{
+    detail::low_level_fmt(os, ymdl);
+    if (!ymdl.ok())
+        os << " is not a valid year_month_day_last";
+    return os;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const months& dm) NOEXCEPT
+{
+    return (ymdl.year() / ymdl.month() + dm) / last;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last
+operator+(const months& dm, const year_month_day_last& ymdl) NOEXCEPT
+{
+    return ymdl + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const months& dm) NOEXCEPT
+{
+    return ymdl + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator+(const year_month_day_last& ymdl, const years& dy) NOEXCEPT
+{
+    return {ymdl.year()+dy, ymdl.month_day_last()};
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator+(const years& dy, const year_month_day_last& ymdl) NOEXCEPT
+{
+    return ymdl + dy;
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator-(const year_month_day_last& ymdl, const years& dy) NOEXCEPT
+{
+    return ymdl + (-dy);
+}
+
+// year_month_day
+
+CONSTCD11
+inline
+year_month_day::year_month_day(const date::year& y, const date::month& m,
+                               const date::day& d) NOEXCEPT
+    : y_(y)
+    , m_(m)
+    , d_(d)
+    {}
+
+CONSTCD14
+inline
+year_month_day::year_month_day(const year_month_day_last& ymdl) NOEXCEPT
+    : y_(ymdl.year())
+    , m_(ymdl.month())
+    , d_(ymdl.day())
+    {}
+
+CONSTCD14
+inline
+year_month_day::year_month_day(sys_days dp) NOEXCEPT
+    : year_month_day(from_days(dp.time_since_epoch()))
+    {}
+
+CONSTCD14
+inline
+year_month_day::year_month_day(local_days dp) NOEXCEPT
+    : year_month_day(from_days(dp.time_since_epoch()))
+    {}
+
+CONSTCD11 inline year year_month_day::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_day::month() const NOEXCEPT {return m_;}
+CONSTCD11 inline day year_month_day::day() const NOEXCEPT {return d_;}
+
+template<class>
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator+=(const months& m) NOEXCEPT
+{
+    *this = *this + m;
+    return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator-=(const months& m) NOEXCEPT
+{
+    *this = *this - m;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator+=(const years& y) NOEXCEPT
+{
+    *this = *this + y;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_day&
+year_month_day::operator-=(const years& y) NOEXCEPT
+{
+    *this = *this - y;
+    return *this;
+}
+
+CONSTCD14
+inline
+days
+year_month_day::to_days() const NOEXCEPT
+{
+    static_assert(std::numeric_limits<unsigned>::digits >= 18,
+             "This algorithm has not been ported to a 16 bit unsigned integer");
+    static_assert(std::numeric_limits<int>::digits >= 20,
+             "This algorithm has not been ported to a 16 bit signed integer");
+    auto const y = static_cast<int>(y_) - (m_ <= February);
+    auto const m = static_cast<unsigned>(m_);
+    auto const d = static_cast<unsigned>(d_);
+    auto const era = (y >= 0 ? y : y-399) / 400;
+    auto const yoe = static_cast<unsigned>(y - era * 400);       // [0, 399]
+    auto const doy = (153*(m > 2 ? m-3 : m+9) + 2)/5 + d-1;      // [0, 365]
+    auto const doe = yoe * 365 + yoe/4 - yoe/100 + doy;          // [0, 146096]
+    return days{era * 146097 + static_cast<int>(doe) - 719468};
+}
+
+CONSTCD14
+inline
+year_month_day::operator sys_days() const NOEXCEPT
+{
+    return sys_days{to_days()};
+}
+
+CONSTCD14
+inline
+year_month_day::operator local_days() const NOEXCEPT
+{
+    return local_days{to_days()};
+}
+
+CONSTCD14
+inline
+bool
+year_month_day::ok() const NOEXCEPT
+{
+    if (!(y_.ok() && m_.ok()))
+        return false;
+    return date::day{1} <= d_ && d_ <= (y_ / m_ / last).day();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+    return x.year() == y.year() && x.month() == y.month() && x.day() == y.day();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+CONSTCD11
+inline
+bool
+operator<(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+    return x.year() < y.year() ? true
+        : (x.year() > y.year() ? false
+        : (x.month() < y.month() ? true
+        : (x.month() > y.month() ? false
+        : (x.day() < y.day()))));
+}
+
+CONSTCD11
+inline
+bool
+operator>(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+    return y < x;
+}
+
+CONSTCD11
+inline
+bool
+operator<=(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+    return !(y < x);
+}
+
+CONSTCD11
+inline
+bool
+operator>=(const year_month_day& x, const year_month_day& y) NOEXCEPT
+{
+    return !(x < y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_day& ymd)
+{
+    detail::save_ostream<CharT, Traits> _(os);
+    os.fill('0');
+    os.flags(std::ios::dec | std::ios::right);
+    os.imbue(std::locale::classic());
+    os << static_cast<int>(ymd.year()) << '-';
+    os.width(2);
+    os << static_cast<unsigned>(ymd.month()) << '-';
+    os.width(2);
+    os << static_cast<unsigned>(ymd.day());
+    if (!ymd.ok())
+        os << " is not a valid year_month_day";
+    return os;
+}
+
+CONSTCD14
+inline
+year_month_day
+year_month_day::from_days(days dp) NOEXCEPT
+{
+    static_assert(std::numeric_limits<unsigned>::digits >= 18,
+             "This algorithm has not been ported to a 16 bit unsigned integer");
+    static_assert(std::numeric_limits<int>::digits >= 20,
+             "This algorithm has not been ported to a 16 bit signed integer");
+    auto const z = dp.count() + 719468;
+    auto const era = (z >= 0 ? z : z - 146096) / 146097;
+    auto const doe = static_cast<unsigned>(z - era * 146097);          // [0, 146096]
+    auto const yoe = (doe - doe/1460 + doe/36524 - doe/146096) / 365;  // [0, 399]
+    auto const y = static_cast<days::rep>(yoe) + era * 400;
+    auto const doy = doe - (365*yoe + yoe/4 - yoe/100);                // [0, 365]
+    auto const mp = (5*doy + 2)/153;                                   // [0, 11]
+    auto const d = doy - (153*mp+2)/5 + 1;                             // [1, 31]
+    auto const m = mp < 10 ? mp+3 : mp-9;                              // [1, 12]
+    return year_month_day{date::year{y + (m <= 2)}, date::month(m), date::day(d)};
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day
+operator+(const year_month_day& ymd, const months& dm) NOEXCEPT
+{
+    return (ymd.year() / ymd.month() + dm) / ymd.day();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day
+operator+(const months& dm, const year_month_day& ymd) NOEXCEPT
+{
+    return ymd + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_day
+operator-(const year_month_day& ymd, const months& dm) NOEXCEPT
+{
+    return ymd + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_day
+operator+(const year_month_day& ymd, const years& dy) NOEXCEPT
+{
+    return (ymd.year() + dy) / ymd.month() / ymd.day();
+}
+
+CONSTCD11
+inline
+year_month_day
+operator+(const years& dy, const year_month_day& ymd) NOEXCEPT
+{
+    return ymd + dy;
+}
+
+CONSTCD11
+inline
+year_month_day
+operator-(const year_month_day& ymd, const years& dy) NOEXCEPT
+{
+    return ymd + (-dy);
+}
+
+// year_month_weekday
+
+CONSTCD11
+inline
+year_month_weekday::year_month_weekday(const date::year& y, const date::month& m,
+                                       const date::weekday_indexed& wdi)
+        NOEXCEPT
+    : y_(y)
+    , m_(m)
+    , wdi_(wdi)
+    {}
+
+CONSTCD14
+inline
+year_month_weekday::year_month_weekday(const sys_days& dp) NOEXCEPT
+    : year_month_weekday(from_days(dp.time_since_epoch()))
+    {}
+
+CONSTCD14
+inline
+year_month_weekday::year_month_weekday(const local_days& dp) NOEXCEPT
+    : year_month_weekday(from_days(dp.time_since_epoch()))
+    {}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator+=(const months& m) NOEXCEPT
+{
+    *this = *this + m;
+    return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator-=(const months& m) NOEXCEPT
+{
+    *this = *this - m;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator+=(const years& y) NOEXCEPT
+{
+    *this = *this + y;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday&
+year_month_weekday::operator-=(const years& y) NOEXCEPT
+{
+    *this = *this - y;
+    return *this;
+}
+
+CONSTCD11 inline year year_month_weekday::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_weekday::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday
+year_month_weekday::weekday() const NOEXCEPT
+{
+    return wdi_.weekday();
+}
+
+CONSTCD11
+inline
+unsigned
+year_month_weekday::index() const NOEXCEPT
+{
+    return wdi_.index();
+}
+
+CONSTCD11
+inline
+weekday_indexed
+year_month_weekday::weekday_indexed() const NOEXCEPT
+{
+    return wdi_;
+}
+
+CONSTCD14
+inline
+year_month_weekday::operator sys_days() const NOEXCEPT
+{
+    return sys_days{to_days()};
+}
+
+CONSTCD14
+inline
+year_month_weekday::operator local_days() const NOEXCEPT
+{
+    return local_days{to_days()};
+}
+
+CONSTCD14
+inline
+bool
+year_month_weekday::ok() const NOEXCEPT
+{
+    if (!y_.ok() || !m_.ok() || !wdi_.weekday().ok() || wdi_.index() < 1)
+        return false;
+    if (wdi_.index() <= 4)
+        return true;
+    auto d2 = wdi_.weekday() - date::weekday(static_cast<sys_days>(y_/m_/1)) +
+                  days((wdi_.index()-1)*7 + 1);
+    return static_cast<unsigned>(d2.count()) <= static_cast<unsigned>((y_/m_/last).day());
+}
+
+CONSTCD14
+inline
+year_month_weekday
+year_month_weekday::from_days(days d) NOEXCEPT
+{
+    sys_days dp{d};
+    auto const wd = date::weekday(dp);
+    auto const ymd = year_month_day(dp);
+    return {ymd.year(), ymd.month(), wd[(static_cast<unsigned>(ymd.day())-1)/7+1]};
+}
+
+CONSTCD14
+inline
+days
+year_month_weekday::to_days() const NOEXCEPT
+{
+    auto d = sys_days(y_/m_/1);
+    return (d + (wdi_.weekday() - date::weekday(d) + days{(wdi_.index()-1)*7})
+           ).time_since_epoch();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT
+{
+    return x.year() == y.year() && x.month() == y.month() &&
+           x.weekday_indexed() == y.weekday_indexed();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_weekday& x, const year_month_weekday& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday& ymwdi)
+{
+    detail::low_level_fmt(os, ymwdi.year()) << '/';
+    detail::low_level_fmt(os, ymwdi.month()) << '/';
+    detail::low_level_fmt(os, ymwdi.weekday_indexed());
+    if (!ymwdi.ok())
+        os << " is not a valid year_month_weekday";
+    return os;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const months& dm) NOEXCEPT
+{
+    return (ymwd.year() / ymwd.month() + dm) / ymwd.weekday_indexed();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday
+operator+(const months& dm, const year_month_weekday& ymwd) NOEXCEPT
+{
+    return ymwd + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const months& dm) NOEXCEPT
+{
+    return ymwd + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator+(const year_month_weekday& ymwd, const years& dy) NOEXCEPT
+{
+    return {ymwd.year()+dy, ymwd.month(), ymwd.weekday_indexed()};
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator+(const years& dy, const year_month_weekday& ymwd) NOEXCEPT
+{
+    return ymwd + dy;
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator-(const year_month_weekday& ymwd, const years& dy) NOEXCEPT
+{
+    return ymwd + (-dy);
+}
+
+// year_month_weekday_last
+
+CONSTCD11
+inline
+year_month_weekday_last::year_month_weekday_last(const date::year& y,
+                                                 const date::month& m,
+                                                 const date::weekday_last& wdl) NOEXCEPT
+    : y_(y)
+    , m_(m)
+    , wdl_(wdl)
+    {}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator+=(const months& m) NOEXCEPT
+{
+    *this = *this + m;
+    return *this;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator-=(const months& m) NOEXCEPT
+{
+    *this = *this - m;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator+=(const years& y) NOEXCEPT
+{
+    *this = *this + y;
+    return *this;
+}
+
+CONSTCD14
+inline
+year_month_weekday_last&
+year_month_weekday_last::operator-=(const years& y) NOEXCEPT
+{
+    *this = *this - y;
+    return *this;
+}
+
+CONSTCD11 inline year year_month_weekday_last::year() const NOEXCEPT {return y_;}
+CONSTCD11 inline month year_month_weekday_last::month() const NOEXCEPT {return m_;}
+
+CONSTCD11
+inline
+weekday
+year_month_weekday_last::weekday() const NOEXCEPT
+{
+    return wdl_.weekday();
+}
+
+CONSTCD11
+inline
+weekday_last
+year_month_weekday_last::weekday_last() const NOEXCEPT
+{
+    return wdl_;
+}
+
+CONSTCD14
+inline
+year_month_weekday_last::operator sys_days() const NOEXCEPT
+{
+    return sys_days{to_days()};
+}
+
+CONSTCD14
+inline
+year_month_weekday_last::operator local_days() const NOEXCEPT
+{
+    return local_days{to_days()};
+}
+
+CONSTCD11
+inline
+bool
+year_month_weekday_last::ok() const NOEXCEPT
+{
+    return y_.ok() && m_.ok() && wdl_.ok();
+}
+
+CONSTCD14
+inline
+days
+year_month_weekday_last::to_days() const NOEXCEPT
+{
+    auto const d = sys_days(y_/m_/last);
+    return (d - (date::weekday{d} - wdl_.weekday())).time_since_epoch();
+}
+
+CONSTCD11
+inline
+bool
+operator==(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT
+{
+    return x.year() == y.year() && x.month() == y.month() &&
+           x.weekday_last() == y.weekday_last();
+}
+
+CONSTCD11
+inline
+bool
+operator!=(const year_month_weekday_last& x, const year_month_weekday_last& y) NOEXCEPT
+{
+    return !(x == y);
+}
+
+template<class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const year_month_weekday_last& ymwdl)
+{
+    detail::low_level_fmt(os, ymwdl.year()) << '/';
+    detail::low_level_fmt(os, ymwdl.month()) << '/';
+    detail::low_level_fmt(os, ymwdl.weekday_last());
+    if (!ymwdl.ok())
+        os << " is not a valid year_month_weekday_last";
+    return os;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT
+{
+    return (ymwdl.year() / ymwdl.month() + dm) / ymwdl.weekday_last();
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last
+operator+(const months& dm, const year_month_weekday_last& ymwdl) NOEXCEPT
+{
+    return ymwdl + dm;
+}
+
+template<class>
+CONSTCD14
+inline
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const months& dm) NOEXCEPT
+{
+    return ymwdl + (-dm);
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator+(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT
+{
+    return {ymwdl.year()+dy, ymwdl.month(), ymwdl.weekday_last()};
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator+(const years& dy, const year_month_weekday_last& ymwdl) NOEXCEPT
+{
+    return ymwdl + dy;
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator-(const year_month_weekday_last& ymwdl, const years& dy) NOEXCEPT
+{
+    return ymwdl + (-dy);
+}
+
+// year_month from operator/()
+
+CONSTCD11
+inline
+year_month
+operator/(const year& y, const month& m) NOEXCEPT
+{
+    return {y, m};
+}
+
+CONSTCD11
+inline
+year_month
+operator/(const year& y, int   m) NOEXCEPT
+{
+    return y / month(static_cast<unsigned>(m));
+}
+
+// month_day from operator/()
+
+CONSTCD11
+inline
+month_day
+operator/(const month& m, const day& d) NOEXCEPT
+{
+    return {m, d};
+}
+
+CONSTCD11
+inline
+month_day
+operator/(const day& d, const month& m) NOEXCEPT
+{
+    return m / d;
+}
+
+CONSTCD11
+inline
+month_day
+operator/(const month& m, int d) NOEXCEPT
+{
+    return m / day(static_cast<unsigned>(d));
+}
+
+CONSTCD11
+inline
+month_day
+operator/(int m, const day& d) NOEXCEPT
+{
+    return month(static_cast<unsigned>(m)) / d;
+}
+
+CONSTCD11 inline month_day operator/(const day& d, int m) NOEXCEPT {return m / d;}
+
+// month_day_last from operator/()
+
+CONSTCD11
+inline
+month_day_last
+operator/(const month& m, last_spec) NOEXCEPT
+{
+    return month_day_last{m};
+}
+
+CONSTCD11
+inline
+month_day_last
+operator/(last_spec, const month& m) NOEXCEPT
+{
+    return m/last;
+}
+
+CONSTCD11
+inline
+month_day_last
+operator/(int m, last_spec) NOEXCEPT
+{
+    return month(static_cast<unsigned>(m))/last;
+}
+
+CONSTCD11
+inline
+month_day_last
+operator/(last_spec, int m) NOEXCEPT
+{
+    return m/last;
+}
+
+// month_weekday from operator/()
+
+CONSTCD11
+inline
+month_weekday
+operator/(const month& m, const weekday_indexed& wdi) NOEXCEPT
+{
+    return {m, wdi};
+}
+
+CONSTCD11
+inline
+month_weekday
+operator/(const weekday_indexed& wdi, const month& m) NOEXCEPT
+{
+    return m / wdi;
+}
+
+CONSTCD11
+inline
+month_weekday
+operator/(int m, const weekday_indexed& wdi) NOEXCEPT
+{
+    return month(static_cast<unsigned>(m)) / wdi;
+}
+
+CONSTCD11
+inline
+month_weekday
+operator/(const weekday_indexed& wdi, int m) NOEXCEPT
+{
+    return m / wdi;
+}
+
+// month_weekday_last from operator/()
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(const month& m, const weekday_last& wdl) NOEXCEPT
+{
+    return {m, wdl};
+}
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(const weekday_last& wdl, const month& m) NOEXCEPT
+{
+    return m / wdl;
+}
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(int m, const weekday_last& wdl) NOEXCEPT
+{
+    return month(static_cast<unsigned>(m)) / wdl;
+}
+
+CONSTCD11
+inline
+month_weekday_last
+operator/(const weekday_last& wdl, int m) NOEXCEPT
+{
+    return m / wdl;
+}
+
+// year_month_day from operator/()
+
+CONSTCD11
+inline
+year_month_day
+operator/(const year_month& ym, const day& d) NOEXCEPT
+{
+    return {ym.year(), ym.month(), d};
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const year_month& ym, int d)  NOEXCEPT
+{
+    return ym / day(static_cast<unsigned>(d));
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const year& y, const month_day& md) NOEXCEPT
+{
+    return y / md.month() / md.day();
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(int y, const month_day& md) NOEXCEPT
+{
+    return year(y) / md;
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const month_day& md, const year& y)  NOEXCEPT
+{
+    return y / md;
+}
+
+CONSTCD11
+inline
+year_month_day
+operator/(const month_day& md, int y) NOEXCEPT
+{
+    return year(y) / md;
+}
+
+// year_month_day_last from operator/()
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const year_month& ym, last_spec) NOEXCEPT
+{
+    return {ym.year(), month_day_last{ym.month()}};
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const year& y, const month_day_last& mdl) NOEXCEPT
+{
+    return {y, mdl};
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(int y, const month_day_last& mdl) NOEXCEPT
+{
+    return year(y) / mdl;
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const month_day_last& mdl, const year& y) NOEXCEPT
+{
+    return y / mdl;
+}
+
+CONSTCD11
+inline
+year_month_day_last
+operator/(const month_day_last& mdl, int y) NOEXCEPT
+{
+    return year(y) / mdl;
+}
+
+// year_month_weekday from operator/()
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const year_month& ym, const weekday_indexed& wdi) NOEXCEPT
+{
+    return {ym.year(), ym.month(), wdi};
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const year& y, const month_weekday& mwd) NOEXCEPT
+{
+    return {y, mwd.month(), mwd.weekday_indexed()};
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(int y, const month_weekday& mwd) NOEXCEPT
+{
+    return year(y) / mwd;
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const month_weekday& mwd, const year& y) NOEXCEPT
+{
+    return y / mwd;
+}
+
+CONSTCD11
+inline
+year_month_weekday
+operator/(const month_weekday& mwd, int y) NOEXCEPT
+{
+    return year(y) / mwd;
+}
+
+// year_month_weekday_last from operator/()
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const year_month& ym, const weekday_last& wdl) NOEXCEPT
+{
+    return {ym.year(), ym.month(), wdl};
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const year& y, const month_weekday_last& mwdl) NOEXCEPT
+{
+    return {y, mwdl.month(), mwdl.weekday_last()};
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(int y, const month_weekday_last& mwdl) NOEXCEPT
+{
+    return year(y) / mwdl;
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, const year& y) NOEXCEPT
+{
+    return y / mwdl;
+}
+
+CONSTCD11
+inline
+year_month_weekday_last
+operator/(const month_weekday_last& mwdl, int y) NOEXCEPT
+{
+    return year(y) / mwdl;
+}
+
+template <class Duration>
+struct fields;
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const fields<Duration>& fds, const std::string* abbrev = nullptr,
+          const std::chrono::seconds* offset_sec = nullptr);
+
+template <class CharT, class Traits, class Duration, class Alloc>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            fields<Duration>& fds, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr);
+
+// hh_mm_ss
+
+namespace detail
+{
+
+struct undocumented {explicit undocumented() = default;};
+
+// width<n>::value is the number of fractional decimal digits in 1/n
+// width<0>::value and width<1>::value are defined to be 0
+// If 1/n takes more than 18 fractional decimal digits,
+//   the result is truncated to 19.
+// Example:  width<2>::value    ==  1
+// Example:  width<3>::value    == 19
+// Example:  width<4>::value    ==  2
+// Example:  width<10>::value   ==  1
+// Example:  width<1000>::value ==  3
+template <std::uint64_t n, std::uint64_t d, unsigned w = 0,
+          bool should_continue = n%d != 0 && (w < 19)>
+struct width
+{
+    static_assert(d > 0, "width called with zero denominator");
+    static CONSTDATA unsigned value = 1 + width<n%d*10, d, w+1>::value;
+};
+
+template <std::uint64_t n, std::uint64_t d, unsigned w>
+struct width<n, d, w, false>
+{
+    static CONSTDATA unsigned value = 0;
+};
+
+template <unsigned exp>
+struct static_pow10
+{
+private:
+    static CONSTDATA std::uint64_t h = static_pow10<exp/2>::value;
+public:
+    static CONSTDATA std::uint64_t value = h * h * (exp % 2 ? 10 : 1);
+};
+
+template <>
+struct static_pow10<0>
+{
+    static CONSTDATA std::uint64_t value = 1;
+};
+
+template <class Duration>
+class decimal_format_seconds
+{
+    using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+    using rep = typename CT::rep;
+    static unsigned CONSTDATA trial_width =
+        detail::width<CT::period::num, CT::period::den>::value;
+public:
+    static unsigned CONSTDATA width = trial_width < 19 ? trial_width : 6u;
+    using precision = std::chrono::duration<rep,
+                                            std::ratio<1, static_pow10<width>::value>>;
+
+private:
+    std::chrono::seconds s_;
+    precision            sub_s_;
+
+public:
+    CONSTCD11 decimal_format_seconds()
+        : s_()
+        , sub_s_()
+        {}
+
+    CONSTCD11 explicit decimal_format_seconds(const Duration& d) NOEXCEPT
+        : s_(std::chrono::duration_cast<std::chrono::seconds>(d))
+        , sub_s_(std::chrono::duration_cast<precision>(d - s_))
+        {}
+
+    CONSTCD14 std::chrono::seconds& seconds() NOEXCEPT {return s_;}
+    CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_;}
+    CONSTCD11 precision subseconds() const NOEXCEPT {return sub_s_;}
+
+    CONSTCD14 precision to_duration() const NOEXCEPT
+    {
+        return s_ + sub_s_;
+    }
+
+    CONSTCD11 bool in_conventional_range() const NOEXCEPT
+    {
+        return sub_s_ < std::chrono::seconds{1} && s_ < std::chrono::minutes{1};
+    }
+
+    template <class CharT, class Traits>
+    friend
+    std::basic_ostream<CharT, Traits>&
+    operator<<(std::basic_ostream<CharT, Traits>& os, const decimal_format_seconds& x)
+    {
+        return x.print(os, std::chrono::treat_as_floating_point<rep>{});
+    }
+
+    template <class CharT, class Traits>
+    std::basic_ostream<CharT, Traits>&
+    print(std::basic_ostream<CharT, Traits>& os, std::true_type) const
+    {
+        date::detail::save_ostream<CharT, Traits> _(os);
+        std::chrono::duration<rep> d = s_ + sub_s_;
+        if (d < std::chrono::seconds{10})
+            os << '0';
+        os.precision(width+6);
+        os << std::fixed << d.count();
+        return os;
+    }
+
+    template <class CharT, class Traits>
+    std::basic_ostream<CharT, Traits>&
+    print(std::basic_ostream<CharT, Traits>& os, std::false_type) const
+    {
+        date::detail::save_ostream<CharT, Traits> _(os);
+        os.fill('0');
+        os.flags(std::ios::dec | std::ios::right);
+        os.width(2);
+        os << s_.count();
+        if (width > 0)
+        {
+#if !ONLY_C_LOCALE
+            os << std::use_facet<std::numpunct<CharT>>(os.getloc()).decimal_point();
+#else
+            os << '.';
+#endif
+            date::detail::save_ostream<CharT, Traits> _s(os);
+            os.imbue(std::locale::classic());
+            os.width(width);
+            os << sub_s_.count();
+        }
+        return os;
+    }
+};
+
+template <class Rep, class Period>
+inline
+CONSTCD11
+typename std::enable_if
+         <
+            std::numeric_limits<Rep>::is_signed,
+            std::chrono::duration<Rep, Period>
+         >::type
+abs(std::chrono::duration<Rep, Period> d)
+{
+    return d >= d.zero() ? +d : -d;
+}
+
+template <class Rep, class Period>
+inline
+CONSTCD11
+typename std::enable_if
+         <
+            !std::numeric_limits<Rep>::is_signed,
+            std::chrono::duration<Rep, Period>
+         >::type
+abs(std::chrono::duration<Rep, Period> d)
+{
+    return d;
+}
+
+}  // namespace detail
+
+template <class Duration>
+class hh_mm_ss
+{
+    using dfs = detail::decimal_format_seconds<typename std::common_type<Duration,
+                                               std::chrono::seconds>::type>;
+
+    std::chrono::hours h_;
+    std::chrono::minutes m_;
+    dfs s_;
+    bool neg_;
+
+public:
+    static unsigned CONSTDATA fractional_width = dfs::width;
+    using precision = typename dfs::precision;
+
+    CONSTCD11 hh_mm_ss() NOEXCEPT
+        : hh_mm_ss(Duration::zero())
+        {}
+
+    CONSTCD11 explicit hh_mm_ss(Duration d) NOEXCEPT
+        : h_(std::chrono::duration_cast<std::chrono::hours>(detail::abs(d)))
+        , m_(std::chrono::duration_cast<std::chrono::minutes>(detail::abs(d)) - h_)
+        , s_(detail::abs(d) - h_ - m_)
+        , neg_(d < Duration::zero())
+        {}
+
+    CONSTCD11 std::chrono::hours hours() const NOEXCEPT {return h_;}
+    CONSTCD11 std::chrono::minutes minutes() const NOEXCEPT {return m_;}
+    CONSTCD11 std::chrono::seconds seconds() const NOEXCEPT {return s_.seconds();}
+    CONSTCD14 std::chrono::seconds&
+        seconds(detail::undocumented) NOEXCEPT {return s_.seconds();}
+    CONSTCD11 precision subseconds() const NOEXCEPT {return s_.subseconds();}
+    CONSTCD11 bool is_negative() const NOEXCEPT {return neg_;}
+
+    CONSTCD11 explicit operator  precision()   const NOEXCEPT {return to_duration();}
+    CONSTCD11          precision to_duration() const NOEXCEPT
+        {return (s_.to_duration() + m_ + h_) * (1-2*neg_);}
+
+    CONSTCD11 bool in_conventional_range() const NOEXCEPT
+    {
+        return !neg_ && h_ < days{1} && m_ < std::chrono::hours{1} &&
+               s_.in_conventional_range();
+    }
+
+private:
+
+    template <class charT, class traits>
+    friend
+    std::basic_ostream<charT, traits>&
+    operator<<(std::basic_ostream<charT, traits>& os, hh_mm_ss const& tod)
+    {
+        if (tod.is_negative())
+            os << '-';
+        if (tod.h_ < std::chrono::hours{10})
+            os << '0';
+        os << tod.h_.count() << ':';
+        if (tod.m_ < std::chrono::minutes{10})
+            os << '0';
+        os << tod.m_.count() << ':' << tod.s_;
+        return os;
+    }
+
+    template <class CharT, class Traits, class Duration2>
+    friend
+    std::basic_ostream<CharT, Traits>&
+    date::to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const fields<Duration2>& fds, const std::string* abbrev,
+          const std::chrono::seconds* offset_sec);
+
+    template <class CharT, class Traits, class Duration2, class Alloc>
+    friend
+    std::basic_istream<CharT, Traits>&
+    date::from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+          fields<Duration2>& fds,
+          std::basic_string<CharT, Traits, Alloc>* abbrev, std::chrono::minutes* offset);
+};
+
+inline
+CONSTCD14
+bool
+is_am(std::chrono::hours const& h) NOEXCEPT
+{
+    using std::chrono::hours;
+    return hours{0} <= h && h < hours{12};
+}
+
+inline
+CONSTCD14
+bool
+is_pm(std::chrono::hours const& h) NOEXCEPT
+{
+    using std::chrono::hours;
+    return hours{12} <= h && h < hours{24};
+}
+
+inline
+CONSTCD14
+std::chrono::hours
+make12(std::chrono::hours h) NOEXCEPT
+{
+    using std::chrono::hours;
+    if (h < hours{12})
+    {
+        if (h == hours{0})
+            h = hours{12};
+    }
+    else
+    {
+        if (h != hours{12})
+            h = h - hours{12};
+    }
+    return h;
+}
+
+inline
+CONSTCD14
+std::chrono::hours
+make24(std::chrono::hours h, bool is_pm) NOEXCEPT
+{
+    using std::chrono::hours;
+    if (is_pm)
+    {
+        if (h != hours{12})
+            h = h + hours{12};
+    }
+    else if (h == hours{12})
+        h = hours{0};
+    return h;
+}
+
+template <class Duration>
+using time_of_day = hh_mm_ss<Duration>;
+
+template <class Rep, class Period>
+CONSTCD11
+inline
+hh_mm_ss<std::chrono::duration<Rep, Period>>
+make_time(const std::chrono::duration<Rep, Period>& d)
+{
+    return hh_mm_ss<std::chrono::duration<Rep, Period>>(d);
+}
+
+template <class CharT, class Traits, class Duration>
+inline
+typename std::enable_if
+<
+    !std::is_convertible<Duration, days>::value,
+    std::basic_ostream<CharT, Traits>&
+>::type
+operator<<(std::basic_ostream<CharT, Traits>& os, const sys_time<Duration>& tp)
+{
+    auto const dp = date::floor<days>(tp);
+    return os << year_month_day(dp) << ' ' << make_time(tp-dp);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const sys_days& dp)
+{
+    return os << year_month_day(dp);
+}
+
+template <class CharT, class Traits, class Duration>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const local_time<Duration>& ut)
+{
+    return (date::operator<<(os, sys_time<Duration>{ut.time_since_epoch()}));
+}
+
+namespace detail
+{
+
+template <class CharT, std::size_t N>
+class string_literal;
+
+template <class CharT1, class CharT2, std::size_t N1, std::size_t N2>
+inline
+CONSTCD14
+string_literal<typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type,
+               N1 + N2 - 1>
+operator+(const string_literal<CharT1, N1>& x, const string_literal<CharT2, N2>& y) NOEXCEPT;
+
+template <class CharT, std::size_t N>
+class string_literal
+{
+    CharT p_[N];
+
+    CONSTCD11 string_literal() NOEXCEPT
+      : p_{}
+    {}
+
+public:
+    using const_iterator = const CharT*;
+
+    string_literal(string_literal const&) = default;
+    string_literal& operator=(string_literal const&) = delete;
+
+    template <std::size_t N1 = 2,
+              class = typename std::enable_if<N1 == N>::type>
+    CONSTCD11 string_literal(CharT c) NOEXCEPT
+        : p_{c}
+    {
+    }
+
+    template <std::size_t N1 = 3,
+              class = typename std::enable_if<N1 == N>::type>
+    CONSTCD11 string_literal(CharT c1, CharT c2) NOEXCEPT
+        : p_{c1, c2}
+    {
+    }
+
+    template <std::size_t N1 = 4,
+              class = typename std::enable_if<N1 == N>::type>
+    CONSTCD11 string_literal(CharT c1, CharT c2, CharT c3) NOEXCEPT
+        : p_{c1, c2, c3}
+    {
+    }
+
+    CONSTCD14 string_literal(const CharT(&a)[N]) NOEXCEPT
+        : p_{}
+    {
+        for (std::size_t i = 0; i < N; ++i)
+            p_[i] = a[i];
+    }
+
+    template <class U = CharT,
+              class = typename std::enable_if<(1 < sizeof(U))>::type>
+    CONSTCD14 string_literal(const char(&a)[N]) NOEXCEPT
+        : p_{}
+    {
+        for (std::size_t i = 0; i < N; ++i)
+            p_[i] = a[i];
+    }
+
+    template <class CharT2,
+              class = typename std::enable_if<!std::is_same<CharT2, CharT>::value>::type>
+    CONSTCD14 string_literal(string_literal<CharT2, N> const& a) NOEXCEPT
+        : p_{}
+    {
+        for (std::size_t i = 0; i < N; ++i)
+            p_[i] = a[i];
+    }
+
+    CONSTCD11 const CharT* data() const NOEXCEPT {return p_;}
+    CONSTCD11 std::size_t size() const NOEXCEPT {return N-1;}
+
+    CONSTCD11 const_iterator begin() const NOEXCEPT {return p_;}
+    CONSTCD11 const_iterator end()   const NOEXCEPT {return p_ + N-1;}
+
+    CONSTCD11 CharT const& operator[](std::size_t n) const NOEXCEPT
+    {
+        return p_[n];
+    }
+
+    template <class Traits>
+    friend
+    std::basic_ostream<CharT, Traits>&
+    operator<<(std::basic_ostream<CharT, Traits>& os, const string_literal& s)
+    {
+        return os << s.p_;
+    }
+
+    template <class CharT1, class CharT2, std::size_t N1, std::size_t N2>
+    friend
+    CONSTCD14
+    string_literal<typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type,
+                   N1 + N2 - 1>
+    operator+(const string_literal<CharT1, N1>& x, const string_literal<CharT2, N2>& y) NOEXCEPT;
+};
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 3>
+operator+(const string_literal<CharT, 2>& x, const string_literal<CharT, 2>& y) NOEXCEPT
+{
+  return string_literal<CharT, 3>(x[0], y[0]);
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 4>
+operator+(const string_literal<CharT, 3>& x, const string_literal<CharT, 2>& y) NOEXCEPT
+{
+  return string_literal<CharT, 4>(x[0], x[1], y[0]);
+}
+
+template <class CharT1, class CharT2, std::size_t N1, std::size_t N2>
+CONSTCD14
+inline
+string_literal<typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type,
+               N1 + N2 - 1>
+operator+(const string_literal<CharT1, N1>& x, const string_literal<CharT2, N2>& y) NOEXCEPT
+{
+    using CT = typename std::conditional<sizeof(CharT2) <= sizeof(CharT1), CharT1, CharT2>::type;
+
+    string_literal<CT, N1 + N2 - 1> r;
+    std::size_t i = 0;
+    for (; i < N1-1; ++i)
+       r.p_[i] = CT(x.p_[i]);
+    for (std::size_t j = 0; j < N2; ++j, ++i)
+       r.p_[i] = CT(y.p_[j]);
+
+    return r;
+}
+
+
+template <class CharT, class Traits, class Alloc, std::size_t N>
+inline
+std::basic_string<CharT, Traits, Alloc>
+operator+(std::basic_string<CharT, Traits, Alloc> x, const string_literal<CharT, N>& y)
+{
+    x.append(y.data(), y.size());
+    return x;
+}
+
+#if __cplusplus >= 201402  && (!defined(__EDG_VERSION__) || __EDG_VERSION__ > 411) \
+                           && (!defined(__SUNPRO_CC) || __SUNPRO_CC > 0x5150)
+
+template <class CharT,
+          class = std::enable_if_t<std::is_same<CharT, char>::value ||
+                                   std::is_same<CharT, wchar_t>::value ||
+                                   std::is_same<CharT, char16_t>::value ||
+                                   std::is_same<CharT, char32_t>::value>>
+CONSTCD14
+inline
+string_literal<CharT, 2>
+msl(CharT c) NOEXCEPT
+{
+    return string_literal<CharT, 2>{c};
+}
+
+CONSTCD14
+inline
+std::size_t
+to_string_len(std::intmax_t i)
+{
+    std::size_t r = 0;
+    do
+    {
+        i /= 10;
+        ++r;
+    } while (i > 0);
+    return r;
+}
+
+template <std::intmax_t N>
+CONSTCD14
+inline
+std::enable_if_t
+<
+    N < 10,
+    string_literal<char, to_string_len(N)+1>
+>
+msl() NOEXCEPT
+{
+    return msl(char(N % 10 + '0'));
+}
+
+template <std::intmax_t N>
+CONSTCD14
+inline
+std::enable_if_t
+<
+    10 <= N,
+    string_literal<char, to_string_len(N)+1>
+>
+msl() NOEXCEPT
+{
+    return msl<N/10>() + msl(char(N % 10 + '0'));
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+CONSTCD14
+inline
+std::enable_if_t
+<
+    std::ratio<N, D>::type::den != 1,
+    string_literal<CharT, to_string_len(std::ratio<N, D>::type::num) +
+                          to_string_len(std::ratio<N, D>::type::den) + 4>
+>
+msl(std::ratio<N, D>) NOEXCEPT
+{
+    using R = typename std::ratio<N, D>::type;
+    return msl(CharT{'['}) + msl<R::num>() + msl(CharT{'/'}) +
+                             msl<R::den>() + msl(CharT{']'});
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+CONSTCD14
+inline
+std::enable_if_t
+<
+    std::ratio<N, D>::type::den == 1,
+    string_literal<CharT, to_string_len(std::ratio<N, D>::type::num) + 3>
+>
+msl(std::ratio<N, D>) NOEXCEPT
+{
+    using R = typename std::ratio<N, D>::type;
+    return msl(CharT{'['}) + msl<R::num>() + msl(CharT{']'});
+}
+
+
+#else  // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411)
+
+inline
+std::string
+to_string(std::uint64_t x)
+{
+    return std::to_string(x);
+}
+
+template <class CharT>
+inline
+std::basic_string<CharT>
+to_string(std::uint64_t x)
+{
+    auto y = std::to_string(x);
+    return std::basic_string<CharT>(y.begin(), y.end());
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+inline
+typename std::enable_if
+<
+    std::ratio<N, D>::type::den != 1,
+    std::basic_string<CharT>
+>::type
+msl(std::ratio<N, D>)
+{
+    using R = typename std::ratio<N, D>::type;
+    return std::basic_string<CharT>(1, '[') + to_string<CharT>(R::num) + CharT{'/'} +
+                                              to_string<CharT>(R::den) + CharT{']'};
+}
+
+template <class CharT, std::intmax_t N, std::intmax_t D>
+inline
+typename std::enable_if
+<
+    std::ratio<N, D>::type::den == 1,
+    std::basic_string<CharT>
+>::type
+msl(std::ratio<N, D>)
+{
+    using R = typename std::ratio<N, D>::type;
+    return std::basic_string<CharT>(1, '[') + to_string<CharT>(R::num) + CharT{']'};
+}
+
+#endif  // __cplusplus < 201402 || (defined(__EDG_VERSION__) && __EDG_VERSION__ <= 411)
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::atto) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'a'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::femto) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'f'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::pico) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'p'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::nano) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'n'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+typename std::enable_if
+<
+    std::is_same<CharT, char>::value,
+    string_literal<char, 3>
+>::type
+msl(std::micro) NOEXCEPT
+{
+    return string_literal<char, 3>{'\xC2', '\xB5'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+typename std::enable_if
+<
+    !std::is_same<CharT, char>::value,
+    string_literal<CharT, 2>
+>::type
+msl(std::micro) NOEXCEPT
+{
+    return string_literal<CharT, 2>{CharT{static_cast<unsigned char>('\xB5')}};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::milli) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'m'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::centi) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'c'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 3>
+msl(std::deca) NOEXCEPT
+{
+    return string_literal<CharT, 3>{'d', 'a'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::deci) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'d'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::hecto) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'h'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::kilo) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'k'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::mega) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'M'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::giga) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'G'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::tera) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'T'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::peta) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'P'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+msl(std::exa) NOEXCEPT
+{
+    return string_literal<CharT, 2>{'E'};
+}
+
+template <class CharT, class Period>
+CONSTCD11
+inline
+auto
+get_units(Period p)
+ -> decltype(msl<CharT>(p) + string_literal<CharT, 2>{'s'})
+{
+    return msl<CharT>(p) + string_literal<CharT, 2>{'s'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+get_units(std::ratio<1>)
+{
+    return string_literal<CharT, 2>{'s'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+get_units(std::ratio<3600>)
+{
+    return string_literal<CharT, 2>{'h'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 4>
+get_units(std::ratio<60>)
+{
+    return string_literal<CharT, 4>{'m', 'i', 'n'};
+}
+
+template <class CharT>
+CONSTCD11
+inline
+string_literal<CharT, 2>
+get_units(std::ratio<86400>)
+{
+    return string_literal<CharT, 2>{'d'};
+}
+
+template <class CharT, class Traits = std::char_traits<CharT>>
+struct make_string;
+
+template <>
+struct make_string<char>
+{
+    template <class Rep>
+    static
+    std::string
+    from(Rep n)
+    {
+        return std::to_string(n);
+    }
+};
+
+template <class Traits>
+struct make_string<char, Traits>
+{
+    template <class Rep>
+    static
+    std::basic_string<char, Traits>
+    from(Rep n)
+    {
+        auto s = std::to_string(n);
+        return std::basic_string<char, Traits>(s.begin(), s.end());
+    }
+};
+
+template <>
+struct make_string<wchar_t>
+{
+    template <class Rep>
+    static
+    std::wstring
+    from(Rep n)
+    {
+        return std::to_wstring(n);
+    }
+};
+
+template <class Traits>
+struct make_string<wchar_t, Traits>
+{
+    template <class Rep>
+    static
+    std::basic_string<wchar_t, Traits>
+    from(Rep n)
+    {
+        auto s = std::to_wstring(n);
+        return std::basic_string<wchar_t, Traits>(s.begin(), s.end());
+    }
+};
+
+}  // namespace detail
+
+// to_stream
+
+CONSTDATA year nanyear{-32768};
+
+template <class Duration>
+struct fields
+{
+    year_month_day        ymd{nanyear/0/0};
+    weekday               wd{8u};
+    hh_mm_ss<Duration>    tod{};
+    bool                  has_tod = false;
+
+#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ <= 409)
+    fields() : ymd{nanyear/0/0}, wd{8u}, tod{}, has_tod{false} {}
+#else
+    fields() = default;
+#endif
+
+    fields(year_month_day ymd_) : ymd(ymd_) {}
+    fields(weekday wd_) : wd(wd_) {}
+    fields(hh_mm_ss<Duration> tod_) : tod(tod_), has_tod(true) {}
+
+    fields(year_month_day ymd_, weekday wd_) : ymd(ymd_), wd(wd_) {}
+    fields(year_month_day ymd_, hh_mm_ss<Duration> tod_) : ymd(ymd_), tod(tod_),
+                                                           has_tod(true) {}
+
+    fields(weekday wd_, hh_mm_ss<Duration> tod_) : wd(wd_), tod(tod_), has_tod(true) {}
+
+    fields(year_month_day ymd_, weekday wd_, hh_mm_ss<Duration> tod_)
+        : ymd(ymd_)
+        , wd(wd_)
+        , tod(tod_)
+        , has_tod(true)
+        {}
+};
+
+namespace detail
+{
+
+template <class CharT, class Traits, class Duration>
+unsigned
+extract_weekday(std::basic_ostream<CharT, Traits>& os, const fields<Duration>& fds)
+{
+    if (!fds.ymd.ok() && !fds.wd.ok())
+    {
+        // fds does not contain a valid weekday
+        os.setstate(std::ios::failbit);
+        return 8;
+    }
+    weekday wd;
+    if (fds.ymd.ok())
+    {
+        wd = weekday{sys_days(fds.ymd)};
+        if (fds.wd.ok() && wd != fds.wd)
+        {
+            // fds.ymd and fds.wd are inconsistent
+            os.setstate(std::ios::failbit);
+            return 8;
+        }
+    }
+    else
+        wd = fds.wd;
+    return static_cast<unsigned>((wd - Sunday).count());
+}
+
+template <class CharT, class Traits, class Duration>
+unsigned
+extract_month(std::basic_ostream<CharT, Traits>& os, const fields<Duration>& fds)
+{
+    if (!fds.ymd.month().ok())
+    {
+        // fds does not contain a valid month
+        os.setstate(std::ios::failbit);
+        return 0;
+    }
+    return static_cast<unsigned>(fds.ymd.month());
+}
+
+}  // namespace detail
+
+#if ONLY_C_LOCALE
+
+namespace detail
+{
+
+inline
+std::pair<const std::string*, const std::string*>
+weekday_names()
+{
+    static const std::string nm[] =
+    {
+        "Sunday",
+        "Monday",
+        "Tuesday",
+        "Wednesday",
+        "Thursday",
+        "Friday",
+        "Saturday",
+        "Sun",
+        "Mon",
+        "Tue",
+        "Wed",
+        "Thu",
+        "Fri",
+        "Sat"
+    };
+    return std::make_pair(nm, nm+sizeof(nm)/sizeof(nm[0]));
+}
+
+inline
+std::pair<const std::string*, const std::string*>
+month_names()
+{
+    static const std::string nm[] =
+    {
+        "January",
+        "February",
+        "March",
+        "April",
+        "May",
+        "June",
+        "July",
+        "August",
+        "September",
+        "October",
+        "November",
+        "December",
+        "Jan",
+        "Feb",
+        "Mar",
+        "Apr",
+        "May",
+        "Jun",
+        "Jul",
+        "Aug",
+        "Sep",
+        "Oct",
+        "Nov",
+        "Dec"
+    };
+    return std::make_pair(nm, nm+sizeof(nm)/sizeof(nm[0]));
+}
+
+inline
+std::pair<const std::string*, const std::string*>
+ampm_names()
+{
+    static const std::string nm[] =
+    {
+        "AM",
+        "PM"
+    };
+    return std::make_pair(nm, nm+sizeof(nm)/sizeof(nm[0]));
+}
+
+template <class CharT, class Traits, class FwdIter>
+FwdIter
+scan_keyword(std::basic_istream<CharT, Traits>& is, FwdIter kb, FwdIter ke)
+{
+    size_t nkw = static_cast<size_t>(std::distance(kb, ke));
+    const unsigned char doesnt_match = '\0';
+    const unsigned char might_match = '\1';
+    const unsigned char does_match = '\2';
+    unsigned char statbuf[100];
+    unsigned char* status = statbuf;
+    std::unique_ptr<unsigned char, void(*)(void*)> stat_hold(0, free);
+    if (nkw > sizeof(statbuf))
+    {
+        status = (unsigned char*)std::malloc(nkw);
+        if (status == nullptr)
+            throw std::bad_alloc();
+        stat_hold.reset(status);
+    }
+    size_t n_might_match = nkw;  // At this point, any keyword might match
+    size_t n_does_match = 0;     // but none of them definitely do
+    // Initialize all statuses to might_match, except for "" keywords are does_match
+    unsigned char* st = status;
+    for (auto ky = kb; ky != ke; ++ky, ++st)
+    {
+        if (!ky->empty())
+            *st = might_match;
+        else
+        {
+            *st = does_match;
+            --n_might_match;
+            ++n_does_match;
+        }
+    }
+    // While there might be a match, test keywords against the next CharT
+    for (size_t indx = 0; is && n_might_match > 0; ++indx)
+    {
+        // Peek at the next CharT but don't consume it
+        auto ic = is.peek();
+        if (ic == EOF)
+        {
+            is.setstate(std::ios::eofbit);
+            break;
+        }
+        auto c = static_cast<char>(toupper(static_cast<unsigned char>(ic)));
+        bool consume = false;
+        // For each keyword which might match, see if the indx character is c
+        // If a match if found, consume c
+        // If a match is found, and that is the last character in the keyword,
+        //    then that keyword matches.
+        // If the keyword doesn't match this character, then change the keyword
+        //    to doesn't match
+        st = status;
+        for (auto ky = kb; ky != ke; ++ky, ++st)
+        {
+            if (*st == might_match)
+            {
+                if (c == static_cast<char>(toupper(static_cast<unsigned char>((*ky)[indx]))))
+                {
+                    consume = true;
+                    if (ky->size() == indx+1)
+                    {
+                        *st = does_match;
+                        --n_might_match;
+                        ++n_does_match;
+                    }
+                }
+                else
+                {
+                    *st = doesnt_match;
+                    --n_might_match;
+                }
+            }
+        }
+        // consume if we matched a character
+        if (consume)
+        {
+            (void)is.get();
+            // If we consumed a character and there might be a matched keyword that
+            //   was marked matched on a previous iteration, then such keywords
+            //   are now marked as not matching.
+            if (n_might_match + n_does_match > 1)
+            {
+                st = status;
+                for (auto ky = kb; ky != ke; ++ky, ++st)
+                {
+                    if (*st == does_match && ky->size() != indx+1)
+                    {
+                        *st = doesnt_match;
+                        --n_does_match;
+                    }
+                }
+            }
+        }
+    }
+    // We've exited the loop because we hit eof and/or we have no more "might matches".
+    // Return the first matching result
+    for (st = status; kb != ke; ++kb, ++st)
+        if (*st == does_match)
+            break;
+    if (kb == ke)
+        is.setstate(std::ios::failbit);
+    return kb;
+}
+
+}  // namespace detail
+
+#endif  // ONLY_C_LOCALE
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const fields<Duration>& fds, const std::string* abbrev,
+          const std::chrono::seconds* offset_sec)
+{
+#if ONLY_C_LOCALE
+    using detail::weekday_names;
+    using detail::month_names;
+    using detail::ampm_names;
+#endif
+    using detail::save_ostream;
+    using detail::get_units;
+    using detail::extract_weekday;
+    using detail::extract_month;
+    using std::ios;
+    using std::chrono::duration_cast;
+    using std::chrono::seconds;
+    using std::chrono::minutes;
+    using std::chrono::hours;
+    date::detail::save_ostream<CharT, Traits> ss(os);
+    os.fill(' ');
+    os.flags(std::ios::skipws | std::ios::dec);
+    os.width(0);
+    tm tm{};
+    bool insert_negative = fds.has_tod && fds.tod.to_duration() < Duration::zero();
+#if !ONLY_C_LOCALE
+    auto& facet = std::use_facet<std::time_put<CharT>>(os.getloc());
+#endif
+    const CharT* command = nullptr;
+    CharT modified = CharT{};
+    for (; *fmt; ++fmt)
+    {
+        switch (*fmt)
+        {
+        case 'a':
+        case 'A':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+                    if (os.fail())
+                        return os;
+#if !ONLY_C_LOCALE
+                    const CharT f[] = {'%', *fmt};
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else  // ONLY_C_LOCALE
+                    os << weekday_names().first[tm.tm_wday+7*(*fmt == 'a')];
+#endif  // ONLY_C_LOCALE
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'b':
+        case 'B':
+        case 'h':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    tm.tm_mon = static_cast<int>(extract_month(os, fds)) - 1;
+#if !ONLY_C_LOCALE
+                    const CharT f[] = {'%', *fmt};
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else  // ONLY_C_LOCALE
+                    os << month_names().first[tm.tm_mon+12*(*fmt != 'B')];
+#endif  // ONLY_C_LOCALE
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'c':
+        case 'x':
+            if (command)
+            {
+                if (modified == CharT{'O'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    if (*fmt == 'c' && !fds.has_tod)
+                        os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+                    tm = std::tm{};
+                    auto const& ymd = fds.ymd;
+                    auto ld = local_days(ymd);
+                    if (*fmt == 'c')
+                    {
+                        tm.tm_sec = static_cast<int>(fds.tod.seconds().count());
+                        tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+                        tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+                    }
+                    tm.tm_mday = static_cast<int>(static_cast<unsigned>(ymd.day()));
+                    tm.tm_mon = static_cast<int>(extract_month(os, fds) - 1);
+                    tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+                    tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+                    if (os.fail())
+                        return os;
+                    tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+                    CharT f[3] = {'%'};
+                    auto fe = std::begin(f) + 1;
+                    if (modified == CharT{'E'})
+                        *fe++ = modified;
+                    *fe++ = *fmt;
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), fe);
+#else  // ONLY_C_LOCALE
+                    if (*fmt == 'c')
+                    {
+                        auto wd = static_cast<int>(extract_weekday(os, fds));
+                        os << weekday_names().first[static_cast<unsigned>(wd)+7]
+                           << ' ';
+                        os << month_names().first[extract_month(os, fds)-1+12] << ' ';
+                        auto d = static_cast<int>(static_cast<unsigned>(fds.ymd.day()));
+                        if (d < 10)
+                            os << ' ';
+                        os << d << ' '
+                           << make_time(duration_cast<seconds>(fds.tod.to_duration()))
+                           << ' ' << fds.ymd.year();
+
+                    }
+                    else  // *fmt == 'x'
+                    {
+                        auto const& ymd = fds.ymd;
+                        save_ostream<CharT, Traits> _(os);
+                        os.fill('0');
+                        os.flags(std::ios::dec | std::ios::right);
+                        os.width(2);
+                        os << static_cast<unsigned>(ymd.month()) << CharT{'/'};
+                        os.width(2);
+                        os << static_cast<unsigned>(ymd.day()) << CharT{'/'};
+                        os.width(2);
+                        os << static_cast<int>(ymd.year()) % 100;
+                    }
+#endif  // ONLY_C_LOCALE
+                }
+                command = nullptr;
+                modified = CharT{};
+            }
+            else
+                os << *fmt;
+            break;
+        case 'C':
+            if (command)
+            {
+                if (modified == CharT{'O'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.ymd.year().ok())
+                        os.setstate(std::ios::failbit);
+                    auto y = static_cast<int>(fds.ymd.year());
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        save_ostream<CharT, Traits> _(os);
+                        os.fill('0');
+                        os.flags(std::ios::dec | std::ios::right);
+                        if (y >= 0)
+                        {
+                            os.width(2);
+                            os << y/100;
+                        }
+                        else
+                        {
+                            os << CharT{'-'};
+                            os.width(2);
+                            os << -(y-99)/100;
+                        }
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'E'})
+                    {
+                        tm.tm_year = y - 1900;
+                        CharT f[3] = {'%', 'E', 'C'};
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                command = nullptr;
+                modified = CharT{};
+            }
+            else
+                os << *fmt;
+            break;
+        case 'd':
+        case 'e':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.ymd.day().ok())
+                        os.setstate(std::ios::failbit);
+                    auto d = static_cast<int>(static_cast<unsigned>(fds.ymd.day()));
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        save_ostream<CharT, Traits> _(os);
+                        if (*fmt == CharT{'d'})
+                            os.fill('0');
+                        else
+                            os.fill(' ');
+                        os.flags(std::ios::dec | std::ios::right);
+                        os.width(2);
+                        os << d;
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        tm.tm_mday = d;
+                        CharT f[3] = {'%', 'O', *fmt};
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                command = nullptr;
+                modified = CharT{};
+            }
+            else
+                os << *fmt;
+            break;
+        case 'D':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    auto const& ymd = fds.ymd;
+                    save_ostream<CharT, Traits> _(os);
+                    os.fill('0');
+                    os.flags(std::ios::dec | std::ios::right);
+                    os.width(2);
+                    os << static_cast<unsigned>(ymd.month()) << CharT{'/'};
+                    os.width(2);
+                    os << static_cast<unsigned>(ymd.day()) << CharT{'/'};
+                    os.width(2);
+                    os << static_cast<int>(ymd.year()) % 100;
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'F':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    auto const& ymd = fds.ymd;
+                    save_ostream<CharT, Traits> _(os);
+                    os.imbue(std::locale::classic());
+                    os.fill('0');
+                    os.flags(std::ios::dec | std::ios::right);
+                    os.width(4);
+                    os << static_cast<int>(ymd.year()) << CharT{'-'};
+                    os.width(2);
+                    os << static_cast<unsigned>(ymd.month()) << CharT{'-'};
+                    os.width(2);
+                    os << static_cast<unsigned>(ymd.day());
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'g':
+        case 'G':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    auto ld = local_days(fds.ymd);
+                    auto y = year_month_day{ld + days{3}}.year();
+                    auto start = local_days((y-years{1})/December/Thursday[last]) +
+                                 (Monday-Thursday);
+                    if (ld < start)
+                        --y;
+                    if (*fmt == CharT{'G'})
+                        os << y;
+                    else
+                    {
+                        save_ostream<CharT, Traits> _(os);
+                        os.fill('0');
+                        os.flags(std::ios::dec | std::ios::right);
+                        os.width(2);
+                        os << std::abs(static_cast<int>(y)) % 100;
+                    }
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'H':
+        case 'I':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+                    if (insert_negative)
+                    {
+                        os << '-';
+                        insert_negative = false;
+                    }
+                    auto hms = fds.tod;
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        auto h = *fmt == CharT{'I'} ? date::make12(hms.hours()) : hms.hours();
+                        if (h < hours{10})
+                            os << CharT{'0'};
+                        os << h.count();
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_hour = static_cast<int>(hms.hours().count());
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'j':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (fds.ymd.ok() || fds.has_tod)
+                    {
+                        days doy;
+                        if (fds.ymd.ok())
+                        {
+                            auto ld = local_days(fds.ymd);
+                            auto y = fds.ymd.year();
+                            doy = ld - local_days(y/January/1) + days{1};
+                        }
+                        else
+                        {
+                            doy = duration_cast<days>(fds.tod.to_duration());
+                        }
+                        save_ostream<CharT, Traits> _(os);
+                        os.fill('0');
+                        os.flags(std::ios::dec | std::ios::right);
+                        os.width(3);
+                        os << doy.count();
+                    }
+                    else
+                    {
+                        os.setstate(std::ios::failbit);
+                    }
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'm':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.ymd.month().ok())
+                        os.setstate(std::ios::failbit);
+                    auto m = static_cast<unsigned>(fds.ymd.month());
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        if (m < 10)
+                            os << CharT{'0'};
+                        os << m;
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_mon = static_cast<int>(m-1);
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'M':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+                    if (insert_negative)
+                    {
+                        os << '-';
+                        insert_negative = false;
+                    }
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        if (fds.tod.minutes() < minutes{10})
+                            os << CharT{'0'};
+                        os << fds.tod.minutes().count();
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'n':
+            if (command)
+            {
+                if (modified == CharT{})
+                    os << CharT{'\n'};
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'p':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+                    const CharT f[] = {'%', *fmt};
+                    tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else
+                    if (date::is_am(fds.tod.hours()))
+                        os << ampm_names().first[0];
+                    else
+                        os << ampm_names().first[1];
+#endif
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'Q':
+        case 'q':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+                    auto d = fds.tod.to_duration();
+                    if (*fmt == 'q')
+                        os << get_units<CharT>(typename decltype(d)::period::type{});
+                    else
+                        os << d.count();
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'r':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+                    const CharT f[] = {'%', *fmt};
+                    tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+                    tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+                    tm.tm_sec = static_cast<int>(fds.tod.seconds().count());
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+#else
+                    hh_mm_ss<seconds> tod(duration_cast<seconds>(fds.tod.to_duration()));
+                    save_ostream<CharT, Traits> _(os);
+                    os.fill('0');
+                    os.width(2);
+                    os << date::make12(tod.hours()).count() << CharT{':'};
+                    os.width(2);
+                    os << tod.minutes().count() << CharT{':'};
+                    os.width(2);
+                    os << tod.seconds().count() << CharT{' '};
+                    if (date::is_am(tod.hours()))
+                        os << ampm_names().first[0];
+                    else
+                        os << ampm_names().first[1];
+#endif
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'R':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+                    if (fds.tod.hours() < hours{10})
+                        os << CharT{'0'};
+                    os << fds.tod.hours().count() << CharT{':'};
+                    if (fds.tod.minutes() < minutes{10})
+                        os << CharT{'0'};
+                    os << fds.tod.minutes().count();
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'S':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+                    if (insert_negative)
+                    {
+                        os << '-';
+                        insert_negative = false;
+                    }
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        os << fds.tod.s_;
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_sec = static_cast<int>(fds.tod.s_.seconds().count());
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 't':
+            if (command)
+            {
+                if (modified == CharT{})
+                    os << CharT{'\t'};
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'T':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+                    os << fds.tod;
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'u':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    auto wd = extract_weekday(os, fds);
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        os << (wd != 0 ? wd : 7u);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_wday = static_cast<int>(wd);
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'U':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    auto const& ymd = fds.ymd;
+                    if (!ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    auto ld = local_days(ymd);
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        auto st = local_days(Sunday[1]/January/ymd.year());
+                        if (ld < st)
+                            os << CharT{'0'} << CharT{'0'};
+                        else
+                        {
+                            auto wn = duration_cast<weeks>(ld - st).count() + 1;
+                            if (wn < 10)
+                                os << CharT{'0'};
+                            os << wn;
+                        }
+                   }
+ #if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+                        tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+                        if (os.fail())
+                            return os;
+                        tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'V':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    auto ld = local_days(fds.ymd);
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        auto y = year_month_day{ld + days{3}}.year();
+                        auto st = local_days((y-years{1})/12/Thursday[last]) +
+                                  (Monday-Thursday);
+                        if (ld < st)
+                        {
+                            --y;
+                            st = local_days((y - years{1})/12/Thursday[last]) +
+                                 (Monday-Thursday);
+                        }
+                        auto wn = duration_cast<weeks>(ld - st).count() + 1;
+                        if (wn < 10)
+                            os << CharT{'0'};
+                        os << wn;
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        auto const& ymd = fds.ymd;
+                        tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+                        tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+                        if (os.fail())
+                            return os;
+                        tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'w':
+            if (command)
+            {
+                auto wd = extract_weekday(os, fds);
+                if (os.fail())
+                    return os;
+#if !ONLY_C_LOCALE
+                if (modified == CharT{})
+#else
+                if (modified != CharT{'E'})
+#endif
+                {
+                    os << wd;
+                }
+#if !ONLY_C_LOCALE
+                else if (modified == CharT{'O'})
+                {
+                    const CharT f[] = {'%', modified, *fmt};
+                    tm.tm_wday = static_cast<int>(wd);
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                }
+#endif
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'W':
+            if (command)
+            {
+                if (modified == CharT{'E'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    auto const& ymd = fds.ymd;
+                    if (!ymd.ok())
+                        os.setstate(std::ios::failbit);
+                    auto ld = local_days(ymd);
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        auto st = local_days(Monday[1]/January/ymd.year());
+                        if (ld < st)
+                            os << CharT{'0'} << CharT{'0'};
+                        else
+                        {
+                            auto wn = duration_cast<weeks>(ld - st).count() + 1;
+                            if (wn < 10)
+                                os << CharT{'0'};
+                            os << wn;
+                        }
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_year = static_cast<int>(ymd.year()) - 1900;
+                        tm.tm_wday = static_cast<int>(extract_weekday(os, fds));
+                        if (os.fail())
+                            return os;
+                        tm.tm_yday = static_cast<int>((ld - local_days(ymd.year()/1/1)).count());
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'X':
+            if (command)
+            {
+                if (modified == CharT{'O'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.has_tod)
+                        os.setstate(std::ios::failbit);
+#if !ONLY_C_LOCALE
+                    tm = std::tm{};
+                    tm.tm_sec = static_cast<int>(fds.tod.seconds().count());
+                    tm.tm_min = static_cast<int>(fds.tod.minutes().count());
+                    tm.tm_hour = static_cast<int>(fds.tod.hours().count());
+                    CharT f[3] = {'%'};
+                    auto fe = std::begin(f) + 1;
+                    if (modified == CharT{'E'})
+                        *fe++ = modified;
+                    *fe++ = *fmt;
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), fe);
+#else
+                    os << fds.tod;
+#endif
+                }
+                command = nullptr;
+                modified = CharT{};
+            }
+            else
+                os << *fmt;
+            break;
+        case 'y':
+            if (command)
+            {
+                if (!fds.ymd.year().ok())
+                    os.setstate(std::ios::failbit);
+                auto y = static_cast<int>(fds.ymd.year());
+#if !ONLY_C_LOCALE
+                if (modified == CharT{})
+                {
+#endif
+                    y = std::abs(y) % 100;
+                    if (y < 10)
+                        os << CharT{'0'};
+                    os << y;
+#if !ONLY_C_LOCALE
+                }
+                else
+                {
+                    const CharT f[] = {'%', modified, *fmt};
+                    tm.tm_year = y - 1900;
+                    facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                }
+#endif
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'Y':
+            if (command)
+            {
+                if (modified == CharT{'O'})
+                    os << CharT{'%'} << modified << *fmt;
+                else
+                {
+                    if (!fds.ymd.year().ok())
+                        os.setstate(std::ios::failbit);
+                    auto y = fds.ymd.year();
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        save_ostream<CharT, Traits> _(os);
+                        os.imbue(std::locale::classic());
+                        os << y;
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'E'})
+                    {
+                        const CharT f[] = {'%', modified, *fmt};
+                        tm.tm_year = static_cast<int>(y) - 1900;
+                        facet.put(os, os, os.fill(), &tm, std::begin(f), std::end(f));
+                    }
+#endif
+                }
+                modified = CharT{};
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'z':
+            if (command)
+            {
+                if (offset_sec == nullptr)
+                {
+                    // Can not format %z with unknown offset
+                    os.setstate(ios::failbit);
+                    return os;
+                }
+                auto m = duration_cast<minutes>(*offset_sec);
+                auto neg = m < minutes{0};
+                m = date::abs(m);
+                auto h = duration_cast<hours>(m);
+                m -= h;
+                if (neg)
+                    os << CharT{'-'};
+                else
+                    os << CharT{'+'};
+                if (h < hours{10})
+                    os << CharT{'0'};
+                os << h.count();
+                if (modified != CharT{})
+                    os << CharT{':'};
+                if (m < minutes{10})
+                    os << CharT{'0'};
+                os << m.count();
+                command = nullptr;
+                modified = CharT{};
+            }
+            else
+                os << *fmt;
+            break;
+        case 'Z':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    if (abbrev == nullptr)
+                    {
+                        // Can not format %Z with unknown time_zone
+                        os.setstate(ios::failbit);
+                        return os;
+                    }
+                    for (auto c : *abbrev)
+                        os << CharT(c);
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    modified = CharT{};
+                }
+                command = nullptr;
+            }
+            else
+                os << *fmt;
+            break;
+        case 'E':
+        case 'O':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    modified = *fmt;
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << *fmt;
+                    command = nullptr;
+                    modified = CharT{};
+                }
+            }
+            else
+                os << *fmt;
+            break;
+        case '%':
+            if (command)
+            {
+                if (modified == CharT{})
+                {
+                    os << CharT{'%'};
+                    command = nullptr;
+                }
+                else
+                {
+                    os << CharT{'%'} << modified << CharT{'%'};
+                    command = nullptr;
+                    modified = CharT{};
+                }
+            }
+            else
+                command = fmt;
+            break;
+        default:
+            if (command)
+            {
+                os << CharT{'%'};
+                command = nullptr;
+            }
+            if (modified != CharT{})
+            {
+                os << modified;
+                modified = CharT{};
+            }
+            os << *fmt;
+            break;
+        }
+    }
+    if (command)
+        os << CharT{'%'};
+    if (modified != CharT{})
+        os << modified;
+    return os;
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const year& y)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{y/0/0};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const month& m)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{m/0/nanyear};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const day& d)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{d/0/nanyear};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const weekday& wd)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{wd};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const year_month& ym)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{ym/0};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt, const month_day& md)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{md/nanyear};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const year_month_day& ymd)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{ymd};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits, class Rep, class Period>
+inline
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const std::chrono::duration<Rep, Period>& d)
+{
+    using Duration = std::chrono::duration<Rep, Period>;
+    using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+    fields<CT> fds{hh_mm_ss<CT>{d}};
+    return to_stream(os, fmt, fds);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const local_time<Duration>& tp, const std::string* abbrev = nullptr,
+          const std::chrono::seconds* offset_sec = nullptr)
+{
+    using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+    auto ld = std::chrono::time_point_cast<days>(tp);
+    fields<CT> fds;
+    if (ld <= tp)
+        fds = fields<CT>{year_month_day{ld}, hh_mm_ss<CT>{tp-local_seconds{ld}}};
+    else
+        fds = fields<CT>{year_month_day{ld - days{1}},
+                         hh_mm_ss<CT>{days{1} - (local_seconds{ld} - tp)}};
+    return to_stream(os, fmt, fds, abbrev, offset_sec);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const sys_time<Duration>& tp)
+{
+    using std::chrono::seconds;
+    using CT = typename std::common_type<Duration, seconds>::type;
+    const std::string abbrev("UTC");
+    CONSTDATA seconds offset{0};
+    auto sd = std::chrono::time_point_cast<days>(tp);
+    fields<CT> fds;
+    if (sd <= tp)
+        fds = fields<CT>{year_month_day{sd}, hh_mm_ss<CT>{tp-sys_seconds{sd}}};
+    else
+        fds = fields<CT>{year_month_day{sd - days{1}},
+                         hh_mm_ss<CT>{days{1} - (sys_seconds{sd} - tp)}};
+    return to_stream(os, fmt, fds, &abbrev, &offset);
+}
+
+// format
+
+template <class CharT, class Streamable>
+auto
+format(const std::locale& loc, const CharT* fmt, const Streamable& tp)
+    -> decltype(to_stream(std::declval<std::basic_ostream<CharT>&>(), fmt, tp),
+                std::basic_string<CharT>{})
+{
+    std::basic_ostringstream<CharT> os;
+    os.exceptions(std::ios::failbit | std::ios::badbit);
+    os.imbue(loc);
+    to_stream(os, fmt, tp);
+    return os.str();
+}
+
+template <class CharT, class Streamable>
+auto
+format(const CharT* fmt, const Streamable& tp)
+    -> decltype(to_stream(std::declval<std::basic_ostream<CharT>&>(), fmt, tp),
+                std::basic_string<CharT>{})
+{
+    std::basic_ostringstream<CharT> os;
+    os.exceptions(std::ios::failbit | std::ios::badbit);
+    to_stream(os, fmt, tp);
+    return os.str();
+}
+
+template <class CharT, class Traits, class Alloc, class Streamable>
+auto
+format(const std::locale& loc, const std::basic_string<CharT, Traits, Alloc>& fmt,
+       const Streamable& tp)
+    -> decltype(to_stream(std::declval<std::basic_ostream<CharT, Traits>&>(), fmt.c_str(), tp),
+                std::basic_string<CharT, Traits, Alloc>{})
+{
+    std::basic_ostringstream<CharT, Traits, Alloc> os;
+    os.exceptions(std::ios::failbit | std::ios::badbit);
+    os.imbue(loc);
+    to_stream(os, fmt.c_str(), tp);
+    return os.str();
+}
+
+template <class CharT, class Traits, class Alloc, class Streamable>
+auto
+format(const std::basic_string<CharT, Traits, Alloc>& fmt, const Streamable& tp)
+    -> decltype(to_stream(std::declval<std::basic_ostream<CharT, Traits>&>(), fmt.c_str(), tp),
+                std::basic_string<CharT, Traits, Alloc>{})
+{
+    std::basic_ostringstream<CharT, Traits, Alloc> os;
+    os.exceptions(std::ios::failbit | std::ios::badbit);
+    to_stream(os, fmt.c_str(), tp);
+    return os.str();
+}
+
+// parse
+
+namespace detail
+{
+
+template <class CharT, class Traits>
+bool
+read_char(std::basic_istream<CharT, Traits>& is, CharT fmt, std::ios::iostate& err)
+{
+    auto ic = is.get();
+    if (Traits::eq_int_type(ic, Traits::eof()) ||
+       !Traits::eq(Traits::to_char_type(ic), fmt))
+    {
+        err |= std::ios::failbit;
+        is.setstate(std::ios::failbit);
+        return false;
+    }
+    return true;
+}
+
+template <class CharT, class Traits>
+unsigned
+read_unsigned(std::basic_istream<CharT, Traits>& is, unsigned m = 1, unsigned M = 10)
+{
+    unsigned x = 0;
+    unsigned count = 0;
+    while (true)
+    {
+        auto ic = is.peek();
+        if (Traits::eq_int_type(ic, Traits::eof()))
+            break;
+        auto c = static_cast<char>(Traits::to_char_type(ic));
+        if (!('0' <= c && c <= '9'))
+            break;
+        (void)is.get();
+        ++count;
+        x = 10*x + static_cast<unsigned>(c - '0');
+        if (count == M)
+            break;
+    }
+    if (count < m)
+        is.setstate(std::ios::failbit);
+    return x;
+}
+
+template <class CharT, class Traits>
+int
+read_signed(std::basic_istream<CharT, Traits>& is, unsigned m = 1, unsigned M = 10)
+{
+    auto ic = is.peek();
+    if (!Traits::eq_int_type(ic, Traits::eof()))
+    {
+        auto c = static_cast<char>(Traits::to_char_type(ic));
+        if (('0' <= c && c <= '9') || c == '-' || c == '+')
+        {
+            if (c == '-' || c == '+')
+            {
+                (void)is.get();
+                --M;
+            }
+            auto x = static_cast<int>(read_unsigned(is, std::max(m, 1u), M));
+            if (!is.fail())
+            {
+                if (c == '-')
+                    x = -x;
+                return x;
+            }
+        }
+    }
+    if (m > 0)
+        is.setstate(std::ios::failbit);
+    return 0;
+}
+
+template <class CharT, class Traits>
+long double
+read_long_double(std::basic_istream<CharT, Traits>& is, unsigned m = 1, unsigned M = 10)
+{
+    unsigned count = 0;
+    unsigned fcount = 0;
+    unsigned long long i = 0;
+    unsigned long long f = 0;
+    bool parsing_fraction = false;
+#if ONLY_C_LOCALE
+    typename Traits::int_type decimal_point = '.';
+#else
+    auto decimal_point = Traits::to_int_type(
+        std::use_facet<std::numpunct<CharT>>(is.getloc()).decimal_point());
+#endif
+    while (true)
+    {
+        auto ic = is.peek();
+        if (Traits::eq_int_type(ic, Traits::eof()))
+            break;
+        if (Traits::eq_int_type(ic, decimal_point))
+        {
+            decimal_point = Traits::eof();
+            parsing_fraction = true;
+        }
+        else
+        {
+            auto c = static_cast<char>(Traits::to_char_type(ic));
+            if (!('0' <= c && c <= '9'))
+                break;
+            if (!parsing_fraction)
+            {
+                i = 10*i + static_cast<unsigned>(c - '0');
+            }
+            else
+            {
+                f = 10*f + static_cast<unsigned>(c - '0');
+                ++fcount;
+            }
+        }
+        (void)is.get();
+        if (++count == M)
+            break;
+    }
+    if (count < m)
+    {
+        is.setstate(std::ios::failbit);
+        return 0;
+    }
+    return static_cast<long double>(i) + static_cast<long double>(f)/std::pow(10.L, fcount);
+}
+
+struct rs
+{
+    int& i;
+    unsigned m;
+    unsigned M;
+};
+
+struct ru
+{
+    int& i;
+    unsigned m;
+    unsigned M;
+};
+
+struct rld
+{
+    long double& i;
+    unsigned m;
+    unsigned M;
+};
+
+template <class CharT, class Traits>
+void
+read(std::basic_istream<CharT, Traits>&)
+{
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, CharT a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rs a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, ru a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, int a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rld a0, Args&& ...args);
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, CharT a0, Args&& ...args)
+{
+    // No-op if a0 == CharT{}
+    if (a0 != CharT{})
+    {
+        auto ic = is.peek();
+        if (Traits::eq_int_type(ic, Traits::eof()))
+        {
+            is.setstate(std::ios::failbit | std::ios::eofbit);
+            return;
+        }
+        if (!Traits::eq(Traits::to_char_type(ic), a0))
+        {
+            is.setstate(std::ios::failbit);
+            return;
+        }
+        (void)is.get();
+    }
+    read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rs a0, Args&& ...args)
+{
+    auto x = read_signed(is, a0.m, a0.M);
+    if (is.fail())
+        return;
+    a0.i = x;
+    read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, ru a0, Args&& ...args)
+{
+    auto x = read_unsigned(is, a0.m, a0.M);
+    if (is.fail())
+        return;
+    a0.i = static_cast<int>(x);
+    read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, int a0, Args&& ...args)
+{
+    if (a0 != -1)
+    {
+        auto u = static_cast<unsigned>(a0);
+        CharT buf[std::numeric_limits<unsigned>::digits10+2u] = {};
+        auto e = buf;
+        do
+        {
+            *e++ = static_cast<CharT>(CharT(u % 10) + CharT{'0'});
+            u /= 10;
+        } while (u > 0);
+#if defined(__GNUC__) && __GNUC__ >= 11
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+        std::reverse(buf, e);
+#if defined(__GNUC__) && __GNUC__ >= 11
+#pragma GCC diagnostic pop
+#endif
+        for (auto p = buf; p != e && is.rdstate() == std::ios::goodbit; ++p)
+            read(is, *p);
+    }
+    if (is.rdstate() == std::ios::goodbit)
+        read(is, std::forward<Args>(args)...);
+}
+
+template <class CharT, class Traits, class ...Args>
+void
+read(std::basic_istream<CharT, Traits>& is, rld a0, Args&& ...args)
+{
+    auto x = read_long_double(is, a0.m, a0.M);
+    if (is.fail())
+        return;
+    a0.i = x;
+    read(is, std::forward<Args>(args)...);
+}
+
+template <class T, class CharT, class Traits>
+inline
+void
+checked_set(T& value, T from, T not_a_value, std::basic_ios<CharT, Traits>& is)
+{
+    if (!is.fail())
+    {
+        if (value == not_a_value)
+            value = std::move(from);
+        else if (value != from)
+            is.setstate(std::ios::failbit);
+    }
+}
+
+}  // namespace detail;
+
+template <class CharT, class Traits, class Duration, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            fields<Duration>& fds, std::basic_string<CharT, Traits, Alloc>* abbrev,
+            std::chrono::minutes* offset)
+{
+    using std::numeric_limits;
+    using std::ios;
+    using std::chrono::duration;
+    using std::chrono::duration_cast;
+    using std::chrono::seconds;
+    using std::chrono::minutes;
+    using std::chrono::hours;
+    using detail::round_i;
+    typename std::basic_istream<CharT, Traits>::sentry ok{is, true};
+    if (ok)
+    {
+        date::detail::save_istream<CharT, Traits> ss(is);
+        is.fill(' ');
+        is.flags(std::ios::skipws | std::ios::dec);
+        is.width(0);
+#if !ONLY_C_LOCALE
+        auto& f = std::use_facet<std::time_get<CharT>>(is.getloc());
+        std::tm tm{};
+#endif
+        const CharT* command = nullptr;
+        auto modified = CharT{};
+        auto width = -1;
+
+        CONSTDATA int not_a_year = numeric_limits<short>::min();
+        CONSTDATA int not_a_2digit_year = 100;
+        CONSTDATA int not_a_century = numeric_limits<int>::min();
+        CONSTDATA int not_a_month = 0;
+        CONSTDATA int not_a_day = 0;
+        CONSTDATA int not_a_hour = numeric_limits<int>::min();
+        CONSTDATA int not_a_hour_12_value = 0;
+        CONSTDATA int not_a_minute = not_a_hour;
+        CONSTDATA Duration not_a_second = Duration::min();
+        CONSTDATA int not_a_doy = -1;
+        CONSTDATA int not_a_weekday = 8;
+        CONSTDATA int not_a_week_num = 100;
+        CONSTDATA int not_a_ampm = -1;
+        CONSTDATA minutes not_a_offset = minutes::min();
+
+        int Y = not_a_year;             // c, F, Y                   *
+        int y = not_a_2digit_year;      // D, x, y                   *
+        int g = not_a_2digit_year;      // g                         *
+        int G = not_a_year;             // G                         *
+        int C = not_a_century;          // C                         *
+        int m = not_a_month;            // b, B, h, m, c, D, F, x    *
+        int d = not_a_day;              // c, d, D, e, F, x          *
+        int j = not_a_doy;              // j                         *
+        int wd = not_a_weekday;         // a, A, u, w                *
+        int H = not_a_hour;             // c, H, R, T, X             *
+        int I = not_a_hour_12_value;    // I, r                      *
+        int p = not_a_ampm;             // p, r                      *
+        int M = not_a_minute;           // c, M, r, R, T, X          *
+        Duration s = not_a_second;      // c, r, S, T, X             *
+        int U = not_a_week_num;         // U                         *
+        int V = not_a_week_num;         // V                         *
+        int W = not_a_week_num;         // W                         *
+        std::basic_string<CharT, Traits, Alloc> temp_abbrev;  // Z   *
+        minutes temp_offset = not_a_offset;  // z                    *
+
+        using detail::read;
+        using detail::rs;
+        using detail::ru;
+        using detail::rld;
+        using detail::checked_set;
+        for (; *fmt != CharT{} && !is.fail(); ++fmt)
+        {
+            switch (*fmt)
+            {
+            case 'a':
+            case 'A':
+            case 'u':
+            case 'w':  // wd:  a, A, u, w
+                if (command)
+                {
+                    int trial_wd = not_a_weekday;
+                    if (*fmt == 'a' || *fmt == 'A')
+                    {
+                        if (modified == CharT{})
+                        {
+#if !ONLY_C_LOCALE
+                            ios::iostate err = ios::goodbit;
+                            f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                            is.setstate(err);
+                            if (!is.fail())
+                                trial_wd = tm.tm_wday;
+#else
+                            auto nm = detail::weekday_names();
+                            auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+                            if (!is.fail())
+                                trial_wd = i % 7;
+#endif
+                        }
+                        else
+                            read(is, CharT{'%'}, width, modified, *fmt);
+                    }
+                    else  // *fmt == 'u' || *fmt == 'w'
+                    {
+#if !ONLY_C_LOCALE
+                        if (modified == CharT{})
+#else
+                        if (modified != CharT{'E'})
+#endif
+                        {
+                            read(is, ru{trial_wd, 1, width == -1 ?
+                                                      1u : static_cast<unsigned>(width)});
+                            if (!is.fail())
+                            {
+                                if (*fmt == 'u')
+                                {
+                                    if (!(1 <= trial_wd && trial_wd <= 7))
+                                    {
+                                        trial_wd = not_a_weekday;
+                                        is.setstate(ios::failbit);
+                                    }
+                                    else if (trial_wd == 7)
+                                        trial_wd = 0;
+                                }
+                                else  // *fmt == 'w'
+                                {
+                                    if (!(0 <= trial_wd && trial_wd <= 6))
+                                    {
+                                        trial_wd = not_a_weekday;
+                                        is.setstate(ios::failbit);
+                                    }
+                                }
+                            }
+                        }
+#if !ONLY_C_LOCALE
+                        else if (modified == CharT{'O'})
+                        {
+                            ios::iostate err = ios::goodbit;
+                            f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                            is.setstate(err);
+                            if (!is.fail())
+                                trial_wd = tm.tm_wday;
+                        }
+#endif
+                        else
+                            read(is, CharT{'%'}, width, modified, *fmt);
+                    }
+                    if (trial_wd != not_a_weekday)
+                        checked_set(wd, trial_wd, not_a_weekday, is);
+                }
+                else  // !command
+                    read(is, *fmt);
+                command = nullptr;
+                width = -1;
+                modified = CharT{};
+                break;
+            case 'b':
+            case 'B':
+            case 'h':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int ttm = not_a_month;
+#if !ONLY_C_LOCALE
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            ttm = tm.tm_mon + 1;
+                        is.setstate(err);
+#else
+                        auto nm = detail::month_names();
+                        auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+                        if (!is.fail())
+                            ttm = i % 12 + 1;
+#endif
+                        checked_set(m, ttm, not_a_month, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'c':
+                if (command)
+                {
+                    if (modified != CharT{'O'})
+                    {
+#if !ONLY_C_LOCALE
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                        {
+                            checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+                            checked_set(m, tm.tm_mon + 1, not_a_month, is);
+                            checked_set(d, tm.tm_mday, not_a_day, is);
+                            checked_set(H, tm.tm_hour, not_a_hour, is);
+                            checked_set(M, tm.tm_min, not_a_minute, is);
+                            checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+                                        not_a_second, is);
+                        }
+                        is.setstate(err);
+#else
+                        // "%a %b %e %T %Y"
+                        auto nm = detail::weekday_names();
+                        auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+                        checked_set(wd, static_cast<int>(i % 7), not_a_weekday, is);
+                        ws(is);
+                        nm = detail::month_names();
+                        i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+                        checked_set(m, static_cast<int>(i % 12 + 1), not_a_month, is);
+                        ws(is);
+                        int td = not_a_day;
+                        read(is, rs{td, 1, 2});
+                        checked_set(d, td, not_a_day, is);
+                        ws(is);
+                        using dfs = detail::decimal_format_seconds<Duration>;
+                        CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+                        int tH;
+                        int tM;
+                        long double S{};
+                        read(is, ru{tH, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+                                               CharT{':'}, rld{S, 1, w});
+                        checked_set(H, tH, not_a_hour, is);
+                        checked_set(M, tM, not_a_minute, is);
+                        checked_set(s, round_i<Duration>(duration<long double>{S}),
+                                    not_a_second, is);
+                        ws(is);
+                        int tY = not_a_year;
+                        read(is, rs{tY, 1, 4u});
+                        checked_set(Y, tY, not_a_year, is);
+#endif
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'x':
+                if (command)
+                {
+                    if (modified != CharT{'O'})
+                    {
+#if !ONLY_C_LOCALE
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                        {
+                            checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+                            checked_set(m, tm.tm_mon + 1, not_a_month, is);
+                            checked_set(d, tm.tm_mday, not_a_day, is);
+                        }
+                        is.setstate(err);
+#else
+                        // "%m/%d/%y"
+                        int ty = not_a_2digit_year;
+                        int tm = not_a_month;
+                        int td = not_a_day;
+                        read(is, ru{tm, 1, 2}, CharT{'/'}, ru{td, 1, 2}, CharT{'/'},
+                                 rs{ty, 1, 2});
+                        checked_set(y, ty, not_a_2digit_year, is);
+                        checked_set(m, tm, not_a_month, is);
+                        checked_set(d, td, not_a_day, is);
+#endif
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'X':
+                if (command)
+                {
+                    if (modified != CharT{'O'})
+                    {
+#if !ONLY_C_LOCALE
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                        {
+                            checked_set(H, tm.tm_hour, not_a_hour, is);
+                            checked_set(M, tm.tm_min, not_a_minute, is);
+                            checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+                                        not_a_second, is);
+                        }
+                        is.setstate(err);
+#else
+                        // "%T"
+                        using dfs = detail::decimal_format_seconds<Duration>;
+                        CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+                        int tH = not_a_hour;
+                        int tM = not_a_minute;
+                        long double S{};
+                        read(is, ru{tH, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+                                               CharT{':'}, rld{S, 1, w});
+                        checked_set(H, tH, not_a_hour, is);
+                        checked_set(M, tM, not_a_minute, is);
+                        checked_set(s, round_i<Duration>(duration<long double>{S}),
+                                    not_a_second, is);
+#endif
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'C':
+                if (command)
+                {
+                    int tC = not_a_century;
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+                    {
+#endif
+                        read(is, rs{tC, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+#if !ONLY_C_LOCALE
+                    }
+                    else
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                        {
+                            auto tY = tm.tm_year + 1900;
+                            tC = (tY >= 0 ? tY : tY-99) / 100;
+                        }
+                        is.setstate(err);
+                    }
+#endif
+                    checked_set(C, tC, not_a_century, is);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'D':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tn = not_a_month;
+                        int td = not_a_day;
+                        int ty = not_a_2digit_year;
+                        read(is, ru{tn, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'},
+                                 ru{td, 1, 2}, CharT{'\0'}, CharT{'/'}, CharT{'\0'},
+                                 rs{ty, 1, 2});
+                        checked_set(y, ty, not_a_2digit_year, is);
+                        checked_set(m, tn, not_a_month, is);
+                        checked_set(d, td, not_a_day, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'F':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tY = not_a_year;
+                        int tn = not_a_month;
+                        int td = not_a_day;
+                        read(is, rs{tY, 1, width == -1 ? 4u : static_cast<unsigned>(width)},
+                                 CharT{'-'}, ru{tn, 1, 2}, CharT{'-'}, ru{td, 1, 2});
+                        checked_set(Y, tY, not_a_year, is);
+                        checked_set(m, tn, not_a_month, is);
+                        checked_set(d, td, not_a_day, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'd':
+            case 'e':
+                if (command)
+                {
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#else
+                    if (modified != CharT{'E'})
+#endif
+                    {
+                        int td = not_a_day;
+                        read(is, rs{td, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(d, td, not_a_day, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        command = nullptr;
+                        width = -1;
+                        modified = CharT{};
+                        if ((err & ios::failbit) == 0)
+                            checked_set(d, tm.tm_mday, not_a_day, is);
+                        is.setstate(err);
+                    }
+#endif
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'H':
+                if (command)
+                {
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#else
+                    if (modified != CharT{'E'})
+#endif
+                    {
+                        int tH = not_a_hour;
+                        read(is, ru{tH, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(H, tH, not_a_hour, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            checked_set(H, tm.tm_hour, not_a_hour, is);
+                        is.setstate(err);
+                    }
+#endif
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'I':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tI = not_a_hour_12_value;
+                        // reads in an hour into I, but most be in [1, 12]
+                        read(is, rs{tI, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        if (!(1 <= tI && tI <= 12))
+                            is.setstate(ios::failbit);
+                        checked_set(I, tI, not_a_hour_12_value, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+               break;
+            case 'j':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tj = not_a_doy;
+                        read(is, ru{tj, 1, width == -1 ? 3u : static_cast<unsigned>(width)});
+                        checked_set(j, tj, not_a_doy, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'M':
+                if (command)
+                {
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#else
+                    if (modified != CharT{'E'})
+#endif
+                    {
+                        int tM = not_a_minute;
+                        read(is, ru{tM, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(M, tM, not_a_minute, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            checked_set(M, tm.tm_min, not_a_minute, is);
+                        is.setstate(err);
+                    }
+#endif
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'm':
+                if (command)
+                {
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#else
+                    if (modified != CharT{'E'})
+#endif
+                    {
+                        int tn = not_a_month;
+                        read(is, rs{tn, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(m, tn, not_a_month, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            checked_set(m, tm.tm_mon + 1, not_a_month, is);
+                        is.setstate(err);
+                    }
+#endif
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'n':
+            case 't':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        // %n matches a single white space character
+                        // %t matches 0 or 1 white space characters
+                        auto ic = is.peek();
+                        if (Traits::eq_int_type(ic, Traits::eof()))
+                        {
+                            ios::iostate err = ios::eofbit;
+                            if (*fmt == 'n')
+                                err |= ios::failbit;
+                            is.setstate(err);
+                            break;
+                        }
+                        if (isspace(ic))
+                        {
+                            (void)is.get();
+                        }
+                        else if (*fmt == 'n')
+                            is.setstate(ios::failbit);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'p':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tp = not_a_ampm;
+#if !ONLY_C_LOCALE
+                        tm = std::tm{};
+                        tm.tm_hour = 1;
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        is.setstate(err);
+                        if (tm.tm_hour == 1)
+                            tp = 0;
+                        else if (tm.tm_hour == 13)
+                            tp = 1;
+                        else
+                            is.setstate(err);
+#else
+                        auto nm = detail::ampm_names();
+                        auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+                        tp = static_cast<decltype(tp)>(i);
+#endif
+                        checked_set(p, tp, not_a_ampm, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+
+               break;
+            case 'r':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+#if !ONLY_C_LOCALE
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                        {
+                            checked_set(H, tm.tm_hour, not_a_hour, is);
+                            checked_set(M, tm.tm_min, not_a_hour, is);
+                            checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+                                        not_a_second, is);
+                        }
+                        is.setstate(err);
+#else
+                        // "%I:%M:%S %p"
+                        using dfs = detail::decimal_format_seconds<Duration>;
+                        CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+                        long double S{};
+                        int tI = not_a_hour_12_value;
+                        int tM = not_a_minute;
+                        read(is, ru{tI, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+                                               CharT{':'}, rld{S, 1, w});
+                        checked_set(I, tI, not_a_hour_12_value, is);
+                        checked_set(M, tM, not_a_minute, is);
+                        checked_set(s, round_i<Duration>(duration<long double>{S}),
+                                    not_a_second, is);
+                        ws(is);
+                        auto nm = detail::ampm_names();
+                        auto i = detail::scan_keyword(is, nm.first, nm.second) - nm.first;
+                        checked_set(p, static_cast<int>(i), not_a_ampm, is);
+#endif
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'R':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tH = not_a_hour;
+                        int tM = not_a_minute;
+                        read(is, ru{tH, 1, 2}, CharT{'\0'}, CharT{':'}, CharT{'\0'},
+                                 ru{tM, 1, 2}, CharT{'\0'});
+                        checked_set(H, tH, not_a_hour, is);
+                        checked_set(M, tM, not_a_minute, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'S':
+                if (command)
+                {
+ #if !ONLY_C_LOCALE
+                   if (modified == CharT{})
+#else
+                   if (modified != CharT{'E'})
+#endif
+                    {
+                        using dfs = detail::decimal_format_seconds<Duration>;
+                        CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+                        long double S{};
+                        read(is, rld{S, 1, width == -1 ? w : static_cast<unsigned>(width)});
+                        checked_set(s, round_i<Duration>(duration<long double>{S}),
+                                    not_a_second, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'O'})
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            checked_set(s, duration_cast<Duration>(seconds{tm.tm_sec}),
+                                        not_a_second, is);
+                        is.setstate(err);
+                    }
+#endif
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'T':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        using dfs = detail::decimal_format_seconds<Duration>;
+                        CONSTDATA auto w = Duration::period::den == 1 ? 2 : 3 + dfs::width;
+                        int tH = not_a_hour;
+                        int tM = not_a_minute;
+                        long double S{};
+                        read(is, ru{tH, 1, 2}, CharT{':'}, ru{tM, 1, 2},
+                                               CharT{':'}, rld{S, 1, w});
+                        checked_set(H, tH, not_a_hour, is);
+                        checked_set(M, tM, not_a_minute, is);
+                        checked_set(s, round_i<Duration>(duration<long double>{S}),
+                                    not_a_second, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'Y':
+                if (command)
+                {
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#else
+                    if (modified != CharT{'O'})
+#endif
+                    {
+                        int tY = not_a_year;
+                        read(is, rs{tY, 1, width == -1 ? 4u : static_cast<unsigned>(width)});
+                        checked_set(Y, tY, not_a_year, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else if (modified == CharT{'E'})
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+                        is.setstate(err);
+                    }
+#endif
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'y':
+                if (command)
+                {
+#if !ONLY_C_LOCALE
+                    if (modified == CharT{})
+#endif
+                    {
+                        int ty = not_a_2digit_year;
+                        read(is, ru{ty, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(y, ty, not_a_2digit_year, is);
+                    }
+#if !ONLY_C_LOCALE
+                    else
+                    {
+                        ios::iostate err = ios::goodbit;
+                        f.get(is, nullptr, is, err, &tm, command, fmt+1);
+                        if ((err & ios::failbit) == 0)
+                            checked_set(Y, tm.tm_year + 1900, not_a_year, is);
+                        is.setstate(err);
+                    }
+#endif
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'g':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tg = not_a_2digit_year;
+                        read(is, ru{tg, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(g, tg, not_a_2digit_year, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'G':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tG = not_a_year;
+                        read(is, rs{tG, 1, width == -1 ? 4u : static_cast<unsigned>(width)});
+                        checked_set(G, tG, not_a_year, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'U':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tU = not_a_week_num;
+                        read(is, ru{tU, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(U, tU, not_a_week_num, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'V':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tV = not_a_week_num;
+                        read(is, ru{tV, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(V, tV, not_a_week_num, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'W':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        int tW = not_a_week_num;
+                        read(is, ru{tW, 1, width == -1 ? 2u : static_cast<unsigned>(width)});
+                        checked_set(W, tW, not_a_week_num, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'E':
+            case 'O':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        modified = *fmt;
+                    }
+                    else
+                    {
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                        command = nullptr;
+                        width = -1;
+                        modified = CharT{};
+                    }
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case '%':
+                if (command)
+                {
+                    if (modified == CharT{})
+                        read(is, *fmt);
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    command = fmt;
+                break;
+            case 'z':
+                if (command)
+                {
+                    int tH, tM;
+                    minutes toff = not_a_offset;
+                    bool neg = false;
+                    auto ic = is.peek();
+                    if (!Traits::eq_int_type(ic, Traits::eof()))
+                    {
+                        auto c = static_cast<char>(Traits::to_char_type(ic));
+                        if (c == '-')
+                        {
+                            neg = true;
+                            (void)is.get();
+                        }
+                        else if (c == '+')
+                            (void)is.get();
+                    }
+                    if (modified == CharT{})
+                    {
+                        read(is, rs{tH, 2, 2});
+                        if (!is.fail())
+                            toff = hours{std::abs(tH)};
+                        if (is.good())
+                        {
+                            ic = is.peek();
+                            if (!Traits::eq_int_type(ic, Traits::eof()))
+                            {
+                                auto c = static_cast<char>(Traits::to_char_type(ic));
+                                if ('0' <= c && c <= '9')
+                                {
+                                    read(is, ru{tM, 2, 2});
+                                    if (!is.fail())
+                                        toff += minutes{tM};
+                                }
+                            }
+                        }
+                    }
+                    else
+                    {
+                        read(is, rs{tH, 1, 2});
+                        if (!is.fail())
+                            toff = hours{std::abs(tH)};
+                        if (is.good())
+                        {
+                            ic = is.peek();
+                            if (!Traits::eq_int_type(ic, Traits::eof()))
+                            {
+                                auto c = static_cast<char>(Traits::to_char_type(ic));
+                                if (c == ':')
+                                {
+                                    (void)is.get();
+                                    read(is, ru{tM, 2, 2});
+                                    if (!is.fail())
+                                        toff += minutes{tM};
+                                }
+                            }
+                        }
+                    }
+                    if (neg)
+                        toff = -toff;
+                    checked_set(temp_offset, toff, not_a_offset, is);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            case 'Z':
+                if (command)
+                {
+                    if (modified == CharT{})
+                    {
+                        std::basic_string<CharT, Traits, Alloc> buf;
+                        while (is.rdstate() == std::ios::goodbit)
+                        {
+                            auto i = is.rdbuf()->sgetc();
+                            if (Traits::eq_int_type(i, Traits::eof()))
+                            {
+                                is.setstate(ios::eofbit);
+                                break;
+                            }
+                            auto wc = Traits::to_char_type(i);
+                            auto c = static_cast<char>(wc);
+                            // is c a valid time zone name or abbreviation character?
+                            if (!(CharT{1} < wc && wc < CharT{127}) || !(isalnum(c) ||
+                                    c == '_' || c == '/' || c == '-' || c == '+'))
+                                break;
+                            buf.push_back(c);
+                            is.rdbuf()->sbumpc();
+                        }
+                        if (buf.empty())
+                            is.setstate(ios::failbit);
+                        checked_set(temp_abbrev, buf, {}, is);
+                    }
+                    else
+                        read(is, CharT{'%'}, width, modified, *fmt);
+                    command = nullptr;
+                    width = -1;
+                    modified = CharT{};
+                }
+                else
+                    read(is, *fmt);
+                break;
+            default:
+                if (command)
+                {
+                    if (width == -1 && modified == CharT{} && '0' <= *fmt && *fmt <= '9')
+                    {
+                        width = static_cast<char>(*fmt) - '0';
+                        while ('0' <= fmt[1] && fmt[1] <= '9')
+                            width = 10*width + static_cast<char>(*++fmt) - '0';
+                    }
+                    else
+                    {
+                        if (modified == CharT{})
+                            read(is, CharT{'%'}, width, *fmt);
+                        else
+                            read(is, CharT{'%'}, width, modified, *fmt);
+                        command = nullptr;
+                        width = -1;
+                        modified = CharT{};
+                    }
+                }
+                else  // !command
+                {
+                    if (isspace(static_cast<unsigned char>(*fmt)))
+                    {
+                        // space matches 0 or more white space characters
+                        if (is.good())
+                           ws(is);
+                    }
+                    else
+                        read(is, *fmt);
+                }
+                break;
+            }
+        }
+        // is.fail() || *fmt == CharT{}
+        if (is.rdstate() == ios::goodbit && command)
+        {
+            if (modified == CharT{})
+                read(is, CharT{'%'}, width);
+            else
+                read(is, CharT{'%'}, width, modified);
+        }
+        if (!is.fail())
+        {
+            if (y != not_a_2digit_year)
+            {
+                // Convert y and an optional C to Y
+                if (!(0 <= y && y <= 99))
+                    goto broken;
+                if (C == not_a_century)
+                {
+                    if (Y == not_a_year)
+                    {
+                        if (y >= 69)
+                            C = 19;
+                        else
+                            C = 20;
+                    }
+                    else
+                    {
+                        C = (Y >= 0 ? Y : Y-100) / 100;
+                    }
+                }
+                int tY;
+                if (C >= 0)
+                    tY = 100*C + y;
+                else
+                    tY = 100*(C+1) - (y == 0 ? 100 : y);
+                if (Y != not_a_year && Y != tY)
+                    goto broken;
+                Y = tY;
+            }
+            if (g != not_a_2digit_year)
+            {
+                // Convert g and an optional C to G
+                if (!(0 <= g && g <= 99))
+                    goto broken;
+                if (C == not_a_century)
+                {
+                    if (G == not_a_year)
+                    {
+                        if (g >= 69)
+                            C = 19;
+                        else
+                            C = 20;
+                    }
+                    else
+                    {
+                        C = (G >= 0 ? G : G-100) / 100;
+                    }
+                }
+                int tG;
+                if (C >= 0)
+                    tG = 100*C + g;
+                else
+                    tG = 100*(C+1) - (g == 0 ? 100 : g);
+                if (G != not_a_year && G != tG)
+                    goto broken;
+                G = tG;
+            }
+            if (Y < static_cast<int>(year::min()) || Y > static_cast<int>(year::max()))
+                Y = not_a_year;
+            bool computed = false;
+            if (G != not_a_year && V != not_a_week_num && wd != not_a_weekday)
+            {
+                year_month_day ymd_trial = sys_days(year{G-1}/December/Thursday[last]) +
+                                           (Monday-Thursday) + weeks{V-1} +
+                                           (weekday{static_cast<unsigned>(wd)}-Monday);
+                if (Y == not_a_year)
+                    Y = static_cast<int>(ymd_trial.year());
+                else if (year{Y} != ymd_trial.year())
+                    goto broken;
+                if (m == not_a_month)
+                    m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+                else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+                    goto broken;
+                if (d == not_a_day)
+                    d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+                else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+                    goto broken;
+                computed = true;
+            }
+            if (Y != not_a_year && U != not_a_week_num && wd != not_a_weekday)
+            {
+                year_month_day ymd_trial = sys_days(year{Y}/January/Sunday[1]) +
+                                           weeks{U-1} +
+                                           (weekday{static_cast<unsigned>(wd)} - Sunday);
+                if (year{Y} != ymd_trial.year())
+                    goto broken;
+                if (m == not_a_month)
+                    m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+                else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+                    goto broken;
+                if (d == not_a_day)
+                    d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+                else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+                    goto broken;
+                computed = true;
+            }
+            if (Y != not_a_year && W != not_a_week_num && wd != not_a_weekday)
+            {
+                year_month_day ymd_trial = sys_days(year{Y}/January/Monday[1]) +
+                                           weeks{W-1} +
+                                           (weekday{static_cast<unsigned>(wd)} - Monday);
+                if (year{Y} != ymd_trial.year())
+                    goto broken;
+                if (m == not_a_month)
+                    m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+                else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+                    goto broken;
+                if (d == not_a_day)
+                    d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+                else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+                    goto broken;
+                computed = true;
+            }
+            if (j != not_a_doy && Y != not_a_year)
+            {
+                auto ymd_trial = year_month_day{local_days(year{Y}/1/1) + days{j-1}};
+                if (m == not_a_month)
+                    m = static_cast<int>(static_cast<unsigned>(ymd_trial.month()));
+                else if (month(static_cast<unsigned>(m)) != ymd_trial.month())
+                    goto broken;
+                if (d == not_a_day)
+                    d = static_cast<int>(static_cast<unsigned>(ymd_trial.day()));
+                else if (day(static_cast<unsigned>(d)) != ymd_trial.day())
+                    goto broken;
+                j = not_a_doy;
+            }
+            auto ymd = year{Y}/m/d;
+            if (ymd.ok())
+            {
+                if (wd == not_a_weekday)
+                    wd = static_cast<int>((weekday(sys_days(ymd)) - Sunday).count());
+                else if (wd != static_cast<int>((weekday(sys_days(ymd)) - Sunday).count()))
+                    goto broken;
+                if (!computed)
+                {
+                    if (G != not_a_year || V != not_a_week_num)
+                    {
+                        sys_days sd = ymd;
+                        auto G_trial = year_month_day{sd + days{3}}.year();
+                        auto start = sys_days((G_trial - years{1})/December/Thursday[last]) +
+                                     (Monday - Thursday);
+                        if (sd < start)
+                        {
+                            --G_trial;
+                            if (V != not_a_week_num)
+                                start = sys_days((G_trial - years{1})/December/Thursday[last])
+                                        + (Monday - Thursday);
+                        }
+                        if (G != not_a_year && G != static_cast<int>(G_trial))
+                            goto broken;
+                        if (V != not_a_week_num)
+                        {
+                            auto V_trial = duration_cast<weeks>(sd - start).count() + 1;
+                            if (V != V_trial)
+                                goto broken;
+                        }
+                    }
+                    if (U != not_a_week_num)
+                    {
+                        auto start = sys_days(Sunday[1]/January/ymd.year());
+                        auto U_trial = floor<weeks>(sys_days(ymd) - start).count() + 1;
+                        if (U != U_trial)
+                            goto broken;
+                    }
+                    if (W != not_a_week_num)
+                    {
+                        auto start = sys_days(Monday[1]/January/ymd.year());
+                        auto W_trial = floor<weeks>(sys_days(ymd) - start).count() + 1;
+                        if (W != W_trial)
+                            goto broken;
+                    }
+                }
+            }
+            fds.ymd = ymd;
+            if (I != not_a_hour_12_value)
+            {
+                if (!(1 <= I && I <= 12))
+                    goto broken;
+                if (p != not_a_ampm)
+                {
+                    // p is in [0, 1] == [AM, PM]
+                    // Store trial H in I
+                    if (I == 12)
+                        --p;
+                    I += p*12;
+                    // Either set H from I or make sure H and I are consistent
+                    if (H == not_a_hour)
+                        H = I;
+                    else if (I != H)
+                        goto broken;
+                }
+                else  // p == not_a_ampm
+                {
+                    // if H, make sure H and I could be consistent
+                    if (H != not_a_hour)
+                    {
+                        if (I == 12)
+                        {
+                            if (H != 0 && H != 12)
+                                goto broken;
+                        }
+                        else if (!(I == H || I == H+12))
+                        {
+                            goto broken;
+                        }
+                    }
+                    else  // I is ambiguous, AM or PM?
+                        goto broken;
+                }
+            }
+            if (H != not_a_hour)
+            {
+                fds.has_tod = true;
+                fds.tod = hh_mm_ss<Duration>{hours{H}};
+            }
+            if (M != not_a_minute)
+            {
+                fds.has_tod = true;
+                fds.tod.m_ = minutes{M};
+            }
+            if (s != not_a_second)
+            {
+                fds.has_tod = true;
+                fds.tod.s_ = detail::decimal_format_seconds<Duration>{s};
+            }
+            if (j != not_a_doy)
+            {
+                fds.has_tod = true;
+                fds.tod.h_ += hours{days{j}};
+            }
+            if (wd != not_a_weekday)
+                fds.wd = weekday{static_cast<unsigned>(wd)};
+            if (abbrev != nullptr)
+                *abbrev = std::move(temp_abbrev);
+            if (offset != nullptr && temp_offset != not_a_offset)
+              *offset = temp_offset;
+        }
+       return is;
+    }
+broken:
+    is.setstate(ios::failbit);
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, year& y,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.year().ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        y = fds.ymd.year();
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, month& m,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.month().ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        m = fds.ymd.month();
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, day& d,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.day().ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        d = fds.ymd.day();
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, weekday& wd,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.wd.ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        wd = fds.wd;
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, year_month& ym,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.month().ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        ym = fds.ymd.year()/fds.ymd.month();
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt, month_day& md,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.month().ok() || !fds.ymd.day().ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        md = fds.ymd.month()/fds.ymd.day();
+    return is;
+}
+
+template <class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            year_month_day& ymd, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = std::chrono::seconds;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        ymd = fds.ymd;
+    return is;
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            sys_time<Duration>& tp, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+    using detail::round_i;
+    std::chrono::minutes offset_local{};
+    auto offptr = offset ? offset : &offset_local;
+    fields<CT> fds{};
+    fds.has_tod = true;
+    date::from_stream(is, fmt, fds, abbrev, offptr);
+    if (!fds.ymd.ok() || !fds.tod.in_conventional_range())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        tp = round_i<Duration>(sys_days(fds.ymd) - *offptr + fds.tod.to_duration());
+    return is;
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            local_time<Duration>& tp, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+    using detail::round_i;
+    fields<CT> fds{};
+    fds.has_tod = true;
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.ymd.ok() || !fds.tod.in_conventional_range())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        tp = round_i<Duration>(local_seconds{local_days(fds.ymd)} + fds.tod.to_duration());
+    return is;
+}
+
+template <class Rep, class Period, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            std::chrono::duration<Rep, Period>& d,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using Duration = std::chrono::duration<Rep, Period>;
+    using CT = typename std::common_type<Duration, std::chrono::seconds>::type;
+    using detail::round_i;
+    fields<CT> fds{};
+    date::from_stream(is, fmt, fds, abbrev, offset);
+    if (!fds.has_tod)
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+        d = round_i<Duration>(fds.tod.to_duration());
+    return is;
+}
+
+template <class Parsable, class CharT, class Traits = std::char_traits<CharT>,
+          class Alloc = std::allocator<CharT>>
+struct parse_manip
+{
+    const std::basic_string<CharT, Traits, Alloc> format_;
+    Parsable&                                     tp_;
+    std::basic_string<CharT, Traits, Alloc>*      abbrev_;
+    std::chrono::minutes*                         offset_;
+
+public:
+    parse_manip(std::basic_string<CharT, Traits, Alloc> format, Parsable& tp,
+                std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+                std::chrono::minutes* offset = nullptr)
+        : format_(std::move(format))
+        , tp_(tp)
+        , abbrev_(abbrev)
+        , offset_(offset)
+        {}
+
+#if HAS_STRING_VIEW
+    parse_manip(const CharT* format, Parsable& tp,
+                std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+                std::chrono::minutes* offset = nullptr)
+        : format_(format)
+        , tp_(tp)
+        , abbrev_(abbrev)
+        , offset_(offset)
+        {}
+
+    parse_manip(std::basic_string_view<CharT, Traits> format, Parsable& tp,
+                std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+                std::chrono::minutes* offset = nullptr)
+        : format_(format)
+        , tp_(tp)
+        , abbrev_(abbrev)
+        , offset_(offset)
+        {}
+#endif  // HAS_STRING_VIEW
+};
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+std::basic_istream<CharT, Traits>&
+operator>>(std::basic_istream<CharT, Traits>& is,
+           const parse_manip<Parsable, CharT, Traits, Alloc>& x)
+{
+    return date::from_stream(is, x.format_.c_str(), x.tp_, x.abbrev_, x.offset_);
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+                            format.c_str(), tp),
+                parse_manip<Parsable, CharT, Traits, Alloc>{format, tp})
+{
+    return {format, tp};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp,
+      std::basic_string<CharT, Traits, Alloc>& abbrev)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+                            format.c_str(), tp, &abbrev),
+                parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev})
+{
+    return {format, tp, &abbrev};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp,
+      std::chrono::minutes& offset)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+                            format.c_str(), tp,
+                            std::declval<std::basic_string<CharT, Traits, Alloc>*>(),
+                            &offset),
+                parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, nullptr, &offset})
+{
+    return {format, tp, nullptr, &offset};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const std::basic_string<CharT, Traits, Alloc>& format, Parsable& tp,
+      std::basic_string<CharT, Traits, Alloc>& abbrev, std::chrono::minutes& offset)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT, Traits>&>(),
+                            format.c_str(), tp, &abbrev, &offset),
+                parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev, &offset})
+{
+    return {format, tp, &abbrev, &offset};
+}
+
+// const CharT* formats
+
+template <class Parsable, class CharT>
+inline
+auto
+parse(const CharT* format, Parsable& tp)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT>&>(), format, tp),
+                parse_manip<Parsable, CharT>{format, tp})
+{
+    return {format, tp};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const CharT* format, Parsable& tp, std::basic_string<CharT, Traits, Alloc>& abbrev)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT, Traits>&>(), format,
+                            tp, &abbrev),
+                parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev})
+{
+    return {format, tp, &abbrev};
+}
+
+template <class Parsable, class CharT>
+inline
+auto
+parse(const CharT* format, Parsable& tp, std::chrono::minutes& offset)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT>&>(), format,
+                            tp, std::declval<std::basic_string<CharT>*>(), &offset),
+                parse_manip<Parsable, CharT>{format, tp, nullptr, &offset})
+{
+    return {format, tp, nullptr, &offset};
+}
+
+template <class Parsable, class CharT, class Traits, class Alloc>
+inline
+auto
+parse(const CharT* format, Parsable& tp,
+      std::basic_string<CharT, Traits, Alloc>& abbrev, std::chrono::minutes& offset)
+    -> decltype(date::from_stream(std::declval<std::basic_istream<CharT, Traits>&>(), format,
+                            tp, &abbrev, &offset),
+                parse_manip<Parsable, CharT, Traits, Alloc>{format, tp, &abbrev, &offset})
+{
+    return {format, tp, &abbrev, &offset};
+}
+
+// duration streaming
+
+template <class CharT, class Traits, class Rep, class Period>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os,
+           const std::chrono::duration<Rep, Period>& d)
+{
+    return os << detail::make_string<CharT, Traits>::from(d.count()) +
+                 detail::get_units<CharT>(typename Period::type{});
+}
+
+}  // namespace arrow_vendored::date
+
+#ifdef _MSC_VER
+#   pragma warning(pop)
+#endif
+
+#ifdef __GNUC__
+# pragma GCC diagnostic pop
+#endif
+
+#endif  // DATE_H
diff --git a/pyarrow/include/arrow/vendored/datetime/ios.h b/pyarrow/include/arrow/vendored/datetime/ios.h
new file mode 100644
index 0000000000000000000000000000000000000000..d018e799a833ef19db0ef7ec7536ed950bf7dc60
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/datetime/ios.h
@@ -0,0 +1,50 @@
+//
+//  ios.h
+//  DateTimeLib
+//
+// The MIT License (MIT)
+//
+// Copyright (c) 2016 Alexander Kormanovsky
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#ifndef ios_hpp
+#define ios_hpp
+
+#if __APPLE__
+# include <TargetConditionals.h>
+# if TARGET_OS_IPHONE
+#   include <string>
+
+    namespace arrow_vendored::date
+    {
+    namespace iOSUtils
+    {
+
+    std::string get_tzdata_path();
+    std::string get_current_timezone();
+
+    }  // namespace iOSUtils
+    }  // namespace arrow_vendored::date
+
+# endif  // TARGET_OS_IPHONE
+#else   // !__APPLE__
+# define TARGET_OS_IPHONE 0
+#endif  // !__APPLE__
+#endif // ios_hpp
diff --git a/pyarrow/include/arrow/vendored/datetime/tz.h b/pyarrow/include/arrow/vendored/datetime/tz.h
new file mode 100644
index 0000000000000000000000000000000000000000..61ab3df106db0bc9b3034551dd68eab64d0d31b1
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/datetime/tz.h
@@ -0,0 +1,2808 @@
+#ifndef TZ_H
+#define TZ_H
+
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016, 2017 Howard Hinnant
+// Copyright (c) 2017 Jiangang Zhuang
+// Copyright (c) 2017 Aaron Bishop
+// Copyright (c) 2017 Tomasz Kamiński
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies.  When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+// Get more recent database at http://www.iana.org/time-zones
+
+// The notion of "current timezone" is something the operating system is expected to "just
+// know". How it knows this is system specific. It's often a value set by the user at OS
+// installation time and recorded by the OS somewhere. On Linux and Mac systems the current
+// timezone name is obtained by looking at the name or contents of a particular file on
+// disk. On Windows the current timezone name comes from the registry. In either method,
+// there is no guarantee that the "native" current timezone name obtained will match any
+// of the "Standard" names in this library's "database". On Linux, the names usually do
+// seem to match so mapping functions to map from native to "Standard" are typically not
+// required. On Windows, the names are never "Standard" so mapping is always required.
+// Technically any OS may use the mapping process but currently only Windows does use it.
+
+#ifndef USE_OS_TZDB
+#  define USE_OS_TZDB 0
+#endif
+
+#ifndef HAS_REMOTE_API
+#  if USE_OS_TZDB == 0
+#    if defined _WIN32 || defined __ANDROID__
+#      define HAS_REMOTE_API 0
+#    else
+#      define HAS_REMOTE_API 1
+#    endif
+#  else  // HAS_REMOTE_API makes no sense when using the OS timezone database
+#    define HAS_REMOTE_API 0
+#  endif
+#endif
+
+#ifdef __clang__
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wconstant-logical-operand"
+#endif
+
+static_assert(!(USE_OS_TZDB && HAS_REMOTE_API),
+              "USE_OS_TZDB and HAS_REMOTE_API can not be used together");
+
+#ifdef __clang__
+# pragma clang diagnostic pop
+#endif
+
+#ifndef AUTO_DOWNLOAD
+#  define AUTO_DOWNLOAD HAS_REMOTE_API
+#endif
+
+static_assert(HAS_REMOTE_API == 0 ? AUTO_DOWNLOAD == 0 : true,
+              "AUTO_DOWNLOAD can not be turned on without HAS_REMOTE_API");
+
+#ifndef USE_SHELL_API
+#  define USE_SHELL_API 1
+#endif
+
+#if USE_OS_TZDB
+#  ifdef _WIN32
+#    error "USE_OS_TZDB can not be used on Windows"
+#  endif
+#endif
+
+#ifndef HAS_DEDUCTION_GUIDES
+#  if __cplusplus >= 201703
+#    define HAS_DEDUCTION_GUIDES 1
+#  else
+#    define HAS_DEDUCTION_GUIDES 0
+#  endif
+#endif  // HAS_DEDUCTION_GUIDES
+
+#include "date.h"
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#include "tz_private.h"
+#endif
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <istream>
+#include <locale>
+#include <memory>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#ifdef _WIN32
+#  ifdef DATE_BUILD_DLL
+#    define DATE_API __declspec(dllexport)
+#  elif defined(DATE_USE_DLL)
+#    define DATE_API __declspec(dllimport)
+#  else
+#    define DATE_API
+#  endif
+#else
+#  ifdef DATE_BUILD_DLL
+#    define DATE_API __attribute__ ((visibility ("default")))
+#  else
+#    define DATE_API
+#  endif
+#endif
+
+namespace arrow_vendored::date
+{
+
+enum class choose {earliest, latest};
+
+#if defined(BUILD_TZ_LIB)
+# if defined(ANDROID) || defined(__ANDROID__)
+struct tzdb;
+static std::unique_ptr<tzdb> init_tzdb();
+# endif // defined(ANDROID) || defined(__ANDROID__)
+#endif // defined(BUILD_TZ_LIB)
+
+namespace detail
+{
+    struct undocumented;
+
+    template<typename T>
+    struct nodeduct
+    {
+       using type = T;
+    };
+
+    template<typename T>
+    using nodeduct_t = typename nodeduct<T>::type;
+}
+
+struct sys_info
+{
+    sys_seconds          begin;
+    sys_seconds          end;
+    std::chrono::seconds offset;
+    std::chrono::minutes save;
+    std::string          abbrev;
+};
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const sys_info& r)
+{
+    os << r.begin << '\n';
+    os << r.end << '\n';
+    os << make_time(r.offset) << "\n";
+    os << make_time(r.save) << "\n";
+    os << r.abbrev << '\n';
+    return os;
+}
+
+struct local_info
+{
+    enum {unique, nonexistent, ambiguous} result;
+    sys_info first;
+    sys_info second;
+};
+
+template<class CharT, class Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const local_info& r)
+{
+    if (r.result == local_info::nonexistent)
+        os << "nonexistent between\n";
+    else if (r.result == local_info::ambiguous)
+        os << "ambiguous between\n";
+    os << r.first;
+    if (r.result != local_info::unique)
+    {
+        os << "and\n";
+        os << r.second;
+    }
+    return os;
+}
+
+class nonexistent_local_time
+    : public std::runtime_error
+{
+public:
+    template <class Duration>
+        nonexistent_local_time(local_time<Duration> tp, const local_info& i);
+
+private:
+    template <class Duration>
+    static
+    std::string
+    make_msg(local_time<Duration> tp, const local_info& i);
+};
+
+template <class Duration>
+inline
+nonexistent_local_time::nonexistent_local_time(local_time<Duration> tp,
+                                               const local_info& i)
+    : std::runtime_error(make_msg(tp, i))
+{
+}
+
+template <class Duration>
+std::string
+nonexistent_local_time::make_msg(local_time<Duration> tp, const local_info& i)
+{
+    assert(i.result == local_info::nonexistent);
+    std::ostringstream os;
+    os << tp << " is in a gap between\n"
+       << local_seconds{i.first.end.time_since_epoch()} + i.first.offset << ' '
+       << i.first.abbrev << " and\n"
+       << local_seconds{i.second.begin.time_since_epoch()} + i.second.offset << ' '
+       << i.second.abbrev
+       << " which are both equivalent to\n";
+    date::operator<<(os, i.first.end) << " UTC";
+    return os.str();
+}
+
+class ambiguous_local_time
+    : public std::runtime_error
+{
+public:
+    template <class Duration>
+        ambiguous_local_time(local_time<Duration> tp, const local_info& i);
+
+private:
+    template <class Duration>
+    static
+    std::string
+    make_msg(local_time<Duration> tp, const local_info& i);
+};
+
+template <class Duration>
+inline
+ambiguous_local_time::ambiguous_local_time(local_time<Duration> tp, const local_info& i)
+    : std::runtime_error(make_msg(tp, i))
+{
+}
+
+template <class Duration>
+std::string
+ambiguous_local_time::make_msg(local_time<Duration> tp, const local_info& i)
+{
+    assert(i.result == local_info::ambiguous);
+    std::ostringstream os;
+    os << tp << " is ambiguous.  It could be\n"
+       << tp << ' ' << i.first.abbrev << " == "
+       << tp - i.first.offset << " UTC or\n"
+       << tp << ' ' << i.second.abbrev  << " == "
+       << tp - i.second.offset  << " UTC";
+    return os.str();
+}
+
+class time_zone;
+
+#if HAS_STRING_VIEW
+DATE_API const time_zone* locate_zone(std::string_view tz_name);
+#else
+DATE_API const time_zone* locate_zone(const std::string& tz_name);
+#endif
+
+DATE_API const time_zone* current_zone();
+
+template <class T>
+struct zoned_traits
+{
+};
+
+template <>
+struct zoned_traits<const time_zone*>
+{
+    static
+    const time_zone*
+    default_zone()
+    {
+        return date::locate_zone("Etc/UTC");
+    }
+
+#if HAS_STRING_VIEW
+
+    static
+    const time_zone*
+    locate_zone(std::string_view name)
+    {
+        return date::locate_zone(name);
+    }
+
+#else  // !HAS_STRING_VIEW
+
+    static
+    const time_zone*
+    locate_zone(const std::string& name)
+    {
+        return date::locate_zone(name);
+    }
+
+    static
+    const time_zone*
+    locate_zone(const char* name)
+    {
+        return date::locate_zone(name);
+    }
+
+#endif  // !HAS_STRING_VIEW
+};
+
+template <class Duration, class TimeZonePtr>
+class zoned_time;
+
+template <class Duration1, class Duration2, class TimeZonePtr>
+bool
+operator==(const zoned_time<Duration1, TimeZonePtr>& x,
+           const zoned_time<Duration2, TimeZonePtr>& y);
+
+template <class Duration, class TimeZonePtr = const time_zone*>
+class zoned_time
+{
+public:
+    using duration = typename std::common_type<Duration, std::chrono::seconds>::type;
+
+private:
+    TimeZonePtr        zone_;
+    sys_time<duration> tp_;
+
+public:
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = decltype(zoned_traits<T>::default_zone())>
+#endif
+        zoned_time();
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = decltype(zoned_traits<T>::default_zone())>
+#endif
+        zoned_time(const sys_time<Duration>& st);
+    explicit zoned_time(TimeZonePtr z);
+
+#if HAS_STRING_VIEW
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string_view()))
+                  >::value
+              >::type>
+        explicit zoned_time(std::string_view name);
+#else
+#  if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string()))
+                  >::value
+              >::type>
+#  endif
+        explicit zoned_time(const std::string& name);
+#endif
+
+    template <class Duration2,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value
+                      >::type>
+        zoned_time(const zoned_time<Duration2, TimeZonePtr>& zt) NOEXCEPT;
+
+    zoned_time(TimeZonePtr z, const sys_time<Duration>& st);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_convertible
+                  <
+                      decltype(std::declval<T&>()->to_sys(local_time<Duration>{})),
+                      sys_time<duration>
+                  >::value
+              >::type>
+#endif
+        zoned_time(TimeZonePtr z, const local_time<Duration>& tp);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_convertible
+                  <
+                      decltype(std::declval<T&>()->to_sys(local_time<Duration>{},
+                                                          choose::earliest)),
+                      sys_time<duration>
+                  >::value
+              >::type>
+#endif
+        zoned_time(TimeZonePtr z, const local_time<Duration>& tp, choose c);
+
+    template <class Duration2, class TimeZonePtr2,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value
+                      >::type>
+        zoned_time(TimeZonePtr z, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+    template <class Duration2, class TimeZonePtr2,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value
+                      >::type>
+        zoned_time(TimeZonePtr z, const zoned_time<Duration2, TimeZonePtr2>& zt, choose);
+
+#if HAS_STRING_VIEW
+
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string_view())),
+                      sys_time<Duration>
+                  >::value
+              >::type>
+        zoned_time(std::string_view name, detail::nodeduct_t<const sys_time<Duration>&> st);
+
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string_view())),
+                      local_time<Duration>
+                  >::value
+              >::type>
+        zoned_time(std::string_view name, detail::nodeduct_t<const local_time<Duration>&> tp);
+
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string_view())),
+                      local_time<Duration>,
+                      choose
+                  >::value
+              >::type>
+        zoned_time(std::string_view name, detail::nodeduct_t<const local_time<Duration>&> tp, choose c);
+
+    template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value &&
+                          std::is_constructible
+                          <
+                              zoned_time,
+                              decltype(zoned_traits<T>::locate_zone(std::string_view())),
+                              zoned_time
+                          >::value
+                      >::type>
+        zoned_time(std::string_view name, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+    template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value &&
+                          std::is_constructible
+                          <
+                              zoned_time,
+                              decltype(zoned_traits<T>::locate_zone(std::string_view())),
+                              zoned_time,
+                              choose
+                          >::value
+                      >::type>
+        zoned_time(std::string_view name, const zoned_time<Duration2, TimeZonePtr2>& zt, choose);
+
+#else  // !HAS_STRING_VIEW
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string())),
+                      sys_time<Duration>
+                  >::value
+              >::type>
+#endif
+        zoned_time(const std::string& name, const sys_time<Duration>& st);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string())),
+                      sys_time<Duration>
+                  >::value
+              >::type>
+#endif
+        zoned_time(const char* name, const sys_time<Duration>& st);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string())),
+                      local_time<Duration>
+                  >::value
+              >::type>
+#endif
+        zoned_time(const std::string& name, const local_time<Duration>& tp);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string())),
+                      local_time<Duration>
+                  >::value
+              >::type>
+#endif
+        zoned_time(const char* name, const local_time<Duration>& tp);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string())),
+                      local_time<Duration>,
+                      choose
+                  >::value
+              >::type>
+#endif
+        zoned_time(const std::string& name, const local_time<Duration>& tp, choose c);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class T = TimeZonePtr,
+              class = typename std::enable_if
+              <
+                  std::is_constructible
+                  <
+                      zoned_time,
+                      decltype(zoned_traits<T>::locate_zone(std::string())),
+                      local_time<Duration>,
+                      choose
+                  >::value
+              >::type>
+#endif
+        zoned_time(const char* name, const local_time<Duration>& tp, choose c);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value &&
+                          std::is_constructible
+                          <
+                              zoned_time,
+                              decltype(zoned_traits<T>::locate_zone(std::string())),
+                              zoned_time
+                          >::value
+                      >::type>
+#else
+    template <class Duration2, class TimeZonePtr2>
+#endif
+        zoned_time(const std::string& name, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value &&
+                          std::is_constructible
+                          <
+                              zoned_time,
+                              decltype(zoned_traits<T>::locate_zone(std::string())),
+                              zoned_time
+                          >::value
+                      >::type>
+#else
+    template <class Duration2, class TimeZonePtr2>
+#endif
+        zoned_time(const char* name, const zoned_time<Duration2, TimeZonePtr2>& zt);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value &&
+                          std::is_constructible
+                          <
+                              zoned_time,
+                              decltype(zoned_traits<T>::locate_zone(std::string())),
+                              zoned_time,
+                              choose
+                          >::value
+                      >::type>
+#else
+    template <class Duration2, class TimeZonePtr2>
+#endif
+        zoned_time(const std::string& name, const zoned_time<Duration2, TimeZonePtr2>& zt,
+                   choose);
+
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+    template <class Duration2, class TimeZonePtr2, class T = TimeZonePtr,
+              class = typename std::enable_if
+                      <
+                          std::is_convertible<sys_time<Duration2>,
+                                              sys_time<Duration>>::value &&
+                          std::is_constructible
+                          <
+                              zoned_time,
+                              decltype(zoned_traits<T>::locate_zone(std::string())),
+                              zoned_time,
+                              choose
+                          >::value
+                      >::type>
+#else
+    template <class Duration2, class TimeZonePtr2>
+#endif
+        zoned_time(const char* name, const zoned_time<Duration2, TimeZonePtr2>& zt,
+                   choose);
+
+#endif  // !HAS_STRING_VIEW
+
+    zoned_time& operator=(const sys_time<Duration>& st);
+    zoned_time& operator=(const local_time<Duration>& ut);
+
+    explicit operator sys_time<duration>() const;
+    explicit operator local_time<duration>() const;
+
+    TimeZonePtr          get_time_zone() const;
+    local_time<duration> get_local_time() const;
+    sys_time<duration>   get_sys_time() const;
+    sys_info             get_info() const;
+
+    template <class Duration1, class Duration2, class TimeZonePtr1>
+    friend
+    bool
+    operator==(const zoned_time<Duration1, TimeZonePtr1>& x,
+               const zoned_time<Duration2, TimeZonePtr1>& y);
+
+    template <class CharT, class Traits, class Duration1, class TimeZonePtr1>
+    friend
+    std::basic_ostream<CharT, Traits>&
+    operator<<(std::basic_ostream<CharT, Traits>& os,
+               const zoned_time<Duration1, TimeZonePtr1>& t);
+
+private:
+    template <class D, class T> friend class zoned_time;
+
+    template <class TimeZonePtr2>
+    static
+    TimeZonePtr2&&
+    check(TimeZonePtr2&& p);
+};
+
+using zoned_seconds = zoned_time<std::chrono::seconds>;
+
+#if HAS_DEDUCTION_GUIDES
+
+namespace detail
+{
+   template<typename TimeZonePtrOrName>
+   using time_zone_representation =
+       std::conditional_t
+       <
+           std::is_convertible<TimeZonePtrOrName, std::string_view>::value,
+           time_zone const*,
+           std::remove_cv_t<std::remove_reference_t<TimeZonePtrOrName>>
+       >;
+}
+
+zoned_time()
+    -> zoned_time<std::chrono::seconds>;
+
+template <class Duration>
+zoned_time(sys_time<Duration>)
+    -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>>;
+
+template <class TimeZonePtrOrName>
+zoned_time(TimeZonePtrOrName&&)
+    -> zoned_time<std::chrono::seconds, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+template <class TimeZonePtrOrName, class Duration>
+zoned_time(TimeZonePtrOrName&&, sys_time<Duration>)
+    -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+template <class TimeZonePtrOrName, class Duration>
+zoned_time(TimeZonePtrOrName&&, local_time<Duration>, choose = choose::earliest)
+    -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+template <class Duration, class TimeZonePtrOrName, class TimeZonePtr2>
+zoned_time(TimeZonePtrOrName&&, zoned_time<Duration, TimeZonePtr2>, choose = choose::earliest)
+    -> zoned_time<std::common_type_t<Duration, std::chrono::seconds>, detail::time_zone_representation<TimeZonePtrOrName>>;
+
+#endif  // HAS_DEDUCTION_GUIDES
+
+template <class Duration1, class Duration2, class TimeZonePtr>
+inline
+bool
+operator==(const zoned_time<Duration1, TimeZonePtr>& x,
+           const zoned_time<Duration2, TimeZonePtr>& y)
+{
+    return x.zone_ == y.zone_ && x.tp_ == y.tp_;
+}
+
+template <class Duration1, class Duration2, class TimeZonePtr>
+inline
+bool
+operator!=(const zoned_time<Duration1, TimeZonePtr>& x,
+           const zoned_time<Duration2, TimeZonePtr>& y)
+{
+    return !(x == y);
+}
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+namespace detail
+{
+#  if USE_OS_TZDB
+    struct transition;
+    struct expanded_ttinfo;
+#  else  // !USE_OS_TZDB
+    struct zonelet;
+    class Rule;
+#  endif  // !USE_OS_TZDB
+}
+
+#endif  // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+class time_zone
+{
+private:
+    std::string                          name_;
+#if USE_OS_TZDB
+    std::vector<detail::transition>      transitions_;
+    std::vector<detail::expanded_ttinfo> ttinfos_;
+#else  // !USE_OS_TZDB
+    std::vector<detail::zonelet>         zonelets_;
+#endif  // !USE_OS_TZDB
+    std::unique_ptr<std::once_flag>      adjusted_;
+
+public:
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+    time_zone(time_zone&&) = default;
+    time_zone& operator=(time_zone&&) = default;
+#else   // defined(_MSC_VER) && (_MSC_VER < 1900)
+    time_zone(time_zone&& src);
+    time_zone& operator=(time_zone&& src);
+#endif  // defined(_MSC_VER) && (_MSC_VER < 1900)
+
+    DATE_API explicit time_zone(const std::string& s, detail::undocumented);
+
+    const std::string& name() const NOEXCEPT;
+
+    template <class Duration> sys_info   get_info(sys_time<Duration> st) const;
+    template <class Duration> local_info get_info(local_time<Duration> tp) const;
+
+    template <class Duration>
+        sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+        to_sys(local_time<Duration> tp) const;
+
+    template <class Duration>
+        sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+        to_sys(local_time<Duration> tp, choose z) const;
+
+    template <class Duration>
+        local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+        to_local(sys_time<Duration> tp) const;
+
+    friend bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT;
+    friend bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT;
+    friend DATE_API std::ostream& operator<<(std::ostream& os, const time_zone& z);
+
+#if !USE_OS_TZDB
+    DATE_API void add(const std::string& s);
+#else
+# if defined(BUILD_TZ_LIB)
+#  if defined(ANDROID) || defined(__ANDROID__)
+    friend std::unique_ptr<tzdb> init_tzdb();
+#  endif // defined(ANDROID) || defined(__ANDROID__)
+# endif // defined(BUILD_TZ_LIB)
+#endif  // !USE_OS_TZDB
+
+private:
+    DATE_API sys_info   get_info_impl(sys_seconds tp) const;
+    DATE_API local_info get_info_impl(local_seconds tp) const;
+
+    template <class Duration>
+        sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+        to_sys_impl(local_time<Duration> tp, choose z, std::false_type) const;
+    template <class Duration>
+        sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+        to_sys_impl(local_time<Duration> tp, choose, std::true_type) const;
+
+#if USE_OS_TZDB
+    DATE_API void init() const;
+    DATE_API void init_impl();
+    DATE_API sys_info
+        load_sys_info(std::vector<detail::transition>::const_iterator i) const;
+
+    template <class TimeType>
+    DATE_API void
+    load_data(std::istream& inf, std::int32_t tzh_leapcnt, std::int32_t tzh_timecnt,
+                                 std::int32_t tzh_typecnt, std::int32_t tzh_charcnt);
+# if defined(ANDROID) || defined(__ANDROID__)
+    void parse_from_android_tzdata(std::ifstream& inf, const std::size_t off);
+# endif // defined(ANDROID) || defined(__ANDROID__)
+#else  // !USE_OS_TZDB
+    DATE_API sys_info   get_info_impl(sys_seconds tp, int tz_int) const;
+    DATE_API void adjust_infos(const std::vector<detail::Rule>& rules);
+    DATE_API void parse_info(std::istream& in);
+#endif  // !USE_OS_TZDB
+};
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+
+inline
+time_zone::time_zone(time_zone&& src)
+    : name_(std::move(src.name_))
+    , zonelets_(std::move(src.zonelets_))
+    , adjusted_(std::move(src.adjusted_))
+    {}
+
+inline
+time_zone&
+time_zone::operator=(time_zone&& src)
+{
+    name_ = std::move(src.name_);
+    zonelets_ = std::move(src.zonelets_);
+    adjusted_ = std::move(src.adjusted_);
+    return *this;
+}
+
+#endif  // defined(_MSC_VER) && (_MSC_VER < 1900)
+
+inline
+const std::string&
+time_zone::name() const NOEXCEPT
+{
+    return name_;
+}
+
+template <class Duration>
+inline
+sys_info
+time_zone::get_info(sys_time<Duration> st) const
+{
+    return get_info_impl(date::floor<std::chrono::seconds>(st));
+}
+
+template <class Duration>
+inline
+local_info
+time_zone::get_info(local_time<Duration> tp) const
+{
+    return get_info_impl(date::floor<std::chrono::seconds>(tp));
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys(local_time<Duration> tp) const
+{
+    return to_sys_impl(tp, choose{}, std::true_type{});
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys(local_time<Duration> tp, choose z) const
+{
+    return to_sys_impl(tp, z, std::false_type{});
+}
+
+template <class Duration>
+inline
+local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_local(sys_time<Duration> tp) const
+{
+    using LT = local_time<typename std::common_type<Duration, std::chrono::seconds>::type>;
+    auto i = get_info(tp);
+    return LT{(tp + i.offset).time_since_epoch()};
+}
+
+inline bool operator==(const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ == y.name_;}
+inline bool operator< (const time_zone& x, const time_zone& y) NOEXCEPT {return x.name_ < y.name_;}
+
+inline bool operator!=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x == y);}
+inline bool operator> (const time_zone& x, const time_zone& y) NOEXCEPT {return   y < x;}
+inline bool operator<=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(y < x);}
+inline bool operator>=(const time_zone& x, const time_zone& y) NOEXCEPT {return !(x < y);}
+
+template <class Duration>
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys_impl(local_time<Duration> tp, choose z, std::false_type) const
+{
+    auto i = get_info(tp);
+    if (i.result == local_info::nonexistent)
+    {
+        return i.first.end;
+    }
+    else if (i.result == local_info::ambiguous)
+    {
+        if (z == choose::latest)
+            return sys_time<Duration>{tp.time_since_epoch()} - i.second.offset;
+    }
+    return sys_time<Duration>{tp.time_since_epoch()} - i.first.offset;
+}
+
+template <class Duration>
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+time_zone::to_sys_impl(local_time<Duration> tp, choose, std::true_type) const
+{
+    auto i = get_info(tp);
+    if (i.result == local_info::nonexistent)
+        throw nonexistent_local_time(tp, i);
+    else if (i.result == local_info::ambiguous)
+        throw ambiguous_local_time(tp, i);
+    return sys_time<Duration>{tp.time_since_epoch()} - i.first.offset;
+}
+
+#if !USE_OS_TZDB
+
+class time_zone_link
+{
+private:
+    std::string name_;
+    std::string target_;
+public:
+    DATE_API explicit time_zone_link(const std::string& s);
+
+    const std::string& name() const {return name_;}
+    const std::string& target() const {return target_;}
+
+    friend bool operator==(const time_zone_link& x, const time_zone_link& y) {return x.name_ == y.name_;}
+    friend bool operator< (const time_zone_link& x, const time_zone_link& y) {return x.name_ < y.name_;}
+
+    friend DATE_API std::ostream& operator<<(std::ostream& os, const time_zone_link& x);
+};
+
+using link = time_zone_link;
+
+inline bool operator!=(const time_zone_link& x, const time_zone_link& y) {return !(x == y);}
+inline bool operator> (const time_zone_link& x, const time_zone_link& y) {return   y < x;}
+inline bool operator<=(const time_zone_link& x, const time_zone_link& y) {return !(y < x);}
+inline bool operator>=(const time_zone_link& x, const time_zone_link& y) {return !(x < y);}
+
+#endif  // !USE_OS_TZDB
+
+class leap_second
+{
+private:
+    sys_seconds date_;
+
+public:
+#if USE_OS_TZDB
+    DATE_API explicit leap_second(const sys_seconds& s, detail::undocumented);
+#else
+    DATE_API explicit leap_second(const std::string& s, detail::undocumented);
+#endif
+
+    sys_seconds date() const {return date_;}
+
+    friend bool operator==(const leap_second& x, const leap_second& y) {return x.date_ == y.date_;}
+    friend bool operator< (const leap_second& x, const leap_second& y) {return x.date_ < y.date_;}
+
+    template <class Duration>
+    friend
+    bool
+    operator==(const leap_second& x, const sys_time<Duration>& y)
+    {
+        return x.date_ == y;
+    }
+
+    template <class Duration>
+    friend
+    bool
+    operator< (const leap_second& x, const sys_time<Duration>& y)
+    {
+        return x.date_ < y;
+    }
+
+    template <class Duration>
+    friend
+    bool
+    operator< (const sys_time<Duration>& x, const leap_second& y)
+    {
+        return x < y.date_;
+    }
+
+    friend DATE_API std::ostream& operator<<(std::ostream& os, const leap_second& x);
+};
+
+inline bool operator!=(const leap_second& x, const leap_second& y) {return !(x == y);}
+inline bool operator> (const leap_second& x, const leap_second& y) {return   y < x;}
+inline bool operator<=(const leap_second& x, const leap_second& y) {return !(y < x);}
+inline bool operator>=(const leap_second& x, const leap_second& y) {return !(x < y);}
+
+template <class Duration>
+inline
+bool
+operator==(const sys_time<Duration>& x, const leap_second& y)
+{
+    return y == x;
+}
+
+template <class Duration>
+inline
+bool
+operator!=(const leap_second& x, const sys_time<Duration>& y)
+{
+    return !(x == y);
+}
+
+template <class Duration>
+inline
+bool
+operator!=(const sys_time<Duration>& x, const leap_second& y)
+{
+    return !(x == y);
+}
+
+template <class Duration>
+inline
+bool
+operator> (const leap_second& x, const sys_time<Duration>& y)
+{
+    return y < x;
+}
+
+template <class Duration>
+inline
+bool
+operator> (const sys_time<Duration>& x, const leap_second& y)
+{
+    return y < x;
+}
+
+template <class Duration>
+inline
+bool
+operator<=(const leap_second& x, const sys_time<Duration>& y)
+{
+    return !(y < x);
+}
+
+template <class Duration>
+inline
+bool
+operator<=(const sys_time<Duration>& x, const leap_second& y)
+{
+    return !(y < x);
+}
+
+template <class Duration>
+inline
+bool
+operator>=(const leap_second& x, const sys_time<Duration>& y)
+{
+    return !(x < y);
+}
+
+template <class Duration>
+inline
+bool
+operator>=(const sys_time<Duration>& x, const leap_second& y)
+{
+    return !(x < y);
+}
+
+using leap = leap_second;
+
+#ifdef _WIN32
+
+namespace detail
+{
+
+// The time zone mapping is modelled after this data file:
+// http://unicode.org/repos/cldr/trunk/common/supplemental/windowsZones.xml
+// and the field names match the element names from the mapZone element
+// of windowsZones.xml.
+// The website displays this file here:
+// http://www.unicode.org/cldr/charts/latest/supplemental/zone_tzid.html
+// The html view is sorted before being displayed but is otherwise the same
+// There is a mapping between the os centric view (in this case windows)
+// the html displays uses and the generic view the xml file.
+// That mapping is this:
+// display column "windows" -> xml field "other".
+// display column "region"  -> xml field "territory".
+// display column "tzid"    -> xml field "type".
+// This structure uses the generic terminology because it could be
+// used to to support other os/native name conversions, not just windows,
+// and using the same generic names helps retain the connection to the
+// origin of the data that we are using.
+struct timezone_mapping
+{
+    timezone_mapping(const char* other, const char* territory, const char* type)
+        : other(other), territory(territory), type(type)
+    {
+    }
+    timezone_mapping() = default;
+    std::string other;
+    std::string territory;
+    std::string type;
+};
+
+}  // detail
+
+#endif  // _WIN32
+
+struct tzdb
+{
+    std::string                 version = "unknown";
+    std::vector<time_zone>      zones;
+#if !USE_OS_TZDB
+    std::vector<time_zone_link> links;
+#endif
+    std::vector<leap_second>    leap_seconds;
+#if !USE_OS_TZDB
+    std::vector<detail::Rule>   rules;
+#endif
+#ifdef _WIN32
+    std::vector<detail::timezone_mapping> mappings;
+#endif
+    tzdb* next = nullptr;
+
+    tzdb() = default;
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+    tzdb(tzdb&&) = default;
+    tzdb& operator=(tzdb&&) = default;
+#else  // defined(_MSC_VER) && (_MSC_VER < 1900)
+    tzdb(tzdb&& src)
+        : version(std::move(src.version))
+        , zones(std::move(src.zones))
+        , links(std::move(src.links))
+        , leap_seconds(std::move(src.leap_seconds))
+        , rules(std::move(src.rules))
+        , mappings(std::move(src.mappings))
+    {}
+
+    tzdb& operator=(tzdb&& src)
+    {
+        version = std::move(src.version);
+        zones = std::move(src.zones);
+        links = std::move(src.links);
+        leap_seconds = std::move(src.leap_seconds);
+        rules = std::move(src.rules);
+        mappings = std::move(src.mappings);
+        return *this;
+    }
+#endif  // defined(_MSC_VER) && (_MSC_VER < 1900)
+
+#if HAS_STRING_VIEW
+    DATE_API const time_zone* locate_zone(std::string_view tz_name) const;
+#else
+    DATE_API const time_zone* locate_zone(const std::string& tz_name) const;
+#endif
+    DATE_API const time_zone* current_zone() const;
+};
+
+using TZ_DB = tzdb;
+
+DATE_API std::ostream&
+operator<<(std::ostream& os, const tzdb& db);
+
+DATE_API const tzdb& get_tzdb();
+
+class tzdb_list
+{
+    std::atomic<tzdb*> head_{nullptr};
+
+public:
+    DATE_API ~tzdb_list();
+    tzdb_list() = default;
+    DATE_API tzdb_list(tzdb_list&& x) NOEXCEPT;
+
+    const tzdb& front() const NOEXCEPT {return *head_;}
+          tzdb& front()       NOEXCEPT {return *head_;}
+
+    class const_iterator;
+
+    const_iterator begin() const NOEXCEPT;
+    const_iterator end() const NOEXCEPT;
+
+    const_iterator cbegin() const NOEXCEPT;
+    const_iterator cend() const NOEXCEPT;
+
+    DATE_API const_iterator erase_after(const_iterator p) NOEXCEPT;
+
+    struct undocumented_helper;
+private:
+    void push_front(tzdb* tzdb) NOEXCEPT;
+};
+
+class tzdb_list::const_iterator
+{
+    tzdb* p_ = nullptr;
+
+    explicit const_iterator(tzdb* p) NOEXCEPT : p_{p} {}
+public:
+    const_iterator() = default;
+
+    using iterator_category = std::forward_iterator_tag;
+    using value_type        = tzdb;
+    using reference         = const value_type&;
+    using pointer           = const value_type*;
+    using difference_type   = std::ptrdiff_t;
+
+    reference operator*() const NOEXCEPT {return *p_;}
+    pointer  operator->() const NOEXCEPT {return p_;}
+
+    const_iterator& operator++() NOEXCEPT {p_ = p_->next; return *this;}
+    const_iterator  operator++(int) NOEXCEPT {auto t = *this; ++(*this); return t;}
+
+    friend
+    bool
+    operator==(const const_iterator& x, const const_iterator& y) NOEXCEPT
+        {return x.p_ == y.p_;}
+
+    friend
+    bool
+    operator!=(const const_iterator& x, const const_iterator& y) NOEXCEPT
+        {return !(x == y);}
+
+    friend class tzdb_list;
+};
+
+inline
+tzdb_list::const_iterator
+tzdb_list::begin() const NOEXCEPT
+{
+    return const_iterator{head_};
+}
+
+inline
+tzdb_list::const_iterator
+tzdb_list::end() const NOEXCEPT
+{
+    return const_iterator{nullptr};
+}
+
+inline
+tzdb_list::const_iterator
+tzdb_list::cbegin() const NOEXCEPT
+{
+    return begin();
+}
+
+inline
+tzdb_list::const_iterator
+tzdb_list::cend() const NOEXCEPT
+{
+    return end();
+}
+
+DATE_API tzdb_list& get_tzdb_list();
+
+#if !USE_OS_TZDB
+
+DATE_API const tzdb& reload_tzdb();
+DATE_API void        set_install(const std::string& install);
+
+#endif  // !USE_OS_TZDB
+
+#if HAS_REMOTE_API
+
+DATE_API std::string remote_version();
+// if provided error_buffer size should be at least CURL_ERROR_SIZE
+DATE_API bool        remote_download(const std::string& version, char* error_buffer = nullptr);
+DATE_API bool        remote_install(const std::string& version);
+
+#endif
+
+// zoned_time
+
+namespace detail
+{
+
+template <class T>
+inline
+T*
+to_raw_pointer(T* p) NOEXCEPT
+{
+    return p;
+}
+
+template <class Pointer>
+inline
+auto
+to_raw_pointer(Pointer p) NOEXCEPT
+    -> decltype(detail::to_raw_pointer(p.operator->()))
+{
+    return detail::to_raw_pointer(p.operator->());
+}
+
+}  // namespace detail
+
+template <class Duration, class TimeZonePtr>
+template <class TimeZonePtr2>
+inline
+TimeZonePtr2&&
+zoned_time<Duration, TimeZonePtr>::check(TimeZonePtr2&& p)
+{
+    if (detail::to_raw_pointer(p) == nullptr)
+        throw std::runtime_error(
+            "zoned_time constructed with a time zone pointer == nullptr");
+    return std::forward<TimeZonePtr2>(p);
+}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time()
+    : zone_(check(zoned_traits<TimeZonePtr>::default_zone()))
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const sys_time<Duration>& st)
+    : zone_(check(zoned_traits<TimeZonePtr>::default_zone()))
+    , tp_(st)
+    {}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z)
+    : zone_(check(std::move(z)))
+    {}
+
+#if HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name))
+    {}
+
+#else  // !HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name))
+    {}
+
+#endif  // !HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const zoned_time<Duration2, TimeZonePtr>& zt) NOEXCEPT
+    : zone_(zt.zone_)
+    , tp_(zt.tp_)
+    {}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z, const sys_time<Duration>& st)
+    : zone_(check(std::move(z)))
+    , tp_(st)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z, const local_time<Duration>& t)
+    : zone_(check(std::move(z)))
+    , tp_(zone_->to_sys(t))
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z, const local_time<Duration>& t,
+                                              choose c)
+    : zone_(check(std::move(z)))
+    , tp_(zone_->to_sys(t, c))
+    {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt)
+    : zone_(check(std::move(z)))
+    , tp_(zt.tp_)
+    {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(TimeZonePtr z,
+                                      const zoned_time<Duration2, TimeZonePtr2>& zt, choose)
+    : zoned_time(std::move(z), zt)
+    {}
+
+#if HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+                                              detail::nodeduct_t<const sys_time<Duration>&> st)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), st)
+    {}
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+                                              detail::nodeduct_t<const local_time<Duration>&> t)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t)
+    {}
+
+template <class Duration, class TimeZonePtr>
+template <class T, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+                                              detail::nodeduct_t<const local_time<Duration>&> t, choose c)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t, c)
+    {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt)
+    {}
+
+template <class Duration, class TimeZonePtr>
+template <class Duration2, class TimeZonePtr2, class, class>
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(std::string_view name,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt,
+                                              choose c)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt, c)
+    {}
+
+#else  // !HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+                                              const sys_time<Duration>& st)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), st)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+                                              const sys_time<Duration>& st)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), st)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+                                              const local_time<Duration>& t)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+                                              const local_time<Duration>& t)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+                                              const local_time<Duration>& t, choose c)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t, c)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class T, class>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+                                              const local_time<Duration>& t, choose c)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), t, c)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const std::string& name,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt,
+                                              choose c)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt, c)
+    {}
+
+template <class Duration, class TimeZonePtr>
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+template <class Duration2, class TimeZonePtr2, class, class>
+#else
+template <class Duration2, class TimeZonePtr2>
+#endif
+inline
+zoned_time<Duration, TimeZonePtr>::zoned_time(const char* name,
+                                              const zoned_time<Duration2, TimeZonePtr2>& zt,
+                                              choose c)
+    : zoned_time(zoned_traits<TimeZonePtr>::locate_zone(name), zt, c)
+    {}
+
+#endif  // HAS_STRING_VIEW
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>&
+zoned_time<Duration, TimeZonePtr>::operator=(const sys_time<Duration>& st)
+{
+    tp_ = st;
+    return *this;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>&
+zoned_time<Duration, TimeZonePtr>::operator=(const local_time<Duration>& ut)
+{
+    tp_ = zone_->to_sys(ut);
+    return *this;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::operator local_time<typename zoned_time<Duration, TimeZonePtr>::duration>() const
+{
+    return get_local_time();
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>::operator sys_time<typename zoned_time<Duration, TimeZonePtr>::duration>() const
+{
+    return get_sys_time();
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+TimeZonePtr
+zoned_time<Duration, TimeZonePtr>::get_time_zone() const
+{
+    return zone_;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+local_time<typename zoned_time<Duration, TimeZonePtr>::duration>
+zoned_time<Duration, TimeZonePtr>::get_local_time() const
+{
+    return zone_->to_local(tp_);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+sys_time<typename zoned_time<Duration, TimeZonePtr>::duration>
+zoned_time<Duration, TimeZonePtr>::get_sys_time() const
+{
+    return tp_;
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+sys_info
+zoned_time<Duration, TimeZonePtr>::get_info() const
+{
+    return zone_->get_info(tp_);
+}
+
+// make_zoned_time
+
+inline
+zoned_time<std::chrono::seconds>
+make_zoned()
+{
+    return zoned_time<std::chrono::seconds>();
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const sys_time<Duration>& tp)
+{
+    return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>(tp);
+}
+
+template <class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+          , class = typename std::enable_if
+          <
+            std::is_class
+            <
+                typename std::decay
+                <
+                    decltype(*detail::to_raw_pointer(std::declval<TimeZonePtr&>()))
+                >::type
+            >{}
+          >::type
+#endif
+#endif
+         >
+inline
+zoned_time<std::chrono::seconds, TimeZonePtr>
+make_zoned(TimeZonePtr z)
+{
+    return zoned_time<std::chrono::seconds, TimeZonePtr>(std::move(z));
+}
+
+inline
+zoned_seconds
+make_zoned(const std::string& name)
+{
+    return zoned_seconds(name);
+}
+
+template <class Duration, class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+          , class = typename std::enable_if
+          <
+            std::is_class<typename std::decay<decltype(*std::declval<TimeZonePtr&>())>::type>{}
+          >::type
+#endif
+#endif
+         >
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const local_time<Duration>& tp)
+{
+    return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type,
+                      TimeZonePtr>(std::move(zone), tp);
+}
+
+template <class Duration, class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+          , class = typename std::enable_if
+          <
+            std::is_class<typename std::decay<decltype(*std::declval<TimeZonePtr&>())>::type>{}
+          >::type
+#endif
+#endif
+         >
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const local_time<Duration>& tp, choose c)
+{
+    return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type,
+                      TimeZonePtr>(std::move(zone), tp, c);
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const std::string& name, const local_time<Duration>& tp)
+{
+    return zoned_time<typename std::common_type<Duration,
+                      std::chrono::seconds>::type>(name, tp);
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const std::string& name, const local_time<Duration>& tp, choose c)
+{
+    return zoned_time<typename std::common_type<Duration,
+                      std::chrono::seconds>::type>(name, tp, c);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const zoned_time<Duration, TimeZonePtr>& zt)
+{
+    return zoned_time<Duration, TimeZonePtr>(std::move(zone), zt);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(const std::string& name, const zoned_time<Duration, TimeZonePtr>& zt)
+{
+    return zoned_time<Duration, TimeZonePtr>(name, zt);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const zoned_time<Duration, TimeZonePtr>& zt, choose c)
+{
+    return zoned_time<Duration, TimeZonePtr>(std::move(zone), zt, c);
+}
+
+template <class Duration, class TimeZonePtr>
+inline
+zoned_time<Duration, TimeZonePtr>
+make_zoned(const std::string& name, const zoned_time<Duration, TimeZonePtr>& zt, choose c)
+{
+    return zoned_time<Duration, TimeZonePtr>(name, zt, c);
+}
+
+template <class Duration, class TimeZonePtr
+#if !defined(_MSC_VER) || (_MSC_VER > 1916)
+#if !defined(__INTEL_COMPILER) || (__INTEL_COMPILER > 1600)
+          , class = typename std::enable_if
+          <
+            std::is_class<typename std::decay<decltype(*std::declval<TimeZonePtr&>())>::type>{}
+          >::type
+#endif
+#endif
+         >
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type, TimeZonePtr>
+make_zoned(TimeZonePtr zone, const sys_time<Duration>& st)
+{
+    return zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type,
+                      TimeZonePtr>(std::move(zone), st);
+}
+
+template <class Duration>
+inline
+zoned_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+make_zoned(const std::string& name, const sys_time<Duration>& st)
+{
+    return zoned_time<typename std::common_type<Duration,
+                      std::chrono::seconds>::type>(name, st);
+}
+
+template <class CharT, class Traits, class Duration, class TimeZonePtr>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const zoned_time<Duration, TimeZonePtr>& tp)
+{
+    using duration = typename zoned_time<Duration, TimeZonePtr>::duration;
+    using LT = local_time<duration>;
+    auto const st = tp.get_sys_time();
+    auto const info = tp.get_time_zone()->get_info(st);
+    return to_stream(os, fmt, LT{(st+info.offset).time_since_epoch()},
+                     &info.abbrev, &info.offset);
+}
+
+template <class CharT, class Traits, class Duration, class TimeZonePtr>
+inline
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const zoned_time<Duration, TimeZonePtr>& t)
+{
+    const CharT fmt[] = {'%', 'F', ' ', '%', 'T', ' ', '%', 'Z', CharT{}};
+    return to_stream(os, fmt, t);
+}
+
+class utc_clock
+{
+public:
+    using duration                  = std::chrono::system_clock::duration;
+    using rep                       = duration::rep;
+    using period                    = duration::period;
+    using time_point                = std::chrono::time_point<utc_clock>;
+    static CONSTDATA bool is_steady = false;
+
+    static time_point now();
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<std::chrono::system_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    to_sys(const std::chrono::time_point<utc_clock, Duration>&);
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    from_sys(const std::chrono::time_point<std::chrono::system_clock, Duration>&);
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<local_t, typename std::common_type<Duration, std::chrono::seconds>::type>
+    to_local(const std::chrono::time_point<utc_clock, Duration>&);
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    from_local(const std::chrono::time_point<local_t, Duration>&);
+};
+
+template <class Duration>
+    using utc_time = std::chrono::time_point<utc_clock, Duration>;
+
+using utc_seconds = utc_time<std::chrono::seconds>;
+
+template <class Duration>
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::from_sys(const sys_time<Duration>& st)
+{
+    using std::chrono::seconds;
+    using CD = typename std::common_type<Duration, seconds>::type;
+    auto const& leaps = get_tzdb().leap_seconds;
+    auto const lt = std::upper_bound(leaps.begin(), leaps.end(), st);
+    return utc_time<CD>{st.time_since_epoch() + seconds{lt-leaps.begin()}};
+}
+
+// Return pair<is_leap_second, seconds{number_of_leap_seconds_since_1970}>
+// first is true if ut is during a leap second insertion, otherwise false.
+// If ut is during a leap second insertion, that leap second is included in the count
+template <class Duration>
+std::pair<bool, std::chrono::seconds>
+is_leap_second(date::utc_time<Duration> const& ut)
+{
+    using std::chrono::seconds;
+    using duration = typename std::common_type<Duration, seconds>::type;
+    auto const& leaps = get_tzdb().leap_seconds;
+    auto tp = sys_time<duration>{ut.time_since_epoch()};
+    auto const lt = std::upper_bound(leaps.begin(), leaps.end(), tp);
+    auto ds = seconds{lt-leaps.begin()};
+    tp -= ds;
+    auto ls = false;
+    if (lt > leaps.begin())
+    {
+        if (tp < lt[-1])
+        {
+            if (tp >= lt[-1].date() - seconds{1})
+                ls = true;
+            else
+                --ds;
+        }
+    }
+    return {ls, ds};
+}
+
+struct leap_second_info
+{
+    bool is_leap_second;
+    std::chrono::seconds elapsed;
+};
+
+template <class Duration>
+leap_second_info
+get_leap_second_info(date::utc_time<Duration> const& ut)
+{
+    auto p = is_leap_second(ut);
+    return {p.first, p.second};
+}
+
+template <class Duration>
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::to_sys(const utc_time<Duration>& ut)
+{
+    using std::chrono::seconds;
+    using CD = typename std::common_type<Duration, seconds>::type;
+    auto ls = is_leap_second(ut);
+    auto tp = sys_time<CD>{ut.time_since_epoch() - ls.second};
+    if (ls.first)
+        tp = floor<seconds>(tp) + seconds{1} - CD{1};
+    return tp;
+}
+
+inline
+utc_clock::time_point
+utc_clock::now()
+{
+    return from_sys(std::chrono::system_clock::now());
+}
+
+template <class Duration>
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::from_local(const local_time<Duration>& st)
+{
+    return from_sys(sys_time<Duration>{st.time_since_epoch()});
+}
+
+template <class Duration>
+local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+utc_clock::to_local(const utc_time<Duration>& ut)
+{
+    using CD = typename std::common_type<Duration, std::chrono::seconds>::type;
+    return local_time<CD>{to_sys(ut).time_since_epoch()};
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const utc_time<Duration>& t)
+{
+    using std::chrono::seconds;
+    using CT = typename std::common_type<Duration, seconds>::type;
+    const std::string abbrev("UTC");
+    CONSTDATA seconds offset{0};
+    auto ls = is_leap_second(t);
+    auto tp = sys_time<CT>{t.time_since_epoch() - ls.second};
+    auto const sd = floor<days>(tp);
+    year_month_day ymd = sd;
+    auto time = make_time(tp - sys_seconds{sd});
+    time.seconds(detail::undocumented{}) += seconds{ls.first};
+    fields<CT> fds{ymd, time};
+    return to_stream(os, fmt, fds, &abbrev, &offset);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const utc_time<Duration>& t)
+{
+    const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}};
+    return to_stream(os, fmt, t);
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            utc_time<Duration>& tp, std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    using std::chrono::seconds;
+    using std::chrono::minutes;
+    using CT = typename std::common_type<Duration, seconds>::type;
+    minutes offset_local{};
+    auto offptr = offset ? offset : &offset_local;
+    fields<CT> fds{};
+    fds.has_tod = true;
+    from_stream(is, fmt, fds, abbrev, offptr);
+    if (!fds.ymd.ok())
+        is.setstate(std::ios::failbit);
+    if (!is.fail())
+    {
+        bool is_60_sec = fds.tod.seconds() == seconds{60};
+        if (is_60_sec)
+            fds.tod.seconds(detail::undocumented{}) -= seconds{1};
+        auto tmp = utc_clock::from_sys(sys_days(fds.ymd) - *offptr + fds.tod.to_duration());
+        if (is_60_sec)
+            tmp += seconds{1};
+        if (is_60_sec != is_leap_second(tmp).first || !fds.tod.in_conventional_range())
+        {
+            is.setstate(std::ios::failbit);
+            return is;
+        }
+        tp = std::chrono::time_point_cast<Duration>(tmp);
+    }
+    return is;
+}
+
+// tai_clock
+
+class tai_clock
+{
+public:
+    using duration                  = std::chrono::system_clock::duration;
+    using rep                       = duration::rep;
+    using period                    = duration::period;
+    using time_point                = std::chrono::time_point<tai_clock>;
+    static const bool is_steady     = false;
+
+    static time_point now();
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    to_utc(const std::chrono::time_point<tai_clock, Duration>&) NOEXCEPT;
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<tai_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    from_utc(const std::chrono::time_point<utc_clock, Duration>&) NOEXCEPT;
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<local_t, typename std::common_type<Duration, date::days>::type>
+    to_local(const std::chrono::time_point<tai_clock, Duration>&) NOEXCEPT;
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<tai_clock, typename std::common_type<Duration, date::days>::type>
+    from_local(const std::chrono::time_point<local_t, Duration>&) NOEXCEPT;
+};
+
+template <class Duration>
+    using tai_time = std::chrono::time_point<tai_clock, Duration>;
+
+using tai_seconds = tai_time<std::chrono::seconds>;
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+tai_clock::to_utc(const tai_time<Duration>& t) NOEXCEPT
+{
+    using std::chrono::seconds;
+    using CD = typename std::common_type<Duration, seconds>::type;
+    return utc_time<CD>{t.time_since_epoch()} -
+            (sys_days(year{1970}/January/1) - sys_days(year{1958}/January/1) + seconds{10});
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+tai_clock::from_utc(const utc_time<Duration>& t) NOEXCEPT
+{
+    using std::chrono::seconds;
+    using CD = typename std::common_type<Duration, seconds>::type;
+    return tai_time<CD>{t.time_since_epoch()} +
+            (sys_days(year{1970}/January/1) - sys_days(year{1958}/January/1) + seconds{10});
+}
+
+inline
+tai_clock::time_point
+tai_clock::now()
+{
+    return from_utc(utc_clock::now());
+}
+
+template <class Duration>
+inline
+local_time<typename std::common_type<Duration, date::days>::type>
+tai_clock::to_local(const tai_time<Duration>& t) NOEXCEPT
+{
+    using CD = typename std::common_type<Duration, date::days>::type;
+    return local_time<CD>{t.time_since_epoch()} -
+           (local_days(year{1970}/January/1) - local_days(year{1958}/January/1));
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, date::days>::type>
+tai_clock::from_local(const local_time<Duration>& t) NOEXCEPT
+{
+    using CD = typename std::common_type<Duration, date::days>::type;
+    return tai_time<CD>{t.time_since_epoch()} +
+            (local_days(year{1970}/January/1) - local_days(year{1958}/January/1));
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const tai_time<Duration>& t)
+{
+    const std::string abbrev("TAI");
+    CONSTDATA std::chrono::seconds offset{0};
+    return to_stream(os, fmt, tai_clock::to_local(t), &abbrev, &offset);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const tai_time<Duration>& t)
+{
+    const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}};
+    return to_stream(os, fmt, t);
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            tai_time<Duration>& tp,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    local_time<Duration> lp;
+    from_stream(is, fmt, lp, abbrev, offset);
+    if (!is.fail())
+        tp = tai_clock::from_local(lp);
+    return is;
+}
+
+// gps_clock
+
+class gps_clock
+{
+public:
+    using duration                  = std::chrono::system_clock::duration;
+    using rep                       = duration::rep;
+    using period                    = duration::period;
+    using time_point                = std::chrono::time_point<gps_clock>;
+    static const bool is_steady     = false;
+
+    static time_point now();
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<utc_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    to_utc(const std::chrono::time_point<gps_clock, Duration>&) NOEXCEPT;
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<gps_clock, typename std::common_type<Duration, std::chrono::seconds>::type>
+    from_utc(const std::chrono::time_point<utc_clock, Duration>&) NOEXCEPT;
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<local_t, typename std::common_type<Duration, date::days>::type>
+    to_local(const std::chrono::time_point<gps_clock, Duration>&) NOEXCEPT;
+
+    template<typename Duration>
+    static
+    std::chrono::time_point<gps_clock, typename std::common_type<Duration, date::days>::type>
+    from_local(const std::chrono::time_point<local_t, Duration>&) NOEXCEPT;
+};
+
+template <class Duration>
+    using gps_time = std::chrono::time_point<gps_clock, Duration>;
+
+using gps_seconds = gps_time<std::chrono::seconds>;
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+gps_clock::to_utc(const gps_time<Duration>& t) NOEXCEPT
+{
+    using std::chrono::seconds;
+    using CD = typename std::common_type<Duration, seconds>::type;
+    return utc_time<CD>{t.time_since_epoch()} +
+            (sys_days(year{1980}/January/Sunday[1]) - sys_days(year{1970}/January/1) +
+             seconds{9});
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+gps_clock::from_utc(const utc_time<Duration>& t) NOEXCEPT
+{
+    using std::chrono::seconds;
+    using CD = typename std::common_type<Duration, seconds>::type;
+    return gps_time<CD>{t.time_since_epoch()} -
+            (sys_days(year{1980}/January/Sunday[1]) - sys_days(year{1970}/January/1) +
+             seconds{9});
+}
+
+inline
+gps_clock::time_point
+gps_clock::now()
+{
+    return from_utc(utc_clock::now());
+}
+
+template <class Duration>
+inline
+local_time<typename std::common_type<Duration, date::days>::type>
+gps_clock::to_local(const gps_time<Duration>& t) NOEXCEPT
+{
+    using CD = typename std::common_type<Duration, date::days>::type;
+    return local_time<CD>{t.time_since_epoch()} +
+            (local_days(year{1980}/January/Sunday[1]) - local_days(year{1970}/January/1));
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, date::days>::type>
+gps_clock::from_local(const local_time<Duration>& t) NOEXCEPT
+{
+    using CD = typename std::common_type<Duration, date::days>::type;
+    return gps_time<CD>{t.time_since_epoch()} -
+            (local_days(year{1980}/January/Sunday[1]) - local_days(year{1970}/January/1));
+}
+
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+to_stream(std::basic_ostream<CharT, Traits>& os, const CharT* fmt,
+          const gps_time<Duration>& t)
+{
+    const std::string abbrev("GPS");
+    CONSTDATA std::chrono::seconds offset{0};
+    return to_stream(os, fmt, gps_clock::to_local(t), &abbrev, &offset);
+}
+
+template <class CharT, class Traits, class Duration>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const gps_time<Duration>& t)
+{
+    const CharT fmt[] = {'%', 'F', ' ', '%', 'T', CharT{}};
+    return to_stream(os, fmt, t);
+}
+
+template <class Duration, class CharT, class Traits, class Alloc = std::allocator<CharT>>
+std::basic_istream<CharT, Traits>&
+from_stream(std::basic_istream<CharT, Traits>& is, const CharT* fmt,
+            gps_time<Duration>& tp,
+            std::basic_string<CharT, Traits, Alloc>* abbrev = nullptr,
+            std::chrono::minutes* offset = nullptr)
+{
+    local_time<Duration> lp;
+    from_stream(is, fmt, lp, abbrev, offset);
+    if (!is.fail())
+        tp = gps_clock::from_local(lp);
+    return is;
+}
+
+// clock_time_conversion
+
+template <class DstClock, class SrcClock>
+struct clock_time_conversion
+{};
+
+template <>
+struct clock_time_conversion<std::chrono::system_clock, std::chrono::system_clock>
+{
+    template <class Duration>
+    CONSTCD14
+    sys_time<Duration>
+    operator()(const sys_time<Duration>& st) const
+    {
+        return st;
+    }
+};
+
+template <>
+struct clock_time_conversion<utc_clock, utc_clock>
+{
+    template <class Duration>
+    CONSTCD14
+    utc_time<Duration>
+    operator()(const utc_time<Duration>& ut) const
+    {
+        return ut;
+    }
+};
+
+template<>
+struct clock_time_conversion<local_t, local_t>
+{
+    template <class Duration>
+    CONSTCD14
+    local_time<Duration>
+    operator()(const local_time<Duration>& lt) const
+    {
+        return lt;
+    }
+};
+
+template <>
+struct clock_time_conversion<utc_clock, std::chrono::system_clock>
+{
+    template <class Duration>
+    utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+    operator()(const sys_time<Duration>& st) const
+    {
+        return utc_clock::from_sys(st);
+    }
+};
+
+template <>
+struct clock_time_conversion<std::chrono::system_clock, utc_clock>
+{
+    template <class Duration>
+    sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+    operator()(const utc_time<Duration>& ut) const
+    {
+        return utc_clock::to_sys(ut);
+    }
+};
+
+template<>
+struct clock_time_conversion<local_t, std::chrono::system_clock>
+{
+    template <class Duration>
+    CONSTCD14
+    local_time<Duration>
+    operator()(const sys_time<Duration>& st) const
+    {
+       return local_time<Duration>{st.time_since_epoch()};
+    }
+};
+
+template<>
+struct clock_time_conversion<std::chrono::system_clock, local_t>
+{
+    template <class Duration>
+    CONSTCD14
+    sys_time<Duration>
+    operator()(const local_time<Duration>& lt) const
+    {
+        return sys_time<Duration>{lt.time_since_epoch()};
+    }
+};
+
+template<>
+struct clock_time_conversion<utc_clock, local_t>
+{
+    template <class Duration>
+    utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+    operator()(const local_time<Duration>& lt) const
+    {
+       return utc_clock::from_local(lt);
+    }
+};
+
+template<>
+struct clock_time_conversion<local_t, utc_clock>
+{
+    template <class Duration>
+    local_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+    operator()(const utc_time<Duration>& ut) const
+    {
+       return utc_clock::to_local(ut);
+    }
+};
+
+template<typename Clock>
+struct clock_time_conversion<Clock, Clock>
+{
+    template <class Duration>
+    CONSTCD14
+    std::chrono::time_point<Clock, Duration>
+    operator()(const std::chrono::time_point<Clock, Duration>& tp) const
+    {
+        return tp;
+    }
+};
+
+namespace ctc_detail
+{
+
+template <class Clock, class Duration>
+    using time_point = std::chrono::time_point<Clock, Duration>;
+
+using std::declval;
+using std::chrono::system_clock;
+
+//Check if TimePoint is time for given clock,
+//if not emits hard error
+template <class Clock, class TimePoint>
+struct return_clock_time
+{
+    using clock_time_point = time_point<Clock, typename TimePoint::duration>;
+    using type             = TimePoint;
+
+    static_assert(std::is_same<TimePoint, clock_time_point>::value,
+                  "time point with appropariate clock shall be returned");
+};
+
+// Check if Clock has to_sys method accepting TimePoint with given duration const& and
+// returning sys_time. If so has nested type member equal to return type to_sys.
+template <class Clock, class Duration, class = void>
+struct return_to_sys
+{};
+
+template <class Clock, class Duration>
+struct return_to_sys
+       <
+           Clock, Duration,
+           decltype(Clock::to_sys(declval<time_point<Clock, Duration> const&>()), void())
+       >
+    : return_clock_time
+      <
+          system_clock,
+          decltype(Clock::to_sys(declval<time_point<Clock, Duration> const&>()))
+      >
+{};
+
+// Similiar to above
+template <class Clock, class Duration, class = void>
+struct return_from_sys
+{};
+
+template <class Clock, class Duration>
+struct return_from_sys
+       <
+           Clock, Duration,
+           decltype(Clock::from_sys(declval<time_point<system_clock, Duration> const&>()),
+                    void())
+       >
+    : return_clock_time
+      <
+          Clock,
+          decltype(Clock::from_sys(declval<time_point<system_clock, Duration> const&>()))
+      >
+{};
+
+// Similiar to above
+template <class Clock, class Duration, class = void>
+struct return_to_utc
+{};
+
+template <class Clock, class Duration>
+struct return_to_utc
+       <
+           Clock, Duration,
+           decltype(Clock::to_utc(declval<time_point<Clock, Duration> const&>()), void())
+       >
+    : return_clock_time
+      <
+          utc_clock,
+          decltype(Clock::to_utc(declval<time_point<Clock, Duration> const&>()))>
+{};
+
+// Similiar to above
+template <class Clock, class Duration, class = void>
+struct return_from_utc
+{};
+
+template <class Clock, class Duration>
+struct return_from_utc
+       <
+           Clock, Duration,
+           decltype(Clock::from_utc(declval<time_point<utc_clock, Duration> const&>()),
+                    void())
+       >
+    : return_clock_time
+      <
+          Clock,
+          decltype(Clock::from_utc(declval<time_point<utc_clock, Duration> const&>()))
+      >
+{};
+
+// Similiar to above
+template<typename Clock, typename Duration, typename = void>
+struct return_to_local
+{};
+
+template<typename Clock, typename Duration>
+struct return_to_local
+       <
+          Clock, Duration,
+          decltype(Clock::to_local(declval<time_point<Clock, Duration> const&>()),
+                   void())
+       >
+     : return_clock_time
+       <
+           local_t,
+           decltype(Clock::to_local(declval<time_point<Clock, Duration> const&>()))
+       >
+{};
+
+// Similiar to above
+template<typename Clock, typename Duration, typename = void>
+struct return_from_local
+{};
+
+template<typename Clock, typename Duration>
+struct return_from_local
+       <
+           Clock, Duration,
+           decltype(Clock::from_local(declval<time_point<local_t, Duration> const&>()),
+                    void())
+       >
+     : return_clock_time
+       <
+           Clock,
+           decltype(Clock::from_local(declval<time_point<local_t, Duration> const&>()))
+       >
+{};
+
+}  // namespace ctc_detail
+
+template <class SrcClock>
+struct clock_time_conversion<std::chrono::system_clock, SrcClock>
+{
+    template <class Duration>
+    CONSTCD14
+    typename ctc_detail::return_to_sys<SrcClock, Duration>::type
+    operator()(const std::chrono::time_point<SrcClock, Duration>& tp) const
+    {
+        return SrcClock::to_sys(tp);
+    }
+};
+
+template <class DstClock>
+struct clock_time_conversion<DstClock, std::chrono::system_clock>
+{
+    template <class Duration>
+    CONSTCD14
+    typename ctc_detail::return_from_sys<DstClock, Duration>::type
+    operator()(const sys_time<Duration>& st) const
+    {
+        return DstClock::from_sys(st);
+    }
+};
+
+template <class SrcClock>
+struct clock_time_conversion<utc_clock, SrcClock>
+{
+    template <class Duration>
+    CONSTCD14
+    typename ctc_detail::return_to_utc<SrcClock, Duration>::type
+    operator()(const std::chrono::time_point<SrcClock, Duration>& tp) const
+    {
+        return SrcClock::to_utc(tp);
+    }
+};
+
+template <class DstClock>
+struct clock_time_conversion<DstClock, utc_clock>
+{
+    template <class Duration>
+    CONSTCD14
+    typename ctc_detail::return_from_utc<DstClock, Duration>::type
+    operator()(const utc_time<Duration>& ut) const
+    {
+        return DstClock::from_utc(ut);
+    }
+};
+
+template<typename SrcClock>
+struct clock_time_conversion<local_t, SrcClock>
+{
+    template <class Duration>
+    CONSTCD14
+    typename ctc_detail::return_to_local<SrcClock, Duration>::type
+    operator()(const std::chrono::time_point<SrcClock, Duration>& tp) const
+    {
+        return SrcClock::to_local(tp);
+    }
+};
+
+template<typename DstClock>
+struct clock_time_conversion<DstClock, local_t>
+{
+    template <class Duration>
+    CONSTCD14
+    typename ctc_detail::return_from_local<DstClock, Duration>::type
+    operator()(const local_time<Duration>& lt) const
+    {
+        return DstClock::from_local(lt);
+    }
+};
+
+namespace clock_cast_detail
+{
+
+template <class Clock, class Duration>
+    using time_point = std::chrono::time_point<Clock, Duration>;
+using std::chrono::system_clock;
+
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+conv_clock(const time_point<SrcClock, Duration>& t)
+    -> decltype(std::declval<clock_time_conversion<DstClock, SrcClock>>()(t))
+{
+    return clock_time_conversion<DstClock, SrcClock>{}(t);
+}
+
+//direct trait conversion, 1st candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, const time_point<SrcClock, Duration>*)
+    -> decltype(conv_clock<DstClock>(t))
+{
+    return conv_clock<DstClock>(t);
+}
+
+//conversion through sys, 2nd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, const void*)
+    -> decltype(conv_clock<DstClock>(conv_clock<system_clock>(t)))
+{
+    return conv_clock<DstClock>(conv_clock<system_clock>(t));
+}
+
+//conversion through utc, 2nd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, const void*)
+    -> decltype(0,  // MSVC_WORKAROUND
+                conv_clock<DstClock>(conv_clock<utc_clock>(t)))
+{
+    return conv_clock<DstClock>(conv_clock<utc_clock>(t));
+}
+
+//conversion through sys and utc, 3rd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, ...)
+    -> decltype(conv_clock<DstClock>(conv_clock<utc_clock>(conv_clock<system_clock>(t))))
+{
+    return conv_clock<DstClock>(conv_clock<utc_clock>(conv_clock<system_clock>(t)));
+}
+
+//conversion through utc and sys, 3rd candidate
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+cc_impl(const time_point<SrcClock, Duration>& t, ...)
+    -> decltype(0,  // MSVC_WORKAROUND
+                conv_clock<DstClock>(conv_clock<system_clock>(conv_clock<utc_clock>(t))))
+{
+    return conv_clock<DstClock>(conv_clock<system_clock>(conv_clock<utc_clock>(t)));
+}
+
+}  // namespace clock_cast_detail
+
+template <class DstClock, class SrcClock, class Duration>
+CONSTCD14
+auto
+clock_cast(const std::chrono::time_point<SrcClock, Duration>& tp)
+    -> decltype(clock_cast_detail::cc_impl<DstClock>(tp, &tp))
+{
+    return clock_cast_detail::cc_impl<DstClock>(tp, &tp);
+}
+
+// Deprecated API
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_sys_time(const utc_time<Duration>& t)
+{
+    return utc_clock::to_sys(t);
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_sys_time(const tai_time<Duration>& t)
+{
+    return utc_clock::to_sys(tai_clock::to_utc(t));
+}
+
+template <class Duration>
+inline
+sys_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_sys_time(const gps_time<Duration>& t)
+{
+    return utc_clock::to_sys(gps_clock::to_utc(t));
+}
+
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_utc_time(const sys_time<Duration>& t)
+{
+    return utc_clock::from_sys(t);
+}
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_utc_time(const tai_time<Duration>& t)
+{
+    return tai_clock::to_utc(t);
+}
+
+template <class Duration>
+inline
+utc_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_utc_time(const gps_time<Duration>& t)
+{
+    return gps_clock::to_utc(t);
+}
+
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_tai_time(const sys_time<Duration>& t)
+{
+    return tai_clock::from_utc(utc_clock::from_sys(t));
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_tai_time(const utc_time<Duration>& t)
+{
+    return tai_clock::from_utc(t);
+}
+
+template <class Duration>
+inline
+tai_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_tai_time(const gps_time<Duration>& t)
+{
+    return tai_clock::from_utc(gps_clock::to_utc(t));
+}
+
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_gps_time(const sys_time<Duration>& t)
+{
+    return gps_clock::from_utc(utc_clock::from_sys(t));
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_gps_time(const utc_time<Duration>& t)
+{
+    return gps_clock::from_utc(t);
+}
+
+template <class Duration>
+inline
+gps_time<typename std::common_type<Duration, std::chrono::seconds>::type>
+to_gps_time(const tai_time<Duration>& t)
+{
+    return gps_clock::from_utc(tai_clock::to_utc(t));
+}
+
+}  // namespace arrow_vendored::date
+
+#endif  // TZ_H
diff --git a/pyarrow/include/arrow/vendored/datetime/tz_private.h b/pyarrow/include/arrow/vendored/datetime/tz_private.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d7f858971106442d7ee9a4f83a2b9da626c0e76
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/datetime/tz_private.h
@@ -0,0 +1,315 @@
+#ifndef TZ_PRIVATE_H
+#define TZ_PRIVATE_H
+
+// The MIT License (MIT)
+//
+// Copyright (c) 2015, 2016 Howard Hinnant
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Our apologies.  When the previous paragraph was written, lowercase had not yet
+// been invented (that would involve another several millennia of evolution).
+// We did not mean to shout.
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+#include "tz.h"
+#else
+#include "date.h"
+#include <vector>
+#endif
+
+namespace arrow_vendored::date
+{
+
+namespace detail
+{
+
+#if !USE_OS_TZDB
+
+enum class tz {utc, local, standard};
+
+//forward declare to avoid warnings in gcc 6.2
+class MonthDayTime;
+std::istream& operator>>(std::istream& is, MonthDayTime& x);
+std::ostream& operator<<(std::ostream& os, const MonthDayTime& x);
+
+
+class MonthDayTime
+{
+private:
+    struct pair
+    {
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+        pair() : month_day_(date::jan / 1), weekday_(0U) {}
+
+        pair(const date::month_day& month_day, const date::weekday& weekday)
+            : month_day_(month_day), weekday_(weekday) {}
+#endif
+
+        date::month_day month_day_;
+        date::weekday   weekday_;
+    };
+
+    enum Type {month_day, month_last_dow, lteq, gteq};
+
+    Type                         type_{month_day};
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+    union U
+#else
+    struct U
+#endif
+    {
+        date::month_day          month_day_;
+        date::month_weekday_last month_weekday_last_;
+        pair                     month_day_weekday_;
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+        U() : month_day_{date::jan/1} {}
+#else
+        U() :
+            month_day_(date::jan/1),
+            month_weekday_last_(date::month(0U), date::weekday_last(date::weekday(0U)))
+        {}
+
+#endif // !defined(_MSC_VER) || (_MSC_VER >= 1900)
+
+        U& operator=(const date::month_day& x);
+        U& operator=(const date::month_weekday_last& x);
+        U& operator=(const pair& x);
+    } u;
+
+    std::chrono::hours           h_{0};
+    std::chrono::minutes         m_{0};
+    std::chrono::seconds         s_{0};
+    tz                           zone_{tz::local};
+
+public:
+    MonthDayTime() = default;
+    MonthDayTime(local_seconds tp, tz timezone);
+    MonthDayTime(const date::month_day& md, tz timezone);
+
+    date::day day() const;
+    date::month month() const;
+    tz zone() const {return zone_;}
+
+    void canonicalize(date::year y);
+
+    sys_seconds
+       to_sys(date::year y, std::chrono::seconds offset, std::chrono::seconds save) const;
+    sys_days to_sys_days(date::year y) const;
+
+    sys_seconds to_time_point(date::year y) const;
+    int compare(date::year y, const MonthDayTime& x, date::year yx,
+                std::chrono::seconds offset, std::chrono::minutes prev_save) const;
+
+    friend std::istream& operator>>(std::istream& is, MonthDayTime& x);
+    friend std::ostream& operator<<(std::ostream& os, const MonthDayTime& x);
+};
+
+// A Rule specifies one or more set of datetimes without using an offset.
+// Multiple dates are specified with multiple years.  The years in effect
+// go from starting_year_ to ending_year_, inclusive.  starting_year_ <=
+// ending_year_. save_ is in effect for times from the specified time
+// onward, including the specified time. When the specified time is
+// local, it uses the save_ from the chronologically previous Rule, or if
+// there is none, 0.
+
+//forward declare to avoid warnings in gcc 6.2
+class Rule;
+bool operator==(const Rule& x, const Rule& y);
+bool operator<(const Rule& x, const Rule& y);
+bool operator==(const Rule& x, const date::year& y);
+bool operator<(const Rule& x, const date::year& y);
+bool operator==(const date::year& x, const Rule& y);
+bool operator<(const date::year& x, const Rule& y);
+bool operator==(const Rule& x, const std::string& y);
+bool operator<(const Rule& x, const std::string& y);
+bool operator==(const std::string& x, const Rule& y);
+bool operator<(const std::string& x, const Rule& y);
+std::ostream& operator<<(std::ostream& os, const Rule& r);
+
+class Rule
+{
+private:
+    std::string          name_;
+    date::year           starting_year_{0};
+    date::year           ending_year_{0};
+    MonthDayTime         starting_at_;
+    std::chrono::minutes save_{0};
+    std::string          abbrev_;
+
+public:
+    Rule() = default;
+    explicit Rule(const std::string& s);
+    Rule(const Rule& r, date::year starting_year, date::year ending_year);
+
+    const std::string& name() const {return name_;}
+    const std::string& abbrev() const {return abbrev_;}
+
+    const MonthDayTime&         mdt()           const {return starting_at_;}
+    const date::year&           starting_year() const {return starting_year_;}
+    const date::year&           ending_year()   const {return ending_year_;}
+    const std::chrono::minutes& save()          const {return save_;}
+
+    static void split_overlaps(std::vector<Rule>& rules);
+
+    friend bool operator==(const Rule& x, const Rule& y);
+    friend bool operator<(const Rule& x, const Rule& y);
+    friend bool operator==(const Rule& x, const date::year& y);
+    friend bool operator<(const Rule& x, const date::year& y);
+    friend bool operator==(const date::year& x, const Rule& y);
+    friend bool operator<(const date::year& x, const Rule& y);
+    friend bool operator==(const Rule& x, const std::string& y);
+    friend bool operator<(const Rule& x, const std::string& y);
+    friend bool operator==(const std::string& x, const Rule& y);
+    friend bool operator<(const std::string& x, const Rule& y);
+
+    friend std::ostream& operator<<(std::ostream& os, const Rule& r);
+
+private:
+    date::day day() const;
+    date::month month() const;
+    static void split_overlaps(std::vector<Rule>& rules, std::size_t i, std::size_t& e);
+    static bool overlaps(const Rule& x, const Rule& y);
+    static void split(std::vector<Rule>& rules, std::size_t i, std::size_t k,
+                      std::size_t& e);
+};
+
+inline bool operator!=(const Rule& x, const Rule& y) {return !(x == y);}
+inline bool operator> (const Rule& x, const Rule& y) {return   y < x;}
+inline bool operator<=(const Rule& x, const Rule& y) {return !(y < x);}
+inline bool operator>=(const Rule& x, const Rule& y) {return !(x < y);}
+
+inline bool operator!=(const Rule& x, const date::year& y) {return !(x == y);}
+inline bool operator> (const Rule& x, const date::year& y) {return   y < x;}
+inline bool operator<=(const Rule& x, const date::year& y) {return !(y < x);}
+inline bool operator>=(const Rule& x, const date::year& y) {return !(x < y);}
+
+inline bool operator!=(const date::year& x, const Rule& y) {return !(x == y);}
+inline bool operator> (const date::year& x, const Rule& y) {return   y < x;}
+inline bool operator<=(const date::year& x, const Rule& y) {return !(y < x);}
+inline bool operator>=(const date::year& x, const Rule& y) {return !(x < y);}
+
+inline bool operator!=(const Rule& x, const std::string& y) {return !(x == y);}
+inline bool operator> (const Rule& x, const std::string& y) {return   y < x;}
+inline bool operator<=(const Rule& x, const std::string& y) {return !(y < x);}
+inline bool operator>=(const Rule& x, const std::string& y) {return !(x < y);}
+
+inline bool operator!=(const std::string& x, const Rule& y) {return !(x == y);}
+inline bool operator> (const std::string& x, const Rule& y) {return   y < x;}
+inline bool operator<=(const std::string& x, const Rule& y) {return !(y < x);}
+inline bool operator>=(const std::string& x, const Rule& y) {return !(x < y);}
+
+struct zonelet
+{
+    enum tag {has_rule, has_save, is_empty};
+
+    std::chrono::seconds gmtoff_;
+    tag tag_ = has_rule;
+
+#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
+    union U
+#else
+    struct U
+#endif
+    {
+        std::string          rule_;
+        std::chrono::minutes save_;
+
+        ~U() {}
+        U() {}
+        U(const U&) {}
+        U& operator=(const U&) = delete;
+    } u;
+
+    std::string                        format_;
+    date::year                         until_year_{0};
+    MonthDayTime                       until_date_;
+    sys_seconds                        until_utc_;
+    local_seconds                      until_std_;
+    local_seconds                      until_loc_;
+    std::chrono::minutes               initial_save_{0};
+    std::string                        initial_abbrev_;
+    std::pair<const Rule*, date::year> first_rule_{nullptr, date::year::min()};
+    std::pair<const Rule*, date::year> last_rule_{nullptr, date::year::max()};
+
+    ~zonelet();
+    zonelet();
+    zonelet(const zonelet& i);
+    zonelet& operator=(const zonelet&) = delete;
+};
+
+#else  // USE_OS_TZDB
+
+struct ttinfo
+{
+    std::int32_t  tt_gmtoff;
+    unsigned char tt_isdst;
+    unsigned char tt_abbrind;
+    unsigned char pad[2];
+};
+
+static_assert(sizeof(ttinfo) == 8, "");
+
+struct expanded_ttinfo
+{
+    std::chrono::seconds offset;
+    std::string          abbrev;
+    bool                 is_dst;
+};
+
+struct transition
+{
+    sys_seconds            timepoint;
+    const expanded_ttinfo* info;
+
+    transition(sys_seconds tp, const expanded_ttinfo* i = nullptr)
+        : timepoint(tp)
+        , info(i)
+        {}
+
+    friend
+    std::ostream&
+    operator<<(std::ostream& os, const transition& t)
+    {
+        date::operator<<(os, t.timepoint) << "Z ";
+        if (t.info->offset >= std::chrono::seconds{0})
+            os << '+';
+        os << make_time(t.info->offset);
+        if (t.info->is_dst > 0)
+            os << " daylight ";
+        else
+            os << " standard ";
+        os << t.info->abbrev;
+        return os;
+    }
+};
+
+#endif  // USE_OS_TZDB
+
+}  // namespace detail
+
+}  // namespace arrow_vendored::date
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+#include "tz.h"
+#endif
+
+#endif  // TZ_PRIVATE_H
diff --git a/pyarrow/include/arrow/vendored/datetime/visibility.h b/pyarrow/include/arrow/vendored/datetime/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..780c00d70bd9f2516efa01ee52f60155aba368e4
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/datetime/visibility.h
@@ -0,0 +1,30 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#ifndef _WIN32
+#  define USE_OS_TZDB 1
+#endif
+
+#if defined(ARROW_STATIC)
+// intentially empty
+#elif defined(ARROW_EXPORTING)
+#  define DATE_BUILD_DLL
+#else
+#  define DATE_USE_DLL
+#endif
diff --git a/pyarrow/include/arrow/vendored/double-conversion/bignum-dtoa.h b/pyarrow/include/arrow/vendored/double-conversion/bignum-dtoa.h
new file mode 100644
index 0000000000000000000000000000000000000000..f56239e8e88956a319aca3ec25fa48e0db4d6547
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/bignum-dtoa.h
@@ -0,0 +1,86 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_
+#define DOUBLE_CONVERSION_BIGNUM_DTOA_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+enum BignumDtoaMode {
+  // Return the shortest correct representation.
+  // For example the output of 0.299999999999999988897 is (the less accurate but
+  // correct) 0.3.
+  BIGNUM_DTOA_SHORTEST,
+  // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats.
+  BIGNUM_DTOA_SHORTEST_SINGLE,
+  // Return a fixed number of digits after the decimal point.
+  // For instance fixed(0.1, 4) becomes 0.1000
+  // If the input number is big, the output will be big.
+  BIGNUM_DTOA_FIXED,
+  // Return a fixed number of digits, no matter what the exponent is.
+  BIGNUM_DTOA_PRECISION
+};
+
+// Converts the given double 'v' to ascii.
+// The result should be interpreted as buffer * 10^(point-length).
+// The buffer will be null-terminated.
+//
+// The input v must be > 0 and different from NaN, and Infinity.
+//
+// The output depends on the given mode:
+//  - SHORTEST: produce the least amount of digits for which the internal
+//   identity requirement is still satisfied. If the digits are printed
+//   (together with the correct exponent) then reading this number will give
+//   'v' again. The buffer will choose the representation that is closest to
+//   'v'. If there are two at the same distance, than the number is round up.
+//   In this mode the 'requested_digits' parameter is ignored.
+//  - FIXED: produces digits necessary to print a given number with
+//   'requested_digits' digits after the decimal point. The produced digits
+//   might be too short in which case the caller has to fill the gaps with '0's.
+//   Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2.
+//   Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns
+//     buffer="2", point=0.
+//   Note: the length of the returned buffer has no meaning wrt the significance
+//   of its digits. That is, just because it contains '0's does not mean that
+//   any other digit would not satisfy the internal identity requirement.
+//  - PRECISION: produces 'requested_digits' where the first digit is not '0'.
+//   Even though the length of produced digits usually equals
+//   'requested_digits', the function is allowed to return fewer digits, in
+//   which case the caller has to fill the missing digits with '0's.
+//   Halfway cases are again rounded up.
+// 'BignumDtoa' expects the given buffer to be big enough to hold all digits
+// and a terminating null-character.
+void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
+                Vector<char> buffer, int* length, int* point);
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_BIGNUM_DTOA_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/bignum.h b/pyarrow/include/arrow/vendored/double-conversion/bignum.h
new file mode 100644
index 0000000000000000000000000000000000000000..0bedb63b188f16e632df75a4b8d4256eb98ba387
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/bignum.h
@@ -0,0 +1,154 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_BIGNUM_H_
+#define DOUBLE_CONVERSION_BIGNUM_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+class Bignum {
+ public:
+  // 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately.
+  // This bignum can encode much bigger numbers, since it contains an
+  // exponent.
+  static const int kMaxSignificantBits = 3584;
+
+  Bignum() : used_bigits_(0), exponent_(0) {}
+
+  void AssignUInt16(const uint16_t value);
+  void AssignUInt64(uint64_t value);
+  void AssignBignum(const Bignum& other);
+
+  void AssignDecimalString(const Vector<const char> value);
+  void AssignHexString(const Vector<const char> value);
+
+  void AssignPowerUInt16(uint16_t base, const int exponent);
+
+  void AddUInt64(const uint64_t operand);
+  void AddBignum(const Bignum& other);
+  // Precondition: this >= other.
+  void SubtractBignum(const Bignum& other);
+
+  void Square();
+  void ShiftLeft(const int shift_amount);
+  void MultiplyByUInt32(const uint32_t factor);
+  void MultiplyByUInt64(const uint64_t factor);
+  void MultiplyByPowerOfTen(const int exponent);
+  void Times10() { return MultiplyByUInt32(10); }
+  // Pseudocode:
+  //  int result = this / other;
+  //  this = this % other;
+  // In the worst case this function is in O(this/other).
+  uint16_t DivideModuloIntBignum(const Bignum& other);
+
+  bool ToHexString(char* buffer, const int buffer_size) const;
+
+  // Returns
+  //  -1 if a < b,
+  //   0 if a == b, and
+  //  +1 if a > b.
+  static int Compare(const Bignum& a, const Bignum& b);
+  static bool Equal(const Bignum& a, const Bignum& b) {
+    return Compare(a, b) == 0;
+  }
+  static bool LessEqual(const Bignum& a, const Bignum& b) {
+    return Compare(a, b) <= 0;
+  }
+  static bool Less(const Bignum& a, const Bignum& b) {
+    return Compare(a, b) < 0;
+  }
+  // Returns Compare(a + b, c);
+  static int PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c);
+  // Returns a + b == c
+  static bool PlusEqual(const Bignum& a, const Bignum& b, const Bignum& c) {
+    return PlusCompare(a, b, c) == 0;
+  }
+  // Returns a + b <= c
+  static bool PlusLessEqual(const Bignum& a, const Bignum& b, const Bignum& c) {
+    return PlusCompare(a, b, c) <= 0;
+  }
+  // Returns a + b < c
+  static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) {
+    return PlusCompare(a, b, c) < 0;
+  }
+ private:
+  typedef uint32_t Chunk;
+  typedef uint64_t DoubleChunk;
+
+  static const int kChunkSize = sizeof(Chunk) * 8;
+  static const int kDoubleChunkSize = sizeof(DoubleChunk) * 8;
+  // With bigit size of 28 we loose some bits, but a double still fits easily
+  // into two chunks, and more importantly we can use the Comba multiplication.
+  static const int kBigitSize = 28;
+  static const Chunk kBigitMask = (1 << kBigitSize) - 1;
+  // Every instance allocates kBigitLength chunks on the stack. Bignums cannot
+  // grow. There are no checks if the stack-allocated space is sufficient.
+  static const int kBigitCapacity = kMaxSignificantBits / kBigitSize;
+
+  static void EnsureCapacity(const int size) {
+    if (size > kBigitCapacity) {
+      DOUBLE_CONVERSION_UNREACHABLE();
+    }
+  }
+  void Align(const Bignum& other);
+  void Clamp();
+  bool IsClamped() const {
+    return used_bigits_ == 0 || RawBigit(used_bigits_ - 1) != 0;
+  }
+  void Zero() {
+    used_bigits_ = 0;
+    exponent_ = 0;
+  }
+  // Requires this to have enough capacity (no tests done).
+  // Updates used_bigits_ if necessary.
+  // shift_amount must be < kBigitSize.
+  void BigitsShiftLeft(const int shift_amount);
+  // BigitLength includes the "hidden" bigits encoded in the exponent.
+  int BigitLength() const { return used_bigits_ + exponent_; }
+  Chunk& RawBigit(const int index);
+  const Chunk& RawBigit(const int index) const;
+  Chunk BigitOrZero(const int index) const;
+  void SubtractTimes(const Bignum& other, const int factor);
+
+  // The Bignum's value is value(bigits_buffer_) * 2^(exponent_ * kBigitSize),
+  // where the value of the buffer consists of the lower kBigitSize bits of
+  // the first used_bigits_ Chunks in bigits_buffer_, first chunk has lowest
+  // significant bits.
+  int16_t used_bigits_;
+  int16_t exponent_;
+  Chunk bigits_buffer_[kBigitCapacity];
+
+  DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(Bignum);
+};
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_BIGNUM_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/cached-powers.h b/pyarrow/include/arrow/vendored/double-conversion/cached-powers.h
new file mode 100644
index 0000000000000000000000000000000000000000..68fd82d8059957a5af0099382b10e0ac8a9bac58
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/cached-powers.h
@@ -0,0 +1,66 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_CACHED_POWERS_H_
+#define DOUBLE_CONVERSION_CACHED_POWERS_H_
+
+#include "diy-fp.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+namespace PowersOfTenCache {
+
+  // Not all powers of ten are cached. The decimal exponent of two neighboring
+  // cached numbers will differ by kDecimalExponentDistance.
+  static const int kDecimalExponentDistance = 8;
+
+  static const int kMinDecimalExponent = -348;
+  static const int kMaxDecimalExponent = 340;
+
+  // Returns a cached power-of-ten with a binary exponent in the range
+  // [min_exponent; max_exponent] (boundaries included).
+  void GetCachedPowerForBinaryExponentRange(int min_exponent,
+                                            int max_exponent,
+                                            DiyFp* power,
+                                            int* decimal_exponent);
+
+  // Returns a cached power of ten x ~= 10^k such that
+  //   k <= decimal_exponent < k + kCachedPowersDecimalDistance.
+  // The given decimal_exponent must satisfy
+  //   kMinDecimalExponent <= requested_exponent, and
+  //   requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
+  void GetCachedPowerForDecimalExponent(int requested_exponent,
+                                        DiyFp* power,
+                                        int* found_exponent);
+
+}  // namespace PowersOfTenCache
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_CACHED_POWERS_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/diy-fp.h b/pyarrow/include/arrow/vendored/double-conversion/diy-fp.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3367b9392a32cd41d3204009120c7654be866de
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/diy-fp.h
@@ -0,0 +1,139 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_DIY_FP_H_
+#define DOUBLE_CONVERSION_DIY_FP_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+// This "Do It Yourself Floating Point" class implements a floating-point number
+// with a uint64 significand and an int exponent. Normalized DiyFp numbers will
+// have the most significant bit of the significand set.
+// Multiplication and Subtraction do not normalize their results.
+// DiyFp store only non-negative numbers and are not designed to contain special
+// doubles (NaN and Infinity).
+class DiyFp {
+ public:
+  static const int kSignificandSize = 64;
+
+  DiyFp() : f_(0), e_(0) {}
+  DiyFp(const uint64_t significand, const int32_t exponent) : f_(significand), e_(exponent) {}
+
+  // this -= other.
+  // The exponents of both numbers must be the same and the significand of this
+  // must be greater or equal than the significand of other.
+  // The result will not be normalized.
+  void Subtract(const DiyFp& other) {
+    DOUBLE_CONVERSION_ASSERT(e_ == other.e_);
+    DOUBLE_CONVERSION_ASSERT(f_ >= other.f_);
+    f_ -= other.f_;
+  }
+
+  // Returns a - b.
+  // The exponents of both numbers must be the same and a must be greater
+  // or equal than b. The result will not be normalized.
+  static DiyFp Minus(const DiyFp& a, const DiyFp& b) {
+    DiyFp result = a;
+    result.Subtract(b);
+    return result;
+  }
+
+  // this *= other.
+  void Multiply(const DiyFp& other) {
+    // Simply "emulates" a 128 bit multiplication.
+    // However: the resulting number only contains 64 bits. The least
+    // significant 64 bits are only used for rounding the most significant 64
+    // bits.
+    const uint64_t kM32 = 0xFFFFFFFFU;
+    const uint64_t a = f_ >> 32;
+    const uint64_t b = f_ & kM32;
+    const uint64_t c = other.f_ >> 32;
+    const uint64_t d = other.f_ & kM32;
+    const uint64_t ac = a * c;
+    const uint64_t bc = b * c;
+    const uint64_t ad = a * d;
+    const uint64_t bd = b * d;
+    // By adding 1U << 31 to tmp we round the final result.
+    // Halfway cases will be rounded up.
+    const uint64_t tmp = (bd >> 32) + (ad & kM32) + (bc & kM32) + (1U << 31);
+    e_ += other.e_ + 64;
+    f_ = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32);
+  }
+
+  // returns a * b;
+  static DiyFp Times(const DiyFp& a, const DiyFp& b) {
+    DiyFp result = a;
+    result.Multiply(b);
+    return result;
+  }
+
+  void Normalize() {
+    DOUBLE_CONVERSION_ASSERT(f_ != 0);
+    uint64_t significand = f_;
+    int32_t exponent = e_;
+
+    // This method is mainly called for normalizing boundaries. In general,
+    // boundaries need to be shifted by 10 bits, and we optimize for this case.
+    const uint64_t k10MSBits = DOUBLE_CONVERSION_UINT64_2PART_C(0xFFC00000, 00000000);
+    while ((significand & k10MSBits) == 0) {
+      significand <<= 10;
+      exponent -= 10;
+    }
+    while ((significand & kUint64MSB) == 0) {
+      significand <<= 1;
+      exponent--;
+    }
+    f_ = significand;
+    e_ = exponent;
+  }
+
+  static DiyFp Normalize(const DiyFp& a) {
+    DiyFp result = a;
+    result.Normalize();
+    return result;
+  }
+
+  uint64_t f() const { return f_; }
+  int32_t e() const { return e_; }
+
+  void set_f(uint64_t new_value) { f_ = new_value; }
+  void set_e(int32_t new_value) { e_ = new_value; }
+
+ private:
+  static const uint64_t kUint64MSB = DOUBLE_CONVERSION_UINT64_2PART_C(0x80000000, 00000000);
+
+  uint64_t f_;
+  int32_t e_;
+};
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_DIY_FP_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/double-conversion.h b/pyarrow/include/arrow/vendored/double-conversion/double-conversion.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e8884d84ca56dbfd05964e463dd7999364b3b35
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/double-conversion.h
@@ -0,0 +1,34 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_
+#define DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_
+
+#include "string-to-double.h"
+#include "double-to-string.h"
+
+#endif  // DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/double-to-string.h b/pyarrow/include/arrow/vendored/double-conversion/double-to-string.h
new file mode 100644
index 0000000000000000000000000000000000000000..90a88b902d6ea12d3adf917cdbf9e63b818d71ee
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/double-to-string.h
@@ -0,0 +1,472 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_DOUBLE_TO_STRING_H_
+#define DOUBLE_CONVERSION_DOUBLE_TO_STRING_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+class DoubleToStringConverter {
+ public:
+  // When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint
+  // or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the
+  // function returns false.
+  static const int kMaxFixedDigitsBeforePoint = 60;
+  static const int kMaxFixedDigitsAfterPoint = 100;
+
+  // When calling ToExponential with a requested_digits
+  // parameter > kMaxExponentialDigits then the function returns false.
+  static const int kMaxExponentialDigits = 120;
+
+  // When calling ToPrecision with a requested_digits
+  // parameter < kMinPrecisionDigits or requested_digits > kMaxPrecisionDigits
+  // then the function returns false.
+  static const int kMinPrecisionDigits = 1;
+  static const int kMaxPrecisionDigits = 120;
+
+  // The maximal number of digits that are needed to emit a double in base 10.
+  // A higher precision can be achieved by using more digits, but the shortest
+  // accurate representation of any double will never use more digits than
+  // kBase10MaximalLength.
+  // Note that DoubleToAscii null-terminates its input. So the given buffer
+  // should be at least kBase10MaximalLength + 1 characters long.
+  static const int kBase10MaximalLength = 17;
+
+  // The maximal number of digits that are needed to emit a single in base 10.
+  // A higher precision can be achieved by using more digits, but the shortest
+  // accurate representation of any single will never use more digits than
+  // kBase10MaximalLengthSingle.
+  static const int kBase10MaximalLengthSingle = 9;
+
+  // The length of the longest string that 'ToShortest' can produce when the
+  // converter is instantiated with EcmaScript defaults (see
+  // 'EcmaScriptConverter')
+  // This value does not include the trailing '\0' character.
+  // This amount of characters is needed for negative values that hit the
+  // 'decimal_in_shortest_low' limit. For example: "-0.0000033333333333333333"
+  static const int kMaxCharsEcmaScriptShortest = 25;
+
+  enum Flags {
+    NO_FLAGS = 0,
+    EMIT_POSITIVE_EXPONENT_SIGN = 1,
+    EMIT_TRAILING_DECIMAL_POINT = 2,
+    EMIT_TRAILING_ZERO_AFTER_POINT = 4,
+    UNIQUE_ZERO = 8,
+    NO_TRAILING_ZERO = 16,
+    EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL = 32,
+    EMIT_TRAILING_ZERO_AFTER_POINT_IN_EXPONENTIAL = 64
+  };
+
+  // Flags should be a bit-or combination of the possible Flags-enum.
+  //  - NO_FLAGS: no special flags.
+  //  - EMIT_POSITIVE_EXPONENT_SIGN: when the number is converted into exponent
+  //    form, emits a '+' for positive exponents. Example: 1.2e+2.
+  //  - EMIT_TRAILING_DECIMAL_POINT: when the input number is an integer and is
+  //    converted into decimal format then a trailing decimal point is appended.
+  //    Example: 2345.0 is converted to "2345.".
+  //  - EMIT_TRAILING_ZERO_AFTER_POINT: in addition to a trailing decimal point
+  //    emits a trailing '0'-character. This flag requires the
+  //    EMIT_TRAILING_DECIMAL_POINT flag.
+  //    Example: 2345.0 is converted to "2345.0".
+  //  - UNIQUE_ZERO: "-0.0" is converted to "0.0".
+  //  - NO_TRAILING_ZERO: Trailing zeros are removed from the fractional portion
+  //    of the result in precision mode. Matches printf's %g.
+  //    When EMIT_TRAILING_ZERO_AFTER_POINT is also given, one trailing zero is
+  //    preserved.
+  //  - EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL: when the input number has
+  //    exactly one significant digit and is converted into exponent form then a
+  //    trailing decimal point is appended to the significand in shortest mode
+  //    or in precision mode with one requested digit.
+  //  - EMIT_TRAILING_ZERO_AFTER_POINT_IN_EXPONENTIAL: in addition to a trailing
+  //    decimal point emits a trailing '0'-character. This flag requires the
+  //    EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL flag.
+  //
+  // Infinity symbol and nan_symbol provide the string representation for these
+  // special values. If the string is NULL and the special value is encountered
+  // then the conversion functions return false.
+  //
+  // The exponent_character is used in exponential representations. It is
+  // usually 'e' or 'E'.
+  //
+  // When converting to the shortest representation the converter will
+  // represent input numbers in decimal format if they are in the interval
+  // [10^decimal_in_shortest_low; 10^decimal_in_shortest_high[
+  //    (lower boundary included, greater boundary excluded).
+  // Example: with decimal_in_shortest_low = -6 and
+  //               decimal_in_shortest_high = 21:
+  //   ToShortest(0.000001)  -> "0.000001"
+  //   ToShortest(0.0000001) -> "1e-7"
+  //   ToShortest(111111111111111111111.0)  -> "111111111111111110000"
+  //   ToShortest(100000000000000000000.0)  -> "100000000000000000000"
+  //   ToShortest(1111111111111111111111.0) -> "1.1111111111111111e+21"
+  //
+  // When converting to precision mode the converter may add
+  // max_leading_padding_zeroes before returning the number in exponential
+  // format.
+  // Example with max_leading_padding_zeroes_in_precision_mode = 6.
+  //   ToPrecision(0.0000012345, 2) -> "0.0000012"
+  //   ToPrecision(0.00000012345, 2) -> "1.2e-7"
+  // Similarly the converter may add up to
+  // max_trailing_padding_zeroes_in_precision_mode in precision mode to avoid
+  // returning an exponential representation. A zero added by the
+  // EMIT_TRAILING_ZERO_AFTER_POINT flag is counted for this limit.
+  // Examples for max_trailing_padding_zeroes_in_precision_mode = 1:
+  //   ToPrecision(230.0, 2) -> "230"
+  //   ToPrecision(230.0, 2) -> "230."  with EMIT_TRAILING_DECIMAL_POINT.
+  //   ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT.
+  //
+  // When converting numbers with exactly one significant digit to exponent
+  // form in shortest mode or in precision mode with one requested digit, the
+  // EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT flags have
+  // no effect. Use the EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL flag to
+  // append a decimal point in this case and the
+  // EMIT_TRAILING_ZERO_AFTER_POINT_IN_EXPONENTIAL flag to also append a
+  // '0'-character in this case.
+  // Example with decimal_in_shortest_low = 0:
+  //   ToShortest(0.0009) -> "9e-4"
+  //     with EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL deactivated.
+  //   ToShortest(0.0009) -> "9.e-4"
+  //     with EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL activated.
+  //   ToShortest(0.0009) -> "9.0e-4"
+  //     with EMIT_TRAILING_DECIMAL_POINT_IN_EXPONENTIAL activated and
+  //     EMIT_TRAILING_ZERO_AFTER_POINT_IN_EXPONENTIAL activated.
+  //
+  // The min_exponent_width is used for exponential representations.
+  // The converter adds leading '0's to the exponent until the exponent
+  // is at least min_exponent_width digits long.
+  // The min_exponent_width is clamped to 5.
+  // As such, the exponent may never have more than 5 digits in total.
+  DoubleToStringConverter(int flags,
+                          const char* infinity_symbol,
+                          const char* nan_symbol,
+                          char exponent_character,
+                          int decimal_in_shortest_low,
+                          int decimal_in_shortest_high,
+                          int max_leading_padding_zeroes_in_precision_mode,
+                          int max_trailing_padding_zeroes_in_precision_mode,
+                          int min_exponent_width = 0)
+      : flags_(flags),
+        infinity_symbol_(infinity_symbol),
+        nan_symbol_(nan_symbol),
+        exponent_character_(exponent_character),
+        decimal_in_shortest_low_(decimal_in_shortest_low),
+        decimal_in_shortest_high_(decimal_in_shortest_high),
+        max_leading_padding_zeroes_in_precision_mode_(
+            max_leading_padding_zeroes_in_precision_mode),
+        max_trailing_padding_zeroes_in_precision_mode_(
+            max_trailing_padding_zeroes_in_precision_mode),
+        min_exponent_width_(min_exponent_width) {
+    // When 'trailing zero after the point' is set, then 'trailing point'
+    // must be set too.
+    DOUBLE_CONVERSION_ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) ||
+        !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
+  }
+
+  // Returns a converter following the EcmaScript specification.
+  //
+  // Flags: UNIQUE_ZERO and EMIT_POSITIVE_EXPONENT_SIGN.
+  // Special values: "Infinity" and "NaN".
+  // Lower case 'e' for exponential values.
+  // decimal_in_shortest_low: -6
+  // decimal_in_shortest_high: 21
+  // max_leading_padding_zeroes_in_precision_mode: 6
+  // max_trailing_padding_zeroes_in_precision_mode: 0
+  static const DoubleToStringConverter& EcmaScriptConverter();
+
+  // Computes the shortest string of digits that correctly represent the input
+  // number. Depending on decimal_in_shortest_low and decimal_in_shortest_high
+  // (see constructor) it then either returns a decimal representation, or an
+  // exponential representation.
+  // Example with decimal_in_shortest_low = -6,
+  //              decimal_in_shortest_high = 21,
+  //              EMIT_POSITIVE_EXPONENT_SIGN activated, and
+  //              EMIT_TRAILING_DECIMAL_POINT deactivated:
+  //   ToShortest(0.000001)  -> "0.000001"
+  //   ToShortest(0.0000001) -> "1e-7"
+  //   ToShortest(111111111111111111111.0)  -> "111111111111111110000"
+  //   ToShortest(100000000000000000000.0)  -> "100000000000000000000"
+  //   ToShortest(1111111111111111111111.0) -> "1.1111111111111111e+21"
+  //
+  // Note: the conversion may round the output if the returned string
+  // is accurate enough to uniquely identify the input-number.
+  // For example the most precise representation of the double 9e59 equals
+  // "899999999999999918767229449717619953810131273674690656206848", but
+  // the converter will return the shorter (but still correct) "9e59".
+  //
+  // Returns true if the conversion succeeds. The conversion always succeeds
+  // except when the input value is special and no infinity_symbol or
+  // nan_symbol has been given to the constructor.
+  //
+  // The length of the longest result is the maximum of the length of the
+  // following string representations (each with possible examples):
+  // - NaN and negative infinity: "NaN", "-Infinity", "-inf".
+  // - -10^(decimal_in_shortest_high - 1):
+  //      "-100000000000000000000", "-1000000000000000.0"
+  // - the longest string in range [0; -10^decimal_in_shortest_low]. Generally,
+  //   this string is 3 + kBase10MaximalLength - decimal_in_shortest_low.
+  //   (Sign, '0', decimal point, padding zeroes for decimal_in_shortest_low,
+  //   and the significant digits).
+  //      "-0.0000033333333333333333", "-0.0012345678901234567"
+  // - the longest exponential representation. (A negative number with
+  //   kBase10MaximalLength significant digits).
+  //      "-1.7976931348623157e+308", "-1.7976931348623157E308"
+  // In addition, the buffer must be able to hold the trailing '\0' character.
+  bool ToShortest(double value, StringBuilder* result_builder) const {
+    return ToShortestIeeeNumber(value, result_builder, SHORTEST);
+  }
+
+  // Same as ToShortest, but for single-precision floats.
+  bool ToShortestSingle(float value, StringBuilder* result_builder) const {
+    return ToShortestIeeeNumber(value, result_builder, SHORTEST_SINGLE);
+  }
+
+
+  // Computes a decimal representation with a fixed number of digits after the
+  // decimal point. The last emitted digit is rounded.
+  //
+  // Examples:
+  //   ToFixed(3.12, 1) -> "3.1"
+  //   ToFixed(3.1415, 3) -> "3.142"
+  //   ToFixed(1234.56789, 4) -> "1234.5679"
+  //   ToFixed(1.23, 5) -> "1.23000"
+  //   ToFixed(0.1, 4) -> "0.1000"
+  //   ToFixed(1e30, 2) -> "1000000000000000019884624838656.00"
+  //   ToFixed(0.1, 30) -> "0.100000000000000005551115123126"
+  //   ToFixed(0.1, 17) -> "0.10000000000000001"
+  //
+  // If requested_digits equals 0, then the tail of the result depends on
+  // the EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT.
+  // Examples, for requested_digits == 0,
+  //   let EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT be
+  //    - false and false: then 123.45 -> 123
+  //                             0.678 -> 1
+  //    - true and false: then 123.45 -> 123.
+  //                            0.678 -> 1.
+  //    - true and true: then 123.45 -> 123.0
+  //                           0.678 -> 1.0
+  //
+  // Returns true if the conversion succeeds. The conversion always succeeds
+  // except for the following cases:
+  //   - the input value is special and no infinity_symbol or nan_symbol has
+  //     been provided to the constructor,
+  //   - 'value' > 10^kMaxFixedDigitsBeforePoint, or
+  //   - 'requested_digits' > kMaxFixedDigitsAfterPoint.
+  // The last two conditions imply that the result for non-special values never
+  // contains more than
+  //  1 + kMaxFixedDigitsBeforePoint + 1 + kMaxFixedDigitsAfterPoint characters
+  // (one additional character for the sign, and one for the decimal point).
+  // In addition, the buffer must be able to hold the trailing '\0' character.
+  bool ToFixed(double value,
+               int requested_digits,
+               StringBuilder* result_builder) const;
+
+  // Computes a representation in exponential format with requested_digits
+  // after the decimal point. The last emitted digit is rounded.
+  // If requested_digits equals -1, then the shortest exponential representation
+  // is computed.
+  //
+  // Examples with EMIT_POSITIVE_EXPONENT_SIGN deactivated, and
+  //               exponent_character set to 'e'.
+  //   ToExponential(3.12, 1) -> "3.1e0"
+  //   ToExponential(5.0, 3) -> "5.000e0"
+  //   ToExponential(0.001, 2) -> "1.00e-3"
+  //   ToExponential(3.1415, -1) -> "3.1415e0"
+  //   ToExponential(3.1415, 4) -> "3.1415e0"
+  //   ToExponential(3.1415, 3) -> "3.142e0"
+  //   ToExponential(123456789000000, 3) -> "1.235e14"
+  //   ToExponential(1000000000000000019884624838656.0, -1) -> "1e30"
+  //   ToExponential(1000000000000000019884624838656.0, 32) ->
+  //                     "1.00000000000000001988462483865600e30"
+  //   ToExponential(1234, 0) -> "1e3"
+  //
+  // Returns true if the conversion succeeds. The conversion always succeeds
+  // except for the following cases:
+  //   - the input value is special and no infinity_symbol or nan_symbol has
+  //     been provided to the constructor,
+  //   - 'requested_digits' > kMaxExponentialDigits.
+  //
+  // The last condition implies that the result never contains more than
+  // kMaxExponentialDigits + 8 characters (the sign, the digit before the
+  // decimal point, the decimal point, the exponent character, the
+  // exponent's sign, and at most 3 exponent digits).
+  // In addition, the buffer must be able to hold the trailing '\0' character.
+  bool ToExponential(double value,
+                     int requested_digits,
+                     StringBuilder* result_builder) const;
+
+
+  // Computes 'precision' leading digits of the given 'value' and returns them
+  // either in exponential or decimal format, depending on
+  // max_{leading|trailing}_padding_zeroes_in_precision_mode (given to the
+  // constructor).
+  // The last computed digit is rounded.
+  //
+  // Example with max_leading_padding_zeroes_in_precision_mode = 6.
+  //   ToPrecision(0.0000012345, 2) -> "0.0000012"
+  //   ToPrecision(0.00000012345, 2) -> "1.2e-7"
+  // Similarly the converter may add up to
+  // max_trailing_padding_zeroes_in_precision_mode in precision mode to avoid
+  // returning an exponential representation. A zero added by the
+  // EMIT_TRAILING_ZERO_AFTER_POINT flag is counted for this limit.
+  // Examples for max_trailing_padding_zeroes_in_precision_mode = 1:
+  //   ToPrecision(230.0, 2) -> "230"
+  //   ToPrecision(230.0, 2) -> "230."  with EMIT_TRAILING_DECIMAL_POINT.
+  //   ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT.
+  // Examples for max_trailing_padding_zeroes_in_precision_mode = 3, and no
+  //    EMIT_TRAILING_ZERO_AFTER_POINT:
+  //   ToPrecision(123450.0, 6) -> "123450"
+  //   ToPrecision(123450.0, 5) -> "123450"
+  //   ToPrecision(123450.0, 4) -> "123500"
+  //   ToPrecision(123450.0, 3) -> "123000"
+  //   ToPrecision(123450.0, 2) -> "1.2e5"
+  //
+  // Returns true if the conversion succeeds. The conversion always succeeds
+  // except for the following cases:
+  //   - the input value is special and no infinity_symbol or nan_symbol has
+  //     been provided to the constructor,
+  //   - precision < kMinPericisionDigits
+  //   - precision > kMaxPrecisionDigits
+  //
+  // The last condition implies that the result never contains more than
+  // kMaxPrecisionDigits + 7 characters (the sign, the decimal point, the
+  // exponent character, the exponent's sign, and at most 3 exponent digits).
+  // In addition, the buffer must be able to hold the trailing '\0' character.
+  bool ToPrecision(double value,
+                   int precision,
+                   StringBuilder* result_builder) const;
+
+  enum DtoaMode {
+    // Produce the shortest correct representation.
+    // For example the output of 0.299999999999999988897 is (the less accurate
+    // but correct) 0.3.
+    SHORTEST,
+    // Same as SHORTEST, but for single-precision floats.
+    SHORTEST_SINGLE,
+    // Produce a fixed number of digits after the decimal point.
+    // For instance fixed(0.1, 4) becomes 0.1000
+    // If the input number is big, the output will be big.
+    FIXED,
+    // Fixed number of digits (independent of the decimal point).
+    PRECISION
+  };
+
+  // Converts the given double 'v' to digit characters. 'v' must not be NaN,
+  // +Infinity, or -Infinity. In SHORTEST_SINGLE-mode this restriction also
+  // applies to 'v' after it has been casted to a single-precision float. That
+  // is, in this mode static_cast<float>(v) must not be NaN, +Infinity or
+  // -Infinity.
+  //
+  // The result should be interpreted as buffer * 10^(point-length).
+  //
+  // The digits are written to the buffer in the platform's charset, which is
+  // often UTF-8 (with ASCII-range digits) but may be another charset, such
+  // as EBCDIC.
+  //
+  // The output depends on the given mode:
+  //  - SHORTEST: produce the least amount of digits for which the internal
+  //   identity requirement is still satisfied. If the digits are printed
+  //   (together with the correct exponent) then reading this number will give
+  //   'v' again. The buffer will choose the representation that is closest to
+  //   'v'. If there are two at the same distance, than the one farther away
+  //   from 0 is chosen (halfway cases - ending with 5 - are rounded up).
+  //   In this mode the 'requested_digits' parameter is ignored.
+  //  - SHORTEST_SINGLE: same as SHORTEST but with single-precision.
+  //  - FIXED: produces digits necessary to print a given number with
+  //   'requested_digits' digits after the decimal point. The produced digits
+  //   might be too short in which case the caller has to fill the remainder
+  //   with '0's.
+  //   Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2.
+  //   Halfway cases are rounded towards +/-Infinity (away from 0). The call
+  //   toFixed(0.15, 2) thus returns buffer="2", point=0.
+  //   The returned buffer may contain digits that would be truncated from the
+  //   shortest representation of the input.
+  //  - PRECISION: produces 'requested_digits' where the first digit is not '0'.
+  //   Even though the length of produced digits usually equals
+  //   'requested_digits', the function is allowed to return fewer digits, in
+  //   which case the caller has to fill the missing digits with '0's.
+  //   Halfway cases are again rounded away from 0.
+  // DoubleToAscii expects the given buffer to be big enough to hold all
+  // digits and a terminating null-character. In SHORTEST-mode it expects a
+  // buffer of at least kBase10MaximalLength + 1. In all other modes the
+  // requested_digits parameter and the padding-zeroes limit the size of the
+  // output. Don't forget the decimal point, the exponent character and the
+  // terminating null-character when computing the maximal output size.
+  // The given length is only used in debug mode to ensure the buffer is big
+  // enough.
+  static void DoubleToAscii(double v,
+                            DtoaMode mode,
+                            int requested_digits,
+                            char* buffer,
+                            int buffer_length,
+                            bool* sign,
+                            int* length,
+                            int* point);
+
+ private:
+  // Implementation for ToShortest and ToShortestSingle.
+  bool ToShortestIeeeNumber(double value,
+                            StringBuilder* result_builder,
+                            DtoaMode mode) const;
+
+  // If the value is a special value (NaN or Infinity) constructs the
+  // corresponding string using the configured infinity/nan-symbol.
+  // If either of them is NULL or the value is not special then the
+  // function returns false.
+  bool HandleSpecialValues(double value, StringBuilder* result_builder) const;
+  // Constructs an exponential representation (i.e. 1.234e56).
+  // The given exponent assumes a decimal point after the first decimal digit.
+  void CreateExponentialRepresentation(const char* decimal_digits,
+                                       int length,
+                                       int exponent,
+                                       StringBuilder* result_builder) const;
+  // Creates a decimal representation (i.e 1234.5678).
+  void CreateDecimalRepresentation(const char* decimal_digits,
+                                   int length,
+                                   int decimal_point,
+                                   int digits_after_point,
+                                   StringBuilder* result_builder) const;
+
+  const int flags_;
+  const char* const infinity_symbol_;
+  const char* const nan_symbol_;
+  const char exponent_character_;
+  const int decimal_in_shortest_low_;
+  const int decimal_in_shortest_high_;
+  const int max_leading_padding_zeroes_in_precision_mode_;
+  const int max_trailing_padding_zeroes_in_precision_mode_;
+  const int min_exponent_width_;
+
+  DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS(DoubleToStringConverter);
+};
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_DOUBLE_TO_STRING_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/fast-dtoa.h b/pyarrow/include/arrow/vendored/double-conversion/fast-dtoa.h
new file mode 100644
index 0000000000000000000000000000000000000000..ddd0f04dcf02c8222b24e8e5c55654e80d127684
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/fast-dtoa.h
@@ -0,0 +1,90 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_FAST_DTOA_H_
+#define DOUBLE_CONVERSION_FAST_DTOA_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+enum FastDtoaMode {
+  // Computes the shortest representation of the given input. The returned
+  // result will be the most accurate number of this length. Longer
+  // representations might be more accurate.
+  FAST_DTOA_SHORTEST,
+  // Same as FAST_DTOA_SHORTEST but for single-precision floats.
+  FAST_DTOA_SHORTEST_SINGLE,
+  // Computes a representation where the precision (number of digits) is
+  // given as input. The precision is independent of the decimal point.
+  FAST_DTOA_PRECISION
+};
+
+// FastDtoa will produce at most kFastDtoaMaximalLength digits. This does not
+// include the terminating '\0' character.
+static const int kFastDtoaMaximalLength = 17;
+// Same for single-precision numbers.
+static const int kFastDtoaMaximalSingleLength = 9;
+
+// Provides a decimal representation of v.
+// The result should be interpreted as buffer * 10^(point - length).
+//
+// Precondition:
+//   * v must be a strictly positive finite double.
+//
+// Returns true if it succeeds, otherwise the result can not be trusted.
+// There will be *length digits inside the buffer followed by a null terminator.
+// If the function returns true and mode equals
+//   - FAST_DTOA_SHORTEST, then
+//     the parameter requested_digits is ignored.
+//     The result satisfies
+//         v == (double) (buffer * 10^(point - length)).
+//     The digits in the buffer are the shortest representation possible. E.g.
+//     if 0.099999999999 and 0.1 represent the same double then "1" is returned
+//     with point = 0.
+//     The last digit will be closest to the actual v. That is, even if several
+//     digits might correctly yield 'v' when read again, the buffer will contain
+//     the one closest to v.
+//   - FAST_DTOA_PRECISION, then
+//     the buffer contains requested_digits digits.
+//     the difference v - (buffer * 10^(point-length)) is closest to zero for
+//     all possible representations of requested_digits digits.
+//     If there are two values that are equally close, then FastDtoa returns
+//     false.
+// For both modes the buffer must be large enough to hold the result.
+bool FastDtoa(double d,
+              FastDtoaMode mode,
+              int requested_digits,
+              Vector<char> buffer,
+              int* length,
+              int* decimal_point);
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_FAST_DTOA_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/fixed-dtoa.h b/pyarrow/include/arrow/vendored/double-conversion/fixed-dtoa.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf2a59a9805193063997ae621e6e3367d3cc029d
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/fixed-dtoa.h
@@ -0,0 +1,58 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_FIXED_DTOA_H_
+#define DOUBLE_CONVERSION_FIXED_DTOA_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+// Produces digits necessary to print a given number with
+// 'fractional_count' digits after the decimal point.
+// The buffer must be big enough to hold the result plus one terminating null
+// character.
+//
+// The produced digits might be too short in which case the caller has to fill
+// the gaps with '0's.
+// Example: FastFixedDtoa(0.001, 5, ...) is allowed to return buffer = "1", and
+// decimal_point = -2.
+// Halfway cases are rounded towards +/-Infinity (away from 0). The call
+// FastFixedDtoa(0.15, 2, ...) thus returns buffer = "2", decimal_point = 0.
+// The returned buffer may contain digits that would be truncated from the
+// shortest representation of the input.
+//
+// This method only works for some parameters. If it can't handle the input it
+// returns false. The output is null-terminated when the function succeeds.
+bool FastFixedDtoa(double v, int fractional_count,
+                   Vector<char> buffer, int* length, int* decimal_point);
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_FIXED_DTOA_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/ieee.h b/pyarrow/include/arrow/vendored/double-conversion/ieee.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cedc0bee04e6470ce02d27b9390068e0ec0fcc1
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/ieee.h
@@ -0,0 +1,449 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_DOUBLE_H_
+#define DOUBLE_CONVERSION_DOUBLE_H_
+
+#include "diy-fp.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+// We assume that doubles and uint64_t have the same endianness.
+static uint64_t double_to_uint64(double d) { return BitCast<uint64_t>(d); }
+static double uint64_to_double(uint64_t d64) { return BitCast<double>(d64); }
+static uint32_t float_to_uint32(float f) { return BitCast<uint32_t>(f); }
+static float uint32_to_float(uint32_t d32) { return BitCast<float>(d32); }
+
+// Helper functions for doubles.
+class Double {
+ public:
+  static const uint64_t kSignMask = DOUBLE_CONVERSION_UINT64_2PART_C(0x80000000, 00000000);
+  static const uint64_t kExponentMask = DOUBLE_CONVERSION_UINT64_2PART_C(0x7FF00000, 00000000);
+  static const uint64_t kSignificandMask = DOUBLE_CONVERSION_UINT64_2PART_C(0x000FFFFF, FFFFFFFF);
+  static const uint64_t kHiddenBit = DOUBLE_CONVERSION_UINT64_2PART_C(0x00100000, 00000000);
+  static const uint64_t kQuietNanBit = DOUBLE_CONVERSION_UINT64_2PART_C(0x00080000, 00000000);
+  static const int kPhysicalSignificandSize = 52;  // Excludes the hidden bit.
+  static const int kSignificandSize = 53;
+  static const int kExponentBias = 0x3FF + kPhysicalSignificandSize;
+  static const int kMaxExponent = 0x7FF - kExponentBias;
+
+  Double() : d64_(0) {}
+  explicit Double(double d) : d64_(double_to_uint64(d)) {}
+  explicit Double(uint64_t d64) : d64_(d64) {}
+  explicit Double(DiyFp diy_fp)
+    : d64_(DiyFpToUint64(diy_fp)) {}
+
+  // The value encoded by this Double must be greater or equal to +0.0.
+  // It must not be special (infinity, or NaN).
+  DiyFp AsDiyFp() const {
+    DOUBLE_CONVERSION_ASSERT(Sign() > 0);
+    DOUBLE_CONVERSION_ASSERT(!IsSpecial());
+    return DiyFp(Significand(), Exponent());
+  }
+
+  // The value encoded by this Double must be strictly greater than 0.
+  DiyFp AsNormalizedDiyFp() const {
+    DOUBLE_CONVERSION_ASSERT(value() > 0.0);
+    uint64_t f = Significand();
+    int e = Exponent();
+
+    // The current double could be a denormal.
+    while ((f & kHiddenBit) == 0) {
+      f <<= 1;
+      e--;
+    }
+    // Do the final shifts in one go.
+    f <<= DiyFp::kSignificandSize - kSignificandSize;
+    e -= DiyFp::kSignificandSize - kSignificandSize;
+    return DiyFp(f, e);
+  }
+
+  // Returns the double's bit as uint64.
+  uint64_t AsUint64() const {
+    return d64_;
+  }
+
+  // Returns the next greater double. Returns +infinity on input +infinity.
+  double NextDouble() const {
+    if (d64_ == kInfinity) return Double(kInfinity).value();
+    if (Sign() < 0 && Significand() == 0) {
+      // -0.0
+      return 0.0;
+    }
+    if (Sign() < 0) {
+      return Double(d64_ - 1).value();
+    } else {
+      return Double(d64_ + 1).value();
+    }
+  }
+
+  double PreviousDouble() const {
+    if (d64_ == (kInfinity | kSignMask)) return -Infinity();
+    if (Sign() < 0) {
+      return Double(d64_ + 1).value();
+    } else {
+      if (Significand() == 0) return -0.0;
+      return Double(d64_ - 1).value();
+    }
+  }
+
+  int Exponent() const {
+    if (IsDenormal()) return kDenormalExponent;
+
+    uint64_t d64 = AsUint64();
+    int biased_e =
+        static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
+    return biased_e - kExponentBias;
+  }
+
+  uint64_t Significand() const {
+    uint64_t d64 = AsUint64();
+    uint64_t significand = d64 & kSignificandMask;
+    if (!IsDenormal()) {
+      return significand + kHiddenBit;
+    } else {
+      return significand;
+    }
+  }
+
+  // Returns true if the double is a denormal.
+  bool IsDenormal() const {
+    uint64_t d64 = AsUint64();
+    return (d64 & kExponentMask) == 0;
+  }
+
+  // We consider denormals not to be special.
+  // Hence only Infinity and NaN are special.
+  bool IsSpecial() const {
+    uint64_t d64 = AsUint64();
+    return (d64 & kExponentMask) == kExponentMask;
+  }
+
+  bool IsNan() const {
+    uint64_t d64 = AsUint64();
+    return ((d64 & kExponentMask) == kExponentMask) &&
+        ((d64 & kSignificandMask) != 0);
+  }
+
+  bool IsQuietNan() const {
+#if (defined(__mips__) && !defined(__mips_nan2008)) || defined(__hppa__)
+    return IsNan() && ((AsUint64() & kQuietNanBit) == 0);
+#else
+    return IsNan() && ((AsUint64() & kQuietNanBit) != 0);
+#endif
+  }
+
+  bool IsSignalingNan() const {
+#if (defined(__mips__) && !defined(__mips_nan2008)) || defined(__hppa__)
+    return IsNan() && ((AsUint64() & kQuietNanBit) != 0);
+#else
+    return IsNan() && ((AsUint64() & kQuietNanBit) == 0);
+#endif
+  }
+
+
+  bool IsInfinite() const {
+    uint64_t d64 = AsUint64();
+    return ((d64 & kExponentMask) == kExponentMask) &&
+        ((d64 & kSignificandMask) == 0);
+  }
+
+  int Sign() const {
+    uint64_t d64 = AsUint64();
+    return (d64 & kSignMask) == 0? 1: -1;
+  }
+
+  // Precondition: the value encoded by this Double must be greater or equal
+  // than +0.0.
+  DiyFp UpperBoundary() const {
+    DOUBLE_CONVERSION_ASSERT(Sign() > 0);
+    return DiyFp(Significand() * 2 + 1, Exponent() - 1);
+  }
+
+  // Computes the two boundaries of this.
+  // The bigger boundary (m_plus) is normalized. The lower boundary has the same
+  // exponent as m_plus.
+  // Precondition: the value encoded by this Double must be greater than 0.
+  void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const {
+    DOUBLE_CONVERSION_ASSERT(value() > 0.0);
+    DiyFp v = this->AsDiyFp();
+    DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1));
+    DiyFp m_minus;
+    if (LowerBoundaryIsCloser()) {
+      m_minus = DiyFp((v.f() << 2) - 1, v.e() - 2);
+    } else {
+      m_minus = DiyFp((v.f() << 1) - 1, v.e() - 1);
+    }
+    m_minus.set_f(m_minus.f() << (m_minus.e() - m_plus.e()));
+    m_minus.set_e(m_plus.e());
+    *out_m_plus = m_plus;
+    *out_m_minus = m_minus;
+  }
+
+  bool LowerBoundaryIsCloser() const {
+    // The boundary is closer if the significand is of the form f == 2^p-1 then
+    // the lower boundary is closer.
+    // Think of v = 1000e10 and v- = 9999e9.
+    // Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but
+    // at a distance of 1e8.
+    // The only exception is for the smallest normal: the largest denormal is
+    // at the same distance as its successor.
+    // Note: denormals have the same exponent as the smallest normals.
+    bool physical_significand_is_zero = ((AsUint64() & kSignificandMask) == 0);
+    return physical_significand_is_zero && (Exponent() != kDenormalExponent);
+  }
+
+  double value() const { return uint64_to_double(d64_); }
+
+  // Returns the significand size for a given order of magnitude.
+  // If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude.
+  // This function returns the number of significant binary digits v will have
+  // once it's encoded into a double. In almost all cases this is equal to
+  // kSignificandSize. The only exceptions are denormals. They start with
+  // leading zeroes and their effective significand-size is hence smaller.
+  static int SignificandSizeForOrderOfMagnitude(int order) {
+    if (order >= (kDenormalExponent + kSignificandSize)) {
+      return kSignificandSize;
+    }
+    if (order <= kDenormalExponent) return 0;
+    return order - kDenormalExponent;
+  }
+
+  static double Infinity() {
+    return Double(kInfinity).value();
+  }
+
+  static double NaN() {
+    return Double(kNaN).value();
+  }
+
+ private:
+  static const int kDenormalExponent = -kExponentBias + 1;
+  static const uint64_t kInfinity = DOUBLE_CONVERSION_UINT64_2PART_C(0x7FF00000, 00000000);
+#if (defined(__mips__) && !defined(__mips_nan2008)) || defined(__hppa__)
+  static const uint64_t kNaN = DOUBLE_CONVERSION_UINT64_2PART_C(0x7FF7FFFF, FFFFFFFF);
+#else
+  static const uint64_t kNaN = DOUBLE_CONVERSION_UINT64_2PART_C(0x7FF80000, 00000000);
+#endif
+
+
+  const uint64_t d64_;
+
+  static uint64_t DiyFpToUint64(DiyFp diy_fp) {
+    uint64_t significand = diy_fp.f();
+    int exponent = diy_fp.e();
+    while (significand > kHiddenBit + kSignificandMask) {
+      significand >>= 1;
+      exponent++;
+    }
+    if (exponent >= kMaxExponent) {
+      return kInfinity;
+    }
+    if (exponent < kDenormalExponent) {
+      return 0;
+    }
+    while (exponent > kDenormalExponent && (significand & kHiddenBit) == 0) {
+      significand <<= 1;
+      exponent--;
+    }
+    uint64_t biased_exponent;
+    if (exponent == kDenormalExponent && (significand & kHiddenBit) == 0) {
+      biased_exponent = 0;
+    } else {
+      biased_exponent = static_cast<uint64_t>(exponent + kExponentBias);
+    }
+    return (significand & kSignificandMask) |
+        (biased_exponent << kPhysicalSignificandSize);
+  }
+
+  DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(Double);
+};
+
+class Single {
+ public:
+  static const uint32_t kSignMask = 0x80000000;
+  static const uint32_t kExponentMask = 0x7F800000;
+  static const uint32_t kSignificandMask = 0x007FFFFF;
+  static const uint32_t kHiddenBit = 0x00800000;
+  static const uint32_t kQuietNanBit = 0x00400000;
+  static const int kPhysicalSignificandSize = 23;  // Excludes the hidden bit.
+  static const int kSignificandSize = 24;
+
+  Single() : d32_(0) {}
+  explicit Single(float f) : d32_(float_to_uint32(f)) {}
+  explicit Single(uint32_t d32) : d32_(d32) {}
+
+  // The value encoded by this Single must be greater or equal to +0.0.
+  // It must not be special (infinity, or NaN).
+  DiyFp AsDiyFp() const {
+    DOUBLE_CONVERSION_ASSERT(Sign() > 0);
+    DOUBLE_CONVERSION_ASSERT(!IsSpecial());
+    return DiyFp(Significand(), Exponent());
+  }
+
+  // Returns the single's bit as uint64.
+  uint32_t AsUint32() const {
+    return d32_;
+  }
+
+  int Exponent() const {
+    if (IsDenormal()) return kDenormalExponent;
+
+    uint32_t d32 = AsUint32();
+    int biased_e =
+        static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
+    return biased_e - kExponentBias;
+  }
+
+  uint32_t Significand() const {
+    uint32_t d32 = AsUint32();
+    uint32_t significand = d32 & kSignificandMask;
+    if (!IsDenormal()) {
+      return significand + kHiddenBit;
+    } else {
+      return significand;
+    }
+  }
+
+  // Returns true if the single is a denormal.
+  bool IsDenormal() const {
+    uint32_t d32 = AsUint32();
+    return (d32 & kExponentMask) == 0;
+  }
+
+  // We consider denormals not to be special.
+  // Hence only Infinity and NaN are special.
+  bool IsSpecial() const {
+    uint32_t d32 = AsUint32();
+    return (d32 & kExponentMask) == kExponentMask;
+  }
+
+  bool IsNan() const {
+    uint32_t d32 = AsUint32();
+    return ((d32 & kExponentMask) == kExponentMask) &&
+        ((d32 & kSignificandMask) != 0);
+  }
+
+  bool IsQuietNan() const {
+#if (defined(__mips__) && !defined(__mips_nan2008)) || defined(__hppa__)
+    return IsNan() && ((AsUint32() & kQuietNanBit) == 0);
+#else
+    return IsNan() && ((AsUint32() & kQuietNanBit) != 0);
+#endif
+  }
+
+  bool IsSignalingNan() const {
+#if (defined(__mips__) && !defined(__mips_nan2008)) || defined(__hppa__)
+    return IsNan() && ((AsUint32() & kQuietNanBit) != 0);
+#else
+    return IsNan() && ((AsUint32() & kQuietNanBit) == 0);
+#endif
+  }
+
+
+  bool IsInfinite() const {
+    uint32_t d32 = AsUint32();
+    return ((d32 & kExponentMask) == kExponentMask) &&
+        ((d32 & kSignificandMask) == 0);
+  }
+
+  int Sign() const {
+    uint32_t d32 = AsUint32();
+    return (d32 & kSignMask) == 0? 1: -1;
+  }
+
+  // Computes the two boundaries of this.
+  // The bigger boundary (m_plus) is normalized. The lower boundary has the same
+  // exponent as m_plus.
+  // Precondition: the value encoded by this Single must be greater than 0.
+  void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const {
+    DOUBLE_CONVERSION_ASSERT(value() > 0.0);
+    DiyFp v = this->AsDiyFp();
+    DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1));
+    DiyFp m_minus;
+    if (LowerBoundaryIsCloser()) {
+      m_minus = DiyFp((v.f() << 2) - 1, v.e() - 2);
+    } else {
+      m_minus = DiyFp((v.f() << 1) - 1, v.e() - 1);
+    }
+    m_minus.set_f(m_minus.f() << (m_minus.e() - m_plus.e()));
+    m_minus.set_e(m_plus.e());
+    *out_m_plus = m_plus;
+    *out_m_minus = m_minus;
+  }
+
+  // Precondition: the value encoded by this Single must be greater or equal
+  // than +0.0.
+  DiyFp UpperBoundary() const {
+    DOUBLE_CONVERSION_ASSERT(Sign() > 0);
+    return DiyFp(Significand() * 2 + 1, Exponent() - 1);
+  }
+
+  bool LowerBoundaryIsCloser() const {
+    // The boundary is closer if the significand is of the form f == 2^p-1 then
+    // the lower boundary is closer.
+    // Think of v = 1000e10 and v- = 9999e9.
+    // Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but
+    // at a distance of 1e8.
+    // The only exception is for the smallest normal: the largest denormal is
+    // at the same distance as its successor.
+    // Note: denormals have the same exponent as the smallest normals.
+    bool physical_significand_is_zero = ((AsUint32() & kSignificandMask) == 0);
+    return physical_significand_is_zero && (Exponent() != kDenormalExponent);
+  }
+
+  float value() const { return uint32_to_float(d32_); }
+
+  static float Infinity() {
+    return Single(kInfinity).value();
+  }
+
+  static float NaN() {
+    return Single(kNaN).value();
+  }
+
+ private:
+  static const int kExponentBias = 0x7F + kPhysicalSignificandSize;
+  static const int kDenormalExponent = -kExponentBias + 1;
+  static const int kMaxExponent = 0xFF - kExponentBias;
+  static const uint32_t kInfinity = 0x7F800000;
+#if (defined(__mips__) && !defined(__mips_nan2008)) || defined(__hppa__)
+  static const uint32_t kNaN = 0x7FBFFFFF;
+#else
+  static const uint32_t kNaN = 0x7FC00000;
+#endif
+
+  const uint32_t d32_;
+
+  DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(Single);
+};
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_DOUBLE_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/string-to-double.h b/pyarrow/include/arrow/vendored/double-conversion/string-to-double.h
new file mode 100644
index 0000000000000000000000000000000000000000..83eb6fec5f44400cf9a81d45862c4bfd71ac52fa
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/string-to-double.h
@@ -0,0 +1,240 @@
+// Copyright 2012 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_STRING_TO_DOUBLE_H_
+#define DOUBLE_CONVERSION_STRING_TO_DOUBLE_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+class StringToDoubleConverter {
+ public:
+  // Enumeration for allowing octals and ignoring junk when converting
+  // strings to numbers.
+  enum Flags {
+    NO_FLAGS = 0,
+    ALLOW_HEX = 1,
+    ALLOW_OCTALS = 2,
+    ALLOW_TRAILING_JUNK = 4,
+    ALLOW_LEADING_SPACES = 8,
+    ALLOW_TRAILING_SPACES = 16,
+    ALLOW_SPACES_AFTER_SIGN = 32,
+    ALLOW_CASE_INSENSITIVITY = 64,
+    ALLOW_CASE_INSENSIBILITY = 64,  // Deprecated
+    ALLOW_HEX_FLOATS = 128,
+  };
+
+  static const uc16 kNoSeparator = '\0';
+
+  // Flags should be a bit-or combination of the possible Flags-enum.
+  //  - NO_FLAGS: no special flags.
+  //  - ALLOW_HEX: recognizes the prefix "0x". Hex numbers may only be integers.
+  //      Ex: StringToDouble("0x1234") -> 4660.0
+  //          In StringToDouble("0x1234.56") the characters ".56" are trailing
+  //          junk. The result of the call is hence dependent on
+  //          the ALLOW_TRAILING_JUNK flag and/or the junk value.
+  //      With this flag "0x" is a junk-string. Even with ALLOW_TRAILING_JUNK,
+  //      the string will not be parsed as "0" followed by junk.
+  //
+  //  - ALLOW_OCTALS: recognizes the prefix "0" for octals:
+  //      If a sequence of octal digits starts with '0', then the number is
+  //      read as octal integer. Octal numbers may only be integers.
+  //      Ex: StringToDouble("01234") -> 668.0
+  //          StringToDouble("012349") -> 12349.0  // Not a sequence of octal
+  //                                               // digits.
+  //          In StringToDouble("01234.56") the characters ".56" are trailing
+  //          junk. The result of the call is hence dependent on
+  //          the ALLOW_TRAILING_JUNK flag and/or the junk value.
+  //          In StringToDouble("01234e56") the characters "e56" are trailing
+  //          junk, too.
+  //  - ALLOW_TRAILING_JUNK: ignore trailing characters that are not part of
+  //      a double literal.
+  //  - ALLOW_LEADING_SPACES: skip over leading whitespace, including spaces,
+  //                          new-lines, and tabs.
+  //  - ALLOW_TRAILING_SPACES: ignore trailing whitespace.
+  //  - ALLOW_SPACES_AFTER_SIGN: ignore whitespace after the sign.
+  //       Ex: StringToDouble("-   123.2") -> -123.2.
+  //           StringToDouble("+   123.2") -> 123.2
+  //  - ALLOW_CASE_INSENSITIVITY: ignore case of characters for special values:
+  //      infinity and nan.
+  //  - ALLOW_HEX_FLOATS: allows hexadecimal float literals.
+  //      This *must* start with "0x" and separate the exponent with "p".
+  //      Examples: 0x1.2p3 == 9.0
+  //                0x10.1p0 == 16.0625
+  //      ALLOW_HEX and ALLOW_HEX_FLOATS are indented.
+  //
+  // empty_string_value is returned when an empty string is given as input.
+  // If ALLOW_LEADING_SPACES or ALLOW_TRAILING_SPACES are set, then a string
+  // containing only spaces is converted to the 'empty_string_value', too.
+  //
+  // junk_string_value is returned when
+  //  a) ALLOW_TRAILING_JUNK is not set, and a junk character (a character not
+  //     part of a double-literal) is found.
+  //  b) ALLOW_TRAILING_JUNK is set, but the string does not start with a
+  //     double literal.
+  //
+  // infinity_symbol and nan_symbol are strings that are used to detect
+  // inputs that represent infinity and NaN. They can be null, in which case
+  // they are ignored.
+  // The conversion routine first reads any possible signs. Then it compares the
+  // following character of the input-string with the first character of
+  // the infinity, and nan-symbol. If either matches, the function assumes, that
+  // a match has been found, and expects the following input characters to match
+  // the remaining characters of the special-value symbol.
+  // This means that the following restrictions apply to special-value symbols:
+  //  - they must not start with signs ('+', or '-'),
+  //  - they must not have the same first character.
+  //  - they must not start with digits.
+  //
+  // If the separator character is not kNoSeparator, then that specific
+  // character is ignored when in between two valid digits of the significant.
+  // It is not allowed to appear in the exponent.
+  // It is not allowed to lead or trail the number.
+  // It is not allowed to appear twice next to each other.
+  //
+  // Examples:
+  //  flags = ALLOW_HEX | ALLOW_TRAILING_JUNK,
+  //  empty_string_value = 0.0,
+  //  junk_string_value = NaN,
+  //  infinity_symbol = "infinity",
+  //  nan_symbol = "nan":
+  //    StringToDouble("0x1234") -> 4660.0.
+  //    StringToDouble("0x1234K") -> 4660.0.
+  //    StringToDouble("") -> 0.0  // empty_string_value.
+  //    StringToDouble(" ") -> NaN  // junk_string_value.
+  //    StringToDouble(" 1") -> NaN  // junk_string_value.
+  //    StringToDouble("0x") -> NaN  // junk_string_value.
+  //    StringToDouble("-123.45") -> -123.45.
+  //    StringToDouble("--123.45") -> NaN  // junk_string_value.
+  //    StringToDouble("123e45") -> 123e45.
+  //    StringToDouble("123E45") -> 123e45.
+  //    StringToDouble("123e+45") -> 123e45.
+  //    StringToDouble("123E-45") -> 123e-45.
+  //    StringToDouble("123e") -> 123.0  // trailing junk ignored.
+  //    StringToDouble("123e-") -> 123.0  // trailing junk ignored.
+  //    StringToDouble("+NaN") -> NaN  // NaN string literal.
+  //    StringToDouble("-infinity") -> -inf.  // infinity literal.
+  //    StringToDouble("Infinity") -> NaN  // junk_string_value.
+  //
+  //  flags = ALLOW_OCTAL | ALLOW_LEADING_SPACES,
+  //  empty_string_value = 0.0,
+  //  junk_string_value = NaN,
+  //  infinity_symbol = NULL,
+  //  nan_symbol = NULL:
+  //    StringToDouble("0x1234") -> NaN  // junk_string_value.
+  //    StringToDouble("01234") -> 668.0.
+  //    StringToDouble("") -> 0.0  // empty_string_value.
+  //    StringToDouble(" ") -> 0.0  // empty_string_value.
+  //    StringToDouble(" 1") -> 1.0
+  //    StringToDouble("0x") -> NaN  // junk_string_value.
+  //    StringToDouble("0123e45") -> NaN  // junk_string_value.
+  //    StringToDouble("01239E45") -> 1239e45.
+  //    StringToDouble("-infinity") -> NaN  // junk_string_value.
+  //    StringToDouble("NaN") -> NaN  // junk_string_value.
+  //
+  //  flags = NO_FLAGS,
+  //  separator = ' ':
+  //    StringToDouble("1 2 3 4") -> 1234.0
+  //    StringToDouble("1  2") -> NaN // junk_string_value
+  //    StringToDouble("1 000 000.0") -> 1000000.0
+  //    StringToDouble("1.000 000") -> 1.0
+  //    StringToDouble("1.0e1 000") -> NaN // junk_string_value
+  StringToDoubleConverter(int flags,
+                          double empty_string_value,
+                          double junk_string_value,
+                          const char* infinity_symbol,
+                          const char* nan_symbol,
+                          uc16 separator = kNoSeparator)
+      : flags_(flags),
+        empty_string_value_(empty_string_value),
+        junk_string_value_(junk_string_value),
+        infinity_symbol_(infinity_symbol),
+        nan_symbol_(nan_symbol),
+        separator_(separator) {
+  }
+
+  // Performs the conversion.
+  // The output parameter 'processed_characters_count' is set to the number
+  // of characters that have been processed to read the number.
+  // Spaces than are processed with ALLOW_{LEADING|TRAILING}_SPACES are included
+  // in the 'processed_characters_count'. Trailing junk is never included.
+  double StringToDouble(const char* buffer,
+                        int length,
+                        int* processed_characters_count) const;
+
+  // Same as StringToDouble above but for 16 bit characters.
+  double StringToDouble(const uc16* buffer,
+                        int length,
+                        int* processed_characters_count) const;
+
+  // Same as StringToDouble but reads a float.
+  // Note that this is not equivalent to static_cast<float>(StringToDouble(...))
+  // due to potential double-rounding.
+  float StringToFloat(const char* buffer,
+                      int length,
+                      int* processed_characters_count) const;
+
+  // Same as StringToFloat above but for 16 bit characters.
+  float StringToFloat(const uc16* buffer,
+                      int length,
+                      int* processed_characters_count) const;
+
+  // Same as StringToDouble for T = double, and StringToFloat for T = float.
+  template <typename T>
+  T StringTo(const char* buffer,
+             int length,
+             int* processed_characters_count) const;
+
+  // Same as StringTo above but for 16 bit characters.
+  template <typename T>
+  T StringTo(const uc16* buffer,
+             int length,
+             int* processed_characters_count) const;
+
+ private:
+  const int flags_;
+  const double empty_string_value_;
+  const double junk_string_value_;
+  const char* const infinity_symbol_;
+  const char* const nan_symbol_;
+  const uc16 separator_;
+
+  template <class Iterator>
+  double StringToIeee(Iterator start_pointer,
+                      int length,
+                      bool read_as_double,
+                      int* processed_characters_count) const;
+
+  DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS(StringToDoubleConverter);
+};
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_STRING_TO_DOUBLE_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/strtod.h b/pyarrow/include/arrow/vendored/double-conversion/strtod.h
new file mode 100644
index 0000000000000000000000000000000000000000..619db5838d2f75b6ea7b18ac3df458a029b70294
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/strtod.h
@@ -0,0 +1,66 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_STRTOD_H_
+#define DOUBLE_CONVERSION_STRTOD_H_
+
+#include "utils.h"
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+// The buffer must only contain digits in the range [0-9]. It must not
+// contain a dot or a sign. It must not start with '0', and must not be empty.
+double Strtod(Vector<const char> buffer, int exponent);
+
+// The buffer must only contain digits in the range [0-9]. It must not
+// contain a dot or a sign. It must not start with '0', and must not be empty.
+float Strtof(Vector<const char> buffer, int exponent);
+
+// Same as Strtod, but assumes that 'trimmed' is already trimmed, as if run
+// through TrimAndCut. That is, 'trimmed' must have no leading or trailing
+// zeros, must not be a lone zero, and must not have 'too many' digits.
+double StrtodTrimmed(Vector<const char> trimmed, int exponent);
+
+// Same as Strtof, but assumes that 'trimmed' is already trimmed, as if run
+// through TrimAndCut. That is, 'trimmed' must have no leading or trailing
+// zeros, must not be a lone zero, and must not have 'too many' digits.
+float StrtofTrimmed(Vector<const char> trimmed, int exponent);
+
+inline Vector<const char> TrimTrailingZeros(Vector<const char> buffer) {
+  for (int i = buffer.length() - 1; i >= 0; --i) {
+    if (buffer[i] != '0') {
+      return buffer.SubVector(0, i + 1);
+    }
+  }
+  return Vector<const char>(buffer.start(), 0);
+}
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_STRTOD_H_
diff --git a/pyarrow/include/arrow/vendored/double-conversion/utils.h b/pyarrow/include/arrow/vendored/double-conversion/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..332619a31270d709fda1c4f85248ae71546debde
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/double-conversion/utils.h
@@ -0,0 +1,420 @@
+// Copyright 2010 the V8 project authors. All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+//       notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+//       copyright notice, this list of conditions and the following
+//       disclaimer in the documentation and/or other materials provided
+//       with the distribution.
+//     * Neither the name of Google Inc. nor the names of its
+//       contributors may be used to endorse or promote products derived
+//       from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef DOUBLE_CONVERSION_UTILS_H_
+#define DOUBLE_CONVERSION_UTILS_H_
+
+// Use DOUBLE_CONVERSION_NON_PREFIXED_MACROS to get unprefixed macros as was
+// the case in double-conversion releases prior to 3.1.6
+
+#include <cstdlib>
+#include <cstring>
+
+// For pre-C++11 compatibility
+#if __cplusplus >= 201103L
+#define DOUBLE_CONVERSION_NULLPTR nullptr
+#else
+#define DOUBLE_CONVERSION_NULLPTR NULL
+#endif
+
+#include <cassert>
+#ifndef DOUBLE_CONVERSION_ASSERT
+#define DOUBLE_CONVERSION_ASSERT(condition)         \
+    assert(condition)
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(ASSERT)
+#define ASSERT DOUBLE_CONVERSION_ASSERT
+#endif
+
+#ifndef DOUBLE_CONVERSION_UNIMPLEMENTED
+#define DOUBLE_CONVERSION_UNIMPLEMENTED() (abort())
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UNIMPLEMENTED)
+#define UNIMPLEMENTED DOUBLE_CONVERSION_UNIMPLEMENTED
+#endif
+
+#ifndef DOUBLE_CONVERSION_NO_RETURN
+#ifdef _MSC_VER
+#define DOUBLE_CONVERSION_NO_RETURN __declspec(noreturn)
+#else
+#define DOUBLE_CONVERSION_NO_RETURN __attribute__((noreturn))
+#endif
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(NO_RETURN)
+#define NO_RETURN DOUBLE_CONVERSION_NO_RETURN
+#endif
+
+#ifndef DOUBLE_CONVERSION_UNREACHABLE
+#ifdef _MSC_VER
+void DOUBLE_CONVERSION_NO_RETURN abort_noreturn();
+inline void abort_noreturn() { abort(); }
+#define DOUBLE_CONVERSION_UNREACHABLE()   (abort_noreturn())
+#else
+#define DOUBLE_CONVERSION_UNREACHABLE()   (abort())
+#endif
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UNREACHABLE)
+#define UNREACHABLE DOUBLE_CONVERSION_UNREACHABLE
+#endif
+
+// Not all compilers support __has_attribute and combining a check for both
+// ifdef and __has_attribute on the same preprocessor line isn't portable.
+#ifdef __has_attribute
+#   define DOUBLE_CONVERSION_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+#   define DOUBLE_CONVERSION_HAS_ATTRIBUTE(x) 0
+#endif
+
+#ifndef DOUBLE_CONVERSION_UNUSED
+#if DOUBLE_CONVERSION_HAS_ATTRIBUTE(unused)
+#define DOUBLE_CONVERSION_UNUSED __attribute__((unused))
+#else
+#define DOUBLE_CONVERSION_UNUSED
+#endif
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UNUSED)
+#define UNUSED DOUBLE_CONVERSION_UNUSED
+#endif
+
+#if DOUBLE_CONVERSION_HAS_ATTRIBUTE(uninitialized)
+#define DOUBLE_CONVERSION_STACK_UNINITIALIZED __attribute__((uninitialized))
+#else
+#define DOUBLE_CONVERSION_STACK_UNINITIALIZED
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(STACK_UNINITIALIZED)
+#define STACK_UNINITIALIZED DOUBLE_CONVERSION_STACK_UNINITIALIZED
+#endif
+
+// Double operations detection based on target architecture.
+// Linux uses a 80bit wide floating point stack on x86. This induces double
+// rounding, which in turn leads to wrong results.
+// An easy way to test if the floating-point operations are correct is to
+// evaluate: 89255.0/1e22. If the floating-point stack is 64 bits wide then
+// the result is equal to 89255e-22.
+// The best way to test this, is to create a division-function and to compare
+// the output of the division with the expected result. (Inlining must be
+// disabled.)
+// On Linux,x86 89255e-22 != Div_double(89255.0/1e22)
+//
+// For example:
+/*
+// -- in div.c
+double Div_double(double x, double y) { return x / y; }
+
+// -- in main.c
+double Div_double(double x, double y);  // Forward declaration.
+
+int main(int argc, char** argv) {
+  return Div_double(89255.0, 1e22) == 89255e-22;
+}
+*/
+// Run as follows ./main || echo "correct"
+//
+// If it prints "correct" then the architecture should be here, in the "correct" section.
+#if defined(_M_X64) || defined(__x86_64__) || \
+    defined(__ARMEL__) || defined(__avr32__) || defined(_M_ARM) || defined(_M_ARM64) || \
+    defined(__hppa__) || defined(__ia64__) || \
+    defined(__mips__) || \
+    defined(__loongarch__) || \
+    defined(__nios2__) || defined(__ghs) || \
+    defined(__powerpc__) || defined(__ppc__) || defined(__ppc64__) || \
+    defined(_POWER) || defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \
+    defined(__sparc__) || defined(__sparc) || defined(__s390__) || \
+    defined(__SH4__) || defined(__alpha__) || \
+    defined(_MIPS_ARCH_MIPS32R2) || defined(__ARMEB__) ||\
+    defined(__AARCH64EL__) || defined(__aarch64__) || defined(__AARCH64EB__) || \
+    defined(__riscv) || defined(__e2k__) || \
+    defined(__or1k__) || defined(__arc__) || defined(__ARC64__) || \
+    defined(__microblaze__) || defined(__XTENSA__) || \
+    defined(__EMSCRIPTEN__) || defined(__wasm32__)
+#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1
+#elif defined(__mc68000__) || \
+    defined(__pnacl__) || defined(__native_client__)
+#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS
+#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
+#if defined(_WIN32)
+// Windows uses a 64bit wide floating point stack.
+#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1
+#else
+#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS
+#endif  // _WIN32
+#else
+#error Target architecture was not detected as supported by Double-Conversion.
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(CORRECT_DOUBLE_OPERATIONS)
+#define CORRECT_DOUBLE_OPERATIONS DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS
+#endif
+
+#if defined(_WIN32) && !defined(__MINGW32__)
+
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef short int16_t;  // NOLINT
+typedef unsigned short uint16_t;  // NOLINT
+typedef int int32_t;
+typedef unsigned int uint32_t;
+typedef __int64 int64_t;
+typedef unsigned __int64 uint64_t;
+// intptr_t and friends are defined in crtdefs.h through stdio.h.
+
+#else
+
+#include <stdint.h>
+
+#endif
+
+typedef uint16_t uc16;
+
+// The following macro works on both 32 and 64-bit platforms.
+// Usage: instead of writing 0x1234567890123456
+//      write DOUBLE_CONVERSION_UINT64_2PART_C(0x12345678,90123456);
+#define DOUBLE_CONVERSION_UINT64_2PART_C(a, b) (((static_cast<uint64_t>(a) << 32) + 0x##b##u))
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(UINT64_2PART_C)
+#define UINT64_2PART_C DOUBLE_CONVERSION_UINT64_2PART_C
+#endif
+
+// The expression DOUBLE_CONVERSION_ARRAY_SIZE(a) is a compile-time constant of type
+// size_t which represents the number of elements of the given
+// array. You should only use DOUBLE_CONVERSION_ARRAY_SIZE on statically allocated
+// arrays.
+#ifndef DOUBLE_CONVERSION_ARRAY_SIZE
+#define DOUBLE_CONVERSION_ARRAY_SIZE(a)                                   \
+  ((sizeof(a) / sizeof(*(a))) /                         \
+  static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(ARRAY_SIZE)
+#define ARRAY_SIZE DOUBLE_CONVERSION_ARRAY_SIZE
+#endif
+
+// A macro to disallow the evil copy constructor and operator= functions
+// This should be used in the private: declarations for a class
+#ifndef DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN
+#define DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(TypeName)      \
+  TypeName(const TypeName&);                    \
+  void operator=(const TypeName&)
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(DC_DISALLOW_COPY_AND_ASSIGN)
+#define DC_DISALLOW_COPY_AND_ASSIGN DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN
+#endif
+
+// A macro to disallow all the implicit constructors, namely the
+// default constructor, copy constructor and operator= functions.
+//
+// This should be used in the private: declarations for a class
+// that wants to prevent anyone from instantiating it. This is
+// especially useful for classes containing only static methods.
+#ifndef DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS
+#define DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
+  TypeName();                                    \
+  DOUBLE_CONVERSION_DISALLOW_COPY_AND_ASSIGN(TypeName)
+#endif
+#if defined(DOUBLE_CONVERSION_NON_PREFIXED_MACROS) && !defined(DC_DISALLOW_IMPLICIT_CONSTRUCTORS)
+#define DC_DISALLOW_IMPLICIT_CONSTRUCTORS DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS
+#endif
+
+namespace arrow_vendored {
+namespace double_conversion {
+
+inline int StrLength(const char* string) {
+  size_t length = strlen(string);
+  DOUBLE_CONVERSION_ASSERT(length == static_cast<size_t>(static_cast<int>(length)));
+  return static_cast<int>(length);
+}
+
+// This is a simplified version of V8's Vector class.
+template <typename T>
+class Vector {
+ public:
+  Vector() : start_(DOUBLE_CONVERSION_NULLPTR), length_(0) {}
+  Vector(T* data, int len) : start_(data), length_(len) {
+    DOUBLE_CONVERSION_ASSERT(len == 0 || (len > 0 && data != DOUBLE_CONVERSION_NULLPTR));
+  }
+
+  // Returns a vector using the same backing storage as this one,
+  // spanning from and including 'from', to but not including 'to'.
+  Vector<T> SubVector(int from, int to) {
+    DOUBLE_CONVERSION_ASSERT(to <= length_);
+    DOUBLE_CONVERSION_ASSERT(from < to);
+    DOUBLE_CONVERSION_ASSERT(0 <= from);
+    return Vector<T>(start() + from, to - from);
+  }
+
+  // Returns the length of the vector.
+  int length() const { return length_; }
+
+  // Returns whether or not the vector is empty.
+  bool is_empty() const { return length_ == 0; }
+
+  // Returns the pointer to the start of the data in the vector.
+  T* start() const { return start_; }
+
+  // Access individual vector elements - checks bounds in debug mode.
+  T& operator[](int index) const {
+    DOUBLE_CONVERSION_ASSERT(0 <= index && index < length_);
+    return start_[index];
+  }
+
+  T& first() { return start_[0]; }
+
+  T& last() { return start_[length_ - 1]; }
+
+  void pop_back() {
+    DOUBLE_CONVERSION_ASSERT(!is_empty());
+    --length_;
+  }
+
+ private:
+  T* start_;
+  int length_;
+};
+
+
+// Helper class for building result strings in a character buffer. The
+// purpose of the class is to use safe operations that checks the
+// buffer bounds on all operations in debug mode.
+class StringBuilder {
+ public:
+  StringBuilder(char* buffer, int buffer_size)
+      : buffer_(buffer, buffer_size), position_(0) { }
+
+  ~StringBuilder() { if (!is_finalized()) Finalize(); }
+
+  int size() const { return buffer_.length(); }
+
+  // Get the current position in the builder.
+  int position() const {
+    DOUBLE_CONVERSION_ASSERT(!is_finalized());
+    return position_;
+  }
+
+  // Reset the position.
+  void Reset() { position_ = 0; }
+
+  // Add a single character to the builder. It is not allowed to add
+  // 0-characters; use the Finalize() method to terminate the string
+  // instead.
+  void AddCharacter(char c) {
+    DOUBLE_CONVERSION_ASSERT(c != '\0');
+    DOUBLE_CONVERSION_ASSERT(!is_finalized() && position_ < buffer_.length());
+    buffer_[position_++] = c;
+  }
+
+  // Add an entire string to the builder. Uses strlen() internally to
+  // compute the length of the input string.
+  void AddString(const char* s) {
+    AddSubstring(s, StrLength(s));
+  }
+
+  // Add the first 'n' characters of the given string 's' to the
+  // builder. The input string must have enough characters.
+  void AddSubstring(const char* s, int n) {
+    DOUBLE_CONVERSION_ASSERT(!is_finalized() && position_ + n < buffer_.length());
+    DOUBLE_CONVERSION_ASSERT(static_cast<size_t>(n) <= strlen(s));
+    memmove(&buffer_[position_], s, static_cast<size_t>(n));
+    position_ += n;
+  }
+
+
+  // Add character padding to the builder. If count is non-positive,
+  // nothing is added to the builder.
+  void AddPadding(char c, int count) {
+    for (int i = 0; i < count; i++) {
+      AddCharacter(c);
+    }
+  }
+
+  // Finalize the string by 0-terminating it and returning the buffer.
+  char* Finalize() {
+    DOUBLE_CONVERSION_ASSERT(!is_finalized() && position_ < buffer_.length());
+    buffer_[position_] = '\0';
+    // Make sure nobody managed to add a 0-character to the
+    // buffer while building the string.
+    DOUBLE_CONVERSION_ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_));
+    position_ = -1;
+    DOUBLE_CONVERSION_ASSERT(is_finalized());
+    return buffer_.start();
+  }
+
+ private:
+  Vector<char> buffer_;
+  int position_;
+
+  bool is_finalized() const { return position_ < 0; }
+
+  DOUBLE_CONVERSION_DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder);
+};
+
+// The type-based aliasing rule allows the compiler to assume that pointers of
+// different types (for some definition of different) never alias each other.
+// Thus the following code does not work:
+//
+// float f = foo();
+// int fbits = *(int*)(&f);
+//
+// The compiler 'knows' that the int pointer can't refer to f since the types
+// don't match, so the compiler may cache f in a register, leaving random data
+// in fbits.  Using C++ style casts makes no difference, however a pointer to
+// char data is assumed to alias any other pointer.  This is the 'memcpy
+// exception'.
+//
+// Bit_cast uses the memcpy exception to move the bits from a variable of one
+// type of a variable of another type.  Of course the end result is likely to
+// be implementation dependent.  Most compilers (gcc-4.2 and MSVC 2005)
+// will completely optimize BitCast away.
+//
+// There is an additional use for BitCast.
+// Recent gccs will warn when they see casts that may result in breakage due to
+// the type-based aliasing rule.  If you have checked that there is no breakage
+// you can use BitCast to cast one pointer type to another.  This confuses gcc
+// enough that it can no longer see that you have cast one pointer type to
+// another thus avoiding the warning.
+template <class Dest, class Source>
+Dest BitCast(const Source& source) {
+  // Compile time assertion: sizeof(Dest) == sizeof(Source)
+  // A compile error here means your Dest and Source have different sizes.
+#if __cplusplus >= 201103L
+  static_assert(sizeof(Dest) == sizeof(Source),
+                "source and destination size mismatch");
+#else
+  DOUBLE_CONVERSION_UNUSED
+  typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
+#endif
+
+  Dest dest;
+  memmove(&dest, &source, sizeof(dest));
+  return dest;
+}
+
+template <class Dest, class Source>
+Dest BitCast(Source* source) {
+  return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
+}
+
+}  // namespace double_conversion
+}  // namespace arrow_vendored
+
+#endif  // DOUBLE_CONVERSION_UTILS_H_
diff --git a/pyarrow/include/arrow/vendored/pcg/pcg_extras.hpp b/pyarrow/include/arrow/vendored/pcg/pcg_extras.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..36576cfa91d8c903576d56fb807f594f23a5bf7b
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/pcg/pcg_extras.hpp
@@ -0,0 +1,649 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014-2017 Melissa O'Neill <oneill@pcg-random.org>,
+ *                     and the PCG Project contributors.
+ *
+ * SPDX-License-Identifier: (Apache-2.0 OR MIT)
+ *
+ * Licensed under the Apache License, Version 2.0 (provided in
+ * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
+ * or under the MIT license (provided in LICENSE-MIT.txt and at
+ * http://opensource.org/licenses/MIT), at your option. This file may not
+ * be copied, modified, or distributed except according to those terms.
+ *
+ * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied.  See your chosen license for details.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * visit http://www.pcg-random.org/.
+ */
+
+/*
+ * This file provides support code that is useful for random-number generation
+ * but not specific to the PCG generation scheme, including:
+ *      - 128-bit int support for platforms where it isn't available natively
+ *      - bit twiddling operations
+ *      - I/O of 128-bit and 8-bit integers
+ *      - Handling the evilness of SeedSeq
+ *      - Support for efficiently producing random numbers less than a given
+ *        bound
+ */
+
+#ifndef PCG_EXTRAS_HPP_INCLUDED
+#define PCG_EXTRAS_HPP_INCLUDED 1
+
+#include <cinttypes>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <limits>
+#include <iostream>
+#include <type_traits>
+#include <utility>
+#include <locale>
+#include <iterator>
+
+#ifdef __GNUC__
+    #include <cxxabi.h>
+#endif
+
+/*
+ * Abstractions for compiler-specific directives
+ */
+
+#ifdef __GNUC__
+    #define PCG_NOINLINE __attribute__((noinline))
+#else
+    #define PCG_NOINLINE
+#endif
+
+/*
+ * Some members of the PCG library use 128-bit math.  When compiling on 64-bit
+ * platforms, both GCC and Clang provide 128-bit integer types that are ideal
+ * for the job.
+ *
+ * On 32-bit platforms (or with other compilers), we fall back to a C++
+ * class that provides 128-bit unsigned integers instead.  It may seem
+ * like we're reinventing the wheel here, because libraries already exist
+ * that support large integers, but most existing libraries provide a very
+ * generic multiprecision code, but here we're operating at a fixed size.
+ * Also, most other libraries are fairly heavyweight.  So we use a direct
+ * implementation.  Sadly, it's much slower than hand-coded assembly or
+ * direct CPU support.
+ *
+ */
+#if __SIZEOF_INT128__ && !PCG_FORCE_EMULATED_128BIT_MATH
+    namespace arrow_vendored {
+    namespace pcg_extras {
+        typedef __uint128_t pcg128_t;
+    }
+    }
+    #define PCG_128BIT_CONSTANT(high,low) \
+            ((pcg_extras::pcg128_t(high) << 64) + low)
+#else
+    #include "pcg_uint128.hpp"
+    namespace arrow_vendored {
+    namespace pcg_extras {
+        typedef pcg_extras::uint_x4<uint32_t,uint64_t> pcg128_t;
+    }
+    }
+    #define PCG_128BIT_CONSTANT(high,low) \
+            pcg_extras::pcg128_t(high,low)
+    #define PCG_EMULATED_128BIT_MATH 1
+#endif
+
+
+namespace arrow_vendored {
+namespace pcg_extras {
+
+/*
+ * We often need to represent a "number of bits".  When used normally, these
+ * numbers are never greater than 128, so an unsigned char is plenty.
+ * If you're using a nonstandard generator of a larger size, you can set
+ * PCG_BITCOUNT_T to have it define it as a larger size.  (Some compilers
+ * might produce faster code if you set it to an unsigned int.)
+ */
+
+#ifndef PCG_BITCOUNT_T
+    typedef uint8_t bitcount_t;
+#else
+    typedef PCG_BITCOUNT_T bitcount_t;
+#endif
+
+/*
+ * C++ requires us to be able to serialize RNG state by printing or reading
+ * it from a stream.  Because we use 128-bit ints, we also need to be able
+ * ot print them, so here is code to do so.
+ *
+ * This code provides enough functionality to print 128-bit ints in decimal
+ * and zero-padded in hex.  It's not a full-featured implementation.
+ */
+
+template <typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out, pcg128_t value)
+{
+    auto desired_base = out.flags() & out.basefield;
+    bool want_hex = desired_base == out.hex;
+
+    if (want_hex) {
+        uint64_t highpart = uint64_t(value >> 64);
+        uint64_t lowpart  = uint64_t(value);
+        auto desired_width = out.width();
+        if (desired_width > 16) {
+            out.width(desired_width - 16);
+        }
+        if (highpart != 0 || desired_width > 16)
+            out << highpart;
+        CharT oldfill = '\0';
+        if (highpart != 0) {
+            out.width(16);
+            oldfill = out.fill('0');
+        }
+        auto oldflags = out.setf(decltype(desired_base){}, out.showbase);
+        out << lowpart;
+        out.setf(oldflags);
+        if (highpart != 0) {
+            out.fill(oldfill);
+        }
+        return out;
+    }
+    constexpr size_t MAX_CHARS_128BIT = 40;
+
+    char buffer[MAX_CHARS_128BIT];
+    char* pos = buffer+sizeof(buffer);
+    *(--pos) = '\0';
+    constexpr auto BASE = pcg128_t(10ULL);
+    do {
+        auto div = value / BASE;
+        auto mod = uint32_t(value - (div * BASE));
+        *(--pos) = '0' + char(mod);
+        value = div;
+    } while(value != pcg128_t(0ULL));
+    return out << pos;
+}
+
+template <typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in, pcg128_t& value)
+{
+    typename std::basic_istream<CharT,Traits>::sentry s(in);
+
+    if (!s)
+         return in;
+
+    constexpr auto BASE = pcg128_t(10ULL);
+    pcg128_t current(0ULL);
+    bool did_nothing = true;
+    bool overflow = false;
+    for(;;) {
+        CharT wide_ch = in.get();
+        if (!in.good())
+            break;
+        auto ch = in.narrow(wide_ch, '\0');
+        if (ch < '0' || ch > '9') {
+            in.unget();
+            break;
+        }
+        did_nothing = false;
+        pcg128_t digit(uint32_t(ch - '0'));
+        pcg128_t timesbase = current*BASE;
+        overflow = overflow || timesbase < current;
+        current = timesbase + digit;
+        overflow = overflow || current < digit;
+    }
+
+    if (did_nothing || overflow) {
+        in.setstate(std::ios::failbit);
+        if (overflow)
+            current = ~pcg128_t(0ULL);
+    }
+
+    value = current;
+
+    return in;
+}
+
+/*
+ * Likewise, if people use tiny rngs, we'll be serializing uint8_t.
+ * If we just used the provided IO operators, they'd read/write chars,
+ * not ints, so we need to define our own.  We *can* redefine this operator
+ * here because we're in our own namespace.
+ */
+
+template <typename CharT, typename Traits>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>&out, uint8_t value)
+{
+    return out << uint32_t(value);
+}
+
+template <typename CharT, typename Traits>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in, uint8_t& target)
+{
+    uint32_t value = 0xdecea5edU;
+    in >> value;
+    if (!in && value == 0xdecea5edU)
+        return in;
+    if (value > uint8_t(~0)) {
+        in.setstate(std::ios::failbit);
+        value = ~0U;
+    }
+    target = uint8_t(value);
+    return in;
+}
+
+/* Unfortunately, the above functions don't get found in preference to the
+ * built in ones, so we create some more specific overloads that will.
+ * Ugh.
+ */
+
+inline std::ostream& operator<<(std::ostream& out, uint8_t value)
+{
+    return pcg_extras::operator<< <char>(out, value);
+}
+
+inline std::istream& operator>>(std::istream& in, uint8_t& value)
+{
+    return pcg_extras::operator>> <char>(in, value);
+}
+
+
+
+/*
+ * Useful bitwise operations.
+ */
+
+/*
+ * XorShifts are invertable, but they are someting of a pain to invert.
+ * This function backs them out.  It's used by the whacky "inside out"
+ * generator defined later.
+ */
+
+template <typename itype>
+inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift)
+{
+    if (2*shift >= bits) {
+        return x ^ (x >> shift);
+    }
+    itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1;
+    itype highmask1 = ~lowmask1;
+    itype top1 = x;
+    itype bottom1 = x & lowmask1;
+    top1 ^= top1 >> shift;
+    top1 &= highmask1;
+    x = top1 | bottom1;
+    itype lowmask2 = (itype(1U) << (bits - shift)) - 1;
+    itype bottom2 = x & lowmask2;
+    bottom2 = unxorshift(bottom2, bits - shift, shift);
+    bottom2 &= lowmask1;
+    return top1 | bottom2;
+}
+
+/*
+ * Rotate left and right.
+ *
+ * In ideal world, compilers would spot idiomatic rotate code and convert it
+ * to a rotate instruction.  Of course, opinions vary on what the correct
+ * idiom is and how to spot it.  For clang, sometimes it generates better
+ * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM.
+ */
+
+template <typename itype>
+inline itype rotl(itype value, bitcount_t rot)
+{
+    constexpr bitcount_t bits = sizeof(itype) * 8;
+    constexpr bitcount_t mask = bits - 1;
+#if PCG_USE_ZEROCHECK_ROTATE_IDIOM
+    return rot ? (value << rot) | (value >> (bits - rot)) : value;
+#else
+    return (value << rot) | (value >> ((- rot) & mask));
+#endif
+}
+
+template <typename itype>
+inline itype rotr(itype value, bitcount_t rot)
+{
+    constexpr bitcount_t bits = sizeof(itype) * 8;
+    constexpr bitcount_t mask = bits - 1;
+#if PCG_USE_ZEROCHECK_ROTATE_IDIOM
+    return rot ? (value >> rot) | (value << (bits - rot)) : value;
+#else
+    return (value >> rot) | (value << ((- rot) & mask));
+#endif
+}
+
+/* Unfortunately, both Clang and GCC sometimes perform poorly when it comes
+ * to properly recognizing idiomatic rotate code, so for we also provide
+ * assembler directives (enabled with PCG_USE_INLINE_ASM).  Boo, hiss.
+ * (I hope that these compilers get better so that this code can die.)
+ *
+ * These overloads will be preferred over the general template code above.
+ */
+#if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__  || __i386__)
+
+inline uint8_t rotr(uint8_t value, bitcount_t rot)
+{
+    asm ("rorb   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+inline uint16_t rotr(uint16_t value, bitcount_t rot)
+{
+    asm ("rorw   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+inline uint32_t rotr(uint32_t value, bitcount_t rot)
+{
+    asm ("rorl   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+
+#if __x86_64__
+inline uint64_t rotr(uint64_t value, bitcount_t rot)
+{
+    asm ("rorq   %%cl, %0" : "=r" (value) : "0" (value), "c" (rot));
+    return value;
+}
+#endif // __x86_64__
+
+#elif defined(_MSC_VER)
+  // Use MSVC++ bit rotation intrinsics
+
+#pragma intrinsic(_rotr, _rotr64, _rotr8, _rotr16)
+
+inline uint8_t rotr(uint8_t value, bitcount_t rot)
+{
+    return _rotr8(value, rot);
+}
+
+inline uint16_t rotr(uint16_t value, bitcount_t rot)
+{
+    return _rotr16(value, rot);
+}
+
+inline uint32_t rotr(uint32_t value, bitcount_t rot)
+{
+    return _rotr(value, rot);
+}
+
+inline uint64_t rotr(uint64_t value, bitcount_t rot)
+{
+    return _rotr64(value, rot);
+}
+
+#endif // PCG_USE_INLINE_ASM
+
+
+/*
+ * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of
+ * 32-bit integers with seed data, but sometimes we want to produce
+ * larger or smaller integers.
+ *
+ * The following code handles this annoyance.
+ *
+ * uneven_copy will copy an array of 32-bit ints to an array of larger or
+ * smaller ints (actually, the code is general it only needing forward
+ * iterators).  The copy is identical to the one that would be performed if
+ * we just did memcpy on a standard little-endian machine, but works
+ * regardless of the endian of the machine (or the weirdness of the ints
+ * involved).
+ *
+ * generate_to initializes an array of integers using a SeedSeq
+ * object.  It is given the size as a static constant at compile time and
+ * tries to avoid memory allocation.  If we're filling in 32-bit constants
+ * we just do it directly.  If we need a separate buffer and it's small,
+ * we allocate it on the stack.  Otherwise, we fall back to heap allocation.
+ * Ugh.
+ *
+ * generate_one produces a single value of some integral type using a
+ * SeedSeq object.
+ */
+
+ /* uneven_copy helper, case where destination ints are less than 32 bit. */
+
+template<class SrcIter, class DestIter>
+SrcIter uneven_copy_impl(
+    SrcIter src_first, DestIter dest_first, DestIter dest_last,
+    std::true_type)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr bitcount_t SRC_SIZE  = sizeof(src_t);
+    constexpr bitcount_t DEST_SIZE = sizeof(dest_t);
+    constexpr bitcount_t DEST_BITS = DEST_SIZE * 8;
+    constexpr bitcount_t SCALE     = SRC_SIZE / DEST_SIZE;
+
+    size_t count = 0;
+    src_t value = 0;
+
+    while (dest_first != dest_last) {
+        if ((count++ % SCALE) == 0)
+            value = *src_first++;       // Get more bits
+        else
+            value >>= DEST_BITS;        // Move down bits
+
+        *dest_first++ = dest_t(value);  // Truncates, ignores high bits.
+    }
+    return src_first;
+}
+
+ /* uneven_copy helper, case where destination ints are more than 32 bit. */
+
+template<class SrcIter, class DestIter>
+SrcIter uneven_copy_impl(
+    SrcIter src_first, DestIter dest_first, DestIter dest_last,
+    std::false_type)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr auto SRC_SIZE  = sizeof(src_t);
+    constexpr auto SRC_BITS  = SRC_SIZE * 8;
+    constexpr auto DEST_SIZE = sizeof(dest_t);
+    constexpr auto SCALE     = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE;
+
+    while (dest_first != dest_last) {
+        dest_t value(0UL);
+        unsigned int shift = 0;
+
+        for (size_t i = 0; i < SCALE; ++i) {
+            value |= dest_t(*src_first++) << shift;
+            shift += SRC_BITS;
+        }
+
+        *dest_first++ = value;
+    }
+    return src_first;
+}
+
+/* uneven_copy, call the right code for larger vs. smaller */
+
+template<class SrcIter, class DestIter>
+inline SrcIter uneven_copy(SrcIter src_first,
+                           DestIter dest_first, DestIter dest_last)
+{
+    typedef typename std::iterator_traits<SrcIter>::value_type  src_t;
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+
+    constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t);
+
+    return uneven_copy_impl(src_first, dest_first, dest_last,
+                            std::integral_constant<bool, DEST_IS_SMALLER>{});
+}
+
+/* generate_to, fill in a fixed-size array of integral type using a SeedSeq
+ * (actually works for any random-access iterator)
+ */
+
+template <size_t size, typename SeedSeq, typename DestIter>
+inline void generate_to_impl(SeedSeq&& generator, DestIter dest,
+                             std::true_type)
+{
+    generator.generate(dest, dest+size);
+}
+
+template <size_t size, typename SeedSeq, typename DestIter>
+void generate_to_impl(SeedSeq&& generator, DestIter dest,
+                      std::false_type)
+{
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+    constexpr auto DEST_SIZE = sizeof(dest_t);
+    constexpr auto GEN_SIZE  = sizeof(uint32_t);
+
+    constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE;
+    constexpr size_t FROM_ELEMS =
+        GEN_IS_SMALLER
+            ? size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE)
+            : (size + (GEN_SIZE / DEST_SIZE) - 1)
+                / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER);
+                        //  this odd code ^^^^^^^^^^^^^^^^^ is work-around for
+                        //  a bug: http://llvm.org/bugs/show_bug.cgi?id=21287
+
+    if (FROM_ELEMS <= 1024) {
+        uint32_t buffer[FROM_ELEMS];
+        generator.generate(buffer, buffer+FROM_ELEMS);
+        uneven_copy(buffer, dest, dest+size);
+    } else {
+        uint32_t* buffer = static_cast<uint32_t*>(malloc(GEN_SIZE * FROM_ELEMS));
+        generator.generate(buffer, buffer+FROM_ELEMS);
+        uneven_copy(buffer, dest, dest+size);
+        free(static_cast<void*>(buffer));
+    }
+}
+
+template <size_t size, typename SeedSeq, typename DestIter>
+inline void generate_to(SeedSeq&& generator, DestIter dest)
+{
+    typedef typename std::iterator_traits<DestIter>::value_type dest_t;
+    constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t);
+
+    generate_to_impl<size>(std::forward<SeedSeq>(generator), dest,
+                           std::integral_constant<bool, IS_32BIT>{});
+}
+
+/* generate_one, produce a value of integral type using a SeedSeq
+ * (optionally, we can have it produce more than one and pick which one
+ * we want)
+ */
+
+template <typename UInt, size_t i = 0UL, size_t N = i+1UL, typename SeedSeq>
+inline UInt generate_one(SeedSeq&& generator)
+{
+    UInt result[N];
+    generate_to<N>(std::forward<SeedSeq>(generator), result);
+    return result[i];
+}
+
+template <typename RngType>
+auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound)
+        -> typename RngType::result_type
+{
+    typedef typename RngType::result_type rtype;
+    rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound)
+                    % upper_bound;
+    for (;;) {
+        rtype r = rng() - RngType::min();
+        if (r >= threshold)
+            return r % upper_bound;
+    }
+}
+
+template <typename Iter, typename RandType>
+void shuffle(Iter from, Iter to, RandType&& rng)
+{
+    typedef typename std::iterator_traits<Iter>::difference_type delta_t;
+    typedef typename std::remove_reference<RandType>::type::result_type result_t;
+    auto count = to - from;
+    while (count > 1) {
+        delta_t chosen = delta_t(bounded_rand(rng, result_t(count)));
+        --count;
+        --to;
+        using std::swap;
+        swap(*(from + chosen), *to);
+    }
+}
+
+/*
+ * Although std::seed_seq is useful, it isn't everything.  Often we want to
+ * initialize a random-number generator some other way, such as from a random
+ * device.
+ *
+ * Technically, it does not meet the requirements of a SeedSequence because
+ * it lacks some of the rarely-used member functions (some of which would
+ * be impossible to provide).  However the C++ standard is quite specific
+ * that actual engines only called the generate method, so it ought not to be
+ * a problem in practice.
+ */
+
+template <typename RngType>
+class seed_seq_from {
+private:
+    RngType rng_;
+
+    typedef uint_least32_t result_type;
+
+public:
+    template<typename... Args>
+    seed_seq_from(Args&&... args) :
+        rng_(std::forward<Args>(args)...)
+    {
+        // Nothing (else) to do...
+    }
+
+    template<typename Iter>
+    void generate(Iter start, Iter finish)
+    {
+        for (auto i = start; i != finish; ++i)
+            *i = result_type(rng_());
+    }
+
+    constexpr size_t size() const
+    {
+        return (sizeof(typename RngType::result_type) > sizeof(result_type)
+                && RngType::max() > ~size_t(0UL))
+             ? ~size_t(0UL)
+             : size_t(RngType::max());
+    }
+};
+
+// Sometimes, when debugging or testing, it's handy to be able print the name
+// of a (in human-readable form).  This code allows the idiom:
+//
+//      cout << printable_typename<my_foo_type_t>()
+//
+// to print out my_foo_type_t (or its concrete type if it is a synonym)
+
+#if __cpp_rtti || __GXX_RTTI
+
+template <typename T>
+struct printable_typename {};
+
+template <typename T>
+std::ostream& operator<<(std::ostream& out, printable_typename<T>) {
+    const char *implementation_typename = typeid(T).name();
+#ifdef __GNUC__
+    int status;
+    char* pretty_name =
+        abi::__cxa_demangle(implementation_typename, nullptr, nullptr, &status);
+    if (status == 0)
+        out << pretty_name;
+    free(static_cast<void*>(pretty_name));
+    if (status == 0)
+        return out;
+#endif
+    out << implementation_typename;
+    return out;
+}
+
+#endif  // __cpp_rtti || __GXX_RTTI
+
+} // namespace pcg_extras
+} // namespace arrow_vendored
+
+#endif // PCG_EXTRAS_HPP_INCLUDED
diff --git a/pyarrow/include/arrow/vendored/pcg/pcg_random.hpp b/pyarrow/include/arrow/vendored/pcg/pcg_random.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e39e61e908a2a30863673fecd0f19e5090b40179
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/pcg/pcg_random.hpp
@@ -0,0 +1,1954 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014-2019 Melissa O'Neill <oneill@pcg-random.org>,
+ *                     and the PCG Project contributors.
+ *
+ * SPDX-License-Identifier: (Apache-2.0 OR MIT)
+ *
+ * Licensed under the Apache License, Version 2.0 (provided in
+ * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
+ * or under the MIT license (provided in LICENSE-MIT.txt and at
+ * http://opensource.org/licenses/MIT), at your option. This file may not
+ * be copied, modified, or distributed except according to those terms.
+ *
+ * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied.  See your chosen license for details.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * visit http://www.pcg-random.org/.
+ */
+
+/*
+ * This code provides the reference implementation of the PCG family of
+ * random number generators.  The code is complex because it implements
+ *
+ *      - several members of the PCG family, specifically members corresponding
+ *        to the output functions:
+ *             - XSH RR         (good for 64-bit state, 32-bit output)
+ *             - XSH RS         (good for 64-bit state, 32-bit output)
+ *             - XSL RR         (good for 128-bit state, 64-bit output)
+ *             - RXS M XS       (statistically most powerful generator)
+ *             - XSL RR RR      (good for 128-bit state, 128-bit output)
+ *             - and RXS, RXS M, XSH, XSL       (mostly for testing)
+ *      - at potentially *arbitrary* bit sizes
+ *      - with four different techniques for random streams (MCG, one-stream
+ *        LCG, settable-stream LCG, unique-stream LCG)
+ *      - and the extended generation schemes allowing arbitrary periods
+ *      - with all features of C++11 random number generation (and more),
+ *        some of which are somewhat painful, including
+ *            - initializing with a SeedSequence which writes 32-bit values
+ *              to memory, even though the state of the generator may not
+ *              use 32-bit values (it might use smaller or larger integers)
+ *            - I/O for RNGs and a prescribed format, which needs to handle
+ *              the issue that 8-bit and 128-bit integers don't have working
+ *              I/O routines (e.g., normally 8-bit = char, not integer)
+ *            - equality and inequality for RNGs
+ *      - and a number of convenience typedefs to mask all the complexity
+ *
+ * The code employes a fairly heavy level of abstraction, and has to deal
+ * with various C++ minutia.  If you're looking to learn about how the PCG
+ * scheme works, you're probably best of starting with one of the other
+ * codebases (see www.pcg-random.org).  But if you're curious about the
+ * constants for the various output functions used in those other, simpler,
+ * codebases, this code shows how they are calculated.
+ *
+ * On the positive side, at least there are convenience typedefs so that you
+ * can say
+ *
+ *      pcg32 myRNG;
+ *
+ * rather than:
+ *
+ *      pcg_detail::engine<
+ *          uint32_t,                                           // Output Type
+ *          uint64_t,                                           // State Type
+ *          pcg_detail::xsh_rr_mixin<uint32_t, uint64_t>, true, // Output Func
+ *          pcg_detail::specific_stream<uint64_t>,              // Stream Kind
+ *          pcg_detail::default_multiplier<uint64_t>            // LCG Mult
+ *      > myRNG;
+ *
+ */
+
+#ifndef PCG_RAND_HPP_INCLUDED
+#define PCG_RAND_HPP_INCLUDED 1
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <limits>
+#include <iostream>
+#include <iterator>
+#include <type_traits>
+#include <utility>
+#include <locale>
+#include <new>
+#include <stdexcept>
+
+#ifdef _MSC_VER
+    #pragma warning(disable:4146)
+#endif
+
+#ifdef _MSC_VER
+    #define PCG_ALWAYS_INLINE __forceinline
+#elif __GNUC__
+    #define PCG_ALWAYS_INLINE __attribute__((always_inline))
+#else
+    #define PCG_ALWAYS_INLINE inline
+#endif
+
+/*
+ * The pcg_extras namespace contains some support code that is likley to
+ * be useful for a variety of RNGs, including:
+ *      - 128-bit int support for platforms where it isn't available natively
+ *      - bit twiddling operations
+ *      - I/O of 128-bit and 8-bit integers
+ *      - Handling the evilness of SeedSeq
+ *      - Support for efficiently producing random numbers less than a given
+ *        bound
+ */
+
+#include "pcg_extras.hpp"
+
+namespace arrow_vendored {
+namespace pcg_detail {
+
+using namespace pcg_extras;
+
+/*
+ * The LCG generators need some constants to function.  This code lets you
+ * look up the constant by *type*.  For example
+ *
+ *      default_multiplier<uint32_t>::multiplier()
+ *
+ * gives you the default multipler for 32-bit integers.  We use the name
+ * of the constant and not a generic word like value to allow these classes
+ * to be used as mixins.
+ */
+
+template <typename T>
+struct default_multiplier {
+    // Not defined for an arbitrary type
+};
+
+template <typename T>
+struct default_increment {
+    // Not defined for an arbitrary type
+};
+
+#define PCG_DEFINE_CONSTANT(type, what, kind, constant) \
+        template <>                                     \
+        struct what ## _ ## kind<type> {                \
+            static constexpr type kind() {              \
+                return constant;                        \
+            }                                           \
+        };
+
+PCG_DEFINE_CONSTANT(uint8_t,  default, multiplier, 141U)
+PCG_DEFINE_CONSTANT(uint8_t,  default, increment,  77U)
+
+PCG_DEFINE_CONSTANT(uint16_t, default, multiplier, 12829U)
+PCG_DEFINE_CONSTANT(uint16_t, default, increment,  47989U)
+
+PCG_DEFINE_CONSTANT(uint32_t, default, multiplier, 747796405U)
+PCG_DEFINE_CONSTANT(uint32_t, default, increment,  2891336453U)
+
+PCG_DEFINE_CONSTANT(uint64_t, default, multiplier, 6364136223846793005ULL)
+PCG_DEFINE_CONSTANT(uint64_t, default, increment,  1442695040888963407ULL)
+
+PCG_DEFINE_CONSTANT(pcg128_t, default, multiplier,
+        PCG_128BIT_CONSTANT(2549297995355413924ULL,4865540595714422341ULL))
+PCG_DEFINE_CONSTANT(pcg128_t, default, increment,
+        PCG_128BIT_CONSTANT(6364136223846793005ULL,1442695040888963407ULL))
+
+/* Alternative (cheaper) multipliers for 128-bit */
+
+template <typename T>
+struct cheap_multiplier : public default_multiplier<T> {
+    // For most types just use the default.
+};
+
+template <>
+struct cheap_multiplier<pcg128_t> {
+    static constexpr uint64_t multiplier() {
+        return 0xda942042e4dd58b5ULL;
+    }
+};
+
+
+/*
+ * Each PCG generator is available in four variants, based on how it applies
+ * the additive constant for its underlying LCG; the variations are:
+ *
+ *     single stream   - all instances use the same fixed constant, thus
+ *                       the RNG always somewhere in same sequence
+ *     mcg             - adds zero, resulting in a single stream and reduced
+ *                       period
+ *     specific stream - the constant can be changed at any time, selecting
+ *                       a different random sequence
+ *     unique stream   - the constant is based on the memory address of the
+ *                       object, thus every RNG has its own unique sequence
+ *
+ * This variation is provided though mixin classes which define a function
+ * value called increment() that returns the nesessary additive constant.
+ */
+
+
+
+/*
+ * unique stream
+ */
+
+
+template <typename itype>
+class unique_stream {
+protected:
+    static constexpr bool is_mcg = false;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    constexpr itype increment() const {
+        return itype(reinterpret_cast<uintptr_t>(this) | 1);
+    }
+
+    constexpr itype stream() const
+    {
+         return increment() >> 1;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return (sizeof(itype) < sizeof(size_t) ? sizeof(itype)
+                                               : sizeof(size_t))*8 - 1u;
+    }
+
+protected:
+    constexpr unique_stream() = default;
+};
+
+
+/*
+ * no stream (mcg)
+ */
+
+template <typename itype>
+class no_stream {
+protected:
+    static constexpr bool is_mcg = true;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    static constexpr itype increment() {
+        return 0;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return 0u;
+    }
+
+protected:
+    constexpr no_stream() = default;
+};
+
+
+/*
+ * single stream/sequence (oneseq)
+ */
+
+template <typename itype>
+class oneseq_stream : public default_increment<itype> {
+protected:
+    static constexpr bool is_mcg = false;
+
+    // Is never called, but is provided for symmetry with specific_stream
+    void set_stream(...)
+    {
+        abort();
+    }
+
+public:
+    typedef itype state_type;
+
+    static constexpr itype stream()
+    {
+         return default_increment<itype>::increment() >> 1;
+    }
+
+    static constexpr bool can_specify_stream = false;
+
+    static constexpr size_t streams_pow2()
+    {
+        return 0u;
+    }
+
+protected:
+    constexpr oneseq_stream() = default;
+};
+
+
+/*
+ * specific stream
+ */
+
+template <typename itype>
+class specific_stream {
+protected:
+    static constexpr bool is_mcg = false;
+
+    itype inc_ = default_increment<itype>::increment();
+
+public:
+    typedef itype state_type;
+    typedef itype stream_state;
+
+    constexpr itype increment() const {
+        return inc_;
+    }
+
+    itype stream()
+    {
+         return inc_ >> 1;
+    }
+
+    void set_stream(itype specific_seq)
+    {
+         inc_ = (specific_seq << 1) | 1;
+    }
+
+    static constexpr bool can_specify_stream = true;
+
+    static constexpr size_t streams_pow2()
+    {
+        return (sizeof(itype)*8) - 1u;
+    }
+
+protected:
+    specific_stream() = default;
+
+    specific_stream(itype specific_seq)
+        : inc_(itype(specific_seq << 1) | itype(1U))
+    {
+        // Nothing (else) to do.
+    }
+};
+
+
+/*
+ * This is where it all comes together.  This function joins together three
+ * mixin classes which define
+ *    - the LCG additive constant (the stream)
+ *    - the LCG multiplier
+ *    - the output function
+ * in addition, we specify the type of the LCG state, and the result type,
+ * and whether to use the pre-advance version of the state for the output
+ * (increasing instruction-level parallelism) or the post-advance version
+ * (reducing register pressure).
+ *
+ * Given the high level of parameterization, the code has to use some
+ * template-metaprogramming tricks to handle some of the suble variations
+ * involved.
+ */
+
+template <typename xtype, typename itype,
+          typename output_mixin,
+          bool output_previous = true,
+          typename stream_mixin = oneseq_stream<itype>,
+          typename multiplier_mixin = default_multiplier<itype> >
+class engine : protected output_mixin,
+               public stream_mixin,
+               protected multiplier_mixin {
+protected:
+    itype state_;
+
+    struct can_specify_stream_tag {};
+    struct no_specifiable_stream_tag {};
+
+    using stream_mixin::increment;
+    using multiplier_mixin::multiplier;
+
+public:
+    typedef xtype result_type;
+    typedef itype state_type;
+
+    static constexpr size_t period_pow2()
+    {
+        return sizeof(state_type)*8 - 2*stream_mixin::is_mcg;
+    }
+
+    // It would be nice to use std::numeric_limits for these, but
+    // we can't be sure that it'd be defined for the 128-bit types.
+
+    static constexpr result_type min()
+    {
+        return result_type(0UL);
+    }
+
+    static constexpr result_type max()
+    {
+        return result_type(~result_type(0UL));
+    }
+
+protected:
+    itype bump(itype state)
+    {
+        return state * multiplier() + increment();
+    }
+
+    itype base_generate()
+    {
+        return state_ = bump(state_);
+    }
+
+    itype base_generate0()
+    {
+        itype old_state = state_;
+        state_ = bump(state_);
+        return old_state;
+    }
+
+public:
+    result_type operator()()
+    {
+        if (output_previous)
+            return this->output(base_generate0());
+        else
+            return this->output(base_generate());
+    }
+
+    result_type operator()(result_type upper_bound)
+    {
+        return bounded_rand(*this, upper_bound);
+    }
+
+protected:
+    static itype advance(itype state, itype delta,
+                         itype cur_mult, itype cur_plus);
+
+    static itype distance(itype cur_state, itype newstate, itype cur_mult,
+                          itype cur_plus, itype mask = ~itype(0U));
+
+    itype distance(itype newstate, itype mask = itype(~itype(0U))) const
+    {
+        return distance(state_, newstate, multiplier(), increment(), mask);
+    }
+
+public:
+    void advance(itype delta)
+    {
+        state_ = advance(state_, delta, this->multiplier(), this->increment());
+    }
+
+    void backstep(itype delta)
+    {
+        advance(-delta);
+    }
+
+    void discard(itype delta)
+    {
+        advance(delta);
+    }
+
+    bool wrapped()
+    {
+        if (stream_mixin::is_mcg) {
+            // For MCGs, the low order two bits never change. In this
+            // implementation, we keep them fixed at 3 to make this test
+            // easier.
+            return state_ == 3;
+        } else {
+            return state_ == 0;
+        }
+    }
+
+    engine(itype state = itype(0xcafef00dd15ea5e5ULL))
+        : state_(this->is_mcg ? state|state_type(3U)
+                              : bump(state + this->increment()))
+    {
+        // Nothing else to do.
+    }
+
+    // This function may or may not exist.  It thus has to be a template
+    // to use SFINAE; users don't have to worry about its template-ness.
+
+    template <typename sm = stream_mixin>
+    engine(itype state, typename sm::stream_state stream_seed)
+        : stream_mixin(stream_seed),
+          state_(this->is_mcg ? state|state_type(3U)
+                              : bump(state + this->increment()))
+    {
+        // Nothing else to do.
+    }
+
+    template<typename SeedSeq>
+    engine(SeedSeq&& seedSeq, typename std::enable_if<
+                  !stream_mixin::can_specify_stream
+               && !std::is_convertible<SeedSeq, itype>::value
+               && !std::is_convertible<SeedSeq, engine>::value,
+               no_specifiable_stream_tag>::type = {})
+        : engine(generate_one<itype>(std::forward<SeedSeq>(seedSeq)))
+    {
+        // Nothing else to do.
+    }
+
+    template<typename SeedSeq>
+    engine(SeedSeq&& seedSeq, typename std::enable_if<
+                   stream_mixin::can_specify_stream
+               && !std::is_convertible<SeedSeq, itype>::value
+               && !std::is_convertible<SeedSeq, engine>::value,
+        can_specify_stream_tag>::type = {})
+    {
+        itype seeddata[2];
+        generate_to<2>(std::forward<SeedSeq>(seedSeq), seeddata);
+        seed(seeddata[1], seeddata[0]);
+    }
+
+
+    template<typename... Args>
+    void seed(Args&&... args)
+    {
+        new (this) engine(std::forward<Args>(args)...);
+    }
+
+    template <typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+              typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+    friend bool operator==(const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_lhs, multiplier_mixin_lhs>&,
+                           const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_rhs, multiplier_mixin_rhs>&);
+
+    template <typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+              typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+    friend itype1 operator-(const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_lhs, multiplier_mixin_lhs>&,
+                            const engine<xtype1,itype1,
+                                     output_mixin1,output_previous1,
+                                     stream_mixin_rhs, multiplier_mixin_rhs>&);
+
+    template <typename CharT, typename Traits,
+              typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin1, typename multiplier_mixin1>
+    friend std::basic_ostream<CharT,Traits>&
+    operator<<(std::basic_ostream<CharT,Traits>& out,
+               const engine<xtype1,itype1,
+                              output_mixin1,output_previous1,
+                              stream_mixin1, multiplier_mixin1>&);
+
+    template <typename CharT, typename Traits,
+              typename xtype1, typename itype1,
+              typename output_mixin1, bool output_previous1,
+              typename stream_mixin1, typename multiplier_mixin1>
+    friend std::basic_istream<CharT,Traits>&
+    operator>>(std::basic_istream<CharT,Traits>& in,
+               engine<xtype1, itype1,
+                        output_mixin1, output_previous1,
+                        stream_mixin1, multiplier_mixin1>& rng);
+};
+
+template <typename CharT, typename Traits,
+          typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out,
+           const engine<xtype,itype,
+                          output_mixin,output_previous,
+                          stream_mixin, multiplier_mixin>& rng)
+{
+    using pcg_extras::operator<<;
+
+    auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left);
+    auto space = out.widen(' ');
+    auto orig_fill = out.fill();
+
+    out << rng.multiplier() << space
+        << rng.increment() << space
+        << rng.state_;
+
+    out.flags(orig_flags);
+    out.fill(orig_fill);
+    return out;
+}
+
+
+template <typename CharT, typename Traits,
+          typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in,
+           engine<xtype,itype,
+                    output_mixin,output_previous,
+                    stream_mixin, multiplier_mixin>& rng)
+{
+    using pcg_extras::operator>>;
+
+    auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws);
+
+    itype multiplier, increment, state;
+    in >> multiplier >> increment >> state;
+
+    if (!in.fail()) {
+        bool good = true;
+        if (multiplier != rng.multiplier()) {
+           good = false;
+        } else if (rng.can_specify_stream) {
+           rng.set_stream(increment >> 1);
+        } else if (increment != rng.increment()) {
+           good = false;
+        }
+        if (good) {
+            rng.state_ = state;
+        } else {
+            in.clear(std::ios::failbit);
+        }
+    }
+
+    in.flags(orig_flags);
+    return in;
+}
+
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+itype engine<xtype,itype,output_mixin,output_previous,stream_mixin,
+             multiplier_mixin>::advance(
+    itype state, itype delta, itype cur_mult, itype cur_plus)
+{
+    // The method used here is based on Brown, "Random Number Generation
+    // with Arbitrary Stride,", Transactions of the American Nuclear
+    // Society (Nov. 1994).  The algorithm is very similar to fast
+    // exponentiation.
+    //
+    // Even though delta is an unsigned integer, we can pass a
+    // signed integer to go backwards, it just goes "the long way round".
+
+    constexpr itype ZERO = 0u;  // itype may be a non-trivial types, so
+    constexpr itype ONE  = 1u;  // we define some ugly constants.
+    itype acc_mult = 1;
+    itype acc_plus = 0;
+    while (delta > ZERO) {
+       if (delta & ONE) {
+          acc_mult *= cur_mult;
+          acc_plus = acc_plus*cur_mult + cur_plus;
+       }
+       cur_plus = (cur_mult+ONE)*cur_plus;
+       cur_mult *= cur_mult;
+       delta >>= 1;
+    }
+    return acc_mult * state + acc_plus;
+}
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin, typename multiplier_mixin>
+itype engine<xtype,itype,output_mixin,output_previous,stream_mixin,
+               multiplier_mixin>::distance(
+    itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask)
+{
+    constexpr itype ONE  = 1u;  // itype could be weird, so use constant
+    bool is_mcg = cur_plus == itype(0);
+    itype the_bit = is_mcg ? itype(4u) : itype(1u);
+    itype distance = 0u;
+    while ((cur_state & mask) != (newstate & mask)) {
+       if ((cur_state & the_bit) != (newstate & the_bit)) {
+           cur_state = cur_state * cur_mult + cur_plus;
+           distance |= the_bit;
+       }
+       assert((cur_state & the_bit) == (newstate & the_bit));
+       the_bit <<= 1;
+       cur_plus = (cur_mult+ONE)*cur_plus;
+       cur_mult *= cur_mult;
+    }
+    return is_mcg ? distance >> 2 : distance;
+}
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+          typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+itype operator-(const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_lhs, multiplier_mixin_lhs>& lhs,
+               const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_rhs, multiplier_mixin_rhs>& rhs)
+{
+    static_assert(
+        std::is_same<stream_mixin_lhs, stream_mixin_rhs>::value &&
+            std::is_same<multiplier_mixin_lhs, multiplier_mixin_rhs>::value,
+        "Incomparable generators");
+    if (lhs.increment() == rhs.increment()) {
+       return rhs.distance(lhs.state_);
+    } else  {
+       constexpr itype ONE = 1u;
+       itype lhs_diff = lhs.increment() + (lhs.multiplier()-ONE) * lhs.state_;
+       itype rhs_diff = rhs.increment() + (rhs.multiplier()-ONE) * rhs.state_;
+       if ((lhs_diff & itype(3u)) != (rhs_diff & itype(3u))) {
+           rhs_diff = -rhs_diff;
+       }
+       return rhs.distance(rhs_diff, lhs_diff, rhs.multiplier(), itype(0u));
+    }
+}
+
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+          typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+bool operator==(const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_lhs, multiplier_mixin_lhs>& lhs,
+                const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_rhs, multiplier_mixin_rhs>& rhs)
+{
+    return    (lhs.multiplier() == rhs.multiplier())
+           && (lhs.increment()  == rhs.increment())
+           && (lhs.state_       == rhs.state_);
+}
+
+template <typename xtype, typename itype,
+          typename output_mixin, bool output_previous,
+          typename stream_mixin_lhs, typename multiplier_mixin_lhs,
+          typename stream_mixin_rhs, typename multiplier_mixin_rhs>
+inline bool operator!=(const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_lhs, multiplier_mixin_lhs>& lhs,
+                       const engine<xtype,itype,
+                               output_mixin,output_previous,
+                               stream_mixin_rhs, multiplier_mixin_rhs>& rhs)
+{
+    return !operator==(lhs,rhs);
+}
+
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8),
+         template<typename IT> class multiplier_mixin = default_multiplier>
+using oneseq_base  = engine<xtype, itype,
+                        output_mixin<xtype, itype>, output_previous,
+                        oneseq_stream<itype>,
+                        multiplier_mixin<itype> >;
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8),
+         template<typename IT> class multiplier_mixin = default_multiplier>
+using unique_base = engine<xtype, itype,
+                         output_mixin<xtype, itype>, output_previous,
+                         unique_stream<itype>,
+                         multiplier_mixin<itype> >;
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8),
+         template<typename IT> class multiplier_mixin = default_multiplier>
+using setseq_base = engine<xtype, itype,
+                         output_mixin<xtype, itype>, output_previous,
+                         specific_stream<itype>,
+                         multiplier_mixin<itype> >;
+
+template <typename xtype, typename itype,
+         template<typename XT,typename IT> class output_mixin,
+         bool output_previous = (sizeof(itype) <= 8),
+         template<typename IT> class multiplier_mixin = default_multiplier>
+using mcg_base = engine<xtype, itype,
+                      output_mixin<xtype, itype>, output_previous,
+                      no_stream<itype>,
+                      multiplier_mixin<itype> >;
+
+/*
+ * OUTPUT FUNCTIONS.
+ *
+ * These are the core of the PCG generation scheme.  They specify how to
+ * turn the base LCG's internal state into the output value of the final
+ * generator.
+ *
+ * They're implemented as mixin classes.
+ *
+ * All of the classes have code that is written to allow it to be applied
+ * at *arbitrary* bit sizes, although in practice they'll only be used at
+ * standard sizes supported by C++.
+ */
+
+/*
+ * XSH RS -- high xorshift, followed by a random shift
+ *
+ * Fast.  A good performer.
+ */
+
+template <typename xtype, typename itype>
+struct xsh_rs_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t bits        = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t xtypebits   = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t sparebits   = bits - xtypebits;
+        constexpr bitcount_t opbits =
+                              sparebits-5 >= 64 ? 5
+                            : sparebits-4 >= 32 ? 4
+                            : sparebits-3 >= 16 ? 3
+                            : sparebits-2 >= 4  ? 2
+                            : sparebits-1 >= 1  ? 1
+                            :                     0;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t maxrandshift  = mask;
+        constexpr bitcount_t topspare     = opbits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift     = topspare + (xtypebits+maxrandshift)/2;
+        bitcount_t rshift =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        internal ^= internal >> xshift;
+        xtype result = xtype(internal >> (bottomspare - maxrandshift + rshift));
+        return result;
+    }
+};
+
+/*
+ * XSH RR -- high xorshift, followed by a random rotate
+ *
+ * Fast.  A good performer.  Slightly better statistically than XSH RS.
+ */
+
+template <typename xtype, typename itype>
+struct xsh_rr_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t bits        = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t xtypebits   = bitcount_t(sizeof(xtype)*8);
+        constexpr bitcount_t sparebits   = bits - xtypebits;
+        constexpr bitcount_t wantedopbits =
+                              xtypebits >= 128 ? 7
+                            : xtypebits >=  64 ? 6
+                            : xtypebits >=  32 ? 5
+                            : xtypebits >=  16 ? 4
+                            :                    3;
+        constexpr bitcount_t opbits =
+                              sparebits >= wantedopbits ? wantedopbits
+                                                        : sparebits;
+        constexpr bitcount_t amplifier = wantedopbits - opbits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t topspare    = opbits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift      = (topspare + xtypebits)/2;
+        bitcount_t rot = opbits ? bitcount_t(internal >> (bits - opbits)) & mask
+                                : 0;
+        bitcount_t amprot = (rot << amplifier) & mask;
+        internal ^= internal >> xshift;
+        xtype result = xtype(internal >> bottomspare);
+        result = rotr(result, amprot);
+        return result;
+    }
+};
+
+/*
+ * RXS -- random xorshift
+ */
+
+template <typename xtype, typename itype>
+struct rxs_mixin {
+static xtype output_rxs(itype internal)
+    {
+        constexpr bitcount_t bits        = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t xtypebits   = bitcount_t(sizeof(xtype)*8);
+        constexpr bitcount_t shift       = bits - xtypebits;
+        constexpr bitcount_t extrashift  = (xtypebits - shift)/2;
+        bitcount_t rshift = shift > 64+8 ? (internal >> (bits - 6)) & 63
+                       : shift > 32+4 ? (internal >> (bits - 5)) & 31
+                       : shift > 16+2 ? (internal >> (bits - 4)) & 15
+                       : shift >  8+1 ? (internal >> (bits - 3)) & 7
+                       : shift >  4+1 ? (internal >> (bits - 2)) & 3
+                       : shift >  2+1 ? (internal >> (bits - 1)) & 1
+                       :              0;
+        internal ^= internal >> (shift + extrashift - rshift);
+        xtype result = internal >> rshift;
+        return result;
+    }
+};
+
+/*
+ * RXS M XS -- random xorshift, mcg multiply, fixed xorshift
+ *
+ * The most statistically powerful generator, but all those steps
+ * make it slower than some of the others.  We give it the rottenest jobs.
+ *
+ * Because it's usually used in contexts where the state type and the
+ * result type are the same, it is a permutation and is thus invertable.
+ * We thus provide a function to invert it.  This function is used to
+ * for the "inside out" generator used by the extended generator.
+ */
+
+/* Defined type-based concepts for the multiplication step.  They're actually
+ * all derived by truncating the 128-bit, which was computed to be a good
+ * "universal" constant.
+ */
+
+template <typename T>
+struct mcg_multiplier {
+    // Not defined for an arbitrary type
+};
+
+template <typename T>
+struct mcg_unmultiplier {
+    // Not defined for an arbitrary type
+};
+
+PCG_DEFINE_CONSTANT(uint8_t,  mcg, multiplier,   217U)
+PCG_DEFINE_CONSTANT(uint8_t,  mcg, unmultiplier, 105U)
+
+PCG_DEFINE_CONSTANT(uint16_t, mcg, multiplier,   62169U)
+PCG_DEFINE_CONSTANT(uint16_t, mcg, unmultiplier, 28009U)
+
+PCG_DEFINE_CONSTANT(uint32_t, mcg, multiplier,   277803737U)
+PCG_DEFINE_CONSTANT(uint32_t, mcg, unmultiplier, 2897767785U)
+
+PCG_DEFINE_CONSTANT(uint64_t, mcg, multiplier,   12605985483714917081ULL)
+PCG_DEFINE_CONSTANT(uint64_t, mcg, unmultiplier, 15009553638781119849ULL)
+
+PCG_DEFINE_CONSTANT(pcg128_t, mcg, multiplier,
+        PCG_128BIT_CONSTANT(17766728186571221404ULL, 12605985483714917081ULL))
+PCG_DEFINE_CONSTANT(pcg128_t, mcg, unmultiplier,
+        PCG_128BIT_CONSTANT(14422606686972528997ULL, 15009553638781119849ULL))
+
+
+template <typename xtype, typename itype>
+struct rxs_m_xs_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t opbits = xtypebits >= 128 ? 6
+                                 : xtypebits >=  64 ? 5
+                                 : xtypebits >=  32 ? 4
+                                 : xtypebits >=  16 ? 3
+                                 :                    2;
+        constexpr bitcount_t shift = bits - xtypebits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        bitcount_t rshift =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        internal ^= internal >> (opbits + rshift);
+        internal *= mcg_multiplier<itype>::multiplier();
+        xtype result = internal >> shift;
+        result ^= result >> ((2U*xtypebits+2U)/3U);
+        return result;
+    }
+
+    static itype unoutput(itype internal)
+    {
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t opbits = bits >= 128 ? 6
+                                 : bits >=  64 ? 5
+                                 : bits >=  32 ? 4
+                                 : bits >=  16 ? 3
+                                 :               2;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+
+        internal = unxorshift(internal, bits, (2U*bits+2U)/3U);
+
+        internal *= mcg_unmultiplier<itype>::unmultiplier();
+
+        bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0;
+        internal = unxorshift(internal, bits, opbits + rshift);
+
+        return internal;
+    }
+};
+
+
+/*
+ * RXS M -- random xorshift, mcg multiply
+ */
+
+template <typename xtype, typename itype>
+struct rxs_m_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t opbits = xtypebits >= 128 ? 6
+                                 : xtypebits >=  64 ? 5
+                                 : xtypebits >=  32 ? 4
+                                 : xtypebits >=  16 ? 3
+                                 :                    2;
+        constexpr bitcount_t shift = bits - xtypebits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0;
+        internal ^= internal >> (opbits + rshift);
+        internal *= mcg_multiplier<itype>::multiplier();
+        xtype result = internal >> shift;
+        return result;
+    }
+};
+
+
+/*
+ * DXSM -- double xorshift multiply
+ *
+ * This is a new, more powerful output permutation (added in 2019).  It's
+ * a more comprehensive scrambling than RXS M, but runs faster on 128-bit
+ * types.  Although primarily intended for use at large sizes, also works
+ * at smaller sizes as well.
+ *
+ * This permutation is similar to xorshift multiply hash functions, except
+ * that one of the multipliers is the LCG multiplier (to avoid needing to
+ * have a second constant) and the other is based on the low-order bits.
+ * This latter aspect means that the scrambling applied to the high bits
+ * depends on the low bits, and makes it (to my eye) impractical to back
+ * out the permutation without having the low-order bits.
+ */
+
+template <typename xtype, typename itype>
+struct dxsm_mixin {
+    inline xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t itypebits = bitcount_t(sizeof(itype) * 8);
+        static_assert(xtypebits <= itypebits/2,
+                      "Output type must be half the size of the state type.");
+        
+        xtype hi = xtype(internal >> (itypebits - xtypebits));
+        xtype lo = xtype(internal);
+
+        lo |= 1;
+        hi ^= hi >> (xtypebits/2);
+	hi *= xtype(cheap_multiplier<itype>::multiplier());
+	hi ^= hi >> (3*(xtypebits/4));
+	hi *= lo;
+	return hi;
+    }
+};
+
+
+/*
+ * XSL RR -- fixed xorshift (to low bits), random rotate
+ *
+ * Useful for 128-bit types that are split across two CPU registers.
+ */
+
+template <typename xtype, typename itype>
+struct xsl_rr_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - xtypebits;
+        constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7
+                                       : xtypebits >=  64 ? 6
+                                       : xtypebits >=  32 ? 5
+                                       : xtypebits >=  16 ? 4
+                                       :                    3;
+        constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits
+                                                             : sparebits;
+        constexpr bitcount_t amplifier = wantedopbits - opbits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t topspare = sparebits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift = (topspare + xtypebits) / 2;
+
+        bitcount_t rot =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        bitcount_t amprot = (rot << amplifier) & mask;
+        internal ^= internal >> xshift;
+        xtype result = xtype(internal >> bottomspare);
+        result = rotr(result, amprot);
+        return result;
+    }
+};
+
+
+/*
+ * XSL RR RR -- fixed xorshift (to low bits), random rotate (both parts)
+ *
+ * Useful for 128-bit types that are split across two CPU registers.
+ * If you really want an invertable 128-bit RNG, I guess this is the one.
+ */
+
+template <typename T> struct halfsize_trait {};
+template <> struct halfsize_trait<pcg128_t>  { typedef uint64_t type; };
+template <> struct halfsize_trait<uint64_t>  { typedef uint32_t type; };
+template <> struct halfsize_trait<uint32_t>  { typedef uint16_t type; };
+template <> struct halfsize_trait<uint16_t>  { typedef uint8_t type;  };
+
+template <typename xtype, typename itype>
+struct xsl_rr_rr_mixin {
+    typedef typename halfsize_trait<itype>::type htype;
+
+    static itype output(itype internal)
+    {
+        constexpr bitcount_t htypebits = bitcount_t(sizeof(htype) * 8);
+        constexpr bitcount_t bits      = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - htypebits;
+        constexpr bitcount_t wantedopbits = htypebits >= 128 ? 7
+                                       : htypebits >=  64 ? 6
+                                       : htypebits >=  32 ? 5
+                                       : htypebits >=  16 ? 4
+                                       :                    3;
+        constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits
+                                                                : sparebits;
+        constexpr bitcount_t amplifier = wantedopbits - opbits;
+        constexpr bitcount_t mask = (1 << opbits) - 1;
+        constexpr bitcount_t topspare = sparebits;
+        constexpr bitcount_t xshift = (topspare + htypebits) / 2;
+
+        bitcount_t rot =
+            opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0;
+        bitcount_t amprot = (rot << amplifier) & mask;
+        internal ^= internal >> xshift;
+        htype lowbits = htype(internal);
+        lowbits = rotr(lowbits, amprot);
+        htype highbits = htype(internal >> topspare);
+        bitcount_t rot2 = lowbits & mask;
+        bitcount_t amprot2 = (rot2 << amplifier) & mask;
+        highbits = rotr(highbits, amprot2);
+        return (itype(highbits) << topspare) ^ itype(lowbits);
+    }
+};
+
+
+/*
+ * XSH -- fixed xorshift (to high bits)
+ *
+ * You shouldn't use this at 64-bits or less.
+ */
+
+template <typename xtype, typename itype>
+struct xsh_mixin {
+    static xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - xtypebits;
+        constexpr bitcount_t topspare = 0;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift = (topspare + xtypebits) / 2;
+
+        internal ^= internal >> xshift;
+        xtype result = internal >> bottomspare;
+        return result;
+    }
+};
+
+/*
+ * XSL -- fixed xorshift (to low bits)
+ *
+ * You shouldn't use this at 64-bits or less.
+ */
+
+template <typename xtype, typename itype>
+struct xsl_mixin {
+    inline xtype output(itype internal)
+    {
+        constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8);
+        constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8);
+        constexpr bitcount_t sparebits = bits - xtypebits;
+        constexpr bitcount_t topspare = sparebits;
+        constexpr bitcount_t bottomspare = sparebits - topspare;
+        constexpr bitcount_t xshift = (topspare + xtypebits) / 2;
+
+        internal ^= internal >> xshift;
+        xtype result = internal >> bottomspare;
+        return result;
+    }
+};
+
+
+/* ---- End of Output Functions ---- */
+
+
+template <typename baseclass>
+struct inside_out : private baseclass {
+    inside_out() = delete;
+
+    typedef typename baseclass::result_type result_type;
+    typedef typename baseclass::state_type  state_type;
+    static_assert(sizeof(result_type) == sizeof(state_type),
+                  "Require a RNG whose output function is a permutation");
+
+    static bool external_step(result_type& randval, size_t i)
+    {
+        state_type state = baseclass::unoutput(randval);
+        state = state * baseclass::multiplier() + baseclass::increment()
+                + state_type(i*2);
+        result_type result = baseclass::output(state);
+        randval = result;
+        state_type zero =
+            baseclass::is_mcg ? state & state_type(3U) : state_type(0U);
+        return result == zero;
+    }
+
+    static bool external_advance(result_type& randval, size_t i,
+                                 result_type delta, bool forwards = true)
+    {
+        state_type state = baseclass::unoutput(randval);
+        state_type mult  = baseclass::multiplier();
+        state_type inc   = baseclass::increment() + state_type(i*2);
+        state_type zero =
+            baseclass::is_mcg ? state & state_type(3U) : state_type(0U);
+        state_type dist_to_zero = baseclass::distance(state, zero, mult, inc);
+        bool crosses_zero =
+            forwards ? dist_to_zero <= delta
+                     : (-dist_to_zero) <= delta;
+        if (!forwards)
+            delta = -delta;
+        state = baseclass::advance(state, delta, mult, inc);
+        randval = baseclass::output(state);
+        return crosses_zero;
+    }
+};
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, typename baseclass, typename extvalclass, bool kdd = true>
+class extended : public baseclass {
+public:
+    typedef typename baseclass::state_type  state_type;
+    typedef typename baseclass::result_type result_type;
+    typedef inside_out<extvalclass> insideout;
+
+private:
+    static constexpr bitcount_t rtypebits = sizeof(result_type)*8;
+    static constexpr bitcount_t stypebits = sizeof(state_type)*8;
+
+    static constexpr bitcount_t tick_limit_pow2 = 64U;
+
+    static constexpr size_t table_size  = 1UL << table_pow2;
+    static constexpr size_t table_shift = stypebits - table_pow2;
+    static constexpr state_type table_mask =
+        (state_type(1U) << table_pow2) - state_type(1U);
+
+    static constexpr bool   may_tick  =
+        (advance_pow2 < stypebits) && (advance_pow2 < tick_limit_pow2);
+    static constexpr size_t tick_shift = stypebits - advance_pow2;
+    static constexpr state_type tick_mask  =
+        may_tick ? state_type(
+                       (uint64_t(1) << (advance_pow2*may_tick)) - 1)
+                                        // ^-- stupidity to appease GCC warnings
+                 : ~state_type(0U);
+
+    static constexpr bool may_tock = stypebits < tick_limit_pow2;
+
+    result_type data_[table_size];
+
+    PCG_NOINLINE void advance_table();
+
+    PCG_NOINLINE void advance_table(state_type delta, bool isForwards = true);
+
+    result_type& get_extended_value()
+    {
+        state_type state = this->state_;
+        if (kdd && baseclass::is_mcg) {
+            // The low order bits of an MCG are constant, so drop them.
+            state >>= 2;
+        }
+        size_t index       = kdd ? state &  table_mask
+                                 : state >> table_shift;
+
+        if (may_tick) {
+            bool tick = kdd ? (state & tick_mask) == state_type(0u)
+                            : (state >> tick_shift) == state_type(0u);
+            if (tick)
+                    advance_table();
+        }
+        if (may_tock) {
+            bool tock = state == state_type(0u);
+            if (tock)
+                advance_table();
+        }
+        return data_[index];
+    }
+
+public:
+    static constexpr size_t period_pow2()
+    {
+        return baseclass::period_pow2() + table_size*extvalclass::period_pow2();
+    }
+
+    PCG_ALWAYS_INLINE result_type operator()()
+    {
+        result_type rhs = get_extended_value();
+        result_type lhs = this->baseclass::operator()();
+        return lhs ^ rhs;
+    }
+
+    result_type operator()(result_type upper_bound)
+    {
+        return bounded_rand(*this, upper_bound);
+    }
+
+    void set(result_type wanted)
+    {
+        result_type& rhs = get_extended_value();
+        result_type lhs = this->baseclass::operator()();
+        rhs = lhs ^ wanted;
+    }
+
+    void advance(state_type distance, bool forwards = true);
+
+    void backstep(state_type distance)
+    {
+        advance(distance, false);
+    }
+
+    extended(const result_type* data)
+        : baseclass()
+    {
+        datainit(data);
+    }
+
+    extended(const result_type* data, state_type seed)
+        : baseclass(seed)
+    {
+        datainit(data);
+    }
+
+    // This function may or may not exist.  It thus has to be a template
+    // to use SFINAE; users don't have to worry about its template-ness.
+
+    template <typename bc = baseclass>
+    extended(const result_type* data, state_type seed,
+            typename bc::stream_state stream_seed)
+        : baseclass(seed, stream_seed)
+    {
+        datainit(data);
+    }
+
+    extended()
+        : baseclass()
+    {
+        selfinit();
+    }
+
+    extended(state_type seed)
+        : baseclass(seed)
+    {
+        selfinit();
+    }
+
+    // This function may or may not exist.  It thus has to be a template
+    // to use SFINAE; users don't have to worry about its template-ness.
+
+    template <typename bc = baseclass>
+    extended(state_type seed, typename bc::stream_state stream_seed)
+        : baseclass(seed, stream_seed)
+    {
+        selfinit();
+    }
+
+private:
+    void selfinit();
+    void datainit(const result_type* data);
+
+public:
+
+    template<typename SeedSeq, typename = typename std::enable_if<
+           !std::is_convertible<SeedSeq, result_type>::value
+        && !std::is_convertible<SeedSeq, extended>::value>::type>
+    extended(SeedSeq&& seedSeq)
+        : baseclass(seedSeq)
+    {
+        generate_to<table_size>(seedSeq, data_);
+    }
+
+    template<typename... Args>
+    void seed(Args&&... args)
+    {
+        new (this) extended(std::forward<Args>(args)...);
+    }
+
+    template <bitcount_t table_pow2_, bitcount_t advance_pow2_,
+              typename baseclass_, typename extvalclass_, bool kdd_>
+    friend bool operator==(const extended<table_pow2_, advance_pow2_,
+                                              baseclass_, extvalclass_, kdd_>&,
+                           const extended<table_pow2_, advance_pow2_,
+                                              baseclass_, extvalclass_, kdd_>&);
+
+    template <typename CharT, typename Traits,
+              bitcount_t table_pow2_, bitcount_t advance_pow2_,
+              typename baseclass_, typename extvalclass_, bool kdd_>
+    friend std::basic_ostream<CharT,Traits>&
+    operator<<(std::basic_ostream<CharT,Traits>& out,
+               const extended<table_pow2_, advance_pow2_,
+                              baseclass_, extvalclass_, kdd_>&);
+
+    template <typename CharT, typename Traits,
+              bitcount_t table_pow2_, bitcount_t advance_pow2_,
+              typename baseclass_, typename extvalclass_, bool kdd_>
+    friend std::basic_istream<CharT,Traits>&
+    operator>>(std::basic_istream<CharT,Traits>& in,
+               extended<table_pow2_, advance_pow2_,
+                        baseclass_, extvalclass_, kdd_>&);
+
+};
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::datainit(
+         const result_type* data)
+{
+    for (size_t i = 0; i < table_size; ++i)
+        data_[i] = data[i];
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::selfinit()
+{
+    // We need to fill the extended table with something, and we have
+    // very little provided data, so we use the base generator to
+    // produce values.  Although not ideal (use a seed sequence, folks!),
+    // unexpected correlations are mitigated by
+    //      - using XOR differences rather than the number directly
+    //      - the way the table is accessed, its values *won't* be accessed
+    //        in the same order the were written.
+    //      - any strange correlations would only be apparent if we
+    //        were to backstep the generator so that the base generator
+    //        was generating the same values again
+    result_type lhs = baseclass::operator()();
+    result_type rhs = baseclass::operator()();
+    result_type xdiff = lhs - rhs;
+    for (size_t i = 0; i < table_size; ++i) {
+        data_[i] = baseclass::operator()() ^ xdiff;
+    }
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+bool operator==(const extended<table_pow2, advance_pow2,
+                               baseclass, extvalclass, kdd>& lhs,
+                const extended<table_pow2, advance_pow2,
+                               baseclass, extvalclass, kdd>& rhs)
+{
+    auto& base_lhs = static_cast<const baseclass&>(lhs);
+    auto& base_rhs = static_cast<const baseclass&>(rhs);
+    return base_lhs == base_rhs
+        && std::equal(
+               std::begin(lhs.data_), std::end(lhs.data_),
+               std::begin(rhs.data_)
+           );
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+inline bool operator!=(const extended<table_pow2, advance_pow2,
+                                      baseclass, extvalclass, kdd>& lhs,
+                       const extended<table_pow2, advance_pow2,
+                                      baseclass, extvalclass, kdd>& rhs)
+{
+    return !operator==(lhs, rhs);
+}
+
+template <typename CharT, typename Traits,
+          bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+std::basic_ostream<CharT,Traits>&
+operator<<(std::basic_ostream<CharT,Traits>& out,
+           const extended<table_pow2, advance_pow2,
+                          baseclass, extvalclass, kdd>& rng)
+{
+    using pcg_extras::operator<<;
+
+    auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left);
+    auto space = out.widen(' ');
+    auto orig_fill = out.fill();
+
+    out << rng.multiplier() << space
+        << rng.increment() << space
+        << rng.state_;
+
+    for (const auto& datum : rng.data_)
+        out << space << datum;
+
+    out.flags(orig_flags);
+    out.fill(orig_fill);
+    return out;
+}
+
+template <typename CharT, typename Traits,
+          bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+std::basic_istream<CharT,Traits>&
+operator>>(std::basic_istream<CharT,Traits>& in,
+           extended<table_pow2, advance_pow2,
+                    baseclass, extvalclass, kdd>& rng)
+{
+    extended<table_pow2, advance_pow2, baseclass, extvalclass> new_rng;
+    auto& base_rng = static_cast<baseclass&>(new_rng);
+    in >> base_rng;
+
+    if (in.fail())
+        return in;
+
+    using pcg_extras::operator>>;
+
+    auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws);
+
+    for (auto& datum : new_rng.data_) {
+        in >> datum;
+        if (in.fail())
+            goto bail;
+    }
+
+    rng = new_rng;
+
+bail:
+    in.flags(orig_flags);
+    return in;
+}
+
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void
+extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::advance_table()
+{
+    bool carry = false;
+    for (size_t i = 0; i < table_size; ++i) {
+        if (carry) {
+            carry = insideout::external_step(data_[i],i+1);
+        }
+        bool carry2 = insideout::external_step(data_[i],i+1);
+        carry = carry || carry2;
+    }
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void
+extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::advance_table(
+        state_type delta, bool isForwards)
+{
+    typedef typename baseclass::state_type   base_state_t;
+    typedef typename extvalclass::state_type ext_state_t;
+    constexpr bitcount_t basebits = sizeof(base_state_t)*8;
+    constexpr bitcount_t extbits  = sizeof(ext_state_t)*8;
+    static_assert(basebits <= extbits || advance_pow2 > 0,
+                  "Current implementation might overflow its carry");
+
+    base_state_t carry = 0;
+    for (size_t i = 0; i < table_size; ++i) {
+        base_state_t total_delta = carry + delta;
+        ext_state_t  trunc_delta = ext_state_t(total_delta);
+        if (basebits > extbits) {
+            carry = total_delta >> extbits;
+        } else {
+            carry = 0;
+        }
+        carry +=
+            insideout::external_advance(data_[i],i+1, trunc_delta, isForwards);
+    }
+}
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename baseclass, typename extvalclass, bool kdd>
+void extended<table_pow2,advance_pow2,baseclass,extvalclass,kdd>::advance(
+    state_type distance, bool forwards)
+{
+    static_assert(kdd,
+        "Efficient advance is too hard for non-kdd extension. "
+        "For a weak advance, cast to base class");
+    state_type zero =
+        baseclass::is_mcg ? this->state_ & state_type(3U) : state_type(0U);
+    if (may_tick) {
+        state_type ticks = distance >> (advance_pow2*may_tick);
+                                        // ^-- stupidity to appease GCC
+                                        // warnings
+        state_type adv_mask =
+            baseclass::is_mcg ? tick_mask << 2 : tick_mask;
+        state_type next_advance_distance = this->distance(zero, adv_mask);
+        if (!forwards)
+            next_advance_distance = (-next_advance_distance) & tick_mask;
+        if (next_advance_distance < (distance & tick_mask)) {
+            ++ticks;
+        }
+        if (ticks)
+            advance_table(ticks, forwards);
+    }
+    if (forwards) {
+        if (may_tock && this->distance(zero) <= distance)
+            advance_table();
+        baseclass::advance(distance);
+    } else {
+        if (may_tock && -(this->distance(zero)) <= distance)
+            advance_table(state_type(1U), false);
+        baseclass::advance(-distance);
+    }
+}
+
+} // namespace pcg_detail
+
+namespace pcg_engines {
+
+using namespace pcg_detail;
+
+/* Predefined types for XSH RS */
+
+typedef oneseq_base<uint8_t,  uint16_t, xsh_rs_mixin>  oneseq_xsh_rs_16_8;
+typedef oneseq_base<uint16_t, uint32_t, xsh_rs_mixin>  oneseq_xsh_rs_32_16;
+typedef oneseq_base<uint32_t, uint64_t, xsh_rs_mixin>  oneseq_xsh_rs_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, xsh_rs_mixin>  oneseq_xsh_rs_128_64;
+typedef oneseq_base<uint64_t, pcg128_t, xsh_rs_mixin, true, cheap_multiplier>
+                                                       cm_oneseq_xsh_rs_128_64;
+
+typedef unique_base<uint8_t,  uint16_t, xsh_rs_mixin>  unique_xsh_rs_16_8;
+typedef unique_base<uint16_t, uint32_t, xsh_rs_mixin>  unique_xsh_rs_32_16;
+typedef unique_base<uint32_t, uint64_t, xsh_rs_mixin>  unique_xsh_rs_64_32;
+typedef unique_base<uint64_t, pcg128_t, xsh_rs_mixin>  unique_xsh_rs_128_64;
+typedef unique_base<uint64_t, pcg128_t, xsh_rs_mixin, true, cheap_multiplier>
+                                                       cm_unique_xsh_rs_128_64;
+
+typedef setseq_base<uint8_t,  uint16_t, xsh_rs_mixin>  setseq_xsh_rs_16_8;
+typedef setseq_base<uint16_t, uint32_t, xsh_rs_mixin>  setseq_xsh_rs_32_16;
+typedef setseq_base<uint32_t, uint64_t, xsh_rs_mixin>  setseq_xsh_rs_64_32;
+typedef setseq_base<uint64_t, pcg128_t, xsh_rs_mixin>  setseq_xsh_rs_128_64;
+typedef setseq_base<uint64_t, pcg128_t, xsh_rs_mixin, true, cheap_multiplier>
+                                                       cm_setseq_xsh_rs_128_64;
+
+typedef mcg_base<uint8_t,  uint16_t, xsh_rs_mixin>  mcg_xsh_rs_16_8;
+typedef mcg_base<uint16_t, uint32_t, xsh_rs_mixin>  mcg_xsh_rs_32_16;
+typedef mcg_base<uint32_t, uint64_t, xsh_rs_mixin>  mcg_xsh_rs_64_32;
+typedef mcg_base<uint64_t, pcg128_t, xsh_rs_mixin>  mcg_xsh_rs_128_64;
+typedef mcg_base<uint64_t, pcg128_t, xsh_rs_mixin, true, cheap_multiplier>
+                                                    cm_mcg_xsh_rs_128_64;
+
+/* Predefined types for XSH RR */
+
+typedef oneseq_base<uint8_t,  uint16_t, xsh_rr_mixin>  oneseq_xsh_rr_16_8;
+typedef oneseq_base<uint16_t, uint32_t, xsh_rr_mixin>  oneseq_xsh_rr_32_16;
+typedef oneseq_base<uint32_t, uint64_t, xsh_rr_mixin>  oneseq_xsh_rr_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, xsh_rr_mixin>  oneseq_xsh_rr_128_64;
+typedef oneseq_base<uint64_t, pcg128_t, xsh_rr_mixin, true, cheap_multiplier>
+                                                       cm_oneseq_xsh_rr_128_64;
+
+typedef unique_base<uint8_t,  uint16_t, xsh_rr_mixin>  unique_xsh_rr_16_8;
+typedef unique_base<uint16_t, uint32_t, xsh_rr_mixin>  unique_xsh_rr_32_16;
+typedef unique_base<uint32_t, uint64_t, xsh_rr_mixin>  unique_xsh_rr_64_32;
+typedef unique_base<uint64_t, pcg128_t, xsh_rr_mixin>  unique_xsh_rr_128_64;
+typedef unique_base<uint64_t, pcg128_t, xsh_rr_mixin, true, cheap_multiplier>
+                                                       cm_unique_xsh_rr_128_64;
+
+typedef setseq_base<uint8_t,  uint16_t, xsh_rr_mixin>  setseq_xsh_rr_16_8;
+typedef setseq_base<uint16_t, uint32_t, xsh_rr_mixin>  setseq_xsh_rr_32_16;
+typedef setseq_base<uint32_t, uint64_t, xsh_rr_mixin>  setseq_xsh_rr_64_32;
+typedef setseq_base<uint64_t, pcg128_t, xsh_rr_mixin>  setseq_xsh_rr_128_64;
+typedef setseq_base<uint64_t, pcg128_t, xsh_rr_mixin, true, cheap_multiplier>
+                                                       cm_setseq_xsh_rr_128_64;
+
+typedef mcg_base<uint8_t,  uint16_t, xsh_rr_mixin>  mcg_xsh_rr_16_8;
+typedef mcg_base<uint16_t, uint32_t, xsh_rr_mixin>  mcg_xsh_rr_32_16;
+typedef mcg_base<uint32_t, uint64_t, xsh_rr_mixin>  mcg_xsh_rr_64_32;
+typedef mcg_base<uint64_t, pcg128_t, xsh_rr_mixin>  mcg_xsh_rr_128_64;
+typedef mcg_base<uint64_t, pcg128_t, xsh_rr_mixin, true, cheap_multiplier>
+                                                    cm_mcg_xsh_rr_128_64;
+
+
+/* Predefined types for RXS M XS */
+
+typedef oneseq_base<uint8_t,  uint8_t, rxs_m_xs_mixin>   oneseq_rxs_m_xs_8_8;
+typedef oneseq_base<uint16_t, uint16_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_16_16;
+typedef oneseq_base<uint32_t, uint32_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_32_32;
+typedef oneseq_base<uint64_t, uint64_t, rxs_m_xs_mixin>  oneseq_rxs_m_xs_64_64;
+typedef oneseq_base<pcg128_t, pcg128_t, rxs_m_xs_mixin>
+                                                        oneseq_rxs_m_xs_128_128;
+typedef oneseq_base<pcg128_t, pcg128_t, rxs_m_xs_mixin, true, cheap_multiplier>
+                                                     cm_oneseq_rxs_m_xs_128_128;
+
+typedef unique_base<uint8_t,  uint8_t, rxs_m_xs_mixin>  unique_rxs_m_xs_8_8;
+typedef unique_base<uint16_t, uint16_t, rxs_m_xs_mixin> unique_rxs_m_xs_16_16;
+typedef unique_base<uint32_t, uint32_t, rxs_m_xs_mixin> unique_rxs_m_xs_32_32;
+typedef unique_base<uint64_t, uint64_t, rxs_m_xs_mixin> unique_rxs_m_xs_64_64;
+typedef unique_base<pcg128_t, pcg128_t, rxs_m_xs_mixin> unique_rxs_m_xs_128_128;
+typedef unique_base<pcg128_t, pcg128_t, rxs_m_xs_mixin, true, cheap_multiplier>
+                                                     cm_unique_rxs_m_xs_128_128;
+
+typedef setseq_base<uint8_t,  uint8_t, rxs_m_xs_mixin>  setseq_rxs_m_xs_8_8;
+typedef setseq_base<uint16_t, uint16_t, rxs_m_xs_mixin> setseq_rxs_m_xs_16_16;
+typedef setseq_base<uint32_t, uint32_t, rxs_m_xs_mixin> setseq_rxs_m_xs_32_32;
+typedef setseq_base<uint64_t, uint64_t, rxs_m_xs_mixin> setseq_rxs_m_xs_64_64;
+typedef setseq_base<pcg128_t, pcg128_t, rxs_m_xs_mixin> setseq_rxs_m_xs_128_128;
+typedef setseq_base<pcg128_t, pcg128_t, rxs_m_xs_mixin, true, cheap_multiplier>
+                                                     cm_setseq_rxs_m_xs_128_128;
+
+                // MCG versions don't make sense here, so aren't defined.
+
+/* Predefined types for RXS M */
+
+typedef oneseq_base<uint8_t,  uint16_t, rxs_m_mixin>  oneseq_rxs_m_16_8;
+typedef oneseq_base<uint16_t, uint32_t, rxs_m_mixin>  oneseq_rxs_m_32_16;
+typedef oneseq_base<uint32_t, uint64_t, rxs_m_mixin>  oneseq_rxs_m_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, rxs_m_mixin>  oneseq_rxs_m_128_64;
+typedef oneseq_base<uint64_t, pcg128_t, rxs_m_mixin, true, cheap_multiplier>
+                                                      cm_oneseq_rxs_m_128_64;
+
+typedef unique_base<uint8_t,  uint16_t, rxs_m_mixin>  unique_rxs_m_16_8;
+typedef unique_base<uint16_t, uint32_t, rxs_m_mixin>  unique_rxs_m_32_16;
+typedef unique_base<uint32_t, uint64_t, rxs_m_mixin>  unique_rxs_m_64_32;
+typedef unique_base<uint64_t, pcg128_t, rxs_m_mixin>  unique_rxs_m_128_64;
+typedef unique_base<uint64_t, pcg128_t, rxs_m_mixin, true, cheap_multiplier>
+                                                      cm_unique_rxs_m_128_64;
+
+typedef setseq_base<uint8_t,  uint16_t, rxs_m_mixin>  setseq_rxs_m_16_8;
+typedef setseq_base<uint16_t, uint32_t, rxs_m_mixin>  setseq_rxs_m_32_16;
+typedef setseq_base<uint32_t, uint64_t, rxs_m_mixin>  setseq_rxs_m_64_32;
+typedef setseq_base<uint64_t, pcg128_t, rxs_m_mixin>  setseq_rxs_m_128_64;
+typedef setseq_base<uint64_t, pcg128_t, rxs_m_mixin, true, cheap_multiplier>
+                                                      cm_setseq_rxs_m_128_64;
+
+typedef mcg_base<uint8_t,  uint16_t, rxs_m_mixin>  mcg_rxs_m_16_8;
+typedef mcg_base<uint16_t, uint32_t, rxs_m_mixin>  mcg_rxs_m_32_16;
+typedef mcg_base<uint32_t, uint64_t, rxs_m_mixin>  mcg_rxs_m_64_32;
+typedef mcg_base<uint64_t, pcg128_t, rxs_m_mixin>  mcg_rxs_m_128_64;
+typedef mcg_base<uint64_t, pcg128_t, rxs_m_mixin, true, cheap_multiplier>
+                                                   cm_mcg_rxs_m_128_64;
+
+/* Predefined types for DXSM */
+
+typedef oneseq_base<uint8_t,  uint16_t, dxsm_mixin>  oneseq_dxsm_16_8;
+typedef oneseq_base<uint16_t, uint32_t, dxsm_mixin>  oneseq_dxsm_32_16;
+typedef oneseq_base<uint32_t, uint64_t, dxsm_mixin>  oneseq_dxsm_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, dxsm_mixin>  oneseq_dxsm_128_64;
+typedef oneseq_base<uint64_t, pcg128_t, dxsm_mixin, true, cheap_multiplier>
+                                                     cm_oneseq_dxsm_128_64;
+
+typedef unique_base<uint8_t,  uint16_t, dxsm_mixin>  unique_dxsm_16_8;
+typedef unique_base<uint16_t, uint32_t, dxsm_mixin>  unique_dxsm_32_16;
+typedef unique_base<uint32_t, uint64_t, dxsm_mixin>  unique_dxsm_64_32;
+typedef unique_base<uint64_t, pcg128_t, dxsm_mixin>  unique_dxsm_128_64;
+typedef unique_base<uint64_t, pcg128_t, dxsm_mixin, true, cheap_multiplier>
+                                                     cm_unique_dxsm_128_64;
+
+typedef setseq_base<uint8_t,  uint16_t, dxsm_mixin>  setseq_dxsm_16_8;
+typedef setseq_base<uint16_t, uint32_t, dxsm_mixin>  setseq_dxsm_32_16;
+typedef setseq_base<uint32_t, uint64_t, dxsm_mixin>  setseq_dxsm_64_32;
+typedef setseq_base<uint64_t, pcg128_t, dxsm_mixin>  setseq_dxsm_128_64;
+typedef setseq_base<uint64_t, pcg128_t, dxsm_mixin, true, cheap_multiplier>
+                                                     cm_setseq_dxsm_128_64;
+
+typedef mcg_base<uint8_t,  uint16_t, dxsm_mixin>  mcg_dxsm_16_8;
+typedef mcg_base<uint16_t, uint32_t, dxsm_mixin>  mcg_dxsm_32_16;
+typedef mcg_base<uint32_t, uint64_t, dxsm_mixin>  mcg_dxsm_64_32;
+typedef mcg_base<uint64_t, pcg128_t, dxsm_mixin>  mcg_dxsm_128_64;
+typedef mcg_base<uint64_t, pcg128_t, dxsm_mixin, true, cheap_multiplier>
+                                                  cm_mcg_dxsm_128_64;
+
+/* Predefined types for XSL RR (only defined for "large" types) */
+
+typedef oneseq_base<uint32_t, uint64_t, xsl_rr_mixin>  oneseq_xsl_rr_64_32;
+typedef oneseq_base<uint64_t, pcg128_t, xsl_rr_mixin>  oneseq_xsl_rr_128_64;
+typedef oneseq_base<uint64_t, pcg128_t, xsl_rr_mixin, true, cheap_multiplier>
+                                                       cm_oneseq_xsl_rr_128_64;
+
+typedef unique_base<uint32_t, uint64_t, xsl_rr_mixin>  unique_xsl_rr_64_32;
+typedef unique_base<uint64_t, pcg128_t, xsl_rr_mixin>  unique_xsl_rr_128_64;
+typedef unique_base<uint64_t, pcg128_t, xsl_rr_mixin, true, cheap_multiplier>
+                                                       cm_unique_xsl_rr_128_64;
+
+typedef setseq_base<uint32_t, uint64_t, xsl_rr_mixin>  setseq_xsl_rr_64_32;
+typedef setseq_base<uint64_t, pcg128_t, xsl_rr_mixin>  setseq_xsl_rr_128_64;
+typedef setseq_base<uint64_t, pcg128_t, xsl_rr_mixin, true, cheap_multiplier>
+                                                       cm_setseq_xsl_rr_128_64;
+
+typedef mcg_base<uint32_t, uint64_t, xsl_rr_mixin>  mcg_xsl_rr_64_32;
+typedef mcg_base<uint64_t, pcg128_t, xsl_rr_mixin>  mcg_xsl_rr_128_64;
+typedef mcg_base<uint64_t, pcg128_t, xsl_rr_mixin, true, cheap_multiplier>
+                                                    cm_mcg_xsl_rr_128_64;
+
+
+/* Predefined types for XSL RR RR (only defined for "large" types) */
+
+typedef oneseq_base<uint64_t, uint64_t, xsl_rr_rr_mixin>
+    oneseq_xsl_rr_rr_64_64;
+typedef oneseq_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin>
+    oneseq_xsl_rr_rr_128_128;
+typedef oneseq_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin, true, cheap_multiplier>
+    cm_oneseq_xsl_rr_rr_128_128;
+
+typedef unique_base<uint64_t, uint64_t, xsl_rr_rr_mixin>
+    unique_xsl_rr_rr_64_64;
+typedef unique_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin>
+    unique_xsl_rr_rr_128_128;
+typedef unique_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin, true, cheap_multiplier>
+    cm_unique_xsl_rr_rr_128_128;
+
+typedef setseq_base<uint64_t, uint64_t, xsl_rr_rr_mixin>
+    setseq_xsl_rr_rr_64_64;
+typedef setseq_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin>
+    setseq_xsl_rr_rr_128_128;
+typedef setseq_base<pcg128_t, pcg128_t, xsl_rr_rr_mixin, true, cheap_multiplier>
+    cm_setseq_xsl_rr_rr_128_128;
+
+                // MCG versions don't make sense here, so aren't defined.
+
+/* Extended generators */
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std8 = extended<table_pow2, advance_pow2, BaseRNG,
+                          oneseq_rxs_m_xs_8_8, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std16 = extended<table_pow2, advance_pow2, BaseRNG,
+                           oneseq_rxs_m_xs_16_16, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std32 = extended<table_pow2, advance_pow2, BaseRNG,
+                           oneseq_rxs_m_xs_32_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2,
+          typename BaseRNG, bool kdd = true>
+using ext_std64 = extended<table_pow2, advance_pow2, BaseRNG,
+                           oneseq_rxs_m_xs_64_64, kdd>;
+
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_oneseq_rxs_m_xs_32_32 =
+          ext_std32<table_pow2, advance_pow2, oneseq_rxs_m_xs_32_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_mcg_xsh_rs_64_32 =
+          ext_std32<table_pow2, advance_pow2, mcg_xsh_rs_64_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_oneseq_xsh_rs_64_32 =
+          ext_std32<table_pow2, advance_pow2, oneseq_xsh_rs_64_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_setseq_xsh_rr_64_32 =
+          ext_std32<table_pow2, advance_pow2, setseq_xsh_rr_64_32, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_mcg_xsl_rr_128_64 =
+          ext_std64<table_pow2, advance_pow2, mcg_xsl_rr_128_64, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_oneseq_xsl_rr_128_64 =
+          ext_std64<table_pow2, advance_pow2, oneseq_xsl_rr_128_64, kdd>;
+
+template <bitcount_t table_pow2, bitcount_t advance_pow2, bool kdd = true>
+using ext_setseq_xsl_rr_128_64 =
+          ext_std64<table_pow2, advance_pow2, setseq_xsl_rr_128_64, kdd>;
+
+} // namespace pcg_engines
+
+typedef pcg_engines::setseq_xsh_rr_64_32        pcg32;
+typedef pcg_engines::oneseq_xsh_rr_64_32        pcg32_oneseq;
+typedef pcg_engines::unique_xsh_rr_64_32        pcg32_unique;
+typedef pcg_engines::mcg_xsh_rs_64_32           pcg32_fast;
+
+typedef pcg_engines::setseq_xsl_rr_128_64       pcg64;
+typedef pcg_engines::oneseq_xsl_rr_128_64       pcg64_oneseq;
+typedef pcg_engines::unique_xsl_rr_128_64       pcg64_unique;
+typedef pcg_engines::mcg_xsl_rr_128_64          pcg64_fast;
+
+typedef pcg_engines::setseq_rxs_m_xs_8_8        pcg8_once_insecure;
+typedef pcg_engines::setseq_rxs_m_xs_16_16      pcg16_once_insecure;
+typedef pcg_engines::setseq_rxs_m_xs_32_32      pcg32_once_insecure;
+typedef pcg_engines::setseq_rxs_m_xs_64_64      pcg64_once_insecure;
+typedef pcg_engines::setseq_xsl_rr_rr_128_128   pcg128_once_insecure;
+
+typedef pcg_engines::oneseq_rxs_m_xs_8_8        pcg8_oneseq_once_insecure;
+typedef pcg_engines::oneseq_rxs_m_xs_16_16      pcg16_oneseq_once_insecure;
+typedef pcg_engines::oneseq_rxs_m_xs_32_32      pcg32_oneseq_once_insecure;
+typedef pcg_engines::oneseq_rxs_m_xs_64_64      pcg64_oneseq_once_insecure;
+typedef pcg_engines::oneseq_xsl_rr_rr_128_128   pcg128_oneseq_once_insecure;
+
+
+// These two extended RNGs provide two-dimensionally equidistributed
+// 32-bit generators.  pcg32_k2_fast occupies the same space as pcg64,
+// and can be called twice to generate 64 bits, but does not required
+// 128-bit math; on 32-bit systems, it's faster than pcg64 as well.
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<1,16,true>     pcg32_k2;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<1,32,true>     pcg32_k2_fast;
+
+// These eight extended RNGs have about as much state as arc4random
+//
+//  - the k variants are k-dimensionally equidistributed
+//  - the c variants offer better crypographic security
+//
+// (just how good the cryptographic security is an open question)
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true>     pcg32_k64;
+typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true>        pcg32_k64_oneseq;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,true>     pcg32_k64_fast;
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,false>    pcg32_c64;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,false>    pcg32_c64_oneseq;
+typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,false>       pcg32_c64_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,true>    pcg64_k32;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,true>   pcg64_k32_oneseq;
+typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,true>      pcg64_k32_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,false>   pcg64_c32;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,false>  pcg64_c32_oneseq;
+typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false>     pcg64_c32_fast;
+
+// These eight extended RNGs have more state than the Mersenne twister
+//
+//  - the k variants are k-dimensionally equidistributed
+//  - the c variants offer better crypographic security
+//
+// (just how good the cryptographic security is an open question)
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true>    pcg32_k1024;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true>    pcg32_k1024_fast;
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,false>   pcg32_c1024;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,false>   pcg32_c1024_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,true>   pcg64_k1024;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,true>  pcg64_k1024_fast;
+
+typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,false>  pcg64_c1024;
+typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,false> pcg64_c1024_fast;
+
+// These generators have an insanely huge period (2^524352), and is suitable
+// for silly party tricks, such as dumping out 64 KB ZIP files at an arbitrary
+// point in the future.   [Actually, over the full period of the generator, it
+// will produce every 64 KB ZIP file 2^64 times!]
+
+typedef pcg_engines::ext_setseq_xsh_rr_64_32<14,16,true>    pcg32_k16384;
+typedef pcg_engines::ext_oneseq_xsh_rs_64_32<14,32,true>    pcg32_k16384_fast;
+
+} // namespace arrow_vendored
+
+#ifdef _MSC_VER
+    #pragma warning(default:4146)
+#endif
+
+#endif // PCG_RAND_HPP_INCLUDED
diff --git a/pyarrow/include/arrow/vendored/pcg/pcg_uint128.hpp b/pyarrow/include/arrow/vendored/pcg/pcg_uint128.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..012f3d66823585105b9d8d70b6b9fd5f92b77661
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/pcg/pcg_uint128.hpp
@@ -0,0 +1,1021 @@
+/*
+ * PCG Random Number Generation for C++
+ *
+ * Copyright 2014-2021 Melissa O'Neill <oneill@pcg-random.org>,
+ *                     and the PCG Project contributors.
+ *
+ * SPDX-License-Identifier: (Apache-2.0 OR MIT)
+ *
+ * Licensed under the Apache License, Version 2.0 (provided in
+ * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0)
+ * or under the MIT license (provided in LICENSE-MIT.txt and at
+ * http://opensource.org/licenses/MIT), at your option. This file may not
+ * be copied, modified, or distributed except according to those terms.
+ *
+ * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either
+ * express or implied.  See your chosen license for details.
+ *
+ * For additional information about the PCG random number generation scheme,
+ * visit http://www.pcg-random.org/.
+ */
+
+/*
+ * This code provides a a C++ class that can provide 128-bit (or higher)
+ * integers.  To produce 2K-bit integers, it uses two K-bit integers,
+ * placed in a union that allowes the code to also see them as four K/2 bit
+ * integers (and access them either directly name, or by index).
+ *
+ * It may seem like we're reinventing the wheel here, because several
+ * libraries already exist that support large integers, but most existing
+ * libraries provide a very generic multiprecision code, but here we're
+ * operating at a fixed size.  Also, most other libraries are fairly
+ * heavyweight.  So we use a direct implementation.  Sadly, it's much slower
+ * than hand-coded assembly or direct CPU support.
+ */
+
+#ifndef PCG_UINT128_HPP_INCLUDED
+#define PCG_UINT128_HPP_INCLUDED 1
+
+#include <cstdint>
+#include <cstdio>
+#include <cassert>
+#include <climits>
+#include <utility>
+#include <initializer_list>
+#include <type_traits>
+
+#if defined(_MSC_VER)  // Use MSVC++ intrinsics
+#include <intrin.h>
+#endif
+
+/*
+ * We want to lay the type out the same way that a native type would be laid
+ * out, which means we must know the machine's endian, at compile time.
+ * This ugliness attempts to do so.
+ */
+
+#ifndef PCG_LITTLE_ENDIAN
+    #if defined(__BYTE_ORDER__)
+        #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+            #define PCG_LITTLE_ENDIAN 1
+        #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+            #define PCG_LITTLE_ENDIAN 0
+        #else
+            #error __BYTE_ORDER__ does not match a standard endian, pick a side
+        #endif
+    #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN
+        #define PCG_LITTLE_ENDIAN 1
+    #elif __BIG_ENDIAN__ || _BIG_ENDIAN
+        #define PCG_LITTLE_ENDIAN 0
+    #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86 || _M_ARM64
+        #define PCG_LITTLE_ENDIAN 1
+    #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \
+          || __m68k__ || __mc68000__
+        #define PCG_LITTLE_ENDIAN 0
+    #else
+        #error Unable to determine target endianness
+    #endif
+#endif
+
+#if INTPTR_MAX == INT64_MAX && !defined(PCG_64BIT_SPECIALIZATIONS)
+    #define PCG_64BIT_SPECIALIZATIONS 1
+#endif
+
+namespace arrow_vendored {
+namespace pcg_extras {
+
+// Recent versions of GCC have intrinsics we can use to quickly calculate
+// the number of leading and trailing zeros in a number.  If possible, we
+// use them, otherwise we fall back to old-fashioned bit twiddling to figure
+// them out.
+
+#ifndef PCG_BITCOUNT_T
+    typedef uint8_t bitcount_t;
+#else
+    typedef PCG_BITCOUNT_T bitcount_t;
+#endif
+
+/*
+ * Provide some useful helper functions
+ *      * flog2                 floor(log2(x))
+ *      * trailingzeros         number of trailing zero bits
+ */
+
+#if defined(__GNUC__)   // Any GNU-compatible compiler supporting C++11 has
+                        // some useful intrinsics we can use.
+
+inline bitcount_t flog2(uint32_t v)
+{
+    return 31 - __builtin_clz(v);
+}
+
+inline bitcount_t trailingzeros(uint32_t v)
+{
+    return __builtin_ctz(v);
+}
+
+inline bitcount_t flog2(uint64_t v)
+{
+#if UINT64_MAX == ULONG_MAX
+    return 63 - __builtin_clzl(v);
+#elif UINT64_MAX == ULLONG_MAX
+    return 63 - __builtin_clzll(v);
+#else
+    #error Cannot find a function for uint64_t
+#endif
+}
+
+inline bitcount_t trailingzeros(uint64_t v)
+{
+#if UINT64_MAX == ULONG_MAX
+    return __builtin_ctzl(v);
+#elif UINT64_MAX == ULLONG_MAX
+    return __builtin_ctzll(v);
+#else
+    #error Cannot find a function for uint64_t
+#endif
+}
+
+#elif defined(_MSC_VER)  // Use MSVC++ intrinsics
+
+#pragma intrinsic(_BitScanReverse, _BitScanForward)
+#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+#pragma intrinsic(_BitScanReverse64, _BitScanForward64)
+#endif
+
+inline bitcount_t flog2(uint32_t v)
+{
+    unsigned long i;
+    _BitScanReverse(&i, v);
+    return bitcount_t(i);
+}
+
+inline bitcount_t trailingzeros(uint32_t v)
+{
+    unsigned long i;
+    _BitScanForward(&i, v);
+    return bitcount_t(i);
+}
+
+inline bitcount_t flog2(uint64_t v)
+{
+#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+    unsigned long i;
+    _BitScanReverse64(&i, v);
+    return bitcount_t(i);
+#else
+    // 32-bit x86
+    uint32_t high = v >> 32;
+    uint32_t low  = uint32_t(v);
+    return high ? 32+flog2(high) : flog2(low);
+#endif
+}
+
+inline bitcount_t trailingzeros(uint64_t v)
+{
+#if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64)
+    unsigned long i;
+    _BitScanForward64(&i, v);
+    return bitcount_t(i);
+#else
+    // 32-bit x86
+    uint32_t high = v >> 32;
+    uint32_t low  = uint32_t(v);
+    return low ? trailingzeros(low) : trailingzeros(high)+32;
+#endif
+}
+
+#else                   // Otherwise, we fall back to bit twiddling
+                        // implementations
+
+inline bitcount_t flog2(uint32_t v)
+{
+    // Based on code by Eric Cole and Mark Dickinson, which appears at
+    // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
+
+    static const uint8_t multiplyDeBruijnBitPos[32] = {
+      0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+      8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+    };
+
+    v |= v >> 1; // first round down to one less than a power of 2
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+
+    return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27];
+}
+
+inline bitcount_t trailingzeros(uint32_t v)
+{
+    static const uint8_t multiplyDeBruijnBitPos[32] = {
+      0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+      31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+    };
+
+    return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27];
+}
+
+inline bitcount_t flog2(uint64_t v)
+{
+    uint32_t high = v >> 32;
+    uint32_t low  = uint32_t(v);
+
+    return high ? 32+flog2(high) : flog2(low);
+}
+
+inline bitcount_t trailingzeros(uint64_t v)
+{
+    uint32_t high = v >> 32;
+    uint32_t low  = uint32_t(v);
+
+    return low ? trailingzeros(low) : trailingzeros(high)+32;
+}
+
+#endif
+
+inline bitcount_t flog2(uint8_t v)
+{
+    return flog2(uint32_t(v));
+}
+
+inline bitcount_t flog2(uint16_t v)
+{
+    return flog2(uint32_t(v));
+}
+
+#if __SIZEOF_INT128__
+inline bitcount_t flog2(__uint128_t v)
+{
+    uint64_t high = uint64_t(v >> 64);
+    uint64_t low  = uint64_t(v);
+
+    return high ? 64+flog2(high) : flog2(low);
+}
+#endif
+
+inline bitcount_t trailingzeros(uint8_t v)
+{
+    return trailingzeros(uint32_t(v));
+}
+
+inline bitcount_t trailingzeros(uint16_t v)
+{
+    return trailingzeros(uint32_t(v));
+}
+
+#if __SIZEOF_INT128__
+inline bitcount_t trailingzeros(__uint128_t v)
+{
+    uint64_t high = uint64_t(v >> 64);
+    uint64_t low  = uint64_t(v);
+    return low ? trailingzeros(low) : trailingzeros(high)+64;
+}
+#endif
+
+template <typename UInt>
+inline bitcount_t clog2(UInt v)
+{
+    return flog2(v) + ((v & (-v)) != v);
+}
+
+template <typename UInt>
+inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
+{
+    UInt half_result = y + carryin;
+    UInt result = x + half_result;
+    *carryout = (half_result < y) || (result < x);
+    return result;
+}
+
+template <typename UInt>
+inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout)
+{
+    UInt half_result = y + carryin;
+    UInt result = x - half_result;
+    *carryout = (half_result < y) || (result > x);
+    return result;
+}
+
+
+template <typename UInt, typename UIntX2>
+class uint_x4 {
+// private:
+    static constexpr unsigned int UINT_BITS = sizeof(UInt) * CHAR_BIT;
+public:
+    union {
+#if PCG_LITTLE_ENDIAN
+        struct {
+            UInt v0, v1, v2, v3;
+        } w;
+        struct {
+            UIntX2 v01, v23;
+        } d;
+#else
+        struct {
+            UInt v3, v2, v1, v0;
+        } w;
+        struct {
+            UIntX2 v23, v01;
+        } d;
+#endif
+        // For the array access versions, the code that uses the array
+        // must handle endian itself.  Yuck.
+        UInt wa[4];
+    };
+
+public:
+    uint_x4() = default;
+
+    constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0)
+#if PCG_LITTLE_ENDIAN
+       : w{v0, v1, v2, v3}
+#else
+       : w{v3, v2, v1, v0}
+#endif
+    {
+        // Nothing (else) to do
+    }
+
+    constexpr uint_x4(UIntX2 v23, UIntX2 v01)
+#if PCG_LITTLE_ENDIAN
+       : d{v01,v23}
+#else
+       : d{v23,v01}
+#endif
+    {
+        // Nothing (else) to do
+    }
+
+    constexpr uint_x4(UIntX2 v01)
+#if PCG_LITTLE_ENDIAN
+       : d{v01, UIntX2(0)}
+#else
+       : d{UIntX2(0),v01}
+#endif
+    {
+        // Nothing (else) to do
+    }
+
+    template<class Integral,
+             typename std::enable_if<(std::is_integral<Integral>::value
+                                      && sizeof(Integral) <= sizeof(UIntX2))
+                                    >::type* = nullptr>
+    constexpr uint_x4(Integral v01)
+#if PCG_LITTLE_ENDIAN
+       : d{UIntX2(v01), UIntX2(0)}
+#else
+       : d{UIntX2(0), UIntX2(v01)}
+#endif
+    {
+        // Nothing (else) to do
+    }
+
+    explicit constexpr operator UIntX2() const
+    {
+        return d.v01;
+    }
+
+    template<class Integral,
+             typename std::enable_if<(std::is_integral<Integral>::value
+                                      && sizeof(Integral) <= sizeof(UIntX2))
+                                    >::type* = nullptr>
+    explicit constexpr operator Integral() const
+    {
+        return Integral(d.v01);
+    }
+
+    explicit constexpr operator bool() const
+    {
+        return d.v01 || d.v23;
+    }
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator*(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator*(const uint_x4<U,V>&, V);
+
+    template<typename U, typename V>
+    friend std::pair< uint_x4<U,V>,uint_x4<U,V> >
+        divmod(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator+(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator-(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator<<(const uint_x4<U,V>&, const bitcount_t shift);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator>>(const uint_x4<U,V>&, const bitcount_t shift);
+
+#if PCG_64BIT_SPECIALIZATIONS
+    template<typename U>
+    friend uint_x4<U,uint64_t> operator<<(const uint_x4<U,uint64_t>&, const bitcount_t shift);
+
+    template<typename U>
+    friend uint_x4<U,uint64_t> operator>>(const uint_x4<U,uint64_t>&, const bitcount_t shift);
+#endif
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator&(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator|(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator^(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bool operator==(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bool operator!=(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bool operator<(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bool operator<=(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bool operator>(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bool operator>=(const uint_x4<U,V>&, const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator~(const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend uint_x4<U,V> operator-(const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bitcount_t flog2(const uint_x4<U,V>&);
+
+    template<typename U, typename V>
+    friend bitcount_t trailingzeros(const uint_x4<U,V>&);
+
+#if PCG_64BIT_SPECIALIZATIONS
+    template<typename U>
+    friend bitcount_t flog2(const uint_x4<U,uint64_t>&);
+
+    template<typename U>
+    friend bitcount_t trailingzeros(const uint_x4<U,uint64_t>&);
+#endif
+
+    uint_x4& operator*=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this * rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator*=(UIntX2 rhs)
+    {
+        uint_x4 result = *this * rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator/=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this / rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator%=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this % rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator+=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this + rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator-=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this - rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator&=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this & rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator|=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this | rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator^=(const uint_x4& rhs)
+    {
+        uint_x4 result = *this ^ rhs;
+        return *this = result;
+    }
+
+    uint_x4& operator>>=(bitcount_t shift)
+    {
+        uint_x4 result = *this >> shift;
+        return *this = result;
+    }
+
+    uint_x4& operator<<=(bitcount_t shift)
+    {
+        uint_x4 result = *this << shift;
+        return *this = result;
+    }
+
+};
+
+template<typename U, typename V>
+bitcount_t flog2(const uint_x4<U,V>& v)
+{
+#if PCG_LITTLE_ENDIAN
+    for (uint8_t i = 4; i !=0; /* dec in loop */) {
+        --i;
+#else
+    for (uint8_t i = 0; i < 4; ++i) {
+#endif
+        if (v.wa[i] == 0)
+             continue;
+        return flog2(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
+    }
+    abort();
+}
+
+template<typename U, typename V>
+bitcount_t trailingzeros(const uint_x4<U,V>& v)
+{
+#if PCG_LITTLE_ENDIAN
+    for (uint8_t i = 0; i < 4; ++i) {
+#else
+    for (uint8_t i = 4; i !=0; /* dec in loop */) {
+        --i;
+#endif
+        if (v.wa[i] != 0)
+            return trailingzeros(v.wa[i]) + uint_x4<U,V>::UINT_BITS*i;
+    }
+    return uint_x4<U,V>::UINT_BITS*4;
+}
+
+#if PCG_64BIT_SPECIALIZATIONS
+template<typename UInt32>
+bitcount_t flog2(const uint_x4<UInt32,uint64_t>& v)
+{
+    return v.d.v23 > 0 ? flog2(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
+                       : flog2(v.d.v01);
+}
+
+template<typename UInt32>
+bitcount_t trailingzeros(const uint_x4<UInt32,uint64_t>& v)
+{
+    return v.d.v01 == 0 ? trailingzeros(v.d.v23) + uint_x4<UInt32,uint64_t>::UINT_BITS*2
+                        : trailingzeros(v.d.v01);
+}
+#endif
+
+template <typename UInt, typename UIntX2>
+std::pair< uint_x4<UInt,UIntX2>, uint_x4<UInt,UIntX2> >
+    divmod(const uint_x4<UInt,UIntX2>& orig_dividend,
+           const uint_x4<UInt,UIntX2>& divisor)
+{
+    // If the dividend is less than the divisor, the answer is always zero.
+    // This takes care of boundary cases like 0/x (which would otherwise be
+    // problematic because we can't take the log of zero.  (The boundary case
+    // of division by zero is undefined.)
+    if (orig_dividend < divisor)
+        return { uint_x4<UInt,UIntX2>(UIntX2(0)), orig_dividend };
+
+    auto dividend = orig_dividend;
+
+    auto log2_divisor  = flog2(divisor);
+    auto log2_dividend = flog2(dividend);
+    // assert(log2_dividend >= log2_divisor);
+    bitcount_t logdiff = log2_dividend - log2_divisor;
+
+    constexpr uint_x4<UInt,UIntX2> ONE(UIntX2(1));
+    if (logdiff == 0)
+        return { ONE, dividend - divisor };
+
+    // Now we change the log difference to
+    //  floor(log2(divisor)) - ceil(log2(dividend))
+    // to ensure that we *underestimate* the result.
+    logdiff -= 1;
+
+    uint_x4<UInt,UIntX2> quotient(UIntX2(0));
+
+    auto qfactor = ONE << logdiff;
+    auto factor  = divisor << logdiff;
+
+    do {
+        dividend -= factor;
+        quotient += qfactor;
+        while (dividend < factor) {
+            factor  >>= 1;
+            qfactor >>= 1;
+        }
+    } while (dividend >= divisor);
+
+    return { quotient, dividend };
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator/(const uint_x4<UInt,UIntX2>& dividend,
+                               const uint_x4<UInt,UIntX2>& divisor)
+{
+    return divmod(dividend, divisor).first;
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator%(const uint_x4<UInt,UIntX2>& dividend,
+                               const uint_x4<UInt,UIntX2>& divisor)
+{
+    return divmod(dividend, divisor).second;
+}
+
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
+                               const uint_x4<UInt,UIntX2>& b)
+{
+    constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
+    uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
+    bool carryin = false;
+    bool carryout;
+    UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0);
+    r.w.v0 = UInt(a0b0);
+    r.w.v1 = UInt(a0b0 >> UINT_BITS);
+
+    UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0);
+    r.w.v2 = UInt(a1b0 >> UINT_BITS);
+    r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
+    carryin = carryout;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
+
+    UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1);
+    carryin = false;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
+
+    carryin = false;
+    r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
+    carryin = carryout;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
+
+    UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1);
+    carryin = false;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
+
+    r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01;
+
+    return r;
+}
+
+ 
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator*(const uint_x4<UInt,UIntX2>& a,
+                               UIntX2 b01)
+{
+    constexpr auto UINT_BITS = uint_x4<UInt,UIntX2>::UINT_BITS;
+    uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
+    bool carryin = false;
+    bool carryout;
+    UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(UInt(b01));
+    r.w.v0 = UInt(a0b0);
+    r.w.v1 = UInt(a0b0 >> UINT_BITS);
+
+    UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(UInt(b01));
+    r.w.v2 = UInt(a1b0 >> UINT_BITS);
+    r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout);
+    carryin = carryout;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
+
+    UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b01 >> UINT_BITS);
+    carryin = false;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
+
+    carryin = false;
+    r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout);
+    carryin = carryout;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout);
+
+    UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b01 >> UINT_BITS);
+    carryin = false;
+    r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout);
+
+    r.d.v23 += a.d.v23 * b01;
+
+    return r;
+}
+
+#if PCG_64BIT_SPECIALIZATIONS
+#if defined(_MSC_VER)
+#if defined(_M_X64) || defined(_M_IX86)
+#pragma intrinsic(_umul128)
+#elif defined(_M_ARM64)
+#pragma intrinsic(__umulh)
+#else
+#error Unsupported architecture
+#endif
+#endif
+
+#if defined(_MSC_VER) || __SIZEOF_INT128__
+template <typename UInt32>
+uint_x4<UInt32,uint64_t> operator*(const uint_x4<UInt32,uint64_t>& a,
+				   const uint_x4<UInt32,uint64_t>& b)
+{
+#if defined(_MSC_VER)
+#if defined(_M_X64) || defined(_M_IX86)
+    uint64_t hi;
+    uint64_t lo = _umul128(a.d.v01, b.d.v01, &hi);
+#elif defined(_M_ARM64)
+    uint64_t lo = a.d.v01 * b.d.v01;
+    uint64_t hi = __umulh(a.d.v01, b.d.v01);
+#else
+#error Unsupported architecture
+#endif
+#else
+    __uint128_t r = __uint128_t(a.d.v01) * __uint128_t(b.d.v01);
+    uint64_t lo = uint64_t(r);
+    uint64_t hi = r >> 64;
+#endif
+    hi += a.d.v23 * b.d.v01 + a.d.v01 * b.d.v23;
+    return {hi, lo};
+}
+#endif
+#endif
+
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator+(const uint_x4<UInt,UIntX2>& a,
+                               const uint_x4<UInt,UIntX2>& b)
+{
+    uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
+
+    bool carryin = false;
+    bool carryout;
+    r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
+    carryin = carryout;
+    r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
+    carryin = carryout;
+    r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
+
+    return r;
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& a,
+                               const uint_x4<UInt,UIntX2>& b)
+{
+    uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
+
+    bool carryin = false;
+    bool carryout;
+    r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout);
+    carryin = carryout;
+    r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout);
+    carryin = carryout;
+    r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout);
+    carryin = carryout;
+    r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout);
+
+    return r;
+}
+
+#if PCG_64BIT_SPECIALIZATIONS
+template <typename UInt32>
+uint_x4<UInt32,uint64_t> operator+(const uint_x4<UInt32,uint64_t>& a,
+				   const uint_x4<UInt32,uint64_t>& b)
+{
+    uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
+
+    bool carryin = false;
+    bool carryout;
+    r.d.v01 = addwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
+    carryin = carryout;
+    r.d.v23 = addwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
+
+    return r;
+}
+
+template <typename UInt32>
+uint_x4<UInt32,uint64_t> operator-(const uint_x4<UInt32,uint64_t>& a,
+				   const uint_x4<UInt32,uint64_t>& b)
+{
+    uint_x4<UInt32,uint64_t> r = {uint64_t(0u), uint64_t(0u)};
+
+    bool carryin = false;
+    bool carryout;
+    r.d.v01 = subwithcarry(a.d.v01, b.d.v01, carryin, &carryout);
+    carryin = carryout;
+    r.d.v23 = subwithcarry(a.d.v23, b.d.v23, carryin, &carryout);
+
+    return r;
+}
+#endif
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator&(const uint_x4<UInt,UIntX2>& a,
+                               const uint_x4<UInt,UIntX2>& b)
+{
+    return uint_x4<UInt,UIntX2>(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01);
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator|(const uint_x4<UInt,UIntX2>& a,
+                               const uint_x4<UInt,UIntX2>& b)
+{
+    return uint_x4<UInt,UIntX2>(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01);
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator^(const uint_x4<UInt,UIntX2>& a,
+                               const uint_x4<UInt,UIntX2>& b)
+{
+    return uint_x4<UInt,UIntX2>(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01);
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator~(const uint_x4<UInt,UIntX2>& v)
+{
+    return uint_x4<UInt,UIntX2>(~v.d.v23, ~v.d.v01);
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator-(const uint_x4<UInt,UIntX2>& v)
+{
+    return uint_x4<UInt,UIntX2>(0UL,0UL) - v;
+}
+
+template <typename UInt, typename UIntX2>
+bool operator==(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
+{
+    return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23);
+}
+
+template <typename UInt, typename UIntX2>
+bool operator!=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
+{
+    return !operator==(a,b);
+}
+
+
+template <typename UInt, typename UIntX2>
+bool operator<(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
+{
+    return (a.d.v23 < b.d.v23)
+           || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01));
+}
+
+template <typename UInt, typename UIntX2>
+bool operator>(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
+{
+    return operator<(b,a);
+}
+
+template <typename UInt, typename UIntX2>
+bool operator<=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
+{
+    return !(operator<(b,a));
+}
+
+template <typename UInt, typename UIntX2>
+bool operator>=(const uint_x4<UInt,UIntX2>& a, const uint_x4<UInt,UIntX2>& b)
+{
+    return !(operator<(a,b));
+}
+
+
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator<<(const uint_x4<UInt,UIntX2>& v,
+                                const bitcount_t shift)
+{
+    uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
+    const bitcount_t bits    = uint_x4<UInt,UIntX2>::UINT_BITS;
+    const bitcount_t bitmask = bits - 1;
+    const bitcount_t shiftdiv = shift / bits;
+    const bitcount_t shiftmod = shift & bitmask;
+
+    if (shiftmod) {
+        UInt carryover = 0;
+#if PCG_LITTLE_ENDIAN
+        for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
+#else
+        for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
+            --out, --in;
+#endif
+            r.wa[out] = (v.wa[in] << shiftmod) | carryover;
+            carryover = (v.wa[in] >> (bits - shiftmod));
+        }
+    } else {
+#if PCG_LITTLE_ENDIAN
+        for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
+#else
+        for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
+            --out, --in;
+#endif
+            r.wa[out] = v.wa[in];
+        }
+    }
+
+    return r;
+}
+
+template <typename UInt, typename UIntX2>
+uint_x4<UInt,UIntX2> operator>>(const uint_x4<UInt,UIntX2>& v,
+                                const bitcount_t shift)
+{
+    uint_x4<UInt,UIntX2> r = {0U, 0U, 0U, 0U};
+    const bitcount_t bits    = uint_x4<UInt,UIntX2>::UINT_BITS;
+    const bitcount_t bitmask = bits - 1;
+    const bitcount_t shiftdiv = shift / bits;
+    const bitcount_t shiftmod = shift & bitmask;
+
+    if (shiftmod) {
+        UInt carryover = 0;
+#if PCG_LITTLE_ENDIAN
+        for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
+            --out, --in;
+#else
+        for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
+#endif
+            r.wa[out] = (v.wa[in] >> shiftmod) | carryover;
+            carryover = (v.wa[in] << (bits - shiftmod));
+        }
+    } else {
+#if PCG_LITTLE_ENDIAN
+        for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) {
+            --out, --in;
+#else
+        for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) {
+#endif
+            r.wa[out] = v.wa[in];
+        }
+    }
+
+    return r;
+}
+
+#if PCG_64BIT_SPECIALIZATIONS
+template <typename UInt32>
+uint_x4<UInt32,uint64_t> operator<<(const uint_x4<UInt32,uint64_t>& v,
+				    const bitcount_t shift)
+{
+    constexpr bitcount_t bits2   = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
+    
+    if (shift >= bits2) {
+        return {v.d.v01 << (shift-bits2), uint64_t(0u)};
+    } else {
+        return {shift ? (v.d.v23 << shift) | (v.d.v01 >> (bits2-shift)) 
+                      : v.d.v23,
+                v.d.v01 << shift};
+    }
+}
+
+template <typename UInt32>
+uint_x4<UInt32,uint64_t> operator>>(const uint_x4<UInt32,uint64_t>& v,
+				    const bitcount_t shift)
+{
+    constexpr bitcount_t bits2   = uint_x4<UInt32,uint64_t>::UINT_BITS * 2;
+    
+    if (shift >= bits2) {
+        return {uint64_t(0u), v.d.v23 >> (shift-bits2)};
+    } else {
+        return {v.d.v23 >> shift,
+                shift ? (v.d.v01 >> shift) | (v.d.v23 << (bits2-shift))
+                      : v.d.v01};
+    }
+}
+#endif
+
+} // namespace pcg_extras
+} // namespace arrow_vendored
+
+#endif // PCG_UINT128_HPP_INCLUDED
diff --git a/pyarrow/include/arrow/vendored/portable-snippets/debug-trap.h b/pyarrow/include/arrow/vendored/portable-snippets/debug-trap.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d039064d6ab3f56a2215ae50d1b0286db61ddfe
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/portable-snippets/debug-trap.h
@@ -0,0 +1,83 @@
+/* Debugging assertions and traps
+ * Portable Snippets - https://github.com/nemequ/portable-snippets
+ * Created by Evan Nemerson <evan@nemerson.com>
+ *
+ *   To the extent possible under law, the authors have waived all
+ *   copyright and related or neighboring rights to this code.  For
+ *   details, see the Creative Commons Zero 1.0 Universal license at
+ *   https://creativecommons.org/publicdomain/zero/1.0/
+ */
+
+#if !defined(PSNIP_DEBUG_TRAP_H)
+#define PSNIP_DEBUG_TRAP_H
+
+#if !defined(PSNIP_NDEBUG) && defined(NDEBUG) && !defined(PSNIP_DEBUG)
+#  define PSNIP_NDEBUG 1
+#endif
+
+#if defined(__has_builtin) && !defined(__ibmxl__)
+#  if __has_builtin(__builtin_debugtrap)
+#    define psnip_trap() __builtin_debugtrap()
+#  elif __has_builtin(__debugbreak)
+#    define psnip_trap() __debugbreak()
+#  endif
+#endif
+#if !defined(psnip_trap)
+#  if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#    define psnip_trap() __debugbreak()
+#  elif defined(__ARMCC_VERSION)
+#    define psnip_trap() __breakpoint(42)
+#  elif defined(__ibmxl__) || defined(__xlC__)
+#    include <builtins.h>
+#    define psnip_trap() __trap(42)
+#  elif defined(__DMC__) && defined(_M_IX86)
+     static inline void psnip_trap(void) { __asm int 3h; }
+#  elif defined(__i386__) || defined(__x86_64__)
+     static inline void psnip_trap(void) { __asm__ __volatile__("int $03"); }
+#  elif defined(__thumb__)
+     static inline void psnip_trap(void) { __asm__ __volatile__(".inst 0xde01"); }
+#  elif defined(__aarch64__)
+     static inline void psnip_trap(void) { __asm__ __volatile__(".inst 0xd4200000"); }
+#  elif defined(__arm__)
+     static inline void psnip_trap(void) { __asm__ __volatile__(".inst 0xe7f001f0"); }
+#  elif defined (__alpha__) && !defined(__osf__)
+     static inline void psnip_trap(void) { __asm__ __volatile__("bpt"); }
+#  elif defined(_54_)
+     static inline void psnip_trap(void) { __asm__ __volatile__("ESTOP"); }
+#  elif defined(_55_)
+     static inline void psnip_trap(void) { __asm__ __volatile__(";\n .if (.MNEMONIC)\n ESTOP_1\n .else\n ESTOP_1()\n .endif\n NOP"); }
+#  elif defined(_64P_)
+     static inline void psnip_trap(void) { __asm__ __volatile__("SWBP 0"); }
+#  elif defined(_6x_)
+     static inline void psnip_trap(void) { __asm__ __volatile__("NOP\n .word 0x10000000"); }
+#  elif defined(__STDC_HOSTED__) && (__STDC_HOSTED__ == 0) && defined(__GNUC__)
+#    define psnip_trap() __builtin_trap()
+#  else
+#    include <signal.h>
+#    if defined(SIGTRAP)
+#      define psnip_trap() raise(SIGTRAP)
+#    else
+#      define psnip_trap() raise(SIGABRT)
+#    endif
+#  endif
+#endif
+
+#if defined(HEDLEY_LIKELY)
+#  define PSNIP_DBG_LIKELY(expr) HEDLEY_LIKELY(expr)
+#elif defined(__GNUC__) && (__GNUC__ >= 3)
+#  define PSNIP_DBG_LIKELY(expr) __builtin_expect(!!(expr), 1)
+#else
+#  define PSNIP_DBG_LIKELY(expr) (!!(expr))
+#endif
+
+#if !defined(PSNIP_NDEBUG) || (PSNIP_NDEBUG == 0)
+#  define psnip_dbg_assert(expr) do { \
+    if (!PSNIP_DBG_LIKELY(expr)) { \
+      psnip_trap(); \
+    } \
+  } while (0)
+#else
+#  define psnip_dbg_assert(expr)
+#endif
+
+#endif /* !defined(PSNIP_DEBUG_TRAP_H) */
diff --git a/pyarrow/include/arrow/vendored/safeint/safe_math.h b/pyarrow/include/arrow/vendored/safeint/safe_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8100390ba8fc94b4ce706f8a44cd0260f4a0a61
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/safeint/safe_math.h
@@ -0,0 +1,1034 @@
+// Licensed under the MIT License.
+// Copyright David LeBlanc - dcl@dleblanc.net
+
+#if !defined SAFE_MATH_H
+#define SAFE_MATH_H
+
+#if defined SAFEINT_HPP
+#error use either the C++ SafeInt, or safe_math, not both
+#endif
+
+// C wants a prototype, if all warnings enabled
+// #if !defined SAFE_MATH_FAIL_DEFINED
+// static inline void safe_math_fail(const char* msg);
+// #endif
+
+#include "safe_math_impl.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/*
+	The following functions are defined in safe_math_impl.h:
+
+	// All check_cast functions return 0 if safe, non-zero if unsafe
+	// Casting test to int8
+	int check_cast_int8_int32(int32_t in)
+	int check_cast_int8_uint32(uint32_t in)
+	int check_cast_int8_int64(int64_t in)
+	int check_cast_int8_uint64(uint64_t in)
+
+	// Casting to int64
+	int check_cast_int16_int32(int32_t in)
+	int check_cast_int16_uint32(uint32_t in)
+	int check_cast_int16_int64(int64_t in)
+	int check_cast_int16_uint64(uint64_t in)
+
+	// Casting to int32
+	int check_cast_int32_uint32(uint32_t in)
+	int check_cast_int32_int64(int64_t in)
+	int check_cast_int32_uint64(uint64_t in)
+	int check_cast_int64_uint64(uint64_t in)
+
+	// Casting to uint8
+	int check_cast_uint8_int32(int32_t in)
+	int check_cast_uint8_uint32(uint32_t in)
+	int check_cast_uint8_int64(int64_t in)
+	int check_cast_uint8_uint64(uint64_t in)
+
+	// Casting to uint16
+	int check_cast_uint16_int32(int32_t in)
+	int check_cast_uint16_uint32(uint32_t in)
+	int check_cast_uint16_int64(int64_t in)
+	int check_cast_uint16_uint64(uint64_t in)
+
+	// Casting to uint32
+	int check_cast_uint32_int32(int32_t in)
+	int check_cast_uint32_int64(int64_t in)
+	int check_cast_uint32_uint64(uint64_t in)
+
+	// Casting to uint64
+	int check_cast_uint64_int64(int64_t in)
+
+	// safe_cast functions all abort on failure
+
+	// Casting to int8
+	int8_t safe_cast_int8_int32(int32_t in)
+	int8_t safe_cast_int8_uint32(uint32_t in)
+	int8_t safe_cast_int8_int64(int64_t in)
+	int8_t safe_cast_int8_uint64(uint64_t in)
+
+	// Casting to int16
+	int16_t safe_cast_int16_int32(int32_t in)
+	int16_t safe_cast_int16_uint32(uint32_t in)
+	int16_t safe_cast_int16_int64(int64_t in)
+	int16_t safe_cast_int16_uint64(uint64_t in)
+
+	// Casting to int32
+	int32_t safe_cast_int32_uint32(uint32_t in)
+	int32_t safe_cast_int32_int64(int64_t in)
+	int32_t safe_cast_int32_uint64(uint64_t in)
+
+	// Casting to int64
+	int64_t safe_cast_int64_uint64(uint64_t in)
+
+	// Casting to uint8
+	uint8_t safe_cast_uint8_int32(int32_t in)
+	uint8_t safe_cast_uint8_uint32(uint32_t in)
+	uint8_t safe_cast_uint8_int64(int64_t in)
+	uint8_t safe_cast_uint8_uint64(uint64_t in)
+
+	// Casting to uint16
+	uint16_t safe_cast_uint16_int32(int32_t in)
+	uint16_t safe_cast_uint16_uint32(uint32_t in)
+	uint16_t safe_cast_uint16_int64(int64_t in)
+	uint16_t safe_cast_uint16_uint64(uint64_t in)
+
+	// Casting to uint32
+	uint32_t safe_cast_uint32_int32(int32_t in)
+	uint32_t safe_cast_uint32_int64(int64_t in)
+	uint32_t safe_cast_uint32_uint64(uint64_t in)
+
+	// Casting to uint64
+	uint64_t safe_cast_uint64_int64(int64_t in)
+
+	// Only 32-bit or larger types are supported for 
+	// addition, subtraction, multiplication and division
+
+	// If smaller types are needed, either wrap the result
+	// in a safe_cast, or pass the smaller type in as a
+	// 32-bit type of the same signedness
+
+	// Addition functions, all of these abort on failure
+	// For all of the below, there are also non-aborting versions
+	// that have the signature of:
+	//
+	// bool check_op_intXX_intYY(intXX a, intYY b, intXX* ret)
+
+	int32_t safe_add_int32_int32(int32_t a, int32_t b)
+	int32_t safe_add_int32_uint32(int32_t a, uint32_t b)
+	int32_t safe_add_int32_int64(int32_t a, int64_t b)
+	int32_t safe_add_int32_uint64(int32_t a, uint64_t b)
+
+	uint32_t safe_add_uint32_int32(uint32_t a, int32_t b)
+	uint32_t safe_add_uint32_uint32(uint32_t a, uint32_t b)
+	uint32_t safe_add_uint32_int64(uint32_t a, int64_t b)
+	uint32_t safe_add_uint32_uint64(uint32_t a, uint64_t b)
+
+	int64_t safe_add_int64_int32(int64_t a, int32_t b)
+	int64_t safe_add_int64_uint32(int64_t a, uint32_t b)
+	int64_t safe_add_int64_int64(int64_t a, int64_t b)
+	int64_t safe_add_int64_uint64(int64_t a, uint64_t b)
+
+	uint64_t safe_add_uint64_int32(uint64_t a, int32_t b)
+	uint64_t safe_add_uint64_uint32(uint64_t a, uint32_t b)
+	uint64_t safe_add_uint64_int64(uint64_t a, int64_t b)
+	uint64_t safe_add_uint64_uint64(uint64_t a, uint64_t b)
+
+	// Multiplication
+	int32_t safe_div_int32_int32(int32_t a, int32_t b)
+	int32_t safe_div_int32_uint32(int32_t a, uint32_t b)
+	int32_t safe_div_int32_int64(int32_t a, int64_t b)
+	int32_t safe_div_int32_uint64(int32_t a, uint64_t b)
+
+	uint32_t safe_div_uint32_int32(uint32_t a, int32_t b)
+	uint32_t safe_div_uint32_uint32(uint32_t a, uint32_t b)
+	uint32_t safe_div_uint32_int64(uint32_t a, int64_t b)
+	uint32_t safe_div_uint32_uint64(uint32_t a, uint64_t b)
+
+	int64_t safe_div_int64_int32(int64_t a, int32_t b)
+	int64_t safe_div_int64_uint32(int64_t a, uint32_t b)
+	int64_t safe_div_int64_int64(int64_t a, int64_t b)
+	int64_t safe_div_int64_uint64(int64_t a, uint64_t b)
+
+	uint64_t safe_div_uint64_int32(uint64_t a, int32_t b)
+	uint64_t safe_div_uint64_uint32(uint64_t a, uint32_t b)
+	uint64_t safe_div_uint64_int64(uint64_t a, int64_t b)
+	uint64_t safe_div_uint64_uint64(uint64_t a, uint64_t b)
+
+	// Division
+	int32_t safe_div_int32_int32(int32_t a, int32_t b)
+	int32_t safe_div_int32_uint32(int32_t a, uint32_t b)
+	int32_t safe_div_int32_int64(int32_t a, int64_t b)
+	int32_t safe_div_int32_uint64(int32_t a, uint64_t b)
+
+	uint32_t safe_div_uint32_int32(uint32_t a, int32_t b)
+	uint32_t safe_div_uint32_uint32(uint32_t a, uint32_t b)
+	uint32_t safe_div_uint32_int64(uint32_t a, int64_t b)
+	uint32_t safe_div_uint32_uint64(uint32_t a, uint64_t b)
+
+	int64_t safe_div_int64_int32(int64_t a, int32_t b)
+	int64_t safe_div_int64_uint32(int64_t a, uint32_t b)
+	int64_t safe_div_int64_int64(int64_t a, int64_t b)
+	int64_t safe_div_int64_uint64(int64_t a, uint64_t b)
+
+	uint64_t safe_div_uint64_int32(uint64_t a, int32_t b)
+	uint64_t safe_div_uint64_uint32(uint64_t a, uint32_t b)
+	uint64_t safe_div_uint64_int64(uint64_t a, int64_t b)
+	uint64_t safe_div_uint64_uint64(uint64_t a, uint64_t b)
+
+	// Subtraction
+	int32_t safe_sub_int32_int32(int32_t a, int32_t b)
+	int32_t safe_sub_int32_uint32(int32_t a, uint32_t b)
+	int32_t safe_sub_int32_int64(int32_t a, int64_t b)
+	int32_t safe_sub_int32_uint64(int32_t a, uint64_t b)
+
+	uint32_t safe_sub_uint32_int32(uint32_t a, int32_t b)
+	uint32_t safe_sub_uint32_uint32(uint32_t a, uint32_t b)
+	uint32_t safe_sub_uint32_int64(uint32_t a, int64_t b)
+	uint32_t safe_sub_uint32_uint64(uint32_t a, uint64_t b)
+
+	int64_t safe_sub_int64_int32(int64_t a, int32_t b)
+	int64_t safe_sub_int64_uint32(int64_t a, uint32_t b)
+	int64_t safe_sub_int64_int64(int64_t a, int64_t b)
+	int64_t safe_sub_int64_uint64(int64_t a, uint64_t b)
+
+	uint64_t safe_sub_uint64_int32(uint64_t a, int32_t b)
+	uint64_t safe_sub_uint64_uint32(uint64_t a, uint32_t b)
+	uint64_t safe_sub_uint64_int64(uint64_t a, int64_t b)
+	uint64_t safe_sub_uint64_uint64(uint64_t a, uint64_t b)
+*/
+
+// Do some sorting out of standard types and sizes
+
+#if CHAR_MIN != 0
+#define SAFE_MATH_SIGNED_CHAR 1
+#else
+#define SAFE_MATH_SIGNED_CHAR 0
+#endif
+
+#if LONG_MAX == LLONG_MAX
+#define SAFE_MATH_LONG 64
+#else
+#define SAFE_MATH_LONG 32
+#endif
+
+// Not going to support odd sizes of things
+extern char SAFE_MATH_CHECK_SHORT_IS_16[1 / ((sizeof(short)-2) ? 0 : 1)];
+extern char SAFE_MATH_CHECK_INT_IS_32[1 / ((sizeof(int) - 4) ? 0 : 1)];
+
+// In order to help keep people from making mistakes by 
+// incorrectly guessing which types match which of the intXX types,
+// make some functions.
+
+// Cast to char, char might be signed or unsigned
+#if SAFE_MATH_SIGNED_CHAR
+static inline char safe_cast_char_int(int in) { return safe_cast_int8_int32(in); }
+static inline char safe_cast_char_uint(unsigned int in) { return safe_cast_int8_uint32(in); }
+
+static inline int check_cast_char_int(int in) { return safe_cast_int8_int32(in); }
+static inline int check_cast_char_uint(unsigned int in) { return safe_cast_int8_uint32(in); }
+
+#if SAFE_MATH_LONG == 64
+static inline char safe_cast_char_long(long in) { return safe_cast_int8_int64(in); }
+static inline int check_cast_char_long(long in) { return check_cast_int8_int64(in); }
+#else
+static inline char safe_cast_char_long(long in) { return safe_cast_int8_int32(in); }
+static inline int check_cast_char_long(long in) { return check_cast_int8_int32(in); }
+#endif
+
+static inline char safe_cast_char_longlong(long long in) { return safe_cast_int8_int64(in); }
+static inline char safe_cast_char_ulonglong(unsigned long long in) { return safe_cast_int8_uint64(in); }
+
+static inline int check_cast_char_longlong(long long in) { return check_cast_int8_int64(in); }
+static inline int check_cast_char_ulonglong(unsigned long long in) { return check_cast_int8_uint64(in); }
+#else
+static inline char safe_cast_char_int(int in) { return safe_cast_uint8_int32(in); }
+static inline char safe_cast_char_uint(unsigned int in) { return safe_cast_uint8_uint32(in); }
+
+static inline int check_cast_char_int(int in) { return check_cast_uint8_int32(in); }
+static inline int check_cast_char_uint(unsigned int in) { return check_cast_uint8_uint32(in); }
+
+#if SAFE_MATH_LONG == 64
+static inline char safe_cast_char_long(long in) { return safe_cast_uint8_int64(in); }
+static inline int check_cast_char_long(long in) { return check_cast_uint8_int64(in); }
+#else
+static inline char safe_cast_char_long(long in) { return safe_cast_uint8_int32(in); }
+static inline int check_cast_char_long(long in) { return check_cast_uint8_int32(in); }
+#endif
+
+static inline char safe_cast_char_longlong(long long in) { return safe_cast_uint8_int64(in); }
+static inline char safe_cast_char_ulonglong(unsigned long long in) { return safe_cast_uint8_uint64(in); }
+
+static inline int check_cast_char_longlong(long long in) { return check_cast_uint8_int64(in); }
+static inline int check_cast_char_ulonglong(unsigned long long in) { return check_cast_uint8_uint64(in); }
+#endif
+
+// Signed char
+static inline signed char safe_cast_schar_int(int in) { return safe_cast_int8_int32(in); }
+static inline signed char safe_cast_schar_uint(unsigned int in) { return safe_cast_int8_uint32(in); }
+
+static inline int check_cast_schar_int(int in) { return check_cast_int8_int32(in); }
+static inline int check_cast_schar_uint(unsigned int in) { return check_cast_int8_uint32(in); }
+
+#if SAFE_MATH_LONG == 64
+static inline signed char safe_cast_schar_long(long in) { return safe_cast_int8_int64(in); }
+static inline int check_cast_schar_long(long in) { return check_cast_int8_int64(in); }
+#else
+static inline signed char safe_cast_schar_long(long in) { return safe_cast_int8_int32(in); }
+static inline int check_cast_schar_long(long in) { return check_cast_int8_int32(in); }
+#endif
+
+static inline signed char safe_cast_schar_longlong(long long in) { return safe_cast_int8_int64(in); }
+static inline signed char safe_cast_schar_ulonglong(unsigned long long in) { return safe_cast_int8_uint64(in); }
+
+static inline int check_cast_schar_longlong(long long in) { return check_cast_int8_int64(in); }
+static inline int check_cast_schar_ulonglong(unsigned long long in) { return check_cast_int8_uint64(in); }
+
+// Unsigned char
+static inline unsigned char safe_cast_uchar_int(int in) { return safe_cast_uint8_int32(in); }
+static inline unsigned char safe_cast_uchar_uint(unsigned int in) { return safe_cast_uint8_uint32(in); }
+
+static inline int check_cast_uchar_int(int in) { return check_cast_uint8_int32(in); }
+static inline int check_cast_uchar_uint(unsigned int in) { return check_cast_uint8_uint32(in); }
+
+#if SAFE_MATH_LONG == 64
+static inline unsigned char safe_cast_uchar_long(long in) { return safe_cast_uint8_int64(in); }
+static inline int check_cast_uchar_long(long in) { return check_cast_uint8_int64(in); }
+#else
+static inline unsigned char safe_cast_uchar_long(long in) { return safe_cast_uint8_int32(in); }
+static inline int check_cast_uchar_long(long in) { return check_cast_uint8_int32(in); }
+#endif
+
+static inline unsigned char safe_cast_uchar_longlong(long long in) { return safe_cast_uint8_int64(in); }
+static inline unsigned char safe_cast_uchar_ulonglong(unsigned long long in) { return safe_cast_uint8_uint64(in); }
+
+static inline int check_cast_uchar_longlong(long long in) { return check_cast_uint8_int64(in); }
+static inline int check_cast_uchar_ulonglong(unsigned long long in) { return check_cast_uint8_uint64(in); }
+
+// 16-bit signed casting
+static inline short safe_cast_short_int(int in) { return safe_cast_int16_int32(in); }
+static inline short safe_cast_short_uint(unsigned int in) { return safe_cast_int16_uint32(in); }
+
+static inline int check_cast_short_int(int in) { return check_cast_int16_int32(in); }
+static inline int check_cast_short_uint(unsigned int in) { return check_cast_int16_uint32(in); }
+#if SAFE_MATH_LONG == 64
+static inline short safe_cast_short_long(long in) { return safe_cast_int16_int64(in); }
+static inline int check_cast_short_long(long in) { return check_cast_int16_int64(in); }
+#else
+static inline short safe_cast_short_long(long in) { return safe_cast_int16_int32(in); }
+static inline int check_cast_short_long(long in) { return check_cast_int16_int32(in); }
+#endif
+
+static inline short safe_cast_short_longlong(long long in) { return safe_cast_int16_int64(in); }
+static inline short safe_cast_short_ulonglong(unsigned long long in) { return safe_cast_int16_uint64(in); }
+
+static inline int check_cast_short_longlong(long long in) { return check_cast_int16_int64(in); }
+static inline int check_cast_short_ulonglong(unsigned long long in) { return check_cast_int16_uint64(in); }
+
+// 16-bit unsigned casting
+static inline unsigned short safe_cast_ushort_int(int in) { return safe_cast_uint16_int32(in); }
+static inline unsigned short safe_cast_ushort_uint(unsigned int in) { return safe_cast_uint16_uint32(in); }
+
+static inline int check_cast_ushort_int(int in) { return check_cast_uint16_int32(in); }
+static inline int check_cast_ushort_uint(unsigned int in) { return check_cast_uint16_uint32(in); }
+#if SAFE_MATH_LONG == 64
+static inline unsigned short safe_cast_ushort_long(long in) { return safe_cast_uint16_int64(in); }
+static inline int check_cast_ushort_long(long in) { return check_cast_uint16_int64(in); }
+#else
+static inline unsigned short safe_cast_ushort_long(long in) { return safe_cast_uint16_int32(in); }
+static inline int check_cast_ushort_long(long in) { return check_cast_uint16_int32(in); }
+#endif
+
+static inline unsigned short safe_cast_ushort_longlong(long long in) { return safe_cast_uint16_int64(in); }
+static inline unsigned short safe_cast_ushort_ulonglong(unsigned long long in) { return safe_cast_uint16_uint64(in); }
+
+static inline int check_cast_ushort_longlong(long long in) { return check_cast_uint16_int64(in); }
+static inline int check_cast_ushort_ulonglong(unsigned long long in) { return check_cast_uint16_uint64(in); }
+
+// Cast to int
+static inline int safe_cast_int_uint(unsigned int in) { return safe_cast_int32_uint32(in); }
+static inline int check_cast_int_uint(unsigned int in) { return check_cast_int32_uint32(in); }
+
+#if SAFE_MATH_LONG == 64
+static inline int safe_cast_int_long(long in) { return safe_cast_int32_int64(in); }
+static inline int safe_cast_int_ulong(unsigned long in) { return safe_cast_int32_uint64(in); }
+
+static inline int check_cast_int_long(long in) { return check_cast_int32_int64(in); }
+static inline int check_cast_int_ulong(unsigned long in) { return check_cast_int32_uint64(in); }
+#else
+static inline int safe_cast_int_long(long in) { return in; }
+static inline int safe_cast_int_ulong(unsigned long in) { return safe_cast_int32_uint32(in); }
+
+static inline int check_cast_int_long(long in) { (void)in;  return 0; }
+static inline int check_cast_int_ulong(unsigned long in) { return check_cast_int32_uint32(in); }
+#endif
+
+static inline int safe_cast_int_longlong(long long in) { return safe_cast_int32_int64(in); }
+static inline int safe_cast_int_ulonglong(unsigned long long in) { return safe_cast_int32_uint64(in); }
+
+static inline int check_cast_int_longlong(long long in) { return check_cast_int32_int64(in); }
+static inline int check_cast_int_ulonglong(unsigned long long in) { return check_cast_int32_uint64(in); }
+
+// Cast to unsigned int
+static inline unsigned int safe_cast_uint_int(int in) { return safe_cast_uint32_int32(in); }
+static inline int check_cast_uint_int(int in) { return check_cast_uint32_int32(in); }
+#if SAFE_MATH_LONG == 64
+static inline unsigned int safe_cast_uint_long(long in) { return safe_cast_uint32_int64(in); }
+static inline unsigned int safe_cast_uint_ulong(unsigned long in) { return safe_cast_uint32_uint64(in); }
+
+static inline int check_cast_uint_long(long in) { return check_cast_uint32_int64(in); }
+static inline int check_cast_uint_ulong(unsigned long in) { return check_cast_uint32_uint64(in); }
+#else
+static inline unsigned int safe_cast_uint_long(long in) { return safe_cast_uint32_int32(in); }
+static inline unsigned int safe_cast_uint_ulong(unsigned long in) { return in; }
+
+static inline int check_cast_uint_long(long in) { return check_cast_uint32_int32(in); }
+static inline int check_cast_uint_ulong(unsigned long in) { (void)in; return 0; }
+#endif
+
+static inline unsigned int safe_cast_uint_longlong(long long in) { return safe_cast_uint32_int64(in); }
+static inline unsigned int safe_cast_uint_ulonglong(unsigned long long in) { return safe_cast_uint32_uint64(in); }
+
+static inline int check_cast_uint_longlong(long long in) { return check_cast_uint32_int64(in); }
+static inline int check_cast_uint_ulonglong(unsigned long long in) { return check_cast_uint32_uint64(in); }
+
+// Cast to long
+// Also have to keep parity in the case of different compilations
+// of the same code.
+#if SAFE_MATH_LONG == 64
+static inline long safe_cast_long_ulong(unsigned long in) { return safe_cast_int64_uint64(in); }
+static inline long safe_cast_long_longlong(long long in) { return in; }
+static inline long safe_cast_long_ulonglong(unsigned long long in) { return safe_cast_int64_uint64(in); }
+
+static inline int check_cast_long_ulong(unsigned long in) { return check_cast_int64_uint64(in); }
+static inline int check_cast_long_longlong(long long in) { (void)in; return 0; }
+static inline int check_cast_long_ulonglong(unsigned long long in) { return check_cast_int64_uint64(in); }
+
+static inline unsigned long safe_cast_ulong_long(long in) { return safe_cast_uint64_int64(in); }
+static inline unsigned long safe_cast_ulong_ulonglong(unsigned long long in) { return in; }
+static inline unsigned long safe_cast_ulong_longlong(long long in) { return safe_cast_uint64_int64(in); }
+
+static inline int check_cast_ulong_long(long in) { return check_cast_uint64_int64(in); }
+static inline int check_cast_ulong_ulonglong(unsigned long long in) { (void)in; return 0; }
+static inline int check_cast_ulong_longlong(long long in) { return check_cast_uint64_int64(in); }
+#else
+static inline long safe_cast_long_ulong(unsigned long in) { return safe_cast_int32_uint32(in); }
+static inline long safe_cast_long_longlong(long long in) { return safe_cast_int32_int64(in); }
+static inline long safe_cast_long_ulonglong(unsigned long long in) { return safe_cast_int32_uint64(in); }
+
+static inline int check_cast_long_ulong(unsigned long in) { return check_cast_int32_uint32(in); }
+static inline int check_cast_long_longlong(long long in) { return check_cast_int32_int64(in); }
+static inline int check_cast_long_ulonglong(unsigned long long in) { return check_cast_int32_uint64(in); }
+
+static inline unsigned long safe_cast_ulong_long(long in) { return safe_cast_uint32_int32(in); }
+static inline unsigned long safe_cast_ulong_ulonglong(unsigned long long in) { return safe_cast_uint32_uint64(in); }
+static inline unsigned long safe_cast_ulong_longlong(long long in) { return safe_cast_uint32_int64(in); }
+
+static inline int check_cast_ulong_long(long in) { return check_cast_uint32_int32(in); }
+static inline int check_cast_ulong_ulonglong(unsigned long long in) { return check_cast_uint32_uint64(in); }
+static inline int check_cast_ulong_longlong(long long in) { return check_cast_uint32_int64(in); }
+#endif
+
+// And long long
+static inline long long safe_cast_longlong_ulonglong(unsigned long long in) { return safe_cast_int64_uint64(in); }
+static inline unsigned long long safe_cast_ulonglong_longlong(long long in) { return safe_cast_uint64_int64(in); }
+
+static inline int check_cast_longlong_ulonglong(unsigned long long in) { return check_cast_int64_uint64(in); }
+static inline int check_cast_ulonglong_longlong(long long in) { return check_cast_uint64_int64(in); }
+
+// Addition
+static inline int safe_add_int_int(int a, int b) { return safe_add_int32_int32(a, b); }
+static inline int safe_add_int_uint(int a, unsigned int b) { return safe_add_int32_uint32(a, b); }
+static inline int safe_add_int_longlong(int a, long long b) { return safe_add_int32_int64(a, b); }
+static inline int safe_add_int_ulonglong(int a, unsigned long long b) { return safe_add_int32_uint64(a, b); }
+
+static inline bool check_add_int_int(int a, int b, int* ret) { return check_add_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_add_int_uint(int a, unsigned int b, int* ret) { return check_add_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_add_int_longlong(int a, long long b, int* ret) { return check_add_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_add_int_ulonglong(int a, unsigned long long b, int* ret) { return check_add_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_add_uint_int(unsigned int a, int b) { return safe_add_uint32_int32(a, b); }
+static inline unsigned int safe_add_uint_uint(unsigned int a, unsigned int b) { return safe_add_uint32_uint32(a, b); }
+static inline unsigned int safe_add_uint_longlong(unsigned int a, long long b) { return safe_add_uint32_int64(a, b); }
+static inline unsigned int safe_add_uint_ulonglong(unsigned int a, unsigned long long b) { return safe_add_uint32_uint64(a, b); }
+
+static inline bool check_add_uint_int(unsigned int a, int b, unsigned int* ret) { return check_add_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_add_uint_uint(unsigned int a, unsigned int b, unsigned int* ret) { return check_add_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_add_uint_longlong(unsigned int a, long long b, unsigned int* ret) { return check_add_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_add_uint_ulonglong(unsigned int a, unsigned long long b, unsigned int* ret) { return check_add_uint32_uint64(a, b, (uint32_t*)ret); }
+
+#if SAFE_MATH_LONG == 64
+static inline int safe_add_int_long(int a, long b) { return safe_add_int32_int64(a, b); }
+static inline int safe_add_int_ulong(int a, unsigned long b) { return safe_add_int32_uint64(a, b); }
+
+static inline bool check_add_int_long(int a, long b, int* ret) { return check_add_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_add_int_ulong(int a, unsigned long b, int* ret) { return check_add_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_add_uint_long(unsigned int a, long b) { return safe_add_uint32_int64(a, b); }
+static inline unsigned int safe_add_uint_ulong(unsigned int a, unsigned long b) { return safe_add_uint32_uint64(a, b); }
+
+static inline bool check_add_uint_long(unsigned int a, long b, unsigned int* ret) { return check_add_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_add_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_add_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long safe_add_long_int(long a, int b) { return safe_add_int64_int32(a, b); }
+static inline long safe_add_long_uint(long a, unsigned int b) { return safe_add_int64_uint32(a, b); }
+static inline long safe_add_long_long(long a, long b) { return safe_add_int64_int64(a, b); }
+static inline long safe_add_long_ulong(long a, unsigned long b) { return safe_add_int64_uint64(a, b); }
+static inline long safe_add_long_longlong(long a, long long b) { return safe_add_int64_int64(a, b); }
+static inline long safe_add_long_ulonglong(long a, unsigned long long b) { return safe_add_int64_uint64(a, b); }
+
+static inline bool check_add_long_int(long a, int b, long* ret) { return check_add_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_add_long_uint(long a, unsigned int b, long* ret) { return check_add_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_add_long_long(long a, long b, long* ret) { return check_add_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_add_long_ulong(long a, unsigned long b, long* ret) { return check_add_int64_uint64(a, b, (int64_t*)ret); }
+static inline bool check_add_long_longlong(long a, long long b, long* ret) { return check_add_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_add_long_ulonglong(long a, unsigned long long b, long* ret) { return check_add_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long safe_add_ulong_int(unsigned long a, int b) { return safe_add_uint64_int32(a, b); }
+static inline unsigned long safe_add_ulong_uint(unsigned long a, unsigned int b) { return safe_add_uint64_uint32(a, b); }
+static inline unsigned long safe_add_ulong_long(unsigned long a, long b) { return safe_add_uint64_int64(a, b); }
+static inline unsigned long safe_add_ulong_ulong(unsigned long a, unsigned long b) { return safe_add_uint64_uint64(a, b); }
+static inline unsigned long safe_add_ulong_longlong(unsigned long a, long long b) { return safe_add_uint64_int64(a, b); }
+static inline unsigned long safe_add_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_add_uint64_uint64(a, b); }
+
+static inline bool check_add_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_add_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_add_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_add_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_add_uint64_uint64(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_add_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_add_uint64_uint64(a, b, (uint64_t*)ret); }
+
+static inline long long safe_add_longlong_long(long long a, long b) { return safe_add_int64_int64(a, b); }
+static inline long long safe_add_longlong_ulong(long long a, unsigned long b) { return safe_add_int64_uint64(a, b); }
+
+static inline bool check_add_longlong_long(long long a, long b, long long* ret) { return check_add_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_add_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_add_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_add_ulonglong_long(unsigned long long a, long b) { return safe_add_uint64_int64(a, b); }
+static inline unsigned long long safe_add_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_add_uint64_uint64(a, b); }
+
+static inline bool check_add_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_add_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_add_uint64_uint64(a, b, (uint64_t*)ret); }
+#else
+static inline int safe_add_int_long(int a, long b) { return safe_add_int32_int32(a, b); }
+static inline int safe_add_int_ulong(int a, unsigned long b) { return safe_add_int32_uint32(a, b); }
+
+static inline bool check_add_int_long(int a, long b, int* ret) { return check_add_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_add_int_ulong(int a, unsigned long b, int* ret) { return check_add_int32_uint32(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_add_uint_long(unsigned int a, long b) { return safe_add_uint32_int32(a, b); }
+static inline unsigned int safe_add_uint_ulong(unsigned int a, unsigned long b) { return safe_add_uint32_uint32(a, b); }
+
+static inline bool check_add_uint_long(unsigned int a, long b, unsigned int* ret) { return check_add_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_add_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_add_uint32_uint32(a, b, (uint32_t*)ret); }
+
+static inline long safe_add_long_int(long a, int b) { return safe_add_int32_int32(a, b); }
+static inline long safe_add_long_uint(long a, unsigned int b) { return safe_add_int32_uint32(a, b); }
+static inline long safe_add_long_long(long a, long b) { return safe_add_int32_int32(a, b); }
+static inline long safe_add_long_ulong(long a, unsigned long b) { return safe_add_int32_uint32(a, b); }
+static inline long safe_add_long_longlong(long a, long long b) { return safe_add_int32_int64(a, b); }
+static inline long safe_add_long_ulonglong(long a, unsigned long long b) { return safe_add_int32_uint64(a, b); }
+
+static inline bool check_add_long_int(long a, int b, long* ret) { return check_add_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_add_long_uint(long a, unsigned int b, long* ret) { return check_add_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_add_long_long(long a, long b, long* ret) { return check_add_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_add_long_ulong(long a, unsigned long b, long* ret) { return check_add_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_add_long_longlong(long a, long long b, long* ret) { return check_add_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_add_long_ulonglong(long a, unsigned long long b, long* ret) { return check_add_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned long safe_add_ulong_int(unsigned long a, int b) { return safe_add_uint32_int32(a, b); }
+static inline unsigned long safe_add_ulong_uint(unsigned long a, unsigned int b) { return safe_add_uint32_uint32(a, b); }
+static inline unsigned long safe_add_ulong_long(unsigned long a, long b) { return safe_add_uint32_int32(a, b); }
+static inline unsigned long safe_add_ulong_ulong(unsigned long a, unsigned long b) { return safe_add_uint32_uint32(a, b); }
+static inline unsigned long safe_add_ulong_longlong(unsigned long a, long long b) { return safe_add_uint32_int64(a, b); }
+static inline unsigned long safe_add_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_add_uint32_uint64(a, b); }
+
+static inline bool check_add_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_add_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_add_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_add_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_add_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_add_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_add_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_add_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_add_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_add_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_add_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_add_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long long safe_add_longlong_long(long long a, long b) { return safe_add_int64_int32(a, b); }
+static inline long long safe_add_longlong_ulong(long long a, unsigned long b) { return safe_add_int64_uint32(a, b); }
+
+static inline bool check_add_longlong_long(long long a, long b, long long* ret) { return check_add_int64_int32(a, b, ret); }
+static inline bool check_add_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_add_int64_uint32(a, b, ret); }
+
+static inline unsigned long long safe_add_ulonglong_long(unsigned long long a, long b) { return safe_add_uint64_int32(a, b); }
+static inline unsigned long long safe_add_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_add_uint64_uint32(a, b); }
+
+static inline bool check_add_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_add_uint64_int32(a, b, ret); }
+static inline bool check_add_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_add_uint64_uint32(a, b, ret); }
+#endif
+
+static inline long long safe_add_longlong_int(long long a, int b) { return safe_add_int64_int32(a, b); }
+static inline long long safe_add_longlong_uint(long long a, unsigned int b) { return safe_add_int64_uint32(a, b); }
+static inline long long safe_add_longlong_longlong(long long a, long long b) { return safe_add_int64_int64(a, b); }
+static inline long long safe_add_longlong_ulonglong(long long a, unsigned long long b) { return safe_add_int64_uint64(a, b); }
+
+static inline bool check_add_longlong_int(long long a, int b, long long* ret) { return check_add_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_add_longlong_uint(long long a, unsigned int b, long long* ret) { return check_add_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_add_longlong_longlong(long long a, long long b, long long* ret) { return check_add_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_add_longlong_ulonglong(long long a, unsigned long long b, long long* ret) { return check_add_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_add_ulonglong_int(unsigned long long a, int b) { return safe_add_uint64_int32(a, b); }
+static inline unsigned long long safe_add_ulonglong_uint(unsigned long long a, unsigned int b) { return safe_add_uint64_uint32(a, b); }
+static inline unsigned long long safe_add_ulonglong_longlong(unsigned long long a, long long b) { return safe_add_uint64_int64(a, b); }
+static inline unsigned long long safe_add_ulonglong_ulonglong(unsigned long long a, unsigned long long b) { return safe_add_uint64_uint64(a, b); }
+
+static inline bool check_add_ulonglong_int(unsigned long long a, int b, unsigned long long* ret) { return check_add_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulonglong_uint(unsigned long long a, unsigned int b, unsigned long long* ret) { return check_add_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulonglong_longlong(unsigned long long a, long long b, unsigned long long* ret) { return check_add_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_add_ulonglong_ulonglong(unsigned long long a, unsigned long long b, unsigned long long* ret) { return check_add_uint64_uint64(a, b, (uint64_t*)ret); }
+
+// Multiplication
+static inline int safe_mul_int_int(int a, int b) { return safe_mul_int32_int32(a, b); }
+static inline int safe_mul_int_uint(int a, unsigned int b) { return safe_mul_int32_uint32(a, b); }
+static inline int safe_mul_int_longlong(int a, long long b) { return safe_mul_int32_int64(a, b); }
+static inline int safe_mul_int_ulonglong(int a, unsigned long long b) { return safe_mul_int32_uint64(a, b); }
+
+static inline bool check_mul_int_int(int a, int b, int* ret) { return check_mul_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_mul_int_uint(int a, unsigned int b, int* ret) { return check_mul_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_mul_int_longlong(int a, long long b, int* ret) { return check_mul_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_mul_int_ulonglong(int a, unsigned long long b, int* ret) { return check_mul_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_mul_uint_int(unsigned int a, int b) { return safe_mul_uint32_int32(a, b); }
+static inline unsigned int safe_mul_uint_uint(unsigned int a, unsigned int b) { return safe_mul_uint32_uint32(a, b); }
+static inline unsigned int safe_mul_uint_longlong(unsigned int a, long long b) { return safe_mul_uint32_int64(a, b); }
+static inline unsigned int safe_mul_uint_ulonglong(unsigned int a, unsigned long long b) { return safe_mul_uint32_uint64(a, b); }
+
+static inline bool check_mul_uint_int(unsigned int a, int b, unsigned int* ret) { return check_mul_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_uint_uint(unsigned int a, unsigned int b, unsigned int* ret) { return check_mul_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_uint_longlong(unsigned int a, long long b, unsigned int* ret) { return check_mul_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_mul_uint_ulonglong(unsigned int a, unsigned long long b, unsigned int* ret) { return check_mul_uint32_uint64(a, b, (uint32_t*)ret); }
+
+#if SAFE_MATH_LONG == 64
+static inline int safe_mul_int_long(int a, long b) { return safe_mul_int32_int64(a, b); }
+static inline int safe_mul_int_ulong(int a, unsigned long b) { return safe_mul_int32_uint64(a, b); }
+
+static inline bool check_mul_int_long(int a, long b, int* ret) { return check_mul_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_mul_int_ulong(int a, unsigned long b, int* ret) { return check_mul_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_mul_uint_long(unsigned int a, long b) { return safe_mul_uint32_int64(a, b); }
+static inline unsigned int safe_mul_uint_ulong(unsigned int a, unsigned long b) { return safe_mul_uint32_uint64(a, b); }
+
+static inline bool check_mul_uint_long(unsigned int a, long b, unsigned int* ret) { return check_mul_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_mul_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_mul_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long safe_mul_long_int(long a, int b) { return safe_mul_int64_int32(a, b); }
+static inline long safe_mul_long_uint(long a, unsigned int b) { return safe_mul_int64_uint32(a, b); }
+static inline long safe_mul_long_long(long a, long b) { return safe_mul_int64_int64(a, b); }
+static inline long safe_mul_long_ulong(long a, unsigned long b) { return safe_mul_int64_uint64(a, b); }
+static inline long safe_mul_long_longlong(long a, long long b) { return safe_mul_int64_int64(a, b); }
+static inline long safe_mul_long_ulonglong(long a, unsigned long long b) { return safe_mul_int64_uint64(a, b); }
+
+static inline bool check_mul_long_int(long a, int b, long* ret) { return check_mul_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_mul_long_uint(long a, unsigned int b, long* ret) { return check_mul_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_mul_long_long(long a, long b, long* ret) { return check_mul_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_mul_long_ulong(long a, unsigned long b, long* ret) { return check_mul_int64_uint64(a, b, (int64_t*)ret); }
+static inline bool check_mul_long_longlong(long a, long long b, long* ret) { return check_mul_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_mul_long_ulonglong(long a, unsigned long long b, long* ret) { return check_mul_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long safe_mul_ulong_int(unsigned long a, int b) { return safe_mul_uint64_int32(a, b); }
+static inline unsigned long safe_mul_ulong_uint(unsigned long a, unsigned int b) { return safe_mul_uint64_uint32(a, b); }
+static inline unsigned long safe_mul_ulong_long(unsigned long a, long b) { return safe_mul_uint64_int64(a, b); }
+static inline unsigned long safe_mul_ulong_ulong(unsigned long a, unsigned long b) { return safe_mul_uint64_uint64(a, b); }
+static inline unsigned long safe_mul_ulong_longlong(unsigned long a, long long b) { return safe_mul_uint64_int64(a, b); }
+static inline unsigned long safe_mul_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_mul_uint64_uint64(a, b); }
+
+static inline bool check_mul_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_mul_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_mul_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_mul_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_mul_uint64_uint64(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_mul_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_mul_uint64_uint64(a, b,(uint64_t*)ret); }
+
+static inline long long safe_mul_longlong_long(long long a, long b) { return safe_mul_int64_int64(a, b); }
+static inline long long safe_mul_longlong_ulong(long long a, unsigned long b) { return safe_mul_int64_uint64(a, b); }
+
+static inline bool check_mul_longlong_long(long long a, long b, long long* ret) { return check_mul_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_mul_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_mul_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_mul_ulonglong_long(unsigned long long a, long b) { return safe_mul_uint64_int64(a, b); }
+static inline unsigned long long safe_mul_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_mul_uint64_uint64(a, b); }
+
+static inline bool check_mul_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_mul_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_mul_uint64_uint64(a, b, (uint64_t*)ret); }
+#else
+static inline int safe_mul_int_long(int a, long b) { return safe_mul_int32_int32(a, b); }
+static inline int safe_mul_int_ulong(int a, unsigned long b) { return safe_mul_int32_uint32(a, b); }
+
+static inline bool check_mul_int_long(int a, long b, int* ret) { return check_mul_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_mul_int_ulong(int a, unsigned long b, int* ret) { return check_mul_int32_uint32(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_mul_uint_long(unsigned int a, long b) { return safe_mul_uint32_int32(a, b); }
+static inline unsigned int safe_mul_uint_ulong(unsigned int a, unsigned long b) { return safe_mul_uint32_uint32(a, b); }
+
+static inline bool check_mul_uint_long(unsigned int a, long b, unsigned int* ret) { return check_mul_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_mul_uint32_uint32(a, b, (uint32_t*)ret); }
+
+static inline long safe_mul_long_int(long a, int b) { return safe_mul_int32_int32(a, b); }
+static inline long safe_mul_long_uint(long a, unsigned int b) { return safe_mul_int32_uint32(a, b); }
+static inline long safe_mul_long_long(long a, long b) { return safe_mul_int32_int32(a, b); }
+static inline long safe_mul_long_ulong(long a, unsigned long b) { return safe_mul_int32_uint32(a, b); }
+static inline long safe_mul_long_longlong(long a, long long b) { return safe_mul_int32_int64(a, b); }
+static inline long safe_mul_long_ulonglong(long a, unsigned long long b) { return safe_mul_int32_uint64(a, b); }
+
+static inline bool check_mul_long_int(long a, int b, long* ret) { return check_mul_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_mul_long_uint(long a, unsigned int b, long* ret) { return check_mul_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_mul_long_long(long a, long b, long* ret) { return check_mul_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_mul_long_ulong(long a, unsigned long b, long* ret) { return check_mul_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_mul_long_longlong(long a, long long b, long* ret) { return check_mul_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_mul_long_ulonglong(long a, unsigned long long b, long* ret) { return check_mul_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned long safe_mul_ulong_int(unsigned long a, int b) { return safe_mul_uint32_int32(a, b); }
+static inline unsigned long safe_mul_ulong_uint(unsigned long a, unsigned int b) { return safe_mul_uint32_uint32(a, b); }
+static inline unsigned long safe_mul_ulong_long(unsigned long a, long b) { return safe_mul_uint32_int32(a, b); }
+static inline unsigned long safe_mul_ulong_ulong(unsigned long a, unsigned long b) { return safe_mul_uint32_uint32(a, b); }
+static inline unsigned long safe_mul_ulong_longlong(unsigned long a, long long b) { return safe_mul_uint32_int64(a, b); }
+static inline unsigned long safe_mul_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_mul_uint32_uint64(a, b); }
+
+static inline bool check_mul_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_mul_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_mul_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_mul_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_mul_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_mul_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_mul_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_mul_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_mul_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long long safe_mul_longlong_long(long long a, long b) { return safe_mul_int64_int32(a, b); }
+static inline long long safe_mul_longlong_ulong(long long a, unsigned long b) { return safe_mul_int64_uint32(a, b); }
+
+static inline bool check_mul_longlong_long(long long a, long b, long long* ret) { return check_mul_int64_int32(a, b, ret); }
+static inline bool check_mul_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_mul_int64_uint64(a, b, ret); }
+
+static inline unsigned long long safe_mul_ulonglong_long(unsigned long long a, long b) { return safe_mul_uint64_int32(a, b); }
+static inline unsigned long long safe_mul_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_mul_uint64_uint32(a, b); }
+
+static inline bool check_mul_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_mul_uint64_int32(a, b, ret); }
+static inline bool check_mul_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_mul_uint64_uint32(a, b, ret); }
+#endif
+
+static inline long long safe_mul_longlong_int(long long a, int b) { return safe_mul_int64_int32(a, b); }
+static inline long long safe_mul_longlong_uint(long long a, unsigned int b) { return safe_mul_int64_uint32(a, b); }
+static inline long long safe_mul_longlong_longlong(long long a, long long b) { return safe_mul_int64_int64(a, b); }
+static inline long long safe_mul_longlong_ulonglong(long long a, unsigned long long b) { return safe_mul_int64_uint64(a, b); }
+
+static inline bool check_mul_longlong_int(long long a, int b, long long* ret) { return check_mul_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_mul_longlong_uint(long long a, unsigned int b, long long* ret) { return check_mul_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_mul_longlong_longlong(long long a, long long b, long long* ret) { return check_mul_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_mul_longlong_ulonglong(long long a, unsigned long long b, long long* ret) { return check_mul_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_mul_ulonglong_int(unsigned long long a, int b) { return safe_mul_uint64_int32(a, b); }
+static inline unsigned long long safe_mul_ulonglong_uint(unsigned long long a, unsigned int b) { return safe_mul_uint64_uint32(a, b); }
+static inline unsigned long long safe_mul_ulonglong_longlong(unsigned long long a, long long b) { return safe_mul_uint64_int64(a, b); }
+static inline unsigned long long safe_mul_ulonglong_ulonglong(unsigned long long a, unsigned long long b) { return safe_mul_uint64_uint64(a, b); }
+
+static inline bool check_mul_ulonglong_int(unsigned long long a, int b, unsigned long long* ret) { return check_mul_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulonglong_uint(unsigned long long a, unsigned int b, unsigned long long* ret) { return check_mul_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulonglong_longlong(unsigned long long a, long long b, unsigned long long* ret) { return check_mul_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_mul_ulonglong_ulonglong(unsigned long long a, unsigned long long b, unsigned long long* ret) { return check_mul_uint64_uint64(a, b, (uint64_t*)ret); }
+
+// Subtraction
+static inline int safe_sub_int_int(int a, int b) { return safe_sub_int32_int32(a, b); }
+static inline int safe_sub_int_uint(int a, unsigned int b) { return safe_sub_int32_uint32(a, b); }
+static inline int safe_sub_int_longlong(int a, long long b) { return safe_sub_int32_int64(a, b); }
+static inline int safe_sub_int_ulonglong(int a, unsigned long long b) { return safe_sub_int32_uint64(a, b); }
+
+static inline bool check_sub_int_int(int a, int b, int* ret) { return check_sub_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_sub_int_uint(int a, unsigned int b, int* ret) { return check_sub_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_sub_int_longlong(int a, long long b, int* ret) { return check_sub_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_sub_int_ulonglong(int a, unsigned long long b, int* ret) { return check_sub_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_sub_uint_int(unsigned int a, int b) { return safe_sub_uint32_int32(a, b); }
+static inline unsigned int safe_sub_uint_uint(unsigned int a, unsigned int b) { return safe_sub_uint32_uint32(a, b); }
+static inline unsigned int safe_sub_uint_longlong(unsigned int a, long long b) { return safe_sub_uint32_int64(a, b); }
+static inline unsigned int safe_sub_uint_ulonglong(unsigned int a, unsigned long long b) { return safe_sub_uint32_uint64(a, b); }
+
+static inline bool check_sub_uint_int(unsigned int a, int b, unsigned int* ret) { return check_sub_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_uint_uint(unsigned int a, unsigned int b, unsigned int* ret) { return check_sub_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_uint_longlong(unsigned int a, long long b, unsigned int* ret) { return check_sub_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_sub_uint_ulonglong(unsigned int a, unsigned long long b, unsigned int* ret) { return check_sub_uint32_uint64(a, b, (uint32_t*)ret); }
+
+#if SAFE_MATH_LONG == 64
+static inline int safe_sub_int_long(int a, long b) { return safe_sub_int32_int64(a, b); }
+static inline int safe_sub_int_ulong(int a, unsigned long b) { return safe_sub_int32_uint64(a, b); }
+
+static inline bool check_sub_int_long(int a, long b, int* ret) { return check_sub_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_sub_int_ulong(int a, unsigned long b, int* ret) { return check_sub_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_sub_uint_long(unsigned int a, long b) { return safe_sub_uint32_int64(a, b); }
+static inline unsigned int safe_sub_uint_ulong(unsigned int a, unsigned long b) { return safe_sub_uint32_uint64(a, b); }
+
+static inline bool check_sub_uint_long(unsigned int a, long b, unsigned int* ret) { return check_sub_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_sub_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_sub_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long safe_sub_long_int(long a, int b) { return safe_sub_int64_int32(a, b); }
+static inline long safe_sub_long_uint(long a, unsigned int b) { return safe_sub_int64_uint32(a, b); }
+static inline long safe_sub_long_long(long a, long b) { return safe_sub_int64_int64(a, b); }
+static inline long safe_sub_long_ulong(long a, unsigned long b) { return safe_sub_int64_uint64(a, b); }
+static inline long safe_sub_long_longlong(long a, long long b) { return safe_sub_int64_int64(a, b); }
+static inline long safe_sub_long_ulonglong(long a, unsigned long long b) { return safe_sub_int64_uint64(a, b); }
+
+static inline bool check_sub_long_int(long a, int b, long* ret) { return check_sub_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_sub_long_uint(long a, unsigned int b, long* ret) { return check_sub_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_sub_long_long(long a, long b, long* ret) { return check_sub_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_sub_long_ulong(long a, unsigned long b, long* ret) { return check_sub_int64_uint64(a, b, (int64_t*)ret); }
+static inline bool check_sub_long_longlong(long a, long long b, long* ret) { return check_sub_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_sub_long_ulonglong(long a, unsigned long long b, long* ret) { return check_sub_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long safe_sub_ulong_int(unsigned long a, int b) { return safe_sub_uint64_int32(a, b); }
+static inline unsigned long safe_sub_ulong_uint(unsigned long a, unsigned int b) { return safe_sub_uint64_uint32(a, b); }
+static inline unsigned long safe_sub_ulong_long(unsigned long a, long b) { return safe_sub_uint64_int64(a, b); }
+static inline unsigned long safe_sub_ulong_ulong(unsigned long a, unsigned long b) { return safe_sub_uint64_uint64(a, b); }
+static inline unsigned long safe_sub_ulong_longlong(unsigned long a, long long b) { return safe_sub_uint64_int64(a, b); }
+static inline unsigned long safe_sub_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_sub_uint64_uint64(a, b); }
+
+static inline bool check_sub_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_sub_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_sub_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_sub_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_sub_uint64_uint64(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_sub_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_sub_uint64_uint64(a, b, (uint64_t*)ret); }
+
+static inline long long safe_sub_longlong_long(long long a, long b) { return safe_sub_int64_int64(a, b); }
+static inline long long safe_sub_longlong_ulong(long long a, unsigned long b) { return safe_sub_int64_uint64(a, b); }
+
+static inline bool check_sub_longlong_long(long long a, long b, long long* ret) { return check_sub_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_sub_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_sub_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_sub_ulonglong_long(unsigned long long a, long b) { return safe_sub_uint64_int64(a, b); }
+static inline unsigned long long safe_sub_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_sub_uint64_uint64(a, b); }
+
+static inline bool check_sub_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_sub_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_sub_uint64_uint64(a, b, (uint64_t*)ret); }
+#else
+static inline int safe_sub_int_long(int a, long b) { return safe_sub_int32_int32(a, b); }
+static inline int safe_sub_int_ulong(int a, unsigned long b) { return safe_sub_int32_uint32(a, b); }
+
+static inline bool check_sub_int_long(int a, long b, int* ret) { return check_sub_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_sub_int_ulong(int a, unsigned long b, int* ret) { return check_sub_int32_uint32(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_sub_uint_long(unsigned int a, long b) { return safe_sub_uint32_int32(a, b); }
+static inline unsigned int safe_sub_uint_ulong(unsigned int a, unsigned long b) { return safe_sub_uint32_uint32(a, b); }
+
+static inline bool check_sub_uint_long(unsigned int a, long b, unsigned int* ret) { return check_sub_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_sub_uint32_uint32(a, b, (uint32_t*)ret); }
+
+static inline long safe_sub_long_int(long a, int b) { return safe_sub_int32_int32(a, b); }
+static inline long safe_sub_long_uint(long a, unsigned int b) { return safe_sub_int32_uint32(a, b); }
+static inline long safe_sub_long_long(long a, long b) { return safe_sub_int32_int32(a, b); }
+static inline long safe_sub_long_ulong(long a, unsigned long b) { return safe_sub_int32_uint32(a, b); }
+static inline long safe_sub_long_longlong(long a, long long b) { return safe_sub_int32_int64(a, b); }
+static inline long safe_sub_long_ulonglong(long a, unsigned long long b) { return safe_sub_int32_uint64(a, b); }
+
+static inline bool check_sub_long_int(long a, int b, long* ret) { return check_sub_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_sub_long_uint(long a, unsigned int b, long* ret) { return check_sub_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_sub_long_long(long a, long b, long* ret) { return check_sub_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_sub_long_ulong(long a, unsigned long b, long* ret) { return check_sub_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_sub_long_longlong(long a, long long b, long* ret) { return check_sub_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_sub_long_ulonglong(long a, unsigned long long b, long* ret) { return check_sub_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned long safe_sub_ulong_int(unsigned long a, int b) { return safe_sub_uint32_int32(a, b); }
+static inline unsigned long safe_sub_ulong_uint(unsigned long a, unsigned int b) { return safe_sub_uint32_uint32(a, b); }
+static inline unsigned long safe_sub_ulong_long(unsigned long a, long b) { return safe_sub_uint32_int32(a, b); }
+static inline unsigned long safe_sub_ulong_ulong(unsigned long a, unsigned long b) { return safe_sub_uint32_uint32(a, b); }
+static inline unsigned long safe_sub_ulong_longlong(unsigned long a, long long b) { return safe_sub_uint32_int64(a, b); }
+static inline unsigned long safe_sub_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_sub_uint32_uint64(a, b); }
+
+static inline bool check_sub_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_sub_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_sub_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_sub_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_sub_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_sub_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_sub_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_sub_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_sub_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long long safe_sub_longlong_long(long long a, long b) { return safe_sub_int64_int32(a, b); }
+static inline long long safe_sub_longlong_ulong(long long a, unsigned long b) { return safe_sub_int64_uint32(a, b); }
+
+static inline bool check_sub_longlong_long(long long a, long b, long long* ret) { return check_sub_int64_int32(a, b, ret); }
+static inline bool check_sub_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_sub_int64_uint32(a, b, ret); }
+
+static inline unsigned long long safe_sub_ulonglong_long(unsigned long long a, long b) { return safe_sub_uint64_int32(a, b); }
+static inline unsigned long long safe_sub_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_sub_uint64_uint32(a, b); }
+
+static inline bool check_sub_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_sub_uint64_int32(a, b, ret); }
+static inline bool check_sub_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_sub_uint64_uint64(a, b, ret); }
+#endif
+
+static inline long long safe_sub_longlong_int(long long a, int b) { return safe_sub_int64_int32(a, b); }
+static inline long long safe_sub_longlong_uint(long long a, unsigned int b) { return safe_sub_int64_uint32(a, b); }
+static inline long long safe_sub_longlong_longlong(long long a, long long b) { return safe_sub_int64_int64(a, b); }
+static inline long long safe_sub_longlong_ulonglong(long long a, unsigned long long b) { return safe_sub_int64_uint64(a, b); }
+
+static inline bool check_sub_longlong_int(long long a, int b, long long* ret) { return check_sub_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_sub_longlong_uint(long long a, unsigned int b, long long* ret) { return check_sub_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_sub_longlong_longlong(long long a, long long b, long long* ret) { return check_sub_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_sub_longlong_ulonglong(long long a, unsigned long long b, long long* ret) { return check_sub_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_sub_ulonglong_int(unsigned long long a, int b) { return safe_sub_uint64_int32(a, b); }
+static inline unsigned long long safe_sub_ulonglong_uint(unsigned long long a, unsigned int b) { return safe_sub_uint64_uint32(a, b); }
+static inline unsigned long long safe_sub_ulonglong_longlong(unsigned long long a, long long b) { return safe_sub_uint64_int64(a, b); }
+static inline unsigned long long safe_sub_ulonglong_ulonglong(unsigned long long a, unsigned long long b) { return safe_sub_uint64_uint64(a, b); }
+
+static inline bool check_sub_ulonglong_int(unsigned long long a, int b, unsigned long long* ret) { return check_sub_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulonglong_uint(unsigned long long a, unsigned int b, unsigned long long* ret) { return check_sub_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulonglong_longlong(unsigned long long a, long long b, unsigned long long* ret) { return check_sub_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_sub_ulonglong_ulonglong(unsigned long long a, unsigned long long b, unsigned long long* ret) { return check_sub_uint64_uint64(a, b, (uint64_t*)ret); }
+
+// Division
+static inline int safe_div_int_int(int a, int b) { return safe_div_int32_int32(a, b); }
+static inline int safe_div_int_uint(int a, unsigned int b) { return safe_div_int32_uint32(a, b); }
+static inline int safe_div_int_longlong(int a, long long b) { return safe_div_int32_int64(a, b); }
+static inline int safe_div_int_ulonglong(int a, unsigned long long b) { return safe_div_int32_uint64(a, b); }
+
+static inline bool check_div_int_int(int a, int b, int* ret) { return check_div_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_div_int_uint(int a, unsigned int b, int* ret) { return check_div_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_div_int_longlong(int a, long long b, int* ret) { return check_div_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_div_int_ulonglong(int a, unsigned long long b, int* ret) { return check_div_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_div_uint_int(unsigned int a, int b) { return safe_div_uint32_int32(a, b); }
+static inline unsigned int safe_div_uint_uint(unsigned int a, unsigned int b) { return safe_div_uint32_uint32(a, b); }
+static inline unsigned int safe_div_uint_longlong(unsigned int a, long long b) { return safe_div_uint32_int64(a, b); }
+static inline unsigned int safe_div_uint_ulonglong(unsigned int a, unsigned long long b) { return safe_div_uint32_uint64(a, b); }
+
+static inline bool check_div_uint_int(unsigned int a, int b, unsigned int* ret) { return check_div_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_div_uint_uint(unsigned int a, unsigned int b, unsigned int* ret) { return check_div_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_div_uint_longlong(unsigned int a, long long b, unsigned int* ret) { return check_div_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_div_uint_ulonglong(unsigned int a, unsigned long long b, unsigned int* ret) { return check_div_uint32_uint64(a, b, (uint32_t*)ret); }
+
+#if SAFE_MATH_LONG == 64
+static inline int safe_div_int_long(int a, long b) { return safe_div_int32_int64(a, b); }
+static inline int safe_div_int_ulong(int a, unsigned long b) { return safe_div_int32_uint64(a, b); }
+
+static inline bool check_div_int_long(int a, long b, int* ret) { return check_div_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_div_int_ulong(int a, unsigned long b, int* ret) { return check_div_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_div_uint_long(unsigned int a, long b) { return safe_div_uint32_int64(a, b); }
+static inline unsigned int safe_div_uint_ulong(unsigned int a, unsigned long b) { return safe_div_uint32_uint64(a, b); }
+
+static inline bool check_div_uint_long(unsigned int a, long b, unsigned int* ret) { return check_div_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_div_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_div_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long safe_div_long_int(long a, int b) { return safe_div_int64_int32(a, b); }
+static inline long safe_div_long_uint(long a, unsigned int b) { return safe_div_int64_uint32(a, b); }
+static inline long safe_div_long_long(long a, long b) { return safe_div_int64_int64(a, b); }
+static inline long safe_div_long_ulong(long a, unsigned long b) { return safe_div_int64_uint64(a, b); }
+static inline long safe_div_long_longlong(long a, long long b) { return safe_div_int64_int64(a, b); }
+static inline long safe_div_long_ulonglong(long a, unsigned long long b) { return safe_div_int64_uint64(a, b); }
+
+static inline bool check_div_long_int(long a, int b, long* ret) { return check_div_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_div_long_uint(long a, unsigned int b, long* ret) { return check_div_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_div_long_long(long a, long b, long* ret) { return check_div_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_div_long_ulong(long a, unsigned long b, long* ret) { return check_div_int64_uint64(a, b, (int64_t*)ret); }
+static inline bool check_div_long_longlong(long a, long long b, long* ret) { return check_div_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_div_long_ulonglong(long a, unsigned long long b, long* ret) { return check_div_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long safe_div_ulong_int(unsigned long a, int b) { return safe_div_uint64_int32(a, b); }
+static inline unsigned long safe_div_ulong_uint(unsigned long a, unsigned int b) { return safe_div_uint64_uint32(a, b); }
+static inline unsigned long safe_div_ulong_long(unsigned long a, long b) { return safe_div_uint64_int64(a, b); }
+static inline unsigned long safe_div_ulong_ulong(unsigned long a, unsigned long b) { return safe_div_uint64_uint64(a, b); }
+static inline unsigned long safe_div_ulong_longlong(unsigned long a, long long b) { return safe_div_uint64_int64(a, b); }
+static inline unsigned long safe_div_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_div_uint64_uint64(a, b); }
+
+static inline bool check_div_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_div_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_div_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_div_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_div_uint64_uint64(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_div_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_div_uint64_uint64(a, b, (uint64_t*)ret); }
+
+static inline long long safe_div_longlong_long(long long a, long b) { return safe_div_int64_int64(a, b); }
+static inline long long safe_div_longlong_ulong(long long a, unsigned long b) { return safe_div_int64_uint64(a, b); }
+
+static inline bool check_div_longlong_long(long long a, long b, long long* ret) { return check_div_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_div_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_div_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_div_ulonglong_long(unsigned long long a, long b) { return safe_div_uint64_int64(a, b); }
+static inline unsigned long long safe_div_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_div_uint64_uint64(a, b); }
+
+static inline bool check_div_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_div_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_div_uint64_uint64(a, b, (uint64_t*)ret); }
+#else
+static inline int safe_div_int_long(int a, long b) { return safe_div_int32_int32(a, b); }
+static inline int safe_div_int_ulong(int a, unsigned long b) { return safe_div_int32_uint32(a, b); }
+
+static inline bool check_div_int_long(int a, long b, int* ret) { return check_div_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_div_int_ulong(int a, unsigned long b, int* ret) { return check_div_int32_uint32(a, b, (int32_t*)ret); }
+
+static inline unsigned int safe_div_uint_long(unsigned int a, long b) { return safe_div_uint32_int32(a, b); }
+static inline unsigned int safe_div_uint_ulong(unsigned int a, unsigned long b) { return safe_div_uint32_uint32(a, b); }
+
+static inline bool check_div_uint_long(unsigned int a, long b, unsigned int* ret) { return check_div_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_div_uint_ulong(unsigned int a, unsigned long b, unsigned int* ret) { return check_div_uint32_uint32(a, b, (uint32_t*)ret); }
+
+static inline long safe_div_long_int(long a, int b) { return safe_div_int32_int32(a, b); }
+static inline long safe_div_long_uint(long a, unsigned int b) { return safe_div_int32_uint32(a, b); }
+static inline long safe_div_long_long(long a, long b) { return safe_div_int32_int32(a, b); }
+static inline long safe_div_long_ulong(long a, unsigned long b) { return safe_div_int32_uint32(a, b); }
+static inline long safe_div_long_longlong(long a, long long b) { return safe_div_int32_int64(a, b); }
+static inline long safe_div_long_ulonglong(long a, unsigned long long b) { return safe_div_int32_uint64(a, b); }
+
+static inline bool check_div_long_int(long a, int b, long* ret) { return check_div_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_div_long_uint(long a, unsigned int b, long* ret) { return check_div_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_div_long_long(long a, long b, long* ret) { return check_div_int32_int32(a, b, (int32_t*)ret); }
+static inline bool check_div_long_ulong(long a, unsigned long b, long* ret) { return check_div_int32_uint32(a, b, (int32_t*)ret); }
+static inline bool check_div_long_longlong(long a, long long b, long* ret) { return check_div_int32_int64(a, b, (int32_t*)ret); }
+static inline bool check_div_long_ulonglong(long a, unsigned long long b, long* ret) { return check_div_int32_uint64(a, b, (int32_t*)ret); }
+
+static inline unsigned long safe_div_ulong_int(unsigned long a, int b) { return safe_div_uint32_int32(a, b); }
+static inline unsigned long safe_div_ulong_uint(unsigned long a, unsigned int b) { return safe_div_uint32_uint32(a, b); }
+static inline unsigned long safe_div_ulong_long(unsigned long a, long b) { return safe_div_uint32_int32(a, b); }
+static inline unsigned long safe_div_ulong_ulong(unsigned long a, unsigned long b) { return safe_div_uint32_uint32(a, b); }
+static inline unsigned long safe_div_ulong_longlong(unsigned long a, long long b) { return safe_div_uint32_int64(a, b); }
+static inline unsigned long safe_div_ulong_ulonglong(unsigned long a, unsigned long long b) { return safe_div_uint32_uint64(a, b); }
+
+static inline bool check_div_ulong_int(unsigned long a, int b, unsigned long* ret) { return check_div_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_div_ulong_uint(unsigned long a, unsigned int b, unsigned long* ret) { return check_div_uint32_uint32(a, b, (uint32_t*)ret); }
+static inline bool check_div_ulong_long(unsigned long a, long b, unsigned long* ret) { return check_div_uint32_int32(a, b, (uint32_t*)ret); }
+static inline bool check_div_ulong_ulong(unsigned long a, unsigned long b, unsigned long* ret) { return check_div_uint32_uint64(a, b, (uint32_t*)ret); }
+static inline bool check_div_ulong_longlong(unsigned long a, long long b, unsigned long* ret) { return check_div_uint32_int64(a, b, (uint32_t*)ret); }
+static inline bool check_div_ulong_ulonglong(unsigned long a, unsigned long long b, unsigned long* ret) { return check_div_uint32_uint64(a, b, (uint32_t*)ret); }
+
+static inline long long safe_div_longlong_long(long long a, long b) { return safe_div_int64_int32(a, b); }
+static inline long long safe_div_longlong_ulong(long long a, unsigned long b) { return safe_div_int64_uint32(a, b); }
+
+static inline bool check_div_longlong_long(long long a, long b, long long* ret) { return check_div_int64_int32(a, b, ret); }
+static inline bool check_div_longlong_ulong(long long a, unsigned long b, long long* ret) { return check_div_int64_uint32(a, b, ret); }
+
+static inline unsigned long long safe_div_ulonglong_long(unsigned long long a, long b) { return safe_div_uint64_int32(a, b); }
+static inline unsigned long long safe_div_ulonglong_ulong(unsigned long long a, unsigned long b) { return safe_div_uint64_uint32(a, b); }
+
+static inline bool check_div_ulonglong_long(unsigned long long a, long b, unsigned long long* ret) { return check_div_uint64_int32(a, b, ret); }
+static inline bool check_div_ulonglong_ulong(unsigned long long a, unsigned long b, unsigned long long* ret) { return check_div_uint64_uint32(a, b, ret); }
+#endif
+
+static inline long long safe_div_longlong_int(long long a, int b) { return safe_div_int64_int32(a, b); }
+static inline long long safe_div_longlong_uint(long long a, unsigned int b) { return safe_div_int64_uint32(a, b); }
+static inline long long safe_div_longlong_longlong(long long a, long long b) { return safe_div_int64_int64(a, b); }
+static inline long long safe_div_longlong_ulonglong(long long a, unsigned long long b) { return safe_div_int64_uint64(a, b); }
+
+static inline bool check_div_longlong_int(long long a, int b, long long* ret) { return check_div_int64_int32(a, b, (int64_t*)ret); }
+static inline bool check_div_longlong_uint(long long a, unsigned int b, long long* ret) { return check_div_int64_uint32(a, b, (int64_t*)ret); }
+static inline bool check_div_longlong_longlong(long long a, long long b, long long* ret) { return check_div_int64_int64(a, b, (int64_t*)ret); }
+static inline bool check_div_longlong_ulonglong(long long a, unsigned long long b, long long* ret) { return check_div_int64_uint64(a, b, (int64_t*)ret); }
+
+static inline unsigned long long safe_div_ulonglong_int(unsigned long long a, int b) { return safe_div_uint64_int32(a, b); }
+static inline unsigned long long safe_div_ulonglong_uint(unsigned long long a, unsigned int b) { return safe_div_uint64_uint32(a, b); }
+static inline unsigned long long safe_div_ulonglong_longlong(unsigned long long a, long long b) { return safe_div_uint64_int64(a, b); }
+static inline unsigned long long safe_div_ulonglong_ulonglong(unsigned long long a, unsigned long long b) { return safe_div_uint64_uint64(a, b); }
+
+static inline bool check_div_ulonglong_int(unsigned long long a, int b, unsigned long long* ret) { return check_div_uint64_int32(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulonglong_uint(unsigned long long a, unsigned int b, unsigned long long* ret) { return check_div_uint64_uint32(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulonglong_longlong(unsigned long long a, long long b, unsigned long long* ret) { return check_div_uint64_int64(a, b, (uint64_t*)ret); }
+static inline bool check_div_ulonglong_ulonglong(unsigned long long a, unsigned long long b, unsigned long long* ret) { return check_div_uint64_uint64(a, b, (uint64_t*)ret); }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/pyarrow/include/arrow/vendored/safeint/safe_math_impl.h b/pyarrow/include/arrow/vendored/safeint/safe_math_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb782a79b12729037430015dea9a0fdc4f648ea4
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/safeint/safe_math_impl.h
@@ -0,0 +1,2587 @@
+// Licensed under the MIT License.
+// Copyright David LeBlanc - dcl@dleblanc.net
+
+/*-----------------------------------------------------------------------------------------------------------
+c_safe_math
+Version 1.0 - 6/21/22
+
+This header implements a set of functions that check for integer overflows in C code.
+It is based on code and logic from SafeInt.hpp, but ported to C.
+
+Portions copied from SafeInt.hpp are Licensed under the MIT License,
+and are originally copyrighted to Microsoft.
+*/
+
+#ifndef C_SAFE_MATH_IMPL
+#define C_SAFE_MATH_IMPL
+
+#if defined _MSC_VER
+// static inline expansion warnings
+#pragma warning(disable:4710 4711)
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+    // It is a bit tricky to sort out what compiler we are actually using,
+    // do this once here, and avoid cluttering the code
+#define VISUAL_STUDIO_COMPILER 0
+#define CLANG_COMPILER 1
+#define GCC_COMPILER 2
+#define UNKNOWN_COMPILER -1
+
+// Clang will sometimes pretend to be Visual Studio
+// and does pretend to be gcc. Check it first, as nothing else pretends to be clang
+#if defined __clang__
+#define SAFEINT_COMPILER CLANG_COMPILER
+#elif defined __GNUC__
+#define SAFEINT_COMPILER GCC_COMPILER
+#elif defined _MSC_VER
+#define SAFEINT_COMPILER VISUAL_STUDIO_COMPILER
+#else
+#define SAFEINT_COMPILER UNKNOWN_COMPILER
+#endif
+
+// Various defines to help make working with multiple compilers easier - from SafeInt.hpp
+#if SAFEINT_COMPILER == GCC_COMPILER || SAFEINT_COMPILER == CLANG_COMPILER
+#define SAFEINT_NORETURN __attribute__((noreturn))
+#define SAFEINT_STDCALL
+#define SAFEINT_VISIBLE __attribute__ ((__visibility__("default")))
+#define SAFEINT_WEAK __attribute__ ((weak))
+#else
+#define SAFEINT_NORETURN __declspec(noreturn)
+#define SAFEINT_STDCALL __stdcall
+#define SAFEINT_VISIBLE
+#define SAFEINT_WEAK
+#endif
+
+#if SAFEINT_COMPILER == VISUAL_STUDIO_COMPILER
+    // limits.h checks __STDC_WANT_SECURE_LIB__, but doesn't include what sets it
+#if !defined __STDC_WANT_SECURE_LIB__
+#define __STDC_WANT_SECURE_LIB__ 0
+#endif
+
+#endif
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <limits.h>
+
+// Figure out if we should use intrinsics
+// If the user has already decided, let that override
+#define SAFEINT_MULTIPLY_MATH        0 // no intrinsics, no built in, no 128-bit
+#define SAFEINT_MULTIPLY_INTRINSICS  1 // 64-bit Visual Studio
+#define SAFEINT_MULTIPLY_BUILTIN     2 // gcc, clang
+#define SAFEINT_MULTIPLY_INT128      3 // Best case
+
+// We might have 128-bit int support, check for that, as it should work best
+#if !defined SAFEINT_HAS_INT128
+
+#if defined __SIZEOF_INT128__ && __SIZEOF_INT128__ == 16
+#define SAFEINT_HAS_INT128 1
+#else
+#define SAFEINT_HAS_INT128 0
+#endif
+
+#endif
+
+#if SAFEINT_HAS_INT128
+#define SAFEINT_MULTIPLY_METHOD SAFEINT_MULTIPLY_INT128
+#else
+
+#if !defined SAFEINT_USE_INTRINSICS
+// If it is the Visual Studio compiler, then it has to be 64-bit, and not ARM64EC
+#if SAFEINT_COMPILER == VISUAL_STUDIO_COMPILER
+#if defined _M_AMD64 && !defined _M_ARM64EC
+#include <intrin.h>
+#define SAFEINT_MULTIPLY_METHOD SAFEINT_MULTIPLY_INTRINSICS
+#else
+#define SAFEINT_MULTIPLY_METHOD SAFEINT_MULTIPLY_MATH
+#endif
+
+#else // Not VISUAL_STUDIO_COMPILER
+
+    // Else for gcc and clang, we can use builtin functions
+#if SAFEINT_COMPILER == CLANG_COMPILER || SAFEINT_COMPILER == GCC_COMPILER
+#define SAFEINT_MULTIPLY_METHOD SAFEINT_MULTIPLY_BUILTIN
+#else
+#define SAFEINT_MULTIPLY_METHOD SAFEINT_MULTIPLY_MATH
+#endif
+#endif
+
+#endif // SAFEINT_USE_INTRINSICS
+#endif // SAFEINT_HAS_INT128
+
+/*
+    To replace safe_math_fail, wrap this header,
+    implement safe_math_fail how you prefer,
+    and set SAFE_MATH_FAIL_DEFINED
+*/
+
+#if !defined SAFE_MATH_FAIL_DEFINED
+#define SAFE_MATH_FAIL_DEFINED
+#include <stdlib.h>
+
+SAFEINT_NORETURN
+static inline void safe_math_fail(const char* msg)
+{
+    (void)msg;
+    abort();
+}
+#endif
+
+#if !defined UINT64_MAX
+
+#define INT8_MIN         (-127i8 - 1)
+#define INT16_MIN        (-32767i16 - 1)
+#define INT32_MIN        (-2147483647i32 - 1)
+#define INT64_MIN        (-9223372036854775807i64 - 1)
+#define INT8_MAX         127i8
+#define INT16_MAX        32767i16
+#define INT32_MAX        2147483647i32
+#define INT64_MAX        9223372036854775807i64
+#define UINT8_MAX        0xffui8
+#define UINT16_MAX       0xffffui16
+#define UINT32_MAX       0xffffffffui32
+#define UINT64_MAX       0xffffffffffffffffui64
+
+#endif
+
+// Utility functions
+
+// Purpose of this is to negate an int in a way
+// where the compiler won't remove it if the input is a 
+// compile time constant MIN_INT
+static inline int32_t negate32(int32_t in) { return (int32_t)(~(uint32_t)in + 1); }
+static inline int64_t negate64(int64_t in) { return (int64_t)(~(uint64_t)in + 1); }
+
+static inline uint32_t safe_abs32(int32_t in)
+{
+    if (in < 0)
+        return ~(uint32_t)in + 1;
+
+    return (uint32_t)in;
+}
+
+static inline uint64_t safe_abs64(int64_t in)
+{
+    if (in < 0)
+        return ~(uint64_t)in + 1;
+
+    return (uint64_t)in;
+}
+
+// Checked casting functions
+// 0 if the cast is safe, non-zero if unsafe
+static inline int check_cast_int8_int32(int32_t in) { return (in < INT8_MIN || in > INT8_MAX); }
+static inline int check_cast_int8_uint32(uint32_t in) { return in > INT8_MAX; }
+static inline int check_cast_int8_int64(int64_t in) { return in < INT8_MIN || in > INT8_MAX; }
+static inline int check_cast_int8_uint64(uint64_t in) { return (in > INT8_MAX); }
+static inline int check_cast_int16_int32(int32_t in) { return in < INT16_MIN || in > INT16_MAX; }
+static inline int check_cast_int16_uint32(uint32_t in) { return (in > INT16_MAX); }
+static inline int check_cast_int16_int64(int64_t in) { return (in < INT16_MIN || in > INT16_MAX); }
+static inline int check_cast_int16_uint64(uint64_t in) { return (in > INT16_MAX); }
+static inline int check_cast_int32_uint32(uint32_t in) { return (in > INT32_MAX); }
+static inline int check_cast_int32_int64(int64_t in) { return (in < INT32_MIN || in > INT32_MAX); }
+static inline int check_cast_int32_uint64(uint64_t in) { return (in > INT32_MAX); }
+static inline int check_cast_int64_uint64(uint64_t in) { return (in > INT64_MAX); }
+static inline int check_cast_uint8_int32(int32_t in) { return (in < 0 || in > UINT8_MAX); }
+static inline int check_cast_uint8_uint32(uint32_t in) { return (in > UINT8_MAX); }
+static inline int check_cast_uint8_int64(int64_t in) { return (in < 0 || in > UINT8_MAX); }
+static inline int check_cast_uint8_uint64(uint64_t in) { return (in > UINT8_MAX); }
+static inline int check_cast_uint16_int32(int32_t in) { return (in < 0 || in > UINT16_MAX); }
+static inline int check_cast_uint16_uint32(uint32_t in) { return (in > UINT16_MAX); }
+static inline int check_cast_uint16_int64(int64_t in) { return (in < 0 || in > UINT16_MAX); }
+static inline int check_cast_uint16_uint64(uint64_t in) { return (in > UINT16_MAX); }
+static inline int check_cast_uint32_int32(int32_t in) { return (in < 0); }
+static inline int check_cast_uint32_int64(int64_t in) { return (in < 0 || in > UINT32_MAX); }
+static inline int check_cast_uint32_uint64(uint64_t in) { return (in > UINT32_MAX); }
+static inline int check_cast_uint64_int64(int64_t in) { return (in < 0); }
+
+static inline int8_t safe_cast_int8_int32(int32_t in)
+{
+    if (!check_cast_int8_int32(in))
+        safe_math_fail("safe_math_fail safe_cast_int8_int32");
+
+    return (int8_t)in;
+}
+
+static inline int8_t safe_cast_int8_uint32(uint32_t in)
+{
+    if (check_cast_int8_uint32(in))
+        safe_math_fail("safe_math_fail safe_cast_int8_uint32");
+
+    return (int8_t)in;
+}
+
+static inline int8_t safe_cast_int8_int64(int64_t in)
+{
+    if (check_cast_int8_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_int8_int64");
+
+    return (int8_t)in;
+}
+
+static inline int8_t safe_cast_int8_uint64(uint64_t in)
+{
+    if (check_cast_int8_uint64(in))
+        safe_math_fail("safe_math_fail safe_cast_int8_uint64");
+
+    return (int8_t)in;
+}
+
+static inline int16_t safe_cast_int16_int32(int32_t in)
+{
+    if (check_cast_int16_int32(in))
+        safe_math_fail("safe_math_fail safe_cast_int16_int32");
+
+    return (int16_t)in;
+}
+
+static inline int16_t safe_cast_int16_uint32(uint32_t in)
+{
+    if (check_cast_int16_uint32(in))
+        safe_math_fail("safe_math_fail safe_cast_int16_uint32");
+
+    return (int16_t)in;
+}
+
+static inline int16_t safe_cast_int16_int64(int64_t in)
+{
+    if (check_cast_int16_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_int16_int64");
+
+    return (int16_t)in;
+}
+
+static inline int16_t safe_cast_int16_uint64(uint64_t in)
+{
+    if (in > INT16_MAX)
+        safe_math_fail("safe_math_fail safe_cast_int16_uint64");
+
+    return (int16_t)in;
+}
+
+static inline int32_t safe_cast_int32_uint32(uint32_t in)
+{
+    if (check_cast_int32_uint32(in))
+        safe_math_fail("safe_math_fail safe_cast_int32_uint32");
+
+    return (int32_t)in;
+}
+
+static inline int32_t safe_cast_int32_int64(int64_t in)
+{
+    if (check_cast_int32_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_int32_int64");
+
+    return (int32_t)in;
+}
+
+static inline int32_t safe_cast_int32_uint64(uint64_t in)
+{
+    if (check_cast_int32_uint64(in))
+        safe_math_fail("safe_math_fail safe_cast_int32_uint64");
+
+    return (int32_t)in;
+}
+
+static inline int64_t safe_cast_int64_uint64(uint64_t in)
+{
+    if (check_cast_int64_uint64(in))
+        safe_math_fail("safe_math_fail safe_cast_int64_uint64");
+
+    return (int64_t)in;
+}
+
+static inline uint8_t safe_cast_uint8_int32(int32_t in)
+{
+    if (check_cast_uint8_int32(in))
+        safe_math_fail("safe_math_fail safe_cast_uint8_int32");
+
+    return (uint8_t)in;
+}
+
+static inline uint8_t safe_cast_uint8_uint32(uint32_t in)
+{
+    if (check_cast_uint8_uint32(in))
+        safe_math_fail("safe_math_fail safe_cast_uint8_uint32");
+
+    return (uint8_t)in;
+}
+
+static inline uint8_t safe_cast_uint8_int64(int64_t in)
+{
+    if (check_cast_uint8_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_uint8_int64");
+
+    return (uint8_t)in;
+}
+
+static inline uint8_t safe_cast_uint8_uint64(uint64_t in)
+{
+    if (check_cast_uint8_uint64(in))
+        safe_math_fail("safe_math_fail safe_cast_uint8_uint64");
+
+    return (uint8_t)in;
+}
+
+static inline uint16_t safe_cast_uint16_int32(int32_t in)
+{
+    if (check_cast_uint16_int32(in))
+        safe_math_fail("safe_math_fail safe_cast_uint16_int32");
+
+    return (uint16_t)in;
+}
+
+static inline uint16_t safe_cast_uint16_uint32(uint32_t in)
+{
+    if (check_cast_uint16_uint32(in))
+        safe_math_fail("safe_math_fail safe_cast_uint16_uint32");
+
+    return (uint16_t)in;
+}
+
+static inline uint16_t safe_cast_uint16_int64(int64_t in)
+{
+    if (check_cast_uint16_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_uint16_int64");
+
+    return (uint16_t)in;
+}
+
+static inline uint16_t safe_cast_uint16_uint64(uint64_t in)
+{
+    if (check_cast_uint16_uint64(in))
+        safe_math_fail("safe_math_fail safe_cast_int16_uint64");
+
+    return (uint16_t)in;
+}
+
+static inline uint32_t safe_cast_uint32_int32(int32_t in)
+{
+    if (check_cast_uint32_int32(in))
+        safe_math_fail("safe_math_fail safe_cast_uint32_int32");
+
+    return (uint32_t)in;
+}
+
+static inline uint32_t safe_cast_uint32_int64(int64_t in)
+{
+    if (check_cast_uint32_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_int32_int64");
+
+    return (uint32_t)in;
+}
+
+static inline uint32_t safe_cast_uint32_uint64(uint64_t in)
+{
+    if (check_cast_uint32_uint64(in))
+        safe_math_fail("safe_math_fail safe_cast_uint32_uint64");
+
+    return (uint32_t)in;
+}
+
+static inline uint64_t safe_cast_uint64_int64(int64_t in)
+{
+    if (check_cast_uint64_int64(in))
+        safe_math_fail("safe_math_fail safe_cast_int64_uint64");
+
+    return (uint64_t)in;
+}
+
+// Addition
+/*
+    For addition and multiplication, there will be checks for the following matrix:
+    - int32
+    - uint32
+    - int64
+    - uint64
+
+    If you want to add smaller types, then do it inside the appropriate safe_cast function,
+    or if adding one of the above and a smaller type, pass it into one that takes a larger
+    size of the same type, for example, uint16 -> uint32.
+*/
+
+static inline int32_t safe_add_int32_int32(int32_t a, int32_t b)
+{
+    return safe_cast_int32_int64((int64_t)a + b);
+}
+
+static inline bool check_add_int32_int32(int32_t a, int32_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)a + b;
+    *ret = (int32_t)tmp;
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_add_int32_uint32(int32_t a, uint32_t b)
+{
+    return safe_cast_int32_int64((int64_t)a + b);
+}
+
+static inline bool check_add_int32_uint32(int32_t a, uint32_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)a + b;
+    *ret = (int32_t)tmp;
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_add_int32_int64(int32_t a, int64_t b)
+{
+    int64_t tmp = (int64_t)((uint64_t)a + (uint64_t)b);
+    
+    if (a >= 0)
+    {
+        // mixed sign cannot overflow
+        if (b >= 0 && tmp < a)
+            safe_math_fail("safe_math_fail safe_add_int32_int64");
+    }
+    else
+    {
+        // lhs negative
+        if (b < 0 && tmp > a)
+            safe_math_fail("safe_math_fail safe_add_int32_int64");
+    }
+
+    return safe_cast_int32_int64(tmp);
+}
+
+static inline bool check_add_int32_int64(int32_t a, int64_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)((uint64_t)a + (uint64_t)b);
+    *ret = (int32_t)tmp;
+
+    if (a >= 0)
+    {
+        // mixed sign cannot overflow
+        if (b >= 0 && tmp < a)
+            return false;
+    }
+    else
+    {
+        // lhs negative
+        if (b < 0 && tmp > a)
+            return false;
+    }
+
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_add_int32_uint64(int32_t a, uint64_t b)
+{
+    if ((uint32_t)(b >> 32) == 0)
+    {
+        // Now it just happens to work out that the standard behavior does what we want
+        // Adding explicit casts to show exactly what's happening here
+        uint32_t tmp = (uint32_t)a + (uint32_t)b;
+
+        if ((int32_t)tmp >= a)
+        {
+            return (int32_t)tmp;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_add_int32_uint64");
+}
+
+static inline bool check_add_int32_uint64(int32_t a, uint64_t b, int32_t* ret)
+{
+    if ((uint32_t)(b >> 32) == 0)
+    {
+        // Now it just happens to work out that the standard behavior does what we want
+        // Adding explicit casts to show exactly what's happening here
+        uint32_t tmp = (uint32_t)a + (uint32_t)b;
+        *ret = (int32_t)tmp;
+
+        if ((int32_t)tmp >= a)
+        {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_add_uint32_int32(uint32_t a, int32_t b)
+{
+    return safe_cast_uint32_int64((int64_t)a + b);
+}
+
+static inline bool check_add_uint32_int32(uint32_t a, int32_t b, uint32_t* ret)
+{
+    int64_t tmp = (int64_t)a + b;
+    *ret = (uint32_t)tmp;
+    return check_cast_uint32_int64(tmp) == 0;
+}
+
+static inline uint32_t safe_add_uint32_uint32(uint32_t a, uint32_t b)
+{
+    uint32_t tmp = a + b;
+    
+    if (tmp < a)
+    {
+        safe_math_fail("safe_math_fail safe_add_uint32_uint32");
+    }
+
+    return tmp;
+}
+
+static inline bool check_add_uint32_uint32(uint32_t a, uint32_t b, uint32_t* ret)
+{
+    uint32_t tmp = a + b;
+    *ret = tmp;
+    return tmp >= a;
+}
+
+static inline uint32_t safe_add_uint32_int64(uint32_t a, int64_t b)
+{
+    if (b < 0)
+    {
+        if (a >= safe_abs64(b)) //negation is safe, since rhs is 64-bit
+        {
+            return (uint32_t)(a + b);
+        }
+    }
+    else
+    {
+        // now we know that rhs can be safely cast into an std::uint64_t
+        uint64_t tmp = (uint64_t)a + (uint64_t)b;
+
+        // special case - rhs cannot be larger than 0x7fffffffffffffff, lhs cannot be larger than 0xffffffff
+        // it is not possible for the operation above to overflow, so just check max
+        return safe_cast_uint32_uint64(tmp);
+    }
+
+    safe_math_fail("safe_math_fail safe_add_uint32_int64");
+}
+
+static inline bool check_add_uint32_int64(uint32_t a, int64_t b, uint32_t* ret)
+{
+    if (b < 0)
+    {
+        if (a >= safe_abs64(b)) //negation is safe, since rhs is 64-bit
+        {
+            *ret = (uint32_t)(a + b);
+            return true;
+        }
+    }
+    else
+    {
+        // now we know that rhs can be safely cast into an std::uint64_t
+        uint64_t tmp = (uint64_t)a + (uint64_t)b;
+
+        // special case - rhs cannot be larger than 0x7fffffffffffffff, lhs cannot be larger than 0xffffffff
+        // it is not possible for the operation above to overflow, so just check max
+        *ret = (uint32_t)tmp;
+        return check_cast_uint32_uint64(tmp) == 0;
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_add_uint32_uint64(uint32_t a, uint64_t b)
+{
+    uint64_t tmp = (uint64_t)a + b;
+    
+    if (tmp >= a && tmp <= UINT32_MAX)
+    {
+        return (uint32_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_add_uint32_uint64");
+}
+
+static inline bool check_add_uint32_uint64(uint32_t a, uint64_t b, uint32_t* ret)
+{
+    uint64_t tmp = (uint64_t)a + b;
+    *ret = (uint32_t)tmp;
+
+    return (tmp >= a && tmp <= UINT32_MAX);
+}
+
+static inline int64_t safe_add_int64_int32(int64_t a, int32_t b)
+{
+    int64_t tmp = (int64_t)((uint64_t)a + (uint64_t)b);
+
+    if (a >= 0)
+    {
+        // mixed sign cannot overflow
+        if (b >= 0 && tmp < a)
+            safe_math_fail("safe_math_fail safe_add_int64_int32");
+    }
+    else
+    {
+        // lhs negative
+        if (b < 0 && tmp > a)
+            safe_math_fail("safe_math_fail safe_add_int64_int32");
+    }
+
+    return tmp;
+}
+
+static inline bool check_add_int64_int32(int64_t a, int32_t b, int64_t* ret)
+{
+    int64_t tmp = (int64_t)((uint64_t)a + (uint64_t)b);
+    *ret = tmp;
+
+    if (a >= 0)
+    {
+        // mixed sign cannot overflow
+        if (b >= 0 && tmp < a)
+            return false;
+    }
+    else
+    {
+        // lhs negative
+        if (b < 0 && tmp > a)
+            return false;
+    }
+
+    return true;
+}
+
+static inline int64_t safe_add_int64_uint32(int64_t a, uint32_t b)
+{
+    uint64_t tmp = (uint64_t)a + (uint64_t)b;
+
+    if ((int64_t)tmp >= a)
+    {
+        return (int64_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_add_int64_uint32");
+}
+
+static inline bool check_add_int64_uint32(int64_t a, uint32_t b, int64_t* ret)
+{
+    uint64_t tmp = (uint64_t)a + (uint64_t)b;
+    *ret = (int64_t)tmp;
+
+    return ((int64_t)tmp >= a);
+}
+
+static inline int64_t safe_add_int64_int64(int64_t a, int64_t b)
+{
+    int64_t tmp = (int64_t)((uint64_t)a + (uint64_t)b);
+
+    if (a >= 0)
+    {
+        // mixed sign cannot overflow
+        if (b >= 0 && tmp < a)
+            safe_math_fail("safe_math_fail safe_add_int64_int64");
+    }
+    else
+    {
+        // lhs negative
+        if (b < 0 && tmp > a)
+            safe_math_fail("safe_math_fail safe_add_int64_int64");
+    }
+
+    return tmp;
+}
+
+static inline bool check_add_int64_int64(int64_t a, int64_t b, int64_t* ret)
+{
+    int64_t tmp = (int64_t)((uint64_t)a + (uint64_t)b);
+    *ret = tmp;
+
+    if (a >= 0)
+    {
+        // mixed sign cannot overflow
+        if (b >= 0 && tmp < a)
+            return false;
+    }
+    else
+    {
+        // lhs negative
+        if (b < 0 && tmp > a)
+            return false;
+    }
+
+    return true;
+}
+
+static inline int64_t safe_add_int64_uint64(int64_t a, uint64_t b)
+{
+    uint64_t tmp = (uint64_t)a + b;
+
+    if ((int64_t)tmp >= a)
+    {
+        return (int64_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_add_int64_uint64");
+}
+
+static inline bool check_add_int64_uint64(int64_t a, uint64_t b, int64_t* ret)
+{
+    uint64_t tmp = (uint64_t)a + b;
+    *ret = (int64_t)tmp;
+
+    return ((int64_t)tmp >= a);
+}
+
+static inline uint64_t safe_add_uint64_int32(uint64_t a, int32_t b)
+{
+    uint64_t tmp = 0;
+
+    if (b < 0)
+    {
+        // So we're effectively subtracting
+        tmp = safe_abs32(b);
+
+        if (tmp <= a)
+        {
+            return a - tmp;
+        }
+    }
+    else
+    {
+        // now we know that rhs can be safely cast into an std::uint64_t
+        tmp = (uint64_t)a + (uint64_t)b;
+
+        // We added and it did not become smaller
+        if (tmp >= a)
+        {
+            return tmp;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_add_uint64_int32");
+}
+
+static inline bool check_add_uint64_int32(uint64_t a, int32_t b, uint64_t* ret)
+{
+    uint64_t tmp = 0;
+
+    if (b < 0)
+    {
+        // So we're effectively subtracting
+        tmp = safe_abs32(b);
+
+        if (tmp <= a)
+        {
+            *ret = a - tmp;
+            return true;
+        }
+    }
+    else
+    {
+        // now we know that rhs can be safely cast into an std::uint64_t
+        tmp = (uint64_t)a + (uint64_t)b;
+
+        // We added and it did not become smaller
+        if (tmp >= a)
+        {
+            *ret = tmp;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+
+static inline uint64_t safe_add_uint64_uint32(uint64_t a, uint32_t b)
+{
+    uint64_t tmp = (uint64_t)a + (uint64_t)b;
+
+    // We added and it didn't get smaller
+    if (tmp >= a)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_add_uint64_uint32");
+}
+
+static inline bool check_add_uint64_uint32(uint64_t a, uint32_t b, uint64_t* ret)
+{
+    uint64_t tmp = (uint64_t)a + (uint64_t)b;
+    *ret = tmp;
+
+    // We added and it didn't get smaller
+    return (tmp >= a);
+}
+
+static inline uint64_t safe_add_uint64_int64(uint64_t a, int64_t b)
+{
+    uint64_t tmp = 0;
+
+    if (b < 0)
+    {
+        // So we're effectively subtracting
+        tmp = safe_abs64(b);
+
+        if (tmp <= a)
+        {
+            return a - tmp;
+        }
+    }
+    else
+    {
+        // now we know that rhs can be safely cast into an std::uint64_t
+        tmp = (uint64_t)a + (uint64_t)b;
+
+        // We added and it did not become smaller
+        if (tmp >= a)
+        {
+            return tmp;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_add_uint64_int64");
+}
+
+static inline bool check_add_uint64_int64(uint64_t a, int64_t b, uint64_t* ret)
+{
+    uint64_t tmp = 0;
+
+    if (b < 0)
+    {
+        // So we're effectively subtracting
+        tmp = safe_abs64(b);
+
+        if (tmp <= a)
+        {
+            *ret = a - tmp;
+            return true;
+        }
+    }
+    else
+    {
+        // now we know that rhs can be safely cast into an std::uint64_t
+        tmp = (uint64_t)a + (uint64_t)b;
+
+        // We added and it did not become smaller
+        if (tmp >= a)
+        {
+            *ret = tmp;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_add_uint64_uint64(uint64_t a, uint64_t b)
+{
+    uint64_t tmp = a + b;
+
+    if(tmp < a)
+        safe_math_fail("safe_math_fail safe_add_uint64_uint64");
+
+    return tmp;
+}
+
+static inline bool check_add_uint64_uint64(uint64_t a, uint64_t b, uint64_t* ret)
+{
+    uint64_t tmp = a + b;
+    *ret = tmp;
+    return (tmp >= a);
+}
+
+// As we're working in C, use defines
+// It would be nice to use an enum, but the compiler 
+// will complain that it isn't a proper C++ enum
+#define SAFE_INT_MUL_FAIL 0
+#define SAFE_INT_MUL_SUCCESS 1
+
+// Multiplication primatives
+#if SAFEINT_MULTIPLY_METHOD == SAFEINT_MULTIPLY_INT128
+
+static inline int MultiplyUint64(uint64_t a, uint64_t b, uint64_t* pRet)
+{
+    unsigned __int128 tmp = (unsigned __int128)a * (unsigned __int128)b;
+
+    if ((tmp >> 64) == 0)
+    {
+        *pRet = (uint64_t)tmp;
+        return SAFE_INT_MUL_SUCCESS;
+    }
+
+    return SAFE_INT_MUL_FAIL;
+}
+
+static inline int MultiplyInt64(int64_t a, int64_t b, int64_t* pRet)
+{
+    __int128 tmp = (__int128)a * (__int128)b;
+    int64_t tmp_high = (int64_t)((unsigned __int128)tmp >> 64);
+    *pRet = (int64_t)tmp;
+
+    // If only one input is negative, result must be negative, or zero
+    if ((a ^ b) < 0)
+    {
+        if ((tmp_high == -1 && *pRet < 0) ||
+            (tmp_high == 0 && *pRet == 0))
+        {
+            return SAFE_INT_MUL_SUCCESS;
+        }
+    }
+    else
+    {
+        if (tmp_high == 0 && (uint64_t)*pRet <= (uint64_t)INT64_MAX)
+        {
+            return SAFE_INT_MUL_SUCCESS;
+        }
+    }
+
+    return SAFE_INT_MUL_FAIL;
+}
+
+#elif SAFEINT_MULTIPLY_METHOD == SAFEINT_MULTIPLY_INTRINSICS // Implies Visual Studio compiler
+
+// As usual, unsigned is easy
+static inline int MultiplyUint64(uint64_t a, uint64_t b, uint64_t * pRet)
+{
+    uint64_t ulHigh = 0;
+    *pRet = _umul128(a, b, &ulHigh);
+    return ulHigh == 0 ? SAFE_INT_MUL_SUCCESS : SAFE_INT_MUL_FAIL;
+}
+
+// Signed, is not so easy
+static inline int MultiplyInt64(int64_t a, int64_t b, int64_t* pRet)
+{
+    int64_t llHigh = 0;
+    *pRet = _mul128(a, b, &llHigh);
+
+    // Now we need to figure out what we expect
+    // If llHigh is 0, then treat *pRet as unsigned
+    // If llHigh is < 0, then treat *pRet as signed
+
+    if ((a ^ b) < 0)
+    {
+        // Negative (or zero) result expected
+        if (llHigh == -1 && *pRet < 0 ||
+            llHigh == 0 && *pRet == 0)
+        {
+            // Everything is within range
+            return SAFE_INT_MUL_SUCCESS;
+        }
+    }
+    else
+    {
+        // Result should be positive
+        // Check for overflow
+        if (llHigh == 0 && (uint64_t)*pRet <= (uint64_t)INT64_MAX)
+            return SAFE_INT_MUL_SUCCESS;
+    }
+    return SAFE_INT_MUL_FAIL;
+}
+#elif SAFEINT_MULTIPLY_METHOD == SAFEINT_MULTIPLY_BUILTIN // Implies gcc or clang
+
+static inline int MultiplyUint64(uint64_t a, uint64_t b, uint64_t* pRet)
+{
+    return !__builtin_umulll_overflow(a, b, (unsigned long long*)pRet) ? SAFE_INT_MUL_SUCCESS : SAFE_INT_MUL_FAIL;
+}
+
+static inline int MultiplyInt64(int64_t a, int64_t b, int64_t* pRet)
+{
+    return !__builtin_smulll_overflow(a, b, (long long*)pRet) ? SAFE_INT_MUL_SUCCESS : SAFE_INT_MUL_FAIL;
+}
+
+#elif SAFEINT_MULTIPLY_METHOD == SAFEINT_MULTIPLY_MATH // Just going to have to do the math...
+
+static inline int MultiplyUint64(uint64_t a, uint64_t b, uint64_t* pRet)
+{
+    uint32_t a_high = a >> 32;
+    uint32_t a_low = (uint32_t)a;
+    uint32_t b_high = b >> 32;
+    uint32_t b_low = (uint32_t)b;
+    uint64_t tmp = 0;
+    uint64_t tmp2 = 0;
+
+    /*
+    * Now we have the equivalent of (a_high * 2^32 + a_low) * (b_high * 2^32 + b_low)
+    * Expanding:
+    * result = a_high * b_high * 2^64 + a_high * b_low * 2^32 + b_high * a_low * 2^32 + a_low * b_low
+    * We now get to short circult some things - if a_high > 0 && b_high > 0, fail
+    * and this then implies that only one of the two middle expressions must be evaluated and checked if the result is >= 2^32
+    * finally, do the last term, check addition
+    */
+
+    if (a_high > 0 && b_high > 0)
+    {
+        return SAFE_INT_MUL_FAIL;
+    }
+
+    if (a_high > 0)
+    {
+        tmp = (uint64_t)a_high * b_low;
+    }
+    else
+    {
+        tmp = (uint64_t)b_high * a_low;
+    }
+
+    if (tmp >> 32 != 0)
+    {
+        return SAFE_INT_MUL_FAIL;
+    }
+
+    tmp2 = (uint64_t)a_low * b_low;
+    *pRet = (tmp << 32) + tmp2;
+    return *pRet >= tmp2 ? SAFE_INT_MUL_SUCCESS : SAFE_INT_MUL_FAIL;
+}
+
+static inline int MultiplyInt64(int64_t a, int64_t b, int64_t* pRet)
+{
+    bool aNegative = false;
+    bool bNegative = false;
+
+    uint64_t tmp = 0;
+    int64_t a1 = a;
+    int64_t b1 = b;
+
+    if (a1 < 0)
+    {
+        aNegative = true;
+        a1 = (int64_t)safe_abs64(a1);
+    }
+
+    if (b1 < 0)
+    {
+        bNegative = true;
+        b1 = (int64_t)safe_abs64(b);
+    }
+
+    if (MultiplyUint64((uint64_t)a1, (uint64_t)b1, &tmp))
+    {
+        // The unsigned multiplication didn't overflow
+        if (aNegative ^ bNegative)
+        {
+            // Result must be negative
+            if (tmp <= (uint64_t)INT64_MIN)
+            {
+                *pRet = (int64_t)negate64((int64_t)tmp);
+                return SAFE_INT_MUL_SUCCESS;
+            }
+        }
+        else
+        {
+            // Result must be positive
+            if (tmp <= (uint64_t)INT64_MAX)
+            {
+                *pRet = (int64_t)tmp;
+                return SAFE_INT_MUL_SUCCESS;
+            }
+        }
+    }
+
+    return SAFE_INT_MUL_FAIL;
+}
+
+#else // Shouldn't happen, go find out what's broken
+// If you are aware of intrinsics for some other platform, please file an issue
+# error Intrinsics enabled, no available intrinics defined
+#endif
+
+static inline int32_t safe_mul_int32_int32(int32_t a, int32_t b)
+{
+    int64_t tmp = (int64_t)a * (int64_t)b;
+    return safe_cast_int32_int64(tmp);
+}
+
+static inline bool check_mul_int32_int32(int32_t a, int32_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)a * (int64_t)b;
+    *ret = (int32_t)tmp;
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_mul_int32_uint32(int32_t a, uint32_t b)
+{
+    int64_t tmp = (int64_t)a * (int64_t)b;
+    return safe_cast_int32_int64(tmp);
+}
+
+static inline bool check_mul_int32_uint32(int32_t a, uint32_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)a * (int64_t)b;
+    *ret = (int32_t)tmp;
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_mul_int32_int64(int32_t a, int64_t b)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64((int64_t)a, b, &tmp))
+    {
+        return safe_cast_int32_int64(tmp);
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_int32_int64");
+}
+
+static inline bool check_mul_int32_int64(int32_t a, int64_t b, int32_t* ret)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64((int64_t)a, b, &tmp))
+    {
+        *ret = (int32_t)tmp;
+        return check_cast_int32_int64(tmp) == 0;
+    }
+
+    return false;
+}
+
+static inline int32_t safe_mul_int32_uint64(int32_t a, uint64_t b)
+{
+    uint64_t tmp = 0;
+    if (a < 0)
+    {
+        // Flip sign, use the unsigned function
+        uint64_t a2 = safe_abs64(a);
+        if (MultiplyUint64(a2, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= (uint64_t)INT32_MAX + 1)
+        {
+            // Not too big, flip it back
+            return (int32_t)(tmp + 1);
+        }
+    }
+    else
+    {
+        if (MultiplyUint64((uint64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= INT32_MAX)
+        {
+            return (int32_t)tmp;
+        }
+    }
+ 
+    safe_math_fail("safe_math_fail safe_mul_int32_uint64");
+}
+
+static inline bool check_mul_int32_uint64(int32_t a, uint64_t b, int32_t* ret)
+{
+    uint64_t tmp = 0;
+    if (a < 0)
+    {
+        // Flip sign, use the unsigned function
+        uint64_t a2 = safe_abs64(a);
+        if (MultiplyUint64(a2, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= (uint64_t)INT32_MAX + 1)
+        {
+            // Not too big, flip it back
+            *ret = (int32_t)(tmp + 1);
+            return true;
+        }
+    }
+    else
+    {
+        if (MultiplyUint64((uint64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= INT32_MAX)
+        {
+            *ret = (int32_t)tmp;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_mul_uint32_int32(uint32_t a, int32_t b)
+{
+    int64_t tmp = (int64_t)a * (int64_t)b;
+    return safe_cast_uint32_int64(tmp);
+}
+
+static inline bool check_mul_uint32_int32(uint32_t a, int32_t b, uint32_t* ret)
+{
+    int64_t tmp = (int64_t)a * (int64_t)b;
+    *ret = (uint32_t)tmp;
+    return check_cast_uint32_int64(tmp) == 0;
+}
+
+static inline uint32_t safe_mul_uint32_uint32(uint32_t a, uint32_t b)
+{
+    uint64_t tmp = (uint64_t)a * (uint64_t)b;
+    return safe_cast_uint32_uint64(tmp);
+}
+
+static inline bool check_mul_uint32_uint32(uint32_t a, uint32_t b, uint32_t* ret)
+{
+    uint64_t tmp = (uint64_t)a * (uint64_t)b;
+    *ret = (uint32_t)tmp;
+    return check_cast_uint32_uint64(tmp) == 0;
+}
+
+static inline uint32_t safe_mul_uint32_int64(uint32_t a, int64_t b)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64((int64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= UINT32_MAX && tmp >= 0)
+    {
+        return (uint32_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_uint32_int64");
+}
+
+static inline bool check_mul_uint32_int64(uint32_t a, int64_t b, uint32_t* ret)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64((int64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= UINT32_MAX && tmp >= 0)
+    {
+        *ret = (uint32_t)tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_mul_uint32_uint64(uint32_t a, uint64_t b)
+{
+    uint64_t tmp = 0;
+
+    if (MultiplyUint64((uint64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= UINT32_MAX)
+    {
+        return (uint32_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_uint32_uint64");
+}
+
+static inline bool check_mul_uint32_uint64(uint32_t a, uint64_t b, uint32_t* ret)
+{
+    uint64_t tmp = 0;
+
+    if (MultiplyUint64((uint64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= UINT32_MAX)
+    {
+        *ret = (uint32_t)tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline int64_t safe_mul_int64_int32(int64_t a, int32_t b)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64(a, (int64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_int64_int32");
+}
+
+static inline bool check_mul_int64_int32(int64_t a, int32_t b, int64_t* ret)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64(a, (int64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        *ret = tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline int64_t safe_mul_int64_uint32(int64_t a, uint32_t b)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64(a, (int64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_int64_uint32");
+}
+
+static inline bool check_mul_int64_uint32(int64_t a, uint32_t b, int64_t* ret)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64(a, (int64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        *ret = tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline int64_t safe_mul_int64_int64(int64_t a, int64_t b)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64(a, b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_int64_int64");
+}
+
+static inline bool check_mul_int64_int64(int64_t a, int64_t b, int64_t* ret)
+{
+    int64_t tmp = 0;
+
+    if (MultiplyInt64(a, b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        *ret = tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline int64_t safe_mul_int64_uint64(int64_t a, uint64_t b)
+{
+    uint64_t tmp = 0;
+
+    if (a < 0)
+    {
+        uint64_t a2 = safe_abs64(a);
+
+        if (MultiplyUint64(a2, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= (uint64_t)0x8000000000000000)
+        {
+            return negate64((int64_t)tmp);
+        }
+    }
+    else
+    {
+        if (MultiplyUint64((uint64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= (uint64_t)INT64_MAX)
+        {
+            return (int64_t)tmp;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_int64_uint64");
+}
+
+static inline bool check_mul_int64_uint64(int64_t a, uint64_t b, int64_t* ret)
+{
+    uint64_t tmp = 0;
+
+    if (a < 0)
+    {
+        uint64_t a2 = safe_abs64(a);
+
+        if (MultiplyUint64(a2, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= (uint64_t)0x8000000000000000)
+        {
+            *ret = negate64((int64_t)tmp);
+            return true;
+        }
+    }
+    else
+    {
+        if (MultiplyUint64((uint64_t)a, b, &tmp) == SAFE_INT_MUL_SUCCESS && tmp <= (uint64_t)INT64_MAX)
+        {
+            *ret = (int64_t)tmp;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_mul_uint64_int32(uint64_t a, int32_t b)
+{
+    uint64_t tmp;
+
+    if (b < 0)
+    {
+        if (a == 0)
+            return 0;
+
+        safe_math_fail("safe_math_fail safe_mul_uint64_int32");
+    }
+   
+    if (MultiplyUint64(a, (uint64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_uint64_int32");
+}
+
+static inline bool check_mul_uint64_int32(uint64_t a, int32_t b, uint64_t* ret)
+{
+    uint64_t tmp;
+
+    if (b < 0)
+    {
+        if (a == 0)
+        {
+            *ret = 0;
+            return true;
+        }
+
+        return false;
+    }
+
+    if (MultiplyUint64(a, (uint64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        *ret = tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_mul_uint64_uint32(uint64_t a, uint32_t b)
+{
+    uint64_t tmp;
+
+    if (MultiplyUint64(a, (uint64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_uint64_uint32");
+}
+
+static inline bool check_mul_uint64_uint32(uint64_t a, uint32_t b, uint64_t* ret)
+{
+    uint64_t tmp;
+
+    if (MultiplyUint64(a, (uint64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        *ret = tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_mul_uint64_int64(uint64_t a, int64_t b)
+{
+    uint64_t tmp;
+
+    if (b < 0)
+    {
+        if (a == 0)
+            return 0;
+
+        safe_math_fail("safe_math_fail safe_mul_uint64_int32");
+    }
+
+    if (MultiplyUint64(a, (uint64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_uint64_int64");
+}
+
+static inline bool check_mul_uint64_int64(uint64_t a, int64_t b, uint64_t* ret)
+{
+    uint64_t tmp;
+
+    if (b < 0)
+    {
+        if (a == 0)
+        {
+            *ret = 0;
+            return true;
+        }
+
+        return false;
+    }
+
+    if (MultiplyUint64(a, (uint64_t)b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        *ret = tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_mul_uint64_uint64(uint64_t a, uint64_t b)
+{
+    uint64_t tmp;
+
+    if (MultiplyUint64(a, b, &tmp) == SAFE_INT_MUL_SUCCESS)
+    {
+        return tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_mul_uint64_uint64");
+}
+
+static inline bool check_mul_uint64_uint64(uint64_t a, uint64_t b, uint64_t* ret)
+{
+    return (MultiplyUint64(a, b, ret) == SAFE_INT_MUL_SUCCESS);
+}
+
+static inline int32_t safe_div_int32_int32(int32_t a, int32_t b)
+{
+    if (b != 0 && !(a == INT32_MIN && b == -1))
+    {
+        return a / b;
+    }
+    safe_math_fail("safe_math_fail safe_div_int32_int32");
+}
+
+static inline bool check_div_int32_int32(int32_t a, int32_t b, int32_t* ret)
+{
+    if (b != 0 && !(a == INT32_MIN && b == -1))
+    {
+        *ret = a / b;
+        return true;
+    }
+    return false;
+}
+
+static inline int32_t safe_div_int32_uint32(int32_t a, uint32_t b)
+{
+    if (b != 0)
+    {
+        return (int32_t)((int64_t)a / (int64_t)b);
+    }
+    safe_math_fail("safe_math_fail safe_div_int32_uint32");
+}
+
+static inline bool check_div_int32_uint32(int32_t a, uint32_t b, int32_t* ret)
+{
+    if (b != 0)
+    {
+        *ret = (int32_t)((int64_t)a / (int64_t)b);
+        return true;
+    }
+
+    return false;
+}
+
+static inline int32_t safe_div_int32_int64(int32_t a, int64_t b)
+{
+    if (b != 0 && !(a == INT32_MIN && b == -1))
+    {
+        return (int32_t)(a / b);
+    }
+    safe_math_fail("safe_math_fail safe_div_int32_int64");
+}
+
+static inline bool check_div_int32_int64(int32_t a, int64_t b, int32_t* ret)
+{
+    if (b != 0 && !(a == INT32_MIN && b == -1))
+    {
+        *ret = (int32_t)(a / b);
+        return true;
+    }
+
+    return false;
+}
+
+static inline int32_t safe_div_int32_uint64(int32_t a, uint64_t b)
+{
+    if (b == 0)
+    {
+        safe_math_fail("safe_math_fail safe_div_int32_uint64");
+    }
+
+    if (a > 0)
+    {
+        return (int32_t)((uint64_t)a / b);
+    }
+    else
+    {
+        uint64_t a2 = (uint64_t)safe_abs32(a);
+        a2 /= b;
+        return (int32_t)negate32((int32_t)a2);
+    }
+}
+
+static inline bool check_div_int32_uint64(int32_t a, uint64_t b, int32_t* ret)
+{
+    if (b == 0)
+    {
+        return false;
+    }
+
+    if (a > 0)
+    {
+        *ret = (int32_t)((uint64_t)a / b);
+        return true;
+    }
+    else
+    {
+        uint64_t a2 = (uint64_t)safe_abs32(a);
+        a2 /= b;
+        *ret = (int32_t)negate32((int32_t)a2);
+        return true;
+    }
+}
+
+static inline uint32_t safe_div_uint32_int32(uint32_t a, int32_t b)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        safe_math_fail("safe_math_fail safe_div_uint32_int32");
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        return 0;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        return (a / (uint32_t)b);
+    }
+    else // now have to check magnitude
+    {
+        uint32_t tmp = safe_abs32(b);
+
+        if (a < tmp)
+        {
+            return 0;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_div_uint32_int32");
+}
+
+static inline bool check_div_uint32_int32(uint32_t a, int32_t b, uint32_t* ret)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        return false;
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        *ret = 0;
+        return true;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        *ret = (a / (uint32_t)b);
+        return true;
+    }
+    else // now have to check magnitude
+    {
+        uint32_t tmp = safe_abs32(b);
+
+        if (a < tmp)
+        {
+            *ret = 0;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_div_uint32_uint32(uint32_t a, uint32_t b)
+{
+    if (b > 0)
+    {
+        return (uint32_t)(a / b);
+    }
+    safe_math_fail("safe_math_fail safe_div_uint32_uint32");
+}
+
+static inline bool check_div_uint32_uint32(uint32_t a, uint32_t b, uint32_t* ret)
+{
+    if (b > 0)
+    {
+        *ret = (uint32_t)(a / b);
+        return true;
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_div_uint32_int64(uint32_t a, int64_t b)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        safe_math_fail("safe_math_fail safe_div_uint32_int64");
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        return 0;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        return (uint32_t)(a / b);
+    }
+    else // now have to check magnitude
+    {
+        uint64_t tmp = safe_abs64(b);
+
+        if (a < tmp)
+        {
+            return 0;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_div_uint32_int64");
+}
+
+static inline bool check_div_uint32_int64(uint32_t a, int64_t b, uint32_t* ret)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        return false;
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        *ret = 0;
+        return true;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        *ret = (uint32_t)(a / b);
+        return true;
+    }
+    else // now have to check magnitude
+    {
+        uint64_t tmp = safe_abs64(b);
+
+        if (a < tmp)
+        {
+            *ret = 0;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_div_uint32_uint64(uint32_t a, uint64_t b)
+{
+    if (b > 0)
+    {
+        return (uint32_t)(a / b);
+    }
+    safe_math_fail("safe_math_fail safe_div_uint32_uint64");
+}
+
+static inline bool check_div_uint32_uint64(uint32_t a, uint64_t b, uint32_t* ret)
+{
+    if (b > 0)
+    {
+        *ret = (uint32_t)(a / b);
+        return true;
+    }
+    return false;
+}
+
+static inline int64_t safe_div_int64_int32(int64_t a, int32_t b)
+{
+    if(b == 0 || (b == -1 && a == INT64_MIN))
+        safe_math_fail("safe_math_fail safe_div_int64_int32");
+
+    return a / b;
+}
+
+static inline bool check_div_int64_int32(int64_t a, int32_t b, int64_t* ret)
+{
+    if (b == 0 || (b == -1 && a == INT64_MIN))
+        return false;
+
+    *ret = a / b;
+    return true;
+}
+
+static inline int64_t safe_div_int64_uint32(int64_t a, uint32_t b)
+{
+    if (b == 0)
+        safe_math_fail("safe_math_fail safe_div_int64_int32");
+
+    return a / b;
+}
+
+static inline bool check_div_int64_uint32(int64_t a, uint32_t b, int64_t* ret)
+{
+    if (b == 0)
+        return false;
+
+    *ret = a / b;
+    return true;
+}
+
+static inline int64_t safe_div_int64_int64(int64_t a, int64_t b)
+{
+    if (b == 0 || (b == -1 && a == INT64_MIN))
+        safe_math_fail("safe_math_fail safe_div_int64_int32");
+
+    return a / b;
+}
+
+static inline bool check_div_int64_int64(int64_t a, int64_t b, int64_t* ret)
+{
+    if (b == 0 || (b == -1 && a == INT64_MIN))
+        return false;
+
+    *ret = a / b;
+    return true;
+
+}
+
+static inline int64_t safe_div_int64_uint64(int64_t a, uint64_t b)
+{
+    if (b == 0)
+        safe_math_fail("safe_math_fail safe_div_int64_int32");
+
+    if(a >= 0)
+    {
+        return (int64_t)((uint64_t)a / b);
+    }
+    else
+    {
+        // Need to get the magnitude, divide, and then negate
+        uint64_t tmp = safe_abs64(a);
+        tmp /= b;
+        return negate64((int64_t)tmp);
+    }
+}
+
+static inline bool check_div_int64_uint64(int64_t a, uint64_t b, int64_t* ret)
+{
+    if (b == 0)
+        return false;
+
+    if(a >= 0)
+    {
+        *ret = (int64_t)((uint64_t)a / b);
+    }
+    else
+    {
+        // Need to get the magnitude, divide, and then negate
+        uint64_t tmp = safe_abs64(a);
+        tmp /= b;
+        *ret = negate64((int64_t)tmp);
+    }
+        return true;
+}
+
+static inline uint64_t safe_div_uint64_int32(uint64_t a, int32_t b)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        safe_math_fail("safe_math_fail safe_div_int64_int32");
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        return 0;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        return a / (uint64_t)b;
+    }
+    else // now have to check magnitude
+    {
+        uint32_t tmp = safe_abs32(b);
+
+        if (a < tmp)
+        {
+            return 0;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_div_int64_int32");
+}
+
+static inline bool check_div_uint64_int32(uint64_t a, int32_t b, uint64_t* ret)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        return false;
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        *ret = 0;
+        return true;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        *ret = a / (uint64_t)b;
+        return true;
+    }
+    else // now have to check magnitude
+    {
+        uint32_t tmp = safe_abs32(b);
+
+        if (a < tmp)
+        {
+            *ret = 0;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_div_uint64_uint32(uint64_t a, uint32_t b)
+{
+    if (b != 0)
+        return a / b;
+
+    safe_math_fail("safe_math_fail safe_div_int64_uint32");
+}
+
+static inline bool check_div_uint64_uint32(uint64_t a, uint32_t b, uint64_t* ret)
+{
+    if (b != 0)
+    {
+        *ret = a / b;
+        return true;
+    }
+    
+    return false;
+}
+
+static inline uint64_t safe_div_uint64_int64(uint64_t a, int64_t b)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        safe_math_fail("safe_math_fail safe_div_int64_int32");
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        return 0;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        return a / (uint64_t)b;
+    }
+    else // now have to check magnitude
+    {
+        uint64_t tmp = safe_abs64(b);
+
+        if (a < tmp)
+        {
+            return 0;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_div_int64_int32");
+}
+
+static inline bool check_div_uint64_int64(uint64_t a, int64_t b, uint64_t* ret)
+{
+    // Follow original SafeInt logic for this case
+    if (b == 0) // div 0 always a problem
+    {
+        return false;
+    }
+
+    if (a == 0) // zero divided by anything is zero
+    {
+        *ret = 0;
+        return true;
+    }
+
+    if (b > 0) // if b is positive, just do the math
+    {
+        *ret = a / (uint64_t)b;
+        return true;
+    }
+    else // now have to check magnitude
+    {
+        uint64_t tmp = safe_abs64(b);
+
+        if (a < tmp)
+        {
+            *ret = 0;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_div_uint64_uint64(uint64_t a, uint64_t b)
+{
+    if (b != 0)
+        return a / b;
+
+    safe_math_fail("safe_math_fail safe_div_int64_uint32");
+}
+
+static inline bool check_div_uint64_uint64(uint64_t a, uint64_t b, uint64_t* ret)
+{
+    if (b != 0)
+    {
+        *ret = a / b;
+        return true;
+    }
+
+    return false;
+}
+
+static inline int32_t safe_sub_int32_int32(int32_t a, int32_t b)
+{
+    int64_t tmp = (int64_t)a - (int64_t)b;
+    return safe_cast_int32_int64(tmp);
+}
+
+static inline bool check_sub_int32_int32(int32_t a, int32_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)a - (int64_t)b;
+    *ret = (int32_t)tmp;
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_sub_int32_uint32(int32_t a, uint32_t b)
+{
+    int64_t tmp = (int64_t)a - (int64_t)b;
+    return safe_cast_int32_int64(tmp);
+}
+
+static inline bool check_sub_int32_uint32(int32_t a, uint32_t b, int32_t* ret)
+{
+    int64_t tmp = (int64_t)a - (int64_t)b;
+    *ret = (int32_t)tmp;
+    return check_cast_int32_int64(tmp) == 0;
+}
+
+static inline int32_t safe_sub_int32_int64(int32_t a, int64_t b)
+{
+    // We have 4 fairly complex cases:
+    // lhs positive, rhs positive - rhs could be larger than lhs can represent
+    // lhs positive, rhs negative - additive case - check tmp >= lhs and tmp > max int
+    // lhs negative, rhs positive - check tmp <= lhs and tmp < min int
+    // lhs negative, rhs negative - addition cannot internally overflow, check against max
+
+    int64_t tmp = (int64_t)((uint64_t)a - (uint64_t)b);
+
+    if (a >= 0)
+    {
+        // first case
+        if (b >= 0)
+        {
+            if (tmp >= INT32_MIN)
+            {
+                return (int32_t)tmp;
+            }
+        }
+        else
+        {
+            // second case
+            if (tmp >= a && tmp <= INT32_MAX)
+            {
+                return (int32_t)tmp;
+            }
+        }
+    }
+    else
+    {
+        // lhs < 0
+        // third case
+        if (b >= 0)
+        {
+            if (tmp <= a && tmp >= INT32_MIN)
+            {
+                return (int32_t)tmp;
+            }
+        }
+        else
+        {
+            // fourth case
+            if (tmp <= INT32_MAX)
+            {
+                return (int32_t)tmp;
+            }
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_int32_int64");
+}
+
+static inline bool check_sub_int32_int64(int32_t a, int64_t b, int32_t* ret)
+{
+    // See above for documentation
+    int64_t tmp = (int64_t)((uint64_t)a - (uint64_t)b);
+
+    if (a >= 0)
+    {
+        // first case
+        if (b >= 0)
+        {
+            if (tmp >= INT32_MIN)
+            {
+                *ret = (int32_t)tmp;
+                return true;
+            }
+        }
+        else
+        {
+            // second case
+            if (tmp >= a && tmp <= INT32_MAX)
+            {
+                *ret = (int32_t)tmp;
+                return true;
+            }
+        }
+    }
+    else
+    {
+        // lhs < 0
+        // third case
+        if (b >= 0)
+        {
+            if (tmp <= a && tmp >= INT32_MIN)
+            {
+                *ret = (int32_t)tmp;
+                return true;
+            }
+        }
+        else
+        {
+            // fourth case
+            if (tmp <= INT32_MAX)
+            {
+                *ret = (int32_t)tmp;
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+static inline int32_t safe_sub_int32_uint64(int32_t a, uint64_t b)
+{
+    // We need the absolute value of INT32_MIN
+    // This will give it to us without extraneous compiler warnings
+    const uint64_t AbsMinInt32 = (uint64_t)INT32_MAX + 1;
+
+    if (a < 0)
+    {
+        if (b <= AbsMinInt32 - safe_abs32(a))
+        {
+            return (int32_t)(a - (int64_t)b);
+        }
+    }
+    else
+    {
+        if (b <= AbsMinInt32 + (uint64_t)a)
+        {
+            return (int32_t)(a - (int64_t)b);
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_int32_uint64");
+}
+
+static inline bool check_sub_int32_uint64(int32_t a, uint64_t b, int32_t* ret)
+{
+    // We need the absolute value of INT32_MIN
+    // This will give it to us without extraneous compiler warnings
+    const uint64_t AbsMinInt32 = (uint64_t)INT32_MAX + 1;
+
+    if (a < 0)
+    {
+        if (b <= AbsMinInt32 - safe_abs32(a))
+        {
+            *ret = (int32_t)(a - (int64_t)b);
+            return true;
+        }
+    }
+    else
+    {
+        if (b <= AbsMinInt32 + (uint64_t)a)
+        {
+            *ret = (int32_t)((int64_t)a - (int64_t)b);
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_sub_uint32_int32(uint32_t a, int32_t b)
+{
+    int64_t tmp = (int64_t)a - (int64_t)b;
+    return safe_cast_uint32_int64(tmp);
+}
+
+static inline bool check_sub_uint32_int32(uint32_t a, int32_t b, uint32_t* ret)
+{
+    int64_t tmp = (int64_t)a - (int64_t)b;
+    *ret = (uint32_t)tmp;
+    return check_cast_uint32_int64(tmp) == 0;
+}
+
+static inline uint32_t safe_sub_uint32_uint32(uint32_t a, uint32_t b)
+{
+    if (a >= b)
+        return a - b;
+
+    safe_math_fail("safe_math_fail safe_sub_uint32_uint32");
+}
+
+static inline bool check_sub_uint32_uint32(uint32_t a, uint32_t b, uint32_t* ret)
+{
+    if (a >= b)
+    {
+        *ret = a - b;
+        return true;
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_sub_uint32_int64(uint32_t a, int64_t b)
+{
+    // must first see if rhs is positive or negative
+    if (b >= 0)
+    {
+        if ((uint64_t)b <= a)
+        {
+            return (uint32_t)(a - (uint32_t)b);
+        }
+    }
+    else
+    {
+        // we're now effectively adding
+        // since lhs is 32-bit, and rhs cannot exceed 2^63
+        // this addition cannot overflow
+        uint64_t tmp = a + (uint64_t)negate64(b); // negation safe
+
+        // but we could exceed UINT32_MAX
+        if (tmp <= UINT32_MAX)
+        {
+            return (uint32_t)tmp;
+        }
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_uint32_int64");
+}
+
+static inline bool check_sub_uint32_int64(uint32_t a, int64_t b, uint32_t* ret)
+{
+    // must first see if rhs is positive or negative
+    if (b >= 0)
+    {
+        if ((uint64_t)b <= a)
+        {
+            *ret = (uint32_t)(a - (uint32_t)b);
+            return true;
+        }
+    }
+    else
+    {
+        // we're now effectively adding
+        // since lhs is 32-bit, and rhs cannot exceed 2^63
+        // this addition cannot overflow
+        uint64_t tmp = a + (uint64_t)negate64(b); // negation safe
+
+        // but we could exceed UINT32_MAX
+        if (tmp <= UINT32_MAX)
+        {
+            *ret = (uint32_t)tmp;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint32_t safe_sub_uint32_uint64(uint32_t a, uint64_t b)
+{
+    if (a >= b)
+        return (uint32_t)(a - b);
+
+    safe_math_fail("safe_math_fail safe_sub_uint32_uint64");
+}
+
+static inline bool check_sub_uint32_uint64(uint32_t a, uint64_t b, uint32_t* ret)
+{
+    if (a >= b)
+    {
+        *ret = (uint32_t)(a - b);
+        return true;
+    }
+    return false;
+}
+
+static inline int64_t safe_sub_int64_int32(int64_t a, int32_t b)
+{
+    // we have essentially 4 cases:
+    //
+    // 1) lhs positive, rhs positive - overflow not possible
+    // 2) lhs positive, rhs negative - equivalent to addition - result >= lhs or error
+    // 3) lhs negative, rhs positive - check result <= lhs
+    // 4) lhs negative, rhs negative - overflow not possible
+
+    int64_t tmp = (int64_t)((uint64_t)a - (uint64_t)b);
+
+    // Note - ideally, we can order these so that true conditionals
+    // lead to success, which enables better pipelining
+    // It isn't practical here
+    if ((a >= 0 && b < 0 && tmp < a) || // condition 2
+        (b >= 0 && tmp > a))              // condition 3
+    {
+        safe_math_fail("safe_math_fail safe_sub_int64_int32");
+    }
+
+    return tmp;
+}
+
+static inline bool check_sub_int64_int32(int64_t a, int32_t b, int64_t* ret)
+{
+    int64_t tmp = (int64_t)((uint64_t)a - (uint64_t)b);
+
+    // Note - ideally, we can order these so that true conditionals
+    // lead to success, which enables better pipelining
+    // It isn't practical here
+    if ((a >= 0 && b < 0 && tmp < a) || // condition 2
+        (b >= 0 && tmp > a))              // condition 3
+    {
+        return false;
+    }
+
+    *ret = tmp;
+    return true;
+}
+
+static inline int64_t safe_sub_int64_uint32(int64_t a, uint32_t b)
+{
+    // lhs is a 64-bit int, rhs unsigned int32 or smaller
+    // perform test as unsigned to prevent unwanted optimizations
+    uint64_t tmp = (uint64_t)a - (uint64_t)b;
+
+    if ((int64_t)tmp <= a)
+    {
+        return (int64_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_int64_int64");
+}
+
+static inline bool check_sub_int64_uint32(int64_t a, uint32_t b, int64_t* ret)
+{
+    uint64_t tmp = (uint64_t)a - (uint64_t)b;
+
+    if ((int64_t)tmp <= a)
+    {
+        *ret = (int64_t)tmp;
+        return true;
+    }
+
+    return false;
+}
+
+static inline int64_t safe_sub_int64_int64(int64_t a, int64_t b)
+{
+    // we have essentially 4 cases:
+    //
+    // 1) lhs positive, rhs positive - overflow not possible
+    // 2) lhs positive, rhs negative - equivalent to addition - result >= lhs or error
+    // 3) lhs negative, rhs positive - check result <= lhs
+    // 4) lhs negative, rhs negative - overflow not possible
+
+    int64_t tmp = (int64_t)((uint64_t)a - (uint64_t)b);
+
+    // Note - ideally, we can order these so that true conditionals
+    // lead to success, which enables better pipelining
+    // It isn't practical here
+    if ((a >= 0 && b < 0 && tmp < a) || // condition 2
+        (b >= 0 && tmp > a))              // condition 3
+    {
+        safe_math_fail("safe_math_fail safe_sub_int64_int64");
+    }
+
+    return tmp;
+}
+
+static inline bool check_sub_int64_int64(int64_t a, int64_t b, int64_t* ret)
+{
+    int64_t tmp = (int64_t)((uint64_t)a - (uint64_t)b);
+
+    // Note - ideally, we can order these so that true conditionals
+    // lead to success, which enables better pipelining
+    // It isn't practical here
+    if ((a >= 0 && b < 0 && tmp < a) || // condition 2
+        (b >= 0 && tmp > a))              // condition 3
+    {
+        return false;
+    }
+
+    *ret = tmp;
+    return true;
+}
+
+static inline int64_t safe_sub_int64_uint64(int64_t a, uint64_t b)
+{
+    // if we subtract, and it gets larger, there's a problem
+    // Perform test as unsigned to prevent unwanted optimizations
+    uint64_t tmp = (uint64_t)a - b;
+
+    if ((int64_t)tmp <= a)
+    {
+        return (int64_t)tmp;
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_int64_uint64");
+}
+
+static inline bool check_sub_int64_uint64(int64_t a, uint64_t b, int64_t* ret)
+{
+    uint64_t tmp = (uint64_t)a - b;
+    *ret = (int64_t)tmp;
+
+    return ((int64_t)tmp <= a);
+}
+
+static inline uint64_t safe_sub_uint64_int32(uint64_t a, int32_t b)
+{
+    // lhs is an uint64_t, rhs signed
+    // must first see if rhs is positive or negative
+    if (b >= 0)
+    {
+        if ((uint64_t)b <= a)
+        {
+            return (uint64_t)(a - (uint64_t)b);
+        }
+    }
+    else
+    {
+        uint64_t tmp = a;
+        // we're now effectively adding
+        uint64_t result = a + safe_abs64(b);
+
+        if (result >= tmp)
+            return result;
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_uint64_int32");
+}
+
+static inline bool check_sub_uint64_int32(uint64_t a, int32_t b, uint64_t* ret)
+{
+    if (b >= 0)
+    {
+        if ((uint64_t)b <= a)
+        {
+            *ret = (uint64_t)(a - (uint64_t)b);
+            return true;
+        }
+    }
+    else
+    {
+        uint64_t tmp = a;
+        // we're now effectively adding
+        uint64_t result = a + safe_abs64(b);
+
+        if (result >= tmp)
+        {
+            *ret = result;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_sub_uint64_uint32(uint64_t a, uint32_t b)
+{
+    uint64_t tmp = a - b;
+
+    if (tmp <= a)
+        return tmp;
+
+    safe_math_fail("safe_math_fail safe_sub_uint64_uint32");
+}
+
+static inline bool check_sub_uint64_uint32(uint64_t a, uint32_t b, uint64_t* ret)
+{
+    uint64_t tmp = a - b;
+    *ret = tmp;
+    return (tmp <= a);
+}
+
+static inline uint64_t safe_sub_uint64_int64(uint64_t a, int64_t b)
+{
+    uint64_t result = 0;
+
+    // must first see if rhs is positive or negative
+    if (b >= 0)
+    {
+        if ((uint64_t)b <= a)
+        {
+            return (a - (uint64_t)b);
+        }
+    }
+    else
+    {
+        // we're now effectively adding
+        result = a + safe_abs64(b);
+
+        if (result >= a)
+            return result;
+    }
+
+    safe_math_fail("safe_math_fail safe_sub_uint64_int64");
+}
+
+static inline bool check_sub_uint64_int64(uint64_t a, int64_t b, uint64_t* ret)
+{
+    uint64_t result = 0;
+
+    // must first see if rhs is positive or negative
+    if (b >= 0)
+    {
+        if ((uint64_t)b <= a)
+        {
+            *ret = (a - (uint64_t)b);
+            return true;
+        }
+    }
+    else
+    {
+        // we're now effectively adding
+        result = a + safe_abs64(b);
+
+        if (result >= a)
+        {
+            *ret = result;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static inline uint64_t safe_sub_uint64_uint64(uint64_t a, uint64_t b)
+{
+    uint64_t tmp = a - b;
+
+    if (tmp <= a)
+        return tmp;
+
+    safe_math_fail("safe_math_fail safe_sub_uint64_uint64");
+}
+
+static inline bool check_sub_uint64_uint64(uint64_t a, uint64_t b, uint64_t* ret)
+{
+    uint64_t tmp = a - b;
+    *ret = tmp;
+    return (tmp <= a);
+}
+
+#ifdef __cplusplus
+} 
+#endif
+
+#endif // C_SAFE_MATH_IMPL
diff --git a/pyarrow/include/arrow/vendored/strptime.h b/pyarrow/include/arrow/vendored/strptime.h
new file mode 100644
index 0000000000000000000000000000000000000000..764a4440ee4973dd6506c3c5d467e5c5a260e428
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/strptime.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <time.h>
+
+#include "arrow/util/visibility.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// A less featureful implementation of strptime() for platforms lacking
+// a standard implementation (e.g. Windows).
+ARROW_EXPORT char* arrow_strptime(const char* __restrict, const char* __restrict,
+                                  struct tm* __restrict);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
diff --git a/pyarrow/include/arrow/vendored/xxhash.h b/pyarrow/include/arrow/vendored/xxhash.h
new file mode 100644
index 0000000000000000000000000000000000000000..a33cdf8610dd6aa3b22bc24eef8ea3f871b8be9f
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/xxhash.h
@@ -0,0 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/vendored/xxhash/xxhash.h"
diff --git a/pyarrow/include/arrow/vendored/xxhash/xxhash.h b/pyarrow/include/arrow/vendored/xxhash/xxhash.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c819f93b79dfb03301206578377da96469c1ebe
--- /dev/null
+++ b/pyarrow/include/arrow/vendored/xxhash/xxhash.h
@@ -0,0 +1,7487 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2023 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at:
+ *   - xxHash homepage: https://www.xxhash.com
+ *   - xxHash source repository: https://github.com/Cyan4973/xxHash
+ */
+
+/*!
+ * @mainpage xxHash
+ *
+ * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed
+ * limits.
+ *
+ * It is proposed in four flavors, in three families:
+ * 1. @ref XXH32_family
+ *   - Classic 32-bit hash function. Simple, compact, and runs on almost all
+ *     32-bit and 64-bit systems.
+ * 2. @ref XXH64_family
+ *   - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most
+ *     64-bit systems (but _not_ 32-bit systems).
+ * 3. @ref XXH3_family
+ *   - Modern 64-bit and 128-bit hash function family which features improved
+ *     strength and performance across the board, especially on smaller data.
+ *     It benefits greatly from SIMD and 64-bit without requiring it.
+ *
+ * Benchmarks
+ * ---
+ * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04.
+ * The open source benchmark program is compiled with clang v10.0 using -O3 flag.
+ *
+ * | Hash Name            | ISA ext | Width | Large Data Speed | Small Data Velocity |
+ * | -------------------- | ------- | ----: | ---------------: | ------------------: |
+ * | XXH3_64bits()        | @b AVX2 |    64 |        59.4 GB/s |               133.1 |
+ * | MeowHash             | AES-NI  |   128 |        58.2 GB/s |                52.5 |
+ * | XXH3_128bits()       | @b AVX2 |   128 |        57.9 GB/s |               118.1 |
+ * | CLHash               | PCLMUL  |    64 |        37.1 GB/s |                58.1 |
+ * | XXH3_64bits()        | @b SSE2 |    64 |        31.5 GB/s |               133.1 |
+ * | XXH3_128bits()       | @b SSE2 |   128 |        29.6 GB/s |               118.1 |
+ * | RAM sequential read  |         |   N/A |        28.0 GB/s |                 N/A |
+ * | ahash                | AES-NI  |    64 |        22.5 GB/s |               107.2 |
+ * | City64               |         |    64 |        22.0 GB/s |                76.6 |
+ * | T1ha2                |         |    64 |        22.0 GB/s |                99.0 |
+ * | City128              |         |   128 |        21.7 GB/s |                57.7 |
+ * | FarmHash             | AES-NI  |    64 |        21.3 GB/s |                71.9 |
+ * | XXH64()              |         |    64 |        19.4 GB/s |                71.0 |
+ * | SpookyHash           |         |    64 |        19.3 GB/s |                53.2 |
+ * | Mum                  |         |    64 |        18.0 GB/s |                67.0 |
+ * | CRC32C               | SSE4.2  |    32 |        13.0 GB/s |                57.9 |
+ * | XXH32()              |         |    32 |         9.7 GB/s |                71.9 |
+ * | City32               |         |    32 |         9.1 GB/s |                66.0 |
+ * | Blake3*              | @b AVX2 |   256 |         4.4 GB/s |                 8.1 |
+ * | Murmur3              |         |    32 |         3.9 GB/s |                56.1 |
+ * | SipHash*             |         |    64 |         3.0 GB/s |                43.2 |
+ * | Blake3*              | @b SSE2 |   256 |         2.4 GB/s |                 8.1 |
+ * | HighwayHash          |         |    64 |         1.4 GB/s |                 6.0 |
+ * | FNV64                |         |    64 |         1.2 GB/s |                62.7 |
+ * | Blake2*              |         |   256 |         1.1 GB/s |                 5.1 |
+ * | SHA1*                |         |   160 |         0.8 GB/s |                 5.6 |
+ * | MD5*                 |         |   128 |         0.6 GB/s |                 7.8 |
+ * @note
+ *   - Hashes which require a specific ISA extension are noted. SSE2 is also noted,
+ *     even though it is mandatory on x64.
+ *   - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic
+ *     by modern standards.
+ *   - Small data velocity is a rough average of algorithm's efficiency for small
+ *     data. For more accurate information, see the wiki.
+ *   - More benchmarks and strength tests are found on the wiki:
+ *         https://github.com/Cyan4973/xxHash/wiki
+ *
+ * Usage
+ * ------
+ * All xxHash variants use a similar API. Changing the algorithm is a trivial
+ * substitution.
+ *
+ * @pre
+ *    For functions which take an input and length parameter, the following
+ *    requirements are assumed:
+ *    - The range from [`input`, `input + length`) is valid, readable memory.
+ *      - The only exception is if the `length` is `0`, `input` may be `NULL`.
+ *    - For C++, the objects must have the *TriviallyCopyable* property, as the
+ *      functions access bytes directly as if it was an array of `unsigned char`.
+ *
+ * @anchor single_shot_example
+ * **Single Shot**
+ *
+ * These functions are stateless functions which hash a contiguous block of memory,
+ * immediately returning the result. They are the easiest and usually the fastest
+ * option.
+ *
+ * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits()
+ *
+ * @code{.c}
+ *   #include <string.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which hashes a null terminated string with XXH32().
+ *   XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed)
+ *   {
+ *       // NULL pointers are only valid if the length is zero
+ *       size_t length = (string == NULL) ? 0 : strlen(string);
+ *       return XXH32(string, length, seed);
+ *   }
+ * @endcode
+ *
+ *
+ * @anchor streaming_example
+ * **Streaming**
+ *
+ * These groups of functions allow incremental hashing of unknown size, even
+ * more than what would fit in a size_t.
+ *
+ * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset()
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include <assert.h>
+ *   #include "xxhash.h"
+ *   // Example for a function which hashes a FILE incrementally with XXH3_64bits().
+ *   XXH64_hash_t hashFile(FILE* f)
+ *   {
+ *       // Allocate a state struct. Do not just use malloc() or new.
+ *       XXH3_state_t* state = XXH3_createState();
+ *       assert(state != NULL && "Out of memory!");
+ *       // Reset the state to start a new hashing session.
+ *       XXH3_64bits_reset(state);
+ *       char buffer[4096];
+ *       size_t count;
+ *       // Read the file in chunks
+ *       while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) {
+ *           // Run update() as many times as necessary to process the data
+ *           XXH3_64bits_update(state, buffer, count);
+ *       }
+ *       // Retrieve the finalized hash. This will not change the state.
+ *       XXH64_hash_t result = XXH3_64bits_digest(state);
+ *       // Free the state. Do not use free().
+ *       XXH3_freeState(state);
+ *       return result;
+ *   }
+ * @endcode
+ *
+ * Streaming functions generate the xxHash value from an incremental input.
+ * This method is slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * An XXH state must first be allocated using `XXH*_createState()`.
+ *
+ * Start a new hash by initializing the state with a seed using `XXH*_reset()`.
+ *
+ * Then, feed the hash state by calling `XXH*_update()` as many times as necessary.
+ *
+ * The function returns an error code, with 0 meaning OK, and any other value
+ * meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using `XXH*_digest()`.
+ * This function returns the nn-bits hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a
+ * digest, and generate new hash values later on by invoking `XXH*_digest()`.
+ *
+ * When done, release the state using `XXH*_freeState()`.
+ *
+ *
+ * @anchor canonical_representation_example
+ * **Canonical Representation**
+ *
+ * The default return values from XXH functions are unsigned 32, 64 and 128 bit
+ * integers.
+ * This the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ *
+ * XXH32_canonicalFromHash(), XXH32_hashFromCanonical(),
+ * XXH64_canonicalFromHash(), XXH64_hashFromCanonical(),
+ * XXH128_canonicalFromHash(), XXH128_hashFromCanonical(),
+ *
+ * @code{.c}
+ *   #include <stdio.h>
+ *   #include "xxhash.h"
+ *
+ *   // Example for a function which prints XXH32_hash_t in human readable format
+ *   void printXxh32(XXH32_hash_t hash)
+ *   {
+ *       XXH32_canonical_t cano;
+ *       XXH32_canonicalFromHash(&cano, hash);
+ *       size_t i;
+ *       for(i = 0; i < sizeof(cano.digest); ++i) {
+ *           printf("%02x", cano.digest[i]);
+ *       }
+ *       printf("\n");
+ *   }
+ *
+ *   // Example for a function which converts XXH32_canonical_t to XXH32_hash_t
+ *   XXH32_hash_t convertCanonicalToXxh32(XXH32_canonical_t cano)
+ *   {
+ *       XXH32_hash_t hash = XXH32_hashFromCanonical(&cano);
+ *       return hash;
+ *   }
+ * @endcode
+ *
+ *
+ * @file xxhash.h
+ * xxHash prototypes and implementation
+ */
+
+#if defined(__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD)
+extern "C" {
+#endif
+
+/* ****************************
+ *  INLINE mode
+ ******************************/
+/*!
+ * @defgroup public Public API
+ * Contains details on the public xxHash functions.
+ * @{
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Gives access to internal state declaration, required for static allocation.
+ *
+ * Incompatible with dynamic linking, due to risks of ABI changes.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_STATIC_LINKING_ONLY
+/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */
+
+/*!
+ * @brief Gives access to internal definitions.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_STATIC_LINKING_ONLY
+ *     #define XXH_IMPLEMENTATION
+ *     #include "xxhash.h"
+ * @endcode
+ */
+#  define XXH_IMPLEMENTATION
+/* Do not undef XXH_IMPLEMENTATION for Doxygen */
+
+/*!
+ * @brief Exposes the implementation and marks all functions as `inline`.
+ *
+ * Use these build macros to inline xxhash into the target unit.
+ * Inlining improves performance on small inputs, especially when the length is
+ * expressed as a compile-time constant:
+ *
+ *  https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html
+ *
+ * It also keeps xxHash symbols private to the unit, so they are not exported.
+ *
+ * Usage:
+ * @code{.c}
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * @endcode
+ * Do not compile and link xxhash.o as a separate object, as it is not useful.
+ */
+#  define XXH_INLINE_ALL
+#  undef XXH_INLINE_ALL
+/*!
+ * @brief Exposes the implementation without marking functions as inline.
+ */
+#  define XXH_PRIVATE_API
+#  undef XXH_PRIVATE_API
+/*!
+ * @brief Emulate a namespace by transparently prefixing all symbols.
+ *
+ * If you want to include _and expose_ xxHash functions from within your own
+ * library, but also want to avoid symbol collisions with other libraries which
+ * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix
+ * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE
+ * (therefore, avoid empty or numeric values).
+ *
+ * Note that no change is required within the calling program as long as it
+ * includes `xxhash.h`: Regular symbol names will be automatically translated
+ * by this header.
+ */
+#  define XXH_NAMESPACE /* YOUR NAME HERE */
+#  undef XXH_NAMESPACE
+#endif
+
+#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \
+    && !defined(XXH_INLINE_ALL_31684351384)
+   /* this section should be traversed only once */
+#  define XXH_INLINE_ALL_31684351384
+   /* give access to the advanced API, required to compile implementations */
+#  undef XXH_STATIC_LINKING_ONLY   /* avoid macro redef */
+#  define XXH_STATIC_LINKING_ONLY
+   /* make all functions private */
+#  undef XXH_PUBLIC_API
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((__unused__))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* note: this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+
+   /*
+    * This part deals with the special case where a unit wants to inline xxHash,
+    * but "xxhash.h" has previously been included without XXH_INLINE_ALL,
+    * such as part of some previously included *.h header file.
+    * Without further action, the new include would just be ignored,
+    * and functions would effectively _not_ be inlined (silent failure).
+    * The following macros solve this situation by prefixing all inlined names,
+    * avoiding naming collision with previous inclusions.
+    */
+   /* Before that, we unconditionally #undef all symbols,
+    * in case they were already defined with XXH_NAMESPACE.
+    * They will then be redefined for XXH_INLINE_ALL
+    */
+#  undef XXH_versionNumber
+    /* XXH32 */
+#  undef XXH32
+#  undef XXH32_createState
+#  undef XXH32_freeState
+#  undef XXH32_reset
+#  undef XXH32_update
+#  undef XXH32_digest
+#  undef XXH32_copyState
+#  undef XXH32_canonicalFromHash
+#  undef XXH32_hashFromCanonical
+    /* XXH64 */
+#  undef XXH64
+#  undef XXH64_createState
+#  undef XXH64_freeState
+#  undef XXH64_reset
+#  undef XXH64_update
+#  undef XXH64_digest
+#  undef XXH64_copyState
+#  undef XXH64_canonicalFromHash
+#  undef XXH64_hashFromCanonical
+    /* XXH3_64bits */
+#  undef XXH3_64bits
+#  undef XXH3_64bits_withSecret
+#  undef XXH3_64bits_withSeed
+#  undef XXH3_64bits_withSecretandSeed
+#  undef XXH3_createState
+#  undef XXH3_freeState
+#  undef XXH3_copyState
+#  undef XXH3_64bits_reset
+#  undef XXH3_64bits_reset_withSeed
+#  undef XXH3_64bits_reset_withSecret
+#  undef XXH3_64bits_update
+#  undef XXH3_64bits_digest
+#  undef XXH3_generateSecret
+    /* XXH3_128bits */
+#  undef XXH128
+#  undef XXH3_128bits
+#  undef XXH3_128bits_withSeed
+#  undef XXH3_128bits_withSecret
+#  undef XXH3_128bits_reset
+#  undef XXH3_128bits_reset_withSeed
+#  undef XXH3_128bits_reset_withSecret
+#  undef XXH3_128bits_reset_withSecretandSeed
+#  undef XXH3_128bits_update
+#  undef XXH3_128bits_digest
+#  undef XXH128_isEqual
+#  undef XXH128_cmp
+#  undef XXH128_canonicalFromHash
+#  undef XXH128_hashFromCanonical
+    /* Finally, free the namespace itself */
+#  undef XXH_NAMESPACE
+
+    /* employ the namespace for XXH_INLINE_ALL */
+#  define XXH_NAMESPACE XXH_INLINE_
+   /*
+    * Some identifiers (enums, type names) are not symbols,
+    * but they must nonetheless be renamed to avoid redeclaration.
+    * Alternative solution: do not redeclare them.
+    * However, this requires some #ifdefs, and has a more dispersed impact.
+    * Meanwhile, renaming can be achieved in a single place.
+    */
+#  define XXH_IPREF(Id)   XXH_NAMESPACE ## Id
+#  define XXH_OK XXH_IPREF(XXH_OK)
+#  define XXH_ERROR XXH_IPREF(XXH_ERROR)
+#  define XXH_errorcode XXH_IPREF(XXH_errorcode)
+#  define XXH32_canonical_t  XXH_IPREF(XXH32_canonical_t)
+#  define XXH64_canonical_t  XXH_IPREF(XXH64_canonical_t)
+#  define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t)
+#  define XXH32_state_s XXH_IPREF(XXH32_state_s)
+#  define XXH32_state_t XXH_IPREF(XXH32_state_t)
+#  define XXH64_state_s XXH_IPREF(XXH64_state_s)
+#  define XXH64_state_t XXH_IPREF(XXH64_state_t)
+#  define XXH3_state_s  XXH_IPREF(XXH3_state_s)
+#  define XXH3_state_t  XXH_IPREF(XXH3_state_t)
+#  define XXH128_hash_t XXH_IPREF(XXH128_hash_t)
+   /* Ensure the header is parsed again, even if it was previously included */
+#  undef XXHASH_H_5627135585666179
+#  undef XXHASH_H_STATIC_13879238742
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/* ****************************************************************
+ *  Stable API
+ *****************************************************************/
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+/*! @brief Marks a global symbol. */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+/* XXH32 */
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+/* XXH64 */
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+/* XXH3_64bits */
+#  define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits)
+#  define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret)
+#  define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed)
+#  define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed)
+#  define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState)
+#  define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState)
+#  define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState)
+#  define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset)
+#  define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed)
+#  define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret)
+#  define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed)
+#  define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update)
+#  define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest)
+#  define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret)
+#  define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed)
+/* XXH3_128bits */
+#  define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128)
+#  define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits)
+#  define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed)
+#  define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret)
+#  define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed)
+#  define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset)
+#  define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed)
+#  define XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret)
+#  define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed)
+#  define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update)
+#  define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest)
+#  define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual)
+#  define XXH128_cmp     XXH_NAME2(XXH_NAMESPACE, XXH128_cmp)
+#  define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash)
+#  define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Compiler specifics
+***************************************/
+
+/* specific declaration modes for Windows */
+#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API)
+#  if defined(_WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT))
+#    ifdef XXH_EXPORT
+#      define XXH_PUBLIC_API __declspec(dllexport)
+#    elif XXH_IMPORT
+#      define XXH_PUBLIC_API __declspec(dllimport)
+#    endif
+#  else
+#    define XXH_PUBLIC_API   /* do nothing */
+#  endif
+#endif
+
+#if defined (__GNUC__)
+# define XXH_CONSTF  __attribute__((__const__))
+# define XXH_PUREF   __attribute__((__pure__))
+# define XXH_MALLOCF __attribute__((__malloc__))
+#else
+# define XXH_CONSTF  /* disable */
+# define XXH_PUREF
+# define XXH_MALLOCF
+#endif
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    8
+#define XXH_VERSION_RELEASE  3
+/*! @brief Version number, encoded as two digits each */
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+
+/*!
+ * @brief Obtains the xxHash version.
+ *
+ * This is mostly useful when xxHash is compiled as a shared library,
+ * since the returned value comes from the library, as opposed to header file.
+ *
+ * @return @ref XXH_VERSION_NUMBER of the invoked library.
+ */
+XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+*  Common basic types
+******************************/
+#include <stddef.h>   /* size_t */
+/*!
+ * @brief Exit code for the streaming API.
+ */
+typedef enum {
+    XXH_OK = 0, /*!< OK */
+    XXH_ERROR   /*!< Error */
+} XXH_errorcode;
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */
+/*!
+ * @brief An unsigned 32-bit integer.
+ *
+ * Not necessarily defined to `uint32_t` but functionally equivalent.
+ */
+typedef uint32_t XXH32_hash_t;
+
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+    typedef uint32_t XXH32_hash_t;
+
+#else
+#   include <limits.h>
+#   if UINT_MAX == 0xFFFFFFFFUL
+      typedef unsigned int XXH32_hash_t;
+#   elif ULONG_MAX == 0xFFFFFFFFUL
+      typedef unsigned long XXH32_hash_t;
+#   else
+#     error "unsupported platform: need a 32-bit type"
+#   endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH32_family XXH32 family
+ * @ingroup public
+ * Contains functions used in the classic 32-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH32 is useful for older platforms, with no or poor 64-bit performance.
+ *   Note that the @ref XXH3_family provides competitive speed for both 32-bit
+ *   and 64-bit systems, and offers true 64/128 bit hash results.
+ *
+ * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families
+ * @see @ref XXH32_impl for implementation details
+ * @{
+ */
+
+/*!
+ * @brief Calculates the 32-bit hash of @p input using xxHash32.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 32-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 32-bit xxHash32 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @typedef struct XXH32_state_s XXH32_state_t
+ * @brief The opaque state struct for the XXH32 streaming API.
+ *
+ * @see XXH32_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH32_state_s XXH32_state_t;
+
+/*!
+ * @brief Allocates an @ref XXH32_state_t.
+ *
+ * @return An allocated pointer of @ref XXH32_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH32_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void);
+/*!
+ * @brief Frees an @ref XXH32_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH32_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+/*!
+ * @brief Copies one @ref XXH32_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH32_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 32-bit xxHash32 value from that state.
+ *
+ * @note
+ *   Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst  The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*! @cond Doxygen ignores this part */
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23   : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((__noescape__))
+#else
+# define XXH_NOESCAPE
+#endif
+/*! @endcond */
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+   typedef uint64_t XXH64_hash_t;
+#else
+#  include <limits.h>
+#  if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+     /* LP64 ABI says uint64_t is unsigned long */
+     typedef unsigned long XXH64_hash_t;
+#  else
+     /* the following type must have a width of 64-bit */
+     typedef unsigned long long XXH64_hash_t;
+#  endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ *   XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ *   and offers true 64/128 bit hash results.
+ *   It provides better speed for systems with vector processing capabilities.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit xxHash64 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief The opaque state struct for the XXH64 streaming API.
+ *
+ * @see XXH64_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+
+/*!
+ * @brief Allocates an @ref XXH64_state_t.
+ *
+ * @return An allocated pointer of @ref XXH64_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH64_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void);
+
+/*!
+ * @brief Frees an @ref XXH64_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note @p statePtr must be allocated with XXH64_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH64_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH64_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note This function resets and seeds a state. Call it before @ref XXH64_update().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH64_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated 64-bit xxHash64 value from that state.
+ *
+ * @note
+ *   Calling XXH64_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+/*******   Canonical representation   *******/
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH64_hash_t.
+ */
+typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t.
+ *
+ * @param dst The @ref XXH64_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH64_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t.
+ *
+ * @param src The @ref XXH64_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ *
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src);
+
+#ifndef XXH_NO_XXH3
+
+/*!
+ * @}
+ * ************************************************************************
+ * @defgroup XXH3_family XXH3 family
+ * @ingroup public
+ * @{
+ *
+ * XXH3 is a more recent hash algorithm featuring:
+ *  - Improved speed for both small and large inputs
+ *  - True 64-bit and 128-bit outputs
+ *  - SIMD acceleration
+ *  - Improved 32-bit viability
+ *
+ * Speed analysis methodology is explained here:
+ *
+ *    https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html
+ *
+ * Compared to XXH64, expect XXH3 to run approximately
+ * ~2x faster on large inputs and >3x faster on small ones,
+ * exact differences vary depending on platform.
+ *
+ * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic,
+ * but does not require it.
+ * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3
+ * at competitive speeds, even without vector support. Further details are
+ * explained in the implementation.
+ *
+ * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD
+ * implementations for many common platforms:
+ *   - AVX512
+ *   - AVX2
+ *   - SSE2
+ *   - ARM NEON
+ *   - WebAssembly SIMD128
+ *   - POWER8 VSX
+ *   - s390x ZVector
+ * This can be controlled via the @ref XXH_VECTOR macro, but it automatically
+ * selects the best version according to predefined macros. For the x86 family, an
+ * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c.
+ *
+ * XXH3 implementation is portable:
+ * it has a generic C90 formulation that can be compiled on any platform,
+ * all implementations generate exactly the same hash value on all platforms.
+ * Starting from v0.8.0, it's also labelled "stable", meaning that
+ * any future version will also generate the same hash value.
+ *
+ * XXH3 offers 2 variants, _64bits and _128bits.
+ *
+ * When only 64 bits are needed, prefer invoking the _64bits variant, as it
+ * reduces the amount of mixing, resulting in faster speed on small inputs.
+ * It's also generally simpler to manipulate a scalar return type than a struct.
+ *
+ * The API supports one-shot hashing, streaming mode, and custom secrets.
+ */
+
+/*!
+ * @ingroup tuning
+ * @brief Possible values for @ref XXH_VECTOR.
+ *
+ * Unless set explicitly, determined automatically.
+ */
+#  define XXH_SCALAR 0 /*!< Portable scalar version */
+#  define XXH_SSE2   1 /*!< SSE2 for Pentium 4, Opteron, all x86_64. */
+#  define XXH_AVX2   2 /*!< AVX2 for Haswell and Bulldozer */
+#  define XXH_AVX512 3 /*!< AVX512 for Skylake and Icelake */
+#  define XXH_NEON   4 /*!< NEON for most ARMv7-A, all AArch64, and WASM SIMD128 */
+#  define XXH_VSX    5 /*!< VSX and ZVector for POWER8/z13 (64-bit) */
+#  define XXH_SVE    6 /*!< SVE for some ARMv8-A and ARMv9-A */
+#  define XXH_LSX    7 /*!< LSX (128-bit SIMD) for LoongArch64 */
+#  define XXH_LASX   8 /*!< LASX (256-bit SIMD) for LoongArch64 */
+#  define XXH_RVV    9 /*!< RVV (RISC-V Vector) for RISC-V */
+
+/*-**********************************************************************
+*  XXH3 64-bit variant
+************************************************************************/
+
+/*!
+ * @brief Calculates 64-bit unseeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *   This is equivalent to @ref XXH3_64bits_withSeed() with a seed of `0`, however
+ *   it may have slightly better performance due to constant propagation of the
+ *   defaults.
+ *
+ * @see
+ *    XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Calculates 64-bit seeded variant of XXH3 hash of @p input.
+ *
+ * @param input  The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ * @param seed   The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed);
+
+/*!
+ * The bare minimum size for a custom secret.
+ *
+ * @see
+ *  XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(),
+ *  XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret().
+ */
+#define XXH3_SECRET_SIZE_MIN 136
+
+/*!
+ * @brief Calculates 64-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 64-bit XXH3 hash value.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ */
+
+/*!
+ * @brief The opaque state struct for the XXH3 streaming API.
+ *
+ * @see XXH3_state_s for details.
+ * @see @ref streaming_example "Streaming Example"
+ */
+typedef struct XXH3_state_s XXH3_state_t;
+XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr);
+
+/*!
+ * @brief Copies one @ref XXH3_state_t to another.
+ *
+ * @param dst_state The state to copy to.
+ * @param src_state The state to copy from.
+ * @pre
+ *   @p dst_state and @p src_state must not be `NULL` and must not overlap.
+ */
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret with default parameters.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret from `seed`.
+ *   - Call this function before @ref XXH3_64bits_update().
+ *   - Digest will be equivalent to `XXH3_64bits_withSeed()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   `secret` is referenced, it _must outlive_ the hash streaming session.
+ *
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ * @pre
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note Call this to incrementally consume blocks of data.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 64-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+*  XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;   /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64;  /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Calculates 128-bit unseeded variant of XXH3 of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p length bytes in size.
+ * @param len  The length of @p data, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of `0`, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data The block of data to be hashed, at least @p length bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash result predictably.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * @note
+ *    seed == 0 produces the same results as @ref XXH3_64bits().
+ *
+ * This variant generates a custom secret on the fly based on default secret
+ * altered using the @p seed value.
+ *
+ * While this operation is decently fast, note that it's not completely free.
+ *
+ * @see XXH3_128bits(), XXH3_128bits_withSecret(): other seeding variants
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*!
+ * @brief Calculates 128-bit variant of XXH3 with a custom "secret".
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @return The calculated 128-bit variant of XXH3 value.
+ *
+ * It's possible to provide any blob of bytes as a "secret" to generate the hash.
+ * This makes it more difficult for an external actor to prepare an intentional collision.
+ * The main condition is that @p secretSize *must* be large enough (>= @ref XXH3_SECRET_SIZE_MIN).
+ * However, the quality of the secret impacts the dispersion of the hash algorithm.
+ * Therefore, the secret _must_ look like a bunch of random bytes.
+ * Avoid "trivial" or structured data such as repeated sequences or a text document.
+ * Whenever in doubt about the "randomness" of the blob of bytes,
+ * consider employing @ref XXH3_generateSecret() instead (see below).
+ * It will generate a proper high entropy secret derived from the blob of bytes.
+ * Another advantage of using XXH3_generateSecret() is that
+ * it guarantees that all bits within the initial blob of bytes
+ * will impact every bit of the output.
+ * This is not necessarily the case when using the blob of bytes directly
+ * because, when hashing _small_ inputs, only a portion of the secret is employed.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*******   Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have same meaning as their 64-bit counterpart.
+ */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret with default parameters.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed     The 64-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   - This function resets `statePtr` and generate a secret from `seed`.
+ *   - Call it before @ref XXH3_128bits_update().
+ *   - Digest will be equivalent to `XXH3_128bits_withSeed()`.
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   The state struct to reset.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * `secret` is referenced, it _must outlive_ the hash streaming session.
+ * Similar to one-shot API, `secretSize` must be >= @ref XXH3_SECRET_SIZE_MIN,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ *   @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @note
+ *   The memory between @p input and @p input + @p length must be valid,
+ *   readable, contiguous memory. However, if @p length is `0`, @p input may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ *  @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 128-bit hash value from that state.
+ *
+ * @note
+ *   Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
+ *   digest, and update again.
+ *
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* Following helper functions make it possible to compare XXH128_hast_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * @brief Check equality of two XXH128_hash_t values
+ *
+ * @param h1 The 128-bit hash value.
+ * @param h2 Another 128-bit hash value.
+ *
+ * @return `1` if `h1` and `h2` are equal.
+ * @return `0` if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ *
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @param h128_1 Left-hand side value
+ * @param h128_2 Right-hand side value
+ *
+ * @return >0 if @p h128_1  > @p h128_2
+ * @return =0 if @p h128_1 == @p h128_2
+ * @return <0 if @p h128_1  < @p h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/*******   Canonical representation   *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+
+
+/*!
+ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
+ *
+ * @param dst  The @ref XXH128_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH128_hash_t to be converted.
+ *
+ * @pre
+ *   @p dst must not be `NULL`.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
+ *
+ * @param src The @ref XXH128_canonical_t to convert.
+ *
+ * @pre
+ *   @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ * @see @ref canonical_representation_example "Canonical Representation Example"
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
+
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
+ * Never **ever** access their members directly.
+ */
+
+/*!
+ * @internal
+ * @brief Structure for XXH32 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH32_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH64_state_s, XXH3_state_s
+ */
+struct XXH32_state_s {
+   XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */
+   XXH32_hash_t large_len;    /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */
+   XXH32_hash_t acc[4];       /*!< Accumulator lanes */
+   unsigned char buffer[16];  /*!< Internal buffer for partial reads. */
+   XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */
+   XXH32_hash_t reserved;     /*!< Reserved field. Do not read nor write to it. */
+};   /* typedef'd to XXH32_state_t */
+
+
+#ifndef XXH_NO_LONG_LONG  /* defined when there is no 64-bit support */
+
+/*!
+ * @internal
+ * @brief Structure for XXH64 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is
+ * an opaque type. This allows fields to safely be changed.
+ *
+ * Typedef'd to @ref XXH64_state_t.
+ * Do not access the members of this struct directly.
+ * @see XXH32_state_s, XXH3_state_s
+ */
+struct XXH64_state_s {
+   XXH64_hash_t total_len;    /*!< Total length hashed. This is always 64-bit. */
+   XXH64_hash_t acc[4];       /*!< Accumulator lanes */
+   unsigned char buffer[32];  /*!< Internal buffer for partial reads.. */
+   XXH32_hash_t bufferedSize; /*!< Amount of data in @ref buffer */
+   XXH32_hash_t reserved32;   /*!< Reserved field, needed for padding anyways*/
+   XXH64_hash_t reserved64;   /*!< Reserved field. Do not read or write to it. */
+};   /* typedef'd to XXH64_state_t */
+
+#ifndef XXH_NO_XXH3
+
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */
+#  define XXH_ALIGN(n)      _Alignas(n)
+#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */
+/* In C++ alignas() is a keyword */
+#  define XXH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define XXH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define XXH_ALIGN(n)      __declspec(align(n))
+#else
+#  define XXH_ALIGN(n)   /* disabled */
+#endif
+
+/* Old GCC versions only accept the attribute after the type in structures. */
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L))   /* C11+ */ \
+    && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \
+    && defined(__GNUC__)
+#   define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align)
+#else
+#   define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type
+#endif
+
+/*!
+ * @internal
+ * @brief The size of the internal XXH3 buffer.
+ *
+ * This is the optimal update size for incremental hashing.
+ *
+ * @see XXH3_64b_update(), XXH3_128b_update().
+ */
+#define XXH3_INTERNALBUFFER_SIZE 256
+
+/*!
+ * @def XXH3_SECRET_DEFAULT_SIZE
+ * @brief Default Secret's size
+ *
+ * This is the size of internal XXH3_kSecret
+ * and is needed by XXH3_generateSecret_fromSeed().
+ *
+ * Not to be confused with @ref XXH3_SECRET_SIZE_MIN.
+ */
+#define XXH3_SECRET_DEFAULT_SIZE 192
+
+/*!
+ * @internal
+ * @brief Structure for XXH3 streaming API.
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Do never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+   XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+       /*!< The 8 accumulators. See @ref XXH32_state_s::acc and @ref XXH64_state_s::acc */
+   XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+       /*!< Used to store a custom secret generated from a seed. */
+   XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+       /*!< The internal buffer. @see XXH32_state_s::mem32 */
+   XXH32_hash_t bufferedSize;
+       /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+   XXH32_hash_t useSeed;
+       /*!< Reserved field. Needed for padding on 64-bit. */
+   size_t nbStripesSoFar;
+       /*!< Number or stripes processed. */
+   XXH64_hash_t totalLen;
+       /*!< Total length hashed. 64-bit even on 32-bit targets. */
+   size_t nbStripesPerBlock;
+       /*!< Number of stripes per block. */
+   size_t secretLimit;
+       /*!< Size of @ref customSecret or @ref extSecret */
+   XXH64_hash_t seed;
+       /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+   XXH64_hash_t reserved64;
+       /*!< Reserved field. */
+   const unsigned char* extSecret;
+       /*!< Reference to an external secret for the _withSecret variants, NULL
+        *   for other variants. */
+   /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do {                                                     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0)
+
+
+/*!
+ * @brief Calculates the 128-bit hash of @p data using XXH3.
+ *
+ * @param data The block of data to be hashed, at least @p len bytes in size.
+ * @param len  The length of @p data, in bytes.
+ * @param seed The 64-bit seed to alter the hash's output predictably.
+ *
+ * @pre
+ *   The memory between @p data and @p data + @p len must be valid,
+ *   readable, contiguous memory. However, if @p len is `0`, @p data may be
+ *   `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return The calculated 128-bit XXH3 value.
+ *
+ * @see @ref single_shot_example "Single Shot Example" for an example.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* ===   Experimental API   === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * @brief Derive a high-entropy secret from any user-defined content, named customSeed.
+ *
+ * @param secretBuffer    A writable buffer for derived high-entropy secret data.
+ * @param secretSize      Size of secretBuffer, in bytes.  Must be >= XXH3_SECRET_SIZE_MIN.
+ * @param customSeed      A user-defined content.
+ * @param customSeedSize  Size of customSeed, in bytes.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argv != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_DEFAULT_SIZE bytes
+ * @param seed         The 64-bit seed to alter the hash result predictably.
+ *
+ * The generated secret can be used in combination with
+ *`*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_DEFAULT_SIZE];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, seed);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
+ * @brief Maximum size of "short" key in bytes.
+ */
+#define XXH3_MIDSIZE_MAX 240
+
+/*!
+ * @brief Calculates 64/128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param data       The block of data to be hashed, at least @p len bytes in size.
+ * @param len        The length of @p data, in bytes.
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed       The 64-bit seed to alter the hash result predictably.
+ *
+ * These variants generate hash values using either:
+ * - @p seed for "short" keys (< @ref XXH3_MIDSIZE_MAX = 240 bytes)
+ * - @p secret for "large" keys (>= @ref XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, _withSecretandSeed variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact to the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+
+/*!
+ * @brief Calculates 128-bit seeded variant of XXH3 hash of @p data.
+ *
+ * @param input      The memory segment to be hashed, at least @p len bytes in size.
+ * @param length     The length of @p data, in bytes.
+ * @param secret     The secret used to alter hash result predictably.
+ * @param secretSize The length of @p secret, in bytes (must be >= XXH3_SECRET_SIZE_MIN)
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(): contract is the same.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+
+#ifndef XXH_NO_STREAM
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with secret data to begin a new hash.
+ *
+ * @param statePtr   A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @param secret     The secret data.
+ * @param secretSize The length of @p secret, in bytes.
+ * @param seed64     The 64-bit seed to alter the hash result predictably.
+ *
+ * @return @ref XXH_OK on success.
+ * @return @ref XXH_ERROR on failure.
+ *
+ * @see XXH3_64bits_withSecretandSeed(). Contract is identical.
+ *
+ * Note: there was a bug in an earlier version of this function (<= v0.8.2)
+ * that would make it generate an incorrect hash value
+ * when @p seed == 0 and @p length < XXH3_MIDSIZE_MAX
+ * and @p secret is different from XXH3_generateSecret_fromSeed().
+ * As stated in the contract, the correct hash result must be
+ * the same as XXH3_128bits_withSeed() when @p length <= XXH3_MIDSIZE_MAX.
+ * Results generated by this older version are wrong, hence not comparable.
+ */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+
+#endif /* !XXH_NO_STREAM */
+
+#endif  /* !XXH_NO_XXH3 */
+#endif  /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  define XXH_IMPLEMENTATION
+#endif
+
+#endif  /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in /include directory.
+ *
+ * xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+#  define XXH_IMPLEM_13a8737387
+
+/* *************************************
+*  Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if only using the @ref XXH32_family and you have a strict C90 compiler.
+ */
+#  define XXH_NO_LONG_LONG
+#  undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The below switch allow selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *   @par
+ *     Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *     eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *   @par
+ *     Depends on compiler extensions and is therefore not portable.
+ *     This method is safe _if_ your compiler supports it,
+ *     and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *     Casts directly and dereferences. This method doesn't depend on the
+ *     compiler, but it violates the C standard as it directly dereferences an
+ *     unaligned pointer. It can generate buggy code on targets which do not
+ *     support unaligned memory accesses, but in some circumstances, it's the
+ *     only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *     Also portable. This can generate the best code on old compilers which don't
+ *     inline small `memcpy()` calls, and it might also be faster on big-endian
+ *     systems which lack a native byteswap instruction. However, some compilers
+ *     will emit literal byteshifts even if the target supports unaligned access.
+ *
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2)
+ */
+#  define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage to forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance.
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0-2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single shot functions just use the
+ *    streaming API.
+ */
+#  define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * Align check is automatically disabled on x86, x64, ARM64, and some ARM chips
+ * which are platforms known to offer good unaligned memory accesses performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+#  define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control on whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+#  define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0.
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inline on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+#  define XXH3_INLINE_SECRET 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally preferable for performance,
+ * but depending on exact architecture, a jmp may be preferable.
+ *
+ * This setting is only possibly making a difference for very small inputs.
+ */
+#  define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+#  define XXH_OLD_NAMES
+#  undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * the streaming functions can improve code size significantly, especially with
+ * the @ref XXH3_family which tends to make constant folded copies of itself.
+ */
+#  define XXH_NO_STREAM
+#  undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* prefer __packed__ structures (method 1) for GCC
+    * < ARMv7 with unaligned access (e.g. Raspbian armhf) still uses byte shifting, so we use memcpy
+    * which for some reason does unaligned loads. */
+#  if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#    define XXH_FORCE_MEMORY_ACCESS 1
+#  endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 for -Os or -Oz */
+#  if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#    define XXH_SIZE_OPT 1
+#  else
+#    define XXH_SIZE_OPT 0
+#  endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+#  if XXH_SIZE_OPT >= 1 || \
+      defined(__i386)  || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+   || defined(_M_IX86) || defined(_M_X64)     || defined(_M_ARM64)    || defined(_M_ARM) /* visual */
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+#  if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+#    define XXH_NO_INLINE_HINTS 1
+#  else
+#    define XXH_NO_INLINE_HINTS 0
+#  endif
+#endif
+
+#ifndef XXH3_INLINE_SECRET
+#  if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)
+#    define XXH3_INLINE_SECRET 0
+#  else
+#    define XXH3_INLINE_SECRET 1
+#  endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+#  define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+*  Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoked malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL.
+ * This flag is useful in situations where
+ * xxhash.h is integrated into some kernel, embedded or limited environment
+ * without access to dynamic allocation.
+ */
+
+static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; }
+static void XXH_free(void* p) { (void)p; }
+
+#else
+
+/*
+ * Modify the local functions below should you wish to use
+ * different memory routines for malloc() and free()
+ */
+#include <stdlib.h>
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than malloc().
+ */
+static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); }
+
+/*!
+ * @internal
+ * @brief Modify this function to use a different routine than free().
+ */
+static void XXH_free(void* p) { free(p); }
+
+#endif  /* XXH_NO_STDLIB */
+
+#ifndef XXH_memcpy
+/*!
+ * @internal
+ * @brief XXH_memcpy() macro can be redirected at compile time
+ */
+#  include <string.h>
+#  define XXH_memcpy memcpy
+#endif
+
+#ifndef XXH_memset
+/*!
+ * @internal
+ * @brief XXH_memset() macro can be redirected at compile time
+ */
+#  include <string.h>
+#  define XXH_memset memset
+#endif
+
+#ifndef XXH_memcmp
+/*!
+ * @internal
+ * @brief XXH_memcmp() macro can be redirected at compile time
+ * Note: only needed by XXH128.
+ */
+#  include <string.h>
+#  define XXH_memcmp memcmp
+#endif
+
+
+
+#include <limits.h>   /* ULLONG_MAX */
+
+
+/* *************************************
+*  Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio warning fix */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+#if XXH_NO_INLINE_HINTS  /* disable inlining hints */
+#  if defined(__GNUC__) || defined(__clang__)
+#    define XXH_FORCE_INLINE static __attribute__((__unused__))
+#  else
+#    define XXH_FORCE_INLINE static
+#  endif
+#  define XXH_NO_INLINE static
+/* enable inlining hints */
+#elif defined(__GNUC__) || defined(__clang__)
+#  define XXH_FORCE_INLINE static __inline__ __attribute__((__always_inline__, __unused__))
+#  define XXH_NO_INLINE static __attribute__((__noinline__))
+#elif defined(_MSC_VER)  /* Visual Studio */
+#  define XXH_FORCE_INLINE static __forceinline
+#  define XXH_NO_INLINE static __declspec(noinline)
+#elif defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L))   /* C99 */
+#  define XXH_FORCE_INLINE static inline
+#  define XXH_NO_INLINE static
+#else
+#  define XXH_FORCE_INLINE static
+#  define XXH_NO_INLINE static
+#endif
+
+#if defined(XXH_INLINE_ALL)
+#  define XXH_STATIC XXH_FORCE_INLINE
+#else
+#  define XXH_STATIC static
+#endif
+
+#if XXH3_INLINE_SECRET
+#  define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE
+#else
+#  define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE
+#endif
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define XXH_RESTRICT   /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define XXH_RESTRICT   restrict
+#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
+   || (defined (__clang__)) \
+   || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \
+   || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300))
+/*
+ * There are a LOT more compilers that recognize __restrict but this
+ * covers the major ones.
+ */
+#  define XXH_RESTRICT   __restrict
+#else
+#  define XXH_RESTRICT   /* disable */
+#endif
+
+/* *************************************
+*  Debug
+***************************************/
+/*!
+ * @ingroup tuning
+ * @def XXH_DEBUGLEVEL
+ * @brief Sets the debugging level.
+ *
+ * XXH_DEBUGLEVEL is expected to be defined externally, typically via the
+ * compiler's command line options. The value must be a number.
+ */
+#ifndef XXH_DEBUGLEVEL
+#  ifdef DEBUGLEVEL /* backwards compat */
+#    define XXH_DEBUGLEVEL DEBUGLEVEL
+#  else
+#    define XXH_DEBUGLEVEL 0
+#  endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+#  include <assert.h>   /* note: can still be disabled with NDEBUG */
+#  define XXH_ASSERT(c)   assert(c)
+#else
+#  if defined(__INTEL_COMPILER)
+#    define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
+#  else
+#    define XXH_ASSERT(c)   XXH_ASSUME(c)
+#  endif
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+#  elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+#  else
+#    define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+#  endif
+#  define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (eg eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+#else
+#  define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* Specifically for NEON vectors which use the "w" constraint, on
+ * Clang. */
+#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
+#else
+#  define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
+#endif
+
+/* *************************************
+*  Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   ifdef _AIX
+#     include <inttypes.h>
+#   else
+#     include <stdint.h>
+#   endif
+    typedef uint8_t xxh_u8;
+#else
+    typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+#  warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+#  define BYTE xxh_u8
+#  define U8   xxh_u8
+#  define U32  xxh_u32
+#endif
+
+/* ***   Memory access   *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4 byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPU which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((__packed__)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((__aligned__(1))) __attribute__((__may_alias__)) xxh_u32 xxh_unalign32;
+    return *((const xxh_unalign32*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ***   Endianness   *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid the nonstandard behavior
+ * in `XXH_isLittleEndian()`
+ */
+#  if defined(_WIN32) /* Windows is always little endian */ \
+     || defined(__LITTLE_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) \
+     || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_CPU_LITTLE_ENDIAN 0
+#  else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance.
+     */
+    const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 };
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
+#  endif
+#endif
+
+
+
+
+/* ****************************************
+*  Compiler-specific Functions and Macros
+******************************************/
+#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#ifdef __has_builtin
+#  define XXH_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#  define XXH_HAS_BUILTIN(x) 0
+#endif
+
+
+
+/*
+ * C23 and future versions have standard "unreachable()".
+ * Once it has been implemented reliably we can add it as an
+ * additional case:
+ *
+ * ```
+ * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 202311L)
+ * #  include <stddef.h>
+ * #  ifdef unreachable
+ * #    define XXH_UNREACHABLE() unreachable()
+ * #  endif
+ * #endif
+ * ```
+ *
+ * Note C++23 also has std::unreachable() which can be detected
+ * as follows:
+ * ```
+ * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L)
+ * #  include <utility>
+ * #  define XXH_UNREACHABLE() std::unreachable()
+ * #endif
+ * ```
+ * NB: `__cpp_lib_unreachable` is defined in the `<version>` header.
+ * We don't use that as including `<utility>` in `extern "C"` blocks
+ * doesn't work on GCC12
+ */
+
+#if XXH_HAS_BUILTIN(__builtin_unreachable)
+#  define XXH_UNREACHABLE() __builtin_unreachable()
+
+#elif defined(_MSC_VER)
+#  define XXH_UNREACHABLE() __assume(0)
+
+#else
+#  define XXH_UNREACHABLE()
+#endif
+
+#if XXH_HAS_BUILTIN(__builtin_assume)
+#  define XXH_ASSUME(c) __builtin_assume(c)
+#else
+#  define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); }
+#endif
+
+/*!
+ * @internal
+ * @def XXH_rotl32(x,r)
+ * @brief 32-bit rotate left.
+ *
+ * @param x The 32-bit integer to be rotated.
+ * @param r The number of bits to rotate.
+ * @pre
+ *   @p r > 0 && @p r < 32
+ * @note
+ *   @p x and @p r may be evaluated multiple times.
+ * @return The rotated result.
+ */
+#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \
+                               && XXH_HAS_BUILTIN(__builtin_rotateleft64)
+#  define XXH_rotl32 __builtin_rotateleft32
+#  define XXH_rotl64 __builtin_rotateleft64
+#elif XXH_HAS_BUILTIN(__builtin_stdc_rotate_left)
+#  define XXH_rotl32 __builtin_stdc_rotate_left
+#  define XXH_rotl64 __builtin_stdc_rotate_left
+/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#elif defined(_MSC_VER)
+#  define XXH_rotl32(x,r) _rotl(x,r)
+#  define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#  define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+#  define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))
+#endif
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_swap32(xxh_u32 x)
+ * @brief A 32-bit byteswap.
+ *
+ * @param x The 32-bit integer to byteswap.
+ * @return @p x, byteswapped.
+ */
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap32 _byteswap_ulong
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap32 __builtin_bswap32
+#else
+static xxh_u32 XXH_swap32 (xxh_u32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+#endif
+
+
+/* ***************************
+*  Memory reads
+*****************************/
+
+/*!
+ * @internal
+ * @brief Enum to indicate whether a pointer is aligned.
+ */
+typedef enum {
+    XXH_aligned,  /*!< Aligned */
+    XXH_unaligned /*!< Possibly unaligned */
+} XXH_alignment;
+
+/*
+ * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load.
+ *
+ * This is ideal for older compilers which don't inline memcpy.
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+*  Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+*  32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME32_1 XXH_PRIME32_1
+#  define PRIME32_2 XXH_PRIME32_2
+#  define PRIME32_3 XXH_PRIME32_3
+#  define PRIME32_4 XXH_PRIME32_4
+#  define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is used to prevent GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even when pmulld was
+     *   fastest, Sandy/Ivy Bridge, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation.
+     *
+     * - Four instructions are required to rotate,
+     *      movqda tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // x >>= 19
+     *      por    v,  tmp // x |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and with the newer cores, it is less
+     * than half the speed.
+     *
+     * Additionally, this is used on WASM SIMD128 because it JITs to the same
+     * SIMD instructions and has the same issue.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    hash ^= hash >> 15;
+    hash *= XXH_PRIME32_2;
+    hash ^= hash >> 13;
+    hash *= XXH_PRIME32_3;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+/*!
+ * @internal
+ * @brief Sets up the initial accumulator state for XXH32().
+ */
+XXH_FORCE_INLINE void
+XXH32_initAccs(xxh_u32 *acc, xxh_u32 seed)
+{
+    XXH_ASSERT(acc != NULL);
+    acc[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2;
+    acc[1] = seed + XXH_PRIME32_2;
+    acc[2] = seed + 0;
+    acc[3] = seed - XXH_PRIME32_1;
+}
+
+/*!
+ * @internal
+ * @brief Consumes a block of data for XXH32().
+ *
+ * @return the end input pointer.
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH32_consumeLong(
+    xxh_u32 *XXH_RESTRICT acc,
+    xxh_u8 const *XXH_RESTRICT input,
+    size_t len,
+    XXH_alignment align
+)
+{
+    const xxh_u8* const bEnd = input + len;
+    const xxh_u8* const limit = bEnd - 15;
+    XXH_ASSERT(acc != NULL);
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(len >= 16);
+    do {
+        acc[0] = XXH32_round(acc[0], XXH_get32bits(input)); input += 4;
+        acc[1] = XXH32_round(acc[1], XXH_get32bits(input)); input += 4;
+        acc[2] = XXH32_round(acc[2], XXH_get32bits(input)); input += 4;
+        acc[3] = XXH32_round(acc[3], XXH_get32bits(input)); input += 4;
+    } while (input < limit);
+
+    return input;
+}
+
+/*!
+ * @internal
+ * @brief Merges the accumulator lanes together for XXH32()
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_mergeAccs(const xxh_u32 *acc)
+{
+    XXH_ASSERT(acc != NULL);
+    return XXH_rotl32(acc[0], 1)  + XXH_rotl32(acc[1], 7)
+         + XXH_rotl32(acc[2], 12) + XXH_rotl32(acc[3], 18);
+}
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize().
+ */
+static XXH_PUREF xxh_u32
+XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+#define XXH_PROCESS1 do {                             \
+    hash += (*ptr++) * XXH_PRIME32_5;                 \
+    hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1;      \
+} while (0)
+
+#define XXH_PROCESS4 do {                             \
+    hash += XXH_get32bits(ptr) * XXH_PRIME32_3;       \
+    ptr += 4;                                         \
+    hash  = XXH_rotl32(hash, 17) * XXH_PRIME32_4;     \
+} while (0)
+
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+
+    /* Compact rerolled version; generally faster */
+    if (!XXH32_ENDJMP) {
+        len &= 15;
+        while (len >= 4) {
+            XXH_PROCESS4;
+            len -= 4;
+        }
+        while (len > 0) {
+            XXH_PROCESS1;
+            --len;
+        }
+        return XXH32_avalanche(hash);
+    } else {
+         switch(len&15) /* or switch(bEnd - p) */ {
+           case 12:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 8:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 4:       XXH_PROCESS4;
+                         return XXH32_avalanche(hash);
+
+           case 13:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 9:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 5:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 14:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 10:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 6:       XXH_PROCESS4;
+                         XXH_PROCESS1;
+                         XXH_PROCESS1;
+                         return XXH32_avalanche(hash);
+
+           case 15:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 11:      XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 7:       XXH_PROCESS4;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 3:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 2:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 1:       XXH_PROCESS1;
+                         XXH_FALLTHROUGH;  /* fallthrough */
+           case 0:       return XXH32_avalanche(hash);
+        }
+        XXH_ASSERT(0);
+        return hash;   /* reaching this point is deemed impossible */
+    }
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1 XXH_PROCESS1
+#  define PROCESS4 XXH_PROCESS4
+#else
+#  undef XXH_PROCESS1
+#  undef XXH_PROCESS4
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH32().
+ *
+ * @param input , len , seed Directly passed from @ref XXH32().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u32
+XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align)
+{
+    xxh_u32 h32;
+
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=16) {
+        xxh_u32 acc[4];
+        XXH32_initAccs(acc, seed);
+
+        input = XXH32_consumeLong(acc, input, len, align);
+
+        h32 = XXH32_mergeAccs(acc);
+    } else {
+        h32  = seed + XXH_PRIME32_5;
+    }
+
+    h32 += (xxh_u32)len;
+
+    return XXH32_finalize(h32, input, len&15, align);
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_state_t state;
+    XXH32_reset(&state, seed);
+    XXH32_update(&state, (const xxh_u8*)input, len);
+    return XXH32_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+#endif
+}
+
+
+
+/*******   Hash streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    XXH_memset(statePtr, 0, sizeof(*statePtr));
+    XXH32_initAccs(statePtr->acc, seed);
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH32_update(XXH32_state_t* state, const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    state->total_len_32 += (XXH32_hash_t)len;
+    state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16));
+
+    XXH_ASSERT(state->bufferedSize < sizeof(state->buffer));
+    if (len < sizeof(state->buffer) - state->bufferedSize)  {   /* fill in tmp buffer */
+        XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+        state->bufferedSize += (XXH32_hash_t)len;
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* xinput = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = xinput + len;
+
+        if (state->bufferedSize) {   /* non-empty buffer: complete first */
+            XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize);
+            xinput += sizeof(state->buffer) - state->bufferedSize;
+            /* then process one round */
+            (void)XXH32_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned);
+            state->bufferedSize = 0;
+        }
+
+        XXH_ASSERT(xinput <= bEnd);
+        if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) {
+            /* Process the remaining data */
+            xinput = XXH32_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned);
+        }
+
+        if (xinput < bEnd) {
+            /* Copy the leftover to the tmp buffer */
+            XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput));
+            state->bufferedSize = (unsigned)(bEnd-xinput);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {
+        h32 = XXH32_mergeAccs(state->acc);
+    } else {
+        h32 = state->acc[2] /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, state->buffer, state->bufferedSize, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/*******   Canonical representation   *******/
+
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+*  64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/*******   Memory access   *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+#ifdef XXH_OLD_NAMES
+#  define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((__packed__)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((__aligned__(1))) __attribute__((__may_alias__)) xxh_u64 xxh_unalign64;
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif   /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+#  define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+#  define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/*******   xxh64   *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather that static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+#  define PRIME64_1 XXH_PRIME64_1
+#  define PRIME64_2 XXH_PRIME64_2
+#  define PRIME64_3 XXH_PRIME64_3
+#  define PRIME64_4 XXH_PRIME64_4
+#  define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*! @copydoc XXH32_round */
+static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input)
+{
+    acc += input * XXH_PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= XXH_PRIME64_1;
+#if (defined(__AVX512F__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * DISABLE AUTOVECTORIZATION:
+     * A compiler fence is used to prevent GCC and Clang from
+     * autovectorizing the XXH64 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling AVX512.
+     *
+     * Autovectorization of XXH64 tends to be detrimental,
+     * though the exact outcome may change depending on exact cpu and compiler version.
+     * For information, it has been reported as detrimental for Skylake-X,
+     * but possibly beneficial for Zen4.
+     *
+     * The default is to disable auto-vectorization,
+     * but you can select to enable it instead using `XXH_ENABLE_AUTOVECTORIZE` build variable.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * XXH_PRIME64_1 + XXH_PRIME64_4;
+    return acc;
+}
+
+/*! @copydoc XXH32_avalanche */
+static xxh_u64 XXH64_avalanche(xxh_u64 hash)
+{
+    hash ^= hash >> 33;
+    hash *= XXH_PRIME64_2;
+    hash ^= hash >> 29;
+    hash *= XXH_PRIME64_3;
+    hash ^= hash >> 32;
+    return hash;
+}
+
+
+#define XXH_get64bits(p) XXH_readLE64_align(p, align)
+
+/*!
+ * @internal
+ * @brief Sets up the initial accumulator state for XXH64().
+ */
+XXH_FORCE_INLINE void
+XXH64_initAccs(xxh_u64 *acc, xxh_u64 seed)
+{
+    XXH_ASSERT(acc != NULL);
+    acc[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2;
+    acc[1] = seed + XXH_PRIME64_2;
+    acc[2] = seed + 0;
+    acc[3] = seed - XXH_PRIME64_1;
+}
+
+/*!
+ * @internal
+ * @brief Consumes a block of data for XXH64().
+ *
+ * @return the end input pointer.
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH64_consumeLong(
+    xxh_u64 *XXH_RESTRICT acc,
+    xxh_u8 const *XXH_RESTRICT input,
+    size_t len,
+    XXH_alignment align
+)
+{
+    const xxh_u8* const bEnd = input + len;
+    const xxh_u8* const limit = bEnd - 31;
+    XXH_ASSERT(acc != NULL);
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(len >= 32);
+    do {
+        /* reroll on 32-bit */
+        if (sizeof(void *) < sizeof(xxh_u64)) {
+            size_t i;
+            for (i = 0; i < 4; i++) {
+                acc[i] = XXH64_round(acc[i], XXH_get64bits(input));
+                input += 8;
+            }
+        } else {
+            acc[0] = XXH64_round(acc[0], XXH_get64bits(input)); input += 8;
+            acc[1] = XXH64_round(acc[1], XXH_get64bits(input)); input += 8;
+            acc[2] = XXH64_round(acc[2], XXH_get64bits(input)); input += 8;
+            acc[3] = XXH64_round(acc[3], XXH_get64bits(input)); input += 8;
+        }
+    } while (input < limit);
+
+    return input;
+}
+
+/*!
+ * @internal
+ * @brief Merges the accumulator lanes together for XXH64()
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_mergeAccs(const xxh_u64 *acc)
+{
+    XXH_ASSERT(acc != NULL);
+    {
+        xxh_u64 h64 = XXH_rotl64(acc[0], 1) + XXH_rotl64(acc[1], 7)
+                    + XXH_rotl64(acc[2], 12) + XXH_rotl64(acc[3], 18);
+        /* reroll on 32-bit */
+        if (sizeof(void *) < sizeof(xxh_u64)) {
+            size_t i;
+            for (i = 0; i < 4; i++) {
+                h64 = XXH64_mergeRound(h64, acc[i]);
+            }
+        } else {
+            h64 = XXH64_mergeRound(h64, acc[0]);
+            h64 = XXH64_mergeRound(h64, acc[1]);
+            h64 = XXH64_mergeRound(h64, acc[2]);
+            h64 = XXH64_mergeRound(h64, acc[3]);
+        }
+        return h64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Processes the last 0-31 bytes of @p ptr.
+ *
+ * There may be up to 31 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 32.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash
+ * @see XXH32_finalize().
+ */
+XXH_STATIC XXH_PUREF xxh_u64
+XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align)
+{
+    if (ptr==NULL) XXH_ASSERT(len == 0);
+    len &= 31;
+    while (len >= 8) {
+        xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr));
+        ptr += 8;
+        hash ^= k1;
+        hash  = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4;
+        len -= 8;
+    }
+    if (len >= 4) {
+        hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1;
+        ptr += 4;
+        hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3;
+        len -= 4;
+    }
+    while (len > 0) {
+        hash ^= (*ptr++) * XXH_PRIME64_5;
+        hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1;
+        --len;
+    }
+    return  XXH64_avalanche(hash);
+}
+
+#ifdef XXH_OLD_NAMES
+#  define PROCESS1_64 XXH_PROCESS1_64
+#  define PROCESS4_64 XXH_PROCESS4_64
+#  define PROCESS8_64 XXH_PROCESS8_64
+#else
+#  undef XXH_PROCESS1_64
+#  undef XXH_PROCESS4_64
+#  undef XXH_PROCESS8_64
+#endif
+
+/*!
+ * @internal
+ * @brief The implementation for @ref XXH64().
+ *
+ * @param input , len , seed Directly passed from @ref XXH64().
+ * @param align Whether @p input is aligned.
+ * @return The calculated hash.
+ */
+XXH_FORCE_INLINE XXH_PUREF xxh_u64
+XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align)
+{
+    xxh_u64 h64;
+    if (input==NULL) XXH_ASSERT(len == 0);
+
+    if (len>=32) {  /* Process a large block of data */
+        xxh_u64 acc[4];
+        XXH64_initAccs(acc, seed);
+
+        input = XXH64_consumeLong(acc, input, len, align);
+
+        h64 = XXH64_mergeAccs(acc);
+    } else {
+        h64  = seed + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) len;
+
+    return XXH64_finalize(h64, input, len, align);
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_state_t state;
+    XXH64_reset(&state, seed);
+    XXH64_update(&state, (const xxh_u8*)input, len);
+    return XXH64_digest(&state);
+#else
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned);
+    }   }
+
+    return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned);
+
+#endif
+}
+
+/*******   Hash Streaming   *******/
+#ifndef XXH_NO_STREAM
+/*! @ingroup XXH64_family*/
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState)
+{
+    XXH_memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed)
+{
+    XXH_ASSERT(statePtr != NULL);
+    XXH_memset(statePtr, 0, sizeof(*statePtr));
+    XXH64_initAccs(statePtr->acc, seed);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    state->total_len += len;
+
+    XXH_ASSERT(state->bufferedSize <= sizeof(state->buffer));
+    if (len < sizeof(state->buffer) - state->bufferedSize)  {   /* fill in tmp buffer */
+        XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+        state->bufferedSize += (XXH32_hash_t)len;
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* xinput = (const xxh_u8*)input;
+        const xxh_u8* const bEnd = xinput + len;
+
+        if (state->bufferedSize) {   /* non-empty buffer => complete first */
+            XXH_memcpy(state->buffer + state->bufferedSize, xinput, sizeof(state->buffer) - state->bufferedSize);
+            xinput += sizeof(state->buffer) - state->bufferedSize;
+            /* and process one round */
+            (void)XXH64_consumeLong(state->acc, state->buffer, sizeof(state->buffer), XXH_aligned);
+            state->bufferedSize = 0;
+        }
+
+        XXH_ASSERT(xinput <= bEnd);
+        if ((size_t)(bEnd - xinput) >= sizeof(state->buffer)) {
+            /* Process the remaining data */
+            xinput = XXH64_consumeLong(state->acc, xinput, (size_t)(bEnd - xinput), XXH_unaligned);
+        }
+
+        if (xinput < bEnd) {
+            /* Copy the leftover to the tmp buffer */
+            XXH_memcpy(state->buffer, xinput, (size_t)(bEnd-xinput));
+            state->bufferedSize = (unsigned)(bEnd-xinput);
+        }
+    }
+
+    return XXH_OK;
+}
+
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state)
+{
+    xxh_u64 h64;
+
+    if (state->total_len >= 32) {
+        h64 = XXH64_mergeAccs(state->acc);
+    } else {
+        h64  = state->acc[2] /*seed*/ + XXH_PRIME64_5;
+    }
+
+    h64 += (xxh_u64) state->total_len;
+
+    return XXH64_finalize(h64, state->buffer, (size_t)state->total_len, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation   *******/
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+
+/*! @ingroup XXH64_family */
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+
+#ifndef XXH_NO_XXH3
+
+/* *********************************************************************
+*  XXH3
+*  New generation hash designed for speed on small keys and vectorization
+************************************************************************ */
+/*!
+ * @}
+ * @defgroup XXH3_impl XXH3 implementation
+ * @ingroup impl
+ * @{
+ */
+
+/* ===   Compiler specifics   === */
+
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
+
+#ifndef XXH_HAS_INCLUDE
+#  ifdef __has_include
+/*
+ * Not defined as XXH_HAS_INCLUDE(x) (function-like) because
+ * this causes segfaults in Apple Clang 4.2 (on Mac OS X 10.7 Lion)
+ */
+#    define XXH_HAS_INCLUDE __has_include
+#  else
+#    define XXH_HAS_INCLUDE(x) 0
+#  endif
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(__ARM_FEATURE_SVE)
+#    include <arm_sve.h>
+#  endif
+#  if defined(__ARM_NEON__) || defined(__ARM_NEON) \
+   || (defined(_M_ARM) && _M_ARM >= 7) \
+   || defined(_M_ARM64) || defined(_M_ARM64EC) \
+   || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */
+#    define inline __inline__  /* circumvent a clang bug */
+#    include <arm_neon.h>
+#    undef inline
+#  elif defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  elif defined(__loongarch_asx)
+#    include <lasxintrin.h>
+#    include <lsxintrin.h>
+#  elif defined(__loongarch_sx)
+#    include <lsxintrin.h>
+#  elif defined(__riscv_vector)
+#    include <riscv_vector.h>
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+/*
+ * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while
+ * remaining a true 64-bit/128-bit hash function.
+ *
+ * This is done by prioritizing a subset of 64-bit operations that can be
+ * emulated without too many steps on the average 32-bit machine.
+ *
+ * For example, these two lines seem similar, and run equally fast on 64-bit:
+ *
+ *   xxh_u64 x;
+ *   x ^= (x >> 47); // good
+ *   x ^= (x >> 13); // bad
+ *
+ * However, to a 32-bit machine, there is a major difference.
+ *
+ * x ^= (x >> 47) looks like this:
+ *
+ *   x.lo ^= (x.hi >> (47 - 32));
+ *
+ * while x ^= (x >> 13) looks like this:
+ *
+ *   // note: funnel shifts are not usually cheap.
+ *   x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13));
+ *   x.hi ^= (x.hi >> 13);
+ *
+ * The first one is significantly faster than the second, simply because the
+ * shift is larger than 32. This means:
+ *  - All the bits we need are in the upper 32 bits, so we can ignore the lower
+ *    32 bits in the shift.
+ *  - The shift result will always fit in the lower 32 bits, and therefore,
+ *    we can ignore the upper 32 bits in the xor.
+ *
+ * Thanks to this optimization, XXH3 only requires these features to be efficient:
+ *
+ *  - Usable unaligned access
+ *  - A 32-bit or 64-bit ALU
+ *      - If 32-bit, a decent ADC instruction
+ *  - A 32 or 64-bit multiply with a 64-bit result
+ *  - For the 128-bit variant, a decent byteswap helps short inputs.
+ *
+ * The first two are already required by XXH32, and almost all 32-bit and 64-bit
+ * platforms which can run XXH32 can run XXH3 efficiently.
+ *
+ * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one
+ * notable exception.
+ *
+ * First of all, Thumb-1 lacks support for the UMULL instruction which
+ * performs the important long multiply. This means numerous __aeabi_lmul
+ * calls.
+ *
+ * Second of all, the 8 functional registers are just not enough.
+ * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need
+ * Lo registers, and this shuffling results in thousands more MOVs than A32.
+ *
+ * A32 and T32 don't have this limitation. They can access all 14 registers,
+ * do a 32->64 multiply with UMULL, and the flexible operand allowing free
+ * shifts is helpful, too.
+ *
+ * Therefore, we do a quick sanity check.
+ *
+ * If compiling Thumb-1 for a target which supports ARM instructions, we will
+ * emit a warning, as it is not a "sane" platform to compile for.
+ *
+ * Usually, if this happens, it is because of an accident and you probably need
+ * to specify -march, as you likely meant to compile for a newer architecture.
+ *
+ * Credit: large sections of the vectorial and asm source code paths
+ *         have been contributed by @easyaspi314
+ */
+#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM)
+#   warning "XXH3 is highly inefficient without ARM or Thumb-2."
+#endif
+
+/* ==========================================
+ * Vectorization detection
+ * ========================================== */
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @ingroup tuning
+ * @brief Overrides the vectorization implementation chosen for XXH3.
+ *
+ * Can be defined to 0 to disable SIMD,
+ * or any other authorized value of @ref XXH_VECTOR.
+ *
+ * If this is not defined, it uses predefined macros to determine the best
+ * implementation.
+ */
+#  define XXH_VECTOR XXH_SCALAR
+/*!
+ * @ingroup tuning
+ * @brief Selects the minimum alignment for XXH3's accumulators.
+ *
+ * When using SIMD, this should match the alignment required for said vector
+ * type, so, for example, 32 for AVX2.
+ *
+ * Default: Auto detected.
+ */
+#  define XXH_ACC_ALIGN 8
+#endif
+
+/* Actual definition */
+#ifndef XXH_DOXYGEN
+#endif
+
+#ifndef XXH_VECTOR    /* can be defined on command line */
+#  if ( \
+        defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
+     || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
+     || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \
+   ) && ( \
+        defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
+   )
+#    define XXH_VECTOR XXH_NEON
+#  elif defined(__ARM_FEATURE_SVE)
+#    define XXH_VECTOR XXH_SVE
+#  elif defined(__AVX512F__)
+#    define XXH_VECTOR XXH_AVX512
+#  elif defined(__AVX2__)
+#    define XXH_VECTOR XXH_AVX2
+#  elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define XXH_VECTOR XXH_SSE2
+#  elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \
+     || (defined(__s390x__) && defined(__VEC__)) \
+     && defined(__GNUC__) /* TODO: IBM XL */
+#    define XXH_VECTOR XXH_VSX
+#  elif defined(__loongarch_asx)
+#    define XXH_VECTOR XXH_LASX
+#  elif defined(__loongarch_sx)
+#    define XXH_VECTOR XXH_LSX
+#  elif defined(__riscv_vector)
+#    define XXH_VECTOR XXH_RVV
+#  else
+#    define XXH_VECTOR XXH_SCALAR
+#  endif
+#endif
+
+/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
+#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
+#  ifdef _MSC_VER
+#    pragma warning(once : 4606)
+#  else
+#    warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
+#  endif
+#  undef XXH_VECTOR
+#  define XXH_VECTOR XXH_SCALAR
+#endif
+
+/*
+ * Controls the alignment of the accumulator,
+ * for compatibility with aligned vector loads, which are usually faster.
+ */
+#ifndef XXH_ACC_ALIGN
+#  if defined(XXH_X86DISPATCH)
+#     define XXH_ACC_ALIGN 64  /* for compatibility with avx512 */
+#  elif XXH_VECTOR == XXH_SCALAR  /* scalar */
+#     define XXH_ACC_ALIGN 8
+#  elif XXH_VECTOR == XXH_SSE2  /* sse2 */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX2  /* avx2 */
+#     define XXH_ACC_ALIGN 32
+#  elif XXH_VECTOR == XXH_NEON  /* neon */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_VSX   /* vsx */
+#     define XXH_ACC_ALIGN 16
+#  elif XXH_VECTOR == XXH_AVX512  /* avx512 */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_SVE   /* sve */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_LASX   /* lasx */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_LSX   /* lsx */
+#     define XXH_ACC_ALIGN 64
+#  elif XXH_VECTOR == XXH_RVV   /* rvv */
+#     define XXH_ACC_ALIGN 64
+#  endif
+#endif
+
+#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
+    || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_SVE
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#elif XXH_VECTOR == XXH_RVV
+#  define XXH_SEC_ALIGN XXH_ACC_ALIGN
+#else
+#  define XXH_SEC_ALIGN 8
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  define XXH_ALIASING __attribute__((__may_alias__))
+#else
+#  define XXH_ALIASING /* nothing */
+#endif
+
+/*
+ * UGLY HACK:
+ * GCC usually generates the best code with -O3 for xxHash.
+ *
+ * However, when targeting AVX2, it is overzealous in its unrolling resulting
+ * in code roughly 3/4 the speed of Clang.
+ *
+ * There are other issues, such as GCC splitting _mm256_loadu_si256 into
+ * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which
+ * only applies to Sandy and Ivy Bridge... which don't even support AVX2.
+ *
+ * That is why when compiling the AVX2 version, it is recommended to use either
+ *   -O2 -mavx2 -march=haswell
+ * or
+ *   -O2 -mavx2 -mno-avx256-split-unaligned-load
+ * for decent performance, or to use Clang instead.
+ *
+ * Fortunately, we can control the first one with a pragma that forces GCC into
+ * -O2, but the other one we can't control without "failed to inline always
+ * inline function due to target mismatch" warnings.
+ */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC push_options
+#  pragma GCC optimize("-O2")
+#endif
+
+#if XXH_VECTOR == XXH_NEON
+
+/*
+ * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3
+ * optimizes out the entire hashLong loop because of the aliasing violation.
+ *
+ * However, GCC is also inefficient at load-store optimization with vld1q/vst1q,
+ * so the only option is to mark it as aliasing.
+ */
+typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING;
+
+/*!
+ * @internal
+ * @brief `vld1q_u64` but faster and alignment-safe.
+ *
+ * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only
+ * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86).
+ *
+ * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it
+ * prohibits load-store optimizations. Therefore, a direct dereference is used.
+ *
+ * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe
+ * unaligned load.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__)
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */
+{
+    return *(xxh_aliasing_uint64x2_t const *)ptr;
+}
+#else
+XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr)
+{
+    return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr));
+}
+#endif
+
+/*!
+ * @internal
+ * @brief `vmlal_u32` on low and high halves of a vector.
+ *
+ * This is a workaround for AArch64 GCC < 11 which implemented arm_neon.h with
+ * inline assembly and were therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way */
+    __asm__("umlal   %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
+
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * This can be set to 2, 4, 6, or 8.
+ *
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
+ *  |:----------------------|:--------------------|----------:|-----------:|------:|
+ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes meaning
+ * it effectively becomes worse 4.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+/*
+ * UGLY HACK: Similar to aarch64 macOS GCC, s390x GCC has the same aliasing issue.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meaning swap depending on version.
+ * */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this, we can just use the builtin which doesn't swap. */
+ /* The IBM XL Compiler (which defined __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h. */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
+{
+    xxh_u64x2 result;
+    __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b));
+    return result;
+}
+# endif /* XXH_vec_mulo, XXH_vec_mule */
+#endif /* XXH_VECTOR == XXH_VSX */
+
+#if XXH_VECTOR == XXH_SVE
+#define ACCRND(acc, offset) \
+do { \
+    svuint64_t input_vec = svld1_u64(mask, xinput + offset);         \
+    svuint64_t secret_vec = svld1_u64(mask, xsecret + offset);       \
+    svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec);     \
+    svuint64_t swapped = svtbl_u64(input_vec, kSwap);                \
+    svuint64_t mixed_lo = svextw_u64_x(mask, mixed);                 \
+    svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32);            \
+    svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
+    acc = svadd_u64_x(mask, acc, mul);                               \
+} while (0)
+#endif /* XXH_VECTOR == XXH_SVE */
+
+/* prefetch
+ * can be disabled, by declaring XXH_NO_PREFETCH build macro */
+#if defined(XXH_NO_PREFETCH)
+#  define XXH_PREFETCH(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if XXH_SIZE_OPT >= 1
+#    define XXH_PREFETCH(ptr) (void)(ptr)
+#  elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  /* _mm_prefetch() not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define XXH_PREFETCH(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define XXH_PREFETCH(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#  else
+#    define XXH_PREFETCH(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* XXH_NO_PREFETCH */
+
+
+/* ==========================================
+ * XXH3 default settings
+ * ========================================== */
+
+#define XXH_SECRET_DEFAULT_SIZE 192   /* minimum XXH3_SECRET_SIZE_MIN */
+
+#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN)
+#  error "default keyset is not large enough"
+#endif
+
+/*!
+ * @internal
+ * @def XXH3_kSecret
+ * @brief Pseudorandom secret taken directly from FARSH. */
+XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = {
+    0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+    0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+    0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+    0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+    0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+    0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+    0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+    0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+    0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+    0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+    0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+    0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+};
+
+static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL;  /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */
+static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL;  /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */
+
+#ifdef XXH_OLD_NAMES
+#  define kSecret XXH3_kSecret
+#endif
+
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Calculates a 32-bit to 64-bit long multiply.
+ *
+ * Implemented as a macro.
+ *
+ * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't
+ * need to (but it shouldn't need to anyways, it is about 7 instructions to do
+ * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we
+ * use that instead of the normal method.
+ *
+ * If you are compiling for platforms like Thumb-1 and don't have a better option,
+ * you may also want to write your own long multiply routine here.
+ *
+ * @param x, y Numbers to be multiplied
+ * @return 64-bit product of the low 32 bits of @p x and @p y.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64(xxh_u64 x, xxh_u64 y)
+{
+   return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
+}
+#elif defined(_MSC_VER) && defined(_M_IX86)
+#    define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y))
+#else
+/*
+ * Downcast + upcast is usually better than masking on older compilers like
+ * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers.
+ *
+ * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands
+ * and perform a full 64x64 multiply -- entirely redundant on 32-bit.
+ */
+#    define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y))
+#endif
+
+/*!
+ * @brief Calculates a 64->128-bit long multiply.
+ *
+ * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar
+ * version.
+ *
+ * @param lhs , rhs The 64-bit integers to be multiplied
+ * @return The 128-bit result represented in an @ref XXH128_hash_t.
+ */
+static XXH128_hash_t
+XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs)
+{
+    /*
+     * GCC/Clang __uint128_t method.
+     *
+     * On most 64-bit targets, GCC and Clang define a __uint128_t type.
+     * This is usually the best way as it usually uses a native long 64-bit
+     * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64.
+     *
+     * Usually.
+     *
+     * Despite being a 32-bit platform, Clang (and emscripten) define this type
+     * despite not having the arithmetic for it. This results in a laggy
+     * compiler builtin call which calculates a full 128-bit multiply.
+     * In that case it is best to use the portable one.
+     * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677
+     */
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
+
+    __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs;
+    XXH128_hash_t r128;
+    r128.low64  = (xxh_u64)(product);
+    r128.high64 = (xxh_u64)(product >> 64);
+    return r128;
+
+    /*
+     * MSVC for x64's _umul128 method.
+     *
+     * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct);
+     *
+     * This compiles to single operand MUL on x64.
+     */
+#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+    xxh_u64 product_high;
+    xxh_u64 const product_low = _umul128(lhs, rhs, &product_high);
+    XXH128_hash_t r128;
+    r128.low64  = product_low;
+    r128.high64 = product_high;
+    return r128;
+
+    /*
+     * MSVC for ARM64's __umulh method.
+     *
+     * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method.
+     */
+#elif defined(_M_ARM64) || defined(_M_ARM64EC)
+
+#ifndef _MSC_VER
+#   pragma intrinsic(__umulh)
+#endif
+    XXH128_hash_t r128;
+    r128.low64  = lhs * rhs;
+    r128.high64 = __umulh(lhs, rhs);
+    return r128;
+
+#else
+    /*
+     * Portable scalar method. Optimized for 32-bit and 64-bit ALUs.
+     *
+     * This is a fast and simple grade school multiply, which is shown below
+     * with base 10 arithmetic instead of base 0x100000000.
+     *
+     *           9 3 // D2 lhs = 93
+     *         x 7 5 // D2 rhs = 75
+     *     ----------
+     *           1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15
+     *         4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45
+     *         2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21
+     *     + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63
+     *     ---------
+     *         2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27
+     *     + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67
+     *     ---------
+     *       6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975
+     *
+     * The reasons for adding the products like this are:
+     *  1. It avoids manual carry tracking. Just like how
+     *     (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX.
+     *     This avoids a lot of complexity.
+     *
+     *  2. It hints for, and on Clang, compiles to, the powerful UMAAL
+     *     instruction available in ARM's Digital Signal Processing extension
+     *     in 32-bit ARMv6 and later, which is shown below:
+     *
+     *         void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm)
+     *         {
+     *             xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm;
+     *             *RdLo = (xxh_u32)(product & 0xFFFFFFFF);
+     *             *RdHi = (xxh_u32)(product >> 32);
+     *         }
+     *
+     *     This instruction was designed for efficient long multiplication, and
+     *     allows this to be calculated in only 4 instructions at speeds
+     *     comparable to some 64-bit ALUs.
+     *
+     *  3. It isn't terrible on other platforms. Usually this will be a couple
+     *     of 32-bit ADD/ADCs.
+     */
+
+    /* First calculate all of the cross products. */
+    xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+    xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32,        rhs & 0xFFFFFFFF);
+    xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+    xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32,        rhs >> 32);
+
+    /* Now add the products together. These will never overflow. */
+    xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+    xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32)        + hi_hi;
+    xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+    XXH128_hash_t r128;
+    r128.low64  = lower;
+    r128.high64 = upper;
+    return r128;
+#endif
+}
+
+/*!
+ * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it.
+ *
+ * The reason for the separate function is to prevent passing too many structs
+ * around by value. This will hopefully inline the multiply, but we don't force it.
+ *
+ * @param lhs , rhs The 64-bit integers to multiply
+ * @return The low 64 bits of the product XOR'd by the high 64 bits.
+ * @see XXH_mult64to128()
+ */
+static xxh_u64
+XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs)
+{
+    XXH128_hash_t product = XXH_mult64to128(lhs, rhs);
+    return product.low64 ^ product.high64;
+}
+
+/*! Seems to produce slightly better code on GCC for some reason. */
+XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift)
+{
+    XXH_ASSERT(0 <= shift && shift < 64);
+    return v64 ^ (v64 >> shift);
+}
+
+/*
+ * This is a fast avalanche stage,
+ * suitable when input bits are already partially mixed
+ */
+static XXH64_hash_t XXH3_avalanche(xxh_u64 h64)
+{
+    h64 = XXH_xorshift64(h64, 37);
+    h64 *= PRIME_MX1;
+    h64 = XXH_xorshift64(h64, 32);
+    return h64;
+}
+
+/*
+ * This is a stronger avalanche,
+ * inspired by Pelle Evensen's rrmxmx
+ * preferable when input has not been previously mixed
+ */
+static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len)
+{
+    /* this mix is inspired by Pelle Evensen's rrmxmx */
+    h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24);
+    h64 *= PRIME_MX2;
+    h64 ^= (h64 >> 35) + len ;
+    h64 *= PRIME_MX2;
+    return XXH_xorshift64(h64, 28);
+}
+
+
+/* ==========================================
+ * Short keys
+ * ==========================================
+ * One of the shortcomings of XXH32 and XXH64 was that their performance was
+ * sub-optimal on short lengths. It used an iterative algorithm which strongly
+ * favored lengths that were a multiple of 4 or 8.
+ *
+ * Instead of iterating over individual inputs, we use a set of single shot
+ * functions which piece together a range of lengths and operate in constant time.
+ *
+ * Additionally, the number of multiplies has been significantly reduced. This
+ * reduces latency, especially when emulating 64-bit multiplies on 32-bit.
+ *
+ * Depending on the platform, this may or may not be faster than XXH32, but it
+ * is almost guaranteed to be faster than XXH64.
+ */
+
+/*
+ * At very short lengths, there isn't enough input to fully hide secrets, or use
+ * the entire secret.
+ *
+ * There is also only a limited amount of mixing we can do before significantly
+ * impacting performance.
+ *
+ * Therefore, we use different sections of the secret and always mix two secret
+ * samples with an XOR. This should have no effect on performance on the
+ * seedless or withSeed variants because everything _should_ be constant folded
+ * by modern compilers.
+ *
+ * The XOR mixing hides individual parts of the secret and increases entropy.
+ *
+ * This adds an extra layer of strength for custom secrets.
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combined = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combined = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combined = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8  const c1 = input[0];
+        xxh_u8  const c2 = input[len >> 1];
+        xxh_u8  const c3 = input[len - 1];
+        xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2  << 24)
+                               | ((xxh_u32)c3 <<  0) | ((xxh_u32)len << 8);
+        xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const keyed = (xxh_u64)combined ^ bitflip;
+        return XXH64_avalanche(keyed);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input1 = XXH_readLE32(input);
+        xxh_u32 const input2 = XXH_readLE32(input + len - 4);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed;
+        xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32);
+        xxh_u64 const keyed = input64 ^ bitflip;
+        return XXH3_rrmxmx(keyed, len);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed;
+        xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed;
+        xxh_u64 const input_lo = XXH_readLE64(input)           ^ bitflip1;
+        xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2;
+        xxh_u64 const acc = len
+                          + XXH_swap64(input_lo) + input_hi
+                          + XXH3_mul128_fold64(input_lo, input_hi);
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (XXH_likely(len >  8)) return XXH3_len_9to16_64b(input, len, secret, seed);
+        if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_64b(input, len, secret, seed);
+        return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64)));
+    }
+}
+
+/*
+ * DISCLAIMER: There are known *seed-dependent* multicollisions here due to
+ * multiplication by zero, affecting hashes of lengths 17 to 240.
+ *
+ * However, they are very unlikely.
+ *
+ * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all
+ * unseeded non-cryptographic hashes, it does not attempt to defend itself
+ * against specially crafted inputs, only random inputs.
+ *
+ * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes
+ * cancelling out the secret is taken an arbitrary number of times (addressed
+ * in XXH3_accumulate_512), this collision is very unlikely with random inputs
+ * and/or proper seeding:
+ *
+ * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a
+ * function that is only called up to 16 times per hash with up to 240 bytes of
+ * input.
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64 bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+     * slower code.
+     *
+     * By forcing seed64 into a register, we disrupt the cost model and
+     * cause it to scalarize. See `XXH32_round()`
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is an interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64 const input_hi = XXH_readLE64(input+8);
+        return XXH3_mul128_fold64(
+            input_lo ^ (XXH_readLE64(secret)   + seed64),
+            input_hi ^ (XXH_readLE64(secret+8) - seed64)
+        );
+    }
+}
+
+/* For mid range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+#if XXH_SIZE_OPT >= 1
+        /* Smaller and cleaner, but slightly slower. */
+        unsigned int i = (unsigned int)(len - 1) / 32;
+        do {
+            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
+            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
+        } while (i-- != 0);
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+#endif
+        return XXH3_avalanche(acc);
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * In everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON was 100% optimal, it
+         * would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but they are usually relatively minor and/or not
+         * worth it to fix.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            /*
+             * Prevents clang for unrolling the acc loop and interleaving with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The two arguments select the name suffix and target attribute.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    XXH_memcpy(dst, &v64, sizeof(v64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input to the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+#if (XXH_VECTOR == XXH_AVX512) \
+     || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0)
+
+#ifndef XXH_TARGET_AVX512
+# define XXH_TARGET_AVX512  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    __m512i* const xacc = (__m512i *) acc;
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+
+    {
+        /* data_vec    = input[0]; */
+        __m512i const data_vec    = _mm512_loadu_si512   (input);
+        /* key_vec     = secret[0]; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        /* data_key    = data_vec ^ key_vec; */
+        __m512i const data_key    = _mm512_xor_si512     (data_vec, key_vec);
+        /* data_key_lo = data_key >> 32; */
+        __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32);
+        /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+        __m512i const product     = _mm512_mul_epu32     (data_key, data_key_lo);
+        /* xacc[0] += swap(data_vec); */
+        __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2));
+        __m512i const sum       = _mm512_add_epi64(*xacc, data_swap);
+        /* xacc[0] += product; */
+        *xacc = _mm512_add_epi64(product, sum);
+    }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512)
+
+/*
+ * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing.
+ *
+ * Multiplication isn't perfect, as explained by Google in HighwayHash:
+ *
+ *  // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to
+ *  // varying degrees. In descending order of goodness, bytes
+ *  // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32.
+ *  // As expected, the upper and lower bytes are much worse.
+ *
+ * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291
+ *
+ * Since our algorithm uses a pseudorandom secret to add some variance into the
+ * mix, we don't need to (or want to) mix as often or as much as HighwayHash does.
+ *
+ * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid
+ * extraction.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
+ */
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i));
+    {   __m512i* const xacc = (__m512i*) acc;
+        const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1);
+
+        /* xacc[0] ^= (xacc[0] >> 47) */
+        __m512i const acc_vec     = *xacc;
+        __m512i const shifted     = _mm512_srli_epi64    (acc_vec, 47);
+        /* xacc[0] ^= secret; */
+        __m512i const key_vec     = _mm512_loadu_si512   (secret);
+        __m512i const data_key    = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */);
+
+        /* xacc[0] *= XXH_PRIME32_1; */
+        __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32);
+        __m512i const prod_lo     = _mm512_mul_epu32     (data_key, prime32);
+        __m512i const prod_hi     = _mm512_mul_epu32     (data_key_hi, prime32);
+        *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32));
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX512 void
+XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i);
+        __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64);
+        __m512i const seed     = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos);
+
+        const __m512i* const src  = (const __m512i*) ((const void*) XXH3_kSecret);
+              __m512i* const dest = (      __m512i*) customSecret;
+        int i;
+        XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 63) == 0);
+        for (i=0; i < nbRounds; ++i) {
+            dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_AVX2) \
+    || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0)
+
+#ifndef XXH_TARGET_AVX2
+# define XXH_TARGET_AVX2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc    =       (__m256i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires  a const __m256i * pointer for some reason. */
+        const         __m256i* const xinput  = (const __m256i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m256i const data_vec    = _mm256_loadu_si256    (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product     = _mm256_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum       = _mm256_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm256_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2)
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void
+XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {   __m256i* const xacc = (__m256i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */
+        const         __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec     = xacc[i];
+            __m256i const shifted     = _mm256_srli_epi64    (acc_vec, 47);
+            __m256i const data_vec    = _mm256_xor_si256     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret; */
+            __m256i const key_vec     = _mm256_loadu_si256   (xsecret+i);
+            __m256i const data_key    = _mm256_xor_si256     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32);
+            __m256i const prod_lo     = _mm256_mul_epu32     (data_key, prime32);
+            __m256i const prod_hi     = _mm256_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0);
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64);
+    (void)(&XXH_writeLE64);
+    XXH_PREFETCH(customSecret);
+    {   __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64);
+
+        const __m256i* const src  = (const __m256i*) ((const void*) XXH3_kSecret);
+              __m256i*       dest = (      __m256i*) customSecret;
+
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+         *   - do not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these reg into stack
+         */
+        XXH_COMPILER_GUARD(dest);
+#       endif
+        XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dest & 31) == 0);
+
+        /* GCC -O2 need unroll loop manually */
+        dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed);
+        dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed);
+        dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed);
+        dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed);
+        dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed);
+        dest[5] = _mm256_add_epi64(_mm256_load_si256(src+5), seed);
+    }
+}
+
+#endif
+
+/* x86dispatch always generates SSE2 */
+#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH)
+
+#ifndef XXH_TARGET_SSE2
+# define XXH_TARGET_SSE2  /* disable attribute target */
+#endif
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* SSE2 is just a half-scale version of the AVX2 version. */
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   __m128i* const xacc    =       (__m128i *) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xinput  = (const __m128i *) input;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* data_vec    = xinput[i]; */
+            __m128i const data_vec    = _mm_loadu_si128   (xinput+i);
+            /* key_vec     = xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            /* data_key    = data_vec ^ key_vec; */
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m128i const product     = _mm_mul_epu32     (data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2));
+            __m128i const sum       = _mm_add_epi64(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = _mm_add_epi64(product, sum);
+    }   }
+}
+XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2)
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void
+XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {   __m128i* const xacc = (__m128i*) acc;
+        /* Unaligned. This is mainly for pointer arithmetic, and because
+         * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */
+        const         __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1);
+
+        size_t i;
+        for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec     = xacc[i];
+            __m128i const shifted     = _mm_srli_epi64    (acc_vec, 47);
+            __m128i const data_vec    = _mm_xor_si128     (acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec     = _mm_loadu_si128   (xsecret+i);
+            __m128i const data_key    = _mm_xor_si128     (data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1));
+            __m128i const prod_lo     = _mm_mul_epu32     (data_key, prime32);
+            __m128i const prod_hi     = _mm_mul_epu32     (data_key_hi, prime32);
+            xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32));
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+    (void)(&XXH_writeLE64);
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i);
+
+#       if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER <= 1900
+        /* MSVC 32bit mode does not support _mm_set_epi64x before 2015
+         * and some specific variants of 2015 may also lack it */
+        /* Cast to unsigned 64-bit first to avoid signed arithmetic issues */
+        xxh_u64 const seed64_unsigned = (xxh_u64)seed64;
+        xxh_u64 const neg_seed64 = (xxh_u64)(0ULL - seed64_unsigned);
+        __m128i const seed = _mm_set_epi32(
+            (int)(neg_seed64 >> 32),      /* high 32 bits of negated seed */
+            (int)(neg_seed64),            /* low 32 bits of negated seed */
+            (int)(seed64_unsigned >> 32), /* high 32 bits of original seed */
+            (int)(seed64_unsigned)        /* low 32 bits of original seed */
+        );
+#       else
+        __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64);
+#       endif
+        int i;
+
+        const void* const src16 = XXH3_kSecret;
+        __m128i* dst16 = (__m128i*) customSecret;
+#       if defined(__GNUC__) || defined(__clang__)
+        /*
+         * On GCC & Clang, marking 'dest' as modified will cause the compiler:
+         *   - do not extract the secret from sse registers in the internal loop
+         *   - use less common registers, and avoid pushing these reg into stack
+         */
+        XXH_COMPILER_GUARD(dst16);
+#       endif
+        XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */
+        XXH_ASSERT(((size_t)dst16 & 15) == 0);
+
+        for (i=0; i < nbRounds; ++i) {
+            dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_NEON)
+
+/* forward declarations for the scalar routines */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret, size_t lane);
+
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret, size_t lane);
+
+/*!
+ * @internal
+ * @brief The bulk processing loop for NEON and WASM SIMD128.
+ *
+ * The NEON code path is actually partially scalar when running on AArch64. This
+ * is to optimize the pipelining and can have up to 15% speedup depending on the
+ * CPU, and it also mitigates some GCC codegen issues.
+ *
+ * @see XXH3_NEON_LANES for configuring this and details about this optimization.
+ *
+ * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit
+ * integers instead of the other platforms which mask full 64-bit vectors,
+ * so the setup is more complicated than just shifting right.
+ *
+ * Additionally, there is an optimization for 4 lanes at once noted below.
+ *
+ * Since, as stated, the most optimal amount of lanes for Cortexes is 6,
+ * there needs to be *three* versions of the accumulate operation used
+ * for the remaining 2 lanes.
+ *
+ * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap
+ * nearly perfectly.
+ */
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_neon( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0);
+    {   /* GCC for darwin arm64 does not like aliasing here */
+        xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc;
+        /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */
+        uint8_t const* xinput = (const uint8_t *) input;
+        uint8_t const* xsecret  = (const uint8_t *) secret;
+
+        size_t i;
+#ifdef __wasm_simd128__
+        /*
+         * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret
+         * is constant propagated, which results in it converting it to this
+         * inside the loop:
+         *
+         *    a = v128.load(XXH3_kSecret +  0 + $secret_offset, offset = 0)
+         *    b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0)
+         *    ...
+         *
+         * This requires a full 32-bit address immediate (and therefore a 6 byte
+         * instruction) as well as an add for each offset.
+         *
+         * Putting an asm guard prevents it from folding (at the cost of losing
+         * the alignment hint), and uses the free offset in `v128.load` instead
+         * of adding secret_offset each time which overall reduces code size by
+         * about a kilobyte and improves performance.
+         */
+        XXH_COMPILER_GUARD(xsecret);
+#endif
+        /* Scalar lanes use the normal scalarRound routine */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarRound(acc, input, secret, i);
+        }
+        i = 0;
+        /* 4 NEON lanes at a time. */
+        for (; i+1 < XXH3_NEON_LANES / 2; i+=2) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput  + (i * 16));
+            uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput  + ((i+1) * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec_1  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t key_vec_2  = XXH_vld1q_u64(xsecret + ((i+1) * 16));
+            /* data_swap = swap(data_vec) */
+            uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1);
+            uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1);
+            uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2);
+
+            /*
+             * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a
+             * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to
+             * get one vector with the low 32 bits of each lane, and one vector
+             * with the high 32 bits of each lane.
+             *
+             * The intrinsic returns a double vector because the original ARMv7-a
+             * instruction modified both arguments in place. AArch64 and SIMD128 emit
+             * two instructions from this intrinsic.
+             *
+             *  [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ]
+             *  [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ]
+             */
+            uint32x4x2_t unzipped = vuzpq_u32(
+                vreinterpretq_u32_u64(data_key_1),
+                vreinterpretq_u32_u64(data_key_2)
+            );
+            /* data_key_lo = data_key & 0xFFFFFFFF */
+            uint32x4_t data_key_lo = unzipped.val[0];
+            /* data_key_hi = data_key >> 32 */
+            uint32x4_t data_key_hi = unzipped.val[1];
+            /*
+             * Then, we can split the vectors horizontally and multiply which, as for most
+             * widening intrinsics, have a variant that works on both high half vectors
+             * for free on AArch64. A similar instruction is available on SIMD128.
+             *
+             * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi
+             */
+            uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi);
+            uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi);
+            /*
+             * Clang reorders
+             *    a += b * c;     // umlal   swap.2d, dkl.2s, dkh.2s
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             * to
+             *    c += a;         // add     acc.2d, acc.2d, swap.2d
+             *    c += b * c;     // umlal   acc.2d, dkl.2s, dkh.2s
+             *
+             * While it would make sense in theory since the addition is faster,
+             * for reasons likely related to umlal being limited to certain NEON
+             * pipelines, this is worse. A compiler guard fixes this.
+             */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_1);
+            XXH_COMPILER_GUARD_CLANG_NEON(sum_2);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i]   = vaddq_u64(xacc[i], sum_1);
+            xacc[i+1] = vaddq_u64(xacc[i+1], sum_2);
+        }
+        /* Operate on the remaining NEON lanes 2 at a time. */
+        for (; i < XXH3_NEON_LANES / 2; i++) {
+            /* data_vec = xinput[i]; */
+            uint64x2_t data_vec = XXH_vld1q_u64(xinput  + (i * 16));
+            /* key_vec  = xsecret[i];  */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            /* acc_vec_2 = swap(data_vec) */
+            uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1);
+            /* data_key = data_vec ^ key_vec; */
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* For two lanes, just use VMOVN and VSHRN. */
+            /* data_key_lo = data_key & 0xFFFFFFFF; */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* data_key_hi = data_key >> 32; */
+            uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32);
+            /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */
+            uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi);
+            /* Same Clang workaround as before */
+            XXH_COMPILER_GUARD_CLANG_NEON(sum);
+            /* xacc[i] = acc_vec + sum; */
+            xacc[i] = vaddq_u64 (xacc[i], sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_uint64x2_t* xacc       = (xxh_aliasing_uint64x2_t*) acc;
+        uint8_t const* xsecret = (uint8_t const*) secret;
+
+        size_t i;
+        /* WASM uses operator overloads and doesn't need these. */
+#ifndef __wasm_simd128__
+        /* { prime32_1, prime32_1 } */
+        uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1);
+        /* { 0, prime32_1, 0, prime32_1 } */
+        uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32));
+#endif
+
+        /* AArch64 uses both scalar and neon at the same time */
+        for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) {
+            XXH3_scalarScrambleRound(acc, secret, i);
+        }
+        for (i=0; i < XXH3_NEON_LANES / 2; i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            uint64x2_t acc_vec  = xacc[i];
+            uint64x2_t shifted  = vshrq_n_u64(acc_vec, 47);
+            uint64x2_t data_vec = veorq_u64(acc_vec, shifted);
+
+            /* xacc[i] ^= xsecret[i]; */
+            uint64x2_t key_vec  = XXH_vld1q_u64(xsecret + (i * 16));
+            uint64x2_t data_key = veorq_u64(data_vec, key_vec);
+            /* xacc[i] *= XXH_PRIME32_1 */
+#ifdef __wasm_simd128__
+            /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */
+            xacc[i] = data_key * XXH_PRIME32_1;
+#else
+            /*
+             * Expanded version with portable NEON intrinsics
+             *
+             *    lo(x) * lo(y) + (hi(x) * lo(y) << 32)
+             *
+             * prod_hi = hi(data_key) * lo(prime) << 32
+             *
+             * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector
+             * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits
+             * and avoid the shift.
+             */
+            uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi);
+            /* Extract low bits for vmlal_u32  */
+            uint32x2_t data_key_lo = vmovn_u64(data_key);
+            /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */
+            xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo);
+#endif
+        }
+    }
+}
+#endif
+
+#if (XXH_VECTOR == XXH_VSX)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_vsx(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    /* presumed aligned */
+    xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+    xxh_u8 const* const xinput   = (xxh_u8 const*) input;   /* no alignment restriction */
+    xxh_u8 const* const xsecret  = (xxh_u8 const*) secret;    /* no alignment restriction */
+    xxh_u64x2 const v32 = { 32, 32 };
+    size_t i;
+    for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+        /* data_vec = xinput[i]; */
+        xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i);
+        /* key_vec = xsecret[i]; */
+        xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+        xxh_u64x2 const data_key = data_vec ^ key_vec;
+        /* shuffled = (data_key << 32) | (data_key >> 32); */
+        xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32);
+        /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */
+        xxh_u64x2 const product  = XXH_vec_mulo((xxh_u32x4)data_key, shuffled);
+        /* acc_vec = xacc[i]; */
+        xxh_u64x2 acc_vec        = xacc[i];
+        acc_vec += product;
+
+        /* swap high and low halves */
+#ifdef __s390x__
+        acc_vec += vec_permi(data_vec, data_vec, 2);
+#else
+        acc_vec += vec_xxpermdi(data_vec, data_vec, 2);
+#endif
+        xacc[i] = acc_vec;
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+
+    {   xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc;
+        const xxh_u8* const xsecret = (const xxh_u8*) secret;
+        /* constants */
+        xxh_u64x2 const v32  = { 32, 32 };
+        xxh_u64x2 const v47 = { 47, 47 };
+        xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 };
+        size_t i;
+        for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47); */
+            xxh_u64x2 const acc_vec  = xacc[i];
+            xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47);
+
+            /* xacc[i] ^= xsecret[i]; */
+            xxh_u64x2 const key_vec  = XXH_vec_loadu(xsecret + 16*i);
+            xxh_u64x2 const data_key = data_vec ^ key_vec;
+
+            /* xacc[i] *= XXH_PRIME32_1 */
+            /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF);  */
+            xxh_u64x2 const prod_even  = XXH_vec_mule((xxh_u32x4)data_key, prime);
+            /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32);  */
+            xxh_u64x2 const prod_odd  = XXH_vec_mulo((xxh_u32x4)data_key, prime);
+            xacc[i] = prod_odd + (prod_even << v32);
+    }   }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_SVE)
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
+                   const void* XXH_RESTRICT input,
+                   const void* XXH_RESTRICT secret)
+{
+    uint64_t *xacc = (uint64_t *)acc;
+    const uint64_t *xinput = (const uint64_t *)(const void *)input;
+    const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+    svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+    uint64_t element_count = svcntd();
+    if (element_count >= 8) {
+        svbool_t mask = svptrue_pat_b64(SV_VL8);
+        svuint64_t vacc = svld1_u64(mask, xacc);
+        ACCRND(vacc, 0);
+        svst1_u64(mask, xacc, vacc);
+    } else if (element_count == 2) {   /* sve128 */
+        svbool_t mask = svptrue_pat_b64(SV_VL2);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+        svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+        svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 2);
+        ACCRND(acc2, 4);
+        ACCRND(acc3, 6);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 2, acc1);
+        svst1_u64(mask, xacc + 4, acc2);
+        svst1_u64(mask, xacc + 6, acc3);
+    } else {
+        svbool_t mask = svptrue_pat_b64(SV_VL4);
+        svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+        svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+        ACCRND(acc0, 0);
+        ACCRND(acc1, 4);
+        svst1_u64(mask, xacc + 0, acc0);
+        svst1_u64(mask, xacc + 4, acc1);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc,
+               const xxh_u8* XXH_RESTRICT input,
+               const xxh_u8* XXH_RESTRICT secret,
+               size_t nbStripes)
+{
+    if (nbStripes != 0) {
+        uint64_t *xacc = (uint64_t *)acc;
+        const uint64_t *xinput = (const uint64_t *)(const void *)input;
+        const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
+        svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
+        uint64_t element_count = svcntd();
+        if (element_count >= 8) {
+            svbool_t mask = svptrue_pat_b64(SV_VL8);
+            svuint64_t vacc = svld1_u64(mask, xacc + 0);
+            do {
+                /* svprfd(svbool_t, void *, enum svfprop); */
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(vacc, 0);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, vacc);
+        } else if (element_count == 2) { /* sve128 */
+            svbool_t mask = svptrue_pat_b64(SV_VL2);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 2);
+            svuint64_t acc2 = svld1_u64(mask, xacc + 4);
+            svuint64_t acc3 = svld1_u64(mask, xacc + 6);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 2);
+                ACCRND(acc2, 4);
+                ACCRND(acc3, 6);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 2, acc1);
+           svst1_u64(mask, xacc + 4, acc2);
+           svst1_u64(mask, xacc + 6, acc3);
+        } else {
+            svbool_t mask = svptrue_pat_b64(SV_VL4);
+            svuint64_t acc0 = svld1_u64(mask, xacc + 0);
+            svuint64_t acc1 = svld1_u64(mask, xacc + 4);
+            do {
+                svprfd(mask, xinput + 128, SV_PLDL1STRM);
+                ACCRND(acc0, 0);
+                ACCRND(acc1, 4);
+                xinput += 8;
+                xsecret += 1;
+                nbStripes--;
+           } while (nbStripes != 0);
+
+           svst1_u64(mask, xacc + 0, acc0);
+           svst1_u64(mask, xacc + 4, acc1);
+       }
+    }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_LSX)
+#define _LSX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_lsx( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        __m128i* const xacc    =       (__m128i *) acc;
+        const __m128i* const xinput  = (const __m128i *) input;
+        const __m128i* const xsecret = (const __m128i *) secret;
+
+        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
+            /* data_vec = xinput[i]; */
+            __m128i const data_vec = __lsx_vld(xinput + i, 0);
+            /* key_vec = xsecret[i]; */
+            __m128i const key_vec = __lsx_vld(xsecret + i, 0);
+            /* data_key = data_vec ^ key_vec; */
+            __m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
+            // __m128i const data_key_lo = __lsx_vsrli_d(data_key, 32);
+            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m128i const product = __lsx_vmulwev_d_wu(data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m128i const data_swap = __lsx_vshuf4i_w(data_vec, _LSX_SHUFFLE(1, 0, 3, 2));
+            __m128i const sum = __lsx_vadd_d(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = __lsx_vadd_d(product, sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lsx)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_lsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 15) == 0);
+    {
+        __m128i* const xacc = (__m128i*) acc;
+        const __m128i* const xsecret = (const __m128i *) secret;
+        const __m128i prime32 = __lsx_vreplgr2vr_d(XXH_PRIME32_1);
+
+        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m128i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m128i const acc_vec = xacc[i];
+            __m128i const shifted = __lsx_vsrli_d(acc_vec, 47);
+            __m128i const data_vec = __lsx_vxor_v(acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m128i const key_vec = __lsx_vld(xsecret + i, 0);
+            __m128i const data_key = __lsx_vxor_v(data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            xacc[i] = __lsx_vmul_d(data_key, prime32);
+        }
+    }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_LASX)
+#define _LASX_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_lasx( void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {
+        __m256i* const xacc    =       (__m256i *) acc;
+        const __m256i* const xinput  = (const __m256i *) input;
+        const __m256i* const xsecret = (const __m256i *) secret;
+
+        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
+            /* data_vec = xinput[i]; */
+            __m256i const data_vec = __lasx_xvld(xinput + i, 0);
+            /* key_vec = xsecret[i]; */
+            __m256i const key_vec = __lasx_xvld(xsecret + i, 0);
+            /* data_key = data_vec ^ key_vec; */
+            __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec);
+            /* data_key_lo = data_key >> 32; */
+            __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32);
+            // __m256i const data_key_lo = __lasx_xvsrli_d(data_key, 32);
+            /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            __m256i const product = __lasx_xvmulwev_d_wu(data_key, data_key_lo);
+            /* xacc[i] += swap(data_vec); */
+            __m256i const data_swap = __lasx_xvshuf4i_w(data_vec, _LASX_SHUFFLE(1, 0, 3, 2));
+            __m256i const sum = __lasx_xvadd_d(xacc[i], data_swap);
+            /* xacc[i] += product; */
+            xacc[i] = __lasx_xvadd_d(product, sum);
+        }
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(lasx)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_lasx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 31) == 0);
+    {
+        __m256i* const xacc = (__m256i*) acc;
+        const __m256i* const xsecret = (const __m256i *) secret;
+        const __m256i prime32 = __lasx_xvreplgr2vr_d(XXH_PRIME32_1);
+
+        for (size_t i = 0; i < XXH_STRIPE_LEN / sizeof(__m256i); i++) {
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            __m256i const acc_vec = xacc[i];
+            __m256i const shifted = __lasx_xvsrli_d(acc_vec, 47);
+            __m256i const data_vec = __lasx_xvxor_v(acc_vec, shifted);
+            /* xacc[i] ^= xsecret[i]; */
+            __m256i const key_vec = __lasx_xvld(xsecret + i, 0);
+            __m256i const data_key = __lasx_xvxor_v(data_vec, key_vec);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            xacc[i] = __lasx_xvmul_d(data_key, prime32);
+        }
+    }
+}
+
+#endif
+
+#if (XXH_VECTOR == XXH_RVV)
+#if ((defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 13) || \
+        (defined(__clang__) && __clang_major__ < 16))
+    #define RVV_OP(op) op
+#else
+    #define concat2(X, Y) X ## Y
+    #define concat(X, Y) concat2(X, Y)
+    #define RVV_OP(op) concat(__riscv_, op)
+#endif
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_rvv(  void* XXH_RESTRICT acc,
+                    const void* XXH_RESTRICT input,
+                    const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        // Try to set vector lenght to 512 bits.
+        // If this length is unavailable, then maximum available will be used
+        size_t vl = RVV_OP(vsetvl_e64m2)(8);
+
+        uint64_t* const xacc = (uint64_t*) acc;
+        const uint64_t* const xinput = (const uint64_t*) input;
+        const uint64_t* const xsecret = (const uint64_t*) secret;
+        uint64_t swap_mask[16] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+        vuint64m2_t xswap_mask = RVV_OP(vle64_v_u64m2)(swap_mask, vl);
+
+        // vuint64m1_t is sizeless.
+        // But we can assume that vl can be only 4(vlen=128) or 8(vlen=256,512)
+        for(size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++){
+            /* data_vec    = input[i]; */
+            vuint64m2_t data_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xinput + vl * i), vl * 8));
+            /* key_vec     = secret[i]; */
+            vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8));
+            /* data_key    = data_vec ^ key_vec; */
+            vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+            /* data_key_lo = data_key >> 32; */
+            vuint64m2_t data_key_lo = RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl);
+            /* product     = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */
+            vuint64m2_t product = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), RVV_OP(vand_vx_u64m2)(data_key_lo, 0xffffffff, vl), vl);
+            /* acc_vec = xacc[i]; */
+            vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl);
+            acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, product, vl);
+            {
+                /* swap high and low halves */
+                vuint64m2_t data_swap = RVV_OP(vrgather_vv_u64m2)(data_vec, xswap_mask, vl);
+                acc_vec = RVV_OP(vadd_vv_u64m2)(acc_vec, data_swap, vl);
+            }
+            RVV_OP(vse64_v_u64m2)(xacc + vl * i, acc_vec, vl);
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(rvv)
+
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_rvv(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    XXH_ASSERT((((size_t)acc) & 63) == 0);
+    {
+        // Try to set vector lenght to 512 bits.
+        // If this length is unavailable, then maximum available will be used
+        size_t vl = RVV_OP(vsetvl_e64m2)(8);
+        uint64_t* const xacc = (uint64_t*) acc;
+        const uint64_t* const xsecret = (const uint64_t*) secret;
+
+        uint64_t prime[16] = {XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1,\
+                                XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1};
+        vuint64m2_t vprime = RVV_OP(vle64_v_u64m2)(prime, vl);
+
+        // vuint64m2_t is sizeless.
+        // But we can assume that vl can be only 4(vlen=128) or 8(vlen=256,512)
+        for(size_t i = 0; i < XXH_STRIPE_LEN/(8 * vl); i++){
+            /* xacc[i] ^= (xacc[i] >> 47) */
+            vuint64m2_t acc_vec = RVV_OP(vle64_v_u64m2)(xacc + vl * i, vl);
+            vuint64m2_t shifted = RVV_OP(vsrl_vx_u64m2)(acc_vec, 47, vl);
+            vuint64m2_t data_vec = RVV_OP(vxor_vv_u64m2)(acc_vec, shifted, vl);
+            /* xacc[i] ^= xsecret[i]; */
+            vuint64m2_t key_vec = RVV_OP(vreinterpret_v_u8m2_u64m2)(RVV_OP(vle8_v_u8m2)((const uint8_t*)(xsecret + vl * i), vl * 8));
+            vuint64m2_t data_key = RVV_OP(vxor_vv_u64m2)(data_vec, key_vec, vl);
+
+            /* xacc[i] *= XXH_PRIME32_1; */
+            vuint64m2_t prod_even = RVV_OP(vmul_vv_u64m2)(RVV_OP(vand_vx_u64m2)(data_key, 0xffffffff, vl), vprime, vl);
+            vuint64m2_t prod_odd = RVV_OP(vmul_vv_u64m2)(RVV_OP(vsrl_vx_u64m2)(data_key, 32, vl), vprime, vl);
+            vuint64m2_t prod = RVV_OP(vadd_vv_u64m2)(prod_even, RVV_OP(vsll_vx_u64m2)(prod_odd, 32, vl), vl);
+            RVV_OP(vse64_v_u64m2)(xacc + vl * i, prod, vl);
+        }
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_rvv(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0);
+    XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64);
+    XXH_ASSERT(((size_t)customSecret & 63) == 0);
+    {
+        uint64_t* const xcustomSecret = (uint64_t*)customSecret;
+
+        (void)(&XXH_writeLE64);
+        {
+            // Calculate the number of 64-bit elements in the `XXH3_kSecret` secret
+            size_t XXH3_kSecret_64b_len = XXH_SECRET_DEFAULT_SIZE / 8;
+            // Create an array of repeated seed values, alternating between seed64 and -seed64.
+            uint64_t seed_pos[16] = {seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64), \
+                                    seed64, (uint64_t)(-(int64_t)seed64)};
+            // Cast the default secret to a signed 64-bit pointer for vectorized access
+            const int64_t* const xXXH3_kSecret = (const int64_t*)((const void*)XXH3_kSecret);
+            size_t vl = 0;
+            for (size_t i=0; i < XXH3_kSecret_64b_len; i += vl) {
+
+                vl = RVV_OP(vsetvl_e64m2)(XXH3_kSecret_64b_len - i);
+                {
+                    vint64m2_t seed = RVV_OP(vle64_v_i64m2)((int64_t*)seed_pos, vl);
+                    vint64m2_t src = RVV_OP(vle64_v_i64m2)((const int64_t*)&xXXH3_kSecret[i], vl);
+                    vint64m2_t res = RVV_OP(vadd_vv_i64m2)(src, seed, vl);
+                    RVV_OP(vse64_v_i64m2)((int64_t*)&xcustomSecret[i], res, vl);
+                }
+            }
+        }
+    }
+}
+#endif
+
+
+/* scalar variants - universal */
+
+#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__))
+/*
+ * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they
+ * emit an excess mask and a full 64-bit multiply-add (MADD X-form).
+ *
+ * While this might not seem like much, as AArch64 is a 64-bit architecture, only
+ * big Cortex designs have a full 64-bit multiplier.
+ *
+ * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit
+ * multiplies expand to 2-3 multiplies in microcode. This has a major penalty
+ * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline.
+ *
+ * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does
+ * not have this penalty and does the mask automatically.
+ */
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    xxh_u64 ret;
+    /* note: %x = 64-bit register, %w = 32-bit register */
+    __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc));
+    return ret;
+}
+#else
+XXH_FORCE_INLINE xxh_u64
+XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc)
+{
+    return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc;
+}
+#endif
+
+/*!
+ * @internal
+ * @brief Scalar round for @ref XXH3_accumulate_512_scalar().
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarRound(void* XXH_RESTRICT acc,
+                 void const* XXH_RESTRICT input,
+                 void const* XXH_RESTRICT secret,
+                 size_t lane)
+{
+    xxh_u64* xacc = (xxh_u64*) acc;
+    xxh_u8 const* xinput  = (xxh_u8 const*) input;
+    xxh_u8 const* xsecret = (xxh_u8 const*) secret;
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0);
+    {
+        xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8);
+        xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8);
+        xacc[lane ^ 1] += data_val; /* swap adjacent lanes */
+        xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]);
+    }
+}
+
+/*!
+ * @internal
+ * @brief Processes a 64 byte block of data using the scalar path.
+ */
+XXH_FORCE_INLINE void
+XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc,
+                     const void* XXH_RESTRICT input,
+                     const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */
+#if defined(__GNUC__) && !defined(__clang__) \
+  && (defined(__arm__) || defined(__thumb2__)) \
+  && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \
+  && XXH_SIZE_OPT <= 0
+#  pragma GCC unroll 8
+#endif
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarRound(acc, input, secret, i);
+    }
+}
+XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar)
+
+/*!
+ * @internal
+ * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar().
+ *
+ * This is extracted to its own function because the NEON path uses a combination
+ * of NEON and scalar.
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__GNUC__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB      STR
+     *          STR
+     * By forcing loads from memory (as the asm line causes the compiler to assume
+     * that XXH3_kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *      LDR
+     *  ADD LDR
+     *  SUB     STR
+     *          STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
+             */
+            xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i)     + seed64;
+            xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64;
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i,     lo);
+            XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi);
+    }   }
+}
+
+
+typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t);
+typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*);
+typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
+
+
+#if (XXH_VECTOR == XXH_AVX512)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx512
+#define XXH3_accumulate     XXH3_accumulate_avx512
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx512
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512
+
+#elif (XXH_VECTOR == XXH_AVX2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_avx2
+#define XXH3_accumulate     XXH3_accumulate_avx2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_avx2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2
+
+#elif (XXH_VECTOR == XXH_SSE2)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_sse2
+#define XXH3_accumulate     XXH3_accumulate_sse2
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_sse2
+#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2
+
+#elif (XXH_VECTOR == XXH_NEON)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_neon
+#define XXH3_accumulate     XXH3_accumulate_neon
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_neon
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_VSX)
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_vsx
+#define XXH3_accumulate     XXH3_accumulate_vsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_vsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_SVE)
+#define XXH3_accumulate_512 XXH3_accumulate_512_sve
+#define XXH3_accumulate     XXH3_accumulate_sve
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_LASX)
+#define XXH3_accumulate_512 XXH3_accumulate_512_lasx
+#define XXH3_accumulate     XXH3_accumulate_lasx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_lasx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_LSX)
+#define XXH3_accumulate_512 XXH3_accumulate_512_lsx
+#define XXH3_accumulate     XXH3_accumulate_lsx
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_lsx
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#elif (XXH_VECTOR == XXH_RVV)
+#define XXH3_accumulate_512 XXH3_accumulate_512_rvv
+#define XXH3_accumulate     XXH3_accumulate_rvv
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_rvv
+#define XXH3_initCustomSecret XXH3_initCustomSecret_rvv
+
+#else /* scalar */
+
+#define XXH3_accumulate_512 XXH3_accumulate_512_scalar
+#define XXH3_accumulate     XXH3_accumulate_scalar
+#define XXH3_scrambleAcc    XXH3_scrambleAcc_scalar
+#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+
+#endif
+
+#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */
+#  undef XXH3_initCustomSecret
+#  define XXH3_initCustomSecret XXH3_initCustomSecret_scalar
+#endif
+
+XXH_FORCE_INLINE void
+XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc,
+                      const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE;
+    size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock;
+    size_t const nb_blocks = (len - 1) / block_len;
+
+    size_t n;
+
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+
+    for (n = 0; n < nb_blocks; n++) {
+        f_acc(acc, input + n*block_len, secret, nbStripesPerBlock);
+        f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN);
+    }
+
+    /* last partial block */
+    XXH_ASSERT(len > XXH_STRIPE_LEN);
+    {   size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN;
+        XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE));
+        f_acc(acc, input + nb_blocks*block_len, secret, nbStripes);
+
+        /* last stripe */
+        {   const xxh_u8* const p = input + len - XXH_STRIPE_LEN;
+#define XXH_SECRET_LASTACC_START 7  /* not aligned on 8, last secret is different from acc & scrambler */
+            XXH3_accumulate_512(acc, p, secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    }   }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH_PUREF XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+/* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+
+static XXH_PUREF XXH64_hash_t
+XXH3_finalizeLong_64b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 len)
+{
+    return XXH3_mergeAccs(acc, secret + XXH_SECRET_MERGEACCS_START, len * XXH_PRIME64_1);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_finalizeLong_64b(acc, (const xxh_u8*)secret, (xxh_u64)len);
+}
+
+/*
+ * It's important for performance to transmit secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, easier to the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of vector loop.
+ */
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len,
+                          XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * XXH3_hashLong_64b_withSeed():
+ * Generate a custom key based on alteration of default XXH3_kSecret with the seed,
+ * and then use this key for long mode hashing.
+ *
+ * This operation is decently fast but nonetheless costs a little bit of time.
+ * Try to avoid it whenever possible (typically when seed==0).
+ *
+ * It's important for performance that XXH3_hashLong is not inlined. Not sure
+ * why (uop cache maybe?), but the difference is large and easily measurable.
+ */
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len,
+                                    XXH64_hash_t seed,
+                                    XXH3_f_accumulate f_acc,
+                                    XXH3_f_scrambleAcc f_scramble,
+                                    XXH3_f_initCustomSecret f_initSec)
+{
+#if XXH_SIZE_OPT <= 0
+    if (seed == 0)
+        return XXH3_hashLong_64b_internal(input, len,
+                                          XXH3_kSecret, sizeof(XXH3_kSecret),
+                                          f_acc, f_scramble);
+#endif
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed);
+        return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret),
+                                          f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_64b_withSeed_internal(input, len, seed,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+
+typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t,
+                                          XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len,
+                     XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                     XXH3_hashLong64_f f_hashLong)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secretLen` condition is not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     * Also, note that function signature doesn't offer room to return an error.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen);
+}
+
+
+/* ===   Public entry point   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length)
+{
+    return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* ===   XXH3 streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * Malloc's a pointer that is always aligned to @align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or on 32-bit, the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually: Functions like posix_memalign or _mm_malloc
+ * are avoided: To maintain portability, we would have to write a fallback
+ * like this anyways, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_malloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocate an @ref XXH3_state_t.
+ *
+ * @return An allocated pointer of @ref XXH3_state_t on success.
+ * @return `NULL` on failure.
+ *
+ * @note Must be freed with XXH3_freeState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Frees an @ref XXH3_state_t.
+ *
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ *
+ * @return @ref XXH_OK.
+ *
+ * @note Must be allocated with XXH3_createState().
+ *
+ * @see @ref streaming_example "Streaming Example"
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    XXH_memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 */
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
+ *
+ * @param acc                Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr  In/out pointer to the number of leftover stripes in the block*
+ * @param nbStripesPerBlock  Number of stripes in a block
+ * @param input              Input pointer
+ * @param nbStripes          Number of stripes to process
+ * @param secret             Secret pointer
+ * @param secretLimit        Offset of the last block in @p secret
+ * @param f_acc              Pointer to an XXH3_accumulate implementation
+ * @param f_scramble         Pointer to an XXH3_scrambleAcc implementation
+ * @return                   Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/* This function accepts f_acc and f_scramble as function pointers,
+ * making it possible to implement multiple variants with different acc & scramble stages.
+ * This is notably useful to implement multiple vector variants with different intrinsics.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    state->totalLen += len;
+
+    /* small input : just fill in tmp buffer */
+    XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE);
+    if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) {
+        XXH_memcpy(state->buffer + state->bufferedSize, input, len);
+        state->bufferedSize += (XXH32_hash_t)len;
+        return XXH_OK;
+    }
+
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating accumulators directly into state.
+         * Operating into stack space seems to enable proper optimization.
+         * clang, on the other hand, doesn't seem to need this trick */
+        XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8];
+        XXH_memcpy(acc, state->acc, sizeof(acc));
+#else
+        xxh_u64* XXH_RESTRICT const acc = state->acc;
+#endif
+
+        /* total input is now > XXH3_INTERNALBUFFER_SIZE */
+        #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN)
+        XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0);   /* clean multiple */
+
+        /*
+         * Internal buffer is partially filled (always, except at beginning)
+         * Complete it, then consume it.
+         */
+        if (state->bufferedSize) {
+            size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize;
+            XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize);
+            input += loadSize;
+            XXH3_consumeStripes(acc,
+                               &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                state->buffer, XXH3_INTERNALBUFFER_STRIPES,
+                                secret, state->secretLimit,
+                                f_acc, f_scramble);
+            state->bufferedSize = 0;
+        }
+        XXH_ASSERT(input < bEnd);
+        if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) {
+            size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN;
+            input = XXH3_consumeStripes(acc,
+                                       &state->nbStripesSoFar, state->nbStripesPerBlock,
+                                       input, nbStripes,
+                                       secret, state->secretLimit,
+                                       f_acc, f_scramble);
+            XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN);
+
+        }
+        /* Some remaining input (always) : buffer it */
+        XXH_ASSERT(input < bEnd);
+        XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE);
+        XXH_ASSERT(state->bufferedSize == 0);
+        XXH_memcpy(state->buffer, input, (size_t)(bEnd-input));
+        state->bufferedSize = (XXH32_hash_t)(bEnd-input);
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* save stack accumulators into state */
+        XXH_memcpy(state->acc, acc, sizeof(acc));
+#endif
+    }
+
+    return XXH_OK;
+}
+
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ */
+XXH_NO_INLINE XXH_errorcode
+XXH3_update_regular(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update(state, (const xxh_u8*)input, len,
+                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update_regular(state, input, len);
+}
+
+
+XXH_FORCE_INLINE void
+XXH3_digest_long (XXH64_hash_t* acc,
+                  const XXH3_state_t* state,
+                  const unsigned char* secret)
+{
+    xxh_u8 lastStripe[XXH_STRIPE_LEN];
+    const xxh_u8* lastStripePtr;
+
+    /*
+     * Digest on a local copy. This way, the state remains unaltered, and it can
+     * continue ingesting more input afterwards.
+     */
+    XXH_memcpy(acc, state->acc, sizeof(state->acc));
+    if (state->bufferedSize >= XXH_STRIPE_LEN) {
+        /* Consume remaining stripes then point to remaining data in buffer */
+        size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN;
+        size_t nbStripesSoFar = state->nbStripesSoFar;
+        XXH3_consumeStripes(acc,
+                           &nbStripesSoFar, state->nbStripesPerBlock,
+                            state->buffer, nbStripes,
+                            secret, state->secretLimit,
+                            XXH3_accumulate, XXH3_scrambleAcc);
+        lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN;
+    } else {  /* bufferedSize < XXH_STRIPE_LEN */
+        /* Copy to temp buffer */
+        size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize;
+        XXH_ASSERT(state->bufferedSize > 0);  /* there is always some input buffered */
+        XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize);
+        XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize);
+        lastStripePtr = lastStripe;
+    }
+    /* Last stripe */
+    XXH3_accumulate_512(acc,
+                        lastStripePtr,
+                        secret + state->secretLimit - XXH_SECRET_LASTACC_START);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        return XXH3_finalizeLong_64b(acc, secret, (xxh_u64)state->totalLen);
+    }
+    /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */
+    if (state->useSeed)
+        return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                  secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+
+
+/* ==========================================
+ * XXH3 128 bits (a.k.a XXH128)
+ * ==========================================
+ * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant,
+ * even without counting the significantly larger output size.
+ *
+ * For example, extra steps are taken to avoid the seed-dependent collisions
+ * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B).
+ *
+ * This strength naturally comes at the cost of some speed, especially on short
+ * lengths. Note that longer hashes are about as fast as the 64-bit version
+ * due to it using only a slight modification of the 64-bit loop.
+ *
+ * XXH128 is also more oriented towards 64-bit machines. It is still extremely
+ * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64).
+ */
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    /* A doubled version of 1to3_64b with different constants. */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64  = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len to the left to ensure it is even, this avoids even multiplies. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= PRIME_MX2;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+             *
+             * Uses some properties of addition and multiplication to remove the mask:
+             *
+             * Let:
+             *    a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF)
+             *    b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000)
+             *    c = XXH_PRIME32_2
+             *
+             *    a + (b * c)
+             * Inverse Property: x + y - x == y
+             *    a + (b * (1 + c - 1))
+             * Distributive Property: x * (y + z) == (x * y) + (x * z)
+             *    a + (b * 1) + (b * (c - 1))
+             * Identity Property: x * 1 == x
+             *    a + b + (b * (c - 1))
+             *
+             * Substitute a, b, and c:
+             *    input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             *
+             * Since input_hi.hi + input_hi.lo == input_hi, we get this:
+             *    input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1))
+             */
+            m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1);
+        }
+        /* m128 ^= XXH_swap64(m128 >> 64); */
+        m128.low64  ^= XXH_swap64(m128.high64);
+
+        {   /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */
+            XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2);
+            h128.high64 += m128.high64 * XXH_PRIME64_2;
+
+            h128.low64   = XXH3_avalanche(h128.low64);
+            h128.high64  = XXH3_avalanche(h128.high64);
+            return h128;
+    }   }
+}
+
+/*
+ * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN
+ */
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(len <= 16);
+    {   if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed);
+        if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed);
+        if (len) return XXH3_len_1to3_128b(input, len, secret, seed);
+        {   XXH128_hash_t h128;
+            xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72);
+            xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88);
+            h128.low64 = XXH64_avalanche(seed ^ bitflipl);
+            h128.high64 = XXH64_avalanche( seed ^ bitfliph);
+            return h128;
+    }   }
+}
+
+/*
+ * A bit slower than XXH3_mix16B, but handles multiply by zero better.
+ */
+XXH_FORCE_INLINE XXH128_hash_t
+XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2,
+              const xxh_u8* secret, XXH64_hash_t seed)
+{
+    acc.low64  += XXH3_mix16B (input_1, secret+0, seed);
+    acc.low64  ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8);
+    acc.high64 += XXH3_mix16B (input_2, secret+16, seed);
+    acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8);
+    return acc;
+}
+
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   XXH128_hash_t acc;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+
+#if XXH_SIZE_OPT >= 1
+        {
+            /* Smaller, but slightly slower. */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         *  We set as `i` as offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple agen and good codegen
+         * for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input  + i - 32,
+                                input  + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32-bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+static XXH_PUREF XXH128_hash_t
+XXH3_finalizeLong_128b(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, size_t secretSize, xxh_u64 len)
+{
+    XXH128_hash_t h128;
+    h128.low64 = XXH3_finalizeLong_64b(acc, secret, len);
+    h128.high64 = XXH3_mergeAccs(acc, secret + secretSize
+                                             - XXH_STRIPE_LEN - XXH_SECRET_MERGEACCS_START,
+                                             ~(len * XXH_PRIME64_2));
+    return h128;
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_finalizeLong_128b(acc, secret, secretSize, (xxh_u64)len);
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
+ */
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len,
+                           XXH64_hash_t seed64,
+                           const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64; (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret),
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's important for performance to pass @p secretLen (when it's static)
+ * to the compiler, so that it can properly optimize the vectorized loop.
+ *
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                              XXH64_hash_t seed64,
+                              const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen,
+                                       XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len,
+                                XXH64_hash_t seed64,
+                                XXH3_f_accumulate f_acc,
+                                XXH3_f_scrambleAcc f_scramble,
+                                XXH3_f_initCustomSecret f_initSec)
+{
+    if (seed64 == 0)
+        return XXH3_hashLong_128b_internal(input, len,
+                                           XXH3_kSecret, sizeof(XXH3_kSecret),
+                                           f_acc, f_scramble);
+    {   XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+        f_initSec(secret, seed64);
+        return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret),
+                                           f_acc, f_scramble);
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong is not inlined.
+ */
+XXH_NO_INLINE XXH128_hash_t
+XXH3_hashLong_128b_withSeed(const void* input, size_t len,
+                            XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)secret; (void)secretLen;
+    return XXH3_hashLong_128b_withSeed_internal(input, len, seed64,
+                XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret);
+}
+
+typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t,
+                                            XXH64_hash_t, const void* XXH_RESTRICT, size_t);
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_128bits_internal(const void* input, size_t len,
+                      XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen,
+                      XXH3_hashLong128_f f_hl128)
+{
+    XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN);
+    /*
+     * If an action is to be taken if `secret` conditions are not respected,
+     * it should be done here.
+     * For now, it's a contract pre-condition.
+     * Adding a check and a branch here would cost performance at every hash.
+     */
+    if (len <= 16)
+        return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64);
+    if (len <= 128)
+        return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64);
+    return f_hl128(input, len, seed64, secret, secretLen);
+}
+
+
+/* ===   Public XXH128 API   === */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_default);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* ===   XXH3 128-bit streaming   === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_update_regular(state, input, len);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        return XXH3_finalizeLong_128b(acc, secret, state->secretLimit + XXH_STRIPE_LEN,  (xxh_u64)state->totalLen);
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->useSeed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+/* return : 1 is equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(XXH_memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * @return : >0 if *h128_1  > *h128_2
+ *           <0 if *h128_1  < *h128_2
+ *           =0 if *h128_1 == *h128_2  */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*======   Canonical representation   ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);
+    h.low64  = XXH_readBE64(src->digest + 8);
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize)
+{
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+#else
+    /* production mode, assert() are disabled */
+    if (secretBuffer == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+#endif
+
+    if (customSeedSize == 0) {
+        customSeed = XXH3_kSecret;
+        customSeedSize = XXH_SECRET_DEFAULT_SIZE;
+    }
+#if (XXH_DEBUGLEVEL >= 1)
+    XXH_ASSERT(customSeed != NULL);
+#else
+    if (customSeed == NULL) return XXH_ERROR;
+#endif
+
+    /* Fill secretBuffer with a copy of customSeed - repeat as needed */
+    {   size_t pos = 0;
+        while (pos < secretSize) {
+            size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize);
+            XXH_memcpy((char*)secretBuffer + pos, customSeed, toCopy);
+            pos += toCopy;
+    }   }
+
+    {   size_t const nbSeg16 = secretSize / 16;
+        size_t n;
+        XXH128_canonical_t scrambler;
+        XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0));
+        for (n=0; n<nbSeg16; n++) {
+            XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n);
+            XXH3_combine16((char*)secretBuffer + n*16, h128);
+        }
+        /* last segment */
+        XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler));
+    }
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed)
+{
+    XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE];
+    XXH3_initCustomSecret(secret, seed);
+    XXH_ASSERT(secretBuffer != NULL);
+    XXH_memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE);
+}
+
+
+
+/* Pop our optimization override from above */
+#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \
+  && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */
+#  pragma GCC pop_options
+#endif
+
+#endif  /* XXH_NO_LONG_LONG */
+
+#endif  /* XXH_NO_XXH3 */
+
+/*!
+ * @}
+ */
+#endif  /* XXH_IMPLEMENTATION */
+
+
+#if defined (__cplusplus) && !defined(XXH_NO_EXTERNC_GUARD)
+} /* extern "C" */
+#endif
diff --git a/pyarrow/include/arrow/visit_array_inline.h b/pyarrow/include/arrow/visit_array_inline.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb6ff49b6950d51f4e189488307f5f0037c2eef6
--- /dev/null
+++ b/pyarrow/include/arrow/visit_array_inline.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array.h"
+#include "arrow/extension_type.h"
+#include "arrow/visitor_generate.h"
+
+namespace arrow {
+
+#define ARRAY_VISIT_INLINE(TYPE_CLASS)                                                   \
+  case TYPE_CLASS##Type::type_id:                                                        \
+    return visitor->Visit(                                                               \
+        internal::checked_cast<const typename TypeTraits<TYPE_CLASS##Type>::ArrayType&>( \
+            array),                                                                      \
+        std::forward<ARGS>(args)...);
+
+/// \brief Apply the visitors Visit() method specialized to the array type
+///
+/// \tparam VISITOR Visitor type that implements Visit() for all array types.
+/// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after
+/// the `arr` argument
+/// \return Status
+///
+/// A visitor is a type that implements specialized logic for each Arrow type.
+/// Example usage:
+///
+/// ```
+/// class ExampleVisitor {
+///   arrow::Status Visit(arrow::NumericArray<Int32Type> arr) { ... }
+///   arrow::Status Visit(arrow::NumericArray<Int64Type> arr) { ... }
+///   ...
+/// }
+/// ExampleVisitor visitor;
+/// VisitArrayInline(some_array, &visitor);
+/// ```
+template <typename VISITOR, typename... ARGS>
+inline Status VisitArrayInline(const Array& array, VISITOR* visitor, ARGS&&... args) {
+  switch (array.type_id()) {
+    ARROW_GENERATE_FOR_ALL_TYPES(ARRAY_VISIT_INLINE);
+    default:
+      break;
+  }
+  return Status::NotImplemented("Type not implemented");
+}
+
+#undef ARRAY_VISIT_INLINE
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/visit_data_inline.h b/pyarrow/include/arrow/visit_data_inline.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fa557af2079d6c42e54a7b9fe20f7ffc19eb506
--- /dev/null
+++ b/pyarrow/include/arrow/visit_data_inline.h
@@ -0,0 +1,337 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string_view>
+
+#include "arrow/array.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/binary_view_util.h"
+#include "arrow/util/bit_block_counter.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/functional.h"
+
+namespace arrow {
+namespace internal {
+
+template <typename T, typename Enable = void>
+struct ArraySpanInlineVisitor {};
+
+// Numeric and primitive C-compatible types
+template <typename T>
+struct ArraySpanInlineVisitor<T, enable_if_has_c_type<T>> {
+  using c_type = typename T::c_type;
+
+  template <typename ValidFunc, typename NullFunc>
+  static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func,
+                            NullFunc&& null_func) {
+    if constexpr (std::is_same_v<T, BooleanType>) {
+      int64_t offset = arr.offset;
+      const uint8_t* data = arr.buffers[1].data;
+      return VisitBitBlocks(
+          arr.buffers[0].data, offset, arr.length,
+          [&](int64_t i) { return valid_func(bit_util::GetBit(data, offset + i)); },
+          std::forward<NullFunc>(null_func));
+    } else {
+      const c_type* data = arr.GetValues<c_type>(1);
+      auto visit_valid = [&](int64_t i) { return valid_func(data[i]); };
+      return VisitBitBlocks(arr.buffers[0].data, arr.offset, arr.length,
+                            std::move(visit_valid), std::forward<NullFunc>(null_func));
+    }
+  }
+
+  template <typename ValidFunc, typename NullFunc>
+  static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func,
+                        NullFunc&& null_func) {
+    if constexpr (std::is_same_v<T, BooleanType>) {
+      int64_t offset = arr.offset;
+      const uint8_t* data = arr.buffers[1].data;
+      VisitBitBlocksVoid(
+          arr.buffers[0].data, offset, arr.length,
+          [&](int64_t i) { valid_func(bit_util::GetBit(data, offset + i)); },
+          std::forward<NullFunc>(null_func));
+    } else {
+      const c_type* data = arr.GetValues<c_type>(1);
+      auto visit_valid = [&](int64_t i) { valid_func(data[i]); };
+      VisitBitBlocksVoid(arr.buffers[0].data, arr.offset, arr.length,
+                         std::move(visit_valid), std::forward<NullFunc>(null_func));
+    }
+  }
+};
+
+// Binary, String...
+template <typename T>
+struct ArraySpanInlineVisitor<T, enable_if_base_binary<T>> {
+  using c_type = std::string_view;
+
+  template <typename ValidFunc, typename NullFunc>
+  static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func,
+                            NullFunc&& null_func) {
+    using offset_type = typename T::offset_type;
+    constexpr char empty_value = 0;
+
+    if (arr.length == 0) {
+      return Status::OK();
+    }
+    const offset_type* offsets = arr.GetValues<offset_type>(1);
+    const char* data;
+    if (arr.buffers[2].data == NULLPTR) {
+      data = &empty_value;
+    } else {
+      // Do not apply the array offset to the values array; the value_offsets
+      // index the non-sliced values array.
+      data = arr.GetValues<char>(2, /*absolute_offset=*/0);
+    }
+    offset_type cur_offset = *offsets++;
+    return VisitBitBlocks(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t i) {
+          ARROW_UNUSED(i);
+          auto value = std::string_view(data + cur_offset, *offsets - cur_offset);
+          cur_offset = *offsets++;
+          return valid_func(value);
+        },
+        [&]() {
+          cur_offset = *offsets++;
+          return null_func();
+        });
+  }
+
+  template <typename ValidFunc, typename NullFunc>
+  static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func,
+                        NullFunc&& null_func) {
+    using offset_type = typename T::offset_type;
+    constexpr uint8_t empty_value = 0;
+
+    if (arr.length == 0) {
+      return;
+    }
+    const offset_type* offsets = arr.GetValues<offset_type>(1);
+    const uint8_t* data;
+    if (arr.buffers[2].data == NULLPTR) {
+      data = &empty_value;
+    } else {
+      // Do not apply the array offset to the values array; the value_offsets
+      // index the non-sliced values array.
+      data = arr.GetValues<uint8_t>(2, /*absolute_offset=*/0);
+    }
+
+    VisitBitBlocksVoid(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t i) {
+          auto value = std::string_view(reinterpret_cast<const char*>(data + offsets[i]),
+                                        offsets[i + 1] - offsets[i]);
+          valid_func(value);
+        },
+        std::forward<NullFunc>(null_func));
+  }
+};
+
+// BinaryView, StringView...
+template <typename T>
+struct ArraySpanInlineVisitor<T, enable_if_binary_view_like<T>> {
+  using c_type = std::string_view;
+
+  template <typename ValidFunc, typename NullFunc>
+  static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func,
+                            NullFunc&& null_func) {
+    if (arr.length == 0) {
+      return Status::OK();
+    }
+    auto* s = arr.GetValues<BinaryViewType::c_type>(1);
+    auto* data_buffers = arr.GetVariadicBuffers().data();
+    return VisitBitBlocks(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t index) {
+          return valid_func(util::FromBinaryView(s[index], data_buffers));
+        },
+        [&]() { return null_func(); });
+  }
+
+  template <typename ValidFunc, typename NullFunc>
+  static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func,
+                        NullFunc&& null_func) {
+    if (arr.length == 0) {
+      return;
+    }
+    auto* s = arr.GetValues<BinaryViewType::c_type>(1);
+    auto* data_buffers = arr.GetVariadicBuffers().data();
+    VisitBitBlocksVoid(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t index) { valid_func(util::FromBinaryView(s[index], data_buffers)); },
+        std::forward<NullFunc>(null_func));
+  }
+};
+
+// FixedSizeBinary, Decimal128
+template <typename T>
+struct ArraySpanInlineVisitor<T, enable_if_fixed_size_binary<T>> {
+  using c_type = std::string_view;
+
+  template <typename ValidFunc, typename NullFunc>
+  static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func,
+                            NullFunc&& null_func) {
+    const int32_t byte_width = arr.type->byte_width();
+    const char* data = arr.GetValues<char>(1,
+                                           /*absolute_offset=*/arr.offset * byte_width);
+    return VisitBitBlocks(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t i) {
+          auto value = std::string_view(data, byte_width);
+          data += byte_width;
+          return valid_func(value);
+        },
+        [&]() {
+          data += byte_width;
+          return null_func();
+        });
+  }
+
+  template <typename ValidFunc, typename NullFunc>
+  static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func,
+                        NullFunc&& null_func) {
+    const int32_t byte_width = arr.type->byte_width();
+    const char* data = arr.GetValues<char>(1,
+                                           /*absolute_offset=*/arr.offset * byte_width);
+    VisitBitBlocksVoid(
+        arr.buffers[0].data, arr.offset, arr.length,
+        [&](int64_t i) {
+          valid_func(std::string_view(data, byte_width));
+          data += byte_width;
+        },
+        [&]() {
+          data += byte_width;
+          null_func();
+        });
+  }
+};
+
+}  // namespace internal
+
+template <typename T, typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, Status>::type
+VisitArraySpanInline(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) {
+  return internal::ArraySpanInlineVisitor<T>::VisitStatus(
+      arr, std::forward<ValidFunc>(valid_func), std::forward<NullFunc>(null_func));
+}
+
+template <typename T, typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, void>::type
+VisitArraySpanInline(const ArraySpan& arr, ValidFunc&& valid_func, NullFunc&& null_func) {
+  return internal::ArraySpanInlineVisitor<T>::VisitVoid(
+      arr, std::forward<ValidFunc>(valid_func), std::forward<NullFunc>(null_func));
+}
+
+// Visit an array's data values, in order, without overhead.
+//
+// The Visit method's `visitor` argument should be an object with two public methods:
+// - Status VisitNull()
+// - Status VisitValue(<scalar>)
+//
+// The scalar value's type depends on the array data type:
+// - the type's `c_type`, if any
+// - for boolean arrays, a `bool`
+// - for binary, string, large binary and string, binary and string view, and fixed-size
+//   binary arrays, a `std::string_view`
+
+template <typename T>
+struct ArraySpanVisitor {
+  using InlineVisitorType = internal::ArraySpanInlineVisitor<T>;
+  using c_type = typename InlineVisitorType::c_type;
+
+  template <typename Visitor>
+  static Status Visit(const ArraySpan& arr, Visitor* visitor) {
+    return InlineVisitorType::VisitStatus(
+        arr, [visitor](c_type v) { return visitor->VisitValue(v); },
+        [visitor]() { return visitor->VisitNull(); });
+  }
+};
+
+// Visit a null bitmap, in order, without overhead.
+//
+// The given `ValidFunc` should be a callable with either of these signatures:
+// - void()
+// - Status()
+//
+// The `NullFunc` should have the same return type as `ValidFunc`.
+
+template <typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, Status>::type
+VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset,
+                      int64_t num_values, int64_t null_count, ValidFunc&& valid_func,
+                      NullFunc&& null_func) {
+  internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits,
+                                                valid_bits_offset, num_values);
+  int64_t position = 0;
+  int64_t offset_position = valid_bits_offset;
+  while (position < num_values) {
+    internal::BitBlockCount block = bit_counter.NextBlock();
+    if (block.AllSet()) {
+      for (int64_t i = 0; i < block.length; ++i) {
+        ARROW_RETURN_NOT_OK(valid_func());
+      }
+    } else if (block.NoneSet()) {
+      for (int64_t i = 0; i < block.length; ++i) {
+        ARROW_RETURN_NOT_OK(null_func());
+      }
+    } else {
+      for (int64_t i = 0; i < block.length; ++i) {
+        ARROW_RETURN_NOT_OK(bit_util::GetBit(valid_bits, offset_position + i)
+                                ? valid_func()
+                                : null_func());
+      }
+    }
+    position += block.length;
+    offset_position += block.length;
+  }
+  return Status::OK();
+}
+
+template <typename ValidFunc, typename NullFunc>
+typename internal::call_traits::enable_if_return<ValidFunc, void>::type
+VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset,
+                      int64_t num_values, int64_t null_count, ValidFunc&& valid_func,
+                      NullFunc&& null_func) {
+  internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits,
+                                                valid_bits_offset, num_values);
+  int64_t position = 0;
+  int64_t offset_position = valid_bits_offset;
+  while (position < num_values) {
+    internal::BitBlockCount block = bit_counter.NextBlock();
+    if (block.AllSet()) {
+      for (int64_t i = 0; i < block.length; ++i) {
+        valid_func();
+      }
+    } else if (block.NoneSet()) {
+      for (int64_t i = 0; i < block.length; ++i) {
+        null_func();
+      }
+    } else {
+      for (int64_t i = 0; i < block.length; ++i) {
+        bit_util::GetBit(valid_bits, offset_position + i) ? valid_func() : null_func();
+      }
+    }
+    position += block.length;
+    offset_position += block.length;
+  }
+}
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/visit_scalar_inline.h b/pyarrow/include/arrow/visit_scalar_inline.h
new file mode 100644
index 0000000000000000000000000000000000000000..85357f288c63b328f93afb1ee419775caac53b2c
--- /dev/null
+++ b/pyarrow/include/arrow/visit_scalar_inline.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Private header, not to be exported
+
+#pragma once
+
+#include <utility>
+
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/visitor_generate.h"
+
+namespace arrow {
+
+#define SCALAR_VISIT_INLINE(TYPE_CLASS)                                              \
+  case TYPE_CLASS##Type::type_id:                                                    \
+    return visitor->Visit(internal::checked_cast<const TYPE_CLASS##Scalar&>(scalar), \
+                          std::forward<ARGS>(args)...);
+
+/// \brief Apply the visitors Visit() method specialized to the scalar type
+///
+/// \tparam VISITOR Visitor type that implements Visit() for all scalar types.
+/// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after
+/// the `scalar` argument
+/// \return Status
+///
+/// A visitor is a type that implements specialized logic for each Arrow type.
+/// Example usage:
+///
+/// ```
+/// class ExampleVisitor {
+///   arrow::Status Visit(arrow::Int32Scalar scalar) { ... }
+///   arrow::Status Visit(arrow::Int64Scalar scalar) { ... }
+///   ...
+/// }
+/// ExampleVisitor visitor;
+/// VisitScalarInline(some_scalar, &visitor);
+/// ```
+template <typename VISITOR, typename... ARGS>
+inline Status VisitScalarInline(const Scalar& scalar, VISITOR* visitor, ARGS&&... args) {
+  switch (scalar.type->id()) {
+    ARROW_GENERATE_FOR_ALL_TYPES(SCALAR_VISIT_INLINE);
+    default:
+      break;
+  }
+  return Status::NotImplemented("Scalar visitor for type not implemented ",
+                                scalar.type->ToString());
+}
+
+#undef SCALAR_VISIT_INLINE
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/visit_type_inline.h b/pyarrow/include/arrow/visit_type_inline.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f5bb5416218f3cc786a375b68b9e9b979ff117
--- /dev/null
+++ b/pyarrow/include/arrow/visit_type_inline.h
@@ -0,0 +1,117 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/extension_type.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/unreachable.h"
+#include "arrow/visitor_generate.h"
+
+namespace arrow {
+
+#define TYPE_VISIT_INLINE(TYPE_CLASS)                                            \
+  case TYPE_CLASS##Type::type_id:                                                \
+    return visitor->Visit(internal::checked_cast<const TYPE_CLASS##Type&>(type), \
+                          std::forward<ARGS>(args)...);
+
+/// \brief Calls `visitor` with the corresponding concrete type class
+///
+/// \tparam VISITOR Visitor type that implements Visit() for all Arrow types.
+/// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after
+/// the `type` argument
+/// \return Status
+///
+/// A visitor is a type that implements specialized logic for each Arrow type.
+/// Example usage:
+///
+/// ```
+/// class ExampleVisitor {
+///   arrow::Status Visit(const arrow::Int32Type& type) { ... }
+///   arrow::Status Visit(const arrow::Int64Type& type) { ... }
+///   ...
+/// }
+/// ExampleVisitor visitor;
+/// VisitTypeInline(some_type, &visitor);
+/// ```
+template <typename VISITOR, typename... ARGS>
+inline Status VisitTypeInline(const DataType& type, VISITOR* visitor, ARGS&&... args) {
+  switch (type.id()) {
+    ARROW_GENERATE_FOR_ALL_TYPES(TYPE_VISIT_INLINE);
+    default:
+      break;
+  }
+  return Status::NotImplemented("Type not implemented");
+}
+
+#undef TYPE_VISIT_INLINE
+
+#define TYPE_VISIT_INLINE(TYPE_CLASS)                          \
+  case TYPE_CLASS##Type::type_id:                              \
+    return std::forward<VISITOR>(visitor)(                     \
+        internal::checked_cast<const TYPE_CLASS##Type&>(type), \
+        std::forward<ARGS>(args)...);
+
+/// \brief Call `visitor` with the corresponding concrete type class
+/// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after
+/// the `type` argument
+///
+/// Unlike VisitTypeInline which calls `visitor.Visit`, here `visitor`
+/// itself is called.
+/// `visitor` must support a `const DataType&` argument as a fallback,
+/// in addition to concrete type classes.
+///
+/// The intent is for this to be called on a generic lambda
+/// that may internally use `if constexpr` or similar constructs.
+template <typename VISITOR, typename... ARGS>
+inline auto VisitType(const DataType& type, VISITOR&& visitor, ARGS&&... args)
+    -> decltype(std::forward<VISITOR>(visitor)(type, args...)) {
+  switch (type.id()) {
+    ARROW_GENERATE_FOR_ALL_TYPES(TYPE_VISIT_INLINE);
+    default:
+      Unreachable("Type not implemented");
+  }
+}
+
+#undef TYPE_VISIT_INLINE
+
+#define TYPE_ID_VISIT_INLINE(TYPE_CLASS)                              \
+  case TYPE_CLASS##Type::type_id: {                                   \
+    const TYPE_CLASS##Type* concrete_ptr = NULLPTR;                   \
+    return visitor->Visit(concrete_ptr, std::forward<ARGS>(args)...); \
+  }
+
+/// \brief Calls `visitor` with a nullptr of the corresponding concrete type class
+///
+/// \tparam VISITOR Visitor type that implements Visit() for all Arrow types.
+/// \tparam ARGS Additional arguments, if any, will be passed to the Visit function after
+/// the `type` argument
+/// \return Status
+template <typename VISITOR, typename... ARGS>
+inline Status VisitTypeIdInline(Type::type id, VISITOR* visitor, ARGS&&... args) {
+  switch (id) {
+    ARROW_GENERATE_FOR_ALL_TYPES(TYPE_ID_VISIT_INLINE);
+    default:
+      break;
+  }
+  return Status::NotImplemented("Type not implemented");
+}
+
+#undef TYPE_ID_VISIT_INLINE
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/visitor.h b/pyarrow/include/arrow/visitor.h
new file mode 100644
index 0000000000000000000000000000000000000000..87f23b2bbe8007fc36fd01b0ff5dfdc27ef7366e
--- /dev/null
+++ b/pyarrow/include/arrow/visitor.h
@@ -0,0 +1,191 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/status.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Abstract array visitor class
+///
+/// Subclass this to create a visitor that can be used with the Array::Accept()
+/// method.
+class ARROW_EXPORT ArrayVisitor {
+ public:
+  virtual ~ArrayVisitor() = default;
+
+  virtual Status Visit(const NullArray& array);
+  virtual Status Visit(const BooleanArray& array);
+  virtual Status Visit(const Int8Array& array);
+  virtual Status Visit(const Int16Array& array);
+  virtual Status Visit(const Int32Array& array);
+  virtual Status Visit(const Int64Array& array);
+  virtual Status Visit(const UInt8Array& array);
+  virtual Status Visit(const UInt16Array& array);
+  virtual Status Visit(const UInt32Array& array);
+  virtual Status Visit(const UInt64Array& array);
+  virtual Status Visit(const HalfFloatArray& array);
+  virtual Status Visit(const FloatArray& array);
+  virtual Status Visit(const DoubleArray& array);
+  virtual Status Visit(const StringArray& array);
+  virtual Status Visit(const StringViewArray& array);
+  virtual Status Visit(const BinaryArray& array);
+  virtual Status Visit(const BinaryViewArray& array);
+  virtual Status Visit(const LargeStringArray& array);
+  virtual Status Visit(const LargeBinaryArray& array);
+  virtual Status Visit(const FixedSizeBinaryArray& array);
+  virtual Status Visit(const Date32Array& array);
+  virtual Status Visit(const Date64Array& array);
+  virtual Status Visit(const Time32Array& array);
+  virtual Status Visit(const Time64Array& array);
+  virtual Status Visit(const TimestampArray& array);
+  virtual Status Visit(const DayTimeIntervalArray& array);
+  virtual Status Visit(const MonthDayNanoIntervalArray& array);
+  virtual Status Visit(const MonthIntervalArray& array);
+  virtual Status Visit(const DurationArray& array);
+  virtual Status Visit(const Decimal32Array& array);
+  virtual Status Visit(const Decimal64Array& array);
+  virtual Status Visit(const Decimal128Array& array);
+  virtual Status Visit(const Decimal256Array& array);
+  virtual Status Visit(const ListArray& array);
+  virtual Status Visit(const LargeListArray& array);
+  virtual Status Visit(const ListViewArray& array);
+  virtual Status Visit(const LargeListViewArray& array);
+  virtual Status Visit(const MapArray& array);
+  virtual Status Visit(const FixedSizeListArray& array);
+  virtual Status Visit(const StructArray& array);
+  virtual Status Visit(const SparseUnionArray& array);
+  virtual Status Visit(const DenseUnionArray& array);
+  virtual Status Visit(const DictionaryArray& array);
+  virtual Status Visit(const RunEndEncodedArray& array);
+  virtual Status Visit(const ExtensionArray& array);
+};
+
+/// \brief Abstract type visitor class
+///
+/// Subclass this to create a visitor that can be used with the DataType::Accept()
+/// method.
+class ARROW_EXPORT TypeVisitor {
+ public:
+  virtual ~TypeVisitor() = default;
+
+  virtual Status Visit(const NullType& type);
+  virtual Status Visit(const BooleanType& type);
+  virtual Status Visit(const Int8Type& type);
+  virtual Status Visit(const Int16Type& type);
+  virtual Status Visit(const Int32Type& type);
+  virtual Status Visit(const Int64Type& type);
+  virtual Status Visit(const UInt8Type& type);
+  virtual Status Visit(const UInt16Type& type);
+  virtual Status Visit(const UInt32Type& type);
+  virtual Status Visit(const UInt64Type& type);
+  virtual Status Visit(const HalfFloatType& type);
+  virtual Status Visit(const FloatType& type);
+  virtual Status Visit(const DoubleType& type);
+  virtual Status Visit(const StringType& type);
+  virtual Status Visit(const StringViewType& type);
+  virtual Status Visit(const BinaryType& type);
+  virtual Status Visit(const BinaryViewType& type);
+  virtual Status Visit(const LargeStringType& type);
+  virtual Status Visit(const LargeBinaryType& type);
+  virtual Status Visit(const FixedSizeBinaryType& type);
+  virtual Status Visit(const Date64Type& type);
+  virtual Status Visit(const Date32Type& type);
+  virtual Status Visit(const Time32Type& type);
+  virtual Status Visit(const Time64Type& type);
+  virtual Status Visit(const TimestampType& type);
+  virtual Status Visit(const MonthDayNanoIntervalType& type);
+  virtual Status Visit(const MonthIntervalType& type);
+  virtual Status Visit(const DayTimeIntervalType& type);
+  virtual Status Visit(const DurationType& type);
+  virtual Status Visit(const Decimal32Type& type);
+  virtual Status Visit(const Decimal64Type& type);
+  virtual Status Visit(const Decimal128Type& type);
+  virtual Status Visit(const Decimal256Type& type);
+  virtual Status Visit(const ListType& type);
+  virtual Status Visit(const LargeListType& type);
+  virtual Status Visit(const ListViewType& scalar);
+  virtual Status Visit(const LargeListViewType& scalar);
+  virtual Status Visit(const MapType& type);
+  virtual Status Visit(const FixedSizeListType& type);
+  virtual Status Visit(const StructType& type);
+  virtual Status Visit(const SparseUnionType& type);
+  virtual Status Visit(const DenseUnionType& type);
+  virtual Status Visit(const DictionaryType& type);
+  virtual Status Visit(const RunEndEncodedType& type);
+  virtual Status Visit(const ExtensionType& type);
+};
+
+/// \brief Abstract scalar visitor class
+///
+/// Subclass this to create a visitor that can be used with the Scalar::Accept()
+/// method.
+class ARROW_EXPORT ScalarVisitor {
+ public:
+  virtual ~ScalarVisitor() = default;
+
+  virtual Status Visit(const NullScalar& scalar);
+  virtual Status Visit(const BooleanScalar& scalar);
+  virtual Status Visit(const Int8Scalar& scalar);
+  virtual Status Visit(const Int16Scalar& scalar);
+  virtual Status Visit(const Int32Scalar& scalar);
+  virtual Status Visit(const Int64Scalar& scalar);
+  virtual Status Visit(const UInt8Scalar& scalar);
+  virtual Status Visit(const UInt16Scalar& scalar);
+  virtual Status Visit(const UInt32Scalar& scalar);
+  virtual Status Visit(const UInt64Scalar& scalar);
+  virtual Status Visit(const HalfFloatScalar& scalar);
+  virtual Status Visit(const FloatScalar& scalar);
+  virtual Status Visit(const DoubleScalar& scalar);
+  virtual Status Visit(const StringScalar& scalar);
+  virtual Status Visit(const StringViewScalar& scalar);
+  virtual Status Visit(const BinaryScalar& scalar);
+  virtual Status Visit(const BinaryViewScalar& scalar);
+  virtual Status Visit(const LargeStringScalar& scalar);
+  virtual Status Visit(const LargeBinaryScalar& scalar);
+  virtual Status Visit(const FixedSizeBinaryScalar& scalar);
+  virtual Status Visit(const Date64Scalar& scalar);
+  virtual Status Visit(const Date32Scalar& scalar);
+  virtual Status Visit(const Time32Scalar& scalar);
+  virtual Status Visit(const Time64Scalar& scalar);
+  virtual Status Visit(const TimestampScalar& scalar);
+  virtual Status Visit(const DayTimeIntervalScalar& scalar);
+  virtual Status Visit(const MonthDayNanoIntervalScalar& type);
+  virtual Status Visit(const MonthIntervalScalar& scalar);
+  virtual Status Visit(const DurationScalar& scalar);
+  virtual Status Visit(const Decimal32Scalar& scalar);
+  virtual Status Visit(const Decimal64Scalar& scalar);
+  virtual Status Visit(const Decimal128Scalar& scalar);
+  virtual Status Visit(const Decimal256Scalar& scalar);
+  virtual Status Visit(const ListScalar& scalar);
+  virtual Status Visit(const LargeListScalar& scalar);
+  virtual Status Visit(const ListViewScalar& scalar);
+  virtual Status Visit(const LargeListViewScalar& scalar);
+  virtual Status Visit(const MapScalar& scalar);
+  virtual Status Visit(const FixedSizeListScalar& scalar);
+  virtual Status Visit(const StructScalar& scalar);
+  virtual Status Visit(const DictionaryScalar& scalar);
+  virtual Status Visit(const SparseUnionScalar& scalar);
+  virtual Status Visit(const DenseUnionScalar& scalar);
+  virtual Status Visit(const RunEndEncodedScalar& scalar);
+  virtual Status Visit(const ExtensionScalar& scalar);
+};
+
+}  // namespace arrow
diff --git a/pyarrow/include/arrow/visitor_generate.h b/pyarrow/include/arrow/visitor_generate.h
new file mode 100644
index 0000000000000000000000000000000000000000..a87a97764845dc5af5287a59d8003d81625cc5c5
--- /dev/null
+++ b/pyarrow/include/arrow/visitor_generate.h
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace arrow {
+
+#define ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION) \
+  ACTION(Int8);                                      \
+  ACTION(UInt8);                                     \
+  ACTION(Int16);                                     \
+  ACTION(UInt16);                                    \
+  ACTION(Int32);                                     \
+  ACTION(UInt32);                                    \
+  ACTION(Int64);                                     \
+  ACTION(UInt64)
+
+#define ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION) \
+  ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(ACTION);      \
+  ACTION(HalfFloat);                                 \
+  ACTION(Float);                                     \
+  ACTION(Double)
+
+#define ARROW_GENERATE_FOR_ALL_TYPES(ACTION)    \
+  ACTION(Null);                                 \
+  ACTION(Boolean);                              \
+  ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \
+  ACTION(String);                               \
+  ACTION(StringView);                           \
+  ACTION(Binary);                               \
+  ACTION(BinaryView);                           \
+  ACTION(LargeString);                          \
+  ACTION(LargeBinary);                          \
+  ACTION(FixedSizeBinary);                      \
+  ACTION(Duration);                             \
+  ACTION(Date32);                               \
+  ACTION(Date64);                               \
+  ACTION(Timestamp);                            \
+  ACTION(Time32);                               \
+  ACTION(Time64);                               \
+  ACTION(MonthDayNanoInterval);                 \
+  ACTION(MonthInterval);                        \
+  ACTION(DayTimeInterval);                      \
+  ACTION(Decimal32);                            \
+  ACTION(Decimal64);                            \
+  ACTION(Decimal128);                           \
+  ACTION(Decimal256);                           \
+  ACTION(List);                                 \
+  ACTION(LargeList);                            \
+  ACTION(ListView);                             \
+  ACTION(LargeListView);                        \
+  ACTION(Map);                                  \
+  ACTION(FixedSizeList);                        \
+  ACTION(Struct);                               \
+  ACTION(SparseUnion);                          \
+  ACTION(DenseUnion);                           \
+  ACTION(Dictionary);                           \
+  ACTION(RunEndEncoded);                        \
+  ACTION(Extension)
+
+}  // namespace arrow
diff --git a/pyarrow/include/parquet/api/io.h b/pyarrow/include/parquet/api/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..28a00f12a7a616136beb328d20120d6458294eab
--- /dev/null
+++ b/pyarrow/include/parquet/api/io.h
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "parquet/exception.h"
diff --git a/pyarrow/include/parquet/api/reader.h b/pyarrow/include/parquet/api/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e746e8c5bbf551e84431552f688a493e2d62bc4
--- /dev/null
+++ b/pyarrow/include/parquet/api/reader.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// Column reader API
+#include "parquet/column_reader.h"
+#include "parquet/column_scanner.h"
+#include "parquet/exception.h"
+#include "parquet/file_reader.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/printer.h"
+#include "parquet/properties.h"
+#include "parquet/statistics.h"
+
+// Schemas
+#include "parquet/api/schema.h"
+
+// IO
+#include "parquet/api/io.h"
diff --git a/pyarrow/include/parquet/api/schema.h b/pyarrow/include/parquet/api/schema.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ca714f47b5448974c460e424ab3821d10f7a384
--- /dev/null
+++ b/pyarrow/include/parquet/api/schema.h
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+// Schemas
+#include "parquet/schema.h"
diff --git a/pyarrow/include/parquet/api/writer.h b/pyarrow/include/parquet/api/writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b072dcf74dea7233723ae55599d95be47c674716
--- /dev/null
+++ b/pyarrow/include/parquet/api/writer.h
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "parquet/api/io.h"
+#include "parquet/api/schema.h"
+#include "parquet/column_writer.h"
+#include "parquet/exception.h"
+#include "parquet/file_writer.h"
+#include "parquet/statistics.h"
diff --git a/pyarrow/include/parquet/arrow/reader.h b/pyarrow/include/parquet/arrow/reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..54620b3d0f564bb3e6680193c61a8fa843f38f63
--- /dev/null
+++ b/pyarrow/include/parquet/arrow/reader.h
@@ -0,0 +1,392 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+// N.B. we don't include async_generator.h as it's relatively heavy
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "parquet/file_reader.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class ChunkedArray;
+class KeyValueMetadata;
+class RecordBatchReader;
+struct Scalar;
+class Schema;
+class Table;
+class RecordBatch;
+
+}  // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class SchemaDescriptor;
+
+namespace arrow {
+
+class ColumnChunkReader;
+class ColumnReader;
+struct SchemaManifest;
+class RowGroupReader;
+
+/// \brief Arrow read adapter class for deserializing Parquet files as Arrow row batches.
+///
+/// This interfaces caters for different use cases and thus provides different
+/// interfaces. In its most simplistic form, we cater for a user that wants to
+/// read the whole Parquet at once with the `FileReader::ReadTable` method.
+///
+/// More advanced users that also want to implement parallelism on top of each
+/// single Parquet files should do this on the RowGroup level. For this, they can
+/// call `FileReader::RowGroup(i)->ReadTable` to receive only the specified
+/// RowGroup as a table.
+///
+/// In the most advanced situation, where a consumer wants to independently read
+/// RowGroups in parallel and consume each column individually, they can call
+/// `FileReader::RowGroup(i)->Column(j)->Read` and receive an `arrow::Column`
+/// instance.
+///
+/// Finally, one can also get a stream of record batches using
+/// `FileReader::GetRecordBatchReader()`. This can internally decode columns
+/// in parallel if use_threads was enabled in the ArrowReaderProperties.
+///
+/// The parquet format supports an optional integer field_id which can be assigned
+/// to a field.  Arrow will convert these field IDs to a metadata key named
+/// PARQUET:field_id on the appropriate field.
+// TODO(wesm): nested data does not always make sense with this user
+// interface unless you are only reading a single leaf node from a branch of
+// a table. For example:
+//
+// repeated group data {
+//   optional group record {
+//     optional int32 val1;
+//     optional byte_array val2;
+//     optional bool val3;
+//   }
+//   optional int32 val4;
+// }
+//
+// In the Parquet file, there are 4 leaf nodes:
+//
+// * data.record.val1
+// * data.record.val2
+// * data.record.val3
+// * data.val4
+//
+// When materializing this data in an Arrow array, we would have:
+//
+// data: list<struct<
+//   record: struct<
+//    val1: int32,
+//    val2: string (= list<uint8>),
+//    val3: bool,
+//   >,
+//   val4: int32
+// >>
+//
+// However, in the Parquet format, each leaf node has its own repetition and
+// definition levels describing the structure of the intermediate nodes in
+// this array structure. Thus, we will need to scan the leaf data for a group
+// of leaf nodes part of the same type tree to create a single result Arrow
+// nested array structure.
+//
+// This is additionally complicated "chunky" repeated fields or very large byte
+// arrays
+class PARQUET_EXPORT FileReader {
+ public:
+  /// Factory function to create a FileReader from a ParquetFileReader and properties
+  /// \deprecated Deprecated in 23.0.0. Use arrow::Result version instead.
+  ARROW_DEPRECATED("Deprecated in 23.0.0. Use arrow::Result version instead.")
+  static ::arrow::Status Make(::arrow::MemoryPool* pool,
+                              std::unique_ptr<ParquetFileReader> reader,
+                              const ArrowReaderProperties& properties,
+                              std::unique_ptr<FileReader>* out);
+
+  /// Factory function to create a FileReader from a ParquetFileReader
+  /// \deprecated Deprecated in 23.0.0. Use arrow::Result version instead.
+  ARROW_DEPRECATED("Deprecated in 23.0.0. Use arrow::Result version instead.")
+  static ::arrow::Status Make(::arrow::MemoryPool* pool,
+                              std::unique_ptr<ParquetFileReader> reader,
+                              std::unique_ptr<FileReader>* out);
+
+  /// Factory function to create a FileReader from a ParquetFileReader and properties
+  static ::arrow::Result<std::unique_ptr<FileReader>> Make(
+      ::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader,
+      const ArrowReaderProperties& properties);
+
+  /// Factory function to create a FileReader from a ParquetFileReader
+  static ::arrow::Result<std::unique_ptr<FileReader>> Make(
+      ::arrow::MemoryPool* pool, std::unique_ptr<ParquetFileReader> reader);
+
+  // Since the distribution of columns amongst a Parquet file's row groups may
+  // be uneven (the number of values in each column chunk can be different), we
+  // provide a column-oriented read interface. The ColumnReader hides the
+  // details of paging through the file's row groups and yielding
+  // fully-materialized arrow::Array instances
+  //
+  // Returns error status if the column of interest is not flat.
+  // The indicated column index is relative to the schema
+  virtual ::arrow::Status GetColumn(int i, std::unique_ptr<ColumnReader>* out) = 0;
+
+  /// \brief Return arrow schema for all the columns.
+  virtual ::arrow::Status GetSchema(std::shared_ptr<::arrow::Schema>* out) = 0;
+
+  /// \brief Read column as a whole into a chunked array.
+  ///
+  /// The index i refers the index of the top level schema field, which may
+  /// be nested or flat - e.g.
+  ///
+  /// 0 foo.bar
+  ///   foo.bar.baz
+  ///   foo.qux
+  /// 1 foo2
+  /// 2 foo3
+  ///
+  /// i=0 will read the entire foo struct, i=1 the foo2 primitive column etc
+  virtual ::arrow::Status ReadColumn(int i,
+                                     std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+
+  /// \brief Return a RecordBatchReader of all row groups and columns.
+  virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
+  GetRecordBatchReader() = 0;
+
+  /// \brief Return a RecordBatchReader of row groups selected from row_group_indices.
+  ///
+  /// Note that the ordering in row_group_indices matters. FileReaders must outlive
+  /// their RecordBatchReaders.
+  ///
+  /// \returns error Result if row_group_indices contains an invalid index
+  virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
+  GetRecordBatchReader(const std::vector<int>& row_group_indices) = 0;
+
+  /// \brief Return a RecordBatchReader of row groups selected from
+  /// row_group_indices, whose columns are selected by column_indices.
+  ///
+  /// Note that the ordering in row_group_indices and column_indices
+  /// matter. FileReaders must outlive their RecordBatchReaders.
+  ///
+  /// \returns error Result if either row_group_indices or column_indices
+  ///     contains an invalid index
+  virtual ::arrow::Result<std::unique_ptr<::arrow::RecordBatchReader>>
+  GetRecordBatchReader(const std::vector<int>& row_group_indices,
+                       const std::vector<int>& column_indices) = 0;
+
+  /// \brief Return a RecordBatchReader of row groups selected from
+  /// row_group_indices, whose columns are selected by column_indices.
+  ///
+  /// Note that the ordering in row_group_indices and column_indices
+  /// matter. FileReaders must outlive their RecordBatchReaders.
+  ///
+  /// \param row_group_indices which row groups to read (order determines read order).
+  /// \param column_indices which columns to read (order determines output schema).
+  /// \param[out] out record batch stream from parquet data.
+  ///
+  /// \returns error Status if either row_group_indices or column_indices
+  ///     contains an invalid index
+  /// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
+  ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
+  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+                                       const std::vector<int>& column_indices,
+                                       std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+  /// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
+  ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
+  ::arrow::Status GetRecordBatchReader(const std::vector<int>& row_group_indices,
+                                       std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+  /// \deprecated Deprecated in 21.0.0. Use arrow::Result version instead.
+  ARROW_DEPRECATED("Deprecated in 21.0.0. Use arrow::Result version instead.")
+  ::arrow::Status GetRecordBatchReader(std::shared_ptr<::arrow::RecordBatchReader>* out);
+
+  /// \brief Return a generator of record batches.
+  ///
+  /// The FileReader must outlive the generator, so this requires that you pass in a
+  /// shared_ptr.
+  ///
+  /// \returns error Result if either row_group_indices or column_indices contains an
+  ///     invalid index
+  virtual ::arrow::Result<
+      std::function<::arrow::Future<std::shared_ptr<::arrow::RecordBatch>>()>>
+  GetRecordBatchGenerator(std::shared_ptr<FileReader> reader,
+                          const std::vector<int> row_group_indices,
+                          const std::vector<int> column_indices,
+                          ::arrow::internal::Executor* cpu_executor = NULLPTR,
+                          int64_t rows_to_readahead = 0) = 0;
+
+  /// Read all columns into a Table
+  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+  /// \brief Read the given columns into a Table
+  ///
+  /// The indicated column indices are relative to the internal representation
+  /// of the parquet table. For instance :
+  /// 0 foo.bar
+  ///       foo.bar.baz           0
+  ///       foo.bar.baz2          1
+  ///   foo.qux                   2
+  /// 1 foo2                      3
+  /// 2 foo3                      4
+  ///
+  /// i=0 will read foo.bar.baz, i=1 will read only foo.bar.baz2 and so on.
+  /// Only leaf fields have indices; foo itself doesn't have an index.
+  /// To get the index for a particular leaf field, one can use
+  /// manifest().schema_fields to get the top level fields, and then walk the
+  /// tree to identify the relevant leaf fields and access its column_index.
+  /// To get the total number of leaf fields, use FileMetadata.num_columns().
+  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+                                    std::shared_ptr<::arrow::Table>* out) = 0;
+
+  virtual ::arrow::Status ReadRowGroup(int i, const std::vector<int>& column_indices,
+                                       std::shared_ptr<::arrow::Table>* out) = 0;
+
+  virtual ::arrow::Status ReadRowGroup(int i, std::shared_ptr<::arrow::Table>* out) = 0;
+
+  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+                                        const std::vector<int>& column_indices,
+                                        std::shared_ptr<::arrow::Table>* out) = 0;
+
+  virtual ::arrow::Status ReadRowGroups(const std::vector<int>& row_groups,
+                                        std::shared_ptr<::arrow::Table>* out) = 0;
+
+  /// \brief Scan file contents with one thread, return number of rows
+  virtual ::arrow::Status ScanContents(std::vector<int> columns,
+                                       const int32_t column_batch_size,
+                                       int64_t* num_rows) = 0;
+
+  /// \brief Return a reader for the RowGroup, this object must not outlive the
+  ///   FileReader.
+  virtual std::shared_ptr<RowGroupReader> RowGroup(int row_group_index) = 0;
+
+  /// \brief The number of row groups in the file
+  virtual int num_row_groups() const = 0;
+
+  virtual ParquetFileReader* parquet_reader() const = 0;
+
+  /// Set whether to use multiple threads during reads of multiple columns.
+  /// By default only one thread is used.
+  virtual void set_use_threads(bool use_threads) = 0;
+
+  /// Set number of records to read per batch for the RecordBatchReader.
+  virtual void set_batch_size(int64_t batch_size) = 0;
+
+  virtual const ArrowReaderProperties& properties() const = 0;
+
+  virtual const SchemaManifest& manifest() const = 0;
+
+  virtual ~FileReader() = default;
+};
+
+class RowGroupReader {
+ public:
+  virtual ~RowGroupReader() = default;
+  virtual std::shared_ptr<ColumnChunkReader> Column(int column_index) = 0;
+  virtual ::arrow::Status ReadTable(const std::vector<int>& column_indices,
+                                    std::shared_ptr<::arrow::Table>* out) = 0;
+  virtual ::arrow::Status ReadTable(std::shared_ptr<::arrow::Table>* out) = 0;
+
+ private:
+  struct Iterator;
+};
+
+class ColumnChunkReader {
+ public:
+  virtual ~ColumnChunkReader() = default;
+  virtual ::arrow::Status Read(std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
+
+// At this point, the column reader is a stream iterator. It only knows how to
+// read the next batch of values for a particular column from the file until it
+// runs out.
+//
+// We also do not expose any internal Parquet details, such as row groups. This
+// might change in the future.
+class PARQUET_EXPORT ColumnReader {
+ public:
+  virtual ~ColumnReader() = default;
+
+  // Scan the next array of the indicated size. The actual size of the
+  // returned array may be less than the passed size depending how much data is
+  // available in the file.
+  //
+  // When all the data in the file has been exhausted, the result is set to
+  // nullptr.
+  //
+  // Returns Status::OK on a successful read, including if you have exhausted
+  // the data available in the file.
+  virtual ::arrow::Status NextBatch(int64_t batch_size,
+                                    std::shared_ptr<::arrow::ChunkedArray>* out) = 0;
+};
+
+/// \brief Experimental helper class for bindings (like Python) that struggle
+/// either with std::move or C++ exceptions
+class PARQUET_EXPORT FileReaderBuilder {
+ public:
+  FileReaderBuilder();
+
+  /// Create FileReaderBuilder from Arrow file and optional properties / metadata
+  ::arrow::Status Open(std::shared_ptr<::arrow::io::RandomAccessFile> file,
+                       const ReaderProperties& properties = default_reader_properties(),
+                       std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+  /// Create FileReaderBuilder from file path and optional properties / metadata
+  ::arrow::Status OpenFile(const std::string& path, bool memory_map = false,
+                           const ReaderProperties& props = default_reader_properties(),
+                           std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+  ParquetFileReader* raw_reader() { return raw_reader_.get(); }
+
+  /// Set Arrow MemoryPool for memory allocation
+  FileReaderBuilder* memory_pool(::arrow::MemoryPool* pool);
+  /// Set Arrow reader properties
+  FileReaderBuilder* properties(const ArrowReaderProperties& arg_properties);
+  /// Build FileReader instance
+  ::arrow::Status Build(std::unique_ptr<FileReader>* out);
+  ::arrow::Result<std::unique_ptr<FileReader>> Build();
+
+ private:
+  ::arrow::MemoryPool* pool_;
+  ArrowReaderProperties properties_;
+  std::unique_ptr<ParquetFileReader> raw_reader_;
+};
+
+/// \defgroup parquet-arrow-reader-factories Factory functions for Parquet Arrow readers
+///
+/// @{
+
+/// \brief Build FileReader from Arrow file and MemoryPool
+///
+/// Advanced settings are supported through the FileReaderBuilder class.
+PARQUET_EXPORT
+::arrow::Result<std::unique_ptr<FileReader>> OpenFile(
+    std::shared_ptr<::arrow::io::RandomAccessFile>, ::arrow::MemoryPool* allocator);
+
+/// @}
+
+PARQUET_EXPORT
+::arrow::Status StatisticsAsScalars(const Statistics& Statistics,
+                                    std::shared_ptr<::arrow::Scalar>* min,
+                                    std::shared_ptr<::arrow::Scalar>* max);
+
+}  // namespace arrow
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/arrow/schema.h b/pyarrow/include/parquet/arrow/schema.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd60fde43422889c53ebd7cf86fbac99c8c6f282
--- /dev/null
+++ b/pyarrow/include/parquet/arrow/schema.h
@@ -0,0 +1,184 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+
+#include "parquet/level_conversion.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ArrowReaderProperties;
+class ArrowWriterProperties;
+class WriterProperties;
+
+namespace arrow {
+
+/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
+/// schema into a Parquet schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
+                            const WriterProperties& properties,
+                            const ArrowWriterProperties& arrow_properties,
+                            schema::NodePtr* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+                                const WriterProperties& properties,
+                                const ArrowWriterProperties& arrow_properties,
+                                std::shared_ptr<SchemaDescriptor>* out);
+
+PARQUET_EXPORT
+::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
+                                const WriterProperties& properties,
+                                std::shared_ptr<SchemaDescriptor>* out);
+
+/// @}
+
+/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
+/// schema into an Arrow schema.
+///
+/// @{
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(
+    const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
+    const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
+    std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+                                  const ArrowReaderProperties& properties,
+                                  std::shared_ptr<::arrow::Schema>* out);
+
+PARQUET_EXPORT
+::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
+                                  std::shared_ptr<::arrow::Schema>* out);
+
+/// @}
+
+/// \brief Bridge between an arrow::Field and parquet column indices.
+struct PARQUET_EXPORT SchemaField {
+  std::shared_ptr<::arrow::Field> field;
+  std::vector<SchemaField> children;
+
+  // Only set for leaf nodes
+  int column_index = -1;
+
+  parquet::internal::LevelInfo level_info;
+
+  bool is_leaf() const { return column_index != -1; }
+};
+
+/// \brief Bridge between a parquet Schema and an arrow Schema.
+///
+/// Expose parquet columns as a tree structure. Useful traverse and link
+/// between arrow's Schema and parquet's Schema.
+struct PARQUET_EXPORT SchemaManifest {
+  static ::arrow::Status Make(
+      const SchemaDescriptor* schema,
+      const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
+      const ArrowReaderProperties& properties, SchemaManifest* manifest);
+
+  const SchemaDescriptor* descr;
+  std::shared_ptr<::arrow::Schema> origin_schema;
+  std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
+  std::vector<SchemaField> schema_fields;
+
+  std::unordered_map<int, const SchemaField*> column_index_to_field;
+  std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;
+
+  ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
+    auto it = column_index_to_field.find(column_index);
+    if (it == column_index_to_field.end()) {
+      return ::arrow::Status::KeyError("Column index ", column_index,
+                                       " not found in schema manifest, may be malformed");
+    }
+    *out = it->second;
+    return ::arrow::Status::OK();
+  }
+
+  const SchemaField* GetParent(const SchemaField* field) const {
+    // Returns nullptr also if not found
+    auto it = child_to_parent.find(field);
+    if (it == child_to_parent.end()) {
+      return NULLPTR;
+    }
+    return it->second;
+  }
+
+  /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
+  /// correspond to the column root (first node below the parquet schema's root group) of
+  /// each leaf referenced in column_indices.
+  ///
+  /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
+  /// the roots are `a` and `i` (return=[0,2]).
+  ///
+  /// root
+  /// -- a  <------
+  /// -- -- b  |  |
+  /// -- -- -- c  |
+  /// -- -- -- d  |
+  /// -- -- -- -- e
+  /// -- f
+  /// -- -- g
+  /// -- -- -- h
+  /// -- i  <---
+  /// -- -- j  |
+  /// -- -- -- k
+  ::arrow::Result<std::vector<int>> GetFieldIndices(
+      const std::vector<int>& column_indices) const {
+    const schema::GroupNode* group = descr->group_node();
+    std::unordered_set<int> already_added;
+
+    std::vector<int> out;
+    for (int column_idx : column_indices) {
+      if (column_idx < 0 || column_idx >= descr->num_columns()) {
+        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+      }
+
+      auto field_node = descr->GetColumnRoot(column_idx);
+      auto field_idx = group->FieldIndex(*field_node);
+      if (field_idx == -1) {
+        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
+      }
+
+      if (already_added.insert(field_idx).second) {
+        out.push_back(field_idx);
+      }
+    }
+    return out;
+  }
+};
+
+}  // namespace arrow
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/arrow/test_util.h b/pyarrow/include/parquet/arrow/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..05f6fd24ac038e4609f5f182dbff64dae4d21e4d
--- /dev/null
+++ b/pyarrow/include/parquet/arrow/test_util.h
@@ -0,0 +1,487 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_decimal.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/testing/gtest_util.h"
+#include "arrow/testing/random.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/float16.h"
+#include "parquet/column_reader.h"
+#include "parquet/test_util.h"
+
+namespace parquet {
+
+using internal::RecordReader;
+
+namespace arrow {
+
+using ::arrow::Array;
+using ::arrow::ChunkedArray;
+using ::arrow::Status;
+
+template <typename T, int32_t PRECISION, typename = ::arrow::enable_if_decimal<T>>
+struct DecimalWithPrecisionAndScale {
+  using type = T;
+  static_assert(PRECISION >= T::kMinPrecision && PRECISION <= T::kMaxPrecision,
+                "Invalid precision value");
+  static constexpr ::arrow::Type::type type_id = T::type_id;
+  static constexpr int32_t precision = PRECISION;
+  static constexpr int32_t scale = PRECISION - 1;
+};
+template <int32_t PRECISION>
+using Decimal32WithPrecisionAndScale =
+    DecimalWithPrecisionAndScale<::arrow::Decimal32Type, PRECISION>;
+template <int32_t PRECISION>
+using Decimal64WithPrecisionAndScale =
+    DecimalWithPrecisionAndScale<::arrow::Decimal64Type, PRECISION>;
+template <int32_t PRECISION>
+using Decimal128WithPrecisionAndScale =
+    DecimalWithPrecisionAndScale<::arrow::Decimal128Type, PRECISION>;
+template <int32_t PRECISION>
+using Decimal256WithPrecisionAndScale =
+    DecimalWithPrecisionAndScale<::arrow::Decimal256Type, PRECISION>;
+
+template <class ArrowType>
+::arrow::enable_if_floating_point<ArrowType, Status> NonNullArray(
+    size_t size, std::shared_ptr<Array>* out) {
+  using c_type = typename ArrowType::c_type;
+  std::vector<c_type> values;
+  if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
+    values.resize(size);
+    test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(0.0f),
+                                 ::arrow::util::Float16(1.0f), values.data());
+  } else {
+    ::arrow::random_real(size, 0, static_cast<c_type>(0), static_cast<c_type>(1),
+                         &values);
+  }
+  ::arrow::NumericBuilder<ArrowType> builder;
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
+  return builder.Finish(out);
+}
+
+template <class ArrowType>
+::arrow::enable_if_integer<ArrowType, Status> NonNullArray(size_t size,
+                                                           std::shared_ptr<Array>* out) {
+  std::vector<typename ArrowType::c_type> values;
+  ::arrow::randint(size, 0, 64, &values);
+
+  // Passing data type so this will work with TimestampType too
+  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
+                                             ::arrow::default_memory_pool());
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
+  return builder.Finish(out);
+}
+
+template <class ArrowType>
+::arrow::enable_if_date<ArrowType, Status> NonNullArray(size_t size,
+                                                        std::shared_ptr<Array>* out) {
+  std::vector<typename ArrowType::c_type> values;
+  ::arrow::randint(size, 0, 24, &values);
+  for (size_t i = 0; i < size; i++) {
+    values[i] *= 86400000;
+  }
+
+  // Passing data type so this will work with TimestampType too
+  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
+                                             ::arrow::default_memory_pool());
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
+  return builder.Finish(out);
+}
+
+template <class ArrowType>
+::arrow::enable_if_base_binary<ArrowType, Status> NonNullArray(
+    size_t size, std::shared_ptr<Array>* out) {
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder;
+  for (size_t i = 0; i < size; i++) {
+    RETURN_NOT_OK(builder.Append("test-string"));
+  }
+  return builder.Finish(out);
+}
+
+template <typename ArrowType>
+::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
+    size_t size, std::shared_ptr<Array>* out) {
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  // set byte_width to the length of "fixed": 5
+  // todo: find a way to generate test data with more diversity.
+  BuilderType builder(::arrow::fixed_size_binary(5));
+  for (size_t i = 0; i < size; i++) {
+    RETURN_NOT_OK(builder.Append("fixed"));
+  }
+  return builder.Finish(out);
+}
+
+template <int32_t byte_width>
+static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
+  auto gen = ::arrow::random::RandomArrayGenerator(seed);
+  std::shared_ptr<Array> decimals;
+  if constexpr (byte_width == 4) {
+    decimals = gen.Decimal32(::arrow::decimal32(precision, 0), n);
+  } else if constexpr (byte_width == 8) {
+    decimals = gen.Decimal64(::arrow::decimal64(precision, 0), n);
+  } else if constexpr (byte_width == 16) {
+    decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
+  } else {
+    decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
+  }
+  std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
+}
+
+template <typename ArrowType, int32_t precision = ArrowType::precision>
+::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
+                                                   typename ArrowType::type, precision>>,
+                     Status>
+NonNullArray(size_t size, std::shared_ptr<Array>* out) {
+  constexpr int32_t kDecimalPrecision = precision;
+  constexpr int32_t kDecimalScale = ArrowType::scale;
+
+  const auto type =
+      std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
+  const int32_t byte_width = type->byte_width();
+
+  constexpr int32_t seed = 0;
+
+  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
+  random_decimals<ArrowType::type::kByteWidth>(size, seed, kDecimalPrecision,
+                                               out_buf->mutable_data());
+
+  using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
+  Builder builder(type);
+  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
+  return builder.Finish(out);
+}
+
+template <class ArrowType>
+::arrow::enable_if_boolean<ArrowType, Status> NonNullArray(size_t size,
+                                                           std::shared_ptr<Array>* out) {
+  std::vector<uint8_t> values;
+  ::arrow::randint(size, 0, 1, &values);
+  ::arrow::BooleanBuilder builder;
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size()));
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls.
+template <typename ArrowType>
+::arrow::enable_if_floating_point<ArrowType, Status> NullableArray(
+    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<Array>* out) {
+  using c_type = typename ArrowType::c_type;
+  std::vector<c_type> values;
+  if constexpr (::arrow::is_half_float_type<ArrowType>::value) {
+    values.resize(size);
+    test::random_float16_numbers(static_cast<int>(size), 0, ::arrow::util::Float16(-1e4f),
+                                 ::arrow::util::Float16(1e4f), values.data());
+  } else {
+    ::arrow::random_real(size, seed, static_cast<c_type>(-1e10),
+                         static_cast<c_type>(1e10), &values);
+  }
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  ::arrow::NumericBuilder<ArrowType> builder;
+  if (values.size() > 0) {
+    RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  }
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls.
+template <typename ArrowType>
+::arrow::enable_if_integer<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
+                                                            uint32_t seed,
+                                                            std::shared_ptr<Array>* out) {
+  std::vector<typename ArrowType::c_type> values;
+
+  // Seed is random in Arrow right now
+  (void)seed;
+  ::arrow::randint(size, 0, 64, &values);
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  // Passing data type so this will work with TimestampType too
+  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
+                                             ::arrow::default_memory_pool());
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+template <typename ArrowType>
+::arrow::enable_if_date<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
+                                                         uint32_t seed,
+                                                         std::shared_ptr<Array>* out) {
+  std::vector<typename ArrowType::c_type> values;
+
+  // Seed is random in Arrow right now
+  (void)seed;
+  ::arrow::randint(size, 0, 24, &values);
+  for (size_t i = 0; i < size; i++) {
+    values[i] *= 86400000;
+  }
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  // Passing data type so this will work with TimestampType too
+  ::arrow::NumericBuilder<ArrowType> builder(std::make_shared<ArrowType>(),
+                                             ::arrow::default_memory_pool());
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls yet.
+template <typename ArrowType>
+::arrow::enable_if_base_binary<ArrowType, Status> NullableArray(
+    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder;
+
+  const int kBufferSize = 10;
+  uint8_t buffer[kBufferSize];
+  for (size_t i = 0; i < size; i++) {
+    if (!valid_bytes[i]) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
+      if (ArrowType::is_utf8) {
+        // Trivially force data to be valid UTF8 by making it all ASCII
+        for (auto& byte : buffer) {
+          byte &= 0x7f;
+        }
+      }
+      RETURN_NOT_OK(builder.Append(buffer, kBufferSize));
+    }
+  }
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls yet,
+// same as NullableArray<String|Binary>(..)
+template <typename ArrowType>
+::arrow::enable_if_fixed_size_binary<ArrowType, Status> NullableArray(
+    size_t size, size_t num_nulls, uint32_t seed, std::shared_ptr<::arrow::Array>* out) {
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  const int byte_width = 10;
+  BuilderType builder(::arrow::fixed_size_binary(byte_width));
+
+  const int kBufferSize = byte_width;
+  uint8_t buffer[kBufferSize];
+  for (size_t i = 0; i < size; i++) {
+    if (!valid_bytes[i]) {
+      RETURN_NOT_OK(builder.AppendNull());
+    } else {
+      ::arrow::random_bytes(kBufferSize, seed + static_cast<uint32_t>(i), buffer);
+      RETURN_NOT_OK(builder.Append(buffer));
+    }
+  }
+  return builder.Finish(out);
+}
+
+template <typename ArrowType, int32_t precision = ArrowType::precision>
+::arrow::enable_if_t<std::is_same_v<ArrowType, DecimalWithPrecisionAndScale<
+                                                   typename ArrowType::type, precision>>,
+                     Status>
+NullableArray(size_t size, size_t num_nulls, uint32_t seed,
+              std::shared_ptr<::arrow::Array>* out) {
+  std::vector<uint8_t> valid_bytes(size, '\1');
+
+  for (size_t i = 0; i < num_nulls; ++i) {
+    valid_bytes[i * 2] = '\0';
+  }
+
+  constexpr int32_t kDecimalPrecision = precision;
+  constexpr int32_t kDecimalScale = ArrowType::scale;
+
+  const auto type =
+      std::make_shared<typename ArrowType::type>(kDecimalPrecision, kDecimalScale);
+  const int32_t byte_width = type->byte_width();
+
+  ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
+  random_decimals<ArrowType::type::kByteWidth>(size, seed, precision,
+                                               out_buf->mutable_data());
+
+  using Builder = typename ::arrow::TypeTraits<typename ArrowType::type>::BuilderType;
+  Builder builder(type);
+  RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+// This helper function only supports (size/2) nulls yet.
+template <class ArrowType>
+::arrow::enable_if_boolean<ArrowType, Status> NullableArray(size_t size, size_t num_nulls,
+                                                            uint32_t seed,
+                                                            std::shared_ptr<Array>* out) {
+  std::vector<uint8_t> values;
+
+  // Seed is random in Arrow right now
+  (void)seed;
+
+  ::arrow::randint(size, 0, 1, &values);
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  ::arrow::BooleanBuilder builder;
+  RETURN_NOT_OK(builder.AppendValues(values.data(), values.size(), valid_bytes.data()));
+  return builder.Finish(out);
+}
+
+/// Wrap an Array into a ListArray by splitting it up into size lists.
+///
+/// This helper function only supports (size/2) nulls.
+Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
+                     int64_t null_count, const std::string& item_name,
+                     bool nullable_values, std::shared_ptr<::arrow::ListArray>* out) {
+  // We always include an empty list
+  int64_t non_null_entries = size - null_count - 1;
+  int64_t length_per_entry = values->length() / non_null_entries;
+
+  auto offsets = AllocateBuffer();
+  RETURN_NOT_OK(offsets->Resize((size + 1) * sizeof(int32_t)));
+  int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
+
+  auto null_bitmap = AllocateBuffer();
+  int64_t bitmap_size = ::arrow::bit_util::BytesForBits(size);
+  RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
+  uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
+  memset(null_bitmap_ptr, 0, bitmap_size);
+
+  int32_t current_offset = 0;
+  for (int64_t i = 0; i < size; i++) {
+    offsets_ptr[i] = current_offset;
+    if (!(((i % 2) == 0) && ((i / 2) < null_count))) {
+      // Non-null list (list with index 1 is always empty).
+      ::arrow::bit_util::SetBit(null_bitmap_ptr, i);
+      if (i != 1) {
+        current_offset += static_cast<int32_t>(length_per_entry);
+      }
+    }
+  }
+  offsets_ptr[size] = static_cast<int32_t>(values->length());
+
+  auto value_field = ::arrow::field(item_name, values->type(), nullable_values);
+  *out = std::make_shared<::arrow::ListArray>(::arrow::list(value_field), size, offsets,
+                                              values, null_bitmap, null_count);
+
+  return Status::OK();
+}
+
+// Make an array containing only empty lists, with a null values array
+Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
+  // Allocate an offsets buffer containing only zeroes
+  const int64_t offsets_nbytes = (size + 1) * sizeof(int32_t);
+  ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, ::arrow::AllocateBuffer(offsets_nbytes));
+  memset(offsets_buffer->mutable_data(), 0, offsets_nbytes);
+
+  auto value_field =
+      ::arrow::field("item", ::arrow::float64(), false /* nullable_values */);
+  auto list_type = ::arrow::list(value_field);
+
+  std::vector<std::shared_ptr<Buffer>> child_buffers = {nullptr /* null bitmap */,
+                                                        nullptr /* values */};
+  auto child_data =
+      ::arrow::ArrayData::Make(value_field->type(), 0, std::move(child_buffers));
+
+  std::vector<std::shared_ptr<Buffer>> buffers = {nullptr /* bitmap */,
+                                                  std::move(offsets_buffer)};
+  auto array_data = ::arrow::ArrayData::Make(list_type, size, std::move(buffers));
+  array_data->child_data.push_back(child_data);
+
+  *out_array = ::arrow::MakeArray(array_data);
+  return Status::OK();
+}
+
+std::shared_ptr<::arrow::Table> MakeSimpleTable(
+    const std::shared_ptr<ChunkedArray>& values, bool nullable) {
+  auto schema = ::arrow::schema({::arrow::field("col", values->type(), nullable)});
+  return ::arrow::Table::Make(schema, {values});
+}
+
+std::shared_ptr<::arrow::Table> MakeSimpleTable(const std::shared_ptr<Array>& values,
+                                                bool nullable) {
+  auto carr = std::make_shared<::arrow::ChunkedArray>(values);
+  return MakeSimpleTable(carr, nullable);
+}
+
+template <typename T>
+void ExpectArray(T* expected, Array* result) {
+  auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
+  for (int i = 0; i < result->length(); i++) {
+    EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
+  }
+}
+
+template <typename ArrowType>
+void ExpectArrayT(void* expected, Array* result) {
+  ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
+  for (int64_t i = 0; i < result->length(); i++) {
+    EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
+              reinterpret_cast<const typename ArrowType::c_type*>(
+                  p_array->values()->data())[i]);
+  }
+}
+
+template <>
+void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) {
+  ::arrow::BooleanBuilder builder;
+  ARROW_EXPECT_OK(
+      builder.AppendValues(reinterpret_cast<uint8_t*>(expected), result->length()));
+
+  std::shared_ptr<Array> expected_array;
+  ARROW_EXPECT_OK(builder.Finish(&expected_array));
+  EXPECT_TRUE(result->Equals(*expected_array));
+}
+
+}  // namespace arrow
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/arrow/writer.h b/pyarrow/include/parquet/arrow/writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ec8796ffd17516cef6d24eac481a7b44703d562
--- /dev/null
+++ b/pyarrow/include/parquet/arrow/writer.h
@@ -0,0 +1,176 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class RecordBatch;
+class Schema;
+class Table;
+
+}  // namespace arrow
+
+namespace parquet {
+
+class FileMetaData;
+class ParquetFileWriter;
+
+namespace arrow {
+
+/// \brief Iterative FileWriter class
+///
+/// For basic usage, can write a Table at a time, creating one or more row
+/// groups per write call.
+///
+/// For advanced usage, can write column-by-column: Start a new RowGroup or
+/// Chunk with NewRowGroup, then write column-by-column the whole column chunk.
+///
+/// If PARQUET:field_id is present as a metadata key on a field, and the corresponding
+/// value is a nonnegative integer, then it will be used as the field_id in the parquet
+/// file.
+class PARQUET_EXPORT FileWriter {
+ public:
+  static ::arrow::Status Make(MemoryPool* pool, std::unique_ptr<ParquetFileWriter> writer,
+                              std::shared_ptr<::arrow::Schema> schema,
+                              std::shared_ptr<ArrowWriterProperties> arrow_properties,
+                              std::unique_ptr<FileWriter>* out);
+
+  /// \brief Try to create an Arrow to Parquet file writer.
+  ///
+  /// \param schema schema of data that will be passed.
+  /// \param pool memory pool to use.
+  /// \param sink output stream to write Parquet data.
+  /// \param properties general Parquet writer properties.
+  /// \param arrow_properties Arrow-specific writer properties.
+  ///
+  /// \since 11.0.0
+  static ::arrow::Result<std::unique_ptr<FileWriter>> Open(
+      const ::arrow::Schema& schema, MemoryPool* pool,
+      std::shared_ptr<::arrow::io::OutputStream> sink,
+      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+      std::shared_ptr<ArrowWriterProperties> arrow_properties =
+          default_arrow_writer_properties());
+
+  /// Return the Arrow schema to be written to.
+  virtual std::shared_ptr<::arrow::Schema> schema() const = 0;
+
+  /// \brief Write a Table to Parquet.
+  ///
+  /// \param table Arrow table to write.
+  /// \param chunk_size maximum number of rows to write per row group.
+  virtual ::arrow::Status WriteTable(
+      const ::arrow::Table& table, int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH) = 0;
+
+  /// \brief Start a new row group.
+  ///
+  /// Returns an error if not all columns have been written.
+  virtual ::arrow::Status NewRowGroup() = 0;
+
+  /// \brief Write ColumnChunk in row group using an array.
+  virtual ::arrow::Status WriteColumnChunk(const ::arrow::Array& data) = 0;
+
+  /// \brief Write ColumnChunk in row group using slice of a ChunkedArray
+  virtual ::arrow::Status WriteColumnChunk(
+      const std::shared_ptr<::arrow::ChunkedArray>& data, int64_t offset,
+      int64_t size) = 0;
+
+  /// \brief Write ColumnChunk in a row group using a ChunkedArray
+  virtual ::arrow::Status WriteColumnChunk(
+      const std::shared_ptr<::arrow::ChunkedArray>& data) = 0;
+
+  /// \brief Start a new buffered row group.
+  ///
+  /// Returns an error if not all columns have been written.
+  virtual ::arrow::Status NewBufferedRowGroup() = 0;
+
+  /// \brief Write a RecordBatch into the buffered row group.
+  ///
+  /// Multiple RecordBatches can be written into the same row group
+  /// through this method.
+  ///
+  /// WriterProperties.max_row_group_length() is respected and a new
+  /// row group will be created if the current row group exceeds the
+  /// limit.
+  ///
+  /// Batches get flushed to the output stream once NewBufferedRowGroup()
+  /// or Close() is called.
+  ///
+  /// WARNING: If you are writing multiple files in parallel in the same
+  /// executor, deadlock may occur if ArrowWriterProperties::use_threads
+  /// is set to true to write columns in parallel. Please disable use_threads
+  /// option in this case.
+  virtual ::arrow::Status WriteRecordBatch(const ::arrow::RecordBatch& batch) = 0;
+
+  /// \brief Write the footer and close the file.
+  virtual ::arrow::Status Close() = 0;
+  virtual ~FileWriter();
+
+  virtual MemoryPool* memory_pool() const = 0;
+  /// \brief Add key-value metadata to the file.
+  /// \param[in] key_value_metadata the metadata to add.
+  /// \note This will overwrite any existing metadata with the same key.
+  /// \return Error if Close() has been called.
+  ///
+  /// WARNING: If `store_schema` is enabled, `ARROW:schema` would be stored
+  /// in the key-value metadata. Overwriting this key would result in
+  /// `store_schema` being unusable during read.
+  virtual ::arrow::Status AddKeyValueMetadata(
+      const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
+  /// \brief Return the file metadata, only available after calling Close().
+  virtual const std::shared_ptr<FileMetaData> metadata() const = 0;
+};
+
+/// \brief Write Parquet file metadata only to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteFileMetaData(const FileMetaData& file_metadata,
+                                  ::arrow::io::OutputStream* sink);
+
+/// \brief Write metadata-only Parquet file to indicated Arrow OutputStream
+PARQUET_EXPORT
+::arrow::Status WriteMetaDataFile(const FileMetaData& file_metadata,
+                                  ::arrow::io::OutputStream* sink);
+
+/// \brief Write a Table to Parquet.
+///
+/// This writes one table in a single shot. To write a Parquet file with
+/// multiple tables iteratively, see parquet::arrow::FileWriter.
+///
+/// \param table Table to write.
+/// \param pool memory pool to use.
+/// \param sink output stream to write Parquet data.
+/// \param chunk_size maximum number of rows to write per row group.
+/// \param properties general Parquet writer properties.
+/// \param arrow_properties Arrow-specific writer properties.
+::arrow::Status PARQUET_EXPORT
+WriteTable(const ::arrow::Table& table, MemoryPool* pool,
+           std::shared_ptr<::arrow::io::OutputStream> sink,
+           int64_t chunk_size = DEFAULT_MAX_ROW_GROUP_LENGTH,
+           std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+           std::shared_ptr<ArrowWriterProperties> arrow_properties =
+               default_arrow_writer_properties());
+
+}  // namespace arrow
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/benchmark_util.h b/pyarrow/include/parquet/benchmark_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..abfb33bd5ba9e9d3699f0350a110672036949534
--- /dev/null
+++ b/pyarrow/include/parquet/benchmark_util.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "parquet/types.h"
+
+namespace parquet::benchmarks {
+
+template <typename T>
+void GenerateBenchmarkData(uint32_t size, uint32_t seed, T* data,
+                           std::vector<uint8_t>* heap, uint32_t data_string_length);
+
+#define _GENERATE_BENCHMARK_DATA_DECL(KLASS)                            \
+  template <>                                                           \
+  void GenerateBenchmarkData(uint32_t size, uint32_t seed, KLASS* data, \
+                             std::vector<uint8_t>* heap, uint32_t data_string_length);
+
+_GENERATE_BENCHMARK_DATA_DECL(int32_t)
+_GENERATE_BENCHMARK_DATA_DECL(int64_t)
+_GENERATE_BENCHMARK_DATA_DECL(float)
+_GENERATE_BENCHMARK_DATA_DECL(double)
+_GENERATE_BENCHMARK_DATA_DECL(ByteArray)
+_GENERATE_BENCHMARK_DATA_DECL(FLBA)
+_GENERATE_BENCHMARK_DATA_DECL(Int96)
+
+#undef _GENERATE_BENCHMARK_DATA_DECL
+
+}  // namespace parquet::benchmarks
diff --git a/pyarrow/include/parquet/bloom_filter.h b/pyarrow/include/parquet/bloom_filter.h
new file mode 100644
index 0000000000000000000000000000000000000000..82172f363ba7ee920ea47f562688e81d704f4b8e
--- /dev/null
+++ b/pyarrow/include/parquet/bloom_filter.h
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cmath>
+#include <cstdint>
+#include <memory>
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/logging.h"
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// A Bloom filter is a compact structure to indicate whether an item is not in a set or
+// probably in a set. The Bloom filter usually consists of a bit set that represents a
+// set of elements, a hash strategy and a Bloom filter algorithm.
+class PARQUET_EXPORT BloomFilter {
+ public:
+  // Maximum Bloom filter size, it sets to HDFS default block size 128MB
+  // This value will be reconsidered when implementing Bloom filter producer.
+  static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
+
+  /// Determine whether an element exist in set or not.
+  ///
+  /// @param hash the element to contain.
+  /// @return false if value is definitely not in set, and true means PROBABLY
+  /// in set.
+  virtual bool FindHash(uint64_t hash) const = 0;
+
+  /// Insert element to set represented by Bloom filter bitset.
+  /// @param hash the hash of value to insert into Bloom filter.
+  virtual void InsertHash(uint64_t hash) = 0;
+
+  /// Insert elements to set represented by Bloom filter bitset.
+  /// @param hashes the hash values to insert into Bloom filter.
+  /// @param num_values the number of hash values to insert.
+  virtual void InsertHashes(const uint64_t* hashes, int num_values) = 0;
+
+  /// Write this Bloom filter to an output stream. A Bloom filter structure should
+  /// include bitset length, hash strategy, algorithm, and bitset.
+  ///
+  /// @param sink the output stream to write
+  virtual void WriteTo(ArrowOutputStream* sink) const = 0;
+
+  /// Get the number of bytes of bitset
+  virtual uint32_t GetBitsetSize() const = 0;
+
+  /// Compute hash for 32 bits value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(int32_t value) const = 0;
+
+  /// Compute hash for 64 bits value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(int64_t value) const = 0;
+
+  /// Compute hash for float value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(float value) const = 0;
+
+  /// Compute hash for double value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(double value) const = 0;
+
+  /// Compute hash for Int96 value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(const Int96* value) const = 0;
+
+  /// Compute hash for ByteArray value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+  /// Compute hash for fixed byte array value by using its plain encoding result.
+  ///
+  /// @param value the value address.
+  /// @param len the value length.
+  /// @return hash result.
+  virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+  /// Batch compute hashes for 32 bits values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for 64 bits values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for float values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for double values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for Int96 values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const Int96* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for ByteArray values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const ByteArray* values, int num_values,
+                      uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for fixed byte array values by using its plain encoding result.
+  ///
+  /// @param values values a pointer to the values to hash.
+  /// @param type_len the value length.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const FLBA* values, uint32_t type_len, int num_values,
+                      uint64_t* hashes) const = 0;
+
+  virtual ~BloomFilter() = default;
+
+ protected:
+  // Hash strategy available for Bloom filter.
+  enum class HashStrategy : uint32_t { XXHASH = 0 };
+
+  // Bloom filter algorithm.
+  enum class Algorithm : uint32_t { BLOCK = 0 };
+
+  enum class CompressionStrategy : uint32_t { UNCOMPRESSED = 0 };
+};
+
+/// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
+/// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
+/// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
+///
+/// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
+/// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
+class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
+ public:
+  /// The constructor of BlockSplitBloomFilter. It uses XXH64 as hash function.
+  ///
+  /// \param pool memory pool to use.
+  explicit BlockSplitBloomFilter(
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
+  /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
+  /// rounded up/down to lower/upper bound if num_bytes is out of range and also
+  /// will be rounded up to a power of 2.
+  ///
+  /// @param num_bytes The number of bytes to store Bloom filter bitset.
+  void Init(uint32_t num_bytes);
+
+  /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
+  /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
+  /// which may lead to segfault when performing SIMD instructions. It is the caller's
+  /// responsibility to free the bitset passed in. This is used when reconstructing
+  /// a Bloom filter from a parquet file.
+  ///
+  /// @param bitset The given bitset to initialize the Bloom filter.
+  /// @param num_bytes  The number of bytes of given bitset.
+  void Init(const uint8_t* bitset, uint32_t num_bytes);
+
+  /// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
+  static constexpr uint32_t kMinimumBloomFilterBytes = 32;
+
+  /// Calculate optimal size according to the number of distinct values and false
+  /// positive probability.
+  ///
+  /// @param ndv The number of distinct values.
+  /// @param fpp The false positive probability.
+  /// @return it always return a value between kMinimumBloomFilterBytes and
+  /// kMaximumBloomFilterBytes, and the return value is always a power of 2
+  static uint32_t OptimalNumOfBytes(uint32_t ndv, double fpp) {
+    uint32_t optimal_num_of_bits = OptimalNumOfBits(ndv, fpp);
+    ARROW_DCHECK(::arrow::bit_util::IsMultipleOf8(optimal_num_of_bits));
+    return optimal_num_of_bits >> 3;
+  }
+
+  /// Calculate optimal size according to the number of distinct values and false
+  /// positive probability.
+  ///
+  /// @param ndv The number of distinct values.
+  /// @param fpp The false positive probability.
+  /// @return it always return a value between kMinimumBloomFilterBytes * 8 and
+  /// kMaximumBloomFilterBytes * 8, and the return value is always a power of 16
+  static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
+    ARROW_DCHECK(fpp > 0.0 && fpp < 1.0);
+    const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
+    uint32_t num_bits;
+
+    // Handle overflow.
+    if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
+      num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
+    } else {
+      num_bits = static_cast<uint32_t>(m);
+    }
+
+    // Round up to lower bound
+    if (num_bits < kMinimumBloomFilterBytes << 3) {
+      num_bits = kMinimumBloomFilterBytes << 3;
+    }
+
+    // Get next power of 2 if bits is not power of 2.
+    if ((num_bits & (num_bits - 1)) != 0) {
+      num_bits = static_cast<uint32_t>(::arrow::bit_util::NextPower2(num_bits));
+    }
+
+    // Round down to upper bound
+    if (num_bits > kMaximumBloomFilterBytes << 3) {
+      num_bits = kMaximumBloomFilterBytes << 3;
+    }
+
+    return num_bits;
+  }
+
+  bool FindHash(uint64_t hash) const override;
+  void InsertHash(uint64_t hash) override;
+  void InsertHashes(const uint64_t* hashes, int num_values) override;
+  void WriteTo(ArrowOutputStream* sink) const override;
+  uint32_t GetBitsetSize() const override { return num_bytes_; }
+
+  uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(float value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(double value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
+  uint64_t Hash(const FLBA* value, uint32_t len) const override {
+    return hasher_->Hash(value, len);
+  }
+
+  void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const float* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const double* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const Int96* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const ByteArray* values, int num_values, uint64_t* hashes) const override {
+    hasher_->Hashes(values, num_values, hashes);
+  }
+  void Hashes(const FLBA* values, uint32_t type_len, int num_values,
+              uint64_t* hashes) const override {
+    hasher_->Hashes(values, type_len, num_values, hashes);
+  }
+
+  uint64_t Hash(const int32_t* value) const { return hasher_->Hash(*value); }
+  uint64_t Hash(const int64_t* value) const { return hasher_->Hash(*value); }
+  uint64_t Hash(const float* value) const { return hasher_->Hash(*value); }
+  uint64_t Hash(const double* value) const { return hasher_->Hash(*value); }
+
+  /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
+  /// a Bloom filter from a parquet filter.
+  ///
+  /// @param properties The parquet reader properties.
+  /// @param input_stream The input stream from which to construct the bloom filter.
+  /// @param bloom_filter_length The length of the serialized bloom filter including
+  /// header.
+  /// @return The BlockSplitBloomFilter.
+  static BlockSplitBloomFilter Deserialize(
+      const ReaderProperties& properties, ArrowInputStream* input_stream,
+      std::optional<int64_t> bloom_filter_length = std::nullopt);
+
+ private:
+  inline void InsertHashImpl(uint64_t hash);
+
+  // Bytes in a tiny Bloom filter block.
+  static constexpr int kBytesPerFilterBlock = 32;
+
+  // The number of bits to be set in each tiny Bloom filter
+  static constexpr int kBitsSetPerBlock = 8;
+
+  // A mask structure used to set bits in each tiny Bloom filter.
+  struct BlockMask {
+    uint32_t item[kBitsSetPerBlock];
+  };
+
+  // The block-based algorithm needs eight odd SALT values to calculate eight indexes
+  // of bit to set, one bit in each 32-bit word.
+  static constexpr uint32_t SALT[kBitsSetPerBlock] = {
+      0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
+      0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
+
+  // Memory pool to allocate aligned buffer for bitset
+  ::arrow::MemoryPool* pool_;
+
+  // The underlying buffer of bitset.
+  std::shared_ptr<Buffer> data_;
+
+  // The number of bytes of Bloom filter bitset.
+  uint32_t num_bytes_;
+
+  // Hash strategy used in this Bloom filter.
+  HashStrategy hash_strategy_;
+
+  // Algorithm used in this Bloom filter.
+  Algorithm algorithm_;
+
+  // Compression used in this Bloom filter.
+  CompressionStrategy compression_strategy_;
+
+  // The hash pointer points to actual hash class used.
+  std::unique_ptr<Hasher> hasher_;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/bloom_filter_reader.h b/pyarrow/include/parquet/bloom_filter_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbd267dd1972dcde98382dda3c84a6a544ddc3e3
--- /dev/null
+++ b/pyarrow/include/parquet/bloom_filter_reader.h
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/interfaces.h"
+#include "parquet/properties.h"
+#include "parquet/type_fwd.h"
+
+namespace parquet {
+
+class InternalFileDecryptor;
+class BloomFilter;
+
+class PARQUET_EXPORT RowGroupBloomFilterReader {
+ public:
+  virtual ~RowGroupBloomFilterReader() = default;
+
+  /// \brief Read bloom filter of a column chunk.
+  ///
+  /// \param[in] i column ordinal of the column chunk.
+  /// \returns bloom filter of the column or nullptr if it does not exist.
+  /// \throws ParquetException if the index is out of bound, or read bloom
+  /// filter failed.
+  virtual std::unique_ptr<BloomFilter> GetColumnBloomFilter(int i) = 0;
+};
+
+/// \brief Interface for reading the bloom filter for a Parquet file.
+class PARQUET_EXPORT BloomFilterReader {
+ public:
+  virtual ~BloomFilterReader() = default;
+
+  /// \brief Create a BloomFilterReader instance.
+  /// \returns a BloomFilterReader instance.
+  /// WARNING: The returned BloomFilterReader references to all the input parameters, so
+  /// it must not outlive all of the input parameters. Usually these input parameters
+  /// come from the same ParquetFileReader object, so it must not outlive the reader
+  /// that creates this BloomFilterReader.
+  static std::unique_ptr<BloomFilterReader> Make(
+      std::shared_ptr<::arrow::io::RandomAccessFile> input,
+      std::shared_ptr<FileMetaData> file_metadata, const ReaderProperties& properties,
+      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+  /// \brief Get the bloom filter reader of a specific row group.
+  /// \param[in] i row group ordinal to get bloom filter reader.
+  /// \returns RowGroupBloomFilterReader of the specified row group. A nullptr may or may
+  ///          not be returned if the bloom filter for the row group is unavailable. It
+  ///          is the caller's responsibility to check the return value of follow-up calls
+  ///          to the RowGroupBloomFilterReader.
+  /// \throws ParquetException if the index is out of bound.
+  virtual std::shared_ptr<RowGroupBloomFilterReader> RowGroup(int i) = 0;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/column_page.h b/pyarrow/include/parquet/column_page.h
new file mode 100644
index 0000000000000000000000000000000000000000..111265a842ee7e61753a1067e78e80a868936627
--- /dev/null
+++ b/pyarrow/include/parquet/column_page.h
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+
+#include "parquet/size_statistics.h"
+#include "parquet/statistics.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+// TODO: Parallel processing is not yet safe because of memory-ownership
+// semantics (the PageReader may or may not own the memory referenced by a
+// page)
+//
+// TODO(wesm): In the future Parquet implementations may store the crc code
+// in format::PageHeader. parquet-mr currently does not, so we also skip it
+// here, both on the read and write path
+class Page {
+ public:
+  Page(const std::shared_ptr<Buffer>& buffer, PageType::type type)
+      : buffer_(buffer), type_(type) {}
+
+  PageType::type type() const { return type_; }
+
+  std::shared_ptr<Buffer> buffer() const { return buffer_; }
+
+  // @returns: a pointer to the page's data
+  const uint8_t* data() const { return buffer_->data(); }
+
+  // @returns: the total size in bytes of the page's data buffer
+  int32_t size() const { return static_cast<int32_t>(buffer_->size()); }
+
+ private:
+  std::shared_ptr<Buffer> buffer_;
+  PageType::type type_;
+};
+
+/// \brief Base type for DataPageV1 and DataPageV2 including common attributes
+class DataPage : public Page {
+ public:
+  int32_t num_values() const { return num_values_; }
+  Encoding::type encoding() const { return encoding_; }
+  int64_t uncompressed_size() const { return uncompressed_size_; }
+  const EncodedStatistics& statistics() const { return statistics_; }
+  /// Return the row ordinal within the row group to the first row in the data page.
+  /// Currently it is only present from data pages created by ColumnWriter in order
+  /// to collect page index.
+  std::optional<int64_t> first_row_index() const { return first_row_index_; }
+  const SizeStatistics& size_statistics() const { return size_statistics_; }
+
+  virtual ~DataPage() = default;
+
+ protected:
+  DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+           Encoding::type encoding, int64_t uncompressed_size,
+           EncodedStatistics statistics, std::optional<int64_t> first_row_index,
+           SizeStatistics size_statistics)
+      : Page(buffer, type),
+        num_values_(num_values),
+        encoding_(encoding),
+        uncompressed_size_(uncompressed_size),
+        statistics_(std::move(statistics)),
+        first_row_index_(std::move(first_row_index)),
+        size_statistics_(std::move(size_statistics)) {}
+
+  int32_t num_values_;
+  Encoding::type encoding_;
+  int64_t uncompressed_size_;
+  EncodedStatistics statistics_;
+  /// Row ordinal within the row group to the first row in the data page.
+  std::optional<int64_t> first_row_index_;
+  SizeStatistics size_statistics_;
+};
+
+class DataPageV1 : public DataPage {
+ public:
+  DataPageV1(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+             Encoding::type encoding, Encoding::type definition_level_encoding,
+             Encoding::type repetition_level_encoding, int64_t uncompressed_size,
+             EncodedStatistics statistics = EncodedStatistics(),
+             std::optional<int64_t> first_row_index = std::nullopt,
+             SizeStatistics size_statistics = SizeStatistics())
+      : DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
+                 std::move(statistics), std::move(first_row_index),
+                 std::move(size_statistics)),
+        definition_level_encoding_(definition_level_encoding),
+        repetition_level_encoding_(repetition_level_encoding) {}
+
+  Encoding::type repetition_level_encoding() const { return repetition_level_encoding_; }
+
+  Encoding::type definition_level_encoding() const { return definition_level_encoding_; }
+
+ private:
+  Encoding::type definition_level_encoding_;
+  Encoding::type repetition_level_encoding_;
+};
+
+class DataPageV2 : public DataPage {
+ public:
+  DataPageV2(const std::shared_ptr<Buffer>& buffer, int32_t num_values, int32_t num_nulls,
+             int32_t num_rows, Encoding::type encoding,
+             int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
+             int64_t uncompressed_size, bool is_compressed = false,
+             EncodedStatistics statistics = EncodedStatistics(),
+             std::optional<int64_t> first_row_index = std::nullopt,
+             SizeStatistics size_statistics = SizeStatistics())
+      : DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
+                 std::move(statistics), std::move(first_row_index),
+                 std::move(size_statistics)),
+        num_nulls_(num_nulls),
+        num_rows_(num_rows),
+        definition_levels_byte_length_(definition_levels_byte_length),
+        repetition_levels_byte_length_(repetition_levels_byte_length),
+        is_compressed_(is_compressed) {}
+
+  int32_t num_nulls() const { return num_nulls_; }
+
+  int32_t num_rows() const { return num_rows_; }
+
+  int32_t definition_levels_byte_length() const { return definition_levels_byte_length_; }
+
+  int32_t repetition_levels_byte_length() const { return repetition_levels_byte_length_; }
+
+  bool is_compressed() const { return is_compressed_; }
+
+ private:
+  int32_t num_nulls_;
+  int32_t num_rows_;
+  int32_t definition_levels_byte_length_;
+  int32_t repetition_levels_byte_length_;
+  bool is_compressed_;
+};
+
+class DictionaryPage : public Page {
+ public:
+  DictionaryPage(const std::shared_ptr<Buffer>& buffer, int32_t num_values,
+                 Encoding::type encoding, bool is_sorted = false)
+      : Page(buffer, PageType::DICTIONARY_PAGE),
+        num_values_(num_values),
+        encoding_(encoding),
+        is_sorted_(is_sorted) {}
+
+  int32_t num_values() const { return num_values_; }
+
+  Encoding::type encoding() const { return encoding_; }
+
+  bool is_sorted() const { return is_sorted_; }
+
+ private:
+  int32_t num_values_;
+  Encoding::type encoding_;
+  bool is_sorted_;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/column_reader.h b/pyarrow/include/parquet/column_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..ac4469b1904f8cd971010bd5627fef42b3ca6e3b
--- /dev/null
+++ b/pyarrow/include/parquet/column_reader.h
@@ -0,0 +1,458 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/macros.h"
+#include "parquet/exception.h"
+#include "parquet/level_conversion.h"
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+namespace bit_util {
+class BitReader;
+}  // namespace bit_util
+
+namespace util {
+template <typename T>
+class RleBitPackedDecoder;
+}  // namespace util
+
+}  // namespace arrow
+
+namespace parquet {
+
+class Decryptor;
+class Page;
+
+// 16 MB is the default maximum page header size
+static constexpr uint32_t kDefaultMaxPageHeaderSize = 16 * 1024 * 1024;
+
+// 16 KB is the default expected page header size
+static constexpr uint32_t kDefaultPageHeaderSize = 16 * 1024;
+
+// \brief DataPageStats stores encoded statistics and number of values/rows for
+// a page.
+struct PARQUET_EXPORT DataPageStats {
+  DataPageStats(const EncodedStatistics* encoded_statistics, int32_t num_values,
+                std::optional<int32_t> num_rows)
+      : encoded_statistics(encoded_statistics),
+        num_values(num_values),
+        num_rows(num_rows) {}
+
+  // Encoded statistics extracted from the page header.
+  // Nullptr if there are no statistics in the page header.
+  const EncodedStatistics* encoded_statistics;
+  // Number of values stored in the page. Filled for both V1 and V2 data pages.
+  // For repeated fields, this can be greater than number of rows. For
+  // non-repeated fields, this will be the same as the number of rows.
+  int32_t num_values;
+  // Number of rows stored in the page. std::nullopt if not available.
+  std::optional<int32_t> num_rows;
+};
+
+class PARQUET_EXPORT LevelDecoder {
+ public:
+  LevelDecoder();
+  ~LevelDecoder();
+
+  // Initialize the LevelDecoder state with new data
+  // and return the number of bytes consumed
+  int SetData(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+              const uint8_t* data, int32_t data_size);
+
+  void SetDataV2(int32_t num_bytes, int16_t max_level, int num_buffered_values,
+                 const uint8_t* data);
+
+  // Decodes a batch of levels into an array and returns the number of levels decoded
+  int Decode(int batch_size, int16_t* levels);
+
+ private:
+  int bit_width_;
+  int num_values_remaining_;
+  Encoding::type encoding_;
+  std::unique_ptr<::arrow::util::RleBitPackedDecoder<int16_t>> rle_decoder_;
+  std::unique_ptr<::arrow::bit_util::BitReader> bit_packed_decoder_;
+  int16_t max_level_;
+};
+
+struct CryptoContext {
+  bool start_decrypt_with_dictionary_page = false;
+  int16_t row_group_ordinal = -1;
+  int16_t column_ordinal = -1;
+  std::function<std::unique_ptr<Decryptor>()> meta_decryptor_factory;
+  std::function<std::unique_ptr<Decryptor>()> data_decryptor_factory;
+};
+
+// Abstract page iterator interface. This way, we can feed column pages to the
+// ColumnReader through whatever mechanism we choose
+class PARQUET_EXPORT PageReader {
+  using DataPageFilter = std::function<bool(const DataPageStats&)>;
+
+ public:
+  virtual ~PageReader() = default;
+
+  static std::unique_ptr<PageReader> Open(
+      std::shared_ptr<ArrowInputStream> stream, int64_t total_num_values,
+      Compression::type codec, bool always_compressed = false,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+      const CryptoContext* ctx = NULLPTR);
+  static std::unique_ptr<PageReader> Open(std::shared_ptr<ArrowInputStream> stream,
+                                          int64_t total_num_values,
+                                          Compression::type codec,
+                                          const ReaderProperties& properties,
+                                          bool always_compressed = false,
+                                          const CryptoContext* ctx = NULLPTR);
+
+  // If data_page_filter is present (not null), NextPage() will call the
+  // callback function exactly once per page in the order the pages appear in
+  // the column. If the callback function returns true the page will be
+  // skipped. The callback will be called only if the page type is DATA_PAGE or
+  // DATA_PAGE_V2. Dictionary pages will not be skipped.
+  // Caller is responsible for checking that statistics are correct using
+  // ApplicationVersion::HasCorrectStatistics().
+  // \note API EXPERIMENTAL
+  void set_data_page_filter(DataPageFilter data_page_filter) {
+    data_page_filter_ = std::move(data_page_filter);
+  }
+
+  // @returns: shared_ptr<Page>(nullptr) on EOS, std::shared_ptr<Page>
+  // containing new Page otherwise
+  //
+  // The returned Page may contain references that aren't guaranteed to live
+  // beyond the next call to NextPage().
+  virtual std::shared_ptr<Page> NextPage() = 0;
+
+  virtual void set_max_page_header_size(uint32_t size) = 0;
+
+ protected:
+  // Callback that decides if we should skip a page or not.
+  DataPageFilter data_page_filter_;
+};
+
+class PARQUET_EXPORT ColumnReader {
+ public:
+  virtual ~ColumnReader() = default;
+
+  static std::shared_ptr<ColumnReader> Make(
+      const ColumnDescriptor* descr, std::unique_ptr<PageReader> pager,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  // Returns true if there are still values in this column.
+  virtual bool HasNext() = 0;
+
+  virtual Type::type type() const = 0;
+
+  virtual const ColumnDescriptor* descr() const = 0;
+
+  // Get the encoding that can be exposed by this reader. If it returns
+  // dictionary encoding, then ReadBatchWithDictionary can be used to read data.
+  //
+  // \note API EXPERIMENTAL
+  virtual ExposedEncoding GetExposedEncoding() = 0;
+
+ protected:
+  friend class RowGroupReader;
+  // Set the encoding that can be exposed by this reader.
+  //
+  // \note API EXPERIMENTAL
+  virtual void SetExposedEncoding(ExposedEncoding encoding) = 0;
+};
+
+// API to read values from a single column. This is a main client facing API.
+template <typename DType>
+class TypedColumnReader : public ColumnReader {
+ public:
+  using T = typename DType::c_type;
+
+  // Read a batch of repetition levels, definition levels, and values from the
+  // column.
+  //
+  // Since null values are not stored in the values, the number of values read
+  // may be less than the number of repetition and definition levels. With
+  // nested data this is almost certainly true.
+  //
+  // Set def_levels or rep_levels to nullptr if you want to skip reading them.
+  // This is only safe if you know through some other source that there are no
+  // undefined values.
+  //
+  // To fully exhaust a row group, you must read batches until the number of
+  // values read reaches the number of stored values according to the metadata.
+  //
+  // This API is the same for both V1 and V2 of the DataPage
+  //
+  // @returns: actual number of levels read (see values_read for number of values read)
+  virtual int64_t ReadBatch(int64_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+                            T* values, int64_t* values_read) = 0;
+
+  // Skip reading values. This method will work for both repeated and
+  // non-repeated fields. Note that this method is skipping values and not
+  // records. This distinction is important for repeated fields, meaning that
+  // we are not skipping over the values to the next record. For example,
+  // consider the following two consecutive records containing one repeated field:
+  // {[1, 2, 3]}, {[4, 5]}. If we Skip(2), our next read value will be 3, which
+  // is inside the first record.
+  // Returns the number of values skipped.
+  virtual int64_t Skip(int64_t num_values_to_skip) = 0;
+
+  // Read a batch of repetition levels, definition levels, and indices from the
+  // column. And read the dictionary if a dictionary page is encountered during
+  // reading pages. This API is similar to ReadBatch(), with ability to read
+  // dictionary and indices. It is only valid to call this method  when the reader can
+  // expose dictionary encoding. (i.e., the reader's GetExposedEncoding() returns
+  // DICTIONARY).
+  //
+  // The dictionary is read along with the data page. When there's no data page,
+  // the dictionary won't be returned.
+  //
+  // @param batch_size The batch size to read
+  // @param[out] def_levels The Parquet definition levels.
+  // @param[out] rep_levels The Parquet repetition levels.
+  // @param[out] indices The dictionary indices.
+  // @param[out] indices_read The number of indices read.
+  // @param[out] dict The pointer to dictionary values. It will return nullptr if
+  // there's no data page. Each column chunk only has one dictionary page. The dictionary
+  // is owned by the reader, so the caller is responsible for copying the dictionary
+  // values before the reader gets destroyed.
+  // @param[out] dict_len The dictionary length. It will return 0 if there's no data
+  // page.
+  // @returns: actual number of levels read (see indices_read for number of
+  // indices read
+  //
+  // \note API EXPERIMENTAL
+  virtual int64_t ReadBatchWithDictionary(int64_t batch_size, int16_t* def_levels,
+                                          int16_t* rep_levels, int32_t* indices,
+                                          int64_t* indices_read, const T** dict,
+                                          int32_t* dict_len) = 0;
+};
+
+namespace internal {
+
+/// \brief Stateful column reader that delimits semantic records for both flat
+/// and nested columns
+///
+/// \note API EXPERIMENTAL
+/// \since 1.3.0
+class PARQUET_EXPORT RecordReader {
+ public:
+  /// \brief Creates a record reader.
+  /// @param descr Column descriptor
+  /// @param leaf_info Level info, used to determine if a column is nullable or not
+  /// @param pool Memory pool to use for buffering values and rep/def levels
+  /// @param read_dictionary True if reading directly as Arrow dictionary-encoded
+  /// @param read_dense_for_nullable True if reading dense and not leaving space for null
+  /// values
+  /// @param arrow_type Which type to read this column as (optional). Currently
+  /// only used for byte array columns (see BinaryRecordReader::GetBuilderChunks).
+  static std::shared_ptr<RecordReader> Make(
+      const ColumnDescriptor* descr, LevelInfo leaf_info,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+      bool read_dictionary = false, bool read_dense_for_nullable = false,
+      const std::shared_ptr<::arrow::DataType>& arrow_type = NULLPTR);
+
+  virtual ~RecordReader() = default;
+
+  /// \brief Attempt to read indicated number of records from column chunk
+  /// Note that for repeated fields, a record may have more than one value
+  /// and all of them are read. If read_dense_for_nullable() it will
+  /// not leave any space for null values. Otherwise, it will read spaced.
+  /// \return number of records read
+  virtual int64_t ReadRecords(int64_t num_records) = 0;
+
+  /// \brief Attempt to skip indicated number of records from column chunk.
+  /// Note that for repeated fields, a record may have more than one value
+  /// and all of them are skipped.
+  /// \return number of records skipped
+  virtual int64_t SkipRecords(int64_t num_records) = 0;
+
+  /// \brief Pre-allocate space for data. Results in better flat read performance
+  virtual void Reserve(int64_t num_values) = 0;
+
+  /// \brief Clear consumed values and repetition/definition levels as the
+  /// result of calling ReadRecords
+  /// For FLBA and ByteArray types, call GetBuilderChunks() to reset them.
+  virtual void Reset() = 0;
+
+  /// \brief Transfer filled values buffer to caller. A new one will be
+  /// allocated in subsequent ReadRecords calls
+  virtual std::shared_ptr<ResizableBuffer> ReleaseValues() = 0;
+
+  /// \brief Transfer filled validity bitmap buffer to caller. A new one will
+  /// be allocated in subsequent ReadRecords calls
+  virtual std::shared_ptr<ResizableBuffer> ReleaseIsValid() = 0;
+
+  /// \brief Return true if the record reader has more internal data yet to
+  /// process
+  virtual bool HasMoreData() const = 0;
+
+  /// \brief Advance record reader to the next row group. Must be set before
+  /// any records could be read/skipped.
+  /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader
+  virtual void SetPageReader(std::unique_ptr<PageReader> reader) = 0;
+
+  /// \brief Returns the underlying column reader's descriptor.
+  virtual const ColumnDescriptor* descr() const = 0;
+
+  virtual void DebugPrintState() = 0;
+
+  /// \brief Returns the dictionary owned by the current decoder. Throws an
+  /// exception if the current decoder is not for dictionary encoding. The caller is
+  /// responsible for casting the returned pointer to proper type depending on the
+  /// column's physical type. An example:
+  ///   const ByteArray* dict = reinterpret_cast<const ByteArray*>(ReadDictionary(&len));
+  /// or:
+  ///   const float* dict = reinterpret_cast<const float*>(ReadDictionary(&len));
+  /// \param[out] dictionary_length The number of dictionary entries.
+  virtual const void* ReadDictionary(int32_t* dictionary_length) = 0;
+
+  /// \brief Decoded definition levels
+  int16_t* def_levels() const {
+    return reinterpret_cast<int16_t*>(def_levels_->mutable_data());
+  }
+
+  /// \brief Decoded repetition levels
+  int16_t* rep_levels() const {
+    return reinterpret_cast<int16_t*>(rep_levels_->mutable_data());
+  }
+
+  /// \brief Decoded values, including nulls, if any
+  /// FLBA and ByteArray types do not use this array and read into their own
+  /// builders.
+  uint8_t* values() const { return values_->mutable_data(); }
+
+  /// \brief Number of values written, including space left for nulls if any.
+  /// If this Reader was constructed with read_dense_for_nullable(), there is no space for
+  /// nulls and null_count() will be 0. There is no read-ahead/buffering for values. For
+  /// FLBA and ByteArray types this value reflects the values written with the last
+  /// ReadRecords call since those readers will reset the values after each call.
+  int64_t values_written() const { return values_written_; }
+
+  /// \brief Number of definition / repetition levels (from those that have
+  /// been decoded) that have been consumed inside the reader.
+  int64_t levels_position() const { return levels_position_; }
+
+  /// \brief Number of definition / repetition levels that have been written
+  /// internally in the reader. This may be larger than values_written() because
+  /// for repeated fields we need to look at the levels in advance to figure out
+  /// the record boundaries.
+  int64_t levels_written() const { return levels_written_; }
+
+  /// \brief Number of nulls in the leaf that we have read so far into the
+  /// values vector. This is only valid when !read_dense_for_nullable(). When
+  /// read_dense_for_nullable() it will always be 0.
+  int64_t null_count() const { return null_count_; }
+
+  /// \brief True if the leaf values are nullable
+  bool nullable_values() const { return nullable_values_; }
+
+  /// \brief True if reading directly as Arrow dictionary-encoded
+  bool read_dictionary() const { return read_dictionary_; }
+
+  /// \brief True if reading dense for nullable columns.
+  bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
+
+ protected:
+  /// \brief Indicates if we can have nullable values. Note that repeated fields
+  /// may or may not be nullable.
+  bool nullable_values_;
+
+  bool at_record_start_;
+  int64_t records_read_;
+
+  /// \brief Stores values. These values are populated based on each ReadRecords
+  /// call. No extra values are buffered for the next call. SkipRecords will not
+  /// add any value to this buffer.
+  std::shared_ptr<::arrow::ResizableBuffer> values_;
+  /// \brief False for FIXED_LEN_BYTE_ARRAY and BYTE_ARRAY, in which case we
+  /// don't allocate the values buffer and we directly read into builder classes.
+  bool uses_values_;
+
+  /// \brief Values that we have read into 'values_' + 'null_count_'.
+  int64_t values_written_;
+  int64_t values_capacity_;
+  int64_t null_count_;
+
+  /// \brief Each bit corresponds to one element in 'values_' and specifies if it
+  /// is null or not null.
+  ///
+  /// Not set if leaf type is not nullable or read_dense_for_nullable_ is true.
+  std::shared_ptr<::arrow::ResizableBuffer> valid_bits_;
+
+  /// \brief Buffer for definition levels. May contain more levels than
+  /// is actually read. This is because we read levels ahead to
+  /// figure out record boundaries for repeated fields.
+  /// For flat required fields, 'def_levels_' and 'rep_levels_' are not
+  ///  populated. For non-repeated fields 'rep_levels_' is not populated.
+  /// 'def_levels_' and 'rep_levels_' must be of the same size if present.
+  std::shared_ptr<::arrow::ResizableBuffer> def_levels_;
+  /// \brief Buffer for repetition levels. Only populated for repeated
+  /// fields.
+  std::shared_ptr<::arrow::ResizableBuffer> rep_levels_;
+
+  /// \brief Number of definition / repetition levels that have been written
+  /// internally in the reader. This may be larger than values_written() since
+  /// for repeated fields we need to look at the levels in advance to figure out
+  /// the record boundaries.
+  int64_t levels_written_;
+  /// \brief Position of the next level that should be consumed.
+  int64_t levels_position_;
+  int64_t levels_capacity_;
+
+  bool read_dictionary_ = false;
+  // If true, we will not leave any space for the null values in the values_
+  // vector or fill nulls values in BinaryRecordReader/DictionaryRecordReader.
+  //
+  // If read_dense_for_nullable_ is true, the BinaryRecordReader/DictionaryRecordReader
+  // might still populate the validity bitmap buffer.
+  bool read_dense_for_nullable_ = false;
+};
+
+class BinaryRecordReader : virtual public RecordReader {
+ public:
+  virtual std::vector<std::shared_ptr<::arrow::Array>> GetBuilderChunks() = 0;
+};
+
+/// \brief Read records directly to dictionary-encoded Arrow form (int32
+/// indices). Only valid for BYTE_ARRAY columns
+class DictionaryRecordReader : virtual public RecordReader {
+ public:
+  virtual std::shared_ptr<::arrow::ChunkedArray> GetResult() = 0;
+};
+
+}  // namespace internal
+
+using BoolReader = TypedColumnReader<BooleanType>;
+using Int32Reader = TypedColumnReader<Int32Type>;
+using Int64Reader = TypedColumnReader<Int64Type>;
+using Int96Reader = TypedColumnReader<Int96Type>;
+using FloatReader = TypedColumnReader<FloatType>;
+using DoubleReader = TypedColumnReader<DoubleType>;
+using ByteArrayReader = TypedColumnReader<ByteArrayType>;
+using FixedLenByteArrayReader = TypedColumnReader<FLBAType>;
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/column_scanner.h b/pyarrow/include/parquet/column_scanner.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9953866fab22ee6db13a92578f85556ea6f99ba
--- /dev/null
+++ b/pyarrow/include/parquet/column_scanner.h
@@ -0,0 +1,264 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <stdio.h>
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "parquet/column_reader.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr int64_t DEFAULT_SCANNER_BATCH_SIZE = 128;
+
+class PARQUET_EXPORT Scanner {
+ public:
+  explicit Scanner(std::shared_ptr<ColumnReader> reader,
+                   int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+                   ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+      : batch_size_(batch_size),
+        level_offset_(0),
+        levels_buffered_(0),
+        value_buffer_(AllocateBuffer(pool)),
+        value_offset_(0),
+        values_buffered_(0),
+        reader_(std::move(reader)) {
+    def_levels_.resize(
+        descr()->max_definition_level() > 0 ? static_cast<size_t>(batch_size_) : 0);
+    rep_levels_.resize(
+        descr()->max_repetition_level() > 0 ? static_cast<size_t>(batch_size_) : 0);
+  }
+
+  virtual ~Scanner() {}
+
+  static std::shared_ptr<Scanner> Make(
+      std::shared_ptr<ColumnReader> col_reader,
+      int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) = 0;
+
+  bool HasNext() { return level_offset_ < levels_buffered_ || reader_->HasNext(); }
+
+  const ColumnDescriptor* descr() const { return reader_->descr(); }
+
+  int64_t batch_size() const { return batch_size_; }
+
+  void SetBatchSize(int64_t batch_size) { batch_size_ = batch_size; }
+
+ protected:
+  int64_t batch_size_;
+
+  std::vector<int16_t> def_levels_;
+  std::vector<int16_t> rep_levels_;
+  int level_offset_;
+  int levels_buffered_;
+
+  std::shared_ptr<ResizableBuffer> value_buffer_;
+  int value_offset_;
+  int64_t values_buffered_;
+  std::shared_ptr<ColumnReader> reader_;
+};
+
+template <typename DType>
+class PARQUET_TEMPLATE_CLASS_EXPORT TypedScanner : public Scanner {
+ public:
+  typedef typename DType::c_type T;
+
+  explicit TypedScanner(std::shared_ptr<ColumnReader> reader,
+                        int64_t batch_size = DEFAULT_SCANNER_BATCH_SIZE,
+                        ::arrow::MemoryPool* pool = ::arrow::default_memory_pool())
+      : Scanner(std::move(reader), batch_size, pool) {
+    typed_reader_ = static_cast<TypedColumnReader<DType>*>(reader_.get());
+    int value_byte_size = type_traits<DType::type_num>::value_byte_size;
+    PARQUET_THROW_NOT_OK(value_buffer_->Resize(batch_size_ * value_byte_size));
+    values_ = reinterpret_cast<T*>(value_buffer_->mutable_data());
+  }
+
+  virtual ~TypedScanner() {}
+
+  bool NextLevels(int16_t* def_level, int16_t* rep_level) {
+    if (level_offset_ == levels_buffered_) {
+      levels_buffered_ = static_cast<int>(
+          typed_reader_->ReadBatch(static_cast<int>(batch_size_), def_levels_.data(),
+                                   rep_levels_.data(), values_, &values_buffered_));
+
+      value_offset_ = 0;
+      level_offset_ = 0;
+      if (!levels_buffered_) {
+        return false;
+      }
+    }
+    *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
+    *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
+    level_offset_++;
+    return true;
+  }
+
+  bool Next(T* val, int16_t* def_level, int16_t* rep_level, bool* is_null) {
+    if (level_offset_ == levels_buffered_) {
+      if (!HasNext()) {
+        // Out of data pages
+        return false;
+      }
+    }
+
+    NextLevels(def_level, rep_level);
+    *is_null = *def_level < descr()->max_definition_level();
+
+    if (*is_null) {
+      return true;
+    }
+
+    if (value_offset_ == values_buffered_) {
+      throw ParquetException("Value was non-null, but has not been buffered");
+    }
+    *val = values_[value_offset_++];
+    return true;
+  }
+
+  // Returns true if there is a next value
+  bool NextValue(T* val, bool* is_null) {
+    if (level_offset_ == levels_buffered_) {
+      if (!HasNext()) {
+        // Out of data pages
+        return false;
+      }
+    }
+
+    // Out of values
+    int16_t def_level = -1;
+    int16_t rep_level = -1;
+    NextLevels(&def_level, &rep_level);
+    *is_null = def_level < descr()->max_definition_level();
+
+    if (*is_null) {
+      return true;
+    }
+
+    if (value_offset_ == values_buffered_) {
+      throw ParquetException("Value was non-null, but has not been buffered");
+    }
+    *val = values_[value_offset_++];
+    return true;
+  }
+
+  virtual void PrintNext(std::ostream& out, int width, bool with_levels = false) {
+    T val{};
+    int16_t def_level = -1;
+    int16_t rep_level = -1;
+    bool is_null = false;
+    char buffer[80];
+
+    if (!Next(&val, &def_level, &rep_level, &is_null)) {
+      throw ParquetException("No more values buffered");
+    }
+
+    if (with_levels) {
+      out << "  D:" << def_level << " R:" << rep_level << " ";
+      if (!is_null) {
+        out << "V:";
+      }
+    }
+
+    if (is_null) {
+      std::string null_fmt = format_fwf<ByteArrayType>(width);
+      snprintf(buffer, sizeof(buffer), null_fmt.c_str(), "NULL");
+    } else {
+      FormatValue(&val, buffer, sizeof(buffer), width);
+    }
+    out << buffer;
+  }
+
+ private:
+  // The ownership of this object is expressed through the reader_ variable in the base
+  TypedColumnReader<DType>* typed_reader_;
+
+  inline void FormatValue(void* val, char* buffer, int bufsize, int width);
+
+  T* values_;
+};
+
+template <typename DType>
+inline void TypedScanner<DType>::FormatValue(void* val, char* buffer, int bufsize,
+                                             int width) {
+  std::string fmt = format_fwf<DType>(width);
+  snprintf(buffer, bufsize, fmt.c_str(), *reinterpret_cast<T*>(val));
+}
+
+template <>
+inline void TypedScanner<Int96Type>::FormatValue(void* val, char* buffer, int bufsize,
+                                                 int width) {
+  std::string fmt = format_fwf<Int96Type>(width);
+  std::string result = Int96ToString(*reinterpret_cast<Int96*>(val));
+  snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<ByteArrayType>::FormatValue(void* val, char* buffer, int bufsize,
+                                                     int width) {
+  std::string fmt = format_fwf<ByteArrayType>(width);
+  std::string result = ByteArrayToString(*reinterpret_cast<ByteArray*>(val));
+  snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+template <>
+inline void TypedScanner<FLBAType>::FormatValue(void* val, char* buffer, int bufsize,
+                                                int width) {
+  std::string fmt = format_fwf<FLBAType>(width);
+  std::string result = FixedLenByteArrayToString(
+      *reinterpret_cast<FixedLenByteArray*>(val), descr()->type_length());
+  snprintf(buffer, bufsize, fmt.c_str(), result.c_str());
+}
+
+typedef TypedScanner<BooleanType> BoolScanner;
+typedef TypedScanner<Int32Type> Int32Scanner;
+typedef TypedScanner<Int64Type> Int64Scanner;
+typedef TypedScanner<Int96Type> Int96Scanner;
+typedef TypedScanner<FloatType> FloatScanner;
+typedef TypedScanner<DoubleType> DoubleScanner;
+typedef TypedScanner<ByteArrayType> ByteArrayScanner;
+typedef TypedScanner<FLBAType> FixedLenByteArrayScanner;
+
+template <typename RType>
+int64_t ScanAll(int32_t batch_size, int16_t* def_levels, int16_t* rep_levels,
+                uint8_t* values, int64_t* values_buffered,
+                parquet::ColumnReader* reader) {
+  typedef typename RType::T Type;
+  auto typed_reader = static_cast<RType*>(reader);
+  auto vals = reinterpret_cast<Type*>(&values[0]);
+  return typed_reader->ReadBatch(batch_size, def_levels, rep_levels, vals,
+                                 values_buffered);
+}
+
+int64_t PARQUET_EXPORT ScanAllValues(int32_t batch_size, int16_t* def_levels,
+                                     int16_t* rep_levels, uint8_t* values,
+                                     int64_t* values_buffered,
+                                     parquet::ColumnReader* reader);
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/column_writer.h b/pyarrow/include/parquet/column_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b56eb010a242c4b32f51f39bc4d1f21dbe054d8
--- /dev/null
+++ b/pyarrow/include/parquet/column_writer.h
@@ -0,0 +1,307 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/compression.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+
+namespace bit_util {
+class BitWriter;
+}  // namespace bit_util
+
+namespace util {
+class RleBitPackedEncoder;
+class CodecOptions;
+}  // namespace util
+
+}  // namespace arrow
+
+namespace parquet {
+
+struct ArrowWriteContext;
+class ColumnChunkMetaDataBuilder;
+class ColumnDescriptor;
+class ColumnIndexBuilder;
+class DataPage;
+class DictionaryPage;
+class Encryptor;
+class OffsetIndexBuilder;
+class WriterProperties;
+
+class PARQUET_EXPORT LevelEncoder {
+ public:
+  LevelEncoder();
+  ~LevelEncoder();
+
+  static int MaxBufferSize(Encoding::type encoding, int16_t max_level,
+                           int num_buffered_values);
+
+  // Initialize the LevelEncoder.
+  void Init(Encoding::type encoding, int16_t max_level, int num_buffered_values,
+            uint8_t* data, int data_size);
+
+  // Encodes a batch of levels from an array and returns the number of levels encoded
+  int Encode(int batch_size, const int16_t* levels);
+
+  int32_t len() {
+    if (encoding_ != Encoding::RLE) {
+      throw ParquetException("Only implemented for RLE encoding");
+    }
+    return rle_length_;
+  }
+
+ private:
+  int bit_width_;
+  int rle_length_;
+  Encoding::type encoding_;
+  std::unique_ptr<::arrow::util::RleBitPackedEncoder> rle_encoder_;
+  std::unique_ptr<::arrow::bit_util::BitWriter> bit_packed_encoder_;
+};
+
+class PARQUET_EXPORT PageWriter {
+ public:
+  virtual ~PageWriter() {}
+
+  static std::unique_ptr<PageWriter> Open(
+      std::shared_ptr<ArrowOutputStream> sink, Compression::type codec,
+      ColumnChunkMetaDataBuilder* metadata, int16_t row_group_ordinal = -1,
+      int16_t column_chunk_ordinal = -1,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(),
+      bool buffered_row_group = false,
+      std::shared_ptr<Encryptor> header_encryptor = NULLPTR,
+      std::shared_ptr<Encryptor> data_encryptor = NULLPTR,
+      bool page_write_checksum_enabled = false,
+      // column_index_builder MUST outlive the PageWriter
+      ColumnIndexBuilder* column_index_builder = NULLPTR,
+      // offset_index_builder MUST outlive the PageWriter
+      OffsetIndexBuilder* offset_index_builder = NULLPTR,
+      const CodecOptions& codec_options = CodecOptions{});
+
+  // The Column Writer decides if dictionary encoding is used if set and
+  // if the dictionary encoding has fallen back to default encoding on reaching dictionary
+  // page limit
+  virtual void Close(bool has_dictionary, bool fallback) = 0;
+
+  // Return the number of uncompressed bytes written (including header size)
+  virtual int64_t WriteDataPage(const DataPage& page) = 0;
+
+  // Return the number of uncompressed bytes written (including header size)
+  virtual int64_t WriteDictionaryPage(const DictionaryPage& page) = 0;
+
+  /// \brief The total number of bytes written as serialized data and
+  /// dictionary pages to the sink so far.
+  virtual int64_t total_compressed_bytes_written() const = 0;
+
+  virtual bool has_compressor() = 0;
+
+  virtual void Compress(const Buffer& src_buffer, ResizableBuffer* dest_buffer) = 0;
+};
+
+class PARQUET_EXPORT ColumnWriter {
+ public:
+  virtual ~ColumnWriter() = default;
+
+  static std::shared_ptr<ColumnWriter> Make(ColumnChunkMetaDataBuilder*,
+                                            std::unique_ptr<PageWriter>,
+                                            const WriterProperties* properties);
+
+  /// \brief Closes the ColumnWriter, commits any buffered values to pages.
+  /// \return Total size of the column in bytes
+  virtual int64_t Close() = 0;
+
+  /// \brief The physical Parquet type of the column
+  virtual Type::type type() const = 0;
+
+  /// \brief The schema for the column
+  virtual const ColumnDescriptor* descr() const = 0;
+
+  /// \brief The number of rows written so far
+  virtual int64_t rows_written() const = 0;
+
+  /// \brief The total size of the compressed pages + page headers. Values
+  /// are still buffered and not written to a pager yet
+  ///
+  /// So in un-buffered mode, it always returns 0
+  virtual int64_t total_compressed_bytes() const = 0;
+
+  /// \brief The total number of bytes written as serialized data and
+  /// dictionary pages to the ColumnChunk so far
+  /// These bytes are uncompressed bytes.
+  virtual int64_t total_bytes_written() const = 0;
+
+  /// \brief The total number of bytes written as serialized data and
+  /// dictionary pages to the ColumnChunk so far.
+  /// If the column is uncompressed, the value would be equal to
+  /// total_bytes_written().
+  virtual int64_t total_compressed_bytes_written() const = 0;
+
+  /// \brief Estimated size of the values that are not written to a page yet.
+  virtual int64_t estimated_buffered_value_bytes() const = 0;
+
+  /// \brief The file-level writer properties
+  virtual const WriterProperties* properties() = 0;
+
+  /// \brief Add key-value metadata to the ColumnChunk.
+  /// \param[in] key_value_metadata the metadata to add.
+  /// \note This will overwrite any existing metadata with the same key.
+  /// \throw ParquetException if Close() has been called.
+  virtual void AddKeyValueMetadata(
+      const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata) = 0;
+
+  /// \brief Reset the ColumnChunk key-value metadata.
+  /// \throw ParquetException if Close() has been called.
+  virtual void ResetKeyValueMetadata() = 0;
+
+  /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns
+  /// error status if the array data type is not compatible with the concrete
+  /// writer type.
+  ///
+  /// leaf_array is always a primitive (possibly dictionary encoded type).
+  /// Leaf_field_nullable indicates whether the leaf array is considered nullable
+  /// according to its schema in a Table or its parent array.
+  virtual ::arrow::Status WriteArrow(const int16_t* def_levels, const int16_t* rep_levels,
+                                     int64_t num_levels, const ::arrow::Array& leaf_array,
+                                     ArrowWriteContext* ctx,
+                                     bool leaf_field_nullable) = 0;
+};
+
+// API to write values to a single column. This is the main client facing API.
+template <typename DType>
+class TypedColumnWriter : public ColumnWriter {
+ public:
+  using T = typename DType::c_type;
+
+  // Write a batch of repetition levels, definition levels, and values to the
+  // column.
+  // `num_values` is the number of logical leaf values.
+  // `def_levels` (resp. `rep_levels`) can be null if the column's max definition level
+  // (resp. max repetition level) is 0.
+  // If not null, each of `def_levels` and `rep_levels` must have at least
+  // `num_values`.
+  //
+  // The number of physical values written (taken from `values`) is returned.
+  // It can be smaller than `num_values` is there are some undefined values.
+  virtual int64_t WriteBatch(int64_t num_values, const int16_t* def_levels,
+                             const int16_t* rep_levels, const T* values) = 0;
+
+  /// Write a batch of repetition levels, definition levels, and values to the
+  /// column.
+  ///
+  /// In comparison to WriteBatch the length of repetition and definition levels
+  /// is the same as of the number of values read for max_definition_level == 1.
+  /// In the case of max_definition_level > 1, the repetition and definition
+  /// levels are larger than the values but the values include the null entries
+  /// with definition_level == (max_definition_level - 1). Thus we have to differentiate
+  /// in the parameters of this function if the input has the length of num_values or the
+  /// _number of rows in the lowest nesting level_.
+  ///
+  /// In the case that the most inner node in the Parquet is required, the _number of rows
+  /// in the lowest nesting level_ is equal to the number of non-null values. If the
+  /// inner-most schema node is optional, the _number of rows in the lowest nesting level_
+  /// also includes all values with definition_level == (max_definition_level - 1).
+  ///
+  /// @param num_values number of levels to write.
+  /// @param def_levels The Parquet definition levels, length is num_values
+  /// @param rep_levels The Parquet repetition levels, length is num_values
+  /// @param valid_bits Bitmap that indicates if the row is null on the lowest nesting
+  ///   level. The length is number of rows in the lowest nesting level.
+  /// @param valid_bits_offset The offset in bits of the valid_bits where the
+  ///   first relevant bit resides.
+  /// @param values The values in the lowest nested level including
+  ///   spacing for nulls on the lowest levels; input has the length
+  ///   of the number of rows on the lowest nesting level.
+  virtual void WriteBatchSpaced(int64_t num_values, const int16_t* def_levels,
+                                const int16_t* rep_levels, const uint8_t* valid_bits,
+                                int64_t valid_bits_offset, const T* values) = 0;
+};
+
+using BoolWriter = TypedColumnWriter<BooleanType>;
+using Int32Writer = TypedColumnWriter<Int32Type>;
+using Int64Writer = TypedColumnWriter<Int64Type>;
+using Int96Writer = TypedColumnWriter<Int96Type>;
+using FloatWriter = TypedColumnWriter<FloatType>;
+using DoubleWriter = TypedColumnWriter<DoubleType>;
+using ByteArrayWriter = TypedColumnWriter<ByteArrayType>;
+using FixedLenByteArrayWriter = TypedColumnWriter<FLBAType>;
+
+namespace internal {
+
+/**
+ * Timestamp conversion constants
+ */
+constexpr int64_t kJulianEpochOffsetDays = INT64_C(2440588);
+
+template <int64_t UnitPerDay, int64_t NanosecondsPerUnit>
+inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_timestamp) {
+  auto julian_days = static_cast<int32_t>(time / UnitPerDay + kJulianEpochOffsetDays);
+  int64_t last_day_units = time % UnitPerDay;
+  if (last_day_units < 0) {
+    --julian_days;
+    last_day_units += UnitPerDay;
+  }
+  impala_timestamp->value[2] = static_cast<uint32_t>(julian_days);
+  uint64_t last_day_nanos = static_cast<uint64_t>(last_day_units) * NanosecondsPerUnit;
+  // impala_timestamp will be unaligned every other entry so do memcpy instead
+  // of assign and reinterpret cast to avoid undefined behavior.
+  std::memcpy(impala_timestamp, &last_day_nanos, sizeof(uint64_t));
+}
+
+constexpr int64_t kSecondsInNanos = INT64_C(1000000000);
+
+inline void SecondsToImpalaTimestamp(const int64_t seconds, Int96* impala_timestamp) {
+  ArrowTimestampToImpalaTimestamp<kSecondsPerDay, kSecondsInNanos>(seconds,
+                                                                   impala_timestamp);
+}
+
+constexpr int64_t kMillisecondsInNanos = kSecondsInNanos / INT64_C(1000);
+
+inline void MillisecondsToImpalaTimestamp(const int64_t milliseconds,
+                                          Int96* impala_timestamp) {
+  ArrowTimestampToImpalaTimestamp<kMillisecondsPerDay, kMillisecondsInNanos>(
+      milliseconds, impala_timestamp);
+}
+
+constexpr int64_t kMicrosecondsInNanos = kMillisecondsInNanos / INT64_C(1000);
+
+inline void MicrosecondsToImpalaTimestamp(const int64_t microseconds,
+                                          Int96* impala_timestamp) {
+  ArrowTimestampToImpalaTimestamp<kMicrosecondsPerDay, kMicrosecondsInNanos>(
+      microseconds, impala_timestamp);
+}
+
+constexpr int64_t kNanosecondsInNanos = INT64_C(1);
+
+inline void NanosecondsToImpalaTimestamp(const int64_t nanoseconds,
+                                         Int96* impala_timestamp) {
+  ArrowTimestampToImpalaTimestamp<kNanosecondsPerDay, kNanosecondsInNanos>(
+      nanoseconds, impala_timestamp);
+}
+
+}  // namespace internal
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/encoding.h b/pyarrow/include/parquet/encoding.h
new file mode 100644
index 0000000000000000000000000000000000000000..d80bf0edcae41cf213e497abdf07ad2d6e980ec0
--- /dev/null
+++ b/pyarrow/include/parquet/encoding.h
@@ -0,0 +1,458 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "arrow/type_fwd.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+template <typename T>
+class Dictionary32Builder;
+}
+
+namespace parquet {
+
+template <typename DType>
+class TypedEncoder;
+
+using BooleanEncoder = TypedEncoder<BooleanType>;
+using Int32Encoder = TypedEncoder<Int32Type>;
+using Int64Encoder = TypedEncoder<Int64Type>;
+using Int96Encoder = TypedEncoder<Int96Type>;
+using FloatEncoder = TypedEncoder<FloatType>;
+using DoubleEncoder = TypedEncoder<DoubleType>;
+using ByteArrayEncoder = TypedEncoder<ByteArrayType>;
+using FLBAEncoder = TypedEncoder<FLBAType>;
+
+template <typename DType>
+class TypedDecoder;
+
+class BooleanDecoder;
+using Int32Decoder = TypedDecoder<Int32Type>;
+using Int64Decoder = TypedDecoder<Int64Type>;
+using Int96Decoder = TypedDecoder<Int96Type>;
+using FloatDecoder = TypedDecoder<FloatType>;
+using DoubleDecoder = TypedDecoder<DoubleType>;
+using ByteArrayDecoder = TypedDecoder<ByteArrayType>;
+class FLBADecoder;
+
+template <typename T>
+struct EncodingTraits;
+
+template <>
+struct EncodingTraits<BooleanType> {
+  using Encoder = BooleanEncoder;
+  using Decoder = BooleanDecoder;
+
+  using ArrowType = ::arrow::BooleanType;
+  using Accumulator = ::arrow::BooleanBuilder;
+  struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<Int32Type> {
+  using Encoder = Int32Encoder;
+  using Decoder = Int32Decoder;
+
+  using ArrowType = ::arrow::Int32Type;
+  using Accumulator = ::arrow::NumericBuilder<::arrow::Int32Type>;
+  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int32Type>;
+};
+
+template <>
+struct EncodingTraits<Int64Type> {
+  using Encoder = Int64Encoder;
+  using Decoder = Int64Decoder;
+
+  using ArrowType = ::arrow::Int64Type;
+  using Accumulator = ::arrow::NumericBuilder<::arrow::Int64Type>;
+  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::Int64Type>;
+};
+
+template <>
+struct EncodingTraits<Int96Type> {
+  using Encoder = Int96Encoder;
+  using Decoder = Int96Decoder;
+
+  struct Accumulator {};
+  struct DictAccumulator {};
+};
+
+template <>
+struct EncodingTraits<FloatType> {
+  using Encoder = FloatEncoder;
+  using Decoder = FloatDecoder;
+
+  using ArrowType = ::arrow::FloatType;
+  using Accumulator = ::arrow::NumericBuilder<::arrow::FloatType>;
+  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FloatType>;
+};
+
+template <>
+struct EncodingTraits<DoubleType> {
+  using Encoder = DoubleEncoder;
+  using Decoder = DoubleDecoder;
+
+  using ArrowType = ::arrow::DoubleType;
+  using Accumulator = ::arrow::NumericBuilder<::arrow::DoubleType>;
+  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::DoubleType>;
+};
+
+template <>
+struct EncodingTraits<ByteArrayType> {
+  using Encoder = ByteArrayEncoder;
+  using Decoder = ByteArrayDecoder;
+
+  /// \brief Internal helper class for decoding BYTE_ARRAY data
+  ///
+  /// This class allows the caller to choose the concrete Arrow data type
+  /// by passing a corresponding `ArrayBuilder`.
+  /// Supported `ArrayBuilder` classes are `BinaryBuilder`, `LargeBinaryBuilder`
+  /// and `BinaryViewBuilder`.
+  /// If the builder is a `BinaryBuilder`, `chunks` can accumulate several
+  /// arrays as needed to work around the 32-bit offset limit.
+  struct Accumulator {
+    std::unique_ptr<::arrow::ArrayBuilder> builder;
+    std::vector<std::shared_ptr<::arrow::Array>> chunks;
+  };
+  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::BinaryType>;
+};
+
+template <>
+struct EncodingTraits<FLBAType> {
+  using Encoder = FLBAEncoder;
+  using Decoder = FLBADecoder;
+
+  using ArrowType = ::arrow::FixedSizeBinaryType;
+  using Accumulator = ::arrow::FixedSizeBinaryBuilder;
+  using DictAccumulator = ::arrow::Dictionary32Builder<::arrow::FixedSizeBinaryType>;
+};
+
+class ColumnDescriptor;
+
+// Untyped base for all encoders
+class Encoder {
+ public:
+  virtual ~Encoder() = default;
+
+  virtual int64_t EstimatedDataEncodedSize() = 0;
+  virtual std::shared_ptr<Buffer> FlushValues() = 0;
+  virtual Encoding::type encoding() const = 0;
+
+  virtual void Put(const ::arrow::Array& values) = 0;
+
+  // Report the number of bytes written to the encoder since the last report.
+  // It only works for BYTE_ARRAY type and throw for other types.
+  // This call is not idempotent since it resets the internal counter.
+  virtual int64_t ReportUnencodedDataBytes() = 0;
+
+  virtual MemoryPool* memory_pool() const = 0;
+};
+
+// Base class for value encoders. Since encoders may or not have state (e.g.,
+// dictionary encoding) we use a class instance to maintain any state.
+//
+// Encode interfaces are internal, subject to change without deprecation.
+template <typename DType>
+class TypedEncoder : virtual public Encoder {
+ public:
+  using T = typename DType::c_type;
+
+  using Encoder::Put;
+
+  virtual void Put(const T* src, int num_values) = 0;
+
+  virtual void Put(const std::vector<T>& src, int num_values = -1);
+
+  virtual void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits,
+                         int64_t valid_bits_offset) = 0;
+};
+
+template <typename DType>
+void TypedEncoder<DType>::Put(const std::vector<T>& src, int num_values) {
+  if (num_values == -1) {
+    num_values = static_cast<int>(src.size());
+  }
+  Put(src.data(), num_values);
+}
+
+template <>
+inline void TypedEncoder<BooleanType>::Put(const std::vector<bool>& src, int num_values) {
+  // NOTE(wesm): This stub is here only to satisfy the compiler; it is
+  // overridden later with the actual implementation
+}
+
+// Base class for dictionary encoders
+template <typename DType>
+class DictEncoder : virtual public TypedEncoder<DType> {
+ public:
+  /// Writes out any buffered indices to buffer preceded by the bit width of this data.
+  /// Returns the number of bytes written.
+  /// If the supplied buffer is not big enough, returns -1.
+  /// buffer must be preallocated with buffer_len bytes. Use EstimatedDataEncodedSize()
+  /// to size buffer.
+  virtual int WriteIndices(uint8_t* buffer, int buffer_len) = 0;
+
+  virtual int dict_encoded_size() const = 0;
+
+  virtual int bit_width() const = 0;
+
+  /// Writes out the encoded dictionary to buffer. buffer must be preallocated to
+  /// dict_encoded_size() bytes.
+  virtual void WriteDict(uint8_t* buffer) const = 0;
+
+  virtual int num_entries() const = 0;
+
+  /// \brief EXPERIMENTAL: Append dictionary indices into the encoder. It is
+  /// assumed (without any boundschecking) that the indices reference
+  /// preexisting dictionary values
+  /// \param[in] indices the dictionary index values. Only Int32Array currently
+  /// supported
+  virtual void PutIndices(const ::arrow::Array& indices) = 0;
+
+  /// \brief EXPERIMENTAL: Append dictionary into encoder, inserting indices
+  /// separately. Currently throws exception if the current dictionary memo is
+  /// non-empty
+  /// \param[in] values the dictionary values. Only valid for certain
+  /// Parquet/Arrow type combinations, like BYTE_ARRAY/BinaryArray
+  virtual void PutDictionary(const ::arrow::Array& values) = 0;
+};
+
+// ----------------------------------------------------------------------
+// Value decoding
+
+class Decoder {
+ public:
+  virtual ~Decoder() = default;
+
+  // Sets the data for a new page. This will be called multiple times on the same
+  // decoder and should reset all internal state.
+  //
+  // `num_values` comes from the data page header, and may be greater than the number of
+  // physical values in the data buffer if there are some omitted (null) values.
+  // `len`, on the other hand, is the size in bytes of the data buffer and
+  // directly relates to the number of physical values.
+  virtual void SetData(int num_values, const uint8_t* data, int len) = 0;
+
+  // Returns the number of values left (for the last call to SetData()). This is
+  // the number of values left in this page.
+  virtual int values_left() const = 0;
+  virtual Encoding::type encoding() const = 0;
+};
+
+template <typename DType>
+class TypedDecoder : virtual public Decoder {
+ public:
+  using T = typename DType::c_type;
+
+  /// \brief Decode values into a buffer
+  ///
+  /// Subclasses may override the more specialized Decode methods below.
+  ///
+  /// \param[in] buffer destination for decoded values
+  /// \param[in] max_values maximum number of values to decode
+  /// \return The number of values decoded. Should be identical to max_values except
+  /// at the end of the current data page.
+  virtual int Decode(T* buffer, int max_values) = 0;
+
+  /// \brief Decode the values in this data page but leave spaces for null entries.
+  ///
+  /// \param[in] buffer destination for decoded values
+  /// \param[in] num_values size of the def_levels and buffer arrays including the number
+  /// of null slots
+  /// \param[in] null_count number of null slots
+  /// \param[in] valid_bits bitmap data indicating position of valid slots
+  /// \param[in] valid_bits_offset offset into valid_bits
+  /// \return The number of values decoded, including nulls.
+  virtual int DecodeSpaced(T* buffer, int num_values, int null_count,
+                           const uint8_t* valid_bits, int64_t valid_bits_offset) = 0;
+
+  /// \brief Decode into an ArrayBuilder or other accumulator
+  ///
+  /// This function assumes the definition levels were already decoded
+  /// as a validity bitmap in the given `valid_bits`.  `null_count`
+  /// is the number of 0s in `valid_bits`.
+  /// As a space optimization, it is allowed for `valid_bits` to be null
+  /// if `null_count` is zero.
+  ///
+  /// \return number of values decoded
+  virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                          int64_t valid_bits_offset,
+                          typename EncodingTraits<DType>::Accumulator* out) = 0;
+
+  /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
+  ///
+  /// \return number of values decoded
+  int DecodeArrowNonNull(int num_values,
+                         typename EncodingTraits<DType>::Accumulator* out) {
+    return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
+  }
+
+  /// \brief Decode into a DictionaryBuilder
+  ///
+  /// This function assumes the definition levels were already decoded
+  /// as a validity bitmap in the given `valid_bits`.  `null_count`
+  /// is the number of 0s in `valid_bits`.
+  /// As a space optimization, it is allowed for `valid_bits` to be null
+  /// if `null_count` is zero.
+  ///
+  /// \return number of values decoded
+  virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
+                          int64_t valid_bits_offset,
+                          typename EncodingTraits<DType>::DictAccumulator* builder) = 0;
+
+  /// \brief Decode into a DictionaryBuilder ignoring nulls
+  ///
+  /// \return number of values decoded
+  int DecodeArrowNonNull(int num_values,
+                         typename EncodingTraits<DType>::DictAccumulator* builder) {
+    return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, builder);
+  }
+};
+
+template <typename DType>
+class DictDecoder : virtual public TypedDecoder<DType> {
+ public:
+  using T = typename DType::c_type;
+
+  virtual void SetDict(TypedDecoder<DType>* dictionary) = 0;
+
+  /// \brief Insert dictionary values into the Arrow dictionary builder's memo,
+  /// but do not append any indices
+  virtual void InsertDictionary(::arrow::ArrayBuilder* builder) = 0;
+
+  /// \brief Decode only dictionary indices and append to dictionary
+  /// builder. The builder must have had the dictionary from this decoder
+  /// inserted already.
+  ///
+  /// \warning Remember to reset the builder each time the dict decoder is initialized
+  /// with a new dictionary page
+  virtual int DecodeIndicesSpaced(int num_values, int null_count,
+                                  const uint8_t* valid_bits, int64_t valid_bits_offset,
+                                  ::arrow::ArrayBuilder* builder) = 0;
+
+  /// \brief Decode only dictionary indices (no nulls)
+  ///
+  /// \warning Remember to reset the builder each time the dict decoder is initialized
+  /// with a new dictionary page
+  virtual int DecodeIndices(int num_values, ::arrow::ArrayBuilder* builder) = 0;
+
+  /// \brief Decode only dictionary indices (no nulls). Same as above
+  /// DecodeIndices but target is an array instead of a builder.
+  ///
+  /// \note API EXPERIMENTAL
+  virtual int DecodeIndices(int num_values, int32_t* indices) = 0;
+
+  /// \brief Get dictionary. The reader will call this API when it encounters a
+  /// new dictionary.
+  ///
+  /// @param[out] dictionary The pointer to dictionary values. Dictionary is owned by
+  /// the decoder and is destroyed when the decoder is destroyed.
+  /// @param[out] dictionary_length The dictionary length.
+  ///
+  /// \note API EXPERIMENTAL
+  virtual void GetDictionary(const T** dictionary, int32_t* dictionary_length) = 0;
+};
+
+// ----------------------------------------------------------------------
+// TypedEncoder specializations, traits, and factory functions
+
+class BooleanDecoder : virtual public TypedDecoder<BooleanType> {
+ public:
+  using TypedDecoder<BooleanType>::Decode;
+
+  /// \brief Decode and bit-pack values into a buffer
+  ///
+  /// \param[in] buffer destination for decoded values
+  /// This buffer will contain bit-packed values. If
+  /// max_values is not a multiple of 8, the trailing bits
+  /// of the last byte will be undefined.
+  /// \param[in] max_values max values to decode.
+  /// \return The number of values decoded. Should be identical to max_values except
+  /// at the end of the current data page.
+  virtual int Decode(uint8_t* buffer, int max_values) = 0;
+};
+
+class FLBADecoder : virtual public TypedDecoder<FLBAType> {
+ public:
+  using TypedDecoder<FLBAType>::DecodeSpaced;
+
+  // TODO(wesm): As possible follow-up to PARQUET-1508, we should examine if
+  // there is value in adding specialized read methods for
+  // FIXED_LEN_BYTE_ARRAY. If only Decimal data can occur with this data type
+  // then perhaps not
+};
+
+PARQUET_EXPORT
+std::unique_ptr<Encoder> MakeEncoder(
+    Type::type type_num, Encoding::type encoding, bool use_dictionary = false,
+    const ColumnDescriptor* descr = NULLPTR,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Encoder> MakeTypedEncoder(
+    Encoding::type encoding, bool use_dictionary = false,
+    const ColumnDescriptor* descr = NULLPTR,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  using OutType = typename EncodingTraits<DType>::Encoder;
+  std::unique_ptr<Encoder> base =
+      MakeEncoder(DType::type_num, encoding, use_dictionary, descr, pool);
+  return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDecoder(
+    Type::type type_num, Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+namespace detail {
+
+PARQUET_EXPORT
+std::unique_ptr<Decoder> MakeDictDecoder(Type::type type_num,
+                                         const ColumnDescriptor* descr,
+                                         ::arrow::MemoryPool* pool);
+
+}  // namespace detail
+
+template <typename DType>
+std::unique_ptr<DictDecoder<DType>> MakeDictDecoder(
+    const ColumnDescriptor* descr = NULLPTR,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  using OutType = DictDecoder<DType>;
+  auto decoder = detail::MakeDictDecoder(DType::type_num, descr, pool);
+  return std::unique_ptr<OutType>(dynamic_cast<OutType*>(decoder.release()));
+}
+
+template <typename DType>
+std::unique_ptr<typename EncodingTraits<DType>::Decoder> MakeTypedDecoder(
+    Encoding::type encoding, const ColumnDescriptor* descr = NULLPTR,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  using OutType = typename EncodingTraits<DType>::Decoder;
+  std::unique_ptr<Decoder> base = MakeDecoder(DType::type_num, encoding, descr, pool);
+  return std::unique_ptr<OutType>(dynamic_cast<OutType*>(base.release()));
+}
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/encryption/crypto_factory.h b/pyarrow/include/parquet/encryption/crypto_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c6a5f29ea88eb6849abb78ce334406ba293428f
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/crypto_factory.h
@@ -0,0 +1,152 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/file_key_wrapper.h"
+#include "parquet/encryption/key_toolkit.h"
+#include "parquet/encryption/kms_client_factory.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
+    ParquetCipher::AES_GCM_V1;
+static constexpr bool kDefaultPlaintextFooter = false;
+static constexpr bool kDefaultDoubleWrapping = true;
+static constexpr double kDefaultCacheLifetimeSeconds = 600;  // 10 minutes
+static constexpr bool kDefaultInternalKeyMaterial = true;
+static constexpr bool kDefaultUniformEncryption = false;
+static constexpr int32_t kDefaultDataKeyLengthBits = 128;
+
+struct PARQUET_EXPORT EncryptionConfiguration {
+  explicit EncryptionConfiguration(const std::string& footer_key)
+      : footer_key(footer_key) {}
+
+  /// ID of the master key for footer encryption/signing
+  std::string footer_key;
+
+  /// List of columns to encrypt, with column master key IDs (see HIVE-21848).
+  /// Format: "columnKeyID:colName,colName;columnKeyID:colName..."
+  /// Either
+  /// (1) column_keys must be set
+  /// or
+  /// (2) uniform_encryption must be set to true
+  /// If none of (1) and (2) are true, or if both are true, an exception will be
+  /// thrown.
+  std::string column_keys;
+
+  /// Encrypt footer and all columns with the same encryption key.
+  bool uniform_encryption = kDefaultUniformEncryption;
+
+  /// Parquet encryption algorithm. Can be "AES_GCM_V1" (default), or "AES_GCM_CTR_V1".
+  ParquetCipher::type encryption_algorithm = kDefaultEncryptionAlgorithm;
+
+  /// Write files with plaintext footer.
+  /// The default is false - files are written with encrypted footer.
+  bool plaintext_footer = kDefaultPlaintextFooter;
+
+  /// Use double wrapping - where data encryption keys (DEKs) are encrypted with key
+  /// encryption keys (KEKs), which in turn are encrypted with master keys.
+  /// The default is true. If set to false, use single wrapping - where DEKs are
+  /// encrypted directly with master keys.
+  bool double_wrapping = kDefaultDoubleWrapping;
+
+  /// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
+  /// objects).
+  /// The default is 600 (10 minutes).
+  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
+
+  /// Store key material inside Parquet file footers; this mode doesn’t produce
+  /// additional files. By default, true. If set to false, key material is stored in
+  /// separate files in the same folder, which enables key rotation for immutable
+  /// Parquet files.
+  bool internal_key_material = kDefaultInternalKeyMaterial;
+
+  /// Length of data encryption keys (DEKs), randomly generated by parquet key
+  /// management tools. Can be 128, 192 or 256 bits.
+  /// The default is 128 bits.
+  int32_t data_key_length_bits = kDefaultDataKeyLengthBits;
+};
+
+struct PARQUET_EXPORT DecryptionConfiguration {
+  /// Lifetime of cached entities (key encryption keys, local wrapping keys, KMS client
+  /// objects).
+  /// The default is 600 (10 minutes).
+  double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds;
+};
+
+/// This is a core class, that translates the parameters of high level encryption (like
+/// the names of encrypted columns, names of master keys, etc), into parameters of low
+/// level encryption (like the key metadata, DEK, etc). A factory that produces the low
+/// level FileEncryptionProperties and FileDecryptionProperties objects, from the high
+/// level parameters.
+class PARQUET_EXPORT CryptoFactory {
+ public:
+  /// a KmsClientFactory object must be registered via this method before calling any of
+  /// GetFileEncryptionProperties()/GetFileDecryptionProperties() methods.
+  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory);
+
+  /// Get the encryption properties for a Parquet file.
+  /// If external key material is used then a file system and path to the
+  /// parquet file must be provided.
+  std::shared_ptr<FileEncryptionProperties> GetFileEncryptionProperties(
+      const KmsConnectionConfig& kms_connection_config,
+      const EncryptionConfiguration& encryption_config, const std::string& file_path = "",
+      const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
+
+  /// Get decryption properties for a Parquet file.
+  /// If external key material is used then a file system and path to the
+  /// parquet file must be provided.
+  std::shared_ptr<FileDecryptionProperties> GetFileDecryptionProperties(
+      const KmsConnectionConfig& kms_connection_config,
+      const DecryptionConfiguration& decryption_config, const std::string& file_path = "",
+      const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
+
+  void RemoveCacheEntriesForToken(const std::string& access_token) {
+    key_toolkit_->RemoveCacheEntriesForToken(access_token);
+  }
+
+  void RemoveCacheEntriesForAllTokens() {
+    key_toolkit_->RemoveCacheEntriesForAllTokens();
+  }
+
+  /// Rotates master encryption keys for a Parquet file that uses external key material.
+  /// In single wrapping mode, data encryption keys are decrypted with the old master keys
+  /// and then re-encrypted with new master keys.
+  /// In double wrapping mode, key encryption keys are decrypted with the old master keys
+  /// and then re-encrypted with new master keys.
+  /// This relies on the KMS supporting versioning, such that the old master key is
+  /// used when unwrapping a key, and the latest version is used when wrapping a key.
+  void RotateMasterKeys(const KmsConnectionConfig& kms_connection_config,
+                        const std::string& parquet_file_path,
+                        const std::shared_ptr<::arrow::fs::FileSystem>& file_system,
+                        bool double_wrapping = kDefaultDoubleWrapping,
+                        double cache_lifetime_seconds = kDefaultCacheLifetimeSeconds);
+
+ private:
+  ColumnPathToEncryptionPropertiesMap GetColumnEncryptionProperties(
+      int dek_length, const std::string& column_keys, FileKeyWrapper* key_wrapper);
+
+  /// Key utilities object for kms client initialization and cache control
+  std::shared_ptr<KeyToolkit> key_toolkit_ = std::make_shared<KeyToolkit>();
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/encryption.h b/pyarrow/include/parquet/encryption/encryption.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4634b704735a8a34ca8a9073c639fdac2da3521
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/encryption.h
@@ -0,0 +1,433 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/util/secure_string.h"
+#include "parquet/exception.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+static constexpr ParquetCipher::type kDefaultEncryptionAlgorithm =
+    ParquetCipher::AES_GCM_V1;
+static constexpr int32_t kMaximalAadMetadataLength = 256;
+static constexpr bool kDefaultEncryptedFooter = true;
+static constexpr bool kDefaultCheckSignature = true;
+static constexpr bool kDefaultAllowPlaintextFiles = false;
+static constexpr int32_t kAadFileUniqueLength = 8;
+
+class ColumnDecryptionProperties;
+using ColumnPathToDecryptionPropertiesMap =
+    std::map<std::string, std::shared_ptr<ColumnDecryptionProperties>>;
+
+class ColumnEncryptionProperties;
+using ColumnPathToEncryptionPropertiesMap =
+    std::map<std::string, std::shared_ptr<ColumnEncryptionProperties>>;
+
+class PARQUET_EXPORT DecryptionKeyRetriever {
+ public:
+  /// \brief Retrieve a key.
+  virtual ::arrow::util::SecureString GetKey(const std::string& key_id) = 0;
+
+  virtual ~DecryptionKeyRetriever() {}
+};
+
+/// Simple integer key retriever
+class PARQUET_EXPORT IntegerKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+  void PutKey(uint32_t key_id, ::arrow::util::SecureString key);
+
+  ::arrow::util::SecureString GetKey(const std::string& key_id_string) override {
+    // key_id_string is string but for IntegerKeyIdRetriever it encodes
+    // a native-endian 32 bit unsigned integer key_id
+    uint32_t key_id;
+    assert(key_id_string.size() == sizeof(key_id));
+    memcpy(&key_id, key_id_string.data(), sizeof(key_id));
+
+    return key_map_.at(key_id);
+  }
+
+ private:
+  std::map<uint32_t, ::arrow::util::SecureString> key_map_;
+};
+
+// Simple string key retriever
+class PARQUET_EXPORT StringKeyIdRetriever : public DecryptionKeyRetriever {
+ public:
+  void PutKey(std::string key_id, ::arrow::util::SecureString key);
+  ::arrow::util::SecureString GetKey(const std::string& key_id) override;
+
+ private:
+  std::map<std::string, ::arrow::util::SecureString> key_map_;
+};
+
+class PARQUET_EXPORT HiddenColumnException : public ParquetException {
+ public:
+  explicit HiddenColumnException(const std::string& columnPath)
+      : ParquetException(columnPath.c_str()) {}
+};
+
+class PARQUET_EXPORT KeyAccessDeniedException : public ParquetException {
+ public:
+  explicit KeyAccessDeniedException(const std::string& columnPath)
+      : ParquetException(columnPath.c_str()) {}
+};
+
+inline ::arrow::util::span<const uint8_t> str2span(const std::string& str) {
+  if (str.empty()) {
+    return {};
+  }
+
+  return {reinterpret_cast<const uint8_t*>(str.data()), str.size()};
+}
+
+class PARQUET_EXPORT ColumnEncryptionProperties {
+ public:
+  class PARQUET_EXPORT Builder {
+   public:
+    PARQUET_DEPRECATED("name argument is ignored, use default constructor instead")
+    explicit Builder(const std::string& name) : encrypted_(true) {}
+
+    PARQUET_DEPRECATED("path argument is ignored, use default constructor instead")
+    explicit Builder(const schema::ColumnPath& path) : encrypted_(true) {}
+
+    Builder() = default;
+
+    /// Set a column-specific key.
+    /// If key is not set on an encrypted column, the column will
+    /// be encrypted with the footer key.
+    /// keyBytes Key length must be either 16, 24 or 32 bytes.
+    /// Caller is responsible for wiping out the input key array.
+    Builder* key(::arrow::util::SecureString column_key);
+
+    /// Set a key retrieval metadata.
+    /// use either key_metadata() or key_id(), not both
+    Builder* key_metadata(std::string key_metadata);
+
+    /// A convenience function to set key metadata using a string id.
+    /// Set a key retrieval metadata (converted from String).
+    /// use either key_metadata() or key_id(), not both
+    /// key_id will be converted to metadata (UTF-8 array).
+    Builder* key_id(std::string key_id);
+
+    std::shared_ptr<ColumnEncryptionProperties> build() {
+      return std::shared_ptr<ColumnEncryptionProperties>(
+          new ColumnEncryptionProperties(encrypted_, key_, key_metadata_));
+    }
+
+   private:
+    bool encrypted_ = true;
+    ::arrow::util::SecureString key_;
+    std::string key_metadata_;
+  };
+
+  bool is_encrypted() const { return encrypted_; }
+  bool is_encrypted_with_footer_key() const { return encrypted_with_footer_key_; }
+  const ::arrow::util::SecureString& key() const { return key_; }
+  const std::string& key_metadata() const { return key_metadata_; }
+
+  static std::shared_ptr<ColumnEncryptionProperties> Unencrypted();
+  static std::shared_ptr<ColumnEncryptionProperties> WithFooterKey();
+  static std::shared_ptr<ColumnEncryptionProperties> WithColumnKey(
+      ::arrow::util::SecureString key, std::string key_metadata = "");
+
+ private:
+  bool encrypted_;
+  bool encrypted_with_footer_key_;
+  ::arrow::util::SecureString key_;
+  std::string key_metadata_;
+  explicit ColumnEncryptionProperties(bool encrypted, ::arrow::util::SecureString key,
+                                      std::string key_metadata);
+};
+
+class PARQUET_EXPORT ColumnDecryptionProperties {
+ public:
+  class PARQUET_EXPORT Builder {
+   public:
+    explicit Builder(std::string name) : column_path_(std::move(name)) {}
+
+    explicit Builder(const schema::ColumnPath& path) : Builder(path.ToDotString()) {}
+
+    /// Set an explicit column key. If applied on a file that contains
+    /// key metadata for this column the metadata will be ignored,
+    /// the column will be decrypted with this key.
+    /// key length must be either 16, 24 or 32 bytes.
+    Builder* key(::arrow::util::SecureString key);
+
+    std::shared_ptr<ColumnDecryptionProperties> build();
+
+   private:
+    std::string column_path_;
+    ::arrow::util::SecureString key_;
+  };
+
+  const std::string& column_path() const { return column_path_; }
+  const ::arrow::util::SecureString& key() const { return key_; }
+
+ private:
+  std::string column_path_;
+  ::arrow::util::SecureString key_;
+
+  /// This class is only required for setting explicit column decryption keys -
+  /// to override key retriever (or to provide keys when key metadata and/or
+  /// key retriever are not available)
+  explicit ColumnDecryptionProperties(std::string column_path,
+                                      ::arrow::util::SecureString key);
+};
+
+class PARQUET_EXPORT AADPrefixVerifier {
+ public:
+  /// Verifies identity (AAD Prefix) of individual file,
+  /// or of file collection in a data set.
+  /// Throws exception if an AAD prefix is wrong.
+  /// In a data set, AAD Prefixes should be collected,
+  /// and then checked for missing files.
+  virtual void Verify(const std::string& aad_prefix) = 0;
+  virtual ~AADPrefixVerifier() {}
+};
+
+class PARQUET_EXPORT FileDecryptionProperties {
+ public:
+  class PARQUET_EXPORT Builder {
+   public:
+    Builder() {
+      check_plaintext_footer_integrity_ = kDefaultCheckSignature;
+      plaintext_files_allowed_ = kDefaultAllowPlaintextFiles;
+    }
+
+    /// Set an explicit footer key. If applied on a file that contains
+    /// footer key metadata the metadata will be ignored, the footer
+    /// will be decrypted/verified with this key.
+    /// If explicit key is not set, footer key will be fetched from
+    /// key retriever.
+    /// With explicit keys or AAD prefix, new encryption properties object must be
+    /// created for each encrypted file.
+    /// Explicit encryption keys (footer and column) are cloned.
+    /// Upon completion of file reading, the cloned encryption keys in the properties
+    /// will be wiped out (array values set to 0).
+    /// Caller is responsible for wiping out the input key array.
+    /// param footerKey Key length must be either 16, 24 or 32 bytes.
+    Builder* footer_key(::arrow::util::SecureString footer_key);
+
+    /// Set explicit column keys (decryption properties).
+    /// Its also possible to set a key retriever on this property object.
+    /// Upon file decryption, availability of explicit keys is checked before
+    /// invocation of the retriever callback.
+    /// If an explicit key is available for a footer or a column,
+    /// its key metadata will be ignored.
+    Builder* column_keys(
+        ColumnPathToDecryptionPropertiesMap column_decryption_properties);
+
+    /// Set a key retriever callback. Its also possible to
+    /// set explicit footer or column keys on this file property object.
+    /// Upon file decryption, availability of explicit keys is checked before
+    /// invocation of the retriever callback.
+    /// If an explicit key is available for a footer or a column,
+    /// its key metadata will be ignored.
+    Builder* key_retriever(std::shared_ptr<DecryptionKeyRetriever> key_retriever);
+
+    /// Skip integrity verification of plaintext footers.
+    /// If not called, integrity of plaintext footers will be checked in runtime,
+    /// and an exception will be thrown in the following situations:
+    /// - footer signing key is not available
+    /// (not passed, or not found by key retriever)
+    /// - footer content and signature don't match
+    Builder* disable_footer_signature_verification() {
+      check_plaintext_footer_integrity_ = false;
+      return this;
+    }
+
+    /// Explicitly supply the file AAD prefix.
+    /// A must when a prefix is used for file encryption, but not stored in file.
+    /// If AAD prefix is stored in file, it will be compared to the explicitly
+    /// supplied value and an exception will be thrown if they differ.
+    Builder* aad_prefix(std::string aad_prefix);
+
+    /// Set callback for verification of AAD Prefixes stored in file.
+    Builder* aad_prefix_verifier(std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier);
+
+    /// By default, reading plaintext (unencrypted) files is not
+    /// allowed when using a decryptor
+    /// - in order to detect files that were not encrypted by mistake.
+    /// However, the default behavior can be overridden by calling this method.
+    /// The caller should use then a different method to ensure encryption
+    /// of files with sensitive data.
+    Builder* plaintext_files_allowed() {
+      plaintext_files_allowed_ = true;
+      return this;
+    }
+
+    std::shared_ptr<FileDecryptionProperties> build() {
+      return std::shared_ptr<FileDecryptionProperties>(new FileDecryptionProperties(
+          footer_key_, key_retriever_, check_plaintext_footer_integrity_, aad_prefix_,
+          aad_prefix_verifier_, column_decryption_properties_, plaintext_files_allowed_));
+    }
+
+   private:
+    ::arrow::util::SecureString footer_key_;
+    std::string aad_prefix_;
+    std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+    ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+
+    std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+    bool check_plaintext_footer_integrity_;
+    bool plaintext_files_allowed_;
+  };
+
+  const ::arrow::util::SecureString& column_key(const std::string& column_path) const;
+
+  const ::arrow::util::SecureString& footer_key() const { return footer_key_; }
+
+  const std::string& aad_prefix() const { return aad_prefix_; }
+
+  const std::shared_ptr<DecryptionKeyRetriever>& key_retriever() const {
+    return key_retriever_;
+  }
+
+  bool check_plaintext_footer_integrity() const {
+    return check_plaintext_footer_integrity_;
+  }
+
+  bool plaintext_files_allowed() const { return plaintext_files_allowed_; }
+
+  const std::shared_ptr<AADPrefixVerifier>& aad_prefix_verifier() const {
+    return aad_prefix_verifier_;
+  }
+
+ private:
+  ::arrow::util::SecureString footer_key_;
+  std::string aad_prefix_;
+  std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier_;
+  ColumnPathToDecryptionPropertiesMap column_decryption_properties_;
+  std::shared_ptr<DecryptionKeyRetriever> key_retriever_;
+  bool check_plaintext_footer_integrity_;
+  bool plaintext_files_allowed_;
+
+  FileDecryptionProperties(
+      ::arrow::util::SecureString footer_key,
+      std::shared_ptr<DecryptionKeyRetriever> key_retriever,
+      bool check_plaintext_footer_integrity, std::string aad_prefix,
+      std::shared_ptr<AADPrefixVerifier> aad_prefix_verifier,
+      ColumnPathToDecryptionPropertiesMap column_decryption_properties,
+      bool plaintext_files_allowed);
+};
+
+class PARQUET_EXPORT FileEncryptionProperties {
+ public:
+  class PARQUET_EXPORT Builder {
+   public:
+    explicit Builder(::arrow::util::SecureString footer_key)
+        : parquet_cipher_(kDefaultEncryptionAlgorithm),
+          encrypted_footer_(kDefaultEncryptedFooter),
+          footer_key_(std::move(footer_key)) {
+      store_aad_prefix_in_file_ = false;
+    }
+
+    /// Create files with plaintext footer.
+    /// If not called, the files will be created with encrypted footer (default).
+    Builder* set_plaintext_footer() {
+      encrypted_footer_ = false;
+      return this;
+    }
+
+    /// Set encryption algorithm.
+    /// If not called, files will be encrypted with AES_GCM_V1 (default).
+    Builder* algorithm(ParquetCipher::type parquet_cipher) {
+      parquet_cipher_ = parquet_cipher;
+      return this;
+    }
+
+    /// Set a key retrieval metadata (converted from String).
+    /// use either footer_key_metadata or footer_key_id, not both.
+    Builder* footer_key_id(std::string key_id);
+
+    /// Set a key retrieval metadata.
+    /// use either footer_key_metadata or footer_key_id, not both.
+    Builder* footer_key_metadata(std::string footer_key_metadata);
+
+    /// Set the file AAD Prefix.
+    Builder* aad_prefix(std::string aad_prefix);
+
+    /// Skip storing AAD Prefix in file.
+    /// If not called, and if AAD Prefix is set, it will be stored.
+    Builder* disable_aad_prefix_storage();
+
+    /// Set the list of encrypted columns and their properties (keys etc).
+    /// If not called, all columns will be encrypted with the footer key.
+    /// If called, the file columns not in the list will be left unencrypted.
+    Builder* encrypted_columns(ColumnPathToEncryptionPropertiesMap encrypted_columns);
+
+    std::shared_ptr<FileEncryptionProperties> build() {
+      return std::shared_ptr<FileEncryptionProperties>(new FileEncryptionProperties(
+          parquet_cipher_, footer_key_, footer_key_metadata_, encrypted_footer_,
+          aad_prefix_, store_aad_prefix_in_file_, encrypted_columns_));
+    }
+
+   private:
+    ParquetCipher::type parquet_cipher_;
+    bool encrypted_footer_;
+    ::arrow::util::SecureString footer_key_;
+    std::string footer_key_metadata_;
+
+    std::string aad_prefix_;
+    bool store_aad_prefix_in_file_;
+    ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+  };
+
+  bool encrypted_footer() const { return encrypted_footer_; }
+
+  EncryptionAlgorithm algorithm() const { return algorithm_; }
+
+  const ::arrow::util::SecureString& footer_key() const { return footer_key_; }
+
+  const std::string& footer_key_metadata() const { return footer_key_metadata_; }
+
+  const std::string& file_aad() const { return file_aad_; }
+
+  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+      const std::string& column_path);
+
+  const ColumnPathToEncryptionPropertiesMap& encrypted_columns() const {
+    return encrypted_columns_;
+  }
+
+ private:
+  EncryptionAlgorithm algorithm_;
+  ::arrow::util::SecureString footer_key_;
+  std::string footer_key_metadata_;
+  bool encrypted_footer_;
+  std::string file_aad_;
+  std::string aad_prefix_;
+  bool store_aad_prefix_in_file_;
+  ColumnPathToEncryptionPropertiesMap encrypted_columns_;
+
+  FileEncryptionProperties(ParquetCipher::type cipher,
+                           ::arrow::util::SecureString footer_key,
+                           std::string footer_key_metadata, bool encrypted_footer,
+                           std::string aad_prefix, bool store_aad_prefix_in_file,
+                           ColumnPathToEncryptionPropertiesMap encrypted_columns);
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/encryption/file_key_material_store.h b/pyarrow/include/parquet/encryption/file_key_material_store.h
new file mode 100644
index 0000000000000000000000000000000000000000..83f028a4bc1e9e0d24e21e7acfb785af0e5b37f7
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/file_key_material_store.h
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/filesystem/filesystem.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+/// Stores encryption key material outside the Parquet file, for example in a separate
+/// small file in the same folder. This is important for “key rotation”, when MEKs have to
+/// be changed (if compromised; or periodically, just in case) - without modifying the
+/// Parquet files (often  immutable).
+class PARQUET_EXPORT FileKeyMaterialStore {
+ public:
+  /// Add key material for one encryption key.
+  virtual void AddKeyMaterial(std::string key_id_in_file, std::string key_material) = 0;
+
+  /// Get key material
+  virtual std::string GetKeyMaterial(std::string key_id_in_file) = 0;
+
+  /// After key material was added for all keys in the given Parquet file,
+  /// save material in persistent store.
+  virtual void SaveMaterial() = 0;
+
+  /// Remove key material from persistent store. Used in key rotation.
+  virtual void RemoveMaterial() = 0;
+
+  /// Move key material to another store. Used in key rotation.
+  virtual void MoveMaterialTo(std::shared_ptr<FileKeyMaterialStore> target_key_store) = 0;
+
+  /// Returns the Set of all key IDs in this store (for the given Parquet file)
+  virtual std::vector<std::string> GetKeyIDSet() = 0;
+
+  virtual ~FileKeyMaterialStore() {}
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/file_key_unwrapper.h b/pyarrow/include/parquet/encryption/file_key_unwrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..c86f68121c87231549f129fb5714d0b266b8dd1b
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/file_key_unwrapper.h
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/concurrent_map.h"
+#include "arrow/util/secure_string.h"
+
+#include "parquet/encryption/encryption.h"
+#include "parquet/encryption/file_system_key_material_store.h"
+#include "parquet/encryption/key_material.h"
+#include "parquet/encryption/key_toolkit.h"
+#include "parquet/encryption/key_toolkit_internal.h"
+#include "parquet/encryption/kms_client.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+// This class will retrieve the key from "key metadata", following these steps:
+// 1. Parse "key metadata" (see structure in KeyMetadata class).
+// 2. Retrieve "key material" which can be stored inside or outside "key metadata".
+// 3. Unwrap the "data encryption key" from "key material". There are 2 modes:
+// 3.1. single wrapping: decrypt the wrapped "data encryption key" directly with "master
+// encryption key" 3.2. double wrapping: 2 steps: 3.2.1. "key encryption key" is decrypted
+// with "master encryption key" 3.2.2. "data encryption key" is decrypted with the above
+// "key encryption key"
+class PARQUET_EXPORT FileKeyUnwrapper : public DecryptionKeyRetriever {
+ public:
+  /// key_toolkit and kms_connection_config is to get KmsClient from cache or create
+  /// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of
+  /// KmsClient in the cache.
+  /// If the file uses external key material then the Parquet file path and file
+  /// system must be specified.
+  FileKeyUnwrapper(std::shared_ptr<KeyToolkit> key_toolkit,
+                   const KmsConnectionConfig& kms_connection_config,
+                   double cache_lifetime_seconds, const std::string& file_path = "",
+                   const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
+
+  /// Constructor overload that takes a raw pointer to the KeyToolkit
+  FileKeyUnwrapper(KeyToolkit* key_toolkit,
+                   const KmsConnectionConfig& kms_connection_config,
+                   double cache_lifetime_seconds, const std::string& file_path = "",
+                   const std::shared_ptr<::arrow::fs::FileSystem>& file_system = NULLPTR);
+
+  /// Constructor overload that takes a raw pointer to the KeyToolkit and
+  /// accepts an existing key_material_store rather than using
+  /// the file path and file system to create one when needed.
+  FileKeyUnwrapper(KeyToolkit* key_toolkit,
+                   const KmsConnectionConfig& kms_connection_config,
+                   double cache_lifetime_seconds,
+                   std::shared_ptr<FileKeyMaterialStore> key_material_store);
+
+  /// Get the data key from key metadata
+  ::arrow::util::SecureString GetKey(const std::string& key_metadata_bytes) override;
+
+  /// Get the data key along with the master key id from key material
+  KeyWithMasterId GetDataEncryptionKey(const KeyMaterial& key_material);
+
+ private:
+  FileKeyUnwrapper(std::shared_ptr<KeyToolkit> key_toolkit_owner, KeyToolkit* key_toolkit,
+                   const KmsConnectionConfig& kms_connection_config,
+                   double cache_lifetime_seconds,
+                   std::shared_ptr<FileKeyMaterialStore> key_material_store,
+                   const std::string& file_path,
+                   const std::shared_ptr<::arrow::fs::FileSystem>& file_system);
+
+  std::shared_ptr<KmsClient> GetKmsClientFromConfigOrKeyMaterial(
+      const KeyMaterial& key_material);
+
+  /// A map of Key Encryption Key (KEK) ID -> KEK bytes, for the current token
+  std::shared_ptr<::arrow::util::ConcurrentMap<std::string, ::arrow::util::SecureString>>
+      kek_per_kek_id_;
+  std::shared_ptr<KeyToolkit> key_toolkit_owner_;
+  KeyToolkit* key_toolkit_;
+  KmsConnectionConfig kms_connection_config_;
+  const double cache_entry_lifetime_seconds_;
+  std::shared_ptr<FileKeyMaterialStore> key_material_store_;
+  const std::string file_path_;
+  std::shared_ptr<::arrow::fs::FileSystem> file_system_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/file_key_wrapper.h b/pyarrow/include/parquet/encryption/file_key_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa6d878bafe19c73a2bfc099f81df0bba9ce49a9
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/file_key_wrapper.h
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/util/concurrent_map.h"
+
+#include "parquet/encryption/file_key_material_store.h"
+#include "parquet/encryption/key_encryption_key.h"
+#include "parquet/encryption/key_toolkit.h"
+#include "parquet/encryption/kms_client.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+// This class will generate "key metadata" from "data encryption key" and "master key",
+// following these steps:
+// 1. Wrap "data encryption key". There are 2 modes:
+//   1.1. single wrapping: encrypt "data encryption key" directly with "master encryption
+//        key"
+//   1.2. double wrapping: 2 steps:
+//     1.2.1. "key encryption key" is randomized (see KeyEncryptionKey class)
+//     1.2.2. "data encryption key" is encrypted with the above "key encryption key"
+// 2. Create "key material" (see structure in KeyMaterial class)
+// 3. Create "key metadata" with "key material" inside or a reference to outside "key
+//    material" (see structure in KeyMetadata class).
+class PARQUET_EXPORT FileKeyWrapper {
+ public:
+  static constexpr int kKeyEncryptionKeyLength = 16;
+  static constexpr int kKeyEncryptionKeyIdLength = 16;
+
+  /// key_toolkit and kms_connection_config is to get KmsClient from the cache or create
+  /// KmsClient if it's not in the cache yet. cache_entry_lifetime_seconds is life time of
+  /// KmsClient in the cache. key_material_store is to store "key material" outside
+  /// parquet file, NULL if "key material" is stored inside parquet file.
+  FileKeyWrapper(KeyToolkit* key_toolkit,
+                 const KmsConnectionConfig& kms_connection_config,
+                 std::shared_ptr<FileKeyMaterialStore> key_material_store,
+                 double cache_entry_lifetime_seconds, bool double_wrapping);
+
+  /// Creates key_metadata field for a given data key, via wrapping the key with the
+  /// master key.
+  /// When external key material is used, an identifier is usually generated automatically
+  /// but may be specified explicitly to support key rotation,
+  /// which requires keeping the same identifiers.
+  std::string GetEncryptionKeyMetadata(const ::arrow::util::SecureString& data_key,
+                                       const std::string& master_key_id,
+                                       bool is_footer_key,
+                                       std::string key_id_in_file = "");
+
+ private:
+  KeyEncryptionKey CreateKeyEncryptionKey(const std::string& master_key_id);
+
+  /// A map of Master Encryption Key ID -> KeyEncryptionKey, for the current token
+  std::shared_ptr<::arrow::util::ConcurrentMap<std::string, KeyEncryptionKey>>
+      kek_per_master_key_id_;
+
+  std::shared_ptr<KmsClient> kms_client_;
+  KmsConnectionConfig kms_connection_config_;
+  std::shared_ptr<FileKeyMaterialStore> key_material_store_;
+  const double cache_entry_lifetime_seconds_;
+  const bool double_wrapping_;
+  uint16_t key_counter_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/file_system_key_material_store.h b/pyarrow/include/parquet/encryption/file_system_key_material_store.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecbadb90a94833881ca851aa43fd213ba7ed93c2
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/file_system_key_material_store.h
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <set>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/filesystem/filesystem.h"
+
+#include "parquet/encryption/file_key_material_store.h"
+#include "parquet/exception.h"
+
+namespace parquet::encryption {
+
+/// A FileKeyMaterialStore that stores key material in a file system file in the same
+/// folder as the Parquet file.
+class PARQUET_EXPORT FileSystemKeyMaterialStore : public FileKeyMaterialStore {
+ public:
+  static constexpr const char kKeyMaterialFilePrefix[] = "_KEY_MATERIAL_FOR_";
+  static constexpr const char kTempFilePrefix[] = "_TMP";
+  static constexpr const char kKeyMaterialFileSuffix[] = ".json";
+
+  FileSystemKeyMaterialStore() {}
+  FileSystemKeyMaterialStore(std::string key_material_file_path,
+                             std::shared_ptr<::arrow::fs::FileSystem> file_system);
+
+  /// Creates a new file system key material store for a parquet file.
+  /// When use_tmp_prefix is true, files are saved with an extra _TMP prefix so they don't
+  /// conflict with existing external material files. This is useful during key rotation
+  /// so that temporary key material files can be created while using the existing key
+  /// material, before moving the key material to the non-temporary location.
+  static std::shared_ptr<FileSystemKeyMaterialStore> Make(
+      std::string parquet_file_path, std::shared_ptr<::arrow::fs::FileSystem> file_system,
+      bool use_tmp_prefix);
+
+  /// Add key material for one encryption key.
+  void AddKeyMaterial(std::string key_id_in_file, std::string key_material) {
+    key_material_map_.emplace(std::move(key_id_in_file), std::move(key_material));
+  }
+
+  /// Get key material
+  std::string GetKeyMaterial(std::string key_id_in_file) {
+    if (key_material_map_.empty()) {
+      LoadKeyMaterialMap();
+    }
+    auto found = key_material_map_.find(key_id_in_file);
+    if (found == key_material_map_.end()) {
+      throw ParquetException("Invalid key id");
+    }
+    return found->second;
+  }
+
+  /// After key material was added for all keys in the given Parquet file,
+  /// save material in persistent store.
+  void SaveMaterial();
+
+  /// Remove key material from persistent store. Used in key rotation.
+  void RemoveMaterial();
+
+  /// Move key material to another store. Used in key rotation.
+  void MoveMaterialTo(std::shared_ptr<FileKeyMaterialStore> target_key_store);
+
+  ///  Returns the Set of all key IDs in this store (for the given Parquet file)
+  std::vector<std::string> GetKeyIDSet();
+
+ private:
+  std::string GetStorageFilePath() { return key_material_file_path_; }
+
+  std::string BuildKeyMaterialMapJson();
+  void LoadKeyMaterialMap();
+  std::string key_material_file_path_;
+  std::shared_ptr<::arrow::fs::FileSystem> file_system_;
+  /// Maps ID of a key in Parquet file and key material
+  std::unordered_map<std::string, std::string> key_material_map_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/key_encryption_key.h b/pyarrow/include/parquet/encryption/key_encryption_key.h
new file mode 100644
index 0000000000000000000000000000000000000000..1157937632afb1deca6568d75655c6195eac564d
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/key_encryption_key.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/util/base64.h"
+#include "arrow/util/secure_string.h"
+
+namespace parquet::encryption {
+
+// In the double wrapping mode, each "data encryption key" (DEK) is encrypted with a “key
+// encryption key” (KEK), that in turn is encrypted with a "master encryption key" (MEK).
+// In a writer process, a random KEK is generated for each MEK ID, and cached in a <MEK-ID
+// : KEK> map. This allows to perform an interaction with a KMS server only once for each
+// MEK, in order to wrap its KEK. "Data encryption key" (DEK) wrapping is performed
+// locally, and does not involve an interaction with a KMS server.
+class KeyEncryptionKey {
+ public:
+  KeyEncryptionKey(::arrow::util::SecureString kek_bytes, std::string kek_id,
+                   std::string encoded_wrapped_kek)
+      : kek_bytes_(std::move(kek_bytes)),
+        kek_id_(std::move(kek_id)),
+        encoded_kek_id_(::arrow::util::base64_encode(kek_id_)),
+        encoded_wrapped_kek_(std::move(encoded_wrapped_kek)) {}
+
+  const ::arrow::util::SecureString& kek_bytes() const { return kek_bytes_; }
+
+  const std::string& kek_id() const { return kek_id_; }
+
+  const std::string& encoded_kek_id() const { return encoded_kek_id_; }
+
+  const std::string& encoded_wrapped_kek() const { return encoded_wrapped_kek_; }
+
+ private:
+  ::arrow::util::SecureString kek_bytes_;
+  std::string kek_id_;
+  std::string encoded_kek_id_;
+  std::string encoded_wrapped_kek_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/key_material.h b/pyarrow/include/parquet/encryption/key_material.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e7e862c996d3f0b0c016f3953dc40dcb314a8a0
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/key_material.h
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "parquet/platform.h"
+
+namespace arrow {
+namespace json {
+namespace internal {
+class ObjectParser;
+}  // namespace internal
+}  // namespace json
+}  // namespace arrow
+
+namespace parquet::encryption {
+
+// KeyMaterial class represents the "key material", keeping the information that allows
+// readers to recover an encryption key (see description of the KeyMetadata class). The
+// keytools package (PARQUET-1373) implements the "envelope encryption" pattern, in a
+// "single wrapping" or "double wrapping" mode. In the single wrapping mode, the key
+// material is generated by encrypting the "data encryption key" (DEK) by a "master key".
+// In the double wrapping mode, the key material is generated by encrypting the DEK by a
+// "key encryption key" (KEK), that in turn is encrypted by a "master key".
+//
+// Key material is kept in a flat json object, with the following fields:
+// 1. "keyMaterialType" - a String, with the type of  key material. In the current
+// version, only one value is allowed - "PKMT1" (stands
+//     for "parquet key management tools, version 1"). For external key material storage,
+//     this field is written in both "key metadata" and "key material" jsons. For internal
+//     key material storage, this field is written only once in the common json.
+// 2. "isFooterKey" - a boolean. If true, means that the material belongs to a file footer
+// key, and keeps additional information (such as
+//     KMS instance ID and URL). If false, means that the material belongs to a column
+//     key.
+// 3. "kmsInstanceID" - a String, with the KMS Instance ID. Written only in footer key
+// material.
+// 4. "kmsInstanceURL" - a String, with the KMS Instance URL. Written only in footer key
+// material.
+// 5. "masterKeyID" - a String, with the ID of the master key used to generate the
+// material.
+// 6. "wrappedDEK" - a String, with the wrapped DEK (base64 encoding).
+// 7. "doubleWrapping" - a boolean. If true, means that the material was generated in
+// double wrapping mode.
+//     If false - in single wrapping mode.
+// 8. "keyEncryptionKeyID" - a String, with the ID of the KEK used to generate the
+// material. Written only in double wrapping mode.
+// 9. "wrappedKEK" - a String, with the wrapped KEK (base64 encoding). Written only in
+// double wrapping mode.
+class PARQUET_EXPORT KeyMaterial {
+ public:
+  // these fields are defined in a specification and should never be changed
+  static constexpr const char kKeyMaterialTypeField[] = "keyMaterialType";
+  static constexpr const char kKeyMaterialType1[] = "PKMT1";
+
+  static constexpr const char kFooterKeyIdInFile[] = "footerKey";
+  static constexpr const char kColumnKeyIdInFilePrefix[] = "columnKey";
+
+  static constexpr const char kIsFooterKeyField[] = "isFooterKey";
+  static constexpr const char kDoubleWrappingField[] = "doubleWrapping";
+  static constexpr const char kKmsInstanceIdField[] = "kmsInstanceID";
+  static constexpr const char kKmsInstanceUrlField[] = "kmsInstanceURL";
+  static constexpr const char kMasterKeyIdField[] = "masterKeyID";
+  static constexpr const char kWrappedDataEncryptionKeyField[] = "wrappedDEK";
+  static constexpr const char kKeyEncryptionKeyIdField[] = "keyEncryptionKeyID";
+  static constexpr const char kWrappedKeyEncryptionKeyField[] = "wrappedKEK";
+
+ public:
+  KeyMaterial() = default;
+
+  static KeyMaterial Parse(const std::string& key_material_string);
+
+  static KeyMaterial Parse(
+      const ::arrow::json::internal::ObjectParser* key_material_json);
+
+  /// This method returns a json string that will be stored either inside a parquet file
+  /// or in a key material store outside the parquet file.
+  static std::string SerializeToJson(bool is_footer_key,
+                                     const std::string& kms_instance_id,
+                                     const std::string& kms_instance_url,
+                                     const std::string& master_key_id,
+                                     bool is_double_wrapped, const std::string& kek_id,
+                                     const std::string& encoded_wrapped_kek,
+                                     const std::string& encoded_wrapped_dek,
+                                     bool is_internal_storage);
+
+  bool is_footer_key() const { return is_footer_key_; }
+  bool is_double_wrapped() const { return is_double_wrapped_; }
+  const std::string& master_key_id() const { return master_key_id_; }
+  const std::string& wrapped_dek() const { return encoded_wrapped_dek_; }
+  const std::string& kek_id() const { return kek_id_; }
+  const std::string& wrapped_kek() const { return encoded_wrapped_kek_; }
+  const std::string& kms_instance_id() const { return kms_instance_id_; }
+  const std::string& kms_instance_url() const { return kms_instance_url_; }
+
+ private:
+  KeyMaterial(bool is_footer_key, const std::string& kms_instance_id,
+              const std::string& kms_instance_url, const std::string& master_key_id,
+              bool is_double_wrapped, const std::string& kek_id,
+              const std::string& encoded_wrapped_kek,
+              const std::string& encoded_wrapped_dek);
+
+  bool is_footer_key_;
+  std::string kms_instance_id_;
+  std::string kms_instance_url_;
+  std::string master_key_id_;
+  bool is_double_wrapped_;
+  std::string kek_id_;
+  std::string encoded_wrapped_kek_;
+  std::string encoded_wrapped_dek_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/key_metadata.h b/pyarrow/include/parquet/encryption/key_metadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fe8ac7ccb9db3fb92da42064f9fe2aeabdbfb52
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/key_metadata.h
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include <variant>
+
+#include "parquet/encryption/key_material.h"
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+// Parquet encryption specification defines "key metadata" as an arbitrary byte array,
+// generated by file writers for each encryption key, and passed to the low level API for
+// storage in the file footer. The "key metadata" field is made available to file readers
+// to enable recovery of the key. This interface can be utilized for implementation
+// of any key management scheme.
+//
+// The keytools package (PARQUET-1373) implements one approach, of many possible, to key
+// management and to generation of the "key metadata" fields. This approach, based on the
+// "envelope encryption" pattern, allows integration with KMS servers. It keeps the actual
+// material, required to recover a key, in a "key material" object (see the KeyMaterial
+// class for details). This class is implemented to support version 1 of the parquet key
+// management tools specification.
+//
+// KeyMetadata writes (and reads) the "key metadata" field as a flat json object,
+// with the following fields:
+// 1. "keyMaterialType" - a String, with the type of  key material.
+// 2. "internalStorage" - a boolean. If true, means that "key material" is kept inside the
+// "key metadata" field. If false, "key material" is kept externally (outside Parquet
+// files) - in this case, "key metadata" keeps a reference to the external "key material".
+// 3. "keyReference" - a String, with the reference to the external "key material".
+// Written only if internalStorage is false.
+//
+// If internalStorage is true, "key material" is a part of "key metadata", and the json
+// keeps additional fields, described in the KeyMaterial class.
+class PARQUET_EXPORT KeyMetadata {
+ public:
+  static constexpr const char kKeyMaterialInternalStorageField[] = "internalStorage";
+  static constexpr const char kKeyReferenceField[] = "keyReference";
+
+  /// key_metadata_bytes is the key metadata field stored in the parquet file,
+  /// in the serialized json object format.
+  static KeyMetadata Parse(const std::string& key_metadata_bytes);
+
+  static std::string CreateSerializedForExternalMaterial(
+      const std::string& key_reference);
+
+  bool key_material_stored_internally() const { return is_internal_storage_; }
+
+  const KeyMaterial& key_material() const {
+    if (!is_internal_storage_) {
+      throw ParquetException("key material is stored externally.");
+    }
+    return ::std::get<KeyMaterial>(key_material_or_reference_);
+  }
+
+  const std::string& key_reference() const {
+    if (is_internal_storage_) {
+      throw ParquetException("key material is stored internally.");
+    }
+    return ::std::get<std::string>(key_material_or_reference_);
+  }
+
+ private:
+  explicit KeyMetadata(const KeyMaterial& key_material);
+  explicit KeyMetadata(const std::string& key_reference);
+
+  bool is_internal_storage_;
+  /// If is_internal_storage_ is true, KeyMaterial is set,
+  /// else a string referencing to an outside "key material" is set.
+  ::std::variant<KeyMaterial, std::string> key_material_or_reference_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/key_toolkit.h b/pyarrow/include/parquet/encryption/key_toolkit.h
new file mode 100644
index 0000000000000000000000000000000000000000..a0b929877eebd71b416d771d003bfaf1abe43369
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/key_toolkit.h
@@ -0,0 +1,106 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "parquet/encryption/key_encryption_key.h"
+#include "parquet/encryption/kms_client.h"
+#include "parquet/encryption/kms_client_factory.h"
+#include "parquet/encryption/two_level_cache_with_expiration.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+static constexpr uint64_t kCacheCleanPeriodForKeyRotation = 60 * 60;  // 1 hour
+
+// KeyToolkit is a utility that keeps various tools for key management (such as key
+// rotation, kms client instantiation, cache control, etc), plus a number of auxiliary
+// classes for internal use.
+class PARQUET_EXPORT KeyToolkit {
+ public:
+  KeyToolkit() { last_cache_clean_for_key_rotation_time_ = {}; }
+
+  /// KMS client two level cache: token -> KMSInstanceId -> KmsClient
+  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>>& kms_client_cache_per_token() {
+    return kms_client_cache_;
+  }
+  /// Key encryption key two level cache for wrapping: token -> MasterEncryptionKeyId ->
+  /// KeyEncryptionKey
+  TwoLevelCacheWithExpiration<KeyEncryptionKey>& kek_write_cache_per_token() {
+    return key_encryption_key_write_cache_;
+  }
+
+  /// Key encryption key two level cache for unwrapping: token -> KeyEncryptionKeyId ->
+  /// KeyEncryptionKeyBytes
+  TwoLevelCacheWithExpiration<::arrow::util::SecureString>& kek_read_cache_per_token() {
+    return key_encryption_key_read_cache_;
+  }
+
+  std::shared_ptr<KmsClient> GetKmsClient(
+      const KmsConnectionConfig& kms_connection_config, double cache_entry_lifetime_ms);
+
+  /// Flush any caches that are tied to the (compromised) access_token
+  void RemoveCacheEntriesForToken(const std::string& access_token);
+
+  void RemoveCacheEntriesForAllTokens();
+
+  void RegisterKmsClientFactory(std::shared_ptr<KmsClientFactory> kms_client_factory) {
+    if (kms_client_factory_ != NULLPTR) {
+      throw ParquetException("KMS client factory has already been registered.");
+    }
+    kms_client_factory_ = std::move(kms_client_factory);
+  }
+
+  /// Key rotation. In the single wrapping mode, decrypts data keys with old master keys,
+  /// then encrypts them with new master keys. In the double wrapping mode, decrypts KEKs
+  /// (key encryption keys) with old master keys, generates new KEKs and encrypts them
+  /// with new master keys. Works only if key material is not stored internally in file
+  /// footers. Not supported in local key wrapping mode. Method can be run by multiple
+  /// threads, but each thread must work on different files.
+  void RotateMasterKeys(const KmsConnectionConfig& kms_connection_config,
+                        const std::string& parquet_file_path,
+                        const std::shared_ptr<::arrow::fs::FileSystem>& file_system,
+                        bool double_wrapping, double cache_lifetime_seconds);
+
+ private:
+  TwoLevelCacheWithExpiration<std::shared_ptr<KmsClient>> kms_client_cache_;
+  TwoLevelCacheWithExpiration<KeyEncryptionKey> key_encryption_key_write_cache_;
+  TwoLevelCacheWithExpiration<::arrow::util::SecureString> key_encryption_key_read_cache_;
+  std::shared_ptr<KmsClientFactory> kms_client_factory_;
+  mutable ::arrow::util::Mutex last_cache_clean_for_key_rotation_time_mutex_;
+  internal::TimePoint last_cache_clean_for_key_rotation_time_;
+};
+
+// "data encryption key" and "master key identifier" are paired together as output when
+// parsing from "key material"
+class PARQUET_EXPORT KeyWithMasterId {
+ public:
+  KeyWithMasterId(::arrow::util::SecureString key_bytes, std::string master_id)
+      : key_bytes_(std::move(key_bytes)), master_id_(std::move(master_id)) {}
+
+  const ::arrow::util::SecureString& data_key() const { return key_bytes_; }
+  const std::string& master_id() const { return master_id_; }
+
+ private:
+  ::arrow::util::SecureString key_bytes_;
+  std::string master_id_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/kms_client.h b/pyarrow/include/parquet/encryption/kms_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c67e7cae492de85df5ab656b3f0c0e1f92a7b96
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/kms_client.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "arrow/util/mutex.h"
+#include "arrow/util/secure_string.h"
+
+#include "parquet/exception.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+/// This class wraps the key access token of a KMS server. If your token changes over
+/// time, you should keep the reference to the KeyAccessToken object and call Refresh()
+/// method every time you have a new token.
+class PARQUET_EXPORT KeyAccessToken {
+ public:
+  KeyAccessToken() = default;
+
+  explicit KeyAccessToken(const std::string value) : value_(value) {}
+
+  void Refresh(const std::string& new_value) {
+    auto lock = mutex_.Lock();
+    value_ = new_value;
+  }
+
+  const std::string& value() const {
+    auto lock = mutex_.Lock();
+    return value_;
+  }
+
+ private:
+  std::string value_;
+  mutable ::arrow::util::Mutex mutex_;
+};
+
+struct PARQUET_EXPORT KmsConnectionConfig {
+  std::string kms_instance_id;
+  std::string kms_instance_url;
+  /// If the access token is changed in the future, you should keep a reference to
+  /// this object and call Refresh() on it whenever there is a new access token.
+  std::shared_ptr<KeyAccessToken> refreshable_key_access_token;
+  std::unordered_map<std::string, std::string> custom_kms_conf;
+
+  KmsConnectionConfig();
+
+  const std::string& key_access_token() const {
+    if (refreshable_key_access_token == NULLPTR ||
+        refreshable_key_access_token->value().empty()) {
+      throw ParquetException("key access token is not set!");
+    }
+    return refreshable_key_access_token->value();
+  }
+
+  void SetDefaultIfEmpty();
+};
+
+class PARQUET_EXPORT KmsClient {
+ public:
+  static constexpr const char kKmsInstanceIdDefault[] = "DEFAULT";
+  static constexpr const char kKmsInstanceUrlDefault[] = "DEFAULT";
+  static constexpr const char kKeyAccessTokenDefault[] = "DEFAULT";
+
+  /// \brief Wraps a key.
+  ///
+  /// Encrypts it with the master key, encodes the result
+  /// and potentially adds a KMS-specific metadata.
+  virtual std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
+                              const std::string& master_key_identifier) = 0;
+
+  /// \brief Decrypts (unwraps) a key with the master key.
+  virtual ::arrow::util::SecureString UnwrapKey(
+      const std::string& wrapped_key, const std::string& master_key_identifier) = 0;
+
+  virtual ~KmsClient() {}
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/kms_client_factory.h b/pyarrow/include/parquet/encryption/kms_client_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a7c77c7eebbfbb687575acb12b89c1c2e99461a
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/kms_client_factory.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "parquet/encryption/kms_client.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+class PARQUET_EXPORT KmsClientFactory {
+ public:
+  explicit KmsClientFactory(bool wrap_locally = false) : wrap_locally_(wrap_locally) {}
+
+  virtual ~KmsClientFactory() = default;
+
+  virtual std::shared_ptr<KmsClient> CreateKmsClient(
+      const KmsConnectionConfig& kms_connection_config) = 0;
+
+ protected:
+  bool wrap_locally_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/local_wrap_kms_client.h b/pyarrow/include/parquet/encryption/local_wrap_kms_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..607c75a4c2e8828a55a73c85c48c0551d1b72aa2
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/local_wrap_kms_client.h
@@ -0,0 +1,95 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+#include <vector>
+
+#include "arrow/util/concurrent_map.h"
+
+#include "parquet/encryption/kms_client.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+/// This class supports local wrapping mode, master keys will be fetched from the KMS
+/// server and used to encrypt other keys (data encryption keys or key encryption keys).
+class PARQUET_EXPORT LocalWrapKmsClient : public KmsClient {
+ public:
+  static constexpr const char kLocalWrapNoKeyVersion[] = "NO_VERSION";
+
+  explicit LocalWrapKmsClient(const KmsConnectionConfig& kms_connection_config);
+
+  std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
+                      const std::string& master_key_identifier) override;
+
+  ::arrow::util::SecureString UnwrapKey(
+      const std::string& wrapped_key, const std::string& master_key_identifier) override;
+
+ protected:
+  /// Get master key from the remote KMS server.
+  /// Note: this function might be called by multiple threads
+  virtual const ::arrow::util::SecureString& GetMasterKeyFromServer(
+      const std::string& master_key_identifier) = 0;
+
+ private:
+  /// KMS systems wrap keys by encrypting them by master keys, and attaching additional
+  /// information (such as the version number of the masker key) to the result of
+  /// encryption. The master key version is required in  key rotation. Currently, the
+  /// local wrapping mode does not support key rotation (because not all KMS systems allow
+  /// to fetch a master key by its ID and version number). Still, the local wrapping mode
+  /// adds a placeholder for the master key version, that will enable support for key
+  /// rotation in this mode in the future, with appropriate KMS systems. This will also
+  /// enable backward compatibility, where future readers will be able to extract master
+  /// key version in the files written by the current code.
+  ///
+  /// LocalKeyWrap class writes (and reads) the "key wrap" as a flat json with the
+  /// following fields:
+  /// 1. "masterKeyVersion" - a String, with the master key version. In the current
+  /// version, only one value is allowed - "NO_VERSION".
+  /// 2. "encryptedKey" - a String, with the key encrypted by the master key
+  /// (base64-encoded).
+  class LocalKeyWrap {
+   public:
+    static constexpr const char kLocalWrapKeyVersionField[] = "masterKeyVersion";
+    static constexpr const char kLocalWrapEncryptedKeyField[] = "encryptedKey";
+
+    LocalKeyWrap(std::string master_key_version, std::string encrypted_encoded_key);
+
+    static std::string CreateSerialized(const std::string& encrypted_encoded_key);
+
+    static LocalKeyWrap Parse(const std::string& wrapped_key);
+
+    const std::string& master_key_version() const { return master_key_version_; }
+
+    const std::string& encrypted_encoded_key() const { return encrypted_encoded_key_; }
+
+   private:
+    std::string encrypted_encoded_key_;
+    std::string master_key_version_;
+  };
+
+  const ::arrow::util::SecureString& GetKeyFromServer(const std::string& key_identifier);
+
+ protected:
+  KmsConnectionConfig kms_connection_config_;
+  ::arrow::util::ConcurrentMap<std::string, ::arrow::util::SecureString>
+      master_key_cache_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/test_encryption_util.h b/pyarrow/include/parquet/encryption/test_encryption_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..46b84a0d0a33d9a71570403ff4a0561da91b3464
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/test_encryption_util.h
@@ -0,0 +1,136 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include <gtest/gtest.h>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/filesystem/localfs.h"
+#include "arrow/status.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/secure_string.h"
+
+#include "parquet/encryption/encryption.h"
+#include "parquet/test_util.h"
+
+namespace parquet {
+class ParquetFileReader;
+namespace encryption::test {
+
+using ::arrow::internal::TemporaryDir;
+using ::arrow::util::SecureString;
+
+constexpr int kFixedLength = 10;
+
+const SecureString kFooterEncryptionKey("0123456789012345");
+const SecureString kColumnEncryptionKey1("1234567890123450");
+const SecureString kColumnEncryptionKey2("1234567890123451");
+const SecureString kColumnEncryptionKey3("1234567890123452");
+const char kFileName[] = "tester";
+
+// Get the path of file inside parquet test data directory
+std::string data_file(const char* file);
+
+// A temporary directory that contains the encrypted files generated in the tests.
+extern std::unique_ptr<TemporaryDir> temp_dir;
+
+inline ::arrow::Result<std::unique_ptr<TemporaryDir>> temp_data_dir() {
+  return TemporaryDir::Make("parquet-encryption-test-");
+}
+
+const char kDoubleFieldName[] = "double_field";
+const char kFloatFieldName[] = "float_field";
+const char kBooleanFieldName[] = "boolean_field";
+const char kInt32FieldName[] = "int32_field";
+const char kInt64FieldName[] = "int64_field";
+const char kInt96FieldName[] = "int96_field";
+const char kByteArrayFieldName[] = "ba_field";
+const char kFixedLenByteArrayFieldName[] = "flba_field";
+
+const char kFooterMasterKey[] = "0123456789012345";
+const char kFooterMasterKeyId[] = "kf";
+const char* const kColumnMasterKeys[] = {"1234567890123450", "1234567890123451",
+                                         "1234567890123452", "1234567890123453",
+                                         "1234567890123454", "1234567890123455"};
+const char* const kColumnMasterKeyIds[] = {"kc1", "kc2", "kc3", "kc4", "kc5", "kc6"};
+
+// New master key values used to simulate key rotation
+const char kNewFooterMasterKey[] = "9123456789012345";
+const char* const kNewColumnMasterKeys[] = {"9234567890123450", "9234567890123451",
+                                            "9234567890123452", "9234567890123453",
+                                            "9234567890123454", "9234567890123455"};
+
+// The result of this function will be used to set into TestOnlyInMemoryKmsClientFactory
+// as the key mapping to look at.
+std::unordered_map<std::string, SecureString> BuildKeyMap(const char* const* column_ids,
+                                                          const char* const* column_keys,
+                                                          const char* footer_id,
+                                                          const char* footer_key);
+
+// The result of this function will be used to set into EncryptionConfiguration
+// as column keys.
+std::string BuildColumnKeyMapping();
+
+// FileEncryptor and FileDecryptor are helper classes to write/read an encrypted parquet
+// file corresponding to each pair of FileEncryptionProperties/FileDecryptionProperties.
+// FileEncryptor writes the file with fixed data values and FileDecryptor reads the file
+// and verify the correctness of data values.
+class FileEncryptor {
+ public:
+  FileEncryptor();
+
+  void EncryptFile(
+      std::string file,
+      std::shared_ptr<parquet::FileEncryptionProperties> encryption_configurations);
+
+ private:
+  std::shared_ptr<schema::GroupNode> SetupEncryptionSchema();
+
+  int num_rowgroups_ = 5;
+  int rows_per_rowgroup_ = 50;
+  std::shared_ptr<schema::GroupNode> schema_;
+};
+
+class FileDecryptor {
+ public:
+  void DecryptFile(
+      const std::string& file_name,
+      const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
+  void DecryptPageIndex(
+      const std::string& file_name,
+      const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
+
+ private:
+  void CheckFile(
+      parquet::ParquetFileReader* file_reader,
+      const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
+  void CheckPageIndex(
+      parquet::ParquetFileReader* file_reader,
+      const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties);
+};
+
+}  // namespace encryption::test
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/encryption/test_in_memory_kms.h b/pyarrow/include/parquet/encryption/test_in_memory_kms.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9d4169c6345f9f1d565615919830d94f107479c
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/test_in_memory_kms.h
@@ -0,0 +1,101 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "arrow/util/base64.h"
+
+#include "parquet/encryption/kms_client_factory.h"
+#include "parquet/encryption/local_wrap_kms_client.h"
+#include "parquet/platform.h"
+
+namespace parquet::encryption {
+
+// This is a mock class, built for testing only. Don't use it as an example of
+// LocalWrapKmsClient implementation.
+class TestOnlyLocalWrapInMemoryKms : public LocalWrapKmsClient {
+ public:
+  explicit TestOnlyLocalWrapInMemoryKms(const KmsConnectionConfig& kms_connection_config);
+
+  static void InitializeMasterKeys(
+      const std::unordered_map<std::string, ::arrow::util::SecureString>&
+          master_keys_map);
+
+ protected:
+  const ::arrow::util::SecureString& GetMasterKeyFromServer(
+      const std::string& master_key_identifier) override;
+
+ private:
+  static std::unordered_map<std::string, ::arrow::util::SecureString> master_key_map_;
+};
+
+// This is a mock class, built for testing only. Don't use it as an example of KmsClient
+// implementation.
+class TestOnlyInServerWrapKms : public KmsClient {
+ public:
+  static void InitializeMasterKeys(
+      const std::unordered_map<std::string, ::arrow::util::SecureString>&
+          master_keys_map);
+
+  std::string WrapKey(const ::arrow::util::SecureString& key_bytes,
+                      const std::string& master_key_identifier) override;
+
+  ::arrow::util::SecureString UnwrapKey(
+      const std::string& wrapped_key, const std::string& master_key_identifier) override;
+
+  static void StartKeyRotation(
+      const std::unordered_map<std::string, ::arrow::util::SecureString>&
+          new_master_keys_map);
+  static void FinishKeyRotation();
+
+ private:
+  ::arrow::util::SecureString GetMasterKeyFromServer(
+      const std::string& master_key_identifier);
+
+  // Different wrapping and unwrapping key maps to imitate versioning
+  // and support key rotation.
+  static std::unordered_map<std::string, ::arrow::util::SecureString>
+      unwrapping_master_key_map_;
+  static std::unordered_map<std::string, ::arrow::util::SecureString>
+      wrapping_master_key_map_;
+};
+
+// This is a mock class, built for testing only. Don't use it as an example of
+// KmsClientFactory implementation.
+class TestOnlyInMemoryKmsClientFactory : public KmsClientFactory {
+ public:
+  TestOnlyInMemoryKmsClientFactory(
+      bool wrap_locally,
+      const std::unordered_map<std::string, ::arrow::util::SecureString>& master_keys_map)
+      : KmsClientFactory(wrap_locally) {
+    TestOnlyLocalWrapInMemoryKms::InitializeMasterKeys(master_keys_map);
+    TestOnlyInServerWrapKms::InitializeMasterKeys(master_keys_map);
+  }
+
+  std::shared_ptr<KmsClient> CreateKmsClient(
+      const KmsConnectionConfig& kms_connection_config) {
+    if (wrap_locally_) {
+      return std::make_shared<TestOnlyLocalWrapInMemoryKms>(kms_connection_config);
+    } else {
+      return std::make_shared<TestOnlyInServerWrapKms>();
+    }
+  }
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/two_level_cache_with_expiration.h b/pyarrow/include/parquet/encryption/two_level_cache_with_expiration.h
new file mode 100644
index 0000000000000000000000000000000000000000..283ebd97b714c14638b43aae7b9acc75c2c7b51b
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/two_level_cache_with_expiration.h
@@ -0,0 +1,149 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <chrono>
+#include <unordered_map>
+
+#include "arrow/util/concurrent_map.h"
+#include "arrow/util/mutex.h"
+
+namespace parquet::encryption {
+
+using ::arrow::util::ConcurrentMap;
+
+namespace internal {
+
+using TimePoint =
+    std::chrono::time_point<std::chrono::system_clock, std::chrono::duration<double>>;
+
+inline TimePoint CurrentTimePoint() { return std::chrono::system_clock::now(); }
+
+template <typename E>
+class ExpiringCacheEntry {
+ public:
+  ExpiringCacheEntry() = default;
+
+  ExpiringCacheEntry(E cached_item, double expiration_interval_seconds)
+      : expiration_timestamp_(CurrentTimePoint() +
+                              std::chrono::duration<double>(expiration_interval_seconds)),
+        cached_item_(std::move(cached_item)) {}
+
+  bool IsExpired() const {
+    const auto now = CurrentTimePoint();
+    return (now > expiration_timestamp_);
+  }
+
+  E cached_item() { return cached_item_; }
+
+ private:
+  const TimePoint expiration_timestamp_;
+  E cached_item_;
+};
+
+// This class is to avoid the below warning when compiling KeyToolkit class with VS2015
+// warning C4503: decorated name length exceeded, name was truncated
+template <typename V>
+class ExpiringCacheMapEntry {
+ public:
+  ExpiringCacheMapEntry() = default;
+
+  explicit ExpiringCacheMapEntry(
+      std::shared_ptr<ConcurrentMap<std::string, V>> cached_item,
+      double expiration_interval_seconds)
+      : map_cache_(cached_item, expiration_interval_seconds) {}
+
+  bool IsExpired() { return map_cache_.IsExpired(); }
+
+  std::shared_ptr<ConcurrentMap<std::string, V>> cached_item() {
+    return map_cache_.cached_item();
+  }
+
+ private:
+  // ConcurrentMap object may be accessed and modified at many places at the same time,
+  // from multiple threads, or even removed from cache.
+  ExpiringCacheEntry<std::shared_ptr<ConcurrentMap<std::string, V>>> map_cache_;
+};
+
+}  // namespace internal
+
+// Two-level cache with expiration of internal caches according to token lifetime.
+// External cache is per token, internal is per string key.
+// Wrapper class around:
+//    std::unordered_map<std::string,
+//    internal::ExpiringCacheEntry<std::unordered_map<std::string, V>>>
+// This cache is safe to be shared between threads.
+template <typename V>
+class TwoLevelCacheWithExpiration {
+ public:
+  TwoLevelCacheWithExpiration() {
+    last_cache_cleanup_timestamp_ = internal::CurrentTimePoint();
+  }
+
+  std::shared_ptr<ConcurrentMap<std::string, V>> GetOrCreateInternalCache(
+      const std::string& access_token, double cache_entry_lifetime_seconds) {
+    auto lock = mutex_.Lock();
+
+    auto external_cache_entry = cache_.find(access_token);
+    if (external_cache_entry == cache_.end() ||
+        external_cache_entry->second.IsExpired()) {
+      cache_.insert({access_token, internal::ExpiringCacheMapEntry<V>(
+                                       std::make_shared<ConcurrentMap<std::string, V>>(),
+                                       cache_entry_lifetime_seconds)});
+    }
+
+    return cache_[access_token].cached_item();
+  }
+
+  void CheckCacheForExpiredTokens(double cache_cleanup_period_seconds = 0.0) {
+    auto lock = mutex_.Lock();
+
+    const auto now = internal::CurrentTimePoint();
+    if (now > (last_cache_cleanup_timestamp_ +
+               std::chrono::duration<double>(cache_cleanup_period_seconds))) {
+      RemoveExpiredEntriesNoMutex();
+      last_cache_cleanup_timestamp_ = now;
+    }
+  }
+
+  void Remove(const std::string& access_token) {
+    auto lock = mutex_.Lock();
+    cache_.erase(access_token);
+  }
+
+  void Clear() {
+    auto lock = mutex_.Lock();
+    cache_.clear();
+  }
+
+ private:
+  void RemoveExpiredEntriesNoMutex() {
+    for (auto it = cache_.begin(); it != cache_.end();) {
+      if (it->second.IsExpired()) {
+        it = cache_.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  std::unordered_map<std::string, internal::ExpiringCacheMapEntry<V>> cache_;
+  internal::TimePoint last_cache_cleanup_timestamp_;
+  ::arrow::util::Mutex mutex_;
+};
+
+}  // namespace parquet::encryption
diff --git a/pyarrow/include/parquet/encryption/type_fwd.h b/pyarrow/include/parquet/encryption/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..623811718482c591e708a297dff9eb35ae0c85a9
--- /dev/null
+++ b/pyarrow/include/parquet/encryption/type_fwd.h
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace parquet {
+
+class Decryptor;
+class Encryptor;
+
+class InternalFileDecryptor;
+class InternalFileEncryptor;
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/exception.h b/pyarrow/include/parquet/exception.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c6c9ce8a726ec9c359b2c1b58103a9f96738df8
--- /dev/null
+++ b/pyarrow/include/parquet/exception.h
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <exception>
+#include <sstream>
+#include <string>
+#include <utility>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/string_util.h"
+#include "parquet/platform.h"
+
+#ifdef _MSC_VER
+#  pragma warning(push)
+// Disable warning for STL types usage in DLL interface
+// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
+#  pragma warning(disable : 4275 4251)
+// Disable diamond inheritance warnings
+#  pragma warning(disable : 4250)
+// Disable macro redefinition warnings
+#  pragma warning(disable : 4005)
+// Disable extern before exported template warnings
+#  pragma warning(disable : 4910)
+#endif
+
+// PARQUET-1085
+#if !defined(ARROW_UNUSED)
+#  define ARROW_UNUSED(x) UNUSED(x)
+#endif
+
+// Parquet exception to Arrow Status
+
+#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
+#define END_PARQUET_CATCH_EXCEPTIONS                   \
+  }                                                    \
+  catch (const ::parquet::ParquetStatusException& e) { \
+    return e.status();                                 \
+  }                                                    \
+  catch (const ::parquet::ParquetException& e) {       \
+    return ::arrow::Status::IOError(e.what());         \
+  }
+
+// clang-format off
+
+#define PARQUET_CATCH_NOT_OK(s)    \
+  BEGIN_PARQUET_CATCH_EXCEPTIONS   \
+  (s);                             \
+  END_PARQUET_CATCH_EXCEPTIONS
+
+// clang-format on
+
+#define PARQUET_CATCH_AND_RETURN(s) \
+  BEGIN_PARQUET_CATCH_EXCEPTIONS    \
+  return (s);                       \
+  END_PARQUET_CATCH_EXCEPTIONS
+
+// Arrow Status to Parquet exception
+
+#define PARQUET_IGNORE_NOT_OK(s)               \
+  do {                                         \
+    ::arrow::Status _s = ::arrow::ToStatus(s); \
+    ARROW_UNUSED(_s);                          \
+  } while (0)
+
+#define PARQUET_THROW_NOT_OK(s)                               \
+  do {                                                        \
+    ::arrow::Status _s = ::arrow::ToStatus(s);                \
+    if (!_s.ok()) {                                           \
+      throw ::parquet::ParquetStatusException(std::move(_s)); \
+    }                                                         \
+  } while (0)
+
+#define PARQUET_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \
+  auto status_name = (rexpr);                                 \
+  PARQUET_THROW_NOT_OK(status_name.status());                 \
+  lhs = std::move(status_name).ValueOrDie();
+
+#define PARQUET_ASSIGN_OR_THROW(lhs, rexpr)                                              \
+  PARQUET_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_RAISE_NAME(_error_or_value, __COUNTER__), \
+                               lhs, rexpr);
+
+namespace parquet {
+
+class PARQUET_EXPORT ParquetException : public std::exception {
+ public:
+  PARQUET_NORETURN static void EofException(const std::string& msg = "") {
+    static std::string prefix = "Unexpected end of stream";
+    if (msg.empty()) {
+      throw ParquetException(prefix);
+    }
+    throw ParquetException(prefix, ": ", msg);
+  }
+
+  PARQUET_NORETURN static void NYI(const std::string& msg = "") {
+    throw ParquetException("Not yet implemented: ", msg, ".");
+  }
+
+  template <typename... Args>
+  explicit ParquetException(Args&&... args)
+      : msg_(::arrow::internal::JoinToString(std::forward<Args>(args)...)) {}
+
+  explicit ParquetException(std::string msg) : msg_(std::move(msg)) {}
+
+  explicit ParquetException(const char* msg, const std::exception&) : msg_(msg) {}
+
+  ParquetException(const ParquetException&) = default;
+  ParquetException& operator=(const ParquetException&) = default;
+  ParquetException(ParquetException&&) = default;
+  ParquetException& operator=(ParquetException&&) = default;
+
+  const char* what() const noexcept override { return msg_.c_str(); }
+
+ private:
+  std::string msg_;
+};
+
+// Support printing a ParquetException.
+// This is needed for clang-on-MSVC as there operator<< is not defined for
+// std::exception.
+PARQUET_EXPORT
+std::ostream& operator<<(std::ostream& os, const ParquetException& exception);
+
+class ParquetStatusException : public ParquetException {
+ public:
+  explicit ParquetStatusException(::arrow::Status status)
+      : ParquetException(status.ToString()), status_(std::move(status)) {}
+
+  const ::arrow::Status& status() const { return status_; }
+
+ private:
+  ::arrow::Status status_;
+};
+
+// This class exists for the purpose of detecting an invalid or corrupted file.
+class ParquetInvalidOrCorruptedFileException : public ParquetStatusException {
+ public:
+  ParquetInvalidOrCorruptedFileException(const ParquetInvalidOrCorruptedFileException&) =
+      default;
+
+  template <typename Arg,
+            typename std::enable_if<
+                !std::is_base_of<ParquetInvalidOrCorruptedFileException, Arg>::value,
+                int>::type = 0,
+            typename... Args>
+  explicit ParquetInvalidOrCorruptedFileException(Arg arg, Args&&... args)
+      : ParquetStatusException(::arrow::Status::Invalid(std::forward<Arg>(arg),
+                                                        std::forward<Args>(args)...)) {}
+};
+
+template <typename StatusReturnBlock>
+void ThrowNotOk(StatusReturnBlock&& b) {
+  PARQUET_THROW_NOT_OK(b());
+}
+
+}  // namespace parquet
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/parquet/file_reader.h b/pyarrow/include/parquet/file_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..c42163276cdaa44129ca70f682c25387e0a8cbbf
--- /dev/null
+++ b/pyarrow/include/parquet/file_reader.h
@@ -0,0 +1,257 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/caching.h"
+#include "arrow/util/type_fwd.h"
+#include "parquet/metadata.h"  // IWYU pragma: keep
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+
+namespace parquet {
+
+class ColumnReader;
+class FileMetaData;
+class PageIndexReader;
+class BloomFilterReader;
+class PageReader;
+class RowGroupMetaData;
+
+namespace internal {
+class RecordReader;
+}
+
+class PARQUET_EXPORT RowGroupReader {
+ public:
+  // Forward declare a virtual class 'Contents' to aid dependency injection and more
+  // easily create test fixtures
+  // An implementation of the Contents class is defined in the .cc file
+  struct Contents {
+    virtual ~Contents() {}
+    virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
+    virtual const RowGroupMetaData* metadata() const = 0;
+    virtual const ReaderProperties* properties() const = 0;
+  };
+
+  explicit RowGroupReader(std::unique_ptr<Contents> contents);
+
+  // Returns the rowgroup metadata
+  const RowGroupMetaData* metadata() const;
+
+  // Construct a ColumnReader for the indicated row group-relative
+  // column. Ownership is shared with the RowGroupReader.
+  std::shared_ptr<ColumnReader> Column(int i);
+
+  // EXPERIMENTAL: Construct a RecordReader for the indicated column of the row group.
+  // Ownership is shared with the RowGroupReader.
+  std::shared_ptr<internal::RecordReader> RecordReader(int i,
+                                                       bool read_dictionary = false);
+
+  // Construct a ColumnReader, trying to enable exposed encoding.
+  //
+  // For dictionary encoding, currently we only support column chunks that are fully
+  // dictionary encoded, i.e., all data pages in the column chunk are dictionary encoded.
+  // If a column chunk uses dictionary encoding but then falls back to plain encoding, the
+  // encoding will not be exposed.
+  //
+  // The returned column reader provides an API GetExposedEncoding() for the
+  // users to check the exposed encoding and determine how to read the batches.
+  //
+  // \note API EXPERIMENTAL
+  std::shared_ptr<ColumnReader> ColumnWithExposeEncoding(
+      int i, ExposedEncoding encoding_to_expose);
+
+  // Construct a RecordReader, trying to enable exposed encoding.
+  //
+  // For dictionary encoding, currently we only support column chunks that are
+  // fully dictionary encoded byte arrays. The caller should verify if the reader can read
+  // and expose the dictionary by checking the reader's read_dictionary(). If a column
+  // chunk uses dictionary encoding but then falls back to plain encoding, the returned
+  // reader will read decoded data without exposing the dictionary.
+  //
+  // \note API EXPERIMENTAL
+  std::shared_ptr<internal::RecordReader> RecordReaderWithExposeEncoding(
+      int i, ExposedEncoding encoding_to_expose);
+
+  std::unique_ptr<PageReader> GetColumnPageReader(int i);
+
+ private:
+  // Holds a pointer to an instance of Contents implementation
+  std::unique_ptr<Contents> contents_;
+};
+
+class PARQUET_EXPORT ParquetFileReader {
+ public:
+  // Declare a virtual class 'Contents' to aid dependency injection and more
+  // easily create test fixtures
+  // An implementation of the Contents class is defined in the .cc file
+  struct PARQUET_EXPORT Contents {
+    static std::unique_ptr<Contents> Open(
+        std::shared_ptr<::arrow::io::RandomAccessFile> source,
+        const ReaderProperties& props = default_reader_properties(),
+        std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+    static ::arrow::Future<std::unique_ptr<Contents>> OpenAsync(
+        std::shared_ptr<::arrow::io::RandomAccessFile> source,
+        const ReaderProperties& props = default_reader_properties(),
+        std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+    virtual ~Contents() = default;
+    // Perform any cleanup associated with the file contents
+    virtual void Close() = 0;
+    virtual std::shared_ptr<RowGroupReader> GetRowGroup(int i) = 0;
+    virtual std::shared_ptr<FileMetaData> metadata() const = 0;
+    virtual std::shared_ptr<PageIndexReader> GetPageIndexReader() = 0;
+    virtual BloomFilterReader& GetBloomFilterReader() = 0;
+  };
+
+  ParquetFileReader();
+  ~ParquetFileReader();
+
+  // Create a file reader instance from an Arrow file object. Thread-safety is
+  // the responsibility of the file implementation
+  static std::unique_ptr<ParquetFileReader> Open(
+      std::shared_ptr<::arrow::io::RandomAccessFile> source,
+      const ReaderProperties& props = default_reader_properties(),
+      std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+  // API Convenience to open a serialized Parquet file on disk, using Arrow IO
+  // interfaces.
+  static std::unique_ptr<ParquetFileReader> OpenFile(
+      const std::string& path, bool memory_map = false,
+      const ReaderProperties& props = default_reader_properties(),
+      std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+  // Asynchronously open a file reader from an Arrow file object.
+  // Does not throw - all errors are reported through the Future.
+  static ::arrow::Future<std::unique_ptr<ParquetFileReader>> OpenAsync(
+      std::shared_ptr<::arrow::io::RandomAccessFile> source,
+      const ReaderProperties& props = default_reader_properties(),
+      std::shared_ptr<FileMetaData> metadata = NULLPTR);
+
+  void Open(std::unique_ptr<Contents> contents);
+  void Close();
+
+  // The RowGroupReader is owned by the FileReader
+  std::shared_ptr<RowGroupReader> RowGroup(int i);
+
+  // Returns the file metadata. Only one instance is ever created
+  std::shared_ptr<FileMetaData> metadata() const;
+
+  /// Returns the PageIndexReader. Only one instance is ever created.
+  ///
+  /// If the file does not have the page index, nullptr may be returned.
+  /// Because it pays to check existence of page index in the file, it
+  /// is possible to return a non null value even if page index does
+  /// not exist. It is the caller's responsibility to check the return
+  /// value and follow-up calls to PageIndexReader.
+  ///
+  /// WARNING: The returned PageIndexReader must not outlive the ParquetFileReader.
+  /// Initialize GetPageIndexReader() is not thread-safety.
+  std::shared_ptr<PageIndexReader> GetPageIndexReader();
+
+  /// Returns the BloomFilterReader. Only one instance is ever created.
+  ///
+  /// WARNING: The returned BloomFilterReader must not outlive the ParquetFileReader.
+  /// Initialize GetBloomFilterReader() is not thread-safety.
+  BloomFilterReader& GetBloomFilterReader();
+
+  /// Pre-buffer the specified column indices in all row groups.
+  ///
+  /// Readers can optionally call this to cache the necessary slices
+  /// of the file in-memory before deserialization. Arrow readers can
+  /// automatically do this via an option. This is intended to
+  /// increase performance when reading from high-latency filesystems
+  /// (e.g. Amazon S3).
+  ///
+  /// After calling this, creating readers for row groups/column
+  /// indices that were not buffered may fail. Creating multiple
+  /// readers for the a subset of the buffered regions is
+  /// acceptable. This may be called again to buffer a different set
+  /// of row groups/columns.
+  ///
+  /// If memory usage is a concern, note that data will remain
+  /// buffered in memory until either \a PreBuffer() is called again,
+  /// or the reader itself is destructed. Reading - and buffering -
+  /// only one row group at a time may be useful.
+  ///
+  /// This method may throw.
+  void PreBuffer(const std::vector<int>& row_groups,
+                 const std::vector<int>& column_indices,
+                 const ::arrow::io::IOContext& ctx,
+                 const ::arrow::io::CacheOptions& options);
+
+  /// Retrieve the list of byte ranges that would need to be read to retrieve
+  /// the data for the specified row groups and column indices.
+  ///
+  /// A reader can optionally call this if they wish to handle their own
+  /// caching and management of file reads (or offload them to other readers).
+  /// Unlike PreBuffer, this method will not perform any actual caching or
+  /// reads, instead just using the file metadata to determine the byte ranges
+  /// that would need to be read if you were to consume the entirety of the column
+  /// chunks for the provided columns in the specified row groups.
+  ///
+  /// If row_groups or column_indices are empty, then the result of this will be empty.
+  ///
+  /// hole_size_limit represents the maximum distance, in bytes, between two
+  /// consecutive ranges; beyond this value, ranges will not be combined. The default
+  /// value is 1MB.
+  ///
+  /// range_size_limit is the maximum size in bytes of a combined range; if combining
+  /// two consecutive ranges would produce a range larger than this, they are not
+  /// combined. The default values is 64MB. This *must* be larger than hole_size_limit.
+  ///
+  /// This will not take into account page indexes or any other predicate push down
+  /// benefits that may be available.
+  ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
+      const std::vector<int>& row_groups, const std::vector<int>& column_indices,
+      int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024);
+
+  /// Wait for the specified row groups and column indices to be pre-buffered.
+  ///
+  /// After the returned Future completes, reading the specified row
+  /// groups/columns will not block.
+  ///
+  /// PreBuffer must be called first. This method does not throw.
+  ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
+                                 const std::vector<int>& column_indices) const;
+
+ private:
+  // Holds a pointer to an instance of Contents implementation
+  std::unique_ptr<Contents> contents_;
+};
+
+// Read only Parquet file metadata
+std::shared_ptr<FileMetaData> PARQUET_EXPORT
+ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
+
+/// \brief Scan all values in file. Useful for performance testing
+/// \param[in] columns the column numbers to scan. If empty scans all
+/// \param[in] column_batch_size number of values to read at a time when scanning column
+/// \param[in] reader a ParquetFileReader instance
+/// \return number of semantic rows in file
+PARQUET_EXPORT
+int64_t ScanFileContents(std::vector<int> columns, const int32_t column_batch_size,
+                         ParquetFileReader* reader);
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/file_writer.h b/pyarrow/include/parquet/file_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5ea1d7c98a0ef8509b4821ab111007d7601996b
--- /dev/null
+++ b/pyarrow/include/parquet/file_writer.h
@@ -0,0 +1,245 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "parquet/metadata.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/schema.h"
+
+namespace parquet {
+
+class ColumnWriter;
+
+// FIXME: copied from reader-internal.cc
+static constexpr uint8_t kParquetMagic[4] = {'P', 'A', 'R', '1'};
+static constexpr uint8_t kParquetEMagic[4] = {'P', 'A', 'R', 'E'};
+
+class PARQUET_EXPORT RowGroupWriter {
+ public:
+  // Forward declare a virtual class 'Contents' to aid dependency injection and more
+  // easily create test fixtures
+  // An implementation of the Contents class is defined in the .cc file
+  struct Contents {
+    virtual ~Contents() = default;
+    virtual int num_columns() const = 0;
+    virtual int64_t num_rows() const = 0;
+
+    // to be used only with ParquetFileWriter::AppendRowGroup
+    virtual ColumnWriter* NextColumn() = 0;
+    // to be used only with ParquetFileWriter::AppendBufferedRowGroup
+    virtual ColumnWriter* column(int i) = 0;
+
+    virtual int current_column() const = 0;
+    virtual void Close() = 0;
+
+    /// \brief total uncompressed bytes written by the page writer
+    virtual int64_t total_bytes_written() const = 0;
+    /// \brief total bytes still compressed but not written by the page writer
+    virtual int64_t total_compressed_bytes() const = 0;
+    /// \brief total compressed bytes written by the page writer
+    virtual int64_t total_compressed_bytes_written() const = 0;
+
+    virtual bool buffered() const = 0;
+  };
+
+  explicit RowGroupWriter(std::unique_ptr<Contents> contents);
+
+  /// Construct a ColumnWriter for the indicated row group-relative column.
+  ///
+  /// To be used only with ParquetFileWriter::AppendRowGroup
+  /// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
+  /// valid until the next call to NextColumn or Close. As the contents are
+  /// directly written to the sink, once a new column is started, the contents
+  /// of the previous one cannot be modified anymore.
+  ColumnWriter* NextColumn();
+  /// Index of currently written column. Equal to -1 if NextColumn()
+  /// has not been called yet.
+  int current_column();
+  void Close();
+
+  int num_columns() const;
+
+  /// Construct a ColumnWriter for the indicated row group column.
+  ///
+  /// To be used only with ParquetFileWriter::AppendBufferedRowGroup
+  /// Ownership is solely within the RowGroupWriter. The ColumnWriter is
+  /// valid until Close. The contents are buffered in memory and written to sink
+  /// on Close
+  ColumnWriter* column(int i);
+
+  /**
+   * Number of rows that shall be written as part of this RowGroup.
+   */
+  int64_t num_rows() const;
+
+  /// \brief total uncompressed bytes written by the page writer
+  int64_t total_bytes_written() const;
+  /// \brief total bytes still compressed but not written by the page writer.
+  /// It will always return 0 from the SerializedPageWriter.
+  int64_t total_compressed_bytes() const;
+  /// \brief total compressed bytes written by the page writer
+  int64_t total_compressed_bytes_written() const;
+
+  /// Returns whether the current RowGroupWriter is in the buffered mode and is created
+  /// by calling ParquetFileWriter::AppendBufferedRowGroup.
+  bool buffered() const;
+
+ private:
+  // Holds a pointer to an instance of Contents implementation
+  std::unique_ptr<Contents> contents_;
+};
+
+PARQUET_EXPORT
+void WriteFileMetaData(const FileMetaData& file_metadata,
+                       ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteMetaDataFile(const FileMetaData& file_metadata,
+                       ::arrow::io::OutputStream* sink);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+                                ArrowOutputStream* sink,
+                                const std::shared_ptr<Encryptor>& encryptor,
+                                bool encrypt_footer);
+
+PARQUET_EXPORT
+void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
+                                ::arrow::io::OutputStream* sink,
+                                const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
+                                bool encrypt_footer = false);
+PARQUET_EXPORT
+void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
+                             ::arrow::io::OutputStream* sink);
+
+class PARQUET_EXPORT ParquetFileWriter {
+ public:
+  // Forward declare a virtual class 'Contents' to aid dependency injection and more
+  // easily create test fixtures
+  // An implementation of the Contents class is defined in the .cc file
+  struct Contents {
+    Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
+             std::shared_ptr<const KeyValueMetadata> key_value_metadata)
+        : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
+      schema_.Init(std::move(schema));
+    }
+    virtual ~Contents() {}
+    // Perform any cleanup associated with the file contents
+    virtual void Close() = 0;
+
+    virtual RowGroupWriter* AppendRowGroup() = 0;
+    virtual RowGroupWriter* AppendBufferedRowGroup() = 0;
+
+    virtual int64_t num_rows() const = 0;
+    virtual int num_columns() const = 0;
+    virtual int num_row_groups() const = 0;
+
+    virtual const std::shared_ptr<WriterProperties>& properties() const = 0;
+
+    const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
+      return key_value_metadata_;
+    }
+
+    virtual void AddKeyValueMetadata(
+        const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) = 0;
+
+    // Return const-pointer to make it clear that this object is not to be copied
+    const SchemaDescriptor* schema() const { return &schema_; }
+
+    SchemaDescriptor schema_;
+
+    /// This should be the only place this is stored. Everything else is a const reference
+    std::shared_ptr<const KeyValueMetadata> key_value_metadata_;
+
+    const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }
+    std::shared_ptr<FileMetaData> file_metadata_;
+  };
+
+  ParquetFileWriter();
+  ~ParquetFileWriter();
+
+  static std::unique_ptr<ParquetFileWriter> Open(
+      std::shared_ptr<::arrow::io::OutputStream> sink,
+      std::shared_ptr<schema::GroupNode> schema,
+      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
+      std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);
+
+  void Open(std::unique_ptr<Contents> contents);
+  void Close();
+
+  /// Construct a RowGroupWriter with an arbitrary number of rows.
+  ///
+  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+  RowGroupWriter* AppendRowGroup();
+
+  /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
+  /// Use this if you want to write a RowGroup based on a certain size
+  ///
+  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
+  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
+  RowGroupWriter* AppendBufferedRowGroup();
+
+  /// \brief Add key-value metadata to the file.
+  /// \param[in] key_value_metadata the metadata to add.
+  /// \note This will overwrite any existing metadata with the same key(s).
+  /// \throw ParquetException if Close() has been called.
+  void AddKeyValueMetadata(
+      const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);
+
+  /// Number of columns.
+  ///
+  /// This number is fixed during the lifetime of the writer as it is determined via
+  /// the schema.
+  int num_columns() const;
+
+  /// Number of rows in the yet started RowGroups.
+  ///
+  /// Changes on the addition of a new RowGroup.
+  int64_t num_rows() const;
+
+  /// Number of started RowGroups.
+  int num_row_groups() const;
+
+  /// Configuration passed to the writer, e.g. the used Parquet format version.
+  const std::shared_ptr<WriterProperties>& properties() const;
+
+  /// Returns the file schema descriptor
+  const SchemaDescriptor* schema() const;
+
+  /// Returns a column descriptor in schema
+  const ColumnDescriptor* descr(int i) const;
+
+  /// Returns the file custom metadata
+  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+  /// Returns the file metadata, only available after calling Close().
+  const std::shared_ptr<FileMetaData> metadata() const;
+
+ private:
+  // Holds a pointer to an instance of Contents implementation
+  std::unique_ptr<Contents> contents_;
+  std::shared_ptr<FileMetaData> file_metadata_;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/geospatial/statistics.h b/pyarrow/include/parquet/geospatial/statistics.h
new file mode 100644
index 0000000000000000000000000000000000000000..fb4c31af1b7d9ab45ef012d6b9dedaed05194bfc
--- /dev/null
+++ b/pyarrow/include/parquet/geospatial/statistics.h
@@ -0,0 +1,198 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet::geospatial {
+
+/// \brief The maximum number of dimensions represented by a geospatial type
+/// (i.e., X, Y, Z, and M)
+inline constexpr int kMaxDimensions = 4;
+
+/// \brief NaN, used to represent bounds for which predicate pushdown cannnot
+/// be applied (e.g., because a writer did not provide bounds for a given dimension)
+inline constexpr double kNaN = std::numeric_limits<double>::quiet_NaN();
+
+/// \brief Structure represented encoded statistics to be written to and read from Parquet
+/// serialized metadata.
+///
+/// See the Parquet Thrift definition and GeoStatistics for the specific definition
+/// of field values.
+struct PARQUET_EXPORT EncodedGeoStatistics {
+  bool xy_bounds_present{false};
+  double xmin{kNaN};
+  double xmax{kNaN};
+  double ymin{kNaN};
+  double ymax{kNaN};
+
+  bool z_bounds_present{false};
+  double zmin{kNaN};
+  double zmax{kNaN};
+
+  bool m_bounds_present{false};
+  double mmin{kNaN};
+  double mmax{kNaN};
+
+  bool geospatial_types_present() const { return !geospatial_types.empty(); }
+  std::vector<int32_t> geospatial_types;
+};
+
+class GeoStatisticsImpl;
+
+/// \brief Base type for computing geospatial column statistics while writing a file
+/// or representing them when reading a file
+///
+/// These statistics track the minimum and maximum value (omitting NaN values) of the
+/// four possible dimensions (X, Y, Z, and M) and the distinct set of geometry
+/// type/dimension combinations (e.g., point XY, linestring XYZM) present in the data.
+/// Any of these individual components may be "invalid": for example, when reading a
+/// Parquet file, information about individual components obtained from the column
+/// chunk metadata may have been missing or deemed unusable. Orthogonally,
+/// any of these individual components may be "empty": for example, when using
+/// GeoStatistics to accumulate bounds whilst writing, if all geometries in a column chunk
+/// are null, all ranges (X, Y, Z, and M) will be empty. If all geometries in a column
+/// chunk contain only XY coordinates (the most common case), the Z and M ranges will
+/// be empty but the X and Y ranges will contain finite bounds. Empty ranges are
+/// considered "valid" because they are known to represent exactly zero values (in
+/// contrast to an invalid range, whose contents is completely unknown). These concepts
+/// are all necessary for this object to accurately represent (1) accumulated or partially
+/// accumulated statistics during the writing process and (2) deserialized statistics read
+/// from the column chunk metadata during the reading process.
+///
+/// EXPERIMENTAL
+class PARQUET_EXPORT GeoStatistics {
+ public:
+  GeoStatistics();
+  explicit GeoStatistics(const EncodedGeoStatistics& encoded);
+
+  ~GeoStatistics();
+
+  /// \brief Return true if bounds, geometry types, and validity are identical
+  bool Equals(const GeoStatistics& other) const;
+
+  /// \brief Update these statistics based on previously calculated or decoded statistics
+  ///
+  /// Merging statistics with wraparound X values is not currently supported. Merging
+  /// two GeoStatistics where one or both has a wraparound X range will result in these
+  /// statistics having an X dimension marked as invalid.
+  void Merge(const GeoStatistics& other);
+
+  /// \brief Update these statistics based on values
+  void Update(const ByteArray* values, int64_t num_values);
+
+  /// \brief Update these statistics based on the non-null elements of values
+  void UpdateSpaced(const ByteArray* values, const uint8_t* valid_bits,
+                    int64_t valid_bits_offset, int64_t num_spaced_values,
+                    int64_t num_values);
+
+  /// \brief Update these statistics based on the non-null elements of values
+  ///
+  /// Currently, BinaryArray and LargeBinaryArray input is supported.
+  void Update(const ::arrow::Array& values);
+
+  /// \brief Return these statistics to an empty state
+  void Reset();
+
+  /// \brief Encode the statistics for serializing to Thrift
+  ///
+  /// If invalid WKB was encountered or if the statistics contain NaN
+  /// for any reason, Encode() will return nullopt to indicate that
+  /// statistics should not be written to thrift.
+  std::optional<EncodedGeoStatistics> Encode() const;
+
+  /// \brief Returns false if invalid WKB was encountered
+  bool is_valid() const;
+
+  /// \brief Reset existing statistics and populate them from previously-encoded ones
+  void Decode(const EncodedGeoStatistics& encoded);
+
+  /// \brief Minimum values in XYZM order
+  ///
+  /// For dimensions where dimension_valid() is false, the value will be NaN. For
+  /// dimensions where dimension_empty() is true, the value will be +Inf.
+  ///
+  /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
+  /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf,
+  /// xmin]. This implementation does not yet generate these types of bounds but they may
+  /// be encountered in statistics when reading a Parquet file.
+  std::array<double, kMaxDimensions> lower_bound() const;
+
+  /// \brief Maximum values in XYZM order
+  ///
+  /// For dimensions where dimension_valid() is false, the value will be NaN. For
+  /// dimensions where dimension_empty() is true, the value will be -Inf.
+  ///
+  /// For the first dimension (X) only, wraparound bounds apply where xmin > xmax. In this
+  /// case, these bounds represent the union of the intervals [xmax, Inf] and [-Inf,
+  /// xmin]. This implementation does not yet generate these types of bounds but they may
+  /// be encountered in statistics when reading a Parquet file.
+  std::array<double, kMaxDimensions> upper_bound() const;
+
+  /// \brief Dimension emptiness in XYZM order
+  ///
+  /// True for a given dimension if and only if zero non-NaN values were encountered
+  /// in that dimension and dimension_valid() is true for that dimension.
+  ///
+  /// When calculating statistics, zero or more of these values may be true because
+  /// this implementation calculates bounds for all dimensions; however, it may be
+  /// true that zero coordinates were encountered in a given dimension. For example,
+  /// dimension_empty() will return four true values if Update() was not called
+  /// or if Update() was called with only null values. If Update() was provided
+  /// one or more geometries with X and Y dimensions but not Z or M dimensions,
+  /// dimension_empty() will return true, true, false, false.
+  ///
+  /// For statistics read from a Parquet file, dimension_empty() will always contain
+  /// false values because there is no mechanism to communicate an empty interval
+  /// in the Thrift metadata.
+  std::array<bool, kMaxDimensions> dimension_empty() const;
+
+  /// \brief Dimension validity (i.e. presence) in XYZM order
+  ///
+  /// When calculating statistics, this will always be true because this implementation
+  /// calculates statistics for all dimensions. When reading a Parquet file, one or more
+  /// of these values may be false because the file may not have provided bounds for all
+  /// dimensions.
+  ///
+  /// See documentation for dimension_empty(), lower_bound(), and/or upper_bound() for the
+  /// canonical values of those outputs for the dimensions where dimension_valid() is
+  /// false.
+  std::array<bool, kMaxDimensions> dimension_valid() const;
+
+  /// \brief Return the geometry type codes
+  ///
+  /// This implementation always returns sorted output with no duplicates. When
+  /// calculating statistics, a value will always be returned (although the returned
+  /// vector may be empty if Update() was never called or was only called with null
+  /// values). When reading a Parquet file, std::nullopt may be returned because
+  /// the file may not have provided this information.
+  std::optional<std::vector<int32_t>> geometry_types() const;
+
+  /// \brief Return a string representation of these statistics
+  std::string ToString() const;
+
+ private:
+  std::unique_ptr<GeoStatisticsImpl> impl_;
+};
+
+}  // namespace parquet::geospatial
diff --git a/pyarrow/include/parquet/hasher.h b/pyarrow/include/parquet/hasher.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ff41f17b5e6491529b5985f4fb9bd0808a37454
--- /dev/null
+++ b/pyarrow/include/parquet/hasher.h
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include "parquet/types.h"
+
+namespace parquet {
+// Abstract class for hash
+class PARQUET_EXPORT Hasher {
+ public:
+  /// Compute hash for 32 bits value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(int32_t value) const = 0;
+
+  /// Compute hash for 64 bits value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(int64_t value) const = 0;
+
+  /// Compute hash for float value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(float value) const = 0;
+
+  /// Compute hash for double value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(double value) const = 0;
+
+  /// Compute hash for Int96 value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(const Int96* value) const = 0;
+
+  /// Compute hash for ByteArray value by using its plain encoding result.
+  ///
+  /// @param value the value to hash.
+  /// @return hash result.
+  virtual uint64_t Hash(const ByteArray* value) const = 0;
+
+  /// Compute hash for fixed byte array value by using its plain encoding result.
+  ///
+  /// @param value the value address.
+  /// @param len the value length.
+  virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
+
+  /// Batch compute hashes for 32 bits values by using its plain encoding result.
+  ///
+  /// @param values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for 64 bits values by using its plain encoding result.
+  ///
+  /// @param values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for float values by using its plain encoding result.
+  ///
+  /// @param values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const float* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for double values by using its plain encoding result.
+  ///
+  /// @param values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const double* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for Int96 values by using its plain encoding result.
+  ///
+  /// @param values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const Int96* values, int num_values, uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for ByteArray values by using its plain encoding result.
+  ///
+  /// @param values a pointer to the values to hash.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const ByteArray* values, int num_values,
+                      uint64_t* hashes) const = 0;
+
+  /// Batch compute hashes for fixed byte array values by using its plain encoding result.
+  ///
+  /// @param values the value address.
+  /// @param type_len the value length.
+  /// @param num_values the number of values to hash.
+  /// @param hashes a pointer to the output hash values, its length should be equal to
+  /// num_values.
+  virtual void Hashes(const FLBA* values, uint32_t type_len, int num_values,
+                      uint64_t* hashes) const = 0;
+
+  virtual ~Hasher() = default;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/level_comparison.h b/pyarrow/include/parquet/level_comparison.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ae442dd46e57b7f86b405d9502442d3195719e8
--- /dev/null
+++ b/pyarrow/include/parquet/level_comparison.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+
+#include "parquet/platform.h"
+
+namespace parquet::internal {
+
+/// Builds a  bitmap where each set bit indicates the corresponding level is greater
+/// than rhs.
+uint64_t PARQUET_EXPORT GreaterThanBitmap(const int16_t* levels, int64_t num_levels,
+                                          int16_t rhs);
+
+struct MinMax {
+  int16_t min;
+  int16_t max;
+};
+
+MinMax FindMinMax(const int16_t* levels, int64_t num_levels);
+
+}  // namespace parquet::internal
diff --git a/pyarrow/include/parquet/level_comparison_inc.h b/pyarrow/include/parquet/level_comparison_inc.h
new file mode 100644
index 0000000000000000000000000000000000000000..04f628d53311166233bd24158de07a7a12003f61
--- /dev/null
+++ b/pyarrow/include/parquet/level_comparison_inc.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "arrow/util/bit_util.h"
+#include "arrow/util/endian.h"
+#include "parquet/level_comparison.h"
+
+// Used to make sure ODR rule isn't violated.
+#ifndef PARQUET_IMPL_NAMESPACE
+#  error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+namespace parquet::internal::PARQUET_IMPL_NAMESPACE {
+/// Builds a bitmap by applying predicate to the level vector provided.
+///
+/// \param[in] levels Rep or def level array.
+/// \param[in] num_levels The number of levels to process (must be [0, 64])
+/// \param[in] predicate The predicate to apply (must have the signature `bool
+/// predicate(int16_t)`.
+/// \returns The bitmap using least significant "bit" ordering.
+///
+template <typename Predicate>
+inline uint64_t LevelsToBitmap(const int16_t* levels, int64_t num_levels,
+                               Predicate predicate) {
+  // Both clang and GCC can vectorize this automatically with SSE4/AVX2.
+  uint64_t mask = 0;
+  for (int x = 0; x < num_levels; x++) {
+    mask |= static_cast<uint64_t>(predicate(levels[x]) ? 1 : 0) << x;
+  }
+  return ::arrow::bit_util::ToLittleEndian(mask);
+}
+
+inline MinMax FindMinMaxImpl(const int16_t* levels, int64_t num_levels) {
+  MinMax out{std::numeric_limits<int16_t>::max(), std::numeric_limits<int16_t>::min()};
+  for (int x = 0; x < num_levels; x++) {
+    out.min = std::min(levels[x], out.min);
+    out.max = std::max(levels[x], out.max);
+  }
+  return out;
+}
+
+inline uint64_t GreaterThanBitmapImpl(const int16_t* levels, int64_t num_levels,
+                                      int16_t rhs) {
+  return LevelsToBitmap(levels, num_levels, [rhs](int16_t value) { return value > rhs; });
+}
+
+}  // namespace parquet::internal::PARQUET_IMPL_NAMESPACE
diff --git a/pyarrow/include/parquet/level_conversion.h b/pyarrow/include/parquet/level_conversion.h
new file mode 100644
index 0000000000000000000000000000000000000000..31de95be41c473814c52cd9d2f5902d63f1b944b
--- /dev/null
+++ b/pyarrow/include/parquet/level_conversion.h
@@ -0,0 +1,216 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/util/endian.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+
+namespace parquet::internal {
+
+struct PARQUET_EXPORT LevelInfo {
+  LevelInfo()
+      : null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
+  LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
+            int32_t repeated_ancestor_definition_level)
+      : null_slot_usage(null_slots),
+        def_level(static_cast<int16_t>(definition_level)),
+        rep_level(static_cast<int16_t>(repetition_level)),
+        repeated_ancestor_def_level(
+            static_cast<int16_t>(repeated_ancestor_definition_level)) {}
+
+  bool operator==(const LevelInfo& b) const {
+    return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
+           rep_level == b.rep_level &&
+           repeated_ancestor_def_level == b.repeated_ancestor_def_level;
+  }
+
+  bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
+
+  // How many slots an undefined but present (i.e. null) element in
+  // parquet consumes when decoding to Arrow.
+  // "Slot" is used in the same context as the Arrow specification
+  // (i.e. a value holder).
+  // This is only ever >1 for descendents of FixedSizeList.
+  int32_t null_slot_usage = 1;
+
+  // The definition level at which the value for the field
+  // is considered not null (definition levels greater than
+  // or equal to this value indicate a not-null
+  // value for the field). For list fields definition levels
+  // greater than or equal to this field indicate a present,
+  // possibly null, child value.
+  int16_t def_level = 0;
+
+  // The repetition level corresponding to this element
+  // or the closest repeated ancestor.  Any repetition
+  // level less than this indicates either a new list OR
+  // an empty list (which is determined in conjunction
+  // with definition levels).
+  int16_t rep_level = 0;
+
+  // The definition level indicating the level at which the closest
+  // repeated ancestor is not empty.  This is used to discriminate
+  // between a value less than |def_level| being null or excluded entirely.
+  // For instance if we have an arrow schema like:
+  // list(struct(f0: int)).  Then then there are the following
+  // definition levels:
+  //   0 = null list
+  //   1 = present but empty list.
+  //   2 = a null value in the list
+  //   3 = a non null struct but null integer.
+  //   4 = a present integer.
+  // When reconstructing, the struct and integer arrays'
+  // repeated_ancestor_def_level would be 2.  Any
+  // def_level < 2 indicates that there isn't a corresponding
+  // child value in the list.
+  // i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
+  // has the def levels [0, 1, 2, 3, 4].  The actual
+  // struct array is only of length 3: [not-set, set, set] and
+  // the int array is also of length 3: [N/A, null, 1].
+  //
+  int16_t repeated_ancestor_def_level = 0;
+
+  /// Increments levels according to the cardinality of node.
+  void Increment(const schema::Node& node) {
+    if (node.is_repeated()) {
+      IncrementRepeated();
+      return;
+    }
+    if (node.is_optional()) {
+      IncrementOptional();
+      return;
+    }
+  }
+
+  /// Increments level for a optional node.
+  void IncrementOptional() { def_level++; }
+
+  /// Increments levels for the repeated node.  Returns
+  /// the previous ancestor_list_def_level.
+  int16_t IncrementRepeated() {
+    int16_t last_repeated_ancestor = repeated_ancestor_def_level;
+
+    // Repeated fields add both a repetition and definition level. This is used
+    // to distinguish between an empty list and a list with an item in it.
+    ++rep_level;
+    ++def_level;
+    // For levels >= repeated_ancestor_def_level it indicates the list was
+    // non-null and had at least one element.  This is important
+    // for later decoding because we need to add a slot for these
+    // values.  for levels < current_def_level no slots are added
+    // to arrays.
+    repeated_ancestor_def_level = def_level;
+    return last_repeated_ancestor;
+  }
+
+  // Calculates and returns LevelInfo for a column descriptor.
+  static LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
+    LevelInfo level_info;
+    level_info.def_level = descr->max_definition_level();
+    level_info.rep_level = descr->max_repetition_level();
+
+    int16_t min_spaced_def_level = descr->max_definition_level();
+    const ::parquet::schema::Node* node = descr->schema_node().get();
+    while (node && !node->is_repeated()) {
+      if (node->is_optional()) {
+        min_spaced_def_level--;
+      }
+      node = node->parent();
+    }
+    level_info.repeated_ancestor_def_level = min_spaced_def_level;
+    return level_info;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
+    // This print method is to silence valgrind issues.  What's printed
+    // is not important because all asserts happen directly on
+    // members.
+    os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
+       << ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
+    if (levels.null_slot_usage > 1) {
+      os << ", null_slot_usage=" << levels.null_slot_usage;
+    }
+    os << "}";
+    return os;
+  }
+};
+
+// Input/Output structure for reconstructed validity bitmaps.
+struct PARQUET_EXPORT ValidityBitmapInputOutput {
+  // Input only.
+  // The maximum number of values_read expected (actual
+  // values read must be less than or equal to this value).
+  // If this number is exceeded methods will throw a
+  // ParquetException. Exceeding this limit indicates
+  // either a corrupt or incorrectly written file.
+  int64_t values_read_upper_bound = 0;
+  // Output only. The number of values added to the encountered
+  // (this is logically the count of the number of elements
+  // for an Arrow array).
+  int64_t values_read = 0;
+  // Input/Output. The number of nulls encountered.
+  int64_t null_count = 0;
+  // Output only. The validity bitmap to populate. Maybe be null only
+  // for DefRepLevelsToListInfo (if all that is needed is list offsets).
+  uint8_t* valid_bits = NULLPTR;
+  // Input only, offset into valid_bits to start at.
+  int64_t valid_bits_offset = 0;
+};
+
+//  Converts def_levels to validity bitmaps for non-list arrays and structs that have
+//  at least one member that is not a list and has no list descendents.
+//  For lists use DefRepLevelsToList and structs where all descendants contain
+//  a list use DefRepLevelsToBitmap.
+void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
+                                      LevelInfo level_info,
+                                      ValidityBitmapInputOutput* output);
+
+// Reconstructs a validity bitmap and list offsets for a list arrays based on
+// def/rep levels. The first element of offsets will not be modified if rep_levels
+// starts with a new list.  The first element of offsets will be used when calculating
+// the next offset.  See documentation onf DefLevelsToBitmap for when to use this
+// method vs the other ones in this file for reconstruction.
+//
+// Offsets must be sized to 1 + values_read_upper_bound.
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+                                       const int16_t* rep_levels, int64_t num_def_levels,
+                                       LevelInfo level_info,
+                                       ValidityBitmapInputOutput* output,
+                                       int32_t* offsets);
+void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
+                                       const int16_t* rep_levels, int64_t num_def_levels,
+                                       LevelInfo level_info,
+                                       ValidityBitmapInputOutput* output,
+                                       int64_t* offsets);
+
+// Reconstructs a validity bitmap for a struct every member is a list or has
+// a list descendant.  See documentation on DefLevelsToBitmap for when more
+// details on this method compared to the other ones defined above.
+void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
+                                         const int16_t* rep_levels,
+                                         int64_t num_def_levels, LevelInfo level_info,
+                                         ValidityBitmapInputOutput* output);
+
+// This is exposed to ensure we can properly test a software simulated pext function
+// (i.e. it isn't hidden by runtime dispatch).
+uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
+
+}  // namespace parquet::internal
diff --git a/pyarrow/include/parquet/level_conversion_inc.h b/pyarrow/include/parquet/level_conversion_inc.h
new file mode 100644
index 0000000000000000000000000000000000000000..335f5b92154b389babb15ac644b6d3b51076fcaa
--- /dev/null
+++ b/pyarrow/include/parquet/level_conversion_inc.h
@@ -0,0 +1,355 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+
+#include "parquet/level_conversion.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+
+#include "arrow/util/bit_run_reader.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_writer.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/simd.h"
+#include "parquet/exception.h"
+#include "parquet/level_comparison.h"
+
+#ifndef PARQUET_IMPL_NAMESPACE
+#  error "PARQUET_IMPL_NAMESPACE must be defined"
+#endif
+
+namespace parquet::internal::PARQUET_IMPL_NAMESPACE {
+
+// clang-format off
+/* Python code to generate lookup table:
+
+kLookupBits = 5
+count = 0
+print('constexpr int kLookupBits = {};'.format(kLookupBits))
+print('constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {')
+print(' ', end = '')
+for mask in range(1 << kLookupBits):
+    for data in range(1 << kLookupBits):
+        bit_value = 0
+        bit_len = 0
+        for i in range(kLookupBits):
+            if mask & (1 << i):
+                bit_value |= (((data >> i) & 1) << bit_len)
+                bit_len += 1
+        out = '0x{:02X},'.format(bit_value)
+        count += 1
+        if count % (1 << kLookupBits) == 1:
+            print(' {')
+        if count % 8 == 1:
+            print('    ', end = '')
+        if count % 8 == 0:
+            print(out, end = '\n')
+        else:
+            print(out, end = ' ')
+        if count % (1 << kLookupBits) == 0:
+            print('  },', end = '')
+print('\n};')
+
+*/
+// clang-format on
+
+constexpr int kLookupBits = 5;
+constexpr uint8_t kPextTable[1 << kLookupBits][1 << kLookupBits] = {
+    {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+        0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+        0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+        0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+        0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+        0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+        0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+        0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+        0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+        0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+        0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+        0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+        0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+        0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+        0x03, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
+        0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+        0x03, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00,
+        0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+        0x07, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01,
+        0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+        0x02, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
+        0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+        0x05, 0x06, 0x07, 0x06, 0x07, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03,
+        0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+        0x05, 0x06, 0x06, 0x07, 0x07, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02,
+        0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+        0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+        0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
+        0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+        0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03, 0x02, 0x03,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01,
+        0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02,
+        0x03, 0x03, 0x02, 0x02, 0x03, 0x03, 0x02, 0x02, 0x03, 0x03,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02,
+        0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05,
+        0x06, 0x07, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00,
+        0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03,
+        0x03, 0x03, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x00, 0x01, 0x00,
+        0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07,
+        0x06, 0x07, 0x04, 0x05, 0x04, 0x05, 0x06, 0x07, 0x06, 0x07,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x00, 0x00, 0x01,
+        0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06,
+        0x07, 0x07, 0x04, 0x04, 0x05, 0x05, 0x06, 0x06, 0x07, 0x07,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02,
+        0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+        0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01,
+        0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+        0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02,
+        0x03, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04, 0x05, 0x04, 0x05,
+        0x04, 0x05, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03,
+        0x03, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05, 0x05, 0x04, 0x04,
+        0x05, 0x05, 0x06, 0x06, 0x07, 0x07, 0x06, 0x06, 0x07, 0x07,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+        0x07, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09,
+        0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F,
+    },
+    {
+        0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02,
+        0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x05, 0x05,
+        0x05, 0x05, 0x06, 0x06, 0x06, 0x06, 0x07, 0x07, 0x07, 0x07,
+    },
+    {
+        0x00, 0x01, 0x00, 0x01, 0x02, 0x03, 0x02, 0x03, 0x04, 0x05, 0x04,
+        0x05, 0x06, 0x07, 0x06, 0x07, 0x08, 0x09, 0x08, 0x09, 0x0A, 0x0B,
+        0x0A, 0x0B, 0x0C, 0x0D, 0x0C, 0x0D, 0x0E, 0x0F, 0x0E, 0x0F,
+    },
+    {
+        0x00, 0x00, 0x01, 0x01, 0x02, 0x02, 0x03, 0x03, 0x04, 0x04, 0x05,
+        0x05, 0x06, 0x06, 0x07, 0x07, 0x08, 0x08, 0x09, 0x09, 0x0A, 0x0A,
+        0x0B, 0x0B, 0x0C, 0x0C, 0x0D, 0x0D, 0x0E, 0x0E, 0x0F, 0x0F,
+    },
+    {
+        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
+        0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
+        0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+    },
+};
+
+inline uint64_t ExtractBitsSoftware(uint64_t bitmap, uint64_t select_bitmap) {
+  // A software emulation of _pext_u64
+
+  // These checks should be inline and are likely to be common cases.
+  if (select_bitmap == ~uint64_t{0}) {
+    return bitmap;
+  } else if (select_bitmap == 0) {
+    return 0;
+  }
+
+  // Fallback to lookup table method
+  uint64_t bit_value = 0;
+  int bit_len = 0;
+  constexpr uint8_t kLookupMask = (1U << kLookupBits) - 1;
+  while (select_bitmap != 0) {
+    const auto mask_len = ARROW_POPCOUNT32(select_bitmap & kLookupMask);
+    const uint64_t value = kPextTable[select_bitmap & kLookupMask][bitmap & kLookupMask];
+    bit_value |= (value << bit_len);
+    bit_len += mask_len;
+    bitmap >>= kLookupBits;
+    select_bitmap >>= kLookupBits;
+  }
+  return bit_value;
+}
+
+#ifdef ARROW_HAVE_BMI2
+
+// Use _pext_u64 on 64-bit builds, _pext_u32 on 32-bit builds,
+#  if UINTPTR_MAX == 0xFFFFFFFF
+
+using extract_bitmap_t = uint32_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+                                    extract_bitmap_t select_bitmap) {
+  return _pext_u32(bitmap, select_bitmap);
+}
+
+#  else
+
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+                                    extract_bitmap_t select_bitmap) {
+  return _pext_u64(bitmap, select_bitmap);
+}
+
+#  endif
+
+#else  // !defined(ARROW_HAVE_BMI2)
+
+// Use 64-bit pext emulation when BMI2 isn't available.
+using extract_bitmap_t = uint64_t;
+inline extract_bitmap_t ExtractBits(extract_bitmap_t bitmap,
+                                    extract_bitmap_t select_bitmap) {
+  return ExtractBitsSoftware(bitmap, select_bitmap);
+}
+
+#endif
+
+static constexpr int64_t kExtractBitsSize = 8 * sizeof(extract_bitmap_t);
+
+template <bool has_repeated_parent>
+int64_t DefLevelsBatchToBitmap(const int16_t* def_levels, const int64_t batch_size,
+                               int64_t upper_bound_remaining, LevelInfo level_info,
+                               ::arrow::internal::FirstTimeBitmapWriter* writer) {
+  ARROW_DCHECK_LE(batch_size, kExtractBitsSize);
+
+  // Greater than level_info.def_level - 1 implies >= the def_level
+  auto defined_bitmap = static_cast<extract_bitmap_t>(::arrow::bit_util::FromLittleEndian(
+      internal::GreaterThanBitmap(def_levels, batch_size, level_info.def_level - 1)));
+
+  if (has_repeated_parent) {
+    // Greater than level_info.repeated_ancestor_def_level - 1 implies >= the
+    // repeated_ancestor_def_level
+    auto present_bitmap = static_cast<extract_bitmap_t>(
+        ::arrow::bit_util::FromLittleEndian(internal::GreaterThanBitmap(
+            def_levels, batch_size, level_info.repeated_ancestor_def_level - 1)));
+    auto selected_bits = ExtractBits(defined_bitmap, present_bitmap);
+    int64_t selected_count = ::arrow::bit_util::PopCount(present_bitmap);
+    if (ARROW_PREDICT_FALSE(selected_count > upper_bound_remaining)) {
+      throw ParquetException("Values read exceeded upper bound");
+    }
+    writer->AppendWord(selected_bits, selected_count);
+    return ::arrow::bit_util::PopCount(selected_bits);
+  } else {
+    if (ARROW_PREDICT_FALSE(batch_size > upper_bound_remaining)) {
+      std::stringstream ss;
+      ss << "Values read exceeded upper bound";
+      throw ParquetException(ss.str());
+    }
+
+    writer->AppendWord(defined_bitmap, batch_size);
+    return ::arrow::bit_util::PopCount(defined_bitmap);
+  }
+}
+
+template <bool has_repeated_parent>
+void DefLevelsToBitmapSimd(const int16_t* def_levels, int64_t num_def_levels,
+                           LevelInfo level_info, ValidityBitmapInputOutput* output) {
+  ::arrow::internal::FirstTimeBitmapWriter writer(
+      output->valid_bits,
+      /*start_offset=*/output->valid_bits_offset,
+      /*length=*/output->values_read_upper_bound);
+  int64_t set_count = 0;
+  output->values_read = 0;
+  int64_t values_read_remaining = output->values_read_upper_bound;
+  while (num_def_levels > kExtractBitsSize) {
+    set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+        def_levels, kExtractBitsSize, values_read_remaining, level_info, &writer);
+    def_levels += kExtractBitsSize;
+    num_def_levels -= kExtractBitsSize;
+    values_read_remaining = output->values_read_upper_bound - writer.position();
+  }
+  set_count += DefLevelsBatchToBitmap<has_repeated_parent>(
+      def_levels, num_def_levels, values_read_remaining, level_info, &writer);
+
+  output->values_read = writer.position();
+  output->null_count += output->values_read - set_count;
+  writer.Finish();
+}
+
+}  // namespace parquet::internal::PARQUET_IMPL_NAMESPACE
diff --git a/pyarrow/include/parquet/metadata.h b/pyarrow/include/parquet/metadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..1235aae9ad7241436e122aa209ba09516a1cb550
--- /dev/null
+++ b/pyarrow/include/parquet/metadata.h
@@ -0,0 +1,560 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <optional>
+#include <span>
+#include <string>
+#include <vector>
+
+#include "parquet/encryption/type_fwd.h"
+#include "parquet/platform.h"
+#include "parquet/properties.h"
+#include "parquet/type_fwd.h"
+
+namespace parquet {
+
+using KeyValueMetadata = ::arrow::KeyValueMetadata;
+
+class PARQUET_EXPORT ApplicationVersion {
+ public:
+  // Known Versions with Issues
+  static const ApplicationVersion& PARQUET_251_FIXED_VERSION();
+  static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
+  static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
+  static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
+  static const ApplicationVersion& PARQUET_CPP_10353_FIXED_VERSION();
+
+  // Application that wrote the file. e.g. "IMPALA"
+  std::string application_;
+  // Build name
+  std::string build_;
+
+  // Version of the application that wrote the file, expressed as
+  // (<major>.<minor>.<patch>). Unmatched parts default to 0.
+  // "1.2.3"    => {1, 2, 3}
+  // "1.2"      => {1, 2, 0}
+  // "1.2-cdh5" => {1, 2, 0}
+  struct {
+    int major;
+    int minor;
+    int patch;
+    std::string unknown;
+    std::string pre_release;
+    std::string build_info;
+  } version;
+
+  ApplicationVersion() = default;
+  explicit ApplicationVersion(const std::string& created_by);
+  ApplicationVersion(std::string application, int major, int minor, int patch);
+
+  // Returns true if version is strictly less than other_version
+  bool VersionLt(const ApplicationVersion& other_version) const;
+
+  // Returns true if version is strictly equal with other_version
+  bool VersionEq(const ApplicationVersion& other_version) const;
+
+  // Checks if the Version has the correct statistics for a given column
+  bool HasCorrectStatistics(Type::type primitive, const EncodedStatistics& statistics,
+                            SortOrder::type sort_order = SortOrder::SIGNED) const;
+};
+
+class PARQUET_EXPORT ColumnCryptoMetaData {
+ public:
+  static std::unique_ptr<ColumnCryptoMetaData> Make(const uint8_t* metadata);
+  ~ColumnCryptoMetaData();
+
+  bool Equals(const ColumnCryptoMetaData& other) const;
+
+  std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+  bool encrypted_with_footer_key() const;
+  const std::string& key_metadata() const;
+
+ private:
+  explicit ColumnCryptoMetaData(const uint8_t* metadata);
+
+  class ColumnCryptoMetaDataImpl;
+  std::unique_ptr<ColumnCryptoMetaDataImpl> impl_;
+};
+
+/// \brief Public struct for Thrift PageEncodingStats in ColumnChunkMetaData
+struct PageEncodingStats {
+  PageType::type page_type;
+  Encoding::type encoding;
+  int32_t count;
+};
+
+/// \brief Public struct for location to page index in ColumnChunkMetaData.
+struct IndexLocation {
+  /// File offset of the given index, in bytes
+  int64_t offset;
+  /// Length of the given index, in bytes
+  int32_t length;
+};
+
+/// \brief ColumnChunkMetaData is a proxy around format::ColumnChunkMetaData.
+class PARQUET_EXPORT ColumnChunkMetaData {
+ public:
+  // API convenience to get a MetaData accessor
+  static std::unique_ptr<ColumnChunkMetaData> Make(
+      const void* metadata, const ColumnDescriptor* descr,
+      const ReaderProperties& properties = default_reader_properties(),
+      const ApplicationVersion* writer_version = NULLPTR, int16_t row_group_ordinal = -1,
+      int16_t column_ordinal = -1,
+      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+  ~ColumnChunkMetaData();
+
+  bool Equals(const ColumnChunkMetaData& other) const;
+
+  // Byte offset of `ColumnMetaData` in `file_path()`.
+  //
+  // Note that the meaning of this field has been inconsistent among implementations
+  // so its use has since been deprecated in the Parquet specification. Modern
+  // implementations will set this to `0` to indicate that the `ColumnMetaData` is solely
+  // contained in the `ColumnChunk` struct.
+  int64_t file_offset() const;
+
+  // parameter is only used when a dataset is spread across multiple files
+  const std::string& file_path() const;
+
+  // column metadata
+  bool is_metadata_set() const;
+  Type::type type() const;
+  int64_t num_values() const;
+  std::shared_ptr<schema::ColumnPath> path_in_schema() const;
+  bool is_stats_set() const;
+  bool is_geo_stats_set() const;
+  std::shared_ptr<Statistics> statistics() const;
+  std::shared_ptr<EncodedStatistics> encoded_statistics() const;
+  std::shared_ptr<SizeStatistics> size_statistics() const;
+  std::shared_ptr<geospatial::GeoStatistics> geo_statistics() const;
+
+  Compression::type compression() const;
+  // Indicate if the ColumnChunk compression is supported by the current
+  // compiled parquet library.
+  bool can_decompress() const;
+
+  const std::vector<Encoding::type>& encodings() const;
+  const std::vector<PageEncodingStats>& encoding_stats() const;
+  std::optional<int64_t> bloom_filter_offset() const;
+  std::optional<int64_t> bloom_filter_length() const;
+  bool has_dictionary_page() const;
+  int64_t dictionary_page_offset() const;
+  int64_t data_page_offset() const;
+  bool has_index_page() const;
+  int64_t index_page_offset() const;
+  int64_t total_compressed_size() const;
+  int64_t total_uncompressed_size() const;
+  std::unique_ptr<ColumnCryptoMetaData> crypto_metadata() const;
+  std::optional<IndexLocation> GetColumnIndexLocation() const;
+  std::optional<IndexLocation> GetOffsetIndexLocation() const;
+  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+ private:
+  explicit ColumnChunkMetaData(
+      const void* metadata, const ColumnDescriptor* descr, int16_t row_group_ordinal,
+      int16_t column_ordinal, const ReaderProperties& properties,
+      const ApplicationVersion* writer_version = NULLPTR,
+      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+  // PIMPL Idiom
+  class ColumnChunkMetaDataImpl;
+  std::unique_ptr<ColumnChunkMetaDataImpl> impl_;
+};
+
+/// \brief RowGroupMetaData is a proxy around format::RowGroupMetaData.
+class PARQUET_EXPORT RowGroupMetaData {
+ public:
+  /// \brief Create a RowGroupMetaData from a serialized thrift message.
+  static std::unique_ptr<RowGroupMetaData> Make(
+      const void* metadata, const SchemaDescriptor* schema,
+      const ReaderProperties& properties = default_reader_properties(),
+      const ApplicationVersion* writer_version = NULLPTR,
+      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+  ~RowGroupMetaData();
+
+  bool Equals(const RowGroupMetaData& other) const;
+
+  /// \brief The number of columns in this row group. The order must match the
+  /// parent's column ordering.
+  int num_columns() const;
+
+  /// \brief Return the ColumnChunkMetaData of the corresponding column ordinal.
+  ///
+  /// WARNING, the returned object references memory location in it's parent
+  /// (RowGroupMetaData) object. Hence, the parent must outlive the returned
+  /// object.
+  ///
+  /// \param[in] index of the ColumnChunkMetaData to retrieve.
+  ///
+  /// \throws ParquetException if the index is out of bound.
+  std::unique_ptr<ColumnChunkMetaData> ColumnChunk(int index) const;
+
+  /// \brief Number of rows in this row group.
+  int64_t num_rows() const;
+
+  /// \brief Total byte size of all the uncompressed column data in this row group.
+  int64_t total_byte_size() const;
+
+  /// \brief Total byte size of all the compressed (and potentially encrypted)
+  /// column data in this row group.
+  ///
+  /// This information is optional and may be 0 if omitted.
+  int64_t total_compressed_size() const;
+
+  /// \brief Byte offset from beginning of file to first page (data or
+  /// dictionary) in this row group
+  ///
+  /// The file_offset field that this method exposes is optional. This method
+  /// will return 0 if that field is not set to a meaningful value.
+  int64_t file_offset() const;
+  // Return const-pointer to make it clear that this object is not to be copied
+  const SchemaDescriptor* schema() const;
+  // Indicate if all of the RowGroup's ColumnChunks can be decompressed.
+  bool can_decompress() const;
+  // Sorting columns of the row group if any.
+  std::vector<SortingColumn> sorting_columns() const;
+
+ private:
+  explicit RowGroupMetaData(
+      const void* metadata, const SchemaDescriptor* schema,
+      const ReaderProperties& properties,
+      const ApplicationVersion* writer_version = NULLPTR,
+      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+  // PIMPL Idiom
+  class RowGroupMetaDataImpl;
+  std::unique_ptr<RowGroupMetaDataImpl> impl_;
+};
+
+class FileMetaDataBuilder;
+
+/// \brief FileMetaData is a proxy around format::FileMetaData.
+class PARQUET_EXPORT FileMetaData {
+ public:
+  /// \brief Create a FileMetaData from a serialized thrift message.
+  static std::shared_ptr<FileMetaData> Make(
+      const void* serialized_metadata, uint32_t* inout_metadata_len,
+      const ReaderProperties& properties = default_reader_properties(),
+      std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+  ~FileMetaData();
+
+  bool Equals(const FileMetaData& other) const;
+
+  /// \brief The number of parquet "leaf" columns.
+  ///
+  /// Parquet thrift definition requires that nested schema elements are
+  /// flattened. This method returns the number of columns in the flattened
+  /// version.
+  /// For instance, if the schema looks like this :
+  /// 0 foo.bar
+  ///       foo.bar.baz           0
+  ///       foo.bar.baz2          1
+  ///   foo.qux                   2
+  /// 1 foo2                      3
+  /// 2 foo3                      4
+  /// This method will return 5, because there are 5 "leaf" fields (so 5
+  /// flattened fields)
+  int num_columns() const;
+
+  /// \brief The number of flattened schema elements.
+  ///
+  /// Parquet thrift definition requires that nested schema elements are
+  /// flattened. This method returns the total number of elements in the
+  /// flattened list.
+  int num_schema_elements() const;
+
+  /// \brief The total number of rows.
+  ///
+  /// If the FileMetaData was obtained by calling `SubSet()`, this is the total
+  /// number of rows in the selected row groups.
+  int64_t num_rows() const;
+
+  /// \brief The number of row groups in the file.
+  ///
+  /// If the FileMetaData was obtained by calling `SubSet()`, this is the number
+  /// of selected row groups.
+  int num_row_groups() const;
+
+  /// \brief Return the RowGroupMetaData of the corresponding row group ordinal.
+  ///
+  /// WARNING, the returned object references memory location in it's parent
+  /// (FileMetaData) object. Hence, the parent must outlive the returned object.
+  ///
+  /// \param[in] index of the RowGroup to retrieve.
+  ///
+  /// \throws ParquetException if the index is out of bound.
+  std::unique_ptr<RowGroupMetaData> RowGroup(int index) const;
+
+  /// \brief Return the "version" of the file
+  ///
+  /// WARNING: The value returned by this method is unreliable as 1) the Parquet
+  /// file metadata stores the version as a single integer and 2) some producers
+  /// are known to always write a hardcoded value.  Therefore, you cannot use
+  /// this value to know which features are used in the file.
+  ParquetVersion::type version() const;
+
+  /// \brief Return the application's user-agent string of the writer.
+  const std::string& created_by() const;
+
+  /// \brief Return the application's version of the writer.
+  const ApplicationVersion& writer_version() const;
+
+  /// \brief Size of the original thrift encoded metadata footer.
+  uint32_t size() const;
+
+  /// \brief Indicate if all of the FileMetaData's RowGroups can be decompressed.
+  ///
+  /// This will return false if any of the RowGroup's page is compressed with a
+  /// compression format which is not compiled in the current parquet library.
+  bool can_decompress() const;
+
+  bool is_encryption_algorithm_set() const;
+  EncryptionAlgorithm encryption_algorithm() const;
+  const std::string& footer_signing_key_metadata() const;
+
+  PARQUET_DEPRECATED(
+      "Deprecated in 24.0.0. If you need this functionality, please report an issue.")
+  bool VerifySignature(const void* signature);
+
+  void WriteTo(::arrow::io::OutputStream* dst,
+               const std::shared_ptr<Encryptor>& encryptor = NULLPTR) const;
+
+  /// \brief Return Thrift-serialized representation of the metadata as a
+  /// string
+  std::string SerializeToString() const;
+
+  // Return const-pointer to make it clear that this object is not to be copied
+  const SchemaDescriptor* schema() const;
+
+  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;
+
+  /// \brief Set a path to all ColumnChunk for all RowGroups.
+  ///
+  /// Commonly used by systems (Dask, Spark) who generates an metadata-only
+  /// parquet file. The path is usually relative to said index file.
+  ///
+  /// \param[in] path to set.
+  void set_file_path(const std::string& path);
+
+  /// \brief Merge row groups from another metadata file into this one.
+  ///
+  /// The schema of the input FileMetaData must be equal to the
+  /// schema of this object.
+  ///
+  /// This is used by systems who creates an aggregate metadata-only file by
+  /// concatenating the row groups of multiple files. This newly created
+  /// metadata file acts as an index of all available row groups.
+  ///
+  /// \param[in] other FileMetaData to merge the row groups from.
+  ///
+  /// \throws ParquetException if schemas are not equal.
+  void AppendRowGroups(const FileMetaData& other);
+
+  /// \brief Return a FileMetaData containing a subset of the row groups in this
+  /// FileMetaData.
+  std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;
+
+  /// \brief Serialize metadata unencrypted as string
+  ///
+  /// \param[in] scrub whether to remove sensitive information from the metadata.
+  /// \param[in] debug whether to serialize the metadata as Thrift (if false) or
+  /// debug text (if true).
+  std::string SerializeUnencrypted(bool scrub, bool debug) const;
+
+ private:
+  friend FileMetaDataBuilder;
+  friend class SerializedFile;
+  friend class SerializedRowGroup;
+
+  explicit FileMetaData(const void* serialized_metadata, uint32_t* metadata_len,
+                        const ReaderProperties& properties,
+                        std::shared_ptr<InternalFileDecryptor> file_decryptor = NULLPTR);
+
+  void set_file_decryptor(std::shared_ptr<InternalFileDecryptor> file_decryptor);
+  const std::shared_ptr<InternalFileDecryptor>& file_decryptor() const;
+
+  // Verify the signature of a plaintext footer.
+  static bool VerifySignature(std::span<const uint8_t> serialized_metadata,
+                              std::span<const uint8_t> signature,
+                              InternalFileDecryptor* file_decryptor);
+
+  // PIMPL Idiom
+  FileMetaData();
+  class FileMetaDataImpl;
+  std::unique_ptr<FileMetaDataImpl> impl_;
+};
+
+class PARQUET_EXPORT FileCryptoMetaData {
+ public:
+  // API convenience to get a MetaData accessor
+  static std::shared_ptr<FileCryptoMetaData> Make(
+      const uint8_t* serialized_metadata, uint32_t* metadata_len,
+      const ReaderProperties& properties = default_reader_properties());
+  ~FileCryptoMetaData();
+
+  EncryptionAlgorithm encryption_algorithm() const;
+  const std::string& key_metadata() const;
+
+  void WriteTo(::arrow::io::OutputStream* dst) const;
+
+ private:
+  friend FileMetaDataBuilder;
+  FileCryptoMetaData(const uint8_t* serialized_metadata, uint32_t* metadata_len,
+                     const ReaderProperties& properties);
+
+  // PIMPL Idiom
+  FileCryptoMetaData();
+  class FileCryptoMetaDataImpl;
+  std::unique_ptr<FileCryptoMetaDataImpl> impl_;
+};
+
+// Builder API
+class PARQUET_EXPORT ColumnChunkMetaDataBuilder {
+ public:
+  // API convenience to get a MetaData reader
+  static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+      std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column);
+
+  static std::unique_ptr<ColumnChunkMetaDataBuilder> Make(
+      std::shared_ptr<WriterProperties> props, const ColumnDescriptor* column,
+      void* contents);
+
+  ~ColumnChunkMetaDataBuilder();
+
+  // column chunk
+  // Used when a dataset is spread across multiple files
+  void set_file_path(const std::string& path);
+
+  // column metadata
+  void SetStatistics(const EncodedStatistics& stats);
+  void SetSizeStatistics(const SizeStatistics& size_stats);
+
+  // column geometry statistics
+  void SetGeoStatistics(const geospatial::EncodedGeoStatistics& geo_stats);
+
+  void SetKeyValueMetadata(std::shared_ptr<const KeyValueMetadata> key_value_metadata);
+
+  // get the column descriptor
+  const ColumnDescriptor* descr() const;
+
+  int64_t total_compressed_size() const;
+  // commit the metadata
+
+  void Finish(int64_t num_values, int64_t dictionary_page_offset,
+              int64_t index_page_offset, int64_t data_page_offset,
+              int64_t compressed_size, int64_t uncompressed_size, bool has_dictionary,
+              bool dictionary_fallback,
+              const std::map<Encoding::type, int32_t>& dict_encoding_stats_,
+              const std::map<Encoding::type, int32_t>& data_encoding_stats_,
+              const std::shared_ptr<Encryptor>& encryptor = NULLPTR);
+
+  // The metadata contents, suitable for passing to ColumnChunkMetaData::Make
+  const void* contents() const;
+
+  // For writing metadata at end of column chunk
+  void WriteTo(::arrow::io::OutputStream* sink);
+
+ private:
+  explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+                                      const ColumnDescriptor* column);
+  explicit ColumnChunkMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+                                      const ColumnDescriptor* column, void* contents);
+  // PIMPL Idiom
+  class ColumnChunkMetaDataBuilderImpl;
+  std::unique_ptr<ColumnChunkMetaDataBuilderImpl> impl_;
+};
+
+class PARQUET_EXPORT RowGroupMetaDataBuilder {
+ public:
+  // API convenience to get a MetaData reader
+  static std::unique_ptr<RowGroupMetaDataBuilder> Make(
+      std::shared_ptr<WriterProperties> props, const SchemaDescriptor* schema_,
+      void* contents);
+
+  ~RowGroupMetaDataBuilder();
+
+  ColumnChunkMetaDataBuilder* NextColumnChunk();
+  int num_columns();
+  int64_t num_rows();
+  int current_column() const;
+
+  void set_num_rows(int64_t num_rows);
+
+  // commit the metadata
+  void Finish(int64_t total_bytes_written, int16_t row_group_ordinal = -1);
+
+ private:
+  explicit RowGroupMetaDataBuilder(std::shared_ptr<WriterProperties> props,
+                                   const SchemaDescriptor* schema_, void* contents);
+  // PIMPL Idiom
+  class RowGroupMetaDataBuilderImpl;
+  std::unique_ptr<RowGroupMetaDataBuilderImpl> impl_;
+};
+
+/// \brief Public struct for location to all page indexes in a parquet file.
+struct PageIndexLocation {
+  /// Alias type of page index location of a row group. The index location
+  /// is located by column ordinal. If the column does not have the page index,
+  /// its value is set to std::nullopt.
+  using RowGroupIndexLocation = std::vector<std::optional<IndexLocation>>;
+  /// Alias type of page index location of a parquet file. The index location
+  /// is located by the row group ordinal.
+  using FileIndexLocation = std::map<size_t, RowGroupIndexLocation>;
+  /// Row group column index locations which uses row group ordinal as the key.
+  FileIndexLocation column_index_location;
+  /// Row group offset index locations which uses row group ordinal as the key.
+  FileIndexLocation offset_index_location;
+};
+
+class PARQUET_EXPORT FileMetaDataBuilder {
+ public:
+  // API convenience to get a MetaData builder
+  static std::unique_ptr<FileMetaDataBuilder> Make(
+      const SchemaDescriptor* schema, std::shared_ptr<WriterProperties> props);
+
+  ~FileMetaDataBuilder();
+
+  // The prior RowGroupMetaDataBuilder (if any) is destroyed
+  RowGroupMetaDataBuilder* AppendRowGroup();
+
+  // Update location to all page indexes in the parquet file
+  void SetPageIndexLocation(const PageIndexLocation& location);
+
+  // Complete the Thrift structure
+  std::unique_ptr<FileMetaData> Finish(
+      const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = NULLPTR);
+
+  // crypto metadata
+  std::unique_ptr<FileCryptoMetaData> GetCryptoMetaData();
+
+ private:
+  explicit FileMetaDataBuilder(const SchemaDescriptor* schema,
+                               std::shared_ptr<WriterProperties> props);
+  // PIMPL Idiom
+  class FileMetaDataBuilderImpl;
+  std::unique_ptr<FileMetaDataBuilderImpl> impl_;
+};
+
+PARQUET_EXPORT std::string ParquetVersionToString(ParquetVersion::type ver);
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/page_index.h b/pyarrow/include/parquet/page_index.h
new file mode 100644
index 0000000000000000000000000000000000000000..3083159783ba793763f221c4b01c91ff6efef304
--- /dev/null
+++ b/pyarrow/include/parquet/page_index.h
@@ -0,0 +1,386 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/io/interfaces.h"
+#include "parquet/encryption/type_fwd.h"
+#include "parquet/type_fwd.h"
+#include "parquet/types.h"
+
+#include <optional>
+#include <vector>
+
+namespace parquet {
+
+/// \brief ColumnIndex is a proxy around format::ColumnIndex.
+class PARQUET_EXPORT ColumnIndex {
+ public:
+  /// \brief Create a ColumnIndex from a serialized thrift message.
+  static std::unique_ptr<ColumnIndex> Make(const ColumnDescriptor& descr,
+                                           const void* serialized_index,
+                                           uint32_t index_len,
+                                           const ReaderProperties& properties,
+                                           Decryptor* decryptor = NULLPTR);
+
+  virtual ~ColumnIndex() = default;
+
+  /// \brief A bitmap with a bit set for each data page that has only null values.
+  ///
+  /// The length of this vector is equal to the number of data pages in the column.
+  virtual const std::vector<bool>& null_pages() const = 0;
+
+  /// \brief A vector of encoded lower bounds for each data page in this column.
+  ///
+  /// `null_pages` should be inspected first, as only pages with non-null values
+  /// may have their lower bounds populated.
+  virtual const std::vector<std::string>& encoded_min_values() const = 0;
+
+  /// \brief A vector of encoded upper bounds for each data page in this column.
+  ///
+  /// `null_pages` should be inspected first, as only pages with non-null values
+  /// may have their upper bounds populated.
+  virtual const std::vector<std::string>& encoded_max_values() const = 0;
+
+  /// \brief The ordering of lower and upper bounds.
+  ///
+  /// The boundary order applies across all lower bounds, and all upper bounds,
+  /// respectively. However, the order between lower bounds and upper bounds
+  /// cannot be derived from this.
+  virtual BoundaryOrder::type boundary_order() const = 0;
+
+  /// \brief Whether per-page null count information is available.
+  virtual bool has_null_counts() const = 0;
+
+  /// \brief An optional vector with the number of null values in each data page.
+  ///
+  /// `has_null_counts` should be called first to determine if this information is
+  /// available.
+  virtual const std::vector<int64_t>& null_counts() const = 0;
+
+  /// \brief A vector of page indices for non-null pages.
+  virtual const std::vector<int32_t>& non_null_page_indices() const = 0;
+
+  /// \brief Whether definition level histogram is available.
+  virtual bool has_definition_level_histograms() const = 0;
+
+  /// \brief Whether repetition level histogram is available.
+  virtual bool has_repetition_level_histograms() const = 0;
+
+  /// \brief List of definition level histograms for each page concatenated together.
+  virtual const std::vector<int64_t>& definition_level_histograms() const = 0;
+
+  /// \brief List of repetition level histograms for each page concatenated together.
+  virtual const std::vector<int64_t>& repetition_level_histograms() const = 0;
+};
+
+/// \brief Typed implementation of ColumnIndex.
+template <typename DType>
+class PARQUET_EXPORT TypedColumnIndex : public ColumnIndex {
+ public:
+  using T = typename DType::c_type;
+
+  /// \brief A vector of lower bounds for each data page in this column.
+  ///
+  /// This is like `encoded_min_values`, but with the values decoded according to
+  /// the column's physical type.
+  /// `min_values` and `max_values` can be used together with `boundary_order`
+  /// in order to prune some data pages when searching for specific values.
+  virtual const std::vector<T>& min_values() const = 0;
+
+  /// \brief A vector of upper bounds for each data page in this column.
+  ///
+  /// Just like `min_values`, but for upper bounds instead of lower bounds.
+  virtual const std::vector<T>& max_values() const = 0;
+};
+
+using BoolColumnIndex = TypedColumnIndex<BooleanType>;
+using Int32ColumnIndex = TypedColumnIndex<Int32Type>;
+using Int64ColumnIndex = TypedColumnIndex<Int64Type>;
+using FloatColumnIndex = TypedColumnIndex<FloatType>;
+using DoubleColumnIndex = TypedColumnIndex<DoubleType>;
+using ByteArrayColumnIndex = TypedColumnIndex<ByteArrayType>;
+using FLBAColumnIndex = TypedColumnIndex<FLBAType>;
+
+/// \brief PageLocation is a proxy around format::PageLocation.
+struct PARQUET_EXPORT PageLocation {
+  /// File offset of the data page.
+  int64_t offset;
+  /// Total compressed size of the data page and header.
+  int32_t compressed_page_size;
+  /// Row id of the first row in the page within the row group.
+  int64_t first_row_index;
+};
+
+/// \brief OffsetIndex is a proxy around format::OffsetIndex.
+class PARQUET_EXPORT OffsetIndex {
+ public:
+  /// \brief Create a OffsetIndex from a serialized thrift message.
+  static std::unique_ptr<OffsetIndex> Make(const void* serialized_index,
+                                           uint32_t index_len,
+                                           const ReaderProperties& properties,
+                                           Decryptor* decryptor = NULLPTR);
+
+  virtual ~OffsetIndex() = default;
+
+  /// \brief A vector of locations for each data page in this column.
+  virtual const std::vector<PageLocation>& page_locations() const = 0;
+
+  /// \brief A vector of unencoded/uncompressed size of each page for BYTE_ARRAY types,
+  /// or empty for other types.
+  virtual const std::vector<int64_t>& unencoded_byte_array_data_bytes() const = 0;
+};
+
+/// \brief Interface for reading the page index for a Parquet row group.
+class PARQUET_EXPORT RowGroupPageIndexReader {
+ public:
+  virtual ~RowGroupPageIndexReader() = default;
+
+  /// \brief Read column index of a column chunk.
+  ///
+  /// \param[in] i column ordinal of the column chunk.
+  /// \returns column index of the column or nullptr if it does not exist.
+  /// \throws ParquetException if the index is out of bound.
+  virtual std::shared_ptr<ColumnIndex> GetColumnIndex(int32_t i) = 0;
+
+  /// \brief Read offset index of a column chunk.
+  ///
+  /// \param[in] i column ordinal of the column chunk.
+  /// \returns offset index of the column or nullptr if it does not exist.
+  /// \throws ParquetException if the index is out of bound.
+  virtual std::shared_ptr<OffsetIndex> GetOffsetIndex(int32_t i) = 0;
+};
+
+struct PageIndexSelection {
+  /// Specifies whether to read the column index.
+  bool column_index = false;
+  /// Specifies whether to read the offset index.
+  bool offset_index = false;
+};
+
+PARQUET_EXPORT
+std::ostream& operator<<(std::ostream& out, const PageIndexSelection& params);
+
+struct RowGroupIndexReadRange {
+  /// Base start and total size of column index of all column chunks in a row group.
+  /// If none of the column chunks have column index, it is set to std::nullopt.
+  std::optional<::arrow::io::ReadRange> column_index = std::nullopt;
+  /// Base start and total size of offset index of all column chunks in a row group.
+  /// If none of the column chunks have offset index, it is set to std::nullopt.
+  std::optional<::arrow::io::ReadRange> offset_index = std::nullopt;
+};
+
+/// \brief Interface for reading the page index for a Parquet file.
+class PARQUET_EXPORT PageIndexReader {
+ public:
+  virtual ~PageIndexReader() = default;
+
+  /// \brief Create a PageIndexReader instance.
+  /// \returns a PageIndexReader instance.
+  /// WARNING: The returned PageIndexReader references to all the input parameters, so
+  /// it must not outlive all of the input parameters. Usually these input parameters
+  /// come from the same ParquetFileReader object, so it must not outlive the reader
+  /// that creates this PageIndexReader.
+  static std::shared_ptr<PageIndexReader> Make(
+      ::arrow::io::RandomAccessFile* input, std::shared_ptr<FileMetaData> file_metadata,
+      const ReaderProperties& properties,
+      InternalFileDecryptor* file_decryptor = NULLPTR);
+
+  /// \brief Get the page index reader of a specific row group.
+  /// \param[in] i row group ordinal to get page index reader.
+  /// \returns RowGroupPageIndexReader of the specified row group. A nullptr may or may
+  ///          not be returned if the page index for the row group is unavailable. It is
+  ///          the caller's responsibility to check the return value of follow-up calls
+  ///          to the RowGroupPageIndexReader.
+  /// \throws ParquetException if the index is out of bound.
+  virtual std::shared_ptr<RowGroupPageIndexReader> RowGroup(int i) = 0;
+
+  /// \brief Advise the reader which part of page index will be read later.
+  ///
+  /// The PageIndexReader can optionally prefetch and cache page index that
+  /// may be read later to get better performance.
+  ///
+  /// The contract of this function is as below:
+  /// 1) If WillNeed() has not been called for a specific row group and the page index
+  ///    exists, follow-up calls to get column index or offset index of all columns in
+  ///    this row group SHOULD NOT FAIL, but the performance may not be optimal.
+  /// 2) If WillNeed() has been called for a specific row group, follow-up calls to get
+  ///    page index are limited to columns and index type requested by WillNeed().
+  ///    So it MAY FAIL if columns that are not requested by WillNeed() are requested.
+  /// 3) Later calls to WillNeed() MAY OVERRIDE previous calls of same row groups.
+  /// For example,
+  /// 1) If WillNeed() is not called for row group 0, then follow-up calls to read
+  ///    column index and/or offset index of all columns of row group 0 should not
+  ///    fail if its page index exists.
+  /// 2) If WillNeed() is called for columns 0 and 1 for row group 0, then follow-up
+  ///    call to read page index of column 2 for row group 0 MAY FAIL even if its
+  ///    page index exists.
+  /// 3) If WillNeed() is called for row group 0 with offset index only, then
+  ///    follow-up call to read column index of row group 0 MAY FAIL even if
+  ///    the column index of this column exists.
+  /// 4) If WillNeed() is called for columns 0 and 1 for row group 0, then later
+  ///    call to WillNeed() for columns 1 and 2 for row group 0. The later one
+  ///    overrides previous call and only columns 1 and 2 of row group 0 are allowed
+  ///    to access.
+  ///
+  /// \param[in] row_group_indices list of row group ordinal to read page index later.
+  /// \param[in] column_indices list of column ordinal to read page index later. If it is
+  ///            empty, it means all columns in the row group will be read.
+  /// \param[in] selection which kind of page index is required later.
+  virtual void WillNeed(const std::vector<int32_t>& row_group_indices,
+                        const std::vector<int32_t>& column_indices,
+                        const PageIndexSelection& selection) = 0;
+
+  /// \brief Advise the reader page index of these row groups will not be read anymore.
+  ///
+  /// The PageIndexReader implementation has the opportunity to cancel any prefetch or
+  /// release resource that are related to these row groups.
+  ///
+  /// \param[in] row_group_indices list of row group ordinal that whose page index will
+  /// not be accessed anymore.
+  virtual void WillNotNeed(const std::vector<int32_t>& row_group_indices) = 0;
+
+  /// \brief Determine the column index and offset index ranges for the given row group.
+  ///
+  /// \param[in] row_group_metadata row group metadata to get column chunk metadata.
+  /// \param[in] columns list of column ordinals to get page index. If the list is empty,
+  ///            it means all columns in the row group.
+  /// \returns RowGroupIndexReadRange of the specified row group. Throws ParquetException
+  ///          if the selected column ordinal is out of bound or metadata of page index
+  ///          is corrupted.
+  static RowGroupIndexReadRange DeterminePageIndexRangesInRowGroup(
+      const RowGroupMetaData& row_group_metadata, const std::vector<int32_t>& columns);
+};
+
+/// \brief Interface for collecting column index of data pages in a column chunk.
+class PARQUET_EXPORT ColumnIndexBuilder {
+ public:
+  /// \brief API convenience to create a ColumnIndexBuilder.
+  static std::unique_ptr<ColumnIndexBuilder> Make(const ColumnDescriptor* descr);
+
+  virtual ~ColumnIndexBuilder() = default;
+
+  /// \brief Add statistics of a data page.
+  ///
+  /// If the ColumnIndexBuilder has seen any corrupted statistics, it will
+  /// not update statistics anymore.
+  ///
+  /// \param stats Page statistics in the encoded form.
+  /// \param size_stats Size statistics of the page if available.
+  virtual void AddPage(const EncodedStatistics& stats,
+                       const SizeStatistics& size_stats) = 0;
+
+  /// \brief Complete the column index.
+  ///
+  /// Once called, AddPage() can no longer be called.
+  /// WriteTo() and Build() can only called after Finish() has been called.
+  virtual void Finish() = 0;
+
+  /// \brief Serialize the column index thrift message.
+  ///
+  /// If the ColumnIndexBuilder has seen any corrupted statistics, it will
+  /// not write any data to the sink.
+  ///
+  /// \param[out] sink output stream to write the serialized message.
+  /// \param[in] encryptor encryptor to encrypt the serialized column index.
+  virtual void WriteTo(::arrow::io::OutputStream* sink,
+                       Encryptor* encryptor = NULLPTR) const = 0;
+
+  /// \brief Create a ColumnIndex directly.
+  ///
+  /// \return If the ColumnIndexBuilder has seen any corrupted statistics, it simply
+  /// returns nullptr. Otherwise the column index is built and returned.
+  virtual std::unique_ptr<ColumnIndex> Build() const = 0;
+};
+
+/// \brief Interface for collecting offset index of data pages in a column chunk.
+class PARQUET_EXPORT OffsetIndexBuilder {
+ public:
+  /// \brief API convenience to create a OffsetIndexBuilder.
+  static std::unique_ptr<OffsetIndexBuilder> Make();
+
+  virtual ~OffsetIndexBuilder() = default;
+
+  /// \brief Add page location and size stats of a data page.
+  virtual void AddPage(int64_t offset, int32_t compressed_page_size,
+                       int64_t first_row_index,
+                       std::optional<int64_t> unencoded_byte_array_length = {}) = 0;
+
+  /// \brief Add page location and size stats of a data page.
+  void AddPage(const PageLocation& page_location, const SizeStatistics& size_stats);
+
+  /// \brief Complete the offset index.
+  ///
+  /// In the buffered row group mode, data pages are flushed into memory
+  /// sink and the OffsetIndexBuilder has only collected the relative offset
+  /// which requires adjustment once they are flushed to the file.
+  ///
+  /// \param final_position Final stream offset to add for page offset adjustment.
+  virtual void Finish(int64_t final_position) = 0;
+
+  /// \brief Serialize the offset index thrift message.
+  ///
+  /// \param[out] sink output stream to write the serialized message.
+  /// \param[in] encryptor encryptor to encrypt the serialized offset index.
+  virtual void WriteTo(::arrow::io::OutputStream* sink,
+                       Encryptor* encryptor = NULLPTR) const = 0;
+
+  /// \brief Create an OffsetIndex directly.
+  virtual std::unique_ptr<OffsetIndex> Build() const = 0;
+};
+
+/// \brief Interface for collecting page index of a parquet file.
+class PARQUET_EXPORT PageIndexBuilder {
+ public:
+  /// \brief API convenience to create a PageIndexBuilder.
+  static std::unique_ptr<PageIndexBuilder> Make(
+      const SchemaDescriptor* schema, InternalFileEncryptor* file_encryptor = NULLPTR);
+
+  virtual ~PageIndexBuilder() = default;
+
+  /// \brief Start a new row group.
+  virtual void AppendRowGroup() = 0;
+
+  /// \brief Get the ColumnIndexBuilder from column ordinal.
+  ///
+  /// \param i Column ordinal.
+  /// \return ColumnIndexBuilder for the column and its memory ownership belongs to
+  /// the PageIndexBuilder.
+  virtual ColumnIndexBuilder* GetColumnIndexBuilder(int32_t i) = 0;
+
+  /// \brief Get the OffsetIndexBuilder from column ordinal.
+  ///
+  /// \param i Column ordinal.
+  /// \return OffsetIndexBuilder for the column and its memory ownership belongs to
+  /// the PageIndexBuilder.
+  virtual OffsetIndexBuilder* GetOffsetIndexBuilder(int32_t i) = 0;
+
+  /// \brief Complete the page index builder and no more write is allowed.
+  virtual void Finish() = 0;
+
+  /// \brief Serialize the page index thrift message.
+  ///
+  /// Only valid column indexes and offset indexes are serialized and their locations
+  /// are set.
+  ///
+  /// \param[out] sink The output stream to write the page index.
+  /// \param[out] location The location of all page index to the start of sink.
+  virtual void WriteTo(::arrow::io::OutputStream* sink,
+                       PageIndexLocation* location) const = 0;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/parquet_version.h b/pyarrow/include/parquet/parquet_version.h
new file mode 100644
index 0000000000000000000000000000000000000000..4560653c1d90b212a144c1031da9a33bad29edc2
--- /dev/null
+++ b/pyarrow/include/parquet/parquet_version.h
@@ -0,0 +1,31 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef PARQUET_VERSION_H
+#define PARQUET_VERSION_H
+
+#define PARQUET_VERSION_MAJOR 23
+#define PARQUET_VERSION_MINOR 0
+#define PARQUET_VERSION_PATCH 1
+
+#define PARQUET_SO_VERSION "2300"
+#define PARQUET_FULL_SO_VERSION "2300.1.0"
+
+// define the parquet created by version
+#define CREATED_BY_VERSION "parquet-cpp-arrow version 23.0.1"
+
+#endif  // PARQUET_VERSION_H
diff --git a/pyarrow/include/parquet/platform.h b/pyarrow/include/parquet/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..92849347d4e9dafc6d8b81300626fb5dbefe411c
--- /dev/null
+++ b/pyarrow/include/parquet/platform.h
@@ -0,0 +1,116 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/buffer.h"         // IWYU pragma: export
+#include "arrow/io/interfaces.h"  // IWYU pragma: export
+#include "arrow/status.h"         // IWYU pragma: export
+#include "arrow/type_fwd.h"       // IWYU pragma: export
+#include "arrow/util/macros.h"    // IWYU pragma: export
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+
+#  ifdef _MSC_VER
+#    pragma warning(push)
+// Disable warning for STL types usage in DLL interface
+// https://web.archive.org/web/20130317015847/http://connect.microsoft.com/VisualStudio/feedback/details/696593/vc-10-vs-2010-basic-string-exports
+#    pragma warning(disable : 4275 4251)
+// Disable diamond inheritance warnings
+#    pragma warning(disable : 4250)
+// Disable macro redefinition warnings
+#    pragma warning(disable : 4005)
+// Disable extern before exported template warnings
+#    pragma warning(disable : 4910)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef PARQUET_STATIC
+#    define PARQUET_EXPORT
+#  elif defined(PARQUET_EXPORTING)
+#    define PARQUET_EXPORT __declspec(dllexport)
+#  else
+#    define PARQUET_EXPORT __declspec(dllimport)
+#  endif
+
+#  define PARQUET_NO_EXPORT
+
+#else  // Not Windows
+#  ifndef PARQUET_EXPORT
+#    define PARQUET_EXPORT __attribute__((visibility("default")))
+#  endif
+#  ifndef PARQUET_NO_EXPORT
+#    define PARQUET_NO_EXPORT __attribute__((visibility("hidden")))
+#  endif
+#endif  // Non-Windows
+
+// This is a complicated topic, some reading on it:
+// http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/
+#if defined(_MSC_VER) || defined(__clang__)
+#  define PARQUET_TEMPLATE_CLASS_EXPORT
+#  define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT
+#else
+#  define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT
+#  define PARQUET_TEMPLATE_EXPORT
+#endif
+
+#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN
+
+#define PARQUET_NORETURN ARROW_NORETURN
+#define PARQUET_DEPRECATED ARROW_DEPRECATED
+
+// If ARROW_VALGRIND set when compiling unit tests, also define
+// PARQUET_VALGRIND
+#ifdef ARROW_VALGRIND
+#  define PARQUET_VALGRIND
+#endif
+
+namespace parquet {
+
+using Buffer = ::arrow::Buffer;
+using Codec = ::arrow::util::Codec;
+using CodecOptions = ::arrow::util::CodecOptions;
+using Compression = ::arrow::Compression;
+using MemoryPool = ::arrow::MemoryPool;
+using MutableBuffer = ::arrow::MutableBuffer;
+using ResizableBuffer = ::arrow::ResizableBuffer;
+using ResizableBuffer = ::arrow::ResizableBuffer;
+using ArrowInputFile = ::arrow::io::RandomAccessFile;
+using ArrowInputStream = ::arrow::io::InputStream;
+using ArrowOutputStream = ::arrow::io::OutputStream;
+
+constexpr int64_t kDefaultOutputStreamSize = 1024;
+
+constexpr int16_t kNonPageOrdinal = static_cast<int16_t>(-1);
+
+PARQUET_EXPORT
+std::shared_ptr<::arrow::io::BufferOutputStream> CreateOutputStream(
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+PARQUET_EXPORT
+std::shared_ptr<ResizableBuffer> AllocateBuffer(
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool(), int64_t size = 0);
+
+}  // namespace parquet
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
diff --git a/pyarrow/include/parquet/printer.h b/pyarrow/include/parquet/printer.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb86b107f9f9ba048b07a85638df383db025b390
--- /dev/null
+++ b/pyarrow/include/parquet/printer.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <iosfwd>
+#include <list>
+
+#include "parquet/platform.h"
+
+namespace parquet {
+
+class ParquetFileReader;
+
+class PARQUET_EXPORT ParquetFilePrinter {
+ private:
+  ParquetFileReader* fileReader;
+
+ public:
+  explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
+  ~ParquetFilePrinter() = default;
+
+  void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
+                  bool print_values = false, bool format_dump = false,
+                  bool print_key_value_metadata = false,
+                  const char* filename = "No Name");
+
+  void JSONPrint(std::ostream& stream, std::list<int> selected_columns,
+                 const char* filename = "No Name");
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/properties.h b/pyarrow/include/parquet/properties.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb5aee29695ea010df67dff67ea393a01ab86419
--- /dev/null
+++ b/pyarrow/include/parquet/properties.h
@@ -0,0 +1,1429 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "arrow/io/caching.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/compression.h"
+#include "arrow/util/type_fwd.h"
+#include "parquet/encryption/encryption.h"
+#include "parquet/exception.h"
+#include "parquet/parquet_version.h"
+#include "parquet/platform.h"
+#include "parquet/schema.h"
+#include "parquet/type_fwd.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+/// Controls serialization format of data pages.  parquet-format v2.0.0
+/// introduced a new data page metadata type DataPageV2 and serialized page
+/// structure (for example, encoded levels are no longer compressed). Prior to
+/// the completion of PARQUET-457 in 2020, this library did not implement
+/// DataPageV2 correctly, so if you use the V2 data page format, you may have
+/// forward compatibility issues (older versions of the library will be unable
+/// to read the files). Note that some Parquet implementations do not implement
+/// DataPageV2 at all.
+enum class ParquetDataPageVersion { V1, V2 };
+
+/// Controls the level of size statistics that are written to the file.
+enum class SizeStatisticsLevel : uint8_t {
+  // No size statistics are written.
+  None = 0,
+  // Only column chunk size statistics are written.
+  ColumnChunk,
+  // Both size statistics in the column chunk and page index are written.
+  PageAndColumnChunk
+};
+
+/// Align the default buffer size to a small multiple of a page size.
+constexpr int64_t kDefaultBufferSize = 4096 * 4;
+
+constexpr int32_t kDefaultThriftStringSizeLimit = 100 * 1000 * 1000;
+// Structs in the thrift definition are relatively large (at least 300 bytes).
+// This limits total memory to the same order of magnitude as
+// kDefaultStringSizeLimit.
+constexpr int32_t kDefaultThriftContainerSizeLimit = 1000 * 1000;
+
+// PARQUET-978: Minimize footer reads by reading 64 KB from the end of the file
+constexpr int64_t kDefaultFooterReadSize = 64 * 1024;
+
+class PARQUET_EXPORT ReaderProperties {
+ public:
+  explicit ReaderProperties(MemoryPool* pool = ::arrow::default_memory_pool())
+      : pool_(pool) {}
+
+  MemoryPool* memory_pool() const { return pool_; }
+
+  std::shared_ptr<ArrowInputStream> GetStream(std::shared_ptr<ArrowInputFile> source,
+                                              int64_t start, int64_t num_bytes);
+
+  /// Buffered stream reading allows the user to control the memory usage of
+  /// parquet readers. This ensure that all `RandomAccessFile::ReadAt` calls are
+  /// wrapped in a buffered reader that uses a fix sized buffer (of size
+  /// `buffer_size()`) instead of the full size of the ReadAt.
+  ///
+  /// The primary reason for this control knobs is for resource control and not
+  /// performance.
+  bool is_buffered_stream_enabled() const { return buffered_stream_enabled_; }
+  /// Enable buffered stream reading.
+  void enable_buffered_stream() { buffered_stream_enabled_ = true; }
+  /// Disable buffered stream reading.
+  void disable_buffered_stream() { buffered_stream_enabled_ = false; }
+
+  bool read_dense_for_nullable() const { return read_dense_for_nullable_; }
+  void enable_read_dense_for_nullable() { read_dense_for_nullable_ = true; }
+  void disable_read_dense_for_nullable() { read_dense_for_nullable_ = false; }
+
+  /// Return the size of the buffered stream buffer.
+  int64_t buffer_size() const { return buffer_size_; }
+  /// Set the size of the buffered stream buffer in bytes.
+  void set_buffer_size(int64_t size) { buffer_size_ = size; }
+
+  /// \brief Return the size limit on thrift strings.
+  ///
+  /// This limit helps prevent space and time bombs in files, but may need to
+  /// be increased in order to read files with especially large headers.
+  int32_t thrift_string_size_limit() const { return thrift_string_size_limit_; }
+  /// Set the size limit on thrift strings.
+  void set_thrift_string_size_limit(int32_t size) { thrift_string_size_limit_ = size; }
+
+  /// \brief Return the size limit on thrift containers.
+  ///
+  /// This limit helps prevent space and time bombs in files, but may need to
+  /// be increased in order to read files with especially large headers.
+  int32_t thrift_container_size_limit() const { return thrift_container_size_limit_; }
+  /// Set the size limit on thrift containers.
+  void set_thrift_container_size_limit(int32_t size) {
+    thrift_container_size_limit_ = size;
+  }
+
+  /// Set the decryption properties.
+  void file_decryption_properties(std::shared_ptr<FileDecryptionProperties> decryption) {
+    file_decryption_properties_ = std::move(decryption);
+  }
+  /// Return the decryption properties.
+  const std::shared_ptr<FileDecryptionProperties>& file_decryption_properties() const {
+    return file_decryption_properties_;
+  }
+
+  bool page_checksum_verification() const { return page_checksum_verification_; }
+  void set_page_checksum_verification(bool check_crc) {
+    page_checksum_verification_ = check_crc;
+  }
+
+  // Set the default read size to read the footer from a file. For high latency
+  // file systems and files with large metadata (>64KB) this can increase performance
+  // by reducing the number of round-trips to retrieve the entire file metadata.
+  void set_footer_read_size(size_t size) { footer_read_size_ = size; }
+  size_t footer_read_size() const { return footer_read_size_; }
+
+ private:
+  MemoryPool* pool_;
+  int64_t buffer_size_ = kDefaultBufferSize;
+  int32_t thrift_string_size_limit_ = kDefaultThriftStringSizeLimit;
+  int32_t thrift_container_size_limit_ = kDefaultThriftContainerSizeLimit;
+  bool buffered_stream_enabled_ = false;
+  bool page_checksum_verification_ = false;
+  // Used with a RecordReader.
+  bool read_dense_for_nullable_ = false;
+  size_t footer_read_size_ = kDefaultFooterReadSize;
+  std::shared_ptr<FileDecryptionProperties> file_decryption_properties_;
+};
+
+ReaderProperties PARQUET_EXPORT default_reader_properties();
+
+static constexpr int64_t kDefaultDataPageSize = 1024 * 1024;
+static constexpr int64_t kDefaultMaxRowsPerPage = 20'000;
+static constexpr bool DEFAULT_IS_DICTIONARY_ENABLED = true;
+static constexpr int64_t DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT = kDefaultDataPageSize;
+static constexpr int64_t DEFAULT_WRITE_BATCH_SIZE = 1024;
+static constexpr int64_t DEFAULT_MAX_ROW_GROUP_LENGTH = 1024 * 1024;
+static constexpr bool DEFAULT_ARE_STATISTICS_ENABLED = true;
+static constexpr int64_t DEFAULT_MAX_STATISTICS_SIZE = 4096;
+static constexpr Encoding::type DEFAULT_ENCODING = Encoding::UNKNOWN;
+static const char DEFAULT_CREATED_BY[] = CREATED_BY_VERSION;
+static constexpr Compression::type DEFAULT_COMPRESSION_TYPE = Compression::UNCOMPRESSED;
+static constexpr bool DEFAULT_IS_PAGE_INDEX_ENABLED = true;
+static constexpr SizeStatisticsLevel DEFAULT_SIZE_STATISTICS_LEVEL =
+    SizeStatisticsLevel::PageAndColumnChunk;
+
+class PARQUET_EXPORT ColumnProperties {
+ public:
+  ColumnProperties(Encoding::type encoding = DEFAULT_ENCODING,
+                   Compression::type codec = DEFAULT_COMPRESSION_TYPE,
+                   bool dictionary_enabled = DEFAULT_IS_DICTIONARY_ENABLED,
+                   bool statistics_enabled = DEFAULT_ARE_STATISTICS_ENABLED,
+                   size_t max_stats_size = DEFAULT_MAX_STATISTICS_SIZE,
+                   bool page_index_enabled = DEFAULT_IS_PAGE_INDEX_ENABLED)
+      : encoding_(encoding),
+        codec_(codec),
+        dictionary_enabled_(dictionary_enabled),
+        statistics_enabled_(statistics_enabled),
+        max_stats_size_(max_stats_size),
+        page_index_enabled_(page_index_enabled) {}
+
+  void set_encoding(Encoding::type encoding) { encoding_ = encoding; }
+
+  void set_compression(Compression::type codec) { codec_ = codec; }
+
+  void set_dictionary_enabled(bool dictionary_enabled) {
+    dictionary_enabled_ = dictionary_enabled;
+  }
+
+  void set_statistics_enabled(bool statistics_enabled) {
+    statistics_enabled_ = statistics_enabled;
+  }
+
+  void set_max_statistics_size(size_t max_stats_size) {
+    max_stats_size_ = max_stats_size;
+  }
+
+  void set_compression_level(int compression_level) {
+    if (!codec_options_) {
+      codec_options_ = std::make_shared<CodecOptions>();
+    }
+    codec_options_->compression_level = compression_level;
+  }
+
+  void set_codec_options(const std::shared_ptr<CodecOptions>& codec_options) {
+    codec_options_ = codec_options;
+  }
+
+  void set_page_index_enabled(bool page_index_enabled) {
+    page_index_enabled_ = page_index_enabled;
+  }
+
+  Encoding::type encoding() const { return encoding_; }
+
+  Compression::type compression() const { return codec_; }
+
+  bool dictionary_enabled() const { return dictionary_enabled_; }
+
+  bool statistics_enabled() const { return statistics_enabled_; }
+
+  size_t max_statistics_size() const { return max_stats_size_; }
+
+  int compression_level() const {
+    if (!codec_options_) {
+      return ::arrow::util::kUseDefaultCompressionLevel;
+    }
+    return codec_options_->compression_level;
+  }
+
+  const std::shared_ptr<CodecOptions>& codec_options() const { return codec_options_; }
+
+  bool page_index_enabled() const { return page_index_enabled_; }
+
+ private:
+  Encoding::type encoding_;
+  Compression::type codec_;
+  bool dictionary_enabled_;
+  bool statistics_enabled_;
+  size_t max_stats_size_;
+  std::shared_ptr<CodecOptions> codec_options_;
+  bool page_index_enabled_;
+};
+
+// EXPERIMENTAL: Options for content-defined chunking.
+///
+/// Content-defined chunking is an experimental feature that optimizes parquet
+/// files for content addressable storage (CAS) systems by writing data pages
+/// according to content-defined chunk boundaries. This allows for more
+/// efficient deduplication of data across files, hence more efficient network
+/// transfers and storage.
+/// Each content-defined chunk is written as a separate parquet data page. The
+/// following options control the chunks' size and the chunking process. Note
+/// that the chunk size is calculated based on the logical value of the data,
+/// before any encoding or compression is applied.
+struct PARQUET_EXPORT CdcOptions {
+  /// Minimum chunk size in bytes, default is 256 KiB
+  /// The rolling hash will not be updated until this size is reached for each chunk.
+  /// Note that all data sent through the hash function is counted towards the chunk
+  /// size, including definition and repetition levels if present.
+  int64_t min_chunk_size = 256 * 1024;
+  /// Maximum chunk size in bytes, default is 1024 KiB
+  /// The chunker will create a new chunk whenever the chunk size exceeds this value.
+  /// Note that the parquet writer has a related `pagesize` property that controls
+  /// the maximum size of a parquet data page after encoding. While setting
+  /// `pagesize` to a smaller value than `max_chunk_size` doesn't affect the
+  /// chunking effectiveness, it results in more small parquet data pages.
+  int64_t max_chunk_size = 1024 * 1024;
+  /// Number of bit adjustment to the gearhash mask in order to center the chunk size
+  /// around the average size more aggressively, default is 0
+  /// Increasing the normalization level increases the probability of finding a chunk,
+  /// improving the deduplication ratio, but also increasing the number of small chunks
+  /// resulting in many small parquet data pages. The default value provides a good
+  /// balance between deduplication ratio and fragmentation.
+  /// Use norm_level=1 or norm_level=2 to reach a higher deduplication ratio at the
+  /// expense of fragmentation. Negative values can also be used to reduce the
+  /// probability of finding a chunk, resulting in larger chunks and fewer data pages.
+  /// Note that values outside [-3, 3] are not recommended, prefer using the default
+  /// value of 0 for most use cases.
+  int norm_level = 0;
+};
+
+class PARQUET_EXPORT WriterProperties {
+ public:
+  class PARQUET_EXPORT Builder {
+   public:
+    Builder()
+        : pool_(::arrow::default_memory_pool()),
+          dictionary_pagesize_limit_(DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT),
+          write_batch_size_(DEFAULT_WRITE_BATCH_SIZE),
+          max_row_group_length_(DEFAULT_MAX_ROW_GROUP_LENGTH),
+          pagesize_(kDefaultDataPageSize),
+          max_rows_per_page_(kDefaultMaxRowsPerPage),
+          version_(ParquetVersion::PARQUET_2_6),
+          data_page_version_(ParquetDataPageVersion::V1),
+          created_by_(DEFAULT_CREATED_BY),
+          store_decimal_as_integer_(false),
+          page_checksum_enabled_(false),
+          size_statistics_level_(DEFAULT_SIZE_STATISTICS_LEVEL),
+          content_defined_chunking_enabled_(false),
+          content_defined_chunking_options_({}) {}
+
+    explicit Builder(const WriterProperties& properties)
+        : pool_(properties.memory_pool()),
+          dictionary_pagesize_limit_(properties.dictionary_pagesize_limit()),
+          write_batch_size_(properties.write_batch_size()),
+          max_row_group_length_(properties.max_row_group_length()),
+          pagesize_(properties.data_pagesize()),
+          max_rows_per_page_(properties.max_rows_per_page()),
+          version_(properties.version()),
+          data_page_version_(properties.data_page_version()),
+          created_by_(properties.created_by()),
+          store_decimal_as_integer_(properties.store_decimal_as_integer()),
+          page_checksum_enabled_(properties.page_checksum_enabled()),
+          size_statistics_level_(properties.size_statistics_level()),
+          sorting_columns_(properties.sorting_columns()),
+          default_column_properties_(properties.default_column_properties()),
+          content_defined_chunking_enabled_(
+              properties.content_defined_chunking_enabled()),
+          content_defined_chunking_options_(
+              properties.content_defined_chunking_options()) {
+      CopyColumnSpecificProperties(properties);
+    }
+
+    /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
+    ///
+    /// Optimize parquet files for content addressable storage (CAS) systems by writing
+    /// data pages according to content-defined chunk boundaries. This allows for more
+    /// efficient deduplication of data across files, hence more efficient network
+    /// transfers and storage. The chunking is based on a rolling hash algorithm that
+    /// identifies chunk boundaries based on the actual content of the data.
+    ///
+    /// Note that only the WriteArrow() interface is supported at the moment.
+    Builder* enable_content_defined_chunking() {
+      content_defined_chunking_enabled_ = true;
+      return this;
+    }
+
+    /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns.
+    Builder* disable_content_defined_chunking() {
+      content_defined_chunking_enabled_ = false;
+      return this;
+    }
+
+    /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions.
+    Builder* content_defined_chunking_options(const CdcOptions& options) {
+      content_defined_chunking_options_ = options;
+      return this;
+    }
+
+    /// Specify the memory pool for the writer. Default default_memory_pool.
+    Builder* memory_pool(MemoryPool* pool) {
+      pool_ = pool;
+      return this;
+    }
+
+    /// Enable dictionary encoding in general for all columns. Default
+    /// enabled.
+    Builder* enable_dictionary() {
+      default_column_properties_.set_dictionary_enabled(true);
+      return this;
+    }
+
+    /// Disable dictionary encoding in general for all columns. Default
+    /// enabled.
+    Builder* disable_dictionary() {
+      default_column_properties_.set_dictionary_enabled(false);
+      return this;
+    }
+
+    /// Enable dictionary encoding for column specified by `path`. Default
+    /// enabled.
+    Builder* enable_dictionary(const std::string& path) {
+      dictionary_enabled_[path] = true;
+      return this;
+    }
+
+    /// Enable dictionary encoding for column specified by `path`. Default
+    /// enabled.
+    Builder* enable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+      return this->enable_dictionary(path->ToDotString());
+    }
+
+    /// Disable dictionary encoding for column specified by `path`. Default
+    /// enabled.
+    Builder* disable_dictionary(const std::string& path) {
+      dictionary_enabled_[path] = false;
+      return this;
+    }
+
+    /// Disable dictionary encoding for column specified by `path`. Default
+    /// enabled.
+    Builder* disable_dictionary(const std::shared_ptr<schema::ColumnPath>& path) {
+      return this->disable_dictionary(path->ToDotString());
+    }
+
+    /// Specify the dictionary page size limit per row group. Default 1MB.
+    Builder* dictionary_pagesize_limit(int64_t dictionary_psize_limit) {
+      dictionary_pagesize_limit_ = dictionary_psize_limit;
+      return this;
+    }
+
+    /// Specify the write batch size while writing batches of Arrow values
+    /// into Parquet. Default 1024.
+    Builder* write_batch_size(int64_t write_batch_size) {
+      write_batch_size_ = write_batch_size;
+      return this;
+    }
+
+    /// Specify the max number of rows to put in a single row group.
+    /// Default 1Mi rows.
+    Builder* max_row_group_length(int64_t max_row_group_length) {
+      max_row_group_length_ = max_row_group_length;
+      return this;
+    }
+
+    /// Specify the data page size.
+    /// Default 1MB.
+    Builder* data_pagesize(int64_t pg_size) {
+      pagesize_ = pg_size;
+      return this;
+    }
+
+    /// Specify the maximum number of rows per data page.
+    /// Default 20K rows.
+    Builder* max_rows_per_page(int64_t max_rows) {
+      max_rows_per_page_ = max_rows;
+      return this;
+    }
+
+    /// Specify the data page version.
+    /// Default V1.
+    Builder* data_page_version(ParquetDataPageVersion data_page_version) {
+      data_page_version_ = data_page_version;
+      return this;
+    }
+
+    /// Specify the Parquet file version.
+    /// Default PARQUET_2_6.
+    Builder* version(ParquetVersion::type version) {
+      version_ = version;
+      return this;
+    }
+
+    Builder* created_by(const std::string& created_by) {
+      created_by_ = created_by;
+      return this;
+    }
+
+    Builder* enable_page_checksum() {
+      page_checksum_enabled_ = true;
+      return this;
+    }
+
+    Builder* disable_page_checksum() {
+      page_checksum_enabled_ = false;
+      return this;
+    }
+
+    /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
+    //
+    /// This is only applied if dictionary encoding is disabled. If the dictionary grows
+    /// too large we always fall back to the PLAIN encoding.
+    Builder* encoding(Encoding::type encoding_type) {
+      if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+          encoding_type == Encoding::RLE_DICTIONARY) {
+        throw ParquetException("Can't use dictionary encoding as fallback encoding");
+      }
+
+      default_column_properties_.set_encoding(encoding_type);
+      return this;
+    }
+
+    /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
+    //
+    /// This is only applied if dictionary encoding is disabled. If the dictionary grows
+    /// too large we always fall back to the PLAIN encoding.
+    Builder* encoding(const std::string& path, Encoding::type encoding_type) {
+      if (encoding_type == Encoding::PLAIN_DICTIONARY ||
+          encoding_type == Encoding::RLE_DICTIONARY) {
+        throw ParquetException("Can't use dictionary encoding as fallback encoding");
+      }
+
+      encodings_[path] = encoding_type;
+      return this;
+    }
+
+    /// \brief Define the encoding that is used when we don't utilise dictionary encoding.
+    //
+    /// This is only applied if dictionary encoding is disabled. If the dictionary grows
+    /// too large we always fall back to the PLAIN encoding.
+    Builder* encoding(const std::shared_ptr<schema::ColumnPath>& path,
+                      Encoding::type encoding_type) {
+      return this->encoding(path->ToDotString(), encoding_type);
+    }
+
+    /// Specify compression codec in general for all columns.
+    /// Default UNCOMPRESSED.
+    Builder* compression(Compression::type codec) {
+      default_column_properties_.set_compression(codec);
+      return this;
+    }
+
+    /// Specify max statistics size to store min max value.
+    /// Default 4KB.
+    Builder* max_statistics_size(size_t max_stats_sz) {
+      default_column_properties_.set_max_statistics_size(max_stats_sz);
+      return this;
+    }
+
+    /// Specify compression codec for the column specified by `path`.
+    /// Default UNCOMPRESSED.
+    Builder* compression(const std::string& path, Compression::type codec) {
+      codecs_[path] = codec;
+      return this;
+    }
+
+    /// Specify compression codec for the column specified by `path`.
+    /// Default UNCOMPRESSED.
+    Builder* compression(const std::shared_ptr<schema::ColumnPath>& path,
+                         Compression::type codec) {
+      return this->compression(path->ToDotString(), codec);
+    }
+
+    /// \brief Specify the default compression level for the compressor in
+    /// every column.  In case a column does not have an explicitly specified
+    /// compression level, the default one would be used.
+    ///
+    /// The provided compression level is compressor specific. The user would
+    /// have to familiarize oneself with the available levels for the selected
+    /// compressor.  If the compressor does not allow for selecting different
+    /// compression levels, calling this function would not have any effect.
+    /// Parquet and Arrow do not validate the passed compression level.  If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+    ///
+    /// If other compressor-specific options need to be set in addition to the compression
+    /// level, use the codec_options method.
+    Builder* compression_level(int compression_level) {
+      default_column_properties_.set_compression_level(compression_level);
+      return this;
+    }
+
+    /// \brief Specify a compression level for the compressor for the column
+    /// described by path.
+    ///
+    /// The provided compression level is compressor specific. The user would
+    /// have to familiarize oneself with the available levels for the selected
+    /// compressor.  If the compressor does not allow for selecting different
+    /// compression levels, calling this function would not have any effect.
+    /// Parquet and Arrow do not validate the passed compression level.  If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+    Builder* compression_level(const std::string& path, int compression_level) {
+      if (!codec_options_[path]) {
+        codec_options_[path] = std::make_shared<CodecOptions>();
+      }
+      codec_options_[path]->compression_level = compression_level;
+      return this;
+    }
+
+    /// \brief Specify a compression level for the compressor for the column
+    /// described by path.
+    ///
+    /// The provided compression level is compressor specific. The user would
+    /// have to familiarize oneself with the available levels for the selected
+    /// compressor.  If the compressor does not allow for selecting different
+    /// compression levels, calling this function would not have any effect.
+    /// Parquet and Arrow do not validate the passed compression level.  If no
+    /// level is selected by the user or if the special
+    /// std::numeric_limits<int>::min() value is passed, then Arrow selects the
+    /// compression level.
+    Builder* compression_level(const std::shared_ptr<schema::ColumnPath>& path,
+                               int compression_level) {
+      return this->compression_level(path->ToDotString(), compression_level);
+    }
+
+    /// \brief Specify the default codec options for the compressor in
+    /// every column.
+    ///
+    /// The codec options allow configuring the compression level as well
+    /// as other codec-specific options.
+    Builder* codec_options(
+        const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
+      default_column_properties_.set_codec_options(codec_options);
+      return this;
+    }
+
+    /// \brief Specify the codec options for the compressor for the column
+    /// described by path.
+    Builder* codec_options(
+        const std::string& path,
+        const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
+      codec_options_[path] = codec_options;
+      return this;
+    }
+
+    /// \brief Specify the codec options for the compressor for the column
+    /// described by path.
+    Builder* codec_options(
+        const std::shared_ptr<schema::ColumnPath>& path,
+        const std::shared_ptr<::arrow::util::CodecOptions>& codec_options) {
+      return this->codec_options(path->ToDotString(), codec_options);
+    }
+
+    /// Define the file encryption properties.
+    /// Default NULL.
+    Builder* encryption(
+        std::shared_ptr<FileEncryptionProperties> file_encryption_properties) {
+      file_encryption_properties_ = std::move(file_encryption_properties);
+      return this;
+    }
+
+    /// Enable statistics in general.
+    /// Default enabled.
+    Builder* enable_statistics() {
+      default_column_properties_.set_statistics_enabled(true);
+      return this;
+    }
+
+    /// Disable statistics in general.
+    /// Default enabled.
+    Builder* disable_statistics() {
+      default_column_properties_.set_statistics_enabled(false);
+      return this;
+    }
+
+    /// Enable statistics for the column specified by `path`.
+    /// Default enabled.
+    Builder* enable_statistics(const std::string& path) {
+      statistics_enabled_[path] = true;
+      return this;
+    }
+
+    /// Enable statistics for the column specified by `path`.
+    /// Default enabled.
+    Builder* enable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+      return this->enable_statistics(path->ToDotString());
+    }
+
+    /// Define the sorting columns.
+    /// Default empty.
+    ///
+    /// If sorting columns are set, user should ensure that records
+    /// are sorted by sorting columns. Otherwise, the storing data
+    /// will be inconsistent with sorting_columns metadata.
+    Builder* set_sorting_columns(std::vector<SortingColumn> sorting_columns) {
+      sorting_columns_ = std::move(sorting_columns);
+      return this;
+    }
+
+    /// Disable statistics for the column specified by `path`.
+    /// Default enabled.
+    Builder* disable_statistics(const std::string& path) {
+      statistics_enabled_[path] = false;
+      return this;
+    }
+
+    /// Disable statistics for the column specified by `path`.
+    /// Default enabled.
+    Builder* disable_statistics(const std::shared_ptr<schema::ColumnPath>& path) {
+      return this->disable_statistics(path->ToDotString());
+    }
+
+    /// Allow decimals with 1 <= precision <= 18 to be stored as integers.
+    ///
+    /// In Parquet, DECIMAL can be stored in any of the following physical types:
+    /// - int32: for 1 <= precision <= 9.
+    /// - int64: for 10 <= precision <= 18.
+    /// - fixed_len_byte_array: precision is limited by the array size.
+    ///   Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits.
+    /// - binary: precision is unlimited. The minimum number of bytes to store
+    ///   the unscaled value is used.
+    ///
+    /// By default, this is DISABLED and all decimal types annotate fixed_len_byte_array.
+    ///
+    /// When enabled, the C++ writer will use following physical types to store decimals:
+    /// - int32: for 1 <= precision <= 9.
+    /// - int64: for 10 <= precision <= 18.
+    /// - fixed_len_byte_array: for precision > 18.
+    ///
+    /// As a consequence, decimal columns stored in integer types are more compact.
+    Builder* enable_store_decimal_as_integer() {
+      store_decimal_as_integer_ = true;
+      return this;
+    }
+
+    /// Disable decimal logical type with 1 <= precision <= 18 to be stored
+    /// as integer physical type.
+    ///
+    /// Default disabled.
+    Builder* disable_store_decimal_as_integer() {
+      store_decimal_as_integer_ = false;
+      return this;
+    }
+
+    /// Enable writing page index in general for all columns. Default enabled.
+    ///
+    /// Writing statistics to the page index disables the old method of writing
+    /// statistics to each data page header.
+    /// The page index makes filtering more efficient than the page header, as
+    /// it gathers all the statistics for a Parquet file in a single place,
+    /// avoiding scattered I/O.
+    ///
+    /// Please check the link below for more details:
+    /// https://github.com/apache/parquet-format/blob/master/PageIndex.md
+    Builder* enable_write_page_index() {
+      default_column_properties_.set_page_index_enabled(true);
+      return this;
+    }
+
+    /// Disable writing page index in general for all columns. Default enabled.
+    Builder* disable_write_page_index() {
+      default_column_properties_.set_page_index_enabled(false);
+      return this;
+    }
+
+    /// Enable writing page index for column specified by `path`. Default enabled.
+    Builder* enable_write_page_index(const std::string& path) {
+      page_index_enabled_[path] = true;
+      return this;
+    }
+
+    /// Enable writing page index for column specified by `path`. Default enabled.
+    Builder* enable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
+      return this->enable_write_page_index(path->ToDotString());
+    }
+
+    /// Disable writing page index for column specified by `path`. Default enabled.
+    Builder* disable_write_page_index(const std::string& path) {
+      page_index_enabled_[path] = false;
+      return this;
+    }
+
+    /// Disable writing page index for column specified by `path`. Default enabled.
+    Builder* disable_write_page_index(const std::shared_ptr<schema::ColumnPath>& path) {
+      return this->disable_write_page_index(path->ToDotString());
+    }
+
+    /// \brief Set the level to write size statistics for all columns. Default is
+    /// PageAndColumnChunk.
+    ///
+    /// \param level The level to write size statistics. Note that if page index is not
+    /// enabled, page level size statistics will not be written even if the level
+    /// is set to PageAndColumnChunk.
+    Builder* set_size_statistics_level(SizeStatisticsLevel level) {
+      size_statistics_level_ = level;
+      return this;
+    }
+
+    /// \brief Build the WriterProperties with the builder parameters.
+    /// \return The WriterProperties defined by the builder.
+    std::shared_ptr<WriterProperties> build() {
+      std::unordered_map<std::string, ColumnProperties> column_properties;
+      auto get = [&](const std::string& key) -> ColumnProperties& {
+        auto it = column_properties.find(key);
+        if (it == column_properties.end())
+          return column_properties[key] = default_column_properties_;
+        else
+          return it->second;
+      };
+
+      for (const auto& item : encodings_) get(item.first).set_encoding(item.second);
+      for (const auto& item : codecs_) get(item.first).set_compression(item.second);
+      for (const auto& item : codec_options_)
+        get(item.first).set_codec_options(item.second);
+      for (const auto& item : dictionary_enabled_)
+        get(item.first).set_dictionary_enabled(item.second);
+      for (const auto& item : statistics_enabled_)
+        get(item.first).set_statistics_enabled(item.second);
+      for (const auto& item : page_index_enabled_)
+        get(item.first).set_page_index_enabled(item.second);
+
+      return std::shared_ptr<WriterProperties>(new WriterProperties(
+          pool_, dictionary_pagesize_limit_, write_batch_size_, max_row_group_length_,
+          pagesize_, max_rows_per_page_, version_, created_by_, page_checksum_enabled_,
+          size_statistics_level_, std::move(file_encryption_properties_),
+          default_column_properties_, column_properties, data_page_version_,
+          store_decimal_as_integer_, std::move(sorting_columns_),
+          content_defined_chunking_enabled_, content_defined_chunking_options_));
+    }
+
+   private:
+    void CopyColumnSpecificProperties(const WriterProperties& properties);
+
+    MemoryPool* pool_;
+    int64_t dictionary_pagesize_limit_;
+    int64_t write_batch_size_;
+    int64_t max_row_group_length_;
+    int64_t pagesize_;
+    int64_t max_rows_per_page_;
+    ParquetVersion::type version_;
+    ParquetDataPageVersion data_page_version_;
+    std::string created_by_;
+    bool store_decimal_as_integer_;
+    bool page_checksum_enabled_;
+    SizeStatisticsLevel size_statistics_level_;
+
+    std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+    // If empty, there is no sorting columns.
+    std::vector<SortingColumn> sorting_columns_;
+
+    // Settings used for each column unless overridden in any of the maps below
+    ColumnProperties default_column_properties_;
+    std::unordered_map<std::string, Encoding::type> encodings_;
+    std::unordered_map<std::string, Compression::type> codecs_;
+    std::unordered_map<std::string, std::shared_ptr<CodecOptions>> codec_options_;
+    std::unordered_map<std::string, bool> dictionary_enabled_;
+    std::unordered_map<std::string, bool> statistics_enabled_;
+    std::unordered_map<std::string, bool> page_index_enabled_;
+
+    bool content_defined_chunking_enabled_;
+    CdcOptions content_defined_chunking_options_;
+  };
+
+  inline MemoryPool* memory_pool() const { return pool_; }
+
+  inline int64_t dictionary_pagesize_limit() const { return dictionary_pagesize_limit_; }
+
+  inline int64_t write_batch_size() const { return write_batch_size_; }
+
+  inline int64_t max_row_group_length() const { return max_row_group_length_; }
+
+  inline int64_t data_pagesize() const { return pagesize_; }
+
+  inline int64_t max_rows_per_page() const { return max_rows_per_page_; }
+
+  inline ParquetDataPageVersion data_page_version() const {
+    return parquet_data_page_version_;
+  }
+
+  inline ParquetVersion::type version() const { return parquet_version_; }
+
+  inline std::string created_by() const { return parquet_created_by_; }
+
+  inline bool store_decimal_as_integer() const { return store_decimal_as_integer_; }
+
+  inline bool page_checksum_enabled() const { return page_checksum_enabled_; }
+
+  inline bool content_defined_chunking_enabled() const {
+    return content_defined_chunking_enabled_;
+  }
+  inline CdcOptions content_defined_chunking_options() const {
+    return content_defined_chunking_options_;
+  }
+
+  inline SizeStatisticsLevel size_statistics_level() const {
+    return size_statistics_level_;
+  }
+
+  inline Encoding::type dictionary_index_encoding() const {
+    if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+      return Encoding::PLAIN_DICTIONARY;
+    } else {
+      return Encoding::RLE_DICTIONARY;
+    }
+  }
+
+  inline Encoding::type dictionary_page_encoding() const {
+    if (parquet_version_ == ParquetVersion::PARQUET_1_0) {
+      return Encoding::PLAIN_DICTIONARY;
+    } else {
+      return Encoding::PLAIN;
+    }
+  }
+
+  const ColumnProperties& column_properties(
+      const std::shared_ptr<schema::ColumnPath>& path) const {
+    auto it = column_properties_.find(path->ToDotString());
+    if (it != column_properties_.end()) return it->second;
+    return default_column_properties_;
+  }
+
+  Encoding::type encoding(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).encoding();
+  }
+
+  Compression::type compression(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).compression();
+  }
+
+  int compression_level(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).compression_level();
+  }
+
+  const std::shared_ptr<CodecOptions> codec_options(
+      const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).codec_options();
+  }
+
+  bool dictionary_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).dictionary_enabled();
+  }
+
+  const std::vector<SortingColumn>& sorting_columns() const { return sorting_columns_; }
+
+  bool statistics_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).statistics_enabled();
+  }
+
+  size_t max_statistics_size(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).max_statistics_size();
+  }
+
+  bool page_index_enabled(const std::shared_ptr<schema::ColumnPath>& path) const {
+    return column_properties(path).page_index_enabled();
+  }
+
+  bool page_index_enabled() const {
+    if (default_column_properties_.page_index_enabled()) {
+      return true;
+    }
+    for (const auto& item : column_properties_) {
+      if (item.second.page_index_enabled()) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  inline FileEncryptionProperties* file_encryption_properties() const {
+    return file_encryption_properties_.get();
+  }
+
+  std::shared_ptr<ColumnEncryptionProperties> column_encryption_properties(
+      const std::string& path) const {
+    if (file_encryption_properties_) {
+      return file_encryption_properties_->column_encryption_properties(path);
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  // \brief Return the default column properties
+  const ColumnProperties& default_column_properties() const {
+    return default_column_properties_;
+  }
+
+ private:
+  explicit WriterProperties(
+      MemoryPool* pool, int64_t dictionary_pagesize_limit, int64_t write_batch_size,
+      int64_t max_row_group_length, int64_t pagesize, int64_t max_rows_per_page,
+      ParquetVersion::type version, const std::string& created_by,
+      bool page_write_checksum_enabled, SizeStatisticsLevel size_statistics_level,
+      std::shared_ptr<FileEncryptionProperties> file_encryption_properties,
+      const ColumnProperties& default_column_properties,
+      const std::unordered_map<std::string, ColumnProperties>& column_properties,
+      ParquetDataPageVersion data_page_version, bool store_short_decimal_as_integer,
+      std::vector<SortingColumn> sorting_columns, bool content_defined_chunking_enabled,
+      CdcOptions content_defined_chunking_options)
+      : pool_(pool),
+        dictionary_pagesize_limit_(dictionary_pagesize_limit),
+        write_batch_size_(write_batch_size),
+        max_row_group_length_(max_row_group_length),
+        pagesize_(pagesize),
+        max_rows_per_page_(max_rows_per_page),
+        parquet_data_page_version_(data_page_version),
+        parquet_version_(version),
+        parquet_created_by_(created_by),
+        store_decimal_as_integer_(store_short_decimal_as_integer),
+        page_checksum_enabled_(page_write_checksum_enabled),
+        size_statistics_level_(size_statistics_level),
+        file_encryption_properties_(file_encryption_properties),
+        sorting_columns_(std::move(sorting_columns)),
+        default_column_properties_(default_column_properties),
+        column_properties_(column_properties),
+        content_defined_chunking_enabled_(content_defined_chunking_enabled),
+        content_defined_chunking_options_(content_defined_chunking_options) {}
+
+  MemoryPool* pool_;
+  int64_t dictionary_pagesize_limit_;
+  int64_t write_batch_size_;
+  int64_t max_row_group_length_;
+  int64_t pagesize_;
+  int64_t max_rows_per_page_;
+  ParquetDataPageVersion parquet_data_page_version_;
+  ParquetVersion::type parquet_version_;
+  std::string parquet_created_by_;
+  bool store_decimal_as_integer_;
+  bool page_checksum_enabled_;
+  SizeStatisticsLevel size_statistics_level_;
+
+  std::shared_ptr<FileEncryptionProperties> file_encryption_properties_;
+
+  std::vector<SortingColumn> sorting_columns_;
+
+  ColumnProperties default_column_properties_;
+  std::unordered_map<std::string, ColumnProperties> column_properties_;
+
+  bool content_defined_chunking_enabled_;
+  CdcOptions content_defined_chunking_options_;
+};
+
+PARQUET_EXPORT const std::shared_ptr<WriterProperties>& default_writer_properties();
+
+// ----------------------------------------------------------------------
+// Properties specific to Apache Arrow columnar read and write
+
+static constexpr bool kArrowDefaultUseThreads = false;
+
+// Default number of rows to read when using ::arrow::RecordBatchReader
+static constexpr int64_t kArrowDefaultBatchSize = 64 * 1024;
+
+constexpr inline ::arrow::Type::type kArrowDefaultBinaryType = ::arrow::Type::BINARY;
+constexpr inline ::arrow::Type::type kArrowDefaultListType = ::arrow::Type::LIST;
+
+/// EXPERIMENTAL: Properties for configuring FileReader behavior.
+class PARQUET_EXPORT ArrowReaderProperties {
+ public:
+  explicit ArrowReaderProperties(bool use_threads = kArrowDefaultUseThreads)
+      : use_threads_(use_threads),
+        read_dict_indices_(),
+        batch_size_(kArrowDefaultBatchSize),
+        pre_buffer_(true),
+        cache_options_(::arrow::io::CacheOptions::LazyDefaults()),
+        coerce_int96_timestamp_unit_(::arrow::TimeUnit::NANO),
+        binary_type_(kArrowDefaultBinaryType),
+        list_type_(kArrowDefaultListType),
+        arrow_extensions_enabled_(false),
+        should_load_statistics_(false),
+        smallest_decimal_enabled_(false) {}
+
+  /// \brief Set whether to use the IO thread pool to parse columns in parallel.
+  ///
+  /// Default is false.
+  void set_use_threads(bool use_threads) { use_threads_ = use_threads; }
+  /// Return whether will use multiple threads.
+  bool use_threads() const { return use_threads_; }
+
+  /// \brief Set whether to read a particular column as dictionary encoded.
+  ///
+  /// If the file metadata contains a serialized Arrow schema, then ...
+  ////
+  /// This is only supported for columns with a Parquet physical type of
+  /// BYTE_ARRAY, such as string or binary types.
+  void set_read_dictionary(int column_index, bool read_dict) {
+    if (read_dict) {
+      read_dict_indices_.insert(column_index);
+    } else {
+      read_dict_indices_.erase(column_index);
+    }
+  }
+  /// Return whether the column at the index will be read as dictionary.
+  bool read_dictionary(int column_index) const {
+    if (read_dict_indices_.find(column_index) != read_dict_indices_.end()) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /// \brief Set the Arrow binary type to read BYTE_ARRAY columns as.
+  ///
+  /// Allowed values are Type::BINARY, Type::LARGE_BINARY and Type::BINARY_VIEW.
+  /// Default is Type::BINARY.
+  ///
+  /// If a BYTE_ARRAY column has the STRING logical type, it is read as the
+  /// Arrow string type corresponding to the configured binary type (for example
+  /// Type::LARGE_STRING if the configured binary type is Type::LARGE_BINARY).
+  ///
+  /// However, if a serialized Arrow schema is found in the Parquet metadata,
+  /// this setting is ignored and the Arrow schema takes precedence
+  /// (see ArrowWriterProperties::store_schema).
+  void set_binary_type(::arrow::Type::type value) { binary_type_ = value; }
+  /// Return the Arrow binary type to read BYTE_ARRAY columns as.
+  ::arrow::Type::type binary_type() const { return binary_type_; }
+
+  /// \brief Set the Arrow list type to read Parquet list columns as.
+  ///
+  /// Allowed values are Type::LIST and Type::LARGE_LIST.
+  /// Default is Type::LIST.
+  ///
+  /// However, if a serialized Arrow schema is found in the Parquet metadata,
+  /// this setting is ignored and the Arrow schema takes precedence
+  /// (see ArrowWriterProperties::store_schema).
+  void set_list_type(::arrow::Type::type value) { list_type_ = value; }
+  /// Return the Arrow list type to read Parquet list columns as.
+  ::arrow::Type::type list_type() const { return list_type_; }
+
+  /// \brief Set the maximum number of rows to read into a record batch.
+  ///
+  /// Will only be fewer rows when there are no more rows in the file.
+  /// Note that some APIs such as ReadTable may ignore this setting.
+  void set_batch_size(int64_t batch_size) { batch_size_ = batch_size; }
+  /// Return the batch size in rows.
+  ///
+  /// Note that some APIs such as ReadTable may ignore this setting.
+  int64_t batch_size() const { return batch_size_; }
+
+  /// Enable read coalescing (default false).
+  ///
+  /// When enabled, the Arrow reader will pre-buffer necessary regions
+  /// of the file in-memory. This is intended to improve performance on
+  /// high-latency filesystems (e.g. Amazon S3).
+  void set_pre_buffer(bool pre_buffer) { pre_buffer_ = pre_buffer; }
+  /// Return whether read coalescing is enabled.
+  bool pre_buffer() const { return pre_buffer_; }
+
+  /// Set options for read coalescing. This can be used to tune the
+  /// implementation for characteristics of different filesystems.
+  void set_cache_options(::arrow::io::CacheOptions options) { cache_options_ = options; }
+  /// Return the options for read coalescing.
+  const ::arrow::io::CacheOptions& cache_options() const { return cache_options_; }
+
+  /// Set execution context for read coalescing.
+  void set_io_context(const ::arrow::io::IOContext& ctx) { io_context_ = ctx; }
+  /// Return the execution context used for read coalescing.
+  const ::arrow::io::IOContext& io_context() const { return io_context_; }
+
+  /// Set timestamp unit to use for deprecated INT96-encoded timestamps
+  /// (default is NANO).
+  void set_coerce_int96_timestamp_unit(::arrow::TimeUnit::type unit) {
+    coerce_int96_timestamp_unit_ = unit;
+  }
+
+  ::arrow::TimeUnit::type coerce_int96_timestamp_unit() const {
+    return coerce_int96_timestamp_unit_;
+  }
+
+  /// Enable Parquet-supported Arrow extension types.
+  ///
+  /// When enabled, Parquet logical types will be mapped to their corresponding Arrow
+  /// extension types at read time, if such exist. Currently only arrow::extension::json()
+  /// extension type is supported. Columns whose LogicalType is JSON will be interpreted
+  /// as arrow::extension::json(), with storage type inferred from the serialized Arrow
+  /// schema if present, or `utf8` by default.
+  void set_arrow_extensions_enabled(bool extensions_enabled) {
+    arrow_extensions_enabled_ = extensions_enabled;
+  }
+  bool get_arrow_extensions_enabled() const { return arrow_extensions_enabled_; }
+
+  /// \brief Set whether to load statistics as much as possible.
+  ///
+  /// Default is false.
+  void set_should_load_statistics(bool should_load_statistics) {
+    should_load_statistics_ = should_load_statistics;
+  }
+  /// Return whether loading statistics as much as possible.
+  bool should_load_statistics() const { return should_load_statistics_; }
+
+  /// \brief Set whether to infer Decimal32/64 from Parquet decimal logical types.
+  ///
+  /// Default is false for compatibility, meaning that only Decimal128 and Decimal256
+  /// can be inferred.
+  void set_smallest_decimal_enabled(bool smallest_decimal_enable) {
+    smallest_decimal_enabled_ = smallest_decimal_enable;
+  }
+  /// \brief Whether to infer Decimal32/64 from Parquet decimal logical types.
+  ///
+  /// When enabled, Parquet decimal columns will be inferred as the smallest possible
+  /// Arrow Decimal type.
+  /// When disabled, Parquet decimal columns will be inferred as either Decimal128 or
+  /// Decimal256, but not Decimal32/64.
+  ///
+  /// Note: if an Arrow schema is found in the Parquet metadata, it will take priority and
+  /// this setting will be ignored.
+  bool smallest_decimal_enabled() const { return smallest_decimal_enabled_; }
+
+ private:
+  bool use_threads_;
+  std::unordered_set<int> read_dict_indices_;
+  int64_t batch_size_;
+  bool pre_buffer_;
+  ::arrow::io::IOContext io_context_;
+  ::arrow::io::CacheOptions cache_options_;
+  ::arrow::TimeUnit::type coerce_int96_timestamp_unit_;
+  ::arrow::Type::type binary_type_;
+  ::arrow::Type::type list_type_;
+  bool arrow_extensions_enabled_;
+  bool should_load_statistics_;
+  bool smallest_decimal_enabled_;
+};
+
+/// EXPERIMENTAL: Constructs the default ArrowReaderProperties
+PARQUET_EXPORT
+ArrowReaderProperties default_arrow_reader_properties();
+
+class PARQUET_EXPORT ArrowWriterProperties {
+ public:
+  enum EngineVersion {
+    V1,  // Supports only nested lists.
+    V2   // Full support for all nesting combinations
+  };
+  class Builder {
+   public:
+    Builder()
+        : write_timestamps_as_int96_(false),
+          coerce_timestamps_enabled_(false),
+          coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
+          truncated_timestamps_allowed_(false),
+          store_schema_(false),
+          compliant_nested_types_(true),
+          engine_version_(V2),
+          use_threads_(kArrowDefaultUseThreads),
+          executor_(NULLPTR),
+          write_time_adjusted_to_utc_(false) {}
+
+    /// \brief Disable writing legacy int96 timestamps (default disabled).
+    Builder* disable_deprecated_int96_timestamps() {
+      write_timestamps_as_int96_ = false;
+      return this;
+    }
+
+    /// \brief Enable writing legacy int96 timestamps (default disabled).
+    ///
+    /// May be turned on to write timestamps compatible with older Parquet writers.
+    /// This takes precedent over coerce_timestamps.
+    Builder* enable_deprecated_int96_timestamps() {
+      write_timestamps_as_int96_ = true;
+      return this;
+    }
+
+    /// \brief Coerce all timestamps to the specified time unit.
+    /// \param unit time unit to truncate to.
+    /// For Parquet versions 1.0 and 2.4, nanoseconds are casted to microseconds.
+    Builder* coerce_timestamps(::arrow::TimeUnit::type unit) {
+      coerce_timestamps_enabled_ = true;
+      coerce_timestamps_unit_ = unit;
+      return this;
+    }
+
+    /// \brief Allow loss of data when truncating timestamps.
+    ///
+    /// This is disallowed by default and an error will be returned.
+    Builder* allow_truncated_timestamps() {
+      truncated_timestamps_allowed_ = true;
+      return this;
+    }
+
+    /// \brief Disallow loss of data when truncating timestamps (default).
+    Builder* disallow_truncated_timestamps() {
+      truncated_timestamps_allowed_ = false;
+      return this;
+    }
+
+    /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file,
+    /// to enable certain read options (like "read_dictionary") to be set
+    /// automatically
+    Builder* store_schema() {
+      store_schema_ = true;
+      return this;
+    }
+
+    /// \brief When enabled, will not preserve Arrow field names for list types.
+    ///
+    /// Instead of using the field names Arrow uses for the values array of
+    /// list types (default "item"), will use "element", as is specified in
+    /// the Parquet spec.
+    ///
+    /// This is enabled by default.
+    Builder* enable_compliant_nested_types() {
+      compliant_nested_types_ = true;
+      return this;
+    }
+
+    /// Preserve Arrow list field name.
+    Builder* disable_compliant_nested_types() {
+      compliant_nested_types_ = false;
+      return this;
+    }
+
+    /// Set the version of the Parquet writer engine.
+    Builder* set_engine_version(EngineVersion version) {
+      engine_version_ = version;
+      return this;
+    }
+
+    /// \brief Set whether to use multiple threads to write columns
+    /// in parallel in the buffered row group mode.
+    ///
+    /// WARNING: If writing multiple files in parallel in the same
+    /// executor, deadlock may occur if use_threads is true. Please
+    /// disable it in this case.
+    ///
+    /// Default is false.
+    Builder* set_use_threads(bool use_threads) {
+      use_threads_ = use_threads;
+      return this;
+    }
+
+    /// \brief Set the executor to write columns in parallel in the
+    /// buffered row group mode.
+    ///
+    /// Default is nullptr and the default cpu executor will be used.
+    Builder* set_executor(::arrow::internal::Executor* executor) {
+      executor_ = executor;
+      return this;
+    }
+
+    /// \brief Set the value of isAdjustedTOUTC when writing a TIME column
+    ///
+    /// Default is false because Arrow TIME data is expressed in an unspecified timezone.
+    /// Note this setting doesn't affect TIMESTAMP data.
+    Builder* set_time_adjusted_to_utc(bool adjusted) {
+      write_time_adjusted_to_utc_ = adjusted;
+      return this;
+    }
+
+    /// Create the final properties.
+    std::shared_ptr<ArrowWriterProperties> build() {
+      return std::shared_ptr<ArrowWriterProperties>(new ArrowWriterProperties(
+          write_timestamps_as_int96_, coerce_timestamps_enabled_, coerce_timestamps_unit_,
+          truncated_timestamps_allowed_, store_schema_, compliant_nested_types_,
+          engine_version_, use_threads_, executor_, write_time_adjusted_to_utc_));
+    }
+
+   private:
+    bool write_timestamps_as_int96_;
+
+    bool coerce_timestamps_enabled_;
+    ::arrow::TimeUnit::type coerce_timestamps_unit_;
+    bool truncated_timestamps_allowed_;
+
+    bool store_schema_;
+    bool compliant_nested_types_;
+    EngineVersion engine_version_;
+
+    bool use_threads_;
+    ::arrow::internal::Executor* executor_;
+
+    bool write_time_adjusted_to_utc_;
+  };
+
+  bool support_deprecated_int96_timestamps() const { return write_timestamps_as_int96_; }
+
+  bool coerce_timestamps_enabled() const { return coerce_timestamps_enabled_; }
+  ::arrow::TimeUnit::type coerce_timestamps_unit() const {
+    return coerce_timestamps_unit_;
+  }
+
+  bool truncated_timestamps_allowed() const { return truncated_timestamps_allowed_; }
+
+  bool store_schema() const { return store_schema_; }
+
+  /// \brief Enable nested type naming according to the parquet specification.
+  ///
+  /// Older versions of arrow wrote out field names for nested lists based on the name
+  /// of the field.  According to the parquet specification they should always be
+  /// "element".
+  bool compliant_nested_types() const { return compliant_nested_types_; }
+
+  /// \brief The underlying engine version to use when writing Arrow data.
+  ///
+  /// V2 is currently the latest V1 is considered deprecated but left in
+  /// place in case there are bugs detected in V2.
+  EngineVersion engine_version() const { return engine_version_; }
+
+  /// \brief Returns whether the writer will use multiple threads
+  /// to write columns in parallel in the buffered row group mode.
+  bool use_threads() const { return use_threads_; }
+
+  /// \brief Returns the executor used to write columns in parallel.
+  ::arrow::internal::Executor* executor() const;
+
+  /// \brief The value of isAdjustedTOUTC when writing a TIME column
+  ///
+  /// Note this setting doesn't affect TIMESTAMP data.
+  bool write_time_adjusted_to_utc() const { return write_time_adjusted_to_utc_; }
+
+ private:
+  explicit ArrowWriterProperties(bool write_nanos_as_int96,
+                                 bool coerce_timestamps_enabled,
+                                 ::arrow::TimeUnit::type coerce_timestamps_unit,
+                                 bool truncated_timestamps_allowed, bool store_schema,
+                                 bool compliant_nested_types,
+                                 EngineVersion engine_version, bool use_threads,
+                                 ::arrow::internal::Executor* executor,
+                                 bool write_time_adjusted_to_utc)
+      : write_timestamps_as_int96_(write_nanos_as_int96),
+        coerce_timestamps_enabled_(coerce_timestamps_enabled),
+        coerce_timestamps_unit_(coerce_timestamps_unit),
+        truncated_timestamps_allowed_(truncated_timestamps_allowed),
+        store_schema_(store_schema),
+        compliant_nested_types_(compliant_nested_types),
+        engine_version_(engine_version),
+        use_threads_(use_threads),
+        executor_(executor),
+        write_time_adjusted_to_utc_(write_time_adjusted_to_utc) {}
+
+  const bool write_timestamps_as_int96_;
+  const bool coerce_timestamps_enabled_;
+  const ::arrow::TimeUnit::type coerce_timestamps_unit_;
+  const bool truncated_timestamps_allowed_;
+  const bool store_schema_;
+  const bool compliant_nested_types_;
+  const EngineVersion engine_version_;
+  const bool use_threads_;
+  ::arrow::internal::Executor* executor_;
+  const bool write_time_adjusted_to_utc_;
+};
+
+/// \brief State object used for writing Arrow data directly to a Parquet
+/// column chunk. API possibly not stable
+struct ArrowWriteContext {
+  ArrowWriteContext(MemoryPool* memory_pool, ArrowWriterProperties* properties)
+      : memory_pool(memory_pool),
+        properties(properties),
+        data_buffer(AllocateBuffer(memory_pool)),
+        def_levels_buffer(AllocateBuffer(memory_pool)) {}
+
+  template <typename T>
+  ::arrow::Status GetScratchData(const int64_t num_values, T** out) {
+    ARROW_RETURN_NOT_OK(this->data_buffer->Resize(num_values * sizeof(T), false));
+    *out = reinterpret_cast<T*>(this->data_buffer->mutable_data());
+    return ::arrow::Status::OK();
+  }
+
+  MemoryPool* memory_pool;
+  const ArrowWriterProperties* properties;
+
+  // Buffer used for storing the data of an array converted to the physical type
+  // as expected by parquet-cpp.
+  std::shared_ptr<ResizableBuffer> data_buffer;
+
+  // We use the shared ownership of this buffer
+  std::shared_ptr<ResizableBuffer> def_levels_buffer;
+};
+
+PARQUET_EXPORT
+std::shared_ptr<ArrowWriterProperties> default_arrow_writer_properties();
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/schema.h b/pyarrow/include/parquet/schema.h
new file mode 100644
index 0000000000000000000000000000000000000000..1addc73bd367d38d517172602541bd978b6a0f8d
--- /dev/null
+++ b/pyarrow/include/parquet/schema.h
@@ -0,0 +1,494 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module contains the logical parquet-cpp types (independent of Thrift
+// structures), schema nodes, and related type tools
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+#include "parquet/windows_fixup.h"  // for OPTIONAL
+
+namespace parquet {
+
+class SchemaDescriptor;
+
+namespace schema {
+
+class Node;
+
+// List encodings: using the terminology from Impala to define different styles
+// of representing logical lists (a.k.a. ARRAY types) in Parquet schemas. Since
+// the converted type named in the Parquet metadata is ConvertedType::LIST we
+// use that terminology here. It also helps distinguish from the *_ARRAY
+// primitive types.
+//
+// One-level encoding: Only allows required lists with required cells
+//   repeated value_type name
+//
+// Two-level encoding: Enables optional lists with only required cells
+//   <required/optional> group list
+//     repeated value_type item
+//
+// Three-level encoding: Enables optional lists with optional cells
+//   <required/optional> group bag
+//     repeated group list
+//       <required/optional> value_type item
+//
+// 2- and 1-level encoding are respectively equivalent to 3-level encoding with
+// the non-repeated nodes set to required.
+//
+// The "official" encoding recommended in the Parquet spec is the 3-level, and
+// we use that as the default when creating list types. For semantic completeness
+// we allow the other two. Since all types of encodings will occur "in the
+// wild" we need to be able to interpret the associated definition levels in
+// the context of the actual encoding used in the file.
+//
+// NB: Some Parquet writers may not set ConvertedType::LIST on the repeated
+// SchemaElement, which could make things challenging if we are trying to infer
+// that a sequence of nodes semantically represents an array according to one
+// of these encodings (versus a struct containing an array). We should refuse
+// the temptation to guess, as they say.
+struct ListEncoding {
+  enum type { ONE_LEVEL, TWO_LEVEL, THREE_LEVEL };
+};
+
+class PARQUET_EXPORT ColumnPath {
+ public:
+  ColumnPath() : path_() {}
+  explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {}
+  explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {}
+
+  static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring);
+  static std::shared_ptr<ColumnPath> FromNode(const Node& node);
+
+  std::shared_ptr<ColumnPath> extend(const std::string& node_name) const;
+  std::string ToDotString() const;
+  const std::vector<std::string>& ToDotVector() const;
+
+ protected:
+  std::vector<std::string> path_;
+};
+
+// Base class for logical schema types. A type has a name, repetition level,
+// and optionally a logical type (ConvertedType in Parquet metadata parlance)
+class PARQUET_EXPORT Node {
+ public:
+  enum type { PRIMITIVE, GROUP };
+
+  virtual ~Node() {}
+
+  bool is_primitive() const { return type_ == Node::PRIMITIVE; }
+
+  bool is_group() const { return type_ == Node::GROUP; }
+
+  bool is_optional() const { return repetition_ == Repetition::OPTIONAL; }
+
+  bool is_repeated() const { return repetition_ == Repetition::REPEATED; }
+
+  bool is_required() const { return repetition_ == Repetition::REQUIRED; }
+
+  virtual bool Equals(const Node* other) const = 0;
+
+  const std::string& name() const { return name_; }
+
+  Node::type node_type() const { return type_; }
+
+  Repetition::type repetition() const { return repetition_; }
+
+  ConvertedType::type converted_type() const { return converted_type_; }
+
+  const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; }
+
+  /// \brief The field_id value for the serialized SchemaElement. If the
+  /// field_id is less than 0 (e.g. -1), it will not be set when serialized to
+  /// Thrift.
+  int field_id() const { return field_id_; }
+
+  const Node* parent() const { return parent_; }
+
+  const std::shared_ptr<ColumnPath> path() const;
+
+  virtual void ToParquet(void* element) const = 0;
+
+  // Node::Visitor abstract class for walking schemas with the visitor pattern
+  class Visitor {
+   public:
+    virtual ~Visitor() {}
+
+    virtual void Visit(Node* node) = 0;
+  };
+  class ConstVisitor {
+   public:
+    virtual ~ConstVisitor() {}
+
+    virtual void Visit(const Node* node) = 0;
+  };
+
+  virtual void Visit(Visitor* visitor) = 0;
+  virtual void VisitConst(ConstVisitor* visitor) const = 0;
+
+ protected:
+  friend class GroupNode;
+
+  Node(Node::type type, const std::string& name, Repetition::type repetition,
+       ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1)
+      : type_(type),
+        name_(name),
+        repetition_(repetition),
+        converted_type_(converted_type),
+        field_id_(field_id),
+        parent_(NULLPTR) {}
+
+  Node(Node::type type, const std::string& name, Repetition::type repetition,
+       std::shared_ptr<const LogicalType> logical_type, int field_id = -1)
+      : type_(type),
+        name_(name),
+        repetition_(repetition),
+        logical_type_(std::move(logical_type)),
+        field_id_(field_id),
+        parent_(NULLPTR) {}
+
+  Node::type type_;
+  std::string name_;
+  Repetition::type repetition_;
+  ConvertedType::type converted_type_{ConvertedType::NONE};
+  std::shared_ptr<const LogicalType> logical_type_;
+  int field_id_;
+  // Nodes should not be shared, they have a single parent.
+  const Node* parent_;
+
+  bool EqualsInternal(const Node* other) const;
+  void SetParent(const Node* p_parent);
+
+ private:
+  PARQUET_DISALLOW_COPY_AND_ASSIGN(Node);
+};
+
+// Save our breath all over the place with these typedefs
+using NodePtr = std::shared_ptr<Node>;
+using NodeVector = std::vector<NodePtr>;
+
+// A type that is one of the primitive Parquet storage types. In addition to
+// the other type metadata (name, repetition level, logical type), also has the
+// physical storage type and their type-specific metadata (byte width, decimal
+// parameters)
+class PARQUET_EXPORT PrimitiveNode : public Node {
+ public:
+  static std::unique_ptr<Node> FromParquet(const void* opaque_element);
+
+  // A field_id -1 (or any negative value) will be serialized as null in Thrift
+  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+                             Type::type type,
+                             ConvertedType::type converted_type = ConvertedType::NONE,
+                             int length = -1, int precision = -1, int scale = -1,
+                             int field_id = -1) {
+    return NodePtr(new PrimitiveNode(name, repetition, type, converted_type, length,
+                                     precision, scale, field_id));
+  }
+
+  // If no logical type, pass LogicalType::None() or nullptr
+  // A field_id -1 (or any negative value) will be serialized as null in Thrift
+  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+                             std::shared_ptr<const LogicalType> logical_type,
+                             Type::type primitive_type, int primitive_length = -1,
+                             int field_id = -1) {
+    return NodePtr(new PrimitiveNode(name, repetition, std::move(logical_type),
+                                     primitive_type, primitive_length, field_id));
+  }
+
+  bool Equals(const Node* other) const override;
+
+  Type::type physical_type() const { return physical_type_; }
+
+  ColumnOrder column_order() const { return column_order_; }
+
+  void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }
+
+  int32_t type_length() const { return type_length_; }
+
+  const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }
+
+  void ToParquet(void* element) const override;
+  void Visit(Visitor* visitor) override;
+  void VisitConst(ConstVisitor* visitor) const override;
+
+ private:
+  PrimitiveNode(const std::string& name, Repetition::type repetition, Type::type type,
+                ConvertedType::type converted_type = ConvertedType::NONE, int length = -1,
+                int precision = -1, int scale = -1, int field_id = -1);
+
+  PrimitiveNode(const std::string& name, Repetition::type repetition,
+                std::shared_ptr<const LogicalType> logical_type,
+                Type::type primitive_type, int primitive_length = -1, int field_id = -1);
+
+  Type::type physical_type_;
+  int32_t type_length_;
+  DecimalMetadata decimal_metadata_;
+  ColumnOrder column_order_;
+
+  // For FIXED_LEN_BYTE_ARRAY
+  void SetTypeLength(int32_t length) { type_length_ = length; }
+
+  bool EqualsInternal(const PrimitiveNode* other) const;
+
+  FRIEND_TEST(TestPrimitiveNode, Attrs);
+  FRIEND_TEST(TestPrimitiveNode, Equals);
+  FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
+  FRIEND_TEST(TestPrimitiveNode, FromParquet);
+};
+
+class PARQUET_EXPORT GroupNode : public Node {
+ public:
+  static std::unique_ptr<Node> FromParquet(const void* opaque_element,
+                                           NodeVector fields = {});
+
+  // A field_id -1 (or any negative value) will be serialized as null in Thrift
+  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+                             const NodeVector& fields,
+                             ConvertedType::type converted_type = ConvertedType::NONE,
+                             int field_id = -1) {
+    return NodePtr(new GroupNode(name, repetition, fields, converted_type, field_id));
+  }
+
+  // If no logical type, pass nullptr
+  // A field_id -1 (or any negative value) will be serialized as null in Thrift
+  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
+                             const NodeVector& fields,
+                             std::shared_ptr<const LogicalType> logical_type,
+                             int field_id = -1) {
+    return NodePtr(
+        new GroupNode(name, repetition, fields, std::move(logical_type), field_id));
+  }
+
+  bool Equals(const Node* other) const override;
+
+  const NodePtr& field(int i) const { return fields_[i]; }
+  // Get the index of a field by its name, or negative value if not found.
+  // If several fields share the same name, it is unspecified which one
+  // is returned.
+  int FieldIndex(const std::string& name) const;
+  // Get the index of a field by its node, or negative value if not found.
+  int FieldIndex(const Node& node) const;
+
+  int field_count() const { return static_cast<int>(fields_.size()); }
+
+  void ToParquet(void* element) const override;
+  void Visit(Visitor* visitor) override;
+  void VisitConst(ConstVisitor* visitor) const override;
+
+  /// \brief Return true if this node or any child node has REPEATED repetition
+  /// type
+  bool HasRepeatedFields() const;
+
+ private:
+  GroupNode(const std::string& name, Repetition::type repetition,
+            const NodeVector& fields,
+            ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1);
+
+  GroupNode(const std::string& name, Repetition::type repetition,
+            const NodeVector& fields, std::shared_ptr<const LogicalType> logical_type,
+            int field_id = -1);
+
+  NodeVector fields_;
+  bool EqualsInternal(const GroupNode* other) const;
+
+  // Mapping between field name to the field index
+  std::unordered_multimap<std::string, int> field_name_to_idx_;
+
+  FRIEND_TEST(TestGroupNode, Attrs);
+  FRIEND_TEST(TestGroupNode, Equals);
+  FRIEND_TEST(TestGroupNode, FieldIndex);
+  FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
+};
+
+// ----------------------------------------------------------------------
+// Convenience primitive type factory functions
+
+#define PRIMITIVE_FACTORY(FuncName, TYPE)                                                \
+  static inline NodePtr FuncName(const std::string& name,                                \
+                                 Repetition::type repetition = Repetition::OPTIONAL,     \
+                                 int field_id = -1) {                                    \
+    return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE,        \
+                               /*length=*/-1, /*precision=*/-1, /*scale=*/-1, field_id); \
+  }
+
+PRIMITIVE_FACTORY(Boolean, BOOLEAN)
+PRIMITIVE_FACTORY(Int32, INT32)
+PRIMITIVE_FACTORY(Int64, INT64)
+PRIMITIVE_FACTORY(Int96, INT96)
+PRIMITIVE_FACTORY(Float, FLOAT)
+PRIMITIVE_FACTORY(Double, DOUBLE)
+PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY)
+
+void PARQUET_EXPORT PrintSchema(const schema::Node* schema, std::ostream& stream,
+                                int indent_width = 2);
+
+}  // namespace schema
+
+// The ColumnDescriptor encapsulates information necessary to interpret
+// primitive column data in the context of a particular schema. We have to
+// examine the node structure of a column's path to the root in the schema tree
+// to be able to reassemble the nested structure from the repetition and
+// definition levels.
+class PARQUET_EXPORT ColumnDescriptor {
+ public:
+  ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level,
+                   int16_t max_repetition_level,
+                   const SchemaDescriptor* schema_descr = NULLPTR);
+
+  bool Equals(const ColumnDescriptor& other) const;
+
+  int16_t max_definition_level() const { return max_definition_level_; }
+
+  int16_t max_repetition_level() const { return max_repetition_level_; }
+
+  Type::type physical_type() const { return primitive_node_->physical_type(); }
+
+  ConvertedType::type converted_type() const { return primitive_node_->converted_type(); }
+
+  const std::shared_ptr<const LogicalType>& logical_type() const {
+    return primitive_node_->logical_type();
+  }
+
+  ColumnOrder column_order() const { return primitive_node_->column_order(); }
+
+  SortOrder::type sort_order() const {
+    const auto& la = logical_type();
+    auto pt = physical_type();
+    return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt);
+  }
+
+  const std::string& name() const { return primitive_node_->name(); }
+
+  const std::shared_ptr<schema::ColumnPath> path() const;
+
+  const schema::NodePtr& schema_node() const { return node_; }
+
+  std::string ToString() const;
+
+  int type_length() const;
+
+  int type_precision() const;
+
+  int type_scale() const;
+
+ private:
+  schema::NodePtr node_;
+  const schema::PrimitiveNode* primitive_node_;
+
+  int16_t max_definition_level_;
+  int16_t max_repetition_level_;
+};
+
+// Container for the converted Parquet schema with a computed information from
+// the schema analysis needed for file reading
+//
+// * Column index to Node
+// * Max repetition / definition levels for each primitive node
+//
+// The ColumnDescriptor objects produced by this class can be used to assist in
+// the reconstruction of fully materialized data structures from the
+// repetition-definition level encoding of nested data
+//
+// TODO(wesm): this object can be recomputed from a Schema
+class PARQUET_EXPORT SchemaDescriptor {
+ public:
+  SchemaDescriptor() = default;
+  ~SchemaDescriptor() = default;
+
+  // Analyze the schema
+  void Init(std::unique_ptr<schema::Node> schema);
+  void Init(schema::NodePtr schema);
+
+  const ColumnDescriptor* Column(int i) const;
+
+  // Get the index of a column by its dotstring path, or negative value if not found.
+  // If several columns share the same dotstring path, it is unspecified which one
+  // is returned.
+  int ColumnIndex(const std::string& node_path) const;
+  // Get the index of a column by its node, or negative value if not found.
+  int ColumnIndex(const schema::Node& node) const;
+
+  bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const;
+
+  // The number of physical columns appearing in the file
+  int num_columns() const { return static_cast<int>(leaves_.size()); }
+
+  const schema::NodePtr& schema_root() const { return schema_; }
+
+  const schema::GroupNode* group_node() const { return group_node_; }
+
+  // Returns the root (child of the schema root) node of the leaf(column) node
+  const schema::Node* GetColumnRoot(int i) const;
+
+  const std::string& name() const { return group_node_->name(); }
+
+  std::string ToString() const;
+
+  void updateColumnOrders(const std::vector<ColumnOrder>& column_orders);
+
+  /// \brief Return column index corresponding to a particular
+  /// PrimitiveNode. Returns -1 if not found
+  int GetColumnIndex(const schema::PrimitiveNode& node) const;
+
+  /// \brief Return true if any field or their children have REPEATED repetition
+  /// type
+  bool HasRepeatedFields() const;
+
+ private:
+  friend class ColumnDescriptor;
+
+  // Root Node
+  schema::NodePtr schema_;
+  // Root Node
+  // Would never be NULLPTR.
+  const schema::GroupNode* group_node_;
+
+  void BuildTree(const schema::NodePtr& node, int16_t max_def_level,
+                 int16_t max_rep_level, const schema::NodePtr& base);
+
+  // Result of leaf node / tree analysis
+  std::vector<ColumnDescriptor> leaves_;
+
+  std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_;
+
+  // Mapping between leaf nodes and root group of leaf (first node
+  // below the schema's root group)
+  //
+  // For example, the leaf `a.b.c.d` would have a link back to `a`
+  //
+  // -- a  <------
+  // -- -- b     |
+  // -- -- -- c  |
+  // -- -- -- -- d
+  std::unordered_map<int, schema::NodePtr> leaf_to_base_;
+
+  // Mapping between ColumnPath DotString to the leaf index
+  std::unordered_multimap<std::string, int> leaf_to_idx_;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/size_statistics.h b/pyarrow/include/parquet/size_statistics.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec79b8c4f8b8c07b69ac92e44d053a3fd821befd
--- /dev/null
+++ b/pyarrow/include/parquet/size_statistics.h
@@ -0,0 +1,102 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <iosfwd>
+#include <optional>
+#include <vector>
+
+#include "arrow/util/span.h"
+#include "parquet/platform.h"
+#include "parquet/type_fwd.h"
+
+namespace parquet {
+
+/// A structure for capturing metadata for estimating the unencoded,
+/// uncompressed size of data written. This is useful for readers to estimate
+/// how much memory is needed to reconstruct data in their memory model and for
+/// fine-grained filter push down on nested structures (the histograms contained
+/// in this structure can help determine the number of nulls at a particular
+/// nesting level and maximum length of lists).
+struct PARQUET_EXPORT SizeStatistics {
+  /// When present, there is expected to be one element corresponding to each
+  /// definition (i.e. size=max definition+1) where each element
+  /// represents the number of times the definition level was observed in the
+  /// data.
+  ///
+  /// This field may be omitted (a.k.a. zero-length vector) if max_definition_level
+  /// is 0 without loss of information.
+  std::vector<int64_t> definition_level_histogram;
+
+  /// Same as definition_level_histogram except for repetition levels.
+  ///
+  /// This field may be omitted (a.k.a. zero-length vector) if max_repetition_level
+  /// is 0 without loss of information.
+  std::vector<int64_t> repetition_level_histogram;
+
+  /// The number of physical bytes stored for BYTE_ARRAY data values assuming
+  /// no encoding. This is exclusive of the bytes needed to store the length of
+  /// each byte array. In other words, this field is equivalent to the `(size
+  /// of PLAIN-ENCODING the byte array values) - (4 bytes * number of values
+  /// written)`. To determine unencoded sizes of other types readers can use
+  /// schema information multiplied by the number of non-null and null values.
+  /// The number of null/non-null values can be inferred from the histograms
+  /// below.
+  ///
+  /// For example, if a column chunk is dictionary-encoded with dictionary
+  /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
+  /// then this value for that data page should be 7 (1 + 1 + 2 + 3).
+  ///
+  /// This field should only be set for types that use BYTE_ARRAY as their
+  /// physical type.
+  std::optional<int64_t> unencoded_byte_array_data_bytes;
+
+  /// \brief Check if the SizeStatistics is set.
+  bool is_set() const {
+    return !repetition_level_histogram.empty() || !definition_level_histogram.empty() ||
+           unencoded_byte_array_data_bytes.has_value();
+  }
+
+  /// \brief Increment the unencoded byte array data bytes.
+  void IncrementUnencodedByteArrayDataBytes(int64_t value);
+
+  /// \brief Merge two SizeStatistics.
+  /// \throws ParquetException if SizeStatistics to merge is not compatible.
+  void Merge(const SizeStatistics& other);
+
+  /// \brief Validate the SizeStatistics
+  /// \throws ParquetException if the histograms don't have the right length,
+  /// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column.
+  void Validate(const ColumnDescriptor* descr) const;
+
+  /// \brief Reset the SizeStatistics to be empty.
+  void Reset();
+
+  /// \brief Make an empty SizeStatistics object for specific type.
+  static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr);
+};
+
+PARQUET_EXPORT
+std::ostream& operator<<(std::ostream&, const SizeStatistics&);
+
+PARQUET_EXPORT
+void UpdateLevelHistogram(::arrow::util::span<const int16_t> levels,
+                          ::arrow::util::span<int64_t> histogram);
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/statistics.h b/pyarrow/include/parquet/statistics.h
new file mode 100644
index 0000000000000000000000000000000000000000..c80fb8e3b52847b3803791988c448d58242a8555
--- /dev/null
+++ b/pyarrow/include/parquet/statistics.h
@@ -0,0 +1,441 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace arrow {
+
+class Array;
+class BinaryArray;
+
+}  // namespace arrow
+
+namespace parquet {
+
+class ColumnDescriptor;
+
+// ----------------------------------------------------------------------
+// Value comparator interfaces
+
+/// \brief Base class for value comparators. Generally used with
+/// TypedComparator<T>
+class PARQUET_EXPORT Comparator {
+ public:
+  virtual ~Comparator() {}
+
+  /// \brief Create a comparator explicitly from physical type and
+  /// sort order
+  /// \param[in] physical_type the physical type for the typed
+  /// comparator
+  /// \param[in] sort_order either SortOrder::SIGNED or
+  /// SortOrder::UNSIGNED
+  /// \param[in] type_length for FIXED_LEN_BYTE_ARRAY only
+  static std::shared_ptr<Comparator> Make(Type::type physical_type,
+                                          SortOrder::type sort_order,
+                                          int type_length = -1);
+
+  /// \brief Create typed comparator inferring default sort order from
+  /// ColumnDescriptor
+  /// \param[in] descr the Parquet column schema
+  static std::shared_ptr<Comparator> Make(const ColumnDescriptor* descr);
+};
+
+/// \brief Interface for comparison of physical types according to the
+/// semantics of a particular logical type.
+template <typename DType>
+class TypedComparator : public Comparator {
+ public:
+  using T = typename DType::c_type;
+
+  /// \brief Scalar comparison of two elements, return true if first
+  /// is strictly less than the second
+  virtual bool Compare(const T& a, const T& b) const = 0;
+
+  /// \brief Compute maximum and minimum elements in a batch of
+  /// elements without any nulls
+  virtual std::pair<T, T> GetMinMax(const T* values, int64_t length) const = 0;
+
+  /// \brief Compute minimum and maximum elements from an Arrow array. Only
+  /// valid for certain Parquet Type / Arrow Type combinations, like BYTE_ARRAY
+  /// / arrow::BinaryArray
+  virtual std::pair<T, T> GetMinMax(const ::arrow::Array& values) const = 0;
+
+  /// \brief Compute maximum and minimum elements in a batch of
+  /// elements with accompanying bitmap indicating which elements are
+  /// included (bit set) and excluded (bit not set)
+  ///
+  /// \param[in] values the sequence of values
+  /// \param[in] length the length of the sequence
+  /// \param[in] valid_bits a bitmap indicating which elements are
+  /// included (1) or excluded (0)
+  /// \param[in] valid_bits_offset the bit offset into the bitmap of
+  /// the first element in the sequence
+  virtual std::pair<T, T> GetMinMaxSpaced(const T* values, int64_t length,
+                                          const uint8_t* valid_bits,
+                                          int64_t valid_bits_offset) const = 0;
+};
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(Type::type physical_type,
+                                                       SortOrder::type sort_order,
+                                                       int type_length = -1) {
+  return std::static_pointer_cast<TypedComparator<DType>>(
+      Comparator::Make(physical_type, sort_order, type_length));
+}
+
+/// \brief Typed version of Comparator::Make
+template <typename DType>
+std::shared_ptr<TypedComparator<DType>> MakeComparator(const ColumnDescriptor* descr) {
+  return std::static_pointer_cast<TypedComparator<DType>>(Comparator::Make(descr));
+}
+
+// ----------------------------------------------------------------------
+
+/// \brief Structure represented encoded statistics to be written to
+/// and read from Parquet serialized metadata.
+class PARQUET_EXPORT EncodedStatistics {
+  std::string max_, min_;
+  bool is_signed_ = false;
+
+ public:
+  EncodedStatistics() = default;
+
+  const std::string& max() const { return max_; }
+  const std::string& min() const { return min_; }
+
+  std::optional<bool> is_max_value_exact;
+  std::optional<bool> is_min_value_exact;
+
+  int64_t null_count = 0;
+  int64_t distinct_count = 0;
+
+  bool has_min = false;
+  bool has_max = false;
+  bool has_null_count = false;
+  bool has_distinct_count = false;
+
+  // When all values in the statistics are null, it is set to true.
+  // Otherwise, at least one value is not null, or we are not sure at all.
+  // Page index requires this information to decide whether a data page
+  // is a null page or not.
+  bool all_null_value = false;
+
+  // From parquet-mr
+  // Don't write stats larger than the max size rather than truncating. The
+  // rationale is that some engines may use the minimum value in the page as
+  // the true minimum for aggregations and there is no way to mark that a
+  // value has been truncated and is a lower bound and not in the page.
+  void ApplyStatSizeLimits(size_t length) {
+    if (max_.length() > length) {
+      has_max = false;
+      max_.clear();
+      is_max_value_exact = std::nullopt;
+    }
+    if (min_.length() > length) {
+      has_min = false;
+      min_.clear();
+      is_min_value_exact = std::nullopt;
+    }
+  }
+
+  // Clear Min Max.
+  void ClearMinMax() {
+    has_max = false;
+    max_.clear();
+    has_min = false;
+    min_.clear();
+  }
+
+  bool is_set() const {
+    return has_min || has_max || has_null_count || has_distinct_count;
+  }
+
+  bool is_signed() const { return is_signed_; }
+
+  void set_is_signed(bool is_signed) { is_signed_ = is_signed; }
+
+  EncodedStatistics& set_max(std::string value) {
+    max_ = std::move(value);
+    has_max = true;
+    return *this;
+  }
+
+  EncodedStatistics& set_min(std::string value) {
+    min_ = std::move(value);
+    has_min = true;
+    return *this;
+  }
+
+  EncodedStatistics& set_null_count(int64_t value) {
+    null_count = value;
+    has_null_count = true;
+    return *this;
+  }
+
+  EncodedStatistics& set_distinct_count(int64_t value) {
+    distinct_count = value;
+    has_distinct_count = true;
+    return *this;
+  }
+};
+
+/// \brief Base type for computing column statistics while writing a file
+class PARQUET_EXPORT Statistics {
+ public:
+  virtual ~Statistics() {}
+
+  /// \brief Create a new statistics instance given a column schema
+  /// definition
+  /// \param[in] descr the column schema
+  /// \param[in] pool a memory pool to use for any memory allocations, optional
+  static std::shared_ptr<Statistics> Make(
+      const ColumnDescriptor* descr,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  /// \brief Create a new statistics instance given a column schema
+  /// definition and preexisting state
+  /// \param[in] descr the column schema
+  /// \param[in] encoded_min the encoded minimum value
+  /// \param[in] encoded_max the encoded maximum value
+  /// \param[in] num_values total number of values
+  /// \param[in] null_count number of null values
+  /// \param[in] distinct_count number of distinct values
+  /// \param[in] has_min_max whether the min/max statistics are set
+  /// \param[in] has_null_count whether the null_count statistics are set
+  /// \param[in] has_distinct_count whether the distinct_count statistics are set
+  /// \param[in] pool a memory pool to use for any memory allocations, optional
+  static std::shared_ptr<Statistics> Make(
+      const ColumnDescriptor* descr, const std::string& encoded_min,
+      const std::string& encoded_max, int64_t num_values, int64_t null_count,
+      int64_t distinct_count, bool has_min_max, bool has_null_count,
+      bool has_distinct_count,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  /// \brief Create a new statistics instance given a column schema
+  /// definition and preexisting state
+  /// \param[in] descr the column schema
+  /// \param[in] encoded_min the encoded minimum value
+  /// \param[in] encoded_max the encoded maximum value
+  /// \param[in] num_values total number of values
+  /// \param[in] null_count number of null values
+  /// \param[in] distinct_count number of distinct values
+  /// \param[in] has_min_max whether the min/max statistics are set
+  /// \param[in] has_null_count whether the null_count statistics are set
+  /// \param[in] has_distinct_count whether the distinct_count statistics are set
+  /// \param[in] is_min_value_exact whether the min value is exact
+  /// \param[in] is_max_value_exact whether the max value is exact
+  /// \param[in] pool a memory pool to use for any memory allocations, optional
+  static std::shared_ptr<Statistics> Make(
+      const ColumnDescriptor* descr, const std::string& encoded_min,
+      const std::string& encoded_max, int64_t num_values, int64_t null_count,
+      int64_t distinct_count, bool has_min_max, bool has_null_count,
+      bool has_distinct_count, std::optional<bool> is_min_value_exact,
+      std::optional<bool> is_max_value_exact,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  // Helper function to convert EncodedStatistics to Statistics.
+  // EncodedStatistics does not contain number of non-null values, and it can be
+  // passed using the num_values parameter.
+  static std::shared_ptr<Statistics> Make(
+      const ColumnDescriptor* descr, const EncodedStatistics* encoded_statistics,
+      int64_t num_values = -1,
+      ::arrow::MemoryPool* pool = ::arrow::default_memory_pool());
+
+  /// \brief Return true if the count of null values is set
+  virtual bool HasNullCount() const = 0;
+
+  /// \brief The number of null values, may not be set
+  virtual int64_t null_count() const = 0;
+
+  /// \brief Return true if the count of distinct values is set
+  virtual bool HasDistinctCount() const = 0;
+
+  /// \brief The number of distinct values, may not be set
+  virtual int64_t distinct_count() const = 0;
+
+  /// \brief The number of non-null values in the column
+  virtual int64_t num_values() const = 0;
+
+  /// \brief Return true if both min and max statistics are set. Obtain
+  /// with TypedStatistics<T>::min and max
+  virtual bool HasMinMax() const = 0;
+
+  /// \brief Reset state of object to initial (no data observed) state
+  virtual void Reset() = 0;
+
+  /// \brief Plain-encoded minimum value
+  virtual std::string EncodeMin() const = 0;
+
+  /// \brief Plain-encoded maximum value
+  virtual std::string EncodeMax() const = 0;
+
+  /// \brief Return the minimum value exact flag if set.
+  /// It will be true if there was no truncation.
+  virtual std::optional<bool> is_min_value_exact() const = 0;
+
+  /// \brief Return the maximum value exact flag if set.
+  /// It will be true if there was no truncation.
+  virtual std::optional<bool> is_max_value_exact() const = 0;
+
+  /// \brief The finalized encoded form of the statistics for transport
+  virtual EncodedStatistics Encode() = 0;
+
+  /// \brief The physical type of the column schema
+  virtual Type::type physical_type() const = 0;
+
+  /// \brief The full type descriptor from the column schema
+  virtual const ColumnDescriptor* descr() const = 0;
+
+  /// \brief Check two Statistics for equality
+  virtual bool Equals(const Statistics& other) const = 0;
+
+ protected:
+  static std::shared_ptr<Statistics> Make(Type::type physical_type, const void* min,
+                                          const void* max, int64_t num_values,
+                                          int64_t null_count, int64_t distinct_count);
+};
+
+/// \brief A typed implementation of Statistics
+template <typename DType>
+class TypedStatistics : public Statistics {
+ public:
+  using T = typename DType::c_type;
+
+  /// \brief The current minimum value
+  virtual const T& min() const = 0;
+
+  /// \brief The current maximum value
+  virtual const T& max() const = 0;
+
+  /// \brief Update state with state of another Statistics object
+  virtual void Merge(const TypedStatistics<DType>& other) = 0;
+
+  /// \brief Batch statistics update
+  virtual void Update(const T* values, int64_t num_values, int64_t null_count) = 0;
+
+  /// \brief Batch statistics update with supplied validity bitmap
+  /// \param[in] values pointer to column values
+  /// \param[in] valid_bits Pointer to bitmap representing if values are non-null.
+  /// \param[in] valid_bits_offset Offset offset into valid_bits where the slice of
+  ///                              data begins.
+  /// \param[in] num_spaced_values The length of values in values/valid_bits to inspect
+  ///                              when calculating statistics. This can be smaller than
+  ///                              num_values+null_count as null_count can include nulls
+  ///                              from parents while num_spaced_values does not.
+  /// \param[in] num_values Number of values that are not null.
+  /// \param[in] null_count Number of values that are null.
+  virtual void UpdateSpaced(const T* values, const uint8_t* valid_bits,
+                            int64_t valid_bits_offset, int64_t num_spaced_values,
+                            int64_t num_values, int64_t null_count) = 0;
+
+  /// \brief EXPERIMENTAL: Update statistics with an Arrow array without
+  /// conversion to a primitive Parquet C type. Only implemented for certain
+  /// Parquet type / Arrow type combinations like BYTE_ARRAY /
+  /// arrow::BinaryArray
+  ///
+  /// If update_counts is true then the null_count and num_values will be updated
+  /// based on the null_count of values.  Set to false if these are updated
+  /// elsewhere (e.g. when updating a dictionary where the counts are taken from
+  /// the indices and not the values)
+  virtual void Update(const ::arrow::Array& values, bool update_counts = true) = 0;
+
+  /// \brief Set min and max values to particular values
+  virtual void SetMinMax(const T& min, const T& max) = 0;
+
+  /// \brief Increments the null count directly
+  /// Use Update to extract the null count from data.  Use this if you determine
+  /// the null count through some other means (e.g. dictionary arrays where the
+  /// null count is determined from the indices)
+  virtual void IncrementNullCount(int64_t n) = 0;
+
+  /// \brief Increments the number of values directly
+  /// The same note on IncrementNullCount applies here
+  virtual void IncrementNumValues(int64_t n) = 0;
+};
+
+using BoolStatistics = TypedStatistics<BooleanType>;
+using Int32Statistics = TypedStatistics<Int32Type>;
+using Int64Statistics = TypedStatistics<Int64Type>;
+using FloatStatistics = TypedStatistics<FloatType>;
+using DoubleStatistics = TypedStatistics<DoubleType>;
+using ByteArrayStatistics = TypedStatistics<ByteArrayType>;
+using FLBAStatistics = TypedStatistics<FLBAType>;
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+    const ColumnDescriptor* descr,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(descr, pool));
+}
+
+/// \brief Create Statistics initialized to a particular state
+/// \param[in] min the minimum value
+/// \param[in] max the minimum value
+/// \param[in] num_values number of values
+/// \param[in] null_count number of null values
+/// \param[in] distinct_count number of distinct values
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(const typename DType::c_type& min,
+                                                       const typename DType::c_type& max,
+                                                       int64_t num_values,
+                                                       int64_t null_count,
+                                                       int64_t distinct_count) {
+  return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+      DType::type_num, &min, &max, num_values, null_count, distinct_count));
+}
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+    const ColumnDescriptor* descr, const std::string& encoded_min,
+    const std::string& encoded_max, int64_t num_values, int64_t null_count,
+    int64_t distinct_count, bool has_min_max, bool has_null_count,
+    bool has_distinct_count, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  return std::static_pointer_cast<TypedStatistics<DType>>(Statistics::Make(
+      descr, encoded_min, encoded_max, num_values, null_count, distinct_count,
+      has_min_max, has_null_count, has_distinct_count,
+      /*is_min_value_exact=*/std::nullopt, /*is_max_value_exact=*/std::nullopt, pool));
+}
+
+/// \brief Typed version of Statistics::Make
+template <typename DType>
+std::shared_ptr<TypedStatistics<DType>> MakeStatistics(
+    const ColumnDescriptor* descr, const std::string& encoded_min,
+    const std::string& encoded_max, int64_t num_values, int64_t null_count,
+    int64_t distinct_count, bool has_min_max, bool has_null_count,
+    bool has_distinct_count, std::optional<bool> is_min_value_exact,
+    std::optional<bool> is_max_value_exact,
+    ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) {
+  return std::static_pointer_cast<TypedStatistics<DType>>(
+      Statistics::Make(descr, encoded_min, encoded_max, num_values, null_count,
+                       distinct_count, has_min_max, has_null_count, has_distinct_count,
+                       is_min_value_exact, is_max_value_exact, pool));
+}
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/stream_reader.h b/pyarrow/include/parquet/stream_reader.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7dadac92c89277a104e3acc4149a77258177c8c
--- /dev/null
+++ b/pyarrow/include/parquet/stream_reader.h
@@ -0,0 +1,303 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "parquet/column_reader.h"
+#include "parquet/file_reader.h"
+#include "parquet/stream_writer.h"
+
+namespace parquet {
+
+/// \brief A class for reading Parquet files using an output stream type API.
+///
+/// The values given must be of the correct type i.e. the type must
+/// match the file schema exactly otherwise a ParquetException will be
+/// thrown.
+///
+/// The user must explicitly advance to the next row using the
+/// EndRow() function or EndRow input manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are read using operator>>(T)
+/// - Optional fields are read with
+///   operator>>(std::optional<T>)
+///
+/// Note that operator>>(std::optional<T>) can be used to read
+/// required fields.
+///
+/// Similarly operator>>(T) can be used to read optional fields.
+/// However, if the value is not present then a ParquetException will
+/// be raised.
+///
+/// Currently there is no support for repeated fields.
+///
+class PARQUET_EXPORT StreamReader {
+ public:
+  template <typename T>
+  using optional = ::std::optional<T>;
+
+  // N.B. Default constructed objects are not usable.  This
+  //      constructor is provided so that the object may be move
+  //      assigned afterwards.
+  StreamReader() = default;
+
+  explicit StreamReader(std::unique_ptr<ParquetFileReader> reader);
+
+  ~StreamReader() = default;
+
+  bool eof() const { return eof_; }
+
+  int current_column() const { return column_index_; }
+
+  int64_t current_row() const { return current_row_; }
+
+  int num_columns() const;
+
+  int64_t num_rows() const;
+
+  // Moving is possible.
+  StreamReader(StreamReader&&) = default;
+  StreamReader& operator=(StreamReader&&) = default;
+
+  // Copying is not allowed.
+  StreamReader(const StreamReader&) = delete;
+  StreamReader& operator=(const StreamReader&) = delete;
+
+  StreamReader& operator>>(bool& v);
+
+  StreamReader& operator>>(int8_t& v);
+
+  StreamReader& operator>>(uint8_t& v);
+
+  StreamReader& operator>>(int16_t& v);
+
+  StreamReader& operator>>(uint16_t& v);
+
+  StreamReader& operator>>(int32_t& v);
+
+  StreamReader& operator>>(uint32_t& v);
+
+  StreamReader& operator>>(int64_t& v);
+
+  StreamReader& operator>>(uint64_t& v);
+
+  StreamReader& operator>>(std::chrono::milliseconds& v);
+
+  StreamReader& operator>>(std::chrono::microseconds& v);
+
+  StreamReader& operator>>(float& v);
+
+  StreamReader& operator>>(double& v);
+
+  StreamReader& operator>>(char& v);
+
+  template <int N>
+  StreamReader& operator>>(char (&v)[N]) {
+    ReadFixedLength(v, N);
+    return *this;
+  }
+
+  template <std::size_t N>
+  StreamReader& operator>>(std::array<char, N>& v) {
+    ReadFixedLength(v.data(), static_cast<int>(N));
+    return *this;
+  }
+
+  // N.B. Cannot allow for reading to a arbitrary char pointer as the
+  //      length cannot be verified.  Also it would overshadow the
+  //      char[N] input operator.
+  // StreamReader& operator>>(char * v);
+
+  StreamReader& operator>>(std::string& v);
+
+  StreamReader& operator>>(::arrow::Decimal128& v);
+
+  // Input operators for optional fields.
+
+  StreamReader& operator>>(optional<bool>& v);
+
+  StreamReader& operator>>(optional<int8_t>& v);
+
+  StreamReader& operator>>(optional<uint8_t>& v);
+
+  StreamReader& operator>>(optional<int16_t>& v);
+
+  StreamReader& operator>>(optional<uint16_t>& v);
+
+  StreamReader& operator>>(optional<int32_t>& v);
+
+  StreamReader& operator>>(optional<uint32_t>& v);
+
+  StreamReader& operator>>(optional<int64_t>& v);
+
+  StreamReader& operator>>(optional<uint64_t>& v);
+
+  StreamReader& operator>>(optional<float>& v);
+
+  StreamReader& operator>>(optional<double>& v);
+
+  StreamReader& operator>>(optional<std::chrono::milliseconds>& v);
+
+  StreamReader& operator>>(optional<std::chrono::microseconds>& v);
+
+  StreamReader& operator>>(optional<char>& v);
+
+  StreamReader& operator>>(optional<std::string>& v);
+
+  StreamReader& operator>>(optional<::arrow::Decimal128>& v);
+
+  template <std::size_t N>
+  StreamReader& operator>>(optional<std::array<char, N>>& v) {
+    CheckColumn(Type::FIXED_LEN_BYTE_ARRAY, ConvertedType::NONE, N);
+    FixedLenByteArray flba;
+    if (ReadOptional(&flba)) {
+      v = std::array<char, N>{};
+      std::memcpy(v->data(), flba.ptr, N);
+    } else {
+      v.reset();
+    }
+    return *this;
+  }
+
+  /// \brief Terminate current row and advance to next one.
+  /// \throws ParquetException if all columns in the row were not
+  /// read or skipped.
+  void EndRow();
+
+  /// \brief Skip the data in the next columns.
+  /// If the number of columns exceeds the columns remaining on the
+  /// current row then skipping is terminated - it does _not_ continue
+  /// skipping columns on the next row.
+  /// Skipping of columns still requires the use 'EndRow' even if all
+  /// remaining columns were skipped.
+  /// \return Number of columns actually skipped.
+  int64_t SkipColumns(int64_t num_columns_to_skip);
+
+  /// \brief Skip the data in the next rows.
+  /// Skipping of rows is not allowed if reading of data for the
+  /// current row is not finished.
+  /// Skipping of rows will be terminated if the end of file is
+  /// reached.
+  /// \return Number of rows actually skipped.
+  int64_t SkipRows(int64_t num_rows_to_skip);
+
+ protected:
+  [[noreturn]] void ThrowReadFailedException(
+      const std::shared_ptr<schema::PrimitiveNode>& node);
+
+  template <typename ReaderType, typename T>
+  void Read(T* v) {
+    const auto& node = nodes_[column_index_];
+    auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+    int16_t def_level;
+    int16_t rep_level;
+    int64_t values_read;
+
+    reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, v, &values_read);
+
+    if (values_read != 1) {
+      ThrowReadFailedException(node);
+    }
+  }
+
+  template <typename ReaderType, typename ReadType, typename T>
+  void Read(T* v) {
+    const auto& node = nodes_[column_index_];
+    auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+    int16_t def_level;
+    int16_t rep_level;
+    ReadType tmp;
+    int64_t values_read;
+
+    reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+    if (values_read == 1) {
+      *v = tmp;
+    } else {
+      ThrowReadFailedException(node);
+    }
+  }
+
+  template <typename ReaderType, typename ReadType = typename ReaderType::T, typename T>
+  void ReadOptional(optional<T>* v) {
+    const auto& node = nodes_[column_index_];
+    auto reader = static_cast<ReaderType*>(column_readers_[column_index_++].get());
+    int16_t def_level;
+    int16_t rep_level;
+    ReadType tmp;
+    int64_t values_read;
+
+    reader->ReadBatch(kBatchSizeOne, &def_level, &rep_level, &tmp, &values_read);
+
+    if (values_read == 1) {
+      *v = T(tmp);
+    } else if ((values_read == 0) && (def_level == 0)) {
+      v->reset();
+    } else {
+      ThrowReadFailedException(node);
+    }
+  }
+
+  void ReadFixedLength(char* ptr, int len);
+
+  void Read(ByteArray* v);
+
+  void Read(FixedLenByteArray* v);
+
+  bool ReadOptional(ByteArray* v);
+
+  bool ReadOptional(FixedLenByteArray* v);
+
+  void NextRowGroup();
+
+  void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+                   int length = 0);
+
+  void SkipRowsInColumn(ColumnReader* reader, int64_t num_rows_to_skip);
+
+  void SetEof();
+
+ private:
+  std::unique_ptr<ParquetFileReader> file_reader_;
+  std::shared_ptr<FileMetaData> file_metadata_;
+  std::shared_ptr<RowGroupReader> row_group_reader_;
+  std::vector<std::shared_ptr<ColumnReader>> column_readers_;
+  std::vector<std::shared_ptr<schema::PrimitiveNode>> nodes_;
+
+  bool eof_{true};
+  int row_group_index_{0};
+  int column_index_{0};
+  int64_t current_row_{0};
+  int64_t row_group_row_offset_{0};
+
+  static constexpr int64_t kBatchSizeOne = 1;
+};  // namespace parquet
+
+PARQUET_EXPORT
+StreamReader& operator>>(StreamReader&, EndRowType);
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/stream_writer.h b/pyarrow/include/parquet/stream_writer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7626514022216e00d6b3524a9010df39531087ef
--- /dev/null
+++ b/pyarrow/include/parquet/stream_writer.h
@@ -0,0 +1,252 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <array>
+#include <chrono>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "arrow/util/span.h"
+
+#include "parquet/column_writer.h"
+#include "parquet/file_writer.h"
+
+namespace parquet {
+
+/// \brief A class for writing Parquet files using an output stream type API.
+///
+/// The values given must be of the correct type i.e. the type must
+/// match the file schema exactly otherwise a ParquetException will be
+/// thrown.
+///
+/// The user must explicitly indicate the end of the row using the
+/// EndRow() function or EndRow output manipulator.
+///
+/// A maximum row group size can be configured, the default size is
+/// 512MB.  Alternatively the row group size can be set to zero and the
+/// user can create new row groups by calling the EndRowGroup()
+/// function or using the EndRowGroup output manipulator.
+///
+/// Required and optional fields are supported:
+/// - Required fields are written using operator<<(T)
+/// - Optional fields are written using
+///   operator<<(std::optional<T>).
+///
+/// Note that operator<<(T) can be used to write optional fields.
+///
+/// Similarly, operator<<(std::optional<T>) can be used to
+/// write required fields.  However if the optional parameter does not
+/// have a value (i.e. it is nullopt) then a ParquetException will be
+/// raised.
+///
+/// Currently there is no support for repeated fields.
+///
+class PARQUET_EXPORT StreamWriter {
+ public:
+  template <typename T>
+  using optional = ::std::optional<T>;
+
+  // N.B. Default constructed objects are not usable.  This
+  //      constructor is provided so that the object may be move
+  //      assigned afterwards.
+  StreamWriter() = default;
+
+  explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer);
+
+  ~StreamWriter() = default;
+
+  static void SetDefaultMaxRowGroupSize(int64_t max_size);
+
+  void SetMaxRowGroupSize(int64_t max_size);
+
+  int current_column() const { return column_index_; }
+
+  int64_t current_row() const { return current_row_; }
+
+  int num_columns() const;
+
+  // Moving is possible.
+  StreamWriter(StreamWriter&&) = default;
+  StreamWriter& operator=(StreamWriter&&) = default;
+
+  // Copying is not allowed.
+  StreamWriter(const StreamWriter&) = delete;
+  StreamWriter& operator=(const StreamWriter&) = delete;
+
+  /// \brief Output operators for required fields.
+  /// These can also be used for optional fields when a value must be set.
+  StreamWriter& operator<<(bool v);
+
+  StreamWriter& operator<<(int8_t v);
+
+  StreamWriter& operator<<(uint8_t v);
+
+  StreamWriter& operator<<(int16_t v);
+
+  StreamWriter& operator<<(uint16_t v);
+
+  StreamWriter& operator<<(int32_t v);
+
+  StreamWriter& operator<<(uint32_t v);
+
+  StreamWriter& operator<<(int64_t v);
+
+  StreamWriter& operator<<(uint64_t v);
+
+  StreamWriter& operator<<(const std::chrono::milliseconds& v);
+
+  StreamWriter& operator<<(const std::chrono::microseconds& v);
+
+  StreamWriter& operator<<(float v);
+
+  StreamWriter& operator<<(double v);
+
+  StreamWriter& operator<<(char v);
+
+  /// \brief Helper class to write fixed length strings.
+  /// This is useful as the standard string view (such as
+  /// std::string_view) is for variable length data.
+  struct PARQUET_EXPORT FixedStringView {
+    FixedStringView() = default;
+
+    explicit FixedStringView(const char* data_ptr);
+
+    FixedStringView(const char* data_ptr, std::size_t data_len);
+
+    const char* data{NULLPTR};
+    std::size_t size{0};
+  };
+
+  /// \brief Output operators for fixed length strings.
+  template <int N>
+  StreamWriter& operator<<(const char (&v)[N]) {
+    return WriteFixedLength(v, N);
+  }
+  template <std::size_t N>
+  StreamWriter& operator<<(const std::array<char, N>& v) {
+    return WriteFixedLength(v.data(), N);
+  }
+  StreamWriter& operator<<(FixedStringView v);
+
+  /// \brief Output operators for variable length strings.
+  StreamWriter& operator<<(const char* v);
+  StreamWriter& operator<<(const std::string& v);
+  StreamWriter& operator<<(::std::string_view v);
+
+  /// \brief Helper class to write variable length raw data.
+  using RawDataView = ::arrow::util::span<const uint8_t>;
+
+  /// \brief Output operators for variable length raw data.
+  StreamWriter& operator<<(RawDataView v);
+
+  /// \brief Output operator for optional fields.
+  template <typename T>
+  StreamWriter& operator<<(const optional<T>& v) {
+    if (v) {
+      return operator<<(*v);
+    }
+    SkipOptionalColumn();
+    return *this;
+  }
+
+  /// \brief Skip the next N columns of optional data.  If there are
+  /// less than N columns remaining then the excess columns are
+  /// ignored.
+  /// \throws ParquetException if there is an attempt to skip any
+  /// required column.
+  /// \return Number of columns actually skipped.
+  int64_t SkipColumns(int num_columns_to_skip);
+
+  /// \brief Terminate the current row and advance to next one.
+  /// \throws ParquetException if all columns in the row were not
+  /// written or skipped.
+  void EndRow();
+
+  /// \brief Terminate the current row group and create new one.
+  void EndRowGroup();
+
+ protected:
+  template <typename WriterType, typename T>
+  StreamWriter& Write(const T v) {
+    auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++));
+
+    writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v);
+
+    if (max_row_group_size_ > 0) {
+      row_group_size_ += writer->estimated_buffered_value_bytes();
+    }
+    return *this;
+  }
+
+  StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len,
+                                    ConvertedType::type converted_type);
+
+  StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len);
+
+  void CheckColumn(Type::type physical_type, ConvertedType::type converted_type,
+                   int length = -1);
+
+  /// \brief Skip the next column which must be optional.
+  /// \throws ParquetException if the next column does not exist or is
+  /// not optional.
+  void SkipOptionalColumn();
+
+  void WriteNullValue(ColumnWriter* writer);
+
+ private:
+  using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>;
+
+  struct null_deleter {
+    void operator()(void*) {}
+  };
+
+  int32_t column_index_{0};
+  int64_t current_row_{0};
+  int64_t row_group_size_{0};
+  int64_t max_row_group_size_{default_row_group_size_};
+
+  std::unique_ptr<ParquetFileWriter> file_writer_;
+  std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_;
+  std::vector<node_ptr_type> nodes_;
+
+  static constexpr int16_t kDefLevelZero = 0;
+  static constexpr int16_t kDefLevelOne = 1;
+  static constexpr int16_t kRepLevelZero = 0;
+  static constexpr int64_t kBatchSizeOne = 1;
+
+  static int64_t default_row_group_size_;
+};
+
+struct PARQUET_EXPORT EndRowType {};
+constexpr EndRowType EndRow = {};
+
+struct PARQUET_EXPORT EndRowGroupType {};
+constexpr EndRowGroupType EndRowGroup = {};
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowType);
+
+PARQUET_EXPORT
+StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/test_util.h b/pyarrow/include/parquet/test_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..9271dc290cad2fa36ae672b170260be0c5ddac9b
--- /dev/null
+++ b/pyarrow/include/parquet/test_util.h
@@ -0,0 +1,893 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This module defines an abstract interface for iterating through pages in a
+// Parquet column chunk within a row group. It could be extended in the future
+// to iterate through all data pages in all chunks in a file.
+
+#pragma once
+
+#include <algorithm>
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/extension_type.h"
+#include "arrow/io/memory.h"
+#include "arrow/testing/util.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/float16.h"
+
+#include "parquet/column_page.h"
+#include "parquet/column_reader.h"
+#include "parquet/column_writer.h"
+#include "parquet/encoding.h"
+#include "parquet/platform.h"
+
+// https://github.com/google/googletest/pull/2904 might not be available
+// in our version of gtest/gmock
+#define EXPECT_THROW_THAT(callable, ex_type, property)   \
+  EXPECT_THROW(                                          \
+      try { (callable)(); } catch (const ex_type& err) { \
+        EXPECT_THAT(err, (property));                    \
+        throw;                                           \
+      },                                                 \
+      ex_type)
+
+namespace parquet {
+
+static constexpr int FLBA_LENGTH = 12;
+
+inline bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
+  return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
+}
+
+namespace test {
+
+typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
+                         DoubleType, ByteArrayType, FLBAType>
+    ParquetTypes;
+
+class ParquetTestException : public parquet::ParquetException {
+  using ParquetException::ParquetException;
+};
+
+const char* get_data_dir();
+std::string get_bad_data_dir();
+
+std::string get_data_file(const std::string& filename, bool is_good = true);
+
+template <typename T>
+static inline void assert_vector_equal(const std::vector<T>& left,
+                                       const std::vector<T>& right) {
+  ASSERT_EQ(left.size(), right.size());
+
+  for (size_t i = 0; i < left.size(); ++i) {
+    ASSERT_EQ(left[i], right[i]) << i;
+  }
+}
+
+template <typename T>
+static inline bool vector_equal(const std::vector<T>& left, const std::vector<T>& right) {
+  if (left.size() != right.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < left.size(); ++i) {
+    if (left[i] != right[i]) {
+      std::cerr << "index " << i << " left was " << left[i] << " right was " << right[i]
+                << std::endl;
+      return false;
+    }
+  }
+
+  return true;
+}
+
+template <typename T>
+static std::vector<T> slice(const std::vector<T>& values, int start, int end) {
+  if (end < start) {
+    return std::vector<T>(0);
+  }
+
+  std::vector<T> out(end - start);
+  for (int i = start; i < end; ++i) {
+    out[i - start] = values[i];
+  }
+  return out;
+}
+
+void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out);
+void random_bools(int n, double p, uint32_t seed, bool* out);
+
+template <typename T>
+inline void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) {
+  std::default_random_engine gen(seed);
+  std::uniform_int_distribution<T> d(min_value, max_value);
+  for (int i = 0; i < n; ++i) {
+    out[i] = d(gen);
+  }
+}
+
+template <>
+inline void random_numbers(int n, uint32_t seed, float min_value, float max_value,
+                           float* out) {
+  std::default_random_engine gen(seed);
+  std::uniform_real_distribution<float> d(min_value, max_value);
+  for (int i = 0; i < n; ++i) {
+    out[i] = d(gen);
+  }
+}
+
+template <>
+inline void random_numbers(int n, uint32_t seed, double min_value, double max_value,
+                           double* out) {
+  std::default_random_engine gen(seed);
+  std::uniform_real_distribution<double> d(min_value, max_value);
+  for (int i = 0; i < n; ++i) {
+    out[i] = d(gen);
+  }
+}
+
+void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value,
+                          Int96* out);
+
+void random_float16_numbers(int n, uint32_t seed, ::arrow::util::Float16 min_value,
+                            ::arrow::util::Float16 max_value, uint16_t* out);
+
+void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out);
+
+void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int min_size,
+                       int max_size);
+
+void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int max_size);
+
+void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out,
+                                int min_size, int max_size, double prefixed_probability);
+
+void prefixed_random_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out,
+                                double prefixed_probability);
+
+template <typename Type, typename Sequence>
+std::shared_ptr<Buffer> EncodeValues(Encoding::type encoding, bool use_dictionary,
+                                     const Sequence& values, int length,
+                                     const ColumnDescriptor* descr) {
+  auto encoder = MakeTypedEncoder<Type>(encoding, use_dictionary, descr);
+  encoder->Put(values, length);
+  return encoder->FlushValues();
+}
+
+template <typename T>
+static void InitValues(int num_values, uint32_t seed, std::vector<T>& values,
+                       std::vector<uint8_t>& buffer) {
+  random_numbers(num_values, seed, std::numeric_limits<T>::min(),
+                 std::numeric_limits<T>::max(), values.data());
+}
+
+template <typename T>
+static void InitValues(int num_values, std::vector<T>& values,
+                       std::vector<uint8_t>& buffer) {
+  InitValues(num_values, 0, values, buffer);
+}
+
+template <typename T>
+static void InitDictValues(int num_values, int num_dicts, std::vector<T>& values,
+                           std::vector<uint8_t>& buffer) {
+  int repeat_factor = num_values / num_dicts;
+  InitValues<T>(num_dicts, values, buffer);
+  // add some repeated values
+  for (int j = 1; j < repeat_factor; ++j) {
+    for (int i = 0; i < num_dicts; ++i) {
+      std::memcpy(&values[num_dicts * j + i], &values[i], sizeof(T));
+    }
+  }
+  // computed only dict_per_page * repeat_factor - 1 values < num_values
+  // compute remaining
+  for (int i = num_dicts * repeat_factor; i < num_values; ++i) {
+    std::memcpy(&values[i], &values[i - num_dicts * repeat_factor], sizeof(T));
+  }
+}
+
+template <>
+inline void InitDictValues<bool>(int num_values, int num_dicts, std::vector<bool>& values,
+                                 std::vector<uint8_t>& buffer) {
+  // No op for bool
+}
+
+class MockPageReader : public PageReader {
+ public:
+  explicit MockPageReader(const std::vector<std::shared_ptr<Page>>& pages)
+      : pages_(pages), page_index_(0) {}
+
+  std::shared_ptr<Page> NextPage() override {
+    if (page_index_ == static_cast<int>(pages_.size())) {
+      // EOS to consumer
+      return std::shared_ptr<Page>(nullptr);
+    }
+    return pages_[page_index_++];
+  }
+
+  // No-op
+  void set_max_page_header_size(uint32_t size) override {}
+
+ private:
+  std::vector<std::shared_ptr<Page>> pages_;
+  int page_index_;
+};
+
+// TODO(wesm): this is only used for testing for now. Refactor to form part of
+// primary file write path
+template <typename Type>
+class DataPageBuilder {
+ public:
+  using c_type = typename Type::c_type;
+
+  // This class writes data and metadata to the passed inputs
+  explicit DataPageBuilder(ArrowOutputStream* sink)
+      : sink_(sink),
+        num_values_(0),
+        encoding_(Encoding::PLAIN),
+        definition_level_encoding_(Encoding::RLE),
+        repetition_level_encoding_(Encoding::RLE),
+        have_def_levels_(false),
+        have_rep_levels_(false),
+        have_values_(false) {}
+
+  void AppendDefLevels(const std::vector<int16_t>& levels, int16_t max_level,
+                       Encoding::type encoding = Encoding::RLE) {
+    AppendLevels(levels, max_level, encoding);
+
+    num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
+    definition_level_encoding_ = encoding;
+    have_def_levels_ = true;
+  }
+
+  void AppendRepLevels(const std::vector<int16_t>& levels, int16_t max_level,
+                       Encoding::type encoding = Encoding::RLE) {
+    AppendLevels(levels, max_level, encoding);
+
+    num_values_ = std::max(static_cast<int32_t>(levels.size()), num_values_);
+    repetition_level_encoding_ = encoding;
+    have_rep_levels_ = true;
+  }
+
+  void AppendValues(const ColumnDescriptor* d, const std::vector<c_type>& values,
+                    Encoding::type encoding = Encoding::PLAIN) {
+    std::shared_ptr<Buffer> values_sink = EncodeValues<Type>(
+        encoding, false, values.data(), static_cast<int>(values.size()), d);
+    PARQUET_THROW_NOT_OK(sink_->Write(values_sink->data(), values_sink->size()));
+
+    num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
+    encoding_ = encoding;
+    have_values_ = true;
+  }
+
+  int32_t num_values() const { return num_values_; }
+
+  Encoding::type encoding() const { return encoding_; }
+
+  Encoding::type rep_level_encoding() const { return repetition_level_encoding_; }
+
+  Encoding::type def_level_encoding() const { return definition_level_encoding_; }
+
+ private:
+  ArrowOutputStream* sink_;
+
+  int32_t num_values_;
+  Encoding::type encoding_;
+  Encoding::type definition_level_encoding_;
+  Encoding::type repetition_level_encoding_;
+
+  bool have_def_levels_;
+  bool have_rep_levels_;
+  bool have_values_;
+
+  // Used internally for both repetition and definition levels
+  void AppendLevels(const std::vector<int16_t>& levels, int16_t max_level,
+                    Encoding::type encoding) {
+    if (encoding != Encoding::RLE) {
+      ParquetException::NYI("only rle encoding currently implemented");
+    }
+
+    std::vector<uint8_t> encode_buffer(LevelEncoder::MaxBufferSize(
+        Encoding::RLE, max_level, static_cast<int>(levels.size())));
+
+    // We encode into separate memory from the output stream because the
+    // RLE-encoded bytes have to be preceded in the stream by their absolute
+    // size.
+    LevelEncoder encoder;
+    encoder.Init(encoding, max_level, static_cast<int>(levels.size()),
+                 encode_buffer.data(), static_cast<int>(encode_buffer.size()));
+
+    encoder.Encode(static_cast<int>(levels.size()), levels.data());
+
+    int32_t rle_bytes = encoder.len();
+    int32_t rle_bytes_le = ::arrow::bit_util::ToLittleEndian(rle_bytes);
+    PARQUET_THROW_NOT_OK(
+        sink_->Write(reinterpret_cast<const uint8_t*>(&rle_bytes_le), sizeof(int32_t)));
+    PARQUET_THROW_NOT_OK(sink_->Write(encode_buffer.data(), rle_bytes));
+  }
+};
+
+template <>
+inline void DataPageBuilder<BooleanType>::AppendValues(const ColumnDescriptor* d,
+                                                       const std::vector<bool>& values,
+                                                       Encoding::type encoding) {
+  if (encoding != Encoding::PLAIN) {
+    ParquetException::NYI("only plain encoding currently implemented");
+  }
+
+  auto encoder = MakeTypedEncoder<BooleanType>(Encoding::PLAIN, false, d);
+  dynamic_cast<BooleanEncoder*>(encoder.get())
+      ->Put(values, static_cast<int>(values.size()));
+  std::shared_ptr<Buffer> buffer = encoder->FlushValues();
+  PARQUET_THROW_NOT_OK(sink_->Write(buffer->data(), buffer->size()));
+
+  num_values_ = std::max(static_cast<int32_t>(values.size()), num_values_);
+  encoding_ = encoding;
+  have_values_ = true;
+}
+
+template <typename Type>
+static std::shared_ptr<DataPageV1> MakeDataPage(
+    const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
+    int num_vals, Encoding::type encoding, const uint8_t* indices, int indices_size,
+    const std::vector<int16_t>& def_levels, int16_t max_def_level,
+    const std::vector<int16_t>& rep_levels, int16_t max_rep_level) {
+  int num_values = 0;
+
+  auto page_stream = CreateOutputStream();
+  test::DataPageBuilder<Type> page_builder(page_stream.get());
+
+  if (!rep_levels.empty()) {
+    page_builder.AppendRepLevels(rep_levels, max_rep_level);
+  }
+  if (!def_levels.empty()) {
+    page_builder.AppendDefLevels(def_levels, max_def_level);
+  }
+
+  if (encoding == Encoding::PLAIN) {
+    page_builder.AppendValues(d, values, encoding);
+    num_values = std::max(page_builder.num_values(), num_vals);
+  } else {  // DICTIONARY PAGES
+    PARQUET_THROW_NOT_OK(page_stream->Write(indices, indices_size));
+    num_values = std::max(page_builder.num_values(), num_vals);
+  }
+
+  PARQUET_ASSIGN_OR_THROW(auto buffer, page_stream->Finish());
+
+  return std::make_shared<DataPageV1>(buffer, num_values, encoding,
+                                      page_builder.def_level_encoding(),
+                                      page_builder.rep_level_encoding(), buffer->size());
+}
+
+template <typename TYPE>
+class DictionaryPageBuilder {
+ public:
+  typedef typename TYPE::c_type TC;
+  static constexpr int TN = TYPE::type_num;
+  using SpecializedEncoder = typename EncodingTraits<TYPE>::Encoder;
+
+  // This class writes data and metadata to the passed inputs
+  explicit DictionaryPageBuilder(const ColumnDescriptor* d)
+      : num_dict_values_(0), have_values_(false) {
+    auto encoder = MakeTypedEncoder<TYPE>(Encoding::PLAIN, true, d);
+    dict_traits_ = dynamic_cast<DictEncoder<TYPE>*>(encoder.get());
+    encoder_.reset(dynamic_cast<SpecializedEncoder*>(encoder.release()));
+  }
+
+  ~DictionaryPageBuilder() {}
+
+  std::shared_ptr<Buffer> AppendValues(const std::vector<TC>& values) {
+    int num_values = static_cast<int>(values.size());
+    // Dictionary encoding
+    encoder_->Put(values.data(), num_values);
+    num_dict_values_ = dict_traits_->num_entries();
+    have_values_ = true;
+    return encoder_->FlushValues();
+  }
+
+  std::shared_ptr<Buffer> WriteDict() {
+    std::shared_ptr<Buffer> dict_buffer =
+        AllocateBuffer(::arrow::default_memory_pool(), dict_traits_->dict_encoded_size());
+    dict_traits_->WriteDict(dict_buffer->mutable_data());
+    return dict_buffer;
+  }
+
+  int32_t num_values() const { return num_dict_values_; }
+
+ private:
+  DictEncoder<TYPE>* dict_traits_;
+  std::unique_ptr<SpecializedEncoder> encoder_;
+  int32_t num_dict_values_;
+  bool have_values_;
+};
+
+template <>
+inline DictionaryPageBuilder<BooleanType>::DictionaryPageBuilder(
+    const ColumnDescriptor* d) {
+  ParquetException::NYI("only plain encoding currently implemented for boolean");
+}
+
+template <>
+inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::WriteDict() {
+  ParquetException::NYI("only plain encoding currently implemented for boolean");
+  return nullptr;
+}
+
+template <>
+inline std::shared_ptr<Buffer> DictionaryPageBuilder<BooleanType>::AppendValues(
+    const std::vector<TC>& values) {
+  ParquetException::NYI("only plain encoding currently implemented for boolean");
+  return nullptr;
+}
+
+template <typename Type>
+inline static std::shared_ptr<DictionaryPage> MakeDictPage(
+    const ColumnDescriptor* d, const std::vector<typename Type::c_type>& values,
+    const std::vector<int>& values_per_page, Encoding::type encoding,
+    std::vector<std::shared_ptr<Buffer>>& rle_indices) {
+  test::DictionaryPageBuilder<Type> page_builder(d);
+  int num_pages = static_cast<int>(values_per_page.size());
+  int value_start = 0;
+
+  for (int i = 0; i < num_pages; i++) {
+    rle_indices.push_back(page_builder.AppendValues(
+        slice(values, value_start, value_start + values_per_page[i])));
+    value_start += values_per_page[i];
+  }
+
+  auto buffer = page_builder.WriteDict();
+
+  return std::make_shared<DictionaryPage>(buffer, page_builder.num_values(),
+                                          Encoding::PLAIN);
+}
+
+// Given def/rep levels and values create multiple dict pages
+template <typename Type>
+inline static void PaginateDict(const ColumnDescriptor* d,
+                                const std::vector<typename Type::c_type>& values,
+                                const std::vector<int16_t>& def_levels,
+                                int16_t max_def_level,
+                                const std::vector<int16_t>& rep_levels,
+                                int16_t max_rep_level, int num_levels_per_page,
+                                const std::vector<int>& values_per_page,
+                                std::vector<std::shared_ptr<Page>>& pages,
+                                Encoding::type encoding = Encoding::RLE_DICTIONARY) {
+  int num_pages = static_cast<int>(values_per_page.size());
+  std::vector<std::shared_ptr<Buffer>> rle_indices;
+  std::shared_ptr<DictionaryPage> dict_page =
+      MakeDictPage<Type>(d, values, values_per_page, encoding, rle_indices);
+  pages.push_back(dict_page);
+  int def_level_start = 0;
+  int def_level_end = 0;
+  int rep_level_start = 0;
+  int rep_level_end = 0;
+  for (int i = 0; i < num_pages; i++) {
+    if (max_def_level > 0) {
+      def_level_start = i * num_levels_per_page;
+      def_level_end = (i + 1) * num_levels_per_page;
+    }
+    if (max_rep_level > 0) {
+      rep_level_start = i * num_levels_per_page;
+      rep_level_end = (i + 1) * num_levels_per_page;
+    }
+    std::shared_ptr<DataPageV1> data_page = MakeDataPage<Int32Type>(
+        d, {}, values_per_page[i], encoding, rle_indices[i]->data(),
+        static_cast<int>(rle_indices[i]->size()),
+        slice(def_levels, def_level_start, def_level_end), max_def_level,
+        slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
+    pages.push_back(data_page);
+  }
+}
+
+// Given def/rep levels and values create multiple plain pages
+template <typename Type>
+static inline void PaginatePlain(const ColumnDescriptor* d,
+                                 const std::vector<typename Type::c_type>& values,
+                                 const std::vector<int16_t>& def_levels,
+                                 int16_t max_def_level,
+                                 const std::vector<int16_t>& rep_levels,
+                                 int16_t max_rep_level, int num_levels_per_page,
+                                 const std::vector<int>& values_per_page,
+                                 std::vector<std::shared_ptr<Page>>& pages,
+                                 Encoding::type encoding = Encoding::PLAIN) {
+  int num_pages = static_cast<int>(values_per_page.size());
+  int def_level_start = 0;
+  int def_level_end = 0;
+  int rep_level_start = 0;
+  int rep_level_end = 0;
+  int value_start = 0;
+  for (int i = 0; i < num_pages; i++) {
+    if (max_def_level > 0) {
+      def_level_start = i * num_levels_per_page;
+      def_level_end = (i + 1) * num_levels_per_page;
+    }
+    if (max_rep_level > 0) {
+      rep_level_start = i * num_levels_per_page;
+      rep_level_end = (i + 1) * num_levels_per_page;
+    }
+    std::shared_ptr<DataPage> page = MakeDataPage<Type>(
+        d, slice(values, value_start, value_start + values_per_page[i]),
+        values_per_page[i], encoding, nullptr, 0,
+        slice(def_levels, def_level_start, def_level_end), max_def_level,
+        slice(rep_levels, rep_level_start, rep_level_end), max_rep_level);
+    pages.push_back(page);
+    value_start += values_per_page[i];
+  }
+}
+
+// Generates pages from randomly generated data
+template <typename Type>
+static inline int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_page,
+                            std::vector<int16_t>& def_levels,
+                            std::vector<int16_t>& rep_levels,
+                            std::vector<typename Type::c_type>& values,
+                            std::vector<uint8_t>& buffer,
+                            std::vector<std::shared_ptr<Page>>& pages,
+                            Encoding::type encoding = Encoding::PLAIN,
+                            uint32_t seed = 0) {
+  int num_levels = levels_per_page * num_pages;
+  int num_values = 0;
+  int16_t zero = 0;
+  int16_t max_def_level = d->max_definition_level();
+  int16_t max_rep_level = d->max_repetition_level();
+  std::vector<int> values_per_page(num_pages, levels_per_page);
+  // Create definition levels
+  if (max_def_level > 0 && num_levels != 0) {
+    def_levels.resize(num_levels);
+    random_numbers(num_levels, seed, zero, max_def_level, def_levels.data());
+    for (int p = 0; p < num_pages; p++) {
+      int num_values_per_page = 0;
+      for (int i = 0; i < levels_per_page; i++) {
+        if (def_levels[i + p * levels_per_page] == max_def_level) {
+          num_values_per_page++;
+          num_values++;
+        }
+      }
+      values_per_page[p] = num_values_per_page;
+    }
+  } else {
+    num_values = num_levels;
+  }
+  // Create repetition levels
+  if (max_rep_level > 0 && num_levels != 0) {
+    rep_levels.resize(num_levels);
+    // Using a different seed so that def_levels and rep_levels are different.
+    random_numbers(num_levels, seed + 789, zero, max_rep_level, rep_levels.data());
+    // The generated levels are random. Force the very first page to start with a new
+    // record.
+    rep_levels[0] = 0;
+    // For a null value, rep_levels and def_levels are both 0.
+    // If we have a repeated value right after this, it needs to start with
+    // rep_level = 0 to indicate a new record.
+    for (int i = 0; i < num_levels - 1; ++i) {
+      if (rep_levels[i] == 0 && def_levels[i] == 0) {
+        rep_levels[i + 1] = 0;
+      }
+    }
+  }
+  // Create values
+  values.resize(num_values);
+  if (encoding == Encoding::PLAIN) {
+    InitValues<typename Type::c_type>(num_values, values, buffer);
+    PaginatePlain<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
+                        levels_per_page, values_per_page, pages);
+  } else if (encoding == Encoding::RLE_DICTIONARY ||
+             encoding == Encoding::PLAIN_DICTIONARY) {
+    // Calls InitValues and repeats the data
+    InitDictValues<typename Type::c_type>(num_values, levels_per_page, values, buffer);
+    PaginateDict<Type>(d, values, def_levels, max_def_level, rep_levels, max_rep_level,
+                       levels_per_page, values_per_page, pages);
+  }
+
+  return num_values;
+}
+
+// ----------------------------------------------------------------------
+// Test data generation
+
+template <>
+void inline InitValues<bool>(int num_values, uint32_t seed, std::vector<bool>& values,
+                             std::vector<uint8_t>& buffer) {
+  values = {};
+  if (seed == 0) {
+    seed = static_cast<uint32_t>(::arrow::random_seed());
+  }
+  ::arrow::random_is_valid(num_values, 0.5, &values, static_cast<int>(seed));
+}
+
+template <>
+inline void InitValues<ByteArray>(int num_values, uint32_t seed,
+                                  std::vector<ByteArray>& values,
+                                  std::vector<uint8_t>& buffer) {
+  int max_byte_array_len = 12;
+  int num_bytes = static_cast<int>(max_byte_array_len + sizeof(uint32_t));
+  size_t nbytes = num_values * num_bytes;
+  buffer.resize(nbytes);
+  random_byte_array(num_values, seed, buffer.data(), values.data(), max_byte_array_len);
+}
+
+inline void InitWideByteArrayValues(int num_values, std::vector<ByteArray>& values,
+                                    std::vector<uint8_t>& buffer, int min_len,
+                                    int max_len) {
+  int num_bytes = static_cast<int>(max_len + sizeof(uint32_t));
+  size_t nbytes = num_values * num_bytes;
+  buffer.resize(nbytes);
+  random_byte_array(num_values, 0, buffer.data(), values.data(), min_len, max_len);
+}
+
+template <>
+inline void InitValues<FLBA>(int num_values, uint32_t seed, std::vector<FLBA>& values,
+                             std::vector<uint8_t>& buffer) {
+  size_t nbytes = num_values * FLBA_LENGTH;
+  buffer.resize(nbytes);
+  random_fixed_byte_array(num_values, seed, buffer.data(), FLBA_LENGTH, values.data());
+}
+
+template <>
+inline void InitValues<Int96>(int num_values, uint32_t seed, std::vector<Int96>& values,
+                              std::vector<uint8_t>& buffer) {
+  random_Int96_numbers(num_values, seed, std::numeric_limits<int32_t>::min(),
+                       std::numeric_limits<int32_t>::max(), values.data());
+}
+
+inline std::string TestColumnName(int i) {
+  std::stringstream col_name;
+  col_name << "column_" << i;
+  return col_name.str();
+}
+
+// This class lives here because of its dependency on the InitValues specializations.
+template <typename TestType>
+class PrimitiveTypedTest : public ::testing::Test {
+ public:
+  using c_type = typename TestType::c_type;
+
+  virtual void SetUpSchema(Repetition::type repetition, int num_columns) {
+    std::vector<schema::NodePtr> fields;
+
+    for (int i = 0; i < num_columns; ++i) {
+      std::string name = TestColumnName(i);
+      fields.push_back(schema::PrimitiveNode::Make(name, repetition, TestType::type_num,
+                                                   ConvertedType::NONE, FLBA_LENGTH));
+    }
+    node_ = schema::GroupNode::Make("schema", Repetition::REQUIRED, fields);
+    schema_.Init(node_);
+  }
+
+  void SetUpSchema(Repetition::type repetition) { this->SetUpSchema(repetition, 1); }
+
+  void GenerateData(int64_t num_values, uint32_t seed = 0);
+  void SetupValuesOut(int64_t num_values);
+  void SyncValuesOut();
+
+ protected:
+  schema::NodePtr node_;
+  SchemaDescriptor schema_;
+
+  // Input buffers
+  std::vector<c_type> values_;
+
+  std::vector<int16_t> def_levels_;
+
+  std::vector<uint8_t> buffer_;
+  // Pointer to the values, needed as we cannot use std::vector<bool>::data()
+  c_type* values_ptr_;
+  std::vector<uint8_t> bool_buffer_;
+
+  // Output buffers
+  std::vector<c_type> values_out_;
+  std::vector<uint8_t> bool_buffer_out_;
+  c_type* values_out_ptr_;
+};
+
+template <typename TestType>
+inline void PrimitiveTypedTest<TestType>::SyncValuesOut() {}
+
+template <>
+inline void PrimitiveTypedTest<BooleanType>::SyncValuesOut() {
+  std::vector<uint8_t>::const_iterator source_iterator = bool_buffer_out_.begin();
+  std::vector<c_type>::iterator destination_iterator = values_out_.begin();
+  while (source_iterator != bool_buffer_out_.end()) {
+    *destination_iterator++ = *source_iterator++ != 0;
+  }
+}
+
+template <typename TestType>
+inline void PrimitiveTypedTest<TestType>::SetupValuesOut(int64_t num_values) {
+  values_out_.clear();
+  values_out_.resize(num_values);
+  values_out_ptr_ = values_out_.data();
+}
+
+template <>
+inline void PrimitiveTypedTest<BooleanType>::SetupValuesOut(int64_t num_values) {
+  values_out_.clear();
+  values_out_.resize(num_values);
+
+  bool_buffer_out_.clear();
+  bool_buffer_out_.resize(num_values);
+  // Write once to all values so we can copy it without getting Valgrind errors
+  // about uninitialised values.
+  std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
+  values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
+}
+
+template <typename TestType>
+inline void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values,
+                                                       uint32_t seed) {
+  def_levels_.resize(num_values);
+  values_.resize(num_values);
+
+  InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
+  values_ptr_ = values_.data();
+
+  std::fill(def_levels_.begin(), def_levels_.end(), 1);
+}
+
+template <>
+inline void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values,
+                                                          uint32_t seed) {
+  def_levels_.resize(num_values);
+  values_.resize(num_values);
+
+  InitValues<c_type>(static_cast<int>(num_values), seed, values_, buffer_);
+  bool_buffer_.resize(num_values);
+  std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
+  values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());
+
+  std::fill(def_levels_.begin(), def_levels_.end(), 1);
+}
+
+// ----------------------------------------------------------------------
+// test data generation
+
+template <typename T>
+inline void GenerateData(int num_values, T* out, std::vector<uint8_t>* heap) {
+  // seed the prng so failure is deterministic
+  random_numbers(num_values, 0, std::numeric_limits<T>::min(),
+                 std::numeric_limits<T>::max(), out);
+}
+
+template <typename T>
+inline void GenerateBoundData(int num_values, T* out, T min, T max,
+                              std::vector<uint8_t>* heap) {
+  // seed the prng so failure is deterministic
+  random_numbers(num_values, 0, min, max, out);
+}
+
+template <>
+inline void GenerateData<bool>(int num_values, bool* out, std::vector<uint8_t>* heap) {
+  // seed the prng so failure is deterministic
+  random_bools(num_values, 0.5, 0, out);
+}
+
+template <>
+inline void GenerateData<Int96>(int num_values, Int96* out, std::vector<uint8_t>* heap) {
+  // seed the prng so failure is deterministic
+  random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
+                       std::numeric_limits<int32_t>::max(), out);
+}
+
+template <>
+inline void GenerateData<ByteArray>(int num_values, ByteArray* out,
+                                    std::vector<uint8_t>* heap) {
+  int max_byte_array_len = 12;
+  heap->resize(num_values * max_byte_array_len);
+  // seed the prng so failure is deterministic
+  random_byte_array(num_values, 0, heap->data(), out, 2, max_byte_array_len);
+}
+
+// Generate ByteArray or FLBA data where there is a given probability
+// for each value to share a common prefix with its predecessor.
+// This is useful to exercise prefix-based encodings such as DELTA_BYTE_ARRAY.
+template <typename T>
+inline void GeneratePrefixedData(int num_values, T* out, std::vector<uint8_t>* heap,
+                                 double prefixed_probability);
+
+template <>
+inline void GeneratePrefixedData(int num_values, ByteArray* out,
+                                 std::vector<uint8_t>* heap,
+                                 double prefixed_probability) {
+  int max_byte_array_len = 12;
+  heap->resize(num_values * max_byte_array_len);
+  // seed the prng so failure is deterministic
+  prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(), out, /*min_size=*/2,
+                             /*max_size=*/max_byte_array_len, prefixed_probability);
+}
+
+static constexpr int kGenerateDataFLBALength = 8;
+
+template <>
+inline void GeneratePrefixedData<FLBA>(int num_values, FLBA* out,
+                                       std::vector<uint8_t>* heap,
+                                       double prefixed_probability) {
+  heap->resize(num_values * kGenerateDataFLBALength);
+  // seed the prng so failure is deterministic
+  prefixed_random_byte_array(num_values, /*seed=*/0, heap->data(),
+                             kGenerateDataFLBALength, out, prefixed_probability);
+}
+
+template <>
+inline void GenerateData<FLBA>(int num_values, FLBA* out, std::vector<uint8_t>* heap) {
+  heap->resize(num_values * kGenerateDataFLBALength);
+  // seed the prng so failure is deterministic
+  random_fixed_byte_array(num_values, 0, heap->data(), kGenerateDataFLBALength, out);
+}
+
+// ----------------------------------------------------------------------
+// Test utility functions for geometry
+
+#if ARROW_LITTLE_ENDIAN
+static constexpr uint8_t kWkbNativeEndianness = 0x01;
+#else
+static constexpr uint8_t kWkbNativeEndianness = 0x00;
+#endif
+
+/// \brief Number of bytes in a WKB Point with X and Y dimensions (uint8_t endian,
+/// uint32_t geometry type, 2 * double coordinates)
+static constexpr int kWkbPointXYSize = 21;
+
+std::string MakeWKBPoint(const std::vector<double>& xyzm, bool has_z, bool has_m);
+
+std::optional<std::pair<double, double>> GetWKBPointCoordinateXY(const ByteArray& value);
+
+// A minimal version of a geoarrow.wkb extension type to test interoperability
+class GeoArrowWkbExtensionType : public ::arrow::ExtensionType {
+ public:
+  explicit GeoArrowWkbExtensionType(std::shared_ptr<::arrow::DataType> storage_type,
+                                    std::string metadata)
+      : ::arrow::ExtensionType(std::move(storage_type)), metadata_(std::move(metadata)) {}
+
+  std::string extension_name() const override { return "geoarrow.wkb"; }
+
+  std::string Serialize() const override { return metadata_; }
+
+  ::arrow::Result<std::shared_ptr<::arrow::DataType>> Deserialize(
+      std::shared_ptr<::arrow::DataType> storage_type,
+      const std::string& serialized_data) const override {
+    return std::make_shared<GeoArrowWkbExtensionType>(std::move(storage_type),
+                                                      serialized_data);
+  }
+
+  std::shared_ptr<::arrow::Array> MakeArray(
+      std::shared_ptr<::arrow::ArrayData> data) const override {
+    return std::make_shared<::arrow::ExtensionArray>(data);
+  }
+
+  bool ExtensionEquals(const ExtensionType& other) const override {
+    return other.extension_name() == extension_name() && other.Serialize() == Serialize();
+  }
+
+ private:
+  std::string metadata_;
+};
+
+std::shared_ptr<::arrow::DataType> geoarrow_wkb(
+    std::string metadata = "{}",
+    const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
+
+std::shared_ptr<::arrow::DataType> geoarrow_wkb_lonlat(
+    const std::shared_ptr<::arrow::DataType> storage = ::arrow::binary());
+
+}  // namespace test
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/type_fwd.h b/pyarrow/include/parquet/type_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..02e896598bfa1637634f99a41718804237ae6ce8
--- /dev/null
+++ b/pyarrow/include/parquet/type_fwd.h
@@ -0,0 +1,105 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace parquet {
+
+/// \brief Feature selection when writing Parquet files
+///
+/// `ParquetVersion::type` governs which data types are allowed and how they
+/// are represented. For example, uint32_t data will be written differently
+/// depending on this value (as INT64 for PARQUET_1_0, as UINT32 for other
+/// versions).
+///
+/// However, some features - such as compression algorithms, encryption,
+/// or the improved "v2" data page format - must be enabled separately in
+/// ArrowWriterProperties.
+struct ParquetVersion {
+  enum type : int {
+    /// Enable only pre-2.2 Parquet format features when writing
+    ///
+    /// This setting is useful for maximum compatibility with legacy readers.
+    /// Note that logical types may still be emitted, as long they have a
+    /// corresponding converted type.
+    PARQUET_1_0,
+
+    /// Enable Parquet format 2.4 and earlier features when writing
+    ///
+    /// This enables UINT32 as well as logical types which don't have
+    /// a corresponding converted type.
+    ///
+    /// Note: Parquet format 2.4.0 was released in October 2017.
+    PARQUET_2_4,
+
+    /// Enable Parquet format 2.6 and earlier features when writing
+    ///
+    /// This enables the NANOS time unit in addition to the PARQUET_2_4
+    /// features.
+    ///
+    /// Note: Parquet format 2.6.0 was released in September 2018.
+    PARQUET_2_6,
+
+    /// Enable latest Parquet format 2.x features
+    ///
+    /// This value is equal to the greatest 2.x version supported by
+    /// this library.
+    PARQUET_2_LATEST = PARQUET_2_6
+  };
+};
+
+struct PageIndexLocation;
+
+class FileMetaData;
+class FileCryptoMetaData;
+class RowGroupMetaData;
+
+class ColumnDescriptor;
+class SchemaDescriptor;
+
+class ReaderProperties;
+class ArrowReaderProperties;
+
+class WriterProperties;
+class WriterPropertiesBuilder;
+class ArrowWriterProperties;
+class ArrowWriterPropertiesBuilder;
+
+class EncodedStatistics;
+class Statistics;
+struct SizeStatistics;
+
+namespace geospatial {
+class GeoStatistics;
+struct EncodedGeoStatistics;
+}  // namespace geospatial
+
+class ColumnIndex;
+class OffsetIndex;
+
+namespace arrow {
+
+class FileWriter;
+class FileReader;
+
+}  // namespace arrow
+
+namespace schema {
+class ColumnPath;
+}  // namespace schema
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/types.h b/pyarrow/include/parquet/types.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e8a18fc94d68ec426bc8765538d35128542c899
--- /dev/null
+++ b/pyarrow/include/parquet/types.h
@@ -0,0 +1,883 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <string_view>
+
+#include "parquet/platform.h"
+#include "parquet/type_fwd.h"
+#include "parquet/windows_fixup.h"  // for OPTIONAL
+
+namespace arrow::util {
+
+class Codec;
+
+}  // namespace arrow::util
+
+namespace parquet {
+
+// ----------------------------------------------------------------------
+// Metadata enums to match Thrift metadata
+//
+// The reason we maintain our own enums is to avoid transitive dependency on
+// the compiled Thrift headers (and thus thrift/Thrift.h) for users of the
+// public API. After building parquet-cpp, you should not need to include
+// Thrift headers in your application. This means some boilerplate to convert
+// between our types and Parquet's Thrift types.
+//
+// We can also add special values like NONE to distinguish between metadata
+// values being set and not set. As an example consider ConvertedType and
+// CompressionCodec
+
+// Mirrors parquet::Type
+struct Type {
+  enum type {
+    BOOLEAN = 0,
+    INT32 = 1,
+    INT64 = 2,
+    INT96 = 3,
+    FLOAT = 4,
+    DOUBLE = 5,
+    BYTE_ARRAY = 6,
+    FIXED_LEN_BYTE_ARRAY = 7,
+    // Should always be last element.
+    UNDEFINED = 8
+  };
+};
+
+// Mirrors parquet::ConvertedType
+struct ConvertedType {
+  enum type {
+    NONE,  // Not a real converted type, but means no converted type is specified
+    UTF8,
+    MAP,
+    MAP_KEY_VALUE,
+    LIST,
+    ENUM,
+    DECIMAL,
+    DATE,
+    TIME_MILLIS,
+    TIME_MICROS,
+    TIMESTAMP_MILLIS,
+    TIMESTAMP_MICROS,
+    UINT_8,
+    UINT_16,
+    UINT_32,
+    UINT_64,
+    INT_8,
+    INT_16,
+    INT_32,
+    INT_64,
+    JSON,
+    BSON,
+    INTERVAL,
+    // DEPRECATED INVALID ConvertedType for all-null data.
+    // Only useful for reading legacy files written out by interim Parquet C++ releases.
+    // For writing, always emit LogicalType::Null instead.
+    // See PARQUET-1990.
+    NA = 25,
+    UNDEFINED = 26  // Not a real converted type; should always be last element
+  };
+};
+
+// forward declaration
+namespace format {
+
+class LogicalType;
+
+}
+
+// Mirrors parquet::FieldRepetitionType
+struct Repetition {
+  enum type { REQUIRED = 0, OPTIONAL = 1, REPEATED = 2, /*Always last*/ UNDEFINED = 3 };
+};
+
+// Reference:
+// parquet-mr/parquet-hadoop/src/main/java/org/apache/parquet/
+//                            format/converter/ParquetMetadataConverter.java
+// Sort order for page and column statistics. Types are associated with sort
+// orders (e.g., UTF8 columns should use UNSIGNED) and column stats are
+// aggregated using a sort order. As of parquet-format version 2.3.1, the
+// order used to aggregate stats is always SIGNED and is not stored in the
+// Parquet file. These stats are discarded for types that need unsigned.
+// See PARQUET-686.
+struct SortOrder {
+  enum type { SIGNED, UNSIGNED, UNKNOWN };
+};
+
+namespace schema {
+
+struct DecimalMetadata {
+  bool isset;
+  int32_t scale;
+  int32_t precision;
+};
+
+}  // namespace schema
+
+/// \brief Implementation of parquet.thrift LogicalType types.
+class PARQUET_EXPORT LogicalType {
+ public:
+  struct Type {
+    enum type {
+      UNDEFINED = 0,  // Not a real logical type
+      STRING = 1,
+      MAP,
+      LIST,
+      ENUM,
+      DECIMAL,
+      DATE,
+      TIME,
+      TIMESTAMP,
+      INTERVAL,
+      INT,
+      NIL,  // Thrift NullType: annotates data that is always null
+      JSON,
+      BSON,
+      UUID,
+      FLOAT16,
+      GEOMETRY,
+      GEOGRAPHY,
+      VARIANT,
+      NONE  // Not a real logical type; should always be last element
+    };
+  };
+
+  struct TimeUnit {
+    enum unit { UNKNOWN = 0, MILLIS = 1, MICROS, NANOS };
+  };
+
+  enum class EdgeInterpolationAlgorithm {
+    UNKNOWN = 0,
+    SPHERICAL = 1,
+    VINCENTY = 2,
+    THOMAS = 3,
+    ANDOYER = 4,
+    KARNEY = 5
+  };
+
+  /// \brief The latest supported Variant specification version by this library
+  static constexpr int8_t kVariantSpecVersion = 1;
+
+  /// \brief If possible, return a logical type equivalent to the given legacy
+  /// converted type (and decimal metadata if applicable).
+  static std::shared_ptr<const LogicalType> FromConvertedType(
+      const parquet::ConvertedType::type converted_type,
+      const parquet::schema::DecimalMetadata converted_decimal_metadata = {false, -1,
+                                                                           -1});
+
+  /// \brief Return the logical type represented by the Thrift intermediary object.
+  static std::shared_ptr<const LogicalType> FromThrift(
+      const parquet::format::LogicalType& thrift_logical_type);
+
+  /// \brief Return the explicitly requested logical type.
+  static std::shared_ptr<const LogicalType> String();
+  static std::shared_ptr<const LogicalType> Map();
+  static std::shared_ptr<const LogicalType> List();
+  static std::shared_ptr<const LogicalType> Enum();
+  static std::shared_ptr<const LogicalType> Decimal(int32_t precision, int32_t scale = 0);
+  static std::shared_ptr<const LogicalType> Date();
+  static std::shared_ptr<const LogicalType> Time(bool is_adjusted_to_utc,
+                                                 LogicalType::TimeUnit::unit time_unit);
+
+  /// \brief Create a Timestamp logical type
+  /// \param[in] is_adjusted_to_utc set true if the data is UTC-normalized
+  /// \param[in] time_unit the resolution of the timestamp
+  /// \param[in] is_from_converted_type if true, the timestamp was generated
+  /// by translating a legacy converted type of TIMESTAMP_MILLIS or
+  /// TIMESTAMP_MICROS. Default is false.
+  /// \param[in] force_set_converted_type if true, always set the
+  /// legacy ConvertedType TIMESTAMP_MICROS and TIMESTAMP_MILLIS
+  /// metadata. Default is false
+  static std::shared_ptr<const LogicalType> Timestamp(
+      bool is_adjusted_to_utc, LogicalType::TimeUnit::unit time_unit,
+      bool is_from_converted_type = false, bool force_set_converted_type = false);
+
+  static std::shared_ptr<const LogicalType> Interval();
+  static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);
+
+  /// \brief Create a logical type for data that's always null
+  ///
+  /// Any physical type can be annotated with this logical type.
+  static std::shared_ptr<const LogicalType> Null();
+
+  static std::shared_ptr<const LogicalType> JSON();
+  static std::shared_ptr<const LogicalType> BSON();
+  static std::shared_ptr<const LogicalType> UUID();
+  static std::shared_ptr<const LogicalType> Float16();
+  static std::shared_ptr<const LogicalType> Variant(
+      int8_t specVersion = kVariantSpecVersion);
+
+  static std::shared_ptr<const LogicalType> Geometry(std::string crs = "");
+
+  static std::shared_ptr<const LogicalType> Geography(
+      std::string crs = "", LogicalType::EdgeInterpolationAlgorithm algorithm =
+                                EdgeInterpolationAlgorithm::SPHERICAL);
+
+  /// \brief Create a placeholder for when no logical type is specified
+  static std::shared_ptr<const LogicalType> None();
+
+  /// \brief Return true if this logical type is consistent with the given underlying
+  /// physical type.
+  bool is_applicable(parquet::Type::type primitive_type,
+                     int32_t primitive_length = -1) const;
+
+  /// \brief Return true if this logical type is equivalent to the given legacy converted
+  /// type (and decimal metadata if applicable).
+  bool is_compatible(parquet::ConvertedType::type converted_type,
+                     parquet::schema::DecimalMetadata converted_decimal_metadata = {
+                         false, -1, -1}) const;
+
+  /// \brief If possible, return the legacy converted type (and decimal metadata if
+  /// applicable) equivalent to this logical type.
+  parquet::ConvertedType::type ToConvertedType(
+      parquet::schema::DecimalMetadata* out_decimal_metadata) const;
+
+  /// \brief Return a printable representation of this logical type.
+  std::string ToString() const;
+
+  /// \brief Return a JSON representation of this logical type.
+  std::string ToJSON() const;
+
+  /// \brief Return a serializable Thrift object for this logical type.
+  parquet::format::LogicalType ToThrift() const;
+
+  /// \brief Return true if the given logical type is equivalent to this logical type.
+  bool Equals(const LogicalType& other) const;
+
+  /// \brief Return the enumerated type of this logical type.
+  LogicalType::Type::type type() const;
+
+  /// \brief Return the appropriate sort order for this logical type.
+  SortOrder::type sort_order() const;
+
+  // Type checks ...
+  bool is_string() const;
+  bool is_map() const;
+  bool is_list() const;
+  bool is_enum() const;
+  bool is_decimal() const;
+  bool is_date() const;
+  bool is_time() const;
+  bool is_timestamp() const;
+  bool is_interval() const;
+  bool is_int() const;
+  bool is_null() const;
+  bool is_JSON() const;
+  bool is_BSON() const;
+  bool is_UUID() const;
+  bool is_float16() const;
+  bool is_geometry() const;
+  bool is_geography() const;
+  bool is_variant() const;
+  bool is_none() const;
+  /// \brief Return true if this logical type is of a known type.
+  bool is_valid() const;
+  bool is_invalid() const;
+  /// \brief Return true if this logical type is suitable for a schema GroupNode.
+  bool is_nested() const;
+  bool is_nonnested() const;
+  /// \brief Return true if this logical type is included in the Thrift output for its
+  /// node.
+  bool is_serialized() const;
+
+  LogicalType(const LogicalType&) = delete;
+  LogicalType& operator=(const LogicalType&) = delete;
+  virtual ~LogicalType() noexcept;
+
+ protected:
+  LogicalType();
+
+  class Impl;
+  std::unique_ptr<const Impl> impl_;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
+class PARQUET_EXPORT StringLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  StringLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT MapLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  MapLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT ListLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  ListLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY, must be encoded as UTF-8.
+class PARQUET_EXPORT EnumLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  EnumLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32, INT64, FIXED_LEN_BYTE_ARRAY, or BYTE_ARRAY,
+/// depending on the precision.
+class PARQUET_EXPORT DecimalLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(int32_t precision, int32_t scale = 0);
+  int32_t precision() const;
+  int32_t scale() const;
+
+ private:
+  DecimalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32.
+class PARQUET_EXPORT DateLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  DateLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for MILLIS) or INT64 (for MICROS and NANOS).
+class PARQUET_EXPORT TimeLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+                                                 LogicalType::TimeUnit::unit time_unit);
+  bool is_adjusted_to_utc() const;
+  LogicalType::TimeUnit::unit time_unit() const;
+
+ private:
+  TimeLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT64.
+class PARQUET_EXPORT TimestampLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(bool is_adjusted_to_utc,
+                                                 LogicalType::TimeUnit::unit time_unit,
+                                                 bool is_from_converted_type = false,
+                                                 bool force_set_converted_type = false);
+  bool is_adjusted_to_utc() const;
+  LogicalType::TimeUnit::unit time_unit() const;
+
+  /// \brief If true, will not set LogicalType in Thrift metadata
+  bool is_from_converted_type() const;
+
+  /// \brief If true, will set ConvertedType for micros and millis
+  /// resolution in legacy ConvertedType Thrift metadata
+  bool force_set_converted_type() const;
+
+ private:
+  TimestampLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 12
+class PARQUET_EXPORT IntervalLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  IntervalLogicalType() = default;
+};
+
+/// \brief Allowed for physical type INT32 (for bit widths 8, 16, and 32) and INT64
+/// (for bit width 64).
+class PARQUET_EXPORT IntLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(int bit_width, bool is_signed);
+  int bit_width() const;
+  bool is_signed() const;
+
+ private:
+  IntLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NullLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  NullLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT JSONLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  JSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type BYTE_ARRAY.
+class PARQUET_EXPORT BSONLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  BSONLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 16,
+/// must encode raw UUID bytes.
+class PARQUET_EXPORT UUIDLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  UUIDLogicalType() = default;
+};
+
+/// \brief Allowed for physical type FIXED_LEN_BYTE_ARRAY with length 2,
+/// must encode raw FLOAT16 bytes.
+class PARQUET_EXPORT Float16LogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  Float16LogicalType() = default;
+};
+
+class PARQUET_EXPORT GeometryLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(std::string crs = "");
+
+  const std::string& crs() const;
+
+ private:
+  GeometryLogicalType() = default;
+};
+
+class PARQUET_EXPORT GeographyLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(
+      std::string crs = "", LogicalType::EdgeInterpolationAlgorithm algorithm =
+                                EdgeInterpolationAlgorithm::SPHERICAL);
+
+  const std::string& crs() const;
+  LogicalType::EdgeInterpolationAlgorithm algorithm() const;
+  std::string_view algorithm_name() const;
+
+ private:
+  GeographyLogicalType() = default;
+};
+
+/// \brief Allowed for group nodes only.
+class PARQUET_EXPORT VariantLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make(
+      int8_t specVersion = kVariantSpecVersion);
+
+  int8_t spec_version() const;
+
+ private:
+  VariantLogicalType() = default;
+};
+
+/// \brief Allowed for any physical type.
+class PARQUET_EXPORT NoLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  NoLogicalType() = default;
+};
+
+// Internal API, for unrecognized logical types
+class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
+ public:
+  static std::shared_ptr<const LogicalType> Make();
+
+ private:
+  UndefinedLogicalType() = default;
+};
+
+// Data encodings. Mirrors parquet::Encoding
+struct Encoding {
+  enum type {
+    PLAIN = 0,
+    PLAIN_DICTIONARY = 2,
+    RLE = 3,
+    BIT_PACKED = 4,
+    DELTA_BINARY_PACKED = 5,
+    DELTA_LENGTH_BYTE_ARRAY = 6,
+    DELTA_BYTE_ARRAY = 7,
+    RLE_DICTIONARY = 8,
+    BYTE_STREAM_SPLIT = 9,
+    // Should always be last element (except UNKNOWN)
+    UNDEFINED = 10,
+    UNKNOWN = 999
+  };
+};
+
+// Exposed data encodings. It is the encoding of the data read from the file,
+// rather than the encoding of the data in the file. E.g., the data encoded as
+// RLE_DICTIONARY in the file can be read as dictionary indices by RLE
+// decoding, in which case the data read from the file is DICTIONARY encoded.
+enum class ExposedEncoding {
+  NO_ENCODING = 0,  // data is not encoded, i.e. already decoded during reading
+  DICTIONARY = 1
+};
+
+/// \brief Return true if Parquet supports indicated compression type
+PARQUET_EXPORT
+bool IsCodecSupported(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec,
+                                const CodecOptions& codec_options);
+
+PARQUET_EXPORT
+std::unique_ptr<Codec> GetCodec(Compression::type codec, int compression_level);
+
+struct ParquetCipher {
+  enum type { AES_GCM_V1 = 0, AES_GCM_CTR_V1 = 1 };
+};
+
+struct AadMetadata {
+  std::string aad_prefix;
+  std::string aad_file_unique;
+  bool supply_aad_prefix;
+};
+
+struct EncryptionAlgorithm {
+  ParquetCipher::type algorithm;
+  AadMetadata aad;
+};
+
+// parquet::PageType
+struct PageType {
+  enum type {
+    DATA_PAGE,
+    INDEX_PAGE,
+    DICTIONARY_PAGE,
+    DATA_PAGE_V2,
+    // Should always be last element
+    UNDEFINED
+  };
+};
+
+bool PageCanUseChecksum(PageType::type pageType);
+
+class ColumnOrder {
+ public:
+  enum type { UNDEFINED, TYPE_DEFINED_ORDER };
+  explicit ColumnOrder(ColumnOrder::type column_order) : column_order_(column_order) {}
+  // Default to Type Defined Order
+  ColumnOrder() : column_order_(type::TYPE_DEFINED_ORDER) {}
+  ColumnOrder::type get_order() { return column_order_; }
+
+  static ColumnOrder undefined_;
+  static ColumnOrder type_defined_;
+
+ private:
+  ColumnOrder::type column_order_;
+};
+
+/// \brief BoundaryOrder is a proxy around format::BoundaryOrder.
+struct BoundaryOrder {
+  enum type {
+    Unordered = 0,
+    Ascending = 1,
+    Descending = 2,
+    // Should always be last element
+    UNDEFINED = 3
+  };
+};
+
+/// \brief SortingColumn is a proxy around format::SortingColumn.
+struct PARQUET_EXPORT SortingColumn {
+  // The column index (in this row group)
+  int32_t column_idx;
+
+  // If true, indicates this column is sorted in descending order.
+  bool descending;
+
+  // If true, nulls will come before non-null values, otherwise, nulls go at the end.
+  bool nulls_first;
+};
+
+inline bool operator==(const SortingColumn& left, const SortingColumn& right) {
+  return left.nulls_first == right.nulls_first && left.descending == right.descending &&
+         left.column_idx == right.column_idx;
+}
+
+inline bool operator!=(const SortingColumn& left, const SortingColumn& right) {
+  return !(left == right);
+}
+
+// ----------------------------------------------------------------------
+
+struct ByteArray {
+  ByteArray() : len(0), ptr(NULLPTR) {}
+  ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {}
+
+  ByteArray(::std::string_view view)  // NOLINT implicit conversion
+      : ByteArray(static_cast<uint32_t>(view.size()),
+                  reinterpret_cast<const uint8_t*>(view.data())) {}
+
+  explicit operator std::string_view() const {
+    return std::string_view{reinterpret_cast<const char*>(ptr), len};
+  }
+
+  uint32_t len;
+  const uint8_t* ptr;
+};
+
+inline bool operator==(const ByteArray& left, const ByteArray& right) {
+  return left.len == right.len &&
+         (left.len == 0 || std::memcmp(left.ptr, right.ptr, left.len) == 0);
+}
+
+inline bool operator!=(const ByteArray& left, const ByteArray& right) {
+  return !(left == right);
+}
+
+struct FixedLenByteArray {
+  FixedLenByteArray() : ptr(NULLPTR) {}
+  explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {}
+  const uint8_t* ptr;
+};
+
+using FLBA = FixedLenByteArray;
+
+// Julian day at unix epoch.
+//
+// The Julian Day Number (JDN) is the integer assigned to a whole solar day in
+// the Julian day count starting from noon Universal time, with Julian day
+// number 0 assigned to the day starting at noon on Monday, January 1, 4713 BC,
+// proleptic Julian calendar (November 24, 4714 BC, in the proleptic Gregorian
+// calendar),
+constexpr int64_t kJulianToUnixEpochDays = INT64_C(2440588);
+constexpr int64_t kSecondsPerDay = INT64_C(60 * 60 * 24);
+constexpr int64_t kMillisecondsPerDay = kSecondsPerDay * INT64_C(1000);
+constexpr int64_t kMicrosecondsPerDay = kMillisecondsPerDay * INT64_C(1000);
+constexpr int64_t kNanosecondsPerDay = kMicrosecondsPerDay * INT64_C(1000);
+
+MANUALLY_ALIGNED_STRUCT(1) Int96 { uint32_t value[3]; };
+STRUCT_END(Int96, 12);
+
+inline bool operator==(const Int96& left, const Int96& right) {
+  return std::equal(left.value, left.value + 3, right.value);
+}
+
+inline bool operator!=(const Int96& left, const Int96& right) { return !(left == right); }
+
+static inline std::string ByteArrayToString(const ByteArray& a) {
+  return std::string(reinterpret_cast<const char*>(a.ptr), a.len);
+}
+
+static inline void Int96SetNanoSeconds(parquet::Int96& i96, int64_t nanoseconds) {
+  std::memcpy(&i96.value, &nanoseconds, sizeof(nanoseconds));
+}
+
+struct DecodedInt96 {
+  uint64_t days_since_epoch;
+  uint64_t nanoseconds;
+};
+
+static inline DecodedInt96 DecodeInt96Timestamp(const parquet::Int96& i96) {
+  // We do the computations in the unsigned domain to avoid unsigned behaviour
+  // on overflow.
+  DecodedInt96 result;
+  result.days_since_epoch = i96.value[2] - static_cast<uint64_t>(kJulianToUnixEpochDays);
+  result.nanoseconds = 0;
+
+  memcpy(&result.nanoseconds, &i96.value, sizeof(uint64_t));
+  return result;
+}
+
+static inline int64_t Int96GetNanoSeconds(const parquet::Int96& i96) {
+  const auto decoded = DecodeInt96Timestamp(i96);
+  return static_cast<int64_t>(decoded.days_since_epoch * kNanosecondsPerDay +
+                              decoded.nanoseconds);
+}
+
+static inline int64_t Int96GetMicroSeconds(const parquet::Int96& i96) {
+  const auto decoded = DecodeInt96Timestamp(i96);
+  uint64_t microseconds = decoded.nanoseconds / static_cast<uint64_t>(1000);
+  return static_cast<int64_t>(decoded.days_since_epoch * kMicrosecondsPerDay +
+                              microseconds);
+}
+
+static inline int64_t Int96GetMilliSeconds(const parquet::Int96& i96) {
+  const auto decoded = DecodeInt96Timestamp(i96);
+  uint64_t milliseconds = decoded.nanoseconds / static_cast<uint64_t>(1000000);
+  return static_cast<int64_t>(decoded.days_since_epoch * kMillisecondsPerDay +
+                              milliseconds);
+}
+
+static inline int64_t Int96GetSeconds(const parquet::Int96& i96) {
+  const auto decoded = DecodeInt96Timestamp(i96);
+  uint64_t seconds = decoded.nanoseconds / static_cast<uint64_t>(1000000000);
+  return static_cast<int64_t>(decoded.days_since_epoch * kSecondsPerDay + seconds);
+}
+
+static inline std::string Int96ToString(const Int96& a) {
+  std::ostringstream result;
+  std::copy(a.value, a.value + 3, std::ostream_iterator<uint32_t>(result, " "));
+  return result.str();
+}
+
+static inline std::string FixedLenByteArrayToString(const FixedLenByteArray& a, int len) {
+  std::ostringstream result;
+  std::copy(a.ptr, a.ptr + len, std::ostream_iterator<uint32_t>(result, " "));
+  return result.str();
+}
+
+template <Type::type TYPE>
+struct type_traits {};
+
+template <>
+struct type_traits<Type::BOOLEAN> {
+  using value_type = bool;
+
+  static constexpr int value_byte_size = 1;
+  static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT32> {
+  using value_type = int32_t;
+
+  static constexpr int value_byte_size = 4;
+  static constexpr const char* printf_code = "d";
+};
+
+template <>
+struct type_traits<Type::INT64> {
+  using value_type = int64_t;
+
+  static constexpr int value_byte_size = 8;
+  static constexpr const char* printf_code =
+      (sizeof(long) == 64) ? "ld" : "lld";  // NOLINT: runtime/int
+};
+
+template <>
+struct type_traits<Type::INT96> {
+  using value_type = Int96;
+
+  static constexpr int value_byte_size = 12;
+  static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FLOAT> {
+  using value_type = float;
+
+  static constexpr int value_byte_size = 4;
+  static constexpr const char* printf_code = "f";
+};
+
+template <>
+struct type_traits<Type::DOUBLE> {
+  using value_type = double;
+
+  static constexpr int value_byte_size = 8;
+  static constexpr const char* printf_code = "lf";
+};
+
+template <>
+struct type_traits<Type::BYTE_ARRAY> {
+  using value_type = ByteArray;
+
+  static constexpr int value_byte_size = sizeof(ByteArray);
+  static constexpr const char* printf_code = "s";
+};
+
+template <>
+struct type_traits<Type::FIXED_LEN_BYTE_ARRAY> {
+  using value_type = FixedLenByteArray;
+
+  static constexpr int value_byte_size = sizeof(FixedLenByteArray);
+  static constexpr const char* printf_code = "s";
+};
+
+template <Type::type TYPE>
+struct PhysicalType {
+  using c_type = typename type_traits<TYPE>::value_type;
+  static constexpr Type::type type_num = TYPE;
+};
+
+using BooleanType = PhysicalType<Type::BOOLEAN>;
+using Int32Type = PhysicalType<Type::INT32>;
+using Int64Type = PhysicalType<Type::INT64>;
+using Int96Type = PhysicalType<Type::INT96>;
+using FloatType = PhysicalType<Type::FLOAT>;
+using DoubleType = PhysicalType<Type::DOUBLE>;
+using ByteArrayType = PhysicalType<Type::BYTE_ARRAY>;
+using FLBAType = PhysicalType<Type::FIXED_LEN_BYTE_ARRAY>;
+
+template <typename Type>
+inline std::string format_fwf(int width) {
+  std::stringstream ss;
+  ss << "%-" << width << type_traits<Type::type_num>::printf_code;
+  return ss.str();
+}
+
+PARQUET_EXPORT std::string EncodingToString(Encoding::type t);
+
+PARQUET_EXPORT std::string ConvertedTypeToString(ConvertedType::type t);
+
+PARQUET_EXPORT std::string TypeToString(Type::type t);
+
+PARQUET_EXPORT std::string TypeToString(Type::type t, int type_length);
+
+PARQUET_EXPORT std::string FormatStatValue(
+    Type::type parquet_type, ::std::string_view val,
+    const std::shared_ptr<const LogicalType>& logical_type = NULLPTR);
+
+PARQUET_EXPORT int GetTypeByteSize(Type::type t);
+
+PARQUET_EXPORT SortOrder::type DefaultSortOrder(Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(ConvertedType::type converted,
+                                            Type::type primitive);
+
+PARQUET_EXPORT SortOrder::type GetSortOrder(
+    const std::shared_ptr<const LogicalType>& logical_type, Type::type primitive);
+
+// PLAIN_DICTIONARY is deprecated but used to be used as a dictionary index
+// encoding.
+constexpr bool IsDictionaryIndexEncoding(Encoding::type e) {
+  return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY;
+}
+
+}  // namespace parquet
diff --git a/pyarrow/include/parquet/windows_compatibility.h b/pyarrow/include/parquet/windows_compatibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe84d8c6ce06e12b2b50563997b40337f691ee53
--- /dev/null
+++ b/pyarrow/include/parquet/windows_compatibility.h
@@ -0,0 +1,21 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/util/windows_compatibility.h"
+#include "parquet/windows_fixup.h"
diff --git a/pyarrow/include/parquet/windows_fixup.h b/pyarrow/include/parquet/windows_fixup.h
new file mode 100644
index 0000000000000000000000000000000000000000..feac4e64d19761595105de7cc189e92c9ef6ce09
--- /dev/null
+++ b/pyarrow/include/parquet/windows_fixup.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This header needs to be included multiple times.
+
+#include "arrow/util/windows_fixup.h"
+
+#ifdef _WIN32
+
+// parquet.thrift's OPTIONAL RepetitionType conflicts with a Windows #define
+#  ifdef OPTIONAL
+#    undef OPTIONAL
+#  endif
+
+#endif  // _WIN32
diff --git a/pyarrow/include/parquet/xxhasher.h b/pyarrow/include/parquet/xxhasher.h
new file mode 100644
index 0000000000000000000000000000000000000000..a54f287883e006e9cd6d9aeeb2efeb1d6f9db2df
--- /dev/null
+++ b/pyarrow/include/parquet/xxhasher.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "parquet/hasher.h"
+#include "parquet/platform.h"
+#include "parquet/types.h"
+
+namespace parquet {
+
+class PARQUET_EXPORT XxHasher : public Hasher {
+ public:
+  uint64_t Hash(int32_t value) const override;
+  uint64_t Hash(int64_t value) const override;
+  uint64_t Hash(float value) const override;
+  uint64_t Hash(double value) const override;
+  uint64_t Hash(const Int96* value) const override;
+  uint64_t Hash(const ByteArray* value) const override;
+  uint64_t Hash(const FLBA* val, uint32_t len) const override;
+
+  void Hashes(const int32_t* values, int num_values, uint64_t* hashes) const override;
+  void Hashes(const int64_t* values, int num_values, uint64_t* hashes) const override;
+  void Hashes(const float* values, int num_values, uint64_t* hashes) const override;
+  void Hashes(const double* values, int num_values, uint64_t* hashes) const override;
+  void Hashes(const Int96* values, int num_values, uint64_t* hashes) const override;
+  void Hashes(const ByteArray* values, int num_values, uint64_t* hashes) const override;
+  void Hashes(const FLBA* values, uint32_t type_len, int num_values,
+              uint64_t* hashes) const override;
+
+  static constexpr int kParquetBloomXxHashSeed = 0;
+};
+
+}  // namespace parquet
diff --git a/pyarrow/parquet/__init__.py b/pyarrow/parquet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..134f3c097ef004f83fdc8e24e5cb45166c17577e
--- /dev/null
+++ b/pyarrow/parquet/__init__.py
@@ -0,0 +1,20 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# flake8: noqa
+
+from .core import *
diff --git a/pyarrow/parquet/core.py b/pyarrow/parquet/core.py
new file mode 100644
index 0000000000000000000000000000000000000000..676bc445238e1bead69620272f2405673a48e854
--- /dev/null
+++ b/pyarrow/parquet/core.py
@@ -0,0 +1,2454 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+from collections import defaultdict
+from contextlib import nullcontext
+from functools import reduce
+
+import inspect
+import json
+import os
+import re
+import operator
+
+import pyarrow as pa
+
+try:
+    import pyarrow._parquet as _parquet
+except ImportError as exc:
+    raise ImportError(
+        "The pyarrow installation is not built with support "
+        f"for the Parquet file format ({str(exc)})"
+    ) from None
+
+from pyarrow._parquet import (ParquetReader, Statistics,  # noqa
+                              FileMetaData, RowGroupMetaData,
+                              ColumnChunkMetaData,
+                              ParquetSchema, ColumnSchema,
+                              ParquetLogicalType,
+                              FileEncryptionProperties,
+                              FileDecryptionProperties,
+                              SortingColumn)
+from pyarrow.fs import (LocalFileSystem, FileType, _resolve_filesystem_and_path,
+                        _ensure_filesystem)
+from pyarrow.util import guid, _is_path_like, _stringify_path, _deprecate_api
+
+
+def _check_contains_null(val):
+    if isinstance(val, bytes):
+        for byte in val:
+            if isinstance(byte, bytes):
+                compare_to = chr(0)
+            else:
+                compare_to = 0
+            if byte == compare_to:
+                return True
+    elif isinstance(val, str):
+        return '\x00' in val
+    return False
+
+
+def _check_filters(filters, check_null_strings=True):
+    """
+    Check if filters are well-formed.
+    """
+    if filters is not None:
+        if len(filters) == 0 or any(len(f) == 0 for f in filters):
+            raise ValueError("Malformed filters")
+        if isinstance(filters[0][0], str):
+            # We have encountered the situation where we have one nesting level
+            # too few:
+            #   We have [(,,), ..] instead of [[(,,), ..]]
+            filters = [filters]
+        if check_null_strings:
+            for conjunction in filters:
+                for col, op, val in conjunction:
+                    if (
+                        isinstance(val, list) and
+                        all(_check_contains_null(v) for v in val) or
+                        _check_contains_null(val)
+                    ):
+                        raise NotImplementedError(
+                            "Null-terminated binary strings are not supported "
+                            "as filter values."
+                        )
+    return filters
+
+
+_DNF_filter_doc = """Predicates are expressed using an ``Expression`` or using
+    the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``.
+    DNF allows arbitrary boolean logical combinations of single column predicates.
+    The innermost tuples each describe a single column predicate. The list of inner
+    predicates is interpreted as a conjunction (AND), forming a more selective and
+    multiple column predicate. Finally, the most outer list combines these filters
+    as a disjunction (OR).
+
+    Predicates may also be passed as List[Tuple]. This form is interpreted
+    as a single conjunction. To express OR in predicates, one must
+    use the (preferred) List[List[Tuple]] notation.
+
+    Each tuple has format: (``key``, ``op``, ``value``) and compares the
+    ``key`` with the ``value``.
+    The supported ``op`` are:  ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``,
+    ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the
+    ``value`` must be a collection such as a ``list``, a ``set`` or a
+    ``tuple``.
+
+    Examples:
+
+    Using the ``Expression`` API:
+
+    .. code-block:: python
+
+        import pyarrow.compute as pc
+        pc.field('x') = 0
+        pc.field('y').isin(['a', 'b', 'c'])
+        ~pc.field('y').isin({'a', 'b'})
+
+    Using the DNF format:
+
+    .. code-block:: python
+
+        ('x', '=', 0)
+        ('y', 'in', ['a', 'b', 'c'])
+        ('z', 'not in', {'a','b'})
+
+    """
+
+
+def filters_to_expression(filters):
+    """
+    Check if filters are well-formed and convert to an ``Expression``.
+
+    Parameters
+    ----------
+    filters : List[Tuple] or List[List[Tuple]]
+
+    Notes
+    -----
+    See internal ``pyarrow._DNF_filter_doc`` attribute for more details.
+
+    Examples
+    --------
+
+    >>> filters_to_expression([('foo', '==', 'bar')])
+    <pyarrow.compute.Expression (foo == "bar")>
+
+    Returns
+    -------
+    pyarrow.compute.Expression
+        An Expression representing the filters
+    """
+    import pyarrow.dataset as ds
+
+    if isinstance(filters, ds.Expression):
+        return filters
+
+    filters = _check_filters(filters, check_null_strings=False)
+
+    def convert_single_predicate(col, op, val):
+        field = ds.field(col)
+
+        if op == "=" or op == "==":
+            return field == val
+        elif op == "!=":
+            return field != val
+        elif op == '<':
+            return field < val
+        elif op == '>':
+            return field > val
+        elif op == '<=':
+            return field <= val
+        elif op == '>=':
+            return field >= val
+        elif op == 'in':
+            return field.isin(val)
+        elif op == 'not in':
+            return ~field.isin(val)
+        else:
+            raise ValueError(f'"{col}" is not a valid operator in predicates.')
+
+    disjunction_members = []
+
+    for conjunction in filters:
+        conjunction_members = [
+            convert_single_predicate(col, op, val)
+            for col, op, val in conjunction
+        ]
+
+        disjunction_members.append(reduce(operator.and_, conjunction_members))
+
+    return reduce(operator.or_, disjunction_members)
+
+
+_filters_to_expression = _deprecate_api(
+    "_filters_to_expression", "filters_to_expression",
+    filters_to_expression, "10.0.0", DeprecationWarning)
+
+
+# ----------------------------------------------------------------------
+# Reading a single Parquet file
+
+
+class ParquetFile:
+    """
+    Reader interface for a single Parquet file.
+
+    Parameters
+    ----------
+    source : str, pathlib.Path, pyarrow.NativeFile, or file-like object
+        Readable source. For passing bytes or buffer-like file containing a
+        Parquet file, use pyarrow.BufferReader.
+    metadata : FileMetaData, default None
+        Use existing metadata object, rather than reading from file.
+    common_metadata : FileMetaData, default None
+        Will be used in reads for pandas schema metadata if not found in the
+        main file's metadata, no other uses at the moment.
+    read_dictionary : list
+        List of column names to read directly as DictionaryArray.
+    binary_type : pyarrow.DataType, default None
+        If given, Parquet binary columns will be read as this datatype.
+        This setting is ignored if a serialized Arrow schema is found in
+        the Parquet metadata.
+    list_type : subclass of pyarrow.DataType, default None
+        If given, non-MAP repeated columns will be read as an instance of
+        this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+        This setting is ignored if a serialized Arrow schema is found in
+        the Parquet metadata.
+    memory_map : bool, default False
+        If the source is a file path, use a memory map to read file, which can
+        improve performance in some environments.
+    buffer_size : int, default 0
+        If positive, perform read buffering when deserializing individual
+        column chunks. Otherwise IO calls are unbuffered.
+    pre_buffer : bool, default False
+        Coalesce and issue file reads in parallel to improve performance on
+        high-latency filesystems (e.g. S3). If True, Arrow will use a
+        background I/O thread pool.
+    coerce_int96_timestamp_unit : str, default None
+        Cast timestamps that are stored in INT96 format to a particular
+        resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
+        and therefore INT96 timestamps will be inferred as timestamps
+        in nanoseconds.
+    decryption_properties : FileDecryptionProperties, default None
+        File decryption properties for Parquet Modular Encryption.
+    thrift_string_size_limit : int, default None
+        If not None, override the maximum total string size allocated
+        when decoding Thrift structures. The default limit should be
+        sufficient for most Parquet files.
+    thrift_container_size_limit : int, default None
+        If not None, override the maximum total size of containers allocated
+        when decoding Thrift structures. The default limit should be
+        sufficient for most Parquet files.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
+    page_checksum_verification : bool, default False
+        If True, verify the checksum for each page read from the file.
+    arrow_extensions_enabled : bool, default True
+        If True, read Parquet logical types as Arrow extension types where possible,
+        (e.g., read JSON as the canonical `arrow.json` extension type or UUID as
+        the canonical `arrow.uuid` extension type).
+
+    Examples
+    --------
+
+    Generate an example PyArrow Table and write it to Parquet file:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
+
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_table(table, 'example.parquet')
+
+    Create a ``ParquetFile`` object from the Parquet file:
+
+    >>> parquet_file = pq.ParquetFile('example.parquet')
+
+    Read the data:
+
+    >>> parquet_file.read()
+    pyarrow.Table
+    n_legs: int64
+    animal: string
+    ----
+    n_legs: [[2,2,4,4,5,100]]
+    animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]]
+
+    Create a ParquetFile object with "animal" column as DictionaryArray:
+
+    >>> parquet_file = pq.ParquetFile('example.parquet',
+    ...                               read_dictionary=["animal"])
+    >>> parquet_file.read()
+    pyarrow.Table
+    n_legs: int64
+    animal: dictionary<values=string, indices=int32, ordered=0>
+    ----
+    n_legs: [[2,2,4,4,5,100]]
+    animal: [  -- dictionary:
+    ["Flamingo","Parrot",...,"Brittle stars","Centipede"]  -- indices:
+    [0,1,2,3,4,5]]
+    """
+
+    def __init__(self, source, *, metadata=None, common_metadata=None,
+                 read_dictionary=None, binary_type=None, list_type=None,
+                 memory_map=False, buffer_size=0, pre_buffer=False,
+                 coerce_int96_timestamp_unit=None,
+                 decryption_properties=None, thrift_string_size_limit=None,
+                 thrift_container_size_limit=None, filesystem=None,
+                 page_checksum_verification=False, arrow_extensions_enabled=True):
+
+        self._close_source = getattr(source, 'closed', True)
+
+        filesystem, source = _resolve_filesystem_and_path(
+            source, filesystem, memory_map=memory_map)
+        if filesystem is not None:
+            source = filesystem.open_input_file(source)
+            self._close_source = True  # We opened it here, ensure we close it.
+
+        self.reader = ParquetReader()
+        self.reader.open(
+            source, use_memory_map=memory_map,
+            buffer_size=buffer_size, pre_buffer=pre_buffer,
+            read_dictionary=read_dictionary, metadata=metadata,
+            binary_type=binary_type, list_type=list_type,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
+            thrift_string_size_limit=thrift_string_size_limit,
+            thrift_container_size_limit=thrift_container_size_limit,
+            page_checksum_verification=page_checksum_verification,
+            arrow_extensions_enabled=arrow_extensions_enabled,
+        )
+        self.common_metadata = common_metadata
+        self._nested_paths_by_prefix = self._build_nested_paths()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.close()
+
+    def _build_nested_paths(self):
+        paths = self.reader.column_paths
+
+        result = defaultdict(list)
+
+        for i, path in enumerate(paths):
+            key = path[0]
+            rest = path[1:]
+            while True:
+                result[key].append(i)
+
+                if not rest:
+                    break
+
+                key = '.'.join((key, rest[0]))
+                rest = rest[1:]
+
+        return result
+
+    @property
+    def metadata(self):
+        """
+        Return the Parquet metadata.
+        """
+        return self.reader.metadata
+
+    @property
+    def schema(self):
+        """
+        Return the Parquet schema, unconverted to Arrow types
+        """
+        return self.metadata.schema
+
+    @property
+    def schema_arrow(self):
+        """
+        Return the inferred Arrow schema, converted from the whole Parquet
+        file's schema
+
+        Examples
+        --------
+        Generate an example Parquet file:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        Read the Arrow schema:
+
+        >>> parquet_file.schema_arrow
+        n_legs: int64
+        animal: string
+        """
+        return self.reader.schema_arrow
+
+    @property
+    def num_row_groups(self):
+        """
+        Return the number of row groups of the Parquet file.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.num_row_groups
+        1
+        """
+        return self.reader.num_row_groups
+
+    def close(self, force: bool = False):
+        if self._close_source or force:
+            self.reader.close()
+
+    @property
+    def closed(self) -> bool:
+        return self.reader.closed
+
+    def read_row_group(self, i, columns=None, use_threads=True,
+                       use_pandas_metadata=False):
+        """
+        Read a single row group from a Parquet file.
+
+        Parameters
+        ----------
+        i : int
+            Index of the individual row group that we want to read.
+        columns : list
+            If not None, only these columns will be read from the row group. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'.
+        use_threads : bool, default True
+            Perform multi-threaded column reads.
+        use_pandas_metadata : bool, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded.
+
+        Returns
+        -------
+        pyarrow.table.Table
+            Content of the row group as a table (of columns)
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.read_row_group(0)
+        pyarrow.Table
+        n_legs: int64
+        animal: string
+        ----
+        n_legs: [[2,2,4,4,5,100]]
+        animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]]
+        """
+        column_indices = self._get_column_indices(
+            columns, use_pandas_metadata=use_pandas_metadata)
+        return self.reader.read_row_group(i, column_indices=column_indices,
+                                          use_threads=use_threads)
+
+    def read_row_groups(self, row_groups, columns=None, use_threads=True,
+                        use_pandas_metadata=False):
+        """
+        Read a multiple row groups from a Parquet file.
+
+        Parameters
+        ----------
+        row_groups : list
+            Only these row groups will be read from the file.
+        columns : list
+            If not None, only these columns will be read from the row group. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'.
+        use_threads : bool, default True
+            Perform multi-threaded column reads.
+        use_pandas_metadata : bool, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded.
+
+        Returns
+        -------
+        pyarrow.table.Table
+            Content of the row groups as a table (of columns).
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.read_row_groups([0,0])
+        pyarrow.Table
+        n_legs: int64
+        animal: string
+        ----
+        n_legs: [[2,2,4,4,5,...,2,4,4,5,100]]
+        animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]]
+        """
+        column_indices = self._get_column_indices(
+            columns, use_pandas_metadata=use_pandas_metadata)
+        return self.reader.read_row_groups(row_groups,
+                                           column_indices=column_indices,
+                                           use_threads=use_threads)
+
+    def iter_batches(self, batch_size=65536, row_groups=None, columns=None,
+                     use_threads=True, use_pandas_metadata=False):
+        """
+        Read streaming batches from a Parquet file.
+
+        Parameters
+        ----------
+        batch_size : int, default 64K
+            Maximum number of records to yield per batch. Batches may be
+            smaller if there aren't enough rows in the file.
+        row_groups : list
+            Only these row groups will be read from the file.
+        columns : list
+            If not None, only these columns will be read from the file. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'.
+        use_threads : boolean, default True
+            Perform multi-threaded column reads.
+        use_pandas_metadata : boolean, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded.
+
+        Yields
+        ------
+        pyarrow.RecordBatch
+            Contents of each batch as a record batch
+
+        Examples
+        --------
+        Generate an example Parquet file:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+        >>> for i in parquet_file.iter_batches():
+        ...     print("RecordBatch")
+        ...     print(i.to_pandas())
+        ...
+        RecordBatch
+           n_legs         animal
+        0       2       Flamingo
+        1       2         Parrot
+        2       4            Dog
+        3       4          Horse
+        4       5  Brittle stars
+        5     100      Centipede
+        """
+        if batch_size <= 0:
+            raise ValueError("batch_size must be greater than zero")
+
+        if row_groups is None:
+            row_groups = range(0, self.metadata.num_row_groups)
+        column_indices = self._get_column_indices(
+            columns, use_pandas_metadata=use_pandas_metadata)
+
+        batches = self.reader.iter_batches(batch_size,
+                                           row_groups=row_groups,
+                                           column_indices=column_indices,
+                                           use_threads=use_threads)
+        return batches
+
+    def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
+        """
+        Read a Table from Parquet format.
+
+        Parameters
+        ----------
+        columns : list
+            If not None, only these columns will be read from the file. A
+            column name may be a prefix of a nested field, e.g. 'a' will select
+            'a.b', 'a.c', and 'a.d.e'.
+        use_threads : bool, default True
+            Perform multi-threaded column reads.
+        use_pandas_metadata : bool, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded.
+
+        Returns
+        -------
+        pyarrow.table.Table
+            Content of the file as a table (of columns).
+
+        Examples
+        --------
+        Generate an example Parquet file:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        Read a Table:
+
+        >>> parquet_file.read(columns=["animal"])
+        pyarrow.Table
+        animal: string
+        ----
+        animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]]
+        """
+        column_indices = self._get_column_indices(
+            columns, use_pandas_metadata=use_pandas_metadata)
+        return self.reader.read_all(column_indices=column_indices,
+                                    use_threads=use_threads)
+
+    def scan_contents(self, columns=None, batch_size=65536):
+        """
+        Read contents of file for the given columns and batch size.
+
+        Notes
+        -----
+        This function's primary purpose is benchmarking.
+        The scan is executed on a single thread.
+
+        Parameters
+        ----------
+        columns : list of integers, default None
+            Select columns to read, if None scan all columns.
+        batch_size : int, default 64K
+            Number of rows to read at a time internally.
+
+        Returns
+        -------
+        num_rows : int
+            Number of rows in file
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'example.parquet')
+        >>> parquet_file = pq.ParquetFile('example.parquet')
+
+        >>> parquet_file.scan_contents()
+        6
+        """
+        column_indices = self._get_column_indices(columns)
+        return self.reader.scan_contents(column_indices,
+                                         batch_size=batch_size)
+
+    def _get_column_indices(self, column_names, use_pandas_metadata=False):
+        if column_names is None:
+            return None
+
+        indices = []
+
+        for name in column_names:
+            if name in self._nested_paths_by_prefix:
+                indices.extend(self._nested_paths_by_prefix[name])
+
+        if use_pandas_metadata:
+            file_keyvalues = self.metadata.metadata
+            common_keyvalues = (self.common_metadata.metadata
+                                if self.common_metadata is not None
+                                else None)
+
+            if file_keyvalues and b'pandas' in file_keyvalues:
+                index_columns = _get_pandas_index_columns(file_keyvalues)
+            elif common_keyvalues and b'pandas' in common_keyvalues:
+                index_columns = _get_pandas_index_columns(common_keyvalues)
+            else:
+                index_columns = []
+
+            if indices is not None and index_columns:
+                indices += [self.reader.column_name_idx(descr)
+                            for descr in index_columns
+                            if not isinstance(descr, dict)]
+
+        return indices
+
+
+_SPARK_DISALLOWED_CHARS = re.compile('[ ,;{}()\n\t=]')
+
+
+def _sanitized_spark_field_name(name):
+    return _SPARK_DISALLOWED_CHARS.sub('_', name)
+
+
+def _sanitize_schema(schema, flavor):
+    if 'spark' in flavor:
+        sanitized_fields = []
+
+        schema_changed = False
+
+        for field in schema:
+            name = field.name
+            sanitized_name = _sanitized_spark_field_name(name)
+
+            if sanitized_name != name:
+                schema_changed = True
+                sanitized_field = pa.field(sanitized_name, field.type,
+                                           field.nullable, field.metadata)
+                sanitized_fields.append(sanitized_field)
+            else:
+                sanitized_fields.append(field)
+
+        new_schema = pa.schema(sanitized_fields, metadata=schema.metadata)
+        return new_schema, schema_changed
+    else:
+        return schema, False
+
+
+def _sanitize_table(table, new_schema, flavor):
+    # TODO: This will not handle prohibited characters in nested field names
+    if 'spark' in flavor:
+        column_data = [table[i] for i in range(table.num_columns)]
+        return pa.Table.from_arrays(column_data, schema=new_schema)
+    else:
+        return table
+
+
+_parquet_writer_arg_docs = """version : {"1.0", "2.4", "2.6"}, default "2.6"
+    Determine which Parquet logical types are available for use, whether the
+    reduced set from the Parquet 1.x.x format or the expanded logical types
+    added in later format versions.
+    Files written with version='2.4' or '2.6' may not be readable in all
+    Parquet implementations, so version='1.0' is likely the choice that
+    maximizes file compatibility.
+    UINT32 and some logical types are only available with version '2.4'.
+    Nanosecond timestamps are only available with version '2.6'.
+    Other features such as compression algorithms or the new serialized
+    data page format must be enabled separately (see 'compression' and
+    'data_page_version').
+use_dictionary : bool or list, default True
+    Specify if we should use dictionary encoding in general or only for
+    some columns.
+    When encoding the column, if the dictionary size is too large, the
+    column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type
+    doesn't support dictionary encoding.
+compression : str or dict, default 'snappy'
+    Specify the compression codec, either on a general basis or per-column.
+    Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}.
+write_statistics : bool or list, default True
+    Specify if we should write statistics in general (default is True) or only
+    for some columns.
+use_deprecated_int96_timestamps : bool, default None
+    Write timestamps to INT96 Parquet format. Defaults to False unless enabled
+    by flavor argument. This take priority over the coerce_timestamps option.
+coerce_timestamps : str, default None
+    Cast timestamps to a particular resolution. If omitted, defaults are chosen
+    depending on `version`. For ``version='1.0'`` and ``version='2.4'``,
+    nanoseconds are cast to microseconds ('us'), while for
+    ``version='2.6'`` (the default), they are written natively without loss
+    of resolution.  Seconds are always cast to milliseconds ('ms') by default,
+    as Parquet does not have any temporal type with seconds resolution.
+    If the casting results in loss of data, it will raise an exception
+    unless ``allow_truncated_timestamps=True`` is given.
+    Valid values: {None, 'ms', 'us'}
+allow_truncated_timestamps : bool, default False
+    Allow loss of data when coercing timestamps to a particular
+    resolution. E.g. if microsecond or nanosecond data is lost when coercing to
+    'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True``
+    will NOT result in the truncation exception being ignored unless
+    ``coerce_timestamps`` is not None.
+data_page_size : int, default None
+    Set a target threshold for the approximate encoded size of data
+    pages within a column chunk (in bytes). If None, use the default data page
+    size of 1MByte.
+max_rows_per_page : int, default None
+    Maximum number of rows per page within a column chunk.
+    If None, use the default of 20000.
+    Smaller values reduce memory usage during reads but increase metadata overhead.
+flavor : {'spark'}, default None
+    Sanitize schema or set other compatibility options to work with
+    various target systems.
+filesystem : FileSystem, default None
+    If nothing passed, will be inferred from `where` if path-like, else
+    `where` is already a file-like object so no filesystem is needed.
+compression_level : int or dict, default None
+    Specify the compression level for a codec, either on a general basis or
+    per-column. If None is passed, arrow selects the compression level for
+    the compression codec in use. The compression level has a different
+    meaning for each codec, so you have to read the documentation of the
+    codec you are using.
+    An exception is thrown if the compression codec does not allow specifying
+    a compression level.
+use_byte_stream_split : bool or list, default False
+    Specify if the byte_stream_split encoding should be used in general or
+    only for some columns. If both dictionary and byte_stream_stream are
+    enabled, then dictionary is preferred.
+    The byte_stream_split encoding is valid for integer, floating-point
+    and fixed-size binary data types (including decimals); it should be
+    combined with a compression codec so as to achieve size reduction.
+column_encoding : string or dict, default None
+    Specify the encoding scheme on a per column basis.
+    Can only be used when ``use_dictionary`` is set to False, and
+    cannot be used in combination with ``use_byte_stream_split``.
+    Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT',
+    'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}.
+    Certain encodings are only compatible with certain data types.
+    Please refer to the encodings section of `Reading and writing Parquet
+    files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_.
+data_page_version : {"1.0", "2.0"}, default "1.0"
+    The serialized Parquet data page format version to write, defaults to
+    1.0. This does not impact the file schema logical types and Arrow to
+    Parquet type casting behavior; for that use the "version" option.
+use_compliant_nested_type : bool, default True
+    Whether to write compliant Parquet nested type (lists) as defined
+    `here <https://github.com/apache/parquet-format/blob/master/
+    LogicalTypes.md#nested-types>`_, defaults to ``True``.
+    For ``use_compliant_nested_type=True``, this will write into a list
+    with 3-level structure where the middle level, named ``list``,
+    is a repeated group with a single field named ``element``::
+
+        <list-repetition> group <name> (LIST) {
+            repeated group list {
+                  <element-repetition> <element-type> element;
+            }
+        }
+
+    For ``use_compliant_nested_type=False``, this will also write into a list
+    with 3-level structure, where the name of the single field of the middle
+    level ``list`` is taken from the element name for nested columns in Arrow,
+    which defaults to ``item``::
+
+        <list-repetition> group <name> (LIST) {
+            repeated group list {
+                <element-repetition> <element-type> item;
+            }
+        }
+encryption_properties : FileEncryptionProperties, default None
+    File encryption properties for Parquet Modular Encryption.
+    If None, no encryption will be done.
+    The encryption properties can be created using:
+    ``CryptoFactory.file_encryption_properties()``.
+write_batch_size : int, default None
+    Number of values to write to a page at a time. If None, use the default of
+    1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages
+    are exceeding the ``data_page_size`` due to large column values, lowering
+    the batch size can help keep page sizes closer to the intended size.
+dictionary_pagesize_limit : int, default None
+    Specify the dictionary page size limit per row group. If None, use the
+    default 1MB.
+store_schema : bool, default True
+    By default, the Arrow schema is serialized and stored in the Parquet
+    file metadata (in the "ARROW:schema" key). When reading the file,
+    if this key is available, it will be used to more faithfully recreate
+    the original Arrow data. For example, for tz-aware timestamp columns
+    it will restore the timezone (Parquet only stores the UTC values without
+    timezone), or columns with duration type will be restored from the int64
+    Parquet column.
+write_page_index : bool, default False
+    Whether to write a page index in general for all columns.
+    Writing statistics to the page index disables the old method of writing
+    statistics to each data page header. The page index makes statistics-based
+    filtering more efficient than the page header, as it gathers all the
+    statistics for a Parquet file in a single place, avoiding scattered I/O.
+    Note that the page index is not yet used on the read size by PyArrow.
+write_page_checksum : bool, default False
+    Whether to write page checksums in general for all columns.
+    Page checksums enable detection of data corruption, which might occur during
+    transmission or in the storage.
+sorting_columns : Sequence of SortingColumn, default None
+    Specify the sort order of the data being written. The writer does not sort
+    the data nor does it verify that the data is sorted. The sort order is
+    written to the row group metadata, which can then be used by readers.
+store_decimal_as_integer : bool, default False
+    Allow decimals with 1 <= precision <= 18 to be stored as integers.
+    In Parquet, DECIMAL can be stored in any of the following physical types:
+    - int32: for 1 <= precision <= 9.
+    - int64: for 10 <= precision <= 18.
+    - fixed_len_byte_array: precision is limited by the array size.
+      Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits.
+    - binary: precision is unlimited. The minimum number of bytes to store the
+      unscaled value is used.
+
+    By default, this is DISABLED and all decimal types annotate fixed_len_byte_array.
+    When enabled, the writer will use the following physical types to store decimals:
+    - int32: for 1 <= precision <= 9.
+    - int64: for 10 <= precision <= 18.
+    - fixed_len_byte_array: for precision > 18.
+
+    As a consequence, decimal columns stored in integer types are more compact.
+use_content_defined_chunking : bool or dict, default False
+    Optimize parquet files for content addressable storage (CAS) systems by writing
+    data pages according to content-defined chunk boundaries. This allows for more
+    efficient deduplication of data across files, hence more efficient network
+    transfers and storage. The chunking is based on a rolling hash algorithm that
+    identifies chunk boundaries based on the actual content of the data.
+
+    Note that it is an experimental feature and the API may change in the future.
+
+    If set to ``True``, a default configuration is used with `min_chunk_size=256 KiB`
+    and `max_chunk_size=1024 KiB`. The chunk size distribution approximates a normal
+    distribution between `min_chunk_size` and `max_chunk_size` (sizes are accounted
+    before any Parquet encodings).
+
+    A `dict` can be passed to adjust the chunker parameters with the following keys:
+    - `min_chunk_size`: minimum chunk size in bytes, default 256 KiB
+      The rolling hash will not be updated until this size is reached for each chunk.
+      Note that all data sent through the hash function is counted towards the chunk
+      size, including definition and repetition levels if present.
+    - `max_chunk_size`: maximum chunk size in bytes, default is 1024 KiB
+      The chunker will create a new chunk whenever the chunk size exceeds this value.
+      Note that the parquet writer has a related `data_pagesize` property that controls
+      the maximum size of a parquet data page after encoding. While setting
+      `data_page_size` to a smaller value than `max_chunk_size` doesn't affect the
+      chunking effectiveness, it results in more small parquet data pages.
+    - `norm_level`: normalization level to center the chunk size around the average
+      size more aggressively, default 0
+      Increasing the normalization level increases the probability of finding a chunk,
+      improving the deduplication ratio, but also increasing the number of small chunks
+      resulting in many small parquet data pages. The default value provides a good
+      balance between deduplication ratio and fragmentation. Use norm_level=1 or
+      norm_level=2 to reach a higher deduplication ratio at the expense of
+      fragmentation.
+write_time_adjusted_to_utc : bool, default False
+    Set the value of isAdjustedTOUTC when writing a TIME column.
+    If True, this tells the Parquet reader that the TIME columns
+    are expressed in reference to midnight in the UTC timezone.
+    If False (the default), the TIME columns are assumed to be expressed
+    in reference to midnight in an unknown, presumably local, timezone.
+"""
+
+_parquet_writer_example_doc = """\
+Generate an example PyArrow Table and RecordBatch:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+>>> batch = pa.record_batch([[2, 2, 4, 4, 5, 100],
+...                         ["Flamingo", "Parrot", "Dog", "Horse",
+...                          "Brittle stars", "Centipede"]],
+...                         names=['n_legs', 'animal'])
+
+create a ParquetWriter object:
+
+>>> import pyarrow.parquet as pq
+>>> writer = pq.ParquetWriter('example.parquet', table.schema)
+
+and write the Table into the Parquet file:
+
+>>> writer.write_table(table)
+>>> writer.close()
+
+>>> pq.read_table('example.parquet').to_pandas()
+   n_legs         animal
+0       2       Flamingo
+1       2         Parrot
+2       4            Dog
+3       4          Horse
+4       5  Brittle stars
+5     100      Centipede
+
+create a ParquetWriter object for the RecordBatch:
+
+>>> writer2 = pq.ParquetWriter('example2.parquet', batch.schema)
+
+and write the RecordBatch into the Parquet file:
+
+>>> writer2.write_batch(batch)
+>>> writer2.close()
+
+>>> pq.read_table('example2.parquet').to_pandas()
+   n_legs         animal
+0       2       Flamingo
+1       2         Parrot
+2       4            Dog
+3       4          Horse
+4       5  Brittle stars
+5     100      Centipede
+"""
+
+
+class ParquetWriter:
+
+    __doc__ = f"""
+Class for incrementally building a Parquet file for Arrow tables.
+
+Parameters
+----------
+where : path or file-like object
+schema : pyarrow.Schema
+{_parquet_writer_arg_docs}
+writer_engine_version : unused
+**options : dict
+    If options contains a key `metadata_collector` then the
+    corresponding value is assumed to be a list (or any object with
+    `.append` method) that will be filled with the file metadata instance
+    of the written file.
+
+Examples
+--------
+{_parquet_writer_example_doc}
+"""
+
+    def __init__(self, where, schema, filesystem=None,
+                 flavor=None,
+                 version='2.6',
+                 use_dictionary=True,
+                 compression='snappy',
+                 write_statistics=True,
+                 use_deprecated_int96_timestamps=None,
+                 compression_level=None,
+                 use_byte_stream_split=False,
+                 column_encoding=None,
+                 writer_engine_version=None,
+                 data_page_version='1.0',
+                 use_compliant_nested_type=True,
+                 encryption_properties=None,
+                 write_batch_size=None,
+                 dictionary_pagesize_limit=None,
+                 store_schema=True,
+                 write_page_index=False,
+                 write_page_checksum=False,
+                 sorting_columns=None,
+                 store_decimal_as_integer=False,
+                 write_time_adjusted_to_utc=False,
+                 max_rows_per_page=None,
+                 **options):
+        if use_deprecated_int96_timestamps is None:
+            # Use int96 timestamps for Spark
+            if flavor is not None and 'spark' in flavor:
+                use_deprecated_int96_timestamps = True
+            else:
+                use_deprecated_int96_timestamps = False
+
+        self.flavor = flavor
+        if flavor is not None:
+            schema, self.schema_changed = _sanitize_schema(schema, flavor)
+        else:
+            self.schema_changed = False
+
+        self.schema = schema
+        self.where = where
+
+        # If we open a file using a filesystem, store file handle so we can be
+        # sure to close it when `self.close` is called.
+        self.file_handle = None
+
+        filesystem, path = _resolve_filesystem_and_path(where, filesystem)
+        if filesystem is not None:
+            # ARROW-10480: do not auto-detect compression.  While
+            # a filename like foo.parquet.gz is nonconforming, it
+            # shouldn't implicitly apply compression.
+            sink = self.file_handle = filesystem.open_output_stream(
+                path, compression=None)
+        else:
+            sink = where
+        self._metadata_collector = options.pop('metadata_collector', None)
+        engine_version = 'V2'
+        self.writer = _parquet.ParquetWriter(
+            sink, schema,
+            version=version,
+            compression=compression,
+            use_dictionary=use_dictionary,
+            write_statistics=write_statistics,
+            use_deprecated_int96_timestamps=use_deprecated_int96_timestamps,
+            compression_level=compression_level,
+            use_byte_stream_split=use_byte_stream_split,
+            column_encoding=column_encoding,
+            writer_engine_version=engine_version,
+            data_page_version=data_page_version,
+            use_compliant_nested_type=use_compliant_nested_type,
+            encryption_properties=encryption_properties,
+            write_batch_size=write_batch_size,
+            dictionary_pagesize_limit=dictionary_pagesize_limit,
+            store_schema=store_schema,
+            write_page_index=write_page_index,
+            write_page_checksum=write_page_checksum,
+            sorting_columns=sorting_columns,
+            store_decimal_as_integer=store_decimal_as_integer,
+            write_time_adjusted_to_utc=write_time_adjusted_to_utc,
+            max_rows_per_page=max_rows_per_page,
+            **options)
+        self.is_open = True
+
+    def __del__(self):
+        if getattr(self, 'is_open', False):
+            self.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.close()
+        # return false since we want to propagate exceptions
+        return False
+
+    def write(self, table_or_batch, row_group_size=None):
+        """
+        Write RecordBatch or Table to the Parquet file.
+
+        Parameters
+        ----------
+        table_or_batch : {RecordBatch, Table}
+        row_group_size : int, default None
+            Maximum number of rows in each written row group. If None, the row
+            group size will be the minimum of the number of rows in the
+            Table/RecordBatch and 1024 * 1024.
+        """
+        if isinstance(table_or_batch, pa.RecordBatch):
+            self.write_batch(table_or_batch, row_group_size)
+        elif isinstance(table_or_batch, pa.Table):
+            self.write_table(table_or_batch, row_group_size)
+        else:
+            raise TypeError(type(table_or_batch))
+
+    def write_batch(self, batch, row_group_size=None):
+        """
+        Write RecordBatch to the Parquet file.
+
+        Parameters
+        ----------
+        batch : RecordBatch
+        row_group_size : int, default None
+            Maximum number of rows in written row group. If None, the
+            row group size will be the minimum of the RecordBatch
+            size (in rows) and 1024 * 1024. If set larger than 64 * 1024 * 1024
+            then 64 * 1024 * 1024 will be used instead.
+        """
+        table = pa.Table.from_batches([batch], batch.schema)
+        self.write_table(table, row_group_size)
+
+    def write_table(self, table, row_group_size=None):
+        """
+        Write Table to the Parquet file.
+
+        Parameters
+        ----------
+        table : Table
+        row_group_size : int, default None
+            Maximum number of rows in each written row group. If None,
+            the row group size will be the minimum of the Table size (in rows)
+            and 1024 * 1024. If set larger than 64 * 1024 * 1024 then
+            64 * 1024 * 1024 will be used instead.
+
+        """
+        if self.schema_changed:
+            table = _sanitize_table(table, self.schema, self.flavor)
+        assert self.is_open
+
+        if not table.schema.equals(self.schema, check_metadata=False):
+            msg = (
+                "Table schema does not match schema used to create file: \n"
+                f"table:\n{table.schema!s} vs. \nfile:\n{self.schema!s}"
+            )
+            raise ValueError(msg)
+
+        self.writer.write_table(table, row_group_size=row_group_size)
+
+    def close(self):
+        """
+        Close the connection to the Parquet file.
+        """
+        if self.is_open:
+            self.writer.close()
+            self.is_open = False
+            if self._metadata_collector is not None:
+                self._metadata_collector.append(self.writer.metadata)
+        if self.file_handle is not None:
+            self.file_handle.close()
+
+    def add_key_value_metadata(self, key_value_metadata):
+        """
+        Add key-value metadata to the file.
+        This will overwrite any existing metadata with the same key.
+
+        Parameters
+        ----------
+        key_value_metadata : dict
+            Keys and values must be string-like / coercible to bytes.
+        """
+        assert self.is_open
+        self.writer.add_key_value_metadata(key_value_metadata)
+
+
+def _get_pandas_index_columns(keyvalues):
+    return (json.loads(keyvalues[b'pandas'].decode('utf8'))
+            ['index_columns'])
+
+
+EXCLUDED_PARQUET_PATHS = {'_SUCCESS'}
+
+
+_read_docstring_common = """\
+read_dictionary : list, default None
+    List of names or column paths (for nested types) to read directly
+    as DictionaryArray. Only supported for BYTE_ARRAY storage. To read
+    a flat column as dictionary-encoded pass the column name. For
+    nested types, you must pass the full column "path", which could be
+    something like level1.level2.list.item. Refer to the Parquet
+    file's schema to obtain the paths.
+binary_type : pyarrow.DataType, default None
+    If given, Parquet binary columns will be read as this datatype.
+    This setting is ignored if a serialized Arrow schema is found in
+    the Parquet metadata.
+list_type : subclass of pyarrow.DataType, default None
+    If given, non-MAP repeated columns will be read as an instance of
+    this datatype (either pyarrow.ListType or pyarrow.LargeListType).
+    This setting is ignored if a serialized Arrow schema is found in
+    the Parquet metadata.
+memory_map : bool, default False
+    If the source is a file path, use a memory map to read file, which can
+    improve performance in some environments.
+buffer_size : int, default 0
+    If positive, perform read buffering when deserializing individual
+    column chunks. Otherwise IO calls are unbuffered.
+partitioning : pyarrow.dataset.Partitioning or str or list of str, \
+default "hive"
+    The partitioning scheme for a partitioned dataset. The default of "hive"
+    assumes directory names with key=value pairs like "/year=2009/month=11".
+    In addition, a scheme like "/2009/11" is also supported, in which case
+    you need to specify the field names or a full schema. See the
+    ``pyarrow.dataset.partitioning()`` function for more details."""
+
+
+_parquet_dataset_example = """\
+Generate an example PyArrow Table and write it to a partitioned dataset:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+...                   'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+>>> import pyarrow.parquet as pq
+>>> pq.write_to_dataset(table, root_path='dataset_v2',
+...                     partition_cols=['year'])
+
+create a ParquetDataset object from the dataset source:
+
+>>> dataset = pq.ParquetDataset('dataset_v2/')
+
+and read the data:
+
+>>> dataset.read().to_pandas()
+   n_legs         animal  year
+0       5  Brittle stars  2019
+1       2       Flamingo  2020
+2       4            Dog  2021
+3     100      Centipede  2021
+4       2         Parrot  2022
+5       4          Horse  2022
+
+create a ParquetDataset object with filter:
+
+>>> dataset = pq.ParquetDataset('dataset_v2/',
+...                             filters=[('n_legs','=',4)])
+>>> dataset.read().to_pandas()
+   n_legs animal  year
+0       4    Dog  2021
+1       4  Horse  2022
+"""
+
+
+class ParquetDataset:
+    __doc__ = f"""
+Encapsulates details of reading a complete Parquet dataset possibly
+consisting of multiple files and partitions in subdirectories.
+
+Parameters
+----------
+path_or_paths : str or List[str]
+    A directory name, single file name, or list of file names.
+filesystem : FileSystem, default None
+    If nothing passed, will be inferred based on path.
+    Path will try to be found in the local on-disk filesystem otherwise
+    it will be parsed as an URI to determine the filesystem.
+schema : pyarrow.parquet.Schema
+    Optionally provide the Schema for the Dataset, in which case it will
+    not be inferred from the source.
+filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None
+    Rows which do not match the filter predicate will be removed from scanned
+    data. Partition keys embedded in a nested directory structure will be
+    exploited to avoid loading files at all if they contain no matching rows.
+    Within-file level filtering and different partitioning schemes are supported.
+
+    {_DNF_filter_doc}
+{_read_docstring_common}
+ignore_prefixes : list, optional
+    Files matching any of these prefixes will be ignored by the
+    discovery process.
+    This is matched to the basename of a path.
+    By default this is ['.', '_'].
+    Note that discovery happens only if a directory is passed as source.
+pre_buffer : bool, default True
+    Coalesce and issue file reads in parallel to improve performance on
+    high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a
+    background I/O thread pool. If using a filesystem layer that itself
+    performs readahead (e.g. fsspec's S3FS), disable readahead for best
+    results. Set to False if you want to prioritize minimal memory usage
+    over maximum speed.
+coerce_int96_timestamp_unit : str, default None
+    Cast timestamps that are stored in INT96 format to a particular resolution
+    (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96
+    timestamps will be inferred as timestamps in nanoseconds.
+decryption_properties : FileDecryptionProperties or None
+    File-level decryption properties.
+    The decryption properties can be created using
+    ``CryptoFactory.file_decryption_properties()``.
+thrift_string_size_limit : int, default None
+    If not None, override the maximum total string size allocated
+    when decoding Thrift structures. The default limit should be
+    sufficient for most Parquet files.
+thrift_container_size_limit : int, default None
+    If not None, override the maximum total size of containers allocated
+    when decoding Thrift structures. The default limit should be
+    sufficient for most Parquet files.
+page_checksum_verification : bool, default False
+    If True, verify the page checksum for each page read from the file.
+arrow_extensions_enabled : bool, default True
+    If True, read Parquet logical types as Arrow extension types where possible,
+    (e.g., read JSON as the canonical `arrow.json` extension type or UUID as
+    the canonical `arrow.uuid` extension type).
+
+Examples
+--------
+{_parquet_dataset_example}
+"""
+
+    def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
+                 read_dictionary=None, binary_type=None, list_type=None,
+                 memory_map=False, buffer_size=None, partitioning="hive",
+                 ignore_prefixes=None,
+                 pre_buffer=True, coerce_int96_timestamp_unit=None,
+                 decryption_properties=None, thrift_string_size_limit=None,
+                 thrift_container_size_limit=None,
+                 page_checksum_verification=False,
+                 arrow_extensions_enabled=True):
+        import pyarrow.dataset as ds
+
+        # map format arguments
+        read_options = {
+            "pre_buffer": pre_buffer,
+            "coerce_int96_timestamp_unit": coerce_int96_timestamp_unit,
+            "thrift_string_size_limit": thrift_string_size_limit,
+            "thrift_container_size_limit": thrift_container_size_limit,
+            "page_checksum_verification": page_checksum_verification,
+            "arrow_extensions_enabled": arrow_extensions_enabled,
+            "binary_type": binary_type,
+            "list_type": list_type,
+        }
+        if buffer_size:
+            read_options.update(use_buffered_stream=True,
+                                buffer_size=buffer_size)
+        if read_dictionary is not None:
+            read_options.update(dictionary_columns=read_dictionary)
+
+        if decryption_properties is not None:
+            read_options.update(decryption_properties=decryption_properties)
+
+        self._filter_expression = None
+        if filters is not None:
+            self._filter_expression = filters_to_expression(filters)
+
+        # map old filesystems to new one
+        if filesystem is not None:
+            filesystem = _ensure_filesystem(
+                filesystem, use_mmap=memory_map)
+        elif filesystem is None and memory_map:
+            # if memory_map is specified, assume local file system (string
+            # path can in principle be URI for any filesystem)
+            filesystem = LocalFileSystem(use_mmap=memory_map)
+
+        # This needs to be checked after _ensure_filesystem, because that
+        # handles the case of an fsspec LocalFileSystem
+        if (
+            hasattr(path_or_paths, "__fspath__") and
+            filesystem is not None and
+            not isinstance(filesystem, LocalFileSystem)
+        ):
+            raise TypeError(
+                "Path-like objects with __fspath__ must only be used with "
+                f"local file systems, not {type(filesystem)}"
+            )
+
+        # check for single fragment dataset or dataset directory
+        single_file = None
+        self._base_dir = None
+        if not isinstance(path_or_paths, list):
+            if _is_path_like(path_or_paths):
+                filesystem, path_or_paths = _resolve_filesystem_and_path(
+                    path_or_paths, filesystem, memory_map=memory_map
+                )
+                finfo = filesystem.get_file_info(path_or_paths)
+                if finfo.type == FileType.Directory:
+                    self._base_dir = path_or_paths
+            else:
+                single_file = path_or_paths
+
+        parquet_format = ds.ParquetFileFormat(**read_options)
+
+        if single_file is not None:
+            fragment = parquet_format.make_fragment(single_file, filesystem)
+
+            self._dataset = ds.FileSystemDataset(
+                [fragment], schema=schema or fragment.physical_schema,
+                format=parquet_format,
+                filesystem=fragment.filesystem
+            )
+            return
+
+        # check partitioning to enable dictionary encoding
+        if partitioning == "hive":
+            partitioning = ds.HivePartitioning.discover(
+                infer_dictionary=True)
+
+        self._dataset = ds.dataset(path_or_paths, filesystem=filesystem,
+                                   schema=schema, format=parquet_format,
+                                   partitioning=partitioning,
+                                   ignore_prefixes=ignore_prefixes)
+
+    def equals(self, other):
+        if not isinstance(other, ParquetDataset):
+            raise TypeError('`other` must be an instance of ParquetDataset')
+
+        return (self.schema == other.schema and
+                self._dataset.format == other._dataset.format and
+                self.filesystem == other.filesystem and
+                # self.fragments == other.fragments and
+                self.files == other.files)
+
+    def __eq__(self, other):
+        try:
+            return self.equals(other)
+        except TypeError:
+            return NotImplemented
+
+    @property
+    def schema(self):
+        """
+        Schema of the Dataset.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_schema',
+        ...                     partition_cols=['year'])
+        >>> dataset = pq.ParquetDataset('dataset_v2_schema/')
+
+        Read the schema:
+
+        >>> dataset.schema
+        n_legs: int64
+        animal: string
+        year: dictionary<values=int32, indices=int32, ordered=0>
+        """
+        return self._dataset.schema
+
+    def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
+        """
+        Read (multiple) Parquet files as a single pyarrow.Table.
+
+        Parameters
+        ----------
+        columns : List[str]
+            Names of columns to read from the dataset. The partition fields
+            are not automatically included.
+        use_threads : bool, default True
+            Perform multi-threaded column reads.
+        use_pandas_metadata : bool, default False
+            If True and file has custom pandas schema metadata, ensure that
+            index columns are also loaded.
+
+        Returns
+        -------
+        pyarrow.Table
+            Content of the file as a table (of columns).
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_read',
+        ...                     partition_cols=['year'])
+        >>> dataset = pq.ParquetDataset('dataset_v2_read/')
+
+        Read the dataset:
+
+        >>> dataset.read(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[5],[2],[4,100],[2,4]]
+        """
+        # if use_pandas_metadata, we need to include index columns in the
+        # column selection, to be able to restore those in the pandas DataFrame
+        metadata = self.schema.metadata or {}
+
+        if use_pandas_metadata:
+            # if the dataset schema metadata itself doesn't have pandas
+            # then try to get this from common file (for backwards compat)
+            if b"pandas" not in metadata:
+                common_metadata = self._get_common_pandas_metadata()
+                if common_metadata:
+                    metadata = common_metadata
+
+        if columns is not None and use_pandas_metadata:
+            if metadata and b'pandas' in metadata:
+                # RangeIndex can be represented as dict instead of column name
+                index_columns = [
+                    col for col in _get_pandas_index_columns(metadata)
+                    if not isinstance(col, dict)
+                ]
+                columns = (
+                    list(columns) + list(set(index_columns) - set(columns))
+                )
+
+        table = self._dataset.to_table(
+            columns=columns, filter=self._filter_expression,
+            use_threads=use_threads
+        )
+
+        # if use_pandas_metadata, restore the pandas metadata (which gets
+        # lost if doing a specific `columns` selection in to_table)
+        if use_pandas_metadata:
+            if metadata and b"pandas" in metadata:
+                new_metadata = table.schema.metadata or {}
+                new_metadata.update({b"pandas": metadata[b"pandas"]})
+                table = table.replace_schema_metadata(new_metadata)
+
+        return table
+
+    def _get_common_pandas_metadata(self):
+
+        if not self._base_dir:
+            return None
+
+        metadata = None
+        for name in ["_common_metadata", "_metadata"]:
+            metadata_path = os.path.join(str(self._base_dir), name)
+            finfo = self.filesystem.get_file_info(metadata_path)
+            if finfo.is_file:
+                pq_meta = read_metadata(
+                    metadata_path, filesystem=self.filesystem)
+                metadata = pq_meta.metadata
+                if metadata and b'pandas' in metadata:
+                    break
+
+        return metadata
+
+    def read_pandas(self, **kwargs):
+        """
+        Read dataset including pandas metadata, if any. Other arguments passed
+        through to :func:`read`, see docstring for further details.
+
+        Parameters
+        ----------
+        **kwargs : optional
+            Additional options for :func:`read`
+
+        Examples
+        --------
+        Generate an example parquet file:
+
+        >>> import pyarrow as pa
+        >>> import pandas as pd
+        >>> df = pd.DataFrame({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                    'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                    'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                    "Brittle stars", "Centipede"]})
+        >>> table = pa.Table.from_pandas(df)
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_table(table, 'table_V2.parquet')
+        >>> dataset = pq.ParquetDataset('table_V2.parquet')
+
+        Read the dataset with pandas metadata:
+
+        >>> dataset.read_pandas(columns=["n_legs"])
+        pyarrow.Table
+        n_legs: int64
+        ----
+        n_legs: [[2,2,4,4,5,100]]
+
+        >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata
+        {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...}
+        """
+        return self.read(use_pandas_metadata=True, **kwargs)
+
+    @property
+    def fragments(self):
+        """
+        A list of the Dataset source fragments or pieces with absolute
+        file paths.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_fragments',
+        ...                     partition_cols=['year'])
+        >>> dataset = pq.ParquetDataset('dataset_v2_fragments/')
+
+        List the fragments:
+
+        >>> dataset.fragments
+        [<pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/...
+        """
+        return list(self._dataset.get_fragments())
+
+    @property
+    def files(self):
+        """
+        A list of absolute Parquet file paths in the Dataset source.
+
+        Examples
+        --------
+        Generate an example dataset:
+
+        >>> import pyarrow as pa
+        >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+        ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+        ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+        ...                              "Brittle stars", "Centipede"]})
+        >>> import pyarrow.parquet as pq
+        >>> pq.write_to_dataset(table, root_path='dataset_v2_files',
+        ...                     partition_cols=['year'])
+        >>> dataset = pq.ParquetDataset('dataset_v2_files/')
+
+        List the files:
+
+        >>> dataset.files
+        ['dataset_v2_files/year=2019/...-0.parquet', ...
+        """
+        return self._dataset.files
+
+    @property
+    def filesystem(self):
+        """
+        The filesystem type of the Dataset source.
+        """
+        return self._dataset.filesystem
+
+    @property
+    def partitioning(self):
+        """
+        The partitioning of the Dataset source, if discovered.
+        """
+        return self._dataset.partitioning
+
+
+_read_table_docstring = """
+{0}
+
+Parameters
+----------
+source : str, list of str, pyarrow.NativeFile, or file-like object
+    If a string is passed, can be a single file name or directory name. If a
+    list of strings is passed, should be file names. For file-like objects,
+    only read a single file. Use pyarrow.BufferReader to read a file contained
+    in a bytes or buffer-like object.
+columns : list
+    If not None, only these columns will be read from the file. A column
+    name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
+    'a.c', and 'a.d.e'. If empty, no columns will be read. Note
+    that the table will still have the correct num_rows set despite having
+    no columns.
+use_threads : bool, default True
+    Perform multi-threaded column reads.
+schema : Schema, optional
+    Optionally provide the Schema for the parquet dataset, in which case it
+    will not be inferred from the source.
+{1}
+filesystem : FileSystem, default None
+    If nothing passed, will be inferred based on path.
+    Path will try to be found in the local on-disk filesystem otherwise
+    it will be parsed as an URI to determine the filesystem.
+filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None
+    Rows which do not match the filter predicate will be removed from scanned
+    data. Partition keys embedded in a nested directory structure will be
+    exploited to avoid loading files at all if they contain no matching rows.
+    Within-file level filtering and different partitioning schemes are supported.
+
+    {3}
+ignore_prefixes : list, optional
+    Files matching any of these prefixes will be ignored by the
+    discovery process.
+    This is matched to the basename of a path.
+    By default this is ['.', '_'].
+    Note that discovery happens only if a directory is passed as source.
+pre_buffer : bool, default True
+    Coalesce and issue file reads in parallel to improve performance on
+    high-latency filesystems (e.g. S3). If True, Arrow will use a
+    background I/O thread pool. If using a filesystem layer that itself
+    performs readahead (e.g. fsspec's S3FS), disable readahead for best
+    results.
+coerce_int96_timestamp_unit : str, default None
+    Cast timestamps that are stored in INT96 format to a particular
+    resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
+    and therefore INT96 timestamps will be inferred as timestamps
+    in nanoseconds.
+decryption_properties : FileDecryptionProperties or None
+    File-level decryption properties.
+    The decryption properties can be created using
+    ``CryptoFactory.file_decryption_properties()``.
+thrift_string_size_limit : int, default None
+    If not None, override the maximum total string size allocated
+    when decoding Thrift structures. The default limit should be
+    sufficient for most Parquet files.
+thrift_container_size_limit : int, default None
+    If not None, override the maximum total size of containers allocated
+    when decoding Thrift structures. The default limit should be
+    sufficient for most Parquet files.
+page_checksum_verification : bool, default False
+    If True, verify the checksum for each page read from the file.
+arrow_extensions_enabled : bool, default True
+    If True, read Parquet logical types as Arrow extension types where possible,
+    (e.g., read JSON as the canonical `arrow.json` extension type or UUID as
+    the canonical `arrow.uuid` extension type).
+
+Returns
+-------
+{2}
+
+{4}
+"""
+
+_read_table_example = """\
+
+Examples
+--------
+
+Generate an example PyArrow Table and write it to a partitioned dataset:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+...                   'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+>>> import pyarrow.parquet as pq
+>>> pq.write_to_dataset(table, root_path='dataset_name_2',
+...                     partition_cols=['year'])
+
+Read the data:
+
+>>> pq.read_table('dataset_name_2').to_pandas()
+   n_legs         animal  year
+0       5  Brittle stars  2019
+1       2       Flamingo  2020
+2       4            Dog  2021
+3     100      Centipede  2021
+4       2         Parrot  2022
+5       4          Horse  2022
+
+
+Read only a subset of columns:
+
+>>> pq.read_table('dataset_name_2', columns=["n_legs", "animal"])
+pyarrow.Table
+n_legs: int64
+animal: string
+----
+n_legs: [[5],[2],[4,100],[2,4]]
+animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]]
+
+Read a subset of columns and read one column as DictionaryArray:
+
+>>> pq.read_table('dataset_name_2', columns=["n_legs", "animal"],
+...               read_dictionary=["animal"])
+pyarrow.Table
+n_legs: int64
+animal: dictionary<values=string, indices=int32, ordered=0>
+----
+n_legs: [[5],[2],[4,100],[2,4]]
+animal: [  -- dictionary:
+["Brittle stars"]  -- indices:
+[0],  -- dictionary:
+["Flamingo"]  -- indices:
+[0],  -- dictionary:
+["Dog","Centipede"]  -- indices:
+[0,1],  -- dictionary:
+["Parrot","Horse"]  -- indices:
+[0,1]]
+
+Read the table with filter:
+
+>>> pq.read_table('dataset_name_2', columns=["n_legs", "animal"],
+...               filters=[('n_legs','<',4)]).to_pandas()
+   n_legs    animal
+0       2  Flamingo
+1       2    Parrot
+
+Read data from a single Parquet file:
+
+>>> pq.write_table(table, 'example.parquet')
+>>> pq.read_table('dataset_name_2').to_pandas()
+   n_legs         animal  year
+0       5  Brittle stars  2019
+1       2       Flamingo  2020
+2       4            Dog  2021
+3     100      Centipede  2021
+4       2         Parrot  2022
+5       4          Horse  2022
+"""
+
+
+def read_table(source, *, columns=None, use_threads=True,
+               schema=None, use_pandas_metadata=False, read_dictionary=None,
+               binary_type=None, list_type=None, memory_map=False, buffer_size=0,
+               partitioning="hive", filesystem=None, filters=None,
+               ignore_prefixes=None, pre_buffer=True,
+               coerce_int96_timestamp_unit=None,
+               decryption_properties=None, thrift_string_size_limit=None,
+               thrift_container_size_limit=None,
+               page_checksum_verification=False,
+               arrow_extensions_enabled=True):
+
+    try:
+        dataset = ParquetDataset(
+            source,
+            schema=schema,
+            filesystem=filesystem,
+            partitioning=partitioning,
+            memory_map=memory_map,
+            read_dictionary=read_dictionary,
+            binary_type=binary_type,
+            list_type=list_type,
+            buffer_size=buffer_size,
+            filters=filters,
+            ignore_prefixes=ignore_prefixes,
+            pre_buffer=pre_buffer,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
+            thrift_string_size_limit=thrift_string_size_limit,
+            thrift_container_size_limit=thrift_container_size_limit,
+            page_checksum_verification=page_checksum_verification,
+            arrow_extensions_enabled=arrow_extensions_enabled,
+        )
+    except ImportError:
+        # fall back on ParquetFile for simple cases when pyarrow.dataset
+        # module is not available
+        if filters is not None:
+            raise ValueError(
+                "the 'filters' keyword is not supported when the "
+                "pyarrow.dataset module is not available"
+            )
+        if partitioning != "hive":
+            raise ValueError(
+                "the 'partitioning' keyword is not supported when the "
+                "pyarrow.dataset module is not available"
+            )
+        if schema is not None:
+            raise ValueError(
+                "the 'schema' argument is not supported when the "
+                "pyarrow.dataset module is not available"
+            )
+        if isinstance(source, list):
+            raise ValueError(
+                "the 'source' argument cannot be a list of files "
+                "when the pyarrow.dataset module is not available"
+            )
+
+        filesystem, path = _resolve_filesystem_and_path(source, filesystem)
+        if filesystem is not None:
+            if not filesystem.get_file_info(path).is_file:
+                raise ValueError(
+                    "the 'source' argument should be "
+                    "an existing parquet file and not a directory "
+                    "when the pyarrow.dataset module is not available"
+                )
+
+            source = filesystem.open_input_file(path)
+
+        dataset = ParquetFile(
+            source, read_dictionary=read_dictionary,
+            binary_type=binary_type,
+            list_type=list_type,
+            memory_map=memory_map, buffer_size=buffer_size,
+            pre_buffer=pre_buffer,
+            coerce_int96_timestamp_unit=coerce_int96_timestamp_unit,
+            decryption_properties=decryption_properties,
+            thrift_string_size_limit=thrift_string_size_limit,
+            thrift_container_size_limit=thrift_container_size_limit,
+            page_checksum_verification=page_checksum_verification,
+        )
+
+    return dataset.read(columns=columns, use_threads=use_threads,
+                        use_pandas_metadata=use_pandas_metadata)
+
+
+read_table.__doc__ = _read_table_docstring.format(
+    """Read a Table from Parquet format""",
+    "\n".join(("""use_pandas_metadata : bool, default False
+    If True and file has custom pandas schema metadata, ensure that
+    index columns are also loaded.""", _read_docstring_common)),
+    """pyarrow.Table
+    Content of the file as a table (of columns)""",
+    _DNF_filter_doc, _read_table_example)
+
+
+def read_pandas(source, columns=None, **kwargs):
+    return read_table(
+        source, columns=columns, use_pandas_metadata=True, **kwargs
+    )
+
+
+read_pandas.__doc__ = _read_table_docstring.format(
+    'Read a Table from Parquet format, also reading DataFrame\n'
+    'index values if known in the file metadata',
+    "\n".join((_read_docstring_common,
+               """**kwargs
+    additional options for :func:`read_table`""")),
+    """pyarrow.Table
+    Content of the file as a Table of Columns, including DataFrame
+    indexes as columns""",
+    _DNF_filter_doc, "")
+
+
+def write_table(table, where, row_group_size=None, version='2.6',
+                use_dictionary=True, compression='snappy',
+                write_statistics=True,
+                use_deprecated_int96_timestamps=None,
+                coerce_timestamps=None,
+                allow_truncated_timestamps=False,
+                data_page_size=None, flavor=None,
+                filesystem=None,
+                compression_level=None,
+                use_byte_stream_split=False,
+                column_encoding=None,
+                data_page_version='1.0',
+                use_compliant_nested_type=True,
+                encryption_properties=None,
+                write_batch_size=None,
+                dictionary_pagesize_limit=None,
+                store_schema=True,
+                write_page_index=False,
+                write_page_checksum=False,
+                sorting_columns=None,
+                store_decimal_as_integer=False,
+                write_time_adjusted_to_utc=False,
+                max_rows_per_page=None,
+                **kwargs):
+    # Implementor's note: when adding keywords here / updating defaults, also
+    # update it in write_to_dataset and _dataset_parquet.pyx ParquetFileWriteOptions
+    row_group_size = kwargs.pop('chunk_size', row_group_size)
+    use_int96 = use_deprecated_int96_timestamps
+    try:
+        with ParquetWriter(
+                where, table.schema,
+                filesystem=filesystem,
+                version=version,
+                flavor=flavor,
+                use_dictionary=use_dictionary,
+                write_statistics=write_statistics,
+                coerce_timestamps=coerce_timestamps,
+                data_page_size=data_page_size,
+                allow_truncated_timestamps=allow_truncated_timestamps,
+                compression=compression,
+                use_deprecated_int96_timestamps=use_int96,
+                compression_level=compression_level,
+                use_byte_stream_split=use_byte_stream_split,
+                column_encoding=column_encoding,
+                data_page_version=data_page_version,
+                use_compliant_nested_type=use_compliant_nested_type,
+                encryption_properties=encryption_properties,
+                write_batch_size=write_batch_size,
+                dictionary_pagesize_limit=dictionary_pagesize_limit,
+                store_schema=store_schema,
+                write_page_index=write_page_index,
+                write_page_checksum=write_page_checksum,
+                sorting_columns=sorting_columns,
+                store_decimal_as_integer=store_decimal_as_integer,
+                write_time_adjusted_to_utc=write_time_adjusted_to_utc,
+                max_rows_per_page=max_rows_per_page,
+                **kwargs) as writer:
+            writer.write_table(table, row_group_size=row_group_size)
+    except Exception:
+        if _is_path_like(where):
+            try:
+                os.remove(_stringify_path(where))
+            except os.error:
+                pass
+        raise
+
+
+_write_table_example = """\
+Generate an example PyArrow Table:
+
+>>> import pyarrow as pa
+>>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+...                              "Brittle stars", "Centipede"]})
+
+and write the Table into Parquet file:
+
+>>> import pyarrow.parquet as pq
+>>> pq.write_table(table, 'example.parquet')
+
+Defining row group size for the Parquet file:
+
+>>> pq.write_table(table, 'example.parquet', row_group_size=3)
+
+Defining row group compression (default is Snappy):
+
+>>> pq.write_table(table, 'example.parquet', compression='none')
+
+Defining row group compression and encoding per-column:
+
+>>> pq.write_table(table, 'example.parquet',
+...                compression={'n_legs': 'snappy', 'animal': 'gzip'},
+...                use_dictionary=['n_legs', 'animal'])
+
+Defining column encoding per-column:
+
+>>> pq.write_table(table, 'example.parquet',
+...                column_encoding={'animal':'PLAIN'},
+...                use_dictionary=False)
+"""
+
+write_table.__doc__ = f"""
+Write a Table to Parquet format.
+
+Parameters
+----------
+table : pyarrow.Table
+where : string or pyarrow.NativeFile
+row_group_size : int, default None
+    Maximum number of rows in each written row group. If None, the
+    row group size will be the minimum of the Table size (in rows)
+    and 1024 * 1024. If set larger than 64 * 1024 * 1024 then
+    64 * 1024 * 1024 will be used instead.
+{_parquet_writer_arg_docs}
+**kwargs : optional
+    Additional options for ParquetWriter
+
+Examples
+--------
+{_write_table_example}
+"""
+
+
+def write_to_dataset(table, root_path, partition_cols=None,
+                     filesystem=None, schema=None, partitioning=None,
+                     basename_template=None, use_threads=None,
+                     file_visitor=None, existing_data_behavior=None,
+                     **kwargs):
+    """Wrapper around dataset.write_dataset for writing a Table to
+    Parquet format by partitions.
+    For each combination of partition columns and values,
+    a subdirectories are created in the following
+    manner:
+
+    root_dir/
+      group1=value1
+        group2=value1
+          <uuid>.parquet
+        group2=value2
+          <uuid>.parquet
+      group1=valueN
+        group2=value1
+          <uuid>.parquet
+        group2=valueN
+          <uuid>.parquet
+
+    Parameters
+    ----------
+    table : pyarrow.Table
+    root_path : str, pathlib.Path
+        The root directory of the dataset.
+    partition_cols : list,
+        Column names by which to partition the dataset.
+        Columns are partitioned in the order they are given.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
+    schema : Schema, optional
+        This Schema of the dataset.
+    partitioning : Partitioning or list[str], optional
+        The partitioning scheme specified with the
+        ``pyarrow.dataset.partitioning()`` function or a list of field names.
+        When providing a list of field names, you can use
+        ``partitioning_flavor`` to drive which partitioning type should be
+        used.
+    basename_template : str, optional
+        A template string used to generate basenames of written data files.
+        The token '{i}' will be replaced with an automatically incremented
+        integer. If not specified, it defaults to "guid-{i}.parquet".
+    use_threads : bool, default True
+        Write files in parallel. If enabled, then maximum parallelism will be
+        used determined by the number of available CPU cores.
+    file_visitor : function
+        If set, this function will be called with a WrittenFile instance
+        for each file created during the call.  This object will have both
+        a path attribute and a metadata attribute.
+
+        The path attribute will be a string containing the path to
+        the created file.
+
+        The metadata attribute will be the parquet metadata of the file.
+        This metadata will have the file path attribute set and can be used
+        to build a _metadata file.  The metadata attribute will be None if
+        the format is not parquet.
+
+        Example visitor which simple collects the filenames created::
+
+            visited_paths = []
+
+            def file_visitor(written_file):
+                visited_paths.append(written_file.path)
+
+    existing_data_behavior : 'overwrite_or_ignore' | 'error' | \
+'delete_matching'
+        Controls how the dataset will handle data that already exists in
+        the destination. The default behaviour is 'overwrite_or_ignore'.
+
+        'overwrite_or_ignore' will ignore any existing data and will
+        overwrite files with the same name as an output file.  Other
+        existing files will be ignored.  This behavior, in combination
+        with a unique basename_template for each write, will allow for
+        an append workflow.
+
+        'error' will raise an error if any data exists in the destination.
+
+        'delete_matching' is useful when you are writing a partitioned
+        dataset.  The first time each partition directory is encountered
+        the entire directory will be deleted.  This allows you to overwrite
+        old partitions completely.
+    **kwargs : dict,
+        Used as additional kwargs for :func:`pyarrow.dataset.write_dataset`
+        function for matching kwargs, and remainder to
+        :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`.
+        See the docstring of :func:`write_table` and
+        :func:`pyarrow.dataset.write_dataset` for the available options.
+        Using `metadata_collector` in kwargs allows one to collect the
+        file metadata instances of dataset pieces. The file paths in the
+        ColumnChunkMetaData will be set relative to `root_path`.
+
+    Examples
+    --------
+    Generate an example PyArrow Table:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'year': [2020, 2022, 2021, 2022, 2019, 2021],
+    ...                   'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
+
+    and write it to a partitioned dataset:
+
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_to_dataset(table, root_path='dataset_name_3',
+    ...                     partition_cols=['year'])
+    >>> pq.ParquetDataset('dataset_name_3').files
+    ['dataset_name_3/year=2019/...-0.parquet', ...
+
+    Write a single Parquet file into the root folder:
+
+    >>> pq.write_to_dataset(table, root_path='dataset_name_4')
+    >>> pq.ParquetDataset('dataset_name_4/').files
+    ['dataset_name_4/...-0.parquet']
+    """
+    metadata_collector = kwargs.pop('metadata_collector', None)
+
+    # Check for conflicting keywords
+    msg_confl = (
+        "The '{1}' argument is not supported. "
+        "Use only '{0}' instead."
+    )
+    if partition_cols is not None and partitioning is not None:
+        raise ValueError(msg_confl.format("partitioning",
+                                          "partition_cols"))
+
+    if metadata_collector is not None and file_visitor is not None:
+        raise ValueError(msg_confl.format("file_visitor",
+                                          "metadata_collector"))
+
+    import pyarrow.dataset as ds
+
+    # extract write_dataset specific options
+    # reset assumed to go to make_write_options
+    write_dataset_kwargs = dict()
+    for key in inspect.signature(ds.write_dataset).parameters:
+        if key in kwargs:
+            write_dataset_kwargs[key] = kwargs.pop(key)
+    write_dataset_kwargs['max_rows_per_group'] = kwargs.pop(
+        'row_group_size', kwargs.pop("chunk_size", None)
+    )
+
+    if metadata_collector is not None:
+        def file_visitor(written_file):
+            metadata_collector.append(written_file.metadata)
+
+    # map format arguments
+    parquet_format = ds.ParquetFileFormat()
+    write_options = parquet_format.make_write_options(**kwargs)
+
+    # map old filesystems to new one
+    if filesystem is not None:
+        filesystem = _ensure_filesystem(filesystem)
+
+    if partition_cols:
+        part_schema = table.select(partition_cols).schema
+        partitioning = ds.partitioning(part_schema, flavor="hive")
+
+    if basename_template is None:
+        basename_template = guid() + '-{i}.parquet'
+
+    if existing_data_behavior is None:
+        existing_data_behavior = 'overwrite_or_ignore'
+
+    ds.write_dataset(
+        table, root_path, filesystem=filesystem,
+        format=parquet_format, file_options=write_options, schema=schema,
+        partitioning=partitioning, use_threads=use_threads,
+        file_visitor=file_visitor,
+        basename_template=basename_template,
+        existing_data_behavior=existing_data_behavior,
+        **write_dataset_kwargs)
+    return
+
+
+def write_metadata(schema, where, metadata_collector=None, filesystem=None,
+                   **kwargs):
+    """
+    Write metadata-only Parquet file from schema. This can be used with
+    `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar
+    files.
+
+    Parameters
+    ----------
+    schema : pyarrow.Schema
+    where : string or pyarrow.NativeFile
+    metadata_collector : list
+        where to collect metadata information.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred from `where` if path-like, else
+        `where` is already a file-like object so no filesystem is needed.
+    **kwargs : dict,
+        Additional kwargs for ParquetWriter class. See docstring for
+        `ParquetWriter` for more information.
+
+    Examples
+    --------
+    Generate example data:
+
+    >>> import pyarrow as pa
+    >>> table = pa.table({'n_legs': [2, 2, 4, 4, 5, 100],
+    ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
+    ...                              "Brittle stars", "Centipede"]})
+
+    Write a dataset and collect metadata information.
+
+    >>> metadata_collector = []
+    >>> import pyarrow.parquet as pq
+    >>> pq.write_to_dataset(
+    ...     table, 'dataset_metadata',
+    ...      metadata_collector=metadata_collector)
+
+    Write the `_common_metadata` parquet file without row groups statistics.
+
+    >>> pq.write_metadata(
+    ...     table.schema, 'dataset_metadata/_common_metadata')
+
+    Write the `_metadata` parquet file with row groups statistics.
+
+    >>> pq.write_metadata(
+    ...     table.schema, 'dataset_metadata/_metadata',
+    ...     metadata_collector=metadata_collector)
+    """
+    filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+
+    if hasattr(where, "seek"):  # file-like
+        cursor_position = where.tell()
+
+    writer = ParquetWriter(where, schema, filesystem, **kwargs)
+    writer.close()
+
+    if metadata_collector is not None:
+        # ParquetWriter doesn't expose the metadata until it's written. Write
+        # it and read it again.
+        metadata = read_metadata(where, filesystem=filesystem)
+        if hasattr(where, "seek"):
+            where.seek(cursor_position)  # file-like, set cursor back.
+
+        for m in metadata_collector:
+            metadata.append_row_groups(m)
+        if filesystem is not None:
+            with filesystem.open_output_stream(where) as f:
+                metadata.write_metadata_file(f)
+        else:
+            metadata.write_metadata_file(where)
+
+
+def read_metadata(where, memory_map=False, decryption_properties=None,
+                  filesystem=None):
+    """
+    Read FileMetaData from footer of a single Parquet file.
+
+    Parameters
+    ----------
+    where : str (file path) or file-like object
+    memory_map : bool, default False
+        Create memory map when the source is a file path.
+    decryption_properties : FileDecryptionProperties, default None
+        Decryption properties for reading encrypted Parquet files.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
+
+    Returns
+    -------
+    metadata : FileMetaData
+        The metadata of the Parquet file
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.parquet as pq
+    >>> table = pa.table({'n_legs': [4, 5, 100],
+    ...                   'animal': ["Dog", "Brittle stars", "Centipede"]})
+    >>> pq.write_table(table, 'example.parquet')
+
+    >>> pq.read_metadata('example.parquet')
+    <pyarrow._parquet.FileMetaData object at ...>
+      created_by: parquet-cpp-arrow version ...
+      num_columns: 2
+      num_rows: 3
+      num_row_groups: 1
+      format_version: 2.6
+      serialized_size: ...
+    """
+    filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+    file_ctx = nullcontext()
+    if filesystem is not None:
+        file_ctx = where = filesystem.open_input_file(where)
+
+    with file_ctx:
+        file = ParquetFile(where, memory_map=memory_map,
+                           decryption_properties=decryption_properties)
+        return file.metadata
+
+
+def read_schema(where, memory_map=False, decryption_properties=None,
+                filesystem=None):
+    """
+    Read effective Arrow schema from Parquet file metadata.
+
+    Parameters
+    ----------
+    where : str (file path) or file-like object
+    memory_map : bool, default False
+        Create memory map when the source is a file path.
+    decryption_properties : FileDecryptionProperties, default None
+        Decryption properties for reading encrypted Parquet files.
+    filesystem : FileSystem, default None
+        If nothing passed, will be inferred based on path.
+        Path will try to be found in the local on-disk filesystem otherwise
+        it will be parsed as an URI to determine the filesystem.
+
+    Returns
+    -------
+    schema : pyarrow.Schema
+        The schema of the Parquet file
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.parquet as pq
+    >>> table = pa.table({'n_legs': [4, 5, 100],
+    ...                   'animal': ["Dog", "Brittle stars", "Centipede"]})
+    >>> pq.write_table(table, 'example.parquet')
+
+    >>> pq.read_schema('example.parquet')
+    n_legs: int64
+    animal: string
+    """
+    filesystem, where = _resolve_filesystem_and_path(where, filesystem)
+    file_ctx = nullcontext()
+    if filesystem is not None:
+        file_ctx = where = filesystem.open_input_file(where)
+
+    with file_ctx:
+        file = ParquetFile(
+            where, memory_map=memory_map,
+            decryption_properties=decryption_properties)
+        return file.schema.to_arrow_schema()
+
+
+__all__ = (
+    "ColumnChunkMetaData",
+    "ColumnSchema",
+    "FileDecryptionProperties",
+    "FileEncryptionProperties",
+    "FileMetaData",
+    "ParquetDataset",
+    "ParquetFile",
+    "ParquetLogicalType",
+    "ParquetReader",
+    "ParquetSchema",
+    "ParquetWriter",
+    "RowGroupMetaData",
+    "SortingColumn",
+    "Statistics",
+    "read_metadata",
+    "read_pandas",
+    "read_schema",
+    "read_table",
+    "write_metadata",
+    "write_table",
+    "write_to_dataset",
+    "_filters_to_expression",
+    "filters_to_expression",
+)
diff --git a/pyarrow/src/arrow/python/CMakeLists.txt b/pyarrow/src/arrow/python/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..67508982eab82eacb6d00bb28986b79f6fed5078
--- /dev/null
+++ b/pyarrow/src/arrow/python/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+arrow_install_all_headers("arrow/python")
+add_subdirectory(vendored)
diff --git a/pyarrow/src/arrow/python/api.h b/pyarrow/src/arrow/python/api.h
new file mode 100644
index 0000000000000000000000000000000000000000..2af0963a9c0444bb858f10323f914e21747cebaf
--- /dev/null
+++ b/pyarrow/src/arrow/python/api.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/arrow_to_pandas.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/inference.h"
+#include "arrow/python/io.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_to_arrow.h"
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/util.h"
diff --git a/pyarrow/src/arrow/python/arrow_to_pandas.cc b/pyarrow/src/arrow/python/arrow_to_pandas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f163266f3b876796f8447d0eb356cc28652c9bac
--- /dev/null
+++ b/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -0,0 +1,2659 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for pandas conversion via NumPy
+
+#include "arrow/python/arrow_to_pandas.h"
+#include "arrow/python/numpy_interop.h"  // IWYU pragma: expand
+
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/datum.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/util/int_util.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/parallel.h"
+#include "arrow/visit_type_inline.h"
+
+#include "arrow/compute/api.h"
+
+#include "arrow/python/arrow_to_python_internal.h"
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_internal.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/type_traits.h"
+
+namespace arrow {
+
+class MemoryPool;
+
+using internal::checked_cast;
+using internal::CheckIndexBounds;
+using internal::OptionalParallelFor;
+
+namespace py {
+namespace {
+
+// Fix options for conversion of an inner (child) array.
+PandasOptions MakeInnerOptions(PandasOptions options) {
+  // Make sure conversion of inner dictionary arrays always returns an array,
+  // not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
+  options.decode_dictionaries = true;
+  options.categorical_columns.reset();
+  options.strings_to_categorical = false;
+
+  // In ARROW-7723, we found as a result of ARROW-3789 that second
+  // through microsecond resolution tz-aware timestamps were being promoted to
+  // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
+  // array in this function. PyArray_GETITEM returns datetime.datetime for
+  // units second through microsecond but PyLong for nanosecond (because
+  // datetime.datetime does not support nanoseconds).
+  // We force the object conversion to preserve the value of the timezone.
+  // Nanoseconds are returned as integers.
+  options.coerce_temporal_nanoseconds = false;
+
+  return options;
+}
+
+// ----------------------------------------------------------------------
+// PyCapsule code for setting ndarray base to reference C++ object
+
+struct ArrayCapsule {
+  std::shared_ptr<Array> array;
+};
+
+struct BufferCapsule {
+  std::shared_ptr<Buffer> buffer;
+};
+
+void ArrayCapsule_Destructor(PyObject* capsule) {
+  delete reinterpret_cast<ArrayCapsule*>(PyCapsule_GetPointer(capsule, "arrow::Array"));
+}
+
+void BufferCapsule_Destructor(PyObject* capsule) {
+  delete reinterpret_cast<BufferCapsule*>(PyCapsule_GetPointer(capsule, "arrow::Buffer"));
+}
+
+// ----------------------------------------------------------------------
+// pandas 0.x DataFrame conversion internals
+
+using internal::arrow_traits;
+using internal::npy_traits;
+
+template <typename T>
+struct WrapBytes {};
+
+template <>
+struct WrapBytes<StringType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyUnicode_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<LargeStringType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyUnicode_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<StringViewType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyUnicode_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<BinaryType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<LargeBinaryType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<BinaryViewType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+
+template <>
+struct WrapBytes<FixedSizeBinaryType> {
+  static inline PyObject* Wrap(const char* data, int64_t length) {
+    return PyBytes_FromStringAndSize(data, length);
+  }
+};
+
+static inline bool ListTypeSupported(const DataType& type) {
+  switch (type.id()) {
+    case Type::BOOL:
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::INT64:
+    case Type::UINT64:
+    case Type::HALF_FLOAT:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+    case Type::DECIMAL128:
+    case Type::DECIMAL256:
+    case Type::BINARY:
+    case Type::LARGE_BINARY:
+    case Type::STRING:
+    case Type::LARGE_STRING:
+    case Type::DATE32:
+    case Type::DATE64:
+    case Type::STRUCT:
+    case Type::MAP:
+    case Type::TIME32:
+    case Type::TIME64:
+    case Type::TIMESTAMP:
+    case Type::DURATION:
+    case Type::DICTIONARY:
+    case Type::INTERVAL_MONTH_DAY_NANO:
+    case Type::NA:  // empty list
+      // The above types are all supported.
+      return true;
+    case Type::FIXED_SIZE_LIST:
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::LIST_VIEW:
+    case Type::LARGE_LIST_VIEW: {
+      const auto& list_type = checked_cast<const BaseListType&>(type);
+      return ListTypeSupported(*list_type.value_type());
+    }
+    case Type::EXTENSION: {
+      const auto& ext = checked_cast<const ExtensionType&>(*type.GetSharedPtr());
+      return ListTypeSupported(*(ext.storage_type()));
+    }
+    default:
+      break;
+  }
+  return false;
+}
+
+Status CapsulizeArray(const std::shared_ptr<Array>& arr, PyObject** out) {
+  auto capsule = new ArrayCapsule{{arr}};
+  *out = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow::Array",
+                       &ArrayCapsule_Destructor);
+  if (*out == nullptr) {
+    delete capsule;
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+
+Status CapsulizeBuffer(const std::shared_ptr<Buffer>& buffer, PyObject** out) {
+  auto capsule = new BufferCapsule{{buffer}};
+  *out = PyCapsule_New(reinterpret_cast<void*>(capsule), "arrow::Buffer",
+                       &BufferCapsule_Destructor);
+  if (*out == nullptr) {
+    delete capsule;
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+
+Status SetNdarrayBase(PyArrayObject* arr, PyObject* base) {
+  if (PyArray_SetBaseObject(arr, base) == -1) {
+    // Error occurred, trust that SetBaseObject sets the error state
+    Py_XDECREF(base);
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+
+Status SetBufferBase(PyArrayObject* arr, const std::shared_ptr<Buffer>& buffer) {
+  PyObject* base;
+  RETURN_NOT_OK(CapsulizeBuffer(buffer, &base));
+  return SetNdarrayBase(arr, base);
+}
+
+inline void set_numpy_metadata(int type, const DataType* datatype, PyArray_Descr* out) {
+  auto metadata =
+      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(out));
+  if (type == NPY_DATETIME) {
+    if (datatype->id() == Type::TIMESTAMP) {
+      const auto& timestamp_type = checked_cast<const TimestampType&>(*datatype);
+      metadata->meta.base = internal::NumPyFrequency(timestamp_type.unit());
+    } else {
+      ARROW_DCHECK(false)
+          << "NPY_DATETIME views only supported for Arrow TIMESTAMP types";
+    }
+  } else if (type == NPY_TIMEDELTA) {
+    ARROW_DCHECK_EQ(datatype->id(), Type::DURATION);
+    const auto& duration_type = checked_cast<const DurationType&>(*datatype);
+    metadata->meta.base = internal::NumPyFrequency(duration_type.unit());
+  }
+}
+
+Status PyArray_NewFromPool(int nd, npy_intp* dims, PyArray_Descr* descr, MemoryPool* pool,
+                           PyObject** out) {
+  // ARROW-6570: Allocate memory from MemoryPool for a couple reasons
+  //
+  // * Track allocations
+  // * Get better performance through custom allocators
+  int64_t total_size = PyDataType_ELSIZE(descr);
+  for (int i = 0; i < nd; ++i) {
+    total_size *= dims[i];
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(total_size, pool));
+  *out = PyArray_NewFromDescr(&PyArray_Type, descr, nd, dims,
+                              /*strides=*/nullptr,
+                              /*data=*/buffer->mutable_data(),
+                              /*flags=*/NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEABLE,
+                              /*obj=*/nullptr);
+  if (*out == nullptr) {
+    RETURN_IF_PYERROR();
+    // Trust that error set if NULL returned
+  }
+  return SetBufferBase(reinterpret_cast<PyArrayObject*>(*out), std::move(buffer));
+}
+
+template <typename T = void>
+inline const T* GetPrimitiveValues(const Array& arr) {
+  if (arr.length() == 0) {
+    return nullptr;
+  }
+  const int elsize = arr.type()->byte_width();
+  const auto& prim_arr = checked_cast<const PrimitiveArray&>(arr);
+  return reinterpret_cast<const T*>(prim_arr.values()->data() + arr.offset() * elsize);
+}
+
+Status MakeNumPyView(std::shared_ptr<Array> arr, PyObject* py_ref, int npy_type, int ndim,
+                     npy_intp* dims, PyObject** out) {
+  PyAcquireGIL lock;
+
+  PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
+  set_numpy_metadata(npy_type, arr->type().get(), descr);
+  PyObject* result = PyArray_NewFromDescr(
+      &PyArray_Type, descr, ndim, dims, /*strides=*/nullptr,
+      const_cast<void*>(GetPrimitiveValues(*arr)), /*flags=*/0, nullptr);
+  PyArrayObject* np_arr = reinterpret_cast<PyArrayObject*>(result);
+  if (np_arr == nullptr) {
+    // Error occurred, trust that error set
+    return Status::OK();
+  }
+
+  PyObject* base;
+  if (py_ref == nullptr) {
+    // Capsule will be owned by the ndarray, no incref necessary. See
+    // ARROW-1973
+    RETURN_NOT_OK(CapsulizeArray(arr, &base));
+  } else {
+    Py_INCREF(py_ref);
+    base = py_ref;
+  }
+  RETURN_NOT_OK(SetNdarrayBase(np_arr, base));
+
+  // Do not allow Arrow data to be mutated
+  PyArray_CLEARFLAGS(np_arr, NPY_ARRAY_WRITEABLE);
+  *out = result;
+  return Status::OK();
+}
+
+class PandasWriter {
+ public:
+  enum type {
+    OBJECT,
+    UINT8,
+    INT8,
+    UINT16,
+    INT16,
+    UINT32,
+    INT32,
+    UINT64,
+    INT64,
+    HALF_FLOAT,
+    FLOAT,
+    DOUBLE,
+    BOOL,
+    DATETIME_DAY,
+    DATETIME_SECOND,
+    DATETIME_MILLI,
+    DATETIME_MICRO,
+    DATETIME_NANO,
+    DATETIME_SECOND_TZ,
+    DATETIME_MILLI_TZ,
+    DATETIME_MICRO_TZ,
+    DATETIME_NANO_TZ,
+    TIMEDELTA_SECOND,
+    TIMEDELTA_MILLI,
+    TIMEDELTA_MICRO,
+    TIMEDELTA_NANO,
+    CATEGORICAL,
+    EXTENSION
+  };
+
+  PandasWriter(const PandasOptions& options, int64_t num_rows, int num_columns)
+      : options_(options), num_rows_(num_rows), num_columns_(num_columns) {
+    PyAcquireGIL lock;
+    internal::InitPandasStaticData();
+  }
+  virtual ~PandasWriter() {}
+
+  void SetBlockData(PyObject* arr) {
+    block_arr_.reset(arr);
+    block_data_ =
+        reinterpret_cast<uint8_t*>(PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
+  }
+
+  /// \brief Either copy or wrap single array to create pandas-compatible array
+  /// for Series or DataFrame. num_columns_ can only be 1. Will try to zero
+  /// copy if possible (or error if not possible and zero_copy_only=True)
+  virtual Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) = 0;
+
+  /// \brief Copy ChunkedArray into a multi-column block
+  virtual Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) = 0;
+
+  Status EnsurePlacementAllocated() {
+    std::lock_guard<std::mutex> guard(allocation_lock_);
+    if (placement_data_ != nullptr) {
+      return Status::OK();
+    }
+    PyAcquireGIL lock;
+    npy_intp placement_dims[1] = {num_columns_};
+    PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64);
+    RETURN_IF_PYERROR();
+    placement_arr_.reset(placement_arr);
+    placement_data_ = reinterpret_cast<int64_t*>(
+        PyArray_DATA(reinterpret_cast<PyArrayObject*>(placement_arr)));
+    return Status::OK();
+  }
+
+  Status EnsureAllocated() {
+    std::lock_guard<std::mutex> guard(allocation_lock_);
+    if (block_data_ != nullptr) {
+      return Status::OK();
+    }
+    RETURN_NOT_OK(Allocate());
+    return Status::OK();
+  }
+
+  virtual bool CanZeroCopy(const ChunkedArray& data) const { return false; }
+
+  virtual Status Write(std::shared_ptr<ChunkedArray> data, int64_t abs_placement,
+                       int64_t rel_placement) {
+    RETURN_NOT_OK(EnsurePlacementAllocated());
+    if (num_columns_ == 1 && options_.allow_zero_copy_blocks) {
+      RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr));
+    } else {
+      RETURN_NOT_OK(
+          CheckNoZeroCopy("Cannot do zero copy conversion into "
+                          "multi-column DataFrame block"));
+      RETURN_NOT_OK(EnsureAllocated());
+      RETURN_NOT_OK(CopyInto(data, rel_placement));
+    }
+    placement_data_[rel_placement] = abs_placement;
+    return Status::OK();
+  }
+
+  virtual Status GetDataFrameResult(PyObject** out) {
+    PyObject* result = PyDict_New();
+    RETURN_IF_PYERROR();
+
+    PyObject* block;
+    RETURN_NOT_OK(GetResultBlock(&block));
+
+    PyDict_SetItemString(result, "block", block);
+    PyDict_SetItemString(result, "placement", placement_arr_.obj());
+
+    RETURN_NOT_OK(AddResultMetadata(result));
+    *out = result;
+    return Status::OK();
+  }
+
+  // Caller steals the reference to this object
+  virtual Status GetSeriesResult(PyObject** out) {
+    RETURN_NOT_OK(MakeBlock1D());
+    // Caller owns the object now
+    *out = block_arr_.detach();
+    return Status::OK();
+  }
+
+ protected:
+  virtual Status AddResultMetadata(PyObject* result) { return Status::OK(); }
+
+  Status MakeBlock1D() {
+    // For Series or for certain DataFrame block types, we need to shape to a
+    // 1D array when there is only one column
+    PyAcquireGIL lock;
+
+    ARROW_DCHECK_EQ(1, num_columns_);
+
+    npy_intp new_dims[1] = {static_cast<npy_intp>(num_rows_)};
+    PyArray_Dims dims;
+    dims.ptr = new_dims;
+    dims.len = 1;
+
+    PyObject* reshaped = PyArray_Newshape(
+        reinterpret_cast<PyArrayObject*>(block_arr_.obj()), &dims, NPY_ANYORDER);
+    RETURN_IF_PYERROR();
+
+    // ARROW-8801: Here a PyArrayObject is created that is not being managed by
+    // any OwnedRef object. This object is then put in the resulting object
+    // with PyDict_SetItemString, which increments the reference count, so a
+    // memory leak ensues. There are several ways to fix the memory leak but a
+    // simple one is to put the reshaped 1D block array in this OwnedRefNoGIL
+    // so it will be correctly decref'd when this class is destructed.
+    block_arr_.reset(reshaped);
+    return Status::OK();
+  }
+
+  virtual Status GetResultBlock(PyObject** out) {
+    *out = block_arr_.obj();
+    return Status::OK();
+  }
+
+  Status CheckNoZeroCopy(const std::string& message) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid(message);
+    }
+    return Status::OK();
+  }
+
+  Status CheckNotZeroCopyOnly(const ChunkedArray& data) {
+    if (options_.zero_copy_only) {
+      return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ",
+                             data.null_count(), " nulls, but zero_copy_only was True");
+    }
+    return Status::OK();
+  }
+
+  virtual Status Allocate() {
+    return Status::NotImplemented("Override Allocate in subclasses");
+  }
+
+  Status AllocateNDArray(int npy_type, int ndim = 2) {
+    PyAcquireGIL lock;
+
+    PyObject* block_arr = nullptr;
+    npy_intp block_dims[2] = {0, 0};
+
+    if (ndim == 2) {
+      block_dims[0] = num_columns_;
+      block_dims[1] = num_rows_;
+    } else {
+      block_dims[0] = num_rows_;
+    }
+    PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
+    if (PyDataType_REFCHK(descr)) {
+      // ARROW-6876: if the array has refcounted items, let Numpy
+      // own the array memory so as to decref elements on array destruction
+      block_arr = PyArray_SimpleNewFromDescr(ndim, block_dims, descr);
+      RETURN_IF_PYERROR();
+    } else {
+      RETURN_NOT_OK(
+          PyArray_NewFromPool(ndim, block_dims, descr, options_.pool, &block_arr));
+    }
+
+    SetBlockData(block_arr);
+    return Status::OK();
+  }
+
+  void SetDatetimeUnit(NPY_DATETIMEUNIT unit) {
+    PyAcquireGIL lock;
+    auto date_dtype =
+        reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(
+            PyArray_DESCR(reinterpret_cast<PyArrayObject*>(block_arr_.obj()))));
+    date_dtype->meta.base = unit;
+  }
+
+  PandasOptions options_;
+
+  std::mutex allocation_lock_;
+
+  int64_t num_rows_;
+  int num_columns_;
+
+  OwnedRefNoGIL block_arr_;
+  uint8_t* block_data_ = nullptr;
+
+  // ndarray<int32>
+  OwnedRefNoGIL placement_arr_;
+  int64_t* placement_data_ = nullptr;
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(PandasWriter);
+};
+
+template <typename InType, typename OutType>
+inline void ConvertIntegerWithNulls(const PandasOptions& options,
+                                    const ChunkedArray& data, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+    // Upcast to double, set NaN as appropriate
+
+    for (int i = 0; i < arr.length(); ++i) {
+      *out_values++ =
+          arr.IsNull(i) ? static_cast<OutType>(NAN) : static_cast<OutType>(in_values[i]);
+    }
+  }
+}
+
+template <typename T>
+inline void ConvertIntegerNoNullsSameType(const PandasOptions& options,
+                                          const ChunkedArray& data, T* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    if (arr.length() > 0) {
+      const T* in_values = GetPrimitiveValues<T>(arr);
+      memcpy(out_values, in_values, sizeof(T) * arr.length());
+      out_values += arr.length();
+    }
+  }
+}
+
+template <typename InType, typename OutType>
+inline void ConvertIntegerNoNullsCast(const PandasOptions& options,
+                                      const ChunkedArray& data, OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values = in_values[i];
+    }
+  }
+}
+
+template <typename T, typename Enable = void>
+struct MemoizationTraits {
+  using Scalar = typename T::c_type;
+};
+
+template <typename T>
+struct MemoizationTraits<T, enable_if_has_string_view<T>> {
+  // For binary, we memoize string_view as a scalar value to avoid having to
+  // unnecessarily copy the memory into the memo table data structure
+  using Scalar = std::string_view;
+};
+
+// Generic Array -> PyObject** converter that handles object deduplication, if
+// requested
+template <typename Type, typename WrapFunction>
+inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data,
+                                 WrapFunction&& wrap_func, PyObject** out_values) {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using Scalar = typename MemoizationTraits<Type>::Scalar;
+
+  auto convert_chunks = [&](auto&& wrap_func) -> Status {
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
+      RETURN_NOT_OK(internal::WriteArrayObjects(arr, wrap_func, out_values));
+      out_values += arr.length();
+    }
+    return Status::OK();
+  };
+
+  if (options.deduplicate_objects) {
+    // GH-40316: only allocate a memo table if deduplication is enabled.
+    ::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
+    std::vector<PyObject*> unique_values;
+    int32_t memo_size = 0;
+
+    auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
+      int32_t memo_index;
+      RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
+      if (memo_index == memo_size) {
+        // New entry
+        RETURN_NOT_OK(wrap_func(value, out_values));
+        unique_values.push_back(*out_values);
+        ++memo_size;
+      } else {
+        // Duplicate entry
+        Py_INCREF(unique_values[memo_index]);
+        *out_values = unique_values[memo_index];
+      }
+      return Status::OK();
+    };
+    return convert_chunks(std::move(WrapMemoized));
+  } else {
+    return convert_chunks(std::forward<WrapFunction>(wrap_func));
+  }
+}
+
+Status ConvertStruct(PandasOptions options, const ChunkedArray& data,
+                     PyObject** out_values) {
+  if (data.num_chunks() == 0) {
+    return Status::OK();
+  }
+  // ChunkedArray has at least one chunk
+  auto arr = checked_cast<const StructArray*>(data.chunk(0).get());
+  // Use it to cache the struct type and number of fields for all chunks
+  int32_t num_fields = arr->num_fields();
+  auto array_type = arr->type();
+  std::vector<OwnedRef> fields_data(num_fields * data.num_chunks());
+  OwnedRef dict_item;
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  for (int c = 0; c < data.num_chunks(); c++) {
+    auto fields_data_offset = c * num_fields;
+    auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
+    // Convert the struct arrays first
+    for (int32_t i = 0; i < num_fields; i++) {
+      auto field = arr->field(static_cast<int>(i));
+      // In case the field is an extension array, use .storage() to convert to Pandas
+      if (field->type()->id() == Type::EXTENSION) {
+        const ExtensionArray& arr_ext = checked_cast<const ExtensionArray&>(*field);
+        field = arr_ext.storage();
+      }
+      RETURN_NOT_OK(ConvertArrayToPandas(options, field, nullptr,
+                                         fields_data[i + fields_data_offset].ref()));
+      ARROW_DCHECK(PyArray_Check(fields_data[i + fields_data_offset].obj()));
+    }
+
+    // Construct a dictionary for each row
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < arr->length(); ++i) {
+      if (has_nulls && arr->IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        // Build the new dict object for the row
+        dict_item.reset(PyDict_New());
+        RETURN_IF_PYERROR();
+        for (int32_t field_idx = 0; field_idx < num_fields; ++field_idx) {
+          OwnedRef field_value;
+          auto name = array_type->field(static_cast<int>(field_idx))->name();
+          if (!arr->field(static_cast<int>(field_idx))->IsNull(i)) {
+            // Value exists in child array, obtain it
+            auto array = reinterpret_cast<PyArrayObject*>(
+                fields_data[field_idx + fields_data_offset].obj());
+            auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(array, i));
+            field_value.reset(PyArray_GETITEM(array, ptr));
+            RETURN_IF_PYERROR();
+          } else {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            field_value.reset(Py_None);
+          }
+          // PyDict_SetItemString increments reference count
+          auto setitem_result =
+              PyDict_SetItemString(dict_item.obj(), name.c_str(), field_value.obj());
+          RETURN_IF_PYERROR();
+          ARROW_DCHECK_EQ(setitem_result, 0);
+        }
+        *out_values = dict_item.obj();
+        // Grant ownership to the resulting array
+        Py_INCREF(*out_values);
+      }
+      ++out_values;
+    }
+  }
+  return Status::OK();
+}
+
+Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& dense_type,
+                          ArrayVector* arrays) {
+  compute::ExecContext ctx(pool);
+  compute::CastOptions options;
+  for (size_t i = 0; i < arrays->size(); ++i) {
+    ARROW_ASSIGN_OR_RAISE((*arrays)[i],
+                          compute::Cast(*(*arrays)[i], dense_type, options, &ctx));
+  }
+  return Status::OK();
+}
+
+Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& dense_type,
+                          std::shared_ptr<ChunkedArray>* array) {
+  auto chunks = (*array)->chunks();
+  RETURN_NOT_OK(DecodeDictionaries(pool, dense_type, &chunks));
+  *array = std::make_shared<ChunkedArray>(std::move(chunks), dense_type);
+  return Status::OK();
+}
+
+template <typename T>
+enable_if_list_like<T, Status> ConvertListsLike(PandasOptions options,
+                                                const ChunkedArray& data,
+                                                PyObject** out_values) {
+  using ListArrayT = typename TypeTraits<T>::ArrayType;
+  // Get column of underlying value arrays
+  ArrayVector value_arrays;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    // values() does not account for offsets, so we need to slice into it.
+    // We can't use Flatten(), because it removes the values behind a null list
+    // value, and that makes the offsets into original list values and our
+    // flattened_values array different.
+    std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+        arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0));
+    if (arr.value_type()->id() == Type::EXTENSION) {
+      const auto& arr_ext = checked_cast<const ExtensionArray&>(*flattened_values);
+      value_arrays.emplace_back(arr_ext.storage());
+    } else {
+      value_arrays.emplace_back(flattened_values);
+    }
+  }
+
+  using ListArrayType = typename ListArrayT::TypeClass;
+  const auto& list_type = checked_cast<const ListArrayType&>(*data.type());
+  auto value_type = list_type.value_type();
+  if (value_type->id() == Type::EXTENSION) {
+    value_type = checked_cast<const ExtensionType&>(*value_type).storage_type();
+  }
+
+  auto flat_column = std::make_shared<ChunkedArray>(value_arrays, value_type);
+
+  options = MakeInnerOptions(std::move(options));
+
+  OwnedRefNoGIL owned_numpy_array;
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr,
+                                            owned_numpy_array.ref()));
+  PyObject* numpy_array = owned_numpy_array.obj();
+  ARROW_DCHECK(PyArray_Check(numpy_array));
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        // Need to subtract value_offset(0) since the original chunk might be a slice
+        // into another array.
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset -
+                                           arr.value_offset(0)));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset -
+                                         arr.value_offset(0)));
+        OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
+
+        if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
+          // Fall out of loop, will return from RETURN_IF_PYERROR
+          break;
+        }
+        *out_values = PyObject_GetItem(numpy_array, slice.obj());
+
+        if (*out_values == nullptr) {
+          // Fall out of loop, will return from RETURN_IF_PYERROR
+          break;
+        }
+      }
+      ++out_values;
+    }
+    RETURN_IF_PYERROR();
+
+    chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0);
+  }
+
+  return Status::OK();
+}
+
+// TODO GH-40579: optimize ListView conversion to avoid unnecessary copies
+template <typename T>
+enable_if_list_view<T, Status> ConvertListsLike(PandasOptions options,
+                                                const ChunkedArray& data,
+                                                PyObject** out_values) {
+  using ListViewArrayType = typename TypeTraits<T>::ArrayType;
+  using NonViewType =
+      std::conditional_t<T::type_id == Type::LIST_VIEW, ListType, LargeListType>;
+  using NonViewClass = typename TypeTraits<NonViewType>::ArrayType;
+  ArrayVector list_arrays;
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const ListViewArrayType&>(*data.chunk(c));
+    ARROW_ASSIGN_OR_RAISE(auto non_view_array,
+                          NonViewClass::FromListView(arr, options.pool));
+    list_arrays.emplace_back(non_view_array);
+  }
+  auto chunked_array = std::make_shared<ChunkedArray>(list_arrays);
+  return ConvertListsLike<NonViewType>(options, *chunked_array, out_values);
+}
+
+template <typename F1, typename F2, typename F3>
+Status ConvertMapHelper(F1 resetRow, F2 addPairToRow, F3 stealRow,
+                        const ChunkedArray& data, PyArrayObject* py_keys,
+                        PyArrayObject* py_items,
+                        // needed for null checks in items
+                        const std::vector<std::shared_ptr<Array>> item_arrays,
+                        PyObject** out_values) {
+  OwnedRef key_value;
+  OwnedRef item_value;
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& arr = checked_cast<const MapArray&>(*data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr.value_offset(i);
+        int64_t num_pairs = arr.value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of Python pairs
+        RETURN_NOT_OK(resetRow(num_pairs));
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_pairs; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            item_value.reset(Py_None);
+          } else {
+            // Get valid value from item array
+            auto ptr_item = reinterpret_cast<const char*>(
+                PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
+            item_value.reset(PyArray_GETITEM(py_items, ptr_item));
+            RETURN_IF_PYERROR();
+          }
+
+          // Add the key/item pair to the row
+          RETURN_NOT_OK(addPairToRow(j, key_value, item_value));
+        }
+
+        // Pass ownership to the resulting array
+        *out_values = stealRow();
+      }
+      ++out_values;
+    }
+    RETURN_IF_PYERROR();
+
+    chunk_offset += arr.values()->length();
+  }
+
+  return Status::OK();
+}
+
+// A more helpful error message around TypeErrors that may stem from unhashable keys
+Status CheckMapAsPydictsTypeError() {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  }
+  if (PyErr_ExceptionMatches(PyExc_TypeError)) {
+    // Modify the error string directly, so it is re-raised
+    // with our additional info.
+    //
+    // There are not many interesting things happening when this
+    // is hit. This is intended to only be called directly after
+    // PyDict_SetItem, where a finite set of errors could occur.
+    PyObject *type, *value, *traceback;
+    PyErr_Fetch(&type, &value, &traceback);
+    std::string message;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message));
+    message +=
+        ". If keys are not hashable, then you must use the option "
+        "[maps_as_pydicts=None (default)]";
+
+    // resets the error
+    PyErr_SetString(PyExc_TypeError, message.c_str());
+  }
+  return ConvertPyError();
+}
+
+Status CheckForDuplicateKeys(bool error_on_duplicate_keys, Py_ssize_t total_dict_len,
+                             Py_ssize_t total_raw_len) {
+  if (total_dict_len < total_raw_len) {
+    const char* message =
+        "[maps_as_pydicts] "
+        "After conversion of Arrow maps to pydicts, "
+        "detected data loss due to duplicate keys. "
+        "Original input length is [%lld], total converted pydict length is [%lld].";
+    std::array<char, 256> buf;
+    std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len);
+
+    if (error_on_duplicate_keys) {
+      return Status::UnknownError(buf.data());
+    } else {
+      ARROW_LOG(WARNING) << buf.data();
+    }
+  }
+  return Status::OK();
+}
+
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  if (options.maps_as_pydicts == MapConversionType::DEFAULT) {
+    // The default behavior to express an Arrow MAP as a list of [(key, value), ...] pairs
+    OwnedRef list_item;
+    return ConvertMapHelper(
+        [&list_item](int64_t num_pairs) {
+          list_item.reset(PyList_New(num_pairs));
+          return CheckPyError();
+        },
+        [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) {
+          PyList_SET_ITEM(list_item.obj(), idx,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          return CheckPyError();
+        },
+        [&list_item] { return list_item.detach(); }, data, py_keys, py_items, item_arrays,
+        out_values);
+  } else {
+    // Use a native pydict
+    OwnedRef dict_item;
+    Py_ssize_t total_dict_len{0};
+    Py_ssize_t total_raw_len{0};
+
+    bool error_on_duplicate_keys;
+    if (options.maps_as_pydicts == MapConversionType::LOSSY) {
+      error_on_duplicate_keys = false;
+    } else if (options.maps_as_pydicts == MapConversionType::STRICT_) {
+      error_on_duplicate_keys = true;
+    } else {
+      auto val = std::underlying_type_t<MapConversionType>(options.maps_as_pydicts);
+      return Status::UnknownError("Received unknown option for maps_as_pydicts: " +
+                                  std::to_string(val));
+    }
+
+    auto status = ConvertMapHelper(
+        [&dict_item, &total_raw_len](int64_t num_pairs) {
+          total_raw_len += num_pairs;
+          dict_item.reset(PyDict_New());
+          return CheckPyError();
+        },
+        [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value,
+                     OwnedRef& item_value) {
+          auto setitem_result =
+              PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj());
+          ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError());
+          // returns -1 if there are internal errors around hashing/resizing
+          return setitem_result == 0 ? Status::OK()
+                                     : Status::UnknownError(
+                                           "[maps_as_pydicts] "
+                                           "Unexpected failure inserting Arrow (key, "
+                                           "value) pair into Python dict");
+        },
+        [&dict_item, &total_dict_len] {
+          total_dict_len += PyDict_Size(dict_item.obj());
+          return dict_item.detach();
+        },
+        data, py_keys, py_items, item_arrays, out_values);
+
+    ARROW_RETURN_NOT_OK(status);
+    // If there were no errors generating the pydicts,
+    // then check if we detected any data loss from duplicate keys.
+    return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len);
+  }
+}
+
+template <typename InType, typename OutType>
+inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
+                                   OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+
+    if (arr.null_count() > 0) {
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        *out_values++ = arr.IsNull(i) ? na_value : in_values[i];
+      }
+    } else {
+      memcpy(out_values, in_values, sizeof(InType) * arr.length());
+      out_values += arr.length();
+    }
+  }
+}
+
+template <typename InType, typename OutType>
+inline void ConvertNumericNullableCast(const ChunkedArray& data, InType na_value,
+                                       OutType* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const InType* in_values = GetPrimitiveValues<InType>(arr);
+
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values++ = arr.IsNull(i) ? static_cast<OutType>(na_value)
+                                    : static_cast<OutType>(in_values[i]);
+    }
+  }
+}
+
+template <int NPY_TYPE>
+class TypedPandasWriter : public PandasWriter {
+ public:
+  using T = typename npy_traits<NPY_TYPE>::value_type;
+
+  using PandasWriter::PandasWriter;
+
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    if (CanZeroCopy(*data)) {
+      PyObject* wrapped;
+      npy_intp dims[2] = {static_cast<npy_intp>(num_columns_),
+                          static_cast<npy_intp>(num_rows_)};
+      RETURN_NOT_OK(
+          MakeNumPyView(data->chunk(0), py_ref, NPY_TYPE, /*ndim=*/2, dims, &wrapped));
+      SetBlockData(wrapped);
+      return Status::OK();
+    } else {
+      RETURN_NOT_OK(CheckNotZeroCopyOnly(*data));
+      RETURN_NOT_OK(EnsureAllocated());
+      return CopyInto(data, /*rel_placement=*/0);
+    }
+  }
+
+  Status CheckTypeExact(const DataType& type, Type::type expected) {
+    if (type.id() != expected) {
+      return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString(),
+                                    " to pandas block with NumPy type ",
+                                    GetNumPyTypeName(NPY_TYPE));
+    }
+    return Status::OK();
+  }
+
+  T* GetBlockColumnStart(int64_t rel_placement) {
+    return reinterpret_cast<T*>(block_data_) + rel_placement * num_rows_;
+  }
+
+ protected:
+  Status Allocate() override { return AllocateNDArray(NPY_TYPE); }
+};
+
+struct ObjectWriterVisitor {
+  const PandasOptions& options;
+  const ChunkedArray& data;
+  PyObject** out_values;
+
+  Status Visit(const NullType& type) {
+    for (int c = 0; c < data.num_chunks(); c++) {
+      std::shared_ptr<Array> arr = data.chunk(c);
+
+      for (int64_t i = 0; i < arr->length(); ++i) {
+        // All values are null
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+        ++out_values;
+      }
+    }
+    return Status::OK();
+  }
+
+  Status Visit(const BooleanType& type) {
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = checked_cast<const BooleanArray&>(*data.chunk(c));
+
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        if (arr.IsNull(i)) {
+          Py_INCREF(Py_None);
+          *out_values++ = Py_None;
+        } else if (arr.Value(i)) {
+          // True
+          Py_INCREF(Py_True);
+          *out_values++ = Py_True;
+        } else {
+          // False
+          Py_INCREF(Py_False);
+          *out_values++ = Py_False;
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename Type>
+  enable_if_integer<Type, Status> Visit(const Type& type) {
+    using T = typename Type::c_type;
+    auto WrapValue = [](T value, PyObject** out) {
+      *out = std::is_signed<T>::value ? PyLong_FromLongLong(value)
+                                      : PyLong_FromUnsignedLongLong(value);
+      RETURN_IF_PYERROR();
+      return Status::OK();
+    };
+    return ConvertAsPyObjects<Type>(options, data, WrapValue, out_values);
+  }
+
+  template <typename Type>
+  enable_if_t<is_base_binary_type<Type>::value || is_binary_view_like_type<Type>::value ||
+                  is_fixed_size_binary_type<Type>::value,
+              Status>
+  Visit(const Type& type) {
+    auto WrapValue = [](const std::string_view& view, PyObject** out) {
+      *out = WrapBytes<Type>::Wrap(view.data(), view.length());
+      if (*out == nullptr) {
+        PyErr_Clear();
+        return Status::UnknownError("Wrapping ", view, " failed");
+      }
+      return Status::OK();
+    };
+    return ConvertAsPyObjects<Type>(options, data, WrapValue, out_values);
+  }
+
+  template <typename Type>
+  enable_if_date<Type, Status> Visit(const Type& type) {
+    auto WrapValue = [](typename Type::c_type value, PyObject** out) {
+      RETURN_NOT_OK(internal::PyDate_from_int(value, Type::UNIT, out));
+      RETURN_IF_PYERROR();
+      return Status::OK();
+    };
+    return ConvertAsPyObjects<Type>(options, data, WrapValue, out_values);
+  }
+
+  template <typename Type>
+  enable_if_time<Type, Status> Visit(const Type& type) {
+    const TimeUnit::type unit = type.unit();
+    auto WrapValue = [unit](typename Type::c_type value, PyObject** out) {
+      RETURN_NOT_OK(internal::PyTime_from_int(value, unit, out));
+      RETURN_IF_PYERROR();
+      return Status::OK();
+    };
+    return ConvertAsPyObjects<Type>(options, data, WrapValue, out_values);
+  }
+
+  template <typename Type>
+  enable_if_timestamp<Type, Status> Visit(const Type& type) {
+    const TimeUnit::type unit = type.unit();
+    OwnedRef tzinfo;
+
+    auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) {
+      RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out));
+      RETURN_IF_PYERROR();
+      return Status::OK();
+    };
+    auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) {
+      PyObject* naive_datetime;
+      RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime));
+
+      // convert the timezone naive datetime object to timezone aware
+      // two step conversion of the datetime mimics Python's code:
+      // dt.replace(tzinfo=datetime.timezone.utc).astimezone(tzinfo)
+      // first step: replacing timezone with timezone.utc (replace method)
+      OwnedRef args(PyTuple_New(0));
+      OwnedRef keywords(PyDict_New());
+      PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC);
+      OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace"));
+      OwnedRef datetime_utc(
+          PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
+      // second step: adjust the datetime to tzinfo timezone (astimezone method)
+      *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj());
+
+      // the timezone naive object is no longer required
+      Py_DECREF(naive_datetime);
+      RETURN_IF_PYERROR();
+
+      return Status::OK();
+    };
+
+    if (!type.timezone().empty() && !options.ignore_timezone) {
+      // convert timezone aware
+      PyObject* tzobj;
+      ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone()));
+      tzinfo.reset(tzobj);
+      RETURN_IF_PYERROR();
+      RETURN_NOT_OK(
+          ConvertAsPyObjects<Type>(options, data, ConvertTimezoneAware, out_values));
+    } else {
+      // convert timezone naive
+      RETURN_NOT_OK(
+          ConvertAsPyObjects<Type>(options, data, ConvertTimezoneNaive, out_values));
+    }
+
+    return Status::OK();
+  }
+
+  template <typename Type>
+  enable_if_t<std::is_same<Type, MonthDayNanoIntervalType>::value, Status> Visit(
+      const Type& type) {
+    OwnedRef args(PyTuple_New(0));
+    OwnedRef kwargs(PyDict_New());
+    RETURN_IF_PYERROR();
+    auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval,
+                              PyObject** out) {
+      ARROW_DCHECK(internal::BorrowPandasDataOffsetType() != nullptr);
+      // DateOffset objects do not add nanoseconds component to pd.Timestamp.
+      // as of  Pandas 1.3.3
+      // (https://github.com/pandas-dev/pandas/issues/43892).
+      // So convert microseconds and remainder to preserve data
+      // but give users more expected results.
+      int64_t microseconds = interval.nanoseconds / 1000;
+      int64_t nanoseconds;
+      if (interval.nanoseconds >= 0) {
+        nanoseconds = interval.nanoseconds % 1000;
+      } else {
+        nanoseconds = -((-interval.nanoseconds) % 1000);
+      }
+
+      PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months));
+      PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days));
+      PyDict_SetItemString(kwargs.obj(), "microseconds",
+                           PyLong_FromLongLong(microseconds));
+      PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds));
+      *out =
+          PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj());
+      RETURN_IF_PYERROR();
+      return Status::OK();
+    };
+    return ConvertAsPyObjects<MonthDayNanoIntervalType>(options, data, to_date_offset,
+                                                        out_values);
+  }
+
+  template <typename DecimalT, typename DecimalArrayT>
+  Status VisitDecimal(const DecimalT& type) {
+    OwnedRef decimal;
+    OwnedRef Decimal;
+    RETURN_NOT_OK(internal::ImportModule("decimal", &decimal));
+    RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal));
+    PyObject* decimal_constructor = Decimal.obj();
+
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = checked_cast<const DecimalArrayT&>(*data.chunk(c));
+
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        if (arr.IsNull(i)) {
+          Py_INCREF(Py_None);
+          *out_values++ = Py_None;
+        } else {
+          *out_values++ =
+              internal::DecimalFromString(decimal_constructor, arr.FormatValue(i));
+          RETURN_IF_PYERROR();
+        }
+      }
+    }
+
+    return Status::OK();
+  }
+
+  Status Visit(const Decimal32Type& type) {
+    return VisitDecimal<Decimal32Type, Decimal32Array>(type);
+  }
+
+  Status Visit(const Decimal64Type& type) {
+    return VisitDecimal<Decimal64Type, Decimal64Array>(type);
+  }
+
+  Status Visit(const Decimal128Type& type) {
+    return VisitDecimal<Decimal128Type, Decimal128Array>(type);
+  }
+
+  Status Visit(const Decimal256Type& type) {
+    return VisitDecimal<Decimal256Type, Decimal256Array>(type);
+  }
+
+  template <typename T>
+  enable_if_t<is_list_like_type<T>::value || is_list_view_type<T>::value, Status> Visit(
+      const T& type) {
+    if (!ListTypeSupported(*type.value_type())) {
+      return Status::NotImplemented(
+          "Not implemented type for conversion from List to Pandas: ",
+          type.value_type()->ToString());
+    }
+    return ConvertListsLike<T>(options, data, out_values);
+  }
+
+  Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); }
+
+  Status Visit(const StructType& type) {
+    return ConvertStruct(options, data, out_values);
+  }
+
+  template <typename Type>
+  enable_if_t<is_floating_type<Type>::value ||
+                  std::is_same<DictionaryType, Type>::value ||
+                  std::is_same<DurationType, Type>::value ||
+                  std::is_same<RunEndEncodedType, Type>::value ||
+                  std::is_same<ExtensionType, Type>::value ||
+                  (std::is_base_of<IntervalType, Type>::value &&
+                   !std::is_same<MonthDayNanoIntervalType, Type>::value) ||
+                  std::is_base_of<UnionType, Type>::value,
+              Status>
+  Visit(const Type& type) {
+    return Status::NotImplemented("No implemented conversion to object dtype: ",
+                                  type.ToString());
+  }
+};
+
+class ObjectWriter : public TypedPandasWriter<NPY_OBJECT> {
+ public:
+  using TypedPandasWriter<NPY_OBJECT>::TypedPandasWriter;
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    PyAcquireGIL lock;
+    ObjectWriterVisitor visitor{this->options_, *data,
+                                this->GetBlockColumnStart(rel_placement)};
+    return VisitTypeInline(*data->type(), &visitor);
+  }
+};
+
+static inline bool IsNonNullContiguous(const ChunkedArray& data) {
+  return data.num_chunks() == 1 && data.null_count() == 0;
+}
+
+template <int NPY_TYPE>
+class IntWriter : public TypedPandasWriter<NPY_TYPE> {
+ public:
+  using ArrowType = typename npy_traits<NPY_TYPE>::TypeClass;
+  using TypedPandasWriter<NPY_TYPE>::TypedPandasWriter;
+
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    return IsNonNullContiguous(data);
+  }
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    RETURN_NOT_OK(this->CheckTypeExact(*data->type(), ArrowType::type_id));
+    ConvertIntegerNoNullsSameType<typename ArrowType::c_type>(
+        this->options_, *data, this->GetBlockColumnStart(rel_placement));
+    return Status::OK();
+  }
+};
+
+template <int NPY_TYPE>
+class FloatWriter : public TypedPandasWriter<NPY_TYPE> {
+ public:
+  using ArrowType = typename npy_traits<NPY_TYPE>::TypeClass;
+  using TypedPandasWriter<NPY_TYPE>::TypedPandasWriter;
+  using T = typename ArrowType::c_type;
+
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    return IsNonNullContiguous(data) && data.type()->id() == ArrowType::type_id;
+  }
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    Type::type in_type = data->type()->id();
+    auto out_values = this->GetBlockColumnStart(rel_placement);
+
+#define INTEGER_CASE(IN_TYPE)                                             \
+  ConvertIntegerWithNulls<IN_TYPE, T>(this->options_, *data, out_values); \
+  break;
+
+    switch (in_type) {
+      case Type::UINT8:
+        INTEGER_CASE(uint8_t);
+      case Type::INT8:
+        INTEGER_CASE(int8_t);
+      case Type::UINT16:
+        INTEGER_CASE(uint16_t);
+      case Type::INT16:
+        INTEGER_CASE(int16_t);
+      case Type::UINT32:
+        INTEGER_CASE(uint32_t);
+      case Type::INT32:
+        INTEGER_CASE(int32_t);
+      case Type::UINT64:
+        INTEGER_CASE(uint64_t);
+      case Type::INT64:
+        INTEGER_CASE(int64_t);
+      case Type::HALF_FLOAT:
+        ConvertNumericNullableCast(*data, npy_traits<NPY_TYPE>::na_sentinel, out_values);
+      case Type::FLOAT:
+        ConvertNumericNullableCast(*data, npy_traits<NPY_TYPE>::na_sentinel, out_values);
+        break;
+      case Type::DOUBLE:
+        ConvertNumericNullableCast(*data, npy_traits<NPY_TYPE>::na_sentinel, out_values);
+        break;
+      default:
+        return Status::NotImplemented("Cannot write Arrow data of type ",
+                                      data->type()->ToString(),
+                                      " to a Pandas floating point block");
+    }
+
+#undef INTEGER_CASE
+
+    return Status::OK();
+  }
+};
+
+using UInt8Writer = IntWriter<NPY_UINT8>;
+using Int8Writer = IntWriter<NPY_INT8>;
+using UInt16Writer = IntWriter<NPY_UINT16>;
+using Int16Writer = IntWriter<NPY_INT16>;
+using UInt32Writer = IntWriter<NPY_UINT32>;
+using Int32Writer = IntWriter<NPY_INT32>;
+using UInt64Writer = IntWriter<NPY_UINT64>;
+using Int64Writer = IntWriter<NPY_INT64>;
+using Float16Writer = FloatWriter<NPY_FLOAT16>;
+using Float32Writer = FloatWriter<NPY_FLOAT32>;
+using Float64Writer = FloatWriter<NPY_FLOAT64>;
+
+class BoolWriter : public TypedPandasWriter<NPY_BOOL> {
+ public:
+  using TypedPandasWriter<NPY_BOOL>::TypedPandasWriter;
+
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    RETURN_NOT_OK(
+        CheckNoZeroCopy("Zero copy conversions not possible with "
+                        "boolean types"));
+    RETURN_NOT_OK(EnsureAllocated());
+    return CopyInto(data, /*rel_placement=*/0);
+  }
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    RETURN_NOT_OK(this->CheckTypeExact(*data->type(), Type::BOOL));
+    auto out_values = this->GetBlockColumnStart(rel_placement);
+    for (int c = 0; c < data->num_chunks(); c++) {
+      const auto& arr = checked_cast<const BooleanArray&>(*data->chunk(c));
+      for (int64_t i = 0; i < arr.length(); ++i) {
+        *out_values++ = static_cast<uint8_t>(arr.Value(i));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+// ----------------------------------------------------------------------
+// Date / timestamp types
+
+template <typename T, int64_t SHIFT>
+inline void ConvertDatetime(const ChunkedArray& data, int64_t* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
+
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values++ = arr.IsNull(i) ? kPandasTimestampNull
+                                    : (static_cast<int64_t>(in_values[i]) * SHIFT);
+    }
+  }
+}
+
+template <typename T, int SHIFT>
+void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) {
+  for (int c = 0; c < data.num_chunks(); c++) {
+    const auto& arr = *data.chunk(c);
+    const T* in_values = GetPrimitiveValues<T>(arr);
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      *out_values++ = arr.IsNull(i) ? kPandasTimestampNull
+                                    : static_cast<int64_t>(in_values[i]) / SHIFT;
+    }
+  }
+}
+
+class DatetimeDayWriter : public TypedPandasWriter<NPY_DATETIME> {
+ public:
+  using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    const auto& type = checked_cast<const DateType&>(*data->type());
+    switch (type.unit()) {
+      case DateUnit::DAY:
+        ConvertDatesShift<int32_t, 1LL>(*data, out_values);
+        break;
+      case DateUnit::MILLI:
+        ConvertDatesShift<int64_t, 86400000LL>(*data, out_values);
+        break;
+    }
+    return Status::OK();
+  }
+
+ protected:
+  Status Allocate() override {
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME));
+    SetDatetimeUnit(NPY_FR_D);
+    return Status::OK();
+  }
+};
+
+template <TimeUnit::type UNIT>
+class DatetimeWriter : public TypedPandasWriter<NPY_DATETIME> {
+ public:
+  using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
+
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    if (data.type()->id() == Type::TIMESTAMP) {
+      const auto& type = checked_cast<const TimestampType&>(*data.type());
+      return IsNonNullContiguous(data) && type.unit() == UNIT;
+    } else {
+      return false;
+    }
+  }
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
+    ARROW_DCHECK_EQ(UNIT, ts_type.unit()) << "Should only call instances of this writer "
+                                          << "with arrays of the correct unit";
+    ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull,
+                                    this->GetBlockColumnStart(rel_placement));
+    return Status::OK();
+  }
+
+ protected:
+  Status Allocate() override {
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME));
+    SetDatetimeUnit(internal::NumPyFrequency(UNIT));
+    return Status::OK();
+  }
+};
+
+using DatetimeSecondWriter = DatetimeWriter<TimeUnit::SECOND>;
+
+class DatetimeMilliWriter : public DatetimeWriter<TimeUnit::MILLI> {
+ public:
+  using DatetimeWriter<TimeUnit::MILLI>::DatetimeWriter;
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    Type::type type = data->type()->id();
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    if (type == Type::DATE32) {
+      // Convert from days since epoch to datetime64[ms]
+      ConvertDatetime<int32_t, 86400000L>(*data, out_values);
+    } else if (type == Type::DATE64) {
+      ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+    } else {
+      const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
+      ARROW_DCHECK_EQ(TimeUnit::MILLI, ts_type.unit())
+          << "Should only call instances of this writer "
+          << "with arrays of the correct unit";
+      ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+    }
+    return Status::OK();
+  }
+};
+
+using DatetimeMicroWriter = DatetimeWriter<TimeUnit::MICRO>;
+
+class DatetimeNanoWriter : public DatetimeWriter<TimeUnit::NANO> {
+ public:
+  using DatetimeWriter<TimeUnit::NANO>::DatetimeWriter;
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    Type::type type = data->type()->id();
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    compute::ExecContext ctx(options_.pool);
+    compute::CastOptions options;
+    if (options_.safe_cast) {
+      options = compute::CastOptions::Safe();
+    } else {
+      options = compute::CastOptions::Unsafe();
+    }
+    Datum out;
+    auto target_type = timestamp(TimeUnit::NANO);
+
+    if (type == Type::DATE32) {
+      // Convert from days since epoch to datetime64[ns]
+      ConvertDatetime<int32_t, kNanosecondsInDay>(*data, out_values);
+    } else if (type == Type::DATE64) {
+      // Date64Type is millisecond timestamp stored as int64_t
+      // TODO(wesm): Do we want to make sure to zero out the milliseconds?
+      ConvertDatetime<int64_t, 1000000L>(*data, out_values);
+    } else if (type == Type::TIMESTAMP) {
+      const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
+
+      if (ts_type.unit() == TimeUnit::NANO) {
+        ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+      } else if (ts_type.unit() == TimeUnit::MICRO || ts_type.unit() == TimeUnit::MILLI ||
+                 ts_type.unit() == TimeUnit::SECOND) {
+        ARROW_ASSIGN_OR_RAISE(out, compute::Cast(data, target_type, options, &ctx));
+        ConvertNumericNullable<int64_t>(*out.chunked_array(), kPandasTimestampNull,
+                                        out_values);
+      } else {
+        return Status::NotImplemented("Unsupported time unit");
+      }
+    } else {
+      return Status::NotImplemented("Cannot write Arrow data of type ",
+                                    data->type()->ToString(),
+                                    " to a Pandas datetime block.");
+    }
+    return Status::OK();
+  }
+};
+
+template <typename BASE>
+class DatetimeTZWriter : public BASE {
+ public:
+  DatetimeTZWriter(const PandasOptions& options, const std::string& timezone,
+                   int64_t num_rows)
+      : BASE(options, num_rows, 1), timezone_(timezone) {}
+
+ protected:
+  Status GetResultBlock(PyObject** out) override {
+    RETURN_NOT_OK(this->MakeBlock1D());
+    *out = this->block_arr_.obj();
+    return Status::OK();
+  }
+
+  Status AddResultMetadata(PyObject* result) override {
+    PyObject* py_tz = PyUnicode_FromStringAndSize(
+        timezone_.c_str(), static_cast<Py_ssize_t>(timezone_.size()));
+    RETURN_IF_PYERROR();
+    PyDict_SetItemString(result, "timezone", py_tz);
+    Py_DECREF(py_tz);
+    return Status::OK();
+  }
+
+ private:
+  std::string timezone_;
+};
+
+using DatetimeSecondTZWriter = DatetimeTZWriter<DatetimeSecondWriter>;
+using DatetimeMilliTZWriter = DatetimeTZWriter<DatetimeMilliWriter>;
+using DatetimeMicroTZWriter = DatetimeTZWriter<DatetimeMicroWriter>;
+using DatetimeNanoTZWriter = DatetimeTZWriter<DatetimeNanoWriter>;
+
+template <TimeUnit::type UNIT>
+class TimedeltaWriter : public TypedPandasWriter<NPY_TIMEDELTA> {
+ public:
+  using TypedPandasWriter<NPY_TIMEDELTA>::TypedPandasWriter;
+
+  Status AllocateTimedelta(int ndim) {
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_TIMEDELTA, ndim));
+    SetDatetimeUnit(internal::NumPyFrequency(UNIT));
+    return Status::OK();
+  }
+
+  bool CanZeroCopy(const ChunkedArray& data) const override {
+    const auto& type = checked_cast<const DurationType&>(*data.type());
+    return IsNonNullContiguous(data) && type.unit() == UNIT;
+  }
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    const auto& type = checked_cast<const DurationType&>(*data->type());
+    ARROW_DCHECK_EQ(UNIT, type.unit()) << "Should only call instances of this writer "
+                                       << "with arrays of the correct unit";
+    ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull,
+                                    this->GetBlockColumnStart(rel_placement));
+    return Status::OK();
+  }
+
+ protected:
+  Status Allocate() override { return AllocateTimedelta(2); }
+};
+
+using TimedeltaSecondWriter = TimedeltaWriter<TimeUnit::SECOND>;
+using TimedeltaMilliWriter = TimedeltaWriter<TimeUnit::MILLI>;
+using TimedeltaMicroWriter = TimedeltaWriter<TimeUnit::MICRO>;
+
+class TimedeltaNanoWriter : public TimedeltaWriter<TimeUnit::NANO> {
+ public:
+  using TimedeltaWriter<TimeUnit::NANO>::TimedeltaWriter;
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    Type::type type = data->type()->id();
+    int64_t* out_values = this->GetBlockColumnStart(rel_placement);
+    if (type == Type::DURATION) {
+      const auto& ts_type = checked_cast<const DurationType&>(*data->type());
+      if (ts_type.unit() == TimeUnit::NANO) {
+        ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
+      } else if (ts_type.unit() == TimeUnit::MICRO) {
+        ConvertDatetime<int64_t, 1000L>(*data, out_values);
+      } else if (ts_type.unit() == TimeUnit::MILLI) {
+        ConvertDatetime<int64_t, 1000000L>(*data, out_values);
+      } else if (ts_type.unit() == TimeUnit::SECOND) {
+        ConvertDatetime<int64_t, 1000000000L>(*data, out_values);
+      } else {
+        return Status::NotImplemented("Unsupported time unit");
+      }
+    } else {
+      return Status::NotImplemented("Cannot write Arrow data of type ",
+                                    data->type()->ToString(),
+                                    " to a Pandas timedelta block.");
+    }
+    return Status::OK();
+  }
+};
+
+Status MakeZeroLengthArray(const std::shared_ptr<DataType>& type,
+                           std::shared_ptr<Array>* out) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  return builder->Finish(out);
+}
+
+bool NeedDictionaryUnification(const ChunkedArray& data) {
+  if (data.num_chunks() < 2) {
+    return false;
+  }
+  const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
+  for (int c = 1; c < data.num_chunks(); c++) {
+    const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
+    if (!(arr_first.dictionary()->Equals(arr.dictionary()))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename IndexType>
+class CategoricalWriter
+    : public TypedPandasWriter<arrow_traits<IndexType::type_id>::npy_type> {
+ public:
+  using TRAITS = arrow_traits<IndexType::type_id>;
+  using ArrayType = typename TypeTraits<IndexType>::ArrayType;
+  using T = typename TRAITS::T;
+
+  explicit CategoricalWriter(const PandasOptions& options, int64_t num_rows)
+      : TypedPandasWriter<TRAITS::npy_type>(options, num_rows, 1),
+        ordered_(false),
+        needs_copy_(false) {}
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    return Status::NotImplemented("categorical type");
+  }
+
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    const auto& dict_type = checked_cast<const DictionaryType&>(*data->type());
+    std::shared_ptr<Array> dict;
+    if (data->num_chunks() == 0) {
+      // no dictionary values => create empty array
+      RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1));
+      RETURN_NOT_OK(MakeZeroLengthArray(dict_type.value_type(), &dict));
+    } else {
+      ARROW_DCHECK_EQ(IndexType::type_id, dict_type.index_type()->id());
+      RETURN_NOT_OK(WriteIndices(*data, &dict));
+    }
+
+    PyObject* pydict;
+    RETURN_NOT_OK(ConvertArrayToPandas(this->options_, dict, nullptr, &pydict));
+    dictionary_.reset(pydict);
+    ordered_ = dict_type.ordered();
+    return Status::OK();
+  }
+
+  Status Write(std::shared_ptr<ChunkedArray> data, int64_t abs_placement,
+               int64_t rel_placement) override {
+    RETURN_NOT_OK(this->EnsurePlacementAllocated());
+    RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr));
+    this->placement_data_[rel_placement] = abs_placement;
+    return Status::OK();
+  }
+
+  Status GetSeriesResult(PyObject** out) override {
+    PyAcquireGIL lock;
+
+    PyObject* result = PyDict_New();
+    RETURN_IF_PYERROR();
+
+    // Expected single array dictionary layout
+    PyDict_SetItemString(result, "indices", this->block_arr_.obj());
+    RETURN_IF_PYERROR();
+    RETURN_NOT_OK(AddResultMetadata(result));
+
+    *out = result;
+    return Status::OK();
+  }
+
+ protected:
+  Status AddResultMetadata(PyObject* result) override {
+    PyDict_SetItemString(result, "dictionary", dictionary_.obj());
+    PyObject* py_ordered = ordered_ ? Py_True : Py_False;
+    Py_INCREF(py_ordered);
+    PyDict_SetItemString(result, "ordered", py_ordered);
+    return Status::OK();
+  }
+
+  Status WriteIndicesUniform(const ChunkedArray& data) {
+    // For unsigned types, upcast to signed since pandas uses -1 for nulls
+    // uint8 to int16, uint16 to int32, uint32 to int64, signed types unchanged
+    using OutputType = std::conditional_t<
+        std::is_same<T, uint8_t>::value, int16_t,
+        std::conditional_t<
+            std::is_same<T, uint16_t>::value, int32_t,
+            std::conditional_t<std::is_same<T, uint32_t>::value, int64_t, T>>>;
+    const int npy_output_type = std::is_same<OutputType, int16_t>::value   ? NPY_INT16
+                                : std::is_same<OutputType, int32_t>::value ? NPY_INT32
+                                : std::is_same<OutputType, int64_t>::value
+                                    ? NPY_INT64
+                                    : TRAITS::npy_type;
+
+    RETURN_NOT_OK(this->AllocateNDArray(npy_output_type, 1));
+    auto out_values = reinterpret_cast<OutputType*>(this->block_data_);
+
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
+      const auto& indices = checked_cast<const ArrayType&>(*arr.indices());
+      auto values = reinterpret_cast<const T*>(indices.raw_values());
+
+      RETURN_NOT_OK(CheckIndexBounds(*indices.data(), arr.dictionary()->length()));
+      // Null is -1 in CategoricalBlock
+      for (int i = 0; i < arr.length(); ++i) {
+        if (indices.IsValid(i)) {
+          *out_values++ = static_cast<OutputType>(values[i]);
+        } else {
+          *out_values++ = -1;
+        }
+      }
+    }
+    return Status::OK();
+  }
+
+  Status WriteIndicesVarying(const ChunkedArray& data, std::shared_ptr<Array>* out_dict) {
+    // Yield int32 indices to allow for dictionary outgrowing the current index
+    // type
+    RETURN_NOT_OK(this->AllocateNDArray(NPY_INT32, 1));
+    auto out_values = reinterpret_cast<int32_t*>(this->block_data_);
+
+    const auto& dict_type = checked_cast<const DictionaryType&>(*data.type());
+
+    ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(dict_type.value_type(),
+                                                                this->options_.pool));
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = checked_cast<const DictionaryArray&>(*data.chunk(c));
+      const auto& indices = checked_cast<const ArrayType&>(*arr.indices());
+      auto values = reinterpret_cast<const T*>(indices.raw_values());
+
+      std::shared_ptr<Buffer> transpose_buffer;
+      RETURN_NOT_OK(unifier->Unify(*arr.dictionary(), &transpose_buffer));
+
+      auto transpose = reinterpret_cast<const int32_t*>(transpose_buffer->data());
+      int64_t dict_length = arr.dictionary()->length();
+
+      RETURN_NOT_OK(CheckIndexBounds(*indices.data(), dict_length));
+
+      // Null is -1 in CategoricalBlock
+      for (int i = 0; i < arr.length(); ++i) {
+        if (indices.IsValid(i)) {
+          *out_values++ = transpose[values[i]];
+        } else {
+          *out_values++ = -1;
+        }
+      }
+    }
+
+    std::shared_ptr<DataType> unused_type;
+    return unifier->GetResult(&unused_type, out_dict);
+  }
+
+  Status WriteIndices(const ChunkedArray& data, std::shared_ptr<Array>* out_dict) {
+    ARROW_DCHECK_GT(data.num_chunks(), 0);
+
+    // Sniff the first chunk
+    const auto& arr_first = checked_cast<const DictionaryArray&>(*data.chunk(0));
+    const auto indices_first = std::static_pointer_cast<ArrayType>(arr_first.indices());
+
+    // For unsigned types, we need to convert to signed for pandas compatibility
+    // even when there are no nulls, so we skip the fast path
+    const bool is_unsigned = std::is_unsigned<T>::value;
+
+    if (data.num_chunks() == 1 && indices_first->null_count() == 0 && !is_unsigned) {
+      RETURN_NOT_OK(
+          CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length()));
+
+      PyObject* wrapped;
+      npy_intp dims[1] = {static_cast<npy_intp>(this->num_rows_)};
+      RETURN_NOT_OK(MakeNumPyView(indices_first, /*py_ref=*/nullptr, TRAITS::npy_type,
+                                  /*ndim=*/1, dims, &wrapped));
+      this->SetBlockData(wrapped);
+      *out_dict = arr_first.dictionary();
+    } else {
+      RETURN_NOT_OK(this->CheckNotZeroCopyOnly(data));
+      if (NeedDictionaryUnification(data)) {
+        RETURN_NOT_OK(WriteIndicesVarying(data, out_dict));
+      } else {
+        RETURN_NOT_OK(WriteIndicesUniform(data));
+        *out_dict = arr_first.dictionary();
+      }
+    }
+    return Status::OK();
+  }
+
+  OwnedRefNoGIL dictionary_;
+  bool ordered_;
+  bool needs_copy_;
+};
+
+class ExtensionWriter : public PandasWriter {
+ public:
+  using PandasWriter::PandasWriter;
+
+  Status Allocate() override {
+    // no-op
+    return Status::OK();
+  }
+
+  Status TransferSingle(std::shared_ptr<ChunkedArray> data, PyObject* py_ref) override {
+    PyAcquireGIL lock;
+    PyObject* py_array;
+    py_array = wrap_chunked_array(data);
+    py_array_.reset(py_array);
+
+    return Status::OK();
+  }
+
+  Status CopyInto(std::shared_ptr<ChunkedArray> data, int64_t rel_placement) override {
+    return TransferSingle(data, nullptr);
+  }
+
+  Status GetDataFrameResult(PyObject** out) override {
+    PyAcquireGIL lock;
+    PyObject* result = PyDict_New();
+    RETURN_IF_PYERROR();
+
+    PyDict_SetItemString(result, "py_array", py_array_.obj());
+    PyDict_SetItemString(result, "placement", placement_arr_.obj());
+    *out = result;
+    return Status::OK();
+  }
+
+  Status GetSeriesResult(PyObject** out) override {
+    *out = py_array_.detach();
+    return Status::OK();
+  }
+
+ protected:
+  OwnedRefNoGIL py_array_;
+};
+
+Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type,
+                  const DataType& type, int64_t num_rows, int num_columns,
+                  std::shared_ptr<PandasWriter>* writer) {
+#define BLOCK_CASE(NAME, TYPE)                                        \
+  case PandasWriter::NAME:                                            \
+    *writer = std::make_shared<TYPE>(options, num_rows, num_columns); \
+    break;
+
+#define CATEGORICAL_CASE(TYPE)                                              \
+  case TYPE::type_id:                                                       \
+    *writer = std::make_shared<CategoricalWriter<TYPE>>(options, num_rows); \
+    break;
+
+#define TZ_CASE(NAME, TYPE)                                                  \
+  case PandasWriter::NAME: {                                                 \
+    const auto& ts_type = checked_cast<const TimestampType&>(type);          \
+    *writer = std::make_shared<TYPE>(options, ts_type.timezone(), num_rows); \
+  } break;
+
+  switch (writer_type) {
+    case PandasWriter::CATEGORICAL: {
+      const auto& index_type = *checked_cast<const DictionaryType&>(type).index_type();
+      switch (index_type.id()) {
+        CATEGORICAL_CASE(Int8Type);
+        CATEGORICAL_CASE(Int16Type);
+        CATEGORICAL_CASE(Int32Type);
+        CATEGORICAL_CASE(Int64Type);
+        CATEGORICAL_CASE(UInt8Type);
+        CATEGORICAL_CASE(UInt16Type);
+        CATEGORICAL_CASE(UInt32Type);
+        case Type::UINT64:
+          return Status::TypeError(
+              "Converting UInt64 dictionary indices to pandas is not supported.");
+        default:
+          // Unreachable
+          ARROW_DCHECK(false);
+          break;
+      }
+    } break;
+    case PandasWriter::EXTENSION:
+      *writer = std::make_shared<ExtensionWriter>(options, num_rows, num_columns);
+      break;
+      BLOCK_CASE(OBJECT, ObjectWriter);
+      BLOCK_CASE(UINT8, UInt8Writer);
+      BLOCK_CASE(INT8, Int8Writer);
+      BLOCK_CASE(UINT16, UInt16Writer);
+      BLOCK_CASE(INT16, Int16Writer);
+      BLOCK_CASE(UINT32, UInt32Writer);
+      BLOCK_CASE(INT32, Int32Writer);
+      BLOCK_CASE(UINT64, UInt64Writer);
+      BLOCK_CASE(INT64, Int64Writer);
+      BLOCK_CASE(HALF_FLOAT, Float16Writer);
+      BLOCK_CASE(FLOAT, Float32Writer);
+      BLOCK_CASE(DOUBLE, Float64Writer);
+      BLOCK_CASE(BOOL, BoolWriter);
+      BLOCK_CASE(DATETIME_DAY, DatetimeDayWriter);
+      BLOCK_CASE(DATETIME_SECOND, DatetimeSecondWriter);
+      BLOCK_CASE(DATETIME_MILLI, DatetimeMilliWriter);
+      BLOCK_CASE(DATETIME_MICRO, DatetimeMicroWriter);
+      BLOCK_CASE(DATETIME_NANO, DatetimeNanoWriter);
+      BLOCK_CASE(TIMEDELTA_SECOND, TimedeltaSecondWriter);
+      BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter);
+      BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter);
+      BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter);
+      TZ_CASE(DATETIME_SECOND_TZ, DatetimeSecondTZWriter);
+      TZ_CASE(DATETIME_MILLI_TZ, DatetimeMilliTZWriter);
+      TZ_CASE(DATETIME_MICRO_TZ, DatetimeMicroTZWriter);
+      TZ_CASE(DATETIME_NANO_TZ, DatetimeNanoTZWriter);
+    default:
+      return Status::NotImplemented("Unsupported block type");
+  }
+
+#undef BLOCK_CASE
+#undef CATEGORICAL_CASE
+
+  return Status::OK();
+}
+
+static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& options,
+                                  PandasWriter::type* output_type) {
+#define INTEGER_CASE(NAME)                                                             \
+  *output_type =                                                                       \
+      data.null_count() > 0                                                            \
+          ? options.integer_object_nulls ? PandasWriter::OBJECT : PandasWriter::DOUBLE \
+          : PandasWriter::NAME;                                                        \
+  break;
+
+  switch (data.type()->id()) {
+    case Type::BOOL:
+      *output_type = data.null_count() > 0 ? PandasWriter::OBJECT : PandasWriter::BOOL;
+      break;
+    case Type::UINT8:
+      INTEGER_CASE(UINT8);
+    case Type::INT8:
+      INTEGER_CASE(INT8);
+    case Type::UINT16:
+      INTEGER_CASE(UINT16);
+    case Type::INT16:
+      INTEGER_CASE(INT16);
+    case Type::UINT32:
+      INTEGER_CASE(UINT32);
+    case Type::INT32:
+      INTEGER_CASE(INT32);
+    case Type::UINT64:
+      INTEGER_CASE(UINT64);
+    case Type::INT64:
+      INTEGER_CASE(INT64);
+    case Type::HALF_FLOAT:
+      *output_type = PandasWriter::HALF_FLOAT;
+      break;
+    case Type::FLOAT:
+      *output_type = PandasWriter::FLOAT;
+      break;
+    case Type::DOUBLE:
+      *output_type = PandasWriter::DOUBLE;
+      break;
+    case Type::STRING:        // fall through
+    case Type::LARGE_STRING:  // fall through
+    case Type::STRING_VIEW:   // fall through
+    case Type::BINARY:        // fall through
+    case Type::LARGE_BINARY:
+    case Type::BINARY_VIEW:
+    case Type::NA:                       // fall through
+    case Type::FIXED_SIZE_BINARY:        // fall through
+    case Type::STRUCT:                   // fall through
+    case Type::TIME32:                   // fall through
+    case Type::TIME64:                   // fall through
+    case Type::DECIMAL32:                // fall through
+    case Type::DECIMAL64:                // fall through
+    case Type::DECIMAL128:               // fall through
+    case Type::DECIMAL256:               // fall through
+    case Type::INTERVAL_MONTH_DAY_NANO:  // fall through
+      *output_type = PandasWriter::OBJECT;
+      break;
+    case Type::DATE32:
+      if (options.date_as_object) {
+        *output_type = PandasWriter::OBJECT;
+      } else if (options.coerce_temporal_nanoseconds) {
+        *output_type = PandasWriter::DATETIME_NANO;
+      } else if (options.to_numpy) {
+        // Numpy supports Day, but Pandas does not
+        *output_type = PandasWriter::DATETIME_DAY;
+      } else {
+        *output_type = PandasWriter::DATETIME_MILLI;
+      }
+      break;
+    case Type::DATE64:
+      if (options.date_as_object) {
+        *output_type = PandasWriter::OBJECT;
+      } else if (options.coerce_temporal_nanoseconds) {
+        *output_type = PandasWriter::DATETIME_NANO;
+      } else {
+        *output_type = PandasWriter::DATETIME_MILLI;
+      }
+      break;
+    case Type::TIMESTAMP: {
+      const auto& ts_type = checked_cast<const TimestampType&>(*data.type());
+      if (options.timestamp_as_object && ts_type.unit() != TimeUnit::NANO) {
+        // Nanoseconds are never out of bounds for pandas, so in that case
+        // we don't convert to object
+        *output_type = PandasWriter::OBJECT;
+      } else if (options.coerce_temporal_nanoseconds) {
+        if (!ts_type.timezone().empty()) {
+          *output_type = PandasWriter::DATETIME_NANO_TZ;
+        } else {
+          *output_type = PandasWriter::DATETIME_NANO;
+        }
+      } else {
+        if (!ts_type.timezone().empty()) {
+          switch (ts_type.unit()) {
+            case TimeUnit::SECOND:
+              *output_type = PandasWriter::DATETIME_SECOND_TZ;
+              break;
+            case TimeUnit::MILLI:
+              *output_type = PandasWriter::DATETIME_MILLI_TZ;
+              break;
+            case TimeUnit::MICRO:
+              *output_type = PandasWriter::DATETIME_MICRO_TZ;
+              break;
+            case TimeUnit::NANO:
+              *output_type = PandasWriter::DATETIME_NANO_TZ;
+              break;
+          }
+        } else {
+          switch (ts_type.unit()) {
+            case TimeUnit::SECOND:
+              *output_type = PandasWriter::DATETIME_SECOND;
+              break;
+            case TimeUnit::MILLI:
+              *output_type = PandasWriter::DATETIME_MILLI;
+              break;
+            case TimeUnit::MICRO:
+              *output_type = PandasWriter::DATETIME_MICRO;
+              break;
+            case TimeUnit::NANO:
+              *output_type = PandasWriter::DATETIME_NANO;
+              break;
+          }
+        }
+      }
+    } break;
+    case Type::DURATION: {
+      const auto& dur_type = checked_cast<const DurationType&>(*data.type());
+      if (options.coerce_temporal_nanoseconds) {
+        *output_type = PandasWriter::TIMEDELTA_NANO;
+      } else {
+        switch (dur_type.unit()) {
+          case TimeUnit::SECOND:
+            *output_type = PandasWriter::TIMEDELTA_SECOND;
+            break;
+          case TimeUnit::MILLI:
+            *output_type = PandasWriter::TIMEDELTA_MILLI;
+            break;
+          case TimeUnit::MICRO:
+            *output_type = PandasWriter::TIMEDELTA_MICRO;
+            break;
+          case TimeUnit::NANO:
+            *output_type = PandasWriter::TIMEDELTA_NANO;
+            break;
+        }
+      }
+    } break;
+    case Type::FIXED_SIZE_LIST:
+    case Type::LIST:
+    case Type::LARGE_LIST:
+    case Type::LIST_VIEW:
+    case Type::LARGE_LIST_VIEW:
+    case Type::MAP: {
+      auto list_type = std::static_pointer_cast<BaseListType>(data.type());
+      if (!ListTypeSupported(*list_type->value_type())) {
+        return Status::NotImplemented("Not implemented type for Arrow list to pandas: ",
+                                      list_type->value_type()->ToString());
+      }
+      *output_type = PandasWriter::OBJECT;
+    } break;
+    case Type::DICTIONARY:
+      *output_type = PandasWriter::CATEGORICAL;
+      break;
+    case Type::EXTENSION:
+      *output_type = PandasWriter::EXTENSION;
+      break;
+    default:
+      return Status::NotImplemented(
+          "No known equivalent Pandas block for Arrow data of type ",
+          data.type()->ToString(), " is known.");
+  }
+  return Status::OK();
+}
+
+// Construct the exact pandas "BlockManager" memory layout
+//
+// * For each column determine the correct output pandas type
+// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output
+// * Allocate  block placement arrays
+// * Write Arrow columns out into each slice of memory; populate block
+// * placement arrays as we go
+class PandasBlockCreator {
+ public:
+  using WriterMap = std::unordered_map<int, std::shared_ptr<PandasWriter>>;
+
+  explicit PandasBlockCreator(const PandasOptions& options, FieldVector fields,
+                              ChunkedArrayVector arrays)
+      : options_(options), fields_(std::move(fields)), arrays_(std::move(arrays)) {
+    num_columns_ = static_cast<int>(arrays_.size());
+    if (num_columns_ > 0) {
+      num_rows_ = arrays_[0]->length();
+    }
+    column_block_placement_.resize(num_columns_);
+  }
+  virtual ~PandasBlockCreator() = default;
+
+  virtual Status Convert(PyObject** out) = 0;
+
+  Status AppendBlocks(const WriterMap& blocks, PyObject* list) {
+    for (const auto& it : blocks) {
+      PyObject* item;
+      RETURN_NOT_OK(it.second->GetDataFrameResult(&item));
+      if (PyList_Append(list, item) < 0) {
+        RETURN_IF_PYERROR();
+      }
+
+      // ARROW-1017; PyList_Append increments object refcount
+      Py_DECREF(item);
+    }
+    return Status::OK();
+  }
+
+ protected:
+  PandasOptions options_;
+
+  FieldVector fields_;
+  ChunkedArrayVector arrays_;
+  int num_columns_;
+  int64_t num_rows_;
+
+  // column num -> relative placement within internal block
+  std::vector<int> column_block_placement_;
+};
+
+// Helper function for extension chunked arrays
+// Constructing a storage chunked array of an extension chunked array
+std::shared_ptr<ChunkedArray> GetStorageChunkedArray(std::shared_ptr<ChunkedArray> arr) {
+  auto value_type = checked_cast<const ExtensionType&>(*arr->type()).storage_type();
+  ArrayVector storage_arrays;
+  for (int c = 0; c < arr->num_chunks(); c++) {
+    const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr->chunk(c));
+    storage_arrays.emplace_back(arr_ext.storage());
+  }
+  return std::make_shared<ChunkedArray>(std::move(storage_arrays), value_type);
+};
+
+// Helper function to decode RunEndEncodedArray
+Result<std::shared_ptr<ChunkedArray>> GetDecodedChunkedArray(
+    std::shared_ptr<ChunkedArray> arr) {
+  ARROW_ASSIGN_OR_RAISE(Datum decoded, compute::RunEndDecode(arr));
+  ARROW_DCHECK(decoded.is_chunked_array());
+  return decoded.chunked_array();
+};
+
+class ConsolidatedBlockCreator : public PandasBlockCreator {
+ public:
+  using PandasBlockCreator::PandasBlockCreator;
+
+  Status Convert(PyObject** out) override {
+    column_types_.resize(num_columns_);
+    RETURN_NOT_OK(CreateBlocks());
+    RETURN_NOT_OK(WriteTableToBlocks());
+    PyAcquireGIL lock;
+
+    PyObject* result = PyList_New(0);
+    RETURN_IF_PYERROR();
+
+    RETURN_NOT_OK(AppendBlocks(blocks_, result));
+    RETURN_NOT_OK(AppendBlocks(singleton_blocks_, result));
+
+    *out = result;
+    return Status::OK();
+  }
+
+  Status GetBlockType(int column_index, PandasWriter::type* out) {
+    if (options_.IsExtensionColumn(fields_[column_index]->name())) {
+      *out = PandasWriter::EXTENSION;
+      return Status::OK();
+    } else {
+      // In case of an extension array default to the storage type
+      if (arrays_[column_index]->type()->id() == Type::EXTENSION) {
+        arrays_[column_index] = GetStorageChunkedArray(arrays_[column_index]);
+      }
+      // In case of a RunEndEncodedArray default to the values type
+      else if (arrays_[column_index]->type()->id() == Type::RUN_END_ENCODED) {
+        ARROW_ASSIGN_OR_RAISE(arrays_[column_index],
+                              GetDecodedChunkedArray(arrays_[column_index]));
+      }
+      return GetPandasWriterType(*arrays_[column_index], options_, out);
+    }
+  }
+
+  Status CreateBlocks() {
+    for (int i = 0; i < num_columns_; ++i) {
+      const DataType& type = *arrays_[i]->type();
+      PandasWriter::type output_type;
+      RETURN_NOT_OK(GetBlockType(i, &output_type));
+
+      int block_placement = 0;
+      std::shared_ptr<PandasWriter> writer;
+      if (output_type == PandasWriter::CATEGORICAL ||
+          output_type == PandasWriter::DATETIME_SECOND_TZ ||
+          output_type == PandasWriter::DATETIME_MILLI_TZ ||
+          output_type == PandasWriter::DATETIME_MICRO_TZ ||
+          output_type == PandasWriter::DATETIME_NANO_TZ ||
+          output_type == PandasWriter::EXTENSION) {
+        RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_,
+                                 /*num_columns=*/1, &writer));
+        singleton_blocks_[i] = writer;
+      } else {
+        auto it = block_sizes_.find(output_type);
+        if (it != block_sizes_.end()) {
+          block_placement = it->second;
+          // Increment count
+          ++it->second;
+        } else {
+          // Add key to map
+          block_sizes_[output_type] = 1;
+        }
+      }
+      column_types_[i] = output_type;
+      column_block_placement_[i] = block_placement;
+    }
+
+    // Create normal non-categorical blocks
+    for (const auto& it : this->block_sizes_) {
+      PandasWriter::type output_type = static_cast<PandasWriter::type>(it.first);
+      std::shared_ptr<PandasWriter> block;
+      RETURN_NOT_OK(MakeWriter(this->options_, output_type, /*unused*/ *null(), num_rows_,
+                               it.second, &block));
+      this->blocks_[output_type] = block;
+    }
+    return Status::OK();
+  }
+
+  Status GetWriter(int i, std::shared_ptr<PandasWriter>* block) {
+    PandasWriter::type output_type = this->column_types_[i];
+    switch (output_type) {
+      case PandasWriter::CATEGORICAL:
+      case PandasWriter::DATETIME_SECOND_TZ:
+      case PandasWriter::DATETIME_MILLI_TZ:
+      case PandasWriter::DATETIME_MICRO_TZ:
+      case PandasWriter::DATETIME_NANO_TZ:
+      case PandasWriter::EXTENSION: {
+        auto it = this->singleton_blocks_.find(i);
+        if (it == this->singleton_blocks_.end()) {
+          return Status::KeyError("No block allocated");
+        }
+        *block = it->second;
+      } break;
+      default:
+        auto it = this->blocks_.find(output_type);
+        if (it == this->blocks_.end()) {
+          return Status::KeyError("No block allocated");
+        }
+        *block = it->second;
+        break;
+    }
+    return Status::OK();
+  }
+
+  Status WriteTableToBlocks() {
+    auto WriteColumn = [this](int i) {
+      std::shared_ptr<PandasWriter> block;
+      RETURN_NOT_OK(this->GetWriter(i, &block));
+      // ARROW-3789 Use std::move on the array to permit self-destructing
+      return block->Write(std::move(arrays_[i]), i, this->column_block_placement_[i]);
+    };
+
+    return OptionalParallelFor(options_.use_threads, num_columns_, WriteColumn);
+  }
+
+ private:
+  // column num -> block type id
+  std::vector<PandasWriter::type> column_types_;
+
+  // block type -> type count
+  std::unordered_map<int, int> block_sizes_;
+  std::unordered_map<int, const DataType*> block_types_;
+
+  // block type -> block
+  WriterMap blocks_;
+
+  WriterMap singleton_blocks_;
+};
+
+/// \brief Create blocks for pandas.DataFrame block manager using one block per
+/// column strategy. This permits some zero-copy optimizations as well as the
+/// ability for the table to "self-destruct" if selected by the user.
+class SplitBlockCreator : public PandasBlockCreator {
+ public:
+  using PandasBlockCreator::PandasBlockCreator;
+
+  Status GetWriter(int i, std::shared_ptr<PandasWriter>* writer) {
+    PandasWriter::type output_type = PandasWriter::OBJECT;
+    const DataType& type = *arrays_[i]->type();
+    if (options_.IsExtensionColumn(fields_[i]->name())) {
+      output_type = PandasWriter::EXTENSION;
+    } else {
+      // Null count needed to determine output type
+      RETURN_NOT_OK(GetPandasWriterType(*arrays_[i], options_, &output_type));
+    }
+    return MakeWriter(this->options_, output_type, type, num_rows_, 1, writer);
+  }
+
+  Status Convert(PyObject** out) override {
+    PyAcquireGIL lock;
+
+    PyObject* result = PyList_New(0);
+    RETURN_IF_PYERROR();
+
+    for (int i = 0; i < num_columns_; ++i) {
+      std::shared_ptr<PandasWriter> writer;
+      RETURN_NOT_OK(GetWriter(i, &writer));
+      // ARROW-3789 Use std::move on the array to permit self-destructing
+      RETURN_NOT_OK(writer->Write(std::move(arrays_[i]), i, /*rel_placement=*/0));
+
+      PyObject* item;
+      RETURN_NOT_OK(writer->GetDataFrameResult(&item));
+      if (PyList_Append(result, item) < 0) {
+        RETURN_IF_PYERROR();
+      }
+      // PyList_Append increments object refcount
+      Py_DECREF(item);
+    }
+
+    *out = result;
+    return Status::OK();
+  }
+
+ private:
+  std::vector<std::shared_ptr<PandasWriter>> writers_;
+};
+
+Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arrays,
+                           FieldVector* fields) {
+  std::vector<int> columns_to_encode;
+
+  // For Categorical conversions
+  auto EncodeColumn = [&](int j) {
+    int i = columns_to_encode[j];
+    if (options.zero_copy_only) {
+      return Status::Invalid("Need to dictionary encode a column, but ",
+                             "only zero-copy conversions allowed");
+    }
+    compute::ExecContext ctx(options.pool);
+    ARROW_ASSIGN_OR_RAISE(
+        Datum out, DictionaryEncode((*arrays)[i],
+                                    compute::DictionaryEncodeOptions::Defaults(), &ctx));
+    (*arrays)[i] = out.chunked_array();
+    (*fields)[i] = (*fields)[i]->WithType((*arrays)[i]->type());
+    return Status::OK();
+  };
+
+  if (options.HasCategoricalColumns()) {
+    for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
+      if ((*arrays)[i]->type()->id() != Type::DICTIONARY &&
+          options.IsCategoricalColumn((*fields)[i]->name())) {
+        columns_to_encode.push_back(i);
+      }
+    }
+  }
+  if (options.strings_to_categorical) {
+    for (int i = 0; i < static_cast<int>(arrays->size()); i++) {
+      if (is_base_binary_like((*arrays)[i]->type()->id()) ||
+          is_binary_view_like((*arrays)[i]->type()->id())) {
+        columns_to_encode.push_back(i);
+      }
+    }
+  }
+  return OptionalParallelFor(options.use_threads,
+                             static_cast<int>(columns_to_encode.size()), EncodeColumn);
+}
+
+}  // namespace
+
+Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
+                            PyObject* py_ref, PyObject** out) {
+  return ConvertChunkedArrayToPandas(
+      options, std::make_shared<ChunkedArray>(std::move(arr)), py_ref, out);
+}
+
+Status ConvertChunkedArrayToPandas(const PandasOptions& options,
+                                   std::shared_ptr<ChunkedArray> arr, PyObject* py_ref,
+                                   PyObject** out) {
+  if (options.decode_dictionaries && arr->type()->id() == Type::DICTIONARY) {
+    // XXX we should return an error as below if options.zero_copy_only
+    // is true, but that would break compatibility with existing tests.
+    const auto& dense_type =
+        checked_cast<const DictionaryType&>(*arr->type()).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &arr));
+    ARROW_DCHECK_NE(arr->type()->id(), Type::DICTIONARY);
+
+    // The original Python DictionaryArray won't own the memory anymore
+    // as we actually built a new array when we decoded the DictionaryArray
+    // thus let the final resulting numpy array own the memory through a Capsule
+    py_ref = nullptr;
+  }
+
+  if (options.strings_to_categorical && (is_base_binary_like(arr->type()->id()) ||
+                                         is_binary_view_like(arr->type()->id()))) {
+    if (options.zero_copy_only) {
+      return Status::Invalid("Need to dictionary encode a column, but ",
+                             "only zero-copy conversions allowed");
+    }
+    compute::ExecContext ctx(options.pool);
+    ARROW_ASSIGN_OR_RAISE(
+        Datum out,
+        DictionaryEncode(arr, compute::DictionaryEncodeOptions::Defaults(), &ctx));
+    arr = out.chunked_array();
+  }
+
+  PandasOptions modified_options = options;
+  modified_options.strings_to_categorical = false;
+
+  // ARROW-7596: We permit the hybrid Series/DataFrame code path to do zero copy
+  // optimizations that we do not allow in the default case when converting
+  // Table->DataFrame
+  modified_options.allow_zero_copy_blocks = true;
+
+  // In case of an extension array default to the storage type
+  if (arr->type()->id() == Type::EXTENSION) {
+    arr = GetStorageChunkedArray(arr);
+  }
+  // In case of a RunEndEncodedArray decode the array
+  else if (arr->type()->id() == Type::RUN_END_ENCODED) {
+    if (options.zero_copy_only) {
+      return Status::Invalid("Need to dencode a RunEndEncodedArray, but ",
+                             "only zero-copy conversions allowed");
+    }
+    ARROW_ASSIGN_OR_RAISE(arr, GetDecodedChunkedArray(arr));
+
+    // Because we built a new array when we decoded the RunEndEncodedArray
+    // the final resulting numpy array should own the memory through a Capsule
+    py_ref = nullptr;
+  }
+
+  PandasWriter::type output_type;
+  RETURN_NOT_OK(GetPandasWriterType(*arr, modified_options, &output_type));
+  if (options.decode_dictionaries) {
+    ARROW_DCHECK_NE(output_type, PandasWriter::CATEGORICAL);
+  }
+
+  std::shared_ptr<PandasWriter> writer;
+  RETURN_NOT_OK(MakeWriter(modified_options, output_type, *arr->type(), arr->length(),
+                           /*num_columns=*/1, &writer));
+  RETURN_NOT_OK(writer->TransferSingle(std::move(arr), py_ref));
+  return writer->GetSeriesResult(out);
+}
+
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+                            PyObject** out) {
+  ChunkedArrayVector arrays = table->columns();
+  FieldVector fields = table->fields();
+
+  // ARROW-3789: allow "self-destructing" by releasing references to columns as
+  // we convert them to pandas
+  table = nullptr;
+
+  RETURN_NOT_OK(ConvertCategoricals(options, &arrays, &fields));
+
+  PandasOptions modified_options = options;
+  modified_options.strings_to_categorical = false;
+  modified_options.categorical_columns.reset();
+
+  if (options.split_blocks) {
+    modified_options.allow_zero_copy_blocks = true;
+    SplitBlockCreator helper(modified_options, std::move(fields), std::move(arrays));
+    return helper.Convert(out);
+  } else {
+    ConsolidatedBlockCreator helper(modified_options, std::move(fields),
+                                    std::move(arrays));
+    return helper.Convert(out);
+  }
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/arrow_to_pandas.h b/pyarrow/src/arrow/python/arrow_to_pandas.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4e91e6cf5ab4469236c7eec3c8174d4b0d47427
--- /dev/null
+++ b/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -0,0 +1,165 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "arrow/memory_pool.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class Column;
+class DataType;
+class MemoryPool;
+class Status;
+class Table;
+
+namespace py {
+
+enum class MapConversionType {
+  DEFAULT,  // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas
+  LOSSY,    // report warnings when lossiness is encountered due to duplicate keys
+  STRICT_,  // raise a Python exception when lossiness is encountered due to duplicate
+            // keys
+};
+
+struct PandasOptions {
+  bool HasCategoricalColumns() const {
+    return categorical_columns && !categorical_columns->empty();
+  }
+
+  bool IsCategoricalColumn(const std::string& name) const {
+    return categorical_columns && categorical_columns->count(name);
+  }
+
+  bool HasExtensionColumns() const {
+    return extension_columns && !extension_columns->empty();
+  }
+
+  bool IsExtensionColumn(const std::string& name) const {
+    return extension_columns && extension_columns->count(name);
+  }
+
+  /// arrow::MemoryPool to use for memory allocations
+  MemoryPool* pool = default_memory_pool();
+
+  /// If true, we will convert all string columns to categoricals
+  bool strings_to_categorical = false;
+  bool zero_copy_only = false;
+  bool integer_object_nulls = false;
+  bool date_as_object = false;
+  bool timestamp_as_object = false;
+  bool use_threads = false;
+
+  /// Coerce all date and timestamp to datetime64[ns]
+  bool coerce_temporal_nanoseconds = false;
+
+  /// Used to maintain backwards compatibility for
+  /// timezone bugs (see ARROW-9528).  Should be removed
+  /// after Arrow 2.0 release.
+  bool ignore_timezone = false;
+
+  /// \brief If true, do not create duplicate PyObject versions of equal
+  /// objects. This only applies to immutable objects like strings or datetime
+  /// objects
+  bool deduplicate_objects = false;
+
+  /// \brief For certain data types, a cast is needed in order to store the
+  /// data in a pandas DataFrame or Series (e.g. timestamps are always stored
+  /// as nanoseconds in pandas). This option controls whether it is a safe
+  /// cast or not.
+  bool safe_cast = true;
+
+  /// \brief If true, create one block per column rather than consolidated
+  /// blocks (1 per data type). Do zero-copy wrapping when there are no
+  /// nulls. pandas currently will consolidate the blocks on its own, causing
+  /// increased memory use, so keep this in mind if you are working on a
+  /// memory-constrained situation.
+  bool split_blocks = false;
+
+  /// \brief If true, allow non-writable zero-copy views to be created for
+  /// single column blocks. This option is also used to provide zero copy for
+  /// Series data
+  bool allow_zero_copy_blocks = false;
+
+  /// \brief If true, attempt to deallocate buffers in passed Arrow object if
+  /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for
+  /// original context for this feature. Only currently implemented for Table
+  /// conversions
+  bool self_destruct = false;
+
+  /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to
+  /// Python association lists (list-of-tuples) in the same order as the Arrow
+  /// Map, as in [(key1, value1), (key2, value2), ...]
+  /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
+  /// This can change the ordering of (key, value) pairs, and will deduplicate
+  /// multiple keys, resulting in a possible loss of data.
+  /// If 'lossy', this key deduplication results in a warning printed
+  /// when detected. If 'strict', this instead results in an exception
+  /// being raised when detected.
+  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
+  // Used internally for nested arrays.
+  bool decode_dictionaries = false;
+
+  // Columns that should be casted to categorical
+  //
+  // This is wrapped in a shared_ptr because this struct is copied internally for
+  // each column or nested field (see GH-47861).
+  std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;
+
+  // Columns that should be passed through to be converted to
+  // ExtensionArray/Block
+  std::shared_ptr<const std::unordered_set<std::string>> extension_columns;
+
+  // Used internally to decipher between to_numpy() and to_pandas() when
+  // the expected output differs
+  bool to_numpy = false;
+};
+
+ARROW_PYTHON_EXPORT
+Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
+                            PyObject* py_ref, PyObject** out);
+
+ARROW_PYTHON_EXPORT
+Status ConvertChunkedArrayToPandas(const PandasOptions& options,
+                                   std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
+                                   PyObject** out);
+
+// Convert a whole table as efficiently as possible to a pandas.DataFrame.
+//
+// The returned Python object is a list of tuples consisting of the exact 2D
+// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
+//
+// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
+ARROW_PYTHON_EXPORT
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+                            PyObject** out);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/arrow_to_python_internal.h b/pyarrow/src/arrow/python/arrow_to_python_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..514cda320012316b1f9bc04a76c45159dc5bd181
--- /dev/null
+++ b/pyarrow/src/arrow/python/arrow_to_python_internal.h
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array.h"
+#include "arrow/python/platform.h"
+
+namespace arrow {
+namespace py {
+namespace internal {
+// TODO(ARROW-12976):  See if we can refactor Pandas ObjectWriter logic
+// to the .cc file and move this there as well if we can.
+
+// Converts array to a sequency of python objects.
+template <typename ArrayType, typename WriteValue, typename Assigner>
+inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
+                                Assigner out_values) {
+  // TODO(ARROW-12976): Use visitor here?
+  const bool has_nulls = arr.null_count() > 0;
+  for (int64_t i = 0; i < arr.length(); ++i) {
+    if (has_nulls && arr.IsNull(i)) {
+      Py_INCREF(Py_None);
+      *out_values = Py_None;
+    } else {
+      RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
+    }
+    ++out_values;
+  }
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/async.h b/pyarrow/src/arrow/python/async.h
new file mode 100644
index 0000000000000000000000000000000000000000..1568d21938e6e79e724d957120e68a7576ba9c2a
--- /dev/null
+++ b/pyarrow/src/arrow/python/async.h
@@ -0,0 +1,60 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+
+#include "arrow/python/common.h"
+#include "arrow/status.h"
+#include "arrow/util/future.h"
+
+namespace arrow::py {
+
+/// \brief Bind a Python callback to an arrow::Future.
+///
+/// If the Future finishes successfully, py_wrapper is called with its
+/// result value and should return a PyObject*. If py_wrapper is successful,
+/// py_cb is called with its return value.
+///
+/// If either the Future or py_wrapper fails, py_cb is called with the
+/// associated Python exception.
+///
+/// \param future The future to bind to.
+/// \param py_cb The Python callback function. Will be passed the result of
+///   py_wrapper, or a Python exception if the future failed or one was
+///   raised by py_wrapper.
+/// \param py_wrapper A function (likely defined in Cython) to convert the C++
+///   result of the future to a Python object.
+template <typename T, typename PyWrapper = PyObject* (*)(T)>
+void BindFuture(Future<T> future, PyObject* py_cb, PyWrapper py_wrapper) {
+  Py_INCREF(py_cb);
+  OwnedRefNoGIL cb_ref(py_cb);
+
+  auto future_cb = [cb_ref = std::move(cb_ref),
+                    py_wrapper = std::move(py_wrapper)](Result<T> result) {
+    SafeCallIntoPythonVoid([&]() {
+      OwnedRef py_value_or_exc{WrapResult(std::move(result), std::move(py_wrapper))};
+      Py_XDECREF(
+          PyObject_CallFunctionObjArgs(cb_ref.obj(), py_value_or_exc.obj(), NULLPTR));
+      ARROW_WARN_NOT_OK(CheckPyError(), "Internal error in async call");
+    });
+  };
+  future.AddCallback(std::move(future_cb));
+}
+
+}  // namespace arrow::py
diff --git a/pyarrow/src/arrow/python/benchmark.cc b/pyarrow/src/arrow/python/benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6dcc959ed221247eb93a80179e61a1f40a726e29
--- /dev/null
+++ b/pyarrow/src/arrow/python/benchmark.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/benchmark.h"
+#include "arrow/python/helpers.h"
+
+namespace arrow {
+namespace py {
+namespace benchmark {
+
+void Benchmark_PandasObjectIsNull(PyObject* list) {
+  if (!PyList_CheckExact(list)) {
+    PyErr_SetString(PyExc_TypeError, "expected a list");
+    return;
+  }
+  Py_ssize_t i, n = PyList_GET_SIZE(list);
+  for (i = 0; i < n; i++) {
+    internal::PandasObjectIsNull(PyList_GET_ITEM(list, i));
+  }
+}
+
+}  // namespace benchmark
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/benchmark.h b/pyarrow/src/arrow/python/benchmark.h
new file mode 100644
index 0000000000000000000000000000000000000000..8060dd33722a08eb0935687ea5cb306dbd38a9f0
--- /dev/null
+++ b/pyarrow/src/arrow/python/benchmark.h
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+namespace benchmark {
+
+// Micro-benchmark routines for use from ASV
+
+// Run PandasObjectIsNull() once over every object in *list*
+ARROW_PYTHON_EXPORT
+void Benchmark_PandasObjectIsNull(PyObject* list);
+
+}  // namespace benchmark
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/common.cc b/pyarrow/src/arrow/python/common.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a6d963b9b4e43ace79d686f9739baf002bf1a3a7
--- /dev/null
+++ b/pyarrow/src/arrow/python/common.cc
@@ -0,0 +1,246 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/common.h"
+
+#include <cstdlib>
+#include <mutex>
+#include <sstream>
+#include <string>
+
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/helpers.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace py {
+
+static std::mutex memory_pool_mutex;
+static MemoryPool* default_python_pool = nullptr;
+
+void set_default_memory_pool(MemoryPool* pool) {
+  std::lock_guard<std::mutex> guard(memory_pool_mutex);
+  default_python_pool = pool;
+}
+
+MemoryPool* get_memory_pool() {
+  std::lock_guard<std::mutex> guard(memory_pool_mutex);
+  if (default_python_pool) {
+    return default_python_pool;
+  } else {
+    return default_memory_pool();
+  }
+}
+
+// ----------------------------------------------------------------------
+// PythonErrorDetail
+
+namespace {
+
+const char kErrorDetailTypeId[] = "arrow::py::PythonErrorDetail";
+
+// Try to match the Python exception type with an appropriate Status code
+StatusCode MapPyError(PyObject* exc_type) {
+  StatusCode code;
+
+  if (PyErr_GivenExceptionMatches(exc_type, PyExc_MemoryError)) {
+    code = StatusCode::OutOfMemory;
+  } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_IndexError)) {
+    code = StatusCode::IndexError;
+  } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_KeyError)) {
+    code = StatusCode::KeyError;
+  } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_TypeError)) {
+    code = StatusCode::TypeError;
+  } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_ValueError) ||
+             PyErr_GivenExceptionMatches(exc_type, PyExc_OverflowError)) {
+    code = StatusCode::Invalid;
+  } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_EnvironmentError)) {
+    code = StatusCode::IOError;
+  } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_NotImplementedError)) {
+    code = StatusCode::NotImplemented;
+  } else {
+    code = StatusCode::UnknownError;
+  }
+  return code;
+}
+
+// PythonErrorDetail indicates a Python exception was raised.
+class PythonErrorDetail : public StatusDetail {
+ public:
+  const char* type_id() const override { return kErrorDetailTypeId; }
+
+  std::string ToString() const override {
+    // This is simple enough not to need the GIL
+    Result<std::string> result = FormatImpl();
+
+    if (result.ok()) {
+      return result.ValueOrDie();
+    } else {
+      // Fallback to just the exception type
+      const auto ty = reinterpret_cast<const PyTypeObject*>(exc_type_.obj());
+      return std::string("Python exception: ") + ty->tp_name;
+    }
+  }
+
+  void RestorePyError() const {
+    Py_INCREF(exc_type_.obj());
+    Py_INCREF(exc_value_.obj());
+    Py_INCREF(exc_traceback_.obj());
+    PyErr_Restore(exc_type_.obj(), exc_value_.obj(), exc_traceback_.obj());
+  }
+
+  PyObject* exc_type() const { return exc_type_.obj(); }
+
+  PyObject* exc_value() const { return exc_value_.obj(); }
+
+  static std::shared_ptr<PythonErrorDetail> FromPyError() {
+    PyObject* exc_type = nullptr;
+    PyObject* exc_value = nullptr;
+    PyObject* exc_traceback = nullptr;
+
+    PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+    PyErr_NormalizeException(&exc_type, &exc_value, &exc_traceback);
+    ARROW_CHECK(exc_type)
+        << "PythonErrorDetail::FromPyError called without a Python error set";
+    ARROW_DCHECK(PyType_Check(exc_type));
+    ARROW_DCHECK(exc_value);  // Ensured by PyErr_NormalizeException, double-check
+    if (exc_traceback == nullptr) {
+      // Needed by PyErr_Restore()
+      Py_INCREF(Py_None);
+      exc_traceback = Py_None;
+    }
+
+    std::shared_ptr<PythonErrorDetail> detail(new PythonErrorDetail);
+    detail->exc_type_.reset(exc_type);
+    detail->exc_value_.reset(exc_value);
+    detail->exc_traceback_.reset(exc_traceback);
+    return detail;
+  }
+
+ protected:
+  Result<std::string> FormatImpl() const {
+    PyAcquireGIL lock;
+
+    // Use traceback.format_exception()
+    OwnedRef traceback_module;
+    RETURN_NOT_OK(internal::ImportModule("traceback", &traceback_module));
+
+    OwnedRef fmt_exception;
+    RETURN_NOT_OK(internal::ImportFromModule(traceback_module.obj(), "format_exception",
+                                             &fmt_exception));
+
+    OwnedRef formatted;
+    formatted.reset(PyObject_CallFunctionObjArgs(fmt_exception.obj(), exc_type_.obj(),
+                                                 exc_value_.obj(), exc_traceback_.obj(),
+                                                 NULL));
+    RETURN_IF_PYERROR();
+
+    std::stringstream ss;
+    ss << "Python exception: ";
+    Py_ssize_t num_lines = PySequence_Length(formatted.obj());
+    RETURN_IF_PYERROR();
+
+    for (Py_ssize_t i = 0; i < num_lines; ++i) {
+      Py_ssize_t line_size;
+
+      PyObject* line = PySequence_GetItem(formatted.obj(), i);
+      RETURN_IF_PYERROR();
+
+      const char* data = PyUnicode_AsUTF8AndSize(line, &line_size);
+      RETURN_IF_PYERROR();
+
+      ss << std::string_view(data, line_size);
+    }
+    return ss.str();
+  }
+
+  PythonErrorDetail() = default;
+
+  OwnedRefNoGIL exc_type_, exc_value_, exc_traceback_;
+};
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// Python exception <-> Status
+
+Status ConvertPyError(StatusCode code) {
+  auto detail = PythonErrorDetail::FromPyError();
+  if (code == StatusCode::UnknownError) {
+    code = MapPyError(detail->exc_type());
+  }
+
+  std::string message;
+  RETURN_NOT_OK(internal::PyObject_StdStringStr(detail->exc_value(), &message));
+  return Status(code, message, detail);
+}
+
+bool IsPyError(const Status& status) {
+  if (status.ok()) {
+    return false;
+  }
+  auto detail = status.detail();
+  bool result = detail != nullptr && detail->type_id() == kErrorDetailTypeId;
+  return result;
+}
+
+void RestorePyError(const Status& status) {
+  ARROW_CHECK(IsPyError(status));
+  const auto& detail = checked_cast<const PythonErrorDetail&>(*status.detail());
+  detail.RestorePyError();
+}
+
+// ----------------------------------------------------------------------
+// PyBuffer
+
+PyBuffer::PyBuffer() : Buffer(nullptr, 0) {}
+
+Status PyBuffer::Init(PyObject* obj) {
+  if (!PyObject_GetBuffer(obj, &py_buf_, PyBUF_ANY_CONTIGUOUS)) {
+    data_ = reinterpret_cast<const uint8_t*>(py_buf_.buf);
+    ARROW_CHECK_NE(data_, nullptr) << "Null pointer in Py_buffer";
+    size_ = py_buf_.len;
+    capacity_ = py_buf_.len;
+    is_mutable_ = !py_buf_.readonly;
+    return Status::OK();
+  } else {
+    return ConvertPyError(StatusCode::Invalid);
+  }
+}
+
+Result<std::shared_ptr<Buffer>> PyBuffer::FromPyObject(PyObject* obj) {
+  PyBuffer* buf = new PyBuffer();
+  std::shared_ptr<Buffer> res(buf);
+  RETURN_NOT_OK(buf->Init(obj));
+  return res;
+}
+
+PyBuffer::~PyBuffer() {
+  if (data_ != nullptr) {
+    PyAcquireGIL lock;
+    PyBuffer_Release(&py_buf_);
+  }
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/common.h b/pyarrow/src/arrow/python/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..affefe2859b65f04f711de1f90d14f642641c5fb
--- /dev/null
+++ b/pyarrow/src/arrow/python/common.h
@@ -0,0 +1,457 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <optional>
+#include <utility>
+
+#include "arrow/buffer.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+class MemoryPool;
+template <class T>
+class Result;
+
+namespace py {
+
+// Convert current Python error to a Status.  The Python error state is cleared
+// and can be restored with RestorePyError().
+ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
+// Query whether the given Status is a Python error (as wrapped by ConvertPyError()).
+ARROW_PYTHON_EXPORT bool IsPyError(const Status& status);
+// Restore a Python error wrapped in a Status.
+ARROW_PYTHON_EXPORT void RestorePyError(const Status& status);
+
+// Catch a pending Python exception and return the corresponding Status.
+// If no exception is pending, Status::OK() is returned.
+inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
+  if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+    return Status::OK();
+  } else {
+    return ConvertPyError(code);
+  }
+}
+
+#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError())
+
+#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE))
+
+// For Cython, as you can't define template C++ functions in Cython, only use them.
+// This function can set a Python exception.  It assumes that T has a (cheap)
+// default constructor.
+template <class T>
+T GetResultValue(Result<T> result) {
+  if (ARROW_PREDICT_TRUE(result.ok())) {
+    return *std::move(result);
+  } else {
+    int r = internal::check_status(result.status());  // takes the GIL
+    assert(r == -1);                                  // should have errored out
+    ARROW_UNUSED(r);
+    return {};
+  }
+}
+
+/// \brief Wrap a Result and return the corresponding Python object.
+///
+/// If the Result is successful, py_wrapper is called with its result value
+/// and should return a PyObject*. If py_wrapper is successful (returns
+/// a non-NULL value), its return value is returned.
+///
+/// If either the Result or py_wrapper fails, the associated Python exception
+/// is raised and NULL is returned.
+//
+/// \param result The Result whose value to wrap in a Python object.
+/// \param py_wrapper A function (likely defined in Cython) to convert the C++
+///   value of the Result to a Python object.
+/// \return A new Python reference, or NULL if an exception occurred
+template <typename T, typename PyWrapper = PyObject* (*)(T)>
+PyObject* WrapResult(Result<T> result, PyWrapper&& py_wrapper) {
+  static_assert(std::is_same_v<PyObject*, decltype(py_wrapper(std::declval<T>()))>,
+                "PyWrapper argument to WrapResult should return a PyObject* "
+                "when called with a T*");
+  Status st = result.status();
+  if (st.ok()) {
+    PyObject* py_value = py_wrapper(result.MoveValueUnsafe());
+    st = CheckPyError();
+    if (st.ok()) {
+      return py_value;
+    }
+    Py_XDECREF(py_value);  // should be null, but who knows
+  }
+  // Status is an error, convert it to an exception.
+  return internal::convert_status(st);
+}
+
+// A RAII-style helper that ensures the GIL is acquired inside a lexical block.
+class ARROW_PYTHON_EXPORT PyAcquireGIL {
+ public:
+  PyAcquireGIL() : acquired_gil_(false) { acquire(); }
+
+  ~PyAcquireGIL() { release(); }
+
+  void acquire() {
+    if (!acquired_gil_) {
+      state_ = PyGILState_Ensure();
+      acquired_gil_ = true;
+    }
+  }
+
+  // idempotent
+  void release() {
+    if (acquired_gil_) {
+      PyGILState_Release(state_);
+      acquired_gil_ = false;
+    }
+  }
+
+ private:
+  bool acquired_gil_;
+  PyGILState_STATE state_;
+  ARROW_DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL);
+};
+
+// A RAII-style helper that releases the GIL until the end of a lexical block
+class ARROW_PYTHON_EXPORT PyReleaseGIL {
+ public:
+  PyReleaseGIL() : ptr_(PyEval_SaveThread(), &unique_ptr_deleter) {}
+
+ private:
+  static void unique_ptr_deleter(PyThreadState* state) {
+    if (state) {
+      PyEval_RestoreThread(state);
+    }
+  }
+  std::unique_ptr<PyThreadState, decltype(&unique_ptr_deleter)> ptr_;
+};
+
+// A helper to call safely into the Python interpreter from arbitrary C++ code.
+// The GIL is acquired, and the current thread's error status is preserved.
+template <typename Function>
+auto SafeCallIntoPython(Function&& func) -> decltype(func()) {
+  PyAcquireGIL lock;
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  auto maybe_status = std::forward<Function>(func)();
+  // If the return Status is a "Python error", the current Python error status
+  // describes the error and shouldn't be clobbered.
+  if (!IsPyError(::arrow::ToStatus(maybe_status)) && exc_type != NULLPTR) {
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+  }
+  return maybe_status;
+}
+
+template <typename Function>
+auto SafeCallIntoPythonVoid(Function&& func) -> decltype(func()) {
+  PyAcquireGIL lock;
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  func();
+  if (exc_type != NULLPTR) {
+    PyErr_Restore(exc_type, exc_value, exc_traceback);
+  }
+}
+
+// A RAII primitive that DECREFs the underlying PyObject* when it
+// goes out of scope.
+class ARROW_PYTHON_EXPORT OwnedRef {
+ public:
+  OwnedRef() : obj_(NULLPTR) {}
+  OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
+  explicit OwnedRef(PyObject* obj) : obj_(obj) {}
+
+  OwnedRef& operator=(OwnedRef&& other) {
+    obj_ = other.detach();
+    return *this;
+  }
+
+  ~OwnedRef() {
+    // GH-38626: destructor may be called after the Python interpreter is finalized.
+    if (Py_IsInitialized()) {
+      reset();
+    }
+  }
+
+  void reset(PyObject* obj) {
+    Py_XDECREF(obj_);
+    obj_ = obj;
+  }
+
+  void reset() { reset(NULLPTR); }
+
+  PyObject* detach() {
+    PyObject* result = obj_;
+    obj_ = NULLPTR;
+    return result;
+  }
+
+  PyObject* obj() const { return obj_; }
+
+  PyObject** ref() { return &obj_; }
+
+  operator bool() const { return obj_ != NULLPTR; }
+
+ private:
+  ARROW_DISALLOW_COPY_AND_ASSIGN(OwnedRef);
+
+  PyObject* obj_;
+};
+
+// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
+// This is for situations where the GIL is not always known to be held
+// (e.g. if it is released in the middle of a function for performance reasons)
+class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
+ public:
+  OwnedRefNoGIL() : OwnedRef() {}
+  OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
+  explicit OwnedRefNoGIL(PyObject* obj) : OwnedRef(obj) {}
+
+  ~OwnedRefNoGIL() {
+    // GH-38626: destructor may be called after the Python interpreter is finalized.
+    if (Py_IsInitialized() && obj() != NULLPTR) {
+      PyAcquireGIL lock;
+      reset();
+    }
+  }
+};
+
+template <template <typename...> typename SmartPtr, typename... Ts>
+class SmartPtrNoGIL : public SmartPtr<Ts...> {
+  using Base = SmartPtr<Ts...>;
+
+ public:
+  template <typename... Args>
+  SmartPtrNoGIL(Args&&... args) : Base(std::forward<Args>(args)...) {}
+
+  ~SmartPtrNoGIL() { reset(); }
+
+  template <typename... Args>
+  void reset(Args&&... args) {
+    auto release_guard = optional_gil_release();
+    Base::reset(std::forward<Args>(args)...);
+  }
+
+  template <typename V>
+  SmartPtrNoGIL& operator=(V&& v) {
+    auto release_guard = optional_gil_release();
+    Base::operator=(std::forward<V>(v));
+    return *this;
+  }
+
+ private:
+  // Only release the GIL if we own an object *and* the Python runtime is
+  // valid *and* the GIL is held.
+  std::optional<PyReleaseGIL> optional_gil_release() const {
+    if (this->get() != nullptr && Py_IsInitialized() && PyGILState_Check()) {
+      return PyReleaseGIL();
+    }
+    return {};
+  }
+};
+
+/// \brief A std::shared_ptr<T, ...> subclass that releases the GIL when destroying T
+template <typename... Ts>
+using SharedPtrNoGIL = SmartPtrNoGIL<std::shared_ptr, Ts...>;
+
+/// \brief A std::unique_ptr<T, ...> subclass that releases the GIL when destroying T
+template <typename... Ts>
+using UniquePtrNoGIL = SmartPtrNoGIL<std::unique_ptr, Ts...>;
+
+template <typename Fn>
+struct BoundFunction;
+
+template <typename... Args>
+struct BoundFunction<void(PyObject*, Args...)> {
+  // We bind `cdef void fn(object, ...)` to get a `Status(...)`
+  // where the Status contains any Python error raised by `fn`
+  using Unbound = void(PyObject*, Args...);
+  using Bound = Status(Args...);
+
+  BoundFunction(Unbound* unbound, PyObject* bound_arg)
+      : unbound_(unbound), bound_arg_(bound_arg) {}
+
+  Status Invoke(Args... args) const {
+    PyAcquireGIL lock;
+    unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
+    RETURN_IF_PYERROR();
+    return Status::OK();
+  }
+
+  Unbound* unbound_;
+  OwnedRefNoGIL bound_arg_;
+};
+
+template <typename Return, typename... Args>
+struct BoundFunction<Return(PyObject*, Args...)> {
+  // We bind `cdef Return fn(object, ...)` to get a `Result<Return>(...)`
+  // where the Result contains any Python error raised by `fn` or the
+  // return value from `fn`.
+  using Unbound = Return(PyObject*, Args...);
+  using Bound = Result<Return>(Args...);
+
+  BoundFunction(Unbound* unbound, PyObject* bound_arg)
+      : unbound_(unbound), bound_arg_(bound_arg) {}
+
+  Result<Return> Invoke(Args... args) const {
+    PyAcquireGIL lock;
+    Return ret = unbound_(bound_arg_.obj(), std::forward<Args>(args)...);
+    RETURN_IF_PYERROR();
+    return ret;
+  }
+
+  Unbound* unbound_;
+  OwnedRefNoGIL bound_arg_;
+};
+
+template <typename OutFn, typename Return, typename... Args>
+std::function<OutFn> BindFunction(Return (*unbound)(PyObject*, Args...),
+                                  PyObject* bound_arg) {
+  using Fn = BoundFunction<Return(PyObject*, Args...)>;
+
+  static_assert(std::is_same<typename Fn::Bound, OutFn>::value,
+                "requested bound function of unsupported type");
+
+  Py_XINCREF(bound_arg);
+  auto bound_fn = std::make_shared<Fn>(unbound, bound_arg);
+  return
+      [bound_fn](Args... args) { return bound_fn->Invoke(std::forward<Args>(args)...); };
+}
+
+// A temporary conversion of a Python object to a bytes area.
+struct PyBytesView {
+  const char* bytes;
+  Py_ssize_t size;
+  bool is_utf8;
+
+  static Result<PyBytesView> FromString(PyObject* obj, bool check_utf8 = false) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseString(obj, check_utf8));
+    return std::move(self);
+  }
+
+  static Result<PyBytesView> FromUnicode(PyObject* obj) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseUnicode(obj));
+    return std::move(self);
+  }
+
+  static Result<PyBytesView> FromBinary(PyObject* obj) {
+    PyBytesView self;
+    ARROW_RETURN_NOT_OK(self.ParseBinary(obj));
+    return std::move(self);
+  }
+
+  // View the given Python object as string-like, i.e. str or (utf8) bytes
+  Status ParseString(PyObject* obj, bool check_utf8 = false) {
+    if (PyUnicode_Check(obj)) {
+      return ParseUnicode(obj);
+    } else {
+      ARROW_RETURN_NOT_OK(ParseBinary(obj));
+      if (check_utf8) {
+        // Check the bytes are utf8 utf-8
+        OwnedRef decoded(PyUnicode_FromStringAndSize(bytes, size));
+        if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) {
+          is_utf8 = true;
+        } else {
+          PyErr_Clear();
+          is_utf8 = false;
+        }
+      }
+      return Status::OK();
+    }
+  }
+
+  // View the given Python object as unicode string
+  Status ParseUnicode(PyObject* obj) {
+    // The utf-8 representation is cached on the unicode object
+    bytes = PyUnicode_AsUTF8AndSize(obj, &size);
+    RETURN_IF_PYERROR();
+    is_utf8 = true;
+    return Status::OK();
+  }
+
+  // View the given Python object as binary-like, i.e. bytes
+  Status ParseBinary(PyObject* obj) {
+    if (PyBytes_Check(obj)) {
+      bytes = PyBytes_AS_STRING(obj);
+      size = PyBytes_GET_SIZE(obj);
+      is_utf8 = false;
+    } else if (PyByteArray_Check(obj)) {
+      bytes = PyByteArray_AS_STRING(obj);
+      size = PyByteArray_GET_SIZE(obj);
+      is_utf8 = false;
+    } else if (PyMemoryView_Check(obj)) {
+      PyObject* ref = PyMemoryView_GetContiguous(obj, PyBUF_READ, 'C');
+      RETURN_IF_PYERROR();
+      Py_buffer* buffer = PyMemoryView_GET_BUFFER(ref);
+      bytes = reinterpret_cast<const char*>(buffer->buf);
+      size = buffer->len;
+      is_utf8 = false;
+    } else {
+      return Status::TypeError("Expected bytes, got a '", Py_TYPE(obj)->tp_name,
+                               "' object");
+    }
+    return Status::OK();
+  }
+
+ protected:
+  OwnedRef ref;
+};
+
+class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
+ public:
+  /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
+  /// one-dimensional byte buffers.
+  ~PyBuffer();
+
+  static Result<std::shared_ptr<Buffer>> FromPyObject(PyObject* obj);
+
+ private:
+  PyBuffer();
+  Status Init(PyObject*);
+
+  Py_buffer py_buf_;
+};
+
+// Return the common PyArrow memory pool
+ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
+ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
+
+// This is annoying: because C++11 does not allow implicit conversion of string
+// literals to non-const char*, we need to go through some gymnastics to use
+// PyObject_CallMethod without a lot of pain (its arguments are non-const
+// char*)
+template <typename... ArgTypes>
+static inline PyObject* cpp_PyObject_CallMethod(PyObject* obj, const char* method_name,
+                                                const char* argspec, ArgTypes... args) {
+  return PyObject_CallMethod(obj, const_cast<char*>(method_name),
+                             const_cast<char*>(argspec), args...);
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/config.cc b/pyarrow/src/arrow/python/config.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d66476398724ade81d23bd45e87dce2295543640
--- /dev/null
+++ b/pyarrow/src/arrow/python/config.cc
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/config.h"
+#include "arrow/python/config_internal.h"
+
+namespace arrow {
+namespace py {
+
+namespace {
+
+const BuildInfo kBuildInfo = {
+    PYARROW_BUILD_TYPE,
+};
+
+}  // namespace
+
+const BuildInfo& GetBuildInfo() { return kBuildInfo; }
+
+}  // namespace py
+}  // namespace arrow
\ No newline at end of file
diff --git a/pyarrow/src/arrow/python/config.h b/pyarrow/src/arrow/python/config.h
new file mode 100644
index 0000000000000000000000000000000000000000..9af06f271661e53f1790b5f408bd62f3707aff78
--- /dev/null
+++ b/pyarrow/src/arrow/python/config.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+
+struct BuildInfo {
+  // The uppercase build type, e.g. "DEBUG" or "RELEASE"
+  std::string build_type;
+};
+
+/// \brief Get build info for PyArrow.
+///
+ARROW_PYTHON_EXPORT
+const BuildInfo& GetBuildInfo();
+
+}  // namespace py
+}  // namespace arrow
\ No newline at end of file
diff --git a/pyarrow/src/arrow/python/config_internal.h.cmake b/pyarrow/src/arrow/python/config_internal.h.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e8a6e78c48a0a774727e905df11ddf6009267b62
--- /dev/null
+++ b/pyarrow/src/arrow/python/config_internal.h.cmake
@@ -0,0 +1,18 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#define PYARROW_BUILD_TYPE "@UPPERCASE_PYBUILD_TYPE@"
\ No newline at end of file
diff --git a/pyarrow/src/arrow/python/csv.cc b/pyarrow/src/arrow/python/csv.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1df3a94cef225f44de87c241bddded527f66804f
--- /dev/null
+++ b/pyarrow/src/arrow/python/csv.cc
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "csv.h"
+
+#include <memory>
+
+#include "arrow/python/common.h"
+
+namespace arrow {
+
+using csv::InvalidRow;
+using csv::InvalidRowHandler;
+using csv::InvalidRowResult;
+
+namespace py {
+namespace csv {
+
+InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback cb, PyObject* py_handler) {
+  if (cb == nullptr) {
+    return InvalidRowHandler{};
+  }
+
+  struct Handler {
+    PyInvalidRowCallback cb;
+    std::shared_ptr<OwnedRefNoGIL> handler_ref;
+
+    InvalidRowResult operator()(const InvalidRow& invalid_row) {
+      InvalidRowResult result;
+      auto st = SafeCallIntoPython([&]() -> Status {
+        result = cb(handler_ref->obj(), invalid_row);
+        if (PyErr_Occurred()) {
+          PyErr_WriteUnraisable(handler_ref->obj());
+        }
+        return Status::OK();
+      });
+      ARROW_UNUSED(st);
+      return result;
+    }
+  };
+
+  Py_INCREF(py_handler);
+  return Handler{cb, std::make_shared<OwnedRefNoGIL>(py_handler)};
+}
+
+}  // namespace csv
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/csv.h b/pyarrow/src/arrow/python/csv.h
new file mode 100644
index 0000000000000000000000000000000000000000..34302e93667394d616692a6a4603e6d0be67d211
--- /dev/null
+++ b/pyarrow/src/arrow/python/csv.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/csv/options.h"
+#include "arrow/python/common.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace py {
+namespace csv {
+
+using PyInvalidRowCallback = std::function<::arrow::csv::InvalidRowResult(
+    PyObject*, const ::arrow::csv::InvalidRow&)>;
+
+ARROW_PYTHON_EXPORT
+::arrow::csv::InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback,
+                                                      PyObject* handler);
+
+}  // namespace csv
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/datetime.cc b/pyarrow/src/arrow/python/datetime.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c4e66064d1d16926e8d6765e9d631798b73add5
--- /dev/null
+++ b/pyarrow/src/arrow/python/datetime.cc
@@ -0,0 +1,665 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "datetime.h"
+
+#include <algorithm>
+#include <chrono>
+#include <iomanip>
+#include <regex>
+#include <string_view>
+
+#include "arrow/array.h"
+#include "arrow/python/arrow_to_python_internal.h"
+#include "arrow/python/common.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/platform.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/regex.h"
+#include "arrow/util/value_parsing.h"
+
+namespace arrow {
+
+using internal::RegexMatch;
+
+namespace py {
+namespace internal {
+
+namespace {
+
+bool MatchFixedOffset(const std::string& tz, std::string_view* sign,
+                      std::string_view* hour, std::string_view* minute) {
+  static const std::regex regex("^([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$");
+  if (tz.size() < 5) {
+    return false;
+  }
+  return RegexMatch(regex, tz, {sign, hour, minute});
+}
+
+constexpr char* NonConst(const char* st) {
+  // Hack for python versions < 3.7 where members of PyStruct members
+  // where non-const (C++ doesn't like assigning string literals to these types)
+  return const_cast<char*>(st);
+}
+
+static PyTypeObject MonthDayNanoTupleType = {};
+
+static PyStructSequence_Field MonthDayNanoField[] = {
+    {NonConst("months"), NonConst("The number of months in the interval")},
+    {NonConst("days"), NonConst("The number days in the interval")},
+    {NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")},
+    {nullptr, nullptr}};
+
+static PyStructSequence_Desc MonthDayNanoTupleDesc = {
+    NonConst("MonthDayNano"),
+    NonConst("A calendar interval consisting of months, days and nanoseconds."),
+    MonthDayNanoField,
+    /*n_in_sequence=*/3};
+
+}  // namespace
+
+#ifndef PYPY_VERSION
+PyDateTime_CAPI* datetime_api = nullptr;
+
+void InitDatetime() {
+  PyAcquireGIL lock;
+  datetime_api =
+      reinterpret_cast<PyDateTime_CAPI*>(PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0));
+  if (datetime_api == nullptr) {
+    Py_FatalError("Could not import datetime C API");
+  }
+}
+#endif
+
+// The following code is adapted from
+// https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/datetime.c
+
+// Days per month, regular year and leap year
+static int64_t _days_per_month_table[2][12] = {
+    {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+    {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+
+static bool is_leapyear(int64_t year) {
+  return (year & 0x3) == 0 &&  // year % 4 == 0
+         ((year % 100) != 0 || (year % 400) == 0);
+}
+
+// Calculates the days offset from the 1970 epoch.
+static int64_t get_days_from_date(int64_t date_year, int64_t date_month,
+                                  int64_t date_day) {
+  int64_t i, month;
+  int64_t year, days = 0;
+  int64_t* month_lengths;
+
+  year = date_year - 1970;
+  days = year * 365;
+
+  // Adjust for leap years
+  if (days >= 0) {
+    // 1968 is the closest leap year before 1970.
+    // Exclude the current year, so add 1.
+    year += 1;
+    // Add one day for each 4 years
+    days += year / 4;
+    // 1900 is the closest previous year divisible by 100
+    year += 68;
+    // Subtract one day for each 100 years
+    days -= year / 100;
+    // 1600 is the closest previous year divisible by 400
+    year += 300;
+    // Add one day for each 400 years
+    days += year / 400;
+  } else {
+    // 1972 is the closest later year after 1970.
+    // Include the current year, so subtract 2.
+    year -= 2;
+    // Subtract one day for each 4 years
+    days += year / 4;
+    // 2000 is the closest later year divisible by 100
+    year -= 28;
+    // Add one day for each 100 years
+    days -= year / 100;
+    // 2000 is also the closest later year divisible by 400
+    // Subtract one day for each 400 years
+    days += year / 400;
+  }
+
+  month_lengths = _days_per_month_table[is_leapyear(date_year)];
+  month = date_month - 1;
+
+  // Add the months
+  for (i = 0; i < month; ++i) {
+    days += month_lengths[i];
+  }
+
+  // Add the days
+  days += date_day - 1;
+
+  return days;
+}
+
+// Modifies '*days_' to be the day offset within the year,
+// and returns the year.
+static int64_t days_to_yearsdays(int64_t* days_) {
+  const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1);
+  // Adjust so it's relative to the year 2000 (divisible by 400)
+  int64_t days = (*days_) - (365 * 30 + 7);
+  int64_t year;
+
+  // Break down the 400 year cycle to get the year and day within the year
+  if (days >= 0) {
+    year = 400 * (days / days_per_400years);
+    days = days % days_per_400years;
+  } else {
+    year = 400 * ((days - (days_per_400years - 1)) / days_per_400years);
+    days = days % days_per_400years;
+    if (days < 0) {
+      days += days_per_400years;
+    }
+  }
+
+  // Work out the year/day within the 400 year cycle
+  if (days >= 366) {
+    year += 100 * ((days - 1) / (100 * 365 + 25 - 1));
+    days = (days - 1) % (100 * 365 + 25 - 1);
+    if (days >= 365) {
+      year += 4 * ((days + 1) / (4 * 365 + 1));
+      days = (days + 1) % (4 * 365 + 1);
+      if (days >= 366) {
+        year += (days - 1) / 365;
+        days = (days - 1) % 365;
+      }
+    }
+  }
+
+  *days_ = days;
+  return year + 2000;
+}
+
+// Extracts the month and year and day number from a number of days
+static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month,
+                               int64_t* date_day) {
+  int64_t *month_lengths, i;
+
+  *date_year = days_to_yearsdays(&days);
+  month_lengths = _days_per_month_table[is_leapyear(*date_year)];
+
+  for (i = 0; i < 12; ++i) {
+    if (days < month_lengths[i]) {
+      *date_month = i + 1;
+      *date_day = days + 1;
+      return;
+    } else {
+      days -= month_lengths[i];
+    }
+  }
+
+  // Should never get here
+  return;
+}
+
+// Splitting time quantities, for example splitting total seconds into
+// minutes and remaining seconds. After we run
+// int64_t remaining = split_time(total, quotient, &next)
+// we have
+// total = next * quotient + remaining. Handles negative values by propagating
+// them: If total is negative, next will be negative and remaining will
+// always be non-negative.
+static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) {
+  int64_t r = total % quotient;
+  if (r < 0) {
+    *next = total / quotient - 1;
+    return r + quotient;
+  } else {
+    *next = total / quotient;
+    return r;
+  }
+}
+
+static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,
+                                        int64_t* hour, int64_t* minute, int64_t* second,
+                                        int64_t* microsecond) {
+  switch (unit) {
+    case TimeUnit::NANO:
+      if (val % 1000 != 0) {
+        return Status::Invalid("Value ", val, " has non-zero nanoseconds");
+      }
+      val /= 1000;
+    // fall through
+    case TimeUnit::MICRO:
+      *microsecond = split_time(val, 1000000LL, &val);
+      *second = split_time(val, 60, &val);
+      *minute = split_time(val, 60, hour);
+      break;
+    case TimeUnit::MILLI:
+      *microsecond = split_time(val, 1000, &val) * 1000;
+    // fall through
+    case TimeUnit::SECOND:
+      *second = split_time(val, 60, &val);
+      *minute = split_time(val, 60, hour);
+      break;
+    default:
+      break;
+  }
+  return Status::OK();
+}
+
+static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year,
+                                        int64_t* month, int64_t* day) {
+  switch (unit) {
+    case DateUnit::MILLI:
+      val /= 86400000LL;  // fall through
+    case DateUnit::DAY:
+      get_date_from_days(val, year, month, day);
+    default:
+      break;
+  }
+  return Status::OK();
+}
+
+PyObject* NewMonthDayNanoTupleType() {
+  if (MonthDayNanoTupleType.tp_name == nullptr) {
+    if (PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) {
+      Py_FatalError("Could not initialize MonthDayNanoTuple");
+    }
+  }
+  Py_INCREF(&MonthDayNanoTupleType);
+  return (PyObject*)&MonthDayNanoTupleType;
+}
+
+Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
+  int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
+  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
+  *out = PyTime_FromTime(static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+                         static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
+  return Status::OK();
+}
+
+Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) {
+  int64_t year = 0, month = 0, day = 0;
+  RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day));
+  *out = PyDate_FromDate(static_cast<int32_t>(year), static_cast<int32_t>(month),
+                         static_cast<int32_t>(day));
+  return Status::OK();
+}
+
+Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
+  int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
+  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
+  int64_t total_days = 0;
+  hour = split_time(hour, 24, &total_days);
+  int64_t year = 0, month = 0, day = 0;
+  get_date_from_days(total_days, &year, &month, &day);
+  *out = PyDateTime_FromDateAndTime(
+      static_cast<int32_t>(year), static_cast<int32_t>(month), static_cast<int32_t>(day),
+      static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+      static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
+  return Status::OK();
+}
+
+int64_t PyDate_to_days(PyDateTime_Date* pydate) {
+  return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate),
+                            PyDateTime_GET_DAY(pydate));
+}
+
+Result<int64_t> PyDateTime_utcoffset_s(PyObject* obj) {
+  // calculate offset from UTC timezone in seconds
+  // supports only PyDateTime_DateTime and PyDateTime_Time objects
+  OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL));
+  RETURN_IF_PYERROR();
+  if (pyoffset.obj() != nullptr && pyoffset.obj() != Py_None) {
+    auto delta = reinterpret_cast<PyDateTime_Delta*>(pyoffset.obj());
+    return internal::PyDelta_to_s(delta);
+  } else {
+    return 0;
+  }
+}
+
+Result<std::string> PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) {
+  // attempt to convert timezone offset objects to "+/-{hh}:{mm}" format
+  OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None));
+  RETURN_IF_PYERROR();
+
+  if (!PyDelta_Check(pydelta_object.obj())) {
+    return Status::Invalid(
+        "Object returned by tzinfo.utcoffset(None) is not an instance of "
+        "datetime.timedelta");
+  }
+  auto pydelta = reinterpret_cast<PyDateTime_Delta*>(pydelta_object.obj());
+
+  // retrieve the offset as seconds
+  auto total_seconds = internal::PyDelta_to_s(pydelta);
+
+  // determine whether the offset is positive or negative
+  auto sign = (total_seconds < 0) ? "-" : "+";
+  total_seconds = abs(total_seconds);
+
+  // calculate offset components
+  int64_t hours, minutes, seconds;
+  seconds = split_time(total_seconds, 60, &minutes);
+  minutes = split_time(minutes, 60, &hours);
+  if (seconds > 0) {
+    // check there are no remaining seconds
+    return Status::Invalid("Offset must represent whole number of minutes");
+  }
+
+  // construct the timezone string
+  std::stringstream stream;
+  stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0')
+         << std::setw(2) << minutes;
+  return stream.str();
+}
+
+// Converted from python.  See https://github.com/apache/arrow/pull/7604
+// for details.
+Result<PyObject*> StringToTzinfo(const std::string& tz) {
+  std::string_view sign_str, hour_str, minute_str;
+  OwnedRef pytz;
+  OwnedRef zoneinfo;
+  OwnedRef datetime;
+
+  if (internal::ImportModule("pytz", &pytz).ok()) {
+    if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
+      int sign = -1;
+      if (sign_str == "+") {
+        sign = 1;
+      }
+      OwnedRef fixed_offset;
+      RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset));
+      uint32_t minutes, hours;
+      if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) ||
+          !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(),
+                                            &minutes)) {
+        return Status::Invalid("Invalid timezone: ", tz);
+      }
+      OwnedRef total_minutes(PyLong_FromLong(
+          sign * ((static_cast<int>(hours) * 60) + static_cast<int>(minutes))));
+      RETURN_IF_PYERROR();
+      auto tzinfo =
+          PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL);
+      RETURN_IF_PYERROR();
+      return tzinfo;
+    }
+
+    OwnedRef timezone;
+    RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone));
+    OwnedRef py_tz_string(
+        PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
+    auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL);
+    RETURN_IF_PYERROR();
+    return tzinfo;
+  }
+
+  // catch fixed offset if pytz is not present
+  if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) {
+    RETURN_NOT_OK(internal::ImportModule("datetime", &datetime));
+    int sign = -1;
+    if (sign_str == "+") {
+      sign = 1;
+    }
+
+    // import timezone and timedelta module to create a tzinfo object
+    OwnedRef class_timezone;
+    OwnedRef class_timedelta;
+    RETURN_NOT_OK(
+        internal::ImportFromModule(datetime.obj(), "timezone", &class_timezone));
+    RETURN_NOT_OK(
+        internal::ImportFromModule(datetime.obj(), "timedelta", &class_timedelta));
+
+    // check input
+    uint32_t minutes, hours;
+    if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) ||
+        !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(),
+                                          &minutes)) {
+      return Status::Invalid("Invalid timezone: ", tz);
+    }
+
+    // save offset as a signed integer
+    OwnedRef total_minutes(PyLong_FromLong(
+        sign * ((static_cast<int>(hours) * 60) + static_cast<int>(minutes))));
+    // create zero integers for empty arguments in datetime.timedelta
+    OwnedRef zero(PyLong_FromLong(static_cast<int>(0)));
+
+    // call datetime.timedelta to get correct offset object for datetime.timezone
+    auto offset =
+        PyObject_CallFunctionObjArgs(class_timedelta.obj(), zero.obj(), zero.obj(),
+                                     zero.obj(), zero.obj(), total_minutes.obj(), NULL);
+    RETURN_IF_PYERROR();
+    // call datetime.timezone
+    auto tzinfo = PyObject_CallFunctionObjArgs(class_timezone.obj(), offset, NULL);
+    RETURN_IF_PYERROR();
+    return tzinfo;
+  }
+
+  // fallback on zoneinfo if tz is string and pytz is not present
+  if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) {
+    OwnedRef class_zoneinfo;
+    RETURN_NOT_OK(
+        internal::ImportFromModule(zoneinfo.obj(), "ZoneInfo", &class_zoneinfo));
+    OwnedRef py_tz_string(
+        PyUnicode_FromStringAndSize(tz.c_str(), static_cast<Py_ssize_t>(tz.size())));
+    auto tzinfo =
+        PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(), NULL);
+    RETURN_IF_PYERROR();
+    return tzinfo;
+  }
+
+  return Status::Invalid(
+      "Pytz package or Python>=3.8 for zoneinfo module must be installed.");
+}
+
+Result<std::string> TzinfoToString(PyObject* tzinfo) {
+  OwnedRef module_pytz;        // import pytz
+  OwnedRef module_datetime;    // import datetime
+  OwnedRef module_zoneinfo;    // import zoneinfo
+  OwnedRef module_dateutil;    // import dateutil
+  OwnedRef class_timezone;     // from datetime import timezone
+  OwnedRef class_fixedoffset;  // from pytz import _FixedOffset
+  OwnedRef class_basetzinfo;   // from pytz import BaseTzInfo
+  OwnedRef class_zoneinfo;     // from zoneinfo import ZoneInfo
+  OwnedRef class_tzfile;       // from zoneinfo import tzfile
+
+  // import necessary modules
+  RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime));
+  // import necessary classes
+  RETURN_NOT_OK(
+      internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone));
+
+  // check that it's a valid tzinfo object
+  if (!PyTZInfo_Check(tzinfo)) {
+    return Status::TypeError("Not an instance of datetime.tzinfo");
+  }
+
+  // if tzinfo is an instance of datetime.timezone return the
+  // HH:MM offset string representation
+  if (PyObject_IsInstance(tzinfo, class_timezone.obj())) {
+    // still recognize datetime.timezone.utc as UTC (instead of +00:00)
+    OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
+    RETURN_IF_PYERROR();
+    if (PyUnicode_Check(tzname_object.obj())) {
+      std::string result;
+      RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
+      if (result == "UTC") {
+        return result;
+      }
+    }
+    return PyTZInfo_utcoffset_hhmm(tzinfo);
+  }
+
+  // Try to import pytz if it is available
+  if (internal::ImportModule("pytz", &module_pytz).ok()) {
+    RETURN_NOT_OK(internal::ImportFromModule(module_pytz.obj(), "_FixedOffset",
+                                             &class_fixedoffset));
+    RETURN_NOT_OK(
+        internal::ImportFromModule(module_pytz.obj(), "BaseTzInfo", &class_basetzinfo));
+  }
+
+  // if tzinfo is an instance of pytz._FixedOffset return the
+  // HH:MM offset string representation
+  if (module_pytz.obj() != nullptr &&
+      PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) {
+    OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
+    RETURN_IF_PYERROR();
+    return PyTZInfo_utcoffset_hhmm(tzinfo);
+  }
+
+  // if pytz is installed and tzinfo is and instance of pytz.BaseTzInfo
+  if (module_pytz.obj() != nullptr &&
+      PyObject_IsInstance(tzinfo, class_basetzinfo.obj())) {
+    OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone"));
+    RETURN_IF_PYERROR();
+    std::string result;
+    RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result));
+    return result;
+  }
+
+  // Try to import zoneinfo if it is available
+  if (internal::ImportModule("zoneinfo", &module_zoneinfo).ok()) {
+    RETURN_NOT_OK(
+        internal::ImportFromModule(module_zoneinfo.obj(), "ZoneInfo", &class_zoneinfo));
+  }
+
+  // if zoneinfo is installed and tzinfo is an instance of zoneinfo.ZoneInfo
+  if (module_zoneinfo.obj() != nullptr &&
+      PyObject_IsInstance(tzinfo, class_zoneinfo.obj())) {
+    OwnedRef key(PyObject_GetAttrString(tzinfo, "key"));
+    RETURN_IF_PYERROR();
+    std::string result;
+    RETURN_NOT_OK(internal::PyUnicode_AsStdString(key.obj(), &result));
+    return result;
+  }
+
+  // Try to import dateutil if it is available
+  if (internal::ImportModule("dateutil.tz", &module_dateutil).ok()) {
+    RETURN_NOT_OK(
+        internal::ImportFromModule(module_dateutil.obj(), "tzfile", &class_tzfile));
+  }
+
+  // if dateutil is installed and tzinfo is an instance of dateutil.tz.tzfile
+  if (module_dateutil.obj() != nullptr &&
+      PyObject_IsInstance(tzinfo, class_tzfile.obj())) {
+    OwnedRef _filename(PyObject_GetAttrString(tzinfo, "_filename"));
+    RETURN_IF_PYERROR();
+    std::string result;
+    RETURN_NOT_OK(internal::PyUnicode_AsStdString(_filename.obj(), &result));
+    // _filename returns a full path in general ('/usr/share/zoneinfo/Europe/Paris')
+    // or POSIX name on Windows ('Europe/Paris') - we need a substring in first case
+    std::size_t pos = result.find("zoneinfo/");
+    if (pos != std::string::npos) {
+      return result.substr(pos + 9);
+    }
+    return result;
+  }
+
+  // attempt to call tzinfo.tzname(None)
+  OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None));
+  RETURN_IF_PYERROR();
+  if (PyUnicode_Check(tzname_object.obj())) {
+    std::string result;
+    RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result));
+    return result;
+  }
+
+  // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None)
+  return PyTZInfo_utcoffset_hhmm(tzinfo);
+}
+
+PyObject* MonthDayNanoIntervalToNamedTuple(
+    const MonthDayNanoIntervalType::MonthDayNanos& interval) {
+  OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType));
+  if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) {
+    return nullptr;
+  }
+  PyStructSequence_SetItem(tuple.obj(), /*pos=*/0, PyLong_FromLong(interval.months));
+  PyStructSequence_SetItem(tuple.obj(), /*pos=*/1, PyLong_FromLong(interval.days));
+  PyStructSequence_SetItem(tuple.obj(), /*pos=*/2,
+                           PyLong_FromLongLong(interval.nanoseconds));
+  return tuple.detach();
+}
+
+namespace {
+
+// Wrapper around a Python list object that mimics dereference and assignment
+// operations.
+struct PyListAssigner {
+ public:
+  explicit PyListAssigner(PyObject* list) : list_(list) {
+    ARROW_DCHECK(PyList_Check(list_));
+  }
+
+  PyListAssigner& operator*() { return *this; }
+
+  void operator=(PyObject* obj) {
+    if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) {
+      Py_FatalError("list did not have the correct preallocated size.");
+    }
+  }
+
+  PyListAssigner& operator++() {
+    current_index_++;
+    return *this;
+  }
+
+  PyListAssigner& operator+=(int64_t offset) {
+    current_index_ += offset;
+    return *this;
+  }
+
+ private:
+  PyObject* list_;
+  int64_t current_index_ = 0;
+};
+
+}  // namespace
+
+Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
+    const MonthDayNanoIntervalArray& array) {
+  OwnedRef out_list(PyList_New(array.length()));
+  RETURN_IF_PYERROR();
+  PyListAssigner out_objects(out_list.obj());
+  auto& interval_array =
+      arrow::internal::checked_cast<const MonthDayNanoIntervalArray&>(array);
+  RETURN_NOT_OK(internal::WriteArrayObjects(
+      interval_array,
+      [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) {
+        PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval);
+        if (ARROW_PREDICT_FALSE(tuple == nullptr)) {
+          RETURN_IF_PYERROR();
+        }
+
+        *out = tuple;
+        return Status::OK();
+      },
+      out_objects));
+  return out_list.detach();
+}
+
+Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
+    const MonthDayNanoIntervalScalar& scalar) {
+  if (scalar.is_valid) {
+    return internal::MonthDayNanoIntervalToNamedTuple(scalar.value);
+  } else {
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/datetime.h b/pyarrow/src/arrow/python/datetime.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b21eeb4342170f5462ee6a532003985fe5882f6
--- /dev/null
+++ b/pyarrow/src/arrow/python/datetime.h
@@ -0,0 +1,231 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/logging.h"
+
+// By default, PyDateTimeAPI is a *static* variable.  This forces
+// PyDateTime_IMPORT to be called in every C/C++ module using the
+// C datetime API.  This is error-prone and potentially costly.
+// Instead, we redefine PyDateTimeAPI to point to a global variable,
+// which is initialized once by calling InitDatetime().
+#ifdef PYPY_VERSION
+#  include "datetime.h"
+#else
+#  define PyDateTimeAPI ::arrow::py::internal::datetime_api
+#endif
+
+namespace arrow {
+using internal::AddWithOverflow;
+using internal::MultiplyWithOverflow;
+namespace py {
+namespace internal {
+
+#ifndef PYPY_VERSION
+extern PyDateTime_CAPI* datetime_api;
+
+ARROW_PYTHON_EXPORT
+void InitDatetime();
+#endif
+
+// Returns the MonthDayNano namedtuple type (increments the reference count).
+ARROW_PYTHON_EXPORT
+PyObject* NewMonthDayNanoTupleType();
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_us(PyObject* pytime) {
+  return (PyDateTime_TIME_GET_HOUR(pytime) * 3600000000LL +
+          PyDateTime_TIME_GET_MINUTE(pytime) * 60000000LL +
+          PyDateTime_TIME_GET_SECOND(pytime) * 1000000LL +
+          PyDateTime_TIME_GET_MICROSECOND(pytime));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_s(PyObject* pytime) { return PyTime_to_us(pytime) / 1000000; }
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_ms(PyObject* pytime) { return PyTime_to_us(pytime) / 1000; }
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyTime_to_ns(PyObject* pytime) { return PyTime_to_us(pytime) * 1000; }
+
+ARROW_PYTHON_EXPORT
+Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
+
+ARROW_PYTHON_EXPORT
+Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out);
+
+// WARNING: This function returns a naive datetime.
+ARROW_PYTHON_EXPORT
+Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out);
+
+// This declaration must be the same as in filesystem/filesystem.h
+using TimePoint =
+    std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>;
+
+ARROW_PYTHON_EXPORT
+int64_t PyDate_to_days(PyDateTime_Date* pydate);
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDate_to_s(PyDateTime_Date* pydate) {
+  return PyDate_to_days(pydate) * 86400LL;
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDate_to_ms(PyDateTime_Date* pydate) {
+  return PyDate_to_days(pydate) * 86400000LL;
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_s(PyDateTime_DateTime* pydatetime) {
+  return (PyDate_to_s(reinterpret_cast<PyDateTime_Date*>(pydatetime)) +
+          PyDateTime_DATE_GET_HOUR(pydatetime) * 3600LL +
+          PyDateTime_DATE_GET_MINUTE(pydatetime) * 60LL +
+          PyDateTime_DATE_GET_SECOND(pydatetime));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_ms(PyDateTime_DateTime* pydatetime) {
+  return (PyDateTime_to_s(pydatetime) * 1000LL +
+          PyDateTime_DATE_GET_MICROSECOND(pydatetime) / 1000);
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_us(PyDateTime_DateTime* pydatetime) {
+  return (PyDateTime_to_s(pydatetime) * 1000000LL +
+          PyDateTime_DATE_GET_MICROSECOND(pydatetime));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDateTime_to_ns(PyDateTime_DateTime* pydatetime) {
+  return PyDateTime_to_us(pydatetime) * 1000LL;
+}
+
+ARROW_PYTHON_EXPORT
+inline TimePoint PyDateTime_to_TimePoint(PyDateTime_DateTime* pydatetime) {
+  return TimePoint(TimePoint::duration(PyDateTime_to_ns(pydatetime)));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t TimePoint_to_ns(TimePoint val) { return val.time_since_epoch().count(); }
+
+ARROW_PYTHON_EXPORT
+inline TimePoint TimePoint_from_s(double val) {
+  return TimePoint(TimePoint::duration(static_cast<int64_t>(1e9 * val)));
+}
+
+ARROW_PYTHON_EXPORT
+inline TimePoint TimePoint_from_ns(int64_t val) {
+  return TimePoint(TimePoint::duration(val));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDelta_to_s(PyDateTime_Delta* pytimedelta) {
+  return (PyDateTime_DELTA_GET_DAYS(pytimedelta) * 86400LL +
+          PyDateTime_DELTA_GET_SECONDS(pytimedelta));
+}
+
+ARROW_PYTHON_EXPORT
+inline int64_t PyDelta_to_ms(PyDateTime_Delta* pytimedelta) {
+  return (PyDelta_to_s(pytimedelta) * 1000LL +
+          PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta) / 1000);
+}
+
+ARROW_PYTHON_EXPORT
+inline Result<int64_t> PyDelta_to_us(PyDateTime_Delta* pytimedelta) {
+  int64_t result = PyDelta_to_s(pytimedelta);
+  if (MultiplyWithOverflow(result, 1000000LL, &result)) {
+    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
+  }
+  if (AddWithOverflow(result, PyDateTime_DELTA_GET_MICROSECONDS(pytimedelta), &result)) {
+    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
+  }
+  return result;
+}
+
+ARROW_PYTHON_EXPORT
+inline Result<int64_t> PyDelta_to_ns(PyDateTime_Delta* pytimedelta) {
+  ARROW_ASSIGN_OR_RAISE(int64_t result, PyDelta_to_us(pytimedelta));
+  if (MultiplyWithOverflow(result, 1000LL, &result)) {
+    return Status::Invalid("Timedelta too large to fit in 64-bit integer");
+  }
+  return result;
+}
+
+ARROW_PYTHON_EXPORT
+Result<int64_t> PyDateTime_utcoffset_s(PyObject* pydatetime);
+
+/// \brief Convert a time zone name into a time zone object.
+///
+/// Supported input strings are:
+/// * As used in the Olson time zone database (the "tz database" or
+///   "tzdata"), such as "America/New_York"
+/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+/// GIL must be held when calling this method.
+ARROW_PYTHON_EXPORT
+Result<PyObject*> StringToTzinfo(const std::string& tz);
+
+/// \brief Convert a time zone object to a string representation.
+///
+/// The output strings are:
+/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+///   if the input object is either an instance of pytz._FixedOffset or
+///   datetime.timedelta
+/// * The timezone's name if the input object's tzname() method returns with a
+///   non-empty timezone name such as "UTC" or "America/New_York"
+///
+/// GIL must be held when calling this method.
+ARROW_PYTHON_EXPORT
+Result<std::string> TzinfoToString(PyObject* pytzinfo);
+
+/// \brief Convert MonthDayNano to a python namedtuple.
+///
+/// Return a named tuple (pyarrow.MonthDayNano) containing attributes
+/// "months", "days", "nanoseconds" in the given order
+/// with values extracted from the fields on interval.
+///
+/// GIL must be held when calling this method.
+ARROW_PYTHON_EXPORT
+PyObject* MonthDayNanoIntervalToNamedTuple(
+    const MonthDayNanoIntervalType::MonthDayNanos& interval);
+
+/// \brief Convert the given Array to a PyList object containing
+/// pyarrow.MonthDayNano objects.
+ARROW_PYTHON_EXPORT
+Result<PyObject*> MonthDayNanoIntervalArrayToPyList(
+    const MonthDayNanoIntervalArray& array);
+
+/// \brief Convert the Scalar object to a pyarrow.MonthDayNano (or None if
+/// is isn't valid).
+ARROW_PYTHON_EXPORT
+Result<PyObject*> MonthDayNanoIntervalScalarToPyObject(
+    const MonthDayNanoIntervalScalar& scalar);
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/decimal.cc b/pyarrow/src/arrow/python/decimal.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1ba8d32192af6693ee264eeaef75646919d5ad51
--- /dev/null
+++ b/pyarrow/src/arrow/python/decimal.cc
@@ -0,0 +1,265 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <limits>
+
+#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace py {
+namespace internal {
+
+Status ImportDecimalType(OwnedRef* decimal_type) {
+  OwnedRef decimal_module;
+  RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(ImportFromModule(decimal_module.obj(), "Decimal", decimal_type));
+  return Status::OK();
+}
+
+Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
+  // Call Python's str(decimal_object)
+  return PyObject_StdStringStr(python_decimal, out);
+}
+
+// \brief Infer the precision and scale of a Python decimal.Decimal instance
+// \param python_decimal[in] An instance of decimal.Decimal
+// \param precision[out] The value of the inferred precision
+// \param scale[out] The value of the inferred scale
+// \return The status of the operation
+static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
+                                            int32_t* scale) {
+  ARROW_DCHECK_NE(python_decimal, NULLPTR);
+  ARROW_DCHECK_NE(precision, NULLPTR);
+  ARROW_DCHECK_NE(scale, NULLPTR);
+
+  // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a
+  // ARROW_DCHECK
+  OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast<char*>("as_tuple"),
+                                        const_cast<char*>("")));
+  RETURN_IF_PYERROR();
+  ARROW_DCHECK(PyTuple_Check(as_tuple.obj()));
+
+  OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits"));
+  RETURN_IF_PYERROR();
+  ARROW_DCHECK(PyTuple_Check(digits.obj()));
+
+  const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj()));
+  RETURN_IF_PYERROR();
+
+  OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent"));
+  RETURN_IF_PYERROR();
+  ARROW_DCHECK(IsPyInteger(py_exponent.obj()));
+
+  const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
+  RETURN_IF_PYERROR();
+
+  if (exponent < 0) {
+    // If exponent > num_digits, we have a number with leading zeros
+    // such as 0.01234.  Ensure we have enough precision for leading zeros
+    // (which are not included in num_digits).
+    *precision = std::max(num_digits, -exponent);
+    *scale = -exponent;
+  } else {
+    // Trailing zeros are not included in num_digits, need to add to precision.
+    // Note we don't generate negative scales as they are poorly supported
+    // in non-Arrow systems.
+    *precision = num_digits + exponent;
+    *scale = 0;
+  }
+  return Status::OK();
+}
+
+PyObject* DecimalFromString(PyObject* decimal_constructor,
+                            const std::string& decimal_string) {
+  ARROW_DCHECK_NE(decimal_constructor, nullptr);
+
+  auto string_size = decimal_string.size();
+  ARROW_DCHECK_GT(string_size, 0);
+
+  auto string_bytes = decimal_string.c_str();
+  ARROW_DCHECK_NE(string_bytes, nullptr);
+
+  return PyObject_CallFunction(decimal_constructor, const_cast<char*>("s#"), string_bytes,
+                               static_cast<Py_ssize_t>(string_size));
+}
+
+namespace {
+
+template <typename ArrowDecimal>
+Status DecimalFromStdString(const std::string& decimal_string,
+                            const DecimalType& arrow_type, ArrowDecimal* out) {
+  int32_t inferred_precision;
+  int32_t inferred_scale;
+
+  RETURN_NOT_OK(ArrowDecimal::FromString(decimal_string, out, &inferred_precision,
+                                         &inferred_scale));
+
+  const int32_t precision = arrow_type.precision();
+  const int32_t scale = arrow_type.scale();
+
+  if (scale != inferred_scale) {
+    ARROW_DCHECK_NE(out, NULLPTR);
+    ARROW_ASSIGN_OR_RAISE(*out, out->Rescale(inferred_scale, scale));
+  }
+
+  auto inferred_scale_delta = inferred_scale - scale;
+  if (ARROW_PREDICT_FALSE((inferred_precision - inferred_scale_delta) > precision)) {
+    return Status::Invalid(
+        "Decimal type with precision ", inferred_precision,
+        " does not fit into precision inferred from first array element: ", precision);
+  }
+
+  return Status::OK();
+}
+
+template <typename ArrowDecimal>
+Status InternalDecimalFromPythonDecimal(PyObject* python_decimal,
+                                        const DecimalType& arrow_type,
+                                        ArrowDecimal* out) {
+  ARROW_DCHECK_NE(python_decimal, NULLPTR);
+  ARROW_DCHECK_NE(out, NULLPTR);
+
+  std::string string;
+  RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string));
+  return DecimalFromStdString(string, arrow_type, out);
+}
+
+template <typename ArrowDecimal>
+Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
+                                   ArrowDecimal* out) {
+  ARROW_DCHECK_NE(obj, NULLPTR);
+  ARROW_DCHECK_NE(out, NULLPTR);
+
+  if (IsPyInteger(obj)) {
+    // TODO: add a fast path for small-ish ints
+    std::string string;
+    RETURN_NOT_OK(PyObject_StdStringStr(obj, &string));
+    return DecimalFromStdString(string, arrow_type, out);
+  } else if (PyDecimal_Check(obj)) {
+    return InternalDecimalFromPythonDecimal<ArrowDecimal>(obj, arrow_type, out);
+  } else {
+    return Status::TypeError("int or Decimal object expected, got ",
+                             Py_TYPE(obj)->tp_name);
+  }
+}
+
+}  // namespace
+
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal32* out) {
+  return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out);
+}
+
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out) {
+  return InternalDecimalFromPyObject(obj, arrow_type, out);
+}
+
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal64* out) {
+  return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out);
+}
+
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out) {
+  return InternalDecimalFromPyObject(obj, arrow_type, out);
+}
+
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal128* out) {
+  return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out);
+}
+
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
+                           Decimal128* out) {
+  return InternalDecimalFromPyObject(obj, arrow_type, out);
+}
+
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal256* out) {
+  return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out);
+}
+
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type,
+                           Decimal256* out) {
+  return InternalDecimalFromPyObject(obj, arrow_type, out);
+}
+
+bool PyDecimal_Check(PyObject* obj) {
+  static OwnedRef decimal_type;
+  if (!decimal_type.obj()) {
+    ARROW_CHECK_OK(ImportDecimalType(&decimal_type));
+    ARROW_DCHECK(PyType_Check(decimal_type.obj()));
+  }
+  // PyObject_IsInstance() is slower as it has to check for virtual subclasses
+  const int result =
+      PyType_IsSubtype(Py_TYPE(obj), reinterpret_cast<PyTypeObject*>(decimal_type.obj()));
+  ARROW_CHECK_NE(result, -1) << " error during PyType_IsSubtype check";
+  return result == 1;
+}
+
+bool PyDecimal_ISNAN(PyObject* obj) {
+  ARROW_DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal";
+  OwnedRef is_nan(
+      PyObject_CallMethod(obj, const_cast<char*>("is_nan"), const_cast<char*>("")));
+  return PyObject_IsTrue(is_nan.obj()) == 1;
+}
+
+DecimalMetadata::DecimalMetadata()
+    : DecimalMetadata(std::numeric_limits<int32_t>::min(),
+                      std::numeric_limits<int32_t>::min()) {}
+
+DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale)
+    : precision_(precision), scale_(scale) {}
+
+Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) {
+  const int32_t current_scale = scale_;
+  scale_ = std::max(current_scale, suggested_scale);
+
+  const int32_t current_precision = precision_;
+
+  if (current_precision == std::numeric_limits<int32_t>::min()) {
+    precision_ = suggested_precision;
+  } else {
+    auto num_digits = std::max(current_precision - current_scale,
+                               suggested_precision - suggested_scale);
+    precision_ = std::max(num_digits + scale_, current_precision);
+  }
+
+  return Status::OK();
+}
+
+Status DecimalMetadata::Update(PyObject* object) {
+  bool is_decimal = PyDecimal_Check(object);
+
+  if (ARROW_PREDICT_FALSE(!is_decimal || PyDecimal_ISNAN(object))) {
+    return Status::OK();
+  }
+
+  int32_t precision = 0;
+  int32_t scale = 0;
+  RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale));
+  return Update(precision, scale);
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/decimal.h b/pyarrow/src/arrow/python/decimal.h
new file mode 100644
index 0000000000000000000000000000000000000000..83ded0b82b922afe2afdd2e9b5f405ccf1dd2062
--- /dev/null
+++ b/pyarrow/src/arrow/python/decimal.h
@@ -0,0 +1,162 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+class Decimal128;
+class Decimal256;
+
+namespace py {
+
+class OwnedRef;
+
+//
+// Python Decimal support
+//
+
+namespace internal {
+
+// \brief Import the Python Decimal type
+ARROW_PYTHON_EXPORT
+Status ImportDecimalType(OwnedRef* decimal_type);
+
+// \brief Convert a Python Decimal object to a C++ string
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[out] The string representation of the Python Decimal instance
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
+
+// \brief Convert a C++ std::string to a Python Decimal instance
+// \param[in] decimal_constructor The decimal type object
+// \param[in] decimal_string A decimal string
+// \return An instance of decimal.Decimal
+ARROW_PYTHON_EXPORT
+PyObject* DecimalFromString(PyObject* decimal_constructor,
+                            const std::string& decimal_string);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal32* out);
+
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal32* out);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal64* out);
+
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal64* out);
+
+// \brief Convert a Python decimal to an Arrow Decimal128 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal128* out);
+
+// \brief Convert a Python object to an Arrow Decimal128 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal128
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal128* out);
+
+// \brief Convert a Python decimal to an Arrow Decimal256 object
+// \param[in] python_decimal A Python decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal256
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
+                                Decimal256* out);
+
+// \brief Convert a Python object to an Arrow Decimal256 object
+// \param[in] python_decimal A Python int or decimal.Decimal instance
+// \param[in] arrow_type An instance of arrow::DecimalType
+// \param[out] out A pointer to a Decimal256
+// \return The status of the operation
+ARROW_PYTHON_EXPORT
+Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, Decimal256* out);
+
+// \brief Check whether obj is an instance of Decimal
+ARROW_PYTHON_EXPORT
+bool PyDecimal_Check(PyObject* obj);
+
+// \brief Check whether obj is nan. This function will abort the program if the argument
+// is not a Decimal instance
+ARROW_PYTHON_EXPORT
+bool PyDecimal_ISNAN(PyObject* obj);
+
+// \brief Helper class to track and update the precision and scale of a decimal
+class ARROW_PYTHON_EXPORT DecimalMetadata {
+ public:
+  DecimalMetadata();
+  DecimalMetadata(int32_t precision, int32_t scale);
+
+  // \brief Adjust the precision and scale of a decimal type given a new precision and a
+  // new scale \param[in] suggested_precision A candidate precision \param[in]
+  // suggested_scale A candidate scale \return The status of the operation
+  Status Update(int32_t suggested_precision, int32_t suggested_scale);
+
+  // \brief A convenient interface for updating the precision and scale based on a Python
+  // Decimal object \param object A Python Decimal object \return The status of the
+  // operation
+  Status Update(PyObject* object);
+
+  int32_t precision() const { return precision_; }
+  int32_t scale() const { return scale_; }
+
+ private:
+  int32_t precision_;
+  int32_t scale_;
+};
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/extension_type.cc b/pyarrow/src/arrow/python/extension_type.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8c3f3f1d8d0f7fe5c5d6d3bdab58fd7c48136654
--- /dev/null
+++ b/pyarrow/src/arrow/python/extension_type.cc
@@ -0,0 +1,221 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <sstream>
+#include <utility>
+
+#include "arrow/python/extension_type.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/vendored/pythoncapi_compat.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace py {
+
+namespace {
+
+// Serialize a Python ExtensionType instance
+Status SerializeExtInstance(PyObject* type_instance, std::string* out) {
+  OwnedRef res(
+      cpp_PyObject_CallMethod(type_instance, "__arrow_ext_serialize__", nullptr));
+  if (!res) {
+    return ConvertPyError();
+  }
+  if (!PyBytes_Check(res.obj())) {
+    return Status::TypeError(
+        "__arrow_ext_serialize__ should return bytes object, "
+        "got ",
+        internal::PyObject_StdStringRepr(res.obj()));
+  }
+  *out = internal::PyBytes_AsStdString(res.obj());
+  return Status::OK();
+}
+
+// Deserialize a Python ExtensionType instance
+PyObject* DeserializeExtInstance(PyObject* type_class,
+                                 std::shared_ptr<DataType> storage_type,
+                                 const std::string& serialized_data) {
+  OwnedRef storage_ref(wrap_data_type(storage_type));
+  if (!storage_ref) {
+    return nullptr;
+  }
+  OwnedRef data_ref(PyBytes_FromStringAndSize(
+      serialized_data.data(), static_cast<Py_ssize_t>(serialized_data.size())));
+  if (!data_ref) {
+    return nullptr;
+  }
+
+  return cpp_PyObject_CallMethod(type_class, "__arrow_ext_deserialize__", "OO",
+                                 storage_ref.obj(), data_ref.obj());
+}
+
+}  // namespace
+
+static const char* kExtensionName = "arrow.py_extension_type";
+
+std::string PyExtensionType::ToString(bool show_metadata) const {
+  PyAcquireGIL lock;
+
+  std::stringstream ss;
+  OwnedRef instance(GetInstance());
+  ss << "extension<" << this->extension_name() << "<" << Py_TYPE(instance.obj())->tp_name
+     << ">>";
+  return ss.str();
+}
+
+PyExtensionType::PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
+                                 PyObject* inst)
+    : ExtensionType(storage_type),
+      extension_name_(kExtensionName),
+      type_class_(typ),
+      type_instance_(inst) {}
+
+PyExtensionType::PyExtensionType(std::shared_ptr<DataType> storage_type,
+                                 std::string extension_name, PyObject* typ,
+                                 PyObject* inst)
+    : ExtensionType(storage_type),
+      extension_name_(std::move(extension_name)),
+      type_class_(typ),
+      type_instance_(inst) {}
+
+bool PyExtensionType::ExtensionEquals(const ExtensionType& other) const {
+  PyAcquireGIL lock;
+
+  if (other.extension_name() != extension_name()) {
+    return false;
+  }
+  const auto& other_ext = checked_cast<const PyExtensionType&>(other);
+  int res = -1;
+  if (!type_instance_) {
+    if (other_ext.type_instance_) {
+      return false;
+    }
+    // Compare Python types
+    res = PyObject_RichCompareBool(type_class_.obj(), other_ext.type_class_.obj(), Py_EQ);
+  } else {
+    if (!other_ext.type_instance_) {
+      return false;
+    }
+    // Compare Python instances
+    OwnedRef left(GetInstance());
+    OwnedRef right(other_ext.GetInstance());
+    if (!left || !right) {
+      goto error;
+    }
+    res = PyObject_RichCompareBool(left.obj(), right.obj(), Py_EQ);
+  }
+  if (res == -1) {
+    goto error;
+  }
+  return res == 1;
+
+error:
+  // Cannot propagate error
+  PyErr_WriteUnraisable(nullptr);
+  return false;
+}
+
+std::shared_ptr<Array> PyExtensionType::MakeArray(std::shared_ptr<ArrayData> data) const {
+  ARROW_DCHECK_EQ(data->type->id(), Type::EXTENSION);
+  return std::make_shared<ExtensionArray>(data);
+}
+
+std::string PyExtensionType::Serialize() const {
+  ARROW_DCHECK(type_instance_);
+  return serialized_;
+}
+
+Result<std::shared_ptr<DataType>> PyExtensionType::Deserialize(
+    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
+  PyAcquireGIL lock;
+
+  if (import_pyarrow()) {
+    return ConvertPyError();
+  }
+  OwnedRef res(DeserializeExtInstance(type_class_.obj(), storage_type, serialized_data));
+  if (!res) {
+    return ConvertPyError();
+  }
+  return unwrap_data_type(res.obj());
+}
+
+PyObject* PyExtensionType::GetInstance() const {
+  if (!type_instance_) {
+    PyErr_SetString(PyExc_TypeError, "Not an instance");
+    return nullptr;
+  }
+  ARROW_DCHECK(PyWeakref_CheckRef(type_instance_.obj()));
+  PyObject* inst = NULL;
+  int result = PyWeakref_GetRef(type_instance_.obj(), &inst);
+  if (result == 1) {
+    // Alive: inst is a new strong reference
+    return inst;
+  } else if (result == 0) {
+    // Weakref is dead, must reconstruct from serialized form
+    // XXX cache again?
+    return DeserializeExtInstance(type_class_.obj(), storage_type_, serialized_);
+  } else {
+    // -1 = exception
+    return nullptr;
+  }
+}
+
+Status PyExtensionType::SetInstance(PyObject* inst) const {
+  // Check we have the right type
+  PyObject* typ = reinterpret_cast<PyObject*>(Py_TYPE(inst));
+  if (typ != type_class_.obj()) {
+    return Status::TypeError("Unexpected Python ExtensionType class ",
+                             internal::PyObject_StdStringRepr(typ), " expected ",
+                             internal::PyObject_StdStringRepr(type_class_.obj()));
+  }
+
+  PyObject* wr = PyWeakref_NewRef(inst, nullptr);
+  if (wr == NULL) {
+    return ConvertPyError();
+  }
+  type_instance_.reset(wr);
+  return SerializeExtInstance(inst, &serialized_);
+}
+
+Status PyExtensionType::FromClass(const std::shared_ptr<DataType> storage_type,
+                                  const std::string extension_name, PyObject* typ,
+                                  std::shared_ptr<ExtensionType>* out) {
+  Py_INCREF(typ);
+  out->reset(new PyExtensionType(storage_type, std::move(extension_name), typ));
+  return Status::OK();
+}
+
+Status RegisterPyExtensionType(const std::shared_ptr<DataType>& type) {
+  ARROW_DCHECK_EQ(type->id(), Type::EXTENSION);
+  auto ext_type = std::dynamic_pointer_cast<ExtensionType>(type);
+  return RegisterExtensionType(ext_type);
+}
+
+Status UnregisterPyExtensionType(const std::string& type_name) {
+  return UnregisterExtensionType(type_name);
+}
+
+std::string PyExtensionName() { return kExtensionName; }
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/extension_type.h b/pyarrow/src/arrow/python/extension_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6523824eb9634c18b87e4e3e5c827d8be43f8a8
--- /dev/null
+++ b/pyarrow/src/arrow/python/extension_type.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "arrow/extension_type.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace py {
+
+class ARROW_PYTHON_EXPORT PyExtensionType : public ExtensionType {
+ public:
+  // Implement extensionType API
+  std::string extension_name() const override { return extension_name_; }
+
+  std::string ToString(bool show_metadata = false) const override;
+
+  bool ExtensionEquals(const ExtensionType& other) const override;
+
+  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;
+
+  Result<std::shared_ptr<DataType>> Deserialize(
+      std::shared_ptr<DataType> storage_type,
+      const std::string& serialized) const override;
+
+  std::string Serialize() const override;
+
+  // For use from Cython
+  // Assumes that `typ` is borrowed
+  static Status FromClass(const std::shared_ptr<DataType> storage_type,
+                          const std::string extension_name, PyObject* typ,
+                          std::shared_ptr<ExtensionType>* out);
+
+  // Return new ref
+  PyObject* GetInstance() const;
+  Status SetInstance(PyObject*) const;
+
+ protected:
+  PyExtensionType(std::shared_ptr<DataType> storage_type, PyObject* typ,
+                  PyObject* inst = NULLPTR);
+  PyExtensionType(std::shared_ptr<DataType> storage_type, std::string extension_name,
+                  PyObject* typ, PyObject* inst = NULLPTR);
+
+  std::string extension_name_;
+
+  // These fields are mutable because of two-step initialization.
+  mutable OwnedRefNoGIL type_class_;
+  // A weakref or null.  Storing a strong reference to the Python extension type
+  // instance would create an unreclaimable reference cycle between Python and C++
+  // (the Python instance has to keep a strong reference to the C++ ExtensionType
+  //  in other direction).  Instead, we store a weakref to the instance.
+  // If the weakref is dead, we reconstruct the instance from its serialized form.
+  mutable OwnedRefNoGIL type_instance_;
+  // Empty if type_instance_ is null
+  mutable std::string serialized_;
+};
+
+ARROW_PYTHON_EXPORT std::string PyExtensionName();
+
+ARROW_PYTHON_EXPORT Status RegisterPyExtensionType(const std::shared_ptr<DataType>&);
+
+ARROW_PYTHON_EXPORT Status UnregisterPyExtensionType(const std::string& type_name);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/filesystem.cc b/pyarrow/src/arrow/python/filesystem.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e9b500a4f7b4a024ba8021f05e230c837494226
--- /dev/null
+++ b/pyarrow/src/arrow/python/filesystem.cc
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/filesystem.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using fs::FileInfo;
+using fs::FileSelector;
+
+namespace py {
+namespace fs {
+
+PyFileSystem::PyFileSystem(PyObject* handler, PyFileSystemVtable vtable)
+    : handler_(handler), vtable_(std::move(vtable)) {
+  Py_INCREF(handler);
+}
+
+PyFileSystem::~PyFileSystem() {}
+
+std::shared_ptr<PyFileSystem> PyFileSystem::Make(PyObject* handler,
+                                                 PyFileSystemVtable vtable) {
+  return std::make_shared<PyFileSystem>(handler, std::move(vtable));
+}
+
+std::string PyFileSystem::type_name() const {
+  std::string result;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_type_name(handler_.obj(), &result);
+    if (PyErr_Occurred()) {
+      PyErr_WriteUnraisable(handler_.obj());
+    }
+    return Status::OK();
+  });
+  ARROW_UNUSED(st);
+  return result;
+}
+
+bool PyFileSystem::Equals(const FileSystem& other) const {
+  bool result;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    result = vtable_.equals(handler_.obj(), other);
+    if (PyErr_Occurred()) {
+      PyErr_WriteUnraisable(handler_.obj());
+    }
+    return Status::OK();
+  });
+  ARROW_UNUSED(st);
+  return result;
+}
+
+Result<FileInfo> PyFileSystem::GetFileInfo(const std::string& path) {
+  FileInfo info;
+
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_file_info(handler_.obj(), path, &info);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return info;
+}
+
+Result<std::vector<FileInfo>> PyFileSystem::GetFileInfo(
+    const std::vector<std::string>& paths) {
+  std::vector<FileInfo> infos;
+
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_file_info_vector(handler_.obj(), paths, &infos);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return infos;
+}
+
+Result<std::vector<FileInfo>> PyFileSystem::GetFileInfo(const FileSelector& select) {
+  std::vector<FileInfo> infos;
+
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.get_file_info_selector(handler_.obj(), select, &infos);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return infos;
+}
+
+Status PyFileSystem::CreateDir(const std::string& path, bool recursive) {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.create_dir(handler_.obj(), path, recursive);
+    return CheckPyError();
+  });
+}
+
+Status PyFileSystem::DeleteDir(const std::string& path) {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.delete_dir(handler_.obj(), path);
+    return CheckPyError();
+  });
+}
+
+Status PyFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.delete_dir_contents(handler_.obj(), path, missing_dir_ok);
+    return CheckPyError();
+  });
+}
+
+Status PyFileSystem::DeleteRootDirContents() {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.delete_root_dir_contents(handler_.obj());
+    return CheckPyError();
+  });
+}
+
+Status PyFileSystem::DeleteFile(const std::string& path) {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.delete_file(handler_.obj(), path);
+    return CheckPyError();
+  });
+}
+
+Status PyFileSystem::Move(const std::string& src, const std::string& dest) {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.move(handler_.obj(), src, dest);
+    return CheckPyError();
+  });
+}
+
+Status PyFileSystem::CopyFile(const std::string& src, const std::string& dest) {
+  return SafeCallIntoPython([&]() -> Status {
+    vtable_.copy_file(handler_.obj(), src, dest);
+    return CheckPyError();
+  });
+}
+
+Result<std::shared_ptr<io::InputStream>> PyFileSystem::OpenInputStream(
+    const std::string& path) {
+  std::shared_ptr<io::InputStream> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_input_stream(handler_.obj(), path, &stream);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return stream;
+}
+
+Result<std::shared_ptr<io::RandomAccessFile>> PyFileSystem::OpenInputFile(
+    const std::string& path) {
+  std::shared_ptr<io::RandomAccessFile> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_input_file(handler_.obj(), path, &stream);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return stream;
+}
+
+Result<std::shared_ptr<io::OutputStream>> PyFileSystem::OpenOutputStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  std::shared_ptr<io::OutputStream> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_output_stream(handler_.obj(), path, metadata, &stream);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return stream;
+}
+
+Result<std::shared_ptr<io::OutputStream>> PyFileSystem::OpenAppendStream(
+    const std::string& path, const std::shared_ptr<const KeyValueMetadata>& metadata) {
+  std::shared_ptr<io::OutputStream> stream;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.open_append_stream(handler_.obj(), path, metadata, &stream);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return stream;
+}
+
+Result<std::string> PyFileSystem::NormalizePath(std::string path) {
+  std::string normalized;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.normalize_path(handler_.obj(), path, &normalized);
+    return CheckPyError();
+  });
+  RETURN_NOT_OK(st);
+  return normalized;
+}
+
+}  // namespace fs
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/filesystem.h b/pyarrow/src/arrow/python/filesystem.h
new file mode 100644
index 0000000000000000000000000000000000000000..194b226ac5c35d4b3518c2e9fa9443c2ba1007ae
--- /dev/null
+++ b/pyarrow/src/arrow/python/filesystem.h
@@ -0,0 +1,130 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/filesystem/filesystem.h"
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/util/macros.h"
+
+namespace arrow::py::fs {
+
+class ARROW_PYTHON_EXPORT PyFileSystemVtable {
+ public:
+  std::function<void(PyObject*, std::string* out)> get_type_name;
+  std::function<bool(PyObject*, const arrow::fs::FileSystem& other)> equals;
+
+  std::function<void(PyObject*, const std::string& path, arrow::fs::FileInfo* out)>
+      get_file_info;
+  std::function<void(PyObject*, const std::vector<std::string>& paths,
+                     std::vector<arrow::fs::FileInfo>* out)>
+      get_file_info_vector;
+  std::function<void(PyObject*, const arrow::fs::FileSelector&,
+                     std::vector<arrow::fs::FileInfo>* out)>
+      get_file_info_selector;
+
+  std::function<void(PyObject*, const std::string& path, bool)> create_dir;
+  std::function<void(PyObject*, const std::string& path)> delete_dir;
+  std::function<void(PyObject*, const std::string& path, bool)> delete_dir_contents;
+  std::function<void(PyObject*)> delete_root_dir_contents;
+  std::function<void(PyObject*, const std::string& path)> delete_file;
+  std::function<void(PyObject*, const std::string& src, const std::string& dest)> move;
+  std::function<void(PyObject*, const std::string& src, const std::string& dest)>
+      copy_file;
+
+  std::function<void(PyObject*, const std::string& path,
+                     std::shared_ptr<io::InputStream>* out)>
+      open_input_stream;
+  std::function<void(PyObject*, const std::string& path,
+                     std::shared_ptr<io::RandomAccessFile>* out)>
+      open_input_file;
+  std::function<void(PyObject*, const std::string& path,
+                     const std::shared_ptr<const KeyValueMetadata>&,
+                     std::shared_ptr<io::OutputStream>* out)>
+      open_output_stream;
+  std::function<void(PyObject*, const std::string& path,
+                     const std::shared_ptr<const KeyValueMetadata>&,
+                     std::shared_ptr<io::OutputStream>* out)>
+      open_append_stream;
+
+  std::function<void(PyObject*, const std::string& path, std::string* out)>
+      normalize_path;
+};
+
+class ARROW_PYTHON_EXPORT PyFileSystem : public arrow::fs::FileSystem {
+ public:
+  PyFileSystem(PyObject* handler, PyFileSystemVtable vtable);
+  ~PyFileSystem() override;
+
+  static std::shared_ptr<PyFileSystem> Make(PyObject* handler, PyFileSystemVtable vtable);
+
+  std::string type_name() const override;
+
+  bool Equals(const FileSystem& other) const override;
+
+  /// \cond FALSE
+  using FileSystem::CreateDir;
+  using FileSystem::DeleteDirContents;
+  using FileSystem::GetFileInfo;
+  using FileSystem::OpenAppendStream;
+  using FileSystem::OpenOutputStream;
+  /// \endcond
+
+  Result<arrow::fs::FileInfo> GetFileInfo(const std::string& path) override;
+  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
+      const std::vector<std::string>& paths) override;
+  Result<std::vector<arrow::fs::FileInfo>> GetFileInfo(
+      const arrow::fs::FileSelector& select) override;
+
+  Status CreateDir(const std::string& path, bool recursive) override;
+
+  Status DeleteDir(const std::string& path) override;
+  Status DeleteDirContents(const std::string& path, bool missing_dir_ok) override;
+  Status DeleteRootDirContents() override;
+
+  Status DeleteFile(const std::string& path) override;
+
+  Status Move(const std::string& src, const std::string& dest) override;
+
+  Status CopyFile(const std::string& src, const std::string& dest) override;
+
+  Result<std::shared_ptr<io::InputStream>> OpenInputStream(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile(
+      const std::string& path) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenOutputStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+  Result<std::shared_ptr<io::OutputStream>> OpenAppendStream(
+      const std::string& path,
+      const std::shared_ptr<const KeyValueMetadata>& metadata) override;
+
+  Result<std::string> NormalizePath(std::string path) override;
+
+  PyObject* handler() const { return handler_.obj(); }
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyFileSystemVtable vtable_;
+};
+
+}  // namespace arrow::py::fs
diff --git a/pyarrow/src/arrow/python/flight.cc b/pyarrow/src/arrow/python/flight.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ef8a1dd6b0506b95e89e556f634fae9b93bba59
--- /dev/null
+++ b/pyarrow/src/arrow/python/flight.cc
@@ -0,0 +1,390 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <signal.h>
+#include <utility>
+
+#include "arrow/python/flight.h"
+#include "arrow/util/io_util.h"
+#include "arrow/util/logging.h"
+
+using arrow::flight::FlightPayload;
+
+namespace arrow {
+namespace py {
+namespace flight {
+
+const char* kPyServerMiddlewareName = "arrow.py_server_middleware";
+
+PyServerAuthHandler::PyServerAuthHandler(PyObject* handler,
+                                         const PyServerAuthHandlerVtable& vtable)
+    : vtable_(vtable) {
+  Py_INCREF(handler);
+  handler_.reset(handler);
+}
+
+Status PyServerAuthHandler::Authenticate(const arrow::flight::ServerCallContext& context,
+                                         arrow::flight::ServerAuthSender* outgoing,
+                                         arrow::flight::ServerAuthReader* incoming) {
+  return SafeCallIntoPython([=] {
+    const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyServerAuthHandler::IsValid(const std::string& token,
+                                    std::string* peer_identity) {
+  return SafeCallIntoPython([=] {
+    const Status status = vtable_.is_valid(handler_.obj(), token, peer_identity);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+PyClientAuthHandler::PyClientAuthHandler(PyObject* handler,
+                                         const PyClientAuthHandlerVtable& vtable)
+    : vtable_(vtable) {
+  Py_INCREF(handler);
+  handler_.reset(handler);
+}
+
+Status PyClientAuthHandler::Authenticate(arrow::flight::ClientAuthSender* outgoing,
+                                         arrow::flight::ClientAuthReader* incoming) {
+  return SafeCallIntoPython([=] {
+    const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyClientAuthHandler::GetToken(std::string* token) {
+  return SafeCallIntoPython([=] {
+    const Status status = vtable_.get_token(handler_.obj(), token);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+PyFlightServer::PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable)
+    : vtable_(vtable) {
+  Py_INCREF(server);
+  server_.reset(server);
+}
+
+Status PyFlightServer::ListFlights(
+    const arrow::flight::ServerCallContext& context,
+    const arrow::flight::Criteria* criteria,
+    std::unique_ptr<arrow::flight::FlightListing>* listings) {
+  return SafeCallIntoPython([&] {
+    const Status status =
+        vtable_.list_flights(server_.obj(), context, criteria, listings);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::GetFlightInfo(const arrow::flight::ServerCallContext& context,
+                                     const arrow::flight::FlightDescriptor& request,
+                                     std::unique_ptr<arrow::flight::FlightInfo>* info) {
+  return SafeCallIntoPython([&] {
+    const Status status = vtable_.get_flight_info(server_.obj(), context, request, info);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::GetSchema(const arrow::flight::ServerCallContext& context,
+                                 const arrow::flight::FlightDescriptor& request,
+                                 std::unique_ptr<arrow::flight::SchemaResult>* result) {
+  return SafeCallIntoPython([&] {
+    const Status status = vtable_.get_schema(server_.obj(), context, request, result);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::DoGet(const arrow::flight::ServerCallContext& context,
+                             const arrow::flight::Ticket& request,
+                             std::unique_ptr<arrow::flight::FlightDataStream>* stream) {
+  return SafeCallIntoPython([&] {
+    const Status status = vtable_.do_get(server_.obj(), context, request, stream);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::DoPut(
+    const arrow::flight::ServerCallContext& context,
+    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
+    std::unique_ptr<arrow::flight::FlightMetadataWriter> writer) {
+  return SafeCallIntoPython([&] {
+    const Status status =
+        vtable_.do_put(server_.obj(), context, std::move(reader), std::move(writer));
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::DoExchange(
+    const arrow::flight::ServerCallContext& context,
+    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
+    std::unique_ptr<arrow::flight::FlightMessageWriter> writer) {
+  return SafeCallIntoPython([&] {
+    const Status status =
+        vtable_.do_exchange(server_.obj(), context, std::move(reader), std::move(writer));
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::DoAction(const arrow::flight::ServerCallContext& context,
+                                const arrow::flight::Action& action,
+                                std::unique_ptr<arrow::flight::ResultStream>* result) {
+  return SafeCallIntoPython([&] {
+    const Status status = vtable_.do_action(server_.obj(), context, action, result);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::ListActions(const arrow::flight::ServerCallContext& context,
+                                   std::vector<arrow::flight::ActionType>* actions) {
+  return SafeCallIntoPython([&] {
+    const Status status = vtable_.list_actions(server_.obj(), context, actions);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+Status PyFlightServer::ServeWithSignals() {
+  // Respect the current Python settings, i.e. only interrupt the server if there is
+  // an active signal handler for SIGINT and SIGTERM.
+  std::vector<int> signals;
+  for (const int signum : {SIGINT, SIGTERM}) {
+    ARROW_ASSIGN_OR_RAISE(auto handler, ::arrow::internal::GetSignalHandler(signum));
+    auto cb = handler.callback();
+    if (cb != SIG_DFL && cb != SIG_IGN) {
+      signals.push_back(signum);
+    }
+  }
+  RETURN_NOT_OK(SetShutdownOnSignals(signals));
+
+  // Serve until we got told to shutdown or a signal interrupted us
+  RETURN_NOT_OK(Serve());
+  int signum = GotSignal();
+  if (signum != 0) {
+    // Issue the signal again with Python's signal handlers restored
+    PyAcquireGIL lock;
+    raise(signum);
+    // XXX Ideally we would loop and serve again if no exception was raised.
+    // Unfortunately, gRPC will return immediately if Serve() is called again.
+    ARROW_UNUSED(PyErr_CheckSignals());
+  }
+
+  return Status::OK();
+}
+
+PyFlightResultStream::PyFlightResultStream(PyObject* generator,
+                                           PyFlightResultStreamCallback callback)
+    : callback_(callback) {
+  Py_INCREF(generator);
+  generator_.reset(generator);
+}
+
+arrow::Result<std::unique_ptr<arrow::flight::Result>> PyFlightResultStream::Next() {
+  return SafeCallIntoPython(
+      [=]() -> arrow::Result<std::unique_ptr<arrow::flight::Result>> {
+        std::unique_ptr<arrow::flight::Result> result;
+        const Status status = callback_(generator_.obj(), &result);
+        RETURN_NOT_OK(CheckPyError());
+        RETURN_NOT_OK(status);
+        return result;
+      });
+}
+
+PyFlightDataStream::PyFlightDataStream(
+    PyObject* data_source, std::unique_ptr<arrow::flight::FlightDataStream> stream)
+    : stream_(std::move(stream)) {
+  Py_INCREF(data_source);
+  data_source_.reset(data_source);
+}
+
+std::shared_ptr<Schema> PyFlightDataStream::schema() { return stream_->schema(); }
+
+arrow::Result<FlightPayload> PyFlightDataStream::GetSchemaPayload() {
+  return stream_->GetSchemaPayload();
+}
+
+arrow::Result<FlightPayload> PyFlightDataStream::Next() { return stream_->Next(); }
+
+PyGeneratorFlightDataStream::PyGeneratorFlightDataStream(
+    PyObject* generator, std::shared_ptr<arrow::Schema> schema,
+    PyGeneratorFlightDataStreamCallback callback, const ipc::IpcWriteOptions& options)
+    : schema_(schema), mapper_(*schema_), options_(options), callback_(callback) {
+  Py_INCREF(generator);
+  generator_.reset(generator);
+}
+
+std::shared_ptr<Schema> PyGeneratorFlightDataStream::schema() { return schema_; }
+
+arrow::Result<FlightPayload> PyGeneratorFlightDataStream::GetSchemaPayload() {
+  FlightPayload payload;
+  RETURN_NOT_OK(ipc::GetSchemaPayload(*schema_, options_, mapper_, &payload.ipc_message));
+  return payload;
+}
+
+arrow::Result<FlightPayload> PyGeneratorFlightDataStream::Next() {
+  return SafeCallIntoPython([=]() -> arrow::Result<FlightPayload> {
+    FlightPayload payload;
+    const Status status = callback_(generator_.obj(), &payload);
+    RETURN_NOT_OK(CheckPyError());
+    RETURN_NOT_OK(status);
+    return payload;
+  });
+}
+
+// Flight Server Middleware
+
+PyServerMiddlewareFactory::PyServerMiddlewareFactory(PyObject* factory,
+                                                     StartCallCallback start_call)
+    : start_call_(start_call) {
+  Py_INCREF(factory);
+  factory_.reset(factory);
+}
+
+Status PyServerMiddlewareFactory::StartCall(
+    const arrow::flight::CallInfo& info, const arrow::flight::ServerCallContext& context,
+    std::shared_ptr<arrow::flight::ServerMiddleware>* middleware) {
+  return SafeCallIntoPython([&] {
+    const Status status =
+        start_call_(factory_.obj(), info, context.incoming_headers(), middleware);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+}
+
+PyServerMiddleware::PyServerMiddleware(PyObject* middleware, Vtable vtable)
+    : vtable_(vtable) {
+  Py_INCREF(middleware);
+  middleware_.reset(middleware);
+}
+
+void PyServerMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python server middleware failed in SendingHeaders");
+}
+
+void PyServerMiddleware::CallCompleted(const Status& call_status) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.call_completed(middleware_.obj(), call_status);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python server middleware failed in CallCompleted");
+}
+
+std::string PyServerMiddleware::name() const { return kPyServerMiddlewareName; }
+
+PyObject* PyServerMiddleware::py_object() const { return middleware_.obj(); }
+
+// Flight Client Middleware
+
+PyClientMiddlewareFactory::PyClientMiddlewareFactory(PyObject* factory,
+                                                     StartCallCallback start_call)
+    : start_call_(start_call) {
+  Py_INCREF(factory);
+  factory_.reset(factory);
+}
+
+void PyClientMiddlewareFactory::StartCall(
+    const arrow::flight::CallInfo& info,
+    std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = start_call_(factory_.obj(), info, middleware);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
+}
+
+PyClientMiddleware::PyClientMiddleware(PyObject* middleware, Vtable vtable)
+    : vtable_(vtable) {
+  Py_INCREF(middleware);
+  middleware_.reset(middleware);
+}
+
+void PyClientMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
+}
+
+void PyClientMiddleware::ReceivedHeaders(
+    const arrow::flight::CallHeaders& incoming_headers) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.received_headers(middleware_.obj(), incoming_headers);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
+}
+
+void PyClientMiddleware::CallCompleted(const Status& call_status) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.call_completed(middleware_.obj(), call_status);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
+}
+
+Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
+                        const arrow::flight::FlightDescriptor& descriptor,
+                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
+                        int64_t total_records, int64_t total_bytes, bool ordered,
+                        const std::string& app_metadata,
+                        std::unique_ptr<arrow::flight::FlightInfo>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto result, arrow::flight::FlightInfo::Make(
+                                         schema, descriptor, endpoints, total_records,
+                                         total_bytes, ordered, app_metadata));
+  *out = std::unique_ptr<arrow::flight::FlightInfo>(
+      new arrow::flight::FlightInfo(std::move(result)));
+  return Status::OK();
+}
+
+Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
+                          std::unique_ptr<arrow::flight::SchemaResult>* out) {
+  return arrow::flight::SchemaResult::Make(*schema).Value(out);
+}
+
+}  // namespace flight
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/flight.h b/pyarrow/src/arrow/python/flight.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a1f4c750aac77573fd7eb84e9d1cc24eae92700
--- /dev/null
+++ b/pyarrow/src/arrow/python/flight.h
@@ -0,0 +1,352 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/flight/api.h"
+#include "arrow/ipc/dictionary.h"
+#include "arrow/python/common.h"
+
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_PYTHON_STATIC
+#    define ARROW_PYFLIGHT_EXPORT
+#  elif defined(ARROW_PYFLIGHT_EXPORTING)
+#    define ARROW_PYFLIGHT_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_PYFLIGHT_EXPORT __declspec(dllimport)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_PYFLIGHT_EXPORT
+#    define ARROW_PYFLIGHT_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif  // Non-Windows
+
+namespace arrow {
+
+namespace py {
+
+namespace flight {
+
+ARROW_PYFLIGHT_EXPORT
+extern const char* kPyServerMiddlewareName;
+
+/// \brief A table of function pointers for calling from C++ into
+/// Python.
+class ARROW_PYFLIGHT_EXPORT PyFlightServerVtable {
+ public:
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::Criteria*,
+                       std::unique_ptr<arrow::flight::FlightListing>*)>
+      list_flights;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::FlightDescriptor&,
+                       std::unique_ptr<arrow::flight::FlightInfo>*)>
+      get_flight_info;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::FlightDescriptor&,
+                       std::unique_ptr<arrow::flight::SchemaResult>*)>
+      get_schema;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::Ticket&,
+                       std::unique_ptr<arrow::flight::FlightDataStream>*)>
+      do_get;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       std::unique_ptr<arrow::flight::FlightMessageReader>,
+                       std::unique_ptr<arrow::flight::FlightMetadataWriter>)>
+      do_put;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       std::unique_ptr<arrow::flight::FlightMessageReader>,
+                       std::unique_ptr<arrow::flight::FlightMessageWriter>)>
+      do_exchange;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       const arrow::flight::Action&,
+                       std::unique_ptr<arrow::flight::ResultStream>*)>
+      do_action;
+  std::function<Status(PyObject*, const arrow::flight::ServerCallContext&,
+                       std::vector<arrow::flight::ActionType>*)>
+      list_actions;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyServerAuthHandlerVtable {
+ public:
+  std::function<Status(PyObject*, arrow::flight::ServerAuthSender*,
+                       arrow::flight::ServerAuthReader*)>
+      authenticate;
+  std::function<Status(PyObject*, const std::string&, std::string*)> is_valid;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyClientAuthHandlerVtable {
+ public:
+  std::function<Status(PyObject*, arrow::flight::ClientAuthSender*,
+                       arrow::flight::ClientAuthReader*)>
+      authenticate;
+  std::function<Status(PyObject*, std::string*)> get_token;
+};
+
+/// \brief A helper to implement an auth mechanism in Python.
+class ARROW_PYFLIGHT_EXPORT PyServerAuthHandler
+    : public arrow::flight::ServerAuthHandler {
+ public:
+  explicit PyServerAuthHandler(PyObject* handler,
+                               const PyServerAuthHandlerVtable& vtable);
+  Status Authenticate(const arrow::flight::ServerCallContext& context,
+                      arrow::flight::ServerAuthSender* outgoing,
+                      arrow::flight::ServerAuthReader* incoming) override;
+  Status IsValid(const std::string& token, std::string* peer_identity) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyServerAuthHandlerVtable vtable_;
+};
+
+/// \brief A helper to implement an auth mechanism in Python.
+class ARROW_PYFLIGHT_EXPORT PyClientAuthHandler
+    : public arrow::flight::ClientAuthHandler {
+ public:
+  explicit PyClientAuthHandler(PyObject* handler,
+                               const PyClientAuthHandlerVtable& vtable);
+  Status Authenticate(arrow::flight::ClientAuthSender* outgoing,
+                      arrow::flight::ClientAuthReader* incoming) override;
+  Status GetToken(std::string* token) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyClientAuthHandlerVtable vtable_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyFlightServer : public arrow::flight::FlightServerBase {
+ public:
+  explicit PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable);
+
+  // Like Serve(), but set up signals and invoke Python signal handlers
+  // if necessary.  This function may return with a Python exception set.
+  Status ServeWithSignals();
+
+  Status ListFlights(const arrow::flight::ServerCallContext& context,
+                     const arrow::flight::Criteria* criteria,
+                     std::unique_ptr<arrow::flight::FlightListing>* listings) override;
+  Status GetFlightInfo(const arrow::flight::ServerCallContext& context,
+                       const arrow::flight::FlightDescriptor& request,
+                       std::unique_ptr<arrow::flight::FlightInfo>* info) override;
+  Status GetSchema(const arrow::flight::ServerCallContext& context,
+                   const arrow::flight::FlightDescriptor& request,
+                   std::unique_ptr<arrow::flight::SchemaResult>* result) override;
+  Status DoGet(const arrow::flight::ServerCallContext& context,
+               const arrow::flight::Ticket& request,
+               std::unique_ptr<arrow::flight::FlightDataStream>* stream) override;
+  Status DoPut(const arrow::flight::ServerCallContext& context,
+               std::unique_ptr<arrow::flight::FlightMessageReader> reader,
+               std::unique_ptr<arrow::flight::FlightMetadataWriter> writer) override;
+  Status DoExchange(const arrow::flight::ServerCallContext& context,
+                    std::unique_ptr<arrow::flight::FlightMessageReader> reader,
+                    std::unique_ptr<arrow::flight::FlightMessageWriter> writer) override;
+  Status DoAction(const arrow::flight::ServerCallContext& context,
+                  const arrow::flight::Action& action,
+                  std::unique_ptr<arrow::flight::ResultStream>* result) override;
+  Status ListActions(const arrow::flight::ServerCallContext& context,
+                     std::vector<arrow::flight::ActionType>* actions) override;
+
+ private:
+  OwnedRefNoGIL server_;
+  PyFlightServerVtable vtable_;
+};
+
+/// \brief A callback that obtains the next result from a Flight action.
+typedef std::function<Status(PyObject*, std::unique_ptr<arrow::flight::Result>*)>
+    PyFlightResultStreamCallback;
+
+/// \brief A ResultStream built around a Python callback.
+class ARROW_PYFLIGHT_EXPORT PyFlightResultStream : public arrow::flight::ResultStream {
+ public:
+  /// \brief Construct a FlightResultStream from a Python object and callback.
+  /// Must only be called while holding the GIL.
+  explicit PyFlightResultStream(PyObject* generator,
+                                PyFlightResultStreamCallback callback);
+  arrow::Result<std::unique_ptr<arrow::flight::Result>> Next() override;
+
+ private:
+  OwnedRefNoGIL generator_;
+  PyFlightResultStreamCallback callback_;
+};
+
+/// \brief A wrapper around a FlightDataStream that keeps alive a
+/// Python object backing it.
+class ARROW_PYFLIGHT_EXPORT PyFlightDataStream : public arrow::flight::FlightDataStream {
+ public:
+  /// \brief Construct a FlightDataStream from a Python object and underlying stream.
+  /// Must only be called while holding the GIL.
+  explicit PyFlightDataStream(PyObject* data_source,
+                              std::unique_ptr<arrow::flight::FlightDataStream> stream);
+
+  std::shared_ptr<Schema> schema() override;
+  arrow::Result<arrow::flight::FlightPayload> GetSchemaPayload() override;
+  arrow::Result<arrow::flight::FlightPayload> Next() override;
+
+ private:
+  OwnedRefNoGIL data_source_;
+  std::unique_ptr<arrow::flight::FlightDataStream> stream_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyServerMiddlewareFactory
+    : public arrow::flight::ServerMiddlewareFactory {
+ public:
+  /// \brief A callback to create the middleware instance in Python
+  typedef std::function<Status(
+      PyObject*, const arrow::flight::CallInfo& info,
+      const arrow::flight::CallHeaders& incoming_headers,
+      std::shared_ptr<arrow::flight::ServerMiddleware>* middleware)>
+      StartCallCallback;
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyServerMiddlewareFactory(PyObject* factory, StartCallCallback start_call);
+
+  Status StartCall(const arrow::flight::CallInfo& info,
+                   const arrow::flight::ServerCallContext& context,
+                   std::shared_ptr<arrow::flight::ServerMiddleware>* middleware) override;
+
+ private:
+  OwnedRefNoGIL factory_;
+  StartCallCallback start_call_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyServerMiddleware : public arrow::flight::ServerMiddleware {
+ public:
+  typedef std::function<Status(PyObject*,
+                               arrow::flight::AddCallHeaders* outgoing_headers)>
+      SendingHeadersCallback;
+  typedef std::function<Status(PyObject*, const Status& status)> CallCompletedCallback;
+
+  struct Vtable {
+    SendingHeadersCallback sending_headers;
+    CallCompletedCallback call_completed;
+  };
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyServerMiddleware(PyObject* middleware, Vtable vtable);
+
+  void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override;
+  void CallCompleted(const Status& status) override;
+  std::string name() const override;
+  /// \brief Get the underlying Python object.
+  PyObject* py_object() const;
+
+ private:
+  OwnedRefNoGIL middleware_;
+  Vtable vtable_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyClientMiddlewareFactory
+    : public arrow::flight::ClientMiddlewareFactory {
+ public:
+  /// \brief A callback to create the middleware instance in Python
+  typedef std::function<Status(
+      PyObject*, const arrow::flight::CallInfo& info,
+      std::unique_ptr<arrow::flight::ClientMiddleware>* middleware)>
+      StartCallCallback;
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyClientMiddlewareFactory(PyObject* factory, StartCallCallback start_call);
+
+  void StartCall(const arrow::flight::CallInfo& info,
+                 std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) override;
+
+ private:
+  OwnedRefNoGIL factory_;
+  StartCallCallback start_call_;
+};
+
+class ARROW_PYFLIGHT_EXPORT PyClientMiddleware : public arrow::flight::ClientMiddleware {
+ public:
+  typedef std::function<Status(PyObject*,
+                               arrow::flight::AddCallHeaders* outgoing_headers)>
+      SendingHeadersCallback;
+  typedef std::function<Status(PyObject*,
+                               const arrow::flight::CallHeaders& incoming_headers)>
+      ReceivedHeadersCallback;
+  typedef std::function<Status(PyObject*, const Status& status)> CallCompletedCallback;
+
+  struct Vtable {
+    SendingHeadersCallback sending_headers;
+    ReceivedHeadersCallback received_headers;
+    CallCompletedCallback call_completed;
+  };
+
+  /// \brief Must only be called while holding the GIL.
+  explicit PyClientMiddleware(PyObject* factory, Vtable vtable);
+
+  void SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) override;
+  void ReceivedHeaders(const arrow::flight::CallHeaders& incoming_headers) override;
+  void CallCompleted(const Status& status) override;
+
+ private:
+  OwnedRefNoGIL middleware_;
+  Vtable vtable_;
+};
+
+/// \brief A callback that obtains the next payload from a Flight result stream.
+typedef std::function<Status(PyObject*, arrow::flight::FlightPayload*)>
+    PyGeneratorFlightDataStreamCallback;
+
+/// \brief A FlightDataStream built around a Python callback.
+class ARROW_PYFLIGHT_EXPORT PyGeneratorFlightDataStream
+    : public arrow::flight::FlightDataStream {
+ public:
+  /// \brief Construct a FlightDataStream from a Python object and underlying stream.
+  /// Must only be called while holding the GIL.
+  explicit PyGeneratorFlightDataStream(PyObject* generator,
+                                       std::shared_ptr<arrow::Schema> schema,
+                                       PyGeneratorFlightDataStreamCallback callback,
+                                       const ipc::IpcWriteOptions& options);
+  std::shared_ptr<Schema> schema() override;
+  arrow::Result<arrow::flight::FlightPayload> GetSchemaPayload() override;
+  arrow::Result<arrow::flight::FlightPayload> Next() override;
+
+ private:
+  OwnedRefNoGIL generator_;
+  std::shared_ptr<arrow::Schema> schema_;
+  ipc::DictionaryFieldMapper mapper_;
+  ipc::IpcWriteOptions options_;
+  PyGeneratorFlightDataStreamCallback callback_;
+};
+
+ARROW_PYFLIGHT_EXPORT
+Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
+                        const arrow::flight::FlightDescriptor& descriptor,
+                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
+                        int64_t total_records, int64_t total_bytes, bool ordered,
+                        const std::string& app_metadata,
+                        std::unique_ptr<arrow::flight::FlightInfo>* out);
+
+/// \brief Create a SchemaResult from schema.
+ARROW_PYFLIGHT_EXPORT
+Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
+                          std::unique_ptr<arrow::flight::SchemaResult>* out);
+
+}  // namespace flight
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/gdb.cc b/pyarrow/src/arrow/python/gdb.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a7d2eda4bf2301eed6ee66dc5a09b44de416f04
--- /dev/null
+++ b/pyarrow/src/arrow/python/gdb.cc
@@ -0,0 +1,505 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdlib>
+#include <memory>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/chunked_array.h"
+#include "arrow/datum.h"
+#include "arrow/extension/uuid.h"
+#include "arrow/json/from_string.h"
+#include "arrow/python/gdb.h"
+#include "arrow/record_batch.h"
+#include "arrow/scalar.h"
+#include "arrow/table.h"
+#include "arrow/type.h"
+#include "arrow/util/debug.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/key_value_metadata.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+using extension::uuid;
+using extension::UuidType;
+using json::ArrayFromJSONString;
+using json::ChunkedArrayFromJSONString;
+using json::ScalarFromJSONString;
+
+namespace gdb {
+
+// Add a nested `arrow` namespace to exercise type lookup from GDB (ARROW-15652)
+namespace arrow {
+void DummyFunction() {}
+}  // namespace arrow
+
+namespace {
+
+class CustomStatusDetail : public StatusDetail {
+ public:
+  const char* type_id() const override { return "custom-detail-id"; }
+  std::string ToString() const override { return "This is a detail"; }
+};
+
+std::shared_ptr<Array> SliceArrayFromJSON(const std::shared_ptr<DataType>& ty,
+                                          std::string_view json, int64_t offset = 0,
+                                          int64_t length = -1) {
+  auto array = *ArrayFromJSONString(ty, json);
+  if (length != -1) {
+    return array->Slice(offset, length);
+  } else {
+    return array->Slice(offset);
+  }
+}
+
+}  // namespace
+
+void TestSession() {
+  // We define local variables for all types for which we want to test
+  // pretty-printing.
+  // Then, at the end of this function, we trap to the debugger, so that
+  // test instrumentation can print values from this frame by interacting
+  // with the debugger.
+  // The test instrumentation is in pyarrow/tests/test_gdb.py
+
+#ifdef __clang__
+  _Pragma("clang diagnostic push");
+  _Pragma("clang diagnostic ignored \"-Wunused-variable\"");
+#elif defined(__GNUC__)
+  _Pragma("GCC diagnostic push");
+  _Pragma("GCC diagnostic ignored \"-Wunused-variable\"");
+#endif
+
+  arrow::DummyFunction();
+
+  // Status & Result
+  auto ok_status = Status::OK();
+  auto error_status = Status::IOError("This is an error");
+  auto error_detail_status =
+      error_status.WithDetail(std::make_shared<CustomStatusDetail>());
+  auto ok_result = Result<int>(42);
+  auto error_result = Result<int>(error_status);
+  auto error_detail_result = Result<int>(error_detail_status);
+
+  // String views
+  std::string_view string_view_abc{"abc"};
+  std::string special_chars = std::string("foo\"bar") + '\x00' + "\r\n\t\x1f";
+  std::string_view string_view_special_chars(special_chars);
+
+  // Buffers
+  Buffer buffer_null{nullptr, 0};
+  Buffer buffer_abc{string_view_abc};
+  Buffer buffer_special_chars{string_view_special_chars};
+  char mutable_array[3] = {'a', 'b', 'c'};
+  MutableBuffer buffer_mutable{reinterpret_cast<uint8_t*>(mutable_array), 3};
+  auto heap_buffer = std::make_shared<Buffer>(string_view_abc);
+  auto heap_buffer_mutable = *AllocateBuffer(buffer_abc.size());
+  memcpy(heap_buffer_mutable->mutable_data(), buffer_abc.data(), buffer_abc.size());
+
+  // KeyValueMetadata
+  auto empty_metadata = key_value_metadata({}, {});
+  auto metadata = key_value_metadata(
+      {"key_text", "key_binary"}, {"some value", std::string("z") + '\x00' + "\x1f\xff"});
+
+  // Decimals
+  Decimal128 decimal128_zero{};
+  Decimal128 decimal128_pos{"98765432109876543210987654321098765432"};
+  Decimal128 decimal128_neg{"-98765432109876543210987654321098765432"};
+  BasicDecimal128 basic_decimal128_zero{};
+  BasicDecimal128 basic_decimal128_pos{decimal128_pos.native_endian_array()};
+  BasicDecimal128 basic_decimal128_neg{decimal128_neg.native_endian_array()};
+  Decimal256 decimal256_zero{};
+  Decimal256 decimal256_pos{
+      "9876543210987654321098765432109876543210987654321098765432109876543210987654"};
+  Decimal256 decimal256_neg{
+      "-9876543210987654321098765432109876543210987654321098765432109876543210987654"};
+  BasicDecimal256 basic_decimal256_zero{};
+  BasicDecimal256 basic_decimal256_pos{decimal256_pos.native_endian_array()};
+  BasicDecimal256 basic_decimal256_neg{decimal256_neg.native_endian_array()};
+
+  // Data types
+  NullType null_type;
+  auto heap_null_type = null();
+  BooleanType bool_type;
+  auto heap_bool_type = boolean();
+
+  Date32Type date32_type;
+  Date64Type date64_type;
+  Time32Type time_type_s(TimeUnit::SECOND);
+  Time32Type time_type_ms(TimeUnit::MILLI);
+  Time64Type time_type_us(TimeUnit::MICRO);
+  Time64Type time_type_ns(TimeUnit::NANO);
+  auto heap_time_type_ns = time64(TimeUnit::NANO);
+
+  TimestampType timestamp_type_s(TimeUnit::SECOND);
+  TimestampType timestamp_type_ms_timezone(TimeUnit::MILLI, "Europe/Paris");
+  TimestampType timestamp_type_us(TimeUnit::MICRO);
+  TimestampType timestamp_type_ns_timezone(TimeUnit::NANO, "Europe/Paris");
+  auto heap_timestamp_type_ns_timezone = timestamp(TimeUnit::NANO, "Europe/Paris");
+
+  DayTimeIntervalType day_time_interval_type;
+  MonthIntervalType month_interval_type;
+  MonthDayNanoIntervalType month_day_nano_interval_type;
+
+  DurationType duration_type_s(TimeUnit::SECOND);
+  DurationType duration_type_ns(TimeUnit::NANO);
+
+  BinaryType binary_type;
+  StringType string_type;
+  LargeBinaryType large_binary_type;
+  LargeStringType large_string_type;
+  FixedSizeBinaryType fixed_size_binary_type(10);
+  auto heap_fixed_size_binary_type = fixed_size_binary(10);
+
+  Decimal128Type decimal128_type(16, 5);
+  Decimal256Type decimal256_type(42, 12);
+  auto heap_decimal128_type = decimal128(16, 5);
+
+  ListType list_type(uint8());
+  LargeListType large_list_type(large_utf8());
+  auto heap_list_type = list(uint8());
+  auto heap_large_list_type = large_list(large_utf8());
+
+  FixedSizeListType fixed_size_list_type(float64(), 3);
+  auto heap_fixed_size_list_type = fixed_size_list(float64(), 3);
+
+  DictionaryType dict_type_unordered(int16(), utf8());
+  DictionaryType dict_type_ordered(int16(), utf8(), /*ordered=*/true);
+  auto heap_dict_type = dictionary(int16(), utf8());
+
+  MapType map_type_unsorted(utf8(), binary());
+  MapType map_type_sorted(utf8(), binary(), /*keys_sorted=*/true);
+  auto heap_map_type = map(utf8(), binary());
+
+  StructType struct_type_empty({});
+  StructType struct_type(
+      {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)});
+  auto heap_struct_type =
+      struct_({field("ints", int8()), field("strs", utf8(), /*nullable=*/false)});
+
+  std::vector<int8_t> union_type_codes({7, 42});
+  FieldVector union_fields(
+      {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)});
+  SparseUnionType sparse_union_type(union_fields, union_type_codes);
+  DenseUnionType dense_union_type(union_fields, union_type_codes);
+
+  UuidType uuid_type{};
+  std::shared_ptr<DataType> heap_uuid_type = std::make_shared<UuidType>();
+
+  // Schema
+  auto schema_empty = schema({});
+  auto schema_non_empty = schema({field("ints", int8()), field("strs", utf8())});
+  auto schema_with_metadata = schema_non_empty->WithMetadata(
+      key_value_metadata({"key1", "key2"}, {"value1", "value2"}));
+
+  // Fields
+  Field int_field("ints", int64());
+  Field float_field("floats", float32(), /*nullable=*/false);
+  auto heap_int_field = field("ints", int64());
+
+  // Scalars
+  NullScalar null_scalar;
+  auto heap_null_scalar = MakeNullScalar(null());
+
+  BooleanScalar bool_scalar_null{};
+  BooleanScalar bool_scalar{true};
+  auto heap_bool_scalar = *MakeScalar(boolean(), true);
+
+  Int8Scalar int8_scalar_null{};
+  UInt8Scalar uint8_scalar_null{};
+  Int64Scalar int64_scalar_null{};
+  UInt64Scalar uint64_scalar_null{};
+  Int8Scalar int8_scalar{-42};
+  UInt8Scalar uint8_scalar{234};
+  Int64Scalar int64_scalar{-9223372036854775807LL - 1};
+  UInt64Scalar uint64_scalar{18446744073709551615ULL};
+  HalfFloatScalar half_float_scalar{48640};  // -1.5
+  FloatScalar float_scalar{1.25f};
+  DoubleScalar double_scalar{2.5};
+
+  Time32Scalar time_scalar_s{100, TimeUnit::SECOND};
+  Time32Scalar time_scalar_ms{1000, TimeUnit::MILLI};
+  Time64Scalar time_scalar_us{10000, TimeUnit::MICRO};
+  Time64Scalar time_scalar_ns{100000, TimeUnit::NANO};
+  Time64Scalar time_scalar_null{time64(TimeUnit::NANO)};
+
+  DurationScalar duration_scalar_s{-100, TimeUnit::SECOND};
+  DurationScalar duration_scalar_ms{-1000, TimeUnit::MILLI};
+  DurationScalar duration_scalar_us{-10000, TimeUnit::MICRO};
+  DurationScalar duration_scalar_ns{-100000, TimeUnit::NANO};
+  DurationScalar duration_scalar_null{duration(TimeUnit::NANO)};
+
+  TimestampScalar timestamp_scalar_s{12345, timestamp(TimeUnit::SECOND)};
+  TimestampScalar timestamp_scalar_ms{-123456, timestamp(TimeUnit::MILLI)};
+  TimestampScalar timestamp_scalar_us{1234567, timestamp(TimeUnit::MICRO)};
+  TimestampScalar timestamp_scalar_ns{-12345678, timestamp(TimeUnit::NANO)};
+  TimestampScalar timestamp_scalar_null{timestamp(TimeUnit::NANO)};
+
+  TimestampScalar timestamp_scalar_s_tz{12345,
+                                        timestamp(TimeUnit::SECOND, "Europe/Paris")};
+  TimestampScalar timestamp_scalar_ms_tz{-123456,
+                                         timestamp(TimeUnit::MILLI, "Europe/Paris")};
+  TimestampScalar timestamp_scalar_us_tz{1234567,
+                                         timestamp(TimeUnit::MICRO, "Europe/Paris")};
+  TimestampScalar timestamp_scalar_ns_tz{-12345678,
+                                         timestamp(TimeUnit::NANO, "Europe/Paris")};
+  TimestampScalar timestamp_scalar_null_tz{timestamp(TimeUnit::NANO, "Europe/Paris")};
+
+  MonthIntervalScalar month_interval_scalar{23};
+  MonthIntervalScalar month_interval_scalar_null{};
+  DayTimeIntervalScalar day_time_interval_scalar{{23, -456}};
+  DayTimeIntervalScalar day_time_interval_scalar_null{};
+  MonthDayNanoIntervalScalar month_day_nano_interval_scalar{{1, 23, -456}};
+  MonthDayNanoIntervalScalar month_day_nano_interval_scalar_null{};
+
+  Date32Scalar date32_scalar{23};
+  Date32Scalar date32_scalar_null{};
+  Date64Scalar date64_scalar{45 * 86400000LL};
+  Date64Scalar date64_scalar_null{};
+
+  Decimal128Scalar decimal128_scalar_pos_scale_pos{Decimal128("1234567"),
+                                                   decimal128(10, 4)};
+  Decimal128Scalar decimal128_scalar_pos_scale_neg{Decimal128("-1234567"),
+                                                   decimal128(10, 4)};
+  Decimal128Scalar decimal128_scalar_neg_scale_pos{Decimal128("1234567"),
+                                                   decimal128(10, -4)};
+  Decimal128Scalar decimal128_scalar_neg_scale_neg{Decimal128("-1234567"),
+                                                   decimal128(10, -4)};
+  Decimal128Scalar decimal128_scalar_null{decimal128(10, 4)};
+  auto heap_decimal128_scalar = *MakeScalar(decimal128(10, 4), Decimal128("1234567"));
+
+  Decimal256Scalar decimal256_scalar_pos_scale_pos{
+      Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, 4)};
+  Decimal256Scalar decimal256_scalar_pos_scale_neg{
+      Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, 4)};
+  Decimal256Scalar decimal256_scalar_neg_scale_pos{
+      Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, -4)};
+  Decimal256Scalar decimal256_scalar_neg_scale_neg{
+      Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, -4)};
+  Decimal256Scalar decimal256_scalar_null{decimal256(50, 4)};
+  auto heap_decimal256_scalar = *MakeScalar(
+      decimal256(50, 4), Decimal256("1234567890123456789012345678901234567890123456"));
+
+  BinaryScalar binary_scalar_null{};
+  BinaryScalar binary_scalar_unallocated{std::shared_ptr<Buffer>{nullptr}};
+  BinaryScalar binary_scalar_empty{Buffer::FromString("")};
+  BinaryScalar binary_scalar_abc{Buffer::FromString("abc")};
+  BinaryScalar binary_scalar_bytes{
+      Buffer::FromString(std::string() + '\x00' + "\x1f\xff")};
+
+  StringScalar string_scalar_null{};
+  StringScalar string_scalar_unallocated{std::shared_ptr<Buffer>{nullptr}};
+  StringScalar string_scalar_empty{Buffer::FromString("")};
+  StringScalar string_scalar_hehe{Buffer::FromString("héhé")};
+  StringScalar string_scalar_invalid_chars{
+      Buffer::FromString(std::string("abc") + '\x00' + "def\xffghi")};
+
+  LargeBinaryScalar large_binary_scalar_abc{Buffer::FromString("abc")};
+  LargeStringScalar large_string_scalar_hehe{Buffer::FromString("héhé")};
+
+  FixedSizeBinaryScalar fixed_size_binary_scalar{Buffer::FromString("abc"),
+                                                 fixed_size_binary(3)};
+  FixedSizeBinaryScalar fixed_size_binary_scalar_null{
+      Buffer::FromString("   "), fixed_size_binary(3), /*is_valid=*/false};
+
+  std::shared_ptr<Array> dict_array;
+  dict_array = *ArrayFromJSONString(utf8(), R"(["foo", "bar", "quux"])");
+  DictionaryScalar dict_scalar{{std::make_shared<Int8Scalar>(42), dict_array},
+                               dictionary(int8(), utf8())};
+  DictionaryScalar dict_scalar_null{dictionary(int8(), utf8())};
+
+  std::shared_ptr<Array> list_value_array = *ArrayFromJSONString(int32(), R"([4, 5, 6])");
+  std::shared_ptr<Array> list_zero_length = *ArrayFromJSONString(int32(), R"([])");
+  ListScalar list_scalar{list_value_array};
+  ListScalar list_scalar_null{list_zero_length, list(int32()), /*is_valid=*/false};
+  LargeListScalar large_list_scalar{list_value_array};
+  LargeListScalar large_list_scalar_null{list_zero_length, large_list(int32()),
+                                         /*is_valid=*/false};
+  FixedSizeListScalar fixed_size_list_scalar{list_value_array};
+  FixedSizeListScalar fixed_size_list_scalar_null{
+      list_value_array, fixed_size_list(int32(), 3), /*is_valid=*/false};
+
+  auto struct_scalar_type = struct_({field("ints", int32()), field("strs", utf8())});
+  StructScalar struct_scalar{
+      ScalarVector{MakeScalar(int32_t(42)), MakeScalar("some text")}, struct_scalar_type};
+  StructScalar struct_scalar_null{struct_scalar.value, struct_scalar_type,
+                                  /*is_valid=*/false};
+
+  auto sparse_union_scalar_type =
+      sparse_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42});
+  auto dense_union_scalar_type =
+      dense_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42});
+  std::vector<std::shared_ptr<Scalar>> union_values = {MakeScalar(int32_t(43)),
+                                                       MakeNullScalar(utf8())};
+  SparseUnionScalar sparse_union_scalar{union_values, 7, sparse_union_scalar_type};
+  DenseUnionScalar dense_union_scalar{union_values[0], 7, dense_union_scalar_type};
+
+  union_values[0] = MakeNullScalar(int32());
+  SparseUnionScalar sparse_union_scalar_null{union_values, 7, sparse_union_scalar_type};
+  DenseUnionScalar dense_union_scalar_null{union_values[0], 7, dense_union_scalar_type};
+
+  auto extension_scalar_type = std::make_shared<UuidType>();
+  ExtensionScalar extension_scalar{
+      std::make_shared<FixedSizeBinaryScalar>(Buffer::FromString("0123456789abcdef"),
+                                              extension_scalar_type->storage_type()),
+      extension_scalar_type};
+  ExtensionScalar extension_scalar_null{extension_scalar.value, extension_scalar_type,
+                                        /*is_valid=*/false};
+
+  auto heap_map_scalar =
+      *ScalarFromJSONString(map(utf8(), int32()), R"([["a", 5], ["b", 6]])");
+  auto heap_map_scalar_null = MakeNullScalar(heap_map_scalar->type);
+
+  // Array and ArrayData
+  auto heap_null_array = SliceArrayFromJSON(null(), "[null, null]");
+
+  auto heap_int32_array = SliceArrayFromJSON(int32(), "[-5, 6, null, 42]");
+  ArrayData int32_array_data{*heap_int32_array->data()};
+  Int32Array int32_array{heap_int32_array->data()->Copy()};
+
+  auto heap_int32_array_no_nulls = SliceArrayFromJSON(int32(), "[-5, 6, 3, 42]");
+
+  const char* json_int32_array = "[-1, 2, -3, 4, null, -5, 6, -7, 8, null, -9, -10]";
+  auto heap_int32_array_sliced_1_9 = SliceArrayFromJSON(int32(), json_int32_array, 1, 9);
+  auto heap_int32_array_sliced_2_6 = SliceArrayFromJSON(int32(), json_int32_array, 2, 6);
+  auto heap_int32_array_sliced_8_4 = SliceArrayFromJSON(int32(), json_int32_array, 8, 4);
+  auto heap_int32_array_sliced_empty =
+      SliceArrayFromJSON(int32(), json_int32_array, 6, 0);
+
+  const char* json_bool_array =
+      "[false, false, true, true, null, null, false, false, true, true, "
+      "null, null, false, false, true, true, null, null]";
+  auto heap_bool_array = SliceArrayFromJSON(boolean(), json_bool_array);
+  auto heap_bool_array_sliced_1_9 = SliceArrayFromJSON(boolean(), json_bool_array, 1, 9);
+  auto heap_bool_array_sliced_2_6 = SliceArrayFromJSON(boolean(), json_bool_array, 2, 6);
+  auto heap_bool_array_sliced_empty =
+      SliceArrayFromJSON(boolean(), json_bool_array, 6, 0);
+
+  auto heap_list_array = SliceArrayFromJSON(list(int64()), "[[1, 2], null, []]");
+  ListArray list_array{heap_list_array->data()};
+
+  const char* json_double_array = "[-1.5, null]";
+  auto heap_double_array = SliceArrayFromJSON(float64(), json_double_array);
+
+  const char* json_float16_array = "[0, 48640]";
+  auto heap_float16_array =
+      *SliceArrayFromJSON(uint16(), json_float16_array)->View(float16());
+
+  auto heap_date32_array =
+      SliceArrayFromJSON(date32(), "[0, null, 18336, -9004, -719162, -719163]");
+  auto heap_date64_array = SliceArrayFromJSON(
+      date64(), "[1584230400000, -777945600000, -62135596800000, -62135683200000, 123]");
+
+  const char* json_time_array = "[null, -123, 456]";
+  auto heap_time32_array_s =
+      SliceArrayFromJSON(time32(TimeUnit::SECOND), json_time_array);
+  auto heap_time32_array_ms =
+      SliceArrayFromJSON(time32(TimeUnit::MILLI), json_time_array);
+  auto heap_time64_array_us =
+      SliceArrayFromJSON(time64(TimeUnit::MICRO), json_time_array);
+  auto heap_time64_array_ns = SliceArrayFromJSON(time64(TimeUnit::NANO), json_time_array);
+
+  auto heap_month_interval_array =
+      SliceArrayFromJSON(month_interval(), "[123, -456, null]");
+  auto heap_day_time_interval_array =
+      SliceArrayFromJSON(day_time_interval(), "[[1, -600], null]");
+  auto heap_month_day_nano_interval_array =
+      SliceArrayFromJSON(month_day_nano_interval(), "[[1, -600, 5000], null]");
+
+  const char* json_duration_array = "[null, -1234567890123456789]";
+  auto heap_duration_array_s =
+      SliceArrayFromJSON(duration(TimeUnit::SECOND), json_duration_array);
+  auto heap_duration_array_ns =
+      SliceArrayFromJSON(duration(TimeUnit::NANO), json_duration_array);
+
+  auto heap_timestamp_array_s = SliceArrayFromJSON(
+      timestamp(TimeUnit::SECOND),
+      R"([null, "1970-01-01 00:00:00", "1900-02-28 12:34:56", "3989-07-14 00:00:00"])");
+  auto heap_timestamp_array_ms = SliceArrayFromJSON(
+      timestamp(TimeUnit::MILLI),
+      R"([null, "1900-02-28 12:34:56.123", "3989-07-14 00:00:00.789"])");
+  auto heap_timestamp_array_us = SliceArrayFromJSON(
+      timestamp(TimeUnit::MICRO),
+      R"([null, "1900-02-28 12:34:56.654321", "3989-07-14 00:00:00.456789"])");
+  auto heap_timestamp_array_ns = SliceArrayFromJSON(
+      timestamp(TimeUnit::NANO), R"([null, "1900-02-28 12:34:56.987654321"])");
+
+  auto heap_decimal128_array = SliceArrayFromJSON(
+      decimal128(30, 6),
+      R"([null, "-1234567890123456789.012345", "1234567890123456789.012345"])");
+  auto heap_decimal256_array = SliceArrayFromJSON(
+      decimal256(50, 6), R"([null, "-123456789012345678901234567890123456789.012345"])");
+  auto heap_decimal128_array_sliced = heap_decimal128_array->Slice(1, 1);
+
+  auto heap_fixed_size_binary_array =
+      SliceArrayFromJSON(fixed_size_binary(3), "[null, \"abc\", \"\\u0000\\u001f\xff\"]");
+  auto heap_fixed_size_binary_array_zero_width =
+      SliceArrayFromJSON(fixed_size_binary(0), R"([null, ""])");
+  auto heap_fixed_size_binary_array_sliced = heap_fixed_size_binary_array->Slice(1, 1);
+
+  const char* json_binary_array = "[null, \"abcd\", \"\\u0000\\u001f\xff\"]";
+  auto heap_binary_array = SliceArrayFromJSON(binary(), json_binary_array);
+  auto heap_large_binary_array = SliceArrayFromJSON(large_binary(), json_binary_array);
+  const char* json_string_array = "[null, \"héhé\", \"invalid \xff char\"]";
+  auto heap_string_array = SliceArrayFromJSON(utf8(), json_string_array);
+  auto heap_large_string_array = SliceArrayFromJSON(large_utf8(), json_string_array);
+  auto heap_binary_array_sliced = heap_binary_array->Slice(1, 1);
+
+  // ChunkedArray
+  ArrayVector array_chunks(2);
+  array_chunks[0] = *ArrayFromJSONString(int32(), "[1, 2]");
+  array_chunks[1] = *ArrayFromJSONString(int32(), "[3, null, 4]");
+  ChunkedArray chunked_array{array_chunks};
+
+  // RecordBatch
+  auto batch_schema = schema({field("ints", int32()), field("strs", utf8())});
+  ArrayVector batch_columns{2};
+  batch_columns[0] = *ArrayFromJSONString(int32(), "[1, 2, 3]");
+  batch_columns[1] = *ArrayFromJSONString(utf8(), R"(["abc", null, "def"])");
+  auto batch = RecordBatch::Make(batch_schema, /*num_rows=*/3, batch_columns);
+  auto batch_with_metadata = batch->ReplaceSchemaMetadata(
+      key_value_metadata({"key1", "key2", "key3"}, {"value1", "value2", "value3"}));
+
+  // Table
+  auto col1 = ChunkedArrayFromJSONString(int32(), {"[1, 2, 3]", "[4, 5]"});
+  auto col2 = ChunkedArrayFromJSONString(
+      utf8(), {R"(["abc", null])", R"(["def"])", R"(["ghi", "jkl"])"});
+  auto table = Table::Make(batch_schema, {*col1, *col2});
+
+  // Datum
+  Datum empty_datum{};
+  Datum scalar_datum{MakeNullScalar(boolean())};
+  Datum array_datum{heap_int32_array};
+  Datum chunked_array_datum{chunked_array};
+  Datum batch_datum{batch};
+  Datum table_datum{table};
+
+#ifdef __clang__
+  _Pragma("clang diagnostic pop");
+#elif defined(__GNUC__)
+  _Pragma("GCC diagnostic pop");
+#endif
+
+  // Hook into debugger
+  ::arrow::internal::DebugTrap();
+}
+
+}  // namespace gdb
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/gdb.h b/pyarrow/src/arrow/python/gdb.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ddcbb51f6e0b70c1b16dc9a9ce6caf79fb2369e
--- /dev/null
+++ b/pyarrow/src/arrow/python/gdb.h
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace gdb {
+
+ARROW_PYTHON_EXPORT
+void TestSession();
+
+}  // namespace gdb
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/helpers.cc b/pyarrow/src/arrow/python/helpers.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a24b259310e585a34f69c00c2b2a5609894ee73
--- /dev/null
+++ b/pyarrow/src/arrow/python/helpers.cc
@@ -0,0 +1,504 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// helpers.h includes a NumPy header, so we include this first
+#include "arrow/python/numpy_init.h"
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/python/helpers.h"
+
+#include <cmath>
+#include <limits>
+#include <mutex>
+#include <sstream>
+#include <type_traits>
+
+#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/config.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace py {
+
+#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
+  case Type::NAME:                        \
+    return FACTORY()
+
+std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
+  switch (type) {
+    case Type::NA:
+      return null();
+      GET_PRIMITIVE_TYPE(UINT8, uint8);
+      GET_PRIMITIVE_TYPE(INT8, int8);
+      GET_PRIMITIVE_TYPE(UINT16, uint16);
+      GET_PRIMITIVE_TYPE(INT16, int16);
+      GET_PRIMITIVE_TYPE(UINT32, uint32);
+      GET_PRIMITIVE_TYPE(INT32, int32);
+      GET_PRIMITIVE_TYPE(UINT64, uint64);
+      GET_PRIMITIVE_TYPE(INT64, int64);
+      GET_PRIMITIVE_TYPE(DATE32, date32);
+      GET_PRIMITIVE_TYPE(DATE64, date64);
+      GET_PRIMITIVE_TYPE(BOOL, boolean);
+      GET_PRIMITIVE_TYPE(HALF_FLOAT, float16);
+      GET_PRIMITIVE_TYPE(FLOAT, float32);
+      GET_PRIMITIVE_TYPE(DOUBLE, float64);
+      GET_PRIMITIVE_TYPE(BINARY, binary);
+      GET_PRIMITIVE_TYPE(STRING, utf8);
+      GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
+      GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
+      GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view);
+      GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view);
+      GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
+    default:
+      return nullptr;
+  }
+}
+
+PyObject* PyFloat_FromHalf(uint16_t value) {
+  // Convert the uint16_t Float16 value to a PyFloat object
+  arrow::util::Float16 half_val = arrow::util::Float16::FromBits(value);
+  return PyFloat_FromDouble(half_val.ToDouble());
+}
+
+Result<uint16_t> PyFloat_AsHalf(PyObject* obj) {
+  if (PyFloat_Check(obj)) {
+    arrow::util::Float16 half_val =
+        arrow::util::Float16::FromDouble(PyFloat_AsDouble(obj));
+    return half_val.bits();
+  } else if (has_numpy() && PyArray_IsScalar(obj, Half)) {
+    return PyArrayScalar_VAL(obj, Half);
+  } else {
+    return Status::TypeError("conversion to float16 expects a `float` or ",
+                             "`np.float16` object, got ", Py_TYPE(obj)->tp_name);
+  }
+}
+
+namespace internal {
+
+std::string PyBytes_AsStdString(PyObject* obj) {
+  ARROW_DCHECK(PyBytes_Check(obj));
+  return std::string(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
+}
+
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
+  ARROW_DCHECK(PyUnicode_Check(obj));
+  Py_ssize_t size;
+  // The utf-8 representation is cached on the unicode object
+  const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+  RETURN_IF_PYERROR();
+  *out = std::string(data, size);
+  return Status::OK();
+}
+
+std::string PyObject_StdStringRepr(PyObject* obj) {
+  OwnedRef unicode_ref(PyObject_Repr(obj));
+  OwnedRef bytes_ref;
+
+  if (unicode_ref) {
+    bytes_ref.reset(
+        PyUnicode_AsEncodedString(unicode_ref.obj(), "utf8", "backslashreplace"));
+  }
+  if (!bytes_ref) {
+    PyErr_Clear();
+    std::stringstream ss;
+    ss << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
+    return ss.str();
+  }
+  return PyBytes_AsStdString(bytes_ref.obj());
+}
+
+Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
+  OwnedRef string_ref(PyObject_Str(obj));
+  RETURN_IF_PYERROR();
+  return PyUnicode_AsStdString(string_ref.obj(), out);
+}
+
+Result<bool> IsModuleImported(const std::string& module_name) {
+  // PyImport_GetModuleDict returns with a borrowed reference
+  OwnedRef key(PyUnicode_FromString(module_name.c_str()));
+  auto is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
+  RETURN_IF_PYERROR();
+  return is_imported;
+}
+
+Status ImportModule(const std::string& module_name, OwnedRef* ref) {
+  PyObject* module = PyImport_ImportModule(module_name.c_str());
+  RETURN_IF_PYERROR();
+  ref->reset(module);
+  return Status::OK();
+}
+
+Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref) {
+  PyObject* attr = PyObject_GetAttrString(module, name.c_str());
+  RETURN_IF_PYERROR();
+  ref->reset(attr);
+  return Status::OK();
+}
+
+namespace {
+
+Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) {
+  if (overflow_message.empty()) {
+    std::string obj_as_stdstring;
+    RETURN_NOT_OK(PyObject_StdStringStr(obj, &obj_as_stdstring));
+    return Status::Invalid("Value ", obj_as_stdstring,
+                           " too large to fit in C integer type");
+  } else {
+    return Status::Invalid(overflow_message);
+  }
+}
+
+Result<OwnedRef> PyObjectToPyInt(PyObject* obj) {
+  // Try to call __index__ or __int__ on `obj`
+  // (starting from Python 3.10, the latter isn't done anymore by PyLong_AsLong*).
+  OwnedRef ref(PyNumber_Index(obj));
+  if (ref) {
+    return std::move(ref);
+  }
+  PyErr_Clear();
+  const auto nb = Py_TYPE(obj)->tp_as_number;
+  if (nb && nb->nb_int) {
+    ref.reset(nb->nb_int(obj));
+    if (!ref) {
+      RETURN_IF_PYERROR();
+    }
+    ARROW_DCHECK(ref);
+    return std::move(ref);
+  }
+  return Status::TypeError(
+      "object of type ",
+      PyObject_StdStringRepr(reinterpret_cast<PyObject*>(Py_TYPE(obj))),
+      " cannot be converted to int");
+}
+
+// Extract C signed int from Python object
+template <typename Int, enable_if_t<std::is_signed<Int>::value, Int> = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(long long),  // NOLINT
+                "integer type larger than long long");
+
+  OwnedRef ref;
+  if (!PyLong_Check(obj)) {
+    ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
+    obj = ref.obj();
+  }
+
+  if (sizeof(Int) > sizeof(long)) {  // NOLINT
+    const auto value = PyLong_AsLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  }
+  return Status::OK();
+}
+
+// Extract C unsigned int from Python object
+template <typename Int, enable_if_t<std::is_unsigned<Int>::value, Int> = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(unsigned long long),  // NOLINT
+                "integer type larger than unsigned long long");
+
+  OwnedRef ref;
+  if (!PyLong_Check(obj)) {
+    ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
+    obj = ref.obj();
+  }
+
+  if (sizeof(Int) > sizeof(unsigned long)) {  // NOLINT
+    const auto value = PyLong_AsUnsignedLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsUnsignedLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {
+  if (PyBool_Check(obj)) {
+    return Status::TypeError("Expected integer, got bool");
+  }
+  return CIntFromPythonImpl(obj, out, overflow_message);
+}
+
+template Status CIntFromPython(PyObject*, int8_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, int64_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint8_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint16_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint32_t*, const std::string&);
+template Status CIntFromPython(PyObject*, uint64_t*, const std::string&);
+
+inline bool MayHaveNaN(PyObject* obj) {
+  // Some core types can be very quickly type-checked and do not allow NaN values
+  const int64_t non_nan_tpflags = Py_TPFLAGS_LONG_SUBCLASS | Py_TPFLAGS_LIST_SUBCLASS |
+                                  Py_TPFLAGS_TUPLE_SUBCLASS | Py_TPFLAGS_BYTES_SUBCLASS |
+                                  Py_TPFLAGS_UNICODE_SUBCLASS | Py_TPFLAGS_DICT_SUBCLASS |
+                                  Py_TPFLAGS_BASE_EXC_SUBCLASS | Py_TPFLAGS_TYPE_SUBCLASS;
+  return !PyType_HasFeature(Py_TYPE(obj), non_nan_tpflags);
+}
+
+bool PyFloat_IsNaN(PyObject* obj) {
+  return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj));
+}
+
+namespace {
+
+// This needs a conditional, because using std::once_flag could introduce
+// a deadlock when the GIL is enabled. See
+// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
+// more info.
+#ifdef Py_GIL_DISABLED
+static std::once_flag pandas_static_initialized;
+#else
+static bool pandas_static_initialized = false;
+#endif
+
+// Once initialized, these variables hold borrowed references to Pandas static data.
+// We should not use OwnedRef here because Python destructors would be
+// called on a finalized interpreter.
+static PyObject* pandas_NA = nullptr;
+static PyObject* pandas_NaT = nullptr;
+static PyObject* pandas_Timedelta = nullptr;
+static PyObject* pandas_Timestamp = nullptr;
+static PyTypeObject* pandas_NaTType = nullptr;
+static PyObject* pandas_DateOffset = nullptr;
+
+void GetPandasStaticSymbols() {
+  OwnedRef pandas;
+
+  // Import pandas
+  Status s = ImportModule("pandas", &pandas);
+  if (!s.ok()) {
+    return;
+  }
+
+#ifndef Py_GIL_DISABLED
+  // Since ImportModule can release the GIL, another thread could have
+  // already initialized the static data.
+  if (pandas_static_initialized) {
+    return;
+  }
+#endif
+
+  OwnedRef ref;
+
+  // set NaT sentinel and its type
+  if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
+    pandas_NaT = ref.obj();
+    // PyObject_Type returns a new reference but we trust that pandas.NaT will
+    // outlive our use of this PyObject*
+    pandas_NaTType = Py_TYPE(ref.obj());
+  }
+
+  // retain a reference to Timedelta
+  if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
+    pandas_Timedelta = ref.obj();
+  }
+
+  // retain a reference to Timestamp
+  if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
+    pandas_Timestamp = ref.obj();
+  }
+
+  // if pandas.NA exists, retain a reference to it
+  if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
+    pandas_NA = ref.obj();
+  }
+
+  // Import DateOffset type
+  if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
+    pandas_DateOffset = ref.obj();
+  }
+}
+
+}  // namespace
+
+#ifdef Py_GIL_DISABLED
+void InitPandasStaticData() {
+  std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
+}
+#else
+void InitPandasStaticData() {
+  // NOTE: This is called with the GIL held.  We needn't (and shouldn't,
+  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+  if (pandas_static_initialized) {
+    return;
+  }
+  GetPandasStaticSymbols();
+  pandas_static_initialized = true;
+}
+#endif
+
+bool PandasObjectIsNull(PyObject* obj) {
+  if (!MayHaveNaN(obj)) {
+    return false;
+  }
+  if (obj == Py_None) {
+    return true;
+  }
+  if (PyFloat_IsNaN(obj) || (pandas_NA && obj == pandas_NA) ||
+      (pandas_NaTType && PyObject_TypeCheck(obj, pandas_NaTType)) ||
+      (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj))) {
+    return true;
+  }
+  return false;
+}
+
+bool IsPandasTimedelta(PyObject* obj) {
+  return pandas_Timedelta && PyObject_IsInstance(obj, pandas_Timedelta);
+}
+
+bool IsPandasTimestamp(PyObject* obj) {
+  return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp);
+}
+
+PyObject* BorrowPandasDataOffsetType() { return pandas_DateOffset; }
+
+Status InvalidValue(PyObject* obj, const std::string& why) {
+  auto obj_as_str = PyObject_StdStringRepr(obj);
+  return Status::Invalid("Could not convert ", std::move(obj_as_str), " with type ",
+                         Py_TYPE(obj)->tp_name, ": ", why);
+}
+
+Status InvalidType(PyObject* obj, const std::string& why) {
+  auto obj_as_str = PyObject_StdStringRepr(obj);
+  return Status::TypeError("Could not convert ", std::move(obj_as_str), " with type ",
+                           Py_TYPE(obj)->tp_name, ": ", why);
+}
+
+Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) {
+  if (PyLong_Check(obj)) {
+    int overflow = 0;
+    *out = PyLong_AsLongLongAndOverflow(obj, &overflow);
+    if (overflow) {
+      return Status::Invalid("PyLong is too large to fit int64");
+    }
+  } else if (PyArray_IsScalar(obj, Byte)) {
+    *out = reinterpret_cast<PyByteScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UByte)) {
+    *out = reinterpret_cast<PyUByteScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Short)) {
+    *out = reinterpret_cast<PyShortScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UShort)) {
+    *out = reinterpret_cast<PyUShortScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Int)) {
+    *out = reinterpret_cast<PyIntScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UInt)) {
+    *out = reinterpret_cast<PyUIntScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Long)) {
+    *out = reinterpret_cast<PyLongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, ULong)) {
+    *out = reinterpret_cast<PyULongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, LongLong)) {
+    *out = reinterpret_cast<PyLongLongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Int64)) {
+    *out = reinterpret_cast<PyInt64ScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, ULongLong)) {
+    *out = reinterpret_cast<PyULongLongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UInt64)) {
+    *out = reinterpret_cast<PyUInt64ScalarObject*>(obj)->obval;
+  } else {
+    return Status::Invalid("Integer scalar type not recognized");
+  }
+  return Status::OK();
+}
+
+Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) {
+  int64_t value = 0;
+  RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value));
+
+  constexpr int64_t kDoubleMax = 1LL << 53;
+  constexpr int64_t kDoubleMin = -(1LL << 53);
+
+  if (value < kDoubleMin || value > kDoubleMax) {
+    return Status::Invalid("Integer value ", value, " is outside of the range exactly",
+                           " representable by a IEEE 754 double precision value");
+  }
+  *out = static_cast<double>(value);
+  return Status::OK();
+}
+
+Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) {
+  int64_t value = 0;
+  RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value));
+
+  constexpr int64_t kFloatMax = 1LL << 24;
+  constexpr int64_t kFloatMin = -(1LL << 24);
+
+  if (value < kFloatMin || value > kFloatMax) {
+    return Status::Invalid("Integer value ", value, " is outside of the range exactly",
+                           " representable by a IEEE 754 single precision value");
+  }
+  *out = static_cast<float>(value);
+  return Status::OK();
+}
+
+void DebugPrint(PyObject* obj) {
+  std::string repr = PyObject_StdStringRepr(obj);
+  PySys_WriteStderr("%s\n", repr.c_str());
+}
+
+bool IsThreadingEnabled() {
+#ifdef ARROW_ENABLE_THREADING
+  return true;
+#else
+  return false;
+#endif
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/helpers.h b/pyarrow/src/arrow/python/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0cf1010289ead191c735ad48c999f1a850953b4
--- /dev/null
+++ b/pyarrow/src/arrow/python/helpers.h
@@ -0,0 +1,160 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+namespace py {
+
+class OwnedRef;
+
+// \brief Get an arrow DataType instance from Arrow's Type::type enum
+// \param[in] type One of the values of Arrow's Type::type enum
+// \return A shared pointer to DataType
+ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
+
+// \brief Construct a Python float object from a half-float uint16_t value.
+ARROW_PYTHON_EXPORT PyObject* PyFloat_FromHalf(uint16_t value);
+
+// \brief Convert a Python object to a half-float uint16_t value.
+ARROW_PYTHON_EXPORT Result<uint16_t> PyFloat_AsHalf(PyObject* obj);
+
+namespace internal {
+
+// \brief Check that a Python module has been already imported
+// \param[in] module_name The name of the module
+Result<bool> IsModuleImported(const std::string& module_name);
+
+// \brief Import a Python module
+// \param[in] module_name The name of the module
+// \param[out] ref The OwnedRef containing the module PyObject*
+ARROW_PYTHON_EXPORT
+Status ImportModule(const std::string& module_name, OwnedRef* ref);
+
+// \brief Import an object from a Python module
+// \param[in] module A Python module
+// \param[in] name The name of the object to import
+// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
+// module
+ARROW_PYTHON_EXPORT
+Status ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref);
+
+// \brief Check whether obj is an integer, independent of Python versions.
+inline bool IsPyInteger(PyObject* obj) { return PyLong_Check(obj); }
+
+// \brief Import symbols from pandas that we need for various type-checking,
+// like pandas.NaT or pandas.NA
+void InitPandasStaticData();
+
+// \brief Use pandas missing value semantics to check if a value is null
+ARROW_PYTHON_EXPORT
+bool PandasObjectIsNull(PyObject* obj);
+
+// \brief Check that obj is a pandas.Timedelta instance
+ARROW_PYTHON_EXPORT
+bool IsPandasTimedelta(PyObject* obj);
+
+// \brief Check that obj is a pandas.Timestamp instance
+bool IsPandasTimestamp(PyObject* obj);
+
+// \brief Returned a borrowed reference to the pandas.tseries.offsets.DateOffset
+PyObject* BorrowPandasDataOffsetType();
+
+// \brief Check whether obj is a floating-point NaN
+ARROW_PYTHON_EXPORT
+bool PyFloat_IsNaN(PyObject* obj);
+
+inline bool IsPyBinary(PyObject* obj) {
+  return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
+}
+
+// \brief Convert a Python integer into a C integer
+// \param[in] obj A Python integer
+// \param[out] out A pointer to a C integer to hold the result of the conversion
+// \return The status of the operation
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
+
+// \brief Convert a Python unicode string to a std::string
+ARROW_PYTHON_EXPORT
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
+
+// \brief Convert a Python bytes object to a std::string
+ARROW_PYTHON_EXPORT
+std::string PyBytes_AsStdString(PyObject* obj);
+
+// \brief Call str() on the given object and return the result as a std::string
+ARROW_PYTHON_EXPORT
+Status PyObject_StdStringStr(PyObject* obj, std::string* out);
+
+// \brief Return the repr() of the given object (always succeeds)
+ARROW_PYTHON_EXPORT
+std::string PyObject_StdStringRepr(PyObject* obj);
+
+// \brief Cast the given size to int32_t, with error checking
+inline Status CastSize(Py_ssize_t size, int32_t* out,
+                       const char* error_msg = "Maximum size exceeded (2GB)") {
+  // size is assumed to be positive
+  if (size > std::numeric_limits<int32_t>::max()) {
+    return Status::Invalid(error_msg);
+  }
+  *out = static_cast<int32_t>(size);
+  return Status::OK();
+}
+
+inline Status CastSize(Py_ssize_t size, int64_t* out, const char* error_msg = NULLPTR) {
+  // size is assumed to be positive
+  *out = static_cast<int64_t>(size);
+  return Status::OK();
+}
+
+// \brief Print the Python object's __str__ form along with the passed error
+// message
+ARROW_PYTHON_EXPORT
+Status InvalidValue(PyObject* obj, const std::string& why);
+
+ARROW_PYTHON_EXPORT
+Status InvalidType(PyObject* obj, const std::string& why);
+
+ARROW_PYTHON_EXPORT
+Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
+ARROW_PYTHON_EXPORT
+Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
+
+// \brief Print Python object __repr__
+void DebugPrint(PyObject* obj);
+
+ARROW_PYTHON_EXPORT
+bool IsThreadingEnabled();
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/inference.cc b/pyarrow/src/arrow/python/inference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5714862e41e6fbebb3d76c53a6809e650fb2a71
--- /dev/null
+++ b/pyarrow/src/arrow/python/inference.cc
@@ -0,0 +1,808 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/inference.h"
+#include "arrow/python/numpy_interop.h"
+
+#include <datetime.h>
+
+#include <algorithm>
+#include <limits>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/datetime.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+
+namespace arrow {
+namespace py {
+namespace {
+// Assigns a tuple to interval_types_tuple containing the nametuple for
+// MonthDayNanoIntervalType and if present dateutil's relativedelta and
+// pandas DateOffset.
+Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) {
+  OwnedRef relative_delta_module;
+  // These are Optional imports so swallow errors.
+  OwnedRef relative_delta_type;
+  // Try to import pandas to get types.
+  internal::InitPandasStaticData();
+  if (internal::ImportModule("dateutil.relativedelta", &relative_delta_module).ok()) {
+    RETURN_NOT_OK(internal::ImportFromModule(relative_delta_module.obj(), "relativedelta",
+                                             &relative_delta_type));
+  }
+
+  PyObject* date_offset_type = internal::BorrowPandasDataOffsetType();
+  interval_types_tuple->reset(
+      PyTuple_New(1 + (date_offset_type != nullptr ? 1 : 0) +
+                  (relative_delta_type.obj() != nullptr ? 1 : 0)));
+  RETURN_IF_PYERROR();
+  int index = 0;
+  PyTuple_SetItem(interval_types_tuple->obj(), index++,
+                  internal::NewMonthDayNanoTupleType());
+  RETURN_IF_PYERROR();
+  if (date_offset_type != nullptr) {
+    Py_XINCREF(date_offset_type);
+    PyTuple_SetItem(interval_types_tuple->obj(), index++, date_offset_type);
+    RETURN_IF_PYERROR();
+  }
+  if (relative_delta_type.obj() != nullptr) {
+    PyTuple_SetItem(interval_types_tuple->obj(), index++, relative_delta_type.detach());
+    RETURN_IF_PYERROR();
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+#define _NUMPY_UNIFY_NOOP(DTYPE) \
+  case NPY_##DTYPE:              \
+    return OK;
+
+#define _NUMPY_UNIFY_PROMOTE(DTYPE) \
+  case NPY_##DTYPE:                 \
+    current_type_num_ = dtype;      \
+    current_dtype_ = descr;         \
+    return OK;
+
+#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE)               \
+  case NPY_##DTYPE:                                            \
+    current_type_num_ = NPY_##NEW_TYPE;                        \
+    current_dtype_ = PyArray_DescrFromType(current_type_num_); \
+    return OK;
+
+// Form a consensus NumPy dtype to use for Arrow conversion for a
+// collection of dtype objects observed one at a time
+class NumPyDtypeUnifier {
+ public:
+  enum Action { OK, INVALID };
+
+  NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {}
+
+  Status InvalidMix(int new_dtype) {
+    return Status::Invalid("Cannot mix NumPy dtypes ",
+                           GetNumPyTypeName(current_type_num_), " and ",
+                           GetNumPyTypeName(new_dtype));
+  }
+
+  Status InvalidDatetimeUnitMix(PyArray_Descr* new_descr) {
+    auto new_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(new_descr));
+    auto current_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(current_dtype_));
+
+    return Status::Invalid("Cannot mix NumPy datetime64 units ",
+                           DatetimeUnitName(current_meta->meta.base), " and ",
+                           DatetimeUnitName(new_meta->meta.base));
+  }
+
+  int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; }
+
+  int Observe_INT8(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_PROMOTE(INT16);
+      _NUMPY_UNIFY_PROMOTE(INT32);
+      _NUMPY_UNIFY_PROMOTE(INT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_INT16(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(INT8);
+      _NUMPY_UNIFY_PROMOTE(INT32);
+      _NUMPY_UNIFY_PROMOTE(INT64);
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_INT32(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(INT8);
+      _NUMPY_UNIFY_NOOP(INT16);
+      _NUMPY_UNIFY_PROMOTE(INT32);
+      _NUMPY_UNIFY_PROMOTE(INT64);
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_NOOP(UINT16);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_INT64(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(INT8);
+      _NUMPY_UNIFY_NOOP(INT16);
+      _NUMPY_UNIFY_NOOP(INT32);
+      _NUMPY_UNIFY_NOOP(INT64);
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_NOOP(UINT16);
+      _NUMPY_UNIFY_NOOP(UINT32);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_UINT8(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_PROMOTE(UINT16);
+      _NUMPY_UNIFY_PROMOTE(UINT32);
+      _NUMPY_UNIFY_PROMOTE(UINT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_UINT16(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_PROMOTE(UINT32);
+      _NUMPY_UNIFY_PROMOTE(UINT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_UINT32(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_NOOP(UINT16);
+      _NUMPY_UNIFY_PROMOTE(UINT64);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_UINT64(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_NOOP(UINT16);
+      _NUMPY_UNIFY_NOOP(UINT32);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_FLOAT16(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_FLOAT32(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(INT8);
+      _NUMPY_UNIFY_NOOP(INT16);
+      _NUMPY_UNIFY_NOOP(INT32);
+      _NUMPY_UNIFY_NOOP(INT64);
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_NOOP(UINT16);
+      _NUMPY_UNIFY_NOOP(UINT32);
+      _NUMPY_UNIFY_NOOP(UINT64);
+      _NUMPY_UNIFY_PROMOTE(FLOAT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_FLOAT64(PyArray_Descr* descr, int dtype) {
+    switch (dtype) {
+      _NUMPY_UNIFY_NOOP(INT8);
+      _NUMPY_UNIFY_NOOP(INT16);
+      _NUMPY_UNIFY_NOOP(INT32);
+      _NUMPY_UNIFY_NOOP(INT64);
+      _NUMPY_UNIFY_NOOP(UINT8);
+      _NUMPY_UNIFY_NOOP(UINT16);
+      _NUMPY_UNIFY_NOOP(UINT32);
+      _NUMPY_UNIFY_NOOP(UINT64);
+      default:
+        return INVALID;
+    }
+  }
+
+  int Observe_DATETIME(PyArray_Descr* dtype_obj) {
+    // Check that datetime units are consistent across all values
+    auto datetime_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(dtype_obj));
+    auto current_meta = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(
+        PyDataType_C_METADATA(current_dtype_));
+
+    if (datetime_meta->meta.base != current_meta->meta.base) {
+      // Units don't match - this is invalid
+      return INVALID;
+    }
+
+    return OK;
+  }
+
+  Status Observe(PyArray_Descr* descr) {
+    int dtype = fix_numpy_type_num(descr->type_num);
+
+    if (current_type_num_ == -1) {
+      current_dtype_ = descr;
+      current_type_num_ = dtype;
+      return Status::OK();
+    } else if (current_type_num_ == dtype) {
+      // Same type, but for datetime we still need to check units match
+      if (dtype == NPY_DATETIME) {
+        int action = Observe_DATETIME(descr);
+        if (action == INVALID) {
+          return InvalidDatetimeUnitMix(descr);
+        }
+      }
+      return Status::OK();
+    }
+
+#define OBSERVE_CASE(DTYPE)                 \
+  case NPY_##DTYPE:                         \
+    action = Observe_##DTYPE(descr, dtype); \
+    break;
+
+    int action = OK;
+    switch (current_type_num_) {
+      OBSERVE_CASE(BOOL);
+      OBSERVE_CASE(INT8);
+      OBSERVE_CASE(INT16);
+      OBSERVE_CASE(INT32);
+      OBSERVE_CASE(INT64);
+      OBSERVE_CASE(UINT8);
+      OBSERVE_CASE(UINT16);
+      OBSERVE_CASE(UINT32);
+      OBSERVE_CASE(UINT64);
+      OBSERVE_CASE(FLOAT16);
+      OBSERVE_CASE(FLOAT32);
+      OBSERVE_CASE(FLOAT64);
+      case NPY_DATETIME:
+        action = Observe_DATETIME(descr);
+        break;
+      default:
+        return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype));
+    }
+
+    if (action == INVALID) {
+      return InvalidMix(dtype);
+    }
+    return Status::OK();
+  }
+
+  bool dtype_was_observed() const { return current_type_num_ != -1; }
+
+  PyArray_Descr* current_dtype() const { return current_dtype_; }
+
+  int current_type_num() const { return current_type_num_; }
+
+ private:
+  static std::string DatetimeUnitName(NPY_DATETIMEUNIT unit) {
+    switch (unit) {
+      case NPY_FR_Y:
+        return "Y";
+      case NPY_FR_M:
+        return "M";
+      case NPY_FR_W:
+        return "W";
+      case NPY_FR_D:
+        return "D";
+      case NPY_FR_h:
+        return "h";
+      case NPY_FR_m:
+        return "m";
+      case NPY_FR_s:
+        return "s";
+      case NPY_FR_ms:
+        return "ms";
+      case NPY_FR_us:
+        return "us";
+      case NPY_FR_ns:
+        return "ns";
+      case NPY_FR_ps:
+        return "ps";
+      case NPY_FR_fs:
+        return "fs";
+      case NPY_FR_as:
+        return "as";
+      case NPY_FR_GENERIC:
+        return "generic";
+      default:
+        return "unknown (" + std::to_string(static_cast<int>(unit)) + ")";
+    }
+  }
+
+  int current_type_num_;
+  PyArray_Descr* current_dtype_;
+};
+
+class TypeInferrer {
+  // A type inference visitor for Python values
+ public:
+  // \param validate_interval the number of elements to observe before checking
+  // whether the data is mixed type or has other problems. This helps avoid
+  // excess computation for each element while also making sure we "bail out"
+  // early with long sequences that may have problems up front
+  // \param make_unions permit mixed-type data by creating union types (not yet
+  // implemented)
+  explicit TypeInferrer(bool pandas_null_sentinels = false,
+                        int64_t validate_interval = 100, bool make_unions = false)
+      : pandas_null_sentinels_(pandas_null_sentinels),
+        validate_interval_(validate_interval),
+        make_unions_(make_unions),
+        total_count_(0),
+        none_count_(0),
+        bool_count_(0),
+        int_count_(0),
+        date_count_(0),
+        time_count_(0),
+        timestamp_micro_count_(0),
+        duration_count_(0),
+        float_count_(0),
+        binary_count_(0),
+        unicode_count_(0),
+        decimal_count_(0),
+        list_count_(0),
+        struct_count_(0),
+        arrow_scalar_count_(0),
+        numpy_dtype_count_(0),
+        interval_count_(0),
+        max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
+                              std::numeric_limits<int32_t>::min()),
+        decimal_type_() {
+    ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_));
+    ARROW_CHECK_OK(ImportPresentIntervalTypes(&interval_types_));
+  }
+
+  /// \param[in] obj a Python object in the sequence
+  /// \param[out] keep_going if sufficient information has been gathered to
+  /// attempt to begin converting the sequence, *keep_going will be set to true
+  /// to signal to the calling visitor loop to terminate
+  Status Visit(PyObject* obj, bool* keep_going) {
+    ++total_count_;
+
+    if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) {
+      ++none_count_;
+    } else if (PyBool_Check(obj)) {
+      ++bool_count_;
+      *keep_going = make_unions_;
+    } else if (PyFloat_Check(obj)) {
+      ++float_count_;
+      *keep_going = make_unions_;
+    } else if (internal::IsPyInteger(obj)) {
+      ++int_count_;
+    } else if (PyDateTime_Check(obj)) {
+      // infer timezone from the first encountered datetime object
+      if (!timestamp_micro_count_) {
+        OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo"));
+        if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) {
+          ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj()));
+        }
+      }
+      ++timestamp_micro_count_;
+      *keep_going = make_unions_;
+    } else if (PyDelta_Check(obj)) {
+      ++duration_count_;
+      *keep_going = make_unions_;
+    } else if (PyDate_Check(obj)) {
+      ++date_count_;
+      *keep_going = make_unions_;
+    } else if (PyTime_Check(obj)) {
+      ++time_count_;
+      *keep_going = make_unions_;
+    } else if (internal::IsPyBinary(obj)) {
+      ++binary_count_;
+      *keep_going = make_unions_;
+    } else if (PyUnicode_Check(obj)) {
+      ++unicode_count_;
+      *keep_going = make_unions_;
+    } else if (arrow::py::is_scalar(obj)) {
+      RETURN_NOT_OK(VisitArrowScalar(obj, keep_going));
+    } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
+      RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going));
+    } else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) {
+      RETURN_NOT_OK(VisitSet(obj, keep_going));
+    } else if (has_numpy() && PyArray_Check(obj)) {
+      RETURN_NOT_OK(VisitNdarray(obj, keep_going));
+    } else if (PyDict_Check(obj)) {
+      RETURN_NOT_OK(VisitDict(obj));
+    } else if (PyList_Check(obj) ||
+               (PyTuple_Check(obj) &&
+                !PyObject_IsInstance(obj, PyTuple_GetItem(interval_types_.obj(), 0)))) {
+      RETURN_NOT_OK(VisitList(obj, keep_going));
+    } else if (PyObject_IsInstance(obj, decimal_type_.obj())) {
+      RETURN_NOT_OK(max_decimal_metadata_.Update(obj));
+      ++decimal_count_;
+    } else if (PyObject_IsInstance(obj, interval_types_.obj())) {
+      ++interval_count_;
+    } else {
+      return internal::InvalidValue(obj,
+                                    "did not recognize Python value type when inferring "
+                                    "an Arrow data type");
+    }
+
+    if (total_count_ % validate_interval_ == 0) {
+      RETURN_NOT_OK(Validate());
+    }
+
+    return Status::OK();
+  }
+
+  // Infer value type from a sequence of values
+  Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) {
+    if (mask == nullptr || mask == Py_None) {
+      return internal::VisitSequence(
+          obj, /*offset=*/0,
+          [this](PyObject* value, bool* keep_going) { return Visit(value, keep_going); });
+    } else {
+      return internal::VisitSequenceMasked(
+          obj, mask, /*offset=*/0,
+          [this](PyObject* value, uint8_t masked, bool* keep_going) {
+            if (!masked) {
+              return Visit(value, keep_going);
+            } else {
+              return Status::OK();
+            }
+          });
+    }
+  }
+
+  // Infer value type from a sequence of values
+  Status VisitIterable(PyObject* obj) {
+    return internal::VisitIterable(obj, [this](PyObject* value, bool* keep_going) {
+      return Visit(value, keep_going);
+    });
+  }
+
+  Status GetType(std::shared_ptr<DataType>* out) {
+    // TODO(wesm): handling forming unions
+    if (make_unions_) {
+      return Status::NotImplemented("Creating union types not yet supported");
+    }
+
+    RETURN_NOT_OK(Validate());
+
+    if (arrow_scalar_count_ > 0 && arrow_scalar_count_ + none_count_ != total_count_) {
+      return Status::Invalid(
+          "pyarrow scalars cannot be mixed "
+          "with other Python scalar values currently");
+    }
+
+    if (numpy_dtype_count_ > 0) {
+      // All NumPy scalars and Nones/nulls
+      if (numpy_dtype_count_ + none_count_ == total_count_) {
+        return NumPyDtypeToArrow(numpy_unifier_.current_dtype()).Value(out);
+      }
+
+      // The "bad path": data contains a mix of NumPy scalars and
+      // other kinds of scalars. Note this can happen innocuously
+      // because numpy.nan is not a NumPy scalar (it's a built-in
+      // PyFloat)
+
+      // TODO(ARROW-5564): Merge together type unification so this
+      // hack is not necessary
+      switch (numpy_unifier_.current_type_num()) {
+        case NPY_BOOL:
+          bool_count_ += numpy_dtype_count_;
+          break;
+        case NPY_INT8:
+        case NPY_INT16:
+        case NPY_INT32:
+        case NPY_INT64:
+        case NPY_UINT8:
+        case NPY_UINT16:
+        case NPY_UINT32:
+        case NPY_UINT64:
+          int_count_ += numpy_dtype_count_;
+          break;
+        case NPY_FLOAT32:
+        case NPY_FLOAT64:
+          float_count_ += numpy_dtype_count_;
+          break;
+        case NPY_DATETIME:
+          return Status::Invalid(
+              "numpy.datetime64 scalars cannot be mixed "
+              "with other Python scalar values currently");
+      }
+    }
+
+    if (list_count_) {
+      std::shared_ptr<DataType> value_type;
+      RETURN_NOT_OK(list_inferrer_->GetType(&value_type));
+      *out = list(value_type);
+    } else if (struct_count_) {
+      RETURN_NOT_OK(GetStructType(out));
+    } else if (decimal_count_) {
+      if (max_decimal_metadata_.precision() > Decimal128Type::kMaxPrecision) {
+        // the default constructor does not validate the precision and scale
+        ARROW_ASSIGN_OR_RAISE(*out,
+                              Decimal256Type::Make(max_decimal_metadata_.precision(),
+                                                   max_decimal_metadata_.scale()));
+      } else {
+        ARROW_ASSIGN_OR_RAISE(*out,
+                              Decimal128Type::Make(max_decimal_metadata_.precision(),
+                                                   max_decimal_metadata_.scale()));
+      }
+    } else if (float_count_) {
+      // Prioritize floats before integers
+      *out = float64();
+    } else if (int_count_) {
+      *out = int64();
+    } else if (date_count_) {
+      *out = date32();
+    } else if (time_count_) {
+      *out = time64(TimeUnit::MICRO);
+    } else if (timestamp_micro_count_) {
+      *out = timestamp(TimeUnit::MICRO, timezone_);
+    } else if (duration_count_) {
+      *out = duration(TimeUnit::MICRO);
+    } else if (bool_count_) {
+      *out = boolean();
+    } else if (binary_count_) {
+      *out = binary();
+    } else if (unicode_count_) {
+      *out = utf8();
+    } else if (interval_count_) {
+      *out = month_day_nano_interval();
+    } else if (arrow_scalar_count_) {
+      *out = scalar_type_;
+    } else {
+      *out = null();
+    }
+    return Status::OK();
+  }
+
+  int64_t total_count() const { return total_count_; }
+
+ protected:
+  Status Validate() const {
+    if (list_count_ > 0) {
+      if (list_count_ + none_count_ != total_count_) {
+        return Status::Invalid("cannot mix list and non-list, non-null values");
+      }
+      RETURN_NOT_OK(list_inferrer_->Validate());
+    } else if (struct_count_ > 0) {
+      if (struct_count_ + none_count_ != total_count_) {
+        return Status::Invalid("cannot mix struct and non-struct, non-null values");
+      }
+      for (const auto& it : struct_inferrers_) {
+        RETURN_NOT_OK(it.second.Validate());
+      }
+    }
+    return Status::OK();
+  }
+
+  Status VisitArrowScalar(PyObject* obj, bool* keep_going /* unused */) {
+    ARROW_ASSIGN_OR_RAISE(auto scalar, arrow::py::unwrap_scalar(obj));
+    // Check that all the scalar types for the sequence are the same
+    if (arrow_scalar_count_ > 0 && *scalar->type != *scalar_type_) {
+      return internal::InvalidValue(obj, "cannot mix scalars with different types");
+    }
+    scalar_type_ = scalar->type;
+    ++arrow_scalar_count_;
+    return Status::OK();
+  }
+
+  Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
+    // Continue visiting dtypes for now.
+    // TODO(wesm): devise approach for unions
+    ++numpy_dtype_count_;
+    *keep_going = true;
+    return numpy_unifier_.Observe(dtype);
+  }
+
+  Status VisitList(PyObject* obj, bool* keep_going /* unused */) {
+    if (!list_inferrer_) {
+      list_inferrer_.reset(
+          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
+    }
+    ++list_count_;
+    return list_inferrer_->VisitSequence(obj);
+  }
+
+  Status VisitSet(PyObject* obj, bool* keep_going /* unused */) {
+    if (!list_inferrer_) {
+      list_inferrer_.reset(
+          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
+    }
+    ++list_count_;
+    return list_inferrer_->VisitIterable(obj);
+  }
+
+  Status VisitNdarray(PyObject* obj, bool* keep_going) {
+    PyArray_Descr* dtype = PyArray_DESCR(reinterpret_cast<PyArrayObject*>(obj));
+    if (dtype->type_num == NPY_OBJECT) {
+      return VisitList(obj, keep_going);
+    }
+    // Not an object array: infer child Arrow type from dtype
+    if (!list_inferrer_) {
+      list_inferrer_.reset(
+          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
+    }
+    ++list_count_;
+
+    // XXX(wesm): In ARROW-4324 I added accounting to check whether
+    // all of the non-null values have NumPy dtypes, but the
+    // total_count not being properly incremented here
+    ++(*list_inferrer_).total_count_;
+    return list_inferrer_->VisitDType(dtype, keep_going);
+  }
+
+  Status VisitDict(PyObject* obj) {
+    PyObject* key_obj;
+    PyObject* value_obj;
+    Py_ssize_t pos = 0;
+
+    while (PyDict_Next(obj, &pos, &key_obj, &value_obj)) {
+      std::string key;
+      if (PyUnicode_Check(key_obj)) {
+        RETURN_NOT_OK(internal::PyUnicode_AsStdString(key_obj, &key));
+      } else if (PyBytes_Check(key_obj)) {
+        key = internal::PyBytes_AsStdString(key_obj);
+      } else {
+        return Status::TypeError("Expected dict key of type str or bytes, got '",
+                                 Py_TYPE(key_obj)->tp_name, "'");
+      }
+      // Get or create visitor for this key
+      auto it = struct_inferrers_.find(key);
+      if (it == struct_inferrers_.end()) {
+        it = struct_inferrers_
+                 .insert(
+                     std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
+                                                      validate_interval_, make_unions_)))
+                 .first;
+      }
+      TypeInferrer* visitor = &it->second;
+
+      // We ignore termination signals from child visitors for now
+      //
+      // TODO(wesm): keep track of whether type inference has terminated for
+      // the child visitors to avoid doing unneeded work
+      bool keep_going = true;
+      RETURN_NOT_OK(visitor->Visit(value_obj, &keep_going));
+    }
+
+    // We do not terminate visiting dicts since we want the union of all
+    // observed keys
+    ++struct_count_;
+    return Status::OK();
+  }
+
+  Status GetStructType(std::shared_ptr<DataType>* out) {
+    std::vector<std::shared_ptr<Field>> fields;
+    for (auto&& it : struct_inferrers_) {
+      std::shared_ptr<DataType> field_type;
+      RETURN_NOT_OK(it.second.GetType(&field_type));
+      fields.emplace_back(field(it.first, field_type));
+    }
+    *out = struct_(fields);
+    return Status::OK();
+  }
+
+ private:
+  bool pandas_null_sentinels_;
+  int64_t validate_interval_;
+  bool make_unions_;
+  int64_t total_count_;
+  int64_t none_count_;
+  int64_t bool_count_;
+  int64_t int_count_;
+  int64_t date_count_;
+  int64_t time_count_;
+  int64_t timestamp_micro_count_;
+  std::string timezone_;
+  int64_t duration_count_;
+  int64_t float_count_;
+  int64_t binary_count_;
+  int64_t unicode_count_;
+  int64_t decimal_count_;
+  int64_t list_count_;
+  int64_t struct_count_;
+  int64_t arrow_scalar_count_;
+  int64_t numpy_dtype_count_;
+  int64_t interval_count_;
+  std::unique_ptr<TypeInferrer> list_inferrer_;
+  std::map<std::string, TypeInferrer> struct_inferrers_;
+  std::shared_ptr<DataType> scalar_type_;
+
+  // If we observe a strongly-typed value in e.g. a NumPy array, we can store
+  // it here to skip the type counting logic above
+  NumPyDtypeUnifier numpy_unifier_;
+
+  internal::DecimalMetadata max_decimal_metadata_;
+
+  OwnedRefNoGIL decimal_type_;
+  OwnedRefNoGIL interval_types_;
+};
+
+// Non-exhaustive type inference
+Result<std::shared_ptr<DataType>> InferArrowType(PyObject* obj, PyObject* mask,
+                                                 bool pandas_null_sentinels) {
+  if (pandas_null_sentinels) {
+    // ARROW-842: If pandas is not installed then null checks will be less
+    // comprehensive, but that is okay.
+    internal::InitPandasStaticData();
+  }
+
+  std::shared_ptr<DataType> out_type;
+  TypeInferrer inferrer(pandas_null_sentinels);
+  RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
+  RETURN_NOT_OK(inferrer.GetType(&out_type));
+  if (out_type == nullptr) {
+    return Status::TypeError("Unable to determine data type");
+  } else {
+    return std::move(out_type);
+  }
+}
+
+ARROW_PYTHON_EXPORT
+bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); }
+
+ARROW_PYTHON_EXPORT
+bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); }
+
+ARROW_PYTHON_EXPORT
+bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); }
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/inference.h b/pyarrow/src/arrow/python/inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..983384db118a16141e49a679388b83c75d1d77d6
--- /dev/null
+++ b/pyarrow/src/arrow/python/inference.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+
+#include "common.h"
+
+namespace arrow {
+
+class Array;
+class Status;
+
+namespace py {
+
+// These functions take a sequence input, not arbitrary iterables
+
+/// \brief Infer Arrow type from a Python sequence
+/// \param[in] obj the sequence of values
+/// \param[in] mask an optional mask where True values are null. May
+/// be nullptr
+/// \param[in] pandas_null_sentinels use pandas's null value markers
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<arrow::DataType>> InferArrowType(PyObject* obj, PyObject* mask,
+                                                        bool pandas_null_sentinels);
+
+/// Checks whether the passed Python object is a boolean scalar
+ARROW_PYTHON_EXPORT
+bool IsPyBool(PyObject* obj);
+
+/// Checks whether the passed Python object is an integer scalar
+ARROW_PYTHON_EXPORT
+bool IsPyInt(PyObject* obj);
+
+/// Checks whether the passed Python object is a float scalar
+ARROW_PYTHON_EXPORT
+bool IsPyFloat(PyObject* obj);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/io.cc b/pyarrow/src/arrow/python/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7a4d18ab9fa23a24248158db227901cd45803b91
--- /dev/null
+++ b/pyarrow/src/arrow/python/io.cc
@@ -0,0 +1,387 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "io.h"
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include "arrow/io/memory.h"
+#include "arrow/memory_pool.h"
+#include "arrow/status.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+
+namespace arrow {
+
+using arrow::io::TransformInputStream;
+
+namespace py {
+
+// ----------------------------------------------------------------------
+// Python file
+
+// A common interface to a Python file-like object. Must acquire GIL before
+// calling any methods
+class PythonFile {
+ public:
+  explicit PythonFile(PyObject* file) : file_(file), checked_read_buffer_(false) {
+    Py_INCREF(file);
+  }
+
+  Status CheckClosed() const {
+    if (!file_) {
+      return Status::Invalid("operation on closed Python file");
+    }
+    return Status::OK();
+  }
+
+  Status Close() {
+    if (file_) {
+      PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "close", "()");
+      Py_XDECREF(result);
+      file_.reset();
+      PY_RETURN_IF_ERROR(StatusCode::IOError);
+    }
+    return Status::OK();
+  }
+
+  Status Abort() {
+    file_.reset();
+    return Status::OK();
+  }
+
+  bool closed() const {
+    if (!file_) {
+      return true;
+    }
+    PyObject* result = PyObject_GetAttrString(file_.obj(), "closed");
+    if (result == NULL) {
+      // Can't propagate the error, so write it out and return an arbitrary value
+      PyErr_WriteUnraisable(NULL);
+      return true;
+    }
+    int ret = PyObject_IsTrue(result);
+    Py_XDECREF(result);
+    if (ret < 0) {
+      PyErr_WriteUnraisable(NULL);
+      return true;
+    }
+    return ret != 0;
+  }
+
+  Status Seek(int64_t position, int whence) {
+    RETURN_NOT_OK(CheckClosed());
+
+    // NOTE: `long long` is at least 64 bits in the C standard, the cast below is
+    // therefore safe.
+
+    // whence: 0 for relative to start of file, 2 for end of file
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(Li)",
+                                               static_cast<long long>(position), whence);
+    Py_XDECREF(result);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return Status::OK();
+  }
+
+  Status Read(int64_t nbytes, PyObject** out) {
+    RETURN_NOT_OK(CheckClosed());
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(L)",
+                                               static_cast<long long>(nbytes));
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    *out = result;
+    return Status::OK();
+  }
+
+  Status ReadBuffer(int64_t nbytes, PyObject** out) {
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(L)",
+                                               static_cast<long long>(nbytes));
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    *out = result;
+    return Status::OK();
+  }
+
+  Status Write(const void* data, int64_t nbytes) {
+    RETURN_NOT_OK(CheckClosed());
+
+    // Since the data isn't owned, we have to make a copy
+    PyObject* py_data =
+        PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), nbytes);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data);
+    Py_XDECREF(py_data);
+    Py_XDECREF(result);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return Status::OK();
+  }
+
+  Status Write(const std::shared_ptr<Buffer>& buffer) {
+    RETURN_NOT_OK(CheckClosed());
+
+    PyObject* py_data = wrap_buffer(buffer);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data);
+    Py_XDECREF(py_data);
+    Py_XDECREF(result);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return Status::OK();
+  }
+
+  Result<int64_t> Tell() {
+    RETURN_NOT_OK(CheckClosed());
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "tell", "()");
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+
+    int64_t position = PyLong_AsLongLong(result);
+    Py_DECREF(result);
+
+    // PyLong_AsLongLong can raise OverflowError
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return position;
+  }
+
+  std::mutex& lock() { return lock_; }
+
+  bool HasReadBuffer() {
+    if (!checked_read_buffer_) {  // we don't want to check this each time
+      has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1;
+      checked_read_buffer_ = true;
+    }
+    return has_read_buffer_;
+  }
+
+ private:
+  std::mutex lock_;
+  OwnedRefNoGIL file_;
+  bool has_read_buffer_;
+  bool checked_read_buffer_;
+};
+
+// ----------------------------------------------------------------------
+// Seekable input stream
+
+PyReadableFile::PyReadableFile(PyObject* file) { file_.reset(new PythonFile(file)); }
+
+// The destructor does not close the underlying Python file object, as
+// there may be multiple references to it.  Instead let the Python
+// destructor do its job.
+PyReadableFile::~PyReadableFile() {}
+
+Status PyReadableFile::Abort() {
+  return SafeCallIntoPython([this]() { return file_->Abort(); });
+}
+
+Status PyReadableFile::Close() {
+  return SafeCallIntoPython([this]() { return file_->Close(); });
+}
+
+bool PyReadableFile::closed() const {
+  bool res;
+  Status st = SafeCallIntoPython([this, &res]() {
+    res = file_->closed();
+    return Status::OK();
+  });
+  return res;
+}
+
+Status PyReadableFile::Seek(int64_t position) {
+  return SafeCallIntoPython([=] { return file_->Seek(position, 0); });
+}
+
+Result<int64_t> PyReadableFile::Tell() const {
+  return SafeCallIntoPython([=]() -> Result<int64_t> { return file_->Tell(); });
+}
+
+Result<int64_t> PyReadableFile::Read(int64_t nbytes, void* out) {
+  return SafeCallIntoPython([=]() -> Result<int64_t> {
+    OwnedRef bytes;
+    RETURN_NOT_OK(file_->Read(nbytes, bytes.ref()));
+    PyObject* bytes_obj = bytes.obj();
+    ARROW_DCHECK(bytes_obj != NULL);
+
+    Py_buffer py_buf;
+    if (!PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS)) {
+      const uint8_t* data = reinterpret_cast<const uint8_t*>(py_buf.buf);
+      std::memcpy(out, data, py_buf.len);
+      int64_t len = py_buf.len;
+      PyBuffer_Release(&py_buf);
+      return len;
+    } else {
+      return Status::TypeError(
+          "Python file read() should have returned a bytes object or an object "
+          "supporting the buffer protocol, got '",
+          Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)");
+    }
+  });
+}
+
+Result<std::shared_ptr<Buffer>> PyReadableFile::Read(int64_t nbytes) {
+  return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
+    OwnedRef buffer_obj;
+    if (file_->HasReadBuffer()) {
+      RETURN_NOT_OK(file_->ReadBuffer(nbytes, buffer_obj.ref()));
+    } else {
+      RETURN_NOT_OK(file_->Read(nbytes, buffer_obj.ref()));
+    }
+    ARROW_DCHECK(buffer_obj.obj() != NULL);
+
+    return PyBuffer::FromPyObject(buffer_obj.obj());
+  });
+}
+
+Result<int64_t> PyReadableFile::ReadAt(int64_t position, int64_t nbytes, void* out) {
+  std::lock_guard<std::mutex> guard(file_->lock());
+  return SafeCallIntoPython([=]() -> Result<int64_t> {
+    RETURN_NOT_OK(Seek(position));
+    return Read(nbytes, out);
+  });
+}
+
+Result<std::shared_ptr<Buffer>> PyReadableFile::ReadAt(int64_t position, int64_t nbytes) {
+  std::lock_guard<std::mutex> guard(file_->lock());
+  return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
+    RETURN_NOT_OK(Seek(position));
+    return Read(nbytes);
+  });
+}
+
+Result<int64_t> PyReadableFile::GetSize() {
+  return SafeCallIntoPython([=]() -> Result<int64_t> {
+    ARROW_ASSIGN_OR_RAISE(int64_t current_position, file_->Tell());
+    RETURN_NOT_OK(file_->Seek(0, 2));
+
+    ARROW_ASSIGN_OR_RAISE(int64_t file_size, file_->Tell());
+    // Restore previous file position
+    RETURN_NOT_OK(file_->Seek(current_position, 0));
+
+    return file_size;
+  });
+}
+
+// ----------------------------------------------------------------------
+// Output stream
+
+PyOutputStream::PyOutputStream(PyObject* file) : position_(0) {
+  file_.reset(new PythonFile(file));
+}
+
+// The destructor does not close the underlying Python file object, as
+// there may be multiple references to it.  Instead let the Python
+// destructor do its job.
+PyOutputStream::~PyOutputStream() {}
+
+Status PyOutputStream::Abort() {
+  return SafeCallIntoPython([=]() { return file_->Abort(); });
+}
+
+Status PyOutputStream::Close() {
+  return SafeCallIntoPython([=]() { return file_->Close(); });
+}
+
+bool PyOutputStream::closed() const {
+  bool res;
+  Status st = SafeCallIntoPython([this, &res]() {
+    res = file_->closed();
+    return Status::OK();
+  });
+  return res;
+}
+
+Result<int64_t> PyOutputStream::Tell() const { return position_; }
+
+Status PyOutputStream::Write(const void* data, int64_t nbytes) {
+  return SafeCallIntoPython([=]() {
+    position_ += nbytes;
+    return file_->Write(data, nbytes);
+  });
+}
+
+Status PyOutputStream::Write(const std::shared_ptr<Buffer>& buffer) {
+  return SafeCallIntoPython([=]() {
+    position_ += buffer->size();
+    return file_->Write(buffer);
+  });
+}
+
+// ----------------------------------------------------------------------
+// Foreign buffer
+
+Status PyForeignBuffer::Make(const uint8_t* data, int64_t size, PyObject* base,
+                             std::shared_ptr<Buffer>* out) {
+  PyForeignBuffer* buf = new PyForeignBuffer(data, size, base);
+  if (buf == NULL) {
+    return Status::OutOfMemory("could not allocate foreign buffer object");
+  } else {
+    *out = std::shared_ptr<Buffer>(buf);
+    return Status::OK();
+  }
+}
+
+// ----------------------------------------------------------------------
+// TransformInputStream::TransformFunc wrapper
+
+struct TransformFunctionWrapper {
+  TransformFunctionWrapper(TransformCallback cb, PyObject* arg)
+      : cb_(std::move(cb)), arg_(std::make_shared<OwnedRefNoGIL>(arg)) {
+    Py_INCREF(arg);
+  }
+
+  Result<std::shared_ptr<Buffer>> operator()(const std::shared_ptr<Buffer>& src) {
+    return SafeCallIntoPython([=]() -> Result<std::shared_ptr<Buffer>> {
+      std::shared_ptr<Buffer> dest;
+      cb_(arg_->obj(), src, &dest);
+      RETURN_NOT_OK(CheckPyError());
+      return dest;
+    });
+  }
+
+ protected:
+  // Need to wrap OwnedRefNoGIL because std::function needs the callable
+  // to be copy-constructible...
+  TransformCallback cb_;
+  std::shared_ptr<OwnedRefNoGIL> arg_;
+};
+
+std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
+    std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
+    PyObject* handler) {
+  TransformInputStream::TransformFunc transform(
+      TransformFunctionWrapper{std::move(vtable.transform), handler});
+  return std::make_shared<TransformInputStream>(std::move(wrapped), std::move(transform));
+}
+
+std::shared_ptr<StreamWrapFunc> MakeStreamTransformFunc(TransformInputStreamVTable vtable,
+                                                        PyObject* handler) {
+  TransformInputStream::TransformFunc transform(
+      TransformFunctionWrapper{std::move(vtable.transform), handler});
+  StreamWrapFunc func = [transform](std::shared_ptr<::arrow::io::InputStream> wrapped) {
+    return std::make_shared<TransformInputStream>(wrapped, transform);
+  };
+  return std::make_shared<StreamWrapFunc>(func);
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/io.h b/pyarrow/src/arrow/python/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..10489c1351b5a73d7fd19cf17b187c716ea31dbd
--- /dev/null
+++ b/pyarrow/src/arrow/python/io.h
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/io/transform.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+
+class ARROW_NO_EXPORT PythonFile;
+
+class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {
+ public:
+  explicit PyReadableFile(PyObject* file);
+  ~PyReadableFile() override;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+
+  Result<int64_t> Read(int64_t nbytes, void* out) override;
+  Result<std::shared_ptr<Buffer>> Read(int64_t nbytes) override;
+
+  // Thread-safe version
+  Result<int64_t> ReadAt(int64_t position, int64_t nbytes, void* out) override;
+
+  // Thread-safe version
+  Result<std::shared_ptr<Buffer>> ReadAt(int64_t position, int64_t nbytes) override;
+
+  Result<int64_t> GetSize() override;
+
+  Status Seek(int64_t position) override;
+
+  Result<int64_t> Tell() const override;
+
+ private:
+  std::unique_ptr<PythonFile> file_;
+};
+
+class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {
+ public:
+  explicit PyOutputStream(PyObject* file);
+  ~PyOutputStream() override;
+
+  Status Close() override;
+  Status Abort() override;
+  bool closed() const override;
+  Result<int64_t> Tell() const override;
+  Status Write(const void* data, int64_t nbytes) override;
+  Status Write(const std::shared_ptr<Buffer>& buffer) override;
+
+ private:
+  std::unique_ptr<PythonFile> file_;
+  int64_t position_;
+};
+
+// TODO(wesm): seekable output files
+
+// A Buffer subclass that keeps a PyObject reference throughout its
+// lifetime, such that the Python object is kept alive as long as the
+// C++ buffer is still needed.
+// Keeping the reference in a Python wrapper would be incorrect as
+// the Python wrapper can get destroyed even though the wrapped C++
+// buffer is still alive (ARROW-2270).
+class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
+ public:
+  static Status Make(const uint8_t* data, int64_t size, PyObject* base,
+                     std::shared_ptr<Buffer>* out);
+
+ private:
+  PyForeignBuffer(const uint8_t* data, int64_t size, PyObject* base)
+      : Buffer(data, size) {
+    Py_INCREF(base);
+    base_.reset(base);
+  }
+
+  OwnedRefNoGIL base_;
+};
+
+// All this rigamarole because Cython is really poor with std::function<>
+
+using TransformCallback = std::function<void(
+    PyObject*, const std::shared_ptr<Buffer>& src, std::shared_ptr<Buffer>* out)>;
+
+struct TransformInputStreamVTable {
+  TransformCallback transform;
+};
+
+ARROW_PYTHON_EXPORT
+std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream(
+    std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable,
+    PyObject* arg);
+
+using StreamWrapFunc = std::function<Result<std::shared_ptr<io::InputStream>>(
+    std::shared_ptr<io::InputStream>)>;
+ARROW_PYTHON_EXPORT
+std::shared_ptr<StreamWrapFunc> MakeStreamTransformFunc(TransformInputStreamVTable vtable,
+                                                        PyObject* handler);
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/ipc.cc b/pyarrow/src/arrow/python/ipc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e18c13e06c050a6357bb629d862559d1cffbf1b6
--- /dev/null
+++ b/pyarrow/src/arrow/python/ipc.cc
@@ -0,0 +1,135 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ipc.h"
+
+#include <memory>
+
+#include "arrow/compute/cast.h"
+#include "arrow/python/pyarrow.h"
+
+namespace arrow {
+namespace py {
+
+PyRecordBatchReader::PyRecordBatchReader() {}
+
+Status PyRecordBatchReader::Init(std::shared_ptr<Schema> schema, PyObject* iterable) {
+  schema_ = std::move(schema);
+
+  iterator_.reset(PyObject_GetIter(iterable));
+  return CheckPyError();
+}
+
+std::shared_ptr<Schema> PyRecordBatchReader::schema() const { return schema_; }
+
+Status PyRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
+  PyAcquireGIL lock;
+
+  if (!iterator_) {
+    // End of stream
+    batch->reset();
+    return Status::OK();
+  }
+
+  OwnedRef py_batch(PyIter_Next(iterator_.obj()));
+  if (!py_batch) {
+    RETURN_IF_PYERROR();
+    // End of stream
+    batch->reset();
+    iterator_.reset();
+    return Status::OK();
+  }
+
+  return unwrap_batch(py_batch.obj()).Value(batch);
+}
+
+Result<std::shared_ptr<RecordBatchReader>> PyRecordBatchReader::Make(
+    std::shared_ptr<Schema> schema, PyObject* iterable) {
+  auto reader = std::shared_ptr<PyRecordBatchReader>(new PyRecordBatchReader());
+  RETURN_NOT_OK(reader->Init(std::move(schema), iterable));
+  return reader;
+}
+
+CastingRecordBatchReader::CastingRecordBatchReader() = default;
+
+Status CastingRecordBatchReader::Init(std::shared_ptr<RecordBatchReader> parent,
+                                      std::shared_ptr<Schema> schema) {
+  std::shared_ptr<Schema> src = parent->schema();
+
+  // The check for names has already been done in Python where it's easier to
+  // generate a nice error message.
+  int num_fields = schema->num_fields();
+  if (src->num_fields() != num_fields) {
+    return Status::Invalid("Number of fields not equal");
+  }
+
+  // Ensure all columns can be cast before succeeding
+  for (int i = 0; i < num_fields; i++) {
+    auto& src_type = src->field(i)->type();
+    auto& schema_type = schema->field(i)->type();
+    if (!src_type->Equals(schema_type) && !compute::CanCast(*src_type, *schema_type)) {
+      return Status::TypeError("Field ", i, " cannot be cast from ",
+                               src->field(i)->type()->ToString(), " to ",
+                               schema->field(i)->type()->ToString());
+    }
+  }
+
+  parent_ = std::move(parent);
+  schema_ = std::move(schema);
+
+  return Status::OK();
+}
+
+std::shared_ptr<Schema> CastingRecordBatchReader::schema() const { return schema_; }
+
+Status CastingRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
+  std::shared_ptr<RecordBatch> out;
+  ARROW_RETURN_NOT_OK(parent_->ReadNext(&out));
+  if (!out) {
+    batch->reset();
+    return Status::OK();
+  }
+
+  auto num_columns = out->num_columns();
+  auto options = compute::CastOptions::Safe();
+  ArrayVector columns(num_columns);
+  for (int i = 0; i < num_columns; i++) {
+    const Array& src = *out->column(i);
+    if (!schema_->field(i)->nullable() && src.null_count() > 0) {
+      return Status::Invalid(
+          "Can't cast array that contains nulls to non-nullable field at index ", i);
+    }
+
+    ARROW_ASSIGN_OR_RAISE(columns[i],
+                          compute::Cast(src, schema_->field(i)->type(), options));
+  }
+
+  *batch = RecordBatch::Make(schema_, out->num_rows(), std::move(columns));
+  return Status::OK();
+}
+
+Result<std::shared_ptr<RecordBatchReader>> CastingRecordBatchReader::Make(
+    std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema) {
+  auto reader = std::shared_ptr<CastingRecordBatchReader>(new CastingRecordBatchReader());
+  ARROW_RETURN_NOT_OK(reader->Init(parent, schema));
+  return reader;
+}
+
+Status CastingRecordBatchReader::Close() { return parent_->Close(); }
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/ipc.h b/pyarrow/src/arrow/python/ipc.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c16d8c967ff0bffc52e7803d4d894adb72b1215
--- /dev/null
+++ b/pyarrow/src/arrow/python/ipc.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/record_batch.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace py {
+
+class ARROW_PYTHON_EXPORT PyRecordBatchReader : public RecordBatchReader {
+ public:
+  std::shared_ptr<Schema> schema() const override;
+
+  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;
+
+  // For use from Cython
+  // Assumes that `iterable` is borrowed
+  static Result<std::shared_ptr<RecordBatchReader>> Make(std::shared_ptr<Schema>,
+                                                         PyObject* iterable);
+
+ protected:
+  PyRecordBatchReader();
+
+  Status Init(std::shared_ptr<Schema>, PyObject* iterable);
+
+  std::shared_ptr<Schema> schema_;
+  OwnedRefNoGIL iterator_;
+};
+
+class ARROW_PYTHON_EXPORT CastingRecordBatchReader : public RecordBatchReader {
+ public:
+  std::shared_ptr<Schema> schema() const override;
+
+  Status ReadNext(std::shared_ptr<RecordBatch>* batch) override;
+
+  static Result<std::shared_ptr<RecordBatchReader>> Make(
+      std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema);
+
+  Status Close() override;
+
+ protected:
+  CastingRecordBatchReader();
+
+  Status Init(std::shared_ptr<RecordBatchReader> parent, std::shared_ptr<Schema> schema);
+
+  std::shared_ptr<RecordBatchReader> parent_;
+  std::shared_ptr<Schema> schema_;
+};
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/iterators.h b/pyarrow/src/arrow/python/iterators.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd467f6ac407705e62787dd22c9413616647de17
--- /dev/null
+++ b/pyarrow/src/arrow/python/iterators.h
@@ -0,0 +1,200 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <utility>
+
+#include "arrow/array/array_primitive.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/numpy_init.h"
+#include "arrow/python/numpy_internal.h"
+
+namespace arrow {
+namespace py {
+namespace internal {
+
+using arrow::internal::checked_cast;
+
+// Visit the Python sequence, calling the given callable on each element.  If
+// the callable returns a non-OK status, iteration stops and the status is
+// returned.
+//
+// The call signature for Visitor must be
+//
+// Visit(PyObject* obj, int64_t index, bool* keep_going)
+//
+// If keep_going is set to false, the iteration terminates
+template <class VisitorFunc>
+inline Status VisitSequenceGeneric(PyObject* obj, int64_t offset, VisitorFunc&& func) {
+  // VisitorFunc may set to false to terminate iteration
+  bool keep_going = true;
+
+  if (has_numpy() && PyArray_Check(obj)) {
+    PyArrayObject* arr_obj = reinterpret_cast<PyArrayObject*>(obj);
+    if (PyArray_NDIM(arr_obj) != 1) {
+      return Status::Invalid("Only 1D arrays accepted");
+    }
+
+    if (PyArray_DESCR(arr_obj)->type_num == NPY_OBJECT) {
+      // It's an array object, we can fetch object pointers directly
+      const Ndarray1DIndexer<PyObject*> objects(arr_obj);
+      for (int64_t i = offset; keep_going && i < objects.size(); ++i) {
+        RETURN_NOT_OK(func(objects[i], i, &keep_going));
+      }
+      return Status::OK();
+    }
+    // It's a non-object array, fall back on regular sequence access.
+    // (note PyArray_GETITEM() is slightly different: it returns standard
+    //  Python types, not Numpy scalar types)
+    // This code path is inefficient: callers should implement dedicated
+    // logic for non-object arrays.
+  }
+
+  if (PySequence_Check(obj)) {
+#ifdef Py_GIL_DISABLED
+    if (PyTuple_Check(obj)) {
+#else
+    if (PyList_Check(obj) || PyTuple_Check(obj)) {
+#endif
+      // Use fast item access
+      const Py_ssize_t size = PySequence_Fast_GET_SIZE(obj);
+      for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
+        PyObject* value = PySequence_Fast_GET_ITEM(obj, i);
+        RETURN_NOT_OK(func(value, static_cast<int64_t>(i), &keep_going));
+      }
+    } else {
+      // Regular sequence: avoid making a potentially large copy
+      const Py_ssize_t size = PySequence_Size(obj);
+      RETURN_IF_PYERROR();
+      for (Py_ssize_t i = offset; keep_going && i < size; ++i) {
+        OwnedRef value_ref(PySequence_ITEM(obj, i));
+        RETURN_IF_PYERROR();
+        RETURN_NOT_OK(func(value_ref.obj(), static_cast<int64_t>(i), &keep_going));
+      }
+    }
+  } else {
+    return Status::TypeError("Object is not a sequence");
+  }
+  return Status::OK();
+}
+
+// Visit sequence with no null mask
+template <class VisitorFunc>
+inline Status VisitSequence(PyObject* obj, int64_t offset, VisitorFunc&& func) {
+  return VisitSequenceGeneric(
+      obj, offset, [&func](PyObject* value, int64_t i /* unused */, bool* keep_going) {
+        return func(value, keep_going);
+      });
+}
+
+/// Visit sequence with null mask
+template <class VisitorFunc>
+inline Status VisitSequenceMasked(PyObject* obj, PyObject* mo, int64_t offset,
+                                  VisitorFunc&& func) {
+  if (has_numpy() && PyArray_Check(mo)) {
+    PyArrayObject* mask = reinterpret_cast<PyArrayObject*>(mo);
+    if (PyArray_NDIM(mask) != 1) {
+      return Status::Invalid("Mask must be 1D array");
+    }
+    if (PyArray_SIZE(mask) != static_cast<int64_t>(PySequence_Size(obj))) {
+      return Status::Invalid("Mask was a different length from sequence being converted");
+    }
+
+    const int dtype = fix_numpy_type_num(PyArray_DESCR(mask)->type_num);
+    if (dtype == NPY_BOOL) {
+      Ndarray1DIndexer<uint8_t> mask_values(mask);
+
+      return VisitSequenceGeneric(
+          obj, offset,
+          [&func, &mask_values](PyObject* value, int64_t i, bool* keep_going) {
+            return func(value, mask_values[i], keep_going);
+          });
+    } else {
+      return Status::TypeError("Mask must be boolean dtype");
+    }
+  } else if (py::is_array(mo)) {
+    auto unwrap_mask_result = unwrap_array(mo);
+    ARROW_RETURN_NOT_OK(unwrap_mask_result);
+    std::shared_ptr<Array> mask_ = unwrap_mask_result.ValueOrDie();
+    if (mask_->type_id() != Type::type::BOOL) {
+      return Status::TypeError("Mask must be an array of booleans");
+    }
+
+    if (mask_->length() != PySequence_Size(obj)) {
+      return Status::Invalid("Mask was a different length from sequence being converted");
+    }
+
+    if (mask_->null_count() != 0) {
+      return Status::TypeError("Mask must be an array of booleans");
+    }
+
+    BooleanArray* boolmask = checked_cast<BooleanArray*>(mask_.get());
+    return VisitSequenceGeneric(
+        obj, offset, [&func, &boolmask](PyObject* value, int64_t i, bool* keep_going) {
+          return func(value, boolmask->Value(i), keep_going);
+        });
+  } else if (PySequence_Check(mo)) {
+    if (PySequence_Size(mo) != PySequence_Size(obj)) {
+      return Status::Invalid("Mask was a different length from sequence being converted");
+    }
+    RETURN_IF_PYERROR();
+
+    return VisitSequenceGeneric(
+        obj, offset, [&func, &mo](PyObject* value, int64_t i, bool* keep_going) {
+          OwnedRef value_ref(PySequence_ITEM(mo, i));
+          if (!PyBool_Check(value_ref.obj()))
+            return Status::TypeError("Mask must be a sequence of booleans");
+          return func(value, value_ref.obj() == Py_True, keep_going);
+        });
+  } else {
+    return Status::Invalid("Null mask must be a NumPy array, Arrow array or a Sequence");
+  }
+
+  return Status::OK();
+}
+
+// Like IterateSequence, but accepts any generic iterable (including
+// non-restartable iterators, e.g. generators).
+//
+// The call signature for VisitorFunc must be Visit(PyObject*, bool*
+// keep_going). If keep_going is set to false, the iteration terminates
+template <class VisitorFunc>
+inline Status VisitIterable(PyObject* obj, VisitorFunc&& func) {
+  if (PySequence_Check(obj)) {
+    // Numpy arrays fall here as well
+    return VisitSequence(obj, /*offset=*/0, std::forward<VisitorFunc>(func));
+  }
+  // Fall back on the iterator protocol
+  OwnedRef iter_ref(PyObject_GetIter(obj));
+  PyObject* iter = iter_ref.obj();
+  RETURN_IF_PYERROR();
+  PyObject* value;
+
+  bool keep_going = true;
+  while (keep_going && (value = PyIter_Next(iter))) {
+    OwnedRef value_ref(value);
+    RETURN_NOT_OK(func(value_ref.obj(), &keep_going));
+  }
+  RETURN_IF_PYERROR();  // __next__() might have raised
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/numpy_convert.cc b/pyarrow/src/arrow/python/numpy_convert.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4113cc67d2fc6f1bb899d6d822e50a85e771c4a3
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_convert.cc
@@ -0,0 +1,563 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/python/numpy_convert.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/sparse_tensor.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/type_traits.h"
+
+namespace arrow {
+namespace py {
+
+NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) {
+  PyAcquireGIL lock;
+  arr_ = ao;
+  Py_INCREF(ao);
+
+  if (PyArray_Check(ao)) {
+    PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
+    auto ptr = reinterpret_cast<uint8_t*>(PyArray_DATA(ndarray));
+    data_ = const_cast<const uint8_t*>(ptr);
+    size_ = PyArray_NBYTES(ndarray);
+    capacity_ = size_;
+    is_mutable_ = !!(PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE);
+  }
+}
+
+NumPyBuffer::~NumPyBuffer() {
+  PyAcquireGIL lock;
+  Py_XDECREF(arr_);
+}
+
+#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \
+  case NPY_##NPY_NAME:                        \
+    return FACTORY();
+
+namespace {
+
+Result<std::shared_ptr<DataType>> GetTensorType(PyObject* dtype) {
+  if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
+    return Status::TypeError("Did not pass numpy.dtype object");
+  }
+  PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
+  int type_num = fix_numpy_type_num(descr->type_num);
+
+  switch (type_num) {
+    TO_ARROW_TYPE_CASE(BOOL, uint8);
+    TO_ARROW_TYPE_CASE(INT8, int8);
+    TO_ARROW_TYPE_CASE(INT16, int16);
+    TO_ARROW_TYPE_CASE(INT32, int32);
+    TO_ARROW_TYPE_CASE(INT64, int64);
+    TO_ARROW_TYPE_CASE(UINT8, uint8);
+    TO_ARROW_TYPE_CASE(UINT16, uint16);
+    TO_ARROW_TYPE_CASE(UINT32, uint32);
+    TO_ARROW_TYPE_CASE(UINT64, uint64);
+    TO_ARROW_TYPE_CASE(FLOAT16, float16);
+    TO_ARROW_TYPE_CASE(FLOAT32, float32);
+    TO_ARROW_TYPE_CASE(FLOAT64, float64);
+  }
+  return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
+}
+
+Status GetNumPyType(const DataType& type, int* type_num) {
+#define NUMPY_TYPE_CASE(ARROW_NAME, NPY_NAME) \
+  case Type::ARROW_NAME:                      \
+    *type_num = NPY_##NPY_NAME;               \
+    break;
+
+  switch (type.id()) {
+    NUMPY_TYPE_CASE(UINT8, UINT8);
+    NUMPY_TYPE_CASE(INT8, INT8);
+    NUMPY_TYPE_CASE(UINT16, UINT16);
+    NUMPY_TYPE_CASE(INT16, INT16);
+    NUMPY_TYPE_CASE(UINT32, UINT32);
+    NUMPY_TYPE_CASE(INT32, INT32);
+    NUMPY_TYPE_CASE(UINT64, UINT64);
+    NUMPY_TYPE_CASE(INT64, INT64);
+    NUMPY_TYPE_CASE(HALF_FLOAT, FLOAT16);
+    NUMPY_TYPE_CASE(FLOAT, FLOAT32);
+    NUMPY_TYPE_CASE(DOUBLE, FLOAT64);
+    default: {
+      return Status::NotImplemented("Unsupported tensor type: ", type.ToString());
+    }
+  }
+#undef NUMPY_TYPE_CASE
+
+  return Status::OK();
+}
+
+}  // namespace
+
+Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar) {
+  PyArray_Descr* descr = PyArray_DescrFromScalar(scalar);
+  OwnedRef descr_ref(reinterpret_cast<PyObject*>(descr));
+  return NumPyDtypeToArrow(descr);
+}
+
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype) {
+  if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) {
+    return Status::TypeError("Did not pass numpy.dtype object");
+  }
+  PyArray_Descr* descr = reinterpret_cast<PyArray_Descr*>(dtype);
+  return NumPyDtypeToArrow(descr);
+}
+
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr) {
+  int type_num = fix_numpy_type_num(descr->type_num);
+
+  switch (type_num) {
+    TO_ARROW_TYPE_CASE(BOOL, boolean);
+    TO_ARROW_TYPE_CASE(INT8, int8);
+    TO_ARROW_TYPE_CASE(INT16, int16);
+    TO_ARROW_TYPE_CASE(INT32, int32);
+    TO_ARROW_TYPE_CASE(INT64, int64);
+    TO_ARROW_TYPE_CASE(UINT8, uint8);
+    TO_ARROW_TYPE_CASE(UINT16, uint16);
+    TO_ARROW_TYPE_CASE(UINT32, uint32);
+    TO_ARROW_TYPE_CASE(UINT64, uint64);
+    TO_ARROW_TYPE_CASE(FLOAT16, float16);
+    TO_ARROW_TYPE_CASE(FLOAT32, float32);
+    TO_ARROW_TYPE_CASE(FLOAT64, float64);
+    TO_ARROW_TYPE_CASE(STRING, binary);
+    TO_ARROW_TYPE_CASE(UNICODE, utf8);
+    case NPY_DATETIME: {
+      auto date_dtype =
+          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(descr));
+      switch (date_dtype->meta.base) {
+        case NPY_FR_s:
+          return timestamp(TimeUnit::SECOND);
+        case NPY_FR_ms:
+          return timestamp(TimeUnit::MILLI);
+        case NPY_FR_us:
+          return timestamp(TimeUnit::MICRO);
+        case NPY_FR_ns:
+          return timestamp(TimeUnit::NANO);
+        case NPY_FR_D:
+          return date32();
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic datetime64 time unit");
+        default:
+          return Status::NotImplemented("Unsupported datetime64 time unit");
+      }
+    } break;
+    case NPY_TIMEDELTA: {
+      auto timedelta_dtype =
+          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(descr));
+      switch (timedelta_dtype->meta.base) {
+        case NPY_FR_s:
+          return duration(TimeUnit::SECOND);
+        case NPY_FR_ms:
+          return duration(TimeUnit::MILLI);
+        case NPY_FR_us:
+          return duration(TimeUnit::MICRO);
+        case NPY_FR_ns:
+          return duration(TimeUnit::NANO);
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic timedelta64 time unit");
+        default:
+          return Status::NotImplemented("Unsupported timedelta64 time unit");
+      }
+    } break;
+  }
+
+  return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
+}
+
+#undef TO_ARROW_TYPE_CASE
+
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
+  if (!PyArray_Check(ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
+
+  // TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
+
+  int ndim = PyArray_NDIM(ndarray);
+
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(ao);
+  std::vector<int64_t> shape(ndim);
+  std::vector<int64_t> strides(ndim);
+
+  npy_intp* array_strides = PyArray_STRIDES(ndarray);
+  npy_intp* array_shape = PyArray_SHAPE(ndarray);
+  for (int i = 0; i < ndim; ++i) {
+    if (array_strides[i] < 0) {
+      return Status::Invalid("Negative ndarray strides not supported");
+    }
+    shape[i] = array_shape[i];
+    strides[i] = array_strides[i];
+  }
+
+  ARROW_ASSIGN_OR_RAISE(
+      auto type, GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray))));
+  *out = std::make_shared<Tensor>(type, data, shape, strides, dim_names);
+  return Status::OK();
+}
+
+Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
+                       PyObject** out) {
+  int type_num = 0;
+  RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num));
+  PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num);
+  RETURN_IF_PYERROR();
+
+  const int ndim = tensor->ndim();
+  std::vector<npy_intp> npy_shape(ndim);
+  std::vector<npy_intp> npy_strides(ndim);
+
+  for (int i = 0; i < ndim; ++i) {
+    npy_shape[i] = tensor->shape()[i];
+    npy_strides[i] = tensor->strides()[i];
+  }
+
+  const void* immutable_data = nullptr;
+  if (tensor->data()) {
+    immutable_data = tensor->data()->data();
+  }
+
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+
+  int array_flags = 0;
+  if (tensor->is_row_major()) {
+    array_flags |= NPY_ARRAY_C_CONTIGUOUS;
+  }
+  if (tensor->is_column_major()) {
+    array_flags |= NPY_ARRAY_F_CONTIGUOUS;
+  }
+  if (tensor->is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  PyObject* result =
+      PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, npy_shape.data(),
+                           npy_strides.data(), mutable_data, array_flags, nullptr);
+  RETURN_IF_PYERROR();
+
+  if (base == Py_None || base == nullptr) {
+    base = py::wrap_tensor(tensor);
+  } else {
+    Py_XINCREF(base);
+  }
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(result), base);
+  *out = result;
+  return Status::OK();
+}
+
+// Wrap the dense data of a sparse tensor in a ndarray
+static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor,
+                                        std::vector<npy_intp> data_shape, PyObject* base,
+                                        PyObject** out_data) {
+  int type_num_data = 0;
+  RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data));
+  PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data);
+  RETURN_IF_PYERROR();
+
+  const void* immutable_data = sparse_tensor.data()->data();
+  // Remove const =(
+  void* mutable_data = const_cast<void*>(immutable_data);
+  int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS;
+  if (sparse_tensor.is_mutable()) {
+    array_flags |= NPY_ARRAY_WRITEABLE;
+  }
+
+  *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data,
+                                   static_cast<int>(data_shape.size()), data_shape.data(),
+                                   nullptr, mutable_data, array_flags, nullptr);
+  RETURN_IF_PYERROR();
+  Py_XINCREF(base);
+  PyArray_SetBaseObject(reinterpret_cast<PyArrayObject*>(*out_data), base);
+  return Status::OK();
+}
+
+Status SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_coords) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCOOIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {static_cast<npy_intp>(sparse_tensor->non_zero_length()), 1}, base,
+      result_data.ref()));
+
+  // Wrap indices
+  PyObject* result_coords;
+  RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords));
+
+  *out_data = result_data.detach();
+  *out_coords = result_coords;
+  return Status::OK();
+}
+
+Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  // Wrap indices
+  OwnedRef result_indptr;
+  OwnedRef result_indices;
+
+  switch (sparse_tensor->format_id()) {
+    case SparseTensorFormat::CSR: {
+      const auto& sparse_index = arrow::internal::checked_cast<const SparseCSRIndex&>(
+          *sparse_tensor->sparse_index());
+      RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+      RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+      break;
+    }
+    case SparseTensorFormat::CSC: {
+      const auto& sparse_index = arrow::internal::checked_cast<const SparseCSCIndex&>(
+          *sparse_tensor->sparse_index());
+      RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref()));
+      RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref()));
+      break;
+    }
+    default:
+      return Status::NotImplemented("Invalid SparseTensor type.");
+  }
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {static_cast<npy_intp>(sparse_tensor->non_zero_length()), 1}, base,
+      result_data.ref()));
+
+  *out_data = result_data.detach();
+  *out_indptr = result_indptr.detach();
+  *out_indices = result_indices.detach();
+  return Status::OK();
+}
+
+Status SparseCSRMatrixToNdarray(const std::shared_ptr<SparseCSRMatrix>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices);
+}
+
+Status SparseCSCMatrixToNdarray(const std::shared_ptr<SparseCSCMatrix>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices);
+}
+
+Status SparseCSFTensorToNdarray(const std::shared_ptr<SparseCSFTensor>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices) {
+  const auto& sparse_index = arrow::internal::checked_cast<const SparseCSFIndex&>(
+      *sparse_tensor->sparse_index());
+
+  // Wrap tensor data
+  OwnedRef result_data;
+  RETURN_NOT_OK(SparseTensorDataToNdarray(
+      *sparse_tensor, {static_cast<npy_intp>(sparse_tensor->non_zero_length()), 1}, base,
+      result_data.ref()));
+
+  // Wrap indices
+  int ndim = static_cast<int>(sparse_index.indices().size());
+  OwnedRef indptr(PyList_New(ndim - 1));
+  OwnedRef indices(PyList_New(ndim));
+  RETURN_IF_PYERROR();
+
+  for (int i = 0; i < ndim - 1; ++i) {
+    PyObject* item;
+    RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr()[i], base, &item));
+    if (PyList_SetItem(indptr.obj(), i, item) < 0) {
+      Py_XDECREF(item);
+      RETURN_IF_PYERROR();
+    }
+  }
+  for (int i = 0; i < ndim; ++i) {
+    PyObject* item;
+    RETURN_NOT_OK(TensorToNdarray(sparse_index.indices()[i], base, &item));
+    if (PyList_SetItem(indices.obj(), i, item) < 0) {
+      Py_XDECREF(item);
+      RETURN_IF_PYERROR();
+    }
+  }
+
+  *out_indptr = indptr.detach();
+  *out_indices = indices.detach();
+  *out_data = result_data.detach();
+  return Status::OK();
+}
+
+Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+                                 const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseCOOTensor>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  ARROW_ASSIGN_OR_RAISE(
+      auto type_data,
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
+
+  std::shared_ptr<Tensor> coords;
+  RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords));
+  ARROW_CHECK_EQ(coords->type_id(), Type::INT64);  // Should be ensured by caller
+
+  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<SparseCOOIndex> sparse_index,
+                        SparseCOOIndex::Make(coords));
+  *out = std::make_shared<SparseTensorImpl<SparseCOOIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+template <class IndexType>
+Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseTensorImpl<IndexType>>* out) {
+  if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) ||
+      !PyArray_Check(indices_ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  ARROW_ASSIGN_OR_RAISE(
+      auto type_data,
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
+
+  std::shared_ptr<Tensor> indptr, indices;
+  RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr));
+  RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices));
+  ARROW_CHECK_EQ(indptr->type_id(), Type::INT64);   // Should be ensured by caller
+  ARROW_CHECK_EQ(indices->type_id(), Type::INT64);  // Should be ensured by caller
+
+  auto sparse_index = std::make_shared<IndexType>(
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indptr),
+      std::static_pointer_cast<NumericTensor<Int64Type>>(indices));
+  *out = std::make_shared<SparseTensorImpl<IndexType>>(sparse_index, type_data, data,
+                                                       shape, dim_names);
+  return Status::OK();
+}
+
+Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<int64_t>& axis_order,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseCSFTensor>* out) {
+  if (!PyArray_Check(data_ao)) {
+    return Status::TypeError("Did not pass ndarray object for data");
+  }
+  const int ndim = static_cast<const int>(shape.size());
+  PyArrayObject* ndarray_data = reinterpret_cast<PyArrayObject*>(data_ao);
+  std::shared_ptr<Buffer> data = std::make_shared<NumPyBuffer>(data_ao);
+  ARROW_ASSIGN_OR_RAISE(
+      auto type_data,
+      GetTensorType(reinterpret_cast<PyObject*>(PyArray_DESCR(ndarray_data))));
+
+  std::vector<std::shared_ptr<Tensor>> indptr(ndim - 1);
+  std::vector<std::shared_ptr<Tensor>> indices(ndim);
+
+  for (int i = 0; i < ndim - 1; ++i) {
+#ifdef Py_GIL_DISABLED
+    PyObject* item = PySequence_ITEM(indptr_ao, i);
+    RETURN_IF_PYERROR();
+    OwnedRef item_ref(item);
+#else
+    PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i);
+#endif
+    if (!PyArray_Check(item)) {
+      return Status::TypeError("Did not pass ndarray object for indptr");
+    }
+    RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indptr[i]));
+    ARROW_CHECK_EQ(indptr[i]->type_id(), Type::INT64);  // Should be ensured by caller
+  }
+
+  for (int i = 0; i < ndim; ++i) {
+#ifdef Py_GIL_DISABLED
+    PyObject* item = PySequence_ITEM(indices_ao, i);
+    RETURN_IF_PYERROR();
+    OwnedRef item_ref(item);
+#else
+    PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i);
+#endif
+    if (!PyArray_Check(item)) {
+      return Status::TypeError("Did not pass ndarray object for indices");
+    }
+    RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indices[i]));
+    ARROW_CHECK_EQ(indices[i]->type_id(), Type::INT64);  // Should be ensured by caller
+  }
+
+  auto sparse_index = std::make_shared<SparseCSFIndex>(indptr, indices, axis_order);
+  *out = std::make_shared<SparseTensorImpl<SparseCSFIndex>>(sparse_index, type_data, data,
+                                                            shape, dim_names);
+  return Status::OK();
+}
+
+Status NdarraysToSparseCSRMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseCSRMatrix>* out) {
+  return NdarraysToSparseCSXMatrix<SparseCSRIndex>(pool, data_ao, indptr_ao, indices_ao,
+                                                   shape, dim_names, out);
+}
+
+Status NdarraysToSparseCSCMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao,
+                                 PyObject* indices_ao, const std::vector<int64_t>& shape,
+                                 const std::vector<std::string>& dim_names,
+                                 std::shared_ptr<SparseCSCMatrix>* out) {
+  return NdarraysToSparseCSXMatrix<SparseCSCIndex>(pool, data_ao, indptr_ao, indices_ao,
+                                                   shape, dim_names, out);
+}
+
+Status TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseCOOTensor>* out) {
+  return SparseCOOTensor::Make(*tensor).Value(out);
+}
+
+Status TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseCSRMatrix>* out) {
+  return SparseCSRMatrix::Make(*tensor).Value(out);
+}
+
+Status TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseCSCMatrix>* out) {
+  return SparseCSCMatrix::Make(*tensor).Value(out);
+}
+
+Status TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,
+                               std::shared_ptr<SparseCSFTensor>* out) {
+  return SparseCSFTensor::Make(*tensor).Value(out);
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/numpy_convert.h b/pyarrow/src/arrow/python/numpy_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d1086e13552885f09431848fabf0829e670d681
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_convert.h
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/buffer.h"
+#include "arrow/python/visibility.h"
+#include "arrow/sparse_tensor.h"
+
+namespace arrow {
+
+class DataType;
+class MemoryPool;
+class Status;
+class Tensor;
+
+namespace py {
+
+class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
+ public:
+  explicit NumPyBuffer(PyObject* arr);
+  virtual ~NumPyBuffer();
+
+ private:
+  PyObject* arr_;
+};
+
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyObject* dtype);
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyDtypeToArrow(PyArray_Descr* descr);
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<DataType>> NumPyScalarToArrowDataType(PyObject* scalar);
+
+ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           const std::vector<std::string>& dim_names,
+                                           std::shared_ptr<Tensor>* out);
+
+ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
+                                           PyObject* base, PyObject** out);
+
+ARROW_PYTHON_EXPORT Status
+SparseCOOTensorToNdarray(const std::shared_ptr<SparseCOOTensor>& sparse_tensor,
+                         PyObject* base, PyObject** out_data, PyObject** out_coords);
+
+Status SparseCSXMatrixToNdarray(const std::shared_ptr<SparseTensor>& sparse_tensor,
+                                PyObject* base, PyObject** out_data,
+                                PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status SparseCSRMatrixToNdarray(
+    const std::shared_ptr<SparseCSRMatrix>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status SparseCSCMatrixToNdarray(
+    const std::shared_ptr<SparseCSCMatrix>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status SparseCSFTensorToNdarray(
+    const std::shared_ptr<SparseCSFTensor>& sparse_tensor, PyObject* base,
+    PyObject** out_data, PyObject** out_indptr, PyObject** out_indices);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCOOTensor(
+    MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseCOOTensor>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSRMatrix(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseCSRMatrix>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSCMatrix(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<std::string>& dim_names,
+    std::shared_ptr<SparseCSCMatrix>* out);
+
+ARROW_PYTHON_EXPORT Status NdarraysToSparseCSFTensor(
+    MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, PyObject* indices_ao,
+    const std::vector<int64_t>& shape, const std::vector<int64_t>& axis_order,
+    const std::vector<std::string>& dim_names, std::shared_ptr<SparseCSFTensor>* out);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCOOTensor(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCOOTensor>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSRMatrix(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCSRMatrix>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSCMatrix(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCSCMatrix>* csparse_tensor);
+
+ARROW_PYTHON_EXPORT Status
+TensorToSparseCSFTensor(const std::shared_ptr<Tensor>& tensor,
+                        std::shared_ptr<SparseCSFTensor>* csparse_tensor);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/numpy_init.cc b/pyarrow/src/arrow/python/numpy_init.cc
new file mode 100644
index 0000000000000000000000000000000000000000..96e2c7b7ccb5cc1bc988aa4826c2dbe856e5fd23
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_init.cc
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Trigger the array import (inversion of NO_IMPORT_ARRAY)
+#define NUMPY_IMPORT_ARRAY
+
+#include "arrow/python/numpy_init.h"
+#include "arrow/python/numpy_interop.h"
+
+namespace arrow::py {
+bool numpy_imported = false;
+
+int arrow_init_numpy() {
+  numpy_imported = true;
+  return arrow::py::import_numpy();
+}
+
+bool has_numpy() { return numpy_imported; }
+}  // namespace arrow::py
diff --git a/pyarrow/src/arrow/python/numpy_init.h b/pyarrow/src/arrow/python/numpy_init.h
new file mode 100644
index 0000000000000000000000000000000000000000..36c544c1b51fd431e1f7d3b4c4f01c0e18e527df
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_init.h
@@ -0,0 +1,27 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow::py {
+ARROW_PYTHON_EXPORT
+int arrow_init_numpy();
+bool has_numpy();
+}  // namespace arrow::py
diff --git a/pyarrow/src/arrow/python/numpy_internal.h b/pyarrow/src/arrow/python/numpy_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb930ce9c5b9149719c927b599199fc4d1934c96
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_internal.h
@@ -0,0 +1,195 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal utilities for dealing with NumPy
+
+#pragma once
+
+#include "arrow/python/numpy_init.h"
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/status.h"
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <sstream>
+#include <string>
+
+namespace arrow {
+namespace py {
+
+/// Indexing convenience for interacting with strided 1-dim ndarray objects
+template <typename T>
+class Ndarray1DIndexer {
+ public:
+  typedef int64_t size_type;
+
+  Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {}
+
+  explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() {
+    arr_ = arr;
+    ARROW_DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
+    data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
+    stride_ = PyArray_STRIDES(arr)[0];
+  }
+
+  ~Ndarray1DIndexer() = default;
+
+  int64_t size() const { return PyArray_SIZE(arr_); }
+
+  const T* data() const { return reinterpret_cast<const T*>(data_); }
+
+  bool is_strided() const { return stride_ != sizeof(T); }
+
+  T& operator[](size_type index) {
+    return *reinterpret_cast<T*>(data_ + index * stride_);
+  }
+  const T& operator[](size_type index) const {
+    return *reinterpret_cast<const T*>(data_ + index * stride_);
+  }
+
+ private:
+  PyArrayObject* arr_;
+  uint8_t* data_;
+  int64_t stride_;
+};
+
+// Handling of Numpy Types by their static numbers
+// (the NPY_TYPES enum and related defines)
+
+static inline std::string GetNumPyTypeName(int npy_type) {
+#define TYPE_CASE(TYPE, NAME) \
+  case NPY_##TYPE:            \
+    return NAME;
+
+  switch (npy_type) {
+    TYPE_CASE(BOOL, "bool")
+    TYPE_CASE(INT8, "int8")
+    TYPE_CASE(INT16, "int16")
+    TYPE_CASE(INT32, "int32")
+    TYPE_CASE(INT64, "int64")
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(INT, "intc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_CASE(LONGLONG, "longlong")
+#endif
+    TYPE_CASE(UINT8, "uint8")
+    TYPE_CASE(UINT16, "uint16")
+    TYPE_CASE(UINT32, "uint32")
+    TYPE_CASE(UINT64, "uint64")
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(UINT, "uintc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_CASE(ULONGLONG, "ulonglong")
+#endif
+    TYPE_CASE(FLOAT16, "float16")
+    TYPE_CASE(FLOAT32, "float32")
+    TYPE_CASE(FLOAT64, "float64")
+    TYPE_CASE(DATETIME, "datetime64")
+    TYPE_CASE(TIMEDELTA, "timedelta64")
+    TYPE_CASE(OBJECT, "object")
+    TYPE_CASE(VOID, "void")
+    default:
+      break;
+  }
+
+#undef TYPE_CASE
+  std::stringstream ss;
+  ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
+  return ss.str();
+}
+
+#define TYPE_VISIT_INLINE(TYPE) \
+  case NPY_##TYPE:              \
+    return visitor->template Visit<NPY_##TYPE>(arr);
+
+template <typename VISITOR>
+inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
+  switch (PyArray_TYPE(arr)) {
+    TYPE_VISIT_INLINE(BOOL);
+    TYPE_VISIT_INLINE(INT8);
+    TYPE_VISIT_INLINE(UINT8);
+    TYPE_VISIT_INLINE(INT16);
+    TYPE_VISIT_INLINE(UINT16);
+    TYPE_VISIT_INLINE(INT32);
+    TYPE_VISIT_INLINE(UINT32);
+    TYPE_VISIT_INLINE(INT64);
+    TYPE_VISIT_INLINE(UINT64);
+#if !NPY_INT32_IS_INT
+    TYPE_VISIT_INLINE(INT);
+    TYPE_VISIT_INLINE(UINT);
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_VISIT_INLINE(LONGLONG);
+    TYPE_VISIT_INLINE(ULONGLONG);
+#endif
+    TYPE_VISIT_INLINE(FLOAT16);
+    TYPE_VISIT_INLINE(FLOAT32);
+    TYPE_VISIT_INLINE(FLOAT64);
+    TYPE_VISIT_INLINE(DATETIME);
+    TYPE_VISIT_INLINE(TIMEDELTA);
+    TYPE_VISIT_INLINE(OBJECT);
+  }
+  return Status::NotImplemented("NumPy type not implemented: ",
+                                GetNumPyTypeName(PyArray_TYPE(arr)));
+}
+
+#undef TYPE_VISIT_INLINE
+
+namespace internal {
+
+inline bool PyFloatScalar_Check(PyObject* obj) {
+  if (has_numpy()) {
+    return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+  } else {
+    return PyFloat_Check(obj);
+  }
+}
+
+inline bool PyIntScalar_Check(PyObject* obj) {
+  if (has_numpy()) {
+    return PyLong_Check(obj) || PyArray_IsScalar(obj, Integer);
+  } else {
+    return PyLong_Check(obj);
+  }
+}
+
+inline bool PyBoolScalar_Check(PyObject* obj) {
+  if (has_numpy()) {
+    return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
+  } else {
+    return PyBool_Check(obj);
+  }
+}
+
+static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
+  if (type == NPY_DATETIME || type == NPY_TIMEDELTA) {
+    // It is not safe to mutate the result of DescrFromType for datetime and
+    // timedelta descriptors
+    return PyArray_DescrNewFromType(type);
+  } else {
+    return PyArray_DescrFromType(type);
+  }
+}
+
+}  // namespace internal
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/numpy_interop.h b/pyarrow/src/arrow/python/numpy_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..a83ae4a62b944c71af70d58c7107befd659baa8c
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_interop.h
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"  // IWYU pragma: export
+
+#include <numpy/numpyconfig.h>  // IWYU pragma: export
+
+// Don't use the deprecated Numpy functions
+#ifdef NPY_1_7_API_VERSION
+#  define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#else
+#  define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED
+#  define NPY_ARRAY_ALIGNED NPY_ALIGNED
+#  define NPY_ARRAY_WRITEABLE NPY_WRITEABLE
+#  define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY
+#endif
+
+// This is required to be able to access the NumPy C API properly in C++ files
+// other than init.cc.
+#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API
+#ifndef NUMPY_IMPORT_ARRAY
+#  define NO_IMPORT_ARRAY
+#endif
+
+#include <numpy/arrayobject.h>   // IWYU pragma: export
+#include <numpy/arrayscalars.h>  // IWYU pragma: export
+#include <numpy/ufuncobject.h>   // IWYU pragma: export
+
+// A bit subtle. Numpy has 5 canonical integer types:
+// (or, rather, type pairs: signed and unsigned)
+//   NPY_BYTE, NPY_SHORT, NPY_INT, NPY_LONG, NPY_LONGLONG
+// It also has 4 fixed-width integer aliases.
+// When mapping Arrow integer types to these 4 fixed-width aliases,
+// we always miss one of the canonical types (even though it may
+// have the same width as one of the aliases).
+// Which one depends on the platform...
+// On a LP64 system, NPY_INT64 maps to NPY_LONG and
+// NPY_LONGLONG needs to be handled separately.
+// On a LLP64 system, NPY_INT32 maps to NPY_LONG and
+// NPY_INT needs to be handled separately.
+
+#if NPY_BITSOF_LONG == 32 && NPY_BITSOF_LONGLONG == 64
+#  define NPY_INT64_IS_LONG_LONG 1
+#else
+#  define NPY_INT64_IS_LONG_LONG 0
+#endif
+
+#if NPY_BITSOF_INT == 32 && NPY_BITSOF_LONG == 64
+#  define NPY_INT32_IS_INT 1
+#else
+#  define NPY_INT32_IS_INT 0
+#endif
+
+// Backported NumPy 2 API (can be removed if numpy 2 is required)
+#if NPY_ABI_VERSION < 0x02000000
+#  define PyDataType_ELSIZE(descr) ((descr)->elsize)
+#  define PyDataType_C_METADATA(descr) ((descr)->c_metadata)
+#  define PyDataType_FIELDS(descr) ((descr)->fields)
+#endif
+
+namespace arrow {
+namespace py {
+
+inline int import_numpy() {
+#ifdef NUMPY_IMPORT_ARRAY
+  import_array1(-1);
+  import_umath1(-1);
+#endif
+
+  return 0;
+}
+
+// See above about the missing Numpy integer type numbers
+inline int fix_numpy_type_num(int type_num) {
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+  if (type_num == NPY_INT) return NPY_INT32;
+  if (type_num == NPY_UINT) return NPY_UINT32;
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
+  if (type_num == NPY_LONGLONG) return NPY_INT64;
+  if (type_num == NPY_ULONGLONG) return NPY_UINT64;
+#endif
+  return type_num;
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/numpy_to_arrow.cc b/pyarrow/src/arrow/python/numpy_to_arrow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5647e895d0f70787bdd6cb6869b890c9099a8e8b
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -0,0 +1,945 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for pandas conversion via NumPy
+
+#include "arrow/python/numpy_to_arrow.h"
+#include "arrow/python/numpy_interop.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_generate.h"
+#include "arrow/util/bitmap_ops.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/endian.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string.h"
+#include "arrow/util/utf8.h"
+#include "arrow/visit_type_inline.h"
+
+#include "arrow/compute/api_scalar.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_internal.h"
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/type_traits.h"
+#include "arrow/python/vendored/pythoncapi_compat.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::CopyBitmap;
+using internal::GenerateBitsUnrolled;
+
+namespace py {
+
+using internal::NumPyTypeSize;
+
+// ----------------------------------------------------------------------
+// Conversion utilities
+
+namespace {
+
+Status AllocateNullBitmap(MemoryPool* pool, int64_t length,
+                          std::shared_ptr<ResizableBuffer>* out) {
+  int64_t null_bytes = bit_util::BytesForBits(length);
+  ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool));
+
+  // Padding zeroed by AllocateResizableBuffer
+  memset(null_bitmap->mutable_data(), 0, static_cast<size_t>(null_bytes));
+  *out = std::move(null_bitmap);
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
+// Conversion from NumPy-in-Pandas to Arrow null bitmap
+
+template <int TYPE>
+inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
+  typedef internal::npy_traits<TYPE> traits;
+  typedef typename traits::value_type T;
+
+  int64_t null_count = 0;
+
+  Ndarray1DIndexer<T> values(arr);
+  for (int i = 0; i < values.size(); ++i) {
+    if (traits::isnull(values[i])) {
+      ++null_count;
+    } else {
+      bit_util::SetBit(bitmap, i);
+    }
+  }
+
+  return null_count;
+}
+
+class NumPyNullsConverter {
+ public:
+  /// Convert the given array's null values to a null bitmap.
+  /// The null bitmap is only allocated if null values are ever possible.
+  static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas,
+                        std::shared_ptr<ResizableBuffer>* out_null_bitmap_,
+                        int64_t* out_null_count) {
+    NumPyNullsConverter converter(pool, arr, from_pandas);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out_null_bitmap_ = converter.null_bitmap_;
+    *out_null_count = converter.null_count_;
+    return Status::OK();
+  }
+
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    typedef internal::npy_traits<TYPE> traits;
+
+    const bool null_sentinels_possible =
+        // Always treat Numpy's NaT as null
+        TYPE == NPY_DATETIME || TYPE == NPY_TIMEDELTA ||
+        // Observing pandas's null sentinels
+        (from_pandas_ && traits::supports_nulls);
+
+    if (null_sentinels_possible) {
+      RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_));
+      null_count_ = ValuesToBitmap<TYPE>(arr, null_bitmap_->mutable_data());
+    }
+    return Status::OK();
+  }
+
+ protected:
+  NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas)
+      : pool_(pool),
+        arr_(arr),
+        from_pandas_(from_pandas),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {}
+
+  MemoryPool* pool_;
+  PyArrayObject* arr_;
+  bool from_pandas_;
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+  int64_t null_count_;
+};
+
+// Returns null count
+int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) {
+  int64_t null_count = 0;
+
+  if (!PyArray_Check(mask)) return -1;
+
+  Ndarray1DIndexer<uint8_t> mask_values(mask);
+  for (int i = 0; i < length; ++i) {
+    if (mask_values[i]) {
+      ++null_count;
+      bit_util::ClearBit(bitmap, i);
+    } else {
+      bit_util::SetBit(bitmap, i);
+    }
+  }
+  return null_count;
+}
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// Conversion from NumPy arrays (possibly originating from pandas) to Arrow
+// format. Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for
+// that
+
+class NumPyConverter {
+ public:
+  NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
+                 const std::shared_ptr<DataType>& type, bool from_pandas,
+                 const compute::CastOptions& cast_options = compute::CastOptions())
+      : pool_(pool),
+        type_(type),
+        arr_(reinterpret_cast<PyArrayObject*>(arr)),
+        dtype_(PyArray_DESCR(arr_)),
+        mask_(nullptr),
+        from_pandas_(from_pandas),
+        cast_options_(cast_options),
+        null_bitmap_data_(nullptr),
+        null_count_(0) {
+    if (mo != nullptr && mo != Py_None) {
+      mask_ = reinterpret_cast<PyArrayObject*>(mo);
+    }
+    length_ = static_cast<int64_t>(PyArray_SIZE(arr_));
+    itemsize_ = static_cast<int64_t>(PyArray_ITEMSIZE(arr_));
+    stride_ = static_cast<int64_t>(PyArray_STRIDES(arr_)[0]);
+  }
+
+  bool is_strided() const { return itemsize_ != stride_; }
+
+  Status Convert();
+
+  const ArrayVector& result() const { return out_arrays_; }
+
+  template <typename T>
+  enable_if_primitive_ctype<T, Status> Visit(const T& type) {
+    return VisitNative<T>();
+  }
+
+  Status Visit(const HalfFloatType& type) { return VisitNative<UInt16Type>(); }
+
+  Status Visit(const Date32Type& type) { return VisitNative<Date32Type>(); }
+  Status Visit(const Date64Type& type) { return VisitNative<Date64Type>(); }
+  Status Visit(const TimestampType& type) { return VisitNative<TimestampType>(); }
+  Status Visit(const Time32Type& type) { return VisitNative<Int32Type>(); }
+  Status Visit(const Time64Type& type) { return VisitNative<Int64Type>(); }
+  Status Visit(const DurationType& type) { return VisitNative<DurationType>(); }
+
+  Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); }
+
+  // NumPy ascii string arrays
+  Status Visit(const BinaryType& type);
+  Status Visit(const LargeBinaryType& type);
+  Status Visit(const BinaryViewType& type);
+
+  // NumPy unicode arrays
+  Status Visit(const StringType& type);
+  Status Visit(const LargeStringType& type);
+  Status Visit(const StringViewType& type);
+
+  Status Visit(const StructType& type);
+
+  Status Visit(const FixedSizeBinaryType& type);
+
+  // Default case
+  Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); }
+
+ protected:
+  Status InitNullBitmap() {
+    RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_));
+    null_bitmap_data_ = null_bitmap_->mutable_data();
+    return Status::OK();
+  }
+
+  // Called before ConvertData to ensure Numpy input buffer is in expected
+  // Arrow layout
+  template <typename ArrowType>
+  Status PrepareInputData(std::shared_ptr<Buffer>* data);
+
+  // ----------------------------------------------------------------------
+  // Traditional visitor conversion for non-object arrays
+
+  template <typename ArrowType>
+  Status ConvertData(std::shared_ptr<Buffer>* data);
+
+  template <typename T>
+  Status PushBuilderResult(T* builder) {
+    std::shared_ptr<Array> out;
+    RETURN_NOT_OK(builder->Finish(&out));
+    out_arrays_.emplace_back(out);
+    return Status::OK();
+  }
+
+  Status PushArray(const std::shared_ptr<ArrayData>& data) {
+    out_arrays_.emplace_back(MakeArray(data));
+    return Status::OK();
+  }
+
+  template <typename ArrowType>
+  Status VisitNative() {
+    if (mask_ != nullptr) {
+      RETURN_NOT_OK(InitNullBitmap());
+      null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_);
+      if (null_count_ == -1) return Status::Invalid("Invalid mask type");
+    } else {
+      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
+                                                 &null_count_));
+    }
+
+    std::shared_ptr<Buffer> data;
+    RETURN_NOT_OK(ConvertData<ArrowType>(&data));
+
+    auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0);
+    return PushArray(arr_data);
+  }
+
+  template <typename T>
+  Status VisitBinary(T* builder);
+
+  template <typename T>
+  Status VisitString(T* builder);
+
+  Status TypeNotImplemented(std::string type_name) {
+    return Status::NotImplemented("NumPyConverter doesn't implement <", type_name,
+                                  "> conversion. ");
+  }
+
+  MemoryPool* pool_;
+  std::shared_ptr<DataType> type_;
+  PyArrayObject* arr_;
+  PyArray_Descr* dtype_;
+  PyArrayObject* mask_;
+  int64_t length_;
+  int64_t stride_;
+  int64_t itemsize_;
+
+  bool from_pandas_;
+  compute::CastOptions cast_options_;
+
+  // Used in visitor pattern
+  ArrayVector out_arrays_;
+
+  std::shared_ptr<ResizableBuffer> null_bitmap_;
+  uint8_t* null_bitmap_data_;
+  int64_t null_count_;
+};
+
+Status NumPyConverter::Convert() {
+  if (PyArray_NDIM(arr_) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  if (dtype_->type_num == NPY_OBJECT) {
+    // If an object array, convert it like a normal Python sequence
+    PyConversionOptions py_options;
+    py_options.type = type_;
+    py_options.from_pandas = from_pandas_;
+    ARROW_ASSIGN_OR_RAISE(
+        auto chunked_array,
+        ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
+                          reinterpret_cast<PyObject*>(mask_), py_options, pool_));
+    out_arrays_ = chunked_array->chunks();
+    return Status::OK();
+  }
+
+  if (type_ == nullptr) {
+    return Status::Invalid("Must pass data type for non-object arrays");
+  }
+
+  // Visit the type to perform conversion
+  return VisitTypeInline(*type_, this);
+}
+
+namespace {
+
+Status CastBuffer(const std::shared_ptr<DataType>& in_type,
+                  const std::shared_ptr<Buffer>& input, const int64_t length,
+                  const std::shared_ptr<Buffer>& valid_bitmap, const int64_t null_count,
+                  const std::shared_ptr<DataType>& out_type,
+                  const compute::CastOptions& cast_options, MemoryPool* pool,
+                  std::shared_ptr<Buffer>* out) {
+  // Must cast
+  auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count);
+  compute::ExecContext context(pool);
+  ARROW_ASSIGN_OR_RAISE(
+      std::shared_ptr<Array> casted_array,
+      compute::Cast(*MakeArray(tmp_data), out_type, cast_options, &context));
+  *out = casted_array->data()->buffers[1];
+  return Status::OK();
+}
+
+template <typename FromType, typename ToType>
+Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool,
+                        std::shared_ptr<Buffer>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool));
+
+  auto in_values = reinterpret_cast<const FromType*>(input.data());
+  auto out_values = reinterpret_cast<ToType*>(result->mutable_data());
+  for (int64_t i = 0; i < length; ++i) {
+    *out_values++ = static_cast<ToType>(*in_values++);
+  }
+  *out = std::move(result);
+  return Status::OK();
+}
+
+template <typename T>
+void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride,
+                         T* output_data) {
+  // Passing input_data as non-const is a concession to PyObject*
+  for (int64_t i = 0; i < length; ++i) {
+    memcpy(output_data + i, input_data, sizeof(T));
+    input_data += stride;
+  }
+}
+
+template <typename T>
+void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) {
+  // Passing input_data as non-const is a concession to PyObject*
+  int64_t j = 0;
+  for (int64_t i = 0; i < length; ++i) {
+    output_data[i] = input_data[j];
+    j += stride;
+  }
+}
+
+class NumPyStridedConverter {
+ public:
+  static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
+                        std::shared_ptr<Buffer>* out) {
+    NumPyStridedConverter converter(arr, length, pool);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out = converter.buffer_;
+    return Status::OK();
+  }
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    using traits = internal::npy_traits<TYPE>;
+    using T = typename traits::value_type;
+
+    ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_));
+
+    const int64_t stride = PyArray_STRIDES(arr)[0];
+    // ARROW-16013: convert sizeof(T) to signed int64 first, otherwise dividing by it
+    // would do an unsigned division. This cannot be caught by tests without ubsan, since
+    // common signed overflow behavior and the fact that the sizeof(T) is currently always
+    // a power of two here cause CopyStridedNatural to still produce correct results
+    const int64_t element_size = sizeof(T);
+    if (stride % element_size == 0) {
+      const int64_t stride_elements = stride / element_size;
+      CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_,
+                         stride_elements, reinterpret_cast<T*>(buffer_->mutable_data()));
+    } else {
+      CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_, stride,
+                          reinterpret_cast<T*>(buffer_->mutable_data()));
+    }
+    return Status::OK();
+  }
+
+ protected:
+  NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
+      : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
+  PyArrayObject* arr_;
+  int64_t length_;
+  MemoryPool* pool_;
+  std::shared_ptr<Buffer> buffer_;
+};
+
+}  // namespace
+
+template <typename ArrowType>
+inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
+  if (PyArray_ISBYTESWAPPED(arr_)) {
+    // TODO
+    return Status::NotImplemented("Byte-swapped arrays not supported");
+  }
+
+  if (dtype_->type_num == NPY_BOOL) {
+    int64_t nbytes = bit_util::BytesForBits(length_);
+    ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_));
+
+    Ndarray1DIndexer<uint8_t> values(arr_);
+    int64_t i = 0;
+    const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
+    GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
+
+    *data = std::move(buffer);
+  } else if (is_strided()) {
+    RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
+  } else {
+    // Can zero-copy
+    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
+  }
+
+  return Status::OK();
+}
+
+template <typename ArrowType>
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+  RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
+
+  ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_));
+
+  if (!input_type->Equals(*type_)) {
+    RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_,
+                             cast_options_, pool_, data));
+  }
+
+  return Status::OK();
+}
+
+template <>
+inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
+  std::shared_ptr<DataType> input_type;
+
+  RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
+
+  auto date_dtype =
+      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
+  if (dtype_->type_num == NPY_DATETIME) {
+    // If we have inbound datetime64[D] data, this needs to be downcasted
+    // separately here from int64_t to int32_t, because this data is not
+    // supported in compute::Cast
+    if (date_dtype->meta.base == NPY_FR_D) {
+      // TODO(wesm): How pedantic do we really want to be about checking for int32
+      // overflow here?
+      Status s = StaticCastBuffer<int64_t, int32_t>(**data, length_, pool_, data);
+      RETURN_NOT_OK(s);
+    } else {
+      ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
+      if (!input_type->Equals(*type_)) {
+        // The null bitmap was already computed in VisitNative()
+        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+                                 type_, cast_options_, pool_, data));
+      }
+    }
+  } else {
+    ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
+    if (!input_type->Equals(*type_)) {
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+                               type_, cast_options_, pool_, data));
+    }
+  }
+
+  return Status::OK();
+}
+
+template <>
+inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
+  constexpr int64_t kMillisecondsInDay = 86400000;
+  std::shared_ptr<DataType> input_type;
+
+  RETURN_NOT_OK(PrepareInputData<Date64Type>(data));
+
+  auto date_dtype =
+      reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(PyDataType_C_METADATA(dtype_));
+  if (dtype_->type_num == NPY_DATETIME) {
+    // If we have inbound datetime64[D] data, this needs to be downcasted
+    // separately here from int64_t to int32_t, because this data is not
+    // supported in compute::Cast
+    if (date_dtype->meta.base == NPY_FR_D) {
+      ARROW_ASSIGN_OR_RAISE(auto result,
+                            AllocateBuffer(sizeof(int64_t) * length_, pool_));
+
+      auto in_values = reinterpret_cast<const int64_t*>((*data)->data());
+      auto out_values = reinterpret_cast<int64_t*>(result->mutable_data());
+      for (int64_t i = 0; i < length_; ++i) {
+        *out_values++ = kMillisecondsInDay * (*in_values++);
+      }
+      *data = std::move(result);
+    } else {
+      ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
+      if (!input_type->Equals(*type_)) {
+        // The null bitmap was already computed in VisitNative()
+        RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+                                 type_, cast_options_, pool_, data));
+      }
+    }
+  } else {
+    ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_));
+    if (!input_type->Equals(*type_)) {
+      RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_,
+                               type_, cast_options_, pool_, data));
+    }
+  }
+
+  return Status::OK();
+}
+
+// Create 16MB chunks for binary data
+constexpr int32_t kBinaryChunksize = 1 << 24;
+
+template <typename T>
+Status NumPyConverter::VisitBinary(T* builder) {
+  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+  auto AppendNotNull = [builder, this](const uint8_t* data) {
+    // This is annoying. NumPy allows strings to have nul-terminators, so
+    // we must check for them here
+    const size_t item_size =
+        strnlen(reinterpret_cast<const char*>(data), static_cast<size_t>(itemsize_));
+    return builder->Append(data, static_cast<int32_t>(item_size));
+  };
+
+  if (mask_ != nullptr) {
+    Ndarray1DIndexer<uint8_t> mask_values(mask_);
+    for (int64_t i = 0; i < length_; ++i) {
+      if (mask_values[i]) {
+        RETURN_NOT_OK(builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(AppendNotNull(data));
+      }
+      data += stride_;
+    }
+  } else {
+    for (int64_t i = 0; i < length_; ++i) {
+      RETURN_NOT_OK(AppendNotNull(data));
+      data += stride_;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status NumPyConverter::Visit(const BinaryType& type) {
+  ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_);
+
+  RETURN_NOT_OK(VisitBinary(&builder));
+
+  ArrayVector result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  for (auto arr : result) {
+    RETURN_NOT_OK(PushArray(arr->data()));
+  }
+  return Status::OK();
+}
+
+Status NumPyConverter::Visit(const LargeBinaryType& type) {
+  ::arrow::LargeBinaryBuilder builder(pool_);
+
+  RETURN_NOT_OK(VisitBinary(&builder));
+
+  std::shared_ptr<Array> result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  return PushArray(result->data());
+}
+
+Status NumPyConverter::Visit(const BinaryViewType& type) {
+  ::arrow::BinaryViewBuilder builder(pool_);
+
+  RETURN_NOT_OK(VisitBinary(&builder));
+
+  std::shared_ptr<Array> result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  return PushArray(result->data());
+}
+
+Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
+  auto byte_width = type.byte_width();
+
+  if (itemsize_ != byte_width) {
+    return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ",
+                           byte_width, ")");
+  }
+
+  FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_);
+  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+  if (mask_ != nullptr) {
+    Ndarray1DIndexer<uint8_t> mask_values(mask_);
+    RETURN_NOT_OK(builder.Reserve(length_));
+    for (int64_t i = 0; i < length_; ++i) {
+      if (mask_values[i]) {
+        RETURN_NOT_OK(builder.AppendNull());
+      } else {
+        RETURN_NOT_OK(builder.Append(data));
+      }
+      data += stride_;
+    }
+  } else {
+    for (int64_t i = 0; i < length_; ++i) {
+      RETURN_NOT_OK(builder.Append(data));
+      data += stride_;
+    }
+  }
+
+  std::shared_ptr<Array> result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  return PushArray(result->data());
+}
+
+namespace {
+
+// NumPy unicode is UCS4/UTF32 always
+constexpr int kNumPyUnicodeSize = 4;
+
+template <typename T>
+Status AppendUTF32(const char* data, int64_t itemsize, int byteorder, T* builder) {
+  // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode,
+  // so we need to detect that here to truncate if necessary. Yep.
+  Py_ssize_t actual_length = 0;
+  for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) {
+    const char* code_point = data + actual_length * kNumPyUnicodeSize;
+    if ((*code_point == '\0') && (*(code_point + 1) == '\0') &&
+        (*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) {
+      break;
+    }
+  }
+
+  OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize,
+                                             nullptr, &byteorder));
+  RETURN_IF_PYERROR();
+  OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj()));
+  if (utf8_obj.obj() == NULL) {
+    PyErr_Clear();
+    return Status::Invalid("failed converting UTF32 to UTF8");
+  }
+
+  const int32_t length = static_cast<int32_t>(PyBytes_GET_SIZE(utf8_obj.obj()));
+  return builder->Append(
+      reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(utf8_obj.obj())), length);
+}
+
+}  // namespace
+
+template <typename T>
+Status NumPyConverter::VisitString(T* builder) {
+  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+  char numpy_byteorder = dtype_->byteorder;
+
+  // For Python C API, -1 is little-endian, 1 is big-endian
+#if ARROW_LITTLE_ENDIAN
+  // Yield little-endian from both '|' (native) and '<'
+  int byteorder = numpy_byteorder == '>' ? 1 : -1;
+#else
+  // Yield big-endian from both '|' (native) and '>'
+  int byteorder = numpy_byteorder == '<' ? -1 : 1;
+#endif
+
+  PyAcquireGIL gil_lock;
+
+  const bool is_binary_type = dtype_->type_num == NPY_STRING;
+  const bool is_unicode_type = dtype_->type_num == NPY_UNICODE;
+
+  if (!is_binary_type && !is_unicode_type) {
+    const bool is_float_type = dtype_->kind == 'f';
+    if (from_pandas_ && is_float_type) {
+      // in case of from_pandas=True, accept an all-NaN float array as input
+      RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_,
+                                                 &null_count_));
+      if (null_count_ == length_) {
+        auto arr = std::make_shared<NullArray>(length_);
+        compute::ExecContext context(pool_);
+        ARROW_ASSIGN_OR_RAISE(
+            std::shared_ptr<Array> out,
+            compute::Cast(*arr, arrow::utf8(), cast_options_, &context));
+        out_arrays_.emplace_back(out);
+        return Status::OK();
+      }
+    }
+    std::string dtype_string;
+    RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast<PyObject*>(dtype_),
+                                                  &dtype_string));
+    return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string);
+  }
+
+  auto AppendNonNullValue = [&](const uint8_t* data) {
+    if (is_binary_type) {
+      if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
+        return builder->Append(data, static_cast<int32_t>(itemsize_));
+      } else {
+        return Status::Invalid("Encountered non-UTF8 binary value: ",
+                               HexEncode(data, itemsize_));
+      }
+    } else {
+      // is_unicode_type case
+      return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
+                         builder);
+    }
+  };
+
+  if (mask_ != nullptr) {
+    Ndarray1DIndexer<uint8_t> mask_values(mask_);
+    for (int64_t i = 0; i < length_; ++i) {
+      if (mask_values[i]) {
+        RETURN_NOT_OK(builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(AppendNonNullValue(data));
+      }
+      data += stride_;
+    }
+  } else {
+    for (int64_t i = 0; i < length_; ++i) {
+      RETURN_NOT_OK(AppendNonNullValue(data));
+      data += stride_;
+    }
+  }
+
+  return Status::OK();
+}
+
+Status NumPyConverter::Visit(const StringType& type) {
+  util::InitializeUTF8();
+
+  ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_);
+
+  RETURN_NOT_OK(VisitString(&builder));
+
+  ArrayVector result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  for (auto arr : result) {
+    RETURN_NOT_OK(PushArray(arr->data()));
+  }
+  return Status::OK();
+}
+
+Status NumPyConverter::Visit(const LargeStringType& type) {
+  util::InitializeUTF8();
+
+  ::arrow::LargeStringBuilder builder(pool_);
+
+  RETURN_NOT_OK(VisitString(&builder));
+
+  std::shared_ptr<Array> result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  RETURN_NOT_OK(PushArray(result->data()));
+  return Status::OK();
+}
+
+Status NumPyConverter::Visit(const StringViewType& type) {
+  util::InitializeUTF8();
+
+  ::arrow::StringViewBuilder builder(pool_);
+
+  RETURN_NOT_OK(VisitString(&builder));
+
+  std::shared_ptr<Array> result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  RETURN_NOT_OK(PushArray(result->data()));
+  return Status::OK();
+}
+
+Status NumPyConverter::Visit(const StructType& type) {
+  std::vector<NumPyConverter> sub_converters;
+  std::vector<OwnedRefNoGIL> sub_arrays;
+
+  {
+    PyAcquireGIL gil_lock;
+
+    // Create converters for each struct type field
+    if (PyDataType_FIELDS(dtype_) == NULL || !PyDict_Check(PyDataType_FIELDS(dtype_))) {
+      return Status::TypeError("Expected struct array");
+    }
+
+    for (auto field : type.fields()) {
+      PyObject* tup;
+      PyDict_GetItemStringRef(PyDataType_FIELDS(dtype_), field->name().c_str(), &tup);
+      RETURN_IF_PYERROR();
+      OwnedRef tupref(tup);
+      if (tup == NULL) {
+        return Status::Invalid("Missing field '", field->name(), "' in struct array");
+      }
+      PyArray_Descr* sub_dtype =
+          reinterpret_cast<PyArray_Descr*>(PyTuple_GET_ITEM(tup, 0));
+      ARROW_DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type));
+      int offset = static_cast<int>(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1)));
+      RETURN_IF_PYERROR();
+      Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */
+      PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset);
+      RETURN_IF_PYERROR();
+      sub_arrays.emplace_back(sub_array);
+      sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
+                                  from_pandas_);
+    }
+  }
+
+  std::vector<ArrayVector> groups;
+  int64_t null_count = 0;
+
+  // Compute null bitmap and store it as a Boolean Array to include it
+  // in the rechunking below
+  {
+    if (mask_ != nullptr) {
+      RETURN_NOT_OK(InitNullBitmap());
+      null_count = MaskToBitmap(mask_, length_, null_bitmap_data_);
+      if (null_count_ == -1) return Status::Invalid("Invalid mask type");
+    }
+    groups.push_back({std::make_shared<BooleanArray>(length_, null_bitmap_)});
+  }
+
+  // Convert child data
+  for (auto& converter : sub_converters) {
+    RETURN_NOT_OK(converter.Convert());
+    groups.push_back(converter.result());
+  }
+  // Ensure the different array groups are chunked consistently
+  groups = ::arrow::internal::RechunkArraysConsistently(groups);
+
+  // Make struct array chunks by combining groups
+  size_t ngroups = groups.size();
+  size_t nchunks = groups[0].size();
+  for (size_t chunk = 0; chunk < nchunks; chunk++) {
+    // First group has the null bitmaps as Boolean Arrays
+    const auto& null_data = groups[0][chunk]->data();
+    ARROW_DCHECK_EQ(null_data->type->id(), Type::BOOL);
+    ARROW_DCHECK_EQ(null_data->buffers.size(), 2);
+    const auto& null_buffer = null_data->buffers[1];
+    // Careful: the rechunked null bitmap may have a non-zero offset
+    // to its buffer, and it may not even start on a byte boundary
+    int64_t null_offset = null_data->offset;
+    std::shared_ptr<Buffer> fixed_null_buffer;
+
+    if (!null_buffer) {
+      fixed_null_buffer = null_buffer;
+    } else if (null_offset % 8 == 0) {
+      fixed_null_buffer =
+          std::make_shared<Buffer>(null_buffer,
+                                   // byte offset
+                                   null_offset / 8,
+                                   // byte size
+                                   bit_util::BytesForBits(null_data->length));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          fixed_null_buffer,
+          CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length));
+    }
+
+    // Create struct array chunk and populate it
+    auto arr_data =
+        ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0);
+    arr_data->buffers.push_back(fixed_null_buffer);
+    // Append child chunks
+    for (size_t i = 1; i < ngroups; i++) {
+      arr_data->child_data.push_back(groups[i][chunk]->data());
+    }
+    RETURN_NOT_OK(PushArray(arr_data));
+  }
+
+  return Status::OK();
+}
+
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out) {
+  if (!PyArray_Check(ao)) {
+    // This code path cannot be reached by Python unit tests currently so this
+    // is only a sanity check.
+    return Status::TypeError("Input object was not a NumPy array");
+  }
+  if (PyArray_NDIM(reinterpret_cast<PyArrayObject*>(ao)) != 1) {
+    return Status::Invalid("only handle 1-dimensional arrays");
+  }
+
+  NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
+  RETURN_NOT_OK(converter.Convert());
+  const auto& output_arrays = converter.result();
+  ARROW_DCHECK_GT(output_arrays.size(), 0);
+  *out = std::make_shared<ChunkedArray>(output_arrays);
+  return Status::OK();
+}
+
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out) {
+  return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/numpy_to_arrow.h b/pyarrow/src/arrow/python/numpy_to_arrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6cd093e5542008cf173f43de311e40c418e7c8d
--- /dev/null
+++ b/pyarrow/src/arrow/python/numpy_to_arrow.h
@@ -0,0 +1,72 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Converting from pandas memory representation to Arrow data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/compute/api.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+class Array;
+class ChunkedArray;
+class DataType;
+class MemoryPool;
+class Status;
+
+namespace py {
+
+/// Convert NumPy arrays to Arrow. If target data type is not known, pass a
+/// type with null
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
+/// \param[in] cast_options casting options
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_PYTHON_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      const compute::CastOptions& cast_options,
+                      std::shared_ptr<ChunkedArray>* out);
+
+/// Safely convert NumPy arrays to Arrow. If target data type is not known,
+/// pass a type with null.
+///
+/// \param[in] pool Memory pool for any memory allocations
+/// \param[in] ao an ndarray with the array data
+/// \param[in] mo an ndarray with a null mask (True is null), optional
+/// \param[in] from_pandas If true, use pandas's null sentinels to determine
+/// whether values are null
+/// \param[in] type a specific type to cast to, may be null
+/// \param[out] out a ChunkedArray, to accommodate chunked output
+ARROW_PYTHON_EXPORT
+Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
+                      const std::shared_ptr<DataType>& type,
+                      std::shared_ptr<ChunkedArray>* out);
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/parquet_encryption.cc b/pyarrow/src/arrow/python/parquet_encryption.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4fcce64cdbe811ef061955957b048f5409ef0909
--- /dev/null
+++ b/pyarrow/src/arrow/python/parquet_encryption.cc
@@ -0,0 +1,113 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/parquet_encryption.h"
+#include "parquet/exception.h"
+
+namespace arrow {
+namespace py {
+namespace parquet {
+namespace encryption {
+
+PyKmsClient::PyKmsClient(PyObject* handler, PyKmsClientVtable vtable)
+    : handler_(handler), vtable_(std::move(vtable)) {
+  Py_INCREF(handler);
+}
+
+PyKmsClient::~PyKmsClient() {}
+
+std::string PyKmsClient::WrapKey(const ::arrow::util::SecureString& key,
+                                 const std::string& master_key_identifier) {
+  std::string wrapped;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.wrap_key(handler_.obj(), key, master_key_identifier, &wrapped);
+    return CheckPyError();
+  });
+  if (!st.ok()) {
+    throw ::parquet::ParquetStatusException(st);
+  }
+  return wrapped;
+}
+
+::arrow::util::SecureString PyKmsClient::UnwrapKey(
+    const std::string& wrapped_key, const std::string& master_key_identifier) {
+  arrow::util::SecureString unwrapped;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.unwrap_key(handler_.obj(), wrapped_key, master_key_identifier, &unwrapped);
+    return CheckPyError();
+  });
+  if (!st.ok()) {
+    throw ::parquet::ParquetStatusException(st);
+  }
+  return unwrapped;
+}
+
+PyKmsClientFactory::PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable)
+    : handler_(handler), vtable_(std::move(vtable)) {
+  Py_INCREF(handler);
+}
+
+PyKmsClientFactory::~PyKmsClientFactory() {}
+
+std::shared_ptr<::parquet::encryption::KmsClient> PyKmsClientFactory::CreateKmsClient(
+    const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) {
+  std::shared_ptr<::parquet::encryption::KmsClient> kms_client;
+  auto st = SafeCallIntoPython([&]() -> Status {
+    vtable_.create_kms_client(handler_.obj(), kms_connection_config, &kms_client);
+    return CheckPyError();
+  });
+  if (!st.ok()) {
+    throw ::parquet::ParquetStatusException(st);
+  }
+  return kms_client;
+}
+
+arrow::Result<std::shared_ptr<::parquet::FileEncryptionProperties>>
+PyCryptoFactory::SafeGetFileEncryptionProperties(
+    const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+    const ::parquet::encryption::EncryptionConfiguration& encryption_config,
+    const std::string& parquet_file_path,
+    const std::shared_ptr<::arrow::fs::FileSystem>& filesystem) {
+  PARQUET_CATCH_AND_RETURN(this->GetFileEncryptionProperties(
+      kms_connection_config, encryption_config, parquet_file_path, filesystem));
+}
+
+arrow::Result<std::shared_ptr<::parquet::FileDecryptionProperties>>
+PyCryptoFactory::SafeGetFileDecryptionProperties(
+    const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+    const ::parquet::encryption::DecryptionConfiguration& decryption_config,
+    const std::string& parquet_file_path,
+    const std::shared_ptr<::arrow::fs::FileSystem>& filesystem) {
+  PARQUET_CATCH_AND_RETURN(this->GetFileDecryptionProperties(
+      kms_connection_config, decryption_config, parquet_file_path, filesystem));
+}
+
+arrow::Status PyCryptoFactory::SafeRotateMasterKeys(
+    const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+    const std::string& parquet_file_path,
+    const std::shared_ptr<::arrow::fs::FileSystem>& filesystem, bool double_wrapping,
+    double cache_lifetime_seconds) {
+  PARQUET_CATCH_NOT_OK(this->RotateMasterKeys(kms_connection_config, parquet_file_path,
+                                              filesystem, double_wrapping,
+                                              cache_lifetime_seconds));
+  return arrow::Status::OK();
+}
+
+}  // namespace encryption
+}  // namespace parquet
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/parquet_encryption.h b/pyarrow/src/arrow/python/parquet_encryption.h
new file mode 100644
index 0000000000000000000000000000000000000000..b485b8b11537009479787d5bba8c50c6e2744ec5
--- /dev/null
+++ b/pyarrow/src/arrow/python/parquet_encryption.h
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+#include "arrow/result.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/secure_string.h"
+#include "parquet/encryption/crypto_factory.h"
+#include "parquet/encryption/file_system_key_material_store.h"
+#include "parquet/encryption/key_material.h"
+#include "parquet/encryption/kms_client.h"
+#include "parquet/encryption/kms_client_factory.h"
+
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_PYTHON_STATIC
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
+#  elif defined(ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORTING)
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __declspec(dllimport)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT
+#    define ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif  // Non-Windows
+
+namespace arrow {
+namespace py {
+namespace parquet {
+namespace encryption {
+
+/// \brief A table of function pointers for calling from C++ into
+/// Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientVtable {
+ public:
+  std::function<void(PyObject*, const ::arrow::util::SecureString& key,
+                     const std::string& master_key_identifier, std::string* out)>
+      wrap_key;
+  std::function<void(PyObject*, const std::string& wrapped_key,
+                     const std::string& master_key_identifier,
+                     ::arrow::util::SecureString* out)>
+      unwrap_key;
+};
+
+/// \brief A helper for KmsClient implementation in Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClient
+    : public ::parquet::encryption::KmsClient {
+ public:
+  PyKmsClient(PyObject* handler, PyKmsClientVtable vtable);
+  ~PyKmsClient() override;
+
+  std::string WrapKey(const ::arrow::util::SecureString& key,
+                      const std::string& master_key_identifier) override;
+
+  ::arrow::util::SecureString UnwrapKey(
+      const std::string& wrapped_key, const std::string& master_key_identifier) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyKmsClientVtable vtable_;
+};
+
+/// \brief A table of function pointers for calling from C++ into
+/// Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactoryVtable {
+ public:
+  std::function<void(
+      PyObject*, const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      std::shared_ptr<::parquet::encryption::KmsClient>* out)>
+      create_kms_client;
+};
+
+/// \brief A helper for KmsClientFactory implementation in Python.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyKmsClientFactory
+    : public ::parquet::encryption::KmsClientFactory {
+ public:
+  PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable);
+  ~PyKmsClientFactory() override;
+
+  std::shared_ptr<::parquet::encryption::KmsClient> CreateKmsClient(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) override;
+
+ private:
+  OwnedRefNoGIL handler_;
+  PyKmsClientFactoryVtable vtable_;
+};
+
+/// \brief A CryptoFactory that returns Results instead of throwing exceptions.
+class ARROW_PYTHON_PARQUET_ENCRYPTION_EXPORT PyCryptoFactory
+    : public ::parquet::encryption::CryptoFactory {
+ public:
+  arrow::Result<std::shared_ptr<::parquet::FileEncryptionProperties>>
+  SafeGetFileEncryptionProperties(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      const ::parquet::encryption::EncryptionConfiguration& encryption_config,
+      const std::string& parquet_file_path,
+      const std::shared_ptr<::arrow::fs::FileSystem>& filesystem);
+
+  /// The returned FileDecryptionProperties object will use the cache inside this
+  /// CryptoFactory object, so please keep this
+  /// CryptoFactory object alive along with the returned
+  /// FileDecryptionProperties object.
+  arrow::Result<std::shared_ptr<::parquet::FileDecryptionProperties>>
+  SafeGetFileDecryptionProperties(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      const ::parquet::encryption::DecryptionConfiguration& decryption_config,
+      const std::string& parquet_file_path,
+      const std::shared_ptr<::arrow::fs::FileSystem>& filesystem);
+
+  arrow::Status SafeRotateMasterKeys(
+      const ::parquet::encryption::KmsConnectionConfig& kms_connection_config,
+      const std::string& parquet_file_path,
+      const std::shared_ptr<::arrow::fs::FileSystem>& filesystem, bool double_wrapping,
+      double cache_lifetime_seconds);
+};
+
+}  // namespace encryption
+}  // namespace parquet
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/platform.h b/pyarrow/src/arrow/python/platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..04fb9fb8089944ea41b3ef5bfb3addaac710f39e
--- /dev/null
+++ b/pyarrow/src/arrow/python/platform.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between pandas's NumPy-based data representation
+// and Arrow data structures
+
+#pragma once
+
+// If PY_SSIZE_T_CLEAN is defined, argument parsing functions treat #-specifier
+// to mean Py_ssize_t (defining this to suppress deprecation warning)
+#define PY_SSIZE_T_CLEAN
+
+#include <Python.h>  // IWYU pragma: export
+#include <datetime.h>
+
+// Work around C2528 error
+#ifdef _MSC_VER
+#  if _MSC_VER >= 1900
+#    undef timezone
+#  endif
+#endif
diff --git a/pyarrow/src/arrow/python/pyarrow.cc b/pyarrow/src/arrow/python/pyarrow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f675921708e198a7cab706fe6a0188818f73dd1
--- /dev/null
+++ b/pyarrow/src/arrow/python/pyarrow.cc
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/pyarrow.h"
+
+#include <memory>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/table.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+namespace {
+#include "arrow/python/pyarrow_api.h"
+}
+
+namespace arrow {
+namespace py {
+
+static Status UnwrapError(PyObject* obj, const char* expected_type) {
+  return Status::TypeError("Could not unwrap ", expected_type,
+                           " from Python object of type '", Py_TYPE(obj)->tp_name, "'");
+}
+
+int import_pyarrow() {
+#ifdef PYPY_VERSION
+  PyDateTime_IMPORT;
+#else
+  internal::InitDatetime();
+#endif
+  return ::import_pyarrow__lib();
+}
+
+#define DEFINE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME)                                   \
+  bool is_##FUNC_SUFFIX(PyObject* obj) { return ::pyarrow_is_##FUNC_SUFFIX(obj) != 0; } \
+                                                                                        \
+  PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>& src) {                 \
+    return ::pyarrow_wrap_##FUNC_SUFFIX(src);                                           \
+  }                                                                                     \
+  Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX(PyObject* obj) {              \
+    auto out = ::pyarrow_unwrap_##FUNC_SUFFIX(obj);                                     \
+    if (out) {                                                                          \
+      return std::move(out);                                                            \
+    } else {                                                                            \
+      return UnwrapError(obj, #TYPE_NAME);                                              \
+    }                                                                                   \
+  }
+
+DEFINE_WRAP_FUNCTIONS(buffer, Buffer)
+
+DEFINE_WRAP_FUNCTIONS(data_type, DataType)
+DEFINE_WRAP_FUNCTIONS(field, Field)
+DEFINE_WRAP_FUNCTIONS(schema, Schema)
+
+DEFINE_WRAP_FUNCTIONS(scalar, Scalar)
+
+DEFINE_WRAP_FUNCTIONS(array, Array)
+DEFINE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
+
+DEFINE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
+DEFINE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
+DEFINE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
+DEFINE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
+DEFINE_WRAP_FUNCTIONS(tensor, Tensor)
+
+DEFINE_WRAP_FUNCTIONS(batch, RecordBatch)
+DEFINE_WRAP_FUNCTIONS(table, Table)
+
+#undef DEFINE_WRAP_FUNCTIONS
+
+namespace internal {
+
+int check_status(const Status& status) { return ::pyarrow_internal_check_status(status); }
+
+PyObject* convert_status(const Status& status) {
+  ARROW_DCHECK(!status.ok());
+  return ::pyarrow_internal_convert_status(status);
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/pyarrow.h b/pyarrow/src/arrow/python/pyarrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..113035500c0053dbb9dde5a99216aec1aefd1140
--- /dev/null
+++ b/pyarrow/src/arrow/python/pyarrow.h
@@ -0,0 +1,89 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <memory>
+
+#include "arrow/python/visibility.h"
+
+#include "arrow/sparse_tensor.h"
+
+// Work around ARROW-2317 (C linkage warning from Cython)
+extern "C++" {
+
+namespace arrow {
+
+class Array;
+class Buffer;
+class DataType;
+class Field;
+class RecordBatch;
+class Schema;
+class Status;
+class Table;
+class Tensor;
+
+namespace py {
+
+// Returns 0 on success, -1 on error.
+ARROW_PYTHON_EXPORT int import_pyarrow();
+
+#define DECLARE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME)                         \
+  ARROW_PYTHON_EXPORT bool is_##FUNC_SUFFIX(PyObject*);                        \
+  ARROW_PYTHON_EXPORT Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX( \
+      PyObject*);                                                              \
+  ARROW_PYTHON_EXPORT PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>&);
+
+DECLARE_WRAP_FUNCTIONS(buffer, Buffer)
+
+DECLARE_WRAP_FUNCTIONS(data_type, DataType)
+DECLARE_WRAP_FUNCTIONS(field, Field)
+DECLARE_WRAP_FUNCTIONS(schema, Schema)
+
+DECLARE_WRAP_FUNCTIONS(scalar, Scalar)
+
+DECLARE_WRAP_FUNCTIONS(array, Array)
+DECLARE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
+
+DECLARE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
+DECLARE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
+DECLARE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
+DECLARE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
+DECLARE_WRAP_FUNCTIONS(tensor, Tensor)
+
+DECLARE_WRAP_FUNCTIONS(batch, RecordBatch)
+DECLARE_WRAP_FUNCTIONS(table, Table)
+
+#undef DECLARE_WRAP_FUNCTIONS
+
+namespace internal {
+
+// If status is ok, return 0.
+// If status is not ok, set Python error indicator and return -1.
+ARROW_PYTHON_EXPORT int check_status(const Status& status);
+
+// Convert status to a Python exception object.  Status must not be ok.
+ARROW_PYTHON_EXPORT PyObject* convert_status(const Status& status);
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
+
+}  // extern "C++"
diff --git a/pyarrow/src/arrow/python/pyarrow_api.h b/pyarrow/src/arrow/python/pyarrow_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..a476e55a2a111332ed8594ace0fd29e2987046cb
--- /dev/null
+++ b/pyarrow/src/arrow/python/pyarrow_api.h
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// For backward compatibility.
+#include "arrow/python/lib_api.h"
diff --git a/pyarrow/src/arrow/python/pyarrow_lib.h b/pyarrow/src/arrow/python/pyarrow_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..e509593c254468a62216e0e4a7ea073ad9a3f1d4
--- /dev/null
+++ b/pyarrow/src/arrow/python/pyarrow_lib.h
@@ -0,0 +1,19 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// For backward compatibility.
+#include "arrow/python/lib.h"
diff --git a/pyarrow/src/arrow/python/python_test.cc b/pyarrow/src/arrow/python/python_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85cc6c9a969884ea46680602383c582f0648edaf
--- /dev/null
+++ b/pyarrow/src/arrow/python/python_test.cc
@@ -0,0 +1,911 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <memory>
+#include <optional>
+#include <sstream>
+#include <string>
+
+#include "platform.h"
+
+#include "arrow/array.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/table.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/arrow_to_pandas.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/numpy_internal.h"
+#include "arrow/python/numpy_interop.h"
+#include "arrow/python/python_test.h"
+#include "arrow/python/python_to_arrow.h"
+
+#define ASSERT_EQ(x, y)                                                        \
+  {                                                                            \
+    auto&& _left = (x);                                                        \
+    auto&& _right = (y);                                                       \
+    if (_left != _right) {                                                     \
+      return Status::Invalid("Expected equality between `", #x, "` and `", #y, \
+                             "`, but ", arrow::py::testing::ToString(_left),   \
+                             " != ", arrow::py::testing::ToString(_right));    \
+    }                                                                          \
+  }
+
+#define ASSERT_NE(x, y)                                                          \
+  {                                                                              \
+    auto&& _left = (x);                                                          \
+    auto&& _right = (y);                                                         \
+    if (_left == _right) {                                                       \
+      return Status::Invalid("Expected inequality between `", #x, "` and `", #y, \
+                             "`, but ", arrow::py::testing::ToString(_left),     \
+                             " == ", arrow::py::testing::ToString(_right));      \
+    }                                                                            \
+  }
+
+#define ASSERT_FALSE(v)                                                            \
+  {                                                                                \
+    auto&& _v = (v);                                                               \
+    if (!!_v) {                                                                    \
+      return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \
+                             arrow::py::testing::ToString(_v));                    \
+    }                                                                              \
+  }
+
+#define ASSERT_TRUE(v)                                                            \
+  {                                                                               \
+    auto&& _v = (v);                                                              \
+    if (!_v) {                                                                    \
+      return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \
+                             arrow::py::testing::ToString(_v));                   \
+    }                                                                             \
+  }
+
+#define ASSERT_FALSE_MSG(v, msg)                                                   \
+  {                                                                                \
+    auto&& _v = (v);                                                               \
+    if (!!_v) {                                                                    \
+      return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \
+                             arrow::py::testing::ToString(_v), ": ", msg);         \
+    }                                                                              \
+  }
+
+#define ASSERT_TRUE_MSG(v, msg)                                                   \
+  {                                                                               \
+    auto&& _v = (v);                                                              \
+    if (!_v) {                                                                    \
+      return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \
+                             arrow::py::testing::ToString(_v), ": ", msg);        \
+    }                                                                             \
+  }
+
+#define ASSERT_OK(expr)                                                     \
+  {                                                                         \
+    for (::arrow::Status _st = ::arrow::ToStatus((expr)); !_st.ok();)       \
+      return Status::Invalid("`", #expr, "` failed with ", _st.ToString()); \
+  }
+
+#define ASSERT_RAISES(code, expr)                                                     \
+  {                                                                                   \
+    for (::arrow::Status _st_expr = ::arrow::ToStatus((expr)); !_st_expr.Is##code();) \
+      return Status::Invalid("Expected `", #expr, "` to fail with ", #code,           \
+                             ", but got ", _st_expr.ToString());                      \
+  }
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace py {
+namespace testing {
+
+// ARROW-17938: Some standard libraries have ambiguous operator<<(nullptr_t),
+// work around it using a custom printer function.
+
+template <typename T>
+std::string ToString(const T& t) {
+  std::stringstream ss;
+  ss << t;
+  return ss.str();
+}
+
+template <>
+std::string ToString(const std::nullptr_t&) {
+  return "nullptr";
+}
+
+namespace {
+
+Status TestOwnedRefMoves() {
+  std::vector<OwnedRef> vec;
+  PyObject *u, *v;
+  u = PyList_New(0);
+  v = PyList_New(0);
+
+  {
+    OwnedRef ref(u);
+    vec.push_back(std::move(ref));
+    ASSERT_EQ(ref.obj(), nullptr);
+  }
+  vec.emplace_back(v);
+  ASSERT_EQ(Py_REFCNT(u), 1);
+  ASSERT_EQ(Py_REFCNT(v), 1);
+  return Status::OK();
+}
+
+Status TestOwnedRefNoGILMoves() {
+  PyAcquireGIL lock;
+  lock.release();
+
+  {
+    std::vector<OwnedRef> vec;
+    PyObject *u, *v;
+    {
+      lock.acquire();
+      u = PyList_New(0);
+      v = PyList_New(0);
+      lock.release();
+    }
+    {
+      OwnedRefNoGIL ref(u);
+      vec.push_back(std::move(ref));
+      ASSERT_EQ(ref.obj(), nullptr);
+    }
+    vec.emplace_back(v);
+    ASSERT_EQ(Py_REFCNT(u), 1);
+    ASSERT_EQ(Py_REFCNT(v), 1);
+    return Status::OK();
+  }
+}
+
+std::string FormatPythonException(const std::string& exc_class_name,
+                                  const std::string& exc_value) {
+  std::stringstream ss;
+  ss << "Python exception: ";
+  ss << exc_class_name;
+  ss << ": ";
+  ss << exc_value;
+  ss << "\n";
+  return ss.str();
+}
+
+Status TestCheckPyErrorStatus() {
+  Status st;
+  std::string expected_detail = "";
+
+  auto check_error = [](Status& st, const char* expected_message = "some error",
+                        std::string expected_detail = "") {
+    st = CheckPyError();
+    ASSERT_EQ(st.message(), expected_message);
+    ASSERT_FALSE(PyErr_Occurred());
+    if (expected_detail.size() > 0) {
+      auto detail = st.detail();
+      ASSERT_NE(detail, nullptr);
+      ASSERT_EQ(detail->ToString(), expected_detail);
+    }
+    return Status::OK();
+  };
+
+  for (PyObject* exc_type : {PyExc_Exception, PyExc_SyntaxError}) {
+    PyErr_SetString(exc_type, "some error");
+    ASSERT_OK(check_error(st));
+    ASSERT_TRUE(st.IsUnknownError());
+  }
+
+  PyErr_SetString(PyExc_TypeError, "some error");
+  ASSERT_OK(
+      check_error(st, "some error", FormatPythonException("TypeError", "some error")));
+  ASSERT_TRUE(st.IsTypeError());
+
+  PyErr_SetString(PyExc_ValueError, "some error");
+  ASSERT_OK(check_error(st));
+  ASSERT_TRUE(st.IsInvalid());
+
+  PyErr_SetString(PyExc_KeyError, "some error");
+  ASSERT_OK(check_error(st, "'some error'"));
+  ASSERT_TRUE(st.IsKeyError());
+
+  for (PyObject* exc_type : {PyExc_OSError, PyExc_IOError}) {
+    PyErr_SetString(exc_type, "some error");
+    ASSERT_OK(check_error(st));
+    ASSERT_TRUE(st.IsIOError());
+  }
+
+  PyErr_SetString(PyExc_NotImplementedError, "some error");
+  ASSERT_OK(check_error(st, "some error",
+                        FormatPythonException("NotImplementedError", "some error")));
+  ASSERT_TRUE(st.IsNotImplemented());
+
+  // No override if a specific status code is given
+  PyErr_SetString(PyExc_TypeError, "some error");
+  st = CheckPyError(StatusCode::SerializationError);
+  ASSERT_TRUE(st.IsSerializationError());
+  ASSERT_EQ(st.message(), "some error");
+  ASSERT_FALSE(PyErr_Occurred());
+
+  return Status::OK();
+}
+
+Status TestCheckPyErrorStatusNoGIL() {
+  PyAcquireGIL lock;
+  {
+    Status st;
+    PyErr_SetString(PyExc_ZeroDivisionError, "zzzt");
+    st = ConvertPyError();
+    ASSERT_FALSE(PyErr_Occurred());
+    lock.release();
+    ASSERT_TRUE(st.IsUnknownError());
+    ASSERT_EQ(st.message(), "zzzt");
+    ASSERT_EQ(st.detail()->ToString(),
+              FormatPythonException("ZeroDivisionError", "zzzt"));
+    return Status::OK();
+  }
+}
+
+Status TestRestorePyErrorBasics() {
+  PyErr_SetString(PyExc_ZeroDivisionError, "zzzt");
+  auto st = ConvertPyError();
+  ASSERT_FALSE(PyErr_Occurred());
+  ASSERT_TRUE(st.IsUnknownError());
+  ASSERT_EQ(st.message(), "zzzt");
+  ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError", "zzzt"));
+
+  RestorePyError(st);
+  ASSERT_TRUE(PyErr_Occurred());
+  PyObject* exc_type;
+  PyObject* exc_value;
+  PyObject* exc_traceback;
+  PyErr_Fetch(&exc_type, &exc_value, &exc_traceback);
+  ASSERT_TRUE(PyErr_GivenExceptionMatches(exc_type, PyExc_ZeroDivisionError));
+  std::string py_message;
+  ASSERT_OK(internal::PyObject_StdStringStr(exc_value, &py_message));
+  ASSERT_EQ(py_message, "zzzt");
+
+  return Status::OK();
+}
+
+Status TestPyBufferInvalidInputObject() {
+  std::shared_ptr<Buffer> res;
+  PyObject* input = Py_None;
+  auto old_refcnt = Py_REFCNT(input);
+  {
+    Status st = PyBuffer::FromPyObject(input).status();
+    ASSERT_TRUE_MSG(IsPyError(st), st.ToString());
+    ASSERT_FALSE(PyErr_Occurred());
+  }
+  ASSERT_EQ(old_refcnt, Py_REFCNT(input));
+  return Status::OK();
+}
+
+// Because of how it is declared, the Numpy C API instance initialized
+// within libarrow_python.dll may not be visible in this test under Windows
+// ("unresolved external symbol arrow_ARRAY_API referenced").
+#ifndef _WIN32
+Status TestPyBufferNumpyArray() {
+  npy_intp dims[1] = {10};
+
+  OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT));
+  PyObject* arr = arr_ref.obj();
+  ASSERT_NE(arr, nullptr);
+  auto old_refcnt = Py_REFCNT(arr);
+  auto buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie();
+
+  ASSERT_TRUE(buf->is_cpu());
+  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
+  ASSERT_TRUE(buf->is_mutable());
+  ASSERT_EQ(buf->mutable_data(), buf->data());
+  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
+  buf.reset();
+  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));
+
+  // Read-only
+  PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(arr), NPY_ARRAY_WRITEABLE);
+  buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie();
+  ASSERT_TRUE(buf->is_cpu());
+  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
+  ASSERT_FALSE(buf->is_mutable());
+  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
+  buf.reset();
+  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));
+
+  return Status::OK();
+}
+
+Status TestNumPyBufferNumpyArray() {
+  npy_intp dims[1] = {10};
+
+  OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT));
+  PyObject* arr = arr_ref.obj();
+  ASSERT_NE(arr, nullptr);
+  auto old_refcnt = Py_REFCNT(arr);
+
+  auto buf = std::make_shared<NumPyBuffer>(arr);
+  ASSERT_TRUE(buf->is_cpu());
+  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
+  ASSERT_TRUE(buf->is_mutable());
+  ASSERT_EQ(buf->mutable_data(), buf->data());
+  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
+  buf.reset();
+  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));
+
+  // Read-only
+  PyArray_CLEARFLAGS(reinterpret_cast<PyArrayObject*>(arr), NPY_ARRAY_WRITEABLE);
+  buf = std::make_shared<NumPyBuffer>(arr);
+  ASSERT_TRUE(buf->is_cpu());
+  ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)));
+  ASSERT_FALSE(buf->is_mutable());
+  ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr));
+  buf.reset();
+  ASSERT_EQ(old_refcnt, Py_REFCNT(arr));
+
+  return Status::OK();
+}
+#endif
+
+Status TestPythonDecimalToString() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("-39402950693754869342983");
+  PyObject* python_object =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  ASSERT_NE(python_object, nullptr);
+
+  std::string string_result;
+  ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result));
+
+  return Status::OK();
+}
+
+Status TestInferPrecisionAndScale() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("-394029506937548693.42983");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+
+  const auto expected_precision =
+      static_cast<int32_t>(decimal_string.size() - 2);  // 1 for -, 1 for .
+  const int32_t expected_scale = 5;
+
+  ASSERT_EQ(expected_precision, metadata.precision());
+  ASSERT_EQ(expected_scale, metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestInferPrecisionAndNegativeScale() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("-3.94042983E+10");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+
+  const auto expected_precision = 11;
+  const int32_t expected_scale = 0;
+
+  ASSERT_EQ(expected_precision, metadata.precision());
+  ASSERT_EQ(expected_scale, metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestInferAllLeadingZeros() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("0.001");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+  ASSERT_EQ(3, metadata.precision());
+  ASSERT_EQ(3, metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestInferAllLeadingZerosExponentialNotationPositive() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("0.01E5");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+  ASSERT_EQ(4, metadata.precision());
+  ASSERT_EQ(0, metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestInferAllLeadingZerosExponentialNotationNegative() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("0.01E3");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+  ASSERT_EQ(2, metadata.precision());
+  ASSERT_EQ(0, metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestObjectBlockWriteFails() {
+  StringBuilder builder;
+  const char value[] = {'\xf1', '\0'};
+
+  for (int i = 0; i < 1000; ++i) {
+    ASSERT_OK(builder.Append(value, static_cast<int32_t>(strlen(value))));
+  }
+
+  std::shared_ptr<Array> arr;
+  ASSERT_OK(builder.Finish(&arr));
+
+  auto f1 = field("f1", utf8());
+  auto f2 = field("f2", utf8());
+  auto f3 = field("f3", utf8());
+  std::vector<std::shared_ptr<Field>> fields = {f1, f2, f3};
+  std::vector<std::shared_ptr<Array>> cols = {arr, arr, arr};
+
+  auto schema = ::arrow::schema(fields);
+  auto table = Table::Make(schema, cols);
+
+  Status st;
+  Py_BEGIN_ALLOW_THREADS;
+  PyObject* out;
+  PandasOptions options;
+  options.use_threads = true;
+  st = ConvertTableToPandas(options, table, &out);
+  Py_END_ALLOW_THREADS;
+  ASSERT_RAISES(UnknownError, st);
+
+  return Status::OK();
+}
+
+Status TestMixedTypeFails() {
+  OwnedRef list_ref(PyList_New(3));
+  PyObject* list = list_ref.obj();
+
+  ASSERT_NE(list, nullptr);
+
+  PyObject* str = PyUnicode_FromString("abc");
+  ASSERT_NE(str, nullptr);
+
+  PyObject* integer = PyLong_FromLong(1234L);
+  ASSERT_NE(integer, nullptr);
+
+  PyObject* doub = PyFloat_FromDouble(123.0234);
+  ASSERT_NE(doub, nullptr);
+
+  // This steals a reference to each object, so we don't need to decref them later
+  // just the list
+  ASSERT_EQ(PyList_SetItem(list, 0, str), 0);
+  ASSERT_EQ(PyList_SetItem(list, 1, integer), 0);
+  ASSERT_EQ(PyList_SetItem(list, 2, doub), 0);
+
+  ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {}));
+
+  return Status::OK();
+}
+
+template <typename DecimalValue>
+Status DecimalTestFromPythonDecimalRescale(std::shared_ptr<DataType> type,
+                                           PyObject* python_decimal,
+                                           std::optional<int> expected) {
+  DecimalValue value;
+  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
+
+  if (expected.has_value()) {
+    ASSERT_OK(internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value));
+    ASSERT_EQ(expected.value(), value);
+
+    ASSERT_OK(internal::DecimalFromPyObject(python_decimal, decimal_type, &value));
+    ASSERT_EQ(expected.value(), value);
+  } else {
+    ASSERT_RAISES(Invalid, internal::DecimalFromPythonDecimal(python_decimal,
+                                                              decimal_type, &value));
+    ASSERT_RAISES(Invalid,
+                  internal::DecimalFromPyObject(python_decimal, decimal_type, &value));
+  }
+  return Status::OK();
+}
+
+Status TestFromPythonDecimalRescaleNotTruncateable() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("1.001");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  // We fail when truncating values that would lose data if cast to a decimal type with
+  // lower scale
+  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal128>(::arrow::decimal128(10, 2),
+                                                            python_decimal, {}));
+  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal256>(::arrow::decimal256(10, 2),
+                                                            python_decimal, {}));
+
+  return Status::OK();
+}
+
+Status TestFromPythonDecimalRescaleTruncateable() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("1.000");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  // We allow truncation of values that do not lose precision when dividing by 10 * the
+  // difference between the scales, e.g., 1.000 -> 1.00
+  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal128>(::arrow::decimal128(10, 2),
+                                                            python_decimal, 100));
+  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal256>(::arrow::decimal256(10, 2),
+                                                            python_decimal, 100));
+
+  return Status::OK();
+}
+
+Status TestFromPythonNegativeDecimalRescale() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("-1.000");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal128>(::arrow::decimal128(10, 9),
+                                                            python_decimal, -1000000000));
+  ASSERT_OK(DecimalTestFromPythonDecimalRescale<Decimal256>(::arrow::decimal256(10, 9),
+                                                            python_decimal, -1000000000));
+
+  return Status::OK();
+}
+
+Status TestDecimal128FromPythonInteger() {
+  Decimal128 value;
+  OwnedRef python_long(PyLong_FromLong(42));
+  auto type = ::arrow::decimal128(10, 2);
+  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
+  ASSERT_OK(internal::DecimalFromPyObject(python_long.obj(), decimal_type, &value));
+  ASSERT_EQ(4200, value);
+  return Status::OK();
+}
+
+Status TestDecimal256FromPythonInteger() {
+  Decimal256 value;
+  OwnedRef python_long(PyLong_FromLong(42));
+  auto type = ::arrow::decimal256(10, 2);
+  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
+  ASSERT_OK(internal::DecimalFromPyObject(python_long.obj(), decimal_type, &value));
+  ASSERT_EQ(4200, value);
+  return Status::OK();
+}
+
+Status TestDecimal128OverflowFails() {
+  Decimal128 value;
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("9999999999999999999999999999999999999.9");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+  ASSERT_EQ(38, metadata.precision());
+  ASSERT_EQ(1, metadata.scale());
+
+  auto type = ::arrow::smallest_decimal(38, 38);
+  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
+  ASSERT_RAISES(Invalid,
+                internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value));
+  return Status::OK();
+}
+
+Status TestDecimal256OverflowFails() {
+  Decimal256 value;
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string(
+      "999999999999999999999999999999999999999999999999999999999999999999999999999.9");
+  PyObject* python_decimal =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(python_decimal));
+  ASSERT_EQ(76, metadata.precision());
+  ASSERT_EQ(1, metadata.scale());
+
+  auto type = ::arrow::smallest_decimal(76, 76);
+  const auto& decimal_type = checked_cast<const DecimalType&>(*type);
+  ASSERT_RAISES(Invalid,
+                internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value));
+  return Status::OK();
+}
+
+Status TestNoneAndNaN() {
+  OwnedRef list_ref(PyList_New(4));
+  PyObject* list = list_ref.obj();
+
+  ASSERT_NE(list, nullptr);
+
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+  PyObject* constructor = decimal_constructor_.obj();
+  PyObject* decimal_value = internal::DecimalFromString(constructor, "1.234");
+  ASSERT_NE(decimal_value, nullptr);
+
+  Py_INCREF(Py_None);
+  PyObject* missing_value1 = Py_None;
+  ASSERT_NE(missing_value1, nullptr);
+
+  PyObject* missing_value2 = PyFloat_FromDouble(NPY_NAN);
+  ASSERT_NE(missing_value2, nullptr);
+
+  PyObject* missing_value3 = internal::DecimalFromString(constructor, "nan");
+  ASSERT_NE(missing_value3, nullptr);
+
+  // This steals a reference to each object, so we don't need to decref them later,
+  // just the list
+  ASSERT_EQ(0, PyList_SetItem(list, 0, decimal_value));
+  ASSERT_EQ(0, PyList_SetItem(list, 1, missing_value1));
+  ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2));
+  ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3));
+
+  PyConversionOptions options;
+  ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options));
+
+  options.from_pandas = true;
+  auto chunked = std::move(ConvertPySequence(list, nullptr, options)).ValueOrDie();
+  ASSERT_EQ(chunked->num_chunks(), 1);
+
+  auto arr = chunked->chunk(0);
+  ASSERT_TRUE(arr->IsValid(0));
+  ASSERT_TRUE(arr->IsNull(1));
+  ASSERT_TRUE(arr->IsNull(2));
+  ASSERT_TRUE(arr->IsNull(3));
+
+  return Status::OK();
+}
+
+Status TestMixedPrecisionAndScale() {
+  std::vector<std::string> strings{{"0.001", "1.01E5", "1.01E5"}};
+
+  OwnedRef list_ref(PyList_New(static_cast<Py_ssize_t>(strings.size())));
+  PyObject* list = list_ref.obj();
+
+  ASSERT_NE(list, nullptr);
+
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+  // PyList_SetItem steals a reference to the item so we don't decref it later
+  PyObject* decimal_constructor = decimal_constructor_.obj();
+  for (Py_ssize_t i = 0; i < static_cast<Py_ssize_t>(strings.size()); ++i) {
+    const int result = PyList_SetItem(
+        list, i, internal::DecimalFromString(decimal_constructor, strings.at(i)));
+    ASSERT_EQ(0, result);
+  }
+
+  auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie();
+  const auto& type = checked_cast<const DecimalType&>(*arr->type());
+
+  int32_t expected_precision = 9;
+  int32_t expected_scale = 3;
+  ASSERT_EQ(expected_precision, type.precision());
+  ASSERT_EQ(expected_scale, type.scale());
+
+  return Status::OK();
+}
+
+Status TestMixedPrecisionAndScaleSequenceConvert() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string_1("0.01");
+  PyObject* value1 =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_1);
+  ASSERT_NE(value1, nullptr);
+
+  std::string decimal_string_2("0.001");
+  PyObject* value2 =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_2);
+  ASSERT_NE(value2, nullptr);
+
+  OwnedRef list_ref(PyList_New(2));
+  PyObject* list = list_ref.obj();
+
+  // This steals a reference to each object, so we don't need to decref them later
+  // just the list
+  ASSERT_EQ(PyList_SetItem(list, 0, value1), 0);
+  ASSERT_EQ(PyList_SetItem(list, 1, value2), 0);
+
+  auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie();
+  const auto& type = checked_cast<const Decimal128Type&>(*arr->type());
+  ASSERT_EQ(3, type.precision());
+  ASSERT_EQ(3, type.scale());
+
+  return Status::OK();
+}
+
+Status TestSimpleInference() {
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+
+  std::string decimal_string("0.01");
+  PyObject* value =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+  ASSERT_NE(value, nullptr);
+  internal::DecimalMetadata metadata;
+  ASSERT_OK(metadata.Update(value));
+  ASSERT_EQ(2, metadata.precision());
+  ASSERT_EQ(2, metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestUpdateWithNaN() {
+  internal::DecimalMetadata metadata;
+  OwnedRef decimal_constructor_;
+  OwnedRef decimal_module;
+  RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(
+      internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_));
+  std::string decimal_string("nan");
+  PyObject* nan_value =
+      internal::DecimalFromString(decimal_constructor_.obj(), decimal_string);
+
+  ASSERT_OK(metadata.Update(nan_value));
+  ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.precision());
+  ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.scale());
+
+  return Status::OK();
+}
+
+Status TestGetNumPyTypeName() {
+  ASSERT_EQ(GetNumPyTypeName(NPY_BOOL), "bool");
+  ASSERT_EQ(GetNumPyTypeName(NPY_INT8), "int8");
+  ASSERT_EQ(GetNumPyTypeName(NPY_INT16), "int16");
+  ASSERT_EQ(GetNumPyTypeName(NPY_INT32), "int32");
+  ASSERT_EQ(GetNumPyTypeName(NPY_INT64), "int64");
+  ASSERT_EQ(GetNumPyTypeName(NPY_UINT8), "uint8");
+  ASSERT_EQ(GetNumPyTypeName(NPY_UINT16), "uint16");
+  ASSERT_EQ(GetNumPyTypeName(NPY_UINT32), "uint32");
+  ASSERT_EQ(GetNumPyTypeName(NPY_UINT64), "uint64");
+  ASSERT_EQ(GetNumPyTypeName(NPY_FLOAT32), "float32");
+  ASSERT_EQ(GetNumPyTypeName(NPY_FLOAT64), "float64");
+  return Status::OK();
+}
+
+}  // namespace
+
+std::vector<TestCase> GetCppTestCases() {
+  return {
+      {"test_owned_ref_moves", TestOwnedRefMoves},
+      {"test_owned_ref_nogil_moves", TestOwnedRefNoGILMoves},
+      {"test_check_pyerror_status", TestCheckPyErrorStatus},
+      {"test_check_pyerror_status_nogil", TestCheckPyErrorStatusNoGIL},
+      {"test_restore_pyerror_basics", TestRestorePyErrorBasics},
+      {"test_pybuffer_invalid_input_object", TestPyBufferInvalidInputObject},
+#ifndef _WIN32
+      {"test_pybuffer_numpy_array", TestPyBufferNumpyArray},
+      {"test_numpybuffer_numpy_array", TestNumPyBufferNumpyArray},
+#endif
+      {"test_python_decimal_to_string", TestPythonDecimalToString},
+      {"test_infer_precision_and_scale", TestInferPrecisionAndScale},
+      {"test_infer_precision_and_negative_scale", TestInferPrecisionAndNegativeScale},
+      {"test_infer_all_leading_zeros", TestInferAllLeadingZeros},
+      {"test_infer_all_leading_zeros_exponential_notation_positive",
+       TestInferAllLeadingZerosExponentialNotationPositive},
+      {"test_infer_all_leading_zeros_exponential_notation_negative",
+       TestInferAllLeadingZerosExponentialNotationNegative},
+      {"test_object_block_write_fails_pandas_convert", TestObjectBlockWriteFails},
+      {"test_mixed_type_fails", TestMixedTypeFails},
+      {"test_from_python_decimal_rescale_not_truncateable",
+       TestFromPythonDecimalRescaleNotTruncateable},
+      {"test_from_python_decimal_rescale_truncateable",
+       TestFromPythonDecimalRescaleTruncateable},
+      {"test_from_python_negative_decimal_rescale", TestFromPythonNegativeDecimalRescale},
+      {"test_decimal128_from_python_integer", TestDecimal128FromPythonInteger},
+      {"test_decimal256_from_python_integer", TestDecimal256FromPythonInteger},
+      {"test_decimal128_overflow_fails", TestDecimal128OverflowFails},
+      {"test_decimal256_overflow_fails", TestDecimal256OverflowFails},
+      {"test_none_and_nan", TestNoneAndNaN},
+      {"test_mixed_precision_and_scale", TestMixedPrecisionAndScale},
+      {"test_mixed_precision_and_scale_sequence_convert",
+       TestMixedPrecisionAndScaleSequenceConvert},
+      {"test_simple_inference", TestSimpleInference},
+      {"test_update_with_nan", TestUpdateWithNaN},
+      {"test_get_numpy_type_name", TestGetNumPyTypeName},
+  };
+}
+
+}  // namespace testing
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/python_test.h b/pyarrow/src/arrow/python/python_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2eb62fc29accb670f5d53e326381d68a6534335
--- /dev/null
+++ b/pyarrow/src/arrow/python/python_test.h
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <vector>
+
+#include "arrow/status.h"
+
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+namespace py {
+namespace testing {
+
+struct TestCase {
+  std::string name;
+  std::function<Status()> func;
+};
+
+ARROW_PYTHON_EXPORT
+std::vector<TestCase> GetCppTestCases();
+
+}  // namespace testing
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/python_to_arrow.cc b/pyarrow/src/arrow/python/python_to_arrow.cc
new file mode 100644
index 0000000000000000000000000000000000000000..139eb1d7f4ffe56737e461d1cc88711f6baf2ab9
--- /dev/null
+++ b/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -0,0 +1,1305 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/python_to_arrow.h"
+#include "arrow/python/numpy_interop.h"
+
+#include <datetime.h>
+
+#include <algorithm>
+#include <limits>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "arrow/array.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/array/builder_binary.h"
+#include "arrow/array/builder_decimal.h"
+#include "arrow/array/builder_dict.h"
+#include "arrow/array/builder_nested.h"
+#include "arrow/array/builder_primitive.h"
+#include "arrow/array/builder_time.h"
+#include "arrow/chunked_array.h"
+#include "arrow/result.h"
+#include "arrow/scalar.h"
+#include "arrow/status.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/converter.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/int_util_overflow.h"
+#include "arrow/util/logging.h"
+
+#include "arrow/python/datetime.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/python/inference.h"
+#include "arrow/python/iterators.h"
+#include "arrow/python/numpy_convert.h"
+#include "arrow/python/type_traits.h"
+#include "arrow/python/vendored/pythoncapi_compat.h"
+#include "arrow/visit_type_inline.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+using internal::checked_pointer_cast;
+
+using internal::Converter;
+using internal::DictionaryConverter;
+using internal::ListConverter;
+using internal::PrimitiveConverter;
+using internal::StructConverter;
+
+using internal::MakeChunker;
+using internal::MakeConverter;
+
+namespace py {
+
+namespace {
+enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds };
+
+template <MonthDayNanoField field>
+struct MonthDayNanoTraits;
+
+struct MonthDayNanoAttrData {
+  const char* name;
+  const int64_t multiplier;
+};
+
+template <>
+struct MonthDayNanoTraits<MonthDayNanoField::kMonths> {
+  using c_type = int32_t;
+  static const MonthDayNanoAttrData attrs[];
+};
+
+const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kMonths>::attrs[] = {
+    {"years", 1}, {"months", /*months_in_year=*/12}, {nullptr, 0}};
+
+template <>
+struct MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays> {
+  using c_type = int32_t;
+  static const MonthDayNanoAttrData attrs[];
+};
+
+const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kWeeksAndDays>::attrs[] =
+    {{"weeks", 1}, {"days", /*days_in_week=*/7}, {nullptr, 0}};
+
+template <>
+struct MonthDayNanoTraits<MonthDayNanoField::kDaysOnly> {
+  using c_type = int32_t;
+  static const MonthDayNanoAttrData attrs[];
+};
+
+const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kDaysOnly>::attrs[] = {
+    {"days", 1}, {nullptr, 0}};
+
+template <>
+struct MonthDayNanoTraits<MonthDayNanoField::kNanoseconds> {
+  using c_type = int64_t;
+  static const MonthDayNanoAttrData attrs[];
+};
+
+const MonthDayNanoAttrData MonthDayNanoTraits<MonthDayNanoField::kNanoseconds>::attrs[] =
+    {{"hours", 1},
+     {"minutes", /*minutes_in_hours=*/60},
+     {"seconds", /*seconds_in_minute=*/60},
+     {"milliseconds", /*milliseconds_in_seconds*/ 1000},
+     {"microseconds", /*microseconds_in_milliseconds=*/1000},
+     {"nanoseconds", /*nanoseconds_in_microseconds=*/1000},
+     {nullptr, 0}};
+
+template <MonthDayNanoField field>
+struct PopulateMonthDayNano {
+  using Traits = MonthDayNanoTraits<field>;
+  using field_c_type = typename Traits::c_type;
+
+  static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) {
+    *out = 0;
+    for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0;
+         ++attr) {
+      if (attr->multiplier != 1 &&
+          ::arrow::internal::MultiplyWithOverflow(
+              static_cast<field_c_type>(attr->multiplier), *out, out)) {
+        return Status::Invalid("Overflow on: ", (attr - 1)->name,
+                               " for: ", internal::PyObject_StdStringRepr(obj));
+      }
+
+      OwnedRef field_value(PyObject_GetAttrString(obj, attr->name));
+      if (field_value.obj() == nullptr) {
+        // No attribute present, skip  to the next one.
+        PyErr_Clear();
+        continue;
+      }
+      RETURN_IF_PYERROR();
+      *found_attrs = true;
+      field_c_type value;
+      RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name));
+      if (::arrow::internal::AddWithOverflow(*out, value, out)) {
+        return Status::Invalid("Overflow on: ", attr->name,
+                               " for: ", internal::PyObject_StdStringRepr(obj));
+      }
+    }
+
+    return Status::OK();
+  }
+};
+
+// Utility for converting single python objects to their intermediate C representations
+// which can be fed to the typed builders
+class PyValue {
+ public:
+  // Type aliases for shorter signature definitions
+  using I = PyObject*;
+  using O = PyConversionOptions;
+
+  // Used for null checking before actually converting the values
+  static bool IsNull(const O& options, I obj) {
+    if (options.from_pandas) {
+      return internal::PandasObjectIsNull(obj);
+    } else {
+      return obj == Py_None;
+    }
+  }
+
+  // Used for post-conversion numpy NaT sentinel checking
+  static bool IsNaT(const TimestampType*, int64_t value) {
+    return internal::npy_traits<NPY_DATETIME>::isnull(value);
+  }
+
+  // Used for post-conversion numpy NaT sentinel checking
+  static bool IsNaT(const DurationType*, int64_t value) {
+    return internal::npy_traits<NPY_TIMEDELTA>::isnull(value);
+  }
+
+  static Result<std::nullptr_t> Convert(const NullType*, const O&, I obj) {
+    if (obj == Py_None) {
+      return nullptr;
+    } else {
+      return Status::Invalid("Invalid null value");
+    }
+  }
+
+  static Result<bool> Convert(const BooleanType*, const O&, I obj) {
+    if (obj == Py_True) {
+      return true;
+    } else if (obj == Py_False) {
+      return false;
+    } else if (has_numpy() && PyArray_IsScalar(obj, Bool)) {
+      return reinterpret_cast<PyBoolScalarObject*>(obj)->obval == NPY_TRUE;
+    } else {
+      return internal::InvalidValue(obj, "tried to convert to boolean");
+    }
+  }
+
+  template <typename T>
+  static enable_if_integer<T, Result<typename T::c_type>> Convert(const T* type, const O&,
+                                                                  I obj) {
+    typename T::c_type value;
+    auto status = internal::CIntFromPython(obj, &value);
+    if (ARROW_PREDICT_TRUE(status.ok())) {
+      return value;
+    } else if (!internal::PyIntScalar_Check(obj)) {
+      std::stringstream ss;
+      ss << "tried to convert to " << type->ToString();
+      return internal::InvalidValue(obj, ss.str());
+    } else {
+      return status;
+    }
+  }
+
+  static Result<uint16_t> Convert(const HalfFloatType*, const O&, I obj) {
+    if (internal::PyFloatScalar_Check(obj)) {
+      return PyFloat_AsHalf(obj);
+    } else if (internal::PyIntScalar_Check(obj)) {
+      double float_val{};
+      RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &float_val));
+      const auto half_val = arrow::util::Float16::FromDouble(float_val);
+      return half_val.bits();
+    } else {
+      return internal::InvalidValue(obj, "tried to convert to float16");
+    }
+  }
+
+  static Result<float> Convert(const FloatType*, const O&, I obj) {
+    float value;
+    if (internal::PyFloatScalar_Check(obj)) {
+      value = static_cast<float>(PyFloat_AsDouble(obj));
+      RETURN_IF_PYERROR();
+    } else if (internal::PyIntScalar_Check(obj)) {
+      RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value));
+    } else {
+      return internal::InvalidValue(obj, "tried to convert to float32");
+    }
+    return value;
+  }
+
+  static Result<double> Convert(const DoubleType*, const O&, I obj) {
+    double value;
+    if (PyFloat_Check(obj)) {
+      value = PyFloat_AS_DOUBLE(obj);
+    } else if (internal::PyFloatScalar_Check(obj)) {
+      // Other kinds of float-y things
+      value = PyFloat_AsDouble(obj);
+      RETURN_IF_PYERROR();
+    } else if (internal::PyIntScalar_Check(obj)) {
+      RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value));
+    } else {
+      return internal::InvalidValue(obj, "tried to convert to double");
+    }
+    return value;
+  }
+
+  static Result<Decimal32> Convert(const Decimal32Type* type, const O&, I obj) {
+    Decimal32 value;
+    RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
+    return value;
+  }
+
+  static Result<Decimal64> Convert(const Decimal64Type* type, const O&, I obj) {
+    Decimal64 value;
+    RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
+    return value;
+  }
+
+  static Result<Decimal128> Convert(const Decimal128Type* type, const O&, I obj) {
+    Decimal128 value;
+    RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
+    return value;
+  }
+
+  static Result<Decimal256> Convert(const Decimal256Type* type, const O&, I obj) {
+    Decimal256 value;
+    RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value));
+    return value;
+  }
+
+  static Result<int32_t> Convert(const Date32Type*, const O&, I obj) {
+    int32_t value;
+    if (PyDate_Check(obj)) {
+      auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
+      value = static_cast<int32_t>(internal::PyDate_to_days(pydate));
+    } else {
+      RETURN_NOT_OK(
+          internal::CIntFromPython(obj, &value, "Integer too large for date32"));
+    }
+    return value;
+  }
+
+  static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
+    int64_t value;
+    if (PyDateTime_Check(obj)) {
+      auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
+      value = internal::PyDateTime_to_ms(pydate);
+      // Truncate any intraday milliseconds
+      // TODO: introduce an option for this
+      value -= value % 86400000LL;
+    } else if (PyDate_Check(obj)) {
+      auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
+      value = internal::PyDate_to_ms(pydate);
+    } else {
+      RETURN_NOT_OK(
+          internal::CIntFromPython(obj, &value, "Integer too large for date64"));
+    }
+    return value;
+  }
+
+  static Result<int32_t> Convert(const Time32Type* type, const O&, I obj) {
+    int32_t value;
+    if (PyTime_Check(obj)) {
+      switch (type->unit()) {
+        case TimeUnit::SECOND:
+          value = static_cast<int32_t>(internal::PyTime_to_s(obj));
+          break;
+        case TimeUnit::MILLI:
+          value = static_cast<int32_t>(internal::PyTime_to_ms(obj));
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32"));
+    }
+    return value;
+  }
+
+  static Result<int64_t> Convert(const Time64Type* type, const O&, I obj) {
+    int64_t value;
+    if (PyTime_Check(obj)) {
+      switch (type->unit()) {
+        case TimeUnit::MICRO:
+          value = internal::PyTime_to_us(obj);
+          break;
+        case TimeUnit::NANO:
+          value = internal::PyTime_to_ns(obj);
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64"));
+    }
+    return value;
+  }
+
+  static Result<int64_t> Convert(const TimestampType* type, const O& options, I obj) {
+    int64_t value, offset;
+    if (PyDateTime_Check(obj)) {
+      if (ARROW_PREDICT_FALSE(options.ignore_timezone)) {
+        offset = 0;
+      } else {
+        ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj));
+      }
+      auto dt = reinterpret_cast<PyDateTime_DateTime*>(obj);
+      switch (type->unit()) {
+        case TimeUnit::SECOND:
+          value = internal::PyDateTime_to_s(dt) - offset;
+          break;
+        case TimeUnit::MILLI:
+          value = internal::PyDateTime_to_ms(dt) - offset * 1000LL;
+          break;
+        case TimeUnit::MICRO:
+          value = internal::PyDateTime_to_us(dt) - offset * 1000000LL;
+          break;
+        case TimeUnit::NANO:
+          if (internal::IsPandasTimestamp(obj)) {
+            // pd.Timestamp value attribute contains the offset from unix epoch
+            // so no adjustment for timezone is need.
+            OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+            RETURN_IF_PYERROR();
+            RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+          } else {
+            // Conversion to nanoseconds can overflow -> check multiply of microseconds
+            value = internal::PyDateTime_to_us(dt);
+            if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
+              return internal::InvalidValue(obj,
+                                            "out of bounds for nanosecond resolution");
+            }
+
+            // Adjust with offset and check for overflow
+            if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
+                                                      &value)) {
+              return internal::InvalidValue(obj,
+                                            "out of bounds for nanosecond resolution");
+            }
+          }
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.datetime64 dtype
+      ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.datetime64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
+  }
+
+  static Result<MonthDayNanoIntervalType::MonthDayNanos> Convert(
+      const MonthDayNanoIntervalType* /*type*/, const O& /*options*/, I obj) {
+    MonthDayNanoIntervalType::MonthDayNanos output;
+    bool found_attrs = false;
+    RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kMonths>::Field(
+        obj, &output.months, &found_attrs));
+    // on relativeoffset weeks is a property calculated from days.  On
+    // DateOffset is a field on its own. timedelta doesn't have a weeks
+    // attribute.
+    PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType();
+    bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj);
+    if (!is_date_offset) {
+      RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kDaysOnly>::Field(
+          obj, &output.days, &found_attrs));
+    } else {
+      RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kWeeksAndDays>::Field(
+          obj, &output.days, &found_attrs));
+    }
+    RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kNanoseconds>::Field(
+        obj, &output.nanoseconds, &found_attrs));
+
+    // date_offset can have zero fields.
+    if (found_attrs || is_date_offset) {
+      return output;
+    }
+    if (PyTuple_Check(obj) && PyTuple_Size(obj) == 3) {
+      RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 0), &output.months,
+                                             "Months (tuple item #0) too large"));
+      RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 1), &output.days,
+                                             "Days (tuple item #1) too large"));
+      RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 2),
+                                             &output.nanoseconds,
+                                             "Nanoseconds (tuple item #2) too large"));
+      return output;
+    }
+    return Status::TypeError("No temporal attributes found on object.");
+  }
+
+  static Result<int64_t> Convert(const DurationType* type, const O&, I obj) {
+    int64_t value;
+    if (PyDelta_Check(obj)) {
+      auto dt = reinterpret_cast<PyDateTime_Delta*>(obj);
+      switch (type->unit()) {
+        case TimeUnit::SECOND:
+          value = internal::PyDelta_to_s(dt);
+          break;
+        case TimeUnit::MILLI:
+          value = internal::PyDelta_to_ms(dt);
+          break;
+        case TimeUnit::MICRO: {
+          ARROW_ASSIGN_OR_RAISE(value, internal::PyDelta_to_us(dt));
+          break;
+        }
+        case TimeUnit::NANO:
+          if (internal::IsPandasTimedelta(obj)) {
+            OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+            RETURN_IF_PYERROR();
+            RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+          } else {
+            ARROW_ASSIGN_OR_RAISE(value, internal::PyDelta_to_ns(dt));
+          }
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else if (has_numpy() && PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.datetime64 dtype
+      ARROW_ASSIGN_OR_RAISE(auto numpy_type, NumPyScalarToArrowDataType(obj));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.timedelta64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
+  }
+
+  // The binary-like intermediate representation is PyBytesView because it keeps temporary
+  // python objects alive (non-contiguous memoryview) and stores whether the original
+  // object was unicode encoded or not, which is used for unicode -> bytes coercion if
+  // there is a non-unicode object observed.
+
+  static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
+    return view.ParseString(obj);
+  }
+
+  static Status Convert(const BinaryViewType*, const O&, I obj, PyBytesView& view) {
+    return view.ParseString(obj);
+  }
+
+  static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
+                        PyBytesView& view) {
+    ARROW_RETURN_NOT_OK(view.ParseString(obj));
+    if (view.size != type->byte_width()) {
+      std::stringstream ss;
+      ss << "expected to be length " << type->byte_width() << " was " << view.size;
+      return internal::InvalidValue(obj, ss.str());
+    } else {
+      return Status::OK();
+    }
+  }
+
+  template <typename T>
+  static enable_if_t<is_string_type<T>::value || is_string_view_type<T>::value, Status>
+  Convert(const T*, const O& options, I obj, PyBytesView& view) {
+    if (options.strict) {
+      // Strict conversion, force output to be unicode / utf8 and validate that
+      // any binary values are utf8
+      ARROW_RETURN_NOT_OK(view.ParseString(obj, true));
+      if (!view.is_utf8) {
+        return internal::InvalidValue(obj, "was not a utf8 string");
+      }
+      return Status::OK();
+    } else {
+      // Non-strict conversion; keep track of whether values are unicode or bytes
+      return view.ParseString(obj);
+    }
+  }
+
+  static Result<bool> Convert(const DataType* type, const O&, I obj) {
+    return Status::NotImplemented("PyValue::Convert is not implemented for type ", type);
+  }
+};
+
+// The base Converter class is a mixin with predefined behavior and constructors.
+class PyConverter : public Converter<PyObject*, PyConversionOptions> {
+ public:
+  // Iterate over the input values and defer the conversion to the Append method
+  Status Extend(PyObject* values, int64_t size, int64_t offset = 0) override {
+    ARROW_DCHECK_GE(size, offset);
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(this->Reserve(size - offset));
+    // Iterate over the items adding each one
+    return internal::VisitSequence(
+        values, offset,
+        [this](PyObject* item, bool* /* unused */) { return this->Append(item); });
+  }
+
+  // Convert and append a sequence of values masked with a numpy array
+  Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size,
+                      int64_t offset = 0) override {
+    ARROW_DCHECK_GE(size, offset);
+    /// Ensure we've allocated enough space
+    RETURN_NOT_OK(this->Reserve(size - offset));
+    // Iterate over the items adding each one
+    return internal::VisitSequenceMasked(
+        values, mask, offset, [this](PyObject* item, bool is_masked, bool* /* unused */) {
+          if (is_masked) {
+            return this->AppendNull();
+          } else {
+            // This will also apply the null-checking convention in the event
+            // that the value is not masked
+            return this->Append(item);  // perhaps use AppendValue instead?
+          }
+        });
+  }
+};
+
+template <typename T, typename Enable = void>
+class PyPrimitiveConverter;
+
+template <typename T>
+class PyListConverter;
+
+template <typename U, typename Enable = void>
+class PyDictionaryConverter;
+
+class PyStructConverter;
+
+template <typename T, typename Enable = void>
+struct PyConverterTrait;
+
+template <typename T>
+struct PyConverterTrait<
+    T, enable_if_t<(!is_nested_type<T>::value && !is_interval_type<T>::value &&
+                    !is_extension_type<T>::value) ||
+                   std::is_same<T, MonthDayNanoIntervalType>::value>> {
+  using type = PyPrimitiveConverter<T>;
+};
+
+template <typename T>
+struct PyConverterTrait<
+    T, enable_if_t<is_list_like_type<T>::value || is_list_view_type<T>::value>> {
+  using type = PyListConverter<T>;
+};
+
+template <>
+struct PyConverterTrait<StructType> {
+  using type = PyStructConverter;
+};
+
+template <>
+struct PyConverterTrait<DictionaryType> {
+  template <typename T>
+  using dictionary_type = PyDictionaryConverter<T>;
+};
+
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_null<T>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->primitive_builder_->AppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      if (scalar->is_valid) {
+        return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(),
+                               " to builder for type null");
+      } else {
+        return this->primitive_builder_->AppendNull();
+      }
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
+      return this->primitive_builder_->Append(converted);
+    }
+  }
+};
+
+template <typename T>
+class PyPrimitiveConverter<
+    T, enable_if_t<is_boolean_type<T>::value || is_number_type<T>::value ||
+                   is_decimal_type<T>::value || is_date_type<T>::value ||
+                   is_time_type<T>::value ||
+                   std::is_same<MonthDayNanoIntervalType, T>::value>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  Status Append(PyObject* value) override {
+    // Since the required space has been already allocated in the Extend functions we can
+    // rely on the Unsafe builder API which improves the performance.
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
+      this->primitive_builder_->UnsafeAppend(converted);
+    }
+    return Status::OK();
+  }
+};
+
+template <typename T>
+class PyPrimitiveConverter<
+    T, enable_if_t<is_timestamp_type<T>::value || is_duration_type<T>::value>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(
+          auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
+      // Numpy NaT sentinels can be checked after the conversion
+      if (has_numpy() && PyArray_CheckAnyScalarExact(value) &&
+          PyValue::IsNaT(this->primitive_type_, converted)) {
+        this->primitive_builder_->UnsafeAppendNull();
+      } else {
+        this->primitive_builder_->UnsafeAppend(converted);
+      }
+    }
+    return Status::OK();
+  }
+};
+
+template <typename T>
+class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::value>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+    } else {
+      ARROW_RETURN_NOT_OK(
+          PyValue::Convert(this->primitive_type_, this->options_, value, view_));
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
+      this->primitive_builder_->UnsafeAppend(view_.bytes);
+    }
+    return Status::OK();
+  }
+
+ protected:
+  PyBytesView view_;
+};
+
+template <typename T, typename Enable = void>
+struct OffsetTypeTrait {
+  using type = typename T::offset_type;
+};
+
+template <typename T>
+struct OffsetTypeTrait<T, enable_if_binary_view_like<T>> {
+  using type = int64_t;
+};
+
+template <typename T>
+class PyPrimitiveConverter<
+    T, enable_if_t<is_base_binary_type<T>::value || is_binary_view_like_type<T>::value>>
+    : public PrimitiveConverter<T, PyConverter> {
+ public:
+  using OffsetType = typename OffsetTypeTrait<T>::type;
+
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      this->primitive_builder_->UnsafeAppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
+    } else {
+      ARROW_RETURN_NOT_OK(
+          PyValue::Convert(this->primitive_type_, this->options_, value, view_));
+      if (!view_.is_utf8) {
+        // observed binary value
+        observed_binary_ = true;
+      }
+      // Since we don't know the varying length input size in advance, we need to
+      // reserve space in the value builder one by one. ReserveData raises CapacityError
+      // if the value would not fit into the array.
+      ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size));
+      this->primitive_builder_->UnsafeAppend(view_.bytes,
+                                             static_cast<OffsetType>(view_.size));
+    }
+    return Status::OK();
+  }
+
+  Result<std::shared_ptr<Array>> ToArray() override {
+    ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter<T, PyConverter>::ToArray()));
+    if (observed_binary_) {
+      // if we saw any non-unicode, cast results to BinaryArray
+      auto binary_type = TypeTraits<typename T::PhysicalType>::type_singleton();
+      return array->View(binary_type);
+    } else {
+      return array;
+    }
+  }
+
+ protected:
+  PyBytesView view_;
+  bool observed_binary_ = false;
+};
+
+template <typename U>
+class PyDictionaryConverter<U, enable_if_has_c_type<U>>
+    : public DictionaryConverter<U, PyConverter> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->value_builder_->AppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      return this->value_builder_->AppendScalar(*scalar, 1);
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto converted,
+                            PyValue::Convert(this->value_type_, this->options_, value));
+      return this->value_builder_->Append(converted);
+    }
+  }
+};
+
+template <typename U>
+class PyDictionaryConverter<U, enable_if_has_string_view<U>>
+    : public DictionaryConverter<U, PyConverter> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->value_builder_->AppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      return this->value_builder_->AppendScalar(*scalar, 1);
+    } else {
+      ARROW_RETURN_NOT_OK(
+          PyValue::Convert(this->value_type_, this->options_, value, view_));
+      return this->value_builder_->Append(view_.bytes, static_cast<int32_t>(view_.size));
+    }
+  }
+
+ protected:
+  PyBytesView view_;
+};
+
+template <typename T>
+class PyListConverter : public ListConverter<T, PyConverter, PyConverterTrait> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->list_builder_->AppendNull();
+    }
+    if (has_numpy() && PyArray_Check(value)) {
+      RETURN_NOT_OK(AppendNdarray(value));
+    } else if (PySequence_Check(value)) {
+      RETURN_NOT_OK(AppendSequence(value));
+    } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) {
+      RETURN_NOT_OK(AppendIterable(value));
+    } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) {
+      // Branch to support Python Dict with `map` DataType.
+      auto items = PyDict_Items(value);
+      OwnedRef item_ref(items);
+      RETURN_NOT_OK(AppendSequence(items));
+    } else {
+      return internal::InvalidType(
+          value, "was not a sequence or recognized null for conversion to list type");
+    }
+
+    return ValidateBuilder(this->list_type_);
+  }
+
+ protected:
+  // MapType does not support args in the Append() method
+  Status AppendTo(const MapType*, int64_t size) { return this->list_builder_->Append(); }
+
+  // FixedSizeListType does not support args in the Append() method
+  Status AppendTo(const FixedSizeListType*, int64_t size) {
+    return this->list_builder_->Append();
+  }
+
+  // ListType requires the size argument in the Append() method
+  // in order to be convertible to a ListViewType. ListViewType
+  // requires the size argument in the Append() method always.
+  Status AppendTo(const BaseListType*, int64_t size) {
+    return this->list_builder_->Append(true, size);
+  }
+
+  Status ValidateBuilder(const MapType*) {
+    if (this->list_builder_->key_builder()->null_count() > 0) {
+      return Status::Invalid("Invalid Map: key field cannot contain null values");
+    } else {
+      return Status::OK();
+    }
+  }
+
+  Status ValidateBuilder(const BaseListType*) { return Status::OK(); }
+
+  Status AppendSequence(PyObject* value) {
+    int64_t size = static_cast<int64_t>(PySequence_Size(value));
+    RETURN_NOT_OK(AppendTo(this->list_type_, size));
+    RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
+    return this->value_converter_->Extend(value, size);
+  }
+
+  Status AppendIterable(PyObject* value) {
+    auto size = static_cast<int64_t>(PyObject_Size(value));
+    RETURN_NOT_OK(AppendTo(this->list_type_, size));
+    PyObject* iterator = PyObject_GetIter(value);
+    OwnedRef iter_ref(iterator);
+    while (PyObject* item = PyIter_Next(iterator)) {
+      OwnedRef item_ref(item);
+      RETURN_NOT_OK(this->value_converter_->Reserve(1));
+      RETURN_NOT_OK(this->value_converter_->Append(item));
+    }
+    return Status::OK();
+  }
+
+  Status AppendNdarray(PyObject* value) {
+    PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(value);
+    if (PyArray_NDIM(ndarray) != 1) {
+      return Status::Invalid("Can only convert 1-dimensional array values");
+    }
+    if (PyArray_ISBYTESWAPPED(ndarray)) {
+      // TODO
+      return Status::NotImplemented("Byte-swapped arrays not supported");
+    }
+    const int64_t size = PyArray_SIZE(ndarray);
+    RETURN_NOT_OK(AppendTo(this->list_type_, size));
+    RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size));
+
+    const auto value_type = this->value_converter_->builder()->type();
+    switch (value_type->id()) {
+// If the value type does not match the expected NumPy dtype, then fall through
+// to a slower PySequence-based path
+#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE)         \
+  case Type::TYPE_ID: {                                   \
+    if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \
+      return this->value_converter_->Extend(value, size); \
+    }                                                     \
+    return AppendNdarrayTyped<TYPE, NUMPY_TYPE>(ndarray); \
+  }
+      LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL)
+      LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8)
+      LIST_FAST_CASE(INT8, Int8Type, NPY_INT8)
+      LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16)
+      LIST_FAST_CASE(INT16, Int16Type, NPY_INT16)
+      LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32)
+      LIST_FAST_CASE(INT32, Int32Type, NPY_INT32)
+      LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64)
+      LIST_FAST_CASE(INT64, Int64Type, NPY_INT64)
+      LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16)
+      LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT)
+      LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE)
+      LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME)
+      LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA)
+#undef LIST_FAST_CASE
+      default: {
+        return this->value_converter_->Extend(value, size);
+      }
+    }
+  }
+
+  template <typename ArrowType, int NUMPY_TYPE>
+  Status AppendNdarrayTyped(PyArrayObject* ndarray) {
+    // no need to go through the conversion
+    using NumpyTrait = internal::npy_traits<NUMPY_TYPE>;
+    using NumpyType = typename NumpyTrait::value_type;
+    using ValueBuilderType = typename TypeTraits<ArrowType>::BuilderType;
+
+    const bool null_sentinels_possible =
+        // Always treat Numpy's NaT as null
+        NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA ||
+        // Observing pandas's null sentinels
+        (this->options_.from_pandas && NumpyTrait::supports_nulls);
+
+    auto value_builder =
+        checked_cast<ValueBuilderType*>(this->value_converter_->builder().get());
+
+    Ndarray1DIndexer<NumpyType> values(ndarray);
+    if (null_sentinels_possible) {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        if (NumpyTrait::isnull(values[i])) {
+          RETURN_NOT_OK(value_builder->AppendNull());
+        } else {
+          RETURN_NOT_OK(value_builder->Append(values[i]));
+        }
+      }
+    } else if (!values.is_strided()) {
+      RETURN_NOT_OK(value_builder->AppendValues(values.data(), values.size()));
+    } else {
+      for (int64_t i = 0; i < values.size(); ++i) {
+        RETURN_NOT_OK(value_builder->Append(values[i]));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait> {
+ public:
+  Status Append(PyObject* value) override {
+    if (PyValue::IsNull(this->options_, value)) {
+      return this->struct_builder_->AppendNull();
+    } else if (arrow::py::is_scalar(value)) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
+                            arrow::py::unwrap_scalar(value));
+      return this->struct_builder_->AppendScalar(*scalar);
+    }
+    switch (input_kind_) {
+      case InputKind::DICT:
+        RETURN_NOT_OK(AppendDict(value));
+        return this->struct_builder_->Append();
+      case InputKind::TUPLE:
+        RETURN_NOT_OK(AppendTuple(value));
+        return this->struct_builder_->Append();
+      case InputKind::ITEMS:
+        RETURN_NOT_OK(AppendItems(value));
+        return this->struct_builder_->Append();
+      default:
+        RETURN_NOT_OK(InferInputKind(value));
+        return Append(value);
+    }
+  }
+
+ protected:
+  Status Init(MemoryPool* pool) override {
+    RETURN_NOT_OK((StructConverter<PyConverter, PyConverterTrait>::Init(pool)));
+
+    // This implementation will check the child values before appending itself,
+    // so no rewind is necessary
+    this->rewind_on_overflow_ = false;
+
+    // Store the field names as a PyObjects for dict matching
+    num_fields_ = this->struct_type_->num_fields();
+    bytes_field_names_.reset(PyList_New(num_fields_));
+    unicode_field_names_.reset(PyList_New(num_fields_));
+    RETURN_IF_PYERROR();
+
+    for (int i = 0; i < num_fields_; i++) {
+      const auto& field_name = this->struct_type_->field(i)->name();
+      PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size());
+      PyObject* unicode =
+          PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size());
+      RETURN_IF_PYERROR();
+      PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes);
+      PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode);
+    }
+    return Status::OK();
+  }
+
+  Status InferInputKind(PyObject* value) {
+    // Infer input object's type, note that heterogeneous sequences are not allowed
+    if (PyDict_Check(value)) {
+      input_kind_ = InputKind::DICT;
+    } else if (PyTuple_Check(value)) {
+      input_kind_ = InputKind::TUPLE;
+    } else if (PySequence_Check(value)) {
+      input_kind_ = InputKind::ITEMS;
+    } else {
+      return internal::InvalidType(value,
+                                   "was not a dict, tuple, or recognized null value "
+                                   "for conversion to struct type");
+    }
+    return Status::OK();
+  }
+
+  Status InferKeyKind(PyObject* items) {
+    for (int i = 0; i < PySequence_Length(items); i++) {
+      // retrieve the key from the passed key-value pairs
+      ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));
+
+      // check key exists between the unicode field names
+      bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first);
+      RETURN_IF_PYERROR();
+      if (do_contain) {
+        key_kind_ = KeyKind::UNICODE;
+        return Status::OK();
+      }
+
+      // check key exists between the bytes field names
+      do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first);
+      RETURN_IF_PYERROR();
+      if (do_contain) {
+        key_kind_ = KeyKind::BYTES;
+        return Status::OK();
+      }
+    }
+    return Status::OK();
+  }
+
+  Status AppendEmpty() {
+    for (int i = 0; i < num_fields_; i++) {
+      RETURN_NOT_OK(this->children_[i]->Append(Py_None));
+    }
+    return Status::OK();
+  }
+
+  Status AppendTuple(PyObject* tuple) {
+    if (!PyTuple_Check(tuple)) {
+      return internal::InvalidType(tuple, "was expecting a tuple");
+    }
+    if (PyTuple_GET_SIZE(tuple) != num_fields_) {
+      return Status::Invalid("Tuple size must be equal to number of struct fields");
+    }
+    for (int i = 0; i < num_fields_; i++) {
+      PyObject* value = PyTuple_GET_ITEM(tuple, i);
+      RETURN_NOT_OK(this->children_[i]->Append(value));
+    }
+    return Status::OK();
+  }
+
+  Status AppendDict(PyObject* dict) {
+    if (!PyDict_Check(dict)) {
+      return internal::InvalidType(dict, "was expecting a dict");
+    }
+    switch (key_kind_) {
+      case KeyKind::UNICODE:
+        return AppendDict(dict, unicode_field_names_.obj());
+      case KeyKind::BYTES:
+        return AppendDict(dict, bytes_field_names_.obj());
+      default:
+        OwnedRef item_ref(PyDict_Items(dict));
+        RETURN_NOT_OK(InferKeyKind(item_ref.obj()));
+        if (key_kind_ == KeyKind::UNKNOWN) {
+          // was unable to infer the type which means that all keys are absent
+          return AppendEmpty();
+        } else {
+          return AppendDict(dict);
+        }
+    }
+  }
+
+  Status AppendItems(PyObject* items) {
+    if (!PySequence_Check(items)) {
+      return internal::InvalidType(items, "was expecting a sequence of key-value items");
+    }
+    switch (key_kind_) {
+      case KeyKind::UNICODE:
+        return AppendItems(items, unicode_field_names_.obj());
+      case KeyKind::BYTES:
+        return AppendItems(items, bytes_field_names_.obj());
+      default:
+        RETURN_NOT_OK(InferKeyKind(items));
+        if (key_kind_ == KeyKind::UNKNOWN) {
+          // was unable to infer the type which means that all keys are absent
+          return AppendEmpty();
+        } else {
+          return AppendItems(items);
+        }
+    }
+  }
+
+  Status AppendDict(PyObject* dict, PyObject* field_names) {
+    // NOTE we're ignoring any extraneous dict items
+    for (int i = 0; i < num_fields_; i++) {
+      PyObject* name = PyList_GetItemRef(field_names, i);
+      RETURN_IF_PYERROR();
+      OwnedRef nameref(name);
+      PyObject* value;
+      PyDict_GetItemRef(dict, name, &value);
+      RETURN_IF_PYERROR();
+      OwnedRef valueref(value);
+      RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None));
+    }
+    return Status::OK();
+  }
+
+  Result<std::pair<PyObject*, PyObject*>> GetKeyValuePair(PyObject* seq, int index) {
+    PyObject* pair = PySequence_GetItem(seq, index);
+    RETURN_IF_PYERROR();
+    OwnedRef pair_ref(pair);  // ensure reference count is decreased at scope end
+    if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) {
+      return internal::InvalidType(pair, "was expecting tuple of (key, value) pair");
+    }
+    PyObject* key = PyTuple_GetItem(pair, 0);
+    RETURN_IF_PYERROR();
+    PyObject* value = PyTuple_GetItem(pair, 1);
+    RETURN_IF_PYERROR();
+    return std::make_pair(key, value);
+  }
+
+  Status AppendItems(PyObject* items, PyObject* field_names) {
+    auto length = static_cast<int>(PySequence_Size(items));
+    RETURN_IF_PYERROR();
+
+    // append the values for the defined fields
+    for (int i = 0; i < std::min(num_fields_, length); i++) {
+      // retrieve the key-value pair
+      ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i));
+
+      // validate that the key and the field name are equal
+      PyObject* name = PyList_GetItemRef(field_names, i);
+      RETURN_IF_PYERROR();
+      OwnedRef nameref(name);
+      bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ);
+      RETURN_IF_PYERROR();
+
+      // finally append to the respective child builder
+      if (are_equal) {
+        RETURN_NOT_OK(this->children_[i]->Append(pair.second));
+      } else {
+        ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first));
+        ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name));
+        return Status::Invalid("The expected field name is `", name_view.bytes, "` but `",
+                               key_view.bytes, "` was given");
+      }
+    }
+    // insert null values for missing fields
+    for (int i = length; i < num_fields_; i++) {
+      RETURN_NOT_OK(this->children_[i]->AppendNull());
+    }
+    return Status::OK();
+  }
+
+  // Whether we're converting from a sequence of dicts or tuples or list of pairs
+  enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN;
+  // Whether the input dictionary keys' type is python bytes or unicode
+  enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN;
+  // Store the field names as a PyObjects for dict matching
+  OwnedRef bytes_field_names_;
+  OwnedRef unicode_field_names_;
+  // Store the number of fields for later reuse
+  int num_fields_;
+};
+
+// Convert *obj* to a sequence if necessary
+// Fill *size* to its length.  If >= 0 on entry, *size* is an upper size
+// bound that may lead to truncation.
+Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* size) {
+  if (PySequence_Check(obj)) {
+    // obj is already a sequence
+    int64_t real_size = static_cast<int64_t>(PySequence_Size(obj));
+    RETURN_IF_PYERROR();
+    if (*size < 0) {
+      *size = real_size;
+    } else {
+      *size = std::min(real_size, *size);
+    }
+    Py_INCREF(obj);
+    *seq = obj;
+  } else if (*size < 0) {
+    // unknown size, exhaust iterator
+    *seq = PySequence_List(obj);
+    RETURN_IF_PYERROR();
+    *size = static_cast<int64_t>(PyList_GET_SIZE(*seq));
+  } else {
+    // size is known but iterator could be infinite
+    Py_ssize_t i, n = *size;
+    PyObject* iter = PyObject_GetIter(obj);
+    RETURN_IF_PYERROR();
+    OwnedRef iter_ref(iter);
+    PyObject* lst = PyList_New(n);
+    RETURN_IF_PYERROR();
+    for (i = 0; i < n; i++) {
+      PyObject* item = PyIter_Next(iter);
+      if (!item) {
+        // either an error occurred or the iterator ended
+        RETURN_IF_PYERROR();
+        break;
+      }
+      PyList_SET_ITEM(lst, i, item);
+    }
+    // Shrink list if len(iterator) < size
+    if (i < n && PyList_SetSlice(lst, i, n, NULL)) {
+      Py_DECREF(lst);
+      RETURN_IF_PYERROR();
+    }
+    *seq = lst;
+    *size = std::min<int64_t>(i, *size);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject* mask,
+                                                        PyConversionOptions options,
+                                                        MemoryPool* pool) {
+  PyAcquireGIL lock;
+
+  PyObject* seq = nullptr;
+  OwnedRef tmp_seq_nanny;
+
+  ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas"));
+  if (is_pandas_imported) {
+    // If pandas has been already imported initialize the static pandas objects to
+    // support converting from pd.Timedelta and pd.Timestamp objects
+    internal::InitPandasStaticData();
+  }
+
+  int64_t size = options.size;
+  RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size));
+  tmp_seq_nanny.reset(seq);
+
+  // In some cases, type inference may be "loose", like strings. If the user
+  // passed pa.string(), then we will error if we encounter any non-UTF8
+  // value. If not, then we will allow the result to be a BinaryArray
+  if (options.type == nullptr) {
+    ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas));
+    options.strict = false;
+  } else {
+    options.strict = true;
+  }
+  ARROW_DCHECK_GE(size, 0);
+
+  ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
+                                            options.type, options, pool)));
+  if (converter->may_overflow()) {
+    // The converter hierarchy contains binary- or list-like builders which can overflow
+    // depending on the input values. Wrap the converter with a chunker which detects
+    // the overflow and automatically creates new chunks.
+    ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter)));
+    if (mask != nullptr && mask != Py_None) {
+      RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size));
+    } else {
+      RETURN_NOT_OK(chunked_converter->Extend(seq, size));
+    }
+    return chunked_converter->ToChunkedArray();
+  } else {
+    // If the converter can't overflow spare the capacity error checking on the hot-path,
+    // this improves the performance roughly by ~10% for primitive types.
+    if (mask != nullptr && mask != Py_None) {
+      RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size));
+    } else {
+      RETURN_NOT_OK(converter->Extend(seq, size));
+    }
+    return converter->ToChunkedArray();
+  }
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/python_to_arrow.h b/pyarrow/src/arrow/python/python_to_arrow.h
new file mode 100644
index 0000000000000000000000000000000000000000..d167996ba8da6796ac62da0fa0186419a3211930
--- /dev/null
+++ b/pyarrow/src/arrow/python/python_to_arrow.h
@@ -0,0 +1,80 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Functions for converting between CPython built-in data structures and Arrow
+// data structures
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "arrow/python/visibility.h"
+#include "arrow/type.h"
+#include "arrow/util/macros.h"
+
+#include "arrow/python/common.h"
+
+namespace arrow {
+
+class Array;
+class Status;
+
+namespace py {
+
+struct PyConversionOptions {
+  PyConversionOptions() = default;
+
+  PyConversionOptions(const std::shared_ptr<DataType>& type, int64_t size,
+                      MemoryPool* pool, bool from_pandas)
+      : type(type), size(size), from_pandas(from_pandas) {}
+
+  // Set to null if to be inferred
+  std::shared_ptr<DataType> type;
+
+  // Default is -1, which indicates the size should the same as the input sequence
+  int64_t size = -1;
+
+  bool from_pandas = false;
+
+  /// Used to maintain backwards compatibility for
+  /// timezone bugs (see ARROW-9528).  Should be removed
+  /// after Arrow 2.0 release.
+  bool ignore_timezone = false;
+
+  bool strict = false;
+};
+
+/// \brief Convert sequence (list, generator, NumPy array with dtype object) of
+/// Python objects.
+/// \param[in] obj the sequence to convert
+/// \param[in] mask a NumPy array of true/false values to indicate whether
+/// values in the sequence are null (true) or not null (false). This parameter
+/// may be null
+/// \param[in] options various conversion options
+/// \param[in] pool MemoryPool to use for allocations
+/// \return Result ChunkedArray
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(
+    PyObject* obj, PyObject* mask, PyConversionOptions options,
+    MemoryPool* pool = default_memory_pool());
+
+}  // namespace py
+
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/type_traits.h b/pyarrow/src/arrow/python/type_traits.h
new file mode 100644
index 0000000000000000000000000000000000000000..865e1af4276711b07de28185ce22bf7663a3cdbb
--- /dev/null
+++ b/pyarrow/src/arrow/python/type_traits.h
@@ -0,0 +1,353 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Internal header
+
+#pragma once
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <limits>
+
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/float16.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace py {
+
+static constexpr int64_t kPandasTimestampNull = std::numeric_limits<int64_t>::min();
+constexpr int64_t kNanosecondsInDay = 86400000000000LL;
+
+namespace internal {
+
+//
+// Type traits for Numpy -> Arrow equivalence
+//
+template <int TYPE>
+struct npy_traits {};
+
+template <>
+struct npy_traits<NPY_BOOL> {
+  typedef uint8_t value_type;
+  using TypeClass = BooleanType;
+  using BuilderClass = BooleanBuilder;
+
+  static constexpr bool supports_nulls = false;
+  static inline bool isnull(uint8_t v) { return false; }
+};
+
+#define NPY_INT_DECL(TYPE, CapType, T)               \
+  template <>                                        \
+  struct npy_traits<NPY_##TYPE> {                    \
+    typedef T value_type;                            \
+    using TypeClass = CapType##Type;                 \
+    using BuilderClass = CapType##Builder;           \
+                                                     \
+    static constexpr bool supports_nulls = false;    \
+    static inline bool isnull(T v) { return false; } \
+  };
+
+NPY_INT_DECL(INT8, Int8, int8_t);
+NPY_INT_DECL(INT16, Int16, int16_t);
+NPY_INT_DECL(INT32, Int32, int32_t);
+NPY_INT_DECL(INT64, Int64, int64_t);
+
+NPY_INT_DECL(UINT8, UInt8, uint8_t);
+NPY_INT_DECL(UINT16, UInt16, uint16_t);
+NPY_INT_DECL(UINT32, UInt32, uint32_t);
+NPY_INT_DECL(UINT64, UInt64, uint64_t);
+
+#if !NPY_INT32_IS_INT && NPY_BITSOF_INT == 32
+NPY_INT_DECL(INT, Int32, int32_t);
+NPY_INT_DECL(UINT, UInt32, uint32_t);
+#endif
+#if !NPY_INT64_IS_LONG_LONG && NPY_BITSOF_LONGLONG == 64
+NPY_INT_DECL(LONGLONG, Int64, int64_t);
+NPY_INT_DECL(ULONGLONG, UInt64, uint64_t);
+#endif
+
+template <>
+struct npy_traits<NPY_FLOAT16> {
+  typedef uint16_t value_type;
+  using TypeClass = HalfFloatType;
+  using BuilderClass = HalfFloatBuilder;
+
+  static constexpr uint16_t na_sentinel =
+      std::numeric_limits<arrow::util::Float16>::quiet_NaN().bits();
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(uint16_t v) {
+    return arrow::util::Float16::FromBits(v).is_nan();
+  }
+};
+
+template <>
+struct npy_traits<NPY_FLOAT32> {
+  typedef float value_type;
+  using TypeClass = FloatType;
+  using BuilderClass = FloatBuilder;
+
+  // We need to use quiet_NaN here instead of the NAN macro as on Windows
+  // the NAN macro leads to "division-by-zero" compile-time error with clang.
+  static constexpr float na_sentinel = std::numeric_limits<float>::quiet_NaN();
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(float v) { return v != v; }
+};
+
+template <>
+struct npy_traits<NPY_FLOAT64> {
+  typedef double value_type;
+  using TypeClass = DoubleType;
+  using BuilderClass = DoubleBuilder;
+
+  static constexpr double na_sentinel = std::numeric_limits<double>::quiet_NaN();
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(double v) { return v != v; }
+};
+
+template <>
+struct npy_traits<NPY_DATETIME> {
+  typedef int64_t value_type;
+  using TypeClass = TimestampType;
+  using BuilderClass = TimestampBuilder;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(int64_t v) {
+    // NaT = -2**63
+    // = -0x8000000000000000
+    // = -9223372036854775808;
+    // = std::numeric_limits<int64_t>::min()
+    return v == std::numeric_limits<int64_t>::min();
+  }
+};
+
+template <>
+struct npy_traits<NPY_TIMEDELTA> {
+  typedef int64_t value_type;
+  using TypeClass = DurationType;
+  using BuilderClass = DurationBuilder;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(int64_t v) {
+    // NaT = -2**63 = std::numeric_limits<int64_t>::min()
+    return v == std::numeric_limits<int64_t>::min();
+  }
+};
+
+template <>
+struct npy_traits<NPY_OBJECT> {
+  typedef PyObject* value_type;
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(PyObject* v) { return v == Py_None; }
+};
+
+//
+// Type traits for Arrow -> Numpy equivalence
+// Note *supports_nulls* means the equivalent Numpy type support nulls
+//
+template <int TYPE>
+struct arrow_traits {};
+
+template <>
+struct arrow_traits<Type::BOOL> {
+  static constexpr int npy_type = NPY_BOOL;
+  static constexpr bool supports_nulls = false;
+  typedef typename npy_traits<NPY_BOOL>::value_type T;
+};
+
+#define INT_DECL(TYPE)                                                           \
+  template <>                                                                    \
+  struct arrow_traits<Type::TYPE> {                                              \
+    static constexpr int npy_type = NPY_##TYPE;                                  \
+    static constexpr bool supports_nulls = false;                                \
+    static constexpr double na_value = std::numeric_limits<double>::quiet_NaN(); \
+    typedef typename npy_traits<NPY_##TYPE>::value_type T;                       \
+  };
+
+INT_DECL(INT8);
+INT_DECL(INT16);
+INT_DECL(INT32);
+INT_DECL(INT64);
+INT_DECL(UINT8);
+INT_DECL(UINT16);
+INT_DECL(UINT32);
+INT_DECL(UINT64);
+
+template <>
+struct arrow_traits<Type::HALF_FLOAT> {
+  static constexpr int npy_type = NPY_FLOAT16;
+  static constexpr bool supports_nulls = true;
+  static constexpr uint16_t na_value =
+      std::numeric_limits<arrow::util::Float16>::quiet_NaN().bits();
+  typedef typename npy_traits<NPY_FLOAT16>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::FLOAT> {
+  static constexpr int npy_type = NPY_FLOAT32;
+  static constexpr bool supports_nulls = true;
+  static constexpr float na_value = std::numeric_limits<float>::quiet_NaN();
+  typedef typename npy_traits<NPY_FLOAT32>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::DOUBLE> {
+  static constexpr int npy_type = NPY_FLOAT64;
+  static constexpr bool supports_nulls = true;
+  static constexpr double na_value = std::numeric_limits<double>::quiet_NaN();
+  typedef typename npy_traits<NPY_FLOAT64>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::TIMESTAMP> {
+  static constexpr int npy_type = NPY_DATETIME;
+  static constexpr int64_t npy_shift = 1;
+
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::DURATION> {
+  static constexpr int npy_type = NPY_TIMEDELTA;
+  static constexpr int64_t npy_shift = 1;
+
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  typedef typename npy_traits<NPY_TIMEDELTA>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::DATE32> {
+  // Data stores as FR_D day unit
+  static constexpr int npy_type = NPY_DATETIME;
+  static constexpr int64_t npy_shift = 1;
+
+  static constexpr bool supports_nulls = true;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+};
+
+template <>
+struct arrow_traits<Type::DATE64> {
+  // Data stores as FR_D day unit
+  static constexpr int npy_type = NPY_DATETIME;
+
+  // There are 1000 * 60 * 60 * 24 = 86400000ms in a day
+  static constexpr int64_t npy_shift = 86400000;
+
+  static constexpr bool supports_nulls = true;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  static inline bool isnull(int64_t v) { return npy_traits<NPY_DATETIME>::isnull(v); }
+};
+
+template <>
+struct arrow_traits<Type::TIME32> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = kPandasTimestampNull;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::TIME64> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
+template <>
+struct arrow_traits<Type::STRING> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+};
+
+template <>
+struct arrow_traits<Type::BINARY> {
+  static constexpr int npy_type = NPY_OBJECT;
+  static constexpr bool supports_nulls = true;
+};
+
+static inline NPY_DATETIMEUNIT NumPyFrequency(TimeUnit::type unit) {
+  switch (unit) {
+    case TimestampType::Unit::SECOND:
+      return NPY_FR_s;
+    case TimestampType::Unit::MILLI:
+      return NPY_FR_ms;
+      break;
+    case TimestampType::Unit::MICRO:
+      return NPY_FR_us;
+    default:
+      // NANO
+      return NPY_FR_ns;
+  }
+}
+
+static inline int NumPyTypeSize(int npy_type) {
+  npy_type = fix_numpy_type_num(npy_type);
+
+  switch (npy_type) {
+    case NPY_BOOL:
+    case NPY_INT8:
+    case NPY_UINT8:
+      return 1;
+    case NPY_INT16:
+    case NPY_UINT16:
+      return 2;
+    case NPY_INT32:
+    case NPY_UINT32:
+      return 4;
+    case NPY_INT64:
+    case NPY_UINT64:
+      return 8;
+    case NPY_FLOAT16:
+      return 2;
+    case NPY_FLOAT32:
+      return 4;
+    case NPY_FLOAT64:
+      return 8;
+    case NPY_DATETIME:
+      return 8;
+    case NPY_OBJECT:
+      return sizeof(void*);
+    default:
+      ARROW_CHECK(false) << "unhandled numpy type";
+      break;
+  }
+  return -1;
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/udf.cc b/pyarrow/src/arrow/python/udf.cc
new file mode 100644
index 0000000000000000000000000000000000000000..940e403ef693bad99aeab11022cb5c4e868a4854
--- /dev/null
+++ b/pyarrow/src/arrow/python/udf.cc
@@ -0,0 +1,707 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/udf.h"
+
+#include "arrow/array/array_nested.h"
+#include "arrow/array/builder_base.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/row/grouper.h"
+#include "arrow/python/common.h"
+#include "arrow/python/vendored/pythoncapi_compat.h"
+#include "arrow/table.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+using compute::ExecSpan;
+using compute::Grouper;
+using compute::KernelContext;
+using compute::KernelState;
+using internal::checked_cast;
+
+namespace py {
+namespace {
+
+struct PythonUdfKernelState : public compute::KernelState {
+  // NOTE: this KernelState constructor doesn't require the GIL.
+  // If it did, the corresponding KernelInit::operator() should be wrapped
+  // within SafeCallIntoPython (GH-43487).
+  explicit PythonUdfKernelState(std::shared_ptr<OwnedRefNoGIL> function)
+      : function(std::move(function)) {}
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+};
+
+struct PythonUdfKernelInit {
+  explicit PythonUdfKernelInit(std::shared_ptr<OwnedRefNoGIL> function)
+      : function(std::move(function)) {}
+
+  Result<std::unique_ptr<compute::KernelState>> operator()(
+      compute::KernelContext*, const compute::KernelInitArgs&) {
+    return std::make_unique<PythonUdfKernelState>(function);
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+};
+
+struct ScalarUdfAggregator : public compute::KernelState {
+  virtual Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) = 0;
+  virtual Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) = 0;
+  virtual Status Finalize(compute::KernelContext* ctx, Datum* out) = 0;
+};
+
+struct HashUdfAggregator : public compute::KernelState {
+  virtual Status Resize(KernelContext* ctx, int64_t size) = 0;
+  virtual Status Consume(KernelContext* ctx, const ExecSpan& batch) = 0;
+  virtual Status Merge(KernelContext* ct, KernelState&& other, const ArrayData&) = 0;
+  virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
+};
+
+Status AggregateUdfConsume(compute::KernelContext* ctx, const compute::ExecSpan& batch) {
+  return checked_cast<ScalarUdfAggregator*>(ctx->state())->Consume(ctx, batch);
+}
+
+Status AggregateUdfMerge(compute::KernelContext* ctx, compute::KernelState&& src,
+                         compute::KernelState* dst) {
+  return checked_cast<ScalarUdfAggregator*>(dst)->MergeFrom(ctx, std::move(src));
+}
+
+Status AggregateUdfFinalize(compute::KernelContext* ctx, arrow::Datum* out) {
+  return checked_cast<ScalarUdfAggregator*>(ctx->state())->Finalize(ctx, out);
+}
+
+Status HashAggregateUdfResize(KernelContext* ctx, int64_t size) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())->Resize(ctx, size);
+}
+
+Status HashAggregateUdfConsume(KernelContext* ctx, const ExecSpan& batch) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())->Consume(ctx, batch);
+}
+
+Status HashAggregateUdfMerge(KernelContext* ctx, KernelState&& src,
+                             const ArrayData& group_id_mapping) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())
+      ->Merge(ctx, std::move(src), group_id_mapping);
+}
+
+Status HashAggregateUdfFinalize(KernelContext* ctx, Datum* out) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())->Finalize(ctx, out);
+}
+
+struct PythonTableUdfKernelInit {
+  PythonTableUdfKernelInit(std::shared_ptr<OwnedRefNoGIL> function_maker,
+                           UdfWrapperCallback cb)
+      : function_maker(std::move(function_maker)), cb(std::move(cb)) {}
+
+  Result<std::unique_ptr<compute::KernelState>> operator()(
+      compute::KernelContext* ctx, const compute::KernelInitArgs&) {
+    return SafeCallIntoPython(
+        [this, ctx]() -> Result<std::unique_ptr<compute::KernelState>> {
+          UdfContext udf_context{ctx->memory_pool(), /*batch_length=*/0};
+          OwnedRef empty_tuple(PyTuple_New(0));
+          auto function = std::make_shared<OwnedRefNoGIL>(
+              cb(function_maker->obj(), udf_context, empty_tuple.obj()));
+          RETURN_NOT_OK(CheckPyError());
+          if (!PyCallable_Check(function->obj())) {
+            return Status::TypeError("Expected a callable Python object.");
+          }
+          return std::make_unique<PythonUdfKernelState>(std::move(function));
+        });
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function_maker;
+  UdfWrapperCallback cb;
+};
+
+struct PythonUdfScalarAggregatorImpl : public ScalarUdfAggregator {
+  PythonUdfScalarAggregatorImpl(std::shared_ptr<OwnedRefNoGIL> function,
+                                UdfWrapperCallback cb,
+                                std::vector<std::shared_ptr<DataType>> input_types,
+                                std::shared_ptr<DataType> output_type)
+      : function(std::move(function)),
+        cb(std::move(cb)),
+        output_type(std::move(output_type)) {
+    std::vector<std::shared_ptr<Field>> fields;
+    for (size_t i = 0; i < input_types.size(); i++) {
+      fields.push_back(field("", input_types[i]));
+    }
+    input_schema = schema(std::move(fields));
+  };
+
+  Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) override {
+    ARROW_ASSIGN_OR_RAISE(
+        auto rb, batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool()));
+    values.push_back(std::move(rb));
+    return Status::OK();
+  }
+
+  Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) override {
+    auto& other_values = checked_cast<PythonUdfScalarAggregatorImpl&>(src).values;
+    values.insert(values.end(), std::make_move_iterator(other_values.begin()),
+                  std::make_move_iterator(other_values.end()));
+
+    other_values.erase(other_values.begin(), other_values.end());
+    return Status::OK();
+  }
+
+  Status Finalize(compute::KernelContext* ctx, Datum* out) override {
+    auto state =
+        arrow::internal::checked_cast<PythonUdfScalarAggregatorImpl*>(ctx->state());
+    const int num_args = input_schema->num_fields();
+
+    // Note: The way that batches are concatenated together
+    // would result in using double amount of the memory.
+    // This is OK for now because non decomposable aggregate
+    // UDF is supposed to be used with segmented aggregation
+    // where the size of the segment is more or less constant
+    // so doubling that is not a big deal. This can be also
+    // improved in the future to use more efficient way to
+    // concatenate.
+    ARROW_ASSIGN_OR_RAISE(auto table,
+                          arrow::Table::FromRecordBatches(input_schema, values));
+    ARROW_ASSIGN_OR_RAISE(table, table->CombineChunks(ctx->memory_pool()));
+    UdfContext udf_context{ctx->memory_pool(), table->num_rows()};
+
+    if (table->num_rows() == 0) {
+      return Status::Invalid("Finalized is called with empty inputs");
+    }
+
+    RETURN_NOT_OK(SafeCallIntoPython([&] {
+      std::unique_ptr<OwnedRef> result;
+      OwnedRef arg_tuple(PyTuple_New(num_args));
+      RETURN_NOT_OK(CheckPyError());
+
+      for (int arg_id = 0; arg_id < num_args; arg_id++) {
+        // Since we combined chunks there is only one chunk
+        std::shared_ptr<Array> c_data = table->column(arg_id)->chunk(0);
+        PyObject* data = wrap_array(c_data);
+        PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+      }
+      result =
+          std::make_unique<OwnedRef>(cb(function->obj(), udf_context, arg_tuple.obj()));
+      RETURN_NOT_OK(CheckPyError());
+      // unwrapping the output for expected output type
+      if (is_scalar(result->obj())) {
+        ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> val, unwrap_scalar(result->obj()));
+        if (*output_type != *val->type) {
+          return Status::TypeError("Expected output datatype ", output_type->ToString(),
+                                   ", but function returned datatype ",
+                                   val->type->ToString());
+        }
+        out->value = std::move(val);
+        return Status::OK();
+      }
+      return Status::TypeError("Unexpected output type: ",
+                               Py_TYPE(result->obj())->tp_name, " (expected Scalar)");
+    }));
+    return Status::OK();
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+  UdfWrapperCallback cb;
+  std::vector<std::shared_ptr<RecordBatch>> values;
+  std::shared_ptr<Schema> input_schema;
+  std::shared_ptr<DataType> output_type;
+};
+
+struct PythonUdfHashAggregatorImpl : public HashUdfAggregator {
+  PythonUdfHashAggregatorImpl(std::shared_ptr<OwnedRefNoGIL> function,
+                              UdfWrapperCallback cb,
+                              std::vector<std::shared_ptr<DataType>> input_types,
+                              std::shared_ptr<DataType> output_type)
+      : function(std::move(function)),
+        cb(std::move(cb)),
+        output_type(std::move(output_type)) {
+    std::vector<std::shared_ptr<Field>> fields;
+    fields.reserve(input_types.size());
+    for (size_t i = 0; i < input_types.size(); i++) {
+      fields.push_back(field("", input_types[i]));
+    }
+    input_schema = schema(std::move(fields));
+  };
+
+  // same as ApplyGrouping in partition.cc
+  // replicated the code here to avoid complicating the dependencies
+  static Result<RecordBatchVector> ApplyGroupings(
+      const ListArray& groupings, const std::shared_ptr<RecordBatch>& batch) {
+    ARROW_ASSIGN_OR_RAISE(Datum sorted,
+                          compute::Take(batch, groupings.data()->child_data[0]));
+
+    const auto& sorted_batch = *sorted.record_batch();
+
+    RecordBatchVector out(static_cast<size_t>(groupings.length()));
+    for (size_t i = 0; i < out.size(); ++i) {
+      out[i] = sorted_batch.Slice(groupings.value_offset(i), groupings.value_length(i));
+    }
+
+    return out;
+  }
+
+  Status Resize(KernelContext* ctx, int64_t new_num_groups) override {
+    // We only need to change num_groups in resize
+    // similar to other hash aggregate kernels
+    num_groups = new_num_groups;
+    return Status::OK();
+  }
+
+  Status Consume(KernelContext* ctx, const ExecSpan& batch) override {
+    ARROW_ASSIGN_OR_RAISE(
+        std::shared_ptr<RecordBatch> rb,
+        batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool()));
+
+    // This is similar to GroupedListImpl
+    // last array is the group id
+    const ArraySpan& groups_array_data = batch[batch.num_values() - 1].array;
+    ARROW_DCHECK_EQ(groups_array_data.offset, 0);
+    int64_t batch_num_values = groups_array_data.length;
+    const auto* batch_groups = groups_array_data.GetValues<uint32_t>(1);
+    RETURN_NOT_OK(groups.Append(batch_groups, batch_num_values));
+    values.push_back(std::move(rb));
+    num_values += batch_num_values;
+    return Status::OK();
+  }
+  Status Merge(KernelContext* ctx, KernelState&& other_state,
+               const ArrayData& group_id_mapping) override {
+    // This is similar to GroupedListImpl
+    auto& other = checked_cast<PythonUdfHashAggregatorImpl&>(other_state);
+    auto& other_values = other.values;
+    const uint32_t* other_raw_groups = other.groups.data();
+    values.insert(values.end(), std::make_move_iterator(other_values.begin()),
+                  std::make_move_iterator(other_values.end()));
+
+    auto g = group_id_mapping.GetValues<uint32_t>(1);
+    for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < other.num_values;
+         ++other_g) {
+      // Different state can have different group_id mappings, so we
+      // need to translate the ids
+      RETURN_NOT_OK(groups.Append(g[other_raw_groups[other_g]]));
+    }
+
+    num_values += other.num_values;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext* ctx, Datum* out) override {
+    // Exclude the last column which is the group id
+    const int num_args = input_schema->num_fields() - 1;
+
+    ARROW_ASSIGN_OR_RAISE(auto groups_buffer, groups.Finish());
+    ARROW_ASSIGN_OR_RAISE(auto groupings,
+                          Grouper::MakeGroupings(UInt32Array(num_values, groups_buffer),
+                                                 static_cast<uint32_t>(num_groups)));
+
+    ARROW_ASSIGN_OR_RAISE(auto table,
+                          arrow::Table::FromRecordBatches(input_schema, values));
+    ARROW_ASSIGN_OR_RAISE(auto rb, table->CombineChunksToBatch(ctx->memory_pool()));
+    UdfContext udf_context{ctx->memory_pool(), table->num_rows()};
+
+    if (rb->num_rows() == 0) {
+      *out = Datum();
+      return Status::OK();
+    }
+
+    ARROW_ASSIGN_OR_RAISE(RecordBatchVector rbs, ApplyGroupings(*groupings, rb));
+
+    return SafeCallIntoPython([&] {
+      ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ArrayBuilder> builder,
+                            MakeBuilder(output_type, ctx->memory_pool()));
+      for (auto& group_rb : rbs) {
+        std::unique_ptr<OwnedRef> result;
+        OwnedRef arg_tuple(PyTuple_New(num_args));
+        RETURN_NOT_OK(CheckPyError());
+
+        for (int arg_id = 0; arg_id < num_args; arg_id++) {
+          // Since we combined chunks there is only one chunk
+          std::shared_ptr<Array> c_data = group_rb->column(arg_id);
+          PyObject* data = wrap_array(c_data);
+          PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+        }
+
+        result =
+            std::make_unique<OwnedRef>(cb(function->obj(), udf_context, arg_tuple.obj()));
+        RETURN_NOT_OK(CheckPyError());
+
+        // unwrapping the output for expected output type
+        if (is_scalar(result->obj())) {
+          ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> val,
+                                unwrap_scalar(result->obj()));
+          if (*output_type != *val->type) {
+            return Status::TypeError("Expected output datatype ", output_type->ToString(),
+                                     ", but function returned datatype ",
+                                     val->type->ToString());
+          }
+          ARROW_RETURN_NOT_OK(builder->AppendScalar(std::move(*val)));
+        } else {
+          return Status::TypeError("Unexpected output type: ",
+                                   Py_TYPE(result->obj())->tp_name, " (expected Scalar)");
+        }
+      }
+      ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+      out->value = std::move(result->data());
+      return Status::OK();
+    });
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+  UdfWrapperCallback cb;
+  // Accumulated input batches
+  std::vector<std::shared_ptr<RecordBatch>> values;
+  // Group ids - extracted from the last column from the batch
+  TypedBufferBuilder<uint32_t> groups;
+  int64_t num_groups = 0;
+  int64_t num_values = 0;
+  std::shared_ptr<Schema> input_schema;
+  std::shared_ptr<DataType> output_type;
+};
+
+struct PythonUdf : public PythonUdfKernelState {
+  PythonUdf(std::shared_ptr<OwnedRefNoGIL> function, UdfWrapperCallback cb,
+            std::vector<TypeHolder> input_types, compute::OutputType output_type)
+      : PythonUdfKernelState(std::move(function)),
+        cb(std::move(cb)),
+        input_types(std::move(input_types)),
+        output_type(std::move(output_type)) {}
+
+  UdfWrapperCallback cb;
+  std::vector<TypeHolder> input_types;
+  compute::OutputType output_type;
+  TypeHolder resolved_type;
+
+  Result<TypeHolder> ResolveType(compute::KernelContext* ctx,
+                                 const std::vector<TypeHolder>& types) {
+    if (input_types == types) {
+      if (!resolved_type) {
+        ARROW_ASSIGN_OR_RAISE(resolved_type, output_type.Resolve(ctx, input_types));
+      }
+      return resolved_type;
+    }
+    return output_type.Resolve(ctx, types);
+  }
+
+  Status Exec(compute::KernelContext* ctx, const compute::ExecSpan& batch,
+              compute::ExecResult* out) {
+    auto state = arrow::internal::checked_cast<PythonUdfKernelState*>(ctx->state());
+    PyObject* function = state->function->obj();
+    const int num_args = batch.num_values();
+    UdfContext udf_context{ctx->memory_pool(), batch.length};
+
+    OwnedRef arg_tuple(PyTuple_New(num_args));
+    RETURN_NOT_OK(CheckPyError());
+    for (int arg_id = 0; arg_id < num_args; arg_id++) {
+      if (batch[arg_id].is_scalar()) {
+        std::shared_ptr<Scalar> c_data = batch[arg_id].scalar->GetSharedPtr();
+        PyObject* data = wrap_scalar(c_data);
+        PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+      } else {
+        std::shared_ptr<Array> c_data = batch[arg_id].array.ToArray();
+        PyObject* data = wrap_array(c_data);
+        PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+      }
+    }
+
+    OwnedRef result(cb(function, udf_context, arg_tuple.obj()));
+    RETURN_NOT_OK(CheckPyError());
+    // unwrapping the output for expected output type
+    if (is_array(result.obj())) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> val, unwrap_array(result.obj()));
+      ARROW_ASSIGN_OR_RAISE(TypeHolder type, ResolveType(ctx, batch.GetTypes()));
+      if (type.type == NULLPTR) {
+        return Status::TypeError("expected output datatype is null");
+      }
+      if (*type.type != *val->type()) {
+        return Status::TypeError("Expected output datatype ", type.type->ToString(),
+                                 ", but function returned datatype ",
+                                 val->type()->ToString());
+      }
+      out->value = std::move(val->data());
+      return Status::OK();
+    } else {
+      return Status::TypeError("Unexpected output type: ", Py_TYPE(result.obj())->tp_name,
+                               " (expected Array)");
+    }
+    return Status::OK();
+  }
+};
+
+Status PythonUdfExec(compute::KernelContext* ctx, const compute::ExecSpan& batch,
+                     compute::ExecResult* out) {
+  auto udf = static_cast<PythonUdf*>(ctx->kernel()->data.get());
+  return SafeCallIntoPython([&]() -> Status { return udf->Exec(ctx, batch, out); });
+}
+
+template <class Function, class Kernel>
+Status RegisterUdf(PyObject* function, compute::KernelInit kernel_init,
+                   UdfWrapperCallback cb, const UdfOptions& options,
+                   compute::FunctionRegistry* registry) {
+  if (!PyCallable_Check(function)) {
+    return Status::TypeError("Expected a callable Python object.");
+  }
+  auto scalar_func =
+      std::make_shared<Function>(options.func_name, options.arity, options.func_doc);
+  std::vector<compute::InputType> input_types;
+  for (const auto& in_dtype : options.input_types) {
+    input_types.emplace_back(in_dtype);
+  }
+  compute::OutputType output_type(options.output_type);
+  // Take reference before wrapping with OwnedRefNoGIL
+  Py_INCREF(function);
+  auto udf_data = std::make_shared<PythonUdf>(
+      std::make_shared<OwnedRefNoGIL>(function), cb,
+      TypeHolder::FromTypes(options.input_types), options.output_type);
+  Kernel kernel(
+      compute::KernelSignature::Make(std::move(input_types), std::move(output_type),
+                                     options.arity.is_varargs),
+      PythonUdfExec, kernel_init);
+  kernel.data = std::move(udf_data);
+
+  kernel.mem_allocation = compute::MemAllocation::NO_PREALLOCATE;
+  kernel.null_handling = compute::NullHandling::COMPUTED_NO_PREALLOCATE;
+  RETURN_NOT_OK(scalar_func->AddKernel(std::move(kernel)));
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+  RETURN_NOT_OK(registry->AddFunction(std::move(scalar_func)));
+  return Status::OK();
+}
+
+}  // namespace
+
+Status RegisterScalarFunction(PyObject* function, UdfWrapperCallback cb,
+                              const UdfOptions& options,
+                              compute::FunctionRegistry* registry) {
+  return RegisterUdf<compute::ScalarFunction, compute::ScalarKernel>(
+      function, PythonUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function)}, cb,
+      options, registry);
+}
+
+Status RegisterVectorFunction(PyObject* function, UdfWrapperCallback cb,
+                              const UdfOptions& options,
+                              compute::FunctionRegistry* registry) {
+  return RegisterUdf<compute::VectorFunction, compute::VectorKernel>(
+      function, PythonUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function)}, cb,
+      options, registry);
+}
+
+Status RegisterTabularFunction(PyObject* function, UdfWrapperCallback cb,
+                               const UdfOptions& options,
+                               compute::FunctionRegistry* registry) {
+  if (options.arity.num_args != 0 || options.arity.is_varargs) {
+    return Status::NotImplemented("tabular function of non-null arity");
+  }
+  if (options.output_type->id() != Type::type::STRUCT) {
+    return Status::Invalid("tabular function with non-struct output");
+  }
+  return RegisterUdf<compute::ScalarFunction, compute::ScalarKernel>(
+      function, PythonTableUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function), cb},
+      cb, options, registry);
+}
+
+Status RegisterScalarAggregateFunction(PyObject* function, UdfWrapperCallback cb,
+                                       const UdfOptions& options,
+                                       compute::FunctionRegistry* registry) {
+  if (!PyCallable_Check(function)) {
+    return Status::TypeError("Expected a callable Python object.");
+  }
+
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+
+  static auto default_scalar_aggregate_options =
+      compute::ScalarAggregateOptions::Defaults();
+  auto aggregate_func = std::make_shared<compute::ScalarAggregateFunction>(
+      options.func_name, options.arity, options.func_doc,
+      &default_scalar_aggregate_options);
+
+  std::vector<compute::InputType> input_types;
+  for (const auto& in_dtype : options.input_types) {
+    input_types.emplace_back(in_dtype);
+  }
+  compute::OutputType output_type(options.output_type);
+
+  // Take reference before wrapping with OwnedRefNoGIL
+  Py_INCREF(function);
+  auto function_ref = std::make_shared<OwnedRefNoGIL>(function);
+
+  compute::KernelInit init = [cb, function_ref, options](
+                                 compute::KernelContext* ctx,
+                                 const compute::KernelInitArgs& args)
+      -> Result<std::unique_ptr<compute::KernelState>> {
+    return std::make_unique<PythonUdfScalarAggregatorImpl>(
+        function_ref, cb, options.input_types, options.output_type);
+  };
+
+  auto sig = compute::KernelSignature::Make(
+      std::move(input_types), std::move(output_type), options.arity.is_varargs);
+  compute::ScalarAggregateKernel kernel(std::move(sig), std::move(init),
+                                        AggregateUdfConsume, AggregateUdfMerge,
+                                        AggregateUdfFinalize, /*ordered=*/false);
+  RETURN_NOT_OK(aggregate_func->AddKernel(std::move(kernel)));
+  RETURN_NOT_OK(registry->AddFunction(std::move(aggregate_func)));
+  return Status::OK();
+}
+
+/// \brief Create a new UdfOptions with adjustment for hash kernel
+/// \param options User provided udf options
+UdfOptions AdjustForHashAggregate(const UdfOptions& options) {
+  UdfOptions hash_options;
+  // Append hash_ before the function name to separate from the scalar
+  // version
+  hash_options.func_name = "hash_" + options.func_name;
+  // Extend input types with group id. Group id is appended by the group
+  // aggregation node. Here we change both arity and input types
+  if (options.arity.is_varargs) {
+    hash_options.arity = options.arity;
+  } else {
+    hash_options.arity = compute::Arity(options.arity.num_args + 1, false);
+  }
+  // Changing the function doc shouldn't be necessarily because group id
+  // is not user visible, however, this is currently needed to pass the
+  // function validation. The name group_id_array is consistent with
+  // hash kernels in hash_aggregate.cc
+  hash_options.func_doc = options.func_doc;
+  hash_options.func_doc.arg_names.emplace_back("group_id_array");
+  std::vector<std::shared_ptr<DataType>> input_dtypes = options.input_types;
+  input_dtypes.emplace_back(uint32());
+  hash_options.input_types = std::move(input_dtypes);
+  hash_options.output_type = options.output_type;
+  return hash_options;
+}
+
+Status RegisterHashAggregateFunction(PyObject* function, UdfWrapperCallback cb,
+                                     const UdfOptions& options,
+                                     compute::FunctionRegistry* registry) {
+  if (!PyCallable_Check(function)) {
+    return Status::TypeError("Expected a callable Python object.");
+  }
+
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+
+  UdfOptions hash_options = AdjustForHashAggregate(options);
+
+  std::vector<compute::InputType> input_types;
+  for (const auto& in_dtype : hash_options.input_types) {
+    input_types.emplace_back(in_dtype);
+  }
+  compute::OutputType output_type(hash_options.output_type);
+
+  static auto default_hash_aggregate_options =
+      compute::ScalarAggregateOptions::Defaults();
+  auto hash_aggregate_func = std::make_shared<compute::HashAggregateFunction>(
+      hash_options.func_name, hash_options.arity, hash_options.func_doc,
+      &default_hash_aggregate_options);
+
+  // Take reference before wrapping with OwnedRefNoGIL
+  Py_INCREF(function);
+  auto function_ref = std::make_shared<OwnedRefNoGIL>(function);
+  compute::KernelInit init = [function_ref, cb, hash_options](
+                                 compute::KernelContext* ctx,
+                                 const compute::KernelInitArgs& args)
+      -> Result<std::unique_ptr<compute::KernelState>> {
+    return std::make_unique<PythonUdfHashAggregatorImpl>(
+        function_ref, cb, hash_options.input_types, hash_options.output_type);
+  };
+
+  auto sig = compute::KernelSignature::Make(
+      std::move(input_types), std::move(output_type), hash_options.arity.is_varargs);
+
+  compute::HashAggregateKernel kernel(
+      std::move(sig), std::move(init), HashAggregateUdfResize, HashAggregateUdfConsume,
+      HashAggregateUdfMerge, HashAggregateUdfFinalize, /*ordered=*/false);
+  RETURN_NOT_OK(hash_aggregate_func->AddKernel(std::move(kernel)));
+  RETURN_NOT_OK(registry->AddFunction(std::move(hash_aggregate_func)));
+  return Status::OK();
+}
+
+Status RegisterAggregateFunction(PyObject* function, UdfWrapperCallback cb,
+                                 const UdfOptions& options,
+                                 compute::FunctionRegistry* registry) {
+  RETURN_NOT_OK(RegisterScalarAggregateFunction(function, cb, options, registry));
+  RETURN_NOT_OK(RegisterHashAggregateFunction(function, cb, options, registry));
+
+  return Status::OK();
+}
+
+Result<std::shared_ptr<RecordBatchReader>> CallTabularFunction(
+    const std::string& func_name, const std::vector<Datum>& args,
+    compute::FunctionRegistry* registry) {
+  if (args.size() != 0) {
+    return Status::NotImplemented("non-empty arguments to tabular function");
+  }
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+  ARROW_ASSIGN_OR_RAISE(auto func, registry->GetFunction(func_name));
+  if (func->kind() != compute::Function::SCALAR) {
+    return Status::Invalid("tabular function of non-scalar kind");
+  }
+  auto arity = func->arity();
+  if (arity.num_args != 0 || arity.is_varargs) {
+    return Status::NotImplemented("tabular function of non-null arity");
+  }
+  auto kernels =
+      arrow::internal::checked_pointer_cast<compute::ScalarFunction>(func)->kernels();
+  if (kernels.size() != 1) {
+    return Status::NotImplemented("tabular function with non-single kernel");
+  }
+  const compute::ScalarKernel* kernel = kernels[0];
+  auto out_type = kernel->signature->out_type();
+  if (out_type.kind() != compute::OutputType::FIXED) {
+    return Status::Invalid("tabular kernel of non-fixed kind");
+  }
+  auto datatype = out_type.type();
+  if (datatype->id() != Type::type::STRUCT) {
+    return Status::Invalid("tabular kernel with non-struct output");
+  }
+  auto struct_type = arrow::internal::checked_cast<StructType*>(datatype.get());
+  auto schema = ::arrow::schema(struct_type->fields());
+  std::vector<TypeHolder> in_types;
+  ARROW_ASSIGN_OR_RAISE(auto func_exec,
+                        GetFunctionExecutor(func_name, in_types, NULLPTR, registry));
+  auto next_func = [schema, func_exec = std::move(
+                                func_exec)]() -> Result<std::shared_ptr<RecordBatch>> {
+    std::vector<Datum> args;
+    // passed_length of -1 or 0 with args.size() of 0 leads to an empty ExecSpanIterator
+    // in exec.cc and to never invoking the source function, so 1 is passed instead
+    // TODO: GH-33612: Support batch size in user-defined tabular functions
+    ARROW_ASSIGN_OR_RAISE(auto datum, func_exec->Execute(args, /*passed_length=*/1));
+    if (!datum.is_array()) {
+      return Status::Invalid("UDF result of non-array kind");
+    }
+    std::shared_ptr<Array> array = datum.make_array();
+    if (array->length() == 0) {
+      return IterationTraits<std::shared_ptr<RecordBatch>>::End();
+    }
+    ARROW_ASSIGN_OR_RAISE(auto batch, RecordBatch::FromStructArray(std::move(array)));
+    if (!schema->Equals(batch->schema())) {
+      return Status::Invalid("UDF result with shape not conforming to schema");
+    }
+    return std::move(batch);
+  };
+  return RecordBatchReader::MakeFromIterator(MakeFunctionIterator(std::move(next_func)),
+                                             schema);
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/udf.h b/pyarrow/src/arrow/python/udf.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8c4e430e53d49a8fe7d237ffe7ba8feae5e452f
--- /dev/null
+++ b/pyarrow/src/arrow/python/udf.h
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/compute/exec.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/registry.h"
+#include "arrow/python/platform.h"
+#include "arrow/record_batch.h"
+#include "arrow/util/iterator.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/pyarrow.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow {
+
+namespace py {
+
+// TODO: TODO(ARROW-16041): UDF Options are not exposed to the Python
+// users. This feature will be included when extending to provide advanced
+// options for the users.
+struct ARROW_PYTHON_EXPORT UdfOptions {
+  std::string func_name;
+  compute::Arity arity;
+  compute::FunctionDoc func_doc;
+  std::vector<std::shared_ptr<DataType>> input_types;
+  std::shared_ptr<DataType> output_type;
+};
+
+/// \brief A context passed as the first argument of UDF functions.
+struct ARROW_PYTHON_EXPORT UdfContext {
+  MemoryPool* pool;
+  int64_t batch_length;
+};
+
+using UdfWrapperCallback = std::function<PyObject*(
+    PyObject* user_function, const UdfContext& context, PyObject* inputs)>;
+
+/// \brief register a Scalar user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterScalarFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+/// \brief register a Table user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterTabularFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+/// \brief register a Aggregate user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterAggregateFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+/// \brief register a Vector user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterVectorFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
+
+Result<std::shared_ptr<RecordBatchReader>> ARROW_PYTHON_EXPORT
+CallTabularFunction(const std::string& func_name, const std::vector<Datum>& args,
+                    compute::FunctionRegistry* registry = NULLPTR);
+
+}  // namespace py
+
+}  // namespace arrow
diff --git a/pyarrow/src/arrow/python/util.cc b/pyarrow/src/arrow/python/util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cffe1eb956b3ed86cade1acfbba58635a71b20d5
--- /dev/null
+++ b/pyarrow/src/arrow/python/util.cc
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/util.h"
+
+#include "arrow/array.h"
+#include "arrow/python/common.h"
+
+namespace arrow ::py {
+
+Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t step,
+                                      MemoryPool* pool) {
+  int64_t size;
+  if (step == 0) {
+    return Status::Invalid("Step must not be zero");
+  }
+  if (step > 0 && stop > start) {
+    // Ceiling division for positive step
+    size = (stop - start + step - 1) / step;
+  } else if (step < 0 && stop < start) {
+    // Ceiling division for negative step
+    size = (start - stop - step - 1) / (-step);
+  } else {
+    return MakeEmptyArray(int64());
+  }
+  std::shared_ptr<Buffer> data_buffer;
+  ARROW_ASSIGN_OR_RAISE(data_buffer, AllocateBuffer(size * sizeof(int64_t), pool));
+  auto values = reinterpret_cast<int64_t*>(data_buffer->mutable_data());
+  for (int64_t i = 0; i < size; ++i) {
+    values[i] = start + i * step;
+  }
+  auto data = ArrayData::Make(int64(), size, {nullptr, data_buffer}, 0);
+  return MakeArray(data);
+}
+
+}  // namespace arrow::py
diff --git a/pyarrow/src/arrow/python/util.h b/pyarrow/src/arrow/python/util.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff2ffcaea9cfd7835733bce04b72447cca7ee372
--- /dev/null
+++ b/pyarrow/src/arrow/python/util.h
@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
+
+namespace arrow::py {
+
+/// \brief Create an array of evenly spaced values within a given interval.
+/// This function is similar to Python's `range` function.
+/// The resulting array will contain values starting from `start` up to but not
+/// including `stop`, with a step size of `step`. If `step` is zero, the function
+/// will return an error.
+/// The resulting array will have a data type of `int64`.
+/// \param[in] start initial value of the sequence.
+/// \param[in] stop final value of the sequence (exclusive).
+/// \param[in] step step size between consecutive values.
+/// \param[in] pool Memory pool for any memory allocations.
+/// \return Result Array
+ARROW_PYTHON_EXPORT
+Result<std::shared_ptr<Array>> Arange(int64_t start, int64_t stop, int64_t step,
+                                      MemoryPool* pool);
+
+}  // namespace arrow::py
diff --git a/pyarrow/src/arrow/python/vendored/CMakeLists.txt b/pyarrow/src/arrow/python/vendored/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6190072c0d384a9dc6b72290a34353ccc0b556c2
--- /dev/null
+++ b/pyarrow/src/arrow/python/vendored/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+arrow_install_all_headers("arrow/python/vendored")
diff --git a/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h b/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h
new file mode 100644
index 0000000000000000000000000000000000000000..4baa7b34a93500e0d0d120a60332fba1ed5091fe
--- /dev/null
+++ b/pyarrow/src/arrow/python/vendored/pythoncapi_compat.h
@@ -0,0 +1,1519 @@
+// Header file providing new C API functions to old Python versions.
+//
+// File distributed under the Zero Clause BSD (0BSD) license.
+// Copyright Contributors to the pythoncapi_compat project.
+//
+// Homepage:
+// https://github.com/python/pythoncapi_compat
+//
+// Latest version:
+// https://raw.githubusercontent.com/python/pythoncapi_compat/master/pythoncapi_compat.h
+//
+// Vendored from git revision:
+// 39e2663e6acc0b68d5dd75bdaad0af33152552ae
+// https://raw.githubusercontent.com/python/pythoncapi-compat/39e2663e6acc0b68d5dd75bdaad0af33152552ae/pythoncapi_compat.h
+//
+// SPDX-License-Identifier: 0BSD
+
+/* clang-format off */
+
+#ifndef PYTHONCAPI_COMPAT
+#define PYTHONCAPI_COMPAT
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <Python.h>
+
+// Python 3.11.0b4 added PyFrame_Back() to Python.h
+#if PY_VERSION_HEX < 0x030b00B4 && !defined(PYPY_VERSION)
+#  include "frameobject.h"        // PyFrameObject, PyFrame_GetBack()
+#endif
+
+
+#ifndef _Py_CAST
+#  define _Py_CAST(type, expr) ((type)(expr))
+#endif
+
+// Static inline functions should use _Py_NULL rather than using directly NULL
+// to prevent C++ compiler warnings. On C23 and newer and on C++11 and newer,
+// _Py_NULL is defined as nullptr.
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ > 201710L) \
+        || (defined(__cplusplus) && __cplusplus >= 201103)
+#  define _Py_NULL nullptr
+#else
+#  define _Py_NULL NULL
+#endif
+
+// Cast argument to PyObject* type.
+#ifndef _PyObject_CAST
+#  define _PyObject_CAST(op) _Py_CAST(PyObject*, op)
+#endif
+
+
+// bpo-42262 added Py_NewRef() to Python 3.10.0a3
+#if PY_VERSION_HEX < 0x030A00A3 && !defined(Py_NewRef)
+static inline PyObject* _Py_NewRef(PyObject *obj)
+{
+    Py_INCREF(obj);
+    return obj;
+}
+#define Py_NewRef(obj) _Py_NewRef(_PyObject_CAST(obj))
+#endif
+
+
+// bpo-42262 added Py_XNewRef() to Python 3.10.0a3
+#if PY_VERSION_HEX < 0x030A00A3 && !defined(Py_XNewRef)
+static inline PyObject* _Py_XNewRef(PyObject *obj)
+{
+    Py_XINCREF(obj);
+    return obj;
+}
+#define Py_XNewRef(obj) _Py_XNewRef(_PyObject_CAST(obj))
+#endif
+
+
+// bpo-39573 added Py_SET_REFCNT() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_REFCNT)
+static inline void _Py_SET_REFCNT(PyObject *ob, Py_ssize_t refcnt)
+{
+    ob->ob_refcnt = refcnt;
+}
+#define Py_SET_REFCNT(ob, refcnt) _Py_SET_REFCNT(_PyObject_CAST(ob), refcnt)
+#endif
+
+
+// Py_SETREF() and Py_XSETREF() were added to Python 3.5.2.
+// It is excluded from the limited C API.
+#if (PY_VERSION_HEX < 0x03050200 && !defined(Py_SETREF)) && !defined(Py_LIMITED_API)
+#define Py_SETREF(dst, src)                                     \
+    do {                                                        \
+        PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \
+        PyObject *_tmp_dst = (*_tmp_dst_ptr);                   \
+        *_tmp_dst_ptr = _PyObject_CAST(src);                    \
+        Py_DECREF(_tmp_dst);                                    \
+    } while (0)
+
+#define Py_XSETREF(dst, src)                                    \
+    do {                                                        \
+        PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \
+        PyObject *_tmp_dst = (*_tmp_dst_ptr);                   \
+        *_tmp_dst_ptr = _PyObject_CAST(src);                    \
+        Py_XDECREF(_tmp_dst);                                   \
+    } while (0)
+#endif
+
+
+// bpo-43753 added Py_Is(), Py_IsNone(), Py_IsTrue() and Py_IsFalse()
+// to Python 3.10.0b1.
+#if PY_VERSION_HEX < 0x030A00B1 && !defined(Py_Is)
+#  define Py_Is(x, y) ((x) == (y))
+#endif
+#if PY_VERSION_HEX < 0x030A00B1 && !defined(Py_IsNone)
+#  define Py_IsNone(x) Py_Is(x, Py_None)
+#endif
+#if (PY_VERSION_HEX < 0x030A00B1 || defined(PYPY_VERSION)) && !defined(Py_IsTrue)
+#  define Py_IsTrue(x) Py_Is(x, Py_True)
+#endif
+#if (PY_VERSION_HEX < 0x030A00B1 || defined(PYPY_VERSION)) && !defined(Py_IsFalse)
+#  define Py_IsFalse(x) Py_Is(x, Py_False)
+#endif
+
+
+// bpo-39573 added Py_SET_TYPE() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_TYPE)
+static inline void _Py_SET_TYPE(PyObject *ob, PyTypeObject *type)
+{
+    ob->ob_type = type;
+}
+#define Py_SET_TYPE(ob, type) _Py_SET_TYPE(_PyObject_CAST(ob), type)
+#endif
+
+
+// bpo-39573 added Py_SET_SIZE() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_SET_SIZE)
+static inline void _Py_SET_SIZE(PyVarObject *ob, Py_ssize_t size)
+{
+    ob->ob_size = size;
+}
+#define Py_SET_SIZE(ob, size) _Py_SET_SIZE((PyVarObject*)(ob), size)
+#endif
+
+
+// bpo-40421 added PyFrame_GetCode() to Python 3.9.0b1
+#if PY_VERSION_HEX < 0x030900B1 || defined(PYPY_VERSION)
+static inline PyCodeObject* PyFrame_GetCode(PyFrameObject *frame)
+{
+    assert(frame != _Py_NULL);
+    assert(frame->f_code != _Py_NULL);
+    return _Py_CAST(PyCodeObject*, Py_NewRef(frame->f_code));
+}
+#endif
+
+static inline PyCodeObject* _PyFrame_GetCodeBorrow(PyFrameObject *frame)
+{
+    PyCodeObject *code = PyFrame_GetCode(frame);
+    Py_DECREF(code);
+    return code;
+}
+
+
+// bpo-40421 added PyFrame_GetBack() to Python 3.9.0b1
+#if PY_VERSION_HEX < 0x030900B1 && !defined(PYPY_VERSION)
+static inline PyFrameObject* PyFrame_GetBack(PyFrameObject *frame)
+{
+    assert(frame != _Py_NULL);
+    return _Py_CAST(PyFrameObject*, Py_XNewRef(frame->f_back));
+}
+#endif
+
+#if !defined(PYPY_VERSION)
+static inline PyFrameObject* _PyFrame_GetBackBorrow(PyFrameObject *frame)
+{
+    PyFrameObject *back = PyFrame_GetBack(frame);
+    Py_XDECREF(back);
+    return back;
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetLocals() to Python 3.11.0a7
+#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetLocals(PyFrameObject *frame)
+{
+#if PY_VERSION_HEX >= 0x030400B1
+    if (PyFrame_FastToLocalsWithError(frame) < 0) {
+        return NULL;
+    }
+#else
+    PyFrame_FastToLocals(frame);
+#endif
+    return Py_NewRef(frame->f_locals);
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetGlobals() to Python 3.11.0a7
+#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetGlobals(PyFrameObject *frame)
+{
+    return Py_NewRef(frame->f_globals);
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetBuiltins() to Python 3.11.0a7
+#if PY_VERSION_HEX < 0x030B00A7 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetBuiltins(PyFrameObject *frame)
+{
+    return Py_NewRef(frame->f_builtins);
+}
+#endif
+
+
+// bpo-40421 added PyFrame_GetLasti() to Python 3.11.0b1
+#if PY_VERSION_HEX < 0x030B00B1 && !defined(PYPY_VERSION)
+static inline int PyFrame_GetLasti(PyFrameObject *frame)
+{
+#if PY_VERSION_HEX >= 0x030A00A7
+    // bpo-27129: Since Python 3.10.0a7, f_lasti is an instruction offset,
+    // not a bytes offset anymore. Python uses 16-bit "wordcode" (2 bytes)
+    // instructions.
+    if (frame->f_lasti < 0) {
+        return -1;
+    }
+    return frame->f_lasti * 2;
+#else
+    return frame->f_lasti;
+#endif
+}
+#endif
+
+
+// gh-91248 added PyFrame_GetVar() to Python 3.12.0a2
+#if PY_VERSION_HEX < 0x030C00A2 && !defined(PYPY_VERSION)
+static inline PyObject* PyFrame_GetVar(PyFrameObject *frame, PyObject *name)
+{
+    PyObject *locals, *value;
+
+    locals = PyFrame_GetLocals(frame);
+    if (locals == NULL) {
+        return NULL;
+    }
+#if PY_VERSION_HEX >= 0x03000000
+    value = PyDict_GetItemWithError(locals, name);
+#else
+    value = _PyDict_GetItemWithError(locals, name);
+#endif
+    Py_DECREF(locals);
+
+    if (value == NULL) {
+        if (PyErr_Occurred()) {
+            return NULL;
+        }
+#if PY_VERSION_HEX >= 0x03000000
+        PyErr_Format(PyExc_NameError, "variable %R does not exist", name);
+#else
+        PyErr_SetString(PyExc_NameError, "variable does not exist");
+#endif
+        return NULL;
+    }
+    return Py_NewRef(value);
+}
+#endif
+
+
+// gh-91248 added PyFrame_GetVarString() to Python 3.12.0a2
+#if PY_VERSION_HEX < 0x030C00A2 && !defined(PYPY_VERSION)
+static inline PyObject*
+PyFrame_GetVarString(PyFrameObject *frame, const char *name)
+{
+    PyObject *name_obj, *value;
+#if PY_VERSION_HEX >= 0x03000000
+    name_obj = PyUnicode_FromString(name);
+#else
+    name_obj = PyString_FromString(name);
+#endif
+    if (name_obj == NULL) {
+        return NULL;
+    }
+    value = PyFrame_GetVar(frame, name_obj);
+    Py_DECREF(name_obj);
+    return value;
+}
+#endif
+
+
+// bpo-39947 added PyThreadState_GetInterpreter() to Python 3.9.0a5
+#if PY_VERSION_HEX < 0x030900A5 || defined(PYPY_VERSION)
+static inline PyInterpreterState *
+PyThreadState_GetInterpreter(PyThreadState *tstate)
+{
+    assert(tstate != _Py_NULL);
+    return tstate->interp;
+}
+#endif
+
+
+// bpo-40429 added PyThreadState_GetFrame() to Python 3.9.0b1
+#if PY_VERSION_HEX < 0x030900B1 && !defined(PYPY_VERSION)
+static inline PyFrameObject* PyThreadState_GetFrame(PyThreadState *tstate)
+{
+    assert(tstate != _Py_NULL);
+    return _Py_CAST(PyFrameObject *, Py_XNewRef(tstate->frame));
+}
+#endif
+
+#if !defined(PYPY_VERSION)
+static inline PyFrameObject*
+_PyThreadState_GetFrameBorrow(PyThreadState *tstate)
+{
+    PyFrameObject *frame = PyThreadState_GetFrame(tstate);
+    Py_XDECREF(frame);
+    return frame;
+}
+#endif
+
+
+// bpo-39947 added PyInterpreterState_Get() to Python 3.9.0a5
+#if PY_VERSION_HEX < 0x030900A5 || defined(PYPY_VERSION)
+static inline PyInterpreterState* PyInterpreterState_Get(void)
+{
+    PyThreadState *tstate;
+    PyInterpreterState *interp;
+
+    tstate = PyThreadState_GET();
+    if (tstate == _Py_NULL) {
+        Py_FatalError("GIL released (tstate is NULL)");
+    }
+    interp = tstate->interp;
+    if (interp == _Py_NULL) {
+        Py_FatalError("no current interpreter");
+    }
+    return interp;
+}
+#endif
+
+
+// bpo-39947 added PyInterpreterState_Get() to Python 3.9.0a6
+#if 0x030700A1 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x030900A6 && !defined(PYPY_VERSION)
+static inline uint64_t PyThreadState_GetID(PyThreadState *tstate)
+{
+    assert(tstate != _Py_NULL);
+    return tstate->id;
+}
+#endif
+
+// bpo-43760 added PyThreadState_EnterTracing() to Python 3.11.0a2
+#if PY_VERSION_HEX < 0x030B00A2 && !defined(PYPY_VERSION)
+static inline void PyThreadState_EnterTracing(PyThreadState *tstate)
+{
+    tstate->tracing++;
+#if PY_VERSION_HEX >= 0x030A00A1
+    tstate->cframe->use_tracing = 0;
+#else
+    tstate->use_tracing = 0;
+#endif
+}
+#endif
+
+// bpo-43760 added PyThreadState_LeaveTracing() to Python 3.11.0a2
+#if PY_VERSION_HEX < 0x030B00A2 && !defined(PYPY_VERSION)
+static inline void PyThreadState_LeaveTracing(PyThreadState *tstate)
+{
+    int use_tracing = (tstate->c_tracefunc != _Py_NULL
+                       || tstate->c_profilefunc != _Py_NULL);
+    tstate->tracing--;
+#if PY_VERSION_HEX >= 0x030A00A1
+    tstate->cframe->use_tracing = use_tracing;
+#else
+    tstate->use_tracing = use_tracing;
+#endif
+}
+#endif
+
+
+// bpo-37194 added PyObject_CallNoArgs() to Python 3.9.0a1
+// PyObject_CallNoArgs() added to PyPy 3.9.16-v7.3.11
+#if !defined(PyObject_CallNoArgs) && PY_VERSION_HEX < 0x030900A1
+static inline PyObject* PyObject_CallNoArgs(PyObject *func)
+{
+    return PyObject_CallFunctionObjArgs(func, NULL);
+}
+#endif
+
+
+// bpo-39245 made PyObject_CallOneArg() public (previously called
+// _PyObject_CallOneArg) in Python 3.9.0a4
+// PyObject_CallOneArg() added to PyPy 3.9.16-v7.3.11
+#if !defined(PyObject_CallOneArg) && PY_VERSION_HEX < 0x030900A4
+static inline PyObject* PyObject_CallOneArg(PyObject *func, PyObject *arg)
+{
+    return PyObject_CallFunctionObjArgs(func, arg, NULL);
+}
+#endif
+
+
+// bpo-1635741 added PyModule_AddObjectRef() to Python 3.10.0a3
+#if PY_VERSION_HEX < 0x030A00A3
+static inline int
+PyModule_AddObjectRef(PyObject *module, const char *name, PyObject *value)
+{
+    int res;
+
+    if (!value && !PyErr_Occurred()) {
+        // PyModule_AddObject() raises TypeError in this case
+        PyErr_SetString(PyExc_SystemError,
+                        "PyModule_AddObjectRef() must be called "
+                        "with an exception raised if value is NULL");
+        return -1;
+    }
+
+    Py_XINCREF(value);
+    res = PyModule_AddObject(module, name, value);
+    if (res < 0) {
+        Py_XDECREF(value);
+    }
+    return res;
+}
+#endif
+
+
+// bpo-40024 added PyModule_AddType() to Python 3.9.0a5
+#if PY_VERSION_HEX < 0x030900A5
+static inline int PyModule_AddType(PyObject *module, PyTypeObject *type)
+{
+    const char *name, *dot;
+
+    if (PyType_Ready(type) < 0) {
+        return -1;
+    }
+
+    // inline _PyType_Name()
+    name = type->tp_name;
+    assert(name != _Py_NULL);
+    dot = strrchr(name, '.');
+    if (dot != _Py_NULL) {
+        name = dot + 1;
+    }
+
+    return PyModule_AddObjectRef(module, name, _PyObject_CAST(type));
+}
+#endif
+
+
+// bpo-40241 added PyObject_GC_IsTracked() to Python 3.9.0a6.
+// bpo-4688 added _PyObject_GC_IS_TRACKED() to Python 2.7.0a2.
+#if PY_VERSION_HEX < 0x030900A6 && !defined(PYPY_VERSION)
+static inline int PyObject_GC_IsTracked(PyObject* obj)
+{
+    return (PyObject_IS_GC(obj) && _PyObject_GC_IS_TRACKED(obj));
+}
+#endif
+
+// bpo-40241 added PyObject_GC_IsFinalized() to Python 3.9.0a6.
+// bpo-18112 added _PyGCHead_FINALIZED() to Python 3.4.0 final.
+#if PY_VERSION_HEX < 0x030900A6 && PY_VERSION_HEX >= 0x030400F0 && !defined(PYPY_VERSION)
+static inline int PyObject_GC_IsFinalized(PyObject *obj)
+{
+    PyGC_Head *gc = _Py_CAST(PyGC_Head*, obj) - 1;
+    return (PyObject_IS_GC(obj) && _PyGCHead_FINALIZED(gc));
+}
+#endif
+
+
+// bpo-39573 added Py_IS_TYPE() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4 && !defined(Py_IS_TYPE)
+static inline int _Py_IS_TYPE(PyObject *ob, PyTypeObject *type) {
+    return Py_TYPE(ob) == type;
+}
+#define Py_IS_TYPE(ob, type) _Py_IS_TYPE(_PyObject_CAST(ob), type)
+#endif
+
+
+// bpo-46906 added PyFloat_Pack2() and PyFloat_Unpack2() to Python 3.11a7.
+// bpo-11734 added _PyFloat_Pack2() and _PyFloat_Unpack2() to Python 3.6.0b1.
+// Python 3.11a2 moved _PyFloat_Pack2() and _PyFloat_Unpack2() to the internal
+// C API: Python 3.11a2-3.11a6 versions are not supported.
+#if 0x030600B1 <= PY_VERSION_HEX && PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+static inline int PyFloat_Pack2(double x, char *p, int le)
+{ return _PyFloat_Pack2(x, (unsigned char*)p, le); }
+
+static inline double PyFloat_Unpack2(const char *p, int le)
+{ return _PyFloat_Unpack2((const unsigned char *)p, le); }
+#endif
+
+
+// bpo-46906 added PyFloat_Pack4(), PyFloat_Pack8(), PyFloat_Unpack4() and
+// PyFloat_Unpack8() to Python 3.11a7.
+// Python 3.11a2 moved _PyFloat_Pack4(), _PyFloat_Pack8(), _PyFloat_Unpack4()
+// and _PyFloat_Unpack8() to the internal C API: Python 3.11a2-3.11a6 versions
+// are not supported.
+#if PY_VERSION_HEX <= 0x030B00A1 && !defined(PYPY_VERSION)
+static inline int PyFloat_Pack4(double x, char *p, int le)
+{ return _PyFloat_Pack4(x, (unsigned char*)p, le); }
+
+static inline int PyFloat_Pack8(double x, char *p, int le)
+{ return _PyFloat_Pack8(x, (unsigned char*)p, le); }
+
+static inline double PyFloat_Unpack4(const char *p, int le)
+{ return _PyFloat_Unpack4((const unsigned char *)p, le); }
+
+static inline double PyFloat_Unpack8(const char *p, int le)
+{ return _PyFloat_Unpack8((const unsigned char *)p, le); }
+#endif
+
+
+// gh-92154 added PyCode_GetCode() to Python 3.11.0b1
+#if PY_VERSION_HEX < 0x030B00B1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetCode(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_code);
+}
+#endif
+
+
+// gh-95008 added PyCode_GetVarnames() to Python 3.11.0rc1
+#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetVarnames(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_varnames);
+}
+#endif
+
+// gh-95008 added PyCode_GetFreevars() to Python 3.11.0rc1
+#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetFreevars(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_freevars);
+}
+#endif
+
+// gh-95008 added PyCode_GetCellvars() to Python 3.11.0rc1
+#if PY_VERSION_HEX < 0x030B00C1 && !defined(PYPY_VERSION)
+static inline PyObject* PyCode_GetCellvars(PyCodeObject *code)
+{
+    return Py_NewRef(code->co_cellvars);
+}
+#endif
+
+
+// Py_UNUSED() was added to Python 3.4.0b2.
+#if PY_VERSION_HEX < 0x030400B2 && !defined(Py_UNUSED)
+#  if defined(__GNUC__) || defined(__clang__)
+#    define Py_UNUSED(name) _unused_ ## name __attribute__((unused))
+#  else
+#    define Py_UNUSED(name) _unused_ ## name
+#  endif
+#endif
+
+
+// gh-105922 added PyImport_AddModuleRef() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A0
+static inline PyObject* PyImport_AddModuleRef(const char *name)
+{
+    return Py_XNewRef(PyImport_AddModule(name));
+}
+#endif
+
+
+// gh-105927 added PyWeakref_GetRef() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D0000
+static inline int PyWeakref_GetRef(PyObject *ref, PyObject **pobj)
+{
+    PyObject *obj;
+    if (ref != NULL && !PyWeakref_Check(ref)) {
+        *pobj = NULL;
+        PyErr_SetString(PyExc_TypeError, "expected a weakref");
+        return -1;
+    }
+    obj = PyWeakref_GetObject(ref);
+    if (obj == NULL) {
+        // SystemError if ref is NULL
+        *pobj = NULL;
+        return -1;
+    }
+    if (obj == Py_None) {
+        *pobj = NULL;
+        return 0;
+    }
+    *pobj = Py_NewRef(obj);
+    return (*pobj != NULL);
+}
+#endif
+
+
+// bpo-36974 added PY_VECTORCALL_ARGUMENTS_OFFSET to Python 3.8b1
+#ifndef PY_VECTORCALL_ARGUMENTS_OFFSET
+#  define PY_VECTORCALL_ARGUMENTS_OFFSET (_Py_CAST(size_t, 1) << (8 * sizeof(size_t) - 1))
+#endif
+
+// bpo-36974 added PyVectorcall_NARGS() to Python 3.8b1
+#if PY_VERSION_HEX < 0x030800B1
+static inline Py_ssize_t PyVectorcall_NARGS(size_t n)
+{
+    return n & ~PY_VECTORCALL_ARGUMENTS_OFFSET;
+}
+#endif
+
+
+// gh-105922 added PyObject_Vectorcall() to Python 3.9.0a4
+#if PY_VERSION_HEX < 0x030900A4
+static inline PyObject*
+PyObject_Vectorcall(PyObject *callable, PyObject *const *args,
+                     size_t nargsf, PyObject *kwnames)
+{
+#if PY_VERSION_HEX >= 0x030800B1 && !defined(PYPY_VERSION)
+    // bpo-36974 added _PyObject_Vectorcall() to Python 3.8.0b1
+    return _PyObject_Vectorcall(callable, args, nargsf, kwnames);
+#else
+    PyObject *posargs = NULL, *kwargs = NULL;
+    PyObject *res;
+    Py_ssize_t nposargs, nkwargs, i;
+
+    if (nargsf != 0 && args == NULL) {
+        PyErr_BadInternalCall();
+        goto error;
+    }
+    if (kwnames != NULL && !PyTuple_Check(kwnames)) {
+        PyErr_BadInternalCall();
+        goto error;
+    }
+
+    nposargs = (Py_ssize_t)PyVectorcall_NARGS(nargsf);
+    if (kwnames) {
+        nkwargs = PyTuple_GET_SIZE(kwnames);
+    }
+    else {
+        nkwargs = 0;
+    }
+
+    posargs = PyTuple_New(nposargs);
+    if (posargs == NULL) {
+        goto error;
+    }
+    if (nposargs) {
+        for (i=0; i < nposargs; i++) {
+            PyTuple_SET_ITEM(posargs, i, Py_NewRef(*args));
+            args++;
+        }
+    }
+
+    if (nkwargs) {
+        kwargs = PyDict_New();
+        if (kwargs == NULL) {
+            goto error;
+        }
+
+        for (i = 0; i < nkwargs; i++) {
+            PyObject *key = PyTuple_GET_ITEM(kwnames, i);
+            PyObject *value = *args;
+            args++;
+            if (PyDict_SetItem(kwargs, key, value) < 0) {
+                goto error;
+            }
+        }
+    }
+    else {
+        kwargs = NULL;
+    }
+
+    res = PyObject_Call(callable, posargs, kwargs);
+    Py_DECREF(posargs);
+    Py_XDECREF(kwargs);
+    return res;
+
+error:
+    Py_DECREF(posargs);
+    Py_XDECREF(kwargs);
+    return NULL;
+#endif
+}
+#endif
+
+
+// gh-106521 added PyObject_GetOptionalAttr() and
+// PyObject_GetOptionalAttrString() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyObject_GetOptionalAttr(PyObject *obj, PyObject *attr_name, PyObject **result)
+{
+    // bpo-32571 added _PyObject_LookupAttr() to Python 3.7.0b1
+#if PY_VERSION_HEX >= 0x030700B1 && !defined(PYPY_VERSION)
+    return _PyObject_LookupAttr(obj, attr_name, result);
+#else
+    *result = PyObject_GetAttr(obj, attr_name);
+    if (*result != NULL) {
+        return 1;
+    }
+    if (!PyErr_Occurred()) {
+        return 0;
+    }
+    if (PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        PyErr_Clear();
+        return 0;
+    }
+    return -1;
+#endif
+}
+
+static inline int
+PyObject_GetOptionalAttrString(PyObject *obj, const char *attr_name, PyObject **result)
+{
+    PyObject *name_obj;
+    int rc;
+#if PY_VERSION_HEX >= 0x03000000
+    name_obj = PyUnicode_FromString(attr_name);
+#else
+    name_obj = PyString_FromString(attr_name);
+#endif
+    if (name_obj == NULL) {
+        *result = NULL;
+        return -1;
+    }
+    rc = PyObject_GetOptionalAttr(obj, name_obj, result);
+    Py_DECREF(name_obj);
+    return rc;
+}
+#endif
+
+
+// gh-106307 added PyObject_GetOptionalAttr() and
+// PyMapping_GetOptionalItemString() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyMapping_GetOptionalItem(PyObject *obj, PyObject *key, PyObject **result)
+{
+    *result = PyObject_GetItem(obj, key);
+    if (*result) {
+        return 1;
+    }
+    if (!PyErr_ExceptionMatches(PyExc_KeyError)) {
+        return -1;
+    }
+    PyErr_Clear();
+    return 0;
+}
+
+static inline int
+PyMapping_GetOptionalItemString(PyObject *obj, const char *key, PyObject **result)
+{
+    PyObject *key_obj;
+    int rc;
+#if PY_VERSION_HEX >= 0x03000000
+    key_obj = PyUnicode_FromString(key);
+#else
+    key_obj = PyString_FromString(key);
+#endif
+    if (key_obj == NULL) {
+        *result = NULL;
+        return -1;
+    }
+    rc = PyMapping_GetOptionalItem(obj, key_obj, result);
+    Py_DECREF(key_obj);
+    return rc;
+}
+#endif
+
+// gh-108511 added PyMapping_HasKeyWithError() and
+// PyMapping_HasKeyStringWithError() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyMapping_HasKeyWithError(PyObject *obj, PyObject *key)
+{
+    PyObject *res;
+    int rc = PyMapping_GetOptionalItem(obj, key, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+
+static inline int
+PyMapping_HasKeyStringWithError(PyObject *obj, const char *key)
+{
+    PyObject *res;
+    int rc = PyMapping_GetOptionalItemString(obj, key, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+#endif
+
+
+// gh-108511 added PyObject_HasAttrWithError() and
+// PyObject_HasAttrStringWithError() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyObject_HasAttrWithError(PyObject *obj, PyObject *attr)
+{
+    PyObject *res;
+    int rc = PyObject_GetOptionalAttr(obj, attr, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+
+static inline int
+PyObject_HasAttrStringWithError(PyObject *obj, const char *attr)
+{
+    PyObject *res;
+    int rc = PyObject_GetOptionalAttrString(obj, attr, &res);
+    Py_XDECREF(res);
+    return rc;
+}
+#endif
+
+
+// gh-106004 added PyDict_GetItemRef() and PyDict_GetItemStringRef()
+// to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyDict_GetItemRef(PyObject *mp, PyObject *key, PyObject **result)
+{
+#if PY_VERSION_HEX >= 0x03000000
+    PyObject *item = PyDict_GetItemWithError(mp, key);
+#else
+    PyObject *item = _PyDict_GetItemWithError(mp, key);
+#endif
+    if (item != NULL) {
+        *result = Py_NewRef(item);
+        return 1;  // found
+    }
+    if (!PyErr_Occurred()) {
+        *result = NULL;
+        return 0;  // not found
+    }
+    *result = NULL;
+    return -1;
+}
+
+static inline int
+PyDict_GetItemStringRef(PyObject *mp, const char *key, PyObject **result)
+{
+    int res;
+#if PY_VERSION_HEX >= 0x03000000
+    PyObject *key_obj = PyUnicode_FromString(key);
+#else
+    PyObject *key_obj = PyString_FromString(key);
+#endif
+    if (key_obj == NULL) {
+        *result = NULL;
+        return -1;
+    }
+    res = PyDict_GetItemRef(mp, key_obj, result);
+    Py_DECREF(key_obj);
+    return res;
+}
+#endif
+
+
+// gh-106307 added PyModule_Add() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyModule_Add(PyObject *mod, const char *name, PyObject *value)
+{
+    int res = PyModule_AddObjectRef(mod, name, value);
+    Py_XDECREF(value);
+    return res;
+}
+#endif
+
+
+// gh-108014 added Py_IsFinalizing() to Python 3.13.0a1
+// bpo-1856 added _Py_Finalizing to Python 3.2.1b1.
+// _Py_IsFinalizing() was added to PyPy 7.3.0.
+#if (0x030201B1 <= PY_VERSION_HEX && PY_VERSION_HEX < 0x030D00A1) \
+        && (!defined(PYPY_VERSION_NUM) || PYPY_VERSION_NUM >= 0x7030000)
+static inline int Py_IsFinalizing(void)
+{
+#if PY_VERSION_HEX >= 0x030700A1
+    // _Py_IsFinalizing() was added to Python 3.7.0a1.
+    return _Py_IsFinalizing();
+#else
+    return (_Py_Finalizing != NULL);
+#endif
+}
+#endif
+
+
+// gh-108323 added PyDict_ContainsString() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int PyDict_ContainsString(PyObject *op, const char *key)
+{
+    PyObject *key_obj = PyUnicode_FromString(key);
+    if (key_obj == NULL) {
+        return -1;
+    }
+    int res = PyDict_Contains(op, key_obj);
+    Py_DECREF(key_obj);
+    return res;
+}
+#endif
+
+
+// gh-108445 added PyLong_AsInt() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int PyLong_AsInt(PyObject *obj)
+{
+#ifdef PYPY_VERSION
+    long value = PyLong_AsLong(obj);
+    if (value == -1 && PyErr_Occurred()) {
+        return -1;
+    }
+    if (value < (long)INT_MIN || (long)INT_MAX < value) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "Python int too large to convert to C int");
+        return -1;
+    }
+    return (int)value;
+#else
+    return _PyLong_AsInt(obj);
+#endif
+}
+#endif
+
+
+// gh-107073 added PyObject_VisitManagedDict() to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyObject_VisitManagedDict(PyObject *obj, visitproc visit, void *arg)
+{
+    PyObject **dict = _PyObject_GetDictPtr(obj);
+    if (*dict == NULL) {
+        return -1;
+    }
+    Py_VISIT(*dict);
+    return 0;
+}
+
+static inline void
+PyObject_ClearManagedDict(PyObject *obj)
+{
+    PyObject **dict = _PyObject_GetDictPtr(obj);
+    if (*dict == NULL) {
+        return;
+    }
+    Py_CLEAR(*dict);
+}
+#endif
+
+// gh-108867 added PyThreadState_GetUnchecked() to Python 3.13.0a1
+// Python 3.5.2 added _PyThreadState_UncheckedGet().
+#if PY_VERSION_HEX >= 0x03050200 && PY_VERSION_HEX < 0x030D00A1
+static inline PyThreadState*
+PyThreadState_GetUnchecked(void)
+{
+    return _PyThreadState_UncheckedGet();
+}
+#endif
+
+// gh-110289 added PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize()
+// to Python 3.13.0a1
+#if PY_VERSION_HEX < 0x030D00A1
+static inline int
+PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t str_len)
+{
+    Py_ssize_t len;
+    const void *utf8;
+    PyObject *exc_type, *exc_value, *exc_tb;
+    int res;
+
+    // API cannot report errors so save/restore the exception
+    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+
+    // Python 3.3.0a1 added PyUnicode_AsUTF8AndSize()
+#if PY_VERSION_HEX >= 0x030300A1
+    if (PyUnicode_IS_ASCII(unicode)) {
+        utf8 = PyUnicode_DATA(unicode);
+        len = PyUnicode_GET_LENGTH(unicode);
+    }
+    else {
+        utf8 = PyUnicode_AsUTF8AndSize(unicode, &len);
+        if (utf8 == NULL) {
+            // Memory allocation failure. The API cannot report error,
+            // so ignore the exception and return 0.
+            res = 0;
+            goto done;
+        }
+    }
+
+    if (len != str_len) {
+        res = 0;
+        goto done;
+    }
+    res = (memcmp(utf8, str, (size_t)len) == 0);
+#else
+    PyObject *bytes = PyUnicode_AsUTF8String(unicode);
+    if (bytes == NULL) {
+        // Memory allocation failure. The API cannot report error,
+        // so ignore the exception and return 0.
+        res = 0;
+        goto done;
+    }
+
+#if PY_VERSION_HEX >= 0x03000000
+    len = PyBytes_GET_SIZE(bytes);
+    utf8 = PyBytes_AS_STRING(bytes);
+#else
+    len = PyString_GET_SIZE(bytes);
+    utf8 = PyString_AS_STRING(bytes);
+#endif
+    if (len != str_len) {
+        Py_DECREF(bytes);
+        res = 0;
+        goto done;
+    }
+
+    res = (memcmp(utf8, str, (size_t)len) == 0);
+    Py_DECREF(bytes);
+#endif
+
+done:
+    PyErr_Restore(exc_type, exc_value, exc_tb);
+    return res;
+}
+
+static inline int
+PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
+{
+    return PyUnicode_EqualToUTF8AndSize(unicode, str, (Py_ssize_t)strlen(str));
+}
+#endif
+
+
+// gh-111138 added PyList_Extend() and PyList_Clear() to Python 3.13.0a2
+#if PY_VERSION_HEX < 0x030D00A2
+static inline int
+PyList_Extend(PyObject *list, PyObject *iterable)
+{
+    return PyList_SetSlice(list, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, iterable);
+}
+
+static inline int
+PyList_Clear(PyObject *list)
+{
+    return PyList_SetSlice(list, 0, PY_SSIZE_T_MAX, NULL);
+}
+#endif
+
+// gh-111262 added PyDict_Pop() and PyDict_PopString() to Python 3.13.0a2
+#if PY_VERSION_HEX < 0x030D00A2
+static inline int
+PyDict_Pop(PyObject *dict, PyObject *key, PyObject **result)
+{
+    PyObject *value;
+
+    if (!PyDict_Check(dict)) {
+        PyErr_BadInternalCall();
+        if (result) {
+            *result = NULL;
+        }
+        return -1;
+    }
+
+    // bpo-16991 added _PyDict_Pop() to Python 3.5.0b2.
+    // Python 3.6.0b3 changed _PyDict_Pop() first argument type to PyObject*.
+    // Python 3.13.0a1 removed _PyDict_Pop().
+#if defined(PYPY_VERSION) || PY_VERSION_HEX < 0x030500b2 || PY_VERSION_HEX >= 0x030D0000
+    value = PyObject_CallMethod(dict, "pop", "O", key);
+#elif PY_VERSION_HEX < 0x030600b3
+    value = _PyDict_Pop(_Py_CAST(PyDictObject*, dict), key, NULL);
+#else
+    value = _PyDict_Pop(dict, key, NULL);
+#endif
+    if (value == NULL) {
+        if (result) {
+            *result = NULL;
+        }
+        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_KeyError)) {
+            return -1;
+        }
+        PyErr_Clear();
+        return 0;
+    }
+    if (result) {
+        *result = value;
+    }
+    else {
+        Py_DECREF(value);
+    }
+    return 1;
+}
+
+static inline int
+PyDict_PopString(PyObject *dict, const char *key, PyObject **result)
+{
+    PyObject *key_obj = PyUnicode_FromString(key);
+    if (key_obj == NULL) {
+        if (result != NULL) {
+            *result = NULL;
+        }
+        return -1;
+    }
+
+    int res = PyDict_Pop(dict, key_obj, result);
+    Py_DECREF(key_obj);
+    return res;
+}
+#endif
+
+
+#if PY_VERSION_HEX < 0x030200A4
+// Python 3.2.0a4 added Py_hash_t type
+typedef Py_ssize_t Py_hash_t;
+#endif
+
+
+// gh-111545 added Py_HashPointer() to Python 3.13.0a3
+#if PY_VERSION_HEX < 0x030D00A3
+static inline Py_hash_t Py_HashPointer(const void *ptr)
+{
+#if PY_VERSION_HEX >= 0x030900A4 && !defined(PYPY_VERSION)
+    return _Py_HashPointer(ptr);
+#else
+    return _Py_HashPointer(_Py_CAST(void*, ptr));
+#endif
+}
+#endif
+
+
+// Python 3.13a4 added a PyTime API.
+// Use the private API added to Python 3.5.
+#if PY_VERSION_HEX < 0x030D00A4 && PY_VERSION_HEX  >= 0x03050000
+typedef _PyTime_t PyTime_t;
+#define PyTime_MIN _PyTime_MIN
+#define PyTime_MAX _PyTime_MAX
+
+static inline double PyTime_AsSecondsDouble(PyTime_t t)
+{ return _PyTime_AsSecondsDouble(t); }
+
+static inline int PyTime_Monotonic(PyTime_t *result)
+{ return _PyTime_GetMonotonicClockWithInfo(result, NULL); }
+
+static inline int PyTime_Time(PyTime_t *result)
+{ return _PyTime_GetSystemClockWithInfo(result, NULL); }
+
+static inline int PyTime_PerfCounter(PyTime_t *result)
+{
+#if PY_VERSION_HEX >= 0x03070000 && !defined(PYPY_VERSION)
+    return _PyTime_GetPerfCounterWithInfo(result, NULL);
+#elif PY_VERSION_HEX >= 0x03070000
+    // Call time.perf_counter_ns() and convert Python int object to PyTime_t.
+    // Cache time.perf_counter_ns() function for best performance.
+    static PyObject *func = NULL;
+    if (func == NULL) {
+        PyObject *mod = PyImport_ImportModule("time");
+        if (mod == NULL) {
+            return -1;
+        }
+
+        func = PyObject_GetAttrString(mod, "perf_counter_ns");
+        Py_DECREF(mod);
+        if (func == NULL) {
+            return -1;
+        }
+    }
+
+    PyObject *res = PyObject_CallNoArgs(func);
+    if (res == NULL) {
+        return -1;
+    }
+    long long value = PyLong_AsLongLong(res);
+    Py_DECREF(res);
+
+    if (value == -1 && PyErr_Occurred()) {
+        return -1;
+    }
+
+    Py_BUILD_ASSERT(sizeof(value) >= sizeof(PyTime_t));
+    *result = (PyTime_t)value;
+    return 0;
+#else
+    // Call time.perf_counter() and convert C double to PyTime_t.
+    // Cache time.perf_counter() function for best performance.
+    static PyObject *func = NULL;
+    if (func == NULL) {
+        PyObject *mod = PyImport_ImportModule("time");
+        if (mod == NULL) {
+            return -1;
+        }
+
+        func = PyObject_GetAttrString(mod, "perf_counter");
+        Py_DECREF(mod);
+        if (func == NULL) {
+            return -1;
+        }
+    }
+
+    PyObject *res = PyObject_CallNoArgs(func);
+    if (res == NULL) {
+        return -1;
+    }
+    double d = PyFloat_AsDouble(res);
+    Py_DECREF(res);
+
+    if (d == -1.0 && PyErr_Occurred()) {
+        return -1;
+    }
+
+    // Avoid floor() to avoid having to link to libm
+    *result = (PyTime_t)(d * 1e9);
+    return 0;
+#endif
+}
+
+#endif
+
+// gh-111389 added hash constants to Python 3.13.0a5. These constants were
+// added first as private macros to Python 3.4.0b1 and PyPy 7.3.9.
+#if (!defined(PyHASH_BITS) \
+     && ((!defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x030400B1) \
+         || (defined(PYPY_VERSION) && PY_VERSION_HEX >= 0x03070000 \
+             && PYPY_VERSION_NUM >= 0x07090000)))
+#  define PyHASH_BITS _PyHASH_BITS
+#  define PyHASH_MODULUS _PyHASH_MODULUS
+#  define PyHASH_INF _PyHASH_INF
+#  define PyHASH_IMAG _PyHASH_IMAG
+#endif
+
+
+// gh-111545 added Py_GetConstant() and Py_GetConstantBorrowed()
+// to Python 3.13.0a6
+#if PY_VERSION_HEX < 0x030D00A6 && !defined(Py_CONSTANT_NONE)
+
+#define Py_CONSTANT_NONE 0
+#define Py_CONSTANT_FALSE 1
+#define Py_CONSTANT_TRUE 2
+#define Py_CONSTANT_ELLIPSIS 3
+#define Py_CONSTANT_NOT_IMPLEMENTED 4
+#define Py_CONSTANT_ZERO 5
+#define Py_CONSTANT_ONE 6
+#define Py_CONSTANT_EMPTY_STR 7
+#define Py_CONSTANT_EMPTY_BYTES 8
+#define Py_CONSTANT_EMPTY_TUPLE 9
+
+static inline PyObject* Py_GetConstant(unsigned int constant_id)
+{
+    static PyObject* constants[Py_CONSTANT_EMPTY_TUPLE + 1] = {NULL};
+
+    if (constants[Py_CONSTANT_NONE] == NULL) {
+        constants[Py_CONSTANT_NONE] = Py_None;
+        constants[Py_CONSTANT_FALSE] = Py_False;
+        constants[Py_CONSTANT_TRUE] = Py_True;
+        constants[Py_CONSTANT_ELLIPSIS] = Py_Ellipsis;
+        constants[Py_CONSTANT_NOT_IMPLEMENTED] = Py_NotImplemented;
+
+        constants[Py_CONSTANT_ZERO] = PyLong_FromLong(0);
+        if (constants[Py_CONSTANT_ZERO] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_ONE] = PyLong_FromLong(1);
+        if (constants[Py_CONSTANT_ONE] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_EMPTY_STR] = PyUnicode_FromStringAndSize("", 0);
+        if (constants[Py_CONSTANT_EMPTY_STR] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_EMPTY_BYTES] = PyBytes_FromStringAndSize("", 0);
+        if (constants[Py_CONSTANT_EMPTY_BYTES] == NULL) {
+            goto fatal_error;
+        }
+
+        constants[Py_CONSTANT_EMPTY_TUPLE] = PyTuple_New(0);
+        if (constants[Py_CONSTANT_EMPTY_TUPLE] == NULL) {
+            goto fatal_error;
+        }
+        // goto dance to avoid compiler warnings about Py_FatalError()
+        goto init_done;
+
+fatal_error:
+        // This case should never happen
+        Py_FatalError("Py_GetConstant() failed to get constants");
+    }
+
+init_done:
+    if (constant_id <= Py_CONSTANT_EMPTY_TUPLE) {
+        return Py_NewRef(constants[constant_id]);
+    }
+    else {
+        PyErr_BadInternalCall();
+        return NULL;
+    }
+}
+
+static inline PyObject* Py_GetConstantBorrowed(unsigned int constant_id)
+{
+    PyObject *obj = Py_GetConstant(constant_id);
+    Py_XDECREF(obj);
+    return obj;
+}
+#endif
+
+
+// gh-114329 added PyList_GetItemRef() to Python 3.13.0a4
+#if PY_VERSION_HEX < 0x030D00A4
+static inline PyObject *
+PyList_GetItemRef(PyObject *op, Py_ssize_t index)
+{
+    PyObject *item = PyList_GetItem(op, index);
+    Py_XINCREF(item);
+    return item;
+}
+#endif
+
+
+// gh-114329 added PyList_GetItemRef() to Python 3.13.0a4
+#if PY_VERSION_HEX < 0x030D00A4
+static inline int
+PyDict_SetDefaultRef(PyObject *d, PyObject *key, PyObject *default_value,
+                     PyObject **result)
+{
+    PyObject *value;
+    if (PyDict_GetItemRef(d, key, &value) < 0) {
+        // get error
+        if (result) {
+            *result = NULL;
+        }
+        return -1;
+    }
+    if (value != NULL) {
+        // present
+        if (result) {
+            *result = value;
+        }
+        else {
+            Py_DECREF(value);
+        }
+        return 1;
+    }
+
+    // missing: set the item
+    if (PyDict_SetItem(d, key, default_value) < 0) {
+        // set error
+        if (result) {
+            *result = NULL;
+        }
+        return -1;
+    }
+    if (result) {
+        *result = Py_NewRef(default_value);
+    }
+    return 0;
+}
+#endif
+
+#if PY_VERSION_HEX < 0x030E0000 && PY_VERSION_HEX >= 0x03060000 && !defined(PYPY_VERSION)
+typedef struct PyUnicodeWriter PyUnicodeWriter;
+
+static inline void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
+{
+    _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
+    PyMem_Free(writer);
+}
+
+static inline PyUnicodeWriter* PyUnicodeWriter_Create(Py_ssize_t length)
+{
+    if (length < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "length must be positive");
+        return NULL;
+    }
+
+    const size_t size = sizeof(_PyUnicodeWriter);
+    PyUnicodeWriter *pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
+    if (pub_writer == _Py_NULL) {
+        PyErr_NoMemory();
+        return _Py_NULL;
+    }
+    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
+
+    _PyUnicodeWriter_Init(writer);
+    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
+        PyUnicodeWriter_Discard(pub_writer);
+        return NULL;
+    }
+    writer->overallocate = 1;
+    return pub_writer;
+}
+
+static inline PyObject* PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
+{
+    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
+    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
+    PyMem_Free(writer);
+    return str;
+}
+
+static inline int
+PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
+{
+    if (ch > 0x10ffff) {
+        PyErr_SetString(PyExc_ValueError,
+                        "character must be in range(0x110000)");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
+}
+
+static inline int
+PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
+{
+    PyObject *str = PyObject_Str(obj);
+    if (str == NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
+    Py_DECREF(str);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
+{
+    PyObject *str = PyObject_Repr(obj);
+    if (str == NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
+    Py_DECREF(str);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
+                          const char *str, Py_ssize_t size)
+{
+    if (size < 0) {
+        size = (Py_ssize_t)strlen(str);
+    }
+
+    PyObject *str_obj = PyUnicode_FromStringAndSize(str, size);
+    if (str_obj == _Py_NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj);
+    Py_DECREF(str_obj);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
+                              const wchar_t *str, Py_ssize_t size)
+{
+    if (size < 0) {
+        size = (Py_ssize_t)wcslen(str);
+    }
+
+    PyObject *str_obj = PyUnicode_FromWideChar(str, size);
+    if (str_obj == _Py_NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str_obj);
+    Py_DECREF(str_obj);
+    return res;
+}
+
+static inline int
+PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
+                               Py_ssize_t start, Py_ssize_t end)
+{
+    if (!PyUnicode_Check(str)) {
+        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
+        return -1;
+    }
+    if (start < 0 || start > end) {
+        PyErr_Format(PyExc_ValueError, "invalid start argument");
+        return -1;
+    }
+    if (end > PyUnicode_GET_LENGTH(str)) {
+        PyErr_Format(PyExc_ValueError, "invalid end argument");
+        return -1;
+    }
+
+    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
+                                           start, end);
+}
+
+static inline int
+PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
+{
+    va_list vargs;
+    va_start(vargs, format);
+    PyObject *str = PyUnicode_FromFormatV(format, vargs);
+    va_end(vargs);
+    if (str == _Py_NULL) {
+        return -1;
+    }
+
+    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
+    Py_DECREF(str);
+    return res;
+}
+#endif  // PY_VERSION_HEX < 0x030E0000
+
+// gh-116560 added PyLong_GetSign() to Python 3.14.0a0
+#if PY_VERSION_HEX < 0x030E00A0
+static inline int PyLong_GetSign(PyObject *obj, int *sign)
+{
+    if (!PyLong_Check(obj)) {
+        PyErr_Format(PyExc_TypeError, "expect int, got %s", Py_TYPE(obj)->tp_name);
+        return -1;
+    }
+
+    *sign = _PyLong_Sign(obj);
+    return 0;
+}
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // PYTHONCAPI_COMPAT
diff --git a/pyarrow/src/arrow/python/visibility.h b/pyarrow/src/arrow/python/visibility.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9680a06bf016478232f3914d3728bfd9ebffd
--- /dev/null
+++ b/pyarrow/src/arrow/python/visibility.h
@@ -0,0 +1,39 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#  if defined(_MSC_VER)
+#    pragma warning(disable : 4251)
+#  else
+#    pragma GCC diagnostic ignored "-Wattributes"
+#  endif
+
+#  ifdef ARROW_PYTHON_STATIC
+#    define ARROW_PYTHON_EXPORT
+#  elif defined(ARROW_PYTHON_EXPORTING)
+#    define ARROW_PYTHON_EXPORT __declspec(dllexport)
+#  else
+#    define ARROW_PYTHON_EXPORT __declspec(dllimport)
+#  endif
+
+#else  // Not Windows
+#  ifndef ARROW_PYTHON_EXPORT
+#    define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
+#  endif
+#endif  // Non-Windows
diff --git a/pyarrow/tests/data/feather/v0.17.0.version.2-compression.lz4.feather b/pyarrow/tests/data/feather/v0.17.0.version.2-compression.lz4.feather
new file mode 100644
index 0000000000000000000000000000000000000000..562b0b2c53d8684fcb2b0417bad36877d92af53e
Binary files /dev/null and b/pyarrow/tests/data/feather/v0.17.0.version.2-compression.lz4.feather differ
diff --git a/pyarrow/tests/data/orc/README.md b/pyarrow/tests/data/orc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c548225155331252e380ef9317af43f105c509d0
--- /dev/null
+++ b/pyarrow/tests/data/orc/README.md
@@ -0,0 +1,22 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+The ORC and JSON files come from the `examples` directory in the Apache ORC
+source tree:
+https://github.com/apache/orc/tree/main/examples
diff --git a/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc b/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc
new file mode 100644
index 0000000000000000000000000000000000000000..ecdadcbff134615d7eefcb740d55fe710cee059b
Binary files /dev/null and b/pyarrow/tests/data/orc/TestOrcFile.emptyFile.orc differ
diff --git a/pyarrow/tests/data/orc/TestOrcFile.test1.orc b/pyarrow/tests/data/orc/TestOrcFile.test1.orc
new file mode 100644
index 0000000000000000000000000000000000000000..4fb0beff868971efb653739fe6ae47a37e4a1c66
Binary files /dev/null and b/pyarrow/tests/data/orc/TestOrcFile.test1.orc differ
diff --git a/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc b/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc
new file mode 100644
index 0000000000000000000000000000000000000000..f51ffdbd03a43fadbedce302ffa8e5967a30ad59
Binary files /dev/null and b/pyarrow/tests/data/orc/TestOrcFile.testDate1900.orc differ
diff --git a/pyarrow/tests/data/orc/decimal.orc b/pyarrow/tests/data/orc/decimal.orc
new file mode 100644
index 0000000000000000000000000000000000000000..cb0f7b9d767a37159c5509da1a47877d1c8b411e
Binary files /dev/null and b/pyarrow/tests/data/orc/decimal.orc differ
diff --git a/pyarrow/tests/interchange/__init__.py b/pyarrow/tests/interchange/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a83393a9124bf6ec36540556b4808abd47e206
--- /dev/null
+++ b/pyarrow/tests/interchange/__init__.py
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
diff --git a/pyarrow/tests/interchange/test_conversion.py b/pyarrow/tests/interchange/test_conversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..50da6693afff19e931fbf3837305fa6262046987
--- /dev/null
+++ b/pyarrow/tests/interchange/test_conversion.py
@@ -0,0 +1,529 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from datetime import datetime as dt
+import pyarrow as pa
+from pyarrow.vendored.version import Version
+import pytest
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+import pyarrow.interchange as pi
+from pyarrow.interchange.column import (
+    _PyArrowColumn,
+    ColumnNullType,
+    DtypeKind,
+)
+from pyarrow.interchange.from_dataframe import _from_dataframe
+
+try:
+    import pandas as pd
+    # import pandas.testing as tm
+except ImportError:
+    pass
+
+
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
+def test_datetime(unit, tz):
+    dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), None]
+    table = pa.table({"A": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))})
+    col = table.__dataframe__().get_column_by_name("A")
+
+    assert col.size() == 3
+    assert col.offset == 0
+    assert col.null_count == 1
+    assert col.dtype[0] == DtypeKind.DATETIME
+    assert col.describe_null == (ColumnNullType.USE_BITMASK, 0)
+
+
+@pytest.mark.parametrize(
+    ["test_data", "kind"],
+    [
+        (["foo", "bar"], 21),
+        ([1.5, 2.5, 3.5], 2),
+        ([1, 2, 3, 4], 0),
+    ],
+)
+def test_array_to_pyarrowcolumn(test_data, kind):
+    arr = pa.array(test_data)
+    arr_column = _PyArrowColumn(arr)
+
+    assert arr_column._col == arr
+    assert arr_column.size() == len(test_data)
+    assert arr_column.dtype[0] == kind
+    assert arr_column.num_chunks() == 1
+    assert arr_column.null_count == 0
+    assert arr_column.get_buffers()["validity"] is None
+    assert len(list(arr_column.get_chunks())) == 1
+
+    for chunk in arr_column.get_chunks():
+        assert chunk == arr_column
+
+
+def test_offset_of_sliced_array():
+    arr = pa.array([1, 2, 3, 4])
+    arr_sliced = arr.slice(2, 2)
+
+    table = pa.table([arr], names=["arr"])
+    table_sliced = pa.table([arr_sliced], names=["arr_sliced"])
+
+    col = table_sliced.__dataframe__().get_column(0)
+    assert col.offset == 2
+
+    result = _from_dataframe(table_sliced.__dataframe__())
+    assert table_sliced.equals(result)
+    assert not table.equals(result)
+
+    # pandas hardcodes offset to 0:
+    # https://github.com/pandas-dev/pandas/blob/5c66e65d7b9fef47ccb585ce2fd0b3ea18dc82ea/pandas/core/interchange/from_dataframe.py#L247
+    # so conversion to pandas can't be tested currently
+
+    # df = pandas_from_dataframe(table)
+    # df_sliced = pandas_from_dataframe(table_sliced)
+
+    # tm.assert_series_equal(df["arr"][2:4], df_sliced["arr_sliced"],
+    #                        check_index=False, check_names=False)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+    "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+    "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+    "float, np_float_str", [
+        # (pa.float16(), np.float16),   #not supported by pandas
+        (pa.float32(), "float32"),
+        (pa.float64(), "float64")
+    ]
+)
+def test_pandas_roundtrip(uint, int, float, np_float_str):
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    arr = [1, 2, 3]
+    table = pa.table(
+        {
+            "a": pa.array(arr, type=uint),
+            "b": pa.array(arr, type=int),
+            "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)), type=float),
+            "d": [True, False, True],
+        }
+    )
+    from pandas.api.interchange import (
+        from_dataframe as pandas_from_dataframe
+    )
+    pandas_df = pandas_from_dataframe(table)
+    result = pi.from_dataframe(pandas_df)
+    assert table.equals(result)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+def test_pandas_roundtrip_string():
+    # See https://github.com/pandas-dev/pandas/issues/50554
+    if Version(pd.__version__) < Version("1.6"):
+        pytest.skip("Column.size() bug in pandas")
+
+    arr = ["a", "", "c"]
+    table = pa.table({"a": pa.array(arr)})
+
+    from pandas.api.interchange import (
+        from_dataframe as pandas_from_dataframe
+    )
+
+    pandas_df = pandas_from_dataframe(table)
+    result = pi.from_dataframe(pandas_df)
+
+    assert result["a"].to_pylist() == table["a"].to_pylist()
+    assert pa.types.is_string(table["a"].type)
+    assert pa.types.is_large_string(result["a"].type)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+def test_pandas_roundtrip_large_string():
+    # See https://github.com/pandas-dev/pandas/issues/50554
+    if Version(pd.__version__) < Version("1.6"):
+        pytest.skip("Column.size() bug in pandas")
+
+    arr = ["a", "", "c"]
+    table = pa.table({"a_large": pa.array(arr, type=pa.large_string())})
+
+    from pandas.api.interchange import (
+        from_dataframe as pandas_from_dataframe
+    )
+
+    if Version(pd.__version__) >= Version("2.0.1"):
+        pandas_df = pandas_from_dataframe(table)
+        result = pi.from_dataframe(pandas_df)
+
+        assert result["a_large"].to_pylist() == table["a_large"].to_pylist()
+        assert pa.types.is_large_string(table["a_large"].type)
+        assert pa.types.is_large_string(result["a_large"].type)
+
+        table_protocol = table.__dataframe__()
+        result_protocol = result.__dataframe__()
+
+        assert table_protocol.num_columns() == result_protocol.num_columns()
+        assert table_protocol.num_rows() == result_protocol.num_rows()
+        assert table_protocol.num_chunks() == result_protocol.num_chunks()
+        assert table_protocol.column_names() == result_protocol.column_names()
+
+    else:
+        # large string not supported by pandas implementation for
+        # older versions of pandas
+        # https://github.com/pandas-dev/pandas/issues/52795
+        with pytest.raises(AssertionError):
+            pandas_from_dataframe(table)
+
+
+@pytest.mark.pandas
+def test_pandas_roundtrip_string_with_missing():
+    # See https://github.com/pandas-dev/pandas/issues/50554
+    if Version(pd.__version__) < Version("1.6"):
+        pytest.skip("Column.size() bug in pandas")
+
+    arr = ["a", "", "c", None]
+    table = pa.table({"a": pa.array(arr),
+                      "a_large": pa.array(arr, type=pa.large_string())})
+
+    from pandas.api.interchange import (
+        from_dataframe as pandas_from_dataframe
+    )
+
+    if Version(pd.__version__) >= Version("2.0.2"):
+        pandas_df = pandas_from_dataframe(table)
+        result = pi.from_dataframe(pandas_df)
+
+        assert result["a"].to_pylist() == table["a"].to_pylist()
+        assert pa.types.is_string(table["a"].type)
+        assert pa.types.is_large_string(result["a"].type)
+
+        assert result["a_large"].to_pylist() == table["a_large"].to_pylist()
+        assert pa.types.is_large_string(table["a_large"].type)
+        assert pa.types.is_large_string(result["a_large"].type)
+    else:
+        # older versions of pandas do not have bitmask support
+        # https://github.com/pandas-dev/pandas/issues/49888
+        with pytest.raises(NotImplementedError):
+            pandas_from_dataframe(table)
+
+
+@pytest.mark.pandas
+def test_pandas_roundtrip_categorical():
+    if Version(pd.__version__) < Version("2.0.2"):
+        pytest.skip("Bitmasks not supported in pandas interchange implementation")
+
+    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+    table = pa.table(
+        {"weekday": pa.array(arr).dictionary_encode()}
+    )
+
+    from pandas.api.interchange import (
+        from_dataframe as pandas_from_dataframe
+    )
+    pandas_df = pandas_from_dataframe(table)
+    result = pi.from_dataframe(pandas_df)
+
+    assert result["weekday"].to_pylist() == table["weekday"].to_pylist()
+    assert pa.types.is_dictionary(table["weekday"].type)
+    assert pa.types.is_dictionary(result["weekday"].type)
+    assert pa.types.is_string(table["weekday"].chunk(0).dictionary.type)
+    assert pa.types.is_large_string(result["weekday"].chunk(0).dictionary.type)
+    assert pa.types.is_int32(table["weekday"].chunk(0).indices.type)
+    assert pa.types.is_int8(result["weekday"].chunk(0).indices.type)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+    col_table = table_protocol.get_column(0)
+    col_result = result_protocol.get_column(0)
+
+    assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+    assert col_result.dtype[0] == col_table.dtype[0]
+    assert col_result.size() == col_table.size()
+    assert col_result.offset == col_table.offset
+
+    desc_cat_table = col_result.describe_categorical
+    desc_cat_result = col_result.describe_categorical
+
+    assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+    assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+    assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+def test_pandas_roundtrip_datetime(unit):
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+    from datetime import datetime as dt
+
+    # timezones not included as they are not yet supported in
+    # the pandas implementation
+    dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)]
+    table = pa.table({"a": pa.array(dt_arr, type=pa.timestamp(unit))})
+
+    if Version(pd.__version__) < Version("1.6"):
+        # pandas < 2.0 always creates datetime64 in "ns"
+        # resolution
+        expected = pa.table({"a": pa.array(dt_arr, type=pa.timestamp('ns'))})
+    else:
+        expected = table
+
+    from pandas.api.interchange import (
+        from_dataframe as pandas_from_dataframe
+    )
+    pandas_df = pandas_from_dataframe(table)
+    result = pi.from_dataframe(pandas_df)
+
+    assert expected.equals(result)
+
+    expected_protocol = expected.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert expected_protocol.num_columns() == result_protocol.num_columns()
+    assert expected_protocol.num_rows() == result_protocol.num_rows()
+    assert expected_protocol.num_chunks() == result_protocol.num_chunks()
+    assert expected_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+    "np_float_str", ["float32", "float64"]
+)
+def test_pandas_to_pyarrow_with_missing(np_float_str):
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    np_array = np.array([0, np.nan, 2], dtype=np.dtype(np_float_str))
+    datetime_array = [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+    df = pd.DataFrame({
+        # float, ColumnNullType.USE_NAN
+        "a": np_array,
+        # ColumnNullType.USE_SENTINEL
+        "dt": np.array(datetime_array, dtype="datetime64[ns]")
+    })
+    expected = pa.table({
+        "a": pa.array(np_array, from_pandas=True),
+        "dt": pa.array(datetime_array, type=pa.timestamp("ns"))
+    })
+    result = pi.from_dataframe(df)
+
+    assert result.equals(expected)
+
+
+@pytest.mark.pandas
+def test_pandas_to_pyarrow_float16_with_missing():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # np.float16 errors if ps.is_nan is used
+    # pyarrow.lib.ArrowNotImplementedError: Function 'is_nan' has no kernel
+    # matching input types (halffloat)
+    np_array = np.array([0, np.nan, 2], dtype=np.float16)
+    df = pd.DataFrame({"a": np_array})
+
+    with pytest.raises(NotImplementedError):
+        pi.from_dataframe(df)
+
+
+@pytest.mark.numpy
+@pytest.mark.parametrize(
+    "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+    "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+    "float, np_float_str", [
+        (pa.float16(), "float16"),
+        (pa.float32(), "float32"),
+        (pa.float64(), "float64")
+    ]
+)
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['America/New_York', '+07:30', '-04:30'])
+@pytest.mark.parametrize("offset, length", [(0, 3), (0, 2), (1, 2), (2, 1)])
+def test_pyarrow_roundtrip(uint, int, float, np_float_str,
+                           unit, tz, offset, length):
+
+    from datetime import datetime as dt
+    arr = [1, 2, None]
+    dt_arr = [dt(2007, 7, 13), None, dt(2007, 7, 15)]
+
+    table = pa.table(
+        {
+            "a": pa.array(arr, type=uint),
+            "b": pa.array(arr, type=int),
+            "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)),
+                          type=float, from_pandas=True),
+            "d": [True, False, True],
+            "e": [True, False, None],
+            "f": ["a", None, "c"],
+            "g": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+        }
+    )
+    table = table.slice(offset, length)
+    result = _from_dataframe(table.__dataframe__())
+
+    assert table.equals(result)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+
+@pytest.mark.parametrize("offset, length", [(0, 10), (0, 2), (7, 3), (2, 1)])
+def test_pyarrow_roundtrip_categorical(offset, length):
+    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", None, "Sun"]
+    table = pa.table(
+        {"weekday": pa.array(arr).dictionary_encode()}
+    )
+    table = table.slice(offset, length)
+    result = _from_dataframe(table.__dataframe__())
+
+    assert table.equals(result)
+
+    table_protocol = table.__dataframe__()
+    result_protocol = result.__dataframe__()
+
+    assert table_protocol.num_columns() == result_protocol.num_columns()
+    assert table_protocol.num_rows() == result_protocol.num_rows()
+    assert table_protocol.num_chunks() == result_protocol.num_chunks()
+    assert table_protocol.column_names() == result_protocol.column_names()
+
+    col_table = table_protocol.get_column(0)
+    col_result = result_protocol.get_column(0)
+
+    assert col_result.dtype[0] == DtypeKind.CATEGORICAL
+    assert col_result.dtype[0] == col_table.dtype[0]
+    assert col_result.size() == col_table.size()
+    assert col_result.offset == col_table.offset
+
+    desc_cat_table = col_table.describe_categorical
+    desc_cat_result = col_result.describe_categorical
+
+    assert desc_cat_table["is_ordered"] == desc_cat_result["is_ordered"]
+    assert desc_cat_table["is_dictionary"] == desc_cat_result["is_dictionary"]
+    assert isinstance(desc_cat_result["categories"]._col, pa.Array)
+
+
+@pytest.mark.large_memory
+def test_pyarrow_roundtrip_large_string():
+
+    data = np.array([b'x'*1024]*(3*1024**2), dtype='object')  # 3GB bytes data
+    arr = pa.array(data, type=pa.large_string())
+    table = pa.table([arr], names=["large_string"])
+
+    result = _from_dataframe(table.__dataframe__())
+    col = result.__dataframe__().get_column(0)
+
+    assert col.size() == 3*1024**2
+    assert pa.types.is_large_string(table[0].type)
+    assert pa.types.is_large_string(result[0].type)
+
+    assert table.equals(result)
+
+
+def test_nan_as_null():
+    table = pa.table({"a": [1, 2, 3, 4]})
+    with pytest.raises(RuntimeError):
+        table.__dataframe__(nan_as_null=True)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Test that an error is raised when a copy is needed
+    # to create a bitmask
+
+    df = pd.DataFrame({"a": [0, 1.0, 2.0]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({
+        "dt": [None, dt(2007, 7, 14), dt(2007, 7, 15)]
+    })
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+
+@pytest.mark.pandas
+def test_allow_copy_false_bool_categorical():
+    if Version(pd.__version__) < Version("1.5.0"):
+        pytest.skip("__dataframe__ added to pandas in 1.5.0")
+
+    # Test that an error is raised for boolean
+    # and categorical dtype (copy is always made)
+
+    df = pd.DataFrame({"a": [None, False, True]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({"a": [True, False, True]})
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({"weekday": ["a", "b", None]})
+    df = df.astype("category")
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+    df = pd.DataFrame({"weekday": ["a", "b", "c"]})
+    df = df.astype("category")
+    with pytest.raises(RuntimeError):
+        pi.from_dataframe(df, allow_copy=False)
+
+
+def test_empty_dataframe():
+    schema = pa.schema([('col1', pa.int8())])
+    df = pa.table([[]], schema=schema)
+    dfi = df.__dataframe__()
+    assert pi.from_dataframe(dfi) == df
diff --git a/pyarrow/tests/interchange/test_interchange_spec.py b/pyarrow/tests/interchange/test_interchange_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..cea694d1c1ee24f5af933f59fcb07e3e38141f9d
--- /dev/null
+++ b/pyarrow/tests/interchange/test_interchange_spec.py
@@ -0,0 +1,294 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import ctypes
+import hypothesis as h
+import hypothesis.strategies as st
+
+import pytest
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import pyarrow as pa
+import pyarrow.tests.strategies as past
+
+
+all_types = st.deferred(
+    lambda: (
+        past.signed_integer_types |
+        past.unsigned_integer_types |
+        past.floating_types |
+        past.bool_type |
+        past.string_type |
+        past.large_string_type
+    )
+)
+
+
+# datetime is tested in test_extra.py
+# dictionary is tested in test_categorical()
+@pytest.mark.numpy
+@h.settings(suppress_health_check=(h.HealthCheck.too_slow,))
+@h.given(past.arrays(all_types, size=3))
+def test_dtypes(arr):
+    table = pa.table([arr], names=["a"])
+    df = table.__dataframe__()
+
+    null_count = df.get_column(0).null_count
+    assert null_count == arr.null_count
+    assert isinstance(null_count, int)
+    assert df.get_column(0).size() == 3
+    assert df.get_column(0).offset == 0
+
+
+@pytest.mark.numpy
+@pytest.mark.parametrize(
+    "uint, uint_bw",
+    [
+        (pa.uint8(), 8),
+        (pa.uint16(), 16),
+        (pa.uint32(), 32)
+    ]
+)
+@pytest.mark.parametrize(
+    "int, int_bw", [
+        (pa.int8(), 8),
+        (pa.int16(), 16),
+        (pa.int32(), 32),
+        (pa.int64(), 64)
+    ]
+)
+@pytest.mark.parametrize(
+    "float, float_bw, np_float_str", [
+        (pa.float16(), 16, "float16"),
+        (pa.float32(), 32, "float32"),
+        (pa.float64(), 64, "float64")
+    ]
+)
+@pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns'])
+@pytest.mark.parametrize("tz", ['', 'America/New_York', '+07:30', '-04:30'])
+@pytest.mark.parametrize("use_batch", [False, True])
+def test_mixed_dtypes(uint, uint_bw, int, int_bw,
+                      float, float_bw, np_float_str, unit, tz,
+                      use_batch):
+    from datetime import datetime as dt
+    arr = [1, 2, 3]
+    dt_arr = [dt(2007, 7, 13), dt(2007, 7, 14), dt(2007, 7, 15)]
+    table = pa.table(
+        {
+            "a": pa.array(arr, type=uint),
+            "b": pa.array(arr, type=int),
+            "c": pa.array(np.array(arr, dtype=np.dtype(np_float_str)), type=float),
+            "d": [True, False, True],
+            "e": ["a", "", "c"],
+            "f": pa.array(dt_arr, type=pa.timestamp(unit, tz=tz))
+        }
+    )
+    if use_batch:
+        table = table.to_batches()[0]
+    df = table.__dataframe__()
+    # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT,
+    # 20 = DtypeKind.BOOL, 21 = DtypeKind.STRING, 22 = DtypeKind.DATETIME
+    # see DtypeKind class in column.py
+    columns = {"a": 1, "b": 0, "c": 2, "d": 20, "e": 21, "f": 22}
+
+    for column, kind in columns.items():
+        col = df.get_column_by_name(column)
+
+        assert col.null_count == 0
+        assert col.size() == 3
+        assert col.offset == 0
+        assert col.dtype[0] == kind
+
+    assert df.get_column_by_name("a").dtype[1] == uint_bw
+    assert df.get_column_by_name("b").dtype[1] == int_bw
+    assert df.get_column_by_name("c").dtype[1] == float_bw
+
+
+def test_na_float():
+    table = pa.table({"a": [1.0, None, 2.0]})
+    df = table.__dataframe__()
+    col = df.get_column_by_name("a")
+    assert col.null_count == 1
+    assert isinstance(col.null_count, int)
+
+
+def test_noncategorical():
+    table = pa.table({"a": [1, 2, 3]})
+    df = table.__dataframe__()
+    col = df.get_column_by_name("a")
+    with pytest.raises(TypeError, match=".*categorical.*"):
+        col.describe_categorical
+
+
+@pytest.mark.parametrize("use_batch", [False, True])
+def test_categorical(use_batch):
+    import pyarrow as pa
+    arr = ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", None]
+    table = pa.table(
+        {"weekday": pa.array(arr).dictionary_encode()}
+    )
+    if use_batch:
+        table = table.to_batches()[0]
+
+    col = table.__dataframe__().get_column_by_name("weekday")
+    categorical = col.describe_categorical
+    assert isinstance(categorical["is_ordered"], bool)
+    assert isinstance(categorical["is_dictionary"], bool)
+
+
+@pytest.mark.parametrize("use_batch", [False, True])
+def test_dataframe(use_batch):
+    n = pa.chunked_array([[2, 2, 4], [4, 5, 100]])
+    a = pa.chunked_array([["Flamingo", "Parrot", "Cow"],
+                         ["Horse", "Brittle stars", "Centipede"]])
+    table = pa.table([n, a], names=['n_legs', 'animals'])
+    if use_batch:
+        table = table.combine_chunks().to_batches()[0]
+    df = table.__dataframe__()
+
+    assert df.num_columns() == 2
+    assert df.num_rows() == 6
+    if use_batch:
+        assert df.num_chunks() == 1
+    else:
+        assert df.num_chunks() == 2
+    assert list(df.column_names()) == ['n_legs', 'animals']
+    assert list(df.select_columns((1,)).column_names()) == list(
+        df.select_columns_by_name(("animals",)).column_names()
+    )
+
+
+@pytest.mark.parametrize("use_batch", [False, True])
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_df_get_chunks(use_batch, size, n_chunks):
+    table = pa.table({"x": list(range(size))})
+    if use_batch:
+        table = table.to_batches()[0]
+    df = table.__dataframe__()
+    chunks = list(df.get_chunks(n_chunks))
+    assert len(chunks) == n_chunks
+    assert sum(chunk.num_rows() for chunk in chunks) == size
+
+
+@pytest.mark.parametrize("use_batch", [False, True])
+@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)])
+def test_column_get_chunks(use_batch, size, n_chunks):
+    table = pa.table({"x": list(range(size))})
+    if use_batch:
+        table = table.to_batches()[0]
+    df = table.__dataframe__()
+    chunks = list(df.get_column(0).get_chunks(n_chunks))
+    assert len(chunks) == n_chunks
+    assert sum(chunk.size() for chunk in chunks) == size
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+    "uint", [pa.uint8(), pa.uint16(), pa.uint32()]
+)
+@pytest.mark.parametrize(
+    "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize(
+    "float, np_float_str", [
+        (pa.float16(), "float16"),
+        (pa.float32(), "float32"),
+        (pa.float64(), "float64")
+    ]
+)
+@pytest.mark.parametrize("use_batch", [False, True])
+def test_get_columns(uint, int, float, np_float_str, use_batch):
+    arr = [[1, 2, 3], [4, 5]]
+    arr_float = np.array([1, 2, 3, 4, 5], dtype=np.dtype(np_float_str))
+    table = pa.table(
+        {
+            "a": pa.chunked_array(arr, type=uint),
+            "b": pa.chunked_array(arr, type=int),
+            "c": pa.array(arr_float, type=float)
+        }
+    )
+    if use_batch:
+        table = table.combine_chunks().to_batches()[0]
+    df = table.__dataframe__()
+    for col in df.get_columns():
+        assert col.size() == 5
+        assert col.num_chunks() == 1
+
+    # 0 = DtypeKind.INT, 1 = DtypeKind.UINT, 2 = DtypeKind.FLOAT,
+    # see DtypeKind class in column.py
+    assert df.get_column(0).dtype[0] == 1  # UINT
+    assert df.get_column(1).dtype[0] == 0  # INT
+    assert df.get_column(2).dtype[0] == 2  # FLOAT
+
+
+@pytest.mark.parametrize(
+    "int", [pa.int8(), pa.int16(), pa.int32(), pa.int64()]
+)
+@pytest.mark.parametrize("use_batch", [False, True])
+def test_buffer(int, use_batch):
+    arr = [0, 1, -1]
+    table = pa.table({"a": pa.array(arr, type=int)})
+    if use_batch:
+        table = table.to_batches()[0]
+    df = table.__dataframe__()
+    col = df.get_column(0)
+    buf = col.get_buffers()
+
+    dataBuf, dataDtype = buf["data"]
+
+    assert dataBuf.bufsize > 0
+    assert dataBuf.ptr != 0
+    device, _ = dataBuf.__dlpack_device__()
+
+    # 0 = DtypeKind.INT
+    # see DtypeKind class in column.py
+    assert dataDtype[0] == 0
+
+    if device == 1:  # CPU-only as we're going to directly read memory here
+        bitwidth = dataDtype[1]
+        ctype = {
+            8: ctypes.c_int8,
+            16: ctypes.c_int16,
+            32: ctypes.c_int32,
+            64: ctypes.c_int64,
+        }[bitwidth]
+
+        for idx, truth in enumerate(arr):
+            val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value
+            assert val == truth, f"Buffer at index {idx} mismatch"
+
+
+@pytest.mark.parametrize(
+    "indices_type, bitwidth, f_string", [
+        (pa.int8(), 8, "c"),
+        (pa.int16(), 16, "s"),
+        (pa.int32(), 32, "i"),
+        (pa.int64(), 64, "l")
+    ]
+)
+def test_categorical_dtype(indices_type, bitwidth, f_string):
+    type = pa.dictionary(indices_type, pa.string())
+    arr = pa.array(["a", "b", None, "d"], type)
+    table = pa.table({'a': arr})
+
+    df = table.__dataframe__()
+    col = df.get_column(0)
+    assert col.dtype[0] == 23  # <DtypeKind.CATEGORICAL: 23>
+    assert col.dtype[1] == bitwidth
+    assert col.dtype[2] == f_string
diff --git a/pyarrow/tests/parquet/__init__.py b/pyarrow/tests/parquet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d08d67d2860f480a17d3b2508d4cad70ea8b27b1
--- /dev/null
+++ b/pyarrow/tests/parquet/__init__.py
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = [
+    pytest.mark.parquet,
+]
diff --git a/pyarrow/tests/parquet/common.py b/pyarrow/tests/parquet/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..5390a24b90d207cb159523e9bf571136536f580e
--- /dev/null
+++ b/pyarrow/tests/parquet/common.py
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+import pyarrow as pa
+from pyarrow.tests import util
+
+
+def _write_table(table, path, **kwargs):
+    # So we see the ImportError somewhere
+    import pyarrow.parquet as pq
+    from pyarrow.pandas_compat import _pandas_api
+
+    if _pandas_api.is_data_frame(table):
+        table = pa.Table.from_pandas(table)
+
+    pq.write_table(table, path, **kwargs)
+    return table
+
+
+def _read_table(*args, **kwargs):
+    import pyarrow.parquet as pq
+
+    table = pq.read_table(*args, **kwargs)
+    table.validate(full=True)
+    return table
+
+
+def _roundtrip_table(table, read_table_kwargs=None,
+                     write_table_kwargs=None):
+    read_table_kwargs = read_table_kwargs or {}
+    write_table_kwargs = write_table_kwargs or {}
+
+    writer = pa.BufferOutputStream()
+    _write_table(table, writer, **write_table_kwargs)
+    reader = pa.BufferReader(writer.getvalue())
+    return _read_table(reader, **read_table_kwargs)
+
+
+def _check_roundtrip(table, expected=None, read_table_kwargs=None,
+                     **write_table_kwargs):
+    if expected is None:
+        expected = table
+
+    read_table_kwargs = read_table_kwargs or {}
+
+    # intentionally check twice
+    result = _roundtrip_table(table, read_table_kwargs=read_table_kwargs,
+                              write_table_kwargs=write_table_kwargs)
+    assert result.schema == expected.schema
+    assert result.equals(expected)
+    result = _roundtrip_table(result, read_table_kwargs=read_table_kwargs,
+                              write_table_kwargs=write_table_kwargs)
+    assert result.schema == expected.schema
+    assert result.equals(expected)
+
+
+def _roundtrip_pandas_dataframe(df, write_kwargs):
+    table = pa.Table.from_pandas(df)
+    result = _roundtrip_table(
+        table, write_table_kwargs=write_kwargs)
+    return result.to_pandas()
+
+
+def _random_integers(size, dtype):
+    # We do not generate integers outside the int64 range
+    platform_int_info = np.iinfo('int_')
+    iinfo = np.iinfo(dtype)
+    return np.random.randint(max(iinfo.min, platform_int_info.min),
+                             min(iinfo.max, platform_int_info.max),
+                             size=size, dtype=dtype)
+
+
+def _range_integers(size, dtype):
+    return pa.array(np.arange(size, dtype=dtype))
+
+
+def _test_dict(size=10000, seed=0):
+    np.random.seed(seed)
+    return {
+        'uint8': _random_integers(size, np.uint8),
+        'uint16': _random_integers(size, np.uint16),
+        'uint32': _random_integers(size, np.uint32),
+        'uint64': _random_integers(size, np.uint64),
+        'int8': _random_integers(size, np.int8),
+        'int16': _random_integers(size, np.int16),
+        'int32': _random_integers(size, np.int32),
+        'int64': _random_integers(size, np.int64),
+        'float32': np.random.randn(size).astype(np.float32),
+        'float64': np.arange(size, dtype=np.float64),
+        'bool': np.random.randn(size) > 0,
+        'strings': [util.rands(10) for i in range(size)],
+        'all_none': [None] * size,
+        'all_none_category': [None] * size
+    }
+
+
+def _test_dataframe(size=10000, seed=0):
+    import pandas as pd
+
+    df = pd.DataFrame(_test_dict(size, seed))
+
+    # TODO(PARQUET-1015)
+    # df['all_none_category'] = df['all_none_category'].astype('category')
+    return df
+
+
+def _test_table(size=10000, seed=0):
+    return pa.Table.from_pydict(_test_dict(size, seed))
+
+
+def make_sample_file(table_or_df):
+    import pyarrow.parquet as pq
+
+    if isinstance(table_or_df, pa.Table):
+        a_table = table_or_df
+    else:
+        a_table = pa.Table.from_pandas(table_or_df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, compression='SNAPPY', version='2.6')
+
+    buf.seek(0)
+    return pq.ParquetFile(buf)
+
+
+def alltypes_sample(size=10000, seed=0, categorical=False):
+    import pandas as pd
+
+    np.random.seed(seed)
+    arrays = {
+        'uint8': np.arange(size, dtype=np.uint8),
+        'uint16': np.arange(size, dtype=np.uint16),
+        'uint32': np.arange(size, dtype=np.uint32),
+        'uint64': np.arange(size, dtype=np.uint64),
+        'int8': np.arange(size, dtype=np.int16),
+        'int16': np.arange(size, dtype=np.int16),
+        'int32': np.arange(size, dtype=np.int32),
+        'int64': np.arange(size, dtype=np.int64),
+        'float16': np.arange(size, dtype=np.float16),
+        'float32': np.arange(size, dtype=np.float32),
+        'float64': np.arange(size, dtype=np.float64),
+        'bool': np.random.randn(size) > 0,
+        'datetime_ms': np.arange("2016-01-01T00:00:00.001", size,
+                                 dtype='datetime64[ms]'),
+        'datetime_us': np.arange("2016-01-01T00:00:00.000001", size,
+                                 dtype='datetime64[us]'),
+        'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size,
+                                 dtype='datetime64[ns]'),
+        'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
+        'str': pd.Series([str(x) for x in range(size)]),
+        'empty_str': [''] * size,
+        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
+        'null': [None] * size,
+        'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
+    }
+    if categorical:
+        arrays['str_category'] = arrays['str'].astype('category')
+    return pd.DataFrame(arrays)
diff --git a/pyarrow/tests/parquet/conftest.py b/pyarrow/tests/parquet/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9685d6b8bbaa013f168cdb7524313316c4833ae
--- /dev/null
+++ b/pyarrow/tests/parquet/conftest.py
@@ -0,0 +1,110 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import pathlib
+import sys
+
+import pytest
+
+from pyarrow.util import guid
+
+
+@pytest.fixture(scope='module')
+def datadir(base_datadir):
+    return base_datadir / 'parquet'
+
+
+@pytest.fixture(scope='module')
+def parquet_test_datadir():
+    if sys.platform == 'emscripten':
+        pytest.skip("needs PARQUET_TEST_DATA files access")
+    result = os.environ.get('PARQUET_TEST_DATA')
+    if not result:
+        raise RuntimeError('Please point the PARQUET_TEST_DATA environment '
+                           'variable to the test data directory')
+    return pathlib.Path(result)
+
+
+@pytest.fixture
+def s3_bucket(s3_server):
+    boto3 = pytest.importorskip('boto3')
+    botocore = pytest.importorskip('botocore')
+    s3_bucket_name = 'test-s3fs'
+
+    host, port, access_key, secret_key = s3_server['connection']
+    s3_client = boto3.client(
+        's3',
+        endpoint_url=f'http://{host}:{port}',
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        config=botocore.client.Config(signature_version='s3v4'),
+        region_name='us-east-1'
+    )
+
+    try:
+        s3_client.create_bucket(Bucket=s3_bucket_name)
+    except Exception:
+        pass  # we get BucketAlreadyOwnedByYou error with fsspec handler
+    finally:
+        s3_client.close()
+
+    return s3_bucket_name
+
+
+@pytest.fixture
+def s3_example_s3fs(s3_server, s3_bucket):
+    s3fs = pytest.importorskip('s3fs')
+
+    host, port, access_key, secret_key = s3_server['connection']
+    fs = s3fs.S3FileSystem(
+        key=access_key,
+        secret=secret_key,
+        client_kwargs={
+            'endpoint_url': f'http://{host}:{port}'
+        }
+    )
+
+    test_path = f'{s3_bucket}/{guid()}'
+
+    fs.mkdir(test_path)
+    yield fs, test_path
+    try:
+        fs.rm(test_path, recursive=True)
+    except FileNotFoundError:
+        pass
+
+
+@pytest.fixture
+def s3_example_fs(s3_server):
+    from pyarrow.fs import FileSystem
+
+    host, port, access_key, secret_key = s3_server['connection']
+    uri = (
+        f"s3://{access_key}:{secret_key}@mybucket/data.parquet?scheme=http"
+        f"&endpoint_override={host}:{port}&allow_bucket_creation=True"
+    )
+    fs, path = FileSystem.from_uri(uri)
+
+    fs.create_dir("mybucket")
+
+    yield fs, uri, path
+
+
+@pytest.fixture(scope="class")
+def reusable_tempdir(tmp_path_factory):
+    return tmp_path_factory.mktemp('pyarrow-parquet')
diff --git a/pyarrow/tests/parquet/encryption.py b/pyarrow/tests/parquet/encryption.py
new file mode 100644
index 0000000000000000000000000000000000000000..efaee1d08a9386d2940b546adabd71a872b447b7
--- /dev/null
+++ b/pyarrow/tests/parquet/encryption.py
@@ -0,0 +1,127 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import base64
+import pyarrow.parquet.encryption as pe
+from pyarrow._parquet_encryption import FileSystemKeyMaterialStore
+import re
+
+
+class InMemoryKmsClient(pe.KmsClient):
+    """This is a mock class implementation of KmsClient, built for testing
+    only.
+    """
+
+    def __init__(self, config):
+        """Create an InMemoryKmsClient instance."""
+        pe.KmsClient.__init__(self)
+        self.master_keys_map = config.custom_kms_conf
+
+    def wrap_key(self, key_bytes, master_key_identifier):
+        """Not a secure cipher - the wrapped key
+        is just the master key concatenated with key bytes"""
+        master_key_bytes = self.master_keys_map[master_key_identifier].encode(
+            'utf-8')
+        wrapped_key = b"".join([master_key_bytes, key_bytes])
+        result = base64.b64encode(wrapped_key)
+        return result
+
+    def unwrap_key(self, wrapped_key, master_key_identifier):
+        """Not a secure cipher - just extract the key from
+        the wrapped key"""
+        if master_key_identifier not in self.master_keys_map:
+            raise ValueError("Unknown master key", master_key_identifier)
+        expected_master_key = self.master_keys_map[master_key_identifier]
+        decoded_wrapped_key = base64.b64decode(wrapped_key)
+        master_key_bytes = decoded_wrapped_key[:16]
+        decrypted_key = decoded_wrapped_key[16:]
+        if (expected_master_key == master_key_bytes.decode('utf-8')):
+            return decrypted_key
+        raise ValueError("Incorrect master key used",
+                         master_key_bytes, decrypted_key)
+
+
+def parse_wrapped_key(wrapped_key: str) -> tuple[str, int, bytes]:
+    """Parses a wrapped key string into a tuple: (key id, version, key) given
+    input in the form: <key id>:v<version>:<bas64 encoded key>"""
+    ptn = re.compile("(.+?):v([0-9]+?):(.+)")
+    if m := ptn.fullmatch(wrapped_key):
+        id, version, b64key = m.groups()
+        version = int(version)
+        key = base64.b64decode(b64key)
+        return (id, version, key)
+    else:
+        raise ValueError("Cannot parse wrapped key", wrapped_key)
+
+
+MASTER_KEY_VERSION = "master_key_version"
+
+
+class MockVersioningKmsClient(pe.KmsClient):
+    """This is a mock class implementation of KmsClient, built for testing
+    only.
+
+    During tests that involve CryptoFactory.rotate_master_keys, separate
+    instances of this client will be created when writing, rotating keys, and
+    reading back parquet data. To help unit tests verify that external key
+    material was stored correctly at each step, this client wraps keys with a
+    master_key_identifier and a version number. To ensure each client wraps
+    with the correct version, the current version is persisted in the
+    key_access_token attribute of the KmsConnectionConfig shared by all clients
+    """
+
+    def __init__(self, connection_config) -> None:
+        pe.KmsClient.__init__(self)
+        self.connection_config = connection_config
+
+    @property
+    def master_key_version(self) -> int:
+        return int(self.connection_config.key_access_token)
+
+    def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str:
+        b64key = base64.b64encode(key_bytes).decode('utf-8')
+        return f"{master_key_identifier}:v{self.master_key_version}:{b64key}"
+
+    def unwrap_key(
+            self,
+            wrapped_key: str,
+            master_key_identifier: str) -> bytes:
+        key_id, _, key = parse_wrapped_key(wrapped_key)
+        if key_id != master_key_identifier:
+            raise ValueError("Mismatched master key identifiers:",
+                             key_id, master_key_identifier)
+        return key
+
+
+def verify_file_encrypted(path):
+    """Verify that the file is encrypted by looking at its first 4 bytes.
+    If it's the magic string PARE
+    then this is a parquet with encrypted footer."""
+    with open(path, "rb") as file:
+        magic_str = file.read(4)
+        # Verify magic string for parquet with encrypted footer is PARE
+        assert magic_str == b'PARE'
+
+
+def read_external_keys_to_dict(path):
+    """Reads an external key material store given a parquet file path and
+    returns a dict mapping master_key_id to KeyMaterial objects"""
+    store = FileSystemKeyMaterialStore.for_file(path)
+    keys = dict()
+    for id in store.get_key_id_set():
+        key_material = store.get_key_material(id)
+        keys[key_material.master_key_id] = key_material
+    return keys
diff --git a/pyarrow/tests/parquet/test_basic.py b/pyarrow/tests/parquet/test_basic.py
new file mode 100644
index 0000000000000000000000000000000000000000..94868741f39aaea05e9b3578e0efe0465dd745e0
--- /dev/null
+++ b/pyarrow/tests/parquet/test_basic.py
@@ -0,0 +1,1002 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import os
+import sys
+from collections import OrderedDict
+import io
+import warnings
+from shutil import copytree
+from decimal import Decimal
+
+import pytest
+
+import pyarrow as pa
+from pyarrow import fs
+from pyarrow.tests import util
+from pyarrow.tests.parquet.common import (_check_roundtrip, _roundtrip_table,
+                                          _test_table)
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import _read_table, _write_table
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.pandas_examples import dataframe_with_lists
+    from pyarrow.tests.parquet.common import alltypes_sample
+except ImportError:
+    pd = tm = None
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+def test_parquet_invalid_version(tempdir):
+    table = pa.table({'a': [1, 2, 3]})
+    with pytest.raises(ValueError, match="Unsupported Parquet format version"):
+        _write_table(table, tempdir / 'test_version.parquet', version="2.2")
+    with pytest.raises(ValueError, match="Unsupported Parquet data page " +
+                       "version"):
+        _write_table(table, tempdir / 'test_version.parquet',
+                     data_page_version="2.2")
+
+
+def test_set_data_page_size():
+    arr = pa.array([1, 2, 3] * 100000)
+    t = pa.Table.from_arrays([arr], names=['f0'])
+
+    # 128K, 512K
+    page_sizes = [2 << 16, 2 << 18]
+    for target_page_size in page_sizes:
+        _check_roundtrip(t, data_page_size=target_page_size)
+
+
+@pytest.mark.numpy
+def test_set_write_batch_size():
+    table = _test_table(100)
+
+    _check_roundtrip(
+        table, data_page_size=10, write_batch_size=1, version='2.4'
+    )
+
+
+@pytest.mark.numpy
+def test_set_dictionary_pagesize_limit():
+    table = _test_table(100)
+
+    _check_roundtrip(table, dictionary_pagesize_limit=1,
+                     data_page_size=10, version='2.4')
+
+    with pytest.raises(TypeError):
+        _check_roundtrip(table, dictionary_pagesize_limit="a",
+                         data_page_size=10, version='2.4')
+
+
+@pytest.mark.pandas
+def test_chunked_table_write():
+    # ARROW-232
+    tables = []
+    batch = pa.RecordBatch.from_pandas(alltypes_sample(size=10))
+    tables.append(pa.Table.from_batches([batch] * 3))
+    df, _ = dataframe_with_lists()
+    batch = pa.RecordBatch.from_pandas(df)
+    tables.append(pa.Table.from_batches([batch] * 3))
+
+    for data_page_version in ['1.0', '2.0']:
+        for use_dictionary in [True, False]:
+            for table in tables:
+                _check_roundtrip(
+                    table, version='2.6',
+                    data_page_version=data_page_version,
+                    use_dictionary=use_dictionary)
+
+
+@pytest.mark.pandas
+def test_memory_map(tempdir):
+    df = alltypes_sample(size=10)
+
+    table = pa.Table.from_pandas(df)
+    _check_roundtrip(table, read_table_kwargs={'memory_map': True},
+                     version='2.6')
+
+    filename = str(tempdir / 'tmp_file')
+    with open(filename, 'wb') as f:
+        _write_table(table, f, version='2.6')
+    table_read = pq.read_pandas(filename, memory_map=True)
+    assert table_read.equals(table)
+
+
+@pytest.mark.pandas
+def test_enable_buffered_stream(tempdir):
+    df = alltypes_sample(size=10)
+
+    table = pa.Table.from_pandas(df)
+    _check_roundtrip(table, read_table_kwargs={'buffer_size': 1025},
+                     version='2.6')
+
+    filename = str(tempdir / 'tmp_file')
+    with open(filename, 'wb') as f:
+        _write_table(table, f, version='2.6')
+    table_read = pq.read_pandas(filename, buffer_size=4096)
+    assert table_read.equals(table)
+
+
+def test_special_chars_filename(tempdir):
+    table = pa.Table.from_arrays([pa.array([42])], ["ints"])
+    filename = "foo # bar"
+    path = tempdir / filename
+    assert not path.exists()
+    _write_table(table, str(path))
+    assert path.exists()
+    table_read = _read_table(str(path))
+    assert table_read.equals(table)
+
+
+def test_invalid_source():
+    # Test that we provide an helpful error message pointing out
+    # that None wasn't expected when trying to open a Parquet None file.
+    with pytest.raises(TypeError, match="None"):
+        pq.read_table(None)
+
+    with pytest.raises(TypeError, match="None"):
+        pq.ParquetFile(None)
+
+
+def test_read_table_without_dataset(tempdir):
+    from unittest import mock
+
+    class MockParquetDataset:
+        def __init__(self, *args, **kwargs):
+            raise ImportError("MockParquetDataset")
+
+    path = tempdir / "test.parquet"
+    table = pa.table({"a": [1, 2, 3]})
+    _write_table(table, path)
+
+    with mock.patch('pyarrow.parquet.core.ParquetDataset', new=MockParquetDataset):
+        with pytest.raises(ValueError, match="the 'filters' keyword"):
+            pq.read_table(path, filters=[('integer', '=', 1)])
+        with pytest.raises(ValueError, match="the 'partitioning' keyword"):
+            pq.read_table(path, partitioning=['week', 'color'])
+        with pytest.raises(ValueError, match="the 'schema' argument"):
+            pq.read_table(path, schema=table.schema)
+        with pytest.raises(ValueError, match="the 'source' argument"):
+            pq.read_table(tempdir)
+        result = pq.read_table(path)
+        assert result == table
+
+
+@pytest.mark.slow
+def test_file_with_over_int16_max_row_groups():
+    # PARQUET-1857: Parquet encryption support introduced a INT16_MAX upper
+    # limit on the number of row groups, but this limit only impacts files with
+    # encrypted row group metadata because of the int16 row group ordinal used
+    # in the Parquet Thrift metadata. Unencrypted files are not impacted, so
+    # this test checks that it works (even if it isn't a good idea)
+    t = pa.table([list(range(40000))], names=['f0'])
+    _check_roundtrip(t, row_group_size=1)
+
+
+@pytest.mark.pandas
+def test_empty_table_roundtrip():
+    df = alltypes_sample(size=10)
+
+    # Create a non-empty table to infer the types correctly, then slice to 0
+    table = pa.Table.from_pandas(df)
+    table = pa.Table.from_arrays(
+        [col.chunk(0)[:0] for col in table.itercolumns()],
+        names=table.schema.names)
+
+    assert table.schema.field('null').type == pa.null()
+    assert table.schema.field('null_list').type == pa.list_(pa.null())
+    _check_roundtrip(
+        table, version='2.6')
+
+
+@pytest.mark.pandas
+def test_empty_table_no_columns():
+    df = pd.DataFrame()
+    empty = pa.Table.from_pandas(df, preserve_index=False)
+    _check_roundtrip(empty)
+
+
+def test_write_nested_zero_length_array_chunk_failure():
+    # Bug report in ARROW-3792
+    cols = OrderedDict(
+        int32=pa.int32(),
+        list_string=pa.list_(pa.string())
+    )
+    data = [[], [OrderedDict(int32=1, list_string=('G',)), ]]
+
+    # This produces a table with a column like
+    # <Column name='list_string' type=ListType(list<item: string>)>
+    # [
+    #   [],
+    #   [
+    #     [
+    #       "G"
+    #     ]
+    #   ]
+    # ]
+    #
+    # Each column is a ChunkedArray with 2 elements
+    my_arrays = [pa.array(batch, type=pa.struct(cols)).flatten()
+                 for batch in data]
+    my_batches = [pa.RecordBatch.from_arrays(batch, schema=pa.schema(cols))
+                  for batch in my_arrays]
+    tbl = pa.Table.from_batches(my_batches, pa.schema(cols))
+    _check_roundtrip(tbl)
+
+
+@pytest.mark.pandas
+def test_multiple_path_types(tempdir):
+    # Test compatibility with PEP 519 path-like objects
+    path = tempdir / 'zzz.parquet'
+    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
+    _write_table(df, path)
+    table_read = _read_table(path)
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+    # Test compatibility with plain string paths
+    path = str(tempdir) + 'zzz.parquet'
+    df = pd.DataFrame({'x': np.arange(10, dtype=np.int64)})
+    _write_table(df, path)
+    table_read = _read_table(path)
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+def test_fspath(tempdir):
+    # ARROW-12472 support __fspath__ objects without using str()
+    path = tempdir / "test.parquet"
+    table = pa.table({"a": [1, 2, 3]})
+    _write_table(table, path)
+
+    fs_protocol_obj = util.FSProtocolClass(path)
+
+    result = _read_table(fs_protocol_obj)
+    assert result.equals(table)
+
+    # combined with non-local filesystem raises
+    with pytest.raises(TypeError):
+        _read_table(fs_protocol_obj, filesystem=fs.FileSystem())
+
+
+@pytest.mark.parametrize("filesystem", [
+    None, fs.LocalFileSystem()
+])
+@pytest.mark.parametrize("name", ("data.parquet", "例.parquet"))
+def test_relative_paths(tempdir, filesystem, name):
+    # reading and writing from relative paths
+    table = pa.table({"a": [1, 2, 3]})
+    path = tempdir / name
+
+    # reading
+    pq.write_table(table, str(path))
+    with util.change_cwd(tempdir):
+        result = pq.read_table(name, filesystem=filesystem)
+    assert result.equals(table)
+
+    path.unlink()
+    assert not path.exists()
+
+    # writing
+    with util.change_cwd(tempdir):
+        pq.write_table(table, name, filesystem=filesystem)
+    result = pq.read_table(path)
+    assert result.equals(table)
+
+
+def test_read_non_existing_file():
+    # ensure we have a proper error message
+    with pytest.raises(FileNotFoundError):
+        pq.read_table('i-am-not-existing.parquet')
+
+
+def test_file_error_python_exception():
+    class BogusFile(io.BytesIO):
+        def read(self, *args):
+            raise ZeroDivisionError("zorglub")
+
+        def seek(self, *args):
+            raise ZeroDivisionError("zorglub")
+
+    # ensure the Python exception is restored
+    with pytest.raises(ZeroDivisionError, match="zorglub"):
+        pq.read_table(BogusFile(b""))
+
+
+def test_parquet_read_from_buffer(tempdir):
+    # reading from a buffer from python's open()
+    table = pa.table({"a": [1, 2, 3]})
+    pq.write_table(table, str(tempdir / "data.parquet"))
+
+    with open(str(tempdir / "data.parquet"), "rb") as f:
+        result = pq.read_table(f)
+    assert result.equals(table)
+
+    with open(str(tempdir / "data.parquet"), "rb") as f:
+        result = pq.read_table(pa.PythonFile(f))
+    assert result.equals(table)
+
+
+def test_byte_stream_split():
+    # This is only a smoke test.
+    arr_float = pa.array(list(map(float, range(100))))
+    arr_int = pa.array(list(map(int, range(100))))
+    arr_bool = pa.array([True, False] * 50)
+    data_float = [arr_float, arr_float]
+    table = pa.Table.from_arrays(data_float, names=['a', 'b'])
+
+    # Check with byte_stream_split for both columns.
+    _check_roundtrip(table, expected=table, compression="gzip",
+                     use_dictionary=False, use_byte_stream_split=True)
+
+    # Check with byte_stream_split for column 'b' and dictionary
+    # for column 'a'.
+    _check_roundtrip(table, expected=table, compression="gzip",
+                     use_dictionary=['a'],
+                     use_byte_stream_split=['b'])
+
+    # Check with a collision for both columns.
+    _check_roundtrip(table, expected=table, compression="gzip",
+                     use_dictionary=['a', 'b'],
+                     use_byte_stream_split=['a', 'b'])
+
+    # Check with mixed column types.
+    mixed_table = pa.Table.from_arrays([arr_float, arr_float, arr_int, arr_int],
+                                       names=['a', 'b', 'c', 'd'])
+    _check_roundtrip(mixed_table, expected=mixed_table,
+                     use_dictionary=['b', 'd'],
+                     use_byte_stream_split=['a', 'c'])
+
+    # Try to use the wrong data type with the byte_stream_split encoding.
+    # This should throw an exception.
+    table = pa.Table.from_arrays([arr_bool], names=['tmp'])
+    with pytest.raises(IOError, match='BYTE_STREAM_SPLIT only supports'):
+        _check_roundtrip(table, expected=table, use_byte_stream_split=True,
+                         use_dictionary=False)
+
+
+def test_store_decimal_as_integer(tempdir):
+    arr_decimal_1_9 = pa.array(list(map(Decimal, range(100))),
+                               type=pa.decimal128(5, 2))
+    arr_decimal_10_18 = pa.array(list(map(Decimal, range(100))),
+                                 type=pa.decimal128(16, 9))
+    arr_decimal_gt18 = pa.array(list(map(Decimal, range(100))),
+                                type=pa.decimal128(22, 2))
+    arr_bool = pa.array([True, False] * 50)
+    data_decimal = [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18]
+    table = pa.Table.from_arrays(data_decimal, names=['a', 'b', 'c'])
+
+    # Check with store_decimal_as_integer.
+    _check_roundtrip(table,
+                     expected=table,
+                     compression="gzip",
+                     use_dictionary=False,
+                     store_decimal_as_integer=True)
+
+    # Check physical type in parquet schema
+    pqtestfile_path = os.path.join(tempdir, 'test.parquet')
+    pq.write_table(table, pqtestfile_path,
+                   compression="gzip",
+                   use_dictionary=False,
+                   store_decimal_as_integer=True)
+
+    pqtestfile = pq.ParquetFile(pqtestfile_path)
+    pqcol_decimal_1_9 = pqtestfile.schema.column(0)
+    pqcol_decimal_10_18 = pqtestfile.schema.column(1)
+
+    assert pqcol_decimal_1_9.physical_type == 'INT32'
+    assert pqcol_decimal_10_18.physical_type == 'INT64'
+
+    # Check with store_decimal_as_integer and delta-int encoding.
+    # DELTA_BINARY_PACKED requires parquet physical type to be INT64 or INT32
+    _check_roundtrip(table,
+                     expected=table,
+                     compression="gzip",
+                     use_dictionary=False,
+                     store_decimal_as_integer=True,
+                     column_encoding={
+                         'a': 'DELTA_BINARY_PACKED',
+                         'b': 'DELTA_BINARY_PACKED'
+                     })
+
+    # Check with mixed column types.
+    mixed_table = pa.Table.from_arrays(
+        [arr_decimal_1_9, arr_decimal_10_18, arr_decimal_gt18, arr_bool],
+        names=['a', 'b', 'c', 'd'])
+    _check_roundtrip(mixed_table,
+                     expected=mixed_table,
+                     use_dictionary=False,
+                     store_decimal_as_integer=True)
+
+
+def test_column_encoding():
+    arr_float = pa.array(list(map(float, range(100))))
+    arr_int = pa.array(list(map(int, range(100))))
+    arr_bin = pa.array([str(x) for x in range(100)], type=pa.binary())
+    arr_flba = pa.array(
+        [str(x).zfill(10) for x in range(100)], type=pa.binary(10))
+    arr_bool = pa.array([False, True, False, False] * 25)
+    mixed_table = pa.Table.from_arrays(
+        [arr_float, arr_int, arr_bin, arr_flba, arr_bool],
+        names=['a', 'b', 'c', 'd', 'e'])
+
+    # Check "BYTE_STREAM_SPLIT" for columns 'a', 'b', 'd'
+    # and "PLAIN" column_encoding for column 'c'.
+    _check_roundtrip(mixed_table, expected=mixed_table, use_dictionary=False,
+                     column_encoding={'a': "BYTE_STREAM_SPLIT",
+                                      'b': "BYTE_STREAM_SPLIT",
+                                      'c': "PLAIN",
+                                      'd': "BYTE_STREAM_SPLIT"})
+
+    # Check "PLAIN" for all columns.
+    _check_roundtrip(mixed_table, expected=mixed_table,
+                     use_dictionary=False,
+                     column_encoding="PLAIN")
+
+    # Check "DELTA_BINARY_PACKED" for integer columns.
+    _check_roundtrip(mixed_table, expected=mixed_table,
+                     use_dictionary=False,
+                     column_encoding={'a': "PLAIN",
+                                      'b': "DELTA_BINARY_PACKED",
+                                      'c': "PLAIN"})
+
+    # Check "DELTA_LENGTH_BYTE_ARRAY" for byte columns.
+    _check_roundtrip(mixed_table, expected=mixed_table,
+                     use_dictionary=False,
+                     column_encoding={'a': "PLAIN",
+                                      'b': "DELTA_BINARY_PACKED",
+                                      'c': "DELTA_LENGTH_BYTE_ARRAY"})
+
+    # Check "DELTA_BYTE_ARRAY" for byte columns.
+    _check_roundtrip(mixed_table, expected=mixed_table,
+                     use_dictionary=False,
+                     column_encoding={'a': "PLAIN",
+                                      'b': "DELTA_BINARY_PACKED",
+                                      'c': "DELTA_BYTE_ARRAY",
+                                      'd': "DELTA_BYTE_ARRAY"})
+
+    # Check "RLE" for boolean columns.
+    _check_roundtrip(mixed_table, expected=mixed_table,
+                     use_dictionary=False,
+                     column_encoding={'e': "RLE"})
+
+    # Try to pass "BYTE_STREAM_SPLIT" column encoding for boolean column 'e'.
+    # This should throw an error as it is does not support BOOLEAN.
+    with pytest.raises(IOError,
+                       match="BYTE_STREAM_SPLIT only supports"):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         column_encoding={'a': "PLAIN",
+                                          'c': "PLAIN",
+                                          'e': "BYTE_STREAM_SPLIT"})
+
+    # Try to pass use "DELTA_BINARY_PACKED" encoding on float column.
+    # This should throw an error as only integers are supported.
+    with pytest.raises(OSError,
+                       match="DELTA_BINARY_PACKED encoder only supports"):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         column_encoding={'a': "DELTA_BINARY_PACKED",
+                                          'b': "PLAIN",
+                                          'c': "PLAIN"})
+
+    # Try to pass "RLE_DICTIONARY".
+    # This should throw an error as dictionary encoding is already used by
+    # default and not supported to be specified as "fallback" encoding
+    with pytest.raises(ValueError,
+                       match="'RLE_DICTIONARY' is already used by default"):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         column_encoding="RLE_DICTIONARY")
+
+    # Try to pass unsupported encoding.
+    with pytest.raises(ValueError,
+                       match="Unsupported column encoding: 'MADE_UP_ENCODING'"):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         column_encoding={'a': "MADE_UP_ENCODING"})
+
+    # Try to pass column_encoding and use_dictionary.
+    # This should throw an error.
+    with pytest.raises(ValueError):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=['b'],
+                         column_encoding={'b': "PLAIN"})
+
+    # Try to pass column_encoding and use_dictionary=True (default value).
+    # This should throw an error.
+    with pytest.raises(ValueError):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         column_encoding={'b': "PLAIN"})
+
+    # Try to pass column_encoding and use_byte_stream_split on same column.
+    # This should throw an error.
+    with pytest.raises(ValueError):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         use_byte_stream_split=['a'],
+                         column_encoding={'a': "RLE",
+                                          'b': "BYTE_STREAM_SPLIT",
+                                          'c': "PLAIN"})
+
+    # Try to pass column_encoding and use_byte_stream_split=True.
+    # This should throw an error.
+    with pytest.raises(ValueError):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         use_byte_stream_split=True,
+                         column_encoding={'a': "RLE",
+                                          'b': "BYTE_STREAM_SPLIT",
+                                          'c': "PLAIN"})
+
+    # Try to pass column_encoding=True.
+    # This should throw an error.
+    with pytest.raises(TypeError):
+        _check_roundtrip(mixed_table, expected=mixed_table,
+                         use_dictionary=False,
+                         column_encoding=True)
+
+
+def test_compression_level():
+    arr = pa.array(list(map(int, range(1000))))
+    data = [arr, arr]
+    table = pa.Table.from_arrays(data, names=['a', 'b'])
+
+    # Check one compression level.
+    _check_roundtrip(table, expected=table, compression="gzip",
+                     compression_level=1)
+
+    # Check another one to make sure that compression_level=1 does not
+    # coincide with the default one in Arrow.
+    _check_roundtrip(table, expected=table, compression="gzip",
+                     compression_level=5)
+
+    # Check that the user can provide a compression per column
+    _check_roundtrip(table, expected=table,
+                     compression={'a': "gzip", 'b': "snappy"})
+
+    # Check that the user can provide a compression level per column
+    _check_roundtrip(table, expected=table, compression="gzip",
+                     compression_level={'a': 2, 'b': 3})
+
+    # Check if both LZ4 compressors are working
+    # (level < 3 -> fast, level >= 3 -> HC)
+    _check_roundtrip(table, expected=table, compression="lz4",
+                     compression_level=1)
+
+    _check_roundtrip(table, expected=table, compression="lz4",
+                     compression_level=9)
+
+    # Check that specifying a compression level for a codec which does allow
+    # specifying one, results into an error.
+    # Uncompressed, snappy and lzo do not support specifying a compression
+    # level.
+    # GZIP (zlib) allows for specifying a compression level but as of up
+    # to version 1.2.11 the valid range is [-1, 9].
+    invalid_combinations = [("snappy", 4), ("gzip", -1337),
+                            ("None", 444), ("lzo", 14)]
+    buf = io.BytesIO()
+    for (codec, level) in invalid_combinations:
+        with pytest.raises((ValueError, OSError)):
+            _write_table(table, buf, compression=codec,
+                         compression_level=level)
+
+
+def test_sanitized_spark_field_names():
+    a0 = pa.array([0, 1, 2, 3, 4])
+    name = 'prohib; ,\t{}'
+    table = pa.Table.from_arrays([a0], [name])
+
+    result = _roundtrip_table(table, write_table_kwargs={'flavor': 'spark'})
+
+    expected_name = 'prohib______'
+    assert result.schema[0].name == expected_name
+
+
+@pytest.mark.pandas
+def test_multithreaded_read():
+    df = alltypes_sample(size=10000)
+
+    table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(table, buf, compression='SNAPPY', version='2.6')
+
+    buf.seek(0)
+    table1 = _read_table(buf, use_threads=True)
+
+    buf.seek(0)
+    table2 = _read_table(buf, use_threads=False)
+
+    assert table1.equals(table2)
+
+
+@pytest.mark.pandas
+def test_min_chunksize():
+    data = pd.DataFrame([np.arange(4)], columns=['A', 'B', 'C', 'D'])
+    table = pa.Table.from_pandas(data.reset_index())
+
+    buf = io.BytesIO()
+    _write_table(table, buf, chunk_size=-1)
+
+    buf.seek(0)
+    result = _read_table(buf)
+
+    assert result.equals(table)
+
+    with pytest.raises(ValueError):
+        _write_table(table, buf, chunk_size=0)
+
+
+@pytest.mark.pandas
+def test_write_error_deletes_incomplete_file(tempdir):
+    # ARROW-1285
+    df = pd.DataFrame({'a': list('abc'),
+                       'b': list(range(1, 4)),
+                       'c': np.arange(3, 6).astype('u1'),
+                       'd': np.arange(4.0, 7.0, dtype='float64'),
+                       'e': [True, False, True],
+                       'f': pd.Categorical(list('abc')),
+                       'g': pd.date_range('20130101', periods=3),
+                       'h': pd.date_range('20130101', periods=3,
+                                          tz='US/Eastern'),
+                       'i': pd.date_range('20130101', periods=3, freq='ns')})
+
+    pdf = pa.Table.from_pandas(df)
+
+    filename = tempdir / 'tmp_file'
+    try:
+        # Test relies on writing nanoseconds to raise an error
+        # true for Parquet 2.4
+        _write_table(pdf, filename, version="2.4")
+    except pa.ArrowException:
+        pass
+
+    assert not filename.exists()
+
+
+def test_read_non_existent_file(tempdir):
+    path = 'nonexistent-file.parquet'
+    try:
+        pq.read_table(path)
+    except Exception as e:
+        assert path in e.args[0]
+
+
+def test_read_table_doesnt_warn(datadir):
+    with warnings.catch_warnings():
+        warnings.simplefilter(action="error")
+        pq.read_table(datadir / 'v0.7.1.parquet')
+
+
+@pytest.mark.pandas
+def test_zlib_compression_bug():
+    # ARROW-3514: "zlib deflate failed, output buffer too small"
+    table = pa.Table.from_arrays([pa.array(['abc', 'def'])], ['some_col'])
+    f = io.BytesIO()
+    pq.write_table(table, f, compression='gzip')
+
+    f.seek(0)
+    roundtrip = pq.read_table(f)
+    tm.assert_frame_equal(roundtrip.to_pandas(), table.to_pandas())
+
+
+def test_parquet_file_too_small(tempdir):
+    path = str(tempdir / "test.parquet")
+    with pytest.raises(pa.ArrowInvalid, match='size is 0 bytes'):
+        with open(path, 'wb') as f:
+            pass
+        pq.read_table(path)
+
+    with pytest.raises(pa.ArrowInvalid, match='size is 4 bytes'):
+        with open(path, 'wb') as f:
+            f.write(b'ffff')
+        pq.read_table(path)
+
+
+@pytest.mark.pandas
+@pytest.mark.fastparquet
+@pytest.mark.filterwarnings("ignore:RangeIndex:FutureWarning")
+@pytest.mark.filterwarnings("ignore:tostring:DeprecationWarning:fastparquet")
+def test_fastparquet_cross_compatibility(tempdir):
+    fp = pytest.importorskip('fastparquet')
+
+    df = pd.DataFrame(
+        {
+            "a": list("abc"),
+            "b": list(range(1, 4)),
+            "c": np.arange(4.0, 7.0, dtype="float64"),
+            "d": [True, False, True],
+            "e": pd.date_range("20130101", periods=3),
+            "f": pd.Categorical(["a", "b", "a"]),
+            # fastparquet writes list as BYTE_ARRAY JSON, so no roundtrip
+            # "g": [[1, 2], None, [1, 2, 3]],
+        }
+    )
+    table = pa.table(df)
+
+    # Arrow -> fastparquet
+    file_arrow = str(tempdir / "cross_compat_arrow.parquet")
+    pq.write_table(table, file_arrow, compression=None)
+
+    fp_file = fp.ParquetFile(file_arrow)
+    df_fp = fp_file.to_pandas()
+    tm.assert_frame_equal(df, df_fp)
+
+    # Fastparquet -> arrow
+    file_fastparquet = str(tempdir / "cross_compat_fastparquet.parquet")
+    fp.write(file_fastparquet, df)
+
+    table_fp = pq.read_pandas(file_fastparquet)
+    # for fastparquet written file, categoricals comes back as strings
+    # (no arrow schema in parquet metadata)
+    df['f'] = df['f'].astype(object)
+    tm.assert_frame_equal(table_fp.to_pandas(), df)
+
+
+@pytest.mark.parametrize('array_factory', [
+    lambda: pa.array([0, None] * 10),
+    lambda: pa.array([0, None] * 10).dictionary_encode(),
+    lambda: pa.array(["", None] * 10),
+    lambda: pa.array(["", None] * 10).dictionary_encode(),
+])
+@pytest.mark.parametrize('read_dictionary', [False, True])
+def test_buffer_contents(
+        array_factory, read_dictionary
+):
+    # Test that null values are deterministically initialized to zero
+    # after a roundtrip through Parquet.
+    # See ARROW-8006 and ARROW-8011.
+    orig_table = pa.Table.from_pydict({"col": array_factory()})
+    bio = io.BytesIO()
+    pq.write_table(orig_table, bio, use_dictionary=True)
+    bio.seek(0)
+    read_dictionary = ['col'] if read_dictionary else None
+    table = pq.read_table(bio, use_threads=False,
+                          read_dictionary=read_dictionary)
+
+    for col in table.columns:
+        [chunk] = col.chunks
+        buf = chunk.buffers()[1]
+        assert buf.to_pybytes() == buf.size * b"\0"
+
+
+def test_parquet_compression_roundtrip(tempdir):
+    # ARROW-10480: ensure even with nonstandard Parquet file naming
+    # conventions, writing and then reading a file works. In
+    # particular, ensure that we don't automatically double-compress
+    # the stream due to auto-detecting the extension in the filename
+    table = pa.table([pa.array(range(4))], names=["ints"])
+    path = tempdir / "arrow-10480.pyarrow.gz"
+    pq.write_table(table, path, compression="GZIP")
+    result = pq.read_table(path)
+    assert result.equals(table)
+
+
+def test_empty_row_groups(tempdir):
+    # ARROW-3020
+    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
+
+    path = tempdir / 'empty_row_groups.parquet'
+
+    num_groups = 3
+    with pq.ParquetWriter(path, table.schema) as writer:
+        for i in range(num_groups):
+            writer.write_table(table)
+
+    reader = pq.ParquetFile(path)
+    assert reader.metadata.num_row_groups == num_groups
+
+    for i in range(num_groups):
+        assert reader.read_row_group(i).equals(table)
+
+
+def test_reads_over_batch(tempdir):
+    data = [None] * (1 << 20)
+    data.append([1])
+    # Large list<int64> with mostly nones and one final
+    # value.  This should force batched reads when
+    # reading back.
+    table = pa.Table.from_arrays([data], ['column'])
+
+    path = tempdir / 'arrow-11607.parquet'
+    pq.write_table(table, path)
+    table2 = pq.read_table(path)
+    assert table == table2
+
+
+def test_permutation_of_column_order(tempdir):
+    # ARROW-2366
+    case = tempdir / "dataset_column_order_permutation"
+    case.mkdir(exist_ok=True)
+
+    data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
+    pq.write_table(data1, case / "data1.parquet")
+
+    data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
+    pq.write_table(data2, case / "data2.parquet")
+
+    table = pq.read_table(str(case))
+    table2 = pa.table([[1, 2, 3, 4, 5, 6],
+                       [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]],
+                      names=['a', 'b'])
+
+    assert table == table2
+
+
+def test_thrift_size_limits(tempdir):
+    path = tempdir / 'largethrift.parquet'
+
+    array = pa.array(list(range(10)))
+    num_cols = 1000
+    table = pa.table(
+        [array] * num_cols,
+        names=[f'some_long_column_name_{i}' for i in range(num_cols)])
+    pq.write_table(table, path)
+
+    with pytest.raises(
+            OSError,
+            match="Couldn't deserialize thrift:.*Exceeded size limit"):
+        pq.read_table(path, thrift_string_size_limit=50 * num_cols)
+    with pytest.raises(
+            OSError,
+            match="Couldn't deserialize thrift:.*Exceeded size limit"):
+        pq.read_table(path, thrift_container_size_limit=num_cols)
+
+    got = pq.read_table(path, thrift_string_size_limit=100 * num_cols)
+    assert got == table
+    got = pq.read_table(path, thrift_container_size_limit=2 * num_cols)
+    assert got == table
+    got = pq.read_table(path)
+    assert got == table
+
+
+def test_page_checksum_verification_write_table(tempdir):
+    """Check that checksum verification works for datasets created with
+    pq.write_table()"""
+
+    # Write some sample data into a parquet file with page checksum enabled
+    original_path = tempdir / 'correct.parquet'
+    table_orig = pa.table({'a': [1, 2, 3, 4]})
+    pq.write_table(table_orig, original_path, write_page_checksum=True)
+
+    # Read file and verify that the data is correct
+    table_check = pq.read_table(original_path, page_checksum_verification=True)
+    assert table_orig == table_check
+
+    # Read the original file as binary and swap the 31-th and 36-th bytes. This
+    # should be equivalent to storing the following data:
+    #    pa.table({'a': [1, 3, 2, 4]})
+    bin_data = bytearray(original_path.read_bytes())
+
+    # Swap two bytes to emulate corruption. Also, check that the two bytes are
+    # different, otherwise no corruption occurs
+    assert bin_data[31] != bin_data[36]
+    bin_data[31], bin_data[36] = bin_data[36], bin_data[31]
+
+    # Write the corrupted data to another parquet file
+    corrupted_path = tempdir / 'corrupted.parquet'
+    corrupted_path.write_bytes(bin_data)
+
+    # Case 1: Reading the corrupted file with read_table() and without page
+    # checksum verification succeeds but yields corrupted data
+    table_corrupt = pq.read_table(corrupted_path,
+                                  page_checksum_verification=False)
+    # The read should complete without error, but the table has different
+    # content than the original file!
+    assert table_corrupt != table_orig
+    assert table_corrupt == pa.table({'a': [1, 3, 2, 4]})
+
+    # Case 2: Reading the corrupted file with read_table() and with page
+    # checksum verification enabled raises an exception
+    with pytest.raises(OSError, match="CRC checksum verification"):
+        _ = pq.read_table(corrupted_path, page_checksum_verification=True)
+
+    # Case 3: Reading the corrupted file with ParquetFile.read() and without
+    # page checksum verification succeeds but yields corrupted data
+    corrupted_pq_file = pq.ParquetFile(corrupted_path,
+                                       page_checksum_verification=False)
+    table_corrupt2 = corrupted_pq_file.read()
+    assert table_corrupt2 != table_orig
+    assert table_corrupt2 == pa.table({'a': [1, 3, 2, 4]})
+
+    # Case 4: Reading the corrupted file with ParquetFile.read() and with page
+    # checksum verification enabled raises an exception
+    corrupted_pq_file = pq.ParquetFile(corrupted_path,
+                                       page_checksum_verification=True)
+    # Accessing the data should result in an error
+    with pytest.raises(OSError, match="CRC checksum verification"):
+        _ = corrupted_pq_file.read()
+
+
+@pytest.mark.dataset
+def test_checksum_write_to_dataset(tempdir):
+    """Check that checksum verification works for datasets created with
+    pq.write_to_dataset"""
+
+    table_orig = pa.table({'a': [1, 2, 3, 4]})
+
+    # Write a sample dataset with page checksum enabled
+    original_dir_path = tempdir / 'correct_dir'
+    pq.write_to_dataset(table_orig,
+                        original_dir_path,
+                        write_page_checksum=True)
+
+    # Read file and verify that the data is correct
+    original_file_path_list = list(original_dir_path.iterdir())
+    assert len(original_file_path_list) == 1
+    original_path = original_file_path_list[0]
+    table_check = pq.read_table(original_path, page_checksum_verification=True)
+    assert table_orig == table_check
+
+    # Read the original file as binary and swap the 31-th and 36-th bytes. This
+    # should be equivalent to storing the following data:
+    #    pa.table({'a': [1, 3, 2, 4]})
+    bin_data = bytearray(original_path.read_bytes())
+
+    # Swap two bytes to emulate corruption. Also, check that the two bytes are
+    # different, otherwise no corruption occurs
+    assert bin_data[31] != bin_data[36]
+    bin_data[31], bin_data[36] = bin_data[36], bin_data[31]
+
+    # Write the corrupted data to another parquet dataset
+    # Copy dataset dir (which should be just one file)
+    corrupted_dir_path = tempdir / 'corrupted_dir'
+    copytree(original_dir_path, corrupted_dir_path)
+    # Corrupt just the one file with the dataset
+    corrupted_file_path = corrupted_dir_path / original_path.name
+    corrupted_file_path.write_bytes(bin_data)
+
+    # Case 1: Reading the corrupted file with read_table() and without page
+    # checksum verification succeeds but yields corrupted data
+    table_corrupt = pq.read_table(corrupted_file_path,
+                                  page_checksum_verification=False)
+    # The read should complete without error, but the table has different
+    # content than the original file!
+    assert table_corrupt != table_orig
+    assert table_corrupt == pa.table({'a': [1, 3, 2, 4]})
+
+    # Case 2: Reading the corrupted file with read_table() and with page
+    # checksum verification enabled raises an exception
+    with pytest.raises(OSError, match="CRC checksum verification"):
+        _ = pq.read_table(corrupted_file_path, page_checksum_verification=True)
+
+
+@pytest.mark.parametrize(
+    "source", ["/tmp/", ["/tmp/file1.parquet", "/tmp/file2.parquet"]])
+def test_read_table_raises_value_error_when_ds_is_unavailable(monkeypatch, source):
+    # GH-47728
+    monkeypatch.setitem(sys.modules, "pyarrow.dataset", None)
+
+    with pytest.raises(ValueError, match="the 'source' argument"):
+        pq.read_table(source=source)
diff --git a/pyarrow/tests/parquet/test_compliant_nested_type.py b/pyarrow/tests/parquet/test_compliant_nested_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..2345855a3321b6af48acfc0fcba0732e4af2c92c
--- /dev/null
+++ b/pyarrow/tests/parquet/test_compliant_nested_type.py
@@ -0,0 +1,109 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import pyarrow as pa
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import (_read_table,
+                                              _check_roundtrip)
+except ImportError:
+    pq = None
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+# Tests for ARROW-11497
+_test_data_simple = [
+    {'items': [1, 2]},
+    {'items': [0]},
+]
+
+_test_data_complex = [
+    {'items': [{'name': 'elem1', 'value': '1'},
+               {'name': 'elem2', 'value': '2'}]},
+    {'items': [{'name': 'elem1', 'value': '0'}]},
+]
+
+parametrize_test_data = pytest.mark.parametrize(
+    "test_data", [_test_data_simple, _test_data_complex])
+
+
+@pytest.mark.pandas
+@parametrize_test_data
+def test_write_compliant_nested_type_enable(tempdir, test_data):
+    # prepare dataframe for testing
+    df = pd.DataFrame(data=test_data)
+    # verify that we can read/write pandas df with new flag (default behaviour)
+    _roundtrip_pandas_dataframe(df,
+                                write_kwargs={})
+
+    # Write to a parquet file with compliant nested type
+    table = pa.Table.from_pandas(df, preserve_index=False)
+    path = str(tempdir / 'data.parquet')
+    with pq.ParquetWriter(path, table.schema,
+                          version='2.6') as writer:
+        writer.write_table(table)
+    # Read back as a table
+    new_table = _read_table(path)
+    # Validate that "items" columns compliant to Parquet nested format
+    # Should be like this: list<element: struct<name: string, value: string>>
+    assert isinstance(new_table.schema.types[0], pa.ListType)
+    assert new_table.schema.types[0].value_field.name == 'element'
+
+    # Verify that the new table can be read/written correctly
+    _check_roundtrip(new_table)
+
+
+@pytest.mark.pandas
+@parametrize_test_data
+def test_write_compliant_nested_type_disable(tempdir, test_data):
+    # prepare dataframe for testing
+    df = pd.DataFrame(data=test_data)
+    # verify that we can read/write with new flag disabled
+    _roundtrip_pandas_dataframe(df, write_kwargs={
+        'use_compliant_nested_type': False})
+
+    # Write to a parquet file while disabling compliant nested type
+    table = pa.Table.from_pandas(df, preserve_index=False)
+    path = str(tempdir / 'data.parquet')
+    with pq.ParquetWriter(path, table.schema, version='2.6',
+                          use_compliant_nested_type=False) as writer:
+        writer.write_table(table)
+    new_table = _read_table(path)
+
+    # Validate that "items" columns is not compliant to Parquet nested format
+    # Should be like this: list<item: struct<name: string, value: string>>
+    assert isinstance(new_table.schema.types[0], pa.ListType)
+    assert new_table.schema.types[0].value_field.name == 'item'
+
+    # Verify that the new table can be read/written correctly
+    _check_roundtrip(new_table,
+                     use_compliant_nested_type=False)
diff --git a/pyarrow/tests/parquet/test_data_types.py b/pyarrow/tests/parquet/test_data_types.py
new file mode 100644
index 0000000000000000000000000000000000000000..c546bc1532ac3fcc975cbc3178146fa3e35c0b01
--- /dev/null
+++ b/pyarrow/tests/parquet/test_data_types.py
@@ -0,0 +1,616 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import decimal
+import io
+import random
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import pytest
+
+import pyarrow as pa
+from pyarrow.tests import util
+from pyarrow.tests.parquet.common import _check_roundtrip, _roundtrip_table
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import _read_table, _write_table
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.pandas_examples import (dataframe_with_arrays,
+                                               dataframe_with_lists)
+    from pyarrow.tests.parquet.common import alltypes_sample
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+# General roundtrip of data types
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('chunk_size', [None, 1000])
+def test_parquet_2_6_roundtrip(tempdir, chunk_size):
+    df = alltypes_sample(size=10000, categorical=True)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+    assert arrow_table.schema.pandas_metadata is not None
+
+    _write_table(arrow_table, filename, version='2.6',
+                 chunk_size=chunk_size)
+    table_read = pq.read_pandas(filename)
+    assert table_read.schema.pandas_metadata is not None
+
+    read_metadata = table_read.schema.metadata
+    assert arrow_table.schema.metadata == read_metadata
+
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_parquet_1_0_roundtrip(tempdir):
+    size = 10000
+    np.random.seed(0)
+    df = pd.DataFrame({
+        'uint8': np.arange(size, dtype=np.uint8),
+        'uint16': np.arange(size, dtype=np.uint16),
+        'uint32': np.arange(size, dtype=np.uint32),
+        'uint64': np.arange(size, dtype=np.uint64),
+        'int8': np.arange(size, dtype=np.int16),
+        'int16': np.arange(size, dtype=np.int16),
+        'int32': np.arange(size, dtype=np.int32),
+        'int64': np.arange(size, dtype=np.int64),
+        'float32': np.arange(size, dtype=np.float32),
+        'float64': np.arange(size, dtype=np.float64),
+        'bool': np.random.randn(size) > 0,
+        'str': [str(x) for x in range(size)],
+        'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
+        'empty_str': [''] * size
+    })
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+    _write_table(arrow_table, filename, version='1.0')
+    table_read = _read_table(filename)
+    df_read = table_read.to_pandas()
+
+    # We pass uint32_t as int64_t if we write Parquet version 1.0
+    df['uint32'] = df['uint32'].values.astype(np.int64)
+
+    tm.assert_frame_equal(df, df_read)
+
+
+# Dictionary
+# -----------------------------------------------------------------------------
+
+
+def _simple_table_write_read(table):
+    bio = pa.BufferOutputStream()
+    pq.write_table(table, bio)
+    contents = bio.getvalue()
+    return pq.read_table(
+        pa.BufferReader(contents)
+    )
+
+
+@pytest.mark.pandas
+def test_direct_read_dictionary():
+    # ARROW-3325
+    repeats = 10
+    nunique = 5
+
+    data = [
+        [util.rands(10) for i in range(nunique)] * repeats,
+
+    ]
+    table = pa.table(data, names=['f0'])
+
+    bio = pa.BufferOutputStream()
+    pq.write_table(table, bio)
+    contents = bio.getvalue()
+
+    result = pq.read_table(pa.BufferReader(contents),
+                           read_dictionary=['f0'])
+
+    # Compute dictionary-encoded subfield
+    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
+    assert result.equals(expected)
+
+
+@pytest.mark.pandas
+def test_direct_read_dictionary_subfield():
+    repeats = 10
+    nunique = 5
+
+    data = [
+        [[util.rands(10)] for i in range(nunique)] * repeats,
+    ]
+    table = pa.table(data, names=['f0'])
+
+    bio = pa.BufferOutputStream()
+    pq.write_table(table, bio)
+    contents = bio.getvalue()
+    result = pq.read_table(pa.BufferReader(contents),
+                           read_dictionary=['f0.list.element'])
+
+    arr = pa.array(data[0])
+    values_as_dict = arr.values.dictionary_encode()
+
+    inner_indices = values_as_dict.indices.cast('int32')
+    new_values = pa.DictionaryArray.from_arrays(inner_indices,
+                                                values_as_dict.dictionary)
+
+    offsets = pa.array(range(51), type='int32')
+    expected_arr = pa.ListArray.from_arrays(offsets, new_values)
+    expected = pa.table([expected_arr], names=['f0'])
+
+    assert result.equals(expected)
+    assert result[0].num_chunks == 1
+
+
+@pytest.mark.numpy
+def test_dictionary_array_automatically_read():
+    # ARROW-3246
+
+    # Make a large dictionary, a little over 4MB of data
+    dict_length = 4000
+    dict_values = pa.array([('x' * 1000 + f'_{i}')
+                            for i in range(dict_length)])
+
+    num_chunks = 10
+    chunk_size = 100
+    chunks = []
+    for i in range(num_chunks):
+        indices = np.random.randint(0, dict_length,
+                                    size=chunk_size).astype(np.int32)
+        chunks.append(pa.DictionaryArray.from_arrays(pa.array(indices),
+                                                     dict_values))
+
+    table = pa.table([pa.chunked_array(chunks)], names=['f0'])
+    result = _simple_table_write_read(table)
+
+    assert result.equals(table)
+
+    # The only key in the metadata was the Arrow schema key
+    assert result.schema.metadata is None
+
+
+# Decimal
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.pandas
+def test_decimal_roundtrip(tempdir):
+    num_values = 10
+
+    columns = {}
+    for precision in range(1, 39):
+        for scale in range(0, precision + 1):
+            with util.random_seed(0):
+                random_decimal_values = [
+                    util.randdecimal(precision, scale)
+                    for _ in range(num_values)
+                ]
+            column_name = f'dec_precision_{precision}_scale_{scale}'
+            columns[column_name] = random_decimal_values
+
+    expected = pd.DataFrame(columns)
+    filename = tempdir / 'decimals.parquet'
+    string_filename = str(filename)
+    table = pa.Table.from_pandas(expected)
+    _write_table(table, string_filename)
+    result_table = _read_table(string_filename)
+    result = result_table.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.pandas
+@pytest.mark.xfail(
+    raises=OSError, reason='Parquet does not support negative scale'
+)
+def test_decimal_roundtrip_negative_scale(tempdir):
+    expected = pd.DataFrame({'decimal_num': [decimal.Decimal('1.23E4')]})
+    filename = tempdir / 'decimals.parquet'
+    string_filename = str(filename)
+    t = pa.Table.from_pandas(expected)
+    _write_table(t, string_filename)
+    result_table = _read_table(string_filename)
+    result = result_table.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+# List types
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize('dtype', [int, float])
+def test_single_pylist_column_roundtrip(tempdir, dtype,):
+    filename = tempdir / f'single_{dtype.__name__}_column.parquet'
+    data = [pa.array(list(map(dtype, range(5))))]
+    table = pa.Table.from_arrays(data, names=['a'])
+    _write_table(table, filename)
+    table_read = _read_table(filename)
+    for i in range(table.num_columns):
+        col_written = table[i]
+        col_read = table_read[i]
+        assert table.field(i).name == table_read.field(i).name
+        assert col_read.num_chunks == 1
+        data_written = col_written.chunk(0)
+        data_read = col_read.chunk(0)
+        assert data_written.equals(data_read)
+
+
+def test_empty_lists_table_roundtrip():
+    # ARROW-2744: Shouldn't crash when writing an array of empty lists
+    arr = pa.array([[], []], type=pa.list_(pa.int32()))
+    table = pa.Table.from_arrays([arr], ["A"])
+    _check_roundtrip(table)
+
+
+def test_nested_list_nonnullable_roundtrip_bug():
+    # Reproduce failure in ARROW-5630
+    typ = pa.list_(pa.field("item", pa.float32(), False))
+    num_rows = 10000
+    t = pa.table([
+        pa.array(([[0] * ((i + 5) % 10) for i in range(0, 10)] *
+                  (num_rows // 10)), type=typ)
+    ], ['a'])
+    _check_roundtrip(
+        t, data_page_size=4096)
+
+
+def test_nested_list_struct_multiple_batches_roundtrip(tempdir):
+    # Reproduce failure in ARROW-11024
+    data = [[{'x': 'abc', 'y': 'abc'}]]*100 + [[{'x': 'abc', 'y': 'gcb'}]]*100
+    table = pa.table([pa.array(data)], names=['column'])
+    _check_roundtrip(
+        table, row_group_size=20)
+
+    # Reproduce failure in ARROW-11069 (plain non-nested structs with strings)
+    data = pa.array(
+        [{'a': '1', 'b': '2'}, {'a': '3', 'b': '4'}, {'a': '5', 'b': '6'}]*10
+    )
+    table = pa.table({'column': data})
+    _check_roundtrip(table, row_group_size=10)
+
+
+def test_writing_empty_lists():
+    # ARROW-2591: [Python] Segmentation fault issue in pq.write_table
+    arr1 = pa.array([[], []], pa.list_(pa.int32()))
+    table = pa.Table.from_arrays([arr1], ['list(int32)'])
+    _check_roundtrip(table)
+
+
+@pytest.mark.pandas
+def test_column_of_arrays(tempdir):
+    df, schema = dataframe_with_arrays()
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df, schema=schema)
+    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms')
+    table_read = _read_table(filename)
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_column_of_lists(tempdir):
+    df, schema = dataframe_with_lists(parquet_compatible=True)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df, schema=schema)
+    _write_table(arrow_table, filename, version='2.6')
+    table_read = _read_table(filename)
+    df_read = table_read.to_pandas()
+
+    tm.assert_frame_equal(df, df_read)
+
+
+def test_large_list_records():
+    # This was fixed in PARQUET-1100
+
+    list_lengths = [random.randint(0, 500) for _ in range(50)]
+    list_lengths[::10] = [0, 0, 0, 0, 0]
+
+    list_values = [list(map(int, [random.randint(0, 100) for _ in range(x)]))
+                   if i % 8 else None
+                   for i, x in enumerate(list_lengths)]
+
+    a1 = pa.array(list_values)
+
+    table = pa.Table.from_arrays([a1], ['int_lists'])
+    _check_roundtrip(table)
+
+
+list_types = [
+    (pa.ListType, pa.list_),
+    (pa.LargeListType, pa.large_list),
+]
+
+
+def test_list_types():
+    data = [[1, 2, None]] * 50
+    for _, in_factory in list_types:
+        array = pa.array(data, type=in_factory(pa.int32()))
+        table = pa.Table.from_arrays([array], ['lists'])
+        for out_type, out_factory in list_types:
+            for store_schema in (True, False):
+                if store_schema:
+                    expected_table = table
+                else:
+                    expected_table = pa.Table.from_arrays(
+                        [pa.array(data, type=out_factory(pa.int32()))], ['lists'])
+                result = _roundtrip_table(
+                    table, write_table_kwargs=dict(store_schema=store_schema),
+                    read_table_kwargs=dict(list_type=out_type))
+                assert result == expected_table
+
+
+@pytest.mark.pandas
+def test_parquet_nested_convenience(tempdir):
+    # ARROW-1684
+    df = pd.DataFrame({
+        'a': [[1, 2, 3], None, [4, 5], []],
+        'b': [[1.], None, None, [6., 7.]],
+    })
+
+    path = str(tempdir / 'nested_convenience.parquet')
+
+    table = pa.Table.from_pandas(df, preserve_index=False)
+    _write_table(table, path)
+
+    read = pq.read_table(
+        path, columns=['a'])
+    tm.assert_frame_equal(read.to_pandas(), df[['a']])
+
+    read = pq.read_table(
+        path, columns=['a', 'b'])
+    tm.assert_frame_equal(read.to_pandas(), df)
+
+
+# Binary
+# -----------------------------------------------------------------------------
+
+
+def test_fixed_size_binary():
+    t0 = pa.binary(10)
+    data = [b'fooooooooo', None, b'barooooooo', b'quxooooooo']
+    a0 = pa.array(data, type=t0)
+
+    table = pa.Table.from_arrays([a0],
+                                 ['binary[10]'])
+    _check_roundtrip(table)
+
+
+def test_binary_types():
+    types = [pa.binary(), pa.large_binary(), pa.binary_view()]
+    data = [b'abc', None, b'defg', b'x' * 30]
+    for in_type in types:
+        array = pa.array(data, in_type)
+        table = pa.Table.from_arrays([array], ['binary'])
+        for out_type in types:
+            for store_schema in (False, True):
+                result = _roundtrip_table(
+                    table, write_table_kwargs=dict(store_schema=store_schema),
+                    read_table_kwargs=dict(binary_type=out_type))
+                if store_schema:
+                    expected_table = table
+                else:
+                    expected_table = pa.Table.from_arrays(
+                        [pa.array(data, out_type)], ['binary'])
+                assert result == expected_table
+
+
+# Large types
+# -----------------------------------------------------------------------------
+
+
+@pytest.mark.slow
+@pytest.mark.large_memory
+def test_large_table_int32_overflow():
+    size = np.iinfo('int32').max + 1
+
+    arr = np.ones(size, dtype='uint8')
+
+    parr = pa.array(arr, type=pa.uint8())
+
+    table = pa.Table.from_arrays([parr], names=['one'])
+    f = io.BytesIO()
+    _write_table(table, f)
+
+
+def _simple_table_roundtrip(table, **write_kwargs):
+    stream = pa.BufferOutputStream()
+    _write_table(table, stream, **write_kwargs)
+    buf = stream.getvalue()
+    return _read_table(buf)
+
+
+@pytest.mark.slow
+@pytest.mark.large_memory
+def test_byte_array_exactly_2gb():
+    # Test edge case reported in ARROW-3762
+    val = b'x' * (1 << 10)
+
+    base = pa.array([val] * ((1 << 21) - 1))
+    cases = [
+        [b'x' * 1023],  # 2^31 - 1
+        [b'x' * 1024],  # 2^31
+        [b'x' * 1025]   # 2^31 + 1
+    ]
+    for case in cases:
+        values = pa.chunked_array([base, pa.array(case)])
+        t = pa.table([values], names=['f0'])
+        result = _simple_table_roundtrip(
+            t, use_dictionary=False)
+        assert t.equals(result)
+
+
+@pytest.mark.slow
+@pytest.mark.pandas
+@pytest.mark.large_memory
+def test_binary_array_overflow_to_chunked():
+    # ARROW-3762
+
+    # 2^31 + 1 bytes
+    values = [b'x'] + [
+        b'x' * (1 << 20)
+    ] * 2 * (1 << 10)
+    df = pd.DataFrame({'byte_col': values})
+
+    tbl = pa.Table.from_pandas(df, preserve_index=False)
+    read_tbl = _simple_table_roundtrip(tbl)
+
+    col0_data = read_tbl[0]
+    assert isinstance(col0_data, pa.ChunkedArray)
+
+    # Split up into 2GB chunks
+    assert col0_data.num_chunks == 2
+
+    assert tbl.equals(read_tbl)
+
+
+@pytest.mark.slow
+@pytest.mark.pandas
+@pytest.mark.large_memory
+def test_list_of_binary_large_cell():
+    # ARROW-4688
+    data = []
+
+    # TODO(wesm): handle chunked children
+    # 2^31 - 1 bytes in a single cell
+    # data.append([b'x' * (1 << 20)] * 2047 + [b'x' * ((1 << 20) - 1)])
+
+    # A little under 2GB in cell each containing approximately 10MB each
+    data.extend([[b'x' * 1000000] * 10] * 214)
+
+    arr = pa.array(data)
+    table = pa.Table.from_arrays([arr], ['chunky_cells'])
+    read_table = _simple_table_roundtrip(table)
+    assert table.equals(read_table)
+
+
+def test_large_binary_and_binary_view():
+    data = [b'foo', b'bar'] * 50
+    for type in [pa.large_binary(), pa.binary_view()]:
+        arr = pa.array(data, type=type)
+        table = pa.Table.from_arrays([arr], names=['strs'])
+        for use_dictionary in [False, True]:
+            _check_roundtrip(table, use_dictionary=use_dictionary)
+
+
+@pytest.mark.slow
+@pytest.mark.large_memory
+def test_large_binary_and_binary_view_huge():
+    s = b'xy' * 997
+    data = [s] * ((1 << 33) // len(s))
+    for type in [pa.large_binary(), pa.binary_view()]:
+        arr = pa.array(data, type=type)
+        table = pa.Table.from_arrays([arr], names=['strs'])
+        for use_dictionary in [False, True]:
+            _check_roundtrip(table, use_dictionary=use_dictionary)
+        del arr, table
+
+
+@pytest.mark.large_memory
+def test_large_binary_overflow():
+    s = b'x' * (1 << 31)
+    arr = pa.array([s], type=pa.large_binary())
+    table = pa.Table.from_arrays([arr], names=['strs'])
+    for use_dictionary in [False, True]:
+        writer = pa.BufferOutputStream()
+        with pytest.raises(
+                pa.ArrowInvalid,
+                match="Parquet cannot store strings with size 2GB or more"):
+            _write_table(table, writer, use_dictionary=use_dictionary)
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string()))
+def test_json_extension_type(storage_type):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    arr = pa.array(data, type=pa.json_(storage_type))
+
+    table = pa.table([arr], names=["ext"])
+
+    # With defaults, this should roundtrip (because store_schema=True)
+    _check_roundtrip(table, table)
+
+    # When store_schema is False, we get a string back by default
+    _check_roundtrip(
+        table,
+        pa.table({"ext": pa.array(data, pa.string())}),
+        {"arrow_extensions_enabled": False},
+        store_schema=False)
+
+    # With arrow_extensions_enabled=True on read, we get a arrow.json back
+    # (but with string() storage)
+    _check_roundtrip(
+        table,
+        pa.table({"ext": pa.array(data, pa.json_(pa.string()))}),
+        {"arrow_extensions_enabled": True},
+        store_schema=False)
+
+
+def test_uuid_extension_type():
+    data = [
+        b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
+        b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
+        None
+    ]
+    arr = pa.array(data, type=pa.uuid())
+
+    table = pa.table([arr], names=["ext"])
+
+    _check_roundtrip(table, table)
+    _check_roundtrip(
+        table,
+        pa.table({"ext": pa.array(data, pa.binary(16))}),
+        {"arrow_extensions_enabled": False},
+        store_schema=False)
+    _check_roundtrip(
+        table,
+        table,
+        {"arrow_extensions_enabled": True},
+        store_schema=False)
+
+
+def test_undefined_logical_type(parquet_test_datadir):
+    test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet"
+
+    table = _read_table(test_file)
+    assert table.column_names == ["column with known type", "column with unknown type"]
+    assert table["column with unknown type"].to_pylist() == [
+        b"unknown string 1",
+        b"unknown string 2",
+        b"unknown string 3"
+    ]
diff --git a/pyarrow/tests/parquet/test_dataset.py b/pyarrow/tests/parquet/test_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..d3e9cda730187d53ce6ee8f0ceab280480efadb9
--- /dev/null
+++ b/pyarrow/tests/parquet/test_dataset.py
@@ -0,0 +1,1335 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import inspect
+import os
+import pathlib
+import sys
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import pytest
+import unittest.mock as mock
+
+import pyarrow as pa
+import pyarrow.compute as pc
+from pyarrow.fs import (FileSelector, FileSystem, LocalFileSystem,
+                        PyFileSystem, SubTreeFileSystem, FSSpecHandler)
+from pyarrow.tests import util
+from pyarrow.util import guid
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import (
+        _read_table, _test_dataframe, _test_table, _write_table)
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = [pytest.mark.parquet, pytest.mark.dataset]
+
+
+def test_filesystem_uri(tempdir):
+    table = pa.table({"a": [1, 2, 3]})
+
+    directory = tempdir / "data_dir"
+    directory.mkdir()
+    path = directory / "data.parquet"
+    pq.write_table(table, str(path))
+
+    # filesystem object
+    result = pq.read_table(
+        path, filesystem=LocalFileSystem())
+    assert result.equals(table)
+
+    # filesystem URI
+    result = pq.read_table(
+        "data_dir/data.parquet", filesystem=util._filesystem_uri(tempdir))
+    assert result.equals(table)
+
+
+@pytest.mark.pandas
+def test_read_partitioned_directory(tempdir):
+    local = LocalFileSystem()
+    _partition_test_for_filesystem(local, tempdir)
+
+
+@pytest.mark.pandas
+def test_read_partitioned_columns_selection(tempdir):
+    # ARROW-3861 - do not include partition columns in resulting table when
+    # `columns` keyword was passed without those columns
+    local = LocalFileSystem()
+    base_path = tempdir
+    _partition_test_for_filesystem(local, base_path)
+
+    dataset = pq.ParquetDataset(base_path)
+    result = dataset.read(columns=["values"])
+    assert result.column_names == ["values"]
+
+
+@pytest.mark.pandas
+def test_filters_equivalency(tempdir):
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1]
+    string_keys = ['a', 'b', 'c']
+    boolean_keys = [True, False]
+    partition_spec = [
+        ['integer', integer_keys],
+        ['string', string_keys],
+        ['boolean', boolean_keys]
+    ]
+
+    df = pd.DataFrame({
+        'integer': np.array(integer_keys, dtype='i4').repeat(15),
+        'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2),
+        'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3),
+        'values': np.arange(30),
+    })
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    # Old filters syntax:
+    #  integer == 1 AND string != b AND boolean == True
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local,
+        filters=[('integer', '=', 1), ('string', '!=', 'b'),
+                 ('boolean', '==', 'True')],
+    )
+    table = dataset.read()
+    result_df = (table.to_pandas().reset_index(drop=True))
+
+    assert 0 not in result_df['integer'].values
+    assert 'b' not in result_df['string'].values
+    assert False not in result_df['boolean'].values
+
+    # filters in disjunctive normal form:
+    #  (integer == 1 AND string != b AND boolean == True) OR
+    #  (integer == 2 AND boolean == False)
+    # TODO(ARROW-3388): boolean columns are reconstructed as string
+    filters = [
+        [
+            ('integer', '=', 1),
+            ('string', '!=', 'b'),
+            ('boolean', '==', 'True')
+        ],
+        [('integer', '=', 0), ('boolean', '==', 'False')]
+    ]
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local, filters=filters)
+    table = dataset.read()
+    result_df = table.to_pandas().reset_index(drop=True)
+
+    # Check that all rows in the DF fulfill the filter
+    df_filter_1 = (result_df['integer'] == 1) \
+        & (result_df['string'] != 'b') \
+        & (result_df['boolean'] == 'True')
+    df_filter_2 = (np.array(result_df['integer']) == 0) \
+        & (result_df['boolean'] == 'False')
+    assert df_filter_1.sum() > 0
+    assert df_filter_2.sum() > 0
+    assert result_df.shape[0] == (df_filter_1.sum() + df_filter_2.sum())
+
+    for filters in [[[('string', '==', b'1\0a')]],
+                    [[('string', '==', '1\0a')]]]:
+        dataset = pq.ParquetDataset(
+            base_path, filesystem=local, filters=filters)
+        assert dataset.read().num_rows == 0
+
+
+@pytest.mark.pandas
+def test_filters_cutoff_exclusive_integer(tempdir):
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [
+        ['integers', integer_keys],
+    ]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+    }, columns=['index', 'integers'])
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local,
+        filters=[
+            ('integers', '<', 4),
+            ('integers', '>', 1),
+        ],
+    )
+    table = dataset.read()
+    result_df = (table.to_pandas()
+                      .sort_values(by='index')
+                      .reset_index(drop=True))
+
+    result_list = [x for x in map(int, result_df['integers'].values)]
+    assert result_list == [2, 3]
+
+
+@pytest.mark.xfail(
+    # different error with use_legacy_datasets because result_df is no longer
+    # categorical
+    raises=(TypeError, AssertionError),
+    reason='Loss of type information in creation of categoricals.'
+)
+@pytest.mark.pandas
+def test_filters_cutoff_exclusive_datetime(tempdir):
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    date_keys = [
+        datetime.date(2018, 4, 9),
+        datetime.date(2018, 4, 10),
+        datetime.date(2018, 4, 11),
+        datetime.date(2018, 4, 12),
+        datetime.date(2018, 4, 13)
+    ]
+    partition_spec = [
+        ['dates', date_keys]
+    ]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'dates': np.array(date_keys, dtype='datetime64'),
+    }, columns=['index', 'dates'])
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local,
+        filters=[
+            ('dates', '<', "2018-04-12"),
+            ('dates', '>', "2018-04-10")
+        ],
+    )
+    table = dataset.read()
+    result_df = (table.to_pandas()
+                      .sort_values(by='index')
+                      .reset_index(drop=True))
+
+    expected = pd.Categorical(
+        np.array([datetime.date(2018, 4, 11)], dtype='datetime64'),
+        categories=np.array(date_keys, dtype='datetime64'))
+
+    assert result_df['dates'].values == expected
+
+
+@pytest.mark.pandas
+def test_filters_inclusive_datetime(tempdir):
+    # ARROW-11480
+    path = tempdir / 'timestamps.parquet'
+
+    pd.DataFrame({
+        "dates": pd.date_range("2020-01-01", periods=10, freq="D"),
+        "id": range(10)
+    }).to_parquet(path, use_deprecated_int96_timestamps=True)
+
+    table = pq.read_table(path, filters=[
+        ("dates", "<=", datetime.datetime(2020, 1, 5))
+    ])
+
+    assert table.column('id').to_pylist() == [0, 1, 2, 3, 4]
+
+
+@pytest.mark.pandas
+def test_filters_inclusive_integer(tempdir):
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [
+        ['integers', integer_keys],
+    ]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+    }, columns=['index', 'integers'])
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local,
+        filters=[
+            ('integers', '<=', 3),
+            ('integers', '>=', 2),
+        ],
+    )
+    table = dataset.read()
+    result_df = (table.to_pandas()
+                 .sort_values(by='index')
+                 .reset_index(drop=True))
+
+    result_list = [int(x) for x in map(int, result_df['integers'].values)]
+    assert result_list == [2, 3]
+
+
+@pytest.mark.pandas
+def test_filters_inclusive_set(tempdir):
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1]
+    string_keys = ['a', 'b', 'c']
+    boolean_keys = [True, False]
+    partition_spec = [
+        ['integer', integer_keys],
+        ['string', string_keys],
+        ['boolean', boolean_keys]
+    ]
+
+    df = pd.DataFrame({
+        'integer': np.array(integer_keys, dtype='i4').repeat(15),
+        'string': np.tile(np.tile(np.array(string_keys, dtype=object), 5), 2),
+        'boolean': np.tile(np.tile(np.array(boolean_keys, dtype='bool'), 5), 3),
+        'values': np.arange(30),
+    })
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local,
+        filters=[('string', 'in', 'ab')],
+    )
+    table = dataset.read()
+    result_df = (table.to_pandas().reset_index(drop=True))
+
+    assert 'a' in result_df['string'].values
+    assert 'b' in result_df['string'].values
+    assert 'c' not in result_df['string'].values
+
+    dataset = pq.ParquetDataset(
+        base_path, filesystem=local,
+        filters=[('integer', 'in', [1]), ('string', 'in', ('a', 'b')),
+                 ('boolean', 'not in', {'False'})],
+    )
+    table = dataset.read()
+    result_df = (table.to_pandas().reset_index(drop=True))
+
+    assert 0 not in result_df['integer'].values
+    assert 'c' not in result_df['string'].values
+    assert False not in result_df['boolean'].values
+
+
+@pytest.mark.pandas
+def test_filters_invalid_pred_op(tempdir):
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [
+        ['integers', integer_keys],
+    ]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+    }, columns=['index', 'integers'])
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    with pytest.raises(TypeError):
+        pq.ParquetDataset(base_path,
+                          filesystem=local,
+                          filters=[('integers', 'in', 3), ])
+
+    with pytest.raises(ValueError):
+        pq.ParquetDataset(base_path,
+                          filesystem=local,
+                          filters=[('integers', '=<', 3), ])
+
+    # Dataset API returns empty table
+    dataset = pq.ParquetDataset(base_path,
+                                filesystem=local,
+                                filters=[('integers', 'in', set()), ])
+    assert dataset.read().num_rows == 0
+
+    dataset = pq.ParquetDataset(base_path,
+                                filesystem=local,
+                                filters=[('integers', '!=', {3})])
+    with pytest.raises(NotImplementedError):
+        assert dataset.read().num_rows == 0
+
+
+@pytest.mark.pandas
+def test_filters_invalid_column(tempdir):
+    # ARROW-5572 - raise error on invalid name in filter specification
+    # works with new dataset
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [['integers', integer_keys]]
+    N = 5
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+    }, columns=['index', 'integers'])
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    msg = r"No match for FieldRef.Name\(non_existent_column\)"
+    with pytest.raises(ValueError, match=msg):
+        pq.ParquetDataset(base_path, filesystem=local,
+                          filters=[('non_existent_column', '<', 3), ]).read()
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize("filters",
+                         ([('integers', '<', 3)],
+                          [[('integers', '<', 3)]],
+                          pc.field('integers') < 3,
+                          pc.field('nested', 'a') < 3,
+                          pc.field('nested', 'b').cast(pa.int64()) < 3))
+@pytest.mark.parametrize("read_method", ("read_table", "read_pandas"))
+def test_filters_read_table(tempdir, filters, read_method):
+    read = getattr(pq, read_method)
+    # test that filters keyword is passed through in read_table
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    integer_keys = [0, 1, 2, 3, 4]
+    partition_spec = [
+        ['integers', integer_keys],
+    ]
+    N = len(integer_keys)
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'integers': np.array(integer_keys, dtype='i4'),
+        'nested': np.array([{'a': i, 'b': str(i)} for i in range(N)])
+    })
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    kwargs = dict(filesystem=local, filters=filters)
+
+    table = read(base_path, **kwargs)
+    assert table.num_rows == 3
+
+
+@pytest.mark.pandas
+def test_partition_keys_with_underscores(tempdir):
+    # ARROW-5666 - partition field values with underscores preserve underscores
+    local = LocalFileSystem()
+    base_path = tempdir
+
+    string_keys = ["2019_2", "2019_3"]
+    partition_spec = [
+        ['year_week', string_keys],
+    ]
+    N = 2
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'year_week': np.array(string_keys, dtype='object'),
+    }, columns=['index', 'year_week'])
+
+    _generate_partition_directories(local, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(base_path)
+    result = dataset.read()
+    assert result.column("year_week").to_pylist() == string_keys
+
+
+@pytest.mark.s3
+def test_read_s3fs(s3_example_s3fs, ):
+    fs, path = s3_example_s3fs
+    path = path + "/test.parquet"
+    table = pa.table({"a": [1, 2, 3]})
+    _write_table(table, path, filesystem=fs)
+
+    result = _read_table(path, filesystem=fs)
+    assert result.equals(table)
+
+
+@pytest.mark.s3
+def test_read_directory_s3fs(s3_example_s3fs):
+    fs, directory = s3_example_s3fs
+    path = directory + "/test.parquet"
+    table = pa.table({"a": [1, 2, 3]})
+    _write_table(table, path, filesystem=fs)
+
+    result = _read_table(directory, filesystem=fs)
+    assert result.equals(table)
+
+
+@pytest.mark.pandas
+def test_read_single_file_list(tempdir):
+    data_path = str(tempdir / 'data.parquet')
+
+    table = pa.table({"a": [1, 2, 3]})
+    _write_table(table, data_path)
+
+    result = pq.ParquetDataset([data_path]).read()
+    assert result.equals(table)
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+def test_read_partitioned_directory_s3fs(s3_example_s3fs):
+    fs, path = s3_example_s3fs
+    _partition_test_for_filesystem(fs, path)
+
+
+def _partition_test_for_filesystem(fs, base_path):
+    foo_keys = [0, 1]
+    bar_keys = ['a', 'b', 'c']
+    partition_spec = [
+        ['foo', foo_keys],
+        ['bar', bar_keys]
+    ]
+    N = 30
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'foo': np.array(foo_keys, dtype='i4').repeat(15),
+        'bar': np.tile(np.tile(np.array(bar_keys, dtype=object), 5), 2),
+        'values': np.random.randn(N)
+    }, columns=['index', 'foo', 'bar', 'values'])
+
+    _generate_partition_directories(fs, base_path, partition_spec, df)
+
+    dataset = pq.ParquetDataset(base_path, filesystem=fs)
+    table = dataset.read()
+    result_df = (table.to_pandas()
+                 .sort_values(by='index')
+                 .reset_index(drop=True))
+
+    expected_df = (df.sort_values(by='index')
+                   .reset_index(drop=True)
+                   .reindex(columns=result_df.columns))
+
+    # With pandas 2.0.0 Index can store all numeric dtypes (not just
+    # int64/uint64/float64). Using astype() to create a categorical
+    # column preserves original dtype (int32)
+    expected_df['foo'] = expected_df['foo'].astype("category")
+    expected_df['bar'] = expected_df['bar'].astype("category")
+
+    assert (result_df.columns == ['index', 'values', 'foo', 'bar']).all()
+
+    tm.assert_frame_equal(result_df, expected_df)
+
+
+def _generate_partition_directories(fs, base_dir, partition_spec, df):
+    # partition_spec : list of lists, e.g. [['foo', [0, 1, 2],
+    #                                       ['bar', ['a', 'b', 'c']]
+    # part_table : a pyarrow.Table to write to each partition
+    if not isinstance(fs, FileSystem):
+        fs = PyFileSystem(FSSpecHandler(fs))
+
+    DEPTH = len(partition_spec)
+
+    pathsep = getattr(fs, "pathsep", getattr(fs, "sep", "/"))
+
+    def _visit_level(base_dir, level, part_keys):
+        name, values = partition_spec[level]
+        for value in values:
+            this_part_keys = part_keys + [(name, value)]
+
+            level_dir = pathsep.join([
+                str(base_dir),
+                f'{name}={value}'
+            ])
+            fs.create_dir(level_dir)
+
+            if level == DEPTH - 1:
+                # Generate example data
+                from pyarrow.fs import FileType
+
+                file_path = pathsep.join([level_dir, guid()])
+                filtered_df = _filter_partition(df, this_part_keys)
+                part_table = pa.Table.from_pandas(filtered_df)
+                with fs.open_output_stream(file_path) as f:
+                    _write_table(part_table, f)
+                assert fs.get_file_info(file_path).type != FileType.NotFound
+                assert fs.get_file_info(file_path).type == FileType.File
+
+                file_success = pathsep.join([level_dir, '_SUCCESS'])
+                with fs.open_output_stream(file_success) as f:
+                    pass
+            else:
+                _visit_level(level_dir, level + 1, this_part_keys)
+                file_success = pathsep.join([level_dir, '_SUCCESS'])
+                with fs.open_output_stream(file_success) as f:
+                    pass
+
+    _visit_level(base_dir, 0, [])
+
+
+def _filter_partition(df, part_keys):
+    predicate = np.ones(len(df), dtype=bool)
+
+    to_drop = []
+    for name, value in part_keys:
+        to_drop.append(name)
+
+        # to avoid pandas warning
+        if isinstance(value, (datetime.date, datetime.datetime)):
+            value = pd.Timestamp(value)
+
+        predicate &= df[name] == value
+
+    return df[predicate].drop(to_drop, axis=1)
+
+
+@pytest.mark.pandas
+def test_filter_before_validate_schema(tempdir):
+    # ARROW-4076 apply filter before schema validation
+    # to avoid checking unneeded schemas
+
+    # create partitioned dataset with mismatching schemas which would
+    # otherwise raise if first validation all schemas
+    dir1 = tempdir / 'A=0'
+    dir1.mkdir()
+    table1 = pa.Table.from_pandas(pd.DataFrame({'B': [1, 2, 3]}))
+    pq.write_table(table1, dir1 / 'data.parquet')
+
+    dir2 = tempdir / 'A=1'
+    dir2.mkdir()
+    table2 = pa.Table.from_pandas(pd.DataFrame({'B': ['a', 'b', 'c']}))
+    pq.write_table(table2, dir2 / 'data.parquet')
+
+    # read single file using filter
+    table = pq.read_table(tempdir, filters=[[('A', '==', 0)]])
+    assert table.column('B').equals(pa.chunked_array([[1, 2, 3]]))
+
+
+@pytest.mark.pandas
+def test_read_multiple_files(tempdir):
+    nfiles = 10
+    size = 5
+
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    test_data = []
+    paths = []
+    for i in range(nfiles):
+        df = _test_dataframe(size, seed=i)
+
+        # Hack so that we don't have a dtype cast in v1 files
+        df['uint32'] = df['uint32'].astype(np.int64)
+
+        path = dirpath / f'{i}.parquet'
+
+        table = pa.Table.from_pandas(df)
+        _write_table(table, path)
+
+        test_data.append(table)
+        paths.append(path)
+
+    # Write a _SUCCESS.crc file
+    (dirpath / '_SUCCESS.crc').touch()
+
+    def read_multiple_files(paths, columns=None, use_threads=True, **kwargs):
+        dataset = pq.ParquetDataset(paths, **kwargs)
+        return dataset.read(columns=columns, use_threads=use_threads)
+
+    result = read_multiple_files(paths)
+    expected = pa.concat_tables(test_data)
+
+    assert result.equals(expected)
+
+    # Read column subset
+    to_read = [0, 2, 6, result.num_columns - 1]
+
+    col_names = [result.field(i).name for i in to_read]
+    out = pq.read_table(dirpath, columns=col_names)
+    expected = pa.Table.from_arrays([result.column(i) for i in to_read],
+                                    names=col_names,
+                                    metadata=result.schema.metadata)
+    assert out.equals(expected)
+
+    # Read with multiple threads
+    pq.read_table(dirpath, use_threads=True)
+
+    # Test failure modes with non-uniform metadata
+    bad_apple = _test_dataframe(size, seed=i).iloc[:, :4]
+    bad_apple_path = tempdir / f'{guid()}.parquet'
+
+    t = pa.Table.from_pandas(bad_apple)
+    _write_table(t, bad_apple_path)
+
+    # TODO(dataset) Dataset API skips bad files
+
+    # bad_meta = pq.read_metadata(bad_apple_path)
+
+    # with pytest.raises(ValueError):
+    #     read_multiple_files(paths + [bad_apple_path])
+
+    # with pytest.raises(ValueError):
+    #     read_multiple_files(paths, metadata=bad_meta)
+
+    # mixed_paths = [bad_apple_path, paths[0]]
+
+    # with pytest.raises(ValueError):
+    #     read_multiple_files(mixed_paths)
+
+
+@pytest.mark.pandas
+def test_dataset_read_pandas(tempdir):
+    nfiles = 5
+    size = 5
+
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    test_data = []
+    frames = []
+    paths = []
+    for i in range(nfiles):
+        df = _test_dataframe(size, seed=i)
+        df.index = np.arange(i * size, (i + 1) * size)
+        df.index.name = 'index'
+
+        path = dirpath / f'{i}.parquet'
+
+        table = pa.Table.from_pandas(df)
+        _write_table(table, path)
+        test_data.append(table)
+        frames.append(df)
+        paths.append(path)
+
+    dataset = pq.ParquetDataset(dirpath)
+    columns = ['uint8', 'strings']
+    result = dataset.read_pandas(columns=columns).to_pandas()
+    expected = pd.concat([x[columns] for x in frames])
+
+    tm.assert_frame_equal(result, expected)
+
+    # also be able to pass the columns as a set (ARROW-12314)
+    result = dataset.read_pandas(columns=set(columns)).to_pandas()
+    assert result.shape == expected.shape
+    # column order can be different because of using a set
+    tm.assert_frame_equal(result.reindex(columns=expected.columns), expected)
+
+
+@pytest.mark.numpy
+def test_dataset_memory_map(tempdir):
+    # ARROW-2627: Check that we can use ParquetDataset with memory-mapping
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    table = _test_table(10, seed=0)
+    path = dirpath / '0.parquet'
+    _write_table(table, path, version='2.6')
+
+    dataset = pq.ParquetDataset(
+        dirpath, memory_map=True)
+    assert dataset.read().equals(table)
+
+
+@pytest.mark.numpy
+def test_dataset_enable_buffered_stream(tempdir):
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    table = _test_table(10, seed=0)
+    path = dirpath / '0.parquet'
+    _write_table(table, path, version='2.6')
+
+    with pytest.raises(ValueError):
+        pq.ParquetDataset(
+            dirpath, buffer_size=-64)
+
+    for buffer_size in [128, 1024]:
+        dataset = pq.ParquetDataset(
+            dirpath, buffer_size=buffer_size)
+        assert dataset.read().equals(table)
+
+
+@pytest.mark.numpy
+def test_dataset_enable_pre_buffer(tempdir):
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    table = _test_table(10, seed=0)
+    path = dirpath / '0.parquet'
+    _write_table(table, path, version='2.6')
+
+    for pre_buffer in (True, False):
+        dataset = pq.ParquetDataset(
+            dirpath, pre_buffer=pre_buffer)
+        assert dataset.read().equals(table)
+        actual = pq.read_table(dirpath, pre_buffer=pre_buffer)
+        assert actual.equals(table)
+
+
+def _make_example_multifile_dataset(base_path, nfiles=10, file_nrows=5):
+    test_data = []
+    paths = []
+    for i in range(nfiles):
+        table = _test_table(file_nrows, seed=i)
+        path = base_path / f'{i}.parquet'
+
+        test_data.append(_write_table(table, path))
+        paths.append(path)
+    return paths
+
+
+def _assert_dataset_paths(dataset, paths):
+    paths = [str(path.as_posix()) for path in paths]
+    assert set(paths) == set(dataset.files)
+
+
+@pytest.mark.numpy
+@pytest.mark.parametrize('dir_prefix', ['_', '.'])
+def test_ignore_private_directories(tempdir, dir_prefix):
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    paths = _make_example_multifile_dataset(dirpath, nfiles=10,
+                                            file_nrows=5)
+
+    # private directory
+    (dirpath / f'{dir_prefix}staging').mkdir()
+
+    dataset = pq.ParquetDataset(dirpath)
+
+    _assert_dataset_paths(dataset, paths)
+
+
+@pytest.mark.numpy
+def test_ignore_hidden_files_dot(tempdir):
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    paths = _make_example_multifile_dataset(dirpath, nfiles=10,
+                                            file_nrows=5)
+
+    with (dirpath / '.DS_Store').open('wb') as f:
+        f.write(b'gibberish')
+
+    with (dirpath / '.private').open('wb') as f:
+        f.write(b'gibberish')
+
+    dataset = pq.ParquetDataset(dirpath)
+
+    _assert_dataset_paths(dataset, paths)
+
+
+@pytest.mark.numpy
+def test_ignore_hidden_files_underscore(tempdir):
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    paths = _make_example_multifile_dataset(dirpath, nfiles=10,
+                                            file_nrows=5)
+
+    with (dirpath / '_committed_123').open('wb') as f:
+        f.write(b'abcd')
+
+    with (dirpath / '_started_321').open('wb') as f:
+        f.write(b'abcd')
+
+    dataset = pq.ParquetDataset(dirpath)
+
+    _assert_dataset_paths(dataset, paths)
+
+
+@pytest.mark.numpy
+@pytest.mark.parametrize('dir_prefix', ['_', '.'])
+def test_ignore_no_private_directories_in_base_path(tempdir, dir_prefix):
+    # ARROW-8427 - don't ignore explicitly listed files if parent directory
+    # is a private directory
+    dirpath = tempdir / f'{dir_prefix}data' / guid()
+    dirpath.mkdir(parents=True)
+
+    paths = _make_example_multifile_dataset(dirpath, nfiles=10,
+                                            file_nrows=5)
+
+    dataset = pq.ParquetDataset(paths)
+    _assert_dataset_paths(dataset, paths)
+
+    # ARROW-9644 - don't ignore full directory with underscore in base path
+    dataset = pq.ParquetDataset(dirpath)
+    _assert_dataset_paths(dataset, paths)
+
+
+def test_ignore_custom_prefixes(tempdir):
+    # ARROW-9573 - allow override of default ignore_prefixes
+    part = ["xxx"] * 3 + ["yyy"] * 3
+    table = pa.table([
+        pa.array(range(len(part))),
+        pa.array(part).dictionary_encode(),
+    ], names=['index', '_part'])
+
+    pq.write_to_dataset(table, str(tempdir), partition_cols=['_part'])
+
+    private_duplicate = tempdir / '_private_duplicate'
+    private_duplicate.mkdir()
+    pq.write_to_dataset(table, str(private_duplicate),
+                        partition_cols=['_part'])
+
+    read = pq.read_table(
+        tempdir, ignore_prefixes=['_private'])
+
+    assert read.equals(table)
+
+
+def test_empty_directory(tempdir):
+    # ARROW-5310
+    empty_dir = tempdir / 'dataset'
+    empty_dir.mkdir()
+
+    dataset = pq.ParquetDataset(empty_dir)
+    result = dataset.read()
+    assert result.num_rows == 0
+    assert result.num_columns == 0
+
+
+def _test_write_to_dataset_with_partitions(base_path,
+                                           filesystem=None,
+                                           schema=None,
+                                           index_name=None):
+    import pandas as pd
+    import pandas.testing as tm
+
+    import pyarrow.parquet as pq
+
+    # ARROW-1400
+    output_df = pd.DataFrame({
+        'group1': list('aaabbbbccc'),
+        'group2': list('eefeffgeee'),
+        'num': list(range(10)),
+        'nan': [np.nan] * 10,
+        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype(
+            'datetime64[ns]')
+    })
+    cols = output_df.columns.tolist()
+    partition_by = ['group1', 'group2']
+    output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False,
+                                        preserve_index=False)
+    pq.write_to_dataset(output_table, base_path, partition_by,
+                        filesystem=filesystem)
+
+    metadata_path = os.path.join(str(base_path), '_common_metadata')
+
+    if filesystem is not None:
+        with filesystem.open(metadata_path, 'wb') as f:
+            pq.write_metadata(output_table.schema, f)
+    else:
+        pq.write_metadata(output_table.schema, metadata_path)
+
+    dataset = pq.ParquetDataset(base_path,
+                                filesystem=filesystem)
+    # ARROW-2209: Ensure the dataset schema also includes the partition columns
+    # NB schema property is an arrow and not parquet schema
+    dataset_cols = set(dataset.schema.names)
+
+    assert dataset_cols == set(output_table.schema.names)
+
+    input_table = dataset.read()
+    input_df = input_table.to_pandas()
+
+    # Read data back in and compare with original DataFrame
+    # Partitioned columns added to the end of the DataFrame when read
+    input_df_cols = input_df.columns.tolist()
+    assert partition_by == input_df_cols[-1 * len(partition_by):]
+
+    input_df = input_df[cols]
+    # Partitioned columns become 'categorical' dtypes
+    for col in partition_by:
+        output_df[col] = output_df[col].astype('category')
+
+    if schema:
+        expected_date_type = schema.field('date').type.to_pandas_dtype()
+        output_df["date"] = output_df["date"].astype(expected_date_type)
+
+    tm.assert_frame_equal(output_df, input_df)
+
+
+def _test_write_to_dataset_no_partitions(base_path,
+                                         filesystem=None):
+    import pandas as pd
+
+    import pyarrow.parquet as pq
+
+    # ARROW-1400
+    output_df = pd.DataFrame({
+        'group1': list('aaabbbbccc'),
+        'group2': list('eefeffgeee'),
+        'num': list(range(10)),
+        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype(
+            'datetime64[ns]')
+    })
+    cols = output_df.columns.tolist()
+    output_table = pa.Table.from_pandas(output_df)
+
+    if filesystem is None:
+        filesystem = LocalFileSystem()
+    elif not isinstance(filesystem, FileSystem):
+        filesystem = PyFileSystem(FSSpecHandler(filesystem))
+
+    # Without partitions, append files to root_path
+    n = 5
+    for i in range(n):
+        pq.write_to_dataset(output_table, base_path,
+                            filesystem=filesystem)
+
+    selector = FileSelector(str(base_path), allow_not_found=False,
+                            recursive=True)
+
+    infos = filesystem.get_file_info(selector)
+    output_files = [info for info in infos if info.path.endswith(".parquet")]
+    assert len(output_files) == n
+
+    # Deduplicated incoming DataFrame should match
+    # original outgoing Dataframe
+    input_table = pq.ParquetDataset(
+        base_path, filesystem=filesystem
+    ).read()
+    input_df = input_table.to_pandas()
+    input_df = input_df.drop_duplicates()
+    input_df = input_df[cols]
+    tm.assert_frame_equal(output_df, input_df)
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_with_partitions(tempdir):
+    _test_write_to_dataset_with_partitions(str(tempdir))
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_with_partitions_and_schema(tempdir):
+    schema = pa.schema([pa.field('group1', type=pa.string()),
+                        pa.field('group2', type=pa.string()),
+                        pa.field('num', type=pa.int64()),
+                        pa.field('nan', type=pa.int32()),
+                        pa.field('date', type=pa.timestamp(unit='us'))])
+    _test_write_to_dataset_with_partitions(
+        str(tempdir), schema=schema)
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_with_partitions_and_index_name(tempdir):
+    _test_write_to_dataset_with_partitions(
+        str(tempdir), index_name='index_name')
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_no_partitions(tempdir):
+    _test_write_to_dataset_no_partitions(str(tempdir))
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_pathlib(tempdir):
+    _test_write_to_dataset_with_partitions(tempdir / "test1")
+    _test_write_to_dataset_no_partitions(tempdir / "test2")
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+def test_write_to_dataset_pathlib_nonlocal(tempdir, s3_example_s3fs):
+    # pathlib paths are only accepted for local files
+    fs, _ = s3_example_s3fs
+
+    with pytest.raises(TypeError, match="path-like objects are only allowed"):
+        _test_write_to_dataset_with_partitions(
+            tempdir / "test1", filesystem=fs)
+
+    with pytest.raises(TypeError, match="path-like objects are only allowed"):
+        _test_write_to_dataset_no_partitions(
+            tempdir / "test2", filesystem=fs)
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+# See https://github.com/apache/arrow/pull/44225#issuecomment-2378365291
+@pytest.mark.skipif(sys.platform == "win32",
+                    reason="test fails because of unsupported characters")
+def test_write_to_dataset_with_partitions_s3fs(s3_example_s3fs):
+    fs, path = s3_example_s3fs
+
+    _test_write_to_dataset_with_partitions(
+        path, filesystem=fs)
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+def test_write_to_dataset_no_partitions_s3fs(s3_example_s3fs):
+    fs, path = s3_example_s3fs
+
+    _test_write_to_dataset_no_partitions(
+        path, filesystem=fs)
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_filesystem(tempdir):
+    df = pd.DataFrame({'A': [1, 2, 3]})
+    table = pa.Table.from_pandas(df)
+    path = str(tempdir)
+
+    pq.write_to_dataset(table, path, filesystem=LocalFileSystem())
+    result = pq.read_table(path)
+    assert result.equals(table)
+
+
+def _make_dataset_for_pickling(tempdir, N=100):
+    path = tempdir / 'data.parquet'
+    local = LocalFileSystem()
+
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'values': np.random.randn(N)
+    }, columns=['index', 'values'])
+    table = pa.Table.from_pandas(df)
+
+    num_groups = 3
+    with pq.ParquetWriter(path, table.schema) as writer:
+        for i in range(num_groups):
+            writer.write_table(table)
+
+    reader = pq.ParquetFile(path)
+    assert reader.metadata.num_row_groups == num_groups
+
+    metadata_path = tempdir / '_metadata'
+    with local.open_output_stream(str(metadata_path)) as f:
+        pq.write_metadata(table.schema, f)
+
+    dataset = pq.ParquetDataset(
+        tempdir, filesystem=local)
+
+    return dataset
+
+
+@pytest.mark.pandas
+def test_pickle_dataset(tempdir, pickle_module):
+    def is_pickleable(obj):
+        return obj == pickle_module.loads(pickle_module.dumps(obj))
+
+    dataset = _make_dataset_for_pickling(tempdir)
+    assert is_pickleable(dataset)
+
+
+@pytest.mark.pandas
+def test_partitioned_dataset(tempdir):
+    # ARROW-3208: Segmentation fault when reading a Parquet partitioned dataset
+    # to a Parquet file
+    path = tempdir / "ARROW-3208"
+    df = pd.DataFrame({
+        'one': [-1, 10, 2.5, 100, 1000, 1, 29.2],
+        'two': [-1, 10, 2, 100, 1000, 1, 11],
+        'three': [0, 0, 0, 0, 0, 0, 0]
+    })
+    table = pa.Table.from_pandas(df)
+    pq.write_to_dataset(table, root_path=str(path),
+                        partition_cols=['one', 'two'])
+    table = pq.ParquetDataset(path).read()
+    pq.write_table(table, path / "output.parquet")
+
+
+def test_dataset_read_dictionary(tempdir):
+    path = tempdir / "ARROW-3325-dataset"
+    t1 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0'])
+    t2 = pa.table([[util.rands(10) for i in range(5)] * 10], names=['f0'])
+    pq.write_to_dataset(t1, root_path=str(path))
+    pq.write_to_dataset(t2, root_path=str(path))
+
+    result = pq.ParquetDataset(
+        path, read_dictionary=['f0']).read()
+
+    # The order of the chunks is non-deterministic
+    ex_chunks = [t1[0].chunk(0).dictionary_encode(),
+                 t2[0].chunk(0).dictionary_encode()]
+
+    assert result[0].num_chunks == 2
+    c0, c1 = result[0].chunk(0), result[0].chunk(1)
+    if c0.equals(ex_chunks[0]):
+        assert c1.equals(ex_chunks[1])
+    else:
+        assert c0.equals(ex_chunks[1])
+        assert c1.equals(ex_chunks[0])
+
+
+def test_read_table_schema(tempdir):
+    # test that schema keyword is passed through in read_table
+    table = pa.table({'a': pa.array([1, 2, 3], pa.int32())})
+    pq.write_table(table, tempdir / "data1.parquet")
+    pq.write_table(table, tempdir / "data2.parquet")
+
+    schema = pa.schema([('a', 'int64')])
+
+    # reading single file (which is special cased in the code)
+    result = pq.read_table(tempdir / "data1.parquet", schema=schema)
+    expected = pa.table({'a': [1, 2, 3]}, schema=schema)
+    assert result.equals(expected)
+
+    # reading multiple fields
+    result = pq.read_table(tempdir, schema=schema)
+    expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema)
+    assert result.equals(expected)
+
+    result = pq.ParquetDataset(tempdir, schema=schema)
+    expected = pa.table({'a': [1, 2, 3, 1, 2, 3]}, schema=schema)
+    assert result.read().equals(expected)
+
+
+def test_read_table_duplicate_column_selection(tempdir):
+    # test that duplicate column selection gives duplicate columns
+    table = pa.table({'a': pa.array([1, 2, 3], pa.int32()),
+                      'b': pa.array([1, 2, 3], pa.uint8())})
+    pq.write_table(table, tempdir / "data.parquet")
+
+    result = pq.read_table(tempdir / "data.parquet", columns=['a', 'a'])
+    expected_schema = pa.schema([('a', 'int32'), ('a', 'int32')])
+
+    assert result.column_names == ['a', 'a']
+    assert result.schema == expected_schema
+
+
+def test_dataset_partitioning(tempdir):
+    import pyarrow.dataset as ds
+
+    # create small dataset with directory partitioning
+    root_path = tempdir / "test_partitioning"
+    (root_path / "2012" / "10" / "01").mkdir(parents=True)
+
+    table = pa.table({'a': [1, 2, 3]})
+    pq.write_table(
+        table, str(root_path / "2012" / "10" / "01" / "data.parquet"))
+
+    # This works with new dataset API
+
+    # read_table
+    part = ds.partitioning(field_names=["year", "month", "day"])
+    result = pq.read_table(
+        str(root_path), partitioning=part)
+    assert result.column_names == ["a", "year", "month", "day"]
+
+    result = pq.ParquetDataset(
+        str(root_path), partitioning=part).read()
+    assert result.column_names == ["a", "year", "month", "day"]
+
+
+def test_parquet_dataset_new_filesystem(tempdir):
+    # Ensure we can pass new FileSystem object to ParquetDataset
+    table = pa.table({'a': [1, 2, 3]})
+    pq.write_table(table, tempdir / 'data.parquet')
+    filesystem = SubTreeFileSystem(str(tempdir), LocalFileSystem())
+    dataset = pq.ParquetDataset('.', filesystem=filesystem)
+    result = dataset.read()
+    assert result.equals(table)
+
+
+def test_parquet_dataset_partitions_piece_path_with_fsspec(tempdir):
+    # ARROW-10462 ensure that on Windows we properly use posix-style paths
+    # as used by fsspec
+    fsspec = pytest.importorskip("fsspec")
+    filesystem = fsspec.filesystem('file')
+    table = pa.table({'a': [1, 2, 3]})
+    pq.write_table(table, tempdir / 'data.parquet')
+
+    # pass a posix-style path (using "/" also on Windows)
+    path = str(tempdir).replace("\\", "/")
+    dataset = pq.ParquetDataset(
+        path, filesystem=filesystem)
+    # ensure the piece path is also posix-style
+    expected = path + "/data.parquet"
+    assert dataset.fragments[0].path == expected
+
+
+def test_parquet_write_to_dataset_exposed_keywords(tempdir):
+    table = pa.table({'a': [1, 2, 3]})
+    path = tempdir / 'partitioning'
+
+    paths_written = []
+
+    def file_visitor(written_file):
+        paths_written.append(written_file.path)
+
+    basename_template = 'part-{i}.parquet'
+
+    pq.write_to_dataset(table, path, partitioning=["a"],
+                        file_visitor=file_visitor,
+                        basename_template=basename_template)
+
+    expected_paths = {
+        path / '1' / 'part-0.parquet',
+        path / '2' / 'part-0.parquet',
+        path / '3' / 'part-0.parquet'
+    }
+    paths_written_set = set(map(pathlib.Path, paths_written))
+    assert paths_written_set == expected_paths
+
+
+@pytest.mark.parametrize("write_dataset_kwarg", (
+    ("create_dir", True),
+    ("create_dir", False),
+))
+def test_write_to_dataset_kwargs_passed(tempdir, write_dataset_kwarg):
+    """Verify kwargs in pq.write_to_dataset are passed onto ds.write_dataset"""
+    import pyarrow.dataset as ds
+
+    table = pa.table({"a": [1, 2, 3]})
+    path = tempdir / 'out.parquet'
+
+    signature = inspect.signature(ds.write_dataset)
+    key, arg = write_dataset_kwarg
+
+    # kwarg not in pq.write_to_dataset, but will be passed to ds.write_dataset
+    assert key not in inspect.signature(pq.write_to_dataset).parameters
+    assert key in signature.parameters
+
+    with mock.patch.object(ds, "write_dataset", autospec=True)\
+            as mock_write_dataset:
+        pq.write_to_dataset(table, path, **{key: arg})
+        _name, _args, kwargs = mock_write_dataset.mock_calls[0]
+        assert kwargs[key] == arg
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_category_observed(tempdir):
+    # if we partition on a categorical variable with "unobserved" categories
+    # (values present in the dictionary, but not in the actual data)
+    # ensure those are not creating empty files/directories
+    df = pd.DataFrame({
+        "cat": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]),
+        "col": [1, 2, 3]
+    })
+    table = pa.table(df)
+    path = tempdir / "dataset"
+    pq.write_to_dataset(
+        table, tempdir / "dataset", partition_cols=["cat"]
+    )
+    subdirs = [f.name for f in path.iterdir() if f.is_dir()]
+    assert len(subdirs) == 2
+    assert "cat=c" not in subdirs
diff --git a/pyarrow/tests/parquet/test_datetime.py b/pyarrow/tests/parquet/test_datetime.py
new file mode 100644
index 0000000000000000000000000000000000000000..b89fd97cb91e690552c39e4641dfdbc1b354d955
--- /dev/null
+++ b/pyarrow/tests/parquet/test_datetime.py
@@ -0,0 +1,461 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import io
+import warnings
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import pytest
+
+import pyarrow as pa
+from pyarrow.tests.parquet.common import _check_roundtrip
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import _read_table, _write_table
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_datetime_tz():
+    # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
+    # so we need to cast the pandas dtype. Pandas v1 will always silently
+    # coerce to [ns] due to lack of non-[ns] support.
+    s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]')
+    s = s.dt.tz_localize('utc')
+    s.index = s
+
+    # Both a column and an index to hit both use cases
+    df = pd.DataFrame({'tz_aware': s,
+                       'tz_eastern': s.dt.tz_convert('US/Eastern')},
+                      index=s)
+
+    f = io.BytesIO()
+
+    arrow_table = pa.Table.from_pandas(df)
+
+    _write_table(arrow_table, f)
+    f.seek(0)
+
+    table_read = pq.read_pandas(f)
+
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_datetime_timezone_tzinfo():
+    value = datetime.datetime(2018, 1, 1, 1, 23, 45,
+                              tzinfo=datetime.timezone.utc)
+    df = pd.DataFrame({'foo': [value]})
+
+    _roundtrip_pandas_dataframe(df, write_kwargs={})
+
+
+@pytest.mark.pandas
+def test_coerce_timestamps(tempdir):
+    from collections import OrderedDict
+
+    # ARROW-622
+    arrays = OrderedDict()
+    fields = [pa.field('datetime64',
+                       pa.list_(pa.timestamp('ms')))]
+    arrays['datetime64'] = [
+        np.array(['2007-07-13T01:23:34.123456789',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+        None,
+        None,
+        np.array(['2007-07-13T02',
+                  None,
+                  '2010-08-13T05:46:57.437699912'],
+                 dtype='datetime64[ms]'),
+    ]
+
+    df = pd.DataFrame(arrays)
+    schema = pa.schema(fields)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df, schema=schema)
+
+    _write_table(arrow_table, filename, version='2.6', coerce_timestamps='us')
+    table_read = _read_table(filename)
+    df_read = table_read.to_pandas()
+
+    df_expected = df.copy()
+    for i, x in enumerate(df_expected['datetime64']):
+        if isinstance(x, np.ndarray):
+            df_expected.loc[i, 'datetime64'] = x.astype('M8[us]')
+
+    tm.assert_frame_equal(df_expected, df_read)
+
+    with pytest.raises(ValueError):
+        _write_table(arrow_table, filename, version='2.6',
+                     coerce_timestamps='unknown')
+
+
+@pytest.mark.pandas
+def test_coerce_timestamps_truncated(tempdir):
+    """
+    ARROW-2555: Test that we can truncate timestamps when coercing if
+    explicitly allowed.
+    """
+    dt_us = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
+                              second=1, microsecond=1)
+    dt_ms = datetime.datetime(year=2017, month=1, day=1, hour=1, minute=1,
+                              second=1)
+
+    fields_us = [pa.field('datetime64', pa.timestamp('us'))]
+    arrays_us = {'datetime64': [dt_us, dt_ms]}
+
+    df_us = pd.DataFrame(arrays_us)
+    schema_us = pa.schema(fields_us)
+
+    filename = tempdir / 'pandas_truncated.parquet'
+    table_us = pa.Table.from_pandas(df_us, schema=schema_us)
+
+    _write_table(table_us, filename, version='2.6', coerce_timestamps='ms',
+                 allow_truncated_timestamps=True)
+    table_ms = _read_table(filename)
+    df_ms = table_ms.to_pandas()
+
+    arrays_expected = {'datetime64': [dt_ms, dt_ms]}
+    df_expected = pd.DataFrame(arrays_expected, dtype='datetime64[ms]')
+    tm.assert_frame_equal(df_expected, df_ms)
+
+
+@pytest.mark.pandas
+def test_date_time_types(tempdir):
+    t1 = pa.date32()
+    data1 = np.array([17259, 17260, 17261], dtype='int32')
+    a1 = pa.array(data1, type=t1)
+
+    t2 = pa.date64()
+    data2 = data1.astype('int64') * 86400000
+    a2 = pa.array(data2, type=t2)
+
+    t3 = pa.timestamp('us')
+    start = pd.Timestamp('2001-01-01').value / 1000
+    data3 = np.array([start, start + 1, start + 2], dtype='int64')
+    a3 = pa.array(data3, type=t3)
+
+    t4 = pa.time32('ms')
+    data4 = np.arange(3, dtype='i4')
+    a4 = pa.array(data4, type=t4)
+
+    t5 = pa.time64('us')
+    a5 = pa.array(data4.astype('int64'), type=t5)
+
+    t6 = pa.time32('s')
+    a6 = pa.array(data4, type=t6)
+
+    ex_t6 = pa.time32('ms')
+    ex_a6 = pa.array(data4 * 1000, type=ex_t6)
+
+    t7 = pa.timestamp('ns')
+    start = pd.Timestamp('2001-01-01').value
+    data7 = np.array([start, start + 1000, start + 2000],
+                     dtype='int64')
+    a7 = pa.array(data7, type=t7)
+
+    table = pa.Table.from_arrays([a1, a2, a3, a4, a5, a6, a7],
+                                 ['date32', 'date64', 'timestamp[us]',
+                                  'time32[s]', 'time64[us]',
+                                  'time32_from64[s]',
+                                  'timestamp[ns]'])
+
+    # date64 as date32
+    # time32[s] to time32[ms]
+    expected = pa.Table.from_arrays([a1, a1, a3, a4, a5, ex_a6, a7],
+                                    ['date32', 'date64', 'timestamp[us]',
+                                     'time32[s]', 'time64[us]',
+                                     'time32_from64[s]',
+                                     'timestamp[ns]'])
+
+    _check_roundtrip(table, expected=expected, version='2.6')
+
+    t0 = pa.timestamp('ms')
+    data0 = np.arange(4, dtype='int64')
+    a0 = pa.array(data0, type=t0)
+
+    t1 = pa.timestamp('us')
+    data1 = np.arange(4, dtype='int64')
+    a1 = pa.array(data1, type=t1)
+
+    t2 = pa.timestamp('ns')
+    data2 = np.arange(4, dtype='int64')
+    a2 = pa.array(data2, type=t2)
+
+    table = pa.Table.from_arrays([a0, a1, a2],
+                                 ['ts[ms]', 'ts[us]', 'ts[ns]'])
+    expected = pa.Table.from_arrays([a0, a1, a2],
+                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])
+
+    # int64 for all timestamps supported by default
+    filename = tempdir / 'int64_timestamps.parquet'
+    _write_table(table, filename, version='2.6')
+    parquet_schema = pq.ParquetFile(filename).schema
+    for i in range(3):
+        assert parquet_schema.column(i).physical_type == 'INT64'
+    read_table = _read_table(filename)
+    assert read_table.equals(expected)
+
+    t0_ns = pa.timestamp('ns')
+    data0_ns = np.array(data0 * 1000000, dtype='int64')
+    a0_ns = pa.array(data0_ns, type=t0_ns)
+
+    t1_ns = pa.timestamp('ns')
+    data1_ns = np.array(data1 * 1000, dtype='int64')
+    a1_ns = pa.array(data1_ns, type=t1_ns)
+
+    expected = pa.Table.from_arrays([a0_ns, a1_ns, a2],
+                                    ['ts[ms]', 'ts[us]', 'ts[ns]'])
+
+    # int96 nanosecond timestamps produced upon request
+    filename = tempdir / 'explicit_int96_timestamps.parquet'
+    _write_table(table, filename, version='2.6',
+                 use_deprecated_int96_timestamps=True)
+    parquet_schema = pq.ParquetFile(filename).schema
+    for i in range(3):
+        assert parquet_schema.column(i).physical_type == 'INT96'
+    read_table = _read_table(filename)
+    assert read_table.equals(expected)
+
+    # int96 nanosecond timestamps implied by flavor 'spark'
+    filename = tempdir / 'spark_int96_timestamps.parquet'
+    _write_table(table, filename, version='2.6',
+                 flavor='spark')
+    parquet_schema = pq.ParquetFile(filename).schema
+    for i in range(3):
+        assert parquet_schema.column(i).physical_type == 'INT96'
+    read_table = _read_table(filename)
+    assert read_table.equals(expected)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns'])
+def test_coerce_int96_timestamp_unit(unit):
+    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000
+
+    d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
+    d_ms = d_s * 1000
+    d_us = d_ms * 1000
+    d_ns = d_us * 1000
+
+    a_s = pa.array(d_s, type=pa.timestamp('s'))
+    a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
+    a_us = pa.array(d_us, type=pa.timestamp('us'))
+    a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
+
+    arrays = {"s": a_s, "ms": a_ms, "us": a_us, "ns": a_ns}
+    names = ['ts_s', 'ts_ms', 'ts_us', 'ts_ns']
+    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
+
+    # For either Parquet version, coercing to nanoseconds is allowed
+    # if Int96 storage is used
+    expected = pa.Table.from_arrays([arrays.get(unit)]*4, names)
+    read_table_kwargs = {"coerce_int96_timestamp_unit": unit}
+    _check_roundtrip(table, expected,
+                     read_table_kwargs=read_table_kwargs,
+                     use_deprecated_int96_timestamps=True)
+    _check_roundtrip(table, expected, version='2.6',
+                     read_table_kwargs=read_table_kwargs,
+                     use_deprecated_int96_timestamps=True)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('pq_reader_method', ['ParquetFile', 'read_table'])
+def test_coerce_int96_timestamp_overflow(pq_reader_method, tempdir):
+
+    def get_table(pq_reader_method, filename, **kwargs):
+        if pq_reader_method == "ParquetFile":
+            return pq.ParquetFile(filename, **kwargs).read()
+        elif pq_reader_method == "read_table":
+            return pq.read_table(filename, **kwargs)
+
+    # Recreating the initial JIRA issue referenced in ARROW-12096
+    oob_dts = [
+        datetime.datetime(1000, 1, 1),
+        datetime.datetime(2000, 1, 1),
+        datetime.datetime(3000, 1, 1)
+    ]
+    df = pd.DataFrame({"a": oob_dts})
+    table = pa.table(df)
+
+    filename = tempdir / "test_round_trip_overflow.parquet"
+    pq.write_table(table, filename, use_deprecated_int96_timestamps=True,
+                   version="1.0")
+
+    # with the default resolution of ns, we get wrong values for INT96
+    # that are out of bounds for nanosecond range
+    tab_error = get_table(pq_reader_method, filename)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore",
+                                "Discarding nonzero nanoseconds in conversion",
+                                UserWarning)
+        assert tab_error["a"].to_pylist() != oob_dts
+
+    # avoid this overflow by specifying the resolution to use for INT96 values
+    tab_correct = get_table(
+        pq_reader_method, filename, coerce_int96_timestamp_unit="s"
+    )
+    df_correct = tab_correct.to_pandas(timestamp_as_object=True)
+    df["a"] = df["a"].astype(object)
+    tm.assert_frame_equal(df, df_correct)
+
+
+@pytest.mark.parametrize('unit', ['ms', 'us', 'ns'])
+def test_timestamp_restore_timezone(unit):
+    # ARROW-5888, restore timezone from serialized metadata
+    ty = pa.timestamp(unit, tz='America/New_York')
+    arr = pa.array([1, 2, 3], type=ty)
+    t = pa.table([arr], names=['f0'])
+    _check_roundtrip(t)
+
+
+def test_timestamp_restore_timezone_nanosecond():
+    # ARROW-9634, also restore timezone for nanosecond data that get stored
+    # as microseconds in the parquet file for Parquet ver 2.4 and less
+    ty = pa.timestamp('ns', tz='America/New_York')
+    arr = pa.array([1000, 2000, 3000], type=ty)
+    table = pa.table([arr], names=['f0'])
+    ty_us = pa.timestamp('us', tz='America/New_York')
+    expected = pa.table([arr.cast(ty_us)], names=['f0'])
+    _check_roundtrip(table, expected=expected, version='2.4')
+
+
+@pytest.mark.pandas
+def test_list_of_datetime_time_roundtrip():
+    # ARROW-4135
+    times = pd.to_datetime(['09:00', '09:30', '10:00', '10:30', '11:00',
+                            '11:30', '12:00'], format="%H:%M")
+    df = pd.DataFrame({'time': [times.time]})
+    _roundtrip_pandas_dataframe(df, write_kwargs={})
+
+
+@pytest.mark.pandas
+def test_parquet_version_timestamp_differences():
+    i_s = pd.Timestamp('2010-01-01').value / 1000000000  # := 1262304000
+
+    d_s = np.arange(i_s, i_s + 10, 1, dtype='int64')
+    d_ms = d_s * 1000
+    d_us = d_ms * 1000
+    d_ns = d_us * 1000
+
+    a_s = pa.array(d_s, type=pa.timestamp('s'))
+    a_ms = pa.array(d_ms, type=pa.timestamp('ms'))
+    a_us = pa.array(d_us, type=pa.timestamp('us'))
+    a_ns = pa.array(d_ns, type=pa.timestamp('ns'))
+
+    all_versions = ['1.0', '2.4', '2.6']
+
+    names = ['ts:s', 'ts:ms', 'ts:us', 'ts:ns']
+    table = pa.Table.from_arrays([a_s, a_ms, a_us, a_ns], names)
+
+    # Using Parquet version 1.0 and 2.4, seconds should be coerced to milliseconds
+    # and nanoseconds should be coerced to microseconds by default
+    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_us], names)
+    _check_roundtrip(table, expected, version='1.0')
+    _check_roundtrip(table, expected, version='2.4')
+
+    # Using Parquet version 2.6, seconds should be coerced to milliseconds
+    # and nanoseconds should be retained by default
+    expected = pa.Table.from_arrays([a_ms, a_ms, a_us, a_ns], names)
+    _check_roundtrip(table, expected, version='2.6')
+
+    # For either Parquet version coercing to milliseconds or microseconds
+    # is allowed
+    expected = pa.Table.from_arrays([a_ms, a_ms, a_ms, a_ms], names)
+    for ver in all_versions:
+        _check_roundtrip(table, expected, coerce_timestamps='ms', version=ver)
+
+    expected = pa.Table.from_arrays([a_us, a_us, a_us, a_us], names)
+    for ver in all_versions:
+        _check_roundtrip(table, expected, version=ver, coerce_timestamps='us')
+
+    # TODO: after pyarrow allows coerce_timestamps='ns', tests like the
+    # following should pass ...
+
+    # Using Parquet version 1.0, coercing to nanoseconds is not allowed
+    # expected = None
+    # with pytest.raises(NotImplementedError):
+    #     _roundtrip_table(table, coerce_timestamps='ns')
+
+    # Using Parquet version 2.0, coercing to nanoseconds is allowed
+    # expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
+    # _check_roundtrip(table, expected, version='2.6', coerce_timestamps='ns')
+
+    # For either Parquet version, coercing to nanoseconds is allowed
+    # if Int96 storage is used
+    expected = pa.Table.from_arrays([a_ns, a_ns, a_ns, a_ns], names)
+    for ver in all_versions:
+        _check_roundtrip(table, expected, version=ver,
+                         use_deprecated_int96_timestamps=True)
+
+
+@pytest.mark.pandas
+def test_noncoerced_nanoseconds_written_without_exception(tempdir):
+    # ARROW-1957: the Parquet version 2.0 writer preserves Arrow
+    # nanosecond timestamps by default
+    n = 9
+    df = pd.DataFrame({'x': range(n)},
+                      index=pd.date_range('2017-01-01', freq='ns', periods=n))
+    tb = pa.Table.from_pandas(df)
+
+    filename = tempdir / 'written.parquet'
+    try:
+        pq.write_table(tb, filename, version='2.6')
+    except Exception:
+        pass
+    assert filename.exists()
+
+    recovered_table = pq.read_table(filename)
+    assert tb.equals(recovered_table)
+
+    # Loss of data through coercion (without explicit override) still an error
+    filename = tempdir / 'not_written.parquet'
+    with pytest.raises(ValueError):
+        pq.write_table(tb, filename, coerce_timestamps='ms', version='2.6')
+
+
+def test_duration_type():
+    # ARROW-6780
+    arrays = [pa.array([0, 1, 2, 3], type=pa.duration(unit))
+              for unit in ["s", "ms", "us", "ns"]]
+    table = pa.Table.from_arrays(arrays, ["d[s]", "d[ms]", "d[us]", "d[ns]"])
+
+    _check_roundtrip(table)
diff --git a/pyarrow/tests/parquet/test_encryption.py b/pyarrow/tests/parquet/test_encryption.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e2fb069bd06b745ec128d993c8a4f75973c56f8
--- /dev/null
+++ b/pyarrow/tests/parquet/test_encryption.py
@@ -0,0 +1,724 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import pytest
+from datetime import timedelta
+import pyarrow as pa
+try:
+    import pyarrow.parquet as pq
+    import pyarrow.parquet.encryption as pe
+except ImportError:
+    pq = None
+    pe = None
+else:
+    from pyarrow.tests.parquet.encryption import (InMemoryKmsClient,
+                                                  MockVersioningKmsClient,
+                                                  verify_file_encrypted,
+                                                  read_external_keys_to_dict,
+                                                  parse_wrapped_key)
+
+
+PARQUET_NAME = 'encrypted_table.in_mem.parquet'
+FOOTER_KEY = b"0123456789112345"
+FOOTER_KEY_NAME = "footer_key"
+COL_KEY = b"1234567890123450"
+COL_KEY_NAME = "col_key"
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet_encryption'
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = [
+    pytest.mark.parquet_encryption,
+    pytest.mark.parquet
+]
+
+
+@pytest.fixture(scope='module')
+def data_table():
+    data_table = pa.Table.from_pydict({
+        'a': pa.array([1, 2, 3]),
+        'b': pa.array(['a', 'b', 'c']),
+        'c': pa.array(['x', 'y', 'z'])
+    })
+    return data_table
+
+
+@pytest.fixture(scope='module')
+def basic_encryption_config():
+    basic_encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={
+            COL_KEY_NAME: ["a", "b"],
+        })
+    return basic_encryption_config
+
+
+@pytest.fixture(scope='module')
+def external_encryption_config():
+    external_encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={
+            COL_KEY_NAME: ["a", "b"],
+        },
+        internal_key_material=False)
+    return external_encryption_config
+
+
+def setup_encryption_environment(custom_kms_conf):
+    """
+    Sets up and returns the KMS connection configuration and crypto factory
+    based on provided KMS configuration parameters.
+    """
+    kms_connection_config = pe.KmsConnectionConfig(custom_kms_conf=custom_kms_conf)
+
+    def kms_factory(kms_connection_configuration):
+        return InMemoryKmsClient(kms_connection_configuration)
+
+    # Create our CryptoFactory
+    crypto_factory = pe.CryptoFactory(kms_factory)
+
+    return kms_connection_config, crypto_factory
+
+
+def write_encrypted_file(path, data_table, footer_key_name, col_key_name,
+                         footer_key, col_key, encryption_config):
+    """
+    Writes an encrypted parquet file based on the provided parameters.
+    """
+    # Setup the custom KMS configuration with provided keys
+    custom_kms_conf = {
+        footer_key_name: footer_key.decode("UTF-8"),
+        col_key_name: col_key.decode("UTF-8"),
+    }
+
+    # Setup encryption environment
+    kms_connection_config, crypto_factory = setup_encryption_environment(
+        custom_kms_conf)
+
+    # Write the encrypted parquet file
+    write_encrypted_parquet(path, data_table, encryption_config,
+                            kms_connection_config, crypto_factory)
+
+    return kms_connection_config, crypto_factory
+
+
+def test_encrypted_parquet_write_read(tempdir, data_table):
+    """Write an encrypted parquet, verify it's encrypted, and then read it."""
+    path = tempdir / PARQUET_NAME
+
+    # Encrypt the footer with the footer key,
+    # encrypt column `a` and column `b` with another key,
+    # keep `c` plaintext
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={
+            COL_KEY_NAME: ["a", "b"],
+        },
+        encryption_algorithm="AES_GCM_V1",
+        cache_lifetime=timedelta(minutes=5.0),
+        data_key_length_bits=256)
+    assert encryption_config.uniform_encryption is False
+
+    kms_connection_config, crypto_factory = write_encrypted_file(
+        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
+        encryption_config)
+
+    verify_file_encrypted(path)
+
+    # Read with decryption properties
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=5.0))
+    result_table = read_encrypted_parquet(
+        path, decryption_config, kms_connection_config, crypto_factory)
+    assert data_table.equals(result_table)
+
+
+def test_uniform_encrypted_parquet_write_read(tempdir, data_table):
+    """Write an encrypted parquet, verify it's encrypted, and then read it."""
+    path = tempdir / PARQUET_NAME
+
+    # Encrypt the footer and all columns with the footer key,
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        uniform_encryption=True,
+        encryption_algorithm="AES_GCM_V1",
+        cache_lifetime=timedelta(minutes=5.0),
+        data_key_length_bits=256)
+    assert encryption_config.uniform_encryption is True
+
+    kms_connection_config, crypto_factory = write_encrypted_file(
+        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, b"",
+        encryption_config)
+
+    verify_file_encrypted(path)
+
+    # Read with decryption properties
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=5.0))
+    result_table = read_encrypted_parquet(
+        path, decryption_config, kms_connection_config, crypto_factory)
+    assert data_table.equals(result_table)
+
+
+def write_encrypted_parquet(path, table, encryption_config,
+                            kms_connection_config, crypto_factory):
+    if encryption_config.internal_key_material:
+        file_encryption_properties = crypto_factory.file_encryption_properties(
+            kms_connection_config, encryption_config)
+    else:
+        file_encryption_properties = crypto_factory.file_encryption_properties(
+            kms_connection_config, encryption_config, path)
+    assert file_encryption_properties is not None
+    with pq.ParquetWriter(
+            path, table.schema,
+            encryption_properties=file_encryption_properties) as writer:
+        writer.write_table(table)
+
+
+def read_encrypted_parquet(path, decryption_config,
+                           kms_connection_config, crypto_factory,
+                           internal_key_material=True):
+    if internal_key_material:
+        file_decryption_properties = crypto_factory.file_decryption_properties(
+            kms_connection_config, decryption_config)
+    else:
+        file_decryption_properties = crypto_factory.file_decryption_properties(
+            kms_connection_config, decryption_config, path)
+
+    assert file_decryption_properties is not None
+    meta = pq.read_metadata(
+        path, decryption_properties=file_decryption_properties)
+    assert meta.num_columns == 3
+    schema = pq.read_schema(
+        path, decryption_properties=file_decryption_properties)
+    assert len(schema.names) == 3
+
+    result = pq.ParquetFile(
+        path, decryption_properties=file_decryption_properties)
+    return result.read(use_threads=True)
+
+
+def test_encrypted_parquet_write_read_wrong_key(tempdir, data_table):
+    """Write an encrypted parquet, verify it's encrypted,
+    and then read it using wrong keys."""
+    path = tempdir / PARQUET_NAME
+
+    # Encrypt the footer with the footer key,
+    # encrypt column `a` and column `b` with another key,
+    # keep `c` plaintext
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={
+            COL_KEY_NAME: ["a", "b"],
+        },
+        encryption_algorithm="AES_GCM_V1",
+        cache_lifetime=timedelta(minutes=5.0),
+        data_key_length_bits=256)
+
+    write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
+                         FOOTER_KEY, COL_KEY, encryption_config)
+
+    verify_file_encrypted(path)
+
+    wrong_kms_connection_config, wrong_crypto_factory = setup_encryption_environment({
+        FOOTER_KEY_NAME: COL_KEY.decode("UTF-8"),  # Intentionally wrong
+        COL_KEY_NAME: FOOTER_KEY.decode("UTF-8"),  # Intentionally wrong
+    })
+
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=5.0))
+    with pytest.raises(ValueError, match=r"Incorrect master key used"):
+        read_encrypted_parquet(
+            path, decryption_config, wrong_kms_connection_config,
+            wrong_crypto_factory)
+
+
+def test_encrypted_parquet_read_no_decryption_config(tempdir, data_table):
+    """Write an encrypted parquet, verify it's encrypted,
+    but then try to read it without decryption properties."""
+    test_encrypted_parquet_write_read(tempdir, data_table)
+    # Read without decryption properties
+    with pytest.raises(IOError, match=r"no decryption"):
+        pq.ParquetFile(tempdir / PARQUET_NAME).read()
+
+
+def test_encrypted_parquet_read_metadata_no_decryption_config(
+        tempdir, data_table):
+    """Write an encrypted parquet, verify it's encrypted,
+    but then try to read its metadata without decryption properties."""
+    test_encrypted_parquet_write_read(tempdir, data_table)
+    # Read metadata without decryption properties
+    with pytest.raises(IOError, match=r"no decryption"):
+        pq.read_metadata(tempdir / PARQUET_NAME)
+
+
+def test_encrypted_parquet_read_schema_no_decryption_config(
+        tempdir, data_table):
+    """Write an encrypted parquet, verify it's encrypted,
+    but then try to read its schema without decryption properties."""
+    test_encrypted_parquet_write_read(tempdir, data_table)
+    with pytest.raises(IOError, match=r"no decryption"):
+        pq.read_schema(tempdir / PARQUET_NAME)
+
+
+def test_encrypted_parquet_write_no_col_key(tempdir, data_table):
+    """Write an encrypted parquet, but give only footer key,
+    without column key."""
+    path = tempdir / 'encrypted_table_no_col_key.in_mem.parquet'
+
+    # Encrypt the footer with the footer key
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME)
+
+    with pytest.raises(OSError,
+                       match="Either column_keys or uniform_encryption "
+                       "must be set"):
+        # Write with encryption properties
+        write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
+                             FOOTER_KEY, b"", encryption_config)
+
+
+def test_encrypted_parquet_write_col_key_and_uniform_encryption(tempdir, data_table):
+    """Write an encrypted parquet, but give only footer key,
+    without column key."""
+    path = tempdir / 'encrypted_table_col_key_and_uniform_encryption.in_mem.parquet'
+
+    # Encrypt the footer with the footer key
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={
+            COL_KEY_NAME: ["a", "b"],
+        },
+        uniform_encryption=True)
+
+    with pytest.raises(OSError,
+                       match=r"Cannot set both column_keys and uniform_encryption"):
+        # Write with encryption properties
+        write_encrypted_file(path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME,
+                             FOOTER_KEY, b"", encryption_config)
+
+
+def test_encrypted_parquet_write_kms_error(tempdir, data_table,
+                                           basic_encryption_config):
+    """Write an encrypted parquet, but raise KeyError in KmsClient."""
+    path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
+    encryption_config = basic_encryption_config
+
+    # Empty master_keys_map
+    kms_connection_config = pe.KmsConnectionConfig()
+
+    def kms_factory(kms_connection_configuration):
+        # Empty master keys map will cause KeyError to be raised
+        # on wrap/unwrap calls
+        return InMemoryKmsClient(kms_connection_configuration)
+
+    crypto_factory = pe.CryptoFactory(kms_factory)
+    with pytest.raises(KeyError, match="footer_key"):
+        # Write with encryption properties
+        write_encrypted_parquet(path, data_table, encryption_config,
+                                kms_connection_config, crypto_factory)
+
+
+def test_encrypted_parquet_write_kms_specific_error(tempdir, data_table,
+                                                    basic_encryption_config):
+    """Write an encrypted parquet, but raise KeyError in KmsClient."""
+    path = tempdir / 'encrypted_table_kms_error.in_mem.parquet'
+    encryption_config = basic_encryption_config
+
+    # Empty master_keys_map
+    kms_connection_config = pe.KmsConnectionConfig()
+
+    class ThrowingKmsClient(pe.KmsClient):
+        """A KmsClient implementation that throws exception in
+        wrap/unwrap calls
+        """
+
+        def __init__(self, config):
+            """Create an InMemoryKmsClient instance."""
+            pe.KmsClient.__init__(self)
+            self.config = config
+
+        def wrap_key(self, key_bytes, master_key_identifier):
+            raise ValueError("Cannot Wrap Key")
+
+        def unwrap_key(self, wrapped_key, master_key_identifier):
+            raise ValueError("Cannot Unwrap Key")
+
+    def kms_factory(kms_connection_configuration):
+        # Exception thrown in wrap/unwrap calls
+        return ThrowingKmsClient(kms_connection_configuration)
+
+    crypto_factory = pe.CryptoFactory(kms_factory)
+    with pytest.raises(ValueError, match="Cannot Wrap Key"):
+        # Write with encryption properties
+        write_encrypted_parquet(path, data_table, encryption_config,
+                                kms_connection_config, crypto_factory)
+
+
+def test_encrypted_parquet_write_kms_factory_error(tempdir, data_table,
+                                                   basic_encryption_config):
+    """Write an encrypted parquet, but raise ValueError in kms_factory."""
+    path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
+    encryption_config = basic_encryption_config
+
+    # Empty master_keys_map
+    kms_connection_config = pe.KmsConnectionConfig()
+
+    def kms_factory(kms_connection_configuration):
+        raise ValueError('Cannot create KmsClient')
+
+    crypto_factory = pe.CryptoFactory(kms_factory)
+    with pytest.raises(ValueError,
+                       match="Cannot create KmsClient"):
+        # Write with encryption properties
+        write_encrypted_parquet(path, data_table, encryption_config,
+                                kms_connection_config, crypto_factory)
+
+
+def test_encrypted_parquet_write_kms_factory_type_error(
+        tempdir, data_table, basic_encryption_config):
+    """Write an encrypted parquet, but use wrong KMS client type
+    that doesn't implement KmsClient."""
+    path = tempdir / 'encrypted_table_kms_factory_error.in_mem.parquet'
+    encryption_config = basic_encryption_config
+
+    # Empty master_keys_map
+    kms_connection_config = pe.KmsConnectionConfig()
+
+    class WrongTypeKmsClient():
+        """This is not an implementation of KmsClient.
+        """
+
+        def __init__(self, config):
+            self.master_keys_map = config.custom_kms_conf
+
+        def wrap_key(self, key_bytes, master_key_identifier):
+            return None
+
+        def unwrap_key(self, wrapped_key, master_key_identifier):
+            return None
+
+    def kms_factory(kms_connection_configuration):
+        return WrongTypeKmsClient(kms_connection_configuration)
+
+    crypto_factory = pe.CryptoFactory(kms_factory)
+    with pytest.raises(TypeError):
+        # Write with encryption properties
+        write_encrypted_parquet(path, data_table, encryption_config,
+                                kms_connection_config, crypto_factory)
+
+
+def test_encrypted_parquet_encryption_configuration():
+    def validate_encryption_configuration(encryption_config):
+        assert FOOTER_KEY_NAME == encryption_config.footer_key
+        assert ["a", "b"] == encryption_config.column_keys[COL_KEY_NAME]
+        assert "AES_GCM_CTR_V1" == encryption_config.encryption_algorithm
+        assert encryption_config.plaintext_footer
+        assert not encryption_config.double_wrapping
+        assert timedelta(minutes=10.0) == encryption_config.cache_lifetime
+        assert not encryption_config.internal_key_material
+        assert 192 == encryption_config.data_key_length_bits
+
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={COL_KEY_NAME: ["a", "b"], },
+        encryption_algorithm="AES_GCM_CTR_V1",
+        plaintext_footer=True,
+        double_wrapping=False,
+        cache_lifetime=timedelta(minutes=10.0),
+        internal_key_material=False,
+        data_key_length_bits=192,
+    )
+    validate_encryption_configuration(encryption_config)
+
+    encryption_config_1 = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME)
+    encryption_config_1.column_keys = {COL_KEY_NAME: ["a", "b"], }
+    encryption_config_1.encryption_algorithm = "AES_GCM_CTR_V1"
+    encryption_config_1.plaintext_footer = True
+    encryption_config_1.double_wrapping = False
+    encryption_config_1.cache_lifetime = timedelta(minutes=10.0)
+    encryption_config_1.internal_key_material = False
+    encryption_config_1.data_key_length_bits = 192
+    validate_encryption_configuration(encryption_config_1)
+
+
+def test_encrypted_parquet_decryption_configuration():
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=10.0))
+    assert timedelta(minutes=10.0) == decryption_config.cache_lifetime
+
+    decryption_config_1 = pe.DecryptionConfiguration()
+    decryption_config_1.cache_lifetime = timedelta(minutes=10.0)
+    assert timedelta(minutes=10.0) == decryption_config_1.cache_lifetime
+
+
+def test_encrypted_parquet_kms_configuration():
+    def validate_kms_connection_config(kms_connection_config):
+        assert "Instance1" == kms_connection_config.kms_instance_id
+        assert "URL1" == kms_connection_config.kms_instance_url
+        assert "MyToken" == kms_connection_config.key_access_token
+        assert ({"key1": "key_material_1", "key2": "key_material_2"} ==
+                kms_connection_config.custom_kms_conf)
+
+    kms_connection_config = pe.KmsConnectionConfig(
+        kms_instance_id="Instance1",
+        kms_instance_url="URL1",
+        key_access_token="MyToken",
+        custom_kms_conf={
+            "key1": "key_material_1",
+            "key2": "key_material_2",
+        })
+    validate_kms_connection_config(kms_connection_config)
+
+    kms_connection_config_1 = pe.KmsConnectionConfig()
+    kms_connection_config_1.kms_instance_id = "Instance1"
+    kms_connection_config_1.kms_instance_url = "URL1"
+    kms_connection_config_1.key_access_token = "MyToken"
+    kms_connection_config_1.custom_kms_conf = {
+        "key1": "key_material_1",
+        "key2": "key_material_2",
+    }
+    validate_kms_connection_config(kms_connection_config_1)
+
+
+@pytest.mark.xfail(reason="Plaintext footer - reading plaintext column subset"
+                   " reads encrypted columns too")
+def test_encrypted_parquet_write_read_plain_footer_single_wrapping(
+        tempdir, data_table):
+    """Write an encrypted parquet, with plaintext footer
+    and with single wrapping,
+    verify it's encrypted, and then read plaintext columns."""
+    path = tempdir / PARQUET_NAME
+
+    # Encrypt the footer with the footer key,
+    # encrypt column `a` and column `b` with another key,
+    # keep `c` plaintext
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={
+            COL_KEY_NAME: ["a", "b"],
+        },
+        plaintext_footer=True,
+        double_wrapping=False)
+
+    kms_connection_config = pe.KmsConnectionConfig(
+        custom_kms_conf={
+            FOOTER_KEY_NAME: FOOTER_KEY.decode("UTF-8"),
+            COL_KEY_NAME: COL_KEY.decode("UTF-8"),
+        }
+    )
+
+    def kms_factory(kms_connection_configuration):
+        return InMemoryKmsClient(kms_connection_configuration)
+
+    crypto_factory = pe.CryptoFactory(kms_factory)
+    # Write with encryption properties
+    write_encrypted_parquet(path, data_table, encryption_config,
+                            kms_connection_config, crypto_factory)
+
+    # # Read without decryption properties only the plaintext column
+    # result = pq.ParquetFile(path)
+    # result_table = result.read(columns='c', use_threads=False)
+    # assert table.num_rows == result_table.num_rows
+
+
+def test_encrypted_parquet_write_read_external(tempdir, data_table,
+                                               external_encryption_config):
+    """Write an encrypted parquet file with external key material, verify
+    it's encrypted, then read both the table and external store.
+    """
+    path = tempdir / PARQUET_NAME
+
+    kms_connection_config, crypto_factory = write_encrypted_file(
+        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
+        external_encryption_config)
+
+    verify_file_encrypted(path)
+
+    decryption_config = pe.DecryptionConfiguration()
+    result_table = read_encrypted_parquet(
+        path, decryption_config, kms_connection_config, crypto_factory,
+        internal_key_material=False)
+    store = pa._parquet_encryption.FileSystemKeyMaterialStore.for_file(path)
+
+    assert len(key_ids := store.get_key_id_set()) == (
+        len(external_encryption_config.column_keys[COL_KEY_NAME]) + 1)
+    assert all([store.get_key_material(k) is not None for k in key_ids])
+    assert data_table.equals(result_table)
+
+
+@pytest.mark.parametrize(
+    ("double_wrap_initial", "double_wrap_rotated"), [
+        pytest.param(True, True, id="double wrapping"),
+        pytest.param(False, True, id="single to double wrapped"),
+        pytest.param(True, False, id="double to singe wrapped"),
+        pytest.param(False, False, id="single wrapping")])
+def test_external_key_material_rotation(
+        reusable_tempdir,
+        data_table,
+        double_wrap_initial,
+        double_wrap_rotated):
+    """Tests CryptoFactory.rotate_master_keys
+
+    Note: The CryptoFactory.rotate_master_keys() double_wrapping keword arg
+    may be either True (the default) or False regardless of whether
+    EncryptionConfig.double_wrapping was set to true (also the default) when
+    the external key material store was written. This means double wrapping may
+    be set one way initially and then applied or removed during rotation.
+    """
+    path = reusable_tempdir / PARQUET_NAME
+    encryption_config = pe.EncryptionConfiguration(
+        footer_key=FOOTER_KEY_NAME,
+        column_keys={COL_KEY_NAME: ["a", "b"]},
+        internal_key_material=False,
+        double_wrapping=double_wrap_initial)
+
+    # initial master key version - see MockVersioningKmsClient docstring
+    kms_connection_config = pe.KmsConnectionConfig(key_access_token="1")
+
+    def kms_factory(kms_connection_configuration):
+        return MockVersioningKmsClient(kms_connection_configuration)
+    crypto_factory = pe.CryptoFactory(kms_factory)
+    write_encrypted_parquet(
+        path,
+        data_table,
+        encryption_config,
+        kms_connection_config,
+        crypto_factory)
+    before_keys = read_external_keys_to_dict(path)
+
+    # "rotate" kms master key
+    kms_connection_config.refresh_key_access_token("2")
+
+    crypto_factory.rotate_master_keys(
+        kms_connection_config,
+        path,
+        double_wrapping=double_wrap_rotated)
+
+    after_keys = read_external_keys_to_dict(path)
+    verify_file_encrypted(path)
+    table_read_after_rotation = read_encrypted_parquet(
+        path,
+        pe.DecryptionConfiguration(),
+        kms_connection_config,
+        crypto_factory,
+        internal_key_material=False)
+    assert FOOTER_KEY_NAME in before_keys
+    assert COL_KEY_NAME in before_keys
+    assert FOOTER_KEY_NAME in after_keys
+    assert COL_KEY_NAME in after_keys
+
+    def check_rotated_external_keys(master_key_id: str) -> None:
+        before_key_mat = before_keys[master_key_id]
+        if double_wrap_initial:
+            before_key_wrapped = before_key_mat.wrapped_kek
+        else:
+            before_key_wrapped = before_key_mat.wrapped_dek
+        _, before_ver, _ = parse_wrapped_key(before_key_wrapped)
+
+        after_key_mat = after_keys[master_key_id]
+        if double_wrap_rotated:
+            after_key_wrapped = after_key_mat.wrapped_kek
+        else:
+            after_key_wrapped = after_key_mat.wrapped_dek
+        _, after_ver, _ = parse_wrapped_key(after_key_wrapped)
+
+        # CryptoFactory rewrapped keys if after version is later than before
+        assert before_ver < after_ver
+    check_rotated_external_keys(FOOTER_KEY_NAME)
+    check_rotated_external_keys(COL_KEY_NAME)
+    assert data_table.equals(table_read_after_rotation)
+
+
+def test_encrypted_parquet_loop(tempdir, data_table, basic_encryption_config):
+    """Write an encrypted parquet, verify it's encrypted,
+    and then read it multithreaded in a loop."""
+    path = tempdir / PARQUET_NAME
+
+    # Encrypt the footer with the footer key,
+    # encrypt column `a` and column `b` with another key,
+    # keep `c` plaintext, defined in basic_encryption_config
+    kms_connection_config, crypto_factory = write_encrypted_file(
+        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
+        basic_encryption_config)
+
+    verify_file_encrypted(path)
+
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=5.0))
+
+    for i in range(50):
+        # Read with decryption properties
+        file_decryption_properties = crypto_factory.file_decryption_properties(
+            kms_connection_config, decryption_config)
+        assert file_decryption_properties is not None
+
+        result = pq.ParquetFile(
+            path, decryption_properties=file_decryption_properties)
+        result_table = result.read(use_threads=True)
+        assert data_table.equals(result_table)
+
+
+def test_read_with_deleted_crypto_factory(tempdir, data_table, basic_encryption_config):
+    """
+    Test that decryption properties can be used if the crypto factory is no longer alive
+    """
+    path = tempdir / PARQUET_NAME
+    kms_connection_config, crypto_factory = write_encrypted_file(
+        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
+        basic_encryption_config)
+    verify_file_encrypted(path)
+
+    # Create decryption properties and delete the crypto factory that created
+    # the properties afterwards.
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=5.0))
+    file_decryption_properties = crypto_factory.file_decryption_properties(
+        kms_connection_config, decryption_config)
+    del crypto_factory
+
+    result = pq.ParquetFile(
+        path, decryption_properties=file_decryption_properties)
+    result_table = result.read(use_threads=True)
+    assert data_table.equals(result_table)
+
+
+def test_encrypted_parquet_read_table(tempdir, data_table, basic_encryption_config):
+    """Write an encrypted parquet then read it back using read_table."""
+    path = tempdir / PARQUET_NAME
+
+    # Write the encrypted parquet file using the utility function
+    kms_connection_config, crypto_factory = write_encrypted_file(
+        path, data_table, FOOTER_KEY_NAME, COL_KEY_NAME, FOOTER_KEY, COL_KEY,
+        basic_encryption_config)
+
+    decryption_config = pe.DecryptionConfiguration(
+        cache_lifetime=timedelta(minutes=5.0))
+    file_decryption_properties = crypto_factory.file_decryption_properties(
+        kms_connection_config, decryption_config)
+
+    # Read the encrypted parquet file using read_table
+    result_table = pq.read_table(path, decryption_properties=file_decryption_properties)
+
+    # Assert that the read table matches the original data
+    assert data_table.equals(result_table)
+
+    # Read the encrypted parquet folder using read_table
+    result_table = pq.read_table(
+        tempdir, decryption_properties=file_decryption_properties)
+    assert data_table.equals(result_table)
diff --git a/pyarrow/tests/parquet/test_metadata.py b/pyarrow/tests/parquet/test_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..148bfebaa67f71bcae20e675148b838bfda06cea
--- /dev/null
+++ b/pyarrow/tests/parquet/test_metadata.py
@@ -0,0 +1,816 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import datetime
+import decimal
+from collections import OrderedDict
+import io
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import pytest
+
+import pyarrow as pa
+from pyarrow.tests.parquet.common import _check_roundtrip, make_sample_file
+from pyarrow.fs import LocalFileSystem
+from pyarrow.tests import util
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import _write_table
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.parquet.common import alltypes_sample
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+@pytest.mark.pandas
+def test_parquet_metadata_api():
+    df = alltypes_sample(size=10000)
+    df = df.reindex(columns=sorted(df.columns))
+    df.index = np.random.randint(0, 1000000, size=len(df))
+
+    fileh = make_sample_file(df)
+    ncols = len(df.columns)
+
+    # Series of sniff tests
+    meta = fileh.metadata
+    repr(meta)
+    assert meta.num_rows == len(df)
+    assert meta.num_columns == ncols + 1  # +1 for index
+    assert meta.num_row_groups == 1
+    assert meta.format_version == '2.6'
+    assert 'parquet-cpp' in meta.created_by
+    assert isinstance(meta.serialized_size, int)
+    assert isinstance(meta.metadata, dict)
+
+    # Schema
+    schema = fileh.schema
+    assert meta.schema is schema
+    assert len(schema) == ncols + 1  # +1 for index
+    repr(schema)
+
+    col = schema[0]
+    repr(col)
+    assert col.name == df.columns[0]
+    assert col.max_definition_level == 1
+    assert col.max_repetition_level == 0
+    assert col.max_repetition_level == 0
+    assert col.physical_type == 'BOOLEAN'
+    assert col.converted_type == 'NONE'
+
+    col_float16 = schema[5]
+    assert col_float16.logical_type.type == 'FLOAT16'
+
+    with pytest.raises(IndexError):
+        schema[ncols + 1]  # +1 for index
+
+    with pytest.raises(IndexError):
+        schema[-1]
+
+    # Row group
+    for rg in range(meta.num_row_groups):
+        rg_meta = meta.row_group(rg)
+        assert isinstance(rg_meta, pq.RowGroupMetaData)
+        repr(rg_meta)
+
+        for col in range(rg_meta.num_columns):
+            col_meta = rg_meta.column(col)
+            assert isinstance(col_meta, pq.ColumnChunkMetaData)
+            repr(col_meta)
+
+    with pytest.raises(IndexError):
+        meta.row_group(-1)
+
+    with pytest.raises(IndexError):
+        meta.row_group(meta.num_row_groups + 1)
+
+    rg_meta = meta.row_group(0)
+    assert rg_meta.num_rows == len(df)
+    assert rg_meta.num_columns == ncols + 1  # +1 for index
+    assert rg_meta.total_byte_size > 0
+
+    with pytest.raises(IndexError):
+        col_meta = rg_meta.column(-1)
+
+    with pytest.raises(IndexError):
+        col_meta = rg_meta.column(ncols + 2)
+
+    col_meta = rg_meta.column(0)
+    assert col_meta.file_offset == 0
+    assert col_meta.file_path == ''  # created from BytesIO
+    assert col_meta.physical_type == 'BOOLEAN'
+    assert col_meta.num_values == 10000
+    assert col_meta.path_in_schema == 'bool'
+    assert col_meta.is_stats_set is True
+    assert isinstance(col_meta.statistics, pq.Statistics)
+    assert col_meta.compression == 'SNAPPY'
+    assert set(col_meta.encodings) == {'PLAIN', 'RLE'}
+    assert col_meta.has_dictionary_page is False
+    assert col_meta.dictionary_page_offset is None
+    assert col_meta.data_page_offset > 0
+    assert col_meta.total_compressed_size > 0
+    assert col_meta.total_uncompressed_size > 0
+    with pytest.raises(NotImplementedError):
+        col_meta.has_index_page
+    with pytest.raises(NotImplementedError):
+        col_meta.index_page_offset
+
+
+def test_parquet_metadata_lifetime(tempdir):
+    # ARROW-6642 - ensure that chained access keeps parent objects alive
+    table = pa.table({'a': [1, 2, 3]})
+    pq.write_table(table, tempdir / 'test_metadata_segfault.parquet')
+    parquet_file = pq.ParquetFile(tempdir / 'test_metadata_segfault.parquet')
+    parquet_file.metadata.row_group(0).column(0).statistics
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize(
+    (
+        'data',
+        'type',
+        'physical_type',
+        'min_value',
+        'max_value',
+        'null_count',
+        'num_values',
+        'distinct_count'
+    ),
+    [
+        ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, None),
+        ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, None),
+        ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, None),
+        (
+            [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
+            'FLOAT', -1.1, 4.4, 1, 4, None
+        ),
+        (
+            [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
+            'DOUBLE', -1.1, 4.4, 1, 4, None
+        ),
+        (
+            ['', 'b', chr(1000), None, 'aaa'], pa.binary(),
+            'BYTE_ARRAY', b'', chr(1000).encode('utf-8'), 1, 4, None
+        ),
+        (
+            [True, False, False, True, True], pa.bool_(),
+            'BOOLEAN', False, True, 0, 5, None
+        ),
+        (
+            [b'\x00', b'b', b'12', None, b'aaa'], pa.binary(),
+            'BYTE_ARRAY', b'\x00', b'b', 1, 4, None
+        ),
+    ]
+)
+def test_parquet_column_statistics_api(data, type, physical_type, min_value,
+                                       max_value, null_count, num_values,
+                                       distinct_count):
+    df = pd.DataFrame({'data': data})
+    schema = pa.schema([pa.field('data', type)])
+    table = pa.Table.from_pandas(df, schema=schema, safe=False)
+    fileh = make_sample_file(table)
+
+    meta = fileh.metadata
+
+    rg_meta = meta.row_group(0)
+    col_meta = rg_meta.column(0)
+
+    stat = col_meta.statistics
+    assert stat.has_min_max
+    assert _close(type, stat.min, min_value)
+    assert _close(type, stat.max, max_value)
+    assert stat.null_count == null_count
+    assert stat.num_values == num_values
+    # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount
+    # method, missing distinct_count is represented as zero instead of None
+    assert stat.distinct_count == distinct_count
+    assert stat.physical_type == physical_type
+
+
+def _close(type, left, right):
+    if type == pa.float32():
+        return abs(left - right) < 1E-7
+    elif type == pa.float64():
+        return abs(left - right) < 1E-13
+    else:
+        return left == right
+
+
+# ARROW-6339
+@pytest.mark.pandas
+def test_parquet_raise_on_unset_statistics():
+    df = pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")})
+    meta = make_sample_file(pa.Table.from_pandas(df)).metadata
+
+    assert not meta.row_group(0).column(0).statistics.has_min_max
+    assert meta.row_group(0).column(0).statistics.max is None
+
+
+def test_statistics_convert_logical_types(tempdir):
+    # ARROW-5166, ARROW-4139
+
+    # (min, max, type)
+    cases = [(10, 11164359321221007157, pa.uint64()),
+             (10, 4294967295, pa.uint32()),
+             ("ähnlich", "öffentlich", pa.utf8()),
+             (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
+              pa.time32('ms')),
+             (datetime.time(10, 30, 0, 1000), datetime.time(15, 30, 0, 1000),
+              pa.time64('us')),
+             (datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
+              datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
+              pa.timestamp('ms')),
+             (datetime.datetime(2019, 6, 24, 0, 0, 0, 1000),
+              datetime.datetime(2019, 6, 25, 0, 0, 0, 1000),
+              pa.timestamp('us')),
+             (datetime.date(2019, 6, 24),
+              datetime.date(2019, 6, 25),
+              pa.date32()),
+             (decimal.Decimal("20.123"),
+              decimal.Decimal("20.124"),
+              pa.decimal128(12, 5))]
+
+    for i, (min_val, max_val, typ) in enumerate(cases):
+        t = pa.Table.from_arrays([pa.array([min_val, max_val], type=typ)],
+                                 ['col'])
+        path = str(tempdir / f'example{i}.parquet')
+        pq.write_table(t, path, version='2.6')
+        pf = pq.ParquetFile(path)
+        stats = pf.metadata.row_group(0).column(0).statistics
+        assert stats.min == min_val
+        assert stats.max == max_val
+
+
+def test_parquet_write_disable_statistics(tempdir):
+    table = pa.Table.from_pydict(
+        OrderedDict([
+            ('a', pa.array([1, 2, 3])),
+            ('b', pa.array(['a', 'b', 'c']))
+        ])
+    )
+    _write_table(table, tempdir / 'data.parquet')
+    meta = pq.read_metadata(tempdir / 'data.parquet')
+    for col in [0, 1]:
+        cc = meta.row_group(0).column(col)
+        assert cc.is_stats_set is True
+        assert cc.statistics is not None
+
+    _write_table(table, tempdir / 'data2.parquet', write_statistics=False)
+    meta = pq.read_metadata(tempdir / 'data2.parquet')
+    for col in [0, 1]:
+        cc = meta.row_group(0).column(col)
+        assert cc.is_stats_set is False
+        assert cc.statistics is None
+
+    _write_table(table, tempdir / 'data3.parquet', write_statistics=['a'])
+    meta = pq.read_metadata(tempdir / 'data3.parquet')
+    cc_a = meta.row_group(0).column(0)
+    cc_b = meta.row_group(0).column(1)
+    assert cc_a.is_stats_set is True
+    assert cc_b.is_stats_set is False
+    assert cc_a.statistics is not None
+    assert cc_b.statistics is None
+
+
+def test_parquet_sorting_column():
+    sorting_col = pq.SortingColumn(10)
+    assert sorting_col.to_dict() == {
+        'column_index': 10,
+        'descending': False,
+        'nulls_first': False
+    }
+
+    sorting_col = pq.SortingColumn(0, descending=True, nulls_first=True)
+    assert sorting_col.to_dict() == {
+        'column_index': 0,
+        'descending': True,
+        'nulls_first': True
+    }
+
+    schema = pa.schema([('a', pa.int64()), ('b', pa.int64())])
+    sorting_cols = (
+        pq.SortingColumn(1, descending=True),
+        pq.SortingColumn(0, descending=False),
+    )
+    sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_cols)
+    assert sort_order == (('b', "descending"), ('a', "ascending"))
+    assert null_placement == "at_end"
+
+    sorting_cols_roundtripped = pq.SortingColumn.from_ordering(
+        schema, sort_order, null_placement)
+    assert sorting_cols_roundtripped == sorting_cols
+
+    sorting_cols = pq.SortingColumn.from_ordering(
+        schema, ('a', ('b', "descending")), null_placement="at_start")
+    expected = (
+        pq.SortingColumn(0, descending=False, nulls_first=True),
+        pq.SortingColumn(1, descending=True, nulls_first=True),
+    )
+    assert sorting_cols == expected
+
+    # Conversions handle empty tuples
+    empty_sorting_cols = pq.SortingColumn.from_ordering(schema, ())
+    assert empty_sorting_cols == ()
+
+    assert pq.SortingColumn.to_ordering(schema, ()) == ((), "at_end")
+
+    with pytest.raises(ValueError):
+        pq.SortingColumn.from_ordering(schema, (("a", "not a valid sort order")))
+
+    with pytest.raises(ValueError, match="inconsistent null placement"):
+        sorting_cols = (
+            pq.SortingColumn(1, nulls_first=True),
+            pq.SortingColumn(0, nulls_first=False),
+        )
+        pq.SortingColumn.to_ordering(schema, sorting_cols)
+
+
+def test_parquet_sorting_column_nested():
+    schema = pa.schema({
+        'a': pa.struct([('x', pa.int64()), ('y', pa.int64())]),
+        'b': pa.int64()
+    })
+
+    sorting_columns = [
+        pq.SortingColumn(0, descending=True),  # a.x
+        pq.SortingColumn(2, descending=False)  # b
+    ]
+
+    sort_order, null_placement = pq.SortingColumn.to_ordering(schema, sorting_columns)
+    assert null_placement == "at_end"
+    assert len(sort_order) == 2
+    assert sort_order[0] == ("a.x", "descending")
+    assert sort_order[1] == ("b", "ascending")
+
+
+def test_parquet_file_sorting_columns():
+    table = pa.table({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
+
+    sorting_columns = (
+        pq.SortingColumn(column_index=0, descending=True, nulls_first=True),
+        pq.SortingColumn(column_index=1, descending=False),
+    )
+    writer = pa.BufferOutputStream()
+    _write_table(table, writer, sorting_columns=sorting_columns)
+    reader = pa.BufferReader(writer.getvalue())
+
+    # Can retrieve sorting columns from metadata
+    metadata = pq.read_metadata(reader)
+    assert sorting_columns == metadata.row_group(0).sorting_columns
+
+    metadata_dict = metadata.to_dict()
+    assert metadata_dict.get('num_columns') == 2
+    assert metadata_dict.get('num_rows') == 3
+    assert metadata_dict.get('num_row_groups') == 1
+
+
+def test_field_id_metadata():
+    # ARROW-7080
+    field_id = b'PARQUET:field_id'
+    inner = pa.field('inner', pa.int32(), metadata={field_id: b'100'})
+    middle = pa.field('middle', pa.struct(
+        [inner]), metadata={field_id: b'101'})
+    fields = [
+        pa.field('basic', pa.int32(), metadata={
+                 b'other': b'abc', field_id: b'1'}),
+        pa.field(
+            'list',
+            pa.list_(pa.field('list-inner', pa.int32(),
+                              metadata={field_id: b'10'})),
+            metadata={field_id: b'11'}),
+        pa.field('struct', pa.struct([middle]), metadata={field_id: b'102'}),
+        pa.field('no-metadata', pa.int32()),
+        pa.field('non-integral-field-id', pa.int32(),
+                 metadata={field_id: b'xyz'}),
+        pa.field('negative-field-id', pa.int32(),
+                 metadata={field_id: b'-1000'})
+    ]
+    arrs = [[] for _ in fields]
+    table = pa.table(arrs, schema=pa.schema(fields))
+
+    bio = pa.BufferOutputStream()
+    pq.write_table(table, bio)
+    contents = bio.getvalue()
+
+    pf = pq.ParquetFile(pa.BufferReader(contents))
+    schema = pf.schema_arrow
+
+    assert schema[0].metadata[field_id] == b'1'
+    assert schema[0].metadata[b'other'] == b'abc'
+
+    list_field = schema[1]
+    assert list_field.metadata[field_id] == b'11'
+
+    list_item_field = list_field.type.value_field
+    assert list_item_field.metadata[field_id] == b'10'
+
+    struct_field = schema[2]
+    assert struct_field.metadata[field_id] == b'102'
+
+    struct_middle_field = struct_field.type[0]
+    assert struct_middle_field.metadata[field_id] == b'101'
+
+    struct_inner_field = struct_middle_field.type[0]
+    assert struct_inner_field.metadata[field_id] == b'100'
+
+    assert schema[3].metadata is None
+    # Invalid input is passed through (ok) but does not
+    # have field_id in parquet (not tested)
+    assert schema[4].metadata[field_id] == b'xyz'
+    assert schema[5].metadata[field_id] == b'-1000'
+
+
+def test_parquet_file_page_index():
+    for write_page_index in (False, True):
+        table = pa.table({'a': [1, 2, 3]})
+
+        writer = pa.BufferOutputStream()
+        _write_table(table, writer, write_page_index=write_page_index)
+        reader = pa.BufferReader(writer.getvalue())
+
+        # Can retrieve sorting columns from metadata
+        metadata = pq.read_metadata(reader)
+        cc = metadata.row_group(0).column(0)
+        assert cc.has_offset_index is write_page_index
+        assert cc.has_column_index is write_page_index
+
+
+@pytest.mark.pandas
+def test_multi_dataset_metadata(tempdir):
+    filenames = ["ARROW-1983-dataset.0", "ARROW-1983-dataset.1"]
+    metapath = str(tempdir / "_metadata")
+
+    # create a test dataset
+    df = pd.DataFrame({
+        'one': [1, 2, 3],
+        'two': [-1, -2, -3],
+        'three': [[1, 2], [2, 3], [3, 4]],
+    })
+    table = pa.Table.from_pandas(df)
+
+    # write dataset twice and collect/merge metadata
+    _meta = None
+    for filename in filenames:
+        meta = []
+        pq.write_table(table, str(tempdir / filename),
+                       metadata_collector=meta)
+        meta[0].set_file_path(filename)
+        if _meta is None:
+            _meta = meta[0]
+        else:
+            _meta.append_row_groups(meta[0])
+
+    # Write merged metadata-only file
+    with open(metapath, "wb") as f:
+        _meta.write_metadata_file(f)
+
+    # Read back the metadata
+    meta = pq.read_metadata(metapath)
+    md = meta.to_dict()
+    _md = _meta.to_dict()
+    for key in _md:
+        if key != 'serialized_size':
+            assert _md[key] == md[key]
+    assert _md['num_columns'] == 3
+    assert _md['num_rows'] == 6
+    assert _md['num_row_groups'] == 2
+    assert _md['serialized_size'] == 0
+    assert md['serialized_size'] > 0
+
+
+def test_metadata_hashing(tempdir):
+    path1 = str(tempdir / "metadata1")
+    schema1 = pa.schema([("a", "int64"), ("b", "float64")])
+    pq.write_metadata(schema1, path1)
+    parquet_meta1 = pq.read_metadata(path1)
+
+    # Same as 1, just different path
+    path2 = str(tempdir / "metadata2")
+    schema2 = pa.schema([("a", "int64"), ("b", "float64")])
+    pq.write_metadata(schema2, path2)
+    parquet_meta2 = pq.read_metadata(path2)
+
+    # different schema
+    path3 = str(tempdir / "metadata3")
+    schema3 = pa.schema([("a", "int64"), ("b", "float32")])
+    pq.write_metadata(schema3, path3)
+    parquet_meta3 = pq.read_metadata(path3)
+
+    # Deterministic
+    assert hash(parquet_meta1) == hash(parquet_meta1)  # equal w/ same instance
+    assert hash(parquet_meta1) == hash(parquet_meta2)  # equal w/ different instance
+
+    # Not the same as other metadata with different schema
+    assert hash(parquet_meta1) != hash(parquet_meta3)
+
+
+@pytest.mark.filterwarnings("ignore:Parquet format:FutureWarning")
+def test_write_metadata(tempdir):
+    path = str(tempdir / "metadata")
+    schema = pa.schema([("a", "int64"), ("b", "float64")])
+
+    # write a pyarrow schema
+    pq.write_metadata(schema, path)
+    parquet_meta = pq.read_metadata(path)
+    schema_as_arrow = parquet_meta.schema.to_arrow_schema()
+    assert schema_as_arrow.equals(schema)
+
+    # ARROW-8980: Check that the ARROW:schema metadata key was removed
+    if schema_as_arrow.metadata:
+        assert b'ARROW:schema' not in schema_as_arrow.metadata
+
+    # pass through writer keyword arguments
+    for version in ["1.0", "2.4", "2.6"]:
+        pq.write_metadata(schema, path, version=version)
+        parquet_meta = pq.read_metadata(path)
+        # The version is stored as a single integer in the Parquet metadata,
+        # so it cannot correctly express dotted format versions
+        expected_version = "1.0" if version == "1.0" else "2.6"
+        assert parquet_meta.format_version == expected_version
+
+    # metadata_collector: list of FileMetaData objects
+    table = pa.table({'a': [1, 2], 'b': [.1, .2]}, schema=schema)
+    pq.write_table(table, tempdir / "data.parquet")
+    parquet_meta = pq.read_metadata(str(tempdir / "data.parquet"))
+    pq.write_metadata(
+        schema, path, metadata_collector=[parquet_meta, parquet_meta]
+    )
+    parquet_meta_mult = pq.read_metadata(path)
+    assert parquet_meta_mult.num_row_groups == 2
+
+    # append metadata with different schema raises an error
+    msg = ("AppendRowGroups requires equal schemas.\n"
+           "The two columns with index 0 differ.")
+    with pytest.raises(RuntimeError, match=msg):
+        pq.write_metadata(
+            pa.schema([("a", "int32"), ("b", "null")]),
+            path, metadata_collector=[parquet_meta, parquet_meta]
+        )
+
+
+def test_table_large_metadata():
+    # ARROW-8694
+    my_schema = pa.schema([pa.field('f0', 'double')],
+                          metadata={'large': 'x' * 10000000})
+
+    table = pa.table([range(10)], schema=my_schema)
+    _check_roundtrip(table)
+
+
+@pytest.mark.pandas
+def test_compare_schemas():
+    df = alltypes_sample(size=10000)
+
+    fileh = make_sample_file(df)
+    fileh2 = make_sample_file(df)
+    fileh3 = make_sample_file(df[df.columns[::2]])
+
+    # ParquetSchema
+    assert isinstance(fileh.schema, pq.ParquetSchema)
+    assert fileh.schema.equals(fileh.schema)
+    assert fileh.schema == fileh.schema
+    assert fileh.schema.equals(fileh2.schema)
+    assert fileh.schema == fileh2.schema
+    assert fileh.schema != 'arbitrary object'
+    assert not fileh.schema.equals(fileh3.schema)
+    assert fileh.schema != fileh3.schema
+
+    # ColumnSchema
+    assert isinstance(fileh.schema[0], pq.ColumnSchema)
+    assert fileh.schema[0].equals(fileh.schema[0])
+    assert fileh.schema[0] == fileh.schema[0]
+    assert not fileh.schema[0].equals(fileh.schema[1])
+    assert fileh.schema[0] != fileh.schema[1]
+    assert fileh.schema[0] != 'arbitrary object'
+
+
+@pytest.mark.pandas
+def test_read_schema(tempdir):
+    N = 100
+    df = pd.DataFrame({
+        'index': np.arange(N),
+        'values': np.random.randn(N)
+    }, columns=['index', 'values'])
+
+    data_path = tempdir / 'test.parquet'
+
+    table = pa.Table.from_pandas(df)
+    _write_table(table, data_path)
+
+    read1 = pq.read_schema(data_path)
+    read2 = pq.read_schema(data_path, memory_map=True)
+    assert table.schema.equals(read1)
+    assert table.schema.equals(read2)
+
+    assert table.schema.metadata[b'pandas'] == read1.metadata[b'pandas']
+
+
+def test_parquet_metadata_empty_to_dict(tempdir):
+    # https://issues.apache.org/jira/browse/ARROW-10146
+    table = pa.table({"a": pa.array([], type="int64")})
+    pq.write_table(table, tempdir / "data.parquet")
+    metadata = pq.read_metadata(tempdir / "data.parquet")
+    # ensure this doesn't error / statistics set to None
+    metadata_dict = metadata.to_dict()
+    assert len(metadata_dict["row_groups"]) == 1
+    assert len(metadata_dict["row_groups"][0]["columns"]) == 1
+    assert metadata_dict["row_groups"][0]["columns"][0]["statistics"] is None
+
+
+@pytest.mark.slow
+@pytest.mark.large_memory
+def test_metadata_exceeds_message_size():
+    # ARROW-13655: Thrift may enable a default message size that limits
+    # the size of Parquet metadata that can be written.
+    NCOLS = 1000
+    NREPEATS = 4000
+
+    table = pa.table({str(i): np.random.randn(10) for i in range(NCOLS)})
+
+    with pa.BufferOutputStream() as out:
+        pq.write_table(table, out)
+        buf = out.getvalue()
+
+    original_metadata = pq.read_metadata(pa.BufferReader(buf))
+    metadata = pq.read_metadata(pa.BufferReader(buf))
+    for i in range(NREPEATS):
+        metadata.append_row_groups(original_metadata)
+
+    with pa.BufferOutputStream() as out:
+        metadata.write_metadata_file(out)
+        buf = out.getvalue()
+
+    metadata = pq.read_metadata(pa.BufferReader(buf))
+
+
+def test_metadata_schema_filesystem(tempdir):
+    table = pa.table({"a": [1, 2, 3]})
+
+    # URI writing to local file.
+    fname = "data.parquet"
+    file_path = str(tempdir / fname)
+    file_uri = 'file:///' + file_path
+
+    pq.write_table(table, file_path)
+
+    # Get expected `metadata` from path.
+    metadata = pq.read_metadata(tempdir / fname)
+    schema = table.schema
+
+    assert pq.read_metadata(file_uri).equals(metadata)
+    assert pq.read_metadata(
+        file_path, filesystem=LocalFileSystem()).equals(metadata)
+    assert pq.read_metadata(
+        fname, filesystem=f'file:///{tempdir}').equals(metadata)
+
+    assert pq.read_schema(file_uri).equals(schema)
+    assert pq.read_schema(
+        file_path, filesystem=LocalFileSystem()).equals(schema)
+    assert pq.read_schema(
+        fname, filesystem=f'file:///{tempdir}').equals(schema)
+
+    with util.change_cwd(tempdir):
+        # Pass `filesystem` arg
+        assert pq.read_metadata(
+            fname, filesystem=LocalFileSystem()).equals(metadata)
+
+        assert pq.read_schema(
+            fname, filesystem=LocalFileSystem()).equals(schema)
+
+
+def test_metadata_equals():
+    table = pa.table({"a": [1, 2, 3]})
+    with pa.BufferOutputStream() as out:
+        pq.write_table(table, out)
+        buf = out.getvalue()
+
+    original_metadata = pq.read_metadata(pa.BufferReader(buf))
+    match = "Argument 'other' has incorrect type"
+    with pytest.raises(TypeError, match=match):
+        original_metadata.equals(None)
+
+
+@pytest.mark.parametrize("t1,t2,expected_error", (
+    ({'col1': range(10)}, {'col1': range(10)}, None),
+    ({'col1': range(10)}, {'col2': range(10)},
+     "The two columns with index 0 differ."),
+    ({'col1': range(10), 'col2': range(10)}, {'col3': range(10)},
+     "This schema has 2 columns, other has 1")
+))
+def test_metadata_append_row_groups_diff(t1, t2, expected_error):
+    table1 = pa.table(t1)
+    table2 = pa.table(t2)
+
+    buf1 = io.BytesIO()
+    buf2 = io.BytesIO()
+    pq.write_table(table1, buf1)
+    pq.write_table(table2, buf2)
+    buf1.seek(0)
+    buf2.seek(0)
+
+    meta1 = pq.ParquetFile(buf1).metadata
+    meta2 = pq.ParquetFile(buf2).metadata
+
+    if expected_error:
+        # Error clearly defines it's happening at append row groups call
+        prefix = "AppendRowGroups requires equal schemas.\n"
+        with pytest.raises(RuntimeError, match=prefix + expected_error):
+            meta1.append_row_groups(meta2)
+    else:
+        meta1.append_row_groups(meta2)
+
+
+@pytest.mark.s3
+def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs):
+    s3_fs, s3_path = s3_example_s3fs
+
+    meta1 = tempdir / "meta1"
+    meta2 = tempdir / "meta2"
+    meta3 = tempdir / "meta3"
+    meta4 = tempdir / "meta4"
+    meta5 = f"{s3_path}/meta5"
+
+    table = pa.table({"col": range(5)})
+
+    # plain local path
+    pq.write_metadata(table.schema, meta1, [])
+
+    # Used the localfilesystem to resolve opening an output stream
+    pq.write_metadata(table.schema, meta2, [], filesystem=LocalFileSystem())
+
+    # Can resolve local file URI
+    pq.write_metadata(table.schema, meta3.as_uri(), [])
+
+    # Take a file-like obj all the way thru?
+    with meta4.open('wb+') as meta4_stream:
+        pq.write_metadata(table.schema, meta4_stream, [])
+
+    # S3FileSystem
+    pq.write_metadata(table.schema, meta5, [], filesystem=s3_fs)
+
+    assert meta1.read_bytes() == meta2.read_bytes() \
+        == meta3.read_bytes() == meta4.read_bytes() \
+        == s3_fs.open(meta5).read()
+
+
+def test_column_chunk_key_value_metadata(parquet_test_datadir):
+    metadata = pq.read_metadata(parquet_test_datadir /
+                                'column_chunk_key_value_metadata.parquet')
+    key_value_metadata1 = metadata.row_group(0).column(0).metadata
+    assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': b''}
+    key_value_metadata2 = metadata.row_group(0).column(1).metadata
+    assert key_value_metadata2 is None
+
+
+def test_internal_class_instantiation():
+    def msg(c):
+        return f"Do not call {c}'s constructor directly"
+
+    with pytest.raises(TypeError, match=msg("Statistics")):
+        pq.Statistics()
+
+    with pytest.raises(TypeError, match=msg("ParquetLogicalType")):
+        pq.ParquetLogicalType()
+
+    with pytest.raises(TypeError, match=msg("ColumnChunkMetaData")):
+        pq.ColumnChunkMetaData()
+
+    with pytest.raises(TypeError, match=msg("RowGroupMetaData")):
+        pq.RowGroupMetaData()
+
+    with pytest.raises(TypeError, match=msg("FileMetaData")):
+        pq.FileMetaData()
diff --git a/pyarrow/tests/parquet/test_pandas.py b/pyarrow/tests/parquet/test_pandas.py
new file mode 100644
index 0000000000000000000000000000000000000000..53864ff15ea2355cc4d2a69a9afad250575698dc
--- /dev/null
+++ b/pyarrow/tests/parquet/test_pandas.py
@@ -0,0 +1,680 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+import json
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+import pytest
+
+import pyarrow as pa
+from pyarrow.fs import LocalFileSystem, SubTreeFileSystem
+from pyarrow.util import guid
+from pyarrow.vendored.version import Version
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
+                                              _write_table)
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe,
+                                              alltypes_sample)
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_custom_metadata(tempdir):
+    df = alltypes_sample(size=10000)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+    assert b'pandas' in arrow_table.schema.metadata
+
+    _write_table(arrow_table, filename)
+
+    metadata = pq.read_metadata(filename).metadata
+    assert b'pandas' in metadata
+
+    js = json.loads(metadata[b'pandas'].decode('utf8'))
+    assert js['index_columns'] == [{'kind': 'range',
+                                    'name': None,
+                                    'start': 0, 'stop': 10000,
+                                    'step': 1}]
+
+
+@pytest.mark.pandas
+def test_merging_parquet_tables_with_different_pandas_metadata(tempdir):
+    # ARROW-3728: Merging Parquet Files - Pandas Meta in Schema Mismatch
+    schema = pa.schema([
+        pa.field('int', pa.int16()),
+        pa.field('float', pa.float32()),
+        pa.field('string', pa.string())
+    ])
+    df1 = pd.DataFrame({
+        'int': np.arange(3, dtype=np.uint8),
+        'float': np.arange(3, dtype=np.float32),
+        'string': ['ABBA', 'EDDA', 'ACDC']
+    })
+    df2 = pd.DataFrame({
+        'int': [4, 5],
+        'float': [1.1, None],
+        'string': [None, None]
+    })
+    table1 = pa.Table.from_pandas(df1, schema=schema, preserve_index=False)
+    table2 = pa.Table.from_pandas(df2, schema=schema, preserve_index=False)
+
+    assert not table1.schema.equals(table2.schema, check_metadata=True)
+    assert table1.schema.equals(table2.schema)
+
+    writer = pq.ParquetWriter(tempdir / 'merged.parquet', schema=schema)
+    writer.write_table(table1)
+    writer.write_table(table2)
+
+
+@pytest.mark.pandas
+def test_attributes_metadata_persistence(tempdir):
+    # GH-45382: Add support for pandas DataFrame.attrs
+    # During the .parquet file writing, the attrs are serialised into json
+    # along with the rest of the pandas.DataFrame metadata.
+
+    filename = tempdir / "metadata_persistence.parquet"
+    df = alltypes_sample(size=10000)
+    df.attrs = {
+        'float16': 'half-precision',
+        'float32': 'single precision',
+        'float64': 'double precision',
+        'desciption': 'Attributes Persistence Test DataFrame',
+    }
+
+    table = pa.Table.from_pandas(df)
+    assert b'attributes' in table.schema.metadata[b'pandas']
+
+    _write_table(table, filename)
+    metadata = pq.read_metadata(filename).metadata
+    js = json.loads(metadata[b'pandas'].decode('utf8'))
+    assert 'attributes' in js
+    assert js['attributes'] == df.attrs
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_column_multiindex(tempdir):
+    df = alltypes_sample(size=10)
+    df.columns = pd.MultiIndex.from_tuples(
+        list(zip(df.columns, df.columns[::-1])),
+        names=['level_1', 'level_2']
+    )
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+    assert arrow_table.schema.pandas_metadata is not None
+
+    _write_table(arrow_table, filename)
+
+    table_read = pq.read_pandas(filename)
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_2_roundtrip_read_pandas_no_index_written(tempdir):
+    df = alltypes_sample(size=10000)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+    js = arrow_table.schema.pandas_metadata
+    assert not js['index_columns']
+    # ARROW-2170
+    # While index_columns should be empty, columns needs to be filled still.
+    assert js['columns']
+
+    _write_table(arrow_table, filename)
+    table_read = pq.read_pandas(filename)
+
+    js = table_read.schema.pandas_metadata
+    assert not js['index_columns']
+
+    read_metadata = table_read.schema.metadata
+    assert arrow_table.schema.metadata == read_metadata
+
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_native_file_roundtrip():
+    df = _test_dataframe(10000)
+    arrow_table = pa.Table.from_pandas(df)
+    imos = pa.BufferOutputStream()
+    _write_table(arrow_table, imos, version='2.6')
+    buf = imos.getvalue()
+    reader = pa.BufferReader(buf)
+    df_read = _read_table(reader).to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_read_pandas_column_subset():
+    df = _test_dataframe(10000)
+    arrow_table = pa.Table.from_pandas(df)
+    imos = pa.BufferOutputStream()
+    _write_table(arrow_table, imos, version='2.6')
+    buf = imos.getvalue()
+    reader = pa.BufferReader(buf)
+    df_read = pq.read_pandas(
+        reader, columns=['strings', 'uint8'],
+    ).to_pandas()
+    tm.assert_frame_equal(df[['strings', 'uint8']], df_read)
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_empty_roundtrip():
+    df = _test_dataframe(0)
+    arrow_table = pa.Table.from_pandas(df)
+    imos = pa.BufferOutputStream()
+    _write_table(arrow_table, imos, version='2.6')
+    buf = imos.getvalue()
+    reader = pa.BufferReader(buf)
+    df_read = _read_table(reader).to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_pandas_can_write_nested_data():
+    data = {
+        "agg_col": [
+            {"page_type": 1},
+            {"record_type": 1},
+            {"non_consecutive_home": 0},
+        ],
+        "uid_first": "1001"
+    }
+    df = pd.DataFrame(data=data)
+    arrow_table = pa.Table.from_pandas(df)
+    imos = pa.BufferOutputStream()
+    # This succeeds under V2
+    _write_table(arrow_table, imos)
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_pyfile_roundtrip(tempdir):
+    filename = tempdir / 'pandas_pyfile_roundtrip.parquet'
+    size = 5
+    df = pd.DataFrame({
+        'int64': np.arange(size, dtype=np.int64),
+        'float32': np.arange(size, dtype=np.float32),
+        'float64': np.arange(size, dtype=np.float64),
+        'bool': np.random.randn(size) > 0,
+        'strings': ['foo', 'bar', None, 'baz', 'qux']
+    })
+
+    arrow_table = pa.Table.from_pandas(df)
+
+    with filename.open('wb') as f:
+        _write_table(arrow_table, f, version="2.6")
+
+    data = io.BytesIO(filename.read_bytes())
+
+    table_read = _read_table(data)
+    df_read = table_read.to_pandas()
+    tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_pandas_parquet_configuration_options(tempdir):
+    size = 10000
+    np.random.seed(0)
+    df = pd.DataFrame({
+        'uint8': np.arange(size, dtype=np.uint8),
+        'uint16': np.arange(size, dtype=np.uint16),
+        'uint32': np.arange(size, dtype=np.uint32),
+        'uint64': np.arange(size, dtype=np.uint64),
+        'int8': np.arange(size, dtype=np.int16),
+        'int16': np.arange(size, dtype=np.int16),
+        'int32': np.arange(size, dtype=np.int32),
+        'int64': np.arange(size, dtype=np.int64),
+        'float32': np.arange(size, dtype=np.float32),
+        'float64': np.arange(size, dtype=np.float64),
+        'bool': np.random.randn(size) > 0
+    })
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+
+    for use_dictionary in [True, False]:
+        _write_table(arrow_table, filename, version='2.6',
+                     use_dictionary=use_dictionary)
+        table_read = _read_table(filename)
+        df_read = table_read.to_pandas()
+        tm.assert_frame_equal(df, df_read)
+
+    for write_statistics in [True, False]:
+        _write_table(arrow_table, filename, version='2.6',
+                     write_statistics=write_statistics)
+        table_read = _read_table(filename)
+        df_read = table_read.to_pandas()
+        tm.assert_frame_equal(df, df_read)
+
+    for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']:
+        if (compression != 'NONE' and
+                not pa.lib.Codec.is_available(compression)):
+            continue
+        _write_table(arrow_table, filename, version='2.6',
+                     compression=compression)
+        table_read = _read_table(filename)
+        df_read = table_read.to_pandas()
+        tm.assert_frame_equal(df, df_read)
+
+
+@pytest.mark.pandas
+def test_spark_flavor_preserves_pandas_metadata():
+    df = _test_dataframe(size=100)
+    df.index = np.arange(0, 10 * len(df), 10)
+    df.index.name = 'foo'
+
+    result = _roundtrip_pandas_dataframe(df, {'flavor': 'spark'})
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+def test_index_column_name_duplicate(tempdir):
+    data = {
+        'close': {
+            pd.Timestamp('2017-06-30 01:31:00'): 154.99958999999998,
+            pd.Timestamp('2017-06-30 01:32:00'): 154.99958999999998,
+        },
+        'time': {
+            pd.Timestamp('2017-06-30 01:31:00'): pd.Timestamp(
+                '2017-06-30 01:31:00'
+            ),
+            pd.Timestamp('2017-06-30 01:32:00'): pd.Timestamp(
+                '2017-06-30 01:32:00'
+            ),
+        }
+    }
+    path = str(tempdir / 'data.parquet')
+
+    # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units
+    # so we need to cast the pandas dtype. Pandas v1 will always silently
+    # coerce to [ns] due to lack of non-[ns] support.
+    dfx = pd.DataFrame(data, dtype='datetime64[us]').set_index('time', drop=False)
+
+    tdfx = pa.Table.from_pandas(dfx)
+    _write_table(tdfx, path)
+    arrow_table = _read_table(path)
+    result_df = arrow_table.to_pandas()
+    tm.assert_frame_equal(result_df, dfx)
+
+
+@pytest.mark.pandas
+def test_multiindex_duplicate_values(tempdir):
+    num_rows = 3
+    numbers = list(range(num_rows))
+    index = pd.MultiIndex.from_arrays(
+        [['foo', 'foo', 'bar'], numbers],
+        names=['foobar', 'some_numbers'],
+    )
+
+    df = pd.DataFrame({'numbers': numbers}, index=index)
+    table = pa.Table.from_pandas(df)
+
+    filename = tempdir / 'dup_multi_index_levels.parquet'
+
+    _write_table(table, filename)
+    result_table = _read_table(filename)
+    assert table.equals(result_table)
+
+    result_df = result_table.to_pandas()
+    tm.assert_frame_equal(result_df, df)
+
+
+@pytest.mark.pandas
+def test_backwards_compatible_index_naming(datadir):
+    expected_string = b"""\
+carat        cut  color  clarity  depth  table  price     x     y     z
+ 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+ 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+ 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+ 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+ 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+ 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+ 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+ 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+ 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+ 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+    expected = pd.read_csv(io.BytesIO(expected_string), sep=r'\s{2,}',
+                           index_col=None, header=0, engine='python')
+    table = _read_table(datadir / 'v0.7.1.parquet')
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.pandas
+def test_backwards_compatible_index_multi_level_named(datadir):
+    expected_string = b"""\
+carat        cut  color  clarity  depth  table  price     x     y     z
+ 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+ 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+ 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+ 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+ 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+ 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+ 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+ 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+ 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+ 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+    expected = pd.read_csv(
+        io.BytesIO(expected_string), sep=r'\s{2,}',
+        index_col=['cut', 'color', 'clarity'],
+        header=0, engine='python'
+    ).sort_index()
+
+    table = _read_table(datadir / 'v0.7.1.all-named-index.parquet')
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.pandas
+def test_backwards_compatible_index_multi_level_some_named(datadir):
+    expected_string = b"""\
+carat        cut  color  clarity  depth  table  price     x     y     z
+ 0.23      Ideal      E      SI2   61.5   55.0    326  3.95  3.98  2.43
+ 0.21    Premium      E      SI1   59.8   61.0    326  3.89  3.84  2.31
+ 0.23       Good      E      VS1   56.9   65.0    327  4.05  4.07  2.31
+ 0.29    Premium      I      VS2   62.4   58.0    334  4.20  4.23  2.63
+ 0.31       Good      J      SI2   63.3   58.0    335  4.34  4.35  2.75
+ 0.24  Very Good      J     VVS2   62.8   57.0    336  3.94  3.96  2.48
+ 0.24  Very Good      I     VVS1   62.3   57.0    336  3.95  3.98  2.47
+ 0.26  Very Good      H      SI1   61.9   55.0    337  4.07  4.11  2.53
+ 0.22       Fair      E      VS2   65.1   61.0    337  3.87  3.78  2.49
+ 0.23  Very Good      H      VS1   59.4   61.0    338  4.00  4.05  2.39"""
+    expected = pd.read_csv(
+        io.BytesIO(expected_string),
+        sep=r'\s{2,}', index_col=['cut', 'color', 'clarity'],
+        header=0, engine='python'
+    ).sort_index()
+    expected.index = expected.index.set_names(['cut', None, 'clarity'])
+
+    table = _read_table(datadir / 'v0.7.1.some-named-index.parquet')
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.pandas
+def test_backwards_compatible_column_metadata_handling(datadir):
+    if Version("2.2.0") <= Version(pd.__version__):
+        # TODO: regression in pandas
+        # https://github.com/pandas-dev/pandas/issues/56775
+        pytest.skip("Regression in pandas 2.2.0")
+    expected = pd.DataFrame(
+        {'a': [1, 2, 3], 'b': [.1, .2, .3],
+         'c': pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')})
+    expected.index = pd.MultiIndex.from_arrays(
+        [['a', 'b', 'c'],
+         pd.date_range("2017-01-01", periods=3, tz='Europe/Brussels')],
+        names=['index', None])
+
+    path = datadir / 'v0.7.1.column-metadata-handling.parquet'
+    table = _read_table(path)
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, expected)
+
+    table = _read_table(
+        path, columns=['a'])
+    result = table.to_pandas()
+    tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True))
+
+
+@pytest.mark.pandas
+def test_categorical_index_survives_roundtrip():
+    # ARROW-3652, addressed by ARROW-3246
+    df = pd.DataFrame([['a', 'b'], ['c', 'd']], columns=['c1', 'c2'])
+    df['c1'] = df['c1'].astype('category')
+    df = df.set_index(['c1'])
+
+    table = pa.Table.from_pandas(df)
+    bos = pa.BufferOutputStream()
+    pq.write_table(table, bos)
+    ref_df = pq.read_pandas(bos.getvalue()).to_pandas()
+    assert isinstance(ref_df.index, pd.CategoricalIndex)
+    assert ref_df.index.equals(df.index)
+
+
+@pytest.mark.pandas
+def test_categorical_order_survives_roundtrip():
+    # ARROW-6302
+    df = pd.DataFrame({"a": pd.Categorical(
+        ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=True)})
+
+    table = pa.Table.from_pandas(df)
+    bos = pa.BufferOutputStream()
+    pq.write_table(table, bos)
+
+    contents = bos.getvalue()
+    result = pq.read_pandas(contents).to_pandas()
+
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+def test_pandas_categorical_na_type_row_groups():
+    # ARROW-5085
+    df = pd.DataFrame({"col": [None] * 100, "int": [1.0] * 100})
+    df_category = df.astype({"col": "category", "int": "category"})
+    table = pa.Table.from_pandas(df)
+    table_cat = pa.Table.from_pandas(df_category)
+    buf = pa.BufferOutputStream()
+
+    # it works
+    pq.write_table(table_cat, buf, version='2.6', chunk_size=10)
+    result = pq.read_table(buf.getvalue())
+
+    # Result is non-categorical
+    assert result[0].equals(table[0])
+    assert result[1].equals(table[1])
+
+
+@pytest.mark.pandas
+def test_pandas_categorical_roundtrip():
+    # ARROW-5480, this was enabled by ARROW-3246
+
+    # Have one of the categories unobserved and include a null (-1)
+    codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
+    categories = ['foo', 'bar', 'baz']
+    df = pd.DataFrame({'x': pd.Categorical.from_codes(
+        codes, categories=categories)})
+
+    buf = pa.BufferOutputStream()
+    pq.write_table(pa.table(df), buf)
+
+    result = pq.read_table(buf.getvalue()).to_pandas()
+    assert result.x.dtype == 'category'
+    assert (result.x.cat.categories == categories).all()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+def test_categories_with_string_pyarrow_dtype(tempdir):
+    # gh-33727: writing to parquet should not fail
+    if Version(pd.__version__) < Version("1.3.0"):
+        pytest.skip("PyArrow backed string data type introduced in pandas 1.3.0")
+
+    df1 = pd.DataFrame({"x": ["foo", "bar", "foo"]}, dtype="string[pyarrow]")
+    df1 = df1.astype("category")
+
+    df2 = pd.DataFrame({"x": ["foo", "bar", "foo"]})
+    df2 = df2.astype("category")
+
+    # categories should be converted to pa.Array
+    assert pa.array(df1["x"]).to_pylist() == pa.array(df2["x"]).to_pylist()
+    assert pa.array(df1["x"].cat.categories.values).to_pylist() == pa.array(
+        df2["x"].cat.categories.values).to_pylist()
+
+    path = str(tempdir / 'cat.parquet')
+    pq.write_table(pa.table(df1), path)
+    result = pq.read_table(path).to_pandas()
+
+    tm.assert_frame_equal(result, df2)
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir):
+    df = pd.DataFrame({'part': 'a', "col": [1, 2, 3]})
+    df['col'] = df['col'].astype("Int64")
+    table = pa.table(df)
+
+    pq.write_to_dataset(
+        table, str(tempdir / "case1"), partition_cols=['part'],
+    )
+    result = pq.read_table(str(tempdir / "case1")).to_pandas()
+    tm.assert_frame_equal(result[["col"]], df[["col"]])
+
+    pq.write_to_dataset(table, str(tempdir / "case2"))
+    result = pq.read_table(str(tempdir / "case2")).to_pandas()
+    tm.assert_frame_equal(result[["col"]], df[["col"]])
+
+    pq.write_table(table, str(tempdir / "data.parquet"))
+    result = pq.read_table(str(tempdir / "data.parquet")).to_pandas()
+    tm.assert_frame_equal(result[["col"]], df[["col"]])
+
+
+@pytest.mark.pandas
+def test_write_to_dataset_pandas_preserve_index(tempdir):
+    # ARROW-8251 - preserve pandas index in roundtrip
+
+    df = pd.DataFrame({'part': ['a', 'a', 'b'], "col": [1, 2, 3]})
+    df.index = pd.Index(['a', 'b', 'c'], name="idx")
+    table = pa.table(df)
+    df_cat = df[["col", "part"]].copy()
+    df_cat["part"] = df_cat["part"].astype("category")
+
+    pq.write_to_dataset(
+        table, str(tempdir / "case1"), partition_cols=['part'],
+    )
+    result = pq.read_table(str(tempdir / "case1")).to_pandas()
+    tm.assert_frame_equal(result, df_cat)
+
+    pq.write_to_dataset(table, str(tempdir / "case2"))
+    result = pq.read_table(str(tempdir / "case2")).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+    pq.write_table(table, str(tempdir / "data.parquet"))
+    result = pq.read_table(str(tempdir / "data.parquet")).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('preserve_index', [True, False, None])
+@pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
+def test_dataset_read_pandas_common_metadata(
+    tempdir, preserve_index, metadata_fname
+):
+    # ARROW-1103
+    nfiles = 5
+    size = 5
+
+    dirpath = tempdir / guid()
+    dirpath.mkdir()
+
+    test_data = []
+    frames = []
+    paths = []
+    for i in range(nfiles):
+        df = _test_dataframe(size, seed=i)
+        df.index = pd.Index(
+            np.arange(i * size, (i + 1) * size, dtype="int64"), name='index'
+        )
+
+        path = dirpath / f'{i}.parquet'
+
+        table = pa.Table.from_pandas(df, preserve_index=preserve_index)
+
+        # Obliterate metadata
+        table = table.replace_schema_metadata(None)
+        assert table.schema.metadata is None
+
+        _write_table(table, path)
+        test_data.append(table)
+        frames.append(df)
+        paths.append(path)
+
+    # Write _metadata common file
+    table_for_metadata = pa.Table.from_pandas(
+        df, preserve_index=preserve_index
+    )
+    pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
+
+    dataset = pq.ParquetDataset(dirpath)
+    columns = ['uint8', 'strings']
+    result = dataset.read_pandas(columns=columns).to_pandas()
+    expected = pd.concat([x[columns] for x in frames])
+    expected.index.name = (
+        df.index.name if preserve_index is not False else None)
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.pandas
+def test_read_pandas_passthrough_keywords(tempdir):
+    # ARROW-11464 - previously not all keywords were passed through (such as
+    # the filesystem keyword)
+    df = pd.DataFrame({'a': [1, 2, 3]})
+
+    filename = tempdir / 'data.parquet'
+    _write_table(df, filename)
+
+    result = pq.read_pandas(
+        'data.parquet',
+        filesystem=SubTreeFileSystem(str(tempdir), LocalFileSystem())
+    )
+    assert result.equals(pa.table(df))
+
+
+@pytest.mark.pandas
+def test_read_pandas_map_fields(tempdir):
+    # ARROW-10140 - table created from Pandas with mapping fields
+    df = pd.DataFrame({
+        'col1': pd.Series([
+            [('id', 'something'), ('value2', 'else')],
+            [('id', 'something2'), ('value', 'else2')],
+        ]),
+        'col2': pd.Series(['foo', 'bar'])
+    })
+
+    filename = tempdir / 'data.parquet'
+
+    udt = pa.map_(pa.string(), pa.string())
+    schema = pa.schema([pa.field('col1', udt), pa.field('col2', pa.string())])
+    arrow_table = pa.Table.from_pandas(df, schema)
+
+    _write_table(arrow_table, filename)
+
+    result = pq.read_pandas(filename).to_pandas()
+    tm.assert_frame_equal(result, df)
diff --git a/pyarrow/tests/parquet/test_parquet_file.py b/pyarrow/tests/parquet/test_parquet_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..a62b5c3298c9ceb95e9fd62a462828f8a7155843
--- /dev/null
+++ b/pyarrow/tests/parquet/test_parquet_file.py
@@ -0,0 +1,447 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import io
+import os
+import re
+import sys
+import types
+
+import pytest
+from unittest import mock
+
+import pyarrow as pa
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import _write_table
+except ImportError:
+    pq = None
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+    from pyarrow.tests.parquet.common import alltypes_sample
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+@pytest.mark.pandas
+def test_pass_separate_metadata():
+    # ARROW-471
+    df = alltypes_sample(size=10000)
+
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, compression='snappy', version='2.6')
+
+    buf.seek(0)
+    metadata = pq.read_metadata(buf)
+
+    buf.seek(0)
+
+    fileh = pq.ParquetFile(buf, metadata=metadata)
+
+    tm.assert_frame_equal(df, fileh.read().to_pandas())
+
+
+@pytest.mark.pandas
+def test_read_single_row_group():
+    # ARROW-471
+    N, K = 10000, 4
+    df = alltypes_sample(size=N)
+
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.6')
+
+    buf.seek(0)
+
+    pf = pq.ParquetFile(buf)
+
+    assert pf.num_row_groups == K
+
+    row_groups = [pf.read_row_group(i) for i in range(K)]
+    result = pa.concat_tables(row_groups)
+    tm.assert_frame_equal(df, result.to_pandas())
+
+
+@pytest.mark.pandas
+def test_read_single_row_group_with_column_subset():
+    N, K = 10000, 4
+    df = alltypes_sample(size=N)
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.6')
+
+    buf.seek(0)
+    pf = pq.ParquetFile(buf)
+
+    cols = list(df.columns[:2])
+    row_groups = [pf.read_row_group(i, columns=cols) for i in range(K)]
+    result = pa.concat_tables(row_groups)
+    tm.assert_frame_equal(df[cols], result.to_pandas())
+
+    # ARROW-4267: Selection of duplicate columns still leads to these columns
+    # being read uniquely.
+    row_groups = [pf.read_row_group(i, columns=cols + cols) for i in range(K)]
+    result = pa.concat_tables(row_groups)
+    tm.assert_frame_equal(df[cols], result.to_pandas())
+
+
+@pytest.mark.pandas
+def test_read_multiple_row_groups():
+    N, K = 10000, 4
+    df = alltypes_sample(size=N)
+
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.6')
+
+    buf.seek(0)
+
+    pf = pq.ParquetFile(buf)
+
+    assert pf.num_row_groups == K
+
+    result = pf.read_row_groups(range(K))
+    tm.assert_frame_equal(df, result.to_pandas())
+
+
+@pytest.mark.pandas
+def test_read_multiple_row_groups_with_column_subset():
+    N, K = 10000, 4
+    df = alltypes_sample(size=N)
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.6')
+
+    buf.seek(0)
+    pf = pq.ParquetFile(buf)
+
+    cols = list(df.columns[:2])
+    result = pf.read_row_groups(range(K), columns=cols)
+    tm.assert_frame_equal(df[cols], result.to_pandas())
+
+    # ARROW-4267: Selection of duplicate columns still leads to these columns
+    # being read uniquely.
+    result = pf.read_row_groups(range(K), columns=cols + cols)
+    tm.assert_frame_equal(df[cols], result.to_pandas())
+
+
+@pytest.mark.pandas
+def test_scan_contents():
+    N, K = 10000, 4
+    df = alltypes_sample(size=N)
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.6')
+
+    buf.seek(0)
+    pf = pq.ParquetFile(buf)
+
+    assert pf.scan_contents() == 10000
+    assert pf.scan_contents(df.columns[:4]) == 10000
+
+
+def test_parquet_file_pass_directory_instead_of_file(tempdir):
+    # ARROW-7208
+    path = tempdir / 'directory'
+    os.mkdir(str(path))
+
+    msg = f"Cannot open for reading: path '{str(path)}' is a directory"
+    with pytest.raises(IOError) as exc:
+        pq.ParquetFile(path)
+    if exc.errisinstance(PermissionError) and sys.platform == 'win32':
+        return  # Windows CI can get a PermissionError here.
+    exc.match(msg)
+
+
+def test_read_column_invalid_index():
+    table = pa.table([pa.array([4, 5]), pa.array(["foo", "bar"])],
+                     names=['ints', 'strs'])
+    bio = pa.BufferOutputStream()
+    pq.write_table(table, bio)
+    f = pq.ParquetFile(bio.getvalue())
+    assert f.reader.read_column(0).to_pylist() == [4, 5]
+    assert f.reader.read_column(1).to_pylist() == ["foo", "bar"]
+    for index in (-1, 2):
+        with pytest.raises((ValueError, IndexError)):
+            f.reader.read_column(index)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('batch_size', [300, 1000, 1300])
+def test_iter_batches_columns_reader(tempdir, batch_size):
+    total_size = 3000
+    chunk_size = 1000
+    # TODO: Add categorical support
+    df = alltypes_sample(size=total_size)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+    _write_table(arrow_table, filename, version='2.6',
+                 chunk_size=chunk_size)
+
+    file_ = pq.ParquetFile(filename)
+    for columns in [df.columns[:10], df.columns[10:]]:
+        batches = file_.iter_batches(batch_size=batch_size, columns=columns)
+        batch_starts = range(0, total_size+batch_size, batch_size)
+        for batch, start in zip(batches, batch_starts):
+            end = min(total_size, start + batch_size)
+            tm.assert_frame_equal(
+                batch.to_pandas(),
+                df.iloc[start:end, :].loc[:, columns].reset_index(drop=True)
+            )
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('chunk_size', [1000])
+def test_iter_batches_reader(tempdir, chunk_size):
+    df = alltypes_sample(size=10000, categorical=True)
+
+    filename = tempdir / 'pandas_roundtrip.parquet'
+    arrow_table = pa.Table.from_pandas(df)
+    assert arrow_table.schema.pandas_metadata is not None
+
+    _write_table(arrow_table, filename, version='2.6',
+                 chunk_size=chunk_size)
+
+    file_ = pq.ParquetFile(filename)
+
+    def get_all_batches(f):
+        for row_group in range(f.num_row_groups):
+            batches = f.iter_batches(
+                batch_size=900,
+                row_groups=[row_group],
+            )
+
+            for batch in batches:
+                yield batch
+
+    batches = list(get_all_batches(file_))
+    batch_no = 0
+
+    for i in range(file_.num_row_groups):
+        tm.assert_frame_equal(
+            batches[batch_no].to_pandas(),
+            file_.read_row_groups([i]).to_pandas().head(900)
+        )
+
+        batch_no += 1
+
+        tm.assert_frame_equal(
+            batches[batch_no].to_pandas().reset_index(drop=True),
+            file_.read_row_groups([i]).to_pandas().iloc[900:].reset_index(
+                drop=True
+            )
+        )
+
+        batch_no += 1
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize('pre_buffer', [False, True])
+def test_pre_buffer(pre_buffer):
+    N, K = 10000, 4
+    df = alltypes_sample(size=N)
+    a_table = pa.Table.from_pandas(df)
+
+    buf = io.BytesIO()
+    _write_table(a_table, buf, row_group_size=N / K,
+                 compression='snappy', version='2.6')
+
+    buf.seek(0)
+    pf = pq.ParquetFile(buf, pre_buffer=pre_buffer)
+    assert pf.read().num_rows == N
+
+
+def test_parquet_file_explicitly_closed(tempdir):
+    """
+    Unopened files should be closed explicitly after use,
+    and previously opened files should be left open.
+    Applies to read_table, ParquetDataset, and ParquetFile
+    """
+    # create test parquet file
+    fn = tempdir.joinpath('file.parquet')
+    table = pa.table({'col1': [0, 1], 'col2': [0, 1]})
+    pq.write_table(table, fn)
+
+    # ParquetFile with opened file (will leave open)
+    with open(fn, 'rb') as f:
+        with pq.ParquetFile(f) as p:
+            p.read()
+            assert not f.closed
+            assert not p.closed
+        assert not f.closed  # opened input file was not closed
+        assert not p.closed  # parquet file obj reports as not closed
+    assert f.closed
+    assert p.closed  # parquet file being closed reflects underlying file
+
+    # ParquetFile with unopened file (will close)
+    with pq.ParquetFile(fn) as p:
+        p.read()
+        assert not p.closed
+    assert p.closed  # parquet file obj reports as closed
+
+
+@pytest.mark.s3
+@pytest.mark.parametrize("use_uri", (True, False))
+def test_parquet_file_with_filesystem(s3_example_fs, use_uri):
+    s3_fs, s3_uri, s3_path = s3_example_fs
+
+    args = (s3_uri if use_uri else s3_path,)
+    kwargs = {} if use_uri else dict(filesystem=s3_fs)
+
+    table = pa.table({"a": range(10)})
+    pq.write_table(table, s3_path, filesystem=s3_fs)
+
+    parquet_file = pq.ParquetFile(*args, **kwargs)
+    assert parquet_file.read() == table
+    assert not parquet_file.closed
+    parquet_file.close()
+    assert parquet_file.closed
+
+    with pq.ParquetFile(*args, **kwargs) as f:
+        assert f.read() == table
+        assert not f.closed
+    assert f.closed
+
+
+def test_read_statistics():
+    table = pa.table({"value": pa.array([-1, None, 3])})
+    buf = io.BytesIO()
+    _write_table(table, buf)
+    buf.seek(0)
+
+    statistics = pq.ParquetFile(buf).read().columns[0].chunks[0].statistics
+    assert statistics.is_null_count_exact is True
+    assert statistics.null_count == 1
+    assert statistics.distinct_count is None
+    # TODO: add tests for is_distinct_count_exact == None and True
+    # once Python API allows
+    assert statistics.is_distinct_count_exact is False
+    assert statistics.min == -1
+    assert statistics.is_min_exact
+    assert statistics.max == 3
+    assert statistics.is_max_exact
+    assert repr(statistics) == ("arrow.ArrayStatistics<"
+                                "null_count=1, distinct_count=None, "
+                                "min=-1, is_min_exact=True, "
+                                "max=3, is_max_exact=True>")
+
+
+def test_read_undefined_logical_type(parquet_test_datadir):
+    test_file = f"{parquet_test_datadir}/unknown-logical-type.parquet"
+
+    table = pq.ParquetFile(test_file).read()
+    assert table.column_names == ["column with known type", "column with unknown type"]
+    assert table["column with unknown type"].to_pylist() == [
+        b"unknown string 1",
+        b"unknown string 2",
+        b"unknown string 3"
+    ]
+
+
+def test_parquet_file_fsspec_support():
+    pytest.importorskip("fsspec")
+
+    table = pa.table({"a": range(10)})
+    pq.write_table(table, "fsspec+memory://example.parquet")
+    table2 = pq.read_table("fsspec+memory://example.parquet")
+    assert table.equals(table2)
+
+    msg = "Unrecognized filesystem type in URI"
+    with pytest.raises(pa.ArrowInvalid, match=msg):
+        pq.read_table("non-existing://example.parquet")
+
+
+def test_parquet_file_fsspec_support_through_filesystem_argument():
+    try:
+        from fsspec.implementations.memory import MemoryFileSystem
+    except ImportError:
+        pytest.skip("fsspec is not installed, skipping test")
+
+    table = pa.table({"b": range(10)})
+
+    fs = MemoryFileSystem()
+    fs.mkdir("/path/to/prefix", create_parents=True)
+    assert fs.exists("/path/to/prefix")
+
+    fs_str = "fsspec+memory://path/to/prefix"
+    pq.write_table(table, "b.parquet", filesystem=fs_str)
+    table2 = pq.read_table("fsspec+memory://path/to/prefix/b.parquet")
+    assert table.equals(table2)
+
+
+def test_parquet_file_hugginface_support():
+    try:
+        from fsspec.implementations.memory import MemoryFileSystem
+    except ImportError:
+        pytest.skip("fsspec is not installed, skipping Hugging Face test")
+
+    fake_hf_module = types.ModuleType("huggingface_hub")
+    fake_hf_module.HfFileSystem = MemoryFileSystem
+    with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}):
+        uri = "hf://datasets/apache/arrow/test.parquet"
+        table = pa.table({"a": range(10)})
+        pq.write_table(table, uri)
+        table2 = pq.read_table(uri)
+        assert table.equals(table2)
+
+
+def test_fsspec_uri_raises_if_fsspec_is_not_available():
+    # sadly cannot patch sys.modules because cython will still be able to import fsspec
+    try:
+        import fsspec  # noqa: F401
+    except ImportError:
+        pass
+    else:
+        pytest.skip("fsspec is available, skipping test")
+
+    msg = re.escape(
+        "`fsspec` is required to handle `fsspec+<filesystem>://` and `hf://` URIs.")
+    with pytest.raises(ImportError, match=msg):
+        pq.read_table("fsspec+memory://example.parquet")
+
+
+def test_iter_batches_raises_batch_size_zero(tempdir):
+    # See https://github.com/apache/arrow/issues/46811
+    schema = pa.schema([])
+    empty_table = pa.Table.from_batches([], schema=schema)
+    parquet_file_path = tempdir / "empty_file.parquet"
+    pq.write_table(empty_table, parquet_file_path)
+    parquet_file = pq.ParquetFile(parquet_file_path)
+    with pytest.raises(ValueError):
+        parquet_file.iter_batches(batch_size=0)
diff --git a/pyarrow/tests/parquet/test_parquet_writer.py b/pyarrow/tests/parquet/test_parquet_writer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a49441f09f45455f242c836ead3305af98f01d35
--- /dev/null
+++ b/pyarrow/tests/parquet/test_parquet_writer.py
@@ -0,0 +1,546 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import pyarrow as pa
+from pyarrow import fs
+
+try:
+    import pyarrow.parquet as pq
+    from pyarrow.tests.parquet.common import (_read_table, _test_dataframe,
+                                              _test_table, _range_integers)
+except ImportError:
+    pq = None
+
+
+try:
+    import pandas as pd
+    import pandas.testing as tm
+
+except ImportError:
+    pd = tm = None
+
+
+# Marks all of the tests in this module
+# Ignore these with pytest ... -m 'not parquet'
+pytestmark = pytest.mark.parquet
+
+
+@pytest.mark.pandas
+def test_parquet_incremental_file_build(tempdir):
+    df = _test_dataframe(100)
+    df['unique_id'] = 0
+
+    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+    out = pa.BufferOutputStream()
+
+    writer = pq.ParquetWriter(out, arrow_table.schema, version='2.6')
+
+    frames = []
+    for i in range(10):
+        df['unique_id'] = i
+        arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+        writer.write_table(arrow_table)
+
+        frames.append(df.copy())
+
+    writer.close()
+
+    buf = out.getvalue()
+    result = _read_table(pa.BufferReader(buf))
+
+    expected = pd.concat(frames, ignore_index=True)
+    tm.assert_frame_equal(result.to_pandas(), expected)
+
+
+def test_validate_schema_write_table(tempdir):
+    # ARROW-2926
+    simple_fields = [
+        pa.field('POS', pa.uint32()),
+        pa.field('desc', pa.string())
+    ]
+
+    simple_schema = pa.schema(simple_fields)
+
+    # simple_table schema does not match simple_schema
+    simple_from_array = [pa.array([1]), pa.array(['bla'])]
+    simple_table = pa.Table.from_arrays(simple_from_array, ['POS', 'desc'])
+
+    path = tempdir / 'simple_validate_schema.parquet'
+
+    with pq.ParquetWriter(path, simple_schema,
+                          version='2.6',
+                          compression='snappy', flavor='spark') as w:
+        with pytest.raises(ValueError):
+            w.write_table(simple_table)
+
+
+def test_parquet_invalid_writer(tempdir):
+    # avoid segfaults with invalid construction
+    with pytest.raises(TypeError):
+        some_schema = pa.schema([pa.field("x", pa.int32())])
+        pq.ParquetWriter(None, some_schema)
+
+    with pytest.raises(TypeError):
+        pq.ParquetWriter(tempdir / "some_path", None)
+
+
+@pytest.mark.pandas
+def test_parquet_writer_context_obj(tempdir):
+    df = _test_dataframe(100)
+    df['unique_id'] = 0
+
+    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+    out = pa.BufferOutputStream()
+
+    with pq.ParquetWriter(out, arrow_table.schema, version='2.6') as writer:
+
+        frames = []
+        for i in range(10):
+            df['unique_id'] = i
+            arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+            writer.write_table(arrow_table)
+
+            frames.append(df.copy())
+
+    buf = out.getvalue()
+    result = _read_table(pa.BufferReader(buf))
+
+    expected = pd.concat(frames, ignore_index=True)
+    tm.assert_frame_equal(result.to_pandas(), expected)
+
+
+@pytest.mark.pandas
+def test_parquet_writer_context_obj_with_exception(tempdir):
+    df = _test_dataframe(100)
+    df['unique_id'] = 0
+
+    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+    out = pa.BufferOutputStream()
+    error_text = 'Artificial Error'
+
+    try:
+        with pq.ParquetWriter(out,
+                              arrow_table.schema,
+                              version='2.6') as writer:
+
+            frames = []
+            for i in range(10):
+                df['unique_id'] = i
+                arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+                writer.write_table(arrow_table)
+                frames.append(df.copy())
+                if i == 5:
+                    raise ValueError(error_text)
+    except Exception as e:
+        assert str(e) == error_text
+
+    buf = out.getvalue()
+    result = _read_table(pa.BufferReader(buf))
+
+    expected = pd.concat(frames, ignore_index=True)
+    tm.assert_frame_equal(result.to_pandas(), expected)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize("filesystem", [
+    None,
+    fs.LocalFileSystem(),
+])
+def test_parquet_writer_write_wrappers(tempdir, filesystem):
+    df = _test_dataframe(100)
+    table = pa.Table.from_pandas(df, preserve_index=False)
+    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
+    path_table = str(tempdir / 'data_table.parquet')
+    path_batch = str(tempdir / 'data_batch.parquet')
+
+    with pq.ParquetWriter(
+        path_table, table.schema, filesystem=filesystem, version='2.6'
+    ) as writer:
+        writer.write_table(table)
+
+    result = _read_table(path_table).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+    with pq.ParquetWriter(
+        path_batch, table.schema, filesystem=filesystem, version='2.6'
+    ) as writer:
+        writer.write_batch(batch)
+
+    result = _read_table(path_batch).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+    with pq.ParquetWriter(
+        path_table, table.schema, filesystem=filesystem, version='2.6'
+    ) as writer:
+        writer.write(table)
+
+    result = _read_table(path_table).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+    with pq.ParquetWriter(
+        path_batch, table.schema, filesystem=filesystem, version='2.6'
+    ) as writer:
+        writer.write(batch)
+
+    result = _read_table(path_batch).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.large_memory
+@pytest.mark.pandas
+def test_parquet_writer_chunk_size(tempdir):
+    default_chunk_size = 1024 * 1024
+    abs_max_chunk_size = 64 * 1024 * 1024
+
+    def check_chunk_size(data_size, chunk_size, expect_num_chunks):
+        table = pa.Table.from_arrays([
+            _range_integers(data_size, 'b')
+        ], names=['x'])
+        if chunk_size is None:
+            pq.write_table(table, tempdir / 'test.parquet')
+        else:
+            pq.write_table(table, tempdir / 'test.parquet', row_group_size=chunk_size)
+        metadata = pq.read_metadata(tempdir / 'test.parquet')
+        expected_chunk_size = default_chunk_size if chunk_size is None else chunk_size
+        assert metadata.num_row_groups == expect_num_chunks
+        latched_chunk_size = min(expected_chunk_size, abs_max_chunk_size)
+        # First chunks should be full size
+        for chunk_idx in range(expect_num_chunks - 1):
+            assert metadata.row_group(chunk_idx).num_rows == latched_chunk_size
+        # Last chunk may be smaller
+        remainder = data_size - (expected_chunk_size * (expect_num_chunks - 1))
+        if remainder == 0:
+            assert metadata.row_group(
+                expect_num_chunks - 1).num_rows == latched_chunk_size
+        else:
+            assert metadata.row_group(expect_num_chunks - 1).num_rows == remainder
+
+    check_chunk_size(default_chunk_size * 2, default_chunk_size - 100, 3)
+    check_chunk_size(default_chunk_size * 2, default_chunk_size, 2)
+    check_chunk_size(default_chunk_size * 2, default_chunk_size + 100, 2)
+    check_chunk_size(default_chunk_size + 100, default_chunk_size + 100, 1)
+    # Even though the chunk size requested is large enough it will be capped
+    # by the absolute max chunk size
+    check_chunk_size(abs_max_chunk_size * 2, abs_max_chunk_size * 2, 2)
+
+    # These tests don't pass a chunk_size to write_table and so the chunk size
+    # should be default_chunk_size
+    check_chunk_size(default_chunk_size, None, 1)
+    check_chunk_size(default_chunk_size + 1, None, 2)
+
+
+@pytest.mark.pandas
+@pytest.mark.parametrize("filesystem", [
+    None,
+    fs.LocalFileSystem(),
+])
+def test_parquet_writer_filesystem_local(tempdir, filesystem):
+    df = _test_dataframe(100)
+    table = pa.Table.from_pandas(df, preserve_index=False)
+    path = str(tempdir / 'data.parquet')
+
+    with pq.ParquetWriter(
+        path, table.schema, filesystem=filesystem, version='2.6'
+    ) as writer:
+        writer.write_table(table)
+
+    result = _read_table(path).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+def test_parquet_writer_filesystem_s3(s3_example_fs):
+    df = _test_dataframe(100)
+    table = pa.Table.from_pandas(df, preserve_index=False)
+
+    fs, uri, path = s3_example_fs
+
+    with pq.ParquetWriter(
+        path, table.schema, filesystem=fs, version='2.6'
+    ) as writer:
+        writer.write_table(table)
+
+    result = _read_table(uri).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+def test_parquet_writer_filesystem_s3_uri(s3_example_fs):
+    df = _test_dataframe(100)
+    table = pa.Table.from_pandas(df, preserve_index=False)
+
+    fs, uri, path = s3_example_fs
+
+    with pq.ParquetWriter(uri, table.schema, version='2.6') as writer:
+        writer.write_table(table)
+
+    result = _read_table(path, filesystem=fs).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.pandas
+@pytest.mark.s3
+def test_parquet_writer_filesystem_s3fs(s3_example_s3fs):
+    df = _test_dataframe(100)
+    table = pa.Table.from_pandas(df, preserve_index=False)
+
+    fs, directory = s3_example_s3fs
+    path = directory + "/test.parquet"
+
+    with pq.ParquetWriter(
+        path, table.schema, filesystem=fs, version='2.6'
+    ) as writer:
+        writer.write_table(table)
+
+    result = _read_table(path, filesystem=fs).to_pandas()
+    tm.assert_frame_equal(result, df)
+
+
+@pytest.mark.numpy
+def test_parquet_writer_filesystem_buffer_raises():
+    table = _test_table(100)
+    filesystem = fs.LocalFileSystem()
+
+    # Should raise ValueError when filesystem is passed with file-like object
+    with pytest.raises(ValueError, match="specified path is file-like"):
+        pq.ParquetWriter(
+            pa.BufferOutputStream(), table.schema, filesystem=filesystem
+        )
+
+
+def test_parquet_writer_store_schema(tempdir):
+    table = pa.table({'a': [1, 2, 3]})
+
+    # default -> write schema information
+    path1 = tempdir / 'test_with_schema.parquet'
+    with pq.ParquetWriter(path1, table.schema) as writer:
+        writer.write_table(table)
+
+    meta = pq.read_metadata(path1)
+    assert b'ARROW:schema' in meta.metadata
+    assert meta.metadata[b'ARROW:schema']
+
+    # disable adding schema information
+    path2 = tempdir / 'test_without_schema.parquet'
+    with pq.ParquetWriter(path2, table.schema, store_schema=False) as writer:
+        writer.write_table(table)
+
+    meta = pq.read_metadata(path2)
+    assert meta.metadata is None
+
+
+def test_parquet_writer_append_key_value_metadata(tempdir):
+    table = pa.Table.from_arrays([pa.array([], type='int32')], ['f0'])
+    path = tempdir / 'metadata.parquet'
+
+    with pq.ParquetWriter(path, table.schema) as writer:
+        writer.write_table(table)
+        writer.add_key_value_metadata({'key1': '1', 'key2': 'x'})
+        writer.add_key_value_metadata({'key2': '2', 'key3': '3'})
+    reader = pq.ParquetFile(path)
+    metadata = reader.metadata.metadata
+    assert metadata[b'key1'] == b'1'
+    assert metadata[b'key2'] == b'2'
+    assert metadata[b'key3'] == b'3'
+
+
+def test_parquet_content_defined_chunking(tempdir):
+    table = pa.table({'a': range(100_000)})
+
+    # use PLAIN encoding because we compare the overall size of the row groups
+    # which would vary depending on the encoding making the assertions wrong
+    pq.write_table(table, tempdir / 'unchunked.parquet',
+                   use_dictionary=False,
+                   column_encoding="PLAIN")
+    pq.write_table(table, tempdir / 'chunked-default.parquet',
+                   use_dictionary=False,
+                   column_encoding="PLAIN",
+                   use_content_defined_chunking=True)
+    pq.write_table(table, tempdir / 'chunked-custom.parquet',
+                   use_dictionary=False,
+                   column_encoding="PLAIN",
+                   use_content_defined_chunking={"min_chunk_size": 32_768,
+                                                 "max_chunk_size": 65_536})
+
+    # the data must be the same
+    unchunked = pq.read_table(tempdir / 'unchunked.parquet')
+    chunked_default = pq.read_table(tempdir / 'chunked-default.parquet')
+    chunked_custom = pq.read_table(tempdir / 'chunked-custom.parquet')
+    assert unchunked.equals(chunked_default)
+    assert unchunked.equals(chunked_custom)
+
+    # number of row groups and their sizes are not affected by content defined chunking
+    unchunked_metadata = pq.read_metadata(tempdir / 'unchunked.parquet')
+    chunked_default_metadata = pq.read_metadata(tempdir / 'chunked-default.parquet')
+    chunked_custom_metadata = pq.read_metadata(tempdir / 'chunked-custom.parquet')
+
+    assert unchunked_metadata.num_row_groups == chunked_default_metadata.num_row_groups
+    assert unchunked_metadata.num_row_groups == chunked_custom_metadata.num_row_groups
+
+    for i in range(unchunked_metadata.num_row_groups):
+        rg_unchunked = unchunked_metadata.row_group(i)
+        rg_chunked_default = chunked_default_metadata.row_group(i)
+        rg_chunked_custom = chunked_custom_metadata.row_group(i)
+        assert rg_unchunked.num_rows == rg_chunked_default.num_rows
+        assert rg_unchunked.num_rows == rg_chunked_custom.num_rows
+        # since PageReader is not exposed we cannot inspect the page sizes
+        # so just check that the total byte size is different
+        assert rg_unchunked.total_byte_size < rg_chunked_default.total_byte_size
+        assert rg_unchunked.total_byte_size < rg_chunked_custom.total_byte_size
+        assert rg_chunked_default.total_byte_size < rg_chunked_custom.total_byte_size
+
+
+def test_parquet_content_defined_chunking_parameters(tempdir):
+    table = pa.table({'a': range(100)})
+    path = tempdir / 'chunked-invalid.parquet'
+
+    # it raises OSError, not ideal but this is how parquet exceptions are handled
+    # currently
+    msg = "max_chunk_size must be greater than min_chunk_size"
+    with pytest.raises(Exception, match=msg):
+        cdc_options = {"min_chunk_size": 65_536, "max_chunk_size": 32_768}
+        pq.write_table(table, path, use_content_defined_chunking=cdc_options)
+
+    cases = [
+        (
+            {"min_chunk_size": 64 * 1024, "unknown_option": True},
+            "Unknown options in 'use_content_defined_chunking': {'unknown_option'}"
+        ),
+        (
+            {"min_chunk_size": 64 * 1024},
+            "Missing options in 'use_content_defined_chunking': {'max_chunk_size'}"
+        ),
+        (
+            {"max_chunk_size": 64 * 1024},
+            "Missing options in 'use_content_defined_chunking': {'min_chunk_size'}"
+        )
+    ]
+    for cdc_options, msg in cases:
+        with pytest.raises(ValueError, match=msg):
+            pq.write_table(table, path, use_content_defined_chunking=cdc_options)
+
+    # using the default parametrization
+    pq.write_table(table, path, use_content_defined_chunking=True)
+
+    # using min_chunk_size and max_chunk_size
+    cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536}
+    pq.write_table(table, path, use_content_defined_chunking=cdc_options)
+
+    # using min_chunk_size, max_chunk_size and norm_level
+    cdc_options = {"min_chunk_size": 32_768, "max_chunk_size": 65_536, "norm_level": 1}
+    pq.write_table(table, path, use_content_defined_chunking=cdc_options)
+
+
+@pytest.mark.parametrize("time_type, time_unit", [
+    (pa.time32, "s"),
+    (pa.time32, "ms"),
+    (pa.time64, "us"),
+    (pa.time64, "ns"),
+])
+@pytest.mark.parametrize("utc_flag_val", [False, True])
+def test_arrow_writer_props_time_adjusted_to_utc(
+    tempdir,
+    utc_flag_val,
+    time_type,
+    time_unit,
+):
+    # GH-47441
+    filename = tempdir / "time_adjusted_to_utc.parquet"
+
+    time_values = [0, 123, 10_000, 86_399]
+
+    table = pa.table({
+        "time_col": pa.array(time_values, type=time_type(time_unit)),
+    })
+
+    schema = pa.schema([
+        ("time_col", time_type(time_unit)),
+    ])
+
+    with pq.ParquetWriter(
+        where=filename,
+        schema=schema,
+        write_time_adjusted_to_utc=utc_flag_val,
+    ) as writer:
+        writer.write_table(table)
+
+    result = pq.read_table(filename, schema=schema)
+
+    result.validate(full=True)
+
+    assert result.equals(table)
+
+
+@pytest.mark.parametrize(
+    "max_rows_per_page",
+    [1, 10, 100, 1_000, None],
+)
+def test_writer_props_max_rows_per_page(tempdir, max_rows_per_page):
+    # GH-48096
+    filename = tempdir / "max_rows_per_page.parquet"
+
+    table = pa.table({
+        "x": pa.array([1, 2, 3, 4, 5, 6, 7], type=pa.int8()),
+        "y": pa.array([11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], type=pa.float16()),
+    })
+
+    schema = pa.schema([
+        ("x", pa.int8()),
+        ("y", pa.float16()),
+    ])
+
+    with pq.ParquetWriter(
+        where=filename,
+        schema=schema,
+        max_rows_per_page=max_rows_per_page,
+    ) as writer:
+        writer.write_table(table)
+
+    result = pq.read_table(filename, schema=schema)
+
+    result.validate(full=True)
+
+    assert result.equals(table)
+
+
+def test_writer_props_max_rows_per_page_file_size(tempdir):
+    # GH-48096
+    table = pa.table({
+        "x": pa.array(range(1_000_000))
+    })
+
+    local = fs.LocalFileSystem()
+    file_infos = []
+
+    for max_rows in (1_000, 10_000):
+        path = f"{tempdir}/max_rows_per_page_{max_rows}.parquet"
+
+        with pq.ParquetWriter(
+            where=path,
+            schema=table.schema,
+            max_rows_per_page=max_rows,
+        ) as writer:
+            writer.write_table(table)
+
+        file_infos.append(local.get_file_info(path))
+
+    # A smaller maximum rows parameter should produce a larger file
+    assert file_infos[0].size > file_infos[1].size
diff --git a/scikit_learn-1.8.0.dist-info/licenses/COPYING b/scikit_learn-1.8.0.dist-info/licenses/COPYING
new file mode 100644
index 0000000000000000000000000000000000000000..f2fc85ebc240fbc4201e5dbec01401bad114cf8c
--- /dev/null
+++ b/scikit_learn-1.8.0.dist-info/licenses/COPYING
@@ -0,0 +1,112 @@
+BSD 3-Clause License
+
+Copyright (c) 2007-2024 The scikit-learn developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+----
+
+This binary distribution of scikit-learn also bundles the following software:
+
+----
+
+Name: GCC runtime library
+Files: scikit_learn.libs/libgomp*.so*
+Availability: https://gcc.gnu.org/git/?p=gcc.git;a=tree;f=libgomp
+
+GCC RUNTIME LIBRARY EXCEPTION
+
+Version 3.1, 31 March 2009
+
+Copyright (C) 2009 Free Software Foundation, Inc. <http://fsf.org/>
+
+Everyone is permitted to copy and distribute verbatim copies of this
+license document, but changing it is not allowed.
+
+This GCC Runtime Library Exception ("Exception") is an additional
+permission under section 7 of the GNU General Public License, version
+3 ("GPLv3"). It applies to a given file (the "Runtime Library") that
+bears a notice placed by the copyright holder of the file stating that
+the file is governed by GPLv3 along with this Exception.
+
+When you use GCC to compile a program, GCC may combine portions of
+certain GCC header files and runtime libraries with the compiled
+program. The purpose of this Exception is to allow compilation of
+non-GPL (including proprietary) programs to use, in this way, the
+header files and runtime libraries covered by this Exception.
+
+0. Definitions.
+
+A file is an "Independent Module" if it either requires the Runtime
+Library for execution after a Compilation Process, or makes use of an
+interface provided by the Runtime Library, but is not otherwise based
+on the Runtime Library.
+
+"GCC" means a version of the GNU Compiler Collection, with or without
+modifications, governed by version 3 (or a specified later version) of
+the GNU General Public License (GPL) with the option of using any
+subsequent versions published by the FSF.
+
+"GPL-compatible Software" is software whose conditions of propagation,
+modification and use would permit combination with GCC in accord with
+the license of GCC.
+
+"Target Code" refers to output from any compiler for a real or virtual
+target processor architecture, in executable form or suitable for
+input to an assembler, loader, linker and/or execution
+phase. Notwithstanding that, Target Code does not include data in any
+format that is used as a compiler intermediate representation, or used
+for producing a compiler intermediate representation.
+
+The "Compilation Process" transforms code entirely represented in
+non-intermediate languages designed for human-written code, and/or in
+Java Virtual Machine byte code, into Target Code. Thus, for example,
+use of source code generators and preprocessors need not be considered
+part of the Compilation Process, since the Compilation Process can be
+understood as starting with the output of the generators or
+preprocessors.
+
+A Compilation Process is "Eligible" if it is done using GCC, alone or
+with other GPL-compatible software, or if it is done without using any
+work based on GCC. For example, using non-GPL-compatible Software to
+optimize any GCC intermediate representations would not qualify as an
+Eligible Compilation Process.
+
+1. Grant of Additional Permission.
+
+You have permission to propagate a work of Target Code formed by
+combining the Runtime Library with Independent Modules, even if such
+propagation would otherwise violate the terms of GPLv3, provided that
+all Target Code was generated by Eligible Compilation Processes. You
+may then convey such a combination under terms of your choice,
+consistent with the licensing of the Independent Modules.
+
+2. No Weakening of GCC Copyleft.
+
+The availability of this Exception does not imply any general
+presumption that third-party software is unaffected by the copyleft
+requirements of the license of GCC.
diff --git a/scikit_learn-1.8.0.dist-info/sboms/auditwheel.cdx.json b/scikit_learn-1.8.0.dist-info/sboms/auditwheel.cdx.json
new file mode 100644
index 0000000000000000000000000000000000000000..20bf1364b9ca30bca2485852c363ac8d48d01965
--- /dev/null
+++ b/scikit_learn-1.8.0.dist-info/sboms/auditwheel.cdx.json
@@ -0,0 +1 @@
+{"bomFormat": "CycloneDX", "specVersion": "1.4", "version": 1, "metadata": {"component": {"type": "library", "bom-ref": "pkg:pypi/scikit_learn@1.8.0?file_name=scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", "name": "scikit_learn", "version": "1.8.0", "purl": "pkg:pypi/scikit_learn@1.8.0?file_name=scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl"}, "tools": [{"name": "auditwheel", "version": "6.5.0"}]}, "components": [{"type": "library", "bom-ref": "pkg:pypi/scikit_learn@1.8.0?file_name=scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", "name": "scikit_learn", "version": "1.8.0", "purl": "pkg:pypi/scikit_learn@1.8.0?file_name=scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl"}, {"type": "library", "bom-ref": "pkg:rpm/almalinux/libgomp@8.5.0-28.el8_10.alma.1#c61017c9a24eb6e1e1a3cdc9becd004a6419cbda3d54b4848b98f240a4829571", "name": "libgomp", "version": "8.5.0-28.el8_10.alma.1", "purl": "pkg:rpm/almalinux/libgomp@8.5.0-28.el8_10.alma.1"}], "dependencies": [{"ref": "pkg:pypi/scikit_learn@1.8.0?file_name=scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", "dependsOn": ["pkg:rpm/almalinux/libgomp@8.5.0-28.el8_10.alma.1#c61017c9a24eb6e1e1a3cdc9becd004a6419cbda3d54b4848b98f240a4829571"]}, {"ref": "pkg:rpm/almalinux/libgomp@8.5.0-28.el8_10.alma.1#c61017c9a24eb6e1e1a3cdc9becd004a6419cbda3d54b4848b98f240a4829571"}]}
\ No newline at end of file